996 files changed, 128953 insertions, 220621 deletions
diff --git a/third_party/aom/.clang-format b/third_party/aom/.clang-format
index c1483199e..e76a526e4 100644
--- a/third_party/aom/.clang-format
+++ b/third_party/aom/.clang-format
@@ -1,12 +1,12 @@
 ---
 Language:        Cpp
 # BasedOnStyle:  Google
-# Generated with clang-format 4.0.1
+# Generated with clang-format 5.0.0
 AccessModifierOffset: -1
 AlignAfterOpenBracket: Align
 AlignConsecutiveAssignments: false
 AlignConsecutiveDeclarations: false
-AlignEscapedNewlinesLeft: true
+AlignEscapedNewlines: Left
 AlignOperands:   true
 AlignTrailingComments: true
 AllowAllParametersOfDeclarationOnNextLine: true
@@ -33,14 +33,20 @@ BraceWrapping:
   BeforeCatch:     false
   BeforeElse:      false
   IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
 BreakBeforeBinaryOperators: None
 BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
 BreakAfterJavaFieldAnnotations: false
 BreakStringLiterals: true
 ColumnLimit:     80
 CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
 ConstructorInitializerAllOnOneLineOrOnePerLine: false
 ConstructorInitializerIndentWidth: 4
 ContinuationIndentWidth: 4
@@ -48,7 +54,11 @@ Cpp11BracedListStyle: false
 DerivePointerAlignment: false
 DisableFormat:   false
 ExperimentalAutoDetectBinPacking: false
-ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
 IncludeCategories:
   - Regex:           '^<.*\.h>'
     Priority:        1
@@ -70,6 +80,7 @@ NamespaceIndentation: None
 ObjCBlockIndentWidth: 2
 ObjCSpaceAfterProperty: false
 ObjCSpaceBeforeProtocolList: false
+PenaltyBreakAssignment: 2
 PenaltyBreakBeforeFirstCallParameter: 1
 PenaltyBreakComment: 300
 PenaltyBreakFirstLessLess: 120
@@ -79,6 +90,7 @@ PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Right
 ReflowComments:  true
 SortIncludes:    false
+SortUsingDeclarations: true
 SpaceAfterCStyleCast: false
 SpaceAfterTemplateKeyword: true
 SpaceBeforeAssignmentOperators: true
diff --git a/third_party/aom/.cmake-format.py b/third_party/aom/.cmake-format.py
new file mode 100644
index 000000000..cebad0742
--- /dev/null
+++ b/third_party/aom/.cmake-format.py
@@ -0,0 +1,47 @@
+# How wide to allow formatted cmake files
+line_width = 80
+
+# How many spaces to tab for indent
+tab_size = 2
+
+# If arglists are longer than this, break them always. This introduces some
+# interesting effects with complicated 'if' statements. However, we want file
+# lists to look reasonable. Try to strike a balance.
+max_subargs_per_line = 10
+
+# If true, separate flow control names from their parentheses with a space
+separate_ctrl_name_with_space = False
+
+# If true, separate function names from parentheses with a space
+separate_fn_name_with_space = False
+
+# If a statement is wrapped to more than one line, than dangle the closing
+# parenthesis on it's own line
+dangle_parens = False
+
+# What character to use for bulleted lists
+bullet_char = u'*'
+
+# What character to use as punctuation after numerals in an enumerated list
+enum_char = u'.'
+
+# What style line endings to use in the output.
+line_ending = u'unix'
+
+# Format command names consistently as 'lower' or 'upper' case
+command_case = u'lower'
+
+# Specify structure for custom cmake functions
+additional_commands = {
+  "foo": {
+    "flags": [
+      "BAR",
+      "BAZ"
+    ],
+    "kwargs": {
+      "HEADERS": "*",
+      "DEPENDS": "*",
+      "SOURCES": "*"
+    }
+  }
+}
diff --git a/third_party/aom/.mailmap b/third_party/aom/.mailmap
index 4672e5ccb..bbe4525b1 100644
--- a/third_party/aom/.mailmap
+++ b/third_party/aom/.mailmap
@@ -23,6 +23,7 @@ Ralph Giles <giles@xiph.org> <giles@entropywave.com>
 Ralph Giles <giles@xiph.org> <giles@mozilla.com>
 Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
 Sami Pietilä <samipietila@google.com>
+Sarah Parker <sarahparker@google.com>
 Tamar Levy <tamar.levy@intel.com>
 Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
@@ -30,3 +31,4 @@ Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.co
 Tom Finegan <tomfinegan@google.com>
 Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
+Yaowu Xu <yaowu@google.com> <yaowu@yaowu-macbookpro.roam.corp.google.com>
diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt
index 59338b8b5..0b445722d 100644
--- a/third_party/aom/CMakeLists.txt
+++ b/third_party/aom/CMakeLists.txt
@@ -1,34 +1,59 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
 cmake_minimum_required(VERSION 3.5)
 
-if (NOT EMSCRIPTEN)
-  if (NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Release" CACHE
-      "Build type: Debug, Release, RelWithDebInfo or MinSizeRel" STRING FORCE)
-  endif ()
-endif ()
+if(NOT EMSCRIPTEN)
+  if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release"
+        CACHE "Build type: Debug, Release, RelWithDebInfo or MinSizeRel" STRING
+              FORCE)
+  endif()
+endif()
 
-option(ENABLE_ADOPTED_EXPERIMENTS "Enable adopted experiments." ON)
 option(ENABLE_CCACHE "Enable ccache support." OFF)
+option(ENABLE_DECODE_PERF_TESTS "Enables decoder performance tests" OFF)
 option(ENABLE_DISTCC "Enable distcc support." OFF)
 option(ENABLE_DOCS "Enable documentation generation (doxygen required)." ON)
+option(ENABLE_ENCODE_PERF_TESTS "Enables encoder performance tests" OFF)
 option(ENABLE_EXAMPLES "Enables build of example code." ON)
 option(ENABLE_GOMA "Enable goma support." OFF)
 option(ENABLE_IDE_TEST_HOSTING
        "Enables running tests within IDEs like Visual Studio and Xcode." OFF)
 option(ENABLE_NASM "Use nasm instead of yasm for x86 assembly." OFF)
+option(ENABLE_TESTDATA "Enables unit test data download targets." ON)
+option(ENABLE_TESTS "Enables unit tests." ON)
 option(ENABLE_TOOLS "Enable applications in tools sub directory." ON)
 option(ENABLE_WERROR "Converts warnings to errors at compile time." OFF)
 
+# ARM assembly/intrinsics flags.
+option(ENABLE_NEON "Enables NEON optimizations on ARM targets." ON)
+
+# MIPS assembly/intrinsics flags.
+option(ENABLE_DSPR2 "Enables DSPR2 optimizations on MIPS targets." OFF)
+option(ENABLE_MSA "Enables MSA optimizations on MIPS targets." OFF)
+
+# VSX intrinsics flags.
+option(ENABLE_VSX "Enables VSX optimizations on PowerPC targets." ON)
+
+# x86/x86_64 assembly/intrinsics flags.
+option(ENABLE_MMX "Enables MMX optimizations on x86/x86_64 targets." ON)
+option(ENABLE_SSE "Enables SSE optimizations on x86/x86_64 targets." ON)
+option(ENABLE_SSE2 "Enables SSE2 optimizations on x86/x86_64 targets." ON)
+option(ENABLE_SSE3 "Enables SSE3 optimizations on x86/x86_64 targets." ON)
+option(ENABLE_SSSE3 "Enables SSSE3 optimizations on x86/x86_64 targets." ON)
+option(ENABLE_SSE4_1 "Enables SSE4_1 optimizations on x86/x86_64 targets." ON)
+option(ENABLE_SSE4_2 "Enables SSE4_2 optimizations on x86/x86_64 targets." ON)
+option(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets." ON)
+option(ENABLE_AVX2 "Enables AVX2 optimizations on x86/x86_64 targets." ON)
+
 # $BUILD_SHARED_LIBS is a CMake built-in-- it's listed here for visibility.
 option(BUILD_SHARED_LIBS "CMake should generate a shared library build." OFF)
 
@@ -36,13 +61,18 @@ project(AOM C CXX)
 
 set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
 set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/include"
+    CACHE PATH "Installation path of includes")
+set(LIB_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/lib"
+    CACHE PATH "Installation path of libraries")
 
-if ("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}")
+if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}")
   message(FATAL_ERROR
-          "Building from within the aom source tree is not supported.\n"
-          "Hint: mkdir -p ../aom_build && cd ../aom_build\n"
-          "Run cmake from there.")
-endif ()
+            "Building from within the aom source tree is not supported.\n"
+            "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n"
+            "$ mkdir -p ../aom_build\n" "$ cd ../aom_build\n"
+            "And re-run CMake from the aom_build directory.")
+endif()
 
 include("${AOM_ROOT}/build/cmake/aom_configure.cmake")
 include("${AOM_ROOT}/aom_dsp/aom_dsp.cmake")
@@ -55,188 +85,172 @@ include("${AOM_ROOT}/test/test.cmake")
 include("${AOM_ROOT}/build/cmake/sanitizers.cmake")
 include("${AOM_ROOT}/build/cmake/util.cmake")
 
-set(AOM_RTCD_SOURCES
-    "${AOM_CONFIG_DIR}/aom_dsp_rtcd.h"
-    "${AOM_CONFIG_DIR}/aom_scale_rtcd.h"
-    "${AOM_CONFIG_DIR}/av1_rtcd.h"
-    "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
-    "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
-    "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
-    "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
-    "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
-    "${AOM_ROOT}/av1/common/av1_rtcd.c"
-    "${AOM_ROOT}/build/make/rtcd.pl")
-
-# TODO(tomfinegan): Use libwebm's cmake support directly.
-set(AOM_LIBWEBM_SOURCES
-    "${AOM_ROOT}/third_party/libwebm/common/hdr_util.cc"
-    "${AOM_ROOT}/third_party/libwebm/common/hdr_util.h"
-    "${AOM_ROOT}/third_party/libwebm/common/webmids.h"
-    "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.cc"
-    "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.h"
-    "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxertypes.h"
-    "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc"
-    "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.h"
-    "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.cc"
-    "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.h"
-    "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.cc"
-    "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.h"
-    "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.cc"
-    "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.h")
-
-set(AOM_LIBYUV_SOURCES
-    "${AOM_ROOT}/third_party/libyuv/include/libyuv/basic_types.h"
-    "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert.h"
-    "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_argb.h"
-    "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_from.h"
-    "${AOM_ROOT}/third_party/libyuv/include/libyuv/cpu_id.h"
-    "${AOM_ROOT}/third_party/libyuv/include/libyuv/planar_functions.h"
-    "${AOM_ROOT}/third_party/libyuv/include/libyuv/rotate.h"
-    "${AOM_ROOT}/third_party/libyuv/include/libyuv/row.h"
-    "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale.h"
-    "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale_row.h"
-    "${AOM_ROOT}/third_party/libyuv/source/cpu_id.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/planar_functions.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/row_any.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/row_common.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/row_gcc.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/row_mips.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/row_neon.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/row_neon64.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/row_win.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/scale.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/scale_any.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/scale_common.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/scale_gcc.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/scale_mips.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/scale_neon.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/scale_neon64.cc"
-    "${AOM_ROOT}/third_party/libyuv/source/scale_win.cc")
-
-set(AOM_SOURCES
-    "${AOM_CONFIG_DIR}/aom_config.c"
-    "${AOM_CONFIG_DIR}/aom_config.h"
-    "${AOM_ROOT}/aom/aom.h"
-    "${AOM_ROOT}/aom/aom_codec.h"
-    "${AOM_ROOT}/aom/aom_decoder.h"
-    "${AOM_ROOT}/aom/aom_encoder.h"
-    "${AOM_ROOT}/aom/aom_frame_buffer.h"
-    "${AOM_ROOT}/aom/aom_image.h"
-    "${AOM_ROOT}/aom/aom_integer.h"
-    "${AOM_ROOT}/aom/aomcx.h"
-    "${AOM_ROOT}/aom/aomdx.h"
-    "${AOM_ROOT}/aom/internal/aom_codec_internal.h"
-    "${AOM_ROOT}/aom/src/aom_codec.c"
-    "${AOM_ROOT}/aom/src/aom_decoder.c"
-    "${AOM_ROOT}/aom/src/aom_encoder.c"
-    "${AOM_ROOT}/aom/src/aom_image.c")
-
-set(AOM_COMMON_APP_UTIL_SOURCES
-    "${AOM_ROOT}/args.c"
-    "${AOM_ROOT}/args.h"
-    "${AOM_ROOT}/md5_utils.c"
-    "${AOM_ROOT}/md5_utils.h"
-    "${AOM_ROOT}/tools_common.c"
-    "${AOM_ROOT}/tools_common.h"
-    "${AOM_ROOT}/video_common.h"
-    "${AOM_ROOT}/y4menc.c"
-    "${AOM_ROOT}/y4menc.h")
-
-set(AOM_DECODER_APP_UTIL_SOURCES
-    "${AOM_ROOT}/ivfdec.c"
-    "${AOM_ROOT}/ivfdec.h"
-    "${AOM_ROOT}/video_reader.c"
-    "${AOM_ROOT}/video_reader.h")
-
-set(AOM_ENCODER_APP_UTIL_SOURCES
-    "${AOM_ROOT}/ivfenc.c"
-    "${AOM_ROOT}/ivfenc.h"
-    "${AOM_ROOT}/video_writer.c"
-    "${AOM_ROOT}/video_writer.h"
-    "${AOM_ROOT}/warnings.c"
-    "${AOM_ROOT}/warnings.h"
-    "${AOM_ROOT}/y4minput.c"
-    "${AOM_ROOT}/y4minput.h"
-    "${AOM_ROOT}/examples/encoder_util.h"
-    "${AOM_ROOT}/examples/encoder_util.c")
-
-set(AOM_ENCODER_STATS_SOURCES
-    "${AOM_ROOT}/aomstats.c"
-    "${AOM_ROOT}/aomstats.h"
-    "${AOM_ROOT}/rate_hist.c"
-    "${AOM_ROOT}/rate_hist.h")
-
-set(AOM_PKG_CONFIG_SOURCES "${AOM_CONFIG_DIR}/aom.pc")
-
-set(AOM_VERSION_SOURCES "${AOM_CONFIG_DIR}/aom_version.h")
-
-set(AOM_WEBM_DECODER_SOURCES
-    "${AOM_ROOT}/webmdec.cc"
-    "${AOM_ROOT}/webmdec.h")
-
-set(AOM_WEBM_ENCODER_SOURCES
-    "${AOM_ROOT}/webmenc.cc"
-    "${AOM_ROOT}/webmenc.h")
-
-include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})
+list(APPEND AOM_RTCD_SOURCES
+            "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h"
+            "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h"
+            "${AOM_CONFIG_DIR}/config/av1_rtcd.h"
+            "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
+            "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
+            "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
+            "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
+            "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
+            "${AOM_ROOT}/av1/common/av1_rtcd.c"
+            "${AOM_ROOT}/build/make/rtcd.pl")
+
+list(APPEND AOM_LIBWEBM_SOURCES
+            "${AOM_ROOT}/third_party/libwebm/common/hdr_util.cc"
+            "${AOM_ROOT}/third_party/libwebm/common/hdr_util.h"
+            "${AOM_ROOT}/third_party/libwebm/common/webmids.h"
+            "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.cc"
+            "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.h"
+            "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxertypes.h"
+            "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc"
+            "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.h"
+            "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.cc"
+            "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.h"
+            "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.cc"
+            "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.h"
+            "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.cc"
+            "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.h")
+
+list(APPEND AOM_LIBYUV_SOURCES
+            "${AOM_ROOT}/third_party/libyuv/include/libyuv/basic_types.h"
+            "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert.h"
+            "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_argb.h"
+            "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_from.h"
+            "${AOM_ROOT}/third_party/libyuv/include/libyuv/cpu_id.h"
+            "${AOM_ROOT}/third_party/libyuv/include/libyuv/planar_functions.h"
+            "${AOM_ROOT}/third_party/libyuv/include/libyuv/rotate.h"
+            "${AOM_ROOT}/third_party/libyuv/include/libyuv/row.h"
+            "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale.h"
+            "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale_row.h"
+            "${AOM_ROOT}/third_party/libyuv/source/cpu_id.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/planar_functions.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/row_any.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/row_common.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/row_gcc.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/row_mips.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/row_neon.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/row_neon64.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/row_win.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/scale.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/scale_any.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/scale_common.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/scale_gcc.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/scale_mips.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/scale_neon.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/scale_neon64.cc"
+            "${AOM_ROOT}/third_party/libyuv/source/scale_win.cc")
+
+list(APPEND AOM_SOURCES
+            "${AOM_CONFIG_DIR}/config/aom_config.c"
+            "${AOM_CONFIG_DIR}/config/aom_config.h"
+            "${AOM_ROOT}/aom/aom.h"
+            "${AOM_ROOT}/aom/aom_codec.h"
+            "${AOM_ROOT}/aom/aom_decoder.h"
+            "${AOM_ROOT}/aom/aom_encoder.h"
+            "${AOM_ROOT}/aom/aom_frame_buffer.h"
+            "${AOM_ROOT}/aom/aom_image.h"
+            "${AOM_ROOT}/aom/aom_integer.h"
+            "${AOM_ROOT}/aom/aomcx.h"
+            "${AOM_ROOT}/aom/aomdx.h"
+            "${AOM_ROOT}/aom/internal/aom_codec_internal.h"
+            "${AOM_ROOT}/aom/src/aom_codec.c"
+            "${AOM_ROOT}/aom/src/aom_decoder.c"
+            "${AOM_ROOT}/aom/src/aom_encoder.c"
+            "${AOM_ROOT}/aom/src/aom_image.c"
+            "${AOM_ROOT}/aom/src/aom_integer.c")
+
+list(APPEND AOM_COMMON_APP_UTIL_SOURCES "${AOM_ROOT}/common/args.c"
+            "${AOM_ROOT}/common/args.h" "${AOM_ROOT}/common/md5_utils.c"
+            "${AOM_ROOT}/common/md5_utils.h"
+            "${AOM_ROOT}/common/tools_common.c"
+            "${AOM_ROOT}/common/tools_common.h"
+            "${AOM_ROOT}/common/video_common.h" "${AOM_ROOT}/common/y4menc.c"
+            "${AOM_ROOT}/common/y4menc.h")
+
+list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/ivfdec.c"
+            "${AOM_ROOT}/common/ivfdec.h" "${AOM_ROOT}/common/obudec.c"
+            "${AOM_ROOT}/common/obudec.h" "${AOM_ROOT}/common/video_reader.c"
+            "${AOM_ROOT}/common/video_reader.h")
+
+list(APPEND AOM_ENCODER_APP_UTIL_SOURCES
+            "${AOM_ROOT}/common/ivfenc.c"
+            "${AOM_ROOT}/common/ivfenc.h"
+            "${AOM_ROOT}/common/video_writer.c"
+            "${AOM_ROOT}/common/video_writer.h"
+            "${AOM_ROOT}/common/warnings.c"
+            "${AOM_ROOT}/common/warnings.h"
+            "${AOM_ROOT}/common/y4minput.c"
+            "${AOM_ROOT}/common/y4minput.h"
+            "${AOM_ROOT}/examples/encoder_util.h"
+            "${AOM_ROOT}/examples/encoder_util.c")
+
+list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c"
+            "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c"
+            "${AOM_ROOT}/stats/rate_hist.h")
+
+list(APPEND AOM_PKG_CONFIG_SOURCES "${AOM_CONFIG_DIR}/aom.pc")
+
+list(APPEND AOM_VERSION_SOURCES "${AOM_CONFIG_DIR}/config/aom_version.h")
+
+list(APPEND AOM_WEBM_DECODER_SOURCES "${AOM_ROOT}/common/webmdec.cc"
+            "${AOM_ROOT}/common/webmdec.h")
+
+list(APPEND AOM_WEBM_ENCODER_SOURCES "${AOM_ROOT}/common/webmenc.cc"
+            "${AOM_ROOT}/common/webmenc.h")
+
+include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR} ${AOM_ROOT}/apps
+                    ${AOM_ROOT}/common ${AOM_ROOT}/examples ${AOM_ROOT}/stats)
 
 # Targets
 add_library(aom_version ${AOM_VERSION_SOURCES})
 add_dummy_source_file_to_target(aom_version c)
-add_custom_command(
-  OUTPUT "${AOM_CONFIG_DIR}/aom_version.h"
-  COMMAND ${CMAKE_COMMAND}
-  ARGS -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
-    -DAOM_ROOT=${AOM_ROOT}
-    -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
-    -DPERL_EXECUTABLE=${PERL_EXECUTABLE}
-    -P "${AOM_ROOT}/build/cmake/version.cmake"
-  COMMENT "Writing aom_version.h"
-  VERBATIM)
+add_custom_command(OUTPUT "${AOM_CONFIG_DIR}/config/aom_version.h"
+                   COMMAND ${CMAKE_COMMAND} ARGS
+                           -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+                           -DAOM_ROOT=${AOM_ROOT}
+                           -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+                           -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
+                           "${AOM_ROOT}/build/cmake/version.cmake"
+                   COMMENT "Writing aom_version.h" VERBATIM)
 
 add_custom_target(aom_version_check
-  COMMAND ${CMAKE_COMMAND}
-    -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
-    -DAOM_ROOT=${AOM_ROOT}
-    -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
-    -DPERL_EXECUTABLE=${PERL_EXECUTABLE}
-    -P "${AOM_ROOT}/build/cmake/version.cmake"
-  COMMENT "Updating version info if necessary."
-  VERBATIM)
+                  COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+                          -DAOM_ROOT=${AOM_ROOT}
+                          -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+                          -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
+                          "${AOM_ROOT}/build/cmake/version.cmake"
+                  COMMENT "Updating version info if necessary." VERBATIM)
 add_dependencies(aom_version aom_version_check)
 
-if (NOT MSVC)
+if(NOT MSVC)
   add_library(aom_pc ${AOM_PKG_CONFIG_SOURCES})
   add_dummy_source_file_to_target(aom_pc c)
-  add_custom_command(
-    OUTPUT "${AOM_CONFIG_DIR}/aom.pc"
-    COMMAND ${CMAKE_COMMAND}
-    ARGS -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
-      -DAOM_ROOT=${AOM_ROOT}
-      -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
-      -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
-      -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
-      -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H}
-      -P "${AOM_ROOT}/build/cmake/pkg_config.cmake"
-      COMMENT "Writing aom.pc"
-      VERBATIM)
+  add_custom_command(OUTPUT "${AOM_CONFIG_DIR}/aom.pc"
+                     COMMAND ${CMAKE_COMMAND} ARGS
+                             -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+                             -DAOM_ROOT=${AOM_ROOT}
+                             -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
+                             -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
+                             -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
+                             -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H} -P
+                             "${AOM_ROOT}/build/cmake/pkg_config.cmake"
+                     COMMENT "Writing aom.pc" VERBATIM)
   add_dependencies(aom_pc aom_version)
-endif ()
+endif()
 
 # TODO(tomfinegan): Move rtcd target setup where it belongs for each rtcd
 # source.
 add_rtcd_build_step("${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
-                    "${AOM_CONFIG_DIR}/aom_dsp_rtcd.h"
-                    "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
-                    "aom_dsp_rtcd")
+                    "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h"
+                    "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" "aom_dsp_rtcd")
 add_rtcd_build_step("${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
-                    "${AOM_CONFIG_DIR}/aom_scale_rtcd.h"
-                    "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
-                    "aom_scale_rtcd")
+                    "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h"
+                    "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" "aom_scale_rtcd")
 add_rtcd_build_step("${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
-                    "${AOM_CONFIG_DIR}/av1_rtcd.h"
-                    "${AOM_ROOT}/av1/common/av1_rtcd.c"
-                    "av1_rtcd")
+                    "${AOM_CONFIG_DIR}/config/av1_rtcd.h"
+                    "${AOM_ROOT}/av1/common/av1_rtcd.c" "av1_rtcd")
 
 add_library(aom_rtcd OBJECT ${AOM_RTCD_SOURCES})
 add_dependencies(aom_rtcd aom_version)
@@ -244,9 +258,9 @@ add_dependencies(aom_rtcd aom_version)
 add_library(aom_encoder_stats OBJECT ${AOM_ENCODER_STATS_SOURCES})
 add_library(aom ${AOM_SOURCES} $<TARGET_OBJECTS:aom_rtcd>)
 
-if (NOT MSVC AND NOT APPLE)
+if(NOT MSVC AND NOT APPLE)
   target_link_libraries(aom ${AOM_LIB_LINK_TYPE} m)
-endif ()
+endif()
 
 # List of object and static library targets.
 set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_encoder_stats aom_mem
@@ -261,365 +275,513 @@ setup_aom_scale_targets()
 setup_av1_targets()
 
 # Make all library targets depend on aom_rtcd to make sure it builds first.
-foreach (aom_lib ${AOM_LIB_TARGETS})
-  if (NOT "${aom_lib}" STREQUAL "aom_rtcd")
+foreach(aom_lib ${AOM_LIB_TARGETS})
+  if(NOT "${aom_lib}" STREQUAL "aom_rtcd")
     add_dependencies(${aom_lib} aom_rtcd)
-  endif ()
-endforeach ()
+  endif()
+endforeach()
 
-# Generate a stub file containing the C function usage_exit(). Users of the
+# Generate C/C++ stub files containing the function usage_exit(). Users of the
 # aom_common_app_util library must define this function. This is a convenience
 # to allow omission of the function from applications that might want to use
-# other pieces of the util support without defining the usage_exit().
+# other pieces of the util support without defining usage_exit().
 file(WRITE "${AOM_CONFIG_DIR}/usage_exit.c" "void usage_exit(void) {}")
+file(WRITE "${AOM_CONFIG_DIR}/usage_exit.cc"
+           "extern \"C\" void usage_exit(void) {}")
 
 #
 # Application and application support targets.
 #
-if (CONFIG_UNIT_TESTS OR ENABLE_EXAMPLES OR ENABLE_TOOLS)
+if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS)
   add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES})
-  if (CONFIG_AV1_DECODER)
+  if(CONFIG_AV1_DECODER)
     add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES})
-  endif ()
-  if (CONFIG_AV1_ENCODER)
+    # obudec depends on internal headers that require *rtcd.h
+    add_dependencies(aom_decoder_app_util aom_rtcd)
+  endif()
+  if(CONFIG_AV1_ENCODER)
     add_library(aom_encoder_app_util OBJECT ${AOM_ENCODER_APP_UTIL_SOURCES})
-  endif ()
-endif ()
+  endif()
+endif()
 
-if (CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
-  add_executable(aomdec
-                 "${AOM_ROOT}/aomdec.c"
+if((CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER) AND ENABLE_EXAMPLES)
+  add_executable(resize_util "${AOM_ROOT}/examples/resize_util.c"
+                 $<TARGET_OBJECTS:aom_common_app_util>)
+  list(APPEND AOM_APP_TARGETS resize_util)
+endif()
+
+if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
+  add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c"
+                 $<TARGET_OBJECTS:aom_common_app_util>
+                 $<TARGET_OBJECTS:aom_decoder_app_util>)
+  add_executable(decode_to_md5 "${AOM_ROOT}/examples/decode_to_md5.c"
                  $<TARGET_OBJECTS:aom_common_app_util>
                  $<TARGET_OBJECTS:aom_decoder_app_util>)
-  add_executable(decode_to_md5
-                 "${AOM_ROOT}/examples/decode_to_md5.c"
+  add_executable(decode_with_drops "${AOM_ROOT}/examples/decode_with_drops.c"
                  $<TARGET_OBJECTS:aom_common_app_util>
                  $<TARGET_OBJECTS:aom_decoder_app_util>)
-  add_executable(decode_with_drops
-                 "${AOM_ROOT}/examples/decode_with_drops.c"
+  add_executable(simple_decoder "${AOM_ROOT}/examples/simple_decoder.c"
                  $<TARGET_OBJECTS:aom_common_app_util>
                  $<TARGET_OBJECTS:aom_decoder_app_util>)
-  add_executable(simple_decoder
-                 "${AOM_ROOT}/examples/simple_decoder.c"
+  add_executable(scalable_decoder "${AOM_ROOT}/examples/scalable_decoder.c"
                  $<TARGET_OBJECTS:aom_common_app_util>
                  $<TARGET_OBJECTS:aom_decoder_app_util>)
 
-  if (CONFIG_ANALYZER)
-    add_executable(analyzer
-                   "${AOM_ROOT}/examples/analyzer.cc"
+  if(CONFIG_ANALYZER)
+    add_executable(analyzer "${AOM_ROOT}/examples/analyzer.cc"
                    $<TARGET_OBJECTS:aom_common_app_util>
                    $<TARGET_OBJECTS:aom_decoder_app_util>)
-    target_link_libraries(analyzer
-                          ${AOM_LIB_LINK_TYPE} ${wxWidgets_LIBRARIES})
-    set(AOM_APP_TARGETS ${AOM_APP_TARGETS} analyzer)
-    set(AOM_DECODER_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS} analyzer)
-  endif ()
-
-  if (CONFIG_INSPECTION)
-    add_executable(inspect
-                   "${AOM_ROOT}/examples/inspect.c"
+    target_link_libraries(analyzer ${AOM_LIB_LINK_TYPE} ${wxWidgets_LIBRARIES})
+    list(APPEND AOM_APP_TARGETS analyzer)
+    list(APPEND AOM_DECODER_EXAMPLE_TARGETS analyzer)
+  endif()
+
+  if(CONFIG_INSPECTION)
+    add_executable(inspect "${AOM_ROOT}/examples/inspect.c"
                    $<TARGET_OBJECTS:aom_common_app_util>
                    $<TARGET_OBJECTS:aom_decoder_app_util>)
-    set(AOM_DECODER_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS} inspect)
+    list(APPEND AOM_DECODER_EXAMPLE_TARGETS inspect)
 
-    if (EMSCRIPTEN)
+    if(EMSCRIPTEN)
       add_preproc_definition(_POSIX_SOURCE)
-      append_link_flag_to_target("inspect" "-s TOTAL_MEMORY=134217728")
+      append_link_flag_to_target("inspect" "-s TOTAL_MEMORY=402653184")
       append_link_flag_to_target("inspect" "-s MODULARIZE=1")
       append_link_flag_to_target("inspect"
                                  "-s EXPORT_NAME=\"\'DecoderModule\'\"")
       append_link_flag_to_target("inspect" "--memory-init-file 0")
 
-      if ("${CMAKE_BUILD_TYPE}" STREQUAL "")
-        # Default to -O3 when no build type specified.
+      if("${CMAKE_BUILD_TYPE}" STREQUAL "")
+
+        # Default to -O3 when no build type is specified.
         append_compiler_flag("-O3")
-      endif ()
+      endif()
 
       em_link_post_js(inspect "${AOM_ROOT}/tools/inspect-post.js")
-    endif ()
-  endif ()
+    endif()
+  endif()
 
   # Maintain a list of decoder example targets.
-  set(AOM_DECODER_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS}
-      aomdec decode_to_md5 decode_with_drops simple_decoder)
+  list(APPEND AOM_DECODER_EXAMPLE_TARGETS aomdec decode_to_md5
+              decode_with_drops scalable_decoder simple_decoder)
 
   # Add decoder examples to the app targets list.
-  set(AOM_APP_TARGETS ${AOM_APP_TARGETS} ${AOM_DECODER_EXAMPLE_TARGETS})
-endif ()
+  list(APPEND AOM_APP_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS})
+endif()
 
-if (CONFIG_AV1_ENCODER)
-  if (ENABLE_EXAMPLES)
-    add_executable(aomenc
-                   "${AOM_ROOT}/aomenc.c"
+if(CONFIG_AV1_ENCODER)
+  if(ENABLE_EXAMPLES)
+    add_executable(aomenc "${AOM_ROOT}/apps/aomenc.c"
                    $<TARGET_OBJECTS:aom_common_app_util>
                    $<TARGET_OBJECTS:aom_encoder_app_util>
                    $<TARGET_OBJECTS:aom_encoder_stats>)
-    add_executable(lossless_encoder
-                   "${AOM_ROOT}/examples/lossless_encoder.c"
+    add_executable(lossless_encoder "${AOM_ROOT}/examples/lossless_encoder.c"
                    $<TARGET_OBJECTS:aom_common_app_util>
                    $<TARGET_OBJECTS:aom_encoder_app_util>)
-    add_executable(set_maps
-                   "${AOM_ROOT}/examples/set_maps.c"
+    add_executable(set_maps "${AOM_ROOT}/examples/set_maps.c"
                    $<TARGET_OBJECTS:aom_common_app_util>
                    $<TARGET_OBJECTS:aom_encoder_app_util>)
-    add_executable(simple_encoder
-                   "${AOM_ROOT}/examples/simple_encoder.c"
+    add_executable(simple_encoder "${AOM_ROOT}/examples/simple_encoder.c"
                    $<TARGET_OBJECTS:aom_common_app_util>
                    $<TARGET_OBJECTS:aom_encoder_app_util>)
-    add_executable(twopass_encoder
-                   "${AOM_ROOT}/examples/twopass_encoder.c"
+    add_executable(twopass_encoder "${AOM_ROOT}/examples/twopass_encoder.c"
                    $<TARGET_OBJECTS:aom_common_app_util>
                    $<TARGET_OBJECTS:aom_encoder_app_util>)
-
-    # Maintain a list of encoder example targets.
-    set(AOM_ENCODER_EXAMPLE_TARGETS
-        aomenc lossless_encoder set_maps simple_encoder twopass_encoder)
-
-    # Add encoder examples to app target list.
-    set(AOM_APP_TARGETS ${AOM_APP_TARGETS} ${AOM_ENCODER_EXAMPLE_TARGETS})
-  endif ()
-
-  if (ENABLE_TOOLS AND CONFIG_ENTROPY_STATS)
-    # TODO(tomfinegan): Sort out why a simple link command with
-    # aom_entropy_optimizer.c won't work on macos, but dragging in all the
-    # helper machinery allows the link to succeed.
-    add_executable(aom_entropy_optimizer
-                   "${AOM_CONFIG_DIR}/usage_exit.c"
-                   "${AOM_ROOT}/tools/aom_entropy_optimizer.c"
+    add_executable(noise_model "${AOM_ROOT}/examples/noise_model.c"
                    $<TARGET_OBJECTS:aom_common_app_util>
                    $<TARGET_OBJECTS:aom_encoder_app_util>)
+    add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c"
+                   $<TARGET_OBJECTS:aom_common_app_util>
+                   $<TARGET_OBJECTS:aom_encoder_app_util>)
+
+    # Maintain a list of encoder example targets.
+    list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model
+                set_maps simple_encoder scalable_encoder twopass_encoder)
+  endif()
+
+  if(ENABLE_TOOLS)
+    if(CONFIG_ENTROPY_STATS AND NOT BUILD_SHARED_LIBS)
+
+      # TODO(tomfinegan): Sort out why a simple link command with
+      # aom_entropy_optimizer.c won't work on macos, but dragging in all the
+      # helper machinery allows the link to succeed.
+      add_executable(aom_entropy_optimizer "${AOM_CONFIG_DIR}/usage_exit.c"
+                     "${AOM_ROOT}/tools/aom_entropy_optimizer.c"
+                     $<TARGET_OBJECTS:aom_common_app_util>
+                     $<TARGET_OBJECTS:aom_encoder_app_util>)
+
+      # Maintain a list of encoder tool targets.
+      list(APPEND AOM_ENCODER_TOOL_TARGETS aom_entropy_optimizer)
+    endif()
+  endif()
 
-    # Maintain a list of encoder tool targets.
-    set(AOM_ENCODER_TOOL_TARGETS
-        ${AOM_ENCODER_TOOL_TARGETS} aom_entropy_optimizer)
+  # Add encoder examples and tools to the targets list.
+  list(APPEND AOM_APP_TARGETS ${AOM_ENCODER_EXAMPLE_TARGETS}
+              ${AOM_ENCODER_TOOL_TARGETS})
+endif()
 
-      # Add encoder tools to app target list.
-    set(AOM_APP_TARGETS ${AOM_APP_TARGETS} ${AOM_ENCODER_TOOL_TARGETS})
-  endif ()
-endif ()
+if(ENABLE_EXAMPLES)
 
-if (ENABLE_EXAMPLES)
   # Maintain a separate variable listing only the examples to facilitate
   # installation of example programs into an examples sub directory of
   # $AOM_DIST_DIR/bin when building the dist target.
-  set(AOM_EXAMPLE_TARGETS
-      ${AOM_DECODER_EXAMPLE_TARGETS} ${AOM_ENCODER_EXAMPLE_TARGETS})
-endif ()
+  list(APPEND AOM_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS}
+              ${AOM_ENCODER_EXAMPLE_TARGETS})
+endif()
+
+if(ENABLE_TOOLS)
+  if(CONFIG_AV1_DECODER)
+    require_cxx_flag_nomsvc("-std=c++11" NO)
+    add_executable(dump_obu "${AOM_CONFIG_DIR}/usage_exit.cc"
+                   "${AOM_ROOT}/tools/dump_obu.cc"
+                   "${AOM_ROOT}/tools/obu_parser.cc"
+                   "${AOM_ROOT}/tools/obu_parser.h"
+                   $<TARGET_OBJECTS:aom_common_app_util>
+                   $<TARGET_OBJECTS:aom_decoder_app_util>)
 
-if (ENABLE_TOOLS)
-  # Maintain a separate variable listing only the examples to facilitate
-  # installation of example programs into an tools sub directory of
-  # $AOM_DIST_DIR/bin when building the dist target.
-  set(AOM_TOOL_TARGETS ${AOM_DECODER_TOOL_TARGETS} ${AOM_ENCODER_TOOL_TARGETS})
-endif ()
+    list(APPEND AOM_TOOL_TARGETS dump_obu)
+    list(APPEND AOM_APP_TARGETS dump_obu)
 
-if (ENABLE_EXAMPLES AND CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
-  add_executable(aom_cx_set_ref
-                 "${AOM_ROOT}/examples/aom_cx_set_ref.c"
+    if(NOT MSVC)
+      target_compile_options(dump_obu PUBLIC -std=c++11)
+    endif()
+
+    # Maintain a separate variable listing only the examples to facilitate
+    # installation of example programs into an tools sub directory of
+    # $AOM_DIST_DIR/bin when building the dist target.
+    list(APPEND AOM_TOOL_TARGETS ${AOM_DECODER_TOOL_TARGETS}
+                ${AOM_ENCODER_TOOL_TARGETS})
+  endif()
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+  add_executable(aom_cx_set_ref "${AOM_ROOT}/examples/aom_cx_set_ref.c"
                  $<TARGET_OBJECTS:aom_common_app_util>
                  $<TARGET_OBJECTS:aom_encoder_app_util>)
-  set(AOM_EXAMPLE_TARGETS ${AOM_EXAMPLE_TARGETS} aom_cx_set_ref)
-  set(AOM_APP_TARGETS ${AOM_APP_TARGETS} aom_cx_set_ref)
-endif ()
+  list(APPEND AOM_EXAMPLE_TARGETS aom_cx_set_ref)
+  list(APPEND AOM_APP_TARGETS aom_cx_set_ref)
+endif()
 
-foreach (aom_app ${AOM_APP_TARGETS})
+if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER)
+  add_executable(lightfield_encoder "${AOM_ROOT}/examples/lightfield_encoder.c"
+                 $<TARGET_OBJECTS:aom_common_app_util>
+                 $<TARGET_OBJECTS:aom_encoder_app_util>)
+  list(APPEND AOM_EXAMPLE_TARGETS lightfield_encoder)
+  list(APPEND AOM_APP_TARGETS lightfield_encoder)
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER)
+  add_executable(lightfield_tile_list_decoder
+                 "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c"
+                 $<TARGET_OBJECTS:aom_common_app_util>
+                 $<TARGET_OBJECTS:aom_decoder_app_util>)
+  list(APPEND AOM_EXAMPLE_TARGETS lightfield_tile_list_decoder)
+  list(APPEND AOM_APP_TARGETS lightfield_tile_list_decoder)
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER)
+  add_executable(lightfield_decoder "${AOM_ROOT}/examples/lightfield_decoder.c"
+                 $<TARGET_OBJECTS:aom_common_app_util>
+                 $<TARGET_OBJECTS:aom_decoder_app_util>)
+  list(APPEND AOM_EXAMPLE_TARGETS lightfield_decoder)
+  list(APPEND AOM_APP_TARGETS lightfield_decoder)
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER AND CONFIG_AV1_DECODER)
+  add_executable(lightfield_bitstream_parsing
+                 "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c"
+                 $<TARGET_OBJECTS:aom_common_app_util>
+                 $<TARGET_OBJECTS:aom_encoder_app_util>
+                 $<TARGET_OBJECTS:aom_decoder_app_util>)
+  list(APPEND AOM_EXAMPLE_TARGETS lightfield_bitstream_parsing)
+  list(APPEND AOM_APP_TARGETS lightfield_bitstream_parsing)
+endif()
+
+foreach(aom_app ${AOM_APP_TARGETS})
   target_link_libraries(${aom_app} ${AOM_LIB_LINK_TYPE} aom)
-endforeach ()
+endforeach()
 
-if (CONFIG_UNIT_TESTS OR ENABLE_EXAMPLES OR ENABLE_TOOLS)
-  if (CONFIG_LIBYUV)
+if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS)
+  if(CONFIG_LIBYUV)
     add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES})
-    if (NOT MSVC)
+    if(NOT MSVC)
       target_compile_options(yuv PRIVATE -Wno-unused-parameter)
-    endif ()
+    endif()
     include_directories("${AOM_ROOT}/third_party/libyuv/include")
 
     # Add to existing targets.
-    foreach (aom_app ${AOM_APP_TARGETS})
+    foreach(aom_app ${AOM_APP_TARGETS})
       target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:yuv>)
       set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX)
-    endforeach ()
-  endif ()
+    endforeach()
+  endif()
+
+  if(CONFIG_WEBM_IO)
+    require_cxx_flag_nomsvc("-std=c++11" NO)
 
-  if (CONFIG_WEBM_IO)
     add_library(webm OBJECT ${AOM_LIBWEBM_SOURCES})
     include_directories("${AOM_ROOT}/third_party/libwebm")
     target_compile_definitions(webm PRIVATE __STDC_CONSTANT_MACROS)
     target_compile_definitions(webm PRIVATE __STDC_LIMIT_MACROS)
 
-    if (NOT MSVC)
+    if(NOT MSVC)
       target_compile_options(webm PRIVATE -Wno-shadow)
-    endif ()
+      target_compile_options(webm PUBLIC -std=c++11)
+    endif()
 
     # Add to existing targets.
-    if (CONFIG_AV1_DECODER)
+    if(CONFIG_AV1_DECODER)
       target_sources(aom_decoder_app_util PRIVATE ${AOM_WEBM_DECODER_SOURCES})
-    endif ()
+    endif()
 
-    if (CONFIG_AV1_ENCODER)
+    if(CONFIG_AV1_ENCODER)
       target_sources(aom_encoder_app_util PRIVATE ${AOM_WEBM_ENCODER_SOURCES})
-    endif ()
+    endif()
 
-    foreach (aom_app ${AOM_APP_TARGETS})
+    foreach(aom_app ${AOM_APP_TARGETS})
       target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:webm>)
       set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX)
-     endforeach ()
-  endif ()
-endif ()
+    endforeach()
+  endif()
+endif()
+
+if(ENABLE_TESTS)
 
-if (CONFIG_UNIT_TESTS)
   # Create test_libaom target and the targets it depends on.
   setup_aom_test_targets()
-endif ()
+endif()
 
-if (HAVE_PTHREAD_H AND CONFIG_MULTITHREAD)
+if(HAVE_PTHREAD_H AND CONFIG_MULTITHREAD)
   find_package(Threads)
-  foreach (app_target ${AOM_APP_TARGETS})
+  foreach(app_target ${AOM_APP_TARGETS})
     target_link_libraries(${app_target} ${AOM_LIB_LINK_TYPE} Threads::Threads)
-  endforeach ()
-endif ()
+  endforeach()
+endif()
+
+if(XCODE)
+
+  # TODO(tomfinegan): Make sure target has no C++ files before doing this as
+  # it's not necessary in that case.
+  if(CONFIG_LIBYUV OR CONFIG_WEBM_IO)
 
-if (XCODE)
-  if (CONFIG_LIBYUV OR CONFIG_WEBM_IO)
     # The Xcode generator does not obey LINKER_LANGUAGE. Because of the issue
     # what looks like a C++ file needs to be in any target that Xcode will link
-    # when the target contains a C++ dependency.
-    # Without this Xcode will try to link with the C linker, which always ends
-    # badly when a dependency actually includes C++.
+    # when the target contains a C++ dependency. Without this Xcode will try to
+    # link with the C linker, which always ends badly when a dependency actually
+    # includes C++.
+
     # Note: LINKER_LANGUAGE is explicitly set to C++ for all targets touched
     # here, it really is the Xcode generator's fault, or just a deficiency in
     # Xcode itself.
-    foreach (aom_app ${AOM_APP_TARGETS})
+    foreach(aom_app ${AOM_APP_TARGETS})
       add_dummy_source_file_to_target("${aom_app}" "cc")
-    endforeach ()
-  endif ()
-endif ()
+    endforeach()
+  endif()
+endif()
 
-if (ENABLE_EXAMPLES AND "${CMAKE_GENERATOR}" MATCHES "Makefiles$")
-  # Users of the configure build expect the example targets to be built in the
-  # examples sub directory of the configured build directory after running make.
+if(ENABLE_EXAMPLES AND "${CMAKE_GENERATOR}" MATCHES "Makefiles$")
+
+  # For historical purposes place the example binaries in the example directory.
   file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/examples")
 
-  foreach (target ${AOM_EXAMPLE_TARGETS})
-    if (NOT "${target}" MATCHES "aomdec\|aomenc")
-      set_target_properties(${target} PROPERTIES
-                            RUNTIME_OUTPUT_DIRECTORY
-                            "${AOM_CONFIG_DIR}/examples")
-    endif ()
-  endforeach ()
+  foreach(target ${AOM_EXAMPLE_TARGETS})
+    if(NOT "${target}" MATCHES "aomdec\|aomenc")
+      set_target_properties(${target}
+                            PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+                                       "${AOM_CONFIG_DIR}/examples")
+    endif()
+  endforeach()
+
+  if(ENABLE_TOOLS AND AOM_TOOL_TARGETS)
 
-  if (ENABLE_TOOLS AND AOM_TOOL_TARGETS)
     # The same expectation is true for tool targets.
     file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/tools")
-    set_target_properties(${AOM_TOOL_TARGETS} PROPERTIES
-                          RUNTIME_OUTPUT_DIRECTORY "${AOM_CONFIG_DIR}/tools")
-  endif ()
-endif ()
+    set_target_properties(${AOM_TOOL_TARGETS}
+                          PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+                                     "${AOM_CONFIG_DIR}/tools")
+  endif()
+endif()
 
-if (BUILD_SHARED_LIBS)
+if(BUILD_SHARED_LIBS)
   include("${AOM_ROOT}/build/cmake/exports.cmake")
   setup_exports_target()
   set_target_properties(aom PROPERTIES SOVERSION 0)
-endif ()
+endif()
 
 # Handle user supplied compile and link flags last to ensure they're obeyed.
 set_user_flags()
 
 # Aomedia documentation rule.
-if (ENABLE_DOCS)
+if(ENABLE_DOCS)
   include(FindDoxygen)
-  if (DOXYGEN_FOUND)
+  if(DOXYGEN_FOUND)
     include("${AOM_ROOT}/docs.cmake")
     setup_documentation_targets()
-  else ()
+  else()
     message("--- Cannot find doxygen, ENABLE_DOCS turned off.")
     set(ENABLE_DOCS OFF)
-  endif ()
-endif ()
-
-# Aomedia install rule.
-set(AOM_INSTALL_INCS
-    "${AOM_ROOT}/aom/aom.h"
-    "${AOM_ROOT}/aom/aom_codec.h"
-    "${AOM_ROOT}/aom/aom_frame_buffer.h"
-    "${AOM_ROOT}/aom/aom_image.h"
-    "${AOM_ROOT}/aom/aom_integer.h"
-    "${AOM_ROOT}/aom/aom.h")
-
-if (CONFIG_AV1_DECODER)
-  if (ENABLE_EXAMPLES)
-    set(AOM_INSTALL_BINS ${AOM_INSTALL_BINS} aomdec)
-  endif ()
-
-  set(AOM_INSTALL_INCS
-      ${AOM_INSTALL_INCS}
-      "${AOM_ROOT}/aom/aom_decoder.h"
-      "${AOM_ROOT}/aom/aomdx.h")
-endif ()
-
-if (CONFIG_AV1_ENCODER)
-  if (ENABLE_EXAMPLES)
-    set(AOM_INSTALL_BINS ${AOM_INSTALL_BINS} aomenc)
-  endif ()
-
-  set(AOM_INSTALL_INCS
-      ${AOM_INSTALL_INCS}
-      "${AOM_ROOT}/aom/aomcx.h"
-      "${AOM_ROOT}/aom/aom_encoder.h")
-endif ()
-
-set(AOM_INSTALL_LIBS aom)
-
-install(FILES ${AOM_INSTALL_INCS}
-        DESTINATION "${CMAKE_INSTALL_PREFIX}/include/aom")
-install(FILES "${AOM_CONFIG_DIR}/aom.pc"
-        DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig")
-install(TARGETS ${AOM_INSTALL_LIBS} DESTINATION "${CMAKE_INSTALL_PREFIX}/lib")
-
-if (ENABLE_EXAMPLES)
-  install(TARGETS ${AOM_INSTALL_BINS} DESTINATION "${CMAKE_INSTALL_PREFIX}/bin")
-endif ()
+  endif()
+endif()
+
+if(NOT (MSVC OR XCODE))
+
+  # Aomedia install rule.
+  list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom.h"
+              "${AOM_ROOT}/aom/aom_codec.h"
+              "${AOM_ROOT}/aom/aom_frame_buffer.h"
+              "${AOM_ROOT}/aom/aom_image.h" "${AOM_ROOT}/aom/aom_integer.h"
+              "${AOM_ROOT}/aom/aom.h")
+
+  if(CONFIG_AV1_DECODER)
+    if(ENABLE_EXAMPLES)
+      list(APPEND AOM_INSTALL_BINS aomdec)
+    endif()
+
+    list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom_decoder.h"
+                "${AOM_ROOT}/aom/aomdx.h")
+  endif()
+
+  if(CONFIG_AV1_ENCODER)
+    if(ENABLE_EXAMPLES)
+      list(APPEND AOM_INSTALL_BINS aomenc)
+    endif()
+
+    list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aomcx.h"
+                "${AOM_ROOT}/aom/aom_encoder.h")
+  endif()
+
+  set(AOM_INSTALL_LIBS aom)
+
+  install(FILES ${AOM_INSTALL_INCS} DESTINATION "${INCLUDE_INSTALL_DIR}/aom")
+  install(FILES "${AOM_CONFIG_DIR}/aom.pc" DESTINATION
+                "${LIB_INSTALL_DIR}/pkgconfig")
+  install(TARGETS ${AOM_INSTALL_LIBS} DESTINATION "${LIB_INSTALL_DIR}")
+
+  if(ENABLE_EXAMPLES)
+    install(TARGETS ${AOM_INSTALL_BINS} DESTINATION
+                    "${CMAKE_INSTALL_PREFIX}/bin")
+  endif()
+endif()
 
 # Aomedia dist rule.
-if (CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
-  set(AOM_DIST_APPS ${AOM_DIST_APPS} $<TARGET_FILE:aomdec>)
-endif ()
-if (CONFIG_AV1_ENCODER AND ENABLE_EXAMPLES)
-  set(AOM_DIST_APPS ${AOM_DIST_APPS} $<TARGET_FILE:aomenc>)
-endif ()
-
-if (ENABLE_EXAMPLES)
-  foreach (example ${AOM_EXAMPLE_TARGETS})
+if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
+  list(APPEND AOM_DIST_APPS $<TARGET_FILE:aomdec>)
+endif()
+if(CONFIG_AV1_ENCODER AND ENABLE_EXAMPLES)
+  list(APPEND AOM_DIST_APPS $<TARGET_FILE:aomenc>)
+endif()
+
+if(ENABLE_EXAMPLES)
+  foreach(example ${AOM_EXAMPLE_TARGETS})
     list(APPEND AOM_DIST_EXAMPLES $<TARGET_FILE:${example}>)
-  endforeach ()
-endif ()
+  endforeach()
+endif()
 
-if (ENABLE_TOOLS)
-  foreach (tool ${AOM_TOOL_TARGETS})
+if(ENABLE_TOOLS)
+  foreach(tool ${AOM_TOOL_TARGETS})
     list(APPEND AOM_DIST_TOOLS $<TARGET_FILE:${tool}>)
-  endforeach ()
-endif ()
+  endforeach()
+endif()
 
-if (NOT AOM_DIST_DIR)
+if(NOT AOM_DIST_DIR)
   set(AOM_DIST_DIR "${AOM_CONFIG_DIR}/dist")
-endif ()
+endif()
 
 add_custom_target(dist
-                  COMMAND ${CMAKE_COMMAND}
-                  -DAOM_ROOT=${AOM_ROOT}
-                  -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
-                  -DAOM_DIST_DIR=${AOM_DIST_DIR}
-                  -DAOM_DIST_APPS="${AOM_DIST_APPS}"
-                  -DAOM_DIST_EXAMPLES="${AOM_DIST_EXAMPLES}"
-                  -DAOM_DIST_TOOLS="${AOM_DIST_TOOLS}"
-                  -DAOM_DIST_INCLUDES="${AOM_INSTALL_INCS}"
-                  -DAOM_DIST_LIBS=$<TARGET_FILE:aom>
-                  -DENABLE_DOCS=${ENABLE_DOCS}
-                  -P "${AOM_ROOT}/build/cmake/dist.cmake"
+                  COMMAND ${CMAKE_COMMAND} -DAOM_ROOT=${AOM_ROOT}
+                          -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+                          -DAOM_DIST_DIR=${AOM_DIST_DIR}
+                          -DAOM_DIST_APPS="${AOM_DIST_APPS}"
+                          -DAOM_DIST_EXAMPLES="${AOM_DIST_EXAMPLES}"
+                          -DAOM_DIST_TOOLS="${AOM_DIST_TOOLS}"
+                          -DAOM_DIST_INCLUDES="${AOM_INSTALL_INCS}"
+                          -DAOM_DIST_LIBS=$<TARGET_FILE:aom>
+                          -DENABLE_DOCS=${ENABLE_DOCS} -P
+                          "${AOM_ROOT}/build/cmake/dist.cmake"
                   DEPENDS ${AOM_INSTALL_BINS} ${AOM_INSTALL_LIBS}
-                  ${AOM_INSTALL_INCS} ${AOM_EXAMPLE_TARGETS}
-                  ${AOM_TOOL_TARGETS})
+                          ${AOM_INSTALL_INCS} ${AOM_EXAMPLE_TARGETS}
+                          ${AOM_TOOL_TARGETS})
 
-if (ENABLE_DOCS)
+if(ENABLE_DOCS)
   add_dependencies(dist docs)
-endif ()
+endif()
+
+# Collect all variables containing libaom source files.
+get_cmake_property(all_cmake_vars VARIABLES)
+foreach(var ${all_cmake_vars})
+  if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_" AND NOT "${var}" MATCHES
+     "_APP_\|DOXYGEN\|LIBWEBM\|LIBYUV\|_PKG_\|TEST")
+    list(APPEND aom_source_vars ${var})
+  endif()
+endforeach()
+
+# Libaom_srcs.txt generation.
+set(libaom_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_srcs.txt")
+file(WRITE "${libaom_srcs_txt_file}" "# This file is generated. DO NOT EDIT.\n")
+
+# Static source file list first.
+foreach(aom_source_var ${aom_source_vars})
+  foreach(file ${${aom_source_var}})
+    if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}")
+      string(REPLACE "${AOM_ROOT}/" "" file "${file}")
+      file(APPEND "${libaom_srcs_txt_file}" "${file}\n")
+    endif()
+  endforeach()
+endforeach()
+
+file(APPEND
+       "${libaom_srcs_txt_file}"
+       "# Files below this line are generated by the libaom build system.\n")
+foreach(aom_source_var ${aom_source_vars})
+  foreach(file ${${aom_source_var}})
+    if("${file}" MATCHES "${AOM_CONFIG_DIR}")
+      string(REPLACE "${AOM_CONFIG_DIR}/" "" file "${file}")
+      file(APPEND "${libaom_srcs_txt_file}" "${file}\n")
+    endif()
+  endforeach()
+endforeach()
+
+# Libaom_srcs.gni generation.
+set(libaom_srcs_gni_file "${AOM_CONFIG_DIR}/libaom_srcs.gni")
+file(WRITE "${libaom_srcs_gni_file}" "# This file is generated. DO NOT EDIT.\n")
+
+foreach(aom_source_var ${aom_source_vars})
+  if("${${aom_source_var}}" MATCHES "${AOM_ROOT}")
+    string(TOLOWER ${aom_source_var} aom_source_var_lowercase)
+    file(APPEND "${libaom_srcs_gni_file}" "\n${aom_source_var_lowercase} = [\n")
+  endif()
+
+  foreach(file ${${aom_source_var}})
+    if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}")
+      string(REPLACE "${AOM_ROOT}" "//third_party/libaom/source/libaom" file
+                     "${file}")
+      file(APPEND "${libaom_srcs_gni_file}" "  \"${file}\",\n")
+    endif()
+  endforeach()
+
+  if("${${aom_source_var}}" MATCHES "${AOM_ROOT}")
+    file(APPEND "${libaom_srcs_gni_file}" "]\n")
+  endif()
+endforeach()
+
+file(APPEND
+       "${libaom_srcs_gni_file}"
+       "\n# Files below this line are generated by the libaom build system.\n")
+
+foreach(aom_source_var ${aom_source_vars})
+  if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}")
+    string(TOLOWER ${aom_source_var} aom_source_var_lowercase)
+    file(APPEND "${libaom_srcs_gni_file}"
+                "\n${aom_source_var_lowercase}_gen = [\n")
+  endif()
+  foreach(file ${${aom_source_var}})
+    if(NOT "${file}" MATCHES "${AOM_ROOT}")
+      string(REPLACE "${AOM_CONFIG_DIR}" "//third_party/libaom/source/libaom"
+                     file "${file}")
+      file(APPEND "${libaom_srcs_gni_file}" "  \"${file}\",\n")
+    endif()
+  endforeach()
+
+  if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}")
+    file(APPEND "${libaom_srcs_gni_file}" "]\n")
+  endif()
+endforeach()
diff --git a/third_party/aom/README b/third_party/aom/README
deleted file mode 100644
index 983a71343..000000000
--- a/third_party/aom/README
+++ /dev/null
@@ -1,172 +0,0 @@
-README - 9 March 2017
-
-***************************
-DEPRECATED -- SEE README.md
-***************************
-
-Welcome to the AV1 Codec SDK!
-
-COMPILING THE APPLICATIONS/LIBRARIES:
-  The build system used is similar to autotools. Building generally consists of
-  "configuring" with your desired build options, then using GNU make to build
-  the application.
-
-  1. Prerequisites
-
-    * All x86 targets require the Yasm[1] assembler be installed.
-    * All Windows builds require that Cygwin[2] be installed.
-    * Building the documentation requires Doxygen[3]. If you do not
-      have this package, the install-docs option will be disabled.
-    * Downloading the data for the unit tests requires curl[4] and sha1sum.
-      sha1sum is provided via the GNU coreutils, installed by default on
-      many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
-      available, a compatible version of sha1sum can be built from
-      source[5]. These requirements are optional if not running the unit
-      tests.
-
-    [1]: http://www.tortall.net/projects/yasm
-    [2]: http://www.cygwin.com
-    [3]: http://www.doxygen.org
-    [4]: http://curl.haxx.se
-    [5]: http://www.microbrew.org/tools/md5sha1sum/
-
-  2. Out-of-tree builds
-  Out of tree builds are a supported method of building the application. For
-  an out of tree build, the source tree is kept separate from the object
-  files produced during compilation. For instance:
-
-    $ mkdir build
-    $ cd build
-    $ ../libaom/configure <options>
-    $ make
-
-  3. Configuration options
-  The 'configure' script supports a number of options. The --help option can be
-  used to get a list of supported options:
-    $ ../libaom/configure --help
-
-  4. Cross development
-  For cross development, the most notable option is the --target option. The
-  most up-to-date list of supported targets can be found at the bottom of the
-  --help output of the configure script. As of this writing, the list of
-  available targets is:
-
-    arm64-darwin-gcc
-    armv7-android-gcc
-    armv7-darwin-gcc
-    armv7-linux-rvct
-    armv7-linux-gcc
-    armv7-none-rvct
-    armv7-win32-vs12
-    armv7-win32-vs14
-    armv7-win32-vs15
-    armv7s-darwin-gcc
-    mips32-linux-gcc
-    mips64-linux-gcc
-    sparc-solaris-gcc
-    x86-android-gcc
-    x86-darwin8-gcc
-    x86-darwin8-icc
-    x86-darwin9-gcc
-    x86-darwin9-icc
-    x86-darwin10-gcc
-    x86-darwin11-gcc
-    x86-darwin12-gcc
-    x86-darwin13-gcc
-    x86-darwin14-gcc
-    x86-darwin15-gcc
-    x86-darwin16-gcc
-    x86-iphonesimulator-gcc
-    x86-linux-gcc
-    x86-linux-icc
-    x86-os2-gcc
-    x86-solaris-gcc
-    x86-win32-gcc
-    x86-win32-vs12
-    x86-win32-vs14
-    x86-win32-vs15
-    x86_64-android-gcc
-    x86_64-darwin9-gcc
-    x86_64-darwin10-gcc
-    x86_64-darwin11-gcc
-    x86_64-darwin12-gcc
-    x86_64-darwin13-gcc
-    x86_64-darwin14-gcc
-    x86_64-darwin15-gcc
-    x86_64-darwin16-gcc
-    x86_64-iphonesimulator-gcc
-    x86_64-linux-gcc
-    x86_64-linux-icc
-    x86_64-solaris-gcc
-    x86_64-win64-gcc
-    x86_64-win64-vs12
-    x86_64-win64-vs14
-    x86_64-win64-vs15
-    generic-gnu
-
-  The generic-gnu target, in conjunction with the CROSS environment variable,
-  can be used to cross compile architectures that aren't explicitly listed, if
-  the toolchain is a cross GNU (gcc/binutils) toolchain. Other POSIX toolchains
-  will likely work as well. For instance, to build using the mipsel-linux-uclibc
-  toolchain, the following command could be used (note, POSIX SH syntax, adapt
-  to your shell as necessary):
-
-    $ CROSS=mipsel-linux-uclibc- ../libaom/configure
-
-  In addition, the executables to be invoked can be overridden by specifying the
-  environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be
-  passed to these executables with CFLAGS, LDFLAGS, and ASFLAGS.
-
-  5. Configuration errors
-  If the configuration step fails, the first step is to look in the error log.
-  This defaults to config.log. This should give a good indication of what went
-  wrong. If not, contact us for support.
-
-AV1 TEST VECTORS:
-  The test vectors can be downloaded and verified using the build system after
-  running configure. To specify an alternate directory the
-  LIBAOM_TEST_DATA_PATH environment variable can be used.
-
-  $ ./configure --enable-unit-tests
-  $ LIBAOM_TEST_DATA_PATH=../-test-data make testdata
-
-UNIT TESTS:
-  The unit tests (consisting mainly of the test_libaom binary) can be run using
-  make. This will download the test data if necessary.
-
-  $ ../libaom/configure --enable-unit-tests
-  $ make test
-
-  Test may be run in parallel using make -j which supports up to 10 shards by
-  default.
-  $ make -j10 test
-
-  If you have additional cores you can scale the tests to match:
-  $ shards=$(nproc); \
-    make -j$shards test \
-    NUM_SHARDS=$shards SHARDS="$(seq -s' ' 0 $(( shards - 1 )))" \
-    && echo "success"
-
-  The GTEST_FILTER environment variable (equivalent to --gtest_filter) can be
-  used to control which tests are run while sharding:
-  $ GTEST_FILTER='SSE2*' make -j10 test
-
-CODE STYLE:
-  The coding style used by this project is enforced with clang-format using the
-  configuration contained in the .clang-format file in the root of the
-  repository.
-
-  Before pushing changes for review you can format your code with:
-  # Apply clang-format to modified .c, .h and .cc files
-  $ clang-format -i --style=file \
-    $(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc')
-
-  Check the .clang-format file for the version used to generate it if there is
-  any difference between your local formatting and the review system.
-
-  See also: http://clang.llvm.org/docs/ClangFormat.html
-
-SUPPORT
-  This library is an open source project supported by its community. Please
-  please email webm-discuss@webmproject.org for help.
-
diff --git a/third_party/aom/README.md b/third_party/aom/README.md
index acedb105c..f5446f9d9 100644
--- a/third_party/aom/README.md
+++ b/third_party/aom/README.md
@@ -1,5 +1,44 @@
 # AV1 Codec Library
 
+## Contents
+1. [Building the lib and applications](#building-the-library-and-applications)
+    - [Prerequisites](#prerequisites)
+    - [Get the code](#get-the-code)
+    - [Basics](#basic-build)
+    - [Configuration options](#configuration-options)
+    - [Dylib builds](#dylib-builds)
+    - [Debugging](#debugging)
+    - [Cross compiling](#cross-compiling)
+    - [Sanitizer support](#sanitizers)
+    - [MSVC builds](#microsoft-visual-studio-builds)
+    - [Xcode builds](#xcode-builds)
+    - [Emscripten builds](#emscripten-builds)
+    - [Extra Build Flags](#extra-build-flags)
+2. [Testing the library](#testing-the-av1-codec)
+    - [Basics](#testing-basics)
+        - [Unit tests](#1_unit-tests)
+        - [Example tests](#2_example-tests)
+        - [Encoder tests](#3_encoder-tests)
+    - [IDE hosted tests](#ide-hosted-tests)
+    - [Downloading test data](#downloading-the-test-data)
+    - [Adding a new test data file](#adding-a-new-test-data-file)
+    - [Additional test data](#additional-test-data)
+    - [Sharded testing](#sharded-testing)
+        - [Running tests directly](#1_running-test_libaom-directly)
+        - [Running tests via CMake](#2_running-the-tests-via-the-cmake-build)
+3. [Coding style](#coding-style)
+4. [Submitting patches](#submitting-patches)
+    - [Login cookie](#login-cookie)
+    - [Contributor agreement](#contributor-agreement)
+    - [Testing your code](#testing-your-code)
+    - [Commit message hook](#commit-message-hook)
+    - [Upload your change](#upload-your-change)
+    - [Incorporating Reviewer Comments](#incorporating-reviewer-comments)
+    - [Submitting your change](#submitting-your-change)
+    - [Viewing change status](#viewing-the-status-of-uploaded-changes)
+5. [Support](#support)
+6. [Bug reports](#bug-reports)
+
 ## Building the library and applications
 
 ### Prerequisites
@@ -14,6 +53,17 @@
  7. Emscripten builds require the portable
    [EMSDK](https://kripken.github.io/emscripten-site/index.html).
 
+### Get the code
+
+The AV1 library source code is stored in the Alliance for Open Media Git
+repository:
+
+~~~
+    $ git clone https://aomedia.googlesource.com/aom
+    # By default, the above command stores the source in the aom directory:
+    $ cd aom
+~~~
+
 ### Basic build
 
 CMake replaces the configure step typical of many projects. Running CMake will
@@ -21,8 +71,10 @@ produce configuration and build files for the currently selected CMake
 generator. For most systems the default generator is Unix Makefiles. The basic
 form of a makefile build is the following:
 
+~~~
     $ cmake path/to/aom
     $ make
+~~~
 
 The above will generate a makefile build that produces the AV1 library and
 applications for the current host system after the make step completes
@@ -39,10 +91,10 @@ varieties:
  2. AV1 codec configuration options. These have the form `CONFIG_FEATURE`.
 
 Both types of options are set at the time CMake is run. The following example
-enables ccache and disables high bit depth:
+enables ccache and disables the AV1 encoder:
 
 ~~~
-    $ cmake path/to/aom -DENABLE_CCACHE=1 -DCONFIG_HIGHBITDEPTH=0
+    $ cmake path/to/aom -DENABLE_CCACHE=1 -DCONFIG_AV1_ENCODER=0
     $ make
 ~~~
 
@@ -102,8 +154,10 @@ The toolchain files available at the time of this writing are:
 
  - arm64-ios.cmake
  - arm64-linux-gcc.cmake
+ - arm64-mingw-gcc.cmake
  - armv7-ios.cmake
  - armv7-linux-gcc.cmake
+ - armv7-mingw-gcc.cmake
  - armv7s-ios.cmake
  - mips32-linux-gcc.cmake
  - mips64-linux-gcc.cmake
@@ -194,11 +248,11 @@ appropriately using the emsdk\_env script.
         -DENABLE_CCACHE=1 \
         -DAOM_TARGET_CPU=generic \
         -DENABLE_DOCS=0 \
+        -DENABLE_TESTS=0 \
         -DCONFIG_ACCOUNTING=1 \
         -DCONFIG_INSPECTION=1 \
         -DCONFIG_MULTITHREAD=0 \
         -DCONFIG_RUNTIME_CPU_DETECT=0 \
-        -DCONFIG_UNIT_TESTS=0 \
         -DCONFIG_WEBM_IO=0 \
         -DCMAKE_TOOLCHAIN_FILE=path/to/emsdk-portable/.../Emscripten.cmake
 ~~~
@@ -217,12 +271,32 @@ appropriately using the emsdk\_env script.
     $ path/to/AOMAnalyzer path/to/examples/inspect.js path/to/av1/input/file
 ~~~
 
+### Extra build flags
+
+Three variables allow for passing of additional flags to the build system.
+
+- AOM\_EXTRA\_C\_FLAGS
+- AOM\_EXTRA\_CXX\_FLAGS
+- AOM\_EXTRA\_EXE\_LINKER\_FLAGS
+
+The build system attempts to ensure the flags passed through the above variables
+are passed to tools last in order to allow for override of default behavior.
+These flags can be used, for example, to enable asserts in a release build:
+
+~~~
+    $ cmake path/to/aom \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DAOM_EXTRA_C_FLAGS=-UNDEBUG \
+        -DAOM_EXTRA_CXX_FLAGS=-UNDEBUG
+~~~
 
 ## Testing the AV1 codec
 
 ### Testing basics
 
-Currently there are two types of tests in the AV1 codec repository.
+There are several methods of testing the AV1 codec. All of these methods require
+the presence of the AV1 source code and a working build of the AV1 library and
+applications.
 
 #### 1. Unit tests:
 
@@ -253,6 +327,57 @@ The example tests require a bash shell and can be run in the following manner:
     $ path/to/aom/test/examples.sh --bin-path examples
 ~~~
 
+#### 3. Encoder tests:
+
+When making a change to the encoder run encoder tests to confirm that your
+change has a positive or negligible impact on encode quality. When running these
+tests the build configuration should be changed to enable internal encoder
+statistics:
+
+~~~
+    $ cmake path/to/aom -DCONFIG_INTERNAL_STATS=1
+    $ make
+~~~
+
+The repository contains scripts intended to make running these tests as simple
+as possible. The following example demonstrates creating a set of baseline clips
+for comparison to results produced after making your change to libaom:
+
+~~~
+    # This will encode all Y4M files in the current directory using the
+    # settings specified to create the encoder baseline statistical data:
+    $ cd path/to/test/inputs
+    # This command line assumes that run_encodes.sh, its helper script
+    # best_encode.sh, and the aomenc you intend to test are all within a
+    # directory in your PATH.
+    $ run_encodes.sh 200 500 50 baseline
+~~~
+
+After making your change and creating the baseline clips, you'll need to run
+encodes that include your change(s) to confirm that things are working as
+intended:
+
+~~~
+    # This will encode all Y4M files in the current directory using the
+    # settings specified to create the statistical data for your change:
+    $ cd path/to/test/inputs
+    # This command line assumes that run_encodes.sh, its helper script
+    # best_encode.sh, and the aomenc you intend to test are all within a
+    # directory in your PATH.
+    $ run_encodes.sh 200 500 50 mytweak
+~~~
+
+After creating both data sets you can use `test/visual_metrics.py` to generate a
+report that can be viewed in a web browser:
+
+~~~
+    $ visual_metrics.py metrics_template.html "*stt" baseline mytweak \
+      > mytweak.html
+~~~
+
+You can view the report by opening mytweak.html in a web browser.
+
+
 ### IDE hosted tests
 
 By default the generated projects files created by CMake will not include the
@@ -283,6 +408,25 @@ rule:
 
 The above make command will only download and verify the test data.
 
+### Adding a new test data file
+
+First, add the new test data file to the `aom-test-data` bucket of the
+`aomedia-testing` project on Google Cloud Platform. You may need to ask someone
+with the necessary access permissions to do this for you.
+
+Once the new test data file has been added to `aom-test-data`, create a CL to
+add the name of the new test data file to `test/test_data_util.cmake` and add
+the SHA1 checksum of the new test data file to `test/test-data.sha1`. (The SHA1
+checksum of a file can be calculated by running the `sha1sum` command on the
+file.)
+
+### Additional test data
+
+The test data mentioned above is strictly intended for unit testing.
+
+Additional input data for testing the encoder can be obtained from:
+https://media.xiph.org/video/derf/
+
 ### Sharded testing
 
 The AV1 codec library unit tests are built upon gtest which supports sharding of
@@ -291,9 +435,10 @@ test jobs. Sharded test runs can be achieved in a couple of ways.
 #### 1. Running test\_libaom directly:
 
 ~~~
-   # Set the environment variable GTEST_TOTAL_SHARDS to 9 to run 10 test shards
+   # Set the environment variable GTEST_TOTAL_SHARDS to control the number of
+   # shards.
+   $ export GTEST_TOTAL_SHARDS=10
    # (GTEST shard indexing is 0 based).
-   $ export GTEST_TOTAL_SHARDS=9
    $ seq 0 $(( $GTEST_TOTAL_SHARDS - 1 )) \
        | xargs -n 1 -P 0 -I{} env GTEST_SHARD_INDEX={} ./test_libaom
 ~~~
@@ -322,8 +467,20 @@ is the default maximum value.
 
 ## Coding style
 
+We are using the Google C Coding Style defined by the
+[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
+
 The coding style used by this project is enforced with clang-format using the
-configuration contained in the .clang-format file in the root of the repository.
+configuration contained in the
+[.clang-format](https://chromium.googlesource.com/webm/aom/+/master/.clang-format)
+file in the root of the repository.
+
+You can download clang-format using your system's package manager, or directly
+from [llvm.org](http://llvm.org/releases/download.html). You can also view the
+[documentation](https://clang.llvm.org/docs/ClangFormat.html) on llvm.org.
+Output from clang-format varies by clang-format version, for best results your
+version should match the one used on Jenkins. You can find the clang-format
+version by reading the comment in the `.clang-format` file linked above.
 
 Before pushing changes for review you can format your code with:
 
@@ -336,7 +493,116 @@ Before pushing changes for review you can format your code with:
 Check the .clang-format file for the version used to generate it if there is any
 difference between your local formatting and the review system.
 
-See also: http://clang.llvm.org/docs/ClangFormat.html
+Some Git installations have clang-format integration. Here are some examples:
+
+~~~
+    # Apply clang-format to all staged changes:
+    $ git clang-format
+
+    # Clang format all staged and unstaged changes:
+    $ git clang-format -f
+
+    # Clang format all staged and unstaged changes interactively:
+    $ git clang-format -f -p
+~~~
+
+## Submitting patches
+
+We manage the submission of patches using the
+[Gerrit](https://www.gerritcodereview.com/) code review tool. This tool
+implements a workflow on top of the Git version control system to ensure that
+all changes get peer reviewed and tested prior to their distribution.
+
+### Login cookie
+
+Browse to [AOMedia Git index](https://aomedia.googlesource.com/) and login with
+your account (Gmail credentials, for example). Next, follow the
+`Generate Password` Password link at the top of the page. You’ll be given
+instructions for creating a cookie to use with our Git repos.
+
+### Contributor agreement
+
+You will be required to execute a
+[contributor agreement](http://aomedia.org/license) to ensure that the AOMedia
+Project has the right to distribute your changes.
+
+### Testing your code
+
+The testing basics are covered in the [testing section](#testing-the-av1-codec)
+above.
+
+In addition to the local tests, many more (e.g. asan, tsan, valgrind) will run
+through Jenkins instances upon upload to gerrit.
+
+### Commit message hook
+
+Gerrit requires that each submission include a unique Change-Id. You can assign
+one manually using git commit --amend, but it’s easier to automate it with the
+commit-msg hook provided by Gerrit.
+
+Copy commit-msg to the `.git/hooks` directory of your local repo. Here's an
+example:
+
+~~~
+    $ curl -Lo aom/.git/hooks/commit-msg https://chromium-review.googlesource.com/tools/hooks/commit-msg
+
+    # Next, ensure that the downloaded commit-msg script is executable:
+    $ chmod u+x aom/.git/hooks/commit-msg
+~~~
+
+See the Gerrit
+[documentation](https://gerrit-review.googlesource.com/Documentation/user-changeid.html)
+for more information.
+
+### Upload your change
+
+The command line to upload your patch looks like this:
+
+~~~
+    $ git push https://aomedia-review.googlesource.com/aom HEAD:refs/for/master
+~~~
+
+### Incorporating reviewer comments
+
+If you previously uploaded a change to Gerrit and the Approver has asked for
+changes, follow these steps:
+
+1. Edit the files to make the changes the reviewer has requested.
+2. Recommit your edits using the --amend flag, for example:
+
+~~~
+   $ git commit -a --amend
+~~~
+
+3. Use the same git push command as above to upload to Gerrit again for another
+   review cycle.
+
+In general, you should not rebase your changes when doing updates in response to
+review. Doing so can make it harder to follow the evolution of your change in
+the diff view.
+
+### Submitting your change
+
+Once your change has been Approved and Verified, you can “submit” it through the
+Gerrit UI. This will usually automatically rebase your change onto the branch
+specified.
+
+Sometimes this can’t be done automatically. If you run into this problem, you
+must rebase your changes manually:
+
+~~~
+    $ git fetch
+    $ git rebase origin/branchname
+~~~
+
+If there are any conflicts, resolve them as you normally would with Git. When
+you’re done, reupload your change.
+
+### Viewing the status of uploaded changes
+
+To check the status of a change that you uploaded, open
+[Gerrit](https://aomedia-review.googlesource.com/), sign in, and click My >
+Changes.
 
 ## Support
 
diff --git a/third_party/aom/aom/aom.h b/third_party/aom/aom/aom.h
index fecbeaf56..c5ef2517d 100644
--- a/third_party/aom/aom/aom.h
+++ b/third_party/aom/aom/aom.h
@@ -31,8 +31,8 @@
 #ifndef AOM_AOM_H_
 #define AOM_AOM_H_
 
-#include "./aom_codec.h"
-#include "./aom_image.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_image.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -63,6 +63,8 @@ enum aom_com_control_id {
   AOM_COMMON_CTRL_ID_MAX,
 
   AV1_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */
+  AV1_COPY_NEW_FRAME_IMAGE =
+      193, /**< copy the new frame to an external buffer */
 
   AOM_DECODER_CTRL_ID_START = 256
 };
@@ -104,8 +106,9 @@ typedef struct aom_postproc_cfg {
  * Define the data struct to access av1 reference frames.
  */
 typedef struct av1_ref_frame {
-  int idx;         /**< frame index to get (input) */
-  aom_image_t img; /**< img structure to populate (output) */
+  int idx;              /**< frame index to get (input) */
+  int use_external_ref; /**< Directly use external ref buffer(decoder only) */
+  aom_image_t img;      /**< img structure to populate (output) */
 } av1_ref_frame_t;
 
 /*!\cond */
@@ -131,6 +134,8 @@ AOM_CTRL_USE_TYPE(AV1_COPY_REFERENCE, av1_ref_frame_t *)
 #define AOM_CTRL_AV1_COPY_REFERENCE
 AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *)
 #define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE
+AOM_CTRL_USE_TYPE(AV1_COPY_NEW_FRAME_IMAGE, aom_image_t *)
+#define AOM_CTRL_AV1_COPY_NEW_FRAME_IMAGE
 
 /*!\endcond */
 /*! @} - end defgroup aom */
diff --git a/third_party/aom/aom/aom_codec.h b/third_party/aom/aom/aom_codec.h
index 1d301d16b..63e358624 100644
--- a/third_party/aom/aom/aom_codec.h
+++ b/third_party/aom/aom/aom_codec.h
@@ -43,37 +43,37 @@
 extern "C" {
 #endif
 
-#include "./aom_integer.h"
-#include "./aom_image.h"
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
 
 /*!\brief Decorator indicating a function is deprecated */
-#ifndef DEPRECATED
+#ifndef AOM_DEPRECATED
 #if defined(__GNUC__) && __GNUC__
-#define DEPRECATED __attribute__((deprecated))
+#define AOM_DEPRECATED __attribute__((deprecated))
 #elif defined(_MSC_VER)
-#define DEPRECATED
+#define AOM_DEPRECATED
 #else
-#define DEPRECATED
+#define AOM_DEPRECATED
 #endif
-#endif /* DEPRECATED */
+#endif /* AOM_DEPRECATED */
 
-#ifndef DECLSPEC_DEPRECATED
+#ifndef AOM_DECLSPEC_DEPRECATED
 #if defined(__GNUC__) && __GNUC__
-#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
+#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */
 #elif defined(_MSC_VER)
-/*!\brief \copydoc #DEPRECATED */
-#define DECLSPEC_DEPRECATED __declspec(deprecated)
+/*!\brief \copydoc #AOM_DEPRECATED */
+#define AOM_DECLSPEC_DEPRECATED __declspec(deprecated)
 #else
-#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
+#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */
 #endif
-#endif /* DECLSPEC_DEPRECATED */
+#endif /* AOM_DECLSPEC_DEPRECATED */
 
 /*!\brief Decorator indicating a function is potentially unused */
-#ifdef UNUSED
+#ifdef AOM_UNUSED
 #elif defined(__GNUC__) || defined(__clang__)
-#define UNUSED __attribute__((unused))
+#define AOM_UNUSED __attribute__((unused))
 #else
-#define UNUSED
+#define AOM_UNUSED
 #endif
 
 /*!\brief Decorator indicating that given struct/union/enum is packed */
@@ -433,7 +433,7 @@ aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...);
  */
 #define AOM_CTRL_USE_TYPE(id, typ)                                           \
   static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int, typ) \
-      UNUSED;                                                                \
+      AOM_UNUSED;                                                            \
                                                                              \
   static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx,        \
                                                 int ctrl_id, typ data) {     \
@@ -450,13 +450,13 @@ aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...);
  * It defines a static function with the correctly typed arguments as a
  * wrapper to the type-unsafe internal function.
  */
-#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ)                        \
-  DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
-      aom_codec_ctx_t *, int, typ) DEPRECATED UNUSED;                \
-                                                                     \
-  DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
-      aom_codec_ctx_t *ctx, int ctrl_id, typ data) {                 \
-    return aom_codec_control_(ctx, ctrl_id, data);                   \
+#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ)                            \
+  AOM_DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
+      aom_codec_ctx_t *, int, typ) AOM_DEPRECATED AOM_UNUSED;            \
+                                                                         \
+  AOM_DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
+      aom_codec_ctx_t *ctx, int ctrl_id, typ data) {                     \
+    return aom_codec_control_(ctx, ctrl_id, data);                       \
   } /**<\hideinitializer*/
 
 /*!\brief aom_codec_control void type definition macro
@@ -471,7 +471,7 @@ aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...);
  */
 #define AOM_CTRL_VOID(id)                                               \
   static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int) \
-      UNUSED;                                                           \
+      AOM_UNUSED;                                                       \
                                                                         \
   static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx,   \
                                                 int ctrl_id) {          \
@@ -480,6 +480,48 @@ aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...);
 
 #endif
 
+/*!\brief OBU types. */
+typedef enum ATTRIBUTE_PACKED {
+  OBU_SEQUENCE_HEADER = 1,
+  OBU_TEMPORAL_DELIMITER = 2,
+  OBU_FRAME_HEADER = 3,
+  OBU_TILE_GROUP = 4,
+  OBU_METADATA = 5,
+  OBU_FRAME = 6,
+  OBU_REDUNDANT_FRAME_HEADER = 7,
+  OBU_TILE_LIST = 8,
+  OBU_PADDING = 15,
+} OBU_TYPE;
+
+/*!\brief OBU metadata types. */
+typedef enum {
+  OBU_METADATA_TYPE_AOM_RESERVED_0 = 0,
+  OBU_METADATA_TYPE_HDR_CLL = 1,
+  OBU_METADATA_TYPE_HDR_MDCV = 2,
+  OBU_METADATA_TYPE_SCALABILITY = 3,
+  OBU_METADATA_TYPE_ITUT_T35 = 4,
+  OBU_METADATA_TYPE_TIMECODE = 5,
+} OBU_METADATA_TYPE;
+
+/*!\brief Returns string representation of OBU_TYPE.
+ *
+ * \param[in]     type            The OBU_TYPE to convert to string.
+ */
+const char *aom_obu_type_to_string(OBU_TYPE type);
+
+/*!\brief Config Options
+ *
+ * This type allows to enumerate and control options defined for control
+ * via config file at runtime.
+ */
+typedef struct cfg_options {
+  /*!\brief Reflects if ext_partition should be enabled
+   *
+   * If this value is non-zero it enabled the feature
+   */
+  unsigned int ext_partition;
+} cfg_options_t;
+
 /*!@} - end defgroup codec*/
 #ifdef __cplusplus
 }
diff --git a/third_party/aom/aom/aom_codec.mk b/third_party/aom/aom/aom_codec.mk
deleted file mode 100644
index 33bd3fe3b..000000000
--- a/third_party/aom/aom/aom_codec.mk
+++ /dev/null
@@ -1,42 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-API_EXPORTS += exports
-
-API_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
-API_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
-API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
-API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
-
-API_SRCS-$(CONFIG_AV1_DECODER) += aom.h
-API_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
-API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aom.h
-API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
-
-API_DOC_SRCS-yes += aom_codec.h
-API_DOC_SRCS-yes += aom_decoder.h
-API_DOC_SRCS-yes += aom_encoder.h
-API_DOC_SRCS-yes += aom_frame_buffer.h
-API_DOC_SRCS-yes += aom_image.h
-
-API_SRCS-yes += src/aom_decoder.c
-API_SRCS-yes += aom_decoder.h
-API_SRCS-yes += src/aom_encoder.c
-API_SRCS-yes += aom_encoder.h
-API_SRCS-yes += internal/aom_codec_internal.h
-API_SRCS-yes += src/aom_codec.c
-API_SRCS-yes += src/aom_image.c
-API_SRCS-yes += aom_codec.h
-API_SRCS-yes += aom_codec.mk
-API_SRCS-yes += aom_frame_buffer.h
-API_SRCS-yes += aom_image.h
-API_SRCS-yes += aom_integer.h
diff --git a/third_party/aom/aom/aom_decoder.h b/third_party/aom/aom/aom_decoder.h
index ceab93453..3bbdcd7e2 100644
--- a/third_party/aom/aom/aom_decoder.h
+++ b/third_party/aom/aom/aom_decoder.h
@@ -30,8 +30,8 @@
 extern "C" {
 #endif
 
-#include "./aom_codec.h"
-#include "./aom_frame_buffer.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_frame_buffer.h"
 
 /*!\brief Current ABI version number
  *
@@ -74,8 +74,6 @@ extern "C" {
 /*!\brief The input frame should be passed to the decoder one fragment at a
  * time */
 #define AOM_CODEC_USE_INPUT_FRAGMENTS 0x40000
-/*!\brief Enable frame-based multi-threading */
-#define AOM_CODEC_USE_FRAME_THREADING 0x80000
 
 /*!\brief Stream properties
  *
@@ -83,9 +81,12 @@ extern "C" {
  * stream.
  */
 typedef struct aom_codec_stream_info {
-  unsigned int w;     /**< Width (or 0 for unknown/default) */
-  unsigned int h;     /**< Height (or 0 for unknown/default) */
-  unsigned int is_kf; /**< Current frame is a keyframe */
+  unsigned int w;                      /**< Width (or 0 for unknown/default) */
+  unsigned int h;                      /**< Height (or 0 for unknown/default) */
+  unsigned int is_kf;                  /**< Current frame is a keyframe */
+  unsigned int number_spatial_layers;  /**< Number of spatial layers */
+  unsigned int number_temporal_layers; /**< Number of temporal layers */
+  unsigned int is_annexb;              /**< Is Bitstream in Annex-B format */
 } aom_codec_stream_info_t;
 
 /* REQUIRED FUNCTIONS
@@ -104,6 +105,7 @@ typedef struct aom_codec_dec_cfg {
   unsigned int w;       /**< Width */
   unsigned int h;       /**< Height */
   unsigned int allow_lowbitdepth; /**< Allow use of low-bitdepth coding path */
+  cfg_options_t cfg;              /**< Options defined per config attributes */
 } aom_codec_dec_cfg_t;            /**< alias for struct aom_codec_dec_cfg */
 
 /*!\brief Initialize a decoder instance
@@ -149,7 +151,9 @@ aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
  * \param[in]      iface   Pointer to the algorithm interface
  * \param[in]      data    Pointer to a block of data to parse
  * \param[in]      data_sz Size of the data buffer
- * \param[in,out]  si      Pointer to stream info to update.
+ * \param[in,out]  si      Pointer to stream info to update. The is_annexb
+ *                         member \ref MUST be properly initialized. This
+ *                         function sets the rest of the members.
  *
  * \retval #AOM_CODEC_OK
  *     Bitstream is parsable and stream information updated.
@@ -160,8 +164,7 @@ aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
  *     buffer was too short.
  */
 aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
-                                           const uint8_t *data,
-                                           unsigned int data_sz,
+                                           const uint8_t *data, size_t data_sz,
                                            aom_codec_stream_info_t *si);
 
 /*!\brief Return information about the current stream.
@@ -203,8 +206,6 @@ aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
  * \param[in] data_sz      Size of the coded data, in bytes.
  * \param[in] user_priv    Application specific data to associate with
  *                         this frame.
- * \param[in] deadline     Soft deadline the decoder should attempt to meet,
- *                         in us. Set to zero for unlimited.
  *
  * \return Returns #AOM_CODEC_OK if the coded data was processed completely
  *         and future pictures can be decoded without error. Otherwise,
@@ -212,8 +213,7 @@ aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
  *         for recoverability capabilities.
  */
 aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
-                                 unsigned int data_sz, void *user_priv,
-                                 long deadline);
+                                 size_t data_sz, void *user_priv);
 
 /*!\brief Decoded frames iterator
  *
diff --git a/third_party/aom/aom/aom_encoder.h b/third_party/aom/aom/aom_encoder.h
index 208ba011f..6003088ed 100644
--- a/third_party/aom/aom/aom_encoder.h
+++ b/third_party/aom/aom/aom_encoder.h
@@ -30,7 +30,7 @@
 extern "C" {
 #endif
 
-#include "./aom_codec.h"
+#include "aom/aom_codec.h"
 
 /*!\brief Current ABI version number
  *
@@ -159,6 +159,8 @@ typedef struct aom_codec_cx_pkt {
        * Only applicable when "output partition" mode is enabled. First
        * partition has id 0.*/
       int partition_id;
+      /*!\brief size of the visible frame in this packet */
+      size_t vis_frame_size;
     } frame;                            /**< data for compressed frame packet */
     aom_fixed_buf_t twopass_stats;      /**< data for two-pass packet */
     aom_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
@@ -282,6 +284,25 @@ typedef struct aom_codec_enc_cfg {
    */
   unsigned int g_h;
 
+  /*!\brief Max number of frames to encode
+   *
+   */
+  unsigned int g_limit;
+
+  /*!\brief Forced maximum width of the frame
+   *
+   * If this value is non-zero then it is used to force the maximum frame
+   * width written in write_sequence_header().
+   */
+  unsigned int g_forced_max_frame_width;
+
+  /*!\brief Forced maximum height of the frame
+   *
+   * If this value is non-zero then it is used to force the maximum frame
+   * height written in write_sequence_header().
+   */
+  unsigned int g_forced_max_frame_height;
+
   /*!\brief Bit-depth of the codec
    *
    * This value identifies the bit_depth of the codec,
@@ -586,6 +607,11 @@ typedef struct aom_codec_enc_cfg {
    * keyframing settings (kf)
    */
 
+  /*!\brief Option to enable forward reference key frame
+   *
+   */
+  int fwd_kf_enabled;
+
   /*!\brief Keyframe placement mode
    *
    * This value indicates whether the encoder should place keyframes at a
@@ -612,6 +638,28 @@ typedef struct aom_codec_enc_cfg {
    */
   unsigned int kf_max_dist;
 
+  /*!\brief sframe interval
+   *
+   * This value, expressed as a number of frames, forces the encoder to code
+   * an S-Frame every sframe_dist frames.
+   */
+  unsigned int sframe_dist;
+
+  /*!\brief sframe insertion mode
+   *
+   * This value must be set to 1 or 2, and tells the encoder how to insert
+   * S-Frames. It will only have an effect if sframe_dist != 0.
+   *
+   * If altref is enabled:
+   *   - if sframe_mode == 1, the considered frame will be made into an
+   *     S-Frame only if it is an altref frame
+   *   - if sframe_mode == 2, the next altref frame will be made into an
+   *     S-Frame.
+   *
+   * Otherwise: the considered frame will be made into an S-Frame.
+   */
+  unsigned int sframe_mode;
+
   /*!\brief Tile coding mode
    *
    * This value indicates the tile coding mode.
@@ -620,6 +668,30 @@ typedef struct aom_codec_enc_cfg {
    */
   unsigned int large_scale_tile;
 
+  /*!\brief Monochrome mode
+   *
+   * If this is nonzero, the encoder will generate a monochrome stream
+   * with no chroma planes.
+   */
+  unsigned int monochrome;
+
+  /*!\brief full_still_picture_hdr
+   *
+   * If this is nonzero, the encoder will generate a full header even for
+   * still picture encoding. if zero, a reduced header is used for still
+   * picture. This flag has no effect when a regular video with more than
+   * a single frame is encoded.
+   */
+  unsigned int full_still_picture_hdr;
+
+  /*!\brief Bitstream syntax mode
+   *
+   * This value indicates the bitstream syntax mode.
+   * A value of 0 indicates bitstream is saved as Section 5 bitstream. A value
+   * of 1 indicates the bitstream is saved in Annex-B format
+   */
+  unsigned int save_as_annexb;
+
   /*!\brief Number of explicit tile widths specified
    *
    * This value indicates the number of tile widths specified
@@ -661,6 +733,11 @@ typedef struct aom_codec_enc_cfg {
    * The number of heights specified is given by tile_height_count
    */
   int tile_heights[MAX_TILE_HEIGHTS];
+
+  /*!\brief Options defined per config file
+   *
+   */
+  cfg_options_t cfg;
 } aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */
 
 /*!\brief Initialize an encoder instance
@@ -782,23 +859,11 @@ aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
  */
 aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
 
-/*!\brief deadline parameter analogous to  AVx GOOD QUALITY mode. */
-#define AOM_DL_GOOD_QUALITY (1000000)
 /*!\brief Encode a frame
  *
  * Encodes a video frame at the given "presentation time." The presentation
  * time stamp (PTS) \ref MUST be strictly increasing.
  *
- * The encoder supports the notion of a soft real-time deadline. Given a
- * non-zero value to the deadline parameter, the encoder will make a "best
- * effort" guarantee to  return before the given time slice expires. It is
- * implicit that limiting the available time to encode will degrade the
- * output quality. The encoder can be given an unlimited time to produce the
- * best possible frame by specifying a deadline of '0'. This deadline
- * supercedes the AVx notion of "best quality, good quality, realtime".
- * Applications that wish to map these former settings to the new deadline
- * based system can use the symbol #AOM_DL_GOOD_QUALITY.
- *
  * When the last frame has been passed to the encoder, this function should
  * continue to be called, with the img parameter set to NULL. This will
  * signal the end-of-stream condition to the encoder and allow it to encode
@@ -810,7 +875,6 @@ aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
  * \param[in]    pts       Presentation time stamp, in timebase units.
  * \param[in]    duration  Duration to show frame, in timebase units.
  * \param[in]    flags     Flags to use for encoding this frame.
- * \param[in]    deadline  Time to spend encoding, in microseconds. (0=infinite)
  *
  * \retval #AOM_CODEC_OK
  *     The configuration was populated.
@@ -821,8 +885,7 @@ aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
  */
 aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
                                  aom_codec_pts_t pts, unsigned long duration,
-                                 aom_enc_frame_flags_t flags,
-                                 unsigned long deadline);
+                                 aom_enc_frame_flags_t flags);
 
 /*!\brief Set compressed data output buffer
  *
diff --git a/third_party/aom/aom/aom_frame_buffer.h b/third_party/aom/aom/aom_frame_buffer.h
index c87cf749b..b979fcf2b 100644
--- a/third_party/aom/aom/aom_frame_buffer.h
+++ b/third_party/aom/aom/aom_frame_buffer.h
@@ -20,7 +20,7 @@
 extern "C" {
 #endif
 
-#include "./aom_integer.h"
+#include "aom/aom_integer.h"
 
 /*!\brief The maximum number of work buffers used by libaom.
  *  Support maximum 4 threads to decode video in parallel.
diff --git a/third_party/aom/aom/aom_image.h b/third_party/aom/aom/aom_image.h
index 776794960..b01317b3f 100644
--- a/third_party/aom/aom/aom_image.h
+++ b/third_party/aom/aom/aom_image.h
@@ -20,6 +20,8 @@
 extern "C" {
 #endif
 
+#include "aom/aom_integer.h"
+
 /*!\brief Current ABI version number
  *
  * \internal
@@ -38,19 +40,6 @@ extern "C" {
 /*!\brief List of supported image formats */
 typedef enum aom_img_fmt {
   AOM_IMG_FMT_NONE,
-  AOM_IMG_FMT_RGB24,     /**< 24 bit per pixel packed RGB */
-  AOM_IMG_FMT_RGB32,     /**< 32 bit per pixel packed 0RGB */
-  AOM_IMG_FMT_RGB565,    /**< 16 bit per pixel, 565 */
-  AOM_IMG_FMT_RGB555,    /**< 16 bit per pixel, 555 */
-  AOM_IMG_FMT_UYVY,      /**< UYVY packed YUV */
-  AOM_IMG_FMT_YUY2,      /**< YUYV packed YUV */
-  AOM_IMG_FMT_YVYU,      /**< YVYU packed YUV */
-  AOM_IMG_FMT_BGR24,     /**< 24 bit per pixel packed BGR */
-  AOM_IMG_FMT_RGB32_LE,  /**< 32 bit packed BGR0 */
-  AOM_IMG_FMT_ARGB,      /**< 32 bit packed ARGB, alpha=255 */
-  AOM_IMG_FMT_ARGB_LE,   /**< 32 bit packed BGRA, alpha=255 */
-  AOM_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */
-  AOM_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */
   AOM_IMG_FMT_YV12 =
       AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
   AOM_IMG_FMT_I420 = AOM_IMG_FMT_PLANAR | 2,
@@ -59,36 +48,79 @@ typedef enum aom_img_fmt {
   AOM_IMG_FMT_AOMI420 = AOM_IMG_FMT_PLANAR | 4,
   AOM_IMG_FMT_I422 = AOM_IMG_FMT_PLANAR | 5,
   AOM_IMG_FMT_I444 = AOM_IMG_FMT_PLANAR | 6,
-  AOM_IMG_FMT_I440 = AOM_IMG_FMT_PLANAR | 7,
   AOM_IMG_FMT_444A = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_HAS_ALPHA | 6,
   AOM_IMG_FMT_I42016 = AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH,
   AOM_IMG_FMT_I42216 = AOM_IMG_FMT_I422 | AOM_IMG_FMT_HIGHBITDEPTH,
   AOM_IMG_FMT_I44416 = AOM_IMG_FMT_I444 | AOM_IMG_FMT_HIGHBITDEPTH,
-  AOM_IMG_FMT_I44016 = AOM_IMG_FMT_I440 | AOM_IMG_FMT_HIGHBITDEPTH
 } aom_img_fmt_t; /**< alias for enum aom_img_fmt */
 
-/*!\brief List of supported color spaces */
-typedef enum aom_color_space {
-  AOM_CS_UNKNOWN = 0,     /**< Unknown */
-  AOM_CS_BT_601 = 1,      /**< BT.601 */
-  AOM_CS_BT_709 = 2,      /**< BT.709 */
-  AOM_CS_SMPTE_170 = 3,   /**< SMPTE.170 */
-  AOM_CS_SMPTE_240 = 4,   /**< SMPTE.240 */
-  AOM_CS_BT_2020_NCL = 5, /**< BT.2020 non-constant luminance (BT.2100) */
-  AOM_CS_BT_2020_CL = 6,  /**< BT.2020 constant luminance */
-  AOM_CS_SRGB = 7,        /**< sRGB */
-  AOM_CS_ICTCP = 8,       /**< ICtCp, ITU-R BT.2100 */
-  AOM_CS_RESERVED = 9     /**< Values 9..31 are reserved */
-} aom_color_space_t;      /**< alias for enum aom_color_space */
+/*!\brief List of supported color primaries */
+typedef enum aom_color_primaries {
+  AOM_CICP_CP_RESERVED_0 = 0,  /**< For future use */
+  AOM_CICP_CP_BT_709 = 1,      /**< BT.709 */
+  AOM_CICP_CP_UNSPECIFIED = 2, /**< Unspecified */
+  AOM_CICP_CP_RESERVED_3 = 3,  /**< For future use */
+  AOM_CICP_CP_BT_470_M = 4,    /**< BT.470 System M (historical) */
+  AOM_CICP_CP_BT_470_B_G = 5,  /**< BT.470 System B, G (historical) */
+  AOM_CICP_CP_BT_601 = 6,      /**< BT.601 */
+  AOM_CICP_CP_SMPTE_240 = 7,   /**< SMPTE 240 */
+  AOM_CICP_CP_GENERIC_FILM =
+      8, /**< Generic film (color filters using illuminant C) */
+  AOM_CICP_CP_BT_2020 = 9,      /**< BT.2020, BT.2100 */
+  AOM_CICP_CP_XYZ = 10,         /**< SMPTE 428 (CIE 1921 XYZ) */
+  AOM_CICP_CP_SMPTE_431 = 11,   /**< SMPTE RP 431-2 */
+  AOM_CICP_CP_SMPTE_432 = 12,   /**< SMPTE EG 432-1  */
+  AOM_CICP_CP_RESERVED_13 = 13, /**< For future use (values 13 - 21)  */
+  AOM_CICP_CP_EBU_3213 = 22,    /**< EBU Tech. 3213-E  */
+  AOM_CICP_CP_RESERVED_23 = 23  /**< For future use (values 23 - 255)  */
+} aom_color_primaries_t;        /**< alias for enum aom_color_primaries */
 
 /*!\brief List of supported transfer functions */
-typedef enum aom_transfer_function {
-  AOM_TF_UNKNOWN = 0,      /**< Unknown */
-  AOM_TF_BT_709 = 1,       /**< BT.709 */
-  AOM_TF_PQ = 2,           /**< PQ TF BT.2100 / ST.2084 */
-  AOM_TF_HLG = 3,          /**< Hybrid Log-Gamma */
-  AOM_TF_RESERVED = 4      /**< Values 4..31 are reserved */
-} aom_transfer_function_t; /**< alias for enum aom_transfer_function */
+typedef enum aom_transfer_characteristics {
+  AOM_CICP_TC_RESERVED_0 = 0,  /**< For future use */
+  AOM_CICP_TC_BT_709 = 1,      /**< BT.709 */
+  AOM_CICP_TC_UNSPECIFIED = 2, /**< Unspecified */
+  AOM_CICP_TC_RESERVED_3 = 3,  /**< For future use */
+  AOM_CICP_TC_BT_470_M = 4,    /**< BT.470 System M (historical)  */
+  AOM_CICP_TC_BT_470_B_G = 5,  /**< BT.470 System B, G (historical) */
+  AOM_CICP_TC_BT_601 = 6,      /**< BT.601 */
+  AOM_CICP_TC_SMPTE_240 = 7,   /**< SMPTE 240 M */
+  AOM_CICP_TC_LINEAR = 8,      /**< Linear */
+  AOM_CICP_TC_LOG_100 = 9,     /**< Logarithmic (100 : 1 range) */
+  AOM_CICP_TC_LOG_100_SQRT10 =
+      10,                     /**< Logarithmic (100 * Sqrt(10) : 1 range) */
+  AOM_CICP_TC_IEC_61966 = 11, /**< IEC 61966-2-4 */
+  AOM_CICP_TC_BT_1361 = 12,   /**< BT.1361 */
+  AOM_CICP_TC_SRGB = 13,      /**< sRGB or sYCC*/
+  AOM_CICP_TC_BT_2020_10_BIT = 14, /**< BT.2020 10-bit systems */
+  AOM_CICP_TC_BT_2020_12_BIT = 15, /**< BT.2020 12-bit systems */
+  AOM_CICP_TC_SMPTE_2084 = 16,     /**< SMPTE ST 2084, ITU BT.2100 PQ */
+  AOM_CICP_TC_SMPTE_428 = 17,      /**< SMPTE ST 428 */
+  AOM_CICP_TC_HLG = 18,            /**< BT.2100 HLG, ARIB STD-B67 */
+  AOM_CICP_TC_RESERVED_19 = 19     /**< For future use (values 19-255) */
+} aom_transfer_characteristics_t;  /**< alias for enum aom_transfer_function */
+
+/*!\brief List of supported matrix coefficients */
+typedef enum aom_matrix_coefficients {
+  AOM_CICP_MC_IDENTITY = 0,    /**< Identity matrix */
+  AOM_CICP_MC_BT_709 = 1,      /**< BT.709 */
+  AOM_CICP_MC_UNSPECIFIED = 2, /**< Unspecified */
+  AOM_CICP_MC_RESERVED_3 = 3,  /**< For future use */
+  AOM_CICP_MC_FCC = 4,         /**< US FCC 73.628 */
+  AOM_CICP_MC_BT_470_B_G = 5,  /**< BT.470 System B, G (historical) */
+  AOM_CICP_MC_BT_601 = 6,      /**< BT.601 */
+  AOM_CICP_MC_SMPTE_240 = 7,   /**< SMPTE 240 M */
+  AOM_CICP_MC_SMPTE_YCGCO = 8, /**< YCgCo */
+  AOM_CICP_MC_BT_2020_NCL =
+      9, /**< BT.2020 non-constant luminance, BT.2100 YCbCr  */
+  AOM_CICP_MC_BT_2020_CL = 10, /**< BT.2020 constant luminance */
+  AOM_CICP_MC_SMPTE_2085 = 11, /**< SMPTE ST 2085 YDzDx */
+  AOM_CICP_MC_CHROMAT_NCL =
+      12, /**< Chromaticity-derived non-constant luminance */
+  AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */
+  AOM_CICP_MC_ICTCP = 14,      /**< BT.2100 ICtCp */
+  AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255)  */
+} aom_matrix_coefficients_t;
 
 /*!\brief List of supported color range */
 typedef enum aom_color_range {
@@ -107,11 +139,13 @@ typedef enum aom_chroma_sample_position {
 
 /**\brief Image Descriptor */
 typedef struct aom_image {
-  aom_img_fmt_t fmt;                /**< Image Format */
-  aom_color_space_t cs;             /**< Color Space */
-  aom_transfer_function_t tf;       /**< transfer function */
-  aom_chroma_sample_position_t csp; /**< chroma sample position */
-  aom_color_range_t range;          /**< Color Range */
+  aom_img_fmt_t fmt;                 /**< Image Format */
+  aom_color_primaries_t cp;          /**< CICP Color Primaries */
+  aom_transfer_characteristics_t tc; /**< CICP Transfer Characteristics */
+  aom_matrix_coefficients_t mc;      /**< CICP Matrix Coefficients */
+  int monochrome;                    /**< Whether image is monochrome */
+  aom_chroma_sample_position_t csp;  /**< chroma sample position */
+  aom_color_range_t range;           /**< Color Range */
 
   /* Image storage dimensions */
   unsigned int w;         /**< Stored image width */
@@ -138,9 +172,13 @@ typedef struct aom_image {
 #define AOM_PLANE_ALPHA 3   /**< A (Transparency) plane */
   unsigned char *planes[4]; /**< pointer to the top left pixel for each plane */
   int stride[4];            /**< stride between rows for each plane */
+  size_t sz;                /**< data size */
 
   int bps; /**< bits per sample (for packed formats) */
 
+  int temporal_id; /**< Temporal layer Id of image */
+  int spatial_id;  /**< Spatial layer Id of image */
+
   /*!\brief The following member may be set by the application to associate
    * data with this image.
    */
@@ -207,6 +245,33 @@ aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
                           unsigned int d_h, unsigned int align,
                           unsigned char *img_data);
 
+/*!\brief Open a descriptor, allocating storage for the underlying image with a
+ * border
+ *
+ * Returns a descriptor for storing an image of the given format and its
+ * borders. The storage for the descriptor is allocated on the heap.
+ *
+ * \param[in]    img        Pointer to storage for descriptor. If this parameter
+ *                          is NULL, the storage for the descriptor will be
+ *                          allocated on the heap.
+ * \param[in]    fmt        Format for the image
+ * \param[in]    d_w        Width of the image
+ * \param[in]    d_h        Height of the image
+ * \param[in]    align      Alignment, in bytes, of the image buffer and
+ *                          each row in the image(stride).
+ * \param[in]    size_align Alignment, in bytes, of the image width and height.
+ * \param[in]    border     A border that is padded on four sides of the image.
+ *
+ * \return Returns a pointer to the initialized image descriptor. If the img
+ *         parameter is non-null, the value of the img parameter will be
+ *         returned.
+ */
+aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt,
+                                       unsigned int d_w, unsigned int d_h,
+                                       unsigned int align,
+                                       unsigned int size_align,
+                                       unsigned int border);
+
 /*!\brief Set the rectangle identifying the displayed portion of the image
  *
  * Updates the displayed rectangle (aka viewport) on the image surface to
@@ -217,11 +282,12 @@ aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
  * \param[in]    y         topmost row
  * \param[in]    w         width
  * \param[in]    h         height
+ * \param[in]    border    A border that is padded on four sides of the image.
  *
  * \return 0 if the requested rectangle is valid, nonzero otherwise.
  */
 int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
-                     unsigned int w, unsigned int h);
+                     unsigned int w, unsigned int h, unsigned int border);
 
 /*!\brief Flip the image vertically (top for bottom)
  *
diff --git a/third_party/aom/aom/aom_integer.h b/third_party/aom/aom/aom_integer.h
index bd1fe270b..907d4cbec 100644
--- a/third_party/aom/aom/aom_integer.h
+++ b/third_party/aom/aom/aom_integer.h
@@ -8,7 +8,6 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-
 #ifndef AOM_AOM_INTEGER_H_
 #define AOM_AOM_INTEGER_H_
 
@@ -61,6 +60,47 @@ typedef size_t uintptr_t;
 #include <inttypes.h>
 #endif
 
+#if !defined(INT8_MAX)
+#define INT8_MAX 127
+#endif
+
+#if !defined(INT32_MAX)
+#define INT32_MAX 2147483647
+#endif
+
+#if !defined(INT32_MIN)
+#define INT32_MIN (-2147483647 - 1)
+#endif
+
 #define NELEMENTS(x) (int)(sizeof(x) / sizeof(x[0]))
 
+#if defined(__cplusplus)
+extern "C" {
+#endif  // __cplusplus
+
+// Returns size of uint64_t when encoded using LEB128.
+size_t aom_uleb_size_in_bytes(uint64_t value);
+
+// Returns 0 on success, -1 on decode failure.
+// On success, 'value' stores the decoded LEB128 value and 'length' stores
+// the number of bytes decoded.
+int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value,
+                    size_t *length);
+
+// Encodes LEB128 integer. Returns 0 when successful, and -1 upon failure.
+int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value,
+                    size_t *coded_size);
+
+// Encodes LEB128 integer to size specified. Returns 0 when successful, and -1
+// upon failure.
+// Note: This will write exactly pad_to_size bytes; if the value cannot be
+// encoded in this many bytes, then this will fail.
+int aom_uleb_encode_fixed_size(uint64_t value, size_t available,
+                               size_t pad_to_size, uint8_t *coded_value,
+                               size_t *coded_size);
+
+#if defined(__cplusplus)
+}  // extern "C"
+#endif  // __cplusplus
+
 #endif  // AOM_AOM_INTEGER_H_
diff --git a/third_party/aom/aom/aomcx.h b/third_party/aom/aom/aomcx.h
index e573f986d..4cdb5d332 100644
--- a/third_party/aom/aom/aomcx.h
+++ b/third_party/aom/aom/aomcx.h
@@ -16,8 +16,8 @@
  *
  * @{
  */
-#include "./aom.h"
-#include "./aom_encoder.h"
+#include "aom/aom.h"
+#include "aom/aom_encoder.h"
 
 /*!\file
  * \brief Provides definitions for using AOM or AV1 encoder algorithm within the
@@ -48,14 +48,27 @@ extern aom_codec_iface_t *aom_codec_av1_cx(void);
  * last frame or not automatically.
  */
 #define AOM_EFLAG_NO_REF_LAST (1 << 16)
-
+/*!\brief Don't reference the last2 frame
+ *
+ * When this flag is set, the encoder will not use the last2 frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * last2 frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_LAST2 (1 << 17)
+/*!\brief Don't reference the last3 frame
+ *
+ * When this flag is set, the encoder will not use the last3 frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * last3 frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_LAST3 (1 << 18)
 /*!\brief Don't reference the golden frame
  *
  * When this flag is set, the encoder will not use the golden frame as a
  * predictor. When not set, the encoder will choose whether to use the
  * golden frame or not automatically.
  */
-#define AOM_EFLAG_NO_REF_GF (1 << 17)
+#define AOM_EFLAG_NO_REF_GF (1 << 19)
 
 /*!\brief Don't reference the alternate reference frame
  *
@@ -63,49 +76,72 @@ extern aom_codec_iface_t *aom_codec_av1_cx(void);
  * predictor. When not set, the encoder will choose whether to use the
  * alt ref frame or not automatically.
  */
-#define AOM_EFLAG_NO_REF_ARF (1 << 21)
+#define AOM_EFLAG_NO_REF_ARF (1 << 20)
+/*!\brief Don't reference the bwd reference frame
+ *
+ * When this flag is set, the encoder will not use the bwd ref frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * bwd ref frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_BWD (1 << 21)
+/*!\brief Don't reference the alt2 reference frame
+ *
+ * When this flag is set, the encoder will not use the alt2 ref frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * alt2 ref frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_ARF2 (1 << 22)
 
 /*!\brief Don't update the last frame
  *
  * When this flag is set, the encoder will not update the last frame with
  * the contents of the current frame.
  */
-#define AOM_EFLAG_NO_UPD_LAST (1 << 18)
+#define AOM_EFLAG_NO_UPD_LAST (1 << 23)
 
 /*!\brief Don't update the golden frame
  *
  * When this flag is set, the encoder will not update the golden frame with
  * the contents of the current frame.
  */
-#define AOM_EFLAG_NO_UPD_GF (1 << 22)
+#define AOM_EFLAG_NO_UPD_GF (1 << 24)
 
 /*!\brief Don't update the alternate reference frame
  *
  * When this flag is set, the encoder will not update the alt ref frame with
  * the contents of the current frame.
  */
-#define AOM_EFLAG_NO_UPD_ARF (1 << 23)
-
-/*!\brief Force golden frame update
+#define AOM_EFLAG_NO_UPD_ARF (1 << 25)
+/*!\brief Disable entropy update
  *
- * When this flag is set, the encoder copy the contents of the current frame
- * to the golden frame buffer.
+ * When this flag is set, the encoder will not update its internal entropy
+ * model based on the entropy of this frame.
  */
-#define AOM_EFLAG_FORCE_GF (1 << 19)
-
-/*!\brief Force alternate reference frame update
+#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 26)
+/*!\brief Disable ref frame mvs
  *
- * When this flag is set, the encoder copy the contents of the current frame
- * to the alternate reference frame buffer.
+ * When this flag is set, the encoder will not allow frames to
+ * be encoded using mfmv.
  */
-#define AOM_EFLAG_FORCE_ARF (1 << 24)
-
-/*!\brief Disable entropy update
+#define AOM_EFLAG_NO_REF_FRAME_MVS (1 << 27)
+/*!\brief Enable error resilient frame
  *
- * When this flag is set, the encoder will not update its internal entropy
- * model based on the entropy of this frame.
+ * When this flag is set, the encoder will code frames as error
+ * resilient.
+ */
+#define AOM_EFLAG_ERROR_RESILIENT (1 << 28)
+/*!\brief Enable s frame mode
+ *
+ * When this flag is set, the encoder will code frames as an
+ * s frame.
+ */
+#define AOM_EFLAG_SET_S_FRAME (1 << 29)
+/*!\brief Force primary_ref_frame to PRIMARY_REF_NONE
+ *
+ * When this flag is set, the encoder will set a frame's primary_ref_frame
+ * to PRIMARY_REF_NONE
  */
-#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 20)
+#define AOM_EFLAG_SET_PRIMARY_REF_NONE (1 << 30)
 
 /*!\brief AVx encoder control functions
  *
@@ -131,6 +167,10 @@ enum aome_enc_control_id {
    */
   AOME_SET_SCALEMODE = 11,
 
+  /*!\brief Codec control function to set encoder spatial layer id.
+   */
+  AOME_SET_SPATIAL_LAYER_ID = 12,
+
   /*!\brief Codec control function to set encoder internal speed settings.
    *
    * Changes in this value influences, among others, the encoder's selection
@@ -141,6 +181,10 @@ enum aome_enc_control_id {
    */
   AOME_SET_CPUUSED = 13,
 
+  /*!\brief Speed features for codec development
+   */
+  AOME_SET_DEVSF,
+
   /*!\brief Codec control function to enable automatic set and use alf frames.
    */
   AOME_SET_ENABLEAUTOALTREF,
@@ -199,6 +243,10 @@ enum aome_enc_control_id {
    */
   AOME_SET_MAX_INTRA_BITRATE_PCT,
 
+  /*!\brief Codec control function to set number of spatial layers.
+   */
+  AOME_SET_NUMBER_SPATIAL_LAYERS,
+
   /*!\brief Codec control function to set max data rate for Inter frames.
    *
    * This value controls additional clamping on the maximum size of an
@@ -289,6 +337,26 @@ enum aome_enc_control_id {
    */
   AV1E_SET_FRAME_PARALLEL_DECODING,
 
+  /*!\brief Codec control function to enable error_resilient_mode
+   *
+   * AV1 has a bitstream feature to guarantee parseability of a frame
+   * by turning on the error_resilient_decoding mode, even though the
+   * reference buffers are unreliable or not received.
+   *
+   * By default, this feature is off.
+   */
+  AV1E_SET_ERROR_RESILIENT_MODE,
+
+  /*!\brief Codec control function to enable s_frame_mode
+   *
+   * AV1 has a bitstream feature to designate certain frames as S-frames,
+   * from where we can switch to a different stream,
+   * even though the reference buffers may not be exactly identical.
+   *
+   * By default, this feature is off.
+   */
+  AV1E_SET_S_FRAME_MODE,
+
   /*!\brief Codec control function to set adaptive quantization mode.
    *
    * AV1 has a segment based feature that allows encoder to adaptively change
@@ -326,30 +394,82 @@ enum aome_enc_control_id {
    */
   AV1E_SET_TUNE_CONTENT,
 
+  /*!\brief Codec control function to set CDF update mode.
+   *
+   *  0: no update          1: update on every frame
+   *  2: selectively update
+   */
+  AV1E_SET_CDF_UPDATE_MODE,
+
   /*!\brief Codec control function to set color space info.
-   * \note Valid ranges: 0..9, default is "UNKNOWN".
-   *                     0 = UNKNOWN,
-   *                     1 = BT_601
-   *                     2 = BT_709
-   *                     3 = SMPTE_170
-   *                     4 = SMPTE_240
-   *                     5 = BT_2020_NCL
-   *                     6 = BT_2020_CL
-   *                     7 = SRGB
-   *                     8 = ICtCp
-   *                     9 = RESERVED
-   */
-  AV1E_SET_COLOR_SPACE,
+   * \note Valid ranges: 0..23, default is "Unspecified".
+   *                     0 = For future use
+   *                     1 = BT.709
+   *                     2 = Unspecified
+   *                     3 = For future use
+   *                     4 = BT.470 System M (historical)
+   *                     5 = BT.470 System B, G (historical)
+   *                     6 = BT.601
+   *                     7 = SMPTE 240
+   *                     8 = Generic film (color filters using illuminant C)
+   *                     9 = BT.2020, BT.2100
+   *                     10 = SMPTE 428 (CIE 1921 XYZ)
+   *                     11 = SMPTE RP 431-2
+   *                     12 = SMPTE EG 432-1
+   *                     13 = For future use (values 13 - 21)
+   *                     22 = EBU Tech. 3213-E
+   *                     23 = For future use
+   *
+   */
+  AV1E_SET_COLOR_PRIMARIES,
 
   /*!\brief Codec control function to set transfer function info.
-   * \note Valid ranges: 0..4, default is "UNKNOWN".
-   *                     0 = UNKNOWN,
-   *                     1 = BT_709
-   *                     2 = PQ
-   *                     3 = HLG
-   *                     4 = RESERVED
+   * \note Valid ranges: 0..19, default is "Unspecified".
+   *                     0 = For future use
+   *                     1 = BT.709
+   *                     2 = Unspecified
+   *                     3 = For future use
+   *                     4 = BT.470 System M (historical)
+   *                     5 = BT.470 System B, G (historical)
+   *                     6 = BT.601
+   *                     7 = SMPTE 240 M
+   *                     8 = Linear
+   *                     9 = Logarithmic (100 : 1 range)
+   *                     10 = Logarithmic (100 * Sqrt(10) : 1 range)
+   *                     11 = IEC 61966-2-4
+   *                     12 = BT.1361
+   *                     13 = sRGB or sYCC
+   *                     14 = BT.2020 10-bit systems
+   *                     15 = BT.2020 12-bit systems
+   *                     16 = SMPTE ST 2084, ITU BT.2100 PQ
+   *                     17 = SMPTE ST 428
+   *                     18 = BT.2100 HLG, ARIB STD-B67
+   *                     19 = For future use
+   *
+   */
+  AV1E_SET_TRANSFER_CHARACTERISTICS,
+
+  /*!\brief Codec control function to set transfer function info.
+   * \note Valid ranges: 0..15, default is "Unspecified".
+   *                     0 = Identity matrix
+   *                     1 = BT.709
+   *                     2 = Unspecified
+   *                     3 = For future use
+   *                     4 = US FCC 73.628
+   *                     5 = BT.470 System B, G (historical)
+   *                     6 = BT.601
+   *                     7 = SMPTE 240 M
+   *                     8 = YCgCo
+   *                     9 = BT.2020 non-constant luminance, BT.2100 YCbCr
+   *                     10 = BT.2020 constant luminance
+   *                     11 = SMPTE ST 2085 YDzDx
+   *                     12 = Chromaticity-derived non-constant luminance
+   *                     13 = Chromaticity-derived constant luminance
+   *                     14 = BT.2100 ICtCp
+   *                     15 = For future use
+   *
    */
-  AV1E_SET_TRANSFER_FUNCTION,
+  AV1E_SET_MATRIX_COEFFICIENTS,
 
   /*!\brief Codec control function to set chroma 4:2:0 sample position info.
    * \note Valid ranges: 0..3, default is "UNKNOWN".
@@ -412,10 +532,43 @@ enum aome_enc_control_id {
   /*!\brief Codec control function to enable automatic set and use
    * bwd-pred frames.
    *
-   * Experiment: EXT_REFS
    */
   AOME_SET_ENABLEAUTOBWDREF,
 
+  /*!\brief Codec control function to encode with CDEF.
+   *
+   * CDEF is the constrained directional enhancement filter which is an
+   * in-loop filter aiming to remove coding artifacts
+   *                          0 = do not apply CDEF
+   *                          1 = apply CDEF
+   *
+   *  By default, the encoder applies CDEF.
+   *
+   * Experiment: AOM_CDEF
+   */
+  AV1E_SET_ENABLE_CDEF,
+
+  /*!\brief Codec control function to encode with Loop Restoration Filter.
+   *
+   *                          0 = do not apply Restoration Filter
+   *                          1 = apply Restoration Filter
+   *
+   *  By default, the encoder applies Restoration Filter.
+   *
+   */
+  AV1E_SET_ENABLE_RESTORATION,
+
+  /*!\brief Codec control function to encode without trellis quantization.
+   *
+   *                          0 = apply trellis quantization
+   *                          1 = do not apply trellis quantization
+   *
+   *  By default, the encoder applies trellis optimization on quantized
+   *  coefficients.
+   *
+   */
+  AV1E_SET_DISABLE_TRELLIS_QUANT,
+
   /*!\brief Codec control function to encode with quantisation matrices.
    *
    * AOM can operate with default quantisation matrices dependent on
@@ -427,6 +580,7 @@ enum aome_enc_control_id {
    *
    * Experiment: AOM_QM
    */
+
   AV1E_SET_ENABLE_QM,
 
   /*!\brief Codec control function to set the min quant matrix flatness.
@@ -456,6 +610,45 @@ enum aome_enc_control_id {
    */
   AV1E_SET_QM_MAX,
 
+  /*!\brief Codec control function to set the min quant matrix flatness.
+   *
+   * AOM can operate with different ranges of quantisation matrices.
+   * As quantisation levels increase, the matrices get flatter. This
+   * control sets the flatness for luma (Y).
+   *
+   *  By default, the encoder sets this minimum at half the available
+   *  range.
+   *
+   * Experiment: AOM_QM
+   */
+  AV1E_SET_QM_Y,
+
+  /*!\brief Codec control function to set the min quant matrix flatness.
+   *
+   * AOM can operate with different ranges of quantisation matrices.
+   * As quantisation levels increase, the matrices get flatter. This
+   * control sets the flatness for chroma (U).
+   *
+   *  By default, the encoder sets this minimum at half the available
+   *  range.
+   *
+   * Experiment: AOM_QM
+   */
+  AV1E_SET_QM_U,
+
+  /*!\brief Codec control function to set the min quant matrix flatness.
+   *
+   * AOM can operate with different ranges of quantisation matrices.
+   * As quantisation levels increase, the matrices get flatter. This
+   * control sets the flatness for chrome (V).
+   *
+   *  By default, the encoder sets this minimum at half the available
+   *  range.
+   *
+   * Experiment: AOM_QM
+   */
+  AV1E_SET_QM_V,
+
   /*!\brief Codec control function to encode with dist_8x8.
    *
    *  The dist_8x8 is enabled automatically for model tuning parameters that
@@ -493,15 +686,15 @@ enum aome_enc_control_id {
   AV1E_SET_MTU,
 
   /*!\brief Codec control function to set dependent_horz_tiles.
-  *
-  * In encoding and decoding, AV1 allows enabling dependent horizontal tile
-  * The parameter for this control describes the value of this flag,
-  * which has a valid range [0, 1]:
-  *            0 = disable dependent horizontal tile
-  *            1 = enable dependent horizontal tile,
-  *
-  * By default, the value is 0, i.e. disable dependent horizontal tile.
-  */
+   *
+   * In encoding and decoding, AV1 allows enabling dependent horizontal tile
+   * The parameter for this control describes the value of this flag,
+   * which has a valid range [0, 1]:
+   *            0 = disable dependent horizontal tile
+   *            1 = enable dependent horizontal tile,
+   *
+   * By default, the value is 0, i.e. disable dependent horizontal tile.
+   */
   AV1E_SET_TILE_DEPENDENT_ROWS,
 
   /*!\brief Codec control function to set the number of symbols in an ANS data
@@ -516,14 +709,91 @@ enum aome_enc_control_id {
    */
   AV1E_SET_ANS_WINDOW_SIZE_LOG2,
 
+  /*!\brief Codec control function to turn on / off dual filter
+   * enabling/disabling.
+   *
+   * This will enable or disable dual filter. The default value is 1
+   *
+   */
+  AV1E_SET_ENABLE_DF,
+
+  /*!\brief Codec control function to turn on / off frame order hint for a
+   * few tools:
+   *
+   * joint compound mode
+   * motion field motion vector
+   * ref frame sign bias
+   *
+   * The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_ORDER_HINT,
+
+  /*!\brief Codec control function to turn on / off joint compound mode
+   * at sequence level.
+   *
+   * This will enable or disable joint compound mode. The default value is 1.
+   * If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced to 0.
+   *
+   */
+  AV1E_SET_ENABLE_JNT_COMP,
+
+  /*!\brief Codec control function to turn on / off ref frame mvs (mfmv) usage
+   * at sequence level.
+   *
+   * This will enable or disable usage of MFMV. The default value is 1.
+   * If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced to 0.
+   *
+   */
+  AV1E_SET_ENABLE_REF_FRAME_MVS,
+
   /*!\brief Codec control function to set temporal mv prediction
-  * enabling/disabling.
-  *
-  * This will enable or disable temporal mv predicton. The default value is 0.
-  *
-  * Experiment: TEMPMV_SIGNALING
-  */
-  AV1E_SET_DISABLE_TEMPMV,
+   * enabling/disabling at frame level.
+   *
+   * This will enable or disable temporal mv predicton. The default value is 1.
+   * If AV1E_SET_ENABLE_REF_FRAME_MVS is 0, then this flag is forced to 0.
+   *
+   */
+  AV1E_SET_ALLOW_REF_FRAME_MVS,
+
+  /*!\brief Codec control function to turn on / off warped motion usage
+   * at sequence level.
+   *
+   * This will enable or disable usage of warped motion. The default value is 1.
+   *
+   */
+  AV1E_SET_ENABLE_WARPED_MOTION,
+
+  /*!\brief Codec control function to turn on / off warped motion usage
+   * at frame level.
+   *
+   * This will enable or disable usage of warped motion. The default value is 1.
+   * If AV1E_SET_ENABLE_WARPED_MOTION is 0, then this flag is forced to 0.
+   *
+   */
+  AV1E_SET_ALLOW_WARPED_MOTION,
+
+  /*!\brief Codec control function to turn on / off frame superresolution.
+   *
+   * This will enable or disable frame superresolution. The default value is 1
+   * If AV1E_SET_ENABLE_SUPERRES is 0, then this flag is forced to 0.
+   */
+  AV1E_SET_ENABLE_SUPERRES,
+
+  /*!\brief Codec control function to set loop_filter_across_tiles_v_enabled
+   * and loop_filter_across_tiles_h_enabled.
+   * In encoding and decoding, AV1 allows disabling loop filter across tile
+   * boundary The parameter for this control describes the value of this flag,
+   * which has a valid range [0, 1]:
+   *            0 = disable loop filter across tile boundary
+   *            1 = enable loop filter across tile boundary
+   *
+   * By default, the value is 1, i.e. enable loop filter across tile boundary.
+   *
+   * Experiment: LOOPFILTERING_ACROSS_TILES_EXT
+   */
+  AV1E_SET_TILE_LOOPFILTER_V,
+  AV1E_SET_TILE_LOOPFILTER_H,
 
   /*!\brief Codec control function to set loop_filter_across_tiles_enabled.
    *
@@ -540,15 +810,15 @@ enum aome_enc_control_id {
   AV1E_SET_TILE_LOOPFILTER,
 
   /*!\brief Codec control function to set the delta q mode
-  *
-  * AV1 has a segment based feature that allows encoder to adaptively change
-  * quantization parameter for each segment within a frame to improve the
-  * subjective quality. the delta q mode is added on top of segment based
-  * feature, and allows control per 64x64 q and lf delta.This control makes
-  * encoder operate in one of the several DELTA_Q_modes supported.
-  *
-  * By default, encoder operates with DELTAQ_Mode 0(deltaq signaling off).
-  */
+   *
+   * AV1 has a segment based feature that allows encoder to adaptively change
+   * quantization parameter for each segment within a frame to improve the
+   * subjective quality. the delta q mode is added on top of segment based
+   * feature, and allows control per 64x64 q and lf delta.This control makes
+   * encoder operate in one of the several DELTA_Q_modes supported.
+   *
+   * By default, encoder operates with DELTAQ_Mode 0(deltaq signaling off).
+   */
   AV1E_SET_DELTAQ_MODE,
 
   /*!\brief Codec control function to set the single tile decoding mode to 0 or
@@ -567,6 +837,23 @@ enum aome_enc_control_id {
    * 0 : off, 1 : MAX_EXTREME_MV, 2 : MIN_EXTREME_MV
    */
   AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST,
+
+  /*!\brief Codec control function to signal picture timing info in the
+   * bitstream. \note Valid ranges: 0..1, default is "UNKNOWN". 0 = UNKNOWN, 1 =
+   * EQUAL
+   */
+  AV1E_SET_TIMING_INFO_TYPE,
+
+  /*!\brief Codec control function to add film grain parameters (one of several
+   * preset types) info in the bitstream.
+   * \note Valid ranges: 0..11, default is "0". 0 = UNKNOWN,
+   * 1..16 = different test vectors for grain
+   */
+  AV1E_SET_FILM_GRAIN_TEST_VECTOR,
+
+  /*!\brief Codec control function to set the path to the film grain parameters
+   */
+  AV1E_SET_FILM_GRAIN_TABLE,
 };
 
 /*!\brief aom 1-D scaling mode
@@ -637,6 +924,13 @@ typedef enum {
   AOM_CONTENT_INVALID
 } aom_tune_content;
 
+/*!brief AV1 encoder timing info type signaling */
+typedef enum {
+  AOM_TIMING_UNSPECIFIED,
+  AOM_TIMING_EQUAL,
+  AOM_TIMING_DEC_MODEL
+} aom_timing_info_type_t;
+
 /*!\brief Model tuning parameters
  *
  * Changes the encoder to tune for certain types of input material.
@@ -645,10 +939,8 @@ typedef enum {
 typedef enum {
   AOM_TUNE_PSNR,
   AOM_TUNE_SSIM,
-#ifdef CONFIG_DIST_8X8
   AOM_TUNE_CDEF_DIST,
   AOM_TUNE_DAALA_DIST
-#endif
 } aom_tune_metric;
 
 /*!\cond */
@@ -668,8 +960,13 @@ AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
 AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
 #define AOM_CTRL_AOME_SET_SCALEMODE
 
+AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, int)
+#define AOM_CTRL_AOME_SET_SPATIAL_LAYER_ID
+
 AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
 #define AOM_CTRL_AOME_SET_CPUUSED
+AOM_CTRL_USE_TYPE(AOME_SET_DEVSF, int)
+#define AOM_CTRL_AOME_SET_DEVSF
 AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
 #define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
 
@@ -698,6 +995,10 @@ AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, int)
 AOM_CTRL_USE_TYPE(AV1E_SET_TILE_DEPENDENT_ROWS, int)
 #define AOM_CTRL_AV1E_SET_TILE_DEPENDENT_ROWS
 
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_LOOPFILTER_V, int)
+#define AOM_CTRL_AV1E_SET_TILE_LOOPFILTER_V
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_LOOPFILTER_H, int)
+#define AOM_CTRL_AV1E_SET_TILE_LOOPFILTER_H
 AOM_CTRL_USE_TYPE(AV1E_SET_TILE_LOOPFILTER, int)
 #define AOM_CTRL_AV1E_SET_TILE_LOOPFILTER
 
@@ -711,12 +1012,24 @@ AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
 AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTER_BITRATE_PCT, unsigned int)
 #define AOM_CTRL_AOME_SET_MAX_INTER_BITRATE_PCT
 
+AOM_CTRL_USE_TYPE(AOME_SET_NUMBER_SPATIAL_LAYERS, int)
+#define AOME_CTRL_AOME_SET_NUMBER_SPATIAL_LAYERS
+
 AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int)
 #define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT
 
 AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int)
 #define AOM_CTRL_AV1E_SET_LOSSLESS
 
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CDEF, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_CDEF
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RESTORATION, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_RESTORATION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DISABLE_TRELLIS_QUANT, unsigned int)
+#define AOM_CTRL_AV1E_SET_DISABLE_TRELLIS_QUANT
+
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int)
 #define AOM_CTRL_AV1E_SET_ENABLE_QM
 
@@ -729,17 +1042,56 @@ AOM_CTRL_USE_TYPE(AV1E_SET_QM_MIN, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_SET_QM_MAX, unsigned int)
 #define AOM_CTRL_AV1E_SET_QM_MAX
 
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_Y, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_Y
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_U, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_U
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_V, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_V
+
 AOM_CTRL_USE_TYPE(AV1E_SET_NUM_TG, unsigned int)
 #define AOM_CTRL_AV1E_SET_NUM_TG
 AOM_CTRL_USE_TYPE(AV1E_SET_MTU, unsigned int)
 #define AOM_CTRL_AV1E_SET_MTU
 
-AOM_CTRL_USE_TYPE(AV1E_SET_DISABLE_TEMPMV, unsigned int)
-#define AOM_CTRL_AV1E_SET_DISABLE_TEMPMV
+AOM_CTRL_USE_TYPE(AV1E_SET_TIMING_INFO_TYPE, aom_timing_info_type_t)
+#define AOM_CTRL_AV1E_SET_TIMING_INFO_TYPE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DF, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DF
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ORDER_HINT, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_ORDER_HINT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_JNT_COMP, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_JNT_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_REF_FRAME_MVS, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_REF_FRAME_MVS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_REF_FRAME_MVS, unsigned int)
+#define AOM_CTRL_AV1E_SET_ALLOW_REF_FRAME_MVS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_WARPED_MOTION, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_WARPED_MOTION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_WARPED_MOTION, unsigned int)
+#define AOM_CTRL_AV1E_SET_ALLOW_WARPED_MOTION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_SUPERRES
 
 AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
 #define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING
 
+AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_ERROR_RESILIENT_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_S_FRAME_MODE
+
 AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
 #define AOM_CTRL_AV1E_SET_AQ_MODE
 
@@ -755,11 +1107,14 @@ AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */
 #define AOM_CTRL_AV1E_SET_TUNE_CONTENT
 
-AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_SPACE, int)
-#define AOM_CTRL_AV1E_SET_COLOR_SPACE
+AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_PRIMARIES, int)
+#define AOM_CTRL_AV1E_SET_COLOR_PRIMARIES
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TRANSFER_CHARACTERISTICS, int)
+#define AOM_CTRL_AV1E_SET_TRANSFER_CHARACTERISTICS
 
-AOM_CTRL_USE_TYPE(AV1E_SET_TRANSFER_FUNCTION, int)
-#define AOM_CTRL_AV1E_SET_TRANSFER_FUNCTION
+AOM_CTRL_USE_TYPE(AV1E_SET_MATRIX_COEFFICIENTS, int)
+#define AOM_CTRL_AV1E_SET_MATRIX_COEFFICIENTS
 
 AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SAMPLE_POSITION, int)
 #define AOM_CTRL_AV1E_SET_CHROMA_SAMPLE_POSITION
@@ -801,6 +1156,15 @@ AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int)
 #define AOM_CTRL_AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST
 
+AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, unsigned int)
+#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TEST_VECTOR
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *)
+#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TABLE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, int)
+#define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE
+
 /*!\endcond */
 /*! @} - end defgroup aom_encoder */
 #ifdef __cplusplus
diff --git a/third_party/aom/aom/aomdx.h b/third_party/aom/aom/aomdx.h
index 4ca1c7285..7ff21a59b 100644
--- a/third_party/aom/aom/aomdx.h
+++ b/third_party/aom/aom/aomdx.h
@@ -26,7 +26,7 @@ extern "C" {
 #endif
 
 /* Include controls common to both the encoder and decoder */
-#include "./aom.h"
+#include "aom/aom.h"
 
 /*!\name Algorithm interface for AV1
  *
@@ -37,11 +37,9 @@ extern aom_codec_iface_t aom_codec_av1_dx_algo;
 extern aom_codec_iface_t *aom_codec_av1_dx(void);
 /*!@} - end algorithm interface member group*/
 
-#ifndef AOM_ACCOUNTING_H_
 /** Data structure that stores bit accounting for debug
  */
 typedef struct Accounting Accounting;
-#endif
 
 #ifndef AOM_INSPECTION_H_
 /** Callback that inspects decoder frame data.
@@ -62,6 +60,30 @@ typedef struct aom_inspect_init {
   void *inspect_ctx;
 } aom_inspect_init;
 
+/*!\brief Structure to hold a tile's start address and size in the bitstream.
+ *
+ * Defines a structure to hold a tile's start address and size in the bitstream.
+ */
+typedef struct aom_tile_data {
+  /*! Tile data size. */
+  size_t coded_tile_data_size;
+  /*! Tile's start address. */
+  const void *coded_tile_data;
+  /*! Extra size information. */
+  size_t extra_size;
+} aom_tile_data;
+
+/*!\brief Structure to hold the external reference frame pointer.
+ *
+ * Define a structure to hold the external reference frame pointer.
+ */
+typedef struct av1_ext_ref_frame {
+  /*! Start pointer of external references. */
+  aom_image_t *img;
+  /*! Number of available external references. */
+  int num;
+} av1_ext_ref_frame_t;
+
 /*!\enum aom_dec_control_id
  * \brief AOM decoder control functions
  *
@@ -84,13 +106,6 @@ enum aom_dec_control_id {
    */
   AOMD_GET_LAST_REF_USED,
 
-  /** decryption function to decrypt encoded buffer data immediately
-   * before decoding. Takes a aom_decrypt_init, which contains
-   * a callback function and opaque context pointer.
-   */
-  AOMD_SET_DECRYPTOR,
-  // AOMD_SET_DECRYPTOR = AOMD_SET_DECRYPTOR,
-
   /** control function to get the dimensions that the current frame is decoded
    * at. This may be different to the intended display size for the frame as
    * specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE). */
@@ -149,6 +164,50 @@ enum aom_dec_control_id {
    */
   AV1_SET_DECODE_TILE_ROW,
   AV1_SET_DECODE_TILE_COL,
+  /** control function to set the tile coding mode. A value that is equal to
+   *  zero indicates the tiles are coded in normal tile mode. A value that is
+   *  1 indicates the tiles are coded in large-scale tile mode.
+   */
+  AV1_SET_TILE_MODE,
+  /** control function to get the frame header information of an encoded frame
+   * in the bitstream. This provides a way to access a frame's header data.
+   */
+  AV1D_GET_FRAME_HEADER_INFO,
+  /** control function to get the start address and size of a tile in the coded
+   * bitstream. This provides a way to access a specific tile's bitstream data.
+   */
+  AV1D_GET_TILE_DATA,
+  /** control function to set the external references' pointers in the decoder.
+   *  This is used while decoding the tile list OBU in large-scale tile coding
+   *  mode.
+   */
+  AV1D_SET_EXT_REF_PTR,
+  /** control function to enable the ext-tile software debug and testing code in
+   * the decoder.
+   */
+  AV1D_EXT_TILE_DEBUG,
+
+  /** control function to indicate whether bitstream is in Annex-B format. */
+  AV1D_SET_IS_ANNEXB,
+
+  /** control function to indicate which operating point to use. A scalable
+   *  stream may define multiple operating points, each of which defines a
+   *  set of temporal and spatial layers to be processed. The operating point
+   *  index may take a value between 0 and operating_points_cnt_minus_1 (which
+   *  is at most 31).
+   */
+  AV1D_SET_OPERATING_POINT,
+
+  /** control function to indicate whether to output one frame per temporal
+   *  unit (the default), or one frame per spatial layer.
+   *  In a scalable stream, each temporal unit corresponds to a single "frame"
+   *  of video, and within a temporal unit there may be multiple spatial layers
+   *  with different versions of that frame.
+   *  For video playback, only the highest-quality version (within the
+   *  selected operating point) is needed, but for some use cases it is useful
+   *  to have access to multiple versions of a frame when they are available.
+   */
+  AV1D_SET_OUTPUT_ALL_LAYERS,
 
   /** control function to set an aom_inspect_cb callback that is invoked each
    * time a frame is decoded.  When compiled without --enable-inspection, this
@@ -159,24 +218,6 @@ enum aom_dec_control_id {
   AOM_DECODER_CTRL_ID_MAX,
 };
 
-/** Decrypt n bytes of data from input -> output, using the decrypt_state
- *  passed in AOMD_SET_DECRYPTOR.
- */
-typedef void (*aom_decrypt_cb)(void *decrypt_state, const unsigned char *input,
-                               unsigned char *output, int count);
-
-/*!\brief Structure to hold decryption state
- *
- * Defines a structure to hold the decryption state and access function.
- */
-typedef struct aom_decrypt_init {
-  /*! Decrypt callback. */
-  aom_decrypt_cb decrypt_cb;
-
-  /*! Decryption state. */
-  void *decrypt_state;
-} aom_decrypt_init;
-
 /*!\cond */
 /*!\brief AOM decoder control function parameter type
  *
@@ -193,10 +234,6 @@ AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *)
 #define AOM_CTRL_AOMD_GET_LAST_REF_USED
 AOM_CTRL_USE_TYPE(AOMD_GET_LAST_QUANTIZER, int *)
 #define AOM_CTRL_AOMD_GET_LAST_QUANTIZER
-AOM_CTRL_USE_TYPE(AOMD_SET_DECRYPTOR, aom_decrypt_init *)
-#define AOM_CTRL_AOMD_SET_DECRYPTOR
-// AOM_CTRL_USE_TYPE(AOMD_SET_DECRYPTOR, aom_decrypt_init *)
-//#define AOM_CTRL_AOMD_SET_DECRYPTOR
 AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
 #define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
 AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *)
@@ -211,6 +248,22 @@ AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
 #define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
 AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int)
 #define AOM_CTRL_AV1_SET_DECODE_TILE_COL
+AOM_CTRL_USE_TYPE(AV1_SET_TILE_MODE, unsigned int)
+#define AOM_CTRL_AV1_SET_TILE_MODE
+AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_HEADER_INFO, aom_tile_data *)
+#define AOM_CTRL_AV1D_GET_FRAME_HEADER_INFO
+AOM_CTRL_USE_TYPE(AV1D_GET_TILE_DATA, aom_tile_data *)
+#define AOM_CTRL_AV1D_GET_TILE_DATA
+AOM_CTRL_USE_TYPE(AV1D_SET_EXT_REF_PTR, av1_ext_ref_frame_t *)
+#define AOM_CTRL_AV1D_SET_EXT_REF_PTR
+AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int)
+#define AOM_CTRL_AV1D_EXT_TILE_DEBUG
+AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int)
+#define AOM_CTRL_AV1D_SET_IS_ANNEXB
+AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int)
+#define AOM_CTRL_AV1D_SET_OPERATING_POINT
+AOM_CTRL_USE_TYPE(AV1D_SET_OUTPUT_ALL_LAYERS, int)
+#define AOM_CTRL_AV1D_SET_OUTPUT_ALL_LAYERS
 AOM_CTRL_USE_TYPE(AV1_SET_INSPECTION_CALLBACK, aom_inspect_init *)
 #define AOM_CTRL_AV1_SET_INSPECTION_CALLBACK
 /*!\endcond */
diff --git a/third_party/aom/aom/exports_com b/third_party/aom/aom/exports_com
index 897b712a7..a87a4536c 100644
--- a/third_party/aom/aom/exports_com
+++ b/third_party/aom/aom/exports_com
@@ -16,3 +16,8 @@ text aom_img_plane_width
 text aom_img_plane_height
 text aom_img_set_rect
 text aom_img_wrap
+text aom_img_alloc_with_border
+text aom_uleb_decode
+text aom_uleb_encode
+text aom_uleb_encode_fixed_size
+text aom_uleb_size_in_bytes
diff --git a/third_party/aom/aom/internal/aom_codec_internal.h b/third_party/aom/aom/internal/aom_codec_internal.h
index 5aacef7ad..84ea4eefa 100644
--- a/third_party/aom/aom/internal/aom_codec_internal.h
+++ b/third_party/aom/aom/internal/aom_codec_internal.h
@@ -104,16 +104,15 @@ typedef aom_codec_err_t (*aom_codec_destroy_fn_t)(aom_codec_alg_priv_t *ctx);
  *
  * \param[in]      data    Pointer to a block of data to parse
  * \param[in]      data_sz Size of the data buffer
- * \param[in,out]  si      Pointer to stream info to update. The size member
- *                         \ref MUST be properly initialized, but \ref MAY be
- *                         clobbered by the algorithm. This parameter \ref MAY
- *                         be NULL.
+ * \param[in,out]  si      Pointer to stream info to update. The is_annexb
+ *                         member \ref MUST be properly initialized. This
+ *                         function sets the rest of the members.
  *
  * \retval #AOM_CODEC_OK
  *     Bitstream is parsable and stream information updated
  */
 typedef aom_codec_err_t (*aom_codec_peek_si_fn_t)(const uint8_t *data,
-                                                  unsigned int data_sz,
+                                                  size_t data_sz,
                                                   aom_codec_stream_info_t *si);
 
 /*!\brief Return information about the current stream.
@@ -121,10 +120,7 @@ typedef aom_codec_err_t (*aom_codec_peek_si_fn_t)(const uint8_t *data,
  * Returns information about the stream that has been parsed during decoding.
  *
  * \param[in]      ctx     Pointer to this instance's context
- * \param[in,out]  si      Pointer to stream info to update. The size member
- *                         \ref MUST be properly initialized, but \ref MAY be
- *                         clobbered by the algorithm. This parameter \ref MAY
- *                         be NULL.
+ * \param[in,out]  si      Pointer to stream info to update
  *
  * \retval #AOM_CODEC_OK
  *     Bitstream is parsable and stream information updated
@@ -195,9 +191,8 @@ typedef const struct aom_codec_ctrl_fn_map {
  */
 typedef aom_codec_err_t (*aom_codec_decode_fn_t)(aom_codec_alg_priv_t *ctx,
                                                  const uint8_t *data,
-                                                 unsigned int data_sz,
-                                                 void *user_priv,
-                                                 long deadline);
+                                                 size_t data_sz,
+                                                 void *user_priv);
 
 /*!\brief Decoded frames iterator
  *
@@ -252,8 +247,7 @@ typedef aom_codec_err_t (*aom_codec_encode_fn_t)(aom_codec_alg_priv_t *ctx,
                                                  const aom_image_t *img,
                                                  aom_codec_pts_t pts,
                                                  unsigned long duration,
-                                                 aom_enc_frame_flags_t flags,
-                                                 unsigned long deadline);
+                                                 aom_enc_frame_flags_t flags);
 typedef const aom_codec_cx_pkt_t *(*aom_codec_get_cx_data_fn_t)(
     aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter);
 
diff --git a/third_party/aom/aom/src/aom_codec.c b/third_party/aom/aom/src/aom_codec.c
index 873d75876..733bffb25 100644
--- a/third_party/aom/aom/src/aom_codec.c
+++ b/third_party/aom/aom/src/aom_codec.c
@@ -15,9 +15,12 @@
  */
 #include <stdarg.h>
 #include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
 #include "aom/aom_integer.h"
 #include "aom/internal/aom_codec_internal.h"
-#include "aom_version.h"
 
 #define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
 
@@ -136,3 +139,19 @@ void aom_internal_error(struct aom_internal_error_info *info,
 void aom_merge_corrupted_flag(int *corrupted, int value) {
   *corrupted |= value;
 }
+
+const char *aom_obu_type_to_string(OBU_TYPE type) {
+  switch (type) {
+    case OBU_SEQUENCE_HEADER: return "OBU_SEQUENCE_HEADER";
+    case OBU_TEMPORAL_DELIMITER: return "OBU_TEMPORAL_DELIMITER";
+    case OBU_FRAME_HEADER: return "OBU_FRAME_HEADER";
+    case OBU_REDUNDANT_FRAME_HEADER: return "OBU_REDUNDANT_FRAME_HEADER";
+    case OBU_FRAME: return "OBU_FRAME";
+    case OBU_TILE_GROUP: return "OBU_TILE_GROUP";
+    case OBU_METADATA: return "OBU_METADATA";
+    case OBU_TILE_LIST: return "OBU_TILE_LIST";
+    case OBU_PADDING: return "OBU_PADDING";
+    default: break;
+  }
+  return "<Invalid OBU Type>";
+}
diff --git a/third_party/aom/aom/src/aom_decoder.c b/third_party/aom/aom/src/aom_decoder.c
index 75eb81089..e0cec10b6 100644
--- a/third_party/aom/aom/src/aom_decoder.c
+++ b/third_party/aom/aom/src/aom_decoder.c
@@ -61,8 +61,7 @@ aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
 }
 
 aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
-                                           const uint8_t *data,
-                                           unsigned int data_sz,
+                                           const uint8_t *data, size_t data_sz,
                                            aom_codec_stream_info_t *si) {
   aom_codec_err_t res;
 
@@ -99,8 +98,7 @@ aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
 }
 
 aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
-                                 unsigned int data_sz, void *user_priv,
-                                 long deadline) {
+                                 size_t data_sz, void *user_priv) {
   aom_codec_err_t res;
 
   /* Sanity checks */
@@ -110,8 +108,7 @@ aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
   else if (!ctx->iface || !ctx->priv)
     res = AOM_CODEC_ERROR;
   else {
-    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv,
-                                 deadline);
+    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv);
   }
 
   return SAVE_STATUS(ctx, res);
diff --git a/third_party/aom/aom/src/aom_encoder.c b/third_party/aom/aom/src/aom_encoder.c
index ac84c888a..22765d6a6 100644
--- a/third_party/aom/aom/src/aom_encoder.c
+++ b/third_party/aom/aom/src/aom_encoder.c
@@ -13,7 +13,7 @@
  * \brief Provides the high level interface to wrap encoder algorithms.
  *
  */
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #if HAVE_FEXCEPT
 #define _GNU_SOURCE
@@ -171,14 +171,14 @@ aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
     }
   }
 
+  /* default values */
+  if (cfg) {
+    cfg->cfg.ext_partition = 1;
+  }
+
   return res;
 }
 
-/* clang-format off */
-#define FLOATING_POINT_BEGIN_SCOPE do {
-#define FLOATING_POINT_END_SCOPE } while (0);
-/* clang-format on */
-
 #if ARCH_X86 || ARCH_X86_64
 /* On X86, disable the x87 unit's internal 80 bit precision for better
  * consistency with the SSE unit's 64 bit precision.
@@ -201,20 +201,21 @@ aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
 #define FLOATING_POINT_RESTORE_EXCEPTIONS
 #endif  // HAVE_FEXCEPT && CONFIG_DEBUG
 
+/* clang-format off */
 #define FLOATING_POINT_INIT    \
-  FLOATING_POINT_BEGIN_SCOPE   \
+  do {                         \
   FLOATING_POINT_SET_PRECISION \
   FLOATING_POINT_SET_EXCEPTIONS
 
 #define FLOATING_POINT_RESTORE      \
   FLOATING_POINT_RESTORE_EXCEPTIONS \
   FLOATING_POINT_RESTORE_PRECISION  \
-  FLOATING_POINT_END_SCOPE
+  } while (0);
+/* clang-format on */
 
 aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
                                  aom_codec_pts_t pts, unsigned long duration,
-                                 aom_enc_frame_flags_t flags,
-                                 unsigned long deadline) {
+                                 aom_enc_frame_flags_t flags) {
   aom_codec_err_t res = AOM_CODEC_OK;
 
   if (!ctx || (img && !duration))
@@ -232,8 +233,8 @@ aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
     FLOATING_POINT_INIT
 
     if (num_enc == 1)
-      res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags,
-                                   deadline);
+      res =
+          ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags);
     else {
       /* Multi-resolution encoding:
        * Encode multi-levels in reverse order. For example,
@@ -247,7 +248,7 @@ aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
 
       for (i = num_enc - 1; i >= 0; i--) {
         if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration,
-                                          flags, deadline)))
+                                          flags)))
           break;
 
         ctx--;
diff --git a/third_party/aom/aom/src/aom_image.c b/third_party/aom/aom/src/aom_image.c
index e1176bd45..437f0241e 100644
--- a/third_party/aom/aom/src/aom_image.c
+++ b/third_party/aom/aom/src/aom_image.c
@@ -16,14 +16,22 @@
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
 
-static aom_image_t *img_alloc_helper(aom_image_t *img, aom_img_fmt_t fmt,
-                                     unsigned int d_w, unsigned int d_h,
-                                     unsigned int buf_align,
-                                     unsigned int stride_align,
-                                     unsigned char *img_data) {
+static INLINE unsigned int align_image_dimension(unsigned int d,
+                                                 unsigned int subsampling,
+                                                 unsigned int size_align) {
+  unsigned int align;
+
+  align = (1 << subsampling) - 1;
+  align = (size_align - 1 > align) ? (size_align - 1) : align;
+  return ((d + align) & ~align);
+}
+
+static aom_image_t *img_alloc_helper(
+    aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h,
+    unsigned int buf_align, unsigned int stride_align, unsigned int size_align,
+    unsigned char *img_data, unsigned int border) {
   unsigned int h, w, s, xcs, ycs, bps;
   unsigned int stride_in_bytes;
-  int align;
 
   /* Treat align==0 like align==1 */
   if (!buf_align) buf_align = 1;
@@ -37,31 +45,22 @@ static aom_image_t *img_alloc_helper(aom_image_t *img, aom_img_fmt_t fmt,
   /* Validate alignment (must be power of 2) */
   if (stride_align & (stride_align - 1)) goto fail;
 
+  /* Treat align==0 like align==1 */
+  if (!size_align) size_align = 1;
+
+  /* Validate alignment (must be power of 2) */
+  if (size_align & (size_align - 1)) goto fail;
+
   /* Get sample size for this format */
   switch (fmt) {
-    case AOM_IMG_FMT_RGB32:
-    case AOM_IMG_FMT_RGB32_LE:
-    case AOM_IMG_FMT_ARGB:
-    case AOM_IMG_FMT_ARGB_LE: bps = 32; break;
-    case AOM_IMG_FMT_RGB24:
-    case AOM_IMG_FMT_BGR24: bps = 24; break;
-    case AOM_IMG_FMT_RGB565:
-    case AOM_IMG_FMT_RGB565_LE:
-    case AOM_IMG_FMT_RGB555:
-    case AOM_IMG_FMT_RGB555_LE:
-    case AOM_IMG_FMT_UYVY:
-    case AOM_IMG_FMT_YUY2:
-    case AOM_IMG_FMT_YVYU: bps = 16; break;
     case AOM_IMG_FMT_I420:
     case AOM_IMG_FMT_YV12:
     case AOM_IMG_FMT_AOMI420:
     case AOM_IMG_FMT_AOMYV12: bps = 12; break;
     case AOM_IMG_FMT_I422:
-    case AOM_IMG_FMT_I440: bps = 16; break;
     case AOM_IMG_FMT_I444: bps = 24; break;
     case AOM_IMG_FMT_I42016: bps = 24; break;
     case AOM_IMG_FMT_I42216:
-    case AOM_IMG_FMT_I44016: bps = 32; break;
     case AOM_IMG_FMT_I44416: bps = 48; break;
     default: bps = 16; break;
   }
@@ -80,22 +79,19 @@ static aom_image_t *img_alloc_helper(aom_image_t *img, aom_img_fmt_t fmt,
 
   switch (fmt) {
     case AOM_IMG_FMT_I420:
-    case AOM_IMG_FMT_I440:
     case AOM_IMG_FMT_YV12:
     case AOM_IMG_FMT_AOMI420:
     case AOM_IMG_FMT_AOMYV12:
-    case AOM_IMG_FMT_I42016:
-    case AOM_IMG_FMT_I44016: ycs = 1; break;
+    case AOM_IMG_FMT_I42016: ycs = 1; break;
     default: ycs = 0; break;
   }
 
   /* Calculate storage sizes given the chroma subsampling */
-  align = (1 << xcs) - 1;
-  w = (d_w + align) & ~align;
-  align = (1 << ycs) - 1;
-  h = (d_h + align) & ~align;
+  w = align_image_dimension(d_w, xcs, size_align);
+  h = align_image_dimension(d_h, ycs, size_align);
+
   s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / 8;
-  s = (s + stride_align - 1) & ~(stride_align - 1);
+  s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1);
   stride_in_bytes = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
 
   /* Allocate the new image */
@@ -112,9 +108,10 @@ static aom_image_t *img_alloc_helper(aom_image_t *img, aom_img_fmt_t fmt,
   img->img_data = img_data;
 
   if (!img_data) {
-    const uint64_t alloc_size = (fmt & AOM_IMG_FMT_PLANAR)
-                                    ? (uint64_t)h * s * bps / 8
-                                    : (uint64_t)h * s;
+    const uint64_t alloc_size =
+        (fmt & AOM_IMG_FMT_PLANAR)
+            ? (uint64_t)(h + 2 * border) * stride_in_bytes * bps / 8
+            : (uint64_t)(h + 2 * border) * stride_in_bytes;
 
     if (alloc_size != (size_t)alloc_size) goto fail;
 
@@ -126,6 +123,7 @@ static aom_image_t *img_alloc_helper(aom_image_t *img, aom_img_fmt_t fmt,
 
   img->fmt = fmt;
   img->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+  // aligned width and aligned height
   img->w = w;
   img->h = h;
   img->x_chroma_shift = xcs;
@@ -137,7 +135,7 @@ static aom_image_t *img_alloc_helper(aom_image_t *img, aom_img_fmt_t fmt,
   img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs;
 
   /* Default viewport to entire image */
-  if (!aom_img_set_rect(img, 0, 0, d_w, d_h)) return img;
+  if (!aom_img_set_rect(img, 0, 0, d_w, d_h, border)) return img;
 
 fail:
   aom_img_free(img);
@@ -147,7 +145,7 @@ fail:
 aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
                            unsigned int d_w, unsigned int d_h,
                            unsigned int align) {
-  return img_alloc_helper(img, fmt, d_w, d_h, align, align, NULL);
+  return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, NULL, 0);
 }
 
 aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
@@ -155,17 +153,29 @@ aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
                           unsigned char *img_data) {
   /* By setting buf_align = 1, we don't change buffer alignment in this
    * function. */
-  return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data);
+  return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, 1, img_data, 0);
+}
+
+aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt,
+                                       unsigned int d_w, unsigned int d_h,
+                                       unsigned int align,
+                                       unsigned int size_align,
+                                       unsigned int border) {
+  return img_alloc_helper(img, fmt, d_w, d_h, align, align, size_align, NULL,
+                          border);
 }
 
 int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
-                     unsigned int w, unsigned int h) {
+                     unsigned int w, unsigned int h, unsigned int border) {
   unsigned char *data;
 
   if (x + w <= img->w && y + h <= img->h) {
     img->d_w = w;
     img->d_h = h;
 
+    x += border;
+    y += border;
+
     /* Calculate plane pointers */
     if (!(img->fmt & AOM_IMG_FMT_PLANAR)) {
       img->planes[AOM_PLANE_PACKED] =
@@ -178,29 +188,30 @@ int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
       if (img->fmt & AOM_IMG_FMT_HAS_ALPHA) {
         img->planes[AOM_PLANE_ALPHA] =
             data + x * bytes_per_sample + y * img->stride[AOM_PLANE_ALPHA];
-        data += img->h * img->stride[AOM_PLANE_ALPHA];
+        data += (img->h + 2 * border) * img->stride[AOM_PLANE_ALPHA];
       }
 
       img->planes[AOM_PLANE_Y] =
           data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y];
-      data += img->h * img->stride[AOM_PLANE_Y];
+      data += (img->h + 2 * border) * img->stride[AOM_PLANE_Y];
 
+      unsigned int uv_border_h = border >> img->y_chroma_shift;
+      unsigned int uv_x = x >> img->x_chroma_shift;
+      unsigned int uv_y = y >> img->y_chroma_shift;
       if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) {
         img->planes[AOM_PLANE_U] =
-            data + (x >> img->x_chroma_shift) * bytes_per_sample +
-            (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
-        data += (img->h >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
+            data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U];
+        data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) *
+                img->stride[AOM_PLANE_U];
         img->planes[AOM_PLANE_V] =
-            data + (x >> img->x_chroma_shift) * bytes_per_sample +
-            (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
+            data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V];
       } else {
         img->planes[AOM_PLANE_V] =
-            data + (x >> img->x_chroma_shift) * bytes_per_sample +
-            (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
-        data += (img->h >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
+            data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V];
+        data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) *
+                img->stride[AOM_PLANE_V];
         img->planes[AOM_PLANE_U] =
-            data + (x >> img->x_chroma_shift) * bytes_per_sample +
-            (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
+            data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U];
       }
     }
     return 0;
diff --git a/third_party/aom/aom/src/aom_integer.c b/third_party/aom/aom/src/aom_integer.c
new file mode 100644
index 000000000..7edfd0de8
--- /dev/null
+++ b/third_party/aom/aom/src/aom_integer.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+static const size_t kMaximumLeb128Size = 8;
+static const uint8_t kLeb128ByteMask = 0x7f;  // Binary: 01111111
+
+// Disallow values larger than 32-bits to ensure consistent behavior on 32 and
+// 64 bit targets: value is typically used to determine buffer allocation size
+// when decoded.
+static const uint64_t kMaximumLeb128Value = UINT32_MAX;
+
+size_t aom_uleb_size_in_bytes(uint64_t value) {
+  size_t size = 0;
+  do {
+    ++size;
+  } while ((value >>= 7) != 0);
+  return size;
+}
+
+int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value,
+                    size_t *length) {
+  if (buffer && value) {
+    *value = 0;
+    for (size_t i = 0; i < kMaximumLeb128Size && i < available; ++i) {
+      const uint8_t decoded_byte = *(buffer + i) & kLeb128ByteMask;
+      *value |= ((uint64_t)decoded_byte) << (i * 7);
+      if ((*(buffer + i) >> 7) == 0) {
+        if (length) {
+          *length = i + 1;
+        }
+
+        // Fail on values larger than 32-bits to ensure consistent behavior on
+        // 32 and 64 bit targets: value is typically used to determine buffer
+        // allocation size.
+        if (*value > UINT32_MAX) return -1;
+
+        return 0;
+      }
+    }
+  }
+
+  // If we get here, either the buffer/value pointers were invalid,
+  // or we ran over the available space
+  return -1;
+}
+
+int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value,
+                    size_t *coded_size) {
+  const size_t leb_size = aom_uleb_size_in_bytes(value);
+  if (value > kMaximumLeb128Value || leb_size > kMaximumLeb128Size ||
+      leb_size > available || !coded_value || !coded_size) {
+    return -1;
+  }
+
+  for (size_t i = 0; i < leb_size; ++i) {
+    uint8_t byte = value & 0x7f;
+    value >>= 7;
+
+    if (value != 0) byte |= 0x80;  // Signal that more bytes follow.
+
+    *(coded_value + i) = byte;
+  }
+
+  *coded_size = leb_size;
+  return 0;
+}
+
+int aom_uleb_encode_fixed_size(uint64_t value, size_t available,
+                               size_t pad_to_size, uint8_t *coded_value,
+                               size_t *coded_size) {
+  if (value > kMaximumLeb128Value || !coded_value || !coded_size ||
+      available < pad_to_size || pad_to_size > kMaximumLeb128Size) {
+    return -1;
+  }
+  const uint64_t limit = 1ULL << (7 * pad_to_size);
+  if (value >= limit) {
+    // Can't encode 'value' within 'pad_to_size' bytes
+    return -1;
+  }
+
+  for (size_t i = 0; i < pad_to_size; ++i) {
+    uint8_t byte = value & 0x7f;
+    value >>= 7;
+
+    if (i < pad_to_size - 1) byte |= 0x80;  // Signal that more bytes follow.
+
+    *(coded_value + i) = byte;
+  }
+
+  assert(value == 0);
+
+  *coded_size = pad_to_size;
+  return 0;
+}
diff --git a/third_party/aom/aom_dsp/add_noise.c b/third_party/aom/aom_dsp/add_noise.c
index 389cf2049..bfb3e7e00 100644
--- a/third_party/aom/aom_dsp/add_noise.c
+++ b/third_party/aom/aom_dsp/add_noise.c
@@ -12,8 +12,8 @@
 #include <math.h>
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
diff --git a/third_party/aom/aom_dsp/ans.h b/third_party/aom/aom_dsp/ans.h
deleted file mode 100644
index a7a2f0eab..000000000
--- a/third_party/aom/aom_dsp/ans.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_ANS_H_
-#define AOM_DSP_ANS_H_
-// Constants, types and utilities for Asymmetric Numeral Systems
-// http://arxiv.org/abs/1311.2540v2
-
-#include <assert.h>
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/prob.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-// Use windowed ANS, size is passed in at initialization
-#define ANS_MAX_SYMBOLS 1
-#define ANS_REVERSE 1
-
-typedef uint8_t AnsP8;
-#define ANS_P8_PRECISION 256u
-#define ANS_P8_SHIFT 8
-#define RANS_PROB_BITS 15
-#define RANS_PRECISION (1u << RANS_PROB_BITS)
-
-// L_BASE is the ANS base state. L_BASE % PRECISION must be 0.
-#define L_BASE (1u << 17)
-#define IO_BASE 256
-// Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 }
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-#endif  // AOM_DSP_ANS_H_
diff --git a/third_party/aom/aom_dsp/ansreader.h b/third_party/aom/aom_dsp/ansreader.h
deleted file mode 100644
index e50c63b2d..000000000
--- a/third_party/aom/aom_dsp/ansreader.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_ANSREADER_H_
-#define AOM_DSP_ANSREADER_H_
-// An implementation of Asymmetric Numeral Systems
-// http://arxiv.org/abs/1311.2540v2
-// Implements decoding of:
-// * rABS (range Asymmetric Binary Systems), a boolean coder
-// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder
-
-#include <assert.h>
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/prob.h"
-#include "aom_dsp/ans.h"
-#include "aom_ports/mem_ops.h"
-#if CONFIG_ACCOUNTING
-#include "av1/decoder/accounting.h"
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-struct AnsDecoder {
-  const uint8_t *buf;
-  int buf_offset;
-  uint32_t state;
-#if ANS_MAX_SYMBOLS
-  int symbols_left;
-  int window_size;
-#endif
-#if CONFIG_ACCOUNTING
-  Accounting *accounting;
-#endif
-};
-
-static INLINE int ans_read_reinit(struct AnsDecoder *const ans);
-
-static INLINE unsigned refill_state(struct AnsDecoder *const ans,
-                                    unsigned state) {
-#if ANS_REVERSE
-  while (state < L_BASE && ans->buf_offset < 0) {
-    state = state * IO_BASE + ans->buf[ans->buf_offset++];
-  }
-#else
-  while (state < L_BASE && ans->buf_offset > 0) {
-    state = state * IO_BASE + ans->buf[--ans->buf_offset];
-  }
-#endif
-  return state;
-}
-
-// Decode one rABS encoded boolean where the probability of the value being zero
-// is p0.
-static INLINE int rabs_read(struct AnsDecoder *ans, AnsP8 p0) {
-#if ANS_MAX_SYMBOLS
-  if (ans->symbols_left-- == 0) {
-    ans_read_reinit(ans);
-    ans->symbols_left--;
-  }
-#endif
-  unsigned state = refill_state(ans, ans->state);
-  const unsigned quotient = state / ANS_P8_PRECISION;
-  const unsigned remainder = state % ANS_P8_PRECISION;
-  const int value = remainder >= p0;
-  const unsigned qp0 = quotient * p0;
-  if (value)
-    state = state - qp0 - p0;
-  else
-    state = qp0 + remainder;
-  ans->state = state;
-  return value;
-}
-
-// Decode one rABS encoded boolean where the probability of the value being zero
-// is one half.
-static INLINE int rabs_read_bit(struct AnsDecoder *ans) {
-#if ANS_MAX_SYMBOLS
-  if (ans->symbols_left-- == 0) {
-    ans_read_reinit(ans);
-    ans->symbols_left--;
-  }
-#endif
-  unsigned state = refill_state(ans, ans->state);
-  const int value = !!(state & 0x80);
-  ans->state = ((state >> 1) & ~0x7F) | (state & 0x7F);
-  return value;
-}
-
-struct rans_dec_sym {
-  uint8_t val;
-  aom_cdf_prob prob;
-  aom_cdf_prob cum_prob;  // not-inclusive
-};
-
-static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf,
-                             aom_cdf_prob rem) {
-  int i;
-  aom_cdf_prob cum_prob = 0, top_prob;
-  // TODO(skal): if critical, could be a binary search.
-  // Or, better, an O(1) alias-table.
-  for (i = 0; rem >= (top_prob = cdf[i]); ++i) {
-    cum_prob = top_prob;
-  }
-  out->val = i;
-  out->prob = top_prob - cum_prob;
-  out->cum_prob = cum_prob;
-}
-
-static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) {
-  unsigned rem;
-  unsigned quo;
-  struct rans_dec_sym sym;
-#if ANS_MAX_SYMBOLS
-  if (ans->symbols_left-- == 0) {
-    ans_read_reinit(ans);
-    ans->symbols_left--;
-  }
-#endif
-  ans->state = refill_state(ans, ans->state);
-  quo = ans->state / RANS_PRECISION;
-  rem = ans->state % RANS_PRECISION;
-  fetch_sym(&sym, tab, rem);
-  ans->state = quo * sym.prob + rem - sym.cum_prob;
-  return sym.val;
-}
-
-static INLINE int ans_read_init(struct AnsDecoder *const ans,
-                                const uint8_t *const buf, int offset) {
-  unsigned x;
-  if (offset < 1) return 1;
-#if ANS_REVERSE
-  ans->buf = buf + offset;
-  ans->buf_offset = -offset;
-  x = buf[0];
-  if ((x & 0x80) == 0) {  // Marker is 0xxx xxxx
-    if (offset < 2) return 1;
-    ans->buf_offset += 2;
-    ans->state = mem_get_be16(buf) & 0x7FFF;
-#if L_BASE * IO_BASE > (1 << 23)
-  } else if ((x & 0xC0) == 0x80) {  // Marker is 10xx xxxx
-    if (offset < 3) return 1;
-    ans->buf_offset += 3;
-    ans->state = mem_get_be24(buf) & 0x3FFFFF;
-  } else {  // Marker is 11xx xxxx
-    if (offset < 4) return 1;
-    ans->buf_offset += 4;
-    ans->state = mem_get_be32(buf) & 0x3FFFFFFF;
-#else
-  } else {  // Marker is 1xxx xxxx
-    if (offset < 3) return 1;
-    ans->buf_offset += 3;
-    ans->state = mem_get_be24(buf) & 0x7FFFFF;
-#endif
-  }
-#else
-  ans->buf = buf;
-  x = buf[offset - 1];
-  if ((x & 0x80) == 0) {  // Marker is 0xxx xxxx
-    if (offset < 2) return 1;
-    ans->buf_offset = offset - 2;
-    ans->state = mem_get_le16(buf + offset - 2) & 0x7FFF;
-  } else if ((x & 0xC0) == 0x80) {  // Marker is 10xx xxxx
-    if (offset < 3) return 1;
-    ans->buf_offset = offset - 3;
-    ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF;
-  } else if ((x & 0xE0) == 0xE0) {  // Marker is 111x xxxx
-    if (offset < 4) return 1;
-    ans->buf_offset = offset - 4;
-    ans->state = mem_get_le32(buf + offset - 4) & 0x1FFFFFFF;
-  } else {
-    // Marker 110x xxxx implies this byte is a superframe marker
-    return 1;
-  }
-#endif  // ANS_REVERSE
-#if CONFIG_ACCOUNTING
-  ans->accounting = NULL;
-#endif
-  ans->state += L_BASE;
-  if (ans->state >= L_BASE * IO_BASE) return 1;
-#if ANS_MAX_SYMBOLS
-  assert(ans->window_size > 1);
-  ans->symbols_left = ans->window_size;
-#endif
-  return 0;
-}
-
-#if ANS_REVERSE
-static INLINE int ans_read_reinit(struct AnsDecoder *const ans) {
-  return ans_read_init(ans, ans->buf + ans->buf_offset, -ans->buf_offset);
-}
-#endif
-
-static INLINE int ans_read_end(const struct AnsDecoder *const ans) {
-  return ans->buf_offset == 0 && ans->state < L_BASE;
-}
-
-static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) {
-  return ans->state < L_BASE / RANS_PRECISION;
-}
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-#endif  // AOM_DSP_ANSREADER_H_
diff --git a/third_party/aom/aom_dsp/answriter.h b/third_party/aom/aom_dsp/answriter.h
deleted file mode 100644
index 353acf1a9..000000000
--- a/third_party/aom/aom_dsp/answriter.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_ANSWRITER_H_
-#define AOM_DSP_ANSWRITER_H_
-// An implementation of Asymmetric Numeral Systems
-// http://arxiv.org/abs/1311.2540v2
-// Implements encoding of:
-// * rABS (range Asymmetric Binary Systems), a boolean coder
-// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder
-
-#include <assert.h>
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/ans.h"
-#include "aom_dsp/prob.h"
-#include "aom_ports/mem_ops.h"
-#include "av1/common/odintrin.h"
-
-#if RANS_PRECISION <= OD_DIVU_DMAX
-#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
-  do {                                                     \
-    quotient = OD_DIVU_SMALL((dividend), (divisor));       \
-    remainder = (dividend) - (quotient) * (divisor);       \
-  } while (0)
-#else
-#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
-  do {                                                     \
-    quotient = (dividend) / (divisor);                     \
-    remainder = (dividend) % (divisor);                    \
-  } while (0)
-#endif
-
-#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor))
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-struct AnsCoder {
-  uint8_t *buf;
-  int buf_offset;
-  uint32_t state;
-};
-
-static INLINE void ans_write_init(struct AnsCoder *const ans,
-                                  uint8_t *const buf) {
-  ans->buf = buf;
-  ans->buf_offset = 0;
-  ans->state = L_BASE;
-}
-
-static INLINE int ans_write_end(struct AnsCoder *const ans) {
-  uint32_t state;
-  int ans_size;
-  assert(ans->state >= L_BASE);
-  assert(ans->state < L_BASE * IO_BASE);
-  state = ans->state - L_BASE;
-  if (state < (1u << 15)) {
-    mem_put_le16(ans->buf + ans->buf_offset, (0x00u << 15) + state);
-    ans_size = ans->buf_offset + 2;
-#if ANS_REVERSE
-#if L_BASE * IO_BASE > (1 << 23)
-  } else if (state < (1u << 22)) {
-    mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state);
-    ans_size = ans->buf_offset + 3;
-  } else if (state < (1u << 30)) {
-    mem_put_le32(ans->buf + ans->buf_offset, (0x03u << 30) + state);
-    ans_size = ans->buf_offset + 4;
-#else
-  } else if (state < (1u << 23)) {
-    mem_put_le24(ans->buf + ans->buf_offset, (0x01u << 23) + state);
-    ans_size = ans->buf_offset + 3;
-#endif
-#else
-  } else if (state < (1u << 22)) {
-    mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state);
-    ans_size = ans->buf_offset + 3;
-  } else if (state < (1u << 29)) {
-    mem_put_le32(ans->buf + ans->buf_offset, (0x07u << 29) + state);
-    ans_size = ans->buf_offset + 4;
-#endif
-  } else {
-    assert(0 && "State is too large to be serialized");
-    return ans->buf_offset;
-  }
-#if ANS_REVERSE
-  {
-    int i;
-    uint8_t tmp;
-    for (i = 0; i < (ans_size >> 1); i++) {
-      tmp = ans->buf[i];
-      ans->buf[i] = ans->buf[ans_size - 1 - i];
-      ans->buf[ans_size - 1 - i] = tmp;
-    }
-    ans->buf += ans_size;
-    ans->buf_offset = 0;
-    ans->state = L_BASE;
-  }
-#endif
-  return ans_size;
-}
-
-// Write one boolean using rABS where p0 is the probability of the value being
-// zero.
-static INLINE void rabs_write(struct AnsCoder *ans, int value, AnsP8 p0) {
-  const AnsP8 p = ANS_P8_PRECISION - p0;
-  const unsigned l_s = value ? p : p0;
-  unsigned state = ans->state;
-  while (state >= L_BASE / ANS_P8_PRECISION * IO_BASE * l_s) {
-    ans->buf[ans->buf_offset++] = state % IO_BASE;
-    state /= IO_BASE;
-  }
-  const unsigned quotient = ANS_DIV8(state, l_s);
-  const unsigned remainder = state - quotient * l_s;
-  ans->state = quotient * ANS_P8_PRECISION + remainder + (value ? p0 : 0);
-}
-
-// Encode one symbol using rANS.
-// cum_prob: The cumulative probability before this symbol (the offset of
-// the symbol in the symbol cycle)
-// prob: The probability of this symbol (l_s from the paper)
-// RANS_PRECISION takes the place of m from the paper.
-static INLINE void rans_write(struct AnsCoder *ans, aom_cdf_prob cum_prob,
-                              aom_cdf_prob prob) {
-  unsigned quotient, remainder;
-  while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * prob) {
-    ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
-    ans->state /= IO_BASE;
-  }
-  ANS_DIVREM(quotient, remainder, ans->state, prob);
-  ans->state = quotient * RANS_PRECISION + remainder + cum_prob;
-}
-
-#undef ANS_DIV8
-#undef ANS_DIVREM
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-#endif  // AOM_DSP_ANSWRITER_H_
diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c
index c903ea52d..bba37e227 100644
--- a/third_party/aom/aom_dsp/aom_convolve.c
+++ b/third_party/aom/aom_dsp/aom_convolve.c
@@ -12,73 +12,40 @@
 #include <assert.h>
 #include <string.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_ports/mem.h"
 
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
+}
+
+static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
+                                      const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+  return sum;
+}
+
 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *x_filters, int x0_q4,
                            int x_step_q4, int w, int h) {
-  int x, y;
   src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
+  for (int y = 0; y < h; ++y) {
     int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
+    for (int x = 0; x < w; ++x) {
       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *x_filters, int x0_qn,
-                                   int x_step_qn, int w, int h) {
-  int x, y;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_qn = x0_qn;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];  // q8
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *const x_filter = x_filters[x_filter_idx];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+      const int sum = horz_scalar_product(src_x, x_filter);
       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      x_qn += x_step_qn;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const InterpKernel *x_filters, int x0_q4,
-                               int x_step_q4, int w, int h) {
-  int x, y;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = ROUND_POWER_OF_TWO(
-          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
       x_q4 += x_step_q4;
     }
     src += src_stride;
@@ -86,97 +53,19 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
-static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint8_t *dst, ptrdiff_t dst_stride,
-                                       const InterpKernel *x_filters, int x0_qn,
-                                       int x_step_qn, int w, int h) {
-  int x, y;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_qn = x0_qn;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *const x_filter = x_filters[x_filter_idx];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = ROUND_POWER_OF_TWO(
-          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
-      x_qn += x_step_qn;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *y_filters, int y0_q4,
                           int y_step_q4, int w, int h) {
-  int x, y;
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
 
-  for (x = 0; x < w; ++x) {
+  for (int x = 0; x < w; ++x) {
     int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
+    for (int y = 0; y < h; ++y) {
       const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const InterpKernel *y_filters, int y0_qn,
-                                  int y_step_qn, int w, int h) {
-  int x, y;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_qn = y0_qn;
-    for (y = 0; y < h; ++y) {
-      const unsigned char *src_y =
-          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter =
-          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
+      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      y_qn += y_step_qn;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const InterpKernel *y_filters, int y0_q4,
-                              int y_step_q4, int w, int h) {
-  int x, y;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
-          dst[y * dst_stride] +
-              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
-          1);
       y_q4 += y_step_q4;
     }
     ++src;
@@ -184,103 +73,6 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
-static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const InterpKernel *y_filters, int y0_qn,
-                                      int y_step_qn, int w, int h) {
-  int x, y;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_qn = y0_qn;
-    for (y = 0; y < h; ++y) {
-      const unsigned char *src_y =
-          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter =
-          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
-          dst[y * dst_stride] +
-              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
-          1);
-      y_qn += y_step_qn;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,
-                     int x0_q4, int x_step_q4,
-                     const InterpKernel *const y_filters, int y0_q4,
-                     int y_step_q4, int w, int h) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
-                 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
-                 intermediate_height);
-  convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
-                dst_stride, y_filters, y0_q4, y_step_q4, w, h);
-}
-
-static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_qn,
-                             int x_step_qn, const InterpKernel *const y_filters,
-                             int y0_qn, int y_step_qn, int w, int h) {
-  // TODO(afergs): Update comment here
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_qn = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(y_step_qn <= SCALE_SUBPEL_BITS * 2);
-  assert(x_step_qn <= SCALE_SUBPEL_BITS * 2);
-
-  convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
-                         temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
-                         intermediate_height);
-  convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
-                        dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
-}
-
 static const InterpKernel *get_filter_base(const int16_t *filter) {
   // NOTE: This assumes that the filter table is 256-byte aligned.
   // TODO(agrange) Modify to make independent of table alignment.
@@ -306,52 +98,6 @@ void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                  w, h);
 }
 
-void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int subpel_x,
-                                 int x_step_qn, const int16_t *filter_y,
-                                 int subpel_y, int y_step_qn, int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-
-  (void)subpel_y;
-  (void)filter_y;
-  (void)y_step_qn;
-
-  convolve_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
-                         x_step_qn, w, h);
-}
-
-void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
-                               int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                     x_step_q4, w, h);
-}
-
-void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int subpel_x,
-                                     int x_step_qn, const int16_t *filter_y,
-                                     int subpel_y, int y_step_qn, int w,
-                                     int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-
-  (void)subpel_y;
-  (void)filter_y;
-  (void)y_step_qn;
-
-  convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x,
-                             subpel_x, x_step_qn, w, h);
-}
-
 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
@@ -367,109 +113,6 @@ void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                 w, h);
 }
 
-void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int subpel_x,
-                                int x_step_qn, const int16_t *filter_y,
-                                int subpel_y, int y_step_qn, int w, int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-
-  (void)subpel_x;
-  (void)filter_x;
-  (void)x_step_qn;
-
-  convolve_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, subpel_y,
-                        y_step_qn, w, h);
-}
-
-void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                    y_step_q4, w, h);
-}
-
-void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int subpel_x,
-                                    int x_step_qn, const int16_t *filter_y,
-                                    int subpel_y, int y_step_qn, int w, int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-
-  (void)subpel_x;
-  (void)filter_x;
-  (void)x_step_qn;
-
-  convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, filters_y,
-                            subpel_y, y_step_qn, w, h);
-}
-
-void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const int16_t *filter_x,
-                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                     int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-           filters_y, y0_q4, y_step_q4, w, h);
-}
-
-void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int subpel_x, int x_step_qn,
-                           const int16_t *filter_y, int subpel_y, int y_step_qn,
-                           int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-
-  convolve_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
-                   x_step_qn, filters_y, subpel_y, y_step_qn, w, h);
-}
-
-void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
-                  filter_y, y_step_q4, w, h);
-  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
-                     h);
-}
-
-void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int subpel_x,
-                               int x_step_qn, const int16_t *filter_y,
-                               int subpel_y, int y_step_qn, int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
-                        x_step_qn, filter_y, subpel_y, y_step_qn, w, h);
-  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
-                     h);
-}
-
 void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                          ptrdiff_t dst_stride, const int16_t *filter_x,
                          int filter_x_stride, const int16_t *filter_y,
@@ -488,330 +131,34 @@ void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
   }
 }
 
-void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int filter_x_stride, const int16_t *filter_y,
-                        int filter_y_stride, int w, int h) {
-  int x, y;
-
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                        int w, int h) {
-  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                       ptrdiff_t dst_stride, const int16_t *filter_x,
-                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                       int w, int h) {
-  aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                       filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const int16_t *filter_x,
-                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                     int w, int h) {
-  aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                  filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
-                            int h) {
-  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
-  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                      filter_y, y_step_q4, w, h);
-}
-
-// TODO(afergs): Make sure this works too
-#if CONFIG_LOOP_RESTORATION
-static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *x_filters, int x0_q4,
-                                   int x_step_q4, int w, int h) {
-  int x, y, k;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
-                          src_x[SUBPEL_TAPS / 2 - 1]);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const InterpKernel *y_filters, int y0_q4,
-                                  int y_step_q4, int w, int h) {
-  int x, y, k;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] =
-          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
-                     src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_q4,
-                             int x_step_q4, const InterpKernel *const y_filters,
-                             int y0_q4, int y_step_q4, int w, int h) {
-  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
-                         temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
-                         intermediate_height);
-  convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
-                        dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                         x_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                        y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
-static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint16_t *dst, ptrdiff_t dst_stride,
-                                       const InterpKernel *x_filters, int x0_q4,
-                                       int x_step_q4, int w, int h) {
-  const int bd = 8;
-  int x, y, k;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
-                (1 << (bd + FILTER_BITS - 1));
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] =
-          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
-                          0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const InterpKernel *y_filters, int y0_q4,
-                                      int y_step_q4, int w, int h) {
-  const int bd = 8;
-  int x, y, k;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int sum =
-          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
-          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] =
-          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const InterpKernel *const x_filters, int x0_q4,
-                                 int x_step_q4,
-                                 const InterpKernel *const y_filters, int y0_q4,
-                                 int y_step_q4, int w, int h) {
-  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                             src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
-                             x_step_q4, w, intermediate_height);
-  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                            y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint16_t *dst, ptrdiff_t dst_stride,
-                                       const int16_t *filter_x, int x_step_q4,
-                                       const int16_t *filter_y, int y_step_q4,
-                                       int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                             x_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x, int x_step_q4,
-                                      const int16_t *filter_y, int y_step_q4,
-                                      int w, int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                            y_step_q4, w, h);
+static INLINE int highbd_vert_scalar_product(const uint16_t *a,
+                                             ptrdiff_t a_stride,
+                                             const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+  return sum;
 }
 
-void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4, int w,
-                                 int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                       x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
+static INLINE int highbd_horz_scalar_product(const uint16_t *a,
+                                             const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
 }
-#endif  // CONFIG_LOOP_RESTORATION
 
-// TODO(afergs): Make sure this works too
-#if CONFIG_HIGHBITDEPTH
 static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                   uint8_t *dst8, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_q4,
                                   int x_step_q4, int w, int h, int bd) {
-  int x, y;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
+  for (int y = 0; y < h; ++y) {
     int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
+    for (int x = 0; x < w; ++x) {
       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+      const int sum = highbd_horz_scalar_product(src_x, x_filter);
       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
       x_q4 += x_step_q4;
     }
@@ -820,47 +167,19 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
-                                      uint8_t *dst8, ptrdiff_t dst_stride,
-                                      const InterpKernel *x_filters, int x0_q4,
-                                      int x_step_q4, int w, int h, int bd) {
-  int x, y;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = ROUND_POWER_OF_TWO(
-          dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
-          1);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_q4,
                                  int y_step_q4, int w, int h, int bd) {
-  int x, y;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (x = 0; x < w; ++x) {
+  for (int x = 0; x < w; ++x) {
     int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
+    for (int y = 0; y < h; ++y) {
       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
+      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
       dst[y * dst_stride] =
           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
       y_q4 += y_step_q4;
@@ -870,67 +189,6 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
-                                     uint8_t *dst8, ptrdiff_t dst_stride,
-                                     const InterpKernel *y_filters, int y0_q4,
-                                     int y_step_q4, int w, int h, int bd) {
-  int x, y;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
-          dst[y * dst_stride] +
-              clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
-          1);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const InterpKernel *const x_filters, int x0_q4,
-                            int x_step_q4, const InterpKernel *const y_filters,
-                            int y0_q4, int y_step_q4, int w, int h, int bd) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
-                        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
-                        x_step_q4, w, intermediate_height, bd);
-  highbd_convolve_vert(
-      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
-}
-
 void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
@@ -945,20 +203,6 @@ void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                         x_step_q4, w, h, bd);
 }
 
-void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x, int x_step_q4,
-                                      const int16_t *filter_y, int y_step_q4,
-                                      int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  (void)filter_y;
-  (void)y_step_q4;
-
-  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                            x_step_q4, w, h, bd);
-}
-
 void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
@@ -973,51 +217,6 @@ void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                        y_step_q4, w, h, bd);
 }
 
-void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int x_step_q4,
-                                     const int16_t *filter_y, int y_step_q4,
-                                     int w, int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-  (void)filter_x;
-  (void)x_step_q4;
-
-  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                           y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
-                            int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-                  filters_y, y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4, int w,
-                                int h, int bd) {
-  // Fixed size intermediate buffer places limits on parameters.
-  DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
-                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
-  aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
-                            dst_stride, NULL, 0, NULL, 0, w, h, bd);
-}
-
 void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int filter_x_stride,
@@ -1038,295 +237,3 @@ void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
     dst += dst_stride;
   }
 }
-
-void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
-                               uint8_t *dst8, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int filter_x_stride,
-                               const int16_t *filter_y, int filter_y_stride,
-                               int w, int h, int bd) {
-  int x, y;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-#if CONFIG_LOOP_RESTORATION
-static void highbd_convolve_add_src_horiz(const uint8_t *src8,
-                                          ptrdiff_t src_stride, uint8_t *dst8,
-                                          ptrdiff_t dst_stride,
-                                          const InterpKernel *x_filters,
-                                          int x0_q4, int x_step_q4, int w,
-                                          int h, int bd) {
-  int x, y, k;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = clip_pixel_highbd(
-          ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
-          bd);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void highbd_convolve_add_src_vert(const uint8_t *src8,
-                                         ptrdiff_t src_stride, uint8_t *dst8,
-                                         ptrdiff_t dst_stride,
-                                         const InterpKernel *y_filters,
-                                         int y0_q4, int y_step_q4, int w, int h,
-                                         int bd) {
-  int x, y, k;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] =
-          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
-                                src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
-                            bd);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *const x_filters,
-                                    int x0_q4, int x_step_q4,
-                                    const InterpKernel *const y_filters,
-                                    int y0_q4, int y_step_q4, int w, int h,
-                                    int bd) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                                src_stride, CONVERT_TO_BYTEPTR(temp),
-                                MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
-                                intermediate_height, bd);
-  highbd_convolve_add_src_vert(
-      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_horiz_c(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  (void)filter_y;
-  (void)y_step_q4;
-
-  highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x,
-                                x0_q4, x_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src,
-                                         ptrdiff_t src_stride, uint8_t *dst,
-                                         ptrdiff_t dst_stride,
-                                         const int16_t *filter_x, int x_step_q4,
-                                         const int16_t *filter_y, int y_step_q4,
-                                         int w, int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-  (void)filter_x;
-  (void)x_step_q4;
-
-  highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y,
-                               y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int x_step_q4,
-                                    const int16_t *filter_y, int y_step_q4,
-                                    int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                          x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
-}
-
-static void highbd_convolve_add_src_horiz_hip(
-    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
-    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
-    int x_step_q4, int w, int h, int bd) {
-  const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
-  int x, y, k;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
-                (1 << (bd + FILTER_BITS - 1));
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] =
-          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
-                          0, extraprec_clamp_limit - 1);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void highbd_convolve_add_src_vert_hip(
-    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
-    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
-    int y_step_q4, int w, int h, int bd) {
-  int x, y, k;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int sum =
-          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
-          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = clip_pixel_highbd(
-          ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void highbd_convolve_add_src_hip(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4,
-    int x_step_q4, const InterpKernel *const y_filters, int y0_q4,
-    int y_step_q4, int w, int h, int bd) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  highbd_convolve_add_src_horiz_hip(
-      src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE,
-      x_filters, x0_q4, x_step_q4, w, intermediate_height, bd);
-  highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                                   MAX_SB_SIZE, dst, dst_stride, y_filters,
-                                   y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_horiz_hip_c(
-    const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst,
-    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  (void)filter_y;
-  (void)y_step_q4;
-
-  highbd_convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x,
-                                    x0_q4, x_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_vert_hip_c(
-    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-  (void)filter_x;
-  (void)x_step_q4;
-
-  highbd_convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y,
-                                   y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src,
-                                        ptrdiff_t src_stride, uint8_t *dst,
-                                        ptrdiff_t dst_stride,
-                                        const int16_t *filter_x, int x_step_q4,
-                                        const int16_t *filter_y, int y_step_q4,
-                                        int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x,
-                              x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w,
-                              h, bd);
-}
-
-#endif  // CONFIG_LOOP_RESTORATION
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/aom_convolve.h b/third_party/aom/aom_dsp/aom_convolve.h
index c7943dced..6f5b888e4 100644
--- a/third_party/aom/aom_dsp/aom_convolve.h
+++ b/third_party/aom/aom_dsp/aom_convolve.h
@@ -11,7 +11,8 @@
 #ifndef AOM_DSP_AOM_CONVOLVE_H_
 #define AOM_DSP_AOM_CONVOLVE_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 #ifdef __cplusplus
@@ -30,16 +31,11 @@ extern "C" {
 // --Must round-up because block may be located at sub-pixel position.
 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+// TODO(wtc): Update the above comment to explain the value 263 used in aom.
 #define MAX_EXT_SIZE 263
-#else
-#define MAX_EXT_SIZE 135
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 
-#if CONFIG_AV1 && CONFIG_LOOP_RESTORATION
 #define EXTRAPREC_BITS 2
 #define EXTRAPREC_CLAMP_LIMIT(bd) (1 << ((bd) + 1 + EXTRAPREC_BITS))
-#endif
 
 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
@@ -47,13 +43,11 @@ typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h);
 
-#if CONFIG_HIGHBITDEPTH
 typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h, int bd);
-#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake
index 11b55caa7..768875f7d 100644
--- a/third_party/aom/aom_dsp/aom_dsp.cmake
+++ b/third_party/aom/aom_dsp/aom_dsp.cmake
@@ -1,475 +1,242 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_AOM_DSP_AOM_DSP_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_DSP_AOM_DSP_CMAKE_)
+  return()
+endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_
 set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1)
 
-set(AOM_DSP_COMMON_SOURCES
-    "${AOM_ROOT}/aom_dsp/aom_convolve.c"
-    "${AOM_ROOT}/aom_dsp/aom_convolve.h"
-    "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
-    "${AOM_ROOT}/aom_dsp/aom_filter.h"
-    "${AOM_ROOT}/aom_dsp/aom_simd.h"
-    "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
-    "${AOM_ROOT}/aom_dsp/blend.h"
-    "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
-    "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
-    "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
-    "${AOM_ROOT}/aom_dsp/intrapred.c"
-    "${AOM_ROOT}/aom_dsp/intrapred_common.h"
-    "${AOM_ROOT}/aom_dsp/loopfilter.c"
-    "${AOM_ROOT}/aom_dsp/prob.c"
-    "${AOM_ROOT}/aom_dsp/prob.h"
-    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
-    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
-    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
-    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
-    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
-    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
-    "${AOM_ROOT}/aom_dsp/subtract.c"
-    "${AOM_ROOT}/aom_dsp/txfm_common.h"
-    "${AOM_ROOT}/aom_dsp/x86/txfm_common_intrin.h")
-
-set(AOM_DSP_COMMON_ASM_SSE2
-    "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm"
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
-    "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm")
-
-set(AOM_DSP_COMMON_INTRIN_SSE2
-    "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
-    "${AOM_ROOT}/aom_dsp/x86/convolve.h"
-    "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
-    "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
-    "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
-    "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c")
-
-set(AOM_DSP_COMMON_ASM_SSSE3
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm"
-    "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.asm")
-
-set(AOM_DSP_COMMON_INTRIN_SSSE3
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
-    "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c"
-    "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c")
-
-set(AOM_DSP_COMMON_INTRIN_SSE4_1
-    "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
-    "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
-    "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
-
-set(AOM_DSP_COMMON_INTRIN_AVX2
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/inv_txfm_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
-    "${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h"
-    "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h")
-
-if (NOT CONFIG_PARALLEL_DEBLOCKING)
-  set(AOM_DSP_COMMON_INTRIN_AVX2
-      ${AOM_DSP_COMMON_INTRIN_AVX2}
-      "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c")
-endif ()
-
-if (NOT CONFIG_EXT_PARTITION)
-  set(AOM_DSP_COMMON_ASM_NEON
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm"
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm"
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm"
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm")
-endif ()
-
-set(AOM_DSP_COMMON_ASM_NEON
-    ${AOM_DSP_COMMON_ASM_NEON}
-    "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm")
-
-if (NOT CONFIG_PARALLEL_DEBLOCKING)
-  set(AOM_DSP_COMMON_ASM_NEON
-      ${AOM_DSP_COMMON_ASM_NEON}
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm")
-endif ()
-
-if (NOT CONFIG_EXT_PARTITION)
-  set(AOM_DSP_COMMON_INTRIN_NEON
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c")
-endif ()
-
-set(AOM_DSP_COMMON_INTRIN_NEON
-    ${AOM_DSP_COMMON_INTRIN_NEON}
-    "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/idct16x16_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")
-
-if (NOT CONFIG_PARALLEL_DEBLOCKING)
-  set(AOM_DSP_COMMON_INTRIN_NEON
-      ${AOM_DSP_COMMON_INTRIN_NEON}
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c")
-endif ()
-
-if ("${AOM_TARGET_CPU}" STREQUAL "arm64")
-  if (NOT CONFIG_EXT_PARTITION)
-    set(AOM_DSP_COMMON_INTRIN_NEON
-        ${AOM_DSP_COMMON_INTRIN_NEON}
-        "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c"
-        "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
-        "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c"
-        "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c")
-  endif ()
-
-  set(AOM_DSP_COMMON_INTRIN_NEON
-      ${AOM_DSP_COMMON_INTRIN_NEON}
-      "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c")
-
-  if (NOT CONFIG_PARALLEL_DEBLOCKING)
-    set(AOM_DSP_COMMON_INTRIN_NEON
-        ${AOM_DSP_COMMON_INTRIN_NEON}
-        "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c"
-        "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c"
-        "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c")
-  endif ()
-endif ()
-
-set(AOM_DSP_COMMON_INTRIN_DSPR2
-    "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_horiz_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_horiz_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h")
-
-if (NOT CONFIG_PARALLEL_DEBLOCKING)
-  set(AOM_DSP_COMMON_INTRIN_DSPR2
-      ${AOM_DSP_COMMON_INTRIN_DSPR2}
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c")
-endif ()
-
-set(AOM_DSP_COMMON_INTRIN_MSA
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_vert_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve_avg_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h"
-    "${AOM_ROOT}/aom_dsp/mips/fwd_dct32x32_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.h"
-    "${AOM_ROOT}/aom_dsp/mips/idct16x16_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/idct32x32_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/idct4x4_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/idct8x8_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/inv_txfm_msa.h"
-    "${AOM_ROOT}/aom_dsp/mips/macros_msa.h"
-    "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h")
-
-if (NOT CONFIG_PARALLEL_DEBLOCKING)
-  set(AOM_DSP_COMMON_INTRIN_MSA
-      ${AOM_DSP_COMMON_INTRIN_MSA}
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h")
-endif ()
-
-if (CONFIG_HIGHBITDEPTH)
-  set(AOM_DSP_COMMON_ASM_SSE2
-      ${AOM_DSP_COMMON_ASM_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm")
-
-  set(AOM_DSP_COMMON_INTRIN_SSE2
-      ${AOM_DSP_COMMON_INTRIN_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
-      "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")
-
-  set(AOM_DSP_COMMON_INTRIN_SSSE3
-      ${AOM_DSP_COMMON_INTRIN_SSSE3}
-      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_ssse3.c")
-
-  set(AOM_DSP_COMMON_INTRIN_AVX2
-      ${AOM_DSP_COMMON_INTRIN_AVX2}
-      "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
-else ()
-  set(AOM_DSP_COMMON_INTRIN_DSPR2
-      ${AOM_DSP_COMMON_INTRIN_DSPR2}
-      "${AOM_ROOT}/aom_dsp/mips/itrans16_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/itrans32_cols_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/itrans32_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/itrans4_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/itrans8_dspr2.c")
-endif ()
-
-if (CONFIG_ANS)
-  set(AOM_DSP_COMMON_SOURCES
-      ${AOM_DSP_COMMON_SOURCES}
-      "${AOM_ROOT}/aom_dsp/ans.h")
-else ()
-  set(AOM_DSP_COMMON_SOURCES
-      ${AOM_DSP_COMMON_SOURCES}
-      "${AOM_ROOT}/aom_dsp/entcode.c"
-      "${AOM_ROOT}/aom_dsp/entcode.h")
-endif ()
-
-if (CONFIG_AV1)
-  set(AOM_DSP_COMMON_SOURCES
-      ${AOM_DSP_COMMON_SOURCES}
-      "${AOM_ROOT}/aom_dsp/inv_txfm.c"
-      "${AOM_ROOT}/aom_dsp/inv_txfm.h")
-
-  set(AOM_DSP_COMMON_ASM_SSE2
-      ${AOM_DSP_COMMON_ASM_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
-
-  set(AOM_DSP_COMMON_INTRIN_SSE2
-      ${AOM_DSP_COMMON_INTRIN_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.c"
-      "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.h")
-endif ()
-
-if (CONFIG_AV1_DECODER)
-  set(AOM_DSP_DECODER_SOURCES
-      "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
-      "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
-      "${AOM_ROOT}/aom_dsp/bitreader.h"
-      "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
-      "${AOM_ROOT}/aom_dsp/bitreader_buffer.h")
-
-  if (CONFIG_ANS)
-    set(AOM_DSP_DECODER_SOURCES
-        ${AOM_DSP_DECODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/ansreader.h")
-  else ()
-    set(AOM_DSP_DECODER_SOURCES
-        ${AOM_DSP_DECODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/daalaboolreader.c"
-        "${AOM_ROOT}/aom_dsp/daalaboolreader.h"
-        "${AOM_ROOT}/aom_dsp/entdec.c"
-        "${AOM_ROOT}/aom_dsp/entdec.h")
-  endif ()
-endif ()
-
-if (CONFIG_AV1_ENCODER)
-  set(AOM_DSP_ENCODER_SOURCES
-      "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
-      "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
-      "${AOM_ROOT}/aom_dsp/bitwriter.h"
-      "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
-      "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
-      "${AOM_ROOT}/aom_dsp/psnr.c"
-      "${AOM_ROOT}/aom_dsp/psnr.h"
-      "${AOM_ROOT}/aom_dsp/sad.c"
-      "${AOM_ROOT}/aom_dsp/variance.c"
-      "${AOM_ROOT}/aom_dsp/variance.h")
-
-  set(AOM_DSP_ENCODER_ASM_SSE2
-      ${AOM_DSP_ENCODER_ASM_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm")
-
-  set(AOM_DSP_ENCODER_INTRIN_SSE2
-      "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c")
-
-  set(AOM_DSP_ENCODER_ASM_SSSE3
-      "${AOM_ROOT}/aom_dsp/x86/sad_ssse3.asm")
-
-  set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64
-      "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
-      "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm")
-
-  set(AOM_DSP_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/aom_dsp/x86/sad_sse3.asm")
-  set(AOM_DSP_ENCODER_ASM_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/sad_sse4.asm")
-
-  set(AOM_DSP_ENCODER_INTRIN_AVX2
-      "${AOM_ROOT}/aom_dsp/x86/fwd_dct32x32_impl_avx2.h"
-      "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.h"
-      "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c")
-
-  if (CONFIG_AV1_ENCODER)
-    set(AOM_DSP_ENCODER_SOURCES
-        ${AOM_DSP_ENCODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/avg.c"
-        "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
-        "${AOM_ROOT}/aom_dsp/fwd_txfm.h"
-        "${AOM_ROOT}/aom_dsp/quantize.c"
-        "${AOM_ROOT}/aom_dsp/quantize.h"
-        "${AOM_ROOT}/aom_dsp/sum_squares.c")
-
-    set(AOM_DSP_ENCODER_INTRIN_SSE2
-        ${AOM_DSP_ENCODER_INTRIN_SSE2}
-        "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/fwd_dct32_8cols_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
-        "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
-        "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
-        "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c")
-
-    set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64
-        ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}
-        "${AOM_ROOT}/aom_dsp/x86/avg_ssse3_x86_64.asm"
-        "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
-
-    set(AOM_DSP_ENCODER_AVX_ASM_X86_64
-        ${AOM_DSP_ENCODER_AVX_ASM_X86_64}
-        "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")
-
-    set(AOM_DSP_ENCODER_INTRIN_MSA
-        "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
-        "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
-        "${AOM_ROOT}/aom_dsp/mips/variance_msa.c"
-        "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c")
-
-      set(AOM_DSP_ENCODER_INTRIN_SSSE3
-          ${AOM_DSP_ENCODER_INTRIN_SSSE3}
-          "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
-          "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c")
-
-    if (CONFIG_HIGHBITDEPTH)
-      set(AOM_DSP_ENCODER_INTRIN_SSE2
-          ${AOM_DSP_ENCODER_INTRIN_SSE2}
-          "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c")
-    endif ()
-  endif ()
-
-  if (CONFIG_HIGHBITDEPTH)
-    set(AOM_DSP_ENCODER_ASM_SSE2
-        ${AOM_DSP_ENCODER_ASM_SSE2}
-        "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
-        "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
-        "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
-        "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm")
-
-    set(AOM_DSP_ENCODER_INTRIN_SSE2
-        ${AOM_DSP_ENCODER_INTRIN_SSE2}
-        "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c")
-
-    set(AOM_DSP_ENCODER_INTRIN_SSE4_1
-        ${AOM_DSP_ENCODER_INTRIN_SSE4_1}
-        "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c")
-
-    set(AOM_DSP_ENCODER_INTRIN_AVX2
-        ${AOM_DSP_ENCODER_INTRIN_AVX2}
-        "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c")
-  endif ()
-
-  if (CONFIG_ANS)
-    set(AOM_DSP_ENCODER_SOURCES
-        ${AOM_DSP_ENCODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/answriter.h"
-        "${AOM_ROOT}/aom_dsp/buf_ans.c"
-        "${AOM_ROOT}/aom_dsp/buf_ans.h")
-  else ()
-    set(AOM_DSP_ENCODER_SOURCES
-        ${AOM_DSP_ENCODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/daalaboolwriter.c"
-        "${AOM_ROOT}/aom_dsp/daalaboolwriter.h"
-        "${AOM_ROOT}/aom_dsp/entenc.c"
-        "${AOM_ROOT}/aom_dsp/entenc.h")
-  endif ()
-
-  if (CONFIG_INTERNAL_STATS)
-    set(AOM_DSP_ENCODER_SOURCES
-        ${AOM_DSP_ENCODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/fastssim.c"
-        "${AOM_ROOT}/aom_dsp/psnrhvs.c"
-        "${AOM_ROOT}/aom_dsp/ssim.c"
-        "${AOM_ROOT}/aom_dsp/ssim.h")
-  endif ()
-endif ()
-
-if (CONFIG_LOOP_RESTORATION)
-  set(AOM_DSP_COMMON_INTRIN_SSE2
-      ${AOM_DSP_COMMON_INTRIN_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_sse2.c")
-
-  if (CONFIG_HIGHBITDEPTH)
-    set(AOM_DSP_COMMON_INTRIN_SSSE3
-      ${AOM_DSP_COMMON_INTRIN_SSSE3}
-        "${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c")
-  endif ()
-endif ()
-
-if (CONFIG_MOTION_VAR)
-  set(AOM_DSP_ENCODER_INTRIN_SSE4_1
-      ${AOM_DSP_ENCODER_INTRIN_SSE4_1}
-      "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
-      "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
-endif ()
+list(APPEND AOM_DSP_COMMON_SOURCES
+            "${AOM_ROOT}/aom_dsp/aom_convolve.c"
+            "${AOM_ROOT}/aom_dsp/aom_convolve.h"
+            "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
+            "${AOM_ROOT}/aom_dsp/aom_filter.h"
+            "${AOM_ROOT}/aom_dsp/aom_simd.h"
+            "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
+            "${AOM_ROOT}/aom_dsp/blend.h"
+            "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
+            "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
+            "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
+            "${AOM_ROOT}/aom_dsp/entcode.c"
+            "${AOM_ROOT}/aom_dsp/entcode.h"
+            "${AOM_ROOT}/aom_dsp/fft.c"
+            "${AOM_ROOT}/aom_dsp/fft_common.h"
+            "${AOM_ROOT}/aom_dsp/intrapred.c"
+            "${AOM_ROOT}/aom_dsp/intrapred_common.h"
+            "${AOM_ROOT}/aom_dsp/loopfilter.c"
+            "${AOM_ROOT}/aom_dsp/prob.h"
+            "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
+            "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
+            "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
+            "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
+            "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
+            "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
+            "${AOM_ROOT}/aom_dsp/subtract.c"
+            "${AOM_ROOT}/aom_dsp/txfm_common.h"
+            "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h")
+
+list(APPEND AOM_DSP_COMMON_ASM_SSE2
+            "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
+            "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
+            "${AOM_ROOT}/aom_dsp/x86/convolve.h"
+            "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
+            "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
+            "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c"
+            "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
+            "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h"
+            "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h"
+            "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h")
+
+list(APPEND AOM_DSP_COMMON_ASM_SSSE3
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c"
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
+            "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
+            "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
+            "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
+            "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
+            "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
+            "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+            "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
+            "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
+            "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
+            "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
+            "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2
+            "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h"
+            "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h"
+            "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_MSA
+            "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c"
+            "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c"
+            "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c"
+            "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h"
+            "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
+            "${AOM_ROOT}/aom_dsp/mips/macros_msa.h")
+
+if(CONFIG_AV1_DECODER)
+  list(APPEND AOM_DSP_DECODER_SOURCES
+              "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
+              "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
+              "${AOM_ROOT}/aom_dsp/bitreader.h"
+              "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
+              "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
+              "${AOM_ROOT}/aom_dsp/daalaboolreader.c"
+              "${AOM_ROOT}/aom_dsp/daalaboolreader.h"
+              "${AOM_ROOT}/aom_dsp/entdec.c"
+              "${AOM_ROOT}/aom_dsp/entdec.h"
+              "${AOM_ROOT}/aom_dsp/grain_synthesis.c"
+              "${AOM_ROOT}/aom_dsp/grain_synthesis.h")
+endif()
+
+if(CONFIG_AV1_ENCODER)
+  list(APPEND AOM_DSP_ENCODER_SOURCES
+              "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
+              "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
+              "${AOM_ROOT}/aom_dsp/bitwriter.h"
+              "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
+              "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
+              "${AOM_ROOT}/aom_dsp/daalaboolwriter.c"
+              "${AOM_ROOT}/aom_dsp/daalaboolwriter.h"
+              "${AOM_ROOT}/aom_dsp/entenc.c"
+              "${AOM_ROOT}/aom_dsp/entenc.h"
+              "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
+              "${AOM_ROOT}/aom_dsp/grain_table.c"
+              "${AOM_ROOT}/aom_dsp/grain_table.h"
+              "${AOM_ROOT}/aom_dsp/noise_model.c"
+              "${AOM_ROOT}/aom_dsp/noise_model.h"
+              "${AOM_ROOT}/aom_dsp/noise_util.c"
+              "${AOM_ROOT}/aom_dsp/noise_util.h"
+              "${AOM_ROOT}/aom_dsp/psnr.c"
+              "${AOM_ROOT}/aom_dsp/psnr.h"
+              "${AOM_ROOT}/aom_dsp/quantize.c"
+              "${AOM_ROOT}/aom_dsp/quantize.h"
+              "${AOM_ROOT}/aom_dsp/sad.c"
+              "${AOM_ROOT}/aom_dsp/sad_av1.c"
+              "${AOM_ROOT}/aom_dsp/sum_squares.c"
+              "${AOM_ROOT}/aom_dsp/variance.c"
+              "${AOM_ROOT}/aom_dsp/variance.h")
+
+  list(APPEND AOM_DSP_ENCODER_ASM_SSE2
+              "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2
+              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
+              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
+              "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
+
+  list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
+              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
+              "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+              "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c")
+
+  list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
+              "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
+
+  list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64
+              "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
+              "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
+              "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c"
+              "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
+              "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+              "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
+              "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
+              "${AOM_ROOT}/aom_dsp/mips/variance_msa.c"
+              "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c")
+
+  if(CONFIG_INTERNAL_STATS)
+    list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c"
+                "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c"
+                "${AOM_ROOT}/aom_dsp/ssim.h")
+  endif()
+endif()
 
 # Creates aom_dsp build targets. Must not be called until after libaom target
 # has been created.
-function (setup_aom_dsp_targets)
+function(setup_aom_dsp_targets)
   add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES})
   list(APPEND AOM_LIB_TARGETS aom_dsp_common)
   create_dummy_source_file("aom_av1" "c" "dummy_source_file")
@@ -481,113 +248,97 @@ function (setup_aom_dsp_targets)
   # dummy source file to the aom_dsp target.
   add_dummy_source_file_to_target("aom_dsp" "c")
 
-  if (CONFIG_AV1_DECODER)
+  if(CONFIG_AV1_DECODER)
     add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES})
-    set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_decoder)
+    list(APPEND AOM_LIB_TARGETS aom_dsp_decoder)
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>)
-  endif ()
+  endif()
 
-  if (CONFIG_AV1_ENCODER)
+  if(CONFIG_AV1_ENCODER)
     add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES})
-    set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_encoder)
+    list(APPEND AOM_LIB_TARGETS aom_dsp_encoder)
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
-  endif ()
+  endif()
 
-  if (HAVE_SSE2)
+  if(HAVE_SSE2)
     add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom")
     add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common"
-                                   "AOM_DSP_COMMON_INTRIN_SSE2" "aom")
+                                  "AOM_DSP_COMMON_INTRIN_SSE2" "aom")
 
-    if (CONFIG_AV1_ENCODER)
-      add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2"
-                      "aom")
+    if(CONFIG_AV1_ENCODER)
+      add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom")
       add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder"
                                     "AOM_DSP_ENCODER_INTRIN_SSE2" "aom")
     endif()
-  endif ()
-
-  if (HAVE_SSE3 AND CONFIG_AV1_ENCODER)
-    add_asm_library("aom_dsp_encoder_sse3" "AOM_DSP_ENCODER_INTRIN_SSE3" "aom")
-  endif ()
+  endif()
 
-  if (HAVE_SSSE3)
+  if(HAVE_SSSE3)
     add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom")
     add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common"
                                   "AOM_DSP_COMMON_INTRIN_SSSE3" "aom")
 
-    if (CONFIG_AV1_ENCODER)
-      if ("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+    if(CONFIG_AV1_ENCODER)
+      if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
         list(APPEND AOM_DSP_ENCODER_ASM_SSSE3
-             ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64})
-      endif ()
+                    ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64})
+      endif()
       add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom")
-      if (AOM_DSP_ENCODER_INTRIN_SSSE3)
-        add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder"
-                                      "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom")
-      endif ()
-    endif ()
-  endif ()
-
-  if (HAVE_SSE4_1)
+      add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder"
+                                    "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom")
+    endif()
+  endif()
+
+  if(HAVE_SSE4_1)
     add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common"
                                   "AOM_DSP_COMMON_INTRIN_SSE4_1" "aom")
-    if (CONFIG_AV1_ENCODER)
-      if (AOM_DSP_ENCODER_INTRIN_SSE4_1)
-        add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder"
-                                      "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom")
-      endif ()
-      add_asm_library("aom_dsp_encoder_sse4_1" "AOM_DSP_ENCODER_ASM_SSE4_1"
-                      "aom")
-    endif ()
-  endif ()
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder"
+                                    "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom")
+    endif()
+  endif()
 
-  if (HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
-    if (CONFIG_AV1_ENCODER)
+  if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+    if(CONFIG_AV1_ENCODER)
       add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64"
                       "aom")
-    endif ()
-  endif ()
+    endif()
+  endif()
 
-  if (HAVE_AVX2)
+  if(HAVE_AVX2)
     add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common"
                                   "AOM_DSP_COMMON_INTRIN_AVX2" "aom")
-    if (CONFIG_AV1_ENCODER)
+    if(CONFIG_AV1_ENCODER)
       add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder"
                                     "AOM_DSP_ENCODER_INTRIN_AVX2" "aom")
-    endif ()
-  endif ()
-
-  if (HAVE_NEON_ASM)
-    if (AOM_ADS2GAS_REQUIRED)
-      add_gas_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom")
-    else ()
-      add_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom")
-    endif ()
-  endif ()
-
-  if (HAVE_NEON)
+    endif()
+  endif()
+
+  if(HAVE_NEON)
     add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
                                   "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON"
                                   "aom")
-  endif ()
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+                                    "aom_dsp_encoder"
+                                    "AOM_DSP_ENCODER_INTRIN_NEON" "aom")
+    endif()
+  endif()
 
-  if (HAVE_DSPR2)
+  if(HAVE_DSPR2)
     add_intrinsics_object_library("" "dspr2" "aom_dsp_common"
                                   "AOM_DSP_COMMON_INTRIN_DSPR2" "aom")
-  endif ()
+  endif()
 
-  if (HAVE_MSA)
+  if(HAVE_MSA)
     add_intrinsics_object_library("" "msa" "aom_dsp_common"
                                   "AOM_DSP_COMMON_INTRIN_MSA" "aom")
-    if (CONFIG_AV1_ENCODER)
+    if(CONFIG_AV1_ENCODER)
       add_intrinsics_object_library("" "msa" "aom_dsp_encoder"
                                     "AOM_DSP_ENCODER_INTRIN_MSA" "aom")
-    endif ()
-  endif ()
+    endif()
+  endif()
 
   # Pass the new lib targets up to the parent scope instance of
   # $AOM_LIB_TARGETS.
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
-endfunction ()
-
-endif ()  # AOM_AOM_DSP_AOM_DSP_CMAKE_
+endfunction()
diff --git a/third_party/aom/aom_dsp/aom_dsp.mk b/third_party/aom/aom_dsp/aom_dsp.mk
deleted file mode 100644
index 950db0216..000000000
--- a/third_party/aom/aom_dsp/aom_dsp.mk
+++ /dev/null
@@ -1,439 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-DSP_SRCS-yes += aom_dsp.mk
-DSP_SRCS-yes += aom_dsp_common.h
-
-DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
-
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/synonyms.h
-
-# bit reader
-DSP_SRCS-yes += prob.h
-DSP_SRCS-yes += prob.c
-DSP_SRCS-$(CONFIG_ANS) += ans.h
-
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-ifeq ($(CONFIG_ANS),yes)
-DSP_SRCS-yes += answriter.h
-DSP_SRCS-yes += buf_ans.h
-DSP_SRCS-yes += buf_ans.c
-else
-DSP_SRCS-yes += entenc.c
-DSP_SRCS-yes += entenc.h
-DSP_SRCS-yes += daalaboolwriter.c
-DSP_SRCS-yes += daalaboolwriter.h
-endif
-DSP_SRCS-yes += bitwriter.h
-DSP_SRCS-yes += bitwriter_buffer.c
-DSP_SRCS-yes += bitwriter_buffer.h
-DSP_SRCS-yes += binary_codes_writer.c
-DSP_SRCS-yes += binary_codes_writer.h
-DSP_SRCS-yes += psnr.c
-DSP_SRCS-yes += psnr.h
-DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
-DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
-DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
-DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
-endif
-
-ifeq ($(CONFIG_AV1_DECODER),yes)
-ifeq ($(CONFIG_ANS),yes)
-DSP_SRCS-yes += ansreader.h
-else
-DSP_SRCS-yes += entdec.c
-DSP_SRCS-yes += entdec.h
-DSP_SRCS-yes += daalaboolreader.c
-DSP_SRCS-yes += daalaboolreader.h
-endif
-DSP_SRCS-yes += bitreader.h
-DSP_SRCS-yes += bitreader_buffer.c
-DSP_SRCS-yes += bitreader_buffer.h
-DSP_SRCS-yes += binary_codes_reader.c
-DSP_SRCS-yes += binary_codes_reader.h
-endif
-
-# intra predictions
-DSP_SRCS-yes += intrapred.c
-DSP_SRCS-yes += intrapred_common.h
-
-ifneq ($(CONFIG_ANS),yes)
-DSP_SRCS-yes += entcode.c
-DSP_SRCS-yes += entcode.h
-endif
-
-DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
-DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
-
-DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.c
-DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.c
-DSP_SRCS-$(HAVE_AVX2) += x86/intrapred_avx2.c
-
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
-DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
-DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.c
-DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_ssse3.c
-DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_avx2.c
-endif  # CONFIG_HIGHBITDEPTH
-
-DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
-DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
-DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred4_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred8_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c
-
-DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
-
-# inter predictions
-DSP_SRCS-yes            += blend.h
-DSP_SRCS-yes            += blend_a64_mask.c
-DSP_SRCS-yes            += blend_a64_hmask.c
-DSP_SRCS-yes            += blend_a64_vmask.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
-
-# interpolation filters
-DSP_SRCS-yes += aom_convolve.c
-DSP_SRCS-yes += aom_convolve.h
-DSP_SRCS-yes += aom_filter.h
-
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/aom_asm_stubs.c
-DSP_SRCS-$(HAVE_SSE2)  += x86/aom_subpixel_8t_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)  += x86/aom_subpixel_bilinear_sse2.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_bilinear_ssse3.asm
-DSP_SRCS-$(HAVE_AVX2)  += x86/aom_subpixel_8t_intrin_avx2.c
-DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_intrin_ssse3.c
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_8t_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_bilinear_sse2.asm
-DSP_SRCS-$(HAVE_AVX2)  += x86/highbd_convolve_avx2.c
-endif
-DSP_SRCS-$(HAVE_SSE2)  += x86/aom_convolve_copy_sse2.asm
-
-ifneq ($(CONFIG_EXT_PARTITION),yes)
-ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM)
-DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM)
-DSP_SRCS-yes += arm/aom_convolve8_neon_asm$(ASM)
-DSP_SRCS-yes += arm/aom_convolve_avg_neon_asm$(ASM)
-DSP_SRCS-yes += arm/aom_convolve_neon.c
-else
-ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes += arm/aom_convolve_copy_neon.c
-DSP_SRCS-yes += arm/aom_convolve8_avg_neon.c
-DSP_SRCS-yes += arm/aom_convolve8_neon.c
-DSP_SRCS-yes += arm/aom_convolve_avg_neon.c
-DSP_SRCS-yes += arm/aom_convolve_neon.c
-endif  # HAVE_NEON
-endif  # HAVE_NEON_ASM
-endif  # CONFIG_EXT_PARTITION
-
-# common (msa)
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_vert_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_horiz_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_vert_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_avg_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_copy_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_msa.h
-
-# common (dspr2)
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve_common_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_avg_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_avg_horiz_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_horiz_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_vert_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_horiz_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_horiz_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_vert_dspr2.c
-
-# loop filters
-DSP_SRCS-yes += loopfilter.c
-
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
-DSP_SRCS-$(HAVE_SSE2)                += x86/lpf_common_sse2.h
-
-ifneq ($(CONFIG_PARALLEL_DEBLOCKING),yes)
-DSP_SRCS-$(HAVE_AVX2)   += x86/loopfilter_avx2.c
-
-DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
-ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes  += arm/loopfilter_mb_neon$(ASM)
-DSP_SRCS-yes  += arm/loopfilter_16_neon$(ASM)
-DSP_SRCS-yes  += arm/loopfilter_8_neon$(ASM)
-DSP_SRCS-yes  += arm/loopfilter_4_neon$(ASM)
-else
-ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes   += arm/loopfilter_16_neon.c
-DSP_SRCS-yes   += arm/loopfilter_8_neon.c
-DSP_SRCS-yes   += arm/loopfilter_4_neon.c
-endif  # HAVE_NEON
-endif  # HAVE_NEON_ASM
-
-DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_msa.h
-DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_16_msa.c
-DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_8_msa.c
-DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_4_msa.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_filters_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_filters_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_macros_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_masks_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
-endif  # !CONFIG_PARALLEL_DEBLOCKING
-
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/highbd_loopfilter_avx2.c
-endif  # CONFIG_HIGHBITDEPTH
-
-DSP_SRCS-yes            += txfm_common.h
-DSP_SRCS-yes            += x86/txfm_common_intrin.h
-DSP_SRCS-$(HAVE_AVX2)   += x86/common_avx2.h
-DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
-DSP_SRCS-$(HAVE_SSSE3)  += x86/obmc_intrinsic_ssse3.h
-DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
-
-# forward transform
-ifneq ($(findstring yes,$(CONFIG_AV1)$(CONFIG_PVQ)),)
-DSP_SRCS-$(HAVE_AVX2)   += x86/txfm_common_avx2.h
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-DSP_SRCS-yes            += fwd_txfm.c
-DSP_SRCS-yes            += fwd_txfm.h
-DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.h
-DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32_8cols_sse2.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
-DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
-endif
-DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.h
-DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
-DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
-DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
-DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
-DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
-endif  # CONFIG_AV1_ENCODER
-endif  # CONFIG_AV1
-
-# inverse transform
-ifeq ($(CONFIG_AV1), yes)
-DSP_SRCS-yes            += inv_txfm.h
-DSP_SRCS-yes            += inv_txfm.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
-DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/inv_wht_sse2.asm
-DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/inv_txfm_common_avx2.h
-DSP_SRCS-$(HAVE_AVX2)   += x86/inv_txfm_avx2.c
-
-ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes  += arm/save_reg_neon$(ASM)
-DSP_SRCS-yes  += arm/idct4x4_1_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct4x4_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct8x8_1_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct8x8_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct16x16_1_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct16x16_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct32x32_1_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct32x32_add_neon$(ASM)
-else
-ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes  += arm/idct4x4_1_add_neon.c
-DSP_SRCS-yes  += arm/idct4x4_add_neon.c
-DSP_SRCS-yes  += arm/idct8x8_1_add_neon.c
-DSP_SRCS-yes  += arm/idct8x8_add_neon.c
-DSP_SRCS-yes  += arm/idct16x16_1_add_neon.c
-DSP_SRCS-yes  += arm/idct16x16_add_neon.c
-DSP_SRCS-yes  += arm/idct32x32_1_add_neon.c
-DSP_SRCS-yes  += arm/idct32x32_add_neon.c
-endif  # HAVE_NEON
-endif  # HAVE_NEON_ASM
-DSP_SRCS-$(HAVE_NEON)  += arm/idct16x16_neon.c
-
-DSP_SRCS-$(HAVE_MSA)   += mips/inv_txfm_msa.h
-DSP_SRCS-$(HAVE_MSA)   += mips/idct4x4_msa.c
-DSP_SRCS-$(HAVE_MSA)   += mips/idct8x8_msa.c
-DSP_SRCS-$(HAVE_MSA)   += mips/idct16x16_msa.c
-DSP_SRCS-$(HAVE_MSA)   += mips/idct32x32_msa.c
-
-ifneq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
-endif  # CONFIG_HIGHBITDEPTH
-
-ifeq ($(CONFIG_LOOP_RESTORATION),yes)
-DSP_SRCS-$(HAVE_SSE2)   += x86/aom_convolve_hip_sse2.c
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/aom_highbd_convolve_hip_ssse3.c
-endif
-endif  # CONFIG_LOOP_RESTORATION
-endif  # CONFIG_AV1
-
-# quantization
-ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),)
-DSP_SRCS-yes            += quantize.c
-DSP_SRCS-yes            += quantize.h
-
-DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
-
-DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/highbd_quantize_intrin_avx2.c
-
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3_x86_64.asm
-DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
-endif
-
-# avg
-DSP_SRCS-yes           += avg.c
-DSP_SRCS-$(HAVE_SSE2)  += x86/avg_intrin_sse2.c
-DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
-DSP_SRCS-$(HAVE_NEON)  += arm/hadamard_neon.c
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
-endif
-
-# high bit depth subtract
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_subtract_sse2.c
-endif
-
-endif  # CONFIG_AV1_ENCODER
-
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-DSP_SRCS-yes            += sum_squares.c
-
-DSP_SRCS-$(HAVE_SSE2)   += x86/sum_squares_sse2.c
-endif # CONFIG_AV1_ENCODER
-
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-DSP_SRCS-yes            += sad.c
-DSP_SRCS-yes            += subtract.c
-
-DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c
-DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c
-DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c
-
-DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
-DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
-
-DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
-DSP_SRCS-$(HAVE_SSSE3)  += x86/sad_ssse3.asm
-DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
-DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
-
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_AVX2)   += x86/sad_highbd_avx2.c
-endif
-
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_sad_intrin_ssse3.c
-DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_variance_intrin_ssse3.c
-ifeq ($(CONFIG_MOTION_VAR),yes)
-DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
-endif  #CONFIG_MOTION_VAR
-ifeq ($(CONFIG_EXT_PARTITION),yes)
-DSP_SRCS-$(HAVE_AVX2) += x86/sad_impl_avx2.c
-endif
-endif  #CONFIG_AV1_ENCODER
-
-DSP_SRCS-$(HAVE_SSE)    += x86/sad4d_sse2.asm
-DSP_SRCS-$(HAVE_SSE)    += x86/sad_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)   += x86/sad_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
-
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
-DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
-endif  # CONFIG_HIGHBITDEPTH
-
-endif  # CONFIG_AV1_ENCODER
-
-ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),)
-DSP_SRCS-yes            += variance.c
-DSP_SRCS-yes            += variance.h
-
-DSP_SRCS-$(HAVE_NEON)   += arm/subpel_variance_neon.c
-DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
-
-DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
-DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
-
-DSP_SRCS-$(HAVE_SSE)    += x86/variance_sse2.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
-DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_sse2.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_impl_sse2.asm
-DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/variance_impl_avx2.c
-
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSE2)   += x86/ssim_opt_x86_64.asm
-endif  # ARCH_X86_64
-
-DSP_SRCS-$(HAVE_SSE)    += x86/subpel_variance_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)   += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
-
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
-endif  # CONFIG_HIGHBITDEPTH
-endif  # CONFIG_AV1_ENCODER
-
-DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
-
-DSP_SRCS-yes += aom_dsp_rtcd.c
-DSP_SRCS-yes += aom_dsp_rtcd_defs.pl
-
-DSP_SRCS-yes += aom_simd.h
-DSP_SRCS-yes += aom_simd_inline.h
-DSP_SRCS-yes += simd/v64_intrinsics.h
-DSP_SRCS-yes += simd/v64_intrinsics_c.h
-DSP_SRCS-yes += simd/v128_intrinsics.h
-DSP_SRCS-yes += simd/v128_intrinsics_c.h
-DSP_SRCS-yes += simd/v256_intrinsics.h
-DSP_SRCS-yes += simd/v256_intrinsics_c.h
-DSP_SRCS-yes += simd/v256_intrinsics_v128.h
-DSP_SRCS-$(HAVE_SSE2) += simd/v64_intrinsics_x86.h
-DSP_SRCS-$(HAVE_SSE2) += simd/v128_intrinsics_x86.h
-DSP_SRCS-$(HAVE_SSE2) += simd/v256_intrinsics_x86.h
-DSP_SRCS-$(HAVE_NEON) += simd/v64_intrinsics_arm.h
-DSP_SRCS-$(HAVE_NEON) += simd/v128_intrinsics_arm.h
-DSP_SRCS-$(HAVE_NEON) += simd/v256_intrinsics_arm.h
-
-$(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl))
diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h
index 3d3bcba37..c5dc9a834 100644
--- a/third_party/aom/aom_dsp/aom_dsp_common.h
+++ b/third_party/aom/aom_dsp/aom_dsp_common.h
@@ -12,7 +12,8 @@
 #ifndef AOM_DSP_AOM_DSP_COMMON_H_
 #define AOM_DSP_AOM_DSP_COMMON_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 
@@ -21,11 +22,7 @@ extern "C" {
 #endif
 
 #ifndef MAX_SB_SIZE
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
 #define MAX_SB_SIZE 128
-#else
-#define MAX_SB_SIZE 64
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 #endif  // ndef MAX_SB_SIZE
 
 #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
@@ -52,22 +49,14 @@ extern "C" {
 #define UNLIKELY(v) (v)
 #endif
 
-typedef uint16_t qm_val_t;
+typedef uint8_t qm_val_t;
 #define AOM_QM_BITS 5
 
-#if CONFIG_HIGHBITDEPTH
 // Note:
 // tran_low_t  is the datatype used for final transform coefficients.
 // tran_high_t is the datatype used for intermediate transform stages.
 typedef int64_t tran_high_t;
 typedef int32_t tran_low_t;
-#else
-// Note:
-// tran_low_t  is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int32_t tran_high_t;
-typedef int16_t tran_low_t;
-#endif  // CONFIG_HIGHBITDEPTH
 
 static INLINE uint8_t clip_pixel(int val) {
   return (val > 255) ? 255 : (val < 0) ? 0 : val;
@@ -77,10 +66,6 @@ static INLINE int clamp(int value, int low, int high) {
   return value < low ? low : (value > high ? high : value);
 }
 
-static INLINE uint32_t clamp32u(uint32_t value, uint32_t low, uint32_t high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
 static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) {
   return value < low ? low : (value > high ? high : value);
 }
@@ -98,6 +83,14 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
   }
 }
 
+// The result of this branchless code is equivalent to (value < 0 ? 0 : value)
+// or max(0, value) and might be faster in some cases.
+// Care should be taken since the behavior of right shifting signed type
+// negative value is undefined by C standards and implementation defined,
+static INLINE unsigned int negative_to_zero(int value) {
+  return value & ~(value >> (sizeof(value) * 8 - 1));
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
index 11a57d382..5d7d4515b 100644
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd.c
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
@@ -8,9 +8,11 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #define RTCD_C
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/aom_once.h"
 
 void aom_dsp_rtcd() { once(setup_rtcd_internal); }
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
index f4f6c64d4..a8ac5eb5c 100755
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
 sub aom_dsp_forward_decls() {
 print <<EOF
 /*
@@ -7,6 +17,7 @@ print <<EOF
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "av1/common/enums.h"
+#include "av1/common/blockd.h"
 
 EOF
 }
@@ -28,11 +39,7 @@ if ($opts{arch} eq "x86_64") {
   $avx2_x86_64 = 'avx2';
 }
 
-if (aom_config("CONFIG_EXT_PARTITION") eq "yes") {
-  @block_widths = (4, 8, 16, 32, 64, 128)
-} else {
-  @block_widths = (4, 8, 16, 32, 64)
-}
+@block_widths = (4, 8, 16, 32, 64, 128);
 
 @block_sizes = ();
 foreach $w (@block_widths) {
@@ -40,36 +47,24 @@ foreach $w (@block_widths) {
     push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ;
   }
 }
-if (aom_config("CONFIG_EXT_PARTITION_TYPES") eq "yes") {
-  push @block_sizes, [4, 16];
-  push @block_sizes, [16, 4];
-  push @block_sizes, [8, 32];
-  push @block_sizes, [32, 8];
-  push @block_sizes, [16, 64];
-  push @block_sizes, [64, 16];
-  if (aom_config("CONFIG_EXT_PARTITION") eq "yes") {
-      push @block_sizes, [32, 128];
-      push @block_sizes, [128, 32];
-  }
-}
-
-@tx_dims = (2, 4, 8, 16, 32);
-if (aom_config("CONFIG_TX64X64") eq "yes") {
-  push @tx_dims, '64';
-}
-
+push @block_sizes, [4, 16];
+push @block_sizes, [16, 4];
+push @block_sizes, [8, 32];
+push @block_sizes, [32, 8];
+push @block_sizes, [16, 64];
+push @block_sizes, [64, 16];
+
+@tx_dims = (2, 4, 8, 16, 32, 64);
 @tx_sizes = ();
 foreach $w (@tx_dims) {
   push @tx_sizes, [$w, $w];
   foreach $h (@tx_dims) {
     push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w));
+    push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w));
   }
 }
 
-@pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153 paeth smooth/;
-if (aom_config("CONFIG_SMOOTH_HV") eq "yes") {
-  push @pred_names, qw/smooth_v smooth_h/;
-}
+@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/;
 
 #
 # Intra prediction
@@ -80,73 +75,125 @@ foreach (@tx_sizes) {
   foreach $pred_name (@pred_names) {
     add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
               "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
-                "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    }
+    add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
+              "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   }
 }
 
 specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_top_predictor_4x8 sse2/;
+specialize qw/aom_dc_top_predictor_4x16 sse2/;
 specialize qw/aom_dc_top_predictor_8x4 sse2/;
 specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_top_predictor_8x16 sse2/;
+specialize qw/aom_dc_top_predictor_8x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x4 sse2/;
 specialize qw/aom_dc_top_predictor_16x8 sse2/;
 specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
 specialize qw/aom_dc_top_predictor_16x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x64 sse2/;
+specialize qw/aom_dc_top_predictor_32x8 sse2/;
 specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_left_predictor_4x8 sse2/;
+specialize qw/aom_dc_left_predictor_4x16 sse2/;
 specialize qw/aom_dc_left_predictor_8x4 sse2/;
 specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_left_predictor_8x16 sse2/;
+specialize qw/aom_dc_left_predictor_8x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x4 sse2/;
 specialize qw/aom_dc_left_predictor_16x8 sse2/;
 specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
 specialize qw/aom_dc_left_predictor_16x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x64 sse2/;
+specialize qw/aom_dc_left_predictor_32x8 sse2/;
 specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_128_predictor_4x8 sse2/;
+specialize qw/aom_dc_128_predictor_4x16 sse2/;
 specialize qw/aom_dc_128_predictor_8x4 sse2/;
 specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_128_predictor_8x16 sse2/;
+specialize qw/aom_dc_128_predictor_8x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x4 sse2/;
 specialize qw/aom_dc_128_predictor_16x8 sse2/;
 specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
 specialize qw/aom_dc_128_predictor_16x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x64 sse2/;
+specialize qw/aom_dc_128_predictor_32x8 sse2/;
 specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
 specialize qw/aom_v_predictor_4x4 neon msa sse2/;
 specialize qw/aom_v_predictor_4x8 sse2/;
+specialize qw/aom_v_predictor_4x16 sse2/;
 specialize qw/aom_v_predictor_8x4 sse2/;
 specialize qw/aom_v_predictor_8x8 neon msa sse2/;
 specialize qw/aom_v_predictor_8x16 sse2/;
+specialize qw/aom_v_predictor_8x32 sse2/;
+specialize qw/aom_v_predictor_16x4 sse2/;
 specialize qw/aom_v_predictor_16x8 sse2/;
 specialize qw/aom_v_predictor_16x16 neon msa sse2/;
 specialize qw/aom_v_predictor_16x32 sse2/;
+specialize qw/aom_v_predictor_16x64 sse2/;
+specialize qw/aom_v_predictor_32x8 sse2/;
 specialize qw/aom_v_predictor_32x16 sse2 avx2/;
 specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
+specialize qw/aom_v_predictor_32x64 sse2 avx2/;
+specialize qw/aom_v_predictor_64x64 sse2 avx2/;
+specialize qw/aom_v_predictor_64x32 sse2 avx2/;
+specialize qw/aom_v_predictor_64x16 sse2 avx2/;
 specialize qw/aom_h_predictor_4x8 sse2/;
+specialize qw/aom_h_predictor_4x16 sse2/;
 specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_8x4 sse2/;
 specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_8x16 sse2/;
+specialize qw/aom_h_predictor_8x32 sse2/;
+specialize qw/aom_h_predictor_16x4 sse2/;
 specialize qw/aom_h_predictor_16x8 sse2/;
 specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_16x32 sse2/;
+specialize qw/aom_h_predictor_16x64 sse2/;
+specialize qw/aom_h_predictor_32x8 sse2/;
 specialize qw/aom_h_predictor_32x16 sse2/;
 specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/;
+specialize qw/aom_h_predictor_32x64 sse2/;
+specialize qw/aom_h_predictor_64x64 sse2/;
+specialize qw/aom_h_predictor_64x32 sse2/;
+specialize qw/aom_h_predictor_64x16 sse2/;
 specialize qw/aom_paeth_predictor_4x4 ssse3/;
 specialize qw/aom_paeth_predictor_4x8 ssse3/;
+specialize qw/aom_paeth_predictor_4x16 ssse3/;
 specialize qw/aom_paeth_predictor_8x4 ssse3/;
 specialize qw/aom_paeth_predictor_8x8 ssse3/;
 specialize qw/aom_paeth_predictor_8x16 ssse3/;
+specialize qw/aom_paeth_predictor_8x32 ssse3/;
+specialize qw/aom_paeth_predictor_16x4 ssse3/;
 specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_32x8 ssse3/;
 specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_16x8 ssse3/;
 specialize qw/aom_paeth_predictor_16x16 ssse3/;
 specialize qw/aom_paeth_predictor_16x32 ssse3/;
@@ -154,34 +201,86 @@ specialize qw/aom_paeth_predictor_32x16 ssse3/;
 specialize qw/aom_paeth_predictor_32x32 ssse3/;
 specialize qw/aom_smooth_predictor_4x4 ssse3/;
 specialize qw/aom_smooth_predictor_4x8 ssse3/;
+specialize qw/aom_smooth_predictor_4x16 ssse3/;
 specialize qw/aom_smooth_predictor_8x4 ssse3/;
 specialize qw/aom_smooth_predictor_8x8 ssse3/;
 specialize qw/aom_smooth_predictor_8x16 ssse3/;
+specialize qw/aom_smooth_predictor_8x32 ssse3/;
+specialize qw/aom_smooth_predictor_16x4 ssse3/;
 specialize qw/aom_smooth_predictor_16x8 ssse3/;
 specialize qw/aom_smooth_predictor_16x16 ssse3/;
 specialize qw/aom_smooth_predictor_16x32 ssse3/;
+specialize qw/aom_smooth_predictor_16x64 ssse3/;
+specialize qw/aom_smooth_predictor_32x8 ssse3/;
 specialize qw/aom_smooth_predictor_32x16 ssse3/;
 specialize qw/aom_smooth_predictor_32x32 ssse3/;
-
-specialize qw/aom_d63e_predictor_4x4 ssse3/;
-specialize qw/aom_d135_predictor_4x4 neon/;
-specialize qw/aom_d153_predictor_4x4 ssse3/;
+specialize qw/aom_smooth_predictor_32x64 ssse3/;
+specialize qw/aom_smooth_predictor_64x64 ssse3/;
+specialize qw/aom_smooth_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_predictor_64x16 ssse3/;
+
+specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
+specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_4x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x4 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x4 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x64 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x64 ssse3/;
+specialize qw/aom_smooth_v_predictor_64x64 ssse3/;
+specialize qw/aom_smooth_v_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
+
+specialize qw/aom_smooth_h_predictor_4x4 ssse3/;
+specialize qw/aom_smooth_h_predictor_4x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x4 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x4 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x16 ssse3/;
+
+# TODO(yunqingwang): optimize rectangular DC_PRED to replace division
+# by multiply and shift.
 specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
 specialize qw/aom_dc_predictor_4x8 sse2/;
-specialize qw/aom_d153_predictor_8x8 ssse3/;
+specialize qw/aom_dc_predictor_4x16 sse2/;
 specialize qw/aom_dc_predictor_8x4 sse2/;
 specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
 specialize qw/aom_dc_predictor_8x16 sse2/;
-specialize qw/aom_d153_predictor_16x16 ssse3/;
+specialize qw/aom_dc_predictor_8x32 sse2/;
+specialize qw/aom_dc_predictor_16x4 sse2/;
 specialize qw/aom_dc_predictor_16x8 sse2/;
 specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
 specialize qw/aom_dc_predictor_16x32 sse2/;
-specialize qw/aom_d153_predictor_32x32 ssse3/;
-
+specialize qw/aom_dc_predictor_16x64 sse2/;
+specialize qw/aom_dc_predictor_32x8 sse2/;
 specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x16 sse2 avx2/;
 
-if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
   specialize qw/aom_highbd_v_predictor_4x4 sse2/;
   specialize qw/aom_highbd_v_predictor_4x8 sse2/;
   specialize qw/aom_highbd_v_predictor_8x4 sse2/;
@@ -192,16 +291,21 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
   specialize qw/aom_highbd_v_predictor_16x32 sse2/;
   specialize qw/aom_highbd_v_predictor_32x16 sse2/;
   specialize qw/aom_highbd_v_predictor_32x32 sse2/;
-  specialize qw/aom_highbd_dc_predictor_4x4 sse2/;
+
+  # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
+  # by multiply and shift.
+  specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/;
   specialize qw/aom_highbd_dc_predictor_4x8 sse2/;
   specialize qw/aom_highbd_dc_predictor_8x4 sse2/;;
-  specialize qw/aom_highbd_dc_predictor_8x8 sse2/;;
+  specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/;;
   specialize qw/aom_highbd_dc_predictor_8x16 sse2/;;
   specialize qw/aom_highbd_dc_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_dc_predictor_16x16 sse2/;
+  specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/;
   specialize qw/aom_highbd_dc_predictor_16x32 sse2/;
   specialize qw/aom_highbd_dc_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_dc_predictor_32x32 sse2/;
+  specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/;
+  specialize qw/aom_highbd_dc_predictor_64x64 neon/;
+
   specialize qw/aom_highbd_h_predictor_4x4 sse2/;
   specialize qw/aom_highbd_h_predictor_4x8 sse2/;
   specialize qw/aom_highbd_h_predictor_8x4 sse2/;
@@ -242,253 +346,129 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
   specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
   specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
   specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/;
-  
-  specialize qw/aom_highbd_d117_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_d117_predictor_8x8 ssse3/;
-  specialize qw/aom_highbd_d117_predictor_16x16 ssse3/;
-  specialize qw/aom_highbd_d117_predictor_32x32 ssse3/;
-  specialize qw/aom_highbd_d135_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_d135_predictor_8x8 ssse3/;
-  specialize qw/aom_highbd_d135_predictor_16x16 ssse3/;
-  specialize qw/aom_highbd_d135_predictor_32x32 ssse3/;
-  specialize qw/aom_highbd_d153_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_d153_predictor_8x8 ssse3/;
-  specialize qw/aom_highbd_d153_predictor_16x16 ssse3/;
-  specialize qw/aom_highbd_d153_predictor_32x32 ssse3/;
-
-  specialize qw/aom_highbd_d45e_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_d45e_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_d45e_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_d45e_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_d45e_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_d45e_predictor_16x8 avx2/;
-  specialize qw/aom_highbd_d45e_predictor_16x16 avx2/;
-  specialize qw/aom_highbd_d45e_predictor_16x32 avx2/;
-  specialize qw/aom_highbd_d45e_predictor_32x16 avx2/;
-  specialize qw/aom_highbd_d45e_predictor_32x32 avx2/;
-}  # CONFIG_HIGHBITDEPTH
 
 #
 # Sub Pixel Filters
 #
 add_proto qw/void aom_convolve_copy/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve_avg/,              "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8/,                 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_horiz/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_vert/,            "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_horiz/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_vert/,        "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_2d/,                 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_horiz/,              "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_vert/,               "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_2d/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_horiz/,          "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_vert/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-
-add_proto qw/void aom_convolve8_horiz_scale/,     "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_vert_scale/,      "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_horiz_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_vert_scale/,  "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_scale/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_scale/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
 
 specialize qw/aom_convolve_copy       sse2      /;
-specialize qw/aom_convolve_avg        sse2      /;
-specialize qw/aom_convolve8           sse2 ssse3/, "$avx2_ssse3";
 specialize qw/aom_convolve8_horiz     sse2 ssse3/, "$avx2_ssse3";
 specialize qw/aom_convolve8_vert      sse2 ssse3/, "$avx2_ssse3";
-specialize qw/aom_convolve8_avg       sse2 ssse3/;
-specialize qw/aom_convolve8_avg_horiz sse2 ssse3/;
-specialize qw/aom_convolve8_avg_vert  sse2 ssse3/;
-specialize qw/aom_scaled_2d                ssse3/;
-
-if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
-  add_proto qw/void aom_convolve8_add_src/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-  add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-  add_proto qw/void aom_convolve8_add_src_vert/,  "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-  add_proto qw/void aom_convolve8_add_src_hip/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-  add_proto qw/void aom_convolve8_add_src_horiz_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-  add_proto qw/void aom_convolve8_add_src_vert_hip/,  "const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-
-  specialize qw/aom_convolve8_add_src ssse3/;
-  specialize qw/aom_convolve8_add_src_horiz ssse3/;
-  specialize qw/aom_convolve8_add_src_vert ssse3/;
-  specialize qw/aom_convolve8_add_src_hip sse2/;
-}  # CONFIG_LOOP_RESTORATION
-
-# TODO(any): These need to be extended to up to 128x128 block sizes
-if (!(aom_config("CONFIG_AV1") eq "yes" && aom_config("CONFIG_EXT_PARTITION") eq "yes")) {
-  specialize qw/aom_convolve_copy       neon dspr2 msa/;
-  specialize qw/aom_convolve_avg        neon dspr2 msa/;
-  specialize qw/aom_convolve8           neon dspr2 msa/;
-  specialize qw/aom_convolve8_horiz     neon dspr2 msa/;
-  specialize qw/aom_convolve8_vert      neon dspr2 msa/;
-  specialize qw/aom_convolve8_avg       neon dspr2 msa/;
-  specialize qw/aom_convolve8_avg_horiz neon dspr2 msa/;
-  specialize qw/aom_convolve8_avg_vert  neon dspr2 msa/;
-}
 
-if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve_copy sse2 avx2/;
+add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/aom_highbd_convolve_copy sse2 avx2/;
 
-  add_proto qw/void aom_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve_avg sse2 avx2/;
+add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64";
 
-  add_proto qw/void aom_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8 avx2/, "$sse2_x86_64";
-
-  add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64";
-
-  add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
-
-  add_proto qw/void aom_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8_avg avx2/, "$sse2_x86_64";
-
-  add_proto qw/void aom_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8_avg_horiz avx2/, "$sse2_x86_64";
-
-  add_proto qw/void aom_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8_avg_vert avx2/, "$sse2_x86_64";
-
-  if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
-    add_proto qw/void aom_highbd_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-    add_proto qw/void aom_highbd_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-    add_proto qw/void aom_highbd_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-    add_proto qw/void aom_highbd_convolve8_add_src_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-    add_proto qw/void aom_highbd_convolve8_add_src_horiz_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-    add_proto qw/void aom_highbd_convolve8_add_src_vert_hip/, "const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-
-    specialize qw/aom_highbd_convolve8_add_src/, "$sse2_x86_64";
-    specialize qw/aom_highbd_convolve8_add_src_hip ssse3/;
-    # The _horiz/_vert functions are currently unused, so we don't bother
-    # specialising them.
-  }  # CONFIG_LOOP_RESTORATION
-}  # CONFIG_HIGHBITDEPTH
+add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
 
 #
 # Loopfilter
 #
-add_proto qw/void aom_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_vertical_16 sse2/;
-} else {
-  specialize qw/aom_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
-  $aom_lpf_vertical_16_neon_asm=aom_lpf_vertical_16_neon;
-}
+add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_14 sse2 neon/;
 
-add_proto qw/void aom_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
-  specialize qw/aom_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
-  $aom_lpf_vertical_16_dual_neon_asm=aom_lpf_vertical_16_dual_neon;
-}
+add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_14_dual sse2/;
+
+add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_6 sse2/;
 
 add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_vertical_8 sse2/;
-} else {
-  specialize qw/aom_lpf_vertical_8 sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_vertical_8 sse2 neon/;
 
 add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
-  specialize qw/aom_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
-  $aom_lpf_vertical_8_dual_neon_asm=aom_lpf_vertical_8_dual_neon;
-}
+specialize qw/aom_lpf_vertical_8_dual sse2/;
 
 add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_vertical_4 sse2/;
-} else {
-  specialize qw/aom_lpf_vertical_4 sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_vertical_4 sse2/;
 
 add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
-  specialize qw/aom_lpf_vertical_4_dual sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_vertical_4_dual sse2/;
 
-add_proto qw/void aom_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_horizontal_edge_8 sse2/;
-} else {
-  specialize qw/aom_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/;
-  $aom_lpf_horizontal_edge_8_neon_asm=aom_lpf_horizontal_edge_8_neon;
-}
+add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_14 sse2/;
 
-add_proto qw/void aom_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_horizontal_edge_16 sse2/;
-} else {
-  specialize qw/aom_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/;
-  $aom_lpf_horizontal_edge_16_neon_asm=aom_lpf_horizontal_edge_16_neon;
-}
+add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_14_dual sse2/;
+
+add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_6 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_6_dual sse2/;
 
 add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_horizontal_8 sse2/;
-} else {
-  specialize qw/aom_lpf_horizontal_8 sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_horizontal_8 sse2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
-  specialize qw/aom_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
-  $aom_lpf_horizontal_8_dual_neon_asm=aom_lpf_horizontal_8_dual_neon;
-}
+specialize qw/aom_lpf_horizontal_8_dual sse2/;
 
 add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_horizontal_4 sse2/;
-} else {
-  specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_horizontal_4 sse2/;
 
 add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
-  specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_horizontal_4_dual sse2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_14 sse2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_8 sse2/;
 
-if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_16 sse2/;
+add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_6 sse2/;
 
-  add_proto qw/void aom_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_16_dual sse2 avx2/;
+add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_6_dual sse2/;
 
-  add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_8 sse2/;
+add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_6_dual sse2/;
 
-  add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
+add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
 
-  add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_4 sse2/;
+add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_4 sse2/;
 
-  add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
+add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_edge_8 sse2/;
+add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_edge_16 sse2 avx2/;
+add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd";
+specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
+add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_6 sse2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
+add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
+add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
-}  # CONFIG_HIGHBITDEPTH
+add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
+
+# Helper functions.
+add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
+specialize "av1_round_shift_array", qw/sse4_1 neon/;
 
 #
 # Encoder functions.
@@ -497,170 +477,43 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
 #
 # Forward transform
 #
-if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq "yes")){
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct4x4 sse2/;
-
-    add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct4x4_1 sse2/;
-
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
     add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
 
-    add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct16x16 sse2/;
-
-    add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct32x32 sse2 avx2/;
-
-    add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct32x32_rd sse2 avx2/;
-
     # High bit depth
-    add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct4x4 sse2/;
-
     add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_highbd_fdct8x8 sse2/;
 
-    add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct16x16 sse2/;
-
-    add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct32x32 sse2/;
-
-    add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct32x32_rd sse2/;
-
-  } else {
-    add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct4x4 sse2 msa/;
+    # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation)
+    add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output";
 
-    add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct4x4_1 sse2/;
+    add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft4x4_float                  sse2/;
 
-    add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
-
-    add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct16x16 sse2 msa/;
-
-    add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct32x32 sse2 avx2 msa/;
-
-    add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
-  }  # CONFIG_HIGHBITDEPTH
-}  # CONFIG_AV1_ENCODER
-
-#
-# Inverse transform
-if (aom_config("CONFIG_AV1") eq "yes") {
-  add_proto qw/void aom_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-
-  add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_iwht4x4_16_add sse2/;
-
-  add_proto qw/void aom_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-
-  add_proto qw/void aom_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-
-  add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct4x4_16_add sse2/;
-
-  add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct4x4_1_add sse2/;
-
-  add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct8x8_64_add sse2 ssse3/;
-
-  add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct8x8_12_add sse2 ssse3/;
-
-  add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct8x8_1_add sse2/;
-
-  add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_256_add sse2 avx2/;
-
-  add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_38_add avx2/;
-
-  add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_10_add sse2 avx2/;
-
-  add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_1_add sse2 avx2/;
-
-  add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2/;
-
-  add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2/;
-  # Need to add 135 eob idct32x32 implementations.
-  $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
-
-  add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2/;
-
-  add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_1_add sse2 avx2/;
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-  } else {
-    add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct4x4_1_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct4x4_16_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct8x8_1_add sse2 neon dspr2 msa/;
+    add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft8x8_float avx2             sse2/;
 
-    add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct8x8_64_add sse2 ssse3 neon dspr2 msa/;
+    add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft16x16_float avx2           sse2/;
 
-    add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct8x8_12_add sse2 ssse3 neon dspr2 msa/;
+    add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft32x32_float avx2           sse2/;
 
-    add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct16x16_1_add sse2 avx2 neon dspr2 msa/;
+    add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output";
 
-    add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct16x16_256_add sse2 avx2 neon dspr2 msa/;
+    add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft4x4_float                 sse2/;
 
-    add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct16x16_38_add avx2/;
+    add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft8x8_float avx2            sse2/;
 
-    add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct16x16_10_add sse2 avx2 neon dspr2 msa/;
+    add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft16x16_float avx2          sse2/;
 
-    add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2 neon dspr2 msa/;
-
-    add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2 neon dspr2 msa/;
-    # Need to add 135 eob idct32x32 implementations.
-    $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
-    $aom_idct32x32_135_add_neon=aom_idct32x32_1024_add_neon;
-    $aom_idct32x32_135_add_dspr2=aom_idct32x32_1024_add_dspr2;
-    $aom_idct32x32_135_add_msa=aom_idct32x32_1024_add_msa;
-
-    add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2 neon dspr2 msa/;
-    # Need to add 34 eob idct32x32 neon implementation.
-    $aom_idct32x32_34_add_neon=aom_idct32x32_1024_add_neon;
-
-    add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_1_add sse2 avx2 neon dspr2 msa/;
-
-    add_proto qw/void aom_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_iwht4x4_1_add msa/;
-
-    add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_iwht4x4_16_add msa sse2/;
-  }  # CONFIG_HIGHBITDEPTH
-}  # CONFIG_AV1
+    add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft32x32_float avx2          sse2/;
+}  # CONFIG_AV1_ENCODER
 
 #
 # Quantization
@@ -685,29 +538,26 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 
 }  # CONFIG_AV1_ENCODER
-if (aom_config("CONFIG_AV1") eq "yes") {
-  #
-  # Alpha blending with mask
-  #
-  if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") {
-    add_proto qw/void aom_blend_a64_d32_mask/, "int32_t *dst, uint32_t dst_stride, const int32_t *src0, uint32_t src0_stride, const int32_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
-  }
-  add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
-  add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
-  add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
-  specialize "aom_blend_a64_mask", qw/sse4_1/;
-  specialize "aom_blend_a64_hmask", qw/sse4_1/;
-  specialize "aom_blend_a64_vmask", qw/sse4_1/;
-
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
-    add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
-    add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
-    specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
-    specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
-    specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
-  }
-}  # CONFIG_AV1
+
+#
+# Alpha blending with mask
+#
+add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params";
+specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 neon/;
+add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd";
+add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby";
+add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
+add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
+specialize "aom_blend_a64_mask", qw/sse4_1/;
+specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
+specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
+
+add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd";
+add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
+specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
+specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   #
@@ -716,6 +566,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
   specialize qw/aom_subtract_block neon msa sse2/;
 
+  add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/aom_highbd_subtract_block sse2/;
+
   if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     #
     # Sum of Squares
@@ -729,53 +582,13 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
 
   #
-  # Avg
-  #
-  if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-    #
-    # Avg
-    #
-    specialize qw/aom_avg_8x8 sse2 neon msa/;
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-      specialize qw/aom_highbd_subtract_block sse2/;
-    }
-
-    #
-    # Minmax
-    #
-    add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-    specialize qw/aom_minmax_8x8 sse2 neon/;
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-    }
-
-    add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-    specialize qw/aom_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
-
-    add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-    specialize qw/aom_hadamard_16x16 sse2 neon/;
-
-    add_proto qw/int aom_satd/, "const int16_t *coeff, int length";
-    specialize qw/aom_satd sse2 neon/;
-
-    add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, int ref_stride, int height";
-    specialize qw/aom_int_pro_row sse2 neon/;
-
-    add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, int width";
-    specialize qw/aom_int_pro_col sse2 neon/;
-
-    add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl";
-    specialize qw/aom_vector_var neon sse2/;
-  }  # CONFIG_AV1_ENCODER
-
-  #
   # Single block SAD / Single block Avg SAD
   #
   foreach (@block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
     add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+    add_proto qw/unsigned int/, "aom_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
   }
 
   specialize qw/aom_sad128x128    avx2          sse2/;
@@ -812,7 +625,59 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sad4x8_avg          msa sse2/;
   specialize qw/aom_sad4x4_avg          msa sse2/;
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+  specialize qw/aom_sad4x16      sse2/;
+  specialize qw/aom_sad16x4      sse2/;
+  specialize qw/aom_sad8x32      sse2/;
+  specialize qw/aom_sad32x8      sse2/;
+  specialize qw/aom_sad16x64     sse2/;
+  specialize qw/aom_sad64x16     sse2/;
+
+  specialize qw/aom_sad4x16_avg  sse2/;
+  specialize qw/aom_sad16x4_avg  sse2/;
+  specialize qw/aom_sad8x32_avg  sse2/;
+  specialize qw/aom_sad32x8_avg  sse2/;
+  specialize qw/aom_sad16x64_avg sse2/;
+  specialize qw/aom_sad64x16_avg sse2/;
+
+  specialize qw/aom_jnt_sad128x128_avg ssse3/;
+  specialize qw/aom_jnt_sad128x64_avg  ssse3/;
+  specialize qw/aom_jnt_sad64x128_avg  ssse3/;
+  specialize qw/aom_jnt_sad64x64_avg   ssse3/;
+  specialize qw/aom_jnt_sad64x32_avg   ssse3/;
+  specialize qw/aom_jnt_sad32x64_avg   ssse3/;
+  specialize qw/aom_jnt_sad32x32_avg   ssse3/;
+  specialize qw/aom_jnt_sad32x16_avg   ssse3/;
+  specialize qw/aom_jnt_sad16x32_avg   ssse3/;
+  specialize qw/aom_jnt_sad16x16_avg   ssse3/;
+  specialize qw/aom_jnt_sad16x8_avg    ssse3/;
+  specialize qw/aom_jnt_sad8x16_avg    ssse3/;
+  specialize qw/aom_jnt_sad8x8_avg     ssse3/;
+  specialize qw/aom_jnt_sad8x4_avg     ssse3/;
+  specialize qw/aom_jnt_sad4x8_avg     ssse3/;
+  specialize qw/aom_jnt_sad4x4_avg     ssse3/;
+
+  specialize qw/aom_jnt_sad4x16_avg     ssse3/;
+  specialize qw/aom_jnt_sad16x4_avg     ssse3/;
+  specialize qw/aom_jnt_sad8x32_avg     ssse3/;
+  specialize qw/aom_jnt_sad32x8_avg     ssse3/;
+  specialize qw/aom_jnt_sad16x64_avg     ssse3/;
+  specialize qw/aom_jnt_sad64x16_avg     ssse3/;
+
+  add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+  add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+  add_proto qw/unsigned int/, "aom_sad16xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+  add_proto qw/unsigned int/, "aom_sad32xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+  add_proto qw/unsigned int/, "aom_sad64xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+  add_proto qw/unsigned int/, "aom_sad128xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+
+  specialize qw/aom_sad4xh   sse2/;
+  specialize qw/aom_sad8xh   sse2/;
+  specialize qw/aom_sad16xh  sse2/;
+  specialize qw/aom_sad32xh  sse2/;
+  specialize qw/aom_sad64xh  sse2/;
+  specialize qw/aom_sad128xh sse2/;
+
+
     foreach (@block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
@@ -821,31 +686,45 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
         specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
         specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/;
       }
+      add_proto qw/unsigned int/, "aom_highbd_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
     }
     specialize qw/aom_highbd_sad128x128 avx2/;
     specialize qw/aom_highbd_sad128x64  avx2/;
     specialize qw/aom_highbd_sad64x128  avx2/;
-    specialize qw/aom_highbd_sad64x64   avx2/;
-    specialize qw/aom_highbd_sad64x32   avx2/;
-    specialize qw/aom_highbd_sad32x64   avx2/;
-    specialize qw/aom_highbd_sad32x32   avx2/;
-    specialize qw/aom_highbd_sad32x16   avx2/;
-    specialize qw/aom_highbd_sad16x32   avx2/;
-    specialize qw/aom_highbd_sad16x16   avx2/;
-    specialize qw/aom_highbd_sad16x8    avx2/;
+    specialize qw/aom_highbd_sad64x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad64x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x16   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x16   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x8    avx2 sse2/;
+    specialize qw/aom_highbd_sad8x4     sse2/;
 
     specialize qw/aom_highbd_sad128x128_avg avx2/;
     specialize qw/aom_highbd_sad128x64_avg  avx2/;
     specialize qw/aom_highbd_sad64x128_avg  avx2/;
-    specialize qw/aom_highbd_sad64x64_avg   avx2/;
-    specialize qw/aom_highbd_sad64x32_avg   avx2/;
-    specialize qw/aom_highbd_sad32x64_avg   avx2/;
-    specialize qw/aom_highbd_sad32x32_avg   avx2/;
-    specialize qw/aom_highbd_sad32x16_avg   avx2/;
-    specialize qw/aom_highbd_sad16x32_avg   avx2/;
-    specialize qw/aom_highbd_sad16x16_avg   avx2/;
-    specialize qw/aom_highbd_sad16x8_avg    avx2/;
-  }
+    specialize qw/aom_highbd_sad64x64_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad64x32_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x64_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x32_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x16_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x32_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x16_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x8_avg    avx2 sse2/;
+    specialize qw/aom_highbd_sad8x4_avg     sse2/;
+
+    specialize qw/aom_highbd_sad16x4       sse2/;
+    specialize qw/aom_highbd_sad8x32       sse2/;
+    specialize qw/aom_highbd_sad32x8       sse2/;
+    specialize qw/aom_highbd_sad16x64      sse2/;
+    specialize qw/aom_highbd_sad64x16      sse2/;
+
+    specialize qw/aom_highbd_sad16x4_avg   sse2/;
+    specialize qw/aom_highbd_sad8x32_avg   sse2/;
+    specialize qw/aom_highbd_sad32x8_avg   sse2/;
+    specialize qw/aom_highbd_sad16x64_avg  sse2/;
+    specialize qw/aom_highbd_sad64x16_avg  sse2/;
 
   #
   # Masked SAD
@@ -856,90 +735,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     specialize "aom_masked_sad${w}x${h}", qw/ssse3/;
   }
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+
     foreach (@block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
       specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/;
     }
-  }
+
 
   #
   # OBMC SAD
   #
-  if (aom_config("CONFIG_MOTION_VAR") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+    if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
+       specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/;
+    }
+  }
+
+
     foreach (@block_sizes) {
       ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+      add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
       if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-         specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/;
-      }
-    }
-
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      foreach (@block_sizes) {
-        ($w, $h) = @$_;
-        add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
-        if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-          specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
-        }
+        specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
       }
     }
-  }
 
-  #
-  # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-  #
-  # Blocks of 3
-  foreach $s (@block_widths) {
-    add_proto qw/void/, "aom_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  }
-  specialize qw/aom_sad64x64x3            msa/;
-  specialize qw/aom_sad32x32x3            msa/;
-  specialize qw/aom_sad16x16x3 sse3 ssse3 msa/;
-  specialize qw/aom_sad8x8x3   sse3       msa/;
-  specialize qw/aom_sad4x4x3   sse3       msa/;
-
-  add_proto qw/void/, "aom_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad16x8x3 sse3 ssse3 msa/;
-  add_proto qw/void/, "aom_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad8x16x3 sse3 msa/;
-
-  # Blocks of 8
-  foreach $s (@block_widths) {
-    add_proto qw/void/, "aom_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  }
-  specialize qw/aom_sad64x64x8        msa/;
-  specialize qw/aom_sad32x32x8        msa/;
-  specialize qw/aom_sad16x16x8 sse4_1 msa/;
-  specialize qw/aom_sad8x8x8   sse4_1 msa/;
-  specialize qw/aom_sad4x4x8   sse4_1 msa/;
-
-  add_proto qw/void/, "aom_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad16x8x8 sse4_1 msa/;
-  add_proto qw/void/, "aom_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad8x16x8 sse4_1 msa/;
-  add_proto qw/void/, "aom_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad8x4x8 msa/;
-  add_proto qw/void/, "aom_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad4x8x8 msa/;
-
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    foreach $s (@block_widths) {
-      # Blocks of 3
-      add_proto qw/void/, "aom_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-      # Blocks of 8
-      add_proto qw/void/, "aom_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    }
-    # Blocks of 3
-    add_proto qw/void/, "aom_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    add_proto qw/void/, "aom_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    # Blocks of 8
-    add_proto qw/void/, "aom_highbd_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    add_proto qw/void/, "aom_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    add_proto qw/void/, "aom_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    add_proto qw/void/, "aom_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  }
 
   #
   # Multi-block SAD, comparing a reference to N independent blocks
@@ -966,29 +789,47 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sad4x8x4d               msa sse2/;
   specialize qw/aom_sad4x4x4d               msa sse2/;
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    #
-    # Multi-block SAD, comparing a reference to N independent blocks
-    #
-    foreach (@block_sizes) {
-      ($w, $h) = @$_;
-      add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-      if ($w != 128 && $h != 128) {
-        specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
-      }
+  specialize qw/aom_sad4x16x4d  sse2/;
+  specialize qw/aom_sad16x4x4d  sse2/;
+  specialize qw/aom_sad8x32x4d  sse2/;
+  specialize qw/aom_sad32x8x4d  sse2/;
+  specialize qw/aom_sad16x64x4d sse2/;
+  specialize qw/aom_sad64x16x4d sse2/;
+
+  #
+  # Multi-block SAD, comparing a reference to N independent blocks
+  #
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+    if ($w != 128 && $h != 128) {
+      specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
     }
-    specialize qw/aom_highbd_sad128x128x4d avx2/;
-    specialize qw/aom_highbd_sad128x64x4d  avx2/;
-    specialize qw/aom_highbd_sad64x128x4d  avx2/;
-    specialize qw/aom_highbd_sad64x64x4d   avx2/;
-    specialize qw/aom_highbd_sad64x32x4d   avx2/;
-    specialize qw/aom_highbd_sad32x64x4d   avx2/;
-    specialize qw/aom_highbd_sad32x32x4d   avx2/;
-    specialize qw/aom_highbd_sad32x16x4d   avx2/;
-    specialize qw/aom_highbd_sad16x32x4d   avx2/;
-    specialize qw/aom_highbd_sad16x16x4d   avx2/;
-    specialize qw/aom_highbd_sad16x8x4d    avx2/;
   }
+  specialize qw/aom_highbd_sad128x128x4d avx2/;
+  specialize qw/aom_highbd_sad128x64x4d  avx2/;
+  specialize qw/aom_highbd_sad64x128x4d  avx2/;
+  specialize qw/aom_highbd_sad64x64x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad64x32x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad32x64x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad32x32x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad32x16x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad16x32x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad16x16x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad16x8x4d    sse2 avx2/;
+  specialize qw/aom_highbd_sad8x16x4d    sse2/;
+  specialize qw/aom_highbd_sad8x8x4d     sse2/;
+  specialize qw/aom_highbd_sad8x4x4d     sse2/;
+  specialize qw/aom_highbd_sad4x8x4d     sse2/;
+  specialize qw/aom_highbd_sad4x4x4d     sse2/;
+
+  specialize qw/aom_highbd_sad4x16x4d  sse2/;
+  specialize qw/aom_highbd_sad16x4x4d  sse2/;
+  specialize qw/aom_highbd_sad8x32x4d  sse2/;
+  specialize qw/aom_highbd_sad32x8x4d  sse2/;
+  specialize qw/aom_highbd_sad16x64x4d sse2/;
+  specialize qw/aom_highbd_sad64x16x4d sse2/;
+
 
   #
   # Structured Similarity (SSIM)
@@ -1000,9 +841,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
     specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
 
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    }
+    add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+
   }
 }  # CONFIG_AV1_ENCODER
 
@@ -1015,8 +855,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
 
-  specialize qw/aom_get16x16var sse2 avx2 neon msa/;
-  specialize qw/aom_get8x8var   sse2      neon msa/;
+  specialize qw/aom_get16x16var           neon msa/;
+  specialize qw/aom_get8x8var             neon msa/;
 
 
   add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
@@ -1029,7 +869,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_mse8x16           sse2           msa/;
   specialize qw/aom_mse8x8            sse2           msa/;
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
     foreach $bd (8, 10, 12) {
       add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
       add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
@@ -1042,25 +881,48 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
       specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
       specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
     }
-  }
+
 
   #
-  # ...
   #
-  add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride";
+  #
+  add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                          const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
+                                          int subpel_y_q3, const uint8_t *ref, int ref_stride";
   specialize qw/aom_upsampled_pred sse2/;
-  add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride";
+
+  add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                   const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+                                                   int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                                                   int ref_stride";
   specialize qw/aom_comp_avg_upsampled_pred sse2/;
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd";
-    specialize qw/aom_highbd_upsampled_pred sse2/;
-    add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd";
-    specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
-  }
+  add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                                                       int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+  specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
 
+
+  add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                 const MV *const mv, uint16_t *comp_pred, int width, int height, int subpel_x_q3,
+                                                 int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd";
+  specialize qw/aom_highbd_upsampled_pred sse2/;
+
+  add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                          const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+                                                          int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd";
+  specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
+
+  add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                              const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+                                                              int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+                                                              int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param";
+  specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/;
+
+
+  #
   #
-  # ...
   #
   add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
   add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
@@ -1082,27 +944,33 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
     add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
   }
-
+  specialize qw/aom_variance128x128   sse2 avx2         /;
+  specialize qw/aom_variance128x64    sse2 avx2         /;
+  specialize qw/aom_variance64x128    sse2 avx2         /;
   specialize qw/aom_variance64x64     sse2 avx2 neon msa/;
   specialize qw/aom_variance64x32     sse2 avx2 neon msa/;
-  specialize qw/aom_variance32x64     sse2      neon msa/;
+  specialize qw/aom_variance32x64     sse2 avx2 neon msa/;
   specialize qw/aom_variance32x32     sse2 avx2 neon msa/;
   specialize qw/aom_variance32x16     sse2 avx2 msa/;
-  specialize qw/aom_variance16x32     sse2      msa/;
+  specialize qw/aom_variance16x32     sse2 avx2 msa/;
   specialize qw/aom_variance16x16     sse2 avx2 neon msa/;
-  specialize qw/aom_variance16x8      sse2      neon msa/;
+  specialize qw/aom_variance16x8      sse2 avx2 neon msa/;
   specialize qw/aom_variance8x16      sse2      neon msa/;
   specialize qw/aom_variance8x8       sse2      neon msa/;
   specialize qw/aom_variance8x4       sse2           msa/;
   specialize qw/aom_variance4x8       sse2           msa/;
   specialize qw/aom_variance4x4       sse2           msa/;
 
+  specialize qw/aom_sub_pixel_variance128x128   avx2          sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance128x64    avx2          sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance64x128    avx2          sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance64x64     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x32               msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x64               msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance64x32     avx2      msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance32x64     avx2      msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance32x32     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x16               msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance32x16     avx2      msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance16x32               msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance16x16          neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance16x8                msa sse2 ssse3/;
@@ -1112,73 +980,100 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sub_pixel_variance4x8                 msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance4x4                 msa sse2 ssse3/;
 
-  specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x32      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x64      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x16      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x32      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x16      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x8       msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x16       msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x8        msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x4        msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x8        msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x4        msa sse2 ssse3/;
-
-  if (aom_config("CONFIG_EXT_PARTITION_TYPES") eq "yes") {
-    specialize qw/aom_variance4x16 sse2/;
-    specialize qw/aom_variance16x4 sse2/;
-    specialize qw/aom_variance8x32 sse2/;
-    specialize qw/aom_variance32x8 sse2/;
-    specialize qw/aom_variance16x64 sse2/;
-    specialize qw/aom_variance64x16 sse2/;
-    specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
-  }
-
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    foreach $bd (8, 10, 12) {
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_sub_pixel_avg_variance128x128 avx2     sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance128x64  avx2     sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance64x128  avx2     sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance64x64   avx2 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance64x32   avx2 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance32x64   avx2 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance32x32   avx2 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance32x16   avx2 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x32        msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x16        msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x8         msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance8x16         msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance8x8          msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance8x4          msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance4x8          msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance4x4          msa sse2 ssse3/;
+
+  specialize qw/aom_variance4x16 sse2/;
+  specialize qw/aom_variance16x4 sse2 avx2/;
+  specialize qw/aom_variance8x32 sse2/;
+  specialize qw/aom_variance32x8 sse2 avx2/;
+  specialize qw/aom_variance16x64 sse2 avx2/;
+  specialize qw/aom_variance64x16 sse2 avx2/;
+  specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
+
+  specialize qw/aom_jnt_sub_pixel_avg_variance64x64 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance64x32 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance32x64 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance32x32 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance32x16 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance16x32 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance16x16 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance16x8  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance8x16  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance8x8   ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance8x4   ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance4x8   ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance4x4   ssse3/;
+
+  specialize qw/aom_jnt_sub_pixel_avg_variance4x16  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance16x4  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance8x32  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance32x8  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance16x64 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance64x16 ssse3/;
+
+  specialize qw/aom_jnt_sub_pixel_avg_variance128x128  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance128x64   ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance64x128   ssse3/;
+
+
+  foreach $bd (8, 10, 12) {
+    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-      foreach (@block_sizes) {
-        ($w, $h) = @$_;
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-        add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-        add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-        if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
-          specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
-        }
-        # TODO(david.barker): When ext-partition-types is enabled, we currently
-        # don't have vectorized 4x16 highbd variance functions
-        if ($w == 4 && $h == 4) {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
+        specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
+      }
+      # TODO(david.barker): When ext-partition-types is enabled, we currently
+      # don't have vectorized 4x16 highbd variance functions
+      if ($w == 4 && $h == 4) {
           specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
         }
-        if ($w != 128 && $h != 128 && $w != 4) {
-          specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
-          specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
-        }
-        if ($w == 4 && $h == 4) {
-          specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
-          specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
-        }
+      if ($w != 128 && $h != 128 && $w != 4) {
+        specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
+      }
+      if ($w == 4 && $h == 4) {
+        specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
       }
+
+      add_proto qw/uint32_t/, "aom_highbd_${bd}_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
     }
-  }  # CONFIG_HIGHBITDEPTH
+  }
 
   #
   # Masked Variance / Masked Subpixel Variance
@@ -1189,7 +1084,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
   }
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+
     foreach $bd ("_8_", "_10_", "_12_") {
       foreach (@block_sizes) {
         ($w, $h) = @$_;
@@ -1197,30 +1092,28 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
         specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
       }
     }
-  }
+
 
   #
   # OBMC Variance / OBMC Subpixel Variance
   #
-  if (aom_config("CONFIG_MOTION_VAR") eq "yes") {
-    foreach (@block_sizes) {
-      ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-      add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-      specialize "aom_obmc_variance${w}x${h}", q/sse4_1/;
-    }
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+    add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+    specialize "aom_obmc_variance${w}x${h}", q/sse4_1/;
+  }
 
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      foreach $bd ("_", "_10_", "_12_") {
-        foreach (@block_sizes) {
-          ($w, $h) = @$_;
-          add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-          add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-          specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
-        }
+
+    foreach $bd ("_", "_10_", "_12_") {
+      foreach (@block_sizes) {
+        ($w, $h) = @$_;
+        add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+        add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+        specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
       }
     }
-  }
+
 
   add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
@@ -1260,7 +1153,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
-
   #
   # Specialty Subpixel
   #
@@ -1277,7 +1169,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   # Comp Avg
   #
   add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+
+  add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+  specialize qw/aom_jnt_comp_avg_pred ssse3/;
+
+
     add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_12_variance64x64 sse2/;
 
@@ -1415,6 +1311,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
     add_proto qw/void aom_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
 
+    add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+    specialize qw/aom_highbd_jnt_comp_avg_pred sse2/;
+
     #
     # Subpixel Variance
     #
@@ -1634,14 +1533,15 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
     add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 
-  }  # CONFIG_HIGHBITDEPTH
+
 
   add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-  add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-    add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd";
-  }
+  specialize qw/aom_comp_mask_pred ssse3 avx2/;
+
+  add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+  add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+                                                           int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd";
+
 
 }  # CONFIG_AV1_ENCODER
 
diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h
index 58e8bb284..fd4f51b29 100644
--- a/third_party/aom/aom_dsp/aom_filter.h
+++ b/third_party/aom/aom_dsp/aom_filter.h
@@ -31,6 +31,13 @@ extern "C" {
 #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
 #define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
 
+#define RS_SUBPEL_BITS 6
+#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1)
+#define RS_SCALE_SUBPEL_BITS 14
+#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
+#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
+#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1))
+
 typedef int16_t InterpKernel[SUBPEL_TAPS];
 
 #define BIL_SUBPEL_BITS 3
diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h
index 469fd8ed2..392b36627 100644
--- a/third_party/aom/aom_dsp/aom_simd.h
+++ b/third_party/aom/aom_dsp/aom_simd.h
@@ -18,8 +18,9 @@
 #include <intrin.h>
 #endif
 
-#include "./aom_config.h"
-#include "./aom_simd_inline.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_simd_inline.h"
 
 #define SIMD_CHECK 1  // Sanity checks in C equivalents
 
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c
deleted file mode 100644
index 09429d6d2..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c
+++ /dev/null
@@ -1,364 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
-                                       int16x4_t dsrc2, int16x4_t dsrc3,
-                                       int16x4_t dsrc4, int16x4_t dsrc5,
-                                       int16x4_t dsrc6, int16x4_t dsrc7,
-                                       int16x8_t q0s16) {
-  int32x4_t qdst;
-  int16x4_t d0s16, d1s16;
-
-  d0s16 = vget_low_s16(q0s16);
-  d1s16 = vget_high_s16(q0s16);
-
-  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
-  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
-  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
-  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
-  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
-  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
-  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
-  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
-  return qdst;
-}
-
-void aom_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y,  // unused
-                                  int y_step_q4,            // unused
-                                  int w, int h) {
-  int width;
-  const uint8_t *s;
-  uint8_t *d;
-  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
-  uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
-  uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16;
-  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
-  int16x8_t q0s16;
-  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
-  int32x4_t q1s32, q2s32, q14s32, q15s32;
-  uint16x8x2_t q0x2u16;
-  uint8x8x2_t d0x2u8, d1x2u8;
-  uint32x2x2_t d0x2u32;
-  uint16x4x2_t d0x2u16, d1x2u16;
-  uint32x4x2_t q0x2u32;
-
-  assert(x_step_q4 == 16);
-
-  (void)x_step_q4;
-  (void)y_step_q4;
-  (void)filter_y;
-
-  q0s16 = vld1q_s16(filter_x);
-
-  src -= 3;                // adjust for taps
-  for (; h > 0; h -= 4) {  // loop_horiz_v
-    s = src;
-    d24u8 = vld1_u8(s);
-    s += src_stride;
-    d25u8 = vld1_u8(s);
-    s += src_stride;
-    d26u8 = vld1_u8(s);
-    s += src_stride;
-    d27u8 = vld1_u8(s);
-
-    q12u8 = vcombine_u8(d24u8, d25u8);
-    q13u8 = vcombine_u8(d26u8, d27u8);
-
-    q0x2u16 =
-        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
-    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
-    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
-    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
-    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
-    d0x2u8 = vtrn_u8(d24u8, d25u8);
-    d1x2u8 = vtrn_u8(d26u8, d27u8);
-
-    __builtin_prefetch(src + src_stride * 4);
-    __builtin_prefetch(src + src_stride * 5);
-
-    q8u16 = vmovl_u8(d0x2u8.val[0]);
-    q9u16 = vmovl_u8(d0x2u8.val[1]);
-    q10u16 = vmovl_u8(d1x2u8.val[0]);
-    q11u16 = vmovl_u8(d1x2u8.val[1]);
-
-    src += 7;
-    d16u16 = vget_low_u16(q8u16);
-    d17u16 = vget_high_u16(q8u16);
-    d18u16 = vget_low_u16(q9u16);
-    d19u16 = vget_high_u16(q9u16);
-    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
-    q9u16 = vcombine_u16(d17u16, d19u16);
-
-    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));         // vmov 23 21
-    for (width = w; width > 0; width -= 4, src += 4, dst += 4) {  // loop_horiz
-      s = src;
-      d28u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d29u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d31u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d30u32 = vld1_dup_u32((const uint32_t *)s);
-
-      __builtin_prefetch(src + 64);
-
-      d0x2u16 =
-          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
-      d1x2u16 =
-          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
-      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
-                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
-      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
-                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
-
-      __builtin_prefetch(src + 64 + src_stride);
-
-      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
-      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
-      q0x2u32 =
-          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
-
-      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
-      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
-      q12u16 = vmovl_u8(d28u8);
-      q13u16 = vmovl_u8(d29u8);
-
-      __builtin_prefetch(src + 64 + src_stride * 2);
-
-      d = dst;
-      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
-      d += dst_stride;
-      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
-      d += dst_stride;
-      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
-      d += dst_stride;
-      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
-
-      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
-      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
-      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
-      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
-      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
-      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
-                             d23s16, d24s16, q0s16);
-      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
-                             d24s16, d26s16, q0s16);
-      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
-                              d26s16, d27s16, q0s16);
-      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
-                              d27s16, d25s16, q0s16);
-
-      __builtin_prefetch(src + 64 + src_stride * 3);
-
-      d2u16 = vqrshrun_n_s32(q1s32, 7);
-      d3u16 = vqrshrun_n_s32(q2s32, 7);
-      d4u16 = vqrshrun_n_s32(q14s32, 7);
-      d5u16 = vqrshrun_n_s32(q15s32, 7);
-
-      q1u16 = vcombine_u16(d2u16, d3u16);
-      q2u16 = vcombine_u16(d4u16, d5u16);
-
-      d2u8 = vqmovn_u16(q1u16);
-      d3u8 = vqmovn_u16(q2u16);
-
-      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
-      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
-                         vreinterpret_u32_u16(d0x2u16.val[1]));
-      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
-                       vreinterpret_u8_u32(d0x2u32.val[1]));
-
-      q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
-      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
-      q1u8 = vrhaddq_u8(q1u8, q3u8);
-
-      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
-      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
-
-      d = dst;
-      vst1_lane_u32((uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 1);
-
-      q8u16 = q9u16;
-      d20s16 = d23s16;
-      q11u16 = q12u16;
-      q9u16 = q13u16;
-      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    }
-    src += src_stride * 4 - w - 7;
-    dst += dst_stride * 4 - w;
-  }
-  return;
-}
-
-void aom_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x,  // unused
-                                 int x_step_q4,            // unused
-                                 const int16_t *filter_y, int y_step_q4, int w,
-                                 int h) {
-  int height;
-  const uint8_t *s;
-  uint8_t *d;
-  uint8x8_t d2u8, d3u8;
-  uint32x2_t d2u32, d3u32, d6u32, d7u32;
-  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
-  uint8x16_t q1u8, q3u8;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16;
-  uint16x4_t d2u16, d3u16, d4u16, d5u16;
-  int16x8_t q0s16;
-  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
-  int32x4_t q1s32, q2s32, q14s32, q15s32;
-
-  assert(y_step_q4 == 16);
-
-  (void)x_step_q4;
-  (void)y_step_q4;
-  (void)filter_x;
-
-  src -= src_stride * 3;
-  q0s16 = vld1q_s16(filter_y);
-  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
-    s = src;
-    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
-    s += src_stride;
-    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
-    s += src_stride;
-    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
-    s += src_stride;
-    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
-    s += src_stride;
-    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
-    s += src_stride;
-    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
-    s += src_stride;
-    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
-    s += src_stride;
-
-    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
-    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
-    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
-    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
-    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
-    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d = dst;
-    for (height = h; height > 0; height -= 4) {  // loop_vert
-      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
-      s += src_stride;
-      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
-      s += src_stride;
-      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
-      s += src_stride;
-      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
-      s += src_stride;
-
-      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
-      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
-
-      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
-      d += dst_stride;
-      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
-      d += dst_stride;
-      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
-      d += dst_stride;
-      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
-      d -= dst_stride * 3;
-
-      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
-      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
-      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
-      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
-      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
-      __builtin_prefetch(s);
-      __builtin_prefetch(s + src_stride);
-      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
-                             d22s16, d24s16, q0s16);
-      __builtin_prefetch(s + src_stride * 2);
-      __builtin_prefetch(s + src_stride * 3);
-      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
-                             d24s16, d26s16, q0s16);
-      __builtin_prefetch(d);
-      __builtin_prefetch(d + dst_stride);
-      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
-                              d26s16, d27s16, q0s16);
-      __builtin_prefetch(d + dst_stride * 2);
-      __builtin_prefetch(d + dst_stride * 3);
-      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
-                              d27s16, d25s16, q0s16);
-
-      d2u16 = vqrshrun_n_s32(q1s32, 7);
-      d3u16 = vqrshrun_n_s32(q2s32, 7);
-      d4u16 = vqrshrun_n_s32(q14s32, 7);
-      d5u16 = vqrshrun_n_s32(q15s32, 7);
-
-      q1u16 = vcombine_u16(d2u16, d3u16);
-      q2u16 = vcombine_u16(d4u16, d5u16);
-
-      d2u8 = vqmovn_u16(q1u16);
-      d3u8 = vqmovn_u16(q2u16);
-
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
-      q1u8 = vrhaddq_u8(q1u8, q3u8);
-
-      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
-      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
-
-      vst1_lane_u32((uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 1);
-      d += dst_stride;
-
-      q8u16 = q10u16;
-      d18s16 = d22s16;
-      d19s16 = d24s16;
-      q10u16 = q13u16;
-      d22s16 = d25s16;
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm
deleted file mode 100644
index 80aef992d..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-    ; These functions are only valid when:
-    ; x_step_q4 == 16
-    ; w%4 == 0
-    ; h%4 == 0
-    ; taps == 8
-    ; AV1_FILTER_WEIGHT == 128
-    ; AV1_FILTER_SHIFT == 7
-
-    EXPORT  |aom_convolve8_avg_horiz_neon|
-    EXPORT  |aom_convolve8_avg_vert_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Multiply and accumulate by q0
-    MACRO
-    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
-    vmull.s16 $dst, $src0, d0[0]
-    vmlal.s16 $dst, $src1, d0[1]
-    vmlal.s16 $dst, $src2, d0[2]
-    vmlal.s16 $dst, $src3, d0[3]
-    vmlal.s16 $dst, $src4, d1[0]
-    vmlal.s16 $dst, $src5, d1[1]
-    vmlal.s16 $dst, $src6, d1[2]
-    vmlal.s16 $dst, $src7, d1[3]
-    MEND
-
-; r0    const uint8_t *src
-; r1    int src_stride
-; r2    uint8_t *dst
-; r3    int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4           ; unused
-; sp[]int w
-; sp[]int h
-
-|aom_convolve8_avg_horiz_neon| PROC
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              ; adjust for taps
-
-    ldr             r5, [sp, #32]           ; filter_x
-    ldr             r6, [sp, #48]           ; w
-    ldr             r7, [sp, #52]           ; h
-
-    vld1.s16        {q0}, [r5]              ; filter_x
-
-    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
-    add             r8, r8, #4              ; -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
-    add             r4, r4, #4              ; -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
-
-    mov             r10, r6                 ; w loop counter
-
-aom_convolve8_avg_loop_horiz_v
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    ; save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-aom_convolve8_avg_loop_horiz
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    ; extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    ; slightly out of order load to match the existing data
-    vld1.u32        {d6[0]}, [r2], r3
-    vld1.u32        {d7[0]}, [r2], r3
-    vld1.u32        {d6[1]}, [r2], r3
-    vld1.u32        {d7[1]}, [r2], r3
-
-    sub             r2, r2, r3, lsl #2      ; reset for store
-
-    ; src[] * filter_x
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    ; average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    vst1.u32        {d2[0]}, [r2@32], r3
-    vst1.u32        {d3[0]}, [r2@32], r3
-    vst1.u32        {d2[1]}, [r2@32], r3
-    vst1.u32        {d3[1]}, [r2@32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              ; w -= 4
-    bgt             aom_convolve8_avg_loop_horiz
-
-    ; outer loop
-    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r9              ; src += src_stride * 4 - w
-    add             r2, r2, r12             ; dst += dst_stride * 4 - w
-    subs            r7, r7, #4              ; h -= 4
-    bgt aom_convolve8_avg_loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-    ENDP
-
-|aom_convolve8_avg_vert_neon| PROC
-    push            {r4-r8, lr}
-
-    ; adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #32]           ; filter_y
-    ldr             r6, [sp, #40]           ; w
-    ldr             lr, [sp, #44]           ; h
-
-    vld1.s16        {q0}, [r4]              ; filter_y
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-aom_convolve8_avg_loop_vert_h
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 ; h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-aom_convolve8_avg_loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    ; extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    vld1.u32        {d6[0]}, [r5@32], r3
-    vld1.u32        {d6[1]}, [r8@32], r3
-    vld1.u32        {d7[0]}, [r5@32], r3
-    vld1.u32        {d7[1]}, [r8@32], r3
-
-    pld             [r7]
-    pld             [r4]
-
-    ; src[] * filter_y
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    sub             r5, r5, r3, lsl #1      ; reset for store
-    sub             r8, r8, r3, lsl #1
-
-    vst1.u32        {d2[0]}, [r5@32], r3
-    vst1.u32        {d2[1]}, [r8@32], r3
-    vst1.u32        {d3[0]}, [r5@32], r3
-    vst1.u32        {d3[1]}, [r8@32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            ; h -= 4
-    bgt             aom_convolve8_avg_loop_vert
-
-    ; outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              ; w -= 4
-    bgt             aom_convolve8_avg_loop_vert_h
-
-    pop             {r4-r8, pc}
-
-    ENDP
-    END
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
deleted file mode 100644
index 8ebffb5f9..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
-                                       int16x4_t dsrc2, int16x4_t dsrc3,
-                                       int16x4_t dsrc4, int16x4_t dsrc5,
-                                       int16x4_t dsrc6, int16x4_t dsrc7,
-                                       int16x8_t q0s16) {
-  int32x4_t qdst;
-  int16x4_t d0s16, d1s16;
-
-  d0s16 = vget_low_s16(q0s16);
-  d1s16 = vget_high_s16(q0s16);
-
-  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
-  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
-  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
-  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
-  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
-  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
-  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
-  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
-  return qdst;
-}
-
-void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y,  // unused
-                              int y_step_q4,            // unused
-                              int w, int h) {
-  int width;
-  const uint8_t *s, *psrc;
-  uint8_t *d, *pdst;
-  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
-  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
-  uint8x16_t q12u8, q13u8, q14u8, q15u8;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16;
-  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
-  int16x8_t q0s16;
-  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
-  int32x4_t q1s32, q2s32, q14s32, q15s32;
-  uint16x8x2_t q0x2u16;
-  uint8x8x2_t d0x2u8, d1x2u8;
-  uint32x2x2_t d0x2u32;
-  uint16x4x2_t d0x2u16, d1x2u16;
-  uint32x4x2_t q0x2u32;
-
-  assert(x_step_q4 == 16);
-
-  (void)x_step_q4;
-  (void)y_step_q4;
-  (void)filter_y;
-
-  q0s16 = vld1q_s16(filter_x);
-
-  src -= 3;  // adjust for taps
-  for (; h > 0; h -= 4, src += src_stride * 4,
-                dst += dst_stride * 4) {  // loop_horiz_v
-    s = src;
-    d24u8 = vld1_u8(s);
-    s += src_stride;
-    d25u8 = vld1_u8(s);
-    s += src_stride;
-    d26u8 = vld1_u8(s);
-    s += src_stride;
-    d27u8 = vld1_u8(s);
-
-    q12u8 = vcombine_u8(d24u8, d25u8);
-    q13u8 = vcombine_u8(d26u8, d27u8);
-
-    q0x2u16 =
-        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
-    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
-    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
-    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
-    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
-    d0x2u8 = vtrn_u8(d24u8, d25u8);
-    d1x2u8 = vtrn_u8(d26u8, d27u8);
-
-    __builtin_prefetch(src + src_stride * 4);
-    __builtin_prefetch(src + src_stride * 5);
-    __builtin_prefetch(src + src_stride * 6);
-
-    q8u16 = vmovl_u8(d0x2u8.val[0]);
-    q9u16 = vmovl_u8(d0x2u8.val[1]);
-    q10u16 = vmovl_u8(d1x2u8.val[0]);
-    q11u16 = vmovl_u8(d1x2u8.val[1]);
-
-    d16u16 = vget_low_u16(q8u16);
-    d17u16 = vget_high_u16(q8u16);
-    d18u16 = vget_low_u16(q9u16);
-    d19u16 = vget_high_u16(q9u16);
-    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
-    q9u16 = vcombine_u16(d17u16, d19u16);
-
-    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
-    for (width = w, psrc = src + 7, pdst = dst; width > 0;
-         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
-      s = psrc;
-      d28u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d29u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d31u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d30u32 = vld1_dup_u32((const uint32_t *)s);
-
-      __builtin_prefetch(psrc + 64);
-
-      d0x2u16 =
-          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
-      d1x2u16 =
-          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
-      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
-                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
-      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
-                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
-
-      __builtin_prefetch(psrc + 64 + src_stride);
-
-      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
-      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
-      q0x2u32 =
-          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
-
-      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
-      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
-      q12u16 = vmovl_u8(d28u8);
-      q13u16 = vmovl_u8(d29u8);
-
-      __builtin_prefetch(psrc + 64 + src_stride * 2);
-
-      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
-      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
-      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
-      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
-      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
-      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
-                             d23s16, d24s16, q0s16);
-      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
-                             d24s16, d26s16, q0s16);
-      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
-                              d26s16, d27s16, q0s16);
-      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
-                              d27s16, d25s16, q0s16);
-
-      __builtin_prefetch(psrc + 60 + src_stride * 3);
-
-      d2u16 = vqrshrun_n_s32(q1s32, 7);
-      d3u16 = vqrshrun_n_s32(q2s32, 7);
-      d4u16 = vqrshrun_n_s32(q14s32, 7);
-      d5u16 = vqrshrun_n_s32(q15s32, 7);
-
-      q1u16 = vcombine_u16(d2u16, d3u16);
-      q2u16 = vcombine_u16(d4u16, d5u16);
-
-      d2u8 = vqmovn_u16(q1u16);
-      d3u8 = vqmovn_u16(q2u16);
-
-      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
-      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
-                         vreinterpret_u32_u16(d0x2u16.val[1]));
-      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
-                       vreinterpret_u8_u32(d0x2u32.val[1]));
-
-      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
-      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
-
-      d = pdst;
-      vst1_lane_u32((uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 1);
-
-      q8u16 = q9u16;
-      d20s16 = d23s16;
-      q11u16 = q12u16;
-      q9u16 = q13u16;
-      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    }
-  }
-  return;
-}
-
-void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x,  // unused
-                             int x_step_q4,            // unused
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  int height;
-  const uint8_t *s;
-  uint8_t *d;
-  uint32x2_t d2u32, d3u32;
-  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16;
-  uint16x4_t d2u16, d3u16, d4u16, d5u16;
-  int16x8_t q0s16;
-  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
-  int32x4_t q1s32, q2s32, q14s32, q15s32;
-
-  assert(y_step_q4 == 16);
-
-  (void)x_step_q4;
-  (void)y_step_q4;
-  (void)filter_x;
-
-  src -= src_stride * 3;
-  q0s16 = vld1q_s16(filter_y);
-  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
-    s = src;
-    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
-    s += src_stride;
-    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
-    s += src_stride;
-    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
-    s += src_stride;
-    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
-    s += src_stride;
-    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
-    s += src_stride;
-    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
-    s += src_stride;
-    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
-    s += src_stride;
-
-    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
-    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
-    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
-    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
-    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
-    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d = dst;
-    for (height = h; height > 0; height -= 4) {  // loop_vert
-      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
-      s += src_stride;
-      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
-      s += src_stride;
-      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
-      s += src_stride;
-      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
-      s += src_stride;
-
-      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
-      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
-
-      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
-      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
-      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
-      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
-      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
-      __builtin_prefetch(d);
-      __builtin_prefetch(d + dst_stride);
-      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
-                             d22s16, d24s16, q0s16);
-      __builtin_prefetch(d + dst_stride * 2);
-      __builtin_prefetch(d + dst_stride * 3);
-      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
-                             d24s16, d26s16, q0s16);
-      __builtin_prefetch(s);
-      __builtin_prefetch(s + src_stride);
-      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
-                              d26s16, d27s16, q0s16);
-      __builtin_prefetch(s + src_stride * 2);
-      __builtin_prefetch(s + src_stride * 3);
-      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
-                              d27s16, d25s16, q0s16);
-
-      d2u16 = vqrshrun_n_s32(q1s32, 7);
-      d3u16 = vqrshrun_n_s32(q2s32, 7);
-      d4u16 = vqrshrun_n_s32(q14s32, 7);
-      d5u16 = vqrshrun_n_s32(q15s32, 7);
-
-      q1u16 = vcombine_u16(d2u16, d3u16);
-      q2u16 = vcombine_u16(d4u16, d5u16);
-
-      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
-      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
-
-      vst1_lane_u32((uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 1);
-      d += dst_stride;
-
-      q8u16 = q10u16;
-      d18s16 = d22s16;
-      d19s16 = d24s16;
-      q10u16 = q13u16;
-      d22s16 = d25s16;
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm
deleted file mode 100644
index 38207d864..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm
+++ /dev/null
@@ -1,273 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-    ; These functions are only valid when:
-    ; x_step_q4 == 16
-    ; w%4 == 0
-    ; h%4 == 0
-    ; taps == 8
-    ; AV1_FILTER_WEIGHT == 128
-    ; AV1_FILTER_SHIFT == 7
-
-    EXPORT  |aom_convolve8_horiz_neon|
-    EXPORT  |aom_convolve8_vert_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Multiply and accumulate by q0
-    MACRO
-    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
-    vmull.s16 $dst, $src0, d0[0]
-    vmlal.s16 $dst, $src1, d0[1]
-    vmlal.s16 $dst, $src2, d0[2]
-    vmlal.s16 $dst, $src3, d0[3]
-    vmlal.s16 $dst, $src4, d1[0]
-    vmlal.s16 $dst, $src5, d1[1]
-    vmlal.s16 $dst, $src6, d1[2]
-    vmlal.s16 $dst, $src7, d1[3]
-    MEND
-
-; r0    const uint8_t *src
-; r1    int src_stride
-; r2    uint8_t *dst
-; r3    int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4           ; unused
-; sp[]int w
-; sp[]int h
-
-|aom_convolve8_horiz_neon| PROC
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              ; adjust for taps
-
-    ldr             r5, [sp, #32]           ; filter_x
-    ldr             r6, [sp, #48]           ; w
-    ldr             r7, [sp, #52]           ; h
-
-    vld1.s16        {q0}, [r5]              ; filter_x
-
-    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
-    add             r8, r8, #4              ; -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
-    add             r4, r4, #4              ; -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
-
-    mov             r10, r6                 ; w loop counter
-
-aom_convolve8_loop_horiz_v
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    ; save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-aom_convolve8_loop_horiz
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    ; extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    ; src[] * filter_x
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    vst1.u32        {d2[0]}, [r2@32], r3
-    vst1.u32        {d3[0]}, [r2@32], r3
-    vst1.u32        {d2[1]}, [r2@32], r3
-    vst1.u32        {d3[1]}, [r2@32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              ; w -= 4
-    bgt             aom_convolve8_loop_horiz
-
-    ; outer loop
-    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r9              ; src += src_stride * 4 - w
-    add             r2, r2, r12             ; dst += dst_stride * 4 - w
-    subs            r7, r7, #4              ; h -= 4
-    bgt aom_convolve8_loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-    ENDP
-
-|aom_convolve8_vert_neon| PROC
-    push            {r4-r8, lr}
-
-    ; adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #32]           ; filter_y
-    ldr             r6, [sp, #40]           ; w
-    ldr             lr, [sp, #44]           ; h
-
-    vld1.s16        {q0}, [r4]              ; filter_y
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-aom_convolve8_loop_vert_h
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 ; h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-aom_convolve8_loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    ; extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    ; src[] * filter_y
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r7]
-    pld             [r4]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    vst1.u32        {d2[0]}, [r5@32], r3
-    vst1.u32        {d2[1]}, [r8@32], r3
-    vst1.u32        {d3[0]}, [r5@32], r3
-    vst1.u32        {d3[1]}, [r8@32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            ; h -= 4
-    bgt             aom_convolve8_loop_vert
-
-    ; outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              ; w -= 4
-    bgt             aom_convolve8_loop_vert_h
-
-    pop             {r4-r8, pc}
-
-    ENDP
-    END
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c
deleted file mode 100644
index f05d3ceae..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-
-void aom_convolve_avg_neon(const uint8_t *src,    // r0
-                           ptrdiff_t src_stride,  // r1
-                           uint8_t *dst,          // r2
-                           ptrdiff_t dst_stride,  // r3
-                           const int16_t *filter_x, int filter_x_stride,
-                           const int16_t *filter_y, int filter_y_stride, int w,
-                           int h) {
-  uint8_t *d;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8;
-  uint32x2_t d0u32, d2u32;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  d = dst;
-  if (w > 32) {  // avg64
-    for (; h > 0; h -= 1) {
-      q0u8 = vld1q_u8(src);
-      q1u8 = vld1q_u8(src + 16);
-      q2u8 = vld1q_u8(src + 32);
-      q3u8 = vld1q_u8(src + 48);
-      src += src_stride;
-      q8u8 = vld1q_u8(d);
-      q9u8 = vld1q_u8(d + 16);
-      q10u8 = vld1q_u8(d + 32);
-      q11u8 = vld1q_u8(d + 48);
-      d += dst_stride;
-
-      q0u8 = vrhaddq_u8(q0u8, q8u8);
-      q1u8 = vrhaddq_u8(q1u8, q9u8);
-      q2u8 = vrhaddq_u8(q2u8, q10u8);
-      q3u8 = vrhaddq_u8(q3u8, q11u8);
-
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q1u8);
-      vst1q_u8(dst + 32, q2u8);
-      vst1q_u8(dst + 48, q3u8);
-      dst += dst_stride;
-    }
-  } else if (w == 32) {  // avg32
-    for (; h > 0; h -= 2) {
-      q0u8 = vld1q_u8(src);
-      q1u8 = vld1q_u8(src + 16);
-      src += src_stride;
-      q2u8 = vld1q_u8(src);
-      q3u8 = vld1q_u8(src + 16);
-      src += src_stride;
-      q8u8 = vld1q_u8(d);
-      q9u8 = vld1q_u8(d + 16);
-      d += dst_stride;
-      q10u8 = vld1q_u8(d);
-      q11u8 = vld1q_u8(d + 16);
-      d += dst_stride;
-
-      q0u8 = vrhaddq_u8(q0u8, q8u8);
-      q1u8 = vrhaddq_u8(q1u8, q9u8);
-      q2u8 = vrhaddq_u8(q2u8, q10u8);
-      q3u8 = vrhaddq_u8(q3u8, q11u8);
-
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q1u8);
-      dst += dst_stride;
-      vst1q_u8(dst, q2u8);
-      vst1q_u8(dst + 16, q3u8);
-      dst += dst_stride;
-    }
-  } else if (w > 8) {  // avg16
-    for (; h > 0; h -= 2) {
-      q0u8 = vld1q_u8(src);
-      src += src_stride;
-      q1u8 = vld1q_u8(src);
-      src += src_stride;
-      q2u8 = vld1q_u8(d);
-      d += dst_stride;
-      q3u8 = vld1q_u8(d);
-      d += dst_stride;
-
-      q0u8 = vrhaddq_u8(q0u8, q2u8);
-      q1u8 = vrhaddq_u8(q1u8, q3u8);
-
-      vst1q_u8(dst, q0u8);
-      dst += dst_stride;
-      vst1q_u8(dst, q1u8);
-      dst += dst_stride;
-    }
-  } else if (w == 8) {  // avg8
-    for (; h > 0; h -= 2) {
-      d0u8 = vld1_u8(src);
-      src += src_stride;
-      d1u8 = vld1_u8(src);
-      src += src_stride;
-      d2u8 = vld1_u8(d);
-      d += dst_stride;
-      d3u8 = vld1_u8(d);
-      d += dst_stride;
-
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      q0u8 = vrhaddq_u8(q0u8, q1u8);
-
-      vst1_u8(dst, vget_low_u8(q0u8));
-      dst += dst_stride;
-      vst1_u8(dst, vget_high_u8(q0u8));
-      dst += dst_stride;
-    }
-  } else {  // avg4
-    for (; h > 0; h -= 2) {
-      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
-      src += src_stride;
-      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
-      src += src_stride;
-      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-
-      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32));
-
-      d0u32 = vreinterpret_u32_u8(d0u8);
-      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
-      dst += dst_stride;
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm
deleted file mode 100644
index 43c300954..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_convolve_avg_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|aom_convolve_avg_neon| PROC
-    push                {r4-r6, lr}
-    ldrd                r4, r5, [sp, #32]
-    mov                 r6, r2
-
-    cmp                 r4, #32
-    bgt                 avg64
-    beq                 avg32
-    cmp                 r4, #8
-    bgt                 avg16
-    beq                 avg8
-    b                   avg4
-
-avg64
-    sub                 lr, r1, #32
-    sub                 r4, r3, #32
-avg64_h
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0-q1}, [r0]!
-    vld1.8              {q2-q3}, [r0], lr
-    pld                 [r2, r3]
-    vld1.8              {q8-q9},   [r6@128]!
-    vld1.8              {q10-q11}, [r6@128], r4
-    vrhadd.u8           q0, q0, q8
-    vrhadd.u8           q1, q1, q9
-    vrhadd.u8           q2, q2, q10
-    vrhadd.u8           q3, q3, q11
-    vst1.8              {q0-q1}, [r2@128]!
-    vst1.8              {q2-q3}, [r2@128], r4
-    subs                r5, r5, #1
-    bgt                 avg64_h
-    pop                 {r4-r6, pc}
-
-avg32
-    vld1.8              {q0-q1}, [r0], r1
-    vld1.8              {q2-q3}, [r0], r1
-    vld1.8              {q8-q9},   [r6@128], r3
-    vld1.8              {q10-q11}, [r6@128], r3
-    pld                 [r0]
-    vrhadd.u8           q0, q0, q8
-    pld                 [r0, r1]
-    vrhadd.u8           q1, q1, q9
-    pld                 [r6]
-    vrhadd.u8           q2, q2, q10
-    pld                 [r6, r3]
-    vrhadd.u8           q3, q3, q11
-    vst1.8              {q0-q1}, [r2@128], r3
-    vst1.8              {q2-q3}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 avg32
-    pop                 {r4-r6, pc}
-
-avg16
-    vld1.8              {q0}, [r0], r1
-    vld1.8              {q1}, [r0], r1
-    vld1.8              {q2}, [r6@128], r3
-    vld1.8              {q3}, [r6@128], r3
-    pld                 [r0]
-    pld                 [r0, r1]
-    vrhadd.u8           q0, q0, q2
-    pld                 [r6]
-    pld                 [r6, r3]
-    vrhadd.u8           q1, q1, q3
-    vst1.8              {q0}, [r2@128], r3
-    vst1.8              {q1}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 avg16
-    pop                 {r4-r6, pc}
-
-avg8
-    vld1.8              {d0}, [r0], r1
-    vld1.8              {d1}, [r0], r1
-    vld1.8              {d2}, [r6@64], r3
-    vld1.8              {d3}, [r6@64], r3
-    pld                 [r0]
-    pld                 [r0, r1]
-    vrhadd.u8           q0, q0, q1
-    pld                 [r6]
-    pld                 [r6, r3]
-    vst1.8              {d0}, [r2@64], r3
-    vst1.8              {d1}, [r2@64], r3
-    subs                r5, r5, #2
-    bgt                 avg8
-    pop                 {r4-r6, pc}
-
-avg4
-    vld1.32             {d0[0]}, [r0], r1
-    vld1.32             {d0[1]}, [r0], r1
-    vld1.32             {d2[0]}, [r6@32], r3
-    vld1.32             {d2[1]}, [r6@32], r3
-    vrhadd.u8           d0, d0, d2
-    vst1.32             {d0[0]}, [r2@32], r3
-    vst1.32             {d0[1]}, [r2@32], r3
-    subs                r5, r5, #2
-    bgt                 avg4
-    pop                 {r4-r6, pc}
-    ENDP
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
deleted file mode 100644
index 9e57c7176..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-
-void aom_convolve_copy_neon(const uint8_t *src,    // r0
-                            ptrdiff_t src_stride,  // r1
-                            uint8_t *dst,          // r2
-                            ptrdiff_t dst_stride,  // r3
-                            const int16_t *filter_x, int filter_x_stride,
-                            const int16_t *filter_y, int filter_y_stride, int w,
-                            int h) {
-  uint8x8_t d0u8, d2u8;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  if (w > 32) {  // copy64
-    for (; h > 0; h--) {
-      q0u8 = vld1q_u8(src);
-      q1u8 = vld1q_u8(src + 16);
-      q2u8 = vld1q_u8(src + 32);
-      q3u8 = vld1q_u8(src + 48);
-      src += src_stride;
-
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q1u8);
-      vst1q_u8(dst + 32, q2u8);
-      vst1q_u8(dst + 48, q3u8);
-      dst += dst_stride;
-    }
-  } else if (w == 32) {  // copy32
-    for (; h > 0; h -= 2) {
-      q0u8 = vld1q_u8(src);
-      q1u8 = vld1q_u8(src + 16);
-      src += src_stride;
-      q2u8 = vld1q_u8(src);
-      q3u8 = vld1q_u8(src + 16);
-      src += src_stride;
-
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q1u8);
-      dst += dst_stride;
-      vst1q_u8(dst, q2u8);
-      vst1q_u8(dst + 16, q3u8);
-      dst += dst_stride;
-    }
-  } else if (w > 8) {  // copy16
-    for (; h > 0; h -= 2) {
-      q0u8 = vld1q_u8(src);
-      src += src_stride;
-      q1u8 = vld1q_u8(src);
-      src += src_stride;
-
-      vst1q_u8(dst, q0u8);
-      dst += dst_stride;
-      vst1q_u8(dst, q1u8);
-      dst += dst_stride;
-    }
-  } else if (w == 8) {  // copy8
-    for (; h > 0; h -= 2) {
-      d0u8 = vld1_u8(src);
-      src += src_stride;
-      d2u8 = vld1_u8(src);
-      src += src_stride;
-
-      vst1_u8(dst, d0u8);
-      dst += dst_stride;
-      vst1_u8(dst, d2u8);
-      dst += dst_stride;
-    }
-  } else {  // copy4
-    for (; h > 0; h--) {
-      *(uint32_t *)dst = *(const uint32_t *)src;
-      src += src_stride;
-      dst += dst_stride;
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm
deleted file mode 100644
index 443d7178a..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm
+++ /dev/null
@@ -1,87 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_convolve_copy_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|aom_convolve_copy_neon| PROC
-    push                {r4-r5, lr}
-    ldrd                r4, r5, [sp, #28]
-
-    cmp                 r4, #32
-    bgt                 copy64
-    beq                 copy32
-    cmp                 r4, #8
-    bgt                 copy16
-    beq                 copy8
-    b                   copy4
-
-copy64
-    sub                 lr, r1, #32
-    sub                 r3, r3, #32
-copy64_h
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0-q1}, [r0]!
-    vld1.8              {q2-q3}, [r0], lr
-    vst1.8              {q0-q1}, [r2@128]!
-    vst1.8              {q2-q3}, [r2@128], r3
-    subs                r5, r5, #1
-    bgt                 copy64_h
-    pop                 {r4-r5, pc}
-
-copy32
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0-q1}, [r0], r1
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q2-q3}, [r0], r1
-    vst1.8              {q0-q1}, [r2@128], r3
-    vst1.8              {q2-q3}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 copy32
-    pop                 {r4-r5, pc}
-
-copy16
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0}, [r0], r1
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q1}, [r0], r1
-    vst1.8              {q0}, [r2@128], r3
-    vst1.8              {q1}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 copy16
-    pop                 {r4-r5, pc}
-
-copy8
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {d0}, [r0], r1
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {d2}, [r0], r1
-    vst1.8              {d0}, [r2@64], r3
-    vst1.8              {d2}, [r2@64], r3
-    subs                r5, r5, #2
-    bgt                 copy8
-    pop                 {r4-r5, pc}
-
-copy4
-    ldr                 r12, [r0], r1
-    str                 r12, [r2], r3
-    subs                r5, r5, #1
-    bgt                 copy4
-    pop                 {r4-r5, pc}
-    ENDP
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_neon.c
deleted file mode 100644
index 6c2997e04..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve_neon.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                        int w, int h) {
-  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
-   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
-   */
-  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
-
-  // Account for the vertical phase needing 3 lines prior and 4 lines post
-  int intermediate_height = h + 7;
-
-  assert(y_step_q4 == 16);
-  assert(x_step_q4 == 16);
-
-  /* Filter starting 3 lines back. The neon implementation will ignore the
-   * given height and filter a multiple of 4 lines. Since this goes in to
-   * the temp buffer which has lots of extra room and is subsequently discarded
-   * this is safe if somewhat less than ideal.
-   */
-  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w,
-                           intermediate_height);
-
-  /* Step into the temp buffer 3 lines to get the actual frame data */
-  aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
-                          x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
-                            int h) {
-  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
-  int intermediate_height = h + 7;
-
-  assert(y_step_q4 == 16);
-  assert(x_step_q4 == 16);
-
-  /* This implementation has the same issues as above. In addition, we only want
-   * to average the values after both passes.
-   */
-  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w,
-                           intermediate_height);
-  aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
-}
diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c
deleted file mode 100644
index 6ff760017..000000000
--- a/third_party/aom/aom_dsp/arm/avg_neon.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
-  const uint32x4_t a = vpaddlq_u16(v_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
-int aom_satd_neon(const int16_t *coeff, int length) {
-  const int16x4_t zero = vdup_n_s16(0);
-  int32x4_t accum = vdupq_n_s32(0);
-
-  do {
-    const int16x8_t src0 = vld1q_s16(coeff);
-    const int16x8_t src8 = vld1q_s16(coeff + 8);
-    accum = vabal_s16(accum, vget_low_s16(src0), zero);
-    accum = vabal_s16(accum, vget_high_s16(src0), zero);
-    accum = vabal_s16(accum, vget_low_s16(src8), zero);
-    accum = vabal_s16(accum, vget_high_s16(src8), zero);
-    length -= 16;
-    coeff += 16;
-  } while (length != 0);
-
-  {
-    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int satd = vget_lane_s32(s1, 0);
-    return satd;
-  }
-}
-
-void aom_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, int ref_stride,
-                          int height) {
-  int i;
-  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
-  const int shift_factor = ((height >> 5) + 3) * -1;
-  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
-
-  for (i = 0; i < height; i += 8) {
-    const uint8x16_t vec_row1 = vld1q_u8(ref);
-    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
-    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
-    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
-    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
-    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
-    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
-    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
-
-    ref += ref_stride * 8;
-  }
-
-  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
-  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
-
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
-  hbuf += 8;
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
-}
-
-int16_t aom_int_pro_col_neon(uint8_t const *ref, const int width) {
-  int i;
-  uint16x8_t vec_sum = vdupq_n_u16(0);
-
-  for (i = 0; i < width; i += 16) {
-    const uint8x16_t vec_row = vld1q_u8(ref);
-    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
-    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
-    ref += 16;
-  }
-
-  return horizontal_add_u16x8(vec_sum);
-}
-
-// ref, src = [0, 510] - max diff = 16-bits
-// bwl = {2, 3, 4}, width = {16, 32, 64}
-int aom_vector_var_neon(int16_t const *ref, int16_t const *src, int bwl) {
-  int width = 4 << bwl;
-  int32x4_t sse = vdupq_n_s32(0);
-  int16x8_t total = vdupq_n_s16(0);
-
-  assert(width >= 8);
-  assert((width % 8) == 0);
-
-  do {
-    const int16x8_t r = vld1q_s16(ref);
-    const int16x8_t s = vld1q_s16(src);
-    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
-    const int16x4_t diff_lo = vget_low_s16(diff);
-    const int16x4_t diff_hi = vget_high_s16(diff);
-    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
-    sse = vmlal_s16(sse, diff_hi, diff_hi);
-    total = vaddq_s16(total, diff);  // dynamic range 16 bits.
-
-    ref += 8;
-    src += 8;
-    width -= 8;
-  } while (width != 0);
-
-  {
-    // Note: 'total''s pairwise addition could be implemented similarly to
-    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
-    // with the summation of 'sse' performed better on a Cortex-A15.
-    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
-    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
-    const int32x2_t t2 = vpadd_s32(t1, t1);
-    const int t = vget_lane_s32(t2, 0);
-    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int s = vget_lane_s32(s1, 0);
-    const int shift_factor = bwl + 2;
-    return s - ((t * t) >> shift_factor);
-  }
-}
-
-void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
-                         int b_stride, int *min, int *max) {
-  // Load and concatenate.
-  const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
-  const uint8x16_t a23 =
-      vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
-  const uint8x16_t a45 =
-      vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
-  const uint8x16_t a67 =
-      vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
-
-  const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
-  const uint8x16_t b23 =
-      vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
-  const uint8x16_t b45 =
-      vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
-  const uint8x16_t b67 =
-      vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
-
-  // Absolute difference.
-  const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
-  const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
-  const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
-  const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
-
-  // Max values between the Q vectors.
-  const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
-  const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
-  const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
-  const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
-
-  const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
-  const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
-
-  // Split to D and start doing pairwise.
-  uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
-  uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
-
-  // Enough runs of vpmax/min propogate the max/min values to every position.
-  ab_max = vpmax_u8(ab_max, ab_max);
-  ab_min = vpmin_u8(ab_min, ab_min);
-
-  ab_max = vpmax_u8(ab_max, ab_max);
-  ab_min = vpmin_u8(ab_min, ab_min);
-
-  ab_max = vpmax_u8(ab_max, ab_max);
-  ab_min = vpmin_u8(ab_min, ab_min);
-
-  *min = *max = 0;  // Clear high bits
-  // Store directly to avoid costly neon->gpr transfer.
-  vst1_lane_u8((uint8_t *)max, ab_max, 0);
-  vst1_lane_u8((uint8_t *)min, ab_min, 0);
-}
diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
new file mode 100644
index 000000000..82c0b0e28
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
+                            const int16x8_t v_maxval, int16x8_t *res) {
+  int32x4_t im_res_low, im_res_high;
+  const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);
+
+  im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
+  im_res_low =
+      vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));
+
+  im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
+  im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
+                          vget_high_s16(src_1));
+
+  *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
+                      vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
+}
+
+static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
+                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+                             int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
+                             int16x8_t mask3, const int16x8_t v_maxval,
+                             const uint16x8_t vec_round_offset,
+                             const int16x8_t vec_round_bits) {
+  int16x8_t src0_0, src0_1, src0_2, src0_3;
+  int16x8_t src1_0, src1_1, src1_2, src1_3;
+  int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;
+
+  load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2,
+               &src0_3);
+  load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2,
+               &src1_3);
+
+  blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
+  blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
+  blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
+  blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);
+
+  uint16x8_t im_res1_0 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
+  uint16x8_t im_res1_1 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
+  uint16x8_t im_res1_2 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
+  uint16x8_t im_res1_3 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);
+
+  im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
+  im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
+  im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
+  im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);
+
+  vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
+  vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
+  vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
+  vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
+}
+
+static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
+                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+                             int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
+                             int16x4_t mask3, const int16x8_t v_maxval,
+                             const uint16x8_t vec_round_offset,
+                             const int16x8_t vec_round_bits) {
+  int16x8_t src0_0, src0_1;
+  int16x8_t src1_0, src1_1;
+  uint64x2_t tu0, tu1, tu2, tu3;
+  int16x8_t mask0_1, mask2_3;
+  int16x8_t res0, res1;
+
+  load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
+  load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
+
+  src0_0 = vreinterpretq_s16_u64(tu0);
+  src0_1 = vreinterpretq_s16_u64(tu1);
+
+  src1_0 = vreinterpretq_s16_u64(tu2);
+  src1_1 = vreinterpretq_s16_u64(tu3);
+
+  mask0_1 = vcombine_s16(mask0, mask1);
+  mask2_3 = vcombine_s16(mask2, mask3);
+
+  blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0);
+  blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1);
+
+  uint16x8_t im_res_0 =
+      vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset);
+  uint16x8_t im_res_1 =
+      vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset);
+
+  src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits);
+  src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits);
+
+  uint8x8_t res_0 = vqmovun_s16(src0_0);
+  uint8x8_t res_1 = vqmovun_s16(src0_1);
+
+  vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0),
+                0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0),
+                1);
+  vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1),
+                0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1),
+                1);
+}
+
+void aom_lowbd_blend_a64_d16_mask_neon(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params) {
+  int i = 0;
+  const int bd = 8;
+  int w_tmp = w;
+  const uint8_t *mask_tmp = mask;
+  const CONV_BUF_TYPE *src0_tmp = src0;
+  const CONV_BUF_TYPE *src1_tmp = src1;
+  uint8_t *dst_tmp = dst;
+
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  uint8x8_t s0, s1, s2, s3;
+  uint32x2_t tu0, tu1, tu2, tu3;
+  uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
+  int16x8_t mask0, mask1, mask2, mask3;
+  int16x8_t mask4, mask5, mask6, mask7;
+  int32x4_t m0_32, m1_32, m2_32, m3_32;
+  int32x4_t m4_32, m5_32, m6_32, m7_32;
+  uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l;
+  uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l;
+  int16x4_t mask0_low, mask1_low, mask2_low, mask3_low;
+  const uint16x4_t vec_zero = vdup_n_u16(0);
+  const uint16_t offset = round_offset - (1 << (round_bits - 1));
+  const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA);
+  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+  const uint16x8_t vec_offset = vdupq_n_u16(offset);
+
+  if (subw == 0 && subh == 0) {
+    if (w_tmp > 7) {
+      do {
+        w_tmp = w;
+        do {
+          load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3);
+
+          mask0 = vmovl_s8(vreinterpret_s8_u8(s0));
+          mask1 = vmovl_s8(vreinterpret_s8_u8(s1));
+          mask2 = vmovl_s8(vreinterpret_s8_u8(s2));
+          mask3 = vmovl_s8(vreinterpret_s8_u8(s3));
+
+          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+                    vec_offset, vec_round_bits);
+
+          w_tmp -= 8;
+          mask_tmp += 8;
+          dst_tmp += 8;
+          src0_tmp += 8;
+          src1_tmp += 8;
+        } while (w_tmp > 7);
+        i += 4;
+        mask_tmp += (4 * mask_stride) - w;
+        dst_tmp += (4 * dst_stride) - w;
+        src0_tmp += (4 * src0_stride) - w;
+        src1_tmp += (4 * src1_stride) - w;
+      } while (i < h);
+    } else {
+      do {
+        load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1);
+
+        mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+        mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+
+        mask0_low = vget_low_s16(mask0);
+        mask1_low = vget_high_s16(mask0);
+        mask2_low = vget_low_s16(mask1);
+        mask3_low = vget_high_s16(mask1);
+
+        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+                  v_maxval, vec_offset, vec_round_bits);
+
+        i += 4;
+        mask_tmp += (4 * mask_stride);
+        dst_tmp += (4 * dst_stride);
+        src0_tmp += (4 * src0_stride);
+        src1_tmp += (4 * src1_stride);
+      } while (i < h);
+    }
+  } else if (subw == 1 && subh == 1) {
+    if (w_tmp > 7) {
+      do {
+        w_tmp = w;
+        do {
+          load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+                       &t7);
+
+          mask0 =
+              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1)));
+          mask1 =
+              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3)));
+          mask2 =
+              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5)));
+          mask3 =
+              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7)));
+
+          mask4 = vreinterpretq_s16_u16(
+              vaddl_u8(vget_high_u8(t0), vget_high_u8(t1)));
+          mask5 = vreinterpretq_s16_u16(
+              vaddl_u8(vget_high_u8(t2), vget_high_u8(t3)));
+          mask6 = vreinterpretq_s16_u16(
+              vaddl_u8(vget_high_u8(t4), vget_high_u8(t5)));
+          mask7 = vreinterpretq_s16_u16(
+              vaddl_u8(vget_high_u8(t6), vget_high_u8(t7)));
+
+          m0_32 = vpaddlq_s16(mask0);
+          m1_32 = vpaddlq_s16(mask1);
+          m2_32 = vpaddlq_s16(mask2);
+          m3_32 = vpaddlq_s16(mask3);
+
+          m4_32 = vpaddlq_s16(mask4);
+          m5_32 = vpaddlq_s16(mask5);
+          m6_32 = vpaddlq_s16(mask6);
+          m7_32 = vpaddlq_s16(mask7);
+
+          mask0 =
+              vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2));
+          mask1 =
+              vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2));
+          mask2 =
+              vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2));
+          mask3 =
+              vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2));
+
+          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+                    vec_offset, vec_round_bits);
+
+          w_tmp -= 8;
+          mask_tmp += 16;
+          dst_tmp += 8;
+          src0_tmp += 8;
+          src1_tmp += 8;
+        } while (w_tmp > 7);
+        i += 4;
+        mask_tmp += (8 * mask_stride) - (2 * w);
+        dst_tmp += (4 * dst_stride) - w;
+        src0_tmp += (4 * src0_stride) - w;
+        src1_tmp += (4 * src1_stride) - w;
+      } while (i < h);
+    } else {
+      do {
+        load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
+                    &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+
+        mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
+        mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
+        mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
+        mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+
+        m0_32 = vpaddlq_s16(mask0);
+        m1_32 = vpaddlq_s16(mask1);
+        m2_32 = vpaddlq_s16(mask2);
+        m3_32 = vpaddlq_s16(mask3);
+
+        mask0_low = vqrshrn_n_s32(m0_32, 2);
+        mask1_low = vqrshrn_n_s32(m1_32, 2);
+        mask2_low = vqrshrn_n_s32(m2_32, 2);
+        mask3_low = vqrshrn_n_s32(m3_32, 2);
+
+        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+                  v_maxval, vec_offset, vec_round_bits);
+
+        i += 4;
+        mask_tmp += (8 * mask_stride);
+        dst_tmp += (4 * dst_stride);
+        src0_tmp += (4 * src0_stride);
+        src1_tmp += (4 * src1_stride);
+      } while (i < h);
+    }
+  } else if (subw == 1 && subh == 0) {
+    if (w_tmp > 7) {
+      do {
+        w_tmp = w;
+        do {
+          load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3);
+
+          mask0 = vreinterpretq_s16_u16(vcombine_u16(
+              vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0))));
+          mask1 = vreinterpretq_s16_u16(vcombine_u16(
+              vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1))));
+          mask2 = vreinterpretq_s16_u16(vcombine_u16(
+              vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2))));
+          mask3 = vreinterpretq_s16_u16(vcombine_u16(
+              vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3))));
+
+          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
+          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
+          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+
+          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+                    vec_offset, vec_round_bits);
+          w_tmp -= 8;
+          mask_tmp += 16;
+          dst_tmp += 8;
+          src0_tmp += 8;
+          src1_tmp += 8;
+        } while (w_tmp > 7);
+        i += 4;
+        mask_tmp += (4 * mask_stride) - (2 * w);
+        dst_tmp += (4 * dst_stride) - w;
+        src0_tmp += (4 * src0_stride) - w;
+        src1_tmp += (4 * src1_stride) - w;
+      } while (i < h);
+    } else {
+      do {
+        load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
+                    &mask3_l);
+
+        mask0 =
+            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero));
+        mask1 =
+            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero));
+        mask2 =
+            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero));
+        mask3 =
+            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero));
+
+        mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1)));
+        mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1)));
+        mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1)));
+        mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1)));
+
+        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+                  v_maxval, vec_offset, vec_round_bits);
+
+        i += 4;
+        mask_tmp += (4 * mask_stride);
+        dst_tmp += (4 * dst_stride);
+        src0_tmp += (4 * src0_stride);
+        src1_tmp += (4 * src1_stride);
+      } while (i < h);
+    }
+  } else {
+    if (w_tmp > 7) {
+      do {
+        w_tmp = w;
+        do {
+          load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
+                      &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+
+          mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
+          mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
+          mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
+          mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+
+          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
+          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
+          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+
+          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+                    vec_offset, vec_round_bits);
+
+          w_tmp -= 8;
+          mask_tmp += 8;
+          dst_tmp += 8;
+          src0_tmp += 8;
+          src1_tmp += 8;
+        } while (w_tmp > 7);
+        i += 4;
+        mask_tmp += (8 * mask_stride) - w;
+        dst_tmp += (4 * dst_stride) - w;
+        src0_tmp += (4 * src0_stride) - w;
+        src1_tmp += (4 * src1_stride) - w;
+      } while (i < h);
+    } else {
+      do {
+        load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1);
+        load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2,
+                              &tu3);
+
+        s0 = vreinterpret_u8_u32(tu0);
+        s1 = vreinterpret_u8_u32(tu1);
+        s2 = vreinterpret_u8_u32(tu2);
+        s3 = vreinterpret_u8_u32(tu3);
+
+        mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
+        mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
+
+        mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
+        mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+
+        mask0_low = vget_low_s16(mask0);
+        mask1_low = vget_high_s16(mask0);
+        mask2_low = vget_low_s16(mask1);
+        mask3_low = vget_high_s16(mask1);
+
+        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+                  v_maxval, vec_offset, vec_round_bits);
+
+        i += 4;
+        mask_tmp += (8 * mask_stride);
+        dst_tmp += (4 * dst_stride);
+        src0_tmp += (4 * src0_stride);
+        src1_tmp += (4 * src1_stride);
+      } while (i < h);
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
index 1cf8a3a6e..e4300c992 100644
--- a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
+++ b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
@@ -11,7 +11,8 @@
 
 #include <arm_neon.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_dsp/txfm_common.h"
 
 void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
diff --git a/third_party/aom/aom_dsp/arm/hadamard_neon.c b/third_party/aom/aom_dsp/arm/hadamard_neon.c
deleted file mode 100644
index 9baefae47..000000000
--- a/third_party/aom/aom_dsp/arm/hadamard_neon.c
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-
-static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
-                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
-                                 int16x8_t *a6, int16x8_t *a7) {
-  const int16x8_t b0 = vaddq_s16(*a0, *a1);
-  const int16x8_t b1 = vsubq_s16(*a0, *a1);
-  const int16x8_t b2 = vaddq_s16(*a2, *a3);
-  const int16x8_t b3 = vsubq_s16(*a2, *a3);
-  const int16x8_t b4 = vaddq_s16(*a4, *a5);
-  const int16x8_t b5 = vsubq_s16(*a4, *a5);
-  const int16x8_t b6 = vaddq_s16(*a6, *a7);
-  const int16x8_t b7 = vsubq_s16(*a6, *a7);
-
-  const int16x8_t c0 = vaddq_s16(b0, b2);
-  const int16x8_t c1 = vaddq_s16(b1, b3);
-  const int16x8_t c2 = vsubq_s16(b0, b2);
-  const int16x8_t c3 = vsubq_s16(b1, b3);
-  const int16x8_t c4 = vaddq_s16(b4, b6);
-  const int16x8_t c5 = vaddq_s16(b5, b7);
-  const int16x8_t c6 = vsubq_s16(b4, b6);
-  const int16x8_t c7 = vsubq_s16(b5, b7);
-
-  *a0 = vaddq_s16(c0, c4);
-  *a1 = vsubq_s16(c2, c6);
-  *a2 = vsubq_s16(c0, c4);
-  *a3 = vaddq_s16(c2, c6);
-  *a4 = vaddq_s16(c3, c7);
-  *a5 = vsubq_s16(c3, c7);
-  *a6 = vsubq_s16(c1, c5);
-  *a7 = vaddq_s16(c1, c5);
-}
-
-// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
-// reversing transpose order which may make it easier for the compiler to
-// reconcile the vtrn.64 moves.
-static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
-                         int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
-                         int16x8_t *a6, int16x8_t *a7) {
-  // Swap 64 bit elements. Goes from:
-  // a0: 00 01 02 03 04 05 06 07
-  // a1: 08 09 10 11 12 13 14 15
-  // a2: 16 17 18 19 20 21 22 23
-  // a3: 24 25 26 27 28 29 30 31
-  // a4: 32 33 34 35 36 37 38 39
-  // a5: 40 41 42 43 44 45 46 47
-  // a6: 48 49 50 51 52 53 54 55
-  // a7: 56 57 58 59 60 61 62 63
-  // to:
-  // a04_lo: 00 01 02 03 32 33 34 35
-  // a15_lo: 08 09 10 11 40 41 42 43
-  // a26_lo: 16 17 18 19 48 49 50 51
-  // a37_lo: 24 25 26 27 56 57 58 59
-  // a04_hi: 04 05 06 07 36 37 38 39
-  // a15_hi: 12 13 14 15 44 45 46 47
-  // a26_hi: 20 21 22 23 52 53 54 55
-  // a37_hi: 28 29 30 31 60 61 62 63
-  const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4));
-  const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5));
-  const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6));
-  const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7));
-  const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4));
-  const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5));
-  const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6));
-  const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7));
-
-  // Swap 32 bit elements resulting in:
-  // a0246_lo:
-  // 00 01 16 17 32 33 48 49
-  // 02 03 18 19 34 35 50 51
-  // a1357_lo:
-  // 08 09 24 25 40 41 56 57
-  // 10 11 26 27 42 43 58 59
-  // a0246_hi:
-  // 04 05 20 21 36 37 52 53
-  // 06 07 22 23 38 39 54 55
-  // a1657_hi:
-  // 12 13 28 29 44 45 60 61
-  // 14 15 30 31 46 47 62 63
-  const int32x4x2_t a0246_lo =
-      vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
-  const int32x4x2_t a1357_lo =
-      vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
-  const int32x4x2_t a0246_hi =
-      vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
-  const int32x4x2_t a1357_hi =
-      vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
-
-  // Swap 16 bit elements resulting in:
-  // b0:
-  // 00 08 16 24 32 40 48 56
-  // 01 09 17 25 33 41 49 57
-  // b1:
-  // 02 10 18 26 34 42 50 58
-  // 03 11 19 27 35 43 51 59
-  // b2:
-  // 04 12 20 28 36 44 52 60
-  // 05 13 21 29 37 45 53 61
-  // b3:
-  // 06 14 22 30 38 46 54 62
-  // 07 15 23 31 39 47 55 63
-  const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]),
-                                   vreinterpretq_s16_s32(a1357_lo.val[0]));
-  const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]),
-                                   vreinterpretq_s16_s32(a1357_lo.val[1]));
-  const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]),
-                                   vreinterpretq_s16_s32(a1357_hi.val[0]));
-  const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]),
-                                   vreinterpretq_s16_s32(a1357_hi.val[1]));
-
-  *a0 = b0.val[0];
-  *a1 = b0.val[1];
-  *a2 = b1.val[0];
-  *a3 = b1.val[1];
-  *a4 = b2.val[0];
-  *a5 = b2.val[1];
-  *a6 = b3.val[0];
-  *a7 = b3.val[1];
-}
-
-void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
-                           int16_t *coeff) {
-  int16x8_t a0 = vld1q_s16(src_diff);
-  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
-  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
-  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
-  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
-  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
-  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
-  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
-
-  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  // Skip the second transpose because it is not required.
-
-  vst1q_s16(coeff + 0, a0);
-  vst1q_s16(coeff + 8, a1);
-  vst1q_s16(coeff + 16, a2);
-  vst1q_s16(coeff + 24, a3);
-  vst1q_s16(coeff + 32, a4);
-  vst1q_s16(coeff + 40, a5);
-  vst1q_s16(coeff + 48, a6);
-  vst1q_s16(coeff + 56, a7);
-}
-
-void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
-                             int16_t *coeff) {
-  int i;
-
-  /* Rearrange 16x16 to 8x32 and remove stride.
-   * Top left first. */
-  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
-  /* Top right. */
-  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
-  /* Bottom left. */
-  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
-  /* Bottom right. */
-  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
-
-  for (i = 0; i < 64; i += 8) {
-    const int16x8_t a0 = vld1q_s16(coeff + 0);
-    const int16x8_t a1 = vld1q_s16(coeff + 64);
-    const int16x8_t a2 = vld1q_s16(coeff + 128);
-    const int16x8_t a3 = vld1q_s16(coeff + 192);
-
-    const int16x8_t b0 = vhaddq_s16(a0, a1);
-    const int16x8_t b1 = vhsubq_s16(a0, a1);
-    const int16x8_t b2 = vhaddq_s16(a2, a3);
-    const int16x8_t b3 = vhsubq_s16(a2, a3);
-
-    const int16x8_t c0 = vaddq_s16(b0, b2);
-    const int16x8_t c1 = vaddq_s16(b1, b3);
-    const int16x8_t c2 = vsubq_s16(b0, b2);
-    const int16x8_t c3 = vsubq_s16(b1, b3);
-
-    vst1q_s16(coeff + 0, c0);
-    vst1q_s16(coeff + 64, c1);
-    vst1q_s16(coeff + 128, c2);
-    vst1q_s16(coeff + 192, c3);
-
-    coeff += 8;
-  }
-}
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c
deleted file mode 100644
index 196b2a890..000000000
--- a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "aom_dsp/inv_txfm.h"
-#include "aom_ports/mem.h"
-
-void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8x8_t d2u8, d3u8, d30u8, d31u8;
-  uint64x1_t d2u64, d3u64, d4u64, d5u64;
-  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
-  int16x8_t q0s16;
-  uint8_t *d1, *d2;
-  int16_t i, j, a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-
-  q0s16 = vdupq_n_s16(a1);
-  q0u16 = vreinterpretq_u16_s16(q0s16);
-
-  for (d1 = d2 = dest, i = 0; i < 4; i++) {
-    for (j = 0; j < 2; j++) {
-      d2u64 = vld1_u64((const uint64_t *)d1);
-      d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
-      d1 += dest_stride;
-      d4u64 = vld1_u64((const uint64_t *)d1);
-      d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
-      d1 += dest_stride;
-
-      q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
-      q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
-      q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
-      q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
-      d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-      d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-      d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-      d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
-      vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-      vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
-      d2 += dest_stride;
-      vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
-      vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
-      d2 += dest_stride;
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm
deleted file mode 100644
index d01c4bc03..000000000
--- a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm
+++ /dev/null
@@ -1,201 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-
-
-    EXPORT  |aom_idct16x16_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
-;                                    int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct16x16_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 6)
-    add              r0, r0, #32               ; + (1 <<((6) - 1))
-    asr              r0, r0, #6                ; >> 6
-
-    vdup.s16         q0, r0                    ; duplicate a1
-    mov              r0, #8
-    sub              r2, #8
-
-    ; load destination data row0 - row3
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row4 - row7
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row8 - row11
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row12 - row15
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    bx               lr
-    ENDP             ; |aom_idct16x16_1_add_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c
deleted file mode 100644
index b4cb7a0cd..000000000
--- a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c
+++ /dev/null
@@ -1,1295 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-
-static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
-                                int16x8_t *q10s16, int16x8_t *q11s16,
-                                int16x8_t *q12s16, int16x8_t *q13s16,
-                                int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
-  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
-  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
-  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
-  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
-  *q12s16 = vcombine_s16(d17s16, d25s16);
-  *q13s16 = vcombine_s16(d19s16, d27s16);
-  *q14s16 = vcombine_s16(d21s16, d29s16);
-  *q15s16 = vcombine_s16(d23s16, d31s16);
-
-  q0x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
-  q1x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
-  q2x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
-  q3x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
-
-  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
-                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
-  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
-                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
-  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
-                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
-  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
-                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
-
-  *q8s16 = q0x2s16.val[0];
-  *q9s16 = q0x2s16.val[1];
-  *q10s16 = q1x2s16.val[0];
-  *q11s16 = q1x2s16.val[1];
-  *q12s16 = q2x2s16.val[0];
-  *q13s16 = q2x2s16.val[1];
-  *q14s16 = q3x2s16.val[0];
-  *q15s16 = q3x2s16.val[1];
-  return;
-}
-
-void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
-                                      int output_stride) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
-  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-  int16x8x2_t q0x2s16;
-
-  q0x2s16 = vld2q_s16(in);
-  q8s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q9s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q10s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q11s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q12s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q13s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q14s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q15s16 = q0x2s16.val[0];
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  d16s16 = vget_low_s16(q8s16);
-  d17s16 = vget_high_s16(q8s16);
-  d18s16 = vget_low_s16(q9s16);
-  d19s16 = vget_high_s16(q9s16);
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-  d30s16 = vget_low_s16(q15s16);
-  d31s16 = vget_high_s16(q15s16);
-
-  // stage 3
-  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
-  d1s16 = vdup_n_s16((int16_t)cospi_4_64);
-
-  q2s32 = vmull_s16(d18s16, d0s16);
-  q3s32 = vmull_s16(d19s16, d0s16);
-  q5s32 = vmull_s16(d18s16, d1s16);
-  q6s32 = vmull_s16(d19s16, d1s16);
-
-  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
-  q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
-  q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
-
-  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
-  d3s16 = vdup_n_s16((int16_t)cospi_20_64);
-
-  d8s16 = vqrshrn_n_s32(q2s32, 14);
-  d9s16 = vqrshrn_n_s32(q3s32, 14);
-  d14s16 = vqrshrn_n_s32(q5s32, 14);
-  d15s16 = vqrshrn_n_s32(q6s32, 14);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-  q7s16 = vcombine_s16(d14s16, d15s16);
-
-  q2s32 = vmull_s16(d26s16, d2s16);
-  q3s32 = vmull_s16(d27s16, d2s16);
-  q9s32 = vmull_s16(d26s16, d3s16);
-  q15s32 = vmull_s16(d27s16, d3s16);
-
-  q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
-  q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
-  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
-  q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
-
-  d10s16 = vqrshrn_n_s32(q2s32, 14);
-  d11s16 = vqrshrn_n_s32(q3s32, 14);
-  d12s16 = vqrshrn_n_s32(q9s32, 14);
-  d13s16 = vqrshrn_n_s32(q15s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  // stage 4
-  d30s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q2s32 = vmull_s16(d16s16, d30s16);
-  q11s32 = vmull_s16(d17s16, d30s16);
-  q0s32 = vmull_s16(d24s16, d30s16);
-  q1s32 = vmull_s16(d25s16, d30s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_24_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_8_64);
-
-  q3s32 = vaddq_s32(q2s32, q0s32);
-  q12s32 = vaddq_s32(q11s32, q1s32);
-  q13s32 = vsubq_s32(q2s32, q0s32);
-  q1s32 = vsubq_s32(q11s32, q1s32);
-
-  d16s16 = vqrshrn_n_s32(q3s32, 14);
-  d17s16 = vqrshrn_n_s32(q12s32, 14);
-  d18s16 = vqrshrn_n_s32(q13s32, 14);
-  d19s16 = vqrshrn_n_s32(q1s32, 14);
-  q8s16 = vcombine_s16(d16s16, d17s16);
-  q9s16 = vcombine_s16(d18s16, d19s16);
-
-  q0s32 = vmull_s16(d20s16, d31s16);
-  q1s32 = vmull_s16(d21s16, d31s16);
-  q12s32 = vmull_s16(d20s16, d30s16);
-  q13s32 = vmull_s16(d21s16, d30s16);
-
-  q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
-  q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
-  q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
-  q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
-
-  d22s16 = vqrshrn_n_s32(q0s32, 14);
-  d23s16 = vqrshrn_n_s32(q1s32, 14);
-  d20s16 = vqrshrn_n_s32(q12s32, 14);
-  d21s16 = vqrshrn_n_s32(q13s32, 14);
-  q10s16 = vcombine_s16(d20s16, d21s16);
-  q11s16 = vcombine_s16(d22s16, d23s16);
-
-  q13s16 = vsubq_s16(q4s16, q5s16);
-  q4s16 = vaddq_s16(q4s16, q5s16);
-  q14s16 = vsubq_s16(q7s16, q6s16);
-  q15s16 = vaddq_s16(q6s16, q7s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-
-  // stage 5
-  q0s16 = vaddq_s16(q8s16, q11s16);
-  q1s16 = vaddq_s16(q9s16, q10s16);
-  q2s16 = vsubq_s16(q9s16, q10s16);
-  q3s16 = vsubq_s16(q8s16, q11s16);
-
-  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q11s32 = vmull_s16(d26s16, d16s16);
-  q12s32 = vmull_s16(d27s16, d16s16);
-  q9s32 = vmull_s16(d28s16, d16s16);
-  q10s32 = vmull_s16(d29s16, d16s16);
-
-  q6s32 = vsubq_s32(q9s32, q11s32);
-  q13s32 = vsubq_s32(q10s32, q12s32);
-  q9s32 = vaddq_s32(q9s32, q11s32);
-  q10s32 = vaddq_s32(q10s32, q12s32);
-
-  d10s16 = vqrshrn_n_s32(q6s32, 14);
-  d11s16 = vqrshrn_n_s32(q13s32, 14);
-  d12s16 = vqrshrn_n_s32(q9s32, 14);
-  d13s16 = vqrshrn_n_s32(q10s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  // stage 6
-  q8s16 = vaddq_s16(q0s16, q15s16);
-  q9s16 = vaddq_s16(q1s16, q6s16);
-  q10s16 = vaddq_s16(q2s16, q5s16);
-  q11s16 = vaddq_s16(q3s16, q4s16);
-  q12s16 = vsubq_s16(q3s16, q4s16);
-  q13s16 = vsubq_s16(q2s16, q5s16);
-  q14s16 = vsubq_s16(q1s16, q6s16);
-  q15s16 = vsubq_s16(q0s16, q15s16);
-
-  d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
-  d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
-  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
-  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
-  d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
-  d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
-  d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
-  d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
-  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
-  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
-  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
-  // store the data
-  output_stride >>= 1;  // output_stride / 2, out is int16_t
-  vst1_u64((uint64_t *)out, d16u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d17u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d18u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d19u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d20u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d21u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d22u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d23u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d24u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d28u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d29u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d30u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d31u64);
-  return;
-}
-
-void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
-                                      int16_t *pass1Output, int16_t skip_adding,
-                                      uint8_t *dest, int dest_stride) {
-  uint8_t *d;
-  uint8x8_t d12u8, d13u8;
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  uint64x1_t d24u64, d25u64, d26u64, d27u64;
-  int64x1_t d12s64, d13s64;
-  uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
-  uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32;
-  int16x8x2_t q0x2s16;
-
-  q0x2s16 = vld2q_s16(src);
-  q8s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q9s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q10s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q11s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q12s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q13s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q14s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q15s16 = q0x2s16.val[0];
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  d16s16 = vget_low_s16(q8s16);
-  d17s16 = vget_high_s16(q8s16);
-  d18s16 = vget_low_s16(q9s16);
-  d19s16 = vget_high_s16(q9s16);
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-  d30s16 = vget_low_s16(q15s16);
-  d31s16 = vget_high_s16(q15s16);
-
-  // stage 3
-  d12s16 = vdup_n_s16((int16_t)cospi_30_64);
-  d13s16 = vdup_n_s16((int16_t)cospi_2_64);
-
-  q2s32 = vmull_s16(d16s16, d12s16);
-  q3s32 = vmull_s16(d17s16, d12s16);
-  q1s32 = vmull_s16(d16s16, d13s16);
-  q4s32 = vmull_s16(d17s16, d13s16);
-
-  q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
-  q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
-  q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
-  q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
-
-  d0s16 = vqrshrn_n_s32(q2s32, 14);
-  d1s16 = vqrshrn_n_s32(q3s32, 14);
-  d14s16 = vqrshrn_n_s32(q1s32, 14);
-  d15s16 = vqrshrn_n_s32(q4s32, 14);
-  q0s16 = vcombine_s16(d0s16, d1s16);
-  q7s16 = vcombine_s16(d14s16, d15s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_14_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_18_64);
-
-  q2s32 = vmull_s16(d24s16, d30s16);
-  q3s32 = vmull_s16(d25s16, d30s16);
-  q4s32 = vmull_s16(d24s16, d31s16);
-  q5s32 = vmull_s16(d25s16, d31s16);
-
-  q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
-  q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
-  q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
-  q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
-
-  d2s16 = vqrshrn_n_s32(q2s32, 14);
-  d3s16 = vqrshrn_n_s32(q3s32, 14);
-  d12s16 = vqrshrn_n_s32(q4s32, 14);
-  d13s16 = vqrshrn_n_s32(q5s32, 14);
-  q1s16 = vcombine_s16(d2s16, d3s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_22_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_10_64);
-
-  q11s32 = vmull_s16(d20s16, d30s16);
-  q12s32 = vmull_s16(d21s16, d30s16);
-  q4s32 = vmull_s16(d20s16, d31s16);
-  q5s32 = vmull_s16(d21s16, d31s16);
-
-  q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
-  q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
-  q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
-  q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
-
-  d4s16 = vqrshrn_n_s32(q11s32, 14);
-  d5s16 = vqrshrn_n_s32(q12s32, 14);
-  d11s16 = vqrshrn_n_s32(q5s32, 14);
-  d10s16 = vqrshrn_n_s32(q4s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_6_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_26_64);
-
-  q10s32 = vmull_s16(d28s16, d30s16);
-  q11s32 = vmull_s16(d29s16, d30s16);
-  q12s32 = vmull_s16(d28s16, d31s16);
-  q13s32 = vmull_s16(d29s16, d31s16);
-
-  q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
-  q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
-  q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
-  q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
-
-  d6s16 = vqrshrn_n_s32(q10s32, 14);
-  d7s16 = vqrshrn_n_s32(q11s32, 14);
-  d8s16 = vqrshrn_n_s32(q12s32, 14);
-  d9s16 = vqrshrn_n_s32(q13s32, 14);
-  q3s16 = vcombine_s16(d6s16, d7s16);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-
-  // stage 3
-  q9s16 = vsubq_s16(q0s16, q1s16);
-  q0s16 = vaddq_s16(q0s16, q1s16);
-  q10s16 = vsubq_s16(q3s16, q2s16);
-  q11s16 = vaddq_s16(q2s16, q3s16);
-  q12s16 = vaddq_s16(q4s16, q5s16);
-  q13s16 = vsubq_s16(q4s16, q5s16);
-  q14s16 = vsubq_s16(q7s16, q6s16);
-  q7s16 = vaddq_s16(q6s16, q7s16);
-
-  // stage 4
-  d18s16 = vget_low_s16(q9s16);
-  d19s16 = vget_high_s16(q9s16);
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_24_64);
-
-  q2s32 = vmull_s16(d18s16, d31s16);
-  q3s32 = vmull_s16(d19s16, d31s16);
-  q4s32 = vmull_s16(d28s16, d31s16);
-  q5s32 = vmull_s16(d29s16, d31s16);
-
-  q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
-  q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
-  q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
-  q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
-
-  d12s16 = vqrshrn_n_s32(q2s32, 14);
-  d13s16 = vqrshrn_n_s32(q3s32, 14);
-  d2s16 = vqrshrn_n_s32(q4s32, 14);
-  d3s16 = vqrshrn_n_s32(q5s32, 14);
-  q1s16 = vcombine_s16(d2s16, d3s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  q3s16 = q11s16;
-  q4s16 = q12s16;
-
-  d30s16 = vdup_n_s16(-cospi_8_64);
-  q11s32 = vmull_s16(d26s16, d30s16);
-  q12s32 = vmull_s16(d27s16, d30s16);
-  q8s32 = vmull_s16(d20s16, d30s16);
-  q9s32 = vmull_s16(d21s16, d30s16);
-
-  q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
-  q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
-  q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
-  q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
-
-  d4s16 = vqrshrn_n_s32(q11s32, 14);
-  d5s16 = vqrshrn_n_s32(q12s32, 14);
-  d10s16 = vqrshrn_n_s32(q8s32, 14);
-  d11s16 = vqrshrn_n_s32(q9s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  // stage 5
-  q8s16 = vaddq_s16(q0s16, q3s16);
-  q9s16 = vaddq_s16(q1s16, q2s16);
-  q10s16 = vsubq_s16(q1s16, q2s16);
-  q11s16 = vsubq_s16(q0s16, q3s16);
-  q12s16 = vsubq_s16(q7s16, q4s16);
-  q13s16 = vsubq_s16(q6s16, q5s16);
-  q14s16 = vaddq_s16(q6s16, q5s16);
-  q15s16 = vaddq_s16(q7s16, q4s16);
-
-  // stage 6
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-
-  d14s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q3s32 = vmull_s16(d26s16, d14s16);
-  q4s32 = vmull_s16(d27s16, d14s16);
-  q0s32 = vmull_s16(d20s16, d14s16);
-  q1s32 = vmull_s16(d21s16, d14s16);
-
-  q5s32 = vsubq_s32(q3s32, q0s32);
-  q6s32 = vsubq_s32(q4s32, q1s32);
-  q10s32 = vaddq_s32(q3s32, q0s32);
-  q4s32 = vaddq_s32(q4s32, q1s32);
-
-  d4s16 = vqrshrn_n_s32(q5s32, 14);
-  d5s16 = vqrshrn_n_s32(q6s32, 14);
-  d10s16 = vqrshrn_n_s32(q10s32, 14);
-  d11s16 = vqrshrn_n_s32(q4s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  q0s32 = vmull_s16(d22s16, d14s16);
-  q1s32 = vmull_s16(d23s16, d14s16);
-  q13s32 = vmull_s16(d24s16, d14s16);
-  q6s32 = vmull_s16(d25s16, d14s16);
-
-  q10s32 = vsubq_s32(q13s32, q0s32);
-  q4s32 = vsubq_s32(q6s32, q1s32);
-  q13s32 = vaddq_s32(q13s32, q0s32);
-  q6s32 = vaddq_s32(q6s32, q1s32);
-
-  d6s16 = vqrshrn_n_s32(q10s32, 14);
-  d7s16 = vqrshrn_n_s32(q4s32, 14);
-  d8s16 = vqrshrn_n_s32(q13s32, 14);
-  d9s16 = vqrshrn_n_s32(q6s32, 14);
-  q3s16 = vcombine_s16(d6s16, d7s16);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-
-  // stage 7
-  if (skip_adding != 0) {
-    d = dest;
-    // load the data in pass1
-    q0s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q1s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-
-    q12s16 = vaddq_s16(q0s16, q15s16);
-    q13s16 = vaddq_s16(q1s16, q14s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += dest_stride;
-    q14s16 = vsubq_s16(q1s16, q14s16);
-    q15s16 = vsubq_s16(q0s16, q15s16);
-
-    q10s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q11s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q12s16 = vaddq_s16(q10s16, q5s16);
-    q13s16 = vaddq_s16(q11s16, q4s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += dest_stride;
-    q4s16 = vsubq_s16(q11s16, q4s16);
-    q5s16 = vsubq_s16(q10s16, q5s16);
-
-    q0s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q1s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q12s16 = vaddq_s16(q0s16, q3s16);
-    q13s16 = vaddq_s16(q1s16, q2s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += dest_stride;
-    q2s16 = vsubq_s16(q1s16, q2s16);
-    q3s16 = vsubq_s16(q0s16, q3s16);
-
-    q10s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q11s16 = vld1q_s16(pass1Output);
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q12s16 = vaddq_s16(q10s16, q9s16);
-    q13s16 = vaddq_s16(q11s16, q8s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += dest_stride;
-    q8s16 = vsubq_s16(q11s16, q8s16);
-    q9s16 = vsubq_s16(q10s16, q9s16);
-
-    // store the data  out 8,9,10,11,12,13,14,15
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q8s16 = vrshrq_n_s16(q8s16, 6);
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q9s16 = vrshrq_n_s16(q9s16, 6);
-    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q2s16 = vrshrq_n_s16(q2s16, 6);
-    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q3s16 = vrshrq_n_s16(q3s16, 6);
-    q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q4s16 = vrshrq_n_s16(q4s16, 6);
-    q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q5s16 = vrshrq_n_s16(q5s16, 6);
-    q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q14s16 = vrshrq_n_s16(q14s16, 6);
-    q14u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    q15s16 = vrshrq_n_s16(q15s16, 6);
-    q15u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-  } else {  // skip_adding_dest
-    q0s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q1s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q12s16 = vaddq_s16(q0s16, q15s16);
-    q13s16 = vaddq_s16(q1s16, q14s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q14s16 = vsubq_s16(q1s16, q14s16);
-    q15s16 = vsubq_s16(q0s16, q15s16);
-
-    q10s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q11s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q12s16 = vaddq_s16(q10s16, q5s16);
-    q13s16 = vaddq_s16(q11s16, q4s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q4s16 = vsubq_s16(q11s16, q4s16);
-    q5s16 = vsubq_s16(q10s16, q5s16);
-
-    q0s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q1s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q12s16 = vaddq_s16(q0s16, q3s16);
-    q13s16 = vaddq_s16(q1s16, q2s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q2s16 = vsubq_s16(q1s16, q2s16);
-    q3s16 = vsubq_s16(q0s16, q3s16);
-
-    q10s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q11s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q12s16 = vaddq_s16(q10s16, q9s16);
-    q13s16 = vaddq_s16(q11s16, q8s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q8s16 = vsubq_s16(q11s16, q8s16);
-    q9s16 = vsubq_s16(q10s16, q9s16);
-
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
-  }
-  return;
-}
-
-void aom_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out,
-                                     int output_stride) {
-  int16x4_t d4s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
-  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
-  int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q6s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q15s32;
-  int16x8x2_t q0x2s16;
-
-  q0x2s16 = vld2q_s16(in);
-  q8s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q9s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q10s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q11s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q12s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q13s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q14s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q15s16 = q0x2s16.val[0];
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  // stage 3
-  q0s16 = vdupq_n_s16((int16_t)(cospi_28_64 * 2));
-  q1s16 = vdupq_n_s16((int16_t)(cospi_4_64 * 2));
-
-  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
-  q7s16 = vqrdmulhq_s16(q9s16, q1s16);
-
-  // stage 4
-  q1s16 = vdupq_n_s16((int16_t)(cospi_16_64 * 2));
-  d4s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q8s16 = vqrdmulhq_s16(q8s16, q1s16);
-
-  d8s16 = vget_low_s16(q4s16);
-  d9s16 = vget_high_s16(q4s16);
-  d14s16 = vget_low_s16(q7s16);
-  d15s16 = vget_high_s16(q7s16);
-  q9s32 = vmull_s16(d14s16, d4s16);
-  q10s32 = vmull_s16(d15s16, d4s16);
-  q12s32 = vmull_s16(d9s16, d4s16);
-  q11s32 = vmull_s16(d8s16, d4s16);
-
-  q15s32 = vsubq_s32(q10s32, q12s32);
-  q6s32 = vsubq_s32(q9s32, q11s32);
-  q9s32 = vaddq_s32(q9s32, q11s32);
-  q10s32 = vaddq_s32(q10s32, q12s32);
-
-  d11s16 = vqrshrn_n_s32(q15s32, 14);
-  d10s16 = vqrshrn_n_s32(q6s32, 14);
-  d12s16 = vqrshrn_n_s32(q9s32, 14);
-  d13s16 = vqrshrn_n_s32(q10s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  // stage 6
-  q2s16 = vaddq_s16(q8s16, q7s16);
-  q9s16 = vaddq_s16(q8s16, q6s16);
-  q10s16 = vaddq_s16(q8s16, q5s16);
-  q11s16 = vaddq_s16(q8s16, q4s16);
-  q12s16 = vsubq_s16(q8s16, q4s16);
-  q13s16 = vsubq_s16(q8s16, q5s16);
-  q14s16 = vsubq_s16(q8s16, q6s16);
-  q15s16 = vsubq_s16(q8s16, q7s16);
-
-  d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
-  d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
-  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
-  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
-  d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
-  d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
-  d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
-  d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
-  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
-  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
-  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
-  // store the data
-  output_stride >>= 1;  // output_stride / 2, out is int16_t
-  vst1_u64((uint64_t *)out, d4u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d5u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d18u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d19u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d20u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d21u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d22u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d23u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d24u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d28u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d29u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d30u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d31u64);
-  return;
-}
-
-void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
-                                     int16_t *pass1Output, int16_t skip_adding,
-                                     uint8_t *dest, int dest_stride) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
-  uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
-  uint64x1_t d16u64, d17u64, d18u64, d19u64;
-  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32;
-  int16x8x2_t q0x2s16;
-  (void)skip_adding;
-  (void)dest;
-  (void)dest_stride;
-
-  q0x2s16 = vld2q_s16(src);
-  q8s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q9s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q10s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q11s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q12s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q13s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q14s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q15s16 = q0x2s16.val[0];
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  // stage 3
-  q6s16 = vdupq_n_s16((int16_t)(cospi_30_64 * 2));
-  q0s16 = vqrdmulhq_s16(q8s16, q6s16);
-  q6s16 = vdupq_n_s16((int16_t)(cospi_2_64 * 2));
-  q7s16 = vqrdmulhq_s16(q8s16, q6s16);
-
-  q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
-  q14s16 = vdupq_n_s16((int16_t)(cospi_6_64 * 2));
-  q3s16 = vqrdmulhq_s16(q9s16, q15s16);
-  q4s16 = vqrdmulhq_s16(q9s16, q14s16);
-
-  // stage 4
-  d0s16 = vget_low_s16(q0s16);
-  d1s16 = vget_high_s16(q0s16);
-  d6s16 = vget_low_s16(q3s16);
-  d7s16 = vget_high_s16(q3s16);
-  d8s16 = vget_low_s16(q4s16);
-  d9s16 = vget_high_s16(q4s16);
-  d14s16 = vget_low_s16(q7s16);
-  d15s16 = vget_high_s16(q7s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_24_64);
-
-  q12s32 = vmull_s16(d14s16, d31s16);
-  q5s32 = vmull_s16(d15s16, d31s16);
-  q2s32 = vmull_s16(d0s16, d31s16);
-  q11s32 = vmull_s16(d1s16, d31s16);
-
-  q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
-  q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
-  q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
-  q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
-
-  d2s16 = vqrshrn_n_s32(q12s32, 14);
-  d3s16 = vqrshrn_n_s32(q5s32, 14);
-  d12s16 = vqrshrn_n_s32(q2s32, 14);
-  d13s16 = vqrshrn_n_s32(q11s32, 14);
-  q1s16 = vcombine_s16(d2s16, d3s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  d30s16 = vdup_n_s16(-cospi_8_64);
-  q10s32 = vmull_s16(d8s16, d30s16);
-  q13s32 = vmull_s16(d9s16, d30s16);
-  q8s32 = vmull_s16(d6s16, d30s16);
-  q9s32 = vmull_s16(d7s16, d30s16);
-
-  q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
-  q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
-  q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
-  q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
-
-  d4s16 = vqrshrn_n_s32(q10s32, 14);
-  d5s16 = vqrshrn_n_s32(q13s32, 14);
-  d10s16 = vqrshrn_n_s32(q8s32, 14);
-  d11s16 = vqrshrn_n_s32(q9s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  // stage 5
-  q8s16 = vaddq_s16(q0s16, q3s16);
-  q9s16 = vaddq_s16(q1s16, q2s16);
-  q10s16 = vsubq_s16(q1s16, q2s16);
-  q11s16 = vsubq_s16(q0s16, q3s16);
-  q12s16 = vsubq_s16(q7s16, q4s16);
-  q13s16 = vsubq_s16(q6s16, q5s16);
-  q14s16 = vaddq_s16(q6s16, q5s16);
-  q15s16 = vaddq_s16(q7s16, q4s16);
-
-  // stage 6
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-
-  d14s16 = vdup_n_s16((int16_t)cospi_16_64);
-  q3s32 = vmull_s16(d26s16, d14s16);
-  q4s32 = vmull_s16(d27s16, d14s16);
-  q0s32 = vmull_s16(d20s16, d14s16);
-  q1s32 = vmull_s16(d21s16, d14s16);
-
-  q5s32 = vsubq_s32(q3s32, q0s32);
-  q6s32 = vsubq_s32(q4s32, q1s32);
-  q0s32 = vaddq_s32(q3s32, q0s32);
-  q4s32 = vaddq_s32(q4s32, q1s32);
-
-  d4s16 = vqrshrn_n_s32(q5s32, 14);
-  d5s16 = vqrshrn_n_s32(q6s32, 14);
-  d10s16 = vqrshrn_n_s32(q0s32, 14);
-  d11s16 = vqrshrn_n_s32(q4s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  q0s32 = vmull_s16(d22s16, d14s16);
-  q1s32 = vmull_s16(d23s16, d14s16);
-  q13s32 = vmull_s16(d24s16, d14s16);
-  q6s32 = vmull_s16(d25s16, d14s16);
-
-  q10s32 = vsubq_s32(q13s32, q0s32);
-  q4s32 = vsubq_s32(q6s32, q1s32);
-  q13s32 = vaddq_s32(q13s32, q0s32);
-  q6s32 = vaddq_s32(q6s32, q1s32);
-
-  d6s16 = vqrshrn_n_s32(q10s32, 14);
-  d7s16 = vqrshrn_n_s32(q4s32, 14);
-  d8s16 = vqrshrn_n_s32(q13s32, 14);
-  d9s16 = vqrshrn_n_s32(q6s32, 14);
-  q3s16 = vcombine_s16(d6s16, d7s16);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-
-  // stage 7
-  q0s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q1s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q12s16 = vaddq_s16(q0s16, q15s16);
-  q13s16 = vaddq_s16(q1s16, q14s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q14s16 = vsubq_s16(q1s16, q14s16);
-  q15s16 = vsubq_s16(q0s16, q15s16);
-
-  q10s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q11s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q12s16 = vaddq_s16(q10s16, q5s16);
-  q13s16 = vaddq_s16(q11s16, q4s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q4s16 = vsubq_s16(q11s16, q4s16);
-  q5s16 = vsubq_s16(q10s16, q5s16);
-
-  q0s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q1s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q12s16 = vaddq_s16(q0s16, q3s16);
-  q13s16 = vaddq_s16(q1s16, q2s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q2s16 = vsubq_s16(q1s16, q2s16);
-  q3s16 = vsubq_s16(q0s16, q3s16);
-
-  q10s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q11s16 = vld1q_s16(pass1Output);
-  q12s16 = vaddq_s16(q10s16, q9s16);
-  q13s16 = vaddq_s16(q11s16, q8s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q8s16 = vsubq_s16(q11s16, q8s16);
-  q9s16 = vsubq_s16(q10s16, q9s16);
-
-  d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
-  d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
-  d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
-  d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
-  d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
-  d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
-  d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
-  d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
-  d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
-  d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
-  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
-  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
-  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
-  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
-  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
-  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
-  vst1_u64((uint64_t *)out, d16u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d17u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d18u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d19u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d4u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d5u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d6u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d7u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d8u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d9u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d10u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d11u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d28u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d29u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d30u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d31u64);
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm
deleted file mode 100644
index 4a8f8f183..000000000
--- a/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm
+++ /dev/null
@@ -1,1182 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_idct16x16_256_add_neon_pass1|
-    EXPORT  |aom_idct16x16_256_add_neon_pass2|
-    EXPORT  |aom_idct16x16_10_add_neon_pass1|
-    EXPORT  |aom_idct16x16_10_add_neon_pass2|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void |aom_idct16x16_256_add_neon_pass1|(int16_t *input,
-;                                          int16_t *output, int output_stride)
-;
-; r0  int16_t input
-; r1  int16_t *output
-; r2  int  output_stride)
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|aom_idct16x16_256_add_neon_pass1| PROC
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q1,q2}, [r0]!
-    vmov.s16        q15, q1
-
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0xc00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r12, #0x3e00
-    add             r12, #0xc5
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r12                   ; duplicate cospi_4_64
-
-    ; preloading to avoid stall
-    ; generate cospi_12_64 = 13623
-    mov             r3, #0x3500
-    add             r3, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r12, #0x2300
-    add             r12, #0x8e
-
-    ; step2[4] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; step2[4] * cospi_4_64
-    vmull.s16       q5, d18, d1
-    vmull.s16       q6, d19, d1
-
-    ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
-    vmlal.s16       q5, d30, d0
-    vmlal.s16       q6, d31, d0
-
-    vdup.16         d2, r3                    ; duplicate cospi_12_64
-    vdup.16         d3, r12                   ; duplicate cospi_20_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d8, q2, #14               ; >> 14
-    vqrshrn.s32     d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d14, q5, #14              ; >> 14
-    vqrshrn.s32     d15, q6, #14              ; >> 14
-
-    ; preloading to avoid stall
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r12, #0x1800
-    add             r12, #0x7e
-
-    ; step2[5] * cospi_12_64
-    vmull.s16       q2, d26, d2
-    vmull.s16       q3, d27, d2
-
-    ; step2[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q15, d27, d3
-
-    ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q2, d22, d3
-    vmlsl.s16       q3, d23, d3
-
-    ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q15, d23, d2
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d10, q2, #14              ; >> 14
-    vqrshrn.s32     d11, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q15, #14             ; >> 14
-
-    ; stage 4
-    vdup.16         d30, r3                   ; cospi_16_64
-
-    ; step1[0] * cospi_16_64
-    vmull.s16       q2, d16, d30
-    vmull.s16       q11, d17, d30
-
-    ; step1[1] * cospi_16_64
-    vmull.s16       q0, d24, d30
-    vmull.s16       q1, d25, d30
-
-    ; generate cospi_8_64 = 15137
-    mov             r3, #0x3b00
-    add             r3, #0x21
-
-    vdup.16         d30, r12                  ; duplicate cospi_24_64
-    vdup.16         d31, r3                   ; duplicate cospi_8_64
-
-    ; temp1 = (step1[0] + step1[1]) * cospi_16_64
-    vadd.s32        q3, q2, q0
-    vadd.s32        q12, q11, q1
-
-    ; temp2 = (step1[0] - step1[1]) * cospi_16_64
-    vsub.s32        q13, q2, q0
-    vsub.s32        q1, q11, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d16, q3, #14              ; >> 14
-    vqrshrn.s32     d17, q12, #14             ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d18, q13, #14             ; >> 14
-    vqrshrn.s32     d19, q1, #14              ; >> 14
-
-    ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-    ; step1[2] * cospi_8_64
-    vmull.s16       q0, d20, d31
-    vmull.s16       q1, d21, d31
-
-    ; step1[2] * cospi_24_64
-    vmull.s16       q12, d20, d30
-    vmull.s16       q13, d21, d30
-
-    ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vmlal.s16       q0, d28, d30
-    vmlal.s16       q1, d29, d30
-
-    ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlsl.s16       q12, d28, d31
-    vmlsl.s16       q13, d29, d31
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d22, q0, #14              ; >> 14
-    vqrshrn.s32     d23, q1, #14              ; >> 14
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d20, q12, #14             ; >> 14
-    vqrshrn.s32     d21, q13, #14             ; >> 14
-
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5];
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5];
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7];
-    vadd.s16        q15, q6, q7               ; step2[7] = step1[6] + step1[7];
-
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
-
-    ; stage 5
-    vadd.s16        q0, q8, q11               ; step1[0] = step2[0] + step2[3];
-    vadd.s16        q1, q9, q10               ; step1[1] = step2[1] + step2[2];
-    vsub.s16        q2, q9, q10               ; step1[2] = step2[1] - step2[2];
-    vsub.s16        q3, q8, q11               ; step1[3] = step2[0] - step2[3];
-
-    vdup.16         d16, r3;                  ; duplicate cospi_16_64
-
-    ; step2[5] * cospi_16_64
-    vmull.s16       q11, d26, d16
-    vmull.s16       q12, d27, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
-    vsub.s32        q6, q9, q11
-    vsub.s32        q13, q10, q12
-
-    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
-    vadd.s32        q9, q9, q11
-    vadd.s32        q10, q10, q12
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d10, q6, #14              ; >> 14
-    vqrshrn.s32     d11, q13, #14             ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q10, #14             ; >> 14
-
-    ; stage 6
-    vadd.s16        q8, q0, q15                ; step2[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; step2[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; step2[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; step2[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; step2[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; step2[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; step2[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q15              ; step2[7] = step1[0] - step1[7];
-
-    ; store the data
-    vst1.64         {d16}, [r1], r2
-    vst1.64         {d17}, [r1], r2
-    vst1.64         {d18}, [r1], r2
-    vst1.64         {d19}, [r1], r2
-    vst1.64         {d20}, [r1], r2
-    vst1.64         {d21}, [r1], r2
-    vst1.64         {d22}, [r1], r2
-    vst1.64         {d23}, [r1], r2
-    vst1.64         {d24}, [r1], r2
-    vst1.64         {d25}, [r1], r2
-    vst1.64         {d26}, [r1], r2
-    vst1.64         {d27}, [r1], r2
-    vst1.64         {d28}, [r1], r2
-    vst1.64         {d29}, [r1], r2
-    vst1.64         {d30}, [r1], r2
-    vst1.64         {d31}, [r1], r2
-
-    bx              lr
-    ENDP  ; |aom_idct16x16_256_add_neon_pass1|
-
-;void aom_idct16x16_256_add_neon_pass2(int16_t *src,
-;                                        int16_t *output,
-;                                        int16_t *pass1Output,
-;                                        int16_t skip_adding,
-;                                        uint8_t *dest,
-;                                        int dest_stride)
-;
-; r0  int16_t *src
-; r1  int16_t *output,
-; r2  int16_t *pass1Output,
-; r3  int16_t skip_adding,
-; r4  uint8_t *dest,
-; r5  int dest_stride)
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|aom_idct16x16_256_add_neon_pass2| PROC
-    push            {r3-r9}
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q0,q1}, [r0]!
-    vmov.s16        q15, q0;
-
-    ; generate  cospi_30_64 = 1606
-    mov             r3, #0x0600
-    add             r3, #0x46
-
-    ; generate cospi_2_64  = 16305
-    mov             r12, #0x3f00
-    add             r12, #0xb1
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         d12, r3                   ; duplicate cospi_30_64
-    vdup.16         d13, r12                  ; duplicate cospi_2_64
-
-    ; preloading to avoid stall
-    ; generate cospi_14_64 = 12665
-    mov             r3, #0x3100
-    add             r3, #0x79
-
-    ; generate cospi_18_64 = 10394
-    mov             r12, #0x2800
-    add             r12, #0x9a
-
-    ; step1[8] * cospi_30_64
-    vmull.s16       q2, d16, d12
-    vmull.s16       q3, d17, d12
-
-    ; step1[8] * cospi_2_64
-    vmull.s16       q1, d16, d13
-    vmull.s16       q4, d17, d13
-
-    ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64
-    vmlsl.s16       q2, d30, d13
-    vmlsl.s16       q3, d31, d13
-
-    ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64
-    vmlal.s16       q1, d30, d12
-    vmlal.s16       q4, d31, d12
-
-    vdup.16         d30, r3                   ; duplicate cospi_14_64
-    vdup.16         d31, r12                  ; duplicate cospi_18_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d0, q2, #14               ; >> 14
-    vqrshrn.s32     d1, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d14, q1, #14              ; >> 14
-    vqrshrn.s32     d15, q4, #14              ; >> 14
-
-    ; preloading to avoid stall
-    ; generate cospi_22_64 = 7723
-    mov             r3, #0x1e00
-    add             r3, #0x2b
-
-    ; generate cospi_10_64 = 14449
-    mov             r12, #0x3800
-    add             r12, #0x71
-
-    ; step1[9] * cospi_14_64
-    vmull.s16       q2, d24, d30
-    vmull.s16       q3, d25, d30
-
-    ; step1[9] * cospi_18_64
-    vmull.s16       q4, d24, d31
-    vmull.s16       q5, d25, d31
-
-    ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64
-    vmlsl.s16       q2, d22, d31
-    vmlsl.s16       q3, d23, d31
-
-    ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64
-    vmlal.s16       q4, d22, d30
-    vmlal.s16       q5, d23, d30
-
-    vdup.16         d30, r3                   ; duplicate cospi_22_64
-    vdup.16         d31, r12                  ; duplicate cospi_10_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d2, q2, #14               ; >> 14
-    vqrshrn.s32     d3, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q4, #14              ; >> 14
-    vqrshrn.s32     d13, q5, #14              ; >> 14
-
-    ; step1[10] * cospi_22_64
-    vmull.s16       q11, d20, d30
-    vmull.s16       q12, d21, d30
-
-    ; step1[10] * cospi_10_64
-    vmull.s16       q4, d20, d31
-    vmull.s16       q5, d21, d31
-
-    ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64
-    vmlsl.s16       q11, d26, d31
-    vmlsl.s16       q12, d27, d31
-
-    ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64
-    vmlal.s16       q4, d26, d30
-    vmlal.s16       q5, d27, d30
-
-    ; preloading to avoid stall
-    ; generate cospi_6_64 = 15679
-    mov             r3, #0x3d00
-    add             r3, #0x3f
-
-    ; generate cospi_26_64 = 4756
-    mov             r12, #0x1200
-    add             r12, #0x94
-
-    vdup.16         d30, r3                   ; duplicate cospi_6_64
-    vdup.16         d31, r12                  ; duplicate cospi_26_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q11, #14              ; >> 14
-    vqrshrn.s32     d5, q12, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d11, q5, #14              ; >> 14
-    vqrshrn.s32     d10, q4, #14              ; >> 14
-
-    ; step1[11] * cospi_6_64
-    vmull.s16       q10, d28, d30
-    vmull.s16       q11, d29, d30
-
-    ; step1[11] * cospi_26_64
-    vmull.s16       q12, d28, d31
-    vmull.s16       q13, d29, d31
-
-    ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64
-    vmlsl.s16       q10, d18, d31
-    vmlsl.s16       q11, d19, d31
-
-    ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64
-    vmlal.s16       q12, d18, d30
-    vmlal.s16       q13, d19, d30
-
-    vsub.s16        q9, q0, q1                ; step1[9]=step2[8]-step2[9]
-    vadd.s16        q0, q0, q1                ; step1[8]=step2[8]+step2[9]
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d6, q10, #14              ; >> 14
-    vqrshrn.s32     d7, q11, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d8, q12, #14              ; >> 14
-    vqrshrn.s32     d9, q13, #14              ; >> 14
-
-    ; stage 3
-    vsub.s16        q10, q3, q2               ; step1[10]=-step2[10]+step2[11]
-    vadd.s16        q11, q2, q3               ; step1[11]=step2[10]+step2[11]
-    vadd.s16        q12, q4, q5               ; step1[12]=step2[12]+step2[13]
-    vsub.s16        q13, q4, q5               ; step1[13]=step2[12]-step2[13]
-    vsub.s16        q14, q7, q6               ; step1[14]=-step2[14]+tep2[15]
-    vadd.s16        q7, q6, q7                ; step1[15]=step2[14]+step2[15]
-
-    ; stage 4
-    ; generate cospi_24_64 = 6270
-    mov             r3, #0x1800
-    add             r3, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r12, #0x3b00
-    add             r12, #0x21
-
-    ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vdup.16         d30, r12                  ; duplicate cospi_8_64
-    vdup.16         d31, r3                   ; duplicate cospi_24_64
-
-    ; step1[9] * cospi_24_64
-    vmull.s16       q2, d18, d31
-    vmull.s16       q3, d19, d31
-
-    ; step1[14] * cospi_24_64
-    vmull.s16       q4, d28, d31
-    vmull.s16       q5, d29, d31
-
-    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
-    vmlal.s16       q2, d28, d30
-    vmlal.s16       q3, d29, d30
-
-    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vmlsl.s16       q4, d18, d30
-    vmlsl.s16       q5, d19, d30
-
-    rsb             r12, #0
-    vdup.16         d30, r12                  ; duplicate -cospi_8_64
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q2, #14              ; >> 14
-    vqrshrn.s32     d13, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d2, q4, #14               ; >> 14
-    vqrshrn.s32     d3, q5, #14               ; >> 14
-
-    vmov.s16        q3, q11
-    vmov.s16        q4, q12
-
-    ; - step1[13] * cospi_8_64
-    vmull.s16       q11, d26, d30
-    vmull.s16       q12, d27, d30
-
-    ; -step1[10] * cospi_8_64
-    vmull.s16       q8, d20, d30
-    vmull.s16       q9, d21, d30
-
-    ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
-    vmlsl.s16       q11, d20, d31
-    vmlsl.s16       q12, d21, d31
-
-    ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
-    vmlal.s16       q8, d26, d31
-    vmlal.s16       q9, d27, d31
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d4, q11, #14              ; >> 14
-    vqrshrn.s32     d5, q12, #14              ; >> 14
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d10, q8, #14              ; >> 14
-    vqrshrn.s32     d11, q9, #14              ; >> 14
-
-    ; stage 5
-    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
-    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
-    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
-    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
-    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
-    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
-    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
-    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
-
-    ; stage 6.
-    ; generate cospi_16_64 = 11585
-    mov             r12, #0x2d00
-    add             r12, #0x41
-
-    vdup.16         d14, r12                  ; duplicate cospi_16_64
-
-    ; step1[13] * cospi_16_64
-    vmull.s16       q3, d26, d14
-    vmull.s16       q4, d27, d14
-
-    ; step1[10] * cospi_16_64
-    vmull.s16       q0, d20, d14
-    vmull.s16       q1, d21, d14
-
-    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
-    vsub.s32        q5, q3, q0
-    vsub.s32        q6, q4, q1
-
-    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
-    vadd.s32        q10, q3, q0
-    vadd.s32        q4, q4, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q5, #14               ; >> 14
-    vqrshrn.s32     d5, q6, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d10, q10, #14             ; >> 14
-    vqrshrn.s32     d11, q4, #14              ; >> 14
-
-    ; step1[11] * cospi_16_64
-    vmull.s16       q0, d22, d14
-    vmull.s16       q1, d23, d14
-
-    ; step1[12] * cospi_16_64
-    vmull.s16       q13, d24, d14
-    vmull.s16       q6, d25, d14
-
-    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
-    vsub.s32        q10, q13, q0
-    vsub.s32        q4, q6, q1
-
-    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
-    vadd.s32        q13, q13, q0
-    vadd.s32        q6, q6, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d6, q10, #14              ; >> 14
-    vqrshrn.s32     d7, q4, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d8, q13, #14              ; >> 14
-    vqrshrn.s32     d9, q6, #14               ; >> 14
-
-    mov              r4, #16                  ; pass1Output stride
-    ldr              r3, [sp]                 ; load skip_adding
-    cmp              r3, #0                   ; check if need adding dest data
-    beq              skip_adding_dest
-
-    ldr              r7, [sp, #28]            ; dest used to save element 0-7
-    mov              r9, r7                   ; save dest pointer for later use
-    ldr              r8, [sp, #32]            ; load dest_stride
-
-    ; stage 7
-    ; load the data in pass1
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vrshr.s16       q8, q8, #6                ; ROUND_POWER_OF_TWO
-    vaddw.u8        q8, q8, d12               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q8                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q9, q9, #6
-    vaddw.u8        q9, q9, d13               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q9                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q2, q2, #6
-    vaddw.u8        q2, q2, d12               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q2                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q3, q3, #6
-    vaddw.u8        q3, q3, d13               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q3                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q4, q4, #6
-    vaddw.u8        q4, q4, d12               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q4                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q5, q5, #6
-    vaddw.u8        q5, q5, d13               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q5                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q14, q14, #6
-    vaddw.u8        q14, q14, d12             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q14                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q15, q15, #6
-    vaddw.u8        q15, q15, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q15                  ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    b               end_idct16x16_pass2
-
-skip_adding_dest
-    ; stage 7
-    ; load the data in pass1
-    mov              r5, #24
-    mov              r3, #8
-
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vst1.64         {d24}, [r1], r3           ; store output[0]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[1]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vst1.64         {d24}, [r1], r3           ; store output[2]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[3]
-    vst1.64         {d27}, [r1], r5
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vst1.64         {d24}, [r1], r3           ; store output[4]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[5]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-    vst1.64         {d24}, [r1], r3           ; store output[6]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[7]
-    vst1.64         {d27}, [r1], r5
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vst1.64         {d16}, [r1], r3
-    vst1.64         {d17}, [r1], r5
-    vst1.64         {d18}, [r1], r3
-    vst1.64         {d19}, [r1], r5
-    vst1.64         {d4}, [r1], r3
-    vst1.64         {d5}, [r1], r5
-    vst1.64         {d6}, [r1], r3
-    vst1.64         {d7}, [r1], r5
-    vst1.64         {d8}, [r1], r3
-    vst1.64         {d9}, [r1], r5
-    vst1.64         {d10}, [r1], r3
-    vst1.64         {d11}, [r1], r5
-    vst1.64         {d28}, [r1], r3
-    vst1.64         {d29}, [r1], r5
-    vst1.64         {d30}, [r1], r3
-    vst1.64         {d31}, [r1], r5
-end_idct16x16_pass2
-    pop             {r3-r9}
-    bx              lr
-    ENDP  ; |aom_idct16x16_256_add_neon_pass2|
-
-;void |aom_idct16x16_10_add_neon_pass1|(int16_t *input,
-;                                             int16_t *output, int output_stride)
-;
-; r0  int16_t input
-; r1  int16_t *output
-; r2  int  output_stride)
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|aom_idct16x16_10_add_neon_pass1| PROC
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q1,q2}, [r0]!
-    vmov.s16        q15, q1
-
-    ; generate  cospi_28_64*2 = 6392
-    mov             r3, #0x1800
-    add             r3, #0xf8
-
-    ; generate cospi_4_64*2  = 32138
-    mov             r12, #0x7d00
-    add             r12, #0x8a
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         q0, r3                    ; duplicate cospi_28_64*2
-    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
-
-    ; The following instructions use vqrdmulh to do the
-    ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply,
-    ; double, and return the high 16 bits, effectively giving >> 15. Doubling
-    ; the constant will change this to >> 14.
-    ; dct_const_round_shift(step2[4] * cospi_28_64);
-    vqrdmulh.s16    q4, q9, q0
-
-    ; preloading to avoid stall
-    ; generate cospi_16_64*2 = 23170
-    mov             r3, #0x5a00
-    add             r3, #0x82
-
-    ; dct_const_round_shift(step2[4] * cospi_4_64);
-    vqrdmulh.s16    q7, q9, q1
-
-    ; stage 4
-    vdup.16         q1, r3                    ; cospi_16_64*2
-
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
-
-    vdup.16         d4, r3;                   ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(step1[0] * cospi_16_64)
-    vqrdmulh.s16    q8, q8, q1
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d14, d4
-    vmull.s16       q10, d15, d4
-
-    ; step2[5] * cospi_16_64
-    vmull.s16       q12, d9, d4
-    vmull.s16       q11, d8, d4
-
-    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
-    vsub.s32        q15, q10, q12
-    vsub.s32        q6, q9, q11
-
-    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
-    vadd.s32        q9, q9, q11
-    vadd.s32        q10, q10, q12
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d11, q15, #14             ; >> 14
-    vqrshrn.s32     d10, q6, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q10, #14             ; >> 14
-
-    ; stage 6
-    vadd.s16        q2, q8, q7                ; step2[0] = step1[0] + step1[7];
-    vadd.s16        q10, q8, q5               ; step2[2] = step1[2] + step1[5];
-    vadd.s16        q11, q8, q4               ; step2[3] = step1[3] + step1[4];
-    vadd.s16        q9, q8, q6                ; step2[1] = step1[1] + step1[6];
-    vsub.s16        q12, q8, q4               ; step2[4] = step1[3] - step1[4];
-    vsub.s16        q13, q8, q5               ; step2[5] = step1[2] - step1[5];
-    vsub.s16        q14, q8, q6               ; step2[6] = step1[1] - step1[6];
-    vsub.s16        q15, q8, q7               ; step2[7] = step1[0] - step1[7];
-
-    ; store the data
-    vst1.64         {d4}, [r1], r2
-    vst1.64         {d5}, [r1], r2
-    vst1.64         {d18}, [r1], r2
-    vst1.64         {d19}, [r1], r2
-    vst1.64         {d20}, [r1], r2
-    vst1.64         {d21}, [r1], r2
-    vst1.64         {d22}, [r1], r2
-    vst1.64         {d23}, [r1], r2
-    vst1.64         {d24}, [r1], r2
-    vst1.64         {d25}, [r1], r2
-    vst1.64         {d26}, [r1], r2
-    vst1.64         {d27}, [r1], r2
-    vst1.64         {d28}, [r1], r2
-    vst1.64         {d29}, [r1], r2
-    vst1.64         {d30}, [r1], r2
-    vst1.64         {d31}, [r1], r2
-
-    bx              lr
-    ENDP  ; |aom_idct16x16_10_add_neon_pass1|
-
-;void aom_idct16x16_10_add_neon_pass2(int16_t *src,
-;                                           int16_t *output,
-;                                           int16_t *pass1Output,
-;                                           int16_t skip_adding,
-;                                           uint8_t *dest,
-;                                           int dest_stride)
-;
-; r0  int16_t *src
-; r1  int16_t *output,
-; r2  int16_t *pass1Output,
-; r3  int16_t skip_adding,
-; r4  uint8_t *dest,
-; r5  int dest_stride)
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|aom_idct16x16_10_add_neon_pass2| PROC
-    push            {r3-r9}
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q0,q1}, [r0]!
-    vmov.s16        q15, q0;
-
-    ; generate 2*cospi_30_64 = 3212
-    mov             r3, #0xc00
-    add             r3, #0x8c
-
-    ; generate 2*cospi_2_64  = 32610
-    mov             r12, #0x7f00
-    add             r12, #0x62
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         q6, r3                    ; duplicate 2*cospi_30_64
-
-    ; dct_const_round_shift(step1[8] * cospi_30_64)
-    vqrdmulh.s16    q0, q8, q6
-
-    vdup.16         q6, r12                   ; duplicate 2*cospi_2_64
-
-    ; dct_const_round_shift(step1[8] * cospi_2_64)
-    vqrdmulh.s16    q7, q8, q6
-
-    ; preloading to avoid stall
-    ; generate 2*cospi_26_64 = 9512
-    mov             r12, #0x2500
-    add             r12, #0x28
-    rsb             r12, #0
-    vdup.16         q15, r12                  ; duplicate -2*cospi_26_64
-
-    ; generate 2*cospi_6_64 = 31358
-    mov             r3, #0x7a00
-    add             r3, #0x7e
-    vdup.16         q14, r3                   ; duplicate 2*cospi_6_64
-
-    ; dct_const_round_shift(- step1[12] * cospi_26_64)
-    vqrdmulh.s16    q3, q9, q15
-
-    ; dct_const_round_shift(step1[12] * cospi_6_64)
-    vqrdmulh.s16    q4, q9, q14
-
-    ; stage 4
-    ; generate cospi_24_64 = 6270
-    mov             r3, #0x1800
-    add             r3, #0x7e
-    vdup.16         d31, r3                   ; duplicate cospi_24_64
-
-    ; generate cospi_8_64 = 15137
-    mov             r12, #0x3b00
-    add             r12, #0x21
-    vdup.16         d30, r12                  ; duplicate cospi_8_64
-
-    ; step1[14] * cospi_24_64
-    vmull.s16       q12, d14, d31
-    vmull.s16       q5, d15, d31
-
-    ; step1[9] * cospi_24_64
-    vmull.s16       q2, d0, d31
-    vmull.s16       q11, d1, d31
-
-    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vmlsl.s16       q12, d0, d30
-    vmlsl.s16       q5, d1, d30
-
-    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
-    vmlal.s16       q2, d14, d30
-    vmlal.s16       q11, d15, d30
-
-    rsb              r12, #0
-    vdup.16          d30, r12                 ; duplicate -cospi_8_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d2, q12, #14              ; >> 14
-    vqrshrn.s32     d3, q5, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q2, #14              ; >> 14
-    vqrshrn.s32     d13, q11, #14             ; >> 14
-
-    ; - step1[13] * cospi_8_64
-    vmull.s16       q10, d8, d30
-    vmull.s16       q13, d9, d30
-
-    ; -step1[10] * cospi_8_64
-    vmull.s16       q8, d6, d30
-    vmull.s16       q9, d7, d30
-
-    ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
-    vmlsl.s16       q10, d6, d31
-    vmlsl.s16       q13, d7, d31
-
-    ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
-    vmlal.s16       q8, d8, d31
-    vmlal.s16       q9, d9, d31
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q10, #14              ; >> 14
-    vqrshrn.s32     d5, q13, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d10, q8, #14              ; >> 14
-    vqrshrn.s32     d11, q9, #14              ; >> 14
-
-    ; stage 5
-    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
-    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
-    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
-    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
-    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
-    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
-    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
-    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
-
-    ; stage 6.
-    ; generate cospi_16_64 = 11585
-    mov             r12, #0x2d00
-    add             r12, #0x41
-
-    vdup.16         d14, r12                  ; duplicate cospi_16_64
-
-    ; step1[13] * cospi_16_64
-    vmull.s16       q3, d26, d14
-    vmull.s16       q4, d27, d14
-
-    ; step1[10] * cospi_16_64
-    vmull.s16       q0, d20, d14
-    vmull.s16       q1, d21, d14
-
-    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
-    vsub.s32        q5, q3, q0
-    vsub.s32        q6, q4, q1
-
-    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
-    vadd.s32        q0, q3, q0
-    vadd.s32        q1, q4, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q5, #14               ; >> 14
-    vqrshrn.s32     d5, q6, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d10, q0, #14              ; >> 14
-    vqrshrn.s32     d11, q1, #14              ; >> 14
-
-    ; step1[11] * cospi_16_64
-    vmull.s16       q0, d22, d14
-    vmull.s16       q1, d23, d14
-
-    ; step1[12] * cospi_16_64
-    vmull.s16       q13, d24, d14
-    vmull.s16       q6, d25, d14
-
-    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
-    vsub.s32        q10, q13, q0
-    vsub.s32        q4, q6, q1
-
-    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
-    vadd.s32        q13, q13, q0
-    vadd.s32        q6, q6, q1
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d6, q10, #14              ; >> 14
-    vqrshrn.s32     d7, q4, #14               ; >> 14
-
-    ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64);
-    vqrshrn.s32     d8, q13, #14              ; >> 14
-    vqrshrn.s32     d9, q6, #14               ; >> 14
-
-    mov              r4, #16                  ; pass1Output stride
-    ldr              r3, [sp]                 ; load skip_adding
-
-    ; stage 7
-    ; load the data in pass1
-    mov              r5, #24
-    mov              r3, #8
-
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vst1.64         {d24}, [r1], r3           ; store output[0]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[1]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vst1.64         {d24}, [r1], r3           ; store output[2]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[3]
-    vst1.64         {d27}, [r1], r5
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vst1.64         {d24}, [r1], r3           ; store output[4]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[5]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-    vst1.64         {d24}, [r1], r3           ; store output[6]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[7]
-    vst1.64         {d27}, [r1], r5
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vst1.64         {d16}, [r1], r3
-    vst1.64         {d17}, [r1], r5
-    vst1.64         {d18}, [r1], r3
-    vst1.64         {d19}, [r1], r5
-    vst1.64         {d4}, [r1], r3
-    vst1.64         {d5}, [r1], r5
-    vst1.64         {d6}, [r1], r3
-    vst1.64         {d7}, [r1], r5
-    vst1.64         {d8}, [r1], r3
-    vst1.64         {d9}, [r1], r5
-    vst1.64         {d10}, [r1], r3
-    vst1.64         {d11}, [r1], r5
-    vst1.64         {d28}, [r1], r3
-    vst1.64         {d29}, [r1], r5
-    vst1.64         {d30}, [r1], r3
-    vst1.64         {d31}, [r1], r5
-end_idct10_16x16_pass2
-    pop             {r3-r9}
-    bx              lr
-    ENDP  ; |aom_idct16x16_10_add_neon_pass2|
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_neon.c
deleted file mode 100644
index db0d4905b..000000000
--- a/third_party/aom/aom_dsp/arm/idct16x16_neon.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_dsp_common.h"
-
-void aom_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
-                                      int output_stride);
-void aom_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
-                                      int16_t *pass1Output, int16_t skip_adding,
-                                      uint8_t *dest, int dest_stride);
-void aom_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
-                                     int output_stride);
-void aom_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
-                                     int16_t *pass1Output, int16_t skip_adding,
-                                     uint8_t *dest, int dest_stride);
-
-#if HAVE_NEON_ASM
-/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
-extern void aom_push_neon(int64_t *store);
-extern void aom_pop_neon(int64_t *store);
-#endif  // HAVE_NEON_ASM
-
-void aom_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
-                                int dest_stride) {
-#if HAVE_NEON_ASM
-  int64_t store_reg[8];
-#endif
-  int16_t pass1_output[16 * 16] = { 0 };
-  int16_t row_idct_output[16 * 16] = { 0 };
-
-#if HAVE_NEON_ASM
-  // save d8-d15 register values.
-  aom_push_neon(store_reg);
-#endif
-
-  /* Parallel idct on the upper 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  aom_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
-                                   dest, dest_stride);
-
-  /* Parallel idct on the lower 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  aom_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
-                                   pass1_output, 0, dest, dest_stride);
-
-  /* Parallel idct on the left 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
-                                   pass1_output, 1, dest, dest_stride);
-
-  /* Parallel idct on the right 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
-                                   row_idct_output + 8, pass1_output, 1,
-                                   dest + 8, dest_stride);
-
-#if HAVE_NEON_ASM
-  // restore d8-d15 register values.
-  aom_pop_neon(store_reg);
-#endif
-
-  return;
-}
-
-void aom_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
-                               int dest_stride) {
-#if HAVE_NEON_ASM
-  int64_t store_reg[8];
-#endif
-  int16_t pass1_output[16 * 16] = { 0 };
-  int16_t row_idct_output[16 * 16] = { 0 };
-
-#if HAVE_NEON_ASM
-  // save d8-d15 register values.
-  aom_push_neon(store_reg);
-#endif
-
-  /* Parallel idct on the upper 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  aom_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
-                                  dest, dest_stride);
-
-  /* Skip Parallel idct on the lower 8 rows as they are all 0s */
-
-  /* Parallel idct on the left 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
-                                   pass1_output, 1, dest, dest_stride);
-
-  /* Parallel idct on the right 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
-                                   row_idct_output + 8, pass1_output, 1,
-                                   dest + 8, dest_stride);
-
-#if HAVE_NEON_ASM
-  // restore d8-d15 register values.
-  aom_pop_neon(store_reg);
-#endif
-
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c
deleted file mode 100644
index 547567c5b..000000000
--- a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_config.h"
-
-#include "aom_dsp/inv_txfm.h"
-#include "aom_ports/mem.h"
-
-static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
-                           uint8x16_t *q9u8, uint8x16_t *q10u8,
-                           uint8x16_t *q11u8, uint8x16_t *q12u8,
-                           uint8x16_t *q13u8, uint8x16_t *q14u8,
-                           uint8x16_t *q15u8) {
-  *q8u8 = vld1q_u8(d);
-  d += d_stride;
-  *q9u8 = vld1q_u8(d);
-  d += d_stride;
-  *q10u8 = vld1q_u8(d);
-  d += d_stride;
-  *q11u8 = vld1q_u8(d);
-  d += d_stride;
-  *q12u8 = vld1q_u8(d);
-  d += d_stride;
-  *q13u8 = vld1q_u8(d);
-  d += d_stride;
-  *q14u8 = vld1q_u8(d);
-  d += d_stride;
-  *q15u8 = vld1q_u8(d);
-  return;
-}
-
-static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
-                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
-                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
-                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
-                                 uint8x16_t *q15u8) {
-  *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
-  *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
-  *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
-  *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
-  *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
-  *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
-  *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
-  *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
-  return;
-}
-
-static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
-                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
-                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
-                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
-                                 uint8x16_t *q15u8) {
-  *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
-  *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
-  *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
-  *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
-  *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
-  *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
-  *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
-  *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
-  return;
-}
-
-static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
-                           uint8x16_t *q9u8, uint8x16_t *q10u8,
-                           uint8x16_t *q11u8, uint8x16_t *q12u8,
-                           uint8x16_t *q13u8, uint8x16_t *q14u8,
-                           uint8x16_t *q15u8) {
-  vst1q_u8(d, *q8u8);
-  d += d_stride;
-  vst1q_u8(d, *q9u8);
-  d += d_stride;
-  vst1q_u8(d, *q10u8);
-  d += d_stride;
-  vst1q_u8(d, *q11u8);
-  d += d_stride;
-  vst1q_u8(d, *q12u8);
-  d += d_stride;
-  vst1q_u8(d, *q13u8);
-  d += d_stride;
-  vst1q_u8(d, *q14u8);
-  d += d_stride;
-  vst1q_u8(d, *q15u8);
-  return;
-}
-
-void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
-  int i, j, dest_stride8;
-  uint8_t *d;
-  int16_t a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-
-  dest_stride8 = dest_stride * 8;
-  if (a1 >= 0) {  // diff_positive_32_32
-    a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
-    q0u8 = vdupq_n_u8(a1);
-    for (i = 0; i < 2; i++, dest += 16) {  // diff_positive_32_32_loop
-      d = dest;
-      for (j = 0; j < 4; j++) {
-        LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                &q14u8, &q15u8);
-        ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                      &q14u8, &q15u8);
-        ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                &q14u8, &q15u8);
-        d += dest_stride8;
-      }
-    }
-  } else {  // diff_negative_32_32
-    a1 = -a1;
-    a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
-    q0u8 = vdupq_n_u8(a1);
-    for (i = 0; i < 2; i++, dest += 16) {  // diff_negative_32_32_loop
-      d = dest;
-      for (j = 0; j < 4; j++) {
-        LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                &q14u8, &q15u8);
-        SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                      &q14u8, &q15u8);
-        ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                &q14u8, &q15u8);
-        d += dest_stride8;
-      }
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm
deleted file mode 100644
index b04df2d0b..000000000
--- a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm
+++ /dev/null
@@ -1,147 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-
-    EXPORT  |aom_idct32x32_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ;TODO(hkuang): put the following macros in a seperate
-    ;file so other idct function could also use them.
-    MACRO
-    LD_16x8          $src, $stride
-    vld1.8           {q8}, [$src], $stride
-    vld1.8           {q9}, [$src], $stride
-    vld1.8           {q10}, [$src], $stride
-    vld1.8           {q11}, [$src], $stride
-    vld1.8           {q12}, [$src], $stride
-    vld1.8           {q13}, [$src], $stride
-    vld1.8           {q14}, [$src], $stride
-    vld1.8           {q15}, [$src], $stride
-    MEND
-
-    MACRO
-    ADD_DIFF_16x8    $diff
-    vqadd.u8         q8, q8, $diff
-    vqadd.u8         q9, q9, $diff
-    vqadd.u8         q10, q10, $diff
-    vqadd.u8         q11, q11, $diff
-    vqadd.u8         q12, q12, $diff
-    vqadd.u8         q13, q13, $diff
-    vqadd.u8         q14, q14, $diff
-    vqadd.u8         q15, q15, $diff
-    MEND
-
-    MACRO
-    SUB_DIFF_16x8    $diff
-    vqsub.u8         q8, q8, $diff
-    vqsub.u8         q9, q9, $diff
-    vqsub.u8         q10, q10, $diff
-    vqsub.u8         q11, q11, $diff
-    vqsub.u8         q12, q12, $diff
-    vqsub.u8         q13, q13, $diff
-    vqsub.u8         q14, q14, $diff
-    vqsub.u8         q15, q15, $diff
-    MEND
-
-    MACRO
-    ST_16x8          $dst, $stride
-    vst1.8           {q8}, [$dst], $stride
-    vst1.8           {q9}, [$dst], $stride
-    vst1.8           {q10},[$dst], $stride
-    vst1.8           {q11},[$dst], $stride
-    vst1.8           {q12},[$dst], $stride
-    vst1.8           {q13},[$dst], $stride
-    vst1.8           {q14},[$dst], $stride
-    vst1.8           {q15},[$dst], $stride
-    MEND
-
-;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
-;                              int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride
-
-|aom_idct32x32_1_add_neon| PROC
-    push             {lr}
-    pld              [r1]
-    add              r3, r1, #16               ; r3 dest + 16 for second loop
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 6)
-    add              r0, r0, #32               ; + (1 <<((6) - 1))
-    asrs             r0, r0, #6                ; >> 6
-    bge              diff_positive_32_32
-
-diff_negative_32_32
-    neg              r0, r0
-    usat             r0, #8, r0
-    vdup.u8          q0, r0
-    mov              r0, #4
-
-diff_negative_32_32_loop
-    sub              r0, #1
-    LD_16x8          r1, r2
-    SUB_DIFF_16x8    q0
-    ST_16x8          r12, r2
-
-    LD_16x8          r1, r2
-    SUB_DIFF_16x8    q0
-    ST_16x8          r12, r2
-    cmp              r0, #2
-    moveq            r1, r3
-    moveq            r12, r3
-    cmp              r0, #0
-    bne              diff_negative_32_32_loop
-    pop              {pc}
-
-diff_positive_32_32
-    usat             r0, #8, r0
-    vdup.u8          q0, r0
-    mov              r0, #4
-
-diff_positive_32_32_loop
-    sub              r0, #1
-    LD_16x8          r1, r2
-    ADD_DIFF_16x8    q0
-    ST_16x8          r12, r2
-
-    LD_16x8          r1, r2
-    ADD_DIFF_16x8    q0
-    ST_16x8          r12, r2
-    cmp              r0, #2
-    moveq            r1, r3
-    moveq            r12, r3
-    cmp              r0, #0
-    bne              diff_positive_32_32_loop
-    pop              {pc}
-
-    ENDP             ; |aom_idct32x32_1_add_neon|
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c
deleted file mode 100644
index a7562c7d5..000000000
--- a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c
+++ /dev/null
@@ -1,686 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-
-#define LOAD_FROM_TRANSPOSED(prev, first, second) \
-  q14s16 = vld1q_s16(trans_buf + first * 8);      \
-  q13s16 = vld1q_s16(trans_buf + second * 8);
-
-#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
-  qA = vld1q_s16(out + first * 32);                   \
-  qB = vld1q_s16(out + second * 32);
-
-#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
-  vst1q_s16(out + first * 32, qA);                   \
-  vst1q_s16(out + second * 32, qB);
-
-#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
-  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
-static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
-                                                  int stride, int16x8_t q6s16,
-                                                  int16x8_t q7s16,
-                                                  int16x8_t q8s16,
-                                                  int16x8_t q9s16) {
-  int16x4_t d8s16, d9s16, d10s16, d11s16;
-
-  d8s16 = vld1_s16((int16_t *)p1);
-  p1 += stride;
-  d11s16 = vld1_s16((int16_t *)p2);
-  p2 -= stride;
-  d9s16 = vld1_s16((int16_t *)p1);
-  d10s16 = vld1_s16((int16_t *)p2);
-
-  q7s16 = vrshrq_n_s16(q7s16, 6);
-  q8s16 = vrshrq_n_s16(q8s16, 6);
-  q9s16 = vrshrq_n_s16(q9s16, 6);
-  q6s16 = vrshrq_n_s16(q6s16, 6);
-
-  q7s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16)));
-  q8s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16)));
-  q9s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16)));
-  q6s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16)));
-
-  d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
-  d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
-  d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
-  d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
-
-  vst1_s16((int16_t *)p1, d9s16);
-  p1 -= stride;
-  vst1_s16((int16_t *)p2, d10s16);
-  p2 += stride;
-  vst1_s16((int16_t *)p1, d8s16);
-  vst1_s16((int16_t *)p2, d11s16);
-  return;
-}
-
-#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
-  ;                                           \
-  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
-static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
-                                                   int stride, int16x8_t q4s16,
-                                                   int16x8_t q5s16,
-                                                   int16x8_t q6s16,
-                                                   int16x8_t q7s16) {
-  int16x4_t d4s16, d5s16, d6s16, d7s16;
-
-  d4s16 = vld1_s16((int16_t *)p1);
-  p1 += stride;
-  d7s16 = vld1_s16((int16_t *)p2);
-  p2 -= stride;
-  d5s16 = vld1_s16((int16_t *)p1);
-  d6s16 = vld1_s16((int16_t *)p2);
-
-  q5s16 = vrshrq_n_s16(q5s16, 6);
-  q6s16 = vrshrq_n_s16(q6s16, 6);
-  q7s16 = vrshrq_n_s16(q7s16, 6);
-  q4s16 = vrshrq_n_s16(q4s16, 6);
-
-  q5s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16)));
-  q6s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16)));
-  q7s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16)));
-  q4s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16)));
-
-  d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
-  d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
-  d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
-  d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
-
-  vst1_s16((int16_t *)p1, d5s16);
-  p1 -= stride;
-  vst1_s16((int16_t *)p2, d6s16);
-  p2 += stride;
-  vst1_s16((int16_t *)p2, d7s16);
-  vst1_s16((int16_t *)p1, d4s16);
-  return;
-}
-
-#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
-  DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
-static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
-                                int16_t first_const, int16_t second_const,
-                                int16x8_t *qAs16, int16x8_t *qBs16) {
-  int16x4_t d30s16, d31s16;
-  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
-  int16x4_t dCs16, dDs16, dAs16, dBs16;
-
-  dCs16 = vget_low_s16(q14s16);
-  dDs16 = vget_high_s16(q14s16);
-  dAs16 = vget_low_s16(q13s16);
-  dBs16 = vget_high_s16(q13s16);
-
-  d30s16 = vdup_n_s16(first_const);
-  d31s16 = vdup_n_s16(second_const);
-
-  q8s32 = vmull_s16(dCs16, d30s16);
-  q10s32 = vmull_s16(dAs16, d31s16);
-  q9s32 = vmull_s16(dDs16, d30s16);
-  q11s32 = vmull_s16(dBs16, d31s16);
-  q12s32 = vmull_s16(dCs16, d31s16);
-
-  q8s32 = vsubq_s32(q8s32, q10s32);
-  q9s32 = vsubq_s32(q9s32, q11s32);
-
-  q10s32 = vmull_s16(dDs16, d31s16);
-  q11s32 = vmull_s16(dAs16, d30s16);
-  q15s32 = vmull_s16(dBs16, d30s16);
-
-  q11s32 = vaddq_s32(q12s32, q11s32);
-  q10s32 = vaddq_s32(q10s32, q15s32);
-
-  *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14));
-  *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14));
-  return;
-}
-
-static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) {
-  int16_t *in;
-  int i;
-  const int stride = 32;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
-  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
-  for (i = 0; i < 4; i++, input += 8) {
-    in = input;
-    q8s16 = vld1q_s16(in);
-    in += stride;
-    q9s16 = vld1q_s16(in);
-    in += stride;
-    q10s16 = vld1q_s16(in);
-    in += stride;
-    q11s16 = vld1q_s16(in);
-    in += stride;
-    q12s16 = vld1q_s16(in);
-    in += stride;
-    q13s16 = vld1q_s16(in);
-    in += stride;
-    q14s16 = vld1q_s16(in);
-    in += stride;
-    q15s16 = vld1q_s16(in);
-
-    d16s16 = vget_low_s16(q8s16);
-    d17s16 = vget_high_s16(q8s16);
-    d18s16 = vget_low_s16(q9s16);
-    d19s16 = vget_high_s16(q9s16);
-    d20s16 = vget_low_s16(q10s16);
-    d21s16 = vget_high_s16(q10s16);
-    d22s16 = vget_low_s16(q11s16);
-    d23s16 = vget_high_s16(q11s16);
-    d24s16 = vget_low_s16(q12s16);
-    d25s16 = vget_high_s16(q12s16);
-    d26s16 = vget_low_s16(q13s16);
-    d27s16 = vget_high_s16(q13s16);
-    d28s16 = vget_low_s16(q14s16);
-    d29s16 = vget_high_s16(q14s16);
-    d30s16 = vget_low_s16(q15s16);
-    d31s16 = vget_high_s16(q15s16);
-
-    q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
-    q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
-    q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
-    q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
-    q12s16 = vcombine_s16(d17s16, d25s16);
-    q13s16 = vcombine_s16(d19s16, d27s16);
-    q14s16 = vcombine_s16(d21s16, d29s16);
-    q15s16 = vcombine_s16(d23s16, d31s16);
-
-    q0x2s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16));
-    q1x2s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16));
-    q2x2s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16));
-    q3x2s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16));
-
-    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
-                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
-    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
-                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
-    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
-                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
-    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
-                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
-
-    vst1q_s16(t_buf, q0x2s16.val[0]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q0x2s16.val[1]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q1x2s16.val[0]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q1x2s16.val[1]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q2x2s16.val[0]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q2x2s16.val[1]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q3x2s16.val[0]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q3x2s16.val[1]);
-    t_buf += 8;
-  }
-  return;
-}
-
-static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
-                                             int16x8_t q3s16, int16x8_t q6s16,
-                                             int16x8_t q7s16, int16x8_t q8s16,
-                                             int16x8_t q9s16, int16x8_t q10s16,
-                                             int16x8_t q11s16, int16x8_t q12s16,
-                                             int16x8_t q13s16, int16x8_t q14s16,
-                                             int16x8_t q15s16) {
-  int16x8_t q0s16, q1s16, q4s16, q5s16;
-
-  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
-  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
-
-  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
-  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
-
-  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
-  q2s16 = vaddq_s16(q10s16, q1s16);
-  q3s16 = vaddq_s16(q11s16, q0s16);
-  q4s16 = vsubq_s16(q11s16, q0s16);
-  q5s16 = vsubq_s16(q10s16, q1s16);
-
-  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
-  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
-
-  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
-  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
-
-  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
-  q2s16 = vaddq_s16(q12s16, q1s16);
-  q3s16 = vaddq_s16(q13s16, q0s16);
-  q4s16 = vsubq_s16(q13s16, q0s16);
-  q5s16 = vsubq_s16(q12s16, q1s16);
-
-  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
-  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
-
-  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
-  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
-
-  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
-  q2s16 = vaddq_s16(q14s16, q1s16);
-  q3s16 = vaddq_s16(q15s16, q0s16);
-  q4s16 = vsubq_s16(q15s16, q0s16);
-  q5s16 = vsubq_s16(q14s16, q1s16);
-
-  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
-  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
-
-  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
-  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
-  return;
-}
-
-static INLINE void idct32_bands_end_2nd_pass(
-    int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
-    int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
-    int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
-    int16x8_t q14s16, int16x8_t q15s16) {
-  uint8_t *r6 = dest + 31 * stride;
-  uint8_t *r7 = dest /* +  0 * stride*/;
-  uint8_t *r9 = dest + 15 * stride;
-  uint8_t *r10 = dest + 16 * stride;
-  int str2 = stride << 1;
-  int16x8_t q0s16, q1s16, q4s16, q5s16;
-
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
-  r10 += str2;
-  r9 -= str2;
-
-  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-  r7 += str2;
-  r6 -= str2;
-
-  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
-  q2s16 = vaddq_s16(q10s16, q1s16);
-  q3s16 = vaddq_s16(q11s16, q0s16);
-  q4s16 = vsubq_s16(q11s16, q0s16);
-  q5s16 = vsubq_s16(q10s16, q1s16);
-
-  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
-  r10 += str2;
-  r9 -= str2;
-
-  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-  r7 += str2;
-  r6 -= str2;
-
-  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
-  q2s16 = vaddq_s16(q12s16, q1s16);
-  q3s16 = vaddq_s16(q13s16, q0s16);
-  q4s16 = vsubq_s16(q13s16, q0s16);
-  q5s16 = vsubq_s16(q12s16, q1s16);
-
-  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
-  r10 += str2;
-  r9 -= str2;
-
-  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-  r7 += str2;
-  r6 -= str2;
-
-  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
-  q2s16 = vaddq_s16(q14s16, q1s16);
-  q3s16 = vaddq_s16(q15s16, q0s16);
-  q4s16 = vsubq_s16(q15s16, q0s16);
-  q5s16 = vsubq_s16(q14s16, q1s16);
-
-  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
-
-  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-  return;
-}
-
-void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) {
-  int i, idct32_pass_loop;
-  int16_t trans_buf[32 * 8];
-  int16_t pass1[32 * 32];
-  int16_t pass2[32 * 32];
-  int16_t *out;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-
-  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
-       idct32_pass_loop++,
-      input = pass1,  // the input of pass2 is the result of pass1
-       out = pass2) {
-    for (i = 0; i < 4; i++, input += 32 * 8, out += 8) {  // idct32_bands_loop
-      idct32_transpose_pair(input, trans_buf);
-
-      // -----------------------------------------
-      // BLOCK A: 16-19,28-31
-      // -----------------------------------------
-      // generate 16,17,30,31
-      // part of stage 1
-      LOAD_FROM_TRANSPOSED(0, 1, 31)
-      DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(31, 17, 15)
-      DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
-      // part of stage 2
-      q4s16 = vaddq_s16(q0s16, q1s16);
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q6s16 = vaddq_s16(q2s16, q3s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
-      // part of stage 3
-      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
-
-      // generate 18,19,28,29
-      // part of stage 1
-      LOAD_FROM_TRANSPOSED(15, 9, 23)
-      DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(23, 25, 7)
-      DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
-      // part of stage 2
-      q13s16 = vsubq_s16(q3s16, q2s16);
-      q3s16 = vaddq_s16(q3s16, q2s16);
-      q14s16 = vsubq_s16(q1s16, q0s16);
-      q2s16 = vaddq_s16(q1s16, q0s16);
-      // part of stage 3
-      DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
-      // part of stage 4
-      q8s16 = vaddq_s16(q4s16, q2s16);
-      q9s16 = vaddq_s16(q5s16, q0s16);
-      q10s16 = vaddq_s16(q7s16, q1s16);
-      q15s16 = vaddq_s16(q6s16, q3s16);
-      q13s16 = vsubq_s16(q5s16, q0s16);
-      q14s16 = vsubq_s16(q7s16, q1s16);
-      STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
-      STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
-      // part of stage 5
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
-      STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
-      // part of stage 4
-      q13s16 = vsubq_s16(q4s16, q2s16);
-      q14s16 = vsubq_s16(q6s16, q3s16);
-      // part of stage 5
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
-      STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
-
-      // -----------------------------------------
-      // BLOCK B: 20-23,24-27
-      // -----------------------------------------
-      // generate 20,21,26,27
-      // part of stage 1
-      LOAD_FROM_TRANSPOSED(7, 5, 27)
-      DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(27, 21, 11)
-      DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
-      // part of stage 2
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q0s16 = vaddq_s16(q0s16, q1s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
-      q2s16 = vaddq_s16(q2s16, q3s16);
-      // part of stage 3
-      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
-
-      // generate 22,23,24,25
-      // part of stage 1
-      LOAD_FROM_TRANSPOSED(11, 13, 19)
-      DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
-      LOAD_FROM_TRANSPOSED(19, 29, 3)
-      DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
-      // part of stage 2
-      q14s16 = vsubq_s16(q4s16, q5s16);
-      q5s16 = vaddq_s16(q4s16, q5s16);
-      q13s16 = vsubq_s16(q6s16, q7s16);
-      q6s16 = vaddq_s16(q6s16, q7s16);
-      // part of stage 3
-      DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
-      // part of stage 4
-      q10s16 = vaddq_s16(q7s16, q1s16);
-      q11s16 = vaddq_s16(q5s16, q0s16);
-      q12s16 = vaddq_s16(q6s16, q2s16);
-      q15s16 = vaddq_s16(q4s16, q3s16);
-      // part of stage 6
-      LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
-      q8s16 = vaddq_s16(q14s16, q11s16);
-      q9s16 = vaddq_s16(q13s16, q10s16);
-      q13s16 = vsubq_s16(q13s16, q10s16);
-      q11s16 = vsubq_s16(q14s16, q11s16);
-      STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
-      LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
-      q8s16 = vsubq_s16(q9s16, q12s16);
-      q10s16 = vaddq_s16(q14s16, q15s16);
-      q14s16 = vsubq_s16(q14s16, q15s16);
-      q12s16 = vaddq_s16(q9s16, q12s16);
-      STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
-      // part of stage 7
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-      STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
-      q13s16 = q11s16;
-      q14s16 = q8s16;
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-      STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
-      // part of stage 4
-      q14s16 = vsubq_s16(q5s16, q0s16);
-      q13s16 = vsubq_s16(q6s16, q2s16);
-      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
-      q14s16 = vsubq_s16(q7s16, q1s16);
-      q13s16 = vsubq_s16(q4s16, q3s16);
-      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
-      // part of stage 6
-      LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
-      q8s16 = vaddq_s16(q14s16, q1s16);
-      q9s16 = vaddq_s16(q13s16, q6s16);
-      q13s16 = vsubq_s16(q13s16, q6s16);
-      q1s16 = vsubq_s16(q14s16, q1s16);
-      STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
-      LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
-      q14s16 = vsubq_s16(q8s16, q5s16);
-      q10s16 = vaddq_s16(q8s16, q5s16);
-      q11s16 = vaddq_s16(q9s16, q0s16);
-      q0s16 = vsubq_s16(q9s16, q0s16);
-      STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
-      // part of stage 7
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-      STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
-      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
-      STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
-
-      // -----------------------------------------
-      // BLOCK C: 8-10,11-15
-      // -----------------------------------------
-      // generate 8,9,14,15
-      // part of stage 2
-      LOAD_FROM_TRANSPOSED(3, 2, 30)
-      DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(30, 18, 14)
-      DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
-      // part of stage 3
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q0s16 = vaddq_s16(q0s16, q1s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
-      q2s16 = vaddq_s16(q2s16, q3s16);
-      // part of stage 4
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
-
-      // generate 10,11,12,13
-      // part of stage 2
-      LOAD_FROM_TRANSPOSED(14, 10, 22)
-      DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
-      LOAD_FROM_TRANSPOSED(22, 26, 6)
-      DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
-      // part of stage 3
-      q14s16 = vsubq_s16(q4s16, q5s16);
-      q5s16 = vaddq_s16(q4s16, q5s16);
-      q13s16 = vsubq_s16(q6s16, q7s16);
-      q6s16 = vaddq_s16(q6s16, q7s16);
-      // part of stage 4
-      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
-      // part of stage 5
-      q8s16 = vaddq_s16(q0s16, q5s16);
-      q9s16 = vaddq_s16(q1s16, q7s16);
-      q13s16 = vsubq_s16(q1s16, q7s16);
-      q14s16 = vsubq_s16(q3s16, q4s16);
-      q10s16 = vaddq_s16(q3s16, q4s16);
-      q15s16 = vaddq_s16(q2s16, q6s16);
-      STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
-      STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
-      // part of stage 6
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-      STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
-      q13s16 = vsubq_s16(q0s16, q5s16);
-      q14s16 = vsubq_s16(q2s16, q6s16);
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-      STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
-
-      // -----------------------------------------
-      // BLOCK D: 0-3,4-7
-      // -----------------------------------------
-      // generate 4,5,6,7
-      // part of stage 3
-      LOAD_FROM_TRANSPOSED(6, 4, 28)
-      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(28, 20, 12)
-      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
-      // part of stage 4
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q0s16 = vaddq_s16(q0s16, q1s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
-      q2s16 = vaddq_s16(q2s16, q3s16);
-      // part of stage 5
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-
-      // generate 0,1,2,3
-      // part of stage 4
-      LOAD_FROM_TRANSPOSED(12, 0, 16)
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
-      LOAD_FROM_TRANSPOSED(16, 8, 24)
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
-      // part of stage 5
-      q4s16 = vaddq_s16(q7s16, q6s16);
-      q7s16 = vsubq_s16(q7s16, q6s16);
-      q6s16 = vsubq_s16(q5s16, q14s16);
-      q5s16 = vaddq_s16(q5s16, q14s16);
-      // part of stage 6
-      q8s16 = vaddq_s16(q4s16, q2s16);
-      q9s16 = vaddq_s16(q5s16, q3s16);
-      q10s16 = vaddq_s16(q6s16, q1s16);
-      q11s16 = vaddq_s16(q7s16, q0s16);
-      q12s16 = vsubq_s16(q7s16, q0s16);
-      q13s16 = vsubq_s16(q6s16, q1s16);
-      q14s16 = vsubq_s16(q5s16, q3s16);
-      q15s16 = vsubq_s16(q4s16, q2s16);
-      // part of stage 7
-      LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
-      q2s16 = vaddq_s16(q8s16, q1s16);
-      q3s16 = vaddq_s16(q9s16, q0s16);
-      q4s16 = vsubq_s16(q9s16, q0s16);
-      q5s16 = vsubq_s16(q8s16, q1s16);
-      LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
-      q8s16 = vaddq_s16(q4s16, q1s16);
-      q9s16 = vaddq_s16(q5s16, q0s16);
-      q6s16 = vsubq_s16(q5s16, q0s16);
-      q7s16 = vsubq_s16(q4s16, q1s16);
-
-      if (idct32_pass_loop == 0) {
-        idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
-                                  q10s16, q11s16, q12s16, q13s16, q14s16,
-                                  q15s16);
-      } else {
-        idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
-                                  q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
-                                  q14s16, q15s16);
-        dest += 8;
-      }
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm
deleted file mode 100644
index e7793fb16..000000000
--- a/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm
+++ /dev/null
@@ -1,1302 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-;TODO(cd): adjust these constant to be able to use vqdmulh for faster
-;          dct_const_round_shift(a * b) within butterfly calculations.
-cospi_1_64  EQU 16364
-cospi_2_64  EQU 16305
-cospi_3_64  EQU 16207
-cospi_4_64  EQU 16069
-cospi_5_64  EQU 15893
-cospi_6_64  EQU 15679
-cospi_7_64  EQU 15426
-cospi_8_64  EQU 15137
-cospi_9_64  EQU 14811
-cospi_10_64 EQU 14449
-cospi_11_64 EQU 14053
-cospi_12_64 EQU 13623
-cospi_13_64 EQU 13160
-cospi_14_64 EQU 12665
-cospi_15_64 EQU 12140
-cospi_16_64 EQU 11585
-cospi_17_64 EQU 11003
-cospi_18_64 EQU 10394
-cospi_19_64 EQU  9760
-cospi_20_64 EQU  9102
-cospi_21_64 EQU  8423
-cospi_22_64 EQU  7723
-cospi_23_64 EQU  7005
-cospi_24_64 EQU  6270
-cospi_25_64 EQU  5520
-cospi_26_64 EQU  4756
-cospi_27_64 EQU  3981
-cospi_28_64 EQU  3196
-cospi_29_64 EQU  2404
-cospi_30_64 EQU  1606
-cospi_31_64 EQU   804
-
-
-    EXPORT  |aom_idct32x32_1024_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    AREA     Block, CODE, READONLY
-
-    ; --------------------------------------------------------------------------
-    ; Load from transposed_buffer
-    ;   q13 = transposed_buffer[first_offset]
-    ;   q14 = transposed_buffer[second_offset]
-    ;   for proper address calculation, the last offset used when manipulating
-    ;   transposed_buffer must be passed in. use 0 for first use.
-    MACRO
-    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
-    ; address calculation with proper stride and loading
-    add r0, #($first_offset  - $prev_offset )*8*2
-    vld1.s16        {q14}, [r0]
-    add r0, #($second_offset - $first_offset)*8*2
-    vld1.s16        {q13}, [r0]
-    ; (used) two registers (q14, q13)
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Load from output (used as temporary storage)
-    ;   reg1 = output[first_offset]
-    ;   reg2 = output[second_offset]
-    ;   for proper address calculation, the last offset used when manipulating
-    ;   output, whether reading or storing) must be passed in. use 0 for first
-    ;   use.
-    MACRO
-    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
-    ; address calculation with proper stride and loading
-    add r1, #($first_offset  - $prev_offset )*32*2
-    vld1.s16        {$reg1}, [r1]
-    add r1, #($second_offset - $first_offset)*32*2
-    vld1.s16        {$reg2}, [r1]
-    ; (used) two registers ($reg1, $reg2)
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Store into output (sometimes as as temporary storage)
-    ;   output[first_offset] = reg1
-    ;   output[second_offset] = reg2
-    ;   for proper address calculation, the last offset used when manipulating
-    ;   output, whether reading or storing) must be passed in. use 0 for first
-    ;   use.
-    MACRO
-    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
-    ; address calculation with proper stride and storing
-    add r1, #($first_offset  - $prev_offset )*32*2
-    vst1.16 {$reg1}, [r1]
-    add r1, #($second_offset - $first_offset)*32*2
-    vst1.16 {$reg2}, [r1]
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q6-q9 contain the results (out[j * 32 + 0-31])
-    MACRO
-    STORE_COMBINE_CENTER_RESULTS
-    ; load dest[j * dest_stride + 0-31]
-    vld1.s16        {d8}, [r10], r2
-    vld1.s16        {d11}, [r9], r11
-    vld1.s16        {d9}, [r10]
-    vld1.s16        {d10}, [r9]
-    ; ROUND_POWER_OF_TWO
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q8, q8, #6
-    vrshr.s16       q9, q9, #6
-    vrshr.s16       q6, q6, #6
-    ; add to dest[j * dest_stride + 0-31]
-    vaddw.u8        q7, q7, d9
-    vaddw.u8        q8, q8, d10
-    vaddw.u8        q9, q9, d11
-    vaddw.u8        q6, q6, d8
-    ; clip pixel
-    vqmovun.s16     d9,  q7
-    vqmovun.s16     d10, q8
-    vqmovun.s16     d11, q9
-    vqmovun.s16     d8,  q6
-    ; store back into dest[j * dest_stride + 0-31]
-    vst1.16         {d9}, [r10], r11
-    vst1.16         {d10}, [r9], r2
-    vst1.16         {d8}, [r10]
-    vst1.16         {d11}, [r9]
-    ; update pointers (by dest_stride * 2)
-    sub r9,  r9,  r2, lsl #1
-    add r10, r10, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q6-q9 contain the results (out[j * 32 + 0-31])
-    MACRO
-    STORE_COMBINE_CENTER_RESULTS_LAST
-    ; load dest[j * dest_stride + 0-31]
-    vld1.s16        {d8}, [r10], r2
-    vld1.s16        {d11}, [r9], r11
-    vld1.s16        {d9}, [r10]
-    vld1.s16        {d10}, [r9]
-    ; ROUND_POWER_OF_TWO
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q8, q8, #6
-    vrshr.s16       q9, q9, #6
-    vrshr.s16       q6, q6, #6
-    ; add to dest[j * dest_stride + 0-31]
-    vaddw.u8        q7, q7, d9
-    vaddw.u8        q8, q8, d10
-    vaddw.u8        q9, q9, d11
-    vaddw.u8        q6, q6, d8
-    ; clip pixel
-    vqmovun.s16     d9,  q7
-    vqmovun.s16     d10, q8
-    vqmovun.s16     d11, q9
-    vqmovun.s16     d8,  q6
-    ; store back into dest[j * dest_stride + 0-31]
-    vst1.16         {d9}, [r10], r11
-    vst1.16         {d10}, [r9], r2
-    vst1.16         {d8}, [r10]!
-    vst1.16         {d11}, [r9]!
-    ; update pointers (by dest_stride * 2)
-    sub r9,  r9,  r2, lsl #1
-    add r10, r10, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q4-q7 contain the results (out[j * 32 + 0-31])
-    MACRO
-    STORE_COMBINE_EXTREME_RESULTS
-    ; load dest[j * dest_stride + 0-31]
-    vld1.s16        {d4}, [r7], r2
-    vld1.s16        {d7}, [r6], r11
-    vld1.s16        {d5}, [r7]
-    vld1.s16        {d6}, [r6]
-    ; ROUND_POWER_OF_TWO
-    vrshr.s16       q5, q5, #6
-    vrshr.s16       q6, q6, #6
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q4, q4, #6
-    ; add to dest[j * dest_stride + 0-31]
-    vaddw.u8        q5, q5, d5
-    vaddw.u8        q6, q6, d6
-    vaddw.u8        q7, q7, d7
-    vaddw.u8        q4, q4, d4
-    ; clip pixel
-    vqmovun.s16     d5, q5
-    vqmovun.s16     d6, q6
-    vqmovun.s16     d7, q7
-    vqmovun.s16     d4, q4
-    ; store back into dest[j * dest_stride + 0-31]
-    vst1.16         {d5}, [r7], r11
-    vst1.16         {d6}, [r6], r2
-    vst1.16         {d7}, [r6]
-    vst1.16         {d4}, [r7]
-    ; update pointers (by dest_stride * 2)
-    sub r6, r6, r2, lsl #1
-    add r7, r7, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q4-q7 contain the results (out[j * 32 + 0-31])
-    MACRO
-    STORE_COMBINE_EXTREME_RESULTS_LAST
-    ; load dest[j * dest_stride + 0-31]
-    vld1.s16        {d4}, [r7], r2
-    vld1.s16        {d7}, [r6], r11
-    vld1.s16        {d5}, [r7]
-    vld1.s16        {d6}, [r6]
-    ; ROUND_POWER_OF_TWO
-    vrshr.s16       q5, q5, #6
-    vrshr.s16       q6, q6, #6
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q4, q4, #6
-    ; add to dest[j * dest_stride + 0-31]
-    vaddw.u8        q5, q5, d5
-    vaddw.u8        q6, q6, d6
-    vaddw.u8        q7, q7, d7
-    vaddw.u8        q4, q4, d4
-    ; clip pixel
-    vqmovun.s16     d5, q5
-    vqmovun.s16     d6, q6
-    vqmovun.s16     d7, q7
-    vqmovun.s16     d4, q4
-    ; store back into dest[j * dest_stride + 0-31]
-    vst1.16         {d5}, [r7], r11
-    vst1.16         {d6}, [r6], r2
-    vst1.16         {d7}, [r6]!
-    vst1.16         {d4}, [r7]!
-    ; update pointers (by dest_stride * 2)
-    sub r6, r6, r2, lsl #1
-    add r7, r7, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Touches q8-q12, q15 (q13-q14 are preserved)
-    ; valid output registers are anything but q8-q11
-    MACRO
-    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
-    ; TODO(cd): have special case to re-use constants when they are similar for
-    ;           consecutive butterflies
-    ; TODO(cd): have special case when both constants are the same, do the
-    ;           additions/subtractions before the multiplies.
-    ; generate the constants
-    ;   generate scalar constants
-    mov             r8,  #$first_constant  & 0xFF00
-    mov             r12, #$second_constant & 0xFF00
-    add             r8,  #$first_constant  & 0x00FF
-    add             r12, #$second_constant & 0x00FF
-    ;   generate vector constants
-    vdup.16         d30, r8
-    vdup.16         d31, r12
-    ; (used) two for inputs (regA-regD), one for constants (q15)
-    ; do some multiplications (ordered for maximum latency hiding)
-    vmull.s16 q8,  $regC, d30
-    vmull.s16 q10, $regA, d31
-    vmull.s16 q9,  $regD, d30
-    vmull.s16 q11, $regB, d31
-    vmull.s16 q12, $regC, d31
-    ; (used) five for intermediate (q8-q12), one for constants (q15)
-    ; do some addition/subtractions (to get back two register)
-    vsub.s32  q8, q8, q10
-    vsub.s32  q9, q9, q11
-    ; do more multiplications (ordered for maximum latency hiding)
-    vmull.s16 q10, $regD, d31
-    vmull.s16 q11, $regA, d30
-    vmull.s16 q15, $regB, d30
-    ; (used) six for intermediate (q8-q12, q15)
-    ; do more addition/subtractions
-    vadd.s32  q11, q12, q11
-    vadd.s32  q10, q10, q15
-    ; (used) four for intermediate (q8-q11)
-    ; dct_const_round_shift
-    vqrshrn.s32 $reg1, q8,  #14
-    vqrshrn.s32 $reg2, q9,  #14
-    vqrshrn.s32 $reg3, q11, #14
-    vqrshrn.s32 $reg4, q10, #14
-    ; (used) two for results, well four d registers
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Touches q8-q12, q15 (q13-q14 are preserved)
-    ; valid output registers are anything but q8-q11
-    MACRO
-    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
-    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
-    MEND
-    ; --------------------------------------------------------------------------
-
-;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
-;
-;   r0  int16_t *input,
-;   r1  uint8_t *dest,
-;   r2  int dest_stride)
-; loop counters
-;   r4  bands loop counter
-;   r5  pass loop counter
-;   r8  transpose loop counter
-; combine-add pointers
-;   r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
-;   r7  dest +  0 * dest_stride, ascending  (1, 2, 3, ...)
-;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
-;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
-
-|aom_idct32x32_1024_add_neon| PROC
-    ; This function does one pass of idct32x32 transform.
-    ;
-    ; This is done by transposing the input and then doing a 1d transform on
-    ; columns. In the first pass, the transposed columns are the original
-    ; rows. In the second pass, after the transposition, the colums are the
-    ; original columns.
-    ; The 1d transform is done by looping over bands of eight columns (the
-    ; idct32_bands loop). For each band, the transform input transposition
-    ; is done on demand, one band of four 8x8 matrices at a time. The four
-    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
-    push  {r4-r11}
-    vpush {d8-d15}
-    ; stack operation
-    ; internal buffer used to transpose 8 lines into before transforming them
-    ;   int16_t transpose_buffer[32 * 8];
-    ;   at sp + [4096, 4607]
-    ; results of the first pass (transpose and transform rows)
-    ;   int16_t pass1[32 * 32];
-    ;   at sp + [0, 2047]
-    ; results of the second pass (transpose and transform columns)
-    ;   int16_t pass2[32 * 32];
-    ;   at sp + [2048, 4095]
-    sub sp, sp, #512+2048+2048
-
-    ; r6  = dest + 31 * dest_stride
-    ; r7  = dest +  0 * dest_stride
-    ; r9  = dest + 15 * dest_stride
-    ; r10 = dest + 16 * dest_stride
-    rsb r6,  r2, r2, lsl #5
-    rsb r9,  r2, r2, lsl #4
-    add r10, r1, r2, lsl #4
-    mov r7, r1
-    add r6, r6, r1
-    add r9, r9, r1
-    ; r11 = -dest_stride
-    neg r11, r2
-    ; r3 = input
-    mov r3, r0
-    ; parameters for first pass
-      ; r0 = transpose_buffer[32 * 8]
-    add r0, sp, #4096
-      ; r1 = pass1[32 * 32]
-    mov r1, sp
-
-    mov r5, #0          ; initialize pass loop counter
-idct32_pass_loop
-    mov r4, #4          ; initialize bands loop counter
-idct32_bands_loop
-    mov r8, #2          ; initialize transpose loop counter
-idct32_transpose_pair_loop
-    ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
-    ; into q0-q7 and the second one into q8-q15. There is a stride of 64,
-    ; adjusted to 32 because of the two post-increments.
-    vld1.s16        {q8},  [r3]!
-    vld1.s16        {q0},  [r3]!
-    add r3, #32
-    vld1.s16        {q9},  [r3]!
-    vld1.s16        {q1},  [r3]!
-    add r3, #32
-    vld1.s16        {q10}, [r3]!
-    vld1.s16        {q2},  [r3]!
-    add r3, #32
-    vld1.s16        {q11}, [r3]!
-    vld1.s16        {q3},  [r3]!
-    add r3, #32
-    vld1.s16        {q12}, [r3]!
-    vld1.s16        {q4},  [r3]!
-    add r3, #32
-    vld1.s16        {q13}, [r3]!
-    vld1.s16        {q5},  [r3]!
-    add r3, #32
-    vld1.s16        {q14}, [r3]!
-    vld1.s16        {q6},  [r3]!
-    add r3, #32
-    vld1.s16        {q15}, [r3]!
-    vld1.s16        {q7},  [r3]!
-
-    ; Transpose the two 8x8 16bit data matrices.
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vswp            d1,  d8
-    vswp            d7,  d14
-    vswp            d5,  d12
-    vswp            d3,  d10
-    vtrn.32         q8,  q10
-    vtrn.32         q9,  q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.32         q0,  q2
-    vtrn.32         q1,  q3
-    vtrn.32         q4,  q6
-    vtrn.32         q5,  q7
-    vtrn.16         q8,  q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    vtrn.16         q0,  q1
-    vtrn.16         q2,  q3
-    vtrn.16         q4,  q5
-    vtrn.16         q6,  q7
-
-    ; Store both matrices after each other. There is a stride of 32, which
-    ; adjusts to nothing because of the post-increments.
-    vst1.16        {q8},  [r0]!
-    vst1.16        {q9},  [r0]!
-    vst1.16        {q10}, [r0]!
-    vst1.16        {q11}, [r0]!
-    vst1.16        {q12}, [r0]!
-    vst1.16        {q13}, [r0]!
-    vst1.16        {q14}, [r0]!
-    vst1.16        {q15}, [r0]!
-    vst1.16        {q0},  [r0]!
-    vst1.16        {q1},  [r0]!
-    vst1.16        {q2},  [r0]!
-    vst1.16        {q3},  [r0]!
-    vst1.16        {q4},  [r0]!
-    vst1.16        {q5},  [r0]!
-    vst1.16        {q6},  [r0]!
-    vst1.16        {q7},  [r0]!
-
-    ; increment pointers by adjusted stride (not necessary for r0/out)
-    ;   go back by 7*32 for the seven lines moved fully by read and add
-    ;   go back by 32 for the eigth line only read
-    ;   advance by 16*2 to go the next pair
-    sub r3,  r3,  #7*32*2 + 32 - 16*2
-    ; transpose pair loop processing
-    subs r8, r8, #1
-    bne idct32_transpose_pair_loop
-
-    ; restore r0/input to its original value
-    sub r0, r0, #32*8*2
-
-    ; Instead of doing the transforms stage by stage, it is done by loading
-    ; some input values and doing as many stages as possible to minimize the
-    ; storing/loading of intermediate results. To fit within registers, the
-    ; final coefficients are cut into four blocks:
-    ; BLOCK A: 16-19,28-31
-    ; BLOCK B: 20-23,24-27
-    ; BLOCK C: 8-10,11-15
-    ; BLOCK D: 0-3,4-7
-    ; Blocks A and C are straight calculation through the various stages. In
-    ; block B, further calculations are performed using the results from
-    ; block A. In block D, further calculations are performed using the results
-    ; from block C and then the final calculations are done using results from
-    ; block A and B which have been combined at the end of block B.
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK A: 16-19,28-31
-    ; --------------------------------------------------------------------------
-    ; generate 16,17,30,31
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] *  cospi_1_64;
-    ;temp2 = input[1 * 32] *  cospi_1_64 + input[31 * 32] * cospi_31_64;
-    ;step1b[16][i] = dct_const_round_shift(temp1);
-    ;step1b[31][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 0, 1, 31
-    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
-    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
-    ;step1b[17][i] = dct_const_round_shift(temp1);
-    ;step1b[30][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 31, 17, 15
-    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[16] =  step1b[16][i] + step1b[17][i];
-    ;step2[17] =  step1b[16][i] - step1b[17][i];
-    ;step2[30] = -step1b[30][i] + step1b[31][i];
-    ;step2[31] =  step1b[30][i] + step1b[31][i];
-    vadd.s16  q4, q0, q1
-    vsub.s16  q13, q0, q1
-    vadd.s16  q6, q2, q3
-    vsub.s16  q14, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
-    ;temp2 = step1b[30][i] * cospi_4_64  - step1b[17][i] * cospi_28_64;
-    ;step3[17] = dct_const_round_shift(temp1);
-    ;step3[30] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; generate 18,19,28,29
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
-    ;temp2 = input[9 * 32] *  cospi_9_64 + input[23 * 32] * cospi_23_64;
-    ;step1b[18][i] = dct_const_round_shift(temp1);
-    ;step1b[29][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 15, 9, 23
-    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[25 * 32] *  cospi_7_64 - input[7 * 32] * cospi_25_64;
-    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
-    ;step1b[19][i] = dct_const_round_shift(temp1);
-    ;step1b[28][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 23, 25, 7
-    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[18] = -step1b[18][i] + step1b[19][i];
-    ;step2[19] =  step1b[18][i] + step1b[19][i];
-    ;step2[28] =  step1b[28][i] + step1b[29][i];
-    ;step2[29] =  step1b[28][i] - step1b[29][i];
-    vsub.s16  q13, q3, q2
-    vadd.s16  q3,  q3, q2
-    vsub.s16  q14, q1, q0
-    vadd.s16  q2,  q1, q0
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[18][i] * (-cospi_4_64)  - step1b[29][i] * (-cospi_28_64);
-    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
-    ;step3[29] = dct_const_round_shift(temp1);
-    ;step3[18] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
-    ; --------------------------------------------------------------------------
-    ; combine 16-19,28-31
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[16] = step1b[16][i] + step1b[19][i];
-    ;step1[17] = step1b[17][i] + step1b[18][i];
-    ;step1[18] = step1b[17][i] - step1b[18][i];
-    ;step1[29] = step1b[30][i] - step1b[29][i];
-    ;step1[30] = step1b[30][i] + step1b[29][i];
-    ;step1[31] = step1b[31][i] + step1b[28][i];
-    vadd.s16  q8,  q4, q2
-    vadd.s16  q9,  q5, q0
-    vadd.s16  q10, q7, q1
-    vadd.s16  q15, q6, q3
-    vsub.s16  q13, q5, q0
-    vsub.s16  q14, q7, q1
-    STORE_IN_OUTPUT 0,  16, 31, q8,  q15
-    STORE_IN_OUTPUT 31, 17, 30, q9,  q10
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
-    ;temp2 = step1b[29][i] * cospi_8_64  + step1b[18][i] * cospi_24_64;
-    ;step2[18] = dct_const_round_shift(temp1);
-    ;step2[29] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
-    STORE_IN_OUTPUT 30, 29, 18, q1, q0
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[19] = step1b[16][i] - step1b[19][i];
-    ;step1[28] = step1b[31][i] - step1b[28][i];
-    vsub.s16  q13, q4, q2
-    vsub.s16  q14, q6, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
-    ;temp2 = step1b[28][i] * cospi_8_64  + step1b[19][i] * cospi_24_64;
-    ;step2[19] = dct_const_round_shift(temp1);
-    ;step2[28] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
-    STORE_IN_OUTPUT 18, 19, 28, q4, q6
-    ; --------------------------------------------------------------------------
-
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK B: 20-23,24-27
-    ; --------------------------------------------------------------------------
-    ; generate 20,21,26,27
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
-    ;temp2 = input[5 * 32] *  cospi_5_64 + input[27 * 32] * cospi_27_64;
-    ;step1b[20][i] = dct_const_round_shift(temp1);
-    ;step1b[27][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 7, 5, 27
-    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
-    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
-    ;step1b[21][i] = dct_const_round_shift(temp1);
-    ;step1b[26][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 27, 21, 11
-    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[20] =  step1b[20][i] + step1b[21][i];
-    ;step2[21] =  step1b[20][i] - step1b[21][i];
-    ;step2[26] = -step1b[26][i] + step1b[27][i];
-    ;step2[27] =  step1b[26][i] + step1b[27][i];
-    vsub.s16  q13, q0, q1
-    vadd.s16  q0, q0, q1
-    vsub.s16  q14, q2, q3
-    vadd.s16  q2, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
-    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
-    ;step3[21] = dct_const_round_shift(temp1);
-    ;step3[26] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; generate 22,23,24,25
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
-    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
-    ;step1b[22][i] = dct_const_round_shift(temp1);
-    ;step1b[25][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 11, 13, 19
-    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[29 * 32] *  cospi_3_64 - input[3 * 32] * cospi_29_64;
-    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
-    ;step1b[23][i] = dct_const_round_shift(temp1);
-    ;step1b[24][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 19, 29, 3
-    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[22] = -step1b[22][i] + step1b[23][i];
-    ;step2[23] =  step1b[22][i] + step1b[23][i];
-    ;step2[24] =  step1b[24][i] + step1b[25][i];
-    ;step2[25] =  step1b[24][i] - step1b[25][i];
-    vsub.s16  q14, q4, q5
-    vadd.s16  q5, q4, q5
-    vsub.s16  q13, q6, q7
-    vadd.s16  q6, q6, q7
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
-    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
-    ;step3[25] = dct_const_round_shift(temp1);
-    ;step3[22] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
-    ; --------------------------------------------------------------------------
-    ; combine 20-23,24-27
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[22] = step1b[22][i] + step1b[21][i];
-    ;step1[23] = step1b[23][i] + step1b[20][i];
-    vadd.s16  q10, q7, q1
-    vadd.s16  q11, q5, q0
-    ;step1[24] = step1b[24][i] + step1b[27][i];
-    ;step1[25] = step1b[25][i] + step1b[26][i];
-    vadd.s16  q12, q6, q2
-    vadd.s16  q15, q4, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[16] = step1b[16][i] + step1b[23][i];
-    ;step3[17] = step1b[17][i] + step1b[22][i];
-    ;step3[22] = step1b[17][i] - step1b[22][i];
-    ;step3[23] = step1b[16][i] - step1b[23][i];
-    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
-    vadd.s16  q8,  q14, q11
-    vadd.s16  q9,  q13, q10
-    vsub.s16  q13, q13, q10
-    vsub.s16  q11, q14, q11
-    STORE_IN_OUTPUT 17, 17, 16, q9, q8
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[24] = step1b[31][i] - step1b[24][i];
-    ;step3[25] = step1b[30][i] - step1b[25][i];
-    ;step3[30] = step1b[30][i] + step1b[25][i];
-    ;step3[31] = step1b[31][i] + step1b[24][i];
-    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
-    vsub.s16  q8,  q9,  q12
-    vadd.s16  q10, q14, q15
-    vsub.s16  q14, q14, q15
-    vadd.s16  q12, q9,  q12
-    STORE_IN_OUTPUT 31, 30, 31, q10, q12
-    ; --------------------------------------------------------------------------
-    ; TODO(cd) do some register allocation change to remove these push/pop
-    vpush {q8}  ; [24]
-    vpush {q11} ; [23]
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
-    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
-    ;step1[22] = dct_const_round_shift(temp1);
-    ;step1[25] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
-    STORE_IN_OUTPUT 31, 25, 22, q14, q13
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
-    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
-    ;step1[23] = dct_const_round_shift(temp1);
-    ;step1[24] = dct_const_round_shift(temp2);
-    ; TODO(cd) do some register allocation change to remove these push/pop
-    vpop  {q13} ; [23]
-    vpop  {q14} ; [24]
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
-    STORE_IN_OUTPUT 22, 24, 23, q14, q13
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[20] = step1b[23][i] - step1b[20][i];
-    ;step1[27] = step1b[24][i] - step1b[27][i];
-    vsub.s16  q14, q5, q0
-    vsub.s16  q13, q6, q2
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[20][i] * (-cospi_8_64)  - step1b[27][i] * (-cospi_24_64);
-    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
-    ;step2[27] = dct_const_round_shift(temp1);
-    ;step2[20] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[21] = step1b[22][i] - step1b[21][i];
-    ;step1[26] = step1b[25][i] - step1b[26][i];
-    vsub.s16  q14,  q7, q1
-    vsub.s16  q13,  q4, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[21][i] * (-cospi_8_64)  - step1b[26][i] * (-cospi_24_64);
-    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
-    ;step2[26] = dct_const_round_shift(temp1);
-    ;step2[21] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[18] = step1b[18][i] + step1b[21][i];
-    ;step3[19] = step1b[19][i] + step1b[20][i];
-    ;step3[20] = step1b[19][i] - step1b[20][i];
-    ;step3[21] = step1b[18][i] - step1b[21][i];
-    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
-    vadd.s16  q8,  q14, q1
-    vadd.s16  q9,  q13, q6
-    vsub.s16  q13, q13, q6
-    vsub.s16  q1,  q14, q1
-    STORE_IN_OUTPUT 19, 18, 19, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[27] = step1b[28][i] - step1b[27][i];
-    ;step3[28] = step1b[28][i] + step1b[27][i];
-    ;step3[29] = step1b[29][i] + step1b[26][i];
-    ;step3[26] = step1b[29][i] - step1b[26][i];
-    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
-    vsub.s16  q14, q8, q5
-    vadd.s16  q10, q8, q5
-    vadd.s16  q11, q9, q0
-    vsub.s16  q0, q9, q0
-    STORE_IN_OUTPUT 29, 28, 29, q10, q11
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
-    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
-    ;step1[20] = dct_const_round_shift(temp1);
-    ;step1[27] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
-    STORE_IN_OUTPUT 29, 20, 27, q13, q14
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
-    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
-    ;step1[21] = dct_const_round_shift(temp1);
-    ;step1[26] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
-    STORE_IN_OUTPUT 27, 21, 26, q1, q0
-    ; --------------------------------------------------------------------------
-
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK C: 8-10,11-15
-    ; --------------------------------------------------------------------------
-    ; generate 8,9,14,15
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
-    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
-    ;step2[8] = dct_const_round_shift(temp1);
-    ;step2[15] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 3, 2, 30
-    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
-    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
-    ;step2[9] = dct_const_round_shift(temp1);
-    ;step2[14] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 30, 18, 14
-    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;step3[8] = step1b[8][i] + step1b[9][i];
-    ;step3[9] = step1b[8][i] - step1b[9][i];
-    ;step3[14] = step1b[15][i] - step1b[14][i];
-    ;step3[15] = step1b[15][i] + step1b[14][i];
-    vsub.s16  q13, q0, q1
-    vadd.s16  q0, q0, q1
-    vsub.s16  q14, q2, q3
-    vadd.s16  q2, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
-    ;temp2 = step1b[14][i] * cospi_8_64  + step1b[9][i] * cospi_24_64;
-    ;step1[9]  = dct_const_round_shift(temp1);
-    ;step1[14] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; generate 10,11,12,13
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
-    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
-    ;step2[10] = dct_const_round_shift(temp1);
-    ;step2[13] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 14, 10, 22
-    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
-    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
-    ;step2[11] = dct_const_round_shift(temp1);
-    ;step2[12] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 22, 26, 6
-    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;step3[10] = step1b[11][i] - step1b[10][i];
-    ;step3[11] = step1b[11][i] + step1b[10][i];
-    ;step3[12] = step1b[12][i] + step1b[13][i];
-    ;step3[13] = step1b[12][i] - step1b[13][i];
-    vsub.s16  q14, q4, q5
-    vadd.s16  q5, q4, q5
-    vsub.s16  q13, q6, q7
-    vadd.s16  q6, q6, q7
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = step1b[10][i] * (-cospi_8_64)  - step1b[13][i] * (-cospi_24_64);
-    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
-    ;step1[13] = dct_const_round_shift(temp1);
-    ;step1[10] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
-    ; --------------------------------------------------------------------------
-    ; combine 8-10,11-15
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;step2[8]  = step1b[8][i] + step1b[11][i];
-    ;step2[9]  = step1b[9][i] + step1b[10][i];
-    ;step2[10] = step1b[9][i] - step1b[10][i];
-    vadd.s16  q8,  q0, q5
-    vadd.s16  q9,  q1, q7
-    vsub.s16  q13, q1, q7
-    ;step2[13] = step1b[14][i] - step1b[13][i];
-    ;step2[14] = step1b[14][i] + step1b[13][i];
-    ;step2[15] = step1b[15][i] + step1b[12][i];
-    vsub.s16  q14, q3, q4
-    vadd.s16  q10, q3, q4
-    vadd.s16  q15, q2, q6
-    STORE_IN_OUTPUT 26, 8, 15, q8, q15
-    STORE_IN_OUTPUT 15, 9, 14, q9, q10
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
-    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
-    ;step3[10] = dct_const_round_shift(temp1);
-    ;step3[13] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
-    STORE_IN_OUTPUT 14, 13, 10, q3, q1
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;step2[11] = step1b[8][i] - step1b[11][i];
-    ;step2[12] = step1b[15][i] - step1b[12][i];
-    vsub.s16  q13, q0, q5
-    vsub.s16  q14,  q2, q6
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
-    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
-    ;step3[11] = dct_const_round_shift(temp1);
-    ;step3[12] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
-    STORE_IN_OUTPUT 10, 11, 12, q1, q3
-    ; --------------------------------------------------------------------------
-
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK D: 0-3,4-7
-    ; --------------------------------------------------------------------------
-    ; generate 4,5,6,7
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
-    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
-    ;step3[4] = dct_const_round_shift(temp1);
-    ;step3[7] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 6, 4, 28
-    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
-    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
-    ;step3[5] = dct_const_round_shift(temp1);
-    ;step3[6] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 28, 20, 12
-    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[4] = step1b[4][i] + step1b[5][i];
-    ;step1[5] = step1b[4][i] - step1b[5][i];
-    ;step1[6] = step1b[7][i] - step1b[6][i];
-    ;step1[7] = step1b[7][i] + step1b[6][i];
-    vsub.s16  q13, q0, q1
-    vadd.s16  q0, q0, q1
-    vsub.s16  q14, q2, q3
-    vadd.s16  q2, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
-    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
-    ;step2[5] = dct_const_round_shift(temp1);
-    ;step2[6] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; generate 0,1,2,3
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
-    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
-    ;step1[1] = dct_const_round_shift(temp1);
-    ;step1[0] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 12, 0, 16
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
-    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
-    ;step1[2] = dct_const_round_shift(temp1);
-    ;step1[3] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 16, 8, 24
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;step2[0] = step1b[0][i] + step1b[3][i];
-    ;step2[1] = step1b[1][i] + step1b[2][i];
-    ;step2[2] = step1b[1][i] - step1b[2][i];
-    ;step2[3] = step1b[0][i] - step1b[3][i];
-    vadd.s16  q4, q7, q6
-    vsub.s16  q7, q7, q6
-    vsub.s16  q6, q5, q14
-    vadd.s16  q5, q5, q14
-    ; --------------------------------------------------------------------------
-    ; combine 0-3,4-7
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[0] = step1b[0][i] + step1b[7][i];
-    ;step3[1] = step1b[1][i] + step1b[6][i];
-    ;step3[2] = step1b[2][i] + step1b[5][i];
-    ;step3[3] = step1b[3][i] + step1b[4][i];
-    vadd.s16  q8,  q4, q2
-    vadd.s16  q9,  q5, q3
-    vadd.s16  q10, q6, q1
-    vadd.s16  q11, q7, q0
-    ;step3[4] = step1b[3][i] - step1b[4][i];
-    ;step3[5] = step1b[2][i] - step1b[5][i];
-    ;step3[6] = step1b[1][i] - step1b[6][i];
-    ;step3[7] = step1b[0][i] - step1b[7][i];
-    vsub.s16  q12, q7, q0
-    vsub.s16  q13, q6, q1
-    vsub.s16  q14, q5, q3
-    vsub.s16  q15, q4, q2
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[0] = step1b[0][i] + step1b[15][i];
-    ;step1[1] = step1b[1][i] + step1b[14][i];
-    ;step1[14] = step1b[1][i] - step1b[14][i];
-    ;step1[15] = step1b[0][i] - step1b[15][i];
-    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
-    vadd.s16  q2, q8, q1
-    vadd.s16  q3, q9, q0
-    vsub.s16  q4, q9, q0
-    vsub.s16  q5, q8, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
-    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
-    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
-    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
-    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-
-    cmp r5, #0
-    bgt idct32_bands_end_2nd_pass
-
-idct32_bands_end_1st_pass
-    STORE_IN_OUTPUT 17, 16, 17, q6, q7
-    STORE_IN_OUTPUT 17, 14, 15, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
-    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
-    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
-    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
-    LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 31, 30, 31, q6, q7
-    STORE_IN_OUTPUT 31,  0,  1, q4, q5
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[2] = step1b[2][i] + step1b[13][i];
-    ;step1[3] = step1b[3][i] + step1b[12][i];
-    ;step1[12] = step1b[3][i] - step1b[12][i];
-    ;step1[13] = step1b[2][i] - step1b[13][i];
-    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
-    vadd.s16  q2, q10, q1
-    vadd.s16  q3, q11, q0
-    vsub.s16  q4, q11, q0
-    vsub.s16  q5, q10, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
-    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
-    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
-    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
-    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_IN_OUTPUT 19, 18, 19, q6, q7
-    STORE_IN_OUTPUT 19, 12, 13, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
-    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
-    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
-    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
-    LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 29, 28, 29, q6, q7
-    STORE_IN_OUTPUT 29,  2,  3, q4, q5
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[4] = step1b[4][i] + step1b[11][i];
-    ;step1[5] = step1b[5][i] + step1b[10][i];
-    ;step1[10] = step1b[5][i] - step1b[10][i];
-    ;step1[11] = step1b[4][i] - step1b[11][i];
-    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
-    vadd.s16  q2, q12, q1
-    vadd.s16  q3, q13, q0
-    vsub.s16  q4, q13, q0
-    vsub.s16  q5, q12, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
-    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
-    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
-    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
-    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_IN_OUTPUT 21, 20, 21, q6, q7
-    STORE_IN_OUTPUT 21, 10, 11, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
-    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
-    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
-    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
-    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 27, 26, 27, q6, q7
-    STORE_IN_OUTPUT 27,  4,  5, q4, q5
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[6] = step1b[6][i] + step1b[9][i];
-    ;step1[7] = step1b[7][i] + step1b[8][i];
-    ;step1[8] = step1b[7][i] - step1b[8][i];
-    ;step1[9] = step1b[6][i] - step1b[9][i];
-    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
-    vadd.s16  q2, q14, q1
-    vadd.s16  q3, q15, q0
-    vsub.s16  q4, q15, q0
-    vsub.s16  q5, q14, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
-    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
-    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
-    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
-    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_IN_OUTPUT 23, 22, 23, q6, q7
-    STORE_IN_OUTPUT 23, 8, 9, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
-    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
-    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
-    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
-    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 25, 24, 25, q6, q7
-    STORE_IN_OUTPUT 25,  6,  7, q4, q5
-
-    ; restore r0 by removing the last offset from the last
-    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
-    sub r0, r0, #24*8*2
-    ; restore r1 by removing the last offset from the last
-    ;     operation (STORE_IN_OUTPUT 24,  6,  7) => 7*32*2
-    ; advance by 8 columns => 8*2
-    sub r1, r1, #7*32*2 - 8*2
-    ;   advance by 8 lines (8*32*2)
-    ;   go back by the two pairs from the loop (32*2)
-    add r3, r3, #8*32*2 - 32*2
-
-    ; bands loop processing
-    subs r4, r4, #1
-    bne idct32_bands_loop
-
-    ; parameters for second pass
-    ; the input of pass2 is the result of pass1. we have to remove the offset
-    ;   of 32 columns induced by the above idct32_bands_loop
-    sub r3, r1, #32*2
-      ; r1 = pass2[32 * 32]
-    add r1, sp, #2048
-
-    ; pass loop processing
-    add r5, r5, #1
-    b idct32_pass_loop
-
-idct32_bands_end_2nd_pass
-    STORE_COMBINE_CENTER_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
-    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
-    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
-    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
-    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[2] = step1b[2][i] + step1b[13][i];
-    ;step1[3] = step1b[3][i] + step1b[12][i];
-    ;step1[12] = step1b[3][i] - step1b[12][i];
-    ;step1[13] = step1b[2][i] - step1b[13][i];
-    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
-    vadd.s16  q2, q10, q1
-    vadd.s16  q3, q11, q0
-    vsub.s16  q4, q11, q0
-    vsub.s16  q5, q10, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
-    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
-    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
-    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
-    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_COMBINE_CENTER_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
-    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
-    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
-    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
-    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[4] = step1b[4][i] + step1b[11][i];
-    ;step1[5] = step1b[5][i] + step1b[10][i];
-    ;step1[10] = step1b[5][i] - step1b[10][i];
-    ;step1[11] = step1b[4][i] - step1b[11][i];
-    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
-    vadd.s16  q2, q12, q1
-    vadd.s16  q3, q13, q0
-    vsub.s16  q4, q13, q0
-    vsub.s16  q5, q12, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
-    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
-    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
-    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
-    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_COMBINE_CENTER_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
-    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
-    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
-    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
-    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[6] = step1b[6][i] + step1b[9][i];
-    ;step1[7] = step1b[7][i] + step1b[8][i];
-    ;step1[8] = step1b[7][i] - step1b[8][i];
-    ;step1[9] = step1b[6][i] - step1b[9][i];
-    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
-    vadd.s16  q2, q14, q1
-    vadd.s16  q3, q15, q0
-    vsub.s16  q4, q15, q0
-    vsub.s16  q5, q14, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
-    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
-    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
-    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
-    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_COMBINE_CENTER_RESULTS_LAST
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
-    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
-    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
-    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
-    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS_LAST
-    ; --------------------------------------------------------------------------
-    ; restore pointers to their initial indices for next band pass by
-    ;     removing/adding dest_stride * 8. The actual increment by eight
-    ;     is taken care of within the _LAST macros.
-    add r6,  r6,  r2, lsl #3
-    add r9,  r9,  r2, lsl #3
-    sub r7,  r7,  r2, lsl #3
-    sub r10, r10, r2, lsl #3
-
-    ; restore r0 by removing the last offset from the last
-    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
-    sub r0, r0, #24*8*2
-    ; restore r1 by removing the last offset from the last
-    ;     operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
-    ; advance by 8 columns => 8*2
-    sub r1, r1, #25*32*2 - 8*2
-    ;   advance by 8 lines (8*32*2)
-    ;   go back by the two pairs from the loop (32*2)
-    add r3, r3, #8*32*2 - 32*2
-
-    ; bands loop processing
-    subs r4, r4, #1
-    bne idct32_bands_loop
-
-    ; stack operation
-    add sp, sp, #512+2048+2048
-    vpop {d8-d15}
-    pop  {r4-r11}
-    bx              lr
-    ENDP  ; |aom_idct32x32_1024_add_neon|
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c
deleted file mode 100644
index 3df7a901b..000000000
--- a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "aom_dsp/inv_txfm.h"
-#include "aom_ports/mem.h"
-
-void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8x8_t d6u8;
-  uint32x2_t d2u32 = vdup_n_u32(0);
-  uint16x8_t q8u16;
-  int16x8_t q0s16;
-  uint8_t *d1, *d2;
-  int16_t i, a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  q0s16 = vdupq_n_s16(a1);
-
-  // dc_only_idct_add
-  d1 = d2 = dest;
-  for (i = 0; i < 2; i++) {
-    d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
-    d1 += dest_stride;
-    d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
-    d1 += dest_stride;
-
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32));
-    d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-
-    vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
-    d2 += dest_stride;
-    vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
-    d2 += dest_stride;
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm
deleted file mode 100644
index 6bd733d5d..000000000
--- a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm
+++ /dev/null
@@ -1,71 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-
-
-    EXPORT  |aom_idct4x4_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
-;                                  int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct4x4_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 4)
-    add              r0, r0, #8                ; + (1 <<((4) - 1))
-    asr              r0, r0, #4                ; >> 4
-
-    vdup.s16         q0, r0                    ; duplicate a1
-
-    vld1.32          {d2[0]}, [r1], r2
-    vld1.32          {d2[1]}, [r1], r2
-    vld1.32          {d4[0]}, [r1], r2
-    vld1.32          {d4[1]}, [r1]
-
-    vaddw.u8         q8, q0, d2                ; dest[x] + a1
-    vaddw.u8         q9, q0, d4
-
-    vqmovun.s16      d6, q8                    ; clip_pixel
-    vqmovun.s16      d7, q9
-
-    vst1.32          {d6[0]}, [r12], r2
-    vst1.32          {d6[1]}, [r12], r2
-    vst1.32          {d7[0]}, [r12], r2
-    vst1.32          {d7[1]}, [r12]
-
-    bx               lr
-    ENDP             ; |aom_idct4x4_1_add_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c
deleted file mode 100644
index 763be1ab0..000000000
--- a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "aom_dsp/txfm_common.h"
-
-void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8x8_t d26u8, d27u8;
-  uint32x2_t d26u32, d27u32;
-  uint16x8_t q8u16, q9u16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
-  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
-  int16x8_t q8s16, q9s16, q13s16, q14s16;
-  int32x4_t q1s32, q13s32, q14s32, q15s32;
-  int16x4x2_t d0x2s16, d1x2s16;
-  int32x4x2_t q0x2s32;
-  uint8_t *d;
-
-  d26u32 = d27u32 = vdup_n_u32(0);
-
-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-
-  d16s16 = vget_low_s16(q8s16);
-  d17s16 = vget_high_s16(q8s16);
-  d18s16 = vget_low_s16(q9s16);
-  d19s16 = vget_high_s16(q9s16);
-
-  d0x2s16 = vtrn_s16(d16s16, d17s16);
-  d1x2s16 = vtrn_s16(d18s16, d19s16);
-  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
-  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
-  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
-  d21s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q0x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
-  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
-  d22s16 = vdup_n_s16((int16_t)cospi_24_64);
-
-  // stage 1
-  d23s16 = vadd_s16(d16s16, d18s16);
-  d24s16 = vsub_s16(d16s16, d18s16);
-
-  q15s32 = vmull_s16(d17s16, d22s16);
-  q1s32 = vmull_s16(d17s16, d20s16);
-  q13s32 = vmull_s16(d23s16, d21s16);
-  q14s32 = vmull_s16(d24s16, d21s16);
-
-  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
-  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
-
-  d26s16 = vqrshrn_n_s32(q13s32, 14);
-  d27s16 = vqrshrn_n_s32(q14s32, 14);
-  d29s16 = vqrshrn_n_s32(q15s32, 14);
-  d28s16 = vqrshrn_n_s32(q1s32, 14);
-  q13s16 = vcombine_s16(d26s16, d27s16);
-  q14s16 = vcombine_s16(d28s16, d29s16);
-
-  // stage 2
-  q8s16 = vaddq_s16(q13s16, q14s16);
-  q9s16 = vsubq_s16(q13s16, q14s16);
-
-  d16s16 = vget_low_s16(q8s16);
-  d17s16 = vget_high_s16(q8s16);
-  d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
-  d19s16 = vget_low_s16(q9s16);
-
-  d0x2s16 = vtrn_s16(d16s16, d17s16);
-  d1x2s16 = vtrn_s16(d18s16, d19s16);
-  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
-  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
-  q0x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
-  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
-  // do the transform on columns
-  // stage 1
-  d23s16 = vadd_s16(d16s16, d18s16);
-  d24s16 = vsub_s16(d16s16, d18s16);
-
-  q15s32 = vmull_s16(d17s16, d22s16);
-  q1s32 = vmull_s16(d17s16, d20s16);
-  q13s32 = vmull_s16(d23s16, d21s16);
-  q14s32 = vmull_s16(d24s16, d21s16);
-
-  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
-  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
-
-  d26s16 = vqrshrn_n_s32(q13s32, 14);
-  d27s16 = vqrshrn_n_s32(q14s32, 14);
-  d29s16 = vqrshrn_n_s32(q15s32, 14);
-  d28s16 = vqrshrn_n_s32(q1s32, 14);
-  q13s16 = vcombine_s16(d26s16, d27s16);
-  q14s16 = vcombine_s16(d28s16, d29s16);
-
-  // stage 2
-  q8s16 = vaddq_s16(q13s16, q14s16);
-  q9s16 = vsubq_s16(q13s16, q14s16);
-
-  q8s16 = vrshrq_n_s16(q8s16, 4);
-  q9s16 = vrshrq_n_s16(q9s16, 4);
-
-  d = dest;
-  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
-  d += dest_stride;
-  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
-  d += dest_stride;
-  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
-  d += dest_stride;
-  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
-
-  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-
-  d = dest;
-  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
-  d += dest_stride;
-  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
-  d += dest_stride;
-  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
-  d += dest_stride;
-  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm
deleted file mode 100644
index 127acf614..000000000
--- a/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm
+++ /dev/null
@@ -1,193 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_idct4x4_16_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct4x4_16_add_neon| PROC
-
-    ; The 2D transform is done with two passes which are actually pretty
-    ; similar. We first transform the rows. This is done by transposing
-    ; the inputs, doing an SIMD column transform (the columns are the
-    ; transposed rows) and then transpose the results (so that it goes back
-    ; in normal/row positions). Then, we transform the columns by doing
-    ; another SIMD column transform.
-    ; So, two passes of a transpose followed by a column transform.
-
-    ; load the inputs into q8-q9, d16-d19
-    vld1.s16        {q8,q9}, [r0]!
-
-    ; generate scalar constants
-    ; cospi_8_64 = 15137 = 0x3b21
-    mov             r0, #0x3b00
-    add             r0, #0x21
-    ; cospi_16_64 = 11585 = 0x2d41
-    mov             r3, #0x2d00
-    add             r3, #0x41
-    ; cospi_24_64 = 6270 = 0x 187e
-    mov             r12, #0x1800
-    add             r12, #0x7e
-
-    ; transpose the input data
-    ; 00 01 02 03   d16
-    ; 10 11 12 13   d17
-    ; 20 21 22 23   d18
-    ; 30 31 32 33   d19
-    vtrn.16         d16, d17
-    vtrn.16         d18, d19
-
-    ; generate constant vectors
-    vdup.16         d20, r0         ; replicate cospi_8_64
-    vdup.16         d21, r3         ; replicate cospi_16_64
-
-    ; 00 10 02 12   d16
-    ; 01 11 03 13   d17
-    ; 20 30 22 32   d18
-    ; 21 31 23 33   d19
-    vtrn.32         q8, q9
-    ; 00 10 20 30   d16
-    ; 01 11 21 31   d17
-    ; 02 12 22 32   d18
-    ; 03 13 23 33   d19
-
-    vdup.16         d22, r12        ; replicate cospi_24_64
-
-    ; do the transform on transposed rows
-
-    ; stage 1
-    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
-    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
-
-    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
-    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
-
-    ; (input[0] + input[2]) * cospi_16_64;
-    ; (input[0] - input[2]) * cospi_16_64;
-    vmull.s16 q13, d23, d21
-    vmull.s16 q14, d24, d21
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
-    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
-    vmlsl.s16 q15, d19, d20
-    vmlal.s16 q1,  d19, d22
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14
-    vqrshrn.s32 d27, q14, #14
-    vqrshrn.s32 d29, q15, #14
-    vqrshrn.s32 d28, q1,  #14
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16 q8,  q13, q14
-    vsub.s16 q9,  q13, q14
-    vswp     d18, d19
-
-    ; transpose the results
-    ; 00 01 02 03   d16
-    ; 10 11 12 13   d17
-    ; 20 21 22 23   d18
-    ; 30 31 32 33   d19
-    vtrn.16         d16, d17
-    vtrn.16         d18, d19
-    ; 00 10 02 12   d16
-    ; 01 11 03 13   d17
-    ; 20 30 22 32   d18
-    ; 21 31 23 33   d19
-    vtrn.32         q8, q9
-    ; 00 10 20 30   d16
-    ; 01 11 21 31   d17
-    ; 02 12 22 32   d18
-    ; 03 13 23 33   d19
-
-    ; do the transform on columns
-
-    ; stage 1
-    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
-    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
-
-    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
-    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
-
-    ; (input[0] + input[2]) * cospi_16_64;
-    ; (input[0] - input[2]) * cospi_16_64;
-    vmull.s16 q13, d23, d21
-    vmull.s16 q14, d24, d21
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
-    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
-    vmlsl.s16 q15, d19, d20
-    vmlal.s16 q1,  d19, d22
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14
-    vqrshrn.s32 d27, q14, #14
-    vqrshrn.s32 d29, q15, #14
-    vqrshrn.s32 d28, q1,  #14
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16 q8,  q13, q14
-    vsub.s16 q9,  q13, q14
-
-    ; The results are in two registers, one of them being swapped. This will
-    ; be taken care of by loading the 'dest' value in a swapped fashion and
-    ; also storing them in the same swapped fashion.
-    ; temp_out[0, 1] = d16, d17 = q8
-    ; temp_out[2, 3] = d19, d18 = q9 swapped
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
-    vrshr.s16 q8, q8, #4
-    vrshr.s16 q9, q9, #4
-
-    vld1.32 {d26[0]}, [r1], r2
-    vld1.32 {d26[1]}, [r1], r2
-    vld1.32 {d27[1]}, [r1], r2
-    vld1.32 {d27[0]}, [r1]  ; no post-increment
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
-    vaddw.u8 q8, q8, d26
-    vaddw.u8 q9, q9, d27
-
-    ; clip_pixel
-    vqmovun.s16 d26, q8
-    vqmovun.s16 d27, q9
-
-    ; do the stores in reverse order with negative post-increment, by changing
-    ; the sign of the stride
-    rsb r2, r2, #0
-    vst1.32 {d27[0]}, [r1], r2
-    vst1.32 {d27[1]}, [r1], r2
-    vst1.32 {d26[1]}, [r1], r2
-    vst1.32 {d26[0]}, [r1]  ; no post-increment
-    bx              lr
-    ENDP  ; |aom_idct4x4_16_add_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c
deleted file mode 100644
index c7926f9e4..000000000
--- a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "aom_dsp/inv_txfm.h"
-#include "aom_ports/mem.h"
-
-void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8x8_t d2u8, d3u8, d30u8, d31u8;
-  uint64x1_t d2u64, d3u64, d4u64, d5u64;
-  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
-  int16x8_t q0s16;
-  uint8_t *d1, *d2;
-  int16_t i, a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-
-  q0s16 = vdupq_n_s16(a1);
-  q0u16 = vreinterpretq_u16_s16(q0s16);
-
-  d1 = d2 = dest;
-  for (i = 0; i < 2; i++) {
-    d2u64 = vld1_u64((const uint64_t *)d1);
-    d1 += dest_stride;
-    d3u64 = vld1_u64((const uint64_t *)d1);
-    d1 += dest_stride;
-    d4u64 = vld1_u64((const uint64_t *)d1);
-    d1 += dest_stride;
-    d5u64 = vld1_u64((const uint64_t *)d1);
-    d1 += dest_stride;
-
-    q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
-    q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
-    q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
-    q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
-    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-    d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-    d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
-    d2 += dest_stride;
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm
deleted file mode 100644
index ec07e2053..000000000
--- a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm
+++ /dev/null
@@ -1,91 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-
-
-    EXPORT  |aom_idct8x8_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
-;                                  int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct8x8_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 5)
-    add              r0, r0, #16               ; + (1 <<((5) - 1))
-    asr              r0, r0, #5                ; >> 5
-
-    vdup.s16         q0, r0                    ; duplicate a1
-
-    ; load destination data
-    vld1.64          {d2}, [r1], r2
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r2
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r2
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r2
-    vld1.64          {d17}, [r1]
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r2
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r2
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r2
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r2
-    vst1.64          {d31}, [r12], r2
-
-    bx               lr
-    ENDP             ; |aom_idct8x8_1_add_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c
deleted file mode 100644
index 8ad70862d..000000000
--- a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c
+++ /dev/null
@@ -1,509 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-
-static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
-                                int16x8_t *q10s16, int16x8_t *q11s16,
-                                int16x8_t *q12s16, int16x8_t *q13s16,
-                                int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
-  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
-  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
-  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
-  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
-  *q12s16 = vcombine_s16(d17s16, d25s16);
-  *q13s16 = vcombine_s16(d19s16, d27s16);
-  *q14s16 = vcombine_s16(d21s16, d29s16);
-  *q15s16 = vcombine_s16(d23s16, d31s16);
-
-  q0x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
-  q1x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
-  q2x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
-  q3x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
-
-  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
-                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
-  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
-                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
-  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
-                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
-  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
-                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
-
-  *q8s16 = q0x2s16.val[0];
-  *q9s16 = q0x2s16.val[1];
-  *q10s16 = q1x2s16.val[0];
-  *q11s16 = q1x2s16.val[1];
-  *q12s16 = q2x2s16.val[0];
-  *q13s16 = q2x2s16.val[1];
-  *q14s16 = q3x2s16.val[0];
-  *q15s16 = q3x2s16.val[1];
-  return;
-}
-
-static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
-                              int16x8_t *q10s16, int16x8_t *q11s16,
-                              int16x8_t *q12s16, int16x8_t *q13s16,
-                              int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-
-  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
-  d1s16 = vdup_n_s16((int16_t)cospi_4_64);
-  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
-  d3s16 = vdup_n_s16((int16_t)cospi_20_64);
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  q2s32 = vmull_s16(d18s16, d0s16);
-  q3s32 = vmull_s16(d19s16, d0s16);
-  q5s32 = vmull_s16(d26s16, d2s16);
-  q6s32 = vmull_s16(d27s16, d2s16);
-
-  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
-  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
-  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
-
-  d8s16 = vqrshrn_n_s32(q2s32, 14);
-  d9s16 = vqrshrn_n_s32(q3s32, 14);
-  d10s16 = vqrshrn_n_s32(q5s32, 14);
-  d11s16 = vqrshrn_n_s32(q6s32, 14);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  q2s32 = vmull_s16(d18s16, d1s16);
-  q3s32 = vmull_s16(d19s16, d1s16);
-  q9s32 = vmull_s16(d26s16, d3s16);
-  q13s32 = vmull_s16(d27s16, d3s16);
-
-  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
-  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
-  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
-  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
-
-  d14s16 = vqrshrn_n_s32(q2s32, 14);
-  d15s16 = vqrshrn_n_s32(q3s32, 14);
-  d12s16 = vqrshrn_n_s32(q9s32, 14);
-  d13s16 = vqrshrn_n_s32(q13s32, 14);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-  q7s16 = vcombine_s16(d14s16, d15s16);
-
-  d0s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q2s32 = vmull_s16(d16s16, d0s16);
-  q3s32 = vmull_s16(d17s16, d0s16);
-  q13s32 = vmull_s16(d16s16, d0s16);
-  q15s32 = vmull_s16(d17s16, d0s16);
-
-  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
-  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
-  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
-  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
-
-  d0s16 = vdup_n_s16((int16_t)cospi_24_64);
-  d1s16 = vdup_n_s16((int16_t)cospi_8_64);
-
-  d18s16 = vqrshrn_n_s32(q2s32, 14);
-  d19s16 = vqrshrn_n_s32(q3s32, 14);
-  d22s16 = vqrshrn_n_s32(q13s32, 14);
-  d23s16 = vqrshrn_n_s32(q15s32, 14);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-  *q11s16 = vcombine_s16(d22s16, d23s16);
-
-  q2s32 = vmull_s16(d20s16, d0s16);
-  q3s32 = vmull_s16(d21s16, d0s16);
-  q8s32 = vmull_s16(d20s16, d1s16);
-  q12s32 = vmull_s16(d21s16, d1s16);
-
-  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
-  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
-  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
-
-  d26s16 = vqrshrn_n_s32(q2s32, 14);
-  d27s16 = vqrshrn_n_s32(q3s32, 14);
-  d30s16 = vqrshrn_n_s32(q8s32, 14);
-  d31s16 = vqrshrn_n_s32(q12s32, 14);
-  *q13s16 = vcombine_s16(d26s16, d27s16);
-  *q15s16 = vcombine_s16(d30s16, d31s16);
-
-  q0s16 = vaddq_s16(*q9s16, *q15s16);
-  q1s16 = vaddq_s16(*q11s16, *q13s16);
-  q2s16 = vsubq_s16(*q11s16, *q13s16);
-  q3s16 = vsubq_s16(*q9s16, *q15s16);
-
-  *q13s16 = vsubq_s16(q4s16, q5s16);
-  q4s16 = vaddq_s16(q4s16, q5s16);
-  *q14s16 = vsubq_s16(q7s16, q6s16);
-  q7s16 = vaddq_s16(q7s16, q6s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-
-  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q9s32 = vmull_s16(d28s16, d16s16);
-  q10s32 = vmull_s16(d29s16, d16s16);
-  q11s32 = vmull_s16(d28s16, d16s16);
-  q12s32 = vmull_s16(d29s16, d16s16);
-
-  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
-  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
-  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
-  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
-  d10s16 = vqrshrn_n_s32(q9s32, 14);
-  d11s16 = vqrshrn_n_s32(q10s32, 14);
-  d12s16 = vqrshrn_n_s32(q11s32, 14);
-  d13s16 = vqrshrn_n_s32(q12s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  *q8s16 = vaddq_s16(q0s16, q7s16);
-  *q9s16 = vaddq_s16(q1s16, q6s16);
-  *q10s16 = vaddq_s16(q2s16, q5s16);
-  *q11s16 = vaddq_s16(q3s16, q4s16);
-  *q12s16 = vsubq_s16(q3s16, q4s16);
-  *q13s16 = vsubq_s16(q2s16, q5s16);
-  *q14s16 = vsubq_s16(q1s16, q6s16);
-  *q15s16 = vsubq_s16(q0s16, q7s16);
-  return;
-}
-
-void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8_t *d1, *d2;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8;
-  uint64x1_t d0u64, d1u64, d2u64, d3u64;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-  q10s16 = vld1q_s16(input + 16);
-  q11s16 = vld1q_s16(input + 24);
-  q12s16 = vld1q_s16(input + 32);
-  q13s16 = vld1q_s16(input + 40);
-  q14s16 = vld1q_s16(input + 48);
-  q15s16 = vld1q_s16(input + 56);
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-             &q15s16);
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-             &q15s16);
-
-  q8s16 = vrshrq_n_s16(q8s16, 5);
-  q9s16 = vrshrq_n_s16(q9s16, 5);
-  q10s16 = vrshrq_n_s16(q10s16, 5);
-  q11s16 = vrshrq_n_s16(q11s16, 5);
-  q12s16 = vrshrq_n_s16(q12s16, 5);
-  q13s16 = vrshrq_n_s16(q13s16, 5);
-  q14s16 = vrshrq_n_s16(q14s16, 5);
-  q15s16 = vrshrq_n_s16(q15s16, 5);
-
-  d1 = d2 = dest;
-
-  d0u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d1u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d2u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d3u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
-  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
-  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
-  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-  d2 += dest_stride;
-
-  q8s16 = q12s16;
-  q9s16 = q13s16;
-  q10s16 = q14s16;
-  q11s16 = q15s16;
-
-  d0u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d1u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d2u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d3u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
-  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
-  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
-  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-  d2 += dest_stride;
-  return;
-}
-
-void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8_t *d1, *d2;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8;
-  int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
-  int16x4_t d26s16, d27s16, d28s16, d29s16;
-  uint64x1_t d0u64, d1u64, d2u64, d3u64;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  uint16x8_t q8u16, q9u16, q10u16, q11u16;
-  int32x4_t q9s32, q10s32, q11s32, q12s32;
-
-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-  q10s16 = vld1q_s16(input + 16);
-  q11s16 = vld1q_s16(input + 24);
-  q12s16 = vld1q_s16(input + 32);
-  q13s16 = vld1q_s16(input + 40);
-  q14s16 = vld1q_s16(input + 48);
-  q15s16 = vld1q_s16(input + 56);
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  // First transform rows
-  // stage 1
-  q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
-  q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
-
-  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
-
-  q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2);
-
-  q7s16 = vqrdmulhq_s16(q9s16, q1s16);
-
-  q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2);
-
-  q5s16 = vqrdmulhq_s16(q11s16, q0s16);
-
-  q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
-
-  q6s16 = vqrdmulhq_s16(q11s16, q1s16);
-
-  // stage 2 & stage 3 - even half
-  q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2);
-
-  q9s16 = vqrdmulhq_s16(q8s16, q0s16);
-
-  q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2);
-
-  q13s16 = vqrdmulhq_s16(q10s16, q1s16);
-
-  q15s16 = vqrdmulhq_s16(q10s16, q0s16);
-
-  // stage 3 -odd half
-  q0s16 = vaddq_s16(q9s16, q15s16);
-  q1s16 = vaddq_s16(q9s16, q13s16);
-  q2s16 = vsubq_s16(q9s16, q13s16);
-  q3s16 = vsubq_s16(q9s16, q15s16);
-
-  // stage 2 - odd half
-  q13s16 = vsubq_s16(q4s16, q5s16);
-  q4s16 = vaddq_s16(q4s16, q5s16);
-  q14s16 = vsubq_s16(q7s16, q6s16);
-  q7s16 = vaddq_s16(q7s16, q6s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-
-  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
-  q9s32 = vmull_s16(d28s16, d16s16);
-  q10s32 = vmull_s16(d29s16, d16s16);
-  q11s32 = vmull_s16(d28s16, d16s16);
-  q12s32 = vmull_s16(d29s16, d16s16);
-
-  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
-  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
-  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
-  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
-  d10s16 = vqrshrn_n_s32(q9s32, 14);
-  d11s16 = vqrshrn_n_s32(q10s32, 14);
-  d12s16 = vqrshrn_n_s32(q11s32, 14);
-  d13s16 = vqrshrn_n_s32(q12s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  // stage 4
-  q8s16 = vaddq_s16(q0s16, q7s16);
-  q9s16 = vaddq_s16(q1s16, q6s16);
-  q10s16 = vaddq_s16(q2s16, q5s16);
-  q11s16 = vaddq_s16(q3s16, q4s16);
-  q12s16 = vsubq_s16(q3s16, q4s16);
-  q13s16 = vsubq_s16(q2s16, q5s16);
-  q14s16 = vsubq_s16(q1s16, q6s16);
-  q15s16 = vsubq_s16(q0s16, q7s16);
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-             &q15s16);
-
-  q8s16 = vrshrq_n_s16(q8s16, 5);
-  q9s16 = vrshrq_n_s16(q9s16, 5);
-  q10s16 = vrshrq_n_s16(q10s16, 5);
-  q11s16 = vrshrq_n_s16(q11s16, 5);
-  q12s16 = vrshrq_n_s16(q12s16, 5);
-  q13s16 = vrshrq_n_s16(q13s16, 5);
-  q14s16 = vrshrq_n_s16(q14s16, 5);
-  q15s16 = vrshrq_n_s16(q15s16, 5);
-
-  d1 = d2 = dest;
-
-  d0u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d1u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d2u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d3u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
-  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
-  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
-  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-  d2 += dest_stride;
-
-  q8s16 = q12s16;
-  q9s16 = q13s16;
-  q10s16 = q14s16;
-  q11s16 = q15s16;
-
-  d0u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d1u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d2u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d3u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
-  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
-  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
-  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-  d2 += dest_stride;
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm
deleted file mode 100644
index f3d5f246d..000000000
--- a/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm
+++ /dev/null
@@ -1,522 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_idct8x8_64_add_neon|
-    EXPORT  |aom_idct8x8_12_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are
-    ; loaded in q8-q15. The output will be stored back into q8-q15 registers.
-    ; This macro will touch q0-q7 registers and use them as buffer during
-    ; calculation.
-    MACRO
-    IDCT8x8_1D
-    ; stage 1
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r4                    ; duplicate cospi_4_64
-    vdup.16         d2, r5                    ; duplicate cospi_12_64
-    vdup.16         d3, r6                    ; duplicate cospi_20_64
-
-    ; input[1] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; input[5] * cospi_12_64
-    vmull.s16       q5, d26, d2
-    vmull.s16       q6, d27, d2
-
-    ; input[1]*cospi_28_64-input[7]*cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q5, d22, d3
-    vmlsl.s16       q6, d23, d3
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d8, q2, #14               ; >> 14
-    vqrshrn.s32     d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; input[1] * cospi_4_64
-    vmull.s16       q2, d18, d1
-    vmull.s16       q3, d19, d1
-
-    ; input[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q13, d27, d3
-
-    ; input[1]*cospi_4_64+input[7]*cospi_28_64
-    vmlal.s16       q2, d30, d0
-    vmlal.s16       q3, d31, d0
-
-    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q13, d23, d2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d14, q2, #14              ; >> 14
-    vqrshrn.s32     d15, q3, #14              ; >> 14
-
-    ; stage 2 & stage 3 - even half
-    vdup.16         d0, r7                    ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q13, #14              ; >> 14
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q2, d16, d0
-    vmull.s16       q3, d17, d0
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q13, d16, d0
-    vmull.s16       q15, d17, d0
-
-    ; (input[0] + input[2]) * cospi_16_64
-    vmlal.s16       q2,  d24, d0
-    vmlal.s16       q3, d25, d0
-
-    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16       q13, d24, d0
-    vmlsl.s16       q15, d25, d0
-
-    vdup.16         d0, r8                    ; duplicate cospi_24_64
-    vdup.16         d1, r9                    ; duplicate cospi_8_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d18, q2, #14              ; >> 14
-    vqrshrn.s32     d19, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d22, q13, #14              ; >> 14
-    vqrshrn.s32     d23, q15, #14              ; >> 14
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    ; input[1] * cospi_24_64
-    vmull.s16       q2, d20, d0
-    vmull.s16       q3, d21, d0
-
-    ; input[1] * cospi_8_64
-    vmull.s16       q8, d20, d1
-    vmull.s16       q12, d21, d1
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlsl.s16       q2, d28, d1
-    vmlsl.s16       q3, d29, d1
-
-    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vmlal.s16       q8, d28, d0
-    vmlal.s16       q12, d29, d0
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d26, q2, #14              ; >> 14
-    vqrshrn.s32     d27, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d30, q8, #14              ; >> 14
-    vqrshrn.s32     d31, q12, #14              ; >> 14
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14              ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-    MEND
-
-    ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct8x8_64_add_neon| PROC
-    push            {r4-r9}
-    vpush           {d8-d15}
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
-
-    ; First transform rows
-    IDCT8x8_1D
-
-    ; Transpose the matrix
-    TRANSPOSE8X8
-
-    ; Then transform columns
-    IDCT8x8_1D
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r9}
-    bx              lr
-    ENDP  ; |aom_idct8x8_64_add_neon|
-
-;void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct8x8_12_add_neon| PROC
-    push            {r4-r9}
-    vpush           {d8-d15}
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
-
-    ; First transform rows
-    ; stage 1
-    ; The following instructions use vqrdmulh to do the
-    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling
-    ; multiply and shift the result by 16 bits instead of 14 bits. So we need
-    ; to double the constants before multiplying to compensate this.
-    mov             r12, r3, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
-    mov             r12, r4, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_28_64)
-    vqrdmulh.s16    q4, q9, q0
-
-    mov             r12, r6, lsl #1
-    rsb             r12, #0
-    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_4_64)
-    vqrdmulh.s16    q7, q9, q1
-
-    mov             r12, r5, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
-
-    ; dct_const_round_shift(- input[3] * cospi_20_64)
-    vqrdmulh.s16    q5, q11, q0
-
-    mov             r12, r7, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
-
-    ; dct_const_round_shift(input[3] * cospi_12_64)
-    vqrdmulh.s16    q6, q11, q1
-
-    ; stage 2 & stage 3 - even half
-    mov             r12, r8, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrdmulh.s16    q9, q8, q0
-
-    mov             r12, r9, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_24_64)
-    vqrdmulh.s16    q13, q10, q1
-
-    ; dct_const_round_shift(input[1] * cospi_8_64)
-    vqrdmulh.s16    q15, q10, q0
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14              ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-
-    ; Transpose the matrix
-    TRANSPOSE8X8
-
-    ; Then transform columns
-    IDCT8x8_1D
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r9}
-    bx              lr
-    ENDP  ; |aom_idct8x8_12_add_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c
index 7d5f64004..69470eeb0 100644
--- a/third_party/aom/aom_dsp/arm/intrapred_neon.c
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c
@@ -11,8 +11,9 @@
 
 #include <arm_neon.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 //------------------------------------------------------------------------------
@@ -342,8 +343,6 @@ void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
   vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
 }
 
-#if !HAVE_NEON_ASM
-
 void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   int i;
@@ -529,4 +528,3 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
     }
   }
 }
-#endif  // !HAVE_NEON_ASM
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm
deleted file mode 100644
index fba9c1b5b..000000000
--- a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm
+++ /dev/null
@@ -1,287 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_v_predictor_4x4_neon|
-    EXPORT  |aom_v_predictor_8x8_neon|
-    EXPORT  |aom_v_predictor_16x16_neon|
-    EXPORT  |aom_v_predictor_32x32_neon|
-    EXPORT  |aom_h_predictor_4x4_neon|
-    EXPORT  |aom_h_predictor_8x8_neon|
-    EXPORT  |aom_h_predictor_16x16_neon|
-    EXPORT  |aom_h_predictor_32x32_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                              const uint8_t *above,
-;                              const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_v_predictor_4x4_neon| PROC
-    vld1.32             {d0[0]}, [r2]
-    vst1.32             {d0[0]}, [r0], r1
-    vst1.32             {d0[0]}, [r0], r1
-    vst1.32             {d0[0]}, [r0], r1
-    vst1.32             {d0[0]}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_v_predictor_4x4_neon|
-
-;void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                              const uint8_t *above,
-;                              const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_v_predictor_8x8_neon| PROC
-    vld1.8              {d0}, [r2]
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_v_predictor_8x8_neon|
-
-;void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                                const uint8_t *above,
-;                                const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_v_predictor_16x16_neon| PROC
-    vld1.8              {q0}, [r2]
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_v_predictor_16x16_neon|
-
-;void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                                const uint8_t *above,
-;                                const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_v_predictor_32x32_neon| PROC
-    vld1.8              {q0, q1}, [r2]
-    mov                 r2, #2
-loop_v
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    subs                r2, r2, #1
-    bgt                 loop_v
-    bx                  lr
-    ENDP                ; |aom_v_predictor_32x32_neon|
-
-;void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                              const uint8_t *above,
-;                              const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_h_predictor_4x4_neon| PROC
-    vld1.32             {d1[0]}, [r3]
-    vdup.8              d0, d1[0]
-    vst1.32             {d0[0]}, [r0], r1
-    vdup.8              d0, d1[1]
-    vst1.32             {d0[0]}, [r0], r1
-    vdup.8              d0, d1[2]
-    vst1.32             {d0[0]}, [r0], r1
-    vdup.8              d0, d1[3]
-    vst1.32             {d0[0]}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_h_predictor_4x4_neon|
-
-;void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                              const uint8_t *above,
-;                              const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_h_predictor_8x8_neon| PROC
-    vld1.64             {d1}, [r3]
-    vdup.8              d0, d1[0]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[1]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[2]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[3]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[4]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[5]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[6]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[7]
-    vst1.64             {d0}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_h_predictor_8x8_neon|
-
-;void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                                const uint8_t *above,
-;                                const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_h_predictor_16x16_neon| PROC
-    vld1.8              {q1}, [r3]
-    vdup.8              q0, d2[0]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[1]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[2]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[3]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[4]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[5]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[6]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[7]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[0]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[1]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[2]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[3]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[4]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[5]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[6]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[7]
-    vst1.8              {q0}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_h_predictor_16x16_neon|
-
-;void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                                const uint8_t *above,
-;                                const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_h_predictor_32x32_neon| PROC
-    sub                 r1, r1, #16
-    mov                 r2, #2
-loop_h
-    vld1.8              {q1}, [r3]!
-    vdup.8              q0, d2[0]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[1]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[2]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[3]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[4]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[5]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[6]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[7]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[0]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[1]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[2]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[3]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[4]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[5]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[6]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[7]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    subs                r2, r2, #1
-    bgt                 loop_h
-    bx                  lr
-    ENDP                ; |aom_h_predictor_32x32_neon|
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c
deleted file mode 100644
index c0562a6ea..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-
-static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
-                                       uint8x16_t qlimit,   // limit
-                                       uint8x16_t qthresh,  // thresh
-                                       uint8x16_t q3,       // p3
-                                       uint8x16_t q4,       // p2
-                                       uint8x16_t q5,       // p1
-                                       uint8x16_t q6,       // p0
-                                       uint8x16_t q7,       // q0
-                                       uint8x16_t q8,       // q1
-                                       uint8x16_t q9,       // q2
-                                       uint8x16_t q10,      // q3
-                                       uint8x16_t *q5r,     // p1
-                                       uint8x16_t *q6r,     // p0
-                                       uint8x16_t *q7r,     // q0
-                                       uint8x16_t *q8r) {   // q1
-  uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
-  int16x8_t q2s16, q11s16;
-  uint16x8_t q4u16;
-  int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
-  int8x8_t d2s8, d3s8;
-
-  q11u8 = vabdq_u8(q3, q4);
-  q12u8 = vabdq_u8(q4, q5);
-  q13u8 = vabdq_u8(q5, q6);
-  q14u8 = vabdq_u8(q8, q7);
-  q3 = vabdq_u8(q9, q8);
-  q4 = vabdq_u8(q10, q9);
-
-  q11u8 = vmaxq_u8(q11u8, q12u8);
-  q12u8 = vmaxq_u8(q13u8, q14u8);
-  q3 = vmaxq_u8(q3, q4);
-  q15u8 = vmaxq_u8(q11u8, q12u8);
-
-  q9 = vabdq_u8(q6, q7);
-
-  // aom_hevmask
-  q13u8 = vcgtq_u8(q13u8, qthresh);
-  q14u8 = vcgtq_u8(q14u8, qthresh);
-  q15u8 = vmaxq_u8(q15u8, q3);
-
-  q2u8 = vabdq_u8(q5, q8);
-  q9 = vqaddq_u8(q9, q9);
-
-  q15u8 = vcgeq_u8(qlimit, q15u8);
-
-  // aom_filter() function
-  // convert to signed
-  q10 = vdupq_n_u8(0x80);
-  q8 = veorq_u8(q8, q10);
-  q7 = veorq_u8(q7, q10);
-  q6 = veorq_u8(q6, q10);
-  q5 = veorq_u8(q5, q10);
-
-  q2u8 = vshrq_n_u8(q2u8, 1);
-  q9 = vqaddq_u8(q9, q2u8);
-
-  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
-                   vget_low_s8(vreinterpretq_s8_u8(q6)));
-  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
-                    vget_high_s8(vreinterpretq_s8_u8(q6)));
-
-  q9 = vcgeq_u8(qblimit, q9);
-
-  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
-
-  q14u8 = vorrq_u8(q13u8, q14u8);
-
-  q4u16 = vdupq_n_u16(3);
-  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
-  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
-
-  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
-  q15u8 = vandq_u8(q15u8, q9);
-
-  q1s8 = vreinterpretq_s8_u8(q1u8);
-  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
-  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
-
-  q4 = vdupq_n_u8(3);
-  q9 = vdupq_n_u8(4);
-  // aom_filter = clamp(aom_filter + 3 * ( qs0 - ps0))
-  d2s8 = vqmovn_s16(q2s16);
-  d3s8 = vqmovn_s16(q11s16);
-  q1s8 = vcombine_s8(d2s8, d3s8);
-  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
-  q1s8 = vreinterpretq_s8_u8(q1u8);
-
-  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
-  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
-  q2s8 = vshrq_n_s8(q2s8, 3);
-  q1s8 = vshrq_n_s8(q1s8, 3);
-
-  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
-  q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
-
-  q1s8 = vrshrq_n_s8(q1s8, 1);
-  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
-
-  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
-  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
-
-  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
-  *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
-  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
-  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
-  return;
-}
-
-void aom_lpf_horizontal_4_dual_neon(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
-  uint8x16_t qblimit, qlimit, qthresh;
-  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
-
-  dblimit0 = vld1_u8(blimit0);
-  dlimit0 = vld1_u8(limit0);
-  dthresh0 = vld1_u8(thresh0);
-  dblimit1 = vld1_u8(blimit1);
-  dlimit1 = vld1_u8(limit1);
-  dthresh1 = vld1_u8(thresh1);
-  qblimit = vcombine_u8(dblimit0, dblimit1);
-  qlimit = vcombine_u8(dlimit0, dlimit1);
-  qthresh = vcombine_u8(dthresh0, dthresh1);
-
-  s -= (p << 2);
-
-  q3u8 = vld1q_u8(s);
-  s += p;
-  q4u8 = vld1q_u8(s);
-  s += p;
-  q5u8 = vld1q_u8(s);
-  s += p;
-  q6u8 = vld1q_u8(s);
-  s += p;
-  q7u8 = vld1q_u8(s);
-  s += p;
-  q8u8 = vld1q_u8(s);
-  s += p;
-  q9u8 = vld1q_u8(s);
-  s += p;
-  q10u8 = vld1q_u8(s);
-
-  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
-                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);
-
-  s -= (p * 5);
-  vst1q_u8(s, q5u8);
-  s += p;
-  vst1q_u8(s, q6u8);
-  s += p;
-  vst1q_u8(s, q7u8);
-  s += p;
-  vst1q_u8(s, q8u8);
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm
deleted file mode 100644
index b6e2c9edb..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm
+++ /dev/null
@@ -1,202 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_lpf_horizontal_4_dual_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
-;                                    const uint8_t *blimit0,
-;                                    const uint8_t *limit0,
-;                                    const uint8_t *thresh0,
-;                                    const uint8_t *blimit1,
-;                                    const uint8_t *limit1,
-;                                    const uint8_t *thresh1)
-; r0    uint8_t *s,
-; r1    int p,
-; r2    const uint8_t *blimit0,
-; r3    const uint8_t *limit0,
-; sp    const uint8_t *thresh0,
-; sp+4  const uint8_t *blimit1,
-; sp+8  const uint8_t *limit1,
-; sp+12 const uint8_t *thresh1,
-
-|aom_lpf_horizontal_4_dual_neon| PROC
-    push        {lr}
-
-    ldr         r12, [sp, #4]              ; load thresh0
-    vld1.8      {d0}, [r2]                 ; load blimit0 to first half q
-    vld1.8      {d2}, [r3]                 ; load limit0 to first half q
-
-    add         r1, r1, r1                 ; double pitch
-    ldr         r2, [sp, #8]               ; load blimit1
-
-    vld1.8      {d4}, [r12]                ; load thresh0 to first half q
-
-    ldr         r3, [sp, #12]              ; load limit1
-    ldr         r12, [sp, #16]             ; load thresh1
-    vld1.8      {d1}, [r2]                 ; load blimit1 to 2nd half q
-
-    sub         r2, r0, r1, lsl #1         ; s[-4 * p]
-
-    vld1.8      {d3}, [r3]                 ; load limit1 to 2nd half q
-    vld1.8      {d5}, [r12]                ; load thresh1 to 2nd half q
-
-    vpush       {d8-d15}                   ; save neon registers
-
-    add         r3, r2, r1, lsr #1         ; s[-3 * p]
-
-    vld1.u8     {q3}, [r2@64], r1          ; p3
-    vld1.u8     {q4}, [r3@64], r1          ; p2
-    vld1.u8     {q5}, [r2@64], r1          ; p1
-    vld1.u8     {q6}, [r3@64], r1          ; p0
-    vld1.u8     {q7}, [r2@64], r1          ; q0
-    vld1.u8     {q8}, [r3@64], r1          ; q1
-    vld1.u8     {q9}, [r2@64]              ; q2
-    vld1.u8     {q10}, [r3@64]             ; q3
-
-    sub         r2, r2, r1, lsl #1
-    sub         r3, r3, r1, lsl #1
-
-    bl          aom_loop_filter_neon_16
-
-    vst1.u8     {q5}, [r2@64], r1          ; store op1
-    vst1.u8     {q6}, [r3@64], r1          ; store op0
-    vst1.u8     {q7}, [r2@64], r1          ; store oq0
-    vst1.u8     {q8}, [r3@64], r1          ; store oq1
-
-    vpop        {d8-d15}                   ; restore neon registers
-
-    pop         {pc}
-    ENDP        ; |aom_lpf_horizontal_4_dual_neon|
-
-; void aom_loop_filter_neon_16();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store. This function uses
-; registers d8-d15, so the calling function must save those registers.
-;
-; r0-r3, r12 PRESERVE
-; q0    blimit
-; q1    limit
-; q2    thresh
-; q3    p3
-; q4    p2
-; q5    p1
-; q6    p0
-; q7    q0
-; q8    q1
-; q9    q2
-; q10   q3
-;
-; Outputs:
-; q5    op1
-; q6    op0
-; q7    oq0
-; q8    oq1
-|aom_loop_filter_neon_16| PROC
-
-    ; filter_mask
-    vabd.u8     q11, q3, q4                 ; m1 = abs(p3 - p2)
-    vabd.u8     q12, q4, q5                 ; m2 = abs(p2 - p1)
-    vabd.u8     q13, q5, q6                 ; m3 = abs(p1 - p0)
-    vabd.u8     q14, q8, q7                 ; m4 = abs(q1 - q0)
-    vabd.u8     q3, q9, q8                  ; m5 = abs(q2 - q1)
-    vabd.u8     q4, q10, q9                 ; m6 = abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     q11, q11, q12               ; m7 = max(m1, m2)
-    vmax.u8     q12, q13, q14               ; m8 = max(m3, m4)
-
-    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
-
-    vmax.u8     q3, q3, q4                  ; m9 = max(m5, m6)
-
-    vmov.u8     q10, #0x80
-
-    vmax.u8     q15, q11, q12               ; m10 = max(m7, m8)
-
-    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     q15, q15, q3                ; m11 = max(m10, m9)
-
-    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
-    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
-
-    veor        q7, q7, q10                 ; qs0
-
-    vcge.u8     q15, q1, q15                ; abs(m11) > limit
-
-    vshr.u8     q2, q2, #1                  ; a = a / 2
-    veor        q6, q6, q10                 ; ps0
-
-    veor        q5, q5, q10                 ; ps1
-    vqadd.u8    q9, q9, q2                  ; a = b + a
-
-    veor        q8, q8, q10                 ; qs1
-
-    vmov.u16    q4, #3
-
-    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
-    vsubl.s8    q11, d15, d13
-
-    vcge.u8     q9, q0, q9                  ; a > blimit
-
-    vqsub.s8    q1, q5, q8                  ; filter = clamp(ps1-qs1)
-    vorr        q14, q13, q14               ; hev
-
-    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
-    vmul.i16    q11, q11, q4
-
-    vand        q1, q1, q14                 ; filter &= hev
-    vand        q15, q15, q9                ; mask
-
-    vmov.u8     q4, #3
-
-    vaddw.s8    q2, q2, d2                  ; filter + 3 * (qs0 - ps0)
-    vaddw.s8    q11, q11, d3
-
-    vmov.u8     q9, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d2, q2
-    vqmovn.s16  d3, q11
-    vand        q1, q1, q15                 ; filter &= mask
-
-    vqadd.s8    q2, q1, q4                  ; filter2 = clamp(filter+3)
-    vqadd.s8    q1, q1, q9                  ; filter1 = clamp(filter+4)
-    vshr.s8     q2, q2, #3                  ; filter2 >>= 3
-    vshr.s8     q1, q1, #3                  ; filter1 >>= 3
-
-
-    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + filter2)
-    vqsub.s8    q0, q7, q1                  ; u = clamp(qs0 - filter1)
-
-    ; outer tap adjustments
-    vrshr.s8    q1, q1, #1                  ; filter = ++filter1 >> 1
-
-    veor        q7, q0,  q10                ; *oq0 = u^0x80
-
-    vbic        q1, q1, q14                 ; filter &= ~hev
-
-    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + filter)
-    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - filter)
-
-    veor        q6, q11, q10                ; *op0 = u^0x80
-    veor        q5, q13, q10                ; *op1 = u^0x80
-    veor        q8, q12, q10                ; *oq1 = u^0x80
-
-    bx          lr
-    ENDP        ; |aom_loop_filter_neon_16|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c
deleted file mode 100644
index 2b1f80b81..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-
-static INLINE void loop_filter_neon(uint8x8_t dblimit,   // flimit
-                                    uint8x8_t dlimit,    // limit
-                                    uint8x8_t dthresh,   // thresh
-                                    uint8x8_t d3u8,      // p3
-                                    uint8x8_t d4u8,      // p2
-                                    uint8x8_t d5u8,      // p1
-                                    uint8x8_t d6u8,      // p0
-                                    uint8x8_t d7u8,      // q0
-                                    uint8x8_t d16u8,     // q1
-                                    uint8x8_t d17u8,     // q2
-                                    uint8x8_t d18u8,     // q3
-                                    uint8x8_t *d4ru8,    // p1
-                                    uint8x8_t *d5ru8,    // p0
-                                    uint8x8_t *d6ru8,    // q0
-                                    uint8x8_t *d7ru8) {  // q1
-  uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
-  int16x8_t q12s16;
-  int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
-
-  d19u8 = vabd_u8(d3u8, d4u8);
-  d20u8 = vabd_u8(d4u8, d5u8);
-  d21u8 = vabd_u8(d5u8, d6u8);
-  d22u8 = vabd_u8(d16u8, d7u8);
-  d3u8 = vabd_u8(d17u8, d16u8);
-  d4u8 = vabd_u8(d18u8, d17u8);
-
-  d19u8 = vmax_u8(d19u8, d20u8);
-  d20u8 = vmax_u8(d21u8, d22u8);
-  d3u8 = vmax_u8(d3u8, d4u8);
-  d23u8 = vmax_u8(d19u8, d20u8);
-
-  d17u8 = vabd_u8(d6u8, d7u8);
-
-  d21u8 = vcgt_u8(d21u8, dthresh);
-  d22u8 = vcgt_u8(d22u8, dthresh);
-  d23u8 = vmax_u8(d23u8, d3u8);
-
-  d28u8 = vabd_u8(d5u8, d16u8);
-  d17u8 = vqadd_u8(d17u8, d17u8);
-
-  d23u8 = vcge_u8(dlimit, d23u8);
-
-  d18u8 = vdup_n_u8(0x80);
-  d5u8 = veor_u8(d5u8, d18u8);
-  d6u8 = veor_u8(d6u8, d18u8);
-  d7u8 = veor_u8(d7u8, d18u8);
-  d16u8 = veor_u8(d16u8, d18u8);
-
-  d28u8 = vshr_n_u8(d28u8, 1);
-  d17u8 = vqadd_u8(d17u8, d28u8);
-
-  d19u8 = vdup_n_u8(3);
-
-  d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));
-
-  d17u8 = vcge_u8(dblimit, d17u8);
-
-  d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));
-
-  d22u8 = vorr_u8(d21u8, d22u8);
-
-  q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
-  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
-  d23u8 = vand_u8(d23u8, d17u8);
-
-  q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
-
-  d17u8 = vdup_n_u8(4);
-
-  d27s8 = vqmovn_s16(q12s16);
-  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
-  d27s8 = vreinterpret_s8_u8(d27u8);
-
-  d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
-  d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
-  d28s8 = vshr_n_s8(d28s8, 3);
-  d27s8 = vshr_n_s8(d27s8, 3);
-
-  d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
-  d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
-
-  d27s8 = vrshr_n_s8(d27s8, 1);
-  d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
-
-  d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
-  d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
-
-  *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
-  *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
-  *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
-  *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
-  return;
-}
-
-void aom_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                               const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  uint8_t *s, *psrc;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  psrc = src - (pitch << 2);
-  for (i = 0; i < 1; i++) {
-    s = psrc + i * 8;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
-
-    s -= (pitch * 5);
-    vst1_u8(s, d4u8);
-    s += pitch;
-    vst1_u8(s, d5u8);
-    s += pitch;
-    vst1_u8(s, d6u8);
-    s += pitch;
-    vst1_u8(s, d7u8);
-  }
-  return;
-}
-
-void aom_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh) {
-  int i, pitch8;
-  uint8_t *s;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
-  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
-  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
-  uint8x8x4_t d4Result;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  pitch8 = pitch * 8;
-  for (i = 0; i < 1; i++, src += pitch8) {
-    s = src - (i + 1) * 4;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
-    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
-    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
-    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
-
-    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
-                      vreinterpret_u16_u32(d2tmp2.val[0]));
-    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
-                      vreinterpret_u16_u32(d2tmp3.val[0]));
-    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
-                      vreinterpret_u16_u32(d2tmp2.val[1]));
-    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
-                      vreinterpret_u16_u32(d2tmp3.val[1]));
-
-    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
-                     vreinterpret_u8_u16(d2tmp5.val[0]));
-    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
-                     vreinterpret_u8_u16(d2tmp5.val[1]));
-    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
-                      vreinterpret_u8_u16(d2tmp7.val[0]));
-    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
-                      vreinterpret_u8_u16(d2tmp7.val[1]));
-
-    d3u8 = d2tmp8.val[0];
-    d4u8 = d2tmp8.val[1];
-    d5u8 = d2tmp9.val[0];
-    d6u8 = d2tmp9.val[1];
-    d7u8 = d2tmp10.val[0];
-    d16u8 = d2tmp10.val[1];
-    d17u8 = d2tmp11.val[0];
-    d18u8 = d2tmp11.val[1];
-
-    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
-
-    d4Result.val[0] = d4u8;
-    d4Result.val[1] = d5u8;
-    d4Result.val[2] = d6u8;
-    d4Result.val[3] = d7u8;
-
-    src -= 2;
-    vst4_lane_u8(src, d4Result, 0);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 1);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 2);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 3);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 4);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 5);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 6);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 7);
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm
deleted file mode 100644
index 8b54984d5..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm
+++ /dev/null
@@ -1,252 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_lpf_horizontal_4_neon|
-    EXPORT  |aom_lpf_vertical_4_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; Currently aom only works on iterations 8 at a time. The aom loop filter
-; works on 16 iterations at a time.
-;
-; void aom_lpf_horizontal_4_neon(uint8_t *s,
-;                                int p /* pitch */,
-;                                const uint8_t *blimit,
-;                                const uint8_t *limit,
-;                                const uint8_t *thresh)
-;
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-|aom_lpf_horizontal_4_neon| PROC
-    push        {lr}
-
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r2, [sp, #4]               ; load thresh
-    add         r1, r1, r1                 ; double pitch
-
-    vld1.8      {d1[]}, [r3]               ; duplicate *limit
-    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
-
-    sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines
-    add         r3, r2, r1, lsr #1         ; set to 3 lines down
-
-    vld1.u8     {d3}, [r2@64], r1          ; p3
-    vld1.u8     {d4}, [r3@64], r1          ; p2
-    vld1.u8     {d5}, [r2@64], r1          ; p1
-    vld1.u8     {d6}, [r3@64], r1          ; p0
-    vld1.u8     {d7}, [r2@64], r1          ; q0
-    vld1.u8     {d16}, [r3@64], r1         ; q1
-    vld1.u8     {d17}, [r2@64]             ; q2
-    vld1.u8     {d18}, [r3@64]             ; q3
-
-    sub         r2, r2, r1, lsl #1
-    sub         r3, r3, r1, lsl #1
-
-    bl          aom_loop_filter_neon
-
-    vst1.u8     {d4}, [r2@64], r1          ; store op1
-    vst1.u8     {d5}, [r3@64], r1          ; store op0
-    vst1.u8     {d6}, [r2@64], r1          ; store oq0
-    vst1.u8     {d7}, [r3@64], r1          ; store oq1
-
-    pop         {pc}
-    ENDP        ; |aom_lpf_horizontal_4_neon|
-
-; Currently aom only works on iterations 8 at a time. The aom loop filter
-; works on 16 iterations at a time.
-;
-; void aom_lpf_vertical_4_neon(uint8_t *s,
-;                              int p /* pitch */,
-;                              const uint8_t *blimit,
-;                              const uint8_t *limit,
-;                              const uint8_t *thresh)
-;
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-|aom_lpf_vertical_4_neon| PROC
-    push        {lr}
-
-    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    vld1.8      {d1[]}, [r3]              ; duplicate *limit
-
-    ldr         r3, [sp, #4]              ; load thresh
-    sub         r2, r0, #4                ; move s pointer down by 4 columns
-
-    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
-
-    vld1.u8     {d3}, [r2], r1             ; load s data
-    vld1.u8     {d4}, [r2], r1
-    vld1.u8     {d5}, [r2], r1
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d7}, [r2], r1
-    vld1.u8     {d16}, [r2], r1
-    vld1.u8     {d17}, [r2], r1
-    vld1.u8     {d18}, [r2]
-
-    ;transpose to 8x16 matrix
-    vtrn.32     d3, d7
-    vtrn.32     d4, d16
-    vtrn.32     d5, d17
-    vtrn.32     d6, d18
-
-    vtrn.16     d3, d5
-    vtrn.16     d4, d6
-    vtrn.16     d7, d17
-    vtrn.16     d16, d18
-
-    vtrn.8      d3, d4
-    vtrn.8      d5, d6
-    vtrn.8      d7, d16
-    vtrn.8      d17, d18
-
-    bl          aom_loop_filter_neon
-
-    sub         r0, r0, #2
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
-    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
-    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
-    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
-    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
-    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
-    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
-    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
-
-    pop         {pc}
-    ENDP        ; |aom_lpf_vertical_4_neon|
-
-; void aom_loop_filter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0    blimit
-; d1    limit
-; d2    thresh
-; d3    p3
-; d4    p2
-; d5    p1
-; d6    p0
-; d7    q0
-; d16   q1
-; d17   q2
-; d18   q3
-;
-; Outputs:
-; d4    op1
-; d5    op0
-; d6    oq0
-; d7    oq1
-|aom_loop_filter_neon| PROC
-    ; filter_mask
-    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
-    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
-    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
-    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
-    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
-    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
-    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
-
-    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
-
-    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
-
-    vmov.u8     d18, #0x80
-
-    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
-
-    ; hevmask
-    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
-
-    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
-    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
-
-    veor        d7, d7, d18                 ; qs0
-
-    vcge.u8     d23, d1, d23                ; abs(m1) > limit
-
-    ; filter() function
-    ; convert to signed
-
-    vshr.u8     d28, d28, #1                ; a = a / 2
-    veor        d6, d6, d18                 ; ps0
-
-    veor        d5, d5, d18                 ; ps1
-    vqadd.u8    d17, d17, d28               ; a = b + a
-
-    veor        d16, d16, d18               ; qs1
-
-    vmov.u8     d19, #3
-
-    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
-
-    vcge.u8     d17, d0, d17                ; a > blimit
-
-    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
-    vorr        d22, d21, d22               ; hevmask
-
-    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
-
-    vand        d27, d27, d22               ; filter &= hev
-    vand        d23, d23, d17               ; filter_mask
-
-    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
-
-    vmov.u8     d17, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d27, q12
-
-    vand        d27, d27, d23               ; filter &= mask
-
-    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
-    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
-    vshr.s8     d28, d28, #3                ; filter2 >>= 3
-    vshr.s8     d27, d27, #3                ; filter1 >>= 3
-
-    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
-    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
-
-    ; outer tap adjustments
-    vrshr.s8    d27, d27, #1                ; filter = ++filter1 >> 1
-
-    veor        d6, d26, d18                ; *oq0 = u^0x80
-
-    vbic        d27, d27, d22               ; filter &= ~hev
-
-    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
-    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
-
-    veor        d5, d19, d18                ; *op0 = u^0x80
-    veor        d4, d21, d18                ; *op1 = u^0x80
-    veor        d7, d20, d18                ; *oq1 = u^0x80
-
-    bx          lr
-    ENDP        ; |aom_loop_filter_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c
deleted file mode 100644
index c4502fdb5..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c
+++ /dev/null
@@ -1,430 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-
-static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
-                                      uint8x8_t dlimit,    // limit
-                                      uint8x8_t dthresh,   // thresh
-                                      uint8x8_t d3u8,      // p2
-                                      uint8x8_t d4u8,      // p2
-                                      uint8x8_t d5u8,      // p1
-                                      uint8x8_t d6u8,      // p0
-                                      uint8x8_t d7u8,      // q0
-                                      uint8x8_t d16u8,     // q1
-                                      uint8x8_t d17u8,     // q2
-                                      uint8x8_t d18u8,     // q3
-                                      uint8x8_t *d0ru8,    // p1
-                                      uint8x8_t *d1ru8,    // p1
-                                      uint8x8_t *d2ru8,    // p0
-                                      uint8x8_t *d3ru8,    // q0
-                                      uint8x8_t *d4ru8,    // q1
-                                      uint8x8_t *d5ru8) {  // q1
-  uint32_t flat;
-  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
-  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
-  int16x8_t q15s16;
-  uint16x8_t q10u16, q14u16;
-  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
-
-  d19u8 = vabd_u8(d3u8, d4u8);
-  d20u8 = vabd_u8(d4u8, d5u8);
-  d21u8 = vabd_u8(d5u8, d6u8);
-  d22u8 = vabd_u8(d16u8, d7u8);
-  d23u8 = vabd_u8(d17u8, d16u8);
-  d24u8 = vabd_u8(d18u8, d17u8);
-
-  d19u8 = vmax_u8(d19u8, d20u8);
-  d20u8 = vmax_u8(d21u8, d22u8);
-
-  d25u8 = vabd_u8(d6u8, d4u8);
-
-  d23u8 = vmax_u8(d23u8, d24u8);
-
-  d26u8 = vabd_u8(d7u8, d17u8);
-
-  d19u8 = vmax_u8(d19u8, d20u8);
-
-  d24u8 = vabd_u8(d6u8, d7u8);
-  d27u8 = vabd_u8(d3u8, d6u8);
-  d28u8 = vabd_u8(d18u8, d7u8);
-
-  d19u8 = vmax_u8(d19u8, d23u8);
-
-  d23u8 = vabd_u8(d5u8, d16u8);
-  d24u8 = vqadd_u8(d24u8, d24u8);
-
-  d19u8 = vcge_u8(dlimit, d19u8);
-
-  d25u8 = vmax_u8(d25u8, d26u8);
-  d26u8 = vmax_u8(d27u8, d28u8);
-
-  d23u8 = vshr_n_u8(d23u8, 1);
-
-  d25u8 = vmax_u8(d25u8, d26u8);
-
-  d24u8 = vqadd_u8(d24u8, d23u8);
-
-  d20u8 = vmax_u8(d20u8, d25u8);
-
-  d23u8 = vdup_n_u8(1);
-  d24u8 = vcge_u8(dblimit, d24u8);
-
-  d21u8 = vcgt_u8(d21u8, dthresh);
-
-  d20u8 = vcge_u8(d23u8, d20u8);
-
-  d19u8 = vand_u8(d19u8, d24u8);
-
-  d23u8 = vcgt_u8(d22u8, dthresh);
-
-  d20u8 = vand_u8(d20u8, d19u8);
-
-  d22u8 = vdup_n_u8(0x80);
-
-  d23u8 = vorr_u8(d21u8, d23u8);
-
-  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
-
-  d30u8 = vshrn_n_u16(q10u16, 4);
-  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
-
-  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
-    d27u8 = vdup_n_u8(3);
-    d21u8 = vdup_n_u8(2);
-    q14u16 = vaddl_u8(d6u8, d7u8);
-    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
-    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
-    q14u16 = vaddw_u8(q14u16, d5u8);
-    *d0ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vaddw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-    *d1ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-    *d2ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d7u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-    *d3ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vsubw_u8(q14u16, d7u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-    *d4ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vsubw_u8(q14u16, d16u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-    *d5ru8 = vqrshrn_n_u16(q14u16, 3);
-  } else {
-    d21u8 = veor_u8(d7u8, d22u8);
-    d24u8 = veor_u8(d6u8, d22u8);
-    d25u8 = veor_u8(d5u8, d22u8);
-    d26u8 = veor_u8(d16u8, d22u8);
-
-    d27u8 = vdup_n_u8(3);
-
-    d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
-    d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
-
-    q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
-
-    d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
-    q15s16 = vaddw_s8(q15s16, d29s8);
-
-    d29u8 = vdup_n_u8(4);
-
-    d28s8 = vqmovn_s16(q15s16);
-
-    d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
-    d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
-    d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
-    d30s8 = vshr_n_s8(d30s8, 3);
-    d29s8 = vshr_n_s8(d29s8, 3);
-
-    d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
-    d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
-
-    d29s8 = vrshr_n_s8(d29s8, 1);
-    d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
-    d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
-    d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
-
-    if (flat == 0) {  // filter_branch_only
-      *d0ru8 = d4u8;
-      *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
-      *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
-      *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
-      *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-      *d5ru8 = d17u8;
-      return;
-    }
-
-    d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
-    d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
-    d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
-    d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-
-    d23u8 = vdup_n_u8(2);
-    q14u16 = vaddl_u8(d6u8, d7u8);
-    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
-    q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
-
-    d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
-
-    q14u16 = vaddw_u8(q14u16, d5u8);
-
-    d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
-
-    d30u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vaddw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-
-    d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
-
-    d31u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-
-    *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
-
-    d23u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d7u8);
-
-    *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
-
-    q14u16 = vaddw_u8(q14u16, d18u8);
-
-    *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
-
-    d22u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vsubw_u8(q14u16, d7u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-
-    d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
-
-    q14u16 = vaddw_u8(q14u16, d18u8);
-
-    d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
-
-    d6u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vsubw_u8(q14u16, d16u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-
-    d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
-
-    d7u8 = vqrshrn_n_u16(q14u16, 3);
-
-    *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
-    *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
-    *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
-  }
-  return;
-}
-
-void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                               const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  uint8_t *s, *psrc;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-  uint8x8_t d16u8, d17u8, d18u8;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  psrc = src - (pitch << 2);
-  for (i = 0; i < 1; i++) {
-    s = psrc + i * 8;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
-                       &d5u8);
-
-    s -= (pitch * 6);
-    vst1_u8(s, d0u8);
-    s += pitch;
-    vst1_u8(s, d1u8);
-    s += pitch;
-    vst1_u8(s, d2u8);
-    s += pitch;
-    vst1_u8(s, d3u8);
-    s += pitch;
-    vst1_u8(s, d4u8);
-    s += pitch;
-    vst1_u8(s, d5u8);
-  }
-  return;
-}
-
-void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  uint8_t *s;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-  uint8x8_t d16u8, d17u8, d18u8;
-  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
-  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
-  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
-  uint8x8x4_t d4Result;
-  uint8x8x2_t d2Result;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  for (i = 0; i < 1; i++) {
-    s = src + (i * (pitch << 3)) - 4;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
-    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
-    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
-    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
-
-    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
-                      vreinterpret_u16_u32(d2tmp2.val[0]));
-    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
-                      vreinterpret_u16_u32(d2tmp3.val[0]));
-    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
-                      vreinterpret_u16_u32(d2tmp2.val[1]));
-    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
-                      vreinterpret_u16_u32(d2tmp3.val[1]));
-
-    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
-                     vreinterpret_u8_u16(d2tmp5.val[0]));
-    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
-                     vreinterpret_u8_u16(d2tmp5.val[1]));
-    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
-                      vreinterpret_u8_u16(d2tmp7.val[0]));
-    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
-                      vreinterpret_u8_u16(d2tmp7.val[1]));
-
-    d3u8 = d2tmp8.val[0];
-    d4u8 = d2tmp8.val[1];
-    d5u8 = d2tmp9.val[0];
-    d6u8 = d2tmp9.val[1];
-    d7u8 = d2tmp10.val[0];
-    d16u8 = d2tmp10.val[1];
-    d17u8 = d2tmp11.val[0];
-    d18u8 = d2tmp11.val[1];
-
-    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
-                       &d5u8);
-
-    d4Result.val[0] = d0u8;
-    d4Result.val[1] = d1u8;
-    d4Result.val[2] = d2u8;
-    d4Result.val[3] = d3u8;
-
-    d2Result.val[0] = d4u8;
-    d2Result.val[1] = d5u8;
-
-    s = src - 3;
-    vst4_lane_u8(s, d4Result, 0);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 1);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 2);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 3);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 4);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 5);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 6);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 7);
-
-    s = src + 1;
-    vst2_lane_u8(s, d2Result, 0);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 1);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 2);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 3);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 4);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 5);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 6);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 7);
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm
deleted file mode 100644
index 9f3db66ee..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm
+++ /dev/null
@@ -1,428 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_lpf_horizontal_8_neon|
-    EXPORT  |aom_lpf_vertical_8_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; Currently aom only works on iterations 8 at a time. The aom loop filter
-; works on 16 iterations at a time.
-;
-; void aom_lpf_horizontal_8_neon(uint8_t *s, int p,
-;                                const uint8_t *blimit,
-;                                const uint8_t *limit,
-;                                const uint8_t *thresh)
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-|aom_lpf_horizontal_8_neon| PROC
-    push        {r4-r5, lr}
-
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r2, [sp, #12]              ; load thresh
-    add         r1, r1, r1                 ; double pitch
-
-    vld1.8      {d1[]}, [r3]               ; duplicate *limit
-    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
-
-    sub         r3, r0, r1, lsl #1         ; move src pointer down by 4 lines
-    add         r2, r3, r1, lsr #1         ; set to 3 lines down
-
-    vld1.u8     {d3}, [r3@64], r1          ; p3
-    vld1.u8     {d4}, [r2@64], r1          ; p2
-    vld1.u8     {d5}, [r3@64], r1          ; p1
-    vld1.u8     {d6}, [r2@64], r1          ; p0
-    vld1.u8     {d7}, [r3@64], r1          ; q0
-    vld1.u8     {d16}, [r2@64], r1         ; q1
-    vld1.u8     {d17}, [r3@64]             ; q2
-    vld1.u8     {d18}, [r2@64], r1         ; q3
-
-    sub         r3, r3, r1, lsl #1
-    sub         r2, r2, r1, lsl #2
-
-    bl          aom_mbloop_filter_neon
-
-    vst1.u8     {d0}, [r2@64], r1          ; store op2
-    vst1.u8     {d1}, [r3@64], r1          ; store op1
-    vst1.u8     {d2}, [r2@64], r1          ; store op0
-    vst1.u8     {d3}, [r3@64], r1          ; store oq0
-    vst1.u8     {d4}, [r2@64], r1          ; store oq1
-    vst1.u8     {d5}, [r3@64], r1          ; store oq2
-
-    pop         {r4-r5, pc}
-
-    ENDP        ; |aom_lpf_horizontal_8_neon|
-
-; void aom_lpf_vertical_8_neon(uint8_t *s,
-;                              int pitch,
-;                              const uint8_t *blimit,
-;                              const uint8_t *limit,
-;                              const uint8_t *thresh)
-;
-; r0    uint8_t *s,
-; r1    int pitch,
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-|aom_lpf_vertical_8_neon| PROC
-    push        {r4-r5, lr}
-
-    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    vld1.8      {d1[]}, [r3]              ; duplicate *limit
-
-    ldr         r3, [sp, #12]             ; load thresh
-    sub         r2, r0, #4                ; move s pointer down by 4 columns
-
-    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
-
-    vld1.u8     {d3}, [r2], r1             ; load s data
-    vld1.u8     {d4}, [r2], r1
-    vld1.u8     {d5}, [r2], r1
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d7}, [r2], r1
-    vld1.u8     {d16}, [r2], r1
-    vld1.u8     {d17}, [r2], r1
-    vld1.u8     {d18}, [r2]
-
-    ;transpose to 8x16 matrix
-    vtrn.32     d3, d7
-    vtrn.32     d4, d16
-    vtrn.32     d5, d17
-    vtrn.32     d6, d18
-
-    vtrn.16     d3, d5
-    vtrn.16     d4, d6
-    vtrn.16     d7, d17
-    vtrn.16     d16, d18
-
-    vtrn.8      d3, d4
-    vtrn.8      d5, d6
-    vtrn.8      d7, d16
-    vtrn.8      d17, d18
-
-    sub         r2, r0, #3
-    add         r3, r0, #1
-
-    bl          aom_mbloop_filter_neon
-
-    ;store op2, op1, op0, oq0
-    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
-    vst4.8      {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
-    vst4.8      {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
-    vst4.8      {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
-    vst4.8      {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
-    vst4.8      {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
-    vst4.8      {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
-    vst4.8      {d0[7], d1[7], d2[7], d3[7]}, [r2]
-
-    ;store oq1, oq2
-    vst2.8      {d4[0], d5[0]}, [r3], r1
-    vst2.8      {d4[1], d5[1]}, [r3], r1
-    vst2.8      {d4[2], d5[2]}, [r3], r1
-    vst2.8      {d4[3], d5[3]}, [r3], r1
-    vst2.8      {d4[4], d5[4]}, [r3], r1
-    vst2.8      {d4[5], d5[5]}, [r3], r1
-    vst2.8      {d4[6], d5[6]}, [r3], r1
-    vst2.8      {d4[7], d5[7]}, [r3]
-
-    pop         {r4-r5, pc}
-    ENDP        ; |aom_lpf_vertical_8_neon|
-
-; void aom_mbloop_filter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0    blimit
-; d1    limit
-; d2    thresh
-; d3    p3
-; d4    p2
-; d5    p1
-; d6    p0
-; d7    q0
-; d16   q1
-; d17   q2
-; d18   q3
-;
-; Outputs:
-; d0    op2
-; d1    op1
-; d2    op0
-; d3    oq0
-; d4    oq1
-; d5    oq2
-|aom_mbloop_filter_neon| PROC
-    ; filter_mask
-    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
-    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
-    vabd.u8     d21, d5, d6                ; m3 = abs(p1 - p0)
-    vabd.u8     d22, d16, d7               ; m4 = abs(q1 - q0)
-    vabd.u8     d23, d17, d16              ; m5 = abs(q2 - q1)
-    vabd.u8     d24, d18, d17              ; m6 = abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
-    vmax.u8     d20, d21, d22              ; m2 = max(m3, m4)
-
-    vabd.u8     d25, d6, d4                ; m7 = abs(p0 - p2)
-
-    vmax.u8     d23, d23, d24              ; m3 = max(m5, m6)
-
-    vabd.u8     d26, d7, d17               ; m8 = abs(q0 - q2)
-
-    vmax.u8     d19, d19, d20
-
-    vabd.u8     d24, d6, d7                ; m9 = abs(p0 - q0)
-    vabd.u8     d27, d3, d6                ; m10 = abs(p3 - p0)
-    vabd.u8     d28, d18, d7               ; m11 = abs(q3 - q0)
-
-    vmax.u8     d19, d19, d23
-
-    vabd.u8     d23, d5, d16               ; a = abs(p1 - q1)
-    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
-
-    ; abs () > limit
-    vcge.u8     d19, d1, d19
-
-    ; only compare the largest value to thresh
-    vmax.u8     d25, d25, d26              ; m4 = max(m7, m8)
-    vmax.u8     d26, d27, d28              ; m5 = max(m10, m11)
-
-    vshr.u8     d23, d23, #1               ; a = a / 2
-
-    vmax.u8     d25, d25, d26              ; m4 = max(m4, m5)
-
-    vqadd.u8    d24, d24, d23              ; a = b + a
-
-    vmax.u8     d20, d20, d25              ; m2 = max(m2, m4)
-
-    vmov.u8     d23, #1
-    vcge.u8     d24, d0, d24               ; a > blimit
-
-    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1
-
-    vcge.u8     d20, d23, d20              ; flat
-
-    vand        d19, d19, d24              ; mask
-
-    vcgt.u8     d23, d22, d2               ; (abs(q1 - q0) > thresh)*-1
-
-    vand        d20, d20, d19              ; flat & mask
-
-    vmov.u8     d22, #0x80
-
-    vorr        d23, d21, d23              ; hev
-
-    ; This instruction will truncate the "flat & mask" masks down to 4 bits
-    ; each to fit into one 32 bit arm register. The values are stored in
-    ; q10.64[0].
-    vshrn.u16   d30, q10, #4
-    vmov.u32    r4, d30[0]                 ; flat & mask 4bits
-
-    adds        r5, r4, #1                 ; Check for all 1's
-
-    ; If mask and flat are 1's for all vectors, then we only need to execute
-    ; the power branch for all vectors.
-    beq         power_branch_only
-
-    cmp         r4, #0                     ; Check for 0, set flag for later
-
-    ; mbfilter() function
-    ; filter() function
-    ; convert to signed
-    veor        d21, d7, d22               ; qs0
-    veor        d24, d6, d22               ; ps0
-    veor        d25, d5, d22               ; ps1
-    veor        d26, d16, d22              ; qs1
-
-    vmov.u8     d27, #3
-
-    vsub.s8     d28, d21, d24              ; ( qs0 - ps0)
-
-    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
-
-    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
-
-    vand        d29, d29, d23              ; filter &= hev
-
-    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
-
-    vmov.u8     d29, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d28, q15
-
-    vand        d28, d28, d19              ; filter &= mask
-
-    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
-    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
-    vshr.s8     d30, d30, #3               ; filter2 >>= 3
-    vshr.s8     d29, d29, #3               ; filter1 >>= 3
-
-    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
-    vqsub.s8    d21, d21, d29              ; oq0 = clamp(qs0 - filter1)
-
-    ; outer tap adjustments: ++filter1 >> 1
-    vrshr.s8    d29, d29, #1
-    vbic        d29, d29, d23              ; filter &= ~hev
-
-    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
-    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
-
-    ; If mask and flat are 0's for all vectors, then we only need to execute
-    ; the filter branch for all vectors.
-    beq         filter_branch_only
-
-    ; If mask and flat are mixed then we must perform both branches and
-    ; combine the data.
-    veor        d24, d24, d22              ; *f_op0 = u^0x80
-    veor        d21, d21, d22              ; *f_oq0 = u^0x80
-    veor        d25, d25, d22              ; *f_op1 = u^0x80
-    veor        d26, d26, d22              ; *f_oq1 = u^0x80
-
-    ; At this point we have already executed the filter branch. The filter
-    ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
-    ; branch and combine the data.
-    vmov.u8     d23, #2
-    vaddl.u8    q14, d6, d7                ; r_op2 = p0 + q0
-    vmlal.u8    q14, d3, d27               ; r_op2 += p3 * 3
-    vmlal.u8    q14, d4, d23               ; r_op2 += p2 * 2
-
-    vbif        d0, d4, d20                ; op2 |= p2 & ~(flat & mask)
-
-    vaddw.u8    q14, d5                    ; r_op2 += p1
-
-    vbif        d1, d25, d20               ; op1 |= f_op1 & ~(flat & mask)
-
-    vqrshrn.u16 d30, q14, #3               ; r_op2
-
-    vsubw.u8    q14, d3                    ; r_op1 = r_op2 - p3
-    vsubw.u8    q14, d4                    ; r_op1 -= p2
-    vaddw.u8    q14, d5                    ; r_op1 += p1
-    vaddw.u8    q14, d16                   ; r_op1 += q1
-
-    vbif        d2, d24, d20               ; op0 |= f_op0 & ~(flat & mask)
-
-    vqrshrn.u16 d31, q14, #3               ; r_op1
-
-    vsubw.u8    q14, d3                    ; r_op0 = r_op1 - p3
-    vsubw.u8    q14, d5                    ; r_op0 -= p1
-    vaddw.u8    q14, d6                    ; r_op0 += p0
-    vaddw.u8    q14, d17                   ; r_op0 += q2
-
-    vbit        d0, d30, d20               ; op2 |= r_op2 & (flat & mask)
-
-    vqrshrn.u16 d23, q14, #3               ; r_op0
-
-    vsubw.u8    q14, d3                    ; r_oq0 = r_op0 - p3
-    vsubw.u8    q14, d6                    ; r_oq0 -= p0
-    vaddw.u8    q14, d7                    ; r_oq0 += q0
-
-    vbit        d1, d31, d20               ; op1 |= r_op1 & (flat & mask)
-
-    vaddw.u8    q14, d18                   ; oq0 += q3
-
-    vbit        d2, d23, d20               ; op0 |= r_op0 & (flat & mask)
-
-    vqrshrn.u16 d22, q14, #3               ; r_oq0
-
-    vsubw.u8    q14, d4                    ; r_oq1 = r_oq0 - p2
-    vsubw.u8    q14, d7                    ; r_oq1 -= q0
-    vaddw.u8    q14, d16                   ; r_oq1 += q1
-
-    vbif        d3, d21, d20               ; oq0 |= f_oq0 & ~(flat & mask)
-
-    vaddw.u8    q14, d18                   ; r_oq1 += q3
-
-    vbif        d4, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)
-
-    vqrshrn.u16 d6, q14, #3                ; r_oq1
-
-    vsubw.u8    q14, d5                    ; r_oq2 = r_oq1 - p1
-    vsubw.u8    q14, d16                   ; r_oq2 -= q1
-    vaddw.u8    q14, d17                   ; r_oq2 += q2
-    vaddw.u8    q14, d18                   ; r_oq2 += q3
-
-    vbif        d5, d17, d20               ; oq2 |= q2 & ~(flat & mask)
-
-    vqrshrn.u16 d7, q14, #3                ; r_oq2
-
-    vbit        d3, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
-    vbit        d4, d6, d20                ; oq1 |= r_oq1 & (flat & mask)
-    vbit        d5, d7, d20                ; oq2 |= r_oq2 & (flat & mask)
-
-    bx          lr
-
-power_branch_only
-    vmov.u8     d27, #3
-    vmov.u8     d21, #2
-    vaddl.u8    q14, d6, d7                ; op2 = p0 + q0
-    vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
-    vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
-    vaddw.u8    q14, d5                    ; op2 += p1
-    vqrshrn.u16 d0, q14, #3                ; op2
-
-    vsubw.u8    q14, d3                    ; op1 = op2 - p3
-    vsubw.u8    q14, d4                    ; op1 -= p2
-    vaddw.u8    q14, d5                    ; op1 += p1
-    vaddw.u8    q14, d16                   ; op1 += q1
-    vqrshrn.u16 d1, q14, #3                ; op1
-
-    vsubw.u8    q14, d3                    ; op0 = op1 - p3
-    vsubw.u8    q14, d5                    ; op0 -= p1
-    vaddw.u8    q14, d6                    ; op0 += p0
-    vaddw.u8    q14, d17                   ; op0 += q2
-    vqrshrn.u16 d2, q14, #3                ; op0
-
-    vsubw.u8    q14, d3                    ; oq0 = op0 - p3
-    vsubw.u8    q14, d6                    ; oq0 -= p0
-    vaddw.u8    q14, d7                    ; oq0 += q0
-    vaddw.u8    q14, d18                   ; oq0 += q3
-    vqrshrn.u16 d3, q14, #3                ; oq0
-
-    vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
-    vsubw.u8    q14, d7                    ; oq1 -= q0
-    vaddw.u8    q14, d16                   ; oq1 += q1
-    vaddw.u8    q14, d18                   ; oq1 += q3
-    vqrshrn.u16 d4, q14, #3                ; oq1
-
-    vsubw.u8    q14, d5                    ; oq2 = oq1 - p1
-    vsubw.u8    q14, d16                   ; oq2 -= q1
-    vaddw.u8    q14, d17                   ; oq2 += q2
-    vaddw.u8    q14, d18                   ; oq2 += q3
-    vqrshrn.u16 d5, q14, #3                ; oq2
-
-    bx          lr
-
-filter_branch_only
-    ; TODO(fgalligan): See if we can rearange registers so we do not need to
-    ; do the 2 vswp.
-    vswp        d0, d4                      ; op2
-    vswp        d5, d17                     ; oq2
-    veor        d2, d24, d22                ; *op0 = u^0x80
-    veor        d3, d21, d22                ; *oq0 = u^0x80
-    veor        d1, d25, d22                ; *op1 = u^0x80
-    veor        d4, d26, d22                ; *oq1 = u^0x80
-
-    bx          lr
-
-    ENDP        ; |aom_mbloop_filter_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm
deleted file mode 100644
index 675928860..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm
+++ /dev/null
@@ -1,638 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_lpf_horizontal_edge_8_neon|
-    EXPORT  |aom_lpf_horizontal_edge_16_neon|
-    EXPORT  |aom_lpf_vertical_16_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; void mb_lpf_horizontal_edge(uint8_t *s, int p,
-;                             const uint8_t *blimit,
-;                             const uint8_t *limit,
-;                             const uint8_t *thresh,
-;                             int count)
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-; r12   int count
-|mb_lpf_horizontal_edge| PROC
-    push        {r4-r8, lr}
-    vpush       {d8-d15}
-    ldr         r4, [sp, #88]              ; load thresh
-
-h_count
-    vld1.8      {d16[]}, [r2]              ; load *blimit
-    vld1.8      {d17[]}, [r3]              ; load *limit
-    vld1.8      {d18[]}, [r4]              ; load *thresh
-
-    sub         r8, r0, r1, lsl #3         ; move src pointer down by 8 lines
-
-    vld1.u8     {d0}, [r8@64], r1          ; p7
-    vld1.u8     {d1}, [r8@64], r1          ; p6
-    vld1.u8     {d2}, [r8@64], r1          ; p5
-    vld1.u8     {d3}, [r8@64], r1          ; p4
-    vld1.u8     {d4}, [r8@64], r1          ; p3
-    vld1.u8     {d5}, [r8@64], r1          ; p2
-    vld1.u8     {d6}, [r8@64], r1          ; p1
-    vld1.u8     {d7}, [r8@64], r1          ; p0
-    vld1.u8     {d8}, [r8@64], r1          ; q0
-    vld1.u8     {d9}, [r8@64], r1          ; q1
-    vld1.u8     {d10}, [r8@64], r1         ; q2
-    vld1.u8     {d11}, [r8@64], r1         ; q3
-    vld1.u8     {d12}, [r8@64], r1         ; q4
-    vld1.u8     {d13}, [r8@64], r1         ; q5
-    vld1.u8     {d14}, [r8@64], r1         ; q6
-    vld1.u8     {d15}, [r8@64], r1         ; q7
-
-    bl          aom_wide_mbfilter_neon
-
-    tst         r7, #1
-    beq         h_mbfilter
-
-    ; flat && mask were not set for any of the channels. Just store the values
-    ; from filter.
-    sub         r8, r0, r1, lsl #1
-
-    vst1.u8     {d25}, [r8@64], r1         ; store op1
-    vst1.u8     {d24}, [r8@64], r1         ; store op0
-    vst1.u8     {d23}, [r8@64], r1         ; store oq0
-    vst1.u8     {d26}, [r8@64], r1         ; store oq1
-
-    b           h_next
-
-h_mbfilter
-    tst         r7, #2
-    beq         h_wide_mbfilter
-
-    ; flat2 was not set for any of the channels. Just store the values from
-    ; mbfilter.
-    sub         r8, r0, r1, lsl #1
-    sub         r8, r8, r1
-
-    vst1.u8     {d18}, [r8@64], r1         ; store op2
-    vst1.u8     {d19}, [r8@64], r1         ; store op1
-    vst1.u8     {d20}, [r8@64], r1         ; store op0
-    vst1.u8     {d21}, [r8@64], r1         ; store oq0
-    vst1.u8     {d22}, [r8@64], r1         ; store oq1
-    vst1.u8     {d23}, [r8@64], r1         ; store oq2
-
-    b           h_next
-
-h_wide_mbfilter
-    sub         r8, r0, r1, lsl #3
-    add         r8, r8, r1
-
-    vst1.u8     {d16}, [r8@64], r1         ; store op6
-    vst1.u8     {d24}, [r8@64], r1         ; store op5
-    vst1.u8     {d25}, [r8@64], r1         ; store op4
-    vst1.u8     {d26}, [r8@64], r1         ; store op3
-    vst1.u8     {d27}, [r8@64], r1         ; store op2
-    vst1.u8     {d18}, [r8@64], r1         ; store op1
-    vst1.u8     {d19}, [r8@64], r1         ; store op0
-    vst1.u8     {d20}, [r8@64], r1         ; store oq0
-    vst1.u8     {d21}, [r8@64], r1         ; store oq1
-    vst1.u8     {d22}, [r8@64], r1         ; store oq2
-    vst1.u8     {d23}, [r8@64], r1         ; store oq3
-    vst1.u8     {d1}, [r8@64], r1          ; store oq4
-    vst1.u8     {d2}, [r8@64], r1          ; store oq5
-    vst1.u8     {d3}, [r8@64], r1          ; store oq6
-
-h_next
-    add         r0, r0, #8
-    subs        r12, r12, #1
-    bne         h_count
-
-    vpop        {d8-d15}
-    pop         {r4-r8, pc}
-
-    ENDP        ; |mb_lpf_horizontal_edge|
-
-; void aom_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
-;                                     const uint8_t *blimit,
-;                                     const uint8_t *limit,
-;                                     const uint8_t *thresh)
-; r0    uint8_t *s,
-; r1    int pitch,
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh
-|aom_lpf_horizontal_edge_8_neon| PROC
-    mov r12, #1
-    b mb_lpf_horizontal_edge
-    ENDP        ; |aom_lpf_horizontal_edge_8_neon|
-
-; void aom_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
-;                                      const uint8_t *blimit,
-;                                      const uint8_t *limit,
-;                                      const uint8_t *thresh)
-; r0    uint8_t *s,
-; r1    int pitch,
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh
-|aom_lpf_horizontal_edge_16_neon| PROC
-    mov r12, #2
-    b mb_lpf_horizontal_edge
-    ENDP        ; |aom_lpf_horizontal_edge_16_neon|
-
-; void aom_lpf_vertical_16_neon(uint8_t *s, int p,
-;                               const uint8_t *blimit,
-;                               const uint8_t *limit,
-;                               const uint8_t *thresh)
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-|aom_lpf_vertical_16_neon| PROC
-    push        {r4-r8, lr}
-    vpush       {d8-d15}
-    ldr         r4, [sp, #88]              ; load thresh
-
-    vld1.8      {d16[]}, [r2]              ; load *blimit
-    vld1.8      {d17[]}, [r3]              ; load *limit
-    vld1.8      {d18[]}, [r4]              ; load *thresh
-
-    sub         r8, r0, #8
-
-    vld1.8      {d0}, [r8@64], r1
-    vld1.8      {d8}, [r0@64], r1
-    vld1.8      {d1}, [r8@64], r1
-    vld1.8      {d9}, [r0@64], r1
-    vld1.8      {d2}, [r8@64], r1
-    vld1.8      {d10}, [r0@64], r1
-    vld1.8      {d3}, [r8@64], r1
-    vld1.8      {d11}, [r0@64], r1
-    vld1.8      {d4}, [r8@64], r1
-    vld1.8      {d12}, [r0@64], r1
-    vld1.8      {d5}, [r8@64], r1
-    vld1.8      {d13}, [r0@64], r1
-    vld1.8      {d6}, [r8@64], r1
-    vld1.8      {d14}, [r0@64], r1
-    vld1.8      {d7}, [r8@64], r1
-    vld1.8      {d15}, [r0@64], r1
-
-    sub         r0, r0, r1, lsl #3
-
-    vtrn.32     q0, q2
-    vtrn.32     q1, q3
-    vtrn.32     q4, q6
-    vtrn.32     q5, q7
-
-    vtrn.16     q0, q1
-    vtrn.16     q2, q3
-    vtrn.16     q4, q5
-    vtrn.16     q6, q7
-
-    vtrn.8      d0, d1
-    vtrn.8      d2, d3
-    vtrn.8      d4, d5
-    vtrn.8      d6, d7
-
-    vtrn.8      d8, d9
-    vtrn.8      d10, d11
-    vtrn.8      d12, d13
-    vtrn.8      d14, d15
-
-    bl          aom_wide_mbfilter_neon
-
-    tst         r7, #1
-    beq         v_mbfilter
-
-    ; flat && mask were not set for any of the channels. Just store the values
-    ; from filter.
-    sub         r8, r0, #2
-
-    vswp        d23, d25
-
-    vst4.8      {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
-    vst4.8      {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
-    vst4.8      {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
-    vst4.8      {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
-    vst4.8      {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
-    vst4.8      {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
-    vst4.8      {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
-    vst4.8      {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
-
-    b           v_end
-
-v_mbfilter
-    tst         r7, #2
-    beq         v_wide_mbfilter
-
-    ; flat2 was not set for any of the channels. Just store the values from
-    ; mbfilter.
-    sub         r8, r0, #3
-
-    vst3.8      {d18[0], d19[0], d20[0]}, [r8], r1
-    vst3.8      {d21[0], d22[0], d23[0]}, [r0], r1
-    vst3.8      {d18[1], d19[1], d20[1]}, [r8], r1
-    vst3.8      {d21[1], d22[1], d23[1]}, [r0], r1
-    vst3.8      {d18[2], d19[2], d20[2]}, [r8], r1
-    vst3.8      {d21[2], d22[2], d23[2]}, [r0], r1
-    vst3.8      {d18[3], d19[3], d20[3]}, [r8], r1
-    vst3.8      {d21[3], d22[3], d23[3]}, [r0], r1
-    vst3.8      {d18[4], d19[4], d20[4]}, [r8], r1
-    vst3.8      {d21[4], d22[4], d23[4]}, [r0], r1
-    vst3.8      {d18[5], d19[5], d20[5]}, [r8], r1
-    vst3.8      {d21[5], d22[5], d23[5]}, [r0], r1
-    vst3.8      {d18[6], d19[6], d20[6]}, [r8], r1
-    vst3.8      {d21[6], d22[6], d23[6]}, [r0], r1
-    vst3.8      {d18[7], d19[7], d20[7]}, [r8], r1
-    vst3.8      {d21[7], d22[7], d23[7]}, [r0], r1
-
-    b           v_end
-
-v_wide_mbfilter
-    sub         r8, r0, #8
-
-    vtrn.32     d0,  d26
-    vtrn.32     d16, d27
-    vtrn.32     d24, d18
-    vtrn.32     d25, d19
-
-    vtrn.16     d0,  d24
-    vtrn.16     d16, d25
-    vtrn.16     d26, d18
-    vtrn.16     d27, d19
-
-    vtrn.8      d0,  d16
-    vtrn.8      d24, d25
-    vtrn.8      d26, d27
-    vtrn.8      d18, d19
-
-    vtrn.32     d20, d1
-    vtrn.32     d21, d2
-    vtrn.32     d22, d3
-    vtrn.32     d23, d15
-
-    vtrn.16     d20, d22
-    vtrn.16     d21, d23
-    vtrn.16     d1,  d3
-    vtrn.16     d2,  d15
-
-    vtrn.8      d20, d21
-    vtrn.8      d22, d23
-    vtrn.8      d1,  d2
-    vtrn.8      d3,  d15
-
-    vst1.8      {d0}, [r8@64], r1
-    vst1.8      {d20}, [r0@64], r1
-    vst1.8      {d16}, [r8@64], r1
-    vst1.8      {d21}, [r0@64], r1
-    vst1.8      {d24}, [r8@64], r1
-    vst1.8      {d22}, [r0@64], r1
-    vst1.8      {d25}, [r8@64], r1
-    vst1.8      {d23}, [r0@64], r1
-    vst1.8      {d26}, [r8@64], r1
-    vst1.8      {d1}, [r0@64], r1
-    vst1.8      {d27}, [r8@64], r1
-    vst1.8      {d2}, [r0@64], r1
-    vst1.8      {d18}, [r8@64], r1
-    vst1.8      {d3}, [r0@64], r1
-    vst1.8      {d19}, [r8@64], r1
-    vst1.8      {d15}, [r0@64], r1
-
-v_end
-    vpop        {d8-d15}
-    pop         {r4-r8, pc}
-
-    ENDP        ; |aom_lpf_vertical_16_neon|
-
-; void aom_wide_mbfilter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store.
-;
-; r0-r3 PRESERVE
-; d16    blimit
-; d17    limit
-; d18    thresh
-; d0    p7
-; d1    p6
-; d2    p5
-; d3    p4
-; d4    p3
-; d5    p2
-; d6    p1
-; d7    p0
-; d8    q0
-; d9    q1
-; d10   q2
-; d11   q3
-; d12   q4
-; d13   q5
-; d14   q6
-; d15   q7
-|aom_wide_mbfilter_neon| PROC
-    mov         r7, #0
-
-    ; filter_mask
-    vabd.u8     d19, d4, d5                ; abs(p3 - p2)
-    vabd.u8     d20, d5, d6                ; abs(p2 - p1)
-    vabd.u8     d21, d6, d7                ; abs(p1 - p0)
-    vabd.u8     d22, d9, d8                ; abs(q1 - q0)
-    vabd.u8     d23, d10, d9               ; abs(q2 - q1)
-    vabd.u8     d24, d11, d10              ; abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20              ; max(abs(p3 - p2), abs(p2 - p1))
-    vmax.u8     d20, d21, d22              ; max(abs(p1 - p0), abs(q1 - q0))
-    vmax.u8     d23, d23, d24              ; max(abs(q2 - q1), abs(q3 - q2))
-    vmax.u8     d19, d19, d20
-
-    vabd.u8     d24, d7, d8                ; abs(p0 - q0)
-
-    vmax.u8     d19, d19, d23
-
-    vabd.u8     d23, d6, d9                ; a = abs(p1 - q1)
-    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
-
-    ; abs () > limit
-    vcge.u8     d19, d17, d19
-
-    ; flatmask4
-    vabd.u8     d25, d7, d5                ; abs(p0 - p2)
-    vabd.u8     d26, d8, d10               ; abs(q0 - q2)
-    vabd.u8     d27, d4, d7                ; abs(p3 - p0)
-    vabd.u8     d28, d11, d8               ; abs(q3 - q0)
-
-    ; only compare the largest value to thresh
-    vmax.u8     d25, d25, d26              ; max(abs(p0 - p2), abs(q0 - q2))
-    vmax.u8     d26, d27, d28              ; max(abs(p3 - p0), abs(q3 - q0))
-    vmax.u8     d25, d25, d26
-    vmax.u8     d20, d20, d25
-
-    vshr.u8     d23, d23, #1               ; a = a / 2
-    vqadd.u8    d24, d24, d23              ; a = b + a
-
-    vmov.u8     d30, #1
-    vcge.u8     d24, d16, d24              ; (a > blimit * 2 + limit) * -1
-
-    vcge.u8     d20, d30, d20              ; flat
-
-    vand        d19, d19, d24              ; mask
-
-    ; hevmask
-    vcgt.u8     d21, d21, d18              ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     d22, d22, d18              ; (abs(q1 - q0) > thresh)*-1
-    vorr        d21, d21, d22              ; hev
-
-    vand        d16, d20, d19              ; flat && mask
-    vmov        r5, r6, d16
-
-    ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
-    vabd.u8     d22, d3, d7                ; abs(p4 - p0)
-    vabd.u8     d23, d12, d8               ; abs(q4 - q0)
-    vabd.u8     d24, d7, d2                ; abs(p0 - p5)
-    vabd.u8     d25, d8, d13               ; abs(q0 - q5)
-    vabd.u8     d26, d1, d7                ; abs(p6 - p0)
-    vabd.u8     d27, d14, d8               ; abs(q6 - q0)
-    vabd.u8     d28, d0, d7                ; abs(p7 - p0)
-    vabd.u8     d29, d15, d8               ; abs(q7 - q0)
-
-    ; only compare the largest value to thresh
-    vmax.u8     d22, d22, d23              ; max(abs(p4 - p0), abs(q4 - q0))
-    vmax.u8     d23, d24, d25              ; max(abs(p0 - p5), abs(q0 - q5))
-    vmax.u8     d24, d26, d27              ; max(abs(p6 - p0), abs(q6 - q0))
-    vmax.u8     d25, d28, d29              ; max(abs(p7 - p0), abs(q7 - q0))
-
-    vmax.u8     d26, d22, d23
-    vmax.u8     d27, d24, d25
-    vmax.u8     d23, d26, d27
-
-    vcge.u8     d18, d30, d23              ; flat2
-
-    vmov.u8     d22, #0x80
-
-    orrs        r5, r5, r6                 ; Check for 0
-    orreq       r7, r7, #1                 ; Only do filter branch
-
-    vand        d17, d18, d16              ; flat2 && flat && mask
-    vmov        r5, r6, d17
-
-    ; mbfilter() function
-
-    ; filter() function
-    ; convert to signed
-    veor        d23, d8, d22               ; qs0
-    veor        d24, d7, d22               ; ps0
-    veor        d25, d6, d22               ; ps1
-    veor        d26, d9, d22               ; qs1
-
-    vmov.u8     d27, #3
-
-    vsub.s8     d28, d23, d24              ; ( qs0 - ps0)
-    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
-    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
-    vand        d29, d29, d21              ; filter &= hev
-    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
-    vmov.u8     d29, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d28, q15
-
-    vand        d28, d28, d19              ; filter &= mask
-
-    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
-    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
-    vshr.s8     d30, d30, #3               ; filter2 >>= 3
-    vshr.s8     d29, d29, #3               ; filter1 >>= 3
-
-
-    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
-    vqsub.s8    d23, d23, d29              ; oq0 = clamp(qs0 - filter1)
-
-    ; outer tap adjustments: ++filter1 >> 1
-    vrshr.s8    d29, d29, #1
-    vbic        d29, d29, d21              ; filter &= ~hev
-
-    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
-    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
-
-    veor        d24, d24, d22              ; *f_op0 = u^0x80
-    veor        d23, d23, d22              ; *f_oq0 = u^0x80
-    veor        d25, d25, d22              ; *f_op1 = u^0x80
-    veor        d26, d26, d22              ; *f_oq1 = u^0x80
-
-    tst         r7, #1
-    bxne        lr
-
-    orrs        r5, r5, r6                 ; Check for 0
-    orreq       r7, r7, #2                 ; Only do mbfilter branch
-
-    ; mbfilter flat && mask branch
-    ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's
-    ; and using vibt on the q's?
-    vmov.u8     d29, #2
-    vaddl.u8    q15, d7, d8                ; op2 = p0 + q0
-    vmlal.u8    q15, d4, d27               ; op2 = p0 + q0 + p3 * 3
-    vmlal.u8    q15, d5, d29               ; op2 = p0 + q0 + p3 * 3 + p2 * 2
-    vaddl.u8    q10, d4, d5
-    vaddw.u8    q15, d6                    ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
-    vaddl.u8    q14, d6, d9
-    vqrshrn.u16 d18, q15, #3               ; r_op2
-
-    vsub.i16    q15, q10
-    vaddl.u8    q10, d4, d6
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d7, d10
-    vqrshrn.u16 d19, q15, #3               ; r_op1
-
-    vsub.i16    q15, q10
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d8, d11
-    vqrshrn.u16 d20, q15, #3               ; r_op0
-
-    vsubw.u8    q15, d4                    ; oq0 = op0 - p3
-    vsubw.u8    q15, d7                    ; oq0 -= p0
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d9, d11
-    vqrshrn.u16 d21, q15, #3               ; r_oq0
-
-    vsubw.u8    q15, d5                    ; oq1 = oq0 - p2
-    vsubw.u8    q15, d8                    ; oq1 -= q0
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d10, d11
-    vqrshrn.u16 d22, q15, #3               ; r_oq1
-
-    vsubw.u8    q15, d6                    ; oq2 = oq0 - p1
-    vsubw.u8    q15, d9                    ; oq2 -= q1
-    vadd.i16    q15, q14
-    vqrshrn.u16 d27, q15, #3               ; r_oq2
-
-    ; Filter does not set op2 or oq2, so use p2 and q2.
-    vbif        d18, d5, d16               ; t_op2 |= p2 & ~(flat & mask)
-    vbif        d19, d25, d16              ; t_op1 |= f_op1 & ~(flat & mask)
-    vbif        d20, d24, d16              ; t_op0 |= f_op0 & ~(flat & mask)
-    vbif        d21, d23, d16              ; t_oq0 |= f_oq0 & ~(flat & mask)
-    vbif        d22, d26, d16              ; t_oq1 |= f_oq1 & ~(flat & mask)
-
-    vbit        d23, d27, d16              ; t_oq2 |= r_oq2 & (flat & mask)
-    vbif        d23, d10, d16              ; t_oq2 |= q2 & ~(flat & mask)
-
-    tst         r7, #2
-    bxne        lr
-
-    ; wide_mbfilter flat2 && flat && mask branch
-    vmov.u8     d16, #7
-    vaddl.u8    q15, d7, d8                ; op6 = p0 + q0
-    vaddl.u8    q12, d2, d3
-    vaddl.u8    q13, d4, d5
-    vaddl.u8    q14, d1, d6
-    vmlal.u8    q15, d0, d16               ; op6 += p7 * 3
-    vadd.i16    q12, q13
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d2, d9
-    vadd.i16    q15, q12
-    vaddl.u8    q12, d0, d1
-    vaddw.u8    q15, d1
-    vaddl.u8    q13, d0, d2
-    vadd.i16    q14, q15, q14
-    vqrshrn.u16 d16, q15, #4               ; w_op6
-
-    vsub.i16    q15, q14, q12
-    vaddl.u8    q14, d3, d10
-    vqrshrn.u16 d24, q15, #4               ; w_op5
-
-    vsub.i16    q15, q13
-    vaddl.u8    q13, d0, d3
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d4, d11
-    vqrshrn.u16 d25, q15, #4               ; w_op4
-
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d0, d4
-    vsub.i16    q15, q13
-    vsub.i16    q14, q15, q14
-    vqrshrn.u16 d26, q15, #4               ; w_op3
-
-    vaddw.u8    q15, q14, d5               ; op2 += p2
-    vaddl.u8    q14, d0, d5
-    vaddw.u8    q15, d12                   ; op2 += q4
-    vbif        d26, d4, d17               ; op3 |= p3 & ~(f2 & f & m)
-    vqrshrn.u16 d27, q15, #4               ; w_op2
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d0, d6
-    vaddw.u8    q15, d6                    ; op1 += p1
-    vaddw.u8    q15, d13                   ; op1 += q5
-    vbif        d27, d18, d17              ; op2 |= t_op2 & ~(f2 & f & m)
-    vqrshrn.u16 d18, q15, #4               ; w_op1
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d0, d7
-    vaddw.u8    q15, d7                    ; op0 += p0
-    vaddw.u8    q15, d14                   ; op0 += q6
-    vbif        d18, d19, d17              ; op1 |= t_op1 & ~(f2 & f & m)
-    vqrshrn.u16 d19, q15, #4               ; w_op0
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d1, d8
-    vaddw.u8    q15, d8                    ; oq0 += q0
-    vaddw.u8    q15, d15                   ; oq0 += q7
-    vbif        d19, d20, d17              ; op0 |= t_op0 & ~(f2 & f & m)
-    vqrshrn.u16 d20, q15, #4               ; w_oq0
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d2, d9
-    vaddw.u8    q15, d9                    ; oq1 += q1
-    vaddl.u8    q4, d10, d15
-    vaddw.u8    q15, d15                   ; oq1 += q7
-    vbif        d20, d21, d17              ; oq0 |= t_oq0 & ~(f2 & f & m)
-    vqrshrn.u16 d21, q15, #4               ; w_oq1
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d3, d10
-    vadd.i16    q15, q4
-    vaddl.u8    q4, d11, d15
-    vbif        d21, d22, d17              ; oq1 |= t_oq1 & ~(f2 & f & m)
-    vqrshrn.u16 d22, q15, #4               ; w_oq2
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d4, d11
-    vadd.i16    q15, q4
-    vaddl.u8    q4, d12, d15
-    vbif        d22, d23, d17              ; oq2 |= t_oq2 & ~(f2 & f & m)
-    vqrshrn.u16 d23, q15, #4               ; w_oq3
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d5, d12
-    vadd.i16    q15, q4
-    vaddl.u8    q4, d13, d15
-    vbif        d16, d1, d17               ; op6 |= p6 & ~(f2 & f & m)
-    vqrshrn.u16 d1, q15, #4                ; w_oq4
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d6, d13
-    vadd.i16    q15, q4
-    vaddl.u8    q4, d14, d15
-    vbif        d24, d2, d17               ; op5 |= p5 & ~(f2 & f & m)
-    vqrshrn.u16 d2, q15, #4                ; w_oq5
-
-    vsub.i16    q15, q14
-    vbif        d25, d3, d17               ; op4 |= p4 & ~(f2 & f & m)
-    vadd.i16    q15, q4
-    vbif        d23, d11, d17              ; oq3 |= q3 & ~(f2 & f & m)
-    vqrshrn.u16 d3, q15, #4                ; w_oq6
-    vbif        d1, d12, d17               ; oq4 |= q4 & ~(f2 & f & m)
-    vbif        d2, d13, d17               ; oq5 |= q5 & ~(f2 & f & m)
-    vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
-
-    bx          lr
-    ENDP        ; |aom_wide_mbfilter_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
index c90d6bfde..ee1a3c78f 100644
--- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c
+++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -11,39 +11,690 @@
 
 #include <arm_neon.h>
 
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
+                                 uint8x8_t p0q0, const uint8_t blimit,
+                                 const uint8_t limit) {
+  // Calculate mask values for four samples
+  uint32x2x2_t p0q0_p1q1;
+  uint16x8_t temp_16x8;
+  uint16x4_t temp0_16x4, temp1_16x4;
+  uint8x8_t mask_8x8, temp_8x8;
+  const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+  const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
+
+  mask_8x8 = vabd_u8(p3q3, p2q2);
+  mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1));
+  mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
+  mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+  temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+                     vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+  temp_16x8 = vmovl_u8(temp_8x8);
+  temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+  temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+  temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+  temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+  temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  return mask_8x8;
+}
+
+static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
+                                       uint8x8_t p1q1, uint8x8_t p0q0) {
+  const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
+  uint8x8_t flat_8x8, temp_8x8;
+
+  flat_8x8 = vabd_u8(p1q1, p0q0);
+  flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
+  flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0));
+  flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
+
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
+  flat_8x8 = vand_u8(flat_8x8, temp_8x8);
+
+  return flat_8x8;
+}
+
+static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
+                                       uint8x8_t p0q0) {
+  const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
+  uint8x8_t flat_8x8, temp_8x8;
+
+  flat_8x8 = vabd_u8(p1q1, p0q0);
+  flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
+  flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
+
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
+  flat_8x8 = vand_u8(flat_8x8, temp_8x8);
+
+  return flat_8x8;
+}
+
+static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
+                                         uint8x8_t p0q0, const uint8_t blimit,
+                                         const uint8_t limit) {
+  // Calculate mask3 values for four samples
+  uint32x2x2_t p0q0_p1q1;
+  uint16x8_t temp_16x8;
+  uint16x4_t temp0_16x4, temp1_16x4;
+  uint8x8_t mask_8x8, temp_8x8;
+  const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+  const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
+
+  mask_8x8 = vabd_u8(p2q2, p1q1);
+  mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
+  mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+  temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+                     vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+  temp_16x8 = vmovl_u8(temp_8x8);
+  temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+  temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+  temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+  temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+  temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  return mask_8x8;
+}
+
+static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
+                        uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+                        uint8x8_t *p0q0, const uint8_t blimit,
+                        const uint8_t limit, const uint8_t thresh) {
+  uint16x8_t out;
+  uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
+      out_f14_pq5;
+  uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+  uint8x8_t out_f4_pq0, out_f4_pq1;
+  uint8x8_t mask_8x8, flat_8x8, flat2_8x8;
+  uint8x8_t q0p0, q1p1, q2p2;
+
+  // Calculate filter masks
+  mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+  flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+  flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
+  {
+    // filter 4
+    int32x2x2_t ps0_qs0, ps1_qs1;
+    int16x8_t filter_s16;
+    const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+    uint8x8_t temp0_8x8, temp1_8x8;
+    int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+    int8x8_t op0, oq0, op1, oq1;
+    int8x8_t pq_s0, pq_s1;
+    int8x8_t filter_s8, filter1_s8, filter2_s8;
+    int8x8_t hev_8x8;
+    const int8x8_t sign_mask = vdup_n_s8(0x80);
+    const int8x8_t val_4 = vdup_n_s8(4);
+    const int8x8_t val_3 = vdup_n_s8(3);
+
+    pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+    pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+    ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+    ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+    ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+    qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+    ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+    qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+    // hev_mask
+    temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+    temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+    hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+    // add outer taps if we have high edge variance
+    filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+    filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+    // inner taps
+    temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+    filter_s16 = vmovl_s8(filter_s8);
+    filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+    filter_s8 = vqmovn_s16(filter_s16);
+    filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+    filter1_s8 = vqadd_s8(filter_s8, val_4);
+    filter2_s8 = vqadd_s8(filter_s8, val_3);
+    filter1_s8 = vshr_n_s8(filter1_s8, 3);
+    filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+    oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+    op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+    hev_8x8 = vmvn_s8(hev_8x8);
+    filter_s8 = vrshr_n_s8(filter1_s8, 1);
+    filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+    oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+    op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+    out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+    out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+  }
+  // reverse p and q
+  q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+  q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+  q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+  {
+    // filter 8
+    uint16x8_t out_pq0, out_pq1, out_pq2;
+    out = vaddl_u8(*p3q3, *p2q2);
+    out = vaddw_u8(out, *p1q1);
+    out = vaddw_u8(out, *p0q0);
+
+    out = vaddw_u8(out, q0p0);
+    out_pq1 = vaddw_u8(out, *p3q3);
+    out_pq2 = vaddw_u8(out_pq1, *p3q3);
+    out_pq2 = vaddw_u8(out_pq2, *p2q2);
+    out_pq1 = vaddw_u8(out_pq1, *p1q1);
+    out_pq1 = vaddw_u8(out_pq1, q1p1);
+
+    out_pq0 = vaddw_u8(out, *p0q0);
+    out_pq0 = vaddw_u8(out_pq0, q1p1);
+    out_pq0 = vaddw_u8(out_pq0, q2p2);
+
+    out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+    out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+    out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+  }
+  {
+    // filter 14
+    uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5;
+    uint16x8_t p6q6_2, p6q6_temp, qp_sum;
+    uint8x8_t qp_rev;
+
+    out = vaddw_u8(out, *p4q4);
+    out = vaddw_u8(out, *p5q5);
+    out = vaddw_u8(out, *p6q6);
+
+    out_pq5 = vaddw_u8(out, *p4q4);
+    out_pq4 = vaddw_u8(out_pq5, *p3q3);
+    out_pq3 = vaddw_u8(out_pq4, *p2q2);
+
+    out_pq5 = vaddw_u8(out_pq5, *p5q5);
+    out_pq4 = vaddw_u8(out_pq4, *p5q5);
+
+    out_pq0 = vaddw_u8(out, *p1q1);
+    out_pq1 = vaddw_u8(out_pq0, *p2q2);
+    out_pq2 = vaddw_u8(out_pq1, *p3q3);
+
+    out_pq0 = vaddw_u8(out_pq0, *p0q0);
+    out_pq1 = vaddw_u8(out_pq1, *p0q0);
+
+    out_pq1 = vaddw_u8(out_pq1, *p6q6);
+    p6q6_2 = vaddl_u8(*p6q6, *p6q6);
+    out_pq2 = vaddq_u16(out_pq2, p6q6_2);
+    p6q6_temp = vaddw_u8(p6q6_2, *p6q6);
+    out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
+    p6q6_temp = vaddw_u8(p6q6_temp, *p6q6);
+    out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
+    p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2);
+    out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
+
+    out_pq4 = vaddw_u8(out_pq4, q1p1);
+
+    qp_sum = vaddl_u8(q2p2, q1p1);
+    out_pq3 = vaddq_u16(out_pq3, qp_sum);
+
+    qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3)));
+    qp_sum = vaddw_u8(qp_sum, qp_rev);
+    out_pq2 = vaddq_u16(out_pq2, qp_sum);
+
+    qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4)));
+    qp_sum = vaddw_u8(qp_sum, qp_rev);
+    out_pq1 = vaddq_u16(out_pq1, qp_sum);
+
+    qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5)));
+    qp_sum = vaddw_u8(qp_sum, qp_rev);
+    out_pq0 = vaddq_u16(out_pq0, qp_sum);
+
+    out_pq0 = vaddw_u8(out_pq0, q0p0);
+
+    out_f14_pq0 = vrshrn_n_u16(out_pq0, 4);
+    out_f14_pq1 = vrshrn_n_u16(out_pq1, 4);
+    out_f14_pq2 = vrshrn_n_u16(out_pq2, 4);
+    out_f14_pq3 = vrshrn_n_u16(out_pq3, 4);
+    out_f14_pq4 = vrshrn_n_u16(out_pq4, 4);
+    out_f14_pq5 = vrshrn_n_u16(out_pq5, 4);
+  }
+  {
+    uint8x8_t filter4_cond, filter8_cond, filter14_cond;
+    filter8_cond = vand_u8(flat_8x8, mask_8x8);
+    filter4_cond = vmvn_u8(filter8_cond);
+    filter14_cond = vand_u8(filter8_cond, flat2_8x8);
+
+    // filter4 outputs
+    *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+    // filter8 outputs
+    *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+    *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+
+    // filter14 outputs
+    *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1);
+    *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2);
+    *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3);
+    *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
+    *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
+  }
+}
+
+static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+                       uint8x8_t *p0q0, const uint8_t blimit,
+                       const uint8_t limit, const uint8_t thresh) {
+  uint16x8_t out;
+  uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+  uint8x8_t out_f4_pq0, out_f4_pq1;
+  uint8x8_t mask_8x8, flat_8x8;
+
+  // Calculate filter masks
+  mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+  flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+  {
+    // filter 4
+    int32x2x2_t ps0_qs0, ps1_qs1;
+    int16x8_t filter_s16;
+    const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+    uint8x8_t temp0_8x8, temp1_8x8;
+    int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+    int8x8_t op0, oq0, op1, oq1;
+    int8x8_t pq_s0, pq_s1;
+    int8x8_t filter_s8, filter1_s8, filter2_s8;
+    int8x8_t hev_8x8;
+    const int8x8_t sign_mask = vdup_n_s8(0x80);
+    const int8x8_t val_4 = vdup_n_s8(4);
+    const int8x8_t val_3 = vdup_n_s8(3);
+
+    pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+    pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+    ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+    ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+    ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+    qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+    ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+    qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+    // hev_mask
+    temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+    temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+    hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+    // add outer taps if we have high edge variance
+    filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+    filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+    // inner taps
+    temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+    filter_s16 = vmovl_s8(filter_s8);
+    filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+    filter_s8 = vqmovn_s16(filter_s16);
+    filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
 
-void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
-  aom_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+    filter1_s8 = vqadd_s8(filter_s8, val_4);
+    filter2_s8 = vqadd_s8(filter_s8, val_3);
+    filter1_s8 = vshr_n_s8(filter1_s8, 3);
+    filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+    oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+    op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+    hev_8x8 = vmvn_s8(hev_8x8);
+    filter_s8 = vrshr_n_s8(filter1_s8, 1);
+    filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+    oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+    op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+    out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+    out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+  }
+  {
+    // filter 8
+    uint16x8_t out_pq0, out_pq1, out_pq2;
+    uint8x8_t q0p0, q1p1, q2p2;
+
+    out = vaddl_u8(*p3q3, *p2q2);
+    out = vaddw_u8(out, *p1q1);
+    out = vaddw_u8(out, *p0q0);
+
+    // reverse p and q
+    q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+    q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+    q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+
+    out = vaddw_u8(out, q0p0);
+    out_pq1 = vaddw_u8(out, *p3q3);
+    out_pq2 = vaddw_u8(out_pq1, *p3q3);
+    out_pq2 = vaddw_u8(out_pq2, *p2q2);
+    out_pq1 = vaddw_u8(out_pq1, *p1q1);
+    out_pq1 = vaddw_u8(out_pq1, q1p1);
+
+    out_pq0 = vaddw_u8(out, *p0q0);
+    out_pq0 = vaddw_u8(out_pq0, q1p1);
+    out_pq0 = vaddw_u8(out_pq0, q2p2);
+
+    out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+    out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+    out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+  }
+  {
+    uint8x8_t filter4_cond, filter8_cond;
+    filter8_cond = vand_u8(flat_8x8, mask_8x8);
+    filter4_cond = vmvn_u8(filter8_cond);
+
+    // filter4 outputs
+    *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+    // filter8 outputs
+    *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+    *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+  }
+}
+
+static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
+                       const uint8_t blimit, const uint8_t limit,
+                       const uint8_t thresh) {
+  uint16x8_t out;
+  uint8x8_t out_f6_pq0, out_f6_pq1;
+  uint8x8_t out_f4_pq0, out_f4_pq1;
+  uint8x8_t mask_8x8, flat_8x8;
+
+  // Calculate filter masks
+  mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
+  flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
+  {
+    // filter 4
+    int32x2x2_t ps0_qs0, ps1_qs1;
+    int16x8_t filter_s16;
+    const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+    uint8x8_t temp0_8x8, temp1_8x8;
+    int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+    int8x8_t op0, oq0, op1, oq1;
+    int8x8_t pq_s0, pq_s1;
+    int8x8_t filter_s8, filter1_s8, filter2_s8;
+    int8x8_t hev_8x8;
+    const int8x8_t sign_mask = vdup_n_s8(0x80);
+    const int8x8_t val_4 = vdup_n_s8(4);
+    const int8x8_t val_3 = vdup_n_s8(3);
+
+    pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+    pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+    ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+    ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+    ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+    qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+    ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+    qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+    // hev_mask
+    temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+    temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+    hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+    // add outer taps if we have high edge variance
+    filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+    filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+    // inner taps
+    temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+    filter_s16 = vmovl_s8(filter_s8);
+    filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+    filter_s8 = vqmovn_s16(filter_s16);
+    filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+    filter1_s8 = vqadd_s8(filter_s8, val_4);
+    filter2_s8 = vqadd_s8(filter_s8, val_3);
+    filter1_s8 = vshr_n_s8(filter1_s8, 3);
+    filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+    oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+    op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+    filter_s8 = vrshr_n_s8(filter1_s8, 1);
+    filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+    oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+    op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+    out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+    out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+  }
+  {
+    // filter 6
+    uint16x8_t out_pq0, out_pq1;
+    uint8x8_t pq_rev;
+
+    out = vaddl_u8(*p0q0, *p1q1);
+    out = vaddq_u16(out, out);
+    out = vaddw_u8(out, *p2q2);
+
+    pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+    out = vaddw_u8(out, pq_rev);
+
+    out_pq0 = vaddw_u8(out, pq_rev);
+    pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+    out_pq0 = vaddw_u8(out_pq0, pq_rev);
+
+    out_pq1 = vaddw_u8(out, *p2q2);
+    out_pq1 = vaddw_u8(out_pq1, *p2q2);
+
+    out_f6_pq0 = vrshrn_n_u16(out_pq0, 3);
+    out_f6_pq1 = vrshrn_n_u16(out_pq1, 3);
+  }
+  {
+    uint8x8_t filter4_cond, filter6_cond;
+    filter6_cond = vand_u8(flat_8x8, mask_8x8);
+    filter4_cond = vmvn_u8(filter6_cond);
+
+    // filter4 outputs
+    *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+    // filter6 outputs
+    *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
+  }
+}
+
+void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh) {
+  uint8x16_t row0, row1, row2, row3;
+  uint8x8_t pxp3, p6p2, p5p1, p4p0;
+  uint8x8_t q0q4, q1q5, q2q6, q3qy;
+  uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3;
+  uint32x2_t pq_rev;
+  uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6;
+
+  // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+  // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+  // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+  // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+  load_u8_8x16(src - 8, stride, &row0, &row1, &row2, &row3);
+
+  pxp3 = vget_low_u8(row0);
+  p6p2 = vget_low_u8(row1);
+  p5p1 = vget_low_u8(row2);
+  p4p0 = vget_low_u8(row3);
+  transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+
+  q0q4 = vget_high_u8(row0);
+  q1q5 = vget_high_u8(row1);
+  q2q6 = vget_high_u8(row2);
+  q3qy = vget_high_u8(row3);
+  transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
+  pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5));
+  p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4));
+  p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6));
+  p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev);
+
+  p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+  p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+  p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+  p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+  p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+  p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+  p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+
+  lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+              *thresh);
+
+  pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3));
+  p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1));
+  p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0));
+  p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2));
+
+  pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]);
+  p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]);
+  p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]);
+  p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]);
+
+  q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+  q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+  q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+  q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+  transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+
+  pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
+  p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+  p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+  p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+  transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+
+  row0 = vcombine_u8(pxp3, q0q4);
+  row1 = vcombine_u8(p6p2, q1q5);
+  row2 = vcombine_u8(p5p1, q2q6);
+  row3 = vcombine_u8(p4p0, q3qy);
+
+  store_u8_8x16(src - 8, stride, row0, row1, row2, row3);
 }
 
-#if HAVE_NEON_ASM
-void aom_lpf_horizontal_8_dual_neon(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
+void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  uint32x2x2_t p2q2_p1q1, p3q3_p0q0;
+  uint32x2_t pq_rev;
+  uint8x8_t p3q0, p2q1, p1q2, p0q3;
+  uint8x8_t p0q0, p1q1, p2q2, p3q3;
+
+  // row0: p3 p2 p1 p0 | q0 q1 q2 q3
+  // row1: p3 p2 p1 p0 | q0 q1 q2 q3
+  // row2: p3 p2 p1 p0 | q0 q1 q2 q3
+  // row3: p3 p2 p1 p0 | q0 q1 q2 q3
+  load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
+
+  transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
+  p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+
+  p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
+  p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+  p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+  p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+
+  lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+  p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+  p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
+  p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+  p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+  p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+  transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+
+  store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
 }
 
-void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
-  aom_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t p0q0, p1q1, p2q2, p3q3;
+
+  p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride)));
+  p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
+  p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
+  p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
+  p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
+                                           vreinterpret_u32_u8(p0q0), 1));
+  p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
+                                           vreinterpret_u32_u8(p1q1), 1));
+  p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
+                                           vreinterpret_u32_u8(p2q2), 1));
+  p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride),
+                                           vreinterpret_u32_u8(p3q3), 1));
+
+  lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+  vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0);
+  vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
+  vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
+  vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
+  vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
+  vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
+  vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
+  vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1);
 }
 
-void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit,
-                                   const uint8_t *thresh) {
-  aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
-  aom_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t p0q0, p1q1, p2q2;
+
+  p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
+  p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
+  p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
+  p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
+                                           vreinterpret_u32_u8(p0q0), 1));
+  p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
+                                           vreinterpret_u32_u8(p1q1), 1));
+  p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
+                                           vreinterpret_u32_u8(p2q2), 1));
+
+  lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+  vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
+  vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
+  vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
+  vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
+  vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
+  vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
 }
-#endif  // HAVE_NEON_ASM
diff --git a/third_party/aom/aom_dsp/arm/sad4d_neon.c b/third_party/aom/aom_dsp/arm/sad4d_neon.c
index a1eeaf4b7..606950ab2 100644
--- a/third_party/aom/aom_dsp/arm/sad4d_neon.c
+++ b/third_party/aom/aom_dsp/arm/sad4d_neon.c
@@ -11,8 +11,9 @@
 
 #include <arm_neon.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c
index 2f452f55b..a39de91d6 100644
--- a/third_party/aom/aom_dsp/arm/sad_neon.c
+++ b/third_party/aom/aom_dsp/arm/sad_neon.c
@@ -11,7 +11,7 @@
 
 #include <arm_neon.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
 
diff --git a/third_party/aom/aom_dsp/arm/save_reg_neon.asm b/third_party/aom/aom_dsp/arm/save_reg_neon.asm
deleted file mode 100644
index e04969823..000000000
--- a/third_party/aom/aom_dsp/arm/save_reg_neon.asm
+++ /dev/null
@@ -1,39 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-    EXPORT  |aom_push_neon|
-    EXPORT  |aom_pop_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|aom_push_neon| PROC
-    vst1.i64            {d8, d9, d10, d11}, [r0]!
-    vst1.i64            {d12, d13, d14, d15}, [r0]!
-    bx              lr
-
-    ENDP
-
-|aom_pop_neon| PROC
-    vld1.i64            {d8, d9, d10, d11}, [r0]!
-    vld1.i64            {d12, d13, d14, d15}, [r0]!
-    bx              lr
-
-    ENDP
-
-    END
-
diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
index 064b72d6f..44d821821 100644
--- a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
+++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
@@ -10,8 +10,9 @@
  */
 
 #include <arm_neon.h>
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
 
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c
index cb8a2daf8..28f5ace8e 100644
--- a/third_party/aom/aom_dsp/arm/subtract_neon.c
+++ b/third_party/aom/aom_dsp/arm/subtract_neon.c
@@ -11,7 +11,8 @@
 
 #include <arm_neon.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c
index dbab287e3..74385a601 100644
--- a/third_party/aom/aom_dsp/arm/variance_neon.c
+++ b/third_party/aom/aom_dsp/arm/variance_neon.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
diff --git a/third_party/aom/aom_dsp/avg.c b/third_party/aom/aom_dsp/avg.c
deleted file mode 100644
index f732224fd..000000000
--- a/third_party/aom/aom_dsp/avg.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <stdlib.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_ports/mem.h"
-
-// src_diff: first pass, 9 bit, dynamic range [-255, 255]
-//           second pass, 12 bit, dynamic range [-2040, 2040]
-static void hadamard_col8(const int16_t *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
-  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
-  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
-  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
-  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
-  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
-  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
-  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
-
-  int16_t c0 = b0 + b2;
-  int16_t c1 = b1 + b3;
-  int16_t c2 = b0 - b2;
-  int16_t c3 = b1 - b3;
-  int16_t c4 = b4 + b6;
-  int16_t c5 = b5 + b7;
-  int16_t c6 = b4 - b6;
-  int16_t c7 = b5 - b7;
-
-  coeff[0] = c0 + c4;
-  coeff[7] = c1 + c5;
-  coeff[3] = c2 + c6;
-  coeff[4] = c3 + c7;
-  coeff[2] = c0 - c4;
-  coeff[6] = c1 - c5;
-  coeff[1] = c2 - c6;
-  coeff[5] = c3 - c7;
-}
-
-// The order of the output coeff of the hadamard is not important. For
-// optimization purposes the final transpose may be skipped.
-void aom_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
-                        int16_t *coeff) {
-  int idx;
-  int16_t buffer[64];
-  int16_t *tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
-                                                   // dynamic range [-255, 255]
-    tmp_buf += 8;
-    ++src_diff;
-  }
-
-  tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
-                                       // dynamic range [-2040, 2040]
-    coeff += 8;                        // coeff: 15 bit
-                                       // dynamic range [-16320, 16320]
-    ++tmp_buf;
-  }
-}
-
-// In place 16x16 2D Hadamard transform
-void aom_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    // src_diff: 9 bit, dynamic range [-255, 255]
-    const int16_t *src_ptr =
-        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  // coeff: 15 bit, dynamic range [-16320, 16320]
-  for (idx = 0; idx < 64; ++idx) {
-    int16_t a0 = coeff[0];
-    int16_t a1 = coeff[64];
-    int16_t a2 = coeff[128];
-    int16_t a3 = coeff[192];
-
-    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
-    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
-    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
-    int16_t b3 = (a2 - a3) >> 1;
-
-    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
-    coeff[64] = b1 + b3;
-    coeff[128] = b0 - b2;
-    coeff[192] = b1 - b3;
-
-    ++coeff;
-  }
-}
-
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
-int aom_satd_c(const int16_t *coeff, int length) {
-  int i;
-  int satd = 0;
-  for (i = 0; i < length; ++i) satd += abs(coeff[i]);
-
-  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-  return satd;
-}
-
-// Integer projection onto row vectors.
-// height: value range {16, 32, 64}.
-void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, int ref_stride,
-                       int height) {
-  int idx;
-  const int norm_factor = height >> 1;
-  for (idx = 0; idx < 16; ++idx) {
-    int i;
-    hbuf[idx] = 0;
-    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
-    for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
-    // hbuf[idx]: 9 bit, dynamic range [0, 510].
-    hbuf[idx] /= norm_factor;
-    ++ref;
-  }
-}
-
-// width: value range {16, 32, 64}.
-int16_t aom_int_pro_col_c(const uint8_t *ref, int width) {
-  int idx;
-  int16_t sum = 0;
-  // sum: 14 bit, dynamic range [0, 16320]
-  for (idx = 0; idx < width; ++idx) sum += ref[idx];
-  return sum;
-}
-
-// ref: [0 - 510]
-// src: [0 - 510]
-// bwl: {2, 3, 4}
-int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) {
-  int i;
-  int width = 4 << bwl;
-  int sse = 0, mean = 0, var;
-
-  for (i = 0; i < width; ++i) {
-    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
-    mean += diff;                // mean: dynamic range 16 bits.
-    sse += diff * diff;          // sse:  dynamic range 26 bits.
-  }
-
-  // (mean * mean): dynamic range 31 bits.
-  var = sse - ((mean * mean) >> (bwl + 2));
-  return var;
-}
-
-void aom_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref,
-                      int ref_stride, int *min, int *max) {
-  int i, j;
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(src[j] - ref[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-
-#if CONFIG_HIGHBITDEPTH
-void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
-                             int dp, int *min, int *max) {
-  int i, j;
-  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
-  const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j] - d[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c
index 4f38afbc5..d05c3efdc 100644
--- a/third_party/aom/aom_dsp/binary_codes_reader.c
+++ b/third_party/aom/aom_dsp/binary_codes_reader.c
@@ -33,17 +33,6 @@ static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
   }
 }
 
-int16_t aom_read_primitive_symmetric_(aom_reader *r,
-                                      unsigned int mag_bits ACCT_STR_PARAM) {
-  if (aom_read_bit(r, ACCT_STR_NAME)) {
-    int s = aom_read_bit(r, ACCT_STR_NAME);
-    int16_t x = aom_read_literal(r, mag_bits, ACCT_STR_NAME) + 1;
-    return (s > 0 ? -x : x);
-  } else {
-    return 0;
-  }
-}
-
 uint16_t aom_read_primitive_quniform_(aom_reader *r,
                                       uint16_t n ACCT_STR_PARAM) {
   if (n <= 1) return 0;
@@ -62,76 +51,56 @@ static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb,
   return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);
 }
 
-uint16_t aom_read_primitive_refbilevel_(aom_reader *r, uint16_t n, uint16_t p,
-                                        uint16_t ref ACCT_STR_PARAM) {
-  if (n <= 1) return 0;
-  assert(p > 0 && p <= n);
-  assert(ref < n);
-  int lolimit = ref - p / 2;
-  const int hilimit = lolimit + p - 1;
-  if (lolimit < 0) {
-    lolimit = 0;
-  } else if (hilimit >= n) {
-    lolimit = n - p;
-  }
-  int v;
-  if (aom_read_bit(r, ACCT_STR_NAME)) {
-    v = aom_read_primitive_quniform(r, p, ACCT_STR_NAME) + lolimit;
-  } else {
-    v = aom_read_primitive_quniform(r, n - p, ACCT_STR_NAME);
-    if (v >= lolimit) v += p;
-  }
-  return v;
-}
-
 // Decode finite subexponential code that for a symbol v in [0, n-1] with
 // parameter k
 uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
                                        uint16_t k ACCT_STR_PARAM) {
   int i = 0;
   int mk = 0;
-  uint16_t v;
+
   while (1) {
     int b = (i ? k + i - 1 : k);
     int a = (1 << b);
+
     if (n <= mk + 3 * a) {
-      v = aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk;
-      break;
-    } else {
-      if (aom_read_bit(r, ACCT_STR_NAME)) {
-        i = i + 1;
-        mk += a;
-      } else {
-        v = aom_read_literal(r, b, ACCT_STR_NAME) + mk;
-        break;
-      }
+      return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk;
+    }
+
+    if (!aom_read_bit(r, ACCT_STR_NAME)) {
+      return aom_read_literal(r, b, ACCT_STR_NAME) + mk;
     }
+
+    i = i + 1;
+    mk += a;
   }
-  return v;
+
+  assert(0);
+  return 0;
 }
 
 static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb,
                                                 uint16_t n, uint16_t k) {
   int i = 0;
   int mk = 0;
-  uint16_t v;
+
   while (1) {
     int b = (i ? k + i - 1 : k);
     int a = (1 << b);
+
     if (n <= mk + 3 * a) {
-      v = aom_rb_read_primitive_quniform(rb, n - mk) + mk;
-      break;
-    } else {
-      if (aom_rb_read_bit(rb)) {
-        i = i + 1;
-        mk += a;
-      } else {
-        v = aom_rb_read_literal(rb, b) + mk;
-        break;
-      }
+      return aom_rb_read_primitive_quniform(rb, n - mk) + mk;
     }
+
+    if (!aom_rb_read_bit(rb)) {
+      return aom_rb_read_literal(rb, b) + mk;
+    }
+
+    i = i + 1;
+    mk += a;
   }
-  return v;
+
+  assert(0);
+  return 0;
 }
 
 uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
@@ -146,20 +115,19 @@ static uint16_t aom_rb_read_primitive_refsubexpfin(
                                     aom_rb_read_primitive_subexpfin(rb, n, k));
 }
 
-// Decode finite subexponential code that for a symbol v in [-(n-1), n-1] with
-// parameter k based on a reference ref also in [-(n-1), n-1].
-int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n,
-                                                uint16_t k,
-                                                int16_t ref ACCT_STR_PARAM) {
-  ref += n - 1;
-  const uint16_t scaled_n = (n << 1) - 1;
-  return aom_read_primitive_refsubexpfin(r, scaled_n, k, ref, ACCT_STR_NAME) -
-         n + 1;
-}
-
 int16_t aom_rb_read_signed_primitive_refsubexpfin(
     struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
   ref += n - 1;
   const uint16_t scaled_n = (n << 1) - 1;
   return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
 }
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
+  int leading_zeros = 0;
+  while (!aom_rb_read_bit(rb)) ++leading_zeros;
+  // Maximum 32 bits.
+  if (leading_zeros >= 32) return UINT32_MAX;
+  const uint32_t base = (1u << leading_zeros) - 1;
+  const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
+  return base + value;
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h
index 8885142c9..5253c6154 100644
--- a/third_party/aom/aom_dsp/binary_codes_reader.h
+++ b/third_party/aom/aom_dsp/binary_codes_reader.h
@@ -18,40 +18,30 @@ extern "C" {
 
 #include <assert.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/bitreader.h"
 #include "aom_dsp/bitreader_buffer.h"
 
-#define aom_read_primitive_symmetric(r, n, ACCT_STR_NAME) \
-  aom_read_primitive_symmetric_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
 #define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \
   aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_primitive_refbilevel(r, n, p, ref, ACCT_STR_NAME) \
-  aom_read_primitive_refbilevel_(r, n, p, ref ACCT_STR_ARG(ACCT_STR_NAME))
 #define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \
   aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME))
 #define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \
   aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_signed_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \
-  aom_read_signed_primitive_refsubexpfin_(r, n, k,                          \
-                                          ref ACCT_STR_ARG(ACCT_STR_NAME))
 
-int16_t aom_read_primitive_symmetric_(aom_reader *r,
-                                      unsigned int mag_bits ACCT_STR_PARAM);
 uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM);
-uint16_t aom_read_primitive_refbilevel_(aom_reader *r, uint16_t n, uint16_t p,
-                                        uint16_t ref ACCT_STR_PARAM);
 uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
                                        uint16_t k ACCT_STR_PARAM);
 uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
                                           uint16_t ref ACCT_STR_PARAM);
-int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n,
-                                                uint16_t k,
-                                                int16_t ref ACCT_STR_PARAM);
 
 int16_t aom_rb_read_signed_primitive_refsubexpfin(
     struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref);
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c
index e092b6278..8f74f0942 100644
--- a/third_party/aom/aom_dsp/binary_codes_writer.c
+++ b/third_party/aom/aom_dsp/binary_codes_writer.c
@@ -89,61 +89,6 @@ int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
   return v < m ? l - 1 : l;
 }
 
-// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1]
-// The closest p values of v from ref are coded using a p-ary quasi-unoform
-// short code while the remaining n-p values are coded with a longer code.
-void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p,
-                                    uint16_t ref, uint16_t v) {
-  if (n <= 1) return;
-  assert(p > 0 && p <= n);
-  assert(ref < n);
-  int lolimit = ref - p / 2;
-  int hilimit = lolimit + p - 1;
-  if (lolimit < 0) {
-    lolimit = 0;
-    hilimit = p - 1;
-  } else if (hilimit >= n) {
-    hilimit = n - 1;
-    lolimit = n - p;
-  }
-  if (v >= lolimit && v <= hilimit) {
-    aom_write_bit(w, 1);
-    v = v - lolimit;
-    aom_write_primitive_quniform(w, p, v);
-  } else {
-    aom_write_bit(w, 0);
-    if (v > hilimit) v -= p;
-    aom_write_primitive_quniform(w, n - p, v);
-  }
-}
-
-int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref,
-                                   uint16_t v) {
-  if (n <= 1) return 0;
-  assert(p > 0 && p <= n);
-  assert(ref < n);
-  int lolimit = ref - p / 2;
-  int hilimit = lolimit + p - 1;
-  if (lolimit < 0) {
-    lolimit = 0;
-    hilimit = p - 1;
-  } else if (hilimit >= n) {
-    hilimit = n - 1;
-    lolimit = n - p;
-  }
-  int count = 0;
-  if (v >= lolimit && v <= hilimit) {
-    count++;
-    v = v - lolimit;
-    count += aom_count_primitive_quniform(p, v);
-  } else {
-    count++;
-    if (v > hilimit) v -= p;
-    count += aom_count_primitive_quniform(n - p, v);
-  }
-  return count;
-}
-
 // Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
 void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
                                    uint16_t v) {
@@ -263,3 +208,15 @@ int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
   const uint16_t scaled_n = (n << 1) - 1;
   return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
 }
+
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) {
+  int64_t shift_val = ++v;
+  int leading_zeroes = 1;
+
+  assert(shift_val > 0);
+
+  while (shift_val >>= 1) leading_zeroes += 2;
+
+  aom_wb_write_literal(wb, 0, leading_zeroes >> 1);
+  aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1);
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h
index 18ad5078f..784c721a6 100644
--- a/third_party/aom/aom_dsp/binary_codes_writer.h
+++ b/third_party/aom/aom_dsp/binary_codes_writer.h
@@ -17,7 +17,8 @@ extern "C" {
 #endif
 
 #include <assert.h>
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/bitwriter.h"
 #include "aom_dsp/bitwriter_buffer.h"
@@ -33,12 +34,6 @@ void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
 // Encodes a value v in [0, n-1] quasi-uniformly
 void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v);
 
-// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1]
-// The closest p values of v from ref are coded using a p-ary quasi-unoform
-// short code while the remaining n-p values are coded with a longer code.
-void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p,
-                                    uint16_t ref, uint16_t v);
-
 // Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
 void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
                                    uint16_t v);
@@ -61,13 +56,12 @@ void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
 // Functions that counts bits for the above primitives
 int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits);
 int aom_count_primitive_quniform(uint16_t n, uint16_t v);
-int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref,
-                                   uint16_t v);
 int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v);
 int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
                                      uint16_t v);
 int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
                                             int16_t v);
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h
index 00424fa76..328935be9 100644
--- a/third_party/aom/aom_dsp/bitreader.h
+++ b/third_party/aom/aom_dsp/bitreader.h
@@ -15,15 +15,11 @@
 #include <assert.h>
 #include <limits.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "aom/aomdx.h"
 #include "aom/aom_integer.h"
-#if CONFIG_ANS
-#include "aom_dsp/ansreader.h"
-#else
 #include "aom_dsp/daalaboolreader.h"
-#endif
 #include "aom_dsp/prob.h"
 #include "av1/common/odintrin.h"
 
@@ -50,72 +46,37 @@
 #define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
   aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
 
-#if CONFIG_LV_MAP
-#define aom_read_bin(r, cdf, nsymbs, ACCT_STR_NAME) \
-  aom_read_bin_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#if CONFIG_ANS
-typedef struct AnsDecoder aom_reader;
-#else
 typedef struct daala_reader aom_reader;
-#endif
 
 static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
-                                  size_t size, aom_decrypt_cb decrypt_cb,
-                                  void *decrypt_state) {
-  (void)decrypt_cb;
-  (void)decrypt_state;
-#if CONFIG_ANS
-  if (size > INT_MAX) return 1;
-  return ans_read_init(r, buffer, (int)size);
-#else
+                                  size_t size) {
   return aom_daala_reader_init(r, buffer, (int)size);
-#endif
+}
+
+static INLINE const uint8_t *aom_reader_find_begin(aom_reader *r) {
+  return aom_daala_reader_find_begin(r);
 }
 
 static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
-#if CONFIG_ANS
-  (void)r;
-  assert(0 && "Use the raw buffer size with ANS");
-  return NULL;
-#else
   return aom_daala_reader_find_end(r);
-#endif
 }
 
 static INLINE int aom_reader_has_error(aom_reader *r) {
-#if CONFIG_ANS
-  return ans_reader_has_error(r);
-#else
   return aom_daala_reader_has_error(r);
-#endif
 }
 
 // Returns the position in the bit reader in bits.
 static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
-#if CONFIG_ANS
-  (void)r;
-  assert(0 && "aom_reader_tell() is unimplemented for ANS");
-  return 0;
-#else
   return aom_daala_reader_tell(r);
-#endif
 }
 
 // Returns the position in the bit reader in 1/8th bits.
 static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
-#if CONFIG_ANS
-  (void)r;
-  assert(0 && "aom_reader_tell_frac() is unimplemented for ANS");
-  return 0;
-#else
   return aom_daala_reader_tell_frac(r);
-#endif
 }
 
 #if CONFIG_ACCOUNTING
@@ -139,11 +100,7 @@ static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) {
 
 static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
   int ret;
-#if CONFIG_ANS
-  ret = rabs_read(r, prob);
-#else
   ret = aom_daala_read(r, prob);
-#endif
 #if CONFIG_ACCOUNTING
   if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
   aom_update_symb_counts(r, 1);
@@ -153,15 +110,7 @@ static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
 
 static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
   int ret;
-#if CONFIG_ANS
-  ret = rabs_read_bit(r);  // Non trivial optimization at half probability
-#elif CONFIG_RAWBITS
-  // Note this uses raw bits and is not the same as aom_daala_read(r, 128);
-  // Calls to this function are omitted from raw symbol accounting.
-  ret = aom_daala_read_bit(r);
-#else
   ret = aom_read(r, 128, NULL);  // aom_prob_half
-#endif
 #if CONFIG_ACCOUNTING
   if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
 #endif
@@ -181,12 +130,7 @@ static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
 static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf,
                                 int nsymbs ACCT_STR_PARAM) {
   int ret;
-#if CONFIG_ANS
-  (void)nsymbs;
-  ret = rans_read(r, cdf);
-#else
   ret = daala_read_symbol(r, cdf, nsymbs);
-#endif
 
 #if CONFIG_ACCOUNTING
   if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
@@ -199,46 +143,7 @@ static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
                                    int nsymbs ACCT_STR_PARAM) {
   int ret;
   ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
-  update_cdf(cdf, ret, nsymbs);
-  return ret;
-}
-
-#if CONFIG_LV_MAP
-static INLINE int aom_read_bin_(aom_reader *r, aom_cdf_prob *cdf,
-                                int nsymbs ACCT_STR_PARAM) {
-  int ret;
-  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
-  update_cdf(cdf, ret, nsymbs);
-  return ret;
-}
-#endif
-
-static INLINE int aom_read_tree_as_cdf(aom_reader *r,
-                                       const aom_tree_index *tree,
-                                       const aom_prob *probs) {
-  aom_tree_index i = 0;
-  do {
-    aom_cdf_prob cdf[16];
-    aom_tree_index index[16];
-    int path[16];
-    int dist[16];
-    int nsymbs;
-    int symb;
-    nsymbs = tree_to_cdf(tree, probs, i, cdf, index, path, dist);
-    symb = aom_read_cdf(r, cdf, nsymbs, NULL);
-    OD_ASSERT(symb >= 0 && symb < nsymbs);
-    i = index[symb];
-  } while (i > 0);
-  return -i;
-}
-
-static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree,
-                                 const aom_prob *probs ACCT_STR_PARAM) {
-  int ret;
-  ret = aom_read_tree_as_cdf(r, tree, probs);
-#if CONFIG_ACCOUNTING
-  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
-#endif
+  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
   return ret;
 }
 
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c
index e51b1cc3a..68fc381f2 100644
--- a/third_party/aom/aom_dsp/bitreader_buffer.c
+++ b/third_party/aom/aom_dsp/bitreader_buffer.c
@@ -8,8 +8,9 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#include "./aom_config.h"
-#include "./bitreader_buffer.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitreader_buffer.h"
 
 size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) {
   return (rb->bit_offset + 7) >> 3;
@@ -35,9 +36,13 @@ int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
   return value;
 }
 
-int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
-  const int value = aom_rb_read_literal(rb, bits);
-  return aom_rb_read_bit(rb) ? -value : value;
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb,
+                                      int bits) {
+  uint32_t value = 0;
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    value |= (uint32_t)aom_rb_read_bit(rb) << bit;
+  return value;
 }
 
 int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h
index 22187357e..2dafe11ad 100644
--- a/third_party/aom/aom_dsp/bitreader_buffer.h
+++ b/third_party/aom/aom_dsp/bitreader_buffer.h
@@ -37,7 +37,7 @@ int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
 
 int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
 
-int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits);
 
 int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
 
diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h
index 7d3b34306..de1b1d048 100644
--- a/third_party/aom/aom_dsp/bitwriter.h
+++ b/third_party/aom/aom_dsp/bitwriter.h
@@ -13,13 +13,10 @@
 #define AOM_DSP_BITWRITER_H_
 
 #include <assert.h>
-#include "./aom_config.h"
 
-#if CONFIG_ANS
-#include "aom_dsp/buf_ans.h"
-#else
+#include "config/aom_config.h"
+
 #include "aom_dsp/daalaboolwriter.h"
-#endif
 #include "aom_dsp/prob.h"
 
 #if CONFIG_RD_DEBUG
@@ -31,23 +28,16 @@
 extern "C" {
 #endif
 
-#if CONFIG_ANS
-typedef struct BufAnsCoder aom_writer;
-#else
 typedef struct daala_writer aom_writer;
-#endif
 
 typedef struct TOKEN_STATS {
   int cost;
-#if CONFIG_VAR_TX
 #if CONFIG_RD_DEBUG
   int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE];
 #endif
-#endif
 } TOKEN_STATS;
 
 static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
-#if CONFIG_VAR_TX
 #if CONFIG_RD_DEBUG
   int r, c;
   for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
@@ -56,65 +46,23 @@ static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
     }
   }
 #endif
-#endif
   token_stats->cost = 0;
 }
 
 static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
-#if CONFIG_ANS
-  aom_buf_ans_alloc(bc, /* error context*/ NULL);
-  buf_ans_write_init(bc, buffer);
-#else
   aom_daala_start_encode(bc, buffer);
-#endif
 }
 
-static INLINE void aom_stop_encode(aom_writer *bc) {
-#if CONFIG_ANS
-  aom_buf_ans_flush(bc);
-  bc->pos = buf_ans_write_end(bc);
-#else
-  aom_daala_stop_encode(bc);
-#endif
+static INLINE int aom_stop_encode(aom_writer *bc) {
+  return aom_daala_stop_encode(bc);
 }
 
 static INLINE void aom_write(aom_writer *br, int bit, int probability) {
-#if CONFIG_ANS
-  buf_rabs_write(br, bit, probability);
-#else
   aom_daala_write(br, bit, probability);
-#endif
-}
-
-static INLINE void aom_write_record(aom_writer *br, int bit, int probability,
-                                    TOKEN_STATS *token_stats) {
-  aom_write(br, bit, probability);
-#if CONFIG_RD_DEBUG
-  token_stats->cost += av1_cost_bit(probability, bit);
-#else
-  (void)token_stats;
-#endif
 }
 
 static INLINE void aom_write_bit(aom_writer *w, int bit) {
-#if CONFIG_ANS
-  buf_rabs_write_bit(w, bit);
-#elif CONFIG_RAWBITS
-  // Note this uses raw bits and is not the same as aom_daala_write(r, 128);
-  aom_daala_write_bit(w, bit);
-#else
   aom_write(w, bit, 128);  // aom_prob_half
-#endif
-}
-
-static INLINE void aom_write_bit_record(aom_writer *w, int bit,
-                                        TOKEN_STATS *token_stats) {
-  aom_write_bit(w, bit);
-#if CONFIG_RD_DEBUG
-  token_stats->cost += av1_cost_bit(128, bit);  // aom_prob_half
-#else
-  (void)token_stats;
-#endif
 }
 
 static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
@@ -125,83 +73,13 @@ static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
 
 static INLINE void aom_write_cdf(aom_writer *w, int symb,
                                  const aom_cdf_prob *cdf, int nsymbs) {
-#if CONFIG_ANS
-  (void)nsymbs;
-  assert(cdf);
-  const aom_cdf_prob cum_prob = symb > 0 ? cdf[symb - 1] : 0;
-  const aom_cdf_prob prob = cdf[symb] - cum_prob;
-  buf_rans_write(w, cum_prob, prob);
-#else
   daala_write_symbol(w, symb, cdf, nsymbs);
-#endif
 }
 
 static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
                                     int nsymbs) {
   aom_write_cdf(w, symb, cdf, nsymbs);
-  update_cdf(cdf, symb, nsymbs);
-}
-
-#if CONFIG_LV_MAP
-static INLINE void aom_write_bin(aom_writer *w, int symb, aom_cdf_prob *cdf,
-                                 int nsymbs) {
-  aom_write_cdf(w, symb, cdf, nsymbs);
-  update_cdf(cdf, symb, nsymbs);
-}
-#endif
-
-static INLINE void aom_write_tree_as_cdf(aom_writer *w,
-                                         const aom_tree_index *tree,
-                                         const aom_prob *probs, int bits,
-                                         int len, aom_tree_index i) {
-  aom_tree_index root;
-  root = i;
-  do {
-    aom_cdf_prob cdf[16];
-    aom_tree_index index[16];
-    int path[16];
-    int dist[16];
-    int nsymbs;
-    int symb;
-    int j;
-    /* Compute the CDF of the binary tree using the given probabilities. */
-    nsymbs = tree_to_cdf(tree, probs, root, cdf, index, path, dist);
-    /* Find the symbol to code. */
-    symb = -1;
-    for (j = 0; j < nsymbs; j++) {
-      /* If this symbol codes a leaf node,  */
-      if (index[j] <= 0) {
-        if (len == dist[j] && path[j] == bits) {
-          symb = j;
-          break;
-        }
-      } else {
-        if (len > dist[j] && path[j] == bits >> (len - dist[j])) {
-          symb = j;
-          break;
-        }
-      }
-    }
-    OD_ASSERT(symb != -1);
-    aom_write_cdf(w, symb, cdf, nsymbs);
-    bits &= (1 << (len - dist[symb])) - 1;
-    len -= dist[symb];
-  } while (len);
-}
-
-static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
-                                  const aom_prob *probs, int bits, int len,
-                                  aom_tree_index i) {
-  aom_write_tree_as_cdf(w, tree, probs, bits, len, i);
-}
-
-static INLINE void aom_write_tree_record(aom_writer *w,
-                                         const aom_tree_index *tree,
-                                         const aom_prob *probs, int bits,
-                                         int len, aom_tree_index i,
-                                         TOKEN_STATS *token_stats) {
-  (void)token_stats;
-  aom_write_tree_as_cdf(w, tree, probs, bits, len, i);
+  if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs);
 }
 
 #ifdef __cplusplus
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c
index 1b3dd2913..21314eb2a 100644
--- a/third_party/aom/aom_dsp/bitwriter_buffer.c
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.c
@@ -12,8 +12,13 @@
 #include <limits.h>
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./bitwriter_buffer.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitwriter_buffer.h"
+
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) {
+  return (wb->bit_offset % CHAR_BIT == 0);
+}
 
 uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
   return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
@@ -48,6 +53,12 @@ void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
   for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
 }
 
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+                                   uint32_t data, int bits) {
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
+}
+
 void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
                               int bits) {
   int bit;
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h
index 1f23dc857..f7f75a097 100644
--- a/third_party/aom/aom_dsp/bitwriter_buffer.h
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.h
@@ -23,6 +23,8 @@ struct aom_write_bit_buffer {
   uint32_t bit_offset;
 };
 
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb);
+
 uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);
 
 void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);
@@ -31,6 +33,9 @@ void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit);
 
 void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);
 
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+                                   uint32_t data, int bits);
+
 void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
                               int bits);
 
diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h
index e5297ff83..434bb83a1 100644
--- a/third_party/aom/aom_dsp/blend.h
+++ b/third_party/aom/aom_dsp/blend.h
@@ -39,4 +39,7 @@
 // Blending by averaging.
 #define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
 
+#define DIFF_FACTOR_LOG2 4
+#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2)
+
 #endif  // AOM_DSP_BLEND_H_
diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c
index 99b4b8a59..0554b43d1 100644
--- a/third_party/aom/aom_dsp/blend_a64_hmask.c
+++ b/third_party/aom/aom_dsp/blend_a64_hmask.c
@@ -16,12 +16,12 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/blend.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
                            const uint8_t *src0, uint32_t src0_stride,
                            const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int h, int w) {
+                           const uint8_t *mask, int w, int h) {
   int i, j;
 
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
@@ -40,11 +40,10 @@ void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                   const uint8_t *src0_8, uint32_t src0_stride,
                                   const uint8_t *src1_8, uint32_t src1_stride,
-                                  const uint8_t *mask, int h, int w, int bd) {
+                                  const uint8_t *mask, int w, int h, int bd) {
   int i, j;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
   const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
@@ -68,4 +67,3 @@ void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c
index c35fa19f8..992cc5c0c 100644
--- a/third_party/aom/aom_dsp/blend_a64_mask.c
+++ b/third_party/aom/aom_dsp/blend_a64_mask.c
@@ -16,70 +16,209 @@
 #include "aom_dsp/blend.h"
 #include "aom_dsp/aom_dsp_common.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
-#if CONFIG_CONVOLVE_ROUND
 // Blending with alpha mask. Mask values come from the range [0, 64],
 // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
 // be the same as dst, or dst can be different from both sources.
 
-void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride,
-                              const int32_t *src0, uint32_t src0_stride,
-                              const int32_t *src1, uint32_t src1_stride,
-                              const uint8_t *mask, uint32_t mask_stride, int h,
-                              int w, int subh, int subw) {
+// NOTE(david.barker): The input and output of aom_blend_a64_d32_mask_c() are
+// in a higher intermediate precision, and will later be rounded down to pixel
+// precision.
+// Thus, in order to avoid double-rounding, we want to use normal right shifts
+// within this function, not ROUND_POWER_OF_TWO.
+// This works because of the identity:
+// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
+//
+// In contrast, the output of the non-d32 functions will not be further rounded,
+// so we *should* use ROUND_POWER_OF_TWO there.
+
+void aom_lowbd_blend_a64_d16_mask_c(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params) {
   int i, j;
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
 
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
 
-  assert(h >= 1);
-  assert(w >= 1);
+  assert(h >= 4);
+  assert(w >= 4);
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
   if (subw == 0 && subh == 0) {
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; ++j) {
+        int32_t res;
         const int m = mask[i * mask_stride + j];
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        res = ((m * (int32_t)src0[i * src0_stride + j] +
+                (AOM_BLEND_A64_MAX_ALPHA - m) *
+                    (int32_t)src1[i * src1_stride + j]) >>
+               AOM_BLEND_A64_ROUND_BITS);
+        res -= round_offset;
+        dst[i * dst_stride + j] =
+            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
       }
     }
   } else if (subw == 1 && subh == 1) {
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; ++j) {
+        int32_t res;
         const int m = ROUND_POWER_OF_TWO(
             mask[(2 * i) * mask_stride + (2 * j)] +
                 mask[(2 * i + 1) * mask_stride + (2 * j)] +
                 mask[(2 * i) * mask_stride + (2 * j + 1)] +
                 mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
             2);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        res = ((m * (int32_t)src0[i * src0_stride + j] +
+                (AOM_BLEND_A64_MAX_ALPHA - m) *
+                    (int32_t)src1[i * src1_stride + j]) >>
+               AOM_BLEND_A64_ROUND_BITS);
+        res -= round_offset;
+        dst[i * dst_stride + j] =
+            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
       }
     }
   } else if (subw == 1 && subh == 0) {
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; ++j) {
+        int32_t res;
         const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
                                     mask[i * mask_stride + (2 * j + 1)]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        res = ((m * (int32_t)src0[i * src0_stride + j] +
+                (AOM_BLEND_A64_MAX_ALPHA - m) *
+                    (int32_t)src1[i * src1_stride + j]) >>
+               AOM_BLEND_A64_ROUND_BITS);
+        res -= round_offset;
+        dst[i * dst_stride + j] =
+            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
       }
     }
   } else {
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; ++j) {
+        int32_t res;
         const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
                                     mask[(2 * i + 1) * mask_stride + j]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] +
+                         (AOM_BLEND_A64_MAX_ALPHA - m) *
+                             (int32_t)src1[i * src1_stride + j]) >>
+               AOM_BLEND_A64_ROUND_BITS);
+        res -= round_offset;
+        dst[i * dst_stride + j] =
+            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
+      }
+    }
+  }
+}
+
+void aom_highbd_blend_a64_d16_mask_c(
+    uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params, const int bd) {
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  // excerpt from clip_pixel_highbd()
+  // set saturation_value to (1 << bd) - 1
+  unsigned int saturation_value;
+  switch (bd) {
+    case 8:
+    default: saturation_value = 255; break;
+    case 10: saturation_value = 1023; break;
+    case 12: saturation_value = 4095; break;
+  }
+
+  if (subw == 0 && subh == 0) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        int32_t res;
+        const int m = mask[j];
+        res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+               AOM_BLEND_A64_ROUND_BITS);
+        res -= round_offset;
+        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+        dst[j] = AOMMIN(v, saturation_value);
+      }
+      mask += mask_stride;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        int32_t res;
+        const int m = ROUND_POWER_OF_TWO(
+            mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] +
+                mask[mask_stride + 2 * j + 1],
+            2);
+        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+              AOM_BLEND_A64_ROUND_BITS;
+        res -= round_offset;
+        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+        dst[j] = AOMMIN(v, saturation_value);
+      }
+      mask += 2 * mask_stride;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        int32_t res;
+        const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]);
+        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+              AOM_BLEND_A64_ROUND_BITS;
+        res -= round_offset;
+        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+        dst[j] = AOMMIN(v, saturation_value);
+      }
+      mask += mask_stride;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    }
+  } else {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        int32_t res;
+        const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]);
+        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+              AOM_BLEND_A64_ROUND_BITS;
+        res -= round_offset;
+        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+        dst[j] = AOMMIN(v, saturation_value);
       }
+      mask += 2 * mask_stride;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
     }
   }
 }
-#endif  // CONFIG_CONVOLVE_ROUND
 
 // Blending with alpha mask. Mask values come from the range [0, 64],
 // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
@@ -88,8 +227,8 @@ void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride,
 void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
                           const uint8_t *src0, uint32_t src0_stride,
                           const uint8_t *src1, uint32_t src1_stride,
-                          const uint8_t *mask, uint32_t mask_stride, int h,
-                          int w, int subh, int subw) {
+                          const uint8_t *mask, uint32_t mask_stride, int w,
+                          int h, int subw, int subh) {
   int i, j;
 
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
@@ -142,12 +281,11 @@ void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
                                  const uint8_t *src0_8, uint32_t src0_stride,
                                  const uint8_t *src1_8, uint32_t src1_stride,
                                  const uint8_t *mask, uint32_t mask_stride,
-                                 int h, int w, int subh, int subw, int bd) {
+                                 int w, int h, int subw, int subh, int bd) {
   int i, j;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
   const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
@@ -205,4 +343,3 @@ void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c
index 1a5e30e31..4f222e17f 100644
--- a/third_party/aom/aom_dsp/blend_a64_vmask.c
+++ b/third_party/aom/aom_dsp/blend_a64_vmask.c
@@ -16,12 +16,12 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/blend.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
                            const uint8_t *src0, uint32_t src0_stride,
                            const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int h, int w) {
+                           const uint8_t *mask, int w, int h) {
   int i, j;
 
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
@@ -41,11 +41,10 @@ void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                   const uint8_t *src0_8, uint32_t src0_stride,
                                   const uint8_t *src1_8, uint32_t src1_stride,
-                                  const uint8_t *mask, int h, int w, int bd) {
+                                  const uint8_t *mask, int w, int h, int bd) {
   int i, j;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
   const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
@@ -70,4 +69,3 @@ void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h
index f84ff3aed..cf7df1dbf 100644
--- a/third_party/aom/aom_dsp/buf_ans.h
+++ b/third_party/aom/aom_dsp/buf_ans.h
@@ -16,7 +16,8 @@
 // backwards due to ANS's stack like behavior.
 
 #include <assert.h>
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/ans.h"
 #include "aom_dsp/answriter.h"
@@ -47,6 +48,7 @@ struct BufAnsCoder {
   int window_size;
 #endif
   int pos;  // Dummy variable to store the output buffer after closing
+  uint8_t allow_update_cdf;
 };
 
 // Allocate a buffered ANS coder to store size symbols.
diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c
index c6e3ac82d..4e224904e 100644
--- a/third_party/aom/aom_dsp/daalaboolreader.c
+++ b/third_party/aom/aom_dsp/daalaboolreader.c
@@ -24,6 +24,10 @@ int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) {
   return 0;
 }
 
+const uint8_t *aom_daala_reader_find_begin(daala_reader *r) {
+  return r->buffer;
+}
+
 const uint8_t *aom_daala_reader_find_end(daala_reader *r) {
   return r->buffer_end;
 }
diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h
index 55ff8d3d5..60c197a49 100644
--- a/third_party/aom/aom_dsp/daalaboolreader.h
+++ b/third_party/aom/aom_dsp/daalaboolreader.h
@@ -34,11 +34,13 @@ struct daala_reader {
 #if CONFIG_ACCOUNTING
   Accounting *accounting;
 #endif
+  uint8_t allow_update_cdf;
 };
 
 typedef struct daala_reader daala_reader;
 
 int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size);
+const uint8_t *aom_daala_reader_find_begin(daala_reader *r);
 const uint8_t *aom_daala_reader_find_end(daala_reader *r);
 uint32_t aom_daala_reader_tell(const daala_reader *r);
 uint32_t aom_daala_reader_tell_frac(const daala_reader *r);
@@ -96,12 +98,6 @@ static INLINE int aom_daala_read(daala_reader *r, int prob) {
   return bit;
 }
 
-#if CONFIG_RAWBITS
-static INLINE int aom_daala_read_bit(daala_reader *r) {
-  return od_ec_dec_bits(&r->ec, 1, "aom_bits");
-}
-#endif
-
 static INLINE int aom_daala_reader_has_error(daala_reader *r) {
   return r->ec.error;
 }
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c
index 59af2a243..b24ffbf3f 100644
--- a/third_party/aom/aom_dsp/daalaboolwriter.c
+++ b/third_party/aom/aom_dsp/daalaboolwriter.c
@@ -18,11 +18,14 @@ void aom_daala_start_encode(daala_writer *br, uint8_t *source) {
   od_ec_enc_init(&br->ec, 62025);
 }
 
-void aom_daala_stop_encode(daala_writer *br) {
+int aom_daala_stop_encode(daala_writer *br) {
+  int nb_bits;
   uint32_t daala_bytes;
   unsigned char *daala_data;
   daala_data = od_ec_enc_done(&br->ec, &daala_bytes);
+  nb_bits = od_ec_enc_tell(&br->ec);
   memcpy(br->buffer, daala_data, daala_bytes);
   br->pos = daala_bytes;
   od_ec_enc_clear(&br->ec);
+  return nb_bits;
 }
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h
index 6ec0f0b54..f9c596c73 100644
--- a/third_party/aom/aom_dsp/daalaboolwriter.h
+++ b/third_party/aom/aom_dsp/daalaboolwriter.h
@@ -28,12 +28,13 @@ struct daala_writer {
   unsigned int pos;
   uint8_t *buffer;
   od_ec_enc ec;
+  uint8_t allow_update_cdf;
 };
 
 typedef struct daala_writer daala_writer;
 
 void aom_daala_start_encode(daala_writer *w, uint8_t *buffer);
-void aom_daala_stop_encode(daala_writer *w);
+int aom_daala_stop_encode(daala_writer *w);
 
 static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
   int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
@@ -53,12 +54,6 @@ static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
   od_ec_encode_bool_q15(&w->ec, bit, p);
 }
 
-#if CONFIG_RAWBITS
-static INLINE void aom_daala_write_bit(daala_writer *w, int bit) {
-  od_ec_enc_bits(&w->ec, bit, 1);
-}
-#endif
-
 static INLINE void daala_write_symbol(daala_writer *w, int symb,
                                       const aom_cdf_prob *cdf, int nsymbs) {
 #if CONFIG_BITSTREAM_DEBUG
diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c
index ad76b7e3e..aad96c6fc 100644
--- a/third_party/aom/aom_dsp/entcode.c
+++ b/third_party/aom/aom_dsp/entcode.c
@@ -9,10 +9,6 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifdef HAVE_CONFIG_H
-#include "./config.h"
-#endif
-
 #include "aom_dsp/entcode.h"
 
 /*Given the current total integer number of bits used and the current value of
diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h
index 981a951e6..5c15526e9 100644
--- a/third_party/aom/aom_dsp/entcode.h
+++ b/third_party/aom/aom_dsp/entcode.h
@@ -9,11 +9,16 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#if !defined(_entcode_H)
-#define _entcode_H (1)
+#ifndef AOM_DSP_ENTCODE_H_
+#define AOM_DSP_ENTCODE_H_
+
 #include <limits.h>
 #include <stddef.h>
 #include "av1/common/odintrin.h"
+#include "aom_dsp/prob.h"
+
+#define EC_PROB_SHIFT 6
+#define EC_MIN_PROB 4  // must be <= (1<<EC_PROB_SHIFT)/16
 
 /*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
    on a larger type, you can speed up the decoder by using it here.*/
@@ -21,22 +26,15 @@ typedef uint32_t od_ec_window;
 
 #define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
 
-/*The number of bits to use for the range-coded part of unsigned integers.*/
-#define OD_EC_UINT_BITS (4)
-
 /*The resolution of fractional-precision bit usage measurements, i.e.,
    3 => 1/8th bits.*/
 #define OD_BITRES (3)
 
-/*The value stored in an iCDF is 32768 minus the actual Q15 cumulative
-   probability (an "inverse" CDF).
-  This function converts from one representation to the other (and is its own
-   inverse).*/
-#define OD_ICDF(x) (32768U - (x))
+#define OD_ICDF AOM_ICDF
 
 /*See entcode.c for further documentation.*/
 
 OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
                                                uint32_t rng);
 
-#endif
+#endif  // AOM_DSP_ENTCODE_H_
diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c
index 71dad0df6..b8e9078c3 100644
--- a/third_party/aom/aom_dsp/entdec.c
+++ b/third_party/aom/aom_dsp/entdec.c
@@ -9,11 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifdef HAVE_CONFIG_H
-#include "./config.h"
-#endif
-
+#include <assert.h>
 #include "aom_dsp/entdec.h"
+#include "aom_dsp/prob.h"
 
 /*A range decoder.
   This is an entropy decoder based upon \cite{Mar79}, which is itself a
@@ -75,6 +73,8 @@
   Even relatively modest values like 100 would work fine.*/
 #define OD_EC_LOTS_OF_BITS (0x4000)
 
+/*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill
+   call.*/
 static void od_ec_dec_refill(od_ec_dec *dec) {
   int s;
   od_ec_window dif;
@@ -87,7 +87,7 @@ static void od_ec_dec_refill(od_ec_dec *dec) {
   end = dec->end;
   s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
   for (; s >= 0 && bptr < end; s -= 8, bptr++) {
-    OD_ASSERT(s <= OD_EC_WINDOW_SIZE - 8);
+    assert(s <= OD_EC_WINDOW_SIZE - 8);
     dif ^= (od_ec_window)bptr[0] << s;
     cnt += 8;
   }
@@ -111,7 +111,7 @@ static void od_ec_dec_refill(od_ec_dec *dec) {
 static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
                                int ret) {
   int d;
-  OD_ASSERT(rng <= 65535U);
+  assert(rng <= 65535U);
   d = 16 - OD_ILOG_NZ(rng);
   dec->cnt -= d;
   /*This is equivalent to shifting in 1's instead of 0's.*/
@@ -127,9 +127,6 @@ static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
 void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
                     uint32_t storage) {
   dec->buf = buf;
-  dec->eptr = buf + storage;
-  dec->end_window = 0;
-  dec->nend_bits = 0;
   dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
   dec->end = buf + storage;
   dec->bptr = buf;
@@ -150,13 +147,14 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
   unsigned r_new;
   unsigned v;
   int ret;
-  OD_ASSERT(0 < f);
-  OD_ASSERT(f < 32768U);
+  assert(0 < f);
+  assert(f < 32768U);
   dif = dec->dif;
   r = dec->rng;
-  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
-  OD_ASSERT(32768U <= r);
-  v = (r >> 8) * (uint32_t)f >> 7;
+  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+  assert(32768U <= r);
+  v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
+  v += EC_MIN_PROB;
   vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
   ret = 1;
   r_new = v;
@@ -170,8 +168,8 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
 
 /*Decodes a symbol given an inverse cumulative distribution function (CDF)
    table in Q15.
-  icdf: 32768 minus the CDF, such that symbol s falls in the range
-         [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]).
+  icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range
+         [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]).
         The values must be monotonically non-increasing, and icdf[nsyms - 1]
          must be 0.
   nsyms: The number of symbols in the alphabet.
@@ -187,62 +185,28 @@ int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) {
   (void)nsyms;
   dif = dec->dif;
   r = dec->rng;
-  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
-  OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U));
-  OD_ASSERT(32768U <= r);
+  const int N = nsyms - 1;
+
+  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+  assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
+  assert(32768U <= r);
+  assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0);
   c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
   v = r;
   ret = -1;
   do {
     u = v;
-    v = (r >> 8) * (uint32_t)icdf[++ret] >> 7;
+    v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >>
+         (7 - EC_PROB_SHIFT - CDF_SHIFT));
+    v += EC_MIN_PROB * (N - ret);
   } while (c < v);
-  OD_ASSERT(v < u);
-  OD_ASSERT(u <= r);
+  assert(v < u);
+  assert(u <= r);
   r = u - v;
   dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
   return od_ec_dec_normalize(dec, dif, r, ret);
 }
 
-#if CONFIG_RAWBITS
-/*Extracts a sequence of raw bits from the stream.
-  The bits must have been encoded with od_ec_enc_bits().
-  ftb: The number of bits to extract.
-       This must be between 0 and 25, inclusive.
-  Return: The decoded bits.*/
-uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) {
-  od_ec_window window;
-  int available;
-  uint32_t ret;
-  OD_ASSERT(ftb <= 25);
-  window = dec->end_window;
-  available = dec->nend_bits;
-  if ((unsigned)available < ftb) {
-    const unsigned char *buf;
-    const unsigned char *eptr;
-    buf = dec->buf;
-    eptr = dec->eptr;
-    OD_ASSERT(available <= OD_EC_WINDOW_SIZE - 8);
-    do {
-      if (eptr <= buf) {
-        dec->tell_offs += OD_EC_LOTS_OF_BITS - available;
-        available = OD_EC_LOTS_OF_BITS;
-        break;
-      }
-      window |= (od_ec_window) * --eptr << available;
-      available += 8;
-    } while (available <= OD_EC_WINDOW_SIZE - 8);
-    dec->eptr = eptr;
-  }
-  ret = (uint32_t)window & (((uint32_t)1 << ftb) - 1);
-  window >>= ftb;
-  available -= ftb;
-  dec->end_window = window;
-  dec->nend_bits = available;
-  return ret;
-}
-#endif
-
 /*Returns the number of bits "used" by the decoded symbols so far.
   This same number can be computed in either the encoder or the decoder, and is
    suitable for making coding decisions.
@@ -250,8 +214,7 @@ uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) {
           This will always be slightly larger than the exact value (e.g., all
            rounding error is in the positive direction).*/
 int od_ec_dec_tell(const od_ec_dec *dec) {
-  return (int)(((dec->end - dec->eptr) + (dec->bptr - dec->buf)) * 8 -
-               dec->cnt - dec->nend_bits + dec->tell_offs);
+  return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs);
 }
 
 /*Returns the number of bits "used" by the decoded symbols so far.
diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h
index 35ac7fe0d..e35c3f99f 100644
--- a/third_party/aom/aom_dsp/entdec.h
+++ b/third_party/aom/aom_dsp/entdec.h
@@ -32,16 +32,10 @@ typedef struct od_ec_dec od_ec_dec;
 struct od_ec_dec {
   /*The start of the current input buffer.*/
   const unsigned char *buf;
-  /*The read pointer for the raw bits.*/
-  const unsigned char *eptr;
-  /*Bits that will be read from/written at the end.*/
-  od_ec_window end_window;
-  /*Number of valid bits in end_window.*/
-  int nend_bits;
   /*An offset used to keep track of tell after reaching the end of the stream.
     This is constant throughout most of the decoding process, but becomes
      important once we hit the end of the buffer and stop incrementing pointers
-     (and instead pretend cnt/nend_bits have lots of bits).*/
+     (and instead pretend cnt has lots of bits).*/
   int32_t tell_offs;
   /*The end of the current input buffer.*/
   const unsigned char *end;
diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c
index b8c4dc047..6866de9b9 100644
--- a/third_party/aom/aom_dsp/entenc.c
+++ b/third_party/aom/aom_dsp/entenc.c
@@ -9,13 +9,19 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifdef HAVE_CONFIG_H
-#include "./config.h"
-#endif
-
 #include <stdlib.h>
 #include <string.h>
+#include <math.h>
+#include <assert.h>
 #include "aom_dsp/entenc.h"
+#include "aom_dsp/prob.h"
+
+#if OD_MEASURE_EC_OVERHEAD
+#if !defined(M_LOG2E)
+#define M_LOG2E (1.4426950408889634073599246810019)
+#endif
+#define OD_LOG2(x) (M_LOG2E * log(x))
+#endif  // OD_MEASURE_EC_OVERHEAD
 
 /*A range encoder.
   See entdec.c and the references for implementation details \cite{Mar79,MNW98}.
@@ -53,7 +59,7 @@ static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
   int c;
   int s;
   c = enc->cnt;
-  OD_ASSERT(rng <= 65535U);
+  assert(rng <= 65535U);
   d = 16 - OD_ILOG_NZ(rng);
   s = c + d;
   /*TODO: Right now we flush every time we have at least one byte available.
@@ -83,13 +89,13 @@ static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
     c += 16;
     m = (1 << c) - 1;
     if (s >= 8) {
-      OD_ASSERT(offs < storage);
+      assert(offs < storage);
       buf[offs++] = (uint16_t)(low >> c);
       low &= m;
       c -= 8;
       m >>= 8;
     }
-    OD_ASSERT(offs < storage);
+    assert(offs < storage);
     buf[offs++] = (uint16_t)(low >> c);
     s = c + d - 24;
     low &= m;
@@ -120,9 +126,6 @@ void od_ec_enc_init(od_ec_enc *enc, uint32_t size) {
 
 /*Reinitializes the encoder.*/
 void od_ec_enc_reset(od_ec_enc *enc) {
-  enc->end_offs = 0;
-  enc->end_window = 0;
-  enc->nend_bits = 0;
   enc->offs = 0;
   enc->low = 0;
   enc->rng = 0x8000;
@@ -143,31 +146,42 @@ void od_ec_enc_clear(od_ec_enc *enc) {
 }
 
 /*Encodes a symbol given its frequency in Q15.
-  fl: 32768 minus the cumulative frequency of all symbols that come before the
+  fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come
+  before the
        one to be encoded.
-  fh: 32768 minus the cumulative frequency of all symbols up to and including
+  fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and
+  including
        the one to be encoded.*/
-static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) {
+static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s,
+                             int nsyms) {
   od_ec_window l;
   unsigned r;
   unsigned u;
   unsigned v;
   l = enc->low;
   r = enc->rng;
-  OD_ASSERT(32768U <= r);
-  OD_ASSERT(fh < fl);
-  OD_ASSERT(fl <= 32768U);
-  if (fl < 32768U) {
-    u = (r >> 8) * (uint32_t)fl >> 7;
-    v = (r >> 8) * (uint32_t)fh >> 7;
+  assert(32768U <= r);
+  assert(fh <= fl);
+  assert(fl <= 32768U);
+  assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0);
+  const int N = nsyms - 1;
+  if (fl < CDF_PROB_TOP) {
+    u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >>
+         (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
+        EC_MIN_PROB * (N - (s - 1));
+    v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >>
+         (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
+        EC_MIN_PROB * (N - (s + 0));
     l += r - u;
     r = u - v;
   } else {
-    r -= (r >> 8) * (uint32_t)fh >> 7;
+    r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >>
+          (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
+         EC_MIN_PROB * (N - (s + 0));
   }
   od_ec_enc_normalize(enc, l, r);
 #if OD_MEASURE_EC_OVERHEAD
-  enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / 32768.);
+  enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP.);
   enc->nb_symbols++;
 #endif
 }
@@ -179,18 +193,18 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
   od_ec_window l;
   unsigned r;
   unsigned v;
-  OD_ASSERT(0 < f);
-  OD_ASSERT(f < 32768U);
+  assert(0 < f);
+  assert(f < 32768U);
   l = enc->low;
   r = enc->rng;
-  OD_ASSERT(32768U <= r);
-  v = (r >> 8) * (uint32_t)f >> 7;
+  assert(32768U <= r);
+  v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
+  v += EC_MIN_PROB;
   if (val) l += r - v;
   r = val ? v : r - v;
   od_ec_enc_normalize(enc, l, r);
 #if OD_MEASURE_EC_OVERHEAD
-  enc->entropy -=
-      OD_LOG2((double)(val ? 32768 - OD_ICDF(f) : OD_ICDF(f)) / 32768.);
+  enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.);
   enc->nb_symbols++;
 #endif
 }
@@ -206,67 +220,12 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
 void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf,
                           int nsyms) {
   (void)nsyms;
-  OD_ASSERT(s >= 0);
-  OD_ASSERT(s < nsyms);
-  OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U));
-  od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s]);
+  assert(s >= 0);
+  assert(s < nsyms);
+  assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
+  od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms);
 }
 
-#if CONFIG_RAWBITS
-/*Encodes a sequence of raw bits in the stream.
-  fl: The bits to encode.
-  ftb: The number of bits to encode.
-       This must be between 0 and 25, inclusive.*/
-void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) {
-  od_ec_window end_window;
-  int nend_bits;
-  OD_ASSERT(ftb <= 25);
-  OD_ASSERT(fl < (uint32_t)1 << ftb);
-#if OD_MEASURE_EC_OVERHEAD
-  enc->entropy += ftb;
-#endif
-  end_window = enc->end_window;
-  nend_bits = enc->nend_bits;
-  if (nend_bits + ftb > OD_EC_WINDOW_SIZE) {
-    unsigned char *buf;
-    uint32_t storage;
-    uint32_t end_offs;
-    buf = enc->buf;
-    storage = enc->storage;
-    end_offs = enc->end_offs;
-    if (end_offs + (OD_EC_WINDOW_SIZE >> 3) >= storage) {
-      unsigned char *new_buf;
-      uint32_t new_storage;
-      new_storage = 2 * storage + (OD_EC_WINDOW_SIZE >> 3);
-      new_buf = (unsigned char *)malloc(sizeof(*new_buf) * new_storage);
-      if (new_buf == NULL) {
-        enc->error = -1;
-        enc->end_offs = 0;
-        return;
-      }
-      OD_COPY(new_buf + new_storage - end_offs, buf + storage - end_offs,
-              end_offs);
-      storage = new_storage;
-      free(buf);
-      enc->buf = buf = new_buf;
-      enc->storage = storage;
-    }
-    do {
-      OD_ASSERT(end_offs < storage);
-      buf[storage - ++end_offs] = (unsigned char)end_window;
-      end_window >>= 8;
-      nend_bits -= 8;
-    } while (nend_bits >= 8);
-    enc->end_offs = end_offs;
-  }
-  OD_ASSERT(nend_bits + ftb <= OD_EC_WINDOW_SIZE);
-  end_window |= (od_ec_window)fl << nend_bits;
-  nend_bits += ftb;
-  enc->end_window = end_window;
-  enc->nend_bits = nend_bits;
-}
-#endif
-
 /*Overwrites a few bits at the very start of an existing stream, after they
    have already been encoded.
   This makes it possible to have a few flags up front, where it is easy for
@@ -284,9 +243,9 @@ void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) {
 void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) {
   int shift;
   unsigned mask;
-  OD_ASSERT(nbits >= 0);
-  OD_ASSERT(nbits <= 8);
-  OD_ASSERT(val < 1U << nbits);
+  assert(nbits >= 0);
+  assert(nbits <= 8);
+  assert(val < 1U << nbits);
   shift = 8 - nbits;
   mask = ((1U << nbits) - 1) << shift;
   if (enc->offs > 0) {
@@ -318,12 +277,9 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
   uint32_t storage;
   uint16_t *buf;
   uint32_t offs;
-  uint32_t end_offs;
-  int nend_bits;
   od_ec_window m;
   od_ec_window e;
   od_ec_window l;
-  unsigned r;
   int c;
   int s;
   if (enc->error) return NULL;
@@ -341,16 +297,10 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
   /*We output the minimum number of bits that ensures that the symbols encoded
      thus far will be decoded correctly regardless of the bits that follow.*/
   l = enc->low;
-  r = enc->rng;
   c = enc->cnt;
-  s = 9;
-  m = 0x7FFF;
-  e = (l + m) & ~m;
-  while ((e | m) >= l + r) {
-    s++;
-    m >>= 1;
-    e = (l + m) & ~m;
-  }
+  s = 10;
+  m = 0x3FFF;
+  e = ((l + m) & ~m) | (m + 1);
   s += c;
   offs = enc->offs;
   buf = enc->precarry_buf;
@@ -369,7 +319,7 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
     }
     n = (1 << (c + 16)) - 1;
     do {
-      OD_ASSERT(offs < storage);
+      assert(offs < storage);
       buf[offs++] = (uint16_t)(e >> (c + 16));
       e &= n;
       s -= 8;
@@ -377,49 +327,31 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
       n >>= 8;
     } while (s > 0);
   }
-  /*Make sure there's enough room for the entropy-coded bits and the raw
-     bits.*/
+  /*Make sure there's enough room for the entropy-coded bits.*/
   out = enc->buf;
   storage = enc->storage;
-  end_offs = enc->end_offs;
-  e = enc->end_window;
-  nend_bits = enc->nend_bits;
-  s = -s;
-  c = OD_MAXI((nend_bits - s + 7) >> 3, 0);
-  if (offs + end_offs + c > storage) {
-    storage = offs + end_offs + c;
+  c = OD_MAXI((s + 7) >> 3, 0);
+  if (offs + c > storage) {
+    storage = offs + c;
     out = (unsigned char *)realloc(out, sizeof(*out) * storage);
     if (out == NULL) {
       enc->error = -1;
       return NULL;
     }
-    OD_MOVE(out + storage - end_offs, out + enc->storage - end_offs, end_offs);
     enc->buf = out;
     enc->storage = storage;
   }
-  /*If we have buffered raw bits, flush them as well.*/
-  while (nend_bits > s) {
-    OD_ASSERT(end_offs < storage);
-    out[storage - ++end_offs] = (unsigned char)e;
-    e >>= 8;
-    nend_bits -= 8;
-  }
-  *nbytes = offs + end_offs;
+  *nbytes = offs;
   /*Perform carry propagation.*/
-  OD_ASSERT(offs + end_offs <= storage);
-  out = out + storage - (offs + end_offs);
+  assert(offs <= storage);
+  out = out + storage - offs;
   c = 0;
-  end_offs = offs;
   while (offs > 0) {
     offs--;
     c = buf[offs] + c;
     out[offs] = (unsigned char)c;
     c >>= 8;
   }
-  /*Add any remaining raw bits to the last byte.
-    There is guaranteed to be enough room, because nend_bits <= s.*/
-  OD_ASSERT(nend_bits <= 0 || end_offs > 0);
-  if (nend_bits > 0) out[end_offs - 1] |= (unsigned char)e;
   /*Note: Unless there's an allocation error, if you keep encoding into the
      current buffer and call this function again later, everything will work
      just fine (you won't get a new packet out, but you will get a single
@@ -441,7 +373,7 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
 int od_ec_enc_tell(const od_ec_enc *enc) {
   /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra
      bit, which we reserve for terminating the stream.*/
-  return (enc->offs + enc->end_offs) * 8 + enc->cnt + enc->nend_bits + 10;
+  return (enc->cnt + 10) + enc->offs * 8;
 }
 
 /*Returns the number of bits "used" by the encoded symbols so far.
@@ -476,8 +408,8 @@ void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) {
   uint32_t storage;
   uint16_t *precarry_buf;
   uint32_t precarry_storage;
-  OD_ASSERT(dst->storage >= src->storage);
-  OD_ASSERT(dst->precarry_storage >= src->precarry_storage);
+  assert(dst->storage >= src->storage);
+  assert(dst->precarry_storage >= src->precarry_storage);
   buf = dst->buf;
   storage = dst->storage;
   precarry_buf = dst->precarry_buf;
diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h
index 314b36318..1988f6818 100644
--- a/third_party/aom/aom_dsp/entenc.h
+++ b/third_party/aom/aom_dsp/entenc.h
@@ -30,12 +30,6 @@ struct od_ec_enc {
   unsigned char *buf;
   /*The size of the buffer.*/
   uint32_t storage;
-  /*The offset at which the last byte containing raw bits was written.*/
-  uint32_t end_offs;
-  /*Bits that will be read from/written at the end.*/
-  od_ec_window end_window;
-  /*Number of valid bits in end_window.*/
-  int nend_bits;
   /*A buffer for output bytes with their associated carry flags.*/
   uint16_t *precarry_buf;
   /*The size of the pre-carry buffer.*/
diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c
index 09d945afc..3804519b3 100644
--- a/third_party/aom/aom_dsp/fastssim.c
+++ b/third_party/aom/aom_dsp/fastssim.c
@@ -15,8 +15,10 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/ssim.h"
 #include "aom_ports/system_state.h"
 
@@ -25,12 +27,11 @@ typedef struct fs_ctx fs_ctx;
 
 #define SSIM_C1 (255 * 255 * 0.01 * 0.01)
 #define SSIM_C2 (255 * 255 * 0.03 * 0.03)
-#if CONFIG_HIGHBITDEPTH
 #define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01)
 #define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
 #define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
 #define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
-#endif
+
 #define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
 #define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
 
@@ -139,8 +140,8 @@ static void fs_downsample_level(fs_ctx *_ctx, int _l) {
 
 static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
                                  int _s1ystride, const uint8_t *_src2,
-                                 int _s2ystride, int _w, int _h, uint32_t bd,
-                                 uint32_t shift) {
+                                 int _s2ystride, int _w, int _h, uint32_t shift,
+                                 int buf_is_hbd) {
   uint32_t *dst1;
   uint32_t *dst2;
   int w;
@@ -161,7 +162,7 @@ static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
       int i1;
       i0 = 2 * i;
       i1 = FS_MINI(i0 + 1, _w);
-      if (bd == 8 && shift == 0) {
+      if (!buf_is_hbd) {
         dst1[j * w + i] =
             _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
             _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
@@ -198,13 +199,10 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
   int i;
   int j;
   double ssim_c1 = SSIM_C1;
-#if CONFIG_HIGHBITDEPTH
+
   if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
   if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
-#else
-  assert(bit_depth == 8);
-  (void)bit_depth;
-#endif
+
   w = _ctx->level[_l].w;
   h = _ctx->level[_l].h;
   col_sums_x = _ctx->col_buf;
@@ -323,13 +321,8 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
   int i;
   int j;
   double ssim_c2 = SSIM_C2;
-#if CONFIG_HIGHBITDEPTH
   if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
   if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
-#else
-  assert(bit_depth == 8);
-  (void)bit_depth;
-#endif
 
   w = _ctx->level[_l].w;
   h = _ctx->level[_l].h;
@@ -448,14 +441,14 @@ static double convert_ssim_db(double _ssim, double _weight) {
 
 static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
                         int _dystride, int _w, int _h, uint32_t _bd,
-                        uint32_t _shift) {
+                        uint32_t _shift, int buf_is_hbd) {
   fs_ctx ctx;
   double ret;
   int l;
   ret = 1;
   fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
-  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd,
-                       _shift);
+  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift,
+                       buf_is_hbd);
   for (l = 0; l < FS_NLEVELS - 1; l++) {
     fs_calc_structure(&ctx, l, _bd);
     ret *= fs_average(&ctx, l);
@@ -476,18 +469,19 @@ double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
   uint32_t bd_shift = 0;
   aom_clear_system_state();
   assert(bd >= in_bd);
-
+  assert(source->flags == dest->flags);
+  int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
   bd_shift = bd - in_bd;
 
   *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
                       dest->y_stride, source->y_crop_width,
-                      source->y_crop_height, in_bd, bd_shift);
+                      source->y_crop_height, in_bd, bd_shift, buf_is_hbd);
   *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
                       dest->uv_stride, source->uv_crop_width,
-                      source->uv_crop_height, in_bd, bd_shift);
+                      source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
   *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
                       dest->uv_stride, source->uv_crop_width,
-                      source->uv_crop_height, in_bd, bd_shift);
+                      source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
   ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
   return convert_ssim_db(ssimv, 1.0);
 }
diff --git a/third_party/aom/aom_dsp/fft.c b/third_party/aom/aom_dsp/fft.c
new file mode 100644
index 000000000..0ba71cfb3
--- /dev/null
+++ b/third_party/aom/aom_dsp/fft.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void simple_transpose(const float *A, float *B, int n) {
+  for (int y = 0; y < n; y++) {
+    for (int x = 0; x < n; x++) {
+      B[y * n + x] = A[x * n + y];
+    }
+  }
+}
+
+// The 1d transform is real to complex and packs the complex results in
+// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real
+// components, followed by the n/2 - 1 imaginary components). After the
+// transform is done on the rows, the first n/2 + 1 columns are real, and
+// the remaining are the imaginary components. After the transform on the
+// columns, the region of [0, n/2]x[0, n/2] contains the real part of
+// fft of the real columns. The real part of the 2d fft also includes the
+// imaginary part of transformed imaginary columns. This function assembles
+// the correct outputs while putting the real and imaginary components
+// next to each other.
+static INLINE void unpack_2d_output(const float *col_fft, float *output,
+                                    int n) {
+  for (int y = 0; y <= n / 2; ++y) {
+    const int y2 = y + n / 2;
+    const int y_extra = y2 > n / 2 && y2 < n;
+
+    for (int x = 0; x <= n / 2; ++x) {
+      const int x2 = x + n / 2;
+      const int x_extra = x2 > n / 2 && x2 < n;
+      output[2 * (y * n + x)] =
+          col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+      output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
+                                    (x_extra ? col_fft[y * n + x2] : 0);
+      if (y_extra) {
+        output[2 * ((n - y) * n + x)] =
+            col_fft[y * n + x] +
+            (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+        output[2 * ((n - y) * n + x) + 1] =
+            -(y_extra ? col_fft[y2 * n + x] : 0) +
+            (x_extra ? col_fft[y * n + x2] : 0);
+      }
+    }
+  }
+}
+
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+                    aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+                    aom_fft_unpack_func_t unpack, int vec_size) {
+  for (int x = 0; x < n; x += vec_size) {
+    tform(input + x, output + x, n);
+  }
+  transpose(output, temp, n);
+
+  for (int x = 0; x < n; x += vec_size) {
+    tform(temp + x, output + x, n);
+  }
+  transpose(output, temp, n);
+
+  unpack(temp, output, n);
+}
+
+static INLINE void store_float(float *output, float input) { *output = input; }
+static INLINE float add_float(float a, float b) { return a + b; }
+static INLINE float sub_float(float a, float b) { return a - b; }
+static INLINE float mul_float(float a, float b) { return a * b; }
+
+GEN_FFT_2(void, float, float, float, *, store_float);
+GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
+          sub_float);
+GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
+          sub_float, mul_float);
+GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
+           sub_float, mul_float);
+GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
+           sub_float, mul_float);
+
+void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+                     aom_fft_1d_func_t ifft_multi,
+                     aom_fft_transpose_func_t transpose, int vec_size) {
+  // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
+  // and get real outputs.
+  for (int y = 0; y <= n / 2; ++y) {
+    output[y * n] = input[2 * y * n];
+    output[y * n + 1] = input[2 * (y * n + n / 2)];
+  }
+  for (int y = n / 2 + 1; y < n; ++y) {
+    output[y * n] = input[2 * (y - n / 2) * n + 1];
+    output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
+  }
+
+  for (int i = 0; i < 2; i += vec_size) {
+    ifft_multi(output + i, temp + i, n);
+  }
+
+  // For the other columns, since we don't have a full ifft for complex inputs
+  // we have to split them into the real and imaginary counterparts.
+  // Pack the real component, then the imaginary components.
+  for (int y = 0; y < n; ++y) {
+    for (int x = 1; x < n / 2; ++x) {
+      output[y * n + (x + 1)] = input[2 * (y * n + x)];
+    }
+    for (int x = 1; x < n / 2; ++x) {
+      output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
+    }
+  }
+  for (int y = 2; y < vec_size; y++) {
+    fft_single(output + y, temp + y, n);
+  }
+  // This is the part that can be sped up with SIMD
+  for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
+    fft_multi(output + y, temp + y, n);
+  }
+
+  // Put the 0 and n/2 th results in the correct place.
+  for (int x = 0; x < n; ++x) {
+    output[x] = temp[x * n];
+    output[(n / 2) * n + x] = temp[x * n + 1];
+  }
+  // This rearranges and transposes.
+  for (int y = 1; y < n / 2; ++y) {
+    // Fill in the real columns
+    for (int x = 0; x <= n / 2; ++x) {
+      output[x + y * n] =
+          temp[(y + 1) + x * n] +
+          ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
+    }
+    for (int x = n / 2 + 1; x < n; ++x) {
+      output[x + y * n] = temp[(y + 1) + (n - x) * n] -
+                          temp[(y + n / 2) + ((n - x) + n / 2) * n];
+    }
+    // Fill in the imag columns
+    for (int x = 0; x <= n / 2; ++x) {
+      output[x + (y + n / 2) * n] =
+          temp[(y + n / 2) + x * n] -
+          ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
+    }
+    for (int x = n / 2 + 1; x < n; ++x) {
+      output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
+                                    temp[(y + n / 2) + (n - x) * n];
+    }
+  }
+  for (int y = 0; y < n; y += vec_size) {
+    ifft_multi(output + y, temp + y, n);
+  }
+  transpose(temp, output, n);
+}
+
+GEN_IFFT_2(void, float, float, float, *, store_float);
+GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
+           sub_float);
+GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
+           sub_float, mul_float);
+GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
+            sub_float, mul_float);
+GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
+            sub_float, mul_float);
+
+void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
+                  aom_ifft1d_2_float, simple_transpose, 1);
+}
+
+void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
+                  aom_ifft1d_4_float, simple_transpose, 1);
+}
+
+void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
+                  aom_ifft1d_8_float, simple_transpose, 1);
+}
+
+void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
+}
+
+void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
+}
diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h
new file mode 100644
index 000000000..2f3cd5fdc
--- /dev/null
+++ b/third_party/aom/aom_dsp/fft_common.h
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_FFT_COMMON_H_
+#define AOM_DSP_FFT_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief A function pointer for computing 1d fft and ifft.
+ *
+ * The function will point to an implementation for a specific transform size,
+ * and may perform the transforms using vectorized instructions.
+ *
+ * For a non-vectorized forward transforms of size n, the input and output
+ * buffers will be size n. The output takes advantage of conjugate symmetry and
+ * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
+ * (r_{j}, i_{j}) is the complex output for index j.
+ *
+ * An inverse transform will assume that the complex "input" is packed
+ * similarly. Its output will be real.
+ *
+ * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
+ *
+ * Vectorized implementations are parallelized along the columns so that the fft
+ * can be performed on multiple columns at a time. In such cases the data block
+ * for input and output is typically square (n x n) and the stride will
+ * correspond to the spacing between rows. At minimum, the input size must be
+ * n x simd_vector_length.
+ *
+ * \param[in]  input   Input buffer. See above for size restrictions.
+ * \param[out] output  Output buffer. See above for size restrictions.
+ * \param[in]  stride  The spacing in number of elements between rows
+ *                     (or elements)
+ */
+typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
+                                  int stride);
+
+// Declare some of the forward non-vectorized transforms which are used in some
+// of the vectorized implementations
+void aom_fft1d_4_float(const float *input, float *output, int stride);
+void aom_fft1d_8_float(const float *input, float *output, int stride);
+void aom_fft1d_16_float(const float *input, float *output, int stride);
+void aom_fft1d_32_float(const float *input, float *output, int stride);
+
+/**\!brief Function pointer for transposing a matrix of floats.
+ *
+ * \param[in]  input  Input buffer (size n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in]  n      Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
+                                         int n);
+
+/**\!brief Function pointer for re-arranging intermediate 2d transform results.
+ *
+ * After re-arrangement, the real and imaginary components will be packed
+ * tightly next to each other.
+ *
+ * \param[in]  input  Input buffer (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in]  n      Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
+
+/*!\brief Performs a 2d fft with the given functions.
+ *
+ * This generator function allows for multiple different implementations of 2d
+ * fft with different vector operations, without having to redefine the main
+ * body multiple times.
+ *
+ * \param[in]  input     Input buffer to run the transform on (size n x n)
+ * \param[out] temp      Working buffer for computing the transform (size n x n)
+ * \param[out] output    Output buffer (size 2 x n x n)
+ * \param[in]  tform     Forward transform function
+ * \param[in]  transpose Transpose function (for n x n matrix)
+ * \param[in]  unpack    Unpack function used to massage outputs to correct form
+ * \param[in]  vec_size  Vector size (the transform is done vec_size units at
+ *                       a time)
+ */
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+                    aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+                    aom_fft_unpack_func_t unpack, int vec_size);
+
+/*!\brief Perform a 2d inverse fft with the given helper functions
+ *
+ * \param[in]  input      Input buffer to run the transform on (size 2 x n x n)
+ * \param[out] temp       Working buffer for computations (size 2 x n x n)
+ * \param[out] output     Output buffer (size n x n)
+ * \param[in]  fft_single Forward transform function (non vectorized)
+ * \param[in]  fft_multi  Forward transform function (vectorized)
+ * \param[in]  ifft_multi Inverse transform function (vectorized)
+ * \param[in]  transpose  Transpose function (for n x n matrix)
+ * \param[in]  vec_size   Vector size (the transform is done vec_size
+ *                        units at a time)
+ */
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+                     aom_fft_1d_func_t ifft_multi,
+                     aom_fft_transpose_func_t transpose, int vec_size);
+#ifdef __cplusplus
+}
+#endif
+
+// The macros below define 1D fft/ifft for different data types and for
+// different simd vector intrinsic types.
+
+#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store)               \
+  ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
+    const T_VEC i0 = load(input + 0 * stride);                      \
+    const T_VEC i1 = load(input + 1 * stride);                      \
+    store(output + 0 * stride, i0 + i1);                            \
+    store(output + 1 * stride, i0 - i1);                            \
+  }
+
+#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+  ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) {       \
+    const T_VEC kWeight0 = constant(0.0f);                                \
+    const T_VEC i0 = load(input + 0 * stride);                            \
+    const T_VEC i1 = load(input + 1 * stride);                            \
+    const T_VEC i2 = load(input + 2 * stride);                            \
+    const T_VEC i3 = load(input + 3 * stride);                            \
+    const T_VEC w0 = add(i0, i2);                                         \
+    const T_VEC w1 = sub(i0, i2);                                         \
+    const T_VEC w2 = add(i1, i3);                                         \
+    const T_VEC w3 = sub(i1, i3);                                         \
+    store(output + 0 * stride, add(w0, w2));                              \
+    store(output + 1 * stride, w1);                                       \
+    store(output + 2 * stride, sub(w0, w2));                              \
+    store(output + 3 * stride, sub(kWeight0, w3));                        \
+  }
+
+#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
+  ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) {            \
+    const T_VEC kWeight0 = constant(0.0f);                                     \
+    const T_VEC kWeight2 = constant(0.707107f);                                \
+    const T_VEC i0 = load(input + 0 * stride);                                 \
+    const T_VEC i1 = load(input + 1 * stride);                                 \
+    const T_VEC i2 = load(input + 2 * stride);                                 \
+    const T_VEC i3 = load(input + 3 * stride);                                 \
+    const T_VEC i4 = load(input + 4 * stride);                                 \
+    const T_VEC i5 = load(input + 5 * stride);                                 \
+    const T_VEC i6 = load(input + 6 * stride);                                 \
+    const T_VEC i7 = load(input + 7 * stride);                                 \
+    const T_VEC w0 = add(i0, i4);                                              \
+    const T_VEC w1 = sub(i0, i4);                                              \
+    const T_VEC w2 = add(i2, i6);                                              \
+    const T_VEC w3 = sub(i2, i6);                                              \
+    const T_VEC w4 = add(w0, w2);                                              \
+    const T_VEC w5 = sub(w0, w2);                                              \
+    const T_VEC w7 = add(i1, i5);                                              \
+    const T_VEC w8 = sub(i1, i5);                                              \
+    const T_VEC w9 = add(i3, i7);                                              \
+    const T_VEC w10 = sub(i3, i7);                                             \
+    const T_VEC w11 = add(w7, w9);                                             \
+    const T_VEC w12 = sub(w7, w9);                                             \
+    store(output + 0 * stride, add(w4, w11));                                  \
+    store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10))));          \
+    store(output + 2 * stride, w5);                                            \
+    store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10))));          \
+    store(output + 4 * stride, sub(w4, w11));                                  \
+    store(output + 5 * stride,                                                 \
+          sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8))));                \
+    store(output + 6 * stride, sub(kWeight0, w12));                            \
+    store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8))));          \
+  }
+
+#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+                   mul)                                                    \
+  ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) {       \
+    const T_VEC kWeight0 = constant(0.0f);                                 \
+    const T_VEC kWeight2 = constant(0.707107f);                            \
+    const T_VEC kWeight3 = constant(0.92388f);                             \
+    const T_VEC kWeight4 = constant(0.382683f);                            \
+    const T_VEC i0 = load(input + 0 * stride);                             \
+    const T_VEC i1 = load(input + 1 * stride);                             \
+    const T_VEC i2 = load(input + 2 * stride);                             \
+    const T_VEC i3 = load(input + 3 * stride);                             \
+    const T_VEC i4 = load(input + 4 * stride);                             \
+    const T_VEC i5 = load(input + 5 * stride);                             \
+    const T_VEC i6 = load(input + 6 * stride);                             \
+    const T_VEC i7 = load(input + 7 * stride);                             \
+    const T_VEC i8 = load(input + 8 * stride);                             \
+    const T_VEC i9 = load(input + 9 * stride);                             \
+    const T_VEC i10 = load(input + 10 * stride);                           \
+    const T_VEC i11 = load(input + 11 * stride);                           \
+    const T_VEC i12 = load(input + 12 * stride);                           \
+    const T_VEC i13 = load(input + 13 * stride);                           \
+    const T_VEC i14 = load(input + 14 * stride);                           \
+    const T_VEC i15 = load(input + 15 * stride);                           \
+    const T_VEC w0 = add(i0, i8);                                          \
+    const T_VEC w1 = sub(i0, i8);                                          \
+    const T_VEC w2 = add(i4, i12);                                         \
+    const T_VEC w3 = sub(i4, i12);                                         \
+    const T_VEC w4 = add(w0, w2);                                          \
+    const T_VEC w5 = sub(w0, w2);                                          \
+    const T_VEC w7 = add(i2, i10);                                         \
+    const T_VEC w8 = sub(i2, i10);                                         \
+    const T_VEC w9 = add(i6, i14);                                         \
+    const T_VEC w10 = sub(i6, i14);                                        \
+    const T_VEC w11 = add(w7, w9);                                         \
+    const T_VEC w12 = sub(w7, w9);                                         \
+    const T_VEC w14 = add(w4, w11);                                        \
+    const T_VEC w15 = sub(w4, w11);                                        \
+    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),           \
+                           sub(sub(kWeight0, w3),                          \
+                               mul(kWeight2, add(w10, w8))) };             \
+    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),           \
+                           sub(w3, mul(kWeight2, add(w10, w8))) };         \
+    const T_VEC w19 = add(i1, i9);                                         \
+    const T_VEC w20 = sub(i1, i9);                                         \
+    const T_VEC w21 = add(i5, i13);                                        \
+    const T_VEC w22 = sub(i5, i13);                                        \
+    const T_VEC w23 = add(w19, w21);                                       \
+    const T_VEC w24 = sub(w19, w21);                                       \
+    const T_VEC w26 = add(i3, i11);                                        \
+    const T_VEC w27 = sub(i3, i11);                                        \
+    const T_VEC w28 = add(i7, i15);                                        \
+    const T_VEC w29 = sub(i7, i15);                                        \
+    const T_VEC w30 = add(w26, w28);                                       \
+    const T_VEC w31 = sub(w26, w28);                                       \
+    const T_VEC w33 = add(w23, w30);                                       \
+    const T_VEC w34 = sub(w23, w30);                                       \
+    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),         \
+                           sub(sub(kWeight0, w22),                         \
+                               mul(kWeight2, add(w29, w27))) };            \
+    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),         \
+                           sub(w22, mul(kWeight2, add(w29, w27))) };       \
+    store(output + 0 * stride, add(w14, w33));                             \
+    store(output + 1 * stride,                                             \
+          add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
+    store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31))));     \
+    store(output + 3 * stride,                                             \
+          add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
+    store(output + 4 * stride, w15);                                       \
+    store(output + 5 * stride,                                             \
+          add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])),            \
+                          mul(kWeight3, w37[1]))));                        \
+    store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31))));     \
+    store(output + 7 * stride,                                             \
+          add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])),            \
+                          mul(kWeight4, w35[1]))));                        \
+    store(output + 8 * stride, sub(w14, w33));                             \
+    store(output + 9 * stride,                                             \
+          add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
+    store(output + 10 * stride,                                            \
+          sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24))));          \
+    store(output + 11 * stride,                                            \
+          add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
+    store(output + 12 * stride, sub(kWeight0, w34));                       \
+    store(output + 13 * stride,                                            \
+          sub(sub(kWeight0, w18[1]),                                       \
+              sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))));         \
+    store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24))));   \
+    store(output + 15 * stride,                                            \
+          sub(sub(kWeight0, w16[1]),                                       \
+              sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))));         \
+  }
+
+#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
+                   mul)                                                      \
+  ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) {         \
+    const T_VEC kWeight0 = constant(0.0f);                                   \
+    const T_VEC kWeight2 = constant(0.707107f);                              \
+    const T_VEC kWeight3 = constant(0.92388f);                               \
+    const T_VEC kWeight4 = constant(0.382683f);                              \
+    const T_VEC kWeight5 = constant(0.980785f);                              \
+    const T_VEC kWeight6 = constant(0.19509f);                               \
+    const T_VEC kWeight7 = constant(0.83147f);                               \
+    const T_VEC kWeight8 = constant(0.55557f);                               \
+    const T_VEC i0 = load(input + 0 * stride);                               \
+    const T_VEC i1 = load(input + 1 * stride);                               \
+    const T_VEC i2 = load(input + 2 * stride);                               \
+    const T_VEC i3 = load(input + 3 * stride);                               \
+    const T_VEC i4 = load(input + 4 * stride);                               \
+    const T_VEC i5 = load(input + 5 * stride);                               \
+    const T_VEC i6 = load(input + 6 * stride);                               \
+    const T_VEC i7 = load(input + 7 * stride);                               \
+    const T_VEC i8 = load(input + 8 * stride);                               \
+    const T_VEC i9 = load(input + 9 * stride);                               \
+    const T_VEC i10 = load(input + 10 * stride);                             \
+    const T_VEC i11 = load(input + 11 * stride);                             \
+    const T_VEC i12 = load(input + 12 * stride);                             \
+    const T_VEC i13 = load(input + 13 * stride);                             \
+    const T_VEC i14 = load(input + 14 * stride);                             \
+    const T_VEC i15 = load(input + 15 * stride);                             \
+    const T_VEC i16 = load(input + 16 * stride);                             \
+    const T_VEC i17 = load(input + 17 * stride);                             \
+    const T_VEC i18 = load(input + 18 * stride);                             \
+    const T_VEC i19 = load(input + 19 * stride);                             \
+    const T_VEC i20 = load(input + 20 * stride);                             \
+    const T_VEC i21 = load(input + 21 * stride);                             \
+    const T_VEC i22 = load(input + 22 * stride);                             \
+    const T_VEC i23 = load(input + 23 * stride);                             \
+    const T_VEC i24 = load(input + 24 * stride);                             \
+    const T_VEC i25 = load(input + 25 * stride);                             \
+    const T_VEC i26 = load(input + 26 * stride);                             \
+    const T_VEC i27 = load(input + 27 * stride);                             \
+    const T_VEC i28 = load(input + 28 * stride);                             \
+    const T_VEC i29 = load(input + 29 * stride);                             \
+    const T_VEC i30 = load(input + 30 * stride);                             \
+    const T_VEC i31 = load(input + 31 * stride);                             \
+    const T_VEC w0 = add(i0, i16);                                           \
+    const T_VEC w1 = sub(i0, i16);                                           \
+    const T_VEC w2 = add(i8, i24);                                           \
+    const T_VEC w3 = sub(i8, i24);                                           \
+    const T_VEC w4 = add(w0, w2);                                            \
+    const T_VEC w5 = sub(w0, w2);                                            \
+    const T_VEC w7 = add(i4, i20);                                           \
+    const T_VEC w8 = sub(i4, i20);                                           \
+    const T_VEC w9 = add(i12, i28);                                          \
+    const T_VEC w10 = sub(i12, i28);                                         \
+    const T_VEC w11 = add(w7, w9);                                           \
+    const T_VEC w12 = sub(w7, w9);                                           \
+    const T_VEC w14 = add(w4, w11);                                          \
+    const T_VEC w15 = sub(w4, w11);                                          \
+    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),             \
+                           sub(sub(kWeight0, w3),                            \
+                               mul(kWeight2, add(w10, w8))) };               \
+    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),             \
+                           sub(w3, mul(kWeight2, add(w10, w8))) };           \
+    const T_VEC w19 = add(i2, i18);                                          \
+    const T_VEC w20 = sub(i2, i18);                                          \
+    const T_VEC w21 = add(i10, i26);                                         \
+    const T_VEC w22 = sub(i10, i26);                                         \
+    const T_VEC w23 = add(w19, w21);                                         \
+    const T_VEC w24 = sub(w19, w21);                                         \
+    const T_VEC w26 = add(i6, i22);                                          \
+    const T_VEC w27 = sub(i6, i22);                                          \
+    const T_VEC w28 = add(i14, i30);                                         \
+    const T_VEC w29 = sub(i14, i30);                                         \
+    const T_VEC w30 = add(w26, w28);                                         \
+    const T_VEC w31 = sub(w26, w28);                                         \
+    const T_VEC w33 = add(w23, w30);                                         \
+    const T_VEC w34 = sub(w23, w30);                                         \
+    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),           \
+                           sub(sub(kWeight0, w22),                           \
+                               mul(kWeight2, add(w29, w27))) };              \
+    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),           \
+                           sub(w22, mul(kWeight2, add(w29, w27))) };         \
+    const T_VEC w38 = add(w14, w33);                                         \
+    const T_VEC w39 = sub(w14, w33);                                         \
+    const T_VEC w40[2] = {                                                   \
+      add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))),        \
+      add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))         \
+    };                                                                       \
+    const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))),            \
+                           sub(sub(kWeight0, w12),                           \
+                               mul(kWeight2, add(w31, w24))) };              \
+    const T_VEC w42[2] = {                                                   \
+      add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))),        \
+      add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))         \
+    };                                                                       \
+    const T_VEC w44[2] = {                                                   \
+      add(w18[0],                                                            \
+          sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
+      sub(sub(kWeight0, w18[1]),                                             \
+          sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))                 \
+    };                                                                       \
+    const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))),            \
+                           sub(w12, mul(kWeight2, add(w31, w24))) };         \
+    const T_VEC w46[2] = {                                                   \
+      add(w16[0],                                                            \
+          sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
+      sub(sub(kWeight0, w16[1]),                                             \
+          sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))                 \
+    };                                                                       \
+    const T_VEC w47 = add(i1, i17);                                          \
+    const T_VEC w48 = sub(i1, i17);                                          \
+    const T_VEC w49 = add(i9, i25);                                          \
+    const T_VEC w50 = sub(i9, i25);                                          \
+    const T_VEC w51 = add(w47, w49);                                         \
+    const T_VEC w52 = sub(w47, w49);                                         \
+    const T_VEC w54 = add(i5, i21);                                          \
+    const T_VEC w55 = sub(i5, i21);                                          \
+    const T_VEC w56 = add(i13, i29);                                         \
+    const T_VEC w57 = sub(i13, i29);                                         \
+    const T_VEC w58 = add(w54, w56);                                         \
+    const T_VEC w59 = sub(w54, w56);                                         \
+    const T_VEC w61 = add(w51, w58);                                         \
+    const T_VEC w62 = sub(w51, w58);                                         \
+    const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))),           \
+                           sub(sub(kWeight0, w50),                           \
+                               mul(kWeight2, add(w57, w55))) };              \
+    const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))),           \
+                           sub(w50, mul(kWeight2, add(w57, w55))) };         \
+    const T_VEC w66 = add(i3, i19);                                          \
+    const T_VEC w67 = sub(i3, i19);                                          \
+    const T_VEC w68 = add(i11, i27);                                         \
+    const T_VEC w69 = sub(i11, i27);                                         \
+    const T_VEC w70 = add(w66, w68);                                         \
+    const T_VEC w71 = sub(w66, w68);                                         \
+    const T_VEC w73 = add(i7, i23);                                          \
+    const T_VEC w74 = sub(i7, i23);                                          \
+    const T_VEC w75 = add(i15, i31);                                         \
+    const T_VEC w76 = sub(i15, i31);                                         \
+    const T_VEC w77 = add(w73, w75);                                         \
+    const T_VEC w78 = sub(w73, w75);                                         \
+    const T_VEC w80 = add(w70, w77);                                         \
+    const T_VEC w81 = sub(w70, w77);                                         \
+    const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))),           \
+                           sub(sub(kWeight0, w69),                           \
+                               mul(kWeight2, add(w76, w74))) };              \
+    const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))),           \
+                           sub(w69, mul(kWeight2, add(w76, w74))) };         \
+    const T_VEC w85 = add(w61, w80);                                         \
+    const T_VEC w86 = sub(w61, w80);                                         \
+    const T_VEC w87[2] = {                                                   \
+      add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))),        \
+      add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0])))         \
+    };                                                                       \
+    const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))),           \
+                           sub(sub(kWeight0, w59),                           \
+                               mul(kWeight2, add(w78, w71))) };              \
+    const T_VEC w89[2] = {                                                   \
+      add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))),        \
+      add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0])))         \
+    };                                                                       \
+    const T_VEC w91[2] = {                                                   \
+      add(w65[0],                                                            \
+          sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
+      sub(sub(kWeight0, w65[1]),                                             \
+          sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1])))                 \
+    };                                                                       \
+    const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))),           \
+                           sub(w59, mul(kWeight2, add(w78, w71))) };         \
+    const T_VEC w93[2] = {                                                   \
+      add(w63[0],                                                            \
+          sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
+      sub(sub(kWeight0, w63[1]),                                             \
+          sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1])))                 \
+    };                                                                       \
+    store(output + 0 * stride, add(w38, w85));                               \
+    store(output + 1 * stride,                                               \
+          add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1]))));   \
+    store(output + 2 * stride,                                               \
+          add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1]))));   \
+    store(output + 3 * stride,                                               \
+          add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1]))));   \
+    store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81))));      \
+    store(output + 5 * stride,                                               \
+          add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1]))));   \
+    store(output + 6 * stride,                                               \
+          add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1]))));   \
+    store(output + 7 * stride,                                               \
+          add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1]))));   \
+    store(output + 8 * stride, w39);                                         \
+    store(output + 9 * stride,                                               \
+          add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])),              \
+                          mul(kWeight5, w93[1]))));                          \
+    store(output + 10 * stride,                                              \
+          add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])),              \
+                          mul(kWeight3, w92[1]))));                          \
+    store(output + 11 * stride,                                              \
+          add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])),              \
+                          mul(kWeight7, w91[1]))));                          \
+    store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81))));     \
+    store(output + 13 * stride,                                              \
+          add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])),              \
+                          mul(kWeight8, w89[1]))));                          \
+    store(output + 14 * stride,                                              \
+          add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])),              \
+                          mul(kWeight4, w88[1]))));                          \
+    store(output + 15 * stride,                                              \
+          add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])),              \
+                          mul(kWeight6, w87[1]))));                          \
+    store(output + 16 * stride, sub(w38, w85));                              \
+    store(output + 17 * stride,                                              \
+          add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0]))));   \
+    store(output + 18 * stride,                                              \
+          add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0]))));   \
+    store(output + 19 * stride,                                              \
+          add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0]))));   \
+    store(output + 20 * stride,                                              \
+          sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62))));            \
+    store(output + 21 * stride,                                              \
+          add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0]))));   \
+    store(output + 22 * stride,                                              \
+          add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0]))));   \
+    store(output + 23 * stride,                                              \
+          add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0]))));   \
+    store(output + 24 * stride, sub(kWeight0, w86));                         \
+    store(output + 25 * stride,                                              \
+          sub(sub(kWeight0, w46[1]),                                         \
+              sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1]))));           \
+    store(output + 26 * stride,                                              \
+          sub(sub(kWeight0, w45[1]),                                         \
+              sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1]))));           \
+    store(output + 27 * stride,                                              \
+          sub(sub(kWeight0, w44[1]),                                         \
+              sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1]))));           \
+    store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62))));     \
+    store(output + 29 * stride,                                              \
+          sub(sub(kWeight0, w42[1]),                                         \
+              sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1]))));           \
+    store(output + 30 * stride,                                              \
+          sub(sub(kWeight0, w41[1]),                                         \
+              sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1]))));           \
+    store(output + 31 * stride,                                              \
+          sub(sub(kWeight0, w40[1]),                                         \
+              sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1]))));           \
+  }
+
+#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store)               \
+  ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
+    const T_VEC i0 = load(input + 0 * stride);                       \
+    const T_VEC i1 = load(input + 1 * stride);                       \
+    store(output + 0 * stride, i0 + i1);                             \
+    store(output + 1 * stride, i0 - i1);                             \
+  }
+
+#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+  ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) {       \
+    const T_VEC kWeight0 = constant(0.0f);                                 \
+    const T_VEC i0 = load(input + 0 * stride);                             \
+    const T_VEC i1 = load(input + 1 * stride);                             \
+    const T_VEC i2 = load(input + 2 * stride);                             \
+    const T_VEC i3 = load(input + 3 * stride);                             \
+    const T_VEC w2 = add(i0, i2);                                          \
+    const T_VEC w3 = sub(i0, i2);                                          \
+    const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) };                      \
+    const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) };       \
+    store(output + 0 * stride, add(w2, w4[0]));                            \
+    store(output + 1 * stride, add(w3, w5[1]));                            \
+    store(output + 2 * stride, sub(w2, w4[0]));                            \
+    store(output + 3 * stride, sub(w3, w5[1]));                            \
+  }
+
+#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+                   mul)                                                    \
+  ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) {       \
+    const T_VEC kWeight0 = constant(0.0f);                                 \
+    const T_VEC kWeight2 = constant(0.707107f);                            \
+    const T_VEC i0 = load(input + 0 * stride);                             \
+    const T_VEC i1 = load(input + 1 * stride);                             \
+    const T_VEC i2 = load(input + 2 * stride);                             \
+    const T_VEC i3 = load(input + 3 * stride);                             \
+    const T_VEC i4 = load(input + 4 * stride);                             \
+    const T_VEC i5 = load(input + 5 * stride);                             \
+    const T_VEC i6 = load(input + 6 * stride);                             \
+    const T_VEC i7 = load(input + 7 * stride);                             \
+    const T_VEC w6 = add(i0, i4);                                          \
+    const T_VEC w7 = sub(i0, i4);                                          \
+    const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) };                      \
+    const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) };       \
+    const T_VEC w10[2] = { add(w6, w8[0]), w8[1] };                        \
+    const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) };         \
+    const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) };         \
+    const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] };                        \
+    const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) };                     \
+    const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) };      \
+    const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) };                     \
+    const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) };      \
+    const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) };     \
+    const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) };     \
+    const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) };     \
+    const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) };     \
+    store(output + 0 * stride, add(w10[0], w18[0]));                       \
+    store(output + 1 * stride,                                             \
+          add(w12[0], mul(kWeight2, add(w20[0], w20[1]))));                \
+    store(output + 2 * stride, add(w11[0], w19[1]));                       \
+    store(output + 3 * stride,                                             \
+          sub(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
+    store(output + 4 * stride, sub(w10[0], w18[0]));                       \
+    store(output + 5 * stride,                                             \
+          add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])),            \
+                          mul(kWeight2, w20[1]))));                        \
+    store(output + 6 * stride, sub(w11[0], w19[1]));                       \
+    store(output + 7 * stride,                                             \
+          add(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
+  }
+
+#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
+                    mul)                                                      \
+  ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) {         \
+    const T_VEC kWeight0 = constant(0.0f);                                    \
+    const T_VEC kWeight2 = constant(0.707107f);                               \
+    const T_VEC kWeight3 = constant(0.92388f);                                \
+    const T_VEC kWeight4 = constant(0.382683f);                               \
+    const T_VEC i0 = load(input + 0 * stride);                                \
+    const T_VEC i1 = load(input + 1 * stride);                                \
+    const T_VEC i2 = load(input + 2 * stride);                                \
+    const T_VEC i3 = load(input + 3 * stride);                                \
+    const T_VEC i4 = load(input + 4 * stride);                                \
+    const T_VEC i5 = load(input + 5 * stride);                                \
+    const T_VEC i6 = load(input + 6 * stride);                                \
+    const T_VEC i7 = load(input + 7 * stride);                                \
+    const T_VEC i8 = load(input + 8 * stride);                                \
+    const T_VEC i9 = load(input + 9 * stride);                                \
+    const T_VEC i10 = load(input + 10 * stride);                              \
+    const T_VEC i11 = load(input + 11 * stride);                              \
+    const T_VEC i12 = load(input + 12 * stride);                              \
+    const T_VEC i13 = load(input + 13 * stride);                              \
+    const T_VEC i14 = load(input + 14 * stride);                              \
+    const T_VEC i15 = load(input + 15 * stride);                              \
+    const T_VEC w14 = add(i0, i8);                                            \
+    const T_VEC w15 = sub(i0, i8);                                            \
+    const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) };                      \
+    const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) };       \
+    const T_VEC w18[2] = { add(w14, w16[0]), w16[1] };                        \
+    const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) };         \
+    const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) };         \
+    const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] };                        \
+    const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) };                      \
+    const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) };       \
+    const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) };                      \
+    const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) };       \
+    const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) };        \
+    const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) };        \
+    const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) };        \
+    const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) };        \
+    const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) };        \
+    const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) };        \
+    const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))),   \
+                           add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
+    const T_VEC w33[2] = { add(w20[0],                                        \
+                               sub(sub(kWeight0, mul(kWeight2, w28[0])),      \
+                                   mul(kWeight2, w28[1]))),                   \
+                           add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
+    const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) };        \
+    const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) };        \
+    const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
+                           sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+    const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
+                           add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+    const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) };                       \
+    const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) };        \
+    const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) };                      \
+    const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) };       \
+    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };        \
+    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };        \
+    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };        \
+    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };        \
+    const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) };                      \
+    const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) };       \
+    const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) };                       \
+    const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) };        \
+    const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) };        \
+    const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) };        \
+    const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) };        \
+    const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) };        \
+    const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) };        \
+    const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) };        \
+    const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))),   \
+                           add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
+    const T_VEC w57[2] = { add(w44[0],                                        \
+                               sub(sub(kWeight0, mul(kWeight2, w52[0])),      \
+                                   mul(kWeight2, w52[1]))),                   \
+                           add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
+    const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) };        \
+    const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) };        \
+    const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
+                           sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+    const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
+                           add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+    store(output + 0 * stride, add(w30[0], w54[0]));                          \
+    store(output + 1 * stride,                                                \
+          add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1]))));    \
+    store(output + 2 * stride,                                                \
+          add(w34[0], mul(kWeight2, add(w58[0], w58[1]))));                   \
+    store(output + 3 * stride,                                                \
+          add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1]))));    \
+    store(output + 4 * stride, add(w31[0], w55[1]));                          \
+    store(output + 5 * stride,                                                \
+          sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
+    store(output + 6 * stride,                                                \
+          sub(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
+    store(output + 7 * stride,                                                \
+          sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
+    store(output + 8 * stride, sub(w30[0], w54[0]));                          \
+    store(output + 9 * stride,                                                \
+          add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])),               \
+                          mul(kWeight4, w56[1]))));                           \
+    store(output + 10 * stride,                                               \
+          add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])),               \
+                          mul(kWeight2, w58[1]))));                           \
+    store(output + 11 * stride,                                               \
+          add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])),               \
+                          mul(kWeight3, w60[1]))));                           \
+    store(output + 12 * stride, sub(w31[0], w55[1]));                         \
+    store(output + 13 * stride,                                               \
+          add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
+    store(output + 14 * stride,                                               \
+          add(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
+    store(output + 15 * stride,                                               \
+          add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
+  }
+#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,    \
+                    mul)                                                       \
+  ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) {          \
+    const T_VEC kWeight0 = constant(0.0f);                                     \
+    const T_VEC kWeight2 = constant(0.707107f);                                \
+    const T_VEC kWeight3 = constant(0.92388f);                                 \
+    const T_VEC kWeight4 = constant(0.382683f);                                \
+    const T_VEC kWeight5 = constant(0.980785f);                                \
+    const T_VEC kWeight6 = constant(0.19509f);                                 \
+    const T_VEC kWeight7 = constant(0.83147f);                                 \
+    const T_VEC kWeight8 = constant(0.55557f);                                 \
+    const T_VEC i0 = load(input + 0 * stride);                                 \
+    const T_VEC i1 = load(input + 1 * stride);                                 \
+    const T_VEC i2 = load(input + 2 * stride);                                 \
+    const T_VEC i3 = load(input + 3 * stride);                                 \
+    const T_VEC i4 = load(input + 4 * stride);                                 \
+    const T_VEC i5 = load(input + 5 * stride);                                 \
+    const T_VEC i6 = load(input + 6 * stride);                                 \
+    const T_VEC i7 = load(input + 7 * stride);                                 \
+    const T_VEC i8 = load(input + 8 * stride);                                 \
+    const T_VEC i9 = load(input + 9 * stride);                                 \
+    const T_VEC i10 = load(input + 10 * stride);                               \
+    const T_VEC i11 = load(input + 11 * stride);                               \
+    const T_VEC i12 = load(input + 12 * stride);                               \
+    const T_VEC i13 = load(input + 13 * stride);                               \
+    const T_VEC i14 = load(input + 14 * stride);                               \
+    const T_VEC i15 = load(input + 15 * stride);                               \
+    const T_VEC i16 = load(input + 16 * stride);                               \
+    const T_VEC i17 = load(input + 17 * stride);                               \
+    const T_VEC i18 = load(input + 18 * stride);                               \
+    const T_VEC i19 = load(input + 19 * stride);                               \
+    const T_VEC i20 = load(input + 20 * stride);                               \
+    const T_VEC i21 = load(input + 21 * stride);                               \
+    const T_VEC i22 = load(input + 22 * stride);                               \
+    const T_VEC i23 = load(input + 23 * stride);                               \
+    const T_VEC i24 = load(input + 24 * stride);                               \
+    const T_VEC i25 = load(input + 25 * stride);                               \
+    const T_VEC i26 = load(input + 26 * stride);                               \
+    const T_VEC i27 = load(input + 27 * stride);                               \
+    const T_VEC i28 = load(input + 28 * stride);                               \
+    const T_VEC i29 = load(input + 29 * stride);                               \
+    const T_VEC i30 = load(input + 30 * stride);                               \
+    const T_VEC i31 = load(input + 31 * stride);                               \
+    const T_VEC w30 = add(i0, i16);                                            \
+    const T_VEC w31 = sub(i0, i16);                                            \
+    const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) };                       \
+    const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) };        \
+    const T_VEC w34[2] = { add(w30, w32[0]), w32[1] };                         \
+    const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) };          \
+    const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) };          \
+    const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] };                         \
+    const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) };                      \
+    const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) };       \
+    const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) };                      \
+    const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) };       \
+    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };         \
+    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };         \
+    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };         \
+    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };         \
+    const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) };         \
+    const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) };         \
+    const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))),    \
+                           add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) };  \
+    const T_VEC w49[2] = { add(w36[0],                                         \
+                               sub(sub(kWeight0, mul(kWeight2, w44[0])),       \
+                                   mul(kWeight2, w44[1]))),                    \
+                           add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) };  \
+    const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) };         \
+    const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) };         \
+    const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
+                           sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
+    const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
+                           add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
+    const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) };                      \
+    const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) };       \
+    const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) };                      \
+    const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) };       \
+    const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) };         \
+    const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) };         \
+    const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) };         \
+    const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) };         \
+    const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) };                      \
+    const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) };       \
+    const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) };                      \
+    const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) };       \
+    const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) };         \
+    const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) };         \
+    const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) };         \
+    const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) };         \
+    const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) };         \
+    const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) };         \
+    const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))),    \
+                           add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) };  \
+    const T_VEC w73[2] = { add(w60[0],                                         \
+                               sub(sub(kWeight0, mul(kWeight2, w68[0])),       \
+                                   mul(kWeight2, w68[1]))),                    \
+                           add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) };  \
+    const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) };         \
+    const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) };         \
+    const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
+                           sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
+    const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
+                           add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
+    const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) };         \
+    const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) };         \
+    const T_VEC w80[2] = {                                                     \
+      add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))),          \
+      add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0])))           \
+    };                                                                         \
+    const T_VEC w81[2] = {                                                     \
+      add(w48[0],                                                              \
+          sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))),   \
+      add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1])))           \
+    };                                                                         \
+    const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))),    \
+                           add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) };  \
+    const T_VEC w83[2] = { add(w50[0],                                         \
+                               sub(sub(kWeight0, mul(kWeight2, w74[0])),       \
+                                   mul(kWeight2, w74[1]))),                    \
+                           add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) };  \
+    const T_VEC w84[2] = {                                                     \
+      add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))),          \
+      add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0])))           \
+    };                                                                         \
+    const T_VEC w85[2] = {                                                     \
+      add(w52[0],                                                              \
+          sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))),   \
+      add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1])))           \
+    };                                                                         \
+    const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) };         \
+    const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) };         \
+    const T_VEC w88[2] = {                                                     \
+      sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
+      add(w49[1],                                                              \
+          sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0])))    \
+    };                                                                         \
+    const T_VEC w89[2] = {                                                     \
+      add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
+      add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0])))           \
+    };                                                                         \
+    const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
+                           sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
+    const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
+                           add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
+    const T_VEC w92[2] = {                                                     \
+      sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
+      add(w53[1],                                                              \
+          sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0])))    \
+    };                                                                         \
+    const T_VEC w93[2] = {                                                     \
+      add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
+      add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0])))           \
+    };                                                                         \
+    const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) };                      \
+    const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) };       \
+    const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) };                       \
+    const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) };        \
+    const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) };         \
+    const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) };         \
+    const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) };        \
+    const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) };        \
+    const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) };                     \
+    const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) };      \
+    const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) };                     \
+    const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) };      \
+    const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) };    \
+    const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) };    \
+    const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) };    \
+    const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) };    \
+    const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) };      \
+    const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) };      \
+    const T_VEC w112[2] = {                                                    \
+      add(w100[0], mul(kWeight2, add(w108[0], w108[1]))),                      \
+      add(w100[1], mul(kWeight2, sub(w108[1], w108[0])))                       \
+    };                                                                         \
+    const T_VEC w113[2] = {                                                    \
+      add(w100[0],                                                             \
+          sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
+      add(w100[1], mul(kWeight2, sub(w108[0], w108[1])))                       \
+    };                                                                         \
+    const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) };      \
+    const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) };      \
+    const T_VEC w116[2] = {                                                    \
+      sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
+      sub(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
+    };                                                                         \
+    const T_VEC w117[2] = {                                                    \
+      add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
+      add(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
+    };                                                                         \
+    const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) };                     \
+    const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) };      \
+    const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) };                     \
+    const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) };      \
+    const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) };    \
+    const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) };    \
+    const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) };    \
+    const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) };    \
+    const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) };                      \
+    const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) };       \
+    const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) };                     \
+    const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) };      \
+    const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) };    \
+    const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) };    \
+    const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) };    \
+    const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) };    \
+    const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) };    \
+    const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) };    \
+    const T_VEC w136[2] = {                                                    \
+      add(w124[0], mul(kWeight2, add(w132[0], w132[1]))),                      \
+      add(w124[1], mul(kWeight2, sub(w132[1], w132[0])))                       \
+    };                                                                         \
+    const T_VEC w137[2] = {                                                    \
+      add(w124[0],                                                             \
+          sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
+      add(w124[1], mul(kWeight2, sub(w132[0], w132[1])))                       \
+    };                                                                         \
+    const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) };    \
+    const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) };    \
+    const T_VEC w140[2] = {                                                    \
+      sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
+      sub(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
+    };                                                                         \
+    const T_VEC w141[2] = {                                                    \
+      add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
+      add(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
+    };                                                                         \
+    const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) };    \
+    const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) };    \
+    const T_VEC w144[2] = {                                                    \
+      add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))),       \
+      add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0])))        \
+    };                                                                         \
+    const T_VEC w145[2] = {                                                    \
+      add(w112[0],                                                             \
+          sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
+      add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1])))        \
+    };                                                                         \
+    const T_VEC w146[2] = {                                                    \
+      add(w114[0], mul(kWeight2, add(w138[0], w138[1]))),                      \
+      add(w114[1], mul(kWeight2, sub(w138[1], w138[0])))                       \
+    };                                                                         \
+    const T_VEC w147[2] = {                                                    \
+      add(w114[0],                                                             \
+          sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
+      add(w114[1], mul(kWeight2, sub(w138[0], w138[1])))                       \
+    };                                                                         \
+    const T_VEC w148[2] = {                                                    \
+      add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))),       \
+      add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0])))        \
+    };                                                                         \
+    const T_VEC w149[2] = {                                                    \
+      add(w116[0],                                                             \
+          sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
+      add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1])))        \
+    };                                                                         \
+    const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) };    \
+    const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) };    \
+    const T_VEC w152[2] = {                                                    \
+      sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
+      add(w113[1],                                                             \
+          sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0])))  \
+    };                                                                         \
+    const T_VEC w153[2] = {                                                    \
+      add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
+      add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0])))        \
+    };                                                                         \
+    const T_VEC w154[2] = {                                                    \
+      sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
+      sub(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
+    };                                                                         \
+    const T_VEC w155[2] = {                                                    \
+      add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
+      add(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
+    };                                                                         \
+    const T_VEC w156[2] = {                                                    \
+      sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
+      add(w117[1],                                                             \
+          sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0])))  \
+    };                                                                         \
+    const T_VEC w157[2] = {                                                    \
+      add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
+      add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0])))        \
+    };                                                                         \
+    store(output + 0 * stride, add(w78[0], w142[0]));                          \
+    store(output + 1 * stride,                                                 \
+          add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1]))));   \
+    store(output + 2 * stride,                                                 \
+          add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1]))));   \
+    store(output + 3 * stride,                                                 \
+          add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1]))));   \
+    store(output + 4 * stride,                                                 \
+          add(w86[0], mul(kWeight2, add(w150[0], w150[1]))));                  \
+    store(output + 5 * stride,                                                 \
+          add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1]))));   \
+    store(output + 6 * stride,                                                 \
+          add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1]))));   \
+    store(output + 7 * stride,                                                 \
+          add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1]))));   \
+    store(output + 8 * stride, add(w79[0], w143[1]));                          \
+    store(output + 9 * stride,                                                 \
+          sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
+    store(output + 10 * stride,                                                \
+          sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
+    store(output + 11 * stride,                                                \
+          sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
+    store(output + 12 * stride,                                                \
+          sub(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
+    store(output + 13 * stride,                                                \
+          sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
+    store(output + 14 * stride,                                                \
+          sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
+    store(output + 15 * stride,                                                \
+          sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
+    store(output + 16 * stride, sub(w78[0], w142[0]));                         \
+    store(output + 17 * stride,                                                \
+          add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])),               \
+                          mul(kWeight6, w144[1]))));                           \
+    store(output + 18 * stride,                                                \
+          add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])),               \
+                          mul(kWeight4, w146[1]))));                           \
+    store(output + 19 * stride,                                                \
+          add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])),               \
+                          mul(kWeight8, w148[1]))));                           \
+    store(output + 20 * stride,                                                \
+          add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])),               \
+                          mul(kWeight2, w150[1]))));                           \
+    store(output + 21 * stride,                                                \
+          add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])),               \
+                          mul(kWeight7, w152[1]))));                           \
+    store(output + 22 * stride,                                                \
+          add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])),               \
+                          mul(kWeight3, w154[1]))));                           \
+    store(output + 23 * stride,                                                \
+          add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])),               \
+                          mul(kWeight5, w156[1]))));                           \
+    store(output + 24 * stride, sub(w79[0], w143[1]));                         \
+    store(output + 25 * stride,                                                \
+          add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
+    store(output + 26 * stride,                                                \
+          add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
+    store(output + 27 * stride,                                                \
+          add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
+    store(output + 28 * stride,                                                \
+          add(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
+    store(output + 29 * stride,                                                \
+          add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
+    store(output + 30 * stride,                                                \
+          add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
+    store(output + 31 * stride,                                                \
+          add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
+  }
+
+#endif  // AOM_DSP_FFT_COMMON_H_
diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c
index 1ceef7782..e50f951c1 100644
--- a/third_party/aom/aom_dsp/fwd_txfm.c
+++ b/third_party/aom/aom_dsp/fwd_txfm.c
@@ -9,84 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_dsp/fwd_txfm.h"
 #include <assert.h>
-#include "./aom_dsp_rtcd.h"
-
-void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  tran_low_t intermediate[4 * 4];
-  const tran_low_t *in_low = NULL;
-  tran_low_t *out = intermediate;
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    tran_high_t in_high[4];    // canbe16
-    tran_high_t step[4];       // canbe16
-    tran_high_t temp1, temp2;  // needs32
-    int i;
-    for (i = 0; i < 4; ++i) {
-      // Load inputs.
-      if (pass == 0) {
-        in_high[0] = input[0 * stride] * 16;
-        in_high[1] = input[1 * stride] * 16;
-        in_high[2] = input[2 * stride] * 16;
-        in_high[3] = input[3 * stride] * 16;
-        if (i == 0 && in_high[0]) {
-          ++in_high[0];
-        }
-      } else {
-        assert(in_low != NULL);
-        in_high[0] = in_low[0 * 4];
-        in_high[1] = in_low[1 * 4];
-        in_high[2] = in_low[2 * 4];
-        in_high[3] = in_low[3 * 4];
-        ++in_low;
-      }
-      // Transform.
-      step[0] = in_high[0] + in_high[3];
-      step[1] = in_high[1] + in_high[2];
-      step[2] = in_high[1] - in_high[2];
-      step[3] = in_high[0] - in_high[3];
-      temp1 = (step[0] + step[1]) * cospi_16_64;
-      temp2 = (step[0] - step[1]) * cospi_16_64;
-      out[0] = (tran_low_t)fdct_round_shift(temp1);
-      out[2] = (tran_low_t)fdct_round_shift(temp2);
-      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
-      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
-      out[1] = (tran_low_t)fdct_round_shift(temp1);
-      out[3] = (tran_low_t)fdct_round_shift(temp2);
-      // Do next column (which is a transposed row in second/horizontal pass)
-      ++input;
-      out += 4;
-    }
-    // Setup in/out for next pass.
-    in_low = intermediate;
-    out = output;
-  }
-
-  {
-    int i, j;
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
-    }
-  }
-}
-
-void aom_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
-  int r, c;
-  tran_low_t sum = 0;
-  for (r = 0; r < 4; ++r)
-    for (c = 0; c < 4; ++c) sum += input[r * stride + c];
-
-  output[0] = sum << 1;
-}
+#include "aom_dsp/txfm_common.h"
+#include "config/aom_dsp_rtcd.h"
 
 void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
   int i, j;
@@ -172,596 +97,7 @@ void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
   }
 }
 
-void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  tran_low_t intermediate[256];
-  const tran_low_t *in_low = NULL;
-  tran_low_t *out = intermediate;
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    tran_high_t step1[8];      // canbe16
-    tran_high_t step2[8];      // canbe16
-    tran_high_t step3[8];      // canbe16
-    tran_high_t in_high[8];    // canbe16
-    tran_high_t temp1, temp2;  // needs32
-    int i;
-    for (i = 0; i < 16; i++) {
-      if (0 == pass) {
-        // Calculate input for the first 8 results.
-        in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
-        in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
-        in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
-        in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
-        in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
-        in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
-        in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
-        in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
-        // Calculate input for the next 8 results.
-        step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
-        step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
-        step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
-        step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
-        step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
-        step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
-        step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
-        step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
-      } else {
-        // Calculate input for the first 8 results.
-        assert(in_low != NULL);
-        in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
-        in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
-        in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
-        in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
-        in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
-        in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
-        in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
-        in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
-        // Calculate input for the next 8 results.
-        step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
-        step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
-        step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
-        step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
-        step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
-        step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
-        step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
-        step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
-        in_low++;
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-        tran_high_t t0, t1, t2, t3;                  // needs32
-        tran_high_t x0, x1, x2, x3;                  // canbe16
-
-        // stage 1
-        s0 = in_high[0] + in_high[7];
-        s1 = in_high[1] + in_high[6];
-        s2 = in_high[2] + in_high[5];
-        s3 = in_high[3] + in_high[4];
-        s4 = in_high[3] - in_high[4];
-        s5 = in_high[2] - in_high[5];
-        s6 = in_high[1] - in_high[6];
-        s7 = in_high[0] - in_high[7];
-
-        // fdct4(step, step);
-        x0 = s0 + s3;
-        x1 = s1 + s2;
-        x2 = s1 - s2;
-        x3 = s0 - s3;
-        t0 = (x0 + x1) * cospi_16_64;
-        t1 = (x0 - x1) * cospi_16_64;
-        t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
-        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-        out[0] = (tran_low_t)fdct_round_shift(t0);
-        out[4] = (tran_low_t)fdct_round_shift(t2);
-        out[8] = (tran_low_t)fdct_round_shift(t1);
-        out[12] = (tran_low_t)fdct_round_shift(t3);
-
-        // Stage 2
-        t0 = (s6 - s5) * cospi_16_64;
-        t1 = (s6 + s5) * cospi_16_64;
-        t2 = fdct_round_shift(t0);
-        t3 = fdct_round_shift(t1);
-
-        // Stage 3
-        x0 = s4 + t2;
-        x1 = s4 - t2;
-        x2 = s7 - t3;
-        x3 = s7 + t3;
-
-        // Stage 4
-        t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
-        t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
-        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-        t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-        out[2] = (tran_low_t)fdct_round_shift(t0);
-        out[6] = (tran_low_t)fdct_round_shift(t2);
-        out[10] = (tran_low_t)fdct_round_shift(t1);
-        out[14] = (tran_low_t)fdct_round_shift(t3);
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        temp1 = (step1[5] - step1[2]) * cospi_16_64;
-        temp2 = (step1[4] - step1[3]) * cospi_16_64;
-        step2[2] = fdct_round_shift(temp1);
-        step2[3] = fdct_round_shift(temp2);
-        temp1 = (step1[4] + step1[3]) * cospi_16_64;
-        temp2 = (step1[5] + step1[2]) * cospi_16_64;
-        step2[4] = fdct_round_shift(temp1);
-        step2[5] = fdct_round_shift(temp2);
-        // step 3
-        step3[0] = step1[0] + step2[3];
-        step3[1] = step1[1] + step2[2];
-        step3[2] = step1[1] - step2[2];
-        step3[3] = step1[0] - step2[3];
-        step3[4] = step1[7] - step2[4];
-        step3[5] = step1[6] - step2[5];
-        step3[6] = step1[6] + step2[5];
-        step3[7] = step1[7] + step2[4];
-        // step 4
-        temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
-        temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
-        step2[1] = fdct_round_shift(temp1);
-        step2[2] = fdct_round_shift(temp2);
-        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
-        temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
-        step2[5] = fdct_round_shift(temp1);
-        step2[6] = fdct_round_shift(temp2);
-        // step 5
-        step1[0] = step3[0] + step2[1];
-        step1[1] = step3[0] - step2[1];
-        step1[2] = step3[3] + step2[2];
-        step1[3] = step3[3] - step2[2];
-        step1[4] = step3[4] - step2[5];
-        step1[5] = step3[4] + step2[5];
-        step1[6] = step3[7] - step2[6];
-        step1[7] = step3[7] + step2[6];
-        // step 6
-        temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
-        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-        out[1] = (tran_low_t)fdct_round_shift(temp1);
-        out[9] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
-        temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
-        out[5] = (tran_low_t)fdct_round_shift(temp1);
-        out[13] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
-        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-        out[3] = (tran_low_t)fdct_round_shift(temp1);
-        out[11] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
-        temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
-        out[7] = (tran_low_t)fdct_round_shift(temp1);
-        out[15] = (tran_low_t)fdct_round_shift(temp2);
-      }
-      // Do next column (which is a transposed row in second/horizontal pass)
-      input++;
-      out += 16;
-    }
-    // Setup in/out for next pass.
-    in_low = intermediate;
-    out = output;
-  }
-}
-
-static INLINE tran_high_t dct_32_round(tran_high_t input) {
-  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  // TODO(debargha, peter.derivaz): Find new bounds for this assert,
-  // and make the bounds consts.
-  // assert(-131072 <= rv && rv <= 131071);
-  return rv;
-}
-
-static INLINE tran_high_t half_round_shift(tran_high_t input) {
-  tran_high_t rv = (input + 1 + (input < 0)) >> 2;
-  return rv;
-}
-
-void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
-  tran_high_t step[32];
-  // Stage 1
-  step[0] = input[0] + input[(32 - 1)];
-  step[1] = input[1] + input[(32 - 2)];
-  step[2] = input[2] + input[(32 - 3)];
-  step[3] = input[3] + input[(32 - 4)];
-  step[4] = input[4] + input[(32 - 5)];
-  step[5] = input[5] + input[(32 - 6)];
-  step[6] = input[6] + input[(32 - 7)];
-  step[7] = input[7] + input[(32 - 8)];
-  step[8] = input[8] + input[(32 - 9)];
-  step[9] = input[9] + input[(32 - 10)];
-  step[10] = input[10] + input[(32 - 11)];
-  step[11] = input[11] + input[(32 - 12)];
-  step[12] = input[12] + input[(32 - 13)];
-  step[13] = input[13] + input[(32 - 14)];
-  step[14] = input[14] + input[(32 - 15)];
-  step[15] = input[15] + input[(32 - 16)];
-  step[16] = -input[16] + input[(32 - 17)];
-  step[17] = -input[17] + input[(32 - 18)];
-  step[18] = -input[18] + input[(32 - 19)];
-  step[19] = -input[19] + input[(32 - 20)];
-  step[20] = -input[20] + input[(32 - 21)];
-  step[21] = -input[21] + input[(32 - 22)];
-  step[22] = -input[22] + input[(32 - 23)];
-  step[23] = -input[23] + input[(32 - 24)];
-  step[24] = -input[24] + input[(32 - 25)];
-  step[25] = -input[25] + input[(32 - 26)];
-  step[26] = -input[26] + input[(32 - 27)];
-  step[27] = -input[27] + input[(32 - 28)];
-  step[28] = -input[28] + input[(32 - 29)];
-  step[29] = -input[29] + input[(32 - 30)];
-  step[30] = -input[30] + input[(32 - 31)];
-  step[31] = -input[31] + input[(32 - 32)];
-
-  // Stage 2
-  output[0] = step[0] + step[16 - 1];
-  output[1] = step[1] + step[16 - 2];
-  output[2] = step[2] + step[16 - 3];
-  output[3] = step[3] + step[16 - 4];
-  output[4] = step[4] + step[16 - 5];
-  output[5] = step[5] + step[16 - 6];
-  output[6] = step[6] + step[16 - 7];
-  output[7] = step[7] + step[16 - 8];
-  output[8] = -step[8] + step[16 - 9];
-  output[9] = -step[9] + step[16 - 10];
-  output[10] = -step[10] + step[16 - 11];
-  output[11] = -step[11] + step[16 - 12];
-  output[12] = -step[12] + step[16 - 13];
-  output[13] = -step[13] + step[16 - 14];
-  output[14] = -step[14] + step[16 - 15];
-  output[15] = -step[15] + step[16 - 16];
-
-  output[16] = step[16];
-  output[17] = step[17];
-  output[18] = step[18];
-  output[19] = step[19];
-
-  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
-  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
-  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
-  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
-
-  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
-  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
-  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
-  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
-
-  output[28] = step[28];
-  output[29] = step[29];
-  output[30] = step[30];
-  output[31] = step[31];
-
-  // dump the magnitude by 4, hence the intermediate values are within
-  // the range of 16 bits.
-  if (round) {
-    output[0] = half_round_shift(output[0]);
-    output[1] = half_round_shift(output[1]);
-    output[2] = half_round_shift(output[2]);
-    output[3] = half_round_shift(output[3]);
-    output[4] = half_round_shift(output[4]);
-    output[5] = half_round_shift(output[5]);
-    output[6] = half_round_shift(output[6]);
-    output[7] = half_round_shift(output[7]);
-    output[8] = half_round_shift(output[8]);
-    output[9] = half_round_shift(output[9]);
-    output[10] = half_round_shift(output[10]);
-    output[11] = half_round_shift(output[11]);
-    output[12] = half_round_shift(output[12]);
-    output[13] = half_round_shift(output[13]);
-    output[14] = half_round_shift(output[14]);
-    output[15] = half_round_shift(output[15]);
-
-    output[16] = half_round_shift(output[16]);
-    output[17] = half_round_shift(output[17]);
-    output[18] = half_round_shift(output[18]);
-    output[19] = half_round_shift(output[19]);
-    output[20] = half_round_shift(output[20]);
-    output[21] = half_round_shift(output[21]);
-    output[22] = half_round_shift(output[22]);
-    output[23] = half_round_shift(output[23]);
-    output[24] = half_round_shift(output[24]);
-    output[25] = half_round_shift(output[25]);
-    output[26] = half_round_shift(output[26]);
-    output[27] = half_round_shift(output[27]);
-    output[28] = half_round_shift(output[28]);
-    output[29] = half_round_shift(output[29]);
-    output[30] = half_round_shift(output[30]);
-    output[31] = half_round_shift(output[31]);
-  }
-
-  // Stage 3
-  step[0] = output[0] + output[(8 - 1)];
-  step[1] = output[1] + output[(8 - 2)];
-  step[2] = output[2] + output[(8 - 3)];
-  step[3] = output[3] + output[(8 - 4)];
-  step[4] = -output[4] + output[(8 - 5)];
-  step[5] = -output[5] + output[(8 - 6)];
-  step[6] = -output[6] + output[(8 - 7)];
-  step[7] = -output[7] + output[(8 - 8)];
-  step[8] = output[8];
-  step[9] = output[9];
-  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
-  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
-  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
-  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
-  step[14] = output[14];
-  step[15] = output[15];
-
-  step[16] = output[16] + output[23];
-  step[17] = output[17] + output[22];
-  step[18] = output[18] + output[21];
-  step[19] = output[19] + output[20];
-  step[20] = -output[20] + output[19];
-  step[21] = -output[21] + output[18];
-  step[22] = -output[22] + output[17];
-  step[23] = -output[23] + output[16];
-  step[24] = -output[24] + output[31];
-  step[25] = -output[25] + output[30];
-  step[26] = -output[26] + output[29];
-  step[27] = -output[27] + output[28];
-  step[28] = output[28] + output[27];
-  step[29] = output[29] + output[26];
-  step[30] = output[30] + output[25];
-  step[31] = output[31] + output[24];
-
-  // Stage 4
-  output[0] = step[0] + step[3];
-  output[1] = step[1] + step[2];
-  output[2] = -step[2] + step[1];
-  output[3] = -step[3] + step[0];
-  output[4] = step[4];
-  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
-  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
-  output[7] = step[7];
-  output[8] = step[8] + step[11];
-  output[9] = step[9] + step[10];
-  output[10] = -step[10] + step[9];
-  output[11] = -step[11] + step[8];
-  output[12] = -step[12] + step[15];
-  output[13] = -step[13] + step[14];
-  output[14] = step[14] + step[13];
-  output[15] = step[15] + step[12];
-
-  output[16] = step[16];
-  output[17] = step[17];
-  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
-  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
-  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
-  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
-  output[22] = step[22];
-  output[23] = step[23];
-  output[24] = step[24];
-  output[25] = step[25];
-  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
-  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
-  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
-  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
-  output[30] = step[30];
-  output[31] = step[31];
-
-  // Stage 5
-  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
-  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
-  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
-  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
-  step[4] = output[4] + output[5];
-  step[5] = -output[5] + output[4];
-  step[6] = -output[6] + output[7];
-  step[7] = output[7] + output[6];
-  step[8] = output[8];
-  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
-  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
-  step[11] = output[11];
-  step[12] = output[12];
-  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
-  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
-  step[15] = output[15];
-
-  step[16] = output[16] + output[19];
-  step[17] = output[17] + output[18];
-  step[18] = -output[18] + output[17];
-  step[19] = -output[19] + output[16];
-  step[20] = -output[20] + output[23];
-  step[21] = -output[21] + output[22];
-  step[22] = output[22] + output[21];
-  step[23] = output[23] + output[20];
-  step[24] = output[24] + output[27];
-  step[25] = output[25] + output[26];
-  step[26] = -output[26] + output[25];
-  step[27] = -output[27] + output[24];
-  step[28] = -output[28] + output[31];
-  step[29] = -output[29] + output[30];
-  step[30] = output[30] + output[29];
-  step[31] = output[31] + output[28];
-
-  // Stage 6
-  output[0] = step[0];
-  output[1] = step[1];
-  output[2] = step[2];
-  output[3] = step[3];
-  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
-  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
-  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
-  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
-  output[8] = step[8] + step[9];
-  output[9] = -step[9] + step[8];
-  output[10] = -step[10] + step[11];
-  output[11] = step[11] + step[10];
-  output[12] = step[12] + step[13];
-  output[13] = -step[13] + step[12];
-  output[14] = -step[14] + step[15];
-  output[15] = step[15] + step[14];
-
-  output[16] = step[16];
-  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
-  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
-  output[19] = step[19];
-  output[20] = step[20];
-  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
-  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
-  output[23] = step[23];
-  output[24] = step[24];
-  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
-  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
-  output[27] = step[27];
-  output[28] = step[28];
-  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
-  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
-  output[31] = step[31];
-
-  // Stage 7
-  step[0] = output[0];
-  step[1] = output[1];
-  step[2] = output[2];
-  step[3] = output[3];
-  step[4] = output[4];
-  step[5] = output[5];
-  step[6] = output[6];
-  step[7] = output[7];
-  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
-  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
-  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
-  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
-  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
-  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
-  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
-  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
-
-  step[16] = output[16] + output[17];
-  step[17] = -output[17] + output[16];
-  step[18] = -output[18] + output[19];
-  step[19] = output[19] + output[18];
-  step[20] = output[20] + output[21];
-  step[21] = -output[21] + output[20];
-  step[22] = -output[22] + output[23];
-  step[23] = output[23] + output[22];
-  step[24] = output[24] + output[25];
-  step[25] = -output[25] + output[24];
-  step[26] = -output[26] + output[27];
-  step[27] = output[27] + output[26];
-  step[28] = output[28] + output[29];
-  step[29] = -output[29] + output[28];
-  step[30] = -output[30] + output[31];
-  step[31] = output[31] + output[30];
-
-  // Final stage --- outputs indices are bit-reversed.
-  output[0] = step[0];
-  output[16] = step[1];
-  output[8] = step[2];
-  output[24] = step[3];
-  output[4] = step[4];
-  output[20] = step[5];
-  output[12] = step[6];
-  output[28] = step[7];
-  output[2] = step[8];
-  output[18] = step[9];
-  output[10] = step[10];
-  output[26] = step[11];
-  output[6] = step[12];
-  output[22] = step[13];
-  output[14] = step[14];
-  output[30] = step[15];
-
-  output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
-  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
-  output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
-  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
-  output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
-  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
-  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
-  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
-  output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
-  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
-  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
-  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
-  output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
-  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
-  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
-  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
-}
-
-void aom_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
-  int i, j;
-  tran_high_t output[32 * 32];
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
-    aom_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
-    aom_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      out[j + i * 32] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-  }
-}
-
-// Note that although we use dct_32_round in dct32 computation flow,
-// this 2d fdct32x32 for rate-distortion optimization loop is operating
-// within 16 bits precision.
-void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
-  int i, j;
-  tran_high_t output[32 * 32];
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
-    aom_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      // TODO(cd): see quality impact of only doing
-      //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
-      //           PS: also change code in aom_dsp/x86/aom_dct_sse2.c
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
-    aom_fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
-  }
-}
-
-#if CONFIG_HIGHBITDEPTH
-void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  aom_fdct4x4_c(input, output, stride);
-}
-
 void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
                           int stride) {
   aom_fdct8x8_c(input, final_output, stride);
 }
-
-void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
-                            int stride) {
-  aom_fdct16x16_c(input, output, stride);
-}
-
-void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
-  aom_fdct32x32_c(input, out, stride);
-}
-void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
-                               int stride) {
-  aom_fdct32x32_rd_c(input, out, stride);
-}
-
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/fwd_txfm.h b/third_party/aom/aom_dsp/fwd_txfm.h
deleted file mode 100644
index f4dc04ab4..000000000
--- a/third_party/aom/aom_dsp/fwd_txfm.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_FWD_TXFM_H_
-#define AOM_DSP_FWD_TXFM_H_
-
-#include "aom_dsp/txfm_common.h"
-
-static INLINE tran_high_t saturate_int16(tran_high_t value) {
-  tran_high_t result;
-  result = value > INT16_MAX ? INT16_MAX : value;
-  return result < INT16_MIN ? INT16_MIN : result;
-}
-
-void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round);
-#endif  // AOM_DSP_FWD_TXFM_H_
diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c
new file mode 100644
index 000000000..fcb6c290e
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_synthesis.c
@@ -0,0 +1,1392 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain parameters and film grain synthesis
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_mem/aom_mem.h"
+
+// Samples with Gaussian distribution in the range of [-2048, 2047] (12 bits)
+// with zero mean and standard deviation of about 512.
+// should be divided by 4 for 10-bit range and 16 for 8-bit range.
+static const int gaussian_sequence[2048] = {
+  56,    568,   -180,  172,   124,   -84,   172,   -64,   -900,  24,   820,
+  224,   1248,  996,   272,   -8,    -916,  -388,  -732,  -104,  -188, 800,
+  112,   -652,  -320,  -376,  140,   -252,  492,   -168,  44,    -788, 588,
+  -584,  500,   -228,  12,    680,   272,   -476,  972,   -100,  652,  368,
+  432,   -196,  -720,  -192,  1000,  -332,  652,   -136,  -552,  -604, -4,
+  192,   -220,  -136,  1000,  -52,   372,   -96,   -624,  124,   -24,  396,
+  540,   -12,   -104,  640,   464,   244,   -208,  -84,   368,   -528, -740,
+  248,   -968,  -848,  608,   376,   -60,   -292,  -40,   -156,  252,  -292,
+  248,   224,   -280,  400,   -244,  244,   -60,   76,    -80,   212,  532,
+  340,   128,   -36,   824,   -352,  -60,   -264,  -96,   -612,  416,  -704,
+  220,   -204,  640,   -160,  1220,  -408,  900,   336,   20,    -336, -96,
+  -792,  304,   48,    -28,   -1232, -1172, -448,  104,   -292,  -520, 244,
+  60,    -948,  0,     -708,  268,   108,   356,   -548,  488,   -344, -136,
+  488,   -196,  -224,  656,   -236,  -1128, 60,    4,     140,   276,  -676,
+  -376,  168,   -108,  464,   8,     564,   64,    240,   308,   -300, -400,
+  -456,  -136,  56,    120,   -408,  -116,  436,   504,   -232,  328,  844,
+  -164,  -84,   784,   -168,  232,   -224,  348,   -376,  128,   568,  96,
+  -1244, -288,  276,   848,   832,   -360,  656,   464,   -384,  -332, -356,
+  728,   -388,  160,   -192,  468,   296,   224,   140,   -776,  -100, 280,
+  4,     196,   44,    -36,   -648,  932,   16,    1428,  28,    528,  808,
+  772,   20,    268,   88,    -332,  -284,  124,   -384,  -448,  208,  -228,
+  -1044, -328,  660,   380,   -148,  -300,  588,   240,   540,   28,   136,
+  -88,   -436,  256,   296,   -1000, 1400,  0,     -48,   1056,  -136, 264,
+  -528,  -1108, 632,   -484,  -592,  -344,  796,   124,   -668,  -768, 388,
+  1296,  -232,  -188,  -200,  -288,  -4,    308,   100,   -168,  256,  -500,
+  204,   -508,  648,   -136,  372,   -272,  -120,  -1004, -552,  -548, -384,
+  548,   -296,  428,   -108,  -8,    -912,  -324,  -224,  -88,   -112, -220,
+  -100,  996,   -796,  548,   360,   -216,  180,   428,   -200,  -212, 148,
+  96,    148,   284,   216,   -412,  -320,  120,   -300,  -384,  -604, -572,
+  -332,  -8,    -180,  -176,  696,   116,   -88,   628,   76,    44,   -516,
+  240,   -208,  -40,   100,   -592,  344,   -308,  -452,  -228,  20,   916,
+  -1752, -136,  -340,  -804,  140,   40,    512,   340,   248,   184,  -492,
+  896,   -156,  932,   -628,  328,   -688,  -448,  -616,  -752,  -100, 560,
+  -1020, 180,   -800,  -64,   76,    576,   1068,  396,   660,   552,  -108,
+  -28,   320,   -628,  312,   -92,   -92,   -472,  268,   16,    560,  516,
+  -672,  -52,   492,   -100,  260,   384,   284,   292,   304,   -148, 88,
+  -152,  1012,  1064,  -228,  164,   -376,  -684,  592,   -392,  156,  196,
+  -524,  -64,   -884,  160,   -176,  636,   648,   404,   -396,  -436, 864,
+  424,   -728,  988,   -604,  904,   -592,  296,   -224,  536,   -176, -920,
+  436,   -48,   1176,  -884,  416,   -776,  -824,  -884,  524,   -548, -564,
+  -68,   -164,  -96,   692,   364,   -692,  -1012, -68,   260,   -480, 876,
+  -1116, 452,   -332,  -352,  892,   -1088, 1220,  -676,  12,    -292, 244,
+  496,   372,   -32,   280,   200,   112,   -440,  -96,   24,    -644, -184,
+  56,    -432,  224,   -980,  272,   -260,  144,   -436,  420,   356,  364,
+  -528,  76,    172,   -744,  -368,  404,   -752,  -416,  684,   -688, 72,
+  540,   416,   92,    444,   480,   -72,   -1416, 164,   -1172, -68,  24,
+  424,   264,   1040,  128,   -912,  -524,  -356,  64,    876,   -12,  4,
+  -88,   532,   272,   -524,  320,   276,   -508,  940,   24,    -400, -120,
+  756,   60,    236,   -412,  100,   376,   -484,  400,   -100,  -740, -108,
+  -260,  328,   -268,  224,   -200,  -416,  184,   -604,  -564,  -20,  296,
+  60,    892,   -888,  60,    164,   68,    -760,  216,   -296,  904,  -336,
+  -28,   404,   -356,  -568,  -208,  -1480, -512,  296,   328,   -360, -164,
+  -1560, -776,  1156,  -428,  164,   -504,  -112,  120,   -216,  -148, -264,
+  308,   32,    64,    -72,   72,    116,   176,   -64,   -272,  460,  -536,
+  -784,  -280,  348,   108,   -752,  -132,  524,   -540,  -776,  116,  -296,
+  -1196, -288,  -560,  1040,  -472,  116,   -848,  -1116, 116,   636,  696,
+  284,   -176,  1016,  204,   -864,  -648,  -248,  356,   972,   -584, -204,
+  264,   880,   528,   -24,   -184,  116,   448,   -144,  828,   524,  212,
+  -212,  52,    12,    200,   268,   -488,  -404,  -880,  824,   -672, -40,
+  908,   -248,  500,   716,   -576,  492,   -576,  16,    720,   -108, 384,
+  124,   344,   280,   576,   -500,  252,   104,   -308,  196,   -188, -8,
+  1268,  296,   1032,  -1196, 436,   316,   372,   -432,  -200,  -660, 704,
+  -224,  596,   -132,  268,   32,    -452,  884,   104,   -1008, 424,  -1348,
+  -280,  4,     -1168, 368,   476,   696,   300,   -8,    24,    180,  -592,
+  -196,  388,   304,   500,   724,   -160,  244,   -84,   272,   -256, -420,
+  320,   208,   -144,  -156,  156,   364,   452,   28,    540,   316,  220,
+  -644,  -248,  464,   72,    360,   32,    -388,  496,   -680,  -48,  208,
+  -116,  -408,  60,    -604,  -392,  548,   -840,  784,   -460,  656,  -544,
+  -388,  -264,  908,   -800,  -628,  -612,  -568,  572,   -220,  164,  288,
+  -16,   -308,  308,   -112,  -636,  -760,  280,   -668,  432,   364,  240,
+  -196,  604,   340,   384,   196,   592,   -44,   -500,  432,   -580, -132,
+  636,   -76,   392,   4,     -412,  540,   508,   328,   -356,  -36,  16,
+  -220,  -64,   -248,  -60,   24,    -192,  368,   1040,  92,    -24,  -1044,
+  -32,   40,    104,   148,   192,   -136,  -520,  56,    -816,  -224, 732,
+  392,   356,   212,   -80,   -424,  -1008, -324,  588,   -1496, 576,  460,
+  -816,  -848,  56,    -580,  -92,   -1372, -112,  -496,  200,   364,  52,
+  -140,  48,    -48,   -60,   84,    72,    40,    132,   -356,  -268, -104,
+  -284,  -404,  732,   -520,  164,   -304,  -540,  120,   328,   -76,  -460,
+  756,   388,   588,   236,   -436,  -72,   -176,  -404,  -316,  -148, 716,
+  -604,  404,   -72,   -88,   -888,  -68,   944,   88,    -220,  -344, 960,
+  472,   460,   -232,  704,   120,   832,   -228,  692,   -508,  132,  -476,
+  844,   -748,  -364,  -44,   1116,  -1104, -1056, 76,    428,   552,  -692,
+  60,    356,   96,    -384,  -188,  -612,  -576,  736,   508,   892,  352,
+  -1132, 504,   -24,   -352,  324,   332,   -600,  -312,  292,   508,  -144,
+  -8,    484,   48,    284,   -260,  -240,  256,   -100,  -292,  -204, -44,
+  472,   -204,  908,   -188,  -1000, -256,  92,    1164,  -392,  564,  356,
+  652,   -28,   -884,  256,   484,   -192,  760,   -176,  376,   -524, -452,
+  -436,  860,   -736,  212,   124,   504,   -476,  468,   76,    -472, 552,
+  -692,  -944,  -620,  740,   -240,  400,   132,   20,    192,   -196, 264,
+  -668,  -1012, -60,   296,   -316,  -828,  76,    -156,  284,   -768, -448,
+  -832,  148,   248,   652,   616,   1236,  288,   -328,  -400,  -124, 588,
+  220,   520,   -696,  1032,  768,   -740,  -92,   -272,  296,   448,  -464,
+  412,   -200,  392,   440,   -200,  264,   -152,  -260,  320,   1032, 216,
+  320,   -8,    -64,   156,   -1016, 1084,  1172,  536,   484,   -432, 132,
+  372,   -52,   -256,  84,    116,   -352,  48,    116,   304,   -384, 412,
+  924,   -300,  528,   628,   180,   648,   44,    -980,  -220,  1320, 48,
+  332,   748,   524,   -268,  -720,  540,   -276,  564,   -344,  -208, -196,
+  436,   896,   88,    -392,  132,   80,    -964,  -288,  568,   56,   -48,
+  -456,  888,   8,     552,   -156,  -292,  948,   288,   128,   -716, -292,
+  1192,  -152,  876,   352,   -600,  -260,  -812,  -468,  -28,   -120, -32,
+  -44,   1284,  496,   192,   464,   312,   -76,   -516,  -380,  -456, -1012,
+  -48,   308,   -156,  36,    492,   -156,  -808,  188,   1652,  68,   -120,
+  -116,  316,   160,   -140,  352,   808,   -416,  592,   316,   -480, 56,
+  528,   -204,  -568,  372,   -232,  752,   -344,  744,   -4,    324,  -416,
+  -600,  768,   268,   -248,  -88,   -132,  -420,  -432,  80,    -288, 404,
+  -316,  -1216, -588,  520,   -108,  92,    -320,  368,   -480,  -216, -92,
+  1688,  -300,  180,   1020,  -176,  820,   -68,   -228,  -260,  436,  -904,
+  20,    40,    -508,  440,   -736,  312,   332,   204,   760,   -372, 728,
+  96,    -20,   -632,  -520,  -560,  336,   1076,  -64,   -532,  776,  584,
+  192,   396,   -728,  -520,  276,   -188,  80,    -52,   -612,  -252, -48,
+  648,   212,   -688,  228,   -52,   -260,  428,   -412,  -272,  -404, 180,
+  816,   -796,  48,    152,   484,   -88,   -216,  988,   696,   188,  -528,
+  648,   -116,  -180,  316,   476,   12,    -564,  96,    476,   -252, -364,
+  -376,  -392,  556,   -256,  -576,  260,   -352,  120,   -16,   -136, -260,
+  -492,  72,    556,   660,   580,   616,   772,   436,   424,   -32,  -324,
+  -1268, 416,   -324,  -80,   920,   160,   228,   724,   32,    -516, 64,
+  384,   68,    -128,  136,   240,   248,   -204,  -68,   252,   -932, -120,
+  -480,  -628,  -84,   192,   852,   -404,  -288,  -132,  204,   100,  168,
+  -68,   -196,  -868,  460,   1080,  380,   -80,   244,   0,     484,  -888,
+  64,    184,   352,   600,   460,   164,   604,   -196,  320,   -64,  588,
+  -184,  228,   12,    372,   48,    -848,  -344,  224,   208,   -200, 484,
+  128,   -20,   272,   -468,  -840,  384,   256,   -720,  -520,  -464, -580,
+  112,   -120,  644,   -356,  -208,  -608,  -528,  704,   560,   -424, 392,
+  828,   40,    84,    200,   -152,  0,     -144,  584,   280,   -120, 80,
+  -556,  -972,  -196,  -472,  724,   80,    168,   -32,   88,    160,  -688,
+  0,     160,   356,   372,   -776,  740,   -128,  676,   -248,  -480, 4,
+  -364,  96,    544,   232,   -1032, 956,   236,   356,   20,    -40,  300,
+  24,    -676,  -596,  132,   1120,  -104,  532,   -1096, 568,   648,  444,
+  508,   380,   188,   -376,  -604,  1488,  424,   24,    756,   -220, -192,
+  716,   120,   920,   688,   168,   44,    -460,  568,   284,   1144, 1160,
+  600,   424,   888,   656,   -356,  -320,  220,   316,   -176,  -724, -188,
+  -816,  -628,  -348,  -228,  -380,  1012,  -452,  -660,  736,   928,  404,
+  -696,  -72,   -268,  -892,  128,   184,   -344,  -780,  360,   336,  400,
+  344,   428,   548,   -112,  136,   -228,  -216,  -820,  -516,  340,  92,
+  -136,  116,   -300,  376,   -244,  100,   -316,  -520,  -284,  -12,  824,
+  164,   -548,  -180,  -128,  116,   -924,  -828,  268,   -368,  -580, 620,
+  192,   160,   0,     -1676, 1068,  424,   -56,   -360,  468,   -156, 720,
+  288,   -528,  556,   -364,  548,   -148,  504,   316,   152,   -648, -620,
+  -684,  -24,   -376,  -384,  -108,  -920,  -1032, 768,   180,   -264, -508,
+  -1268, -260,  -60,   300,   -240,  988,   724,   -376,  -576,  -212, -736,
+  556,   192,   1092,  -620,  -880,  376,   -56,   -4,    -216,  -32,  836,
+  268,   396,   1332,  864,   -600,  100,   56,    -412,  -92,   356,  180,
+  884,   -468,  -436,  292,   -388,  -804,  -704,  -840,  368,   -348, 140,
+  -724,  1536,  940,   372,   112,   -372,  436,   -480,  1136,  296,  -32,
+  -228,  132,   -48,   -220,  868,   -1016, -60,   -1044, -464,  328,  916,
+  244,   12,    -736,  -296,  360,   468,   -376,  -108,  -92,   788,  368,
+  -56,   544,   400,   -672,  -420,  728,   16,    320,   44,    -284, -380,
+  -796,  488,   132,   204,   -596,  -372,  88,    -152,  -908,  -636, -572,
+  -624,  -116,  -692,  -200,  -56,   276,   -88,   484,   -324,  948,  864,
+  1000,  -456,  -184,  -276,  292,   -296,  156,   676,   320,   160,  908,
+  -84,   -1236, -288,  -116,  260,   -372,  -644,  732,   -756,  -96,  84,
+  344,   -520,  348,   -688,  240,   -84,   216,   -1044, -136,  -676, -396,
+  -1500, 960,   -40,   176,   168,   1516,  420,   -504,  -344,  -364, -360,
+  1216,  -940,  -380,  -212,  252,   -660,  -708,  484,   -444,  -152, 928,
+  -120,  1112,  476,   -260,  560,   -148,  -344,  108,   -196,  228,  -288,
+  504,   560,   -328,  -88,   288,   -1008, 460,   -228,  468,   -836, -196,
+  76,    388,   232,   412,   -1168, -716,  -644,  756,   -172,  -356, -504,
+  116,   432,   528,   48,    476,   -168,  -608,  448,   160,   -532, -272,
+  28,    -676,  -12,   828,   980,   456,   520,   104,   -104,  256,  -344,
+  -4,    -28,   -368,  -52,   -524,  -572,  -556,  -200,  768,   1124, -208,
+  -512,  176,   232,   248,   -148,  -888,  604,   -600,  -304,  804,  -156,
+  -212,  488,   -192,  -804,  -256,  368,   -360,  -916,  -328,  228,  -240,
+  -448,  -472,  856,   -556,  -364,  572,   -12,   -156,  -368,  -340, 432,
+  252,   -752,  -152,  288,   268,   -580,  -848,  -592,  108,   -76,  244,
+  312,   -716,  592,   -80,   436,   360,   4,     -248,  160,   516,  584,
+  732,   44,    -468,  -280,  -292,  -156,  -588,  28,    308,   912,  24,
+  124,   156,   180,   -252,  944,   -924,  -772,  -520,  -428,  -624, 300,
+  -212,  -1144, 32,    -724,  800,   -1128, -212,  -1288, -848,  180,  -416,
+  440,   192,   -576,  -792,  -76,   -1080, 80,    -532,  -352,  -132, 380,
+  -820,  148,   1112,  128,   164,   456,   700,   -924,  144,   -668, -384,
+  648,   -832,  508,   552,   -52,   -100,  -656,  208,   -568,  748,  -88,
+  680,   232,   300,   192,   -408,  -1012, -152,  -252,  -268,  272,  -876,
+  -664,  -648,  -332,  -136,  16,    12,    1152,  -28,   332,   -536, 320,
+  -672,  -460,  -316,  532,   -260,  228,   -40,   1052,  -816,  180,  88,
+  -496,  -556,  -672,  -368,  428,   92,    356,   404,   -408,  252,  196,
+  -176,  -556,  792,   268,   32,    372,   40,    96,    -332,  328,  120,
+  372,   -900,  -40,   472,   -264,  -592,  952,   128,   656,   112,  664,
+  -232,  420,   4,     -344,  -464,  556,   244,   -416,  -32,   252,  0,
+  -412,  188,   -696,  508,   -476,  324,   -1096, 656,   -312,  560,  264,
+  -136,  304,   160,   -64,   -580,  248,   336,   -720,  560,   -348, -288,
+  -276,  -196,  -500,  852,   -544,  -236,  -1128, -992,  -776,  116,  56,
+  52,    860,   884,   212,   -12,   168,   1020,  512,   -552,  924,  -148,
+  716,   188,   164,   -340,  -520,  -184,  880,   -152,  -680,  -208, -1156,
+  -300,  -528,  -472,  364,   100,   -744,  -1056, -32,   540,   280,  144,
+  -676,  -32,   -232,  -280,  -224,  96,    568,   -76,   172,   148,  148,
+  104,   32,    -296,  -32,   788,   -80,   32,    -16,   280,   288,  944,
+  428,   -484
+};
+
+static const int gauss_bits = 11;
+
+static int luma_subblock_size_y = 32;
+static int luma_subblock_size_x = 32;
+
+static int chroma_subblock_size_y = 16;
+static int chroma_subblock_size_x = 16;
+
+static const int min_luma_legal_range = 16;
+static const int max_luma_legal_range = 235;
+
+static const int min_chroma_legal_range = 16;
+static const int max_chroma_legal_range = 240;
+
+static int scaling_lut_y[256];
+static int scaling_lut_cb[256];
+static int scaling_lut_cr[256];
+
+static int grain_center;
+static int grain_min;
+static int grain_max;
+
+static uint16_t random_register = 0;  // random number generator register
+
+static void init_arrays(aom_film_grain_t *params, int luma_stride,
+                        int chroma_stride, int ***pred_pos_luma_p,
+                        int ***pred_pos_chroma_p, int **luma_grain_block,
+                        int **cb_grain_block, int **cr_grain_block,
+                        int **y_line_buf, int **cb_line_buf, int **cr_line_buf,
+                        int **y_col_buf, int **cb_col_buf, int **cr_col_buf,
+                        int luma_grain_samples, int chroma_grain_samples,
+                        int chroma_subsamp_y, int chroma_subsamp_x) {
+  memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256);
+  memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256);
+  memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256);
+
+  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+  int num_pos_chroma = num_pos_luma;
+  if (params->num_y_points > 0) ++num_pos_chroma;
+
+  int **pred_pos_luma;
+  int **pred_pos_chroma;
+
+  pred_pos_luma = (int **)aom_malloc(sizeof(*pred_pos_luma) * num_pos_luma);
+
+  for (int row = 0; row < num_pos_luma; row++) {
+    pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3);
+  }
+
+  pred_pos_chroma =
+      (int **)aom_malloc(sizeof(*pred_pos_chroma) * num_pos_chroma);
+
+  for (int row = 0; row < num_pos_chroma; row++) {
+    pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3);
+  }
+
+  int pos_ar_index = 0;
+
+  for (int row = -params->ar_coeff_lag; row < 0; row++) {
+    for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1;
+         col++) {
+      pred_pos_luma[pos_ar_index][0] = row;
+      pred_pos_luma[pos_ar_index][1] = col;
+      pred_pos_luma[pos_ar_index][2] = 0;
+
+      pred_pos_chroma[pos_ar_index][0] = row;
+      pred_pos_chroma[pos_ar_index][1] = col;
+      pred_pos_chroma[pos_ar_index][2] = 0;
+      ++pos_ar_index;
+    }
+  }
+
+  for (int col = -params->ar_coeff_lag; col < 0; col++) {
+    pred_pos_luma[pos_ar_index][0] = 0;
+    pred_pos_luma[pos_ar_index][1] = col;
+    pred_pos_luma[pos_ar_index][2] = 0;
+
+    pred_pos_chroma[pos_ar_index][0] = 0;
+    pred_pos_chroma[pos_ar_index][1] = col;
+    pred_pos_chroma[pos_ar_index][2] = 0;
+
+    ++pos_ar_index;
+  }
+
+  if (params->num_y_points > 0) {
+    pred_pos_chroma[pos_ar_index][0] = 0;
+    pred_pos_chroma[pos_ar_index][1] = 0;
+    pred_pos_chroma[pos_ar_index][2] = 1;
+  }
+
+  *pred_pos_luma_p = pred_pos_luma;
+  *pred_pos_chroma_p = pred_pos_chroma;
+
+  *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2);
+  *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride *
+                                   (2 >> chroma_subsamp_y));
+  *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride *
+                                   (2 >> chroma_subsamp_y));
+
+  *y_col_buf =
+      (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2);
+  *cb_col_buf =
+      (int *)aom_malloc(sizeof(**cb_col_buf) *
+                        (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
+                        (2 >> chroma_subsamp_x));
+  *cr_col_buf =
+      (int *)aom_malloc(sizeof(**cr_col_buf) *
+                        (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
+                        (2 >> chroma_subsamp_x));
+
+  *luma_grain_block =
+      (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples);
+  *cb_grain_block =
+      (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples);
+  *cr_grain_block =
+      (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples);
+}
+
+static void dealloc_arrays(aom_film_grain_t *params, int ***pred_pos_luma,
+                           int ***pred_pos_chroma, int **luma_grain_block,
+                           int **cb_grain_block, int **cr_grain_block,
+                           int **y_line_buf, int **cb_line_buf,
+                           int **cr_line_buf, int **y_col_buf, int **cb_col_buf,
+                           int **cr_col_buf) {
+  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+  int num_pos_chroma = num_pos_luma;
+  if (params->num_y_points > 0) ++num_pos_chroma;
+
+  for (int row = 0; row < num_pos_luma; row++) {
+    aom_free((*pred_pos_luma)[row]);
+  }
+  aom_free(*pred_pos_luma);
+
+  for (int row = 0; row < num_pos_chroma; row++) {
+    aom_free((*pred_pos_chroma)[row]);
+  }
+  aom_free((*pred_pos_chroma));
+
+  aom_free(*y_line_buf);
+
+  aom_free(*cb_line_buf);
+
+  aom_free(*cr_line_buf);
+
+  aom_free(*y_col_buf);
+
+  aom_free(*cb_col_buf);
+
+  aom_free(*cr_col_buf);
+
+  aom_free(*luma_grain_block);
+
+  aom_free(*cb_grain_block);
+
+  aom_free(*cr_grain_block);
+}
+
+// get a number between 0 and 2^bits - 1
+static INLINE int get_random_number(int bits) {
+  uint16_t bit;
+  bit = ((random_register >> 0) ^ (random_register >> 1) ^
+         (random_register >> 3) ^ (random_register >> 12)) &
+        1;
+  random_register = (random_register >> 1) | (bit << 15);
+  return (random_register >> (16 - bits)) & ((1 << bits) - 1);
+}
+
+static void init_random_generator(int luma_line, uint16_t seed) {
+  // same for the picture
+
+  uint16_t msb = (seed >> 8) & 255;
+  uint16_t lsb = seed & 255;
+
+  random_register = (msb << 8) + lsb;
+
+  //  changes for each row
+  int luma_num = luma_line >> 5;
+
+  random_register ^= ((luma_num * 37 + 178) & 255) << 8;
+  random_register ^= ((luma_num * 173 + 105) & 255);
+}
+
+static void generate_luma_grain_block(
+    aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
+    int luma_block_size_y, int luma_block_size_x, int luma_grain_stride,
+    int left_pad, int top_pad, int right_pad, int bottom_pad) {
+  if (params->num_y_points == 0) return;
+
+  int bit_depth = params->bit_depth;
+  int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
+
+  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+  int rounding_offset = (1 << (params->ar_coeff_shift - 1));
+
+  for (int i = 0; i < luma_block_size_y; i++)
+    for (int j = 0; j < luma_block_size_x; j++)
+      luma_grain_block[i * luma_grain_stride + j] =
+          (gaussian_sequence[get_random_number(gauss_bits)] +
+           ((1 << gauss_sec_shift) >> 1)) >>
+          gauss_sec_shift;
+
+  for (int i = top_pad; i < luma_block_size_y - bottom_pad; i++)
+    for (int j = left_pad; j < luma_block_size_x - right_pad; j++) {
+      int wsum = 0;
+      for (int pos = 0; pos < num_pos_luma; pos++) {
+        wsum = wsum + params->ar_coeffs_y[pos] *
+                          luma_grain_block[(i + pred_pos_luma[pos][0]) *
+                                               luma_grain_stride +
+                                           j + pred_pos_luma[pos][1]];
+      }
+      luma_grain_block[i * luma_grain_stride + j] =
+          clamp(luma_grain_block[i * luma_grain_stride + j] +
+                    ((wsum + rounding_offset) >> params->ar_coeff_shift),
+                grain_min, grain_max);
+    }
+}
+
+static void generate_chroma_grain_blocks(
+    aom_film_grain_t *params,
+    //                                  int** pred_pos_luma,
+    int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block,
+    int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y,
+    int chroma_block_size_x, int chroma_grain_stride, int left_pad, int top_pad,
+    int right_pad, int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) {
+  int bit_depth = params->bit_depth;
+  int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
+
+  int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+  if (params->num_y_points > 0) ++num_pos_chroma;
+  int rounding_offset = (1 << (params->ar_coeff_shift - 1));
+  int chroma_grain_samples = chroma_block_size_y * chroma_block_size_x;
+
+  if (params->num_cb_points || params->chroma_scaling_from_luma) {
+    init_random_generator(7 << 5, params->random_seed);
+
+    for (int i = 0; i < chroma_block_size_y; i++)
+      for (int j = 0; j < chroma_block_size_x; j++)
+        cb_grain_block[i * chroma_grain_stride + j] =
+            (gaussian_sequence[get_random_number(gauss_bits)] +
+             ((1 << gauss_sec_shift) >> 1)) >>
+            gauss_sec_shift;
+  } else {
+    memset(cr_grain_block, 0, sizeof(*cr_grain_block) * chroma_grain_samples);
+  }
+
+  if (params->num_cr_points || params->chroma_scaling_from_luma) {
+    init_random_generator(11 << 5, params->random_seed);
+
+    for (int i = 0; i < chroma_block_size_y; i++)
+      for (int j = 0; j < chroma_block_size_x; j++)
+        cr_grain_block[i * chroma_grain_stride + j] =
+            (gaussian_sequence[get_random_number(gauss_bits)] +
+             ((1 << gauss_sec_shift) >> 1)) >>
+            gauss_sec_shift;
+  } else {
+    memset(cb_grain_block, 0, sizeof(*cb_grain_block) * chroma_grain_samples);
+  }
+
+  for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++)
+    for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) {
+      int wsum_cb = 0;
+      int wsum_cr = 0;
+      for (int pos = 0; pos < num_pos_chroma; pos++) {
+        if (pred_pos_chroma[pos][2] == 0) {
+          wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] *
+                                  cb_grain_block[(i + pred_pos_chroma[pos][0]) *
+                                                     chroma_grain_stride +
+                                                 j + pred_pos_chroma[pos][1]];
+          wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] *
+                                  cr_grain_block[(i + pred_pos_chroma[pos][0]) *
+                                                     chroma_grain_stride +
+                                                 j + pred_pos_chroma[pos][1]];
+        } else if (pred_pos_chroma[pos][2] == 1) {
+          int av_luma = 0;
+          int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad;
+          int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad;
+
+          for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1;
+               k++)
+            for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1;
+                 l++)
+              av_luma += luma_grain_block[k * luma_grain_stride + l];
+
+          av_luma =
+              (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >>
+              (chroma_subsamp_y + chroma_subsamp_x);
+
+          wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma;
+          wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma;
+        } else {
+          printf(
+              "Grain synthesis: prediction between two chroma components is "
+              "not supported!");
+          exit(1);
+        }
+      }
+      if (params->num_cb_points || params->chroma_scaling_from_luma)
+        cb_grain_block[i * chroma_grain_stride + j] =
+            clamp(cb_grain_block[i * chroma_grain_stride + j] +
+                      ((wsum_cb + rounding_offset) >> params->ar_coeff_shift),
+                  grain_min, grain_max);
+      if (params->num_cr_points || params->chroma_scaling_from_luma)
+        cr_grain_block[i * chroma_grain_stride + j] =
+            clamp(cr_grain_block[i * chroma_grain_stride + j] +
+                      ((wsum_cr + rounding_offset) >> params->ar_coeff_shift),
+                  grain_min, grain_max);
+    }
+}
+
+static void init_scaling_function(int scaling_points[][2], int num_points,
+                                  int scaling_lut[]) {
+  if (num_points == 0) return;
+
+  for (int i = 0; i < scaling_points[0][0]; i++)
+    scaling_lut[i] = scaling_points[0][1];
+
+  for (int point = 0; point < num_points - 1; point++) {
+    int delta_y = scaling_points[point + 1][1] - scaling_points[point][1];
+    int delta_x = scaling_points[point + 1][0] - scaling_points[point][0];
+
+    int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+
+    for (int x = 0; x < delta_x; x++) {
+      scaling_lut[scaling_points[point][0] + x] =
+          scaling_points[point][1] + (int)((x * delta + 32768) >> 16);
+    }
+  }
+
+  for (int i = scaling_points[num_points - 1][0]; i < 256; i++)
+    scaling_lut[i] = scaling_points[num_points - 1][1];
+}
+
+// function that extracts samples from a LUT (and interpolates intemediate
+// frames for 10- and 12-bit video)
+static int scale_LUT(int *scaling_lut, int index, int bit_depth) {
+  int x = index >> (bit_depth - 8);
+
+  if (!(bit_depth - 8) || x == 255)
+    return scaling_lut[x];
+  else
+    return scaling_lut[x] + (((scaling_lut[x + 1] - scaling_lut[x]) *
+                                  (index & ((1 << (bit_depth - 8)) - 1)) +
+                              (1 << (bit_depth - 9))) >>
+                             (bit_depth - 8));
+}
+
+static void add_noise_to_block(aom_film_grain_t *params, uint8_t *luma,
+                               uint8_t *cb, uint8_t *cr, int luma_stride,
+                               int chroma_stride, int *luma_grain,
+                               int *cb_grain, int *cr_grain,
+                               int luma_grain_stride, int chroma_grain_stride,
+                               int half_luma_height, int half_luma_width,
+                               int bit_depth, int chroma_subsamp_y,
+                               int chroma_subsamp_x, int mc_identity) {
+  int cb_mult = params->cb_mult - 128;            // fixed scale
+  int cb_luma_mult = params->cb_luma_mult - 128;  // fixed scale
+  int cb_offset = params->cb_offset - 256;
+
+  int cr_mult = params->cr_mult - 128;            // fixed scale
+  int cr_luma_mult = params->cr_luma_mult - 128;  // fixed scale
+  int cr_offset = params->cr_offset - 256;
+
+  int rounding_offset = (1 << (params->scaling_shift - 1));
+
+  int apply_y = params->num_y_points > 0 ? 1 : 0;
+  int apply_cb =
+      (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
+  int apply_cr =
+      (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
+
+  if (params->chroma_scaling_from_luma) {
+    cb_mult = 0;        // fixed scale
+    cb_luma_mult = 64;  // fixed scale
+    cb_offset = 0;
+
+    cr_mult = 0;        // fixed scale
+    cr_luma_mult = 64;  // fixed scale
+    cr_offset = 0;
+  }
+
+  int min_luma, max_luma, min_chroma, max_chroma;
+
+  if (params->clip_to_restricted_range) {
+    min_luma = min_luma_legal_range;
+    max_luma = max_luma_legal_range;
+
+    if (mc_identity) {
+      min_chroma = min_luma_legal_range;
+      max_chroma = max_luma_legal_range;
+    } else {
+      min_chroma = min_chroma_legal_range;
+      max_chroma = max_chroma_legal_range;
+    }
+  } else {
+    min_luma = min_chroma = 0;
+    max_luma = max_chroma = 255;
+  }
+
+  for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
+    for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
+      int average_luma = 0;
+      if (chroma_subsamp_x) {
+        average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
+                             (j << chroma_subsamp_x)] +
+                        luma[(i << chroma_subsamp_y) * luma_stride +
+                             (j << chroma_subsamp_x) + 1] +
+                        1) >>
+                       1;
+      } else {
+        average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
+      }
+
+      if (apply_cb) {
+        cb[i * chroma_stride + j] = clamp(
+            cb[i * chroma_stride + j] +
+                ((scale_LUT(scaling_lut_cb,
+                            clamp(((average_luma * cb_luma_mult +
+                                    cb_mult * cb[i * chroma_stride + j]) >>
+                                   6) +
+                                      cb_offset,
+                                  0, (256 << (bit_depth - 8)) - 1),
+                            8) *
+                      cb_grain[i * chroma_grain_stride + j] +
+                  rounding_offset) >>
+                 params->scaling_shift),
+            min_chroma, max_chroma);
+      }
+
+      if (apply_cr) {
+        cr[i * chroma_stride + j] = clamp(
+            cr[i * chroma_stride + j] +
+                ((scale_LUT(scaling_lut_cr,
+                            clamp(((average_luma * cr_luma_mult +
+                                    cr_mult * cr[i * chroma_stride + j]) >>
+                                   6) +
+                                      cr_offset,
+                                  0, (256 << (bit_depth - 8)) - 1),
+                            8) *
+                      cr_grain[i * chroma_grain_stride + j] +
+                  rounding_offset) >>
+                 params->scaling_shift),
+            min_chroma, max_chroma);
+      }
+    }
+  }
+
+  if (apply_y) {
+    for (int i = 0; i < (half_luma_height << 1); i++) {
+      for (int j = 0; j < (half_luma_width << 1); j++) {
+        luma[i * luma_stride + j] =
+            clamp(luma[i * luma_stride + j] +
+                      ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) *
+                            luma_grain[i * luma_grain_stride + j] +
+                        rounding_offset) >>
+                       params->scaling_shift),
+                  min_luma, max_luma);
+      }
+    }
+  }
+}
+
+static void add_noise_to_block_hbd(
+    aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr,
+    int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain,
+    int *cr_grain, int luma_grain_stride, int chroma_grain_stride,
+    int half_luma_height, int half_luma_width, int bit_depth,
+    int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) {
+  int cb_mult = params->cb_mult - 128;            // fixed scale
+  int cb_luma_mult = params->cb_luma_mult - 128;  // fixed scale
+  // offset value depends on the bit depth
+  int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth);
+
+  int cr_mult = params->cr_mult - 128;            // fixed scale
+  int cr_luma_mult = params->cr_luma_mult - 128;  // fixed scale
+  // offset value depends on the bit depth
+  int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth);
+
+  int rounding_offset = (1 << (params->scaling_shift - 1));
+
+  int apply_y = params->num_y_points > 0 ? 1 : 0;
+  int apply_cb =
+      (params->num_cb_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1
+                                                                          : 0;
+  int apply_cr =
+      (params->num_cr_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1
+                                                                          : 0;
+
+  if (params->chroma_scaling_from_luma) {
+    cb_mult = 0;        // fixed scale
+    cb_luma_mult = 64;  // fixed scale
+    cb_offset = 0;
+
+    cr_mult = 0;        // fixed scale
+    cr_luma_mult = 64;  // fixed scale
+    cr_offset = 0;
+  }
+
+  int min_luma, max_luma, min_chroma, max_chroma;
+
+  if (params->clip_to_restricted_range) {
+    min_luma = min_luma_legal_range << (bit_depth - 8);
+    max_luma = max_luma_legal_range << (bit_depth - 8);
+
+    if (mc_identity) {
+      min_chroma = min_luma_legal_range << (bit_depth - 8);
+      max_chroma = max_luma_legal_range << (bit_depth - 8);
+    } else {
+      min_chroma = min_chroma_legal_range << (bit_depth - 8);
+      max_chroma = max_chroma_legal_range << (bit_depth - 8);
+    }
+  } else {
+    min_luma = min_chroma = 0;
+    max_luma = max_chroma = (256 << (bit_depth - 8)) - 1;
+  }
+
+  for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
+    for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
+      int average_luma = 0;
+      if (chroma_subsamp_x) {
+        average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
+                             (j << chroma_subsamp_x)] +
+                        luma[(i << chroma_subsamp_y) * luma_stride +
+                             (j << chroma_subsamp_x) + 1] +
+                        1) >>
+                       1;
+      } else {
+        average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
+      }
+
+      if (apply_cb) {
+        cb[i * chroma_stride + j] = clamp(
+            cb[i * chroma_stride + j] +
+                ((scale_LUT(scaling_lut_cb,
+                            clamp(((average_luma * cb_luma_mult +
+                                    cb_mult * cb[i * chroma_stride + j]) >>
+                                   6) +
+                                      cb_offset,
+                                  0, (256 << (bit_depth - 8)) - 1),
+                            bit_depth) *
+                      cb_grain[i * chroma_grain_stride + j] +
+                  rounding_offset) >>
+                 params->scaling_shift),
+            min_chroma, max_chroma);
+      }
+      if (apply_cr) {
+        cr[i * chroma_stride + j] = clamp(
+            cr[i * chroma_stride + j] +
+                ((scale_LUT(scaling_lut_cr,
+                            clamp(((average_luma * cr_luma_mult +
+                                    cr_mult * cr[i * chroma_stride + j]) >>
+                                   6) +
+                                      cr_offset,
+                                  0, (256 << (bit_depth - 8)) - 1),
+                            bit_depth) *
+                      cr_grain[i * chroma_grain_stride + j] +
+                  rounding_offset) >>
+                 params->scaling_shift),
+            min_chroma, max_chroma);
+      }
+    }
+  }
+
+  if (apply_y) {
+    for (int i = 0; i < (half_luma_height << 1); i++) {
+      for (int j = 0; j < (half_luma_width << 1); j++) {
+        luma[i * luma_stride + j] =
+            clamp(luma[i * luma_stride + j] +
+                      ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j],
+                                  bit_depth) *
+                            luma_grain[i * luma_grain_stride + j] +
+                        rounding_offset) >>
+                       params->scaling_shift),
+                  min_luma, max_luma);
+      }
+    }
+  }
+}
+
+static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst,
+                      int dst_stride, int width, int height,
+                      int use_high_bit_depth) {
+  int hbd_coeff = use_high_bit_depth ? 2 : 1;
+  while (height) {
+    memcpy(dst, src, width * sizeof(uint8_t) * hbd_coeff);
+    src += src_stride;
+    dst += dst_stride;
+    --height;
+  }
+  return;
+}
+
+static void copy_area(int *src, int src_stride, int *dst, int dst_stride,
+                      int width, int height) {
+  while (height) {
+    memcpy(dst, src, width * sizeof(*src));
+    src += src_stride;
+    dst += dst_stride;
+    --height;
+  }
+  return;
+}
+
+static void extend_even(uint8_t *dst, int dst_stride, int width, int height,
+                        int use_high_bit_depth) {
+  if ((width & 1) == 0 && (height & 1) == 0) return;
+  if (use_high_bit_depth) {
+    uint16_t *dst16 = (uint16_t *)dst;
+    int dst16_stride = dst_stride / 2;
+    if (width & 1) {
+      for (int i = 0; i < height; ++i)
+        dst16[i * dst16_stride + width] = dst16[i * dst16_stride + width - 1];
+    }
+    width = (width + 1) & (~1);
+    if (height & 1) {
+      memcpy(&dst16[height * dst16_stride], &dst16[(height - 1) * dst16_stride],
+             sizeof(*dst16) * width);
+    }
+  } else {
+    if (width & 1) {
+      for (int i = 0; i < height; ++i)
+        dst[i * dst_stride + width] = dst[i * dst_stride + width - 1];
+    }
+    width = (width + 1) & (~1);
+    if (height & 1) {
+      memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride],
+             sizeof(*dst) * width);
+    }
+  }
+}
+
+static void ver_boundary_overlap(int *left_block, int left_stride,
+                                 int *right_block, int right_stride,
+                                 int *dst_block, int dst_stride, int width,
+                                 int height) {
+  if (width == 1) {
+    while (height) {
+      *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5,
+                         grain_min, grain_max);
+      left_block += left_stride;
+      right_block += right_stride;
+      dst_block += dst_stride;
+      --height;
+    }
+    return;
+  } else if (width == 2) {
+    while (height) {
+      dst_block[0] = clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5,
+                           grain_min, grain_max);
+      dst_block[1] = clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5,
+                           grain_min, grain_max);
+      left_block += left_stride;
+      right_block += right_stride;
+      dst_block += dst_stride;
+      --height;
+    }
+    return;
+  }
+}
+
+static void hor_boundary_overlap(int *top_block, int top_stride,
+                                 int *bottom_block, int bottom_stride,
+                                 int *dst_block, int dst_stride, int width,
+                                 int height) {
+  if (height == 1) {
+    while (width) {
+      *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5,
+                         grain_min, grain_max);
+      ++top_block;
+      ++bottom_block;
+      ++dst_block;
+      --width;
+    }
+    return;
+  } else if (height == 2) {
+    while (width) {
+      dst_block[0] = clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5,
+                           grain_min, grain_max);
+      dst_block[dst_stride] = clamp((17 * top_block[top_stride] +
+                                     27 * bottom_block[bottom_stride] + 16) >>
+                                        5,
+                                    grain_min, grain_max);
+      ++top_block;
+      ++bottom_block;
+      ++dst_block;
+      --width;
+    }
+    return;
+  }
+}
+
+void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src,
+                        aom_image_t *dst) {
+  uint8_t *luma, *cb, *cr;
+  int height, width, luma_stride, chroma_stride;
+  int use_high_bit_depth = 0;
+  int chroma_subsamp_x = 0;
+  int chroma_subsamp_y = 0;
+  int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 1 : 0;
+
+  switch (src->fmt) {
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_I420:
+      use_high_bit_depth = 0;
+      chroma_subsamp_x = 1;
+      chroma_subsamp_y = 1;
+      break;
+    case AOM_IMG_FMT_I42016:
+      use_high_bit_depth = 1;
+      chroma_subsamp_x = 1;
+      chroma_subsamp_y = 1;
+      break;
+      //    case AOM_IMG_FMT_444A:
+    case AOM_IMG_FMT_I444:
+      use_high_bit_depth = 0;
+      chroma_subsamp_x = 0;
+      chroma_subsamp_y = 0;
+      break;
+    case AOM_IMG_FMT_I44416:
+      use_high_bit_depth = 1;
+      chroma_subsamp_x = 0;
+      chroma_subsamp_y = 0;
+      break;
+    case AOM_IMG_FMT_I422:
+      use_high_bit_depth = 0;
+      chroma_subsamp_x = 1;
+      chroma_subsamp_y = 0;
+      break;
+    case AOM_IMG_FMT_I42216:
+      use_high_bit_depth = 1;
+      chroma_subsamp_x = 1;
+      chroma_subsamp_y = 0;
+      break;
+    default:  // unknown input format
+      printf("Film grain error: input format is not supported!");
+      exit(1);
+  }
+
+  dst->r_w = src->r_w;
+  dst->r_h = src->r_h;
+  dst->d_w = src->d_w;
+  dst->d_h = src->d_h;
+
+  dst->cp = src->cp;
+  dst->tc = src->tc;
+  dst->mc = src->mc;
+
+  dst->monochrome = src->monochrome;
+  dst->csp = src->csp;
+  dst->range = src->range;
+
+  dst->x_chroma_shift = src->x_chroma_shift;
+  dst->y_chroma_shift = src->y_chroma_shift;
+
+  dst->temporal_id = src->temporal_id;
+  dst->spatial_id = src->spatial_id;
+
+  width = src->d_w % 2 ? src->d_w + 1 : src->d_w;
+  height = src->d_h % 2 ? src->d_h + 1 : src->d_h;
+
+  copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
+            dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
+            src->d_h, use_high_bit_depth);
+  // Note that dst is already assumed to be aligned to even.
+  extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
+              src->d_h, use_high_bit_depth);
+
+  if (!src->monochrome) {
+    copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U],
+              dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U],
+              width >> chroma_subsamp_x, height >> chroma_subsamp_y,
+              use_high_bit_depth);
+
+    copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V],
+              dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V],
+              width >> chroma_subsamp_x, height >> chroma_subsamp_y,
+              use_high_bit_depth);
+  }
+
+  luma = dst->planes[AOM_PLANE_Y];
+  cb = dst->planes[AOM_PLANE_U];
+  cr = dst->planes[AOM_PLANE_V];
+
+  // luma and chroma strides in samples
+  luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth;
+  chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth;
+
+  params->bit_depth = dst->bit_depth;
+
+  av1_add_film_grain_run(params, luma, cb, cr, height, width, luma_stride,
+                         chroma_stride, use_high_bit_depth, chroma_subsamp_y,
+                         chroma_subsamp_x, mc_identity);
+  return;
+}
+
+void av1_add_film_grain_run(aom_film_grain_t *params, uint8_t *luma,
+                            uint8_t *cb, uint8_t *cr, int height, int width,
+                            int luma_stride, int chroma_stride,
+                            int use_high_bit_depth, int chroma_subsamp_y,
+                            int chroma_subsamp_x, int mc_identity) {
+  int **pred_pos_luma;
+  int **pred_pos_chroma;
+  int *luma_grain_block;
+  int *cb_grain_block;
+  int *cr_grain_block;
+
+  int *y_line_buf;
+  int *cb_line_buf;
+  int *cr_line_buf;
+
+  int *y_col_buf;
+  int *cb_col_buf;
+  int *cr_col_buf;
+
+  random_register = params->random_seed;
+
+  int left_pad = 3;
+  int right_pad = 3;  // padding to offset for AR coefficients
+  int top_pad = 3;
+  int bottom_pad = 0;
+
+  int ar_padding = 3;  // maximum lag used for stabilization of AR coefficients
+
+  luma_subblock_size_y = 32;
+  luma_subblock_size_x = 32;
+
+  chroma_subblock_size_y = luma_subblock_size_y >> chroma_subsamp_y;
+  chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x;
+
+  // Initial padding is only needed for generation of
+  // film grain templates (to stabilize the AR process)
+  // Only a 64x64 luma and 32x32 chroma part of a template
+  // is used later for adding grain, padding can be discarded
+
+  int luma_block_size_y =
+      top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad;
+  int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 +
+                          2 * ar_padding + right_pad;
+
+  int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
+                            chroma_subblock_size_y * 2 + bottom_pad;
+  int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
+                            chroma_subblock_size_x * 2 +
+                            (2 >> chroma_subsamp_x) * ar_padding + right_pad;
+
+  int luma_grain_stride = luma_block_size_x;
+  int chroma_grain_stride = chroma_block_size_x;
+
+  int overlap = params->overlap_flag;
+  int bit_depth = params->bit_depth;
+
+  grain_center = 128 << (bit_depth - 8);
+  grain_min = 0 - grain_center;
+  grain_max = (256 << (bit_depth - 8)) - 1 - grain_center;
+
+  init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma,
+              &pred_pos_chroma, &luma_grain_block, &cb_grain_block,
+              &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf,
+              &y_col_buf, &cb_col_buf, &cr_col_buf,
+              luma_block_size_y * luma_block_size_x,
+              chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y,
+              chroma_subsamp_x);
+
+  generate_luma_grain_block(params, pred_pos_luma, luma_grain_block,
+                            luma_block_size_y, luma_block_size_x,
+                            luma_grain_stride, left_pad, top_pad, right_pad,
+                            bottom_pad);
+
+  generate_chroma_grain_blocks(
+      params,
+      //                               pred_pos_luma,
+      pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block,
+      luma_grain_stride, chroma_block_size_y, chroma_block_size_x,
+      chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad,
+      chroma_subsamp_y, chroma_subsamp_x);
+
+  init_scaling_function(params->scaling_points_y, params->num_y_points,
+                        scaling_lut_y);
+
+  if (params->chroma_scaling_from_luma) {
+    memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
+    memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
+  } else {
+    init_scaling_function(params->scaling_points_cb, params->num_cb_points,
+                          scaling_lut_cb);
+    init_scaling_function(params->scaling_points_cr, params->num_cr_points,
+                          scaling_lut_cr);
+  }
+  for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) {
+    init_random_generator(y * 2, params->random_seed);
+
+    for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) {
+      int offset_y = get_random_number(8);
+      int offset_x = (offset_y >> 4) & 15;
+      offset_y &= 15;
+
+      int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1);
+      int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1);
+
+      int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
+                            offset_y * (2 >> chroma_subsamp_y);
+      int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
+                            offset_x * (2 >> chroma_subsamp_x);
+
+      if (overlap && x) {
+        ver_boundary_overlap(
+            y_col_buf, 2,
+            luma_grain_block + luma_offset_y * luma_grain_stride +
+                luma_offset_x,
+            luma_grain_stride, y_col_buf, 2, 2,
+            AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
+
+        ver_boundary_overlap(
+            cb_col_buf, 2 >> chroma_subsamp_x,
+            cb_grain_block + chroma_offset_y * chroma_grain_stride +
+                chroma_offset_x,
+            chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+            2 >> chroma_subsamp_x,
+            AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+                   (height - (y << 1)) >> chroma_subsamp_y));
+
+        ver_boundary_overlap(
+            cr_col_buf, 2 >> chroma_subsamp_x,
+            cr_grain_block + chroma_offset_y * chroma_grain_stride +
+                chroma_offset_x,
+            chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+            2 >> chroma_subsamp_x,
+            AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+                   (height - (y << 1)) >> chroma_subsamp_y));
+
+        int i = y ? 1 : 0;
+
+        if (use_high_bit_depth) {
+          add_noise_to_block_hbd(
+              params,
+              (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1),
+              (uint16_t *)cb +
+                  ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << (1 - chroma_subsamp_x)),
+              (uint16_t *)cr +
+                  ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << (1 - chroma_subsamp_x)),
+              luma_stride, chroma_stride, y_col_buf + i * 4,
+              cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+              cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+              2, (2 - chroma_subsamp_x),
+              AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
+              bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        } else {
+          add_noise_to_block(
+              params, luma + ((y + i) << 1) * luma_stride + (x << 1),
+              cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << (1 - chroma_subsamp_x)),
+              cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << (1 - chroma_subsamp_x)),
+              luma_stride, chroma_stride, y_col_buf + i * 4,
+              cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+              cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+              2, (2 - chroma_subsamp_x),
+              AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
+              bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        }
+      }
+
+      if (overlap && y) {
+        if (x) {
+          hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2,
+                               y_line_buf + (x << 1), luma_stride, 2, 2);
+
+          hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x),
+                               chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+                               cb_line_buf + x * (2 >> chroma_subsamp_x),
+                               chroma_stride, 2 >> chroma_subsamp_x,
+                               2 >> chroma_subsamp_y);
+
+          hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x),
+                               chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+                               cr_line_buf + x * (2 >> chroma_subsamp_x),
+                               chroma_stride, 2 >> chroma_subsamp_x,
+                               2 >> chroma_subsamp_y);
+        }
+
+        hor_boundary_overlap(
+            y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+            luma_grain_block + luma_offset_y * luma_grain_stride +
+                luma_offset_x + (x ? 2 : 0),
+            luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+            AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1),
+                   width - ((x ? x + 1 : 0) << 1)),
+            2);
+
+        hor_boundary_overlap(
+            cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_stride,
+            cb_grain_block + chroma_offset_y * chroma_grain_stride +
+                chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_grain_stride,
+            cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_stride,
+            AOMMIN(chroma_subblock_size_x -
+                       ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+                   (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
+            2 >> chroma_subsamp_y);
+
+        hor_boundary_overlap(
+            cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_stride,
+            cr_grain_block + chroma_offset_y * chroma_grain_stride +
+                chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_grain_stride,
+            cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_stride,
+            AOMMIN(chroma_subblock_size_x -
+                       ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+                   (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
+            2 >> chroma_subsamp_y);
+
+        if (use_high_bit_depth) {
+          add_noise_to_block_hbd(
+              params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1),
+              (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              luma_stride, chroma_stride, y_line_buf + (x << 1),
+              cb_line_buf + (x << (1 - chroma_subsamp_x)),
+              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+              chroma_stride, 1,
+              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        } else {
+          add_noise_to_block(
+              params, luma + (y << 1) * luma_stride + (x << 1),
+              cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              luma_stride, chroma_stride, y_line_buf + (x << 1),
+              cb_line_buf + (x << (1 - chroma_subsamp_x)),
+              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+              chroma_stride, 1,
+              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        }
+      }
+
+      int i = overlap && y ? 1 : 0;
+      int j = overlap && x ? 1 : 0;
+
+      if (use_high_bit_depth) {
+        add_noise_to_block_hbd(
+            params,
+            (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+            (uint16_t *)cb +
+                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            (uint16_t *)cr +
+                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            luma_stride, chroma_stride,
+            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+                luma_offset_x + (j << 1),
+            cb_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            cr_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            luma_grain_stride, chroma_grain_stride,
+            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+      } else {
+        add_noise_to_block(
+            params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+            cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            luma_stride, chroma_stride,
+            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+                luma_offset_x + (j << 1),
+            cb_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            cr_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            luma_grain_stride, chroma_grain_stride,
+            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+      }
+
+      if (overlap) {
+        if (x) {
+          // Copy overlapped column bufer to line buffer
+          copy_area(y_col_buf + (luma_subblock_size_y << 1), 2,
+                    y_line_buf + (x << 1), luma_stride, 2, 2);
+
+          copy_area(
+              cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
+              2 >> chroma_subsamp_x,
+              cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
+              2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
+
+          copy_area(
+              cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
+              2 >> chroma_subsamp_x,
+              cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
+              2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
+        }
+
+        // Copy grain to the line buffer for overlap with a bottom block
+        copy_area(
+            luma_grain_block +
+                (luma_offset_y + luma_subblock_size_y) * luma_grain_stride +
+                luma_offset_x + ((x ? 2 : 0)),
+            luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+            AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2);
+
+        copy_area(cb_grain_block +
+                      (chroma_offset_y + chroma_subblock_size_y) *
+                          chroma_grain_stride +
+                      chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
+                  chroma_grain_stride,
+                  cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+                  chroma_stride,
+                  AOMMIN(chroma_subblock_size_x,
+                         ((width - (x << 1)) >> chroma_subsamp_x)) -
+                      (x ? 2 >> chroma_subsamp_x : 0),
+                  2 >> chroma_subsamp_y);
+
+        copy_area(cr_grain_block +
+                      (chroma_offset_y + chroma_subblock_size_y) *
+                          chroma_grain_stride +
+                      chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
+                  chroma_grain_stride,
+                  cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+                  chroma_stride,
+                  AOMMIN(chroma_subblock_size_x,
+                         ((width - (x << 1)) >> chroma_subsamp_x)) -
+                      (x ? 2 >> chroma_subsamp_x : 0),
+                  2 >> chroma_subsamp_y);
+
+        // Copy grain to the column buffer for overlap with the next block to
+        // the right
+
+        copy_area(luma_grain_block + luma_offset_y * luma_grain_stride +
+                      luma_offset_x + luma_subblock_size_x,
+                  luma_grain_stride, y_col_buf, 2, 2,
+                  AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
+
+        copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride +
+                      chroma_offset_x + chroma_subblock_size_x,
+                  chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+                  2 >> chroma_subsamp_x,
+                  AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+                         (height - (y << 1)) >> chroma_subsamp_y));
+
+        copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride +
+                      chroma_offset_x + chroma_subblock_size_x,
+                  chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+                  2 >> chroma_subsamp_x,
+                  AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+                         (height - (y << 1)) >> chroma_subsamp_y));
+      }
+    }
+  }
+
+  dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block,
+                 &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf,
+                 &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf);
+}
diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h
new file mode 100644
index 000000000..016cb12d7
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_synthesis.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain parameters and film grain synthesis
+ *
+ */
+#ifndef AOM_AOM_GRAIN_SYNTHESIS_H_
+#define AOM_AOM_GRAIN_SYNTHESIS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom/aom_image.h"
+
+/*!\brief Structure containing film grain synthesis parameters for a frame
+ *
+ * This structure contains input parameters for film grain synthesis
+ */
+typedef struct {
+  int apply_grain;
+
+  int update_parameters;
+
+  // 8 bit values
+  int scaling_points_y[14][2];
+  int num_y_points;  // value: 0..14
+
+  // 8 bit values
+  int scaling_points_cb[10][2];
+  int num_cb_points;  // value: 0..10
+
+  // 8 bit values
+  int scaling_points_cr[10][2];
+  int num_cr_points;  // value: 0..10
+
+  int scaling_shift;  // values : 8..11
+
+  int ar_coeff_lag;  // values:  0..3
+
+  // 8 bit values
+  int ar_coeffs_y[24];
+  int ar_coeffs_cb[25];
+  int ar_coeffs_cr[25];
+
+  // Shift value: AR coeffs range
+  // 6: [-2, 2)
+  // 7: [-1, 1)
+  // 8: [-0.5, 0.5)
+  // 9: [-0.25, 0.25)
+  int ar_coeff_shift;  // values : 6..9
+
+  int cb_mult;       // 8 bits
+  int cb_luma_mult;  // 8 bits
+  int cb_offset;     // 9 bits
+
+  int cr_mult;       // 8 bits
+  int cr_luma_mult;  // 8 bits
+  int cr_offset;     // 9 bits
+
+  int overlap_flag;
+
+  int clip_to_restricted_range;
+
+  int bit_depth;  // video bit depth
+
+  int chroma_scaling_from_luma;
+
+  int grain_scale_shift;
+
+  uint16_t random_seed;
+} aom_film_grain_t;
+
+/*!\brief Add film grain
+ *
+ * Add film grain to an image
+ *
+ * \param[in]    grain_params     Grain parameters
+ * \param[in]    luma             luma plane
+ * \param[in]    cb               cb plane
+ * \param[in]    cr               cr plane
+ * \param[in]    height           luma plane height
+ * \param[in]    width            luma plane width
+ * \param[in]    luma_stride      luma plane stride
+ * \param[in]    chroma_stride    chroma plane stride
+ */
+void av1_add_film_grain_run(aom_film_grain_t *grain_params, uint8_t *luma,
+                            uint8_t *cb, uint8_t *cr, int height, int width,
+                            int luma_stride, int chroma_stride,
+                            int use_high_bit_depth, int chroma_subsamp_y,
+                            int chroma_subsamp_x, int mc_identity);
+
+/*!\brief Add film grain
+ *
+ * Add film grain to an image
+ *
+ * \param[in]    grain_params     Grain parameters
+ * \param[in]    src              Source image
+ * \param[in]    dst              Resulting image with grain
+ */
+void av1_add_film_grain(aom_film_grain_t *grain_params, aom_image_t *src,
+                        aom_image_t *dst);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_GRAIN_SYNTHESIS_H_
diff --git a/third_party/aom/aom_dsp/grain_table.c b/third_party/aom/aom_dsp/grain_table.c
new file mode 100644
index 000000000..0d6a73f55
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_table.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief This file has the implementation details of the grain table.
+ *
+ * The file format is an ascii representation for readability and
+ * editability. Array parameters are separated from the non-array
+ * parameters and prefixed with a few characters to make for easy
+ * localization with a parameter set. Each entry is prefixed with "E"
+ * and the other parameters are only specified if "update-parms" is
+ * non-zero.
+ *
+ * filmgrn1
+ * E <start-time> <end-time> <apply-grain> <random-seed> <update-parms>
+ *  p <ar_coeff_lag> <ar_coeff_shift> <grain_scale_shift> ...
+ *  sY <num_y_points> <point_0_x> <point_0_y> ...
+ *  sCb <num_cb_points> <point_0_x> <point_0_y> ...
+ *  sCr <num_cr_points> <point_0_x> <point_0_y> ...
+ *  cY <ar_coeff_y_0> ....
+ *  cCb <ar_coeff_cb_0> ....
+ *  cCr <ar_coeff_cr_0> ....
+ * E <start-time> ...
+ */
+#include <string.h>
+#include <stdio.h>
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/grain_table.h"
+#include "aom_mem/aom_mem.h"
+
+static const char kFileMagic[8] = "filmgrn1";
+
+static void grain_table_entry_read(FILE *file,
+                                   struct aom_internal_error_info *error_info,
+                                   aom_film_grain_table_entry_t *entry) {
+  aom_film_grain_t *pars = &entry->params;
+  int num_read =
+      fscanf(file, "E %" PRId64 " %" PRId64 " %d %hd %d\n", &entry->start_time,
+             &entry->end_time, &pars->apply_grain, &pars->random_seed,
+             &pars->update_parameters);
+  if (num_read == 0 && feof(file)) return;
+  if (num_read != 5) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read entry header. Read %d != 5", num_read);
+    return;
+  }
+  if (pars->update_parameters) {
+    num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n",
+                      &pars->ar_coeff_lag, &pars->ar_coeff_shift,
+                      &pars->grain_scale_shift, &pars->scaling_shift,
+                      &pars->chroma_scaling_from_luma, &pars->overlap_flag,
+                      &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset,
+                      &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset);
+    if (num_read != 12) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read entry params. Read %d != 12",
+                         num_read);
+      return;
+    }
+    if (!fscanf(file, "\tsY %d ", &pars->num_y_points)) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read num y points");
+      return;
+    }
+    for (int i = 0; i < pars->num_y_points; ++i) {
+      if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0],
+                      &pars->scaling_points_y[i][1])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read y scaling points");
+        return;
+      }
+    }
+    if (!fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read num cb points");
+      return;
+    }
+    for (int i = 0; i < pars->num_cb_points; ++i) {
+      if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0],
+                      &pars->scaling_points_cb[i][1])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read cb scaling points");
+        return;
+      }
+    }
+    if (!fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read num cr points");
+      return;
+    }
+    for (int i = 0; i < pars->num_cr_points; ++i) {
+      if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0],
+                      &pars->scaling_points_cr[i][1])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read cr scaling points");
+        return;
+      }
+    }
+
+    fscanf(file, "\n\tcY");
+    const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+    for (int i = 0; i < n; ++i) {
+      if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read Y coeffs");
+        return;
+      }
+    }
+    fscanf(file, "\n\tcCb");
+    for (int i = 0; i <= n; ++i) {
+      if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read Cb coeffs");
+        return;
+      }
+    }
+    fscanf(file, "\n\tcCr");
+    for (int i = 0; i <= n; ++i) {
+      if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read Cr coeffs");
+        return;
+      }
+    }
+    fscanf(file, "\n");
+  }
+}
+
+void grain_table_entry_write(FILE *file, aom_film_grain_table_entry_t *entry) {
+  const aom_film_grain_t *pars = &entry->params;
+  fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time,
+          entry->end_time, pars->apply_grain, pars->random_seed,
+          pars->update_parameters);
+  if (pars->update_parameters) {
+    fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n",
+            pars->ar_coeff_lag, pars->ar_coeff_shift, pars->grain_scale_shift,
+            pars->scaling_shift, pars->chroma_scaling_from_luma,
+            pars->overlap_flag, pars->cb_mult, pars->cb_luma_mult,
+            pars->cb_offset, pars->cr_mult, pars->cr_luma_mult,
+            pars->cr_offset);
+    fprintf(file, "\tsY %d ", pars->num_y_points);
+    for (int i = 0; i < pars->num_y_points; ++i) {
+      fprintf(file, " %d %d", pars->scaling_points_y[i][0],
+              pars->scaling_points_y[i][1]);
+    }
+    fprintf(file, "\n\tsCb %d", pars->num_cb_points);
+    for (int i = 0; i < pars->num_cb_points; ++i) {
+      fprintf(file, " %d %d", pars->scaling_points_cb[i][0],
+              pars->scaling_points_cb[i][1]);
+    }
+    fprintf(file, "\n\tsCr %d", pars->num_cr_points);
+    for (int i = 0; i < pars->num_cr_points; ++i) {
+      fprintf(file, " %d %d", pars->scaling_points_cr[i][0],
+              pars->scaling_points_cr[i][1]);
+    }
+    fprintf(file, "\n\tcY");
+    const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+    for (int i = 0; i < n; ++i) {
+      fprintf(file, " %d", pars->ar_coeffs_y[i]);
+    }
+    fprintf(file, "\n\tcCb");
+    for (int i = 0; i <= n; ++i) {
+      fprintf(file, " %d", pars->ar_coeffs_cb[i]);
+    }
+    fprintf(file, "\n\tcCr");
+    for (int i = 0; i <= n; ++i) {
+      fprintf(file, " %d", pars->ar_coeffs_cr[i]);
+    }
+    fprintf(file, "\n");
+  }
+}
+
+void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp,
+                                 int64_t end_time,
+                                 const aom_film_grain_t *grain) {
+  if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) {
+    aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail));
+    memset(new_tail, 0, sizeof(*new_tail));
+    if (t->tail) t->tail->next = new_tail;
+    if (!t->head) t->head = new_tail;
+    t->tail = new_tail;
+
+    new_tail->start_time = time_stamp;
+    new_tail->end_time = end_time;
+    new_tail->params = *grain;
+  } else {
+    t->tail->end_time = AOMMAX(t->tail->end_time, end_time);
+    t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp);
+  }
+}
+
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+                                int64_t end_time, int erase,
+                                aom_film_grain_t *grain) {
+  aom_film_grain_table_entry_t *entry = t->head;
+  aom_film_grain_table_entry_t *prev_entry = 0;
+  int16_t random_seed = grain ? grain->random_seed : 0;
+  if (grain) memset(grain, 0, sizeof(*grain));
+
+  while (entry) {
+    aom_film_grain_table_entry_t *next = entry->next;
+    if (time_stamp >= entry->start_time && time_stamp < entry->end_time) {
+      if (grain) {
+        *grain = entry->params;
+        if (time_stamp != 0) grain->random_seed = random_seed;
+      }
+      if (!erase) return 1;
+
+      const int64_t entry_end_time = entry->end_time;
+      if (time_stamp <= entry->start_time && end_time >= entry->end_time) {
+        if (t->tail == entry) t->tail = prev_entry;
+        if (prev_entry) {
+          prev_entry->next = entry->next;
+        } else {
+          t->head = entry->next;
+        }
+        aom_free(entry);
+      } else if (time_stamp <= entry->start_time &&
+                 end_time < entry->end_time) {
+        entry->start_time = end_time;
+      } else if (time_stamp > entry->start_time &&
+                 end_time >= entry->end_time) {
+        entry->end_time = time_stamp;
+      } else {
+        aom_film_grain_table_entry_t *new_entry =
+            aom_malloc(sizeof(*new_entry));
+        new_entry->next = entry->next;
+        new_entry->start_time = end_time;
+        new_entry->end_time = entry->end_time;
+        new_entry->params = entry->params;
+        entry->next = new_entry;
+        entry->end_time = time_stamp;
+        if (t->tail == entry) t->tail = new_entry;
+      }
+      // If segments aren't aligned, delete from the beggining of subsequent
+      // segments
+      if (end_time > entry_end_time) {
+        aom_film_grain_table_lookup(t, entry->end_time, end_time, 1, 0);
+      }
+      return 1;
+    }
+    prev_entry = entry;
+    entry = next;
+  }
+  return 0;
+}
+
+aom_codec_err_t aom_film_grain_table_read(
+    aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info) {
+  FILE *file = fopen(filename, "rb");
+  if (!file) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s",
+                       filename);
+    return error_info->error_code;
+  }
+  error_info->error_code = AOM_CODEC_OK;
+
+  // Read in one extra character as there should be white space after
+  // the header.
+  char magic[9];
+  if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read (or invalid) file magic");
+    fclose(file);
+    return error_info->error_code;
+  }
+
+  aom_film_grain_table_entry_t *prev_entry = 0;
+  while (!feof(file)) {
+    aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry));
+    memset(entry, 0, sizeof(*entry));
+    grain_table_entry_read(file, error_info, entry);
+    entry->next = 0;
+
+    if (prev_entry) prev_entry->next = entry;
+    if (!t->head) t->head = entry;
+    t->tail = entry;
+    prev_entry = entry;
+
+    if (error_info->error_code != AOM_CODEC_OK) break;
+  }
+
+  fclose(file);
+  return error_info->error_code;
+}
+
+aom_codec_err_t aom_film_grain_table_write(
+    const aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info) {
+  error_info->error_code = AOM_CODEC_OK;
+
+  FILE *file = fopen(filename, "wb");
+  if (!file) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s",
+                       filename);
+    return error_info->error_code;
+  }
+
+  if (!fwrite(kFileMagic, 8, 1, file)) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to write file magic");
+    fclose(file);
+    return error_info->error_code;
+  }
+
+  fprintf(file, "\n");
+  aom_film_grain_table_entry_t *entry = t->head;
+  while (entry) {
+    grain_table_entry_write(file, entry);
+    entry = entry->next;
+  }
+  fclose(file);
+  return error_info->error_code;
+}
+
+void aom_film_grain_table_free(aom_film_grain_table_t *t) {
+  aom_film_grain_table_entry_t *entry = t->head;
+  while (entry) {
+    aom_film_grain_table_entry_t *next = entry->next;
+    aom_free(entry);
+    entry = next;
+  }
+  memset(t, 0, sizeof(*t));
+}
diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h
new file mode 100644
index 000000000..5c20413b2
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_table.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief A table mapping from time to corresponding film grain parameters.
+ *
+ * In order to apply grain synthesis in the decoder, the film grain parameters
+ * need to be signalled in the encoder. The film grain parameters are time
+ * varying, and for two-pass encoding (and denoiser implementation flexibility)
+ * it is common to denoise the video and do parameter estimation before encoding
+ * the denoised video.
+ *
+ * The film grain table is used to provide this flexibility and is used as a
+ * parameter that is passed to the encoder.
+ *
+ * Further, if regraining is to be done in say a single pass mode, or in two
+ * pass within the encoder (before frames are added to the lookahead buffer),
+ * this data structure can be used to keep track of on-the-fly estimated grain
+ * parameters, that are then extracted from the table before the encoded frame
+ * is written.
+ */
+#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_
+#define AOM_AOM_DSP_GRAIN_TABLE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/grain_synthesis.h"
+#include "aom/internal/aom_codec_internal.h"
+
+typedef struct aom_film_grain_table_entry_t {
+  aom_film_grain_t params;
+  int64_t start_time;
+  int64_t end_time;
+  struct aom_film_grain_table_entry_t *next;
+} aom_film_grain_table_entry_t;
+
+typedef struct {
+  aom_film_grain_table_entry_t *head;
+  aom_film_grain_table_entry_t *tail;
+} aom_film_grain_table_t;
+
+/*!\brief Add a mapping from [time_stamp, end_time) to the given grain
+ * parameters
+ *
+ * \param[in/out] table      The grain table
+ * \param[in]     time_stamp The start time stamp
+ * \param[in]     end_stamp  The end time_stamp
+ * \param[in]     grain      The grain parameters
+ */
+void aom_film_grain_table_append(aom_film_grain_table_t *table,
+                                 int64_t time_stamp, int64_t end_time,
+                                 const aom_film_grain_t *grain);
+
+/*!\brief Look-up (and optionally erase) the grain parameters for the given time
+ *
+ * \param[in]  table      The grain table
+ * \param[in]  time_stamp The start time stamp
+ * \param[in]  end_stamp  The end time_stamp
+ * \param[in]  erase      Whether the time segment can be deleted
+ * \param[out] grain      The output grain parameters
+ */
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+                                int64_t end_time, int erase,
+                                aom_film_grain_t *grain);
+
+/*!\brief Reads the grain table from a file.
+ *
+ * \param[out]  table       The grain table
+ * \param[in]   filename    The file to read from
+ * \param[in]   error_info  Error info for tracking errors
+ */
+aom_codec_err_t aom_film_grain_table_read(
+    aom_film_grain_table_t *table, const char *filename,
+    struct aom_internal_error_info *error_info);
+
+/*!\brief Writes the grain table from a file.
+ *
+ * \param[out]  table       The grain table
+ * \param[in]   filename    The file to read from
+ * \param[in]   error_info  Error info for tracking errors
+ */
+aom_codec_err_t aom_film_grain_table_write(
+    const aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info);
+
+void aom_film_grain_table_free(aom_film_grain_table_t *t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c
index 6d2ac37d9..c6aa6b207 100644
--- a/third_party/aom/aom_dsp/intrapred.c
+++ b/third_party/aom/aom_dsp/intrapred.c
@@ -12,152 +12,14 @@
 #include <assert.h>
 #include <math.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/intrapred_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/bitops.h"
 
-#define DST(x, y) dst[(x) + (y)*stride]
-#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
-#define AVG2(a, b) (((a) + (b) + 1) >> 1)
-
-static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                   int bh, const uint8_t *above,
-                                   const uint8_t *left) {
-  int r, c;
-  (void)above;
-
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
-                            left[(c >> 1) + r + 2])
-                     : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                  int bh, const uint8_t *above,
-                                  const uint8_t *left) {
-  int r, c;
-  (void)left;
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
-                            above[(r >> 1) + c + 2])
-                     : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                  int bh, const uint8_t *above,
-                                  const uint8_t *left) {
-  int r, c;
-  (void)left;
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = AVG3(above[r + c], above[r + c + 1],
-                    above[r + c + 1 + (r + c + 2 < bw + bh)]);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                  int bh, const uint8_t *above,
-                                  const uint8_t *left) {
-  int r, c;
-
-  // first row
-  for (c = 0; c < bw; c++) dst[c] = AVG2(above[c - 1], above[c]);
-  dst += stride;
-
-  // second row
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
-  dst += stride;
-
-  // the rest of first col
-  dst[0] = AVG3(above[-1], left[0], left[1]);
-  for (r = 3; r < bh; ++r)
-    dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
-
-  // the rest of the block
-  for (r = 2; r < bh; ++r) {
-    for (c = 1; c < bw; c++) dst[c] = dst[-2 * stride + c - 1];
-    dst += stride;
-  }
-}
-
-static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                  int bh, const uint8_t *above,
-                                  const uint8_t *left) {
-  int i;
-#if CONFIG_TX64X64
-#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
-  // silence a spurious -Warray-bounds warning, possibly related to:
-  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
-  uint8_t border[133];
-#else
-  uint8_t border[64 + 64 - 1];  // outer border from bottom-left to top-right
-#endif
-#else
-#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
-  // silence a spurious -Warray-bounds warning, possibly related to:
-  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
-  uint8_t border[69];
-#else
-  uint8_t border[32 + 32 - 1];  // outer border from bottom-left to top-right
-#endif
-#endif  // CONFIG_TX64X64
-
-  // dst(bh, bh - 2)[0], i.e., border starting at bottom-left
-  for (i = 0; i < bh - 2; ++i) {
-    border[i] = AVG3(left[bh - 3 - i], left[bh - 2 - i], left[bh - 1 - i]);
-  }
-  border[bh - 2] = AVG3(above[-1], left[0], left[1]);
-  border[bh - 1] = AVG3(left[0], above[-1], above[0]);
-  border[bh - 0] = AVG3(above[-1], above[0], above[1]);
-  // dst[0][2, size), i.e., remaining top border ascending
-  for (i = 0; i < bw - 2; ++i) {
-    border[bh + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]);
-  }
-
-  for (i = 0; i < bh; ++i) {
-    memcpy(dst + i * stride, border + bh - 1 - i, bw);
-  }
-}
-
-static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                  int bh, const uint8_t *above,
-                                  const uint8_t *left) {
-  int r, c;
-  dst[0] = AVG2(above[-1], left[0]);
-  for (r = 1; r < bh; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
-  dst++;
-
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  dst[stride] = AVG3(above[-1], left[0], left[1]);
-  for (r = 2; r < bh; r++)
-    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
-  dst++;
-
-  for (c = 0; c < bw - 2; c++)
-    dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
-  dst += stride;
-
-  for (r = 1; r < bh; ++r) {
-    for (c = 0; c < bw - 2; c++) dst[c] = dst[-stride + c - 2];
-    dst += stride;
-  }
-}
-
 static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                const uint8_t *above, const uint8_t *left) {
   int r;
@@ -244,13 +106,12 @@ static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
       for (i = 0; i < 4; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel(divide_round(this_pred, log2_scale));
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
 }
 
-#if CONFIG_SMOOTH_HV
 static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint8_t *above,
                                       const uint8_t *left) {
@@ -274,7 +135,7 @@ static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
       for (i = 0; i < 2; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel(divide_round(this_pred, log2_scale));
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
@@ -303,12 +164,11 @@ static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
       for (i = 0; i < 2; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel(divide_round(this_pred, log2_scale));
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
 }
-#endif  // CONFIG_SMOOTH_HV
 
 static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                     int bh, const uint8_t *above,
@@ -373,267 +233,133 @@ static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   }
 }
 
-void aom_d45e_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  (void)stride;
-  (void)left;
-
-  DST(0, 0) = AVG3(A, B, C);
-  DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
-  DST(1, 1) = AVG3(C, D, D);
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+                                              int multiplier, int shift2) {
+  const int interm = num >> shift1;
+  return interm * multiplier >> shift2;
 }
 
-void aom_d117_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  DST(0, 0) = AVG2(X, A);
-  DST(1, 0) = AVG2(A, B);
-  DST(0, 1) = AVG3(I, X, A);
-  DST(1, 1) = AVG3(X, A, B);
-}
+  // The constants (multiplier and shifts) for a given block size are obtained
+  // as follows:
+  // - Let sum_w_h =  block width + block height.
+  // - Shift 'sum_w_h' right until we reach an odd number. Let the number of
+  // shifts for that block size be called 'shift1' (see the parameter in
+  // dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2
+  // possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect
+  // block].
+  // - Find multipliers for (i) dividing by 3, and (ii) dividing by 5,
+  // using the "Algorithm 1" in:
+  // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
+  // by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
+  // shift will be 16, regardless of the block size.
 
-void aom_d135_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int J = left[1];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  (void)stride;
-  DST(0, 1) = AVG3(X, I, J);
-  DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
-  DST(1, 0) = AVG3(B, A, X);
-}
+  // Note: For low bitdepth, assembly code may be optimized by using smaller
+  // constants for smaller block sizes, where the range of the 'sum' is
+  // restricted to fewer bits.
 
-void aom_d153_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int J = left[1];
-  const int X = above[-1];
-  const int A = above[0];
-
-  DST(0, 0) = AVG2(I, X);
-  DST(0, 1) = AVG2(J, I);
-  DST(1, 0) = AVG3(I, X, A);
-  DST(1, 1) = AVG3(J, I, X);
-}
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
 
-void aom_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  const int E = above[4];
-  const int F = above[5];
-  const int G = above[6];
-  const int H = above[7];
-  (void)stride;
-  (void)left;
-  DST(0, 0) = AVG3(A, B, C);
-  DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
-  DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
-  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
-  DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
-  DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
-  DST(3, 3) = AVG3(G, H, H);
-}
+#define DC_SHIFT2 16
 
-void aom_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  DST(0, 0) = DST(1, 2) = AVG2(X, A);
-  DST(1, 0) = DST(2, 2) = AVG2(A, B);
-  DST(2, 0) = DST(3, 2) = AVG2(B, C);
-  DST(3, 0) = AVG2(C, D);
-
-  DST(0, 3) = AVG3(K, J, I);
-  DST(0, 2) = AVG3(J, I, X);
-  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
-  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
-  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
-  DST(3, 1) = AVG3(B, C, D);
-}
+static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw,
+                                     int bh, const uint8_t *above,
+                                     const uint8_t *left, int shift1,
+                                     int multiplier) {
+  int sum = 0;
 
-void aom_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  (void)stride;
-  DST(0, 3) = AVG3(J, K, L);
-  DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
-  DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
-  DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
-  DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
-  DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
-  DST(3, 0) = AVG3(D, C, B);
+  for (int i = 0; i < bw; i++) {
+    sum += above[i];
+  }
+  for (int i = 0; i < bh; i++) {
+    sum += left[i];
+  }
+
+  const int expected_dc = divide_using_multiply_shift(
+      sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
+  assert(expected_dc < (1 << 8));
+
+  for (int r = 0; r < bh; r++) {
+    memset(dst, expected_dc, bw);
+    dst += stride;
+  }
 }
 
-void aom_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-
-  DST(0, 0) = DST(2, 1) = AVG2(I, X);
-  DST(0, 1) = DST(2, 2) = AVG2(J, I);
-  DST(0, 2) = DST(2, 3) = AVG2(K, J);
-  DST(0, 3) = AVG2(L, K);
-
-  DST(3, 0) = AVG3(A, B, C);
-  DST(2, 0) = AVG3(X, A, B);
-  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
-  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
-  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
-  DST(1, 3) = AVG3(L, K, J);
+#undef DC_SHIFT2
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2);
 }
 
-#if CONFIG_HIGHBITDEPTH
-static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
-                                          int bw, int bh, const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  int r, c;
-  (void)above;
-  (void)bd;
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2);
+}
 
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
-                            left[(c >> 1) + r + 2])
-                     : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
-    }
-    dst += stride;
-  }
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4);
 }
 
-static INLINE void highbd_d63e_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bw, int bh, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  int r, c;
-  (void)left;
-  (void)bd;
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
-                            above[(r >> 1) + c + 2])
-                     : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
-    }
-    dst += stride;
-  }
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4);
 }
 
-static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bw, int bh, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  int r, c;
-  (void)left;
-  (void)bd;
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = AVG3(above[r + c], above[r + c + 1],
-                    above[r + c + 1 + (r + c + 2 < bw + bh)]);
-    }
-    dst += stride;
-  }
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2);
 }
 
-static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bw, int bh, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  int r, c;
-  (void)bd;
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2);
+}
 
-  // first row
-  for (c = 0; c < bw; c++) dst[c] = AVG2(above[c - 1], above[c]);
-  dst += stride;
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4);
+}
 
-  // second row
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
-  dst += stride;
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4);
+}
 
-  // the rest of first col
-  dst[0] = AVG3(above[-1], left[0], left[1]);
-  for (r = 3; r < bh; ++r)
-    dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2);
+}
 
-  // the rest of the block
-  for (r = 2; r < bh; ++r) {
-    for (c = 1; c < bw; c++) dst[c] = dst[-2 * stride + c - 1];
-    dst += stride;
-  }
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2);
 }
 
-static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bw, int bh, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  int r, c;
-  (void)bd;
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4);
+}
 
-  dst[stride] = AVG3(above[-1], left[0], left[1]);
-  for (r = 2; r < bh; ++r)
-    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4);
+}
 
-  dst += stride;
-  for (r = 1; r < bh; ++r) {
-    for (c = 1; c < bw; c++) dst[c] = dst[-stride + c - 1];
-    dst += stride;
-  }
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2);
 }
 
-static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bw, int bh, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  int r, c;
-  (void)bd;
-  dst[0] = AVG2(above[-1], left[0]);
-  for (r = 1; r < bh; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
-  dst++;
-
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  dst[stride] = AVG3(above[-1], left[0], left[1]);
-  for (r = 2; r < bh; r++)
-    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
-  dst++;
-
-  for (c = 0; c < bw - 2; c++)
-    dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
-  dst += stride;
-
-  for (r = 1; r < bh; ++r) {
-    for (c = 0; c < bw - 2; c++) dst[c] = dst[-stride + c - 2];
-    dst += stride;
-  }
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2);
 }
 
+#undef DC_MULTIPLIER_1X2
+#undef DC_MULTIPLIER_1X4
+
 static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint16_t *above,
                                       const uint16_t *left, int bd) {
@@ -658,93 +384,6 @@ static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
   }
 }
 
-void aom_highbd_d207_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  (void)above;
-  (void)bd;
-  DST(0, 0) = AVG2(I, J);
-  DST(0, 1) = AVG2(J, K);
-  DST(1, 0) = AVG3(I, J, K);
-  DST(1, 1) = AVG3(J, K, L);
-}
-
-void aom_highbd_d63_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *above, const uint16_t *left,
-                                    int bd) {
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  (void)left;
-  (void)bd;
-  DST(0, 0) = AVG2(A, B);
-  DST(1, 0) = AVG2(B, C);
-  DST(0, 1) = AVG3(A, B, C);
-  DST(1, 1) = AVG3(B, C, D);
-}
-
-void aom_highbd_d45e_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  (void)stride;
-  (void)left;
-  (void)bd;
-  DST(0, 0) = AVG3(A, B, C);
-  DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
-  DST(1, 1) = AVG3(C, D, D);
-}
-
-void aom_highbd_d117_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  (void)bd;
-  DST(0, 0) = AVG2(X, A);
-  DST(1, 0) = AVG2(A, B);
-  DST(0, 1) = AVG3(I, X, A);
-  DST(1, 1) = AVG3(X, A, B);
-}
-
-void aom_highbd_d135_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  (void)bd;
-  DST(0, 1) = AVG3(X, I, J);
-  DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
-  DST(1, 0) = AVG3(B, A, X);
-}
-
-void aom_highbd_d153_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int X = above[-1];
-  const int A = above[0];
-  (void)bd;
-  DST(0, 0) = AVG2(I, X);
-  DST(0, 1) = AVG2(J, I);
-  DST(1, 0) = AVG3(I, X, A);
-  DST(1, 1) = AVG3(J, I, X);
-}
-
 static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
                                           int bw, int bh, const uint16_t *above,
                                           const uint16_t *left, int bd) {
@@ -763,6 +402,7 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
                                            int bw, int bh,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
+  (void)bd;
   const uint16_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
   const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
   const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
@@ -785,17 +425,17 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
       for (i = 0; i < 4; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd);
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
 }
 
-#if CONFIG_SMOOTH_HV
 static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
                                              int bw, int bh,
                                              const uint16_t *above,
                                              const uint16_t *left, int bd) {
+  (void)bd;
   const uint16_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
   const uint8_t *const sm_weights = sm_weight_arrays + bh;
   // scale = 2^sm_weight_log2_scale
@@ -816,7 +456,7 @@ static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
       for (i = 0; i < 2; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd);
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
@@ -826,6 +466,7 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
                                              int bw, int bh,
                                              const uint16_t *above,
                                              const uint16_t *left, int bd) {
+  (void)bd;
   const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
   const uint8_t *const sm_weights = sm_weight_arrays + bw;
   // scale = 2^sm_weight_log2_scale
@@ -846,12 +487,11 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
       for (i = 0; i < 2; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd);
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
 }
-#endif  // CONFIG_SMOOTH_HV
 
 static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
                                            int bw, int bh,
@@ -922,7 +562,148 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
     dst += stride;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
+
+// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but
+// assume 2nd shift of 17 bits instead of 16.
+// Note: Strictly speaking, 2nd shift needs to be 17 only when:
+// - bit depth == 12, and
+// - bw + bh is divisible by 5 (as opposed to divisible by 3).
+// All other cases can use half the multipliers with a shift of 16 instead.
+// This special optimization can be used when writing assembly code.
+#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
+// Note: This constant is odd, but a smaller even constant (0x199a) with the
+// appropriate shift should work for neon in 8/10-bit.
+#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
+
+#define HIGHBD_DC_SHIFT2 17
+
+static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride,
+                                            int bw, int bh,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd,
+                                            int shift1, uint32_t multiplier) {
+  int sum = 0;
+  (void)bd;
+
+  for (int i = 0; i < bw; i++) {
+    sum += above[i];
+  }
+  for (int i = 0; i < bh; i++) {
+    sum += left[i];
+  }
+
+  const int expected_dc = divide_using_multiply_shift(
+      sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);
+  assert(expected_dc < (1 << bd));
+
+  for (int r = 0; r < bh; r++) {
+    aom_memset16(dst, expected_dc, bw);
+    dst += stride;
+  }
+}
+
+#undef HIGHBD_DC_SHIFT2
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *above, const uint16_t *left,
+                                   int bd) {
+  highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *above, const uint16_t *left,
+                                   int bd) {
+  highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+#undef HIGHBD_DC_MULTIPLIER_1X2
+#undef HIGHBD_DC_MULTIPLIER_1X4
 
 // This serves as a wrapper function, so that all the prediction functions
 // can be unified and accessed as a pointer array. Note that the boundary
@@ -934,7 +715,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
     type##_predictor(dst, stride, width, height, above, left); \
   }
 
-#if CONFIG_HIGHBITDEPTH
 #define intra_pred_highbd_sized(type, width, height)                        \
   void aom_highbd_##type##_predictor_##width##x##height##_c(                \
       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,               \
@@ -943,7 +723,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
   }
 
 /* clang-format off */
-#if CONFIG_TX64X64
 #define intra_pred_rectangular(type) \
   intra_pred_sized(type, 4, 8) \
   intra_pred_sized(type, 8, 4) \
@@ -953,6 +732,12 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
   intra_pred_sized(type, 32, 16) \
   intra_pred_sized(type, 32, 64) \
   intra_pred_sized(type, 64, 32) \
+  intra_pred_sized(type, 4, 16) \
+  intra_pred_sized(type, 16, 4) \
+  intra_pred_sized(type, 8, 32) \
+  intra_pred_sized(type, 32, 8) \
+  intra_pred_sized(type, 16, 64) \
+  intra_pred_sized(type, 64, 16) \
   intra_pred_highbd_sized(type, 4, 8) \
   intra_pred_highbd_sized(type, 8, 4) \
   intra_pred_highbd_sized(type, 8, 16) \
@@ -960,7 +745,13 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
   intra_pred_highbd_sized(type, 16, 32) \
   intra_pred_highbd_sized(type, 32, 16) \
   intra_pred_highbd_sized(type, 32, 64) \
-  intra_pred_highbd_sized(type, 64, 32)
+  intra_pred_highbd_sized(type, 64, 32) \
+  intra_pred_highbd_sized(type, 4, 16) \
+  intra_pred_highbd_sized(type, 16, 4) \
+  intra_pred_highbd_sized(type, 8, 32) \
+  intra_pred_highbd_sized(type, 32, 8) \
+  intra_pred_highbd_sized(type, 16, 64) \
+  intra_pred_highbd_sized(type, 64, 16)
 #define intra_pred_above_4x4(type) \
   intra_pred_sized(type, 8, 8) \
   intra_pred_sized(type, 16, 16) \
@@ -973,100 +764,29 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
   intra_pred_highbd_sized(type, 64, 64) \
   intra_pred_rectangular(type)
 #define intra_pred_allsizes(type) \
-  intra_pred_sized(type, 2, 2) \
   intra_pred_sized(type, 4, 4) \
-  intra_pred_highbd_sized(type, 2, 2) \
   intra_pred_above_4x4(type)
-#else  // CONFIG_TX64X64
-#define intra_pred_rectangular(type) \
-  intra_pred_sized(type, 4, 8) \
-  intra_pred_sized(type, 8, 4) \
-  intra_pred_sized(type, 8, 16) \
-  intra_pred_sized(type, 16, 8) \
-  intra_pred_sized(type, 16, 32) \
-  intra_pred_sized(type, 32, 16) \
-  intra_pred_highbd_sized(type, 4, 8) \
-  intra_pred_highbd_sized(type, 8, 4) \
-  intra_pred_highbd_sized(type, 8, 16) \
-  intra_pred_highbd_sized(type, 16, 8) \
-  intra_pred_highbd_sized(type, 16, 32) \
-  intra_pred_highbd_sized(type, 32, 16)
-#define intra_pred_above_4x4(type) \
+#define intra_pred_square(type) \
+  intra_pred_sized(type, 4, 4) \
   intra_pred_sized(type, 8, 8) \
   intra_pred_sized(type, 16, 16) \
   intra_pred_sized(type, 32, 32) \
+  intra_pred_sized(type, 64, 64) \
   intra_pred_highbd_sized(type, 4, 4) \
   intra_pred_highbd_sized(type, 8, 8) \
   intra_pred_highbd_sized(type, 16, 16) \
   intra_pred_highbd_sized(type, 32, 32) \
-  intra_pred_rectangular(type)
-#define intra_pred_allsizes(type) \
-  intra_pred_sized(type, 2, 2) \
-  intra_pred_sized(type, 4, 4) \
-  intra_pred_highbd_sized(type, 2, 2) \
-  intra_pred_above_4x4(type)
-#endif  // CONFIG_TX64X64
-
-#else
-
-#if CONFIG_TX64X64
-#define intra_pred_rectangular(type) \
-  intra_pred_sized(type, 4, 8) \
-  intra_pred_sized(type, 8, 4) \
-  intra_pred_sized(type, 8, 16) \
-  intra_pred_sized(type, 16, 8) \
-  intra_pred_sized(type, 16, 32) \
-  intra_pred_sized(type, 32, 16) \
-  intra_pred_sized(type, 32, 64) \
-  intra_pred_sized(type, 64, 32)
-#define intra_pred_above_4x4(type) \
-  intra_pred_sized(type, 8, 8) \
-  intra_pred_sized(type, 16, 16) \
-  intra_pred_sized(type, 32, 32) \
-  intra_pred_sized(type, 64, 64) \
-  intra_pred_rectangular(type)
-#define intra_pred_allsizes(type) \
-  intra_pred_sized(type, 2, 2) \
-  intra_pred_sized(type, 4, 4) \
-  intra_pred_above_4x4(type)
-#else  // CONFIG_TX64X64
-#define intra_pred_rectangular(type) \
-  intra_pred_sized(type, 4, 8) \
-  intra_pred_sized(type, 8, 4) \
-  intra_pred_sized(type, 8, 16) \
-  intra_pred_sized(type, 16, 8) \
-  intra_pred_sized(type, 16, 32) \
-  intra_pred_sized(type, 32, 16)
-#define intra_pred_above_4x4(type) \
-  intra_pred_sized(type, 8, 8) \
-  intra_pred_sized(type, 16, 16) \
-  intra_pred_sized(type, 32, 32) \
-  intra_pred_rectangular(type)
-#define intra_pred_allsizes(type) \
-  intra_pred_sized(type, 2, 2) \
-  intra_pred_sized(type, 4, 4) \
-  intra_pred_above_4x4(type)
-#endif  // CONFIG_TX64X64
-
-#endif  // CONFIG_HIGHBITDEPTH
+  intra_pred_highbd_sized(type, 64, 64)
 
-intra_pred_allsizes(d207e)
-intra_pred_allsizes(d63e)
-intra_pred_above_4x4(d45e)
-intra_pred_above_4x4(d117)
-intra_pred_above_4x4(d135)
-intra_pred_above_4x4(d153)
 intra_pred_allsizes(v)
 intra_pred_allsizes(h)
 intra_pred_allsizes(smooth)
-#if CONFIG_SMOOTH_HV
 intra_pred_allsizes(smooth_v)
 intra_pred_allsizes(smooth_h)
-#endif  // CONFIG_SMOOTH_HV
 intra_pred_allsizes(paeth)
 intra_pred_allsizes(dc_128)
 intra_pred_allsizes(dc_left)
 intra_pred_allsizes(dc_top)
-intra_pred_allsizes(dc)
+intra_pred_square(dc)
 /* clang-format on */
 #undef intra_pred_allsizes
diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h
index 96da49b03..e047d98bc 100644
--- a/third_party/aom/aom_dsp/intrapred_common.h
+++ b/third_party/aom/aom_dsp/intrapred_common.h
@@ -12,19 +12,16 @@
 #ifndef _AOM_DSP_INTRAPRED_COMMON_H
 #define _AOM_DSP_INTRAPRED_COMMON_H
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 // Weights are quadratic from '1' to '1 / block_size', scaled by
 // 2^sm_weight_log2_scale.
 static const int sm_weight_log2_scale = 8;
 
-#if CONFIG_TX64X64
 // max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
 #define MAX_BLOCK_DIM 64
-#else
-#define MAX_BLOCK_DIM 32
-#endif  // CONFIG_TX64X64
 
+/* clang-format off */
 static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
   // Unused, because we always offset by bs, which is at least 2.
   0, 0,
@@ -39,13 +36,12 @@ static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
   // bs = 32
   255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
   66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
-#if CONFIG_TX64X64
   // bs = 64
   255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
   150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
   65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
   13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
-#endif  // CONFIG_TX64X64
 };
+/* clang-format on */
 
 #endif  // _AOM_DSP_INTRAPRED_COMMON_H
diff --git a/third_party/aom/aom_dsp/inv_txfm.c b/third_party/aom/aom_dsp/inv_txfm.c
deleted file mode 100644
index 6b7c1c2ab..000000000
--- a/third_party/aom/aom_dsp/inv_txfm.c
+++ /dev/null
@@ -1,1482 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <string.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/inv_txfm.h"
-#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
-    CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
-#include "av1/common/daala_tx.h"
-#endif
-
-void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
-     0.5 shifts per pixel. */
-  int i;
-  tran_low_t output[16];
-  tran_high_t a1, b1, c1, d1, e1;
-  const tran_low_t *ip = input;
-  tran_low_t *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] >> UNIT_QUANT_SHIFT;
-    c1 = ip[1] >> UNIT_QUANT_SHIFT;
-    d1 = ip[2] >> UNIT_QUANT_SHIFT;
-    b1 = ip[3] >> UNIT_QUANT_SHIFT;
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    op[0] = WRAPLOW(a1);
-    op[1] = WRAPLOW(b1);
-    op[2] = WRAPLOW(c1);
-    op[3] = WRAPLOW(d1);
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[4 * 0];
-    c1 = ip[4 * 1];
-    d1 = ip[4 * 2];
-    b1 = ip[4 * 3];
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
-    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
-    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
-    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
-
-    ip++;
-    dest++;
-  }
-}
-
-void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
-  int i;
-  tran_high_t a1, e1;
-  tran_low_t tmp[4];
-  const tran_low_t *ip = in;
-  tran_low_t *op = tmp;
-
-  a1 = ip[0] >> UNIT_QUANT_SHIFT;
-  e1 = a1 >> 1;
-  a1 -= e1;
-  op[0] = WRAPLOW(a1);
-  op[1] = op[2] = op[3] = WRAPLOW(e1);
-
-  ip = tmp;
-  for (i = 0; i < 4; i++) {
-    e1 = ip[0] >> 1;
-    a1 = ip[0] - e1;
-    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
-    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
-    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
-    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
-    ip++;
-    dest++;
-  }
-}
-
-void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step[4];
-  tran_high_t temp1, temp2;
-  // stage 1
-  temp1 = (input[0] + input[2]) * cospi_16_64;
-  temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step[3] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 2
-  output[0] = WRAPLOW(step[0] + step[3]);
-  output[1] = WRAPLOW(step[1] + step[2]);
-  output[2] = WRAPLOW(step[1] - step[2]);
-  output[3] = WRAPLOW(step[0] - step[3]);
-}
-
-void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[4], temp_out[4];
-
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    aom_idct4_c(input, outptr);
-    input += 4;
-    outptr += 4;
-  }
-
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
-    aom_idct4_c(temp_in, temp_out);
-    for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
-    }
-  }
-}
-
-void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
-                         int dest_stride) {
-  int i;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  if (a1 == 0) return;
-
-  for (i = 0; i < 4; i++) {
-    dest[0] = clip_pixel_add(dest[0], a1);
-    dest[1] = clip_pixel_add(dest[1], a1);
-    dest[2] = clip_pixel_add(dest[2], a1);
-    dest[3] = clip_pixel_add(dest[3], a1);
-    dest += dest_stride;
-  }
-}
-
-void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[8], step2[8];
-  tran_high_t temp1, temp2;
-  // stage 1
-  step1[0] = input[0];
-  step1[2] = input[4];
-  step1[1] = input[2];
-  step1[3] = input[6];
-  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 2
-  temp1 = (step1[0] + step1[2]) * cospi_16_64;
-  temp2 = (step1[0] - step1[2]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
-
-  // stage 3
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[7] = step2[7];
-
-  // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7]);
-  output[1] = WRAPLOW(step1[1] + step1[6]);
-  output[2] = WRAPLOW(step1[2] + step1[5]);
-  output[3] = WRAPLOW(step1[3] + step1[4]);
-  output[4] = WRAPLOW(step1[3] - step1[4]);
-  output[5] = WRAPLOW(step1[2] - step1[5]);
-  output[6] = WRAPLOW(step1[1] - step1[6]);
-  output[7] = WRAPLOW(step1[0] - step1[7]);
-}
-
-void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  for (i = 0; i < 8; ++i) {
-    aom_idct8_c(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    aom_idct8_c(temp_in, temp_out);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
-    }
-  }
-}
-
-void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-  if (a1 == 0) return;
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
-void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_low_t x0 = input[0];
-  tran_low_t x1 = input[1];
-  tran_low_t x2 = input[2];
-  tran_low_t x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = WRAPLOW(x0 - x2 + x3);
-
-  s0 = s0 + s3 + s5;
-  s1 = s1 - s4 - s6;
-  s3 = s2;
-  s2 = sinpi_3_9 * s7;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
-  output[2] = WRAPLOW(dct_const_round_shift(s2));
-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
-}
-
-void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_high_t x0 = input[7];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[5];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[3];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[1];
-  tran_high_t x7 = input[6];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
-  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
-  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
-  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
-  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
-  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
-  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
-  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
-
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
-
-  // stage 2
-  s0 = (int)x0;
-  s1 = (int)x1;
-  s2 = (int)x2;
-  s3 = (int)x3;
-  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
-  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
-  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
-  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
-
-  x0 = WRAPLOW(s0 + s2);
-  x1 = WRAPLOW(s1 + s3);
-  x2 = WRAPLOW(s0 - s2);
-  x3 = WRAPLOW(s1 - s3);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
-
-  // stage 3
-  s2 = (int)(cospi_16_64 * (x2 + x3));
-  s3 = (int)(cospi_16_64 * (x2 - x3));
-  s6 = (int)(cospi_16_64 * (x6 + x7));
-  s7 = (int)(cospi_16_64 * (x6 - x7));
-
-  x2 = WRAPLOW(dct_const_round_shift(s2));
-  x3 = WRAPLOW(dct_const_round_shift(s3));
-  x6 = WRAPLOW(dct_const_round_shift(s6));
-  x7 = WRAPLOW(dct_const_round_shift(s7));
-
-  output[0] = WRAPLOW(x0);
-  output[1] = WRAPLOW(-x4);
-  output[2] = WRAPLOW(x6);
-  output[3] = WRAPLOW(-x2);
-  output[4] = WRAPLOW(x3);
-  output[5] = WRAPLOW(-x7);
-  output[6] = WRAPLOW(x5);
-  output[7] = WRAPLOW(-x1);
-}
-
-void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  // only first 4 row has non-zero coefs
-  for (i = 0; i < 4; ++i) {
-    aom_idct8_c(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    aom_idct8_c(temp_in, temp_out);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
-    }
-  }
-}
-
-void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[16], step2[16];
-  tran_high_t temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0 / 2];
-  step1[1] = input[16 / 2];
-  step1[2] = input[8 / 2];
-  step1[3] = input[24 / 2];
-  step1[4] = input[4 / 2];
-  step1[5] = input[20 / 2];
-  step1[6] = input[12 / 2];
-  step1[7] = input[28 / 2];
-  step1[8] = input[2 / 2];
-  step1[9] = input[18 / 2];
-  step1[10] = input[10 / 2];
-  step1[11] = input[26 / 2];
-  step1[12] = input[6 / 2];
-  step1[13] = input[22 / 2];
-  step1[14] = input[14 / 2];
-  step1[15] = input[30 / 2];
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
-  step1[8] = WRAPLOW(step2[8] + step2[9]);
-  step1[9] = WRAPLOW(step2[8] - step2[9]);
-  step1[10] = WRAPLOW(-step2[10] + step2[11]);
-  step1[11] = WRAPLOW(step2[10] + step2[11]);
-  step1[12] = WRAPLOW(step2[12] + step2[13]);
-  step1[13] = WRAPLOW(step2[12] - step2[13]);
-  step1[14] = WRAPLOW(-step2[14] + step2[15]);
-  step1[15] = WRAPLOW(step2[14] + step2[15]);
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11]);
-  step1[9] = WRAPLOW(step2[9] + step2[10]);
-  step1[10] = WRAPLOW(step2[9] - step2[10]);
-  step1[11] = WRAPLOW(step2[8] - step2[11]);
-  step1[12] = WRAPLOW(-step2[12] + step2[15]);
-  step1[13] = WRAPLOW(-step2[13] + step2[14]);
-  step1[14] = WRAPLOW(step2[13] + step2[14]);
-  step1[15] = WRAPLOW(step2[12] + step2[15]);
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7]);
-  step2[1] = WRAPLOW(step1[1] + step1[6]);
-  step2[2] = WRAPLOW(step1[2] + step1[5]);
-  step2[3] = WRAPLOW(step1[3] + step1[4]);
-  step2[4] = WRAPLOW(step1[3] - step1[4]);
-  step2[5] = WRAPLOW(step1[2] - step1[5]);
-  step2[6] = WRAPLOW(step1[1] - step1[6]);
-  step2[7] = WRAPLOW(step1[0] - step1[7]);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15]);
-  output[1] = WRAPLOW(step2[1] + step2[14]);
-  output[2] = WRAPLOW(step2[2] + step2[13]);
-  output[3] = WRAPLOW(step2[3] + step2[12]);
-  output[4] = WRAPLOW(step2[4] + step2[11]);
-  output[5] = WRAPLOW(step2[5] + step2[10]);
-  output[6] = WRAPLOW(step2[6] + step2[9]);
-  output[7] = WRAPLOW(step2[7] + step2[8]);
-  output[8] = WRAPLOW(step2[7] - step2[8]);
-  output[9] = WRAPLOW(step2[6] - step2[9]);
-  output[10] = WRAPLOW(step2[5] - step2[10]);
-  output[11] = WRAPLOW(step2[4] - step2[11]);
-  output[12] = WRAPLOW(step2[3] - step2[12]);
-  output[13] = WRAPLOW(step2[2] - step2[13]);
-  output[14] = WRAPLOW(step2[1] - step2[14]);
-  output[15] = WRAPLOW(step2[0] - step2[15]);
-}
-
-void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows
-  for (i = 0; i < 16; ++i) {
-    aom_idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    aom_idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-  tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
-  tran_high_t x0 = input[15];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[13];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[11];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[9];
-  tran_high_t x7 = input[6];
-  tran_high_t x8 = input[7];
-  tran_high_t x9 = input[8];
-  tran_high_t x10 = input[5];
-  tran_high_t x11 = input[10];
-  tran_high_t x12 = input[3];
-  tran_high_t x13 = input[12];
-  tran_high_t x14 = input[1];
-  tran_high_t x15 = input[14];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
-        x13 | x14 | x15)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = output[8] = output[9] = output[10] =
-            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
-  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
-  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = WRAPLOW(s0 + s4);
-  x1 = WRAPLOW(s1 + s5);
-  x2 = WRAPLOW(s2 + s6);
-  x3 = WRAPLOW(s3 + s7);
-  x4 = WRAPLOW(s0 - s4);
-  x5 = WRAPLOW(s1 - s5);
-  x6 = WRAPLOW(s2 - s6);
-  x7 = WRAPLOW(s3 - s7);
-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
-  x0 = WRAPLOW(s0 + s2);
-  x1 = WRAPLOW(s1 + s3);
-  x2 = WRAPLOW(s0 - s2);
-  x3 = WRAPLOW(s1 - s3);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
-  x8 = WRAPLOW(s8 + s10);
-  x9 = WRAPLOW(s9 + s11);
-  x10 = WRAPLOW(s8 - s10);
-  x11 = WRAPLOW(s9 - s11);
-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
-
-  // stage 4
-  s2 = (-cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (-x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (-x10 + x11);
-  s14 = (-cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = WRAPLOW(dct_const_round_shift(s2));
-  x3 = WRAPLOW(dct_const_round_shift(s3));
-  x6 = WRAPLOW(dct_const_round_shift(s6));
-  x7 = WRAPLOW(dct_const_round_shift(s7));
-  x10 = WRAPLOW(dct_const_round_shift(s10));
-  x11 = WRAPLOW(dct_const_round_shift(s11));
-  x14 = WRAPLOW(dct_const_round_shift(s14));
-  x15 = WRAPLOW(dct_const_round_shift(s15));
-
-  output[0] = WRAPLOW(x0);
-  output[1] = WRAPLOW(-x8);
-  output[2] = WRAPLOW(x12);
-  output[3] = WRAPLOW(-x4);
-  output[4] = WRAPLOW(x6);
-  output[5] = WRAPLOW(x14);
-  output[6] = WRAPLOW(x10);
-  output[7] = WRAPLOW(x2);
-  output[8] = WRAPLOW(x3);
-  output[9] = WRAPLOW(x11);
-  output[10] = WRAPLOW(x15);
-  output[11] = WRAPLOW(x7);
-  output[12] = WRAPLOW(x5);
-  output[13] = WRAPLOW(-x13);
-  output[14] = WRAPLOW(x9);
-  output[15] = WRAPLOW(-x1);
-}
-
-void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  int i, j;
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 8x8 area, we only need to calculate first 8 rows here.
-  for (i = 0; i < 8; ++i) {
-    aom_idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    aom_idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 4x4 area, we only need to calculate first 4 rows here.
-  for (i = 0; i < 4; ++i) {
-    aom_idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    aom_idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-  if (a1 == 0) return;
-  for (j = 0; j < 16; ++j) {
-    for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
-void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[32], step2[32];
-  tran_high_t temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0];
-  step1[1] = input[16];
-  step1[2] = input[8];
-  step1[3] = input[24];
-  step1[4] = input[4];
-  step1[5] = input[20];
-  step1[6] = input[12];
-  step1[7] = input[28];
-  step1[8] = input[2];
-  step1[9] = input[18];
-  step1[10] = input[10];
-  step1[11] = input[26];
-  step1[12] = input[6];
-  step1[13] = input[22];
-  step1[14] = input[14];
-  step1[15] = input[30];
-
-  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
-  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
-  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
-  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
-  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
-  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
-  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
-  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
-  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
-  step2[16] = WRAPLOW(step1[16] + step1[17]);
-  step2[17] = WRAPLOW(step1[16] - step1[17]);
-  step2[18] = WRAPLOW(-step1[18] + step1[19]);
-  step2[19] = WRAPLOW(step1[18] + step1[19]);
-  step2[20] = WRAPLOW(step1[20] + step1[21]);
-  step2[21] = WRAPLOW(step1[20] - step1[21]);
-  step2[22] = WRAPLOW(-step1[22] + step1[23]);
-  step2[23] = WRAPLOW(step1[22] + step1[23]);
-  step2[24] = WRAPLOW(step1[24] + step1[25]);
-  step2[25] = WRAPLOW(step1[24] - step1[25]);
-  step2[26] = WRAPLOW(-step1[26] + step1[27]);
-  step2[27] = WRAPLOW(step1[26] + step1[27]);
-  step2[28] = WRAPLOW(step1[28] + step1[29]);
-  step2[29] = WRAPLOW(step1[28] - step1[29]);
-  step2[30] = WRAPLOW(-step1[30] + step1[31]);
-  step2[31] = WRAPLOW(step1[30] + step1[31]);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
-  step1[8] = WRAPLOW(step2[8] + step2[9]);
-  step1[9] = WRAPLOW(step2[8] - step2[9]);
-  step1[10] = WRAPLOW(-step2[10] + step2[11]);
-  step1[11] = WRAPLOW(step2[10] + step2[11]);
-  step1[12] = WRAPLOW(step2[12] + step2[13]);
-  step1[13] = WRAPLOW(step2[12] - step2[13]);
-  step1[14] = WRAPLOW(-step2[14] + step2[15]);
-  step1[15] = WRAPLOW(step2[14] + step2[15]);
-
-  step1[16] = step2[16];
-  step1[31] = step2[31];
-  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
-  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
-  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[19] = step2[19];
-  step1[20] = step2[20];
-  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
-  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
-  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[27] = step2[27];
-  step1[28] = step2[28];
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  step2[16] = WRAPLOW(step1[16] + step1[19]);
-  step2[17] = WRAPLOW(step1[17] + step1[18]);
-  step2[18] = WRAPLOW(step1[17] - step1[18]);
-  step2[19] = WRAPLOW(step1[16] - step1[19]);
-  step2[20] = WRAPLOW(-step1[20] + step1[23]);
-  step2[21] = WRAPLOW(-step1[21] + step1[22]);
-  step2[22] = WRAPLOW(step1[21] + step1[22]);
-  step2[23] = WRAPLOW(step1[20] + step1[23]);
-
-  step2[24] = WRAPLOW(step1[24] + step1[27]);
-  step2[25] = WRAPLOW(step1[25] + step1[26]);
-  step2[26] = WRAPLOW(step1[25] - step1[26]);
-  step2[27] = WRAPLOW(step1[24] - step1[27]);
-  step2[28] = WRAPLOW(-step1[28] + step1[31]);
-  step2[29] = WRAPLOW(-step1[29] + step1[30]);
-  step2[30] = WRAPLOW(step1[29] + step1[30]);
-  step2[31] = WRAPLOW(step1[28] + step1[31]);
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11]);
-  step1[9] = WRAPLOW(step2[9] + step2[10]);
-  step1[10] = WRAPLOW(step2[9] - step2[10]);
-  step1[11] = WRAPLOW(step2[8] - step2[11]);
-  step1[12] = WRAPLOW(-step2[12] + step2[15]);
-  step1[13] = WRAPLOW(-step2[13] + step2[14]);
-  step1[14] = WRAPLOW(step2[13] + step2[14]);
-  step1[15] = WRAPLOW(step2[12] + step2[15]);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
-  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
-  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
-  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
-  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[22] = step2[22];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[25];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7]);
-  step2[1] = WRAPLOW(step1[1] + step1[6]);
-  step2[2] = WRAPLOW(step1[2] + step1[5]);
-  step2[3] = WRAPLOW(step1[3] + step1[4]);
-  step2[4] = WRAPLOW(step1[3] - step1[4]);
-  step2[5] = WRAPLOW(step1[2] - step1[5]);
-  step2[6] = WRAPLOW(step1[1] - step1[6]);
-  step2[7] = WRAPLOW(step1[0] - step1[7]);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  step2[16] = WRAPLOW(step1[16] + step1[23]);
-  step2[17] = WRAPLOW(step1[17] + step1[22]);
-  step2[18] = WRAPLOW(step1[18] + step1[21]);
-  step2[19] = WRAPLOW(step1[19] + step1[20]);
-  step2[20] = WRAPLOW(step1[19] - step1[20]);
-  step2[21] = WRAPLOW(step1[18] - step1[21]);
-  step2[22] = WRAPLOW(step1[17] - step1[22]);
-  step2[23] = WRAPLOW(step1[16] - step1[23]);
-
-  step2[24] = WRAPLOW(-step1[24] + step1[31]);
-  step2[25] = WRAPLOW(-step1[25] + step1[30]);
-  step2[26] = WRAPLOW(-step1[26] + step1[29]);
-  step2[27] = WRAPLOW(-step1[27] + step1[28]);
-  step2[28] = WRAPLOW(step1[27] + step1[28]);
-  step2[29] = WRAPLOW(step1[26] + step1[29]);
-  step2[30] = WRAPLOW(step1[25] + step1[30]);
-  step2[31] = WRAPLOW(step1[24] + step1[31]);
-
-  // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15]);
-  step1[1] = WRAPLOW(step2[1] + step2[14]);
-  step1[2] = WRAPLOW(step2[2] + step2[13]);
-  step1[3] = WRAPLOW(step2[3] + step2[12]);
-  step1[4] = WRAPLOW(step2[4] + step2[11]);
-  step1[5] = WRAPLOW(step2[5] + step2[10]);
-  step1[6] = WRAPLOW(step2[6] + step2[9]);
-  step1[7] = WRAPLOW(step2[7] + step2[8]);
-  step1[8] = WRAPLOW(step2[7] - step2[8]);
-  step1[9] = WRAPLOW(step2[6] - step2[9]);
-  step1[10] = WRAPLOW(step2[5] - step2[10]);
-  step1[11] = WRAPLOW(step2[4] - step2[11]);
-  step1[12] = WRAPLOW(step2[3] - step2[12]);
-  step1[13] = WRAPLOW(step2[2] - step2[13]);
-  step1[14] = WRAPLOW(step2[1] - step2[14]);
-  step1[15] = WRAPLOW(step2[0] - step2[15]);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[18] = step2[18];
-  step1[19] = step2[19];
-  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
-  temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
-  temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
-  temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
-  temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[28] = step2[28];
-  step1[29] = step2[29];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31]);
-  output[1] = WRAPLOW(step1[1] + step1[30]);
-  output[2] = WRAPLOW(step1[2] + step1[29]);
-  output[3] = WRAPLOW(step1[3] + step1[28]);
-  output[4] = WRAPLOW(step1[4] + step1[27]);
-  output[5] = WRAPLOW(step1[5] + step1[26]);
-  output[6] = WRAPLOW(step1[6] + step1[25]);
-  output[7] = WRAPLOW(step1[7] + step1[24]);
-  output[8] = WRAPLOW(step1[8] + step1[23]);
-  output[9] = WRAPLOW(step1[9] + step1[22]);
-  output[10] = WRAPLOW(step1[10] + step1[21]);
-  output[11] = WRAPLOW(step1[11] + step1[20]);
-  output[12] = WRAPLOW(step1[12] + step1[19]);
-  output[13] = WRAPLOW(step1[13] + step1[18]);
-  output[14] = WRAPLOW(step1[14] + step1[17]);
-  output[15] = WRAPLOW(step1[15] + step1[16]);
-  output[16] = WRAPLOW(step1[15] - step1[16]);
-  output[17] = WRAPLOW(step1[14] - step1[17]);
-  output[18] = WRAPLOW(step1[13] - step1[18]);
-  output[19] = WRAPLOW(step1[12] - step1[19]);
-  output[20] = WRAPLOW(step1[11] - step1[20]);
-  output[21] = WRAPLOW(step1[10] - step1[21]);
-  output[22] = WRAPLOW(step1[9] - step1[22]);
-  output[23] = WRAPLOW(step1[8] - step1[23]);
-  output[24] = WRAPLOW(step1[7] - step1[24]);
-  output[25] = WRAPLOW(step1[6] - step1[25]);
-  output[26] = WRAPLOW(step1[5] - step1[26]);
-  output[27] = WRAPLOW(step1[4] - step1[27]);
-  output[28] = WRAPLOW(step1[3] - step1[28]);
-  output[29] = WRAPLOW(step1[2] - step1[29]);
-  output[30] = WRAPLOW(step1[1] - step1[30]);
-  output[31] = WRAPLOW(step1[0] - step1[31]);
-}
-
-#if CONFIG_MRC_TX
-void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
-                              int stride, uint8_t *mask) {
-  tran_low_t out[32 * 32];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    int16_t zero_coeff[16];
-    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
-      aom_idct32_c(input, outptr);
-    else
-      memset(outptr, 0, sizeof(tran_low_t) * 32);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      // Only add the coefficient if the mask value is 1
-      int mask_val = mask[j * 32 + i];
-      dest[j * stride + i] =
-          mask_val ? clip_pixel_add(dest[j * stride + i],
-                                    ROUND_POWER_OF_TWO(temp_out[j], 6))
-                   : dest[j * stride + i];
-    }
-  }
-}
-
-void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                             uint8_t *mask) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // only upper-left 16x16 has non-zero coeff
-  for (i = 0; i < 16; ++i) {
-    aom_idct32_c(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      // Only add the coefficient if the mask value is 1
-      int mask_val = mask[j * 32 + i];
-      dest[j * stride + i] =
-          mask_val ? clip_pixel_add(dest[j * stride + i],
-                                    ROUND_POWER_OF_TWO(temp_out[j], 6))
-                   : dest[j * stride + i];
-    }
-  }
-}
-
-void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                            uint8_t *mask) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // only upper-left 8x8 has non-zero coeff
-  for (i = 0; i < 8; ++i) {
-    aom_idct32_c(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      // Only add the coefficient if the mask value is 1
-      int mask_val = mask[j * 32 + i];
-      dest[j * stride + i] =
-          mask_val ? clip_pixel_add(dest[j * stride + i],
-                                    ROUND_POWER_OF_TWO(temp_out[j], 6))
-                   : dest[j * stride + i];
-    }
-  }
-}
-#endif  // CONFIG_MRC_TX
-
-void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  tran_low_t out[32 * 32];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    int16_t zero_coeff[16];
-    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
-      aom_idct32_c(input, outptr);
-    else
-      memset(outptr, 0, sizeof(tran_low_t) * 32);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // only upper-left 16x16 has non-zero coeff
-  for (i = 0; i < 16; ++i) {
-    aom_idct32_c(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // only upper-left 8x8 has non-zero coeff
-  for (i = 0; i < 8; ++i) {
-    aom_idct32_c(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-  if (a1 == 0) return;
-
-  for (j = 0; j < 32; ++j) {
-    for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
-void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
-     0.5 shifts per pixel. */
-  int i;
-  tran_low_t output[16];
-  tran_high_t a1, b1, c1, d1, e1;
-  const tran_low_t *ip = input;
-  tran_low_t *op = output;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] >> UNIT_QUANT_SHIFT;
-    c1 = ip[1] >> UNIT_QUANT_SHIFT;
-    d1 = ip[2] >> UNIT_QUANT_SHIFT;
-    b1 = ip[3] >> UNIT_QUANT_SHIFT;
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    op[0] = HIGHBD_WRAPLOW(a1, bd);
-    op[1] = HIGHBD_WRAPLOW(b1, bd);
-    op[2] = HIGHBD_WRAPLOW(c1, bd);
-    op[3] = HIGHBD_WRAPLOW(d1, bd);
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[4 * 0];
-    c1 = ip[4 * 1];
-    d1 = ip[4 * 2];
-    b1 = ip[4 * 3];
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    dest[stride * 0] =
-        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
-    dest[stride * 1] =
-        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
-    dest[stride * 2] =
-        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
-    dest[stride * 3] =
-        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
-
-    ip++;
-    dest++;
-  }
-}
-
-void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
-                                int dest_stride, int bd) {
-  int i;
-  tran_high_t a1, e1;
-  tran_low_t tmp[4];
-  const tran_low_t *ip = in;
-  tran_low_t *op = tmp;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  (void)bd;
-
-  a1 = ip[0] >> UNIT_QUANT_SHIFT;
-  e1 = a1 >> 1;
-  a1 -= e1;
-  op[0] = HIGHBD_WRAPLOW(a1, bd);
-  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
-
-  ip = tmp;
-  for (i = 0; i < 4; i++) {
-    e1 = ip[0] >> 1;
-    a1 = ip[0] - e1;
-    dest[dest_stride * 0] =
-        highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
-    dest[dest_stride * 1] =
-        highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
-    dest[dest_stride * 2] =
-        highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
-    dest[dest_stride * 3] =
-        highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
-    ip++;
-    dest++;
-  }
-}
diff --git a/third_party/aom/aom_dsp/inv_txfm.h b/third_party/aom/aom_dsp/inv_txfm.h
deleted file mode 100644
index 644a6599f..000000000
--- a/third_party/aom/aom_dsp/inv_txfm.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_INV_TXFM_H_
-#define AOM_DSP_INV_TXFM_H_
-
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
-  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-}
-
-static INLINE tran_high_t check_range(tran_high_t input, int bd) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-  // For valid AV1 input streams, intermediate stage coefficients should always
-  // stay within the range of a signed 16 bit integer. Coefficients can go out
-  // of this range for invalid/corrupt AV1 streams. However, strictly checking
-  // this range for every intermediate coefficient can burdensome for a decoder,
-  // therefore the following assertion is only enabled when configured with
-  // --enable-coefficient-range-checking.
-  // For valid highbitdepth AV1 streams, intermediate stage coefficients will
-  // stay within the ranges:
-  // - 8 bit: signed 16 bit integer
-  // - 10 bit: signed 18 bit integer
-  // - 12 bit: signed 20 bit integer
-  const int32_t int_max = (1 << (7 + bd)) - 1;
-  const int32_t int_min = -int_max - 1;
-  assert(int_min <= input);
-  assert(input <= int_max);
-  (void)int_min;
-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  (void)bd;
-  return input;
-}
-
-#define WRAPLOW(x) ((int32_t)check_range(x, 8))
-#define HIGHBD_WRAPLOW(x, bd) ((int32_t)check_range((x), bd))
-
-#if CONFIG_MRC_TX
-// These each perform dct but add coefficients based on a mask
-void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
-                              int stride, uint8_t *mask);
-
-void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                             uint8_t *mask);
-
-void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                            uint8_t *mask);
-#endif  // CONFIG_MRC_TX
-
-void aom_idct4_c(const tran_low_t *input, tran_low_t *output);
-void aom_idct8_c(const tran_low_t *input, tran_low_t *output);
-void aom_idct16_c(const tran_low_t *input, tran_low_t *output);
-void aom_idct32_c(const tran_low_t *input, tran_low_t *output);
-#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64
-void aom_idct64_c(const tran_low_t *input, tran_low_t *output);
-#endif
-void aom_iadst4_c(const tran_low_t *input, tran_low_t *output);
-void aom_iadst8_c(const tran_low_t *input, tran_low_t *output);
-void aom_iadst16_c(const tran_low_t *input, tran_low_t *output);
-
-void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
-void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void aom_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
-                                             int bd) {
-  trans = HIGHBD_WRAPLOW(trans, bd);
-  return clip_pixel_highbd(dest + (int)trans, bd);
-}
-
-static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
-  trans = WRAPLOW(trans);
-  return clip_pixel(dest + (int)trans);
-}
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_DSP_INV_TXFM_H_
diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c
index 69f131378..a3f261824 100644
--- a/third_party/aom/aom_dsp/loopfilter.c
+++ b/third_party/aom/aom_dsp/loopfilter.c
@@ -11,8 +11,9 @@
 
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 
@@ -20,18 +21,6 @@ static INLINE int8_t signed_char_clamp(int t) {
   return (int8_t)clamp(t, -128, 127);
 }
 
-#define PARALLEL_DEBLOCKING_11_TAP 0
-#define PARALLEL_DEBLOCKING_9_TAP 0
-
-#if CONFIG_DEBLOCK_13TAP
-#define PARALLEL_DEBLOCKING_13_TAP 1
-#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 1
-#else
-#define PARALLEL_DEBLOCKING_13_TAP 0
-#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 0
-#endif
-
-#if CONFIG_HIGHBITDEPTH
 static INLINE int16_t signed_char_clamp_high(int t, int bd) {
   switch (bd) {
     case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
@@ -40,8 +29,7 @@ static INLINE int16_t signed_char_clamp_high(int t, int bd) {
     default: return (int16_t)clamp(t, -128, 128 - 1);
   }
 }
-#endif
-#if CONFIG_PARALLEL_DEBLOCKING
+
 // should we apply any filter at all: 11111111 yes, 00000000 no
 static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
                                   uint8_t p0, uint8_t q0, uint8_t q1) {
@@ -51,7 +39,7 @@ static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
   return ~mask;
 }
-#endif  // CONFIG_PARALLEL_DEBLOCKING
+
 static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                                  uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                  uint8_t q1, uint8_t q2, uint8_t q3) {
@@ -66,7 +54,18 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
   return ~mask;
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+                                         uint8_t p2, uint8_t p1, uint8_t p0,
+                                         uint8_t q0, uint8_t q1, uint8_t q2) {
+  int8_t mask = 0;
+  mask |= (abs(p2 - p1) > limit) * -1;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(q2 - q1) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  return ~mask;
+}
+
 static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
                                        uint8_t p0, uint8_t q0, uint8_t q1,
                                        uint8_t q2) {
@@ -77,7 +76,6 @@ static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
   mask |= (abs(q2 - q0) > thresh) * -1;
   return ~mask;
 }
-#endif
 
 static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
                                 uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
@@ -92,39 +90,6 @@ static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
   return ~mask;
 }
 
-#if PARALLEL_DEBLOCKING_9_TAP
-static INLINE int8_t flat_mask2(uint8_t thresh, uint8_t p4, uint8_t p0,
-                                uint8_t q0, uint8_t q4) {
-  int8_t mask = 0;
-  mask |= (abs(p4 - p0) > thresh) * -1;
-  mask |= (abs(q4 - q0) > thresh) * -1;
-  return ~mask;
-}
-#endif
-
-#if PARALLEL_DEBLOCKING_11_TAP
-static INLINE int8_t flat_mask3(uint8_t thresh, uint8_t p5, uint8_t p4,
-                                uint8_t p0, uint8_t q0, uint8_t q4,
-                                uint8_t q5) {
-  int8_t mask = 0;
-  mask |= (abs(p4 - p0) > thresh) * -1;
-  mask |= (abs(q4 - q0) > thresh) * -1;
-  mask |= (abs(p5 - p0) > thresh) * -1;
-  mask |= (abs(q5 - q0) > thresh) * -1;
-  return ~mask;
-}
-#endif
-
-static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
-                                uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
-                                uint8_t q1, uint8_t q2, uint8_t q3,
-                                uint8_t q4) {
-  int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
-  mask |= (abs(p4 - p0) > thresh) * -1;
-  mask |= (abs(q4 - q0) > thresh) * -1;
-  return ~mask;
-}
-
 // is there high edge variance internal edge: 11111111 yes, 00000000 no
 static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
                               uint8_t q0, uint8_t q1) {
@@ -170,25 +135,14 @@ void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                             const uint8_t *blimit, const uint8_t *limit,
                             const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-#if !CONFIG_PARALLEL_DEBLOCKING
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-#else   // CONFIG_PARALLEL_DEBLOCKING
     const uint8_t p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p], q1 = s[1 * p];
     const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
     filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
     ++s;
   }
@@ -199,31 +153,20 @@ void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
   aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
+  aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1);
 }
 
 void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-#if !CONFIG_PARALLEL_DEBLOCKING
-    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-#else   // CONFIG_PARALLEL_DEBLOCKING
     const uint8_t p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0], q1 = s[1];
     const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
     filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
     s += pitch;
   }
@@ -234,10 +177,9 @@ void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
   aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
-  aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
+  aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
                            uint8_t *op2, uint8_t *op1, uint8_t *op0,
                            uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
@@ -254,7 +196,6 @@ static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
     filter4(mask, thresh, op1, op0, oq0, oq1);
   }
 }
-#endif
 
 static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
                            uint8_t *op3, uint8_t *op2, uint8_t *op1,
@@ -276,40 +217,38 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
   }
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
 
     const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+        filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
     const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
     filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
             s + 2 * p);
     ++s;
   }
 }
-#endif
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0, const uint8_t *thresh0,
+                                 const uint8_t *blimit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1);
+}
 
 void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
@@ -331,39 +270,37 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
   aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
+  aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1);
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   for (i = 0; i < count; ++i) {
-    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2];
     const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+        filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
     const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
     filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2);
     s += pitch;
   }
 }
-#endif
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1) {
+  aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
 
 void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   for (i = 0; i < count; ++i) {
     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
@@ -382,10 +319,9 @@ void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
   aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
-  aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
+  aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
-#if PARALLEL_DEBLOCKING_13_TAP
 static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
                             int8_t flat2, uint8_t *op6, uint8_t *op5,
                             uint8_t *op4, uint8_t *op3, uint8_t *op2,
@@ -433,186 +369,43 @@ static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
     filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
   }
 }
-#endif
-
-#if PARALLEL_DEBLOCKING_11_TAP
-static INLINE void filter12(int8_t mask, uint8_t thresh, int8_t flat,
-                            int8_t flat2, uint8_t *op5, uint8_t *op4,
-                            uint8_t *op3, uint8_t *op2, uint8_t *op1,
-                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
-                            uint8_t *oq2, uint8_t *oq3, uint8_t *oq4,
-                            uint8_t *oq5) {
-  if (flat2 && flat && mask) {
-    const uint8_t p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1,
-                  p0 = *op0;
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
-                  q5 = *oq5;
-
-    // 11-tap filter [1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1]
-    *op4 = (p5 * 5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 6) / 12;
-    *op3 = (p5 * 4 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + 6) / 12;
-    *op2 = (p5 * 3 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + 6) / 12;
-    *op1 = (p5 * 2 + p4 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 + 6) / 12;
-    *op0 = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + q4 + 6) / 12;
-    *oq0 = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + 6) / 12;
-    *oq1 = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 + q5 * 2 + 6) / 12;
-    *oq2 = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 * 3 + 6) / 12;
-    *oq3 = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 * 4 + 6) / 12;
-    *oq4 = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 5 + 6) / 12;
-  } else {
-    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
-  }
-}
-#endif
-
-#if PARALLEL_DEBLOCKING_9_TAP
-static INLINE void filter10(int8_t mask, uint8_t thresh, int8_t flat,
-                            int8_t flat2, uint8_t *op4, uint8_t *op3,
-                            uint8_t *op2, uint8_t *op1, uint8_t *op0,
-                            uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
-                            uint8_t *oq3, uint8_t *oq4) {
-  if (flat2 && flat && mask) {
-    const uint8_t p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4;
-
-    // 9-tap filter [1, 1, 1, 1, 2, 1, 1, 1, 1]
-    *op3 = (p4 * 4 + p3 * 2 + p2 + p1 + p0 + q0 + 5) / 10;
-    *op2 = (p4 * 3 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + 5) / 10;
-    *op1 = (p4 * 2 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + 5) / 10;
-    *op0 = (p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + 5) / 10;
-    *oq0 = (p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + 5) / 10;
-    *oq1 = (p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 * 2 + 5) / 10;
-    *oq2 = (p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 * 3 + 5) / 10;
-    *oq3 = (p0 + q0 + q1 + q2 + q3 * 2 + q4 * 4 + 5) / 10;
-  } else {
-    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
-  }
-}
-#endif
-
-static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat,
-                            int8_t flat2, uint8_t *op7, uint8_t *op6,
-                            uint8_t *op5, uint8_t *op4, uint8_t *op3,
-                            uint8_t *op2, uint8_t *op1, uint8_t *op0,
-                            uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
-                            uint8_t *oq3, uint8_t *oq4, uint8_t *oq5,
-                            uint8_t *oq6, uint8_t *oq7) {
-  if (flat2 && flat && mask) {
-    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3,
-                  p2 = *op2, p1 = *op1, p0 = *op0;
-
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
-                  q5 = *oq5, q6 = *oq6, q7 = *oq7;
-
-    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
-    *op6 = ROUND_POWER_OF_TWO(
-        p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
-    *op5 = ROUND_POWER_OF_TWO(
-        p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
-    *op4 = ROUND_POWER_OF_TWO(
-        p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
-    *op3 = ROUND_POWER_OF_TWO(
-        p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
-    *op2 = ROUND_POWER_OF_TWO(
-        p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
-        4);
-    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
-                                  q0 + q1 + q2 + q3 + q4 + q5,
-                              4);
-    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
-                                  q1 + q2 + q3 + q4 + q5 + q6,
-                              4);
-    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
-                                  q2 + q3 + q4 + q5 + q6 + q7,
-                              4);
-    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
-                                  q3 + q4 + q5 + q6 + q7 * 2,
-                              4);
-    *oq2 = ROUND_POWER_OF_TWO(
-        p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
-        4);
-    *oq3 = ROUND_POWER_OF_TWO(
-        p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
-    *oq4 = ROUND_POWER_OF_TWO(
-        p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
-    *oq5 = ROUND_POWER_OF_TWO(
-        p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
-    *oq6 = ROUND_POWER_OF_TWO(
-        p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
-  } else {
-    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
-  }
-}
 
 static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int count) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int step = 4;
-#else
-  int step = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < step * count; ++i) {
-    const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p],
-                  p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p],
-                  p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p],
+                  p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p],
-                  q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p], q7 = s[7 * p];
+                  q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-
-#if PARALLEL_DEBLOCKING_13_TAP
-    (void)p7;
-    (void)q7;
     const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
 
     filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
              s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
              s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p);
-
-#elif PARALLEL_DEBLOCKING_11_TAP
-    const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);
-
-    filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p,
-             s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p,
-             s + 3 * p, s + 4 * p, s + 5 * p);
-
-#elif PARALLEL_DEBLOCKING_9_TAP
-    const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4);
-
-    filter10(mask, *thresh, flat, flat2, s - 5 * p, s - 4 * p, s - 3 * p,
-             s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p,
-             s + 4 * p);
-#else
-    const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
-
-    filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
-             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-             s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
-             s + 7 * p);
-#endif
-
     ++s;
   }
 }
 
-void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,
-                                 const uint8_t *limit, const uint8_t *thresh) {
+void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
   mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
 }
 
-void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                                  const uint8_t *limit, const uint8_t *thresh) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
-#else
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
-#endif
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
+  mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1);
 }
 
 static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
@@ -621,60 +414,34 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
   int i;
 
   for (i = 0; i < count; ++i) {
-    const uint8_t p7 = s[-8], p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4],
-                  p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3],
+                  p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4],
-                  q5 = s[5], q6 = s[6], q7 = s[7];
+                  q5 = s[5], q6 = s[6];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-
-#if PARALLEL_DEBLOCKING_13_TAP
-    (void)p7;
-    (void)q7;
     const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
 
     filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3,
              s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6);
-#elif PARALLEL_DEBLOCKING_11_TAP
-    const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);
-
-    filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2,
-             s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5);
-#elif PARALLEL_DEBLOCKING_9_TAP
-    const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4);
-
-    filter10(mask, *thresh, flat, flat2, s - 5, s - 4, s - 3, s - 2, s - 1, s,
-             s + 1, s + 2, s + 3, s + 4);
-
-#else
-    const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
-
-    filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
-             s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
-             s + 7);
-#endif
-
     s += p;
   }
 }
 
-void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
-#else
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
-#endif
 }
 
-void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
-                                const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                const uint8_t *limit0, const uint8_t *thresh0,
+                                const uint8_t *blimit1, const uint8_t *limit1,
+                                const uint8_t *thresh1) {
+  mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
+  mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4);
 }
 
-#if CONFIG_HIGHBITDEPTH
-#if CONFIG_PARALLEL_DEBLOCKING
 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
 static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
                                          uint16_t p1, uint16_t p0, uint16_t q0,
@@ -687,7 +454,6 @@ static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
   return ~mask;
 }
-#endif  // CONFIG_PARALLEL_DEBLOCKING
 
 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
 static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
@@ -707,7 +473,22 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
   return ~mask;
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+                                                uint16_t p2, uint16_t p1,
+                                                uint16_t p0, uint16_t q0,
+                                                uint16_t q1, uint16_t q2,
+                                                int bd) {
+  int8_t mask = 0;
+  int16_t limit16 = (uint16_t)limit << (bd - 8);
+  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+  mask |= (abs(p2 - p1) > limit16) * -1;
+  mask |= (abs(p1 - p0) > limit16) * -1;
+  mask |= (abs(q1 - q0) > limit16) * -1;
+  mask |= (abs(q2 - q1) > limit16) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+  return ~mask;
+}
+
 static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
                                               uint16_t p1, uint16_t p0,
                                               uint16_t q0, uint16_t q1,
@@ -720,7 +501,6 @@ static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
   mask |= (abs(q2 - q0) > thresh16) * -1;
   return ~mask;
 }
-#endif
 
 static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
                                        uint16_t p1, uint16_t p0, uint16_t q0,
@@ -737,17 +517,6 @@ static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
   return ~mask;
 }
 
-static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3,
-                                       uint16_t p2, uint16_t p1, uint16_t p0,
-                                       uint16_t q0, uint16_t q1, uint16_t q2,
-                                       uint16_t q3, uint16_t q4, int bd) {
-  int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
-  mask |= (abs(p4 - p0) > thresh16) * -1;
-  mask |= (abs(q4 - q0) > thresh16) * -1;
-  return ~mask;
-}
-
 // Is there high edge variance internal edge:
 // 11111111_11111111 yes, 00000000_00000000 no ?
 static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
@@ -798,34 +567,17 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-#if !CONFIG_PARALLEL_DEBLOCKING
-    const uint16_t p3 = s[-4 * p];
-    const uint16_t p2 = s[-3 * p];
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const uint16_t q2 = s[2 * p];
-    const uint16_t q3 = s[3 * p];
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-#else   // CONFIG_PARALLEL_DEBLOCKING
     const uint16_t p1 = s[-2 * p];
     const uint16_t p0 = s[-p];
     const uint16_t q0 = s[0 * p];
     const uint16_t q1 = s[1 * p];
     const int8_t mask =
         highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
     highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
     ++s;
   }
@@ -836,33 +588,22 @@ void aom_highbd_lpf_horizontal_4_dual_c(
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
+  aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd);
 }
 
 void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-#if !CONFIG_PARALLEL_DEBLOCKING
-    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-#else   // CONFIG_PARALLEL_DEBLOCKING
     const uint16_t p1 = s[-2], p0 = s[-1];
     const uint16_t q0 = s[0], q1 = s[1];
     const int8_t mask =
         highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
     highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
     s += pitch;
   }
@@ -873,11 +614,10 @@ void aom_highbd_lpf_vertical_4_dual_c(
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
+  aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
                               bd);
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
                                   uint16_t *op2, uint16_t *op1, uint16_t *op0,
                                   uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
@@ -895,7 +635,6 @@ static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
     highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
   }
 }
-#endif
 
 static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
                                   uint16_t *op3, uint16_t *op2, uint16_t *op1,
@@ -921,11 +660,7 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
@@ -943,74 +678,75 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
   }
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
 
     const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+        highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
     const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
     highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s,
                    s + 1 * p, s + 2 * p, bd);
     ++s;
   }
 }
-#endif
+
+void aom_highbd_lpf_horizontal_6_dual_c(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd);
+}
 
 void aom_highbd_lpf_horizontal_8_dual_c(
     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
+  aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd);
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   for (i = 0; i < count; ++i) {
-    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2];
     const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+        highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
     const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
     highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2,
                    bd);
     s += pitch;
   }
 }
-#endif
+
+void aom_highbd_lpf_vertical_6_dual_c(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+                              bd);
+}
 
 void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   for (i = 0; i < count; ++i) {
     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
@@ -1030,11 +766,10 @@ void aom_highbd_lpf_vertical_8_dual_c(
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
+  aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
                               bd);
 }
 
-#if PARALLEL_DEBLOCKING_13_TAP
 static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
                                    int8_t flat2, uint16_t *op6, uint16_t *op5,
                                    uint16_t *op4, uint16_t *op3, uint16_t *op2,
@@ -1094,73 +829,6 @@ static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
                    bd);
   }
 }
-#endif
-
-static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat,
-                                   int8_t flat2, uint16_t *op7, uint16_t *op6,
-                                   uint16_t *op5, uint16_t *op4, uint16_t *op3,
-                                   uint16_t *op2, uint16_t *op1, uint16_t *op0,
-                                   uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
-                                   uint16_t *oq3, uint16_t *oq4, uint16_t *oq5,
-                                   uint16_t *oq6, uint16_t *oq7, int bd) {
-  if (flat2 && flat && mask) {
-    const uint16_t p7 = *op7;
-    const uint16_t p6 = *op6;
-    const uint16_t p5 = *op5;
-    const uint16_t p4 = *op4;
-    const uint16_t p3 = *op3;
-    const uint16_t p2 = *op2;
-    const uint16_t p1 = *op1;
-    const uint16_t p0 = *op0;
-    const uint16_t q0 = *oq0;
-    const uint16_t q1 = *oq1;
-    const uint16_t q2 = *oq2;
-    const uint16_t q3 = *oq3;
-    const uint16_t q4 = *oq4;
-    const uint16_t q5 = *oq5;
-    const uint16_t q6 = *oq6;
-    const uint16_t q7 = *oq7;
-
-    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
-    *op6 = ROUND_POWER_OF_TWO(
-        p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
-    *op5 = ROUND_POWER_OF_TWO(
-        p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
-    *op4 = ROUND_POWER_OF_TWO(
-        p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
-    *op3 = ROUND_POWER_OF_TWO(
-        p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
-    *op2 = ROUND_POWER_OF_TWO(
-        p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
-        4);
-    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
-                                  q0 + q1 + q2 + q3 + q4 + q5,
-                              4);
-    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
-                                  q1 + q2 + q3 + q4 + q5 + q6,
-                              4);
-    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
-                                  q2 + q3 + q4 + q5 + q6 + q7,
-                              4);
-    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
-                                  q3 + q4 + q5 + q6 + q7 * 2,
-                              4);
-    *oq2 = ROUND_POWER_OF_TWO(
-        p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
-        4);
-    *oq3 = ROUND_POWER_OF_TWO(
-        p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
-    *oq4 = ROUND_POWER_OF_TWO(
-        p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
-    *oq5 = ROUND_POWER_OF_TWO(
-        p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
-    *oq6 = ROUND_POWER_OF_TWO(
-        p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
-  } else {
-    highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
-                   bd);
-  }
-}
 
 static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
                                             const uint8_t *blimit,
@@ -1168,11 +836,7 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
                                             const uint8_t *thresh, int count,
                                             int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int step = 4;
-#else
-  int step = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
@@ -1190,7 +854,6 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
     const int8_t flat =
         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
 
-#if PARALLEL_DEBLOCKING_13_TAP
     const int8_t flat2 =
         highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p],
                           s[5 * p], s[6 * p], bd);
@@ -1198,36 +861,22 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
     highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
                     s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
                     s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd);
-#else
-    const int8_t flat2 =
-        highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
-                          s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
-
-    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
-                    s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-                    s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
-                    s + 6 * p, s + 7 * p, bd);
-#endif
     ++s;
   }
 }
 
-void aom_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p,
-                                        const uint8_t *blimit,
-                                        const uint8_t *limit,
-                                        const uint8_t *thresh, int bd) {
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int p, const uint8_t *blimit,
+                                    const uint8_t *limit, const uint8_t *thresh,
+                                    int bd) {
   highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
 }
 
-void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
-                                         const uint8_t *blimit,
-                                         const uint8_t *limit,
-                                         const uint8_t *thresh, int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
-#else
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
-#endif
+void aom_highbd_lpf_horizontal_14_dual_c(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd);
+  highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd);
 }
 
 static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
@@ -1250,43 +899,27 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     const int8_t flat =
         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-#if PARALLEL_DEBLOCKING_13_TAP
     const int8_t flat2 =
         highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd);
 
     highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
                     s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
                     s + 6, bd);
-#else
-    const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
-                                           q0, s[4], s[5], s[6], s[7], bd);
-
-    highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
-                    s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
-                    s + 5, s + 6, s + 7, bd);
-#endif
     s += p;
   }
 }
 
-void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
-#else
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
-#endif
 }
 
-void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
-                                       const uint8_t *blimit,
-                                       const uint8_t *limit,
-                                       const uint8_t *thresh, int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
-#else
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
-#endif
+void aom_highbd_lpf_vertical_14_dual_c(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd);
+  highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+                                4, bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c
index 4c6e201e1..96d04cff0 100644
--- a/third_party/aom/aom_dsp/mips/add_noise_msa.c
+++ b/third_party/aom/aom_dsp/mips/add_noise_msa.c
@@ -10,7 +10,8 @@
  */
 
 #include <stdlib.h>
-#include "./macros_msa.h"
+
+#include "aom_dsp/mips/macros_msa.h"
 
 void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
                              char blackclamp[16], char whiteclamp[16],
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c
deleted file mode 100644
index 847394a3d..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c
+++ /dev/null
@@ -1,704 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 dst0, dst1, dst2, dst3, res2, res3;
-  v16u8 mask0, mask1, mask2, mask3;
-  v8i16 filt, res0, res1;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[16]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  XORI_B4_128_SB(src0, src1, src2, src3);
-  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
-                             filt0, filt1, filt2, filt3, res0, res1);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  SRARI_H2_SH(res0, res1, FILTER_BITS);
-  SAT_SH2_SH(res0, res1, 7);
-  PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-  XORI_B2_128_UB(res2, res3);
-  AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v8i16 filt, vec0, vec1, vec2, vec3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[16]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  XORI_B4_128_SB(src0, src1, src2, src3);
-  src += (4 * src_stride);
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
-                             filt0, filt1, filt2, filt3, vec0, vec1);
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  XORI_B4_128_SB(src0, src1, src2, src3);
-  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
-                             filt0, filt1, filt2, filt3, vec2, vec3);
-  SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
-  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
-              res3);
-  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
-  XORI_B2_128_UB(res0, res2);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
-             dst6);
-  ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
-  AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
-  ST4x8_UB(res0, res2, dst, dst_stride);
-}
-
-static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  if (4 == height) {
-    common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else if (8 == height) {
-    common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
-  }
-}
-
-static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  int32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
-  v8i16 filt, out0, out1, out2, out3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    src += (4 * src_stride);
-    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
-                               mask3, filt0, filt1, filt2, filt3, out0, out1,
-                               out2, out3);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
-                            dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  int32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
-  v8i16 filt, out0, out1, out2, out3;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = height >> 1; loop_cnt--;) {
-    LD_SB2(src, src_stride, src0, src2);
-    LD_SB2(src + 8, src_stride, src1, src3);
-    src += (2 * src_stride);
-
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
-    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
-    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
-               vec14);
-    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
-               vec15);
-    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
-                vec9, vec10, vec11);
-    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
-                 vec2, vec3);
-    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
-                 vec9, vec10, vec11);
-    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
-                out2, out3);
-    LD_UB2(dst, dst_stride, dst0, dst1);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
-    dst += dst_stride;
-    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
-  v8i16 filt, out0, out1, out2, out3;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = height; loop_cnt--;) {
-    src0 = LD_SB(src);
-    src2 = LD_SB(src + 16);
-    src3 = LD_SB(src + 24);
-    src1 = __msa_sldi_b(src2, src0, 8);
-    src += src_stride;
-
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
-    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
-    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
-               vec14);
-    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
-               vec15);
-    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
-                vec9, vec10, vec11);
-    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
-                 vec2, vec3);
-    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
-                 vec9, vec10, vec11);
-    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
-                out2, out3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    LD_UB2(dst, 16, dst1, dst2);
-    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
-    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt, cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
-  v8i16 filt, out0, out1, out2, out3;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = height; loop_cnt--;) {
-    for (cnt = 0; cnt < 2; ++cnt) {
-      src0 = LD_SB(&src[cnt << 5]);
-      src2 = LD_SB(&src[16 + (cnt << 5)]);
-      src3 = LD_SB(&src[24 + (cnt << 5)]);
-      src1 = __msa_sldi_b(src2, src0, 8);
-
-      XORI_B4_128_SB(src0, src1, src2, src3);
-      VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
-                 vec12);
-      VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
-                 vec13);
-      VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
-                 vec14);
-      VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
-                 vec15);
-      DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
-                  vec1, vec2, vec3);
-      DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
-                  vec9, vec10, vec11);
-      DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
-                   vec1, vec2, vec3);
-      DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
-                   vec9, vec10, vec11);
-      ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
-                  out2, out3);
-      SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-      SAT_SH4_SH(out0, out1, out2, out3, 7);
-      LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
-      PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
-      PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
-    }
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
-  v8u16 vec2, vec3, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
-  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
-  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
-  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v8u16 vec4, vec5, vec6, vec7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
-  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
-              vec6, vec7);
-  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
-  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
-              res3);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
-             dst6);
-  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
-              res3);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  if (4 == height) {
-    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else if (8 == height) {
-    common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
-  }
-}
-
-static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
-  v8u16 vec0, vec1, vec2, vec3, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-              vec2, vec3);
-  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                     dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_8x8mult_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter, int32_t height) {
-  v16i8 src0, src1, src2, src3, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
-  v8u16 vec0, vec1, vec2, vec3, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  src += (4 * src_stride);
-  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-              vec2, vec3);
-  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  src += (4 * src_stride);
-  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                     dst_stride);
-  dst += (4 * dst_stride);
-
-  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-              vec2, vec3);
-  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                     dst_stride);
-  dst += (4 * dst_stride);
-
-  if (16 == height) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                       dst_stride);
-    dst += (4 * dst_stride);
-
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                       dst_stride);
-  }
-}
-
-static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  if (4 == height) {
-    common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else {
-    common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
-                                          filter, height);
-  }
-}
-
-static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src2, src4, src6);
-  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-  src += (4 * src_stride);
-
-  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
-              res2, res3);
-  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
-              res6, res7);
-  SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
-  SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
-  dst += dst_stride;
-  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
-  dst += dst_stride;
-  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
-  dst += dst_stride;
-  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
-  dst += dst_stride;
-
-  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src2, src4, src6);
-    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-
-    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
-                res2, res3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
-                res6, res7);
-    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
-    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
-    dst += dst_stride;
-    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
-    dst += dst_stride;
-    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
-    dst += dst_stride;
-    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  for (loop_cnt = (height >> 1); loop_cnt--;) {
-    src0 = LD_SB(src);
-    src2 = LD_SB(src + 16);
-    src3 = LD_SB(src + 24);
-    src1 = __msa_sldi_b(src2, src0, 8);
-    src += src_stride;
-    src4 = LD_SB(src);
-    src6 = LD_SB(src + 16);
-    src7 = LD_SB(src + 24);
-    src5 = __msa_sldi_b(src6, src4, 8);
-    src += src_stride;
-
-    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
-                res2, res3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
-                res6, res7);
-    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
-    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
-    LD_UB2(dst, 16, dst0, dst1);
-    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
-    PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
-    dst += dst_stride;
-    LD_UB2(dst, 16, dst2, dst3);
-    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
-    PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  for (loop_cnt = height; loop_cnt--;) {
-    LD_SB4(src, 16, src0, src2, src4, src6);
-    src7 = LD_SB(src + 56);
-    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
-    src += src_stride;
-
-    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
-                out2, out3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
-                out6, out7);
-    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
-    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
-    PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
-    PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
-    PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
-    PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
-    dst += dst_stride;
-  }
-}
-
-void aom_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4, int w,
-                                 int h) {
-  int8_t cnt, filt_hor[8];
-
-  assert(x_step_q4 == 16);
-  assert(((const int32_t *)filter_x)[1] != 0x800000);
-
-  for (cnt = 0; cnt < 8; ++cnt) {
-    filt_hor[cnt] = filter_x[cnt];
-  }
-
-  if (((const int32_t *)filter_x)[0] == 0) {
-    switch (w) {
-      case 4:
-        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, &filt_hor[3], h);
-        break;
-      case 8:
-        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, &filt_hor[3], h);
-        break;
-      case 16:
-        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_hor[3], h);
-        break;
-      case 32:
-        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_hor[3], h);
-        break;
-      case 64:
-        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_hor[3], h);
-        break;
-      default:
-        aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, filt_hor, h);
-        break;
-      case 8:
-        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, filt_hor, h);
-        break;
-      case 16:
-        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_hor, h);
-        break;
-      case 32:
-        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_hor, h);
-        break;
-      case 64:
-        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_hor, h);
-        break;
-      default:
-        aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c
deleted file mode 100644
index bed600d5b..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c
+++ /dev/null
@@ -1,605 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
-  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
-  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
-  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[16]);
-  src -= (3 + 3 * src_stride);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
-
-  filt = LD_SH(filter_vert);
-  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
-  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    XORI_B4_128_SB(src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
-    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
-    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
-    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
-    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-
-    SRARI_H2_SH(res0, res1, FILTER_BITS);
-    SAT_SH2_SH(res0, res1, 7);
-    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
-    XORI_B2_128_UB(tmp0, tmp1);
-    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
-    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    hz_out5 = hz_out9;
-    vec0 = vec2;
-    vec1 = vec3;
-    vec2 = vec4;
-  }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
-  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
-  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
-  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= (3 + 3 * src_stride);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-
-  filt = LD_SH(filter_vert);
-  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
-  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
-  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
-  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    XORI_B4_128_SB(src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
-    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
-    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
-    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
-                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
-    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
-    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
-                            dst_stride);
-    dst += (4 * dst_stride);
-
-    hz_out6 = hz_out10;
-    out0 = out2;
-    out1 = out3;
-    out2 = out8;
-    out4 = out6;
-    out5 = out7;
-    out6 = out9;
-  }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 2; multiple8_cnt--;) {
-    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
-                                          filter_horiz, filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 4; multiple8_cnt--;) {
-    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
-                                          filter_horiz, filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 8; multiple8_cnt--;) {
-    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
-                                          filter_horiz, filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_hz, filt_vt, vec0, vec1;
-  v16u8 dst0, dst1, dst2, dst3, res0, res1;
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  filt = LD_UH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
-  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
-  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
-  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
-  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  src += (8 * src_stride);
-  src8 = LD_SB(src);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
-  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
-  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
-  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
-             hz_out3, hz_out5, 8);
-  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
-
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
-             dst6);
-  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
-              tmp1, tmp2, tmp3);
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
-              res3);
-  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
-              res3);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  if (4 == height) {
-    common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
-                                           filter_horiz, filter_vert);
-  } else if (8 == height) {
-    common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
-                                           filter_horiz, filter_vert);
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
-  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-  src += (5 * src_stride);
-
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
-
-  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
-
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
-                     dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
-  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  src0 = LD_SB(src);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
-                       dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  if (4 == height) {
-    common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
-                                           filter_horiz, filter_vert);
-  } else {
-    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
-        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB2(src, 8, src0, src1);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src2, src4, src6);
-    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-    dst += dst_stride;
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
-    dst += dst_stride;
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
-    dst += dst_stride;
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
-    dst += dst_stride;
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 2; multiple8_cnt--;) {
-    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
-                                           filter_horiz, filter_vert, height);
-    src += 16;
-    dst += 16;
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 4; multiple8_cnt--;) {
-    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
-                                           filter_horiz, filter_vert, height);
-    src += 16;
-    dst += 16;
-  }
-}
-
-void aom_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
-  int8_t cnt, filt_hor[8], filt_ver[8];
-
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_x)[1] != 0x800000);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  for (cnt = 0; cnt < 8; ++cnt) {
-    filt_hor[cnt] = filter_x[cnt];
-    filt_ver[cnt] = filter_y[cnt];
-  }
-
-  if (((const int32_t *)filter_x)[0] == 0 &&
-      ((const int32_t *)filter_y)[0] == 0) {
-    switch (w) {
-      case 4:
-        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                              (int32_t)dst_stride, &filt_hor[3],
-                                              &filt_ver[3], h);
-        break;
-      case 8:
-        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                              (int32_t)dst_stride, &filt_hor[3],
-                                              &filt_ver[3], h);
-        break;
-      case 16:
-        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride,
-                                               &filt_hor[3], &filt_ver[3], h);
-        break;
-      case 32:
-        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride,
-                                               &filt_hor[3], &filt_ver[3], h);
-        break;
-      case 64:
-        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride,
-                                               &filt_hor[3], &filt_ver[3], h);
-        break;
-      default:
-        aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  } else if (((const int32_t *)filter_x)[0] == 0 ||
-             ((const int32_t *)filter_y)[0] == 0) {
-    aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
-  } else {
-    switch (w) {
-      case 4:
-        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                              (int32_t)dst_stride, filt_hor,
-                                              filt_ver, h);
-        break;
-      case 8:
-        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                              (int32_t)dst_stride, filt_hor,
-                                              filt_ver, h);
-        break;
-      case 16:
-        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride, filt_hor,
-                                               filt_ver, h);
-        break;
-      case 32:
-        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride, filt_hor,
-                                               filt_ver, h);
-        break;
-      case 64:
-        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride, filt_hor,
-                                               filt_ver, h);
-        break;
-      default:
-        aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c
deleted file mode 100644
index dae771104..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c
+++ /dev/null
@@ -1,677 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16u8 dst0, dst1, dst2, dst3, out;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
-  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
-  v16i8 src10998, filt0, filt1, filt2, filt3;
-  v8i16 filt, out10, out32;
-
-  src -= (3 * src_stride);
-
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
-             src54_r, src21_r);
-  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
-             src4332, src6554);
-  XORI_B3_128_SB(src2110, src4332, src6554);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
-               src87_r, src98_r, src109_r);
-    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
-    XORI_B2_128_SB(src8776, src10998);
-    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
-                                filt1, filt2, filt3);
-    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
-                                filt1, filt2, filt3);
-    SRARI_H2_SH(out10, out32, FILTER_BITS);
-    SAT_SH2_SH(out10, out32, 7);
-    out = PCKEV_XORI128_UB(out10, out32);
-    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-
-    dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
-    out = __msa_aver_u_b(out, dst0);
-
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    src2110 = src6554;
-    src4332 = src8776;
-    src6554 = src10998;
-    src6 = src10;
-  }
-}
-
-static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16u8 dst0, dst1, dst2, dst3;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
-  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
-  v8i16 filt, out0, out1, out2, out3;
-
-  src -= (3 * src_stride);
-
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
-             src54_r, src21_r);
-  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    XORI_B4_128_SB(src7, src8, src9, src10);
-    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
-               src87_r, src98_r, src109_r);
-    out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1,
-                               filt2, filt3);
-    out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1,
-                               filt2, filt3);
-    out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1,
-                               filt2, filt3);
-    out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
-                               filt1, filt2, filt3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
-                            dst_stride);
-    dst += (4 * dst_stride);
-
-    src10_r = src54_r;
-    src32_r = src76_r;
-    src54_r = src98_r;
-    src21_r = src65_r;
-    src43_r = src87_r;
-    src65_r = src109_r;
-    src6 = src10;
-  }
-}
-
-static void common_vt_8t_and_aver_dst_16w_mult_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter, int32_t height, int32_t width) {
-  const uint8_t *src_tmp;
-  uint8_t *dst_tmp;
-  uint32_t loop_cnt, cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
-  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
-  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
-  v16i8 filt0, filt1, filt2, filt3;
-  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
-  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
-
-  src -= (3 * src_stride);
-
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  for (cnt = (width >> 4); cnt--;) {
-    src_tmp = src;
-    dst_tmp = dst;
-
-    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
-    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-    src_tmp += (7 * src_stride);
-
-    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
-               src54_r, src21_r);
-    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
-               src54_l, src21_l);
-    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
-    for (loop_cnt = (height >> 2); loop_cnt--;) {
-      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
-      src_tmp += (4 * src_stride);
-
-      LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
-      XORI_B4_128_SB(src7, src8, src9, src10);
-      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
-                 src87_r, src98_r, src109_r);
-      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
-                 src87_l, src98_l, src109_l);
-      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
-                                   filt1, filt2, filt3);
-      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
-                                   filt1, filt2, filt3);
-      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
-                                   filt1, filt2, filt3);
-      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
-                                   filt1, filt2, filt3);
-      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
-                                   filt1, filt2, filt3);
-      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
-                                   filt1, filt2, filt3);
-      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
-                                   filt1, filt2, filt3);
-      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
-                                   filt1, filt2, filt3);
-      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
-      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
-      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
-      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
-      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
-                  out3_r, tmp0, tmp1, tmp2, tmp3);
-      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
-      AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
-                  dst2, dst3);
-      ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
-      dst_tmp += (4 * dst_stride);
-
-      src10_r = src54_r;
-      src32_r = src76_r;
-      src54_r = src98_r;
-      src21_r = src65_r;
-      src43_r = src87_r;
-      src65_r = src109_r;
-      src10_l = src54_l;
-      src32_l = src76_l;
-      src54_l = src98_l;
-      src21_l = src65_l;
-      src43_l = src87_l;
-      src65_l = src109_l;
-      src6 = src10;
-    }
-
-    src += 16;
-    dst += 16;
-  }
-}
-
-static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
-                                         filter, height, 16);
-}
-
-static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
-                                         filter, height, 32);
-}
-
-static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
-                                         filter, height, 64);
-}
-
-static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, src4;
-  v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
-  v16i8 src10_r, src32_r, src21_r, src43_r;
-  v8i16 filt;
-  v8u16 tmp0, tmp1;
-
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  src += (4 * src_stride);
-
-  src4 = LD_SB(src);
-  src += src_stride;
-
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
-  dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
-  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
-             src32_r, src43_r);
-  ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
-  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
-  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-
-  out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-  out = __msa_aver_u_b(out, dst0);
-
-  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
-  v16u8 src2110, src4332, src6554, src8776, filt0;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  src += (8 * src_stride);
-  src8 = LD_SB(src);
-
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2,
-             dst3);
-  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
-  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
-             src32_r, src43_r);
-  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
-             src76_r, src87_r);
-  ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
-             src76_r, src2110, src4332, src6554, src8776);
-  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
-              tmp0, tmp1, tmp2, tmp3);
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
-  AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
-  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  if (4 == height) {
-    common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else if (8 == height) {
-    common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
-  }
-}
-
-static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  /* rearranging filter_y */
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
-  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
-              tmp2, tmp3);
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
-                     dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_8x8mult_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-  v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  /* rearranging filter_y */
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 3); loop_cnt--;) {
-    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
-    src += (8 * src_stride);
-    LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
-
-    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
-               vec3);
-    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
-               vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
-                tmp2, tmp3);
-    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst,
-                       dst_stride);
-    dst += (4 * dst_stride);
-
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
-                tmp2, tmp3);
-    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst,
-                       dst_stride);
-    dst += (4 * dst_stride);
-
-    src0 = src8;
-  }
-}
-
-static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  if (4 == height) {
-    common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else {
-    common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
-                                          filter, height);
-  }
-}
-
-static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
-
-  /* rearranging filter_y */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-    dst += dst_stride;
-
-    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
-    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
-    dst += dst_stride;
-
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
-    dst += dst_stride;
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
-    dst += dst_stride;
-
-    src0 = src4;
-  }
-}
-
-static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
-  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
-
-  /* rearranging filter_y */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_UB2(src, 16, src0, src5);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-
-    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
-    LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
-    src += (4 * src_stride);
-
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
-
-    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
-    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
-
-    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
-    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
-
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
-
-    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
-    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
-    dst += (4 * dst_stride);
-
-    src0 = src4;
-    src5 = src9;
-  }
-}
-
-static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5;
-  v16u8 src6, src7, src8, src9, src10, src11, filt0;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  v8u16 filt;
-
-  /* rearranging filter_y */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_UB4(src, 16, src0, src3, src6, src9);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 1); loop_cnt--;) {
-    LD_UB2(src, src_stride, src1, src2);
-    LD_UB2(dst, dst_stride, dst0, dst1);
-    LD_UB2(src + 16, src_stride, src4, src5);
-    LD_UB2(dst + 16, dst_stride, dst2, dst3);
-    LD_UB2(src + 32, src_stride, src7, src8);
-    LD_UB2(dst + 32, dst_stride, dst4, dst5);
-    LD_UB2(src + 48, src_stride, src10, src11);
-    LD_UB2(dst + 48, dst_stride, dst6, dst7);
-    src += (2 * src_stride);
-
-    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
-
-    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
-    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
-    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
-    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
-
-    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
-    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
-
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
-
-    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
-    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
-    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
-    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
-    dst += (2 * dst_stride);
-
-    src0 = src2;
-    src3 = src5;
-    src6 = src8;
-    src9 = src11;
-  }
-}
-
-void aom_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4, int w,
-                                int h) {
-  int8_t cnt, filt_ver[8];
-
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  for (cnt = 0; cnt < 8; ++cnt) {
-    filt_ver[cnt] = filter_y[cnt];
-  }
-
-  if (((const int32_t *)filter_y)[0] == 0) {
-    switch (w) {
-      case 4:
-        common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, &filt_ver[3], h);
-        break;
-      case 8:
-        common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, &filt_ver[3], h);
-        break;
-      case 16:
-        common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_ver[3], h);
-        break;
-      case 32:
-        common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_ver[3], h);
-        break;
-      case 64:
-        common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_ver[3], h);
-        break;
-      default:
-        aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, filt_ver, h);
-        break;
-      case 8:
-        common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, filt_ver, h);
-        break;
-      case 16:
-        common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_ver, h);
-
-        break;
-      case 32:
-        common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_ver, h);
-        break;
-      case 64:
-        common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_ver, h);
-        break;
-      default:
-        aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
index fc3a823c5..363fad308 100644
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
@@ -10,7 +10,9 @@
  */
 
 #include <assert.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/aom_convolve_msa.h"
 
 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c
deleted file mode 100644
index a4d594931..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c
+++ /dev/null
@@ -1,630 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-const uint8_t mc_filt_mask_arr[16 * 3] = {
-  /* 8 width cases */
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  /* 4 width cases */
-  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
-  /* 4 width cases */
-  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
-};
-
-static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter_horiz, int8_t *filter_vert,
-                                     int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
-  v16u8 mask0, mask1, mask2, mask3, out;
-  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
-  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[16]);
-  src -= (3 + 3 * src_stride);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
-
-  filt = LD_SH(filter_vert);
-  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
-  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
-  out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    XORI_B4_128_SB(src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
-    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
-    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
-    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
-    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-    SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
-    SAT_SH2_SH(tmp0, tmp1, 7);
-    out = PCKEV_XORI128_UB(tmp0, tmp1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    hz_out5 = hz_out9;
-    out0 = out2;
-    out1 = out3;
-    out2 = out4;
-  }
-}
-
-static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter_horiz, int8_t *filter_vert,
-                                     int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
-  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
-  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
-  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= (3 + 3 * src_stride);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-
-  filt = LD_SH(filter_vert);
-  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
-  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
-  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
-  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    XORI_B4_128_SB(src7, src8, src9, src10);
-
-    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
-    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
-    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
-    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
-                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
-    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
-    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
-    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
-    ST8x4_UB(vec0, vec1, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    hz_out6 = hz_out10;
-    out0 = out2;
-    out1 = out3;
-    out2 = out8;
-    out4 = out6;
-    out5 = out7;
-    out6 = out9;
-  }
-}
-
-static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 2; multiple8_cnt--;) {
-    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                             filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 4; multiple8_cnt--;) {
-    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                             filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 8; multiple8_cnt--;) {
-    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                             filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz,
-                                      int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  filt = LD_UH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
-  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
-
-  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz,
-                                      int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
-  v16i8 res0, res1, res2, res3;
-  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  filt = LD_UH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  src += (8 * src_stride);
-  src8 = LD_SB(src);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
-  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
-  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
-  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
-             hz_out3, hz_out5, 8);
-  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
-
-  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
-              vec5, vec6, vec7);
-  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
-  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
-              res3);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter_horiz, int8_t *filter_vert,
-                                     int32_t height) {
-  if (4 == height) {
-    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                              filter_vert);
-  } else if (8 == height) {
-    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                              filter_vert);
-  }
-}
-
-static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz,
-                                      int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
-  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
-  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
-
-  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
-
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-  ST8x4_UB(out0, out1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
-                                          int32_t src_stride, uint8_t *dst,
-                                          int32_t dst_stride,
-                                          int8_t *filter_horiz,
-                                          int8_t *filter_vert, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
-  v16u8 filt_hz, filt_vt, vec0;
-  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  src0 = LD_SB(src);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 3); loop_cnt--;) {
-    LD_SB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    LD_SB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp4 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
-    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
-    ST8x4_UB(out0, out1, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp5 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp6 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp7 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp8 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
-    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
-    ST8x4_UB(out0, out1, dst, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter_horiz, int8_t *filter_vert,
-                                     int32_t height) {
-  if (4 == height) {
-    common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                              filter_vert);
-  } else {
-    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
-                                  filter_horiz, filter_vert, height);
-  }
-}
-
-static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt_hz, filt_vt, vec0, vec1;
-  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB2(src, 8, src0, src1);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src2, src4, src6);
-    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
-    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    PCKEV_ST_SB(tmp1, tmp2, dst);
-    dst += dst_stride;
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
-    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    PCKEV_ST_SB(tmp1, tmp2, dst);
-    dst += dst_stride;
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
-    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    PCKEV_ST_SB(tmp1, tmp2, dst);
-    dst += dst_stride;
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
-    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    PCKEV_ST_SB(tmp1, tmp2, dst);
-    dst += dst_stride;
-  }
-}
-
-static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 2; multiple8_cnt--;) {
-    common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                              filter_vert, height);
-    src += 16;
-    dst += 16;
-  }
-}
-
-static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 4; multiple8_cnt--;) {
-    common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                              filter_vert, height);
-    src += 16;
-    dst += 16;
-  }
-}
-
-void aom_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                       ptrdiff_t dst_stride, const int16_t *filter_x,
-                       int32_t x_step_q4, const int16_t *filter_y,
-                       int32_t y_step_q4, int32_t w, int32_t h) {
-  int8_t cnt, filt_hor[8], filt_ver[8];
-
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_x)[1] != 0x800000);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  for (cnt = 0; cnt < 8; ++cnt) {
-    filt_hor[cnt] = filter_x[cnt];
-    filt_ver[cnt] = filter_y[cnt];
-  }
-
-  if (((const int32_t *)filter_x)[0] == 0 &&
-      ((const int32_t *)filter_y)[0] == 0) {
-    switch (w) {
-      case 4:
-        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
-                                 (int32_t)dst_stride, &filt_hor[3],
-                                 &filt_ver[3], (int32_t)h);
-        break;
-      case 8:
-        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
-                                 (int32_t)dst_stride, &filt_hor[3],
-                                 &filt_ver[3], (int32_t)h);
-        break;
-      case 16:
-        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, &filt_hor[3],
-                                  &filt_ver[3], (int32_t)h);
-        break;
-      case 32:
-        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, &filt_hor[3],
-                                  &filt_ver[3], (int32_t)h);
-        break;
-      case 64:
-        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, &filt_hor[3],
-                                  &filt_ver[3], (int32_t)h);
-        break;
-      default:
-        aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
-        break;
-    }
-  } else if (((const int32_t *)filter_x)[0] == 0 ||
-             ((const int32_t *)filter_y)[0] == 0) {
-    aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                    filter_y, y_step_q4, w, h);
-  } else {
-    switch (w) {
-      case 4:
-        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
-                                 (int32_t)dst_stride, filt_hor, filt_ver,
-                                 (int32_t)h);
-        break;
-      case 8:
-        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
-                                 (int32_t)dst_stride, filt_hor, filt_ver,
-                                 (int32_t)h);
-        break;
-      case 16:
-        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, filt_hor, filt_ver,
-                                  (int32_t)h);
-        break;
-      case 32:
-        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, filt_hor, filt_ver,
-                                  (int32_t)h);
-        break;
-      case 64:
-        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, filt_hor, filt_ver,
-                                  (int32_t)h);
-        break;
-      default:
-        aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
index f7bdfc2bd..aa962b41f 100644
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
@@ -10,7 +10,9 @@
  */
 
 #include <assert.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/aom_convolve_msa.h"
 
 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c
deleted file mode 100644
index 75f8c7ea8..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/macros_msa.h"
-
-static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
-                           int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  uint32_t out0, out1, out2, out3;
-  v16u8 src0, src1, src2, src3;
-  v16u8 dst0, dst1, dst2, dst3;
-
-  if (0 == (height % 4)) {
-    for (cnt = (height / 4); cnt--;) {
-      LD_UB4(src, src_stride, src0, src1, src2, src3);
-      src += (4 * src_stride);
-
-      LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-      AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
-                  dst2, dst3);
-
-      out0 = __msa_copy_u_w((v4i32)dst0, 0);
-      out1 = __msa_copy_u_w((v4i32)dst1, 0);
-      out2 = __msa_copy_u_w((v4i32)dst2, 0);
-      out3 = __msa_copy_u_w((v4i32)dst3, 0);
-      SW4(out0, out1, out2, out3, dst, dst_stride);
-      dst += (4 * dst_stride);
-    }
-  } else if (0 == (height % 2)) {
-    for (cnt = (height / 2); cnt--;) {
-      LD_UB2(src, src_stride, src0, src1);
-      src += (2 * src_stride);
-
-      LD_UB2(dst, dst_stride, dst0, dst1);
-
-      AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
-
-      out0 = __msa_copy_u_w((v4i32)dst0, 0);
-      out1 = __msa_copy_u_w((v4i32)dst1, 0);
-      SW(out0, dst);
-      dst += dst_stride;
-      SW(out1, dst);
-      dst += dst_stride;
-    }
-  }
-}
-
-static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
-                           int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  uint64_t out0, out1, out2, out3;
-  v16u8 src0, src1, src2, src3;
-  v16u8 dst0, dst1, dst2, dst3;
-
-  for (cnt = (height / 4); cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
-                dst2, dst3);
-
-    out0 = __msa_copy_u_d((v2i64)dst0, 0);
-    out1 = __msa_copy_u_d((v2i64)dst1, 0);
-    out2 = __msa_copy_u_d((v2i64)dst2, 0);
-    out3 = __msa_copy_u_d((v2i64)dst3, 0);
-    SD4(out0, out1, out2, out3, dst, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
-                            uint8_t *dst, int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-
-  for (cnt = (height / 8); cnt--;) {
-    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-    src += (8 * src_stride);
-    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-
-    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
-                dst2, dst3);
-    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
-                dst6, dst7);
-    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
-    dst += (8 * dst_stride);
-  }
-}
-
-static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
-                            uint8_t *dst, int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  uint8_t *dst_dup = dst;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
-
-  for (cnt = (height / 8); cnt--;) {
-    LD_UB4(src, src_stride, src0, src2, src4, src6);
-    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
-    LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
-    dst_dup += (4 * dst_stride);
-    LD_UB4(src, src_stride, src8, src10, src12, src14);
-    LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
-    src += (4 * src_stride);
-    LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
-    LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
-    dst_dup += (4 * dst_stride);
-
-    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
-                dst2, dst3);
-    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
-                dst6, dst7);
-    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
-                dst10, dst11);
-    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
-                dst13, dst14, dst15);
-
-    ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
-    ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
-    dst += (4 * dst_stride);
-    ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
-    ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
-                            uint8_t *dst, int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  uint8_t *dst_dup = dst;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
-
-  for (cnt = (height / 4); cnt--;) {
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB4(src, 16, src4, src5, src6, src7);
-    src += src_stride;
-    LD_UB4(src, 16, src8, src9, src10, src11);
-    src += src_stride;
-    LD_UB4(src, 16, src12, src13, src14, src15);
-    src += src_stride;
-
-    LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
-    dst_dup += dst_stride;
-    LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
-    dst_dup += dst_stride;
-    LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
-    dst_dup += dst_stride;
-    LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
-    dst_dup += dst_stride;
-
-    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
-                dst2, dst3);
-    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
-                dst6, dst7);
-    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
-                dst10, dst11);
-    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
-                dst13, dst14, dst15);
-
-    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
-    dst += dst_stride;
-    ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
-    dst += dst_stride;
-    ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
-    dst += dst_stride;
-    ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-void aom_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int32_t filter_x_stride,
-                          const int16_t *filter_y, int32_t filter_y_stride,
-                          int32_t w, int32_t h) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-
-  switch (w) {
-    case 4: {
-      avg_width4_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    case 8: {
-      avg_width8_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    case 16: {
-      avg_width16_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    case 32: {
-      avg_width32_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    case 64: {
-      avg_width64_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    default: {
-      int32_t lp, cnt;
-      for (cnt = h; cnt--;) {
-        for (lp = 0; lp < w; ++lp) {
-          dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
-        }
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
index 1a0ae4d8d..a0627c074 100644
--- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
@@ -31,23 +31,6 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
     tmp_dpadd_0;                                                           \
   })
 
-#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0,       \
-                        filt_h1, filt_h2, filt_h3)                             \
-  ({                                                                           \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                      \
-    v8i16 hz_out_m;                                                            \
-                                                                               \
-    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \
-               vec3_m);                                                        \
-    hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0,    \
-                                   filt_h1, filt_h2, filt_h3);                 \
-                                                                               \
-    hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS);                           \
-    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                                     \
-                                                                               \
-    hz_out_m;                                                                  \
-  })
-
 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
                                    mask2, mask3, filt0, filt1, filt2, filt3, \
                                    out0, out1)                               \
@@ -93,32 +76,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
                 res7_m, out0, out1, out2, out3);                             \
   }
 
-#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \
-  {                                                  \
-    v16u8 tmp_m;                                     \
-                                                     \
-    tmp_m = PCKEV_XORI128_UB(in1, in0);              \
-    tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);       \
-    ST_UB(tmp_m, (pdst));                            \
-  }
-
-#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)              \
-  {                                                       \
-    v16u8 tmp_m;                                          \
-                                                          \
-    tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
-    tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);            \
-    ST_UB(tmp_m, (pdst));                                 \
-  }
-
-#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \
-                           stride)                                           \
-  {                                                                          \
-    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
-                                                                             \
-    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                         \
-    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                     \
-    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);             \
-    ST8x4_UB(tmp0_m, tmp1_m, pdst, stride);                                  \
-  }
 #endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h
index 31159fdcd..d51bfa899 100644
--- a/third_party/aom/aom_dsp/mips/common_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/common_dspr2.h
@@ -13,7 +13,9 @@
 #define AOM_COMMON_MIPS_DSPR2_H_
 
 #include <assert.h>
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 #ifdef __cplusplus
diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c
deleted file mode 100644
index d557115b9..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
-                                         uint8_t *dst, int32_t dst_stride,
-                                         const int16_t *filter_y, int32_t w,
-                                         int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2;
-  uint32_t p1, p2;
-  uint32_t scratch1, scratch2;
-  uint32_t store1, store2;
-  int32_t Temp1, Temp2;
-  const int16_t *filter = &filter_y[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-
-    for (x = 0; x < w; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
-
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
-            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
-            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
-            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
-                                          int32_t src_stride, uint8_t *dst,
-                                          int32_t dst_stride,
-                                          const int16_t *filter_y, int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2;
-  uint32_t p1, p2;
-  uint32_t scratch1, scratch2;
-  uint32_t store1, store2;
-  int32_t Temp1, Temp2;
-  const int16_t *filter = &filter_y[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-    prefetch_store(dst + dst_stride + 32);
-
-    for (x = 0; x < 64; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
-
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
-            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
-            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
-            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
-  uint32_t pos = 38;
-
-  assert(y_step_q4 == 16);
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  prefetch_store(dst);
-
-  switch (w) {
-    case 4:
-    case 8:
-    case 16:
-    case 32:
-      convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y,
-                                   w, h);
-      break;
-    case 64:
-      prefetch_store(dst + 32);
-      convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
-                                    h);
-      break;
-    default:
-      aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                               x_step_q4, filter_y, y_step_q4, w, h);
-      break;
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c
deleted file mode 100644
index efbdcf60f..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c
+++ /dev/null
@@ -1,802 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
-                                          int32_t src_stride, uint8_t *dst,
-                                          int32_t dst_stride,
-                                          const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  int32_t Temp1, Temp2, Temp3, Temp4;
-  uint32_t vector4a = 64;
-  uint32_t tp1, tp2;
-  uint32_t p1, p2, p3;
-  uint32_t tn1, tn2;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                      \n\t"
-        "ulw              %[tp2],         4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-
-        /* even 2. pixel */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        "lbu              %[p2],          3(%[dst])                      \n\t" /* load odd 2 */
-
-        /* odd 1. pixel */
-        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t" /* even 1 */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "lbu              %[Temp1],       1(%[dst])                      \n\t" /* load odd 1 */
-        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p3],          %[tp2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tn2],         0(%[dst])                      \n\t" /* load even 1 */
-
-        /* odd 2. pixel */
-        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t" /* even 2 */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t" /* odd 1 */
-        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t" /* average even 1 */
-        "dpa.w.ph         $ac2,           %[p3],          %[filter45]    \n\t"
-        "extp             %[Temp4],       $ac2,           31             \n\t"
-
-        "lbu              %[tp1],         2(%[dst])                      \n\t" /* load even 2 */
-        "sb               %[tn2],         0(%[dst])                      \n\t" /* store even 1 */
-
-        /* clamp */
-        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t" /* average odd 1 */
-        "lbux             %[p3],          %[Temp4](%[cm])                \n\t" /* odd 2 */
-        "sb               %[Temp1],       1(%[dst])                      \n\t" /* store odd 1 */
-
-        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t" /* average even 2 */
-        "sb               %[tp1],         2(%[dst])                      \n\t" /* store even 2 */
-
-        "addqh_r.w        %[p2],          %[p2],          %[p3]          \n\t" /* average odd 2 */
-        "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
-          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-          [Temp4] "=&r"(Temp4)
-        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
-          [dst] "r"(dst), [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
-                                          int32_t src_stride, uint8_t *dst,
-                                          int32_t dst_stride,
-                                          const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t tp1, tp2, tp3, tp4;
-  uint32_t p1, p2, p3, p4, n1;
-  uint32_t st0, st1;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                      \n\t"
-        "ulw              %[tp2],         4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
-        "ulw              %[tp3],         8(%[src])                      \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-        "lbu              %[Temp2],       0(%[dst])                      \n\t"
-        "lbu              %[tp4],         2(%[dst])                      \n\t"
-
-        /* even 2. pixel */
-        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        /* even 3. pixel */
-        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
-        "mtlo             %[vector4a],    $ac1                           \n\t"
-        "mthi             $zero,          $ac1                           \n\t"
-        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
-        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
-        "extp             %[Temp1],       $ac1,           31             \n\t"
-
-        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
-        "addqh_r.w        %[tp4],         %[tp4],         %[st1]         \n\t"
-        "sb               %[Temp2],       0(%[dst])                      \n\t"
-        "sb               %[tp4],         2(%[dst])                      \n\t"
-
-        /* even 4. pixel */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-
-        "balign           %[tp3],         %[tp2],         3              \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-
-        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
-        "lbu              %[Temp2],       4(%[dst])                      \n\t"
-        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
-
-        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        /* odd 1. pixel */
-        "mtlo             %[vector4a],    $ac1                           \n\t"
-        "mthi             $zero,          $ac1                           \n\t"
-        "sb               %[Temp2],       4(%[dst])                      \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tp1],         6(%[dst])                      \n\t"
-
-        /* odd 2. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
-        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp3],       $ac1,           31             \n\t"
-
-        "lbu              %[tp2],         1(%[dst])                      \n\t"
-        "lbu              %[tp3],         3(%[dst])                      \n\t"
-        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
-
-        /* odd 3. pixel */
-        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
-        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tp4],         5(%[dst])                      \n\t"
-
-        /* odd 4. pixel */
-        "sb               %[tp2],         1(%[dst])                      \n\t"
-        "sb               %[tp1],         6(%[dst])                      \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
-        "extp             %[Temp1],       $ac2,           31             \n\t"
-
-        "lbu              %[tp1],         7(%[dst])                      \n\t"
-
-        /* clamp */
-        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
-        "addqh_r.w        %[tp3],         %[tp3],         %[p4]          \n\t"
-
-        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
-        "addqh_r.w        %[tp4],         %[tp4],         %[p2]          \n\t"
-
-        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
-        "addqh_r.w        %[tp1],         %[tp1],         %[p1]          \n\t"
-
-        /* store bytes */
-        "sb               %[tp3],         3(%[dst])                      \n\t"
-        "sb               %[tp4],         5(%[dst])                      \n\t"
-        "sb               %[tp1],         7(%[dst])                      \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-          [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
-          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
-          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
-        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
-          [dst] "r"(dst), [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
-                                           int32_t src_stride, uint8_t *dst_ptr,
-                                           int32_t dst_stride,
-                                           const int16_t *filter_x0, int32_t h,
-                                           int32_t count) {
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_store(dst_ptr + dst_stride);
-
-    for (c = 0; c < count; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
-          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
-          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
-          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
-          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
-          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
-          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
-          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
-          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                   \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
-          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
-          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
-          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
-          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
-          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
-          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
-          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
-          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
-
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
-
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
-
-          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
-          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
-          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
-            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
-            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-            [Temp3] "=&r"(Temp3)
-          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
-            [dst] "r"(dst), [src] "r"(src));
-
-      src += 16;
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
-                                           int32_t src_stride, uint8_t *dst_ptr,
-                                           int32_t dst_stride,
-                                           const int16_t *filter_x0,
-                                           int32_t h) {
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_load(src_ptr + src_stride + 64);
-    prefetch_store(dst_ptr + dst_stride);
-    prefetch_store(dst_ptr + dst_stride + 32);
-
-    for (c = 0; c < 4; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
-          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
-          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
-          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
-          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
-          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
-          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
-          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
-          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                   \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
-          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
-          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
-          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
-          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
-          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
-          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
-          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
-          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
-
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
-
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
-
-          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
-          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
-          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
-            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
-            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-            [Temp3] "=&r"(Temp3)
-          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
-            [dst] "r"(dst), [src] "r"(src));
-
-      src += 16;
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h) {
-  uint32_t pos = 38;
-
-  assert(x_step_q4 == 16);
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  /* prefetch data to cache memory */
-  prefetch_load(src);
-  prefetch_load(src + 32);
-  prefetch_store(dst);
-
-  switch (w) {
-    case 4:
-      convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                    h);
-      break;
-    case 8:
-      convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                    h);
-      break;
-    case 16:
-      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                     h, 1);
-      break;
-    case 32:
-      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                     h, 2);
-      break;
-    case 64:
-      prefetch_load(src + 64);
-      prefetch_store(dst + 32);
-
-      convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                     h);
-      break;
-    default:
-      aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                x_step_q4, filter_y, y_step_q4, w, h);
-      break;
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
index 066308315..08bf1ab30 100644
--- a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
index dc51ab1cb..2a8f75938 100644
--- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
index 3367be01a..ac87936da 100644
--- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c
deleted file mode 100644
index 3574da19f..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c
+++ /dev/null
@@ -1,646 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      const int16_t *filter_y, int32_t w,
-                                      int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2, load3, load4;
-  uint32_t p1, p2;
-  uint32_t n1, n2;
-  uint32_t scratch1, scratch2;
-  uint32_t store1, store2;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2;
-
-  vector1b = ((const int32_t *)filter_y)[0];
-  vector2b = ((const int32_t *)filter_y)[1];
-  vector3b = ((const int32_t *)filter_y)[2];
-  vector4b = ((const int32_t *)filter_y)[3];
-
-  src -= 3 * src_stride;
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-
-    for (x = 0; x < w; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
-
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
-            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
-            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
-                                       uint8_t *dst, int32_t dst_stride,
-                                       const int16_t *filter_y, int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2, load3, load4;
-  uint32_t p1, p2;
-  uint32_t n1, n2;
-  uint32_t scratch1, scratch2;
-  uint32_t store1, store2;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2;
-
-  vector1b = ((const int32_t *)filter_y)[0];
-  vector2b = ((const int32_t *)filter_y)[1];
-  vector3b = ((const int32_t *)filter_y)[2];
-  vector4b = ((const int32_t *)filter_y)[3];
-
-  src -= 3 * src_stride;
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-    prefetch_store(dst + dst_stride + 32);
-
-    for (x = 0; x < 64; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
-
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
-            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
-            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void aom_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  if (((const int32_t *)filter_y)[0] == 0) {
-    aom_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
-  } else {
-    uint32_t pos = 38;
-
-    /* bit positon for extract from acc */
-    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                         :
-                         : [pos] "r"(pos));
-
-    prefetch_store(dst);
-
-    switch (w) {
-      case 4:
-      case 8:
-      case 16:
-      case 32:
-        convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
-                                  h);
-        break;
-      case 64:
-        prefetch_store(dst + 32);
-        convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
-                                   h);
-        break;
-      default:
-        aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
-
-void aom_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
-  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
-
-  assert(w <= 64);
-  assert(h <= 64);
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-
-  if (intermediate_height < h) intermediate_height = h;
-
-  aom_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x,
-                      x_step_q4, filter_y, y_step_q4, w, intermediate_height);
-
-  aom_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x,
-                         x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int filter_x_stride,
-                            const int16_t *filter_y, int filter_y_stride, int w,
-                            int h) {
-  int x, y;
-  uint32_t tp1, tp2, tn1;
-  uint32_t tp3, tp4, tn2;
-
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  /* prefetch data to cache memory */
-  prefetch_load(src);
-  prefetch_load(src + 32);
-  prefetch_store(dst);
-
-  switch (w) {
-    case 4:
-      /* 1 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         0(%[dst])      \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
-
-            : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    case 8:
-      /* 2 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         0(%[dst])      \n\t"
-            "ulw              %[tp3],         4(%[src])      \n\t"
-            "ulw              %[tp4],         4(%[dst])      \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    case 16:
-      /* 4 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         0(%[dst])      \n\t"
-            "ulw              %[tp3],         4(%[src])      \n\t"
-            "ulw              %[tp4],         4(%[dst])      \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         8(%[src])      \n\t"
-            "ulw              %[tp2],         8(%[dst])      \n\t"
-            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
-            "ulw              %[tp3],         12(%[src])     \n\t"
-            "ulw              %[tp4],         12(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         12(%[dst])     \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    case 32:
-      /* 8 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         0(%[dst])      \n\t"
-            "ulw              %[tp3],         4(%[src])      \n\t"
-            "ulw              %[tp4],         4(%[dst])      \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         8(%[src])      \n\t"
-            "ulw              %[tp2],         8(%[dst])      \n\t"
-            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
-            "ulw              %[tp3],         12(%[src])     \n\t"
-            "ulw              %[tp4],         12(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         16(%[src])     \n\t"
-            "ulw              %[tp2],         16(%[dst])     \n\t"
-            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         12(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         20(%[src])     \n\t"
-            "ulw              %[tp4],         20(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         24(%[src])     \n\t"
-            "ulw              %[tp2],         24(%[dst])     \n\t"
-            "sw               %[tn1],         16(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         20(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         28(%[src])     \n\t"
-            "ulw              %[tp4],         28(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "sw               %[tn1],         24(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         28(%[dst])     \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    case 64:
-      prefetch_load(src + 64);
-      prefetch_store(dst + 32);
-
-      /* 16 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_load(src + src_stride + 64);
-        prefetch_store(dst + dst_stride);
-        prefetch_store(dst + dst_stride + 32);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         0(%[dst])      \n\t"
-            "ulw              %[tp3],         4(%[src])      \n\t"
-            "ulw              %[tp4],         4(%[dst])      \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         8(%[src])      \n\t"
-            "ulw              %[tp2],         8(%[dst])      \n\t"
-            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
-            "ulw              %[tp3],         12(%[src])     \n\t"
-            "ulw              %[tp4],         12(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         16(%[src])     \n\t"
-            "ulw              %[tp2],         16(%[dst])     \n\t"
-            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         12(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         20(%[src])     \n\t"
-            "ulw              %[tp4],         20(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         24(%[src])     \n\t"
-            "ulw              %[tp2],         24(%[dst])     \n\t"
-            "sw               %[tn1],         16(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         20(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         28(%[src])     \n\t"
-            "ulw              %[tp4],         28(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         32(%[src])     \n\t"
-            "ulw              %[tp2],         32(%[dst])     \n\t"
-            "sw               %[tn1],         24(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         28(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         36(%[src])     \n\t"
-            "ulw              %[tp4],         36(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         40(%[src])     \n\t"
-            "ulw              %[tp2],         40(%[dst])     \n\t"
-            "sw               %[tn1],         32(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         36(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         44(%[src])     \n\t"
-            "ulw              %[tp4],         44(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         48(%[src])     \n\t"
-            "ulw              %[tp2],         48(%[dst])     \n\t"
-            "sw               %[tn1],         40(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         44(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         52(%[src])     \n\t"
-            "ulw              %[tp4],         52(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         56(%[src])     \n\t"
-            "ulw              %[tp2],         56(%[dst])     \n\t"
-            "sw               %[tn1],         48(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         52(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         60(%[src])     \n\t"
-            "ulw              %[tp4],         60(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "sw               %[tn1],         56(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         60(%[dst])     \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    default:
-      for (y = h; y > 0; --y) {
-        for (x = 0; x < w; ++x) {
-          dst[x] = (dst[x] + src[x] + 1) >> 1;
-        }
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c
deleted file mode 100644
index f6534b420..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c
+++ /dev/null
@@ -1,998 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
-                                       uint8_t *dst, int32_t dst_stride,
-                                       const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2, Temp3, Temp4;
-  uint32_t vector4a = 64;
-  uint32_t tp1, tp2;
-  uint32_t p1, p2, p3, p4;
-  uint32_t n1, n2, n3, n4;
-  uint32_t tn1, tn2;
-
-  vector1b = ((const int32_t *)filter_x0)[0];
-  vector2b = ((const int32_t *)filter_x0)[1];
-  vector3b = ((const int32_t *)filter_x0)[2];
-  vector4b = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                      \n\t"
-        "ulw              %[tp2],         4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
-        "ulw              %[tn2],         8(%[src])                      \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-
-        /* even 2. pixel */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
-        "balign           %[tn1],         %[tn2],         3              \n\t"
-        "balign           %[tn2],         %[tp2],         3              \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        "lbu              %[p2],          3(%[dst])                      \n\t" /* load odd 2 */
-
-        /* odd 1. pixel */
-        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t" /* even 1 */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "lbu              %[Temp1],       1(%[dst])                      \n\t" /* load odd 1 */
-        "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
-        "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
-        "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tn2],         0(%[dst])                      \n\t" /* load even 1 */
-
-        /* odd 2. pixel */
-        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t" /* even 2 */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
-        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t" /* odd 1 */
-        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t" /* average even 1 */
-        "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
-        "extp             %[Temp4],       $ac2,           31             \n\t"
-
-        "lbu              %[tp1],         2(%[dst])                      \n\t" /* load even 2 */
-        "sb               %[tn2],         0(%[dst])                      \n\t" /* store even 1 */
-
-        /* clamp */
-        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t" /* average odd 1 */
-        "lbux             %[n2],          %[Temp4](%[cm])                \n\t" /* odd 2 */
-        "sb               %[Temp1],       1(%[dst])                      \n\t" /* store odd 1 */
-
-        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t" /* average even 2 */
-        "sb               %[tp1],         2(%[dst])                      \n\t" /* store even 2 */
-
-        "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t" /* average odd 2 */
-        "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
-          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
-          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
-        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
-          [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
-                                       uint8_t *dst, int32_t dst_stride,
-                                       const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t tp1, tp2;
-  uint32_t p1, p2, p3, p4, n1;
-  uint32_t tn1, tn2, tn3;
-  uint32_t st0, st1;
-
-  vector1b = ((const int32_t *)filter_x0)[0];
-  vector2b = ((const int32_t *)filter_x0)[1];
-  vector3b = ((const int32_t *)filter_x0)[2];
-  vector4b = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                      \n\t"
-        "ulw              %[tp2],         4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
-        "ulw              %[tn2],         8(%[src])                      \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-        "lbu              %[Temp2],       0(%[dst])                      \n\t"
-        "lbu              %[tn3],         2(%[dst])                      \n\t"
-
-        /* even 2. pixel */
-        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
-        "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
-        "ulw              %[tn1],         12(%[src])                     \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        /* even 3. pixel */
-        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
-        "mtlo             %[vector4a],    $ac1                           \n\t"
-        "mthi             $zero,          $ac1                           \n\t"
-        "preceu.ph.qbr    %[p2],          %[tn1]                         \n\t"
-        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
-        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
-        "extp             %[Temp1],       $ac1,           31             \n\t"
-
-        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
-        "addqh_r.w        %[tn3],         %[tn3],         %[st1]         \n\t"
-        "sb               %[Temp2],       0(%[dst])                      \n\t"
-        "sb               %[tn3],         2(%[dst])                      \n\t"
-
-        /* even 4. pixel */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-
-        "balign           %[tn3],         %[tn1],         3              \n\t"
-        "balign           %[tn1],         %[tn2],         3              \n\t"
-        "balign           %[tn2],         %[tp2],         3              \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-
-        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
-        "lbu              %[Temp2],       4(%[dst])                      \n\t"
-        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
-
-        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        /* odd 1. pixel */
-        "mtlo             %[vector4a],    $ac1                           \n\t"
-        "mthi             $zero,          $ac1                           \n\t"
-        "sb               %[Temp2],       4(%[dst])                      \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tp1],         6(%[dst])                      \n\t"
-
-        /* odd 2. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
-        "preceu.ph.qbl    %[n1],          %[tn1]                         \n\t"
-        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
-        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp3],       $ac1,           31             \n\t"
-
-        "lbu              %[tp2],         1(%[dst])                      \n\t"
-        "lbu              %[tn2],         3(%[dst])                      \n\t"
-        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
-
-        /* odd 3. pixel */
-        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
-        "preceu.ph.qbr    %[p2],          %[tn3]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
-        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tn3],         5(%[dst])                      \n\t"
-
-        /* odd 4. pixel */
-        "sb               %[tp2],         1(%[dst])                      \n\t"
-        "sb               %[tp1],         6(%[dst])                      \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
-        "extp             %[Temp1],       $ac2,           31             \n\t"
-
-        "lbu              %[tn1],         7(%[dst])                      \n\t"
-
-        /* clamp */
-        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
-        "addqh_r.w        %[tn2],         %[tn2],         %[p4]          \n\t"
-
-        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
-        "addqh_r.w        %[tn3],         %[tn3],         %[p2]          \n\t"
-
-        "lbux             %[n1],          %[Temp1](%[cm])                \n\t"
-        "addqh_r.w        %[tn1],         %[tn1],         %[n1]          \n\t"
-
-        /* store bytes */
-        "sb               %[tn2],         3(%[dst])                      \n\t"
-        "sb               %[tn3],         5(%[dst])                      \n\t"
-        "sb               %[tn1],         7(%[dst])                      \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
-          [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
-          [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-          [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
-          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
-        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
-          [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
-                                        int32_t src_stride, uint8_t *dst_ptr,
-                                        int32_t dst_stride,
-                                        const int16_t *filter_x0, int32_t h,
-                                        int32_t count) {
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t filter12, filter34, filter56, filter78;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-
-  filter12 = ((const int32_t *)filter_x0)[0];
-  filter34 = ((const int32_t *)filter_x0)[1];
-  filter56 = ((const int32_t *)filter_x0)[2];
-  filter78 = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_store(dst_ptr + dst_stride);
-
-    for (c = 0; c < count; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
-          "ulw              %[qload2],    16(%[src])                   \n\t"
-          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
-          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
-          "ulw              %[qload3],    20(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
-          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
-          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
-          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                   \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
-          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
-          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
-          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
-          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
-          "ulw              %[qload2],    17(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
-          "ulw              %[qload3],    21(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
-          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
-
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
-
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
-
-          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
-          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
-          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
-            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
-            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-            [Temp3] "=&r"(Temp3)
-          : [filter12] "r"(filter12), [filter34] "r"(filter34),
-            [filter56] "r"(filter56), [filter78] "r"(filter78),
-            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
-            [src] "r"(src));
-
-      src += 16;
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
-                                        int32_t src_stride, uint8_t *dst_ptr,
-                                        int32_t dst_stride,
-                                        const int16_t *filter_x0, int32_t h) {
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t filter12, filter34, filter56, filter78;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-
-  filter12 = ((const int32_t *)filter_x0)[0];
-  filter34 = ((const int32_t *)filter_x0)[1];
-  filter56 = ((const int32_t *)filter_x0)[2];
-  filter78 = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_load(src_ptr + src_stride + 64);
-    prefetch_store(dst_ptr + dst_stride);
-    prefetch_store(dst_ptr + dst_stride + 32);
-
-    for (c = 0; c < 4; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
-          "ulw              %[qload2],    16(%[src])                   \n\t"
-          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
-          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
-          "ulw              %[qload3],    20(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
-          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
-          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
-          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                   \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
-          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
-          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
-          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
-          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
-          "ulw              %[qload2],    17(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
-          "ulw              %[qload3],    21(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
-          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
-
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
-
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
-
-          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
-          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
-          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
-            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
-            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-            [Temp3] "=&r"(Temp3)
-          : [filter12] "r"(filter12), [filter34] "r"(filter34),
-            [filter56] "r"(filter56), [filter78] "r"(filter78),
-            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
-            [src] "r"(src));
-
-      src += 16;
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-void aom_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h) {
-  assert(x_step_q4 == 16);
-  assert(((const int32_t *)filter_x)[1] != 0x800000);
-
-  if (((const int32_t *)filter_x)[0] == 0) {
-    aom_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
-  } else {
-    uint32_t pos = 38;
-
-    src -= 3;
-
-    /* bit positon for extract from acc */
-    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                         :
-                         : [pos] "r"(pos));
-
-    /* prefetch data to cache memory */
-    prefetch_load(src);
-    prefetch_load(src + 32);
-    prefetch_store(dst);
-
-    switch (w) {
-      case 4:
-        convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                   h);
-        break;
-      case 8:
-        convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                   h);
-        break;
-      case 16:
-        convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                    h, 1);
-        break;
-      case 32:
-        convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                    h, 2);
-        break;
-      case 64:
-        prefetch_load(src + 64);
-        prefetch_store(dst + 32);
-
-        convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                    h);
-        break;
-      default:
-        aom_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
-                                  h);
-        break;
-    }
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
index dd4bc821a..af54b4264 100644
--- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
@@ -12,1389 +12,14 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_ports/mem.h"
 
 #if HAVE_DSPR2
-static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              const int16_t *filter_x0,
-                                              int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint8_t *dst_ptr;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2, Temp3, Temp4;
-  uint32_t vector4a = 64;
-  uint32_t tp1, tp2;
-  uint32_t p1, p2, p3, p4;
-  uint32_t tn1, tn2;
-
-  vector1b = ((const int32_t *)filter_x0)[0];
-  vector2b = ((const int32_t *)filter_x0)[1];
-  vector3b = ((const int32_t *)filter_x0)[2];
-  vector4b = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    dst_ptr = dst;
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                      \n\t"
-        "ulw              %[tp2],         4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
-        "ulw              %[tn2],         8(%[src])                      \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-
-        /* even 2. pixel */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
-        "balign           %[tn1],         %[tn2],         3              \n\t"
-        "balign           %[tn2],         %[tp2],         3              \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        /* odd 1. pixel */
-        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        /* odd 2. pixel */
-        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp4],       $ac2,           31             \n\t"
-
-        /* clamp */
-        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
-        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
-
-        /* store bytes */
-        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
-          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-          [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr)
-        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-          [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
-          [dst_stride] "r"(dst_stride));
-
-    /* Next row... */
-    src += src_stride;
-    dst += 1;
-  }
-}
-
-static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              const int16_t *filter_x0,
-                                              int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint8_t *dst_ptr;
-  uint32_t vector4a = 64;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t tp1, tp2, tp3;
-  uint32_t p1, p2, p3, p4, n1;
-  uint8_t *odd_dst;
-  uint32_t dst_pitch_2 = (dst_stride << 1);
-
-  vector1b = ((const int32_t *)filter_x0)[0];
-  vector2b = ((const int32_t *)filter_x0)[1];
-  vector3b = ((const int32_t *)filter_x0)[2];
-  vector4b = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-
-    dst_ptr = dst;
-    odd_dst = (dst_ptr + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp2],         0(%[src])                       \n\t"
-        "ulw              %[tp1],         4(%[src])                       \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                            \n\t"
-        "mthi             $zero,          $ac3                            \n\t"
-        "mtlo             %[vector4a],    $ac2                            \n\t"
-        "mthi             $zero,          $ac2                            \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
-        "ulw              %[tp3],         8(%[src])                       \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
-        "extp             %[Temp1],       $ac3,           31              \n\t"
-
-        /* even 2. pixel */
-        "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
-        "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
-        "ulw              %[tp2],         12(%[src])                      \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
-        "extp             %[Temp3],       $ac2,           31              \n\t"
-
-        /* even 3. pixel */
-        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
-        "mtlo             %[vector4a],    $ac1                            \n\t"
-        "mthi             $zero,          $ac1                            \n\t"
-        "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
-        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
-        "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
-        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
-        "extp             %[p3],          $ac1,           31              \n\t"
-
-        /* even 4. pixel */
-        "mtlo             %[vector4a],    $ac2                            \n\t"
-        "mthi             $zero,          $ac2                            \n\t"
-        "mtlo             %[vector4a],    $ac3                            \n\t"
-        "mthi             $zero,          $ac3                            \n\t"
-        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-        "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-
-        "ulw              %[tp1],         1(%[src])                       \n\t"
-        "ulw              %[tp3],         5(%[src])                       \n\t"
-
-        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
-        "extp             %[Temp3],       $ac2,           31              \n\t"
-
-        "lbux             %[tp2],         %[p3](%[cm])                    \n\t"
-
-        /* odd 1. pixel */
-        "mtlo             %[vector4a],    $ac1                            \n\t"
-        "mthi             $zero,          $ac1                            \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
-        "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-        "ulw              %[tp2],         9(%[src])                       \n\t"
-
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
-        "extp             %[Temp2],       $ac3,           31              \n\t"
-
-        /* odd 2. pixel */
-        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
-        "mtlo             %[vector4a],    $ac3                            \n\t"
-        "mthi             $zero,          $ac3                            \n\t"
-        "mtlo             %[vector4a],    $ac2                            \n\t"
-        "mthi             $zero,          $ac2                            \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
-        "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
-        "ulw              %[Temp1],       13(%[src])                      \n\t"
-        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
-        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
-        "extp             %[Temp3],       $ac1,           31              \n\t"
-
-        /* odd 3. pixel */
-        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
-        "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
-        "extp             %[Temp2],       $ac3,           31              \n\t"
-
-        /* odd 4. pixel */
-        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
-        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
-        "extp             %[Temp1],       $ac2,           31              \n\t"
-
-        /* clamp */
-        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
-        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
-        "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
-
-        /* store bytes */
-        "sb               %[p4],          0(%[odd_dst])                   \n\t"
-        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
-
-        "sb               %[p2],          0(%[odd_dst])                   \n\t"
-        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
-
-        "sb               %[n1],          0(%[odd_dst])                   \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
-          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
-          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-          [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst)
-        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-          [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
-          [dst_pitch_2] "r"(dst_pitch_2));
-
-    /* Next row... */
-    src += src_stride;
-    dst += 1;
-  }
-}
-
-static void convolve_horiz_16_transposed_dspr2(
-    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
-    int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
-  int32_t c, y;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t filter12, filter34, filter56, filter78;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  uint32_t dst_pitch_2 = (dst_stride << 1);
-  uint8_t *odd_dst;
-
-  filter12 = ((const int32_t *)filter_x0)[0];
-  filter34 = ((const int32_t *)filter_x0)[1];
-  filter56 = ((const int32_t *)filter_x0)[2];
-  filter78 = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-
-    src = src_ptr;
-    dst = dst_ptr;
-
-    odd_dst = (dst + dst_stride);
-
-    for (c = 0; c < count; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],        0(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        4(%[src])                       "
-          "\n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 1 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 2 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "ulw              %[qload2],        8(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     "
-          "\n\t" /* even 1 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 1 */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 3 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload2]                       "
-          "\n\t"
-          "ulw              %[qload1],        12(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     "
-          "\n\t" /* even 1 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 1 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 1 */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 4 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 1 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
-          "          \n\t"
-          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     "
-          "\n\t" /* even 3 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 3 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 5 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 2 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        16(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     "
-          "\n\t" /* even 4 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 4 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 6 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 3 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     "
-          "\n\t" /* even 5 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 5 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 7 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p1],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 4 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        20(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
-          "\n\t" /* even 6 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 6 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 8 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 5 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
-          "\n\t" /* even 7 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 7 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 6 */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 1 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
-          "\n\t" /* even 8 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
-          "\n\t" /* even 8 */
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 6 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
-          "\n\t" /* even 8 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
-          "\n\t" /* even 8 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 8 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],        1(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        5(%[src])                       "
-          "\n\t"
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 2 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 7 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        9(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
-          "\n\t" /* odd 1 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 1 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 8 */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 3 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 8 */
-          "ulw              %[qload1],        13(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
-          "\n\t" /* odd 2 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 2 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 4 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 1 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
-          "\n\t" /* odd 3 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 3 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 5 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 2 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        17(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     "
-          "\n\t" /* odd 4 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 4 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 3 */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 6 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 3 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     "
-          "\n\t" /* odd 5 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 5 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 4 */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 7 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p1],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 4 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        21(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     "
-          "\n\t" /* odd 6 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 6 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 8 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 5 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     "
-          "\n\t" /* odd 7 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 7 */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     "
-          "\n\t" /* odd 8 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 8 */
-
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 6 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 7 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 8 */
-
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 6 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 7 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 8 */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
-            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
-            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
-            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
-          : [filter12] "r"(filter12), [filter34] "r"(filter34),
-            [filter56] "r"(filter56), [filter78] "r"(filter78),
-            [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
-            [dst_pitch_2] "r"(dst_pitch_2));
-
-      src += 16;
-      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
-      odd_dst = (dst + dst_stride);
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-
-    dst_ptr += 1;
-  }
-}
-
-static void convolve_horiz_64_transposed_dspr2(
-    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
-    int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
-  int32_t c, y;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t filter12, filter34, filter56, filter78;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  uint32_t dst_pitch_2 = (dst_stride << 1);
-  uint8_t *odd_dst;
-
-  filter12 = ((const int32_t *)filter_x0)[0];
-  filter34 = ((const int32_t *)filter_x0)[1];
-  filter56 = ((const int32_t *)filter_x0)[2];
-  filter78 = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_load(src_ptr + src_stride + 64);
-
-    src = src_ptr;
-    dst = dst_ptr;
-
-    odd_dst = (dst + dst_stride);
-
-    for (c = 0; c < 4; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],        0(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        4(%[src])                       "
-          "\n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 1 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 2 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "ulw              %[qload2],        8(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     "
-          "\n\t" /* even 1 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 1 */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 3 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload2]                       "
-          "\n\t"
-          "ulw              %[qload1],        12(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     "
-          "\n\t" /* even 1 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 1 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 1 */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 4 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 1 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
-          "          \n\t"
-          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     "
-          "\n\t" /* even 3 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 3 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 5 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 2 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        16(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     "
-          "\n\t" /* even 4 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 4 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 6 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 3 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     "
-          "\n\t" /* even 5 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 5 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 7 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p1],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 4 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        20(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
-          "\n\t" /* even 6 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 6 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 8 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 5 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
-          "\n\t" /* even 7 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 7 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 6 */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 1 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
-          "\n\t" /* even 8 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
-          "\n\t" /* even 8 */
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 6 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
-          "\n\t" /* even 8 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
-          "\n\t" /* even 8 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 8 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],        1(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        5(%[src])                       "
-          "\n\t"
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 2 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 7 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        9(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
-          "\n\t" /* odd 1 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 1 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 8 */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 3 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 8 */
-          "ulw              %[qload1],        13(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
-          "\n\t" /* odd 2 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 2 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 4 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 1 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
-          "\n\t" /* odd 3 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 3 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 5 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 2 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        17(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     "
-          "\n\t" /* odd 4 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 4 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 3 */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 6 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 3 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     "
-          "\n\t" /* odd 5 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 5 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 4 */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 7 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p1],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 4 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        21(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     "
-          "\n\t" /* odd 6 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 6 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 8 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 5 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     "
-          "\n\t" /* odd 7 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 7 */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     "
-          "\n\t" /* odd 8 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 8 */
-
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 6 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 7 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 8 */
-
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 6 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 7 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 8 */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
-            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
-            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
-            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
-          : [filter12] "r"(filter12), [filter34] "r"(filter34),
-            [filter56] "r"(filter56), [filter78] "r"(filter78),
-            [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
-            [dst_pitch_2] "r"(dst_pitch_2));
-
-      src += 16;
-      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
-      odd_dst = (dst + dst_stride);
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-
-    dst_ptr += 1;
-  }
-}
-
-void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter, int w, int h) {
-  int x, y, k;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      int sum = 0;
-
-      for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
-
-      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-    }
-
-    src += src_stride;
-    dst += 1;
-  }
-}
-
-void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x * dst_stride] = src[x];
-    }
-
-    src += src_stride;
-    dst += 1;
-  }
-}
-
-void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
-  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
-  uint32_t pos = 38;
-
-  (void)x_step_q4;
-
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_x)[1] != 0x800000);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  if (intermediate_height < h) intermediate_height = h;
-
-  /* copy the src to dst */
-  if (filter_x[3] == 0x80) {
-    copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
-                          intermediate_height, w, intermediate_height);
-  } else if (((const int32_t *)filter_x)[0] == 0) {
-    aom_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
-                        intermediate_height, filter_x, w, intermediate_height);
-  } else {
-    src -= (src_stride * 3 + 3);
-
-    /* prefetch data to cache memory */
-    prefetch_load(src);
-    prefetch_load(src + 32);
-
-    switch (w) {
-      case 4:
-        convolve_horiz_4_transposed_dspr2(src, src_stride, temp,
-                                          intermediate_height, filter_x,
-                                          intermediate_height);
-        break;
-      case 8:
-        convolve_horiz_8_transposed_dspr2(src, src_stride, temp,
-                                          intermediate_height, filter_x,
-                                          intermediate_height);
-        break;
-      case 16:
-      case 32:
-        convolve_horiz_16_transposed_dspr2(src, src_stride, temp,
-                                           intermediate_height, filter_x,
-                                           intermediate_height, (w / 16));
-        break;
-      case 64:
-        prefetch_load(src + 32);
-        convolve_horiz_64_transposed_dspr2(src, src_stride, temp,
-                                           intermediate_height, filter_x,
-                                           intermediate_height);
-        break;
-      default:
-        convolve_horiz_transposed(src, src_stride, temp, intermediate_height,
-                                  filter_x, w, intermediate_height);
-        break;
-    }
-  }
-
-  /* copy the src to dst */
-  if (filter_y[3] == 0x80) {
-    copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
-  } else if (((const int32_t *)filter_y)[0] == 0) {
-    aom_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
-                        filter_y, h, w);
-  } else {
-    switch (h) {
-      case 4:
-        convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst,
-                                          dst_stride, filter_y, w);
-        break;
-      case 8:
-        convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst,
-                                          dst_stride, filter_y, w);
-        break;
-      case 16:
-      case 32:
-        convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst,
-                                           dst_stride, filter_y, w, (h / 16));
-        break;
-      case 64:
-        convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst,
-                                           dst_stride, filter_y, w);
-        break;
-      default:
-        convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride,
-                                  filter_y, h, w);
-        break;
-    }
-  }
-}
-
 void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int filter_x_stride,
diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
index c60557617..f9c6879ab 100644
--- a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
index d8a90b6ab..201e66427 100644
--- a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
index f8fd9e2b6..e7b8d531b 100644
--- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
@@ -14,7 +14,8 @@
 
 #include <assert.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 
@@ -29,18 +30,6 @@ void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h);
 
-void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h);
-
-void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h);
-
 void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                          ptrdiff_t dst_stride, const int16_t *filter, int w,
                          int h);
diff --git a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c
deleted file mode 100644
index 43dce8ba6..000000000
--- a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c
+++ /dev/null
@@ -1,928 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/fwd_txfm_msa.h"
-
-static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
-                                              int32_t src_stride,
-                                              int16_t *temp_buff) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 step0, step1, step2, step3;
-  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
-  v8i16 step0_1, step1_1, step2_1, step3_1;
-
-  /* 1st and 2nd set */
-  LD_SH4(input, src_stride, in0, in1, in2, in3);
-  LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
-  LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
-  LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
-  SLLI_4V(in0, in1, in2, in3, 2);
-  SLLI_4V(in4, in5, in6, in7, 2);
-  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
-  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
-  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
-              step3, in4, in5, in6, in7);
-  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
-              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
-  ST_SH4(step0, step1, step2, step3, temp_buff, 8);
-  ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
-  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
-  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);
-
-  /* 3rd and 4th set */
-  LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
-  LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
-  LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
-  LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
-  SLLI_4V(in0, in1, in2, in3, 2);
-  SLLI_4V(in4, in5, in6, in7, 2);
-  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
-  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
-  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
-              step3, in4, in5, in6, in7);
-  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
-              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
-  ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
-  ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
-  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
-  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8);
-}
-
-static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8i16 temp0, temp1;
-
-  /* fdct even */
-  LD_SH4(input, 8, in0, in1, in2, in3);
-  LD_SH4(input + 96, 8, in12, in13, in14, in15);
-  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
-              vec3, in12, in13, in14, in15);
-  LD_SH4(input + 32, 8, in4, in5, in6, in7);
-  LD_SH4(input + 64, 8, in8, in9, in10, in11);
-  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7,
-              in8, in9, in10, in11);
-
-  /* Stage 3 */
-  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
-  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
-  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp);
-  ST_SH(temp1, temp + 512);
-
-  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 256);
-  ST_SH(temp1, temp + 768);
-
-  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
-  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
-  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 128);
-  ST_SH(temp1, temp + 896);
-
-  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
-  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 640);
-  ST_SH(temp1, temp + 384);
-
-  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
-  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
-  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
-  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
-  ADD2(in0, in1, in2, in3, vec0, vec7);
-  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 64);
-  ST_SH(temp1, temp + 960);
-
-  SUB2(in0, in1, in2, in3, in0, in2);
-  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 576);
-  ST_SH(temp1, temp + 448);
-
-  SUB2(in9, vec2, in14, vec5, vec2, vec5);
-  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
-  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
-  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 320);
-  ST_SH(temp1, temp + 704);
-
-  ADD2(in3, in2, in0, in1, vec3, vec4);
-  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 192);
-  ST_SH(temp1, temp + 832);
-}
-
-static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
-  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
-  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
-
-  in20 = LD_SH(input + 32);
-  in21 = LD_SH(input + 40);
-  in26 = LD_SH(input + 80);
-  in27 = LD_SH(input + 88);
-
-  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
-  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
-
-  in18 = LD_SH(input + 16);
-  in19 = LD_SH(input + 24);
-  in28 = LD_SH(input + 96);
-  in29 = LD_SH(input + 104);
-
-  vec4 = in19 - in20;
-  ST_SH(vec4, input + 32);
-  vec4 = in18 - in21;
-  ST_SH(vec4, input + 40);
-  vec4 = in29 - in26;
-  ST_SH(vec4, input + 80);
-  vec4 = in28 - in27;
-  ST_SH(vec4, input + 88);
-
-  in21 = in18 + in21;
-  in20 = in19 + in20;
-  in27 = in28 + in27;
-  in26 = in29 + in26;
-
-  LD_SH4(input + 48, 8, in22, in23, in24, in25);
-  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
-  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
-
-  in16 = LD_SH(input);
-  in17 = LD_SH(input + 8);
-  in30 = LD_SH(input + 112);
-  in31 = LD_SH(input + 120);
-
-  vec4 = in17 - in22;
-  ST_SH(vec4, input + 16);
-  vec4 = in16 - in23;
-  ST_SH(vec4, input + 24);
-  vec4 = in31 - in24;
-  ST_SH(vec4, input + 96);
-  vec4 = in30 - in25;
-  ST_SH(vec4, input + 104);
-
-  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
-  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
-  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
-  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
-  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
-  ADD2(in27, in26, in25, in24, in23, in20);
-  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr);
-  ST_SH(vec4, temp_ptr + 960);
-
-  SUB2(in27, in26, in25, in24, in22, in21);
-  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr + 448);
-  ST_SH(vec4, temp_ptr + 512);
-
-  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
-  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
-  SUB2(in26, in27, in24, in25, in23, in20);
-  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec4, temp_ptr + 704);
-  ST_SH(vec5, temp_ptr + 256);
-
-  ADD2(in26, in27, in24, in25, in22, in21);
-  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec4, temp_ptr + 192);
-  ST_SH(vec5, temp_ptr + 768);
-
-  LD_SH4(input + 16, 8, in22, in23, in20, in21);
-  LD_SH4(input + 80, 8, in26, in27, in24, in25);
-  in16 = in20;
-  in17 = in21;
-  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
-  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
-  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
-  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
-  ADD2(in28, in29, in31, in30, in16, in19);
-  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr + 832);
-  ST_SH(vec4, temp_ptr + 128);
-
-  SUB2(in28, in29, in31, in30, in17, in18);
-  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr + 320);
-  ST_SH(vec4, temp_ptr + 640);
-  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
-  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
-  SUB2(in29, in28, in30, in31, in16, in19);
-  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr + 576);
-  ST_SH(vec4, temp_ptr + 384);
-
-  ADD2(in29, in28, in30, in31, in17, in18);
-  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr + 64);
-  ST_SH(vec4, temp_ptr + 896);
-}
-
-static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
-                               int16_t *tmp_buf, int16_t *tmp_buf_big) {
-  fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
-  fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
-  fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
-}
-
-static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
-                                           int16_t *output) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 step0, step1, step2, step3, step4, step5, step6, step7;
-
-  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                     in10, in11, in12, in13, in14, in15);
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
-               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
-  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
-  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);
-
-  /* 2nd set */
-  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                     in10, in11, in12, in13, in14, in15);
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
-               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
-  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
-         (output + 8 * 8), 8);
-  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
-}
-
-static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
-                                    int16_t *out) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
-  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
-  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;
-
-  /* fdct32 even */
-  /* stage 2 */
-  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
-
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
-               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
-  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
-  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
-
-  /* Stage 3 */
-  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
-  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
-  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
-  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
-  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
-  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
-  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
-  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
-  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w,
-       tmp1_w, tmp2_w, tmp3_w);
-  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
-  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r,
-       vec1_r, vec2_r, vec3_r);
-
-  tmp3_w = vec0_r + vec3_r;
-  vec0_r = vec0_r - vec3_r;
-  vec3_r = vec1_r + vec2_r;
-  vec1_r = vec1_r - vec2_r;
-
-  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
-                    vec4_r, tmp3_w, vec6_r, vec3_r);
-  FDCT32_POSTPROC_NEG_W(vec4_r);
-  FDCT32_POSTPROC_NEG_W(tmp3_w);
-  FDCT32_POSTPROC_NEG_W(vec6_r);
-  FDCT32_POSTPROC_NEG_W(vec3_r);
-  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
-  ST_SH2(vec5, vec4, out, 8);
-
-  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
-                    vec4_r, tmp3_w, vec6_r, vec3_r);
-  FDCT32_POSTPROC_NEG_W(vec4_r);
-  FDCT32_POSTPROC_NEG_W(tmp3_w);
-  FDCT32_POSTPROC_NEG_W(vec6_r);
-  FDCT32_POSTPROC_NEG_W(vec3_r);
-  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
-  ST_SH2(vec5, vec4, out + 16, 8);
-
-  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
-  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
-  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
-  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 32);
-  ST_SH(in5, out + 56);
-
-  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
-  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 40);
-  ST_SH(in5, out + 48);
-
-  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
-  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
-  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
-  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
-  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
-  ADD2(in0, in1, in2, in3, vec0, vec7);
-  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 64);
-  ST_SH(in5, out + 120);
-
-  SUB2(in0, in1, in2, in3, in0, in2);
-  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 72);
-  ST_SH(in5, out + 112);
-
-  SUB2(in9, vec2, in14, vec5, vec2, vec5);
-  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
-  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
-  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 80);
-  ST_SH(in5, out + 104);
-
-  ADD2(in3, in2, in0, in1, vec3, vec4);
-  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 96);
-  ST_SH(in5, out + 88);
-}
-
-static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
-
-  /* fdct32 even */
-  /* stage 2 */
-  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
-
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
-               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
-
-  /* Stage 3 */
-  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
-  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
-  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out);
-  ST_SH(temp1, out + 8);
-
-  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 16);
-  ST_SH(temp1, out + 24);
-
-  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
-  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
-  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 32);
-  ST_SH(temp1, out + 56);
-
-  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
-  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 40);
-  ST_SH(temp1, out + 48);
-
-  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
-  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
-  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
-  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
-  ADD2(in0, in1, in2, in3, vec0, vec7);
-  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 64);
-  ST_SH(temp1, out + 120);
-
-  SUB2(in0, in1, in2, in3, in0, in2);
-  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 72);
-  ST_SH(temp1, out + 112);
-
-  SUB2(in9, vec2, in14, vec5, vec2, vec5);
-  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
-  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5)
-  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 80);
-  ST_SH(temp1, out + 104);
-
-  ADD2(in3, in2, in0, in1, vec3, vec4);
-  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 96);
-  ST_SH(temp1, out + 88);
-}
-
-static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
-                                int16_t *out) {
-  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
-  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
-
-  in20 = LD_SH(temp + 32);
-  in21 = LD_SH(temp + 40);
-  in26 = LD_SH(temp + 80);
-  in27 = LD_SH(temp + 88);
-
-  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
-  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
-
-  in18 = LD_SH(temp + 16);
-  in19 = LD_SH(temp + 24);
-  in28 = LD_SH(temp + 96);
-  in29 = LD_SH(temp + 104);
-
-  vec4 = in19 - in20;
-  ST_SH(vec4, interm_ptr + 32);
-  vec4 = in18 - in21;
-  ST_SH(vec4, interm_ptr + 88);
-  vec4 = in28 - in27;
-  ST_SH(vec4, interm_ptr + 56);
-  vec4 = in29 - in26;
-  ST_SH(vec4, interm_ptr + 64);
-
-  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
-
-  in22 = LD_SH(temp + 48);
-  in23 = LD_SH(temp + 56);
-  in24 = LD_SH(temp + 64);
-  in25 = LD_SH(temp + 72);
-
-  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
-  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
-
-  in16 = LD_SH(temp);
-  in17 = LD_SH(temp + 8);
-  in30 = LD_SH(temp + 112);
-  in31 = LD_SH(temp + 120);
-
-  vec4 = in17 - in22;
-  ST_SH(vec4, interm_ptr + 40);
-  vec4 = in30 - in25;
-  ST_SH(vec4, interm_ptr + 48);
-  vec4 = in31 - in24;
-  ST_SH(vec4, interm_ptr + 72);
-  vec4 = in16 - in23;
-  ST_SH(vec4, interm_ptr + 80);
-
-  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
-  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
-  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
-
-  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
-  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
-  ADD2(in27, in26, in25, in24, in23, in20);
-
-  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec5, out);
-  ST_SH(vec4, out + 120);
-
-  SUB2(in27, in26, in25, in24, in22, in21);
-
-  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec5, out + 112);
-  ST_SH(vec4, out + 8);
-
-  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
-  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
-  SUB2(in26, in27, in24, in25, in23, in20);
-
-  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec4, out + 16);
-  ST_SH(vec5, out + 104);
-
-  ADD2(in26, in27, in24, in25, in22, in21);
-  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec4, out + 24);
-  ST_SH(vec5, out + 96);
-
-  in20 = LD_SH(interm_ptr + 32);
-  in21 = LD_SH(interm_ptr + 88);
-  in27 = LD_SH(interm_ptr + 56);
-  in26 = LD_SH(interm_ptr + 64);
-
-  in16 = in20;
-  in17 = in21;
-  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
-  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
-
-  in22 = LD_SH(interm_ptr + 40);
-  in25 = LD_SH(interm_ptr + 48);
-  in24 = LD_SH(interm_ptr + 72);
-  in23 = LD_SH(interm_ptr + 80);
-
-  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
-  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
-  ADD2(in28, in29, in31, in30, in16, in19);
-  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec5, out + 32);
-  ST_SH(vec4, out + 88);
-
-  SUB2(in28, in29, in31, in30, in17, in18);
-  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec5, out + 40);
-  ST_SH(vec4, out + 80);
-
-  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
-  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
-  SUB2(in29, in28, in30, in31, in16, in19);
-
-  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec5, out + 72);
-  ST_SH(vec4, out + 48);
-
-  ADD2(in29, in28, in30, in31, in17, in18);
-
-  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec4, out + 56);
-  ST_SH(vec5, out + 64);
-}
-
-static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
-
-  /* 1st set */
-  in0 = LD_SH(temp);
-  in4 = LD_SH(temp + 32);
-  in2 = LD_SH(temp + 64);
-  in6 = LD_SH(temp + 96);
-  in1 = LD_SH(temp + 128);
-  in7 = LD_SH(temp + 152);
-  in3 = LD_SH(temp + 192);
-  in5 = LD_SH(temp + 216);
-
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-
-  /* 2nd set */
-  in0_1 = LD_SH(temp + 16);
-  in1_1 = LD_SH(temp + 232);
-  in2_1 = LD_SH(temp + 80);
-  in3_1 = LD_SH(temp + 168);
-  in4_1 = LD_SH(temp + 48);
-  in5_1 = LD_SH(temp + 176);
-  in6_1 = LD_SH(temp + 112);
-  in7_1 = LD_SH(temp + 240);
-
-  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
-  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
-                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
-
-  /* 3rd set */
-  in0 = LD_SH(temp + 8);
-  in1 = LD_SH(temp + 136);
-  in2 = LD_SH(temp + 72);
-  in3 = LD_SH(temp + 200);
-  in4 = LD_SH(temp + 40);
-  in5 = LD_SH(temp + 208);
-  in6 = LD_SH(temp + 104);
-  in7 = LD_SH(temp + 144);
-
-  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8,
-         32);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);
-
-  /* 4th set */
-  in0_1 = LD_SH(temp + 24);
-  in1_1 = LD_SH(temp + 224);
-  in2_1 = LD_SH(temp + 88);
-  in3_1 = LD_SH(temp + 160);
-  in4_1 = LD_SH(temp + 56);
-  in5_1 = LD_SH(temp + 184);
-  in6_1 = LD_SH(temp + 120);
-  in7_1 = LD_SH(temp + 248);
-
-  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
-                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
-  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24,
-         32);
-}
-
-static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
-  fdct8x32_1d_row_load_butterfly(temp, temp_buf);
-  fdct8x32_1d_row_even(temp_buf, temp_buf);
-  fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
-  fdct8x32_1d_row_transpose_store(temp_buf, output);
-}
-
-static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
-                               int16_t *output) {
-  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
-  fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
-  fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
-  fdct8x32_1d_row_transpose_store(tmp_buf, output);
-}
-
-void aom_fdct32x32_msa(const int16_t *input, int16_t *output,
-                       int32_t src_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
-  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
-
-  /* column transform */
-  for (i = 0; i < 4; ++i) {
-    fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
-                       tmp_buf_big + (8 * i));
-  }
-
-  /* row transform */
-  fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
-
-  /* row transform */
-  for (i = 1; i < 4; ++i) {
-    fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
-  }
-}
-
-static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
-
-  /* fdct32 even */
-  /* stage 2 */
-  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
-
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
-               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
-  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
-  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
-  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
-  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
-  FDCT_POSTPROC_2V_NEG_H(in8, in9);
-  FDCT_POSTPROC_2V_NEG_H(in10, in11);
-  FDCT_POSTPROC_2V_NEG_H(in12, in13);
-  FDCT_POSTPROC_2V_NEG_H(in14, in15);
-
-  /* Stage 3 */
-  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
-
-  temp0 = in0 + in3;
-  in0 = in0 - in3;
-  in3 = in1 + in2;
-  in1 = in1 - in2;
-
-  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
-  ST_SH(temp0, out);
-  ST_SH(temp1, out + 8);
-
-  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
-  ST_SH(temp0, out + 16);
-  ST_SH(temp1, out + 24);
-
-  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
-  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
-  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
-  ST_SH(temp0, out + 32);
-  ST_SH(temp1, out + 56);
-
-  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
-  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
-  ST_SH(temp0, out + 40);
-  ST_SH(temp1, out + 48);
-
-  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
-  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
-  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
-  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
-  ADD2(in0, in1, in2, in3, vec0, vec7);
-  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
-  ST_SH(temp0, out + 64);
-  ST_SH(temp1, out + 120);
-
-  SUB2(in0, in1, in2, in3, in0, in2);
-  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
-  ST_SH(temp0, out + 72);
-  ST_SH(temp1, out + 112);
-
-  SUB2(in9, vec2, in14, vec5, vec2, vec5);
-  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
-  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
-  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
-  ST_SH(temp0, out + 80);
-  ST_SH(temp1, out + 104);
-
-  ADD2(in3, in2, in0, in1, vec3, vec4);
-  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
-  ST_SH(temp0, out + 96);
-  ST_SH(temp1, out + 88);
-}
-
-static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
-                                   int16_t *out) {
-  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
-  v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
-  v8i16 vec4, vec5;
-
-  in20 = LD_SH(temp + 32);
-  in21 = LD_SH(temp + 40);
-  in26 = LD_SH(temp + 80);
-  in27 = LD_SH(temp + 88);
-
-  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
-  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
-
-  FDCT_POSTPROC_2V_NEG_H(in20, in21);
-  FDCT_POSTPROC_2V_NEG_H(in26, in27);
-
-  in18 = LD_SH(temp + 16);
-  in19 = LD_SH(temp + 24);
-  in28 = LD_SH(temp + 96);
-  in29 = LD_SH(temp + 104);
-
-  FDCT_POSTPROC_2V_NEG_H(in18, in19);
-  FDCT_POSTPROC_2V_NEG_H(in28, in29);
-
-  vec4 = in19 - in20;
-  ST_SH(vec4, interm_ptr + 32);
-  vec4 = in18 - in21;
-  ST_SH(vec4, interm_ptr + 88);
-  vec4 = in29 - in26;
-  ST_SH(vec4, interm_ptr + 64);
-  vec4 = in28 - in27;
-  ST_SH(vec4, interm_ptr + 56);
-
-  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
-
-  in22 = LD_SH(temp + 48);
-  in23 = LD_SH(temp + 56);
-  in24 = LD_SH(temp + 64);
-  in25 = LD_SH(temp + 72);
-
-  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
-  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
-  FDCT_POSTPROC_2V_NEG_H(in22, in23);
-  FDCT_POSTPROC_2V_NEG_H(in24, in25);
-
-  in16 = LD_SH(temp);
-  in17 = LD_SH(temp + 8);
-  in30 = LD_SH(temp + 112);
-  in31 = LD_SH(temp + 120);
-
-  FDCT_POSTPROC_2V_NEG_H(in16, in17);
-  FDCT_POSTPROC_2V_NEG_H(in30, in31);
-
-  vec4 = in17 - in22;
-  ST_SH(vec4, interm_ptr + 40);
-  vec4 = in30 - in25;
-  ST_SH(vec4, interm_ptr + 48);
-  vec4 = in31 - in24;
-  ST_SH(vec4, interm_ptr + 72);
-  vec4 = in16 - in23;
-  ST_SH(vec4, interm_ptr + 80);
-
-  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
-  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
-  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
-  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
-  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
-  ADD2(in27, in26, in25, in24, in23, in20);
-  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
-  ST_SH(vec5, out);
-  ST_SH(vec4, out + 120);
-
-  SUB2(in27, in26, in25, in24, in22, in21);
-  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
-  ST_SH(vec5, out + 112);
-  ST_SH(vec4, out + 8);
-
-  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
-  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
-  SUB2(in26, in27, in24, in25, in23, in20);
-  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
-  ST_SH(vec4, out + 16);
-  ST_SH(vec5, out + 104);
-
-  ADD2(in26, in27, in24, in25, in22, in21);
-  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
-  ST_SH(vec4, out + 24);
-  ST_SH(vec5, out + 96);
-
-  in20 = LD_SH(interm_ptr + 32);
-  in21 = LD_SH(interm_ptr + 88);
-  in27 = LD_SH(interm_ptr + 56);
-  in26 = LD_SH(interm_ptr + 64);
-
-  in16 = in20;
-  in17 = in21;
-  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
-  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
-
-  in22 = LD_SH(interm_ptr + 40);
-  in25 = LD_SH(interm_ptr + 48);
-  in24 = LD_SH(interm_ptr + 72);
-  in23 = LD_SH(interm_ptr + 80);
-
-  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
-  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
-  in16 = in28 + in29;
-  in19 = in31 + in30;
-  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
-  ST_SH(vec5, out + 32);
-  ST_SH(vec4, out + 88);
-
-  SUB2(in28, in29, in31, in30, in17, in18);
-  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
-  ST_SH(vec5, out + 40);
-  ST_SH(vec4, out + 80);
-
-  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
-  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
-  SUB2(in29, in28, in30, in31, in16, in19);
-  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
-  ST_SH(vec5, out + 72);
-  ST_SH(vec4, out + 48);
-
-  ADD2(in29, in28, in30, in31, in17, in18);
-  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
-  ST_SH(vec4, out + 56);
-  ST_SH(vec5, out + 64);
-}
-
-static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
-                               int16_t *output) {
-  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
-  fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
-  fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
-  fdct8x32_1d_row_transpose_store(tmp_buf, output);
-}
-
-void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
-                          int32_t src_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
-  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
-
-  /* column transform */
-  for (i = 0; i < 4; ++i) {
-    fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
-                       &tmp_buf_big[0] + (8 * i));
-  }
-
-  /* row transform */
-  for (i = 0; i < 4; ++i) {
-    fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
-                       out + (8 * i * 32));
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c
deleted file mode 100644
index 7a285b7b8..000000000
--- a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/fwd_txfm_msa.h"
-
-void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
-                        int32_t src_stride) {
-  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
-  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
-  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,
-                  -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
-  v8i16 coeff1 = { cospi_2_64,  cospi_30_64, cospi_14_64, cospi_18_64,
-                   cospi_10_64, cospi_22_64, cospi_6_64,  cospi_26_64 };
-  v8i16 coeff2 = {
-    -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0
-  };
-
-  LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
-          in10, in11, in12, in13, in14, in15);
-  SLLI_4V(in0, in1, in2, in3, 2);
-  SLLI_4V(in4, in5, in6, in7, 2);
-  SLLI_4V(in8, in9, in10, in11, 2);
-  SLLI_4V(in12, in13, in14, in15, 2);
-  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
-  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
-  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
-                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
-  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
-  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
-  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
-
-  tmp_ptr += 16;
-
-  /* stp 1 */
-  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
-  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);
-
-  cnst4 = __msa_splati_h(coeff, 0);
-  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);
-
-  cnst5 = __msa_splati_h(coeff, 1);
-  cnst5 = __msa_ilvev_h(cnst5, cnst4);
-  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
-  stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
-  stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);
-
-  /* stp2 */
-  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
-  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
-  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
-  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
-  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
-  cnst0 = __msa_ilvev_h(cnst0, cnst1);
-  stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);
-
-  cnst0 = __msa_splati_h(coeff, 4);
-  cnst1 = __msa_ilvev_h(cnst1, cnst0);
-  stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);
-
-  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
-  ILVRL_H2_SH(in15, in8, vec1, vec0);
-  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
-  cnst0 = __msa_ilvev_h(cnst0, cnst1);
-
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr);
-
-  cnst0 = __msa_splati_h(coeff2, 0);
-  cnst0 = __msa_ilvev_h(cnst1, cnst0);
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr + 224);
-
-  ILVRL_H2_SH(in14, in9, vec1, vec0);
-  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
-  cnst1 = __msa_ilvev_h(cnst1, cnst0);
-
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
-  ST_SH(in8, tmp_ptr + 128);
-
-  cnst1 = __msa_splati_h(coeff2, 2);
-  cnst0 = __msa_ilvev_h(cnst0, cnst1);
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr + 96);
-
-  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
-  cnst1 = __msa_ilvev_h(cnst1, cnst0);
-
-  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
-
-  cnst1 = __msa_splati_h(coeff, 3);
-  cnst1 = __msa_ilvev_h(cnst0, cnst1);
-  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
-
-  /* stp4 */
-  ADD2(stp34, stp25, stp33, stp22, in13, in10);
-
-  ILVRL_H2_SH(in13, in10, vec1, vec0);
-  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
-  cnst0 = __msa_ilvev_h(cnst0, cnst1);
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr + 64);
-
-  cnst0 = __msa_splati_h(coeff2, 1);
-  cnst0 = __msa_ilvev_h(cnst1, cnst0);
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr + 160);
-
-  SUB2(stp34, stp25, stp33, stp22, in12, in11);
-  ILVRL_H2_SH(in12, in11, vec1, vec0);
-  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
-  cnst1 = __msa_ilvev_h(cnst1, cnst0);
-
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
-  ST_SH(in8, tmp_ptr + 192);
-
-  cnst1 = __msa_splati_h(coeff2, 3);
-  cnst0 = __msa_ilvev_h(cnst0, cnst1);
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr + 32);
-}
-
-void fdct16x8_1d_row(int16_t *input, int16_t *output) {
-  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-
-  LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                     in10, in11, in12, in13, in14, in15);
-  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
-  ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
-  ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
-  ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
-  SRA_4V(in0, in1, in2, in3, 2);
-  SRA_4V(in4, in5, in6, in7, 2);
-  SRA_4V(in8, in9, in10, in11, 2);
-  SRA_4V(in12, in13, in14, in15, 2);
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
-               tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
-  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
-  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
-                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
-  LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
-  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
-               in4, in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
-                     tmp1, in1, tmp2, in2, tmp3, in3);
-  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
-  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
-                     tmp5, in5, tmp6, in6, tmp7, in7);
-  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
-}
-
-void aom_fdct4x4_msa(const int16_t *input, int16_t *output,
-                     int32_t src_stride) {
-  v8i16 in0, in1, in2, in3;
-
-  LD_SH4(input, src_stride, in0, in1, in2, in3);
-
-  /* fdct4 pre-process */
-  {
-    v8i16 vec, mask;
-    v16i8 zero = { 0 };
-    v16i8 one = __msa_ldi_b(1);
-
-    mask = (v8i16)__msa_sldi_b(zero, one, 15);
-    SLLI_4V(in0, in1, in2, in3, 4);
-    vec = __msa_ceqi_h(in0, 0);
-    vec = vec ^ 255;
-    vec = mask & vec;
-    in0 += vec;
-  }
-
-  AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
-  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-  AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
-  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
-  SRA_4V(in0, in1, in2, in3, 2);
-  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
-  ST_SH2(in0, in2, output, 8);
-}
-
-void aom_fdct8x8_msa(const int16_t *input, int16_t *output,
-                     int32_t src_stride) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-
-  LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
-  SLLI_4V(in0, in1, in2, in3, 2);
-  SLLI_4V(in4, in5, in6, in7, 2);
-  AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-            in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-            in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
-  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
-}
-
-void aom_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  out[0] = LD_HADD(input, stride);
-  out[1] = 0;
-}
-
-void aom_fdct16x16_msa(const int16_t *input, int16_t *output,
-                       int32_t src_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
-
-  /* column transform */
-  for (i = 0; i < 2; ++i) {
-    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
-  }
-
-  /* row transform */
-  for (i = 0; i < 2; ++i) {
-    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h
deleted file mode 100644
index ada25dffd..000000000
--- a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_FWD_TXFM_MSA_H_
-#define AOM_DSP_MIPS_FWD_TXFM_MSA_H_
-
-#include "aom_dsp/mips/txfm_macros_msa.h"
-#include "aom_dsp/txfm_common.h"
-
-#define LD_HADD(psrc, stride)                                                  \
-  ({                                                                           \
-    v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m;              \
-    v4i32 vec_w_m;                                                             \
-                                                                               \
-    LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m);                        \
-    ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m);                            \
-    LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m);         \
-    ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \
-         in0_m, in4_m);                                                        \
-    in0_m += in4_m;                                                            \
-                                                                               \
-    vec_w_m = __msa_hadd_s_w(in0_m, in0_m);                                    \
-    HADD_SW_S32(vec_w_m);                                                      \
-  })
-
-#define AOM_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3)                  \
-  {                                                                            \
-    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m;                                  \
-    v8i16 vec0_m, vec1_m, vec2_m, vec3_m;                                      \
-    v4i32 vec4_m, vec5_m, vec6_m, vec7_m;                                      \
-    v8i16 coeff_m = {                                                          \
-      cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \
-    };                                                                         \
-                                                                               \
-    BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m);           \
-    ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m);                \
-    SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m);                             \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-    vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m);                                  \
-                                                                               \
-    SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m);                             \
-    cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m);                                 \
-    vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m);                                  \
-                                                                               \
-    vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m);                                  \
-    cnst2_m = __msa_splati_h(coeff_m, 2);                                      \
-    cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m);                                 \
-    vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m);                                  \
-                                                                               \
-    SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS);               \
-    PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m,        \
-                vec7_m, out0, out2, out1, out3);                               \
-  }
-
-#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7)              \
-  {                                                                          \
-    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
-                                                                             \
-    SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15);      \
-    SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15);      \
-    AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \
-               in2, in3);                                                    \
-    AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \
-               in6, in7);                                                    \
-  }
-
-#define AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,  \
-                  out3, out4, out5, out6, out7)                              \
-  {                                                                          \
-    v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m;                          \
-    v8i16 s7_m, x0_m, x1_m, x2_m, x3_m;                                      \
-    v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,   \
-                      cospi_4_64,  cospi_28_64,  cospi_12_64, cospi_20_64 }; \
-                                                                             \
-    /* FDCT stage1 */                                                        \
-    BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m,    \
-                s3_m, s4_m, s5_m, s6_m, s7_m);                               \
-    BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);             \
-    ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);                          \
-    ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);                          \
-    SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m);                                 \
-    x1_m = __msa_ilvev_h(x1_m, x0_m);                                        \
-    out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m);                                 \
-    x2_m = -x2_m;                                                            \
-    x2_m = __msa_ilvev_h(x3_m, x2_m);                                        \
-    out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                          \
-                                                                             \
-    out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                          \
-    x2_m = __msa_splati_h(coeff_m, 2);                                       \
-    x2_m = __msa_ilvev_h(x2_m, x3_m);                                        \
-    out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                          \
-                                                                             \
-    /* stage2 */                                                             \
-    ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m);                                     \
-                                                                             \
-    s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                          \
-    s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                          \
-                                                                             \
-    /* stage3 */                                                             \
-    BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);             \
-                                                                             \
-    /* stage4 */                                                             \
-    ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);                          \
-    ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m);                                 \
-    x1_m = __msa_ilvev_h(x0_m, x1_m);                                        \
-    out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m);                                 \
-    x2_m = __msa_ilvev_h(x3_m, x2_m);                                        \
-    out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                          \
-                                                                             \
-    x1_m = __msa_splati_h(coeff_m, 5);                                       \
-    x0_m = -x0_m;                                                            \
-    x0_m = __msa_ilvev_h(x1_m, x0_m);                                        \
-    out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m);                          \
-                                                                             \
-    x2_m = __msa_splati_h(coeff_m, 6);                                       \
-    x3_m = -x3_m;                                                            \
-    x2_m = __msa_ilvev_h(x2_m, x3_m);                                        \
-    out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                          \
-  }
-
-#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,    \
-                      out2, out3, out4, out5, out6, out7)                    \
-  {                                                                          \
-    v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;                    \
-    v8i16 x0_m, x1_m, x2_m, x3_m;                                            \
-    v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,   \
-                      cospi_4_64,  cospi_28_64,  cospi_12_64, cospi_20_64 }; \
-                                                                             \
-    /* FDCT stage1 */                                                        \
-    BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m,    \
-                s3_m, s4_m, s5_m, s6_m, s7_m);                               \
-    BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);             \
-    ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);                          \
-    ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);                          \
-    SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m);                                 \
-    x1_m = __msa_ilvev_h(x1_m, x0_m);                                        \
-    out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m);                                 \
-    x2_m = -x2_m;                                                            \
-    x2_m = __msa_ilvev_h(x3_m, x2_m);                                        \
-    out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                          \
-                                                                             \
-    out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                          \
-    x2_m = __msa_splati_h(coeff_m, 2);                                       \
-    x2_m = __msa_ilvev_h(x2_m, x3_m);                                        \
-    out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                          \
-                                                                             \
-    /* stage2 */                                                             \
-    ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m);                                     \
-                                                                             \
-    s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                          \
-    s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                          \
-                                                                             \
-    /* stage3 */                                                             \
-    BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);             \
-                                                                             \
-    /* stage4 */                                                             \
-    ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);                          \
-    ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m);                                 \
-    x1_m = __msa_ilvev_h(x0_m, x1_m);                                        \
-    out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m);                                 \
-    x2_m = __msa_ilvev_h(x3_m, x2_m);                                        \
-    out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                          \
-                                                                             \
-    x1_m = __msa_splati_h(coeff_m, 5);                                       \
-    x0_m = -x0_m;                                                            \
-    x0_m = __msa_ilvev_h(x1_m, x0_m);                                        \
-    out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m);                          \
-                                                                             \
-    x2_m = __msa_splati_h(coeff_m, 6);                                       \
-    x3_m = -x3_m;                                                            \
-    x2_m = __msa_ilvev_h(x2_m, x3_m);                                        \
-    out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                          \
-  }
-
-#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6,   \
-                     input7, out1, out3, out5, out7, out9, out11, out13,       \
-                     out15)                                                    \
-  {                                                                            \
-    v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m;                \
-    v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m;                \
-    v8i16 stp36_m, stp37_m, vec0_m, vec1_m;                                    \
-    v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m;                              \
-    v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m;                                  \
-    v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,     \
-                      -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };   \
-    v8i16 coeff1_m = { cospi_2_64,  cospi_30_64, cospi_14_64, cospi_18_64,     \
-                       cospi_10_64, cospi_22_64, cospi_6_64,  cospi_26_64 };   \
-    v8i16 coeff2_m = {                                                         \
-      -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0        \
-    };                                                                         \
-                                                                               \
-    /* stp 1 */                                                                \
-    ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m);                \
-    ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m);                \
-                                                                               \
-    cnst4_m = __msa_splati_h(coeff_m, 0);                                      \
-    stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m);                  \
-                                                                               \
-    cnst5_m = __msa_splati_h(coeff_m, 1);                                      \
-    cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m);                                 \
-    stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m);                  \
-    stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m);                  \
-    stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m);                  \
-                                                                               \
-    /* stp2 */                                                                 \
-    BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m,   \
-                stp33_m);                                                      \
-    BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m,   \
-                stp34_m);                                                      \
-                                                                               \
-    ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m);            \
-    ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m);            \
-                                                                               \
-    SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m);                             \
-    cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-    stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                  \
-                                                                               \
-    cnst0_m = __msa_splati_h(coeff_m, 4);                                      \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-    stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                  \
-                                                                               \
-    SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m);                             \
-    cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-    stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m);                  \
-                                                                               \
-    cnst0_m = __msa_splati_h(coeff_m, 3);                                      \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-    stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m);                  \
-                                                                               \
-    /* stp4 */                                                                 \
-    BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m,    \
-                vec5_m);                                                       \
-    BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \
-                stp31_m);                                                      \
-                                                                               \
-    ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m);                               \
-    SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m);                            \
-    cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-                                                                               \
-    out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                     \
-                                                                               \
-    cnst0_m = __msa_splati_h(coeff2_m, 0);                                     \
-    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-    out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                    \
-                                                                               \
-    ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m);                               \
-    SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                            \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-                                                                               \
-    out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                     \
-                                                                               \
-    cnst1_m = __msa_splati_h(coeff2_m, 2);                                     \
-    cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-    out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                     \
-                                                                               \
-    ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m);                             \
-    SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m);                            \
-    cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-    out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                     \
-                                                                               \
-    cnst0_m = __msa_splati_h(coeff2_m, 1);                                     \
-    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-    out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                    \
-                                                                               \
-    ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m);                             \
-    SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m);                            \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-                                                                               \
-    out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                    \
-                                                                               \
-    cnst1_m = __msa_splati_h(coeff2_m, 3);                                     \
-    cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-    out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                     \
-  }
-
-#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
-  {                                        \
-    v8i16 tp0_m, tp1_m;                    \
-    v8i16 one_m = __msa_ldi_h(1);          \
-                                           \
-    tp0_m = __msa_clti_s_h(vec0, 0);       \
-    tp1_m = __msa_clti_s_h(vec1, 0);       \
-    vec0 += 1;                             \
-    vec1 += 1;                             \
-    tp0_m = one_m & tp0_m;                 \
-    tp1_m = one_m & tp1_m;                 \
-    vec0 += tp0_m;                         \
-    vec1 += tp1_m;                         \
-    vec0 >>= 2;                            \
-    vec1 >>= 2;                            \
-  }
-
-#define FDCT32_POSTPROC_NEG_W(vec)   \
-  {                                  \
-    v4i32 temp_m;                    \
-    v4i32 one_m = __msa_ldi_w(1);    \
-                                     \
-    temp_m = __msa_clti_s_w(vec, 0); \
-    vec += 1;                        \
-    temp_m = one_m & temp_m;         \
-    vec += temp_m;                   \
-    vec >>= 2;                       \
-  }
-
-#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1)        \
-  {                                                 \
-    v8i16 tp0_m, tp1_m;                             \
-    v8i16 one = __msa_ldi_h(1);                     \
-                                                    \
-    tp0_m = __msa_clei_s_h(vec0, 0);                \
-    tp1_m = __msa_clei_s_h(vec1, 0);                \
-    tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \
-    tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \
-    vec0 += 1;                                      \
-    vec1 += 1;                                      \
-    tp0_m = one & tp0_m;                            \
-    tp1_m = one & tp1_m;                            \
-    vec0 += tp0_m;                                  \
-    vec1 += tp1_m;                                  \
-    vec0 >>= 2;                                     \
-    vec1 >>= 2;                                     \
-  }
-
-#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
-                          const0, const1, out0, out1, out2, out3)       \
-  {                                                                     \
-    v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;               \
-    v2i64 tp0_m, tp1_m, tp2_m, tp3_m;                                   \
-    v4i32 k0_m = __msa_fill_w((int32_t)const0);                         \
-                                                                        \
-    s0_m = __msa_fill_w((int32_t)const1);                               \
-    k0_m = __msa_ilvev_w(s0_m, k0_m);                                   \
-                                                                        \
-    ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m);                     \
-    ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m);                      \
-    ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m);                   \
-    ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m);                    \
-                                                                        \
-    DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m);                  \
-    DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m);                  \
-    tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                       \
-    tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                       \
-    tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                       \
-    tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                       \
-    out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);                   \
-    out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);                   \
-                                                                        \
-    DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m);                  \
-    DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m);                  \
-    tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                       \
-    tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                       \
-    tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                       \
-    tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                       \
-    out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);                   \
-    out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);                   \
-  }
-
-void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
-                        int32_t src_stride);
-void fdct16x8_1d_row(int16_t *input, int16_t *output);
-#endif  // AOM_DSP_MIPS_FWD_TXFM_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/idct16x16_msa.c b/third_party/aom/aom_dsp/mips/idct16x16_msa.c
deleted file mode 100644
index 0ea127f52..000000000
--- a/third_party/aom/aom_dsp/mips/idct16x16_msa.c
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
-  v8i16 loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
-  v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
-  v8i16 tmp5, tmp6, tmp7;
-
-  LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-  input += 8;
-  LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
-
-  TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1,
-                     reg2, reg3, reg4, reg5, reg6, reg7);
-  TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8,
-                     reg9, reg10, reg11, reg12, reg13, reg14, reg15);
-  DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
-  DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
-  BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
-  DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
-  DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
-  DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
-  BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
-  SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4,
-       reg8);
-  ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6,
-       reg10);
-
-  /* stage 2 */
-  DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
-  DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
-
-  reg9 = reg1 - loc2;
-  reg1 = reg1 + loc2;
-  reg7 = reg15 - loc3;
-  reg15 = reg15 + loc3;
-
-  DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
-  DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
-  BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
-
-  loc1 = reg15 + reg3;
-  reg3 = reg15 - reg3;
-  loc2 = reg2 + loc1;
-  reg15 = reg2 - loc1;
-
-  loc1 = reg1 + reg13;
-  reg13 = reg1 - reg13;
-  loc0 = reg0 + loc1;
-  loc1 = reg0 - loc1;
-  tmp6 = loc0;
-  tmp7 = loc1;
-  reg0 = loc2;
-
-  DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
-  DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
-
-  loc0 = reg9 + reg5;
-  reg5 = reg9 - reg5;
-  reg2 = reg6 + loc0;
-  reg1 = reg6 - loc0;
-
-  loc0 = reg7 + reg11;
-  reg11 = reg7 - reg11;
-  loc1 = reg4 + loc0;
-  loc2 = reg4 - loc0;
-  tmp5 = loc1;
-
-  DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
-  BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
-
-  reg10 = loc0;
-  reg11 = loc1;
-
-  DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
-  BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
-
-  reg13 = loc2;
-
-  /* Transpose and store the output */
-  reg12 = tmp5;
-  reg14 = tmp6;
-  reg3 = tmp7;
-
-  /* transpose block */
-  TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0,
-                     reg2, reg4, reg6, reg8, reg10, reg12, reg14);
-  ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);
-
-  /* transpose block */
-  TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3,
-                     reg13, reg11, reg5, reg7, reg9, reg1, reg15);
-  ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
-}
-
-void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
-                                      int32_t dst_stride) {
-  v8i16 loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
-  v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
-  v8i16 tmp5, tmp6, tmp7;
-
-  /* load up 8x8 */
-  LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-  input += 8 * 16;
-  /* load bottom 8x8 */
-  LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
-
-  DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
-  DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
-  BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
-  DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
-  DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
-  DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
-  BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
-
-  reg0 = reg2 - loc1;
-  reg2 = reg2 + loc1;
-  reg12 = reg14 - loc0;
-  reg14 = reg14 + loc0;
-  reg4 = reg6 - loc3;
-  reg6 = reg6 + loc3;
-  reg8 = reg10 - loc2;
-  reg10 = reg10 + loc2;
-
-  /* stage 2 */
-  DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
-  DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
-
-  reg9 = reg1 - loc2;
-  reg1 = reg1 + loc2;
-  reg7 = reg15 - loc3;
-  reg15 = reg15 + loc3;
-
-  DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
-  DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
-  BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
-
-  loc1 = reg15 + reg3;
-  reg3 = reg15 - reg3;
-  loc2 = reg2 + loc1;
-  reg15 = reg2 - loc1;
-
-  loc1 = reg1 + reg13;
-  reg13 = reg1 - reg13;
-  loc0 = reg0 + loc1;
-  loc1 = reg0 - loc1;
-  tmp6 = loc0;
-  tmp7 = loc1;
-  reg0 = loc2;
-
-  DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
-  DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
-
-  loc0 = reg9 + reg5;
-  reg5 = reg9 - reg5;
-  reg2 = reg6 + loc0;
-  reg1 = reg6 - loc0;
-
-  loc0 = reg7 + reg11;
-  reg11 = reg7 - reg11;
-  loc1 = reg4 + loc0;
-  loc2 = reg4 - loc0;
-  tmp5 = loc1;
-
-  DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
-  BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
-
-  reg10 = loc0;
-  reg11 = loc1;
-
-  DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
-  BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
-  reg13 = loc2;
-
-  /* Transpose and store the output */
-  reg12 = tmp5;
-  reg14 = tmp6;
-  reg3 = tmp7;
-
-  SRARI_H4_SH(reg0, reg2, reg4, reg6, 6);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
-  dst += (4 * dst_stride);
-  SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
-  dst += (4 * dst_stride);
-  SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
-  dst += (4 * dst_stride);
-  SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
-}
-
-void aom_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst,
-                               int32_t dst_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
-  int16_t *out = out_arr;
-
-  /* transform rows */
-  for (i = 0; i < 2; ++i) {
-    /* process 16 * 8 block */
-    aom_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7)));
-  }
-
-  /* transform columns */
-  for (i = 0; i < 2; ++i) {
-    /* process 8 * 16 block */
-    aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
-                                     dst_stride);
-  }
-}
-
-void aom_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
-                              int32_t dst_stride) {
-  uint8_t i;
-  DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
-  int16_t *out = out_arr;
-
-  /* process 16 * 8 block */
-  aom_idct16_1d_rows_msa(input, out);
-
-  /* short case just considers top 4 rows as valid output */
-  out += 4 * 16;
-  for (i = 12; i--;) {
-    __asm__ __volatile__(
-        "sw     $zero,   0(%[out])     \n\t"
-        "sw     $zero,   4(%[out])     \n\t"
-        "sw     $zero,   8(%[out])     \n\t"
-        "sw     $zero,  12(%[out])     \n\t"
-        "sw     $zero,  16(%[out])     \n\t"
-        "sw     $zero,  20(%[out])     \n\t"
-        "sw     $zero,  24(%[out])     \n\t"
-        "sw     $zero,  28(%[out])     \n\t"
-
-        :
-        : [out] "r"(out));
-
-    out += 16;
-  }
-
-  out = out_arr;
-
-  /* transform columns */
-  for (i = 0; i < 2; ++i) {
-    /* process 8 * 16 block */
-    aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
-                                     dst_stride);
-  }
-}
-
-void aom_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
-                             int32_t dst_stride) {
-  uint8_t i;
-  int16_t out;
-  v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7;
-  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
-
-  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO(out, 6);
-
-  vec = __msa_fill_h(out);
-
-  for (i = 4; i--;) {
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    UNPCK_UB_SH(dst0, res0, res4);
-    UNPCK_UB_SH(dst1, res1, res5);
-    UNPCK_UB_SH(dst2, res2, res6);
-    UNPCK_UB_SH(dst3, res3, res7);
-    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
-    ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
-    CLIP_SH4_0_255(res0, res1, res2, res3);
-    CLIP_SH4_0_255(res4, res5, res6, res7);
-    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
-                tmp2, tmp3);
-    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
-  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
-  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
-
-  /* load input data */
-  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
-          l7, l15);
-  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6,
-                     l7);
-  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11,
-                     l12, l13, l14, l15);
-
-  /* ADST in horizontal */
-  AOM_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13,
-                   l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,
-                   r12, r13, r14, r15);
-
-  l1 = -r8;
-  l3 = -r4;
-  l13 = -r13;
-  l15 = -r1;
-
-  TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5,
-                     l6, l7);
-  ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
-  TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12,
-                     l13, l14, l15);
-  ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
-}
-
-void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
-                                       int32_t dst_stride) {
-  v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
-  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
-  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
-  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
-  v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
-  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
-  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
-  v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
-  v16i8 zero = { 0 };
-
-  r0 = LD_SH(input + 0 * 16);
-  r3 = LD_SH(input + 3 * 16);
-  r4 = LD_SH(input + 4 * 16);
-  r7 = LD_SH(input + 7 * 16);
-  r8 = LD_SH(input + 8 * 16);
-  r11 = LD_SH(input + 11 * 16);
-  r12 = LD_SH(input + 12 * 16);
-  r15 = LD_SH(input + 15 * 16);
-
-  /* stage 1 */
-  k0 = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
-  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
-  k0 = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
-  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
-  BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
-  k0 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
-  k2 = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
-  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
-
-  r1 = LD_SH(input + 1 * 16);
-  r2 = LD_SH(input + 2 * 16);
-  r5 = LD_SH(input + 5 * 16);
-  r6 = LD_SH(input + 6 * 16);
-  r9 = LD_SH(input + 9 * 16);
-  r10 = LD_SH(input + 10 * 16);
-  r13 = LD_SH(input + 13 * 16);
-  r14 = LD_SH(input + 14 * 16);
-
-  k0 = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
-  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
-  k0 = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
-  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
-  BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
-  BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
-  out1 = -out1;
-  SRARI_H2_SH(out0, out1, 6);
-  dst0 = LD_UB(dst + 0 * dst_stride);
-  dst1 = LD_UB(dst + 15 * dst_stride);
-  ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1);
-  ADD2(res0, out0, res1, out1, res0, res1);
-  CLIP_SH2_0_255(res0, res1);
-  PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
-  ST8x1_UB(res0, dst);
-  ST8x1_UB(res1, dst + 15 * dst_stride);
-
-  k0 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
-  k1 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
-  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
-  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
-  out8 = -out8;
-
-  SRARI_H2_SH(out8, out9, 6);
-  dst8 = LD_UB(dst + 1 * dst_stride);
-  dst9 = LD_UB(dst + 14 * dst_stride);
-  ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9);
-  ADD2(res8, out8, res9, out9, res8, res9);
-  CLIP_SH2_0_255(res8, res9);
-  PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
-  ST8x1_UB(res8, dst + dst_stride);
-  ST8x1_UB(res9, dst + 14 * dst_stride);
-
-  k0 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
-  k2 = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
-  MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
-  out4 = -out4;
-  SRARI_H2_SH(out4, out5, 6);
-  dst4 = LD_UB(dst + 3 * dst_stride);
-  dst5 = LD_UB(dst + 12 * dst_stride);
-  ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5);
-  ADD2(res4, out4, res5, out5, res4, res5);
-  CLIP_SH2_0_255(res4, res5);
-  PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
-  ST8x1_UB(res4, dst + 3 * dst_stride);
-  ST8x1_UB(res5, dst + 12 * dst_stride);
-
-  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
-  out13 = -out13;
-  SRARI_H2_SH(out12, out13, 6);
-  dst12 = LD_UB(dst + 2 * dst_stride);
-  dst13 = LD_UB(dst + 13 * dst_stride);
-  ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13);
-  ADD2(res12, out12, res13, out13, res12, res13);
-  CLIP_SH2_0_255(res12, res13);
-  PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
-  ST8x1_UB(res12, dst + 2 * dst_stride);
-  ST8x1_UB(res13, dst + 13 * dst_stride);
-
-  k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
-  k3 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
-  MADD_SHORT(out6, out7, k0, k3, out6, out7);
-  SRARI_H2_SH(out6, out7, 6);
-  dst6 = LD_UB(dst + 4 * dst_stride);
-  dst7 = LD_UB(dst + 11 * dst_stride);
-  ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);
-  ADD2(res6, out6, res7, out7, res6, res7);
-  CLIP_SH2_0_255(res6, res7);
-  PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
-  ST8x1_UB(res6, dst + 4 * dst_stride);
-  ST8x1_UB(res7, dst + 11 * dst_stride);
-
-  MADD_SHORT(out10, out11, k0, k3, out10, out11);
-  SRARI_H2_SH(out10, out11, 6);
-  dst10 = LD_UB(dst + 6 * dst_stride);
-  dst11 = LD_UB(dst + 9 * dst_stride);
-  ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11);
-  ADD2(res10, out10, res11, out11, res10, res11);
-  CLIP_SH2_0_255(res10, res11);
-  PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
-  ST8x1_UB(res10, dst + 6 * dst_stride);
-  ST8x1_UB(res11, dst + 9 * dst_stride);
-
-  k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
-  MADD_SHORT(h10, h11, k1, k2, out2, out3);
-  SRARI_H2_SH(out2, out3, 6);
-  dst2 = LD_UB(dst + 7 * dst_stride);
-  dst3 = LD_UB(dst + 8 * dst_stride);
-  ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3);
-  ADD2(res2, out2, res3, out3, res2, res3);
-  CLIP_SH2_0_255(res2, res3);
-  PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
-  ST8x1_UB(res2, dst + 7 * dst_stride);
-  ST8x1_UB(res3, dst + 8 * dst_stride);
-
-  MADD_SHORT(out14, out15, k1, k2, out14, out15);
-  SRARI_H2_SH(out14, out15, 6);
-  dst14 = LD_UB(dst + 5 * dst_stride);
-  dst15 = LD_UB(dst + 10 * dst_stride);
-  ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
-  ADD2(res14, out14, res15, out15, res14, res15);
-  CLIP_SH2_0_255(res14, res15);
-  PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
-  ST8x1_UB(res14, dst + 5 * dst_stride);
-  ST8x1_UB(res15, dst + 10 * dst_stride);
-}
diff --git a/third_party/aom/aom_dsp/mips/idct32x32_msa.c b/third_party/aom/aom_dsp/mips/idct32x32_msa.c
deleted file mode 100644
index f1ca757a0..000000000
--- a/third_party/aom/aom_dsp/mips/idct32x32_msa.c
+++ /dev/null
@@ -1,730 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-static void idct32x8_row_transpose_store(const int16_t *input,
-                                         int16_t *tmp_buf) {
-  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
-
-  /* 1st & 2nd 8x8 */
-  LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
-  LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
-  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
-                     n3);
-  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
-                     n7);
-  ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
-  ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
-  ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);
-
-  /* 3rd & 4th 8x8 */
-  LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
-  LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
-  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
-                     n3);
-  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
-                     n7);
-  ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
-  ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
-  ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
-  ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8);
-}
-
-static void idct32x8_row_even_process_store(int16_t *tmp_buf,
-                                            int16_t *tmp_eve_buf) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
-
-  /* Even stage 1 */
-  LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-
-  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
-  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
-  BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
-  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-
-  loc1 = vec3;
-  loc0 = vec1;
-
-  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
-  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
-  BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
-  BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
-  BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
-
-  /* Even stage 2 */
-  LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
-  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
-
-  vec0 = reg0 + reg4;
-  reg0 = reg0 - reg4;
-  reg4 = reg6 + reg2;
-  reg6 = reg6 - reg2;
-  reg2 = reg1 + reg5;
-  reg1 = reg1 - reg5;
-  reg5 = reg7 + reg3;
-  reg7 = reg7 - reg3;
-  reg3 = vec0;
-
-  vec1 = reg2;
-  reg2 = reg3 + reg4;
-  reg3 = reg3 - reg4;
-  reg4 = reg5 - vec1;
-  reg5 = reg5 + vec1;
-
-  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
-  DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
-
-  vec0 = reg0 - reg6;
-  reg0 = reg0 + reg6;
-  vec1 = reg7 - reg1;
-  reg7 = reg7 + reg1;
-
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
-
-  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
-  BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
-  ST_SH(loc0, (tmp_eve_buf + 15 * 8));
-  ST_SH(loc1, (tmp_eve_buf));
-  ST_SH(loc2, (tmp_eve_buf + 14 * 8));
-  ST_SH(loc3, (tmp_eve_buf + 8));
-
-  BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
-  ST_SH(loc0, (tmp_eve_buf + 13 * 8));
-  ST_SH(loc1, (tmp_eve_buf + 2 * 8));
-  ST_SH(loc2, (tmp_eve_buf + 12 * 8));
-  ST_SH(loc3, (tmp_eve_buf + 3 * 8));
-
-  /* Store 8 */
-  BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
-  ST_SH(loc0, (tmp_eve_buf + 11 * 8));
-  ST_SH(loc1, (tmp_eve_buf + 4 * 8));
-  ST_SH(loc2, (tmp_eve_buf + 10 * 8));
-  ST_SH(loc3, (tmp_eve_buf + 5 * 8));
-
-  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
-  ST_SH(loc0, (tmp_eve_buf + 9 * 8));
-  ST_SH(loc1, (tmp_eve_buf + 6 * 8));
-  ST_SH(loc2, (tmp_eve_buf + 8 * 8));
-  ST_SH(loc3, (tmp_eve_buf + 7 * 8));
-}
-
-static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
-                                           int16_t *tmp_odd_buf) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-
-  /* Odd stage 1 */
-  reg0 = LD_SH(tmp_buf + 8);
-  reg1 = LD_SH(tmp_buf + 7 * 8);
-  reg2 = LD_SH(tmp_buf + 9 * 8);
-  reg3 = LD_SH(tmp_buf + 15 * 8);
-  reg4 = LD_SH(tmp_buf + 17 * 8);
-  reg5 = LD_SH(tmp_buf + 23 * 8);
-  reg6 = LD_SH(tmp_buf + 25 * 8);
-  reg7 = LD_SH(tmp_buf + 31 * 8);
-
-  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
-  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
-
-  vec0 = reg0 + reg3;
-  reg0 = reg0 - reg3;
-  reg3 = reg7 + reg4;
-  reg7 = reg7 - reg4;
-  reg4 = reg1 + reg2;
-  reg1 = reg1 - reg2;
-  reg2 = reg6 + reg5;
-  reg6 = reg6 - reg5;
-  reg5 = vec0;
-
-  /* 4 Stores */
-  ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
-
-  SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
-  ST_SH2(vec0, vec1, (tmp_odd_buf), 8);
-
-  /* 4 Stores */
-  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
-  BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
-
-  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
-  ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
-
-  /* Odd stage 2 */
-  /* 8 loads */
-  reg0 = LD_SH(tmp_buf + 3 * 8);
-  reg1 = LD_SH(tmp_buf + 5 * 8);
-  reg2 = LD_SH(tmp_buf + 11 * 8);
-  reg3 = LD_SH(tmp_buf + 13 * 8);
-  reg4 = LD_SH(tmp_buf + 19 * 8);
-  reg5 = LD_SH(tmp_buf + 21 * 8);
-  reg6 = LD_SH(tmp_buf + 27 * 8);
-  reg7 = LD_SH(tmp_buf + 29 * 8);
-
-  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
-  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
-  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
-
-  /* 4 Stores */
-  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
-  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
-
-  BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
-
-  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
-
-  /* 4 Stores */
-  ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3);
-  BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
-  ST_SH(reg0, (tmp_odd_buf + 13 * 8));
-  ST_SH(reg1, (tmp_odd_buf + 14 * 8));
-
-  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
-  ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
-
-  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
-
-  /* Load 8 & Store 8 */
-  LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
-  LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
-
-  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
-
-  SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
-  SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
-
-  /* Load 8 & Store 8 */
-  LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
-  LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
-
-  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
-
-  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
-  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
-}
-
-static void idct_butterfly_transpose_store(int16_t *tmp_buf,
-                                           int16_t *tmp_eve_buf,
-                                           int16_t *tmp_odd_buf, int16_t *dst) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
-
-  /* FINAL BUTTERFLY : Dependency on Even & Odd */
-  vec0 = LD_SH(tmp_odd_buf);
-  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
-  loc0 = LD_SH(tmp_eve_buf);
-  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 4 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 12 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
-
-  ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
-  ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
-  ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
-  ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 4 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 13 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 10 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 3 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 2 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 10 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 6 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 14 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
-
-  ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
-  ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
-  ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
-  ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 2 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 11 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 12 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 7 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 1 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 9 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 5 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 13 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
-
-  ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
-  ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
-  ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
-  ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 5 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 15 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 8 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 1 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 3 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 11 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 7 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 15 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
-
-  ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
-  ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
-  ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
-  ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));
-
-  /* Transpose : 16 vectors */
-  /* 1st & 2nd 8x8 */
-  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
-                     n3);
-  ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
-  ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
-
-  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
-                     n7);
-  ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
-  ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
-
-  /* 3rd & 4th 8x8 */
-  LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
-  LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
-  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
-                     n3);
-  ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
-  ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
-
-  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
-                     n7);
-  ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
-  ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
-}
-
-static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
-  DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
-  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
-  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
-
-  idct32x8_row_transpose_store(input, &tmp_buf[0]);
-  idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
-  idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
-  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
-                                 output);
-}
-
-static void idct8x32_column_even_process_store(int16_t *tmp_buf,
-                                               int16_t *tmp_eve_buf) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
-
-  /* Even stage 1 */
-  LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-  tmp_buf += (2 * 32);
-
-  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
-  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
-  BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
-  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-
-  loc1 = vec3;
-  loc0 = vec1;
-
-  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
-  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
-  BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
-  BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
-  BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
-
-  /* Even stage 2 */
-  /* Load 8 */
-  LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-
-  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
-  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
-
-  vec0 = reg0 + reg4;
-  reg0 = reg0 - reg4;
-  reg4 = reg6 + reg2;
-  reg6 = reg6 - reg2;
-  reg2 = reg1 + reg5;
-  reg1 = reg1 - reg5;
-  reg5 = reg7 + reg3;
-  reg7 = reg7 - reg3;
-  reg3 = vec0;
-
-  vec1 = reg2;
-  reg2 = reg3 + reg4;
-  reg3 = reg3 - reg4;
-  reg4 = reg5 - vec1;
-  reg5 = reg5 + vec1;
-
-  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
-  DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
-
-  vec0 = reg0 - reg6;
-  reg0 = reg0 + reg6;
-  vec1 = reg7 - reg1;
-  reg7 = reg7 + reg1;
-
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
-
-  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
-  /* Store 8 */
-  BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
-  ST_SH2(loc1, loc3, tmp_eve_buf, 8);
-  ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);
-
-  BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
-  ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
-  ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);
-
-  /* Store 8 */
-  BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
-  ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
-  ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);
-
-  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
-  ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
-  ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
-}
-
-static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
-                                              int16_t *tmp_odd_buf) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-
-  /* Odd stage 1 */
-  reg0 = LD_SH(tmp_buf + 32);
-  reg1 = LD_SH(tmp_buf + 7 * 32);
-  reg2 = LD_SH(tmp_buf + 9 * 32);
-  reg3 = LD_SH(tmp_buf + 15 * 32);
-  reg4 = LD_SH(tmp_buf + 17 * 32);
-  reg5 = LD_SH(tmp_buf + 23 * 32);
-  reg6 = LD_SH(tmp_buf + 25 * 32);
-  reg7 = LD_SH(tmp_buf + 31 * 32);
-
-  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
-  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
-
-  vec0 = reg0 + reg3;
-  reg0 = reg0 - reg3;
-  reg3 = reg7 + reg4;
-  reg7 = reg7 - reg4;
-  reg4 = reg1 + reg2;
-  reg1 = reg1 - reg2;
-  reg2 = reg6 + reg5;
-  reg6 = reg6 - reg5;
-  reg5 = vec0;
-
-  /* 4 Stores */
-  ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
-  SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
-  ST_SH2(vec0, vec1, tmp_odd_buf, 8);
-
-  /* 4 Stores */
-  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
-  BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
-  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
-  ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
-
-  /* Odd stage 2 */
-  /* 8 loads */
-  reg0 = LD_SH(tmp_buf + 3 * 32);
-  reg1 = LD_SH(tmp_buf + 5 * 32);
-  reg2 = LD_SH(tmp_buf + 11 * 32);
-  reg3 = LD_SH(tmp_buf + 13 * 32);
-  reg4 = LD_SH(tmp_buf + 19 * 32);
-  reg5 = LD_SH(tmp_buf + 21 * 32);
-  reg6 = LD_SH(tmp_buf + 27 * 32);
-  reg7 = LD_SH(tmp_buf + 29 * 32);
-
-  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
-  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
-  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
-
-  /* 4 Stores */
-  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
-  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
-  BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
-  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
-
-  /* 4 Stores */
-  ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3);
-  BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
-  ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
-  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
-  ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
-
-  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
-  /* Load 8 & Store 8 */
-  LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
-  LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
-
-  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
-
-  SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
-  SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
-
-  /* Load 8 & Store 8 */
-  LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
-  LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
-
-  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
-
-  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
-  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
-}
-
-static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
-                                             int16_t *tmp_odd_buf, uint8_t *dst,
-                                             int32_t dst_stride) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
-
-  /* FINAL BUTTERFLY : Dependency on Even & Odd */
-  vec0 = LD_SH(tmp_odd_buf);
-  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
-  loc0 = LD_SH(tmp_eve_buf);
-  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 4 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 12 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
-  SRARI_H4_SH(m0, m2, m4, m6, 6);
-  AOM_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
-
-  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
-  SRARI_H4_SH(m0, m2, m4, m6, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4,
-                      m6);
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 4 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 13 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 10 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 3 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 2 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 10 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 6 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 14 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
-  SRARI_H4_SH(m1, m3, m5, m7, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7);
-
-  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
-  SRARI_H4_SH(m1, m3, m5, m7, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5,
-                      m7);
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 2 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 11 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 12 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 7 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 1 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 9 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 5 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 13 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
-  SRARI_H4_SH(n0, n2, n4, n6, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6);
-
-  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
-  SRARI_H4_SH(n0, n2, n4, n6, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4,
-                      n6);
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 5 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 15 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 8 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 1 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 3 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 11 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 7 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 15 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
-  SRARI_H4_SH(n1, n3, n5, n7, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7);
-
-  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
-  SRARI_H4_SH(n1, n3, n5, n7, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5,
-                      n7);
-}
-
-static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
-                                           int32_t dst_stride) {
-  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
-  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
-
-  idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
-  idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
-  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
-                                   dst_stride);
-}
-
-void aom_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
-                                int32_t dst_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
-  int16_t *out_ptr = out_arr;
-
-  /* transform rows */
-  for (i = 0; i < 4; ++i) {
-    /* process 32 * 8 block */
-    idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
-  }
-
-  /* transform columns */
-  for (i = 0; i < 4; ++i) {
-    /* process 8 * 32 block */
-    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
-                                   dst_stride);
-  }
-}
-
-void aom_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
-                              int32_t dst_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
-  int16_t *out_ptr = out_arr;
-
-  for (i = 32; i--;) {
-    __asm__ __volatile__(
-        "sw     $zero,      0(%[out_ptr])     \n\t"
-        "sw     $zero,      4(%[out_ptr])     \n\t"
-        "sw     $zero,      8(%[out_ptr])     \n\t"
-        "sw     $zero,     12(%[out_ptr])     \n\t"
-        "sw     $zero,     16(%[out_ptr])     \n\t"
-        "sw     $zero,     20(%[out_ptr])     \n\t"
-        "sw     $zero,     24(%[out_ptr])     \n\t"
-        "sw     $zero,     28(%[out_ptr])     \n\t"
-        "sw     $zero,     32(%[out_ptr])     \n\t"
-        "sw     $zero,     36(%[out_ptr])     \n\t"
-        "sw     $zero,     40(%[out_ptr])     \n\t"
-        "sw     $zero,     44(%[out_ptr])     \n\t"
-        "sw     $zero,     48(%[out_ptr])     \n\t"
-        "sw     $zero,     52(%[out_ptr])     \n\t"
-        "sw     $zero,     56(%[out_ptr])     \n\t"
-        "sw     $zero,     60(%[out_ptr])     \n\t"
-
-        :
-        : [out_ptr] "r"(out_ptr));
-
-    out_ptr += 32;
-  }
-
-  out_ptr = out_arr;
-
-  /* rows: only upper-left 8x8 has non-zero coeff */
-  idct32x8_1d_rows_msa(input, out_ptr);
-
-  /* transform columns */
-  for (i = 0; i < 4; ++i) {
-    /* process 8 * 32 block */
-    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
-                                   dst_stride);
-  }
-}
-
-void aom_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
-                             int32_t dst_stride) {
-  int32_t i;
-  int16_t out;
-  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
-  v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec;
-
-  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO(out, 6);
-
-  vec = __msa_fill_h(out);
-
-  for (i = 16; i--;) {
-    LD_UB2(dst, 16, dst0, dst1);
-    LD_UB2(dst + dst_stride, 16, dst2, dst3);
-
-    UNPCK_UB_SH(dst0, res0, res4);
-    UNPCK_UB_SH(dst1, res1, res5);
-    UNPCK_UB_SH(dst2, res2, res6);
-    UNPCK_UB_SH(dst3, res3, res7);
-    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
-    ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
-    CLIP_SH4_0_255(res0, res1, res2, res3);
-    CLIP_SH4_0_255(res4, res5, res6, res7);
-    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
-                tmp2, tmp3);
-
-    ST_UB2(tmp0, tmp1, dst, 16);
-    dst += dst_stride;
-    ST_UB2(tmp2, tmp3, dst, 16);
-    dst += dst_stride;
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/idct4x4_msa.c b/third_party/aom/aom_dsp/mips/idct4x4_msa.c
deleted file mode 100644
index 274818baa..000000000
--- a/third_party/aom/aom_dsp/mips/idct4x4_msa.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
-                            int32_t dst_stride) {
-  v8i16 in0, in1, in2, in3;
-  v4i32 in0_r, in1_r, in2_r, in3_r, in4_r;
-
-  /* load vector elements of 4x4 block */
-  LD4x4_SH(input, in0, in2, in3, in1);
-  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
-  UNPCK_R_SH_SW(in0, in0_r);
-  UNPCK_R_SH_SW(in2, in2_r);
-  UNPCK_R_SH_SW(in3, in3_r);
-  UNPCK_R_SH_SW(in1, in1_r);
-  SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT);
-
-  in0_r += in2_r;
-  in3_r -= in1_r;
-  in4_r = (in0_r - in3_r) >> 1;
-  in1_r = in4_r - in1_r;
-  in2_r = in4_r - in2_r;
-  in0_r -= in1_r;
-  in3_r += in2_r;
-
-  TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r);
-
-  in0_r += in1_r;
-  in2_r -= in3_r;
-  in4_r = (in0_r - in2_r) >> 1;
-  in3_r = in4_r - in3_r;
-  in1_r = in4_r - in1_r;
-  in0_r -= in3_r;
-  in2_r += in1_r;
-
-  PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1,
-              in2, in3);
-  ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride);
-}
-
-void aom_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst,
-                           int32_t dst_stride) {
-  int16_t a1, e1;
-  v8i16 in1, in0 = { 0 };
-
-  a1 = input[0] >> UNIT_QUANT_SHIFT;
-  e1 = a1 >> 1;
-  a1 -= e1;
-
-  in0 = __msa_insert_h(in0, 0, a1);
-  in0 = __msa_insert_h(in0, 1, e1);
-  in0 = __msa_insert_h(in0, 2, e1);
-  in0 = __msa_insert_h(in0, 3, e1);
-
-  in1 = in0 >> 1;
-  in0 -= in1;
-
-  ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride);
-}
-
-void aom_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst,
-                            int32_t dst_stride) {
-  v8i16 in0, in1, in2, in3;
-
-  /* load vector elements of 4x4 block */
-  LD4x4_SH(input, in0, in1, in2, in3);
-  /* rows */
-  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-  AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-  /* columns */
-  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-  AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-  /* rounding (add 2^3, divide by 2^4) */
-  SRARI_H4_SH(in0, in1, in2, in3, 4);
-  ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
-}
-
-void aom_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst,
-                           int32_t dst_stride) {
-  int16_t out;
-  v8i16 vec;
-
-  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO(out, 4);
-  vec = __msa_fill_h(out);
-
-  ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride);
-}
diff --git a/third_party/aom/aom_dsp/mips/idct8x8_msa.c b/third_party/aom/aom_dsp/mips/idct8x8_msa.c
deleted file mode 100644
index 981c103cd..000000000
--- a/third_party/aom/aom_dsp/mips/idct8x8_msa.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void aom_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
-                            int32_t dst_stride) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-
-  /* load vector elements of 8x8 block */
-  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-
-  /* rows transform */
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  /* 1D idct8x8 */
-  AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                 in4, in5, in6, in7);
-  /* columns transform */
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  /* 1D idct8x8 */
-  AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                 in4, in5, in6, in7);
-  /* final rounding (add 2^4, divide by 2^5) and shift */
-  SRARI_H4_SH(in0, in1, in2, in3, 5);
-  SRARI_H4_SH(in4, in5, in6, in7, 5);
-  /* add block and store 8x8 */
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
-  dst += (4 * dst_stride);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
-}
-
-void aom_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst,
-                            int32_t dst_stride) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
-  v4i32 tmp0, tmp1, tmp2, tmp3;
-  v8i16 zero = { 0 };
-
-  /* load vector elements of 8x8 block */
-  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-  TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-
-  /* stage1 */
-  ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
-  k0 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
-  k2 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
-  DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
-  SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
-  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
-  PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
-  BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5);
-
-  /* stage2 */
-  ILVR_H2_SH(in3, in1, in2, in0, s1, s0);
-  k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
-  DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
-  SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
-  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
-  PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
-  BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3);
-
-  /* stage3 */
-  s0 = __msa_ilvr_h(s6, s5);
-
-  k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
-  DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1);
-  SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS);
-  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
-
-  /* stage4 */
-  BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6,
-              in7);
-  TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                 in4, in5, in6, in7);
-
-  /* final rounding (add 2^4, divide by 2^5) and shift */
-  SRARI_H4_SH(in0, in1, in2, in3, 5);
-  SRARI_H4_SH(in4, in5, in6, in7, 5);
-
-  /* add block and store 8x8 */
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
-  dst += (4 * dst_stride);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
-}
-
-void aom_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst,
-                           int32_t dst_stride) {
-  int16_t out;
-  int32_t val;
-  v8i16 vec;
-
-  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
-  val = ROUND_POWER_OF_TWO(out, 5);
-  vec = __msa_fill_h(val);
-
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
-  dst += (4 * dst_stride);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
-}
diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c
index bcb9c9df9..9f25cc1ca 100644
--- a/third_party/aom/aom_dsp/mips/intrapred_msa.c
+++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 
 #define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h
deleted file mode 100644
index c69835173..000000000
--- a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
-#define AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
-
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_dsp/mips/common_dspr2.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-/* Note: this macro expects a local int32_t named out to exist, and will write
- * to that variable. */
-#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                         \
-  ({                                                                           \
-                                                                               \
-    int32_t tmp;                                                               \
-    int dct_cost_rounding = DCT_CONST_ROUNDING;                                \
-    int in = input;                                                            \
-                                                                               \
-    __asm__ __volatile__(/* out = dct_const_round_shift(dc *  cospi_16_64); */ \
-                         "mtlo     %[dct_cost_rounding],   $ac1              " \
-                         "                \n\t"                                \
-                         "mthi     $zero,                  $ac1              " \
-                         "                \n\t"                                \
-                         "madd     $ac1,                   %[in],            " \
-                         "%[cospi_16_64]  \n\t"                                \
-                         "extp     %[tmp],                 $ac1,             " \
-                         "31              \n\t"                                \
-                                                                               \
-                         /* out = dct_const_round_shift(out * cospi_16_64); */ \
-                         "mtlo     %[dct_cost_rounding],   $ac2              " \
-                         "                \n\t"                                \
-                         "mthi     $zero,                  $ac2              " \
-                         "                \n\t"                                \
-                         "madd     $ac2,                   %[tmp],           " \
-                         "%[cospi_16_64]  \n\t"                                \
-                         "extp     %[out],                 $ac2,             " \
-                         "31              \n\t"                                \
-                                                                               \
-                         : [tmp] "=&r"(tmp), [out] "=r"(out)                   \
-                         : [in] "r"(in),                                       \
-                           [dct_cost_rounding] "r"(dct_cost_rounding),         \
-                           [cospi_16_64] "r"(cospi_16_64));                    \
-    out;                                                                       \
-  })
-
-void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                   int dest_stride);
-void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output);
-void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                     int dest_stride);
-void iadst4_dspr2(const int16_t *input, int16_t *output);
-void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
-void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                 int dest_stride);
-void iadst8_dspr2(const int16_t *input, int16_t *output);
-void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride);
-void iadst16_dspr2(const int16_t *input, int16_t *output);
-
-#endif  // #if HAVE_DSPR2
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h
deleted file mode 100644
index 122667aa8..000000000
--- a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_INV_TXFM_MSA_H_
-#define AOM_DSP_MIPS_INV_TXFM_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-#include "aom_dsp/mips/txfm_macros_msa.h"
-#include "aom_dsp/txfm_common.h"
-
-#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,  \
-                  out3, out4, out5, out6, out7)                              \
-  {                                                                          \
-    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                       \
-    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                        \
-    v8i16 coeff0_m = { cospi_2_64,  cospi_6_64,  cospi_10_64, cospi_14_64,   \
-                       cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
-    v8i16 coeff1_m = { cospi_8_64,  -cospi_8_64,  cospi_16_64, -cospi_16_64, \
-                       cospi_24_64, -cospi_24_64, 0,           0 };          \
-                                                                             \
-    SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                          \
-    cnst2_m = -cnst0_m;                                                      \
-    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);       \
-    SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                          \
-    cnst4_m = -cnst2_m;                                                      \
-    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);       \
-                                                                             \
-    ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                   \
-    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m,  \
-                          cnst2_m, cnst3_m, in7, in0, in4, in3);             \
-                                                                             \
-    SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                          \
-    cnst2_m = -cnst0_m;                                                      \
-    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);       \
-    SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                          \
-    cnst4_m = -cnst2_m;                                                      \
-    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);       \
-                                                                             \
-    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                   \
-                                                                             \
-    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m,  \
-                          cnst2_m, cnst3_m, in5, in2, in6, in1);             \
-    BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                   \
-    out7 = -s0_m;                                                            \
-    out0 = s1_m;                                                             \
-                                                                             \
-    SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m);  \
-                                                                             \
-    ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);       \
-    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                               \
-    cnst1_m = cnst0_m;                                                       \
-                                                                             \
-    ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                   \
-    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m,  \
-                          cnst3_m, cnst1_m, out1, out6, s0_m, s1_m);         \
-                                                                             \
-    SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                          \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                               \
-                                                                             \
-    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                                 \
-    out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                   \
-    out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                   \
-    out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                   \
-    out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                   \
-                                                                             \
-    out1 = -out1;                                                            \
-    out3 = -out3;                                                            \
-    out5 = -out5;                                                            \
-  }
-
-#define AOM_SET_COSPI_PAIR(c0_h, c1_h)  \
-  ({                                    \
-    v8i16 out0_m, r0_m, r1_m;           \
-                                        \
-    r0_m = __msa_fill_h(c0_h);          \
-    r1_m = __msa_fill_h(c1_h);          \
-    out0_m = __msa_ilvev_h(r1_m, r0_m); \
-                                        \
-    out0_m;                             \
-  })
-
-#define AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)               \
-  {                                                                            \
-    uint8_t *dst_m = (uint8_t *)(dst);                                         \
-    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                                      \
-    v16i8 tmp0_m, tmp1_m;                                                      \
-    v16i8 zero_m = { 0 };                                                      \
-    v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
-                                                                               \
-    LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m);                 \
-    ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \
-               res0_m, res1_m, res2_m, res3_m);                                \
-    ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m,   \
-         res2_m, res3_m);                                                      \
-    CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m);                            \
-    PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m);               \
-    ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride);                               \
-  }
-
-#define AOM_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)             \
-  {                                                                         \
-    v8i16 c0_m, c1_m, c2_m, c3_m;                                           \
-    v8i16 step0_m, step1_m;                                                 \
-    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
-                                                                            \
-    c0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);                    \
-    c1_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);                   \
-    step0_m = __msa_ilvr_h(in2, in0);                                       \
-    DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m);              \
-                                                                            \
-    c2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                    \
-    c3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                     \
-    step1_m = __msa_ilvr_h(in3, in1);                                       \
-    DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m);              \
-    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);            \
-                                                                            \
-    PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m);            \
-    SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8);                        \
-    BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \
-                out0, out1, out2, out3);                                    \
-  }
-
-#define AOM_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)       \
-  {                                                                    \
-    v8i16 res0_m, res1_m, c0_m, c1_m;                                  \
-    v8i16 k1_m, k2_m, k3_m, k4_m;                                      \
-    v8i16 zero_m = { 0 };                                              \
-    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
-    v4i32 int0_m, int1_m, int2_m, int3_m;                              \
-    v8i16 mask_m = { sinpi_1_9,  sinpi_2_9,  sinpi_3_9,  sinpi_4_9,    \
-                     -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \
-                                                                       \
-    SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m);          \
-    ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m);                   \
-    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                    \
-    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m);           \
-    int0_m = tmp2_m + tmp1_m;                                          \
-                                                                       \
-    SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m);                            \
-    ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m);                   \
-    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);           \
-    int1_m = tmp0_m + tmp1_m;                                          \
-                                                                       \
-    c0_m = __msa_splati_h(mask_m, 6);                                  \
-    ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m);                  \
-    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                    \
-    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);           \
-    int2_m = tmp0_m + tmp1_m;                                          \
-                                                                       \
-    c0_m = __msa_splati_h(mask_m, 6);                                  \
-    c0_m = __msa_ilvev_h(c0_m, k1_m);                                  \
-                                                                       \
-    res0_m = __msa_ilvr_h((in1), (in3));                               \
-    tmp0_m = __msa_dotp_s_w(res0_m, c0_m);                             \
-    int3_m = tmp2_m + tmp0_m;                                          \
-                                                                       \
-    res0_m = __msa_ilvr_h((in2), (in3));                               \
-    c1_m = __msa_ilvev_h(k4_m, k3_m);                                  \
-                                                                       \
-    tmp2_m = __msa_dotp_s_w(res0_m, c1_m);                             \
-    res1_m = __msa_ilvr_h((in0), (in2));                               \
-    c1_m = __msa_ilvev_h(k1_m, zero_m);                                \
-                                                                       \
-    tmp3_m = __msa_dotp_s_w(res1_m, c1_m);                             \
-    int3_m += tmp2_m;                                                  \
-    int3_m += tmp3_m;                                                  \
-                                                                       \
-    SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS);       \
-    PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1);           \
-    PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3);           \
-  }
-
-#define AV1_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)    \
-  ({                                                  \
-    v8i16 c0_m, c1_m;                                 \
-                                                      \
-    SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
-    c0_m = __msa_ilvev_h(c1_m, c0_m);                 \
-                                                      \
-    c0_m;                                             \
-  })
-
-/* multiply and add macro */
-#define AV1_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1,  \
-                 out2, out3)                                                  \
-  {                                                                           \
-    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                         \
-    v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd;                         \
-                                                                              \
-    ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m);                            \
-    ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m);                            \
-    DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
-                cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd);            \
-    SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS);  \
-    PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1);      \
-    DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
-                cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd);            \
-    SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS);  \
-    PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3);      \
-  }
-
-/* idct 8x8 macro */
-#define AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,    \
-                       out2, out3, out4, out5, out6, out7)                    \
-  {                                                                           \
-    v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;             \
-    v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;             \
-    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
-    v8i16 mask_m = { cospi_28_64, cospi_4_64,  cospi_20_64,  cospi_12_64,     \
-                     cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 };  \
-                                                                              \
-    k0_m = AV1_SET_CONST_PAIR(mask_m, 0, 5);                                  \
-    k1_m = AV1_SET_CONST_PAIR(mask_m, 1, 0);                                  \
-    k2_m = AV1_SET_CONST_PAIR(mask_m, 6, 3);                                  \
-    k3_m = AV1_SET_CONST_PAIR(mask_m, 3, 2);                                  \
-    AV1_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
-    SUB2(in1, in3, in7, in5, res0_m, res1_m);                                 \
-    k0_m = AV1_SET_CONST_PAIR(mask_m, 4, 7);                                  \
-    k1_m = __msa_splati_h(mask_m, 4);                                         \
-                                                                              \
-    ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m);                              \
-    DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m,       \
-                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                              \
-    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);              \
-    tp4_m = in1 + in3;                                                        \
-    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m);                \
-    tp7_m = in7 + in5;                                                        \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                      \
-    k3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                       \
-    AV1_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \
-    BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);              \
-    BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \
-                out1, out2, out3, out4, out5, out6, out7);                    \
-  }
-
-#define AV1_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,   \
-                        out2, out3, out4, out5, out6, out7)                   \
-  {                                                                           \
-    v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m;                     \
-    v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m;                                 \
-    v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1;           \
-    v8i16 mask1_m = { cospi_2_64,  cospi_30_64,  -cospi_2_64, cospi_10_64,    \
-                      cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 };  \
-    v8i16 mask2_m = { cospi_14_64,  -cospi_18_64, cospi_26_64, cospi_6_64,    \
-                      -cospi_26_64, cospi_8_64,   cospi_24_64, -cospi_8_64 }; \
-    v8i16 mask3_m = {                                                         \
-      -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0         \
-    };                                                                        \
-                                                                              \
-    k0_m = AV1_SET_CONST_PAIR(mask1_m, 0, 1);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask1_m, 1, 2);                                 \
-    ILVRL_H2_SH(in1, in0, in_s1, in_s0);                                      \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
-                r1_m, r2_m, r3_m);                                            \
-    k0_m = AV1_SET_CONST_PAIR(mask1_m, 6, 7);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask2_m, 0, 1);                                 \
-    ILVRL_H2_SH(in5, in4, in_s1, in_s0);                                      \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m,     \
-                r5_m, r6_m, r7_m);                                            \
-    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m);                      \
-    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m);                          \
-    k0_m = AV1_SET_CONST_PAIR(mask1_m, 3, 4);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask1_m, 4, 5);                                 \
-    ILVRL_H2_SH(in3, in2, in_s1, in_s0);                                      \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
-                r1_m, r2_m, r3_m);                                            \
-    k0_m = AV1_SET_CONST_PAIR(mask2_m, 2, 3);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask2_m, 3, 4);                                 \
-    ILVRL_H2_SH(in7, in6, in_s1, in_s0);                                      \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m,     \
-                r5_m, r6_m, r7_m);                                            \
-    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m);                      \
-    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m);                          \
-    ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m);                                      \
-    BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3);         \
-    k0_m = AV1_SET_CONST_PAIR(mask2_m, 5, 6);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask2_m, 6, 7);                                 \
-    ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0);                                    \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
-                r1_m, r2_m, r3_m);                                            \
-    k1_m = AV1_SET_CONST_PAIR(mask3_m, 0, 1);                                 \
-    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m,   \
-                r6_m, r7_m);                                                  \
-    ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6);                           \
-    SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5);                            \
-    k0_m = AV1_SET_CONST_PAIR(mask3_m, 2, 2);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask3_m, 2, 3);                                 \
-    ILVRL_H2_SH(in4, in3, in_s1, in_s0);                                      \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m,     \
-                m1_m, m2_m, m3_m);                                            \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4);                           \
-    ILVRL_H2_SW(in5, in2, m2_m, m3_m);                                        \
-    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m,   \
-                m2_m, m3_m);                                                  \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5);                           \
-                                                                              \
-    out1 = -in1;                                                              \
-    out3 = -in3;                                                              \
-    out5 = -in5;                                                              \
-    out7 = -in7;                                                              \
-  }
-
-#define AOM_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,     \
-                         r12, r13, r14, r15, out0, out1, out2, out3, out4,     \
-                         out5, out6, out7, out8, out9, out10, out11, out12,    \
-                         out13, out14, out15)                                  \
-  {                                                                            \
-    v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m;                      \
-    v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m;                \
-    v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m;                      \
-    v8i16 h8_m, h9_m, h10_m, h11_m;                                            \
-    v8i16 k0_m, k1_m, k2_m, k3_m;                                              \
-                                                                               \
-    /* stage 1 */                                                              \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);                        \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);                       \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);                       \
-    k3_m = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);                      \
-    MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m);  \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);                        \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);                       \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);                       \
-    k3_m = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);                      \
-    MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);                        \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);                       \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);                        \
-    k3_m = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);                       \
-    MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m,       \
-            g11_m);                                                            \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);                       \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);                      \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);                        \
-    k3_m = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);                       \
-    MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m,      \
-            g15_m);                                                            \
-                                                                               \
-    /* stage 2 */                                                              \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);                        \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);                       \
-    k2_m = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);                       \
-    MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \
-            h3_m);                                                             \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);                       \
-    k1_m = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);                      \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);                      \
-    MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m,      \
-            h6_m, h7_m);                                                       \
-    BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10);             \
-    BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \
-                h10_m, h11_m, h6_m, h4_m, h2_m, h0_m);                         \
-                                                                               \
-    /* stage 3 */                                                              \
-    BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m);           \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
-    k2_m = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);                       \
-    MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5,  \
-            out7);                                                             \
-    MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14,      \
-            out13, out15);                                                     \
-                                                                               \
-    /* stage 4 */                                                              \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);                       \
-    k1_m = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);                     \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);                      \
-    k3_m = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);                      \
-    MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3);                          \
-    MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7);                            \
-    MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11);                        \
-    MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15);                        \
-  }
-
-void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
-                                      int32_t dst_stride);
-void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
-void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
-                                       int32_t dst_stride);
-void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
-#endif  // AOM_DSP_MIPS_INV_TXFM_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c
deleted file mode 100644
index c63b1e857..000000000
--- a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c
+++ /dev/null
@@ -1,1190 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void idct16_rows_dspr2(const int16_t *input, int16_t *output,
-                       uint32_t no_rows) {
-  int i;
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int step1_10, step1_11, step1_12, step1_13;
-  int step2_0, step2_1, step2_2, step2_3;
-  int step2_8, step2_9, step2_10, step2_11;
-  int step2_12, step2_13, step2_14, step2_15;
-  int load1, load2, load3, load4, load5, load6, load7, load8;
-  int result1, result2, result3, result4;
-  const int const_2_power_13 = 8192;
-
-  for (i = no_rows; i--;) {
-    /* prefetch row */
-    prefetch_load((const uint8_t *)(input + 16));
-
-    __asm__ __volatile__(
-        "lh       %[load1],              0(%[input])                    \n\t"
-        "lh       %[load2],             16(%[input])                    \n\t"
-        "lh       %[load3],              8(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[step2_0],           $ac1,           31              \n\t"
-        "extp     %[step2_1],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[step2_3],           $ac1,           31              \n\t"
-
-        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
-        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
-          [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
-          [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
-          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
-          [step1_3] "=r"(step1_3)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    __asm__ __volatile__(
-        "lh       %[load5],             2(%[input])                     \n\t"
-        "lh       %[load6],             30(%[input])                    \n\t"
-        "lh       %[load7],             18(%[input])                    \n\t"
-        "lh       %[load8],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"
-        "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "sub      %[load5],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[result4],     %[result3]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
-        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
-          [load8] "=&r"(load8), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
-          [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
-          [step2_14] "=r"(step2_14)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
-          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             22(%[input])                    \n\t"
-        "lh       %[load3],             26(%[input])                    \n\t"
-        "lh       %[load4],             6(%[input])                     \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"
-        "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"
-        "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[result2],     %[result1]      \n\t"
-        "sub      %[load2],             %[result4],     %[result3]      \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
-        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
-          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
-          [step2_13] "=r"(step2_13)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
-          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-
-    __asm__ __volatile__(
-        "lh       %[load5],             4(%[input])                     \n\t"
-        "lh       %[load6],             28(%[input])                    \n\t"
-        "lh       %[load7],             20(%[input])                    \n\t"
-        "lh       %[load8],             12(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"
-        "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[result4],     %[result3]      \n\t"
-        "sub      %[load5],             %[load5],       %[result1]      \n\t"
-        "add      %[load5],             %[load5],       %[result2]      \n\t"
-
-        "sub      %[load6],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[load6],       %[result3]      \n\t"
-        "add      %[load6],             %[load6],       %[result4]      \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
-        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
-          [load8] "=&r"(load8), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
-          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
-          [step1_7] "=r"(step1_7)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-
-        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_10],          $ac0,           31              \n\t"
-        "extp     %[step1_13],          $ac1,           31              \n\t"
-        "extp     %[step1_11],          $ac2,           31              \n\t"
-        "extp     %[step1_12],          $ac3,           31              \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
-          [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
-          [step1_13] "=r"(step1_13)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
-          [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
-          [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
-          [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
-          [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
-
-    __asm__ __volatile__(
-        "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_12]     \n\t"
-        "add      %[load5],             %[load5],       %[step2_15]     \n\t"
-        "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_13]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_14]     \n\t"
-        "sh       %[load5],             0(%[output])                    \n\t"
-        "sh       %[load6],             32(%[output])                   \n\t"
-        "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-        "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "sh       %[load5],             192(%[output])                  \n\t"
-        "sh       %[load6],             224(%[output])                  \n\t"
-        "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_11]     \n\t"
-        "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_9]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "sh       %[load5],             256(%[output])                  \n\t"
-        "sh       %[load6],             288(%[output])                  \n\t"
-        "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_14]     \n\t"
-        "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_15]     \n\t"
-        "sh       %[load5],             448(%[output])                  \n\t"
-        "sh       %[load6],             480(%[output])                  \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6)
-        : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
-          [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
-          [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
-          [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
-          [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
-          [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));
-
-    __asm__ __volatile__(
-        "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"
-        "add      %[load5],             %[load5],       %[step1_13]     \n\t"
-        "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"
-        "add      %[load6],             %[load6],       %[step1_12]     \n\t"
-        "sh       %[load5],             64(%[output])                   \n\t"
-        "sh       %[load6],             96(%[output])                   \n\t"
-        "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"
-        "add      %[load5],             %[load5],       %[step1_11]     \n\t"
-        "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"
-        "add      %[load6],             %[load6],       %[step1_10]     \n\t"
-        "sh       %[load5],             128(%[output])                  \n\t"
-        "sh       %[load6],             160(%[output])                  \n\t"
-        "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"
-        "sub      %[load5],             %[load5],       %[step1_10]     \n\t"
-        "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"
-        "sub      %[load6],             %[load6],       %[step1_11]     \n\t"
-        "sh       %[load5],             320(%[output])                  \n\t"
-        "sh       %[load6],             352(%[output])                  \n\t"
-        "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"
-        "sub      %[load5],             %[load5],       %[step1_12]     \n\t"
-        "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"
-        "sub      %[load6],             %[load6],       %[step1_13]     \n\t"
-        "sh       %[load5],             384(%[output])                  \n\t"
-        "sh       %[load6],             416(%[output])                  \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6)
-        : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
-          [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
-          [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
-          [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));
-
-    input += 16;
-    output += 1;
-  }
-}
-
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
-  int i;
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int step1_8, step1_9, step1_10, step1_11;
-  int step1_12, step1_13, step1_14, step1_15;
-  int step2_0, step2_1, step2_2, step2_3;
-  int step2_8, step2_9, step2_10, step2_11;
-  int step2_12, step2_13, step2_14, step2_15;
-  int load1, load2, load3, load4, load5, load6, load7, load8;
-  int result1, result2, result3, result4;
-  const int const_2_power_13 = 8192;
-  uint8_t *dest_pix;
-  uint8_t *cm = aom_ff_cropTbl;
-
-  /* prefetch aom_ff_cropTbl */
-  prefetch_load(aom_ff_cropTbl);
-  prefetch_load(aom_ff_cropTbl + 32);
-  prefetch_load(aom_ff_cropTbl + 64);
-  prefetch_load(aom_ff_cropTbl + 96);
-  prefetch_load(aom_ff_cropTbl + 128);
-  prefetch_load(aom_ff_cropTbl + 160);
-  prefetch_load(aom_ff_cropTbl + 192);
-  prefetch_load(aom_ff_cropTbl + 224);
-
-  for (i = 0; i < 16; ++i) {
-    dest_pix = (dest + i);
-    __asm__ __volatile__(
-        "lh       %[load1],              0(%[input])                    \n\t"
-        "lh       %[load2],             16(%[input])                    \n\t"
-        "lh       %[load3],              8(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[step2_0],           $ac1,           31              \n\t"
-        "extp     %[step2_1],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[step2_3],           $ac1,           31              \n\t"
-
-        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
-        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
-          [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
-          [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
-          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
-          [step1_3] "=r"(step1_3)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    __asm__ __volatile__(
-        "lh       %[load5],             2(%[input])                     \n\t"
-        "lh       %[load6],             30(%[input])                    \n\t"
-        "lh       %[load7],             18(%[input])                    \n\t"
-        "lh       %[load8],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"
-        "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"
-        "extp     %[result4],           $ac2,            31             \n\t"
-
-        "sub      %[load5],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[result4],     %[result3]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
-        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
-          [load8] "=&r"(load8), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
-          [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
-          [step2_14] "=r"(step2_14)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
-          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             22(%[input])                    \n\t"
-        "lh       %[load3],             26(%[input])                    \n\t"
-        "lh       %[load4],             6(%[input])                     \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"
-        "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"
-        "extp     %[result1],           $ac1,        31                 \n\t"
-
-        "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"
-        "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"
-        "extp     %[result2],           $ac3,        31                 \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"
-        "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"
-        "extp     %[result3],           $ac1,        31                 \n\t"
-
-        "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"
-        "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"
-        "extp     %[result4],           $ac2,        31                 \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[result2],     %[result1]      \n\t"
-        "sub      %[load2],             %[result4],     %[result3]      \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
-        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
-          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
-          [step2_13] "=r"(step2_13)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
-          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-
-    __asm__ __volatile__(
-        "lh       %[load5],             4(%[input])                   \n\t"
-        "lh       %[load6],             28(%[input])                  \n\t"
-        "lh       %[load7],             20(%[input])                  \n\t"
-        "lh       %[load8],             12(%[input])                  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
-        "mthi     $zero,                $ac1                          \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                          \n\t"
-        "mthi     $zero,                $ac3                          \n\t"
-
-        "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"
-        "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"
-        "extp     %[result1],           $ac1,        31               \n\t"
-
-        "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"
-        "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"
-        "extp     %[result2],           $ac3,        31               \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
-        "mthi     $zero,                $ac1                          \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                          \n\t"
-        "mthi     $zero,                $ac2                          \n\t"
-
-        "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"
-        "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"
-        "extp     %[result3],           $ac1,        31               \n\t"
-
-        "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"
-        "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"
-        "extp     %[result4],           $ac2,        31               \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[result4],     %[result3]      \n\t"
-        "sub      %[load5],             %[load5],       %[result1]      \n\t"
-        "add      %[load5],             %[load5],       %[result2]      \n\t"
-
-        "sub      %[load6],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[load6],       %[result3]      \n\t"
-        "add      %[load6],             %[load6],       %[result4]      \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-
-        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
-        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
-          [load8] "=&r"(load8), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
-          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
-          [step1_7] "=r"(step1_7)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-
-        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_10],          $ac0,           31              \n\t"
-        "extp     %[step1_13],          $ac1,           31              \n\t"
-        "extp     %[step1_11],          $ac2,           31              \n\t"
-        "extp     %[step1_12],          $ac3,           31              \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
-          [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
-          [step1_13] "=r"(step1_13)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
-          [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
-          [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
-          [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
-          [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
-
-    step1_8 = step2_8 + step2_11;
-    step1_9 = step2_9 + step2_10;
-    step1_14 = step2_13 + step2_14;
-    step1_15 = step2_12 + step2_15;
-
-    __asm__ __volatile__(
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_15]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_14]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_13]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_12]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_11]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_10]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[load5],           %[step1_9]      \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_8]      \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_8]      \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_9]      \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_10]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_11]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_12]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_13]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_14]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_15]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
-          [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
-        :
-        [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
-        [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
-        [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
-        [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
-        [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
-        [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
-        [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));
-
-    input += 16;
-  }
-}
-
-void aom_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
-                                 int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
-
-  // First transform rows
-  idct16_rows_dspr2(input, out, 16);
-
-  // Then transform columns and add to dest
-  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-
-void aom_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
-                                int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
-  int16_t *outptr = out;
-  uint32_t i;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 4x4 area, we only need to calculate first 4 rows here.
-  idct16_rows_dspr2(input, outptr, 4);
-
-  outptr += 4;
-  for (i = 0; i < 6; ++i) {
-    __asm__ __volatile__(
-        "sw     $zero,    0(%[outptr])     \n\t"
-        "sw     $zero,   32(%[outptr])     \n\t"
-        "sw     $zero,   64(%[outptr])     \n\t"
-        "sw     $zero,   96(%[outptr])     \n\t"
-        "sw     $zero,  128(%[outptr])     \n\t"
-        "sw     $zero,  160(%[outptr])     \n\t"
-        "sw     $zero,  192(%[outptr])     \n\t"
-        "sw     $zero,  224(%[outptr])     \n\t"
-        "sw     $zero,  256(%[outptr])     \n\t"
-        "sw     $zero,  288(%[outptr])     \n\t"
-        "sw     $zero,  320(%[outptr])     \n\t"
-        "sw     $zero,  352(%[outptr])     \n\t"
-        "sw     $zero,  384(%[outptr])     \n\t"
-        "sw     $zero,  416(%[outptr])     \n\t"
-        "sw     $zero,  448(%[outptr])     \n\t"
-        "sw     $zero,  480(%[outptr])     \n\t"
-
-        :
-        : [outptr] "r"(outptr));
-
-    outptr += 2;
-  }
-
-  // Then transform columns
-  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-
-void aom_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                               int dest_stride) {
-  uint32_t pos = 45;
-  int32_t out;
-  int32_t r;
-  int32_t a1, absa1;
-  int32_t vector_a1;
-  int32_t t1, t2, t3, t4;
-  int32_t vector_1, vector_2, vector_3, vector_4;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-
-                       :
-                       : [pos] "r"(pos));
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
-  __asm__ __volatile__(
-      "addi     %[out],     %[out],     32      \n\t"
-      "sra      %[a1],      %[out],     6       \n\t"
-
-      : [out] "+r"(out), [a1] "=r"(a1)
-      :);
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__(
-        "abs        %[absa1],       %[a1]       \n\t"
-        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
-
-        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
-        : [a1] "r"(a1));
-
-    for (r = 16; r--;) {
-      __asm__ __volatile__(
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
-            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
-            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
-            [dest] "+&r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__("replv.qb   %[vector_a1],   %[a1]   \n\t"
-
-                         : [vector_a1] "=r"(vector_a1)
-                         : [a1] "r"(a1));
-
-    for (r = 16; r--;) {
-      __asm__ __volatile__(
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
-            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
-            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
-            [dest] "+&r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  }
-}
-
-void iadst16_dspr2(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-
-  int x0 = input[15];
-  int x1 = input[0];
-  int x2 = input[13];
-  int x3 = input[2];
-  int x4 = input[11];
-  int x5 = input[4];
-  int x6 = input[9];
-  int x7 = input[6];
-  int x8 = input[7];
-  int x9 = input[8];
-  int x10 = input[5];
-  int x11 = input[10];
-  int x12 = input[3];
-  int x13 = input[12];
-  int x14 = input[1];
-  int x15 = input[14];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
-        x13 | x14 | x15)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = output[8] = output[9] = output[10] =
-            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
-  x0 = dct_const_round_shift(s0 + s8);
-  x1 = dct_const_round_shift(s1 + s9);
-  x2 = dct_const_round_shift(s2 + s10);
-  x3 = dct_const_round_shift(s3 + s11);
-  x4 = dct_const_round_shift(s4 + s12);
-  x5 = dct_const_round_shift(s5 + s13);
-  x6 = dct_const_round_shift(s6 + s14);
-  x7 = dct_const_round_shift(s7 + s15);
-  x8 = dct_const_round_shift(s0 - s8);
-  x9 = dct_const_round_shift(s1 - s9);
-  x10 = dct_const_round_shift(s2 - s10);
-  x11 = dct_const_round_shift(s3 - s11);
-  x12 = dct_const_round_shift(s4 - s12);
-  x13 = dct_const_round_shift(s5 - s13);
-  x14 = dct_const_round_shift(s6 - s14);
-  x15 = dct_const_round_shift(s7 - s15);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = s0 - s4;
-  x5 = s1 - s5;
-  x6 = s2 - s6;
-  x7 = s3 - s7;
-  x8 = dct_const_round_shift(s8 + s12);
-  x9 = dct_const_round_shift(s9 + s13);
-  x10 = dct_const_round_shift(s10 + s14);
-  x11 = dct_const_round_shift(s11 + s15);
-  x12 = dct_const_round_shift(s8 - s12);
-  x13 = dct_const_round_shift(s9 - s13);
-  x14 = dct_const_round_shift(s10 - s14);
-  x15 = dct_const_round_shift(s11 - s15);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
-  x8 = s8 + s10;
-  x9 = s9 + s11;
-  x10 = s8 - s10;
-  x11 = s9 - s11;
-  x12 = dct_const_round_shift(s12 + s14);
-  x13 = dct_const_round_shift(s13 + s15);
-  x14 = dct_const_round_shift(s12 - s14);
-  x15 = dct_const_round_shift(s13 - s15);
-
-  // stage 4
-  s2 = (-cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (-x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (-x10 + x11);
-  s14 = (-cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
-  x10 = dct_const_round_shift(s10);
-  x11 = dct_const_round_shift(s11);
-  x14 = dct_const_round_shift(s14);
-  x15 = dct_const_round_shift(s15);
-
-  output[0] = x0;
-  output[1] = -x8;
-  output[2] = x12;
-  output[3] = -x4;
-  output[4] = x6;
-  output[5] = x14;
-  output[6] = x10;
-  output[7] = x2;
-  output[8] = x3;
-  output[9] = x11;
-  output[10] = x15;
-  output[11] = x7;
-  output[12] = x5;
-  output[13] = -x13;
-  output[14] = x9;
-  output[15] = -x1;
-}
-
-#endif  // HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c
deleted file mode 100644
index d469d1ad0..000000000
--- a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c
+++ /dev/null
@@ -1,1042 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                   int dest_stride) {
-  int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
-  int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
-  int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
-  int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;
-  int16_t step1_27, step1_28, step1_29, step1_30, step1_31;
-  int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-  int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
-  int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
-  int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
-  int16_t step2_28, step2_29, step2_30, step2_31;
-  int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
-  int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
-  int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;
-  int16_t step3_28, step3_29, step3_30, step3_31;
-  int temp0, temp1, temp2, temp3;
-  int load1, load2, load3, load4;
-  int result1, result2;
-  int i, temp21;
-  uint8_t *dest_pix, *dest_pix1;
-  const int const_2_power_13 = 8192;
-  uint8_t *cm = aom_ff_cropTbl;
-
-  /* prefetch aom_ff_cropTbl */
-  prefetch_load(aom_ff_cropTbl);
-  prefetch_load(aom_ff_cropTbl + 32);
-  prefetch_load(aom_ff_cropTbl + 64);
-  prefetch_load(aom_ff_cropTbl + 96);
-  prefetch_load(aom_ff_cropTbl + 128);
-  prefetch_load(aom_ff_cropTbl + 160);
-  prefetch_load(aom_ff_cropTbl + 192);
-  prefetch_load(aom_ff_cropTbl + 224);
-
-  for (i = 0; i < 32; ++i) {
-    dest_pix = dest + i;
-    dest_pix1 = dest + i + 31 * dest_stride;
-
-    __asm__ __volatile__(
-        "lh       %[load1],             2(%[input])                     \n\t"
-        "lh       %[load2],             62(%[input])                    \n\t"
-        "lh       %[load3],             34(%[input])                    \n\t"
-        "lh       %[load4],             30(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-
-        "extp     %[step1_17],          $ac1,           31              \n\t"
-        "extp     %[step1_30],          $ac3,           31              \n\t"
-        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
-          [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
-          [step1_31] "=r"(step1_31)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
-          [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             18(%[input])                    \n\t"
-        "lh       %[load2],             46(%[input])                    \n\t"
-        "lh       %[load3],             50(%[input])                    \n\t"
-        "lh       %[load4],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-
-        "extp     %[step1_18],          $ac1,           31              \n\t"
-        "extp     %[step1_29],          $ac3,           31              \n\t"
-        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
-          [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
-          [step1_29] "=r"(step1_29)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
-          [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             54(%[input])                    \n\t"
-        "lh       %[load3],             42(%[input])                    \n\t"
-        "lh       %[load4],             22(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
-
-        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"
-        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"
-
-        "extp     %[step1_21],          $ac1,           31              \n\t"
-        "extp     %[step1_26],          $ac3,           31              \n\t"
-        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
-          [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
-          [step1_27] "=r"(step1_27)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
-          [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
-          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             26(%[input])                    \n\t"
-        "lh       %[load2],             38(%[input])                    \n\t"
-        "lh       %[load3],             58(%[input])                    \n\t"
-        "lh       %[load4],              6(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"
-        "extp     %[step1_22],          $ac1,           31              \n\t"
-        "extp     %[step1_25],          $ac3,           31              \n\t"
-        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
-          [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
-          [step1_25] "=r"(step1_25)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
-          [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
-          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],              4(%[input])                    \n\t"
-        "lh       %[load2],             60(%[input])                    \n\t"
-        "lh       %[load3],             36(%[input])                    \n\t"
-        "lh       %[load4],             28(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
-        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"
-        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
-          [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
-          [step2_15] "=r"(step2_15)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
-          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             20(%[input])                    \n\t"
-        "lh       %[load2],             44(%[input])                    \n\t"
-        "lh       %[load3],             52(%[input])                    \n\t"
-        "lh       %[load4],             12(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
-          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
-          [step2_13] "=r"(step2_13)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
-          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"
-        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"
-        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"
-        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"
-        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"
-        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"
-        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"
-        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"
-        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"
-        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"
-
-        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"
-        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"
-        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"
-        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"
-        "extp     %[step3_10],          $ac0,           31              \n\t"
-        "extp     %[step3_13],          $ac1,           31              \n\t"
-        "extp     %[step3_11],          $ac2,           31              \n\t"
-        "extp     %[step3_12],          $ac3,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
-          [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
-          [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
-          [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
-          [step3_15] "=r"(step3_15)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
-          [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
-          [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
-          [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
-          [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
-
-    step2_18 = step1_17 - step1_18;
-    step2_29 = step1_30 - step1_29;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_18],    %[cospi_8_64]   \n\t"
-        "madd     $ac0,                 %[step2_29],    %[cospi_24_64]  \n\t"
-        "extp     %[step3_18],          $ac0,           31              \n\t"
-
-        : [step3_18] "=r"(step3_18)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
-          [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
-    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step2_19 = step1_16 - step1_19;
-    step2_28 = step1_31 - step1_28;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_19],    %[cospi_8_64]   \n\t"
-        "madd     $ac0,                 %[step2_28],    %[cospi_24_64]  \n\t"
-        "extp     %[step3_19],          $ac0,           31              \n\t"
-
-        : [step3_19] "=r"(step3_19)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
-          [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
-    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step3_16 = step1_16 + step1_19;
-    step3_17 = step1_17 + step1_18;
-    step3_30 = step1_29 + step1_30;
-    step3_31 = step1_28 + step1_31;
-
-    step2_20 = step1_23 - step1_20;
-    step2_27 = step1_24 - step1_27;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_20],    %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[step2_27],    %[cospi_8_64]   \n\t"
-        "extp     %[step3_20],          $ac0,           31              \n\t"
-
-        : [step3_20] "=r"(step3_20)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
-          [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
-    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step2_21 = step1_22 - step1_21;
-    step2_26 = step1_25 - step1_26;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "msub     $ac1,                 %[step2_21],    %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[step2_26],    %[cospi_8_64]   \n\t"
-        "extp     %[step3_21],          $ac1,           31              \n\t"
-
-        : [step3_21] "=r"(step3_21)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
-          [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
-    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step3_22 = step1_21 + step1_22;
-    step3_23 = step1_20 + step1_23;
-    step3_24 = step1_24 + step1_27;
-    step3_25 = step1_25 + step1_26;
-
-    step2_16 = step3_16 + step3_23;
-    step2_17 = step3_17 + step3_22;
-    step2_18 = step3_18 + step3_21;
-    step2_19 = step3_19 + step3_20;
-    step2_20 = step3_19 - step3_20;
-    step2_21 = step3_18 - step3_21;
-    step2_22 = step3_17 - step3_22;
-    step2_23 = step3_16 - step3_23;
-
-    step2_24 = step3_31 - step3_24;
-    step2_25 = step3_30 - step3_25;
-    step2_26 = step3_29 - step3_26;
-    step2_27 = step3_28 - step3_27;
-    step2_28 = step3_28 + step3_27;
-    step2_29 = step3_29 + step3_26;
-    step2_30 = step3_30 + step3_25;
-    step2_31 = step3_31 + step3_24;
-
-    __asm__ __volatile__(
-        "lh       %[load1],             0(%[input])                     \n\t"
-        "lh       %[load2],             32(%[input])                    \n\t"
-        "lh       %[load3],             16(%[input])                    \n\t"
-        "lh       %[load4],             48(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[temp2],             $ac3,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[temp3],             $ac1,           31              \n\t"
-        "add      %[step1_0],           %[temp0],       %[temp3]        \n\t"
-        "add      %[step1_1],           %[temp1],       %[temp2]        \n\t"
-        "sub      %[step1_2],           %[temp1],       %[temp2]        \n\t"
-        "sub      %[step1_3],           %[temp0],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
-          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
-          [step1_3] "=r"(step1_3)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             8(%[input])                     \n\t"
-        "lh       %[load2],             56(%[input])                    \n\t"
-        "lh       %[load3],             40(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
-        "sub      %[load1],             %[load1],       %[temp0]        \n\t"
-        "add      %[load1],             %[load1],       %[temp1]        \n\t"
-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[load2],       %[temp2]        \n\t"
-        "add      %[load2],             %[load2],       %[temp3]        \n\t"
-        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
-          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
-          [step1_7] "=r"(step1_7)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    step2_0 = step1_0 + step1_7;
-    step2_1 = step1_1 + step1_6;
-    step2_2 = step1_2 + step1_5;
-    step2_3 = step1_3 + step1_4;
-    step2_4 = step1_3 - step1_4;
-    step2_5 = step1_2 - step1_5;
-    step2_6 = step1_1 - step1_6;
-    step2_7 = step1_0 - step1_7;
-
-    // stage 7
-    step1_0 = step2_0 + step3_15;
-    step1_1 = step2_1 + step3_14;
-    step1_2 = step2_2 + step3_13;
-    step1_3 = step2_3 + step3_12;
-    step1_4 = step2_4 + step3_11;
-    step1_5 = step2_5 + step3_10;
-    step1_6 = step2_6 + step3_9;
-    step1_7 = step2_7 + step3_8;
-    step1_8 = step2_7 - step3_8;
-    step1_9 = step2_6 - step3_9;
-    step1_10 = step2_5 - step3_10;
-    step1_11 = step2_4 - step3_11;
-    step1_12 = step2_3 - step3_12;
-    step1_13 = step2_2 - step3_13;
-    step1_14 = step2_1 - step3_14;
-    step1_15 = step2_0 - step3_15;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_20],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
-          [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_20 + step2_27) * cospi_16_64;
-    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_26],    %[step2_21]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_21],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
-          [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_21 + step2_26) * cospi_16_64;
-    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_22],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
-          [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_22 + step2_25) * cospi_16_64;
-    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_24],    %[step2_23]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_23],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
-          [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_23 + step2_24) * cospi_16_64;
-    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_0],         %[step2_31]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_1],         %[step2_30]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_2],         %[step2_29]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_3],         %[step2_28]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
-          [step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
-          [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
-          [step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
-          [step2_31] "r"(step2_31));
-
-    step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
-    step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
-    step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
-    step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
-          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
-          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_4],         %[step1_27]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_5],         %[step1_26]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_6],         %[step1_25]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_7],         %[step1_24]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4),
-          [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
-          [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
-          [step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
-          [step1_27] "r"(step1_27));
-
-    step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
-    step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
-    step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
-    step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
-          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
-          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_8],         %[step1_23]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_9],         %[step1_22]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_10],        %[step1_21]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_11],        %[step1_20]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8),
-          [step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
-          [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
-          [step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
-          [step1_23] "r"(step1_23));
-
-    step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
-    step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
-    step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
-    step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
-          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
-          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_12],        %[step2_19]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_13],        %[step2_18]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_14],        %[step2_17]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_15],        %[step2_16]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
-          [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
-          [step1_14] "r"(step1_14), [step1_15] "r"(step1_15),
-          [step2_16] "r"(step2_16), [step2_17] "r"(step2_17),
-          [step2_18] "r"(step2_18), [step2_19] "r"(step2_19));
-
-    step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
-    step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
-    step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
-    step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
-          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
-          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
-    input += 32;
-  }
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c
deleted file mode 100644
index fa7703217..000000000
--- a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c
+++ /dev/null
@@ -1,1030 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
-                              uint32_t no_rows) {
-  int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
-  int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
-  int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
-  int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
-  int16_t step1_28, step1_29, step1_30, step1_31;
-  int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-  int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
-  int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
-  int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
-  int16_t step2_28, step2_29, step2_30, step2_31;
-  int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
-  int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
-  int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
-  int16_t step3_29, step3_30, step3_31;
-  int temp0, temp1, temp2, temp3;
-  int load1, load2, load3, load4;
-  int result1, result2;
-  int temp21;
-  int i;
-  const int const_2_power_13 = 8192;
-  const int32_t *input_int;
-
-  for (i = no_rows; i--;) {
-    input_int = (const int32_t *)input;
-
-    if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
-          input_int[4] | input_int[5] | input_int[6] | input_int[7] |
-          input_int[8] | input_int[9] | input_int[10] | input_int[11] |
-          input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
-      input += 32;
-
-      __asm__ __volatile__(
-          "sh     $zero,     0(%[output])     \n\t"
-          "sh     $zero,    64(%[output])     \n\t"
-          "sh     $zero,   128(%[output])     \n\t"
-          "sh     $zero,   192(%[output])     \n\t"
-          "sh     $zero,   256(%[output])     \n\t"
-          "sh     $zero,   320(%[output])     \n\t"
-          "sh     $zero,   384(%[output])     \n\t"
-          "sh     $zero,   448(%[output])     \n\t"
-          "sh     $zero,   512(%[output])     \n\t"
-          "sh     $zero,   576(%[output])     \n\t"
-          "sh     $zero,   640(%[output])     \n\t"
-          "sh     $zero,   704(%[output])     \n\t"
-          "sh     $zero,   768(%[output])     \n\t"
-          "sh     $zero,   832(%[output])     \n\t"
-          "sh     $zero,   896(%[output])     \n\t"
-          "sh     $zero,   960(%[output])     \n\t"
-          "sh     $zero,  1024(%[output])     \n\t"
-          "sh     $zero,  1088(%[output])     \n\t"
-          "sh     $zero,  1152(%[output])     \n\t"
-          "sh     $zero,  1216(%[output])     \n\t"
-          "sh     $zero,  1280(%[output])     \n\t"
-          "sh     $zero,  1344(%[output])     \n\t"
-          "sh     $zero,  1408(%[output])     \n\t"
-          "sh     $zero,  1472(%[output])     \n\t"
-          "sh     $zero,  1536(%[output])     \n\t"
-          "sh     $zero,  1600(%[output])     \n\t"
-          "sh     $zero,  1664(%[output])     \n\t"
-          "sh     $zero,  1728(%[output])     \n\t"
-          "sh     $zero,  1792(%[output])     \n\t"
-          "sh     $zero,  1856(%[output])     \n\t"
-          "sh     $zero,  1920(%[output])     \n\t"
-          "sh     $zero,  1984(%[output])     \n\t"
-
-          :
-          : [output] "r"(output));
-
-      output += 1;
-
-      continue;
-    }
-
-    /* prefetch row */
-    prefetch_load((const uint8_t *)(input + 32));
-    prefetch_load((const uint8_t *)(input + 48));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             2(%[input])                     \n\t"
-        "lh       %[load2],             62(%[input])                    \n\t"
-        "lh       %[load3],             34(%[input])                    \n\t"
-        "lh       %[load4],             30(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-
-        "extp     %[step1_17],          $ac1,           31              \n\t"
-        "extp     %[step1_30],          $ac3,           31              \n\t"
-        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
-          [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
-          [step1_31] "=r"(step1_31)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
-          [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             18(%[input])                    \n\t"
-        "lh       %[load2],             46(%[input])                    \n\t"
-        "lh       %[load3],             50(%[input])                    \n\t"
-        "lh       %[load4],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-
-        "extp     %[step1_18],          $ac1,           31              \n\t"
-        "extp     %[step1_29],          $ac3,           31              \n\t"
-        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
-          [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
-          [step1_29] "=r"(step1_29)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
-          [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             54(%[input])                    \n\t"
-        "lh       %[load3],             42(%[input])                    \n\t"
-        "lh       %[load4],             22(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
-
-        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"
-        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"
-
-        "extp     %[step1_21],          $ac1,           31              \n\t"
-        "extp     %[step1_26],          $ac3,           31              \n\t"
-        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
-          [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
-          [step1_27] "=r"(step1_27)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
-          [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
-          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             26(%[input])                    \n\t"
-        "lh       %[load2],             38(%[input])                    \n\t"
-        "lh       %[load3],             58(%[input])                    \n\t"
-        "lh       %[load4],              6(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"
-
-        "extp     %[step1_22],          $ac1,           31              \n\t"
-        "extp     %[step1_25],          $ac3,           31              \n\t"
-        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
-          [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
-          [step1_25] "=r"(step1_25)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
-          [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
-          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],              4(%[input])                    \n\t"
-        "lh       %[load2],             60(%[input])                    \n\t"
-        "lh       %[load3],             36(%[input])                    \n\t"
-        "lh       %[load4],             28(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"
-        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
-          [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
-          [step2_15] "=r"(step2_15)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
-          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             20(%[input])                    \n\t"
-        "lh       %[load2],             44(%[input])                    \n\t"
-        "lh       %[load3],             52(%[input])                    \n\t"
-        "lh       %[load4],             12(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
-          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
-          [step2_13] "=r"(step2_13)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
-          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"
-        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"
-        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"
-        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"
-        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"
-        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"
-        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"
-        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"
-        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"
-        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"
-
-        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"
-        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"
-        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"
-        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"
-
-        "extp     %[step3_10],          $ac0,           31              \n\t"
-        "extp     %[step3_13],          $ac1,           31              \n\t"
-        "extp     %[step3_11],          $ac2,           31              \n\t"
-        "extp     %[step3_12],          $ac3,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
-          [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
-          [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
-          [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
-          [step3_15] "=r"(step3_15)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
-          [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
-          [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
-          [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
-          [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
-
-    step2_18 = step1_17 - step1_18;
-    step2_29 = step1_30 - step1_29;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_18],    %[cospi_8_64]   \n\t"
-        "madd     $ac0,                 %[step2_29],    %[cospi_24_64]  \n\t"
-        "extp     %[step3_18],          $ac0,           31              \n\t"
-
-        : [step3_18] "=r"(step3_18)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
-          [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
-    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step2_19 = step1_16 - step1_19;
-    step2_28 = step1_31 - step1_28;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_19],    %[cospi_8_64]   \n\t"
-        "madd     $ac0,                 %[step2_28],    %[cospi_24_64]  \n\t"
-        "extp     %[step3_19],          $ac0,           31              \n\t"
-
-        : [step3_19] "=r"(step3_19)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
-          [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
-    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step3_16 = step1_16 + step1_19;
-    step3_17 = step1_17 + step1_18;
-    step3_30 = step1_29 + step1_30;
-    step3_31 = step1_28 + step1_31;
-
-    step2_20 = step1_23 - step1_20;
-    step2_27 = step1_24 - step1_27;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_20],    %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[step2_27],    %[cospi_8_64]   \n\t"
-        "extp     %[step3_20],          $ac0,           31              \n\t"
-
-        : [step3_20] "=r"(step3_20)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
-          [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
-    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step2_21 = step1_22 - step1_21;
-    step2_26 = step1_25 - step1_26;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "msub     $ac1,                 %[step2_21],    %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[step2_26],    %[cospi_8_64]   \n\t"
-        "extp     %[step3_21],          $ac1,           31              \n\t"
-
-        : [step3_21] "=r"(step3_21)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
-          [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
-    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step3_22 = step1_21 + step1_22;
-    step3_23 = step1_20 + step1_23;
-    step3_24 = step1_24 + step1_27;
-    step3_25 = step1_25 + step1_26;
-
-    step2_16 = step3_16 + step3_23;
-    step2_17 = step3_17 + step3_22;
-    step2_18 = step3_18 + step3_21;
-    step2_19 = step3_19 + step3_20;
-    step2_20 = step3_19 - step3_20;
-    step2_21 = step3_18 - step3_21;
-    step2_22 = step3_17 - step3_22;
-    step2_23 = step3_16 - step3_23;
-
-    step2_24 = step3_31 - step3_24;
-    step2_25 = step3_30 - step3_25;
-    step2_26 = step3_29 - step3_26;
-    step2_27 = step3_28 - step3_27;
-    step2_28 = step3_28 + step3_27;
-    step2_29 = step3_29 + step3_26;
-    step2_30 = step3_30 + step3_25;
-    step2_31 = step3_31 + step3_24;
-
-    __asm__ __volatile__(
-        "lh       %[load1],             0(%[input])                     \n\t"
-        "lh       %[load2],             32(%[input])                    \n\t"
-        "lh       %[load3],             16(%[input])                    \n\t"
-        "lh       %[load4],             48(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[temp2],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[temp3],             $ac1,           31              \n\t"
-
-        "add      %[step1_0],          %[temp0],        %[temp3]        \n\t"
-        "add      %[step1_1],          %[temp1],        %[temp2]        \n\t"
-        "sub      %[step1_2],          %[temp1],        %[temp2]        \n\t"
-        "sub      %[step1_3],          %[temp0],        %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
-          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
-          [step1_3] "=r"(step1_3)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64)
-
-            );
-
-    __asm__ __volatile__(
-        "lh       %[load1],             8(%[input])                     \n\t"
-        "lh       %[load2],             56(%[input])                    \n\t"
-        "lh       %[load3],             40(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
-        "sub      %[load1],             %[load1],       %[temp0]        \n\t"
-        "add      %[load1],             %[load1],       %[temp1]        \n\t"
-
-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[load2],       %[temp2]        \n\t"
-        "add      %[load2],             %[load2],       %[temp3]        \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
-          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
-          [step1_7] "=r"(step1_7)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    step2_0 = step1_0 + step1_7;
-    step2_1 = step1_1 + step1_6;
-    step2_2 = step1_2 + step1_5;
-    step2_3 = step1_3 + step1_4;
-    step2_4 = step1_3 - step1_4;
-    step2_5 = step1_2 - step1_5;
-    step2_6 = step1_1 - step1_6;
-    step2_7 = step1_0 - step1_7;
-
-    step1_0 = step2_0 + step3_15;
-    step1_1 = step2_1 + step3_14;
-    step1_2 = step2_2 + step3_13;
-    step1_3 = step2_3 + step3_12;
-    step1_4 = step2_4 + step3_11;
-    step1_5 = step2_5 + step3_10;
-    step1_6 = step2_6 + step3_9;
-    step1_7 = step2_7 + step3_8;
-    step1_8 = step2_7 - step3_8;
-    step1_9 = step2_6 - step3_9;
-    step1_10 = step2_5 - step3_10;
-    step1_11 = step2_4 - step3_11;
-    step1_12 = step2_3 - step3_12;
-    step1_13 = step2_2 - step3_13;
-    step1_14 = step2_1 - step3_14;
-    step1_15 = step2_0 - step3_15;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_20],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
-          [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_20 + step2_27) * cospi_16_64;
-    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_26],    %[step2_21]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_21],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
-          [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_21 + step2_26) * cospi_16_64;
-    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_22],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
-          [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_22 + step2_25) * cospi_16_64;
-    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_24],    %[step2_23]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_23],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
-          [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_23 + step2_24) * cospi_16_64;
-    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    // final stage
-    output[0 * 32] = step1_0 + step2_31;
-    output[1 * 32] = step1_1 + step2_30;
-    output[2 * 32] = step1_2 + step2_29;
-    output[3 * 32] = step1_3 + step2_28;
-    output[4 * 32] = step1_4 + step1_27;
-    output[5 * 32] = step1_5 + step1_26;
-    output[6 * 32] = step1_6 + step1_25;
-    output[7 * 32] = step1_7 + step1_24;
-    output[8 * 32] = step1_8 + step1_23;
-    output[9 * 32] = step1_9 + step1_22;
-    output[10 * 32] = step1_10 + step1_21;
-    output[11 * 32] = step1_11 + step1_20;
-    output[12 * 32] = step1_12 + step2_19;
-    output[13 * 32] = step1_13 + step2_18;
-    output[14 * 32] = step1_14 + step2_17;
-    output[15 * 32] = step1_15 + step2_16;
-    output[16 * 32] = step1_15 - step2_16;
-    output[17 * 32] = step1_14 - step2_17;
-    output[18 * 32] = step1_13 - step2_18;
-    output[19 * 32] = step1_12 - step2_19;
-    output[20 * 32] = step1_11 - step1_20;
-    output[21 * 32] = step1_10 - step1_21;
-    output[22 * 32] = step1_9 - step1_22;
-    output[23 * 32] = step1_8 - step1_23;
-    output[24 * 32] = step1_7 - step1_24;
-    output[25 * 32] = step1_6 - step1_25;
-    output[26 * 32] = step1_5 - step1_26;
-    output[27 * 32] = step1_4 - step1_27;
-    output[28 * 32] = step1_3 - step2_28;
-    output[29 * 32] = step1_2 - step2_29;
-    output[30 * 32] = step1_1 - step2_30;
-    output[31 * 32] = step1_0 - step2_31;
-
-    input += 32;
-    output += 1;
-  }
-}
-
-void aom_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
-                                  int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  // Rows
-  idct32_rows_dspr2(input, outptr, 32);
-
-  // Columns
-  aom_idct32_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-
-void aom_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
-                                int stride) {
-  DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
-  int16_t *outptr = out;
-  uint32_t i;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  // Rows
-  idct32_rows_dspr2(input, outptr, 8);
-
-  outptr += 8;
-  __asm__ __volatile__(
-      "sw     $zero,      0(%[outptr])     \n\t"
-      "sw     $zero,      4(%[outptr])     \n\t"
-      "sw     $zero,      8(%[outptr])     \n\t"
-      "sw     $zero,     12(%[outptr])     \n\t"
-      "sw     $zero,     16(%[outptr])     \n\t"
-      "sw     $zero,     20(%[outptr])     \n\t"
-      "sw     $zero,     24(%[outptr])     \n\t"
-      "sw     $zero,     28(%[outptr])     \n\t"
-      "sw     $zero,     32(%[outptr])     \n\t"
-      "sw     $zero,     36(%[outptr])     \n\t"
-      "sw     $zero,     40(%[outptr])     \n\t"
-      "sw     $zero,     44(%[outptr])     \n\t"
-
-      :
-      : [outptr] "r"(outptr));
-
-  for (i = 0; i < 31; ++i) {
-    outptr += 32;
-
-    __asm__ __volatile__(
-        "sw     $zero,      0(%[outptr])     \n\t"
-        "sw     $zero,      4(%[outptr])     \n\t"
-        "sw     $zero,      8(%[outptr])     \n\t"
-        "sw     $zero,     12(%[outptr])     \n\t"
-        "sw     $zero,     16(%[outptr])     \n\t"
-        "sw     $zero,     20(%[outptr])     \n\t"
-        "sw     $zero,     24(%[outptr])     \n\t"
-        "sw     $zero,     28(%[outptr])     \n\t"
-        "sw     $zero,     32(%[outptr])     \n\t"
-        "sw     $zero,     36(%[outptr])     \n\t"
-        "sw     $zero,     40(%[outptr])     \n\t"
-        "sw     $zero,     44(%[outptr])     \n\t"
-
-        :
-        : [outptr] "r"(outptr));
-  }
-
-  // Columns
-  aom_idct32_cols_add_blk_dspr2(out, dest, stride);
-}
-
-void aom_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                               int stride) {
-  int r, out;
-  int32_t a1, absa1;
-  int32_t vector_a1;
-  int32_t t1, t2, t3, t4;
-  int32_t vector_1, vector_2, vector_3, vector_4;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-
-                       :
-                       : [pos] "r"(pos));
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
-  __asm__ __volatile__(
-      "addi     %[out],    %[out],    32      \n\t"
-      "sra      %[a1],     %[out],    6       \n\t"
-
-      : [out] "+r"(out), [a1] "=r"(a1)
-      :);
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__(
-        "abs        %[absa1],     %[a1]         \n\t"
-        "replv.qb   %[vector_a1], %[absa1]      \n\t"
-
-        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
-        : [a1] "r"(a1));
-
-    for (r = 32; r--;) {
-      __asm__ __volatile__(
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-
-          "lw             %[t1],          16(%[dest])                     \n\t"
-          "lw             %[t2],          20(%[dest])                     \n\t"
-          "lw             %[t3],          24(%[dest])                     \n\t"
-          "lw             %[t4],          28(%[dest])                     \n\t"
-          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    16(%[dest])                     \n\t"
-          "sw             %[vector_2],    20(%[dest])                     \n\t"
-          "sw             %[vector_3],    24(%[dest])                     \n\t"
-          "sw             %[vector_4],    28(%[dest])                     \n\t"
-
-          "add            %[dest],        %[dest],        %[stride]       \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
-            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
-            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
-            [dest] "+&r"(dest)
-          : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__("replv.qb       %[vector_a1],   %[a1]     \n\t"
-
-                         : [vector_a1] "=r"(vector_a1)
-                         : [a1] "r"(a1));
-
-    for (r = 32; r--;) {
-      __asm__ __volatile__(
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-
-          "lw             %[t1],          16(%[dest])                     \n\t"
-          "lw             %[t2],          20(%[dest])                     \n\t"
-          "lw             %[t3],          24(%[dest])                     \n\t"
-          "lw             %[t4],          28(%[dest])                     \n\t"
-          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    16(%[dest])                     \n\t"
-          "sw             %[vector_2],    20(%[dest])                     \n\t"
-          "sw             %[vector_3],    24(%[dest])                     \n\t"
-          "sw             %[vector_4],    28(%[dest])                     \n\t"
-
-          "add            %[dest],        %[dest],        %[stride]       \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
-            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
-            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
-            [dest] "+&r"(dest)
-          : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
-    }
-  }
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c
deleted file mode 100644
index e6d0367cd..000000000
--- a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c
+++ /dev/null
@@ -1,342 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
-  int16_t step_0, step_1, step_2, step_3;
-  int Temp0, Temp1, Temp2, Temp3;
-  const int const_2_power_13 = 8192;
-  int i;
-
-  for (i = 4; i--;) {
-    __asm__ __volatile__(
-        /*
-          temp_1 = (input[0] + input[2]) * cospi_16_64;
-          step_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[2]) * cospi_16_64;
-          step_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             4(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "extp     %[step_0],            $ac0,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "extp     %[step_1],            $ac1,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        /*
-          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-          step_2 = dct_const_round_shift(temp1);
-        */
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "extp     %[step_2],            $ac0,           31              \n\t"
-
-        /*
-          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-          step_3 = dct_const_round_shift(temp2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[step_3],            $ac1,           31              \n\t"
-
-        /*
-          output[0]  = step_0 + step_3;
-          output[4]  = step_1 + step_2;
-          output[8]  = step_1 - step_2;
-          output[12] = step_0 - step_3;
-        */
-        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "sh       %[Temp0],             0(%[output])                    \n\t"
-
-        "add      %[Temp1],             %[step_1],      %[step_2]       \n\t"
-        "sh       %[Temp1],             8(%[output])                    \n\t"
-
-        "sub      %[Temp2],             %[step_1],      %[step_2]       \n\t"
-        "sh       %[Temp2],             16(%[output])                   \n\t"
-
-        "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"
-        "sh       %[Temp3],             24(%[output])                   \n\t"
-
-        : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
-          [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output)
-        : [const_2_power_13] "r"(const_2_power_13),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
-          [cospi_24_64] "r"(cospi_24_64), [input] "r"(input));
-
-    input += 4;
-    output += 1;
-  }
-}
-
-void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                     int dest_stride) {
-  int16_t step_0, step_1, step_2, step_3;
-  int Temp0, Temp1, Temp2, Temp3;
-  const int const_2_power_13 = 8192;
-  int i;
-  uint8_t *dest_pix;
-  uint8_t *cm = aom_ff_cropTbl;
-
-  /* prefetch aom_ff_cropTbl */
-  prefetch_load(aom_ff_cropTbl);
-  prefetch_load(aom_ff_cropTbl + 32);
-  prefetch_load(aom_ff_cropTbl + 64);
-  prefetch_load(aom_ff_cropTbl + 96);
-  prefetch_load(aom_ff_cropTbl + 128);
-  prefetch_load(aom_ff_cropTbl + 160);
-  prefetch_load(aom_ff_cropTbl + 192);
-  prefetch_load(aom_ff_cropTbl + 224);
-
-  for (i = 0; i < 4; ++i) {
-    dest_pix = (dest + i);
-
-    __asm__ __volatile__(
-        /*
-          temp_1 = (input[0] + input[2]) * cospi_16_64;
-          step_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[2]) * cospi_16_64;
-          step_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             4(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "extp     %[step_0],            $ac0,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "extp     %[step_1],            $ac1,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        /*
-          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-          step_2 = dct_const_round_shift(temp1);
-        */
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "extp     %[step_2],            $ac0,           31              \n\t"
-
-        /*
-          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-          step_3 = dct_const_round_shift(temp2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[step_3],            $ac1,           31              \n\t"
-
-        /*
-          output[0]  = step_0 + step_3;
-          output[4]  = step_1 + step_2;
-          output[8]  = step_1 - step_2;
-          output[12] = step_0 - step_3;
-        */
-        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-
-        : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
-          [step_2] "=&r"(step_2), [step_3] "=&r"(step_3),
-          [dest_pix] "+r"(dest_pix)
-        : [const_2_power_13] "r"(const_2_power_13),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
-          [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
-          [dest_stride] "r"(dest_stride));
-
-    input += 4;
-  }
-}
-
-void aom_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  // Rows
-  aom_idct4_rows_dspr2(input, outptr);
-
-  // Columns
-  aom_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void aom_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                             int dest_stride) {
-  int a1, absa1;
-  int r;
-  int32_t out;
-  int t2, vector_a1, vector_a;
-  uint32_t pos = 45;
-  int16_t input_dc = input[0];
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-
-                       :
-                       : [pos] "r"(pos));
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
-  __asm__ __volatile__(
-      "addi     %[out],     %[out],    8       \n\t"
-      "sra      %[a1],      %[out],    4       \n\t"
-
-      : [out] "+r"(out), [a1] "=r"(a1)
-      :);
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__(
-        "abs        %[absa1],     %[a1]         \n\t"
-        "replv.qb   %[vector_a1], %[absa1]      \n\t"
-
-        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
-        : [a1] "r"(a1));
-
-    for (r = 4; r--;) {
-      __asm__ __volatile__(
-          "lw             %[t2],          0(%[dest])                      \n\t"
-          "subu_s.qb      %[vector_a],    %[t2],          %[vector_a1]    \n\t"
-          "sw             %[vector_a],    0(%[dest])                      \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__("replv.qb       %[vector_a1],   %[a1]     \n\t"
-                         : [vector_a1] "=r"(vector_a1)
-                         : [a1] "r"(a1));
-
-    for (r = 4; r--;) {
-      __asm__ __volatile__(
-          "lw           %[t2],          0(%[dest])                        \n\t"
-          "addu_s.qb    %[vector_a],    %[t2],            %[vector_a1]    \n\t"
-          "sw           %[vector_a],    0(%[dest])                        \n\t"
-          "add          %[dest],        %[dest],          %[dest_stride]  \n\t"
-
-          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  }
-}
-
-void iadst4_dspr2(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-  int x0, x1, x2, x3;
-
-  x0 = input[0];
-  x1 = input[1];
-  x2 = input[2];
-  x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = x0 - x2 + x3;
-
-  x0 = s0 + s3 + s5;
-  x1 = s1 - s4 - s6;
-  x2 = sinpi_3_9 * s7;
-  x3 = s2;
-
-  s0 = x0 + x3;
-  s1 = x1 + x3;
-  s2 = x2;
-  s3 = x0 + x1 - x3;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = dct_const_round_shift(s0);
-  output[1] = dct_const_round_shift(s1);
-  output[2] = dct_const_round_shift(s2);
-  output[3] = dct_const_round_shift(s3);
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c
deleted file mode 100644
index 0a20f76f2..000000000
--- a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c
+++ /dev/null
@@ -1,645 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  const int const_2_power_13 = 8192;
-  int Temp0, Temp1, Temp2, Temp3, Temp4;
-  int i;
-
-  for (i = no_rows; i--;) {
-    __asm__ __volatile__(
-        /*
-          temp_1 = (input[0] + input[4]) * cospi_16_64;
-          step2_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[4]) * cospi_16_64;
-          step2_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             8(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "extp     %[Temp4],             $ac0,           31              \n\t"
-
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "extp     %[Temp2],             $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
-          step2_2 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             4(%[input])                     \n\t"
-        "lh       %[Temp1],             12(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "extp     %[Temp3],             $ac0,           31              \n\t"
-
-        /*
-          step1_1 = step2_1 + step2_2;
-          step1_2 = step2_1 - step2_2;
-        */
-        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
-        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
-
-        /*
-          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
-          step2_3 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[Temp1],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        /*
-          step1_0 = step2_0 + step2_3;
-          step1_3 = step2_0 - step2_3;
-        */
-        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
-        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"
-
-        /*
-          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-          step1_4 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp1],             14(%[input])                    \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
-        "extp     %[step1_4],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-          step1_7 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
-        "extp     %[step1_7],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-          step1_5 = dct_const_round_shift(temp_1);
-        */
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-          step1_6 = dct_const_round_shift(temp_2);
-        */
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
-          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
-        */
-        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
-        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
-        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
-        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
-
-        /*
-          step1_4 = step1_4 + step1_5;
-          step1_7 = step1_6 + step1_7;
-        */
-        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
-        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
-
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "sh       %[Temp0],             0(%[output])                    \n\t"
-        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
-        "sh       %[Temp1],             16(%[output])                   \n\t"
-        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "sh       %[Temp0],             32(%[output])                   \n\t"
-        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
-        "sh       %[Temp1],             48(%[output])                   \n\t"
-
-        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "sh       %[Temp0],             64(%[output])                   \n\t"
-        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
-        "sh       %[Temp1],             80(%[output])                   \n\t"
-        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "sh       %[Temp0],             96(%[output])                   \n\t"
-        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
-        "sh       %[Temp1],             112(%[output])                  \n\t"
-
-        : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
-          [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
-          [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
-          [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
-          [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
-        : [const_2_power_13] "r"(const_2_power_13),
-          [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
-          [cospi_24_64] "r"(cospi_24_64), [output] "r"(output),
-          [input] "r"(input));
-
-    input += 8;
-    output += 1;
-  }
-}
-
-void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                 int dest_stride) {
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int Temp0, Temp1, Temp2, Temp3;
-  int i;
-  const int const_2_power_13 = 8192;
-  uint8_t *dest_pix;
-  uint8_t *cm = aom_ff_cropTbl;
-
-  /* prefetch aom_ff_cropTbl */
-  prefetch_load(aom_ff_cropTbl);
-  prefetch_load(aom_ff_cropTbl + 32);
-  prefetch_load(aom_ff_cropTbl + 64);
-  prefetch_load(aom_ff_cropTbl + 96);
-  prefetch_load(aom_ff_cropTbl + 128);
-  prefetch_load(aom_ff_cropTbl + 160);
-  prefetch_load(aom_ff_cropTbl + 192);
-  prefetch_load(aom_ff_cropTbl + 224);
-
-  for (i = 0; i < 8; ++i) {
-    dest_pix = (dest + i);
-
-    __asm__ __volatile__(
-        /*
-          temp_1 = (input[0] + input[4]) * cospi_16_64;
-          step2_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[4]) * cospi_16_64;
-          step2_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             8(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_6],           $ac0,           31              \n\t"
-
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "extp     %[Temp2],             $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
-          step2_2 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             4(%[input])                     \n\t"
-        "lh       %[Temp1],             12(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "extp     %[Temp3],             $ac0,           31              \n\t"
-
-        /*
-          step1_1 = step2_1 + step2_2;
-          step1_2 = step2_1 - step2_2;
-        */
-        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
-        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
-
-        /*
-          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
-          step2_3 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[Temp1],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        /*
-          step1_0 = step2_0 + step2_3;
-          step1_3 = step2_0 - step2_3;
-        */
-        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
-        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"
-
-        /*
-          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-          step1_4 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp1],             14(%[input])                    \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
-        "extp     %[step1_4],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-          step1_7 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
-        "extp     %[step1_7],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-          step1_5 = dct_const_round_shift(temp_1);
-        */
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-          step1_6 = dct_const_round_shift(temp_2);
-        */
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
-          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
-        */
-        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
-        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
-        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
-        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
-
-        /*
-          step1_4 = step1_4 + step1_5;
-          step1_7 = step1_6 + step1_7;
-        */
-        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
-        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
-
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /* add block */
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-
-        : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
-          [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
-          [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
-          [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
-          [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix)
-        : [const_2_power_13] "r"(const_2_power_13),
-          [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
-          [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
-          [dest_stride] "r"(dest_stride));
-
-    input += 8;
-  }
-}
-
-void aom_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
-
-  // First transform rows
-  idct8_rows_dspr2(input, outptr, 8);
-
-  // Then transform columns and add to dest
-  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void aom_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
-
-  // First transform rows
-  idct8_rows_dspr2(input, outptr, 4);
-
-  outptr += 4;
-
-  __asm__ __volatile__(
-      "sw  $zero,   0(%[outptr])  \n\t"
-      "sw  $zero,   4(%[outptr])  \n\t"
-      "sw  $zero,  16(%[outptr])  \n\t"
-      "sw  $zero,  20(%[outptr])  \n\t"
-      "sw  $zero,  32(%[outptr])  \n\t"
-      "sw  $zero,  36(%[outptr])  \n\t"
-      "sw  $zero,  48(%[outptr])  \n\t"
-      "sw  $zero,  52(%[outptr])  \n\t"
-      "sw  $zero,  64(%[outptr])  \n\t"
-      "sw  $zero,  68(%[outptr])  \n\t"
-      "sw  $zero,  80(%[outptr])  \n\t"
-      "sw  $zero,  84(%[outptr])  \n\t"
-      "sw  $zero,  96(%[outptr])  \n\t"
-      "sw  $zero, 100(%[outptr])  \n\t"
-      "sw  $zero, 112(%[outptr])  \n\t"
-      "sw  $zero, 116(%[outptr])  \n\t"
-
-      :
-      : [outptr] "r"(outptr));
-
-  // Then transform columns and add to dest
-  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void aom_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                             int dest_stride) {
-  uint32_t pos = 45;
-  int32_t out;
-  int32_t r;
-  int32_t a1, absa1;
-  int32_t t1, t2, vector_a1, vector_1, vector_2;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-
-                       :
-                       : [pos] "r"(pos));
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
-  __asm__ __volatile__(
-      "addi     %[out],     %[out],     16      \n\t"
-      "sra      %[a1],      %[out],     5       \n\t"
-
-      : [out] "+r"(out), [a1] "=r"(a1)
-      :);
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__(
-        "abs        %[absa1],       %[a1]       \n\t"
-        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
-
-        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
-        : [a1] "r"(a1));
-
-    for (r = 8; r--;) {
-      __asm__ __volatile__(
-          "lw           %[t1],          0(%[dest])                      \n\t"
-          "lw           %[t2],          4(%[dest])                      \n\t"
-          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "sw           %[vector_1],    0(%[dest])                      \n\t"
-          "sw           %[vector_2],    4(%[dest])                      \n\t"
-          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
-            [vector_2] "=&r"(vector_2), [dest] "+&r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__("replv.qb   %[vector_a1],   %[a1]   \n\t"
-
-                         : [vector_a1] "=r"(vector_a1)
-                         : [a1] "r"(a1));
-
-    for (r = 8; r--;) {
-      __asm__ __volatile__(
-          "lw           %[t1],          0(%[dest])                      \n\t"
-          "lw           %[t2],          4(%[dest])                      \n\t"
-          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "sw           %[vector_1],    0(%[dest])                      \n\t"
-          "sw           %[vector_2],    4(%[dest])                      \n\t"
-          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
-            [vector_2] "=&r"(vector_2), [dest] "+r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  }
-}
-
-void iadst8_dspr2(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-  int x0, x1, x2, x3, x4, x5, x6, x7;
-
-  x0 = input[7];
-  x1 = input[0];
-  x2 = input[5];
-  x3 = input[2];
-  x4 = input[3];
-  x5 = input[4];
-  x6 = input[1];
-  x7 = input[6];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
-  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
-
-  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
-  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
-  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
-  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
-  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
-  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
-  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
-  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
-  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
-  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
-  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
-
-  output[0] = x0;
-  output[1] = -x4;
-  output[2] = x6;
-  output[3] = -x2;
-  output[4] = x3;
-  output[5] = -x7;
-  output[6] = x5;
-  output[7] = -x1;
-}
-#endif  // HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
index fc0c32ce3..38a10e9b2 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
@@ -404,10 +404,11 @@ void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
   }
 }
 
-void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
-                                    const uint8_t *b_limit_ptr,
-                                    const uint8_t *limit_ptr,
-                                    const uint8_t *thresh_ptr, int32_t count) {
+static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
+                                        const uint8_t *b_limit_ptr,
+                                        const uint8_t *limit_ptr,
+                                        const uint8_t *thresh_ptr,
+                                        int32_t count) {
   DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
   uint8_t early_exit = 0;
 
@@ -639,19 +640,19 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
       }
     }
   } else {
-    aom_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
-                                   thresh_ptr, count);
+    mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
+                                count);
   }
 }
 
-void aom_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch,
-                                   const uint8_t *b_limit_ptr,
-                                   const uint8_t *limit_ptr,
-                                   const uint8_t *thresh_ptr) {
+void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
+                               const uint8_t *b_limit_ptr,
+                               const uint8_t *limit_ptr,
+                               const uint8_t *thresh_ptr) {
   mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
 }
 
-void aom_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch,
+void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                     const uint8_t *b_limit_ptr,
                                     const uint8_t *limit_ptr,
                                     const uint8_t *thresh_ptr) {
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
index 883d0523d..8c41278be 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
@@ -11,7 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
index 72df09823..3e38ef3fb 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
@@ -14,7 +14,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
index 3e6994714..cb599cf2e 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
@@ -14,7 +14,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
 
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
index 8db3e521f..6db1dac08 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
@@ -14,7 +14,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
 
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
index a3b5a9eb1..b67ccfe9d 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
@@ -11,7 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
index 8d2fd69f7..34733e42e 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -11,7 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
@@ -718,14 +719,13 @@ static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
   }
 }
 
-void aom_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch,
-                                     const uint8_t *blimit,
-                                     const uint8_t *limit,
-                                     const uint8_t *thresh) {
+void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch,
+                                 const uint8_t *blimit, const uint8_t *limit,
+                                 const uint8_t *thresh) {
   mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
 }
 
-void aom_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch,
+void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh) {
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
index 28528869b..3d3f1ec97 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
@@ -11,7 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h
index 48fbcfd47..eb919d42b 100644
--- a/third_party/aom/aom_dsp/mips/macros_msa.h
+++ b/third_party/aom/aom_dsp/mips/macros_msa.h
@@ -14,7 +14,8 @@
 
 #include <msa.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c
index 258eb5c07..58cdd80d9 100644
--- a/third_party/aom/aom_dsp/mips/sad_msa.c
+++ b/third_party/aom/aom_dsp/mips/sad_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 
 #define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
@@ -160,640 +161,6 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
   return sad;
 }
 
-static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v16u8 ref0, ref1, ref2, ref3, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *ref, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
-    ref += (4 * ref_stride);
-    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
-                ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src, ref, ref0, ref1, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = height >> 1; ht_cnt--;) {
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
-    ref += ref_stride;
-
-    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
-    ref += ref_stride;
-
-    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
-  v8u16 sad0_0 = { 0 };
-  v8u16 sad0_1 = { 0 };
-  v8u16 sad1_0 = { 0 };
-  v8u16 sad1_1 = { 0 };
-  v8u16 sad2_0 = { 0 };
-  v8u16 sad2_1 = { 0 };
-  v4u32 sad;
-
-  for (ht_cnt = height; ht_cnt--;) {
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
-    ref0_4 = LD_UB(ref + 64);
-    ref += ref_stride;
-
-    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
-    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
-    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-  }
-
-  sad = __msa_hadd_u_w(sad0_0, sad0_0);
-  sad += __msa_hadd_u_w(sad0_1, sad0_1);
-  sad_array[0] = HADD_SW_S32((v4i32)sad);
-
-  sad = __msa_hadd_u_w(sad1_0, sad1_0);
-  sad += __msa_hadd_u_w(sad1_1, sad1_1);
-  sad_array[1] = HADD_SW_S32((v4i32)sad);
-
-  sad = __msa_hadd_u_w(sad2_0, sad2_0);
-  sad += __msa_hadd_u_w(sad2_1, sad2_1);
-  sad_array[2] = HADD_SW_S32((v4i32)sad);
-}
-
-static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3, diff;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    src_ptr += (4 * src_stride);
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *ref, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
-    ref += (4 * ref_stride);
-    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
-                ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src, ref0, ref1, ref;
-  v16u8 diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1;
-  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = height; ht_cnt--;) {
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
-    ref += ref_stride;
-
-    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
-    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
-    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
-    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
-    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
-    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  const uint8_t *src_dup, *ref_dup;
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
-  v16u8 ref0, ref1, ref2, ref3;
-  v8u16 sad0_0 = { 0 };
-  v8u16 sad0_1 = { 0 };
-  v8u16 sad1_0 = { 0 };
-  v8u16 sad1_1 = { 0 };
-  v8u16 sad2_0 = { 0 };
-  v8u16 sad2_1 = { 0 };
-  v8u16 sad3_0 = { 0 };
-  v8u16 sad3_1 = { 0 };
-  v4u32 sad;
-
-  src_dup = src;
-  ref_dup = ref;
-
-  for (ht_cnt = height; ht_cnt--;) {
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
-    ref += ref_stride;
-
-    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
-    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
-    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
-    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-  }
-
-  sad = __msa_hadd_u_w(sad0_0, sad0_0);
-  sad += __msa_hadd_u_w(sad0_1, sad0_1);
-  sad_array[0] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad1_0, sad1_0);
-  sad += __msa_hadd_u_w(sad1_1, sad1_1);
-  sad_array[1] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad2_0, sad2_0);
-  sad += __msa_hadd_u_w(sad2_1, sad2_1);
-  sad_array[2] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad3_0, sad3_0);
-  sad += __msa_hadd_u_w(sad3_1, sad3_1);
-  sad_array[3] = HADD_SW_S32(sad);
-
-  sad0_0 = (v8u16)__msa_ldi_h(0);
-  sad0_1 = (v8u16)__msa_ldi_h(0);
-  sad1_0 = (v8u16)__msa_ldi_h(0);
-  sad1_1 = (v8u16)__msa_ldi_h(0);
-  sad2_0 = (v8u16)__msa_ldi_h(0);
-  sad2_1 = (v8u16)__msa_ldi_h(0);
-  sad3_0 = (v8u16)__msa_ldi_h(0);
-  sad3_1 = (v8u16)__msa_ldi_h(0);
-
-  for (ht_cnt = 64; ht_cnt--;) {
-    LD_UB4(src_dup, 16, src0, src1, src2, src3);
-    src_dup += src_stride;
-    LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
-    ref_dup += ref_stride;
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
-    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
-    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
-    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
-    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-  }
-
-  sad = __msa_hadd_u_w(sad0_0, sad0_0);
-  sad += __msa_hadd_u_w(sad0_1, sad0_1);
-  sad_array[4] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad1_0, sad1_0);
-  sad += __msa_hadd_u_w(sad1_1, sad1_1);
-  sad_array[5] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad2_0, sad2_0);
-  sad += __msa_hadd_u_w(sad2_1, sad2_1);
-  sad_array[6] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad3_0, sad3_0);
-  sad += __msa_hadd_u_w(sad3_1, sad3_1);
-  sad_array[7] = HADD_SW_S32(sad);
-}
-
 static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
@@ -1290,76 +657,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
     return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
   }
 
-#define AOM_SAD_4xHEIGHTx3_MSA(height)                                   \
-  void aom_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_8xHEIGHTx3_MSA(height)                                   \
-  void aom_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_16xHEIGHTx3_MSA(height)                                   \
-  void aom_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_32xHEIGHTx3_MSA(height)                                   \
-  void aom_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_64xHEIGHTx3_MSA(height)                                   \
-  void aom_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_4xHEIGHTx8_MSA(height)                                   \
-  void aom_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_8xHEIGHTx8_MSA(height)                                   \
-  void aom_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_16xHEIGHTx8_MSA(height)                                   \
-  void aom_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_32xHEIGHTx8_MSA(height)                                   \
-  void aom_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_64xHEIGHTx8_MSA(height)                                   \
-  void aom_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
 #define AOM_SAD_4xHEIGHTx4D_MSA(height)                                   \
   void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
@@ -1438,92 +735,66 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
 /* clang-format off */
 // 64x64
 AOM_SAD_64xHEIGHT_MSA(64)
-AOM_SAD_64xHEIGHTx3_MSA(64)
-AOM_SAD_64xHEIGHTx8_MSA(64)
 AOM_SAD_64xHEIGHTx4D_MSA(64)
 AOM_AVGSAD_64xHEIGHT_MSA(64)
 
 // 64x32
 AOM_SAD_64xHEIGHT_MSA(32)
-AOM_SAD_64xHEIGHTx3_MSA(32)
-AOM_SAD_64xHEIGHTx8_MSA(32)
 AOM_SAD_64xHEIGHTx4D_MSA(32)
 AOM_AVGSAD_64xHEIGHT_MSA(32)
 
 // 32x64
 AOM_SAD_32xHEIGHT_MSA(64)
-AOM_SAD_32xHEIGHTx3_MSA(64)
-AOM_SAD_32xHEIGHTx8_MSA(64)
 AOM_SAD_32xHEIGHTx4D_MSA(64)
 AOM_AVGSAD_32xHEIGHT_MSA(64)
 
 // 32x32
 AOM_SAD_32xHEIGHT_MSA(32)
-AOM_SAD_32xHEIGHTx3_MSA(32)
-AOM_SAD_32xHEIGHTx8_MSA(32)
 AOM_SAD_32xHEIGHTx4D_MSA(32)
 AOM_AVGSAD_32xHEIGHT_MSA(32)
 
 // 32x16
 AOM_SAD_32xHEIGHT_MSA(16)
-AOM_SAD_32xHEIGHTx3_MSA(16)
-AOM_SAD_32xHEIGHTx8_MSA(16)
 AOM_SAD_32xHEIGHTx4D_MSA(16)
 AOM_AVGSAD_32xHEIGHT_MSA(16)
 
 // 16x32
 AOM_SAD_16xHEIGHT_MSA(32)
-AOM_SAD_16xHEIGHTx3_MSA(32)
-AOM_SAD_16xHEIGHTx8_MSA(32)
 AOM_SAD_16xHEIGHTx4D_MSA(32)
 AOM_AVGSAD_16xHEIGHT_MSA(32)
 
 // 16x16
 AOM_SAD_16xHEIGHT_MSA(16)
-AOM_SAD_16xHEIGHTx3_MSA(16)
-AOM_SAD_16xHEIGHTx8_MSA(16)
 AOM_SAD_16xHEIGHTx4D_MSA(16)
 AOM_AVGSAD_16xHEIGHT_MSA(16)
 
 // 16x8
 AOM_SAD_16xHEIGHT_MSA(8)
-AOM_SAD_16xHEIGHTx3_MSA(8)
-AOM_SAD_16xHEIGHTx8_MSA(8)
 AOM_SAD_16xHEIGHTx4D_MSA(8)
 AOM_AVGSAD_16xHEIGHT_MSA(8)
 
 // 8x16
 AOM_SAD_8xHEIGHT_MSA(16)
-AOM_SAD_8xHEIGHTx3_MSA(16)
-AOM_SAD_8xHEIGHTx8_MSA(16)
 AOM_SAD_8xHEIGHTx4D_MSA(16)
 AOM_AVGSAD_8xHEIGHT_MSA(16)
 
 // 8x8
 AOM_SAD_8xHEIGHT_MSA(8)
-AOM_SAD_8xHEIGHTx3_MSA(8)
-AOM_SAD_8xHEIGHTx8_MSA(8)
 AOM_SAD_8xHEIGHTx4D_MSA(8)
 AOM_AVGSAD_8xHEIGHT_MSA(8)
 
 // 8x4
 AOM_SAD_8xHEIGHT_MSA(4)
-AOM_SAD_8xHEIGHTx3_MSA(4)
-AOM_SAD_8xHEIGHTx8_MSA(4)
 AOM_SAD_8xHEIGHTx4D_MSA(4)
 AOM_AVGSAD_8xHEIGHT_MSA(4)
 
 // 4x8
 AOM_SAD_4xHEIGHT_MSA(8)
-AOM_SAD_4xHEIGHTx3_MSA(8)
-AOM_SAD_4xHEIGHTx8_MSA(8)
 AOM_SAD_4xHEIGHTx4D_MSA(8)
 AOM_AVGSAD_4xHEIGHT_MSA(8)
 
 // 4x4
 AOM_SAD_4xHEIGHT_MSA(4)
-AOM_SAD_4xHEIGHTx3_MSA(4)
-AOM_SAD_4xHEIGHTx8_MSA(4)
 AOM_SAD_4xHEIGHTx4D_MSA(4)
 AOM_AVGSAD_4xHEIGHT_MSA(4)
     /* clang-format on */
diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
index 3eb85107d..a8ee85b6b 100644
--- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
+++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 #include "aom_dsp/mips/macros_msa.h"
 #include "aom_dsp/variance.h"
diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c
index 37b89765d..bfed773ac 100644
--- a/third_party/aom/aom_dsp/mips/subtract_msa.c
+++ b/third_party/aom/aom_dsp/mips/subtract_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 
 static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
diff --git a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h
deleted file mode 100644
index cba5d4445..000000000
--- a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
-#define AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
-  {                                                           \
-    v8i16 k0_m = __msa_fill_h(cnst0);                         \
-    v4i32 s0_m, s1_m, s2_m, s3_m;                             \
-                                                              \
-    s0_m = (v4i32)__msa_fill_h(cnst1);                        \
-    k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m);                  \
-                                                              \
-    ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m);                   \
-    ILVRL_H2_SW(reg0, reg1, s3_m, s2_m);                      \
-    DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m);          \
-    SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                  \
-    out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);           \
-                                                              \
-    DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m);          \
-    SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                  \
-    out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);           \
-  }
-
-#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0,   \
-                              dst1, dst2, dst3)                               \
-  {                                                                           \
-    v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m;                                  \
-    v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m;                                  \
-                                                                              \
-    DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m,  \
-                tp4_m);                                                       \
-    DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m,  \
-                tp8_m);                                                       \
-    BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m);      \
-    BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m);      \
-    SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS);                  \
-    SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS);                  \
-    PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \
-                dst1, dst2, dst3);                                            \
-  }
-
-#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2)           \
-  ({                                                   \
-    v8i16 dst_m;                                       \
-    v4i32 tp0_m, tp1_m;                                \
-                                                       \
-    DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m);     \
-    SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS);         \
-    dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
-                                                       \
-    dst_m;                                             \
-  })
-
-#define MADD_SHORT(m0, m1, c0, c1, res0, res1)                              \
-  {                                                                         \
-    v4i32 madd0_m, madd1_m, madd2_m, madd3_m;                               \
-    v8i16 madd_s0_m, madd_s1_m;                                             \
-                                                                            \
-    ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m);                              \
-    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \
-                madd0_m, madd1_m, madd2_m, madd3_m);                        \
-    SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS);        \
-    PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1);            \
-  }
-
-#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1,   \
-                out2, out3)                                                   \
-  {                                                                           \
-    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                         \
-    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m;                         \
-                                                                              \
-    ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m);                            \
-    ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m);                            \
-    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \
-                cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m);                        \
-    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m);  \
-    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);                  \
-    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1);                      \
-    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \
-                cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);                        \
-    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m);  \
-    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);                  \
-    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3);                      \
-  }
-#endif  // AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c
index 745fdfc9c..065c09ac5 100644
--- a/third_party/aom/aom_dsp/mips/variance_msa.c
+++ b/third_party/aom/aom_dsp/mips/variance_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 
 #define CALC_MSE_B(src, ref, var)                                   \
diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c
new file mode 100644
index 000000000..a1287f74f
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_model.c
@@ -0,0 +1,1460 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
+#include "av1/encoder/mathutils.h"
+
+#define kLowPolyNumParams 3
+
+static const int kMaxLag = 4;
+
+// Defines a function that can be used to obtain the mean of a block for the
+// provided data type (uint8_t, or uint16_t)
+#define GET_BLOCK_MEAN(INT_TYPE, suffix)                                    \
+  static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \
+                                        int stride, int x_o, int y_o,       \
+                                        int block_size) {                   \
+    const int max_h = AOMMIN(h - y_o, block_size);                          \
+    const int max_w = AOMMIN(w - x_o, block_size);                          \
+    double block_mean = 0;                                                  \
+    for (int y = 0; y < max_h; ++y) {                                       \
+      for (int x = 0; x < max_w; ++x) {                                     \
+        block_mean += data[(y_o + y) * stride + x_o + x];                   \
+      }                                                                     \
+    }                                                                       \
+    return block_mean / (max_w * max_h);                                    \
+  }
+
+GET_BLOCK_MEAN(uint8_t, lowbd);
+GET_BLOCK_MEAN(uint16_t, highbd);
+
+static INLINE double get_block_mean(const uint8_t *data, int w, int h,
+                                    int stride, int x_o, int y_o,
+                                    int block_size, int use_highbd) {
+  if (use_highbd)
+    return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o, y_o,
+                                 block_size);
+  return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size);
+}
+
+// Defines a function that can be used to obtain the variance of a block
+// for the provided data type (uint8_t, or uint16_t)
+#define GET_NOISE_VAR(INT_TYPE, suffix)                                  \
+  static double get_noise_var_##suffix(                                  \
+      const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \
+      int h, int x_o, int y_o, int block_size_x, int block_size_y) {     \
+    const int max_h = AOMMIN(h - y_o, block_size_y);                     \
+    const int max_w = AOMMIN(w - x_o, block_size_x);                     \
+    double noise_var = 0;                                                \
+    double noise_mean = 0;                                               \
+    for (int y = 0; y < max_h; ++y) {                                    \
+      for (int x = 0; x < max_w; ++x) {                                  \
+        double noise = (double)data[(y_o + y) * stride + x_o + x] -      \
+                       denoised[(y_o + y) * stride + x_o + x];           \
+        noise_mean += noise;                                             \
+        noise_var += noise * noise;                                      \
+      }                                                                  \
+    }                                                                    \
+    noise_mean /= (max_w * max_h);                                       \
+    return noise_var / (max_w * max_h) - noise_mean * noise_mean;        \
+  }
+
+GET_NOISE_VAR(uint8_t, lowbd);
+GET_NOISE_VAR(uint16_t, highbd);
+
+static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised,
+                                   int w, int h, int stride, int x_o, int y_o,
+                                   int block_size_x, int block_size_y,
+                                   int use_highbd) {
+  if (use_highbd)
+    return get_noise_var_highbd((const uint16_t *)data,
+                                (const uint16_t *)denoised, w, h, stride, x_o,
+                                y_o, block_size_x, block_size_y);
+  return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o,
+                             block_size_x, block_size_y);
+}
+
+static void equation_system_clear(aom_equation_system_t *eqns) {
+  const int n = eqns->n;
+  memset(eqns->A, 0, sizeof(*eqns->A) * n * n);
+  memset(eqns->x, 0, sizeof(*eqns->x) * n);
+  memset(eqns->b, 0, sizeof(*eqns->b) * n);
+}
+
+static void equation_system_copy(aom_equation_system_t *dst,
+                                 const aom_equation_system_t *src) {
+  const int n = dst->n;
+  memcpy(dst->A, src->A, sizeof(*dst->A) * n * n);
+  memcpy(dst->x, src->x, sizeof(*dst->x) * n);
+  memcpy(dst->b, src->b, sizeof(*dst->b) * n);
+}
+
+static int equation_system_init(aom_equation_system_t *eqns, int n) {
+  eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n);
+  eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n);
+  eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n);
+  eqns->n = n;
+  if (!eqns->A || !eqns->b || !eqns->x) {
+    fprintf(stderr, "Failed to allocate system of equations of size %d\n", n);
+    aom_free(eqns->A);
+    aom_free(eqns->b);
+    aom_free(eqns->x);
+    memset(eqns, 0, sizeof(*eqns));
+    return 0;
+  }
+  equation_system_clear(eqns);
+  return 1;
+}
+
+static int equation_system_solve(aom_equation_system_t *eqns) {
+  const int n = eqns->n;
+  double *b = (double *)aom_malloc(sizeof(*b) * n);
+  double *A = (double *)aom_malloc(sizeof(*A) * n * n);
+  int ret = 0;
+  if (A == NULL || b == NULL) {
+    fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n);
+    aom_free(b);
+    aom_free(A);
+    return 0;
+  }
+  memcpy(A, eqns->A, sizeof(*eqns->A) * n * n);
+  memcpy(b, eqns->b, sizeof(*eqns->b) * n);
+  ret = linsolve(n, A, eqns->n, b, eqns->x);
+  aom_free(b);
+  aom_free(A);
+
+  if (ret == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+static void equation_system_add(aom_equation_system_t *dest,
+                                aom_equation_system_t *src) {
+  const int n = dest->n;
+  int i, j;
+  for (i = 0; i < n; ++i) {
+    for (j = 0; j < n; ++j) {
+      dest->A[i * n + j] += src->A[i * n + j];
+    }
+    dest->b[i] += src->b[i];
+  }
+}
+
+static void equation_system_free(aom_equation_system_t *eqns) {
+  if (!eqns) return;
+  aom_free(eqns->A);
+  aom_free(eqns->b);
+  aom_free(eqns->x);
+  memset(eqns, 0, sizeof(*eqns));
+}
+
+static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) {
+  equation_system_clear(&solver->eqns);
+  solver->num_equations = 0;
+  solver->total = 0;
+}
+
+static void noise_strength_solver_add(aom_noise_strength_solver_t *dest,
+                                      aom_noise_strength_solver_t *src) {
+  equation_system_add(&dest->eqns, &src->eqns);
+  dest->num_equations += src->num_equations;
+  dest->total += src->total;
+}
+
+// Return the number of coefficients required for the given parameters
+static int num_coeffs(const aom_noise_model_params_t params) {
+  const int n = 2 * params.lag + 1;
+  switch (params.shape) {
+    case AOM_NOISE_SHAPE_DIAMOND: return params.lag * (params.lag + 1);
+    case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2;
+  }
+  return 0;
+}
+
+static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) {
+  const int kNumBins = 20;
+  if (!equation_system_init(&state->eqns, n)) {
+    fprintf(stderr, "Failed initialization noise state with size %d\n", n);
+    return 0;
+  }
+  state->ar_gain = 1.0;
+  state->num_observations = 0;
+  return aom_noise_strength_solver_init(&state->strength_solver, kNumBins,
+                                        bit_depth);
+}
+
+static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) {
+  const double kTolerance = 1e-6;
+  const int last = eqns->n - 1;
+  // Set all of the AR coefficients to zero, but try to solve for correlation
+  // with the luma channel
+  memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n);
+  if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) {
+    eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last];
+  }
+}
+
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
+  if (!lut) return 0;
+  lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points));
+  if (!lut->points) return 0;
+  lut->num_points = num_points;
+  memset(lut->points, 0, sizeof(*lut->points) * num_points);
+  return 1;
+}
+
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) {
+  if (!lut) return;
+  aom_free(lut->points);
+  memset(lut, 0, sizeof(*lut));
+}
+
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+                                   double x) {
+  int i = 0;
+  // Constant extrapolation for x <  x_0.
+  if (x < lut->points[0][0]) return lut->points[0][1];
+  for (i = 0; i < lut->num_points - 1; ++i) {
+    if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) {
+      const double a =
+          (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]);
+      return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a);
+    }
+  }
+  // Constant extrapolation for x > x_{n-1}
+  return lut->points[lut->num_points - 1][1];
+}
+
+static double noise_strength_solver_get_bin_index(
+    const aom_noise_strength_solver_t *solver, double value) {
+  const double val =
+      fclamp(value, solver->min_intensity, solver->max_intensity);
+  const double range = solver->max_intensity - solver->min_intensity;
+  return (solver->num_bins - 1) * (val - solver->min_intensity) / range;
+}
+
+static double noise_strength_solver_get_value(
+    const aom_noise_strength_solver_t *solver, double x) {
+  const double bin = noise_strength_solver_get_bin_index(solver, x);
+  const int bin_i0 = (int)floor(bin);
+  const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
+  const double a = bin - bin_i0;
+  return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1];
+}
+
+void aom_noise_strength_solver_add_measurement(
+    aom_noise_strength_solver_t *solver, double block_mean, double noise_std) {
+  const double bin = noise_strength_solver_get_bin_index(solver, block_mean);
+  const int bin_i0 = (int)floor(bin);
+  const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
+  const double a = bin - bin_i0;
+  const int n = solver->num_bins;
+  solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a);
+  solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a);
+  solver->eqns.A[bin_i1 * n + bin_i1] += a * a;
+  solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a);
+  solver->eqns.b[bin_i0] += (1.0 - a) * noise_std;
+  solver->eqns.b[bin_i1] += a * noise_std;
+  solver->total += noise_std;
+  solver->num_equations++;
+}
+
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) {
+  // Add regularization proportional to the number of constraints
+  const int n = solver->num_bins;
+  const double kAlpha = 2.0 * (double)(solver->num_equations) / n;
+  int result = 0;
+  double mean = 0;
+
+  // Do this in a non-destructive manner so it is not confusing to the caller
+  double *old_A = solver->eqns.A;
+  double *A = (double *)aom_malloc(sizeof(*A) * n * n);
+  if (!A) {
+    fprintf(stderr, "Unable to allocate copy of A\n");
+    return 0;
+  }
+  memcpy(A, old_A, sizeof(*A) * n * n);
+
+  for (int i = 0; i < n; ++i) {
+    const int i_lo = AOMMAX(0, i - 1);
+    const int i_hi = AOMMIN(n - 1, i + 1);
+    A[i * n + i_lo] -= kAlpha;
+    A[i * n + i] += 2 * kAlpha;
+    A[i * n + i_hi] -= kAlpha;
+  }
+
+  // Small regularization to give average noise strength
+  mean = solver->total / solver->num_equations;
+  for (int i = 0; i < n; ++i) {
+    A[i * n + i] += 1.0 / 8192.;
+    solver->eqns.b[i] += mean / 8192.;
+  }
+  solver->eqns.A = A;
+  result = equation_system_solve(&solver->eqns);
+  solver->eqns.A = old_A;
+
+  aom_free(A);
+  return result;
+}
+
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+                                   int num_bins, int bit_depth) {
+  if (!solver) return 0;
+  memset(solver, 0, sizeof(*solver));
+  solver->num_bins = num_bins;
+  solver->min_intensity = 0;
+  solver->max_intensity = (1 << bit_depth) - 1;
+  solver->total = 0;
+  solver->num_equations = 0;
+  return equation_system_init(&solver->eqns, num_bins);
+}
+
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) {
+  if (!solver) return;
+  equation_system_free(&solver->eqns);
+}
+
+double aom_noise_strength_solver_get_center(
+    const aom_noise_strength_solver_t *solver, int i) {
+  const double range = solver->max_intensity - solver->min_intensity;
+  const int n = solver->num_bins;
+  return ((double)i) / (n - 1) * range + solver->min_intensity;
+}
+
+// Computes the residual if a point were to be removed from the lut. This is
+// calculated as the area between the output of the solver and the line segment
+// that would be formed between [x_{i - 1}, x_{i + 1}).
+static void update_piecewise_linear_residual(
+    const aom_noise_strength_solver_t *solver,
+    const aom_noise_strength_lut_t *lut, double *residual, int start, int end) {
+  const double dx = 255. / solver->num_bins;
+  for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) {
+    const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index(
+                                    solver, lut->points[i - 1][0])));
+    const int upper = AOMMIN(solver->num_bins - 1,
+                             (int)ceil(noise_strength_solver_get_bin_index(
+                                 solver, lut->points[i + 1][0])));
+    double r = 0;
+    for (int j = lower; j <= upper; ++j) {
+      const double x = aom_noise_strength_solver_get_center(solver, j);
+      if (x < lut->points[i - 1][0]) continue;
+      if (x >= lut->points[i + 1][0]) continue;
+      const double y = solver->eqns.x[j];
+      const double a = (x - lut->points[i - 1][0]) /
+                       (lut->points[i + 1][0] - lut->points[i - 1][0]);
+      const double estimate_y =
+          lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a;
+      r += fabs(y - estimate_y);
+    }
+    residual[i] = r * dx;
+  }
+}
+
+int aom_noise_strength_solver_fit_piecewise(
+    const aom_noise_strength_solver_t *solver, int max_output_points,
+    aom_noise_strength_lut_t *lut) {
+  // The tolerance is normalized to be give consistent results between
+  // different bit-depths.
+  const double kTolerance = solver->max_intensity * 0.00625 / 255.0;
+  if (!aom_noise_strength_lut_init(lut, solver->num_bins)) {
+    fprintf(stderr, "Failed to init lut\n");
+    return 0;
+  }
+  for (int i = 0; i < solver->num_bins; ++i) {
+    lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i);
+    lut->points[i][1] = solver->eqns.x[i];
+  }
+  if (max_output_points < 0) {
+    max_output_points = solver->num_bins;
+  }
+
+  double *residual = aom_malloc(solver->num_bins * sizeof(*residual));
+  memset(residual, 0, sizeof(*residual) * solver->num_bins);
+
+  update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins);
+
+  // Greedily remove points if there are too many or if it doesn't hurt local
+  // approximation (never remove the end points)
+  while (lut->num_points > 2) {
+    int min_index = 1;
+    for (int j = 1; j < lut->num_points - 1; ++j) {
+      if (residual[j] < residual[min_index]) {
+        min_index = j;
+      }
+    }
+    const double dx =
+        lut->points[min_index + 1][0] - lut->points[min_index - 1][0];
+    const double avg_residual = residual[min_index] / dx;
+    if (lut->num_points <= max_output_points && avg_residual > kTolerance) {
+      break;
+    }
+
+    const int num_remaining = lut->num_points - min_index - 1;
+    memmove(lut->points + min_index, lut->points + min_index + 1,
+            sizeof(lut->points[0]) * num_remaining);
+    lut->num_points--;
+
+    update_piecewise_linear_residual(solver, lut, residual, min_index - 1,
+                                     min_index + 1);
+  }
+  aom_free(residual);
+  return 1;
+}
+
+int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
+                               int block_size, int bit_depth, int use_highbd) {
+  const int n = block_size * block_size;
+  aom_equation_system_t eqns;
+  double *AtA_inv = 0;
+  double *A = 0;
+  int x = 0, y = 0, i = 0, j = 0;
+  if (!equation_system_init(&eqns, kLowPolyNumParams)) {
+    fprintf(stderr, "Failed to init equation system for block_size=%d\n",
+            block_size);
+    return 0;
+  }
+
+  AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams *
+                                 sizeof(*AtA_inv));
+  A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A));
+  if (AtA_inv == NULL || A == NULL) {
+    fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n",
+            block_size);
+    aom_free(AtA_inv);
+    aom_free(A);
+    equation_system_free(&eqns);
+    return 0;
+  }
+
+  block_finder->A = A;
+  block_finder->AtA_inv = AtA_inv;
+  block_finder->block_size = block_size;
+  block_finder->normalization = (1 << bit_depth) - 1;
+  block_finder->use_highbd = use_highbd;
+
+  for (y = 0; y < block_size; ++y) {
+    const double yd = ((double)y - block_size / 2.) / (block_size / 2.);
+    for (x = 0; x < block_size; ++x) {
+      const double xd = ((double)x - block_size / 2.) / (block_size / 2.);
+      const double coords[3] = { yd, xd, 1 };
+      const int row = y * block_size + x;
+      A[kLowPolyNumParams * row + 0] = yd;
+      A[kLowPolyNumParams * row + 1] = xd;
+      A[kLowPolyNumParams * row + 2] = 1;
+
+      for (i = 0; i < kLowPolyNumParams; ++i) {
+        for (j = 0; j < kLowPolyNumParams; ++j) {
+          eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j];
+        }
+      }
+    }
+  }
+
+  // Lazy inverse using existing equation solver.
+  for (i = 0; i < kLowPolyNumParams; ++i) {
+    memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams);
+    eqns.b[i] = 1;
+    equation_system_solve(&eqns);
+
+    for (j = 0; j < kLowPolyNumParams; ++j) {
+      AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j];
+    }
+  }
+  equation_system_free(&eqns);
+  return 1;
+}
+
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) {
+  if (!block_finder) return;
+  aom_free(block_finder->A);
+  aom_free(block_finder->AtA_inv);
+  memset(block_finder, 0, sizeof(*block_finder));
+}
+
+void aom_flat_block_finder_extract_block(
+    const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+    int w, int h, int stride, int offsx, int offsy, double *plane,
+    double *block) {
+  const int block_size = block_finder->block_size;
+  const int n = block_size * block_size;
+  const double *A = block_finder->A;
+  const double *AtA_inv = block_finder->AtA_inv;
+  double plane_coords[kLowPolyNumParams];
+  double AtA_inv_b[kLowPolyNumParams];
+  int xi, yi, i;
+
+  if (block_finder->use_highbd) {
+    const uint16_t *const data16 = (const uint16_t *const)data;
+    for (yi = 0; yi < block_size; ++yi) {
+      const int y = clamp(offsy + yi, 0, h - 1);
+      for (xi = 0; xi < block_size; ++xi) {
+        const int x = clamp(offsx + xi, 0, w - 1);
+        block[yi * block_size + xi] =
+            ((double)data16[y * stride + x]) / block_finder->normalization;
+      }
+    }
+  } else {
+    for (yi = 0; yi < block_size; ++yi) {
+      const int y = clamp(offsy + yi, 0, h - 1);
+      for (xi = 0; xi < block_size; ++xi) {
+        const int x = clamp(offsx + xi, 0, w - 1);
+        block[yi * block_size + xi] =
+            ((double)data[y * stride + x]) / block_finder->normalization;
+      }
+    }
+  }
+  multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams);
+  multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams,
+               kLowPolyNumParams, 1);
+  multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1);
+
+  for (i = 0; i < n; ++i) {
+    block[i] -= plane[i];
+  }
+}
+
+typedef struct {
+  int index;
+  float score;
+} index_and_score_t;
+
+static int compare_scores(const void *a, const void *b) {
+  const float diff =
+      ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score;
+  if (diff < 0)
+    return -1;
+  else if (diff > 0)
+    return 1;
+  return 0;
+}
+
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+                              const uint8_t *const data, int w, int h,
+                              int stride, uint8_t *flat_blocks) {
+  // The gradient-based features used in this code are based on:
+  //  A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise
+  //  correlation for improved video denoising," 2012 19th, ICIP.
+  // The thresholds are more lenient to allow for correct grain modeling
+  // if extreme cases.
+  const int block_size = block_finder->block_size;
+  const int n = block_size * block_size;
+  const double kTraceThreshold = 0.15 / (32 * 32);
+  const double kRatioThreshold = 1.25;
+  const double kNormThreshold = 0.08 / (32 * 32);
+  const double kVarThreshold = 0.005 / (double)n;
+  const int num_blocks_w = (w + block_size - 1) / block_size;
+  const int num_blocks_h = (h + block_size - 1) / block_size;
+  int num_flat = 0;
+  int bx = 0, by = 0;
+  double *plane = (double *)aom_malloc(n * sizeof(*plane));
+  double *block = (double *)aom_malloc(n * sizeof(*block));
+  index_and_score_t *scores = (index_and_score_t *)aom_malloc(
+      num_blocks_w * num_blocks_h * sizeof(*scores));
+  if (plane == NULL || block == NULL || scores == NULL) {
+    fprintf(stderr, "Failed to allocate memory for block of size %d\n", n);
+    aom_free(plane);
+    aom_free(block);
+    aom_free(scores);
+    return -1;
+  }
+
+#ifdef NOISE_MODEL_LOG_SCORE
+  fprintf(stderr, "score = [");
+#endif
+  for (by = 0; by < num_blocks_h; ++by) {
+    for (bx = 0; bx < num_blocks_w; ++bx) {
+      // Compute gradient covariance matrix.
+      double Gxx = 0, Gxy = 0, Gyy = 0;
+      double var = 0;
+      double mean = 0;
+      int xi, yi;
+      aom_flat_block_finder_extract_block(block_finder, data, w, h, stride,
+                                          bx * block_size, by * block_size,
+                                          plane, block);
+
+      for (yi = 1; yi < block_size - 1; ++yi) {
+        for (xi = 1; xi < block_size - 1; ++xi) {
+          const double gx = (block[yi * block_size + xi + 1] -
+                             block[yi * block_size + xi - 1]) /
+                            2;
+          const double gy = (block[yi * block_size + xi + block_size] -
+                             block[yi * block_size + xi - block_size]) /
+                            2;
+          Gxx += gx * gx;
+          Gxy += gx * gy;
+          Gyy += gy * gy;
+
+          mean += block[yi * block_size + xi];
+          var += block[yi * block_size + xi] * block[yi * block_size + xi];
+        }
+      }
+      mean /= (block_size - 2) * (block_size - 2);
+
+      // Normalize gradients by block_size.
+      Gxx /= ((block_size - 2) * (block_size - 2));
+      Gxy /= ((block_size - 2) * (block_size - 2));
+      Gyy /= ((block_size - 2) * (block_size - 2));
+      var = var / ((block_size - 2) * (block_size - 2)) - mean * mean;
+
+      {
+        const double trace = Gxx + Gyy;
+        const double det = Gxx * Gyy - Gxy * Gxy;
+        const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.;
+        const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.;
+        const double norm = e1;  // Spectral norm
+        const double ratio = (e1 / AOMMAX(e2, 1e-6));
+        const int is_flat = (trace < kTraceThreshold) &&
+                            (ratio < kRatioThreshold) &&
+                            (norm < kNormThreshold) && (var > kVarThreshold);
+        // The following weights are used to combine the above features to give
+        // a sigmoid score for flatness. If the input was normalized to [0,100]
+        // the magnitude of these values would be close to 1 (e.g., weights
+        // corresponding to variance would be a factor of 10000x smaller).
+        // The weights are given in the following order:
+        //    [{var}, {ratio}, {trace}, {norm}, offset]
+        // with one of the most discriminative being simply the variance.
+        const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 };
+        const float score =
+            (float)(1.0 / (1 + exp(-(weights[0] * var + weights[1] * ratio +
+                                     weights[2] * trace + weights[3] * norm +
+                                     weights[4]))));
+        flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0;
+        scores[by * num_blocks_w + bx].score = var > kVarThreshold ? score : 0;
+        scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx;
+#ifdef NOISE_MODEL_LOG_SCORE
+        fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm,
+                is_flat);
+#endif
+        num_flat += is_flat;
+      }
+    }
+#ifdef NOISE_MODEL_LOG_SCORE
+    fprintf(stderr, "\n");
+#endif
+  }
+#ifdef NOISE_MODEL_LOG_SCORE
+  fprintf(stderr, "];\n");
+#endif
+  // Find the top-scored blocks (most likely to be flat) and set the flat blocks
+  // be the union of the thresholded results and the top 10th percentile of the
+  // scored results.
+  qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores);
+  const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100;
+  const float score_threshold = scores[top_nth_percentile].score;
+  for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) {
+    if (scores[i].score >= score_threshold) {
+      num_flat += flat_blocks[scores[i].index] == 0;
+      flat_blocks[scores[i].index] |= 1;
+    }
+  }
+  aom_free(block);
+  aom_free(plane);
+  aom_free(scores);
+  return num_flat;
+}
+
+int aom_noise_model_init(aom_noise_model_t *model,
+                         const aom_noise_model_params_t params) {
+  const int n = num_coeffs(params);
+  const int lag = params.lag;
+  const int bit_depth = params.bit_depth;
+  int x = 0, y = 0, i = 0, c = 0;
+
+  memset(model, 0, sizeof(*model));
+  if (params.lag < 1) {
+    fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag);
+    return 0;
+  }
+  if (params.lag > kMaxLag) {
+    fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag,
+            kMaxLag);
+    return 0;
+  }
+
+  memcpy(&model->params, &params, sizeof(params));
+  for (c = 0; c < 3; ++c) {
+    if (!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) {
+      fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+      aom_noise_model_free(model);
+      return 0;
+    }
+    if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) {
+      fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+      aom_noise_model_free(model);
+      return 0;
+    }
+  }
+  model->n = n;
+  model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n);
+
+  for (y = -lag; y <= 0; ++y) {
+    const int max_x = y == 0 ? -1 : lag;
+    for (x = -lag; x <= max_x; ++x) {
+      switch (params.shape) {
+        case AOM_NOISE_SHAPE_DIAMOND:
+          if (abs(x) <= y + lag) {
+            model->coords[i][0] = x;
+            model->coords[i][1] = y;
+            ++i;
+          }
+          break;
+        case AOM_NOISE_SHAPE_SQUARE:
+          model->coords[i][0] = x;
+          model->coords[i][1] = y;
+          ++i;
+          break;
+        default:
+          fprintf(stderr, "Invalid shape\n");
+          aom_noise_model_free(model);
+          return 0;
+      }
+    }
+  }
+  assert(i == n);
+  return 1;
+}
+
+void aom_noise_model_free(aom_noise_model_t *model) {
+  int c = 0;
+  if (!model) return;
+
+  aom_free(model->coords);
+  for (c = 0; c < 3; ++c) {
+    equation_system_free(&model->latest_state[c].eqns);
+    equation_system_free(&model->combined_state[c].eqns);
+
+    equation_system_free(&model->latest_state[c].strength_solver.eqns);
+    equation_system_free(&model->combined_state[c].strength_solver.eqns);
+  }
+  memset(model, 0, sizeof(*model));
+}
+
+// Extracts the neighborhood defined by coords around point (x, y) from
+// the difference between the data and denoised images. Also extracts the
+// entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma).
+#define EXTRACT_AR_ROW(INT_TYPE, suffix)                                   \
+  static double extract_ar_row_##suffix(                                   \
+      int(*coords)[2], int num_coords, const INT_TYPE *const data,         \
+      const INT_TYPE *const denoised, int stride, int sub_log2[2],         \
+      const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised,  \
+      int alt_stride, int x, int y, double *buffer) {                      \
+    for (int i = 0; i < num_coords; ++i) {                                 \
+      const int x_i = x + coords[i][0], y_i = y + coords[i][1];            \
+      buffer[i] =                                                          \
+          (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \
+    }                                                                      \
+    const double val =                                                     \
+        (double)data[y * stride + x] - denoised[y * stride + x];           \
+                                                                           \
+    if (alt_data && alt_denoised) {                                        \
+      double avg_data = 0, avg_denoised = 0;                               \
+      int num_samples = 0;                                                 \
+      for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) {              \
+        const int y_up = (y << sub_log2[1]) + dy_i;                        \
+        for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) {            \
+          const int x_up = (x << sub_log2[0]) + dx_i;                      \
+          avg_data += alt_data[y_up * alt_stride + x_up];                  \
+          avg_denoised += alt_denoised[y_up * alt_stride + x_up];          \
+          num_samples++;                                                   \
+        }                                                                  \
+      }                                                                    \
+      buffer[num_coords] = (avg_data - avg_denoised) / num_samples;        \
+    }                                                                      \
+    return val;                                                            \
+  }
+
+EXTRACT_AR_ROW(uint8_t, lowbd);
+EXTRACT_AR_ROW(uint16_t, highbd);
+
+static int add_block_observations(
+    aom_noise_model_t *noise_model, int c, const uint8_t *const data,
+    const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2],
+    const uint8_t *const alt_data, const uint8_t *const alt_denoised,
+    int alt_stride, const uint8_t *const flat_blocks, int block_size,
+    int num_blocks_w, int num_blocks_h) {
+  const int lag = noise_model->params.lag;
+  const int num_coords = noise_model->n;
+  const double normalization = (1 << noise_model->params.bit_depth) - 1;
+  double *A = noise_model->latest_state[c].eqns.A;
+  double *b = noise_model->latest_state[c].eqns.b;
+  double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1));
+  const int n = noise_model->latest_state[c].eqns.n;
+
+  if (!buffer) {
+    fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1);
+    return 0;
+  }
+  for (int by = 0; by < num_blocks_h; ++by) {
+    const int y_o = by * (block_size >> sub_log2[1]);
+    for (int bx = 0; bx < num_blocks_w; ++bx) {
+      const int x_o = bx * (block_size >> sub_log2[0]);
+      if (!flat_blocks[by * num_blocks_w + bx]) {
+        continue;
+      }
+      int y_start =
+          (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag;
+      int x_start =
+          (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag;
+      int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
+                         block_size >> sub_log2[1]);
+      int x_end = AOMMIN(
+          (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag,
+          (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1])
+              ? (block_size >> sub_log2[0])
+              : ((block_size >> sub_log2[0]) - lag));
+      for (int y = y_start; y < y_end; ++y) {
+        for (int x = x_start; x < x_end; ++x) {
+          const double val =
+              noise_model->params.use_highbd
+                  ? extract_ar_row_highbd(noise_model->coords, num_coords,
+                                          (const uint16_t *const)data,
+                                          (const uint16_t *const)denoised,
+                                          stride, sub_log2,
+                                          (const uint16_t *const)alt_data,
+                                          (const uint16_t *const)alt_denoised,
+                                          alt_stride, x + x_o, y + y_o, buffer)
+                  : extract_ar_row_lowbd(noise_model->coords, num_coords, data,
+                                         denoised, stride, sub_log2, alt_data,
+                                         alt_denoised, alt_stride, x + x_o,
+                                         y + y_o, buffer);
+          for (int i = 0; i < n; ++i) {
+            for (int j = 0; j < n; ++j) {
+              A[i * n + j] +=
+                  (buffer[i] * buffer[j]) / (normalization * normalization);
+            }
+            b[i] += (buffer[i] * val) / (normalization * normalization);
+          }
+          noise_model->latest_state[c].num_observations++;
+        }
+      }
+    }
+  }
+  aom_free(buffer);
+  return 1;
+}
+
+static void add_noise_std_observations(
+    aom_noise_model_t *noise_model, int c, const double *coeffs,
+    const uint8_t *const data, const uint8_t *const denoised, int w, int h,
+    int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride,
+    const uint8_t *const flat_blocks, int block_size, int num_blocks_w,
+    int num_blocks_h) {
+  const int num_coords = noise_model->n;
+  aom_noise_strength_solver_t *noise_strength_solver =
+      &noise_model->latest_state[c].strength_solver;
+
+  const aom_noise_strength_solver_t *noise_strength_luma =
+      &noise_model->latest_state[0].strength_solver;
+  const double luma_gain = noise_model->latest_state[0].ar_gain;
+  const double noise_gain = noise_model->latest_state[c].ar_gain;
+  for (int by = 0; by < num_blocks_h; ++by) {
+    const int y_o = by * (block_size >> sub_log2[1]);
+    for (int bx = 0; bx < num_blocks_w; ++bx) {
+      const int x_o = bx * (block_size >> sub_log2[0]);
+      if (!flat_blocks[by * num_blocks_w + bx]) {
+        continue;
+      }
+      const int num_samples_h =
+          AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
+                 block_size >> sub_log2[1]);
+      const int num_samples_w =
+          AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]),
+                 (block_size >> sub_log2[0]));
+      // Make sure that we have a reasonable amount of samples to consider the
+      // block
+      if (num_samples_w * num_samples_h > block_size) {
+        const double block_mean = get_block_mean(
+            alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride,
+            x_o << sub_log2[0], y_o << sub_log2[1], block_size,
+            noise_model->params.use_highbd);
+        const double noise_var = get_noise_var(
+            data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o,
+            y_o, block_size >> sub_log2[0], block_size >> sub_log2[1],
+            noise_model->params.use_highbd);
+        // We want to remove the part of the noise that came from being
+        // correlated with luma. Note that the noise solver for luma must
+        // have already been run.
+        const double luma_strength =
+            c > 0 ? luma_gain * noise_strength_solver_get_value(
+                                    noise_strength_luma, block_mean)
+                  : 0;
+        const double corr = c > 0 ? coeffs[num_coords] : 0;
+        // Chroma noise:
+        //    N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2)
+        // The uncorrelated component:
+        //   uncorr_var = noise_var - (corr * luma_strength)^2
+        // But don't allow fully correlated noise (hence the max), since the
+        // synthesis cannot model it.
+        const double uncorr_std = sqrt(
+            AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2)));
+        // After we've removed correlation with luma, undo the gain that will
+        // come from running the IIR filter.
+        const double adjusted_strength = uncorr_std / noise_gain;
+        aom_noise_strength_solver_add_measurement(
+            noise_strength_solver, block_mean, adjusted_strength);
+      }
+    }
+  }
+}
+
+// Return true if the noise estimate appears to be different from the combined
+// (multi-frame) estimate. The difference is measured by checking whether the
+// AR coefficients have diverged (using a threshold on normalized cross
+// correlation), or whether the noise strength has changed.
+static int is_noise_model_different(aom_noise_model_t *const noise_model) {
+  // These thresholds are kind of arbitrary and will likely need further tuning
+  // (or exported as parameters). The threshold on noise strength is a weighted
+  // difference between the noise strength histograms
+  const double kCoeffThreshold = 0.9;
+  const double kStrengthThreshold =
+      0.005 * (1 << (noise_model->params.bit_depth - 8));
+  for (int c = 0; c < 1; ++c) {
+    const double corr =
+        aom_normalized_cross_correlation(noise_model->latest_state[c].eqns.x,
+                                         noise_model->combined_state[c].eqns.x,
+                                         noise_model->combined_state[c].eqns.n);
+    if (corr < kCoeffThreshold) return 1;
+
+    const double dx =
+        1.0 / noise_model->latest_state[c].strength_solver.num_bins;
+
+    const aom_equation_system_t *latest_eqns =
+        &noise_model->latest_state[c].strength_solver.eqns;
+    const aom_equation_system_t *combined_eqns =
+        &noise_model->combined_state[c].strength_solver.eqns;
+    double diff = 0;
+    double total_weight = 0;
+    for (int j = 0; j < latest_eqns->n; ++j) {
+      double weight = 0;
+      for (int i = 0; i < latest_eqns->n; ++i) {
+        weight += latest_eqns->A[i * latest_eqns->n + j];
+      }
+      weight = sqrt(weight);
+      diff += weight * fabs(latest_eqns->x[j] - combined_eqns->x[j]);
+      total_weight += weight;
+    }
+    if (diff * dx / total_weight > kStrengthThreshold) return 1;
+  }
+  return 0;
+}
+
+static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) {
+  const int ret = equation_system_solve(&state->eqns);
+  state->ar_gain = 1.0;
+  if (!ret) return ret;
+
+  // Update the AR gain from the equation system as it will be used to fit
+  // the noise strength as a function of intensity.  In the Yule-Walker
+  // equations, the diagonal should be the variance of the correlated noise.
+  // In the case of the least squares estimate, there will be some variability
+  // in the diagonal. So use the mean of the diagonal as the estimate of
+  // overall variance (this works for least squares or Yule-Walker formulation).
+  double var = 0;
+  const int n = state->eqns.n;
+  for (int i = 0; i < (state->eqns.n - is_chroma); ++i) {
+    var += state->eqns.A[i * n + i] / state->num_observations;
+  }
+  var /= (n - is_chroma);
+
+  // Keep track of E(Y^2) = <b, x> + E(X^2)
+  // In the case that we are using chroma and have an estimate of correlation
+  // with luma we adjust that estimate slightly to remove the correlated bits by
+  // subtracting out the last column of a scaled by our correlation estimate
+  // from b. E(y^2) = <b - A(:, end)*x(end), x>
+  double sum_covar = 0;
+  for (int i = 0; i < state->eqns.n - is_chroma; ++i) {
+    double bi = state->eqns.b[i];
+    if (is_chroma) {
+      bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1];
+    }
+    sum_covar += (bi * state->eqns.x[i]) / state->num_observations;
+  }
+  // Now, get an estimate of the variance of uncorrelated noise signal and use
+  // it to determine the gain of the AR filter.
+  const double noise_var = AOMMAX(var - sum_covar, 1e-6);
+  state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6)));
+  return ret;
+}
+
+aom_noise_status_t aom_noise_model_update(
+    aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+    const uint8_t *const denoised[3], int w, int h, int stride[3],
+    int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) {
+  const int num_blocks_w = (w + block_size - 1) / block_size;
+  const int num_blocks_h = (h + block_size - 1) / block_size;
+  int y_model_different = 0;
+  int num_blocks = 0;
+  int i = 0, channel = 0;
+
+  if (block_size <= 1) {
+    fprintf(stderr, "block_size = %d must be > 1\n", block_size);
+    return AOM_NOISE_STATUS_INVALID_ARGUMENT;
+  }
+
+  if (block_size < noise_model->params.lag * 2 + 1) {
+    fprintf(stderr, "block_size = %d must be >= %d\n", block_size,
+            noise_model->params.lag * 2 + 1);
+    return AOM_NOISE_STATUS_INVALID_ARGUMENT;
+  }
+
+  // Clear the latest equation system
+  for (i = 0; i < 3; ++i) {
+    equation_system_clear(&noise_model->latest_state[i].eqns);
+    noise_model->latest_state[i].num_observations = 0;
+    noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver);
+  }
+
+  // Check that we have enough flat blocks
+  for (i = 0; i < num_blocks_h * num_blocks_w; ++i) {
+    if (flat_blocks[i]) {
+      num_blocks++;
+    }
+  }
+
+  if (num_blocks <= 1) {
+    fprintf(stderr, "Not enough flat blocks to update noise estimate\n");
+    return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS;
+  }
+
+  for (channel = 0; channel < 3; ++channel) {
+    int no_subsampling[2] = { 0, 0 };
+    const uint8_t *alt_data = channel > 0 ? data[0] : 0;
+    const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0;
+    int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling;
+    const int is_chroma = channel != 0;
+    if (!data[channel] || !denoised[channel]) break;
+    if (!add_block_observations(noise_model, channel, data[channel],
+                                denoised[channel], w, h, stride[channel], sub,
+                                alt_data, alt_denoised, stride[0], flat_blocks,
+                                block_size, num_blocks_w, num_blocks_h)) {
+      fprintf(stderr, "Adding block observation failed\n");
+      return AOM_NOISE_STATUS_INTERNAL_ERROR;
+    }
+
+    if (!ar_equation_system_solve(&noise_model->latest_state[channel],
+                                  is_chroma)) {
+      if (is_chroma) {
+        set_chroma_coefficient_fallback_soln(
+            &noise_model->latest_state[channel].eqns);
+      } else {
+        fprintf(stderr, "Solving latest noise equation system failed %d!\n",
+                channel);
+        return AOM_NOISE_STATUS_INTERNAL_ERROR;
+      }
+    }
+
+    add_noise_std_observations(
+        noise_model, channel, noise_model->latest_state[channel].eqns.x,
+        data[channel], denoised[channel], w, h, stride[channel], sub, alt_data,
+        stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h);
+
+    if (!aom_noise_strength_solver_solve(
+            &noise_model->latest_state[channel].strength_solver)) {
+      fprintf(stderr, "Solving latest noise strength failed!\n");
+      return AOM_NOISE_STATUS_INTERNAL_ERROR;
+    }
+
+    // Check noise characteristics and return if error.
+    if (channel == 0 &&
+        noise_model->combined_state[channel].strength_solver.num_equations >
+            0 &&
+        is_noise_model_different(noise_model)) {
+      y_model_different = 1;
+    }
+
+    // Don't update the combined stats if the y model is different.
+    if (y_model_different) continue;
+
+    noise_model->combined_state[channel].num_observations +=
+        noise_model->latest_state[channel].num_observations;
+    equation_system_add(&noise_model->combined_state[channel].eqns,
+                        &noise_model->latest_state[channel].eqns);
+    if (!ar_equation_system_solve(&noise_model->combined_state[channel],
+                                  is_chroma)) {
+      if (is_chroma) {
+        set_chroma_coefficient_fallback_soln(
+            &noise_model->combined_state[channel].eqns);
+      } else {
+        fprintf(stderr, "Solving combined noise equation system failed %d!\n",
+                channel);
+        return AOM_NOISE_STATUS_INTERNAL_ERROR;
+      }
+    }
+
+    noise_strength_solver_add(
+        &noise_model->combined_state[channel].strength_solver,
+        &noise_model->latest_state[channel].strength_solver);
+
+    if (!aom_noise_strength_solver_solve(
+            &noise_model->combined_state[channel].strength_solver)) {
+      fprintf(stderr, "Solving combined noise strength failed!\n");
+      return AOM_NOISE_STATUS_INTERNAL_ERROR;
+    }
+  }
+
+  return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE
+                           : AOM_NOISE_STATUS_OK;
+}
+
+void aom_noise_model_save_latest(aom_noise_model_t *noise_model) {
+  for (int c = 0; c < 3; c++) {
+    equation_system_copy(&noise_model->combined_state[c].eqns,
+                         &noise_model->latest_state[c].eqns);
+    equation_system_copy(&noise_model->combined_state[c].strength_solver.eqns,
+                         &noise_model->latest_state[c].strength_solver.eqns);
+    noise_model->combined_state[c].strength_solver.num_equations =
+        noise_model->latest_state[c].strength_solver.num_equations;
+    noise_model->combined_state[c].num_observations =
+        noise_model->latest_state[c].num_observations;
+    noise_model->combined_state[c].ar_gain =
+        noise_model->latest_state[c].ar_gain;
+  }
+}
+
+int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+                                         aom_film_grain_t *film_grain) {
+  if (noise_model->params.lag > 3) {
+    fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag);
+    return 0;
+  }
+  memset(film_grain, 0, sizeof(*film_grain));
+
+  film_grain->apply_grain = 1;
+  film_grain->update_parameters = 1;
+
+  film_grain->ar_coeff_lag = noise_model->params.lag;
+
+  // Convert the scaling functions to 8 bit values
+  aom_noise_strength_lut_t scaling_points[3];
+  aom_noise_strength_solver_fit_piecewise(
+      &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0);
+  aom_noise_strength_solver_fit_piecewise(
+      &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1);
+  aom_noise_strength_solver_fit_piecewise(
+      &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2);
+
+  // Both the domain and the range of the scaling functions in the film_grain
+  // are normalized to 8-bit (e.g., they are implicitly scaled during grain
+  // synthesis).
+  const double strength_divisor = 1 << (noise_model->params.bit_depth - 8);
+  double max_scaling_value = 1e-4;
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < scaling_points[c].num_points; ++i) {
+      scaling_points[c].points[i][0] =
+          AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor);
+      scaling_points[c].points[i][1] =
+          AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor);
+      max_scaling_value =
+          AOMMAX(scaling_points[c].points[i][1], max_scaling_value);
+    }
+  }
+
+  // Scaling_shift values are in the range [8,11]
+  const int max_scaling_value_log2 =
+      clamp((int)floor(log2(max_scaling_value) + 1), 2, 5);
+  film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2);
+
+  const double scale_factor = 1 << (8 - max_scaling_value_log2);
+  film_grain->num_y_points = scaling_points[0].num_points;
+  film_grain->num_cb_points = scaling_points[1].num_points;
+  film_grain->num_cr_points = scaling_points[2].num_points;
+
+  int(*film_grain_scaling[3])[2] = {
+    film_grain->scaling_points_y,
+    film_grain->scaling_points_cb,
+    film_grain->scaling_points_cr,
+  };
+  for (int c = 0; c < 3; c++) {
+    for (int i = 0; i < scaling_points[c].num_points; ++i) {
+      film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5);
+      film_grain_scaling[c][i][1] = clamp(
+          (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255);
+    }
+  }
+  aom_noise_strength_lut_free(scaling_points + 0);
+  aom_noise_strength_lut_free(scaling_points + 1);
+  aom_noise_strength_lut_free(scaling_points + 2);
+
+  // Convert the ar_coeffs into 8-bit values
+  const int n_coeff = noise_model->combined_state[0].eqns.n;
+  double max_coeff = 1e-4, min_coeff = -1e-4;
+  double y_corr[2] = { 0, 0 };
+  double avg_luma_strength = 0;
+  for (int c = 0; c < 3; c++) {
+    aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
+    for (int i = 0; i < n_coeff; ++i) {
+      max_coeff = AOMMAX(max_coeff, eqns->x[i]);
+      min_coeff = AOMMIN(min_coeff, eqns->x[i]);
+    }
+    // Since the correlation between luma/chroma was computed in an already
+    // scaled space, we adjust it in the un-scaled space.
+    aom_noise_strength_solver_t *solver =
+        &noise_model->combined_state[c].strength_solver;
+    // Compute a weighted average of the strength for the channel.
+    double average_strength = 0, total_weight = 0;
+    for (int i = 0; i < solver->eqns.n; ++i) {
+      double w = 0;
+      for (int j = 0; j < solver->eqns.n; ++j) {
+        w += solver->eqns.A[i * solver->eqns.n + j];
+      }
+      w = sqrt(w);
+      average_strength += solver->eqns.x[i] * w;
+      total_weight += w;
+    }
+    if (total_weight == 0)
+      average_strength = 1;
+    else
+      average_strength /= total_weight;
+    if (c == 0) {
+      avg_luma_strength = average_strength;
+    } else {
+      y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength;
+      max_coeff = AOMMAX(max_coeff, y_corr[c - 1]);
+      min_coeff = AOMMIN(min_coeff, y_corr[c - 1]);
+    }
+  }
+  // Shift value: AR coeffs range (values 6-9)
+  // 6: [-2, 2),  7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25)
+  film_grain->ar_coeff_shift =
+      clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))),
+            6, 9);
+  double scale_ar_coeff = 1 << film_grain->ar_coeff_shift;
+  int *ar_coeffs[3] = {
+    film_grain->ar_coeffs_y,
+    film_grain->ar_coeffs_cb,
+    film_grain->ar_coeffs_cr,
+  };
+  for (int c = 0; c < 3; ++c) {
+    aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
+    for (int i = 0; i < n_coeff; ++i) {
+      ar_coeffs[c][i] =
+          clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127);
+    }
+    if (c > 0) {
+      ar_coeffs[c][n_coeff] =
+          clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127);
+    }
+  }
+
+  // At the moment, the noise modeling code assumes that the chroma scaling
+  // functions are a function of luma.
+  film_grain->cb_mult = 128;       // 8 bits
+  film_grain->cb_luma_mult = 192;  // 8 bits
+  film_grain->cb_offset = 256;     // 9 bits
+
+  film_grain->cr_mult = 128;       // 8 bits
+  film_grain->cr_luma_mult = 192;  // 8 bits
+  film_grain->cr_offset = 256;     // 9 bits
+
+  film_grain->chroma_scaling_from_luma = 0;
+  film_grain->grain_scale_shift = 0;
+  film_grain->overlap_flag = 1;
+  return 1;
+}
+
+static void pointwise_multiply(const float *a, float *b, int n) {
+  for (int i = 0; i < n; ++i) {
+    b[i] *= a[i];
+  }
+}
+
+static float *get_half_cos_window(int block_size) {
+  float *window_function =
+      (float *)aom_malloc(block_size * block_size * sizeof(*window_function));
+  for (int y = 0; y < block_size; ++y) {
+    const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2);
+    for (int x = 0; x < block_size; ++x) {
+      const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2);
+      window_function[y * block_size + x] = (float)(cos_yd * cos_xd);
+    }
+  }
+  return window_function;
+}
+
+#define DITHER_AND_QUANTIZE(INT_TYPE, suffix)                               \
+  static void dither_and_quantize_##suffix(                                 \
+      float *result, int result_stride, INT_TYPE *denoised, int w, int h,   \
+      int stride, int chroma_sub_w, int chroma_sub_h, int block_size,       \
+      float block_normalization) {                                          \
+    for (int y = 0; y < (h >> chroma_sub_h); ++y) {                         \
+      for (int x = 0; x < (w >> chroma_sub_w); ++x) {                       \
+        const int result_idx =                                              \
+            (y + (block_size >> chroma_sub_h)) * result_stride + x +        \
+            (block_size >> chroma_sub_w);                                   \
+        INT_TYPE new_val = (INT_TYPE)AOMMIN(                                \
+            AOMMAX(result[result_idx] * block_normalization + 0.5f, 0),     \
+            block_normalization);                                           \
+        const float err =                                                   \
+            -(((float)new_val) / block_normalization - result[result_idx]); \
+        denoised[y * stride + x] = new_val;                                 \
+        if (x + 1 < (w >> chroma_sub_w)) {                                  \
+          result[result_idx + 1] += err * 7.0f / 16.0f;                     \
+        }                                                                   \
+        if (y + 1 < (h >> chroma_sub_h)) {                                  \
+          if (x > 0) {                                                      \
+            result[result_idx + result_stride - 1] += err * 3.0f / 16.0f;   \
+          }                                                                 \
+          result[result_idx + result_stride] += err * 5.0f / 16.0f;         \
+          if (x + 1 < (w >> chroma_sub_w)) {                                \
+            result[result_idx + result_stride + 1] += err * 1.0f / 16.0f;   \
+          }                                                                 \
+        }                                                                   \
+      }                                                                     \
+    }                                                                       \
+  }
+
+DITHER_AND_QUANTIZE(uint8_t, lowbd);
+DITHER_AND_QUANTIZE(uint16_t, highbd);
+
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+                          int w, int h, int stride[3], int chroma_sub[2],
+                          float *noise_psd[3], int block_size, int bit_depth,
+                          int use_highbd) {
+  float *plane = NULL, *block = NULL, *window_full = NULL,
+        *window_chroma = NULL;
+  double *block_d = NULL, *plane_d = NULL;
+  struct aom_noise_tx_t *tx_full = NULL;
+  struct aom_noise_tx_t *tx_chroma = NULL;
+  const int num_blocks_w = (w + block_size - 1) / block_size;
+  const int num_blocks_h = (h + block_size - 1) / block_size;
+  const int result_stride = (num_blocks_w + 2) * block_size;
+  const int result_height = (num_blocks_h + 2) * block_size;
+  float *result = NULL;
+  int init_success = 1;
+  aom_flat_block_finder_t block_finder_full;
+  aom_flat_block_finder_t block_finder_chroma;
+  const float kBlockNormalization = (float)((1 << bit_depth) - 1);
+  if (chroma_sub[0] != chroma_sub[1]) {
+    fprintf(stderr,
+            "aom_wiener_denoise_2d doesn't handle different chroma "
+            "subsampling");
+    return 0;
+  }
+  init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
+                                             bit_depth, use_highbd);
+  result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride *
+                               sizeof(*result));
+  plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane));
+  block =
+      (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block));
+  block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d));
+  plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d));
+  window_full = get_half_cos_window(block_size);
+  tx_full = aom_noise_tx_malloc(block_size);
+
+  if (chroma_sub[0] != 0) {
+    init_success &= aom_flat_block_finder_init(&block_finder_chroma,
+                                               block_size >> chroma_sub[0],
+                                               bit_depth, use_highbd);
+    window_chroma = get_half_cos_window(block_size >> chroma_sub[0]);
+    tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]);
+  } else {
+    window_chroma = window_full;
+    tx_chroma = tx_full;
+  }
+
+  init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) &&
+                  (plane_d != NULL) && (block != NULL) && (block_d != NULL) &&
+                  (window_full != NULL) && (window_chroma != NULL) &&
+                  (result != NULL);
+  for (int c = init_success ? 0 : 3; c < 3; ++c) {
+    float *window_function = c == 0 ? window_full : window_chroma;
+    aom_flat_block_finder_t *block_finder = &block_finder_full;
+    const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0;
+    const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0;
+    struct aom_noise_tx_t *tx =
+        (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full;
+    if (!data[c] || !denoised[c]) continue;
+    if (c > 0 && chroma_sub[0] != 0) {
+      block_finder = &block_finder_chroma;
+    }
+    memset(result, 0, sizeof(*result) * result_stride * result_height);
+    // Do overlapped block processing (half overlapped). The block rows can
+    // easily be done in parallel
+    for (int offsy = 0; offsy < (block_size >> chroma_sub_h);
+         offsy += (block_size >> chroma_sub_h) / 2) {
+      for (int offsx = 0; offsx < (block_size >> chroma_sub_w);
+           offsx += (block_size >> chroma_sub_w) / 2) {
+        // Pad the boundary when processing each block-set.
+        for (int by = -1; by < num_blocks_h; ++by) {
+          for (int bx = -1; bx < num_blocks_w; ++bx) {
+            const int pixels_per_block =
+                (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h);
+            aom_flat_block_finder_extract_block(
+                block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h,
+                stride[c], bx * (block_size >> chroma_sub_w) + offsx,
+                by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d);
+            for (int j = 0; j < pixels_per_block; ++j) {
+              block[j] = (float)block_d[j];
+              plane[j] = (float)plane_d[j];
+            }
+            pointwise_multiply(window_function, block, pixels_per_block);
+            aom_noise_tx_forward(tx, block);
+            aom_noise_tx_filter(tx, noise_psd[c]);
+            aom_noise_tx_inverse(tx, block);
+
+            // Apply window function to the plane approximation (we will apply
+            // it to the sum of plane + block when composing the results).
+            pointwise_multiply(window_function, plane, pixels_per_block);
+
+            for (int y = 0; y < (block_size >> chroma_sub_h); ++y) {
+              const int y_result =
+                  y + (by + 1) * (block_size >> chroma_sub_h) + offsy;
+              for (int x = 0; x < (block_size >> chroma_sub_w); ++x) {
+                const int x_result =
+                    x + (bx + 1) * (block_size >> chroma_sub_w) + offsx;
+                result[y_result * result_stride + x_result] +=
+                    (block[y * (block_size >> chroma_sub_w) + x] +
+                     plane[y * (block_size >> chroma_sub_w) + x]) *
+                    window_function[y * (block_size >> chroma_sub_w) + x];
+              }
+            }
+          }
+        }
+      }
+    }
+    if (use_highbd) {
+      dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
+                                 w, h, stride[c], chroma_sub_w, chroma_sub_h,
+                                 block_size, kBlockNormalization);
+    } else {
+      dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h,
+                                stride[c], chroma_sub_w, chroma_sub_h,
+                                block_size, kBlockNormalization);
+    }
+  }
+  aom_free(result);
+  aom_free(plane);
+  aom_free(block);
+  aom_free(plane_d);
+  aom_free(block_d);
+  aom_free(window_full);
+
+  aom_noise_tx_free(tx_full);
+
+  aom_flat_block_finder_free(&block_finder_full);
+  if (chroma_sub[0] != 0) {
+    aom_flat_block_finder_free(&block_finder_chroma);
+    aom_free(window_chroma);
+    aom_noise_tx_free(tx_chroma);
+  }
+  return init_success;
+}
diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h
new file mode 100644
index 000000000..dabeacc14
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_model.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_NOISE_MODEL_H_
+#define AOM_DSP_NOISE_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#include <stdint.h>
+#include "aom_dsp/grain_synthesis.h"
+
+/*!\brief Wrapper of data required to represent linear system of eqns and soln.
+ */
+typedef struct {
+  double *A;
+  double *b;
+  double *x;
+  int n;
+} aom_equation_system_t;
+
+/*!\brief Representation of a piecewise linear curve
+ *
+ * Holds n points as (x, y) pairs, that store the curve.
+ */
+typedef struct {
+  double (*points)[2];
+  int num_points;
+} aom_noise_strength_lut_t;
+
+/*!\brief Init the noise strength lut with the given number of points*/
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points);
+
+/*!\brief Frees the noise strength lut. */
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut);
+
+/*!\brief Evaluate the lut at the point x.
+ *
+ * \param[in] lut  The lut data.
+ * \param[in] x    The coordinate to evaluate the lut.
+ */
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+                                   double x);
+
+/*!\brief Helper struct to model noise strength as a function of intensity.
+ *
+ * Internally, this structure holds a representation of a linear system
+ * of equations that models noise strength (standard deviation) as a
+ * function of intensity. The mapping is initially stored using a
+ * piecewise representation with evenly spaced bins that cover the entire
+ * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a
+ * constraint of the form:
+ *   y_{i} (1 - a) + y_{i+1} a = y
+ * where y_{i} is the value of bin i and x_{i} <= x <= x_{i+1} and
+ * a = x/(x_{i+1} - x{i}). The equation system holds the corresponding
+ * normal equations.
+ *
+ * As there may be missing data, the solution is regularized to get a
+ * complete set of values for the bins. A reduced representation after
+ * solving can be obtained by getting the corresponding noise_strength_lut_t.
+ */
+typedef struct {
+  aom_equation_system_t eqns;
+  double min_intensity;
+  double max_intensity;
+  int num_bins;
+  int num_equations;
+  double total;
+} aom_noise_strength_solver_t;
+
+/*!\brief Initializes the noise solver with the given number of bins.
+ *
+ * Returns 0 if initialization fails.
+ *
+ * \param[in]  solver    The noise solver to be initialized.
+ * \param[in]  num_bins  Number of bins to use in the internal representation.
+ * \param[in]  bit_depth The bit depth used to derive {min,max}_intensity.
+ */
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+                                   int num_bins, int bit_depth);
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver);
+
+/*!\brief Gets the x coordinate of bin i.
+ *
+ * \param[in]  i  The bin whose coordinate to query.
+ */
+double aom_noise_strength_solver_get_center(
+    const aom_noise_strength_solver_t *solver, int i);
+
+/*!\brief Add an observation of the block mean intensity to its noise strength.
+ *
+ * \param[in]  block_mean  The average block intensity,
+ * \param[in]  noise_std   The observed noise strength.
+ */
+void aom_noise_strength_solver_add_measurement(
+    aom_noise_strength_solver_t *solver, double block_mean, double noise_std);
+
+/*!\brief Solves the current set of equations for the noise strength. */
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver);
+
+/*!\brief Fits a reduced piecewise linear lut to the internal solution
+ *
+ * \param[in] max_num_points  The maximum number of output points
+ * \param[out] lut  The output piecewise linear lut.
+ */
+int aom_noise_strength_solver_fit_piecewise(
+    const aom_noise_strength_solver_t *solver, int max_num_points,
+    aom_noise_strength_lut_t *lut);
+
+/*!\brief Helper for holding precomputed data for finding flat blocks.
+ *
+ * Internally a block is modeled with a low-order polynomial model. A
+ * planar model would be a bunch of equations like:
+ * <[y_i x_i 1], [a_1, a_2, a_3]>  = b_i
+ * for each point in the block. The system matrix A with row i as [y_i x_i 1]
+ * is maintained as is the inverse, inv(A'*A), so that the plane parameters
+ * can be fit for each block.
+ */
+typedef struct {
+  double *AtA_inv;
+  double *A;
+  int num_params;  // The number of parameters used for internal low-order model
+  int block_size;  // The block size the finder was initialized with
+  double normalization;  // Normalization factor (1 / (2^(bit_depth) - 1))
+  int use_highbd;        // Whether input data should be interpreted as uint16
+} aom_flat_block_finder_t;
+
+/*!\brief Init the block_finder with the given block size, bit_depth */
+int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
+                               int block_size, int bit_depth, int use_highbd);
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder);
+
+/*!\brief Helper to extract a block and low order "planar" model. */
+void aom_flat_block_finder_extract_block(
+    const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+    int w, int h, int stride, int offsx, int offsy, double *plane,
+    double *block);
+
+/*!\brief Runs the flat block finder on the input data.
+ *
+ * Find flat blocks in the input image data. Returns a map of
+ * flat_blocks, where the value of flat_blocks map will be non-zero
+ * when a block is determined to be flat. A higher value indicates a bigger
+ * confidence in the decision.
+ */
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+                              const uint8_t *const data, int w, int h,
+                              int stride, uint8_t *flat_blocks);
+
+// The noise shape indicates the allowed coefficients in the AR model.
+typedef enum {
+  AOM_NOISE_SHAPE_DIAMOND = 0,
+  AOM_NOISE_SHAPE_SQUARE = 1
+} aom_noise_shape;
+
+// The parameters of the noise model include the shape type, lag, the
+// bit depth of the input images provided, and whether the input images
+// will be using uint16 (or uint8) representation.
+typedef struct {
+  aom_noise_shape shape;
+  int lag;
+  int bit_depth;
+  int use_highbd;
+} aom_noise_model_params_t;
+
+/*!\brief State of a noise model estimate for a single channel.
+ *
+ * This contains a system of equations that can be used to solve
+ * for the auto-regressive coefficients as well as a noise strength
+ * solver that can be used to model noise strength as a function of
+ * intensity.
+ */
+typedef struct {
+  aom_equation_system_t eqns;
+  aom_noise_strength_solver_t strength_solver;
+  int num_observations;  // The number of observations in the eqn system
+  double ar_gain;        // The gain of the current AR filter
+} aom_noise_state_t;
+
+/*!\brief Complete model of noise for a planar video
+ *
+ * This includes a noise model for the latest frame and an aggregated
+ * estimate over all previous frames that had similar parameters.
+ */
+typedef struct {
+  aom_noise_model_params_t params;
+  aom_noise_state_t combined_state[3];  // Combined state per channel
+  aom_noise_state_t latest_state[3];    // Latest state per channel
+  int (*coords)[2];  // Offsets (x,y) of the coefficient samples
+  int n;             // Number of parameters (size of coords)
+  int bit_depth;
+} aom_noise_model_t;
+
+/*!\brief Result of a noise model update. */
+typedef enum {
+  AOM_NOISE_STATUS_OK = 0,
+  AOM_NOISE_STATUS_INVALID_ARGUMENT,
+  AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
+  AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE,
+  AOM_NOISE_STATUS_INTERNAL_ERROR,
+} aom_noise_status_t;
+
+/*!\brief Initializes a noise model with the given parameters.
+ *
+ * Returns 0 on failure.
+ */
+int aom_noise_model_init(aom_noise_model_t *model,
+                         const aom_noise_model_params_t params);
+void aom_noise_model_free(aom_noise_model_t *model);
+
+/*!\brief Updates the noise model with a new frame observation.
+ *
+ * Updates the noise model with measurements from the given input frame and a
+ * denoised variant of it. Noise is sampled from flat blocks using the flat
+ * block map.
+ *
+ * Returns a noise_status indicating if the update was successful. If the
+ * Update was successful, the combined_state is updated with measurements from
+ * the provided frame. If status is OK or DIFFERENT_NOISE_TYPE, the latest noise
+ * state will be updated with measurements from the provided frame.
+ *
+ * \param[in,out] noise_model     The noise model to be updated
+ * \param[in]     data            Raw frame data
+ * \param[in]     denoised        Denoised frame data.
+ * \param[in]     w               Frame width
+ * \param[in]     h               Frame height
+ * \param[in]     strides         Stride of the planes
+ * \param[in]     chroma_sub_log2 Chroma subsampling for planes != 0.
+ * \param[in]     flat_blocks     A map to blocks that have been determined flat
+ * \param[in]     block_size      The size of blocks.
+ */
+aom_noise_status_t aom_noise_model_update(
+    aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+    const uint8_t *const denoised[3], int w, int h, int strides[3],
+    int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size);
+
+/*\brief Save the "latest" estimate into the "combined" estimate.
+ *
+ * This is meant to be called when the noise modeling detected a change
+ * in parameters (or for example, if a user wanted to reset estimation at
+ * a shot boundary).
+ */
+void aom_noise_model_save_latest(aom_noise_model_t *noise_model);
+
+/*!\brief Converts the noise_model parameters to the corresponding
+ *    grain_parameters.
+ *
+ * The noise structs in this file are suitable for estimation (e.g., using
+ * floats), but the grain parameters in the bitstream are quantized. This
+ * function does the conversion by selecting the correct quantization levels.
+ */
+int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+                                         aom_film_grain_t *film_grain);
+
+/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd.
+ *
+ * \param[in]     data            Raw frame data
+ * \param[out]    denoised        Denoised frame data
+ * \param[in]     w               Frame width
+ * \param[in]     h               Frame height
+ * \param[in]     stride          Stride of the planes
+ * \param[in]     chroma_sub_log2 Chroma subsampling for planes != 0.
+ * \param[in]     noise_psd       The power spectral density of the noise
+ * \param[in]     block_size      The size of blocks
+ * \param[in]     bit_depth       Bit depth of the image
+ * \param[in]     use_highbd      If true, uint8 pointers are interpreted as
+ *                                uint16 and stride is measured in uint16.
+ *                                This must be true when bit_depth >= 10.
+ */
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+                          int w, int h, int stride[3], int chroma_sub_log2[2],
+                          float *noise_psd[3], int block_size, int bit_depth,
+                          int use_highbd);
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // AOM_DSP_NOISE_MODEL_H_
diff --git a/third_party/aom/aom_dsp/noise_util.c b/third_party/aom/aom_dsp/noise_util.c
new file mode 100644
index 000000000..87e8e9fec
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_util.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+float aom_noise_psd_get_default_value(int block_size, float factor) {
+  return (factor * factor / 10000) * block_size * block_size / 8;
+}
+
+// Internal representation of noise transform. It keeps track of the
+// transformed data and a temporary working buffer to use during the
+// transform.
+struct aom_noise_tx_t {
+  float *tx_block;
+  float *temp;
+  int block_size;
+  void (*fft)(const float *, float *, float *);
+  void (*ifft)(const float *, float *, float *);
+};
+
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) {
+  struct aom_noise_tx_t *noise_tx =
+      (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t));
+  if (!noise_tx) return NULL;
+  memset(noise_tx, 0, sizeof(*noise_tx));
+  switch (block_size) {
+    case 2:
+      noise_tx->fft = aom_fft2x2_float;
+      noise_tx->ifft = aom_ifft2x2_float;
+      break;
+    case 4:
+      noise_tx->fft = aom_fft4x4_float;
+      noise_tx->ifft = aom_ifft4x4_float;
+      break;
+    case 8:
+      noise_tx->fft = aom_fft8x8_float;
+      noise_tx->ifft = aom_ifft8x8_float;
+      break;
+    case 16:
+      noise_tx->fft = aom_fft16x16_float;
+      noise_tx->ifft = aom_ifft16x16_float;
+      break;
+    case 32:
+      noise_tx->fft = aom_fft32x32_float;
+      noise_tx->ifft = aom_ifft32x32_float;
+      break;
+    default:
+      aom_free(noise_tx);
+      fprintf(stderr, "Unsupported block size %d\n", block_size);
+      return NULL;
+  }
+  noise_tx->block_size = block_size;
+  noise_tx->tx_block = (float *)aom_memalign(
+      32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
+  noise_tx->temp = (float *)aom_memalign(
+      32, 2 * sizeof(*noise_tx->temp) * block_size * block_size);
+  if (!noise_tx->tx_block || !noise_tx->temp) {
+    aom_noise_tx_free(noise_tx);
+    return NULL;
+  }
+  // Clear the buffers up front. Some outputs of the forward transform are
+  // real only (the imaginary component will never be touched)
+  memset(noise_tx->tx_block, 0,
+         2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
+  memset(noise_tx->temp, 0,
+         2 * sizeof(*noise_tx->temp) * block_size * block_size);
+  return noise_tx;
+}
+
+void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) {
+  noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block);
+}
+
+void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) {
+  const int block_size = noise_tx->block_size;
+  const float kBeta = 1.1f;
+  const float kEps = 1e-6f;
+  for (int y = 0; y < block_size; ++y) {
+    for (int x = 0; x < block_size; ++x) {
+      int i = y * block_size + x;
+      float *c = noise_tx->tx_block + 2 * i;
+      const float p = c[0] * c[0] + c[1] * c[1];
+      if (p > kBeta * psd[i] && p > 1e-6) {
+        noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps);
+        noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps);
+      } else {
+        noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta;
+        noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta;
+      }
+    }
+  }
+}
+
+void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) {
+  const int n = noise_tx->block_size * noise_tx->block_size;
+  noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data);
+  for (int i = 0; i < n; ++i) {
+    data[i] /= n;
+  }
+}
+
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx,
+                             float *psd) {
+  const int block_size = noise_tx->block_size;
+  for (int yb = 0; yb < block_size; ++yb) {
+    for (int xb = 0; xb <= block_size / 2; ++xb) {
+      float *c = noise_tx->tx_block + 2 * (yb * block_size + xb);
+      psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1];
+    }
+  }
+}
+
+void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) {
+  if (!noise_tx) return;
+  aom_free(noise_tx->tx_block);
+  aom_free(noise_tx->temp);
+  aom_free(noise_tx);
+}
+
+double aom_normalized_cross_correlation(const double *a, const double *b,
+                                        int n) {
+  double c = 0;
+  double a_len = 0;
+  double b_len = 0;
+  for (int i = 0; i < n; ++i) {
+    a_len += a[i] * a[i];
+    b_len += b[i] * b[i];
+    c += a[i] * b[i];
+  }
+  return c / (sqrt(a_len) * sqrt(b_len));
+}
+
+int aom_noise_data_validate(const double *data, int w, int h) {
+  const double kVarianceThreshold = 2;
+  const double kMeanThreshold = 2;
+
+  int x = 0, y = 0;
+  int ret_value = 1;
+  double var = 0, mean = 0;
+  double *mean_x, *mean_y, *var_x, *var_y;
+
+  // Check that noise variance is not increasing in x or y
+  // and that the data is zero mean.
+  mean_x = (double *)aom_malloc(sizeof(*mean_x) * w);
+  var_x = (double *)aom_malloc(sizeof(*var_x) * w);
+  mean_y = (double *)aom_malloc(sizeof(*mean_x) * h);
+  var_y = (double *)aom_malloc(sizeof(*var_y) * h);
+
+  memset(mean_x, 0, sizeof(*mean_x) * w);
+  memset(var_x, 0, sizeof(*var_x) * w);
+  memset(mean_y, 0, sizeof(*mean_y) * h);
+  memset(var_y, 0, sizeof(*var_y) * h);
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      const double d = data[y * w + x];
+      var_x[x] += d * d;
+      var_y[y] += d * d;
+      mean_x[x] += d;
+      mean_y[y] += d;
+      var += d * d;
+      mean += d;
+    }
+  }
+  mean /= (w * h);
+  var = var / (w * h) - mean * mean;
+
+  for (y = 0; y < h; ++y) {
+    mean_y[y] /= h;
+    var_y[y] = var_y[y] / h - mean_y[y] * mean_y[y];
+    if (fabs(var_y[y] - var) >= kVarianceThreshold) {
+      fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var);
+      ret_value = 0;
+      break;
+    }
+    if (fabs(mean_y[y] - mean) >= kMeanThreshold) {
+      fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean);
+      ret_value = 0;
+      break;
+    }
+  }
+
+  for (x = 0; x < w; ++x) {
+    mean_x[x] /= w;
+    var_x[x] = var_x[x] / w - mean_x[x] * mean_x[x];
+    if (fabs(var_x[x] - var) >= kVarianceThreshold) {
+      fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var);
+      ret_value = 0;
+      break;
+    }
+    if (fabs(mean_x[x] - mean) >= kMeanThreshold) {
+      fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean);
+      ret_value = 0;
+      break;
+    }
+  }
+
+  aom_free(mean_x);
+  aom_free(mean_y);
+  aom_free(var_x);
+  aom_free(var_y);
+
+  return ret_value;
+}
diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h
new file mode 100644
index 000000000..ea4d9e3de
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_util.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_NOISE_UTIL_H_
+#define AOM_DSP_NOISE_UTIL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
+// It is meant to be lightweight and does hold the transformed data (as
+// the user should not be manipulating the transformed data directly).
+struct aom_noise_tx_t;
+
+// Allocates and returns a aom_noise_tx_t useful for denoising the given
+// block_size. The resulting aom_noise_tx_t should be free'd with
+// aom_noise_tx_free.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
+void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
+
+// Transforms the internal data and holds it in the aom_noise_tx's internal
+// buffer. For compatibility with existing SIMD implementations, "data" must
+// be 32-byte aligned.
+void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
+                          const float *data);
+
+// Filters aom_noise_tx's internal data using the provided noise power spectral
+// density. The PSD must be at least block_size * block_size and should be
+// populated with a constant or via estimates taken from
+// aom_noise_tx_add_energy.
+void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd);
+
+// Performs an inverse transform using the internal transform data.
+// For compatibility with existing SIMD implementations, "data" must be 32-byte
+// aligned.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
+
+// Aggregates the power of the buffered transform data into the psd buffer.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
+                             float *psd);
+
+// Returns a default value suitable for denosing a transform of the given
+// block_size. The noise "factor" determines the strength of the noise to
+// be removed. A value of about 2.5 can be used for moderate denoising,
+// where a value of 5.0 can be used for a high level of denoising.
+float aom_noise_psd_get_default_value(int block_size, float factor);
+
+// Computes normalized cross correlation of two vectors a and b of length n.
+double aom_normalized_cross_correlation(const double *a, const double *b,
+                                        int n);
+
+// Validates the correlated noise in the data buffer of size (w, h).
+int aom_noise_data_validate(const double *data, int w, int h);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // AOM_DSP_NOISE_UTIL_H_
diff --git a/third_party/aom/aom_dsp/prob.c b/third_party/aom/aom_dsp/prob.c
deleted file mode 100644
index a42fb806b..000000000
--- a/third_party/aom/aom_dsp/prob.c
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-
-#include <string.h>
-
-#include "aom_dsp/prob.h"
-
-static unsigned int tree_merge_probs_impl(unsigned int i,
-                                          const aom_tree_index *tree,
-                                          const aom_prob *pre_probs,
-                                          const unsigned int *counts,
-                                          aom_prob *probs) {
-  const int l = tree[i];
-  const unsigned int left_count =
-      (l <= 0) ? counts[-l]
-               : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
-  const int r = tree[i + 1];
-  const unsigned int right_count =
-      (r <= 0) ? counts[-r]
-               : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
-  const unsigned int ct[2] = { left_count, right_count };
-  probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
-  return left_count + right_count;
-}
-
-void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
-                          const unsigned int *counts, aom_prob *probs) {
-  tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
-}
-
-typedef struct tree_node tree_node;
-
-struct tree_node {
-  aom_tree_index index;
-  uint8_t probs[16];
-  uint8_t prob;
-  int path;
-  int len;
-  int l;
-  int r;
-  aom_cdf_prob pdf;
-};
-
-/* Compute the probability of this node in Q23 */
-static uint32_t tree_node_prob(tree_node n, int i) {
-  uint32_t prob;
-  /* 1.0 in Q23 */
-  prob = 16777216;
-  for (; i < n.len; i++) {
-    prob = prob * n.probs[i] >> 8;
-  }
-  return prob;
-}
-
-static int tree_node_cmp(tree_node a, tree_node b) {
-  int i;
-  uint32_t pa;
-  uint32_t pb;
-  for (i = 0; i < AOMMIN(a.len, b.len) && a.probs[i] == b.probs[i]; i++) {
-  }
-  pa = tree_node_prob(a, i);
-  pb = tree_node_prob(b, i);
-  return pa > pb ? 1 : pa < pb ? -1 : 0;
-}
-
-/* Given a Q15 probability for symbol subtree rooted at tree[n], this function
-    computes the probability of each symbol (defined as a node that has no
-    children). */
-static aom_cdf_prob tree_node_compute_probs(tree_node *tree, int n,
-                                            aom_cdf_prob pdf) {
-  if (tree[n].l == 0) {
-    /* This prevents probability computations in Q15 that underflow from
-        producing a symbol that has zero probability. */
-    if (pdf == 0) pdf = 1;
-    tree[n].pdf = pdf;
-    return pdf;
-  } else {
-    /* We process the smaller probability first,  */
-    if (tree[n].prob < 128) {
-      aom_cdf_prob lp;
-      aom_cdf_prob rp;
-      lp = (((uint32_t)pdf) * tree[n].prob + 128) >> 8;
-      lp = tree_node_compute_probs(tree, tree[n].l, lp);
-      rp = tree_node_compute_probs(tree, tree[n].r, lp > pdf ? 0 : pdf - lp);
-      return lp + rp;
-    } else {
-      aom_cdf_prob rp;
-      aom_cdf_prob lp;
-      rp = (((uint32_t)pdf) * (256 - tree[n].prob) + 128) >> 8;
-      rp = tree_node_compute_probs(tree, tree[n].r, rp);
-      lp = tree_node_compute_probs(tree, tree[n].l, rp > pdf ? 0 : pdf - rp);
-      return lp + rp;
-    }
-  }
-}
-
-static int tree_node_extract(tree_node *tree, int n, int symb,
-                             aom_cdf_prob *pdf, aom_tree_index *index,
-                             int *path, int *len) {
-  if (tree[n].l == 0) {
-    pdf[symb] = tree[n].pdf;
-    if (index != NULL) index[symb] = tree[n].index;
-    if (path != NULL) path[symb] = tree[n].path;
-    if (len != NULL) len[symb] = tree[n].len;
-    return symb + 1;
-  } else {
-    symb = tree_node_extract(tree, tree[n].l, symb, pdf, index, path, len);
-    return tree_node_extract(tree, tree[n].r, symb, pdf, index, path, len);
-  }
-}
-
-int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
-                aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *index,
-                int *path, int *len) {
-  tree_node symb[2 * 16 - 1];
-  int nodes;
-  int next[16];
-  int size;
-  int nsymbs;
-  int i;
-  /* Create the root node with probability 1 in Q15. */
-  symb[0].index = root;
-  symb[0].path = 0;
-  symb[0].len = 0;
-  symb[0].l = symb[0].r = 0;
-  nodes = 1;
-  next[0] = 0;
-  size = 1;
-  nsymbs = 1;
-  while (size > 0 && nsymbs < 16) {
-    int m;
-    tree_node n;
-    aom_tree_index j;
-    uint8_t prob;
-    m = 0;
-    /* Find the internal node with the largest probability. */
-    for (i = 1; i < size; i++) {
-      if (tree_node_cmp(symb[next[i]], symb[next[m]]) > 0) m = i;
-    }
-    i = next[m];
-    memmove(&next[m], &next[m + 1], sizeof(*next) * (size - (m + 1)));
-    size--;
-    /* Split this symbol into two symbols */
-    n = symb[i];
-    j = n.index;
-    prob = probs[j >> 1];
-    /* Left */
-    n.index = tree[j];
-    n.path <<= 1;
-    n.len++;
-    n.probs[n.len - 1] = prob;
-    symb[nodes] = n;
-    if (n.index > 0) {
-      next[size++] = nodes;
-    }
-    /* Right */
-    n.index = tree[j + 1];
-    n.path += 1;
-    n.probs[n.len - 1] = 256 - prob;
-    symb[nodes + 1] = n;
-    if (n.index > 0) {
-      next[size++] = nodes + 1;
-    }
-    symb[i].prob = prob;
-    symb[i].l = nodes;
-    symb[i].r = nodes + 1;
-    nodes += 2;
-    nsymbs++;
-  }
-  /* Compute the probabilities of each symbol in Q15 */
-  tree_node_compute_probs(symb, 0, CDF_PROB_TOP);
-  /* Extract the cdf, index, path and length */
-  tree_node_extract(symb, 0, 0, cdf, index, path, len);
-  /* Convert to CDF */
-  cdf[0] = AOM_ICDF(cdf[0]);
-  for (i = 1; i < nsymbs; i++) {
-    cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i - 1]) + cdf[i]);
-  }
-  // Store symbol count at the end of the CDF
-  cdf[nsymbs] = 0;
-  return nsymbs;
-}
-
-/* This code assumes that tree contains as unique leaf nodes the integer values
-    0 to len - 1 and produces the forward and inverse mapping tables in ind[]
-    and inv[] respectively. */
-static void tree_to_index(int *stack_index, int *ind, int *inv,
-                          const aom_tree_index *tree, int value, int index) {
-  value *= 2;
-
-  do {
-    const aom_tree_index content = tree[index];
-    ++index;
-    if (content <= 0) {
-      inv[*stack_index] = -content;
-      ind[-content] = *stack_index;
-      ++(*stack_index);
-    } else {
-      tree_to_index(stack_index, ind, inv, tree, value, content);
-    }
-  } while (++value & 1);
-}
-
-void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree) {
-  int stack_index = 0;
-  tree_to_index(&stack_index, ind, inv, tree, 0, 0);
-}
diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h
index a517e810a..85dd4249d 100644
--- a/third_party/aom/aom_dsp/prob.h
+++ b/third_party/aom/aom_dsp/prob.h
@@ -13,194 +13,657 @@
 #define AOM_DSP_PROB_H_
 
 #include <assert.h>
+#include <stdio.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_common.h"
+#include "config/aom_config.h"
 
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/entcode.h"
 #include "aom_ports/bitops.h"
 #include "aom_ports/mem.h"
 
-#if !CONFIG_ANS
-#include "aom_dsp/entcode.h"
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef uint8_t aom_prob;
-
 // TODO(negge): Rename this aom_prob once we remove vpxbool.
 typedef uint16_t aom_cdf_prob;
 
 #define CDF_SIZE(x) ((x) + 1)
-
 #define CDF_PROB_BITS 15
 #define CDF_PROB_TOP (1 << CDF_PROB_BITS)
+#define CDF_INIT_TOP 32768
+#define CDF_SHIFT (15 - CDF_PROB_BITS)
+/*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative
+  probability (an "inverse" CDF).
+  This function converts from one representation to the other (and is its own
+  inverse).*/
+#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
+
+#if CDF_SHIFT == 0
+
+#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF4(a0, a1, a2) \
+  AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF5(a0, a1, a2, a3) \
+  AOM_ICDF(a0)                   \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF6(a0, a1, a2, a3, a4)                        \
+  AOM_ICDF(a0)                                              \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF7(a0, a1, a2, a3, a4, a5)                                  \
+  AOM_ICDF(a0)                                                            \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6)                              \
+  AOM_ICDF(a0)                                                            \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+      AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7)                          \
+  AOM_ICDF(a0)                                                            \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8)                     \
+  AOM_ICDF(a0)                                                            \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)                 \
+  AOM_ICDF(a0)                                                            \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9),             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)               \
+  AOM_ICDF(a0)                                                               \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),    \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11)          \
+  AOM_ICDF(a0)                                                               \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),    \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+      AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12)     \
+  AOM_ICDF(a0)                                                               \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),    \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+      AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
+  AOM_ICDF(a0)                                                                \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),     \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10),  \
+      AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
+                  a14)                                                        \
+  AOM_ICDF(a0)                                                                \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),     \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10),  \
+      AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14),             \
+      AOM_ICDF(CDF_PROB_TOP), 0
 
-#if !CONFIG_ANS
-#define AOM_ICDF OD_ICDF
 #else
-#define AOM_ICDF(x) (x)
-#endif
-
-#define MAX_PROB 255
-
-#define LV_MAP_PROB 1
+#define AOM_CDF2(a0)                                       \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 2) + \
+            ((CDF_INIT_TOP - 2) >> 1)) /                   \
+               ((CDF_INIT_TOP - 2)) +                      \
+           1)                                              \
+  , AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF3(a0, a1)                                       \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) +     \
+            ((CDF_INIT_TOP - 3) >> 1)) /                       \
+               ((CDF_INIT_TOP - 3)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \
+                ((CDF_INIT_TOP - 3) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 3)) +                      \
+               2),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF4(a0, a1, a2)                                   \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) +     \
+            ((CDF_INIT_TOP - 4) >> 1)) /                       \
+               ((CDF_INIT_TOP - 4)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
+                ((CDF_INIT_TOP - 4) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 4)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
+                ((CDF_INIT_TOP - 4) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 4)) +                      \
+               3),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF5(a0, a1, a2, a3)                               \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) +     \
+            ((CDF_INIT_TOP - 5) >> 1)) /                       \
+               ((CDF_INIT_TOP - 5)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
+                ((CDF_INIT_TOP - 5) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 5)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
+                ((CDF_INIT_TOP - 5) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 5)) +                      \
+               3),                                             \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
+                ((CDF_INIT_TOP - 5) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 5)) +                      \
+               4),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF6(a0, a1, a2, a3, a4)                           \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) +     \
+            ((CDF_INIT_TOP - 6) >> 1)) /                       \
+               ((CDF_INIT_TOP - 6)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+                ((CDF_INIT_TOP - 6) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 6)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+                ((CDF_INIT_TOP - 6) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 6)) +                      \
+               3),                                             \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+                ((CDF_INIT_TOP - 6) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 6)) +                      \
+               4),                                             \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+                ((CDF_INIT_TOP - 6) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 6)) +                      \
+               5),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF7(a0, a1, a2, a3, a4, a5)                       \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) +     \
+            ((CDF_INIT_TOP - 7) >> 1)) /                       \
+               ((CDF_INIT_TOP - 7)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+                ((CDF_INIT_TOP - 7) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 7)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+                ((CDF_INIT_TOP - 7) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 7)) +                      \
+               3),                                             \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+                ((CDF_INIT_TOP - 7) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 7)) +                      \
+               4),                                             \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+                ((CDF_INIT_TOP - 7) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 7)) +                      \
+               5),                                             \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+                ((CDF_INIT_TOP - 7) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 7)) +                      \
+               6),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6)                   \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) +     \
+            ((CDF_INIT_TOP - 8) >> 1)) /                       \
+               ((CDF_INIT_TOP - 8)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               3),                                             \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               4),                                             \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               5),                                             \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               6),                                             \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               7),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7)               \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) +     \
+            ((CDF_INIT_TOP - 9) >> 1)) /                       \
+               ((CDF_INIT_TOP - 9)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               3),                                             \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               4),                                             \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               5),                                             \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               6),                                             \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               7),                                             \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               8),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8)           \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) +     \
+            ((CDF_INIT_TOP - 10) >> 1)) /                       \
+               ((CDF_INIT_TOP - 10)) +                          \
+           1)                                                   \
+  ,                                                             \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               2),                                              \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               3),                                              \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               4),                                              \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               5),                                              \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               6),                                              \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               7),                                              \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               8),                                              \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               9),                                              \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)        \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +      \
+            ((CDF_INIT_TOP - 11) >> 1)) /                        \
+               ((CDF_INIT_TOP - 11)) +                           \
+           1)                                                    \
+  ,                                                              \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               2),                                               \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               3),                                               \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               4),                                               \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               5),                                               \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               6),                                               \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               7),                                               \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               8),                                               \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               9),                                               \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               10),                                              \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)    \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +       \
+            ((CDF_INIT_TOP - 12) >> 1)) /                         \
+               ((CDF_INIT_TOP - 12)) +                            \
+           1)                                                     \
+  ,                                                               \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               2),                                                \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               3),                                                \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               4),                                                \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               5),                                                \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               6),                                                \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               7),                                                \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               8),                                                \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               9),                                                \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +  \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               10),                                               \
+      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               11),                                               \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +         \
+            ((CDF_INIT_TOP - 13) >> 1)) /                           \
+               ((CDF_INIT_TOP - 13)) +                              \
+           1)                                                       \
+  ,                                                                 \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               2),                                                  \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               3),                                                  \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               4),                                                  \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               5),                                                  \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               6),                                                  \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               7),                                                  \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               8),                                                  \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               9),                                                  \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +    \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               10),                                                 \
+      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +   \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               11),                                                 \
+      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +   \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               12),                                                 \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +              \
+            ((CDF_INIT_TOP - 14) >> 1)) /                                \
+               ((CDF_INIT_TOP - 14)) +                                   \
+           1)                                                            \
+  ,                                                                      \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               2),                                                       \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               3),                                                       \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               4),                                                       \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               5),                                                       \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               6),                                                       \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               7),                                                       \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               8),                                                       \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               9),                                                       \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +         \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               10),                                                      \
+      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +        \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               11),                                                      \
+      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +        \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               12),                                                      \
+      AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +        \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               13),                                                      \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +                   \
+            ((CDF_INIT_TOP - 15) >> 1)) /                                     \
+               ((CDF_INIT_TOP - 15)) +                                        \
+           1)                                                                 \
+  ,                                                                           \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               2),                                                            \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               3),                                                            \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               4),                                                            \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               5),                                                            \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               6),                                                            \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               7),                                                            \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               8),                                                            \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               9),                                                            \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +              \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               10),                                                           \
+      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               11),                                                           \
+      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               12),                                                           \
+      AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               13),                                                           \
+      AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               14),                                                           \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
+                  a14)                                                        \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +                   \
+            ((CDF_INIT_TOP - 16) >> 1)) /                                     \
+               ((CDF_INIT_TOP - 16)) +                                        \
+           1)                                                                 \
+  ,                                                                           \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               2),                                                            \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               3),                                                            \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               4),                                                            \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               5),                                                            \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               6),                                                            \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               7),                                                            \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               8),                                                            \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               9),                                                            \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +              \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               10),                                                           \
+      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               11),                                                           \
+      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               12),                                                           \
+      AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               13),                                                           \
+      AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               14),                                                           \
+      AOM_ICDF((((a14)-15) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               15),                                                           \
+      AOM_ICDF(CDF_PROB_TOP), 0
 
-#define BR_NODE 1
-
-#if CONFIG_ADAPT_SCAN
-#define CACHE_SCAN_PROB 1
 #endif
 
-#define aom_prob_half ((aom_prob)128)
-
-typedef int8_t aom_tree_index;
-
-#define TREE_SIZE(leaf_count) (-2 + 2 * (leaf_count))
-
-#define MODE_MV_COUNT_SAT 20
-
-/* We build coding trees compactly in arrays.
-   Each node of the tree is a pair of aom_tree_indices.
-   Array index often references a corresponding probability table.
-   Index <= 0 means done encoding/decoding and value = -Index,
-   Index > 0 means need another bit, specification at index.
-   Nonnegative indices are always even;  processing begins at node 0. */
-
-typedef const aom_tree_index aom_tree[];
-
-static INLINE aom_prob get_prob(unsigned int num, unsigned int den) {
+static INLINE uint8_t get_prob(unsigned int num, unsigned int den) {
   assert(den != 0);
   {
     const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den);
     // (p > 255) ? 255 : (p < 1) ? 1 : p;
     const int clipped_prob = p | ((255 - p) >> 23) | (p == 0);
-    return (aom_prob)clipped_prob;
+    return (uint8_t)clipped_prob;
   }
 }
 
-static INLINE aom_prob get_binary_prob(unsigned int n0, unsigned int n1) {
-  const unsigned int den = n0 + n1;
-  if (den == 0) return 128u;
-  return get_prob(n0, den);
-}
-
-/* This function assumes prob1 and prob2 are already within [1,255] range. */
-static INLINE aom_prob weighted_prob(int prob1, int prob2, int factor) {
-  return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
-}
-
-static INLINE aom_prob merge_probs(aom_prob pre_prob, const unsigned int ct[2],
-                                   unsigned int count_sat,
-                                   unsigned int max_update_factor) {
-  const aom_prob prob = get_binary_prob(ct[0], ct[1]);
-  const unsigned int count = AOMMIN(ct[0] + ct[1], count_sat);
-  const unsigned int factor = max_update_factor * count / count_sat;
-  return weighted_prob(pre_prob, prob, factor);
-}
-
-// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
-static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
-  0,  6,  12, 19, 25, 32,  38,  44,  51,  57, 64,
-  70, 76, 83, 89, 96, 102, 108, 115, 121, 128
-};
-
-static INLINE aom_prob mode_mv_merge_probs(aom_prob pre_prob,
-                                           const unsigned int ct[2]) {
-  const unsigned int den = ct[0] + ct[1];
-  if (den == 0) {
-    return pre_prob;
-  } else {
-    const unsigned int count = AOMMIN(den, MODE_MV_COUNT_SAT);
-    const unsigned int factor = count_to_update_factor[count];
-    const aom_prob prob = get_prob(ct[0], den);
-    return weighted_prob(pre_prob, prob, factor);
-  }
-}
-
-void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
-                          const unsigned int *counts, aom_prob *probs);
-
-int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
-                aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind,
-                int *pth, int *len);
-
-static INLINE void av1_tree_to_cdf(const aom_tree_index *tree,
-                                   const aom_prob *probs, aom_cdf_prob *cdf) {
-  aom_tree_index index[16];
-  int path[16];
-  int dist[16];
-  tree_to_cdf(tree, probs, 0, cdf, index, path, dist);
-}
-
-#define av1_tree_to_cdf_1D(tree, probs, cdf, u) \
-  do {                                          \
-    int i;                                      \
-    for (i = 0; i < u; i++) {                   \
-      av1_tree_to_cdf(tree, probs[i], cdf[i]);  \
-    }                                           \
-  } while (0)
-
-#define av1_tree_to_cdf_2D(tree, probs, cdf, v, u)     \
-  do {                                                 \
-    int j;                                             \
-    int i;                                             \
-    for (j = 0; j < v; j++) {                          \
-      for (i = 0; i < u; i++) {                        \
-        av1_tree_to_cdf(tree, probs[j][i], cdf[j][i]); \
-      }                                                \
-    }                                                  \
-  } while (0)
-
-void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree);
-
 static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
-  int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs);
-#if CONFIG_LV_MAP
-  if (nsymbs == 2)
-    rate = 4 + (cdf[nsymbs] > 7) + (cdf[nsymbs] > 15) + get_msb(nsymbs);
-#endif
-  const int rate2 = 5;
+  int rate;
   int i, tmp;
-  int diff;
-#if 1
-  const int tmp0 = 1 << rate2;
-  tmp = AOM_ICDF(tmp0);
-  diff = ((CDF_PROB_TOP - (nsymbs << rate2)) >> rate) << rate;
-// Single loop (faster)
-#if !CONFIG_ANS
-  for (i = 0; i < nsymbs - 1; ++i, tmp -= tmp0) {
-    tmp -= (i == val ? diff : 0);
-    cdf[i] += ((tmp - cdf[i]) >> rate);
-  }
-#else
-  for (i = 0; i < nsymbs - 1; ++i, tmp += tmp0) {
-    tmp += (i == val ? diff : 0);
-    cdf[i] -= ((cdf[i] - tmp) >> rate);
-  }
-#endif
-#else
-  for (i = 0; i < nsymbs; ++i) {
-    tmp = (i + 1) << rate2;
-    cdf[i] -= ((cdf[i] - tmp) >> rate);
-  }
-  diff = CDF_PROB_TOP - cdf[nsymbs - 1];
 
-  for (i = val; i < nsymbs; ++i) {
-    cdf[i] += diff;
+  static const int nsymbs2speed[17] = { 0, 0, 1, 1, 2, 2, 2, 2, 2,
+                                        2, 2, 2, 2, 2, 2, 2, 2 };
+  assert(nsymbs < 17);
+  rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31) +
+         nsymbs2speed[nsymbs];  // + get_msb(nsymbs);
+  tmp = AOM_ICDF(0);
+
+  // Single loop (faster)
+  for (i = 0; i < nsymbs - 1; ++i) {
+    tmp = (i == val) ? 0 : tmp;
+    if (tmp < cdf[i]) {
+      cdf[i] -= ((cdf[i] - tmp) >> rate);
+    } else {
+      cdf[i] += ((tmp - cdf[i]) >> rate);
+    }
   }
-#endif
   cdf[nsymbs] += (cdf[nsymbs] < 32);
 }
 
-#if CONFIG_LV_MAP
-static INLINE void update_bin(aom_cdf_prob *cdf, int val, int nsymbs) {
-  update_cdf(cdf, val, nsymbs);
-}
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c
index d543f12d1..37d3bb585 100644
--- a/third_party/aom/aom_dsp/psnr.c
+++ b/third_party/aom/aom_dsp/psnr.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <math.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/psnr.h"
 #include "aom_scale/yv12config.h"
 
@@ -26,8 +27,8 @@ double aom_sse_to_psnr(double samples, double peak, double sse) {
 }
 
 /* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
-* and highbd_8_variance(). It should not.
-*/
+ * and highbd_8_variance(). It should not.
+ */
 static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
                              int b_stride, int w, int h, unsigned int *sse,
                              int *sum) {
@@ -48,26 +49,26 @@ static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
                                       const uint8_t *b8, int b_stride, int w,
                                       int h, uint64_t *sse, int64_t *sum) {
-  int i, j;
-
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  int64_t tsum = 0;
+  uint64_t tsse = 0;
+  for (int i = 0; i < h; ++i) {
+    int32_t lsum = 0;
+    for (int j = 0; j < w; ++j) {
       const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
+      lsum += diff;
+      tsse += (uint32_t)(diff * diff);
     }
+    tsum += lsum;
     a += a_stride;
     b += b_stride;
   }
+  *sum = tsum;
+  *sse = tsse;
 }
 
 static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
@@ -80,7 +81,6 @@ static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
   *sse = (unsigned int)sse_long;
   *sum = (int)sum_long;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
                        int b_stride, int width, int height) {
@@ -122,7 +122,6 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
   return total_sse;
 }
 
-#if CONFIG_HIGHBITDEPTH
 static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
                                     const uint8_t *b8, int b_stride, int width,
                                     int height, unsigned int input_shift) {
@@ -175,7 +174,6 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
   }
   return total_sse;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                            const YV12_BUFFER_CONFIG *b, int hstart, int width,
@@ -228,7 +226,6 @@ int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a,
                  a->uv_crop_width, a->uv_crop_height);
 }
 
-#if CONFIG_HIGHBITDEPTH
 int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                                   const YV12_BUFFER_CONFIG *b, int hstart,
                                   int width, int vstart, int height) {
@@ -287,11 +284,9 @@ int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
   return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
                         a->uv_crop_width, a->uv_crop_height);
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, int plane, int highbd) {
-#if CONFIG_HIGHBITDEPTH
   if (highbd) {
     switch (plane) {
       case 0: return aom_highbd_get_y_sse(a, b);
@@ -300,7 +295,6 @@ int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
       default: assert(plane >= 0 && plane <= 2); return 0;
     }
   }
-#endif
   (void)highbd;
   switch (plane) {
     case 0: return aom_get_y_sse(a, b);
@@ -310,7 +304,6 @@ int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
                           uint32_t bit_depth, uint32_t in_bit_depth) {
@@ -356,8 +349,6 @@ void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
       aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
 }
 
-#endif  // !CONFIG_HIGHBITDEPTH
-
 void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                    PSNR_STATS *psnr) {
   static const double peak = 255.0;
diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h
index df5f8f9f2..8300b0a88 100644
--- a/third_party/aom/aom_dsp/psnr.h
+++ b/third_party/aom/aom_dsp/psnr.h
@@ -27,13 +27,13 @@ typedef struct {
 } PSNR_STATS;
 
 /*!\brief Converts SSE to PSNR
-*
-* Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR).
-*
-* \param[in]    samples       Number of samples
-* \param[in]    peak          Max sample value
-* \param[in]    sse           Sum of squared errors
-*/
+ *
+ * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR).
+ *
+ * \param[in]    samples       Number of samples
+ * \param[in]    peak          Max sample value
+ * \param[in]    sse           Sum of squared errors
+ */
 double aom_sse_to_psnr(double samples, double peak, double sse);
 int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                            const YV12_BUFFER_CONFIG *b, int hstart, int width,
@@ -49,7 +49,6 @@ int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
 int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
 int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, int plane, int highbd);
-#if CONFIG_HIGHBITDEPTH
 int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                                   const YV12_BUFFER_CONFIG *b, int hstart,
                                   int width, int vstart, int height);
@@ -68,7 +67,6 @@ int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
 void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
                           unsigned int bit_depth, unsigned int in_bit_depth);
-#endif
 void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                    PSNR_STATS *psnr);
 
diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c
index aeefd5908..30fe21d9c 100644
--- a/third_party/aom/aom_dsp/psnrhvs.c
+++ b/third_party/aom/aom_dsp/psnrhvs.c
@@ -17,17 +17,13 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/psnr.h"
 #include "aom_dsp/ssim.h"
 #include "aom_ports/system_state.h"
 
-#if !defined(M_PI)
-#define M_PI (3.141592653589793238462643)
-#endif
-#include <string.h>
-
 static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
                            int xstride) {
   int i, j;
@@ -38,7 +34,6 @@ static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
       *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
 }
 
-#if CONFIG_HIGHBITDEPTH
 static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
                                int xstride) {
   int i, j;
@@ -48,7 +43,6 @@ static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
     for (j = 0; j < 8; j++)
       *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
 }
-#endif
 
 /* Normalized inverse quantization matrix for 8x8 DCT at the point of
  * transparency. This is not the JPEG based matrix from the paper,
@@ -123,14 +117,16 @@ static double convert_score_db(double _score, double _weight, int bit_depth) {
 static double calc_psnrhvs(const unsigned char *src, int _systride,
                            const unsigned char *dst, int _dystride, double _par,
                            int _w, int _h, int _step, const double _csf[8][8],
-                           uint32_t bit_depth, uint32_t _shift) {
+                           uint32_t _shift, int buf_is_hbd) {
   double ret;
   const uint8_t *_src8 = src;
   const uint8_t *_dst8 = dst;
   const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
   const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
-  int16_t dct_s[8 * 8], dct_d[8 * 8];
-  tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8];
+  DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]);
+  DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]);
+  DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]);
+  DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]);
   double mask[8][8];
   int pixels;
   int x;
@@ -176,10 +172,10 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
       for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
           int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
-          if (bit_depth == 8 && _shift == 0) {
+          if (!buf_is_hbd) {
             dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
             dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
-          } else if (bit_depth == 10 || bit_depth == 12) {
+          } else {
             dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
             dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
           }
@@ -212,15 +208,12 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
         s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
       if (d_gvar > 0)
         d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
-#if CONFIG_HIGHBITDEPTH
-      if (bit_depth == 10 || bit_depth == 12) {
-        hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
-        hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
-      }
-#endif
-      if (bit_depth == 8) {
+      if (!buf_is_hbd) {
         od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
         od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+      } else {
+        hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+        hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
       }
       for (i = 0; i < 8; i++)
         for (j = (i == 0); j < 8; j++)
@@ -256,21 +249,24 @@ double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst,
   const int step = 7;
   uint32_t bd_shift = 0;
   aom_clear_system_state();
-
   assert(bd == 8 || bd == 10 || bd == 12);
   assert(bd >= in_bd);
+  assert(src->flags == dst->flags);
+  const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH;
 
   bd_shift = bd - in_bd;
 
-  *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer,
-                            dst->y_stride, par, src->y_crop_width,
-                            src->y_crop_height, step, csf_y, bd, bd_shift);
-  *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer,
-                            dst->uv_stride, par, src->uv_crop_width,
-                            src->uv_crop_height, step, csf_cb420, bd, bd_shift);
-  *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer,
-                            dst->uv_stride, par, src->uv_crop_width,
-                            src->uv_crop_height, step, csf_cr420, bd, bd_shift);
+  *y_psnrhvs = calc_psnrhvs(
+      src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, par,
+      src->y_crop_width, src->y_crop_height, step, csf_y, bd_shift, buf_is_hbd);
+  *u_psnrhvs =
+      calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+                   par, src->uv_crop_width, src->uv_crop_height, step,
+                   csf_cb420, bd_shift, buf_is_hbd);
+  *v_psnrhvs =
+      calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+                   par, src->uv_crop_width, src->uv_crop_height, step,
+                   csf_cr420, bd_shift, buf_is_hbd);
   psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
   return convert_score_db(psnrhvs, 1.0, in_bd);
 }
diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c
index 21bcc486a..e1601cc7d 100644
--- a/third_party/aom/aom_dsp/quantize.c
+++ b/third_party/aom/aom_dsp/quantize.c
@@ -66,7 +66,8 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
         const int dequant =
             (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
             AOM_QM_BITS;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale);
+        const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+        dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
 
         if (tmp32) eob = i;
       }
@@ -87,11 +88,7 @@ void highbd_quantize_b_helper_c(
                          ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
   int dequant;
-#if CONFIG_TX64X64
   int idx_arr[4096];
-#else
-  int idx_arr[1024];
-#endif
   (void)iscan;
   int idx = 0;
 
@@ -130,45 +127,14 @@ void highbd_quantize_b_helper_c(
       qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
       dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
                 AOM_QM_BITS;
-      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale);
+      const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+      dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
       if (abs_qcoeff) eob = idx_arr[i];
     }
   }
   *eob_ptr = eob + 1;
 }
 
-void quantize_dc_helper(const tran_low_t *coeff_ptr, int n_coeffs,
-                        int skip_block, const int16_t *round_ptr,
-                        const int16_t quant, tran_low_t *qcoeff_ptr,
-                        tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
-                        uint16_t *eob_ptr, const qm_val_t *qm_ptr,
-                        const qm_val_t *iqm_ptr, const int log_scale) {
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int64_t tmp;
-  int eob = -1;
-  int32_t tmp32;
-  int dequant;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
-                INT16_MIN, INT16_MAX);
-    tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS));
-    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-    dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-    dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale);
-    if (tmp32) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
 /* These functions should only be called when quantisation matrices
    are not used. */
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -195,7 +161,6 @@ void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                       dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1);
 }
 
-#if CONFIG_TX64X64
 void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block, const int16_t *zbin_ptr,
                             const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -207,34 +172,6 @@ void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                       quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
                       dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2);
 }
-#endif  // CONFIG_TX64X64
-
-void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  quantize_dc_helper(coeff_ptr, n_coeffs, skip_block, round_ptr, quant,
-                     qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL,
-                     0);
-}
-
-void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  quantize_dc_helper(coeff_ptr, 1024, skip_block, round_ptr, quant, qcoeff_ptr,
-                     dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 1);
-}
-
-#if CONFIG_TX64X64
-void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  quantize_dc_helper(coeff_ptr, 4096, skip_block, round_ptr, quant, qcoeff_ptr,
-                     dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 2);
-}
-#endif  // CONFIG_TX64X64
 
 void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              int skip_block, const int16_t *zbin_ptr,
@@ -261,7 +198,6 @@ void aom_highbd_quantize_b_32x32_c(
                              NULL, NULL, 1);
 }
 
-#if CONFIG_TX64X64
 void aom_highbd_quantize_b_64x64_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
     const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -273,4 +209,3 @@ void aom_highbd_quantize_b_64x64_c(
                              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                              NULL, NULL, 2);
 }
-#endif  // CONFIG_TX64X64
diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h
index 03609e8b4..56d50b929 100644
--- a/third_party/aom/aom_dsp/quantize.h
+++ b/third_party/aom/aom_dsp/quantize.h
@@ -12,7 +12,8 @@
 #ifndef AOM_DSP_QUANTIZE_H_
 #define AOM_DSP_QUANTIZE_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 
 #ifdef __cplusplus
@@ -44,7 +45,6 @@ void highbd_quantize_b_helper_c(
     const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
     const qm_val_t *iqm_ptr, const int log_scale);
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              int skip_block, const int16_t *zbin_ptr,
                              const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -52,69 +52,6 @@ void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan);
-#endif
-
-void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant_ptr,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr);
-void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
-#if CONFIG_TX64X64
-void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
-#endif  // CONFIG_TX64X64
-
-#if CONFIG_AOM_QM
-#if CONFIG_HIGHBITDEPTH
-void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
-                            uint16_t *eob_ptr, const qm_val_t *qm_ptr,
-                            const qm_val_t *iqm_ptr);
-void aom_highbd_quantize_dc_32x32(
-    const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr,
-    const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-    const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
-    const qm_val_t *iqm_ptr);
-#if CONFIG_TX64X64
-void aom_highbd_quantize_dc_64x64(
-    const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr,
-    const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-    const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
-    const qm_val_t *iqm_ptr);
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_HIGHBITDEPTH
-
-#else  // CONFIG_AOM_QM
-
-#if CONFIG_HIGHBITDEPTH
-void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
-                            uint16_t *eob_ptr);
-void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant_ptr,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr, uint16_t *eob_ptr);
-#if CONFIG_TX64X64
-void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant_ptr,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr, uint16_t *eob_ptr);
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AOM_QM
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c
index 6b8ca669b..ede4c583b 100644
--- a/third_party/aom/aom_dsp/sad.c
+++ b/third_party/aom/aom_dsp/sad.c
@@ -11,8 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
@@ -33,32 +33,35 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
   return sad;
 }
 
-#define sadMxN(m, n)                                                        \
-  unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride,     \
-                                    const uint8_t *ref, int ref_stride) {   \
-    return sad(src, src_stride, ref, ref_stride, m, n);                     \
-  }                                                                         \
-  unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
-                                        const uint8_t *ref, int ref_stride, \
-                                        const uint8_t *second_pred) {       \
-    uint8_t comp_pred[m * n];                                               \
-    aom_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride);     \
-    return sad(src, src_stride, comp_pred, m, m, n);                        \
+#define sadMxh(m)                                                          \
+  unsigned int aom_sad##m##xh_c(const uint8_t *a, int a_stride,            \
+                                const uint8_t *b, int b_stride, int width, \
+                                int height) {                              \
+    return sad(a, a_stride, b, b_stride, width, height);                   \
   }
 
-// depending on call sites, pass **ref_array to avoid & in subsequent call and
-// de-dup with 4D below.
-#define sadMxNxK(m, n, k)                                                   \
-  void aom_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride,       \
-                                  const uint8_t *ref_array, int ref_stride, \
-                                  uint32_t *sad_array) {                    \
-    int i;                                                                  \
-    for (i = 0; i < k; ++i)                                                 \
-      sad_array[i] =                                                        \
-          aom_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
+#define sadMxN(m, n)                                                          \
+  unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride,       \
+                                    const uint8_t *ref, int ref_stride) {     \
+    return sad(src, src_stride, ref, ref_stride, m, n);                       \
+  }                                                                           \
+  unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride,   \
+                                        const uint8_t *ref, int ref_stride,   \
+                                        const uint8_t *second_pred) {         \
+    uint8_t comp_pred[m * n];                                                 \
+    aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride);         \
+    return sad(src, src_stride, comp_pred, m, m, n);                          \
+  }                                                                           \
+  unsigned int aom_jnt_sad##m##x##n##_avg_c(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint8_t comp_pred[m * n];                                                 \
+    aom_jnt_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride,    \
+                            jcp_param);                                       \
+    return sad(src, src_stride, comp_pred, m, m, n);                          \
   }
 
-// This appears to be equivalent to the above when k == 4 and refs is const
+// Calculate sad against 4 reference locations and store each in sad_array
 #define sadMxNx4D(m, n)                                                    \
   void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,         \
                                const uint8_t *const ref_array[],           \
@@ -70,11 +73,8 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
   }
 
 /* clang-format off */
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
 // 128x128
 sadMxN(128, 128)
-sadMxNxK(128, 128, 3)
-sadMxNxK(128, 128, 8)
 sadMxNx4D(128, 128)
 
 // 128x64
@@ -84,12 +84,9 @@ sadMxNx4D(128, 64)
 // 64x128
 sadMxN(64, 128)
 sadMxNx4D(64, 128)
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 
 // 64x64
 sadMxN(64, 64)
-sadMxNxK(64, 64, 3)
-sadMxNxK(64, 64, 8)
 sadMxNx4D(64, 64)
 
 // 64x32
@@ -102,8 +99,6 @@ sadMxNx4D(32, 64)
 
 // 32x32
 sadMxN(32, 32)
-sadMxNxK(32, 32, 3)
-sadMxNxK(32, 32, 8)
 sadMxNx4D(32, 32)
 
 // 32x16
@@ -116,45 +111,39 @@ sadMxNx4D(16, 32)
 
 // 16x16
 sadMxN(16, 16)
-sadMxNxK(16, 16, 3)
-sadMxNxK(16, 16, 8)
 sadMxNx4D(16, 16)
 
 // 16x8
 sadMxN(16, 8)
-sadMxNxK(16, 8, 3)
-sadMxNxK(16, 8, 8)
 sadMxNx4D(16, 8)
 
 // 8x16
 sadMxN(8, 16)
-sadMxNxK(8, 16, 3)
-sadMxNxK(8, 16, 8)
 sadMxNx4D(8, 16)
 
 // 8x8
 sadMxN(8, 8)
-sadMxNxK(8, 8, 3)
-sadMxNxK(8, 8, 8)
 sadMxNx4D(8, 8)
 
 // 8x4
 sadMxN(8, 4)
-sadMxNxK(8, 4, 8)
 sadMxNx4D(8, 4)
 
 // 4x8
 sadMxN(4, 8)
-sadMxNxK(4, 8, 8)
 sadMxNx4D(4, 8)
 
 // 4x4
 sadMxN(4, 4)
-sadMxNxK(4, 4, 3)
-sadMxNxK(4, 4, 8)
 sadMxNx4D(4, 4)
 
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
+sadMxh(128);
+sadMxh(64);
+sadMxh(32);
+sadMxh(16);
+sadMxh(8);
+sadMxh(4);
+
 sadMxN(4, 16)
 sadMxNx4D(4, 16)
 sadMxN(16, 4)
@@ -167,15 +156,10 @@ sadMxN(16, 64)
 sadMxNx4D(16, 64)
 sadMxN(64, 16)
 sadMxNx4D(64, 16)
-sadMxN(32, 128)
-sadMxNx4D(32, 128)
-sadMxN(128, 32)
-sadMxNx4D(128, 32)
-#endif
-/* clang-format on */
-
-#if CONFIG_HIGHBITDEPTH
-                            static INLINE
+
+    /* clang-format on */
+
+    static INLINE
     unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
                             int b_stride, int width, int height) {
   int y, x;
@@ -216,19 +200,16 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
       const uint8_t *second_pred) {                                            \
     uint16_t comp_pred[m * n];                                                 \
-    aom_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
+    aom_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride);   \
+    return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
+  }                                                                            \
+  unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {          \
+    uint16_t comp_pred[m * n];                                                 \
+    aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref,            \
+                                 ref_stride, jcp_param);                       \
     return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
-  }
-
-#define highbd_sadMxNxK(m, n, k)                                             \
-  void aom_highbd_sad##m##x##n##x##k##_c(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *ref_array,          \
-      int ref_stride, uint32_t *sad_array) {                                 \
-    int i;                                                                   \
-    for (i = 0; i < k; ++i) {                                                \
-      sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride,            \
-                                                 &ref_array[i], ref_stride); \
-    }                                                                        \
   }
 
 #define highbd_sadMxNx4D(m, n)                                               \
@@ -243,11 +224,8 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
   }
 
 /* clang-format off */
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
 // 128x128
 highbd_sadMxN(128, 128)
-highbd_sadMxNxK(128, 128, 3)
-highbd_sadMxNxK(128, 128, 8)
 highbd_sadMxNx4D(128, 128)
 
 // 128x64
@@ -257,12 +235,9 @@ highbd_sadMxNx4D(128, 64)
 // 64x128
 highbd_sadMxN(64, 128)
 highbd_sadMxNx4D(64, 128)
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 
 // 64x64
 highbd_sadMxN(64, 64)
-highbd_sadMxNxK(64, 64, 3)
-highbd_sadMxNxK(64, 64, 8)
 highbd_sadMxNx4D(64, 64)
 
 // 64x32
@@ -275,8 +250,6 @@ highbd_sadMxNx4D(32, 64)
 
 // 32x32
 highbd_sadMxN(32, 32)
-highbd_sadMxNxK(32, 32, 3)
-highbd_sadMxNxK(32, 32, 8)
 highbd_sadMxNx4D(32, 32)
 
 // 32x16
@@ -289,45 +262,32 @@ highbd_sadMxNx4D(16, 32)
 
 // 16x16
 highbd_sadMxN(16, 16)
-highbd_sadMxNxK(16, 16, 3)
-highbd_sadMxNxK(16, 16, 8)
 highbd_sadMxNx4D(16, 16)
 
 // 16x8
 highbd_sadMxN(16, 8)
-highbd_sadMxNxK(16, 8, 3)
-highbd_sadMxNxK(16, 8, 8)
 highbd_sadMxNx4D(16, 8)
 
 // 8x16
 highbd_sadMxN(8, 16)
-highbd_sadMxNxK(8, 16, 3)
-highbd_sadMxNxK(8, 16, 8)
 highbd_sadMxNx4D(8, 16)
 
 // 8x8
 highbd_sadMxN(8, 8)
-highbd_sadMxNxK(8, 8, 3)
-highbd_sadMxNxK(8, 8, 8)
 highbd_sadMxNx4D(8, 8)
 
 // 8x4
 highbd_sadMxN(8, 4)
-highbd_sadMxNxK(8, 4, 8)
 highbd_sadMxNx4D(8, 4)
 
 // 4x8
 highbd_sadMxN(4, 8)
-highbd_sadMxNxK(4, 8, 8)
 highbd_sadMxNx4D(4, 8)
 
 // 4x4
 highbd_sadMxN(4, 4)
-highbd_sadMxNxK(4, 4, 3)
-highbd_sadMxNxK(4, 4, 8)
 highbd_sadMxNx4D(4, 4)
 
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
 highbd_sadMxN(4, 16)
 highbd_sadMxNx4D(4, 16)
 highbd_sadMxN(16, 4)
@@ -340,277 +300,4 @@ highbd_sadMxN(16, 64)
 highbd_sadMxNx4D(16, 64)
 highbd_sadMxN(64, 16)
 highbd_sadMxNx4D(64, 16)
-highbd_sadMxN(32, 128)
-highbd_sadMxNx4D(32, 128)
-highbd_sadMxN(128, 32)
-highbd_sadMxNx4D(128, 32)
-#endif
-/* clang-format on */
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_AV1
-                                                static INLINE
-    unsigned int masked_sad(const uint8_t *src, int src_stride,
-                            const uint8_t *a, int a_stride, const uint8_t *b,
-                            int b_stride, const uint8_t *m, int m_stride,
-                            int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      const uint8_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
-      sad += abs(pred - src[x]);
-    }
-
-    src += src_stride;
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  sad = (sad + 31) >> 6;
-
-  return sad;
-}
-
-#define MASKSADMxN(m, n)                                                       \
-  unsigned int aom_masked_sad##m##x##n##_c(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,          \
-      int invert_mask) {                                                       \
-    if (!invert_mask)                                                          \
-      return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
-                        msk_stride, m, n);                                     \
-    else                                                                       \
-      return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
-                        msk_stride, m, n);                                     \
-  }
-
-/* clang-format off */
-#if CONFIG_EXT_PARTITION
-MASKSADMxN(128, 128)
-MASKSADMxN(128, 64)
-MASKSADMxN(64, 128)
-#endif  // CONFIG_EXT_PARTITION
-MASKSADMxN(64, 64)
-MASKSADMxN(64, 32)
-MASKSADMxN(32, 64)
-MASKSADMxN(32, 32)
-MASKSADMxN(32, 16)
-MASKSADMxN(16, 32)
-MASKSADMxN(16, 16)
-MASKSADMxN(16, 8)
-MASKSADMxN(8, 16)
-MASKSADMxN(8, 8)
-MASKSADMxN(8, 4)
-MASKSADMxN(4, 8)
-MASKSADMxN(4, 4)
-
-#if CONFIG_EXT_PARTITION_TYPES
-MASKSADMxN(4, 16)
-MASKSADMxN(16, 4)
-MASKSADMxN(8, 32)
-MASKSADMxN(32, 8)
-MASKSADMxN(16, 64)
-MASKSADMxN(64, 16)
-MASKSADMxN(32, 128)
-MASKSADMxN(128, 32)
-#endif
-/* clang-format on */
-
-#if CONFIG_HIGHBITDEPTH
-                                static INLINE
-    unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
-                                   const uint8_t *a8, int a_stride,
-                                   const uint8_t *b8, int b_stride,
-                                   const uint8_t *m, int m_stride, int width,
-                                   int height) {
-  int y, x;
-  unsigned int sad = 0;
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
-      sad += abs(pred - src[x]);
-    }
-
-    src += src_stride;
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  sad = (sad + 31) >> 6;
-
-  return sad;
-}
-
-#define HIGHBD_MASKSADMXN(m, n)                                         \
-  unsigned int aom_highbd_masked_sad##m##x##n##_c(                      \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,         \
-      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,  \
-      int msk_stride, int invert_mask) {                                \
-    if (!invert_mask)                                                   \
-      return highbd_masked_sad(src8, src_stride, ref8, ref_stride,      \
-                               second_pred8, m, msk, msk_stride, m, n); \
-    else                                                                \
-      return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \
-                               ref_stride, msk, msk_stride, m, n);      \
-  }
-
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASKSADMXN(128, 128)
-HIGHBD_MASKSADMXN(128, 64)
-HIGHBD_MASKSADMXN(64, 128)
-#endif  // CONFIG_EXT_PARTITION
-HIGHBD_MASKSADMXN(64, 64)
-HIGHBD_MASKSADMXN(64, 32)
-HIGHBD_MASKSADMXN(32, 64)
-HIGHBD_MASKSADMXN(32, 32)
-HIGHBD_MASKSADMXN(32, 16)
-HIGHBD_MASKSADMXN(16, 32)
-HIGHBD_MASKSADMXN(16, 16)
-HIGHBD_MASKSADMXN(16, 8)
-HIGHBD_MASKSADMXN(8, 16)
-HIGHBD_MASKSADMXN(8, 8)
-HIGHBD_MASKSADMXN(8, 4)
-HIGHBD_MASKSADMXN(4, 8)
-HIGHBD_MASKSADMXN(4, 4)
-
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
-HIGHBD_MASKSADMXN(4, 16)
-HIGHBD_MASKSADMXN(16, 4)
-HIGHBD_MASKSADMXN(8, 32)
-HIGHBD_MASKSADMXN(32, 8)
-HIGHBD_MASKSADMXN(16, 64)
-HIGHBD_MASKSADMXN(64, 16)
-HIGHBD_MASKSADMXN(32, 128)
-HIGHBD_MASKSADMXN(128, 32)
-#endif
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AV1
-
-#if CONFIG_AV1 && CONFIG_MOTION_VAR
-// pre: predictor being evaluated
-// wsrc: target weighted prediction (has been *4096 to keep precision)
-// mask: 2d weights (scaled by 4096)
-static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
-                                    const int32_t *wsrc, const int32_t *mask,
-                                    int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++)
-      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
-
-    pre += pre_stride;
-    wsrc += width;
-    mask += width;
-  }
-
-  return sad;
-}
-
-#define OBMCSADMxN(m, n)                                                     \
-  unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
-                                         const int32_t *wsrc,                \
-                                         const int32_t *mask) {              \
-    return obmc_sad(ref, ref_stride, wsrc, mask, m, n);                      \
-  }
-
-/* clang-format off */
-#if CONFIG_EXT_PARTITION
-OBMCSADMxN(128, 128)
-OBMCSADMxN(128, 64)
-OBMCSADMxN(64, 128)
-#endif  // CONFIG_EXT_PARTITION
-OBMCSADMxN(64, 64)
-OBMCSADMxN(64, 32)
-OBMCSADMxN(32, 64)
-OBMCSADMxN(32, 32)
-OBMCSADMxN(32, 16)
-OBMCSADMxN(16, 32)
-OBMCSADMxN(16, 16)
-OBMCSADMxN(16, 8)
-OBMCSADMxN(8, 16)
-OBMCSADMxN(8, 8)
-OBMCSADMxN(8, 4)
-OBMCSADMxN(4, 8)
-OBMCSADMxN(4, 4)
-
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
-OBMCSADMxN(4, 16)
-OBMCSADMxN(16, 4)
-OBMCSADMxN(8, 32)
-OBMCSADMxN(32, 8)
-OBMCSADMxN(16, 64)
-OBMCSADMxN(64, 16)
-OBMCSADMxN(32, 128)
-OBMCSADMxN(128, 32)
-#endif
-/* clang-format on */
-
-#if CONFIG_HIGHBITDEPTH
-                                static INLINE
-    unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
-                                 const int32_t *wsrc, const int32_t *mask,
-                                 int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++)
-      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
-
-    pre += pre_stride;
-    wsrc += width;
-    mask += width;
-  }
-
-  return sad;
-}
-
-#define HIGHBD_OBMCSADMXN(m, n)                                \
-  unsigned int aom_highbd_obmc_sad##m##x##n##_c(               \
-      const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
-      const int32_t *mask) {                                   \
-    return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
-  }
-
-/* clang-format off */
-#if CONFIG_EXT_PARTITION
-HIGHBD_OBMCSADMXN(128, 128)
-HIGHBD_OBMCSADMXN(128, 64)
-HIGHBD_OBMCSADMXN(64, 128)
-#endif  // CONFIG_EXT_PARTITION
-HIGHBD_OBMCSADMXN(64, 64)
-HIGHBD_OBMCSADMXN(64, 32)
-HIGHBD_OBMCSADMXN(32, 64)
-HIGHBD_OBMCSADMXN(32, 32)
-HIGHBD_OBMCSADMXN(32, 16)
-HIGHBD_OBMCSADMXN(16, 32)
-HIGHBD_OBMCSADMXN(16, 16)
-HIGHBD_OBMCSADMXN(16, 8)
-HIGHBD_OBMCSADMXN(8, 16)
-HIGHBD_OBMCSADMXN(8, 8)
-HIGHBD_OBMCSADMXN(8, 4)
-HIGHBD_OBMCSADMXN(4, 8)
-HIGHBD_OBMCSADMXN(4, 4)
-
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
-HIGHBD_OBMCSADMXN(4, 16)
-HIGHBD_OBMCSADMXN(16, 4)
-HIGHBD_OBMCSADMXN(8, 32)
-HIGHBD_OBMCSADMXN(32, 8)
-HIGHBD_OBMCSADMXN(16, 64)
-HIGHBD_OBMCSADMXN(64, 16)
-HIGHBD_OBMCSADMXN(32, 128)
-HIGHBD_OBMCSADMXN(128, 32)
-#endif
-/* clang-format on */
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AV1 && CONFIG_MOTION_VAR
+    /* clang-format on */
diff --git a/third_party/aom/aom_dsp/sad_av1.c b/third_party/aom/aom_dsp/sad_av1.c
new file mode 100644
index 000000000..c176001d6
--- /dev/null
+++ b/third_party/aom/aom_dsp/sad_av1.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride,
+                                      const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride,
+                                      const uint8_t *m, int m_stride, int width,
+                                      int height) {
+  int y, x;
+  unsigned int sad = 0;
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+      sad += abs(pred - src[x]);
+    }
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  sad = (sad + 31) >> 6;
+  return sad;
+}
+
+#define MASKSADMxN(m, n)                                                       \
+  unsigned int aom_masked_sad##m##x##n##_c(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,          \
+      int invert_mask) {                                                       \
+    if (!invert_mask)                                                          \
+      return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
+                        msk_stride, m, n);                                     \
+    else                                                                       \
+      return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
+                        msk_stride, m, n);                                     \
+  }
+
+/* clang-format off */
+MASKSADMxN(128, 128)
+MASKSADMxN(128, 64)
+MASKSADMxN(64, 128)
+MASKSADMxN(64, 64)
+MASKSADMxN(64, 32)
+MASKSADMxN(32, 64)
+MASKSADMxN(32, 32)
+MASKSADMxN(32, 16)
+MASKSADMxN(16, 32)
+MASKSADMxN(16, 16)
+MASKSADMxN(16, 8)
+MASKSADMxN(8, 16)
+MASKSADMxN(8, 8)
+MASKSADMxN(8, 4)
+MASKSADMxN(4, 8)
+MASKSADMxN(4, 4)
+MASKSADMxN(4, 16)
+MASKSADMxN(16, 4)
+MASKSADMxN(8, 32)
+MASKSADMxN(32, 8)
+MASKSADMxN(16, 64)
+MASKSADMxN(64, 16)
+
+    /* clang-format on */
+
+    static INLINE
+    unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
+                                   const uint8_t *a8, int a_stride,
+                                   const uint8_t *b8, int b_stride,
+                                   const uint8_t *m, int m_stride, int width,
+                                   int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+      sad += abs(pred - src[x]);
+    }
+
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  sad = (sad + 31) >> 6;
+
+  return sad;
+}
+
+#define HIGHBD_MASKSADMXN(m, n)                                         \
+  unsigned int aom_highbd_masked_sad##m##x##n##_c(                      \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,         \
+      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,  \
+      int msk_stride, int invert_mask) {                                \
+    if (!invert_mask)                                                   \
+      return highbd_masked_sad(src8, src_stride, ref8, ref_stride,      \
+                               second_pred8, m, msk, msk_stride, m, n); \
+    else                                                                \
+      return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \
+                               ref_stride, msk, msk_stride, m, n);      \
+  }
+
+HIGHBD_MASKSADMXN(128, 128)
+HIGHBD_MASKSADMXN(128, 64)
+HIGHBD_MASKSADMXN(64, 128)
+HIGHBD_MASKSADMXN(64, 64)
+HIGHBD_MASKSADMXN(64, 32)
+HIGHBD_MASKSADMXN(32, 64)
+HIGHBD_MASKSADMXN(32, 32)
+HIGHBD_MASKSADMXN(32, 16)
+HIGHBD_MASKSADMXN(16, 32)
+HIGHBD_MASKSADMXN(16, 16)
+HIGHBD_MASKSADMXN(16, 8)
+HIGHBD_MASKSADMXN(8, 16)
+HIGHBD_MASKSADMXN(8, 8)
+HIGHBD_MASKSADMXN(8, 4)
+HIGHBD_MASKSADMXN(4, 8)
+HIGHBD_MASKSADMXN(4, 4)
+HIGHBD_MASKSADMXN(4, 16)
+HIGHBD_MASKSADMXN(16, 4)
+HIGHBD_MASKSADMXN(8, 32)
+HIGHBD_MASKSADMXN(32, 8)
+HIGHBD_MASKSADMXN(16, 64)
+HIGHBD_MASKSADMXN(64, 16)
+
+// pre: predictor being evaluated
+// wsrc: target weighted prediction (has been *4096 to keep precision)
+// mask: 2d weights (scaled by 4096)
+static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
+                                    const int32_t *wsrc, const int32_t *mask,
+                                    int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+    pre += pre_stride;
+    wsrc += width;
+    mask += width;
+  }
+
+  return sad;
+}
+
+#define OBMCSADMxN(m, n)                                                     \
+  unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
+                                         const int32_t *wsrc,                \
+                                         const int32_t *mask) {              \
+    return obmc_sad(ref, ref_stride, wsrc, mask, m, n);                      \
+  }
+
+/* clang-format off */
+OBMCSADMxN(128, 128)
+OBMCSADMxN(128, 64)
+OBMCSADMxN(64, 128)
+OBMCSADMxN(64, 64)
+OBMCSADMxN(64, 32)
+OBMCSADMxN(32, 64)
+OBMCSADMxN(32, 32)
+OBMCSADMxN(32, 16)
+OBMCSADMxN(16, 32)
+OBMCSADMxN(16, 16)
+OBMCSADMxN(16, 8)
+OBMCSADMxN(8, 16)
+OBMCSADMxN(8, 8)
+OBMCSADMxN(8, 4)
+OBMCSADMxN(4, 8)
+OBMCSADMxN(4, 4)
+OBMCSADMxN(4, 16)
+OBMCSADMxN(16, 4)
+OBMCSADMxN(8, 32)
+OBMCSADMxN(32, 8)
+OBMCSADMxN(16, 64)
+OBMCSADMxN(64, 16)
+    /* clang-format on */
+
+    static INLINE
+    unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
+                                 const int32_t *wsrc, const int32_t *mask,
+                                 int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+    pre += pre_stride;
+    wsrc += width;
+    mask += width;
+  }
+
+  return sad;
+}
+
+#define HIGHBD_OBMCSADMXN(m, n)                                \
+  unsigned int aom_highbd_obmc_sad##m##x##n##_c(               \
+      const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+      const int32_t *mask) {                                   \
+    return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
+  }
+
+/* clang-format off */
+HIGHBD_OBMCSADMXN(128, 128)
+HIGHBD_OBMCSADMXN(128, 64)
+HIGHBD_OBMCSADMXN(64, 128)
+HIGHBD_OBMCSADMXN(64, 64)
+HIGHBD_OBMCSADMXN(64, 32)
+HIGHBD_OBMCSADMXN(32, 64)
+HIGHBD_OBMCSADMXN(32, 32)
+HIGHBD_OBMCSADMXN(32, 16)
+HIGHBD_OBMCSADMXN(16, 32)
+HIGHBD_OBMCSADMXN(16, 16)
+HIGHBD_OBMCSADMXN(16, 8)
+HIGHBD_OBMCSADMXN(8, 16)
+HIGHBD_OBMCSADMXN(8, 8)
+HIGHBD_OBMCSADMXN(8, 4)
+HIGHBD_OBMCSADMXN(4, 8)
+HIGHBD_OBMCSADMXN(4, 4)
+HIGHBD_OBMCSADMXN(4, 16)
+HIGHBD_OBMCSADMXN(16, 4)
+HIGHBD_OBMCSADMXN(8, 32)
+HIGHBD_OBMCSADMXN(32, 8)
+HIGHBD_OBMCSADMXN(16, 64)
+HIGHBD_OBMCSADMXN(64, 16)
+/* clang-format on */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
index 8f6509383..51a38a7e1 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
@@ -15,8 +15,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./v128_intrinsics_c.h"
-#include "./v64_intrinsics.h"
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
 
 /* Fallback to plain, unoptimised C. */
 
@@ -57,6 +58,7 @@ SIMD_INLINE v128 v128_zero() { return c_v128_zero(); }
 SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
 SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
 SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
+SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
 
 typedef uint32_t sad128_internal;
 SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
@@ -74,9 +76,15 @@ SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
 SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
   return c_v128_ssd_u8_sum(s);
 }
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  return c_v128_dotp_su8(a, b);
+}
 SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
   return c_v128_dotp_s16(a, b);
 }
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  return c_v128_dotp_s32(a, b);
+}
 SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
 
 SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
@@ -86,8 +94,12 @@ SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); }
 
 SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); }
 SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); }
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); }
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); }
 SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); }
 SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); }
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); }
+SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); }
 SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); }
 SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); }
 SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
@@ -96,6 +108,7 @@ SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); }
 SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
 SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
 SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); }
 SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
 SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); }
 
@@ -112,8 +125,16 @@ SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
 SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); }
 SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); }
 
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); }
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+  return c_v128_blend_8(a, b, c);
+}
+
 SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); }
 SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); }
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+  return c_v128_rdavg_u16(a, b);
+}
 SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); }
 SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); }
 SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); }
@@ -121,6 +142,8 @@ SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); }
 SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); }
 SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); }
 SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); }
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); }
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); }
 
 SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); }
 SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); }
@@ -168,6 +191,9 @@ SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
 SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
   return c_v128_pack_s32_s16(a, b);
 }
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+  return c_v128_pack_s32_u16(a, b);
+}
 SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
   return c_v128_pack_s16_u8(a, b);
 }
@@ -203,6 +229,14 @@ SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
 }
 SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); }
 
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+  return c_v128_cmpgt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+  return c_v128_cmplt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); }
+
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
   return c_v128_shl_8(a, c);
 }
@@ -230,6 +264,15 @@ SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
 SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
   return c_v128_shr_s32(a, c);
 }
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+  return c_v128_shl_64(a, c);
+}
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+  return c_v128_shr_u64(a, c);
+}
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+  return c_v128_shr_s64(a, c);
+}
 
 SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
   return c_v128_shr_n_byte(a, n);
@@ -246,6 +289,9 @@ SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) {
 SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) {
   return c_v128_shl_n_32(a, n);
 }
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) {
+  return c_v128_shl_n_64(a, n);
+}
 SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) {
   return c_v128_shr_n_u8(a, n);
 }
@@ -255,6 +301,9 @@ SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) {
 SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) {
   return c_v128_shr_n_u32(a, n);
 }
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) {
+  return c_v128_shr_n_u64(a, n);
+}
 SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) {
   return c_v128_shr_n_s8(a, n);
 }
@@ -264,5 +313,32 @@ SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) {
 SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) {
   return c_v128_shr_n_s32(a, n);
 }
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) {
+  return c_v128_shr_n_s64(a, n);
+}
+
+typedef uint32_t sad128_internal_u16;
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() {
+  return c_v128_sad_u16_init();
+}
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+                                             v128 b) {
+  return c_v128_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+  return c_v128_sad_u16_sum(s);
+}
+
+typedef uint64_t ssd128_internal_s16;
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() {
+  return c_v128_ssd_s16_init();
+}
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  return c_v128_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  return c_v128_ssd_s16_sum(s);
+}
 
 #endif /* _V128_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
index 0377d4ce1..d4fec4237 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
@@ -13,7 +13,8 @@
 #define _V128_INTRINSICS_H
 
 #include <arm_neon.h>
-#include "./v64_intrinsics_arm.h"
+
+#include "aom_dsp/simd/v64_intrinsics_arm.h"
 
 typedef int64x2_t v128;
 
@@ -28,7 +29,7 @@ SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
 SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
 
 SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
-  return vcombine_s64((uint64x1_t)b, (uint64x1_t)a);
+  return vcombine_s64((int64x1_t)b, (int64x1_t)a);
 }
 
 SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
@@ -83,22 +84,57 @@ SIMD_INLINE v128 v128_dup_32(uint32_t x) {
   return vreinterpretq_s64_u32(vdupq_n_u32(x));
 }
 
+SIMD_INLINE v128 v128_dup_64(uint64_t x) {
+  return vreinterpretq_s64_u64(vdupq_n_u64(x));
+}
+
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  int16x8_t t1 = vmulq_s16(
+      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))),
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b)))));
+  int16x8_t t2 = vmulq_s16(
+      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))),
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b)))));
+#if defined(__aarch64__)
+  return vaddlvq_s16(t1) + vaddlvq_s16(t2);
+#else
+  int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
+  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+#endif
+}
+
 SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
   return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
          v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
 }
 
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  int64x2_t t = vpaddlq_s32(
+      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
+  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+}
+
 SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
+#if defined(__aarch64__)
+  return vaddlvq_u8(vreinterpretq_u8_s64(x));
+#else
   uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
   return vget_lane_s32(
       vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
+#endif
 }
 
 SIMD_INLINE v128 v128_padd_s16(v128 a) {
   return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
 }
 
-typedef struct { sad64_internal hi, lo; } sad128_internal;
+SIMD_INLINE v128 v128_padd_u8(v128 a) {
+  return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a)));
+}
+
+typedef struct {
+  sad64_internal hi, lo;
+} sad128_internal;
 
 SIMD_INLINE sad128_internal v128_sad_u8_init() {
   sad128_internal s;
@@ -117,14 +153,21 @@ SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
 }
 
 SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
-  return (uint32_t)(v64_sad_u8_sum(s.hi) + v64_sad_u8_sum(s.lo));
+#if defined(__aarch64__)
+  return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
+#else
+  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
+  return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t));
+#endif
 }
 
-typedef struct { ssd64_internal hi, lo; } ssd128_internal;
+typedef struct {
+  ssd64_internal hi, lo;
+} ssd128_internal;
 
 SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
   ssd128_internal s;
-  s.hi = s.lo = (ssd64_internal)(uint64_t)0;
+  s.hi = s.lo = v64_ssd_u8_init();
   return s;
 }
 
@@ -154,6 +197,16 @@ SIMD_INLINE v128 v128_add_8(v128 x, v128 y) {
       vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
 }
 
+SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) {
+  return vreinterpretq_s64_u8(
+      vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) {
+  return vreinterpretq_s64_s8(
+      vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
 SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
   return vreinterpretq_s64_s16(
       vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
@@ -169,6 +222,11 @@ SIMD_INLINE v128 v128_add_32(v128 x, v128 y) {
       vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
 }
 
+SIMD_INLINE v128 v128_add_64(v128 x, v128 y) {
+  return vreinterpretq_s64_u64(
+      vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y)));
+}
+
 SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
   return vreinterpretq_s64_u8(
       vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
@@ -204,6 +262,8 @@ SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) {
       vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
 }
 
+SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); }
+
 SIMD_INLINE v128 v128_abs_s16(v128 x) {
   return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
 }
@@ -223,8 +283,16 @@ SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
 }
 
 SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_s16(vuzp2q_s16(
+      vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
+                                      vreinterpret_s16_s64(vget_low_s64(b)))),
+      vreinterpretq_s16_s32(
+          vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)))));
+#else
   return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
                        v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
+#endif
 }
 
 SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
@@ -233,13 +301,32 @@ SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
 }
 
 SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
+#if defined(__aarch64__)
+  int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
+                           vreinterpret_s16_s64(vget_low_s64(b)));
+  int32x4_t t2 =
+      vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b));
+  return vreinterpretq_s64_s32(vpaddq_s32(t1, t2));
+#else
   return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
                        v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
+#endif
 }
 
 SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
+#if defined(__aarch64__)
+  int16x8_t t1 = vmulq_s16(
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))),
+      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b))));
+  int16x8_t t2 = vmulq_s16(
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))),
+      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b))));
+  return vreinterpretq_s64_s16(
+      vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2)));
+#else
   return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
                        v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
+#endif
 }
 
 SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
@@ -252,6 +339,11 @@ SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) {
       vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
 }
 
+SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) {
+  return vreinterpretq_s64_u16(
+      vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
+}
+
 SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
   return vreinterpretq_s64_u16(
       vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
@@ -272,6 +364,26 @@ SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) {
       vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
 }
 
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
+  a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0)));
+#if defined(__aarch64__)
+  uint8x16_t m =
+      vandq_u8(vreinterpretq_u8_s64(a),
+               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
+  return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8);
+#else
+  uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
+      vandq_u8(vreinterpretq_u8_s64(a),
+               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
+  return v64_u64(v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
+#endif
+}
+
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+  c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0)));
+  return v128_or(v128_and(b, c), v128_andn(a, c));
+}
+
 SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
   return vreinterpretq_s64_s8(
       vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
@@ -287,14 +399,34 @@ SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) {
       vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
 }
 
+SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_s32(
+      vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_s32(
+      vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
 SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
@@ -303,13 +435,23 @@ SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
 }
 
 SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
   return vreinterpretq_s64_s16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
   return vreinterpretq_s64_s16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
@@ -318,13 +460,23 @@ SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
 }
 
 SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
   return vreinterpretq_s64_s32(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
   return vreinterpretq_s64_s32(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
@@ -333,47 +485,76 @@ SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
 }
 
 SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
-  return v128_from_v64(vget_low_u64((uint64x2_t)a),
-                       vget_low_u64((uint64x2_t)b));
+  return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b));
 }
 
 SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
-  return v128_from_v64(vget_high_u64((uint64x2_t)a),
-                       vget_high_u64((uint64x2_t)b));
+  return v128_from_v64(vget_high_s64((int64x2_t)a),
+                       vget_high_s64((int64x2_t)b));
 }
 
 SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   uint16x8x2_t r =
       vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
   return vreinterpretq_s64_u16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   uint16x8x2_t r =
       vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
   return vreinterpretq_s64_u16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   uint32x4x2_t r =
       vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
   return vreinterpretq_s64_u32(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   uint32x4x2_t r =
       vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
   return vreinterpretq_s64_u32(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
@@ -406,6 +587,12 @@ SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
       vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
 }
 
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+  return v128_from_v64(
+      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))),
+      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b))));
+}
+
 SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
   return v128_from_v64(
       vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
@@ -447,15 +634,17 @@ SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
 }
 
 SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
-  return v128_from_64(
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
-                                    vget_high_u8(vreinterpretq_u8_s64(x)) } },
-                   vreinterpret_u8_s64(vget_high_s64(pattern)))),
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
-                                    vget_high_u8(vreinterpretq_u8_s64(x)) } },
-                   vreinterpret_u8_s64(vget_low_s64(pattern)))));
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern)));
+#else
+  uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)),
+                      vget_high_u8(vreinterpretq_u8_s64(x)) } };
+  return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8(
+                          p, vreinterpret_u8_s64(vget_high_s64(pattern)))),
+                      (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
+                          p, vreinterpret_u8_s64(vget_low_s64(pattern)))));
+#endif
 }
 
 SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
@@ -488,19 +677,37 @@ SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) {
       vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
 }
 
+SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_u32(
+      vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_u32(
+      vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) {
+  return vreinterpretq_s64_u32(
+      vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8(
-                                     vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
+  return (c > 7) ? v128_zero()
+                 : vreinterpretq_s64_u8(
+                       vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
 }
 
 SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8(
-                                     vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
+  return (c > 7) ? v128_zero()
+                 : vreinterpretq_s64_u8(
+                       vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
 }
 
 SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_ones() : vreinterpretq_s64_s8(vshlq_s8(
-                                     vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
+  return (c > 7) ? v128_ones()
+                 : vreinterpretq_s64_s8(
+                       vshlq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
 }
 
 SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
@@ -539,6 +746,22 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
                         vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
 }
 
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+  return (c > 63) ? v128_zero()
+                  : vreinterpretq_s64_u64(
+                        vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c)));
+}
+
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+  return (c > 63) ? v128_zero()
+                  : vreinterpretq_s64_u64(
+                        vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-c)));
+}
+
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+  return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-c));
+}
+
 #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
 
 SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
@@ -562,16 +785,18 @@ SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
 SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
   return n < 8
              ? v128_from_64(
-                   vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), n * 8),
-                   vorr_u64(
+                   (uint64_t)vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+                                        n * 8),
+                   (uint64_t)vorr_u64(
                        vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8),
                        vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
                                   (8 - n) * 8)))
-             : (n == 8
-                    ? v128_from_64(0, vreinterpret_u64_s64(vget_high_s64(a)))
-                    : v128_from_64(
-                          0, vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                        (n - 8) * 8)));
+             : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
+                                             vget_high_s64(a)))
+                       : v128_from_64(
+                             0, (uint64_t)vshr_n_u64(
+                                    vreinterpret_u64_s64(vget_high_s64(a)),
+                                    (n - 8) * 8)));
 }
 
 SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
@@ -610,6 +835,18 @@ SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
   return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
 }
 
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
+  return vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
+  return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
+  return vshrq_n_s64(a, c);
+}
+
 #else
 
 SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
@@ -666,6 +903,55 @@ SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
   return v128_shr_s32(a, c);
 }
 
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
+  return v128_shl_64(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
+  return v128_shr_u64(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
+  return v128_shr_s64(a, c);
+}
+
 #endif
 
+typedef uint32x4_t sad128_internal_u16;
+
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return vdupq_n_u32(0); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+                                             v128 b) {
+  return vaddq_u32(
+      s, vpaddlq_u16(vsubq_u16(
+             vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)),
+             vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)))));
+}
+
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+  uint64x2_t t = vpaddlq_u32(s);
+  return (uint32_t)(uint64_t)vget_high_u64(t) +
+         (uint32_t)(uint64_t)vget_low_u64(t);
+}
+
+typedef v128 ssd128_internal_s16;
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  v128 d = v128_sub_16(a, b);
+  d = v128_madd_s16(d, d);
+  return v128_add_64(
+      s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d))));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
+
 #endif /* _V128_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
index 32e7c32de..e508f6ad7 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
@@ -14,8 +14,10 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include "./v64_intrinsics_c.h"
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
 
 typedef union {
   uint8_t u8[16];
@@ -115,11 +117,30 @@ SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
   return t;
 }
 
+SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
+  c_v128 t;
+  t.u64[1] = t.u64[0] = x;
+  return t;
+}
+
+SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
+  return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
+         c_v64_dotp_su8(a.v64[0], b.v64[0]);
+}
+
 SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
   return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
          c_v64_dotp_s16(a.v64[0], b.v64[0]);
 }
 
+SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
+  // 32 bit products, 64 bit sum
+  return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) +
+         (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) +
+         (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) +
+         (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]);
+}
+
 SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
   return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
 }
@@ -186,6 +207,16 @@ SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
                          c_v64_add_16(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
+                         c_v64_sadd_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
+                         c_v64_sadd_s8(a.v64[0], b.v64[0]));
+}
+
 SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
                          c_v64_sadd_s16(a.v64[0], b.v64[0]));
@@ -196,6 +227,15 @@ SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
                          c_v64_add_32(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
+  // Two complement overflow (silences sanitizers)
+  return c_v128_from_64(
+      a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
+                                   : a.v64[1].u64 + b.v64[1].u64,
+      a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
+                                   : a.v64[0].u64 + b.v64[0].u64);
+}
+
 SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
   c_v128 t;
   t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
@@ -205,6 +245,19 @@ SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
   return t;
 }
 
+SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
+  c_v128 t;
+  t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
+  t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
+  t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
+  t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
+  t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
+  t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
+  t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
+  t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
+  return t;
+}
+
 SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
                          c_v64_sub_8(a.v64[0], b.v64[0]));
@@ -240,6 +293,15 @@ SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
                          c_v64_sub_32(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
+  // Two complement underflow (silences sanitizers)
+  return c_v128_from_64(
+      a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
+                                  : a.v64[1].u64 - b.v64[1].u64,
+      a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1
+                                  : a.v64[0].u64 - b.v64[0].u64);
+}
+
 SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
   return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
 }
@@ -290,6 +352,11 @@ SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
                          c_v64_rdavg_u8(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
+                         c_v64_rdavg_u16(a.v64[0], b.v64[0]));
+}
+
 SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
                          c_v64_avg_u16(a.v64[0], b.v64[0]));
@@ -310,6 +377,22 @@ SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
                          c_v64_min_s8(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
+  return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
+         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
+         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
+         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
+         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
+         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
+         ((a.s8[0] < 0) << 0);
+}
+
+SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
+  c_v128 t;
+  for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
+  return t;
+}
+
 SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
                          c_v64_max_s8(a.v64[0], b.v64[0]));
@@ -325,6 +408,20 @@ SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
                          c_v64_max_s16(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c];
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c];
+  return t;
+}
+
 SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
                          c_v64_ziplo_8(a.v64[0], b.v64[0]));
@@ -518,6 +615,11 @@ SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
                          c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
+                         c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
+}
+
 SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
                          c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
@@ -559,15 +661,10 @@ SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
 SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
   c_v128 t;
   int c;
-  for (c = 0; c < 16; c++) {
-    if (pattern.u8[c] & ~15) {
-      fprintf(stderr, "Undefined v128_shuffle_8 index %d/%d\n", pattern.u8[c],
-              c);
-      abort();
-    }
+  for (c = 0; c < 16; c++)
     t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
                                      : pattern.u8[c] & 15];
-  }
+
   return t;
 }
 
@@ -601,7 +698,28 @@ SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
                          c_v64_cmpeq_16(a.v64[0], b.v64[0]));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]);
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]);
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]);
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
   if (n < 8)
     return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
                                     c_v64_shr_n_byte(a.v64[0], 8 - n)),
@@ -610,7 +728,7 @@ SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, unsigned int n) {
     return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
   if (n < 8)
     return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
                            c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
@@ -619,7 +737,7 @@ SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, unsigned int n) {
     return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
 }
 
-SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
   if (SIMD_CHECK && c > 15) {
     fprintf(stderr, "Error: undefined alignment %d\n", c);
     abort();
@@ -628,80 +746,143 @@ SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, unsigned int c) {
            : b;
 }
 
-SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
                          c_v64_shr_u16(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
                          c_v64_shr_s16(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
                          c_v64_shr_u32(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
                          c_v64_shr_s32(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
+  a.v64[1].u64 <<= c;
+  a.v64[0].u64 <<= c;
+  return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
+  a.v64[1].u64 >>= c;
+  a.v64[0].u64 >>= c;
+  return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
+  a.v64[1].s64 >>= c;
+  a.v64[0].s64 >>= c;
+  return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
   return c_v128_shl_8(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
   return c_v128_shl_16(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
   return c_v128_shl_32(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
+  return c_v128_shl_64(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
   return c_v128_shr_u8(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
   return c_v128_shr_u16(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
   return c_v128_shr_u32(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
+  return c_v128_shr_u64(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
   return c_v128_shr_s8(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
   return c_v128_shr_s16(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
   return c_v128_shr_s32(a, n);
 }
 
+SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
+  return c_v128_shr_s64(a, n);
+}
+
+typedef uint32_t c_sad128_internal_u16;
+
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
+                                                 c_v128 a, c_v128 b) {
+  int c;
+  for (c = 0; c < 8; c++)
+    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+  return s;
+}
+
+SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd128_internal_s16;
+
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
+                                                 c_v128 a, c_v128 b) {
+  int c;
+  for (c = 0; c < 8; c++)
+    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+  return s;
+}
+
+SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
+
 #endif /* _V128_INTRINSICS_C_H */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
index cca1788d5..f9043fe99 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
@@ -12,7 +12,8 @@
 #ifndef _V128_INTRINSICS_H
 #define _V128_INTRINSICS_H
 
-#include "./v64_intrinsics_x86.h"
+#include <stdint.h>
+#include "aom_dsp/simd/v64_intrinsics_x86.h"
 
 typedef __m128i v128;
 
@@ -62,7 +63,7 @@ SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
 // Some compilers will check this during optimisation, others wont.
 #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
 #if defined(__SSSE3__)
-SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
   return c ? _mm_alignr_epi8(a, b, c) : b;
 }
 #else
@@ -71,7 +72,7 @@ SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
 #endif
 #else
 #if defined(__SSSE3__)
-#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, c) : (b))
+#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
 #else
 #define v128_align(a, b, c) \
   ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
@@ -86,14 +87,25 @@ SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
 
 SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
 
+SIMD_INLINE v128 v128_dup_64(uint64_t x) {
+  // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
+  return _mm_set_epi32(x >> 32, (uint32_t)x, x >> 32, (uint32_t)x);
+}
+
 SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
 
 SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
 
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); }
+
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); }
+
 SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }
 
 SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
 
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); }
+
 SIMD_INLINE v128 v128_padd_s16(v128 a) {
   return _mm_madd_epi16(a, _mm_set1_epi16(1));
 }
@@ -112,6 +124,8 @@ SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }
 
 SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
 
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); }
+
 SIMD_INLINE v128 v128_abs_s16(v128 a) {
 #if defined(__SSSE3__)
   return _mm_abs_epi16(a);
@@ -241,6 +255,15 @@ SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
   return _mm_packs_epi32(b, a);
 }
 
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_packus_epi32(b, a);
+#else
+  return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)),
+                       v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b)));
+#endif
+}
+
 SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
   return _mm_packus_epi16(b, a);
 }
@@ -291,6 +314,15 @@ SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
 #endif
 }
 
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b));
+  v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b));
+  v128 t = v128_add_32(t1, t2);
+  t = v128_add_32(t, _mm_srli_si128(t, 8));
+  t = v128_add_32(t, _mm_srli_si128(t, 4));
+  return (int32_t)v128_low_u32(t);
+}
+
 SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
   v128 r = _mm_madd_epi16(a, b);
 #if defined(__SSE4_1__) && defined(__x86_64__)
@@ -325,31 +357,25 @@ SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
   return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
 }
 
-typedef v128 ssd128_internal;
+typedef int32_t ssd128_internal;
 
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return _mm_setzero_si128(); }
+SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_ssd_sum(). */
 SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
-  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
-                         _mm_unpacklo_epi8(b, _mm_setzero_si128()));
-  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
-                         _mm_unpackhi_epi8(b, _mm_setzero_si128()));
+  v128 z = _mm_setzero_si128();
+  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
+  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
   v128 rl = _mm_madd_epi16(l, l);
   v128 rh = _mm_madd_epi16(h, h);
-  v128 c = _mm_cvtsi32_si128(32);
-  rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 8));
-  rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 4));
-  rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 8));
-  rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 4));
-  return _mm_add_epi64(
-      s, _mm_srl_epi64(_mm_sll_epi64(_mm_unpacklo_epi64(rl, rh), c), c));
+  v128 r = _mm_add_epi32(rl, rh);
+  r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
+  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
+  return s + _mm_cvtsi128_si32(r);
 }
 
-SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
-  return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
-}
+SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
 
 SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
 
@@ -385,6 +411,14 @@ SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
 #endif
 }
 
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  v128 r = v128_mullo_s32(a, b);
+  return (int64_t)_mm_cvtsi128_si32(r) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+}
+
 SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
 
 SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
@@ -399,6 +433,10 @@ SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
 #endif
 }
 
+SIMD_INLINE v128 v128_padd_u8(v128 a) {
+  return v128_madd_us8(a, _mm_set1_epi8(1));
+}
+
 SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }
 
 SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
@@ -406,6 +444,11 @@ SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
                       _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
 }
 
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+  return _mm_sub_epi16(_mm_avg_epu16(a, b),
+                       _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1)));
+}
+
 SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }
 
 SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }
@@ -421,6 +464,17 @@ SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
 #endif
 }
 
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
+
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+#if defined(__SSE4_1__)
+  return _mm_blendv_epi8(a, b, c);
+#else
+  c = _mm_cmplt_epi8(c, v128_zero());
+  return v128_or(v128_and(b, c), v128_andn(a, c));
+#endif
+}
+
 SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
 #if defined(__SSE4_1__)
   return _mm_max_epi8(a, b);
@@ -434,6 +488,24 @@ SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }
 
 SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }
 
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_min_epi32(a, b);
+#else
+  v128 mask = _mm_cmplt_epi32(a, b);
+  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_max_epi32(a, b);
+#else
+  v128 mask = _mm_cmplt_epi32(b, a);
+  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
 SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }
 
 SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }
@@ -448,6 +520,16 @@ SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
   return _mm_cmplt_epi16(a, b);
 }
 
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+  return _mm_cmpgt_epi32(a, b);
+}
+
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+  return _mm_cmplt_epi32(a, b);
+}
+
 SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
@@ -490,10 +572,25 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
   return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
 }
 
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+  return _mm_sll_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+  return _mm_srl_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+  // _mm_sra_epi64 is missing in gcc?
+  return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c,
+                      (int64_t)v64_u64(v128_low_v64(a)) >> c);
+  // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c));
+}
+
 /* These intrinsics require immediate values, so we must use #defines
    to enforce that. */
-#define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
-#define v128_shr_n_byte(a, c) _mm_srli_si128(a, c)
+#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
+#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
 #define v128_shl_n_8(a, c) \
   _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
 #define v128_shr_n_u8(a, c) \
@@ -507,5 +604,53 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
 #define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
 #define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
 #define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
+#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c)
+#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c)
+#define v128_shr_n_s64(a, c) \
+  v128_shr_s64(a, c)  // _mm_srai_epi64 missing in gcc?
+
+typedef v128 sad128_internal_u16;
+
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+                                             v128 b) {
+#if defined(__SSE4_1__)
+  v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b));
+#else
+  v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)),
+                          v128_xor(b, v128_dup_16(32768)));
+  t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)),
+                  v128_or(v128_and(a, t), v128_andn(b, t)));
+#endif
+  return v128_add_32(
+      s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t)));
+}
+
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+  return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) +
+         v128_low_u32(v128_shr_n_byte(s, 8)) +
+         v128_low_u32(v128_shr_n_byte(s, 12));
+}
+
+typedef v128 ssd128_internal_s16;
+
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  v128 d = v128_sub_16(a, b);
+  d = v128_madd_s16(d, d);
+  return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
+                                    _mm_unpacklo_epi32(d, v128_zero())));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
 
 #endif /* _V128_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
index 1896374ee..0e5ae5b68 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
@@ -15,9 +15,10 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./v256_intrinsics_c.h"
-#include "./v128_intrinsics.h"
-#include "./v64_intrinsics.h"
+
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+#include "aom_dsp/simd/v128_intrinsics.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
 
 /* Fallback to plain, unoptimised C. */
 
@@ -25,6 +26,7 @@ typedef c_v256 v256;
 
 SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
 SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); }
 SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
 SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
 SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
@@ -59,6 +61,7 @@ SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
 SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
 SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
 SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
 
 typedef uint32_t sad256_internal;
 SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
@@ -76,9 +79,16 @@ SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
 SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
   return c_v256_ssd_u8_sum(s);
 }
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+  return c_v256_dotp_su8(a, b);
+}
 SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
   return c_v256_dotp_s16(a, b);
 }
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+  return c_v256_dotp_s32(a, b);
+}
 SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
 
 SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
@@ -88,8 +98,13 @@ SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }
 
 SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
 SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); }
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); }
 SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
 SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); }
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); }
+SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); }
 SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
 SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
 SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
@@ -114,8 +129,16 @@ SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
 SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
 SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
 
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); }
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+  return c_v256_blend_8(a, b, c);
+}
+
 SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
 SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+  return c_v256_rdavg_u16(a, b);
+}
 SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
 SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
 SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
@@ -123,6 +146,8 @@ SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
 SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
 SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
 SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); }
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); }
 
 SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
 SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
@@ -159,6 +184,12 @@ SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
 SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
   return c_v256_unziphi_32(a, b);
 }
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+  return c_v256_unziplo_64(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+  return c_v256_unziphi_64(a, b);
+}
 SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
 SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
   return c_v256_unpacklo_u8_s16(a);
@@ -176,6 +207,9 @@ SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
 SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
   return c_v256_pack_s32_s16(a, b);
 }
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+  return c_v256_pack_s32_u16(a, b);
+}
 SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
   return c_v256_pack_s16_u8(a, b);
 }
@@ -203,6 +237,9 @@ SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
 SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
   return c_v256_shuffle_8(a, pattern);
 }
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+  return c_v256_wideshuffle_8(a, b, pattern);
+}
 SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
   return c_v256_pshuffle_8(a, pattern);
 }
@@ -217,7 +254,14 @@ SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
   return c_v256_cmplt_s16(a, b);
 }
 SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); }
 
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+  return c_v256_cmpgt_s32(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+  return c_v256_cmplt_s32(a, b);
+}
 SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
   return c_v256_shl_8(a, c);
 }
@@ -261,6 +305,9 @@ SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) {
 SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) {
   return c_v256_shl_n_32(a, n);
 }
+SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) {
+  return c_v256_shl_n_64(a, n);
+}
 SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) {
   return c_v256_shr_n_u8(a, n);
 }
@@ -270,6 +317,9 @@ SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) {
 SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) {
   return c_v256_shr_n_u32(a, n);
 }
+SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) {
+  return c_v256_shr_n_u64(a, n);
+}
 SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) {
   return c_v256_shr_n_s8(a, n);
 }
@@ -279,5 +329,39 @@ SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) {
 SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) {
   return c_v256_shr_n_s32(a, n);
 }
+SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) {
+  return c_v256_shr_n_s64(a, n);
+}
+
+SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) {
+  return c_v256_shr_n_word(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) {
+  return c_v256_shl_n_word(a, n);
+}
+
+typedef uint32_t sad256_internal_u16;
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+  return c_v256_sad_u16_init();
+}
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+                                             v256 b) {
+  return c_v256_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+  return c_v256_sad_u16_sum(s);
+}
+
+typedef uint64_t ssd256_internal_s16;
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
+  return c_v256_ssd_s16_init();
+}
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  return c_v256_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  return c_v256_ssd_s16_sum(s);
+}
 
 #endif /* _V256_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
index ba4ed719d..d96638488 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
@@ -12,6 +12,6 @@
 #ifndef _V256_INTRINSICS_H
 #define _V256_INTRINSICS_H
 
-#include "./v256_intrinsics_v128.h"
+#include "aom_dsp/simd/v256_intrinsics_v128.h"
 
 #endif /* _V256_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
index f96ca7fa6..5b412df71 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
@@ -14,8 +14,10 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include "./v128_intrinsics_c.h"
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
 
 typedef union {
   uint8_t u8[32];
@@ -34,6 +36,8 @@ SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
 
 SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
 
+SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }
+
 SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
 
 SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
@@ -120,23 +124,39 @@ SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
   return t;
 }
 
+SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
+  c_v256 t;
+  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
+  return t;
+}
+
+SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
+  return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
+         c_v128_dotp_su8(a.v128[0], b.v128[0]);
+}
+
 SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
   return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
          c_v128_dotp_s16(a.v128[0], b.v128[0]);
 }
 
+SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
+  return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
+         c_v128_dotp_s32(a.v128[0], b.v128[0]);
+}
+
 SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
   return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
 }
 
 typedef uint32_t c_sad256_internal;
 
-SIMD_INLINE c_sad128_internal c_v256_sad_u8_init() { return 0; }
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
    v256_sad_u8_sum().
    The result for more than 16 v256_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad128_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
+SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
                                             c_v256 b) {
   int c;
   for (c = 0; c < 32; c++)
@@ -191,6 +211,16 @@ SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
                           c_v128_add_16(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]),
+                          c_v128_sadd_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]),
+                          c_v128_sadd_u8(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
                           c_v128_sadd_s16(a.v128[0], b.v128[0]));
@@ -201,6 +231,23 @@ SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
                           c_v128_add_32(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]),
+                          c_v128_add_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]),
+                          c_v128_sub_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) {
+  c_v256 t;
+  for (int i = 0; i < 16; i++)
+    t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1];
+  return t;
+}
+
 SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
   c_v256 t;
   t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
@@ -299,6 +346,11 @@ SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
                           c_v128_rdavg_u8(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]),
+                          c_v128_rdavg_u16(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
                           c_v128_avg_u16(a.v128[0], b.v128[0]));
@@ -319,6 +371,30 @@ SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
                           c_v128_min_s8(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) {
+  return ((a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) |
+         ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) |
+         ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) |
+         ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) |
+         ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) |
+         ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) |
+         ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) |
+         ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) |
+         ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
+         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
+         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
+         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
+         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
+         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
+         ((a.s8[0] < 0) << 0);
+}
+
+SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) {
+  c_v256 t;
+  for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
+  return t;
+}
+
 SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
                           c_v128_max_s8(a.v128[0], b.v128[0]));
@@ -334,6 +410,16 @@ SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
                           c_v128_max_s16(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]),
+                          c_v128_min_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]),
+                          c_v128_max_s32(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
                           c_v128_ziplo_8(a.v128[0], b.v128[0]));
@@ -482,6 +568,32 @@ SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
                            : _c_v256_unzip_32(b, a, 1);
 }
 
+SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) {
+  c_v256 t;
+  if (mode) {
+    t.u64[3] = b.u64[3];
+    t.u64[2] = b.u64[1];
+    t.u64[1] = a.u64[3];
+    t.u64[0] = a.u64[1];
+  } else {
+    t.u64[3] = a.u64[2];
+    t.u64[2] = a.u64[0];
+    t.u64[1] = b.u64[2];
+    t.u64[0] = b.u64[0];
+  }
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1)
+                           : _c_v256_unzip_64(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(b, a, 0)
+                           : _c_v256_unzip_64(b, a, 1);
+}
+
 SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
   return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
 }
@@ -515,6 +627,11 @@ SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
                           c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
+                          c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
                           c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
@@ -558,15 +675,21 @@ SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
 SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
   c_v256 t;
   int c;
-  for (c = 0; c < 32; c++) {
-    if (pattern.u8[c] & ~31) {
-      fprintf(stderr, "Undefined v256_shuffle_8 index %d/%d\n", pattern.u8[c],
-              c);
-      abort();
-    }
+  for (c = 0; c < 32; c++)
     t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
                                      : pattern.u8[c] & 31];
-  }
+
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
+  c_v256 t;
+  int c;
+  for (c = 0; c < 32; c++)
+    t.u8[c] = (pattern.u8[c] < 32
+                   ? b.u8
+                   : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+                                             : pattern.u8[c] & 31];
   return t;
 }
 
@@ -607,6 +730,21 @@ SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
                           c_v128_cmpeq_16(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]),
+                          c_v128_cmpgt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]),
+                          c_v128_cmplt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]),
+                          c_v128_cmpeq_32(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
   if (n < 16)
     return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
@@ -685,6 +823,45 @@ SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
                           c_v128_shr_s32(a.v128[0], c));
 }
 
+SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+    abort();
+  }
+  t.s64[3] = a.s64[3] >> n;
+  t.s64[2] = a.s64[2] >> n;
+  t.s64[1] = a.s64[1] >> n;
+  t.s64[0] = a.s64[0] >> n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+    abort();
+  }
+  t.u64[3] = a.u64[3] >> n;
+  t.u64[2] = a.u64[2] >> n;
+  t.u64[1] = a.u64[1] >> n;
+  t.u64[0] = a.u64[0] >> n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+    abort();
+  }
+  t.u64[3] = a.u64[3] << n;
+  t.u64[2] = a.u64[2] << n;
+  t.u64[1] = a.u64[1] << n;
+  t.u64[0] = a.u64[0] << n;
+  return t;
+}
+
 SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
   return c_v256_shl_8(a, n);
 }
@@ -697,6 +874,10 @@ SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
   return c_v256_shl_32(a, n);
 }
 
+SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
+  return c_v256_shl_64(a, n);
+}
+
 SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
   return c_v256_shr_u8(a, n);
 }
@@ -709,6 +890,10 @@ SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
   return c_v256_shr_u32(a, n);
 }
 
+SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
+  return c_v256_shr_u64(a, n);
+}
+
 SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
   return c_v256_shr_s8(a, n);
 }
@@ -721,4 +906,48 @@ SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
   return c_v256_shr_s32(a, n);
 }
 
+SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) {
+  return c_v256_shr_s64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) {
+  return c_v256_shr_n_byte(a, 2 * n);
+}
+SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) {
+  return c_v256_shl_n_byte(a, 2 * n);
+}
+
+typedef uint32_t c_sad256_internal_u16;
+
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+   v256_sad_u16_sum(). */
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s,
+                                                 c_v256 a, c_v256 b) {
+  int c;
+  for (c = 0; c < 16; c++)
+    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+  return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd256_internal_s16;
+
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s,
+                                                 c_v256 a, c_v256 b) {
+  int c;
+  for (c = 0; c < 16; c++)
+    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+  return s;
+}
+
+SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; }
+
 #endif /* _V256_INTRINSICS_C_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
index cbea55ca1..60b2a1791 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
@@ -13,27 +13,35 @@
 #define _V256_INTRINSICS_V128_H
 
 #if HAVE_NEON
-#include "./v128_intrinsics_arm.h"
+#include "aom_dsp/simd/v128_intrinsics_arm.h"
 #elif HAVE_SSE2
-#include "./v128_intrinsics_x86.h"
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
 #else
-#include "./v128_intrinsics.h"
+#include "aom_dsp/simd/v128_intrinsics.h"
 #endif
 
-typedef struct { v128 lo, hi; } v256;
+#if HAVE_NEON
+typedef int64x2x2_t v256;
+#else
+typedef struct {
+  v128 val[2];
+} v256;
+#endif
 
-SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.lo); }
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }
 
-SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.lo); }
+SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }
 
-SIMD_INLINE v128 v256_low_v128(v256 a) { return a.lo; }
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
 
-SIMD_INLINE v128 v256_high_v128(v256 a) { return a.hi; }
+SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }
 
 SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
   v256 t;
-  t.hi = hi;
-  t.lo = lo;
+  t.val[1] = hi;
+  t.val[0] = lo;
   return t;
 }
 
@@ -56,13 +64,13 @@ SIMD_INLINE v256 v256_load_aligned(const void *p) {
 }
 
 SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
-  v128_store_unaligned(p, a.lo);
-  v128_store_unaligned((uint8_t *)p + 16, a.hi);
+  v128_store_unaligned(p, a.val[0]);
+  v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
 }
 
 SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
-  v128_store_aligned(p, a.lo);
-  v128_store_aligned((uint8_t *)p + 16, a.hi);
+  v128_store_aligned(p, a.val[0]);
+  v128_store_aligned((uint8_t *)p + 16, a.val[1]);
 }
 
 SIMD_INLINE v256 v256_zero() {
@@ -84,23 +92,35 @@ SIMD_INLINE v256 v256_dup_32(uint32_t x) {
   return v256_from_v128(t, t);
 }
 
+SIMD_INLINE v256 v256_dup_64(uint64_t x) {
+  v128 t = v128_dup_64(x);
+  return v256_from_v128(t, t);
+}
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+  return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
+}
+
 SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
-  return v128_dotp_s16(a.hi, b.hi) + v128_dotp_s16(a.lo, b.lo);
+  return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
+}
+
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+  return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
 }
 
 SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
-  return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo);
+  return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
 }
 
 typedef struct {
-  sad128_internal hi;
-  sad128_internal lo;
+  sad128_internal val[2];
 } sad256_internal;
 
 SIMD_INLINE sad256_internal v256_sad_u8_init() {
   sad256_internal t;
-  t.hi = v128_sad_u8_init();
-  t.lo = v128_sad_u8_init();
+  t.val[1] = v128_sad_u8_init();
+  t.val[0] = v128_sad_u8_init();
   return t;
 }
 
@@ -109,24 +129,23 @@ SIMD_INLINE sad256_internal v256_sad_u8_init() {
    The result for more than 16 v256_sad_u8() calls is undefined. */
 SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
   sad256_internal t;
-  t.hi = v128_sad_u8(s.hi, a.hi, b.hi);
-  t.lo = v128_sad_u8(s.lo, a.lo, b.lo);
+  t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
   return t;
 }
 
 SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
-  return v128_sad_u8_sum(s.hi) + v128_sad_u8_sum(s.lo);
+  return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
 }
 
 typedef struct {
-  ssd128_internal hi;
-  ssd128_internal lo;
+  ssd128_internal val[2];
 } ssd256_internal;
 
 SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
   ssd256_internal t;
-  t.hi = v128_ssd_u8_init();
-  t.lo = v128_ssd_u8_init();
+  t.val[1] = v128_ssd_u8_init();
+  t.val[0] = v128_ssd_u8_init();
   return t;
 }
 
@@ -134,85 +153,124 @@ SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
  * v256_ssd_u8_sum(). */
 SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
   ssd256_internal t;
-  t.hi = v128_ssd_u8(s.hi, a.hi, b.hi);
-  t.lo = v128_ssd_u8(s.lo, a.lo, b.lo);
+  t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
   return t;
 }
 
 SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
-  return v128_ssd_u8_sum(s.hi) + v128_ssd_u8_sum(s.lo);
+  return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
 }
 
 SIMD_INLINE v256 v256_or(v256 a, v256 b) {
-  return v256_from_v128(v128_or(a.hi, b.hi), v128_or(a.lo, b.lo));
+  return v256_from_v128(v128_or(a.val[1], b.val[1]),
+                        v128_or(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
-  return v256_from_v128(v128_xor(a.hi, b.hi), v128_xor(a.lo, b.lo));
+  return v256_from_v128(v128_xor(a.val[1], b.val[1]),
+                        v128_xor(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_and(v256 a, v256 b) {
-  return v256_from_v128(v128_and(a.hi, b.hi), v128_and(a.lo, b.lo));
+  return v256_from_v128(v128_and(a.val[1], b.val[1]),
+                        v128_and(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
-  return v256_from_v128(v128_andn(a.hi, b.hi), v128_andn(a.lo, b.lo));
+  return v256_from_v128(v128_andn(a.val[1], b.val[1]),
+                        v128_andn(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
-  return v256_from_v128(v128_add_8(a.hi, b.hi), v128_add_8(a.lo, b.lo));
+  return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
+                        v128_add_8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
-  return v256_from_v128(v128_add_16(a.hi, b.hi), v128_add_16(a.lo, b.lo));
+  return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
+                        v128_add_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
+                        v128_sadd_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
+  return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
+                        v128_sadd_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_sadd_s16(a.hi, b.hi), v128_sadd_s16(a.lo, b.lo));
+  return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
+                        v128_sadd_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
-  return v256_from_v128(v128_add_32(a.hi, b.hi), v128_add_32(a.lo, b.lo));
+  return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
+                        v128_add_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
+  return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
+                        v128_add_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+  return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_padd_s16(v256 a) {
-  return v256_from_v128(v128_padd_s16(a.hi), v128_padd_s16(a.lo));
+  return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_8(a.hi, b.hi), v128_sub_8(a.lo, b.lo));
+  return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
+                        v128_sub_8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_u8(a.hi, b.hi), v128_ssub_u8(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
+                        v128_ssub_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_s8(a.hi, b.hi), v128_ssub_s8(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
+                        v128_ssub_s8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_16(a.hi, b.hi), v128_sub_16(a.lo, b.lo));
+  return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
+                        v128_sub_16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
+                        v128_ssub_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_u16(a.hi, b.hi), v128_ssub_u16(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
+                        v128_ssub_u16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo));
+  return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
+                        v128_sub_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
+  return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
+                        v128_sub_64(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_abs_s16(v256 a) {
-  return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo));
+  return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_abs_s8(v256 a) {
-  return v256_from_v128(v128_abs_s8(a.hi), v128_abs_s8(a.lo));
+  return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
@@ -223,99 +281,146 @@ SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
 }
 
 SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_mullo_s16(a.hi, b.hi), v128_mullo_s16(a.lo, b.lo));
+  return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
+                        v128_mullo_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_mulhi_s16(a.hi, b.hi), v128_mulhi_s16(a.lo, b.lo));
+  return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
+                        v128_mulhi_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
-  return v256_from_v128(v128_mullo_s32(a.hi, b.hi), v128_mullo_s32(a.lo, b.lo));
+  return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
+                        v128_mullo_s32(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_madd_s16(a.hi, b.hi), v128_madd_s16(a.lo, b.lo));
+  return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
+                        v128_madd_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
-  return v256_from_v128(v128_madd_us8(a.hi, b.hi), v128_madd_us8(a.lo, b.lo));
+  return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
+                        v128_madd_us8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_avg_u8(a.hi, b.hi), v128_avg_u8(a.lo, b.lo));
+  return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
+                        v128_avg_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_rdavg_u8(a.hi, b.hi), v128_rdavg_u8(a.lo, b.lo));
+  return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
+                        v128_rdavg_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+  return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
+                        v128_rdavg_u16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
-  return v256_from_v128(v128_avg_u16(a.hi, b.hi), v128_avg_u16(a.lo, b.lo));
+  return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
+                        v128_avg_u16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_min_u8(a.hi, b.hi), v128_min_u8(a.lo, b.lo));
+  return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
+                        v128_min_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_max_u8(a.hi, b.hi), v128_max_u8(a.lo, b.lo));
+  return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
+                        v128_max_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_min_s8(a.hi, b.hi), v128_min_s8(a.lo, b.lo));
+  return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
+                        v128_min_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
+  return (v128_movemask_8(v256_high_v128(a)) << 16) |
+         v128_movemask_8(v256_low_v128(a));
+}
+
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+  return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
+                        v128_blend_8(a.val[0], b.val[0], c.val[0]));
 }
 
 SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_max_s8(a.hi, b.hi), v128_max_s8(a.lo, b.lo));
+  return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
+                        v128_max_s8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_min_s16(a.hi, b.hi), v128_min_s16(a.lo, b.lo));
+  return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
+                        v128_min_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_max_s16(a.hi, b.hi), v128_max_s16(a.lo, b.lo));
+  return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
+                        v128_max_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
+                        v128_min_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
+                        v128_max_s32(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(a.lo, b.lo), v128_ziplo_8(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
+                        v128_ziplo_8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(a.hi, b.hi), v128_ziplo_8(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
+                        v128_ziplo_8(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(a.lo, b.lo), v128_ziplo_16(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
+                        v128_ziplo_16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(a.hi, b.hi), v128_ziplo_16(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
+                        v128_ziplo_16(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(a.lo, b.lo), v128_ziplo_32(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
+                        v128_ziplo_32(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(a.hi, b.hi), v128_ziplo_32(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
+                        v128_ziplo_32(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(a.lo, b.lo), v128_ziplo_64(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
+                        v128_ziplo_64(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(a.hi, b.hi), v128_ziplo_64(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
+                        v128_ziplo_64(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
-  return v256_from_v128(a.lo, b.lo);
+  return v256_from_v128(a.val[0], b.val[0]);
 }
 
 SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
-  return v256_from_v128(a.hi, b.hi);
+  return v256_from_v128(a.val[1], b.val[1]);
 }
 
 SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
@@ -331,31 +436,59 @@ SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
 }
 
 SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_8(a.hi, a.lo), v128_unziplo_8(b.hi, b.lo));
+  return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
+                        v128_unziplo_8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_8(a.hi, a.lo), v128_unziphi_8(b.hi, b.lo));
+  return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
+                        v128_unziphi_8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_16(a.hi, a.lo),
-                        v128_unziplo_16(b.hi, b.lo));
+  return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
+                        v128_unziplo_16(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_16(a.hi, a.lo),
-                        v128_unziphi_16(b.hi, b.lo));
+  return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
+                        v128_unziphi_16(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_32(a.hi, a.lo),
-                        v128_unziplo_32(b.hi, b.lo));
+  return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
+                        v128_unziplo_32(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_32(a.hi, a.lo),
-                        v128_unziphi_32(b.hi, b.lo));
+  return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
+                        v128_unziphi_32(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+#if HAVE_SSE2
+  return v256_from_v128(
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
+                                      _mm_castsi128_pd(a.val[1]), 0)),
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
+                                      _mm_castsi128_pd(b.val[1]), 0)));
+#else
+  return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
+                       v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
+#endif
+}
+
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+#if HAVE_SSE2
+  return v256_from_v128(
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
+                                      _mm_castsi128_pd(a.val[1]), 3)),
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
+                                      _mm_castsi128_pd(b.val[1]), 3)));
+#else
+  return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
+                       v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
+#endif
 }
 
 SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
@@ -363,11 +496,13 @@ SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a.lo), v128_unpacklo_u8_s16(a.lo));
+  return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
+                        v128_unpacklo_u8_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi));
+  return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
+                        v128_unpacklo_u8_s16(a.val[1]));
 }
 
 SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
@@ -375,26 +510,33 @@ SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(a.lo), v128_unpacklo_s8_s16(a.lo));
+  return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
+                        v128_unpacklo_s8_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(a.hi), v128_unpacklo_s8_s16(a.hi));
+  return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
+                        v128_unpacklo_s8_s16(a.val[1]));
 }
 
 SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo),
-                        v128_pack_s32_s16(b.hi, b.lo));
+  return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
+                        v128_pack_s32_s16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+  return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
+                        v128_pack_s32_u16(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_u8(a.hi, a.lo),
-                        v128_pack_s16_u8(b.hi, b.lo));
+  return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
+                        v128_pack_s16_u8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_s8(a.hi, a.lo),
-                        v128_pack_s16_s8(b.hi, b.lo));
+  return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
+                        v128_pack_s16_s8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
@@ -406,142 +548,326 @@ SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a.lo),
-                        v128_unpacklo_u16_s32(a.lo));
+  return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
+                        v128_unpacklo_u16_s32(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a.lo),
-                        v128_unpacklo_s16_s32(a.lo));
+  return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
+                        v128_unpacklo_s16_s32(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a.hi),
-                        v128_unpacklo_u16_s32(a.hi));
+  return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
+                        v128_unpacklo_u16_s32(a.val[1]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a.hi),
-                        v128_unpacklo_s16_s32(a.hi));
-}
-
-SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
-  v128 c16 = v128_dup_8(16);
-  v128 maskhi = v128_cmplt_s8(pattern.hi, c16);
-  v128 masklo = v128_cmplt_s8(pattern.lo, c16);
-  return v256_from_v128(
-      v128_or(
-          v128_and(v128_shuffle_8(a.lo, pattern.hi), maskhi),
-          v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.hi, c16)), maskhi)),
-      v128_or(v128_and(v128_shuffle_8(a.lo, pattern.lo), masklo),
-              v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.lo, c16)),
-                        masklo)));
-}
-
-SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
-  return v256_from_v128(
-      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
-      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
+  return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
+                        v128_unpacklo_s16_s32(a.val[1]));
 }
 
 SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpgt_s8(a.hi, b.hi), v128_cmpgt_s8(a.lo, b.lo));
+  return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
+                        v128_cmpgt_s8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmplt_s8(a.hi, b.hi), v128_cmplt_s8(a.lo, b.lo));
+  return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
+                        v128_cmplt_s8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpeq_8(a.hi, b.hi), v128_cmpeq_8(a.lo, b.lo));
+  return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
+                        v128_cmpeq_8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpgt_s16(a.hi, b.hi), v128_cmpgt_s16(a.lo, b.lo));
+  return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
+                        v128_cmpgt_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmplt_s16(a.hi, b.hi), v128_cmplt_s16(a.lo, b.lo));
+  return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
+                        v128_cmplt_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpeq_16(a.hi, b.hi), v128_cmpeq_16(a.lo, b.lo));
+  return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
+                        v128_cmpeq_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
+                        v128_cmpgt_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
+                        v128_cmplt_s32(a.val[0], b.val[0]));
 }
 
-SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shl_8(a.hi, c), v128_shl_8(a.lo, c));
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
+                        v128_cmpeq_32(a.val[0], b.val[0]));
 }
 
-SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_u8(a.hi, c), v128_shr_u8(a.lo, c));
+SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
+#if HAVE_NEON
+#if defined(__aarch64__)
+  uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]),
+                       vreinterpretq_u8_s64(x.val[1]) } };
+  return v256_from_v128(
+      vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
+      vreinterpretq_s64_u8(
+          vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
+#else
+  uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
+  return v256_from_64(
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+#endif
+#else
+  v128 c16 = v128_dup_8(16);
+  v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
+  v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
+  return v256_from_v128(
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
+                   v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
+                   v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
+#endif
 }
 
-SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_s8(a.hi, c), v128_shr_s8(a.lo, c));
+SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
+#if HAVE_NEON
+#if defined(__aarch64__)
+  uint8x16x4_t p = { {
+      vreinterpretq_u8_s64(y.val[0]),
+      vreinterpretq_u8_s64(y.val[1]),
+      vreinterpretq_u8_s64(x.val[0]),
+      vreinterpretq_u8_s64(x.val[1]),
+  } };
+  return v256_from_v128(
+      vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
+      vreinterpretq_s64_u8(
+          vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
+#else
+  v256 c32 = v256_dup_8(32);
+  v256 p32 = v256_sub_8(pattern, c32);
+  uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
+  uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])),
+                      vget_high_u8(vreinterpretq_u8_s64(y.val[0])),
+                      vget_low_u8(vreinterpretq_u8_s64(y.val[1])),
+                      vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } };
+  v256 r1 =
+      v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_low_s64(p32.val[0])))));
+  v256 r2 =
+      v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+  return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32));
+#endif
+#else
+  v128 c16 = v128_dup_8(16);
+  v128 c32 = v128_dup_8(32);
+  v128 c48 = v128_dup_8(48);
+  v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]);
+  v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
+  v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]);
+  v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
+  v256 r1 = v256_from_v128(
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
+                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
+                   maskhi48),
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
+                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
+                   masklo48));
+  v256 r2 = v256_from_v128(
+      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
+                   v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
+      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
+                   v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
+  return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
+#endif
+}
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+  return v256_from_v128(
+      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
+      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
 }
 
-SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shl_16(a.hi, c), v128_shl_16(a.lo, c));
+SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_u16(a.hi, c), v128_shr_u16(a.lo, c));
+SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_s16(a.hi, c), v128_shr_s16(a.lo, c));
+SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shl_32(a.hi, c), v128_shl_32(a.lo, c));
+SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_u32(a.hi, c), v128_shr_u32(a.lo, c));
+SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_s32(a.hi, c), v128_shr_s32(a.lo, c));
+SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
 }
 
 /* These intrinsics require immediate values, so we must use #defines
    to enforce that. */
-#define v256_shl_n_byte(a, n)                                                 \
-  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.hi, n),                \
-                                     v128_shr_n_byte(a.lo, (16 - (n)) & 31)), \
-                             v128_shl_n_byte(a.lo, (n)))                      \
-            : v256_from_v128(                                                 \
-                  (n) > 16 ? v128_shl_n_byte(a.lo, ((n)-16) & 31) : a.lo,     \
+#define v256_shl_n_byte(a, n)                                              \
+  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n),         \
+                                     v128_shr_n_byte(a.val[0], 16 - (n))), \
+                             v128_shl_n_byte(a.val[0], (n)))               \
+            : v256_from_v128(                                              \
+                  (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
                   v128_zero()))
 
-#define v256_shr_n_byte(a, n)                                                 \
-  ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n),                        \
-                             v128_or(v128_shr_n_byte(a.lo, n),                \
-                                     v128_shl_n_byte(a.hi, (16 - (n)) & 31))) \
-            : v256_from_v128(                                                 \
-                  v128_zero(),                                                \
-                  (n) > 16 ? v128_shr_n_byte(a.hi, ((n)-16) & 31) : a.hi))
+#define v256_shr_n_byte(a, n)                                              \
+  ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                 \
+                             v128_or(v128_shr_n_byte(a.val[0], n),         \
+                                     v128_shl_n_byte(a.val[1], 16 - (n)))) \
+            : v256_from_v128(                                              \
+                  v128_zero(),                                             \
+                  (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))
 
 #define v256_align(a, b, c) \
   ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
 
 #define v256_shl_n_8(a, n) \
-  v256_from_v128(v128_shl_n_8(a.hi, n), v128_shl_n_8(a.lo, n))
+  v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
 #define v256_shl_n_16(a, n) \
-  v256_from_v128(v128_shl_n_16(a.hi, n), v128_shl_n_16(a.lo, n))
+  v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
 #define v256_shl_n_32(a, n) \
-  v256_from_v128(v128_shl_n_32(a.hi, n), v128_shl_n_32(a.lo, n))
+  v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
+#define v256_shl_n_64(a, n) \
+  v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
 #define v256_shr_n_u8(a, n) \
-  v256_from_v128(v128_shr_n_u8(a.hi, n), v128_shr_n_u8(a.lo, n))
+  v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
 #define v256_shr_n_u16(a, n) \
-  v256_from_v128(v128_shr_n_u16(a.hi, n), v128_shr_n_u16(a.lo, n))
+  v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
 #define v256_shr_n_u32(a, n) \
-  v256_from_v128(v128_shr_n_u32(a.hi, n), v128_shr_n_u32(a.lo, n))
+  v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
+#define v256_shr_n_u64(a, n) \
+  v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
 #define v256_shr_n_s8(a, n) \
-  v256_from_v128(v128_shr_n_s8(a.hi, n), v128_shr_n_s8(a.lo, n))
+  v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
 #define v256_shr_n_s16(a, n) \
-  v256_from_v128(v128_shr_n_s16(a.hi, n), v128_shr_n_s16(a.lo, n))
+  v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
 #define v256_shr_n_s32(a, n) \
-  v256_from_v128(v128_shr_n_s32(a.hi, n), v128_shr_n_s32(a.lo, n))
+  v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
+#define v256_shr_n_s64(a, n) \
+  v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))
+
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
+
+typedef struct {
+  sad128_internal_u16 val[2];
+} sad256_internal_u16;
+
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+  sad256_internal_u16 t;
+  t.val[1] = v128_sad_u16_init();
+  t.val[0] = v128_sad_u16_init();
+  return t;
+}
+
+/* Implementation dependent return value.  Result must be finalised with
+   v256_sad_u16_sum().
+   The result for more than 16 v256_sad_u16() calls is undefined. */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+                                             v256 b) {
+  sad256_internal_u16 t;
+  t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
+  return t;
+}
+
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+  return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]);
+}
+
+typedef struct {
+  ssd128_internal_s16 val[2];
+} ssd256_internal_s16;
+
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
+  ssd256_internal_s16 t;
+  t.val[1] = v128_ssd_s16_init();
+  t.val[0] = v128_ssd_s16_init();
+  return t;
+}
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  ssd256_internal_s16 t;
+  t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
+  return t;
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
+}
 
 #endif /* _V256_INTRINSICS_V128_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
index b82daab68..05f205169 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
@@ -14,7 +14,7 @@
 
 #if !defined(__AVX2__)
 
-#include "./v256_intrinsics_v128.h"
+#include "aom_dsp/simd/v256_intrinsics_v128.h"
 
 #else
 
@@ -26,7 +26,8 @@
 #endif
 
 #include <immintrin.h>
-#include "./v128_intrinsics_x86.h"
+
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
 
 typedef __m256i v256;
 
@@ -38,9 +39,9 @@ SIMD_INLINE v64 v256_low_v64(v256 a) {
   return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
 }
 
-SIMD_INLINE v128 v256_low_v128(v256 a) {
-  return _mm256_extracti128_si256(a, 0);
-}
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }
 
 SIMD_INLINE v128 v256_high_v128(v256 a) {
   return _mm256_extracti128_si256(a, 1);
@@ -48,8 +49,7 @@ SIMD_INLINE v128 v256_high_v128(v256 a) {
 
 SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
   // gcc seems to be missing _mm256_set_m128i()
-  return _mm256_insertf128_si256(
-      _mm256_insertf128_si256(_mm256_setzero_si256(), b, 0), a, 1);
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
 }
 
 SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
@@ -84,16 +84,28 @@ SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
 
 SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
 
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); }
+
 SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
 
 SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
 
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); }
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); }
+
 SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
   return _mm256_adds_epi16(a, b);
 }
 
 SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
 
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); }
+
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+  return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1));
+}
+
 SIMD_INLINE v256 v256_padd_s16(v256 a) {
   return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
 }
@@ -116,6 +128,8 @@ SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
 
 SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
 
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); }
+
 SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
 
 SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
@@ -125,43 +139,51 @@ SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
 // unpack/pack intrinsics operate on the 256 bit input vector as 2
 // independent 128 bit vectors.
 SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_8(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi8(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_8(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi8(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_16(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi16(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_16(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi16(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_32(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi32(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_32(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi32(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_64(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi64(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_64(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi64(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
@@ -184,34 +206,54 @@ SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
   return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
 }
 
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)),
+      _MM_SHUFFLE(3, 1, 2, 0));
+}
+
 SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziplo_8(v256_high_v128(b), v256_low_v128(b)));
+  return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1));
 }
 
-SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziphi_8(v256_high_v128(b), v256_low_v128(b)));
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)),
+      _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_16(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziplo_16(v256_high_v128(b), v256_low_v128(b)));
+  return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2));
 }
 
-SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_16(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziphi_16(v256_high_v128(b), v256_low_v128(b)));
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
+                                            _mm256_castsi256_ps(a),
+                                            _MM_SHUFFLE(3, 1, 3, 1))),
+      _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_32(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziplo_32(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(
+      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
+                                            _mm256_castsi256_ps(a),
+                                            _MM_SHUFFLE(2, 0, 2, 0))),
+      _MM_SHUFFLE(3, 1, 2, 0));
 }
 
-SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_32(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziphi_32(v256_high_v128(b), v256_low_v128(b)));
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b),
+                                            _mm256_castsi256_pd(a), 15)),
+      _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_castpd_si256(
+          _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)),
+      _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
@@ -219,13 +261,15 @@ SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(v256_low_v128(a)),
-                        v128_unpacklo_u8_s16(v256_low_v128(a)));
+  return _mm256_unpacklo_epi8(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(v256_high_v128(a)),
-                        v128_unpacklo_u8_s16(v256_high_v128(a)));
+  return _mm256_unpackhi_epi8(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
@@ -233,28 +277,37 @@ SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(v256_low_v128(a)),
-                        v128_unpacklo_s8_s16(v256_low_v128(a)));
+  return _mm256_srai_epi16(
+      _mm256_unpacklo_epi8(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      8);
 }
 
 SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(v256_high_v128(a)),
-                        v128_unpacklo_s8_s16(v256_high_v128(a)));
+  return _mm256_srai_epi16(
+      _mm256_unpackhi_epi8(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      8);
 }
 
 SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)),
-                        v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_u8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_pack_s16_u8(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_s8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_pack_s16_s8(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
@@ -266,43 +319,73 @@ SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(v256_low_v128(a)),
-                        v128_unpacklo_u16_s32(v256_low_v128(a)));
+  return _mm256_unpacklo_epi16(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(v256_low_v128(a)),
-                        v128_unpacklo_s16_s32(v256_low_v128(a)));
+  return _mm256_srai_epi32(
+      _mm256_unpacklo_epi16(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      16);
 }
 
 SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(v256_high_v128(a)),
-                        v128_unpacklo_u16_s32(v256_high_v128(a)));
+  return _mm256_unpackhi_epi16(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(v256_high_v128(a)),
-                        v128_unpacklo_s16_s32(v256_high_v128(a)));
+  return _mm256_srai_epi32(
+      _mm256_unpackhi_epi16(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      16);
 }
+
 SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
-  v128 c16 = v128_dup_8(16);
-  v128 hi = v256_high_v128(pattern);
-  v128 lo = v256_low_v128(pattern);
-  v128 maskhi = v128_cmplt_s8(hi, c16);
-  v128 masklo = v128_cmplt_s8(lo, c16);
-  return v256_from_v128(
-      v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), hi), maskhi),
-              v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(hi, c16)),
-                        maskhi)),
-      v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), lo), masklo),
-              v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(lo, c16)),
-                        masklo)));
+  return _mm256_blendv_epi8(
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern),
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern),
+      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
+}
+
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+  v256 c32 = v256_dup_8(32);
+  v256 p32 = v256_sub_8(pattern, c32);
+  v256 r1 = _mm256_blendv_epi8(
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32),
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32),
+      _mm256_cmpgt_epi8(v256_dup_8(48), pattern));
+  v256 r2 = _mm256_blendv_epi8(
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern),
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern),
+      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
+  return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern));
 }
 
 SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
   return _mm256_shuffle_epi8(a, pattern);
 }
 
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+  v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b));
+  v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b));
+  t1 = _mm256_add_epi32(t1, t2);
+  v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0),
+                         _mm256_extracti128_si256(t1, 1));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+  return (int32_t)v128_low_u32(t);
+}
+
 SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
   v256 r = _mm256_madd_epi16(a, b);
 #if defined(__x86_64__)
@@ -326,6 +409,29 @@ SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
 #endif
 }
 
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+  v256 r = _mm256_mullo_epi32(a, b);
+#if defined(__x86_64__)
+  v128 t;
+  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
+  t = v256_low_v128(_mm256_add_epi64(
+      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+  v128 l = v256_low_v128(r);
+  v128 h = v256_high_v128(r);
+  return (int64_t)_mm_cvtsi128_si32(l) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+         (int64_t)_mm_cvtsi128_si32(h) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
+
 SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
   v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
   v128 lo = v256_low_v128(t);
@@ -341,7 +447,7 @@ SIMD_INLINE sad256_internal v256_sad_u8_init() {
 }
 
 /* Implementation dependent return value.  Result must be finalised with
-   v256_sad_sum().
+   v256_sad_u8_sum().
    The result for more than 32 v256_sad_u8() calls is undefined. */
 SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
   return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
@@ -359,7 +465,7 @@ SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
 }
 
 /* Implementation dependent return value.  Result must be finalised with
- * v256_ssd_sum(). */
+ * v256_ssd_u8_sum(). */
 SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
   v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
                             _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
@@ -425,6 +531,12 @@ SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
       _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
 }
 
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+  return _mm256_sub_epi16(
+      _mm256_avg_epu16(a, b),
+      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1)));
+}
+
 SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }
 
 SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }
@@ -433,18 +545,28 @@ SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
 
 SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
 
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); }
+
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+  return _mm256_blendv_epi8(a, b, c);
+}
+
 SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }
 
 SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
 
 SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
 
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); }
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); }
+
 SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
   return _mm256_cmpgt_epi8(a, b);
 }
 
 SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
-  return v256_andn(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(b, a));
+  return _mm256_cmpgt_epi8(b, a);
 }
 
 SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
@@ -456,13 +578,25 @@ SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
 }
 
 SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
-  return v256_andn(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(b, a));
+  return _mm256_cmpgt_epi16(b, a);
 }
 
 SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
   return _mm256_cmpeq_epi16(a, b);
 }
 
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+  return _mm256_cmpgt_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+  return _mm256_cmpgt_epi32(b, a);
+}
+
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+  return _mm256_cmpeq_epi32(a, b);
+}
+
 SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
   return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
                           _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
@@ -503,27 +637,42 @@ SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
   return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
 }
 
+SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
+  return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
+  return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
+#if defined(__AVX512F__)
+  return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
+#else
+  return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
+                        v128_shr_s64(v256_low_v128(a), c));
+#endif
+}
+
 /* These intrinsics require immediate values, so we must use #defines
    to enforce that. */
 // _mm256_slli_si256 works on 128 bit lanes and can't be used
-#define v256_shl_n_byte(a, n)                                                 \
-  ((n) < 16                                                                   \
-       ? v256_from_v128(v128_or(v128_shl_n_byte(v256_high_v128(a), n),        \
-                                v128_shr_n_byte(v256_low_v128(a), 16 - (n))), \
-                        v128_shl_n_byte(v256_low_v128(a), n))                 \
-       : v256_from_v128(v128_shl_n_byte(v256_low_v128(a), (n)-16),            \
-                        v128_zero()))
+#define v256_shl_n_byte(a, n)                                                \
+  ((n) < 16 ? v256_from_v128(                                                \
+                  v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \
+                  v128_shl_n_byte(v256_low_v128(a), n))                      \
+            : _mm256_inserti128_si256(                                       \
+                  _mm256_setzero_si256(),                                    \
+                  v128_shl_n_byte(v256_low_v128(a), (n)-16), 1))
 
 // _mm256_srli_si256 works on 128 bit lanes and can't be used
-#define v256_shr_n_byte(a, n)                                                 \
-  ((n) < 16                                                                   \
-       ? _mm256_alignr_epi8(                                                  \
-             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n)  \
-       : ((n) > 16                                                            \
-              ? _mm256_srli_si256(                                            \
-                    _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), \
-                    (n)-16)                                                   \
-              : _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1))))
+#define v256_shr_n_byte(a, n)                                                \
+  ((n) < 16                                                                  \
+       ? _mm256_alignr_epi8(                                                 \
+             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
+       : _mm256_inserti128_si256(                                            \
+             _mm256_setzero_si256(),                                         \
+             v128_align(v256_high_v128(a), v256_high_v128(a), n), 0))
 
 // _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
 #define v256_align(a, b, c) \
@@ -543,6 +692,59 @@ SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
 #define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
 #define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
 #define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
+#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c)
+#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c)
+#define v256_shr_n_s64(a, c) \
+  v256_shr_s64((a), (c))  // _mm256_srai_epi64 broken in gcc?
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
+
+typedef v256 sad256_internal_u16;
+
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { return v256_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_sad_u16_sum(). */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+                                             v256 b) {
+#if defined(__SSE4_1__)
+  v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
+#else
+  v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)),
+                          v256_xor(b, v256_dup_16(32768)));
+  t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)),
+                  v256_or(v256_and(a, t), v256_andn(b, t)));
+#endif
+  return v256_add_32(
+      s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t)));
+}
+
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+  v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s));
+  return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) +
+         v128_low_u32(v128_shr_n_byte(t, 8)) +
+         v128_low_u32(v128_shr_n_byte(t, 12));
+}
+
+typedef v256 ssd256_internal_s16;
+
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { return v256_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  v256 d = v256_sub_16(a, b);
+  d = v256_madd_s16(d, d);
+  return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()),
+                                    _mm256_unpacklo_epi32(d, v256_zero())));
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s));
+  return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t));
+}
+
 #endif
 
 #endif /* _V256_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
index 5c0042d8c..6ce53c6a9 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
@@ -14,7 +14,8 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include "./v64_intrinsics_c.h"
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
 
 /* Fallback to plain, unoptimised C. */
 
@@ -71,6 +72,8 @@ SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
 
 SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
 SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); }
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); }
 SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
 SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
 SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
@@ -100,6 +103,9 @@ SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); }
 SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
   return c_v64_pack_s32_s16(a, b);
 }
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+  return c_v64_pack_s32_u16(a, b);
+}
 SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
   return c_v64_pack_s16_u8(a, b);
 }
@@ -156,6 +162,7 @@ SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); }
 
 SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); }
 SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); }
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); }
 SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); }
 SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); }
 SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); }
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
index c7574eef5..267441b02 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
@@ -13,7 +13,8 @@
 #define _V64_INTRINSICS_H
 
 #include <arm_neon.h>
-#include "./v64_intrinsics_arm.h"
+
+#include "aom_dsp/simd/v64_intrinsics_arm.h"
 #include "aom_ports/arm.h"
 
 #ifdef AOM_INCOMPATIBLE_GCC
@@ -121,20 +122,34 @@ SIMD_INLINE v64 v64_dup_32(uint32_t x) {
 }
 
 SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
-  int64x2_t r = vpaddlq_s32(vpaddlq_s16(
+  int16x8_t t =
       vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
-                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
+                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))));
+#if defined(__aarch64__)
+  return vaddlvq_s16(t);
+#else
+  int64x2_t r = vpaddlq_s32(vpaddlq_s16(t));
   return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
+#endif
 }
 
 SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vaddlvq_s32(
+      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+#else
   int64x2_t r =
       vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
   return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
+#endif
 }
 
 SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
+#if defined(__aarch64__)
+  return vaddlv_u8(vreinterpret_u8_s64(x));
+#else
   return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+#endif
 }
 
 SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
@@ -145,34 +160,40 @@ typedef uint16x8_t sad64_internal;
 
 SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
 
-/* Implementation dependent return value.  Result must be finalised with
-   v64_sad_u8_sum().
-   The result for more than 32 v64_sad_u8() calls is undefined. */
+// Implementation dependent return value. Result must be finalised with
+// v64_sad_u8_sum().
 SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
   return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
 }
 
 SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
+#if defined(__aarch64__)
+  return vaddlvq_u16(s);
+#else
   uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
   return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
+#endif
 }
 
-typedef int64x1_t ssd64_internal;
+typedef uint32x4_t ssd64_internal;
 
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
-  return (ssd64_internal)(uint64_t)0;
-}
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); }
 
-/* Implementation dependent return value.  Result must be finalised with
- * v64_ssd_u8_sum(). */
+// Implementation dependent return value. Result must be finalised with
+// v64_ssd_u8_sum().
 SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
   uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
-  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t)));
-  return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
+  return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t)));
 }
 
 SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
-  return (uint32_t)(uint64_t)s;
+#if defined(__aarch64__)
+  return vaddvq_u32(s);
+#else
+  uint64x2_t t = vpaddlq_u32(s);
+  return vget_lane_u32(
+      vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
+#endif
 }
 
 SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
@@ -188,6 +209,16 @@ SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
       vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
 }
 
+SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) {
+  return vreinterpret_s64_u8(
+      vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) {
+  return vreinterpret_s64_s8(
+      vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
 SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
   return vreinterpret_s64_s16(
       vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
@@ -252,8 +283,14 @@ SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  int16x8_t t = vreinterpretq_s16_s32(
+      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+  return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t)));
+#else
   return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
       vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
+#endif
 }
 
 SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
@@ -269,10 +306,10 @@ SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
-  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(
-      vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)),
-                         vreinterpret_s8_s64(y)),
-                vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7)))));
+  int16x8_t t =
+      vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))),
+                vmovl_s8(vreinterpret_s8_s64(y)));
+  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t)));
 }
 
 SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
@@ -285,6 +322,11 @@ SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
       vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
 }
 
+SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) {
+  return vreinterpret_s64_u16(
+      vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
+}
+
 SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
   return vreinterpret_s64_u16(
       vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
@@ -321,33 +363,63 @@ SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
   return vreinterpret_s64_s16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
   return vreinterpret_s64_s16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u32(
+      vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
+#else
   int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
   return vreinterpret_s64_s32(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u32(
+      vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
+#else
   int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
   return vreinterpret_s64_s32(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
@@ -371,6 +443,11 @@ SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
       vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
 }
 
+SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) {
+  return vreinterpret_s64_u16(vqmovun_s32(
+      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
+}
+
 SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
   return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
       vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
@@ -382,23 +459,43 @@ SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
   return vreinterpret_s64_u16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
   return vreinterpret_s64_u16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
index 5032238b6..8158899cb 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
@@ -17,7 +17,8 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
 
 typedef union {
   uint8_t u8[8];
@@ -30,13 +31,17 @@ typedef union {
   int64_t s64;
 } c_v64;
 
-SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[CONFIG_BIG_ENDIAN]; }
+SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) {
+  return a.u32[!!CONFIG_BIG_ENDIAN];
+}
 
 SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
   return a.u32[!CONFIG_BIG_ENDIAN];
 }
 
-SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[CONFIG_BIG_ENDIAN]; }
+SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) {
+  return a.s32[!!CONFIG_BIG_ENDIAN];
+}
 
 SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
   return a.s32[!CONFIG_BIG_ENDIAN];
@@ -45,7 +50,7 @@ SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
 SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
   c_v64 t;
   t.u32[!CONFIG_BIG_ENDIAN] = x;
-  t.u32[CONFIG_BIG_ENDIAN] = y;
+  t.u32[!!CONFIG_BIG_ENDIAN] = y;
   return t;
 }
 
@@ -177,6 +182,30 @@ SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
   return t;
 }
 
+SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
+  c_v64 t;
+  int c;
+  for (c = 0; c < 8; c++)
+    t.u8[c] = (int16_t)a.u8[c] + (int16_t)b.u8[c] > 255
+                  ? 255
+                  : (int16_t)a.u8[c] + (int16_t)b.u8[c] < 0
+                        ? 0
+                        : (int16_t)a.u8[c] + (int16_t)b.u8[c];
+  return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
+  c_v64 t;
+  int c;
+  for (c = 0; c < 8; c++)
+    t.s8[c] = (int16_t)a.s8[c] + (int16_t)b.s8[c] > 127
+                  ? 127
+                  : (int16_t)a.s8[c] + (int16_t)b.s8[c] < -128
+                        ? -128
+                        : (int16_t)a.s8[c] + (int16_t)b.s8[c];
+  return t;
+}
+
 SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
@@ -206,8 +235,7 @@ SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
 SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
-  for (c = 0; c < 8; c++)
-    t.u8[c] = (int32_t)a.u8[c] - (int32_t)b.u8[c] < 0 ? 0 : a.u8[c] - b.u8[c];
+  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c];
   return t;
 }
 
@@ -459,6 +487,20 @@ SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
   return t;
 }
 
+SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
+  c_v64 t;
+  if (CONFIG_BIG_ENDIAN) {
+    c_v64 u = a;
+    a = b;
+    b = u;
+  }
+  t.u16[3] = a.s32[1] > 65535 ? 65535 : a.s32[1] < 0 ? 0 : a.s32[1];
+  t.u16[2] = a.s32[0] > 65535 ? 65535 : a.s32[0] < 0 ? 0 : a.s32[0];
+  t.u16[1] = b.s32[1] > 65535 ? 65535 : b.s32[1] < 0 ? 0 : b.s32[1];
+  t.u16[0] = b.s32[0] > 65535 ? 65535 : b.s32[0] < 0 ? 0 : b.s32[0];
+  return t;
+}
+
 SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
   c_v64 t;
   if (CONFIG_BIG_ENDIAN) {
@@ -670,6 +712,13 @@ SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
   return t;
 }
 
+SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) {
+  c_v64 t;
+  int c;
+  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1;
+  return t;
+}
+
 SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
index 8dcc9f6fc..130052ee1 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
@@ -90,8 +90,7 @@ SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
   _mm_storel_epi64((__m128i *)p, a);
 }
 
-// The following function requires an immediate.
-#if defined(__OPTIMIZE__) && __OPTIMIZE__
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
 #define v64_align(a, b, c) \
   ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
 #else
@@ -112,6 +111,10 @@ SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }
 
 SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }
 
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); }
+
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); }
+
 SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }
 
 SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }
@@ -170,6 +173,22 @@ SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
   return _mm_packs_epi32(t, t);
 }
 
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+  __m128i t = _mm_unpacklo_epi64(b, a);
+  return _mm_packus_epi32(t, t);
+#else
+  int32_t ah = v64_high_u32(a);
+  int32_t al = v64_low_u32(a);
+  int32_t bh = v64_high_u32(b);
+  int32_t bl = v64_low_u32(b);
+  return v64_from_16(ah > 65535 ? 65535 : ah < 0 ? 0 : ah,
+                     al > 65535 ? 65535 : al < 0 ? 0 : al,
+                     bh > 65535 ? 65535 : bh < 0 ? 0 : bh,
+                     bl > 65535 ? 65535 : bl < 0 ? 0 : bl);
+#endif
+}
+
 SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
   __m128i t = _mm_unpacklo_epi64(b, a);
   return _mm_packus_epi16(t, t);
@@ -272,14 +291,11 @@ SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
 }
 
 SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
-  __m128i r, r1, r2, z;
-  z = _mm_setzero_si128();
-  r1 = _mm_madd_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(a, z), 8),
-                      _mm_unpacklo_epi8(b, z));
-  r2 = _mm_srli_si128(r1, 8);
-  r = _mm_add_epi32(r1, r2);
-  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
-  return ((int32_t)v64_low_u32(r)) >> 8;
+  __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8),
+                             _mm_unpacklo_epi8(b, _mm_setzero_si128()));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+  return (int32_t)v64_low_u32(t);
 }
 
 SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
@@ -371,6 +387,11 @@ SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
                       _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
 }
 
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
+  return _mm_sub_epi16(_mm_avg_epu16(a, b),
+                       _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1)));
+}
+
 SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }
 
 SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }
diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c
index 6ae378ff2..6ce3d7acb 100644
--- a/third_party/aom/aom_dsp/ssim.c
+++ b/third_party/aom/aom_dsp/ssim.c
@@ -11,7 +11,9 @@
 
 #include <assert.h>
 #include <math.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/ssim.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
@@ -31,6 +33,7 @@ void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
     }
   }
 }
+
 void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
                           uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
                           uint32_t *sum_sq_r, uint32_t *sum_sxr) {
@@ -46,7 +49,6 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
                                  int rp, uint32_t *sum_s, uint32_t *sum_r,
                                  uint32_t *sum_sq_s, uint32_t *sum_sq_r,
@@ -62,7 +64,6 @@ void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 static const int64_t cc1 = 26634;        // (64^2*(.01*255)^2
 static const int64_t cc2 = 239708;       // (64^2*(.03*255)^2
@@ -108,7 +109,6 @@ static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
   return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
 }
 
-#if CONFIG_HIGHBITDEPTH
 static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
                               int rp, uint32_t bd, uint32_t shift) {
   uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
@@ -117,7 +117,6 @@ static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
   return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
                     sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 // We are using a 8x8 moving window with starting location of each 8x8 window
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
@@ -142,7 +141,6 @@ static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
   return ssim_total;
 }
 
-#if CONFIG_HIGHBITDEPTH
 static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
                                int stride_img1, int stride_img2, int width,
                                int height, uint32_t bd, uint32_t shift) {
@@ -164,7 +162,6 @@ static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
   ssim_total /= samples;
   return ssim_total;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
                      const YV12_BUFFER_CONFIG *dest, double *weight) {
@@ -422,7 +419,6 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
   return inconsistency_total;
 }
 
-#if CONFIG_HIGHBITDEPTH
 double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                             const YV12_BUFFER_CONFIG *dest, double *weight,
                             uint32_t bd, uint32_t in_bd) {
@@ -441,4 +437,3 @@ double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
   *weight = 1;
   return abc[0] * .8 + .1 * (abc[1] + abc[2]);
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h
index 902735e50..c8a389dfe 100644
--- a/third_party/aom/aom_dsp/ssim.h
+++ b/third_party/aom/aom_dsp/ssim.h
@@ -18,7 +18,8 @@
 extern "C" {
 #endif
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_scale/yv12config.h"
 
 // metrics used for calculating ssim, ssim2, dssim, and ssimc
@@ -75,11 +76,9 @@ double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
                          double *ssim_u, double *ssim_v, uint32_t bd,
                          uint32_t in_bd);
 
-#if CONFIG_HIGHBITDEPTH
 double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                             const YV12_BUFFER_CONFIG *dest, double *weight,
                             uint32_t bd, uint32_t in_bd);
-#endif  // CONFIG_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c
index 8dda96efb..2f6da96e5 100644
--- a/third_party/aom/aom_dsp/subtract.c
+++ b/third_party/aom/aom_dsp/subtract.c
@@ -11,8 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
@@ -32,7 +32,6 @@ void aom_subtract_block_c(int rows, int cols, int16_t *diff,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
                                  ptrdiff_t diff_stride, const uint8_t *src8,
                                  ptrdiff_t src_stride, const uint8_t *pred8,
@@ -52,4 +51,3 @@ void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
     src += src_stride;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c
index b9155fdc0..44ec41f2e 100644
--- a/third_party/aom/aom_dsp/sum_squares.c
+++ b/third_party/aom/aom_dsp/sum_squares.c
@@ -11,7 +11,7 @@
 
 #include <assert.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width,
                                   int height) {
diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h
index ef9e9bc98..7deb0aea3 100644
--- a/third_party/aom/aom_dsp/txfm_common.h
+++ b/third_party/aom/aom_dsp/txfm_common.h
@@ -28,25 +28,11 @@ typedef struct txfm_param {
   TX_SIZE tx_size;
   int lossless;
   int bd;
-#if CONFIG_MRC_TX || CONFIG_LGT
-  int is_inter;
-#endif  // CONFIG_MRC_TX || CONFIG_LGT
-#if CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED
-  int stride;
-  uint8_t *dst;
-#if CONFIG_MRC_TX
-  int *valid_mask;
-  uint8_t *mask;
-#endif  // CONFIG_MRC_TX
-#if CONFIG_LGT_FROM_PRED
-  int mode;
-  int use_lgt;
-#endif  // CONFIG_LGT_FROM_PRED
-#endif  // CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED
-// for inverse transforms only
-#if CONFIG_ADAPT_SCAN
-  const int16_t *eob_threshold;
-#endif
+  // are the pixel buffers octets or shorts?  This should collapse to
+  // bd==8 implies !is_hbd, but that's not certain right now.
+  int is_hbd;
+  TxSetType tx_set_type;
+  // for inverse transforms only
   int eob;
 } TxfmParam;
 
@@ -102,647 +88,4 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
   return rv;
 }
 
-#if CONFIG_LGT_FROM_PRED
-// Use negative numbers so they do not coincide with lgt*[0][0], which are
-// always nonnegative.
-typedef enum {
-  DCT4 = -1,
-  ADST4 = -2,
-  DCT8 = -3,
-  ADST8 = -4,
-  DCT16 = -5,
-  ADST16 = -6,
-  DCT32 = -7,
-  ADST32 = -8,
-} ButterflyLgt;
-
-/* These are some LGTs already implementated in the codec. When any of them
- * is chosen, the flgt or ilgt function will call the existing fast
- * transform instead of the matrix product implementation. Thus, we
- * do not need the actual basis functions here */
-static const tran_high_t lgt4_000[1][1] = { { (tran_high_t)DCT4 } };
-static const tran_high_t lgt4_100[1][1] = { { (tran_high_t)ADST4 } };
-static const tran_high_t lgt8_000[1][1] = { { (tran_high_t)DCT8 } };
-static const tran_high_t lgt8_200[1][1] = { { (tran_high_t)ADST8 } };
-static const tran_high_t lgt16_000[1][1] = { { (tran_high_t)DCT16 } };
-static const tran_high_t lgt16_200[1][1] = { { (tran_high_t)ADST16 } };
-static const tran_high_t lgt32_000[1][1] = { { (tran_high_t)DCT32 } };
-static const tran_high_t lgt32_200[1][1] = { { (tran_high_t)ADST32 } };
-
-/* The Line Graph Transforms (LGTs) matrices are written as follows.
-   Each 2D array is sqrt(2)*16384 times an LGT matrix, which is the
-   matrix of eigenvectors of the graph Laplacian matrix of the associated
-   line graph. Some of those transforms have fast algorithms but not
-   implemented yet for now. */
-
-// LGT4 name: lgt4_150_000w3
-// Self loops: 1.500, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000
-static const tran_high_t lgt4_150_000w3[4][4] = {
-  { 0, 0, 0, 23170 },
-  { 5991, 13537, 17825, 0 },
-  { 15515, 10788, -13408, 0 },
-  { 16133, -15403, 6275, 0 },
-};
-
-// LGT4 name: lgt4_100_000w3
-// Self loops: 1.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000
-static const tran_high_t lgt4_100_000w3[4][4] = {
-  { 0, 0, 0, 23170 },
-  { 7600, 13694, 17076, 0 },
-  { 17076, 7600, -13694, 0 },
-  { 13694, -17076, 7600, 0 },
-};
-
-// LGT4 name: lgt4_060_000w3
-// Self loops: 0.600, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000
-static const tran_high_t lgt4_060_000w3[4][4] = {
-  { 0, 0, 0, 23170 },
-  { 9449, 13755, 16075, 0 },
-  { 17547, 4740, -14370, 0 },
-  { 11819, -18034, 8483, 0 },
-};
-
-// LGT4 name: lgt4_000w3
-// Self loops: 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000
-static const tran_high_t lgt4_000w3[4][4] = {
-  { 0, 0, 0, 23170 },
-  { 13377, 13377, 13377, 0 },
-  { 16384, 0, -16384, 0 },
-  { 9459, -18919, 9459, 0 },
-};
-
-// LGT4 name: lgt4_150_000w2
-// Self loops: 1.500, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000
-static const tran_high_t lgt4_150_000w2[4][4] = {
-  { 10362, 20724, 0, 0 },
-  { 20724, -10362, 0, 0 },
-  { 0, 0, 16384, 16384 },
-  { 0, 0, 16384, -16384 },
-};
-
-// LGT4 name: lgt4_100_000w2
-// Self loops: 1.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000
-static const tran_high_t lgt4_100_000w2[4][4] = {
-  { 12181, 19710, 0, 0 },
-  { 19710, -12181, 0, 0 },
-  { 0, 0, 16384, 16384 },
-  { 0, 0, 16384, -16384 },
-};
-
-// LGT4 name: lgt4_060_000w2
-// Self loops: 0.600, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000
-static const tran_high_t lgt4_060_000w2[4][4] = {
-  { 13831, 18590, 0, 0 },
-  { 18590, -13831, 0, 0 },
-  { 0, 0, 16384, 16384 },
-  { 0, 0, 16384, -16384 },
-};
-
-// LGT4 name: lgt4_000w2
-// Self loops: 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000
-static const tran_high_t lgt4_000w2[4][4] = {
-  { 16384, 16384, 0, 0 },
-  { 16384, -16384, 0, 0 },
-  { 0, 0, 16384, 16384 },
-  { 0, 0, 16384, -16384 },
-};
-
-// LGT4 name: lgt4_150_000w1
-// Self loops: 1.500, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000
-static const tran_high_t lgt4_150_000w1[4][4] = {
-  { 23170, 0, 0, 0 },
-  { 0, 13377, 13377, 13377 },
-  { 0, 16384, 0, -16384 },
-  { 0, 9459, -18919, 9459 },
-};
-
-// LGT4 name: lgt4_100_000w1
-// Self loops: 1.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000
-static const tran_high_t lgt4_100_000w1[4][4] = {
-  { 23170, 0, 0, 0 },
-  { 0, 13377, 13377, 13377 },
-  { 0, 16384, 0, -16384 },
-  { 0, 9459, -18919, 9459 },
-};
-
-// LGT4 name: lgt4_060_000w1
-// Self loops: 0.600, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000
-static const tran_high_t lgt4_060_000w1[4][4] = {
-  { 23170, 0, 0, 0 },
-  { 0, 13377, 13377, 13377 },
-  { 0, 16384, 0, -16384 },
-  { 0, 9459, -18919, 9459 },
-};
-
-// LGT4 name: lgt4_000w1
-// Self loops: 0.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000
-static const tran_high_t lgt4_000w1[4][4] = {
-  { 23170, 0, 0, 0 },
-  { 0, 13377, 13377, 13377 },
-  { 0, 16384, 0, -16384 },
-  { 0, 9459, -18919, 9459 },
-};
-
-// LGT4 name: lgt4_060
-// Self loops: 0.600, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000
-static const tran_high_t lgt4_060[4][4] = {
-  { 6971, 10504, 13060, 14400 },
-  { 14939, 11211, -2040, -13559 },
-  { 14096, -8258, -12561, 10593 },
-  { 8150, -15253, 14295, -5784 },
-};
-
-// LGT4 name: lgt4_150
-// Self loops: 1.500, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000
-static const tran_high_t lgt4_150[4][4] = {
-  { 3998, 9435, 13547, 15759 },
-  { 11106, 15105, 1886, -13483 },
-  { 15260, -1032, -14674, 9361 },
-  { 12833, -14786, 11596, -4372 },
-};
-
-// LGT8 name: lgt8_150_000w7
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
-static const tran_high_t lgt8_150_000w7[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 0, 32768 },
-  { 2522, 6185, 9551, 12461, 14775, 16381, 17204, 0 },
-  { 7390, 15399, 16995, 11515, 1240, -9551, -16365, 0 },
-  { 11716, 16625, 3560, -13353, -15831, -1194, 14733, 0 },
-  { 15073, 8866, -14291, -10126, 13398, 11308, -12401, 0 },
-  { 16848, -4177, -13724, 14441, 2923, -16628, 9513, 0 },
-  { 15942, -14888, 5405, 7137, -15640, 15288, -6281, 0 },
-  { 10501, -14293, 16099, -15670, 13063, -8642, 3021, 0 },
-};
-
-// LGT8 name: lgt8_100_000w7
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
-static const tran_high_t lgt8_100_000w7[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 0, 32768 },
-  { 3518, 6883, 9946, 12575, 14654, 16093, 16829, 0 },
-  { 9946, 16093, 16093, 9946, 0, -9946, -16093, 0 },
-  { 14654, 14654, 0, -14654, -14654, 0, 14654, 0 },
-  { 16829, 3518, -16093, -6883, 14654, 9946, -12575, 0 },
-  { 16093, -9946, -9946, 16093, 0, -16093, 9946, 0 },
-  { 12575, -16829, 9946, 3518, -14654, 16093, -6883, 0 },
-  { 6883, -12575, 16093, -16829, 14654, -9946, 3518, 0 },
-};
-
-// LGT8 name: lgt8_060_000w7
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
-static const tran_high_t lgt8_060_000w7[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 0, 32768 },
-  { 5087, 7951, 10521, 12701, 14411, 15587, 16186, 0 },
-  { 13015, 16486, 14464, 7621, -1762, -10557, -15834, 0 },
-  { 16581, 11475, -4050, -15898, -13311, 1362, 14798, 0 },
-  { 16536, -1414, -16981, -3927, 15746, 8879, -12953, 0 },
-  { 14104, -13151, -7102, 16932, -1912, -15914, 10385, 0 },
-  { 10156, -17168, 11996, 1688, -14174, 16602, -7249, 0 },
-  { 5295, -11721, 15961, -17224, 15274, -10476, 3723, 0 },
-};
-
-// LGT8 name: lgt8_000w7
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
-static const tran_high_t lgt8_000w7[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 0, 32768 },
-  { 12385, 12385, 12385, 12385, 12385, 12385, 12385, 0 },
-  { 17076, 13694, 7600, 0, -7600, -13694, -17076, 0 },
-  { 15781, 3898, -10921, -17515, -10921, 3898, 15781, 0 },
-  { 13694, -7600, -17076, 0, 17076, 7600, -13694, 0 },
-  { 10921, -15781, -3898, 17515, -3898, -15781, 10921, 0 },
-  { 7600, -17076, 13694, 0, -13694, 17076, -7600, 0 },
-  { 3898, -10921, 15781, -17515, 15781, -10921, 3898, 0 },
-};
-
-// LGT8 name: lgt8_150_000w6
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
-static const tran_high_t lgt8_150_000w6[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
-  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
-  { 3157, 7688, 11723, 15002, 17312, 18506, 0, 0 },
-  { 9167, 17832, 16604, 6164, -7696, -17286, 0, 0 },
-  { 14236, 15584, -4969, -18539, -6055, 14938, 0, 0 },
-  { 17558, 1891, -18300, 5288, 16225, -11653, 0, 0 },
-  { 17776, -13562, -647, 14380, -17514, 7739, 0, 0 },
-  { 12362, -16318, 17339, -15240, 10399, -3688, 0, 0 },
-};
-
-// LGT8 name: lgt8_100_000w6
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
-static const tran_high_t lgt8_100_000w6[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
-  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
-  { 4350, 8447, 12053, 14959, 16995, 18044, 0, 0 },
-  { 12053, 18044, 14959, 4350, -8447, -16995, 0, 0 },
-  { 16995, 12053, -8447, -18044, -4350, 14959, 0, 0 },
-  { 18044, -4350, -16995, 8447, 14959, -12053, 0, 0 },
-  { 14959, -16995, 4350, 12053, -18044, 8447, 0, 0 },
-  { 8447, -14959, 18044, -16995, 12053, -4350, 0, 0 },
-};
-
-// LGT8 name: lgt8_060_000w6
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
-static const tran_high_t lgt8_060_000w6[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
-  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
-  { 6154, 9551, 12487, 14823, 16446, 17277, 0, 0 },
-  { 15149, 17660, 12503, 1917, -9502, -16795, 0, 0 },
-  { 18166, 7740, -11772, -17465, -2656, 15271, 0, 0 },
-  { 16682, -8797, -15561, 10779, 14189, -12586, 0, 0 },
-  { 12436, -18234, 7007, 10763, -18483, 8945, 0, 0 },
-  { 6591, -14172, 18211, -17700, 12766, -4642, 0, 0 },
-};
-
-// LGT8 name: lgt8_000w6
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
-static const tran_high_t lgt8_000w6[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
-  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
-  { 13377, 13377, 13377, 13377, 13377, 13377, 0, 0 },
-  { 18274, 13377, 4896, -4896, -13377, -18274, 0, 0 },
-  { 16384, 0, -16384, -16384, 0, 16384, 0, 0 },
-  { 13377, -13377, -13377, 13377, 13377, -13377, 0, 0 },
-  { 9459, -18919, 9459, 9459, -18919, 9459, 0, 0 },
-  { 4896, -13377, 18274, -18274, 13377, -4896, 0, 0 },
-};
-
-// LGT8 name: lgt8_150_000w5
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
-static const tran_high_t lgt8_150_000w5[8][8] = {
-  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
-  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
-  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
-  { 4109, 9895, 14774, 18299, 20146, 0, 0, 0 },
-  { 11753, 20300, 13161, -4148, -18252, 0, 0, 0 },
-  { 17573, 10921, -16246, -12895, 14679, 0, 0, 0 },
-  { 19760, -9880, -9880, 19760, -9880, 0, 0, 0 },
-  { 14815, -18624, 17909, -12844, 4658, 0, 0, 0 },
-};
-
-// LGT8 name: lgt8_100_000w5
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
-static const tran_high_t lgt8_100_000w5[8][8] = {
-  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
-  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
-  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
-  { 5567, 10683, 14933, 17974, 19559, 0, 0, 0 },
-  { 14933, 19559, 10683, -5567, -17974, 0, 0, 0 },
-  { 19559, 5567, -17974, -10683, 14933, 0, 0, 0 },
-  { 17974, -14933, -5567, 19559, -10683, 0, 0, 0 },
-  { 10683, -17974, 19559, -14933, 5567, 0, 0, 0 },
-};
-
-// LGT8 name: lgt8_060_000w5
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
-static const tran_high_t lgt8_060_000w5[8][8] = {
-  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
-  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
-  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
-  { 7650, 11741, 15069, 17415, 18628, 0, 0, 0 },
-  { 17824, 18002, 7558, -7345, -17914, 0, 0, 0 },
-  { 19547, 569, -19303, -8852, 15505, 0, 0, 0 },
-  { 15592, -17548, -2862, 19625, -11374, 0, 0, 0 },
-  { 8505, -17423, 20218, -15907, 6006, 0, 0, 0 },
-};
-
-// LGT8 name: lgt8_000w5
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
-static const tran_high_t lgt8_000w5[8][8] = {
-  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
-  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
-  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
-  { 14654, 14654, 14654, 14654, 14654, 0, 0, 0 },
-  { 19710, 12181, 0, -12181, -19710, 0, 0, 0 },
-  { 16766, -6404, -20724, -6404, 16766, 0, 0, 0 },
-  { 12181, -19710, 0, 19710, -12181, 0, 0, 0 },
-  { 6404, -16766, 20724, -16766, 6404, 0, 0, 0 },
-};
-
-// LGT8 name: lgt8_150_000w4
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_150_000w4[8][8] = {
-  { 5655, 13343, 19159, 22286, 0, 0, 0, 0 },
-  { 15706, 21362, 2667, -19068, 0, 0, 0, 0 },
-  { 21580, -1459, -20752, 13238, 0, 0, 0, 0 },
-  { 18148, -20910, 16399, -6183, 0, 0, 0, 0 },
-  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
-  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
-  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
-  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
-};
-
-// LGT8 name: lgt8_100_000w4
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_100_000w4[8][8] = {
-  { 7472, 14042, 18919, 21513, 0, 0, 0, 0 },
-  { 18919, 18919, 0, -18919, 0, 0, 0, 0 },
-  { 21513, -7472, -18919, 14042, 0, 0, 0, 0 },
-  { 14042, -21513, 18919, -7472, 0, 0, 0, 0 },
-  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
-  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
-  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
-  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
-};
-
-// LGT8 name: lgt8_060_000w4
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_060_000w4[8][8] = {
-  { 9858, 14855, 18470, 20365, 0, 0, 0, 0 },
-  { 21127, 15855, -2886, -19175, 0, 0, 0, 0 },
-  { 19935, -11679, -17764, 14980, 0, 0, 0, 0 },
-  { 11525, -21570, 20217, -8180, 0, 0, 0, 0 },
-  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
-  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
-  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
-  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
-};
-
-// LGT8 name: lgt8_000w4
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_000w4[8][8] = {
-  { 16384, 16384, 16384, 16384, 0, 0, 0, 0 },
-  { 21407, 8867, -8867, -21407, 0, 0, 0, 0 },
-  { 16384, -16384, -16384, 16384, 0, 0, 0, 0 },
-  { 8867, -21407, 21407, -8867, 0, 0, 0, 0 },
-  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
-  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
-  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
-  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
-};
-
-// LGT8 name: lgt8_150_000w3
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_150_000w3[8][8] = {
-  { 8473, 19144, 25209, 0, 0, 0, 0, 0 },
-  { 21942, 15257, -18961, 0, 0, 0, 0, 0 },
-  { 22815, -21783, 8874, 0, 0, 0, 0, 0 },
-  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
-  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
-  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
-  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
-  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
-};
-
-// LGT8 name: lgt8_100_000w3
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_100_000w3[8][8] = {
-  { 10747, 19366, 24149, 0, 0, 0, 0, 0 },
-  { 24149, 10747, -19366, 0, 0, 0, 0, 0 },
-  { 19366, -24149, 10747, 0, 0, 0, 0, 0 },
-  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
-  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
-  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
-  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
-  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
-};
-
-// LGT8 name: lgt8_060_000w3
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_060_000w3[8][8] = {
-  { 13363, 19452, 22733, 0, 0, 0, 0, 0 },
-  { 24815, 6704, -20323, 0, 0, 0, 0, 0 },
-  { 16715, -25503, 11997, 0, 0, 0, 0, 0 },
-  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
-  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
-  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
-  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
-  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
-};
-
-// LGT8 name: lgt8_000w3
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_000w3[8][8] = {
-  { 18919, 18919, 18919, 0, 0, 0, 0, 0 },
-  { 23170, 0, -23170, 0, 0, 0, 0, 0 },
-  { 13377, -26755, 13377, 0, 0, 0, 0, 0 },
-  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
-  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
-  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
-  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
-  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
-};
-
-// LGT8 name: lgt8_150_000w2
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_150_000w2[8][8] = {
-  { 14654, 29309, 0, 0, 0, 0, 0, 0 },
-  { 29309, -14654, 0, 0, 0, 0, 0, 0 },
-  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
-  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
-  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
-  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
-  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
-  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
-};
-
-// LGT8 name: lgt8_100_000w2
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_100_000w2[8][8] = {
-  { 17227, 27874, 0, 0, 0, 0, 0, 0 },
-  { 27874, -17227, 0, 0, 0, 0, 0, 0 },
-  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
-  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
-  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
-  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
-  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
-  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
-};
-
-// LGT8 name: lgt8_060_000w2
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_060_000w2[8][8] = {
-  { 19560, 26290, 0, 0, 0, 0, 0, 0 },
-  { 26290, -19560, 0, 0, 0, 0, 0, 0 },
-  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
-  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
-  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
-  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
-  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
-  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
-};
-
-// LGT8 name: lgt8_000w2
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_000w2[8][8] = {
-  { 23170, 23170, 0, 0, 0, 0, 0, 0 },
-  { 23170, -23170, 0, 0, 0, 0, 0, 0 },
-  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
-  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
-  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
-  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
-  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
-  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
-};
-
-// LGT8 name: lgt8_150_000w1
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_150_000w1[8][8] = {
-  { 32768, 0, 0, 0, 0, 0, 0, 0 },
-  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
-  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
-  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
-  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
-  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
-  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
-  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
-};
-
-// LGT8 name: lgt8_100_000w1
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_100_000w1[8][8] = {
-  { 32768, 0, 0, 0, 0, 0, 0, 0 },
-  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
-  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
-  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
-  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
-  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
-  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
-  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
-};
-
-// LGT8 name: lgt8_060_000w1
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_060_000w1[8][8] = {
-  { 32768, 0, 0, 0, 0, 0, 0, 0 },
-  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
-  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
-  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
-  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
-  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
-  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
-  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
-};
-
-// LGT8 name: lgt8_000w1
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_000w1[8][8] = {
-  { 32768, 0, 0, 0, 0, 0, 0, 0 },
-  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
-  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
-  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
-  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
-  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
-  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
-  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
-};
-
-// LGT8 name: lgt8_060
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_060[8][8] = {
-  { 4295, 6746, 8999, 10987, 12653, 13947, 14832, 15280 },
-  { 11303, 15101, 14912, 10786, 3812, -4168, -11047, -15010 },
-  { 15051, 13208, 1823, -10879, -15721, -9207, 3959, 14265 },
-  { 15871, 3800, -13441, -12395, 5516, 15922, 4665, -12939 },
-  { 14630, -7269, -13926, 8618, 13091, -9886, -12133, 11062 },
-  { 12008, -14735, 180, 14586, -12245, -4458, 15932, -8720 },
-  { 8472, -15623, 14088, -4721, -7272, 15221, -14708, 6018 },
-  { 4372, -9862, 13927, -15981, 15727, -13202, 8770, -3071 },
-};
-
-// LGT8 name: lgt8_100
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_100[8][8] = {
-  { 2921, 5742, 8368, 10708, 12684, 14228, 15288, 15827 },
-  { 8368, 14228, 15827, 12684, 5742, -2921, -10708, -15288 },
-  { 12684, 15288, 5742, -8368, -15827, -10708, 2921, 14228 },
-  { 15288, 8368, -10708, -14228, 2921, 15827, 5742, -12684 },
-  { 15827, -2921, -15288, 5742, 14228, -8368, -12684, 10708 },
-  { 14228, -12684, -2921, 15288, -10708, -5742, 15827, -8368 },
-  { 10708, -15827, 12684, -2921, -8368, 15288, -14228, 5742 },
-  { 5742, -10708, 14228, -15827, 15288, -12684, 8368, -2921 },
-};
-#endif  // CONFIG_LGT_FROM_PRED
-
-#if CONFIG_LGT || CONFIG_LGT_FROM_PRED
-// LGT4 name: lgt4_170
-// Self loops: 1.700, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000
-static const tran_high_t lgt4_170[4][4] = {
-  { 3636, 9287, 13584, 15902 },
-  { 10255, 15563, 2470, -13543 },
-  { 14786, 711, -15249, 9231 },
-  { 14138, -14420, 10663, -3920 },
-};
-
-// LGT4 name: lgt4_140
-// Self loops: 1.400, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000
-static const tran_high_t lgt4_140[4][4] = {
-  { 4206, 9518, 13524, 15674 },
-  { 11552, 14833, 1560, -13453 },
-  { 15391, -1906, -14393, 9445 },
-  { 12201, -14921, 12016, -4581 },
-};
-
-// LGT8 name: lgt8_170
-// Self loops: 1.700, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_170[8][8] = {
-  { 1858, 4947, 7850, 10458, 12672, 14411, 15607, 16217 },
-  { 5494, 13022, 16256, 14129, 7343, -1864, -10456, -15601 },
-  { 8887, 16266, 9500, -5529, -15749, -12273, 1876, 14394 },
-  { 11870, 13351, -6199, -15984, -590, 15733, 7273, -12644 },
-  { 14248, 5137, -15991, 291, 15893, -5685, -13963, 10425 },
-  { 15716, -5450, -10010, 15929, -6665, -8952, 16036, -7835 },
-  { 15533, -13869, 6559, 3421, -12009, 15707, -13011, 5018 },
-  { 11357, -13726, 14841, -14600, 13025, -10259, 6556, -2254 },
-};
-
-// LGT8 name: lgt8_150
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_150[8][8] = {
-  { 2075, 5110, 7958, 10511, 12677, 14376, 15544, 16140 },
-  { 6114, 13307, 16196, 13845, 7015, -2084, -10509, -15534 },
-  { 9816, 16163, 8717, -6168, -15790, -11936, 2104, 14348 },
-  { 12928, 12326, -7340, -15653, 242, 15763, 6905, -12632 },
-  { 15124, 3038, -16033, 1758, 15507, -6397, -13593, 10463 },
-  { 15895, -7947, -7947, 15895, -7947, -7947, 15895, -7947 },
-  { 14325, -15057, 9030, 1050, -10659, 15483, -13358, 5236 },
-  { 9054, -12580, 14714, -15220, 14043, -11312, 7330, -2537 },
-};
-#endif  // CONFIG_LGT || CONFIG_LGT_FROM_PRED
 #endif  // AOM_DSP_TXFM_COMMON_H_
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
index 3c99aa155..d367905bc 100644
--- a/third_party/aom/aom_dsp/variance.c
+++ b/third_party/aom/aom_dsp/variance.c
@@ -8,22 +8,24 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include <assert.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
-#include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
 
-#include "aom_dsp/variance.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/blend.h"
+#include "aom_dsp/variance.h"
 
-#include "./av1_rtcd.h"
 #include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
 
 uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride) {
@@ -106,12 +108,12 @@ uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
 // It defines the offset required to move from one input to the next.
-static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
-                                              unsigned int src_pixels_per_line,
-                                              int pixel_step,
-                                              unsigned int output_height,
-                                              unsigned int output_width,
-                                              const uint8_t *filter) {
+void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
+                                             unsigned int src_pixels_per_line,
+                                             unsigned int pixel_step,
+                                             unsigned int output_height,
+                                             unsigned int output_width,
+                                             const uint8_t *filter) {
   unsigned int i, j;
 
   for (i = 0; i < output_height; ++i) {
@@ -136,12 +138,12 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
 // filter is applied horizontally (pixel_step = 1) or vertically
 // (pixel_step = stride). It defines the offset required to move from one input
 // to the next. Output is 8-bit.
-static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
-                                               unsigned int src_pixels_per_line,
-                                               unsigned int pixel_step,
-                                               unsigned int output_height,
-                                               unsigned int output_width,
-                                               const uint8_t *filter) {
+void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
+                                              unsigned int src_pixels_per_line,
+                                              unsigned int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const uint8_t *filter) {
   unsigned int i, j;
 
   for (i = 0; i < output_height; ++i) {
@@ -165,38 +167,55 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
     return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
   }
 
-#define SUBPIX_VAR(W, H)                                                \
-  uint32_t aom_sub_pixel_variance##W##x##H##_c(                         \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
-      const uint8_t *b, int b_stride, uint32_t *sse) {                  \
-    uint16_t fdata3[(H + 1) * W];                                       \
-    uint8_t temp2[H * W];                                               \
-                                                                        \
-    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                      bilinear_filters_2t[xoffset]);    \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
-                                       bilinear_filters_2t[yoffset]);   \
-                                                                        \
-    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);       \
+#define SUBPIX_VAR(W, H)                                                      \
+  uint32_t aom_sub_pixel_variance##W##x##H##_c(                               \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
+      const uint8_t *b, int b_stride, uint32_t *sse) {                        \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint8_t temp2[H * W];                                                     \
+                                                                              \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]);    \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
+                                             bilinear_filters_2t[yoffset]);   \
+                                                                              \
+    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);             \
   }
 
-#define SUBPIX_AVG_VAR(W, H)                                            \
-  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                     \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
-      const uint8_t *b, int b_stride, uint32_t *sse,                    \
-      const uint8_t *second_pred) {                                     \
-    uint16_t fdata3[(H + 1) * W];                                       \
-    uint8_t temp2[H * W];                                               \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                         \
-                                                                        \
-    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                      bilinear_filters_2t[xoffset]);    \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
-                                       bilinear_filters_2t[yoffset]);   \
-                                                                        \
-    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);              \
-                                                                        \
-    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);       \
+#define SUBPIX_AVG_VAR(W, H)                                                  \
+  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                           \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
+      const uint8_t *b, int b_stride, uint32_t *sse,                          \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint8_t temp2[H * W];                                                     \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
+                                                                              \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]);    \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
+                                             bilinear_filters_2t[yoffset]);   \
+                                                                              \
+    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                    \
+                                                                              \
+    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);             \
+  }                                                                           \
+  uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_c(                       \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
+      const uint8_t *b, int b_stride, uint32_t *sse,                          \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint8_t temp2[H * W];                                                     \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
+                                                                              \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]);    \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
+                                             bilinear_filters_2t[yoffset]);   \
+                                                                              \
+    aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param);     \
+                                                                              \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                 \
   }
 
 /* Identical to the variance call except it takes an additional parameter, sum,
@@ -229,11 +248,9 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
   SUBPIX_VAR(W, H)      \
   SUBPIX_AVG_VAR(W, H)
 
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
 VARIANCES(128, 128)
 VARIANCES(128, 64)
 VARIANCES(64, 128)
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 VARIANCES(64, 64)
 VARIANCES(64, 32)
 VARIANCES(32, 64)
@@ -250,19 +267,12 @@ VARIANCES(4, 4)
 VARIANCES(4, 2)
 VARIANCES(2, 4)
 VARIANCES(2, 2)
-
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
 VARIANCES(4, 16)
 VARIANCES(16, 4)
 VARIANCES(8, 32)
 VARIANCES(32, 8)
 VARIANCES(16, 64)
 VARIANCES(64, 16)
-#if CONFIG_EXT_PARTITION
-VARIANCES(32, 128)
-VARIANCES(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
 
 GET_VAR(16, 16)
 GET_VAR(8, 8)
@@ -288,61 +298,142 @@ void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
 }
 
 // Get pred block from up-sampled reference.
-void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height,
+void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                          int mi_row, int mi_col, const MV *const mv,
+                          uint8_t *comp_pred, int width, int height,
                           int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
                           int ref_stride) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in
+      // build_inter_predictors() function, with some small tweaks.
+
+      // Some assumptions.
+      const int plane = 0;
+
+      // Get pre-requisites.
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int ssx = pd->subsampling_x;
+      const int ssy = pd->subsampling_y;
+      assert(ssx == 0 && ssy == 0);
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+
+      // Calculate subpel_x/y and x/y_step.
+      const int row_start = 0;  // Because ss_y is 0.
+      const int col_start = 0;  // Because ss_x is 0.
+      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+      int orig_pos_y = pre_y << SUBPEL_BITS;
+      orig_pos_y += mv->row * (1 << (1 - ssy));
+      int orig_pos_x = pre_x << SUBPEL_BITS;
+      orig_pos_x += mv->col * (1 << (1 - ssx));
+      int pos_y = sf->scale_value_y(orig_pos_y, sf);
+      int pos_x = sf->scale_value_x(orig_pos_x, sf);
+      pos_x += SCALE_EXTRA_OFF;
+      pos_y += SCALE_EXTRA_OFF;
+
+      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                         << SCALE_SUBPEL_BITS;
+      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+                        << SCALE_SUBPEL_BITS;
+      pos_y = clamp(pos_y, top, bottom);
+      pos_x = clamp(pos_x, left, right);
+
+      const uint8_t *const pre =
+          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+
+      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                           pos_x & SCALE_SUBPEL_MASK,
+                                           pos_y & SCALE_SUBPEL_MASK };
+
+      // Get warp types.
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mi->ref_frame[ref_num]];
+      const int is_global = is_global_mv_block(mi, wm->wmtype);
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global;
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      // Get convolve parameters.
+      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      const InterpFilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Get the inter predictor.
+      const int build_for_obmc = 0;
+      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
+                               &subpel_params, sf, width, height, &conv_params,
+                               filters, &warp_types, mi_x >> pd->subsampling_x,
+                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
+                               build_for_obmc, xd, cm->allow_warped_motion);
+
+      return;
+    }
+  }
+
+  const InterpFilterParams filter =
+      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
   if (!subpel_x_q3 && !subpel_y_q3) {
-    int i;
-    for (i = 0; i < height; i++) {
+    for (int i = 0; i < height; i++) {
       memcpy(comp_pred, ref, width * sizeof(*comp_pred));
       comp_pred += width;
       ref += ref_stride;
     }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+                        width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+                       width, height);
   } else {
-    InterpFilterParams filter;
-    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-    if (!subpel_y_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      /*Directly call C version to allow this to work for small (2x2) sizes.*/
-      aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
-                            -1, width, height);
-    } else if (!subpel_x_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      /*Directly call C version to allow this to work for small (2x2) sizes.*/
-      aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
-                           16, width, height);
-    } else {
-      DECLARE_ALIGNED(16, uint8_t,
-                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-      const int16_t *kernel_x;
-      const int16_t *kernel_y;
-      int intermediate_height;
-      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      intermediate_height =
-          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
-      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-      /*Directly call C versions to allow this to work for small (2x2) sizes.*/
-      aom_convolve8_horiz_c(ref - ref_stride * ((filter.taps >> 1) - 1),
-                            ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL,
-                            -1, width, intermediate_height);
-      aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
-                           MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y,
-                           16, width, height);
-    }
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
+                        temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                        intermediate_height);
+    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+                       width, height);
   }
 }
 
-void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                   int mi_row, int mi_col, const MV *const mv,
+                                   uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, int subpel_x_q3,
                                    int subpel_y_q3, const uint8_t *ref,
                                    int ref_stride) {
   int i, j;
 
-  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
-                     ref_stride);
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
       comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
@@ -352,26 +443,68 @@ void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
-static void highbd_variance64(const uint8_t *a8, int a_stride,
-                              const uint8_t *b8, int b_stride, int w, int h,
-                              uint64_t *sse, int64_t *sum) {
+void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+                             int height, const uint8_t *ref, int ref_stride,
+                             const JNT_COMP_PARAMS *jcp_param) {
   int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
 
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint8_t)tmp;
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
 
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
+void aom_jnt_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+  int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint8_t)tmp;
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride, int w, int h,
+                              uint64_t *sse, int64_t *sum) {
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  int64_t tsum = 0;
+  uint64_t tsse = 0;
+  for (int i = 0; i < h; ++i) {
+    int32_t lsum = 0;
+    for (int j = 0; j < w; ++j) {
       const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
+      lsum += diff;
+      tsse += (uint32_t)(diff * diff);
     }
+    tsum += lsum;
     a += a_stride;
     b += b_stride;
   }
+  *sum = tsum;
+  *sse = tsse;
 }
 
 uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
@@ -573,65 +706,125 @@ void aom_highbd_var_filter_block2d_bil_second_pass(
                                                dst, dst_stride, sse);        \
   }
 
-#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
-  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred) {                                          \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
-                                                                             \
-    aom_highbd_var_filter_block2d_bil_first_pass(                            \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_highbd_var_filter_block2d_bil_second_pass(                           \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
-                               CONVERT_TO_BYTEPTR(temp2), W);                \
-                                                                             \
-    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
-                                              dst, dst_stride, sse);         \
-  }                                                                          \
-                                                                             \
-  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred) {                                          \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
-                                                                             \
-    aom_highbd_var_filter_block2d_bil_first_pass(                            \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_highbd_var_filter_block2d_bil_second_pass(                           \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
-                               CONVERT_TO_BYTEPTR(temp2), W);                \
-                                                                             \
-    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
-                                               dst, dst_stride, sse);        \
-  }                                                                          \
-                                                                             \
-  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred) {                                          \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
-                                                                             \
-    aom_highbd_var_filter_block2d_bil_first_pass(                            \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_highbd_var_filter_block2d_bil_second_pass(                           \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
-                               CONVERT_TO_BYTEPTR(temp2), W);                \
-                                                                             \
-    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
-                                               dst, dst_stride, sse);        \
+#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                           \
+  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                  \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                      \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
+                                              dst, dst_stride, sse);          \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                 \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                      \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
+                                               dst, dst_stride, sse);         \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                 \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                      \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
+                                               dst, dst_stride, sse);         \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c(              \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H,                    \
+                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);    \
+                                                                              \
+    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
+                                          dst_stride, sse);                   \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c(             \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H,                    \
+                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);    \
+                                                                              \
+    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                           dst_stride, sse);                  \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c(             \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H,                    \
+                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);    \
+                                                                              \
+    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                           dst_stride, sse);                  \
   }
 
 /* All three forms of the variance are available in the same sizes. */
@@ -640,11 +833,9 @@ void aom_highbd_var_filter_block2d_bil_second_pass(
   HIGHBD_SUBPIX_VAR(W, H)      \
   HIGHBD_SUBPIX_AVG_VAR(W, H)
 
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
 HIGHBD_VARIANCES(128, 128)
 HIGHBD_VARIANCES(128, 64)
 HIGHBD_VARIANCES(64, 128)
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 HIGHBD_VARIANCES(64, 64)
 HIGHBD_VARIANCES(64, 32)
 HIGHBD_VARIANCES(32, 64)
@@ -661,19 +852,12 @@ HIGHBD_VARIANCES(4, 4)
 HIGHBD_VARIANCES(4, 2)
 HIGHBD_VARIANCES(2, 4)
 HIGHBD_VARIANCES(2, 2)
-
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
 HIGHBD_VARIANCES(4, 16)
 HIGHBD_VARIANCES(16, 4)
 HIGHBD_VARIANCES(8, 32)
 HIGHBD_VARIANCES(32, 8)
 HIGHBD_VARIANCES(16, 64)
 HIGHBD_VARIANCES(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_VARIANCES(32, 128)
-HIGHBD_VARIANCES(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
 
 HIGHBD_GET_VAR(8)
 HIGHBD_GET_VAR(16)
@@ -700,9 +884,99 @@ void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
   }
 }
 
-void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
+void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
+                                 const struct AV1Common *const cm, int mi_row,
+                                 int mi_col, const MV *const mv,
+                                 uint16_t *comp_pred, int width, int height,
                                  int subpel_x_q3, int subpel_y_q3,
                                  const uint8_t *ref8, int ref_stride, int bd) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in
+      // build_inter_predictors() function, with some small tweaks.
+      uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred);
+
+      // Some assumptions.
+      const int plane = 0;
+
+      // Get pre-requisites.
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int ssx = pd->subsampling_x;
+      const int ssy = pd->subsampling_y;
+      assert(ssx == 0 && ssy == 0);
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+
+      // Calculate subpel_x/y and x/y_step.
+      const int row_start = 0;  // Because ss_y is 0.
+      const int col_start = 0;  // Because ss_x is 0.
+      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+      int orig_pos_y = pre_y << SUBPEL_BITS;
+      orig_pos_y += mv->row * (1 << (1 - ssy));
+      int orig_pos_x = pre_x << SUBPEL_BITS;
+      orig_pos_x += mv->col * (1 << (1 - ssx));
+      int pos_y = sf->scale_value_y(orig_pos_y, sf);
+      int pos_x = sf->scale_value_x(orig_pos_x, sf);
+      pos_x += SCALE_EXTRA_OFF;
+      pos_y += SCALE_EXTRA_OFF;
+
+      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                         << SCALE_SUBPEL_BITS;
+      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+                        << SCALE_SUBPEL_BITS;
+      pos_y = clamp(pos_y, top, bottom);
+      pos_x = clamp(pos_x, left, right);
+
+      const uint8_t *const pre =
+          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+
+      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                           pos_x & SCALE_SUBPEL_MASK,
+                                           pos_y & SCALE_SUBPEL_MASK };
+
+      // Get warp types.
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mi->ref_frame[ref_num]];
+      const int is_global = is_global_mv_block(mi, wm->wmtype);
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global;
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      // Get convolve parameters.
+      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      const InterpFilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Get the inter predictor.
+      const int build_for_obmc = 0;
+      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
+                               &subpel_params, sf, width, height, &conv_params,
+                               filters, &warp_types, mi_x >> pd->subsampling_x,
+                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
+                               build_for_obmc, xd, cm->allow_warped_motion);
+
+      return;
+    }
+  }
+
+  const InterpFilterParams filter =
+      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
   if (!subpel_x_q3 && !subpel_y_q3) {
     const uint16_t *ref;
     int i;
@@ -712,57 +986,48 @@ void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
       comp_pred += width;
       ref += ref_stride;
     }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
+                               width, kernel, 16, NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
+                              width, NULL, -1, kernel, 16, width, height, bd);
   } else {
-    InterpFilterParams filter;
-    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-    if (!subpel_y_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      /*Directly call C version to allow this to work for small (2x2) sizes.*/
-      aom_highbd_convolve8_horiz_c(ref8, ref_stride,
-                                   CONVERT_TO_BYTEPTR(comp_pred), width, kernel,
-                                   16, NULL, -1, width, height, bd);
-    } else if (!subpel_x_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      /*Directly call C version to allow this to work for small (2x2) sizes.*/
-      aom_highbd_convolve8_vert_c(ref8, ref_stride,
-                                  CONVERT_TO_BYTEPTR(comp_pred), width, NULL,
-                                  -1, kernel, 16, width, height, bd);
-    } else {
-      DECLARE_ALIGNED(16, uint16_t,
-                      temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-      const int16_t *kernel_x;
-      const int16_t *kernel_y;
-      int intermediate_height;
-      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      intermediate_height =
-          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
-      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-      /*Directly call C versions to allow this to work for small (2x2) sizes.*/
-      aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter.taps >> 1) - 1),
-                                   ref_stride, CONVERT_TO_BYTEPTR(temp),
-                                   MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                                   intermediate_height, bd);
-      aom_highbd_convolve8_vert_c(
-          CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
-          MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
-          16, width, height, bd);
-    }
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
+                               ref_stride, CONVERT_TO_BYTEPTR(temp),
+                               MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                               intermediate_height, bd);
+    aom_highbd_convolve8_vert(
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+        MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
+        16, width, height, bd);
   }
 }
 
-void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
-                                          const uint8_t *pred8, int width,
-                                          int height, int subpel_x_q3,
-                                          int subpel_y_q3, const uint8_t *ref8,
-                                          int ref_stride, int bd) {
+void aom_highbd_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd) {
   int i, j;
 
   const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
-                            ref8, ref_stride, bd);
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
@@ -771,69 +1036,109 @@ void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
     pred += width;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
-#if CONFIG_AV1
-void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
-                          int height, const uint8_t *ref, int ref_stride,
-                          const uint8_t *mask, int mask_stride,
-                          int invert_mask) {
+void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+                                    int width, int height, const uint8_t *ref8,
+                                    int ref_stride,
+                                    const JNT_COMP_PARAMS *jcp_param) {
   int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
 
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
-      if (!invert_mask)
-        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
-      else
-        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
+      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint16_t)tmp;
     }
     comp_pred += width;
     pred += width;
     ref += ref_stride;
-    mask += mask_stride;
   }
 }
 
-void aom_comp_mask_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
-                                    int width, int height, int subpel_x_q3,
-                                    int subpel_y_q3, const uint8_t *ref,
-                                    int ref_stride, const uint8_t *mask,
-                                    int mask_stride, int invert_mask) {
+void aom_highbd_jnt_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) {
   int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
 
-  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
-                     ref_stride);
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
-      if (!invert_mask)
-        comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]);
-      else
-        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]);
+      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint16_t)tmp;
     }
     comp_pred += width;
     pred += width;
+  }
+}
+
+void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+                          int height, const uint8_t *ref, int ref_stride,
+                          const uint8_t *mask, int mask_stride,
+                          int invert_mask) {
+  int i, j;
+  const uint8_t *src0 = invert_mask ? pred : ref;
+  const uint8_t *src1 = invert_mask ? ref : pred;
+  const int stride0 = invert_mask ? width : ref_stride;
+  const int stride1 = invert_mask ? ref_stride : width;
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
+    }
+    comp_pred += width;
+    src0 += stride0;
+    src1 += stride1;
     mask += mask_stride;
   }
 }
 
-#define MASK_SUBPIX_VAR(W, H)                                                 \
-  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
-      const uint8_t *msk, int msk_stride, int invert_mask,                    \
-      unsigned int *sse) {                                                    \
-    uint16_t fdata3[(H + 1) * W];                                             \
-    uint8_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
-                                                                              \
-    var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W,   \
-                                      bilinear_filters_2t[xoffset]);          \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,             \
-                                       bilinear_filters_2t[yoffset]);         \
-                                                                              \
-    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
-                         invert_mask);                                        \
-    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
+void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                  int mi_row, int mi_col, const MV *const mv,
+                                  uint8_t *comp_pred, const uint8_t *pred,
+                                  int width, int height, int subpel_x_q3,
+                                  int subpel_y_q3, const uint8_t *ref,
+                                  int ref_stride, const uint8_t *mask,
+                                  int mask_stride, int invert_mask) {
+  if (subpel_x_q3 | subpel_y_q3) {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride);
+    ref = comp_pred;
+    ref_stride = width;
+  }
+  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+                     mask_stride, invert_mask);
+}
+
+#define MASK_SUBPIX_VAR(W, H)                                                  \
+  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
+      const uint8_t *msk, int msk_stride, int invert_mask,                     \
+      unsigned int *sse) {                                                     \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint8_t temp2[H * W];                                                      \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
+                                                                               \
+    aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \
+                                            W, bilinear_filters_2t[xoffset]);  \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
+                                             bilinear_filters_2t[yoffset]);    \
+                                                                               \
+    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride,  \
+                         invert_mask);                                         \
+    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);          \
   }
 
 MASK_SUBPIX_VAR(4, 4)
@@ -849,26 +1154,16 @@ MASK_SUBPIX_VAR(32, 32)
 MASK_SUBPIX_VAR(32, 64)
 MASK_SUBPIX_VAR(64, 32)
 MASK_SUBPIX_VAR(64, 64)
-#if CONFIG_EXT_PARTITION
 MASK_SUBPIX_VAR(64, 128)
 MASK_SUBPIX_VAR(128, 64)
 MASK_SUBPIX_VAR(128, 128)
-#endif  // CONFIG_EXT_PARTITION
-
-#if CONFIG_EXT_PARTITION_TYPES
 MASK_SUBPIX_VAR(4, 16)
 MASK_SUBPIX_VAR(16, 4)
 MASK_SUBPIX_VAR(8, 32)
 MASK_SUBPIX_VAR(32, 8)
 MASK_SUBPIX_VAR(16, 64)
 MASK_SUBPIX_VAR(64, 16)
-#if CONFIG_EXT_PARTITION
-MASK_SUBPIX_VAR(32, 128)
-MASK_SUBPIX_VAR(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
                                  int width, int height, const uint8_t *ref8,
                                  int ref_stride, const uint8_t *mask,
@@ -891,14 +1186,17 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
 }
 
 void aom_highbd_comp_mask_upsampled_pred_c(
-    uint16_t *comp_pred, const uint8_t *pred8, int width, int height,
-    int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride,
-    const uint8_t *mask, int mask_stride, int invert_mask, int bd) {
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int bd) {
   int i, j;
 
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
-                            ref8, ref_stride, bd);
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       if (!invert_mask)
@@ -992,28 +1290,16 @@ HIGHBD_MASK_SUBPIX_VAR(32, 32)
 HIGHBD_MASK_SUBPIX_VAR(32, 64)
 HIGHBD_MASK_SUBPIX_VAR(64, 32)
 HIGHBD_MASK_SUBPIX_VAR(64, 64)
-#if CONFIG_EXT_PARTITION
 HIGHBD_MASK_SUBPIX_VAR(64, 128)
 HIGHBD_MASK_SUBPIX_VAR(128, 64)
 HIGHBD_MASK_SUBPIX_VAR(128, 128)
-#endif  // CONFIG_EXT_PARTITION
-
-#if CONFIG_EXT_PARTITION_TYPES
 HIGHBD_MASK_SUBPIX_VAR(4, 16)
 HIGHBD_MASK_SUBPIX_VAR(16, 4)
 HIGHBD_MASK_SUBPIX_VAR(8, 32)
 HIGHBD_MASK_SUBPIX_VAR(32, 8)
 HIGHBD_MASK_SUBPIX_VAR(16, 64)
 HIGHBD_MASK_SUBPIX_VAR(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASK_SUBPIX_VAR(32, 128)
-HIGHBD_MASK_SUBPIX_VAR(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AV1
-
-#if CONFIG_AV1 && CONFIG_MOTION_VAR
+
 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
                                  const int32_t *wsrc, const int32_t *mask,
                                  int w, int h, unsigned int *sse, int *sum) {
@@ -1044,19 +1330,19 @@ static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
     return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
   }
 
-#define OBMC_SUBPIX_VAR(W, H)                                               \
-  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                    \
-      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,         \
-      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {        \
-    uint16_t fdata3[(H + 1) * W];                                           \
-    uint8_t temp2[H * W];                                                   \
-                                                                            \
-    var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \
-                                      bilinear_filters_2t[xoffset]);        \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,           \
-                                       bilinear_filters_2t[yoffset]);       \
-                                                                            \
-    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);       \
+#define OBMC_SUBPIX_VAR(W, H)                                                  \
+  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                       \
+      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
+      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint8_t temp2[H * W];                                                      \
+                                                                               \
+    aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \
+                                            W, bilinear_filters_2t[xoffset]);  \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
+                                             bilinear_filters_2t[yoffset]);    \
+                                                                               \
+    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);          \
   }
 
 OBMC_VAR(4, 4)
@@ -1098,7 +1384,6 @@ OBMC_SUBPIX_VAR(64, 32)
 OBMC_VAR(64, 64)
 OBMC_SUBPIX_VAR(64, 64)
 
-#if CONFIG_EXT_PARTITION
 OBMC_VAR(64, 128)
 OBMC_SUBPIX_VAR(64, 128)
 
@@ -1107,9 +1392,7 @@ OBMC_SUBPIX_VAR(128, 64)
 
 OBMC_VAR(128, 128)
 OBMC_SUBPIX_VAR(128, 128)
-#endif  // CONFIG_EXT_PARTITION
 
-#if CONFIG_EXT_PARTITION_TYPES
 OBMC_VAR(4, 16)
 OBMC_SUBPIX_VAR(4, 16)
 OBMC_VAR(16, 4)
@@ -1122,15 +1405,7 @@ OBMC_VAR(16, 64)
 OBMC_SUBPIX_VAR(16, 64)
 OBMC_VAR(64, 16)
 OBMC_SUBPIX_VAR(64, 16)
-#if CONFIG_EXT_PARTITION
-OBMC_VAR(32, 128)
-OBMC_SUBPIX_VAR(32, 128)
-OBMC_VAR(128, 32)
-OBMC_SUBPIX_VAR(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-
-#if CONFIG_HIGHBITDEPTH
+
 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
@@ -1301,7 +1576,6 @@ HIGHBD_OBMC_SUBPIX_VAR(64, 32)
 HIGHBD_OBMC_VAR(64, 64)
 HIGHBD_OBMC_SUBPIX_VAR(64, 64)
 
-#if CONFIG_EXT_PARTITION
 HIGHBD_OBMC_VAR(64, 128)
 HIGHBD_OBMC_SUBPIX_VAR(64, 128)
 
@@ -1310,9 +1584,7 @@ HIGHBD_OBMC_SUBPIX_VAR(128, 64)
 
 HIGHBD_OBMC_VAR(128, 128)
 HIGHBD_OBMC_SUBPIX_VAR(128, 128)
-#endif  // CONFIG_EXT_PARTITION
 
-#if CONFIG_EXT_PARTITION_TYPES
 HIGHBD_OBMC_VAR(4, 16)
 HIGHBD_OBMC_SUBPIX_VAR(4, 16)
 HIGHBD_OBMC_VAR(16, 4)
@@ -1325,12 +1597,3 @@ HIGHBD_OBMC_VAR(16, 64)
 HIGHBD_OBMC_SUBPIX_VAR(16, 64)
 HIGHBD_OBMC_VAR(64, 16)
 HIGHBD_OBMC_SUBPIX_VAR(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_OBMC_VAR(32, 128)
-HIGHBD_OBMC_SUBPIX_VAR(32, 128)
-HIGHBD_OBMC_VAR(128, 32)
-HIGHBD_OBMC_SUBPIX_VAR(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AV1 && CONFIG_MOTION_VAR
diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h
index a193df467..544dda944 100644
--- a/third_party/aom/aom_dsp/variance.h
+++ b/third_party/aom/aom_dsp/variance.h
@@ -12,7 +12,7 @@
 #ifndef AOM_DSP_VARIANCE_H_
 #define AOM_DSP_VARIANCE_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
 
@@ -33,10 +33,6 @@ typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride,
 typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
                                   int b_stride, int n);
 
-typedef void (*aom_sad_multi_fn_t)(const uint8_t *a, int a_stride,
-                                   const uint8_t *b, int b_stride,
-                                   unsigned int *sad_array);
-
 typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
                                      const uint8_t *const b_array[],
                                      int b_stride, unsigned int *sad_array);
@@ -54,7 +50,16 @@ typedef unsigned int (*aom_subp_avg_variance_fn_t)(
     const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
     int b_stride, unsigned int *sse, const uint8_t *second_pred);
 
-#if CONFIG_AV1
+typedef unsigned int (*aom_jnt_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+                                             const uint8_t *b, int b_stride,
+                                             const uint8_t *second_pred,
+                                             const JNT_COMP_PARAMS *jcp_param);
+
+typedef unsigned int (*aom_jnt_subp_avg_variance_fn_t)(
+    const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+    int b_stride, unsigned int *sse, const uint8_t *second_pred,
+    const JNT_COMP_PARAMS *jcp_param);
+
 typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride,
                                             const uint8_t *second_pred,
@@ -64,9 +69,13 @@ typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
     const uint8_t *src, int src_stride, int xoffset, int yoffset,
     const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
     const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#endif  // CONFIG_AV1
 
-#if CONFIG_AV1 && CONFIG_MOTION_VAR
+void aom_comp_mask_upsampled_pred(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+
 typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
                                           const int32_t *wsrc,
                                           const int32_t *msk);
@@ -78,27 +87,22 @@ typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred,
 typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
     const uint8_t *pred, int pred_stride, int xoffset, int yoffset,
     const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
-#endif  // CONFIG_AV1 && CONFIG_MOTION_VAR
 
-#if CONFIG_AV1
 typedef struct aom_variance_vtable {
   aom_sad_fn_t sdf;
   aom_sad_avg_fn_t sdaf;
   aom_variance_fn_t vf;
   aom_subpixvariance_fn_t svf;
   aom_subp_avg_variance_fn_t svaf;
-  aom_sad_multi_fn_t sdx3f;
-  aom_sad_multi_fn_t sdx8f;
   aom_sad_multi_d_fn_t sdx4df;
   aom_masked_sad_fn_t msdf;
   aom_masked_subpixvariance_fn_t msvf;
-#if CONFIG_MOTION_VAR
   aom_obmc_sad_fn_t osdf;
   aom_obmc_variance_fn_t ovf;
   aom_obmc_subpixvariance_fn_t osvf;
-#endif  // CONFIG_MOTION_VAR
+  aom_jnt_sad_avg_fn_t jsdaf;
+  aom_jnt_subp_avg_variance_fn_t jsvaf;
 } aom_variance_fn_ptr_t;
-#endif  // CONFIG_AV1
 
 void aom_highbd_var_filter_block2d_bil_first_pass(
     const uint8_t *src_ptr8, uint16_t *output_ptr,
@@ -115,10 +119,8 @@ void aom_highbd_var_filter_block2d_bil_second_pass(
 uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                           int b_stride, int w, int h);
 
-#if CONFIG_HIGHBITDEPTH
 uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride, int w, int h);
-#endif  // CONFIG_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
index 4067b0b53..401fbdc48 100644
--- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -9,8 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/convolve.h"
 
 #if HAVE_SSE2
@@ -20,12 +21,6 @@ filter8_1dfunction aom_filter_block1d8_v8_sse2;
 filter8_1dfunction aom_filter_block1d8_h8_sse2;
 filter8_1dfunction aom_filter_block1d4_v8_sse2;
 filter8_1dfunction aom_filter_block1d4_h8_sse2;
-filter8_1dfunction aom_filter_block1d16_v8_avg_sse2;
-filter8_1dfunction aom_filter_block1d16_h8_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_v8_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_h8_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_v8_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_h8_avg_sse2;
 
 filter8_1dfunction aom_filter_block1d16_v2_sse2;
 filter8_1dfunction aom_filter_block1d16_h2_sse2;
@@ -33,12 +28,6 @@ filter8_1dfunction aom_filter_block1d8_v2_sse2;
 filter8_1dfunction aom_filter_block1d8_h2_sse2;
 filter8_1dfunction aom_filter_block1d4_v2_sse2;
 filter8_1dfunction aom_filter_block1d4_h2_sse2;
-filter8_1dfunction aom_filter_block1d16_v2_avg_sse2;
-filter8_1dfunction aom_filter_block1d16_h2_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_v2_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_h2_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_v2_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_h2_avg_sse2;
 
 // void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                               uint8_t *dst, ptrdiff_t dst_stride,
@@ -50,47 +39,16 @@ filter8_1dfunction aom_filter_block1d4_h2_avg_sse2;
 //                              const int16_t *filter_x, int x_step_q4,
 //                              const int16_t *filter_y, int y_step_q4,
 //                              int w, int h);
-// void aom_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
-// void aom_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                  uint8_t *dst, ptrdiff_t dst_stride,
-//                                  const int16_t *filter_x, int x_step_q4,
-//                                  const int16_t *filter_y, int y_step_q4,
-//                                  int w, int h);
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
-
-// void aom_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                         uint8_t *dst, ptrdiff_t dst_stride,
-//                         const int16_t *filter_x, int x_step_q4,
-//                         const int16_t *filter_y, int y_step_q4,
-//                         int w, int h);
-// void aom_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                             uint8_t *dst, ptrdiff_t dst_stride,
-//                             const int16_t *filter_x, int x_step_q4,
-//                             const int16_t *filter_y, int y_step_q4,
-//                             int w, int h);
-FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_, sse2);
 
-#if CONFIG_HIGHBITDEPTH && ARCH_X86_64
+#if ARCH_X86_64
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_avg_sse2;
 
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
@@ -98,12 +56,6 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2;
 
 // void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
 //                                      ptrdiff_t src_stride,
@@ -123,60 +75,8 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2;
 //                                     const int16_t *filter_y,
 //                                     int y_step_q4,
 //                                     int w, int h, int bd);
-// void aom_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
-//                                          ptrdiff_t src_stride,
-//                                          uint8_t *dst,
-//                                          ptrdiff_t dst_stride,
-//                                          const int16_t *filter_x,
-//                                          int x_step_q4,
-//                                          const int16_t *filter_y,
-//                                          int y_step_q4,
-//                                          int w, int h, int bd);
-// void aom_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
-//                                         ptrdiff_t src_stride,
-//                                         uint8_t *dst,
-//                                         ptrdiff_t dst_stride,
-//                                         const int16_t *filter_x,
-//                                         int x_step_q4,
-//                                         const int16_t *filter_y,
-//                                         int y_step_q4,
-//                                         int w, int h, int bd);
 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-                 sse2);
-
-// void aom_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h, int bd);
-// void aom_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2);
-HIGH_FUN_CONV_2D(avg_, sse2);
 
-#if CONFIG_LOOP_RESTORATION
-// The SSE2 highbd convolve functions can deal with coefficients up to 32767.
-// So redirect highbd_convolve8_add_src to regular highbd_convolve8.
-void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint8_t *dst, ptrdiff_t dst_stride,
-                                       const int16_t *filter_x, int x_step_q4,
-                                       const int16_t *filter_y, int y_step_q4,
-                                       int w, int h, int bd) {
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-  ((int16_t *)filter_x)[3] += 128;
-  ((int16_t *)filter_y)[3] += 128;
-  aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h, bd);
-  ((int16_t *)filter_x)[3] -= 128;
-  ((int16_t *)filter_y)[3] -= 128;
-}
-#endif  // CONFIG_LOOP_RESTORATION
-#endif  // CONFIG_HIGHBITDEPTH && ARCH_X86_64
+#endif  // ARCH_X86_64
 #endif  // HAVE_SSE2
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
index 4d3142867..7283c32b8 100644
--- a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
@@ -50,7 +50,6 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   cmp r4d, 32
   je .w32
 
-%if CONFIG_AV1 && CONFIG_EXT_PARTITION
   cmp r4d, 64
   je .w64
 %ifidn %2, highbd
@@ -160,50 +159,6 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   jnz .loop128
   RET
 
-%else  ; CONFIG_AV1 && CONFIG_EXT_PARTITION
-
-%ifidn %2, highbd
-  cmp r4d, 64
-  je .w64
-
-  mov                    r4d, dword hm
-.loop128:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  movu                    m0, [srcq+64]
-  movu                    m1, [srcq+80]
-  movu                    m2, [srcq+96]
-  movu                    m3, [srcq+112]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq+64]
-  pavg                    m1, [dstq+80]
-  pavg                    m2, [dstq+96]
-  pavg                    m3, [dstq+112]
-%endif
-  mova             [dstq+64], m0
-  mova             [dstq+80], m1
-  mova             [dstq+96], m2
-  mova            [dstq+112], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop128
-  RET
-%endif
-%endif  ; CONFIG_AV1 && CONFIG_EXT_PARTITION
-
 .w64:
   mov                    r4d, dword hm
 .loop64:
@@ -339,7 +294,4 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
 INIT_XMM sse2
 convolve_fn copy
 convolve_fn avg
-%if CONFIG_HIGHBITDEPTH
 convolve_fn copy, highbd
-convolve_fn avg, highbd
-%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
index e6d357ba3..b6f040791 100644
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -200,6 +200,8 @@
     movdqu      [rdi + %2], xmm0
 %endm
 
+SECTION .text
+
 ;void aom_filter_block1d4_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
@@ -392,169 +394,6 @@ sym(aom_highbd_filter_block1d16_v8_sse2):
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 7
-    %define k0k6 [rsp + 16 * 0]
-    %define k2k5 [rsp + 16 * 1]
-    %define k3k4 [rsp + 16 * 2]
-    %define k1k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define max [rsp + 16 * 5]
-    %define min [rsp + 16 * 6]
-
-    HIGH_GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movq        xmm0, [rsi]                 ;load src: row 0
-    movq        xmm1, [rsi + rax]           ;1
-    movq        xmm6, [rsi + rdx * 2]       ;6
-    lea         rsi,  [rsi + rax]
-    movq        xmm7, [rsi + rdx * 2]       ;7
-    movq        xmm2, [rsi + rax]           ;2
-    movq        xmm3, [rsi + rax * 2]       ;3
-    movq        xmm4, [rsi + rdx]           ;4
-    movq        xmm5, [rsi + rax * 4]       ;5
-
-    HIGH_APPLY_FILTER_4 1
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 7
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    HIGH_APPLY_FILTER_8 1, 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    HIGH_APPLY_FILTER_8 1, 0
-    sub         rsi, rax
-
-    LOAD_VERT_8 16
-    HIGH_APPLY_FILTER_8 1, 16
-    add         rdi, rbx
-
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 ;void aom_filter_block1d4_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
@@ -772,194 +611,3 @@ sym(aom_highbd_filter_block1d16_h8_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 7
-    %define k0k6 [rsp + 16 * 0]
-    %define k2k5 [rsp + 16 * 1]
-    %define k3k4 [rsp + 16 * 2]
-    %define k1k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define max [rsp + 16 * 5]
-    %define min [rsp + 16 * 6]
-
-    HIGH_GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm4,   [rsi + 2]
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm4
-    movdqa      xmm7, xmm4
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm5, xmm4
-
-    psrldq      xmm1, 2
-    psrldq      xmm6, 4
-    psrldq      xmm7, 6
-    psrldq      xmm2, 4
-    psrldq      xmm3, 6
-    psrldq      xmm5, 2
-
-    HIGH_APPLY_FILTER_4 1
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 7
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm1,   [rsi - 4]
-    movdqu      xmm2,   [rsi - 2]
-    movdqu      xmm3,   [rsi]
-    movdqu      xmm4,   [rsi + 2]
-    movdqu      xmm5,   [rsi + 4]
-    movdqu      xmm6,   [rsi + 6]
-    movdqu      xmm7,   [rsi + 8]
-
-    HIGH_APPLY_FILTER_8 1, 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm1,   [rsi - 4]
-    movdqu      xmm2,   [rsi - 2]
-    movdqu      xmm3,   [rsi]
-    movdqu      xmm4,   [rsi + 2]
-    movdqu      xmm5,   [rsi + 4]
-    movdqu      xmm6,   [rsi + 6]
-    movdqu      xmm7,   [rsi + 8]
-
-    HIGH_APPLY_FILTER_8 1, 0
-
-    movdqu      xmm0,   [rsi + 10]           ;load src
-    movdqu      xmm1,   [rsi + 12]
-    movdqu      xmm2,   [rsi + 14]
-    movdqu      xmm3,   [rsi + 16]
-    movdqu      xmm4,   [rsi + 18]
-    movdqu      xmm5,   [rsi + 20]
-    movdqu      xmm6,   [rsi + 22]
-    movdqu      xmm7,   [rsi + 24]
-
-    HIGH_APPLY_FILTER_8 1, 16
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
index 9e2ec748c..7b3fe6419 100644
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -174,6 +174,8 @@
 %endm
 %endif
 
+SECTION .text
+
 global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
 sym(aom_highbd_filter_block1d4_v2_sse2):
     push        rbp
@@ -254,86 +256,6 @@ sym(aom_highbd_filter_block1d16_v2_sse2):
     ret
 %endif
 
-global sym(aom_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM_4
-.loop:
-    movq        xmm0, [rsi]                 ;load src
-    movq        xmm1, [rsi + 2*rax]
-
-    HIGH_APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 8
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;0
-    movdqu      xmm1, [rsi + 2*rax]         ;1
-
-    HIGH_APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 9
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + 2*rax]       ;1
-    movdqu        xmm2, [rsi + 16]
-    movdqu        xmm3, [rsi + 2*rax + 16]
-
-    HIGH_APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endif
-
 global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
 sym(aom_highbd_filter_block1d4_h2_sse2):
     push        rbp
@@ -414,84 +336,3 @@ sym(aom_highbd_filter_block1d16_h2_sse2):
     pop         rbp
     ret
 %endif
-
-global sym(aom_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 2
-
-    HIGH_APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 8
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqu      xmm1, [rsi + 2]
-
-    HIGH_APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 9
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 2]
-    movdqu      xmm2,   [rsi + 16]
-    movdqu      xmm3,   [rsi + 18]
-
-    HIGH_APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index 61476b8be..af45a03ac 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -11,31 +11,12 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_ports/mem.h"
 
-// filters for 16_h8 and 16_v8
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
 #if defined(__clang__)
 #if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
     (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
@@ -566,10 +547,4 @@ filter8_1dfunction aom_filter_block1d4_h2_ssse3;
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
 
-// void aom_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
-//                          int w, int h);
-FUN_CONV_2D(, avx2);
 #endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index be37738df..6bcb4a512 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -11,7 +11,8 @@
 
 #include <tmmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve.h"
 #include "aom_mem/aom_mem.h"
@@ -285,20 +286,6 @@ filter8_1dfunction aom_filter_block1d8_v8_ssse3;
 filter8_1dfunction aom_filter_block1d8_h8_ssse3;
 filter8_1dfunction aom_filter_block1d4_v8_ssse3;
 filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-filter8_1dfunction aom_filter_block1d16_v8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3;
-#if CONFIG_LOOP_RESTORATION
-filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
-#endif
 
 filter8_1dfunction aom_filter_block1d16_v2_ssse3;
 filter8_1dfunction aom_filter_block1d16_h2_ssse3;
@@ -306,12 +293,6 @@ filter8_1dfunction aom_filter_block1d8_v2_ssse3;
 filter8_1dfunction aom_filter_block1d8_h2_ssse3;
 filter8_1dfunction aom_filter_block1d4_v2_ssse3;
 filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-filter8_1dfunction aom_filter_block1d16_v2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d16_h2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_h2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3;
 
 // void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
@@ -323,598 +304,5 @@ filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3;
 //                               const int16_t *filter_x, int x_step_q4,
 //                               const int16_t *filter_y, int y_step_q4,
 //                               int w, int h);
-// void aom_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h);
-// void aom_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-            ssse3);
-
-#if CONFIG_LOOP_RESTORATION
-FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
-                        ssse3);
-FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
-                        src - src_stride * 3, add_src_, ssse3);
-#endif
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7)                 \
-  {                                                                       \
-    const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
-    const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
-    const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
-    const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
-                                                                          \
-    const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
-    const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
-    const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
-    const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
-                                                                          \
-    const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
-    const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
-    const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
-    const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
-                                                                          \
-    out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
-    out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
-    out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
-    out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
-    out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
-    out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
-    out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
-    out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
-  }
-
-static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *x_filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
-  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
-  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
-  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
-  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
-  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
-  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
-  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
-  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
-  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
-  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
-  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
-  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
-  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
-  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
-  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
-  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
-  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
-  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
-  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 8 bytes convolve result
-  _mm_storel_epi64((__m128i *)dst, temp);
-}
-
-static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride) {
-  __m128i A, B, C, D, E, F, G, H;
-
-  A = _mm_loadl_epi64((const __m128i *)src);
-  B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
-  C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
-  D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
-  E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
-  F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
-  G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
-  H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
-
-  TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);
-
-  _mm_storel_epi64((__m128i *)dst, A);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
-}
-
-static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *x_filters, int x0_q4,
-                                    int x_step_q4, int w, int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
-  int x, y, z;
-  src -= SUBPEL_TAPS / 2 - 1;
-
-  // This function processes 8x8 areas.  The intermediate height is not always
-  // a multiple of 8, so force it to be a multiple of 8 here.
-  y = h + (8 - (h & 0x7));
-
-  do {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; x += 8) {
-      // process 8 src_x steps
-      for (z = 0; z < 8; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-        if (x_q4 & SUBPEL_MASK) {
-          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
-        } else {
-          int i;
-          for (i = 0; i < 8; ++i) {
-            temp[z * 8 + i] = src_x[i * src_stride + 3];
-          }
-        }
-        x_q4 += x_step_q4;
-      }
-
-      // transpose the 8x8 filters values back to dst
-      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
-    }
-
-    src += src_stride * 8;
-    dst += dst_stride * 8;
-  } while (y -= 8);
-}
-
-static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  // TRANSPOSE...
-  // 00 01 02 03 04 05 06 07
-  // 10 11 12 13 14 15 16 17
-  // 20 21 22 23 24 25 26 27
-  // 30 31 32 33 34 35 36 37
-  //
-  // TO
-  //
-  // 00 10 20 30
-  // 01 11 21 31
-  // 02 12 22 32
-  // 03 13 23 33
-  // 04 14 24 34
-  // 05 15 25 35
-  // 06 16 26 36
-  // 07 17 27 37
-  //
-  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
-  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
-  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
-  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
-  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
-  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
-  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  // 02 03 12 13 22 23 32 33
-  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
-  // 06 07 16 17 26 27 36 37
-  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 4 bytes
-  *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride) {
-  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
-  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
-  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
-  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
-  // 00 10 01 11 02 12 03 13
-  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
-  // 20 30 21 31 22 32 23 33
-  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
-  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  B = _mm_srli_si128(A, 4);
-  C = _mm_srli_si128(A, 8);
-  D = _mm_srli_si128(A, 12);
-
-  *(int *)(dst) = _mm_cvtsi128_si32(A);
-  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
-  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
-  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
-}
-
-static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *x_filters, int x0_q4,
-                                    int x_step_q4, int w, int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
-  int x, y, z;
-  src -= SUBPEL_TAPS / 2 - 1;
-
-  for (y = 0; y < h; y += 4) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; x += 4) {
-      // process 4 src_x steps
-      for (z = 0; z < 4; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-        if (x_q4 & SUBPEL_MASK) {
-          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
-        } else {
-          int i;
-          for (i = 0; i < 4; ++i) {
-            temp[z * 4 + i] = src_x[i * src_stride + 3];
-          }
-        }
-        x_q4 += x_step_q4;
-      }
-
-      // transpose the 4x4 filters values back to dst
-      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
-    }
-
-    src += src_stride * 4;
-    dst += dst_stride * 4;
-  }
-}
-
-static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
-  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
-  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
-  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
-  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
-  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
-  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
-  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
-  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
-  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
-  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 4 bytes
-  *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *y_filters, int y0_q4,
-                                   int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-
-    y_q4 += y_step_q4;
-  }
-}
-
-static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
-  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
-  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
-  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 8 bytes convolve result
-  _mm_storel_epi64((__m128i *)dst, temp);
-}
-
-static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *y_filters, int y0_q4,
-                                   int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-    y_q4 += y_step_q4;
-  }
-}
-
-static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *filter, int w) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  int i;
-
-  for (i = 0; i < w; i += 16) {
-    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
-    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
-    const __m128i C =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-    const __m128i D =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-    const __m128i E =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-    const __m128i F =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-    const __m128i G =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-    const __m128i H =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-    // merge the result together
-    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
-    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
-    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
-    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
-    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
-    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
-    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
-    // add and saturate the results together
-    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
-    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
-    // merge the result together
-    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
-    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
-    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
-    // merge the result together
-    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
-    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
-    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
-    // add and saturate the results together
-    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
-    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
-
-    // add and saturate the results together
-    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
-    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
-    // round and shift by 7 bit each 16 bit
-    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
-    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
-    src_ptr += 16;
-    // save 16 bytes convolve result
-    _mm_store_si128((__m128i *)&dst[i], temp_hi);
-  }
-}
-
-static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *y_filters, int y0_q4,
-                                    int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
-                            w);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-    y_q4 += y_step_q4;
-  }
-}
-
-static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_q4,
-                             int x_step_q4, const InterpKernel *const y_filters,
-                             int y0_q4, int y_step_q4, int w, int h) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  // --Require an additional 8 rows for the horiz_w8 transpose tail.
-  DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]);
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  if (w >= 8) {
-    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
-                            x_step_q4, w, intermediate_height);
-  } else {
-    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
-                            x_step_q4, w, intermediate_height);
-  }
-
-  if (w >= 16) {
-    scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                            y_step_q4, w, h);
-  } else if (w == 8) {
-    scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                           MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                           y_step_q4, w, h);
-  } else {
-    scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                           MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                           y_step_q4, w, h);
-  }
-}
-
-static const InterpKernel *get_filter_base(const int16_t *filter) {
-  // NOTE: This assumes that the filter table is 256-byte aligned.
-  // TODO(agrange) Modify to make independent of table alignment.
-  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
-  return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
-void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
-// void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
-//                          int w, int h);
-// void aom_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_, ssse3);
-#if CONFIG_LOOP_RESTORATION
-FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3);
-#endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
index b946010d3..c88fc9ffb 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -179,6 +179,8 @@
     movq        [rdi + %2], xmm0
 %endm
 
+SECTION .text
+
 ;void aom_filter_block1d4_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
@@ -368,166 +370,6 @@ sym(aom_filter_block1d16_v8_sse2):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movd        xmm0, [rsi]                 ;load src: row 0
-    movd        xmm1, [rsi + rax]           ;1
-    movd        xmm6, [rsi + rdx * 2]       ;6
-    lea         rsi,  [rsi + rax]
-    movd        xmm7, [rsi + rdx * 2]       ;7
-    movd        xmm2, [rsi + rax]           ;2
-    movd        xmm3, [rsi + rax * 2]       ;3
-    movd        xmm4, [rsi + rdx]           ;4
-    movd        xmm5, [rsi + rax * 4]       ;5
-
-    APPLY_FILTER_4 1
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 1, 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 1, 0
-    sub         rsi, rax
-
-    LOAD_VERT_8 8
-    APPLY_FILTER_8 1, 8
-    add         rdi, rbx
-
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 ;void aom_filter_block1d4_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
@@ -771,220 +613,3 @@ sym(aom_filter_block1d16_h8_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm3, 3
-    psrldq      xmm5, 5
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_4 1
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 1, 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 1, 0
-
-    movdqu      xmm0,   [rsi + 5]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 1, 8
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
index 8688fb544..3ca7921b6 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -375,17 +375,8 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
 
 INIT_XMM ssse3
 SUBPIX_HFILTER16 h8
-SUBPIX_HFILTER16 h8_avg
 SUBPIX_HFILTER8  h8
-SUBPIX_HFILTER8  h8_avg
 SUBPIX_HFILTER4  h8
-SUBPIX_HFILTER4  h8_avg
-
-%if CONFIG_LOOP_RESTORATION
-SUBPIX_HFILTER16 h8_add_src
-SUBPIX_HFILTER8  h8_add_src
-SUBPIX_HFILTER4  h8_add_src
-%endif
 
 ;-------------------------------------------------------------------------------
 
@@ -875,15 +866,5 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
 
 INIT_XMM ssse3
 SUBPIX_VFILTER16     v8
-SUBPIX_VFILTER16 v8_avg
 SUBPIX_VFILTER       v8, 8
-SUBPIX_VFILTER   v8_avg, 8
 SUBPIX_VFILTER       v8, 4
-SUBPIX_VFILTER   v8_avg, 4
-
-%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \
-    CONFIG_LOOP_RESTORATION
-SUBPIX_VFILTER16 v8_add_src
-SUBPIX_VFILTER   v8_add_src, 8
-SUBPIX_VFILTER   v8_add_src, 4
-%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
index 8f025a8be..d0b4b2839 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
@@ -134,6 +134,8 @@
     dec         rcx
 %endm
 
+SECTION .text
+
 global sym(aom_filter_block1d4_v2_sse2) PRIVATE
 sym(aom_filter_block1d4_v2_sse2):
     push        rbp
@@ -212,84 +214,6 @@ sym(aom_filter_block1d16_v2_sse2):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + rax]         ;1
-    movdqa        xmm2, xmm0
-    movdqa        xmm3, xmm1
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 global sym(aom_filter_block1d4_h2_sse2) PRIVATE
 sym(aom_filter_block1d4_h2_sse2):
     push        rbp
@@ -369,83 +293,3 @@ sym(aom_filter_block1d16_h2_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 1]
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
index b9b2da0be..59edc49a9 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
@@ -108,6 +108,8 @@
     dec         rcx
 %endm
 
+SECTION .text
+
 global sym(aom_filter_block1d4_v2_ssse3) PRIVATE
 sym(aom_filter_block1d4_v2_ssse3):
     push        rbp
@@ -185,83 +187,6 @@ sym(aom_filter_block1d16_v2_ssse3):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_v2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d4_v2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d8_v2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d16_v2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + rax]         ;1
-    movdqa        xmm2, xmm0
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 global sym(aom_filter_block1d4_h2_ssse3) PRIVATE
 sym(aom_filter_block1d4_h2_ssse3):
     push        rbp
@@ -340,82 +265,3 @@ sym(aom_filter_block1d16_h2_ssse3):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_filter_block1d4_h2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d4_h2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d8_h2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d16_h2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 1]
-    movdqa      xmm2, xmm0
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
deleted file mode 100644
index 1a6457402..000000000
--- a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom_dsp/x86/synonyms.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_ports/mem.h"
-
-void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
-                         int *min, int *max) {
-  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
-  u0 = _mm_setzero_si128();
-  // Row 0
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff0 = _mm_max_epi16(diff, negdiff);
-  // Row 1
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
-  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
-  // Row 2
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 3
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 4
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 5
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 6
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 7
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
-  *max = _mm_extract_epi16(maxabsdiff, 0);
-
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
-  *min = _mm_extract_epi16(minabsdiff, 0);
-}
-
-static void hadamard_col8_sse2(__m128i *in, int iter) {
-  __m128i a0 = in[0];
-  __m128i a1 = in[1];
-  __m128i a2 = in[2];
-  __m128i a3 = in[3];
-  __m128i a4 = in[4];
-  __m128i a5 = in[5];
-  __m128i a6 = in[6];
-  __m128i a7 = in[7];
-
-  __m128i b0 = _mm_add_epi16(a0, a1);
-  __m128i b1 = _mm_sub_epi16(a0, a1);
-  __m128i b2 = _mm_add_epi16(a2, a3);
-  __m128i b3 = _mm_sub_epi16(a2, a3);
-  __m128i b4 = _mm_add_epi16(a4, a5);
-  __m128i b5 = _mm_sub_epi16(a4, a5);
-  __m128i b6 = _mm_add_epi16(a6, a7);
-  __m128i b7 = _mm_sub_epi16(a6, a7);
-
-  a0 = _mm_add_epi16(b0, b2);
-  a1 = _mm_add_epi16(b1, b3);
-  a2 = _mm_sub_epi16(b0, b2);
-  a3 = _mm_sub_epi16(b1, b3);
-  a4 = _mm_add_epi16(b4, b6);
-  a5 = _mm_add_epi16(b5, b7);
-  a6 = _mm_sub_epi16(b4, b6);
-  a7 = _mm_sub_epi16(b5, b7);
-
-  if (iter == 0) {
-    b0 = _mm_add_epi16(a0, a4);
-    b7 = _mm_add_epi16(a1, a5);
-    b3 = _mm_add_epi16(a2, a6);
-    b4 = _mm_add_epi16(a3, a7);
-    b2 = _mm_sub_epi16(a0, a4);
-    b6 = _mm_sub_epi16(a1, a5);
-    b1 = _mm_sub_epi16(a2, a6);
-    b5 = _mm_sub_epi16(a3, a7);
-
-    a0 = _mm_unpacklo_epi16(b0, b1);
-    a1 = _mm_unpacklo_epi16(b2, b3);
-    a2 = _mm_unpackhi_epi16(b0, b1);
-    a3 = _mm_unpackhi_epi16(b2, b3);
-    a4 = _mm_unpacklo_epi16(b4, b5);
-    a5 = _mm_unpacklo_epi16(b6, b7);
-    a6 = _mm_unpackhi_epi16(b4, b5);
-    a7 = _mm_unpackhi_epi16(b6, b7);
-
-    b0 = _mm_unpacklo_epi32(a0, a1);
-    b1 = _mm_unpacklo_epi32(a4, a5);
-    b2 = _mm_unpackhi_epi32(a0, a1);
-    b3 = _mm_unpackhi_epi32(a4, a5);
-    b4 = _mm_unpacklo_epi32(a2, a3);
-    b5 = _mm_unpacklo_epi32(a6, a7);
-    b6 = _mm_unpackhi_epi32(a2, a3);
-    b7 = _mm_unpackhi_epi32(a6, a7);
-
-    in[0] = _mm_unpacklo_epi64(b0, b1);
-    in[1] = _mm_unpackhi_epi64(b0, b1);
-    in[2] = _mm_unpacklo_epi64(b2, b3);
-    in[3] = _mm_unpackhi_epi64(b2, b3);
-    in[4] = _mm_unpacklo_epi64(b4, b5);
-    in[5] = _mm_unpackhi_epi64(b4, b5);
-    in[6] = _mm_unpacklo_epi64(b6, b7);
-    in[7] = _mm_unpackhi_epi64(b6, b7);
-  } else {
-    in[0] = _mm_add_epi16(a0, a4);
-    in[7] = _mm_add_epi16(a1, a5);
-    in[3] = _mm_add_epi16(a2, a6);
-    in[4] = _mm_add_epi16(a3, a7);
-    in[2] = _mm_sub_epi16(a0, a4);
-    in[6] = _mm_sub_epi16(a1, a5);
-    in[1] = _mm_sub_epi16(a2, a6);
-    in[5] = _mm_sub_epi16(a3, a7);
-  }
-}
-
-void aom_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
-                           int16_t *coeff) {
-  __m128i src[8];
-  src[0] = _mm_load_si128((const __m128i *)src_diff);
-  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-
-  hadamard_col8_sse2(src, 0);
-  hadamard_col8_sse2(src, 1);
-
-  _mm_store_si128((__m128i *)coeff, src[0]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[1]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[2]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[3]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[4]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[5]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[6]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[7]);
-}
-
-void aom_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
-                             int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    int16_t const *src_ptr =
-        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    aom_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  for (idx = 0; idx < 64; idx += 8) {
-    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
-    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
-    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
-    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
-
-    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
-    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
-    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
-    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
-
-    b0 = _mm_srai_epi16(b0, 1);
-    b1 = _mm_srai_epi16(b1, 1);
-    b2 = _mm_srai_epi16(b2, 1);
-    b3 = _mm_srai_epi16(b3, 1);
-
-    coeff0 = _mm_add_epi16(b0, b2);
-    coeff1 = _mm_add_epi16(b1, b3);
-    _mm_store_si128((__m128i *)coeff, coeff0);
-    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
-
-    coeff2 = _mm_sub_epi16(b0, b2);
-    coeff3 = _mm_sub_epi16(b1, b3);
-    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
-    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
-
-    coeff += 8;
-  }
-}
-
-int aom_satd_sse2(const int16_t *coeff, int length) {
-  int i;
-  const __m128i zero = _mm_setzero_si128();
-  __m128i accum = zero;
-
-  for (i = 0; i < length; i += 8) {
-    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
-    const __m128i inv = _mm_sub_epi16(zero, src_line);
-    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
-    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
-    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
-    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
-    accum = _mm_add_epi32(accum, sum);
-    coeff += 8;
-  }
-
-  {  // cascading summation of accum
-    __m128i hi = _mm_srli_si128(accum, 8);
-    accum = _mm_add_epi32(accum, hi);
-    hi = _mm_srli_epi64(accum, 32);
-    accum = _mm_add_epi32(accum, hi);
-  }
-
-  return _mm_cvtsi128_si32(accum);
-}
-
-void aom_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, int ref_stride,
-                          int height) {
-  int idx;
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
-  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
-  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
-  __m128i t0, t1;
-  int height_1 = height - 1;
-  ref += ref_stride;
-
-  for (idx = 1; idx < height_1; idx += 2) {
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-  }
-
-  src_line = _mm_loadu_si128((const __m128i *)ref);
-  t0 = _mm_unpacklo_epi8(src_line, zero);
-  t1 = _mm_unpackhi_epi8(src_line, zero);
-  s0 = _mm_adds_epu16(s0, t0);
-  s1 = _mm_adds_epu16(s1, t1);
-
-  if (height == 64) {
-    s0 = _mm_srai_epi16(s0, 5);
-    s1 = _mm_srai_epi16(s1, 5);
-  } else if (height == 32) {
-    s0 = _mm_srai_epi16(s0, 4);
-    s1 = _mm_srai_epi16(s1, 4);
-  } else {
-    s0 = _mm_srai_epi16(s0, 3);
-    s1 = _mm_srai_epi16(s1, 3);
-  }
-
-  _mm_storeu_si128((__m128i *)hbuf, s0);
-  hbuf += 8;
-  _mm_storeu_si128((__m128i *)hbuf, s1);
-}
-
-int16_t aom_int_pro_col_sse2(uint8_t const *ref, int width) {
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
-  __m128i s0 = _mm_sad_epu8(src_line, zero);
-  __m128i s1;
-  int i;
-
-  for (i = 16; i < width; i += 16) {
-    ref += 16;
-    src_line = _mm_load_si128((const __m128i *)ref);
-    s1 = _mm_sad_epu8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, s1);
-  }
-
-  s1 = _mm_srli_si128(s0, 8);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  return _mm_extract_epi16(s0, 0);
-}
-
-int aom_vector_var_sse2(int16_t const *ref, int16_t const *src, int bwl) {
-  int idx;
-  int width = 4 << bwl;
-  int16_t mean;
-  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
-  __m128i v1 = _mm_load_si128((const __m128i *)src);
-  __m128i diff = _mm_subs_epi16(v0, v1);
-  __m128i sum = diff;
-  __m128i sse = _mm_madd_epi16(diff, diff);
-
-  ref += 8;
-  src += 8;
-
-  for (idx = 8; idx < width; idx += 8) {
-    v0 = _mm_loadu_si128((const __m128i *)ref);
-    v1 = _mm_load_si128((const __m128i *)src);
-    diff = _mm_subs_epi16(v0, v1);
-
-    sum = _mm_add_epi16(sum, diff);
-    v0 = _mm_madd_epi16(diff, diff);
-    sse = _mm_add_epi32(sse, v0);
-
-    ref += 8;
-    src += 8;
-  }
-
-  v0 = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi16(sum, v0);
-  v0 = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi16(sum, v0);
-  v0 = _mm_srli_epi32(sum, 16);
-  sum = _mm_add_epi16(sum, v0);
-
-  v1 = _mm_srli_si128(sse, 8);
-  sse = _mm_add_epi32(sse, v1);
-  v1 = _mm_srli_epi64(sse, 32);
-  sse = _mm_add_epi32(sse, v1);
-
-  mean = _mm_extract_epi16(sum, 0);
-
-  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
-}
diff --git a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm
deleted file mode 100644
index b2d150296..000000000
--- a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm
+++ /dev/null
@@ -1,124 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%define private_prefix aom
-
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the hadamard transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
-SECTION .text
-
-%if ARCH_X86_64
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-%macro HMD8_1D 0
-  psubw              m8, m0, m1
-  psubw              m9, m2, m3
-  paddw              m0, m1
-  paddw              m2, m3
-  SWAP               1, 8
-  SWAP               3, 9
-  psubw              m8, m4, m5
-  psubw              m9, m6, m7
-  paddw              m4, m5
-  paddw              m6, m7
-  SWAP               5, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m2
-  psubw              m9, m1, m3
-  paddw              m0, m2
-  paddw              m1, m3
-  SWAP               2, 8
-  SWAP               3, 9
-  psubw              m8, m4, m6
-  psubw              m9, m5, m7
-  paddw              m4, m6
-  paddw              m5, m7
-  SWAP               6, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m4
-  psubw              m9, m1, m5
-  paddw              m0, m4
-  paddw              m1, m5
-  SWAP               4, 8
-  SWAP               5, 9
-  psubw              m8, m2, m6
-  psubw              m9, m3, m7
-  paddw              m2, m6
-  paddw              m3, m7
-  SWAP               6, 8
-  SWAP               7, 9
-%endmacro
-
-INIT_XMM ssse3
-cglobal hadamard_8x8, 3, 5, 10, input, stride, output
-  lea                r3, [2 * strideq]
-  lea                r4, [4 * strideq]
-
-  mova               m0, [inputq]
-  mova               m1, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m2, [inputq]
-  mova               m3, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m4, [inputq]
-  mova               m5, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m6, [inputq]
-  mova               m7, [inputq + r3]
-
-  HMD8_1D
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-  HMD8_1D
-
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
-
-  RET
-%endif
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
index e916e4ff9..4f5e3f8c1 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -11,7 +11,7 @@
 
 #include "aom/aom_integer.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 // To start out, just dispatch to the function using the 2D mask and
 // pass mask stride as 0. This can be improved upon if necessary.
@@ -19,18 +19,16 @@
 void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                 const uint8_t *src0, uint32_t src0_stride,
                                 const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int h, int w) {
+                                const uint8_t *mask, int w, int h) {
   aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                            src1_stride, mask, 0, h, w, 0, 0);
+                            src1_stride, mask, 0, w, h, 0, 0);
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_blend_a64_hmask_sse4_1(
     uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
     uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
-    const uint8_t *mask, int h, int w, int bd) {
+    const uint8_t *mask, int w, int h, int bd) {
   aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
-                                   src1_8, src1_stride, mask, 0, h, w, 0, 0,
+                                   src1_8, src1_stride, mask, 0, w, h, 0, 0,
                                    bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
index 68d74e517..49c20b467 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -21,7 +21,7 @@
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/blend_sse4.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 //////////////////////////////////////////////////////////////////////////////
 // No sub-sampling
@@ -31,7 +31,7 @@ static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                      const uint8_t *src0, uint32_t src0_stride,
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, uint32_t mask_stride,
-                                     int h, int w) {
+                                     int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -58,7 +58,7 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                      const uint8_t *src0, uint32_t src0_stride,
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, uint32_t mask_stride,
-                                     int h, int w) {
+                                     int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -84,7 +84,7 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
 static void blend_a64_mask_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -119,7 +119,7 @@ static void blend_a64_mask_w16n_sse4_1(
 static void blend_a64_mask_sx_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -149,7 +149,7 @@ static void blend_a64_mask_sx_w4_sse4_1(
 static void blend_a64_mask_sx_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -179,7 +179,7 @@ static void blend_a64_mask_sx_w8_sse4_1(
 static void blend_a64_mask_sx_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -219,7 +219,7 @@ static void blend_a64_mask_sx_w16n_sse4_1(
 static void blend_a64_mask_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -248,7 +248,7 @@ static void blend_a64_mask_sy_w4_sse4_1(
 static void blend_a64_mask_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -277,7 +277,7 @@ static void blend_a64_mask_sy_w8_sse4_1(
 static void blend_a64_mask_sy_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zero = _mm_setzero_si128();
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
@@ -315,7 +315,7 @@ static void blend_a64_mask_sy_w16n_sse4_1(
 static void blend_a64_mask_sx_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -350,7 +350,7 @@ static void blend_a64_mask_sx_sy_w4_sse4_1(
 static void blend_a64_mask_sx_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -385,7 +385,7 @@ static void blend_a64_mask_sx_sy_w8_sse4_1(
 static void blend_a64_mask_sx_sy_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -435,12 +435,12 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1(
 void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                const uint8_t *src0, uint32_t src0_stride,
                                const uint8_t *src1, uint32_t src1_stride,
-                               const uint8_t *mask, uint32_t mask_stride, int h,
-                               int w, int suby, int subx) {
+                               const uint8_t *mask, uint32_t mask_stride, int w,
+                               int h, int subx, int suby) {
   typedef void (*blend_fn)(
       uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
       uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-      const uint8_t *mask, uint32_t mask_stride, int h, int w);
+      const uint8_t *mask, uint32_t mask_stride, int w, int h);
 
   // Dimensions are: width_index X subx X suby
   static const blend_fn blend[3][2][2] = {
@@ -465,15 +465,14 @@ void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
 
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, mask_stride, h, w, suby, subx);
+                         mask, mask_stride, w, h, subx, suby);
   } else {
     blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
                                               src0_stride, src1, src1_stride,
-                                              mask, mask_stride, h, w);
+                                              mask, mask_stride, w, h);
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 //////////////////////////////////////////////////////////////////////////////
 // No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
@@ -503,7 +502,7 @@ static INLINE void blend_a64_mask_bn_w4_sse4_1(
 static void blend_a64_mask_b10_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, h, blend_4_b10);
@@ -512,7 +511,7 @@ static void blend_a64_mask_b10_w4_sse4_1(
 static void blend_a64_mask_b12_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, h, blend_4_b12);
@@ -521,7 +520,7 @@ static void blend_a64_mask_b12_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
@@ -546,18 +545,18 @@ static INLINE void blend_a64_mask_bn_w8n_sse4_1(
 static void blend_a64_mask_b10_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
+                               src1_stride, mask, mask_stride, w, h,
                                blend_8_b10);
 }
 
 static void blend_a64_mask_b12_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
+                               src1_stride, mask, mask_stride, w, h,
                                blend_8_b12);
 }
 
@@ -594,7 +593,7 @@ static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
 static void blend_a64_mask_b10_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -604,7 +603,7 @@ static void blend_a64_mask_b10_sx_w4_sse4_1(
 static void blend_a64_mask_b12_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -614,7 +613,7 @@ static void blend_a64_mask_b12_sx_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
@@ -643,18 +642,18 @@ static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
 static void blend_a64_mask_b10_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b10);
 }
 
 static void blend_a64_mask_b12_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b12);
 }
 
@@ -690,7 +689,7 @@ static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
 static void blend_a64_mask_b10_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -700,7 +699,7 @@ static void blend_a64_mask_b10_sy_w4_sse4_1(
 static void blend_a64_mask_b12_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -710,7 +709,7 @@ static void blend_a64_mask_b12_sy_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
@@ -738,18 +737,18 @@ static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
 static void blend_a64_mask_b10_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b10);
 }
 
 static void blend_a64_mask_b12_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b12);
 }
 
@@ -791,7 +790,7 @@ static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
 static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, h,
@@ -801,7 +800,7 @@ static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
 static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, h,
@@ -811,7 +810,7 @@ static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
@@ -845,18 +844,18 @@ static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
 static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, h, w,
+                                     src1_stride, mask, mask_stride, w, h,
                                      blend_8_b10);
 }
 
 static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, h, w,
+                                     src1_stride, mask, mask_stride, w, h,
                                      blend_8_b12);
 }
 
@@ -869,12 +868,12 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
                                       uint32_t src0_stride,
                                       const uint8_t *src1_8,
                                       uint32_t src1_stride, const uint8_t *mask,
-                                      uint32_t mask_stride, int h, int w,
-                                      int suby, int subx, int bd) {
+                                      uint32_t mask_stride, int w, int h,
+                                      int subx, int suby, int bd) {
   typedef void (*blend_fn)(
       uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
       uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-      const uint8_t *mask, uint32_t mask_stride, int h, int w);
+      const uint8_t *mask, uint32_t mask_stride, int w, int h);
 
   // Dimensions are: bd_index X width_index X subx X suby
   static const blend_fn blend[2][2][2][2] = {
@@ -909,8 +908,8 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
   assert(bd == 8 || bd == 10 || bd == 12);
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                                src1_stride, mask, mask_stride, h, w, suby,
-                                subx, bd);
+                                src1_stride, mask, mask_stride, w, h, subx,
+                                suby, bd);
   } else {
     uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
     const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
@@ -918,7 +917,113 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
 
     blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
         dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-        mask_stride, h, w);
+        mask_stride, w, h);
+  }
+}
+
+static INLINE void blend_a64_d16_mask(uint8_t *dst, const CONV_BUF_TYPE *src0,
+                                      const CONV_BUF_TYPE *src1,
+                                      const __m128i *m,
+                                      const __m128i *v_round_offset,
+                                      const __m128i *v_maxval, int round_bits) {
+  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+  const __m128i s0 = xx_loadl_64(src0);
+  const __m128i s1 = xx_loadl_64(src1);
+  const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
+  const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
+  const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
+  const __m128i res_b = _mm_srli_epi32(res_a, AOM_BLEND_A64_ROUND_BITS);
+  const __m128i res_c = _mm_sub_epi32(res_b, *v_round_offset);
+  const __m128i res_d = xx_roundn_epi32(res_c, round_bits);
+  const __m128i res_e = _mm_packs_epi32(res_d, res_d);
+  const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+  xx_storel_32(dst, res);
+}
+
+void aom_lowbd_blend_a64_d16_mask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i v_ro_a = xx_loadl_32(&round_offset);
+  const __m128i v_round_offset = _mm_shuffle_epi32(v_ro_a, 0);
+  const __m128i one_w = _mm_set1_epi16(1);
+  const __m128i one_b = _mm_set1_epi8(1);
+  const __m128i two_w = _mm_set1_epi16(2);
+
+  if (subw == 0 && subh == 0) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m0 = xx_loadl_32(&mask[i * mask_stride + j]);
+        const __m128i m = _mm_cvtepu8_epi16(m0);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m_i0 =
+            xx_loadl_64(&mask[(2 * i) * mask_stride + (2 * j)]);
+        const __m128i m_i1 =
+            xx_loadl_64(&mask[(2 * i + 1) * mask_stride + (2 * j)]);
+        const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+        const __m128i m_bd = _mm_maddubs_epi16(m_i1, one_b);
+        const __m128i m_acbd = _mm_add_epi16(m_ac, m_bd);
+        const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+        const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m_i0 = xx_loadl_64(&mask[i * mask_stride + (2 * j)]);
+        const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+        const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
+        const __m128i m = _mm_srli_epi16(m_ac_1, 1);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
+  } else {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m_i0 = xx_loadl_64(&mask[(2 * i) * mask_stride + j]);
+        const __m128i m_i1 = xx_loadl_64(&mask[(2 * i + 1) * mask_stride + j]);
+        const __m128i m_i01 = _mm_unpacklo_epi8(m_i0, m_i1);
+        const __m128i m_ac = _mm_maddubs_epi16(m_i01, one_b);
+        const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
+        const __m128i m = _mm_srli_epi16(m_ac_1, 1);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
index 9dabe5b79..59506bdfe 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -21,7 +21,7 @@
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/blend_sse4.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 //////////////////////////////////////////////////////////////////////////////
 // Implementation - No sub-sampling
@@ -30,7 +30,7 @@
 static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                       const uint8_t *src0, uint32_t src0_stride,
                                       const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int h, int w) {
+                                      const uint8_t *mask, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -55,7 +55,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
 static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                       const uint8_t *src0, uint32_t src0_stride,
                                       const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int h, int w) {
+                                      const uint8_t *mask, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -82,7 +82,7 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                         uint32_t src0_stride,
                                         const uint8_t *src1,
                                         uint32_t src1_stride,
-                                        const uint8_t *mask, int h, int w) {
+                                        const uint8_t *mask, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -112,11 +112,11 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
 void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                 const uint8_t *src0, uint32_t src0_stride,
                                 const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int h, int w) {
+                                const uint8_t *mask, int w, int h) {
   typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
                            const uint8_t *src0, uint32_t src0_stride,
                            const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int h, int w);
+                           const uint8_t *mask, int w, int h);
 
   // Dimension: width_index
   static const blend_fn blend[9] = {
@@ -139,11 +139,10 @@ void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
-  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h,
-                 w);
+  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
+                 h);
 }
 
-#if CONFIG_HIGHBITDEPTH
 //////////////////////////////////////////////////////////////////////////////
 // Implementation - No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
@@ -174,7 +173,7 @@ static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                           uint32_t src0_stride,
                                           const uint16_t *src1,
                                           uint32_t src1_stride,
-                                          const uint8_t *mask, int h, int w) {
+                                          const uint8_t *mask, int w, int h) {
   (void)w;
   blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                src1_stride, mask, h, blend_4_b10);
@@ -185,7 +184,7 @@ static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                           uint32_t src0_stride,
                                           const uint16_t *src1,
                                           uint32_t src1_stride,
-                                          const uint8_t *mask, int h, int w) {
+                                          const uint8_t *mask, int w, int h) {
   (void)w;
   blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                src1_stride, mask, h, blend_4_b12);
@@ -194,7 +193,7 @@ static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
 static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, int h, int w, blend_unit_fn blend) {
+    const uint8_t *mask, int w, int h, blend_unit_fn blend) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -218,9 +217,9 @@ static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                            uint32_t src0_stride,
                                            const uint16_t *src1,
                                            uint32_t src1_stride,
-                                           const uint8_t *mask, int h, int w) {
+                                           const uint8_t *mask, int w, int h) {
   blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, h, w, blend_8_b10);
+                                src1_stride, mask, w, h, blend_8_b10);
 }
 
 static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
@@ -228,9 +227,9 @@ static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                            uint32_t src0_stride,
                                            const uint16_t *src1,
                                            uint32_t src1_stride,
-                                           const uint8_t *mask, int h, int w) {
+                                           const uint8_t *mask, int w, int h) {
   blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, h, w, blend_8_b12);
+                                src1_stride, mask, w, h, blend_8_b12);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -240,11 +239,11 @@ static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
 void aom_highbd_blend_a64_vmask_sse4_1(
     uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
     uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
-    const uint8_t *mask, int h, int w, int bd) {
+    const uint8_t *mask, int w, int h, int bd) {
   typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
                            const uint16_t *src0, uint32_t src0_stride,
                            const uint16_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int h, int w);
+                           const uint8_t *mask, int w, int h);
 
   // Dimensions are: bd_index X width_index
   static const blend_fn blend[2][2] = {
@@ -272,14 +271,13 @@ void aom_highbd_blend_a64_vmask_sse4_1(
 
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                                 src1_stride, mask, h, w, bd);
+                                 src1_stride, mask, w, h, bd);
   } else {
     uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
     const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
     const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
 
     blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, h, w);
+                                  src1_stride, mask, w, h);
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
index daa2b2b3a..4880438bc 100644
--- a/third_party/aom/aom_dsp/x86/blend_sse4.h
+++ b/third_party/aom/aom_dsp/x86/blend_sse4.h
@@ -53,7 +53,6 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
   return v_res_w;
 }
 
-#if CONFIG_HIGHBITDEPTH
 typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w);
 
@@ -141,6 +140,5 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
 
   return v_res_w;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 #endif  // AOM_DSP_X86_BLEND_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
index 5f9596a74..3f46420dd 100644
--- a/third_party/aom/aom_dsp/x86/common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -14,7 +14,7 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 // Note: in and out could have the same value
 static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
index 8641164db..36fb1963a 100644
--- a/third_party/aom/aom_dsp/x86/convolve.h
+++ b/third_party/aom/aom_dsp/x86/convolve.h
@@ -13,7 +13,8 @@
 
 #include <assert.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 #include "aom_dsp/aom_convolve.h"
@@ -84,102 +85,6 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
     }                                                                        \
   }
 
-#define FUN_CONV_2D(avg, opt)                                                \
-  void aom_convolve8_##avg##opt(                                             \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {                \
-    assert((-128 <= filter_x[3]) && (filter_x[3] <= 127));                   \
-    assert((-128 <= filter_y[3]) && (filter_y[3] <= 127));                   \
-    assert(w <= MAX_SB_SIZE);                                                \
-    assert(h <= MAX_SB_SIZE);                                                \
-    assert(x_step_q4 == 16);                                                 \
-    assert(y_step_q4 == 16);                                                 \
-    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] ||          \
-        filter_y[1] || filter_y[2]) {                                        \
-      DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
-      aom_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2,    \
-                                MAX_SB_SIZE, filter_x, x_step_q4, filter_y,  \
-                                y_step_q4, w, h + 7);                        \
-      aom_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
-                                      dst, dst_stride, filter_x, x_step_q4,  \
-                                      filter_y, y_step_q4, w, h);            \
-    } else {                                                                 \
-      DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \
-      aom_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE,        \
-                                filter_x, x_step_q4, filter_y, y_step_q4, w, \
-                                h + 1);                                      \
-      aom_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride,  \
-                                      filter_x, x_step_q4, filter_y,         \
-                                      y_step_q4, w, h);                      \
-    }                                                                        \
-  }
-
-#if CONFIG_LOOP_RESTORATION
-// convolve_add_src is only used by the Wiener filter, which will never
-// end up calling the bilinear functions (it uses a symmetric filter, so
-// the possible numbers of taps are 1,3,5,7)
-#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \
-                                opt)                                        \
-  void aom_convolve8_##name##_##opt(                                        \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,               \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,         \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {               \
-    (void)filter_x;                                                         \
-    (void)x_step_q4;                                                        \
-    (void)filter_y;                                                         \
-    (void)y_step_q4;                                                        \
-    assert((-128 <= filter[3]) && (filter[3] <= 127));                      \
-    assert(step_q4 == 16);                                                  \
-    while (w >= 16) {                                                       \
-      aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                               dst_stride, h, filter);      \
-      src += 16;                                                            \
-      dst += 16;                                                            \
-      w -= 16;                                                              \
-    }                                                                       \
-    while (w >= 8) {                                                        \
-      aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,   \
-                                              dst_stride, h, filter);       \
-      src += 8;                                                             \
-      dst += 8;                                                             \
-      w -= 8;                                                               \
-    }                                                                       \
-    while (w >= 4) {                                                        \
-      aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,   \
-                                              dst_stride, h, filter);       \
-      src += 4;                                                             \
-      dst += 4;                                                             \
-      w -= 4;                                                               \
-    }                                                                       \
-    if (w) {                                                                \
-      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,  \
-                               x_step_q4, filter_y, y_step_q4, w, h);       \
-    }                                                                       \
-  }
-
-#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt)                           \
-  void aom_convolve8_##type##opt(                                           \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,               \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,         \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {               \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]);  \
-    assert((-128 <= filter_x[3]) && (filter_x[3] <= 127));                  \
-    assert((-128 <= filter_y[3]) && (filter_y[3] <= 127));                  \
-    assert(w <= MAX_SB_SIZE);                                               \
-    assert(h <= MAX_SB_SIZE);                                               \
-    assert(x_step_q4 == 16);                                                \
-    assert(y_step_q4 == 16);                                                \
-    aom_convolve8_##htype##horiz_##opt(                                     \
-        src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x,    \
-        x_step_q4, filter_y, y_step_q4, w, h + 7);                          \
-    aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
-                                     dst, dst_stride, filter_x, x_step_q4,  \
-                                     filter_y, y_step_q4, w, h);            \
-  }
-#endif
-
-#if CONFIG_HIGHBITDEPTH
 typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        const ptrdiff_t src_pitch,
                                        uint16_t *output_ptr,
@@ -248,41 +153,4 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
     }                                                                      \
   }
 
-#define HIGH_FUN_CONV_2D(avg, opt)                                            \
-  void aom_highbd_convolve8_##avg##opt(                                       \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {         \
-    assert(w <= MAX_SB_SIZE);                                                 \
-    assert(h <= MAX_SB_SIZE);                                                 \
-    if (x_step_q4 == 16 && y_step_q4 == 16) {                                 \
-      if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 ||  \
-          filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) {  \
-        DECLARE_ALIGNED(16, uint16_t,                                         \
-                        fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]);             \
-        aom_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,    \
-                                         CONVERT_TO_BYTEPTR(fdata2),          \
-                                         MAX_SB_SIZE, filter_x, x_step_q4,    \
-                                         filter_y, y_step_q4, w, h + 7, bd);  \
-        aom_highbd_convolve8_##avg##vert_##opt(                               \
-            CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst,   \
-            dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);  \
-      } else {                                                                \
-        DECLARE_ALIGNED(16, uint16_t,                                         \
-                        fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]);             \
-        aom_highbd_convolve8_horiz_##opt(                                     \
-            src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE,         \
-            filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd);          \
-        aom_highbd_convolve8_##avg##vert_##opt(                               \
-            CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride,         \
-            filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);              \
-      }                                                                       \
-    } else {                                                                  \
-      aom_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride,         \
-                                    filter_x, x_step_q4, filter_y, y_step_q4, \
-                                    w, h, bd);                                \
-    }                                                                         \
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-
 #endif  // AOM_DSP_X86_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
new file mode 100644
index 000000000..7790baf2e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_
+#define AOM_DSP_X86_CONVOLVE_AVX2_H_
+
+// filters for 16
+DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+static INLINE void prepare_coeffs_lowbd(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [4] */) {
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params, subpel_q4 & SUBPEL_MASK);
+  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // right shift all filter co-efficients by 1 to reduce the bits required.
+  // This extra right shift will be taken care of at the end while rounding
+  // the result.
+  // Since all filter co-efficients are even, this change will not affect the
+  // end result
+  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                            _mm_set1_epi16(0xffff)));
+
+  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
+}
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m256i *const coeffs /* [4] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params, subpel_q4 & SUBPEL_MASK);
+
+  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
+  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE __m256i convolve_lowbd(const __m256i *const s,
+                                     const __m256i *const coeffs) {
+  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
+
+  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
+                                       _mm256_add_epi16(res_23, res_67));
+
+  return res;
+}
+
+static INLINE __m256i convolve(const __m256i *const s,
+                               const __m256i *const coeffs) {
+  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
+
+  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
+                                       _mm256_add_epi32(res_2, res_3));
+
+  return res;
+}
+
+static INLINE __m256i convolve_lowbd_x(const __m256i data,
+                                       const __m256i *const coeffs,
+                                       const __m256i *const filt) {
+  __m256i s[4];
+
+  s[0] = _mm256_shuffle_epi8(data, filt[0]);
+  s[1] = _mm256_shuffle_epi8(data, filt[1]);
+  s[2] = _mm256_shuffle_epi8(data, filt[2]);
+  s[3] = _mm256_shuffle_epi8(data, filt[3]);
+
+  return convolve_lowbd(s, coeffs);
+}
+
+static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
+                                         const __m256i *const res,
+                                         const int do_average) {
+  __m256i d;
+  if (do_average) {
+    d = _mm256_load_si256((__m256i *)dst);
+    d = _mm256_add_epi32(d, *res);
+    d = _mm256_srai_epi32(d, 1);
+  } else {
+    d = *res;
+  }
+  _mm256_store_si256((__m256i *)dst, d);
+}
+
+static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
+                               const __m256i *const res_unsigned,
+                               const __m256i *const wt,
+                               const int use_jnt_comp_avg) {
+  __m256i res;
+  if (use_jnt_comp_avg) {
+    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
+    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
+    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
+
+    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+    res = _mm256_packs_epi32(res_lo, res_hi);
+  } else {
+    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
+    res = _mm256_srai_epi16(wt_res, 1);
+  }
+  return res;
+}
+
+static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
+                                        const __m256i *const offset_const,
+                                        const __m256i *const round_const,
+                                        const int round_shift) {
+  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
+  const __m256i res_round = _mm256_srai_epi16(
+      _mm256_add_epi16(res_signed, *round_const), round_shift);
+  return res_round;
+}
+
+static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
+                                      const __m256i *const res_unsigned,
+                                      const __m256i *const wt0,
+                                      const __m256i *const wt1,
+                                      const int use_jnt_comp_avg) {
+  __m256i res;
+  if (use_jnt_comp_avg) {
+    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
+    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
+    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
+    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
+  } else {
+    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
+    res = _mm256_srai_epi32(wt_res, 1);
+  }
+  return res;
+}
+
+static INLINE __m256i highbd_convolve_rounding(
+    const __m256i *const res_unsigned, const __m256i *const offset_const,
+    const __m256i *const round_const, const int round_shift) {
+  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
+  const __m256i res_round = _mm256_srai_epi32(
+      _mm256_add_epi32(res_signed, *round_const), round_shift);
+
+  return res_round;
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
new file mode 100644
index 000000000..e80c5872f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+#define _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+
+// Note:
+//  This header file should be put below any x86 intrinsics head file
+
+static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
+                             const int do_average) {
+  __m128i d;
+  if (do_average) {
+    d = _mm_load_si128((__m128i *)dst);
+    d = _mm_add_epi32(d, *res);
+    d = _mm_srai_epi32(d, 1);
+  } else {
+    d = *res;
+  }
+  _mm_store_si128((__m128i *)dst, d);
+}
+
+#endif  // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
new file mode 100644
index 000000000..846fe7bb4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_CONVOLVE_SSE2_H_
+#define AOM_DSP_X86_CONVOLVE_SSE2_H_
+
+// Note:
+//  This header file should be put below any x86 intrinsics head file
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m128i *const coeffs /* [4] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params, subpel_q4 & SUBPEL_MASK);
+  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE __m128i convolve(const __m128i *const s,
+                               const __m128i *const coeffs) {
+  const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]);
+  const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]);
+  const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]);
+  const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]);
+
+  const __m128i res =
+      _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3));
+
+  return res;
+}
+
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
+  ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+  ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+  ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
+  ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
+  ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
+  ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
+                               const __m128i *const res_unsigned,
+                               const __m128i *const wt,
+                               const int use_jnt_comp_avg) {
+  __m128i res;
+  if (use_jnt_comp_avg) {
+    const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
+    const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+    const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt);
+    const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt);
+
+    const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+    const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+    res = _mm_packs_epi32(res_lo, res_hi);
+  } else {
+    const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned);
+    res = _mm_srai_epi16(wt_res, 1);
+  }
+  return res;
+}
+
+static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned,
+                                        const __m128i *const offset_const,
+                                        const __m128i *const round_const,
+                                        const int round_shift) {
+  const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const);
+  const __m128i res_round =
+      _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift);
+  return res_round;
+}
+
+static INLINE __m128i highbd_convolve_rounding_sse2(
+    const __m128i *const res_unsigned, const __m128i *const offset_const,
+    const __m128i *const round_const, const int round_shift) {
+  const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const);
+  const __m128i res_round =
+      _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift);
+
+  return res_round;
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
new file mode 100644
index 000000000..d48c25667
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_
+#define _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_
+
+// Note:
+//  This header file should be put below any x86 intrinsics head file
+
+static INLINE void mult_add_store(CONV_BUF_TYPE *const dst,
+                                  const __m128i *const res,
+                                  const __m128i *const wt0,
+                                  const __m128i *const wt1,
+                                  const int do_average) {
+  __m128i d;
+  if (do_average) {
+    d = _mm_load_si128((__m128i *)dst);
+    d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1));
+    d = _mm_srai_epi32(d, DIST_PRECISION_BITS);
+  } else {
+    d = *res;
+  }
+  _mm_store_si128((__m128i *)dst, d);
+}
+
+static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
+                                             const __m128i *const res_unsigned,
+                                             const __m128i *const wt0,
+                                             const __m128i *const wt1,
+                                             const int use_jnt_comp_avg) {
+  __m128i res;
+  if (use_jnt_comp_avg) {
+    const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0);
+    const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1);
+
+    const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res);
+    res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS);
+  } else {
+    const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned);
+    res = _mm_srai_epi32(wt_res, 1);
+  }
+  return res;
+}
+
+#endif  // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c
new file mode 100644
index 000000000..54da02253
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_avx2.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+extern void aom_transpose_float_sse2(const float *A, float *B, int n);
+extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
+                                          int n);
+
+// Generate the 1d forward transforms for float using _mm256
+GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+          _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+          _mm256_mul_ps);
+GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+           _mm256_mul_ps);
+GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+           _mm256_mul_ps);
+
+void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// Generate the 1d inverse transforms for float using _mm256
+GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+           _mm256_mul_ps);
+GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+            _mm256_mul_ps);
+GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+            _mm256_mul_ps);
+
+void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
+                  aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
+                  aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
+                  aom_transpose_float_sse2, 8);
+}
diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c
new file mode 100644
index 000000000..12bdc3e18
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_sse2.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+s * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void transpose4x4(const float *A, float *B, const int lda,
+                                const int ldb) {
+  __m128 row1 = _mm_load_ps(&A[0 * lda]);
+  __m128 row2 = _mm_load_ps(&A[1 * lda]);
+  __m128 row3 = _mm_load_ps(&A[2 * lda]);
+  __m128 row4 = _mm_load_ps(&A[3 * lda]);
+  _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
+  _mm_store_ps(&B[0 * ldb], row1);
+  _mm_store_ps(&B[1 * ldb], row2);
+  _mm_store_ps(&B[2 * ldb], row3);
+  _mm_store_ps(&B[3 * ldb], row4);
+}
+
+void aom_transpose_float_sse2(const float *A, float *B, int n) {
+  for (int y = 0; y < n; y += 4) {
+    for (int x = 0; x < n; x += 4) {
+      transpose4x4(A + y * n + x, B + x * n + y, n, n);
+    }
+  }
+}
+
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
+  const int n2 = n / 2;
+  output[0] = packed[0];
+  output[1] = 0;
+  output[2 * (n2 * n)] = packed[n2 * n];
+  output[2 * (n2 * n) + 1] = 0;
+
+  output[2 * n2] = packed[n2];
+  output[2 * n2 + 1] = 0;
+  output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
+  output[2 * (n2 * n + n2) + 1] = 0;
+
+  for (int c = 1; c < n2; ++c) {
+    output[2 * (0 * n + c)] = packed[c];
+    output[2 * (0 * n + c) + 1] = packed[c + n2];
+    output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
+    output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
+  }
+  for (int r = 1; r < n2; ++r) {
+    output[2 * (r * n + 0)] = packed[r * n];
+    output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
+    output[2 * (r * n + n2) + 0] = packed[r * n + n2];
+    output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
+
+    for (int c = 1; c < AOMMIN(n2, 4); ++c) {
+      output[2 * (r * n + c)] =
+          packed[r * n + c] - packed[(r + n2) * n + c + n2];
+      output[2 * (r * n + c) + 1] =
+          packed[(r + n2) * n + c] + packed[r * n + c + n2];
+    }
+
+    for (int c = 4; c < n2; c += 4) {
+      __m128 real1 = _mm_load_ps(packed + r * n + c);
+      __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
+      __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
+      __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
+      real1 = _mm_sub_ps(real1, real2);
+      imag1 = _mm_add_ps(imag1, imag2);
+      _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
+      _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
+    }
+
+    int r2 = r + n2;
+    int r3 = n - r2;
+    output[2 * (r2 * n + 0)] = packed[r3 * n];
+    output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
+    output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
+    output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
+    for (int c = 1; c < AOMMIN(4, n2); ++c) {
+      output[2 * (r2 * n + c)] =
+          packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
+      output[2 * (r2 * n + c) + 1] =
+          -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
+    }
+    for (int c = 4; c < n2; c += 4) {
+      __m128 real1 = _mm_load_ps(packed + r3 * n + c);
+      __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
+      __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
+      __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
+      real1 = _mm_add_ps(real1, real2);
+      imag1 = _mm_sub_ps(imag2, imag1);
+      _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
+      _mm_store_ps(output + 2 * (r2 * n + c + 2),
+                   _mm_unpackhi_ps(real1, imag1));
+    }
+  }
+}
+
+// Generate definitions for 1d transforms using float and __mm128
+GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+          _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+          _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// Generate definitions for 1d inverse transforms using float and mm128
+GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
+                  aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
+                  aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
+                  aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
+                  aom_transpose_float_sse2, 4);
+}
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c b/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c
deleted file mode 100644
index b8ec08de7..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c
+++ /dev/null
@@ -1,862 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "aom_dsp/fwd_txfm.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// Apply a 32-element IDCT to 8 columns. This does not do any transposition
-// of its output - the caller is expected to do that.
-// The input buffers are the top and bottom halves of an 8x32 block.
-void fdct32_8col(__m128i *in0, __m128i *in1) {
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
-  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
-  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
-  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
-  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
-  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
-  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
-  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
-  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i step1[32];
-  __m128i step2[32];
-  __m128i step3[32];
-  __m128i out[32];
-  // Stage 1
-  {
-    const __m128i *ina = in0;
-    const __m128i *inb = in1 + 15;
-    __m128i *step1a = &step1[0];
-    __m128i *step1b = &step1[31];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  {
-    const __m128i *ina = in0 + 4;
-    const __m128i *inb = in1 + 11;
-    __m128i *step1a = &step1[4];
-    __m128i *step1b = &step1[27];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  {
-    const __m128i *ina = in0 + 8;
-    const __m128i *inb = in1 + 7;
-    __m128i *step1a = &step1[8];
-    __m128i *step1b = &step1[23];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  {
-    const __m128i *ina = in0 + 12;
-    const __m128i *inb = in1 + 3;
-    __m128i *step1a = &step1[12];
-    __m128i *step1b = &step1[19];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  // Stage 2
-  {
-    step2[0] = _mm_add_epi16(step1[0], step1[15]);
-    step2[1] = _mm_add_epi16(step1[1], step1[14]);
-    step2[2] = _mm_add_epi16(step1[2], step1[13]);
-    step2[3] = _mm_add_epi16(step1[3], step1[12]);
-    step2[4] = _mm_add_epi16(step1[4], step1[11]);
-    step2[5] = _mm_add_epi16(step1[5], step1[10]);
-    step2[6] = _mm_add_epi16(step1[6], step1[9]);
-    step2[7] = _mm_add_epi16(step1[7], step1[8]);
-    step2[8] = _mm_sub_epi16(step1[7], step1[8]);
-    step2[9] = _mm_sub_epi16(step1[6], step1[9]);
-    step2[10] = _mm_sub_epi16(step1[5], step1[10]);
-    step2[11] = _mm_sub_epi16(step1[4], step1[11]);
-    step2[12] = _mm_sub_epi16(step1[3], step1[12]);
-    step2[13] = _mm_sub_epi16(step1[2], step1[13]);
-    step2[14] = _mm_sub_epi16(step1[1], step1[14]);
-    step2[15] = _mm_sub_epi16(step1[0], step1[15]);
-  }
-  {
-    const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
-    const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
-    const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
-    const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
-    const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
-    const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
-    const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
-    const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
-    const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
-    const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
-    const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
-    const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
-    const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
-    const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
-    const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
-    const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
-    const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
-    const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
-    const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
-    const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
-    const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
-    const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
-    const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
-    const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
-    // dct_const_round_shift
-    const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
-    const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
-    const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
-    const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
-    const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
-    const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
-    const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
-    const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
-    const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
-    const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
-    const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
-    const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
-    const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
-    const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
-    const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
-    const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
-    // Combine
-    step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
-    step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
-    step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
-    step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
-    step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
-    step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
-    step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
-    step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
-  }
-  // Stage 3
-  {
-    step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
-    step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
-    step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
-    step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
-    step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
-    step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
-    step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
-    step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
-  }
-  {
-    const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
-    const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
-    const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
-    const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
-    const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
-    const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
-    const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
-    const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
-    const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
-    const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
-    const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
-    const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
-    // dct_const_round_shift
-    const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
-    const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
-    const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
-    const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
-    const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
-    const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
-    const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
-    const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
-    // Combine
-    step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
-    step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
-    step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
-    step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
-  }
-  {
-    step3[16] = _mm_add_epi16(step2[23], step1[16]);
-    step3[17] = _mm_add_epi16(step2[22], step1[17]);
-    step3[18] = _mm_add_epi16(step2[21], step1[18]);
-    step3[19] = _mm_add_epi16(step2[20], step1[19]);
-    step3[20] = _mm_sub_epi16(step1[19], step2[20]);
-    step3[21] = _mm_sub_epi16(step1[18], step2[21]);
-    step3[22] = _mm_sub_epi16(step1[17], step2[22]);
-    step3[23] = _mm_sub_epi16(step1[16], step2[23]);
-    step3[24] = _mm_sub_epi16(step1[31], step2[24]);
-    step3[25] = _mm_sub_epi16(step1[30], step2[25]);
-    step3[26] = _mm_sub_epi16(step1[29], step2[26]);
-    step3[27] = _mm_sub_epi16(step1[28], step2[27]);
-    step3[28] = _mm_add_epi16(step2[27], step1[28]);
-    step3[29] = _mm_add_epi16(step2[26], step1[29]);
-    step3[30] = _mm_add_epi16(step2[25], step1[30]);
-    step3[31] = _mm_add_epi16(step2[24], step1[31]);
-  }
-
-  // Stage 4
-  {
-    step1[0] = _mm_add_epi16(step3[3], step3[0]);
-    step1[1] = _mm_add_epi16(step3[2], step3[1]);
-    step1[2] = _mm_sub_epi16(step3[1], step3[2]);
-    step1[3] = _mm_sub_epi16(step3[0], step3[3]);
-    step1[8] = _mm_add_epi16(step3[11], step2[8]);
-    step1[9] = _mm_add_epi16(step3[10], step2[9]);
-    step1[10] = _mm_sub_epi16(step2[9], step3[10]);
-    step1[11] = _mm_sub_epi16(step2[8], step3[11]);
-    step1[12] = _mm_sub_epi16(step2[15], step3[12]);
-    step1[13] = _mm_sub_epi16(step2[14], step3[13]);
-    step1[14] = _mm_add_epi16(step3[13], step2[14]);
-    step1[15] = _mm_add_epi16(step3[12], step2[15]);
-  }
-  {
-    const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
-    const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
-    const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
-    const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
-    const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
-    const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
-    // dct_const_round_shift
-    const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
-    const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
-    const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
-    const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
-    // Combine
-    step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
-    step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
-  }
-  {
-    const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
-    const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
-    const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
-    const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
-    const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
-    const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
-    const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
-    const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
-    const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
-    const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
-    const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
-    const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
-    const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
-    const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
-    const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
-    const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
-    const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
-    const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
-    const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
-    const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
-    const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
-    const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
-    const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
-    const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
-    // dct_const_round_shift
-    const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
-    const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
-    const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
-    const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
-    const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
-    const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
-    const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
-    const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
-    const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
-    const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
-    const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
-    const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
-    const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
-    const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
-    const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
-    const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
-    // Combine
-    step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
-    step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
-    step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
-    step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
-    step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
-    step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
-    step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
-    step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
-  }
-  // Stage 5
-  {
-    step2[4] = _mm_add_epi16(step1[5], step3[4]);
-    step2[5] = _mm_sub_epi16(step3[4], step1[5]);
-    step2[6] = _mm_sub_epi16(step3[7], step1[6]);
-    step2[7] = _mm_add_epi16(step1[6], step3[7]);
-  }
-  {
-    const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
-    const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
-    const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
-    const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
-    const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
-    const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
-    const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
-    const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
-    const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
-    const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
-    const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
-    const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
-    // dct_const_round_shift
-    const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
-    const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
-    const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
-    const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
-    const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
-    const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
-    const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
-    const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
-    // Combine
-    out[0] = _mm_packs_epi32(out_00_6, out_00_7);
-    out[16] = _mm_packs_epi32(out_16_6, out_16_7);
-    out[8] = _mm_packs_epi32(out_08_6, out_08_7);
-    out[24] = _mm_packs_epi32(out_24_6, out_24_7);
-  }
-  {
-    const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
-    const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
-    const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
-    const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
-    const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
-    const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
-    const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
-    const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
-    const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
-    const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
-    const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
-    const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
-    // dct_const_round_shift
-    const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
-    const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
-    const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
-    const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
-    const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
-    const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
-    const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
-    const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
-    // Combine
-    step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
-    step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
-    step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
-    step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
-  }
-  {
-    step2[16] = _mm_add_epi16(step1[19], step3[16]);
-    step2[17] = _mm_add_epi16(step1[18], step3[17]);
-    step2[18] = _mm_sub_epi16(step3[17], step1[18]);
-    step2[19] = _mm_sub_epi16(step3[16], step1[19]);
-    step2[20] = _mm_sub_epi16(step3[23], step1[20]);
-    step2[21] = _mm_sub_epi16(step3[22], step1[21]);
-    step2[22] = _mm_add_epi16(step1[21], step3[22]);
-    step2[23] = _mm_add_epi16(step1[20], step3[23]);
-    step2[24] = _mm_add_epi16(step1[27], step3[24]);
-    step2[25] = _mm_add_epi16(step1[26], step3[25]);
-    step2[26] = _mm_sub_epi16(step3[25], step1[26]);
-    step2[27] = _mm_sub_epi16(step3[24], step1[27]);
-    step2[28] = _mm_sub_epi16(step3[31], step1[28]);
-    step2[29] = _mm_sub_epi16(step3[30], step1[29]);
-    step2[30] = _mm_add_epi16(step1[29], step3[30]);
-    step2[31] = _mm_add_epi16(step1[28], step3[31]);
-  }
-  // Stage 6
-  {
-    const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-    const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-    const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-    const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-    const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-    const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-    const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-    const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-    const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
-    const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
-    const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
-    const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
-    const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
-    const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
-    const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
-    const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
-    // dct_const_round_shift
-    const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
-    const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
-    const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
-    const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
-    const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
-    const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
-    const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
-    const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
-    // Combine
-    out[4] = _mm_packs_epi32(out_04_6, out_04_7);
-    out[20] = _mm_packs_epi32(out_20_6, out_20_7);
-    out[12] = _mm_packs_epi32(out_12_6, out_12_7);
-    out[28] = _mm_packs_epi32(out_28_6, out_28_7);
-  }
-  {
-    step3[8] = _mm_add_epi16(step2[9], step1[8]);
-    step3[9] = _mm_sub_epi16(step1[8], step2[9]);
-    step3[10] = _mm_sub_epi16(step1[11], step2[10]);
-    step3[11] = _mm_add_epi16(step2[10], step1[11]);
-    step3[12] = _mm_add_epi16(step2[13], step1[12]);
-    step3[13] = _mm_sub_epi16(step1[12], step2[13]);
-    step3[14] = _mm_sub_epi16(step1[15], step2[14]);
-    step3[15] = _mm_add_epi16(step2[14], step1[15]);
-  }
-  {
-    const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
-    const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
-    const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
-    const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
-    const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
-    const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
-    const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
-    const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
-    const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
-    const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
-    const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
-    const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
-    const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
-    const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
-    const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
-    const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
-    const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
-    const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
-    const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
-    const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
-    const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
-    const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
-    const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
-    const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
-    // dct_const_round_shift
-    const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
-    const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
-    const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
-    const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
-    const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
-    const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
-    const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
-    const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
-    const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
-    const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
-    const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
-    const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
-    const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
-    const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
-    const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
-    const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
-    // Combine
-    step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
-    step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
-    step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
-    step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
-    // Combine
-    step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
-    step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
-    step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
-    step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
-  }
-  // Stage 7
-  {
-    const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
-    const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
-    const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
-    const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
-    const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
-    const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
-    const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
-    const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
-    const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
-    const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
-    const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
-    const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
-    const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
-    const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
-    const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
-    const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
-    const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
-    const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
-    const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
-    const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
-    const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
-    const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
-    const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
-    const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
-    // dct_const_round_shift
-    const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
-    const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
-    const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
-    const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
-    const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
-    const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
-    const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
-    const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
-    const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
-    const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
-    const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
-    const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
-    const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
-    const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
-    const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
-    const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
-    // Combine
-    out[2] = _mm_packs_epi32(out_02_6, out_02_7);
-    out[18] = _mm_packs_epi32(out_18_6, out_18_7);
-    out[10] = _mm_packs_epi32(out_10_6, out_10_7);
-    out[26] = _mm_packs_epi32(out_26_6, out_26_7);
-    out[6] = _mm_packs_epi32(out_06_6, out_06_7);
-    out[22] = _mm_packs_epi32(out_22_6, out_22_7);
-    out[14] = _mm_packs_epi32(out_14_6, out_14_7);
-    out[30] = _mm_packs_epi32(out_30_6, out_30_7);
-  }
-  {
-    step1[16] = _mm_add_epi16(step3[17], step2[16]);
-    step1[17] = _mm_sub_epi16(step2[16], step3[17]);
-    step1[18] = _mm_sub_epi16(step2[19], step3[18]);
-    step1[19] = _mm_add_epi16(step3[18], step2[19]);
-    step1[20] = _mm_add_epi16(step3[21], step2[20]);
-    step1[21] = _mm_sub_epi16(step2[20], step3[21]);
-    step1[22] = _mm_sub_epi16(step2[23], step3[22]);
-    step1[23] = _mm_add_epi16(step3[22], step2[23]);
-    step1[24] = _mm_add_epi16(step3[25], step2[24]);
-    step1[25] = _mm_sub_epi16(step2[24], step3[25]);
-    step1[26] = _mm_sub_epi16(step2[27], step3[26]);
-    step1[27] = _mm_add_epi16(step3[26], step2[27]);
-    step1[28] = _mm_add_epi16(step3[29], step2[28]);
-    step1[29] = _mm_sub_epi16(step2[28], step3[29]);
-    step1[30] = _mm_sub_epi16(step2[31], step3[30]);
-    step1[31] = _mm_add_epi16(step3[30], step2[31]);
-  }
-  // Final stage --- outputs indices are bit-reversed.
-  {
-    const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
-    const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
-    const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
-    const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
-    const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
-    const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
-    const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
-    const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
-    const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
-    const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
-    const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
-    const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
-    const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
-    const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
-    const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
-    const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
-    const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
-    const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
-    const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
-    const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
-    const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
-    const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
-    const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
-    const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
-    // dct_const_round_shift
-    const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
-    const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
-    const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
-    const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
-    const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
-    const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
-    const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
-    const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
-    const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
-    const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
-    const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
-    const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
-    const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
-    const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
-    const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
-    const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
-    // Combine
-    out[1] = _mm_packs_epi32(out_01_6, out_01_7);
-    out[17] = _mm_packs_epi32(out_17_6, out_17_7);
-    out[9] = _mm_packs_epi32(out_09_6, out_09_7);
-    out[25] = _mm_packs_epi32(out_25_6, out_25_7);
-    out[7] = _mm_packs_epi32(out_07_6, out_07_7);
-    out[23] = _mm_packs_epi32(out_23_6, out_23_7);
-    out[15] = _mm_packs_epi32(out_15_6, out_15_7);
-    out[31] = _mm_packs_epi32(out_31_6, out_31_7);
-  }
-  {
-    const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
-    const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
-    const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
-    const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
-    const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
-    const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
-    const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
-    const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
-    const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
-    const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
-    const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
-    const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
-    const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
-    const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
-    const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
-    const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
-    const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
-    const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
-    const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
-    const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
-    const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
-    const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
-    const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
-    const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
-    // dct_const_round_shift
-    const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
-    const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
-    const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
-    const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
-    const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
-    const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
-    const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
-    const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
-    const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
-    const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
-    const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
-    const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
-    const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
-    const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
-    const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
-    const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
-    // Combine
-    out[5] = _mm_packs_epi32(out_05_6, out_05_7);
-    out[21] = _mm_packs_epi32(out_21_6, out_21_7);
-    out[13] = _mm_packs_epi32(out_13_6, out_13_7);
-    out[29] = _mm_packs_epi32(out_29_6, out_29_7);
-    out[3] = _mm_packs_epi32(out_03_6, out_03_7);
-    out[19] = _mm_packs_epi32(out_19_6, out_19_7);
-    out[11] = _mm_packs_epi32(out_11_6, out_11_7);
-    out[27] = _mm_packs_epi32(out_27_6, out_27_7);
-  }
-
-  // Output results
-  {
-    int j;
-    for (j = 0; j < 16; ++j) {
-      _mm_storeu_si128((__m128i *)(in0 + j), out[j]);
-      _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]);
-    }
-  }
-}  // NOLINT
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
deleted file mode 100644
index 216739581..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ /dev/null
@@ -1,3022 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>  // AVX2
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_intrin.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-#if FDCT32x32_HIGH_PRECISION
-static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
-  __m256i buf0, buf1;
-  buf0 = _mm256_mul_epu32(a, b);
-  a = _mm256_srli_epi64(a, 32);
-  b = _mm256_srli_epi64(b, 32);
-  buf1 = _mm256_mul_epu32(a, b);
-  return _mm256_add_epi64(buf0, buf1);
-}
-
-static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
-  __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
-  __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
-  return _mm256_unpacklo_epi64(buf0, buf1);
-}
-#endif
-
-#ifndef STORE_COEFF_FUNC
-#define STORE_COEFF_FUNC
-static void store_coeff(const __m256i *coeff, tran_low_t *curr,
-                        tran_low_t *next) {
-  __m128i u = _mm256_castsi256_si128(*coeff);
-  storeu_output(&u, curr);
-  u = _mm256_extractf128_si256(*coeff, 1);
-  storeu_output(&u, next);
-}
-#endif
-
-void FDCT32x32_2D_AVX2(const int16_t *input, tran_low_t *output_org,
-                       int stride) {
-  // Calculate pre-multiplied strides
-  const int str1 = stride;
-  const int str2 = 2 * stride;
-  const int str3 = 2 * stride + str1;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i k__cospi_p16_m16 =
-      pair256_set_epi16(+cospi_16_64, -cospi_16_64);
-  const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
-  const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
-  const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
-  const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m256i k__cospi_m12_m20 =
-      pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-  const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64);
-  const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64);
-  const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64);
-  const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64);
-  const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64);
-  const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64);
-  const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64);
-  const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64);
-  const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64);
-  const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64);
-  const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64);
-  const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64);
-  const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
-  const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  const __m256i kZero = _mm256_set1_epi16(0);
-  const __m256i kOne = _mm256_set1_epi16(1);
-  // Do the two transform/transpose passes
-  int pass;
-  for (pass = 0; pass < 2; ++pass) {
-    // We process sixteen columns (transposed rows in second pass) at a time.
-    int column_start;
-    for (column_start = 0; column_start < 32; column_start += 16) {
-      __m256i step1[32];
-      __m256i step2[32];
-      __m256i step3[32];
-      __m256i out[32];
-      // Stage 1
-      // Note: even though all the loads below are aligned, using the aligned
-      //       intrinsic make the code slightly slower.
-      if (0 == pass) {
-        const int16_t *in = &input[column_start];
-        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          const int16_t *ina = in + 0 * str1;
-          const int16_t *inb = in + 31 * str1;
-          __m256i *step1a = &step1[0];
-          __m256i *step1b = &step1[31];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 4 * str1;
-          const int16_t *inb = in + 27 * str1;
-          __m256i *step1a = &step1[4];
-          __m256i *step1b = &step1[27];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 8 * str1;
-          const int16_t *inb = in + 23 * str1;
-          __m256i *step1a = &step1[8];
-          __m256i *step1b = &step1[23];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 12 * str1;
-          const int16_t *inb = in + 19 * str1;
-          __m256i *step1a = &step1[12];
-          __m256i *step1b = &step1[19];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-      } else {
-        int16_t *in = &intermediate[column_start];
-        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
-        // Note: using the same approach as above to have common offset is
-        //       counter-productive as all offsets can be calculated at compile
-        //       time.
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32));
-          __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32));
-          __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32));
-          __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32));
-          __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
-          __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
-          __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
-          __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
-          step1[0] = _mm256_add_epi16(in00, in31);
-          step1[1] = _mm256_add_epi16(in01, in30);
-          step1[2] = _mm256_add_epi16(in02, in29);
-          step1[3] = _mm256_add_epi16(in03, in28);
-          step1[28] = _mm256_sub_epi16(in03, in28);
-          step1[29] = _mm256_sub_epi16(in02, in29);
-          step1[30] = _mm256_sub_epi16(in01, in30);
-          step1[31] = _mm256_sub_epi16(in00, in31);
-        }
-        {
-          __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32));
-          __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32));
-          __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32));
-          __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32));
-          __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
-          __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
-          __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
-          __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
-          step1[4] = _mm256_add_epi16(in04, in27);
-          step1[5] = _mm256_add_epi16(in05, in26);
-          step1[6] = _mm256_add_epi16(in06, in25);
-          step1[7] = _mm256_add_epi16(in07, in24);
-          step1[24] = _mm256_sub_epi16(in07, in24);
-          step1[25] = _mm256_sub_epi16(in06, in25);
-          step1[26] = _mm256_sub_epi16(in05, in26);
-          step1[27] = _mm256_sub_epi16(in04, in27);
-        }
-        {
-          __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32));
-          __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32));
-          __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
-          __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
-          __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
-          __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
-          __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
-          __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
-          step1[8] = _mm256_add_epi16(in08, in23);
-          step1[9] = _mm256_add_epi16(in09, in22);
-          step1[10] = _mm256_add_epi16(in10, in21);
-          step1[11] = _mm256_add_epi16(in11, in20);
-          step1[20] = _mm256_sub_epi16(in11, in20);
-          step1[21] = _mm256_sub_epi16(in10, in21);
-          step1[22] = _mm256_sub_epi16(in09, in22);
-          step1[23] = _mm256_sub_epi16(in08, in23);
-        }
-        {
-          __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
-          __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
-          __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
-          __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
-          __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
-          __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
-          __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
-          __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
-          step1[12] = _mm256_add_epi16(in12, in19);
-          step1[13] = _mm256_add_epi16(in13, in18);
-          step1[14] = _mm256_add_epi16(in14, in17);
-          step1[15] = _mm256_add_epi16(in15, in16);
-          step1[16] = _mm256_sub_epi16(in15, in16);
-          step1[17] = _mm256_sub_epi16(in14, in17);
-          step1[18] = _mm256_sub_epi16(in13, in18);
-          step1[19] = _mm256_sub_epi16(in12, in19);
-        }
-      }
-      // Stage 2
-      {
-        step2[0] = _mm256_add_epi16(step1[0], step1[15]);
-        step2[1] = _mm256_add_epi16(step1[1], step1[14]);
-        step2[2] = _mm256_add_epi16(step1[2], step1[13]);
-        step2[3] = _mm256_add_epi16(step1[3], step1[12]);
-        step2[4] = _mm256_add_epi16(step1[4], step1[11]);
-        step2[5] = _mm256_add_epi16(step1[5], step1[10]);
-        step2[6] = _mm256_add_epi16(step1[6], step1[9]);
-        step2[7] = _mm256_add_epi16(step1[7], step1[8]);
-        step2[8] = _mm256_sub_epi16(step1[7], step1[8]);
-        step2[9] = _mm256_sub_epi16(step1[6], step1[9]);
-        step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
-        step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
-        step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
-        step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
-        step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
-        step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
-      }
-      {
-        const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
-        const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
-        const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
-        const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
-        const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
-        const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
-        const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
-        const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
-        const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
-        const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
-        const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
-        const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
-        const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
-        const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
-        const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
-        const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
-        const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
-        const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
-        const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
-        const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
-        const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
-        const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
-        const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
-        const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
-        // dct_const_round_shift
-        const __m256i s2_20_4 =
-            _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_20_5 =
-            _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_21_4 =
-            _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_21_5 =
-            _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_22_4 =
-            _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_22_5 =
-            _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_23_4 =
-            _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_23_5 =
-            _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_24_4 =
-            _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_24_5 =
-            _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_25_4 =
-            _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_25_5 =
-            _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_26_4 =
-            _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_26_5 =
-            _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_27_4 =
-            _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_27_5 =
-            _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
-        const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
-        const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
-        const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
-        const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
-        const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
-        const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
-        const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
-        const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
-        const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
-        const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
-        const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
-        const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
-        const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
-        const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
-        const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
-        // Combine
-        step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
-        step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
-        step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
-        step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
-        step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
-        step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
-        step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
-        step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
-      }
-
-#if !FDCT32x32_HIGH_PRECISION
-      // dump the magnitude by half, hence the intermediate values are within
-      // the range of 16 bits.
-      if (1 == pass) {
-        __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]);
-        __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]);
-        __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]);
-        __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]);
-        __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]);
-        __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]);
-        __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]);
-        __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]);
-        __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]);
-        __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]);
-        __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]);
-        __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]);
-        __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]);
-        __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]);
-        __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]);
-        __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]);
-        __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]);
-        __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]);
-        __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]);
-        __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]);
-        __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]);
-        __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]);
-        __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]);
-        __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]);
-        __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]);
-        __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]);
-        __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]);
-        __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]);
-        __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]);
-        __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]);
-        __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]);
-        __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]);
-
-        step2[0] = _mm256_sub_epi16(step2[0], s3_00_0);
-        step2[1] = _mm256_sub_epi16(step2[1], s3_01_0);
-        step2[2] = _mm256_sub_epi16(step2[2], s3_02_0);
-        step2[3] = _mm256_sub_epi16(step2[3], s3_03_0);
-        step2[4] = _mm256_sub_epi16(step2[4], s3_04_0);
-        step2[5] = _mm256_sub_epi16(step2[5], s3_05_0);
-        step2[6] = _mm256_sub_epi16(step2[6], s3_06_0);
-        step2[7] = _mm256_sub_epi16(step2[7], s3_07_0);
-        step2[8] = _mm256_sub_epi16(step2[8], s2_08_0);
-        step2[9] = _mm256_sub_epi16(step2[9], s2_09_0);
-        step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
-        step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
-        step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
-        step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
-        step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
-        step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
-        step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
-        step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
-        step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
-        step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
-        step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
-        step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
-        step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
-        step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
-        step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
-        step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
-        step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
-        step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
-        step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
-        step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
-        step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
-        step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
-
-        step2[0] = _mm256_add_epi16(step2[0], kOne);
-        step2[1] = _mm256_add_epi16(step2[1], kOne);
-        step2[2] = _mm256_add_epi16(step2[2], kOne);
-        step2[3] = _mm256_add_epi16(step2[3], kOne);
-        step2[4] = _mm256_add_epi16(step2[4], kOne);
-        step2[5] = _mm256_add_epi16(step2[5], kOne);
-        step2[6] = _mm256_add_epi16(step2[6], kOne);
-        step2[7] = _mm256_add_epi16(step2[7], kOne);
-        step2[8] = _mm256_add_epi16(step2[8], kOne);
-        step2[9] = _mm256_add_epi16(step2[9], kOne);
-        step2[10] = _mm256_add_epi16(step2[10], kOne);
-        step2[11] = _mm256_add_epi16(step2[11], kOne);
-        step2[12] = _mm256_add_epi16(step2[12], kOne);
-        step2[13] = _mm256_add_epi16(step2[13], kOne);
-        step2[14] = _mm256_add_epi16(step2[14], kOne);
-        step2[15] = _mm256_add_epi16(step2[15], kOne);
-        step1[16] = _mm256_add_epi16(step1[16], kOne);
-        step1[17] = _mm256_add_epi16(step1[17], kOne);
-        step1[18] = _mm256_add_epi16(step1[18], kOne);
-        step1[19] = _mm256_add_epi16(step1[19], kOne);
-        step2[20] = _mm256_add_epi16(step2[20], kOne);
-        step2[21] = _mm256_add_epi16(step2[21], kOne);
-        step2[22] = _mm256_add_epi16(step2[22], kOne);
-        step2[23] = _mm256_add_epi16(step2[23], kOne);
-        step2[24] = _mm256_add_epi16(step2[24], kOne);
-        step2[25] = _mm256_add_epi16(step2[25], kOne);
-        step2[26] = _mm256_add_epi16(step2[26], kOne);
-        step2[27] = _mm256_add_epi16(step2[27], kOne);
-        step1[28] = _mm256_add_epi16(step1[28], kOne);
-        step1[29] = _mm256_add_epi16(step1[29], kOne);
-        step1[30] = _mm256_add_epi16(step1[30], kOne);
-        step1[31] = _mm256_add_epi16(step1[31], kOne);
-
-        step2[0] = _mm256_srai_epi16(step2[0], 2);
-        step2[1] = _mm256_srai_epi16(step2[1], 2);
-        step2[2] = _mm256_srai_epi16(step2[2], 2);
-        step2[3] = _mm256_srai_epi16(step2[3], 2);
-        step2[4] = _mm256_srai_epi16(step2[4], 2);
-        step2[5] = _mm256_srai_epi16(step2[5], 2);
-        step2[6] = _mm256_srai_epi16(step2[6], 2);
-        step2[7] = _mm256_srai_epi16(step2[7], 2);
-        step2[8] = _mm256_srai_epi16(step2[8], 2);
-        step2[9] = _mm256_srai_epi16(step2[9], 2);
-        step2[10] = _mm256_srai_epi16(step2[10], 2);
-        step2[11] = _mm256_srai_epi16(step2[11], 2);
-        step2[12] = _mm256_srai_epi16(step2[12], 2);
-        step2[13] = _mm256_srai_epi16(step2[13], 2);
-        step2[14] = _mm256_srai_epi16(step2[14], 2);
-        step2[15] = _mm256_srai_epi16(step2[15], 2);
-        step1[16] = _mm256_srai_epi16(step1[16], 2);
-        step1[17] = _mm256_srai_epi16(step1[17], 2);
-        step1[18] = _mm256_srai_epi16(step1[18], 2);
-        step1[19] = _mm256_srai_epi16(step1[19], 2);
-        step2[20] = _mm256_srai_epi16(step2[20], 2);
-        step2[21] = _mm256_srai_epi16(step2[21], 2);
-        step2[22] = _mm256_srai_epi16(step2[22], 2);
-        step2[23] = _mm256_srai_epi16(step2[23], 2);
-        step2[24] = _mm256_srai_epi16(step2[24], 2);
-        step2[25] = _mm256_srai_epi16(step2[25], 2);
-        step2[26] = _mm256_srai_epi16(step2[26], 2);
-        step2[27] = _mm256_srai_epi16(step2[27], 2);
-        step1[28] = _mm256_srai_epi16(step1[28], 2);
-        step1[29] = _mm256_srai_epi16(step1[29], 2);
-        step1[30] = _mm256_srai_epi16(step1[30], 2);
-        step1[31] = _mm256_srai_epi16(step1[31], 2);
-      }
-#endif
-
-#if FDCT32x32_HIGH_PRECISION
-      if (pass == 0) {
-#endif
-        // Stage 3
-        {
-          step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
-          step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
-          step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
-          step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
-          step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
-          step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
-          step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
-          step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
-        }
-        {
-          const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
-          const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
-          const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
-          const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
-          const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m256i s3_10_4 =
-              _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_10_5 =
-              _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_4 =
-              _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_5 =
-              _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_4 =
-              _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_5 =
-              _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_4 =
-              _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_5 =
-              _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
-          // Combine
-          step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
-          step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
-          step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
-          step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
-        }
-        {
-          step3[16] = _mm256_add_epi16(step2[23], step1[16]);
-          step3[17] = _mm256_add_epi16(step2[22], step1[17]);
-          step3[18] = _mm256_add_epi16(step2[21], step1[18]);
-          step3[19] = _mm256_add_epi16(step2[20], step1[19]);
-          step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
-          step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
-          step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
-          step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
-          step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
-          step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
-          step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
-          step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
-          step3[28] = _mm256_add_epi16(step2[27], step1[28]);
-          step3[29] = _mm256_add_epi16(step2[26], step1[29]);
-          step3[30] = _mm256_add_epi16(step2[25], step1[30]);
-          step3[31] = _mm256_add_epi16(step2[24], step1[31]);
-        }
-
-        // Stage 4
-        {
-          step1[0] = _mm256_add_epi16(step3[3], step3[0]);
-          step1[1] = _mm256_add_epi16(step3[2], step3[1]);
-          step1[2] = _mm256_sub_epi16(step3[1], step3[2]);
-          step1[3] = _mm256_sub_epi16(step3[0], step3[3]);
-          step1[8] = _mm256_add_epi16(step3[11], step2[8]);
-          step1[9] = _mm256_add_epi16(step3[10], step2[9]);
-          step1[10] = _mm256_sub_epi16(step2[9], step3[10]);
-          step1[11] = _mm256_sub_epi16(step2[8], step3[11]);
-          step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
-          step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
-          step1[14] = _mm256_add_epi16(step3[13], step2[14]);
-          step1[15] = _mm256_add_epi16(step3[12], step2[15]);
-        }
-        {
-          const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
-          const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
-          const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
-          const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
-          const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
-          const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m256i s1_05_4 =
-              _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_05_5 =
-              _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_06_4 =
-              _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_06_5 =
-              _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
-          const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
-          const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
-          const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
-          // Combine
-          step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
-          step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
-        }
-        {
-          const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
-          const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
-          const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
-          const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
-          const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
-          const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
-          const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
-          const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
-          const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
-          const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
-          const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
-          const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
-          const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
-          const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
-          const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
-          const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
-          const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
-          const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
-          const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
-          const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
-          const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
-          const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
-          const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
-          const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m256i s1_18_4 =
-              _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_18_5 =
-              _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_19_4 =
-              _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_19_5 =
-              _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_20_4 =
-              _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_20_5 =
-              _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_21_4 =
-              _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_21_5 =
-              _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_26_4 =
-              _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_26_5 =
-              _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_27_4 =
-              _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_27_5 =
-              _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_28_4 =
-              _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_28_5 =
-              _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_29_4 =
-              _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_29_5 =
-              _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
-          const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
-          const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
-          const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
-          const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
-          const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
-          const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
-          const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
-          const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
-          const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
-          const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
-          const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
-          const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
-          const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
-          const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
-          const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
-          // Combine
-          step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
-          step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
-          step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
-          step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
-          step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
-          step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
-          step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
-          step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
-        }
-        // Stage 5
-        {
-          step2[4] = _mm256_add_epi16(step1[5], step3[4]);
-          step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
-          step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
-          step2[7] = _mm256_add_epi16(step1[6], step3[7]);
-        }
-        {
-          const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
-          const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
-          const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
-          const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
-          const __m256i out_00_2 =
-              _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
-          const __m256i out_00_3 =
-              _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
-          const __m256i out_16_2 =
-              _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
-          const __m256i out_16_3 =
-              _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
-          const __m256i out_08_2 =
-              _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
-          const __m256i out_08_3 =
-              _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
-          const __m256i out_24_2 =
-              _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
-          const __m256i out_24_3 =
-              _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m256i out_00_4 =
-              _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_00_5 =
-              _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_16_4 =
-              _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_16_5 =
-              _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_08_4 =
-              _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_08_5 =
-              _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_24_4 =
-              _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_24_5 =
-              _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
-          const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
-          const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
-          const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
-          const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
-          const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
-          const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
-          const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
-          // Combine
-          out[0] = _mm256_packs_epi32(out_00_6, out_00_7);
-          out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
-          out[8] = _mm256_packs_epi32(out_08_6, out_08_7);
-          out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
-        }
-        {
-          const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]);
-          const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]);
-          const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
-          const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
-          const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
-          const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
-          const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
-          const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
-          const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
-          const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
-          const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
-          const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m256i s2_09_4 =
-              _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_09_5 =
-              _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_10_4 =
-              _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_10_5 =
-              _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_13_4 =
-              _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_13_5 =
-              _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_14_4 =
-              _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_14_5 =
-              _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
-          const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
-          const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
-          const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
-          const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
-          const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
-          const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
-          const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
-          // Combine
-          step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
-          step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
-          step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
-          step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
-        }
-        {
-          step2[16] = _mm256_add_epi16(step1[19], step3[16]);
-          step2[17] = _mm256_add_epi16(step1[18], step3[17]);
-          step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
-          step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
-          step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
-          step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
-          step2[22] = _mm256_add_epi16(step1[21], step3[22]);
-          step2[23] = _mm256_add_epi16(step1[20], step3[23]);
-          step2[24] = _mm256_add_epi16(step1[27], step3[24]);
-          step2[25] = _mm256_add_epi16(step1[26], step3[25]);
-          step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
-          step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
-          step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
-          step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
-          step2[30] = _mm256_add_epi16(step1[29], step3[30]);
-          step2[31] = _mm256_add_epi16(step1[28], step3[31]);
-        }
-        // Stage 6
-        {
-          const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
-          const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
-          const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
-          const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
-          const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
-          const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
-          const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
-          const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
-          const __m256i out_04_2 =
-              _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
-          const __m256i out_04_3 =
-              _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
-          const __m256i out_20_2 =
-              _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
-          const __m256i out_20_3 =
-              _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
-          const __m256i out_12_2 =
-              _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
-          const __m256i out_12_3 =
-              _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
-          const __m256i out_28_2 =
-              _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
-          const __m256i out_28_3 =
-              _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
-          // dct_const_round_shift
-          const __m256i out_04_4 =
-              _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_04_5 =
-              _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_20_4 =
-              _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_20_5 =
-              _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_12_4 =
-              _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_12_5 =
-              _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_28_4 =
-              _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_28_5 =
-              _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
-          const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
-          const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
-          const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
-          const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
-          const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
-          const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
-          const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
-          // Combine
-          out[4] = _mm256_packs_epi32(out_04_6, out_04_7);
-          out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
-          out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
-          out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
-        }
-        {
-          step3[8] = _mm256_add_epi16(step2[9], step1[8]);
-          step3[9] = _mm256_sub_epi16(step1[8], step2[9]);
-          step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
-          step3[11] = _mm256_add_epi16(step2[10], step1[11]);
-          step3[12] = _mm256_add_epi16(step2[13], step1[12]);
-          step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
-          step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
-          step3[15] = _mm256_add_epi16(step2[14], step1[15]);
-        }
-        {
-          const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
-          const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
-          const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
-          const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
-          const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
-          const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
-          const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
-          const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
-          const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
-          const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
-          const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
-          const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
-          const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
-          const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
-          const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
-          const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
-          const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
-          const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
-          const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
-          const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
-          const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
-          const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
-          const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
-          const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
-          // dct_const_round_shift
-          const __m256i s3_17_4 =
-              _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_17_5 =
-              _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_18_4 =
-              _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_18_5 =
-              _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_21_4 =
-              _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_21_5 =
-              _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_22_4 =
-              _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_22_5 =
-              _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
-          const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
-          const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
-          const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
-          const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
-          const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
-          const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
-          const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
-          const __m256i s3_25_4 =
-              _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_25_5 =
-              _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_26_4 =
-              _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_26_5 =
-              _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_29_4 =
-              _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_29_5 =
-              _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_30_4 =
-              _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_30_5 =
-              _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
-          const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
-          const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
-          const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
-          const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
-          const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
-          const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
-          const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
-          // Combine
-          step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
-          step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
-          step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
-          step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
-          // Combine
-          step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
-          step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
-          step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
-          step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
-        }
-        // Stage 7
-        {
-          const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]);
-          const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]);
-          const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]);
-          const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]);
-          const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
-          const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
-          const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
-          const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
-          const __m256i out_02_2 =
-              _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
-          const __m256i out_02_3 =
-              _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
-          const __m256i out_18_2 =
-              _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
-          const __m256i out_18_3 =
-              _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
-          const __m256i out_10_2 =
-              _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
-          const __m256i out_10_3 =
-              _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
-          const __m256i out_26_2 =
-              _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
-          const __m256i out_26_3 =
-              _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
-          const __m256i out_06_2 =
-              _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
-          const __m256i out_06_3 =
-              _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
-          const __m256i out_22_2 =
-              _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
-          const __m256i out_22_3 =
-              _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
-          const __m256i out_14_2 =
-              _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
-          const __m256i out_14_3 =
-              _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
-          const __m256i out_30_2 =
-              _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
-          const __m256i out_30_3 =
-              _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
-          // dct_const_round_shift
-          const __m256i out_02_4 =
-              _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_02_5 =
-              _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_18_4 =
-              _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_18_5 =
-              _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_10_4 =
-              _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_10_5 =
-              _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_26_4 =
-              _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_26_5 =
-              _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_06_4 =
-              _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_06_5 =
-              _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_22_4 =
-              _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_22_5 =
-              _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_14_4 =
-              _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_14_5 =
-              _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_30_4 =
-              _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_30_5 =
-              _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
-          const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
-          const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
-          const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
-          const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
-          const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
-          const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
-          const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
-          const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
-          const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
-          const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
-          const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
-          const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
-          const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
-          const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
-          const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
-          // Combine
-          out[2] = _mm256_packs_epi32(out_02_6, out_02_7);
-          out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
-          out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
-          out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
-          out[6] = _mm256_packs_epi32(out_06_6, out_06_7);
-          out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
-          out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
-          out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
-        }
-        {
-          step1[16] = _mm256_add_epi16(step3[17], step2[16]);
-          step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
-          step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
-          step1[19] = _mm256_add_epi16(step3[18], step2[19]);
-          step1[20] = _mm256_add_epi16(step3[21], step2[20]);
-          step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
-          step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
-          step1[23] = _mm256_add_epi16(step3[22], step2[23]);
-          step1[24] = _mm256_add_epi16(step3[25], step2[24]);
-          step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
-          step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
-          step1[27] = _mm256_add_epi16(step3[26], step2[27]);
-          step1[28] = _mm256_add_epi16(step3[29], step2[28]);
-          step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
-          step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
-          step1[31] = _mm256_add_epi16(step3[30], step2[31]);
-        }
-        // Final stage --- outputs indices are bit-reversed.
-        {
-          const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
-          const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
-          const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
-          const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
-          const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
-          const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
-          const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
-          const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
-          const __m256i out_01_2 =
-              _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
-          const __m256i out_01_3 =
-              _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
-          const __m256i out_17_2 =
-              _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
-          const __m256i out_17_3 =
-              _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
-          const __m256i out_09_2 =
-              _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
-          const __m256i out_09_3 =
-              _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
-          const __m256i out_25_2 =
-              _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
-          const __m256i out_25_3 =
-              _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
-          const __m256i out_07_2 =
-              _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
-          const __m256i out_07_3 =
-              _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
-          const __m256i out_23_2 =
-              _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
-          const __m256i out_23_3 =
-              _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
-          const __m256i out_15_2 =
-              _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
-          const __m256i out_15_3 =
-              _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
-          const __m256i out_31_2 =
-              _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
-          const __m256i out_31_3 =
-              _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
-          // dct_const_round_shift
-          const __m256i out_01_4 =
-              _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_01_5 =
-              _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_17_4 =
-              _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_17_5 =
-              _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_09_4 =
-              _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_09_5 =
-              _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_25_4 =
-              _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_25_5 =
-              _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_07_4 =
-              _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_07_5 =
-              _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_23_4 =
-              _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_23_5 =
-              _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_15_4 =
-              _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_15_5 =
-              _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_31_4 =
-              _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_31_5 =
-              _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
-          const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
-          const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
-          const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
-          const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
-          const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
-          const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
-          const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
-          const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
-          const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
-          const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
-          const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
-          const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
-          const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
-          const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
-          const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
-          // Combine
-          out[1] = _mm256_packs_epi32(out_01_6, out_01_7);
-          out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
-          out[9] = _mm256_packs_epi32(out_09_6, out_09_7);
-          out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
-          out[7] = _mm256_packs_epi32(out_07_6, out_07_7);
-          out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
-          out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
-          out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
-        }
-        {
-          const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
-          const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
-          const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
-          const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
-          const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
-          const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
-          const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
-          const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
-          const __m256i out_05_2 =
-              _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
-          const __m256i out_05_3 =
-              _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
-          const __m256i out_21_2 =
-              _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
-          const __m256i out_21_3 =
-              _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
-          const __m256i out_13_2 =
-              _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
-          const __m256i out_13_3 =
-              _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
-          const __m256i out_29_2 =
-              _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
-          const __m256i out_29_3 =
-              _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
-          const __m256i out_03_2 =
-              _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
-          const __m256i out_03_3 =
-              _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
-          const __m256i out_19_2 =
-              _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
-          const __m256i out_19_3 =
-              _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
-          const __m256i out_11_2 =
-              _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
-          const __m256i out_11_3 =
-              _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
-          const __m256i out_27_2 =
-              _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
-          const __m256i out_27_3 =
-              _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
-          // dct_const_round_shift
-          const __m256i out_05_4 =
-              _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_05_5 =
-              _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_21_4 =
-              _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_21_5 =
-              _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_13_4 =
-              _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_13_5 =
-              _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_29_4 =
-              _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_29_5 =
-              _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_03_4 =
-              _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_03_5 =
-              _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_19_4 =
-              _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_19_5 =
-              _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_11_4 =
-              _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_11_5 =
-              _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_27_4 =
-              _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_27_5 =
-              _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
-          const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
-          const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
-          const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
-          const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
-          const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
-          const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
-          const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
-          const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
-          const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
-          const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
-          const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
-          const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
-          const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
-          const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
-          const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
-          // Combine
-          out[5] = _mm256_packs_epi32(out_05_6, out_05_7);
-          out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
-          out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
-          out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
-          out[3] = _mm256_packs_epi32(out_03_6, out_03_7);
-          out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
-          out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
-          out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
-        }
-#if FDCT32x32_HIGH_PRECISION
-      } else {
-        __m256i lstep1[64], lstep2[64], lstep3[64];
-        __m256i u[32], v[32], sign[16];
-        const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
-        // start using 32-bit operations
-        // stage 3
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero);
-          lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero);
-          lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero);
-          lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero);
-          lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero);
-          lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero);
-          lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero);
-          lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero);
-          lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero);
-          lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero);
-          lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero);
-          lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero);
-          lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero);
-          lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero);
-          lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero);
-          lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero);
-          lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne);
-          lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne);
-          lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne);
-          lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne);
-          lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne);
-          lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne);
-          lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne);
-          lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne);
-          lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne);
-          lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne);
-          lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
-          lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
-          lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
-          lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
-          lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
-          lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
-
-          lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]);
-          lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]);
-          lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]);
-          lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]);
-          lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]);
-          lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]);
-          lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]);
-          lstep3[7] = _mm256_add_epi32(lstep2[9], lstep2[7]);
-          lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]);
-          lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]);
-          lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]);
-          lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]);
-          lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]);
-          lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]);
-          lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]);
-          lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]);
-        }
-        {
-          const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
-          const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
-          const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
-          const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
-          const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m256i s3_10_4 =
-              _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_10_5 =
-              _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_4 =
-              _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_5 =
-              _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_4 =
-              _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_5 =
-              _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_4 =
-              _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_5 =
-              _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
-        }
-        {
-          lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero);
-          lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero);
-          lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero);
-          lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero);
-          lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero);
-          lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero);
-          lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero);
-          lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero);
-          lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
-          lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
-          lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
-          lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
-          lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
-          lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
-          lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
-          lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
-          lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
-          lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
-          lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
-          lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
-          lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
-          lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
-          lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
-          lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
-          lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
-          lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
-          lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
-          lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
-          lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
-          lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
-          lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
-          lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
-
-          lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero);
-          lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero);
-          lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero);
-          lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero);
-          lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero);
-          lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero);
-          lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero);
-          lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero);
-          lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
-          lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
-          lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
-          lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
-          lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
-          lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
-          lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
-          lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
-          lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
-          lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
-          lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
-          lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
-          lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
-          lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
-          lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
-          lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
-          lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
-          lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
-          lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
-          lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
-          lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
-          lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
-          lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
-          lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
-
-          lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]);
-          lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]);
-
-          lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]);
-          lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]);
-          lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]);
-          lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]);
-          lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]);
-          lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]);
-          lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]);
-          lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]);
-          lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
-          lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]);
-          lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]);
-          lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]);
-          lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]);
-          lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]);
-          lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]);
-          lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]);
-          lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]);
-          lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
-          lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]);
-          lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]);
-          lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]);
-          lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]);
-          lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]);
-          lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]);
-          lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]);
-          lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]);
-          lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
-          lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
-          lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
-          lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
-        }
-
-        // stage 4
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero);
-          lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero);
-          lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero);
-          lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero);
-          lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
-          lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
-          lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
-          lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero);
-          lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne);
-          lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne);
-          lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne);
-          lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne);
-          lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
-          lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
-          lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
-          lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
-
-          lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]);
-          lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]);
-          lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]);
-          lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]);
-          lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]);
-          lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]);
-          lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]);
-          lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]);
-          lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
-          lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
-          lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
-          lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
-          lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
-          lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
-          lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
-          lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
-          lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
-          lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
-          lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
-          lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
-          lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
-          lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
-          lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
-          lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
-        }
-        {
-          // to be continued...
-          //
-          const __m256i k32_p16_p16 =
-              pair256_set_epi32(cospi_16_64, cospi_16_64);
-          const __m256i k32_p16_m16 =
-              pair256_set_epi32(cospi_16_64, -cospi_16_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
-          u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
-          u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
-          u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
-
-          // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
-          v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
-          v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
-          v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
-          v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
-          lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-        }
-        {
-          const __m256i k32_m08_p24 =
-              pair256_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m256i k32_m24_m08 =
-              pair256_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m256i k32_p24_p08 =
-              pair256_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
-          u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
-          u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
-          u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
-          u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
-          u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
-          u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
-          u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
-          u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
-          u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
-          u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
-          u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
-          u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
-          u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
-          u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
-          u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24);
-          v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24);
-          v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24);
-          v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24);
-          v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08);
-          v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08);
-          v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
-          v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
-          v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
-          v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
-          v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
-          v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
-          v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08);
-          v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08);
-          v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08);
-          v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08);
-          v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08);
-          v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08);
-          v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08);
-          v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 5
-        {
-          lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]);
-          lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]);
-          lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]);
-          lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]);
-          lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
-          lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
-          lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
-          lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
-        }
-        {
-          const __m256i k32_p16_p16 =
-              pair256_set_epi32(cospi_16_64, cospi_16_64);
-          const __m256i k32_p16_m16 =
-              pair256_set_epi32(cospi_16_64, -cospi_16_64);
-          const __m256i k32_p24_p08 =
-              pair256_set_epi32(cospi_24_64, cospi_8_64);
-          const __m256i k32_m08_p24 =
-              pair256_set_epi32(-cospi_8_64, cospi_24_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
-
-          // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
-          v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
-          v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
-          v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
-          v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
-          v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
-          v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
-          v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
-          v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
-          v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
-          v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
-          v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
-          v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-
-          u[0] = _mm256_sub_epi32(u[0], sign[0]);
-          u[1] = _mm256_sub_epi32(u[1], sign[1]);
-          u[2] = _mm256_sub_epi32(u[2], sign[2]);
-          u[3] = _mm256_sub_epi32(u[3], sign[3]);
-          u[4] = _mm256_sub_epi32(u[4], sign[4]);
-          u[5] = _mm256_sub_epi32(u[5], sign[5]);
-          u[6] = _mm256_sub_epi32(u[6], sign[6]);
-          u[7] = _mm256_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm256_add_epi32(u[0], K32One);
-          u[1] = _mm256_add_epi32(u[1], K32One);
-          u[2] = _mm256_add_epi32(u[2], K32One);
-          u[3] = _mm256_add_epi32(u[3], K32One);
-          u[4] = _mm256_add_epi32(u[4], K32One);
-          u[5] = _mm256_add_epi32(u[5], K32One);
-          u[6] = _mm256_add_epi32(u[6], K32One);
-          u[7] = _mm256_add_epi32(u[7], K32One);
-
-          u[0] = _mm256_srai_epi32(u[0], 2);
-          u[1] = _mm256_srai_epi32(u[1], 2);
-          u[2] = _mm256_srai_epi32(u[2], 2);
-          u[3] = _mm256_srai_epi32(u[3], 2);
-          u[4] = _mm256_srai_epi32(u[4], 2);
-          u[5] = _mm256_srai_epi32(u[5], 2);
-          u[6] = _mm256_srai_epi32(u[6], 2);
-          u[7] = _mm256_srai_epi32(u[7], 2);
-
-          // Combine
-          out[0] = _mm256_packs_epi32(u[0], u[1]);
-          out[16] = _mm256_packs_epi32(u[2], u[3]);
-          out[8] = _mm256_packs_epi32(u[4], u[5]);
-          out[24] = _mm256_packs_epi32(u[6], u[7]);
-        }
-        {
-          const __m256i k32_m08_p24 =
-              pair256_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m256i k32_m24_m08 =
-              pair256_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m256i k32_p24_p08 =
-              pair256_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
-          v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
-          v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
-          v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
-          v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
-          v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
-          v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
-          v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
-          v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
-          v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
-          v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
-          v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-
-          u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
-          lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
-          lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
-          lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
-          lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
-          lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
-          lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
-          lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
-        }
-        {
-          lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
-          lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
-          lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
-          lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
-          lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
-          lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
-          lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
-          lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
-          lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
-          lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
-          lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
-          lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
-          lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
-          lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
-          lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
-          lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
-          lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
-          lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
-          lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
-          lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
-          lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
-          lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
-          lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
-          lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
-          lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
-          lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
-          lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
-          lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
-          lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
-          lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
-          lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
-          lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
-        }
-        // stage 6
-        {
-          const __m256i k32_p28_p04 =
-              pair256_set_epi32(cospi_28_64, cospi_4_64);
-          const __m256i k32_p12_p20 =
-              pair256_set_epi32(cospi_12_64, cospi_20_64);
-          const __m256i k32_m20_p12 =
-              pair256_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m256i k32_m04_p28 =
-              pair256_set_epi32(-cospi_4_64, cospi_28_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
-          u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
-          v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
-          v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
-          v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
-          v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-
-          u[0] = _mm256_sub_epi32(u[0], sign[0]);
-          u[1] = _mm256_sub_epi32(u[1], sign[1]);
-          u[2] = _mm256_sub_epi32(u[2], sign[2]);
-          u[3] = _mm256_sub_epi32(u[3], sign[3]);
-          u[4] = _mm256_sub_epi32(u[4], sign[4]);
-          u[5] = _mm256_sub_epi32(u[5], sign[5]);
-          u[6] = _mm256_sub_epi32(u[6], sign[6]);
-          u[7] = _mm256_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm256_add_epi32(u[0], K32One);
-          u[1] = _mm256_add_epi32(u[1], K32One);
-          u[2] = _mm256_add_epi32(u[2], K32One);
-          u[3] = _mm256_add_epi32(u[3], K32One);
-          u[4] = _mm256_add_epi32(u[4], K32One);
-          u[5] = _mm256_add_epi32(u[5], K32One);
-          u[6] = _mm256_add_epi32(u[6], K32One);
-          u[7] = _mm256_add_epi32(u[7], K32One);
-
-          u[0] = _mm256_srai_epi32(u[0], 2);
-          u[1] = _mm256_srai_epi32(u[1], 2);
-          u[2] = _mm256_srai_epi32(u[2], 2);
-          u[3] = _mm256_srai_epi32(u[3], 2);
-          u[4] = _mm256_srai_epi32(u[4], 2);
-          u[5] = _mm256_srai_epi32(u[5], 2);
-          u[6] = _mm256_srai_epi32(u[6], 2);
-          u[7] = _mm256_srai_epi32(u[7], 2);
-
-          out[4] = _mm256_packs_epi32(u[0], u[1]);
-          out[20] = _mm256_packs_epi32(u[2], u[3]);
-          out[12] = _mm256_packs_epi32(u[4], u[5]);
-          out[28] = _mm256_packs_epi32(u[6], u[7]);
-        }
-        {
-          lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
-          lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
-          lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
-          lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
-          lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
-          lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
-          lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
-          lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
-          lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
-          lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
-          lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
-          lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
-          lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
-          lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
-          lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
-          lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
-        }
-        {
-          const __m256i k32_m04_p28 =
-              pair256_set_epi32(-cospi_4_64, cospi_28_64);
-          const __m256i k32_m28_m04 =
-              pair256_set_epi32(-cospi_28_64, -cospi_4_64);
-          const __m256i k32_m20_p12 =
-              pair256_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m256i k32_m12_m20 =
-              pair256_set_epi32(-cospi_12_64, -cospi_20_64);
-          const __m256i k32_p12_p20 =
-              pair256_set_epi32(cospi_12_64, cospi_20_64);
-          const __m256i k32_p28_p04 =
-              pair256_set_epi32(cospi_28_64, cospi_4_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
-          u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
-          u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
-          u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
-          u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
-          u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
-          u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
-          u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
-          u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
-          u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
-          u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
-          u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
-          u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
-          u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
-          u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
-          u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28);
-          v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28);
-          v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28);
-          v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28);
-          v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04);
-          v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04);
-          v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04);
-          v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04);
-          v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
-          v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
-          v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
-          v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
-          v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20);
-          v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20);
-          v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
-          v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28);
-          v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04);
-          v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04);
-          v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04);
-          v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 7
-        {
-          const __m256i k32_p30_p02 =
-              pair256_set_epi32(cospi_30_64, cospi_2_64);
-          const __m256i k32_p14_p18 =
-              pair256_set_epi32(cospi_14_64, cospi_18_64);
-          const __m256i k32_p22_p10 =
-              pair256_set_epi32(cospi_22_64, cospi_10_64);
-          const __m256i k32_p06_p26 =
-              pair256_set_epi32(cospi_6_64, cospi_26_64);
-          const __m256i k32_m26_p06 =
-              pair256_set_epi32(-cospi_26_64, cospi_6_64);
-          const __m256i k32_m10_p22 =
-              pair256_set_epi32(-cospi_10_64, cospi_22_64);
-          const __m256i k32_m18_p14 =
-              pair256_set_epi32(-cospi_18_64, cospi_14_64);
-          const __m256i k32_m02_p30 =
-              pair256_set_epi32(-cospi_2_64, cospi_30_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
-          u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
-          u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
-          u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
-          u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
-          u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
-          u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
-          u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
-          u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
-          u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
-          u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
-          u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
-          u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
-          u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
-          u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
-          u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18);
-          v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10);
-          v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10);
-          v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
-          v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
-          v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
-          v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
-          v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
-          v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14);
-          v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30);
-          v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30);
-          v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30);
-          v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-          v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
-          v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
-          v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
-          v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
-          v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
-          v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
-          v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
-          v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
-
-          u[0] = _mm256_sub_epi32(u[0], v[0]);
-          u[1] = _mm256_sub_epi32(u[1], v[1]);
-          u[2] = _mm256_sub_epi32(u[2], v[2]);
-          u[3] = _mm256_sub_epi32(u[3], v[3]);
-          u[4] = _mm256_sub_epi32(u[4], v[4]);
-          u[5] = _mm256_sub_epi32(u[5], v[5]);
-          u[6] = _mm256_sub_epi32(u[6], v[6]);
-          u[7] = _mm256_sub_epi32(u[7], v[7]);
-          u[8] = _mm256_sub_epi32(u[8], v[8]);
-          u[9] = _mm256_sub_epi32(u[9], v[9]);
-          u[10] = _mm256_sub_epi32(u[10], v[10]);
-          u[11] = _mm256_sub_epi32(u[11], v[11]);
-          u[12] = _mm256_sub_epi32(u[12], v[12]);
-          u[13] = _mm256_sub_epi32(u[13], v[13]);
-          u[14] = _mm256_sub_epi32(u[14], v[14]);
-          u[15] = _mm256_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], K32One);
-          v[1] = _mm256_add_epi32(u[1], K32One);
-          v[2] = _mm256_add_epi32(u[2], K32One);
-          v[3] = _mm256_add_epi32(u[3], K32One);
-          v[4] = _mm256_add_epi32(u[4], K32One);
-          v[5] = _mm256_add_epi32(u[5], K32One);
-          v[6] = _mm256_add_epi32(u[6], K32One);
-          v[7] = _mm256_add_epi32(u[7], K32One);
-          v[8] = _mm256_add_epi32(u[8], K32One);
-          v[9] = _mm256_add_epi32(u[9], K32One);
-          v[10] = _mm256_add_epi32(u[10], K32One);
-          v[11] = _mm256_add_epi32(u[11], K32One);
-          v[12] = _mm256_add_epi32(u[12], K32One);
-          v[13] = _mm256_add_epi32(u[13], K32One);
-          v[14] = _mm256_add_epi32(u[14], K32One);
-          v[15] = _mm256_add_epi32(u[15], K32One);
-
-          u[0] = _mm256_srai_epi32(v[0], 2);
-          u[1] = _mm256_srai_epi32(v[1], 2);
-          u[2] = _mm256_srai_epi32(v[2], 2);
-          u[3] = _mm256_srai_epi32(v[3], 2);
-          u[4] = _mm256_srai_epi32(v[4], 2);
-          u[5] = _mm256_srai_epi32(v[5], 2);
-          u[6] = _mm256_srai_epi32(v[6], 2);
-          u[7] = _mm256_srai_epi32(v[7], 2);
-          u[8] = _mm256_srai_epi32(v[8], 2);
-          u[9] = _mm256_srai_epi32(v[9], 2);
-          u[10] = _mm256_srai_epi32(v[10], 2);
-          u[11] = _mm256_srai_epi32(v[11], 2);
-          u[12] = _mm256_srai_epi32(v[12], 2);
-          u[13] = _mm256_srai_epi32(v[13], 2);
-          u[14] = _mm256_srai_epi32(v[14], 2);
-          u[15] = _mm256_srai_epi32(v[15], 2);
-
-          out[2] = _mm256_packs_epi32(u[0], u[1]);
-          out[18] = _mm256_packs_epi32(u[2], u[3]);
-          out[10] = _mm256_packs_epi32(u[4], u[5]);
-          out[26] = _mm256_packs_epi32(u[6], u[7]);
-          out[6] = _mm256_packs_epi32(u[8], u[9]);
-          out[22] = _mm256_packs_epi32(u[10], u[11]);
-          out[14] = _mm256_packs_epi32(u[12], u[13]);
-          out[30] = _mm256_packs_epi32(u[14], u[15]);
-        }
-        {
-          lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
-          lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
-          lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
-          lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
-          lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
-          lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
-          lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
-          lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
-          lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
-          lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
-          lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
-          lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
-          lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
-          lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
-          lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
-          lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
-          lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
-          lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
-          lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
-          lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
-          lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
-          lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
-          lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
-          lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
-          lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
-          lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
-          lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
-          lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
-          lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
-          lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
-          lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
-          lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
-        }
-        // stage 8
-        {
-          const __m256i k32_p31_p01 =
-              pair256_set_epi32(cospi_31_64, cospi_1_64);
-          const __m256i k32_p15_p17 =
-              pair256_set_epi32(cospi_15_64, cospi_17_64);
-          const __m256i k32_p23_p09 =
-              pair256_set_epi32(cospi_23_64, cospi_9_64);
-          const __m256i k32_p07_p25 =
-              pair256_set_epi32(cospi_7_64, cospi_25_64);
-          const __m256i k32_m25_p07 =
-              pair256_set_epi32(-cospi_25_64, cospi_7_64);
-          const __m256i k32_m09_p23 =
-              pair256_set_epi32(-cospi_9_64, cospi_23_64);
-          const __m256i k32_m17_p15 =
-              pair256_set_epi32(-cospi_17_64, cospi_15_64);
-          const __m256i k32_m01_p31 =
-              pair256_set_epi32(-cospi_1_64, cospi_31_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
-          u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
-          u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
-          u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
-          u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
-          u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
-          u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
-          u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
-          u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17);
-          v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09);
-          v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09);
-          v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
-          v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
-          v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
-          v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
-          v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
-          v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15);
-          v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31);
-          v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31);
-          v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31);
-          v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-          v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
-          v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
-          v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
-          v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
-          v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
-          v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
-          v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
-          v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
-
-          u[0] = _mm256_sub_epi32(u[0], v[0]);
-          u[1] = _mm256_sub_epi32(u[1], v[1]);
-          u[2] = _mm256_sub_epi32(u[2], v[2]);
-          u[3] = _mm256_sub_epi32(u[3], v[3]);
-          u[4] = _mm256_sub_epi32(u[4], v[4]);
-          u[5] = _mm256_sub_epi32(u[5], v[5]);
-          u[6] = _mm256_sub_epi32(u[6], v[6]);
-          u[7] = _mm256_sub_epi32(u[7], v[7]);
-          u[8] = _mm256_sub_epi32(u[8], v[8]);
-          u[9] = _mm256_sub_epi32(u[9], v[9]);
-          u[10] = _mm256_sub_epi32(u[10], v[10]);
-          u[11] = _mm256_sub_epi32(u[11], v[11]);
-          u[12] = _mm256_sub_epi32(u[12], v[12]);
-          u[13] = _mm256_sub_epi32(u[13], v[13]);
-          u[14] = _mm256_sub_epi32(u[14], v[14]);
-          u[15] = _mm256_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], K32One);
-          v[1] = _mm256_add_epi32(u[1], K32One);
-          v[2] = _mm256_add_epi32(u[2], K32One);
-          v[3] = _mm256_add_epi32(u[3], K32One);
-          v[4] = _mm256_add_epi32(u[4], K32One);
-          v[5] = _mm256_add_epi32(u[5], K32One);
-          v[6] = _mm256_add_epi32(u[6], K32One);
-          v[7] = _mm256_add_epi32(u[7], K32One);
-          v[8] = _mm256_add_epi32(u[8], K32One);
-          v[9] = _mm256_add_epi32(u[9], K32One);
-          v[10] = _mm256_add_epi32(u[10], K32One);
-          v[11] = _mm256_add_epi32(u[11], K32One);
-          v[12] = _mm256_add_epi32(u[12], K32One);
-          v[13] = _mm256_add_epi32(u[13], K32One);
-          v[14] = _mm256_add_epi32(u[14], K32One);
-          v[15] = _mm256_add_epi32(u[15], K32One);
-
-          u[0] = _mm256_srai_epi32(v[0], 2);
-          u[1] = _mm256_srai_epi32(v[1], 2);
-          u[2] = _mm256_srai_epi32(v[2], 2);
-          u[3] = _mm256_srai_epi32(v[3], 2);
-          u[4] = _mm256_srai_epi32(v[4], 2);
-          u[5] = _mm256_srai_epi32(v[5], 2);
-          u[6] = _mm256_srai_epi32(v[6], 2);
-          u[7] = _mm256_srai_epi32(v[7], 2);
-          u[8] = _mm256_srai_epi32(v[8], 2);
-          u[9] = _mm256_srai_epi32(v[9], 2);
-          u[10] = _mm256_srai_epi32(v[10], 2);
-          u[11] = _mm256_srai_epi32(v[11], 2);
-          u[12] = _mm256_srai_epi32(v[12], 2);
-          u[13] = _mm256_srai_epi32(v[13], 2);
-          u[14] = _mm256_srai_epi32(v[14], 2);
-          u[15] = _mm256_srai_epi32(v[15], 2);
-
-          out[1] = _mm256_packs_epi32(u[0], u[1]);
-          out[17] = _mm256_packs_epi32(u[2], u[3]);
-          out[9] = _mm256_packs_epi32(u[4], u[5]);
-          out[25] = _mm256_packs_epi32(u[6], u[7]);
-          out[7] = _mm256_packs_epi32(u[8], u[9]);
-          out[23] = _mm256_packs_epi32(u[10], u[11]);
-          out[15] = _mm256_packs_epi32(u[12], u[13]);
-          out[31] = _mm256_packs_epi32(u[14], u[15]);
-        }
-        {
-          const __m256i k32_p27_p05 =
-              pair256_set_epi32(cospi_27_64, cospi_5_64);
-          const __m256i k32_p11_p21 =
-              pair256_set_epi32(cospi_11_64, cospi_21_64);
-          const __m256i k32_p19_p13 =
-              pair256_set_epi32(cospi_19_64, cospi_13_64);
-          const __m256i k32_p03_p29 =
-              pair256_set_epi32(cospi_3_64, cospi_29_64);
-          const __m256i k32_m29_p03 =
-              pair256_set_epi32(-cospi_29_64, cospi_3_64);
-          const __m256i k32_m13_p19 =
-              pair256_set_epi32(-cospi_13_64, cospi_19_64);
-          const __m256i k32_m21_p11 =
-              pair256_set_epi32(-cospi_21_64, cospi_11_64);
-          const __m256i k32_m05_p27 =
-              pair256_set_epi32(-cospi_5_64, cospi_27_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
-          u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
-          u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
-          u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
-          u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
-          u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
-          u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
-          u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
-          u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21);
-          v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13);
-          v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13);
-          v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
-          v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
-          v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
-          v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
-          v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
-          v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11);
-          v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27);
-          v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27);
-          v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27);
-          v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-          v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
-          v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
-          v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
-          v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
-          v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
-          v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
-          v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
-          v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
-
-          u[0] = _mm256_sub_epi32(u[0], v[0]);
-          u[1] = _mm256_sub_epi32(u[1], v[1]);
-          u[2] = _mm256_sub_epi32(u[2], v[2]);
-          u[3] = _mm256_sub_epi32(u[3], v[3]);
-          u[4] = _mm256_sub_epi32(u[4], v[4]);
-          u[5] = _mm256_sub_epi32(u[5], v[5]);
-          u[6] = _mm256_sub_epi32(u[6], v[6]);
-          u[7] = _mm256_sub_epi32(u[7], v[7]);
-          u[8] = _mm256_sub_epi32(u[8], v[8]);
-          u[9] = _mm256_sub_epi32(u[9], v[9]);
-          u[10] = _mm256_sub_epi32(u[10], v[10]);
-          u[11] = _mm256_sub_epi32(u[11], v[11]);
-          u[12] = _mm256_sub_epi32(u[12], v[12]);
-          u[13] = _mm256_sub_epi32(u[13], v[13]);
-          u[14] = _mm256_sub_epi32(u[14], v[14]);
-          u[15] = _mm256_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], K32One);
-          v[1] = _mm256_add_epi32(u[1], K32One);
-          v[2] = _mm256_add_epi32(u[2], K32One);
-          v[3] = _mm256_add_epi32(u[3], K32One);
-          v[4] = _mm256_add_epi32(u[4], K32One);
-          v[5] = _mm256_add_epi32(u[5], K32One);
-          v[6] = _mm256_add_epi32(u[6], K32One);
-          v[7] = _mm256_add_epi32(u[7], K32One);
-          v[8] = _mm256_add_epi32(u[8], K32One);
-          v[9] = _mm256_add_epi32(u[9], K32One);
-          v[10] = _mm256_add_epi32(u[10], K32One);
-          v[11] = _mm256_add_epi32(u[11], K32One);
-          v[12] = _mm256_add_epi32(u[12], K32One);
-          v[13] = _mm256_add_epi32(u[13], K32One);
-          v[14] = _mm256_add_epi32(u[14], K32One);
-          v[15] = _mm256_add_epi32(u[15], K32One);
-
-          u[0] = _mm256_srai_epi32(v[0], 2);
-          u[1] = _mm256_srai_epi32(v[1], 2);
-          u[2] = _mm256_srai_epi32(v[2], 2);
-          u[3] = _mm256_srai_epi32(v[3], 2);
-          u[4] = _mm256_srai_epi32(v[4], 2);
-          u[5] = _mm256_srai_epi32(v[5], 2);
-          u[6] = _mm256_srai_epi32(v[6], 2);
-          u[7] = _mm256_srai_epi32(v[7], 2);
-          u[8] = _mm256_srai_epi32(v[8], 2);
-          u[9] = _mm256_srai_epi32(v[9], 2);
-          u[10] = _mm256_srai_epi32(v[10], 2);
-          u[11] = _mm256_srai_epi32(v[11], 2);
-          u[12] = _mm256_srai_epi32(v[12], 2);
-          u[13] = _mm256_srai_epi32(v[13], 2);
-          u[14] = _mm256_srai_epi32(v[14], 2);
-          u[15] = _mm256_srai_epi32(v[15], 2);
-
-          out[5] = _mm256_packs_epi32(u[0], u[1]);
-          out[21] = _mm256_packs_epi32(u[2], u[3]);
-          out[13] = _mm256_packs_epi32(u[4], u[5]);
-          out[29] = _mm256_packs_epi32(u[6], u[7]);
-          out[3] = _mm256_packs_epi32(u[8], u[9]);
-          out[19] = _mm256_packs_epi32(u[10], u[11]);
-          out[11] = _mm256_packs_epi32(u[12], u[13]);
-          out[27] = _mm256_packs_epi32(u[14], u[15]);
-        }
-      }
-#endif
-      // Transpose the results, do it as four 8x8 transposes.
-      {
-        int transpose_block;
-        int16_t *output_currStep, *output_nextStep;
-        tran_low_t *curr_out, *next_out;
-        // Pass 0
-        output_currStep = &intermediate[column_start * 32];
-        output_nextStep = &intermediate[(column_start + 8) * 32];
-        // Pass 1
-        curr_out = &output_org[column_start * 32];
-        next_out = &output_org[(column_start + 8) * 32];
-
-        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
-          __m256i *this_out = &out[8 * transpose_block];
-          // 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
-          // 20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
-          // 40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
-          // 60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75
-          // 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
-          // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
-          // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
-          // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
-          const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
-          const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
-          const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
-          const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
-          const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
-          const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
-          const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
-          const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
-          // 00  20  01  21  02  22  03  23  08  28  09  29  10  30  11  31
-          // 40  60  41  61  42  62  43  63  48  68  49  69  50  70  51  71
-          // 04  24  05  25  06  26  07  27  12  32  13  33  14  34  15  35
-          // 44  64  45  65  46  66  47  67  52  72  53  73  54  74  55  75
-          // 80  100 81  101 82  102 83  103 88  108 89  109 90  110 91  101
-          // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
-          // 84  104 85  105 86  106 87  107 92  112 93  113 94  114 95  115
-          // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
-
-          const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
-          const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
-          const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
-          const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
-          const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
-          const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
-          const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
-          const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
-          // 00 20  40  60  01 21  41  61  08 28  48  68  09 29  49  69
-          // 04 24  44  64  05 25  45  65  12 32  52  72  13 33  53  73
-          // 02 22  42  62  03 23  43  63  10 30  50  70  11 31  51  71
-          // 06 26  46  66  07 27  47  67  14 34  54  74  15 35  55  75
-          // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
-          // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
-          // 82 102 122 142 83 103 123 143 90 110 130 150 91 101 131 151
-          // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
-          __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
-          __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
-          __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
-          __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
-          __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
-          __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
-          __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
-          __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
-          // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
-          // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
-          // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
-          // 03 23 43 63 83 103 123 143 11 31 51 71 91 101 131 151
-          // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
-          // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
-          // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
-          // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
-          if (0 == pass) {
-            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
-            // TODO(cd): see quality impact of only doing
-            //           output[j] = (output[j] + 1) >> 2;
-            //           which would remove the code between here ...
-            __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
-            __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
-            __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
-            __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
-            __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
-            __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
-            __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
-            __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
-            tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
-            tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
-            tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
-            tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
-            tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
-            tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
-            tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
-            tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
-            //           ... and here.
-            //           PS: also change code in av1/encoder/av1_dct.c
-            tr2_0 = _mm256_add_epi16(tr2_0, kOne);
-            tr2_1 = _mm256_add_epi16(tr2_1, kOne);
-            tr2_2 = _mm256_add_epi16(tr2_2, kOne);
-            tr2_3 = _mm256_add_epi16(tr2_3, kOne);
-            tr2_4 = _mm256_add_epi16(tr2_4, kOne);
-            tr2_5 = _mm256_add_epi16(tr2_5, kOne);
-            tr2_6 = _mm256_add_epi16(tr2_6, kOne);
-            tr2_7 = _mm256_add_epi16(tr2_7, kOne);
-            tr2_0 = _mm256_srai_epi16(tr2_0, 2);
-            tr2_1 = _mm256_srai_epi16(tr2_1, 2);
-            tr2_2 = _mm256_srai_epi16(tr2_2, 2);
-            tr2_3 = _mm256_srai_epi16(tr2_3, 2);
-            tr2_4 = _mm256_srai_epi16(tr2_4, 2);
-            tr2_5 = _mm256_srai_epi16(tr2_5, 2);
-            tr2_6 = _mm256_srai_epi16(tr2_6, 2);
-            tr2_7 = _mm256_srai_epi16(tr2_7, 2);
-          }
-          if (0 == pass) {
-            // Note: even though all these stores are aligned, using the aligned
-            //       intrinsic make the code slightly slower.
-            _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
-                             _mm256_castsi256_si128(tr2_0));
-            _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
-                             _mm256_castsi256_si128(tr2_1));
-            _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
-                             _mm256_castsi256_si128(tr2_2));
-            _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
-                             _mm256_castsi256_si128(tr2_3));
-            _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
-                             _mm256_castsi256_si128(tr2_4));
-            _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
-                             _mm256_castsi256_si128(tr2_5));
-            _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
-                             _mm256_castsi256_si128(tr2_6));
-            _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
-                             _mm256_castsi256_si128(tr2_7));
-
-            _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
-                             _mm256_extractf128_si256(tr2_0, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
-                             _mm256_extractf128_si256(tr2_1, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
-                             _mm256_extractf128_si256(tr2_2, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
-                             _mm256_extractf128_si256(tr2_3, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
-                             _mm256_extractf128_si256(tr2_4, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
-                             _mm256_extractf128_si256(tr2_5, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
-                             _mm256_extractf128_si256(tr2_6, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
-                             _mm256_extractf128_si256(tr2_7, 1));
-            // Process next 8x8
-            output_currStep += 8;
-            output_nextStep += 8;
-          }
-          if (1 == pass) {
-            store_coeff(&tr2_0, curr_out + 0 * 32, next_out + 0 * 32);
-            store_coeff(&tr2_1, curr_out + 1 * 32, next_out + 1 * 32);
-            store_coeff(&tr2_2, curr_out + 2 * 32, next_out + 2 * 32);
-            store_coeff(&tr2_3, curr_out + 3 * 32, next_out + 3 * 32);
-            store_coeff(&tr2_4, curr_out + 4 * 32, next_out + 4 * 32);
-            store_coeff(&tr2_5, curr_out + 5 * 32, next_out + 5 * 32);
-            store_coeff(&tr2_6, curr_out + 6 * 32, next_out + 6 * 32);
-            store_coeff(&tr2_7, curr_out + 7 * 32, next_out + 7 * 32);
-            curr_out += 8;
-            next_out += 8;
-          }
-        }
-      }
-    }
-  }
-  _mm256_zeroupper();
-}  // NOLINT
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h
deleted file mode 100644
index 69dd6af11..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h
+++ /dev/null
@@ -1,3201 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "aom_dsp/fwd_txfm.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// TODO(jingning) The high bit-depth version needs re-work for performance.
-// The current SSE2 implementation also causes cross reference to the static
-// functions in the C implementation file.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-#if FDCT32x32_HIGH_PRECISION
-void aom_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
-  int i, j;
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
-    aom_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      out[j + i * 32] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-  }
-}
-#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_c
-#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rows_c
-#else
-void aom_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
-  int i, j;
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
-    aom_fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
-  }
-}
-#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_rd_c
-#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rd_rows_c
-#endif  // FDCT32x32_HIGH_PRECISION
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif  // DCT_HIGH_BIT_DEPTH
-
-void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
-  // Calculate pre-multiplied strides
-  const int str1 = stride;
-  const int str2 = 2 * stride;
-  const int str3 = 2 * stride + str1;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
-  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
-  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
-  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
-  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
-  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
-  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
-  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
-  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  int pass;
-#if DCT_HIGH_BIT_DEPTH
-  int overflow;
-#endif
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-    for (column_start = 0; column_start < 32; column_start += 8) {
-      __m128i step1[32];
-      __m128i step2[32];
-      __m128i step3[32];
-      __m128i out[32];
-      // Stage 1
-      // Note: even though all the loads below are aligned, using the aligned
-      //       intrinsic make the code slightly slower.
-      if (0 == pass) {
-        const int16_t *in = &input[column_start];
-        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          const int16_t *ina = in + 0 * str1;
-          const int16_t *inb = in + 31 * str1;
-          __m128i *step1a = &step1[0];
-          __m128i *step1b = &step1[31];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 4 * str1;
-          const int16_t *inb = in + 27 * str1;
-          __m128i *step1a = &step1[4];
-          __m128i *step1b = &step1[27];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 8 * str1;
-          const int16_t *inb = in + 23 * str1;
-          __m128i *step1a = &step1[8];
-          __m128i *step1b = &step1[23];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 12 * str1;
-          const int16_t *inb = in + 19 * str1;
-          __m128i *step1a = &step1[12];
-          __m128i *step1b = &step1[19];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-      } else {
-        int16_t *in = &intermediate[column_start];
-        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
-        // Note: using the same approach as above to have common offset is
-        //       counter-productive as all offsets can be calculated at compile
-        //       time.
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
-          __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
-          __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
-          __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
-          __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
-          __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
-          __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
-          __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
-          step1[0] = ADD_EPI16(in00, in31);
-          step1[1] = ADD_EPI16(in01, in30);
-          step1[2] = ADD_EPI16(in02, in29);
-          step1[3] = ADD_EPI16(in03, in28);
-          step1[28] = SUB_EPI16(in03, in28);
-          step1[29] = SUB_EPI16(in02, in29);
-          step1[30] = SUB_EPI16(in01, in30);
-          step1[31] = SUB_EPI16(in00, in31);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
-                                             &step1[3], &step1[28], &step1[29],
-                                             &step1[30], &step1[31]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
-          __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
-          __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
-          __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
-          __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
-          __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
-          __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
-          __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
-          step1[4] = ADD_EPI16(in04, in27);
-          step1[5] = ADD_EPI16(in05, in26);
-          step1[6] = ADD_EPI16(in06, in25);
-          step1[7] = ADD_EPI16(in07, in24);
-          step1[24] = SUB_EPI16(in07, in24);
-          step1[25] = SUB_EPI16(in06, in25);
-          step1[26] = SUB_EPI16(in05, in26);
-          step1[27] = SUB_EPI16(in04, in27);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
-                                             &step1[7], &step1[24], &step1[25],
-                                             &step1[26], &step1[27]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
-          __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
-          __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
-          __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
-          __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
-          __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
-          __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
-          __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
-          step1[8] = ADD_EPI16(in08, in23);
-          step1[9] = ADD_EPI16(in09, in22);
-          step1[10] = ADD_EPI16(in10, in21);
-          step1[11] = ADD_EPI16(in11, in20);
-          step1[20] = SUB_EPI16(in11, in20);
-          step1[21] = SUB_EPI16(in10, in21);
-          step1[22] = SUB_EPI16(in09, in22);
-          step1[23] = SUB_EPI16(in08, in23);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
-                                             &step1[11], &step1[20], &step1[21],
-                                             &step1[22], &step1[23]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
-          __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
-          __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
-          __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
-          __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
-          __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
-          __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
-          __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
-          step1[12] = ADD_EPI16(in12, in19);
-          step1[13] = ADD_EPI16(in13, in18);
-          step1[14] = ADD_EPI16(in14, in17);
-          step1[15] = ADD_EPI16(in15, in16);
-          step1[16] = SUB_EPI16(in15, in16);
-          step1[17] = SUB_EPI16(in14, in17);
-          step1[18] = SUB_EPI16(in13, in18);
-          step1[19] = SUB_EPI16(in12, in19);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
-                                             &step1[15], &step1[16], &step1[17],
-                                             &step1[18], &step1[19]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-      // Stage 2
-      {
-        step2[0] = ADD_EPI16(step1[0], step1[15]);
-        step2[1] = ADD_EPI16(step1[1], step1[14]);
-        step2[2] = ADD_EPI16(step1[2], step1[13]);
-        step2[3] = ADD_EPI16(step1[3], step1[12]);
-        step2[4] = ADD_EPI16(step1[4], step1[11]);
-        step2[5] = ADD_EPI16(step1[5], step1[10]);
-        step2[6] = ADD_EPI16(step1[6], step1[9]);
-        step2[7] = ADD_EPI16(step1[7], step1[8]);
-        step2[8] = SUB_EPI16(step1[7], step1[8]);
-        step2[9] = SUB_EPI16(step1[6], step1[9]);
-        step2[10] = SUB_EPI16(step1[5], step1[10]);
-        step2[11] = SUB_EPI16(step1[4], step1[11]);
-        step2[12] = SUB_EPI16(step1[3], step1[12]);
-        step2[13] = SUB_EPI16(step1[2], step1[13]);
-        step2[14] = SUB_EPI16(step1[1], step1[14]);
-        step2[15] = SUB_EPI16(step1[0], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x16(
-            &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
-            &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
-            &step2[12], &step2[13], &step2[14], &step2[15]);
-        if (overflow) {
-          if (pass == 0)
-            HIGH_FDCT32x32_2D_C(input, output_org, stride);
-          else
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      {
-        const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
-        const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
-        const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
-        const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
-        const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
-        const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
-        const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
-        const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
-        const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
-        const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
-        const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
-        const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
-        const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
-        const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
-        const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
-        const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
-        const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
-        const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
-        const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
-        const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
-        const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
-        const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
-        const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
-        const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
-        // dct_const_round_shift
-        const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
-        const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
-        const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
-        const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
-        const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
-        const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
-        const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
-        const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
-        const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
-        const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
-        const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
-        const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
-        const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
-        const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
-        const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
-        const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
-        // Combine
-        step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
-        step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
-        step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
-        step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
-        step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
-        step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
-        step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
-        step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
-                                           &step2[23], &step2[24], &step2[25],
-                                           &step2[26], &step2[27]);
-        if (overflow) {
-          if (pass == 0)
-            HIGH_FDCT32x32_2D_C(input, output_org, stride);
-          else
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-
-#if !FDCT32x32_HIGH_PRECISION
-      // dump the magnitude by half, hence the intermediate values are within
-      // the range of 16 bits.
-      if (1 == pass) {
-        __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
-        __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
-        __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
-        __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
-        __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
-        __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
-        __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
-        __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
-        __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
-        __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
-        __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
-        __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
-        __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
-        __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
-        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
-        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
-        __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
-        __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
-        __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
-        __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
-        __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
-        __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
-        __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
-        __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
-        __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
-        __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
-        __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
-        __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
-        __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
-        __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
-        __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
-        __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
-
-        step2[0] = SUB_EPI16(step2[0], s3_00_0);
-        step2[1] = SUB_EPI16(step2[1], s3_01_0);
-        step2[2] = SUB_EPI16(step2[2], s3_02_0);
-        step2[3] = SUB_EPI16(step2[3], s3_03_0);
-        step2[4] = SUB_EPI16(step2[4], s3_04_0);
-        step2[5] = SUB_EPI16(step2[5], s3_05_0);
-        step2[6] = SUB_EPI16(step2[6], s3_06_0);
-        step2[7] = SUB_EPI16(step2[7], s3_07_0);
-        step2[8] = SUB_EPI16(step2[8], s2_08_0);
-        step2[9] = SUB_EPI16(step2[9], s2_09_0);
-        step2[10] = SUB_EPI16(step2[10], s3_10_0);
-        step2[11] = SUB_EPI16(step2[11], s3_11_0);
-        step2[12] = SUB_EPI16(step2[12], s3_12_0);
-        step2[13] = SUB_EPI16(step2[13], s3_13_0);
-        step2[14] = SUB_EPI16(step2[14], s2_14_0);
-        step2[15] = SUB_EPI16(step2[15], s2_15_0);
-        step1[16] = SUB_EPI16(step1[16], s3_16_0);
-        step1[17] = SUB_EPI16(step1[17], s3_17_0);
-        step1[18] = SUB_EPI16(step1[18], s3_18_0);
-        step1[19] = SUB_EPI16(step1[19], s3_19_0);
-        step2[20] = SUB_EPI16(step2[20], s3_20_0);
-        step2[21] = SUB_EPI16(step2[21], s3_21_0);
-        step2[22] = SUB_EPI16(step2[22], s3_22_0);
-        step2[23] = SUB_EPI16(step2[23], s3_23_0);
-        step2[24] = SUB_EPI16(step2[24], s3_24_0);
-        step2[25] = SUB_EPI16(step2[25], s3_25_0);
-        step2[26] = SUB_EPI16(step2[26], s3_26_0);
-        step2[27] = SUB_EPI16(step2[27], s3_27_0);
-        step1[28] = SUB_EPI16(step1[28], s3_28_0);
-        step1[29] = SUB_EPI16(step1[29], s3_29_0);
-        step1[30] = SUB_EPI16(step1[30], s3_30_0);
-        step1[31] = SUB_EPI16(step1[31], s3_31_0);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x32(
-            &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
-            &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
-            &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
-            &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
-            &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
-            &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
-        if (overflow) {
-          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        step2[0] = _mm_add_epi16(step2[0], kOne);
-        step2[1] = _mm_add_epi16(step2[1], kOne);
-        step2[2] = _mm_add_epi16(step2[2], kOne);
-        step2[3] = _mm_add_epi16(step2[3], kOne);
-        step2[4] = _mm_add_epi16(step2[4], kOne);
-        step2[5] = _mm_add_epi16(step2[5], kOne);
-        step2[6] = _mm_add_epi16(step2[6], kOne);
-        step2[7] = _mm_add_epi16(step2[7], kOne);
-        step2[8] = _mm_add_epi16(step2[8], kOne);
-        step2[9] = _mm_add_epi16(step2[9], kOne);
-        step2[10] = _mm_add_epi16(step2[10], kOne);
-        step2[11] = _mm_add_epi16(step2[11], kOne);
-        step2[12] = _mm_add_epi16(step2[12], kOne);
-        step2[13] = _mm_add_epi16(step2[13], kOne);
-        step2[14] = _mm_add_epi16(step2[14], kOne);
-        step2[15] = _mm_add_epi16(step2[15], kOne);
-        step1[16] = _mm_add_epi16(step1[16], kOne);
-        step1[17] = _mm_add_epi16(step1[17], kOne);
-        step1[18] = _mm_add_epi16(step1[18], kOne);
-        step1[19] = _mm_add_epi16(step1[19], kOne);
-        step2[20] = _mm_add_epi16(step2[20], kOne);
-        step2[21] = _mm_add_epi16(step2[21], kOne);
-        step2[22] = _mm_add_epi16(step2[22], kOne);
-        step2[23] = _mm_add_epi16(step2[23], kOne);
-        step2[24] = _mm_add_epi16(step2[24], kOne);
-        step2[25] = _mm_add_epi16(step2[25], kOne);
-        step2[26] = _mm_add_epi16(step2[26], kOne);
-        step2[27] = _mm_add_epi16(step2[27], kOne);
-        step1[28] = _mm_add_epi16(step1[28], kOne);
-        step1[29] = _mm_add_epi16(step1[29], kOne);
-        step1[30] = _mm_add_epi16(step1[30], kOne);
-        step1[31] = _mm_add_epi16(step1[31], kOne);
-
-        step2[0] = _mm_srai_epi16(step2[0], 2);
-        step2[1] = _mm_srai_epi16(step2[1], 2);
-        step2[2] = _mm_srai_epi16(step2[2], 2);
-        step2[3] = _mm_srai_epi16(step2[3], 2);
-        step2[4] = _mm_srai_epi16(step2[4], 2);
-        step2[5] = _mm_srai_epi16(step2[5], 2);
-        step2[6] = _mm_srai_epi16(step2[6], 2);
-        step2[7] = _mm_srai_epi16(step2[7], 2);
-        step2[8] = _mm_srai_epi16(step2[8], 2);
-        step2[9] = _mm_srai_epi16(step2[9], 2);
-        step2[10] = _mm_srai_epi16(step2[10], 2);
-        step2[11] = _mm_srai_epi16(step2[11], 2);
-        step2[12] = _mm_srai_epi16(step2[12], 2);
-        step2[13] = _mm_srai_epi16(step2[13], 2);
-        step2[14] = _mm_srai_epi16(step2[14], 2);
-        step2[15] = _mm_srai_epi16(step2[15], 2);
-        step1[16] = _mm_srai_epi16(step1[16], 2);
-        step1[17] = _mm_srai_epi16(step1[17], 2);
-        step1[18] = _mm_srai_epi16(step1[18], 2);
-        step1[19] = _mm_srai_epi16(step1[19], 2);
-        step2[20] = _mm_srai_epi16(step2[20], 2);
-        step2[21] = _mm_srai_epi16(step2[21], 2);
-        step2[22] = _mm_srai_epi16(step2[22], 2);
-        step2[23] = _mm_srai_epi16(step2[23], 2);
-        step2[24] = _mm_srai_epi16(step2[24], 2);
-        step2[25] = _mm_srai_epi16(step2[25], 2);
-        step2[26] = _mm_srai_epi16(step2[26], 2);
-        step2[27] = _mm_srai_epi16(step2[27], 2);
-        step1[28] = _mm_srai_epi16(step1[28], 2);
-        step1[29] = _mm_srai_epi16(step1[29], 2);
-        step1[30] = _mm_srai_epi16(step1[30], 2);
-        step1[31] = _mm_srai_epi16(step1[31], 2);
-      }
-#endif  // !FDCT32x32_HIGH_PRECISION
-
-#if FDCT32x32_HIGH_PRECISION
-      if (pass == 0) {
-#endif
-        // Stage 3
-        {
-          step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
-          step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
-          step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
-          step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
-          step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
-          step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
-          step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
-          step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
-                                             &step3[3], &step3[4], &step3[5],
-                                             &step3[6], &step3[7]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
-          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
-          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
-          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
-          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
-          // Combine
-          step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
-          step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
-          step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
-          step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
-                                             &step3[13]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step3[16] = ADD_EPI16(step2[23], step1[16]);
-          step3[17] = ADD_EPI16(step2[22], step1[17]);
-          step3[18] = ADD_EPI16(step2[21], step1[18]);
-          step3[19] = ADD_EPI16(step2[20], step1[19]);
-          step3[20] = SUB_EPI16(step1[19], step2[20]);
-          step3[21] = SUB_EPI16(step1[18], step2[21]);
-          step3[22] = SUB_EPI16(step1[17], step2[22]);
-          step3[23] = SUB_EPI16(step1[16], step2[23]);
-          step3[24] = SUB_EPI16(step1[31], step2[24]);
-          step3[25] = SUB_EPI16(step1[30], step2[25]);
-          step3[26] = SUB_EPI16(step1[29], step2[26]);
-          step3[27] = SUB_EPI16(step1[28], step2[27]);
-          step3[28] = ADD_EPI16(step2[27], step1[28]);
-          step3[29] = ADD_EPI16(step2[26], step1[29]);
-          step3[30] = ADD_EPI16(step2[25], step1[30]);
-          step3[31] = ADD_EPI16(step2[24], step1[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
-              &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
-              &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
-              &step3[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-
-        // Stage 4
-        {
-          step1[0] = ADD_EPI16(step3[3], step3[0]);
-          step1[1] = ADD_EPI16(step3[2], step3[1]);
-          step1[2] = SUB_EPI16(step3[1], step3[2]);
-          step1[3] = SUB_EPI16(step3[0], step3[3]);
-          step1[8] = ADD_EPI16(step3[11], step2[8]);
-          step1[9] = ADD_EPI16(step3[10], step2[9]);
-          step1[10] = SUB_EPI16(step2[9], step3[10]);
-          step1[11] = SUB_EPI16(step2[8], step3[11]);
-          step1[12] = SUB_EPI16(step2[15], step3[12]);
-          step1[13] = SUB_EPI16(step2[14], step3[13]);
-          step1[14] = ADD_EPI16(step3[13], step2[14]);
-          step1[15] = ADD_EPI16(step3[12], step2[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
-              &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
-              &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
-          const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
-          const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
-          const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
-          const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
-          const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
-          const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
-          const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
-          const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
-          // Combine
-          step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
-          step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
-          const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
-          const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
-          const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
-          const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
-          const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
-          const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
-          const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
-          const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
-          const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
-          const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
-          const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
-          const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
-          const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
-          const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
-          const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
-          const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
-          const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
-          const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
-          const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
-          const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
-          const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
-          const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
-          const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
-          const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
-          const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
-          const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
-          const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
-          const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
-          const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
-          const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
-          const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
-          const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
-          const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
-          const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
-          const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
-          const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
-          const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
-          const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
-          // Combine
-          step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
-          step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
-          step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
-          step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
-          step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
-          step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
-          step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
-          step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
-                                             &step1[21], &step1[26], &step1[27],
-                                             &step1[28], &step1[29]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 5
-        {
-          step2[4] = ADD_EPI16(step1[5], step3[4]);
-          step2[5] = SUB_EPI16(step3[4], step1[5]);
-          step2[6] = SUB_EPI16(step3[7], step1[6]);
-          step2[7] = ADD_EPI16(step1[6], step3[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
-                                             &step2[7]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
-          const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
-          const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
-          const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
-          const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
-          const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
-          const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
-          const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
-          const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
-          const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
-          const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
-          const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m128i out_00_4 =
-              _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_00_5 =
-              _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_16_4 =
-              _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_16_5 =
-              _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_08_4 =
-              _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_08_5 =
-              _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_24_4 =
-              _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_24_5 =
-              _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
-          const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
-          const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
-          const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
-          const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
-          const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
-          const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
-          const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
-          // Combine
-          out[0] = _mm_packs_epi32(out_00_6, out_00_7);
-          out[16] = _mm_packs_epi32(out_16_6, out_16_7);
-          out[8] = _mm_packs_epi32(out_08_6, out_08_7);
-          out[24] = _mm_packs_epi32(out_24_6, out_24_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
-          const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
-          const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
-          const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
-          const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
-          const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
-          const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
-          const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
-          const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
-          const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
-          const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
-          const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
-          const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
-          const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
-          const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
-          const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
-          const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
-          const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
-          const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
-          // Combine
-          step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
-          step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
-          step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
-          step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
-                                             &step2[14]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step2[16] = ADD_EPI16(step1[19], step3[16]);
-          step2[17] = ADD_EPI16(step1[18], step3[17]);
-          step2[18] = SUB_EPI16(step3[17], step1[18]);
-          step2[19] = SUB_EPI16(step3[16], step1[19]);
-          step2[20] = SUB_EPI16(step3[23], step1[20]);
-          step2[21] = SUB_EPI16(step3[22], step1[21]);
-          step2[22] = ADD_EPI16(step1[21], step3[22]);
-          step2[23] = ADD_EPI16(step1[20], step3[23]);
-          step2[24] = ADD_EPI16(step1[27], step3[24]);
-          step2[25] = ADD_EPI16(step1[26], step3[25]);
-          step2[26] = SUB_EPI16(step3[25], step1[26]);
-          step2[27] = SUB_EPI16(step3[24], step1[27]);
-          step2[28] = SUB_EPI16(step3[31], step1[28]);
-          step2[29] = SUB_EPI16(step3[30], step1[29]);
-          step2[30] = ADD_EPI16(step1[29], step3[30]);
-          step2[31] = ADD_EPI16(step1[28], step3[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
-              &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
-              &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
-              &step2[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 6
-        {
-          const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-          const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-          const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-          const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-          const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-          const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-          const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-          const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-          const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
-          const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
-          const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
-          const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
-          const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
-          const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
-          const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
-          const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
-          // dct_const_round_shift
-          const __m128i out_04_4 =
-              _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_04_5 =
-              _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_20_4 =
-              _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_20_5 =
-              _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_12_4 =
-              _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_12_5 =
-              _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_28_4 =
-              _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_28_5 =
-              _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
-          const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
-          const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
-          const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
-          const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
-          const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
-          const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
-          const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
-          // Combine
-          out[4] = _mm_packs_epi32(out_04_6, out_04_7);
-          out[20] = _mm_packs_epi32(out_20_6, out_20_7);
-          out[12] = _mm_packs_epi32(out_12_6, out_12_7);
-          out[28] = _mm_packs_epi32(out_28_6, out_28_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step3[8] = ADD_EPI16(step2[9], step1[8]);
-          step3[9] = SUB_EPI16(step1[8], step2[9]);
-          step3[10] = SUB_EPI16(step1[11], step2[10]);
-          step3[11] = ADD_EPI16(step2[10], step1[11]);
-          step3[12] = ADD_EPI16(step2[13], step1[12]);
-          step3[13] = SUB_EPI16(step1[12], step2[13]);
-          step3[14] = SUB_EPI16(step1[15], step2[14]);
-          step3[15] = ADD_EPI16(step2[14], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
-                                             &step3[11], &step3[12], &step3[13],
-                                             &step3[14], &step3[15]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
-          const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
-          const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
-          const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
-          const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
-          const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
-          const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
-          const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
-          const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
-          const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
-          const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
-          const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
-          const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
-          const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
-          const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
-          const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
-          const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
-          const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
-          const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
-          const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
-          const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
-          const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
-          const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
-          const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
-          // dct_const_round_shift
-          const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
-          const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
-          const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
-          const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
-          const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
-          const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
-          const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
-          const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
-          const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
-          const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
-          const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
-          const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
-          const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
-          const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
-          const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
-          const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
-          // Combine
-          step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
-          step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
-          step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
-          step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
-          // Combine
-          step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
-          step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
-          step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
-          step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
-                                             &step3[22], &step3[25], &step3[26],
-                                             &step3[29], &step3[30]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 7
-        {
-          const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
-          const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
-          const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
-          const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
-          const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
-          const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
-          const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
-          const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
-          const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
-          const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
-          const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
-          const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
-          const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
-          const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
-          const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
-          const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
-          const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
-          const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
-          const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
-          const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
-          const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
-          const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
-          const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
-          const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
-          // dct_const_round_shift
-          const __m128i out_02_4 =
-              _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_02_5 =
-              _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_18_4 =
-              _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_18_5 =
-              _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_10_4 =
-              _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_10_5 =
-              _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_26_4 =
-              _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_26_5 =
-              _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_06_4 =
-              _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_06_5 =
-              _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_22_4 =
-              _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_22_5 =
-              _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_14_4 =
-              _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_14_5 =
-              _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_30_4 =
-              _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_30_5 =
-              _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
-          const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
-          const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
-          const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
-          const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
-          const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
-          const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
-          const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
-          const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
-          const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
-          const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
-          const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
-          const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
-          const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
-          const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
-          const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
-          // Combine
-          out[2] = _mm_packs_epi32(out_02_6, out_02_7);
-          out[18] = _mm_packs_epi32(out_18_6, out_18_7);
-          out[10] = _mm_packs_epi32(out_10_6, out_10_7);
-          out[26] = _mm_packs_epi32(out_26_6, out_26_7);
-          out[6] = _mm_packs_epi32(out_06_6, out_06_7);
-          out[22] = _mm_packs_epi32(out_22_6, out_22_7);
-          out[14] = _mm_packs_epi32(out_14_6, out_14_7);
-          out[30] = _mm_packs_epi32(out_30_6, out_30_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
-                                      &out[6], &out[22], &out[14], &out[30]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step1[16] = ADD_EPI16(step3[17], step2[16]);
-          step1[17] = SUB_EPI16(step2[16], step3[17]);
-          step1[18] = SUB_EPI16(step2[19], step3[18]);
-          step1[19] = ADD_EPI16(step3[18], step2[19]);
-          step1[20] = ADD_EPI16(step3[21], step2[20]);
-          step1[21] = SUB_EPI16(step2[20], step3[21]);
-          step1[22] = SUB_EPI16(step2[23], step3[22]);
-          step1[23] = ADD_EPI16(step3[22], step2[23]);
-          step1[24] = ADD_EPI16(step3[25], step2[24]);
-          step1[25] = SUB_EPI16(step2[24], step3[25]);
-          step1[26] = SUB_EPI16(step2[27], step3[26]);
-          step1[27] = ADD_EPI16(step3[26], step2[27]);
-          step1[28] = ADD_EPI16(step3[29], step2[28]);
-          step1[29] = SUB_EPI16(step2[28], step3[29]);
-          step1[30] = SUB_EPI16(step2[31], step3[30]);
-          step1[31] = ADD_EPI16(step3[30], step2[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
-              &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
-              &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
-              &step1[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Final stage --- outputs indices are bit-reversed.
-        {
-          const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
-          const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
-          const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
-          const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
-          const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
-          const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
-          const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
-          const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
-          const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
-          const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
-          const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
-          const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
-          const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
-          const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
-          const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
-          const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
-          const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
-          const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
-          const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
-          const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
-          const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
-          const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
-          const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
-          const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
-          // dct_const_round_shift
-          const __m128i out_01_4 =
-              _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_01_5 =
-              _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_17_4 =
-              _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_17_5 =
-              _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_09_4 =
-              _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_09_5 =
-              _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_25_4 =
-              _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_25_5 =
-              _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_07_4 =
-              _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_07_5 =
-              _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_23_4 =
-              _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_23_5 =
-              _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_15_4 =
-              _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_15_5 =
-              _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_31_4 =
-              _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_31_5 =
-              _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
-          const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
-          const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
-          const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
-          const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
-          const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
-          const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
-          const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
-          const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
-          const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
-          const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
-          const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
-          const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
-          const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
-          const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
-          const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
-          // Combine
-          out[1] = _mm_packs_epi32(out_01_6, out_01_7);
-          out[17] = _mm_packs_epi32(out_17_6, out_17_7);
-          out[9] = _mm_packs_epi32(out_09_6, out_09_7);
-          out[25] = _mm_packs_epi32(out_25_6, out_25_7);
-          out[7] = _mm_packs_epi32(out_07_6, out_07_7);
-          out[23] = _mm_packs_epi32(out_23_6, out_23_7);
-          out[15] = _mm_packs_epi32(out_15_6, out_15_7);
-          out[31] = _mm_packs_epi32(out_31_6, out_31_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
-                                      &out[7], &out[23], &out[15], &out[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
-          const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
-          const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
-          const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
-          const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
-          const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
-          const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
-          const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
-          const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
-          const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
-          const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
-          const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
-          const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
-          const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
-          const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
-          const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
-          const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
-          const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
-          const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
-          const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
-          const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
-          const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
-          const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
-          const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
-          // dct_const_round_shift
-          const __m128i out_05_4 =
-              _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_05_5 =
-              _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_21_4 =
-              _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_21_5 =
-              _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_13_4 =
-              _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_13_5 =
-              _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_29_4 =
-              _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_29_5 =
-              _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_03_4 =
-              _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_03_5 =
-              _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_19_4 =
-              _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_19_5 =
-              _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_11_4 =
-              _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_11_5 =
-              _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_27_4 =
-              _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_27_5 =
-              _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
-          const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
-          const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
-          const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
-          const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
-          const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
-          const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
-          const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
-          const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
-          const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
-          const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
-          const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
-          const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
-          const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
-          const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
-          const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
-          // Combine
-          out[5] = _mm_packs_epi32(out_05_6, out_05_7);
-          out[21] = _mm_packs_epi32(out_21_6, out_21_7);
-          out[13] = _mm_packs_epi32(out_13_6, out_13_7);
-          out[29] = _mm_packs_epi32(out_29_6, out_29_7);
-          out[3] = _mm_packs_epi32(out_03_6, out_03_7);
-          out[19] = _mm_packs_epi32(out_19_6, out_19_7);
-          out[11] = _mm_packs_epi32(out_11_6, out_11_7);
-          out[27] = _mm_packs_epi32(out_27_6, out_27_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
-                                      &out[3], &out[19], &out[11], &out[27]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-#if FDCT32x32_HIGH_PRECISION
-      } else {
-        __m128i lstep1[64], lstep2[64], lstep3[64];
-        __m128i u[32], v[32], sign[16];
-        const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
-        // start using 32-bit operations
-        // stage 3
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero);
-          lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero);
-          lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero);
-          lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero);
-          lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero);
-          lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero);
-          lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero);
-          lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero);
-          lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
-          lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
-          lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
-          lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
-          lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
-          lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
-          lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
-          lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
-          lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
-          lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
-          lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
-          lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
-          lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
-          lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
-          lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
-          lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
-          lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
-          lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
-          lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
-          lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
-          lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
-          lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
-          lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
-          lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
-
-          lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]);
-          lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]);
-          lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]);
-          lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]);
-          lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]);
-          lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]);
-          lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]);
-          lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]);
-          lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
-          lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]);
-          lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]);
-          lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]);
-          lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]);
-          lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]);
-          lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]);
-          lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]);
-        }
-        {
-          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
-          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
-          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
-          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
-          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
-        }
-        {
-          lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
-          lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
-          lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
-          lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
-          lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
-          lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
-          lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
-          lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
-          lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
-          lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
-          lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
-          lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
-          lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
-          lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
-          lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
-          lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
-          lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
-          lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
-          lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
-          lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
-          lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
-          lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
-          lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
-          lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
-          lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
-          lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
-          lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
-          lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
-          lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
-          lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
-          lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
-          lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
-
-          lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
-          lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
-          lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
-          lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
-          lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
-          lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
-          lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
-          lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
-          lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
-          lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
-          lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
-          lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
-          lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
-          lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
-          lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
-          lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
-          lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
-          lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
-          lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
-          lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
-          lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
-          lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
-          lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
-          lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
-          lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
-          lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
-          lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
-          lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
-          lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
-          lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
-          lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
-          lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
-
-          lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
-          lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
-
-          lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
-          lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
-          lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
-          lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
-          lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
-          lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
-          lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
-          lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
-          lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
-          lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
-          lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
-          lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
-          lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
-          lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
-          lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
-          lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
-          lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
-          lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
-          lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
-          lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
-          lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
-          lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
-          lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
-          lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
-          lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
-          lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
-          lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
-          lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
-          lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
-          lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
-        }
-
-        // stage 4
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero);
-          lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero);
-          lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero);
-          lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero);
-          lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
-          lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
-          lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
-          lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
-          lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
-          lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
-          lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
-          lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
-          lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
-          lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
-          lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
-          lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
-
-          lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
-          lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
-          lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
-          lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
-          lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
-          lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
-          lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
-          lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
-          lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
-          lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
-          lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
-          lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
-          lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
-          lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
-          lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
-          lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
-          lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
-          lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
-          lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
-          lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
-          lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
-          lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
-          lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
-          lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
-        }
-        {
-          // to be continued...
-          //
-          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
-          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
-          u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
-          u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
-          u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
-
-          // TODO(jingning): manually inline k_madd_epi32_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32(u[0], k32_p16_m16);
-          v[1] = k_madd_epi32(u[1], k32_p16_m16);
-          v[2] = k_madd_epi32(u[2], k32_p16_m16);
-          v[3] = k_madd_epi32(u[3], k32_p16_m16);
-          v[4] = k_madd_epi32(u[0], k32_p16_p16);
-          v[5] = k_madd_epi32(u[1], k32_p16_p16);
-          v[6] = k_madd_epi32(u[2], k32_p16_p16);
-          v[7] = k_madd_epi32(u[3], k32_p16_p16);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
-                                              &v[5], &v[6], &v[7], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
-          lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-        }
-        {
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
-          u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
-          u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
-          u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
-          u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
-          u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
-          u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
-          u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
-          u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
-          u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
-          u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
-          u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
-          u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
-          u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
-          u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
-          u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
-
-          v[0] = k_madd_epi32(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32(u[4], k32_m08_p24);
-          v[5] = k_madd_epi32(u[5], k32_m08_p24);
-          v[6] = k_madd_epi32(u[6], k32_m08_p24);
-          v[7] = k_madd_epi32(u[7], k32_m08_p24);
-          v[8] = k_madd_epi32(u[8], k32_m24_m08);
-          v[9] = k_madd_epi32(u[9], k32_m24_m08);
-          v[10] = k_madd_epi32(u[10], k32_m24_m08);
-          v[11] = k_madd_epi32(u[11], k32_m24_m08);
-          v[12] = k_madd_epi32(u[12], k32_m24_m08);
-          v[13] = k_madd_epi32(u[13], k32_m24_m08);
-          v[14] = k_madd_epi32(u[14], k32_m24_m08);
-          v[15] = k_madd_epi32(u[15], k32_m24_m08);
-          v[16] = k_madd_epi32(u[12], k32_m08_p24);
-          v[17] = k_madd_epi32(u[13], k32_m08_p24);
-          v[18] = k_madd_epi32(u[14], k32_m08_p24);
-          v[19] = k_madd_epi32(u[15], k32_m08_p24);
-          v[20] = k_madd_epi32(u[8], k32_m08_p24);
-          v[21] = k_madd_epi32(u[9], k32_m08_p24);
-          v[22] = k_madd_epi32(u[10], k32_m08_p24);
-          v[23] = k_madd_epi32(u[11], k32_m08_p24);
-          v[24] = k_madd_epi32(u[4], k32_p24_p08);
-          v[25] = k_madd_epi32(u[5], k32_p24_p08);
-          v[26] = k_madd_epi32(u[6], k32_p24_p08);
-          v[27] = k_madd_epi32(u[7], k32_p24_p08);
-          v[28] = k_madd_epi32(u[0], k32_p24_p08);
-          v[29] = k_madd_epi32(u[1], k32_p24_p08);
-          v[30] = k_madd_epi32(u[2], k32_p24_p08);
-          v[31] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 5
-        {
-          lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
-          lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
-          lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
-          lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
-          lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
-          lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
-          lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
-          lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
-        }
-        {
-          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
-          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
-          u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
-          u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
-          u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
-          u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
-          u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
-          u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
-          u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
-
-          // TODO(jingning): manually inline k_madd_epi32_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32(u[0], k32_p16_p16);
-          v[1] = k_madd_epi32(u[1], k32_p16_p16);
-          v[2] = k_madd_epi32(u[2], k32_p16_p16);
-          v[3] = k_madd_epi32(u[3], k32_p16_p16);
-          v[4] = k_madd_epi32(u[0], k32_p16_m16);
-          v[5] = k_madd_epi32(u[1], k32_p16_m16);
-          v[6] = k_madd_epi32(u[2], k32_p16_m16);
-          v[7] = k_madd_epi32(u[3], k32_p16_m16);
-          v[8] = k_madd_epi32(u[4], k32_p24_p08);
-          v[9] = k_madd_epi32(u[5], k32_p24_p08);
-          v[10] = k_madd_epi32(u[6], k32_p24_p08);
-          v[11] = k_madd_epi32(u[7], k32_p24_p08);
-          v[12] = k_madd_epi32(u[4], k32_m08_p24);
-          v[13] = k_madd_epi32(u[5], k32_m08_p24);
-          v[14] = k_madd_epi32(u[6], k32_m08_p24);
-          v[15] = k_madd_epi32(u[7], k32_m08_p24);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm_cmplt_epi32(u[0], kZero);
-          sign[1] = _mm_cmplt_epi32(u[1], kZero);
-          sign[2] = _mm_cmplt_epi32(u[2], kZero);
-          sign[3] = _mm_cmplt_epi32(u[3], kZero);
-          sign[4] = _mm_cmplt_epi32(u[4], kZero);
-          sign[5] = _mm_cmplt_epi32(u[5], kZero);
-          sign[6] = _mm_cmplt_epi32(u[6], kZero);
-          sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], sign[0]);
-          u[1] = _mm_sub_epi32(u[1], sign[1]);
-          u[2] = _mm_sub_epi32(u[2], sign[2]);
-          u[3] = _mm_sub_epi32(u[3], sign[3]);
-          u[4] = _mm_sub_epi32(u[4], sign[4]);
-          u[5] = _mm_sub_epi32(u[5], sign[5]);
-          u[6] = _mm_sub_epi32(u[6], sign[6]);
-          u[7] = _mm_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm_add_epi32(u[0], K32One);
-          u[1] = _mm_add_epi32(u[1], K32One);
-          u[2] = _mm_add_epi32(u[2], K32One);
-          u[3] = _mm_add_epi32(u[3], K32One);
-          u[4] = _mm_add_epi32(u[4], K32One);
-          u[5] = _mm_add_epi32(u[5], K32One);
-          u[6] = _mm_add_epi32(u[6], K32One);
-          u[7] = _mm_add_epi32(u[7], K32One);
-
-          u[0] = _mm_srai_epi32(u[0], 2);
-          u[1] = _mm_srai_epi32(u[1], 2);
-          u[2] = _mm_srai_epi32(u[2], 2);
-          u[3] = _mm_srai_epi32(u[3], 2);
-          u[4] = _mm_srai_epi32(u[4], 2);
-          u[5] = _mm_srai_epi32(u[5], 2);
-          u[6] = _mm_srai_epi32(u[6], 2);
-          u[7] = _mm_srai_epi32(u[7], 2);
-
-          // Combine
-          out[0] = _mm_packs_epi32(u[0], u[1]);
-          out[16] = _mm_packs_epi32(u[2], u[3]);
-          out[8] = _mm_packs_epi32(u[4], u[5]);
-          out[24] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
-          u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
-          u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
-          u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
-          u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
-          u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
-          u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
-          u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
-
-          v[0] = k_madd_epi32(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32(u[4], k32_m24_m08);
-          v[5] = k_madd_epi32(u[5], k32_m24_m08);
-          v[6] = k_madd_epi32(u[6], k32_m24_m08);
-          v[7] = k_madd_epi32(u[7], k32_m24_m08);
-          v[8] = k_madd_epi32(u[4], k32_m08_p24);
-          v[9] = k_madd_epi32(u[5], k32_m08_p24);
-          v[10] = k_madd_epi32(u[6], k32_m08_p24);
-          v[11] = k_madd_epi32(u[7], k32_m08_p24);
-          v[12] = k_madd_epi32(u[0], k32_p24_p08);
-          v[13] = k_madd_epi32(u[1], k32_p24_p08);
-          v[14] = k_madd_epi32(u[2], k32_p24_p08);
-          v[15] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-          lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-          lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-          lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-          lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-          lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-          lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-          lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-        }
-        {
-          lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
-          lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
-          lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
-          lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
-          lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
-          lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
-          lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
-          lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
-          lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
-          lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
-          lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
-          lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
-          lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
-          lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
-          lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
-          lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
-          lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
-          lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
-          lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
-          lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
-          lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
-          lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
-          lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
-          lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
-          lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
-          lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
-          lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
-          lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
-          lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
-          lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
-          lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
-          lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
-        }
-        // stage 6
-        {
-          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
-          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
-          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
-          u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
-
-          v[0] = k_madd_epi32(u[0], k32_p28_p04);
-          v[1] = k_madd_epi32(u[1], k32_p28_p04);
-          v[2] = k_madd_epi32(u[2], k32_p28_p04);
-          v[3] = k_madd_epi32(u[3], k32_p28_p04);
-          v[4] = k_madd_epi32(u[4], k32_p12_p20);
-          v[5] = k_madd_epi32(u[5], k32_p12_p20);
-          v[6] = k_madd_epi32(u[6], k32_p12_p20);
-          v[7] = k_madd_epi32(u[7], k32_p12_p20);
-          v[8] = k_madd_epi32(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32(u[12], k32_m04_p28);
-          v[13] = k_madd_epi32(u[13], k32_m04_p28);
-          v[14] = k_madd_epi32(u[14], k32_m04_p28);
-          v[15] = k_madd_epi32(u[15], k32_m04_p28);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm_cmplt_epi32(u[0], kZero);
-          sign[1] = _mm_cmplt_epi32(u[1], kZero);
-          sign[2] = _mm_cmplt_epi32(u[2], kZero);
-          sign[3] = _mm_cmplt_epi32(u[3], kZero);
-          sign[4] = _mm_cmplt_epi32(u[4], kZero);
-          sign[5] = _mm_cmplt_epi32(u[5], kZero);
-          sign[6] = _mm_cmplt_epi32(u[6], kZero);
-          sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], sign[0]);
-          u[1] = _mm_sub_epi32(u[1], sign[1]);
-          u[2] = _mm_sub_epi32(u[2], sign[2]);
-          u[3] = _mm_sub_epi32(u[3], sign[3]);
-          u[4] = _mm_sub_epi32(u[4], sign[4]);
-          u[5] = _mm_sub_epi32(u[5], sign[5]);
-          u[6] = _mm_sub_epi32(u[6], sign[6]);
-          u[7] = _mm_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm_add_epi32(u[0], K32One);
-          u[1] = _mm_add_epi32(u[1], K32One);
-          u[2] = _mm_add_epi32(u[2], K32One);
-          u[3] = _mm_add_epi32(u[3], K32One);
-          u[4] = _mm_add_epi32(u[4], K32One);
-          u[5] = _mm_add_epi32(u[5], K32One);
-          u[6] = _mm_add_epi32(u[6], K32One);
-          u[7] = _mm_add_epi32(u[7], K32One);
-
-          u[0] = _mm_srai_epi32(u[0], 2);
-          u[1] = _mm_srai_epi32(u[1], 2);
-          u[2] = _mm_srai_epi32(u[2], 2);
-          u[3] = _mm_srai_epi32(u[3], 2);
-          u[4] = _mm_srai_epi32(u[4], 2);
-          u[5] = _mm_srai_epi32(u[5], 2);
-          u[6] = _mm_srai_epi32(u[6], 2);
-          u[7] = _mm_srai_epi32(u[7], 2);
-
-          out[4] = _mm_packs_epi32(u[0], u[1]);
-          out[20] = _mm_packs_epi32(u[2], u[3]);
-          out[12] = _mm_packs_epi32(u[4], u[5]);
-          out[28] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
-          lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
-          lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
-          lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
-          lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
-          lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
-          lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
-          lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
-          lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
-          lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
-          lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
-          lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
-          lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
-          lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
-          lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
-          lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
-        }
-        {
-          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
-          const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
-          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m128i k32_m12_m20 =
-              pair_set_epi32(-cospi_12_64, -cospi_20_64);
-          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
-          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
-          u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
-          u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
-          u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
-          u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
-          u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
-          u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
-          u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
-          u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
-          u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
-          u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
-          u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
-          u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
-          u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
-          u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
-          u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
-
-          v[0] = k_madd_epi32(u[0], k32_m04_p28);
-          v[1] = k_madd_epi32(u[1], k32_m04_p28);
-          v[2] = k_madd_epi32(u[2], k32_m04_p28);
-          v[3] = k_madd_epi32(u[3], k32_m04_p28);
-          v[4] = k_madd_epi32(u[4], k32_m28_m04);
-          v[5] = k_madd_epi32(u[5], k32_m28_m04);
-          v[6] = k_madd_epi32(u[6], k32_m28_m04);
-          v[7] = k_madd_epi32(u[7], k32_m28_m04);
-          v[8] = k_madd_epi32(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32(u[12], k32_m12_m20);
-          v[13] = k_madd_epi32(u[13], k32_m12_m20);
-          v[14] = k_madd_epi32(u[14], k32_m12_m20);
-          v[15] = k_madd_epi32(u[15], k32_m12_m20);
-          v[16] = k_madd_epi32(u[12], k32_m20_p12);
-          v[17] = k_madd_epi32(u[13], k32_m20_p12);
-          v[18] = k_madd_epi32(u[14], k32_m20_p12);
-          v[19] = k_madd_epi32(u[15], k32_m20_p12);
-          v[20] = k_madd_epi32(u[8], k32_p12_p20);
-          v[21] = k_madd_epi32(u[9], k32_p12_p20);
-          v[22] = k_madd_epi32(u[10], k32_p12_p20);
-          v[23] = k_madd_epi32(u[11], k32_p12_p20);
-          v[24] = k_madd_epi32(u[4], k32_m04_p28);
-          v[25] = k_madd_epi32(u[5], k32_m04_p28);
-          v[26] = k_madd_epi32(u[6], k32_m04_p28);
-          v[27] = k_madd_epi32(u[7], k32_m04_p28);
-          v[28] = k_madd_epi32(u[0], k32_p28_p04);
-          v[29] = k_madd_epi32(u[1], k32_p28_p04);
-          v[30] = k_madd_epi32(u[2], k32_p28_p04);
-          v[31] = k_madd_epi32(u[3], k32_p28_p04);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 7
-        {
-          const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
-          const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
-          const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
-          const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
-          const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
-          const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
-          const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
-          const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
-          u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
-          u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
-          u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
-          u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
-          u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
-          u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
-          u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
-          u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
-          u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
-          u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
-          u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
-          u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
-          u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
-          u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
-          u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
-
-          v[0] = k_madd_epi32(u[0], k32_p30_p02);
-          v[1] = k_madd_epi32(u[1], k32_p30_p02);
-          v[2] = k_madd_epi32(u[2], k32_p30_p02);
-          v[3] = k_madd_epi32(u[3], k32_p30_p02);
-          v[4] = k_madd_epi32(u[4], k32_p14_p18);
-          v[5] = k_madd_epi32(u[5], k32_p14_p18);
-          v[6] = k_madd_epi32(u[6], k32_p14_p18);
-          v[7] = k_madd_epi32(u[7], k32_p14_p18);
-          v[8] = k_madd_epi32(u[8], k32_p22_p10);
-          v[9] = k_madd_epi32(u[9], k32_p22_p10);
-          v[10] = k_madd_epi32(u[10], k32_p22_p10);
-          v[11] = k_madd_epi32(u[11], k32_p22_p10);
-          v[12] = k_madd_epi32(u[12], k32_p06_p26);
-          v[13] = k_madd_epi32(u[13], k32_p06_p26);
-          v[14] = k_madd_epi32(u[14], k32_p06_p26);
-          v[15] = k_madd_epi32(u[15], k32_p06_p26);
-          v[16] = k_madd_epi32(u[12], k32_m26_p06);
-          v[17] = k_madd_epi32(u[13], k32_m26_p06);
-          v[18] = k_madd_epi32(u[14], k32_m26_p06);
-          v[19] = k_madd_epi32(u[15], k32_m26_p06);
-          v[20] = k_madd_epi32(u[8], k32_m10_p22);
-          v[21] = k_madd_epi32(u[9], k32_m10_p22);
-          v[22] = k_madd_epi32(u[10], k32_m10_p22);
-          v[23] = k_madd_epi32(u[11], k32_m10_p22);
-          v[24] = k_madd_epi32(u[4], k32_m18_p14);
-          v[25] = k_madd_epi32(u[5], k32_m18_p14);
-          v[26] = k_madd_epi32(u[6], k32_m18_p14);
-          v[27] = k_madd_epi32(u[7], k32_m18_p14);
-          v[28] = k_madd_epi32(u[0], k32_m02_p30);
-          v[29] = k_madd_epi32(u[1], k32_m02_p30);
-          v[30] = k_madd_epi32(u[2], k32_m02_p30);
-          v[31] = k_madd_epi32(u[3], k32_m02_p30);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[2] = _mm_packs_epi32(u[0], u[1]);
-          out[18] = _mm_packs_epi32(u[2], u[3]);
-          out[10] = _mm_packs_epi32(u[4], u[5]);
-          out[26] = _mm_packs_epi32(u[6], u[7]);
-          out[6] = _mm_packs_epi32(u[8], u[9]);
-          out[22] = _mm_packs_epi32(u[10], u[11]);
-          out[14] = _mm_packs_epi32(u[12], u[13]);
-          out[30] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
-                                      &out[6], &out[22], &out[14], &out[30]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
-          lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
-          lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
-          lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
-          lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
-          lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
-          lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
-          lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
-          lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
-          lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
-          lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
-          lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
-          lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
-          lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
-          lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
-          lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
-          lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
-          lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
-          lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
-          lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
-          lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
-          lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
-          lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
-          lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
-          lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
-          lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
-          lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
-          lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
-          lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
-          lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
-          lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
-          lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
-        }
-        // stage 8
-        {
-          const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
-          const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
-          const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
-          const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
-          const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
-          const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
-          const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
-          const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
-          u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
-          u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
-          u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
-          u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
-          u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
-          u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
-          u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
-          u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
-          u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
-          u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
-          u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
-          u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
-          u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
-          u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
-          u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
-
-          v[0] = k_madd_epi32(u[0], k32_p31_p01);
-          v[1] = k_madd_epi32(u[1], k32_p31_p01);
-          v[2] = k_madd_epi32(u[2], k32_p31_p01);
-          v[3] = k_madd_epi32(u[3], k32_p31_p01);
-          v[4] = k_madd_epi32(u[4], k32_p15_p17);
-          v[5] = k_madd_epi32(u[5], k32_p15_p17);
-          v[6] = k_madd_epi32(u[6], k32_p15_p17);
-          v[7] = k_madd_epi32(u[7], k32_p15_p17);
-          v[8] = k_madd_epi32(u[8], k32_p23_p09);
-          v[9] = k_madd_epi32(u[9], k32_p23_p09);
-          v[10] = k_madd_epi32(u[10], k32_p23_p09);
-          v[11] = k_madd_epi32(u[11], k32_p23_p09);
-          v[12] = k_madd_epi32(u[12], k32_p07_p25);
-          v[13] = k_madd_epi32(u[13], k32_p07_p25);
-          v[14] = k_madd_epi32(u[14], k32_p07_p25);
-          v[15] = k_madd_epi32(u[15], k32_p07_p25);
-          v[16] = k_madd_epi32(u[12], k32_m25_p07);
-          v[17] = k_madd_epi32(u[13], k32_m25_p07);
-          v[18] = k_madd_epi32(u[14], k32_m25_p07);
-          v[19] = k_madd_epi32(u[15], k32_m25_p07);
-          v[20] = k_madd_epi32(u[8], k32_m09_p23);
-          v[21] = k_madd_epi32(u[9], k32_m09_p23);
-          v[22] = k_madd_epi32(u[10], k32_m09_p23);
-          v[23] = k_madd_epi32(u[11], k32_m09_p23);
-          v[24] = k_madd_epi32(u[4], k32_m17_p15);
-          v[25] = k_madd_epi32(u[5], k32_m17_p15);
-          v[26] = k_madd_epi32(u[6], k32_m17_p15);
-          v[27] = k_madd_epi32(u[7], k32_m17_p15);
-          v[28] = k_madd_epi32(u[0], k32_m01_p31);
-          v[29] = k_madd_epi32(u[1], k32_m01_p31);
-          v[30] = k_madd_epi32(u[2], k32_m01_p31);
-          v[31] = k_madd_epi32(u[3], k32_m01_p31);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[1] = _mm_packs_epi32(u[0], u[1]);
-          out[17] = _mm_packs_epi32(u[2], u[3]);
-          out[9] = _mm_packs_epi32(u[4], u[5]);
-          out[25] = _mm_packs_epi32(u[6], u[7]);
-          out[7] = _mm_packs_epi32(u[8], u[9]);
-          out[23] = _mm_packs_epi32(u[10], u[11]);
-          out[15] = _mm_packs_epi32(u[12], u[13]);
-          out[31] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
-                                      &out[7], &out[23], &out[15], &out[31]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
-          const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
-          const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
-          const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
-          const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
-          const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
-          const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
-          const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
-          u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
-          u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
-          u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
-          u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
-          u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
-          u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
-          u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
-          u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
-          u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
-          u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
-          u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
-          u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
-          u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
-          u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
-          u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
-
-          v[0] = k_madd_epi32(u[0], k32_p27_p05);
-          v[1] = k_madd_epi32(u[1], k32_p27_p05);
-          v[2] = k_madd_epi32(u[2], k32_p27_p05);
-          v[3] = k_madd_epi32(u[3], k32_p27_p05);
-          v[4] = k_madd_epi32(u[4], k32_p11_p21);
-          v[5] = k_madd_epi32(u[5], k32_p11_p21);
-          v[6] = k_madd_epi32(u[6], k32_p11_p21);
-          v[7] = k_madd_epi32(u[7], k32_p11_p21);
-          v[8] = k_madd_epi32(u[8], k32_p19_p13);
-          v[9] = k_madd_epi32(u[9], k32_p19_p13);
-          v[10] = k_madd_epi32(u[10], k32_p19_p13);
-          v[11] = k_madd_epi32(u[11], k32_p19_p13);
-          v[12] = k_madd_epi32(u[12], k32_p03_p29);
-          v[13] = k_madd_epi32(u[13], k32_p03_p29);
-          v[14] = k_madd_epi32(u[14], k32_p03_p29);
-          v[15] = k_madd_epi32(u[15], k32_p03_p29);
-          v[16] = k_madd_epi32(u[12], k32_m29_p03);
-          v[17] = k_madd_epi32(u[13], k32_m29_p03);
-          v[18] = k_madd_epi32(u[14], k32_m29_p03);
-          v[19] = k_madd_epi32(u[15], k32_m29_p03);
-          v[20] = k_madd_epi32(u[8], k32_m13_p19);
-          v[21] = k_madd_epi32(u[9], k32_m13_p19);
-          v[22] = k_madd_epi32(u[10], k32_m13_p19);
-          v[23] = k_madd_epi32(u[11], k32_m13_p19);
-          v[24] = k_madd_epi32(u[4], k32_m21_p11);
-          v[25] = k_madd_epi32(u[5], k32_m21_p11);
-          v[26] = k_madd_epi32(u[6], k32_m21_p11);
-          v[27] = k_madd_epi32(u[7], k32_m21_p11);
-          v[28] = k_madd_epi32(u[0], k32_m05_p27);
-          v[29] = k_madd_epi32(u[1], k32_m05_p27);
-          v[30] = k_madd_epi32(u[2], k32_m05_p27);
-          v[31] = k_madd_epi32(u[3], k32_m05_p27);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[5] = _mm_packs_epi32(u[0], u[1]);
-          out[21] = _mm_packs_epi32(u[2], u[3]);
-          out[13] = _mm_packs_epi32(u[4], u[5]);
-          out[29] = _mm_packs_epi32(u[6], u[7]);
-          out[3] = _mm_packs_epi32(u[8], u[9]);
-          out[19] = _mm_packs_epi32(u[10], u[11]);
-          out[11] = _mm_packs_epi32(u[12], u[13]);
-          out[27] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
-                                      &out[3], &out[19], &out[11], &out[27]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-#endif  // FDCT32x32_HIGH_PRECISION
-      // Transpose the results, do it as four 8x8 transposes.
-      {
-        int transpose_block;
-        int16_t *output0 = &intermediate[column_start * 32];
-        tran_low_t *output1 = &output_org[column_start * 32];
-        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
-          __m128i *this_out = &out[8 * transpose_block];
-          // 00 01 02 03 04 05 06 07
-          // 10 11 12 13 14 15 16 17
-          // 20 21 22 23 24 25 26 27
-          // 30 31 32 33 34 35 36 37
-          // 40 41 42 43 44 45 46 47
-          // 50 51 52 53 54 55 56 57
-          // 60 61 62 63 64 65 66 67
-          // 70 71 72 73 74 75 76 77
-          const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
-          const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
-          const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
-          const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
-          const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
-          const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
-          const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
-          const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
-          // 00 10 01 11 02 12 03 13
-          // 20 30 21 31 22 32 23 33
-          // 04 14 05 15 06 16 07 17
-          // 24 34 25 35 26 36 27 37
-          // 40 50 41 51 42 52 43 53
-          // 60 70 61 71 62 72 63 73
-          // 54 54 55 55 56 56 57 57
-          // 64 74 65 75 66 76 67 77
-          const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-          const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-          const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-          const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-          const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-          const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-          const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-          const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-          // 00 10 20 30 01 11 21 31
-          // 40 50 60 70 41 51 61 71
-          // 02 12 22 32 03 13 23 33
-          // 42 52 62 72 43 53 63 73
-          // 04 14 24 34 05 15 21 36
-          // 44 54 64 74 45 55 61 76
-          // 06 16 26 36 07 17 27 37
-          // 46 56 66 76 47 57 67 77
-          __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-          __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-          __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-          __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-          __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-          __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-          __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-          __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-          // 00 10 20 30 40 50 60 70
-          // 01 11 21 31 41 51 61 71
-          // 02 12 22 32 42 52 62 72
-          // 03 13 23 33 43 53 63 73
-          // 04 14 24 34 44 54 64 74
-          // 05 15 25 35 45 55 65 75
-          // 06 16 26 36 46 56 66 76
-          // 07 17 27 37 47 57 67 77
-          if (0 == pass) {
-            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
-            // TODO(cd): see quality impact of only doing
-            //           output[j] = (output[j] + 1) >> 2;
-            //           which would remove the code between here ...
-            __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
-            __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
-            __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
-            __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
-            __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
-            __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
-            __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
-            __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
-            tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
-            tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
-            tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
-            tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
-            tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
-            tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
-            tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
-            tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
-            //           ... and here.
-            //           PS: also change code in av1/encoder/av1_dct.c
-            tr2_0 = _mm_add_epi16(tr2_0, kOne);
-            tr2_1 = _mm_add_epi16(tr2_1, kOne);
-            tr2_2 = _mm_add_epi16(tr2_2, kOne);
-            tr2_3 = _mm_add_epi16(tr2_3, kOne);
-            tr2_4 = _mm_add_epi16(tr2_4, kOne);
-            tr2_5 = _mm_add_epi16(tr2_5, kOne);
-            tr2_6 = _mm_add_epi16(tr2_6, kOne);
-            tr2_7 = _mm_add_epi16(tr2_7, kOne);
-            tr2_0 = _mm_srai_epi16(tr2_0, 2);
-            tr2_1 = _mm_srai_epi16(tr2_1, 2);
-            tr2_2 = _mm_srai_epi16(tr2_2, 2);
-            tr2_3 = _mm_srai_epi16(tr2_3, 2);
-            tr2_4 = _mm_srai_epi16(tr2_4, 2);
-            tr2_5 = _mm_srai_epi16(tr2_5, 2);
-            tr2_6 = _mm_srai_epi16(tr2_6, 2);
-            tr2_7 = _mm_srai_epi16(tr2_7, 2);
-          }
-          // Note: even though all these stores are aligned, using the aligned
-          //       intrinsic make the code slightly slower.
-          if (pass == 0) {
-            _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
-            _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
-            _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
-            _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
-            _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
-            _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
-            _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
-            _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
-            // Process next 8x8
-            output0 += 8;
-          } else {
-            storeu_output(&tr2_0, (output1 + 0 * 32));
-            storeu_output(&tr2_1, (output1 + 1 * 32));
-            storeu_output(&tr2_2, (output1 + 2 * 32));
-            storeu_output(&tr2_3, (output1 + 3 * 32));
-            storeu_output(&tr2_4, (output1 + 4 * 32));
-            storeu_output(&tr2_5, (output1 + 5 * 32));
-            storeu_output(&tr2_6, (output1 + 6 * 32));
-            storeu_output(&tr2_7, (output1 + 7 * 32));
-            // Process next 8x8
-            output1 += 8;
-          }
-        }
-      }
-    }
-  }
-}  // NOLINT
-
-#undef ADD_EPI16
-#undef SUB_EPI16
-#undef HIGH_FDCT32x32_2D_C
-#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c
deleted file mode 100644
index 670f864d0..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-
-#define FDCT32x32_2D_AVX2 aom_fdct32x32_rd_avx2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"
-#undef FDCT32x32_2D_AVX2
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"  // NOLINT
-#undef FDCT32x32_2D_AVX2
-#undef FDCT32x32_HIGH_PRECISION
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
deleted file mode 100644
index 86df4a6f6..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H
-#define AOM_DSP_X86_FWD_TXFM_AVX2_H
-
-#include "./aom_config.h"
-
-static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
-  if (sizeof(tran_low_t) == 4) {
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
-
-    __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
-    __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
-
-    __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
-    __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
-
-    _mm256_storeu_si256((__m256i *)out, y0);
-    _mm256_storeu_si256((__m256i *)(out + 8), y1);
-  } else {
-    _mm256_storeu_si256((__m256i *)out, *coeff);
-  }
-}
-
-#endif  // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
index 7bb1db70a..1e3d13ec8 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -11,7 +11,8 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/txfm_common.h"
 #include "aom_dsp/x86/fwd_txfm_sse2.h"
 #include "aom_dsp/x86/txfm_common_sse2.h"
@@ -29,233 +30,6 @@
 #define SUB_EPI16 _mm_sub_epi16
 #endif
 
-void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // This 2D transform implements 4 vertical 1D transforms followed
-  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
-  // by Chen, Smith and Fralick ('77).  The commands for moving the data
-  // around have been minimized by hand.
-  // For the purposes of the comments, the 16 inputs are referred to at i0
-  // through iF (in raster order), intermediate variables are a0, b0, c0
-  // through f, and correspond to the in-place computations mapped to input
-  // locations.  The outputs, o0 through oF are labeled according to the
-  // output locations.
-
-  // Constants
-  // These are the coefficients used for the multiplies.
-  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
-  // where cospi_N_64 = cos(N pi /64)
-  const __m128i k__cospi_A =
-      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
-                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_B =
-      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
-                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_C =
-      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
-                     cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_D =
-      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
-                     cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_E =
-      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
-                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_F =
-      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
-                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_G =
-      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
-                     -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_H =
-      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
-                     -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
-
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // This second rounding constant saves doing some extra adds at the end
-  const __m128i k__DCT_CONST_ROUNDING2 =
-      _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
-  const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
-  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
-  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
-  __m128i in0, in1;
-#if DCT_HIGH_BIT_DEPTH
-  __m128i cmp0, cmp1;
-  int test, overflow;
-#endif
-
-  // Load inputs.
-  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  // in0 = [i0 i1 i2 i3 iC iD iE iF]
-  // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-  in1 = _mm_unpacklo_epi64(
-      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
-  in0 = _mm_unpacklo_epi64(
-      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-#if DCT_HIGH_BIT_DEPTH
-  // Check inputs small enough to use optimised code
-  cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
-  cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
-  test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
-  if (test) {
-    aom_highbd_fdct4x4_c(input, output, stride);
-    return;
-  }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-  // multiply by 16 to give some extra precision
-  in0 = _mm_slli_epi16(in0, 4);
-  in1 = _mm_slli_epi16(in1, 4);
-  // if (i == 0 && input[0]) input[0] += 1;
-  // add 1 to the upper left pixel if it is non-zero, which helps reduce
-  // the round-trip error
-  {
-    // The mask will only contain whether the first value is zero, all
-    // other comparison will fail as something shifted by 4 (above << 4)
-    // can never be equal to one. To increment in the non-zero case, we
-    // add the mask and one for the first element:
-    //   - if zero, mask = -1, v = v - 1 + 1 = v
-    //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
-    __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
-    in0 = _mm_add_epi16(in0, mask);
-    in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
-  }
-  // There are 4 total stages, alternating between an add/subtract stage
-  // followed by an multiply-and-add stage.
-  {
-    // Stage 1: Add/subtract
-
-    // in0 = [i0 i1 i2 i3 iC iD iE iF]
-    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
-    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
-    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
-    // r1 = [iC i8 iD i9 iE iA iF iB]
-    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
-    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
-    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
-    // r3 = [iC i8 iD i9 iF iB iE iA]
-
-    const __m128i t0 = _mm_add_epi16(r2, r3);
-    const __m128i t1 = _mm_sub_epi16(r2, r3);
-    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
-    // t1 = [aC a8 aD a9 aF aB aE aA]
-
-    // Stage 2: multiply by constants (which gets us into 32 bits).
-    // The constants needed here are:
-    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
-    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
-    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
-    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
-    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
-    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
-    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
-    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
-    // Then add and right-shift to get back to 16-bit range
-    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-    // w0 = [b0 b1 b7 b6]
-    // w1 = [b8 b9 bF bE]
-    // w2 = [b4 b5 b3 b2]
-    // w3 = [bC bD bB bA]
-    const __m128i x0 = _mm_packs_epi32(w0, w1);
-    const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(&x0, &x1);
-    if (overflow) {
-      aom_highbd_fdct4x4_c(input, output, stride);
-      return;
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
-    // x1 = [b4 b5 b3 b2 bC bD bB bA]
-    in0 = _mm_shuffle_epi32(x0, 0xD8);
-    in1 = _mm_shuffle_epi32(x1, 0x8D);
-    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
-    // in1 = [b3 b2 bB bA b4 b5 bC bD]
-  }
-  {
-    // vertical DCTs finished. Now we do the horizontal DCTs.
-    // Stage 3: Add/subtract
-
-    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
-    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
-    const __m128i t0 = ADD_EPI16(in0, in1);
-    const __m128i t1 = SUB_EPI16(in0, in1);
-#if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(&t0, &t1);
-    if (overflow) {
-      aom_highbd_fdct4x4_c(input, output, stride);
-      return;
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-    // Stage 4: multiply by constants (which gets us into 32 bits).
-    {
-      // The constants needed here are:
-      // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
-      // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
-      // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
-      // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
-      const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
-      const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
-      // Then add and right-shift to get back to 16-bit range
-      // but this combines the final right-shift as well to save operations
-      // This unusual rounding operations is to maintain bit-accurate
-      // compatibility with the c version of this function which has two
-      // rounding steps in a row.
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
-      // w0 = [o0 o4 o8 oC]
-      // w1 = [o2 o6 oA oE]
-      // w2 = [o1 o5 o9 oD]
-      // w3 = [o3 o7 oB oF]
-      // remember the o's are numbered according to the correct output location
-      const __m128i x0 = _mm_packs_epi32(w0, w1);
-      const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(&x0, &x1);
-      if (overflow) {
-        aom_highbd_fdct4x4_c(input, output, stride);
-        return;
-      }
-#endif  // DCT_HIGH_BIT_DEPTH
-      {
-        // x0 = [o0 o4 o8 oC o2 o6 oA oE]
-        // x1 = [o1 o5 o9 oD o3 o7 oB oF]
-        const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
-        const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
-        // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
-        // y1 = [o2 o3 o6 o7 oA oB oE oF]
-        in0 = _mm_unpacklo_epi32(y0, y1);
-        // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
-        in1 = _mm_unpackhi_epi32(y0, y1);
-        // in1 = [o8 o9 oA oB oC oD oE oF]
-      }
-    }
-  }
-  // Post-condition (v + 1) >> 2 is now incorporated into previous
-  // add and right-shift commands.  Only 2 store instructions needed
-  // because we are using the fact that 1/3 are stored just after 0/2.
-  storeu_output(&in0, output + 0 * 4);
-  storeu_output(&in1, output + 2 * 4);
-}
-
 void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
   int pass;
   // Constants
@@ -566,449 +340,5 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
   }
 }
 
-void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(16, int16_t, intermediate[256]);
-  const int16_t *in = input;
-  int16_t *out0 = intermediate;
-  tran_low_t *out1 = output;
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-#if DCT_HIGH_BIT_DEPTH
-    int overflow;
-#endif
-    for (column_start = 0; column_start < 16; column_start += 8) {
-      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
-      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
-      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
-      __m128i step1_0, step1_1, step1_2, step1_3;
-      __m128i step1_4, step1_5, step1_6, step1_7;
-      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-      __m128i step3_0, step3_1, step3_2, step3_3;
-      __m128i step3_4, step3_5, step3_6, step3_7;
-      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
-      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
-      // Load and pre-condition input.
-      if (0 == pass) {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
-        // x = x << 2
-        in00 = _mm_slli_epi16(in00, 2);
-        in01 = _mm_slli_epi16(in01, 2);
-        in02 = _mm_slli_epi16(in02, 2);
-        in03 = _mm_slli_epi16(in03, 2);
-        in04 = _mm_slli_epi16(in04, 2);
-        in05 = _mm_slli_epi16(in05, 2);
-        in06 = _mm_slli_epi16(in06, 2);
-        in07 = _mm_slli_epi16(in07, 2);
-        in08 = _mm_slli_epi16(in08, 2);
-        in09 = _mm_slli_epi16(in09, 2);
-        in10 = _mm_slli_epi16(in10, 2);
-        in11 = _mm_slli_epi16(in11, 2);
-        in12 = _mm_slli_epi16(in12, 2);
-        in13 = _mm_slli_epi16(in13, 2);
-        in14 = _mm_slli_epi16(in14, 2);
-        in15 = _mm_slli_epi16(in15, 2);
-      } else {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
-        // x = (x + 1) >> 2
-        in00 = _mm_add_epi16(in00, kOne);
-        in01 = _mm_add_epi16(in01, kOne);
-        in02 = _mm_add_epi16(in02, kOne);
-        in03 = _mm_add_epi16(in03, kOne);
-        in04 = _mm_add_epi16(in04, kOne);
-        in05 = _mm_add_epi16(in05, kOne);
-        in06 = _mm_add_epi16(in06, kOne);
-        in07 = _mm_add_epi16(in07, kOne);
-        in08 = _mm_add_epi16(in08, kOne);
-        in09 = _mm_add_epi16(in09, kOne);
-        in10 = _mm_add_epi16(in10, kOne);
-        in11 = _mm_add_epi16(in11, kOne);
-        in12 = _mm_add_epi16(in12, kOne);
-        in13 = _mm_add_epi16(in13, kOne);
-        in14 = _mm_add_epi16(in14, kOne);
-        in15 = _mm_add_epi16(in15, kOne);
-        in00 = _mm_srai_epi16(in00, 2);
-        in01 = _mm_srai_epi16(in01, 2);
-        in02 = _mm_srai_epi16(in02, 2);
-        in03 = _mm_srai_epi16(in03, 2);
-        in04 = _mm_srai_epi16(in04, 2);
-        in05 = _mm_srai_epi16(in05, 2);
-        in06 = _mm_srai_epi16(in06, 2);
-        in07 = _mm_srai_epi16(in07, 2);
-        in08 = _mm_srai_epi16(in08, 2);
-        in09 = _mm_srai_epi16(in09, 2);
-        in10 = _mm_srai_epi16(in10, 2);
-        in11 = _mm_srai_epi16(in11, 2);
-        in12 = _mm_srai_epi16(in12, 2);
-        in13 = _mm_srai_epi16(in13, 2);
-        in14 = _mm_srai_epi16(in14, 2);
-        in15 = _mm_srai_epi16(in15, 2);
-      }
-      in += 8;
-      // Calculate input for the first 8 results.
-      {
-        input0 = ADD_EPI16(in00, in15);
-        input1 = ADD_EPI16(in01, in14);
-        input2 = ADD_EPI16(in02, in13);
-        input3 = ADD_EPI16(in03, in12);
-        input4 = ADD_EPI16(in04, in11);
-        input5 = ADD_EPI16(in05, in10);
-        input6 = ADD_EPI16(in06, in09);
-        input7 = ADD_EPI16(in07, in08);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
-                                           &input4, &input5, &input6, &input7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      // Calculate input for the next 8 results.
-      {
-        step1_0 = SUB_EPI16(in07, in08);
-        step1_1 = SUB_EPI16(in06, in09);
-        step1_2 = SUB_EPI16(in05, in10);
-        step1_3 = SUB_EPI16(in04, in11);
-        step1_4 = SUB_EPI16(in03, in12);
-        step1_5 = SUB_EPI16(in02, in13);
-        step1_6 = SUB_EPI16(in01, in14);
-        step1_7 = SUB_EPI16(in00, in15);
-#if DCT_HIGH_BIT_DEPTH
-        overflow =
-            check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
-                                    &step1_4, &step1_5, &step1_6, &step1_7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        // Add/subtract
-        const __m128i q0 = ADD_EPI16(input0, input7);
-        const __m128i q1 = ADD_EPI16(input1, input6);
-        const __m128i q2 = ADD_EPI16(input2, input5);
-        const __m128i q3 = ADD_EPI16(input3, input4);
-        const __m128i q4 = SUB_EPI16(input3, input4);
-        const __m128i q5 = SUB_EPI16(input2, input5);
-        const __m128i q6 = SUB_EPI16(input1, input6);
-        const __m128i q7 = SUB_EPI16(input0, input7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow =
-            check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        // Work on first four results
-        {
-          // Add/subtract
-          const __m128i r0 = ADD_EPI16(q0, q3);
-          const __m128i r1 = ADD_EPI16(q1, q2);
-          const __m128i r2 = SUB_EPI16(q1, q2);
-          const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          {
-            const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-            const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-            const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-            const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-            res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
-            if (overflow) {
-              aom_highbd_fdct16x16_c(input, output, stride);
-              return;
-            }
-#endif  // DCT_HIGH_BIT_DEPTH
-          }
-        }
-        // Work on next four results
-        {
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-          const __m128i r0 =
-              mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
-                               &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          const __m128i r1 =
-              mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
-                               &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x2(&r0, &r1);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          {
-            // Add/subtract
-            const __m128i x0 = ADD_EPI16(q4, r0);
-            const __m128i x1 = SUB_EPI16(q4, r0);
-            const __m128i x2 = SUB_EPI16(q7, r1);
-            const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
-            if (overflow) {
-              aom_highbd_fdct16x16_c(input, output, stride);
-              return;
-            }
-#endif  // DCT_HIGH_BIT_DEPTH
-            // Interleave to do the multiply by constants which gets us
-            // into 32 bits.
-            {
-              const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-              const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-              const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-              const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-              res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-              overflow =
-                  check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
-              if (overflow) {
-                aom_highbd_fdct16x16_c(input, output, stride);
-                return;
-              }
-#endif  // DCT_HIGH_BIT_DEPTH
-            }
-          }
-        }
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 3
-        {
-          step3_0 = ADD_EPI16(step1_0, step2_3);
-          step3_1 = ADD_EPI16(step1_1, step2_2);
-          step3_2 = SUB_EPI16(step1_1, step2_2);
-          step3_3 = SUB_EPI16(step1_0, step2_3);
-          step3_4 = SUB_EPI16(step1_7, step2_4);
-          step3_5 = SUB_EPI16(step1_6, step2_5);
-          step3_6 = ADD_EPI16(step1_6, step2_5);
-          step3_7 = ADD_EPI16(step1_7, step2_4);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
-                                      &step3_4, &step3_5, &step3_6, &step3_7);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 4
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
-          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
-          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
-          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 5
-        {
-          step1_0 = ADD_EPI16(step3_0, step2_1);
-          step1_1 = SUB_EPI16(step3_0, step2_1);
-          step1_2 = ADD_EPI16(step3_3, step2_2);
-          step1_3 = SUB_EPI16(step3_3, step2_2);
-          step1_4 = SUB_EPI16(step3_4, step2_5);
-          step1_5 = ADD_EPI16(step3_4, step2_5);
-          step1_6 = SUB_EPI16(step3_7, step2_6);
-          step1_7 = ADD_EPI16(step3_7, step2_6);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
-                                      &step1_4, &step1_5, &step1_6, &step1_7);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 6
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-      // Transpose the results, do it as two 8x8 transposes.
-      transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
-                              &res06, &res07, pass, out0, out1);
-      transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
-                              &res14, &res15, pass, out0 + 8, out1 + 8);
-      if (pass == 0) {
-        out0 += 8 * 16;
-      } else {
-        out1 += 8 * 16;
-      }
-    }
-    // Setup in/out for next pass.
-    in = intermediate;
-  }
-}
-
 #undef ADD_EPI16
 #undef SUB_EPI16
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
index 657dcfa22..2d8f8f71e 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
@@ -11,40 +11,12 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/x86/fwd_txfm_sse2.h"
 
-void aom_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
-  __m128i in0, in1;
-  __m128i tmp;
-  const __m128i zero = _mm_setzero_si128();
-  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  in1 = _mm_unpacklo_epi64(
-      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
-  in0 = _mm_unpacklo_epi64(
-      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-
-  tmp = _mm_add_epi16(in0, in1);
-  in0 = _mm_unpacklo_epi16(zero, tmp);
-  in1 = _mm_unpackhi_epi16(zero, tmp);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  tmp = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(tmp, zero);
-  in1 = _mm_unpackhi_epi32(tmp, zero);
-
-  tmp = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(tmp, 8);
-
-  in1 = _mm_add_epi32(tmp, in0);
-  in0 = _mm_slli_epi32(in1, 1);
-  output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
-}
-
 void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
   __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
   __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
@@ -86,47 +58,12 @@ void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
 }
 
 #define DCT_HIGH_BIT_DEPTH 0
-#define FDCT4x4_2D aom_fdct4x4_sse2
 #define FDCT8x8_2D aom_fdct8x8_sse2
-#define FDCT16x16_2D aom_fdct16x16_sse2
 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
-#undef FDCT4x4_2D
 #undef FDCT8x8_2D
-#undef FDCT16x16_2D
 
-#define FDCT32x32_2D aom_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D aom_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
 #undef DCT_HIGH_BIT_DEPTH
-
-#if CONFIG_HIGHBITDEPTH
 #define DCT_HIGH_BIT_DEPTH 1
-#define FDCT4x4_2D aom_highbd_fdct4x4_sse2
 #define FDCT8x8_2D aom_highbd_fdct8x8_sse2
-#define FDCT16x16_2D aom_highbd_fdct16x16_sse2
 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
-#undef FDCT4x4_2D
 #undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D aom_highbd_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
index 58e8971dd..12ccf7f26 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -12,15 +12,10 @@
 #ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_
 #define AOM_DSP_X86_FWD_TXFM_SSE2_H_
 
-#include "aom_dsp/x86/txfm_common_intrin.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define pair_set_epi32(a, b) \
-  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
-
 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
   __m128i buf0, buf1;
   buf0 = _mm_mul_epu32(a, b);
@@ -140,112 +135,6 @@ static INLINE int check_epi16_overflow_x32(
   return res0 + res1;
 }
 
-static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
-                                           const __m128i *preg1,
-                                           const __m128i *preg2,
-                                           const __m128i *preg3,
-                                           const __m128i *zero) {
-  __m128i minus_one = _mm_set1_epi32(-1);
-  // Check for overflows
-  __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
-  __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
-  __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
-  __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
-  __m128i reg0_top_dwords =
-      _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i reg1_top_dwords =
-      _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i reg2_top_dwords =
-      _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i reg3_top_dwords =
-      _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
-  __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
-  __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
-  __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
-  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
-  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
-  int overflow_01 =
-      _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01));
-  int overflow_23 =
-      _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23));
-  return (overflow_01 + overflow_23);
-}
-
-static INLINE int k_check_epi32_overflow_8(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
-  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
-  if (!overflow) {
-    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
-  }
-  return overflow;
-}
-
-static INLINE int k_check_epi32_overflow_16(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
-    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
-    const __m128i *preg15, const __m128i *zero) {
-  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
-  if (!overflow) {
-    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
-    if (!overflow) {
-      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
-      if (!overflow) {
-        overflow =
-            k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
-      }
-    }
-  }
-  return overflow;
-}
-
-static INLINE int k_check_epi32_overflow_32(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
-    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
-    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
-    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
-    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
-    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
-    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
-    const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
-  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
-  if (!overflow) {
-    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
-    if (!overflow) {
-      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
-      if (!overflow) {
-        overflow =
-            k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
-        if (!overflow) {
-          overflow =
-              k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
-          if (!overflow) {
-            overflow =
-                k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
-            if (!overflow) {
-              overflow = k_check_epi32_overflow_4(preg24, preg25, preg26,
-                                                  preg27, zero);
-              if (!overflow) {
-                overflow = k_check_epi32_overflow_4(preg28, preg29, preg30,
-                                                    preg31, zero);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  return overflow;
-}
-
 static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
   if (sizeof(tran_low_t) == 4) {
     const __m128i zero = _mm_setzero_si128();
@@ -259,102 +148,6 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
   }
 }
 
-static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
-                                       const __m128i *pmultiplier,
-                                       const __m128i *prounding, int shift) {
-  const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
-  const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
-  const __m128i v0 = _mm_add_epi32(u0, *prounding);
-  const __m128i v1 = _mm_add_epi32(u1, *prounding);
-  const __m128i w0 = _mm_srai_epi32(v0, shift);
-  const __m128i w1 = _mm_srai_epi32(v1, shift);
-  return _mm_packs_epi32(w0, w1);
-}
-
-static INLINE void transpose_and_output8x8(
-    const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
-    const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
-    const __m128i *pin06, const __m128i *pin07, int pass, int16_t *out0_ptr,
-    tran_low_t *out1_ptr) {
-  // 00 01 02 03 04 05 06 07
-  // 10 11 12 13 14 15 16 17
-  // 20 21 22 23 24 25 26 27
-  // 30 31 32 33 34 35 36 37
-  // 40 41 42 43 44 45 46 47
-  // 50 51 52 53 54 55 56 57
-  // 60 61 62 63 64 65 66 67
-  // 70 71 72 73 74 75 76 77
-  const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
-  // 00 10 01 11 02 12 03 13
-  // 20 30 21 31 22 32 23 33
-  // 04 14 05 15 06 16 07 17
-  // 24 34 25 35 26 36 27 37
-  // 40 50 41 51 42 52 43 53
-  // 60 70 61 71 62 72 63 73
-  // 54 54 55 55 56 56 57 57
-  // 64 74 65 75 66 76 67 77
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-  // 00 10 20 30 01 11 21 31
-  // 40 50 60 70 41 51 61 71
-  // 02 12 22 32 03 13 23 33
-  // 42 52 62 72 43 53 63 73
-  // 04 14 24 34 05 15 21 36
-  // 44 54 64 74 45 55 61 76
-  // 06 16 26 36 07 17 27 37
-  // 46 56 66 76 47 57 67 77
-  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-  // 00 10 20 30 40 50 60 70
-  // 01 11 21 31 41 51 61 71
-  // 02 12 22 32 42 52 62 72
-  // 03 13 23 33 43 53 63 73
-  // 04 14 24 34 44 54 64 74
-  // 05 15 25 35 45 55 65 75
-  // 06 16 26 36 46 56 66 76
-  // 07 17 27 37 47 57 67 77
-  if (pass == 0) {
-    _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
-  } else {
-    storeu_output(&tr2_0, (out1_ptr + 0 * 16));
-    storeu_output(&tr2_1, (out1_ptr + 1 * 16));
-    storeu_output(&tr2_2, (out1_ptr + 2 * 16));
-    storeu_output(&tr2_3, (out1_ptr + 3 * 16));
-    storeu_output(&tr2_4, (out1_ptr + 4 * 16));
-    storeu_output(&tr2_5, (out1_ptr + 5 * 16));
-    storeu_output(&tr2_6, (out1_ptr + 6 * 16));
-    storeu_output(&tr2_7, (out1_ptr + 7 * 16));
-  }
-}
-
-void fdct32_8col(__m128i *in0, __m128i *in1);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
index 8fa1c04d0..c1fb259a1 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -13,10 +13,6 @@
 
 %include "third_party/x86inc/x86inc.asm"
 
-; This file provides SSSE3 version of the forward transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
 SECTION_RODATA
 
 pw_11585x2: times 8 dw 23170
@@ -32,106 +28,7 @@ TRANSFORM_COEFFS 15137,   6270
 TRANSFORM_COEFFS 16069,   3196
 TRANSFORM_COEFFS  9102,  13623
 
-SECTION .text
-
-%if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd            m%1, m%3, %5
-  pmaddwd            m%2, m%3, %6
-  paddd              m%1,  %4
-  paddd              m%2,  %4
-  psrad              m%1,  14
-  psrad              m%2,  14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 1
-  SUM_SUB            0,  7,  9
-  SUM_SUB            1,  6,  9
-  SUM_SUB            2,  5,  9
-  SUM_SUB            3,  4,  9
-
-  SUM_SUB            0,  3,  9
-  SUM_SUB            1,  2,  9
-  SUM_SUB            6,  5,  9
-%if %1 == 0
-  SUM_SUB            0,  1,  9
-%endif
-
-  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
-
-  pmulhrsw           m6, m12
-  pmulhrsw           m5, m12
-%if %1 == 0
-  pmulhrsw           m0, m12
-  pmulhrsw           m1, m12
-%else
-  BUTTERFLY_4X       1,  0,  11585, 11585,  m8,  9,  10
-  SWAP               0,  1
-%endif
-
-  SUM_SUB            4,  5,  9
-  SUM_SUB            7,  6,  9
-  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
-  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
-  SWAP               1,  4
-  SWAP               3,  6
-%endmacro
-
-%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
-  psraw              m%3, m%1, 15
-  psraw              m%4, m%2, 15
-  psubw              m%1, m%3
-  psubw              m%2, m%4
-  psraw              m%1, 1
-  psraw              m%2, 1
-%endmacro
-
 %macro STORE_OUTPUT 2 ; index, result
-%if CONFIG_HIGHBITDEPTH
   ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
   ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
   ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
@@ -144,16 +41,16 @@ SECTION .text
   punpckhwd          m12, m11
   mova               [outputq + 4*%1 +  0], m%2
   mova               [outputq + 4*%1 + 16], m12
-%else
-  mova               [outputq + 2*%1], m%2
-%endif
 %endmacro
 
+SECTION .text
+
+%if ARCH_X86_64
 INIT_XMM ssse3
 cglobal fdct8x8, 3, 5, 13, input, output, stride
 
-  mova               m8, [pd_8192]
-  mova              m12, [pw_11585x2]
+  mova               m8, [GLOBAL(pd_8192)]
+  mova              m12, [GLOBAL(pw_11585x2)]
 
   lea                r3, [2 * strideq]
   lea                r4, [4 * strideq]
@@ -180,25 +77,303 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
   psllw              m7, 2
 
   ; column transform
-  FDCT8_1D  0
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  FDCT8_1D  1
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  DIVIDE_ROUND_2X   0, 1, 9, 10
-  DIVIDE_ROUND_2X   2, 3, 9, 10
-  DIVIDE_ROUND_2X   4, 5, 9, 10
-  DIVIDE_ROUND_2X   6, 7, 9, 10
-
-  STORE_OUTPUT       0, 0
-  STORE_OUTPUT       8, 1
-  STORE_OUTPUT      16, 2
-  STORE_OUTPUT      24, 3
-  STORE_OUTPUT      32, 4
-  STORE_OUTPUT      40, 5
-  STORE_OUTPUT      48, 6
-  STORE_OUTPUT      56, 7
+  ; stage 1
+  paddw m10, m0, m7
+  psubw m0, m7
+
+  paddw m9, m1, m6
+  psubw m1, m6
+
+  paddw m7, m2, m5
+  psubw m2, m5
+
+  paddw m6, m3, m4
+  psubw m3, m4
+
+  ; stage 2
+  paddw m5, m9, m7
+  psubw m9, m7
+
+  paddw m4, m10, m6
+  psubw m10, m6
+
+  paddw m7, m1, m2
+  psubw m1, m2
+
+  ; stage 3
+  paddw m6, m4, m5
+  psubw m4, m5
+
+  pmulhrsw m1, m12
+  pmulhrsw m7, m12
+
+  ; sin(pi / 8), cos(pi / 8)
+  punpcklwd m2, m10, m9
+  punpckhwd m10, m9
+  pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
+  pmaddwd m2, [GLOBAL(pw_6270_m15137)]
+  pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
+  pmaddwd m10, [GLOBAL(pw_6270_m15137)]
+  paddd m5, m8
+  paddd m2, m8
+  paddd m9, m8
+  paddd m10, m8
+  psrad m5, 14
+  psrad m2, 14
+  psrad m9, 14
+  psrad m10, 14
+  packssdw m5, m9
+  packssdw m2, m10
+
+  pmulhrsw m6, m12
+  pmulhrsw m4, m12
+
+  paddw m9, m3, m1
+  psubw m3, m1
+
+  paddw m10, m0, m7
+  psubw m0, m7
+
+  ; stage 4
+  ; sin(pi / 16), cos(pi / 16)
+  punpcklwd m1, m10, m9
+  punpckhwd m10, m9
+  pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
+  pmaddwd m1, [GLOBAL(pw_3196_m16069)]
+  pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
+  pmaddwd m10, [GLOBAL(pw_3196_m16069)]
+  paddd m7, m8
+  paddd m1, m8
+  paddd m9, m8
+  paddd m10, m8
+  psrad m7, 14
+  psrad m1, 14
+  psrad m9, 14
+  psrad m10, 14
+  packssdw m7, m9
+  packssdw m1, m10
+
+  ; sin(3 * pi / 16), cos(3 * pi / 16)
+  punpcklwd m11, m0, m3
+  punpckhwd m0, m3
+  pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
+  pmaddwd m11, [GLOBAL(pw_13623_m9102)]
+  pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
+  pmaddwd m0, [GLOBAL(pw_13623_m9102)]
+  paddd m9, m8
+  paddd m11, m8
+  paddd m3, m8
+  paddd m0, m8
+  psrad m9, 14
+  psrad m11, 14
+  psrad m3, 14
+  psrad m0, 14
+  packssdw m9, m3
+  packssdw m11, m0
+
+  ; transpose
+  ; stage 1
+  punpcklwd m0, m6, m7
+  punpcklwd m3, m5, m11
+  punpckhwd m6, m7
+  punpckhwd m5, m11
+  punpcklwd m7, m4, m9
+  punpcklwd m10, m2, m1
+  punpckhwd m4, m9
+  punpckhwd m2, m1
+
+  ; stage 2
+  punpckldq m9, m0, m3
+  punpckldq m1, m6, m5
+  punpckhdq m0, m3
+  punpckhdq m6, m5
+  punpckldq m3, m7, m10
+  punpckldq m5, m4, m2
+  punpckhdq m7, m10
+  punpckhdq m4, m2
+
+  ; stage 3
+  punpcklqdq m10, m9, m3
+  punpckhqdq m9, m3
+  punpcklqdq m2, m0, m7
+  punpckhqdq m0, m7
+  punpcklqdq m3, m1, m5
+  punpckhqdq m1, m5
+  punpcklqdq m7, m6, m4
+  punpckhqdq m6, m4
+
+  ; row transform
+  ; stage 1
+  paddw m5, m10, m6
+  psubw m10, m6
+
+  paddw m4, m9, m7
+  psubw m9, m7
+
+  paddw m6, m2, m1
+  psubw m2, m1
+
+  paddw m7, m0, m3
+  psubw m0, m3
+
+  ;stage 2
+  paddw m1, m5, m7
+  psubw m5, m7
+
+  paddw m3, m4, m6
+  psubw m4, m6
+
+  paddw m7, m9, m2
+  psubw m9, m2
+
+  ; stage 3
+  punpcklwd m6, m1, m3
+  punpckhwd m1, m3
+  pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
+  pmaddwd m6, [GLOBAL(pw_11585_m11585)]
+  pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
+  pmaddwd m1, [GLOBAL(pw_11585_m11585)]
+  paddd m2, m8
+  paddd m6, m8
+  paddd m3, m8
+  paddd m1, m8
+  psrad m2, 14
+  psrad m6, 14
+  psrad m3, 14
+  psrad m1, 14
+  packssdw m2, m3
+  packssdw m6, m1
+
+  pmulhrsw m7, m12
+  pmulhrsw m9, m12
+
+  punpcklwd m3, m5, m4
+  punpckhwd m5, m4
+  pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
+  pmaddwd m3, [GLOBAL(pw_6270_m15137)]
+  pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
+  pmaddwd m5, [GLOBAL(pw_6270_m15137)]
+  paddd m1, m8
+  paddd m3, m8
+  paddd m4, m8
+  paddd m5, m8
+  psrad m1, 14
+  psrad m3, 14
+  psrad m4, 14
+  psrad m5, 14
+  packssdw m1, m4
+  packssdw m3, m5
+
+  paddw m4, m0, m9
+  psubw m0, m9
+
+  paddw m5, m10, m7
+  psubw m10, m7
+
+  ; stage 4
+  punpcklwd m9, m5, m4
+  punpckhwd m5, m4
+  pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
+  pmaddwd m9, [GLOBAL(pw_3196_m16069)]
+  pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
+  pmaddwd m5, [GLOBAL(pw_3196_m16069)]
+  paddd m7, m8
+  paddd m9, m8
+  paddd m4, m8
+  paddd m5, m8
+  psrad m7, 14
+  psrad m9, 14
+  psrad m4, 14
+  psrad m5, 14
+  packssdw m7, m4
+  packssdw m9, m5
+
+  punpcklwd m4, m10, m0
+  punpckhwd m10, m0
+  pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
+  pmaddwd m4, [GLOBAL(pw_13623_m9102)]
+  pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
+  pmaddwd m10, [GLOBAL(pw_13623_m9102)]
+  paddd m5, m8
+  paddd m4, m8
+  paddd m0, m8
+  paddd m10, m8
+  psrad m5, 14
+  psrad m4, 14
+  psrad m0, 14
+  psrad m10, 14
+  packssdw m5, m0
+  packssdw m4, m10
+
+  ; transpose
+  ; stage 1
+  punpcklwd m0, m2, m7
+  punpcklwd m10, m1, m4
+  punpckhwd m2, m7
+  punpckhwd m1, m4
+  punpcklwd m7, m6, m5
+  punpcklwd m4, m3, m9
+  punpckhwd m6, m5
+  punpckhwd m3, m9
+
+  ; stage 2
+  punpckldq m5, m0, m10
+  punpckldq m9, m2, m1
+  punpckhdq m0, m10
+  punpckhdq m2, m1
+  punpckldq m10, m7, m4
+  punpckldq m1, m6, m3
+  punpckhdq m7, m4
+  punpckhdq m6, m3
+
+  ; stage 3
+  punpcklqdq m4, m5, m10
+  punpckhqdq m5, m10
+  punpcklqdq m3, m0, m7
+  punpckhqdq m0, m7
+  punpcklqdq m10, m9, m1
+  punpckhqdq m9, m1
+  punpcklqdq m7, m2, m6
+  punpckhqdq m2, m6
+
+  psraw m1, m4, 15
+  psraw m6, m5, 15
+  psraw m8, m3, 15
+  psraw m11, m0, 15
+
+  psubw m4, m1
+  psubw m5, m6
+  psubw m3, m8
+  psubw m0, m11
+
+  psraw m4, 1
+  psraw m5, 1
+  psraw m3, 1
+  psraw m0, 1
+
+  psraw m1, m10, 15
+  psraw m6, m9, 15
+  psraw m8, m7, 15
+  psraw m11, m2, 15
+
+  psubw m10, m1
+  psubw m9, m6
+  psubw m7, m8
+  psubw m2, m11
+
+  psraw m10, 1
+  psraw m9, 1
+  psraw m7, 1
+  psraw m2, 1
+
+  STORE_OUTPUT  0,  4
+  STORE_OUTPUT  8,  5
+  STORE_OUTPUT 16,  3
+  STORE_OUTPUT 24,  0
+  STORE_OUTPUT 32, 10
+  STORE_OUTPUT 40,  9
+  STORE_OUTPUT 48,  7
+  STORE_OUTPUT 56,  2
 
   RET
 %endif
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
index 60446b086..99f17ebdf 100644
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
@@ -13,6 +13,8 @@
 
 %include "aom_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
 ;                                            int ref_stride,
 ;                                            unsigned char *src,
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
index a99c0b40e..2a018c1cf 100644
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
@@ -11,8 +11,9 @@
 
 #include <assert.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
index 133640eb7..e5e3238d5 100644
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
@@ -11,8 +11,11 @@
 #include <immintrin.h>
 #include <string.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
 
 // -----------------------------------------------------------------------------
 // Copy and average
@@ -100,103 +103,258 @@ void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-void aom_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride,
-                                  uint8_t *dst8, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int filter_x_stride,
-                                  const int16_t *filter_y, int filter_y_stride,
-                                  int width, int h, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
+                                   uint16_t *dst, int dst_stride, int w, int h,
+                                   InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params, int bd) {
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  (void)conv_params;
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+  __m256i s[8], coeffs_y[4];
+
+  const int bits = FILTER_BITS;
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m256i zero = _mm256_setzero_si256();
+
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    const uint16_t *data = &src_ptr[j];
+    /* Vertical filter */
+    {
+      __m256i src6;
+      __m256i s01 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+          0x20);
+      __m256i s12 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+          0x20);
+      __m256i s23 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+          0x20);
+      __m256i s34 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+          0x20);
+      __m256i s45 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+          0x20);
+      src6 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+      __m256i s56 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+          src6, 0x20);
+
+      s[0] = _mm256_unpacklo_epi16(s01, s12);
+      s[1] = _mm256_unpacklo_epi16(s23, s34);
+      s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+      s[4] = _mm256_unpackhi_epi16(s01, s12);
+      s[5] = _mm256_unpackhi_epi16(s23, s34);
+      s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+
+        const __m256i s67 = _mm256_permute2x128_si256(
+            src6,
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            0x20);
+
+        src6 = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+        const __m256i s78 = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            src6, 0x20);
+
+        s[3] = _mm256_unpacklo_epi16(s67, s78);
+        s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+        const __m256i res_a = convolve(s, coeffs_y);
+
+        __m256i res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+        if (w - j > 4) {
+          const __m256i res_b = convolve(s + 4, coeffs_y);
+          __m256i res_b_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
+
+          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+          res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                           _mm256_castsi256_si128(res_16bit));
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           _mm256_extracti128_si256(res_16bit, 1));
+        } else if (w == 4) {
+          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+          res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+                           _mm256_castsi256_si128(res_a_round));
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           _mm256_extracti128_si256(res_a_round, 1));
+        } else {
+          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+          res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+          xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+                       _mm256_castsi256_si128(res_a_round));
+          xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                       _mm256_extracti128_si256(res_a_round, 1));
+        }
+
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+    }
+  }
+}
 
-  assert(width % 4 == 0);
-  if (width > 32) {  // width = 64
-    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
-      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
-      src += src_stride;
-      u0 = _mm256_loadu_si256((const __m256i *)dst);
-      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
-      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
-      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
-      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
-      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
-      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
-      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 16) {  // width = 32
-    __m256i p0, p1, u0, u1;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      src += src_stride;
-      u0 = _mm256_loadu_si256((const __m256i *)dst);
-      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
-      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
-      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 8) {  // width = 16
-    __m256i p0, p1, u0, u1;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
-      src += src_stride << 1;
-      u0 = _mm256_loadu_si256((const __m256i *)dst);
-      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
-
-      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
-      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
-                          _mm256_avg_epu16(p1, u1));
-      dst += dst_stride << 1;
-      h -= 2;
-    } while (h > 0);
-  } else if (width > 4) {  // width = 8
-    __m128i p0, p1, u0, u1;
-    do {
-      p0 = _mm_loadu_si128((const __m128i *)src);
-      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
-      src += src_stride << 1;
-      u0 = _mm_loadu_si128((const __m128i *)dst);
-      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
-
-      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
-      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
-      dst += dst_stride << 1;
-      h -= 2;
-    } while (h > 0);
-  } else {  // width = 4
-    __m128i p0, p1, u0, u1;
-    do {
-      p0 = _mm_loadl_epi64((const __m128i *)src);
-      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
-      src += src_stride << 1;
-      u0 = _mm_loadl_epi64((const __m128i *)dst);
-      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
-
-      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
-      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
-      dst += dst_stride << 1;
-      h -= 2;
-    } while (h > 0);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
+                                   uint16_t *dst, int dst_stride, int w, int h,
+                                   InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params, int bd) {
+  int i, j;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_horiz;
+  (void)subpel_y_q4;
+  (void)filter_params_y;
+
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  __m256i s[4], coeffs_x[4];
+
+  const __m256i round_const_x =
+      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m256i zero = _mm256_setzero_si256();
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    for (i = 0; i < h; i += 2) {
+      const __m256i row0 =
+          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+      __m256i row1 =
+          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+      // even pixels
+      s[0] = _mm256_alignr_epi8(r1, r0, 0);
+      s[1] = _mm256_alignr_epi8(r1, r0, 4);
+      s[2] = _mm256_alignr_epi8(r1, r0, 8);
+      s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+      __m256i res_even = convolve(s, coeffs_x);
+      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+                                  round_shift_x);
+
+      // odd pixels
+      s[0] = _mm256_alignr_epi8(r1, r0, 2);
+      s[1] = _mm256_alignr_epi8(r1, r0, 6);
+      s[2] = _mm256_alignr_epi8(r1, r0, 10);
+      s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+      __m256i res_odd = convolve(s, coeffs_x);
+      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+                                 round_shift_x);
+
+      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
+                                  round_shift_bits);
+      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
+                                 round_shift_bits);
+
+      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+
+      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+      res = _mm256_min_epi16(res, clip_pixel);
+      res = _mm256_max_epi16(res, zero);
+
+      if (w - j > 4) {
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                         _mm256_castsi256_si128(res));
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                         _mm256_extracti128_si256(res, 1));
+      } else if (w == 4) {
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+                         _mm256_castsi256_si128(res));
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                         _mm256_extracti128_si256(res, 1));
+      } else {
+        xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+                     _mm256_castsi256_si128(res));
+        xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                     _mm256_extracti128_si256(res, 1));
+      }
+    }
   }
 }
 
+#define CONV8_ROUNDING_BITS (7)
+
 // -----------------------------------------------------------------------------
 // Horizontal and vertical filtering
 
-#define CONV8_ROUNDING_BITS (7)
-
 static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                               7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                               4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
@@ -817,250 +975,6 @@ static void aom_highbd_filter_block1d8_v2_avx2(
   } while (height > 0);
 }
 
-// Calculation with averaging the input pixels
-
-static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
-                                        uint16_t *dst) {
-  const __m128i a0 = _mm256_castsi256_si128(*y0);
-  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
-  __m128i res = _mm_packus_epi32(a0, a1);
-  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
-  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
-  res = _mm_avg_epu16(res, pix);
-  _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
-                                        const __m256i *mask, uint16_t *dst,
-                                        ptrdiff_t pitch) {
-  __m256i a = _mm256_packus_epi32(*y0, *y1);
-  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
-  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
-  const __m256i pix =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
-  a = _mm256_min_epi16(a, *mask);
-  a = _mm256_avg_epu16(a, pix);
-  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
-  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
-}
-
-static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
-                                         const __m256i *mask, uint16_t *dst) {
-  __m256i a = _mm256_packus_epi32(*y0, *y1);
-  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
-  a = _mm256_min_epi16(a, *mask);
-  a = _mm256_avg_epu16(a, pix);
-  _mm256_storeu_si256((__m256i *)dst, a);
-}
-
-static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
-                                         const __m256i *mask, uint16_t *dst,
-                                         ptrdiff_t pitch) {
-  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
-  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
-  __m256i p = _mm256_min_epi16(*y0, *mask);
-  p = _mm256_avg_epu16(p, pix0);
-  _mm256_storeu_si256((__m256i *)dst, p);
-
-  p = _mm256_min_epi16(*y1, *mask);
-  p = _mm256_avg_epu16(p, pix1);
-  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
-}
-
-static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
-                                               const __m128i *y1,
-                                               const __m128i *mask,
-                                               uint16_t *dst) {
-  __m128i res = _mm_packus_epi32(*y0, *y1);
-  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
-  res = _mm_min_epi16(res, *mask);
-  res = _mm_avg_epu16(res, pix);
-  _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static void aom_highbd_filter_block1d8_h8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[8], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  src_ptr -= 3;
-  do {
-    pack_8x2_pixels(src_ptr, src_pitch, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    filter_8x1_pixels(&signal[4], ff, &res1);
-    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    height -= 2;
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-  } while (height > 1);
-
-  if (height > 0) {
-    pack_8x1_pixels(src_ptr, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    store_8x1_avg_pixels(&res0, &max, dst_ptr);
-  }
-}
-
-static void aom_highbd_filter_block1d16_h8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[8], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  src_ptr -= 3;
-  do {
-    pack_16x1_pixels(src_ptr, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    filter_8x1_pixels(&signal[4], ff, &res1);
-    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
-    height -= 1;
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d8_v8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[9], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  pack_8x9_init(src_ptr, src_pitch, signal);
-
-  do {
-    pack_8x9_pixels(src_ptr, src_pitch, signal);
-
-    filter_8x9_pixels(signal, ff, &res0, &res1);
-    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    update_pixels(signal);
-
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-    height -= 2;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d16_v8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[17], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  pack_16x9_init(src_ptr, src_pitch, signal);
-
-  do {
-    pack_16x9_pixels(src_ptr, src_pitch, signal);
-    filter_16x9_pixels(signal, ff, &res0, &res1);
-    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    update_16x9_pixels(signal);
-
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-    height -= 2;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d8_h2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[2], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff;
-  pack_2t_filter(filter, &ff);
-
-  src_ptr -= 3;
-  do {
-    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
-    filter_16_2t_pixels(signal, &ff, &res0, &res1);
-    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    height -= 2;
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-  } while (height > 1);
-
-  if (height > 0) {
-    pack_8x1_2t_pixels(src_ptr, signal);
-    filter_8x1_2t_pixels(signal, &ff, &res0);
-    store_8x1_avg_pixels(&res0, &max, dst_ptr);
-  }
-}
-
-static void aom_highbd_filter_block1d16_h2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[2], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff;
-  pack_2t_filter(filter, &ff);
-
-  src_ptr -= 3;
-  do {
-    pack_16x1_2t_pixels(src_ptr, signal);
-    filter_16_2t_pixels(signal, &ff, &res0, &res1);
-    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
-    height -= 1;
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d16_v2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[3], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-  __m256i ff;
-
-  pack_2t_filter(filter, &ff);
-  pack_16x2_init(src_ptr, signal);
-
-  do {
-    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
-    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
-    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
-
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-    height -= 1;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d8_v2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m128i signal[3], res0, res1;
-  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
-  __m128i ff;
-
-  pack_8x1_2t_filter(filter, &ff);
-  pack_8x2_init(src_ptr, signal);
-
-  do {
-    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
-    filter_8_2t_pixels(signal, &ff, &res0, &res1);
-    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
-
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-    height -= 1;
-  } while (height > 0);
-}
-
 void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                         ptrdiff_t, uint32_t, const int16_t *,
                                         int);
@@ -1080,32 +994,5 @@ void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
 
 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-HIGH_FUN_CONV_2D(, avx2);
-
-void aom_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void aom_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void aom_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void aom_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-#define aom_highbd_filter_block1d4_h8_avg_avx2 \
-  aom_highbd_filter_block1d4_h8_avg_sse2
-#define aom_highbd_filter_block1d4_h2_avg_avx2 \
-  aom_highbd_filter_block1d4_h2_avg_sse2
-#define aom_highbd_filter_block1d4_v8_avg_avx2 \
-  aom_highbd_filter_block1d4_v8_avg_sse2
-#define aom_highbd_filter_block1d4_v2_avg_avx2 \
-  aom_highbd_filter_block1d4_v2_avg_sse2
-
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-                 avx2);
-HIGH_FUN_CONV_2D(avg_, avx2);
 
 #undef HIGHBD_FUNC
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
new file mode 100644
index 000000000..f7ac9b496
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  (void)conv_params;
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+  __m128i s[16], coeffs_y[4];
+
+  const int bits = FILTER_BITS;
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i clip_pixel =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    const uint16_t *data = &src_ptr[j];
+    /* Vertical filter */
+    {
+      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+      s[0] = _mm_unpacklo_epi16(s0, s1);
+      s[1] = _mm_unpacklo_epi16(s2, s3);
+      s[2] = _mm_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm_unpackhi_epi16(s0, s1);
+      s[5] = _mm_unpackhi_epi16(s2, s3);
+      s[6] = _mm_unpackhi_epi16(s4, s5);
+
+      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+
+        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+        s[3] = _mm_unpacklo_epi16(s6, s7);
+        s[7] = _mm_unpackhi_epi16(s6, s7);
+
+        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+        const __m128i res_a0 = convolve(s, coeffs_y);
+        __m128i res_a_round0 = _mm_sra_epi32(
+            _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+        const __m128i res_a1 = convolve(s + 8, coeffs_y);
+        __m128i res_a_round1 = _mm_sra_epi32(
+            _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+        if (w - j > 4) {
+          const __m128i res_b0 = convolve(s + 4, coeffs_y);
+          __m128i res_b_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+          const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+          __m128i res_b_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+          __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+          res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+          res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+          __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+          res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+          res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_16bit1);
+        } else if (w == 4) {
+          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_a_round1);
+        } else {
+          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+          *((uint32_t *)(&dst[i * dst_stride + j])) =
+              _mm_cvtsi128_si32(res_a_round0);
+
+          *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+              _mm_cvtsi128_si32(res_a_round1);
+        }
+
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+
+        s[0 + 8] = s[1 + 8];
+        s[1 + 8] = s[2 + 8];
+        s[2 + 8] = s[3 + 8];
+
+        s[4 + 8] = s[5 + 8];
+        s[5 + 8] = s[6 + 8];
+        s[6 + 8] = s[7 + 8];
+
+        s6 = s8;
+      }
+    }
+  }
+}
+
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  int i, j;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_horiz;
+  (void)subpel_y_q4;
+  (void)filter_params_y;
+
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  __m128i s[4], coeffs_x[4];
+
+  const __m128i round_const_x =
+      _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+  const int bits = FILTER_BITS - conv_params->round_0;
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i clip_pixel =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    {
+      for (i = 0; i < h; i += 1) {
+        const __m128i row00 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        const __m128i row01 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+        // even pixels
+        s[0] = _mm_alignr_epi8(row01, row00, 0);
+        s[1] = _mm_alignr_epi8(row01, row00, 4);
+        s[2] = _mm_alignr_epi8(row01, row00, 8);
+        s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+        __m128i res_even = convolve(s, coeffs_x);
+        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+                                 round_shift_x);
+
+        // odd pixels
+        s[0] = _mm_alignr_epi8(row01, row00, 2);
+        s[1] = _mm_alignr_epi8(row01, row00, 6);
+        s[2] = _mm_alignr_epi8(row01, row00, 10);
+        s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+        __m128i res_odd = convolve(s, coeffs_x);
+        res_odd =
+            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+                                 round_shift_bits);
+        res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+                                round_shift_bits);
+
+        __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+        __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+        __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+        res = _mm_min_epi16(res, clip_pixel);
+        res = _mm_max_epi16(res, zero);
+
+        if (w - j > 4) {
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+        } else if (w == 4) {
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+        } else {
+          *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c
deleted file mode 100644
index e001a1d70..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "aom_ports/msvc.h"
-#include "./aom_dsp_rtcd.h"
-
-// -----------------------------------------------------------------------------
-// D45E_PRED
-/*
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-*/
-static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y,
-                                 const __m256i *z) {
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i a = _mm256_avg_epu16(*x, *z);
-  const __m256i b =
-      _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one));
-  return _mm256_avg_epu16(b, *y);
-}
-
-static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1,
-                            const __m256i *a2, uint16_t **dst,
-                            ptrdiff_t stride) {
-  const __m256i y = avg3_epu16(a0, a1, a2);
-  _mm256_storeu_si256((__m256i *)*dst, y);
-  *dst += stride;
-}
-
-void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-
-  d45e_w16(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x0, &x1, &x2, &dst, stride);
-  } while (i < 9);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 9));
-  x0 = _mm256_insert_epi16(x0, above[23], 15);
-  const __m256i y = avg3_epu16(&x1, &x2, &x0);
-  _mm256_storeu_si256((__m256i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-
-  d45e_w16(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x0, &x1, &x2, &dst, stride);
-  } while (i < 15);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
-  d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-  x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-  x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  x2 = _mm256_insert_epi16(x2, above[31], 15);
-  const __m256i y = avg3_epu16(&x0, &x1, &x2);
-  _mm256_storeu_si256((__m256i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-
-  d45e_w16(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x0, &x1, &x2, &dst, stride);
-  } while (i < 33);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
-  x0 = _mm256_insert_epi16(x0, above[47], 15);
-  const __m256i y = avg3_epu16(&x1, &x2, &x0);
-  _mm256_storeu_si256((__m256i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-  __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
-
-  uint16_t *dst1 = dst;
-  uint16_t *dst2 = dst + 16;
-
-  d45e_w16(&x0, &x1, &x2, &dst1, stride);
-  d45e_w16(&y0, &y1, &y2, &dst2, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x1, &x2, &x0, &dst1, stride);
-    y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y1, &y2, &y0, &dst2, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x2, &x0, &x1, &dst1, stride);
-    y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y2, &y0, &y1, &dst2, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x0, &x1, &x2, &dst1, stride);
-    y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y0, &y1, &y2, &dst2, stride);
-  } while (i < 15);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
-  d45e_w16(&x1, &x2, &x0, &dst1, stride);
-  y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15));
-  d45e_w16(&y1, &y2, &y0, &dst2, stride);
-
-  x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  d45e_w16(&x2, &x0, &x1, &dst1, stride);
-  y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16));
-  d45e_w16(&y2, &y0, &y1, &dst2, stride);
-
-  x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  __m256i u = avg3_epu16(&x0, &x1, &x2);
-  _mm256_storeu_si256((__m256i *)dst1, u);
-
-  y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17));
-  y2 = _mm256_insert_epi16(y2, above[47], 15);
-  u = avg3_epu16(&y0, &y1, &y2);
-  _mm256_storeu_si256((__m256i *)dst2, u);
-}
-
-void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-  __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
-
-  uint16_t *dst1 = dst;
-  uint16_t *dst2 = dst + 16;
-
-  d45e_w16(&x0, &x1, &x2, &dst1, stride);
-  d45e_w16(&y0, &y1, &y2, &dst2, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x1, &x2, &x0, &dst1, stride);
-    y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y1, &y2, &y0, &dst2, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x2, &x0, &x1, &dst1, stride);
-    y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y2, &y0, &y1, &dst2, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x0, &x1, &x2, &dst1, stride);
-    y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y0, &y1, &y2, &dst2, stride);
-  } while (i < 33);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
-  __m256i u = avg3_epu16(&x1, &x2, &x0);
-  _mm256_storeu_si256((__m256i *)dst1, u);
-
-  y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33));
-  y0 = _mm256_insert_epi16(y0, above[63], 15);
-  u = avg3_epu16(&y1, &y2, &y0);
-  _mm256_storeu_si256((__m256i *)dst2, u);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
index 691e166cf..5a55736c4 100644
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -11,7 +11,7 @@
 
 #include <emmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 // -----------------------------------------------------------------------------
 // H_PRED
@@ -982,275 +982,3 @@ void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
     dst += stride;
   }
 }
-
-// -----------------------------------------------------------------------------
-/*
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-*/
-static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
-                                 const __m128i *z) {
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i a = _mm_avg_epu16(*x, *z);
-  const __m128i b =
-      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
-  return _mm_avg_epu16(b, *y);
-}
-
-void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
-  const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
-  const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
-  const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
-  const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
-  const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4);
-  const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0);
-  const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00);
-  const __m128i row0 = _mm_srli_si128(avg2, 6);
-  const __m128i row1 = _mm_srli_si128(avg3, 4);
-  const __m128i row2 = _mm_srli_si128(avg2, 4);
-  const __m128i row3 = _mm_srli_si128(avg3, 2);
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-
-  dst -= stride;
-  dst[0] = _mm_extract_epi16(avg3, 1);
-  dst[stride] = _mm_extract_epi16(avg3, 0);
-}
-
-void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
-  const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
-  const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
-  const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
-  const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
-  const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0);
-  const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC);
-  const __m128i row0 = _mm_srli_si128(avg3, 6);
-  const __m128i row1 = _mm_srli_si128(avg3, 4);
-  const __m128i row2 = _mm_srli_si128(avg3, 2);
-  const __m128i row3 = avg3;
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5));
-  const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);
-  const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
-  const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
-  const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3);
-  const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2);
-  const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4);
-  const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00);
-  const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0);
-  const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3);
-  const __m128i row2 = _mm_srli_si128(row3, 4);
-  const __m128i row1 = _mm_srli_si128(row3, 8);
-  const __m128i row0 = _mm_srli_si128(avg3, 4);
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst[0] = _mm_extract_epi16(avg2, 3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
-  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
-  __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
-  CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6);
-  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
-  (void)left;
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, avg3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
-}
-
-void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i h76543210 = _mm_load_si128((const __m128i *)above);
-  __m128i hx7654321 = _mm_srli_si128(h76543210, 2);
-  __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7);
-  __m128i hx8765432 = _mm_srli_si128(h87654321, 2);
-  __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7);
-  __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432);
-  _mm_storel_epi64((__m128i *)dst, avg3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8));
-  dst += stride;
-
-  // hcba98765
-  h76543210 = _mm_loadu_si128((const __m128i *)((above + 5)));
-  h76543210 = _mm_insert_epi16(h76543210, above[11], 7);
-  // hxcba9876
-  hx7654321 = _mm_srli_si128(h76543210, 2);
-  // hxxcba987
-  hx8765432 = _mm_srli_si128(h76543210, 4);
-  avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432);
-  _mm_storel_epi64((__m128i *)dst, avg3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
-}
-
-void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i x0 = _mm_load_si128((const __m128i *)above);
-  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
-  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
-  __m128i y = avg3_epu16(&x0, &x1, &x2);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x0 = _mm_loadu_si128((const __m128i *)(above + 3));
-  y = avg3_epu16(&x1, &x2, &x0);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x1 = _mm_loadu_si128((const __m128i *)(above + 4));
-  y = avg3_epu16(&x2, &x0, &x1);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x2 = _mm_loadu_si128((const __m128i *)(above + 5));
-  x2 = _mm_insert_epi16(x2, above[11], 7);
-  y = avg3_epu16(&x0, &x1, &x2);
-  _mm_store_si128((__m128i *)dst, y);
-}
-
-static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1,
-                           const __m128i *a2, uint16_t **dst,
-                           ptrdiff_t stride) {
-  const __m128i y = avg3_epu16(a0, a1, a2);
-  _mm_storeu_si128((__m128i *)*dst, y);
-  *dst += stride;
-}
-
-void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i x0 = _mm_load_si128((const __m128i *)above);
-  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
-  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
-
-  d45e_w8(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x0, &x1, &x2, &dst, stride);
-  } while (i < 9);
-
-  x0 = _mm_loadu_si128((const __m128i *)(above + 9));
-  x0 = _mm_insert_epi16(x0, above[15], 7);
-  const __m128i y = avg3_epu16(&x1, &x2, &x0);
-  _mm_store_si128((__m128i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i x0 = _mm_load_si128((const __m128i *)above);
-  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
-  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
-
-  d45e_w8(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x0, &x1, &x2, &dst, stride);
-  } while (i < 15);
-
-  x0 = _mm_loadu_si128((const __m128i *)(above + 15));
-  __m128i y = avg3_epu16(&x1, &x2, &x0);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x1 = _mm_loadu_si128((const __m128i *)(above + 16));
-  y = avg3_epu16(&x2, &x0, &x1);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x2 = _mm_loadu_si128((const __m128i *)(above + 17));
-  x2 = _mm_insert_epi16(x2, above[23], 7);
-  y = avg3_epu16(&x0, &x1, &x2);
-  _mm_store_si128((__m128i *)dst, y);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c
deleted file mode 100644
index b089a3f43..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c
+++ /dev/null
@@ -1,521 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "./aom_dsp_rtcd.h"
-
-// -----------------------------------------------------------------------------
-/*
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-*/
-static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
-                                 const __m128i *z) {
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i a = _mm_avg_epu16(*x, *z);
-  const __m128i b =
-      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
-  return _mm_avg_epu16(b, *y);
-}
-
-DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = {
-  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1
-};
-
-static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
-  *a = _mm_shuffle_epi8(*a, *rotrw);
-  return *a;
-}
-
-void aom_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
-  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
-  const __m128i IXABCDEF =
-      _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
-  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
-  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
-  const __m128i XIJKLMNO =
-      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
-  const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
-  __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
-  __m128i rowa = avg2;
-  __m128i rowb = avg3;
-  int i;
-  (void)bd;
-  for (i = 0; i < 8; i += 2) {
-    _mm_store_si128((__m128i *)dst, rowa);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, rowb);
-    dst += stride;
-    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
-    rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
-  }
-}
-
-void aom_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A0 = _mm_load_si128((const __m128i *)above);
-  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
-  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
-  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
-  const __m128i L1_ = _mm_srli_si128(L1, 2);
-  __m128i rowa_0 = avg2_0;
-  __m128i rowa_1 = avg2_1;
-  __m128i rowb_0 = avg3_0;
-  __m128i rowb_1 = avg3_1;
-  __m128i avg3_left[2];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
-  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
-  for (i = 0; i < 2; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; j += 2) {
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      dst += stride;
-      _mm_store_si128((__m128i *)dst, rowb_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
-      dst += stride;
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
-      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
-    }
-  }
-}
-
-void aom_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i A0 = _mm_load_si128((const __m128i *)above);
-  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
-  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
-  const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
-  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
-  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
-  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
-  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
-  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
-  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
-  const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
-  const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
-  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
-  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
-  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
-  const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
-  const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
-  const __m128i L3_ = _mm_srli_si128(L3, 2);
-  __m128i rowa_0 = avg2_0;
-  __m128i rowa_1 = avg2_1;
-  __m128i rowa_2 = avg2_2;
-  __m128i rowa_3 = avg2_3;
-  __m128i rowb_0 = avg3_0;
-  __m128i rowb_1 = avg3_1;
-  __m128i rowb_2 = avg3_2;
-  __m128i rowb_3 = avg3_3;
-  __m128i avg3_left[4];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
-  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
-  avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
-  avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
-  for (i = 0; i < 4; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; j += 2) {
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
-      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
-      dst += stride;
-      _mm_store_si128((__m128i *)dst, rowb_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
-      _mm_store_si128((__m128i *)(dst + 16), rowb_2);
-      _mm_store_si128((__m128i *)(dst + 24), rowb_3);
-      dst += stride;
-      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
-      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
-      rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
-      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
-      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
-    }
-  }
-}
-
-void aom_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
-  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
-  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
-  const __m128i XIJKLMNO =
-      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
-  const __m128i AXIJKLMN =
-      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
-  const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
-  __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
-  __m128i rowa = avg3;
-  int i;
-  (void)bd;
-  for (i = 0; i < 8; ++i) {
-    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
-    _mm_store_si128((__m128i *)dst, rowa);
-    dst += stride;
-  }
-}
-
-void aom_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i B0 = _mm_load_si128((const __m128i *)above);
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
-  const __m128i C1 = _mm_srli_si128(B1, 2);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
-  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
-  __m128i rowa_0 = avg3_0;
-  __m128i rowa_1 = avg3_1;
-  __m128i avg3_left[2];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
-  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
-  for (i = 0; i < 2; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; ++j) {
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      dst += stride;
-    }
-  }
-}
-
-void aom_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
-  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
-  const __m128i B0 = _mm_load_si128((const __m128i *)above);
-  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
-  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
-  const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
-  const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
-  const __m128i C3 = _mm_srli_si128(B3, 2);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
-  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
-  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
-  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
-  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
-  const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
-  const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
-  __m128i rowa_0 = avg3_0;
-  __m128i rowa_1 = avg3_1;
-  __m128i rowa_2 = avg3_2;
-  __m128i rowa_3 = avg3_3;
-  __m128i avg3_left[4];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
-  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
-  avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
-  avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
-  for (i = 0; i < 4; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; ++j) {
-      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
-      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
-      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
-      dst += stride;
-    }
-  }
-}
-
-void aom_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
-  const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
-  const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
-  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
-  const __m128i XIJKLMNO =
-      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
-  const __m128i AXIJKLMN =
-      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
-  const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
-  const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
-  const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
-  const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
-  const __m128i row0 =
-      _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
-  const __m128i row1 =
-      _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
-  const __m128i row2 =
-      _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
-  const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
-  const __m128i row4 =
-      _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
-  const __m128i row5 =
-      _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
-  const __m128i row6 =
-      _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
-  const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
-  (void)bd;
-  _mm_store_si128((__m128i *)dst, row0);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row1);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row2);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row3);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row4);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row5);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row6);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row7);
-}
-
-void aom_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
-  const __m128i B1 = _mm_srli_si128(A1, 2);
-  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
-  const __m128i C1 = _mm_srli_si128(A1, 4);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
-  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
-  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
-  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
-  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
-  __m128i row_0 = avg3_0;
-  __m128i row_1 = avg3_1;
-  __m128i avg2_avg3_left[2][2];
-  int i, j;
-  (void)bd;
-
-  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
-  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
-
-  for (j = 0; j < 2; ++j) {
-    for (i = 0; i < 2; ++i) {
-      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-    }
-  }
-}
-
-void aom_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
-  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
-  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
-  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
-  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
-  const __m128i B3 = _mm_srli_si128(A3, 2);
-  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
-  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
-  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
-  const __m128i C3 = _mm_srli_si128(A3, 4);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
-  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
-  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
-  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
-  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
-  const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
-  const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
-  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
-  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
-  const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
-  const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
-  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
-  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
-  const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
-  const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
-  __m128i row_0 = avg3_0;
-  __m128i row_1 = avg3_1;
-  __m128i row_2 = avg3_2;
-  __m128i row_3 = avg3_3;
-  __m128i avg2_avg3_left[4][2];
-  int i, j;
-  (void)bd;
-
-  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
-  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
-  avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
-  avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
-  avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
-  avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);
-
-  for (j = 0; j < 4; ++j) {
-    for (i = 0; i < 2; ++i) {
-      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
index 94c68885c..c954da94e 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
@@ -11,210 +11,26 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/common_avx2.h"
 #include "aom_dsp/x86/lpf_common_sse2.h"
 #include "aom/aom_integer.h"
 
-#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
-static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
-                             const uint8_t *t, int bd, __m256i *blt,
-                             __m256i *lt, __m256i *thr) {
-  const int shift = bd - 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
-  __m256i y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  *blt = _mm256_slli_epi16(y, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
-  y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  *lt = _mm256_slli_epi16(y, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
-  y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  *thr = _mm256_slli_epi16(y, shift);
-}
-
-static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
-                                     __m256i *p, __m256i *q) {
-  int i;
-  for (i = 0; i < size; i++) {
-    p[i] = _mm256_loadu_si256((__m256i *)(s - (i + 1) * pitch));
-    q[i] = _mm256_loadu_si256((__m256i *)(s + i * pitch));
-  }
-}
-
-static INLINE void highbd_hev_mask(const __m256i *p, const __m256i *q,
-                                   const __m256i *t, __m256i *hev) {
-  const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], p[0]));
-  const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q[1], q[0]));
-  __m256i h = _mm256_max_epi16(abs_p1p0, abs_q1q0);
-  h = _mm256_subs_epu16(h, *t);
-
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  const __m256i zero = _mm256_setzero_si256();
-  *hev = _mm256_xor_si256(_mm256_cmpeq_epi16(h, zero), ffff);
-}
-
-static INLINE void highbd_filter_mask(const __m256i *p, const __m256i *q,
-                                      const __m256i *l, const __m256i *bl,
-                                      __m256i *mask) {
-  __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p[0], q[0]));
-  __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], q[1]));
-  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);
-
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  __m256i max = _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), *bl);
-  max = _mm256_xor_si256(_mm256_cmpeq_epi16(max, zero), ffff);
-  max = _mm256_and_si256(max, _mm256_adds_epu16(*l, one));
-
-  int i;
-  for (i = 1; i < 4; ++i) {
-    max = _mm256_max_epi16(max,
-                           _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[i - 1])));
-    max = _mm256_max_epi16(max,
-                           _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[i - 1])));
-  }
-  max = _mm256_subs_epu16(max, *l);
-  *mask = _mm256_cmpeq_epi16(max, zero);  // return ~mask
-}
-
-static INLINE void flat_mask_internal(const __m256i *th, const __m256i *p,
-                                      const __m256i *q, int bd, int start,
-                                      int end, __m256i *flat) {
-  __m256i max = _mm256_setzero_si256();
-  int i;
-  for (i = start; i < end; ++i) {
-    max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[0])));
-    max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[0])));
-  }
-
-  __m256i ft;
-  if (bd == 8)
-    ft = _mm256_subs_epu16(max, *th);
-  else if (bd == 10)
-    ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 2));
-  else  // bd == 12
-    ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 4));
-
-  const __m256i zero = _mm256_setzero_si256();
-  *flat = _mm256_cmpeq_epi16(ft, zero);
-}
-
-// Note:
-//  Access p[3-1], p[0], and q[3-1], q[0]
-static INLINE void highbd_flat_mask4(const __m256i *th, const __m256i *p,
-                                     const __m256i *q, __m256i *flat, int bd) {
-  // check the distance 1,2,3 against 0
-  flat_mask_internal(th, p, q, bd, 1, 4, flat);
-}
-
-// Note:
-//  access p[7-4], p[0], and q[7-4], q[0]
-static INLINE void highbd_flat_mask5(const __m256i *th, const __m256i *p,
-                                     const __m256i *q, __m256i *flat, int bd) {
-  flat_mask_internal(th, p, q, bd, 4, 8, flat);
-}
-
-static INLINE void pixel_clamp(const __m256i *min, const __m256i *max,
-                               __m256i *pixel) {
-  __m256i clamped, mask;
-
-  mask = _mm256_cmpgt_epi16(*pixel, *max);
-  clamped = _mm256_andnot_si256(mask, *pixel);
-  mask = _mm256_and_si256(mask, *max);
-  clamped = _mm256_or_si256(mask, clamped);
-
-  mask = _mm256_cmpgt_epi16(clamped, *min);
-  clamped = _mm256_and_si256(mask, clamped);
-  mask = _mm256_andnot_si256(mask, *min);
-  *pixel = _mm256_or_si256(clamped, mask);
-}
-
-static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask,
-                                  const __m256i *th, int bd, __m256i *ps,
-                                  __m256i *qs) {
-  __m256i t80;
-  if (bd == 8)
-    t80 = _mm256_set1_epi16(0x80);
-  else if (bd == 10)
-    t80 = _mm256_set1_epi16(0x200);
-  else  // bd == 12
-    t80 = _mm256_set1_epi16(0x800);
-
-  __m256i ps0 = _mm256_subs_epi16(p[0], t80);
-  __m256i ps1 = _mm256_subs_epi16(p[1], t80);
-  __m256i qs0 = _mm256_subs_epi16(q[0], t80);
-  __m256i qs1 = _mm256_subs_epi16(q[1], t80);
-
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i pmax = _mm256_subs_epi16(
-      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i pmin = _mm256_subs_epi16(zero, t80);
-
-  __m256i filter = _mm256_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filter);
-
-  __m256i hev;
-  highbd_hev_mask(p, q, th, &hev);
-  filter = _mm256_and_si256(filter, hev);
-
-  const __m256i x = _mm256_subs_epi16(qs0, ps0);
-  filter = _mm256_adds_epi16(filter, x);
-  filter = _mm256_adds_epi16(filter, x);
-  filter = _mm256_adds_epi16(filter, x);
-  pixel_clamp(&pmin, &pmax, &filter);
-  filter = _mm256_and_si256(filter, *mask);
-
-  const __m256i t3 = _mm256_set1_epi16(3);
-  const __m256i t4 = _mm256_set1_epi16(4);
-
-  __m256i filter1 = _mm256_adds_epi16(filter, t4);
-  __m256i filter2 = _mm256_adds_epi16(filter, t3);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter1 = _mm256_srai_epi16(filter1, 3);
-  filter2 = _mm256_srai_epi16(filter2, 3);
-
-  qs0 = _mm256_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &qs0);
-  ps0 = _mm256_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &ps0);
-
-  qs[0] = _mm256_adds_epi16(qs0, t80);
-  ps[0] = _mm256_adds_epi16(ps0, t80);
-
-  filter = _mm256_adds_epi16(filter1, one);
-  filter = _mm256_srai_epi16(filter, 1);
-  filter = _mm256_andnot_si256(hev, filter);
-
-  qs1 = _mm256_subs_epi16(qs1, filter);
-  pixel_clamp(&pmin, &pmax, &qs1);
-  ps1 = _mm256_adds_epi16(ps1, filter);
-  pixel_clamp(&pmin, &pmax, &ps1);
-
-  qs[1] = _mm256_adds_epi16(qs1, t80);
-  ps[1] = _mm256_adds_epi16(ps1, t80);
-}
-#endif  // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
-
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p,
-                                            const uint8_t *blt,
-                                            const uint8_t *lt,
-                                            const uint8_t *thr, int bd) {
-  aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
+                                         blimit1, limit1, thresh1, bd);
 }
 
-void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
-                                          const uint8_t *blt, const uint8_t *lt,
-                                          const uint8_t *thr, int bd) {
-  aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+                                       limit1, thresh1, bd);
 }
 
 void aom_highbd_lpf_horizontal_4_dual_avx2(
@@ -248,626 +64,3 @@ void aom_highbd_lpf_vertical_8_dual_avx2(
   aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
                                       limit1, thresh1, bd);
 }
-#else
-void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch,
-                                            const uint8_t *blt,
-                                            const uint8_t *lt,
-                                            const uint8_t *thr, int bd) {
-  __m256i blimit, limit, thresh;
-  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);
-
-  __m256i p[8], q[8];
-  load_highbd_pixel(s, 8, pitch, p, q);
-
-  __m256i mask;
-  highbd_filter_mask(p, q, &limit, &blimit, &mask);
-
-  __m256i flat, flat2;
-  const __m256i one = _mm256_set1_epi16(1);
-  highbd_flat_mask4(&one, p, q, &flat, bd);
-  highbd_flat_mask5(&one, p, q, &flat2, bd);
-
-  flat = _mm256_and_si256(flat, mask);
-  flat2 = _mm256_and_si256(flat2, flat);
-
-  __m256i ps[2], qs[2];
-  highbd_filter4(p, q, &mask, &thresh, bd, ps, qs);
-
-  // flat and wide flat calculations
-  __m256i flat_p[3], flat_q[3];
-  __m256i flat2_p[7], flat2_q[7];
-  {
-    const __m256i eight = _mm256_set1_epi16(8);
-    const __m256i four = _mm256_set1_epi16(4);
-
-    __m256i sum_p = _mm256_add_epi16(_mm256_add_epi16(p[6], p[5]),
-                                     _mm256_add_epi16(p[4], p[3]));
-    __m256i sum_q = _mm256_add_epi16(_mm256_add_epi16(q[6], q[5]),
-                                     _mm256_add_epi16(q[4], q[3]));
-
-    __m256i sum_lp = _mm256_add_epi16(p[0], _mm256_add_epi16(p[2], p[1]));
-    sum_p = _mm256_add_epi16(sum_p, sum_lp);
-
-    __m256i sum_lq = _mm256_add_epi16(q[0], _mm256_add_epi16(q[2], q[1]));
-    sum_q = _mm256_add_epi16(sum_q, sum_lq);
-    sum_p = _mm256_add_epi16(eight, _mm256_add_epi16(sum_p, sum_q));
-    sum_lp = _mm256_add_epi16(four, _mm256_add_epi16(sum_lp, sum_lq));
-
-    flat2_p[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(p[7], p[0])), 4);
-    flat2_q[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(q[7], q[0])), 4);
-    flat_p[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(p[3], p[0])), 3);
-    flat_q[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(q[3], q[0])), 3);
-
-    __m256i sum_p7 = _mm256_add_epi16(p[7], p[7]);
-    __m256i sum_q7 = _mm256_add_epi16(q[7], q[7]);
-    __m256i sum_p3 = _mm256_add_epi16(p[3], p[3]);
-    __m256i sum_q3 = _mm256_add_epi16(q[3], q[3]);
-
-    sum_q = _mm256_sub_epi16(sum_p, p[6]);
-    sum_p = _mm256_sub_epi16(sum_p, q[6]);
-    flat2_p[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[1])), 4);
-    flat2_q[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[1])), 4);
-
-    sum_lq = _mm256_sub_epi16(sum_lp, p[2]);
-    sum_lp = _mm256_sub_epi16(sum_lp, q[2]);
-    flat_p[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[1])), 3);
-    flat_q[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[1])), 3);
-
-    sum_p7 = _mm256_add_epi16(sum_p7, p[7]);
-    sum_q7 = _mm256_add_epi16(sum_q7, q[7]);
-    sum_p3 = _mm256_add_epi16(sum_p3, p[3]);
-    sum_q3 = _mm256_add_epi16(sum_q3, q[3]);
-
-    sum_p = _mm256_sub_epi16(sum_p, q[5]);
-    sum_q = _mm256_sub_epi16(sum_q, p[5]);
-    flat2_p[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[2])), 4);
-    flat2_q[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[2])), 4);
-
-    sum_lp = _mm256_sub_epi16(sum_lp, q[1]);
-    sum_lq = _mm256_sub_epi16(sum_lq, p[1]);
-    flat_p[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[2])), 3);
-    flat_q[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[2])), 3);
-
-    int i;
-    for (i = 3; i < 7; ++i) {
-      sum_p7 = _mm256_add_epi16(sum_p7, p[7]);
-      sum_q7 = _mm256_add_epi16(sum_q7, q[7]);
-      sum_p = _mm256_sub_epi16(sum_p, q[7 - i]);
-      sum_q = _mm256_sub_epi16(sum_q, p[7 - i]);
-      flat2_p[i] = _mm256_srli_epi16(
-          _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[i])), 4);
-      flat2_q[i] = _mm256_srli_epi16(
-          _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[i])), 4);
-    }
-  }
-
-  // highbd_filter8
-  p[2] = _mm256_andnot_si256(flat, p[2]);
-  //  p2 remains unchanged if !(flat && mask)
-  flat_p[2] = _mm256_and_si256(flat, flat_p[2]);
-  //  when (flat && mask)
-  p[2] = _mm256_or_si256(p[2], flat_p[2]);  // full list of p2 values
-  q[2] = _mm256_andnot_si256(flat, q[2]);
-  flat_q[2] = _mm256_and_si256(flat, flat_q[2]);
-  q[2] = _mm256_or_si256(q[2], flat_q[2]);  // full list of q2 values
-
-  int i;
-  for (i = 1; i >= 0; i--) {
-    ps[i] = _mm256_andnot_si256(flat, ps[i]);
-    flat_p[i] = _mm256_and_si256(flat, flat_p[i]);
-    p[i] = _mm256_or_si256(ps[i], flat_p[i]);
-    qs[i] = _mm256_andnot_si256(flat, qs[i]);
-    flat_q[i] = _mm256_and_si256(flat, flat_q[i]);
-    q[i] = _mm256_or_si256(qs[i], flat_q[i]);
-  }
-
-  // highbd_filter16
-
-  for (i = 6; i >= 0; i--) {
-    //  p[i] remains unchanged if !(flat2 && flat && mask)
-    p[i] = _mm256_andnot_si256(flat2, p[i]);
-    flat2_p[i] = _mm256_and_si256(flat2, flat2_p[i]);
-    //  get values for when (flat2 && flat && mask)
-    p[i] = _mm256_or_si256(p[i], flat2_p[i]);  // full list of p values
-
-    q[i] = _mm256_andnot_si256(flat2, q[i]);
-    flat2_q[i] = _mm256_and_si256(flat2, flat2_q[i]);
-    q[i] = _mm256_or_si256(q[i], flat2_q[i]);
-    _mm256_storeu_si256((__m256i *)(s - (i + 1) * pitch), p[i]);
-    _mm256_storeu_si256((__m256i *)(s + i * pitch), q[i]);
-  }
-}
-
-static INLINE void highbd_transpose16x16(uint16_t *src, int src_p,
-                                         uint16_t *dst, int dst_p) {
-  __m256i x[16];
-  int i;
-  for (i = 0; i < 16; ++i) {
-    x[i] = _mm256_loadu_si256((const __m256i *)src);
-    src += src_p;
-  }
-  mm256_transpose_16x16(x, x);
-  for (i = 0; i < 16; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, x[i]);
-    dst += dst_p;
-  }
-}
-
-void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
-                                          const uint8_t *blimit,
-                                          const uint8_t *limit,
-                                          const uint8_t *thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
-
-  //  Transpose 16x16
-  highbd_transpose16x16(s - 8, p, t_dst, 16);
-
-  //  Loop filtering
-  aom_highbd_lpf_horizontal_edge_16_avx2(t_dst + 8 * 16, 16, blimit, limit,
-                                         thresh, bd);
-
-  //  Transpose back
-  highbd_transpose16x16(t_dst, 16, s - 8, p);
-}
-
-static INLINE void get_dual_limit(const uint8_t *b0, const uint8_t *l0,
-                                  const uint8_t *t0, const uint8_t *b1,
-                                  const uint8_t *l1, const uint8_t *t1, int bd,
-                                  __m256i *blt, __m256i *lt, __m256i *thr) {
-  const __m128i z128 = _mm_setzero_si128();
-  const __m128i blimit0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b0), z128);
-  const __m128i limit0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l0), z128);
-  const __m128i thresh0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t0), z128);
-  const __m128i blimit1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b1), z128);
-  const __m128i limit1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l1), z128);
-  const __m128i thresh1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t1), z128);
-
-  *blt = _mm256_inserti128_si256(_mm256_castsi128_si256(blimit0), blimit1, 1);
-  *lt = _mm256_inserti128_si256(_mm256_castsi128_si256(limit0), limit1, 1);
-  *thr = _mm256_inserti128_si256(_mm256_castsi128_si256(thresh0), thresh1, 1);
-
-  int shift = bd - 8;
-  *blt = _mm256_slli_epi16(*blt, shift);
-  *lt = _mm256_slli_epi16(*lt, shift);
-  *thr = _mm256_slli_epi16(*thr, shift);
-}
-
-void aom_highbd_lpf_horizontal_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p));
-  __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p));
-  __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p));
-  __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p));
-  __m256i q0 = _mm256_loadu_si256((__m256i *)(s - 0 * p));
-  __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p));
-  __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p));
-  __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p));
-
-  const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0));
-  const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0));
-
-  __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0));
-  __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1));
-
-  __m256i blimit, limit, thresh;
-  get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit, &limit, &thresh);
-
-  __m256i t80, tff80, tffe0, t1f, t7f;
-  if (bd == 8) {
-    t80 = _mm256_set1_epi16(0x80);
-    tff80 = _mm256_set1_epi16(0xff80);
-    tffe0 = _mm256_set1_epi16(0xffe0);
-    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 8);
-    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 8);
-  } else if (bd == 10) {
-    t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 2);
-    tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 2);
-    tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 2);
-    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 6);
-    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 6);
-  } else {  // bd == 12
-    t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 4);
-    tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 4);
-    tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 4);
-    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 4);
-    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 4);
-  }
-
-  __m256i ps1 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 2 * p)), t80);
-  __m256i ps0 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 1 * p)), t80);
-  __m256i qs0 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 0 * p)), t80);
-  __m256i qs1 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 1 * p)), t80);
-
-  // filter_mask and hev_mask
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0);
-  __m256i hev = _mm256_subs_epu16(flat, thresh);
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff);
-
-  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);
-  __m256i mask =
-      _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit);
-  mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  const __m256i one = _mm256_set1_epi16(1);
-  mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one));
-  mask = _mm256_max_epi16(flat, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  // mask |= (abs(q1 - q0) > limit) * -1;
-  __m256i work = _mm256_max_epi16(
-      _mm256_or_si256(_mm256_subs_epu16(p2, p1), _mm256_subs_epu16(p1, p2)),
-      _mm256_or_si256(_mm256_subs_epu16(p3, p2), _mm256_subs_epu16(p2, p3)));
-  mask = _mm256_max_epi16(work, mask);
-  work = _mm256_max_epi16(
-      _mm256_or_si256(_mm256_subs_epu16(q2, q1), _mm256_subs_epu16(q1, q2)),
-      _mm256_or_si256(_mm256_subs_epu16(q3, q2), _mm256_subs_epu16(q2, q3)));
-  mask = _mm256_max_epi16(work, mask);
-  mask = _mm256_subs_epu16(mask, limit);
-  mask = _mm256_cmpeq_epi16(mask, zero);
-
-  // filter4
-  const __m256i pmax = _mm256_subs_epi16(
-      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
-  const __m256i pmin = _mm256_subs_epi16(zero, t80);
-
-  __m256i filt = _mm256_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm256_and_si256(filt, hev);
-  __m256i work_a = _mm256_subs_epi16(qs0, ps0);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  pixel_clamp(&pmin, &pmax, &filt);
-
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  filt = _mm256_and_si256(filt, mask);
-
-  const __m256i t4 = _mm256_set1_epi16(4);
-  const __m256i t3 = _mm256_set1_epi16(3);
-
-  __m256i filter1 = _mm256_adds_epi16(filt, t4);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  __m256i filter2 = _mm256_adds_epi16(filt, t3);
-  pixel_clamp(&pmin, &pmax, &filter2);
-
-  // Filter1 >> 3
-  work_a = _mm256_cmpgt_epi16(zero, filter1);  // get the values that are <0
-  filter1 = _mm256_srli_epi16(filter1, 3);
-  work_a = _mm256_and_si256(work_a, tffe0);    // sign bits for the values < 0
-  filter1 = _mm256_and_si256(filter1, t1f);    // clamp the range
-  filter1 = _mm256_or_si256(filter1, work_a);  // reinsert the sign bits
-
-  // Filter2 >> 3
-  work_a = _mm256_cmpgt_epi16(zero, filter2);
-  filter2 = _mm256_srli_epi16(filter2, 3);
-  work_a = _mm256_and_si256(work_a, tffe0);
-  filter2 = _mm256_and_si256(filter2, t1f);
-  filter2 = _mm256_or_si256(filter2, work_a);
-
-  // filt >> 1
-  // equivalent to shifting 0x1f left by bitdepth - 8
-  // and setting new bits to 1
-  filt = _mm256_adds_epi16(filter1, one);
-  work_a = _mm256_cmpgt_epi16(zero, filt);
-  filt = _mm256_srli_epi16(filt, 1);
-  work_a = _mm256_and_si256(work_a, tff80);
-  filt = _mm256_and_si256(filt, t7f);
-  filt = _mm256_or_si256(filt, work_a);
-
-  filt = _mm256_andnot_si256(hev, filt);
-
-  filter1 = _mm256_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  q0 = _mm256_adds_epi16(filter1, t80);
-
-  filter1 = _mm256_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  q1 = _mm256_adds_epi16(filter1, t80);
-
-  filter2 = _mm256_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  p0 = _mm256_adds_epi16(filter2, t80);
-
-  filter2 = _mm256_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  p1 = _mm256_adds_epi16(filter2, t80);
-
-  _mm256_storeu_si256((__m256i *)(s - 2 * p), p1);
-  _mm256_storeu_si256((__m256i *)(s - 1 * p), p0);
-  _mm256_storeu_si256((__m256i *)(s + 0 * p), q0);
-  _mm256_storeu_si256((__m256i *)(s + 1 * p), q1);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
-
-  __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p));
-  __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p));
-  __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p));
-  __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p));
-  __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p));
-  __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p));
-  __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p));
-  __m256i q0 = _mm256_loadu_si256((__m256i *)(s + 0 * p));
-
-  __m256i blimit, limit, thresh;
-  get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit, &limit, &thresh);
-
-  __m256i t80;
-  if (bd == 8) {
-    t80 = _mm256_set1_epi16(0x80);
-  } else if (bd == 10) {
-    t80 = _mm256_set1_epi16(0x200);
-  } else {  // bd == 12
-    t80 = _mm256_set1_epi16(0x800);
-  }
-
-  __m256i ps1, ps0, qs0, qs1;
-  ps1 = _mm256_subs_epi16(p1, t80);
-  ps0 = _mm256_subs_epi16(p0, t80);
-  qs0 = _mm256_subs_epi16(q0, t80);
-  qs1 = _mm256_subs_epi16(q1, t80);
-
-  // filter_mask and hev_mask
-  __m256i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-  abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0));
-  abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0));
-
-  abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0));
-  abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1));
-  __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0);
-  __m256i hev = _mm256_subs_epu16(flat, thresh);
-  const __m256i zero = _mm256_set1_epi16(0);
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff);
-
-  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);
-  __m256i mask =
-      _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit);
-  mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-
-  const __m256i one = _mm256_set1_epi16(1);
-  mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one));
-  mask = _mm256_max_epi16(abs_p1p0, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  mask = _mm256_max_epi16(abs_q1q0, mask);
-  // mask |= (abs(q1 - q0) > limit) * -1;
-
-  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p1)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q2, q1)));
-  mask = _mm256_max_epi16(work, mask);
-  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p2)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q3, q2)));
-  mask = _mm256_max_epi16(work, mask);
-  mask = _mm256_subs_epu16(mask, limit);
-  mask = _mm256_cmpeq_epi16(mask, zero);
-
-  // flat_mask4
-  flat = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p0)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q2, q0)));
-  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p0)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q3, q0)));
-  flat = _mm256_max_epi16(work, flat);
-  flat = _mm256_max_epi16(abs_p1p0, flat);
-  flat = _mm256_max_epi16(abs_q1q0, flat);
-
-  if (bd == 8)
-    flat = _mm256_subs_epu16(flat, one);
-  else if (bd == 10)
-    flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 2));
-  else  // bd == 12
-    flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 4));
-
-  flat = _mm256_cmpeq_epi16(flat, zero);
-  flat = _mm256_and_si256(flat, mask);  // flat & mask
-
-  // Added before shift for rounding part of ROUND_POWER_OF_TWO
-  __m256i workp_a, workp_b, workp_shft;
-  workp_a =
-      _mm256_add_epi16(_mm256_add_epi16(p3, p3), _mm256_add_epi16(p2, p1));
-  const __m256i four = _mm256_set1_epi16(4);
-  workp_a = _mm256_add_epi16(_mm256_add_epi16(workp_a, four), p0);
-  workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, p2), p3);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_op2[0], workp_shft);
-
-  workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, q1), p1);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_op1[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q2);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p1), p0);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_op0[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q3);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p0), q0);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_oq0[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p2), q3);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q0), q1);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_oq1[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p1), q3);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q1), q2);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_oq2[0], workp_shft);
-
-  // lp filter
-  const __m256i pmax = _mm256_subs_epi16(
-      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
-  const __m256i pmin = _mm256_subs_epi16(zero, t80);
-
-  __m256i filt, filter1, filter2, work_a;
-  filt = _mm256_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm256_and_si256(filt, hev);
-  work_a = _mm256_subs_epi16(qs0, ps0);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm256_and_si256(filt, mask);
-
-  const __m256i t4 = _mm256_set1_epi16(4);
-  const __m256i t3 = _mm256_set1_epi16(3);
-
-  filter1 = _mm256_adds_epi16(filt, t4);
-  filter2 = _mm256_adds_epi16(filt, t3);
-
-  // Filter1 >> 3
-  pixel_clamp(&pmin, &pmax, &filter1);
-  filter1 = _mm256_srai_epi16(filter1, 3);
-
-  // Filter2 >> 3
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter2 = _mm256_srai_epi16(filter2, 3);
-
-  // filt >> 1
-  filt = _mm256_adds_epi16(filter1, one);
-  filt = _mm256_srai_epi16(filt, 1);
-  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-  filt = _mm256_andnot_si256(hev, filt);
-
-  work_a = _mm256_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  q0 = _mm256_loadu_si256((__m256i *)flat_oq0);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  q0 = _mm256_and_si256(flat, q0);
-  q0 = _mm256_or_si256(work_a, q0);
-
-  work_a = _mm256_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  q1 = _mm256_loadu_si256((__m256i *)flat_oq1);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  q1 = _mm256_and_si256(flat, q1);
-  q1 = _mm256_or_si256(work_a, q1);
-
-  work_a = _mm256_loadu_si256((__m256i *)(s + 2 * p));
-  q2 = _mm256_loadu_si256((__m256i *)flat_oq2);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  q2 = _mm256_and_si256(flat, q2);
-  q2 = _mm256_or_si256(work_a, q2);
-
-  work_a = _mm256_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  p0 = _mm256_loadu_si256((__m256i *)flat_op0);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  p0 = _mm256_and_si256(flat, p0);
-  p0 = _mm256_or_si256(work_a, p0);
-
-  work_a = _mm256_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  p1 = _mm256_loadu_si256((__m256i *)flat_op1);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  p1 = _mm256_and_si256(flat, p1);
-  p1 = _mm256_or_si256(work_a, p1);
-
-  work_a = _mm256_loadu_si256((__m256i *)(s - 3 * p));
-  p2 = _mm256_loadu_si256((__m256i *)flat_op2);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  p2 = _mm256_and_si256(flat, p2);
-  p2 = _mm256_or_si256(work_a, p2);
-
-  _mm256_storeu_si256((__m256i *)(s - 3 * p), p2);
-  _mm256_storeu_si256((__m256i *)(s - 2 * p), p1);
-  _mm256_storeu_si256((__m256i *)(s - 1 * p), p0);
-  _mm256_storeu_si256((__m256i *)(s + 0 * p), q0);
-  _mm256_storeu_si256((__m256i *)(s + 1 * p), q1);
-  _mm256_storeu_si256((__m256i *)(s + 2 * p), q2);
-}
-
-void aom_highbd_lpf_vertical_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
-
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
-
-  // Loop filtering
-  aom_highbd_lpf_horizontal_4_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
-
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
-}
-
-void aom_highbd_lpf_vertical_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
-
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
-
-  // Loop filtering
-  aom_highbd_lpf_horizontal_8_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
-
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
-}
-#endif  // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 0a399edf2..83e0098ba 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -11,29 +11,23 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/x86/lpf_common_sse2.h"
-#include "aom_ports/emmintrin_compat.h"
-#include "aom_ports/mem.h"
+#include "config/aom_dsp_rtcd.h"
 
-static INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
-                               __m128i *pixel) {
-  __m128i clamped, mask;
+#include "aom_dsp/x86/lpf_common_sse2.h"
 
-  mask = _mm_cmpgt_epi16(*pixel, *max);
-  clamped = _mm_andnot_si128(mask, *pixel);
-  mask = _mm_and_si128(mask, *max);
-  clamped = _mm_or_si128(mask, clamped);
+static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
+                                         __m128i *pixel) {
+  *pixel = _mm_min_epi16(*pixel, *max);
+  *pixel = _mm_max_epi16(*pixel, *min);
+}
 
-  mask = _mm_cmpgt_epi16(clamped, *min);
-  clamped = _mm_and_si128(mask, clamped);
-  mask = _mm_andnot_si128(mask, *min);
-  *pixel = _mm_or_si128(clamped, mask);
+static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
+  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
 }
 
 static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
                              const uint8_t *t, int bd, __m128i *blt,
-                             __m128i *lt, __m128i *thr) {
+                             __m128i *lt, __m128i *thr, __m128i *t80_out) {
   const int shift = bd - 8;
   const __m128i zero = _mm_setzero_si128();
 
@@ -45,6 +39,36 @@ static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
 
   x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
   *thr = _mm_slli_epi16(x, shift);
+
+  *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
+static INLINE void get_limit_dual(
+    const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
+    const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
+    int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
+    __m128i *t80_out) {
+  const int shift = bd - 8;
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i x0 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
+  __m128i x1 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *blt_out = _mm_slli_epi16(x0, shift);
+
+  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
+  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *lt_out = _mm_slli_epi16(x0, shift);
+
+  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
+  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *thr_out = _mm_slli_epi16(x0, shift);
+
+  *t80_out = _mm_set1_epi16(1 << (bd - 1));
 }
 
 static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
@@ -55,115 +79,217 @@ static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
     q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
   }
 }
-// _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
-static INLINE void highbd_hev_mask(const __m128i *p, const __m128i *q,
-                                   const __m128i *t, __m128i *hev) {
-  const __m128i abs_p1p0 =
-      _mm_or_si128(_mm_subs_epu16(p[1], p[0]), _mm_subs_epu16(p[0], p[1]));
-  const __m128i abs_q1q0 =
-      _mm_or_si128(_mm_subs_epu16(q[1], q[0]), _mm_subs_epu16(q[0], q[1]));
-  __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  h = _mm_subs_epu16(h, *t);
 
-  const __m128i ffff = _mm_set1_epi16(0xFFFF);
-  const __m128i zero = _mm_setzero_si128();
-  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-}
-
-static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q,
-                                      const __m128i *l, const __m128i *bl,
-                                      __m128i *mask) {
-  __m128i abs_p0q0 =
-      _mm_or_si128(_mm_subs_epu16(p[0], q[0]), _mm_subs_epu16(q[0], p[0]));
-  __m128i abs_p1q1 =
-      _mm_or_si128(_mm_subs_epu16(p[1], q[1]), _mm_subs_epu16(q[1], p[1]));
+static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
+                                           const __m128i *l, const __m128i *bl,
+                                           __m128i *mask) {
+  __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
+  __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
 
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   const __m128i ffff = _mm_set1_epi16(0xFFFF);
+
   __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
   max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
   max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
 
   int i;
   for (i = 1; i < 4; ++i) {
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[i - 1]),
-                                          _mm_subs_epu16(p[i - 1], p[i])));
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[i - 1]),
-                                          _mm_subs_epu16(q[i - 1], q[i])));
+    max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
+    max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
   }
   max = _mm_subs_epu16(max, *l);
   *mask = _mm_cmpeq_epi16(max, zero);  // return ~mask
 }
 
-static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p,
-                                      const __m128i *q, int bd, int start,
-                                      int end, __m128i *flat) {
-  __m128i max = _mm_setzero_si128();
+static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
+                                                 __m128i *p1p0, __m128i *q1q0,
+                                                 __m128i *abs_p1p0, __m128i *l,
+                                                 __m128i *bl, __m128i *t,
+                                                 __m128i *hev, __m128i *mask) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i ffff = _mm_set1_epi16(0xFFFF);
+  __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
+  __m128i max, max01, h;
+
+  *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
+  *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
+
+  abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
+  abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+  abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // divide by 2
+
+  max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+  *abs_p1p0 = abs_diff16(pq[0], pq[1]);
+  abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
+  max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
+  // mask |= (abs(*p1 - *p0) > limit) * -1;
+  // mask |= (abs(*q1 - *q0) > limit) * -1;
+  h = _mm_subs_epu16(max01, *t);
+
+  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+  // replicate for the further "merged variables" usage
+  *hev = _mm_unpacklo_epi64(*hev, *hev);
+
+  max = _mm_max_epi16(max, max01);
+  int i;
+  for (i = 2; i < x; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
+  }
+  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+  max = _mm_subs_epu16(max, *l);
+  *mask = _mm_cmpeq_epi16(max, zero);  //  ~mask
+}
+
+static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
+                                      int start, int end, __m128i *flat) {
+  int i;
+  __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
+                              abs_diff16(pq[start + 1], pq[0]));
+
+  for (i = start + 2; i < end; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
+  }
+  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+  __m128i ft;
+  ft = _mm_subs_epu16(max, *th);
+
+  const __m128i zero = _mm_setzero_si128();
+  *flat = _mm_cmpeq_epi16(ft, zero);
+}
+
+static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
+                                           const __m128i *q, int start, int end,
+                                           __m128i *flat) {
   int i;
-  for (i = start; i < end; ++i) {
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[0]),
-                                          _mm_subs_epu16(p[0], p[i])));
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[0]),
-                                          _mm_subs_epu16(q[0], q[i])));
+  __m128i max =
+      _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
+
+  for (i = start + 1; i < end; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
+    max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
   }
 
   __m128i ft;
-  if (bd == 8)
-    ft = _mm_subs_epu16(max, *th);
-  else if (bd == 10)
-    ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2));
-  else  // bd == 12
-    ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4));
+  ft = _mm_subs_epu16(max, *th);
 
   const __m128i zero = _mm_setzero_si128();
   *flat = _mm_cmpeq_epi16(ft, zero);
 }
 
-// Note:
-//  Access p[3-1], p[0], and q[3-1], q[0]
-static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p,
-                                     const __m128i *q, __m128i *flat, int bd) {
+static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
+                                          __m128i *flat2, int bd) {
   // check the distance 1,2,3 against 0
-  flat_mask_internal(th, p, q, bd, 1, 4, flat);
+  __m128i th = _mm_set1_epi16(1);
+  th = _mm_slli_epi16(th, bd - 8);
+  flat_mask_internal(&th, pq, 1, 4, flat);
+  flat_mask_internal(&th, pq, 4, 7, flat2);
 }
 
-// Note:
-//  access p[7-4], p[0], and q[7-4], q[0]
-static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p,
-                                     const __m128i *q, __m128i *flat, int bd) {
-  flat_mask_internal(th, p, q, bd, 4, 8, flat);
+static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
+                                               const __m128i *q, __m128i *flat,
+                                               __m128i *flat2, int bd) {
+  // check the distance 1,2,3 against 0
+  __m128i th = _mm_set1_epi16(1);
+  th = _mm_slli_epi16(th, bd - 8);
+  flat_mask_internal_dual(&th, p, q, 1, 4, flat);
+  flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
 }
 
-static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask,
-                                  const __m128i *th, int bd, __m128i *ps,
-                                  __m128i *qs) {
-  __m128i t80;
-  if (bd == 8)
-    t80 = _mm_set1_epi16(0x80);
-  else if (bd == 10)
-    t80 = _mm_set1_epi16(0x200);
-  else  // bd == 12
-    t80 = _mm_set1_epi16(0x800);
-
-  __m128i ps0 = _mm_subs_epi16(p[0], t80);
-  __m128i ps1 = _mm_subs_epi16(p[1], t80);
-  __m128i qs0 = _mm_subs_epi16(q[0], t80);
-  __m128i qs1 = _mm_subs_epi16(q[1], t80);
+static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+                                                 __m128i *hev, __m128i *mask,
+                                                 __m128i *qs1qs0,
+                                                 __m128i *ps1ps0, __m128i *t80,
+                                                 int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i pmax =
+      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
+  const __m128i pmin = _mm_subs_epi16(zero, *t80);
+
+  const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
+  __m128i ps1ps0_work, qs1qs0_work, work;
+  __m128i filt, filter2filter1, filter2filt, filter1filt;
+
+  ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
+  qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
+
+  work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
+  pixel_clamp(&pmin, &pmax, &work);
+  filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+
+  filt = _mm_subs_epi16(filt, work);
+  filt = _mm_subs_epi16(filt, work);
+  filt = _mm_subs_epi16(filt, work);
+  // (aom_filter + 3 * (qs0 - ps0)) & mask
+  pixel_clamp(&pmin, &pmax, &filt);
+  filt = _mm_and_si128(filt, *mask);
+  filt = _mm_unpacklo_epi64(filt, filt);
+
+  filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
+  pixel_clamp(&pmin, &pmax, &filter2filter1);
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
 
+  filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filt, one);
+  filt = _mm_srai_epi16(filt, 1);
+  filt = _mm_andnot_si128(*hev, filt);
+
+  filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
+  filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
+
+  qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
+  ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
+
+  pixel_clamp(&pmin, &pmax, &qs1qs0_work);
+  pixel_clamp(&pmin, &pmax, &ps1ps0_work);
+
+  *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
+  *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
+}
+
+static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
+                                            __m128i *qs, const __m128i *mask,
+                                            const __m128i *th, int bd,
+                                            __m128i *t80) {
+  __m128i ps0 = _mm_subs_epi16(p[0], *t80);
+  __m128i ps1 = _mm_subs_epi16(p[1], *t80);
+  __m128i qs0 = _mm_subs_epi16(q[0], *t80);
+  __m128i qs1 = _mm_subs_epi16(q[1], *t80);
   const __m128i one = _mm_set1_epi16(1);
   const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i pmin = _mm_subs_epi16(zero, t80);
+      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
 
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i pmin = _mm_subs_epi16(zero, *t80);
   __m128i filter = _mm_subs_epi16(ps1, qs1);
   pixel_clamp(&pmin, &pmax, &filter);
 
+  // hev_filter
   __m128i hev;
-  highbd_hev_mask(p, q, th, &hev);
+  const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
+  const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
+  __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  h = _mm_subs_epu16(h, *th);
+  const __m128i ffff = _mm_cmpeq_epi16(h, h);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+
   filter = _mm_and_si128(filter, hev);
 
   const __m128i x = _mm_subs_epi16(qs0, ps0);
@@ -172,145 +298,332 @@ static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask,
   filter = _mm_adds_epi16(filter, x);
   pixel_clamp(&pmin, &pmax, &filter);
   filter = _mm_and_si128(filter, *mask);
-
   const __m128i t3 = _mm_set1_epi16(3);
   const __m128i t4 = _mm_set1_epi16(4);
-
   __m128i filter1 = _mm_adds_epi16(filter, t4);
   __m128i filter2 = _mm_adds_epi16(filter, t3);
   pixel_clamp(&pmin, &pmax, &filter1);
   pixel_clamp(&pmin, &pmax, &filter2);
   filter1 = _mm_srai_epi16(filter1, 3);
   filter2 = _mm_srai_epi16(filter2, 3);
-
   qs0 = _mm_subs_epi16(qs0, filter1);
   pixel_clamp(&pmin, &pmax, &qs0);
   ps0 = _mm_adds_epi16(ps0, filter2);
   pixel_clamp(&pmin, &pmax, &ps0);
-
-  qs[0] = _mm_adds_epi16(qs0, t80);
-  ps[0] = _mm_adds_epi16(ps0, t80);
-
+  qs[0] = _mm_adds_epi16(qs0, *t80);
+  ps[0] = _mm_adds_epi16(ps0, *t80);
   filter = _mm_adds_epi16(filter1, one);
   filter = _mm_srai_epi16(filter, 1);
   filter = _mm_andnot_si128(hev, filter);
-
   qs1 = _mm_subs_epi16(qs1, filter);
   pixel_clamp(&pmin, &pmax, &qs1);
   ps1 = _mm_adds_epi16(ps1, filter);
   pixel_clamp(&pmin, &pmax, &ps1);
-
-  qs[1] = _mm_adds_epi16(qs1, t80);
-  ps[1] = _mm_adds_epi16(ps1, t80);
+  qs[1] = _mm_adds_epi16(qs1, *t80);
+  ps[1] = _mm_adds_epi16(ps1, *t80);
 }
 
-typedef enum { FOUR_PIXELS, EIGHT_PIXELS } PixelOutput;
-
-static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
-                                                   const uint8_t *blt,
-                                                   const uint8_t *lt,
-                                                   const uint8_t *thr, int bd,
-                                                   PixelOutput pixel_output) {
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
+    __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
+    const unsigned char *lt, const unsigned char *thr, int bd) {
+  int i;
   __m128i blimit, limit, thresh;
-  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);
+  __m128i t80;
+  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
+
+  for (i = 0; i < 7; i++) {
+    pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
+  }
+  __m128i mask, hevhev;
+  __m128i p1p0, q1q0, abs_p1p0;
 
-  __m128i p[8], q[8];
-  load_highbd_pixel(s, 8, pitch, p, q);
+  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hevhev, &mask);
 
-  __m128i mask;
-  highbd_filter_mask(p, q, &limit, &blimit, &mask);
+  __m128i ps0ps1, qs0qs1;
+  // filter4
+  highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
 
   __m128i flat, flat2;
-  const __m128i one = _mm_set1_epi16(1);
-  highbd_flat_mask4(&one, p, q, &flat, bd);
-  highbd_flat_mask5(&one, p, q, &flat2, bd);
+  highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
 
   flat = _mm_and_si128(flat, mask);
   flat2 = _mm_and_si128(flat2, flat);
 
-  __m128i ps[2], qs[2];
-  highbd_filter4(p, q, &mask, &thresh, bd, ps, qs);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi64(flat, flat);
+  flat2 = _mm_unpacklo_epi64(flat2, flat2);
 
   // flat and wide flat calculations
-  __m128i flat_p[3], flat_q[3];
-  __m128i flat2_p[7], flat2_q[7];
+  __m128i flat_p[3], flat_q[3], flat_pq[3];
+  __m128i flat2_p[6], flat2_q[6];
+  __m128i flat2_pq[6];
   {
+    __m128i work0;
     const __m128i eight = _mm_set1_epi16(8);
     const __m128i four = _mm_set1_epi16(4);
+    __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
+    __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
+    sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+    __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+    sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+    work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+    flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0]));
+    flat2_q[0] =
+        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0]));
+
+    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0]));
+    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
+
+    __m128i sum_p6, sum_p3;
+    sum_p6 = _mm_add_epi16(pq[6], pq[6]);
+    sum_p3 = _mm_add_epi16(pq[3], pq[3]);
+
+    sum_q = _mm_sub_epi16(sum_p, p[5]);
+    sum_p = _mm_sub_epi16(sum_p, q[5]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
+    flat2_p[1] = _mm_add_epi16(sum_p, work0);
+    flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+
+    sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+
+    work0 = _mm_add_epi16(sum_p3, pq[1]);
+    flat_p[1] = _mm_add_epi16(sum_lp, work0);
+    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+
+    flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+    flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+
+    sum_p = _mm_sub_epi16(sum_p, q[4]);
+    sum_q = _mm_sub_epi16(sum_q, p[4]);
 
-    __m128i sum_p =
-        _mm_add_epi16(_mm_add_epi16(p[6], p[5]), _mm_add_epi16(p[4], p[3]));
-    __m128i sum_q =
-        _mm_add_epi16(_mm_add_epi16(q[6], q[5]), _mm_add_epi16(q[4], q[3]));
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+    flat2_p[2] = _mm_add_epi16(sum_p, work0);
+    flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
 
+    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+    sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+
+    sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
+    work0 = _mm_add_epi16(sum_p3, pq[2]);
+
+    flat_p[2] = _mm_add_epi16(sum_lp, work0);
+    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[3]);
+    sum_q = _mm_sub_epi16(sum_q, p[3]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+    flat2_p[3] = _mm_add_epi16(sum_p, work0);
+    flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[2]);
+    sum_q = _mm_sub_epi16(sum_q, p[2]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+    flat2_p[4] = _mm_add_epi16(sum_p, work0);
+    flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[1]);
+    sum_q = _mm_sub_epi16(sum_q, p[1]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+    flat2_p[5] = _mm_add_epi16(sum_p, work0);
+    flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+  }
+
+  // highbd_filter8
+  pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+  pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+
+  for (i = 0; i < 3; i++) {
+    pq[i] = _mm_andnot_si128(flat, pq[i]);
+    flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
+    pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
+  }
+
+  // highbd_filter16
+  for (i = 5; i >= 0; i--) {
+    //  p[i] remains unchanged if !(flat2 && flat && mask)
+    pq[i] = _mm_andnot_si128(flat2, pq[i]);
+    flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
+    //  get values for when (flat2 && flat && mask)
+    pq[i] = _mm_or_si128(pq[i], flat2_pq[i]);  // full list of pq values
+  }
+}
+
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
+                                       const uint8_t *blt, const uint8_t *lt,
+                                       const uint8_t *thr, int bd) {
+  __m128i p[7], q[7], pq[7];
+  int i;
+
+  for (i = 0; i < 7; i++) {
+    p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
+    q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
+  }
+
+  highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd);
+
+  for (i = 0; i < 6; i++) {
+    _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
+    _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
+  }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
+    __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
+    const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
+    const uint8_t *thr1, int bd) {
+  __m128i blimit, limit, thresh, t80;
+  get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
+                 &t80);
+  __m128i mask;
+  highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
+  __m128i flat, flat2;
+  highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
+
+  flat = _mm_and_si128(flat, mask);
+  flat2 = _mm_and_si128(flat2, flat);
+  __m128i ps[2], qs[2];
+  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
+  // flat and wide flat calculations
+  __m128i flat_p[3], flat_q[3];
+  __m128i flat2_p[6], flat2_q[6];
+  {
+    const __m128i eight = _mm_set1_epi16(8);
+    const __m128i four = _mm_set1_epi16(4);
+    __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
+    __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
     __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
     sum_p = _mm_add_epi16(sum_p, sum_lp);
-
     __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
     sum_q = _mm_add_epi16(sum_q, sum_lq);
     sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
     sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
-    flat2_p[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(p[7], p[0])), 4);
-    flat2_q[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(q[7], q[0])), 4);
+    flat2_p[0] = _mm_srli_epi16(
+        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
+                                           _mm_add_epi16(p[1], q[0]))),
+        4);
+    flat2_q[0] = _mm_srli_epi16(
+        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
+                                           _mm_add_epi16(p[0], q[1]))),
+        4);
     flat_p[0] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
     flat_q[0] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
-
-    __m128i sum_p7 = _mm_add_epi16(p[7], p[7]);
-    __m128i sum_q7 = _mm_add_epi16(q[7], q[7]);
+    __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
+    __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
     __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
     __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
-
-    sum_q = _mm_sub_epi16(sum_p, p[6]);
-    sum_p = _mm_sub_epi16(sum_p, q[6]);
-    flat2_p[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[1])), 4);
-    flat2_q[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[1])), 4);
-
+    sum_q = _mm_sub_epi16(sum_p, p[5]);
+    sum_p = _mm_sub_epi16(sum_p, q[5]);
+    flat2_p[1] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
+        4);
+    flat2_q[1] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
+        4);
     sum_lq = _mm_sub_epi16(sum_lp, p[2]);
     sum_lp = _mm_sub_epi16(sum_lp, q[2]);
     flat_p[1] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
     flat_q[1] =
         _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
-
-    sum_p7 = _mm_add_epi16(sum_p7, p[7]);
-    sum_q7 = _mm_add_epi16(sum_q7, q[7]);
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
     sum_p3 = _mm_add_epi16(sum_p3, p[3]);
     sum_q3 = _mm_add_epi16(sum_q3, q[3]);
-
-    sum_p = _mm_sub_epi16(sum_p, q[5]);
-    sum_q = _mm_sub_epi16(sum_q, p[5]);
-    flat2_p[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[2])), 4);
-    flat2_q[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[2])), 4);
-
+    sum_p = _mm_sub_epi16(sum_p, q[4]);
+    sum_q = _mm_sub_epi16(sum_q, p[4]);
+    flat2_p[2] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
+        4);
+    flat2_q[2] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
+        4);
     sum_lp = _mm_sub_epi16(sum_lp, q[1]);
     sum_lq = _mm_sub_epi16(sum_lq, p[1]);
     flat_p[2] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
     flat_q[2] =
         _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
-
-    int i;
-    for (i = 3; i < 7; ++i) {
-      sum_p7 = _mm_add_epi16(sum_p7, p[7]);
-      sum_q7 = _mm_add_epi16(sum_q7, q[7]);
-      sum_p = _mm_sub_epi16(sum_p, q[7 - i]);
-      sum_q = _mm_sub_epi16(sum_q, p[7 - i]);
-      flat2_p[i] =
-          _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[i])), 4);
-      flat2_q[i] =
-          _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[i])), 4);
-    }
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[3]);
+    sum_q = _mm_sub_epi16(sum_q, p[3]);
+    flat2_p[3] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
+        4);
+    flat2_q[3] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
+        4);
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[2]);
+    sum_q = _mm_sub_epi16(sum_q, p[2]);
+    flat2_p[4] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
+        4);
+    flat2_q[4] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
+        4);
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[1]);
+    sum_q = _mm_sub_epi16(sum_q, p[1]);
+    flat2_p[5] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
+        4);
+    flat2_q[5] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
+        4);
   }
-
   // highbd_filter8
   p[2] = _mm_andnot_si128(flat, p[2]);
   //  p2 remains unchanged if !(flat && mask)
@@ -320,7 +633,6 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
   q[2] = _mm_andnot_si128(flat, q[2]);
   flat_q[2] = _mm_and_si128(flat, flat_q[2]);
   q[2] = _mm_or_si128(q[2], flat_q[2]);  // full list of q2 values
-
   int i;
   for (i = 1; i >= 0; i--) {
     ps[i] = _mm_andnot_si128(flat, ps[i]);
@@ -330,675 +642,979 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
     flat_q[i] = _mm_and_si128(flat, flat_q[i]);
     q[i] = _mm_or_si128(qs[i], flat_q[i]);
   }
-
   // highbd_filter16
-
-  if (pixel_output == FOUR_PIXELS) {
-    for (i = 6; i >= 0; i--) {
-      //  p[i] remains unchanged if !(flat2 && flat && mask)
-      p[i] = _mm_andnot_si128(flat2, p[i]);
-      flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
-      //  get values for when (flat2 && flat && mask)
-      p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
-
-      q[i] = _mm_andnot_si128(flat2, q[i]);
-      flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
-      q[i] = _mm_or_si128(q[i], flat2_q[i]);
-      _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]);
-      _mm_storel_epi64((__m128i *)(s + i * pitch), q[i]);
-    }
-  } else {  // EIGHT_PIXELS
-    for (i = 6; i >= 0; i--) {
-      //  p[i] remains unchanged if !(flat2 && flat && mask)
-      p[i] = _mm_andnot_si128(flat2, p[i]);
-      flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
-      //  get values for when (flat2 && flat && mask)
-      p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
-
-      q[i] = _mm_andnot_si128(flat2, q[i]);
-      flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
-      q[i] = _mm_or_si128(q[i], flat2_q[i]);
-      _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
-      _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
-    }
+  for (i = 5; i >= 0; i--) {
+    //  p[i] remains unchanged if !(flat2 && flat && mask)
+    p[i] = _mm_andnot_si128(flat2, p[i]);
+    flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+    //  get values for when (flat2 && flat && mask)
+    p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
+    q[i] = _mm_andnot_si128(flat2, q[i]);
+    flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+    q[i] = _mm_or_si128(q[i], flat2_q[i]);
   }
 }
 
-// Note:
-//  highbd_lpf_horz_edge_8_8p() output 8 pixels per register
-//  highbd_lpf_horz_edge_8_4p() output 4 pixels per register
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch,
-                                             const uint8_t *blt,
-                                             const uint8_t *lt,
-                                             const uint8_t *thr, int bd) {
-  highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS);
-}
-#endif  // #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-
-static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch,
-                                             const uint8_t *blt,
-                                             const uint8_t *lt,
-                                             const uint8_t *thr, int bd) {
-  highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, EIGHT_PIXELS);
-}
-
-void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
-                                           const uint8_t *_blimit,
-                                           const uint8_t *_limit,
-                                           const uint8_t *_thresh, int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
-#else
-  highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd);
-#endif
-}
-
-void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p,
-                                            const uint8_t *_blimit,
-                                            const uint8_t *_limit,
-                                            const uint8_t *_thresh, int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
-#else
-  highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd);
-  highbd_lpf_horz_edge_8_8p(s + 8, p, _blimit, _limit, _thresh, bd);
-#endif
-}
-
-static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1,
-                                      const __m128i *p0, const __m128i *q0,
-                                      const __m128i *q1, const __m128i *q2,
-                                      int p, uint16_t *s) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  _mm_storel_epi64((__m128i *)(s - 3 * p), *p2);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), *p1);
-  _mm_storel_epi64((__m128i *)(s - 1 * p), *p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), *q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), *q1);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), *q2);
-#else
-  _mm_store_si128((__m128i *)(s - 3 * p), *p2);
-  _mm_store_si128((__m128i *)(s - 2 * p), *p1);
-  _mm_store_si128((__m128i *)(s - 1 * p), *p0);
-  _mm_store_si128((__m128i *)(s + 0 * p), *q0);
-  _mm_store_si128((__m128i *)(s + 1 * p), *q1);
-  _mm_store_si128((__m128i *)(s + 2 * p), *q2);
-#endif
+void aom_highbd_lpf_horizontal_14_dual_sse2(
+    uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i p[7], q[7];
+  int i;
+  load_highbd_pixel(s, 7, pitch, p, q);
+
+  highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
+                                   _limit1, _thresh1, bd);
+
+  for (i = 0; i < 6; i++) {
+    _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
+    _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
+  }
 }
 
-void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
+    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+    __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
+    const uint8_t *_limit, const uint8_t *_thresh, int bd) {
   __m128i blimit, limit, thresh;
   __m128i mask, hev, flat;
-  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i ffff = _mm_cmpeq_epi16(one, one);
-  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-  const __m128i four = _mm_set1_epi16(4);
-  __m128i workp_a, workp_b, workp_shft;
+  __m128i pq[3];
+  __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
+  __m128i flat_p1p0, flat_q0q1;
 
-  const __m128i t4 = _mm_set1_epi16(4);
-  const __m128i t3 = _mm_set1_epi16(3);
+  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i four = _mm_set1_epi16(4);
   __m128i t80;
-  const __m128i t1 = _mm_set1_epi16(0x1);
-  __m128i ps1, ps0, qs0, qs1;
-  __m128i filt;
-  __m128i work_a;
-  __m128i filter1, filter2;
-
-  if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
-    t80 = _mm_set1_epi16(0x80);
-  } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
-    t80 = _mm_set1_epi16(0x200);
-  } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
-    t80 = _mm_set1_epi16(0x800);
+  const __m128i one = _mm_set1_epi16(0x1);
+
+  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+  highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hev, &mask);
+
+  // flat_mask
+  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
+  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi64(flat, flat);
+
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+    // op1
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+                            _mm_add_epi16(*p1, *p1));  // *p0 *2 + *p1 * 2
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+
+    // op0
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
+    workp_a =
+        _mm_add_epi16(workp_a,
+                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+
+    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+                            *p1);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 + 4
+    workp_b = _mm_add_epi16(*q1, *q2);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 * 2 + *q2 + 4
+
+    // oq1
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+                            *p0);  // *p0   + *q0 * 2 + *q1 * 2 + *q2 + 4
+    workp_b = _mm_add_epi16(*q2, *q2);
+    workp_shft1 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0  + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+
+    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
   }
+  // lp filter
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
 
-  ps1 = _mm_subs_epi16(p1, t80);
-  ps0 = _mm_subs_epi16(p0, t80);
-  qs0 = _mm_subs_epi16(q0, t80);
-  qs1 = _mm_subs_epi16(q1, t80);
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 
-  // filter_mask and hev_mask
-  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
-  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+}
 
-  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
-  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
-  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
-  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
+    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+    __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
+    const unsigned char *_thresh0, const unsigned char *_blimit1,
+    const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i blimit0, limit0, thresh0;
+  __m128i t80;
+  __m128i mask, flat, work;
+  __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
+  __m128i op1, op0, oq0, oq1;
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i one = _mm_set1_epi16(0x1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit0, &limit0, &thresh0, &t80);
+
+  abs_p2p1 = abs_diff16(*p2, *p1);
+  abs_p1p0 = abs_diff16(*p1, *p0);
+  abs_q1q0 = abs_diff16(*q1, *q0);
+  abs_q2q1 = abs_diff16(*q2, *q1);
+
+  abs_p0q0 = abs_diff16(*p0, *q0);
+  abs_p1q1 = abs_diff16(*p1, *q1);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
-  mask = _mm_max_epi16(abs_p1p0, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  mask = _mm_max_epi16(abs_q1q0, mask);
-  // mask |= (abs(q1 - q0) > limit) * -1;
-
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
-      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
-  mask = _mm_max_epi16(work, mask);
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
-      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+  mask = _mm_max_epi16(abs_q2q1, mask);
+  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
   mask = _mm_max_epi16(work, mask);
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_max_epi16(mask, abs_p2p1);
+  mask = _mm_subs_epu16(mask, limit0);
   mask = _mm_cmpeq_epi16(mask, zero);
 
+  // flat_mask
+  flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
+  flat = _mm_max_epi16(flat, work);
+
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
+
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+    // op1
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+                            _mm_add_epi16(*p1, *p1));  // *p0 *2 + *p1 * 2
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+    op1 = _mm_srli_epi16(workp_shft0, 3);
+
+    // op0
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
+    workp_a =
+        _mm_add_epi16(workp_a,
+                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+    op0 = _mm_srli_epi16(workp_a, 3);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+                            *p1);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 + 4
+    workp_b = _mm_add_epi16(*q1, *q2);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 * 2 + *q2 + 4
+    oq0 = _mm_srli_epi16(workp_shft0, 3);
+
+    // oq1
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+                            *p0);  // *p0   + *q0 * 2 + *q1 * 2 + *q2 + 4
+    workp_b = _mm_add_epi16(*q2, *q2);
+    workp_shft1 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0  + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+    oq1 = _mm_srli_epi16(workp_shft1, 3);
+  }
+  // lp filter
+  __m128i ps[2], qs[2], p[2], q[2];
+  {
+    p[0] = *p0;
+    p[1] = *p1;
+    q[0] = *q0;
+    q[1] = *q1;
+    // filter_mask and hev_mask
+    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+  }
+
+  qs[0] = _mm_andnot_si128(flat, qs[0]);
+  oq0 = _mm_and_si128(flat, oq0);
+  *q0 = _mm_or_si128(qs[0], oq0);
+
+  qs[1] = _mm_andnot_si128(flat, qs[1]);
+  oq1 = _mm_and_si128(flat, oq1);
+  *q1 = _mm_or_si128(qs[1], oq1);
+
+  ps[0] = _mm_andnot_si128(flat, ps[0]);
+  op0 = _mm_and_si128(flat, op0);
+  *p0 = _mm_or_si128(ps[0], op0);
+
+  ps[1] = _mm_andnot_si128(flat, ps[1]);
+  op1 = _mm_and_si128(flat, op1);
+  *p1 = _mm_or_si128(ps[1], op1);
+}
+
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
+
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+
+  highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
+                             _blimit, _limit, _thresh, bd);
+
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
+}
+
+void aom_highbd_lpf_horizontal_6_dual_sse2(
+    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i p2, p1, p0, q0, q1, q2;
+
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+
+  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+                                  _limit0, _thresh0, _blimit1, _limit1,
+                                  _thresh1, bd);
+
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+    const unsigned char *_blimit, const unsigned char *_limit,
+    const unsigned char *_thresh, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i blimit, limit, thresh;
+  __m128i mask, hev, flat;
+  __m128i pq[4];
+  __m128i p1p0, q1q0, ps1ps0, qs1qs0;
+  __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1;
+
+  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+  pq[3] = _mm_unpacklo_epi64(*p3, *q3);
+
+  __m128i abs_p1p0;
+
+  const __m128i four = _mm_set1_epi16(4);
+  __m128i t80;
+  const __m128i one = _mm_set1_epi16(0x1);
+
+  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hev, &mask);
+
   // flat_mask4
-  flat = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
-      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
-      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
-  flat = _mm_max_epi16(work, flat);
+  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
   flat = _mm_max_epi16(abs_p1p0, flat);
-  flat = _mm_max_epi16(abs_q1q0, flat);
+  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
 
-  if (bd == 8)
-    flat = _mm_subs_epu16(flat, one);
-  else if (bd == 10)
-    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
-  else  // bd == 12
-    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
 
   flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);  // flat & mask
+  flat = _mm_and_si128(flat, mask);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi64(flat, flat);
 
-  // Added before shift for rounding part of ROUND_POWER_OF_TWO
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+    // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+    // o*p2
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+    op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // o*p1
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+    // o*p0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
+
+    // oq0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+    // oq1
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
+
+    // oq2
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+    oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  }
+
+  // lp filter
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
 
-  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
-  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 
-  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+  work_a = _mm_andnot_si128(flat, *q2);
+  *q2 = _mm_and_si128(flat, oq2);
+  *q2 = _mm_or_si128(work_a, *q2);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+  work_a = _mm_andnot_si128(flat, *p2);
+  *p2 = _mm_and_si128(flat, op2);
+  *p2 = _mm_or_si128(work_a, *p2);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
+    const unsigned char *_limit0, const unsigned char *_thresh0,
+    const unsigned char *_blimit1, const unsigned char *_limit1,
+    const unsigned char *_thresh1, int bd) {
+  __m128i blimit0, limit0, thresh0;
+  __m128i t80;
+  __m128i mask, flat;
+  __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
+  __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i one = _mm_set1_epi16(0x1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit0, &limit0, &thresh0, &t80);
+
+  abs_p0q0 = abs_diff16(*p0, *q0);
+  abs_p1q1 = abs_diff16(*p1, *q1);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2  > blimit) * -1;
+
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+  work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
+  work1 =
+      _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0));  // tbu 4 flat
+  work0 = _mm_max_epi16(work0, work1);
+  work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
+  work2 = _mm_max_epi16(work2, work0);
+  mask = _mm_max_epi16(work2, mask);
+
+  mask = _mm_subs_epu16(mask, limit0);
+  mask = _mm_cmpeq_epi16(mask, zero);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+  flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
+  flat = _mm_max_epi16(work1, flat);
+  work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
+  flat = _mm_max_epi16(work0, flat);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
+
+  {
+    __m128i workp_a, workp_b;
+    // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+    // o*p2
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+    op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // o*p1
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+    op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // o*p0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+    op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+    oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq1
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+    oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq2
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+    oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  }
 
   // lp filter
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
-  const __m128i pmin = _mm_subs_epi16(zero, t80);
+  __m128i ps[2], qs[2], p[2], q[2];
+  {
+    p[0] = *p0;
+    p[1] = *p1;
+    q[0] = *q0;
+    q[1] = *q1;
+    // filter_mask and hev_mask
+    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+  }
 
-  filt = _mm_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
+  qs[0] = _mm_andnot_si128(flat, qs[0]);
+  oq0 = _mm_and_si128(flat, oq0);
+  *q0 = _mm_or_si128(qs[0], oq0);
 
-  filt = _mm_and_si128(filt, hev);
-  work_a = _mm_subs_epi16(qs0, ps0);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm_and_si128(filt, mask);
+  qs[1] = _mm_andnot_si128(flat, qs[1]);
+  oq1 = _mm_and_si128(flat, oq1);
+  *q1 = _mm_or_si128(qs[1], oq1);
 
-  filter1 = _mm_adds_epi16(filt, t4);
-  filter2 = _mm_adds_epi16(filt, t3);
+  ps[0] = _mm_andnot_si128(flat, ps[0]);
+  op0 = _mm_and_si128(flat, op0);
+  *p0 = _mm_or_si128(ps[0], op0);
 
-  // Filter1 >> 3
-  pixel_clamp(&pmin, &pmax, &filter1);
-  filter1 = _mm_srai_epi16(filter1, 3);
+  ps[1] = _mm_andnot_si128(flat, ps[1]);
+  op1 = _mm_and_si128(flat, op1);
+  *p1 = _mm_or_si128(ps[1], op1);
 
-  // Filter2 >> 3
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter2 = _mm_srai_epi16(filter2, 3);
+  work_a = _mm_andnot_si128(flat, *q2);
+  *q2 = _mm_and_si128(flat, oq2);
+  *q2 = _mm_or_si128(work_a, *q2);
 
-  // filt >> 1
-  filt = _mm_adds_epi16(filter1, t1);
-  filt = _mm_srai_epi16(filt, 1);
-  filt = _mm_andnot_si128(hev, filt);
-
-  work_a = _mm_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  q0 = _mm_load_si128((__m128i *)flat_oq0);
-  work_a = _mm_andnot_si128(flat, work_a);
-  q0 = _mm_and_si128(flat, q0);
-  q0 = _mm_or_si128(work_a, q0);
-
-  work_a = _mm_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  q1 = _mm_load_si128((__m128i *)flat_oq1);
-  work_a = _mm_andnot_si128(flat, work_a);
-  q1 = _mm_and_si128(flat, q1);
-  q1 = _mm_or_si128(work_a, q1);
-
-  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q2 = _mm_load_si128((__m128i *)flat_oq2);
-  work_a = _mm_andnot_si128(flat, work_a);
-  q2 = _mm_and_si128(flat, q2);
-  q2 = _mm_or_si128(work_a, q2);
-
-  work_a = _mm_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  p0 = _mm_load_si128((__m128i *)flat_op0);
-  work_a = _mm_andnot_si128(flat, work_a);
-  p0 = _mm_and_si128(flat, p0);
-  p0 = _mm_or_si128(work_a, p0);
-
-  work_a = _mm_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  p1 = _mm_load_si128((__m128i *)flat_op1);
-  work_a = _mm_andnot_si128(flat, work_a);
-  p1 = _mm_and_si128(flat, p1);
-  p1 = _mm_or_si128(work_a, p1);
-
-  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p2 = _mm_load_si128((__m128i *)flat_op2);
-  work_a = _mm_andnot_si128(flat, work_a);
-  p2 = _mm_and_si128(flat, p2);
-  p2 = _mm_or_si128(work_a, p2);
-
-  store_horizontal_8(&p2, &p1, &p0, &q0, &q1, &q2, p, s);
+  work_a = _mm_andnot_si128(flat, *p2);
+  *p2 = _mm_and_si128(flat, op2);
+  *p2 = _mm_or_si128(work_a, *p2);
+}
+
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+  __m128i q1q0, p1p0;
+
+  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+
+  highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
+                             &p1p0, _blimit, _limit, _thresh, bd);
+
+  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
 }
 
 void aom_highbd_lpf_horizontal_8_dual_sse2(
     uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
     const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
     const uint8_t *_thresh1, int bd) {
-  aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  aom_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+
+  highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0,
+                                  _blimit0, _limit0, _thresh0, _blimit1,
+                                  _limit1, _thresh1, bd);
+
+  _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+  _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
 }
 
-void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  const __m128i zero = _mm_set1_epi16(0);
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
+    __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
+    const uint8_t *_thresh, int bd) {
   __m128i blimit, limit, thresh;
-  __m128i mask, hev, flat;
-#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
-  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-#endif
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
-  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-#endif
-  const __m128i abs_p1p0 =
-      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
-  const __m128i abs_q1q0 =
-      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+  __m128i mask, hev;
+  __m128i p1p0, q1q0;
+  __m128i pq[2];
+
+  __m128i abs_p1p0;
+
+  __m128i t80;
+  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+
+  highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hev, &mask);
+
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
+    __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i blimit0, limit0, thresh0;
+  __m128i mask, flat;
+  __m128i p[2], q[2];
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i abs_p0q0 = abs_diff16(*q0, *p0);
+  __m128i abs_p1q1 = abs_diff16(*q1, *p1);
+
+  __m128i abs_p1p0 = abs_diff16(*p1, *p0);
+  __m128i abs_q1q0 = abs_diff16(*q1, *q0);
+
   const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
   const __m128i one = _mm_set1_epi16(1);
-  __m128i abs_p0q0 =
-      _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
-  __m128i abs_p1q1 =
-      _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
 
-  const __m128i t4 = _mm_set1_epi16(4);
-  const __m128i t3 = _mm_set1_epi16(3);
   __m128i t80;
-  __m128i tff80;
-  __m128i tffe0;
-  __m128i t1f;
-  // equivalent to shifting 0x1f left by bitdepth - 8
-  // and setting new bits to 1
-  const __m128i t1 = _mm_set1_epi16(0x1);
-  __m128i t7f;
-  // equivalent to shifting 0x7f left by bitdepth - 8
-  // and setting new bits to 1
-  __m128i ps1, ps0, qs0, qs1;
-  __m128i filt;
-  __m128i work_a;
-  __m128i filter1, filter2;
-
-  if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
-    t80 = _mm_set1_epi16(0x80);
-    tff80 = _mm_set1_epi16(0xff80);
-    tffe0 = _mm_set1_epi16(0xffe0);
-    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
-    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
-  } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
-    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
-    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
-    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
-    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
-    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
-  } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
-    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
-    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
-    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
-    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
-    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
-  }
 
-  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit0, &limit0, &thresh0, &t80);
 
   // filter_mask and hev_mask
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
-  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
   mask = _mm_max_epi16(flat, mask);
 
-#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
-  __m128i work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
-      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
-  mask = _mm_max_epi16(work, mask);
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
-      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
-  mask = _mm_max_epi16(work, mask);
-#endif
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit0);
   mask = _mm_cmpeq_epi16(mask, zero);
 
-  // filter4
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
-  const __m128i pmin = _mm_subs_epi16(zero, t80);
-
-  filt = _mm_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm_and_si128(filt, hev);
-  work_a = _mm_subs_epi16(qs0, ps0);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  pixel_clamp(&pmin, &pmax, &filt);
-
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  filt = _mm_and_si128(filt, mask);
-
-  filter1 = _mm_adds_epi16(filt, t4);
-  pixel_clamp(&pmin, &pmax, &filter1);
-
-  filter2 = _mm_adds_epi16(filt, t3);
-  pixel_clamp(&pmin, &pmax, &filter2);
-
-  // Filter1 >> 3
-  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
-  filter1 = _mm_srli_epi16(filter1, 3);
-  work_a = _mm_and_si128(work_a, tffe0);    // sign bits for the values < 0
-  filter1 = _mm_and_si128(filter1, t1f);    // clamp the range
-  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits
+  p[0] = *p0;
+  p[1] = *p1;
+  q[0] = *q0;
+  q[1] = *q1;
 
-  // Filter2 >> 3
-  work_a = _mm_cmpgt_epi16(zero, filter2);
-  filter2 = _mm_srli_epi16(filter2, 3);
-  work_a = _mm_and_si128(work_a, tffe0);
-  filter2 = _mm_and_si128(filter2, t1f);
-  filter2 = _mm_or_si128(filter2, work_a);
+  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+}
 
-  // filt >> 1
-  filt = _mm_adds_epi16(filter1, t1);
-  work_a = _mm_cmpgt_epi16(zero, filt);
-  filt = _mm_srli_epi16(filt, 1);
-  work_a = _mm_and_si128(work_a, tff80);
-  filt = _mm_and_si128(filt, t7f);
-  filt = _mm_or_si128(filt, work_a);
-
-  filt = _mm_andnot_si128(hev, filt);
-
-  q0 = _mm_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &q0);
-  q0 = _mm_adds_epi16(q0, t80);
-
-  q1 = _mm_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &q1);
-  q1 = _mm_adds_epi16(q1, t80);
-
-  p0 = _mm_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &p0);
-  p0 = _mm_adds_epi16(p0, t80);
-
-  p1 = _mm_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &p1);
-  p1 = _mm_adds_epi16(p1, t80);
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-#else
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-#endif
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i p1p0, q1q0;
+  __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+  highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
+                             _thresh, bd);
+
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
 }
 
 void aom_highbd_lpf_horizontal_4_dual_sse2(
     uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
     const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
     const uint8_t *_thresh1, int bd) {
-  aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  __m128i ps[2], qs[2];
+
+  highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0,
+                                  _thresh0, _blimit1, _limit1, _thresh1, bd);
+
+  _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]);
 }
 
 void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit, const uint8_t *thresh,
                                     int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
-  uint16_t *src[1];
-  uint16_t *dst[1];
+  __m128i x0, x1, x2, x3, d0, d1, d2, d3;
+  __m128i p1p0, q1q0;
+  __m128i p1, q1;
 
-  // Transpose 8x8
-  src[0] = s - 4;
-  dst[0] = t_dst;
+  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+  highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
+                             thresh, bd);
+
+  p1 = _mm_srli_si128(p1p0, 8);
+  q1 = _mm_srli_si128(q1q0, 8);
 
-  src[0] = t_dst;
-  dst[0] = s - 4;
+  // transpose from 8x4 to 4x8
+  highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
 
-  // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
 }
 
 void aom_highbd_lpf_vertical_4_dual_sse2(
     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i ps[2], qs[2];
 
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
+
+  highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+                               &d2, &d3);
+
+  highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
+                                  thresh0, blimit1, limit1, thresh1, bd);
+
+  highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
+                               &d3, &d4, &d5, &d6, &d7);
+
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
+}
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
+                                    const uint8_t *limit, const uint8_t *thresh,
+                                    int bd) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x3, x2, x1, x0, p0, q0;
+  __m128i p1p0, q1q0;
+
+  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+
+  highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
+                               &d6, &d7);
+
+  highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
+                             limit, thresh, bd);
+
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
 
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
+  highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_6_dual_sse2(
+    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i p0, q0, p1, q1, p2, q2;
+
+  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+  x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
+  x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
+  x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
+  x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
+
+  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
+                           &p0, &q0, &q1, &q2, &d6, &d7);
+
+  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+                                  _limit0, _thresh0, _blimit1, _limit1,
+                                  _thresh1, bd);
+
+  highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
+                               &d6, &d7);
+
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
 }
 
 void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit, const uint8_t *thresh,
                                     int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
-  uint16_t *src[1];
-  uint16_t *dst[1];
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i p2, p1, p0, p3, q0;
+  __m128i q1q0, p1p0;
 
-  // Transpose 8x8
-  src[0] = s - 4;
-  dst[0] = t_dst;
+  p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
+  p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
+  p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
+                               &d6, &d7);
 
   // Loop filtering
-  aom_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+  highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
+                             &p1p0, blimit, limit, thresh, bd);
 
-  src[0] = t_dst;
-  dst[0] = s - 4;
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
 
-  // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
+                               &d1, &d2, &d3);
+
+  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
+  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
+  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
+  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
 }
 
 void aom_highbd_lpf_vertical_8_dual_sse2(
     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+  x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
+  x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
+  x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
+  x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
+  x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
+  x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
+  x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
+  x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
+
+  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+                           &d2, &d3, &d4, &d5, &d6, &d7);
+
+  highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
+                                  blimit0, limit0, thresh0, blimit1, limit1,
+                                  thresh1, bd);
+
+  highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
+                           &x2, &x3, &x4, &x5, &x6, &x7);
+
+  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
+  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
+  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
+  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
+  _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
+  _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
+  _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
+  _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
+}
 
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh, int bd) {
+  __m128i q[7], p[7], pq[7];
+  __m128i p6, p5, p4, p3;
+  __m128i p6_2, p5_2, p4_2, p3_2;
+  __m128i d0, d1, d2, d3;
+  __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
+  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
 
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
+                               &p[3], &p[2], &p[1], &p[0]);
 
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
-}
+  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
 
-void aom_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                     const uint8_t *limit,
-                                     const uint8_t *thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
+  highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
+                               &q[3], &q[4], &q[5], &q[6], &d7_2);
 
-  src[0] = s - 8;
-  src[1] = s;
-  dst[0] = t_dst;
-  dst[1] = t_dst + 8 * 8;
+  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
 
-  // Transpose 16x8
-  highbd_transpose(src, p, dst, 8, 2);
+  highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
+                               &pq[1], &pq[0], &d0, &d1, &d2, &d3);
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
-                                        bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8 * 8;
-  dst[0] = s - 8;
-  dst[1] = s;
-
-  // Transpose back
-  highbd_transpose(src, 8, dst, p, 2);
-}
-
-void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
-                                          const uint8_t *blimit,
-                                          const uint8_t *limit,
-                                          const uint8_t *thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
-
-  //  Transpose 16x16
-  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
-
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd);
-#else
-  aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit,
-                                         thresh, bd);
-#endif
-  //  Transpose back
-  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  q[0] = _mm_srli_si128(pq[0], 8);
+  q[1] = _mm_srli_si128(pq[1], 8);
+  q[2] = _mm_srli_si128(pq[2], 8);
+  q[3] = _mm_srli_si128(pq[3], 8);
+  q[4] = _mm_srli_si128(pq[4], 8);
+  q[5] = _mm_srli_si128(pq[5], 8);
+
+  highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
+                               &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
+  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
+  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
+  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
+  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
+}
+
+void aom_highbd_lpf_vertical_14_dual_sse2(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  __m128i q[7], p[7];
+  __m128i p6, p5, p4, p3, p2, p1, p0, q0;
+  __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
+  __m128i d0, d7;
+  __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
+
+  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
+
+  highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
+                           &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
+
+  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+  p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+  p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+  p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+  q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+
+  highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
+                           &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
+                           &q[6], &d7);
+
+  highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
+                                   limit1, thresh1, bd);
+
+  highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
+                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+                           &d6_out, &d7_out);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
+
+  highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
+                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+                           &d6_out, &d7_out);
+
+  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
+  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
+  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
+  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
+  _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
+  _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
+  _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
+  _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
 }
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
index 2bbf15ef2..dea113a29 100644
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -11,7 +11,8 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
index 855bc6558..e0d22522d 100644
--- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -288,11 +288,9 @@ HIGH_SADNXN4D  8,  8
 HIGH_SADNXN4D  8,  4
 HIGH_SADNXN4D  4,  8
 HIGH_SADNXN4D  4,  4
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SADNXN4D  4, 16
 HIGH_SADNXN4D 16,  4
 HIGH_SADNXN4D  8, 32
 HIGH_SADNXN4D 32,  8
 HIGH_SADNXN4D 16, 64
 HIGH_SADNXN4D 64, 16
-%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
index 760e68aab..3398d8a2a 100644
--- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
@@ -158,10 +158,8 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
 HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2
-%endif
 
 ; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -230,10 +228,8 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2
 HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2
-%endif
 
 ; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -302,12 +298,10 @@ HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
 HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2
 HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2
 HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2
 HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
-%endif
 
 ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -376,7 +370,5 @@ HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
 HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
 HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
 HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
-%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index ee19796e3..61f5b8e86 100644
--- a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -94,7 +94,7 @@ SECTION .text
 %define filter_idx_shift 5
 
 
-%ifdef PIC    ; 64bit PIC
+%if ARCH_X86_64
   %if %2 == 1 ; avg
     cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                       x_offset, y_offset, \
@@ -102,19 +102,20 @@ SECTION .text
                                       sec, sec_stride, height, sse
     %define sec_str sec_strideq
   %else
-    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
-                                  y_offset, dst, dst_stride, height, sse
+    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, height, sse
   %endif
   %define block_height heightd
   %define bilin_filter sseq
 %else
-  %if ARCH_X86=1 && CONFIG_PIC=1
+  %if CONFIG_PIC=1
     %if %2 == 1 ; avg
       cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                  x_offset, y_offset, \
-                                  dst, dst_stride, \
-                                  sec, sec_stride, \
-                                  height, sse, g_bilin_filter, g_pw_8
+                                        x_offset, y_offset, \
+                                        dst, dst_stride, \
+                                        sec, sec_stride, height, sse, \
+                                        g_bilin_filter, g_pw_8
       %define block_height dword heightm
       %define sec_str sec_stridemp
 
@@ -133,8 +134,9 @@ SECTION .text
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                x_offset, y_offset, dst, dst_stride, height, \
-                                sse, g_bilin_filter, g_pw_8
+                                    x_offset, y_offset, \
+                                    dst, dst_stride, height, sse, \
+                                    g_bilin_filter, g_pw_8
       %define block_height heightd
 
       ; Store bilin_filter and pw_8 location in stack
@@ -153,22 +155,16 @@ SECTION .text
     %endif
   %else
     %if %2 == 1 ; avg
-      cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-                                             x_offset, y_offset, \
-                                             dst, dst_stride, \
-                                             sec, sec_stride, \
-                                             height, sse
-      %if ARCH_X86_64
-      %define block_height heightd
-      %define sec_str sec_strideq
-      %else
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                        x_offset, y_offset, \
+                                        dst, dst_stride, \
+                                        sec, sec_stride, height, sse
       %define block_height dword heightm
       %define sec_str sec_stridemp
-      %endif
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                              x_offset, y_offset, dst, dst_stride, height, sse
+                                    x_offset, y_offset, \
+                                    dst, dst_stride, height, sse
       %define block_height heightd
     %endif
 
@@ -287,14 +283,14 @@ SECTION .text
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+y_offsetq]
   mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -311,7 +307,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -514,14 +510,14 @@ SECTION .text
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+y_offsetq]
   mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -538,7 +534,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -636,14 +632,14 @@ SECTION .text
   jnz .x_nonhalf_y_nonzero
 
   ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
   mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -660,7 +656,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -735,14 +731,14 @@ SECTION .text
   jne .x_nonhalf_y_nonhalf
 
   ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
   mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -759,7 +755,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -862,8 +858,8 @@ SECTION .text
 
 .x_nonhalf_y_nonhalf:
 ; loading filter - this is same as in 8-bit depth
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
   shl           y_offsetd, filter_idx_shift
@@ -872,7 +868,7 @@ SECTION .text
   mova                 m9, [bilin_filter+x_offsetq+16]
   mova                m10, [bilin_filter+y_offsetq]
   mova                m11, [bilin_filter+y_offsetq+16]
-  mova                m12, [pw_8]
+  mova                m12, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_y_a m10
@@ -900,7 +896,7 @@ SECTION .text
 %define filter_x_b [x_offsetq+16]
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 ; end of load filter
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
index befd81269..18eb03d12 100644
--- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
@@ -13,8 +13,8 @@
 #include <emmintrin.h>
 #include <stddef.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
                                     const uint16_t *src, ptrdiff_t src_stride,
@@ -204,21 +204,15 @@ SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); }
 SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); }
 SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); }
 SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); }
-#if CONFIG_EXT_PARTITION
 SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); }
 SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); }
 SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); }
-#endif
 SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); }
 SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); }
 SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); }
 SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); }
 SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); }
 SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); }
-#if CONFIG_EXT_PARTITION
-SUBTRACT_FUN(32x128) { STACK_V(64, subtract_32x64); }
-SUBTRACT_FUN(128x32) { STACK_H(64, subtract_64x32); }
-#endif
 
 static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
   if (rows == 4) {
@@ -244,25 +238,17 @@ static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
     if (cols == 16) return subtract_16x32;
     if (cols == 32) return subtract_32x32;
     if (cols == 64) return subtract_64x32;
-#if CONFIG_EXT_PARTITION
-    if (cols == 128) return subtract_128x32;
-#endif  // CONFIG_EXT_PARTITION
   }
   if (rows == 64) {
     if (cols == 16) return subtract_16x64;
     if (cols == 32) return subtract_32x64;
     if (cols == 64) return subtract_64x64;
-#if CONFIG_EXT_PARTITION
     if (cols == 128) return subtract_128x64;
-#endif  // CONFIG_EXT_PARTITION
   }
-#if CONFIG_EXT_PARTITION
   if (rows == 128) {
-    if (cols == 32) return subtract_32x128;
     if (cols == 64) return subtract_64x128;
     if (cols == 128) return subtract_128x128;
   }
-#endif  // CONFIG_EXT_PARTITION
   assert(0);
   return NULL;
 }
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
index cf8ea498c..0d954e178 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
@@ -14,6 +14,8 @@
 
 %include "aom_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;unsigned int aom_highbd_calc16x16var_sse2
 ;(
 ;    unsigned char   *  src_ptr,
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
index 62acf3ed3..fdfadc886 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -12,13 +12,17 @@
 #include <assert.h>
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
 
 #include "aom_ports/mem.h"
 
-#include "./av1_rtcd.h"
 #include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
 
 typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
@@ -185,13 +189,11 @@ VAR_FN(16, 16, 16, 8);
 VAR_FN(16, 8, 8, 7);
 VAR_FN(8, 16, 8, 7);
 VAR_FN(8, 8, 8, 6);
-#if CONFIG_EXT_PARTITION_TYPES
 VAR_FN(16, 4, 16, 6);
 VAR_FN(8, 32, 8, 8);
-VAR_FN(32, 8, 16, 8);
+VAR_FN(32, 8, 8, 8);
 VAR_FN(16, 64, 16, 10);
 VAR_FN(64, 16, 16, 10);
-#endif
 
 #undef VAR_FN
 
@@ -398,7 +400,6 @@ DECLS(sse2);
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
 #define FNS(opt)                        \
   FN(64, 64, 16, 6, 6, opt, (int64_t)); \
   FN(64, 32, 16, 6, 5, opt, (int64_t)); \
@@ -416,20 +417,6 @@ DECLS(sse2);
   FN(32, 8, 16, 5, 3, opt, (int64_t));  \
   FN(16, 64, 16, 4, 6, opt, (int64_t)); \
   FN(64, 16, 16, 6, 4, opt, (int64_t))
-#else
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t))
-#endif
 
 FNS(sse2);
 
@@ -577,7 +564,6 @@ DECLS(sse2);
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
 #define FNS(opt)                        \
   FN(64, 64, 16, 6, 6, opt, (int64_t)); \
   FN(64, 32, 16, 6, 5, opt, (int64_t)); \
@@ -595,30 +581,104 @@ DECLS(sse2);
   FN(32, 8, 16, 5, 3, opt, (int64_t));  \
   FN(16, 64, 16, 4, 6, opt, (int64_t)); \
   FN(64, 16, 16, 6, 4, opt, (int64_t));
-#else
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t));
-#endif
 
 FNS(sse2);
 
 #undef FNS
 #undef FN
 
-void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
+void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
+                                    const struct AV1Common *const cm,
+                                    int mi_row, int mi_col, const MV *const mv,
+                                    uint16_t *comp_pred, int width, int height,
                                     int subpel_x_q3, int subpel_y_q3,
                                     const uint8_t *ref8, int ref_stride,
                                     int bd) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in
+      // build_inter_predictors() function, with some small tweaks.
+      uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred);
+
+      // Some assumptions.
+      const int plane = 0;
+
+      // Get pre-requisites.
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int ssx = pd->subsampling_x;
+      const int ssy = pd->subsampling_y;
+      assert(ssx == 0 && ssy == 0);
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+
+      // Calculate subpel_x/y and x/y_step.
+      const int row_start = 0;  // Because ss_y is 0.
+      const int col_start = 0;  // Because ss_x is 0.
+      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+      int orig_pos_y = pre_y << SUBPEL_BITS;
+      orig_pos_y += mv->row * (1 << (1 - ssy));
+      int orig_pos_x = pre_x << SUBPEL_BITS;
+      orig_pos_x += mv->col * (1 << (1 - ssx));
+      int pos_y = sf->scale_value_y(orig_pos_y, sf);
+      int pos_x = sf->scale_value_x(orig_pos_x, sf);
+      pos_x += SCALE_EXTRA_OFF;
+      pos_y += SCALE_EXTRA_OFF;
+
+      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                         << SCALE_SUBPEL_BITS;
+      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+                        << SCALE_SUBPEL_BITS;
+      pos_y = clamp(pos_y, top, bottom);
+      pos_x = clamp(pos_x, left, right);
+
+      const uint8_t *const pre =
+          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                           pos_x & SCALE_SUBPEL_MASK,
+                                           pos_y & SCALE_SUBPEL_MASK };
+
+      // Get warp types.
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mi->ref_frame[ref_num]];
+      const int is_global = is_global_mv_block(mi, wm->wmtype);
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global;
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      // Get convolve parameters.
+      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      const InterpFilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Get the inter predictor.
+      const int build_for_obmc = 0;
+      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
+                               &subpel_params, sf, width, height, &conv_params,
+                               filters, &warp_types, mi_x >> pd->subsampling_x,
+                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
+                               build_for_obmc, xd, cm->allow_warped_motion);
+      return;
+    }
+  }
+
+  const InterpFilterParams filter =
+      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
   if (!subpel_x_q3 && !subpel_y_q3) {
     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     if (width >= 8) {
@@ -648,54 +708,48 @@ void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
         ref += 2 * ref_stride;
       }
     }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
+                               width, kernel, 16, NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
+                              width, NULL, -1, kernel, 16, width, height, bd);
   } else {
-    InterpFilterParams filter;
-    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-    if (!subpel_y_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      aom_highbd_convolve8_horiz(ref8, ref_stride,
-                                 CONVERT_TO_BYTEPTR(comp_pred), width, kernel,
-                                 16, NULL, -1, width, height, bd);
-    } else if (!subpel_x_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
-                                width, NULL, -1, kernel, 16, width, height, bd);
-    } else {
-      DECLARE_ALIGNED(16, uint16_t,
-                      temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-      const int16_t *kernel_x;
-      const int16_t *kernel_y;
-      int intermediate_height;
-      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      intermediate_height =
-          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
-      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-      aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
-                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
-                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                                 intermediate_height, bd);
-      aom_highbd_convolve8_vert(
-          CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
-          MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
-          16, width, height, bd);
-    }
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
+                               ref_stride, CONVERT_TO_BYTEPTR(temp),
+                               MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                               intermediate_height, bd);
+    aom_highbd_convolve8_vert(
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+        MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
+        16, width, height, bd);
   }
 }
 
-void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
-                                             const uint8_t *pred8, int width,
-                                             int height, int subpel_x_q3,
-                                             int subpel_y_q3,
-                                             const uint8_t *ref8,
-                                             int ref_stride, int bd) {
+void aom_highbd_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd) {
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   int n;
   int i;
-  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
-                            ref8, ref_stride, bd);
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
   /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
   assert(!(width * height & 7));
   n = width * height >> 3;
@@ -707,3 +761,102 @@ void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
     pred += 8;
   }
 }
+
+static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+                                               const __m128i *w0,
+                                               const __m128i *w1,
+                                               const __m128i *r,
+                                               void *const result) {
+  assert(DIST_PRECISION_BITS <= 4);
+  __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
+  __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
+  __m128i sum = _mm_adds_epu16(mult0, mult1);
+  __m128i round = _mm_adds_epu16(sum, *r);
+  __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, shift);
+}
+
+void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
+                                       const uint8_t *pred8, int width,
+                                       int height, const uint8_t *ref8,
+                                       int ref_stride,
+                                       const JNT_COMP_PARAMS *jcp_param) {
+  int i;
+  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
+  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
+  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
+  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width >= 8) {
+    // Read 8 pixels one row at a time
+    assert(!(width & 7));
+    for (i = 0; i < height; ++i) {
+      int j;
+      for (j = 0; j < width; j += 8) {
+        __m128i p0 = xx_loadu_128(ref);
+        __m128i p1 = xx_loadu_128(pred);
+
+        highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+        comp_pred += 8;
+        pred += 8;
+        ref += 8;
+      }
+      ref += ref_stride - width;
+    }
+  } else {
+    // Read 4 pixels two rows at a time
+    assert(!(width & 3));
+    for (i = 0; i < height; i += 2) {
+      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+      __m128i p1 = xx_loadu_128(pred);
+
+      highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+      comp_pred += 8;
+      pred += 8;
+      ref += 2 * ref_stride;
+    }
+  }
+}
+
+void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) {
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  int n;
+  int i;
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
+  assert(!(width * height & 7));
+  n = width * height >> 3;
+
+  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
+  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
+  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
+  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  for (i = 0; i < n; i++) {
+    __m128i p0 = xx_loadu_128(comp_pred);
+    __m128i p1 = xx_loadu_128(pred);
+
+    highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+    comp_pred += 8;
+    pred += 8;
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
index cc7f52811..6c247a91b 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -11,8 +11,8 @@
 
 #include <smmintrin.h> /* SSE4.1 */
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/variance.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
index 6b8922b8c..1e67d392e 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_avx2.c
+++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
@@ -11,7 +11,20 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE __m256i dc_sum_64(const uint8_t *ref) {
+  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
+  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i y0 = _mm256_sad_epu8(x0, zero);
+  __m256i y1 = _mm256_sad_epu8(x1, zero);
+  y0 = _mm256_add_epi64(y0, y1);
+  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
+  y0 = _mm256_add_epi64(u0, y0);
+  u0 = _mm256_unpackhi_epi64(y0, y0);
+  return _mm256_add_epi16(y0, u0);
+}
 
 static INLINE __m256i dc_sum_32(const uint8_t *ref) {
   const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
@@ -25,13 +38,31 @@ static INLINE __m256i dc_sum_32(const uint8_t *ref) {
 
 static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
                                   ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
+  for (int i = 0; i < height; ++i) {
     _mm256_storeu_si256((__m256i *)dst, *r);
     dst += stride;
   }
 }
 
+static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
+                                    int height, uint8_t *dst,
+                                    ptrdiff_t stride) {
+  for (int i = 0; i < height; ++i) {
+    _mm256_storeu_si256((__m256i *)dst, *r0);
+    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
+    dst += stride;
+  }
+}
+
+static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
+                                  ptrdiff_t stride) {
+  for (int i = 0; i < height; ++i) {
+    _mm256_storeu_si256((__m256i *)dst, *r);
+    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
+    dst += stride;
+  }
+}
+
 void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const __m256i sum_above = dc_sum_32(above);
@@ -168,11 +199,58 @@ void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   uint32_t sum = _mm_cvtsi128_si32(left_sum);
   sum += 24;
   sum /= 48;
-
   const __m256i row = _mm256_set1_epi8((uint8_t)sum);
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i sum_above = dc_sum_32(above);
+  __m256i sum_left = dc_sum_64(left);
+  sum_left = _mm256_add_epi16(sum_left, sum_above);
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+  sum += 48;
+  sum /= 96;
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i sum_above = dc_sum_64(above);
+  __m256i sum_left = dc_sum_64(left);
+  sum_left = _mm256_add_epi16(sum_left, sum_above);
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+  sum += 64;
+  sum /= 128;
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i sum_above = dc_sum_64(above);
+  __m256i sum_left = dc_sum_32(left);
+  sum_left = _mm256_add_epi16(sum_left, sum_above);
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+  sum += 48;
+  sum /= 96;
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i sum_above = dc_sum_64(above);
+  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
+  sum_left = _mm256_add_epi16(sum_left, sum_above);
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+  sum += 40;
+  sum /= 80;
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -187,6 +265,62 @@ void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_32(above);
+  (void)left;
+
+  const __m256i sixteen = _mm256_set1_epi16(16);
+  sum = _mm256_add_epi16(sum, sixteen);
+  sum = _mm256_srai_epi16(sum, 5);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_64(above);
+  (void)left;
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_64(above);
+  (void)left;
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_64(above);
+  (void)left;
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
@@ -202,6 +336,63 @@ void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m256i sum = dc_sum_64(left);
+  (void)above;
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m256i sum = dc_sum_64(left);
+  (void)above;
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m256i sum = dc_sum_32(left);
+  (void)above;
+
+  const __m256i sixteen = _mm256_set1_epi16(16);
+  sum = _mm256_add_epi16(sum, sixteen);
+  sum = _mm256_srai_epi16(sum, 5);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i sum = dc_sum_16_sse2(left);
+  (void)above;
+
+  const __m128i eight = _mm_set1_epi16(8);
+  sum = _mm_add_epi16(sum, eight);
+  sum = _mm_srai_epi16(sum, 4);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i r = _mm_shuffle_epi8(sum, zero);
+  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -211,6 +402,42 @@ void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   const __m256i row = _mm256_loadu_si256((const __m256i *)above);
@@ -218,8 +445,39 @@ void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+  (void)left;
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+  (void)left;
+  row_store_32x2xh(&row0, &row1, 64, dst, stride);
+}
+
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+  (void)left;
+  row_store_32x2xh(&row0, &row1, 32, dst, stride);
+}
+
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+  (void)left;
+  row_store_32x2xh(&row0, &row1, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
-// TM_PRED
+// PAETH_PRED
 
 // Return 16 16-bit pixels in one row (__m256i)
 static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
@@ -336,6 +594,26 @@ void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i top = get_top_vector(above);
+
+  for (int j = 0; j < 4; ++j) {
+    const __m256i l = get_left_vector(left + j * 16);
+    __m256i rep = _mm256_set1_epi16(0x8000);
+    for (int i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+      _mm_store_si128((__m128i *)dst, row);
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);
+    }
+  }
+}
+
 // Return 32 8-bit pixels in one row (__m256i)
 static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
                                       const __m256i *top1,
@@ -411,3 +689,123 @@ void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
     rep = _mm256_add_epi16(rep, one);
   }
 }
+
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i t0 = get_top_vector(above);
+  const __m256i t1 = get_top_vector(above + 16);
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    const __m256i l = get_left_vector(left + j * 16);
+    __m256i rep = _mm256_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i t0 = get_top_vector(above);
+  const __m256i t1 = get_top_vector(above + 16);
+  const __m256i t2 = get_top_vector(above + 32);
+  const __m256i t3 = get_top_vector(above + 48);
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i, j;
+  for (j = 0; j < 2; ++j) {
+    const __m256i l = get_left_vector(left + j * 16);
+    __m256i rep = _mm256_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i t0 = get_top_vector(above);
+  const __m256i t1 = get_top_vector(above + 16);
+  const __m256i t2 = get_top_vector(above + 32);
+  const __m256i t3 = get_top_vector(above + 48);
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    const __m256i l = get_left_vector(left + j * 16);
+    __m256i rep = _mm256_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i t0 = get_top_vector(above);
+  const __m256i t1 = get_top_vector(above + 16);
+  const __m256i t2 = get_top_vector(above + 32);
+  const __m256i t3 = get_top_vector(above + 48);
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i;
+  const __m256i l = get_left_vector(left);
+  __m256i rep = _mm256_set1_epi16(0x8000);
+  for (i = 0; i < 16; ++i) {
+    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r1);
+    _mm_store_si128((__m128i *)(dst + 32), r2);
+    _mm_store_si128((__m128i *)(dst + 48), r3);
+
+    dst += stride;
+    rep = _mm256_add_epi16(rep, one);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
index 2a83b9001..5b2452c8e 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_sse2.c
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
@@ -11,11 +11,11 @@
 
 #include <emmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
-static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < 4; ++i) {
+static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
+                                ptrdiff_t stride) {
+  for (int i = 0; i < height; i += 2) {
     *(uint32_t *)dst = dc;
     dst += stride;
     *(uint32_t *)dst = dc;
@@ -51,6 +51,17 @@ static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
   }
 }
 
+static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
+                                 ptrdiff_t stride) {
+  for (int i = 0; i < height; ++i) {
+    _mm_store_si128((__m128i *)dst, *row);
+    _mm_store_si128((__m128i *)(dst + 16), *row);
+    _mm_store_si128((__m128i *)(dst + 32), *row);
+    _mm_store_si128((__m128i *)(dst + 48), *row);
+    dst += stride;
+  }
+}
+
 static INLINE __m128i dc_sum_4(const uint8_t *ref) {
   __m128i x = _mm_loadl_epi64((__m128i const *)ref);
   const __m128i zero = _mm_setzero_si128();
@@ -83,6 +94,34 @@ static INLINE __m128i dc_sum_32(const uint8_t *ref) {
   return _mm_add_epi16(x0, high);
 }
 
+static INLINE __m128i dc_sum_64(const uint8_t *ref) {
+  __m128i x0 = _mm_load_si128((__m128i const *)ref);
+  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
+  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
+  const __m128i zero = _mm_setzero_si128();
+  x0 = _mm_sad_epu8(x0, zero);
+  x1 = _mm_sad_epu8(x1, zero);
+  x2 = _mm_sad_epu8(x2, zero);
+  x3 = _mm_sad_epu8(x3, zero);
+  x0 = _mm_add_epi16(x0, x1);
+  x2 = _mm_add_epi16(x2, x3);
+  x0 = _mm_add_epi16(x0, x2);
+  const __m128i high = _mm_unpackhi_epi64(x0, x0);
+  return _mm_add_epi16(x0, high);
+}
+
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
+
+#define DC_SHIFT2 16
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+                                              int multiplier) {
+  const int interm = num >> shift1;
+  return interm * multiplier >> DC_SHIFT2;
+}
+
 // -----------------------------------------------------------------------------
 // DC_PRED
 
@@ -94,11 +133,26 @@ void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 6;
-  sum /= 12;
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
 
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   const uint32_t pred = _mm_cvtsi128_si32(row);
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_16(left);
+  __m128i sum_above = dc_sum_4(above);
+  sum_above = _mm_add_epi16(sum_left, sum_above);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 10;
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
+
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  const uint32_t pred = _mm_cvtsi128_si32(row);
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -109,7 +163,7 @@ void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 6;
-  sum /= 12;
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
 
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_8xh(&row, 4, dst, stride);
@@ -123,11 +177,37 @@ void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 12;
-  sum /= 24;
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_32(left);
+  __m128i sum_above = dc_sum_8(above);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 20;
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_4(left);
+  __m128i sum_above = dc_sum_16(above);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 10;
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   const __m128i sum_left = dc_sum_8(left);
@@ -136,7 +216,7 @@ void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 12;
-  sum /= 24;
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_16xh(&row, 8, dst, stride);
 }
@@ -149,11 +229,37 @@ void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 24;
-  sum /= 48;
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_64(left);
+  __m128i sum_above = dc_sum_16(above);
+  sum_above = _mm_add_epi16(sum_left, sum_above);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 40;
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sum_left = dc_sum_8(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 20;
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   __m128i sum_above = dc_sum_32(above);
@@ -162,11 +268,63 @@ void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 24;
-  sum /= 48;
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sum_left = dc_sum_64(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 48;
+  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i sum_left = dc_sum_64(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 64;
+  sum /= 128;
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i sum_left = dc_sum_32(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 48;
+  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i sum_left = dc_sum_16(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 40;
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // DC_TOP
 
@@ -181,7 +339,21 @@ void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   sum_above = _mm_packus_epi16(sum_above, sum_above);
 
   const uint32_t pred = _mm_cvtsi128_si32(sum_above);
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_4(above);
+  const __m128i two = _mm_set1_epi16((int16_t)2);
+  sum_above = _mm_add_epi16(sum_above, two);
+  sum_above = _mm_srai_epi16(sum_above, 2);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  sum_above = _mm_packus_epi16(sum_above, sum_above);
+
+  const uint32_t pred = _mm_cvtsi128_si32(sum_above);
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -208,6 +380,31 @@ void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_8(above);
+  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  sum_above = _mm_add_epi16(sum_above, four);
+  sum_above = _mm_srai_epi16(sum_above, 3);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_16(above);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_above = _mm_add_epi16(sum_above, eight);
+  sum_above = _mm_srai_epi16(sum_above, 4);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
@@ -235,6 +432,33 @@ void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_16(above);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_above = _mm_add_epi16(sum_above, eight);
+  sum_above = _mm_srai_epi16(sum_above, 4);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_above = _mm_add_epi16(sum_above, sixteen);
+  sum_above = _mm_srai_epi16(sum_above, 5);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -249,6 +473,62 @@ void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_above = _mm_add_epi16(sum_above, sixteen);
+  sum_above = _mm_srai_epi16(sum_above, 5);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);
+  sum_above = _mm_srai_epi16(sum_above, 6);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);
+  sum_above = _mm_srai_epi16(sum_above, 6);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);
+  sum_above = _mm_srai_epi16(sum_above, 6);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // DC_LEFT
 
@@ -263,7 +543,22 @@ void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   sum_left = _mm_packus_epi16(sum_left, sum_left);
 
   const uint32_t pred = _mm_cvtsi128_si32(sum_left);
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_16(left);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_left = _mm_add_epi16(sum_left, eight);
+  sum_left = _mm_srai_epi16(sum_left, 4);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  sum_left = _mm_packus_epi16(sum_left, sum_left);
+
+  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -291,6 +586,33 @@ void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_32(left);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_left = _mm_add_epi16(sum_left, sixteen);
+  sum_left = _mm_srai_epi16(sum_left, 5);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_4(left);
+  const __m128i two = _mm_set1_epi16((uint16_t)2);
+  sum_left = _mm_add_epi16(sum_left, two);
+  sum_left = _mm_srai_epi16(sum_left, 2);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -319,6 +641,34 @@ void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_64(left);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_left = _mm_add_epi16(sum_left, thirtytwo);
+  sum_left = _mm_srai_epi16(sum_left, 6);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_8(left);
+  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  sum_left = _mm_add_epi16(sum_left, four);
+  sum_left = _mm_srai_epi16(sum_left, 3);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
@@ -333,6 +683,62 @@ void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_64(left);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_left = _mm_add_epi16(sum_left, thirtytwo);
+  sum_left = _mm_srai_epi16(sum_left, 6);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_64(left);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_left = _mm_add_epi16(sum_left, thirtytwo);
+  sum_left = _mm_srai_epi16(sum_left, 6);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_32(left);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_left = _mm_add_epi16(sum_left, sixteen);
+  sum_left = _mm_srai_epi16(sum_left, 5);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_16(left);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_left = _mm_add_epi16(sum_left, eight);
+  sum_left = _mm_srai_epi16(sum_left, 4);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // DC_128
 
@@ -341,7 +747,15 @@ void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   (void)above;
   (void)left;
   const uint32_t pred = 0x80808080;
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const uint32_t pred = 0x80808080;
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -360,6 +774,22 @@ void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   (void)above;
@@ -377,6 +807,23 @@ void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -386,6 +833,42 @@ void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // V_PRED
 
@@ -393,7 +876,14 @@ void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   const uint32_t pred = *(uint32_t *)above;
   (void)left;
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint32_t pred = *(uint32_t *)above;
+  (void)left;
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -410,6 +900,20 @@ void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+  (void)left;
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const __m128i row = _mm_load_si128((__m128i const *)above);
+  (void)left;
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   const __m128i row = _mm_load_si128((__m128i const *)above);
@@ -424,19 +928,75 @@ void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
-void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
+  const __m128i row = _mm_load_si128((__m128i const *)above);
+  (void)left;
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, int height) {
   const __m128i row0 = _mm_load_si128((__m128i const *)above);
   const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+  for (int i = 0; i < height; ++i) {
+    _mm_store_si128((__m128i *)dst, row0);
+    _mm_store_si128((__m128i *)(dst + 16), row1);
+    dst += stride;
+  }
+}
+
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
   (void)left;
-  int i;
-  for (i = 0; i < 16; ++i) {
+  v_predictor_32xh(dst, stride, above, 8);
+}
+
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_32xh(dst, stride, above, 16);
+}
+
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_32xh(dst, stride, above, 64);
+}
+
+static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, int height) {
+  const __m128i row0 = _mm_load_si128((__m128i const *)above);
+  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+  const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
+  const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
+  for (int i = 0; i < height; ++i) {
     _mm_store_si128((__m128i *)dst, row0);
     _mm_store_si128((__m128i *)(dst + 16), row1);
+    _mm_store_si128((__m128i *)(dst + 32), row2);
+    _mm_store_si128((__m128i *)(dst + 48), row3);
     dst += stride;
   }
 }
 
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 64);
+}
+
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 32);
+}
+
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 16);
+}
+
 // -----------------------------------------------------------------------------
 // H_PRED
 
@@ -471,25 +1031,7 @@ void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
 }
 
-void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
-  left_col = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   (void)above;
   const __m128i left_col = _mm_load_si128((__m128i const *)left);
@@ -500,13 +1042,13 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
   __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
   __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
   dst += stride;
 
   left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
@@ -514,26 +1056,26 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
   row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
   row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
   dst += stride;
 
   row0 = _mm_shufflelo_epi16(left_col_high, 0);
   row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
   row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
   row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
   dst += stride;
 
   left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
@@ -541,6 +1083,24 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
   row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
   row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
+  dst += stride;
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
+  dst += stride;
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
+  dst += stride;
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
+}
+
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
+  left_col = _mm_unpacklo_epi8(left_col, left_col);
+  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
+  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
+  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
   _mm_storel_epi64((__m128i *)dst, row0);
   dst += stride;
   _mm_storel_epi64((__m128i *)dst, row1);
@@ -550,6 +1110,82 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   _mm_storel_epi64((__m128i *)dst, row3);
 }
 
+static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, const uint8_t *left,
+                                      int count) {
+  (void)above;
+  for (int i = 0; i < count; ++i) {
+    const __m128i left_col = _mm_load_si128((__m128i const *)left);
+    __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
+    __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
+
+    __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
+    __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+    __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+    __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+
+    left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
+    row0 = _mm_shufflelo_epi16(left_col_low, 0);
+    row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+    row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+    row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+
+    row0 = _mm_shufflelo_epi16(left_col_high, 0);
+    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+
+    left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
+    row0 = _mm_shufflelo_epi16(left_col_high, 0);
+    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+    left += 16;
+  }
+}
+
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  h_predictor_8x16xc(dst, stride, above, left, 1);
+}
+
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  h_predictor_8x16xc(dst, stride, above, left, 2);
+}
+
 static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
                                      ptrdiff_t stride) {
   int i;
@@ -601,6 +1237,14 @@ static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
   h_pred_store_16xh(row, 4, dst, stride);
 }
 
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+  h_prediction_16x8_1(&left_col_8p, dst, stride);
+}
+
 void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   (void)above;
@@ -611,29 +1255,38 @@ void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
   h_prediction_16x8_2(&left_col_8p, dst, stride);
 }
 
-void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  __m128i left_col, left_col_8p;
-  (void)above;
+static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *left, int count) {
   int i = 0;
-
   do {
-    left_col = _mm_load_si128((const __m128i *)left);
-    left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p, dst, stride);
+    const __m128i left_col = _mm_load_si128((const __m128i *)left);
+    const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
+    h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
     dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p, dst, stride);
+    h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
     dst += stride << 2;
 
-    left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p, dst, stride);
+    const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
+    h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
     dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p, dst, stride);
+    h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
     dst += stride << 2;
 
     left += 16;
     i++;
-  } while (i < 2);
+  } while (i < count);
+}
+
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_16xh(dst, stride, left, 2);
+}
+
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_16xh(dst, stride, left, 4);
 }
 
 static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
@@ -664,6 +1317,19 @@ static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
   h_pred_store_32xh(row, 4, dst, stride);
 }
 
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  __m128i left_col, left_col_8p;
+  (void)above;
+
+  left_col = _mm_load_si128((const __m128i *)left);
+
+  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+  h_prediction_32x8_1(&left_col_8p, dst, stride);
+  dst += stride << 2;
+  h_prediction_32x8_2(&left_col_8p, dst, stride);
+}
+
 void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   __m128i left_col, left_col_8p;
@@ -682,3 +1348,83 @@ void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dst += stride << 2;
   h_prediction_32x8_2(&left_col_8p, dst, stride);
 }
+
+static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *left, int height) {
+  int i = height >> 2;
+  do {
+    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r0);
+    _mm_store_si128((__m128i *)(dst + stride), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+    left += 4;
+    dst += stride * 4;
+  } while (--i);
+}
+
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_32xh(dst, stride, left, 64);
+}
+
+static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *left, int height) {
+  int i = height >> 2;
+  do {
+    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r0);
+    _mm_store_si128((__m128i *)(dst + 32), r0);
+    _mm_store_si128((__m128i *)(dst + 48), r0);
+    _mm_store_si128((__m128i *)(dst + stride), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
+    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
+    left += 4;
+    dst += stride * 4;
+  } while (--i);
+}
+
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_64xh(dst, stride, left, 64);
+}
+
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_64xh(dst, stride, left, 32);
+}
+
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_64xh(dst, stride, left, 16);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
index 85b82744e..807ed1770 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
@@ -11,11 +11,12 @@
 
 #include <tmmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/intrapred_common.h"
 
 // -----------------------------------------------------------------------------
-// TM_PRED
+// PAETH_PRED
 
 // Return 8 16-bit pixels in one row
 static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
@@ -82,6 +83,26 @@ void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  __m128i l = _mm_load_si128((const __m128i *)left);
+  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  __m128i rep = _mm_set1_epi16(0x8000);
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 16; ++i) {
+    const __m128i l16 = _mm_shuffle_epi8(l, rep);
+    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);
+  }
+}
+
 void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
   __m128i l = _mm_loadl_epi64((const __m128i *)left);
@@ -145,6 +166,28 @@ void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int j = 0; j < 2; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (int i = 0; i < 16; ++i) {
+      const __m128i l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
 // Return 16 8-bit pixels in one row
 static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                       const __m128i *top1,
@@ -154,6 +197,27 @@ static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
   return _mm_packus_epi16(p0, p1);
 }
 
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  const __m128i t = _mm_load_si128((const __m128i *)above);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  __m128i rep = _mm_set1_epi16(0x8000);
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 4; ++i) {
+    const __m128i l16 = _mm_shuffle_epi8(l, rep);
+    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+    _mm_store_si128((__m128i *)dst, row);
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);
+  }
+}
+
 void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   __m128i l = _mm_loadl_epi64((const __m128i *)left);
@@ -234,6 +298,57 @@ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i t = _mm_load_si128((const __m128i *)above);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int j = 0; j < 4; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (int i = 0; i < 16; ++i) {
+      const __m128i l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+      _mm_store_si128((__m128i *)dst, row);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  __m128i rep = _mm_set1_epi16(0x8000);
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
+  __m128i l16;
+
+  for (int i = 0; i < 8; ++i) {
+    l16 = _mm_shuffle_epi8(l, rep);
+    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+    _mm_store_si128((__m128i *)dst, r32l);
+    _mm_store_si128((__m128i *)(dst + 16), r32h);
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);
+  }
+}
+
 void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -307,6 +422,162 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+      _mm_store_si128((__m128i *)dst, r32l);
+      _mm_store_si128((__m128i *)(dst + 16), r32h);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i, j;
+  for (j = 0; j < 2; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i;
+  const __m128i l = _mm_load_si128((const __m128i *)left);
+  __m128i rep = _mm_set1_epi16(0x8000);
+  for (i = 0; i < 16; ++i) {
+    l16 = _mm_shuffle_epi8(l, rep);
+    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r1);
+    _mm_store_si128((__m128i *)(dst + 32), r2);
+    _mm_store_si128((__m128i *)(dst + 48), r3);
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);
+  }
+}
+
 // -----------------------------------------------------------------------------
 // SMOOTH_PRED
 
@@ -315,9 +586,15 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
 // pixels[2]: right_pred vector
 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                  int height, __m128i *pixels) {
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
+  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  if (height == 4)
+    pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  else if (height == 8)
+    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
+  else
+    pixels[1] = _mm_loadu_si128(((const __m128i *)left));
+
   pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
-  pixels[1] = _mm_loadl_epi64((const __m128i *)left);
 
   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
   const __m128i zero = _mm_setzero_si128();
@@ -325,45 +602,52 @@ static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
   pixels[0] = _mm_unpacklo_epi16(d, bp);
 }
 
-// weights[0]: weights_h vector
-// weights[1]: scale - weights_h vecotr
-// weights[2]: weights_w and scale - weights_w interleave vector
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
 static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
-                                  __m128i *weights) {
-  __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
+                                  __m128i *weight_h, __m128i *weight_w) {
   const __m128i zero = _mm_setzero_si128();
-
-  weights[0] = _mm_unpacklo_epi8(t, zero);
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  weights[1] = _mm_sub_epi16(d, weights[0]);
-  weights[2] = _mm_unpacklo_epi16(weights[0], weights[1]);
+  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
+  weight_h[0] = _mm_unpacklo_epi8(t, zero);
+  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
 
   if (height == 8) {
-    t = _mm_srli_si128(t, 4);
-    weights[0] = _mm_unpacklo_epi8(t, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
+    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+  } else if (height == 16) {
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
   }
 }
 
-static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *weight,
-                                   int h, uint8_t *dst, ptrdiff_t stride) {
+static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
+                                   const __m128i *ww, int h, uint8_t *dst,
+                                   ptrdiff_t stride, int second_half) {
   const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
   __m128i d = _mm_set1_epi16(0x100);
 
-  int i;
-  for (i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
+  for (int i = 0; i < h; ++i) {
+    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
 
     __m128i b = _mm_shuffle_epi8(pixel[1], rep);
     b = _mm_unpacklo_epi16(b, pixel[2]);
-    __m128i sum = _mm_madd_epi16(b, weight[2]);
+    __m128i sum = _mm_madd_epi16(b, ww[0]);
 
     sum = _mm_add_epi32(s, sum);
     sum = _mm_add_epi32(sum, round);
@@ -383,10 +667,10 @@ void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
   __m128i pixels[3];
   load_pixel_w4(above, left, 4, pixels);
 
-  __m128i weights[3];
-  load_weight_w4(sm_weight_arrays, 4, weights);
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 4, wh, ww);
 
-  smooth_pred_4xh(pixels, weights, 4, dst, stride);
+  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
 }
 
 void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
@@ -394,33 +678,68 @@ void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   __m128i pixels[3];
   load_pixel_w4(above, left, 8, pixels);
 
-  __m128i weights[3];
-  load_weight_w4(sm_weight_arrays, 8, weights);
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 8, wh, ww);
+
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m128i pixels[3];
+  load_pixel_w4(above, left, 16, pixels);
+
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 16, wh, ww);
 
-  smooth_pred_4xh(pixels, weights, 8, dst, stride);
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
 }
 
 // pixels[0]: above and below_pred interleave vector, first half
 // pixels[1]: above and below_pred interleave vector, second half
 // pixels[2]: left vector
 // pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
 static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                  int height, __m128i *pixels) {
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
-  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
-  pixels[2] = _mm_load_si128((const __m128i *)left);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
   const __m128i zero = _mm_setzero_si128();
-
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  __m128i d = _mm_loadl_epi64((const __m128i *)above);
   d = _mm_unpacklo_epi8(d, zero);
   pixels[0] = _mm_unpacklo_epi16(d, bp);
   pixels[1] = _mm_unpackhi_epi16(d, bp);
+
+  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
+
+  if (height == 4) {
+    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  } else if (height == 8) {
+    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
+  } else if (height == 16) {
+    pixels[2] = _mm_load_si128((const __m128i *)left);
+  } else {
+    pixels[2] = _mm_load_si128((const __m128i *)left);
+    pixels[4] = pixels[0];
+    pixels[5] = pixels[1];
+    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
+    pixels[7] = pixels[3];
+  }
 }
 
 // weight_h[0]: weight_h vector
 // weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
 static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
@@ -429,7 +748,6 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
   const int we_offset = height < 8 ? 4 : 8;
   __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
   weight_h[0] = _mm_unpacklo_epi8(we, zero);
-
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
 
@@ -450,6 +768,19 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     weight_h[2] = _mm_unpackhi_epi8(we, zero);
     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+  } else if (height == 32) {
+    const __m128i weight_lo =
+        _mm_loadu_si128((const __m128i *)&weight_array[32]);
+    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+    const __m128i weight_hi =
+        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
+    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
+    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
   }
 }
 
@@ -531,355 +862,831 @@ void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
 }
 
-// pixels[0]: above and below_pred interleave vector, 1/4
-// pixels[1]: above and below_pred interleave vector, 2/4
-// pixels[2]: above and below_pred interleave vector, 3/4
-// pixels[3]: above and below_pred interleave vector, 3/4
-// pixels[4]: left vector
-// pixels[5]: left vector, h = 32 only
-// pixels[6]: right_pred vector
-static INLINE void load_pixel_w16(const uint8_t *above, const uint8_t *left,
-                                  int height, __m128i *pixels) {
-  __m128i ab = _mm_load_si128((const __m128i *)above);
-  pixels[6] = _mm_set1_epi16((uint16_t)above[15]);
-  pixels[4] = _mm_load_si128((const __m128i *)left);
-  pixels[5] = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m128i pixels[8];
+  load_pixel_w8(above, left, 32, pixels);
+
+  __m128i wh[8], ww[2];
+  load_weight_w8(sm_weight_arrays, 32, wh, ww);
+
+  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left, uint32_t bw,
+                                        uint32_t bh) {
+  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
   const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const __m128i top_right =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+    const __m128i wl_y =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
+    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
+      const __m128i weights_x =
+          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
+      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
+      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
+      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
+
+      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
+      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
+
+      const __m128i scale_m_weights_x =
+          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
+      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
+      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
+      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
+
+      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
+      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
+
+      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
+      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
+
+      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
+      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
+
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);
+    }
+    dst += stride;
+  }
+}
+
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
+}
 
-  __m128i x = _mm_unpacklo_epi8(ab, zero);
-  pixels[0] = _mm_unpacklo_epi16(x, bp);
-  pixels[1] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
+}
 
-  x = _mm_unpackhi_epi8(ab, zero);
-  pixels[2] = _mm_unpacklo_epi16(x, bp);
-  pixels[3] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
 }
 
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
-// ... ...
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-// ... ...
-static INLINE void load_weight_w16(const uint8_t *weight_array, int height,
-                                   __m128i *weight_h, __m128i *weight_w) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i w8 = _mm_loadu_si128((const __m128i *)&weight_array[8]);
-  __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-  __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
-  __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
+}
 
-  if (height == 8) {
-    weight_h[0] = _mm_unpacklo_epi8(w8, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);  // scale - weight_h
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
+}
 
-    __m128i x = _mm_unpacklo_epi8(w16, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
-    x = _mm_unpackhi_epi8(w16, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
+// pixels[0]: above and below_pred interleave vector
+static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  d = _mm_unpacklo_epi8(d, zero);
+  pixels[0] = _mm_unpacklo_epi16(d, bp);
+}
+
+// weights[0]: weights_h vector
+// weights[1]: scale - weights_h vector
+static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
+                                    __m128i *weights) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+
+  if (height == 4) {
+    const __m128i weight =
+        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
+    weights[0] = _mm_unpacklo_epi8(weight, zero);
+    weights[1] = _mm_sub_epi16(d, weights[0]);
+  } else if (height == 8) {
+    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
+    weights[0] = _mm_unpacklo_epi8(weight, zero);
+    weights[1] = _mm_sub_epi16(d, weights[0]);
+  } else {
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    weights[0] = _mm_unpacklo_epi8(weight, zero);
+    weights[1] = _mm_sub_epi16(d, weights[0]);
+    weights[2] = _mm_unpackhi_epi8(weight, zero);
+    weights[3] = _mm_sub_epi16(d, weights[2]);
   }
+}
 
-  if (height == 16) {
-    weight_h[0] = _mm_unpacklo_epi8(w16, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w16, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
+                                     const __m128i *weight, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i inc = _mm_set1_epi16(0x202);
+  const __m128i gat = _mm_set1_epi32(0xc080400);
+  __m128i d = _mm_set1_epi16(0x100);
 
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-    weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
-    weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
+  for (int i = 0; i < h; ++i) {
+    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
+    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
+    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
+    sum = _mm_shuffle_epi8(sum, gat);
+    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+    dst += stride;
+    d = _mm_add_epi16(d, inc);
   }
+}
 
-  if (height == 32) {
-    weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 4, &pixels);
+
+  __m128i weights[2];
+  load_weight_v_w4(sm_weight_arrays, 4, weights);
+
+  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 8, &pixels);
 
-    __m128i x = _mm_unpacklo_epi8(w16, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
-    x = _mm_unpackhi_epi8(w16, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+  __m128i weights[2];
+  load_weight_v_w4(sm_weight_arrays, 8, weights);
 
-    weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
+  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 16, &pixels);
+
+  __m128i weights[4];
+  load_weight_v_w4(sm_weight_arrays, 16, weights);
+
+  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+  dst += stride << 3;
+  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d = _mm_loadl_epi64((const __m128i *)above);
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  d = _mm_unpacklo_epi8(d, zero);
+  pixels[0] = _mm_unpacklo_epi16(d, bp);
+  pixels[1] = _mm_unpackhi_epi16(d, bp);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
+                                    __m128i *weight_h) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+
+  if (height < 16) {
+    const int offset = height < 8 ? 4 : 8;
+    const __m128i weight =
+        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+  } else if (height == 16) {
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+  } else {
+    const __m128i weight_lo =
+        _mm_loadu_si128((const __m128i *)&weight_array[32]);
+    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+    const __m128i weight_hi =
+        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
     weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
+    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
     weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
   }
 }
 
-static INLINE void smooth_pred_16x8(const __m128i *pixels, const __m128i *wh,
-                                    const __m128i *ww, uint8_t *dst,
-                                    ptrdiff_t stride, int quarter) {
-  __m128i d = _mm_set1_epi16(0x100);
-  const __m128i one = _mm_set1_epi16(1);
+static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
+                                     int h, uint8_t *dst, ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  __m128i rep =
-      (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
-  const __m128i left = (quarter < 2) ? pixels[4] : pixels[5];
+  __m128i d = _mm_set1_epi16(0x100);
 
-  int i;
-  for (i = 0; i < 8; ++i) {
+  for (int i = 0; i < h; ++i) {
     const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
     const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
     __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-    __m128i s2 = _mm_madd_epi16(pixels[2], wh_sc);
-    __m128i s3 = _mm_madd_epi16(pixels[3], wh_sc);
 
-    __m128i b = _mm_shuffle_epi8(left, rep);
-    b = _mm_unpacklo_epi16(b, pixels[6]);
-    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
-    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-    __m128i sum2 = _mm_madd_epi16(b, ww[2]);
-    __m128i sum3 = _mm_madd_epi16(b, ww[3]);
+    s0 = _mm_add_epi32(s0, pred_round);
+    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
 
-    s0 = _mm_add_epi32(s0, sum0);
-    s0 = _mm_add_epi32(s0, round);
-    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
-
-    s1 = _mm_add_epi32(s1, sum1);
-    s1 = _mm_add_epi32(s1, round);
-    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
-
-    s2 = _mm_add_epi32(s2, sum2);
-    s2 = _mm_add_epi32(s2, round);
-    s2 = _mm_srai_epi32(s2, 1 + sm_weight_log2_scale);
-
-    s3 = _mm_add_epi32(s3, sum3);
-    s3 = _mm_add_epi32(s3, round);
-    s3 = _mm_srai_epi32(s3, 1 + sm_weight_log2_scale);
-
-    sum0 = _mm_packus_epi16(s0, s1);
-    sum0 = _mm_shuffle_epi8(sum0, gat);
-    sum1 = _mm_packus_epi16(s2, s3);
-    sum1 = _mm_shuffle_epi8(sum1, gat);
-
-    _mm_storel_epi64((__m128i *)dst, sum0);
-    _mm_storel_epi64((__m128i *)(dst + 8), sum1);
+    s1 = _mm_add_epi32(s1, pred_round);
+    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
 
+    __m128i sum01 = _mm_packus_epi16(s0, s1);
+    sum01 = _mm_shuffle_epi8(sum01, gat);
+    _mm_storel_epi64((__m128i *)dst, sum01);
     dst += stride;
-    rep = _mm_add_epi16(rep, one);
+
     d = _mm_add_epi16(d, inc);
   }
 }
 
-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 8, pixels);
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 4, pixels);
 
-  __m128i wh[2], ww[4];
-  load_weight_w16(sm_weight_arrays, 8, wh, ww);
+  __m128i wh[2];
+  load_weight_v_w8(sm_weight_arrays, 4, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
 }
 
-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 16, pixels);
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 8, pixels);
 
-  __m128i wh[4], ww[4];
-  load_weight_w16(sm_weight_arrays, 16, wh, ww);
+  __m128i wh[2];
+  load_weight_v_w8(sm_weight_arrays, 8, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 16, pixels);
+
+  __m128i wh[4];
+  load_weight_v_w8(sm_weight_arrays, 16, wh);
+
+  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
 }
 
-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 32, pixels);
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 32, pixels);
 
-  __m128i wh[8], ww[4];
-  load_weight_w16(sm_weight_arrays, 32, wh, ww);
+  __m128i wh[8];
+  load_weight_v_w8(sm_weight_arrays, 32, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[4], ww, dst, stride, 2);
+  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3);
+  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
 }
 
-static INLINE void load_pixel_w32(const uint8_t *above, const uint8_t *left,
-                                  int height, __m128i *pixels) {
-  __m128i ab0 = _mm_load_si128((const __m128i *)above);
-  __m128i ab1 = _mm_load_si128((const __m128i *)(above + 16));
+static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *above,
+                                          const uint8_t *left, uint32_t bw,
+                                          uint32_t bh) {
+  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const __m128i bottom_left =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+  const __m128i round =
+      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+    const __m128i scale_m_weights_y =
+        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
+    const __m128i wl_y =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
+      // 8 -> 16
+      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
+      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
+      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
+      // top_x * weights_y + scale_m_weights_y * bottom_left
+      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
+      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
+
+      pred_lo = _mm_add_epi32(pred_lo, round);
+      pred_hi = _mm_add_epi32(pred_hi, round);
+      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
+      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);
+    }
+    dst += stride;
+  }
+}
 
-  pixels[10] = _mm_set1_epi16((uint16_t)above[31]);
-  pixels[8] = _mm_load_si128((const __m128i *)left);
-  pixels[9] = _mm_load_si128((const __m128i *)(left + 16));
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
+}
 
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  const __m128i zero = _mm_setzero_si128();
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
+}
 
-  __m128i x = _mm_unpacklo_epi8(ab0, zero);
-  pixels[0] = _mm_unpacklo_epi16(x, bp);
-  pixels[1] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
+}
 
-  x = _mm_unpackhi_epi8(ab0, zero);
-  pixels[2] = _mm_unpacklo_epi16(x, bp);
-  pixels[3] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
+}
 
-  x = _mm_unpacklo_epi8(ab1, zero);
-  pixels[4] = _mm_unpacklo_epi16(x, bp);
-  pixels[5] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
+}
 
-  x = _mm_unpackhi_epi8(ab1, zero);
-  pixels[6] = _mm_unpacklo_epi16(x, bp);
-  pixels[7] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
 }
 
-static INLINE void load_weight_w32(const uint8_t *weight_array, int height,
-                                   __m128i *weight_h, __m128i *weight_w) {
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  if (height == 4)
+    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  else if (height == 8)
+    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
+  else
+    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
+  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
+}
+
+// weights[0]: weights_w and scale - weights_w interleave vector
+static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
+                                    __m128i *weights) {
+  (void)height;
+  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
   const __m128i zero = _mm_setzero_si128();
-  __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-  __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
-  __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+
+  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
+  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
+}
 
-  if (height == 16) {
-    weight_h[0] = _mm_unpacklo_epi8(w16, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w16, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
+                                     const __m128i *weight, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i gat = _mm_set1_epi32(0xc080400);
+  __m128i rep = _mm_set1_epi16(0x8000);
 
-    __m128i x = _mm_unpacklo_epi8(w32_0, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
+  for (int i = 0; i < h; ++i) {
+    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
+    b = _mm_unpacklo_epi16(b, pixel[1]);
+    __m128i sum = _mm_madd_epi16(b, weight[0]);
 
-    x = _mm_unpackhi_epi8(w32_0, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
 
-    x = _mm_unpacklo_epi8(w32_1, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[4] = _mm_unpacklo_epi16(x, y);
-    weight_w[5] = _mm_unpackhi_epi16(x, y);
+    sum = _mm_shuffle_epi8(sum, gat);
+    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+    dst += stride;
 
-    x = _mm_unpackhi_epi8(w32_1, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[6] = _mm_unpacklo_epi16(x, y);
-    weight_w[7] = _mm_unpackhi_epi16(x, y);
+    rep = _mm_add_epi16(rep, one);
   }
+}
 
-  if (height == 32) {
-    weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 4, pixels);
 
-    weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
-    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
-    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 4, &weights);
 
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-    weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
-    weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
+  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
+}
 
-    weight_w[4] = _mm_unpacklo_epi16(weight_h[4], weight_h[5]);
-    weight_w[5] = _mm_unpackhi_epi16(weight_h[4], weight_h[5]);
-    weight_w[6] = _mm_unpacklo_epi16(weight_h[6], weight_h[7]);
-    weight_w[7] = _mm_unpackhi_epi16(weight_h[6], weight_h[7]);
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 8, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 16, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+  dst += stride << 3;
+
+  pixels[0] = _mm_srli_si128(pixels[0], 8);
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+// pixels[2]: left vector + 16
+// pixels[3]: right_pred vector
+static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
+
+  if (height == 4) {
+    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  } else if (height == 8) {
+    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
+  } else if (height == 16) {
+    pixels[0] = _mm_load_si128((const __m128i *)left);
+  } else {
+    pixels[0] = _mm_load_si128((const __m128i *)left);
+    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
+    pixels[3] = pixels[1];
   }
 }
 
-static INLINE void smooth_pred_32x8(const __m128i *pixels, const __m128i *wh,
-                                    const __m128i *ww, uint8_t *dst,
-                                    ptrdiff_t stride, int quarter) {
-  __m128i d = _mm_set1_epi16(0x100);
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
+                                    __m128i *weight_w) {
+  (void)height;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
+  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
+  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
+  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
+  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
+}
+
+static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
+                                     int h, uint8_t *dst, ptrdiff_t stride,
+                                     int second_half) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
   const __m128i one = _mm_set1_epi16(1);
-  const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  __m128i rep =
-      (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
-  const __m128i left = (quarter < 2) ? pixels[8] : pixels[9];
+  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
 
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+  for (int i = 0; i < h; ++i) {
+    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
+    b = _mm_unpacklo_epi16(b, pixels[1]);
+    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
+    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
 
-    int j;
-    __m128i s[8];
-    __m128i b = _mm_shuffle_epi8(left, rep);
-    b = _mm_unpacklo_epi16(b, pixels[10]);
+    sum0 = _mm_add_epi32(sum0, pred_round);
+    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
 
-    for (j = 0; j < 8; ++j) {
-      s[j] = _mm_madd_epi16(pixels[j], wh_sc);
-      s[j] = _mm_add_epi32(s[j], _mm_madd_epi16(b, ww[j]));
-      s[j] = _mm_add_epi32(s[j], round);
-      s[j] = _mm_srai_epi32(s[j], 1 + sm_weight_log2_scale);
-    }
+    sum1 = _mm_add_epi32(sum1, pred_round);
+    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
 
-    for (j = 0; j < 8; j += 2) {
-      __m128i sum = _mm_packus_epi16(s[j], s[j + 1]);
-      sum = _mm_shuffle_epi8(sum, gat);
-      _mm_storel_epi64((__m128i *)(dst + (j << 2)), sum);
-    }
+    sum0 = _mm_packus_epi16(sum0, sum1);
+    sum0 = _mm_shuffle_epi8(sum0, gat);
+    _mm_storel_epi64((__m128i *)dst, sum0);
     dst += stride;
+
     rep = _mm_add_epi16(rep, one);
-    d = _mm_add_epi16(d, inc);
   }
 }
 
-void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[11];
-  load_pixel_w32(above, left, 16, pixels);
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 4, pixels);
 
-  __m128i wh[4], ww[8];
-  load_weight_w32(sm_weight_arrays, 16, wh, ww);
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 4, ww);
 
-  smooth_pred_32x8(pixels, wh, ww, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
 }
 
-void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[11];
-  load_pixel_w32(above, left, 32, pixels);
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 8, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 8, ww);
+
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 16, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 16, ww);
+
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
+}
 
-  __m128i wh[8], ww[8];
-  load_weight_w32(sm_weight_arrays, 32, wh, ww);
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[4];
+  load_pixel_h_w8(above, left, 32, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 32, ww);
 
-  smooth_pred_32x8(pixels, &wh[0], ww, dst, stride, 0);
+  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[4], ww, dst, stride, 2);
+  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[6], ww, dst, stride, 3);
+  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *above,
+                                          const uint8_t *left, uint32_t bw,
+                                          uint32_t bh) {
+  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+    const __m128i tr_ly =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i weights_x =
+          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
+      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
+      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
+      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
+      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
+      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
+      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
+
+      pred_lo = _mm_add_epi32(pred_lo, pred_round);
+      pred_hi = _mm_add_epi32(pred_hi, pred_round);
+
+      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
+      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);
+    }
+    dst += stride;
+  }
+}
+
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
 }
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm
deleted file mode 100644
index bc1bb2ff3..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm
+++ /dev/null
@@ -1,410 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-
-pb_1: times 16 db 1
-sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
-sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
-sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
-sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
-sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-SECTION .text
-
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
-  pavgb               %4, %1, %3
-  pxor                %3, %1
-  pand                %3, [GLOBAL(pb_1)]
-  psubb               %4, %3
-  pavgb               %4, %2
-%endmacro
-
-INIT_XMM ssse3
-cglobal d63e_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
-  GET_GOT     goffsetq
-
-  movq                m3, [aboveq]
-  pshufb              m1, m3, [GLOBAL(sh_b23456777)]
-  pshufb              m2, m3, [GLOBAL(sh_b12345677)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
-  pavgb               m3, m2
-
-  ; store 4 lines
-  movd    [dstq        ], m3
-  movd    [dstq+strideq], m4
-  lea               dstq, [dstq+strideq*2]
-  psrldq              m3, 1
-  psrldq              m4, 1
-  movd    [dstq        ], m3
-  movd    [dstq+strideq], m4
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  movd                m0, [leftq]               ; l1, l2, l3, l4
-  movd                m1, [aboveq-1]            ; tl, t1, t2, t3
-  punpckldq           m0, m1                    ; l1, l2, l3, l4, tl, t1, t2, t3
-  pshufb              m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
-  psrldq              m1, m0, 1                 ; l3, l2, l1, tl, t1, t2, t3
-  psrldq              m2, m0, 2                 ; l2, l1, tl, t1, t2, t3
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1
-  ; A2 B2 A1 B1
-  ; A3 B3 A2 B2
-  ; A4 B4 A3 B3
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; 3-tap avg B4 B3 B2 B1 C1 D1
-  pavgb               m1, m0                    ; 2-tap avg A4 A3 A2 A1
-
-  punpcklqdq          m3, m1                    ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
-
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-  pshufb              m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
-  movd  [dstq+stride3q ], m3
-  psrldq              m3, 2                     ; A3 B3 A2 B2 A1 B1 C1 D1 ..
-  movd  [dstq+strideq*2], m3
-  psrldq              m3, 2                     ; A2 B2 A1 B1 C1 D1 ..
-  movd  [dstq+strideq  ], m3
-  psrldq              m3, 2                     ; A1 B1 C1 D1 ..
-  movd  [dstq          ], m3
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  movq                m0, [leftq]                     ; [0- 7] l1-8 [byte]
-  movhps              m0, [aboveq-1]                  ; [8-15] tl, t1-7 [byte]
-  pshufb              m1, m0, [GLOBAL(sh_b76543210)]  ; l8-1 [word]
-  pshufb              m2, m0, [GLOBAL(sh_b65432108)]  ; l7-1,tl [word]
-  pshufb              m3, m0, [GLOBAL(sh_b54321089)]  ; l6-1,tl,t1 [word]
-  pshufb              m0, [GLOBAL(sh_b89abcdef)]      ; tl,t1-7 [word]
-  psrldq              m4, m0, 1                       ; t1-7 [word]
-  psrldq              m5, m0, 2                       ; t2-7 [word]
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1 E1 F1 G1 H1
-  ; A2 B2 A1 B1 C1 D1 E1 F1
-  ; A3 B3 A2 B2 A1 B1 C1 D1
-  ; A4 B4 A3 B3 A2 B2 A1 B1
-  ; A5 B5 A4 B4 A3 B3 A2 B2
-  ; A6 B6 A5 B5 A4 B4 A3 B3
-  ; A7 B7 A6 B6 A5 B5 A4 B4
-  ; A8 B8 A7 B7 A6 B6 A5 B5
-  pavgb               m6, m1, m2                ; 2-tap avg A8-A1
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; 3-tap avg C-H1
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; 3-tap avg B8-1
-
-  punpcklbw           m6, m0                    ; A-B8, A-B7 ... A-B2, A-B1
-
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-
-  movhps [dstq+stride3q], m6                    ; A-B4, A-B3, A-B2, A-B1
-  palignr             m0, m7, m6, 10            ; A-B3, A-B2, A-B1, C-H1
-  movq  [dstq+strideq*2], m0
-  psrldq              m0, 2                     ; A-B2, A-B1, C-H1
-  movq  [dstq+strideq  ], m0
-  psrldq              m0, 2                     ; A-H1
-  movq  [dstq          ], m0
-  lea               dstq, [dstq+strideq*4]
-  movq  [dstq+stride3q ], m6                    ; A-B8, A-B7, A-B6, A-B5
-  psrldq              m6, 2                     ; A-B7, A-B6, A-B5, A-B4
-  movq  [dstq+strideq*2], m6
-  psrldq              m6, 2                     ; A-B6, A-B5, A-B4, A-B3
-  movq  [dstq+strideq  ], m6
-  psrldq              m6, 2                     ; A-B5, A-B4, A-B3, A-B2
-  movq  [dstq          ], m6
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  mova                m0, [leftq]
-  movu                m7, [aboveq-1]
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
-  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
-  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
-  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
-  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
-  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
-  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
-  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
-  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
-  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
-  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
-  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
-  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
-  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
-  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
-  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
-  pshufb              m6, m7, [GLOBAL(sh_bfedcba9876543210)]
-  palignr             m5, m0, m6, 15
-  palignr             m3, m0, m6, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
-  pshufb              m1, m0, [GLOBAL(sh_b123456789abcdeff)]
-  pavgb               m5, m0                            ; A1 - Ag
-
-  punpcklbw           m0, m4, m5                        ; A-B8 ... A-B1
-  punpckhbw           m4, m5                            ; A-B9 ... A-Bg
-
-  pshufb              m3, m7, [GLOBAL(sh_b123456789abcdeff)]
-  pshufb              m5, m7, [GLOBAL(sh_b23456789abcdefff)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg C1-P1
-
-  pshufb              m6, m0, [GLOBAL(sh_bfedcba9876543210)]
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-  palignr             m2, m1, m6, 14
-  mova  [dstq          ], m2
-  palignr             m2, m1, m6, 12
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m1, m6, 10
-  mova  [dstq+strideq*2], m2
-  palignr             m2, m1, m6, 8
-  mova  [dstq+stride3q ], m2
-  lea               dstq, [dstq+strideq*4]
-  palignr             m2, m1, m6, 6
-  mova  [dstq          ], m2
-  palignr             m2, m1, m6, 4
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m1, m6, 2
-  mova  [dstq+strideq*2], m2
-  pshufb              m4, [GLOBAL(sh_bfedcba9876543210)]
-  mova  [dstq+stride3q ], m6
-  lea               dstq, [dstq+strideq*4]
-
-  palignr             m2, m6, m4, 14
-  mova  [dstq          ], m2
-  palignr             m2, m6, m4, 12
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m6, m4, 10
-  mova  [dstq+strideq*2], m2
-  palignr             m2, m6, m4, 8
-  mova  [dstq+stride3q ], m2
-  lea               dstq, [dstq+strideq*4]
-  palignr             m2, m6, m4, 6
-  mova  [dstq          ], m2
-  palignr             m2, m6, m4, 4
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m6, m4, 2
-  mova  [dstq+strideq*2], m2
-  mova  [dstq+stride3q ], m4
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  mova                  m0, [leftq]
-  movu                  m7, [aboveq-1]
-  movu                  m1, [aboveq+15]
-
-  pshufb                m4, m1, [GLOBAL(sh_b123456789abcdeff)]
-  pshufb                m6, m1, [GLOBAL(sh_b23456789abcdefff)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2          ; 3-tap avg above [high]
-
-  palignr               m3, m1, m7, 1
-  palignr               m5, m1, m7, 2
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg above [low]
-
-  pshufb                m7, [GLOBAL(sh_bfedcba9876543210)]
-  palignr               m5, m0, m7, 15
-  palignr               m3, m0, m7, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
-  pavgb                 m5, m0                            ; A1 - Ag
-  punpcklbw             m6, m4, m5                        ; A-B8 ... A-B1
-  punpckhbw             m4, m5                            ; A-B9 ... A-Bg
-  pshufb                m6, [GLOBAL(sh_bfedcba9876543210)]
-  pshufb                m4, [GLOBAL(sh_bfedcba9876543210)]
-
-  DEFINE_ARGS dst, stride, stride3, left, line
-  lea             stride3q, [strideq*3]
-
-  palignr               m5, m2, m1, 14
-  palignr               m7, m1, m6, 14
-  mova  [dstq            ], m7
-  mova  [dstq+16         ], m5
-  palignr               m5, m2, m1, 12
-  palignr               m7, m1, m6, 12
-  mova  [dstq+strideq    ], m7
-  mova  [dstq+strideq+16 ], m5
-  palignr                m5, m2, m1, 10
-  palignr                m7, m1, m6, 10
-  mova  [dstq+strideq*2   ], m7
-  mova  [dstq+strideq*2+16], m5
-  palignr                m5, m2, m1, 8
-  palignr                m7, m1, m6, 8
-  mova  [dstq+stride3q    ], m7
-  mova  [dstq+stride3q+16 ], m5
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m5, m2, m1, 6
-  palignr                m7, m1, m6, 6
-  mova  [dstq             ], m7
-  mova  [dstq+16          ], m5
-  palignr                m5, m2, m1, 4
-  palignr                m7, m1, m6, 4
-  mova  [dstq+strideq     ], m7
-  mova  [dstq+strideq+16  ], m5
-  palignr                m5, m2, m1, 2
-  palignr                m7, m1, m6, 2
-  mova  [dstq+strideq*2   ], m7
-  mova  [dstq+strideq*2+16], m5
-  mova  [dstq+stride3q    ], m6
-  mova  [dstq+stride3q+16 ], m1
-  lea                  dstq, [dstq+strideq*4]
-
-  palignr                m5, m1, m6, 14
-  palignr                m3, m6, m4, 14
-  mova  [dstq             ], m3
-  mova  [dstq+16          ], m5
-  palignr                m5, m1, m6, 12
-  palignr                m3, m6, m4, 12
-  mova  [dstq+strideq     ], m3
-  mova  [dstq+strideq+16  ], m5
-  palignr                m5, m1, m6, 10
-  palignr                m3, m6, m4, 10
-  mova  [dstq+strideq*2   ], m3
-  mova  [dstq+strideq*2+16], m5
-  palignr                m5, m1, m6, 8
-  palignr                m3, m6, m4, 8
-  mova  [dstq+stride3q    ], m3
-  mova  [dstq+stride3q+16 ], m5
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m5, m1, m6, 6
-  palignr                m3, m6, m4, 6
-  mova  [dstq             ], m3
-  mova  [dstq+16          ], m5
-  palignr                m5, m1, m6, 4
-  palignr                m3, m6, m4, 4
-  mova  [dstq+strideq     ], m3
-  mova  [dstq+strideq+16  ], m5
-  palignr                m5, m1, m6, 2
-  palignr                m3, m6, m4, 2
-  mova  [dstq+strideq*2   ], m3
-  mova  [dstq+strideq*2+16], m5
-  mova  [dstq+stride3q    ], m4
-  mova  [dstq+stride3q+16 ], m6
-  lea               dstq, [dstq+strideq*4]
-
-  mova                   m7, [leftq]
-  mova                   m3, [leftq+16]
-  palignr                m5, m3, m7, 15
-  palignr                m0, m3, m7, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2          ; 3-tap avg Bh -
-  pavgb                  m5, m3                            ; Ah -
-  punpcklbw              m3, m2, m5                        ; A-B8 ... A-B1
-  punpckhbw              m2, m5                            ; A-B9 ... A-Bg
-  pshufb                 m3, [GLOBAL(sh_bfedcba9876543210)]
-  pshufb                 m2, [GLOBAL(sh_bfedcba9876543210)]
-
-  palignr                m7, m6, m4, 14
-  palignr                m0, m4, m3, 14
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m6, m4, 12
-  palignr                m0, m4, m3, 12
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m6, m4, 10
-  palignr                m0, m4, m3, 10
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  palignr                m7, m6, m4, 8
-  palignr                m0, m4, m3, 8
-  mova  [dstq+stride3q    ], m0
-  mova  [dstq+stride3q+16 ], m7
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m7, m6, m4, 6
-  palignr                m0, m4, m3, 6
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m6, m4, 4
-  palignr                m0, m4, m3, 4
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m6, m4, 2
-  palignr                m0, m4, m3, 2
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  mova  [dstq+stride3q    ], m3
-  mova  [dstq+stride3q+16 ], m4
-  lea                  dstq, [dstq+strideq*4]
-
-  palignr                m7, m4, m3, 14
-  palignr                m0, m3, m2, 14
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m4, m3, 12
-  palignr                m0, m3, m2, 12
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m4, m3, 10
-  palignr                m0, m3, m2, 10
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  palignr                m7, m4, m3, 8
-  palignr                m0, m3, m2, 8
-  mova  [dstq+stride3q    ], m0
-  mova  [dstq+stride3q+16 ], m7
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m7, m4, m3, 6
-  palignr                m0, m3, m2, 6
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m4, m3, 4
-  palignr                m0, m3, m2, 4
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m4, m3, 2
-  palignr                m0, m3, m2, 2
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  mova  [dstq+stride3q    ], m2
-  mova  [dstq+stride3q+16 ], m3
-
-  RESTORE_GOT
-  RET
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c b/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c
deleted file mode 100644
index a9d6a127c..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c
+++ /dev/null
@@ -1,1238 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_dsp/x86/inv_txfm_common_avx2.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-void aom_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  __m256i in[16];
-  load_buffer_16x16(input, in);
-  mm256_transpose_16x16(in, in);
-  av1_idct16_avx2(in);
-  mm256_transpose_16x16(in, in);
-  av1_idct16_avx2(in);
-  store_buffer_16xN(in, stride, dest, 16);
-}
-
-static INLINE void transpose_col_to_row_nz4x4(__m256i *in /*in[4]*/) {
-  const __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]);
-  const __m256i u1 = _mm256_unpacklo_epi16(in[2], in[3]);
-  const __m256i v0 = _mm256_unpacklo_epi32(u0, u1);
-  const __m256i v1 = _mm256_unpackhi_epi32(u0, u1);
-  in[0] = _mm256_permute4x64_epi64(v0, 0xA8);
-  in[1] = _mm256_permute4x64_epi64(v0, 0xA9);
-  in[2] = _mm256_permute4x64_epi64(v1, 0xA8);
-  in[3] = _mm256_permute4x64_epi64(v1, 0xA9);
-}
-
-#define MM256_SHUFFLE_EPI64(x0, x1, imm8)                        \
-  _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(x0), \
-                                        _mm256_castsi256_pd(x1), imm8))
-
-static INLINE void transpose_col_to_row_nz4x16(__m256i *in /*in[16]*/) {
-  int i;
-  for (i = 0; i < 16; i += 4) {
-    transpose_col_to_row_nz4x4(&in[i]);
-  }
-
-  for (i = 0; i < 4; ++i) {
-    in[i] = MM256_SHUFFLE_EPI64(in[i], in[i + 4], 0);
-    in[i + 8] = MM256_SHUFFLE_EPI64(in[i + 8], in[i + 12], 0);
-  }
-
-  for (i = 0; i < 4; ++i) {
-    in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20);
-  }
-}
-
-// Coefficients 0-7 before the final butterfly
-static INLINE void idct16_10_first_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-  const __m256i v4 = _mm256_mulhrs_epi16(in[2], c2p28);
-  const __m256i v7 = _mm256_mulhrs_epi16(in[2], c2p04);
-
-  const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m256i v0 = _mm256_mulhrs_epi16(in[0], c2p16);
-  const __m256i v1 = v0;
-
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  __m256i v5, v6;
-  unpack_butter_fly(&v7, &v4, &cospi_p16_m16, &cospi_p16_p16, &v5, &v6);
-
-  out[0] = _mm256_add_epi16(v0, v7);
-  out[1] = _mm256_add_epi16(v1, v6);
-  out[2] = _mm256_add_epi16(v1, v5);
-  out[3] = _mm256_add_epi16(v0, v4);
-  out[4] = _mm256_sub_epi16(v0, v4);
-  out[5] = _mm256_sub_epi16(v1, v5);
-  out[6] = _mm256_sub_epi16(v1, v6);
-  out[7] = _mm256_sub_epi16(v0, v7);
-}
-
-// Coefficients 8-15 before the final butterfly
-static INLINE void idct16_10_second_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  const __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30);
-  const __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02);
-
-  const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-  const __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26);
-  const __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06);
-
-  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  __m256i t1, t2, t5, t6;
-  unpack_butter_fly(&t0, &t7, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
-  unpack_butter_fly(&t3, &t4, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
-
-  out[0] = _mm256_add_epi16(t0, t3);
-  out[1] = _mm256_add_epi16(t1, t2);
-  out[6] = _mm256_add_epi16(t6, t5);
-  out[7] = _mm256_add_epi16(t7, t4);
-
-  const __m256i v2 = _mm256_sub_epi16(t1, t2);
-  const __m256i v3 = _mm256_sub_epi16(t0, t3);
-  const __m256i v4 = _mm256_sub_epi16(t7, t4);
-  const __m256i v5 = _mm256_sub_epi16(t6, t5);
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]);
-  unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]);
-}
-
-static INLINE void add_sub_butterfly(const __m256i *in, __m256i *out,
-                                     int size) {
-  int i = 0;
-  const int num = size >> 1;
-  const int bound = size - 1;
-  while (i < num) {
-    out[i] = _mm256_add_epi16(in[i], in[bound - i]);
-    out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]);
-    i++;
-  }
-}
-
-static INLINE void idct16_10(__m256i *in /*in[16]*/) {
-  __m256i out[16];
-  idct16_10_first_half(in, out);
-  idct16_10_second_half(in, &out[8]);
-  add_sub_butterfly(out, in, 16);
-}
-
-void aom_idct16x16_10_add_avx2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  __m256i in[16];
-
-  load_coeff(input, &in[0]);
-  load_coeff(input + 16, &in[1]);
-  load_coeff(input + 32, &in[2]);
-  load_coeff(input + 48, &in[3]);
-
-  transpose_col_to_row_nz4x4(in);
-  idct16_10(in);
-
-  transpose_col_to_row_nz4x16(in);
-  idct16_10(in);
-
-  store_buffer_16xN(in, stride, dest, 16);
-}
-
-// Note:
-//  For 16x16 int16_t matrix
-//  transpose first 8 columns into first 8 rows.
-//  Since only upper-left 8x8 are non-zero, the input are first 8 rows (in[8]).
-//  After transposing, the 8 row vectors are in in[8].
-void transpose_col_to_row_nz8x8(__m256i *in /*in[8]*/) {
-  __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]);
-  __m256i u1 = _mm256_unpackhi_epi16(in[0], in[1]);
-  __m256i u2 = _mm256_unpacklo_epi16(in[2], in[3]);
-  __m256i u3 = _mm256_unpackhi_epi16(in[2], in[3]);
-
-  const __m256i v0 = _mm256_unpacklo_epi32(u0, u2);
-  const __m256i v1 = _mm256_unpackhi_epi32(u0, u2);
-  const __m256i v2 = _mm256_unpacklo_epi32(u1, u3);
-  const __m256i v3 = _mm256_unpackhi_epi32(u1, u3);
-
-  u0 = _mm256_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm256_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm256_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm256_unpackhi_epi16(in[6], in[7]);
-
-  const __m256i v4 = _mm256_unpacklo_epi32(u0, u2);
-  const __m256i v5 = _mm256_unpackhi_epi32(u0, u2);
-  const __m256i v6 = _mm256_unpacklo_epi32(u1, u3);
-  const __m256i v7 = _mm256_unpackhi_epi32(u1, u3);
-
-  in[0] = MM256_SHUFFLE_EPI64(v0, v4, 0);
-  in[1] = MM256_SHUFFLE_EPI64(v0, v4, 3);
-  in[2] = MM256_SHUFFLE_EPI64(v1, v5, 0);
-  in[3] = MM256_SHUFFLE_EPI64(v1, v5, 3);
-  in[4] = MM256_SHUFFLE_EPI64(v2, v6, 0);
-  in[5] = MM256_SHUFFLE_EPI64(v2, v6, 3);
-  in[6] = MM256_SHUFFLE_EPI64(v3, v7, 0);
-  in[7] = MM256_SHUFFLE_EPI64(v3, v7, 3);
-}
-
-// Note:
-//  For 16x16 int16_t matrix
-//  transpose first 8 columns into first 8 rows.
-//  Since only matrix left 8x16 are non-zero, the input are total 16 rows
-//  (in[16]).
-//  After transposing, the 8 row vectors are in in[8]. All else are zero.
-static INLINE void transpose_col_to_row_nz8x16(__m256i *in /*in[16]*/) {
-  transpose_col_to_row_nz8x8(in);
-  transpose_col_to_row_nz8x8(&in[8]);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20);
-  }
-}
-
-static INLINE void idct16_38_first_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-  __m256i t4 = _mm256_mulhrs_epi16(in[2], c2p28);
-  __m256i t7 = _mm256_mulhrs_epi16(in[2], c2p04);
-
-  const __m256i c2m20 = pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-  const __m256i c2p12 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-  __m256i t5 = _mm256_mulhrs_epi16(in[6], c2m20);
-  __m256i t6 = _mm256_mulhrs_epi16(in[6], c2p12);
-
-  const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m256i c2p24 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-  const __m256i c2p08 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-  const __m256i u0 = _mm256_mulhrs_epi16(in[0], c2p16);
-  const __m256i u1 = _mm256_mulhrs_epi16(in[0], c2p16);
-  const __m256i u2 = _mm256_mulhrs_epi16(in[4], c2p24);
-  const __m256i u3 = _mm256_mulhrs_epi16(in[4], c2p08);
-
-  const __m256i u4 = _mm256_add_epi16(t4, t5);
-  const __m256i u5 = _mm256_sub_epi16(t4, t5);
-  const __m256i u6 = _mm256_sub_epi16(t7, t6);
-  const __m256i u7 = _mm256_add_epi16(t7, t6);
-
-  const __m256i t0 = _mm256_add_epi16(u0, u3);
-  const __m256i t1 = _mm256_add_epi16(u1, u2);
-  const __m256i t2 = _mm256_sub_epi16(u1, u2);
-  const __m256i t3 = _mm256_sub_epi16(u0, u3);
-
-  t4 = u4;
-  t7 = u7;
-
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
-
-  out[0] = _mm256_add_epi16(t0, t7);
-  out[1] = _mm256_add_epi16(t1, t6);
-  out[2] = _mm256_add_epi16(t2, t5);
-  out[3] = _mm256_add_epi16(t3, t4);
-  out[4] = _mm256_sub_epi16(t3, t4);
-  out[5] = _mm256_sub_epi16(t2, t5);
-  out[6] = _mm256_sub_epi16(t1, t6);
-  out[7] = _mm256_sub_epi16(t0, t7);
-}
-
-static INLINE void idct16_38_second_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30);
-  __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02);
-
-  const __m256i c2m18 = pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
-  const __m256i c2p14 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
-  __m256i t1 = _mm256_mulhrs_epi16(in[7], c2m18);
-  __m256i t6 = _mm256_mulhrs_epi16(in[7], c2p14);
-
-  const __m256i c2p22 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
-  const __m256i c2p10 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
-  __m256i t2 = _mm256_mulhrs_epi16(in[5], c2p22);
-  __m256i t5 = _mm256_mulhrs_epi16(in[5], c2p10);
-
-  const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-  __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26);
-  __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06);
-
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
-  v0 = _mm256_add_epi16(t0, t1);
-  v1 = _mm256_sub_epi16(t0, t1);
-  v2 = _mm256_sub_epi16(t3, t2);
-  v3 = _mm256_add_epi16(t2, t3);
-  v4 = _mm256_add_epi16(t4, t5);
-  v5 = _mm256_sub_epi16(t4, t5);
-  v6 = _mm256_sub_epi16(t7, t6);
-  v7 = _mm256_add_epi16(t6, t7);
-
-  t0 = v0;
-  t7 = v7;
-  t3 = v3;
-  t4 = v4;
-  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-  unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
-  unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
-
-  v0 = _mm256_add_epi16(t0, t3);
-  v1 = _mm256_add_epi16(t1, t2);
-  v2 = _mm256_sub_epi16(t1, t2);
-  v3 = _mm256_sub_epi16(t0, t3);
-  v4 = _mm256_sub_epi16(t7, t4);
-  v5 = _mm256_sub_epi16(t6, t5);
-  v6 = _mm256_add_epi16(t6, t5);
-  v7 = _mm256_add_epi16(t7, t4);
-
-  // stage 6, (8-15)
-  out[0] = v0;
-  out[1] = v1;
-  out[6] = v6;
-  out[7] = v7;
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]);
-  unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]);
-}
-
-static INLINE void idct16_38(__m256i *in /*in[16]*/) {
-  __m256i out[16];
-  idct16_38_first_half(in, out);
-  idct16_38_second_half(in, &out[8]);
-  add_sub_butterfly(out, in, 16);
-}
-
-void aom_idct16x16_38_add_avx2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  __m256i in[16];
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    load_coeff(input + (i << 4), &in[i]);
-  }
-
-  transpose_col_to_row_nz8x8(in);
-  idct16_38(in);
-
-  transpose_col_to_row_nz8x16(in);
-  idct16_38(in);
-
-  store_buffer_16xN(in, stride, dest, 16);
-}
-
-static INLINE int calculate_dc(const tran_low_t *input) {
-  int dc = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  dc = (int)dct_const_round_shift(dc * cospi_16_64);
-  dc = ROUND_POWER_OF_TWO(dc, IDCT_ROUNDING_POS);
-  return dc;
-}
-
-void aom_idct16x16_1_add_avx2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const int dc = calculate_dc(input);
-  if (dc == 0) return;
-
-  const __m256i dc_value = _mm256_set1_epi16(dc);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    recon_and_store(&dc_value, dest);
-    dest += stride;
-  }
-}
-
-// -----------------------------------------------------------------------------
-// 32x32 partial IDCT
-
-void aom_idct32x32_1_add_avx2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const int dc = calculate_dc(input);
-  if (dc == 0) return;
-
-  const __m256i dc_value = _mm256_set1_epi16(dc);
-
-  int i;
-  for (i = 0; i < 32; ++i) {
-    recon_and_store(&dc_value, dest);
-    recon_and_store(&dc_value, dest + 16);
-    dest += stride;
-  }
-}
-
-static void load_buffer_32x16(const tran_low_t *input, __m256i *in /*in[32]*/) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    load_coeff(input, &in[i]);
-    load_coeff(input + 16, &in[i + 16]);
-    input += 32;
-  }
-}
-
-// Note:
-//  We extend SSSE3 operations to AVX2. Instead of operating on __m128i, we
-// operate coefficients on __m256i. Our operation capacity doubles for each
-// instruction.
-#define BUTTERFLY_PAIR(x0, x1, co0, co1)            \
-  do {                                              \
-    tmp0 = _mm256_madd_epi16(x0, co0);              \
-    tmp1 = _mm256_madd_epi16(x1, co0);              \
-    tmp2 = _mm256_madd_epi16(x0, co1);              \
-    tmp3 = _mm256_madd_epi16(x1, co1);              \
-    tmp0 = _mm256_add_epi32(tmp0, rounding);        \
-    tmp1 = _mm256_add_epi32(tmp1, rounding);        \
-    tmp2 = _mm256_add_epi32(tmp2, rounding);        \
-    tmp3 = _mm256_add_epi32(tmp3, rounding);        \
-    tmp0 = _mm256_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm256_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm256_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm256_srai_epi32(tmp3, DCT_CONST_BITS); \
-  } while (0)
-
-static INLINE void butterfly(const __m256i *x0, const __m256i *x1,
-                             const __m256i *c0, const __m256i *c1, __m256i *y0,
-                             __m256i *y1) {
-  __m256i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm256_unpacklo_epi16(*x0, *x1);
-  u1 = _mm256_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *y0 = _mm256_packs_epi32(tmp0, tmp1);
-  *y1 = _mm256_packs_epi32(tmp2, tmp3);
-}
-
-static INLINE void butterfly_self(__m256i *x0, __m256i *x1, const __m256i *c0,
-                                  const __m256i *c1) {
-  __m256i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm256_unpacklo_epi16(*x0, *x1);
-  u1 = _mm256_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *x0 = _mm256_packs_epi32(tmp0, tmp1);
-  *x1 = _mm256_packs_epi32(tmp2, tmp3);
-}
-
-// For each 16x32 block __m256i in[32],
-// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
-// output pixels: 8-15 in __m256i in[32]
-static void idct32_full_16x32_quarter_2(const __m256i *in /*in[32]*/,
-                                        __m256i *out /*out[16]*/) {
-  __m256i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
-  __m256i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_
-
-  {
-    const __m256i stg2_0 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
-    const __m256i stg2_1 = pair256_set_epi16(cospi_2_64, cospi_30_64);
-    const __m256i stg2_2 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
-    const __m256i stg2_3 = pair256_set_epi16(cospi_18_64, cospi_14_64);
-    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
-    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
-  }
-
-  v8 = _mm256_add_epi16(u8, u9);
-  v9 = _mm256_sub_epi16(u8, u9);
-  v14 = _mm256_sub_epi16(u15, u14);
-  v15 = _mm256_add_epi16(u15, u14);
-
-  {
-    const __m256i stg2_4 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
-    const __m256i stg2_5 = pair256_set_epi16(cospi_10_64, cospi_22_64);
-    const __m256i stg2_6 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
-    const __m256i stg2_7 = pair256_set_epi16(cospi_26_64, cospi_6_64);
-    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
-    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
-  }
-
-  v10 = _mm256_sub_epi16(u11, u10);
-  v11 = _mm256_add_epi16(u11, u10);
-  v12 = _mm256_add_epi16(u12, u13);
-  v13 = _mm256_sub_epi16(u12, u13);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(v8, v11);
-  out[1] = _mm256_add_epi16(v9, v10);
-  out[6] = _mm256_add_epi16(v14, v13);
-  out[7] = _mm256_add_epi16(v15, v12);
-
-  out[2] = _mm256_sub_epi16(v9, v10);
-  out[3] = _mm256_sub_epi16(v8, v11);
-  out[4] = _mm256_sub_epi16(v15, v12);
-  out[5] = _mm256_sub_epi16(v14, v13);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// For each 8x32 block __m256i in[32],
-// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
-// output pixels: 0-7 in __m256i in[32]
-static void idct32_full_16x32_quarter_1(const __m256i *in /*in[32]*/,
-                                        __m256i *out /*out[8]*/) {
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_
-
-  {
-    const __m256i stg3_0 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
-    const __m256i stg3_1 = pair256_set_epi16(cospi_4_64, cospi_28_64);
-    const __m256i stg3_2 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
-    const __m256i stg3_3 = pair256_set_epi16(cospi_20_64, cospi_12_64);
-    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
-    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
-  }
-
-  v4 = _mm256_add_epi16(u4, u5);
-  v5 = _mm256_sub_epi16(u4, u5);
-  v6 = _mm256_sub_epi16(u7, u6);
-  v7 = _mm256_add_epi16(u7, u6);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-    const __m256i stg4_2 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
-    const __m256i stg4_3 = pair256_set_epi16(cospi_8_64, cospi_24_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-
-    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
-    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
-  }
-
-  v0 = _mm256_add_epi16(u0, u3);
-  v1 = _mm256_add_epi16(u1, u2);
-  v2 = _mm256_sub_epi16(u1, u2);
-  v3 = _mm256_sub_epi16(u0, u3);
-
-  out[0] = _mm256_add_epi16(v0, v7);
-  out[1] = _mm256_add_epi16(v1, v6);
-  out[2] = _mm256_add_epi16(v2, v5);
-  out[3] = _mm256_add_epi16(v3, v4);
-  out[4] = _mm256_sub_epi16(v3, v4);
-  out[5] = _mm256_sub_epi16(v2, v5);
-  out[6] = _mm256_sub_epi16(v1, v6);
-  out[7] = _mm256_sub_epi16(v0, v7);
-}
-
-// For each 8x32 block __m256i in[32],
-// Input with odd index,
-// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-// output pixels: 16-23, 24-31 in __m256i in[32]
-// We avoid hide an offset, 16, inside this function. So we output 0-15 into
-// array out[16]
-static void idct32_full_16x32_quarter_3_4(const __m256i *in /*in[32]*/,
-                                          __m256i *out /*out[16]*/) {
-  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m256i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m256i stg1_0 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
-    const __m256i stg1_1 = pair256_set_epi16(cospi_1_64, cospi_31_64);
-    const __m256i stg1_2 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
-    const __m256i stg1_3 = pair256_set_epi16(cospi_17_64, cospi_15_64);
-    const __m256i stg1_4 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
-    const __m256i stg1_5 = pair256_set_epi16(cospi_9_64, cospi_23_64);
-    const __m256i stg1_6 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
-    const __m256i stg1_7 = pair256_set_epi16(cospi_25_64, cospi_7_64);
-    const __m256i stg1_8 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
-    const __m256i stg1_9 = pair256_set_epi16(cospi_5_64, cospi_27_64);
-    const __m256i stg1_10 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
-    const __m256i stg1_11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
-    const __m256i stg1_12 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
-    const __m256i stg1_13 = pair256_set_epi16(cospi_13_64, cospi_19_64);
-    const __m256i stg1_14 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
-    const __m256i stg1_15 = pair256_set_epi16(cospi_29_64, cospi_3_64);
-    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
-    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
-    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
-    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
-
-    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
-    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
-
-    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
-    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
-  }
-
-  v16 = _mm256_add_epi16(u16, u17);
-  v17 = _mm256_sub_epi16(u16, u17);
-  v18 = _mm256_sub_epi16(u19, u18);
-  v19 = _mm256_add_epi16(u19, u18);
-
-  v20 = _mm256_add_epi16(u20, u21);
-  v21 = _mm256_sub_epi16(u20, u21);
-  v22 = _mm256_sub_epi16(u23, u22);
-  v23 = _mm256_add_epi16(u23, u22);
-
-  v24 = _mm256_add_epi16(u24, u25);
-  v25 = _mm256_sub_epi16(u24, u25);
-  v26 = _mm256_sub_epi16(u27, u26);
-  v27 = _mm256_add_epi16(u27, u26);
-
-  v28 = _mm256_add_epi16(u28, u29);
-  v29 = _mm256_sub_epi16(u28, u29);
-  v30 = _mm256_sub_epi16(u31, u30);
-  v31 = _mm256_add_epi16(u31, u30);
-
-  {
-    const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
-    const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
-    const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm256_add_epi16(v16, v19);
-  u17 = _mm256_add_epi16(v17, v18);
-  u18 = _mm256_sub_epi16(v17, v18);
-  u19 = _mm256_sub_epi16(v16, v19);
-  u20 = _mm256_sub_epi16(v23, v20);
-  u21 = _mm256_sub_epi16(v22, v21);
-  u22 = _mm256_add_epi16(v22, v21);
-  u23 = _mm256_add_epi16(v23, v20);
-
-  u24 = _mm256_add_epi16(v24, v27);
-  u25 = _mm256_add_epi16(v25, v26);
-  u26 = _mm256_sub_epi16(v25, v26);
-  u27 = _mm256_sub_epi16(v24, v27);
-
-  u28 = _mm256_sub_epi16(v31, v28);
-  u29 = _mm256_sub_epi16(v30, v29);
-  u30 = _mm256_add_epi16(v29, v30);
-  u31 = _mm256_add_epi16(v28, v31);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(u16, u23);
-  out[1] = _mm256_add_epi16(u17, u22);
-  out[2] = _mm256_add_epi16(u18, u21);
-  out[3] = _mm256_add_epi16(u19, u20);
-  out[4] = _mm256_sub_epi16(u19, u20);
-  out[5] = _mm256_sub_epi16(u18, u21);
-  out[6] = _mm256_sub_epi16(u17, u22);
-  out[7] = _mm256_sub_epi16(u16, u23);
-
-  out[8] = _mm256_sub_epi16(u31, u24);
-  out[9] = _mm256_sub_epi16(u30, u25);
-  out[10] = _mm256_sub_epi16(u29, u26);
-  out[11] = _mm256_sub_epi16(u28, u27);
-  out[12] = _mm256_add_epi16(u27, u28);
-  out[13] = _mm256_add_epi16(u26, u29);
-  out[14] = _mm256_add_epi16(u25, u30);
-  out[15] = _mm256_add_epi16(u24, u31);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
-    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
-    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
-    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
-  }
-}
-
-static void idct32_full_16x32_quarter_1_2(const __m256i *in /*in[32]*/,
-                                          __m256i *out /*out[32]*/) {
-  __m256i temp[16];
-  idct32_full_16x32_quarter_1(in, temp);
-  idct32_full_16x32_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-static void idct32_16x32(const __m256i *in /*in[32]*/,
-                         __m256i *out /*out[32]*/) {
-  __m256i temp[32];
-  idct32_full_16x32_quarter_1_2(in, temp);
-  idct32_full_16x32_quarter_3_4(in, &temp[16]);
-  add_sub_butterfly(temp, out, 32);
-}
-
-void aom_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest,
-                                 int stride) {
-  __m256i col[64], in[32];
-  int i;
-
-  for (i = 0; i < 2; ++i) {
-    load_buffer_32x16(input, in);
-    input += 32 << 4;
-
-    mm256_transpose_16x16(in, in);
-    mm256_transpose_16x16(&in[16], &in[16]);
-    idct32_16x32(in, col + (i << 5));
-  }
-
-  for (i = 0; i < 2; ++i) {
-    int j = i << 4;
-    mm256_transpose_16x16(col + j, in);
-    mm256_transpose_16x16(col + j + 32, &in[16]);
-    idct32_16x32(in, in);
-    store_buffer_16xN(in, stride, dest, 32);
-    dest += 16;
-  }
-}
-
-// Group the coefficient calculation into smaller functions
-// to prevent stack spillover:
-// quarter_1: 0-7
-// quarter_2: 8-15
-// quarter_3_4: 16-23, 24-31
-static void idct32_16x32_135_quarter_1(const __m256i *in /*in[16]*/,
-                                       __m256i *out /*out[8]*/) {
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
-
-  {
-    const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-    const __m256i stk4_2 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-    const __m256i stk4_3 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-    u0 = _mm256_mulhrs_epi16(in[0], stk4_0);
-    u2 = _mm256_mulhrs_epi16(in[8], stk4_2);
-    u3 = _mm256_mulhrs_epi16(in[8], stk4_3);
-    u1 = u0;
-  }
-
-  v0 = _mm256_add_epi16(u0, u3);
-  v1 = _mm256_add_epi16(u1, u2);
-  v2 = _mm256_sub_epi16(u1, u2);
-  v3 = _mm256_sub_epi16(u0, u3);
-
-  {
-    const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-    const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-    const __m256i stk3_2 =
-        pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-    const __m256i stk3_3 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-    u4 = _mm256_mulhrs_epi16(in[4], stk3_0);
-    u7 = _mm256_mulhrs_epi16(in[4], stk3_1);
-    u5 = _mm256_mulhrs_epi16(in[12], stk3_2);
-    u6 = _mm256_mulhrs_epi16(in[12], stk3_3);
-  }
-
-  v4 = _mm256_add_epi16(u4, u5);
-  v5 = _mm256_sub_epi16(u4, u5);
-  v6 = _mm256_sub_epi16(u7, u6);
-  v7 = _mm256_add_epi16(u7, u6);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-  }
-
-  out[0] = _mm256_add_epi16(v0, v7);
-  out[1] = _mm256_add_epi16(v1, v6);
-  out[2] = _mm256_add_epi16(v2, v5);
-  out[3] = _mm256_add_epi16(v3, v4);
-  out[4] = _mm256_sub_epi16(v3, v4);
-  out[5] = _mm256_sub_epi16(v2, v5);
-  out[6] = _mm256_sub_epi16(v1, v6);
-  out[7] = _mm256_sub_epi16(v0, v7);
-}
-
-static void idct32_16x32_135_quarter_2(const __m256i *in /*in[16]*/,
-                                       __m256i *out /*out[8]*/) {
-  __m256i u8, u9, u10, u11, u12, u13, u14, u15;
-  __m256i v8, v9, v10, v11, v12, v13, v14, v15;
-
-  {
-    const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-    const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-    const __m256i stk2_2 =
-        pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
-    const __m256i stk2_3 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
-    const __m256i stk2_4 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
-    const __m256i stk2_5 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
-    const __m256i stk2_6 =
-        pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-    const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-    u8 = _mm256_mulhrs_epi16(in[2], stk2_0);
-    u15 = _mm256_mulhrs_epi16(in[2], stk2_1);
-    u9 = _mm256_mulhrs_epi16(in[14], stk2_2);
-    u14 = _mm256_mulhrs_epi16(in[14], stk2_3);
-    u10 = _mm256_mulhrs_epi16(in[10], stk2_4);
-    u13 = _mm256_mulhrs_epi16(in[10], stk2_5);
-    u11 = _mm256_mulhrs_epi16(in[6], stk2_6);
-    u12 = _mm256_mulhrs_epi16(in[6], stk2_7);
-  }
-
-  v8 = _mm256_add_epi16(u8, u9);
-  v9 = _mm256_sub_epi16(u8, u9);
-  v10 = _mm256_sub_epi16(u11, u10);
-  v11 = _mm256_add_epi16(u11, u10);
-  v12 = _mm256_add_epi16(u12, u13);
-  v13 = _mm256_sub_epi16(u12, u13);
-  v14 = _mm256_sub_epi16(u15, u14);
-  v15 = _mm256_add_epi16(u15, u14);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(v8, v11);
-  out[1] = _mm256_add_epi16(v9, v10);
-  out[2] = _mm256_sub_epi16(v9, v10);
-  out[3] = _mm256_sub_epi16(v8, v11);
-  out[4] = _mm256_sub_epi16(v15, v12);
-  out[5] = _mm256_sub_epi16(v14, v13);
-  out[6] = _mm256_add_epi16(v14, v13);
-  out[7] = _mm256_add_epi16(v15, v12);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// 8x32 block even indexed 8 inputs of in[16],
-// output first half 16 to out[32]
-static void idct32_16x32_quarter_1_2(const __m256i *in /*in[16]*/,
-                                     __m256i *out /*out[32]*/) {
-  __m256i temp[16];
-  idct32_16x32_135_quarter_1(in, temp);
-  idct32_16x32_135_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-// 8x32 block odd indexed 8 inputs of in[16],
-// output second half 16 to out[32]
-static void idct32_16x32_quarter_3_4(const __m256i *in /*in[16]*/,
-                                     __m256i *out /*out[32]*/) {
-  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m256i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-    const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-    const __m256i stk1_2 =
-        pair256_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
-    const __m256i stk1_3 = pair256_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);
-
-    const __m256i stk1_4 = pair256_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
-    const __m256i stk1_5 = pair256_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
-    const __m256i stk1_6 =
-        pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-    const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-    const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-    const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-    const __m256i stk1_10 =
-        pair256_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
-    const __m256i stk1_11 = pair256_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);
-
-    const __m256i stk1_12 = pair256_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
-    const __m256i stk1_13 = pair256_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
-    const __m256i stk1_14 =
-        pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-    const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-    u16 = _mm256_mulhrs_epi16(in[1], stk1_0);
-    u31 = _mm256_mulhrs_epi16(in[1], stk1_1);
-    u17 = _mm256_mulhrs_epi16(in[15], stk1_2);
-    u30 = _mm256_mulhrs_epi16(in[15], stk1_3);
-
-    u18 = _mm256_mulhrs_epi16(in[9], stk1_4);
-    u29 = _mm256_mulhrs_epi16(in[9], stk1_5);
-    u19 = _mm256_mulhrs_epi16(in[7], stk1_6);
-    u28 = _mm256_mulhrs_epi16(in[7], stk1_7);
-
-    u20 = _mm256_mulhrs_epi16(in[5], stk1_8);
-    u27 = _mm256_mulhrs_epi16(in[5], stk1_9);
-    u21 = _mm256_mulhrs_epi16(in[11], stk1_10);
-    u26 = _mm256_mulhrs_epi16(in[11], stk1_11);
-
-    u22 = _mm256_mulhrs_epi16(in[13], stk1_12);
-    u25 = _mm256_mulhrs_epi16(in[13], stk1_13);
-    u23 = _mm256_mulhrs_epi16(in[3], stk1_14);
-    u24 = _mm256_mulhrs_epi16(in[3], stk1_15);
-  }
-
-  v16 = _mm256_add_epi16(u16, u17);
-  v17 = _mm256_sub_epi16(u16, u17);
-  v18 = _mm256_sub_epi16(u19, u18);
-  v19 = _mm256_add_epi16(u19, u18);
-
-  v20 = _mm256_add_epi16(u20, u21);
-  v21 = _mm256_sub_epi16(u20, u21);
-  v22 = _mm256_sub_epi16(u23, u22);
-  v23 = _mm256_add_epi16(u23, u22);
-
-  v24 = _mm256_add_epi16(u24, u25);
-  v25 = _mm256_sub_epi16(u24, u25);
-  v26 = _mm256_sub_epi16(u27, u26);
-  v27 = _mm256_add_epi16(u27, u26);
-
-  v28 = _mm256_add_epi16(u28, u29);
-  v29 = _mm256_sub_epi16(u28, u29);
-  v30 = _mm256_sub_epi16(u31, u30);
-  v31 = _mm256_add_epi16(u31, u30);
-
-  {
-    const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
-    const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
-    const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm256_add_epi16(v16, v19);
-  u17 = _mm256_add_epi16(v17, v18);
-  u18 = _mm256_sub_epi16(v17, v18);
-  u19 = _mm256_sub_epi16(v16, v19);
-  u20 = _mm256_sub_epi16(v23, v20);
-  u21 = _mm256_sub_epi16(v22, v21);
-  u22 = _mm256_add_epi16(v22, v21);
-  u23 = _mm256_add_epi16(v23, v20);
-
-  u24 = _mm256_add_epi16(v24, v27);
-  u25 = _mm256_add_epi16(v25, v26);
-  u26 = _mm256_sub_epi16(v25, v26);
-  u27 = _mm256_sub_epi16(v24, v27);
-  u28 = _mm256_sub_epi16(v31, v28);
-  u29 = _mm256_sub_epi16(v30, v29);
-  u30 = _mm256_add_epi16(v29, v30);
-  u31 = _mm256_add_epi16(v28, v31);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(u16, u23);
-  out[1] = _mm256_add_epi16(u17, u22);
-  out[2] = _mm256_add_epi16(u18, u21);
-  out[3] = _mm256_add_epi16(u19, u20);
-  v20 = _mm256_sub_epi16(u19, u20);
-  v21 = _mm256_sub_epi16(u18, u21);
-  v22 = _mm256_sub_epi16(u17, u22);
-  v23 = _mm256_sub_epi16(u16, u23);
-
-  v24 = _mm256_sub_epi16(u31, u24);
-  v25 = _mm256_sub_epi16(u30, u25);
-  v26 = _mm256_sub_epi16(u29, u26);
-  v27 = _mm256_sub_epi16(u28, u27);
-  out[12] = _mm256_add_epi16(u27, u28);
-  out[13] = _mm256_add_epi16(u26, u29);
-  out[14] = _mm256_add_epi16(u25, u30);
-  out[15] = _mm256_add_epi16(u24, u31);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
-    butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
-    butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
-    butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
-  }
-}
-
-// 16x16 block input __m256i in[32], output 16x32 __m256i in[32]
-static void idct32_16x32_135(__m256i *in /*in[32]*/) {
-  __m256i out[32];
-  idct32_16x32_quarter_1_2(in, out);
-  idct32_16x32_quarter_3_4(in, &out[16]);
-  add_sub_butterfly(out, in, 32);
-}
-
-static INLINE void load_buffer_from_32x32(const tran_low_t *coeff, __m256i *in,
-                                          int size) {
-  int i = 0;
-  while (i < size) {
-    load_coeff(coeff + (i << 5), &in[i]);
-    i += 1;
-  }
-}
-
-static INLINE void zero_buffer(__m256i *in, int num) {
-  int i;
-  for (i = 0; i < num; ++i) {
-    in[i] = _mm256_setzero_si256();
-  }
-}
-
-// Only upper-left 16x16 has non-zero coeff
-void aom_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  __m256i in[32];
-  zero_buffer(in, 32);
-  load_buffer_from_32x32(input, in, 16);
-  mm256_transpose_16x16(in, in);
-  idct32_16x32_135(in);
-
-  __m256i out[32];
-  mm256_transpose_16x16(in, out);
-  idct32_16x32_135(out);
-  store_buffer_16xN(out, stride, dest, 32);
-  mm256_transpose_16x16(&in[16], in);
-  idct32_16x32_135(in);
-  store_buffer_16xN(in, stride, dest + 16, 32);
-}
-
-static void idct32_34_first_half(const __m256i *in, __m256i *stp1) {
-  const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  const __m256i stk2_6 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-
-  const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-
-  const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-  const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m256i x0, x1, x4, x5, x6, x7;
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-
-  // phase 1
-
-  // 0, 15
-  u2 = _mm256_mulhrs_epi16(in[2], stk2_1);  // stp2_15
-  u3 = _mm256_mulhrs_epi16(in[6], stk2_7);  // stp2_12
-  v15 = _mm256_add_epi16(u2, u3);
-  // in[0], in[4]
-  x0 = _mm256_mulhrs_epi16(in[0], stk4_0);  // stp1[0]
-  x7 = _mm256_mulhrs_epi16(in[4], stk3_1);  // stp1[7]
-  v0 = _mm256_add_epi16(x0, x7);            // stp2_0
-  stp1[0] = _mm256_add_epi16(v0, v15);
-  stp1[15] = _mm256_sub_epi16(v0, v15);
-
-  // in[2], in[6]
-  u0 = _mm256_mulhrs_epi16(in[2], stk2_0);          // stp2_8
-  u1 = _mm256_mulhrs_epi16(in[6], stk2_6);          // stp2_11
-  butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5);  // stp2_9, stp2_14
-  butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7);  // stp2_10, stp2_13
-
-  v8 = _mm256_add_epi16(u0, u1);
-  v9 = _mm256_add_epi16(u4, u6);
-  v10 = _mm256_sub_epi16(u4, u6);
-  v11 = _mm256_sub_epi16(u0, u1);
-  v12 = _mm256_sub_epi16(u2, u3);
-  v13 = _mm256_sub_epi16(u5, u7);
-  v14 = _mm256_add_epi16(u5, u7);
-
-  butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
-  butterfly_self(&v11, &v12, &stg6_0, &stg4_0);
-
-  // 1, 14
-  x1 = _mm256_mulhrs_epi16(in[0], stk4_0);  // stp1[1], stk4_1 = stk4_0
-  // stp1[2] = stp1[0], stp1[3] = stp1[1]
-  x4 = _mm256_mulhrs_epi16(in[4], stk3_0);  // stp1[4]
-  butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
-  v1 = _mm256_add_epi16(x1, x6);  // stp2_1
-  v2 = _mm256_add_epi16(x0, x5);  // stp2_2
-  stp1[1] = _mm256_add_epi16(v1, v14);
-  stp1[14] = _mm256_sub_epi16(v1, v14);
-
-  stp1[2] = _mm256_add_epi16(v2, v13);
-  stp1[13] = _mm256_sub_epi16(v2, v13);
-
-  v3 = _mm256_add_epi16(x1, x4);  // stp2_3
-  v4 = _mm256_sub_epi16(x1, x4);  // stp2_4
-
-  v5 = _mm256_sub_epi16(x0, x5);  // stp2_5
-
-  v6 = _mm256_sub_epi16(x1, x6);  // stp2_6
-  v7 = _mm256_sub_epi16(x0, x7);  // stp2_7
-  stp1[3] = _mm256_add_epi16(v3, v12);
-  stp1[12] = _mm256_sub_epi16(v3, v12);
-
-  stp1[6] = _mm256_add_epi16(v6, v9);
-  stp1[9] = _mm256_sub_epi16(v6, v9);
-
-  stp1[7] = _mm256_add_epi16(v7, v8);
-  stp1[8] = _mm256_sub_epi16(v7, v8);
-
-  stp1[4] = _mm256_add_epi16(v4, v11);
-  stp1[11] = _mm256_sub_epi16(v4, v11);
-
-  stp1[5] = _mm256_add_epi16(v5, v10);
-  stp1[10] = _mm256_sub_epi16(v5, v10);
-}
-
-static void idct32_34_second_half(const __m256i *in, __m256i *stp1) {
-  const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-  const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-  const __m256i stk1_6 = pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-  const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-  const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-  const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-  const __m256i stk1_14 = pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-  const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-  const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
-  const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
-  const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-  const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m256i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  v16 = _mm256_mulhrs_epi16(in[1], stk1_0);
-  v31 = _mm256_mulhrs_epi16(in[1], stk1_1);
-
-  v19 = _mm256_mulhrs_epi16(in[7], stk1_6);
-  v28 = _mm256_mulhrs_epi16(in[7], stk1_7);
-
-  v20 = _mm256_mulhrs_epi16(in[5], stk1_8);
-  v27 = _mm256_mulhrs_epi16(in[5], stk1_9);
-
-  v23 = _mm256_mulhrs_epi16(in[3], stk1_14);
-  v24 = _mm256_mulhrs_epi16(in[3], stk1_15);
-
-  butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
-  butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
-  butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
-  butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);
-
-  u16 = _mm256_add_epi16(v16, v19);
-  u17 = _mm256_add_epi16(v17, v18);
-  u18 = _mm256_sub_epi16(v17, v18);
-  u19 = _mm256_sub_epi16(v16, v19);
-  u20 = _mm256_sub_epi16(v23, v20);
-  u21 = _mm256_sub_epi16(v22, v21);
-  u22 = _mm256_add_epi16(v22, v21);
-  u23 = _mm256_add_epi16(v23, v20);
-  u24 = _mm256_add_epi16(v24, v27);
-  u27 = _mm256_sub_epi16(v24, v27);
-  u25 = _mm256_add_epi16(v25, v26);
-  u26 = _mm256_sub_epi16(v25, v26);
-  u28 = _mm256_sub_epi16(v31, v28);
-  u31 = _mm256_add_epi16(v28, v31);
-  u29 = _mm256_sub_epi16(v30, v29);
-  u30 = _mm256_add_epi16(v29, v30);
-
-  butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-  butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-  butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-  butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-
-  stp1[0] = _mm256_add_epi16(u16, u23);
-  stp1[7] = _mm256_sub_epi16(u16, u23);
-
-  stp1[1] = _mm256_add_epi16(u17, u22);
-  stp1[6] = _mm256_sub_epi16(u17, u22);
-
-  stp1[2] = _mm256_add_epi16(u18, u21);
-  stp1[5] = _mm256_sub_epi16(u18, u21);
-
-  stp1[3] = _mm256_add_epi16(u19, u20);
-  stp1[4] = _mm256_sub_epi16(u19, u20);
-
-  stp1[8] = _mm256_sub_epi16(u31, u24);
-  stp1[15] = _mm256_add_epi16(u24, u31);
-
-  stp1[9] = _mm256_sub_epi16(u30, u25);
-  stp1[14] = _mm256_add_epi16(u25, u30);
-
-  stp1[10] = _mm256_sub_epi16(u29, u26);
-  stp1[13] = _mm256_add_epi16(u26, u29);
-
-  stp1[11] = _mm256_sub_epi16(u28, u27);
-  stp1[12] = _mm256_add_epi16(u27, u28);
-
-  butterfly_self(&stp1[4], &stp1[11], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[5], &stp1[10], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[6], &stp1[9], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[7], &stp1[8], &stg6_0, &stg4_0);
-}
-
-// 16x16 block input __m256i in[32], output 16x32 __m256i in[32]
-static void idct32_16x32_34(__m256i *in /*in[32]*/) {
-  __m256i out[32];
-  idct32_34_first_half(in, out);
-  idct32_34_second_half(in, &out[16]);
-  add_sub_butterfly(out, in, 32);
-}
-
-// Only upper-left 8x8 has non-zero coeff
-void aom_idct32x32_34_add_avx2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  __m256i in[32];
-  zero_buffer(in, 32);
-  load_buffer_from_32x32(input, in, 8);
-  mm256_transpose_16x16(in, in);
-  idct32_16x32_34(in);
-
-  __m256i out[32];
-  mm256_transpose_16x16(in, out);
-  idct32_16x32_34(out);
-  store_buffer_16xN(out, stride, dest, 32);
-  mm256_transpose_16x16(&in[16], in);
-  idct32_16x32_34(in);
-  store_buffer_16xN(in, stride, dest + 16, 32);
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h
deleted file mode 100644
index 26c5cfe59..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
-#define AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
-
-#include <immintrin.h>
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
-  if (sizeof(tran_low_t) == 4) {
-    *in = _mm256_setr_epi16(
-        (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
-        (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
-        (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
-        (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
-        (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
-        (int16_t)coeff[15]);
-  } else {
-    *in = _mm256_loadu_si256((const __m256i *)coeff);
-  }
-}
-
-static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
-  int i = 0;
-  while (i < 16) {
-    load_coeff(coeff + (i << 4), &in[i]);
-    i += 1;
-  }
-}
-
-static INLINE void recon_and_store(const __m256i *res, uint8_t *output) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i x = _mm_loadu_si128((__m128i const *)output);
-  __m128i p0 = _mm_unpacklo_epi8(x, zero);
-  __m128i p1 = _mm_unpackhi_epi8(x, zero);
-
-  p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
-  p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
-  x = _mm_packus_epi16(p0, p1);
-  _mm_storeu_si128((__m128i *)output, x);
-}
-
-#define IDCT_ROUNDING_POS (6)
-static INLINE void store_buffer_16xN(__m256i *in, const int stride,
-                                     uint8_t *output, int num) {
-  const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
-  int i = 0;
-
-  while (i < num) {
-    in[i] = _mm256_adds_epi16(in[i], rounding);
-    in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
-    recon_and_store(&in[i], output + i * stride);
-    i += 1;
-  }
-}
-
-static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
-                                     const __m256i *c0, const __m256i *c1,
-                                     __m256i *b0, __m256i *b1) {
-  __m256i x0, x1;
-  x0 = _mm256_unpacklo_epi16(*a0, *a1);
-  x1 = _mm256_unpackhi_epi16(*a0, *a1);
-  *b0 = butter_fly(&x0, &x1, c0);
-  *b1 = butter_fly(&x0, &x1, c1);
-}
-
-void av1_idct16_avx2(__m256i *in);
-
-#endif  // AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c
deleted file mode 100644
index 86ce928b7..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c
+++ /dev/null
@@ -1,3500 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/x86/inv_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-#define RECON_AND_STORE4X4(dest, in_x)                    \
-  {                                                       \
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
-    d0 = _mm_unpacklo_epi8(d0, zero);                     \
-    d0 = _mm_add_epi16(in_x, d0);                         \
-    d0 = _mm_packus_epi16(d0, d0);                        \
-    *(int *)(dest) = _mm_cvtsi128_si32(d0);               \
-  }
-
-void aom_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i cst = _mm_setr_epi16(
-      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
-      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
-      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i input0, input1, input2, input3;
-
-  // Rows
-  input0 = load_input_data(input);
-  input2 = load_input_data(input + 8);
-
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_shufflelo_epi16(input0, 0xd8);
-  input0 = _mm_shufflehi_epi16(input0, 0xd8);
-  input2 = _mm_shufflelo_epi16(input2, 0xd8);
-  input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
-  input1 = _mm_unpackhi_epi32(input0, input0);
-  input0 = _mm_unpacklo_epi32(input0, input0);
-  input3 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpacklo_epi32(input2, input2);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input1);
-  input1 = _mm_packs_epi32(input2, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then, we got:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Columns
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_unpacklo_epi32(input2, input2);
-  input1 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpackhi_epi32(input3, input3);
-  input3 = _mm_unpacklo_epi32(input3, input3);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input2);
-  input1 = _mm_packs_epi32(input1, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then, we got:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Final round and shift
-  input2 = _mm_add_epi16(input2, eight);
-  input3 = _mm_add_epi16(input3, eight);
-
-  input2 = _mm_srai_epi16(input2, 4);
-  input3 = _mm_srai_epi16(input3, 4);
-
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
-    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
-    d0 = _mm_unpacklo_epi8(d0, zero);
-    d2 = _mm_unpacklo_epi8(d2, zero);
-    d0 = _mm_add_epi16(d0, input2);
-    d2 = _mm_add_epi16(d2, input3);
-    d0 = _mm_packus_epi16(d0, d2);
-    // store input0
-    *(int *)dest = _mm_cvtsi128_si32(d0);
-    // store input1
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-    // store input2
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-    // store input3
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-  }
-}
-
-void aom_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 4);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
-
-void aom_idct4_sse2(__m128i *in) {
-  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8];
-
-  array_transpose_4x4(in);
-  // stage 1
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
-  u[0] = _mm_packs_epi32(v[0], v[1]);
-  u[1] = _mm_packs_epi32(v[3], v[2]);
-
-  // stage 2
-  in[0] = _mm_add_epi16(u[0], u[1]);
-  in[1] = _mm_sub_epi16(u[0], u[1]);
-  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
-}
-
-void aom_iadst4_sse2(__m128i *in) {
-  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
-  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
-  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
-  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
-  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8], in7;
-
-  array_transpose_4x4(in);
-  in7 = _mm_srli_si128(in[1], 8);
-  in7 = _mm_add_epi16(in7, in[0]);
-  in7 = _mm_sub_epi16(in7, in[1]);
-
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  u[2] = _mm_unpacklo_epi16(in7, kZero);
-  u[3] = _mm_unpackhi_epi16(in[0], kZero);
-
-  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
-  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
-  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
-  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
-  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
-  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
-
-  u[0] = _mm_add_epi32(v[0], v[1]);
-  u[1] = _mm_add_epi32(v[3], v[4]);
-  u[2] = v[2];
-  u[3] = _mm_add_epi32(u[0], u[1]);
-  u[4] = _mm_slli_epi32(v[5], 2);
-  u[5] = _mm_add_epi32(u[3], v[5]);
-  u[6] = _mm_sub_epi32(u[5], u[4]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u[0], u[1]);
-  in[1] = _mm_packs_epi32(u[2], u[3]);
-}
-
-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
-                               res0, res1, res2, res3)                         \
-  {                                                                            \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                                         \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                                         \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                                         \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                                         \
-    tmp4 = _mm_madd_epi16(lo_1, cst2);                                         \
-    tmp5 = _mm_madd_epi16(hi_1, cst2);                                         \
-    tmp6 = _mm_madd_epi16(lo_1, cst3);                                         \
-    tmp7 = _mm_madd_epi16(hi_1, cst3);                                         \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-    tmp4 = _mm_add_epi32(tmp4, rounding);                                      \
-    tmp5 = _mm_add_epi32(tmp5, rounding);                                      \
-    tmp6 = _mm_add_epi32(tmp6, rounding);                                      \
-    tmp7 = _mm_add_epi32(tmp7, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);                               \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);                               \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);                               \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);                               \
-                                                                               \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                                        \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                                        \
-    res2 = _mm_packs_epi32(tmp4, tmp5);                                        \
-    res3 = _mm_packs_epi32(tmp6, tmp7);                                        \
-  }
-
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
-  {                                                                  \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                               \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                               \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                               \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                               \
-                                                                     \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                            \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                            \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                            \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                            \
-                                                                     \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                     \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                     \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                     \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                     \
-                                                                     \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                              \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                              \
-  }
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
-              out4, out5, out6, out7)                                         \
-  {                                                                           \
-    /* Stage1 */                                                              \
-    {                                                                         \
-      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);                     \
-      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);                     \
-      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);                     \
-      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);                     \
-                                                                              \
-      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1,      \
-                             stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6)  \
-    }                                                                         \
-                                                                              \
-    /* Stage2 */                                                              \
-    {                                                                         \
-      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);                     \
-      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);                     \
-      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);                     \
-      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);                     \
-                                                                              \
-      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1,      \
-                             stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3)  \
-                                                                              \
-      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);                                \
-      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);                                \
-      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);                                \
-      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);                                \
-    }                                                                         \
-                                                                              \
-    /* Stage3 */                                                              \
-    {                                                                         \
-      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
-      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
-                                                                              \
-      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);                                \
-      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);                                \
-      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);                                \
-      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);                                \
-                                                                              \
-      tmp0 = _mm_madd_epi16(lo_56, stg2_1);                                   \
-      tmp1 = _mm_madd_epi16(hi_56, stg2_1);                                   \
-      tmp2 = _mm_madd_epi16(lo_56, stg2_0);                                   \
-      tmp3 = _mm_madd_epi16(hi_56, stg2_0);                                   \
-                                                                              \
-      tmp0 = _mm_add_epi32(tmp0, rounding);                                   \
-      tmp1 = _mm_add_epi32(tmp1, rounding);                                   \
-      tmp2 = _mm_add_epi32(tmp2, rounding);                                   \
-      tmp3 = _mm_add_epi32(tmp3, rounding);                                   \
-                                                                              \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                            \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                            \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                            \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                            \
-                                                                              \
-      stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
-      stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
-    }                                                                         \
-                                                                              \
-    /* Stage4  */                                                             \
-    out0 = _mm_adds_epi16(stp1_0, stp2_7);                                    \
-    out1 = _mm_adds_epi16(stp1_1, stp1_6);                                    \
-    out2 = _mm_adds_epi16(stp1_2, stp1_5);                                    \
-    out3 = _mm_adds_epi16(stp1_3, stp2_4);                                    \
-    out4 = _mm_subs_epi16(stp1_3, stp2_4);                                    \
-    out5 = _mm_subs_epi16(stp1_2, stp1_5);                                    \
-    out6 = _mm_subs_epi16(stp1_1, stp1_6);                                    \
-    out7 = _mm_subs_epi16(stp1_0, stp2_7);                                    \
-  }
-
-void aom_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-  in4 = load_input_data(input + 8 * 4);
-  in5 = load_input_data(input + 8 * 5);
-  in6 = load_input_data(input + 8 * 6);
-  in7 = load_input_data(input + 8 * 7);
-
-  // 2-D
-  for (i = 0; i < 2; i++) {
-    // 8x8 Transpose is copied from aom_fdct8x8_sse2()
-    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                  in4, in5, in6, in7);
-
-    // 4-stage 1D idct8x8
-    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
-          in6, in7);
-  }
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-void aom_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 5);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE(dest + 0 * stride, dc_value);
-  RECON_AND_STORE(dest + 1 * stride, dc_value);
-  RECON_AND_STORE(dest + 2 * stride, dc_value);
-  RECON_AND_STORE(dest + 3 * stride, dc_value);
-  RECON_AND_STORE(dest + 4 * stride, dc_value);
-  RECON_AND_STORE(dest + 5 * stride, dc_value);
-  RECON_AND_STORE(dest + 6 * stride, dc_value);
-  RECON_AND_STORE(dest + 7 * stride, dc_value);
-}
-
-void aom_idct8_sse2(__m128i *in) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // 8x8 Transpose is copied from aom_fdct8x8_sse2()
-  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
-                in1, in2, in3, in4, in5, in6, in7);
-
-  // 4-stage 1D idct8x8
-  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
-        in[4], in[5], in[6], in[7]);
-}
-
-void aom_iadst8_sse2(__m128i *in) {
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
-  // transpose
-  array_transpose_8x8(in, in);
-
-  // properly aligned for butterfly input
-  in0 = in[7];
-  in1 = in[0];
-  in2 = in[5];
-  in3 = in[2];
-  in4 = in[3];
-  in5 = in[4];
-  in6 = in[1];
-  in7 = in[6];
-
-  // column transformation
-  // stage 1
-  // interleave and multiply/add into 32-bit integer
-  s0 = _mm_unpacklo_epi16(in0, in1);
-  s1 = _mm_unpackhi_epi16(in0, in1);
-  s2 = _mm_unpacklo_epi16(in2, in3);
-  s3 = _mm_unpackhi_epi16(in2, in3);
-  s4 = _mm_unpacklo_epi16(in4, in5);
-  s5 = _mm_unpackhi_epi16(in4, in5);
-  s6 = _mm_unpacklo_epi16(in6, in7);
-  s7 = _mm_unpackhi_epi16(in6, in7);
-
-  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
-  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
-  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
-  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
-  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
-  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
-  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
-  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
-  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
-  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
-  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
-  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
-  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
-  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
-  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
-  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
-  // addition
-  w0 = _mm_add_epi32(u0, u8);
-  w1 = _mm_add_epi32(u1, u9);
-  w2 = _mm_add_epi32(u2, u10);
-  w3 = _mm_add_epi32(u3, u11);
-  w4 = _mm_add_epi32(u4, u12);
-  w5 = _mm_add_epi32(u5, u13);
-  w6 = _mm_add_epi32(u6, u14);
-  w7 = _mm_add_epi32(u7, u15);
-  w8 = _mm_sub_epi32(u0, u8);
-  w9 = _mm_sub_epi32(u1, u9);
-  w10 = _mm_sub_epi32(u2, u10);
-  w11 = _mm_sub_epi32(u3, u11);
-  w12 = _mm_sub_epi32(u4, u12);
-  w13 = _mm_sub_epi32(u5, u13);
-  w14 = _mm_sub_epi32(u6, u14);
-  w15 = _mm_sub_epi32(u7, u15);
-
-  // shift and rounding
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
-  // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
-
-  // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
-
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  // back to 16-bit intergers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
-
-  // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
-}
-
-void aom_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // Rows. Load 4-row input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-
-  // 8x4 Transpose
-  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
-  // Stage1
-  {
-    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
-    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
-    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
-    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
-    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
-  }
-
-  // Stage2
-  {
-    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
-    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
-    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
-    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
-    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
-    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
-
-    stp2_4 = tmp0;
-    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
-    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
-    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
-
-    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
-    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
-    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
-  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
-  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
-  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
-
-  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
-  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
-        in5, in6, in7);
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-#define IDCT16                                                                 \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]);                 \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]);                 \
-    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);                   \
-    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);                   \
-    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]);                 \
-    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]);                 \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]);                 \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1,   \
-                           stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)   \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
-                           stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]);                 \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]);                 \
-    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]);                 \
-    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
-                           stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)     \
-                                                                               \
-    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);                                  \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
-                                                                               \
-    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);                               \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);                   \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]);                   \
-    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]);                 \
-    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]);                 \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1,   \
-                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
-                                                                               \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
-                                                                               \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-  }
-
-#define IDCT16_10                                                              \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero);                   \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero);                   \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
-                           stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11,         \
-                           stp1_12_0)                                          \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero);                   \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
-                                                                               \
-    stp1_9 = stp1_8_0;                                                         \
-    stp1_10 = stp1_11;                                                         \
-                                                                               \
-    stp1_13 = stp1_12_0;                                                       \
-    stp1_14 = stp1_15;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);                    \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero);                    \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1)   \
-    stp2_5 = stp2_4;                                                           \
-    stp2_6 = stp2_7;                                                           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-                                                                               \
-    stp1_2 = stp1_1;                                                           \
-    stp1_3 = stp1_0;                                                           \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
-                                                                               \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-  }
-
-void aom_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[16], l[16], r[16], *curr1;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_8_0, stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  curr1 = l;
-  for (i = 0; i < 2; i++) {
-    // 1-D idct
-
-    // Load input data.
-    in[0] = load_input_data(input);
-    in[8] = load_input_data(input + 8 * 1);
-    in[1] = load_input_data(input + 8 * 2);
-    in[9] = load_input_data(input + 8 * 3);
-    in[2] = load_input_data(input + 8 * 4);
-    in[10] = load_input_data(input + 8 * 5);
-    in[3] = load_input_data(input + 8 * 6);
-    in[11] = load_input_data(input + 8 * 7);
-    in[4] = load_input_data(input + 8 * 8);
-    in[12] = load_input_data(input + 8 * 9);
-    in[5] = load_input_data(input + 8 * 10);
-    in[13] = load_input_data(input + 8 * 11);
-    in[6] = load_input_data(input + 8 * 12);
-    in[14] = load_input_data(input + 8 * 13);
-    in[7] = load_input_data(input + 8 * 14);
-    in[15] = load_input_data(input + 8 * 15);
-
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-
-    IDCT16
-
-    // Stage7
-    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
-    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
-    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
-    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
-    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
-    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
-    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
-    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
-    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    curr1 = r;
-    input += 128;
-  }
-  for (i = 0; i < 2; i++) {
-    int j;
-    // 1-D idct
-    array_transpose_8x8(l + i * 8, in);
-    array_transpose_8x8(r + i * 8, in + 8);
-
-    IDCT16
-
-    // 2-D
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, i;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (i = 0; i < 16; ++i) {
-    RECON_AND_STORE(dest + 0, dc_value);
-    RECON_AND_STORE(dest + 8, dc_value);
-    dest += stride;
-  }
-}
-
-void iadst16_8col(__m128i *in) {
-  // perform 16x16 1-D ADST for 8 columns
-  __m128i s[16], x[16], u[32], v[32];
-  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
-
-  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
-  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
-  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
-  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
-  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
-  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
-  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
-  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
-  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
-  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
-  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
-  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
-  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
-  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
-  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
-  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
-  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
-  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
-  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
-  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
-  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
-  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
-  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
-  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
-  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
-  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
-  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
-  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
-  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
-  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
-  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
-  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
-  u[0] = _mm_add_epi32(v[0], v[16]);
-  u[1] = _mm_add_epi32(v[1], v[17]);
-  u[2] = _mm_add_epi32(v[2], v[18]);
-  u[3] = _mm_add_epi32(v[3], v[19]);
-  u[4] = _mm_add_epi32(v[4], v[20]);
-  u[5] = _mm_add_epi32(v[5], v[21]);
-  u[6] = _mm_add_epi32(v[6], v[22]);
-  u[7] = _mm_add_epi32(v[7], v[23]);
-  u[8] = _mm_add_epi32(v[8], v[24]);
-  u[9] = _mm_add_epi32(v[9], v[25]);
-  u[10] = _mm_add_epi32(v[10], v[26]);
-  u[11] = _mm_add_epi32(v[11], v[27]);
-  u[12] = _mm_add_epi32(v[12], v[28]);
-  u[13] = _mm_add_epi32(v[13], v[29]);
-  u[14] = _mm_add_epi32(v[14], v[30]);
-  u[15] = _mm_add_epi32(v[15], v[31]);
-  u[16] = _mm_sub_epi32(v[0], v[16]);
-  u[17] = _mm_sub_epi32(v[1], v[17]);
-  u[18] = _mm_sub_epi32(v[2], v[18]);
-  u[19] = _mm_sub_epi32(v[3], v[19]);
-  u[20] = _mm_sub_epi32(v[4], v[20]);
-  u[21] = _mm_sub_epi32(v[5], v[21]);
-  u[22] = _mm_sub_epi32(v[6], v[22]);
-  u[23] = _mm_sub_epi32(v[7], v[23]);
-  u[24] = _mm_sub_epi32(v[8], v[24]);
-  u[25] = _mm_sub_epi32(v[9], v[25]);
-  u[26] = _mm_sub_epi32(v[10], v[26]);
-  u[27] = _mm_sub_epi32(v[11], v[27]);
-  u[28] = _mm_sub_epi32(v[12], v[28]);
-  u[29] = _mm_sub_epi32(v[13], v[29]);
-  u[30] = _mm_sub_epi32(v[14], v[30]);
-  u[31] = _mm_sub_epi32(v[15], v[31]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
-  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
-  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
-  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
-  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
-  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
-  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
-  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
-  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
-  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
-  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
-  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
-  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
-  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
-  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
-  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
-  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
-  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
-  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
-  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
-  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
-  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
-  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
-  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
-  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
-  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
-  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
-  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
-  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
-  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
-  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_packs_epi32(u[8], u[9]);
-  s[5] = _mm_packs_epi32(u[10], u[11]);
-  s[6] = _mm_packs_epi32(u[12], u[13]);
-  s[7] = _mm_packs_epi32(u[14], u[15]);
-  s[8] = _mm_packs_epi32(u[16], u[17]);
-  s[9] = _mm_packs_epi32(u[18], u[19]);
-  s[10] = _mm_packs_epi32(u[20], u[21]);
-  s[11] = _mm_packs_epi32(u[22], u[23]);
-  s[12] = _mm_packs_epi32(u[24], u[25]);
-  s[13] = _mm_packs_epi32(u[26], u[27]);
-  s[14] = _mm_packs_epi32(u[28], u[29]);
-  s[15] = _mm_packs_epi32(u[30], u[31]);
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
-  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], v[8]);
-  u[1] = _mm_add_epi32(v[1], v[9]);
-  u[2] = _mm_add_epi32(v[2], v[10]);
-  u[3] = _mm_add_epi32(v[3], v[11]);
-  u[4] = _mm_add_epi32(v[4], v[12]);
-  u[5] = _mm_add_epi32(v[5], v[13]);
-  u[6] = _mm_add_epi32(v[6], v[14]);
-  u[7] = _mm_add_epi32(v[7], v[15]);
-  u[8] = _mm_sub_epi32(v[0], v[8]);
-  u[9] = _mm_sub_epi32(v[1], v[9]);
-  u[10] = _mm_sub_epi32(v[2], v[10]);
-  u[11] = _mm_sub_epi32(v[3], v[11]);
-  u[12] = _mm_sub_epi32(v[4], v[12]);
-  u[13] = _mm_sub_epi32(v[5], v[13]);
-  u[14] = _mm_sub_epi32(v[6], v[14]);
-  u[15] = _mm_sub_epi32(v[7], v[15]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-  x[0] = _mm_add_epi16(s[0], s[4]);
-  x[1] = _mm_add_epi16(s[1], s[5]);
-  x[2] = _mm_add_epi16(s[2], s[6]);
-  x[3] = _mm_add_epi16(s[3], s[7]);
-  x[4] = _mm_sub_epi16(s[0], s[4]);
-  x[5] = _mm_sub_epi16(s[1], s[5]);
-  x[6] = _mm_sub_epi16(s[2], s[6]);
-  x[7] = _mm_sub_epi16(s[3], s[7]);
-  x[8] = _mm_packs_epi32(u[0], u[1]);
-  x[9] = _mm_packs_epi32(u[2], u[3]);
-  x[10] = _mm_packs_epi32(u[4], u[5]);
-  x[11] = _mm_packs_epi32(u[6], u[7]);
-  x[12] = _mm_packs_epi32(u[8], u[9]);
-  x[13] = _mm_packs_epi32(u[10], u[11]);
-  x[14] = _mm_packs_epi32(u[12], u[13]);
-  x[15] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
-  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
-  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
-  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
-  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
-  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
-  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
-  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], v[4]);
-  u[1] = _mm_add_epi32(v[1], v[5]);
-  u[2] = _mm_add_epi32(v[2], v[6]);
-  u[3] = _mm_add_epi32(v[3], v[7]);
-  u[4] = _mm_sub_epi32(v[0], v[4]);
-  u[5] = _mm_sub_epi32(v[1], v[5]);
-  u[6] = _mm_sub_epi32(v[2], v[6]);
-  u[7] = _mm_sub_epi32(v[3], v[7]);
-  u[8] = _mm_add_epi32(v[8], v[12]);
-  u[9] = _mm_add_epi32(v[9], v[13]);
-  u[10] = _mm_add_epi32(v[10], v[14]);
-  u[11] = _mm_add_epi32(v[11], v[15]);
-  u[12] = _mm_sub_epi32(v[8], v[12]);
-  u[13] = _mm_sub_epi32(v[9], v[13]);
-  u[14] = _mm_sub_epi32(v[10], v[14]);
-  u[15] = _mm_sub_epi32(v[11], v[15]);
-
-  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_add_epi16(x[0], x[2]);
-  s[1] = _mm_add_epi16(x[1], x[3]);
-  s[2] = _mm_sub_epi16(x[0], x[2]);
-  s[3] = _mm_sub_epi16(x[1], x[3]);
-  s[4] = _mm_packs_epi32(v[0], v[1]);
-  s[5] = _mm_packs_epi32(v[2], v[3]);
-  s[6] = _mm_packs_epi32(v[4], v[5]);
-  s[7] = _mm_packs_epi32(v[6], v[7]);
-  s[8] = _mm_add_epi16(x[8], x[10]);
-  s[9] = _mm_add_epi16(x[9], x[11]);
-  s[10] = _mm_sub_epi16(x[8], x[10]);
-  s[11] = _mm_sub_epi16(x[9], x[11]);
-  s[12] = _mm_packs_epi32(v[8], v[9]);
-  s[13] = _mm_packs_epi32(v[10], v[11]);
-  s[14] = _mm_packs_epi32(v[12], v[13]);
-  s[15] = _mm_packs_epi32(v[14], v[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
-  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
-  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
-  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  in[0] = s[0];
-  in[1] = _mm_sub_epi16(kZero, s[8]);
-  in[2] = s[12];
-  in[3] = _mm_sub_epi16(kZero, s[4]);
-  in[4] = _mm_packs_epi32(v[4], v[5]);
-  in[5] = _mm_packs_epi32(v[12], v[13]);
-  in[6] = _mm_packs_epi32(v[8], v[9]);
-  in[7] = _mm_packs_epi32(v[0], v[1]);
-  in[8] = _mm_packs_epi32(v[2], v[3]);
-  in[9] = _mm_packs_epi32(v[10], v[11]);
-  in[10] = _mm_packs_epi32(v[14], v[15]);
-  in[11] = _mm_packs_epi32(v[6], v[7]);
-  in[12] = s[5];
-  in[13] = _mm_sub_epi16(kZero, s[13]);
-  in[14] = s[9];
-  in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-void idct16_8col(__m128i *in) {
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i v[16], u[16], s[16], t[16];
-
-  // stage 1
-  s[0] = in[0];
-  s[1] = in[8];
-  s[2] = in[4];
-  s[3] = in[12];
-  s[4] = in[2];
-  s[5] = in[10];
-  s[6] = in[6];
-  s[7] = in[14];
-  s[8] = in[1];
-  s[9] = in[9];
-  s[10] = in[5];
-  s[11] = in[13];
-  s[12] = in[3];
-  s[13] = in[11];
-  s[14] = in[7];
-  s[15] = in[15];
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
-  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
-  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
-  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[8] = _mm_packs_epi32(u[0], u[1]);
-  s[15] = _mm_packs_epi32(u[2], u[3]);
-  s[9] = _mm_packs_epi32(u[4], u[5]);
-  s[14] = _mm_packs_epi32(u[6], u[7]);
-  s[10] = _mm_packs_epi32(u[8], u[9]);
-  s[13] = _mm_packs_epi32(u[10], u[11]);
-  s[11] = _mm_packs_epi32(u[12], u[13]);
-  s[12] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  t[0] = s[0];
-  t[1] = s[1];
-  t[2] = s[2];
-  t[3] = s[3];
-  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
-  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
-  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[4] = _mm_packs_epi32(u[0], u[1]);
-  t[7] = _mm_packs_epi32(u[2], u[3]);
-  t[5] = _mm_packs_epi32(u[4], u[5]);
-  t[6] = _mm_packs_epi32(u[6], u[7]);
-  t[8] = _mm_add_epi16(s[8], s[9]);
-  t[9] = _mm_sub_epi16(s[8], s[9]);
-  t[10] = _mm_sub_epi16(s[11], s[10]);
-  t[11] = _mm_add_epi16(s[10], s[11]);
-  t[12] = _mm_add_epi16(s[12], s[13]);
-  t[13] = _mm_sub_epi16(s[12], s[13]);
-  t[14] = _mm_sub_epi16(s[15], s[14]);
-  t[15] = _mm_add_epi16(s[14], s[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
-  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
-  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
-  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
-  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
-  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
-  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_add_epi16(t[4], t[5]);
-  s[5] = _mm_sub_epi16(t[4], t[5]);
-  s[6] = _mm_sub_epi16(t[7], t[6]);
-  s[7] = _mm_add_epi16(t[6], t[7]);
-  s[8] = t[8];
-  s[15] = t[15];
-  s[9] = _mm_packs_epi32(u[8], u[9]);
-  s[14] = _mm_packs_epi32(u[10], u[11]);
-  s[10] = _mm_packs_epi32(u[12], u[13]);
-  s[13] = _mm_packs_epi32(u[14], u[15]);
-  s[11] = t[11];
-  s[12] = t[12];
-
-  // stage 5
-  t[0] = _mm_add_epi16(s[0], s[3]);
-  t[1] = _mm_add_epi16(s[1], s[2]);
-  t[2] = _mm_sub_epi16(s[1], s[2]);
-  t[3] = _mm_sub_epi16(s[0], s[3]);
-  t[4] = s[4];
-  t[7] = s[7];
-
-  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  t[5] = _mm_packs_epi32(u[0], u[1]);
-  t[6] = _mm_packs_epi32(u[2], u[3]);
-
-  t[8] = _mm_add_epi16(s[8], s[11]);
-  t[9] = _mm_add_epi16(s[9], s[10]);
-  t[10] = _mm_sub_epi16(s[9], s[10]);
-  t[11] = _mm_sub_epi16(s[8], s[11]);
-  t[12] = _mm_sub_epi16(s[15], s[12]);
-  t[13] = _mm_sub_epi16(s[14], s[13]);
-  t[14] = _mm_add_epi16(s[13], s[14]);
-  t[15] = _mm_add_epi16(s[12], s[15]);
-
-  // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  s[8] = t[8];
-  s[9] = t[9];
-
-  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
-  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
-  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  s[10] = _mm_packs_epi32(u[0], u[1]);
-  s[13] = _mm_packs_epi32(u[2], u[3]);
-  s[11] = _mm_packs_epi32(u[4], u[5]);
-  s[12] = _mm_packs_epi32(u[6], u[7]);
-  s[14] = t[14];
-  s[15] = t[15];
-
-  // stage 7
-  in[0] = _mm_add_epi16(s[0], s[15]);
-  in[1] = _mm_add_epi16(s[1], s[14]);
-  in[2] = _mm_add_epi16(s[2], s[13]);
-  in[3] = _mm_add_epi16(s[3], s[12]);
-  in[4] = _mm_add_epi16(s[4], s[11]);
-  in[5] = _mm_add_epi16(s[5], s[10]);
-  in[6] = _mm_add_epi16(s[6], s[9]);
-  in[7] = _mm_add_epi16(s[7], s[8]);
-  in[8] = _mm_sub_epi16(s[7], s[8]);
-  in[9] = _mm_sub_epi16(s[6], s[9]);
-  in[10] = _mm_sub_epi16(s[5], s[10]);
-  in[11] = _mm_sub_epi16(s[4], s[11]);
-  in[12] = _mm_sub_epi16(s[3], s[12]);
-  in[13] = _mm_sub_epi16(s[2], s[13]);
-  in[14] = _mm_sub_epi16(s[1], s[14]);
-  in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
-void aom_idct16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  idct16_8col(in0);
-  idct16_8col(in1);
-}
-
-void aom_iadst16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  iadst16_8col(in0);
-  iadst16_8col(in1);
-}
-
-void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i in[16], l[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
-      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
-      stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-  // First 1-D inverse DCT
-  // Load input data.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 2);
-  in[2] = load_input_data(input + 8 * 4);
-  in[3] = load_input_data(input + 8 * 6);
-
-  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
-  // Stage2
-  {
-    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
-    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
-    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
-    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
-    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
-    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
-    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  {
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
-    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
-    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
-    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
-    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
-    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
-    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
-    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
-    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
-    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
-    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
-  }
-
-  // Stage5 and Stage6
-  {
-    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
-    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
-    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
-    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
-    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
-    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
-    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
-    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
-    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
-    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
-    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
-    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
-  }
-
-  // Stage6
-  {
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
-    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
-    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
-    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
-    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
-    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
-    stp2_10 = _mm_packs_epi32(tmp0, zero);
-    stp2_13 = _mm_packs_epi32(tmp2, zero);
-    stp2_11 = _mm_packs_epi32(tmp4, zero);
-    stp2_12 = _mm_packs_epi32(tmp6, zero);
-
-    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
-    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
-    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
-    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
-    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
-    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
-    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
-    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
-    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
-    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
-    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage7. Left 8x16 only.
-  l[0] = _mm_add_epi16(stp2_0, stp1_15);
-  l[1] = _mm_add_epi16(stp2_1, stp1_14);
-  l[2] = _mm_add_epi16(stp2_2, stp2_13);
-  l[3] = _mm_add_epi16(stp2_3, stp2_12);
-  l[4] = _mm_add_epi16(stp2_4, stp2_11);
-  l[5] = _mm_add_epi16(stp2_5, stp2_10);
-  l[6] = _mm_add_epi16(stp2_6, stp1_9);
-  l[7] = _mm_add_epi16(stp2_7, stp1_8);
-  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
-  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
-  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
-  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
-  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
-  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
-  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
-  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-  // Second 1-D inverse transform, performed per 8x16 block
-  for (i = 0; i < 2; i++) {
-    int j;
-    array_transpose_4X8(l + 8 * i, in);
-
-    IDCT16_10
-
-    // Stage7
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-#define LOAD_DQCOEFF(reg, input)  \
-  {                               \
-    reg = load_input_data(input); \
-    input += 8;                   \
-  }
-
-#define IDCT32_34                                                              \
-  /* Stage1 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero);                   \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero);                   \
-                                                                               \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]);                   \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]);                   \
-                                                                               \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero);                   \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero);                   \
-                                                                               \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16,        \
-                             stp1_31);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19,        \
-                             stp1_28);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20,        \
-                             stp1_27);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23,      \
-                             stp1_24);                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero);                   \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero);                   \
-                                                                               \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]);                   \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8,         \
-                             stp2_15);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11,        \
-                             stp2_12);                                         \
-                                                                               \
-    stp2_16 = stp1_16;                                                         \
-    stp2_19 = stp1_19;                                                         \
-                                                                               \
-    stp2_20 = stp1_20;                                                         \
-    stp2_23 = stp1_23;                                                         \
-                                                                               \
-    stp2_24 = stp1_24;                                                         \
-    stp2_27 = stp1_27;                                                         \
-                                                                               \
-    stp2_28 = stp1_28;                                                         \
-    stp2_31 = stp1_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero);                   \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero);                   \
-                                                                               \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31);             \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31);             \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27);             \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4,         \
-                             stp1_7);                                          \
-                                                                               \
-    stp1_8 = stp2_8;                                                           \
-    stp1_11 = stp2_11;                                                         \
-    stp1_12 = stp2_12;                                                         \
-    stp1_15 = stp2_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
-                           stp1_29)                                            \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25)                                            \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_31 = stp2_31;                                                         \
-    stp1_19 = stp2_19;                                                         \
-    stp1_20 = stp2_20;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_27 = stp2_27;                                                         \
-    stp1_28 = stp2_28;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero);                   \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero);                   \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0,         \
-                             stp2_1);                                          \
-                                                                               \
-    stp2_4 = stp1_4;                                                           \
-    stp2_5 = stp1_4;                                                           \
-    stp2_6 = stp1_7;                                                           \
-    stp2_7 = stp1_7;                                                           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_15 = stp1_15;                                                         \
-    stp2_11 = stp1_11;                                                         \
-    stp2_12 = stp1_12;                                                         \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    stp1_0 = stp2_0;                                                           \
-    stp1_1 = stp2_1;                                                           \
-    stp1_2 = stp2_1;                                                           \
-    stp1_3 = stp2_0;                                                           \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_4 = stp2_4;                                                           \
-    stp1_7 = stp2_7;                                                           \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
-                           stp1_28)                                            \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-                                                                               \
-    stp1_22 = stp2_22;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_25 = stp2_25;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_9 = stp1_9;                                                           \
-    stp2_14 = stp1_14;                                                         \
-    stp2_15 = stp1_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
-                                                                               \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage7 */                                                                 \
-  {                                                                            \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-    stp1_18 = stp2_18;                                                         \
-    stp1_19 = stp2_19;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
-                           stp1_24)                                            \
-                                                                               \
-    stp1_28 = stp2_28;                                                         \
-    stp1_29 = stp2_29;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }
-
-#define IDCT32(in0, in1)                                                       \
-  /* Stage1 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]);           \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]);           \
-    const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]);          \
-    const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]);          \
-                                                                               \
-    const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]);            \
-    const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]);            \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]);            \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]);            \
-                                                                               \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]);           \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]);           \
-    const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]);          \
-    const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]);          \
-                                                                               \
-    const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]);          \
-    const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]);          \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]);           \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]);           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,       \
-                           stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17,  \
-                           stp1_30)                                            \
-    MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
-                           stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
-    MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,       \
-                           stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,         \
-                           stp1_21, stp1_26)                                   \
-    MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,      \
-                           stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,        \
-                           stp1_23, stp1_24)                                   \
-  }                                                                            \
-                                                                               \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]);           \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]);           \
-    const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]);          \
-    const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]);          \
-                                                                               \
-    const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]);          \
-    const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]);          \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]);           \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]);           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,       \
-                           stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,    \
-                           stp2_14)                                            \
-    MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,       \
-                           stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_17);                                 \
-    stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_18);                                 \
-                                                                               \
-    stp2_20 = _mm_add_epi16(stp1_20, stp1_21);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_22);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_25);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);                                 \
-    stp2_27 = _mm_add_epi16(stp1_27, stp1_26);                                 \
-                                                                               \
-    stp2_28 = _mm_add_epi16(stp1_28, stp1_29);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);                                 \
-    stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_31, stp1_30);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]);           \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]);           \
-    const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]);          \
-    const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]);          \
-                                                                               \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);             \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);             \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,       \
-                           stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,     \
-                           stp1_6)                                             \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_9);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
-    stp1_12 = _mm_add_epi16(stp2_12, stp2_13);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
-                           stp1_29)                                            \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25)                                            \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_31 = stp2_31;                                                         \
-    stp1_19 = stp2_19;                                                         \
-    stp1_20 = stp2_20;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_27 = stp2_27;                                                         \
-    stp1_28 = stp2_28;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]);            \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]);            \
-    const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]);            \
-    const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]);            \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
-                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
-                                                                               \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_15 = stp1_15;                                                         \
-    stp2_11 = stp1_11;                                                         \
-    stp2_12 = stp1_12;                                                         \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_4 = stp2_4;                                                           \
-    stp1_7 = stp2_7;                                                           \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
-                           stp1_28)                                            \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-                                                                               \
-    stp1_22 = stp2_22;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_25 = stp2_25;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_9 = stp1_9;                                                           \
-    stp2_14 = stp1_14;                                                         \
-    stp2_15 = stp1_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
-                                                                               \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage7 */                                                                 \
-  {                                                                            \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-    stp1_18 = stp2_18;                                                         \
-    stp1_19 = stp2_19;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
-                           stp1_24)                                            \
-                                                                               \
-    stp1_28 = stp2_28;                                                         \
-    stp1_29 = stp2_29;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }
-
-// Only upper-left 8x8 has non-zero coeff
-void aom_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[32];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data. Only need to load the top left 8x8 block.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 32);
-  in[2] = load_input_data(input + 64);
-  in[3] = load_input_data(input + 96);
-  in[4] = load_input_data(input + 128);
-  in[5] = load_input_data(input + 160);
-  in[6] = load_input_data(input + 192);
-  in[7] = load_input_data(input + 224);
-
-  for (i = 8; i < 32; ++i) {
-    in[i] = _mm_setzero_si128();
-  }
-
-  array_transpose_8x8(in, in);
-  // TODO(hkuang): Following transposes are unnecessary. But remove them will
-  // lead to performance drop on some devices.
-  array_transpose_8x8(in + 8, in + 8);
-  array_transpose_8x8(in + 16, in + 16);
-  array_transpose_8x8(in + 24, in + 24);
-
-  IDCT32_34
-
-  // 1_D: Store 32 intermediate results for each 8x32 block.
-  col[0] = _mm_add_epi16(stp1_0, stp1_31);
-  col[1] = _mm_add_epi16(stp1_1, stp1_30);
-  col[2] = _mm_add_epi16(stp1_2, stp1_29);
-  col[3] = _mm_add_epi16(stp1_3, stp1_28);
-  col[4] = _mm_add_epi16(stp1_4, stp1_27);
-  col[5] = _mm_add_epi16(stp1_5, stp1_26);
-  col[6] = _mm_add_epi16(stp1_6, stp1_25);
-  col[7] = _mm_add_epi16(stp1_7, stp1_24);
-  col[8] = _mm_add_epi16(stp1_8, stp1_23);
-  col[9] = _mm_add_epi16(stp1_9, stp1_22);
-  col[10] = _mm_add_epi16(stp1_10, stp1_21);
-  col[11] = _mm_add_epi16(stp1_11, stp1_20);
-  col[12] = _mm_add_epi16(stp1_12, stp1_19);
-  col[13] = _mm_add_epi16(stp1_13, stp1_18);
-  col[14] = _mm_add_epi16(stp1_14, stp1_17);
-  col[15] = _mm_add_epi16(stp1_15, stp1_16);
-  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
-  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
-  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
-  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
-  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
-  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
-  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
-  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
-  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
-  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
-  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
-  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
-  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
-  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
-  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
-  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
-  for (i = 0; i < 4; i++) {
-    int j;
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + i * 8, in);
-    IDCT32_34
-
-    // 2_D: Calculate the results and store them to destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
-                                 int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[128], zero_idx[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j, i32;
-
-  for (i = 0; i < 4; i++) {
-    i32 = (i << 5);
-    // First 1-D idct
-    // Load input data.
-    LOAD_DQCOEFF(in[0], input);
-    LOAD_DQCOEFF(in[8], input);
-    LOAD_DQCOEFF(in[16], input);
-    LOAD_DQCOEFF(in[24], input);
-    LOAD_DQCOEFF(in[1], input);
-    LOAD_DQCOEFF(in[9], input);
-    LOAD_DQCOEFF(in[17], input);
-    LOAD_DQCOEFF(in[25], input);
-    LOAD_DQCOEFF(in[2], input);
-    LOAD_DQCOEFF(in[10], input);
-    LOAD_DQCOEFF(in[18], input);
-    LOAD_DQCOEFF(in[26], input);
-    LOAD_DQCOEFF(in[3], input);
-    LOAD_DQCOEFF(in[11], input);
-    LOAD_DQCOEFF(in[19], input);
-    LOAD_DQCOEFF(in[27], input);
-
-    LOAD_DQCOEFF(in[4], input);
-    LOAD_DQCOEFF(in[12], input);
-    LOAD_DQCOEFF(in[20], input);
-    LOAD_DQCOEFF(in[28], input);
-    LOAD_DQCOEFF(in[5], input);
-    LOAD_DQCOEFF(in[13], input);
-    LOAD_DQCOEFF(in[21], input);
-    LOAD_DQCOEFF(in[29], input);
-    LOAD_DQCOEFF(in[6], input);
-    LOAD_DQCOEFF(in[14], input);
-    LOAD_DQCOEFF(in[22], input);
-    LOAD_DQCOEFF(in[30], input);
-    LOAD_DQCOEFF(in[7], input);
-    LOAD_DQCOEFF(in[15], input);
-    LOAD_DQCOEFF(in[23], input);
-    LOAD_DQCOEFF(in[31], input);
-
-    // checking if all entries are zero
-    zero_idx[0] = _mm_or_si128(in[0], in[1]);
-    zero_idx[1] = _mm_or_si128(in[2], in[3]);
-    zero_idx[2] = _mm_or_si128(in[4], in[5]);
-    zero_idx[3] = _mm_or_si128(in[6], in[7]);
-    zero_idx[4] = _mm_or_si128(in[8], in[9]);
-    zero_idx[5] = _mm_or_si128(in[10], in[11]);
-    zero_idx[6] = _mm_or_si128(in[12], in[13]);
-    zero_idx[7] = _mm_or_si128(in[14], in[15]);
-    zero_idx[8] = _mm_or_si128(in[16], in[17]);
-    zero_idx[9] = _mm_or_si128(in[18], in[19]);
-    zero_idx[10] = _mm_or_si128(in[20], in[21]);
-    zero_idx[11] = _mm_or_si128(in[22], in[23]);
-    zero_idx[12] = _mm_or_si128(in[24], in[25]);
-    zero_idx[13] = _mm_or_si128(in[26], in[27]);
-    zero_idx[14] = _mm_or_si128(in[28], in[29]);
-    zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
-    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
-    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
-    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
-      col[i32 + 0] = _mm_setzero_si128();
-      col[i32 + 1] = _mm_setzero_si128();
-      col[i32 + 2] = _mm_setzero_si128();
-      col[i32 + 3] = _mm_setzero_si128();
-      col[i32 + 4] = _mm_setzero_si128();
-      col[i32 + 5] = _mm_setzero_si128();
-      col[i32 + 6] = _mm_setzero_si128();
-      col[i32 + 7] = _mm_setzero_si128();
-      col[i32 + 8] = _mm_setzero_si128();
-      col[i32 + 9] = _mm_setzero_si128();
-      col[i32 + 10] = _mm_setzero_si128();
-      col[i32 + 11] = _mm_setzero_si128();
-      col[i32 + 12] = _mm_setzero_si128();
-      col[i32 + 13] = _mm_setzero_si128();
-      col[i32 + 14] = _mm_setzero_si128();
-      col[i32 + 15] = _mm_setzero_si128();
-      col[i32 + 16] = _mm_setzero_si128();
-      col[i32 + 17] = _mm_setzero_si128();
-      col[i32 + 18] = _mm_setzero_si128();
-      col[i32 + 19] = _mm_setzero_si128();
-      col[i32 + 20] = _mm_setzero_si128();
-      col[i32 + 21] = _mm_setzero_si128();
-      col[i32 + 22] = _mm_setzero_si128();
-      col[i32 + 23] = _mm_setzero_si128();
-      col[i32 + 24] = _mm_setzero_si128();
-      col[i32 + 25] = _mm_setzero_si128();
-      col[i32 + 26] = _mm_setzero_si128();
-      col[i32 + 27] = _mm_setzero_si128();
-      col[i32 + 28] = _mm_setzero_si128();
-      col[i32 + 29] = _mm_setzero_si128();
-      col[i32 + 30] = _mm_setzero_si128();
-      col[i32 + 31] = _mm_setzero_si128();
-      continue;
-    }
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-    array_transpose_8x8(in + 16, in + 16);
-    array_transpose_8x8(in + 24, in + 24);
-
-    IDCT32(in, in + 16)
-
-    // 1_D: Store 32 intermediate results for each 8x32 block.
-    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
-  }
-  for (i = 0; i < 4; i++) {
-    // Second 1-D idct
-    j = i << 3;
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + j, in);
-    array_transpose_8x8(col + j + 32, in + 8);
-    array_transpose_8x8(col + j + 64, in + 16);
-    array_transpose_8x8(col + j + 96, in + 24);
-
-    IDCT32(in, in + 16)
-
-    // 2_D: Calculate the results and store them to destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, j;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (j = 0; j < 32; ++j) {
-    RECON_AND_STORE(dest + 0 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 8 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 16 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 24 + j * stride, dc_value);
-  }
-}
-
-// Apply a 32-element IDCT to 8 columns. This does not do any transposition
-// of its input - the caller is expected to have done that.
-// The input buffers are the top and bottom halves of an 8x32 block.
-void idct32_8col(__m128i *in0, __m128i *in1) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  IDCT32(in0, in1)
-
-  // 2_D: Calculate the results and store them to destination.
-  in0[0] = _mm_add_epi16(stp1_0, stp1_31);
-  in0[1] = _mm_add_epi16(stp1_1, stp1_30);
-  in0[2] = _mm_add_epi16(stp1_2, stp1_29);
-  in0[3] = _mm_add_epi16(stp1_3, stp1_28);
-  in0[4] = _mm_add_epi16(stp1_4, stp1_27);
-  in0[5] = _mm_add_epi16(stp1_5, stp1_26);
-  in0[6] = _mm_add_epi16(stp1_6, stp1_25);
-  in0[7] = _mm_add_epi16(stp1_7, stp1_24);
-  in0[8] = _mm_add_epi16(stp1_8, stp1_23);
-  in0[9] = _mm_add_epi16(stp1_9, stp1_22);
-  in0[10] = _mm_add_epi16(stp1_10, stp1_21);
-  in0[11] = _mm_add_epi16(stp1_11, stp1_20);
-  in0[12] = _mm_add_epi16(stp1_12, stp1_19);
-  in0[13] = _mm_add_epi16(stp1_13, stp1_18);
-  in0[14] = _mm_add_epi16(stp1_14, stp1_17);
-  in0[15] = _mm_add_epi16(stp1_15, stp1_16);
-  in1[0] = _mm_sub_epi16(stp1_15, stp1_16);
-  in1[1] = _mm_sub_epi16(stp1_14, stp1_17);
-  in1[2] = _mm_sub_epi16(stp1_13, stp1_18);
-  in1[3] = _mm_sub_epi16(stp1_12, stp1_19);
-  in1[4] = _mm_sub_epi16(stp1_11, stp1_20);
-  in1[5] = _mm_sub_epi16(stp1_10, stp1_21);
-  in1[6] = _mm_sub_epi16(stp1_9, stp1_22);
-  in1[7] = _mm_sub_epi16(stp1_8, stp1_23);
-  in1[8] = _mm_sub_epi16(stp1_7, stp1_24);
-  in1[9] = _mm_sub_epi16(stp1_6, stp1_25);
-  in1[10] = _mm_sub_epi16(stp1_5, stp1_26);
-  in1[11] = _mm_sub_epi16(stp1_4, stp1_27);
-  in1[12] = _mm_sub_epi16(stp1_3, stp1_28);
-  in1[13] = _mm_sub_epi16(stp1_2, stp1_29);
-  in1[14] = _mm_sub_epi16(stp1_1, stp1_30);
-  in1[15] = _mm_sub_epi16(stp1_0, stp1_31);
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h
deleted file mode 100644
index 342816977..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_
-#define AOM_DSP_X86_INV_TXFM_SSE2_H_
-
-#include <emmintrin.h>  // SSE2
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// perform 8x8 transpose
-static INLINE void array_transpose_4x4(__m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
-  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
-  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
-}
-
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-}
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7)                 \
-  {                                                                       \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);                   \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);                   \
-    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1);                   \
-    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3);                   \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5);                   \
-    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7);                   \
-    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5);                   \
-    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7);                   \
-                                                                          \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);               \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);               \
-                                                                          \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                              \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                              \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                              \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                              \
-    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5);                              \
-    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5);                              \
-    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7);                              \
-    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7);                              \
-  }
-
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1)   \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-                                                        \
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
-    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
-  }
-
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  array_transpose_8x8(res0, res0);
-  array_transpose_8x8(res1, tbuf);
-  array_transpose_8x8(res0 + 8, res1);
-  array_transpose_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
-// Function to allow 8 bit optimisations to be used when profile 0 is used with
-// highbitdepth enabled
-static INLINE __m128i load_input_data(const tran_low_t *data) {
-  if (sizeof(tran_low_t) == 4) {
-    return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
-                          data[6], data[7]);
-  } else {
-    return _mm_load_si128((const __m128i *)data);
-  }
-}
-
-static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
-  in[0] = load_input_data(input + 0 * 16);
-  in[1] = load_input_data(input + 1 * 16);
-  in[2] = load_input_data(input + 2 * 16);
-  in[3] = load_input_data(input + 3 * 16);
-  in[4] = load_input_data(input + 4 * 16);
-  in[5] = load_input_data(input + 5 * 16);
-  in[6] = load_input_data(input + 6 * 16);
-  in[7] = load_input_data(input + 7 * 16);
-
-  in[8] = load_input_data(input + 8 * 16);
-  in[9] = load_input_data(input + 9 * 16);
-  in[10] = load_input_data(input + 10 * 16);
-  in[11] = load_input_data(input + 11 * 16);
-  in[12] = load_input_data(input + 12 * 16);
-  in[13] = load_input_data(input + 13 * 16);
-  in[14] = load_input_data(input + 14 * 16);
-  in[15] = load_input_data(input + 15 * 16);
-}
-
-#define RECON_AND_STORE(dest, in_x)                  \
-  {                                                  \
-    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
-    d0 = _mm_unpacklo_epi8(d0, zero);                \
-    d0 = _mm_add_epi16(in_x, d0);                    \
-    d0 = _mm_packus_epi16(d0, d0);                   \
-    _mm_storel_epi64((__m128i *)(dest), d0);         \
-  }
-
-static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-  // Final rounding and shift
-  in[0] = _mm_adds_epi16(in[0], final_rounding);
-  in[1] = _mm_adds_epi16(in[1], final_rounding);
-  in[2] = _mm_adds_epi16(in[2], final_rounding);
-  in[3] = _mm_adds_epi16(in[3], final_rounding);
-  in[4] = _mm_adds_epi16(in[4], final_rounding);
-  in[5] = _mm_adds_epi16(in[5], final_rounding);
-  in[6] = _mm_adds_epi16(in[6], final_rounding);
-  in[7] = _mm_adds_epi16(in[7], final_rounding);
-  in[8] = _mm_adds_epi16(in[8], final_rounding);
-  in[9] = _mm_adds_epi16(in[9], final_rounding);
-  in[10] = _mm_adds_epi16(in[10], final_rounding);
-  in[11] = _mm_adds_epi16(in[11], final_rounding);
-  in[12] = _mm_adds_epi16(in[12], final_rounding);
-  in[13] = _mm_adds_epi16(in[13], final_rounding);
-  in[14] = _mm_adds_epi16(in[14], final_rounding);
-  in[15] = _mm_adds_epi16(in[15], final_rounding);
-
-  in[0] = _mm_srai_epi16(in[0], 6);
-  in[1] = _mm_srai_epi16(in[1], 6);
-  in[2] = _mm_srai_epi16(in[2], 6);
-  in[3] = _mm_srai_epi16(in[3], 6);
-  in[4] = _mm_srai_epi16(in[4], 6);
-  in[5] = _mm_srai_epi16(in[5], 6);
-  in[6] = _mm_srai_epi16(in[6], 6);
-  in[7] = _mm_srai_epi16(in[7], 6);
-  in[8] = _mm_srai_epi16(in[8], 6);
-  in[9] = _mm_srai_epi16(in[9], 6);
-  in[10] = _mm_srai_epi16(in[10], 6);
-  in[11] = _mm_srai_epi16(in[11], 6);
-  in[12] = _mm_srai_epi16(in[12], 6);
-  in[13] = _mm_srai_epi16(in[13], 6);
-  in[14] = _mm_srai_epi16(in[14], 6);
-  in[15] = _mm_srai_epi16(in[15], 6);
-
-  RECON_AND_STORE(dest + 0 * stride, in[0]);
-  RECON_AND_STORE(dest + 1 * stride, in[1]);
-  RECON_AND_STORE(dest + 2 * stride, in[2]);
-  RECON_AND_STORE(dest + 3 * stride, in[3]);
-  RECON_AND_STORE(dest + 4 * stride, in[4]);
-  RECON_AND_STORE(dest + 5 * stride, in[5]);
-  RECON_AND_STORE(dest + 6 * stride, in[6]);
-  RECON_AND_STORE(dest + 7 * stride, in[7]);
-  RECON_AND_STORE(dest + 8 * stride, in[8]);
-  RECON_AND_STORE(dest + 9 * stride, in[9]);
-  RECON_AND_STORE(dest + 10 * stride, in[10]);
-  RECON_AND_STORE(dest + 11 * stride, in[11]);
-  RECON_AND_STORE(dest + 12 * stride, in[12]);
-  RECON_AND_STORE(dest + 13 * stride, in[13]);
-  RECON_AND_STORE(dest + 14 * stride, in[14]);
-  RECON_AND_STORE(dest + 15 * stride, in[15]);
-}
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
-  {                                                                      \
-    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1);                \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0);                \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3);                \
-    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2);                \
-                                                                         \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);              \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);              \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);              \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);              \
-                                                                         \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                             \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                             \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                             \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                             \
-  }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
-  {                                                      \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);  \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);  \
-    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);             \
-    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);             \
-  }
-
-void iadst16_8col(__m128i *in);
-void idct16_8col(__m128i *in);
-void aom_idct4_sse2(__m128i *in);
-void aom_idct8_sse2(__m128i *in);
-void aom_idct16_sse2(__m128i *in0, __m128i *in1);
-void aom_iadst4_sse2(__m128i *in);
-void aom_iadst8_sse2(__m128i *in);
-void aom_iadst16_sse2(__m128i *in0, __m128i *in1);
-void idct32_8col(__m128i *in0, __m128i *in1);
-
-#endif  // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c b/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c
deleted file mode 100644
index 9d006797b..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c
+++ /dev/null
@@ -1,1333 +0,0 @@
-/*
- *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <tmmintrin.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/x86/inv_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-void aom_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-  in4 = load_input_data(input + 8 * 4);
-  in5 = load_input_data(input + 8 * 5);
-  in6 = load_input_data(input + 8 * 6);
-  in7 = load_input_data(input + 8 * 7);
-
-  // 2-D
-  for (i = 0; i < 2; i++) {
-    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
-    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                  in4, in5, in6, in7);
-
-    // 4-stage 1D idct8x8
-    {
-      /* Stage1 */
-      {
-        const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);
-        const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);
-        const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);
-        const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);
-
-        {
-          tmp0 = _mm_madd_epi16(lo_17, stg1_0);
-          tmp1 = _mm_madd_epi16(hi_17, stg1_0);
-          tmp2 = _mm_madd_epi16(lo_17, stg1_1);
-          tmp3 = _mm_madd_epi16(hi_17, stg1_1);
-          tmp4 = _mm_madd_epi16(lo_35, stg1_2);
-          tmp5 = _mm_madd_epi16(hi_35, stg1_2);
-          tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-          tmp7 = _mm_madd_epi16(hi_35, stg1_3);
-
-          tmp0 = _mm_add_epi32(tmp0, rounding);
-          tmp1 = _mm_add_epi32(tmp1, rounding);
-          tmp2 = _mm_add_epi32(tmp2, rounding);
-          tmp3 = _mm_add_epi32(tmp3, rounding);
-          tmp4 = _mm_add_epi32(tmp4, rounding);
-          tmp5 = _mm_add_epi32(tmp5, rounding);
-          tmp6 = _mm_add_epi32(tmp6, rounding);
-          tmp7 = _mm_add_epi32(tmp7, rounding);
-
-          tmp0 = _mm_srai_epi32(tmp0, 14);
-          tmp1 = _mm_srai_epi32(tmp1, 14);
-          tmp2 = _mm_srai_epi32(tmp2, 14);
-          tmp3 = _mm_srai_epi32(tmp3, 14);
-          tmp4 = _mm_srai_epi32(tmp4, 14);
-          tmp5 = _mm_srai_epi32(tmp5, 14);
-          tmp6 = _mm_srai_epi32(tmp6, 14);
-          tmp7 = _mm_srai_epi32(tmp7, 14);
-
-          stp1_4 = _mm_packs_epi32(tmp0, tmp1);
-          stp1_7 = _mm_packs_epi32(tmp2, tmp3);
-          stp1_5 = _mm_packs_epi32(tmp4, tmp5);
-          stp1_6 = _mm_packs_epi32(tmp6, tmp7);
-        }
-      }
-
-      /* Stage2 */
-      {
-        const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);
-        const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);
-
-        {
-          tmp0 = _mm_unpacklo_epi16(in0, in4);
-          tmp1 = _mm_unpackhi_epi16(in0, in4);
-
-          tmp2 = _mm_madd_epi16(tmp0, stk2_0);
-          tmp3 = _mm_madd_epi16(tmp1, stk2_0);
-          tmp4 = _mm_madd_epi16(tmp0, stk2_1);
-          tmp5 = _mm_madd_epi16(tmp1, stk2_1);
-
-          tmp2 = _mm_add_epi32(tmp2, rounding);
-          tmp3 = _mm_add_epi32(tmp3, rounding);
-          tmp4 = _mm_add_epi32(tmp4, rounding);
-          tmp5 = _mm_add_epi32(tmp5, rounding);
-
-          tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-          tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-          tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-          tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-
-          stp2_0 = _mm_packs_epi32(tmp2, tmp3);
-          stp2_1 = _mm_packs_epi32(tmp4, tmp5);
-
-          tmp0 = _mm_madd_epi16(lo_26, stg2_2);
-          tmp1 = _mm_madd_epi16(hi_26, stg2_2);
-          tmp2 = _mm_madd_epi16(lo_26, stg2_3);
-          tmp3 = _mm_madd_epi16(hi_26, stg2_3);
-
-          tmp0 = _mm_add_epi32(tmp0, rounding);
-          tmp1 = _mm_add_epi32(tmp1, rounding);
-          tmp2 = _mm_add_epi32(tmp2, rounding);
-          tmp3 = _mm_add_epi32(tmp3, rounding);
-
-          tmp0 = _mm_srai_epi32(tmp0, 14);
-          tmp1 = _mm_srai_epi32(tmp1, 14);
-          tmp2 = _mm_srai_epi32(tmp2, 14);
-          tmp3 = _mm_srai_epi32(tmp3, 14);
-
-          stp2_2 = _mm_packs_epi32(tmp0, tmp1);
-          stp2_3 = _mm_packs_epi32(tmp2, tmp3);
-        }
-
-        stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-        stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-        stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-        stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-      }
-
-      /* Stage3 */
-      {
-        stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-        stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-        stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-        stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
-        tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
-        tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
-        tmp2 = _mm_madd_epi16(tmp0, stk2_1);
-        tmp3 = _mm_madd_epi16(tmp1, stk2_1);
-        tmp4 = _mm_madd_epi16(tmp0, stk2_0);
-        tmp5 = _mm_madd_epi16(tmp1, stk2_0);
-
-        tmp2 = _mm_add_epi32(tmp2, rounding);
-        tmp3 = _mm_add_epi32(tmp3, rounding);
-        tmp4 = _mm_add_epi32(tmp4, rounding);
-        tmp5 = _mm_add_epi32(tmp5, rounding);
-
-        tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-        tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-        tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-        tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-
-        stp1_5 = _mm_packs_epi32(tmp2, tmp3);
-        stp1_6 = _mm_packs_epi32(tmp4, tmp5);
-      }
-
-      /* Stage4  */
-      in0 = _mm_add_epi16(stp1_0, stp2_7);
-      in1 = _mm_add_epi16(stp1_1, stp1_6);
-      in2 = _mm_add_epi16(stp1_2, stp1_5);
-      in3 = _mm_add_epi16(stp1_3, stp2_4);
-      in4 = _mm_sub_epi16(stp1_3, stp2_4);
-      in5 = _mm_sub_epi16(stp1_2, stp1_5);
-      in6 = _mm_sub_epi16(stp1_1, stp1_6);
-      in7 = _mm_sub_epi16(stp1_0, stp2_7);
-    }
-  }
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-void aom_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-  const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-  const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-  const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-  const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3;
-
-  // Rows. Load 4-row input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-
-  // 8x4 Transpose
-  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
-
-  // Stage1
-  tmp0 = _mm_mulhrs_epi16(in0, stg1_0);
-  tmp1 = _mm_mulhrs_epi16(in0, stg1_1);
-  tmp2 = _mm_mulhrs_epi16(in1, stg1_2);
-  tmp3 = _mm_mulhrs_epi16(in1, stg1_3);
-
-  stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1);
-  stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3);
-
-  // Stage2
-  tmp0 = _mm_mulhrs_epi16(in0, stg2_0);
-  stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0);
-
-  tmp1 = _mm_mulhrs_epi16(in1, stg2_2);
-  tmp2 = _mm_mulhrs_epi16(in1, stg2_3);
-  stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1);
-
-  tmp0 = _mm_add_epi16(stp1_4, stp1_5);
-  tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
-
-  stp2_4 = tmp0;
-  stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
-  stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
-
-  tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-  tmp1 = _mm_madd_epi16(tmp0, stg3_0);
-  tmp2 = _mm_madd_epi16(tmp0, stk2_0);  // stg3_1 = stk2_0
-
-  tmp1 = _mm_add_epi32(tmp1, rounding);
-  tmp2 = _mm_add_epi32(tmp2, rounding);
-  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-  stp1_5 = _mm_packs_epi32(tmp1, tmp2);
-
-  // Stage3
-  tmp2 = _mm_add_epi16(stp2_0, stp2_2);
-  tmp3 = _mm_sub_epi16(stp2_0, stp2_2);
-
-  stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2);
-  stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2);
-
-  // Stage4
-  tmp0 = _mm_add_epi16(stp1_3, stp2_4);
-  tmp1 = _mm_add_epi16(stp1_2, stp1_5);
-  tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
-  tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
-
-  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
-  /* Stage1 */
-  stp1_4 = _mm_mulhrs_epi16(in1, stg1_0);
-  stp1_7 = _mm_mulhrs_epi16(in1, stg1_1);
-  stp1_5 = _mm_mulhrs_epi16(in3, stg1_2);
-  stp1_6 = _mm_mulhrs_epi16(in3, stg1_3);
-
-  /* Stage2 */
-  stp2_0 = _mm_mulhrs_epi16(in0, stg2_0);
-  stp2_1 = _mm_mulhrs_epi16(in0, stg2_0);
-
-  stp2_2 = _mm_mulhrs_epi16(in2, stg2_2);
-  stp2_3 = _mm_mulhrs_epi16(in2, stg2_3);
-
-  stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-  stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
-  /* Stage3 */
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
-  tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
-  tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
-  tmp2 = _mm_madd_epi16(tmp0, stk2_0);
-  tmp3 = _mm_madd_epi16(tmp1, stk2_0);
-  tmp2 = _mm_add_epi32(tmp2, rounding);
-  tmp3 = _mm_add_epi32(tmp3, rounding);
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-  stp1_6 = _mm_packs_epi32(tmp2, tmp3);
-
-  tmp2 = _mm_madd_epi16(tmp0, stk2_1);
-  tmp3 = _mm_madd_epi16(tmp1, stk2_1);
-  tmp2 = _mm_add_epi32(tmp2, rounding);
-  tmp3 = _mm_add_epi32(tmp3, rounding);
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-  stp1_5 = _mm_packs_epi32(tmp2, tmp3);
-
-  /* Stage4  */
-  in0 = _mm_add_epi16(stp1_0, stp2_7);
-  in1 = _mm_add_epi16(stp1_1, stp1_6);
-  in2 = _mm_add_epi16(stp1_2, stp1_5);
-  in3 = _mm_add_epi16(stp1_3, stp2_4);
-  in4 = _mm_sub_epi16(stp1_3, stp2_4);
-  in5 = _mm_sub_epi16(stp1_2, stp1_5);
-  in6 = _mm_sub_epi16(stp1_1, stp1_6);
-  in7 = _mm_sub_epi16(stp1_0, stp2_7);
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-// Only do addition and subtraction butterfly, size = 16, 32
-static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
-                                     int size) {
-  int i = 0;
-  const int num = size >> 1;
-  const int bound = size - 1;
-  while (i < num) {
-    out[i] = _mm_add_epi16(in[i], in[bound - i]);
-    out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
-    i++;
-  }
-}
-
-#define BUTTERFLY_PAIR(x0, x1, co0, co1)         \
-  do {                                           \
-    tmp0 = _mm_madd_epi16(x0, co0);              \
-    tmp1 = _mm_madd_epi16(x1, co0);              \
-    tmp2 = _mm_madd_epi16(x0, co1);              \
-    tmp3 = _mm_madd_epi16(x1, co1);              \
-    tmp0 = _mm_add_epi32(tmp0, rounding);        \
-    tmp1 = _mm_add_epi32(tmp1, rounding);        \
-    tmp2 = _mm_add_epi32(tmp2, rounding);        \
-    tmp3 = _mm_add_epi32(tmp3, rounding);        \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-  } while (0)
-
-static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
-                             const __m128i *c0, const __m128i *c1, __m128i *y0,
-                             __m128i *y1) {
-  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm_unpacklo_epi16(*x0, *x1);
-  u1 = _mm_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *y0 = _mm_packs_epi32(tmp0, tmp1);
-  *y1 = _mm_packs_epi32(tmp2, tmp3);
-}
-
-static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
-                                  const __m128i *c1) {
-  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm_unpacklo_epi16(*x0, *x1);
-  u1 = _mm_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *x0 = _mm_packs_epi32(tmp0, tmp1);
-  *x1 = _mm_packs_epi32(tmp2, tmp3);
-}
-
-static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
-  const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-
-  const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i x0, x1, x4, x5, x6, x7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-
-  // phase 1
-
-  // 0, 15
-  u2 = _mm_mulhrs_epi16(in[2], stk2_1);  // stp2_15
-  u3 = _mm_mulhrs_epi16(in[6], stk2_7);  // stp2_12
-  v15 = _mm_add_epi16(u2, u3);
-  // in[0], in[4]
-  x0 = _mm_mulhrs_epi16(in[0], stk4_0);  // stp1[0]
-  x7 = _mm_mulhrs_epi16(in[4], stk3_1);  // stp1[7]
-  v0 = _mm_add_epi16(x0, x7);            // stp2_0
-  stp1[0] = _mm_add_epi16(v0, v15);
-  stp1[15] = _mm_sub_epi16(v0, v15);
-
-  // in[2], in[6]
-  u0 = _mm_mulhrs_epi16(in[2], stk2_0);             // stp2_8
-  u1 = _mm_mulhrs_epi16(in[6], stk2_6);             // stp2_11
-  butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5);  // stp2_9, stp2_14
-  butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7);  // stp2_10, stp2_13
-
-  v8 = _mm_add_epi16(u0, u1);
-  v9 = _mm_add_epi16(u4, u6);
-  v10 = _mm_sub_epi16(u4, u6);
-  v11 = _mm_sub_epi16(u0, u1);
-  v12 = _mm_sub_epi16(u2, u3);
-  v13 = _mm_sub_epi16(u5, u7);
-  v14 = _mm_add_epi16(u5, u7);
-
-  butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
-  butterfly_self(&v11, &v12, &stg6_0, &stg4_0);
-
-  // 1, 14
-  x1 = _mm_mulhrs_epi16(in[0], stk4_0);  // stp1[1], stk4_1 = stk4_0
-  // stp1[2] = stp1[0], stp1[3] = stp1[1]
-  x4 = _mm_mulhrs_epi16(in[4], stk3_0);  // stp1[4]
-  butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
-  v1 = _mm_add_epi16(x1, x6);  // stp2_1
-  v2 = _mm_add_epi16(x0, x5);  // stp2_2
-  stp1[1] = _mm_add_epi16(v1, v14);
-  stp1[14] = _mm_sub_epi16(v1, v14);
-
-  stp1[2] = _mm_add_epi16(v2, v13);
-  stp1[13] = _mm_sub_epi16(v2, v13);
-
-  v3 = _mm_add_epi16(x1, x4);  // stp2_3
-  v4 = _mm_sub_epi16(x1, x4);  // stp2_4
-
-  v5 = _mm_sub_epi16(x0, x5);  // stp2_5
-
-  v6 = _mm_sub_epi16(x1, x6);  // stp2_6
-  v7 = _mm_sub_epi16(x0, x7);  // stp2_7
-  stp1[3] = _mm_add_epi16(v3, v12);
-  stp1[12] = _mm_sub_epi16(v3, v12);
-
-  stp1[6] = _mm_add_epi16(v6, v9);
-  stp1[9] = _mm_sub_epi16(v6, v9);
-
-  stp1[7] = _mm_add_epi16(v7, v8);
-  stp1[8] = _mm_sub_epi16(v7, v8);
-
-  stp1[4] = _mm_add_epi16(v4, v11);
-  stp1[11] = _mm_sub_epi16(v4, v11);
-
-  stp1[5] = _mm_add_epi16(v5, v10);
-  stp1[10] = _mm_sub_epi16(v5, v10);
-}
-
-static void idct32_34_second_half(const __m128i *in, __m128i *stp1) {
-  const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-  const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-  const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-  const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-  const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-  const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-  const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-  const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  v16 = _mm_mulhrs_epi16(in[1], stk1_0);
-  v31 = _mm_mulhrs_epi16(in[1], stk1_1);
-
-  v19 = _mm_mulhrs_epi16(in[7], stk1_6);
-  v28 = _mm_mulhrs_epi16(in[7], stk1_7);
-
-  v20 = _mm_mulhrs_epi16(in[5], stk1_8);
-  v27 = _mm_mulhrs_epi16(in[5], stk1_9);
-
-  v23 = _mm_mulhrs_epi16(in[3], stk1_14);
-  v24 = _mm_mulhrs_epi16(in[3], stk1_15);
-
-  butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
-  butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
-  butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
-  butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);
-
-  u16 = _mm_add_epi16(v16, v19);
-  u17 = _mm_add_epi16(v17, v18);
-  u18 = _mm_sub_epi16(v17, v18);
-  u19 = _mm_sub_epi16(v16, v19);
-  u20 = _mm_sub_epi16(v23, v20);
-  u21 = _mm_sub_epi16(v22, v21);
-  u22 = _mm_add_epi16(v22, v21);
-  u23 = _mm_add_epi16(v23, v20);
-  u24 = _mm_add_epi16(v24, v27);
-  u27 = _mm_sub_epi16(v24, v27);
-  u25 = _mm_add_epi16(v25, v26);
-  u26 = _mm_sub_epi16(v25, v26);
-  u28 = _mm_sub_epi16(v31, v28);
-  u31 = _mm_add_epi16(v28, v31);
-  u29 = _mm_sub_epi16(v30, v29);
-  u30 = _mm_add_epi16(v29, v30);
-
-  butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-  butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-  butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-  butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-
-  stp1[16] = _mm_add_epi16(u16, u23);
-  stp1[23] = _mm_sub_epi16(u16, u23);
-
-  stp1[17] = _mm_add_epi16(u17, u22);
-  stp1[22] = _mm_sub_epi16(u17, u22);
-
-  stp1[18] = _mm_add_epi16(u18, u21);
-  stp1[21] = _mm_sub_epi16(u18, u21);
-
-  stp1[19] = _mm_add_epi16(u19, u20);
-  stp1[20] = _mm_sub_epi16(u19, u20);
-
-  stp1[24] = _mm_sub_epi16(u31, u24);
-  stp1[31] = _mm_add_epi16(u24, u31);
-
-  stp1[25] = _mm_sub_epi16(u30, u25);
-  stp1[30] = _mm_add_epi16(u25, u30);
-
-  stp1[26] = _mm_sub_epi16(u29, u26);
-  stp1[29] = _mm_add_epi16(u26, u29);
-
-  stp1[27] = _mm_sub_epi16(u28, u27);
-  stp1[28] = _mm_add_epi16(u27, u28);
-
-  butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0);
-}
-
-// Only upper-left 8x8 has non-zero coeff
-void aom_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  __m128i in[32], col[32];
-  __m128i stp1[32];
-  int i;
-
-  // Load input data. Only need to load the top left 8x8 block.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 32);
-  in[2] = load_input_data(input + 64);
-  in[3] = load_input_data(input + 96);
-  in[4] = load_input_data(input + 128);
-  in[5] = load_input_data(input + 160);
-  in[6] = load_input_data(input + 192);
-  in[7] = load_input_data(input + 224);
-
-  array_transpose_8x8(in, in);
-  idct32_34_first_half(in, stp1);
-  idct32_34_second_half(in, stp1);
-
-  // 1_D: Store 32 intermediate results for each 8x32 block.
-  add_sub_butterfly(stp1, col, 32);
-  for (i = 0; i < 4; i++) {
-    int j;
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + i * 8, in);
-    idct32_34_first_half(in, stp1);
-    idct32_34_second_half(in, stp1);
-
-    // 2_D: Calculate the results and store them to destination.
-    add_sub_butterfly(stp1, in, 32);
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-// in0[16] represents the left 8x16 block
-// in1[16] represents the right 8x16 block
-static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
-                              __m128i *in1) {
-  int i;
-  for (i = 0; i < 16; i++) {
-    in0[i] = load_input_data(input);
-    in1[i] = load_input_data(input + 8);
-    input += 32;
-  }
-}
-
-static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
-                                    __m128i *out1) {
-  array_transpose_8x8(in0, out0);
-  array_transpose_8x8(&in0[8], out1);
-  array_transpose_8x8(in1, &out0[8]);
-  array_transpose_8x8(&in1[8], &out1[8]);
-}
-
-// Group the coefficient calculation into smaller functions
-// to prevent stack spillover:
-// quarter_1: 0-7
-// quarter_2: 8-15
-// quarter_3_4: 16-23, 24-31
-static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/,
-                                      __m128i *out /*out[8]*/) {
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-
-  {
-    const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-    const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-    const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-    u0 = _mm_mulhrs_epi16(in[0], stk4_0);
-    u2 = _mm_mulhrs_epi16(in[8], stk4_2);
-    u3 = _mm_mulhrs_epi16(in[8], stk4_3);
-    u1 = u0;
-  }
-
-  v0 = _mm_add_epi16(u0, u3);
-  v1 = _mm_add_epi16(u1, u2);
-  v2 = _mm_sub_epi16(u1, u2);
-  v3 = _mm_sub_epi16(u0, u3);
-
-  {
-    const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-    const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-    const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-    const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-    u4 = _mm_mulhrs_epi16(in[4], stk3_0);
-    u7 = _mm_mulhrs_epi16(in[4], stk3_1);
-    u5 = _mm_mulhrs_epi16(in[12], stk3_2);
-    u6 = _mm_mulhrs_epi16(in[12], stk3_3);
-  }
-
-  v4 = _mm_add_epi16(u4, u5);
-  v5 = _mm_sub_epi16(u4, u5);
-  v6 = _mm_sub_epi16(u7, u6);
-  v7 = _mm_add_epi16(u7, u6);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-  }
-
-  out[0] = _mm_add_epi16(v0, v7);
-  out[1] = _mm_add_epi16(v1, v6);
-  out[2] = _mm_add_epi16(v2, v5);
-  out[3] = _mm_add_epi16(v3, v4);
-  out[4] = _mm_sub_epi16(v3, v4);
-  out[5] = _mm_sub_epi16(v2, v5);
-  out[6] = _mm_sub_epi16(v1, v6);
-  out[7] = _mm_sub_epi16(v0, v7);
-}
-
-static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
-                                      __m128i *out /*out[8]*/) {
-  __m128i u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v8, v9, v10, v11, v12, v13, v14, v15;
-
-  {
-    const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-    const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-    const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
-    const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
-    const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
-    const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
-    const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-    const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-    u8 = _mm_mulhrs_epi16(in[2], stk2_0);
-    u15 = _mm_mulhrs_epi16(in[2], stk2_1);
-    u9 = _mm_mulhrs_epi16(in[14], stk2_2);
-    u14 = _mm_mulhrs_epi16(in[14], stk2_3);
-    u10 = _mm_mulhrs_epi16(in[10], stk2_4);
-    u13 = _mm_mulhrs_epi16(in[10], stk2_5);
-    u11 = _mm_mulhrs_epi16(in[6], stk2_6);
-    u12 = _mm_mulhrs_epi16(in[6], stk2_7);
-  }
-
-  v8 = _mm_add_epi16(u8, u9);
-  v9 = _mm_sub_epi16(u8, u9);
-  v10 = _mm_sub_epi16(u11, u10);
-  v11 = _mm_add_epi16(u11, u10);
-  v12 = _mm_add_epi16(u12, u13);
-  v13 = _mm_sub_epi16(u12, u13);
-  v14 = _mm_sub_epi16(u15, u14);
-  v15 = _mm_add_epi16(u15, u14);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(v8, v11);
-  out[1] = _mm_add_epi16(v9, v10);
-  out[2] = _mm_sub_epi16(v9, v10);
-  out[3] = _mm_sub_epi16(v8, v11);
-  out[4] = _mm_sub_epi16(v15, v12);
-  out[5] = _mm_sub_epi16(v14, v13);
-  out[6] = _mm_add_epi16(v14, v13);
-  out[7] = _mm_add_epi16(v15, v12);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// 8x32 block even indexed 8 inputs of in[16],
-// output first half 16 to out[32]
-static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/,
-                                    __m128i *out /*out[32]*/) {
-  __m128i temp[16];
-  idct32_8x32_135_quarter_1(in, temp);
-  idct32_8x32_135_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-// 8x32 block odd indexed 8 inputs of in[16],
-// output second half 16 to out[32]
-static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
-                                    __m128i *out /*out[32]*/) {
-  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-    const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-    const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
-    const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);
-
-    const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
-    const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
-    const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-    const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-    const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-    const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-    const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
-    const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);
-
-    const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
-    const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
-    const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-    const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-    u16 = _mm_mulhrs_epi16(in[1], stk1_0);
-    u31 = _mm_mulhrs_epi16(in[1], stk1_1);
-    u17 = _mm_mulhrs_epi16(in[15], stk1_2);
-    u30 = _mm_mulhrs_epi16(in[15], stk1_3);
-
-    u18 = _mm_mulhrs_epi16(in[9], stk1_4);
-    u29 = _mm_mulhrs_epi16(in[9], stk1_5);
-    u19 = _mm_mulhrs_epi16(in[7], stk1_6);
-    u28 = _mm_mulhrs_epi16(in[7], stk1_7);
-
-    u20 = _mm_mulhrs_epi16(in[5], stk1_8);
-    u27 = _mm_mulhrs_epi16(in[5], stk1_9);
-    u21 = _mm_mulhrs_epi16(in[11], stk1_10);
-    u26 = _mm_mulhrs_epi16(in[11], stk1_11);
-
-    u22 = _mm_mulhrs_epi16(in[13], stk1_12);
-    u25 = _mm_mulhrs_epi16(in[13], stk1_13);
-    u23 = _mm_mulhrs_epi16(in[3], stk1_14);
-    u24 = _mm_mulhrs_epi16(in[3], stk1_15);
-  }
-
-  v16 = _mm_add_epi16(u16, u17);
-  v17 = _mm_sub_epi16(u16, u17);
-  v18 = _mm_sub_epi16(u19, u18);
-  v19 = _mm_add_epi16(u19, u18);
-
-  v20 = _mm_add_epi16(u20, u21);
-  v21 = _mm_sub_epi16(u20, u21);
-  v22 = _mm_sub_epi16(u23, u22);
-  v23 = _mm_add_epi16(u23, u22);
-
-  v24 = _mm_add_epi16(u24, u25);
-  v25 = _mm_sub_epi16(u24, u25);
-  v26 = _mm_sub_epi16(u27, u26);
-  v27 = _mm_add_epi16(u27, u26);
-
-  v28 = _mm_add_epi16(u28, u29);
-  v29 = _mm_sub_epi16(u28, u29);
-  v30 = _mm_sub_epi16(u31, u30);
-  v31 = _mm_add_epi16(u31, u30);
-
-  {
-    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm_add_epi16(v16, v19);
-  u17 = _mm_add_epi16(v17, v18);
-  u18 = _mm_sub_epi16(v17, v18);
-  u19 = _mm_sub_epi16(v16, v19);
-  u20 = _mm_sub_epi16(v23, v20);
-  u21 = _mm_sub_epi16(v22, v21);
-  u22 = _mm_add_epi16(v22, v21);
-  u23 = _mm_add_epi16(v23, v20);
-
-  u24 = _mm_add_epi16(v24, v27);
-  u25 = _mm_add_epi16(v25, v26);
-  u26 = _mm_sub_epi16(v25, v26);
-  u27 = _mm_sub_epi16(v24, v27);
-  u28 = _mm_sub_epi16(v31, v28);
-  u29 = _mm_sub_epi16(v30, v29);
-  u30 = _mm_add_epi16(v29, v30);
-  u31 = _mm_add_epi16(v28, v31);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(u16, u23);
-  out[1] = _mm_add_epi16(u17, u22);
-  out[2] = _mm_add_epi16(u18, u21);
-  out[3] = _mm_add_epi16(u19, u20);
-  v20 = _mm_sub_epi16(u19, u20);
-  v21 = _mm_sub_epi16(u18, u21);
-  v22 = _mm_sub_epi16(u17, u22);
-  v23 = _mm_sub_epi16(u16, u23);
-
-  v24 = _mm_sub_epi16(u31, u24);
-  v25 = _mm_sub_epi16(u30, u25);
-  v26 = _mm_sub_epi16(u29, u26);
-  v27 = _mm_sub_epi16(u28, u27);
-  out[12] = _mm_add_epi16(u27, u28);
-  out[13] = _mm_add_epi16(u26, u29);
-  out[14] = _mm_add_epi16(u25, u30);
-  out[15] = _mm_add_epi16(u24, u31);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
-    butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
-    butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
-    butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
-  }
-}
-
-// 8x16 block, input __m128i in[16], output __m128i in[32]
-static void idct32_8x32_135(__m128i *in /*in[32]*/) {
-  __m128i out[32];
-  idct32_8x32_quarter_1_2(in, out);
-  idct32_8x32_quarter_3_4(in, &out[16]);
-  add_sub_butterfly(out, in, 32);
-}
-
-static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-  int j = 0;
-  while (j < 32) {
-    in[j] = _mm_adds_epi16(in[j], final_rounding);
-    in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
-
-    in[j] = _mm_srai_epi16(in[j], 6);
-    in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
-
-    RECON_AND_STORE(dst, in[j]);
-    dst += stride;
-    RECON_AND_STORE(dst, in[j + 1]);
-    dst += stride;
-    j += 2;
-  }
-}
-
-static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest,
-                                   int stride) {
-  store_buffer_8x32(in0, dest, stride);
-  store_buffer_8x32(in1, dest + 8, stride);
-}
-
-static INLINE void idct32_135(__m128i *col0, __m128i *col1) {
-  idct32_8x32_135(col0);
-  idct32_8x32_135(col1);
-}
-
-typedef enum { left_16, right_16 } ColsIndicator;
-
-static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
-                                     ColsIndicator cols) {
-  switch (cols) {
-    case left_16: {
-      int i;
-      array_transpose_16x16(in0, in1);
-      for (i = 0; i < 16; ++i) {
-        store[i] = in0[16 + i];
-        store[16 + i] = in1[16 + i];
-      }
-      break;
-    }
-    case right_16: {
-      array_transpose_16x16_2(store, &store[16], in0, in1);
-      break;
-    }
-    default: { assert(0); }
-  }
-}
-
-// Only upper-left 16x16 has non-zero coeff
-void aom_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                                 int stride) {
-  // Each array represents an 8x32 block
-  __m128i col0[32], col1[32];
-  // This array represents a 16x16 block
-  __m128i temp[32];
-
-  // Load input data. Only need to load the top left 16x16 block.
-  load_buffer_16x16(input, col0, col1);
-
-  // columns
-  array_transpose_16x16(col0, col1);
-  idct32_135(col0, col1);
-
-  // rows
-  transpose_and_copy_16x16(col0, col1, temp, left_16);
-  idct32_135(col0, col1);
-  recon_and_store(col0, col1, dest, stride);
-
-  transpose_and_copy_16x16(col0, col1, temp, right_16);
-  idct32_135(col0, col1);
-  recon_and_store(col0, col1, dest + 16, stride);
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
-// output pixels: 8-15 in __m128i in[32]
-static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
-                                       __m128i *out /*out[16]*/) {
-  __m128i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
-  __m128i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_
-
-  {
-    const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-    const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-    const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-    const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
-    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
-  }
-
-  v8 = _mm_add_epi16(u8, u9);
-  v9 = _mm_sub_epi16(u8, u9);
-  v14 = _mm_sub_epi16(u15, u14);
-  v15 = _mm_add_epi16(u15, u14);
-
-  {
-    const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-    const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-    const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-    const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
-    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
-  }
-
-  v10 = _mm_sub_epi16(u11, u10);
-  v11 = _mm_add_epi16(u11, u10);
-  v12 = _mm_add_epi16(u12, u13);
-  v13 = _mm_sub_epi16(u12, u13);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(v8, v11);
-  out[1] = _mm_add_epi16(v9, v10);
-  out[6] = _mm_add_epi16(v14, v13);
-  out[7] = _mm_add_epi16(v15, v12);
-
-  out[2] = _mm_sub_epi16(v9, v10);
-  out[3] = _mm_sub_epi16(v8, v11);
-  out[4] = _mm_sub_epi16(v15, v12);
-  out[5] = _mm_sub_epi16(v14, v13);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
-// output pixels: 0-7 in __m128i in[32]
-static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
-                                       __m128i *out /*out[8]*/) {
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_
-
-  {
-    const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-    const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-    const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-    const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
-    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
-  }
-
-  v4 = _mm_add_epi16(u4, u5);
-  v5 = _mm_sub_epi16(u4, u5);
-  v6 = _mm_sub_epi16(u7, u6);
-  v7 = _mm_add_epi16(u7, u6);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-    const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-    const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-
-    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
-    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
-  }
-
-  v0 = _mm_add_epi16(u0, u3);
-  v1 = _mm_add_epi16(u1, u2);
-  v2 = _mm_sub_epi16(u1, u2);
-  v3 = _mm_sub_epi16(u0, u3);
-
-  out[0] = _mm_add_epi16(v0, v7);
-  out[1] = _mm_add_epi16(v1, v6);
-  out[2] = _mm_add_epi16(v2, v5);
-  out[3] = _mm_add_epi16(v3, v4);
-  out[4] = _mm_sub_epi16(v3, v4);
-  out[5] = _mm_sub_epi16(v2, v5);
-  out[6] = _mm_sub_epi16(v1, v6);
-  out[7] = _mm_sub_epi16(v0, v7);
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with odd index,
-// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-// output pixels: 16-23, 24-31 in __m128i in[32]
-// We avoid hide an offset, 16, inside this function. So we output 0-15 into
-// array out[16]
-static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
-                                         __m128i *out /*out[16]*/) {
-  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-    const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-    const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-    const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-    const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-    const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-    const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-    const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-    const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-    const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-    const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-    const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-    const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-    const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-    const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-    const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
-    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
-    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
-    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
-
-    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
-    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
-
-    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
-    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
-  }
-
-  v16 = _mm_add_epi16(u16, u17);
-  v17 = _mm_sub_epi16(u16, u17);
-  v18 = _mm_sub_epi16(u19, u18);
-  v19 = _mm_add_epi16(u19, u18);
-
-  v20 = _mm_add_epi16(u20, u21);
-  v21 = _mm_sub_epi16(u20, u21);
-  v22 = _mm_sub_epi16(u23, u22);
-  v23 = _mm_add_epi16(u23, u22);
-
-  v24 = _mm_add_epi16(u24, u25);
-  v25 = _mm_sub_epi16(u24, u25);
-  v26 = _mm_sub_epi16(u27, u26);
-  v27 = _mm_add_epi16(u27, u26);
-
-  v28 = _mm_add_epi16(u28, u29);
-  v29 = _mm_sub_epi16(u28, u29);
-  v30 = _mm_sub_epi16(u31, u30);
-  v31 = _mm_add_epi16(u31, u30);
-
-  {
-    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm_add_epi16(v16, v19);
-  u17 = _mm_add_epi16(v17, v18);
-  u18 = _mm_sub_epi16(v17, v18);
-  u19 = _mm_sub_epi16(v16, v19);
-  u20 = _mm_sub_epi16(v23, v20);
-  u21 = _mm_sub_epi16(v22, v21);
-  u22 = _mm_add_epi16(v22, v21);
-  u23 = _mm_add_epi16(v23, v20);
-
-  u24 = _mm_add_epi16(v24, v27);
-  u25 = _mm_add_epi16(v25, v26);
-  u26 = _mm_sub_epi16(v25, v26);
-  u27 = _mm_sub_epi16(v24, v27);
-
-  u28 = _mm_sub_epi16(v31, v28);
-  u29 = _mm_sub_epi16(v30, v29);
-  u30 = _mm_add_epi16(v29, v30);
-  u31 = _mm_add_epi16(v28, v31);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(u16, u23);
-  out[1] = _mm_add_epi16(u17, u22);
-  out[2] = _mm_add_epi16(u18, u21);
-  out[3] = _mm_add_epi16(u19, u20);
-  out[4] = _mm_sub_epi16(u19, u20);
-  out[5] = _mm_sub_epi16(u18, u21);
-  out[6] = _mm_sub_epi16(u17, u22);
-  out[7] = _mm_sub_epi16(u16, u23);
-
-  out[8] = _mm_sub_epi16(u31, u24);
-  out[9] = _mm_sub_epi16(u30, u25);
-  out[10] = _mm_sub_epi16(u29, u26);
-  out[11] = _mm_sub_epi16(u28, u27);
-  out[12] = _mm_add_epi16(u27, u28);
-  out[13] = _mm_add_epi16(u26, u29);
-  out[14] = _mm_add_epi16(u25, u30);
-  out[15] = _mm_add_epi16(u24, u31);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
-    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
-    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
-    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
-  }
-}
-
-static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
-                                         __m128i *out /*out[32]*/) {
-  __m128i temp[16];
-  idct32_full_8x32_quarter_1(in, temp);
-  idct32_full_8x32_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-static void idct32_full_8x32(const __m128i *in /*in[32]*/,
-                             __m128i *out /*out[32]*/) {
-  __m128i temp[32];
-  idct32_full_8x32_quarter_1_2(in, temp);
-  idct32_full_8x32_quarter_3_4(in, &temp[16]);
-  add_sub_butterfly(temp, out, 32);
-}
-
-static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
-  int i;
-  for (i = 0; i < 8; ++i) {
-    in[i] = load_input_data(input);
-    in[i + 8] = load_input_data(input + 8);
-    in[i + 16] = load_input_data(input + 16);
-    in[i + 24] = load_input_data(input + 24);
-    input += 32;
-  }
-}
-
-void aom_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                                  int stride) {
-  __m128i col[128], in[32];
-  int i, j;
-
-  // rows
-  for (i = 0; i < 4; ++i) {
-    load_buffer_8x32(input, in);
-    input += 32 << 3;
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-    array_transpose_8x8(in + 16, in + 16);
-    array_transpose_8x8(in + 24, in + 24);
-
-    idct32_full_8x32(in, col + (i << 5));
-  }
-
-  // columns
-  for (i = 0; i < 4; ++i) {
-    j = i << 3;
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + j, in);
-    array_transpose_8x8(col + j + 32, in + 8);
-    array_transpose_8x8(col + j + 64, in + 16);
-    array_transpose_8x8(col + j + 96, in + 24);
-
-    idct32_full_8x32(in, in);
-    store_buffer_8x32(in, dest, stride);
-    dest += 8;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
index f0668e6f3..0bc841a7a 100644
--- a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
@@ -85,15 +85,10 @@ SECTION .text
 
 INIT_XMM sse2
 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-%if CONFIG_HIGHBITDEPTH
   mova            m0,        [inputq +  0]
   packssdw        m0,        [inputq + 16]
   mova            m1,        [inputq + 32]
   packssdw        m1,        [inputq + 48]
-%else
-  mova            m0,        [inputq +  0]
-  mova            m1,        [inputq + 16]
-%endif
   psraw           m0,        2
   psraw           m1,        2
 
diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
new file mode 100644
index 000000000..c3c88245a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int width, int height) {
+  int i;
+  assert(width == 4);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; i += 4) {
+    __m128i x0 = xx_loadl_32(a + 0 * a_stride);
+    __m128i x1 = xx_loadl_32(a + 1 * a_stride);
+    __m128i x2 = xx_loadl_32(a + 2 * a_stride);
+    __m128i x3 = xx_loadl_32(a + 3 * a_stride);
+    __m128i x_lo = _mm_unpacklo_epi32(x0, x1);
+    __m128i x_hi = _mm_unpacklo_epi32(x2, x3);
+
+    __m128i x = _mm_unpacklo_epi64(x_lo, x_hi);
+
+    x0 = xx_loadl_32(b + 0 * b_stride);
+    x1 = xx_loadl_32(b + 1 * b_stride);
+    x2 = xx_loadl_32(b + 2 * b_stride);
+    x3 = xx_loadl_32(b + 3 * b_stride);
+    x_lo = _mm_unpacklo_epi32(x0, x1);
+    x_hi = _mm_unpacklo_epi32(x2, x3);
+
+    __m128i y = _mm_unpacklo_epi64(x_lo, x_hi);
+
+    __m128i sad4x4 = _mm_sad_epu8(x, y);
+    sad = _mm_add_epi32(sad, sad4x4);
+
+    a += 4 * a_stride;
+    b += 4 * b_stride;
+  }
+
+  // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95].
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int width, int height) {
+  int i;
+  assert(width == 8);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; i += 2) {
+    __m128i x0 = xx_loadl_64(a + 0 * a_stride);
+    __m128i x1 = xx_loadl_64(a + 1 * a_stride);
+
+    __m128i x = _mm_unpacklo_epi64(x0, x1);
+
+    x0 = xx_loadl_64(b + 0 * b_stride);
+    x1 = xx_loadl_64(b + 1 * b_stride);
+
+    __m128i y = _mm_unpacklo_epi64(x0, x1);
+
+    __m128i sad8x2 = _mm_sad_epu8(x, y);
+    sad = _mm_add_epi32(sad, sad8x2);
+
+    a += 2 * a_stride;
+    b += 2 * b_stride;
+  }
+
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int width, int height) {
+  int i;
+  assert(width == 16);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; ++i) {
+    __m128i x = xx_loadu_128(a);
+    __m128i y = xx_loadu_128(b);
+
+    __m128i sad16x1 = _mm_sad_epu8(x, y);
+    sad = _mm_add_epi32(sad, sad16x1);
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int width, int height) {
+  int i, j;
+  assert(width == 32);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < 2; ++j) {
+      __m128i x = xx_loadu_128(a + j * 16);
+      __m128i y = xx_loadu_128(b + j * 16);
+
+      __m128i sad32_half = _mm_sad_epu8(x, y);
+      sad = _mm_add_epi32(sad, sad32_half);
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int width, int height) {
+  int i, j;
+  assert(width == 64);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < 4; ++j) {
+      __m128i x = xx_loadu_128(a + j * 16);
+      __m128i y = xx_loadu_128(b + j * 16);
+
+      __m128i sad64_quarter = _mm_sad_epu8(x, y);
+      sad = _mm_add_epi32(sad, sad64_quarter);
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                               int b_stride, int width, int height) {
+  int i, j;
+  assert(width == 128);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < 8; ++j) {
+      __m128i x = xx_loadu_128(a + j * 16);
+      __m128i y = xx_loadu_128(b + j * 16);
+
+      __m128i sad64_quarter = _mm_sad_epu8(x, y);
+      sad = _mm_add_epi32(sad, sad64_quarter);
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+#define jnt_sadMxN_sse2(m, n)                                                 \
+  unsigned int aom_jnt_sad##m##x##n##_avg_ssse3(                              \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint8_t comp_pred[m * n];                                                 \
+    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
+                          jcp_param);                                         \
+    return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n);          \
+  }
+
+#define jnt_sadMxN_avx2(m, n)                                                 \
+  unsigned int aom_jnt_sad##m##x##n##_avg_avx2(                               \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint8_t comp_pred[m * n];                                                 \
+    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
+                          jcp_param);                                         \
+    return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n);          \
+  }
+
+/* clang-format off */
+jnt_sadMxN_sse2(128, 128)
+jnt_sadMxN_sse2(128, 64)
+jnt_sadMxN_sse2(64, 128)
+jnt_sadMxN_sse2(64, 64)
+jnt_sadMxN_sse2(64, 32)
+jnt_sadMxN_sse2(32, 64)
+jnt_sadMxN_sse2(32, 32)
+jnt_sadMxN_sse2(32, 16)
+jnt_sadMxN_sse2(16, 32)
+jnt_sadMxN_sse2(16, 16)
+jnt_sadMxN_sse2(16, 8)
+jnt_sadMxN_sse2(8, 16)
+jnt_sadMxN_sse2(8, 8)
+jnt_sadMxN_sse2(8, 4)
+jnt_sadMxN_sse2(4, 8)
+jnt_sadMxN_sse2(4, 4)
+jnt_sadMxN_sse2(4, 16)
+jnt_sadMxN_sse2(16, 4)
+jnt_sadMxN_sse2(8, 32)
+jnt_sadMxN_sse2(32, 8)
+jnt_sadMxN_sse2(16, 64)
+jnt_sadMxN_sse2(64, 16)
+    /* clang-format on */
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
new file mode 100644
index 000000000..9801e285c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
+  // in computation using _mm_maddubs_epi16.
+  // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
+  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+  const __m128i r = _mm_set1_epi16(round);
+  const uint8_t f0 = filter[0] >> 1;
+  const uint8_t f1 = filter[1] >> 1;
+  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+                                        f0, f1, f0, f1, f0, f1);
+  const __m128i shuffle_mask =
+      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  unsigned int i, j;
+  (void)pixel_step;
+
+  if (output_width >= 8) {
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; j += 8) {
+        // load source
+        __m128i source_low = xx_loadl_64(a);
+        __m128i source_hi = _mm_setzero_si128();
+
+        // avoid load undefined memory
+        if (a + 8 != NULL) source_hi = xx_loadl_64(a + 8);
+        __m128i source = _mm_unpacklo_epi64(source_low, source_hi);
+
+        // shuffle to:
+        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+        __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+        // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
+        __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+
+        // round
+        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+        xx_storeu_128(b, res);
+
+        a += 8;
+        b += 8;
+      }
+
+      a += src_pixels_per_line - output_width;
+    }
+  } else {
+    for (i = 0; i < output_height; ++i) {
+      // load source, only first 5 values are meaningful:
+      // { a[0], a[1], a[2], a[3], a[4], xxxx }
+      __m128i source = xx_loadl_64(a);
+
+      // shuffle, up to the first 8 are useful
+      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+      xx_storel_64(b, res);
+
+      a += src_pixels_per_line;
+      b += output_width;
+    }
+  }
+}
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  const int16_t round = (1 << FILTER_BITS) >> 1;
+  const __m128i r = _mm_set1_epi32(round);
+  const __m128i filters =
+      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+                     filter[1], filter[0], filter[1]);
+  const __m128i shuffle_mask =
+      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+  const __m128i mask =
+      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; j += 4) {
+      // load source as:
+      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+      __m128i source1 = xx_loadl_64(a);
+      __m128i source2 = xx_loadl_64(a + pixel_step);
+      __m128i source = _mm_unpacklo_epi64(source1, source2);
+
+      // shuffle source to:
+      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
+      __m128i res = _mm_madd_epi16(source_shuffle, filters);
+
+      // round
+      res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
+
+      // shuffle to get each lower 8 bit of every 32 bit
+      res = _mm_shuffle_epi8(res, mask);
+
+      xx_storel_32(b, res);
+
+      a += 4;
+      b += 4;
+    }
+
+    a += src_pixels_per_line - output_width;
+  }
+}
+
+static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+                                        const __m128i *w, const __m128i *r,
+                                        void *const result) {
+  __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+  __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+  __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+  __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+  __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+  __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+  __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+  __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
+
+void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                                 int width, int height, const uint8_t *ref,
+                                 int ref_stride,
+                                 const JNT_COMP_PARAMS *jcp_param) {
+  int i;
+  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  if (width >= 16) {
+    // Read 16 pixels one row at a time
+    assert(!(width & 15));
+    for (i = 0; i < height; ++i) {
+      int j;
+      for (j = 0; j < width; j += 16) {
+        __m128i p0 = xx_loadu_128(ref);
+        __m128i p1 = xx_loadu_128(pred);
+
+        compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+        comp_pred += 16;
+        pred += 16;
+        ref += 16;
+      }
+      ref += ref_stride - width;
+    }
+  } else if (width >= 8) {
+    // Read 8 pixels two row at a time
+    assert(!(width & 7));
+    assert(!(width & 1));
+    for (i = 0; i < height; i += 2) {
+      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 2 * ref_stride;
+    }
+  } else {
+    // Read 4 pixels four row at a time
+    assert(!(width & 3));
+    assert(!(height & 3));
+    for (i = 0; i < height; i += 4) {
+      const uint8_t *row0 = ref + 0 * ref_stride;
+      const uint8_t *row1 = ref + 1 * ref_stride;
+      const uint8_t *row2 = ref + 2 * ref_stride;
+      const uint8_t *row3 = ref + 3 * ref_stride;
+
+      __m128i p0 =
+          _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
+                        row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
+                        row3[0], row3[1], row3[2], row3[3]);
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 4 * ref_stride;
+    }
+  }
+}
+
+void aom_jnt_comp_avg_upsampled_pred_ssse3(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+  int n;
+  int i;
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+  assert(!(width * height & 15));
+  n = width * height >> 4;
+
+  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  for (i = 0; i < n; i++) {
+    __m128i p0 = xx_loadu_128(comp_pred);
+    __m128i p1 = xx_loadu_128(pred);
+
+    compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+    comp_pred += 16;
+    pred += 16;
+  }
+}
+
+#define JNT_SUBPIX_AVG_VAR(W, H)                                         \
+  uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3(              \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,          \
+      const uint8_t *b, int b_stride, uint32_t *sse,                     \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {    \
+    uint16_t fdata3[(H + 1) * W];                                        \
+    uint8_t temp2[H * W];                                                \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                          \
+                                                                         \
+    aom_var_filter_block2d_bil_first_pass_ssse3(                         \
+        a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_ssse3(                        \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);        \
+                                                                         \
+    aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W,      \
+                                jcp_param);                              \
+                                                                         \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);            \
+  }
+
+JNT_SUBPIX_AVG_VAR(128, 128)
+JNT_SUBPIX_AVG_VAR(128, 64)
+JNT_SUBPIX_AVG_VAR(64, 128)
+JNT_SUBPIX_AVG_VAR(64, 64)
+JNT_SUBPIX_AVG_VAR(64, 32)
+JNT_SUBPIX_AVG_VAR(32, 64)
+JNT_SUBPIX_AVG_VAR(32, 32)
+JNT_SUBPIX_AVG_VAR(32, 16)
+JNT_SUBPIX_AVG_VAR(16, 32)
+JNT_SUBPIX_AVG_VAR(16, 16)
+JNT_SUBPIX_AVG_VAR(16, 8)
+JNT_SUBPIX_AVG_VAR(8, 16)
+JNT_SUBPIX_AVG_VAR(8, 8)
+JNT_SUBPIX_AVG_VAR(8, 4)
+JNT_SUBPIX_AVG_VAR(4, 8)
+JNT_SUBPIX_AVG_VAR(4, 4)
+JNT_SUBPIX_AVG_VAR(4, 16)
+JNT_SUBPIX_AVG_VAR(16, 4)
+JNT_SUBPIX_AVG_VAR(8, 32)
+JNT_SUBPIX_AVG_VAR(32, 8)
+JNT_SUBPIX_AVG_VAR(16, 64)
+JNT_SUBPIX_AVG_VAR(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
index bf8150e2a..18862dd3e 100644
--- a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
+++ b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
@@ -11,13 +11,14 @@
 
 #include <immintrin.h> /* AVX2 */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 
-void aom_lpf_horizontal_edge_8_avx2(unsigned char *s, int p,
-                                    const unsigned char *_blimit,
-                                    const unsigned char *_limit,
-                                    const unsigned char *_thresh) {
+void aom_lpf_horizontal_16_avx2(unsigned char *s, int p,
+                                const unsigned char *_blimit,
+                                const unsigned char *_limit,
+                                const unsigned char *_thresh) {
   __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
@@ -368,7 +369,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
   8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
 };
 
-void aom_lpf_horizontal_edge_16_avx2(unsigned char *s, int p,
+void aom_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
                                      const unsigned char *_blimit,
                                      const unsigned char *_limit,
                                      const unsigned char *_thresh) {
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
index 8343dbbed..f1eac233b 100644
--- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -11,7 +11,9 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/emmintrin_compat.h"
 
@@ -19,1047 +21,1016 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
 }
 
-#if CONFIG_PARALLEL_DEBLOCKING
-// filter_mask and hev_mask
-#define FILTER_HEV_MASK4                                                      \
-  do {                                                                        \
-    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
-    __m128i flat = abs_diff(q1p1, q0p0);                                      \
-    /* abs(p1 - q1), abs(p0 - q0) */                                          \
-    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
-    __m128i abs_p0q0, abs_p1q1;                                               \
-                                                                              \
-    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
-    hev =                                                                     \
-        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
-    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
-    hev = _mm_packs_epi16(hev, hev);                                          \
-                                                                              \
-    /* const int8_t mask = filter_mask2(*limit, *blimit, */                   \
-    /*                                  p1, p0, q0, q1); */                   \
-    abs_p0q0 =                                                                \
-        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
-    abs_p1q1 =                                                                \
-        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
-    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
-    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
-    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
-    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
-    mask = _mm_unpacklo_epi64(mask, flat);                                    \
-    mask = _mm_subs_epu8(mask, limit);                                        \
-    mask = _mm_cmpeq_epi8(mask, zero);                                        \
-    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
-  } while (0)
-#endif  // CONFIG_PARALLEL_DEBLOCKING
-
-// filter_mask and hev_mask
-#define FILTER_HEV_MASK                                                       \
-  do {                                                                        \
-    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
-    __m128i flat = abs_diff(q1p1, q0p0);                                      \
-    /* abs(p1 - q1), abs(p0 - q0) */                                          \
-    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
-    __m128i abs_p0q0, abs_p1q1, work;                                         \
-                                                                              \
-    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
-    hev =                                                                     \
-        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
-    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
-    hev = _mm_packs_epi16(hev, hev);                                          \
-                                                                              \
-    /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
-    /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */    \
-    abs_p0q0 =                                                                \
-        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
-    abs_p1q1 =                                                                \
-        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
-    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
-    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
-    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
-    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
-    /* abs(p3 - p2), abs(p2 - p1) */                                          \
-    work = abs_diff(p3p2, p2p1);                                              \
-    flat = _mm_max_epu8(work, flat);                                          \
-    /* abs(q3 - q2), abs(q2 - q1) */                                          \
-    work = abs_diff(q3q2, q2q1);                                              \
-    flat = _mm_max_epu8(work, flat);                                          \
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
-    mask = _mm_unpacklo_epi64(mask, flat);                                    \
-    mask = _mm_subs_epu8(mask, limit);                                        \
-    mask = _mm_cmpeq_epi8(mask, zero);                                        \
-    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
-  } while (0)
-
-#define FILTER4                                                             \
-  do {                                                                      \
-    const __m128i t3t4 =                                                    \
-        _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);       \
-    const __m128i t80 = _mm_set1_epi8(0x80);                                \
-    __m128i filter, filter2filter1, work;                                   \
-                                                                            \
-    ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                         \
-    qs1qs0 = _mm_xor_si128(q1q0, t80);                                      \
-                                                                            \
-    /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */               \
-    work = _mm_subs_epi8(ps1ps0, qs1qs0);                                   \
-    filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                   \
-    /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */      \
-    filter = _mm_subs_epi8(filter, work);                                   \
-    filter = _mm_subs_epi8(filter, work);                                   \
-    filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */           \
-    filter = _mm_and_si128(filter, mask); /* & mask */                      \
-    filter = _mm_unpacklo_epi64(filter, filter);                            \
-                                                                            \
-    /* filter1 = signed_char_clamp(filter + 4) >> 3; */                     \
-    /* filter2 = signed_char_clamp(filter + 3) >> 3; */                     \
-    filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */   \
-    filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);             \
-    filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);     \
-    filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */         \
-    filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */         \
-    filter2filter1 = _mm_packs_epi16(filter2filter1, filter);               \
-                                                                            \
-    /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                   \
-    filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */                   \
-    filter = _mm_unpacklo_epi8(filter, filter);                             \
-    filter = _mm_srai_epi16(filter, 9); /* round */                         \
-    filter = _mm_packs_epi16(filter, filter);                               \
-    filter = _mm_andnot_si128(hev, filter);                                 \
-                                                                            \
-    hev = _mm_unpackhi_epi64(filter2filter1, filter);                       \
-    filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);            \
-                                                                            \
-    /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
-    qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                         \
-    /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
-    ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                    \
-    qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */                       \
-    ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \
-  } while (0)
+static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+                                             __m128i *x2, __m128i *x3,
+                                             __m128i *d0, __m128i *d1,
+                                             __m128i *d2, __m128i *d3) {
+  // input
+  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+  // output
+  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  __m128i w0, w1;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  *d0 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+
+  *d1 = _mm_srli_si128(*d0,
+                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(*d0,
+                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(*d0,
+                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                         __m128i *x3, __m128i *d0, __m128i *d1,
+                                         __m128i *d2, __m128i *d3, __m128i *d4,
+                                         __m128i *d5, __m128i *d6,
+                                         __m128i *d7) {
+  // input
+  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+  // output
+  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  __m128i w0, w1, ww0, ww1;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  ww0 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  ww1 = _mm_unpackhi_epi16(
+      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+
+  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d1 = _mm_srli_si128(ww0,
+                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(ww0,
+                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(ww0,
+                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d5 = _mm_srli_si128(ww1,
+                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d6 = _mm_srli_si128(ww1,
+                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d7 = _mm_srli_si128(ww1,
+                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                         __m128i *x3, __m128i *x4, __m128i *x5,
+                                         __m128i *x6, __m128i *x7, __m128i *d0,
+                                         __m128i *d1, __m128i *d2,
+                                         __m128i *d3) {
+  // input
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5  50 51 52 53 54 55 56 57
+  // x6  60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  // output
+  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
+  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+  __m128i w0, w1, w2, w3, w4, w5;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  w2 = _mm_unpacklo_epi8(
+      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+  w3 = _mm_unpacklo_epi8(
+      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+  w4 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  w5 = _mm_unpacklo_epi16(
+      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+  *d0 = _mm_unpacklo_epi32(
+      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  *d1 = _mm_srli_si128(*d0, 8);
+  *d2 = _mm_unpackhi_epi32(
+      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  *d3 = _mm_srli_si128(*d2, 8);
+}
+
+static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                     __m128i *x3, __m128i *x4, __m128i *x5,
+                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
+                                     __m128i *d2d3, __m128i *d4d5,
+                                     __m128i *d6d7) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  // x4 40 41 42 43 44 45 46 47
+  // x5  50 51 52 53 54 55 56 57
+  w2 = _mm_unpacklo_epi8(
+      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+  // x6  60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  w3 = _mm_unpacklo_epi8(
+      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+  w4 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  w5 = _mm_unpacklo_epi16(
+      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+  *d0d1 = _mm_unpacklo_epi32(
+      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  *d2d3 = _mm_unpackhi_epi32(
+      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+  w6 = _mm_unpackhi_epi16(
+      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+  w7 = _mm_unpackhi_epi16(
+      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+
+  *d4d5 = _mm_unpacklo_epi32(
+      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+  *d6d7 = _mm_unpackhi_epi32(
+      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+}
+
+static INLINE void transpose16x8_8x16_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
+    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
+    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
+    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+  __m128i w10, w11, w12, w13, w14, w15;
+
+  w0 = _mm_unpacklo_epi8(*x0, *x1);
+  w1 = _mm_unpacklo_epi8(*x2, *x3);
+  w2 = _mm_unpacklo_epi8(*x4, *x5);
+  w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+  w8 = _mm_unpacklo_epi8(*x8, *x9);
+  w9 = _mm_unpacklo_epi8(*x10, *x11);
+  w10 = _mm_unpacklo_epi8(*x12, *x13);
+  w11 = _mm_unpacklo_epi8(*x14, *x15);
+
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  *d0 = _mm_unpacklo_epi64(w6, w14);
+  *d1 = _mm_unpackhi_epi64(w6, w14);
+  *d2 = _mm_unpacklo_epi64(w7, w15);
+  *d3 = _mm_unpackhi_epi64(w7, w15);
+
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  *d4 = _mm_unpacklo_epi64(w6, w14);
+  *d5 = _mm_unpackhi_epi64(w6, w14);
+  *d6 = _mm_unpacklo_epi64(w7, w15);
+  *d7 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static INLINE void transpose8x16_16x8_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
+    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
+    __m128i *d12d13, __m128i *d14d15) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+  __m128i w10, w11, w12, w13, w14, w15;
+
+  w0 = _mm_unpacklo_epi8(*x0, *x1);
+  w1 = _mm_unpacklo_epi8(*x2, *x3);
+  w2 = _mm_unpacklo_epi8(*x4, *x5);
+  w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+  w8 = _mm_unpackhi_epi8(*x0, *x1);
+  w9 = _mm_unpackhi_epi8(*x2, *x3);
+  w10 = _mm_unpackhi_epi8(*x4, *x5);
+  w11 = _mm_unpackhi_epi8(*x6, *x7);
+
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  *d0d1 = _mm_unpacklo_epi64(w6, w14);
+  *d2d3 = _mm_unpackhi_epi64(w6, w14);
+  *d4d5 = _mm_unpacklo_epi64(w7, w15);
+  *d6d7 = _mm_unpackhi_epi64(w7, w15);
+
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  *d8d9 = _mm_unpacklo_epi64(w6, w14);
+  *d10d11 = _mm_unpackhi_epi64(w6, w14);
+  *d12d13 = _mm_unpacklo_epi64(w7, w15);
+  *d14d15 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+                                          __m128i *hev, __m128i *mask,
+                                          __m128i *qs1qs0, __m128i *ps1ps0) {
+  const __m128i t3t4 =
+      _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
+  const __m128i t80 = _mm_set1_epi8(0x80);
+  __m128i filter, filter2filter1, work;
+  __m128i ps1ps0_work, qs1qs0_work;
+  __m128i hev1;
+  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+
+  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+  filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+  filter = _mm_subs_epi8(filter, work);
+  filter = _mm_subs_epi8(filter, work);
+  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
+  filter = _mm_and_si128(filter, *mask); /* & mask */
+  filter = _mm_unpacklo_epi64(filter, filter);
+
+  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+  filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
+  filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+  filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */
+  filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
+
+  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+  filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+  filter = _mm_unpacklo_epi8(filter, filter);
+  filter = _mm_srai_epi16(filter, 9); /* round */
+  filter = _mm_packs_epi16(filter, filter);
+  filter = _mm_andnot_si128(*hev, filter);
+
+  hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
+  filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
+
+  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
+
+static AOM_FORCE_INLINE void lpf_internal_4_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+  __m128i q1p1, q0p0, p1p0, q1q0;
+  __m128i abs_p0q0, abs_p1q1;
+  __m128i mask, hev;
+  const __m128i zero = _mm_setzero_si128();
+
+  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+  /* (abs(q1 - q0), abs(p1 - p0) */
+  __m128i flat = abs_diff(q1p1, q0p0);
+  /* abs(p1 - q1), abs(p0 - q0) */
+  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+  hev = _mm_unpacklo_epi8(flat, zero);
+
+  hev = _mm_cmpgt_epi16(hev, *thresh);
+  hev = _mm_packs_epi16(hev, hev);
+
+  /* const int8_t mask = filter_mask2(*limit, *blimit, */
+  /*                                  p1, p0, q0, q1); */
+  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+  mask = _mm_unpacklo_epi64(mask, flat);
+  mask = _mm_subs_epu8(mask, *limit);
+  mask = _mm_cmpeq_epi8(mask, zero);
+  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
+
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
 
 void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                                const uint8_t *_blimit, const uint8_t *_limit,
                                const uint8_t *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
+  const __m128i zero = _mm_setzero_si128();
+  __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                                     _mm_loadl_epi64((const __m128i *)_limit));
+  __m128i thresh =
       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p3p2, p2p1, q3q2, q2q1;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
-  __m128i mask, hev;
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
-#if !CONFIG_PARALLEL_DEBLOCKING
-  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
-  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-#if !CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK;
-#else   // CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK4;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  FILTER4;
-
-#if CONFIG_PARALLEL_DEBLOCKING
-  *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 8);
-  *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(ps1ps0);
-
-  *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 8);
-  *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(qs1qs0);
-#else
-  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
-  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
-  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
-  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
-#endif
+
+  __m128i qs1qs0, ps1ps0;
+  __m128i p1, p0, q0, q1;
+
+  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+  q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p));
+  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
+
+  xx_storel_32(s - 1 * p, ps1ps0);
+  xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8));
+  xx_storel_32(s + 0 * p, qs1qs0);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8));
 }
 
 void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
                              const uint8_t *_blimit, const uint8_t *_limit,
                              const uint8_t *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
+  __m128i p1p0, q1q0;
+  __m128i p1, p0, q0, q1;
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                                     _mm_loadl_epi64((const __m128i *)_limit));
+  __m128i thresh =
       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+
   __m128i x0, x1, x2, x3;
-#if !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p3p2, p2p1, q3q2, q2q1;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
-  __m128i mask, hev;
+  __m128i d0, d1, d2, d3;
+  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
 
-  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
-                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
-
-  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
-
-  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
-
-  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
-
-  // Transpose 8x8
-  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
-  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
-  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
-  x0 = _mm_unpacklo_epi16(x2, x3);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
-  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
-  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high
-
-  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
-  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
-  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
-  x2 = _mm_unpackhi_epi16(x2, x3);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
-  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
-  q1q0 = _mm_unpacklo_epi32(q1q0, x2);
-
-  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
-  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
-  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-#if !CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK;
-#else   // CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK4;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  FILTER4;
+  transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
+
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
 
   // Transpose 8x4 to 4x8
-  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
-  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
-  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
-  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
-  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
-  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
-  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
-  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
-  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
-#endif
-  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
-  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
-
-  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-#endif
+  p1 = _mm_srli_si128(p1p0, 8);
+  q1 = _mm_srli_si128(q1q0, 8);
+
+  transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
+
+  xx_storel_32(s + 0 * p - 2, d0);
+  xx_storel_32(s + 1 * p - 2, d1);
+  xx_storel_32(s + 2 * p - 2, d2);
+  xx_storel_32(s + 3 * p - 2, d3);
 }
 
-static INLINE void store_buffer_horz_8(const __m128i *x, int p, int num,
-                                       uint8_t *s) {
-#if CONFIG_PARALLEL_DEBLOCKING
-  *(int32_t *)(s - (num + 1) * p) = _mm_cvtsi128_si32(*x);
-  const __m128i hi = _mm_srli_si128(*x, 8);
-  *(int32_t *)(s + num * p) = _mm_cvtsi128_si32(hi);
-#else
-  _mm_storel_epi64((__m128i *)(s - (num + 1) * p), *x);
-  _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(*x));
-#endif
+static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
+  xx_storel_32(s - (num + 1) * p, x);
+  xx_storel_32(s + num * p, _mm_srli_si128(x, 8));
 }
 
-void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit,
-                                    const unsigned char *_limit,
-                                    const unsigned char *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
+static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+    __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+    __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+    __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
   __m128i mask, hev, flat, flat2;
-  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+  __m128i qs0ps0, qs1ps1;
+  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
   __m128i abs_p1p0;
 
-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
-  q4p4 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  q3p3 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  q2p2 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  q1p1 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
-  p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0p0 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
-  p0q0 = _mm_shuffle_epi32(q0p0, 78);
+  p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
+  q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
 
   {
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
-    abs_p1p0 = abs_diff(q1p1, q0p0);
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+    __m128i fe, ff, work;
+    abs_p1p0 = abs_diff(*q1p1, *q0p0);
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
     fe = _mm_set1_epi8(0xfe);
     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    abs_p0q0 = abs_diff(q0p0, p0q0);
-    abs_p1q1 = abs_diff(q1p1, p1q1);
+    abs_p0q0 = abs_diff(p1p0, q1q0);
+    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, *thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the further "merged variables" usage
+    hev = _mm_unpacklo_epi64(hev, hev);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
 
-    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
+    // replicate for the further "merged variables" usage
+    mask = _mm_unpacklo_epi64(mask, mask);
   }
 
-  // lp filter
+  // lp filter - the same for 6, 8 and 14 versions
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+  qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
+  qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
+  // loopfilter done
+
+  __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+  __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+  {
+    __m128i work;
+    flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+    flat = _mm_max_epu8(abs_p1p0, flat);
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+
+    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+    work = abs_diff(*q6p6, *q0p0);
+    flat2 = _mm_max_epu8(work, flat2);
+    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+    flat2 = _mm_subs_epu8(flat2, one);
+    flat2 = _mm_cmpeq_epi8(flat2, zero);
+    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  }
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // flat and wide flat calculations
   {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i t1 = _mm_set1_epi16(0x1);
-    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
-    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
-    __m128i qs0 = _mm_xor_si128(p0q0, t80);
-    __m128i qs1 = _mm_xor_si128(p1q1, t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
-    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
-    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, qs0ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    filter1 = _mm_unpacklo_epi8(zero, filter1);
-    filter1 = _mm_srai_epi16(filter1, 0xB);
-    filter2 = _mm_unpacklo_epi8(zero, filter2);
-    filter2 = _mm_srai_epi16(filter2, 0xB);
-
-    // Filter1 >> 3
-    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
-    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
-
-    // filt >> 1
-    filt = _mm_adds_epi16(filter1, t1);
-    filt = _mm_srai_epi16(filt, 1);
-    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
-                            filt);
-    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
-    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
-    // loopfilter done
-
-    {
-      __m128i work;
-      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
-      flat = _mm_max_epu8(abs_p1p0, flat);
-      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-      flat = _mm_subs_epu8(flat, one);
-      flat = _mm_cmpeq_epi8(flat, zero);
-      flat = _mm_and_si128(flat, mask);
-
-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
-      q5p5 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
-
-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
-      q6p6 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
-      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
-
-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
-      q7p7 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
-      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
-      flat2 = _mm_max_epu8(work, flat2);
-      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
-      flat2 = _mm_subs_epu8(flat2, one);
-      flat2 = _mm_cmpeq_epi8(flat2, zero);
-      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-    }
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
-    {
-      const __m128i eight = _mm_set1_epi16(8);
-      const __m128i four = _mm_set1_epi16(4);
-      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
-      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
-      __m128i pixelFilter_p, pixelFilter_q;
-      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
-      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
-
-      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
-      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
-      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
-      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
-      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
-      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
-      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
-      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
-      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
-      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
-      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
-      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
-      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
-      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
-      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
-      q7_16 = _mm_unpackhi_epi8(q7p7, zero);
-
-      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
-                                    _mm_add_epi16(p4_16, p3_16));
-      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
-                                    _mm_add_epi16(q4_16, q3_16));
-
-      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
-      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
-      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
-      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
-      pixelFilter_p =
-          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
-      pixetFilter_p2p1p0 = _mm_add_epi16(
-          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
-      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
-
-      flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(p7_16, p7_16);
-      sum_q7 = _mm_add_epi16(q7_16, q7_16);
-      sum_p3 = _mm_add_epi16(p3_16, p3_16);
-      sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
-      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
-      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
-      flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
-      sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
-      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
-      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
-      flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
-      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
-      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
-      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
-      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
-    }
-    // wide flat
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-    flat = _mm_shuffle_epi32(flat, 68);
-    flat2 = _mm_shuffle_epi32(flat2, 68);
-
-    q2p2 = _mm_andnot_si128(flat, q2p2);
-    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
-    q2p2 = _mm_or_si128(q2p2, flat_q2p2);
-
-    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
-    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
-    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
-    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
-    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
-    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
-    q6p6 = _mm_andnot_si128(flat2, q6p6);
-    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
-    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
-    store_buffer_horz_8(&q6p6, p, 6, s);
-
-    q5p5 = _mm_andnot_si128(flat2, q5p5);
-    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
-    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
-    store_buffer_horz_8(&q5p5, p, 5, s);
-
-    q4p4 = _mm_andnot_si128(flat2, q4p4);
-    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
-    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
-    store_buffer_horz_8(&q4p4, p, 4, s);
-
-    q3p3 = _mm_andnot_si128(flat2, q3p3);
-    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
-    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
-    store_buffer_horz_8(&q3p3, p, 3, s);
-
-    q2p2 = _mm_andnot_si128(flat2, q2p2);
-    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
-    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
-    store_buffer_horz_8(&q2p2, p, 2, s);
-
-    q1p1 = _mm_andnot_si128(flat2, q1p1);
-    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
-    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
-    store_buffer_horz_8(&q1p1, p, 1, s);
-
-    q0p0 = _mm_andnot_si128(flat2, q0p0);
-    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
-    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
-    store_buffer_horz_8(&q0p0, p, 0, s);
+    const __m128i eight = _mm_set1_epi16(8);
+    const __m128i four = _mm_set1_epi16(4);
+    __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+    __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+    __m128i pixelFilter_p, pixelFilter_q;
+    __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+    __m128i sum_p6, sum_q6;
+    __m128i sum_p3, sum_q3, res_p, res_q;
+
+    p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
+    p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
+    p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
+    p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
+    p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
+    p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
+    p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
+    q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
+    q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
+    q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
+    q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
+    q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
+    q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
+    q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
+    pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
+    pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
+
+    pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+    pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+    pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+    pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+    pixelFilter_p =
+        _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+    pixetFilter_p2p1p0 = _mm_add_epi16(
+        four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixelFilter_p,
+                      _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
+                                    _mm_add_epi16(p1_16, q0_16))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixelFilter_p,
+                      _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
+                                    _mm_add_epi16(p0_16, q1_16))),
+        4);
+    flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+    flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(p6_16, p6_16);
+    sum_q6 = _mm_add_epi16(q6_16, q6_16);
+    sum_p3 = _mm_add_epi16(p3_16, p3_16);
+    sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
+        4);
+    flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+    flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+    sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+    sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
+        4);
+    flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+    flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
+        4);
+    flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
+        4);
+    flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
+        4);
+    flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
   }
-}
+  // wide flat
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-static INLINE __m128i filter_add2_sub2(const __m128i *const total,
-                                       const __m128i *const a1,
-                                       const __m128i *const a2,
-                                       const __m128i *const s1,
-                                       const __m128i *const s2) {
-  __m128i x = _mm_add_epi16(*a1, *total);
-  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
-  return x;
-}
+  flat = _mm_shuffle_epi32(flat, 68);
+  flat2 = _mm_shuffle_epi32(flat2, 68);
+
+  *q2p2 = _mm_andnot_si128(flat, *q2p2);
+  flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+  *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+
+  qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+  flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+  *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+  qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+  flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+  *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+  *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+  flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+  *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+
+  *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+  flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+  *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+
+  *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+  flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+  *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+
+  *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+  flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+  *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
 
-static INLINE __m128i filter8_mask(const __m128i *const flat,
-                                   const __m128i *const other_filt,
-                                   const __m128i *const f8_lo,
-                                   const __m128i *const f8_hi) {
-  const __m128i f8 =
-      _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
-  const __m128i result = _mm_and_si128(*flat, f8);
-  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+  *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+  flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+  *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+
+  *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+  flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+  *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
 }
 
-static INLINE __m128i filter16_mask(const __m128i *const flat,
-                                    const __m128i *const other_filt,
-                                    const __m128i *const f_lo,
-                                    const __m128i *const f_hi) {
-  const __m128i f =
-      _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
-  const __m128i result = _mm_and_si128(*flat, f);
-  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
+                                const unsigned char *_blimit,
+                                const unsigned char *_limit,
+                                const unsigned char *_thresh) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+  q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 4 * p)));
+  q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 3 * p)));
+  q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 2 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
+
+  q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s - 0 * p)));
+
+  q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 5 * p)));
+
+  q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 6 * p)));
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+
+  store_buffer_horz_8(q0p0, p, 0, s);
+  store_buffer_horz_8(q1p1, p, 1, s);
+  store_buffer_horz_8(q2p2, p, 2, s);
+  store_buffer_horz_8(q3p3, p, 3, s);
+  store_buffer_horz_8(q4p4, p, 4, s);
+  store_buffer_horz_8(q5p5, p, 5, s);
 }
 
-typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput;
+static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+    __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+    __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+    __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i mask, hev, flat;
+  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+  __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
+  __m128i ps1ps0, qs1qs0;
 
-static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x,
-                                        int p, int offset, uint8_t *s) {
-  int i;
-  if (pixel_num == FOUR_PIXELS) {
-    for (i = 13; i >= 0; i--) {
-      *(int32_t *)(s - (i - offset) * p) = _mm_cvtsi128_si32(x[i]);
-    }
-  }
-  if (pixel_num == EIGHT_PIXELS) {
-    for (i = 13; i >= 0; i--) {
-      _mm_storel_epi64((__m128i *)(s - (i - offset) * p), x[i]);
-    }
-  }
-  if (pixel_num == SIXTEEN_PIXELS) {
-    for (i = 13; i >= 0; i--) {
-      _mm_storeu_si128((__m128i *)(s - (i - offset) * p), x[i]);
-    }
-  }
-}
+  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
 
-static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num,
-                                             unsigned char *s, int p,
-                                             const unsigned char *_blimit,
-                                             const unsigned char *_limit,
-                                             const unsigned char *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-  __m128i mask, hev, flat, flat2;
-  __m128i p7, p6, p5;
-  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
-  __m128i q5, q6, q7;
-
-  __m128i op2, op1, op0, oq0, oq1, oq2;
-
-  __m128i max_abs_p1p0q1q0;
-
-  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
-  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
-  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
-  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
-  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
-  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+  *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
 
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i fe = _mm_set1_epi8(0xfe);
+  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
   {
-    const __m128i abs_p1p0 = abs_diff(p1, p0);
-    const __m128i abs_q1q0 = abs_diff(q1, q0);
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
-    __m128i abs_p0q0 = abs_diff(p0, q0);
-    __m128i abs_p1q1 = abs_diff(p1, q1);
-    __m128i work;
-    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    // filter_mask and hev_mask
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+    abs_p1p0 = abs_diff(q1p1, q0p0);
+    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+    abs_p0q0 = abs_diff(*p1p0, *q1q0);
+    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+    // considering sse doesn't have unsigned elements comparison the idea is
+    // to find at least one case when X > limit, it means the corresponding
+    // mask bit is set.
+    // to achieve that we find global max value of all inputs of abs(x-y) or
+    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
+    // otherwise - not
+
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, *thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the further "merged variables" usage
+    hev = _mm_unpacklo_epi64(hev, hev);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+    mask = _mm_max_epu8(abs_p1p0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+
+    work = abs_diff(q2p2, q1p1);
     mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+    mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
-  }
+    // replicate for the further "merged variables" usage
+    mask = _mm_unpacklo_epi64(mask, mask);
 
-  {
-    __m128i work;
-    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
-    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
-    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+    // flat_mask
+    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
     flat = _mm_subs_epu8(flat, one);
     flat = _mm_cmpeq_epi8(flat, zero);
     flat = _mm_and_si128(flat, mask);
-    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
-    flat2 = _mm_max_epu8(work, flat2);
-    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
-    flat2 = _mm_max_epu8(work, flat2);
-    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
-    flat2 = _mm_max_epu8(work, flat2);
-    flat2 = _mm_subs_epu8(flat2, one);
-    flat2 = _mm_cmpeq_epi8(flat2, zero);
-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+    // replicate for the further "merged variables" usage
+    flat = _mm_unpacklo_epi64(flat, flat);
   }
 
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  // filter4
+  // 5 tap filter
   {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
-
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    op1 = _mm_xor_si128(p1, t80);
-    op0 = _mm_xor_si128(p0, t80);
-    oq0 = _mm_xor_si128(q0, t80);
-    oq1 = _mm_xor_si128(q1, t80);
-
-    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
-
-    work_a = _mm_subs_epi8(oq0, op0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
-
-    // Filter2 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-    filt = _mm_andnot_si128(hev, filt);
-    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
-    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
-    // loopfilter done
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // filter8
-    {
-      const __m128i four = _mm_set1_epi16(4);
-      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
-      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
-      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
-      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
-      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
-      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
-      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
-      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
-
-      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
-      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
-      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
-      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
-      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
-      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
-      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
-      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
-      __m128i f8_lo, f8_hi;
-
-      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
-                            _mm_add_epi16(p3_lo, p2_lo));
-      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
-                            _mm_add_epi16(p2_lo, p1_lo));
-      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
-
-      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
-                            _mm_add_epi16(p3_hi, p2_hi));
-      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
-                            _mm_add_epi16(p2_hi, p1_hi));
-      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
-
-      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
-      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
-      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
-      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
-      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
-      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
-    }
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // wide flat calculations
-    {
-      const __m128i eight = _mm_set1_epi16(8);
-      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
-      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
-      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
-      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
-      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
-      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
-      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
-      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
-      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
-      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
-      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
-      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
-      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
-      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
-      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
-      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
-
-      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
-      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
-      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
-      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
-      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
-      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
-      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
-      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
-      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
-      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
-      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
-      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
-      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
-      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
-      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
-      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
-
-      __m128i f_lo;
-      __m128i f_hi;
-
-      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
-      f_lo =
-          _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
-      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
-                           _mm_add_epi16(p2_lo, p1_lo));
-      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
-      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
-
-      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
-      f_hi =
-          _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
-      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
-                           _mm_add_epi16(p2_hi, p1_hi));
-      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
-      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
-
-      __m128i x[14];
-      x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
-      x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
-      x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
-      x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
-      x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
-      x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
-      x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
-      x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
-      x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
-      x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
-      x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
-      x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
-      x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
-      x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
-
-      store_buffer_horz_16(pixel_num, x, p, 6, s);
-    }
-    // wide flat
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    const __m128i four = _mm_set1_epi16(4);
+
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+    p2_16 = _mm_unpacklo_epi8(*p2, zero);
+    p1_16 = _mm_unpacklo_epi8(*p1, zero);
+    p0_16 = _mm_unpacklo_epi8(*p0, zero);
+    q0_16 = _mm_unpacklo_epi8(*q0, zero);
+    q1_16 = _mm_unpacklo_epi8(*q1, zero);
+    q2_16 = _mm_unpacklo_epi8(*q2, zero);
+
+    // op1
+    workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
+                            _mm_add_epi16(p1_16, p1_16));  // p0 *2 + p1 * 2
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+                            p2_16);  // p2 + p0 * 2 + p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+                                 3);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+    // op0
+    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16);  // q0 * 2 + q1
+    workp_a = _mm_add_epi16(workp_a,
+                            workp_b);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+    workp_shft1 = _mm_srli_epi16(workp_a, 3);
+
+    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
+                            p1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
+    workp_b = _mm_add_epi16(q1_16, q2_16);
+    workp_a = _mm_add_epi16(
+        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
+    workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+    // oq1
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
+                            p0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
+    workp_b = _mm_add_epi16(q2_16, q2_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+                                 3);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
   }
+
+  // lp filter - the same for 6, 8 and 14 versions
+  filter4_sse2(p1p0, q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  *q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  *p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
 }
 
-void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
+void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i p1p0, q1q0;
+  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+  __m128i limit = _mm_load_si128((__m128i *)_limit);
+  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+  p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+  q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+  q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
+
+  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  xx_storel_32(s - 1 * p, p1p0);
+  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+  xx_storel_32(s + 0 * p, q1q0);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
+}
+
+void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0,
+                                    const unsigned char *_blimit1,
+                                    const unsigned char *_limit1,
+                                    const unsigned char *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
+
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i p1p0, q1q0;
+
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+
+  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+}
+
+static AOM_FORCE_INLINE void lpf_internal_8_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+    __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit,
+    __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
   __m128i mask, hev, flat;
-  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
-  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
+  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+      flat_p1p0, flat_q0q1;
+  __m128i q2p2, q1p1, q0p0;
+  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+  __m128i work_a, op2, oq2;
 
-  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
-  p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  p0q0 = _mm_shuffle_epi32(q0p0, 78);
+  q3p3 = _mm_unpacklo_epi64(*p3, *q3);
+  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
 
   {
     // filter_mask and hev_mask
+
+    // considering sse doesn't have unsigned elements comparison the idea is to
+    // find at least one case when X > limit, it means the corresponding  mask
+    // bit is set.
+    // to achieve that we find global max value of all inputs of abs(x-y) or
+    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
+    // otherwise - not
+
     const __m128i one = _mm_set1_epi8(1);
     const __m128i fe = _mm_set1_epi8(0xfe);
     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
     abs_p1p0 = abs_diff(q1p1, q0p0);
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
 
-    abs_p0q0 = abs_diff(q0p0, p0q0);
-    abs_p1q1 = abs_diff(q1p1, p1q1);
+    abs_p0q0 = abs_diff(p1p0, q1q0);
+    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
+
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, *thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the further "merged variables" usage
+    hev = _mm_unpacklo_epi64(hev, hev);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -1067,424 +1038,215 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
     // mask |= (abs(q1 - q0) > limit) * -1;
 
     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
+    // replicate for the further "merged variables" usage
+    mask = _mm_unpacklo_epi64(mask, mask);
 
     // flat_mask4
 
     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
     flat = _mm_max_epu8(abs_p1p0, flat);
+
     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
     flat = _mm_subs_epu8(flat, one);
     flat = _mm_cmpeq_epi8(flat, zero);
     flat = _mm_and_si128(flat, mask);
+    // replicate for the further "merged variables" usage
+    flat = _mm_unpacklo_epi64(flat, flat);
   }
 
+  // filter8
   {
     const __m128i four = _mm_set1_epi16(4);
-    unsigned char *src = s;
-    {
-      __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-
-      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
-      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op2[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op1[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op0[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq0[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq1[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq2[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-    }
-  }
-  // lp filter
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i ps1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
-    const __m128i ps0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
-    const __m128i qs0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
-    const __m128i qs1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    filter1 = _mm_unpacklo_epi8(zero, filter1);
-    filter1 = _mm_srai_epi16(filter1, 11);
-    filter1 = _mm_packs_epi16(filter1, filter1);
-
-    // Filter2 >> 3
-    filter2 = _mm_unpacklo_epi8(zero, filter2);
-    filter2 = _mm_srai_epi16(filter2, 11);
-    filter2 = _mm_packs_epi16(filter2, zero);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    filt = _mm_unpacklo_epi8(zero, filt);
-    filt = _mm_srai_epi16(filt, 9);
-    filt = _mm_packs_epi16(filt, zero);
-
-    filt = _mm_andnot_si128(hev, filt);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q0 = _mm_and_si128(flat, q0);
-    q0 = _mm_or_si128(work_a, q0);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q1 = _mm_and_si128(flat, q1);
-    q1 = _mm_or_si128(work_a, q1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    q2 = _mm_or_si128(work_a, q2);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p0 = _mm_and_si128(flat, p0);
-    p0 = _mm_or_si128(work_a, p0);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p1 = _mm_and_si128(flat, p1);
-    p1 = _mm_or_si128(work_a, p1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    p2 = _mm_or_si128(work_a, p2);
-
-#if CONFIG_PARALLEL_DEBLOCKING
-    *(int32_t *)(s - 3 * p) = _mm_cvtsi128_si32(p2);
-    *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(p1);
-    *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(p0);
-    *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(q0);
-    *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(q1);
-    *(int32_t *)(s + 2 * p) = _mm_cvtsi128_si32(q2);
-#else
-    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-#endif
+
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+    p2_16 = _mm_unpacklo_epi8(*p2, zero);
+    p1_16 = _mm_unpacklo_epi8(*p1, zero);
+    p0_16 = _mm_unpacklo_epi8(*p0, zero);
+    q0_16 = _mm_unpacklo_epi8(*q0, zero);
+    q1_16 = _mm_unpacklo_epi8(*q1, zero);
+    q2_16 = _mm_unpacklo_epi8(*q2, zero);
+    p3_16 = _mm_unpacklo_epi8(*p3, zero);
+    q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+    // op2
+    workp_a =
+        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+    op2 = _mm_packus_epi16(workp_shft0, workp_shft0);
+
+    // op1
+    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // op0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+    // oq0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq1
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+
+    // oq2
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+    oq2 = _mm_packus_epi16(workp_shft1, workp_shft1);
   }
+
+  // lp filter - the same for 6, 8 and 14 versions
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+
+  work_a = _mm_andnot_si128(flat, *q2);
+  q2_16 = _mm_and_si128(flat, oq2);
+  *q2_out = _mm_or_si128(work_a, q2_16);
+
+  work_a = _mm_andnot_si128(flat, *p2);
+  p2_16 = _mm_and_si128(flat, op2);
+  *p2_out = _mm_or_si128(work_a, p2_16);
 }
 
-void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
-#if CONFIG_PARALLEL_DEBLOCKING
-  lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh);
-#else
-  lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh);
-#endif
+void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
+                               const unsigned char *_blimit,
+                               const unsigned char *_limit,
+                               const unsigned char *_thresh) {
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+  __m128i q1q0, p1p0, p2_out, q2_out;
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+  p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p));
+  p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+  q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+  q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
+  q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
+
+  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+                      &p2_out, &q2_out, &blimit, &limit, &thresh);
+
+  xx_storel_32(s - 1 * p, p1p0);
+  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+  xx_storel_32(s + 0 * p, q1q0);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
+  xx_storel_32(s - 3 * p, p2_out);
+  xx_storel_32(s + 2 * p, q2_out);
 }
 
-void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                    const uint8_t *_limit0,
-                                    const uint8_t *_thresh0,
-                                    const uint8_t *_blimit1,
-                                    const uint8_t *_limit1,
-                                    const uint8_t *_thresh1) {
-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
+                                     const unsigned char *_blimit0,
+                                     const unsigned char *_limit0,
+                                     const unsigned char *_thresh0,
+                                     const unsigned char *_blimit1,
+                                     const unsigned char *_limit1,
+                                     const unsigned char *_thresh1) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
                          _mm_load_si128((const __m128i *)_blimit1));
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
-  const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                                     _mm_load_si128((const __m128i *)_limit1));
+  __m128i thresh =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
                          _mm_load_si128((const __m128i *)_thresh1));
 
-  __m128i mask, hev, flat;
-  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
-
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  {
-    const __m128i abs_p1p0 =
-        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 =
-        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
-    const __m128i one = _mm_set1_epi8(1);
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 =
-        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 =
-        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
-    __m128i work;
+  q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 4 * p)));
+  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
 
-    // filter_mask and hev_mask
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
 
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
-        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
-        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
+  q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 5 * p)));
+
+  q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 6 * p)));
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+
+  _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
+  _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
+  _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+  _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
+  _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+  _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
+  _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+  _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
+}
 
-    // flat_mask4
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
-        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
-        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
-    flat = _mm_max_epu8(work, flat);
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-  }
-  {
-    const __m128i four = _mm_set1_epi16(4);
-    unsigned char *src = s;
-    int i = 0;
-
-    do {
-      __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-
-      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
-      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      src += 8;
-    } while (++i < 2);
-  }
-  // lp filter
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-
-    const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-    const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-    const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-    const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-
-    // Filter2 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-
-    filt = _mm_andnot_si128(hev, filt);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q0 = _mm_load_si128((__m128i *)flat_oq0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q0 = _mm_and_si128(flat, q0);
-    q0 = _mm_or_si128(work_a, q0);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    q1 = _mm_load_si128((__m128i *)flat_oq1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q1 = _mm_and_si128(flat, q1);
-    q1 = _mm_or_si128(work_a, q1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-    q2 = _mm_load_si128((__m128i *)flat_oq2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    q2 = _mm_or_si128(work_a, q2);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p0 = _mm_load_si128((__m128i *)flat_op0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p0 = _mm_and_si128(flat, p0);
-    p0 = _mm_or_si128(work_a, p0);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    p1 = _mm_load_si128((__m128i *)flat_op1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p1 = _mm_and_si128(flat, p1);
-    p1 = _mm_or_si128(work_a, p1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-    p2 = _mm_load_si128((__m128i *)flat_op2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    p2 = _mm_or_si128(work_a, p2);
-
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-  }
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                    const uint8_t *_limit0,
+                                    const uint8_t *_thresh0,
+                                    const uint8_t *_blimit1,
+                                    const uint8_t *_limit1,
+                                    const uint8_t *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
+
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+  __m128i q1q0, p1p0, p2_out, q2_out;
+
+  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+                      &p2_out, &q2_out, &blimit, &limit, &thresh);
+
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+  _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2_out);
 }
 
 void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
@@ -1494,449 +1256,405 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit1,
                                     const unsigned char *_limit1,
                                     const unsigned char *_thresh1) {
+  __m128i p1, p0, q0, q1;
+  __m128i qs1qs0, ps1ps0;
+
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+  const __m128i zero = _mm_setzero_si128();
   const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
                          _mm_load_si128((const __m128i *)_blimit1));
   const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
                          _mm_load_si128((const __m128i *)_limit1));
-  const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
-  const __m128i zero = _mm_set1_epi16(0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p3, p2, q2, q3;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p1, p0, q0, q1;
-  __m128i mask, hev, flat;
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-#if !CONFIG_PARALLEL_DEBLOCKING
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // filter_mask and hev_mask
-  {
-    const __m128i abs_p1p0 =
-        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 =
-        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 =
-        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 =
-        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
-#if !CONFIG_PARALLEL_DEBLOCKING
-    __m128i work;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
-#if !CONFIG_PARALLEL_DEBLOCKING
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
-        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
-        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
-    mask = _mm_max_epu8(work, mask);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-    mask = _mm_subs_epu8(mask, limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-  }
 
-  // filter4
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-
-    const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-    const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-    const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-    const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-
-    // Filter2 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-
-    filt = _mm_andnot_si128(hev, filt);
-
-    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-  }
-}
+  __m128i l = _mm_unpacklo_epi64(blimit, limit);
 
-static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
-                                 int in_p, unsigned char *out, int out_p) {
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
-
-  // 2-way interleave w/hoisting of unpacks
-  x0 = _mm_loadl_epi64((__m128i *)in0);           // 1
-  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
-  x0 = _mm_unpacklo_epi8(x0, x1);                 // 1
-
-  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
-  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
-  x1 = _mm_unpacklo_epi8(x2, x3);                     // 2
-
-  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
-  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
-  x2 = _mm_unpacklo_epi8(x4, x5);                     // 3
-
-  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
-  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
-  x3 = _mm_unpacklo_epi8(x6, x7);                     // 4
-  x4 = _mm_unpacklo_epi16(x0, x1);                    // 9
-
-  x8 = _mm_loadl_epi64((__m128i *)in1);           // 2
-  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
-  x8 = _mm_unpacklo_epi8(x8, x9);                 // 5
-  x5 = _mm_unpacklo_epi16(x2, x3);                // 10
-
-  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
-  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
-  x9 = _mm_unpacklo_epi8(x10, x11);                    // 6
-
-  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
-  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
-  x10 = _mm_unpacklo_epi8(x12, x13);                   // 7
-  x12 = _mm_unpacklo_epi16(x8, x9);                    // 11
-
-  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
-  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
-  x11 = _mm_unpacklo_epi8(x14, x15);                   // 8
-  x13 = _mm_unpacklo_epi16(x10, x11);                  // 12
-
-  x6 = _mm_unpacklo_epi32(x4, x5);     // 13
-  x7 = _mm_unpackhi_epi32(x4, x5);     // 14
-  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
-  x15 = _mm_unpackhi_epi32(x12, x13);  // 16
+  __m128i thresh0 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
 
-  // Store first 4-line result
-  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
-  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+  __m128i thresh1 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
 
-  x4 = _mm_unpackhi_epi16(x0, x1);
-  x5 = _mm_unpackhi_epi16(x2, x3);
-  x12 = _mm_unpackhi_epi16(x8, x9);
-  x13 = _mm_unpackhi_epi16(x10, x11);
+  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
 
-  x6 = _mm_unpacklo_epi32(x4, x5);
-  x7 = _mm_unpackhi_epi32(x4, x5);
-  x14 = _mm_unpacklo_epi32(x12, x13);
-  x15 = _mm_unpackhi_epi32(x12, x13);
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
 
-  // Store second 4-line result
-  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
-  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
+  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
 }
 
-#if CONFIG_PARALLEL_DEBLOCKING
-#define movq(p) _mm_loadl_epi64((const __m128i *)(p))
-#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1)
-#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1)
-#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1)
-#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r)
-#define pshufd(r, imm) _mm_shuffle_epi32(r, imm)
-enum { ROTATE_DWORD_RIGHT = 0x39 };
-static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride,
-                                 const uint8_t *pSrc,
-                                 const ptrdiff_t srcStride) {
-  for (uint32_t idx = 0; idx < 2; idx += 1) {
-    __m128i r0, r1, r2, r3;
-    // load data
-    r0 = movq(pSrc);
-    r1 = movq(pSrc + srcStride);
-    r2 = movq(pSrc + srcStride * 2);
-    r3 = movq(pSrc + srcStride * 3);
-    // transpose
-    r0 = punpcklbw(r0, r1);
-    r2 = punpcklbw(r2, r3);
-    r1 = punpckhwd(r0, r2);
-    r0 = punpcklwd(r0, r2);
-    // store data
-    movd(pDst, r0);
-    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride, r0);
-    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 2, r0);
-    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 3, r0);
-    movd(pDst + dstStride * 4, r1);
-    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 5, r1);
-    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 6, r1);
-    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 7, r1);
-    // advance the pointers
-    pDst += dstStride * 8;
-    pSrc += 8;
-  }
-}
-
-#endif  // CONFIG_PARALLEL_DEBLOCKING
-static INLINE void transpose(unsigned char *src[], int in_p,
-                             unsigned char *dst[], int out_p,
-                             int num_8x8_to_transpose) {
-  int idx8x8 = 0;
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  __m128i p0, q0, q1, p1;
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  do {
-    unsigned char *in = src[idx8x8];
-    unsigned char *out = dst[idx8x8];
-
-    x0 =
-        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
-    x1 =
-        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
-    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-    x0 = _mm_unpacklo_epi8(x0, x1);
-
-    x2 =
-        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
-    x3 =
-        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
-    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-    x1 = _mm_unpacklo_epi8(x2, x3);
-
-    x4 =
-        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
-    x5 =
-        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
-    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-    x2 = _mm_unpacklo_epi8(x4, x5);
-
-    x6 =
-        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
-    x7 =
-        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
-    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-    x3 = _mm_unpacklo_epi8(x6, x7);
-
-    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-    x4 = _mm_unpacklo_epi16(x0, x1);
-    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-    x5 = _mm_unpacklo_epi16(x2, x3);
-    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-    x6 = _mm_unpacklo_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 0 * out_p),
-                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
-    _mm_storeh_pd((double *)(out + 1 * out_p),
-                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
-    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 2 * out_p),
-                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
-    _mm_storeh_pd((double *)(out + 3 * out_p),
-                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
-
-    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-    x4 = _mm_unpackhi_epi16(x0, x1);
-    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
-    x5 = _mm_unpackhi_epi16(x2, x3);
-    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
-    x6 = _mm_unpacklo_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 4 * out_p),
-                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
-    _mm_storeh_pd((double *)(out + 5 * out_p),
-                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
-    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
-    _mm_storel_pd((double *)(out + 6 * out_p),
-                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
-    _mm_storeh_pd((double *)(out + 7 * out_p),
-                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
-  } while (++idx8x8 < num_8x8_to_transpose);
-}
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i qs1qs0, ps1ps0;
 
-void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  unsigned char *src[2];
-  unsigned char *dst[2];
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                         _mm_load_si128((const __m128i *)_limit1));
 
-  // Loop filtering
-  aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
-                                 blimit1, limit1, thresh1);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
-
-  // Transpose back
-  transpose(src, 16, dst, p, 2);
-#else  // CONFIG_PARALLEL_DEBLOCKING
-  transpose16x4(s - 2, p, t_dst + 16 * 2, 16);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-}
+  __m128i l = _mm_unpacklo_epi64(blimit, limit);
 
-void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
-                             const unsigned char *blimit,
-                             const unsigned char *limit,
-                             const unsigned char *thresh) {
-  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
-  unsigned char *src[1];
-  unsigned char *dst[1];
+  __m128i thresh0 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
 
-  // Transpose 8x8
-  src[0] = s - 4;
-  dst[0] = t_dst;
+  __m128i thresh1 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
 
-  transpose(src, p, dst, 8, 1);
+  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
 
-  // Loop filtering
-  aom_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
+  x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
 
-  src[0] = t_dst;
-  dst[0] = s - 4;
+  transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
+                        &q1);
 
-  // Transpose back
-  transpose(src, 8, dst, p, 1);
-}
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
 
-void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
-  unsigned char *src[2];
-  unsigned char *dst[2];
+  p1 = _mm_srli_si128(ps1ps0, 8);
+  q1 = _mm_srli_si128(qs1qs0, 8);
 
-  // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
+                        &d5, &d6, &d7);
 
-  // Loop filtering
-  aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
-                                 blimit1, limit1, thresh1);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
+  xx_storel_32((s - 2 + 0 * p), d0);
+  xx_storel_32((s - 2 + 1 * p), d1);
+  xx_storel_32((s - 2 + 2 * p), d2);
+  xx_storel_32((s - 2 + 3 * p), d3);
+  xx_storel_32((s - 2 + 4 * p), d4);
+  xx_storel_32((s - 2 + 5 * p), d5);
+  xx_storel_32((s - 2 + 6 * p), d6);
+  xx_storel_32((s - 2 + 7 * p), d7);
+}
 
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
+                             const unsigned char *_blimit,
+                             const unsigned char *_limit,
+                             const unsigned char *_thresh) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x2, x1, x0, x3;
+  __m128i p0, q0;
+  __m128i p1p0, q1q0;
+  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+  __m128i limit = _mm_load_si128((__m128i *)_limit);
+  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+
+  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+                        &d7);
+
+  lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
+
+  transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+  xx_storel_32(s + 0 * p - 2, d0);
+  xx_storel_32(s + 1 * p - 2, d1);
+  xx_storel_32(s + 2 * p - 2, d2);
+  xx_storel_32(s + 3 * p - 2, d3);
+}
 
-  // Transpose back
-  transpose(src, 16, dst, p, 2);
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
+
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i p0, q0;
+  __m128i p1p0, q1q0;
+  __m128i d0d1, d2d3, d4d5, d6d7;
+
+  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
+
+  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+                    &d6d7);
+
+  d1 = _mm_srli_si128(d0d1, 8);
+  d3 = _mm_srli_si128(d2d3, 8);
+  d5 = _mm_srli_si128(d4d5, 8);
+  d7 = _mm_srli_si128(d6d7, 8);
+
+  lpf_internal_6_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
+
+  transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
+                        &d6, &d7);
+
+  xx_storel_32((s - 2 + 0 * p), d0);
+  xx_storel_32((s - 2 + 1 * p), d1);
+  xx_storel_32((s - 2 + 2 * p), d2);
+  xx_storel_32((s - 2 + 3 * p), d3);
+  xx_storel_32((s - 2 + 4 * p), d4);
+  xx_storel_32((s - 2 + 5 * p), d5);
+  xx_storel_32((s - 2 + 6 * p), d6);
+  xx_storel_32((s - 2 + 7 * p), d7);
 }
 
-void aom_lpf_vertical_16_sse2(unsigned char *s, int p,
-                              const unsigned char *blimit,
-                              const unsigned char *limit,
-                              const unsigned char *thresh) {
-  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
-  unsigned char *src[2];
-  unsigned char *dst[2];
+void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
+                             const unsigned char *_blimit,
+                             const unsigned char *_limit,
+                             const unsigned char *_thresh) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+  __m128i p2, p0, q0, q2;
+  __m128i x2, x1, x0, x3;
+  __m128i q1q0, p1p0;
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+  x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
+  x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
+
+  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+                        &d7);
+  // Loop filtering
+  lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2,
+                      &q2, &blimit, &limit, &thresh);
 
-  src[0] = s - 8;
-  src[1] = s;
-  dst[0] = t_dst;
-  dst[1] = t_dst + 8 * 8;
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
 
-  // Transpose 16x8
-  transpose(src, p, dst, 8, 2);
+  transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1,
+                        &d2, &d3);
 
-  // Loop filtering
-  aom_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
+  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
+}
 
-  src[0] = t_dst;
-  src[1] = t_dst + 8 * 8;
-  dst[0] = s - 8;
-  dst[1] = s;
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
 
-  // Transpose back
-  transpose(src, 8, dst, p, 2);
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i d1, d3, d5, d7;
+  __m128i q1q0, p1p0;
+  __m128i p2, p1, q1, q2;
+  __m128i d0d1, d2d3, d4d5, d6d7;
+
+  x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
+
+  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+                    &d6d7);
+
+  d1 = _mm_srli_si128(d0d1, 8);
+  d3 = _mm_srli_si128(d2d3, 8);
+  d5 = _mm_srli_si128(d4d5, 8);
+  d7 = _mm_srli_si128(d6d7, 8);
+
+  lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0,
+                      &p1p0, &p2, &q2, &blimit, &limit, &thresh);
+
+  p1 = _mm_srli_si128(p1p0, 8);
+  q1 = _mm_srli_si128(q1q0, 8);
+
+  transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3,
+                    &d4d5, &d6d7);
+
+  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
+  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
+  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
+  _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
+  _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
 }
 
-void aom_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
-                                   const uint8_t *blimit, const uint8_t *limit,
-                                   const uint8_t *thresh) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
-
-  // Transpose 16x16
-  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
+                              const unsigned char *_blimit,
+                              const unsigned char *_limit,
+                              const unsigned char *_thresh) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i x6, x5, x4, x3, x2, x1, x0;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+  __m128i q0, q1, q2, q3, q4, q5, q6, q7;
+  __m128i p0_out, p1_out, p2_out, p3_out;
+  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+  __m128i limit = _mm_load_si128((__m128i *)_limit);
+  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+  x6 = _mm_loadl_epi64((__m128i *)((s - 8) + 0 * p));
+  x5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p));
+  x4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p));
+
+  transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &p0, &p1, &p2, &p3, &p4, &p5, &p6,
+                        &p7);
+
+  x6 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+  x5 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  x4 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+  transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6,
+                        &q7);
+
+  q6p6 = _mm_unpacklo_epi64(p1, q6);
+  q5p5 = _mm_unpacklo_epi64(p2, q5);
+  q4p4 = _mm_unpacklo_epi64(p3, q4);
+  q3p3 = _mm_unpacklo_epi64(p4, q3);
+  q2p2 = _mm_unpacklo_epi64(p5, q2);
+  q1p1 = _mm_unpacklo_epi64(p6, q1);
+  q0p0 = _mm_unpacklo_epi64(p7, q0);
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+
+  transpose8x8_low_sse2(&p0, &p1, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+                        &p0_out, &p1_out, &p2_out, &p3_out);
+
+  x0 = _mm_srli_si128(q0p0, 8);
+  x1 = _mm_srli_si128(q1p1, 8);
+  x2 = _mm_srli_si128(q2p2, 8);
+  x3 = _mm_srli_si128(q3p3, 8);
+  x4 = _mm_srli_si128(q4p4, 8);
+  x5 = _mm_srli_si128(q5p5, 8);
+  x6 = _mm_srli_si128(q6p6, 8);
+
+  transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &q0, &q1, &q2,
+                        &q3);
+
+  _mm_storel_epi64((__m128i *)(s - 8 + 0 * p), p0_out);
+  _mm_storel_epi64((__m128i *)(s - 8 + 1 * p), p1_out);
+  _mm_storel_epi64((__m128i *)(s - 8 + 2 * p), p2_out);
+  _mm_storel_epi64((__m128i *)(s - 8 + 3 * p), p3_out);
+
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+  _mm_storel_epi64((__m128i *)(s + 3 * p), q3);
+}
 
-  // Loop filtering
-  aom_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
+void aom_lpf_vertical_14_dual_sse2(
+    unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i x7, x6, x5, x4, x3, x2, x1, x0;
+  __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
+  __m128i q0, q1, q2, q3, q7;
+  __m128i p0p1, p2p3, p4p5, p6p7;
+
+  __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                                     _mm_load_si128((const __m128i *)_limit1));
+  __m128i thresh =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
 
-  // Transpose back
-  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
+  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
+  x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
+  x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
+  x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
+
+  transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
+                          &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
+
+  q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
+  q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
+  q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
+  q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
+  q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
+  q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
+  q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
+  q7 = _mm_srli_si128(d14d15, 8);
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+
+  x0 = _mm_srli_si128(q0p0, 8);
+  x1 = _mm_srli_si128(q1p1, 8);
+  x2 = _mm_srli_si128(q2p2, 8);
+  x3 = _mm_srli_si128(q3p3, 8);
+  x4 = _mm_srli_si128(q4p4, 8);
+  x5 = _mm_srli_si128(q5p5, 8);
+  x6 = _mm_srli_si128(q6p6, 8);
+
+  transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+                          &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
+                          &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
+  _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
+  _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
+  _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
+  _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
 }
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
index 027c890dc..c6b6469b4 100644
--- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -14,117 +14,202 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-
-static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
-                                    int out_p, int num_8x8_to_transpose) {
-  int idx8x8 = 0;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
-  do {
-    uint16_t *in = src[idx8x8];
-    uint16_t *out = dst[idx8x8];
-
-    p0 =
-        _mm_loadu_si128((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
-    p1 =
-        _mm_loadu_si128((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
-    p2 =
-        _mm_loadu_si128((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
-    p3 =
-        _mm_loadu_si128((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
-    p4 =
-        _mm_loadu_si128((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
-    p5 =
-        _mm_loadu_si128((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
-    p6 =
-        _mm_loadu_si128((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
-    p7 =
-        _mm_loadu_si128((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
-    // 00 10 01 11 02 12 03 13
-    x0 = _mm_unpacklo_epi16(p0, p1);
-    // 20 30 21 31 22 32 23 33
-    x1 = _mm_unpacklo_epi16(p2, p3);
-    // 40 50 41 51 42 52 43 53
-    x2 = _mm_unpacklo_epi16(p4, p5);
-    // 60 70 61 71 62 72 63 73
-    x3 = _mm_unpacklo_epi16(p6, p7);
-    // 00 10 20 30 01 11 21 31
-    x4 = _mm_unpacklo_epi32(x0, x1);
-    // 40 50 60 70 41 51 61 71
-    x5 = _mm_unpacklo_epi32(x2, x3);
-    // 00 10 20 30 40 50 60 70
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 01 11 21 31 41 51 61 71
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
-    // 00 10 20 30 40 50 60 70
-    _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
-    // 01 11 21 31 41 51 61 71
-
-    // 02 12 22 32 03 13 23 33
-    x4 = _mm_unpackhi_epi32(x0, x1);
-    // 42 52 62 72 43 53 63 73
-    x5 = _mm_unpackhi_epi32(x2, x3);
-    // 02 12 22 32 42 52 62 72
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
-    // 02 12 22 32 42 52 62 72
-    _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
-    // 03 13 23 33 43 53 63 73
-
-    // 04 14 05 15 06 16 07 17
-    x0 = _mm_unpackhi_epi16(p0, p1);
-    // 24 34 25 35 26 36 27 37
-    x1 = _mm_unpackhi_epi16(p2, p3);
-    // 44 54 45 55 46 56 47 57
-    x2 = _mm_unpackhi_epi16(p4, p5);
-    // 64 74 65 75 66 76 67 77
-    x3 = _mm_unpackhi_epi16(p6, p7);
-    // 04 14 24 34 05 15 25 35
-    x4 = _mm_unpacklo_epi32(x0, x1);
-    // 44 54 64 74 45 55 65 75
-    x5 = _mm_unpacklo_epi32(x2, x3);
-    // 04 14 24 34 44 54 64 74
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 05 15 25 35 45 55 65 75
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
-    // 04 14 24 34 44 54 64 74
-    _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
-    // 05 15 25 35 45 55 65 75
-
-    // 06 16 26 36 07 17 27 37
-    x4 = _mm_unpackhi_epi32(x0, x1);
-    // 46 56 66 76 47 57 67 77
-    x5 = _mm_unpackhi_epi32(x2, x3);
-    // 06 16 26 36 46 56 66 76
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
-    // 06 16 26 36 46 56 66 76
-    _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
-    // 07 17 27 37 47 57 67 77
-  } while (++idx8x8 < num_8x8_to_transpose);
+#include "config/aom_config.h"
+
+static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
+                                            __m128i *x2, __m128i *x3,
+                                            __m128i *x4, __m128i *x5,
+                                            __m128i *d0, __m128i *d1,
+                                            __m128i *d2, __m128i *d3,
+                                            __m128i *d4, __m128i *d5) {
+  __m128i w0, w1, w2, w3, w4, w5, ww0;
+
+  // 00 01 02 03 04 05 xx xx
+  // 10 11 12 13 14 15 xx xx
+  // 20 21 22 23 24 25 xx xx
+  // 30 31 32 33 34 35 xx xx
+  // 40 41 42 43 44 45 xx xx
+  // 50 51 52 53 54 55 xx xx
+
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
+  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
+  *d1 = _mm_unpackhi_epi64(ww0,
+                           _mm_srli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+  *d2 = _mm_unpacklo_epi64(ww0,
+                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx
+
+  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
+  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
+  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx
+
+  *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53
+
+  ww0 = _mm_unpacklo_epi32(w3, w4);   //  04 14 24 34 05 15 25 35
+  *d4 = _mm_unpacklo_epi64(ww0, w5);  //  04 14 24 34 44 54 45 55
+  *d5 = _mm_unpackhi_epi64(ww0,
+                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+                                                    __m128i *x2, __m128i *x3,
+                                                    __m128i *d0, __m128i *d1,
+                                                    __m128i *d2, __m128i *d3) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i w0, w1, ww0, ww1;
+
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
+  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+
+  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
+  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
+  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
+  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
+                                                     __m128i *x2, __m128i *x3,
+                                                     __m128i *d4, __m128i *d5,
+                                                     __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, ww2, ww3;
+  __m128i zero = _mm_setzero_si128();
+
+  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
+  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
+
+  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
+  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
+
+  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
+  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
+  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
+  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
 }
 
-static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
-                                        uint16_t *out, int out_p) {
-  uint16_t *src0[1];
-  uint16_t *src1[1];
-  uint16_t *dest0[1];
-  uint16_t *dest1[1];
-  src0[0] = in0;
-  src1[0] = in1;
-  dest0[0] = out;
-  dest1[0] = out + 8;
-  highbd_transpose(src0, in_p, dest0, out_p, 1);
-  highbd_transpose(src1, in_p, dest1, out_p, 1);
+// here in and out pointers (x and d) should be different! we don't store their
+// values inside
+static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
+                                                __m128i *x2, __m128i *x3,
+                                                __m128i *d0, __m128i *d1,
+                                                __m128i *d2, __m128i *d3,
+                                                __m128i *d4, __m128i *d5,
+                                                __m128i *d6, __m128i *d7) {
+  // input
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // output
+  // 00 10 20 30 xx xx xx xx
+  // 01 11 21 31 xx xx xx xx
+  // 02 12 22 32 xx xx xx xx
+  // 03 13 23 33 xx xx xx xx
+  // 04 14 24 34 xx xx xx xx
+  // 05 15 25 35 xx xx xx xx
+  // 06 16 26 36 xx xx xx xx
+  // 07 17 27 37 xx xx xx xx
+  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
+  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
 }
+
+static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
+                                                __m128i *x2, __m128i *x3,
+                                                __m128i *x4, __m128i *x5,
+                                                __m128i *x6, __m128i *x7,
+                                                __m128i *d0, __m128i *d1,
+                                                __m128i *d2, __m128i *d3) {
+  __m128i w0, w1, w2, w3, ww0, ww1;
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
+  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
+  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
+
+  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
+  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
+
+  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
+  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
+}
+
+static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
+                                                 __m128i *x2, __m128i *x3,
+                                                 __m128i *x4, __m128i *x5,
+                                                 __m128i *x6, __m128i *x7,
+                                                 __m128i *d4, __m128i *d5,
+                                                 __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, w2, w3, ww0, ww1;
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
+  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
+  w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
+  w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
+  ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
+
+  *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
+  *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
+  ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
+
+  *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
+  *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
+}
+
+// here in and out pointers (x and d) should be different! we don't store their
+// values inside
+static INLINE void highbd_transpose8x8_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+    __m128i *d7) {
+  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
+  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
+}
+
+// here in and out pointers (x and d arrays) should be different! we don't store
+// their values inside
+static INLINE void highbd_transpose8x16_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+    __m128i *d7) {
+  highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+                           d5, d6, d7);
+  highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+                           x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+                           d4 + 1, d5 + 1, d6 + 1, d7 + 1);
+}
+
 #endif  // _AOM_DSP_X86_LPF_COMMON_X86_H
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
index 2536f91d2..1f42eec2f 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -12,8 +12,9 @@
 #include <stdio.h>
 #include <tmmintrin.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/blend.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/synonyms.h"
@@ -75,11 +76,9 @@ static INLINE unsigned int masked_sad4xh_ssse3(
                                  ref_stride, msk, msk_stride, n);             \
   }
 
-#if CONFIG_EXT_PARTITION
 MASKSADMXN_SSSE3(128, 128)
 MASKSADMXN_SSSE3(128, 64)
 MASKSADMXN_SSSE3(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 MASKSADMXN_SSSE3(64, 64)
 MASKSADMXN_SSSE3(64, 32)
 MASKSADMXN_SSSE3(32, 64)
@@ -93,18 +92,12 @@ MASKSAD8XN_SSSE3(8)
 MASKSAD8XN_SSSE3(4)
 MASKSAD4XN_SSSE3(8)
 MASKSAD4XN_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 MASKSAD4XN_SSSE3(16)
 MASKSADMXN_SSSE3(16, 4)
 MASKSAD8XN_SSSE3(32)
 MASKSADMXN_SSSE3(32, 8)
 MASKSADMXN_SSSE3(16, 64)
 MASKSADMXN_SSSE3(64, 16)
-#if CONFIG_EXT_PARTITION
-MASKSADMXN_SSSE3(32, 128)
-MASKSADMXN_SSSE3(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                             int src_stride,
@@ -239,7 +232,6 @@ static INLINE unsigned int masked_sad4xh_ssse3(
   return (sad + 31) >> 6;
 }
 
-#if CONFIG_HIGHBITDEPTH
 // For width a multiple of 8
 static INLINE unsigned int highbd_masked_sad_ssse3(
     const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
@@ -277,11 +269,9 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3(
                                         ref8, ref_stride, msk, msk_stride, n); \
   }
 
-#if CONFIG_EXT_PARTITION
 HIGHBD_MASKSADMXN_SSSE3(128, 128)
 HIGHBD_MASKSADMXN_SSSE3(128, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 HIGHBD_MASKSADMXN_SSSE3(64, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 32)
 HIGHBD_MASKSADMXN_SSSE3(32, 64)
@@ -295,18 +285,12 @@ HIGHBD_MASKSADMXN_SSSE3(8, 8)
 HIGHBD_MASKSADMXN_SSSE3(8, 4)
 HIGHBD_MASKSAD4XN_SSSE3(8)
 HIGHBD_MASKSAD4XN_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 HIGHBD_MASKSAD4XN_SSSE3(16)
 HIGHBD_MASKSADMXN_SSSE3(16, 4)
 HIGHBD_MASKSADMXN_SSSE3(8, 32)
 HIGHBD_MASKSADMXN_SSSE3(32, 8)
 HIGHBD_MASKSADMXN_SSSE3(16, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASKSADMXN_SSSE3(32, 128)
-HIGHBD_MASKSADMXN_SSSE3(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static INLINE unsigned int highbd_masked_sad_ssse3(
     const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
@@ -424,5 +408,3 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3(
   int sad = _mm_cvtsi128_si32(res);
   return (sad + 31) >> 6;
 }
-
-#endif
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
index 3ffe132be..d7dbefd7d 100644
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -13,13 +13,15 @@
 #include <string.h>
 #include <tmmintrin.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/blend.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
 #include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
 #include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
 
 // For width a multiple of 16
 static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
@@ -108,11 +110,9 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
     return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                 \
   }
 
-#if CONFIG_EXT_PARTITION
 MASK_SUBPIX_VAR_SSSE3(128, 128)
 MASK_SUBPIX_VAR_SSSE3(128, 64)
 MASK_SUBPIX_VAR_SSSE3(64, 128)
-#endif
 MASK_SUBPIX_VAR_SSSE3(64, 64)
 MASK_SUBPIX_VAR_SSSE3(64, 32)
 MASK_SUBPIX_VAR_SSSE3(32, 64)
@@ -126,18 +126,12 @@ MASK_SUBPIX_VAR8XH_SSSE3(8)
 MASK_SUBPIX_VAR8XH_SSSE3(4)
 MASK_SUBPIX_VAR4XH_SSSE3(8)
 MASK_SUBPIX_VAR4XH_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 MASK_SUBPIX_VAR4XH_SSSE3(16)
 MASK_SUBPIX_VAR_SSSE3(16, 4)
 MASK_SUBPIX_VAR8XH_SSSE3(32)
 MASK_SUBPIX_VAR_SSSE3(32, 8)
 MASK_SUBPIX_VAR_SSSE3(64, 16)
 MASK_SUBPIX_VAR_SSSE3(16, 64)
-#if CONFIG_EXT_PARTITION
-MASK_SUBPIX_VAR_SSSE3(128, 32)
-MASK_SUBPIX_VAR_SSSE3(32, 128)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static INLINE __m128i filter_block(const __m128i a, const __m128i b,
                                    const __m128i filter) {
@@ -523,7 +517,6 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
 }
 
-#if CONFIG_HIGHBITDEPTH
 // For width a multiple of 8
 static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
                                    int xoffset, int yoffset, uint16_t *dst,
@@ -695,11 +688,9 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
     return (var >= 0) ? (uint32_t)var : 0;                                  \
   }
 
-#if CONFIG_EXT_PARTITION
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
-#endif
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
@@ -713,18 +704,12 @@ HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
 HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
 HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 128)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 32)
-#endif
-#endif
 
 static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
                                           const __m128i filter) {
@@ -1040,4 +1025,40 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
 }
 
-#endif
+void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                              int width, int height, const uint8_t *ref,
+                              int ref_stride, const uint8_t *mask,
+                              int mask_stride, int invert_mask) {
+  const uint8_t *src0 = invert_mask ? pred : ref;
+  const uint8_t *src1 = invert_mask ? ref : pred;
+  const int stride0 = invert_mask ? width : ref_stride;
+  const int stride1 = invert_mask ? ref_stride : width;
+  assert(height % 2 == 0);
+  int i = 0;
+  if (width == 8) {
+    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+                           mask, mask_stride);
+  } else if (width == 16) {
+    do {
+      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
+                              mask + mask_stride, comp_pred + width);
+      comp_pred += (width << 1);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      i += 2;
+    } while (i < height);
+  } else {  // width == 32
+    assert(width == 32);
+    do {
+      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16);
+      comp_pred += (width);
+      src0 += (stride0);
+      src1 += (stride1);
+      mask += (mask_stride);
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
new file mode 100644
index 000000000..dc41a8342
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H
+#define _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+
+static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
+                                           const uint8_t *src1,
+                                           const uint8_t *mask, uint8_t *dst) {
+  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i round_offset =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+  const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0));
+  const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1));
+  const __m128i aA = _mm_load_si128((const __m128i *)(mask));
+
+  const __m128i maA = _mm_sub_epi8(alpha_max, aA);
+
+  const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1);
+  const __m128i aaAL = _mm_unpacklo_epi8(aA, maA);
+  const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1);
+  const __m128i aaAH = _mm_unpackhi_epi8(aA, maA);
+
+  const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL);
+  const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH);
+
+  const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset);
+  const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset);
+  _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH));
+}
+
+static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
+                                          const uint8_t *src0, int stride0,
+                                          const uint8_t *src1, int stride1,
+                                          const uint8_t *mask,
+                                          int mask_stride) {
+  int i = 0;
+  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i round_offset =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  do {
+    // odd line A
+    const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
+    const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
+    const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
+    // even line B
+    const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
+    const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
+    const __m128i a = _mm_castps_si128(_mm_loadh_pi(
+        _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
+
+    const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
+    const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
+
+    const __m128i ma = _mm_sub_epi8(alpha_max, a);
+    const __m128i aaA = _mm_unpacklo_epi8(a, ma);
+    const __m128i aaB = _mm_unpackhi_epi8(a, ma);
+
+    const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
+    const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
+    const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
+    const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
+    const __m128i round = _mm_packus_epi16(roundA, roundB);
+    // comp_pred's stride == width == 8
+    _mm_store_si128((__m128i *)(comp_pred), round);
+    comp_pred += (8 << 1);
+    src0 += (stride0 << 1);
+    src1 += (stride1 << 1);
+    mask += (mask_stride << 1);
+    i += 2;
+  } while (i < height);
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
new file mode 100644
index 000000000..8b69606dd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/mem_sse2.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
+  return _mm_castps_si128(
+      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
+static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 1 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 2 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 3 * byte_stride));
+}
+
+static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  __m128i dst;
+  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
+  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
+  return dst;
+}
+
+#endif  // AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
index 73589a32a..a3535f985 100644
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -14,7 +14,7 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
   v_d = _mm_hadd_epi32(v_d, v_d);
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
index 52dd508ec..0338a8c77 100644
--- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
 
@@ -24,9 +25,11 @@
 // 8 bit
 ////////////////////////////////////////////////////////////////////////////////
 
-static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
-                                       const int32_t *wsrc, const int32_t *mask,
-                                       const int height) {
+static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+                                                 const int pre_stride,
+                                                 const int32_t *wsrc,
+                                                 const int32_t *mask,
+                                                 const int height) {
   const int pre_step = pre_stride - 4;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
@@ -59,11 +62,9 @@ static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
   return xx_hsum_epi32_si32(v_sad_d);
 }
 
-static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
-                                        const int pre_stride,
-                                        const int32_t *wsrc,
-                                        const int32_t *mask, const int width,
-                                        const int height) {
+static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
+    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, const int width, const int height) {
   const int pre_step = pre_stride - width;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
@@ -119,11 +120,9 @@ static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
     }                                                          \
   }
 
-#if CONFIG_EXT_PARTITION
 OBMCSADWXH(128, 128)
 OBMCSADWXH(128, 64)
 OBMCSADWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 OBMCSADWXH(64, 64)
 OBMCSADWXH(64, 32)
 OBMCSADWXH(32, 64)
@@ -137,25 +136,22 @@ OBMCSADWXH(8, 8)
 OBMCSADWXH(8, 4)
 OBMCSADWXH(4, 8)
 OBMCSADWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 OBMCSADWXH(4, 16)
 OBMCSADWXH(16, 4)
 OBMCSADWXH(8, 32)
 OBMCSADWXH(32, 8)
 OBMCSADWXH(16, 64)
 OBMCSADWXH(64, 16)
-#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
-#if CONFIG_HIGHBITDEPTH
-static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
-                                           const int pre_stride,
-                                           const int32_t *wsrc,
-                                           const int32_t *mask,
-                                           const int height) {
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+                                                     const int pre_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     const int height) {
   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
   const int pre_step = pre_stride - 4;
   int n = 0;
@@ -189,11 +185,9 @@ static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
   return xx_hsum_epi32_si32(v_sad_d);
 }
 
-static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
-                                            const int pre_stride,
-                                            const int32_t *wsrc,
-                                            const int32_t *mask,
-                                            const int width, const int height) {
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
+    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, const int width, const int height) {
   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
   const int pre_step = pre_stride - width;
   int n = 0;
@@ -250,11 +244,9 @@ static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
     }                                                             \
   }
 
-#if CONFIG_EXT_PARTITION
 HBD_OBMCSADWXH(128, 128)
 HBD_OBMCSADWXH(128, 64)
 HBD_OBMCSADWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 HBD_OBMCSADWXH(64, 64)
 HBD_OBMCSADWXH(64, 32)
 HBD_OBMCSADWXH(32, 64)
@@ -268,12 +260,9 @@ HBD_OBMCSADWXH(8, 8)
 HBD_OBMCSADWXH(8, 4)
 HBD_OBMCSADWXH(4, 8)
 HBD_OBMCSADWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 HBD_OBMCSADWXH(4, 16)
 HBD_OBMCSADWXH(16, 4)
 HBD_OBMCSADWXH(8, 32)
 HBD_OBMCSADWXH(32, 8)
 HBD_OBMCSADWXH(16, 64)
 HBD_OBMCSADWXH(64, 16)
-#endif
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
index 392616af3..571aa770b 100644
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
 
@@ -128,11 +129,9 @@ static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
     return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
   }
 
-#if CONFIG_EXT_PARTITION
 OBMCVARWXH(128, 128)
 OBMCVARWXH(128, 64)
 OBMCVARWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 OBMCVARWXH(64, 64)
 OBMCVARWXH(64, 32)
 OBMCVARWXH(32, 64)
@@ -146,24 +145,17 @@ OBMCVARWXH(8, 8)
 OBMCVARWXH(8, 4)
 OBMCVARWXH(4, 8)
 OBMCVARWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 OBMCVARWXH(4, 16)
 OBMCVARWXH(16, 4)
 OBMCVARWXH(8, 32)
 OBMCVARWXH(32, 8)
 OBMCVARWXH(16, 64)
 OBMCVARWXH(64, 16)
-#if CONFIG_EXT_PARTITION
-OBMCVARWXH(32, 128)
-OBMCVARWXH(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
-#if CONFIG_HIGHBITDEPTH
 static INLINE void hbd_obmc_variance_w4(
     const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
     const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
@@ -278,8 +270,19 @@ static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
   uint64_t sse64 = 0;
   if (w == 4) {
     hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else {
+  } else if (w < 128 || h < 128) {
     hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  } else {
+    assert(w == 128 && h == 128);
+
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+                            64);
+      pre8 += 64 * pre_stride;
+      wsrc += 64 * w;
+      mask += 64 * w;
+      h -= 64;
+    } while (h > 0);
   }
   *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
@@ -291,28 +294,23 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                            unsigned int *sse, int *sum) {
   int64_t sum64 = 0;
   uint64_t sse64 = 0;
-  if (w == 128) {
-    do {
-      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128,
-                            32);
-      pre8 += 32 * pre_stride;
-      wsrc += 32 * 128;
-      mask += 32 * 128;
-      h -= 32;
-    } while (h > 0);
-  } else if (w == 64 && h >= 128) {
-    do {
-      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64,
-                            64);
-      pre8 += 64 * pre_stride;
-      wsrc += 64 * 64;
-      mask += 64 * 64;
-      h -= 64;
-    } while (h > 0);
-  } else if (w == 4) {
+  int max_pel_allowed_per_ovf = 512;
+  if (w == 4) {
     hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else {
+  } else if (w * h <= max_pel_allowed_per_ovf) {
     hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  } else {
+    int h_per_ovf = max_pel_allowed_per_ovf / w;
+
+    assert(max_pel_allowed_per_ovf % w == 0);
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+                            h_per_ovf);
+      pre8 += h_per_ovf * pre_stride;
+      wsrc += h_per_ovf * w;
+      mask += h_per_ovf * w;
+      h -= h_per_ovf;
+    } while (h > 0);
   }
   *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
@@ -347,11 +345,9 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
     return (var >= 0) ? (uint32_t)var : 0;                                 \
   }
 
-#if CONFIG_EXT_PARTITION
 HBD_OBMCVARWXH(128, 128)
 HBD_OBMCVARWXH(128, 64)
 HBD_OBMCVARWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 HBD_OBMCVARWXH(64, 64)
 HBD_OBMCVARWXH(64, 32)
 HBD_OBMCVARWXH(32, 64)
@@ -365,16 +361,9 @@ HBD_OBMCVARWXH(8, 8)
 HBD_OBMCVARWXH(8, 4)
 HBD_OBMCVARWXH(4, 8)
 HBD_OBMCVARWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 HBD_OBMCVARWXH(4, 16)
 HBD_OBMCVARWXH(16, 4)
 HBD_OBMCVARWXH(8, 32)
 HBD_OBMCVARWXH(32, 8)
 HBD_OBMCVARWXH(16, 64)
 HBD_OBMCVARWXH(64, 16)
-#if CONFIG_EXT_PARTITION
-HBD_OBMCVARWXH(32, 128)
-HBD_OBMCVARWXH(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
index 954a95b98..e6b40262d 100644
--- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
@@ -44,16 +44,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova                            m0, [zbinq]              ; m0 = zbin
 
   ; Get DC and first 15 AC coeffs - in this special case, that is all.
-%if CONFIG_HIGHBITDEPTH
   ; coeff stored as 32bit numbers but we process them as 16 bit numbers
   mova                            m9, [coeffq]
   packssdw                        m9, [coeffq+16]          ; m9 = c[i]
   mova                           m10, [coeffq+32]
   packssdw                       m10, [coeffq+48]          ; m10 = c[i]
-%else
-  mova                            m9, [coeffq]             ; m9 = c[i]
-  mova                           m10, [coeffq+16]          ; m10 = c[i]
-%endif
 
   mov                             r0, eobmp                ; Output pointer
   mov                             r1, qcoeffmp             ; Output pointer
@@ -76,15 +71,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   ptest                          m14, m14
   jnz .single_nonzero
 
-%if CONFIG_HIGHBITDEPTH
   mova                       [r1   ], ymm5
   mova                       [r1+32], ymm5
   mova                       [r2   ], ymm5
   mova                       [r2+32], ymm5
-%else
-  mova                          [r1], ymm5
-  mova                          [r2], ymm5
-%endif
   mov                           [r0], word 0
 
   vzeroupper
@@ -124,7 +114,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pand                            m8, m7
   pand                           m13, m12
 
-%if CONFIG_HIGHBITDEPTH
   ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -136,16 +125,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pmovsxwd                       m11, m13
   mova                  [qcoeffq+32], m11
   mova                  [qcoeffq+48], m6
-%else
-  mova                  [qcoeffq   ], m8
-  mova                  [qcoeffq+16], m13
-%endif
 
   pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
   punpckhqdq                      m3, m3
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
 
-%if CONFIG_HIGHBITDEPTH
   ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -157,10 +141,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pmovsxwd                       m11, m13
   mova                 [dqcoeffq+32], m11
   mova                 [dqcoeffq+48], m6
-%else
-  mova                 [dqcoeffq   ], m8
-  mova                 [dqcoeffq+16], m13
-%endif
 
   mova                            m6, [iscanq]            ; m6 = scan[i]
   mova                           m11, [iscanq+16]         ; m11 = scan[i]
@@ -229,29 +209,20 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
 
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
 
-%if CONFIG_HIGHBITDEPTH
+
   lea                         coeffq, [  coeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-%else
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-%endif
+
   lea                         iscanq, [  iscanq+ncoeffq*2]
   neg                        ncoeffq
 
   ; get DC and first 15 AC coeffs
-%if CONFIG_HIGHBITDEPTH
   ; coeff stored as 32bit numbers & require 16bit numbers
   mova                            m9, [coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [coeffq+ncoeffq*4+16]
   mova                           m10, [coeffq+ncoeffq*4+32]
   packssdw                       m10, [coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
 
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
@@ -264,16 +235,10 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   ptest                          m14, m14
   jnz .first_nonzero
 
-%if CONFIG_HIGHBITDEPTH
   mova        [qcoeffq+ncoeffq*4   ], ymm5
   mova        [qcoeffq+ncoeffq*4+32], ymm5
   mova       [dqcoeffq+ncoeffq*4   ], ymm5
   mova       [dqcoeffq+ncoeffq*4+32], ymm5
-%else
-  mova           [qcoeffq+ncoeffq*2], ymm5
-  mova          [dqcoeffq+ncoeffq*2], ymm5
-%endif
-
   add                        ncoeffq, mmsize
 
   punpckhqdq                      m1, m1
@@ -302,7 +267,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pand                            m8, m7
   pand                           m13, m12
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -314,10 +278,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
 
 %ifidn %1, b_32x32
   pabsw                           m8, m8
@@ -333,7 +293,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   psignw                         m13, m10
 %endif
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -345,10 +304,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
 
   pcmpeqw                         m8, m5                    ; m8 = c[i] == 0
   pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
@@ -363,16 +318,11 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
 
 .ac_only_loop:
 
-%if CONFIG_HIGHBITDEPTH
   ; pack coeff from 32bit to 16bit array
   mova                            m9, [coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [coeffq+ncoeffq*4+16]
   mova                           m10, [coeffq+ncoeffq*4+32]
   packssdw                       m10, [coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
 
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
@@ -385,15 +335,11 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   ptest                          m14, m14
   jnz .rest_nonzero
 
-%if CONFIG_HIGHBITDEPTH
   mova        [qcoeffq+ncoeffq*4+ 0], ymm5
   mova        [qcoeffq+ncoeffq*4+32], ymm5
   mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
   mova       [dqcoeffq+ncoeffq*4+32], ymm5
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], ymm5
-  mova       [dqcoeffq+ncoeffq*2+ 0], ymm5
-%endif
+
   add                        ncoeffq, mmsize
   jnz .ac_only_loop
 
@@ -424,7 +370,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pand                           m14, m7
   pand                           m13, m12
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m14
   punpckhwd                       m6, m14, m6
@@ -436,10 +381,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
 
 %ifidn %1, b_32x32
   pabsw                          m14, m14
@@ -454,7 +395,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   psignw                         m13, m10
 %endif
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m14
   punpckhwd                       m6, m14, m6
@@ -466,10 +406,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
 
   pcmpeqw                        m14, m5                    ; m14 = c[i] == 0
   pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
@@ -510,27 +446,16 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
 
 DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
 
-%if CONFIG_HIGHBITDEPTH
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
-
   neg                        ncoeffq
   pxor                            m7, m7
 
 .blank_loop:
-%if CONFIG_HIGHBITDEPTH
   mova       [dqcoeffq+ncoeffq*4+ 0], ymm7
   mova       [dqcoeffq+ncoeffq*4+32], ymm7
   mova        [qcoeffq+ncoeffq*4+ 0], ymm7
   mova        [qcoeffq+ncoeffq*4+32], ymm7
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], ymm7
-  mova        [qcoeffq+ncoeffq*2+ 0], ymm7
-%endif
   add                        ncoeffq, mmsize
   jl .blank_loop
 
@@ -543,5 +468,3 @@ DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
 INIT_XMM avx
 QUANTIZE_FN b, 7
 QUANTIZE_FN b_32x32, 7
-
-END
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
index 0e7f679d0..46b9c7d29 100644
--- a/third_party/aom/aom_dsp/x86/quantize_sse2.c
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -12,7 +12,8 @@
 #include <emmintrin.h>
 #include <xmmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
index 36b4dddbd..e2c1ebb71 100644
--- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -45,7 +45,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
 %endif
   mova                            m3, [r2q]                ; m3 = dequant
-  psubw                           m0, [pw_1]
+  psubw                           m0, [GLOBAL(pw_1)]
   mov                             r2, shiftmp
   mov                             r3, qcoeffmp
   mova                            m4, [r2]                 ; m4 = shift
@@ -56,29 +56,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
-%if CONFIG_HIGHBITDEPTH
   lea                         coeffq, [  coeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-%else
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-%endif
   lea                         iscanq, [  iscanq+ncoeffq*2]
   neg                        ncoeffq
 
   ; get DC and first 15 AC coeffs
-%if CONFIG_HIGHBITDEPTH
   ; coeff stored as 32bit numbers & require 16bit numbers
   mova                            m9, [  coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [  coeffq+ncoeffq*4+16]
   mova                           m10, [  coeffq+ncoeffq*4+32]
   packssdw                       m10, [  coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
@@ -99,7 +88,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                            m8, m7
   pand                           m13, m12
-%if CONFIG_HIGHBITDEPTH
+
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   mova                           m11, m8
   mova                            m6, m8
@@ -117,10 +106,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
+
 %ifidn %1, b_32x32
   pabsw                           m8, m8
   pabsw                          m13, m13
@@ -134,7 +120,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                          m8, m9
   psignw                         m13, m10
 %endif
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   mova                            m11, m8
   mova                            m6, m8
@@ -152,10 +137,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
   pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
   mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
@@ -169,16 +150,12 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   jz .accumulate_eob
 
 .ac_only_loop:
-%if CONFIG_HIGHBITDEPTH
   ; pack coeff from 32bit to 16bit array
   mova                            m9, [  coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [  coeffq+ncoeffq*4+16]
   mova                           m10, [  coeffq+ncoeffq*4+32]
   packssdw                       m10, [  coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
+
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
@@ -201,7 +178,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                           m14, m7
   pand                           m13, m12
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pxor                           m11, m11
   mova                           m11, m14
@@ -220,10 +196,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
+
 %ifidn %1, b_32x32
   pabsw                          m14, m14
   pabsw                          m13, m13
@@ -236,7 +209,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m14, m9
   psignw                         m13, m10
 %endif
-%if CONFIG_HIGHBITDEPTH
+
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   mova                           m11, m14
   mova                            m6, m14
@@ -254,10 +227,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
+
   pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
   mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
@@ -274,7 +244,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %ifidn %1, b_32x32
   jmp .accumulate_eob
 .skip_iter:
-%if CONFIG_HIGHBITDEPTH
   mova        [qcoeffq+ncoeffq*4+ 0], m5
   mova        [qcoeffq+ncoeffq*4+16], m5
   mova        [qcoeffq+ncoeffq*4+32], m5
@@ -283,12 +252,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova       [dqcoeffq+ncoeffq*4+16], m5
   mova       [dqcoeffq+ncoeffq*4+32], m5
   mova       [dqcoeffq+ncoeffq*4+48], m5
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m5
-  mova        [qcoeffq+ncoeffq*2+16], m5
-  mova       [dqcoeffq+ncoeffq*2+ 0], m5
-  mova       [dqcoeffq+ncoeffq*2+16], m5
-%endif
   add                        ncoeffq, mmsize
   jl .ac_only_loop
 %endif
@@ -313,17 +276,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mov                             r2, qcoeffmp
   mov                             r3, eobmp
   DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-%if CONFIG_HIGHBITDEPTH
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
   neg                        ncoeffq
   pxor                            m7, m7
 .blank_loop:
-%if CONFIG_HIGHBITDEPTH
   mova       [dqcoeffq+ncoeffq*4+ 0], m7
   mova       [dqcoeffq+ncoeffq*4+16], m7
   mova       [dqcoeffq+ncoeffq*4+32], m7
@@ -332,12 +289,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova        [qcoeffq+ncoeffq*4+16], m7
   mova        [qcoeffq+ncoeffq*4+32], m7
   mova        [qcoeffq+ncoeffq*4+48], m7
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
-%endif
   add                        ncoeffq, mmsize
   jl .blank_loop
   mov                    word [eobq], 0
diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
index e60f518b4..f662b62b1 100644
--- a/third_party/aom/aom_dsp/x86/sad4d_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
@@ -9,7 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include <immintrin.h>  // AVX2
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
index 2c67f450f..55a856985 100644
--- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
@@ -233,11 +233,9 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
 %endmacro
 
 INIT_XMM sse2
-%if CONFIG_EXT_PARTITION
 SADNXN4D 128, 128
 SADNXN4D 128, 64
 SADNXN4D 64,  128
-%endif
 SADNXN4D 64, 64
 SADNXN4D 64, 32
 SADNXN4D 32, 64
@@ -251,11 +249,9 @@ SADNXN4D  8,  8
 SADNXN4D  8,  4
 SADNXN4D  4,  8
 SADNXN4D  4,  4
-%if CONFIG_EXT_PARTITION_TYPES
 SADNXN4D  4, 16
 SADNXN4D 16,  4
 SADNXN4D  8, 32
 SADNXN4D 32,  8
 SADNXN4D 16, 64
 SADNXN4D 64, 16
-%endif
diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c
index efba61289..a50dba64a 100644
--- a/third_party/aom/aom_dsp/x86/sad_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad_avx2.c
@@ -9,7 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include <immintrin.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 
 #define FSAD64_H(h)                                                           \
diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
index e8dd87a26..b506d4663 100644
--- a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
@@ -11,10 +11,11 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom_ports/mem.h"
 
 // SAD
@@ -360,7 +361,6 @@ unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride,
   return sum;
 }
 
-#if CONFIG_EXT_PARTITION
 static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
                      const uint16_t *sec_ptr, __m256i *sad_acc) {
   __m256i s[8], r[8];
@@ -471,7 +471,6 @@ unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride,
   sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
   return sum;
 }
-#endif  // CONFIG_EXT_PARTITION
 
 // If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
 static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
@@ -649,7 +648,6 @@ unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
   return sum;
 }
 
-#if CONFIG_EXT_PARTITION
 unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
                                            const uint8_t *ref, int ref_stride,
                                            const uint8_t *second_pred) {
@@ -697,19 +695,13 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
                                        second_pred);
   return sum;
 }
-#endif  // CONFIG_EXT_PARTITION
 
 // SAD 4D
 // Combine 4 __m256i vectors to uint32_t result[4]
 static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
                                                uint32_t *res) {
   __m256i u0, u1, u2, u3;
-#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
-  const __m256i mask = _mm256_setr_epi32(UINT32_MAX, 0, UINT32_MAX, 0,
-                                         UINT32_MAX, 0, UINT32_MAX, 0);
-#else
-  const __m256i mask = _mm256_set1_epi64x(UINT32_MAX);
-#endif
+  const __m256i mask = yy_set1_64_from_32i(UINT32_MAX);
   __m128i sad;
 
   // 8 32-bit summation
@@ -967,7 +959,6 @@ void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
   sad_array[3] = first_half[3] + second_half[3];
 }
 
-#if CONFIG_EXT_PARTITION
 void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref_array[],
                                   int ref_stride, uint32_t *sad_array) {
@@ -1045,4 +1036,3 @@ void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
   sad_array[2] = first_half[2] + second_half[2];
   sad_array[3] = first_half[3] + second_half[3];
 }
-#endif  // CONFIG_EXT_PARTITION
diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
index 4419c65b2..c6fd62c9e 100644
--- a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
@@ -10,7 +10,8 @@
  */
 
 #include <immintrin.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
 
 static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride) {
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
index b4cc6abf1..3251b7655 100644
--- a/third_party/aom/aom_dsp/x86/sad_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm
@@ -47,7 +47,6 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
 %endif ; %3 == 7
 %endmacro
 
-%if CONFIG_EXT_PARTITION
 ; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
 ;                                  uint8_t *ref, int ref_stride);
 %macro SAD128XN 1-2 0
@@ -114,7 +113,6 @@ SAD128XN 128     ; sad128x128_sse2
 SAD128XN 128, 1  ; sad128x128_avg_sse2
 SAD128XN 64      ; sad128x64_sse2
 SAD128XN 64, 1   ; sad128x64_avg_sse2
-%endif
 
 
 ; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
@@ -155,18 +153,14 @@ SAD128XN 64, 1   ; sad128x64_avg_sse2
 %endmacro
 
 INIT_XMM sse2
-%if CONFIG_EXT_PARTITION
 SAD64XN 128     ; sad64x128_sse2
 SAD64XN 128, 1  ; sad64x128_avg_sse2
-%endif
 SAD64XN 64 ; sad64x64_sse2
 SAD64XN 32 ; sad64x32_sse2
 SAD64XN 64, 1 ; sad64x64_avg_sse2
 SAD64XN 32, 1 ; sad64x32_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD64XN 16 ; sad64x16_sse2
 SAD64XN 16, 1 ; sad64x16_avg_sse2
-%endif
 
 ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
@@ -212,10 +206,8 @@ SAD32XN 16 ; sad32x16_sse2
 SAD32XN 64, 1 ; sad32x64_avg_sse2
 SAD32XN 32, 1 ; sad32x32_avg_sse2
 SAD32XN 16, 1 ; sad32x16_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD32XN 8 ; sad_32x8_sse2
 SAD32XN 8, 1 ; sad_32x8_avg_sse2
-%endif
 
 ; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -262,12 +254,10 @@ SAD16XN  8 ; sad16x8_sse2
 SAD16XN 32, 1 ; sad16x32_avg_sse2
 SAD16XN 16, 1 ; sad16x16_avg_sse2
 SAD16XN  8, 1 ; sad16x8_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD16XN 4 ; sad_16x4_sse2
 SAD16XN 4, 1 ; sad_16x4_avg_sse2
 SAD16XN 64 ; sad_16x64_sse2
 SAD16XN 64, 1 ; sad_16x64_avg_sse2
-%endif
 
 ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
@@ -312,10 +302,8 @@ SAD8XN  4 ; sad8x4_sse2
 SAD8XN 16, 1 ; sad8x16_avg_sse2
 SAD8XN  8, 1 ; sad8x8_avg_sse2
 SAD8XN  4, 1 ; sad8x4_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD8XN 32 ; sad_8x32_sse2
 SAD8XN 32, 1 ; sad_8x32_avg_sse2
-%endif
 
 ; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
@@ -361,7 +349,5 @@ SAD4XN  8 ; sad4x8_sse
 SAD4XN  4 ; sad4x4_sse
 SAD4XN  8, 1 ; sad4x8_avg_sse
 SAD4XN  4, 1 ; sad4x4_avg_sse
-%if CONFIG_EXT_PARTITION_TYPES
 SAD4XN 16 ; sad_4x16_sse2
 SAD4XN 16, 1 ; sad_4x16_avg_sse2
-%endif
diff --git a/third_party/aom/aom_dsp/x86/sad_sse3.asm b/third_party/aom/aom_dsp/x86/sad_sse3.asm
deleted file mode 100644
index f6c27c855..000000000
--- a/third_party/aom/aom_dsp/x86/sad_sse3.asm
+++ /dev/null
@@ -1,377 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     ref_ptr       rdi
-  %define     ref_stride    rdx
-  %define     end_ptr       rcx
-  %define     ret_var       rbx
-  %define     result_ptr    arg(4)
-  %define     height        dword ptr arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    mov         rsi,        arg(0)              ; src_ptr
-    mov         rdi,        arg(2)              ; ref_ptr
-
-    movsxd      rax,        dword ptr arg(1)    ; src_stride
-    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
-%else
-  %if LIBAOM_YASM_WIN64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     ref_ptr     r8
-    %define     ref_stride  r9
-    %define     end_ptr     r10
-    %define     ret_var     r11
-    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
-    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     ref_ptr     rdx
-    %define     ref_stride  rcx
-    %define     end_ptr     r9
-    %define     ret_var     r10
-    %define     result_ptr  r8
-    %define     height      r8
-  %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
-  %define     src_ptr
-  %define     src_stride
-  %define     ref_ptr
-  %define     ref_stride
-  %define     end_ptr
-  %define     ret_var
-  %define     result_ptr
-  %define     height
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %if LIBAOM_YASM_WIN64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm5,       XMMWORD PTR [%3]
-        lddqu           xmm6,       XMMWORD PTR [%3+1]
-        lddqu           xmm7,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%3+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%4]
-        lddqu           xmm1,       XMMWORD PTR [%3+%5]
-        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%4*2]
-        lea             %3,         [%3+%5*2]
-%endif
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm5,       QWORD PTR [%3]
-        movq            mm6,       QWORD PTR [%3+1]
-        movq            mm7,       QWORD PTR [%3+2]
-
-        psadbw          mm5,       mm0
-        psadbw          mm6,       mm0
-        psadbw          mm7,       mm0
-%else
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm1,       QWORD PTR [%3]
-        movq            mm2,       QWORD PTR [%3+1]
-        movq            mm3,       QWORD PTR [%3+2]
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endif
-        movq            mm0,       QWORD PTR [%2+%4]
-        movq            mm1,       QWORD PTR [%3+%5]
-        movq            mm2,       QWORD PTR [%3+%5+1]
-        movq            mm3,       QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,        [%2+%4*2]
-        lea             %3,        [%3+%5*2]
-%endif
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endmacro
-
-;void int aom_sad16x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x16x3_sse3) PRIVATE
-sym(aom_sad16x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad16x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x8x3_sse3) PRIVATE
-sym(aom_sad16x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad8x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad8x16x3_sse3) PRIVATE
-sym(aom_sad8x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad8x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad8x8x3_sse3) PRIVATE
-sym(aom_sad8x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad4x4x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad4x4x3_sse3) PRIVATE
-sym(aom_sad4x4x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [ref_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [ref_ptr+1]
-        movd            mm5,        DWORD PTR [ref_ptr+2]
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [ref_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm6
-
-        movd            mm3,        DWORD PTR [ref_ptr+1]
-        movd            mm7,        DWORD PTR [ref_ptr+2]
-
-        psadbw          mm2,        mm0
-
-        paddw           mm1,        mm2
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm6
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm1,        mm3
-
-        movq            [rcx],      mm1
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
diff --git a/third_party/aom/aom_dsp/x86/sad_sse4.asm b/third_party/aom/aom_dsp/x86/sad_sse4.asm
deleted file mode 100644
index 5e9c75845..000000000
--- a/third_party/aom/aom_dsp/x86/sad_sse4.asm
+++ /dev/null
@@ -1,362 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm1,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm1,       xmm2
-        paddw           xmm1,       xmm3
-        paddw           xmm1,       xmm4
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm1,       xmm2
-%else
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endif
-        movq            xmm0,       MMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
-        movd            xmm0,       [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        mpsadbw         xmm1,       xmm0,  0x0
-%else
-        movd            xmm0,       [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endif
-        movd            xmm0,       [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro WRITE_AS_INTS 0
-    mov             rdi,        arg(4)           ;Results
-    pxor            xmm0, xmm0
-    movdqa          xmm2, xmm1
-    punpcklwd       xmm1, xmm0
-    punpckhwd       xmm2, xmm0
-
-    movdqa          [rdi],    xmm1
-    movdqa          [rdi + 16],    xmm2
-%endmacro
-
-;void aom_sad16x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array);
-global sym(aom_sad16x16x8_sse4_1) PRIVATE
-sym(aom_sad16x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad16x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad16x8x8_sse4_1) PRIVATE
-sym(aom_sad16x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad8x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad8x8x8_sse4_1) PRIVATE
-sym(aom_sad8x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad8x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad8x16x8_sse4_1) PRIVATE
-sym(aom_sad8x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad4x4x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad4x4x8_sse4_1) PRIVATE
-sym(aom_sad4x4x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_4X2X8 1
-    PROCESS_4X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
diff --git a/third_party/aom/aom_dsp/x86/sad_ssse3.asm b/third_party/aom/aom_dsp/x86/sad_ssse3.asm
deleted file mode 100644
index 96b64b040..000000000
--- a/third_party/aom/aom_dsp/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,373 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm5,       XMMWORD PTR [rdi]
-        lddqu           xmm6,       XMMWORD PTR [rdi+1]
-        lddqu           xmm7,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm1,       XMMWORD PTR [rdi]
-        lddqu           xmm2,       XMMWORD PTR [rdi+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
-        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm7,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm5,       xmm7
-        palignr         xmm5,       xmm4,       %2
-
-        movdqa          xmm6,       xmm7
-        palignr         xmm6,       xmm4,       (%2+1)
-
-        palignr         xmm7,       xmm4,       (%2+2)
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm3,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
-        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-;void int aom_sad16x16x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x16x3_ssse3) PRIVATE
-sym(aom_sad16x16x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .aom_sad16x16x3_ssse3_skiptable
-.aom_sad16x16x3_ssse3_jumptable:
-        dd .aom_sad16x16x3_ssse3_aligned_by_0  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_1  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_2  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_3  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_4  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_5  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_6  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_7  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_8  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_9  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_10 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_11 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_12 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_13 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_14 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_15 - .aom_sad16x16x3_ssse3_do_jump
-.aom_sad16x16x3_ssse3_skiptable:
-
-        call .aom_sad16x16x3_ssse3_do_jump
-.aom_sad16x16x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .aom_sad16x16x3_ssse3_jumptable - .aom_sad16x16x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of aom_sad16x16x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X16X3_OFFSET 0,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, .aom_sad16x16x3_ssse3
-
-.aom_sad16x16x3_ssse3_aligned_by_15:
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.aom_sad16x16x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void int aom_sad16x8x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x8x3_ssse3) PRIVATE
-sym(aom_sad16x8x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .aom_sad16x8x3_ssse3_skiptable
-.aom_sad16x8x3_ssse3_jumptable:
-        dd .aom_sad16x8x3_ssse3_aligned_by_0  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_1  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_2  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_3  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_4  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_5  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_6  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_7  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_8  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_9  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_10 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_11 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_12 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_13 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_14 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_15 - .aom_sad16x8x3_ssse3_do_jump
-.aom_sad16x8x3_ssse3_skiptable:
-
-        call .aom_sad16x8x3_ssse3_do_jump
-.aom_sad16x8x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .aom_sad16x8x3_ssse3_jumptable - .aom_sad16x8x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of aom_sad16x8x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X8X3_OFFSET 0,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, .aom_sad16x8x3_ssse3
-
-.aom_sad16x8x3_ssse3_aligned_by_15:
-
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.aom_sad16x8x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
index aa70106c8..6d9b5a12f 100644
--- a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
@@ -47,6 +47,9 @@
         paddd           %1, xmm1
         SUM_ACROSS_Q    %1
 %endmacro
+
+SECTION .text
+
 ;void ssim_parms_sse2(
 ;    unsigned char *s,
 ;    int sp,
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
index d3feb7ec0..45bf6ec3c 100644
--- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
@@ -117,27 +117,26 @@ SECTION .text
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
 ; difference on Win64
 
-%ifdef PIC    ; 64bit PIC
+%if ARCH_X86_64
   %if %2 == 1 ; avg
     cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                      x_offset, y_offset, \
-                                      dst, dst_stride, \
-                                      sec, sec_stride, height, sse
+                                        x_offset, y_offset, dst, dst_stride, \
+                                        sec, sec_stride, height, sse
     %define sec_str sec_strideq
   %else
-    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
-                                  y_offset, dst, dst_stride, height, sse
+    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+                                    x_offset, y_offset, dst, dst_stride, \
+                                    height, sse
   %endif
   %define block_height heightd
   %define bilin_filter sseq
 %else
-  %if ARCH_X86=1 && CONFIG_PIC=1
+  %if CONFIG_PIC=1
     %if %2 == 1 ; avg
       cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                  x_offset, y_offset, \
-                                  dst, dst_stride, \
-                                  sec, sec_stride, \
-                                  height, sse, g_bilin_filter, g_pw_8
+                                          x_offset, y_offset, dst, dst_stride, \
+                                          sec, sec_stride, height, sse, \
+                                          g_bilin_filter, g_pw_8
       %define block_height dword heightm
       %define sec_str sec_stridemp
 
@@ -155,9 +154,9 @@ SECTION .text
 
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
-                                y_offset, dst, dst_stride, height, sse, \
-                                g_bilin_filter, g_pw_8
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                      x_offset, y_offset, dst, dst_stride, \
+                                      height, sse, g_bilin_filter, g_pw_8
       %define block_height heightd
 
       ;Store bilin_filter and pw_8 location in stack
@@ -176,25 +175,18 @@ SECTION .text
     %endif
   %else
     %if %2 == 1 ; avg
-      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-                                             x_offset, y_offset, \
-                                             dst, dst_stride, \
-                                             sec, sec_stride, \
-                                             height, sse
-      %if ARCH_X86_64
-      %define block_height heightd
-      %define sec_str sec_strideq
-      %else
+      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                          x_offset, y_offset, \
+                                          dst, dst_stride, sec, sec_stride, \
+                                          height, sse
       %define block_height dword heightm
       %define sec_str sec_stridemp
-      %endif
     %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
-                              y_offset, dst, dst_stride, height, sse
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                      x_offset, y_offset, dst, dst_stride, \
+                                      height, sse
       %define block_height heightd
     %endif
-
     %define bilin_filter bilin_filter_m
   %endif
 %endif
@@ -374,8 +366,8 @@ SECTION .text
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -383,7 +375,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -400,7 +392,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -697,8 +689,8 @@ SECTION .text
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -706,7 +698,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -723,7 +715,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -855,8 +847,8 @@ SECTION .text
   jnz .x_nonhalf_y_nonzero
 
   ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -864,7 +856,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -881,7 +873,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -997,8 +989,8 @@ SECTION .text
   jne .x_nonhalf_y_nonhalf
 
   ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -1006,7 +998,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -1023,7 +1015,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -1195,8 +1187,8 @@ SECTION .text
   STORE_AND_RET %1
 
 .x_nonhalf_y_nonhalf:
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
   shl           y_offsetd, filter_idx_shift
@@ -1209,7 +1201,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                m11, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m12, [pw_8]
+  mova                m12, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_y_a m10
@@ -1237,7 +1229,7 @@ SECTION .text
 %define filter_x_b [x_offsetq+16]
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
index 7bd5b23ad..1a75a234f 100644
--- a/third_party/aom/aom_dsp/x86/subtract_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
@@ -34,10 +34,8 @@ cglobal subtract_block, 7, 7, 8, \
   je .case_16
   cmp                colsd, 32
   je .case_32
-%if CONFIG_EXT_PARTITION
   cmp                colsd, 64
   je .case_64
-%endif
 
 %macro loop16 6
   mova                  m0, [srcq+%1]
@@ -62,7 +60,6 @@ cglobal subtract_block, 7, 7, 8, \
   mova [diffq+mmsize*1+%6], m1
 %endmacro
 
-%if CONFIG_EXT_PARTITION
   mov             pred_str, pred_stridemp
 .loop_128:
   loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
@@ -77,7 +74,6 @@ cglobal subtract_block, 7, 7, 8, \
   RET
 
 .case_64:
-%endif
   mov             pred_str, pred_stridemp
 .loop_64:
   loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
index 6be99fbca..a79f22d79 100644
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
@@ -14,35 +14,62 @@
 #include <stdio.h>
 
 #include "aom_dsp/x86/synonyms.h"
+#include "config/aom_dsp_rtcd.h"
 
-#include "./aom_dsp_rtcd.h"
+static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
+  const __m128d ad = _mm_castsi128_pd(a);
+  return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
+}
+
+static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
+#if ARCH_X86_64
+  return (uint64_t)_mm_cvtsi128_si64(a);
+#else
+  {
+    uint64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, a);
+    return tmp;
+  }
+#endif
+}
+
+static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
+  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+  const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+  const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+
+  return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+}
 
 static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
                                                 int stride) {
-  const __m128i v_val_0_w =
-      _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
-  const __m128i v_val_1_w =
-      _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
-  const __m128i v_val_2_w =
-      _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
-  const __m128i v_val_3_w =
-      _mm_loadl_epi64((const __m128i *)(src + 3 * stride));
-
-  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
-  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
-  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
-  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-
-  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
-  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-
-  const __m128i v_sum_d =
+  const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
+  __m128i v_sum_d =
       _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
-
+  v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
   return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
 }
 
+static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                                int height) {
+  int r = 0;
+  __m128i v_acc_q = _mm_setzero_si128();
+  do {
+    const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
+    v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
+    src += stride << 2;
+    r += 4;
+  } while (r < height);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+  __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
+                                   _mm_and_si128(v_acc_q, v_zext_mask_q));
+  v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
+  return xx_cvtsi128_si64(v_acc_64);
+}
+
 #ifdef __GNUC__
 // This prevents GCC/Clang from inlining this function into
 // aom_sum_squares_2d_i16_sse2, which in turn saves some stack
@@ -52,72 +79,45 @@ __attribute__((noinline))
 static uint64_t
 aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
                                 int height) {
-  int r, c;
+  int r = 0;
 
-  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
   __m128i v_acc_q = _mm_setzero_si128();
 
-  for (r = 0; r < height; r += 8) {
+  do {
     __m128i v_acc_d = _mm_setzero_si128();
-
-    for (c = 0; c < width; c += 8) {
+    int c = 0;
+    do {
       const int16_t *b = src + c;
 
-      const __m128i v_val_0_w =
-          _mm_load_si128((const __m128i *)(b + 0 * stride));
-      const __m128i v_val_1_w =
-          _mm_load_si128((const __m128i *)(b + 1 * stride));
-      const __m128i v_val_2_w =
-          _mm_load_si128((const __m128i *)(b + 2 * stride));
-      const __m128i v_val_3_w =
-          _mm_load_si128((const __m128i *)(b + 3 * stride));
-      const __m128i v_val_4_w =
-          _mm_load_si128((const __m128i *)(b + 4 * stride));
-      const __m128i v_val_5_w =
-          _mm_load_si128((const __m128i *)(b + 5 * stride));
-      const __m128i v_val_6_w =
-          _mm_load_si128((const __m128i *)(b + 6 * stride));
-      const __m128i v_val_7_w =
-          _mm_load_si128((const __m128i *)(b + 7 * stride));
+      const __m128i v_val_0_w = xx_load_128(b + 0 * stride);
+      const __m128i v_val_1_w = xx_load_128(b + 1 * stride);
+      const __m128i v_val_2_w = xx_load_128(b + 2 * stride);
+      const __m128i v_val_3_w = xx_load_128(b + 3 * stride);
 
       const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
       const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
       const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
       const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
-      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
-      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
-      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
 
       const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
       const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
-      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
 
       const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
 
       v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
-      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
-    }
+      c += 8;
+    } while (c < width);
 
     v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
     v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
 
-    src += 8 * stride;
-  }
+    src += 4 * stride;
+    r += 4;
+  } while (r < height);
 
   v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
-
-#if ARCH_X86_64
-  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
-#else
-  {
-    uint64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
-    return tmp;
-  }
-#endif
+  return xx_cvtsi128_si64(v_acc_q);
 }
 
 uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
@@ -127,7 +127,9 @@ uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
   // are with size == 4, so it is also the common case.
   if (LIKELY(width == 4 && height == 4)) {
     return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
-  } else if (LIKELY(width % 8 == 0 && height % 8 == 0)) {
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+  } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
     // Generic case
     return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
   } else {
@@ -140,7 +142,7 @@ uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
 //////////////////////////////////////////////////////////////////////////////
 
 static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
-  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
   __m128i v_acc0_q = _mm_setzero_si128();
   __m128i v_acc1_q = _mm_setzero_si128();
 
@@ -185,16 +187,7 @@ static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
 
   v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
   v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
-
-#if ARCH_X86_64
-  return (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
-#else
-  {
-    uint64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, v_acc0_q);
-    return tmp;
-  }
-#endif
+  return xx_cvtsi128_si64(v_acc0_q);
 }
 
 uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
index cd049a454..d9a53fcc5 100644
--- a/third_party/aom/aom_dsp/x86/synonyms.h
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -14,7 +14,8 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 /**
@@ -58,6 +59,28 @@ static INLINE void xx_storeu_128(void *const a, const __m128i v) {
   _mm_storeu_si128((__m128i *)a, v);
 }
 
+// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set_epi64x()
+// acting on 32-bit integers.
+static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  return _mm_set_epi32(0, e1, 0, e0);
+#else
+  return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
+#endif
+}
+
+// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set1_epi64x()
+// acting on a 32-bit integer.
+static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  return _mm_set_epi32(0, a, 0, a);
+#else
+  return _mm_set1_epi64x((uint32_t)a);
+#endif
+}
+
 static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
   return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
 }
@@ -89,4 +112,12 @@ static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
   return _mm_srai_epi32(v_tmp_d, bits);
 }
 
+static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
+  const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
+  const __m128i v_tmp_d =
+      _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
+  return _mm_srai_epi16(v_tmp_d, bits);
+}
+
 #endif  // AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
new file mode 100644
index 000000000..39f371fc9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_SYNONYMS_AVX2_H_
+#define AOM_DSP_X86_SYNONYMS_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m256i yy_load_256(const void *a) {
+  return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE __m256i yy_loadu_256(const void *a) {
+  return _mm256_loadu_si256((const __m256i *)a);
+}
+
+static INLINE void yy_store_256(void *const a, const __m256i v) {
+  _mm256_store_si256((__m256i *)a, v);
+}
+
+static INLINE void yy_storeu_256(void *const a, const __m256i v) {
+  _mm256_storeu_si256((__m256i *)a, v);
+}
+
+// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm256_set1_epi64x()
+// acting on a 32-bit integer.
+static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+  return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
+#else
+  return _mm256_set1_epi64x((uint32_t)a);
+#endif
+}
+
+// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
+// therefore define an equivalent function using a different intrinsic.
+// ([ hi ], [ lo ]) -> [ hi ][ lo ]
+static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+#endif  // AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
new file mode 100644
index 000000000..f88a1527d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#define AOM_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+  // Unpack 32 bit elements resulting in:
+  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
+  return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+                                      __m128i *const out) {
+  // Unpack 8 bit elements. Goes from:
+  // in[0]: 00 01 02 03 04 05 06 07
+  // in[1]: 10 11 12 13 14 15 16 17
+  // in[2]: 20 21 22 23 24 25 26 27
+  // in[3]: 30 31 32 33 34 35 36 37
+  // in[4]: 40 41 42 43 44 45 46 47
+  // in[5]: 50 51 52 53 54 55 56 57
+  // in[6]: 60 61 62 63 64 65 66 67
+  // in[7]: 70 71 72 73 74 75 76 77
+  // to:
+  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+  // Unpack 16 bit elements resulting in:
+  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  // b1: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
+  // b2: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
+  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+  // Unpack 32 bit elements resulting in:
+  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
+  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
+  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
+  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
+  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
+  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
+  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30 40 50 60 70
+  // out[1]: 01 11 21 31 41 51 61 71
+  // out[2]: 02 12 22 32 42 52 62 72
+  // out[3]: 03 13 23 33 43 53 63 73
+  // out[4]: 04 14 24 34 44 54 64 74
+  // out[5]: 05 15 25 35 45 55 65 75
+  // out[6]: 06 16 26 36 46 56 66 76
+  // out[7]: 07 17 27 37 47 57 67 77
+  out[0] = _mm_unpacklo_epi64(c0, c0);
+  out[1] = _mm_unpackhi_epi64(c0, c0);
+  out[2] = _mm_unpacklo_epi64(c1, c1);
+  out[3] = _mm_unpackhi_epi64(c1, c1);
+  out[4] = _mm_unpacklo_epi64(c2, c2);
+  out[5] = _mm_unpackhi_epi64(c2, c2);
+  out[6] = _mm_unpacklo_epi64(c3, c3);
+  out[7] = _mm_unpackhi_epi64(c3, c3);
+}
+
+static INLINE void transpose_16bit_4x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  XX XX XX XX
+  // in[1]: 10 11 12 13  XX XX XX XX
+  // in[2]: 20 21 22 23  XX XX XX XX
+  // in[3]: 30 31 32 33  XX XX XX XX
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+  // Unpack 32 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  out[0] = _mm_unpacklo_epi32(a0, a1);
+  out[1] = _mm_srli_si128(out[0], 8);
+  out[2] = _mm_unpackhi_epi32(a0, a1);
+  out[3] = _mm_srli_si128(out[2], 8);
+}
+
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  XX XX XX XX
+  // in[1]: 10 11 12 13  XX XX XX XX
+  // in[2]: 20 21 22 23  XX XX XX XX
+  // in[3]: 30 31 32 33  XX XX XX XX
+  // in[4]: 40 41 42 43  XX XX XX XX
+  // in[5]: 50 51 52 53  XX XX XX XX
+  // in[6]: 60 61 62 63  XX XX XX XX
+  // in[7]: 70 71 72 73  XX XX XX XX
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 02 12 22 32  03 13 23 33
+  // b3: 42 52 62 72  43 53 63 73
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 02 12 22 32  42 52 62 72
+  // out[3]: 03 13 23 33  43 53 63 73
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b2, b3);
+  out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+static INLINE void transpose_16bit_8x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  04 05 06 07
+  // in[1]: 10 11 12 13  14 15 16 17
+  // in[2]: 20 21 22 23  24 25 26 27
+  // in[3]: 30 31 32 33  34 35 36 37
+
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a4:    04 14 05 15  06 16 07 17
+  // a5:    24 34 25 35  26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b2: 04 14 24 34  05 15 25 35
+  // b4: 02 12 22 32  03 13 23 33
+  // b6: 06 16 26 36  07 17 27 37
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  XX XX XX XX
+  // out[1]: 01 11 21 31  XX XX XX XX
+  // out[2]: 02 12 22 32  XX XX XX XX
+  // out[3]: 03 13 23 33  XX XX XX XX
+  // out[4]: 04 14 24 34  XX XX XX XX
+  // out[5]: 05 15 25 35  XX XX XX XX
+  // out[6]: 06 16 26 36  XX XX XX XX
+  // out[7]: 07 17 27 37  XX XX XX XX
+  const __m128i zeros = _mm_setzero_si128();
+  out[0] = _mm_unpacklo_epi64(b0, zeros);
+  out[1] = _mm_unpackhi_epi64(b0, zeros);
+  out[2] = _mm_unpacklo_epi64(b4, zeros);
+  out[3] = _mm_unpackhi_epi64(b4, zeros);
+  out[4] = _mm_unpacklo_epi64(b2, zeros);
+  out[5] = _mm_unpackhi_epi64(b2, zeros);
+  out[6] = _mm_unpacklo_epi64(b6, zeros);
+  out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
+static INLINE void transpose_16bit_8x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  04 05 06 07
+  // in[1]: 10 11 12 13  14 15 16 17
+  // in[2]: 20 21 22 23  24 25 26 27
+  // in[3]: 30 31 32 33  34 35 36 37
+  // in[4]: 40 41 42 43  44 45 46 47
+  // in[5]: 50 51 52 53  54 55 56 57
+  // in[6]: 60 61 62 63  64 65 66 67
+  // in[7]: 70 71 72 73  74 75 76 77
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  // a4:    04 14 05 15  06 16 07 17
+  // a5:    24 34 25 35  26 36 27 37
+  // a6:    44 54 45 55  46 56 47 57
+  // a7:    64 74 65 75  66 76 67 77
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 04 14 24 34  05 15 25 35
+  // b3: 44 54 64 74  45 55 65 75
+  // b4: 02 12 22 32  03 13 23 33
+  // b5: 42 52 62 72  43 53 63 73
+  // b6: 06 16 26 36  07 17 27 37
+  // b7: 46 56 66 76  47 57 67 77
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 02 12 22 32  42 52 62 72
+  // out[3]: 03 13 23 33  43 53 63 73
+  // out[4]: 04 14 24 34  44 54 64 74
+  // out[5]: 05 15 25 35  45 55 65 75
+  // out[6]: 06 16 26 36  46 56 66 76
+  // out[7]: 07 17 27 37  47 57 67 77
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b4, b5);
+  out[3] = _mm_unpackhi_epi64(b4, b5);
+  out[4] = _mm_unpacklo_epi64(b2, b3);
+  out[5] = _mm_unpackhi_epi64(b2, b3);
+  out[6] = _mm_unpacklo_epi64(b6, b7);
+  out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
+// Transpose in-place
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+                                         __m128i *const right) {
+  __m128i tbuf[8];
+  transpose_16bit_8x8(left, left);
+  transpose_16bit_8x8(right, tbuf);
+  transpose_16bit_8x8(left + 8, right);
+  transpose_16bit_8x8(right + 8, right + 8);
+
+  left[8] = tbuf[0];
+  left[9] = tbuf[1];
+  left[10] = tbuf[2];
+  left[11] = tbuf[3];
+  left[12] = tbuf[4];
+  left[13] = tbuf[5];
+  left[14] = tbuf[6];
+  left[15] = tbuf[7];
+}
+
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // to:
+  // a0:    00 10 01 11
+  // a1:    20 30 21 31
+  // a2:    02 12 03 13
+  // a3:    22 32 23 33
+
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+}
+
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+                                         __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // in[4]: 04 05 06 07
+  // in[5]: 14 15 16 17
+  // in[6]: 24 25 26 27
+  // in[7]: 34 35 36 37
+  // to:
+  // a0:    00 10 01 11
+  // a1:    20 30 21 31
+  // a2:    02 12 03 13
+  // a3:    22 32 23 33
+  // a4:    04 14 05 15
+  // a5:    24 34 25 35
+  // a6:    06 16 07 17
+  // a7:    26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  // out[4]: 04 14 24 34
+  // out[5]: 05 15 25 35
+  // out[6]: 06 16 26 36
+  // out[7]: 07 17 27 37
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+  out[4] = _mm_unpacklo_epi64(a4, a5);
+  out[5] = _mm_unpackhi_epi64(a4, a5);
+  out[6] = _mm_unpacklo_epi64(a6, a7);
+  out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+static INLINE void transpose_32bit_8x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 04 05 06 07
+  // in[2]: 10 11 12 13
+  // in[3]: 14 15 16 17
+  // in[4]: 20 21 22 23
+  // in[5]: 24 25 26 27
+  // in[6]: 30 31 32 33
+  // in[7]: 34 35 36 37
+  // to:
+  // a0: 00 10 01 11
+  // a1: 20 30 21 31
+  // a2: 02 12 03 13
+  // a3: 22 32 23 33
+  // a4: 04 14 05 15
+  // a5: 24 34 25 35
+  // a6: 06 16 07 17
+  // a7: 26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
+  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
+  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
+  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
+  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  // out[4]: 04 14 24 34
+  // out[5]: 05 15 25 35
+  // out[6]: 06 16 26 36
+  // out[7]: 07 17 27 37
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+  out[4] = _mm_unpacklo_epi64(a4, a5);
+  out[5] = _mm_unpackhi_epi64(a4, a5);
+  out[6] = _mm_unpacklo_epi64(a6, a7);
+  out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+#endif  // AOM_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
deleted file mode 100644
index 1a8fed710..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H
-#define AOM_DSP_X86_TXFM_COMMON_AVX2_H
-
-#include <immintrin.h>
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/common_avx2.h"
-
-#define pair256_set_epi16(a, b)                                            \
-  _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
-
-#define pair256_set_epi32(a, b)                                                \
-  _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
-                   (int)(b), (int)(a))
-
-static INLINE void mm256_reverse_epi16(__m256i *u) {
-  const __m256i control = _mm256_set_epi16(
-      0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
-      0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
-  __m256i v = _mm256_shuffle_epi8(*u, control);
-  *u = _mm256_permute2x128_si256(v, v, 1);
-}
-
-static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1,
-                                 const __m256i *cospi) {
-  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  __m256i y0 = _mm256_madd_epi16(*a0, *cospi);
-  __m256i y1 = _mm256_madd_epi16(*a1, *cospi);
-
-  y0 = _mm256_add_epi32(y0, dct_rounding);
-  y1 = _mm256_add_epi32(y1, dct_rounding);
-  y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
-  y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
-
-  return _mm256_packs_epi32(y0, y1);
-}
-
-static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
-  const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  __m256i u0, u1;
-  int i = 0;
-
-  while (i < 16) {
-    in[i] = _mm256_slli_epi16(in[i], 1);
-
-    u0 = _mm256_unpacklo_epi16(zero, in[i]);
-    u1 = _mm256_unpackhi_epi16(zero, in[i]);
-
-    u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
-    u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
-
-    u0 = _mm256_add_epi32(u0, dct_const_rounding);
-    u1 = _mm256_add_epi32(u1, dct_const_rounding);
-
-    u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
-    u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
-    in[i] = _mm256_packs_epi32(u0, u1);
-    i++;
-  }
-}
-
-#endif  // AOM_DSP_X86_TXFM_COMMON_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h
deleted file mode 100644
index 4e6eecd32..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
-#define _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
-
-// Note:
-//  This header file should be put below any x86 intrinsics head file
-
-static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-  if (sizeof(tran_low_t) == 4) {
-    const __m128i zero = _mm_setzero_si128();
-    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-    _mm_storeu_si128((__m128i *)(dst_ptr), out0);
-    _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
-  } else {
-    _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
-  }
-}
-
-#endif  // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
index 4257d8b9c..58a792424 100644
--- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
@@ -16,17 +16,8 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/synonyms.h"
 
-#define pair_set_epi16(a, b)                                            \
-  _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
-
-#define dual_set_epi16(a, b)                                            \
-  _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
-                (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
-
-#define octa_set_epi16(a, b, c, d, e, f, g, h)                           \
-  _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
-                 (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+#define pair_set_epi16(a, b) \
+  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
 
 // Reverse the 8 16 bit words in __m128i
 static INLINE __m128i mm_reverse_epi16(const __m128i x) {
@@ -35,292 +26,4 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) {
   return _mm_shuffle_epi32(b, 0x4e);
 }
 
-#if CONFIG_EXT_TX
-// Identity transform (both forward and inverse).
-static INLINE void idtx16_8col(__m128i *in) {
-  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
-  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
-
-  in[0] = _mm_slli_epi16(in[0], 1);
-  in[1] = _mm_slli_epi16(in[1], 1);
-  in[2] = _mm_slli_epi16(in[2], 1);
-  in[3] = _mm_slli_epi16(in[3], 1);
-  in[4] = _mm_slli_epi16(in[4], 1);
-  in[5] = _mm_slli_epi16(in[5], 1);
-  in[6] = _mm_slli_epi16(in[6], 1);
-  in[7] = _mm_slli_epi16(in[7], 1);
-  in[8] = _mm_slli_epi16(in[8], 1);
-  in[9] = _mm_slli_epi16(in[9], 1);
-  in[10] = _mm_slli_epi16(in[10], 1);
-  in[11] = _mm_slli_epi16(in[11], 1);
-  in[12] = _mm_slli_epi16(in[12], 1);
-  in[13] = _mm_slli_epi16(in[13], 1);
-  in[14] = _mm_slli_epi16(in[14], 1);
-  in[15] = _mm_slli_epi16(in[15], 1);
-
-  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
-  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
-  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
-  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
-  v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
-  v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
-  v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
-  v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
-
-  u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
-  u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
-  u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
-  u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
-  u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
-  u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
-  u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
-  u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
-
-  x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
-  x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
-  x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
-  x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
-  x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
-  x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
-  x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
-  x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
-
-  y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
-  y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
-  y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
-  y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
-  y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
-  y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
-  y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
-  y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
-
-  v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
-  v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
-  v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
-  v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
-  v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
-  v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
-  v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
-  v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
-
-  x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
-  x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
-  x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
-  x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
-  x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
-  x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
-  x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
-  x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
-
-  u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
-  u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
-  u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
-  u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
-  u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
-  u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
-  u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
-  u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
-
-  y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
-  y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
-  y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
-  y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
-  y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
-  y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
-  y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
-  y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
-
-  v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
-  x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
-  x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
-  x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
-  x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
-  x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
-  x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
-  x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-  y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
-  y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
-  y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
-  y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
-  y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
-  y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
-  y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
-  y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
-  x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
-  x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
-  x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
-  x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
-  x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
-  x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
-  x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
-
-  u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
-  y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
-  y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
-  y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
-  y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
-  y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
-  y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
-  y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(v0, x0);
-  in[1] = _mm_packs_epi32(v1, x1);
-  in[2] = _mm_packs_epi32(v2, x2);
-  in[3] = _mm_packs_epi32(v3, x3);
-  in[4] = _mm_packs_epi32(v4, x4);
-  in[5] = _mm_packs_epi32(v5, x5);
-  in[6] = _mm_packs_epi32(v6, x6);
-  in[7] = _mm_packs_epi32(v7, x7);
-
-  in[8] = _mm_packs_epi32(u0, y0);
-  in[9] = _mm_packs_epi32(u1, y1);
-  in[10] = _mm_packs_epi32(u2, y2);
-  in[11] = _mm_packs_epi32(u3, y3);
-  in[12] = _mm_packs_epi32(u4, y4);
-  in[13] = _mm_packs_epi32(u5, y5);
-  in[14] = _mm_packs_epi32(u6, y6);
-  in[15] = _mm_packs_epi32(u7, y7);
-}
-#endif  // CONFIG_EXT_TX
-
-static INLINE void scale_sqrt2_8x4(__m128i *in) {
-  // Implements ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS), for 32
-  // consecutive elements.
-  const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
-
-  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
-  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
-  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
-  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
-  const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
-  const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
-  const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
-  const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
-
-  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
-  const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
-
-  in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
-  in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
-  in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
-  in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
-}
-
-static INLINE void scale_sqrt2_8x8(__m128i *in) {
-  // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
-  // for each element.
-  const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
-
-  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
-  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
-  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
-  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
-  const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
-  const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
-  const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
-  const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
-  const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
-  const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
-  const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
-  const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
-  const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
-  const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
-  const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
-  const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);
-
-  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
-  const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
-  const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
-  const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
-  const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
-  const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
-  const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
-  const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
-  const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
-  const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
-
-  in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
-  in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
-  in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
-  in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
-  in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS));
-  in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS));
-  in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS));
-  in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
-}
-
-static INLINE void scale_sqrt2_8x16(__m128i *in) {
-  scale_sqrt2_8x8(in);
-  scale_sqrt2_8x8(in + 8);
-}
-
 #endif  // AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
index 18a70dffe..7d6b7d287 100644
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -10,109 +10,224 @@
  */
 
 #include <immintrin.h>
-#include "./aom_dsp_rtcd.h"
-
-typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride,
-                             unsigned int *sse, int *sum);
-
-void aom_get32x32var_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, unsigned int *sse,
-                          int *sum);
-
-static void variance_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, int w, int h,
-                          unsigned int *sse, int *sum, get_var_avx2 var_fn,
-                          int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += 16) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j],
-             ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
-  }
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+
+static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
+  return _mm_add_epi16(_mm256_castsi256_si128(val),
+                       _mm256_extractf128_si256(val, 1));
 }
 
-unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
-                aom_get16x16var_avx2, 16);
+static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
+  return _mm_add_epi32(_mm256_castsi256_si128(val),
+                       _mm256_extractf128_si256(val, 1));
+}
+
+static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
+                                        __m256i *const sse,
+                                        __m256i *const sum) {
+  const __m256i adj_sub = _mm256_set1_epi16(0xff01);  // (1,-1)
+
+  // unpack into pairs of source and reference values
+  const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
+  const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
+
+  // subtract adjacent elements using src*1 + ref*-1
+  const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
+  const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
+  const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+  const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
 
-  variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
-  _mm256_zeroupper();
-  return variance;
+  // add to the running totals
+  *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
+  *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
 }
 
-unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  int sum;
-  aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
-  _mm256_zeroupper();
-  return *sse;
+static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
+                                                     unsigned int *const sse) {
+  // extract the low lane and add it to the high lane
+  const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
+
+  // unpack sse and sum registers and add
+  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
+  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
+  const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+  // perform the final summation and extract the results
+  const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
+  *((int *)sse) = _mm_cvtsi128_si32(res);
+  return _mm_extract_epi32(res, 1);
+}
+
+// handle pixels (<= 512)
+static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
+                                          unsigned int *const sse) {
+  // extract the low lane and add it to the high lane
+  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+  const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
+  const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
+  return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
+}
+
+// handle 1024 pixels (32x32, 16x64, 64x16)
+static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
+                                           unsigned int *const sse) {
+  // extract the low lane and add it to the high lane
+  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+  const __m128i vsum_64 =
+      _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
+                    _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
+  return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
+}
+
+static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
+  const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
+  const __m256i sum_hi =
+      _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
+  return _mm256_add_epi32(sum_lo, sum_hi);
+}
+
+// handle 2048 pixels (32x64, 64x32)
+static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
+                                           unsigned int *const sse) {
+  vsum = sum_to_32bit_avx2(vsum);
+  const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
+  return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
+}
+
+static INLINE void variance16_kernel_avx2(
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    const int ref_stride, __m256i *const sse, __m256i *const sum) {
+  const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+  const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+  const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
+  const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
+  const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+  const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
+  variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance32_kernel_avx2(const uint8_t *const src,
+                                          const uint8_t *const ref,
+                                          __m256i *const sse,
+                                          __m256i *const sum) {
+  const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
+  const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
+  variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
+
+  for (int i = 0; i < h; i += 2) {
+    variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+  }
 }
 
-unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
-                aom_get32x32var_avx2, 32);
+static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
-  _mm256_zeroupper();
-  return variance;
+  for (int i = 0; i < h; i++) {
+    variance32_kernel_avx2(src, ref, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
-                aom_get32x32var_avx2, 32);
+static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
-  _mm256_zeroupper();
-  return variance;
+  for (int i = 0; i < h; i++) {
+    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
-                aom_get32x32var_avx2, 32);
+static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
+                                    const uint8_t *ref, const int ref_stride,
+                                    const int h, __m256i *const vsse,
+                                    __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
-  _mm256_zeroupper();
-  return variance;
+  for (int i = 0; i < h; i++) {
+    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+    variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
+    variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
-                aom_get32x32var_avx2, 32);
+#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel)                         \
+  unsigned int aom_variance##bw##x##bh##_avx2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m256i vsse = _mm256_setzero_si256();                                    \
+    __m256i vsum;                                                             \
+    variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
+    const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse);       \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
+  }
+
+AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
+
+AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
+AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
+AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
+AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
-  _mm256_zeroupper();
-  return variance;
+AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
+AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
+
+#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh)                                   \
+  unsigned int aom_variance##bw##x##bh##_avx2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m256i vsse = _mm256_setzero_si256();                                    \
+    __m256i vsum = _mm256_setzero_si256();                                    \
+    for (int i = 0; i < (bh / uh); i++) {                                     \
+      __m256i vsum16;                                                         \
+      variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse,        \
+                          &vsum16);                                           \
+      vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16));               \
+      src += uh * src_stride;                                                 \
+      ref += uh * ref_stride;                                                 \
+    }                                                                         \
+    const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);                     \
+    const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);  \
+    return *sse - (unsigned int)(((int64_t)sum * sum) >> bits);               \
+  }
+
+AOM_VAR_LOOP_AVX2(64, 64, 12, 32);    // 64x32 * ( 64/32)
+AOM_VAR_LOOP_AVX2(64, 128, 13, 32);   // 64x32 * (128/32)
+AOM_VAR_LOOP_AVX2(128, 64, 13, 16);   // 128x16 * ( 64/16)
+AOM_VAR_LOOP_AVX2(128, 128, 14, 16);  // 128x16 * (128/16)
+
+unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
 }
 
 unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
@@ -125,68 +240,164 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
     const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
     int height, unsigned int *sseptr);
 
-unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src,
-                                              int src_stride, int x_offset,
-                                              int y_offset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  unsigned int sse1;
-  const int se1 = aom_sub_pixel_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1);
-  unsigned int sse2;
-  const int se2 =
-      aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
-                                      dst + 32, dst_stride, 64, &sse2);
-  const int se = se1 + se2;
-  unsigned int variance;
-  *sse = sse1 + sse2;
-
-  variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
-  _mm256_zeroupper();
-  return variance;
-}
-
-unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
-                                              int src_stride, int x_offset,
-                                              int y_offset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  const int se = aom_sub_pixel_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
-
-  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
-  _mm256_zeroupper();
-  return variance;
-}
-
-unsigned int aom_sub_pixel_avg_variance64x64_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
-  unsigned int sse1;
-  const int se1 = aom_sub_pixel_avg_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1);
-  unsigned int sse2;
-  const int se2 = aom_sub_pixel_avg_variance32xh_avx2(
-      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
-      64, 64, &sse2);
-  const int se = se1 + se2;
-  unsigned int variance;
+#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2)                        \
+  unsigned int aom_sub_pixel_variance##w##x##h##_avx2(                        \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
+    /*Avoid overflow in helper by capping height.*/                           \
+    const int hf = AOMMIN(h, 64);                                             \
+    unsigned int sse = 0;                                                     \
+    int se = 0;                                                               \
+    for (int i = 0; i < (w / wf); ++i) {                                      \
+      const uint8_t *src_ptr = src;                                           \
+      const uint8_t *dst_ptr = dst;                                           \
+      for (int j = 0; j < (h / hf); ++j) {                                    \
+        unsigned int sse2;                                                    \
+        const int se2 = aom_sub_pixel_variance##wf##xh_avx2(                  \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+            &sse2);                                                           \
+        dst_ptr += hf * dst_stride;                                           \
+        src_ptr += hf * src_stride;                                           \
+        se += se2;                                                            \
+        sse += sse2;                                                          \
+      }                                                                       \
+      src += wf;                                                              \
+      dst += wf;                                                              \
+    }                                                                         \
+    *sse_ptr = sse;                                                           \
+    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));       \
+  }
+
+AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7);
+AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6);
+AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7);
+AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6);
+AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5);
+AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
+AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
+AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
+
+#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2)                \
+  unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,     \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,          \
+      const uint8_t *sec) {                                               \
+    /*Avoid overflow in helper by capping height.*/                       \
+    const int hf = AOMMIN(h, 64);                                         \
+    unsigned int sse = 0;                                                 \
+    int se = 0;                                                           \
+    for (int i = 0; i < (w / wf); ++i) {                                  \
+      const uint8_t *src_ptr = src;                                       \
+      const uint8_t *dst_ptr = dst;                                       \
+      const uint8_t *sec_ptr = sec;                                       \
+      for (int j = 0; j < (h / hf); ++j) {                                \
+        unsigned int sse2;                                                \
+        const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2(          \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+            sec_ptr, w, hf, &sse2);                                       \
+        dst_ptr += hf * dst_stride;                                       \
+        src_ptr += hf * src_stride;                                       \
+        sec_ptr += hf * w;                                                \
+        se += se2;                                                        \
+        sse += sse2;                                                      \
+      }                                                                   \
+      src += wf;                                                          \
+      dst += wf;                                                          \
+      sec += wf;                                                          \
+    }                                                                     \
+    *sse_ptr = sse;                                                       \
+    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));   \
+  }
 
-  *sse = sse1 + sse2;
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4);
 
-  variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
-  _mm256_zeroupper();
-  return variance;
+static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
+  const __m256i d =
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
+  return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
 }
 
-unsigned int aom_sub_pixel_avg_variance32x32_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
-  // Process 32 elements in parallel.
-  const int se = aom_sub_pixel_avg_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
-
-  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
-  _mm256_zeroupper();
-  return variance;
+static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
+                                            const __m256i a,
+                                            uint8_t *comp_pred) {
+  const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
+  const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));
+
+  const __m256i ma = _mm256_sub_epi8(alpha_max, a);
+
+  const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
+  const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
+  const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
+  const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);
+
+  const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
+  const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
+  const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
+  const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);
+
+  const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
+  _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
+}
+
+void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+                             int height, const uint8_t *ref, int ref_stride,
+                             const uint8_t *mask, int mask_stride,
+                             int invert_mask) {
+  int i = 0;
+  const uint8_t *src0 = invert_mask ? pred : ref;
+  const uint8_t *src1 = invert_mask ? ref : pred;
+  const int stride0 = invert_mask ? width : ref_stride;
+  const int stride1 = invert_mask ? ref_stride : width;
+  if (width == 8) {
+    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+                           mask, mask_stride);
+  } else if (width == 16) {
+    do {
+      const __m256i sA0 = mm256_loadu2(src0 + stride0, src0);
+      const __m256i sA1 = mm256_loadu2(src1 + stride1, src1);
+      const __m256i aA = mm256_loadu2(mask + mask_stride, mask);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      const __m256i sB0 = mm256_loadu2(src0 + stride0, src0);
+      const __m256i sB1 = mm256_loadu2(src1 + stride1, src1);
+      const __m256i aB = mm256_loadu2(mask + mask_stride, mask);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      // comp_pred's stride == width == 16
+      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
+      comp_pred += (16 << 2);
+      i += 4;
+    } while (i < height);
+  } else {  // for width == 32
+    do {
+      const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0));
+      const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1));
+      const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask));
+
+      const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0));
+      const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1));
+      const __m256i aB =
+          _mm256_lddqu_si256((const __m256i *)(mask + mask_stride));
+
+      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
+      comp_pred += (32 << 1);
+
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      i += 2;
+    } while (i < height);
+  }
 }
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
index 999b541e3..88e27aef3 100644
--- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
@@ -11,7 +11,8 @@
 
 #include <immintrin.h>  // AVX2
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 
 /* clang-format off */
@@ -35,203 +36,6 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
 };
 /* clang-format on */
 
-void aom_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
-                          unsigned int *SSE, int *Sum) {
-  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
-  __m256i ref_expand_high, madd_low, madd_high;
-  unsigned int i, src_2strides, ref_2strides;
-  __m256i zero_reg = _mm256_set1_epi16(0);
-  __m256i sum_ref_src = _mm256_set1_epi16(0);
-  __m256i madd_ref_src = _mm256_set1_epi16(0);
-
-  // processing two strides in a 256 bit register reducing the number
-  // of loop stride by half (comparing to the sse2 code)
-  src_2strides = source_stride << 1;
-  ref_2strides = recon_stride << 1;
-  for (i = 0; i < 8; i++) {
-    src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
-    src = _mm256_inserti128_si256(
-        src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);
-
-    ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
-    ref = _mm256_inserti128_si256(
-        ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);
-
-    // expanding to 16 bit each lane
-    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
-    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
-    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
-    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
-    // src-ref
-    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
-    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
-    // madd low (src - ref)
-    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
-    // add high to low
-    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
-    // madd high (src - ref)
-    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
-    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
-    // add high to low
-    madd_ref_src =
-        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
-
-    src_ptr += src_2strides;
-    ref_ptr += ref_2strides;
-  }
-
-  {
-    __m128i sum_res, madd_res;
-    __m128i expand_sum_low, expand_sum_high, expand_sum;
-    __m128i expand_madd_low, expand_madd_high, expand_madd;
-    __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
-    // extract the low lane and add it to the high lane
-    sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
-                            _mm256_extractf128_si256(sum_ref_src, 1));
-
-    madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
-                             _mm256_extractf128_si256(madd_ref_src, 1));
-
-    // padding each 2 bytes with another 2 zeroed bytes
-    expand_sum_low =
-        _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
-    expand_sum_high =
-        _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
-
-    // shifting the sign 16 bits right
-    expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
-    expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
-
-    expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
-
-    // expand each 32 bits of the madd result to 64 bits
-    expand_madd_low =
-        _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
-    expand_madd_high =
-        _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
-
-    expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
-
-    ex_expand_sum_low =
-        _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
-    ex_expand_sum_high =
-        _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
-
-    ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
-    // shift 8 bytes eight
-    madd_res = _mm_srli_si128(expand_madd, 8);
-    sum_res = _mm_srli_si128(ex_expand_sum, 8);
-
-    madd_res = _mm_add_epi32(madd_res, expand_madd);
-    sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
-
-    *((int *)SSE) = _mm_cvtsi128_si32(madd_res);
-
-    *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
-  }
-  _mm256_zeroupper();
-}
-
-void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
-                          unsigned int *SSE, int *Sum) {
-  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
-  __m256i ref_expand_high, madd_low, madd_high;
-  unsigned int i;
-  __m256i zero_reg = _mm256_set1_epi16(0);
-  __m256i sum_ref_src = _mm256_set1_epi16(0);
-  __m256i madd_ref_src = _mm256_set1_epi16(0);
-
-  // processing 32 elements in parallel
-  for (i = 0; i < 16; i++) {
-    src = _mm256_loadu_si256((__m256i const *)(src_ptr));
-
-    ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));
-
-    // expanding to 16 bit each lane
-    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
-    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
-    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
-    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
-    // src-ref
-    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
-    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
-    // madd low (src - ref)
-    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
-    // add high to low
-    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
-    // madd high (src - ref)
-    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
-    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
-    // add high to low
-    madd_ref_src =
-        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
-
-    src_ptr += source_stride;
-    ref_ptr += recon_stride;
-  }
-
-  {
-    __m256i expand_sum_low, expand_sum_high, expand_sum;
-    __m256i expand_madd_low, expand_madd_high, expand_madd;
-    __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
-    // padding each 2 bytes with another 2 zeroed bytes
-    expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
-    expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
-
-    // shifting the sign 16 bits right
-    expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
-    expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
-
-    expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
-
-    // expand each 32 bits of the madd result to 64 bits
-    expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
-    expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
-
-    expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
-
-    ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
-    ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
-
-    ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
-    // shift 8 bytes eight
-    madd_ref_src = _mm256_srli_si256(expand_madd, 8);
-    sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
-
-    madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
-    sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
-
-    // extract the low lane and the high lane and add the results
-    *((int *)SSE) =
-        _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
-        _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
-
-    *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
-                    _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
-  }
-  _mm256_zeroupper();
-}
-
 #define FILTER_SRC(filter)                               \
   /* filter the source */                                \
   exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index 211fad3f8..c8c90a7dc 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -12,24 +12,24 @@
 #include <assert.h>
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
 
 #include "aom_ports/mem.h"
 
-#include "./av1_rtcd.h"
 #include "av1/common/filter.h"
-
-typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
-                               const unsigned char *ref, int ref_stride,
-                               unsigned int *sse, int *sum);
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
 
 unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
   __m128i vsum = _mm_setzero_si128();
   int i;
 
   for (i = 0; i < 32; ++i) {
-    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    const __m128i v = xx_loadu_128(src);
     vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
     src += 8;
   }
@@ -39,276 +39,265 @@ unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
   return _mm_cvtsi128_si32(vsum);
 }
 
-#define READ64(p, stride, i)                                  \
-  _mm_unpacklo_epi8(                                          \
-      _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
-      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+  const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
+  return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
+}
 
-static void get4x4var_sse2(const uint8_t *src, int src_stride,
-                           const uint8_t *ref, int ref_stride,
-                           unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
-  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
-  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
-  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
-  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-  const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
-  // sum
-  __m128i vsum = _mm_add_epi16(diff0, diff1);
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
+  const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
+  return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
+}
 
-  // sse
-  vsum =
-      _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
-  *sse = _mm_cvtsi128_si32(vsum);
+// Accumulate 4 32bit numbers in val to 1 32bit number
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
+  return _mm_cvtsi128_si32(val);
 }
 
-void aom_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
-                        int ref_stride, unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i vsum = _mm_setzero_si128();
-  __m128i vsse = _mm_setzero_si128();
-  int i;
+// Accumulate 8 16bit in sum to 4 32bit number
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
+  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
+  return _mm_add_epi32(sum_lo, sum_hi);
+}
 
-  for (i = 0; i < 8; i += 2) {
-    const __m128i src0 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
-    const __m128i ref0 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
-    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-
-    const __m128i src1 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
-    const __m128i ref1 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
-    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
-    vsum = _mm_add_epi16(vsum, diff0);
-    vsum = _mm_add_epi16(vsum, diff1);
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
-  }
+static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
+                                        __m128i *const sse,
+                                        __m128i *const sum) {
+  const __m128i diff = _mm_sub_epi16(src, ref);
+  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
+  *sum = _mm_add_epi16(*sum, diff);
+}
+
+// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
+// Slightly faster than variance_final_256_pel_sse2()
+// diff sum of 128 pixels can still fit in 16bit integer
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-  // sum
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
   *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-
-  // sse
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
-  *sse = _mm_cvtsi128_si32(vsse);
 }
 
-void aom_get16x16var_sse2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, unsigned int *sse,
-                          int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i vsum = _mm_setzero_si128();
-  __m128i vsse = _mm_setzero_si128();
-  int i;
+// Can handle 256 pixels' diff sum (such as 16x16)
+static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-  for (i = 0; i < 16; ++i) {
-    const __m128i s = _mm_loadu_si128((const __m128i *)src);
-    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+  *sum += (int16_t)_mm_extract_epi16(vsum, 1);
+}
 
-    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
-    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
-    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
+static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
-    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
-    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_unpacklo_epi16(vsum, vsum);
+  vsum = _mm_srai_epi32(vsum, 16);
+  *sum = add32x4_sse2(vsum);
+}
 
-    vsum = _mm_add_epi16(vsum, diff0);
-    vsum = _mm_add_epi16(vsum, diff1);
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+// Can handle 1024 pixels' diff sum (such as 32x32)
+static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
+                                                unsigned int *const sse,
+                                                int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-    src += src_stride;
-    ref += ref_stride;
-  }
+  vsum = sum_to_32bit_sse2(vsum);
+  *sum = add32x4_sse2(vsum);
+}
 
-  // sum
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  *sum =
-      (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1);
+static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
+                                  const uint8_t *ref, const int ref_stride,
+                                  const int h, __m128i *const sse,
+                                  __m128i *const sum) {
+  assert(h <= 256);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
 
-  // sse
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
-  *sse = _mm_cvtsi128_si32(vsse);
-}
+  for (int i = 0; i < h; i += 2) {
+    const __m128i s = load4x2_sse2(src, src_stride);
+    const __m128i r = load4x2_sse2(ref, ref_stride);
 
-static void variance_sse2(const unsigned char *src, int src_stride,
-                          const unsigned char *ref, int ref_stride, int w,
-                          int h, unsigned int *sse, int *sum,
-                          getNxMvar_fn_t var_fn, int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
+    variance_kernel_sse2(s, r, sse, sum);
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
   }
 }
 
-unsigned int aom_variance4x4_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  assert(sum <= 255 * 4 * 4);
-  assert(sum >= -255 * 4 * 4);
-  return *sse - ((sum * sum) >> 4);
-}
-
-unsigned int aom_variance8x4_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 8 * 4);
-  assert(sum >= -255 * 8 * 4);
-  return *sse - ((sum * sum) >> 5);
+static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
+                                  const uint8_t *ref, const int ref_stride,
+                                  const int h, __m128i *const sse,
+                                  __m128i *const sum) {
+  assert(h <= 128);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+  for (int i = 0; i < h; i++) {
+    const __m128i s = load8_8to16_sse2(src);
+    const __m128i r = load8_8to16_sse2(ref);
+
+    variance_kernel_sse2(s, r, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance4x8_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 8 * 4);
-  assert(sum >= -255 * 8 * 4);
-  return *sse - ((sum * sum) >> 5);
+static INLINE void variance16_kernel_sse2(const uint8_t *const src,
+                                          const uint8_t *const ref,
+                                          __m128i *const sse,
+                                          __m128i *const sum) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i s = _mm_loadu_si128((const __m128i *)src);
+  const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+  const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+  const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+  const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+  const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+
+  variance_kernel_sse2(src0, ref0, sse, sum);
+  variance_kernel_sse2(src1, ref1, sse, sum);
 }
 
-unsigned int aom_variance8x8_sse2(const unsigned char *src, int src_stride,
-                                  const unsigned char *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  aom_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  assert(sum <= 255 * 8 * 8);
-  assert(sum >= -255 * 8 * 8);
-  return *sse - ((sum * sum) >> 6);
-}
+static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 64);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
 
-unsigned int aom_variance16x8_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 16 * 8);
-  assert(sum >= -255 * 16 * 8);
-  return *sse - ((sum * sum) >> 7);
+  for (int i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src, ref, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance8x16_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 16 * 8);
-  assert(sum >= -255 * 16 * 8);
-  return *sse - ((sum * sum) >> 7);
+static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 32);  // May overflow for larger height.
+  // Don't initialize sse here since it's an accumulation.
+  *sum = _mm_setzero_si128();
+
+  for (int i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance16x16_sse2(const unsigned char *src, int src_stride,
-                                    const unsigned char *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  aom_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  assert(sum <= 255 * 16 * 16);
-  assert(sum >= -255 * 16 * 16);
-  return *sse - ((uint32_t)((int64_t)sum * sum) >> 8);
+static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 16);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+
+  for (int i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+    variance16_kernel_sse2(src + 32, ref + 32, sse, sum);
+    variance16_kernel_sse2(src + 48, ref + 48, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 32);
-  assert(sum >= -255 * 32 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
+static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
+                                    const uint8_t *ref, const int ref_stride,
+                                    const int h, __m128i *const sse,
+                                    __m128i *const sum) {
+  assert(h <= 8);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      const int offset0 = j << 5;
+      const int offset1 = offset0 + 16;
+      variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum);
+      variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum);
+    }
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 16);
-  assert(sum >= -255 * 32 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
-}
+#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels)                        \
+  unsigned int aom_variance##bw##x##bh##_sse2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m128i vsse = _mm_setzero_si128();                                       \
+    __m128i vsum;                                                             \
+    int sum = 0;                                                              \
+    variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
+    variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum);            \
+    assert(sum <= 255 * bw * bh);                                             \
+    assert(sum >= -255 * bw * bh);                                            \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
+  }
 
-unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 16);
-  assert(sum >= -255 * 32 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
-}
+AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128);
+AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128);
+AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128);
+
+AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
+
+AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
+AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128);
+AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256);
+AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512);
+AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);
+
+AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256);
+AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512);
+AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
+
+#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh)                                   \
+  unsigned int aom_variance##bw##x##bh##_sse2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m128i vsse = _mm_setzero_si128();                                       \
+    __m128i vsum = _mm_setzero_si128();                                       \
+    for (int i = 0; i < (bh / uh); ++i) {                                     \
+      __m128i vsum16;                                                         \
+      variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse,        \
+                          &vsum16);                                           \
+      vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));                  \
+      src += (src_stride * uh);                                               \
+      ref += (ref_stride * uh);                                               \
+    }                                                                         \
+    *sse = add32x4_sse2(vsse);                                                \
+    int sum = add32x4_sse2(vsum);                                             \
+    assert(sum <= 255 * bw * bh);                                             \
+    assert(sum >= -255 * bw * bh);                                            \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
+  }
 
-unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 64);
-  assert(sum >= -255 * 64 * 64);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
-}
+AOM_VAR_LOOP_SSE2(32, 64, 11, 32);  // 32x32 * ( 64/32 )
 
-unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 32);
-  assert(sum >= -255 * 64 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
+AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
+AOM_VAR_LOOP_SSE2(64, 32, 11, 16);   // 64x16 * ( 32/16 )
+AOM_VAR_LOOP_SSE2(64, 64, 12, 16);   // 64x16 * ( 64/16 )
+AOM_VAR_LOOP_SSE2(64, 128, 13, 16);  // 64x16 * ( 128/16 )
 
-unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 32);
-  assert(sum >= -255 * 64 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
+AOM_VAR_LOOP_SSE2(128, 64, 13, 8);   // 128x8 * ( 64/8 )
+AOM_VAR_LOOP_SSE2(128, 128, 14, 8);  // 128x8 * ( 128/8 )
 
 unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
@@ -338,74 +327,6 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
   return *sse;
 }
 
-#if CONFIG_EXT_PARTITION_TYPES
-unsigned int aom_variance4x16_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 4, 16, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 4 * 16);
-  assert(sum >= -255 * 4 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 6);
-}
-
-unsigned int aom_variance16x4_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 4, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 16 * 4);
-  assert(sum >= -255 * 16 * 4);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 6);
-}
-
-unsigned int aom_variance8x32_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 32, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 8 * 32);
-  assert(sum >= -255 * 8 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 8);
-}
-
-unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 8, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 32 * 8);
-  assert(sum >= -255 * 32 * 8);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 8);
-}
-
-unsigned int aom_variance16x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 64, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 16 * 64);
-  assert(sum >= -255 * 16 * 64);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
-
-unsigned int aom_variance64x16_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 16, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 16);
-  assert(sum >= -255 * 64 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
-#endif
-
 // The 2 unused parameters are place holders for PIC enabled build.
 // These definitions are for functions defined in subpel_variance.asm
 #define DECL(w, opt)                                                           \
@@ -423,75 +344,57 @@ DECLS(ssse3);
 #undef DECLS
 #undef DECL
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
-  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                        \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {             \
-    unsigned int sse;                                                          \
-    int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset,   \
-                                                  y_offset, dst, dst_stride,   \
-                                                  h, &sse, NULL, NULL);        \
-    if (w > wf) {                                                              \
-      unsigned int sse2;                                                       \
-      int se2 = aom_sub_pixel_variance##wf##xh_##opt(                          \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
-          &sse2, NULL, NULL);                                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_sub_pixel_variance##wf##xh_##opt(                            \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_sub_pixel_variance##wf##xh_##opt(                            \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sse_ptr = sse;                                                            \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
+  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
+    /*Avoid overflow in helper by capping height.*/                           \
+    const int hf = AOMMIN(h, 64);                                             \
+    unsigned int sse = 0;                                                     \
+    int se = 0;                                                               \
+    for (int i = 0; i < (w / wf); ++i) {                                      \
+      const uint8_t *src_ptr = src;                                           \
+      const uint8_t *dst_ptr = dst;                                           \
+      for (int j = 0; j < (h / hf); ++j) {                                    \
+        unsigned int sse2;                                                    \
+        const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                 \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+            &sse2, NULL, NULL);                                               \
+        dst_ptr += hf * dst_stride;                                           \
+        src_ptr += hf * src_stride;                                           \
+        se += se2;                                                            \
+        sse += sse2;                                                          \
+      }                                                                       \
+      src += wf;                                                              \
+      dst += wf;                                                              \
+    }                                                                         \
+    *sse_ptr = sse;                                                           \
+    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));     \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t));     \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));    \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));   \
-  FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t));    \
-  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t));  \
-  FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
-#else
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));     \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
-#endif
+#define FNS(opt)                                     \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
+  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));    \
+  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));     \
+  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));      \
+  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));      \
+  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));      \
+  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t));      \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
 
 FNS(sse2);
 FNS(ssse3);
@@ -516,76 +419,61 @@ DECLS(ssse3);
 #undef DECL
 #undef DECLS
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
-  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                    \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
-      const uint8_t *dst, int dst_stride, unsigned int *sseptr,                \
-      const uint8_t *sec) {                                                    \
-    unsigned int sse;                                                          \
-    int se = aom_sub_pixel_avg_variance##wf##xh_##opt(                         \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
-        NULL, NULL);                                                           \
-    if (w > wf) {                                                              \
-      unsigned int sse2;                                                       \
-      int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                      \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
-          sec + 16, w, h, &sse2, NULL, NULL);                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                        \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
-            sec + 32, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                        \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
-            sec + 48, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sseptr = sse;                                                             \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
+  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
+      const uint8_t *sec) {                                                  \
+    /*Avoid overflow in helper by capping height.*/                          \
+    const int hf = AOMMIN(h, 64);                                            \
+    unsigned int sse = 0;                                                    \
+    int se = 0;                                                              \
+    for (int i = 0; i < (w / wf); ++i) {                                     \
+      const uint8_t *src_ptr = src;                                          \
+      const uint8_t *dst_ptr = dst;                                          \
+      const uint8_t *sec_ptr = sec;                                          \
+      for (int j = 0; j < (h / hf); ++j) {                                   \
+        unsigned int sse2;                                                   \
+        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(            \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride,    \
+            sec_ptr, w, hf, &sse2, NULL, NULL);                              \
+        dst_ptr += hf * dst_stride;                                          \
+        src_ptr += hf * src_stride;                                          \
+        sec_ptr += hf * w;                                                   \
+        se += se2;                                                           \
+        sse += sse2;                                                         \
+      }                                                                      \
+      src += wf;                                                             \
+      dst += wf;                                                             \
+      sec += wf;                                                             \
+    }                                                                        \
+    *sse_ptr = sse;                                                          \
+    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));  \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));   \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));    \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));    \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));    \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t));    \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));    \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));   \
-  FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t));    \
-  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t));  \
-  FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
-#else
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));  \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));   \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));    \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));    \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));    \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
-#endif
+#define FNS(opt)                                     \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
+  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));   \
+  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));    \
+  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));     \
+  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));     \
+  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));     \
+  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t));     \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
 
 FNS(sse2);
 FNS(ssse3);
@@ -593,9 +481,97 @@ FNS(ssse3);
 #undef FNS
 #undef FN
 
-void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
+void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
+                             int mi_row, int mi_col, const MV *const mv,
+                             uint8_t *comp_pred, int width, int height,
                              int subpel_x_q3, int subpel_y_q3,
                              const uint8_t *ref, int ref_stride) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in
+      // build_inter_predictors() function, with some small tweaks.
+
+      // Some assumptions.
+      const int plane = 0;
+
+      // Get pre-requisites.
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int ssx = pd->subsampling_x;
+      const int ssy = pd->subsampling_y;
+      assert(ssx == 0 && ssy == 0);
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+
+      // Calculate subpel_x/y and x/y_step.
+      const int row_start = 0;  // Because ss_y is 0.
+      const int col_start = 0;  // Because ss_x is 0.
+      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+      int orig_pos_y = pre_y << SUBPEL_BITS;
+      orig_pos_y += mv->row * (1 << (1 - ssy));
+      int orig_pos_x = pre_x << SUBPEL_BITS;
+      orig_pos_x += mv->col * (1 << (1 - ssx));
+      int pos_y = sf->scale_value_y(orig_pos_y, sf);
+      int pos_x = sf->scale_value_x(orig_pos_x, sf);
+      pos_x += SCALE_EXTRA_OFF;
+      pos_y += SCALE_EXTRA_OFF;
+
+      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                         << SCALE_SUBPEL_BITS;
+      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+                        << SCALE_SUBPEL_BITS;
+      pos_y = clamp(pos_y, top, bottom);
+      pos_x = clamp(pos_x, left, right);
+
+      const uint8_t *const pre =
+          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+
+      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                           pos_x & SCALE_SUBPEL_MASK,
+                                           pos_y & SCALE_SUBPEL_MASK };
+
+      // Get warp types.
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mi->ref_frame[ref_num]];
+      const int is_global = is_global_mv_block(mi, wm->wmtype);
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global;
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      // Get convolve parameters.
+      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      const InterpFilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Get the inter predictor.
+      const int build_for_obmc = 0;
+      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
+                               &subpel_params, sf, width, height, &conv_params,
+                               filters, &warp_types, mi_x >> pd->subsampling_x,
+                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
+                               build_for_obmc, xd, cm->allow_warped_motion);
+
+      return;
+    }
+  }
+
+  const InterpFilterParams filter =
+      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
   if (!subpel_x_q3 && !subpel_y_q3) {
     if (width >= 16) {
       int i;
@@ -604,8 +580,7 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
       for (i = 0; i < height; i++) {
         int j;
         for (j = 0; j < width; j += 16) {
-          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
-          _mm_storeu_si128((__m128i *)comp_pred, s0);
+          xx_storeu_128(comp_pred, xx_loadu_128(ref));
           comp_pred += 16;
           ref += 16;
         }
@@ -617,10 +592,9 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
       assert(!(height & 1));
       /*Read 8 pixels two rows at a time.*/
       for (i = 0; i < height; i += 2) {
-        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
-        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
-        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
-        _mm_storeu_si128((__m128i *)comp_pred, t0);
+        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
         comp_pred += 16;
         ref += 2 * ref_stride;
       }
@@ -630,69 +604,62 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
       assert(!(height & 3));
       /*Read 4 pixels four rows at a time.*/
       for (i = 0; i < height; i++) {
-        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
-        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + ref_stride));
-        __m128i s2 =
-            _mm_cvtsi32_si128(*(const uint32_t *)(ref + 2 * ref_stride));
-        __m128i s3 =
-            _mm_cvtsi32_si128(*(const uint32_t *)(ref + 3 * ref_stride));
-        __m128i t0 = _mm_unpacklo_epi32(s0, s1);
-        __m128i t1 = _mm_unpacklo_epi32(s2, s3);
-        __m128i u0 = _mm_unpacklo_epi64(t0, t1);
-        _mm_storeu_si128((__m128i *)comp_pred, u0);
+        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+                                               _mm_unpacklo_epi32(row2, row3));
+        xx_storeu_128(comp_pred, reg);
         comp_pred += 16;
         ref += 4 * ref_stride;
       }
     }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+                        width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+                       width, height);
   } else {
-    InterpFilterParams filter;
-    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-    if (!subpel_y_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
-                          -1, width, height);
-    } else if (!subpel_x_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
-                         16, width, height);
-    } else {
-      DECLARE_ALIGNED(16, uint8_t,
-                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-      const int16_t *kernel_x;
-      const int16_t *kernel_y;
-      int intermediate_height;
-      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      intermediate_height =
-          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
-      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-      aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1),
-                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
-                          width, intermediate_height);
-      aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
-                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
-                         width, height);
-    }
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
+                        temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                        intermediate_height);
+    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+                       width, height);
   }
 }
 
-void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
-                                      int width, int height, int subpel_x_q3,
-                                      int subpel_y_q3, const uint8_t *ref,
-                                      int ref_stride) {
+void aom_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride) {
   int n;
   int i;
-  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
-                     ref_stride);
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
   /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
   assert(!(width * height & 15));
   n = width * height >> 4;
   for (i = 0; i < n; i++) {
-    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred);
-    __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-    _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu8(s0, p0));
+    __m128i s0 = xx_loadu_128(comp_pred);
+    __m128i p0 = xx_loadu_128(pred);
+    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
     comp_pred += 16;
     pred += 16;
   }
diff --git a/third_party/aom/aom_mem/aom_mem.c b/third_party/aom/aom_mem/aom_mem.c
index 66a0c08de..e603fc5bf 100644
--- a/third_party/aom/aom_mem/aom_mem.c
+++ b/third_party/aom/aom_mem/aom_mem.c
@@ -9,8 +9,6 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#define __AOM_MEM_C__
-
 #include "aom_mem.h"
 #include <stdio.h>
 #include <stdlib.h>
@@ -18,6 +16,17 @@
 #include "include/aom_mem_intrnl.h"
 #include "aom/aom_integer.h"
 
+#if defined(AOM_MAX_ALLOCABLE_MEMORY)
+// Returns 0 in case of overflow of nmemb * size.
+static int check_size_argument_overflow(uint64_t nmemb, uint64_t size) {
+  const uint64_t total_size = nmemb * size;
+  if (nmemb == 0) return 1;
+  if (size > AOM_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
+  if (total_size != (size_t)total_size) return 0;
+  return 1;
+}
+#endif
+
 static size_t GetAlignedMallocSize(size_t size, size_t align) {
   return size + align - 1 + ADDRESS_STORAGE_SIZE;
 }
@@ -40,6 +49,9 @@ static void *GetActualMallocAddress(void *const mem) {
 void *aom_memalign(size_t align, size_t size) {
   void *x = NULL;
   const size_t aligned_size = GetAlignedMallocSize(size, align);
+#if defined(AOM_MAX_ALLOCABLE_MEMORY)
+  if (!check_size_argument_overflow(1, aligned_size)) return NULL;
+#endif
   void *const addr = malloc(aligned_size);
   if (addr) {
     x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align);
@@ -64,11 +76,9 @@ void aom_free(void *memblk) {
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void *aom_memset16(void *dest, int val, size_t length) {
   size_t i;
   uint16_t *dest16 = (uint16_t *)dest;
   for (i = 0; i < length; i++) *dest16++ = val;
   return dest;
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_mem/aom_mem.cmake b/third_party/aom/aom_mem/aom_mem.cmake
index a142824c2..eaee8440b 100644
--- a/third_party/aom/aom_mem/aom_mem.cmake
+++ b/third_party/aom/aom_mem/aom_mem.cmake
@@ -1,27 +1,26 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_AOM_MEM_AOM_MEM_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_MEM_AOM_MEM_CMAKE_)
+  return()
+endif() # AOM_AOM_MEM_AOM_MEM_CMAKE_
 set(AOM_AOM_MEM_AOM_MEM_CMAKE_ 1)
 
-set(AOM_MEM_SOURCES
-    "${AOM_ROOT}/aom_mem/aom_mem.c"
-    "${AOM_ROOT}/aom_mem/aom_mem.h"
-    "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
+list(APPEND AOM_MEM_SOURCES "${AOM_ROOT}/aom_mem/aom_mem.c"
+            "${AOM_ROOT}/aom_mem/aom_mem.h"
+            "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
 
 # Creates the aom_mem build target and makes libaom depend on it. The libaom
 # target must exist before this function is called.
-function (setup_aom_mem_targets)
+function(setup_aom_mem_targets)
   add_library(aom_mem OBJECT ${AOM_MEM_SOURCES})
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_mem PARENT_SCOPE)
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_mem>)
-endfunction ()
-
-endif ()  # AOM_AOM_MEM_AOM_MEM_CMAKE_
+endfunction()
diff --git a/third_party/aom/aom_mem/aom_mem.h b/third_party/aom/aom_mem/aom_mem.h
index 0d533c813..a36ee3e03 100644
--- a/third_party/aom/aom_mem/aom_mem.h
+++ b/third_party/aom/aom_mem/aom_mem.h
@@ -12,26 +12,31 @@
 #ifndef AOM_MEM_AOM_MEM_H_
 #define AOM_MEM_AOM_MEM_H_
 
-#include "aom_config.h"
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
 #if defined(__uClinux__)
 #include <lddk.h>
 #endif
 
-#include <stdlib.h>
-#include <stddef.h>
-
 #if defined(__cplusplus)
 extern "C" {
 #endif
 
+#ifndef AOM_MAX_ALLOCABLE_MEMORY
+#if SIZE_MAX > (1ULL << 32)
+#define AOM_MAX_ALLOCABLE_MEMORY 8589934592  // 8 GB
+#else
+// For 32-bit targets keep this below INT_MAX to avoid valgrind warnings.
+#define AOM_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16))
+#endif
+#endif
+
 void *aom_memalign(size_t align, size_t size);
 void *aom_malloc(size_t size);
 void *aom_calloc(size_t num, size_t size);
 void aom_free(void *memblk);
-
-#if CONFIG_HIGHBITDEPTH
 void *aom_memset16(void *dest, int val, size_t length);
-#endif
 
 #include <string.h>
 
diff --git a/third_party/aom/aom_mem/aom_mem.mk b/third_party/aom/aom_mem/aom_mem.mk
deleted file mode 100644
index e9162c284..000000000
--- a/third_party/aom/aom_mem/aom_mem.mk
+++ /dev/null
@@ -1,4 +0,0 @@
-MEM_SRCS-yes += aom_mem.mk
-MEM_SRCS-yes += aom_mem.c
-MEM_SRCS-yes += aom_mem.h
-MEM_SRCS-yes += include/aom_mem_intrnl.h
diff --git a/third_party/aom/aom_mem/include/aom_mem_intrnl.h b/third_party/aom/aom_mem/include/aom_mem_intrnl.h
index 3cdfbe08d..977ebadcd 100644
--- a/third_party/aom/aom_mem/include/aom_mem_intrnl.h
+++ b/third_party/aom/aom_mem/include/aom_mem_intrnl.h
@@ -11,7 +11,8 @@
 
 #ifndef AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
 #define AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
 
 #define ADDRESS_STORAGE_SIZE sizeof(size_t)
 
diff --git a/third_party/aom/aom_ports/aom_once.h b/third_party/aom/aom_ports/aom_once.h
index 3cfd2fd95..bb1e21366 100644
--- a/third_party/aom/aom_ports/aom_once.h
+++ b/third_party/aom/aom_ports/aom_once.h
@@ -12,7 +12,7 @@
 #ifndef AOM_PORTS_AOM_ONCE_H_
 #define AOM_PORTS_AOM_ONCE_H_
 
-#include "aom_config.h"
+#include "config/aom_config.h"
 
 /* Implement a function wrapper to guarantee initialization
  * thread-safety for library singletons.
diff --git a/third_party/aom/aom_ports/aom_ports.cmake b/third_party/aom/aom_ports/aom_ports.cmake
index e1ffb56f5..6272fc0e3 100644
--- a/third_party/aom/aom_ports/aom_ports.cmake
+++ b/third_party/aom/aom_ports/aom_ports.cmake
@@ -1,70 +1,81 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_AOM_PORTS_AOM_PORTS_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_PORTS_AOM_PORTS_CMAKE_)
+  return()
+endif() # AOM_AOM_PORTS_AOM_PORTS_CMAKE_
 set(AOM_AOM_PORTS_AOM_PORTS_CMAKE_ 1)
 
-set(AOM_PORTS_INCLUDES
-    "${AOM_ROOT}/aom_ports/aom_once.h"
-    "${AOM_ROOT}/aom_ports/aom_timer.h"
-    "${AOM_ROOT}/aom_ports/bitops.h"
-    "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
-    "${AOM_ROOT}/aom_ports/mem.h"
-    "${AOM_ROOT}/aom_ports/mem_ops.h"
-    "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
-    "${AOM_ROOT}/aom_ports/msvc.h"
-    "${AOM_ROOT}/aom_ports/system_state.h")
+list(APPEND AOM_PORTS_INCLUDES
+            "${AOM_ROOT}/aom_ports/aom_once.h"
+            "${AOM_ROOT}/aom_ports/aom_timer.h"
+            "${AOM_ROOT}/aom_ports/bitops.h"
+            "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
+            "${AOM_ROOT}/aom_ports/mem.h"
+            "${AOM_ROOT}/aom_ports/mem_ops.h"
+            "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
+            "${AOM_ROOT}/aom_ports/msvc.h"
+            "${AOM_ROOT}/aom_ports/sanitizer.h"
+            "${AOM_ROOT}/aom_ports/system_state.h")
 
-set(AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/emms.asm")
+list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/emms.asm")
 
-set(AOM_PORTS_INCLUDES_X86
-    "${AOM_ROOT}/aom_ports/x86_abi_support.asm")
+list(APPEND AOM_PORTS_INCLUDES_X86 "${AOM_ROOT}/aom_ports/x86_abi_support.asm")
 
-set(AOM_PORTS_SOURCES_ARM
-    "${AOM_ROOT}/aom_ports/arm.h"
-    "${AOM_ROOT}/aom_ports/arm_cpudetect.c")
+list(APPEND AOM_PORTS_SOURCES_ARM "${AOM_ROOT}/aom_ports/arm.h"
+            "${AOM_ROOT}/aom_ports/arm_cpudetect.c")
+
+list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h"
+            "${AOM_ROOT}/aom_ports/ppc_cpudetect.c")
 
 # For arm and x86 targets:
-#   Creates the aom_ports build target, adds the includes in aom_ports to the
+#
+# * Creates the aom_ports build target, adds the includes in aom_ports to the
 #   target, and makes libaom depend on it.
+#
 # Otherwise:
-#   Adds the includes in aom_ports to the libaom target.
+#
+# * Adds the includes in aom_ports to the libaom target.
+#
 # For all target platforms:
-#   The libaom target must exist before this function is called.
-function (setup_aom_ports_targets)
-  if ("${AOM_TARGET_CPU}" MATCHES "^x86")
+#
+# * The libaom target must exist before this function is called.
+function(setup_aom_ports_targets)
+  if("${AOM_TARGET_CPU}" MATCHES "^x86")
     add_asm_library("aom_ports" "AOM_PORTS_ASM_X86" "aom")
     set(aom_ports_has_symbols 1)
-  elseif ("${AOM_TARGET_CPU}" MATCHES "arm")
+  elseif("${AOM_TARGET_CPU}" MATCHES "arm")
     add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_ARM})
     set(aom_ports_has_symbols 1)
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
-  endif ()
+  elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
+    add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC})
+    set(aom_ports_has_symbols 1)
+    target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
+  endif()
 
-  if (aom_ports_has_symbols)
+  if(aom_ports_has_symbols)
     target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES})
 
-    if ("${AOM_TARGET_CPU}" STREQUAL "x86" OR
-        "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+    if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL
+       "x86_64")
       target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES_X86})
-    endif ()
+    endif()
 
     set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
-  else ()
+  else()
     target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES})
 
-    if ("${AOM_TARGET_CPU}" STREQUAL "x86" OR
-        "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+    if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL
+       "x86_64")
       target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES_X86})
-    endif ()
-  endif ()
-endfunction ()
-
-endif ()  # AOM_AOM_PORTS_AOM_PORTS_CMAKE_
+    endif()
+  endif()
+endfunction()
diff --git a/third_party/aom/aom_ports/aom_ports.mk b/third_party/aom/aom_ports/aom_ports.mk
deleted file mode 100644
index 1f18f6bd1..000000000
--- a/third_party/aom/aom_ports/aom_ports.mk
+++ /dev/null
@@ -1,29 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-
-PORTS_SRCS-yes += aom_ports.mk
-
-PORTS_SRCS-yes += bitops.h
-PORTS_SRCS-yes += mem.h
-PORTS_SRCS-yes += msvc.h
-PORTS_SRCS-yes += system_state.h
-PORTS_SRCS-yes += aom_timer.h
-
-ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
-PORTS_SRCS-yes += emms.asm
-PORTS_SRCS-yes += x86.h
-PORTS_SRCS-yes += x86_abi_support.asm
-endif
-
-PORTS_SRCS-$(ARCH_ARM) += arm_cpudetect.c
-PORTS_SRCS-$(ARCH_ARM) += arm.h
diff --git a/third_party/aom/aom_ports/aom_timer.h b/third_party/aom/aom_ports/aom_timer.h
index 904f2fe51..c719ec677 100644
--- a/third_party/aom/aom_ports/aom_timer.h
+++ b/third_party/aom/aom_ports/aom_timer.h
@@ -12,7 +12,7 @@
 #ifndef AOM_PORTS_AOM_TIMER_H_
 #define AOM_PORTS_AOM_TIMER_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
 
@@ -82,7 +82,7 @@ static INLINE int64_t aom_usec_timer_elapsed(struct aom_usec_timer *t) {
   struct timeval diff;
 
   timersub(&t->end, &t->begin, &diff);
-  return diff.tv_sec * 1000000 + diff.tv_usec;
+  return ((int64_t)diff.tv_sec) * 1000000 + diff.tv_usec;
 #endif
 }
 
diff --git a/third_party/aom/aom_ports/arm.h b/third_party/aom/aom_ports/arm.h
index 448a70dcc..a1a2ab765 100644
--- a/third_party/aom/aom_ports/arm.h
+++ b/third_party/aom/aom_ports/arm.h
@@ -12,7 +12,8 @@
 #ifndef AOM_PORTS_ARM_H_
 #define AOM_PORTS_ARM_H_
 #include <stdlib.h>
-#include "aom_config.h"
+
+#include "config/aom_config.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/third_party/aom/aom_ports/arm_cpudetect.c b/third_party/aom/aom_ports/arm_cpudetect.c
index 4dd1a1a62..70efee996 100644
--- a/third_party/aom/aom_ports/arm_cpudetect.c
+++ b/third_party/aom/aom_ports/arm_cpudetect.c
@@ -12,7 +12,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "aom_ports/arm.h"
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #ifdef WINAPI_FAMILY
 #include <winapifamily.h>
@@ -50,9 +50,9 @@ int arm_cpu_caps(void) {
     return flags;
   }
   mask = arm_cpu_env_mask();
-#if HAVE_NEON || HAVE_NEON_ASM
+#if HAVE_NEON
   flags |= HAS_NEON;
-#endif /* HAVE_NEON  || HAVE_NEON_ASM */
+#endif /* HAVE_NEON */
   return flags & mask;
 }
 
@@ -73,7 +73,7 @@ int arm_cpu_caps(void) {
  *  instructions via their assembled hex code.
  * All of these instructions should be essentially nops.
  */
-#if HAVE_NEON || HAVE_NEON_ASM
+#if HAVE_NEON
   if (mask & HAS_NEON) {
     __try {
       /*VORR q0,q0,q0*/
@@ -83,7 +83,7 @@ int arm_cpu_caps(void) {
       /*Ignore exception.*/
     }
   }
-#endif /* HAVE_NEON || HAVE_NEON_ASM */
+#endif /* HAVE_NEON */
   return flags & mask;
 }
 
@@ -100,9 +100,9 @@ int arm_cpu_caps(void) {
   mask = arm_cpu_env_mask();
   features = android_getCpuFeatures();
 
-#if HAVE_NEON || HAVE_NEON_ASM
+#if HAVE_NEON
   if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON;
-#endif /* HAVE_NEON || HAVE_NEON_ASM */
+#endif /* HAVE_NEON */
   return flags & mask;
 }
 
@@ -129,7 +129,7 @@ int arm_cpu_caps(void) {
      */
     char buf[512];
     while (fgets(buf, 511, fin) != NULL) {
-#if HAVE_NEON || HAVE_NEON_ASM
+#if HAVE_NEON
       if (memcmp(buf, "Features", 8) == 0) {
         char *p;
         p = strstr(buf, " neon");
@@ -137,7 +137,7 @@ int arm_cpu_caps(void) {
           flags |= HAS_NEON;
         }
       }
-#endif /* HAVE_NEON || HAVE_NEON_ASM */
+#endif /* HAVE_NEON */
     }
     fclose(fin);
   }
diff --git a/third_party/aom/aom_ports/mem.h b/third_party/aom/aom_ports/mem.h
index 500e397c6..0793d82e4 100644
--- a/third_party/aom/aom_ports/mem.h
+++ b/third_party/aom/aom_ports/mem.h
@@ -12,8 +12,8 @@
 #ifndef AOM_PORTS_MEM_H_
 #define AOM_PORTS_MEM_H_
 
-#include "aom_config.h"
 #include "aom/aom_integer.h"
+#include "config/aom_config.h"
 
 #if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C)
 #define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
@@ -61,6 +61,8 @@
 #define ALIGN_POWER_OF_TWO(value, n) \
   (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
 
+#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))
+
 #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
 #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
 
diff --git a/third_party/aom/aom_ports/mem_ops.h b/third_party/aom/aom_ports/mem_ops.h
index 62126755c..ef0ee17ee 100644
--- a/third_party/aom/aom_ports/mem_ops.h
+++ b/third_party/aom/aom_ports/mem_ops.h
@@ -225,5 +225,4 @@ static AOM_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) {
   mem[3] = (MAU_T)((val >> 24) & 0xff);
 }
 /* clang-format on */
-
 #endif  // AOM_PORTS_MEM_OPS_H_
diff --git a/third_party/aom/aom_ports/mem_ops_aligned.h b/third_party/aom/aom_ports/mem_ops_aligned.h
index 8c3ab1cb1..81fe41a63 100644
--- a/third_party/aom/aom_ports/mem_ops_aligned.h
+++ b/third_party/aom/aom_ports/mem_ops_aligned.h
@@ -91,7 +91,8 @@
     *mem = (uint##sz##_t)raw;                                           \
   }
 
-#include "aom_config.h"
+#include "config/aom_config.h"
+
 #if CONFIG_BIG_ENDIAN
 #define mem_get_be_aligned_generic(sz) mem_get_ne_aligned_generic(be, sz)
 #define mem_get_sbe_aligned_generic(sz) mem_get_sne_aligned_generic(be, sz)
diff --git a/third_party/aom/aom_ports/msvc.h b/third_party/aom/aom_ports/msvc.h
index 5a41d29d2..7d2b54028 100644
--- a/third_party/aom/aom_ports/msvc.h
+++ b/third_party/aom/aom_ports/msvc.h
@@ -13,7 +13,7 @@
 #define AOM_PORTS_MSVC_H_
 #ifdef _MSC_VER
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #if _MSC_VER < 1900  // VS2015 provides snprintf
 #define snprintf _snprintf
@@ -60,8 +60,16 @@ static INLINE long lroundf(float x) {
       a,                                                                     \
       _mm_insert_epi16(_mm256_extractf128_si256(a, indx >> 3), d, indx % 8), \
       indx >> 3)
+
+static INLINE int _mm256_extract_epi32(__m256i a, const int i) {
+  return a.m256i_i32[i & 7];
+}
+static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) {
+  __m256i c = a;
+  c.m256i_i32[i & 7] = b;
+  return c;
+}
 #endif  // _MSC_VER <= 1900
 #endif  // HAVE_AVX
-
 #endif  // _MSC_VER
 #endif  // AOM_PORTS_MSVC_H_
diff --git a/third_party/aom/aom_ports/config.h b/third_party/aom/aom_ports/ppc.h
index 462ec66fb..ec487c2bc 100644
--- a/third_party/aom/aom_ports/config.h
+++ b/third_party/aom/aom_ports/ppc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,9 +9,22 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_PORTS_CONFIG_H_
-#define AOM_PORTS_CONFIG_H_
+#ifndef AOM_PORTS_PPC_H_
+#define AOM_PORTS_PPC_H_
+#include <stdlib.h>
 
-#include "aom_config.h"
+#include "config/aom_config.h"
 
-#endif  // AOM_PORTS_CONFIG_H_
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAS_VSX 0x01
+
+int ppc_simd_caps(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_PORTS_PPC_H_
diff --git a/third_party/aom/aom_ports/ppc_cpudetect.c b/third_party/aom/aom_ports/ppc_cpudetect.c
new file mode 100644
index 000000000..82b4f58cc
--- /dev/null
+++ b/third_party/aom/aom_ports/ppc_cpudetect.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <asm/cputable.h>
+#include <linux/auxvec.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/ppc.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+static int cpu_env_flags(int *flags) {
+  char *env;
+  env = getenv("AOM_SIMD_CAPS");
+  if (env && *env) {
+    *flags = (int)strtol(env, NULL, 0);
+    return 0;
+  }
+  *flags = 0;
+  return -1;
+}
+
+static int cpu_env_mask(void) {
+  char *env;
+  env = getenv("AOM_SIMD_CAPS_MASK");
+  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
+
+int ppc_simd_caps(void) {
+  int flags;
+  int mask;
+  int fd;
+  ssize_t count;
+  unsigned int i;
+  uint64_t buf[64];
+
+  // If VPX_SIMD_CAPS is set then allow only those capabilities.
+  if (!cpu_env_flags(&flags)) {
+    return flags;
+  }
+
+  mask = cpu_env_mask();
+
+  fd = open("/proc/self/auxv", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+
+  while ((count = read(fd, buf, sizeof(buf))) > 0) {
+    for (i = 0; i < (count / sizeof(*buf)); i += 2) {
+      if (buf[i] == AT_HWCAP) {
+#if HAVE_VSX
+        if (buf[i + 1] & PPC_FEATURE_HAS_VSX) {
+          flags |= HAS_VSX;
+        }
+#endif  // HAVE_VSX
+        goto out_close;
+      } else if (buf[i] == AT_NULL) {
+        goto out_close;
+      }
+    }
+  }
+out_close:
+  close(fd);
+  return flags & mask;
+}
+#else
+// If there is no RTCD the function pointers are not used and can not be
+// changed.
+int ppc_simd_caps(void) { return 0; }
+#endif  // CONFIG_RUNTIME_CPU_DETECT
diff --git a/third_party/aom/aom_ports/sanitizer.h b/third_party/aom/aom_ports/sanitizer.h
new file mode 100644
index 000000000..d4e197e2f
--- /dev/null
+++ b/third_party/aom/aom_ports/sanitizer.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_PORTS_SANITIZER_H_
+#define AOM_PORTS_SANITIZER_H_
+
+// AddressSanitizer support.
+
+// Define AOM_ADDRESS_SANITIZER if AddressSanitizer is used.
+// Clang.
+#if defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define AOM_ADDRESS_SANITIZER 1
+#endif
+#endif  // defined(__has_feature)
+// GCC.
+#if defined(__SANITIZE_ADDRESS__)
+#define AOM_ADDRESS_SANITIZER 1
+#endif  // defined(__SANITIZE_ADDRESS__)
+
+// Define the macros for AddressSanitizer manual memory poisoning. See
+// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning.
+#if defined(AOM_ADDRESS_SANITIZER)
+#include <sanitizer/asan_interface.h>
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#endif
+
+#endif  // AOM_PORTS_SANITIZER_H_
diff --git a/third_party/aom/aom_ports/system_state.h b/third_party/aom/aom_ports/system_state.h
index 5d40d4cb8..0f2c3d8b5 100644
--- a/third_party/aom/aom_ports/system_state.h
+++ b/third_party/aom/aom_ports/system_state.h
@@ -12,7 +12,7 @@
 #ifndef AOM_PORTS_SYSTEM_STATE_H_
 #define AOM_PORTS_SYSTEM_STATE_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #if ARCH_X86 || ARCH_X86_64
 void aom_reset_mmx_state(void);
diff --git a/third_party/aom/aom_ports/x86.h b/third_party/aom/aom_ports/x86.h
index e5680ca93..b642a57f7 100644
--- a/third_party/aom/aom_ports/x86.h
+++ b/third_party/aom/aom_ports/x86.h
@@ -17,8 +17,8 @@
 #include <intrin.h> /* For __cpuidex, __rdtsc */
 #endif
 
-#include "aom_config.h"
 #include "aom/aom_integer.h"
+#include "config/aom_config.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -162,6 +162,7 @@ static INLINE uint64_t xgetbv(void) {
 #define HAS_SSE4_1 0x20
 #define HAS_AVX 0x40
 #define HAS_AVX2 0x80
+#define HAS_SSE4_2 0x100
 #ifndef BIT
 #define BIT(n) (1 << n)
 #endif
@@ -202,6 +203,8 @@ static INLINE int x86_simd_caps(void) {
 
   if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
 
+  if (reg_ecx & BIT(20)) flags |= HAS_SSE4_2;
+
   // bits 27 (OSXSAVE) & 28 (256-bit AVX)
   if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
     if ((xgetbv() & 0x6) == 0x6) {
diff --git a/third_party/aom/aom_ports/x86_abi_support.asm b/third_party/aom/aom_ports/x86_abi_support.asm
index 6aeee60a0..0e7c26287 100644
--- a/third_party/aom/aom_ports/x86_abi_support.asm
+++ b/third_party/aom/aom_ports/x86_abi_support.asm
@@ -12,7 +12,7 @@
 ;
 
 
-%include "aom_config.asm"
+%include "config/aom_config.asm"
 
 ; 32/64 bit compatibility macros
 ;
diff --git a/third_party/aom/aom_scale/aom_scale.cmake b/third_party/aom/aom_scale/aom_scale.cmake
index 1c00b590e..197dea6bd 100644
--- a/third_party/aom/aom_scale/aom_scale.cmake
+++ b/third_party/aom/aom_scale/aom_scale.cmake
@@ -1,39 +1,38 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_AOM_SCALE_AOM_SCALE_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_SCALE_AOM_SCALE_CMAKE_)
+  return()
+endif() # AOM_AOM_SCALE_AOM_SCALE_CMAKE_
 set(AOM_AOM_SCALE_AOM_SCALE_CMAKE_ 1)
 
-set(AOM_SCALE_SOURCES
-    "${AOM_ROOT}/aom_scale/aom_scale.h"
-    "${AOM_ROOT}/aom_scale/generic/aom_scale.c"
-    "${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
-    "${AOM_ROOT}/aom_scale/generic/yv12config.c"
-    "${AOM_ROOT}/aom_scale/generic/yv12extend.c"
-    "${AOM_ROOT}/aom_scale/yv12config.h")
+list(APPEND AOM_SCALE_SOURCES "${AOM_ROOT}/aom_scale/aom_scale.h"
+            "${AOM_ROOT}/aom_scale/generic/aom_scale.c"
+            "${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
+            "${AOM_ROOT}/aom_scale/generic/yv12config.c"
+            "${AOM_ROOT}/aom_scale/generic/yv12extend.c"
+            "${AOM_ROOT}/aom_scale/yv12config.h")
 
-set(AOM_SCALE_INTRIN_DSPR2
-    "${AOM_ROOT}/aom_scale/mips/dspr2/yv12extend_dspr2.c")
+list(APPEND AOM_SCALE_INTRIN_DSPR2
+            "${AOM_ROOT}/aom_scale/mips/dspr2/yv12extend_dspr2.c")
 
 # Creates the aom_scale build target and makes libaom depend on it. The libaom
 # target must exist before this function is called.
-function (setup_aom_scale_targets)
+function(setup_aom_scale_targets)
   add_library(aom_scale OBJECT ${AOM_SCALE_SOURCES})
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
 
-  if (HAVE_DSPR2)
+  if(HAVE_DSPR2)
     add_intrinsics_object_library("" "dspr2" "aom_scale"
                                   "AOM_SCALE_INTRIN_DSPR2" "aom")
-  endif ()
+  endif()
 
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_scale PARENT_SCOPE)
-endfunction ()
-
-endif ()  # AOM_AOM_SCALE_AOM_SCALE_CMAKE_
+endfunction()
diff --git a/third_party/aom/aom_scale/aom_scale.h b/third_party/aom/aom_scale/aom_scale.h
index 6e089f5aa..a4aef6c65 100644
--- a/third_party/aom/aom_scale/aom_scale.h
+++ b/third_party/aom/aom_scale/aom_scale.h
@@ -18,6 +18,6 @@ extern void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                             unsigned char *temp_area, unsigned char temp_height,
                             unsigned int hscale, unsigned int hratio,
                             unsigned int vscale, unsigned int vratio,
-                            unsigned int interlaced);
+                            unsigned int interlaced, const int num_planes);
 
 #endif  // AOM_SCALE_AOM_SCALE_H_
diff --git a/third_party/aom/aom_scale/aom_scale.mk b/third_party/aom/aom_scale/aom_scale.mk
deleted file mode 100644
index e3a68cfcf..000000000
--- a/third_party/aom/aom_scale/aom_scale.mk
+++ /dev/null
@@ -1,16 +0,0 @@
-SCALE_SRCS-yes += aom_scale.mk
-SCALE_SRCS-yes += yv12config.h
-SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += aom_scale.h
-SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/aom_scale.c
-SCALE_SRCS-yes += generic/yv12config.c
-SCALE_SRCS-yes += generic/yv12extend.c
-SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
-SCALE_SRCS-yes += aom_scale_rtcd.c
-SCALE_SRCS-yes += aom_scale_rtcd.pl
-
-#mips(dspr2)
-SCALE_SRCS-$(HAVE_DSPR2)  += mips/dspr2/yv12extend_dspr2.c
-
-SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes)
-
-$(eval $(call rtcd_h_template,aom_scale_rtcd,aom_scale/aom_scale_rtcd.pl))
diff --git a/third_party/aom/aom_scale/aom_scale_rtcd.c b/third_party/aom/aom_scale/aom_scale_rtcd.c
index dec23735b..08f1a376d 100644
--- a/third_party/aom/aom_scale/aom_scale_rtcd.c
+++ b/third_party/aom/aom_scale/aom_scale_rtcd.c
@@ -8,9 +8,11 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #define RTCD_C
-#include "./aom_scale_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
 #include "aom_ports/aom_once.h"
 
 void aom_scale_rtcd() { once(setup_rtcd_internal); }
diff --git a/third_party/aom/aom_scale/aom_scale_rtcd.pl b/third_party/aom/aom_scale/aom_scale_rtcd.pl
index c44966f96..c5990b1bb 100644
--- a/third_party/aom/aom_scale/aom_scale_rtcd.pl
+++ b/third_party/aom/aom_scale/aom_scale_rtcd.pl
@@ -1,3 +1,13 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
 sub aom_scale_forward_decls() {
 print <<EOF
 struct yv12_buffer_config;
@@ -16,9 +26,9 @@ if (aom_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") {
   add_proto qw/void aom_vertical_band_2_1_scale_i/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
 }
 
-add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf";
+add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
 
-add_proto qw/void aom_yv12_copy_frame/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc";
+add_proto qw/void aom_yv12_copy_frame/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes";
 
 add_proto qw/void aom_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
 
@@ -26,13 +36,17 @@ add_proto qw/void aom_yv12_copy_u/, "const struct yv12_buffer_config *src_bc, st
 
 add_proto qw/void aom_yv12_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc";
 
-if (aom_config("CONFIG_AV1") eq "yes") {
-  add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf";
-  specialize qw/aom_extend_frame_borders dspr2/;
+add_proto qw/void aom_yv12_partial_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend";
 
-  add_proto qw/void aom_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf";
-  specialize qw/aom_extend_frame_inner_borders dspr2/;
+add_proto qw/void aom_yv12_partial_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend";
 
-  add_proto qw/void aom_extend_frame_borders_y/, "struct yv12_buffer_config *ybf";
-}
+add_proto qw/void aom_yv12_partial_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend";
+
+add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
+specialize qw/aom_extend_frame_borders dspr2/;
+
+add_proto qw/void aom_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
+specialize qw/aom_extend_frame_inner_borders dspr2/;
+
+add_proto qw/void aom_extend_frame_borders_y/, "struct yv12_buffer_config *ybf";
 1;
diff --git a/third_party/aom/aom_scale/generic/aom_scale.c b/third_party/aom/aom_scale/generic/aom_scale.c
index d124832b7..206c42c9f 100644
--- a/third_party/aom/aom_scale/generic/aom_scale.c
+++ b/third_party/aom/aom_scale/generic/aom_scale.c
@@ -18,9 +18,10 @@
  ***************************************************************************/
 
 /****************************************************************************
-*  Header Files
-****************************************************************************/
-#include "./aom_scale_rtcd.h"
+ *  Header Files
+ ****************************************************************************/
+#include "config/aom_scale_rtcd.h"
+
 #include "aom_mem/aom_mem.h"
 #include "aom_scale/aom_scale.h"
 #include "aom_scale/yv12config.h"
@@ -475,11 +476,11 @@ void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                      unsigned char *temp_area, unsigned char temp_height,
                      unsigned int hscale, unsigned int hratio,
                      unsigned int vscale, unsigned int vratio,
-                     unsigned int interlaced) {
+                     unsigned int interlaced, const int num_planes) {
   const int dw = (hscale - 1 + src->y_width * hratio) / hscale;
   const int dh = (vscale - 1 + src->y_height * vratio) / vscale;
 
-  for (int plane = 0; plane < 3; ++plane) {
+  for (int plane = 0; plane < num_planes; ++plane) {
     const int is_uv = plane > 0;
     const int plane_dw = dw >> is_uv;
     const int plane_dh = dh >> is_uv;
diff --git a/third_party/aom/aom_scale/generic/gen_scalers.c b/third_party/aom/aom_scale/generic/gen_scalers.c
index 71fa82f30..549e2aa69 100644
--- a/third_party/aom/aom_scale/generic/gen_scalers.c
+++ b/third_party/aom/aom_scale/generic/gen_scalers.c
@@ -9,12 +9,13 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_scale_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
 #include "aom_scale/aom_scale.h"
 #include "aom_mem/aom_mem.h"
 /****************************************************************************
-*  Imports
-****************************************************************************/
+ *  Imports
+ ****************************************************************************/
 
 /****************************************************************************
  *
diff --git a/third_party/aom/aom_scale/generic/yv12config.c b/third_party/aom/aom_scale/generic/yv12config.c
index fce719273..cce915165 100644
--- a/third_party/aom/aom_scale/generic/yv12config.c
+++ b/third_party/aom/aom_scale/generic/yv12config.c
@@ -16,8 +16,8 @@
 #include "aom_scale/yv12config.h"
 
 /****************************************************************************
-*  Exports
-****************************************************************************/
+ *  Exports
+ ****************************************************************************/
 
 /****************************************************************************
  *
@@ -25,7 +25,6 @@
 #define yv12_align_addr(addr, align) \
   (void *)(((size_t)(addr) + ((align)-1)) & (size_t) - (align))
 
-#if CONFIG_AV1
 // TODO(jkoleszar): Maybe replace this with struct aom_image
 
 int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
@@ -33,10 +32,7 @@ int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
     if (ybf->buffer_alloc_sz > 0) {
       aom_free(ybf->buffer_alloc);
     }
-
-#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION
     if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
-#endif
 
     /* buffer_alloc isn't accessed by most functions.  Rather y_buffer,
       u_buffer and v_buffer point to buffer_alloc and are used.  Clear out
@@ -50,10 +46,7 @@ int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
 }
 
 int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
-                             int ss_x, int ss_y,
-#if CONFIG_HIGHBITDEPTH
-                             int use_highbitdepth,
-#endif
+                             int ss_x, int ss_y, int use_highbitdepth,
                              int border, int byte_alignment,
                              aom_codec_frame_buffer_t *fb,
                              aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
@@ -72,12 +65,8 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
     const uint64_t uvplane_size =
         (uv_height + 2 * uv_border_h) * (uint64_t)uv_stride + byte_alignment;
 
-#if CONFIG_HIGHBITDEPTH
     const uint64_t frame_size =
         (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
-#else
-    const uint64_t frame_size = yplane_size + 2 * uvplane_size;
-#endif  // CONFIG_HIGHBITDEPTH
 
     uint8_t *buf = NULL;
 
@@ -147,7 +136,6 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
     ybf->subsampling_y = ss_y;
 
     buf = ybf->buffer_alloc;
-#if CONFIG_HIGHBITDEPTH
     if (use_highbitdepth) {
       // Store uint16 addresses when using 16bit framebuffers
       buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
@@ -155,7 +143,6 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
     } else {
       ybf->flags = 0;
     }
-#endif  // CONFIG_HIGHBITDEPTH
 
     ybf->y_buffer = (uint8_t *)yv12_align_addr(
         buf + (border * y_stride) + border, aom_byte_align);
@@ -167,7 +154,8 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                                        (uv_border_h * uv_stride) + uv_border_w,
                                    aom_byte_align);
 
-#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION
+    ybf->use_external_refernce_buffers = 0;
+
     if (use_highbitdepth) {
       if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
       ybf->y_buffer_8bit = (uint8_t *)aom_memalign(32, (size_t)yplane_size);
@@ -175,7 +163,6 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
     } else {
       assert(!ybf->y_buffer_8bit);
     }
-#endif
 
     ybf->corrupted = 0; /* assume not corrupted by errors */
     return 0;
@@ -184,19 +171,13 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
 }
 
 int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
-                           int ss_x, int ss_y,
-#if CONFIG_HIGHBITDEPTH
-                           int use_highbitdepth,
-#endif
-                           int border, int byte_alignment) {
+                           int ss_x, int ss_y, int use_highbitdepth, int border,
+                           int byte_alignment) {
   if (ybf) {
     aom_free_frame_buffer(ybf);
     return aom_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
-#if CONFIG_HIGHBITDEPTH
-                                    use_highbitdepth,
-#endif
-                                    border, byte_alignment, NULL, NULL, NULL);
+                                    use_highbitdepth, border, byte_alignment,
+                                    NULL, NULL, NULL);
   }
   return -2;
 }
-#endif
diff --git a/third_party/aom/aom_scale/generic/yv12extend.c b/third_party/aom/aom_scale/generic/yv12extend.c
index 8266743cf..ba183520a 100644
--- a/third_party/aom/aom_scale/generic/yv12extend.c
+++ b/third_party/aom/aom_scale/generic/yv12extend.c
@@ -10,8 +10,10 @@
  */
 
 #include <assert.h>
-#include "./aom_config.h"
-#include "./aom_scale_rtcd.h"
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
@@ -57,7 +59,6 @@ static void extend_plane(uint8_t *const src, int src_stride, int width,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
                               int height, int extend_top, int extend_left,
                               int extend_bottom, int extend_right) {
@@ -98,18 +99,17 @@ static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
     dst_ptr2 += src_stride;
   }
 }
-#endif
 
-void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
+void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
+                                     const int num_planes) {
   assert(ybf->border % 2 == 0);
   assert(ybf->y_height - ybf->y_crop_height < 16);
   assert(ybf->y_width - ybf->y_crop_width < 16);
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
-#if CONFIG_HIGHBITDEPTH
   if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (int plane = 0; plane < 3; ++plane) {
+    for (int plane = 0; plane < num_planes; ++plane) {
       const int is_uv = plane > 0;
       const int plane_border = ybf->border >> is_uv;
       extend_plane_high(
@@ -120,8 +120,7 @@ void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
     }
     return;
   }
-#endif
-  for (int plane = 0; plane < 3; ++plane) {
+  for (int plane = 0; plane < num_planes; ++plane) {
     const int is_uv = plane > 0;
     const int plane_border = ybf->border >> is_uv;
     extend_plane(ybf->buffers[plane], ybf->strides[is_uv],
@@ -132,8 +131,8 @@ void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
   }
 }
 
-#if CONFIG_AV1
-static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
+static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size,
+                         const int num_planes) {
   const int ss_x = ybf->uv_width < ybf->y_width;
   const int ss_y = ybf->uv_height < ybf->y_height;
 
@@ -142,9 +141,8 @@ static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
-#if CONFIG_HIGHBITDEPTH
   if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (int plane = 0; plane < 3; ++plane) {
+    for (int plane = 0; plane < num_planes; ++plane) {
       const int is_uv = plane > 0;
       const int top = ext_size >> (is_uv ? ss_y : 0);
       const int left = ext_size >> (is_uv ? ss_x : 0);
@@ -156,8 +154,7 @@ static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
     }
     return;
   }
-#endif
-  for (int plane = 0; plane < 3; ++plane) {
+  for (int plane = 0; plane < num_planes; ++plane) {
     const int is_uv = plane > 0;
     const int top = ext_size >> (is_uv ? ss_y : 0);
     const int left = ext_size >> (is_uv ? ss_x : 0);
@@ -169,15 +166,16 @@ static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
   }
 }
 
-void aom_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
-  extend_frame(ybf, ybf->border);
+void aom_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, const int num_planes) {
+  extend_frame(ybf, ybf->border, num_planes);
 }
 
-void aom_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
+void aom_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf,
+                                      const int num_planes) {
   const int inner_bw = (ybf->border > AOMINNERBORDERINPIXELS)
                            ? AOMINNERBORDERINPIXELS
                            : ybf->border;
-  extend_frame(ybf, inner_bw);
+  extend_frame(ybf, inner_bw, num_planes);
 }
 
 void aom_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) {
@@ -187,7 +185,6 @@ void aom_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) {
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
-#if CONFIG_HIGHBITDEPTH
   if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
     extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
                       ybf->y_crop_height, ext_size, ext_size,
@@ -195,27 +192,23 @@ void aom_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) {
                       ext_size + ybf->y_width - ybf->y_crop_width);
     return;
   }
-#endif
   extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
                ybf->y_crop_height, ext_size, ext_size,
                ext_size + ybf->y_height - ybf->y_crop_height,
                ext_size + ybf->y_width - ybf->y_crop_width);
 }
-#endif  // CONFIG_AV1
 
-#if CONFIG_HIGHBITDEPTH
 static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   memcpy(dst, src, num * sizeof(uint16_t));
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 // Copies the source image into the destination image and updates the
 // destination's UMV borders.
 // Note: The frames are assumed to be identical in size.
 void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc,
-                           YV12_BUFFER_CONFIG *dst_bc) {
+                           YV12_BUFFER_CONFIG *dst_bc, const int num_planes) {
 #if 0
   /* These assertions are valid in the codec, but the libaom-tester uses
    * this code slightly differently.
@@ -224,12 +217,11 @@ void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc,
   assert(src_bc->y_height == dst_bc->y_height);
 #endif
 
-#if CONFIG_HIGHBITDEPTH
   assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) ==
          (dst_bc->flags & YV12_FLAG_HIGHBITDEPTH));
 
   if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (int plane = 0; plane < 3; ++plane) {
+    for (int plane = 0; plane < num_planes; ++plane) {
       const uint8_t *plane_src = src_bc->buffers[plane];
       uint8_t *plane_dst = dst_bc->buffers[plane];
       const int is_uv = plane > 0;
@@ -240,11 +232,10 @@ void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc,
         plane_dst += dst_bc->strides[is_uv];
       }
     }
-    aom_yv12_extend_frame_borders_c(dst_bc);
+    aom_yv12_extend_frame_borders_c(dst_bc, num_planes);
     return;
   }
-#endif
-  for (int plane = 0; plane < 3; ++plane) {
+  for (int plane = 0; plane < num_planes; ++plane) {
     const uint8_t *plane_src = src_bc->buffers[plane];
     uint8_t *plane_dst = dst_bc->buffers[plane];
     const int is_uv = plane > 0;
@@ -255,7 +246,7 @@ void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc,
       plane_dst += dst_bc->strides[is_uv];
     }
   }
-  aom_yv12_extend_frame_borders_c(dst_bc);
+  aom_yv12_extend_frame_borders_c(dst_bc, num_planes);
 }
 
 void aom_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
@@ -264,7 +255,6 @@ void aom_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
   const uint8_t *src = src_ybc->y_buffer;
   uint8_t *dst = dst_ybc->y_buffer;
 
-#if CONFIG_HIGHBITDEPTH
   if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
     const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
     uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
@@ -275,7 +265,6 @@ void aom_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
     }
     return;
   }
-#endif  // CONFIG_HIGHBITDEPTH
 
   for (row = 0; row < src_ybc->y_height; ++row) {
     memcpy(dst, src, src_ybc->y_width);
@@ -290,7 +279,6 @@ void aom_yv12_copy_u_c(const YV12_BUFFER_CONFIG *src_bc,
   const uint8_t *src = src_bc->u_buffer;
   uint8_t *dst = dst_bc->u_buffer;
 
-#if CONFIG_HIGHBITDEPTH
   if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
     const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
     uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
@@ -301,7 +289,6 @@ void aom_yv12_copy_u_c(const YV12_BUFFER_CONFIG *src_bc,
     }
     return;
   }
-#endif  // CONFIG_HIGHBITDEPTH
 
   for (row = 0; row < src_bc->uv_height; ++row) {
     memcpy(dst, src, src_bc->uv_width);
@@ -316,7 +303,6 @@ void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
   const uint8_t *src = src_bc->v_buffer;
   uint8_t *dst = dst_bc->v_buffer;
 
-#if CONFIG_HIGHBITDEPTH
   if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
     const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
     uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
@@ -327,7 +313,6 @@ void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
     }
     return;
   }
-#endif  // CONFIG_HIGHBITDEPTH
 
   for (row = 0; row < src_bc->uv_height; ++row) {
     memcpy(dst, src, src_bc->uv_width);
@@ -335,3 +320,92 @@ void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
     dst += dst_bc->uv_stride;
   }
 }
+
+void aom_yv12_partial_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
+                               YV12_BUFFER_CONFIG *dst_ybc, int hstart,
+                               int hend, int vstart, int vend) {
+  int row;
+  const uint8_t *src = src_ybc->y_buffer;
+  uint8_t *dst = dst_ybc->y_buffer;
+
+  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *src16 =
+        CONVERT_TO_SHORTPTR(src + vstart * src_ybc->y_stride + hstart);
+    uint16_t *dst16 =
+        CONVERT_TO_SHORTPTR(dst + vstart * dst_ybc->y_stride + hstart);
+    for (row = vstart; row < vend; ++row) {
+      memcpy(dst16, src16, (hend - hstart) * sizeof(uint16_t));
+      src16 += src_ybc->y_stride;
+      dst16 += dst_ybc->y_stride;
+    }
+    return;
+  }
+  src = (src + vstart * src_ybc->y_stride + hstart);
+  dst = (dst + vstart * dst_ybc->y_stride + hstart);
+
+  for (row = vstart; row < vend; ++row) {
+    memcpy(dst, src, (hend - hstart));
+    src += src_ybc->y_stride;
+    dst += dst_ybc->y_stride;
+  }
+}
+
+void aom_yv12_partial_copy_u_c(const YV12_BUFFER_CONFIG *src_bc,
+                               YV12_BUFFER_CONFIG *dst_bc, int hstart, int hend,
+                               int vstart, int vend) {
+  int row;
+  const uint8_t *src = src_bc->u_buffer;
+  uint8_t *dst = dst_bc->u_buffer;
+
+  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *src16 =
+        CONVERT_TO_SHORTPTR(src + vstart * src_bc->uv_stride + hstart);
+    uint16_t *dst16 =
+        CONVERT_TO_SHORTPTR(dst + vstart * dst_bc->uv_stride + hstart);
+    for (row = vstart; row < vend; ++row) {
+      memcpy(dst16, src16, (hend - hstart) * sizeof(uint16_t));
+      src16 += src_bc->uv_stride;
+      dst16 += dst_bc->uv_stride;
+    }
+    return;
+  }
+
+  src = (src + vstart * src_bc->uv_stride + hstart);
+  dst = (dst + vstart * dst_bc->uv_stride + hstart);
+
+  for (row = vstart; row < vend; ++row) {
+    memcpy(dst, src, (hend - hstart));
+    src += src_bc->uv_stride;
+    dst += dst_bc->uv_stride;
+  }
+}
+
+void aom_yv12_partial_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
+                               YV12_BUFFER_CONFIG *dst_bc, int hstart, int hend,
+                               int vstart, int vend) {
+  int row;
+  const uint8_t *src = src_bc->v_buffer;
+  uint8_t *dst = dst_bc->v_buffer;
+
+  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *src16 =
+        CONVERT_TO_SHORTPTR(src + vstart * src_bc->uv_stride + hstart);
+    uint16_t *dst16 =
+        CONVERT_TO_SHORTPTR(dst + vstart * dst_bc->uv_stride + hstart);
+    for (row = vstart; row < vend; ++row) {
+      memcpy(dst16, src16, (hend - hstart) * sizeof(uint16_t));
+      src16 += src_bc->uv_stride;
+      dst16 += dst_bc->uv_stride;
+    }
+    return;
+  }
+
+  src = (src + vstart * src_bc->uv_stride + hstart);
+  dst = (dst + vstart * dst_bc->uv_stride + hstart);
+
+  for (row = vstart; row < vend; ++row) {
+    memcpy(dst, src, (hend - hstart));
+    src += src_bc->uv_stride;
+    dst += dst_bc->uv_stride;
+  }
+}
diff --git a/third_party/aom/aom_scale/mips/dspr2/yv12extend_dspr2.c b/third_party/aom/aom_scale/mips/dspr2/yv12extend_dspr2.c
index 51192f7b9..869e594d7 100644
--- a/third_party/aom/aom_scale/mips/dspr2/yv12extend_dspr2.c
+++ b/third_party/aom/aom_scale/mips/dspr2/yv12extend_dspr2.c
@@ -11,7 +11,8 @@
 
 #include <assert.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_scale/yv12config.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_scale/aom_scale.h"
@@ -126,14 +127,16 @@ static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
   extend_plane(ybf->v_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er);
 }
 
-void aom_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf) {
-  extend_frame(ybf, ybf->border);
+void aom_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
+                                    const int num_planes) {
+  extend_frame(ybf, ybf->border, num_planes);
 }
 
-void aom_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf) {
+void aom_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
+                                          const int num_planes) {
   const int inner_bw = (ybf->border > AOMINNERBORDERINPIXELS)
                            ? AOMINNERBORDERINPIXELS
                            : ybf->border;
-  extend_frame(ybf, inner_bw);
+  extend_frame(ybf, inner_bw, num_planes);
 }
 #endif
diff --git a/third_party/aom/aom_scale/yv12config.h b/third_party/aom/aom_scale/yv12config.h
index 0c47f650d..8f1c60069 100644
--- a/third_party/aom/aom_scale/yv12config.h
+++ b/third_party/aom/aom_scale/yv12config.h
@@ -16,22 +16,19 @@
 extern "C" {
 #endif
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_codec.h"
 #include "aom/aom_frame_buffer.h"
 #include "aom/aom_integer.h"
 
-#if CONFIG_EXT_PARTITION
 #define AOMINNERBORDERINPIXELS 160
-#else
-#define AOMINNERBORDERINPIXELS 96
-#endif  // CONFIG_EXT_PARTITION
 #define AOM_INTERP_EXTEND 4
 
 // TODO(jingning): Use unified inter predictor for encoder and
 // decoder during the development process. Revisit the frame border
 // to improve the decoder performance.
-#define AOM_BORDER_IN_PIXELS 160
+#define AOM_BORDER_IN_PIXELS 288
 
 typedef struct yv12_buffer_config {
   union {
@@ -82,12 +79,18 @@ typedef struct yv12_buffer_config {
     uint8_t *buffers[4];
   };
 
-#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION
+  // Indicate whether y_buffer, u_buffer, and v_buffer points to the internally
+  // allocated memory or external buffers.
+  int use_external_refernce_buffers;
+  // This is needed to store y_buffer, u_buffer, and v_buffer when set reference
+  // uses an external refernece, and restore those buffer pointers after the
+  // external reference frame is no longer used.
+  uint8_t *store_buf_adr[3];
+
   // If the frame is stored in a 16-bit buffer, this stores an 8-bit version
   // for use in global motion detection. It is allocated on-demand.
   uint8_t *y_buffer_8bit;
   int buf_8bit_valid;
-#endif
 
   uint8_t *buffer_alloc;
   size_t buffer_alloc_sz;
@@ -96,8 +99,10 @@ typedef struct yv12_buffer_config {
   int subsampling_x;
   int subsampling_y;
   unsigned int bit_depth;
-  aom_color_space_t color_space;
-  aom_transfer_function_t transfer_function;
+  aom_color_primaries_t color_primaries;
+  aom_transfer_characteristics_t transfer_characteristics;
+  aom_matrix_coefficients_t matrix_coefficients;
+  int monochrome;
   aom_chroma_sample_position_t chroma_sample_position;
   aom_color_range_t color_range;
   int render_width;
@@ -110,11 +115,8 @@ typedef struct yv12_buffer_config {
 #define YV12_FLAG_HIGHBITDEPTH 8
 
 int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
-                           int ss_x, int ss_y,
-#if CONFIG_HIGHBITDEPTH
-                           int use_highbitdepth,
-#endif
-                           int border, int byte_alignment);
+                           int ss_x, int ss_y, int use_highbitdepth, int border,
+                           int byte_alignment);
 
 // Updates the yv12 buffer config with the frame buffer. |byte_alignment| must
 // be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not
@@ -124,10 +126,7 @@ int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
 // internally to decode the current frame. Returns 0 on success. Returns < 0
 // on failure.
 int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
-                             int ss_x, int ss_y,
-#if CONFIG_HIGHBITDEPTH
-                             int use_highbitdepth,
-#endif
+                             int ss_x, int ss_y, int use_highbitdepth,
                              int border, int byte_alignment,
                              aom_codec_frame_buffer_t *fb,
                              aom_get_frame_buffer_cb_fn_t cb, void *cb_priv);
diff --git a/third_party/aom/aom_util/aom_thread.c b/third_party/aom/aom_util/aom_thread.c
index 954b8f99c..cae9f5e25 100644
--- a/third_party/aom/aom_util/aom_thread.c
+++ b/third_party/aom/aom_util/aom_thread.c
@@ -16,8 +16,9 @@
 
 #include <assert.h>
 #include <string.h>  // for memset()
-#include "./aom_thread.h"
+
 #include "aom_mem/aom_mem.h"
+#include "aom_util/aom_thread.h"
 
 #if CONFIG_MULTITHREAD
 
diff --git a/third_party/aom/aom_util/aom_thread.h b/third_party/aom/aom_util/aom_thread.h
index 0ae8f2f49..3b22ac70c 100644
--- a/third_party/aom/aom_util/aom_thread.h
+++ b/third_party/aom/aom_util/aom_thread.h
@@ -17,7 +17,7 @@
 #ifndef AOM_THREAD_H_
 #define AOM_THREAD_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -173,6 +173,23 @@ static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
   return !ok;
 }
 
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+  int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+  WakeAllConditionVariable(condition);
+#else
+  while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+    // a thread is waiting in pthread_cond_wait: allow it to be notified
+    ok &= SetEvent(condition->signal_event_);
+    // wait until the event is consumed so the signaler cannot consume
+    // the event via its own pthread_cond_wait.
+    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
+           WAIT_OBJECT_0);
+  }
+#endif
+  return !ok;
+}
+
 static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
                                     pthread_mutex_t *const mutex) {
   int ok;
diff --git a/third_party/aom/aom_util/aom_util.cmake b/third_party/aom/aom_util/aom_util.cmake
index 144d773e3..d4f3bce74 100644
--- a/third_party/aom/aom_util/aom_util.cmake
+++ b/third_party/aom/aom_util/aom_util.cmake
@@ -1,34 +1,28 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_AOM_UTIL_AOM_UTIL_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_UTIL_AOM_UTIL_CMAKE_)
+  return()
+endif() # AOM_AOM_UTIL_AOM_UTIL_CMAKE_
 set(AOM_AOM_UTIL_AOM_UTIL_CMAKE_ 1)
 
-set(AOM_UTIL_SOURCES
-    "${AOM_ROOT}/aom_util/aom_thread.c"
-    "${AOM_ROOT}/aom_util/aom_thread.h"
-    "${AOM_ROOT}/aom_util/endian_inl.h")
-
-if (CONFIG_BITSTREAM_DEBUG)
-  set(AOM_UTIL_SOURCES
-      ${AOM_UTIL_SOURCES}
-      "${AOM_ROOT}/aom_util/debug_util.c"
-      "${AOM_ROOT}/aom_util/debug_util.h")
-endif ()
+list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/aom_thread.c"
+            "${AOM_ROOT}/aom_util/aom_thread.h"
+            "${AOM_ROOT}/aom_util/endian_inl.h"
+            "${AOM_ROOT}/aom_util/debug_util.c"
+            "${AOM_ROOT}/aom_util/debug_util.h")
 
 # Creates the aom_util build target and makes libaom depend on it. The libaom
 # target must exist before this function is called.
-function (setup_aom_util_targets)
+function(setup_aom_util_targets)
   add_library(aom_util OBJECT ${AOM_UTIL_SOURCES})
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_util PARENT_SCOPE)
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_util>)
-endfunction ()
-
-endif ()  # AOM_AOM_UTIL_AOM_UTIL_CMAKE_
+endfunction()
diff --git a/third_party/aom/aom_util/aom_util.mk b/third_party/aom/aom_util/aom_util.mk
deleted file mode 100644
index 14b484a15..000000000
--- a/third_party/aom/aom_util/aom_util.mk
+++ /dev/null
@@ -1,18 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-UTIL_SRCS-yes += aom_util.mk
-UTIL_SRCS-yes += aom_thread.c
-UTIL_SRCS-yes += aom_thread.h
-UTIL_SRCS-$(CONFIG_BITSTREAM_DEBUG) += debug_util.c
-UTIL_SRCS-$(CONFIG_BITSTREAM_DEBUG) += debug_util.h
-UTIL_SRCS-yes += endian_inl.h
diff --git a/third_party/aom/aom_util/debug_util.c b/third_party/aom/aom_util/debug_util.c
index ea73df8da..468c47ed1 100644
--- a/third_party/aom/aom_util/debug_util.c
+++ b/third_party/aom/aom_util/debug_util.c
@@ -14,17 +14,6 @@
 #include <string.h>
 #include "aom_util/debug_util.h"
 
-#define QUEUE_MAX_SIZE 2000000
-static int result_queue[QUEUE_MAX_SIZE];
-static int nsymbs_queue[QUEUE_MAX_SIZE];
-static aom_cdf_prob cdf_queue[QUEUE_MAX_SIZE][16];
-
-static int queue_r = 0;
-static int queue_w = 0;
-static int queue_prev_w = -1;
-static int skip_r = 0;
-static int skip_w = 0;
-
 static int frame_idx_w = 0;
 
 static int frame_idx_r = 0;
@@ -37,6 +26,18 @@ void bitstream_queue_set_frame_read(int frame_idx) { frame_idx_r = frame_idx; }
 
 int bitstream_queue_get_frame_read(void) { return frame_idx_r; }
 
+#if CONFIG_BITSTREAM_DEBUG
+#define QUEUE_MAX_SIZE 2000000
+static int result_queue[QUEUE_MAX_SIZE];
+static int nsymbs_queue[QUEUE_MAX_SIZE];
+static aom_cdf_prob cdf_queue[QUEUE_MAX_SIZE][16];
+
+static int queue_r = 0;
+static int queue_w = 0;
+static int queue_prev_w = -1;
+static int skip_r = 0;
+static int skip_w = 0;
+
 void bitstream_queue_set_skip_write(int skip) { skip_w = skip; }
 
 void bitstream_queue_set_skip_read(int skip) { skip_r = skip; }
@@ -74,3 +75,201 @@ void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs) {
     }
   }
 }
+#endif  // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+static int frame_buf_idx_r = 0;
+static int frame_buf_idx_w = 0;
+static int max_frame_buf_num = 5;
+#define MAX_FRAME_STRIDE 1280
+#define MAX_FRAME_HEIGHT 720
+static uint16_t
+    frame_pre[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT];  // prediction only
+static uint16_t
+    frame_tx[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT];  // prediction + txfm
+static int frame_stride = MAX_FRAME_STRIDE;
+static int frame_height = MAX_FRAME_HEIGHT;
+static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT;
+void mismatch_move_frame_idx_w() {
+  frame_buf_idx_w = (frame_buf_idx_w + 1) % max_frame_buf_num;
+  if (frame_buf_idx_w == frame_buf_idx_r) {
+    printf("frame_buf overflow\n");
+    assert(0);
+  }
+}
+
+void mismatch_reset_frame(int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
+    memset(frame_pre[frame_buf_idx_w][plane], 0,
+           sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size);
+    memset(frame_tx[frame_buf_idx_w][plane], 0,
+           sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size);
+  }
+}
+
+void mismatch_move_frame_idx_r() {
+  if (frame_buf_idx_w == frame_buf_idx_r) {
+    printf("frame_buf underflow\n");
+    assert(0);
+  }
+  frame_buf_idx_r = (frame_buf_idx_r + 1) % max_frame_buf_num;
+}
+
+void mismatch_record_block_pre(const uint8_t *src, int src_stride,
+                               int frame_offset, int plane, int pixel_c,
+                               int pixel_r, int blk_w, int blk_h, int highbd) {
+  if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  for (int r = 0; r < blk_h; ++r) {
+    for (int c = 0; c < blk_w; ++c) {
+      frame_pre[frame_buf_idx_w][plane]
+               [(r + pixel_r) * frame_stride + c + pixel_c] =
+                   src16 ? src16[r * src_stride + c] : src[r * src_stride + c];
+    }
+  }
+#if 0
+  int ref_frame_idx = 3;
+  int ref_frame_offset = 4;
+  int ref_plane = 1;
+  int ref_pixel_c = 162;
+  int ref_pixel_r = 16;
+  if (frame_idx_w == ref_frame_idx && plane == ref_plane &&
+      frame_offset == ref_frame_offset && ref_pixel_c >= pixel_c &&
+      ref_pixel_c < pixel_c + blk_w && ref_pixel_r >= pixel_r &&
+      ref_pixel_r < pixel_r + blk_h) {
+    printf(
+        "\nrecord_block_pre frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w "
+        "%d blk_h %d\n",
+        frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+  }
+#endif
+}
+void mismatch_record_block_tx(const uint8_t *src, int src_stride,
+                              int frame_offset, int plane, int pixel_c,
+                              int pixel_r, int blk_w, int blk_h, int highbd) {
+  if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  for (int r = 0; r < blk_h; ++r) {
+    for (int c = 0; c < blk_w; ++c) {
+      frame_tx[frame_buf_idx_w][plane]
+              [(r + pixel_r) * frame_stride + c + pixel_c] =
+                  src16 ? src16[r * src_stride + c] : src[r * src_stride + c];
+    }
+  }
+#if 0
+  int ref_frame_idx = 3;
+  int ref_frame_offset = 4;
+  int ref_plane = 1;
+  int ref_pixel_c = 162;
+  int ref_pixel_r = 16;
+  if (frame_idx_w == ref_frame_idx && plane == ref_plane && frame_offset == ref_frame_offset &&
+      ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w &&
+      ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) {
+    printf(
+        "\nrecord_block_tx frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w "
+        "%d blk_h %d\n",
+        frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+  }
+#endif
+}
+void mismatch_check_block_pre(const uint8_t *src, int src_stride,
+                              int frame_offset, int plane, int pixel_c,
+                              int pixel_r, int blk_w, int blk_h, int highbd) {
+  if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  int mismatch = 0;
+  for (int r = 0; r < blk_h; ++r) {
+    for (int c = 0; c < blk_w; ++c) {
+      if (frame_pre[frame_buf_idx_r][plane]
+                   [(r + pixel_r) * frame_stride + c + pixel_c] !=
+          (uint16_t)(src16 ? src16[r * src_stride + c]
+                           : src[r * src_stride + c])) {
+        mismatch = 1;
+      }
+    }
+  }
+  if (mismatch) {
+    printf(
+        "\ncheck_block_pre failed frame_idx %d frame_offset %d plane %d "
+        "pixel_c %d pixel_r "
+        "%d blk_w %d blk_h %d\n",
+        frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+    printf("enc\n");
+    for (int rr = 0; rr < blk_h; ++rr) {
+      for (int cc = 0; cc < blk_w; ++cc) {
+        printf("%d ", frame_pre[frame_buf_idx_r][plane]
+                               [(rr + pixel_r) * frame_stride + cc + pixel_c]);
+      }
+      printf("\n");
+    }
+
+    printf("dec\n");
+    for (int rr = 0; rr < blk_h; ++rr) {
+      for (int cc = 0; cc < blk_w; ++cc) {
+        printf("%d ",
+               src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]);
+      }
+      printf("\n");
+    }
+    assert(0);
+  }
+}
+void mismatch_check_block_tx(const uint8_t *src, int src_stride,
+                             int frame_offset, int plane, int pixel_c,
+                             int pixel_r, int blk_w, int blk_h, int highbd) {
+  if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  int mismatch = 0;
+  for (int r = 0; r < blk_h; ++r) {
+    for (int c = 0; c < blk_w; ++c) {
+      if (frame_tx[frame_buf_idx_r][plane]
+                  [(r + pixel_r) * frame_stride + c + pixel_c] !=
+          (uint16_t)(src16 ? src16[r * src_stride + c]
+                           : src[r * src_stride + c])) {
+        mismatch = 1;
+      }
+    }
+  }
+  if (mismatch) {
+    printf(
+        "\ncheck_block_tx failed frame_idx %d frame_offset %d plane %d pixel_c "
+        "%d pixel_r "
+        "%d blk_w %d blk_h %d\n",
+        frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+    printf("enc\n");
+    for (int rr = 0; rr < blk_h; ++rr) {
+      for (int cc = 0; cc < blk_w; ++cc) {
+        printf("%d ", frame_tx[frame_buf_idx_r][plane]
+                              [(rr + pixel_r) * frame_stride + cc + pixel_c]);
+      }
+      printf("\n");
+    }
+
+    printf("dec\n");
+    for (int rr = 0; rr < blk_h; ++rr) {
+      for (int cc = 0; cc < blk_w; ++cc) {
+        printf("%d ",
+               src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]);
+      }
+      printf("\n");
+    }
+    assert(0);
+  }
+}
+#endif  // CONFIG_MISMATCH_DEBUG
diff --git a/third_party/aom/aom_util/debug_util.h b/third_party/aom/aom_util/debug_util.h
index 3740620a2..4096801db 100644
--- a/third_party/aom/aom_util/debug_util.h
+++ b/third_party/aom/aom_util/debug_util.h
@@ -12,13 +12,20 @@
 #ifndef AOM_UTIL_DEBUG_UTIL_H_
 #define AOM_UTIL_DEBUG_UTIL_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+void bitstream_queue_set_frame_write(int frame_idx);
+int bitstream_queue_get_frame_write(void);
+void bitstream_queue_set_frame_read(int frame_idx);
+int bitstream_queue_get_frame_read(void);
+
+#if CONFIG_BITSTREAM_DEBUG
 /* This is a debug tool used to detect bitstream error. On encoder side, it
  * pushes each bit and probability into a queue before the bit is written into
  * the Arithmetic coder. On decoder side, whenever a bit is read out from the
@@ -35,10 +42,25 @@ void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs);
 void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs);
 void bitstream_queue_set_skip_write(int skip);
 void bitstream_queue_set_skip_read(int skip);
-void bitstream_queue_set_frame_write(int frame_idx);
-int bitstream_queue_get_frame_write(void);
-void bitstream_queue_set_frame_read(int frame_idx);
-int bitstream_queue_get_frame_read(void);
+#endif  // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+void mismatch_move_frame_idx_w();
+void mismatch_move_frame_idx_r();
+void mismatch_reset_frame(int num_planes);
+void mismatch_record_block_pre(const uint8_t *src, int src_stride,
+                               int frame_offset, int plane, int pixel_c,
+                               int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_record_block_tx(const uint8_t *src, int src_stride,
+                              int frame_offset, int plane, int pixel_c,
+                              int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_check_block_pre(const uint8_t *src, int src_stride,
+                              int frame_offset, int plane, int pixel_c,
+                              int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_check_block_tx(const uint8_t *src, int src_stride,
+                             int frame_offset, int plane, int pixel_c,
+                             int pixel_r, int blk_w, int blk_h, int highbd);
+#endif  // CONFIG_MISMATCH_DEBUG
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/aom_util/endian_inl.h b/third_party/aom/aom_util/endian_inl.h
index 17a238649..2d2822141 100644
--- a/third_party/aom/aom_util/endian_inl.h
+++ b/third_party/aom/aom_util/endian_inl.h
@@ -15,7 +15,9 @@
 #define AOM_UTIL_ENDIAN_INL_H_
 
 #include <stdlib.h>
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 #if defined(__GNUC__)
diff --git a/third_party/aom/aomdec.c b/third_party/aom/apps/aomdec.c
index e07af353a..6c4d724a4 100644
--- a/third_party/aom/aomdec.c
+++ b/third_party/aom/apps/aomdec.c
@@ -16,7 +16,7 @@
 #include <string.h>
 #include <limits.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #if CONFIG_OS_SUPPORT
 #if HAVE_UNISTD_H
@@ -26,36 +26,36 @@
 #endif
 #endif
 
-#if CONFIG_LIBYUV
-#include "third_party/libyuv/include/libyuv/scale.h"
-#endif
-
-#include "./args.h"
-#include "./ivfdec.h"
-
 #include "aom/aom_decoder.h"
-#include "aom_ports/mem_ops.h"
+#include "aom/aomdx.h"
 #include "aom_ports/aom_timer.h"
+#include "aom_ports/mem_ops.h"
+#include "common/args.h"
+#include "common/ivfdec.h"
+#include "common/md5_utils.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
 
-#if CONFIG_AV1_DECODER
-#include "aom/aomdx.h"
+#if CONFIG_WEBM_IO
+#include "common/webmdec.h"
 #endif
 
-#include "./md5_utils.h"
+#include "common/y4menc.h"
 
-#include "./tools_common.h"
-#if CONFIG_WEBM_IO
-#include "./webmdec.h"
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
 #endif
-#include "./y4menc.h"
 
 static const char *exec_name;
 
 struct AvxDecInputContext {
   struct AvxInputContext *aom_input_ctx;
+  struct ObuDecInputContext *obu_ctx;
   struct WebmInputContext *webm_ctx;
 };
 
+static const arg_def_t help =
+    ARG_DEF(NULL, "help", 0, "Show usage options and exit");
 static const arg_def_t looparg =
     ARG_DEF(NULL, "loops", 1, "Number of times to decode the file");
 static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use");
@@ -83,8 +83,6 @@ static const arg_def_t outputfile =
     ARG_DEF("o", "output", 1, "Output file name pattern (see below)");
 static const arg_def_t threadsarg =
     ARG_DEF("t", "threads", 1, "Max threads to use");
-static const arg_def_t frameparallelarg =
-    ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode");
 static const arg_def_t verbosearg =
     ARG_DEF("v", "verbose", 0, "Show version string");
 static const arg_def_t scalearg =
@@ -97,52 +95,36 @@ static const arg_def_t md5arg =
     ARG_DEF(NULL, "md5", 0, "Compute the MD5 sum of the decoded frame");
 static const arg_def_t framestatsarg =
     ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)");
-#if CONFIG_HIGHBITDEPTH
 static const arg_def_t outbitdeptharg =
     ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
-#endif
-#if CONFIG_EXT_TILE
+static const arg_def_t tilem = ARG_DEF(NULL, "tile-mode", 1,
+                                       "Tile coding mode "
+                                       "(0 for normal tile coding mode)");
 static const arg_def_t tiler = ARG_DEF(NULL, "tile-row", 1,
                                        "Row index of tile to decode "
                                        "(-1 for all rows)");
 static const arg_def_t tilec = ARG_DEF(NULL, "tile-column", 1,
                                        "Column index of tile to decode "
                                        "(-1 for all columns)");
-#endif  // CONFIG_EXT_TILE
-
-static const arg_def_t *all_args[] = { &codecarg,
-                                       &use_yv12,
-                                       &use_i420,
-                                       &flipuvarg,
-                                       &rawvideo,
-                                       &noblitarg,
-                                       &progressarg,
-                                       &limitarg,
-                                       &skiparg,
-                                       &postprocarg,
-                                       &summaryarg,
-                                       &outputfile,
-                                       &threadsarg,
-                                       &frameparallelarg,
-                                       &verbosearg,
-                                       &scalearg,
-                                       &fb_arg,
-                                       &md5arg,
-                                       &framestatsarg,
-                                       &continuearg,
-#if CONFIG_HIGHBITDEPTH
-                                       &outbitdeptharg,
-#endif
-#if CONFIG_EXT_TILE
-                                       &tiler,
-                                       &tilec,
-#endif  // CONFIG_EXT_TILE
-                                       NULL };
+static const arg_def_t isannexb =
+    ARG_DEF(NULL, "annexb", 0, "Bitstream is in Annex-B format");
+static const arg_def_t oppointarg = ARG_DEF(
+    NULL, "oppoint", 1, "Select an operating point of a scalable bitstream");
+static const arg_def_t outallarg = ARG_DEF(
+    NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream");
+
+static const arg_def_t *all_args[] = {
+  &help,           &codecarg,   &use_yv12,    &use_i420,      &flipuvarg,
+  &rawvideo,       &noblitarg,  &progressarg, &limitarg,      &skiparg,
+  &postprocarg,    &summaryarg, &outputfile,  &threadsarg,    &verbosearg,
+  &scalearg,       &fb_arg,     &md5arg,      &framestatsarg, &continuearg,
+  &outbitdeptharg, &tilem,      &tiler,       &tilec,         &isannexb,
+  &oppointarg,     &outallarg,  NULL
+};
 
 #if CONFIG_LIBYUV
 static INLINE int libyuv_scale(aom_image_t *src, aom_image_t *dst,
                                FilterModeEnum mode) {
-#if CONFIG_HIGHBITDEPTH
   if (src->fmt == AOM_IMG_FMT_I42016) {
     assert(dst->fmt == AOM_IMG_FMT_I42016);
     return I420Scale_16(
@@ -154,7 +136,6 @@ static INLINE int libyuv_scale(aom_image_t *src, aom_image_t *dst,
         dst->stride[AOM_PLANE_U] / 2, (uint16_t *)dst->planes[AOM_PLANE_V],
         dst->stride[AOM_PLANE_V] / 2, dst->d_w, dst->d_h, mode);
   }
-#endif
   assert(src->fmt == AOM_IMG_FMT_I420);
   assert(dst->fmt == AOM_IMG_FMT_I420);
   return I420Scale(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
@@ -167,36 +148,41 @@ static INLINE int libyuv_scale(aom_image_t *src, aom_image_t *dst,
 }
 #endif
 
-void usage_exit(void) {
-  int i;
+void show_help(FILE *fout, int shorthelp) {
+  fprintf(fout, "Usage: %s <options> filename\n\n", exec_name);
 
-  fprintf(stderr,
-          "Usage: %s <options> filename\n\n"
-          "Options:\n",
-          exec_name);
-  arg_show_usage(stderr, all_args);
-  fprintf(stderr,
+  if (shorthelp) {
+    fprintf(fout, "Use --help to see the full list of options.\n");
+    return;
+  }
+
+  fprintf(fout, "Options:\n");
+  arg_show_usage(fout, all_args);
+  fprintf(fout,
           "\nOutput File Patterns:\n\n"
           "  The -o argument specifies the name of the file(s) to "
           "write to. If the\n  argument does not include any escape "
           "characters, the output will be\n  written to a single file. "
           "Otherwise, the filename will be calculated by\n  expanding "
           "the following escape characters:\n");
-  fprintf(stderr,
+  fprintf(fout,
           "\n\t%%w   - Frame width"
           "\n\t%%h   - Frame height"
           "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)"
           "\n\n  Pattern arguments are only supported in conjunction "
           "with the --yv12 and\n  --i420 options. If the -o option is "
           "not specified, the output will be\n  directed to stdout.\n");
-  fprintf(stderr, "\nIncluded decoders:\n\n");
+  fprintf(fout, "\nIncluded decoders:\n\n");
 
-  for (i = 0; i < get_aom_decoder_count(); ++i) {
+  for (int i = 0; i < get_aom_decoder_count(); ++i) {
     const AvxInterface *const decoder = get_aom_decoder_by_index(i);
-    fprintf(stderr, "    %-6s - %s\n", decoder->name,
+    fprintf(fout, "    %-6s - %s\n", decoder->name,
             aom_codec_iface_name(decoder->codec_interface()));
   }
+}
 
+void usage_exit(void) {
+  show_help(stderr, 1);
   exit(EXIT_FAILURE);
 }
 
@@ -250,14 +236,18 @@ static int read_frame(struct AvxDecInputContext *input, uint8_t **buf,
   switch (input->aom_input_ctx->file_type) {
 #if CONFIG_WEBM_IO
     case FILE_TYPE_WEBM:
-      return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer);
+      return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer,
+                             buffer_size);
 #endif
     case FILE_TYPE_RAW:
       return raw_read_frame(input->aom_input_ctx->file, buf, bytes_in_buffer,
                             buffer_size);
     case FILE_TYPE_IVF:
       return ivf_read_frame(input->aom_input_ctx->file, buf, bytes_in_buffer,
-                            buffer_size);
+                            buffer_size, NULL);
+    case FILE_TYPE_OBU:
+      return obudec_read_temporal_unit(input->obu_ctx, buf, bytes_in_buffer,
+                                       buffer_size);
     default: return 1;
   }
 }
@@ -281,16 +271,12 @@ static void update_image_md5(const aom_image_t *img, const int planes[3],
   }
 }
 
-static void write_image_file(const aom_image_t *img, const int planes[3],
-                             FILE *file) {
+static void write_image_file(const aom_image_t *img, const int *planes,
+                             const int num_planes, FILE *file) {
   int i, y;
-#if CONFIG_HIGHBITDEPTH
   const int bytes_per_sample = ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
-#else
-  const int bytes_per_sample = 1;
-#endif
 
-  for (i = 0; i < 3; ++i) {
+  for (i = 0; i < num_planes; ++i) {
     const int plane = planes[i];
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
@@ -308,6 +294,7 @@ static int file_is_raw(struct AvxInputContext *input) {
   uint8_t buf[32];
   int is_raw = 0;
   aom_codec_stream_info_t si;
+  memset(&si, 0, sizeof(si));
 
   if (fread(buf, 1, 32, input->file) == 32) {
     int i;
@@ -483,14 +470,12 @@ static FILE *open_outfile(const char *name) {
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 static int img_shifted_realloc_required(const aom_image_t *img,
                                         const aom_image_t *shifted,
                                         aom_img_fmt_t required_fmt) {
   return img->d_w != shifted->d_w || img->d_h != shifted->d_h ||
          required_fmt != shifted->fmt;
 }
-#endif
 
 static int main_loop(int argc, const char **argv_) {
   aom_codec_ctx_t decoder;
@@ -501,7 +486,7 @@ static int main_loop(int argc, const char **argv_) {
   size_t bytes_in_buffer = 0, buffer_size = 0;
   FILE *infile;
   int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0;
-  int do_md5 = 0, progress = 0, frame_parallel = 0;
+  int do_md5 = 0, progress = 0;
   int stop_after = 0, postproc = 0, summary = 0, quiet = 1;
   int arg_skip = 0;
   int keep_going = 0;
@@ -515,21 +500,20 @@ static int main_loop(int argc, const char **argv_) {
   int use_y4m = 1;
   int opt_yv12 = 0;
   int opt_i420 = 0;
-  aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH };
-#if CONFIG_HIGHBITDEPTH
+  int opt_raw = 0;
+  aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH, { 1 } };
   unsigned int output_bit_depth = 0;
-#endif
-#if CONFIG_EXT_TILE
+  unsigned int tile_mode = 0;
+  unsigned int is_annexb = 0;
   int tile_row = -1;
   int tile_col = -1;
-#endif  // CONFIG_EXT_TILE
   int frames_corrupted = 0;
   int dec_flags = 0;
   int do_scale = 0;
+  int operating_point = 0;
+  int output_all_layers = 0;
   aom_image_t *scaled_img = NULL;
-#if CONFIG_HIGHBITDEPTH
   aom_image_t *img_shifted = NULL;
-#endif
   int frame_avail, got_data, flush_decoder = 0;
   int num_external_frame_buffers = 0;
   struct ExternalFrameBufferList ext_fb_list = { 0, NULL };
@@ -543,13 +527,18 @@ static int main_loop(int argc, const char **argv_) {
   MD5Context md5_ctx;
   unsigned char md5_digest[16];
 
-  struct AvxDecInputContext input = { NULL, NULL };
+  struct AvxDecInputContext input = { NULL, NULL, NULL };
   struct AvxInputContext aom_input_ctx;
+  memset(&aom_input_ctx, 0, sizeof(aom_input_ctx));
 #if CONFIG_WEBM_IO
   struct WebmInputContext webm_ctx;
-  memset(&(webm_ctx), 0, sizeof(webm_ctx));
+  memset(&webm_ctx, 0, sizeof(webm_ctx));
   input.webm_ctx = &webm_ctx;
 #endif
+  struct ObuDecInputContext obu_ctx = { NULL, NULL, 0, 0, 0 };
+
+  obu_ctx.avx_ctx = &aom_input_ctx;
+  input.obu_ctx = &obu_ctx;
   input.aom_input_ctx = &aom_input_ctx;
 
   /* Parse command line */
@@ -560,7 +549,10 @@ static int main_loop(int argc, const char **argv_) {
     memset(&arg, 0, sizeof(arg));
     arg.argv_step = 1;
 
-    if (arg_match(&arg, &codecarg, argi)) {
+    if (arg_match(&arg, &help, argi)) {
+      show_help(stdout, 0);
+      exit(EXIT_SUCCESS);
+    } else if (arg_match(&arg, &codecarg, argi)) {
       interface = get_aom_decoder_by_name(arg.val);
       if (!interface)
         die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
@@ -572,12 +564,19 @@ static int main_loop(int argc, const char **argv_) {
       use_y4m = 0;
       flipuv = 1;
       opt_yv12 = 1;
+      opt_i420 = 0;
+      opt_raw = 0;
     } else if (arg_match(&arg, &use_i420, argi)) {
       use_y4m = 0;
       flipuv = 0;
+      opt_yv12 = 0;
       opt_i420 = 1;
+      opt_raw = 0;
     } else if (arg_match(&arg, &rawvideo, argi)) {
       use_y4m = 0;
+      opt_yv12 = 0;
+      opt_i420 = 0;
+      opt_raw = 1;
     } else if (arg_match(&arg, &flipuvarg, argi)) {
       flipuv = 1;
     } else if (arg_match(&arg, &noblitarg, argi)) {
@@ -602,32 +601,32 @@ static int main_loop(int argc, const char **argv_) {
       summary = 1;
     } else if (arg_match(&arg, &threadsarg, argi)) {
       cfg.threads = arg_parse_uint(&arg);
-    }
-#if CONFIG_AV1_DECODER
-    else if (arg_match(&arg, &frameparallelarg, argi))
-      frame_parallel = 1;
-#endif
-    else if (arg_match(&arg, &verbosearg, argi))
+    } else if (arg_match(&arg, &verbosearg, argi)) {
       quiet = 0;
-    else if (arg_match(&arg, &scalearg, argi))
+    } else if (arg_match(&arg, &scalearg, argi)) {
       do_scale = 1;
-    else if (arg_match(&arg, &fb_arg, argi))
+    } else if (arg_match(&arg, &fb_arg, argi)) {
       num_external_frame_buffers = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &continuearg, argi))
+    } else if (arg_match(&arg, &continuearg, argi)) {
       keep_going = 1;
-#if CONFIG_HIGHBITDEPTH
-    else if (arg_match(&arg, &outbitdeptharg, argi)) {
+    } else if (arg_match(&arg, &outbitdeptharg, argi)) {
       output_bit_depth = arg_parse_uint(&arg);
-    }
-#endif
-#if CONFIG_EXT_TILE
-    else if (arg_match(&arg, &tiler, argi))
+    } else if (arg_match(&arg, &tilem, argi)) {
+      tile_mode = arg_parse_int(&arg);
+    } else if (arg_match(&arg, &isannexb, argi)) {
+      is_annexb = 1;
+      input.obu_ctx->is_annexb = 1;
+    } else if (arg_match(&arg, &tiler, argi)) {
       tile_row = arg_parse_int(&arg);
-    else if (arg_match(&arg, &tilec, argi))
+    } else if (arg_match(&arg, &tilec, argi)) {
       tile_col = arg_parse_int(&arg);
-#endif  // CONFIG_EXT_TILE
-    else
+    } else if (arg_match(&arg, &oppointarg, argi)) {
+      operating_point = arg_parse_int(&arg);
+    } else if (arg_match(&arg, &outallarg, argi)) {
+      output_all_layers = 1;
+    } else {
       argj++;
+    }
   }
 
   /* Check for unrecognized options */
@@ -640,6 +639,7 @@ static int main_loop(int argc, const char **argv_) {
 
   if (!fn) {
     free(argv);
+    fprintf(stderr, "No input file specified!\n");
     usage_exit();
   }
   /* Open file */
@@ -657,6 +657,7 @@ static int main_loop(int argc, const char **argv_) {
     return EXIT_FAILURE;
   }
 #endif
+  input.aom_input_ctx->filename = fn;
   input.aom_input_ctx->file = infile;
   if (file_is_ivf(input.aom_input_ctx))
     input.aom_input_ctx->file_type = FILE_TYPE_IVF;
@@ -664,6 +665,8 @@ static int main_loop(int argc, const char **argv_) {
   else if (file_is_webm(input.webm_ctx, input.aom_input_ctx))
     input.aom_input_ctx->file_type = FILE_TYPE_WEBM;
 #endif
+  else if (file_is_obu(&obu_ctx))
+    input.aom_input_ctx->file_type = FILE_TYPE_OBU;
   else if (file_is_raw(input.aom_input_ctx))
     input.aom_input_ctx->file_type = FILE_TYPE_RAW;
   else {
@@ -714,8 +717,7 @@ static int main_loop(int argc, const char **argv_) {
 
   if (!interface) interface = get_aom_decoder_by_index(0);
 
-  dec_flags = (postproc ? AOM_CODEC_USE_POSTPROC : 0) |
-              (frame_parallel ? AOM_CODEC_USE_FRAME_THREADING : 0);
+  dec_flags = (postproc ? AOM_CODEC_USE_POSTPROC : 0);
   if (aom_codec_dec_init(&decoder, interface->codec_interface(), &cfg,
                          dec_flags)) {
     fprintf(stderr, "Failed to initialize decoder: %s\n",
@@ -725,7 +727,18 @@ static int main_loop(int argc, const char **argv_) {
 
   if (!quiet) fprintf(stderr, "%s\n", decoder.name);
 
-#if CONFIG_AV1_DECODER && CONFIG_EXT_TILE
+#if CONFIG_AV1_DECODER
+  if (aom_codec_control(&decoder, AV1_SET_TILE_MODE, tile_mode)) {
+    fprintf(stderr, "Failed to set decode_tile_mode: %s\n",
+            aom_codec_error(&decoder));
+    goto fail;
+  }
+
+  if (aom_codec_control(&decoder, AV1D_SET_IS_ANNEXB, is_annexb)) {
+    fprintf(stderr, "Failed to set is_annexb: %s\n", aom_codec_error(&decoder));
+    goto fail;
+  }
+
   if (aom_codec_control(&decoder, AV1_SET_DECODE_TILE_ROW, tile_row)) {
     fprintf(stderr, "Failed to set decode_tile_row: %s\n",
             aom_codec_error(&decoder));
@@ -737,6 +750,19 @@ static int main_loop(int argc, const char **argv_) {
             aom_codec_error(&decoder));
     goto fail;
   }
+
+  if (aom_codec_control(&decoder, AV1D_SET_OPERATING_POINT, operating_point)) {
+    fprintf(stderr, "Failed to set operating_point: %s\n",
+            aom_codec_error(&decoder));
+    goto fail;
+  }
+
+  if (aom_codec_control(&decoder, AV1D_SET_OUTPUT_ALL_LAYERS,
+                        output_all_layers)) {
+    fprintf(stderr, "Failed to set output_all_layers: %s\n",
+            aom_codec_error(&decoder));
+    goto fail;
+  }
 #endif
 
   if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
@@ -778,8 +804,7 @@ static int main_loop(int argc, const char **argv_) {
 
         aom_usec_timer_start(&timer);
 
-        if (aom_codec_decode(&decoder, buf, (unsigned int)bytes_in_buffer, NULL,
-                             0)) {
+        if (aom_codec_decode(&decoder, buf, bytes_in_buffer, NULL)) {
           const char *detail = aom_codec_error_detail(&decoder);
           warn("Failed to decode frame %d: %s", frame_in,
                aom_codec_error(&decoder));
@@ -811,181 +836,178 @@ static int main_loop(int argc, const char **argv_) {
 
     if (flush_decoder) {
       // Flush the decoder in frame parallel decode.
-      if (aom_codec_decode(&decoder, NULL, 0, NULL, 0)) {
+      if (aom_codec_decode(&decoder, NULL, 0, NULL)) {
         warn("Failed to flush decoder: %s", aom_codec_error(&decoder));
       }
     }
 
+    aom_usec_timer_mark(&timer);
+    dx_time += aom_usec_timer_elapsed(&timer);
+
     got_data = 0;
-    if ((img = aom_codec_get_frame(&decoder, &iter))) {
+    while ((img = aom_codec_get_frame(&decoder, &iter))) {
       ++frame_out;
       got_data = 1;
-    }
 
-    aom_usec_timer_mark(&timer);
-    dx_time += (unsigned int)aom_usec_timer_elapsed(&timer);
+      if (aom_codec_control(&decoder, AOMD_GET_FRAME_CORRUPTED, &corrupted)) {
+        warn("Failed AOM_GET_FRAME_CORRUPTED: %s", aom_codec_error(&decoder));
+        if (!keep_going) goto fail;
+      }
+      frames_corrupted += corrupted;
 
-    if (!frame_parallel &&
-        aom_codec_control(&decoder, AOMD_GET_FRAME_CORRUPTED, &corrupted)) {
-      warn("Failed AOM_GET_FRAME_CORRUPTED: %s", aom_codec_error(&decoder));
-      if (!keep_going) goto fail;
-    }
-    frames_corrupted += corrupted;
-
-    if (progress) show_progress(frame_in, frame_out, dx_time);
-
-    if (!noblit && img) {
-      const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V };
-      const int PLANES_YVU[] = { AOM_PLANE_Y, AOM_PLANE_V, AOM_PLANE_U };
-      const int *planes = flipuv ? PLANES_YVU : PLANES_YUV;
-
-      if (do_scale) {
-        if (frame_out == 1) {
-          // If the output frames are to be scaled to a fixed display size then
-          // use the width and height specified in the container. If either of
-          // these is set to 0, use the display size set in the first frame
-          // header. If that is unavailable, use the raw decoded size of the
-          // first decoded frame.
-          int render_width = aom_input_ctx.width;
-          int render_height = aom_input_ctx.height;
-          if (!render_width || !render_height) {
-            int render_size[2];
-            if (aom_codec_control(&decoder, AV1D_GET_DISPLAY_SIZE,
-                                  render_size)) {
-              // As last resort use size of first frame as display size.
-              render_width = img->d_w;
-              render_height = img->d_h;
-            } else {
-              render_width = render_size[0];
-              render_height = render_size[1];
+      if (progress) show_progress(frame_in, frame_out, dx_time);
+
+      if (!noblit) {
+        const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V };
+        const int PLANES_YVU[] = { AOM_PLANE_Y, AOM_PLANE_V, AOM_PLANE_U };
+        const int *planes = flipuv ? PLANES_YVU : PLANES_YUV;
+
+        if (do_scale) {
+          if (frame_out == 1) {
+            // If the output frames are to be scaled to a fixed display size
+            // then use the width and height specified in the container. If
+            // either of these is set to 0, use the display size set in the
+            // first frame header. If that is unavailable, use the raw decoded
+            // size of the first decoded frame.
+            int render_width = aom_input_ctx.width;
+            int render_height = aom_input_ctx.height;
+            if (!render_width || !render_height) {
+              int render_size[2];
+              if (aom_codec_control(&decoder, AV1D_GET_DISPLAY_SIZE,
+                                    render_size)) {
+                // As last resort use size of first frame as display size.
+                render_width = img->d_w;
+                render_height = img->d_h;
+              } else {
+                render_width = render_size[0];
+                render_height = render_size[1];
+              }
             }
+            scaled_img =
+                aom_img_alloc(NULL, img->fmt, render_width, render_height, 16);
+            scaled_img->bit_depth = img->bit_depth;
           }
-          scaled_img =
-              aom_img_alloc(NULL, img->fmt, render_width, render_height, 16);
-          scaled_img->bit_depth = img->bit_depth;
-        }
 
-        if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
+          if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
 #if CONFIG_LIBYUV
-          libyuv_scale(img, scaled_img, kFilterBox);
-          img = scaled_img;
+            libyuv_scale(img, scaled_img, kFilterBox);
+            img = scaled_img;
 #else
-          fprintf(stderr,
-                  "Failed  to scale output frame: %s.\n"
-                  "Scaling is disabled in this configuration. "
-                  "To enable scaling, configure with --enable-libyuv\n",
-                  aom_codec_error(&decoder));
-          goto fail;
+            fprintf(
+                stderr,
+                "Failed to scale output frame: %s.\n"
+                "libyuv is required for scaling but is currently disabled.\n"
+                "Be sure to specify -DCONFIG_LIBYUV=1 when running cmake.\n",
+                aom_codec_error(&decoder));
+            goto fail;
 #endif
-        }
-      }
-#if CONFIG_HIGHBITDEPTH
-      // Default to codec bit depth if output bit depth not set
-      if (!output_bit_depth && single_file && !do_md5) {
-        output_bit_depth = img->bit_depth;
-      }
-      // Shift up or down if necessary
-      if (output_bit_depth != 0) {
-        const aom_img_fmt_t shifted_fmt =
-            output_bit_depth == 8
-                ? img->fmt ^ (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)
-                : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH;
-
-        if (shifted_fmt != img->fmt || output_bit_depth != img->bit_depth) {
-          if (img_shifted &&
-              img_shifted_realloc_required(img, img_shifted, shifted_fmt)) {
-            aom_img_free(img_shifted);
-            img_shifted = NULL;
-          }
-          if (!img_shifted) {
-            img_shifted =
-                aom_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16);
-            img_shifted->bit_depth = output_bit_depth;
           }
-          if (output_bit_depth > img->bit_depth) {
-            aom_img_upshift(img_shifted, img,
-                            output_bit_depth - img->bit_depth);
-          } else {
-            aom_img_downshift(img_shifted, img,
-                              img->bit_depth - output_bit_depth);
+        }
+        // Default to codec bit depth if output bit depth not set
+        if (!output_bit_depth && single_file && !do_md5) {
+          output_bit_depth = img->bit_depth;
+        }
+        // Shift up or down if necessary
+        if (output_bit_depth != 0) {
+          const aom_img_fmt_t shifted_fmt =
+              output_bit_depth == 8
+                  ? img->fmt ^ (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)
+                  : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH;
+
+          if (shifted_fmt != img->fmt || output_bit_depth != img->bit_depth) {
+            if (img_shifted &&
+                img_shifted_realloc_required(img, img_shifted, shifted_fmt)) {
+              aom_img_free(img_shifted);
+              img_shifted = NULL;
+            }
+            if (!img_shifted) {
+              img_shifted =
+                  aom_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16);
+              img_shifted->bit_depth = output_bit_depth;
+              img_shifted->monochrome = img->monochrome;
+            }
+            if (output_bit_depth > img->bit_depth) {
+              aom_img_upshift(img_shifted, img,
+                              output_bit_depth - img->bit_depth);
+            } else {
+              aom_img_downshift(img_shifted, img,
+                                img->bit_depth - output_bit_depth);
+            }
+            img = img_shifted;
           }
-          img = img_shifted;
         }
-      }
-#endif
 
-#if CONFIG_EXT_TILE
-      aom_input_ctx.width = img->d_w;
-      aom_input_ctx.height = img->d_h;
-#endif  // CONFIG_EXT_TILE
-
-      if (single_file) {
-        if (use_y4m) {
-          char y4m_buf[Y4M_BUFFER_SIZE] = { 0 };
-          size_t len = 0;
-          if (img->fmt == AOM_IMG_FMT_I440 || img->fmt == AOM_IMG_FMT_I44016) {
-            fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n");
-            goto fail;
-          }
-          if (frame_out == 1) {
-            // Y4M file header
-            len = y4m_write_file_header(
-                y4m_buf, sizeof(y4m_buf), aom_input_ctx.width,
-                aom_input_ctx.height, &aom_input_ctx.framerate, img->fmt,
-                img->bit_depth);
+        aom_input_ctx.width = img->d_w;
+        aom_input_ctx.height = img->d_h;
+
+        int num_planes = (!use_y4m && opt_raw && img->monochrome) ? 1 : 3;
+
+        if (single_file) {
+          if (use_y4m) {
+            char y4m_buf[Y4M_BUFFER_SIZE] = { 0 };
+            size_t len = 0;
+            if (frame_out == 1) {
+              // Y4M file header
+              len = y4m_write_file_header(
+                  y4m_buf, sizeof(y4m_buf), aom_input_ctx.width,
+                  aom_input_ctx.height, &aom_input_ctx.framerate, img->fmt,
+                  img->bit_depth);
+              if (do_md5) {
+                MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
+              } else {
+                fputs(y4m_buf, outfile);
+              }
+            }
+
+            // Y4M frame header
+            len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf));
             if (do_md5) {
               MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
             } else {
               fputs(y4m_buf, outfile);
             }
-          }
-
-          // Y4M frame header
-          len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf));
-          if (do_md5) {
-            MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
           } else {
-            fputs(y4m_buf, outfile);
-          }
-        } else {
-          if (frame_out == 1) {
-            // Check if --yv12 or --i420 options are consistent with the
-            // bit-stream decoded
-            if (opt_i420) {
-              if (img->fmt != AOM_IMG_FMT_I420 &&
-                  img->fmt != AOM_IMG_FMT_I42016) {
-                fprintf(stderr, "Cannot produce i420 output for bit-stream.\n");
-                goto fail;
+            if (frame_out == 1) {
+              // Check if --yv12 or --i420 options are consistent with the
+              // bit-stream decoded
+              if (opt_i420) {
+                if (img->fmt != AOM_IMG_FMT_I420 &&
+                    img->fmt != AOM_IMG_FMT_I42016) {
+                  fprintf(stderr,
+                          "Cannot produce i420 output for bit-stream.\n");
+                  goto fail;
+                }
               }
-            }
-            if (opt_yv12) {
-              if ((img->fmt != AOM_IMG_FMT_I420 &&
-                   img->fmt != AOM_IMG_FMT_YV12) ||
-                  img->bit_depth != 8) {
-                fprintf(stderr, "Cannot produce yv12 output for bit-stream.\n");
-                goto fail;
+              if (opt_yv12) {
+                if ((img->fmt != AOM_IMG_FMT_I420 &&
+                     img->fmt != AOM_IMG_FMT_YV12) ||
+                    img->bit_depth != 8) {
+                  fprintf(stderr,
+                          "Cannot produce yv12 output for bit-stream.\n");
+                  goto fail;
+                }
               }
             }
           }
-        }
 
-        if (do_md5) {
-          update_image_md5(img, planes, &md5_ctx);
-        } else {
-          write_image_file(img, planes, outfile);
-        }
-      } else {
-        generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w,
-                          img->d_h, frame_in);
-        if (do_md5) {
-          MD5Init(&md5_ctx);
-          update_image_md5(img, planes, &md5_ctx);
-          MD5Final(md5_digest, &md5_ctx);
-          print_md5(md5_digest, outfile_name);
+          if (do_md5) {
+            update_image_md5(img, planes, &md5_ctx);
+          } else {
+            write_image_file(img, planes, num_planes, outfile);
+          }
         } else {
-          outfile = open_outfile(outfile_name);
-          write_image_file(img, planes, outfile);
-          fclose(outfile);
+          generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w,
+                            img->d_h, frame_in);
+          if (do_md5) {
+            MD5Init(&md5_ctx);
+            update_image_md5(img, planes, &md5_ctx);
+            MD5Final(md5_digest, &md5_ctx);
+            print_md5(md5_digest, outfile_name);
+          } else {
+            outfile = open_outfile(outfile_name);
+            write_image_file(img, planes, num_planes, outfile);
+            fclose(outfile);
+          }
         }
       }
     }
@@ -1024,13 +1046,13 @@ fail2:
   if (input.aom_input_ctx->file_type == FILE_TYPE_WEBM)
     webm_free(input.webm_ctx);
 #endif
+  if (input.aom_input_ctx->file_type == FILE_TYPE_OBU)
+    obudec_free(input.obu_ctx);
 
   if (input.aom_input_ctx->file_type != FILE_TYPE_WEBM) free(buf);
 
   if (scaled_img) aom_img_free(scaled_img);
-#if CONFIG_HIGHBITDEPTH
   if (img_shifted) aom_img_free(img_shifted);
-#endif
 
   for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) {
     free(ext_fb_list.ext_fb[i].data);
diff --git a/third_party/aom/aomenc.c b/third_party/aom/apps/aomenc.c
index 43ad4bf03..db0910220 100644
--- a/third_party/aom/aomenc.c
+++ b/third_party/aom/apps/aomenc.c
@@ -9,8 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aomenc.h"
-#include "./aom_config.h"
+#include "apps/aomenc.h"
+
+#include "config/aom_config.h"
 
 #include <assert.h>
 #include <limits.h>
@@ -20,38 +21,34 @@
 #include <stdlib.h>
 #include <string.h>
 
-#if CONFIG_LIBYUV
-#include "third_party/libyuv/include/libyuv/scale.h"
-#endif
-
-#include "aom/aom_encoder.h"
 #if CONFIG_AV1_DECODER
 #include "aom/aom_decoder.h"
-#endif
-
-#include "./args.h"
-#include "./ivfenc.h"
-#include "./tools_common.h"
-#include "examples/encoder_util.h"
-
-#if CONFIG_AV1_ENCODER
-#include "aom/aomcx.h"
-#endif
-#if CONFIG_AV1_DECODER
 #include "aom/aomdx.h"
 #endif
 
-#include "./aomstats.h"
-#include "./rate_hist.h"
-#include "./warnings.h"
+#include "aom/aom_encoder.h"
 #include "aom/aom_integer.h"
+#include "aom/aomcx.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem_ops.h"
+#include "common/args.h"
+#include "common/ivfenc.h"
+#include "common/tools_common.h"
+#include "common/warnings.h"
+
 #if CONFIG_WEBM_IO
-#include "./webmenc.h"
+#include "common/webmenc.h"
+#endif
+
+#include "common/y4minput.h"
+#include "examples/encoder_util.h"
+#include "stats/aomstats.h"
+#include "stats/rate_hist.h"
+
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
 #endif
-#include "./y4minput.h"
 
 /* Swallow warnings about unused results of fread/fwrite */
 static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
@@ -126,6 +123,8 @@ static int fourcc_is_ivf(const char detect[4]) {
   return 0;
 }
 
+static const arg_def_t help =
+    ARG_DEF(NULL, "help", 0, "Show usage options and exit");
 static const arg_def_t debugmode =
     ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)");
 static const arg_def_t outputfile =
@@ -138,8 +137,6 @@ static const arg_def_t use_i422 =
     ARG_DEF(NULL, "i422", 0, "Input file is I422");
 static const arg_def_t use_i444 =
     ARG_DEF(NULL, "i444", 0, "Input file is I444");
-static const arg_def_t use_i440 =
-    ARG_DEF(NULL, "i440", 0, "Input file is I440");
 static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use");
 static const arg_def_t passes =
     ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
@@ -155,8 +152,6 @@ static const arg_def_t limit =
     ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames");
 static const arg_def_t skip =
     ARG_DEF(NULL, "skip", 1, "Skip the first n input frames");
-static const arg_def_t deadline =
-    ARG_DEF("d", "deadline", 1, "Deadline per frame (usec)");
 static const arg_def_t good_dl =
     ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline");
 static const arg_def_t quietarg =
@@ -165,6 +160,11 @@ static const arg_def_t verbosearg =
     ARG_DEF("v", "verbose", 0, "Show encoder parameters");
 static const arg_def_t psnrarg =
     ARG_DEF(NULL, "psnr", 0, "Show PSNR in status line");
+#if CONFIG_FILEOPTIONS
+static const arg_def_t use_cfg = ARG_DEF("c", "cfg", 1, "Config file to use");
+static const arg_def_t ext_partition =
+    ARG_DEF(NULL, "ext-partition", 1, "corresponds to extended partitions");
+#endif
 
 static const struct arg_enum_list test_decode_enum[] = {
   { "off", TEST_DECODE_OFF },
@@ -179,6 +179,7 @@ static const arg_def_t framerate =
 static const arg_def_t use_webm =
     ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)");
 static const arg_def_t use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF");
+static const arg_def_t use_obu = ARG_DEF(NULL, "obu", 0, "Output OBU");
 static const arg_def_t out_part =
     ARG_DEF("P", "output-partitions", 0,
             "Makes encoder output partitions. Requires IVF output!");
@@ -192,8 +193,6 @@ static const arg_def_t disable_warnings =
 static const arg_def_t disable_warning_prompt =
     ARG_DEF("y", "disable-warning-prompt", 0,
             "Display warnings, but do not prompt user to continue.");
-
-#if CONFIG_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
   { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { "12", AOM_BITS_12 }, { NULL, 0 }
 };
@@ -204,9 +203,11 @@ static const arg_def_t bitdeptharg = ARG_DEF_ENUM(
     bitdepth_enum);
 static const arg_def_t inbitdeptharg =
     ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input");
+static const arg_def_t *main_args[] = { &help,
+#if CONFIG_FILEOPTIONS
+                                        &use_cfg,
 #endif
-
-static const arg_def_t *main_args[] = { &debugmode,
+                                        &debugmode,
                                         &outputfile,
                                         &codecarg,
                                         &passes,
@@ -214,13 +215,13 @@ static const arg_def_t *main_args[] = { &debugmode,
                                         &fpf_name,
                                         &limit,
                                         &skip,
-                                        &deadline,
                                         &good_dl,
                                         &quietarg,
                                         &verbosearg,
                                         &psnrarg,
                                         &use_webm,
                                         &use_ivf,
+                                        &use_obu,
                                         &out_part,
                                         &q_hist_n,
                                         &rate_hist_n,
@@ -237,6 +238,10 @@ static const arg_def_t profile =
     ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use");
 static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width");
 static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height");
+static const arg_def_t forced_max_frame_width = ARG_DEF(
+    NULL, "forced_max_frame_width", 0, "Maximum frame width value to force");
+static const arg_def_t forced_max_frame_height = ARG_DEF(
+    NULL, "forced_max_frame_height", 0, "Maximum frame height value to force");
 #if CONFIG_WEBM_IO
 static const struct arg_enum_list stereo_mode_enum[] = {
   { "mono", STEREO_FORMAT_MONO },
@@ -251,39 +256,41 @@ static const arg_def_t stereo_mode = ARG_DEF_ENUM(
 #endif
 static const arg_def_t timebase = ARG_DEF(
     NULL, "timebase", 1, "Output timestamp precision (fractional seconds)");
-static const arg_def_t error_resilient =
-    ARG_DEF(NULL, "error-resilient", 1, "Enable error resiliency features");
+static const arg_def_t global_error_resilient =
+    ARG_DEF(NULL, "global-error-resilient", 1,
+            "Enable global error resiliency features");
 static const arg_def_t lag_in_frames =
     ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag");
-#if CONFIG_EXT_TILE
 static const arg_def_t large_scale_tile =
     ARG_DEF(NULL, "large-scale-tile", 1,
             "Large scale tile coding (0: off (default), 1: on)");
-#endif  // CONFIG_EXT_TILE
+static const arg_def_t monochrome =
+    ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)");
+static const arg_def_t full_still_picture_hdr = ARG_DEF(
+    NULL, "full-still-picture-hdr", 0, "Use full header for still picture");
 
 static const arg_def_t *global_args[] = { &use_yv12,
                                           &use_i420,
                                           &use_i422,
                                           &use_i444,
-                                          &use_i440,
                                           &usage,
                                           &threads,
                                           &profile,
                                           &width,
                                           &height,
+                                          &forced_max_frame_width,
+                                          &forced_max_frame_height,
 #if CONFIG_WEBM_IO
                                           &stereo_mode,
 #endif
                                           &timebase,
                                           &framerate,
-                                          &error_resilient,
-#if CONFIG_HIGHBITDEPTH
+                                          &global_error_resilient,
                                           &bitdeptharg,
-#endif
                                           &lag_in_frames,
-#if CONFIG_EXT_TILE
                                           &large_scale_tile,
-#endif  // CONFIG_EXT_TILE
+                                          &monochrome,
+                                          &full_still_picture_hdr,
                                           NULL };
 
 static const arg_def_t dropframe_thresh =
@@ -294,7 +301,6 @@ static const arg_def_t resize_denominator =
     ARG_DEF(NULL, "resize-denominator", 1, "Frame resize denominator");
 static const arg_def_t resize_kf_denominator = ARG_DEF(
     NULL, "resize-kf-denominator", 1, "Frame resize keyframe denominator");
-#if CONFIG_FRAME_SUPERRES
 static const arg_def_t superres_mode =
     ARG_DEF(NULL, "superres-mode", 1, "Frame super-resolution mode");
 static const arg_def_t superres_denominator = ARG_DEF(
@@ -307,7 +313,6 @@ static const arg_def_t superres_qthresh = ARG_DEF(
 static const arg_def_t superres_kf_qthresh =
     ARG_DEF(NULL, "superres-kf-qthresh", 1,
             "Frame super-resolution keyframe qindex threshold");
-#endif  // CONFIG_FRAME_SUPERRES
 static const struct arg_enum_list end_usage_enum[] = { { "vbr", AOM_VBR },
                                                        { "cbr", AOM_CBR },
                                                        { "cq", AOM_CQ },
@@ -335,13 +340,11 @@ static const arg_def_t *rc_args[] = { &dropframe_thresh,
                                       &resize_mode,
                                       &resize_denominator,
                                       &resize_kf_denominator,
-#if CONFIG_FRAME_SUPERRES
                                       &superres_mode,
                                       &superres_denominator,
                                       &superres_kf_denominator,
                                       &superres_qthresh,
                                       &superres_kf_qthresh,
-#endif  // CONFIG_FRAME_SUPERRES
                                       &end_usage,
                                       &target_bitrate,
                                       &min_quantizer,
@@ -361,16 +364,22 @@ static const arg_def_t maxsection_pct =
     ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)");
 static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct,
                                               &maxsection_pct, NULL };
-
+static const arg_def_t fwd_kf_enabled =
+    ARG_DEF(NULL, "enable-fwd-kf", 1, "Enable forward reference keyframes");
 static const arg_def_t kf_min_dist =
     ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)");
 static const arg_def_t kf_max_dist =
     ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)");
 static const arg_def_t kf_disabled =
     ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement");
-static const arg_def_t *kf_args[] = { &kf_min_dist, &kf_max_dist, &kf_disabled,
-                                      NULL };
-
+static const arg_def_t *kf_args[] = { &fwd_kf_enabled, &kf_min_dist,
+                                      &kf_max_dist, &kf_disabled, NULL };
+static const arg_def_t sframe_dist =
+    ARG_DEF(NULL, "sframe-dist", 1, "S-Frame interval (frames)");
+static const arg_def_t sframe_mode =
+    ARG_DEF(NULL, "sframe-mode", 1, "S-Frame insertion mode (1..2)");
+static const arg_def_t save_as_annexb =
+    ARG_DEF(NULL, "annexb", 1, "Save as Annex-B");
 static const arg_def_t noise_sens =
     ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)");
 static const arg_def_t sharpness =
@@ -402,41 +411,41 @@ static const arg_def_t max_intra_rate_pct =
 #if CONFIG_AV1_ENCODER
 static const arg_def_t cpu_used_av1 =
     ARG_DEF(NULL, "cpu-used", 1, "CPU Used (0..8)");
-#if CONFIG_EXT_TILE
+static const arg_def_t dev_sf_av1 =
+    ARG_DEF(NULL, "dev-sf", 1, "Dev Speed (0..255)");
 static const arg_def_t single_tile_decoding =
     ARG_DEF(NULL, "single-tile-decoding", 1,
             "Single tile decoding (0: off (default), 1: on)");
-#endif  // CONFIG_EXT_TILE
 static const arg_def_t tile_cols =
     ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
 static const arg_def_t tile_rows =
     ARG_DEF(NULL, "tile-rows", 1,
             "Number of tile rows to use, log2 (set to 0 while threads > 1)");
-#if CONFIG_MAX_TILE
 static const arg_def_t tile_width =
     ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)");
 static const arg_def_t tile_height =
     ARG_DEF(NULL, "tile-height", 1, "Tile heights (command separated)");
-#endif
-#if CONFIG_DEPENDENT_HORZTILES
-static const arg_def_t tile_dependent_rows =
-    ARG_DEF(NULL, "tile-dependent-rows", 1, "Enable dependent Tile rows");
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-static const arg_def_t tile_loopfilter = ARG_DEF(
-    NULL, "tile-loopfilter", 1, "Enable loop filter across tile boundary");
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
 static const arg_def_t lossless =
     ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)");
-#if CONFIG_AOM_QM
+static const arg_def_t enable_cdef =
+    ARG_DEF(NULL, "enable-cdef", 1,
+            "Enable the constrained directional enhancement filter (0: false, "
+            "1: true (default))");
+static const arg_def_t enable_restoration =
+    ARG_DEF(NULL, "enable-restoration", 1,
+            "Enable the loop restoration filter (0: false, "
+            "1: true (default))");
+static const arg_def_t disable_trellis_quant =
+    ARG_DEF(NULL, "disable-trellis-quant", 1,
+            "Disable trellis optimization of quantized coefficients (0: false ("
+            "default) 1: true)");
 static const arg_def_t enable_qm =
     ARG_DEF(NULL, "enable-qm", 1,
             "Enable quantisation matrices (0: false (default), 1: true)");
 static const arg_def_t qm_min = ARG_DEF(
     NULL, "qm-min", 1, "Min quant matrix flatness (0..15), default is 8");
 static const arg_def_t qm_max = ARG_DEF(
-    NULL, "qm-max", 1, "Max quant matrix flatness (0..15), default is 16");
-#endif
+    NULL, "qm-max", 1, "Max quant matrix flatness (0..15), default is 15");
 #if CONFIG_DIST_8X8
 static const arg_def_t enable_dist_8x8 =
     ARG_DEF(NULL, "enable-dist-8x8", 1,
@@ -448,30 +457,42 @@ static const arg_def_t mtu_size =
     ARG_DEF(NULL, "mtu-size", 1,
             "MTU size for a tile group, default is 0 (no MTU targeting), "
             "overrides maximum number of tile groups");
-#if CONFIG_TEMPMV_SIGNALING
-static const arg_def_t disable_tempmv = ARG_DEF(
-    NULL, "disable-tempmv", 1, "Disable temporal mv prediction (default is 0)");
-#endif
+static const struct arg_enum_list timing_info_enum[] = {
+  { "unspecified", AOM_TIMING_UNSPECIFIED },
+  { "constant", AOM_TIMING_EQUAL },
+  { "model", AOM_TIMING_DEC_MODEL },
+  { NULL, 0 }
+};
+static const arg_def_t timing_info =
+    ARG_DEF_ENUM(NULL, "timing-info", 1,
+                 "Signal timing info in the bitstream (model unly works for no "
+                 "hidden frames, no super-res yet):",
+                 timing_info_enum);
+static const arg_def_t film_grain_test =
+    ARG_DEF(NULL, "film-grain-test", 1,
+            "Film grain test vectors (0: none (default), 1: test-1  2: test-2, "
+            "... 16: test-16)");
+static const arg_def_t film_grain_table =
+    ARG_DEF(NULL, "film-grain-table", 1,
+            "Path to file containing film grain parameters");
+static const arg_def_t enable_ref_frame_mvs =
+    ARG_DEF(NULL, "enable-ref-frame-mvs", 1,
+            "Enable temporal mv prediction (default is 1)");
 static const arg_def_t frame_parallel_decoding =
     ARG_DEF(NULL, "frame-parallel", 1,
             "Enable frame parallel decodability features "
             "(0: false (default), 1: true)");
-#if !CONFIG_EXT_DELTA_Q
-static const arg_def_t aq_mode = ARG_DEF(
-    NULL, "aq-mode", 1,
-    "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, "
-    "3: cyclic refresh, 4: delta quant)");
-#else
+static const arg_def_t error_resilient_mode =
+    ARG_DEF(NULL, "error-resilient", 1,
+            "Enable error resilient features "
+            "(0: false (default), 1: true)");
 static const arg_def_t aq_mode = ARG_DEF(
     NULL, "aq-mode", 1,
     "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, "
     "3: cyclic refresh)");
-#endif
-#if CONFIG_EXT_DELTA_Q
 static const arg_def_t deltaq_mode = ARG_DEF(
     NULL, "deltaq-mode", 1,
     "Delta qindex mode (0: off (default), 1: deltaq 2: deltaq + deltalf)");
-#endif
 static const arg_def_t frame_periodic_boost =
     ARG_DEF(NULL, "frame-boost", 1,
             "Enable frame periodic boost (0: off (default), 1: on)");
@@ -486,29 +507,73 @@ static const arg_def_t max_gf_interval = ARG_DEF(
     NULL, "max-gf-interval", 1,
     "max gf/arf frame interval (default 0, indicating in-built behavior)");
 
-static const struct arg_enum_list color_space_enum[] = {
-  { "unknown", AOM_CS_UNKNOWN },     { "bt601", AOM_CS_BT_601 },
-  { "bt709", AOM_CS_BT_709 },        { "smpte170", AOM_CS_SMPTE_170 },
-  { "smpte240", AOM_CS_SMPTE_240 },  { "bt2020ncl", AOM_CS_BT_2020_NCL },
-  { "bt2020cl", AOM_CS_BT_2020_CL }, { "sRGB", AOM_CS_SRGB },
-  { "ICtCp", AOM_CS_ICTCP },         { NULL, 0 }
+static const struct arg_enum_list color_primaries_enum[] = {
+  { "bt709", AOM_CICP_CP_BT_709 },
+  { "unspecified", AOM_CICP_CP_UNSPECIFIED },
+  { "bt601", AOM_CICP_CP_BT_601 },
+  { "bt470m", AOM_CICP_CP_BT_470_M },
+  { "bt470bg", AOM_CICP_CP_BT_470_B_G },
+  { "smpte240", AOM_CICP_CP_SMPTE_240 },
+  { "film", AOM_CICP_CP_GENERIC_FILM },
+  { "bt2020", AOM_CICP_CP_BT_2020 },
+  { "xyz", AOM_CICP_CP_XYZ },
+  { "smpte431", AOM_CICP_CP_SMPTE_431 },
+  { "smpte432", AOM_CICP_CP_SMPTE_432 },
+  { "ebu3213", AOM_CICP_CP_EBU_3213 },
+  { NULL, 0 }
 };
 
-static const arg_def_t input_color_space =
-    ARG_DEF_ENUM(NULL, "color-space", 1,
-                 "The color space of input content:", color_space_enum);
+static const arg_def_t input_color_primaries = ARG_DEF_ENUM(
+    NULL, "color-primaries", 1,
+    "Color primaries (CICP) of input content:", color_primaries_enum);
+
+static const struct arg_enum_list transfer_characteristics_enum[] = {
+  { "unspecified", AOM_CICP_CP_UNSPECIFIED },
+  { "bt709", AOM_CICP_TC_BT_709 },
+  { "bt470m", AOM_CICP_TC_BT_470_M },
+  { "bt470bg", AOM_CICP_TC_BT_470_B_G },
+  { "bt601", AOM_CICP_TC_BT_601 },
+  { "smpte240", AOM_CICP_TC_SMPTE_240 },
+  { "lin", AOM_CICP_TC_LINEAR },
+  { "log100", AOM_CICP_TC_LOG_100 },
+  { "log100sq10", AOM_CICP_TC_LOG_100_SQRT10 },
+  { "iec61966", AOM_CICP_TC_IEC_61966 },
+  { "bt1361", AOM_CICP_TC_BT_1361 },
+  { "srgb", AOM_CICP_TC_SRGB },
+  { "bt2020-10bit", AOM_CICP_TC_BT_2020_10_BIT },
+  { "bt2020-12bit", AOM_CICP_TC_BT_2020_12_BIT },
+  { "smpte2084", AOM_CICP_TC_SMPTE_2084 },
+  { "hlg", AOM_CICP_TC_HLG },
+  { "smpte428", AOM_CICP_TC_SMPTE_428 },
+  { NULL, 0 }
+};
 
-static const struct arg_enum_list transfer_function_enum[] = {
-  { "unknown", AOM_TF_UNKNOWN },
-  { "bt709", AOM_TF_BT_709 },
-  { "pq", AOM_TF_PQ },
-  { "hlg", AOM_TF_HLG },
+static const arg_def_t input_transfer_characteristics =
+    ARG_DEF_ENUM(NULL, "transfer-characteristics", 1,
+                 "Transfer characteristics (CICP) of input content:",
+                 transfer_characteristics_enum);
+
+static const struct arg_enum_list matrix_coefficients_enum[] = {
+  { "identity", AOM_CICP_MC_IDENTITY },
+  { "bt709", AOM_CICP_MC_BT_709 },
+  { "unspecified", AOM_CICP_MC_UNSPECIFIED },
+  { "fcc73", AOM_CICP_MC_FCC },
+  { "bt470bg", AOM_CICP_MC_BT_470_B_G },
+  { "bt601", AOM_CICP_MC_BT_601 },
+  { "smpte240", AOM_CICP_CP_SMPTE_240 },
+  { "ycgco", AOM_CICP_MC_SMPTE_YCGCO },
+  { "bt2020ncl", AOM_CICP_MC_BT_2020_NCL },
+  { "bt2020cl", AOM_CICP_MC_BT_2020_CL },
+  { "smpte2085", AOM_CICP_MC_SMPTE_2085 },
+  { "chromncl", AOM_CICP_MC_CHROMAT_NCL },
+  { "chromcl", AOM_CICP_MC_CHROMAT_CL },
+  { "ictcp", AOM_CICP_MC_ICTCP },
   { NULL, 0 }
 };
 
-static const arg_def_t input_transfer_function = ARG_DEF_ENUM(
-    NULL, "transfer-function", 1,
-    "The transfer function of input content:", transfer_function_enum);
+static const arg_def_t input_matrix_coefficients = ARG_DEF_ENUM(
+    NULL, "matrix-coefficients", 1,
+    "Matrix coefficients (CICP) of input content:", matrix_coefficients_enum);
 
 static const struct arg_enum_list chroma_sample_position_enum[] = {
   { "unknown", AOM_CSP_UNKNOWN },
@@ -530,10 +595,13 @@ static const struct arg_enum_list tune_content_enum[] = {
 
 static const arg_def_t tune_content = ARG_DEF_ENUM(
     NULL, "tune-content", 1, "Tune content type", tune_content_enum);
-#endif
 
-#if CONFIG_AV1_ENCODER
-#if CONFIG_EXT_PARTITION
+static const arg_def_t cdf_update_mode =
+    ARG_DEF(NULL, "cdf-update-mode", 1,
+            "CDF update mode for entropy coding "
+            "(0: no CDF update; 1: update CDF on all frames(default); "
+            "2: selectively update CDF on some frames");
+
 static const struct arg_enum_list superblock_size_enum[] = {
   { "dynamic", AOM_SUPERBLOCK_SIZE_DYNAMIC },
   { "64", AOM_SUPERBLOCK_SIZE_64X64 },
@@ -542,23 +610,15 @@ static const struct arg_enum_list superblock_size_enum[] = {
 };
 static const arg_def_t superblock_size = ARG_DEF_ENUM(
     NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum);
-#endif  // CONFIG_EXT_PARTITION
 
 static const arg_def_t *av1_args[] = { &cpu_used_av1,
+                                       &dev_sf_av1,
                                        &auto_altref,
                                        &sharpness,
                                        &static_thresh,
-#if CONFIG_EXT_TILE
                                        &single_tile_decoding,
-#endif  // CONFIG_EXT_TILE
                                        &tile_cols,
                                        &tile_rows,
-#if CONFIG_DEPENDENT_HORZTILES
-                                       &tile_dependent_rows,
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-                                       &tile_loopfilter,
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
                                        &arnr_maxframes,
                                        &arnr_strength,
                                        &tune_metric,
@@ -567,55 +627,50 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
                                        &max_inter_rate_pct,
                                        &gf_cbr_boost_pct,
                                        &lossless,
-#if CONFIG_AOM_QM
+                                       &enable_cdef,
+                                       &enable_restoration,
+                                       &disable_trellis_quant,
                                        &enable_qm,
                                        &qm_min,
                                        &qm_max,
-#endif
 #if CONFIG_DIST_8X8
                                        &enable_dist_8x8,
 #endif
                                        &frame_parallel_decoding,
+                                       &error_resilient_mode,
                                        &aq_mode,
-#if CONFIG_EXT_DELTA_Q
                                        &deltaq_mode,
-#endif
                                        &frame_periodic_boost,
                                        &noise_sens,
                                        &tune_content,
-                                       &input_color_space,
-                                       &input_transfer_function,
+                                       &cdf_update_mode,
+                                       &input_color_primaries,
+                                       &input_transfer_characteristics,
+                                       &input_matrix_coefficients,
                                        &input_chroma_sample_position,
                                        &min_gf_interval,
                                        &max_gf_interval,
-#if CONFIG_EXT_PARTITION
                                        &superblock_size,
-#endif  // CONFIG_EXT_PARTITION
                                        &num_tg,
                                        &mtu_size,
-#if CONFIG_TEMPMV_SIGNALING
-                                       &disable_tempmv,
-#endif
-#if CONFIG_HIGHBITDEPTH
+                                       &timing_info,
+                                       &film_grain_test,
+                                       &film_grain_table,
+                                       &enable_ref_frame_mvs,
                                        &bitdeptharg,
                                        &inbitdeptharg,
-#endif  // CONFIG_HIGHBITDEPTH
+                                       &sframe_dist,
+                                       &sframe_mode,
+                                       &save_as_annexb,
                                        NULL };
 static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
+                                        AOME_SET_DEVSF,
                                         AOME_SET_ENABLEAUTOALTREF,
                                         AOME_SET_SHARPNESS,
                                         AOME_SET_STATIC_THRESHOLD,
-#if CONFIG_EXT_TILE
                                         AV1E_SET_SINGLE_TILE_DECODING,
-#endif  // CONFIG_EXT_TILE
                                         AV1E_SET_TILE_COLUMNS,
                                         AV1E_SET_TILE_ROWS,
-#if CONFIG_DEPENDENT_HORZTILES
-                                        AV1E_SET_TILE_DEPENDENT_ROWS,
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-                                        AV1E_SET_TILE_LOOPFILTER,
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
                                         AOME_SET_ARNR_MAXFRAMES,
                                         AOME_SET_ARNR_STRENGTH,
                                         AOME_SET_TUNING,
@@ -624,76 +679,87 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                         AV1E_SET_MAX_INTER_BITRATE_PCT,
                                         AV1E_SET_GF_CBR_BOOST_PCT,
                                         AV1E_SET_LOSSLESS,
-#if CONFIG_AOM_QM
+                                        AV1E_SET_ENABLE_CDEF,
+                                        AV1E_SET_ENABLE_RESTORATION,
+                                        AV1E_SET_DISABLE_TRELLIS_QUANT,
                                         AV1E_SET_ENABLE_QM,
                                         AV1E_SET_QM_MIN,
                                         AV1E_SET_QM_MAX,
-#endif
 #if CONFIG_DIST_8X8
                                         AV1E_SET_ENABLE_DIST_8X8,
 #endif
                                         AV1E_SET_FRAME_PARALLEL_DECODING,
+                                        AV1E_SET_ERROR_RESILIENT_MODE,
                                         AV1E_SET_AQ_MODE,
-#if CONFIG_EXT_DELTA_Q
                                         AV1E_SET_DELTAQ_MODE,
-#endif
                                         AV1E_SET_FRAME_PERIODIC_BOOST,
                                         AV1E_SET_NOISE_SENSITIVITY,
                                         AV1E_SET_TUNE_CONTENT,
-                                        AV1E_SET_COLOR_SPACE,
-                                        AV1E_SET_TRANSFER_FUNCTION,
+                                        AV1E_SET_CDF_UPDATE_MODE,
+                                        AV1E_SET_COLOR_PRIMARIES,
+                                        AV1E_SET_TRANSFER_CHARACTERISTICS,
+                                        AV1E_SET_MATRIX_COEFFICIENTS,
                                         AV1E_SET_CHROMA_SAMPLE_POSITION,
                                         AV1E_SET_MIN_GF_INTERVAL,
                                         AV1E_SET_MAX_GF_INTERVAL,
-#if CONFIG_EXT_PARTITION
                                         AV1E_SET_SUPERBLOCK_SIZE,
-#endif  // CONFIG_EXT_PARTITION
                                         AV1E_SET_NUM_TG,
                                         AV1E_SET_MTU,
-#if CONFIG_TEMPMV_SIGNALING
-                                        AV1E_SET_DISABLE_TEMPMV,
-#endif
+                                        AV1E_SET_TIMING_INFO_TYPE,
+                                        AV1E_SET_FILM_GRAIN_TEST_VECTOR,
+                                        AV1E_SET_FILM_GRAIN_TABLE,
+                                        AV1E_SET_ENABLE_REF_FRAME_MVS,
+                                        AV1E_SET_ENABLE_DF,
+                                        AV1E_SET_ENABLE_ORDER_HINT,
+                                        AV1E_SET_ENABLE_JNT_COMP,
+                                        AV1E_SET_ENABLE_SUPERRES,
                                         0 };
-#endif
+#endif  // CONFIG_AV1_ENCODER
 
 static const arg_def_t *no_args[] = { NULL };
 
-void usage_exit(void) {
-  int i;
-  const int num_encoder = get_aom_encoder_count();
-
-  fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n",
+void show_help(FILE *fout, int shorthelp) {
+  fprintf(fout, "Usage: %s <options> -o dst_filename src_filename \n",
           exec_name);
 
-  fprintf(stderr, "\nOptions:\n");
-  arg_show_usage(stderr, main_args);
-  fprintf(stderr, "\nEncoder Global Options:\n");
-  arg_show_usage(stderr, global_args);
-  fprintf(stderr, "\nRate Control Options:\n");
-  arg_show_usage(stderr, rc_args);
-  fprintf(stderr, "\nTwopass Rate Control Options:\n");
-  arg_show_usage(stderr, rc_twopass_args);
-  fprintf(stderr, "\nKeyframe Placement Options:\n");
-  arg_show_usage(stderr, kf_args);
+  if (shorthelp) {
+    fprintf(fout, "Use --help to see the full list of options.\n");
+    return;
+  }
+
+  fprintf(fout, "\nOptions:\n");
+  arg_show_usage(fout, main_args);
+  fprintf(fout, "\nEncoder Global Options:\n");
+  arg_show_usage(fout, global_args);
+  fprintf(fout, "\nRate Control Options:\n");
+  arg_show_usage(fout, rc_args);
+  fprintf(fout, "\nTwopass Rate Control Options:\n");
+  arg_show_usage(fout, rc_twopass_args);
+  fprintf(fout, "\nKeyframe Placement Options:\n");
+  arg_show_usage(fout, kf_args);
 #if CONFIG_AV1_ENCODER
-  fprintf(stderr, "\nAV1 Specific Options:\n");
-  arg_show_usage(stderr, av1_args);
+  fprintf(fout, "\nAV1 Specific Options:\n");
+  arg_show_usage(fout, av1_args);
 #endif
-  fprintf(stderr,
+  fprintf(fout,
           "\nStream timebase (--timebase):\n"
           "  The desired precision of timestamps in the output, expressed\n"
           "  in fractional seconds. Default is 1/1000.\n");
-  fprintf(stderr, "\nIncluded encoders:\n\n");
+  fprintf(fout, "\nIncluded encoders:\n\n");
 
-  for (i = 0; i < num_encoder; ++i) {
+  const int num_encoder = get_aom_encoder_count();
+  for (int i = 0; i < num_encoder; ++i) {
     const AvxInterface *const encoder = get_aom_encoder_by_index(i);
     const char *defstr = (i == (num_encoder - 1)) ? "(default)" : "";
-    fprintf(stderr, "    %-6s - %s %s\n", encoder->name,
+    fprintf(fout, "    %-6s - %s %s\n", encoder->name,
             aom_codec_iface_name(encoder->codec_interface()), defstr);
   }
-  fprintf(stderr, "\n        ");
-  fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n");
+  fprintf(fout, "\n        ");
+  fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n");
+}
 
+void usage_exit(void) {
+  show_help(stderr, 1);
   exit(EXIT_FAILURE);
 }
 
@@ -720,6 +786,8 @@ struct stream_config {
   int arg_ctrls[ARG_CTRL_CNT_MAX][2];
   int arg_ctrl_cnt;
   int write_webm;
+  const char *film_grain_filename;
+  int write_ivf;
   // whether to use 16bit internal buffers
   int use_16bit_internal;
 };
@@ -761,11 +829,15 @@ static void validate_positive_rational(const char *msg,
   if (!rat->den) die("Error: %s has zero denominator\n", msg);
 }
 
-static void parse_global_config(struct AvxEncoderConfig *global, char **argv) {
+static void parse_global_config(struct AvxEncoderConfig *global, int *argc,
+                                char ***argv) {
   char **argi, **argj;
   struct arg arg;
   const int num_encoder = get_aom_encoder_count();
-
+  char **argv_local = (char **)*argv;
+#if CONFIG_FILEOPTIONS
+  int argc_local = *argc;
+#endif
   if (num_encoder < 1) die("Error: no valid encoder available\n");
 
   /* Initialize default parameters */
@@ -773,13 +845,32 @@ static void parse_global_config(struct AvxEncoderConfig *global, char **argv) {
   global->codec = get_aom_encoder_by_index(num_encoder - 1);
   global->passes = 0;
   global->color_type = I420;
-  /* Assign default deadline to good quality */
-  global->deadline = AOM_DL_GOOD_QUALITY;
 
-  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+#if CONFIG_FILEOPTIONS
+  const char *cfg = NULL;
+  int cfg_included = 0;
+#endif
+  for (argi = argj = argv_local; (*argj = *argi); argi += arg.argv_step) {
     arg.argv_step = 1;
 
-    if (arg_match(&arg, &codecarg, argi)) {
+#if CONFIG_FILEOPTIONS
+    if (arg_match(&arg, &use_cfg, argi)) {
+      if (cfg_included) continue;
+      cfg = arg.val;
+
+      arg_cfg(&argc_local, &argv_local, cfg);
+
+      *argj = *argi = *argv_local;
+      argj = argi = argv_local;
+      *argv = argv_local;
+      cfg_included = 1;
+      continue;
+    }
+#endif
+    if (arg_match(&arg, &help, argi)) {
+      show_help(stdout, 0);
+      exit(EXIT_SUCCESS);
+    } else if (arg_match(&arg, &codecarg, argi)) {
       global->codec = get_aom_encoder_by_name(arg.val);
       if (!global->codec)
         die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
@@ -795,10 +886,8 @@ static void parse_global_config(struct AvxEncoderConfig *global, char **argv) {
         die("Error: Invalid pass selected (%d)\n", global->pass);
     } else if (arg_match(&arg, &usage, argi))
       global->usage = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &deadline, argi))
-      global->deadline = arg_parse_uint(&arg);
     else if (arg_match(&arg, &good_dl, argi))
-      global->deadline = AOM_DL_GOOD_QUALITY;
+      warn("Deprecated --good option! Ignoring\n");
     else if (arg_match(&arg, &use_yv12, argi))
       global->color_type = YV12;
     else if (arg_match(&arg, &use_i420, argi))
@@ -807,8 +896,6 @@ static void parse_global_config(struct AvxEncoderConfig *global, char **argv) {
       global->color_type = I422;
     else if (arg_match(&arg, &use_i444, argi))
       global->color_type = I444;
-    else if (arg_match(&arg, &use_i440, argi))
-      global->color_type = I440;
     else if (arg_match(&arg, &quietarg, argi))
       global->quiet = 1;
     else if (arg_match(&arg, &verbosearg, argi))
@@ -947,6 +1034,8 @@ static struct stream_state *new_stream(struct AvxEncoderConfig *global,
 
     /* Initialize remaining stream parameters */
     stream->config.write_webm = 1;
+    stream->config.write_ivf = 0;
+
 #if CONFIG_WEBM_IO
     stream->config.stereo_fmt = STEREO_FORMAT_MONO;
     stream->webm_ctx.last_pts_ns = -1;
@@ -965,6 +1054,27 @@ static struct stream_state *new_stream(struct AvxEncoderConfig *global,
   return stream;
 }
 
+static void set_config_arg_ctrls(struct stream_config *config, int key,
+                                 const struct arg *arg) {
+  int j;
+  if (key == AV1E_SET_FILM_GRAIN_TABLE) {
+    config->film_grain_filename = arg->val;
+    return;
+  }
+
+  /* Point either to the next free element or the first instance of this
+   * control.
+   */
+  for (j = 0; j < config->arg_ctrl_cnt; j++)
+    if (config->arg_ctrls[j][0] == key) break;
+
+  /* Update/insert */
+  assert(j < (int)ARG_CTRL_CNT_MAX);
+  config->arg_ctrls[j][0] = key;
+  config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg);
+  if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++;
+}
+
 static int parse_stream_params(struct AvxEncoderConfig *global,
                                struct stream_state *stream, char **argv) {
   char **argi, **argj;
@@ -1007,6 +1117,11 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
         if (out_fn_len >= 4 &&
             !strcmp(config->out_fn + out_fn_len - 4, ".ivf")) {
           config->write_webm = 0;
+          config->write_ivf = 1;
+        } else if (out_fn_len >= 4 &&
+                   !strcmp(config->out_fn + out_fn_len - 4, ".obu")) {
+          config->write_webm = 0;
+          config->write_ivf = 0;
         }
       }
     } else if (arg_match(&arg, &fpf_name, argi)) {
@@ -1024,6 +1139,10 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
 #endif
     } else if (arg_match(&arg, &use_ivf, argi)) {
       config->write_webm = 0;
+      config->write_ivf = 1;
+    } else if (arg_match(&arg, &use_obu, argi)) {
+      config->write_webm = 0;
+      config->write_ivf = 0;
     } else if (arg_match(&arg, &threads, argi)) {
       config->cfg.g_threads = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &profile, argi)) {
@@ -1032,12 +1151,14 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
       config->cfg.g_w = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &height, argi)) {
       config->cfg.g_h = arg_parse_uint(&arg);
-#if CONFIG_HIGHBITDEPTH
+    } else if (arg_match(&arg, &forced_max_frame_width, argi)) {
+      config->cfg.g_forced_max_frame_width = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &forced_max_frame_height, argi)) {
+      config->cfg.g_forced_max_frame_height = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &bitdeptharg, argi)) {
       config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg);
     } else if (arg_match(&arg, &inbitdeptharg, argi)) {
       config->cfg.g_input_bit_depth = arg_parse_uint(&arg);
-#endif
 #if CONFIG_WEBM_IO
     } else if (arg_match(&arg, &stereo_mode, argi)) {
       config->stereo_fmt = arg_parse_enum_or_int(&arg);
@@ -1045,14 +1166,16 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
     } else if (arg_match(&arg, &timebase, argi)) {
       config->cfg.g_timebase = arg_parse_rational(&arg);
       validate_positive_rational(arg.name, &config->cfg.g_timebase);
-    } else if (arg_match(&arg, &error_resilient, argi)) {
+    } else if (arg_match(&arg, &global_error_resilient, argi)) {
       config->cfg.g_error_resilient = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &lag_in_frames, argi)) {
       config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
-#if CONFIG_EXT_TILE
     } else if (arg_match(&arg, &large_scale_tile, argi)) {
       config->cfg.large_scale_tile = arg_parse_uint(&arg);
-#endif  // CONFIG_EXT_TILE
+    } else if (arg_match(&arg, &monochrome, argi)) {
+      config->cfg.monochrome = 1;
+    } else if (arg_match(&arg, &full_still_picture_hdr, argi)) {
+      config->cfg.full_still_picture_hdr = 1;
     } else if (arg_match(&arg, &dropframe_thresh, argi)) {
       config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &resize_mode, argi)) {
@@ -1061,7 +1184,6 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
       config->cfg.rc_resize_denominator = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &resize_kf_denominator, argi)) {
       config->cfg.rc_resize_kf_denominator = arg_parse_uint(&arg);
-#if CONFIG_FRAME_SUPERRES
     } else if (arg_match(&arg, &superres_mode, argi)) {
       config->cfg.rc_superres_mode = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &superres_denominator, argi)) {
@@ -1072,7 +1194,6 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
       config->cfg.rc_superres_qthresh = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &superres_kf_qthresh, argi)) {
       config->cfg.rc_superres_kf_qthresh = arg_parse_uint(&arg);
-#endif  // CONFIG_FRAME_SUPERRES
     } else if (arg_match(&arg, &end_usage, argi)) {
       config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
     } else if (arg_match(&arg, &target_bitrate, argi)) {
@@ -1105,51 +1226,45 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
 
       if (global->passes < 2)
         warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &fwd_kf_enabled, argi)) {
+      config->cfg.fwd_kf_enabled = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &kf_min_dist, argi)) {
       config->cfg.kf_min_dist = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &kf_max_dist, argi)) {
       config->cfg.kf_max_dist = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &kf_disabled, argi)) {
       config->cfg.kf_mode = AOM_KF_DISABLED;
-#if CONFIG_MAX_TILE
+    } else if (arg_match(&arg, &sframe_dist, argi)) {
+      config->cfg.sframe_dist = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &sframe_mode, argi)) {
+      config->cfg.sframe_mode = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &save_as_annexb, argi)) {
+      config->cfg.save_as_annexb = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &tile_width, argi)) {
       config->cfg.tile_width_count =
           arg_parse_list(&arg, config->cfg.tile_widths, MAX_TILE_WIDTHS);
     } else if (arg_match(&arg, &tile_height, argi)) {
       config->cfg.tile_height_count =
           arg_parse_list(&arg, config->cfg.tile_heights, MAX_TILE_HEIGHTS);
+#if CONFIG_FILEOPTIONS
+    } else if (arg_match(&arg, &ext_partition, argi)) {
+      config->cfg.cfg.ext_partition = !!arg_parse_uint(&arg) > 0;
 #endif
     } else {
       int i, match = 0;
       for (i = 0; ctrl_args[i]; i++) {
         if (arg_match(&arg, ctrl_args[i], argi)) {
-          int j;
           match = 1;
-
-          /* Point either to the next free element or the first
-          * instance of this control.
-          */
-          for (j = 0; j < config->arg_ctrl_cnt; j++)
-            if (ctrl_args_map != NULL &&
-                config->arg_ctrls[j][0] == ctrl_args_map[i])
-              break;
-
-          /* Update/insert */
-          assert(j < (int)ARG_CTRL_CNT_MAX);
-          if (ctrl_args_map != NULL && j < (int)ARG_CTRL_CNT_MAX) {
-            config->arg_ctrls[j][0] = ctrl_args_map[i];
-            config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg);
-            if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++;
+          if (ctrl_args_map) {
+            set_config_arg_ctrls(config, ctrl_args_map[i], &arg);
           }
         }
       }
       if (!match) argj++;
     }
   }
-#if CONFIG_HIGHBITDEPTH
   config->use_16bit_internal =
       config->cfg.g_bit_depth > AOM_BITS_8 || !CONFIG_LOWBITDEPTH;
-#endif
   return eos_mark_found;
 }
 
@@ -1239,12 +1354,10 @@ static const char *image_format_to_string(aom_img_fmt_t f) {
     case AOM_IMG_FMT_I420: return "I420";
     case AOM_IMG_FMT_I422: return "I422";
     case AOM_IMG_FMT_I444: return "I444";
-    case AOM_IMG_FMT_I440: return "I440";
     case AOM_IMG_FMT_YV12: return "YV12";
     case AOM_IMG_FMT_I42016: return "I42016";
     case AOM_IMG_FMT_I42216: return "I42216";
     case AOM_IMG_FMT_I44416: return "I44416";
-    case AOM_IMG_FMT_I44016: return "I44016";
     default: return "Other";
   }
 }
@@ -1281,20 +1394,16 @@ static void show_stream_config(struct stream_state *stream,
   SHOW(g_error_resilient);
   SHOW(g_pass);
   SHOW(g_lag_in_frames);
-#if CONFIG_EXT_TILE
   SHOW(large_scale_tile);
-#endif  // CONFIG_EXT_TILE
   SHOW(rc_dropframe_thresh);
   SHOW(rc_resize_mode);
   SHOW(rc_resize_denominator);
   SHOW(rc_resize_kf_denominator);
-#if CONFIG_FRAME_SUPERRES
   SHOW(rc_superres_mode);
   SHOW(rc_superres_denominator);
   SHOW(rc_superres_kf_denominator);
   SHOW(rc_superres_qthresh);
   SHOW(rc_superres_kf_qthresh);
-#endif  // CONFIG_FRAME_SUPERRES
   SHOW(rc_end_usage);
   SHOW(rc_target_bitrate);
   SHOW(rc_min_quantizer);
@@ -1307,6 +1416,7 @@ static void show_stream_config(struct stream_state *stream,
   SHOW(rc_2pass_vbr_bias_pct);
   SHOW(rc_2pass_vbr_minsection_pct);
   SHOW(rc_2pass_vbr_maxsection_pct);
+  SHOW(fwd_kf_enabled);
   SHOW(kf_mode);
   SHOW(kf_min_dist);
   SHOW(kf_max_dist);
@@ -1337,7 +1447,7 @@ static void open_output_file(struct stream_state *stream,
   (void)pixel_aspect_ratio;
 #endif
 
-  if (!stream->config.write_webm) {
+  if (!stream->config.write_webm && stream->config.write_ivf) {
     ivf_write_file_header(stream->file, cfg, global->codec->fourcc, 0);
   }
 }
@@ -1354,7 +1464,7 @@ static void close_output_file(struct stream_state *stream,
   }
 #endif
 
-  if (!stream->config.write_webm) {
+  if (!stream->config.write_webm && stream->config.write_ivf) {
     if (!fseek(stream->file, 0, SEEK_SET))
       ivf_write_file_header(stream->file, &stream->config.cfg, fourcc,
                             stream->frames_out);
@@ -1407,9 +1517,7 @@ static void initialize_encoder(struct stream_state *stream,
 
   flags |= global->show_psnr ? AOM_CODEC_USE_PSNR : 0;
   flags |= global->out_part ? AOM_CODEC_USE_OUTPUT_PARTITION : 0;
-#if CONFIG_HIGHBITDEPTH
   flags |= stream->config.use_16bit_internal ? AOM_CODEC_USE_HIGHBITDEPTH : 0;
-#endif
 
   /* Construct Encoder Context */
   aom_codec_enc_init(&stream->encoder, global->codec->codec_interface(),
@@ -1428,22 +1536,32 @@ static void initialize_encoder(struct stream_state *stream,
 
     ctx_exit_on_error(&stream->encoder, "Failed to control codec");
   }
+  if (stream->config.film_grain_filename) {
+    aom_codec_control_(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE,
+                       stream->config.film_grain_filename);
+  }
 
 #if CONFIG_AV1_DECODER
   if (global->test_decode != TEST_DECODE_OFF) {
     const AvxInterface *decoder = get_aom_decoder_by_name(global->codec->name);
-    aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH };
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH, { 1 } };
     aom_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0);
 
-#if CONFIG_EXT_TILE
     if (strcmp(global->codec->name, "av1") == 0) {
+      aom_codec_control(&stream->decoder, AV1_SET_TILE_MODE,
+                        stream->config.cfg.large_scale_tile);
+      ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_mode");
+
+      aom_codec_control(&stream->decoder, AV1D_SET_IS_ANNEXB,
+                        stream->config.cfg.save_as_annexb);
+      ctx_exit_on_error(&stream->decoder, "Failed to set is_annexb");
+
       aom_codec_control(&stream->decoder, AV1_SET_DECODE_TILE_ROW, -1);
       ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_row");
 
       aom_codec_control(&stream->decoder, AV1_SET_DECODE_TILE_COL, -1);
       ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_col");
     }
-#endif  // CONFIG_EXT_TILE
   }
 #endif
 }
@@ -1462,8 +1580,7 @@ static void encode_frame(struct stream_state *stream,
       (cfg->g_timebase.den * (int64_t)(frames_in)*global->framerate.den) /
       cfg->g_timebase.num / global->framerate.num;
 
-/* Scale if necessary */
-#if CONFIG_HIGHBITDEPTH
+  /* Scale if necessary */
   if (img) {
     if ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) &&
         (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
@@ -1492,13 +1609,14 @@ static void encode_frame(struct stream_state *stream,
       stream->encoder.err = 1;
       ctx_exit_on_error(&stream->encoder,
                         "Stream %d: Failed to encode frame.\n"
-                        "Scaling disabled in this configuration. \n"
-                        "To enable, configure with --enable-libyuv\n",
+                        "libyuv is required for scaling but is currently "
+                        "disabled.\n"
+                        "Be sure to specify -DCONFIG_LIBYUV=1 when running "
+                        "cmake.\n",
                         stream->index);
 #endif
     }
   }
-#endif
   if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
     if (img->fmt != AOM_IMG_FMT_I420 && img->fmt != AOM_IMG_FMT_YV12) {
       fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
@@ -1529,8 +1647,7 @@ static void encode_frame(struct stream_state *stream,
 
   aom_usec_timer_start(&timer);
   aom_codec_encode(&stream->encoder, img, frame_start,
-                   (unsigned long)(next_frame_start - frame_start), 0,
-                   global->deadline);
+                   (uint32_t)(next_frame_start - frame_start), 0);
   aom_usec_timer_mark(&timer);
   stream->cx_time += aom_usec_timer_elapsed(&timer);
   ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame",
@@ -1573,19 +1690,21 @@ static void get_cx_data(struct stream_state *stream,
         }
 #endif
         if (!stream->config.write_webm) {
-          if (pkt->data.frame.partition_id <= 0) {
-            ivf_header_pos = ftello(stream->file);
-            fsize = pkt->data.frame.sz;
-
-            ivf_write_frame_header(stream->file, pkt->data.frame.pts, fsize);
-          } else {
-            fsize += pkt->data.frame.sz;
-
-            if (!(pkt->data.frame.flags & AOM_FRAME_IS_FRAGMENT)) {
-              const FileOffset currpos = ftello(stream->file);
-              fseeko(stream->file, ivf_header_pos, SEEK_SET);
-              ivf_write_frame_size(stream->file, fsize);
-              fseeko(stream->file, currpos, SEEK_SET);
+          if (stream->config.write_ivf) {
+            if (pkt->data.frame.partition_id <= 0) {
+              ivf_header_pos = ftello(stream->file);
+              fsize = pkt->data.frame.sz;
+
+              ivf_write_frame_header(stream->file, pkt->data.frame.pts, fsize);
+            } else {
+              fsize += pkt->data.frame.sz;
+
+              if (!(pkt->data.frame.flags & AOM_FRAME_IS_FRAGMENT)) {
+                const FileOffset currpos = ftello(stream->file);
+                fseeko(stream->file, ivf_header_pos, SEEK_SET);
+                ivf_write_frame_size(stream->file, fsize);
+                fseeko(stream->file, currpos, SEEK_SET);
+              }
             }
           }
 
@@ -1598,7 +1717,7 @@ static void get_cx_data(struct stream_state *stream,
 #if CONFIG_AV1_DECODER
         if (global->test_decode != TEST_DECODE_OFF && !stream->mismatch_seen) {
           aom_codec_decode(&stream->decoder, pkt->data.frame.buf,
-                           (unsigned int)pkt->data.frame.sz, NULL, 0);
+                           pkt->data.frame.sz, NULL);
           if (stream->decoder.err) {
             warn_or_exit_on_error(&stream->decoder,
                                   global->test_decode == TEST_DECODE_FATAL,
@@ -1643,7 +1762,7 @@ static void get_cx_data(struct stream_state *stream,
   }
 }
 
-static void show_psnr(struct stream_state *stream, double peak) {
+static void show_psnr(struct stream_state *stream, double peak, int64_t bps) {
   int i;
   double ovpsnr;
 
@@ -1657,6 +1776,10 @@ static void show_psnr(struct stream_state *stream, double peak) {
   for (i = 0; i < 4; i++) {
     fprintf(stderr, " %.3f", stream->psnr_totals[i] / stream->psnr_count);
   }
+  if (bps > 0) {
+    fprintf(stderr, " %7" PRId64 " bps", bps);
+  }
+  fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000);
   fprintf(stderr, "\n");
 }
 
@@ -1674,7 +1797,6 @@ static void test_decode(struct stream_state *stream,
   aom_codec_control(&stream->encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img);
   aom_codec_control(&stream->decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img);
 
-#if CONFIG_HIGHBITDEPTH
   if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
       (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
     if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
@@ -1692,22 +1814,17 @@ static void test_decode(struct stream_state *stream,
       dec_img = dec_hbd_img;
     }
   }
-#endif
 
   ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");
   ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");
 
   if (!aom_compare_img(&enc_img, &dec_img)) {
     int y[4], u[4], v[4];
-#if CONFIG_HIGHBITDEPTH
     if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
       aom_find_mismatch_high(&enc_img, &dec_img, y, u, v);
     } else {
       aom_find_mismatch(&enc_img, &dec_img, y, u, v);
     }
-#else
-    aom_find_mismatch(&enc_img, &dec_img, y, u, v);
-#endif
     stream->decoder.err = 1;
     warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,
                           "Stream %d: Encode/decode mismatch on frame %d at"
@@ -1745,12 +1862,10 @@ static void print_time(const char *label, int64_t etl) {
 int main(int argc, const char **argv_) {
   int pass;
   aom_image_t raw;
-#if CONFIG_HIGHBITDEPTH
   aom_image_t raw_shift;
   int allocated_raw_shift = 0;
   int use_16bit_internal = 0;
   int input_shift = 0;
-#endif
   int frame_avail, got_data;
 
   struct AvxInputContext input;
@@ -1765,8 +1880,6 @@ int main(int argc, const char **argv_) {
   memset(&input, 0, sizeof(input));
   exec_name = argv_[0];
 
-  if (argc < 3) usage_exit();
-
   /* Setup default input stream settings */
   input.framerate.numerator = 30;
   input.framerate.denominator = 1;
@@ -1778,13 +1891,18 @@ int main(int argc, const char **argv_) {
    * codec.
    */
   argv = argv_dup(argc - 1, argv_ + 1);
-  parse_global_config(&global, argv);
+  parse_global_config(&global, &argc, &argv);
+
+#if CONFIG_FILEOPTIONS
+  if (argc < 2) usage_exit();
+#else
+  if (argc < 3) usage_exit();
+#endif
 
   switch (global.color_type) {
     case I420: input.fmt = AOM_IMG_FMT_I420; break;
     case I422: input.fmt = AOM_IMG_FMT_I422; break;
     case I444: input.fmt = AOM_IMG_FMT_I444; break;
-    case I440: input.fmt = AOM_IMG_FMT_I440; break;
     case YV12: input.fmt = AOM_IMG_FMT_YV12; break;
   }
 
@@ -1815,7 +1933,10 @@ int main(int argc, const char **argv_) {
   /* Handle non-option arguments */
   input.filename = argv[0];
 
-  if (!input.filename) usage_exit();
+  if (!input.filename) {
+    fprintf(stderr, "No input file specified!\n");
+    usage_exit();
+  }
 
   /* Decide if other chroma subsamplings than 4:2:0 are supported */
   if (global.codec->fourcc == AV1_FOURCC) input.only_i420 = 0;
@@ -1873,12 +1994,41 @@ int main(int argc, const char **argv_) {
            was selected. */
         switch (stream->config.cfg.g_profile) {
           case 0:
-            stream->config.cfg.g_profile = 1;
-            profile_updated = 1;
+            if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 ||
+                                         input.fmt == AOM_IMG_FMT_I44416)) {
+              if (!stream->config.cfg.monochrome) {
+                stream->config.cfg.g_profile = 1;
+                profile_updated = 1;
+              }
+            } else if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 ||
+                       input.fmt == AOM_IMG_FMT_I42216) {
+              stream->config.cfg.g_profile = 2;
+              profile_updated = 1;
+            }
+            break;
+          case 1:
+            if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 ||
+                input.fmt == AOM_IMG_FMT_I42216) {
+              stream->config.cfg.g_profile = 2;
+              profile_updated = 1;
+            } else if (input.bit_depth < 12 &&
+                       (input.fmt == AOM_IMG_FMT_I420 ||
+                        input.fmt == AOM_IMG_FMT_I42016)) {
+              stream->config.cfg.g_profile = 0;
+              profile_updated = 1;
+            }
             break;
           case 2:
-            stream->config.cfg.g_profile = 3;
-            profile_updated = 1;
+            if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 ||
+                                         input.fmt == AOM_IMG_FMT_I44416)) {
+              stream->config.cfg.g_profile = 1;
+              profile_updated = 1;
+            } else if (input.bit_depth < 12 &&
+                       (input.fmt == AOM_IMG_FMT_I420 ||
+                        input.fmt == AOM_IMG_FMT_I42016)) {
+              stream->config.cfg.g_profile = 0;
+              profile_updated = 1;
+            }
             break;
           default: break;
         }
@@ -1889,29 +2039,27 @@ int main(int argc, const char **argv_) {
           (unsigned int)stream->config.cfg.g_bit_depth) {
         stream->config.cfg.g_bit_depth = stream->config.cfg.g_input_bit_depth;
       }
-      if (stream->config.cfg.g_bit_depth > 8) {
+      if (stream->config.cfg.g_bit_depth > 10) {
         switch (stream->config.cfg.g_profile) {
           case 0:
-            stream->config.cfg.g_profile = 2;
-            profile_updated = 1;
-            break;
           case 1:
-            stream->config.cfg.g_profile = 3;
+            stream->config.cfg.g_profile = 2;
             profile_updated = 1;
             break;
           default: break;
         }
       }
-      if (stream->config.cfg.g_profile > 1) {
-        if (!CONFIG_HIGHBITDEPTH) fatal("Unsupported profile.");
+      if (stream->config.cfg.g_bit_depth > 8) {
         stream->config.use_16bit_internal = 1;
       }
       if (profile_updated && !global.quiet) {
         fprintf(stderr,
-                "Warning: automatically upgrading to profile %d to "
+                "Warning: automatically updating to profile %d to "
                 "match input format.\n",
                 stream->config.cfg.g_profile);
       }
+      /* Set limit */
+      stream->config.cfg.g_limit = global.limit;
     }
 
     FOREACH_STREAM(stream, streams) {
@@ -1935,9 +2083,8 @@ int main(int argc, const char **argv_) {
     FOREACH_STREAM(stream, streams) {
       if (stream->config.write_webm) {
         stream->config.write_webm = 0;
-        warn(
-            "aomenc was compiled without WebM container support."
-            "Producing IVF output");
+        stream->config.write_ivf = 0;
+        warn("aomenc compiled w/o WebM support. Writing OBU stream.");
       }
     }
 #endif
@@ -1948,12 +2095,11 @@ int main(int argc, const char **argv_) {
     if (!global.have_framerate) {
       global.framerate.num = input.framerate.numerator;
       global.framerate.den = input.framerate.denominator;
-      FOREACH_STREAM(stream, streams) {
-        stream->config.cfg.g_timebase.den = global.framerate.num;
-        stream->config.cfg.g_timebase.num = global.framerate.den;
-      }
     }
-
+    FOREACH_STREAM(stream, streams) {
+      stream->config.cfg.g_timebase.den = global.framerate.num;
+      stream->config.cfg.g_timebase.num = global.framerate.den;
+    }
     /* Show configuration */
     if (global.verbose && pass == 0) {
       FOREACH_STREAM(stream, streams) {
@@ -1981,8 +2127,6 @@ int main(int argc, const char **argv_) {
       open_output_file(stream, &global, &input.pixel_aspect_ratio);
     }
     FOREACH_STREAM(stream, streams) { initialize_encoder(stream, &global); }
-
-#if CONFIG_HIGHBITDEPTH
     if (strcmp(global.codec->name, "av1") == 0 ||
         strcmp(global.codec->name, "av1") == 0) {
       // Check to see if at least one stream uses 16 bit internal.
@@ -1992,15 +2136,10 @@ int main(int argc, const char **argv_) {
         if (stream->config.use_16bit_internal) {
           use_16bit_internal = 1;
         }
-        if (stream->config.cfg.g_profile == 0) {
-          input_shift = 0;
-        } else {
-          input_shift = (int)stream->config.cfg.g_bit_depth -
-                        stream->config.cfg.g_input_bit_depth;
-        }
+        input_shift = (int)stream->config.cfg.g_bit_depth -
+                      stream->config.cfg.g_input_bit_depth;
       };
     }
-#endif
 
     frame_avail = 1;
     got_data = 0;
@@ -2037,7 +2176,6 @@ int main(int argc, const char **argv_) {
       }
 
       if (frames_in > global.skip_frames) {
-#if CONFIG_HIGHBITDEPTH
         aom_image_t *frame_to_encode;
         if (input_shift || (use_16bit_internal && input.bit_depth == 8)) {
           assert(use_16bit_internal);
@@ -2070,12 +2208,6 @@ int main(int argc, const char **argv_) {
                          frames_in);
           }
         }
-#else
-        aom_usec_timer_start(&timer);
-        FOREACH_STREAM(stream, streams) {
-          encode_frame(stream, &global, frame_avail ? &raw : NULL, frames_in);
-        }
-#endif
         aom_usec_timer_mark(&timer);
         cx_time += aom_usec_timer_elapsed(&timer);
 
@@ -2128,18 +2260,16 @@ int main(int argc, const char **argv_) {
 
     if (!global.quiet) {
       FOREACH_STREAM(stream, streams) {
-        fprintf(stderr, "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64
-                        "b/f %7" PRId64
-                        "b/s"
-                        " %7" PRId64 " %s (%.2f fps)\033[K\n",
+        const int64_t bpf =
+            seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0;
+        const int64_t bps = bpf * global.framerate.num / global.framerate.den;
+        fprintf(stderr,
+                "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64
+                "b/f %7" PRId64
+                "b/s"
+                " %7" PRId64 " %s (%.2f fps)\033[K\n",
                 pass + 1, global.passes, frames_in, stream->frames_out,
-                (int64_t)stream->nbytes,
-                seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0,
-                seen_frames
-                    ? (int64_t)stream->nbytes * 8 *
-                          (int64_t)global.framerate.num / global.framerate.den /
-                          seen_frames
-                    : 0,
+                (int64_t)stream->nbytes, bpf, bps,
                 stream->cx_time > 9999999 ? stream->cx_time / 1000
                                           : stream->cx_time,
                 stream->cx_time > 9999999 ? "ms" : "us",
@@ -2150,10 +2280,16 @@ int main(int argc, const char **argv_) {
     if (global.show_psnr) {
       if (global.codec->fourcc == AV1_FOURCC) {
         FOREACH_STREAM(stream, streams) {
-          show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1);
+          int64_t bps = 0;
+          if (stream->psnr_count && seen_frames && global.framerate.den) {
+            bps = (int64_t)stream->nbytes * 8 * (int64_t)global.framerate.num /
+                  global.framerate.den / seen_frames;
+          }
+          show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1,
+                    bps);
         }
       } else {
-        FOREACH_STREAM(stream, streams) { show_psnr(stream, 255.0); }
+        FOREACH_STREAM(stream, streams) { show_psnr(stream, 255.0, 0); }
       }
     }
 
@@ -2217,9 +2353,7 @@ int main(int argc, const char **argv_) {
   }
 #endif
 
-#if CONFIG_HIGHBITDEPTH
   if (allocated_raw_shift) aom_img_free(&raw_shift);
-#endif
   aom_img_free(&raw);
   free(argv);
   free(streams);
diff --git a/third_party/aom/aomenc.h b/third_party/aom/apps/aomenc.h
index 248e58356..976079d74 100644
--- a/third_party/aom/aomenc.h
+++ b/third_party/aom/apps/aomenc.h
@@ -27,7 +27,6 @@ typedef enum {
   I420,  // 4:2:0 8+ bit-depth
   I422,  // 4:2:2 8+ bit-depth
   I444,  // 4:4:4 8+ bit-depth
-  I440,  // 4:4:0 8+ bit-depth
   YV12,  // 4:2:0 with uv flipped, only 8-bit depth
 } ColorInputType;
 
@@ -39,7 +38,6 @@ struct AvxEncoderConfig {
   int passes;
   int pass;
   int usage;
-  int deadline;
   ColorInputType color_type;
   int quiet;
   int verbose;
diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
index 140eec815..1c7f937e1 100644
--- a/third_party/aom/av1/av1.cmake
+++ b/third_party/aom/av1/av1.cmake
@@ -1,530 +1,326 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_AV1_AV1_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AV1_AV1_CMAKE_)
+  return()
+endif() # AOM_AV1_AV1_CMAKE_
 set(AOM_AV1_AV1_CMAKE_ 1)
 
-set(AOM_AV1_COMMON_SOURCES
-    "${AOM_ROOT}/av1/av1_iface_common.h"
-    "${AOM_ROOT}/av1/common/alloccommon.c"
-    "${AOM_ROOT}/av1/common/alloccommon.h"
-    # TODO(tomfinegan): Foward transform belongs in encoder.
-    "${AOM_ROOT}/av1/common/av1_fwd_txfm1d.c"
-    "${AOM_ROOT}/av1/common/av1_fwd_txfm1d.h"
-    "${AOM_ROOT}/av1/common/av1_fwd_txfm2d.c"
-    "${AOM_ROOT}/av1/common/av1_fwd_txfm1d_cfg.h"
-    "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c"
-    "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h"
-    "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c"
-    "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h"
-    "${AOM_ROOT}/av1/common/av1_loopfilter.c"
-    "${AOM_ROOT}/av1/common/av1_loopfilter.h"
-    "${AOM_ROOT}/av1/common/av1_txfm.h"
-    "${AOM_ROOT}/av1/common/blockd.c"
-    "${AOM_ROOT}/av1/common/blockd.h"
-    "${AOM_ROOT}/av1/common/common.h"
-    "${AOM_ROOT}/av1/common/common_data.h"
-    "${AOM_ROOT}/av1/common/convolve.c"
-    "${AOM_ROOT}/av1/common/convolve.h"
-    "${AOM_ROOT}/av1/common/daala_tx.c"
-    "${AOM_ROOT}/av1/common/daala_tx.h"
-    "${AOM_ROOT}/av1/common/debugmodes.c"
-    "${AOM_ROOT}/av1/common/entropy.c"
-    "${AOM_ROOT}/av1/common/entropy.h"
-    "${AOM_ROOT}/av1/common/entropymode.c"
-    "${AOM_ROOT}/av1/common/entropymode.h"
-    "${AOM_ROOT}/av1/common/entropymv.c"
-    "${AOM_ROOT}/av1/common/entropymv.h"
-    "${AOM_ROOT}/av1/common/enums.h"
-    "${AOM_ROOT}/av1/common/filter.c"
-    "${AOM_ROOT}/av1/common/filter.h"
-    "${AOM_ROOT}/av1/common/frame_buffers.c"
-    "${AOM_ROOT}/av1/common/frame_buffers.h"
-    "${AOM_ROOT}/av1/common/idct.c"
-    "${AOM_ROOT}/av1/common/idct.h"
-    "${AOM_ROOT}/av1/common/mv.h"
-    "${AOM_ROOT}/av1/common/mvref_common.c"
-    "${AOM_ROOT}/av1/common/mvref_common.h"
-    "${AOM_ROOT}/av1/common/odintrin.c"
-    "${AOM_ROOT}/av1/common/odintrin.h"
-    "${AOM_ROOT}/av1/common/onyxc_int.h"
-    "${AOM_ROOT}/av1/common/pred_common.c"
-    "${AOM_ROOT}/av1/common/pred_common.h"
-    "${AOM_ROOT}/av1/common/quant_common.c"
-    "${AOM_ROOT}/av1/common/quant_common.h"
-    "${AOM_ROOT}/av1/common/reconinter.c"
-    "${AOM_ROOT}/av1/common/reconinter.h"
-    "${AOM_ROOT}/av1/common/reconintra.c"
-    "${AOM_ROOT}/av1/common/reconintra.h"
-    "${AOM_ROOT}/av1/common/resize.c"
-    "${AOM_ROOT}/av1/common/resize.h"
-    "${AOM_ROOT}/av1/common/scale.c"
-    "${AOM_ROOT}/av1/common/scale.h"
-    "${AOM_ROOT}/av1/common/scan.c"
-    "${AOM_ROOT}/av1/common/scan.h"
-    "${AOM_ROOT}/av1/common/seg_common.c"
-    "${AOM_ROOT}/av1/common/seg_common.h"
-    "${AOM_ROOT}/av1/common/thread_common.c"
-    "${AOM_ROOT}/av1/common/thread_common.h"
-    "${AOM_ROOT}/av1/common/tile_common.c"
-    "${AOM_ROOT}/av1/common/tile_common.h")
-
-set(AOM_AV1_DECODER_SOURCES
-    "${AOM_ROOT}/av1/av1_dx_iface.c"
-    "${AOM_ROOT}/av1/decoder/decodeframe.c"
-    "${AOM_ROOT}/av1/decoder/decodeframe.h"
-    "${AOM_ROOT}/av1/decoder/decodemv.c"
-    "${AOM_ROOT}/av1/decoder/decodemv.h"
-    "${AOM_ROOT}/av1/decoder/decoder.c"
-    "${AOM_ROOT}/av1/decoder/decoder.h"
-    "${AOM_ROOT}/av1/decoder/detokenize.c"
-    "${AOM_ROOT}/av1/decoder/detokenize.h"
-    "${AOM_ROOT}/av1/decoder/dsubexp.c"
-    "${AOM_ROOT}/av1/decoder/dsubexp.h"
-    "${AOM_ROOT}/av1/decoder/dthread.c"
-    "${AOM_ROOT}/av1/decoder/dthread.h"
-    "${AOM_ROOT}/av1/decoder/symbolrate.h")
-
-set(AOM_AV1_ENCODER_SOURCES
-    "${AOM_ROOT}/av1/av1_cx_iface.c"
-    "${AOM_ROOT}/av1/encoder/aq_complexity.c"
-    "${AOM_ROOT}/av1/encoder/aq_complexity.h"
-    "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
-    "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
-    "${AOM_ROOT}/av1/encoder/aq_variance.c"
-    "${AOM_ROOT}/av1/encoder/aq_variance.h"
-    "${AOM_ROOT}/av1/encoder/av1_quantize.c"
-    "${AOM_ROOT}/av1/encoder/av1_quantize.h"
-    "${AOM_ROOT}/av1/encoder/bitstream.c"
-    "${AOM_ROOT}/av1/encoder/bitstream.h"
-    "${AOM_ROOT}/av1/encoder/block.h"
-    "${AOM_ROOT}/av1/encoder/context_tree.c"
-    "${AOM_ROOT}/av1/encoder/context_tree.h"
-    "${AOM_ROOT}/av1/encoder/cost.c"
-    "${AOM_ROOT}/av1/encoder/cost.h"
-    "${AOM_ROOT}/av1/encoder/dct.c"
-    "${AOM_ROOT}/av1/encoder/encodeframe.c"
-    "${AOM_ROOT}/av1/encoder/encodeframe.h"
-    "${AOM_ROOT}/av1/encoder/encodemb.c"
-    "${AOM_ROOT}/av1/encoder/encodemb.h"
-    "${AOM_ROOT}/av1/encoder/encodemv.c"
-    "${AOM_ROOT}/av1/encoder/encodemv.h"
-    "${AOM_ROOT}/av1/encoder/encoder.c"
-    "${AOM_ROOT}/av1/encoder/encoder.h"
-    "${AOM_ROOT}/av1/encoder/ethread.c"
-    "${AOM_ROOT}/av1/encoder/ethread.h"
-    "${AOM_ROOT}/av1/encoder/extend.c"
-    "${AOM_ROOT}/av1/encoder/extend.h"
-    "${AOM_ROOT}/av1/encoder/firstpass.c"
-    "${AOM_ROOT}/av1/encoder/firstpass.h"
-    "${AOM_ROOT}/av1/encoder/hash.c"
-    "${AOM_ROOT}/av1/encoder/hash.h"
-    "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
-    "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
-    "${AOM_ROOT}/av1/encoder/lookahead.c"
-    "${AOM_ROOT}/av1/encoder/lookahead.h"
-    "${AOM_ROOT}/av1/encoder/mbgraph.c"
-    "${AOM_ROOT}/av1/encoder/mbgraph.h"
-    "${AOM_ROOT}/av1/encoder/mcomp.c"
-    "${AOM_ROOT}/av1/encoder/mcomp.h"
-    "${AOM_ROOT}/av1/encoder/palette.c"
-    "${AOM_ROOT}/av1/encoder/palette.h"
-    "${AOM_ROOT}/av1/encoder/picklpf.c"
-    "${AOM_ROOT}/av1/encoder/picklpf.h"
-    "${AOM_ROOT}/av1/encoder/ratectrl.c"
-    "${AOM_ROOT}/av1/encoder/ratectrl.h"
-    "${AOM_ROOT}/av1/encoder/rd.c"
-    "${AOM_ROOT}/av1/encoder/rd.h"
-    "${AOM_ROOT}/av1/encoder/rdopt.c"
-    "${AOM_ROOT}/av1/encoder/rdopt.h"
-    "${AOM_ROOT}/av1/encoder/segmentation.c"
-    "${AOM_ROOT}/av1/encoder/segmentation.h"
-    "${AOM_ROOT}/av1/encoder/speed_features.c"
-    "${AOM_ROOT}/av1/encoder/speed_features.h"
-    "${AOM_ROOT}/av1/encoder/subexp.c"
-    "${AOM_ROOT}/av1/encoder/subexp.h"
-    "${AOM_ROOT}/av1/encoder/temporal_filter.c"
-    "${AOM_ROOT}/av1/encoder/temporal_filter.h"
-    "${AOM_ROOT}/av1/encoder/tokenize.c"
-    "${AOM_ROOT}/av1/encoder/tokenize.h"
-    "${AOM_ROOT}/av1/encoder/treewriter.c"
-    "${AOM_ROOT}/av1/encoder/treewriter.h")
-
-set(AOM_AV1_COMMON_INTRIN_SSE2
-    "${AOM_ROOT}/av1/common/x86/idct_intrin_sse2.c")
-
-set(AOM_AV1_COMMON_INTRIN_SSSE3
-    "${AOM_ROOT}/av1/common/x86/av1_convolve_ssse3.c")
-
-set(AOM_AV1_COMMON_INTRIN_SSE4_1
-    "${AOM_ROOT}/av1/common/x86/av1_fwd_txfm1d_sse4.c"
-    "${AOM_ROOT}/av1/common/x86/av1_fwd_txfm2d_sse4.c"
-    "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c")
-
-set(AOM_AV1_COMMON_INTRIN_AVX2
-    "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
-    "${AOM_ROOT}/av1/common/x86/hybrid_inv_txfm_avx2.c")
-
-set(AOM_AV1_COMMON_INTRIN_MSA
-    "${AOM_ROOT}/av1/common/mips/msa/av1_idct16x16_msa.c"
-    "${AOM_ROOT}/av1/common/mips/msa/av1_idct4x4_msa.c"
-    "${AOM_ROOT}/av1/common/mips/msa/av1_idct8x8_msa.c")
-
-set(AOM_AV1_ENCODER_ASM_SSE2
-    "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm"
-    "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm"
-    "${AOM_ROOT}/av1/encoder/x86/temporal_filter_apply_sse2.asm")
-
-set(AOM_AV1_ENCODER_INTRIN_SSE2
-    "${AOM_ROOT}/av1/encoder/x86/dct_intrin_sse2.c"
-    "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
-    "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c")
-
-set(AOM_AV1_ENCODER_ASM_SSSE3_X86_64
-    "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm")
-
-set(AOM_AV1_ENCODER_INTRIN_SSE4_1
-    ${AOM_AV1_ENCODER_INTRIN_SSE4_1}
-    "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c"
-    "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c")
-
-set(AOM_AV1_ENCODER_INTRIN_AVX2
-    "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
-    "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
-    "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
-    "${AOM_ROOT}/av1/encoder/x86/hybrid_fwd_txfm_avx2.c")
-
-set(AOM_AV1_ENCODER_INTRIN_NEON
-    "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
-
-set(AOM_AV1_ENCODER_INTRIN_MSA
-    "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c"
-    "${AOM_ROOT}/av1/encoder/mips/msa/fdct16x16_msa.c"
-    "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c"
-    "${AOM_ROOT}/av1/encoder/mips/msa/fdct8x8_msa.c"
-    "${AOM_ROOT}/av1/encoder/mips/msa/fdct_msa.h"
-    "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c")
-
-if (CONFIG_HIGHBITDEPTH)
-  set(AOM_AV1_COMMON_INTRIN_SSE4_1
-      ${AOM_AV1_COMMON_INTRIN_SSE4_1}
-      "${AOM_ROOT}/av1/common/x86/av1_highbd_convolve_sse4.c")
-else ()
-  set(AOM_AV1_COMMON_INTRIN_NEON
-      ${AOM_AV1_COMMON_INTRIN_NEON}
-      "${AOM_ROOT}/av1/common/arm/neon/iht4x4_add_neon.c"
-      "${AOM_ROOT}/av1/common/arm/neon/iht8x8_add_neon.c")
-
-  set(AOM_AV1_ENCODER_INTRIN_NEON
-      ${AOM_AV1_ENCODER_INTRIN_NEON}
-      "${AOM_ROOT}/av1/encoder/arm/neon/error_neon.c")
-endif ()
-
-if (CONFIG_CDEF)
-  set(AOM_AV1_COMMON_SOURCES
-      ${AOM_AV1_COMMON_SOURCES}
-      "${AOM_ROOT}/av1/common/cdef.c"
-      "${AOM_ROOT}/av1/common/cdef.h"
-      "${AOM_ROOT}/av1/common/cdef_block.c"
-      "${AOM_ROOT}/av1/common/cdef_block.h")
-
-  set(AOM_AV1_ENCODER_SOURCES
-      ${AOM_AV1_ENCODER_SOURCES}
-      "${AOM_ROOT}/av1/encoder/pickcdef.c")
-
-  set(AOM_AV1_COMMON_INTRIN_SSE2
-      ${AOM_AV1_COMMON_INTRIN_SSE2}
-      "${AOM_ROOT}/av1/common/cdef_block_sse2.c")
-
-  set(AOM_AV1_COMMON_INTRIN_SSSE3
-      ${AOM_AV1_COMMON_INTRIN_SSSE3}
-      "${AOM_ROOT}/av1/common/cdef_block_ssse3.c")
-
-  set(AOM_AV1_COMMON_INTRIN_SSE4_1
-      ${AOM_AV1_COMMON_INTRIN_SSE4_1}
-      "${AOM_ROOT}/av1/common/cdef_block_sse4.c")
-
-  set(AOM_AV1_COMMON_INTRIN_AVX2
-      ${AOM_AV1_COMMON_INTRIN_AVX2}
-      "${AOM_ROOT}/av1/common/cdef_block_avx2.c")
-
-  set(AOM_AV1_COMMON_INTRIN_NEON
-      ${AOM_AV1_COMMON_INTRIN_NEON}
-      "${AOM_ROOT}/av1/common/cdef_block_neon.c")
-
-  if (NOT CONFIG_CDEF_SINGLEPASS)
-    set(AOM_AV1_COMMON_SOURCES
-        ${AOM_AV1_COMMON_SOURCES}
-        "${AOM_ROOT}/av1/common/clpf.c"
-        "${AOM_ROOT}/av1/common/clpf_simd.h"
-        "${AOM_ROOT}/av1/common/cdef_block_simd.h")
-
-    set(AOM_AV1_COMMON_INTRIN_SSE2
-        ${AOM_AV1_COMMON_INTRIN_SSE2}
-        "${AOM_ROOT}/av1/common/clpf_sse2.c")
-
-    set(AOM_AV1_COMMON_INTRIN_SSSE3
-        ${AOM_AV1_COMMON_INTRIN_SSSE3}
-        "${AOM_ROOT}/av1/common/clpf_ssse3.c")
-
-    set(AOM_AV1_COMMON_INTRIN_SSE4_1
-        ${AOM_AV1_COMMON_INTRIN_SSE4_1}
-        "${AOM_ROOT}/av1/common/clpf_sse4.c")
-
-    set(AOM_AV1_COMMON_INTRIN_NEON
-        ${AOM_AV1_COMMON_INTRIN_NEON}
-        "${AOM_ROOT}/av1/common/clpf_neon.c")
-  endif ()
-endif ()
-
-if (CONFIG_CONVOLVE_ROUND)
-  set(AOM_AV1_COMMON_INTRIN_SSE2
-      ${AOM_AV1_COMMON_INTRIN_SSE2}
-      "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c")
-  if (CONFIG_HIGHBITDEPTH)
-    set(AOM_AV1_COMMON_INTRIN_SSSE3
-        ${AOM_AV1_COMMON_INTRIN_SSSE3}
-        "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c")
-  endif ()
-
-  if(NOT CONFIG_COMPOUND_ROUND)
-    set(AOM_AV1_COMMON_INTRIN_SSE4_1
-        ${AOM_AV1_COMMON_INTRIN_SSE4_1}
-        "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c")
-  endif()
-
-  set(AOM_AV1_COMMON_INTRIN_AVX2
-      ${AOM_AV1_COMMON_INTRIN_AVX2}
-      "${AOM_ROOT}/av1/common/x86/convolve_avx2.c")
-endif ()
-
-  set(AOM_AV1_ENCODER_SOURCES
-      ${AOM_AV1_ENCODER_SOURCES}
-      "${AOM_ROOT}/av1/encoder/wedge_utils.c")
-
-  set(AOM_AV1_ENCODER_INTRIN_SSE2
-      ${AOM_AV1_ENCODER_INTRIN_SSE2}
-      "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
-
-if (CONFIG_FILTER_INTRA)
-  set(AOM_AV1_COMMON_INTRIN_SSE4_1
-      ${AOM_AV1_COMMON_INTRIN_SSE4_1}
-      "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c")
-endif ()
-
-if (CONFIG_ACCOUNTING)
-  set(AOM_AV1_DECODER_SOURCES
-      ${AOM_AV1_DECODER_SOURCES}
-      "${AOM_ROOT}/av1/decoder/accounting.c"
-      "${AOM_ROOT}/av1/decoder/accounting.h")
-endif ()
-
-if (CONFIG_BGSPRITE)
-  set(AOM_AV1_ENCODER_SOURCES
-      ${AOM_AV1_ENCODER_SOURCES}
-      "${AOM_ROOT}/av1/encoder/bgsprite.c"
-      "${AOM_ROOT}/av1/encoder/bgsprite.h")
-endif ()
-
-if (CONFIG_GLOBAL_MOTION)
-  set(AOM_AV1_ENCODER_SOURCES
-      ${AOM_AV1_ENCODER_SOURCES}
-      "${AOM_ROOT}/av1/encoder/corner_detect.c"
-      "${AOM_ROOT}/av1/encoder/corner_detect.h"
-      "${AOM_ROOT}/av1/encoder/corner_match.c"
-      "${AOM_ROOT}/av1/encoder/corner_match.h"
-      "${AOM_ROOT}/av1/encoder/global_motion.c"
-      "${AOM_ROOT}/av1/encoder/global_motion.h"
-      "${AOM_ROOT}/av1/encoder/ransac.c"
-      "${AOM_ROOT}/av1/encoder/ransac.h"
-      "${AOM_ROOT}/third_party/fastfeat/fast_9.c"
-      "${AOM_ROOT}/third_party/fastfeat/fast.c"
-      "${AOM_ROOT}/third_party/fastfeat/fast.h"
-      "${AOM_ROOT}/third_party/fastfeat/nonmax.c")
-
-  set(AOM_AV1_ENCODER_INTRIN_SSE4_1
-      ${AOM_AV1_ENCODER_INTRIN_SSE4_1}
-      "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c")
-endif ()
-
-if (CONFIG_INSPECTION)
-  set(AOM_AV1_DECODER_SOURCES
-      ${AOM_AV1_DECODER_SOURCES}
-      "${AOM_ROOT}/av1/decoder/inspection.c"
-      "${AOM_ROOT}/av1/decoder/inspection.h")
-endif ()
-
-if (CONFIG_INTERNAL_STATS)
-  set(AOM_AV1_ENCODER_SOURCES
-      ${AOM_AV1_ENCODER_SOURCES}
-      "${AOM_ROOT}/av1/encoder/blockiness.c")
-endif ()
-
-if (CONFIG_LV_MAP)
-  set(AOM_AV1_COMMON_SOURCES
-      ${AOM_AV1_COMMON_SOURCES}
-      "${AOM_ROOT}/av1/common/txb_common.c"
-      "${AOM_ROOT}/av1/common/txb_common.h")
-
-  set(AOM_AV1_DECODER_SOURCES
-      ${AOM_AV1_DECODER_SOURCES}
-      "${AOM_ROOT}/av1/decoder/decodetxb.c"
-      "${AOM_ROOT}/av1/decoder/decodetxb.h")
-
-  set(AOM_AV1_ENCODER_SOURCES
-      ${AOM_AV1_ENCODER_SOURCES}
-      "${AOM_ROOT}/av1/encoder/encodetxb.c"
-      "${AOM_ROOT}/av1/encoder/encodetxb.h")
-endif ()
-
-if (CONFIG_CFL)
-  set(AOM_AV1_COMMON_SOURCES
-      ${AOM_AV1_COMMON_SOURCES}
-    "${AOM_ROOT}/av1/common/cfl.c"
-    "${AOM_ROOT}/av1/common/cfl.h")
-endif ()
-
-if (CONFIG_LOOP_RESTORATION)
-  set(AOM_AV1_COMMON_SOURCES
-      ${AOM_AV1_COMMON_SOURCES}
-      "${AOM_ROOT}/av1/common/restoration.c"
-      "${AOM_ROOT}/av1/common/restoration.h")
-
-  set(AOM_AV1_COMMON_INTRIN_SSE4_1
-      ${AOM_AV1_COMMON_INTRIN_SSE4_1}
-      "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c")
-
-  set(AOM_AV1_ENCODER_SOURCES
-      ${AOM_AV1_ENCODER_SOURCES}
-      "${AOM_ROOT}/av1/encoder/pickrst.c"
-      "${AOM_ROOT}/av1/encoder/pickrst.h")
-endif ()
-
-if (CONFIG_INTRA_EDGE)
-  set(AOM_AV1_COMMON_INTRIN_SSE4_1
-      ${AOM_AV1_COMMON_INTRIN_SSE4_1}
-      "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c")
-endif ()
-
-if (CONFIG_NCOBMC_ADAPT_WEIGHT)
-  set(AOM_AV1_COMMON_SOURCES
-      ${AOM_AV1_COMMON_SOURCES}
-      "${AOM_ROOT}/av1/common/ncobmc_kernels.c"
-      "${AOM_ROOT}/av1/common/ncobmc_kernels.h")
-endif ()
-
-if (CONFIG_PVQ)
-  set(AOM_AV1_COMMON_SOURCES
-      ${AOM_AV1_COMMON_SOURCES}
-      "${AOM_ROOT}/av1/common/laplace_tables.c"
-      "${AOM_ROOT}/av1/common/pvq.c"
-      "${AOM_ROOT}/av1/common/pvq.h"
-      "${AOM_ROOT}/av1/common/pvq_state.c"
-      "${AOM_ROOT}/av1/common/pvq_state.h"
-      "${AOM_ROOT}/av1/common/partition.c"
-      "${AOM_ROOT}/av1/common/partition.h"
-      "${AOM_ROOT}/av1/common/generic_code.c"
-      "${AOM_ROOT}/av1/common/generic_code.h"
-      "${AOM_ROOT}/av1/common/zigzag4.c"
-      "${AOM_ROOT}/av1/common/zigzag8.c"
-      "${AOM_ROOT}/av1/common/zigzag16.c"
-      "${AOM_ROOT}/av1/common/zigzag32.c")
-
-    set(AOM_AV1_DECODER_SOURCES
-        ${AOM_AV1_DECODER_SOURCES}
-        "${AOM_ROOT}/av1/decoder/decint.h"
-        "${AOM_ROOT}/av1/decoder/pvq_decoder.c"
-        "${AOM_ROOT}/av1/decoder/pvq_decoder.h"
-        "${AOM_ROOT}/av1/decoder/generic_decoder.c"
-        "${AOM_ROOT}/av1/decoder/laplace_decoder.c")
-
-    set(AOM_AV1_ENCODER_SOURCES
-        ${AOM_AV1_ENCODER_SOURCES}
-        "${AOM_ROOT}/av1/encoder/daala_compat_enc.c"
-        "${AOM_ROOT}/av1/encoder/encint.h"
-        "${AOM_ROOT}/av1/encoder/pvq_encoder.c"
-        "${AOM_ROOT}/av1/encoder/pvq_encoder.h"
-        "${AOM_ROOT}/av1/encoder/generic_encoder.c"
-        "${AOM_ROOT}/av1/encoder/laplace_encoder.c")
-
-    set(AOM_AV1_COMMON_INTRIN_SSE4_1
-        ${AOM_AV1_COMMON_INTRIN_SSE4_1}
-        "${AOM_ROOT}/av1/common/x86/pvq_sse4.c"
-        "${AOM_ROOT}/av1/common/x86/pvq_sse4.h")
-
-    if (NOT CONFIG_AV1_ENCODER)
-      # TODO(tomfinegan): These should probably be in av1/common, and in a
-      # common source list. For now this mirrors the original build system.
-      set(AOM_AV1_DECODER_SOURCES
-          ${AOM_AV1_DECODER_SOURCES}
-          "${AOM_ROOT}/av1/encoder/dct.c"
-          "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
-          "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h")
-
-      set(AOM_AV1_DECODER_ASM_SSE2
-          ${AOM_AV1_DECODER_ASM_SSE2}
-          "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm")
-
-      set(AOM_AV1_DECODER_INTRIN_SSE2
-          ${AOM_AV1_DECODER_INTRIN_SSE2}
-          "${AOM_ROOT}/av1/encoder/x86/dct_intrin_sse2.c")
-
-    endif ()
-endif ()
-
-if (CONFIG_WARPED_MOTION OR CONFIG_GLOBAL_MOTION)
-  set(AOM_AV1_COMMON_SOURCES
-      ${AOM_AV1_COMMON_SOURCES}
-      "${AOM_ROOT}/av1/common/warped_motion.c"
-      "${AOM_ROOT}/av1/common/warped_motion.h")
-
-  set(AOM_AV1_COMMON_INTRIN_SSE2
-      ${AOM_AV1_COMMON_INTRIN_SSE2}
-      "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c")
-
-  set(AOM_AV1_COMMON_INTRIN_SSSE3
-      ${AOM_AV1_COMMON_INTRIN_SSSE3}
-      "${AOM_ROOT}/av1/common/x86/warp_plane_ssse3.c")
-
-  if (CONFIG_HIGHBITDEPTH)
-    set(AOM_AV1_COMMON_INTRIN_SSSE3
-        ${AOM_AV1_COMMON_INTRIN_SSSE3}
-        "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_ssse3.c")
-  endif ()
-endif ()
-
-if (CONFIG_HASH_ME)
-  set(AOM_AV1_ENCODER_SOURCES
-      ${AOM_AV1_ENCODER_SOURCES}
-      "${AOM_ROOT}/av1/encoder/hash_motion.h"
-      "${AOM_ROOT}/av1/encoder/hash_motion.c"
-      "${AOM_ROOT}/third_party/vector/vector.h"
-      "${AOM_ROOT}/third_party/vector/vector.c")
-endif ()
-
-if (CONFIG_Q_ADAPT_PROBS)
-  set(AOM_AV1_COMMON_SOURCES
-      ${AOM_AV1_COMMON_SOURCES}
-      "${AOM_ROOT}/av1/common/token_cdfs.h")
-endif ()
-
-if (CONFIG_XIPHRC)
-  set(AOM_AV1_ENCODER_SOURCES
-      ${AOM_AV1_ENCODER_SOURCES}
-      "${AOM_ROOT}/av1/encoder/ratectrl_xiph.c"
-      "${AOM_ROOT}/av1/encoder/ratectrl_xiph.h")
-endif ()
+list(APPEND AOM_AV1_COMMON_SOURCES
+            "${AOM_ROOT}/av1/av1_iface_common.h"
+            "${AOM_ROOT}/av1/common/alloccommon.c"
+            "${AOM_ROOT}/av1/common/alloccommon.h"
+            "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c"
+            "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h"
+            "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h"
+            "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c"
+            "${AOM_ROOT}/av1/common/av1_loopfilter.c"
+            "${AOM_ROOT}/av1/common/av1_loopfilter.h"
+            "${AOM_ROOT}/av1/common/av1_txfm.c"
+            "${AOM_ROOT}/av1/common/av1_txfm.h"
+            "${AOM_ROOT}/av1/common/blockd.c"
+            "${AOM_ROOT}/av1/common/blockd.h"
+            "${AOM_ROOT}/av1/common/cdef.c"
+            "${AOM_ROOT}/av1/common/cdef.h"
+            "${AOM_ROOT}/av1/common/cdef_block.c"
+            "${AOM_ROOT}/av1/common/cdef_block.h"
+            "${AOM_ROOT}/av1/common/cfl.c"
+            "${AOM_ROOT}/av1/common/cfl.h"
+            "${AOM_ROOT}/av1/common/common.h"
+            "${AOM_ROOT}/av1/common/common_data.h"
+            "${AOM_ROOT}/av1/common/convolve.c"
+            "${AOM_ROOT}/av1/common/convolve.h"
+            "${AOM_ROOT}/av1/common/debugmodes.c"
+            "${AOM_ROOT}/av1/common/entropy.c"
+            "${AOM_ROOT}/av1/common/entropy.h"
+            "${AOM_ROOT}/av1/common/entropymode.c"
+            "${AOM_ROOT}/av1/common/entropymode.h"
+            "${AOM_ROOT}/av1/common/entropymv.c"
+            "${AOM_ROOT}/av1/common/entropymv.h"
+            "${AOM_ROOT}/av1/common/enums.h"
+            "${AOM_ROOT}/av1/common/filter.c"
+            "${AOM_ROOT}/av1/common/filter.h"
+            "${AOM_ROOT}/av1/common/frame_buffers.c"
+            "${AOM_ROOT}/av1/common/frame_buffers.h"
+            "${AOM_ROOT}/av1/common/idct.c"
+            "${AOM_ROOT}/av1/common/idct.h"
+            "${AOM_ROOT}/av1/common/mv.h"
+            "${AOM_ROOT}/av1/common/mvref_common.c"
+            "${AOM_ROOT}/av1/common/mvref_common.h"
+            "${AOM_ROOT}/av1/common/odintrin.c"
+            "${AOM_ROOT}/av1/common/odintrin.h"
+            "${AOM_ROOT}/av1/common/onyxc_int.h"
+            "${AOM_ROOT}/av1/common/pred_common.c"
+            "${AOM_ROOT}/av1/common/pred_common.h"
+            "${AOM_ROOT}/av1/common/quant_common.c"
+            "${AOM_ROOT}/av1/common/quant_common.h"
+            "${AOM_ROOT}/av1/common/reconinter.c"
+            "${AOM_ROOT}/av1/common/reconinter.h"
+            "${AOM_ROOT}/av1/common/reconintra.c"
+            "${AOM_ROOT}/av1/common/reconintra.h"
+            "${AOM_ROOT}/av1/common/resize.c"
+            "${AOM_ROOT}/av1/common/resize.h"
+            "${AOM_ROOT}/av1/common/restoration.c"
+            "${AOM_ROOT}/av1/common/restoration.h"
+            "${AOM_ROOT}/av1/common/scale.c"
+            "${AOM_ROOT}/av1/common/scale.h"
+            "${AOM_ROOT}/av1/common/scan.c"
+            "${AOM_ROOT}/av1/common/scan.h"
+            "${AOM_ROOT}/av1/common/seg_common.c"
+            "${AOM_ROOT}/av1/common/seg_common.h"
+            "${AOM_ROOT}/av1/common/thread_common.c"
+            "${AOM_ROOT}/av1/common/thread_common.h"
+            "${AOM_ROOT}/av1/common/tile_common.c"
+            "${AOM_ROOT}/av1/common/tile_common.h"
+            "${AOM_ROOT}/av1/common/timing.h"
+            "${AOM_ROOT}/av1/common/timing.c"
+            "${AOM_ROOT}/av1/common/token_cdfs.h"
+            "${AOM_ROOT}/av1/common/txb_common.c"
+            "${AOM_ROOT}/av1/common/txb_common.h"
+            "${AOM_ROOT}/av1/common/warped_motion.c"
+            "${AOM_ROOT}/av1/common/warped_motion.h")
+
+list(APPEND AOM_AV1_DECODER_SOURCES
+            "${AOM_ROOT}/av1/av1_dx_iface.c"
+            "${AOM_ROOT}/av1/decoder/decodeframe.c"
+            "${AOM_ROOT}/av1/decoder/decodeframe.h"
+            "${AOM_ROOT}/av1/decoder/decodemv.c"
+            "${AOM_ROOT}/av1/decoder/decodemv.h"
+            "${AOM_ROOT}/av1/decoder/decoder.c"
+            "${AOM_ROOT}/av1/decoder/decoder.h"
+            "${AOM_ROOT}/av1/decoder/decodetxb.c"
+            "${AOM_ROOT}/av1/decoder/decodetxb.h"
+            "${AOM_ROOT}/av1/decoder/detokenize.c"
+            "${AOM_ROOT}/av1/decoder/detokenize.h"
+            "${AOM_ROOT}/av1/decoder/dthread.c"
+            "${AOM_ROOT}/av1/decoder/dthread.h"
+            "${AOM_ROOT}/av1/decoder/obu.h"
+            "${AOM_ROOT}/av1/decoder/obu.c")
+
+list(APPEND AOM_AV1_ENCODER_SOURCES
+            "${AOM_ROOT}/av1/av1_cx_iface.c"
+            "${AOM_ROOT}/av1/encoder/aq_complexity.c"
+            "${AOM_ROOT}/av1/encoder/aq_complexity.h"
+            "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
+            "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
+            "${AOM_ROOT}/av1/encoder/aq_variance.c"
+            "${AOM_ROOT}/av1/encoder/aq_variance.h"
+            "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c"
+            "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h"
+            "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h"
+            "${AOM_ROOT}/av1/encoder/av1_fwd_txfm2d.c"
+            "${AOM_ROOT}/av1/encoder/av1_quantize.c"
+            "${AOM_ROOT}/av1/encoder/av1_quantize.h"
+            "${AOM_ROOT}/av1/encoder/bitstream.c"
+            "${AOM_ROOT}/av1/encoder/bitstream.h"
+            "${AOM_ROOT}/av1/encoder/block.h"
+            "${AOM_ROOT}/av1/encoder/context_tree.c"
+            "${AOM_ROOT}/av1/encoder/context_tree.h"
+            "${AOM_ROOT}/av1/encoder/corner_detect.c"
+            "${AOM_ROOT}/av1/encoder/corner_detect.h"
+            "${AOM_ROOT}/av1/encoder/corner_match.c"
+            "${AOM_ROOT}/av1/encoder/corner_match.h"
+            "${AOM_ROOT}/av1/encoder/cost.c"
+            "${AOM_ROOT}/av1/encoder/cost.h"
+            "${AOM_ROOT}/av1/encoder/encodeframe.c"
+            "${AOM_ROOT}/av1/encoder/encodeframe.h"
+            "${AOM_ROOT}/av1/encoder/encodemb.c"
+            "${AOM_ROOT}/av1/encoder/encodemb.h"
+            "${AOM_ROOT}/av1/encoder/encodemv.c"
+            "${AOM_ROOT}/av1/encoder/encodemv.h"
+            "${AOM_ROOT}/av1/encoder/encoder.c"
+            "${AOM_ROOT}/av1/encoder/encoder.h"
+            "${AOM_ROOT}/av1/encoder/encodetxb.c"
+            "${AOM_ROOT}/av1/encoder/encodetxb.h"
+            "${AOM_ROOT}/av1/encoder/ethread.c"
+            "${AOM_ROOT}/av1/encoder/ethread.h"
+            "${AOM_ROOT}/av1/encoder/extend.c"
+            "${AOM_ROOT}/av1/encoder/extend.h"
+            "${AOM_ROOT}/av1/encoder/firstpass.c"
+            "${AOM_ROOT}/av1/encoder/firstpass.h"
+            "${AOM_ROOT}/av1/encoder/global_motion.c"
+            "${AOM_ROOT}/av1/encoder/global_motion.h"
+            "${AOM_ROOT}/av1/encoder/grain_test_vectors.h"
+            "${AOM_ROOT}/av1/encoder/hash.c"
+            "${AOM_ROOT}/av1/encoder/hash.h"
+            "${AOM_ROOT}/av1/encoder/hash_motion.c"
+            "${AOM_ROOT}/av1/encoder/hash_motion.h"
+            "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
+            "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
+            "${AOM_ROOT}/av1/encoder/lookahead.c"
+            "${AOM_ROOT}/av1/encoder/lookahead.h"
+            "${AOM_ROOT}/av1/encoder/mbgraph.c"
+            "${AOM_ROOT}/av1/encoder/mbgraph.h"
+            "${AOM_ROOT}/av1/encoder/mcomp.c"
+            "${AOM_ROOT}/av1/encoder/mcomp.h"
+            "${AOM_ROOT}/av1/encoder/ml.c"
+            "${AOM_ROOT}/av1/encoder/ml.h"
+            "${AOM_ROOT}/av1/encoder/palette.c"
+            "${AOM_ROOT}/av1/encoder/palette.h"
+            "${AOM_ROOT}/av1/encoder/pickcdef.c"
+            "${AOM_ROOT}/av1/encoder/picklpf.c"
+            "${AOM_ROOT}/av1/encoder/picklpf.h"
+            "${AOM_ROOT}/av1/encoder/pickrst.c"
+            "${AOM_ROOT}/av1/encoder/pickrst.h"
+            "${AOM_ROOT}/av1/encoder/ransac.c"
+            "${AOM_ROOT}/av1/encoder/ransac.h"
+            "${AOM_ROOT}/av1/encoder/ratectrl.c"
+            "${AOM_ROOT}/av1/encoder/ratectrl.h"
+            "${AOM_ROOT}/av1/encoder/rd.c"
+            "${AOM_ROOT}/av1/encoder/rd.h"
+            "${AOM_ROOT}/av1/encoder/rdopt.c"
+            "${AOM_ROOT}/av1/encoder/rdopt.h"
+            "${AOM_ROOT}/av1/encoder/segmentation.c"
+            "${AOM_ROOT}/av1/encoder/segmentation.h"
+            "${AOM_ROOT}/av1/encoder/speed_features.c"
+            "${AOM_ROOT}/av1/encoder/speed_features.h"
+            "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+            "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+            "${AOM_ROOT}/av1/encoder/tokenize.c"
+            "${AOM_ROOT}/av1/encoder/tokenize.h"
+            "${AOM_ROOT}/av1/encoder/wedge_utils.c"
+            "${AOM_ROOT}/third_party/fastfeat/fast.c"
+            "${AOM_ROOT}/third_party/fastfeat/fast.h"
+            "${AOM_ROOT}/third_party/fastfeat/fast_9.c"
+            "${AOM_ROOT}/third_party/fastfeat/nonmax.c"
+            "${AOM_ROOT}/third_party/vector/vector.c"
+            "${AOM_ROOT}/third_party/vector/vector.h"
+            "${AOM_ROOT}/av1/encoder/dwt.c"
+            "${AOM_ROOT}/av1/encoder/dwt.h")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSE2
+            "${AOM_ROOT}/av1/common/cdef_block_sse2.c"
+            "${AOM_ROOT}/av1/common/x86/cfl_sse2.c"
+            "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
+            "${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c"
+            "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
+            "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c"
+            "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
+            "${AOM_ROOT}/av1/common/cdef_block_ssse3.c"
+            "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c"
+            "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h"
+            "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c"
+            "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c"
+            "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1
+            "${AOM_ROOT}/av1/common/cdef_block_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/av1_highbd_convolve_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h"
+            "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/reconinter_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c"
+            "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
+            "${AOM_ROOT}/av1/common/cdef_block_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.h"
+            "${AOM_ROOT}/av1/common/x86/cfl_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/convolve_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c"
+            "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c")
+
+list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm"
+            "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm"
+            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_apply_sse2.asm")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
+            "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h"
+            "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
+
+list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64
+            "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
+            "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm1d_sse4.c"
+            "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c"
+            "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c"
+            "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c"
+            "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
+            "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
+            "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
+            "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_MSA
+            "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c"
+            "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c"
+            "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_NEON
+            "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c"
+            "${AOM_ROOT}/av1/common/arm/cfl_neon.c"
+            "${AOM_ROOT}/av1/common/arm/convolve_neon.c"
+            "${AOM_ROOT}/av1/common/arm/convolve_neon.h"
+            "${AOM_ROOT}/av1/common/arm/jnt_convolve_neon.c"
+            "${AOM_ROOT}/av1/common/arm/mem_neon.h"
+            "${AOM_ROOT}/av1/common/arm/transpose_neon.h"
+            "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c"
+            "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
+            "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
+            "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c"
+            "${AOM_ROOT}/av1/common/arm/intrapred_neon.c"
+            "${AOM_ROOT}/av1/common/cdef_block_neon.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
+            "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c")
+
+if(CONFIG_ACCOUNTING)
+  list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/accounting.c"
+              "${AOM_ROOT}/av1/decoder/accounting.h")
+endif()
+
+if(CONFIG_INSPECTION)
+  list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/inspection.c"
+              "${AOM_ROOT}/av1/decoder/inspection.h")
+endif()
+
+if(CONFIG_INTERNAL_STATS)
+  list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c")
+endif()
 
 # Setup AV1 common/decoder/encoder targets. The libaom target must exist before
 # this function is called.
-function (setup_av1_targets)
+function(setup_av1_targets)
   add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES})
   list(APPEND AOM_LIB_TARGETS aom_av1_common)
 
@@ -537,104 +333,117 @@ function (setup_av1_targets)
   # dummy source file to the aom_av1 target.
   add_dummy_source_file_to_target("aom_av1" "c")
 
-  if (CONFIG_AV1_DECODER)
+  if(CONFIG_AV1_DECODER)
     add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES})
     set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_decoder)
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_decoder>)
-  endif ()
+  endif()
 
-  if (CONFIG_AV1_ENCODER)
+  if(CONFIG_AV1_ENCODER)
     add_library(aom_av1_encoder OBJECT ${AOM_AV1_ENCODER_SOURCES})
     set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_encoder)
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_encoder>)
-  endif ()
+  endif()
 
-  if (HAVE_SSE2)
+  if(HAVE_SSE2)
     require_compiler_flag_nomsvc("-msse2" NO)
     add_intrinsics_object_library("-msse2" "sse2" "aom_av1_common"
                                   "AOM_AV1_COMMON_INTRIN_SSE2" "aom")
-    if (CONFIG_AV1_DECODER)
-      if (AOM_AV1_DECODER_ASM_SSE2)
+    if(CONFIG_AV1_DECODER)
+      if(AOM_AV1_DECODER_ASM_SSE2)
         add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2" "aom")
-      endif ()
+      endif()
 
-      if (AOM_AV1_DECODER_INTRIN_SSE2)
+      if(AOM_AV1_DECODER_INTRIN_SSE2)
         add_intrinsics_object_library("-msse2" "sse2" "aom_av1_decoder"
                                       "AOM_AV1_DECODER_INTRIN_SSE2" "aom")
-      endif ()
-    endif ()
+      endif()
+    endif()
 
-    if (CONFIG_AV1_ENCODER)
+    if(CONFIG_AV1_ENCODER)
       add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2" "aom")
       add_intrinsics_object_library("-msse2" "sse2" "aom_av1_encoder"
                                     "AOM_AV1_ENCODER_INTRIN_SSE2" "aom")
-    endif ()
-  endif ()
+    endif()
+  endif()
 
-  if (HAVE_SSSE3)
+  if(HAVE_SSSE3)
     require_compiler_flag_nomsvc("-mssse3" NO)
     add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common"
                                   "AOM_AV1_COMMON_INTRIN_SSSE3" "aom")
 
-    if (CONFIG_AV1_DECODER)
-      if (AOM_AV1_DECODER_INTRIN_SSSE3)
+    if(CONFIG_AV1_DECODER)
+      if(AOM_AV1_DECODER_INTRIN_SSSE3)
         add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_decoder"
                                       "AOM_AV1_DECODER_INTRIN_SSSE3" "aom")
-      endif ()
-    endif ()
-  endif ()
+      endif()
+    endif()
+  endif()
 
-  if (HAVE_SSE4_1)
+  if(HAVE_SSE4_1)
     require_compiler_flag_nomsvc("-msse4.1" NO)
     add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_common"
                                   "AOM_AV1_COMMON_INTRIN_SSE4_1" "aom")
 
-    if (CONFIG_AV1_ENCODER)
-      if ("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+    if(CONFIG_AV1_ENCODER)
+      if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
         add_asm_library("aom_av1_encoder_ssse3"
                         "AOM_AV1_ENCODER_ASM_SSSE3_X86_64" "aom")
-      endif ()
+      endif()
 
-      if (AOM_AV1_ENCODER_INTRIN_SSE4_1)
+      if(AOM_AV1_ENCODER_INTRIN_SSE4_1)
         add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_encoder"
                                       "AOM_AV1_ENCODER_INTRIN_SSE4_1" "aom")
-      endif ()
-    endif ()
-  endif ()
+      endif()
+    endif()
+  endif()
 
-  if (HAVE_AVX2)
+  if(HAVE_SSE4_2)
+    require_compiler_flag_nomsvc("-msse4.2" NO)
+    if(CONFIG_AV1_ENCODER)
+      if(AOM_AV1_ENCODER_INTRIN_SSE4_2)
+        add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder"
+                                      "AOM_AV1_ENCODER_INTRIN_SSE4_2" "aom")
+      endif()
+    endif()
+  endif()
+
+  if(HAVE_AVX2)
     require_compiler_flag_nomsvc("-mavx2" NO)
     add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
                                   "AOM_AV1_COMMON_INTRIN_AVX2" "aom")
 
-    if (CONFIG_AV1_ENCODER)
+    if(CONFIG_AV1_ENCODER)
       add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_encoder"
                                     "AOM_AV1_ENCODER_INTRIN_AVX2" "aom")
-    endif ()
-  endif ()
+    endif()
+  endif()
 
-  if (HAVE_NEON)
-    if (AOM_AV1_COMMON_INTRIN_NEON)
-      add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}"
-                                    "neon"
+  if(HAVE_NEON)
+    if(AOM_AV1_COMMON_INTRIN_NEON)
+      add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon"
                                     "aom_av1_common"
                                     "AOM_AV1_COMMON_INTRIN_NEON" "aom")
-    endif ()
+    endif()
 
-    if (AOM_AV1_ENCODER_INTRIN_NEON)
-      add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}"
-                                    "neon"
+    if(AOM_AV1_ENCODER_INTRIN_NEON)
+      add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon"
                                     "aom_av1_encoder"
                                     "AOM_AV1_ENCODER_INTRIN_NEON" "aom")
-    endif ()
-  endif ()
+    endif()
+  endif()
 
-  if (HAVE_MSA)
-    add_intrinsics_object_library("" "msa" "aom_av1_common"
-                                  "AOM_AV1_COMMON_INTRIN_MSA" "aom")
+  if(HAVE_VSX)
+    if(AOM_AV1_COMMON_INTRIN_VSX)
+      add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common"
+                                    "AOM_AV1_COMMON_INTRIN_VSX" "aom")
+    endif()
+  endif()
+
+  if(HAVE_MSA)
     add_intrinsics_object_library("" "msa" "aom_av1_encoder"
                                   "AOM_AV1_ENCODER_INTRIN_MSA" "aom")
-  endif ()
+  endif()
 
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
   target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
@@ -642,9 +451,7 @@ function (setup_av1_targets)
   # Pass the new lib targets up to the parent scope instance of
   # $AOM_LIB_TARGETS.
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
-endfunction ()
-
-function (setup_av1_test_targets)
-endfunction ()
+endfunction()
 
-endif ()  # AOM_AV1_AV1_CMAKE_
+function(setup_av1_test_targets)
+endfunction()
diff --git a/third_party/aom/av1/av1_common.mk b/third_party/aom/av1/av1_common.mk
deleted file mode 100644
index 35466ac88..000000000
--- a/third_party/aom/av1/av1_common.mk
+++ /dev/null
@@ -1,205 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-AV1_COMMON_SRCS-yes += av1_common.mk
-AV1_COMMON_SRCS-yes += av1_iface_common.h
-AV1_COMMON_SRCS-yes += common/alloccommon.c
-AV1_COMMON_SRCS-yes += common/av1_loopfilter.c
-AV1_COMMON_SRCS-yes += common/av1_loopfilter.h
-AV1_COMMON_SRCS-yes += common/blockd.c
-AV1_COMMON_SRCS-yes += common/debugmodes.c
-AV1_COMMON_SRCS-yes += common/entropy.c
-AV1_COMMON_SRCS-yes += common/entropymode.c
-AV1_COMMON_SRCS-yes += common/entropymv.c
-AV1_COMMON_SRCS-yes += common/frame_buffers.c
-AV1_COMMON_SRCS-yes += common/frame_buffers.h
-AV1_COMMON_SRCS-yes += common/alloccommon.h
-AV1_COMMON_SRCS-yes += common/blockd.h
-AV1_COMMON_SRCS-yes += common/common.h
-AV1_COMMON_SRCS-yes += common/daala_tx.c
-AV1_COMMON_SRCS-yes += common/daala_tx.h
-AV1_COMMON_SRCS-yes += common/entropy.h
-AV1_COMMON_SRCS-yes += common/entropymode.h
-AV1_COMMON_SRCS-yes += common/entropymv.h
-AV1_COMMON_SRCS-yes += common/enums.h
-AV1_COMMON_SRCS-yes += common/filter.h
-AV1_COMMON_SRCS-yes += common/filter.c
-AV1_COMMON_SRCS-yes += common/idct.h
-AV1_COMMON_SRCS-yes += common/idct.c
-AV1_COMMON_SRCS-yes += common/thread_common.h
-AV1_COMMON_SRCS-$(CONFIG_LV_MAP) += common/txb_common.h
-AV1_COMMON_SRCS-$(CONFIG_LV_MAP) += common/txb_common.c
-AV1_COMMON_SRCS-yes += common/mv.h
-AV1_COMMON_SRCS-yes += common/onyxc_int.h
-AV1_COMMON_SRCS-yes += common/pred_common.h
-AV1_COMMON_SRCS-yes += common/pred_common.c
-AV1_COMMON_SRCS-yes += common/quant_common.h
-AV1_COMMON_SRCS-yes += common/reconinter.h
-AV1_COMMON_SRCS-yes += common/reconintra.h
-AV1_COMMON_SRCS-yes += common/av1_rtcd.c
-AV1_COMMON_SRCS-yes += common/av1_rtcd_defs.pl
-AV1_COMMON_SRCS-yes += common/scale.h
-AV1_COMMON_SRCS-yes += common/scale.c
-AV1_COMMON_SRCS-yes += common/seg_common.h
-AV1_COMMON_SRCS-yes += common/seg_common.c
-AV1_COMMON_SRCS-yes += common/tile_common.h
-AV1_COMMON_SRCS-yes += common/tile_common.c
-AV1_COMMON_SRCS-yes += common/thread_common.c
-AV1_COMMON_SRCS-yes += common/mvref_common.c
-AV1_COMMON_SRCS-yes += common/mvref_common.h
-AV1_COMMON_SRCS-yes += common/quant_common.c
-AV1_COMMON_SRCS-yes += common/reconinter.c
-AV1_COMMON_SRCS-yes += common/reconintra.c
-AV1_COMMON_SRCS-yes += common/resize.c
-AV1_COMMON_SRCS-yes += common/resize.h
-AV1_COMMON_SRCS-yes += common/common_data.h
-AV1_COMMON_SRCS-yes += common/scan.c
-AV1_COMMON_SRCS-yes += common/scan.h
-# TODO(angiebird) the forward transform belongs under encoder/
-AV1_COMMON_SRCS-yes += common/av1_txfm.h
-AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.h
-AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.c
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d.h
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d.c
-AV1_COMMON_SRCS-yes += common/av1_fwd_txfm2d.c
-AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d_cfg.h
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm2d.c
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d_cfg.h
-AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/convolve_avx2.c
-AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_ssse3.c
-ifeq ($(CONFIG_CONVOLVE_ROUND)x$(CONFIG_COMPOUND_ROUND),yesx)
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_convolve_scale_sse4.c
-endif
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c
-endif
-AV1_COMMON_SRCS-yes += common/convolve.c
-AV1_COMMON_SRCS-yes += common/convolve.h
-ifeq ($(CONFIG_LOOP_RESTORATION),yes)
-AV1_COMMON_SRCS-yes += common/restoration.h
-AV1_COMMON_SRCS-yes += common/restoration.c
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/selfguided_sse4.c
-endif
-ifeq ($(CONFIG_INTRA_EDGE),yes)
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/intra_edge_sse4.c
-endif
-ifeq (yes,$(filter $(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION),yes))
-AV1_COMMON_SRCS-yes += common/warped_motion.h
-AV1_COMMON_SRCS-yes += common/warped_motion.c
-endif
-ifeq ($(CONFIG_CDEF),yes)
-ifeq ($(CONFIG_CDEF_SINGLEPASS),yes)
-AV1_COMMON_SRCS-$(HAVE_AVX2) += common/cdef_block_avx2.c
-else
-AV1_COMMON_SRCS-yes += common/clpf.c
-AV1_COMMON_SRCS-yes += common/clpf_simd.h
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c
-AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
-AV1_COMMON_SRCS-$(HAVE_NEON) += common/clpf_neon.c
-endif
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/cdef_block_sse2.c
-AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/cdef_block_ssse3.c
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/cdef_block_sse4.c
-AV1_COMMON_SRCS-$(HAVE_NEON) += common/cdef_block_neon.c
-AV1_COMMON_SRCS-yes += common/cdef_block.c
-AV1_COMMON_SRCS-yes += common/cdef_block.h
-AV1_COMMON_SRCS-yes += common/cdef_block_simd.h
-AV1_COMMON_SRCS-yes += common/cdef.c
-AV1_COMMON_SRCS-yes += common/cdef.h
-endif
-AV1_COMMON_SRCS-yes += common/odintrin.c
-AV1_COMMON_SRCS-yes += common/odintrin.h
-
-ifeq ($(CONFIG_CFL),yes)
-AV1_COMMON_SRCS-yes += common/cfl.h
-AV1_COMMON_SRCS-yes += common/cfl.c
-endif
-
-ifeq ($(CONFIG_MOTION_VAR),yes)
-AV1_COMMON_SRCS-yes += common/obmc.h
-endif
-
-ifeq ($(CONFIG_PVQ),yes)
-# PVQ from daala
-AV1_COMMON_SRCS-yes += common/pvq.c
-AV1_COMMON_SRCS-yes += common/partition.c
-AV1_COMMON_SRCS-yes += common/partition.h
-AV1_COMMON_SRCS-yes += common/zigzag4.c
-AV1_COMMON_SRCS-yes += common/zigzag8.c
-AV1_COMMON_SRCS-yes += common/zigzag16.c
-AV1_COMMON_SRCS-yes += common/zigzag32.c
-AV1_COMMON_SRCS-yes += common/zigzag.h
-AV1_COMMON_SRCS-yes += common/generic_code.c
-AV1_COMMON_SRCS-yes += common/pvq_state.c
-AV1_COMMON_SRCS-yes += common/laplace_tables.c
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/pvq_sse4.c
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/pvq_sse4.h
-endif
-ifneq ($(findstring yes,$(CONFIG_PVQ)$(CONFIG_DAALA_DIST)$(CONFIG_XIPHRC)),)
-AV1_COMMON_SRCS-yes += common/pvq.h
-AV1_COMMON_SRCS-yes += common/pvq_state.h
-AV1_COMMON_SRCS-yes += common/generic_code.h
-endif
-
-# common (msa)
-AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/av1_idct4x4_msa.c
-AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/av1_idct8x8_msa.c
-AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/av1_idct16x16_msa.c
-
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
-AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/hybrid_inv_txfm_avx2.c
-
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm1d_sse4.h
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm1d_sse4.c
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm2d_sse4.c
-endif
-
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_txfm_utility_sse4.h
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_inv_txfm_sse4.c
-AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/highbd_inv_txfm_avx2.c
-
-ifneq ($(CONFIG_HIGHBITDEPTH),yes)
-AV1_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c
-AV1_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c
-endif
-
-ifeq ($(CONFIG_FILTER_INTRA),yes)
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filterintra_sse4.c
-endif
-
-ifneq ($(findstring yes,$(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION)),)
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/warp_plane_sse2.c
-AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/warp_plane_ssse3.c
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_warp_plane_ssse3.c
-endif
-endif
-
-ifeq ($(CONFIG_CONVOLVE_ROUND),yes)
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/convolve_2d_sse2.c
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_convolve_2d_ssse3.c
-endif
-endif
-
-
-ifeq ($(CONFIG_Q_ADAPT_PROBS),yes)
-AV1_COMMON_SRCS-yes += common/token_cdfs.h
-endif
-
-ifeq ($(CONFIG_NCOBMC_ADAPT_WEIGHT),yes)
-AV1_COMMON_SRCS-yes += common/ncobmc_kernels.h
-AV1_COMMON_SRCS-yes += common/ncobmc_kernels.c
-endif
-
-$(eval $(call rtcd_h_template,av1_rtcd,av1/common/av1_rtcd_defs.pl))
diff --git a/third_party/aom/av1/av1_cx.mk b/third_party/aom/av1/av1_cx.mk
deleted file mode 100644
index 13f297403..000000000
--- a/third_party/aom/av1/av1_cx.mk
+++ /dev/null
@@ -1,176 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-AV1_CX_EXPORTS += exports_enc
-
-AV1_CX_SRCS-yes += $(AV1_COMMON_SRCS-yes)
-AV1_CX_SRCS-no  += $(AV1_COMMON_SRCS-no)
-AV1_CX_SRCS_REMOVE-yes += $(AV1_COMMON_SRCS_REMOVE-yes)
-AV1_CX_SRCS_REMOVE-no  += $(AV1_COMMON_SRCS_REMOVE-no)
-
-AV1_CX_SRCS-yes += av1_cx_iface.c
-
-AV1_CX_SRCS-yes += encoder/av1_quantize.c
-AV1_CX_SRCS-yes += encoder/av1_quantize.h
-AV1_CX_SRCS-yes += encoder/bitstream.c
-AV1_CX_SRCS-$(CONFIG_BGSPRITE) += encoder/bgsprite.c
-AV1_CX_SRCS-$(CONFIG_BGSPRITE) += encoder/bgsprite.h
-AV1_CX_SRCS-yes += encoder/context_tree.c
-AV1_CX_SRCS-yes += encoder/context_tree.h
-AV1_CX_SRCS-yes += encoder/cost.h
-AV1_CX_SRCS-yes += encoder/cost.c
-AV1_CX_SRCS-yes += encoder/dct.c
-AV1_CX_SRCS-yes += encoder/hybrid_fwd_txfm.c
-AV1_CX_SRCS-yes += encoder/hybrid_fwd_txfm.h
-AV1_CX_SRCS-yes += encoder/encodeframe.c
-AV1_CX_SRCS-yes += encoder/encodeframe.h
-AV1_CX_SRCS-yes += encoder/encodemb.c
-AV1_CX_SRCS-yes += encoder/encodemv.c
-AV1_CX_SRCS-yes += encoder/ethread.h
-AV1_CX_SRCS-yes += encoder/ethread.c
-AV1_CX_SRCS-yes += encoder/extend.c
-AV1_CX_SRCS-yes += encoder/firstpass.c
-AV1_CX_SRCS-yes += encoder/mathutils.h
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast.h
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/nonmax.c
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast_9.c
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast.c
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_match.c
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_match.h
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_detect.c
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_detect.h
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/global_motion.c
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/global_motion.h
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/ransac.c
-AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/ransac.h
-AV1_CX_SRCS-yes += encoder/block.h
-AV1_CX_SRCS-yes += encoder/bitstream.h
-AV1_CX_SRCS-yes += encoder/encodemb.h
-AV1_CX_SRCS-yes += encoder/encodemv.h
-AV1_CX_SRCS-$(CONFIG_LV_MAP) += encoder/encodetxb.c
-AV1_CX_SRCS-$(CONFIG_LV_MAP) += encoder/encodetxb.h
-AV1_CX_SRCS-yes += encoder/extend.h
-AV1_CX_SRCS-yes += encoder/firstpass.h
-AV1_CX_SRCS-yes += encoder/lookahead.c
-AV1_CX_SRCS-yes += encoder/lookahead.h
-AV1_CX_SRCS-yes += encoder/mcomp.h
-AV1_CX_SRCS-yes += encoder/encoder.h
-AV1_CX_SRCS-yes += encoder/random.h
-AV1_CX_SRCS-yes += encoder/ratectrl.h
-ifeq ($(CONFIG_XIPHRC),yes)
-AV1_CX_SRCS-yes += encoder/ratectrl_xiph.h
-endif
-AV1_CX_SRCS-yes += encoder/rd.h
-AV1_CX_SRCS-yes += encoder/rdopt.h
-AV1_CX_SRCS-yes += encoder/tokenize.h
-AV1_CX_SRCS-yes += encoder/treewriter.h
-AV1_CX_SRCS-yes += encoder/mcomp.c
-AV1_CX_SRCS-yes += encoder/encoder.c
-AV1_CX_SRCS-yes += encoder/k_means_template.h
-AV1_CX_SRCS-yes += encoder/palette.h
-AV1_CX_SRCS-yes += encoder/palette.c
-AV1_CX_SRCS-yes += encoder/picklpf.c
-AV1_CX_SRCS-yes += encoder/picklpf.h
-AV1_CX_SRCS-$(CONFIG_LOOP_RESTORATION) += encoder/pickrst.c
-AV1_CX_SRCS-$(CONFIG_LOOP_RESTORATION) += encoder/pickrst.h
-AV1_CX_SRCS-yes += encoder/ratectrl.c
-ifeq ($(CONFIG_XIPHRC),yes)
-AV1_CX_SRCS-yes += encoder/ratectrl_xiph.c
-endif
-AV1_CX_SRCS-yes += encoder/rd.c
-AV1_CX_SRCS-yes += encoder/rdopt.c
-AV1_CX_SRCS-yes += encoder/segmentation.c
-AV1_CX_SRCS-yes += encoder/segmentation.h
-AV1_CX_SRCS-yes += encoder/speed_features.c
-AV1_CX_SRCS-yes += encoder/speed_features.h
-AV1_CX_SRCS-yes += encoder/subexp.c
-AV1_CX_SRCS-yes += encoder/subexp.h
-AV1_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/blockiness.c
-
-AV1_CX_SRCS-yes += encoder/tokenize.c
-AV1_CX_SRCS-yes += encoder/treewriter.c
-AV1_CX_SRCS-yes += encoder/aq_variance.c
-AV1_CX_SRCS-yes += encoder/aq_variance.h
-AV1_CX_SRCS-yes += encoder/aq_cyclicrefresh.c
-AV1_CX_SRCS-yes += encoder/aq_cyclicrefresh.h
-AV1_CX_SRCS-yes += encoder/aq_complexity.c
-AV1_CX_SRCS-yes += encoder/aq_complexity.h
-AV1_CX_SRCS-yes += encoder/temporal_filter.c
-AV1_CX_SRCS-yes += encoder/temporal_filter.h
-AV1_CX_SRCS-yes += encoder/mbgraph.c
-AV1_CX_SRCS-yes += encoder/mbgraph.h
-AV1_CX_SRCS-yes += encoder/hash.c
-AV1_CX_SRCS-yes += encoder/hash.h
-ifeq ($(CONFIG_HASH_ME),yes)
-AV1_CX_SRCS-yes += ../third_party/vector/vector.h
-AV1_CX_SRCS-yes += ../third_party/vector/vector.c
-AV1_CX_SRCS-yes += encoder/hash_motion.c
-AV1_CX_SRCS-yes += encoder/hash_motion.h
-endif
-ifeq ($(CONFIG_CDEF),yes)
-AV1_CX_SRCS-yes += encoder/pickcdef.c
-endif
-ifeq ($(CONFIG_PVQ),yes)
-# PVQ from daala
-AV1_CX_SRCS-yes += encoder/daala_compat_enc.c
-AV1_CX_SRCS-yes += encoder/pvq_encoder.c
-AV1_CX_SRCS-yes += encoder/pvq_encoder.h
-AV1_CX_SRCS-yes += encoder/generic_encoder.c
-AV1_CX_SRCS-yes += encoder/laplace_encoder.c
-endif
-ifneq ($(findstring yes,$(CONFIG_XIPHRC)$(CONFIG_PVQ)),)
-AV1_CX_SRCS-yes += encoder/encint.h
-endif
-
-AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/av1_quantize_sse2.c
-AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/av1_quantize_avx2.c
-AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
-
-AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c
-AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/av1_highbd_quantize_avx2.c
-
-
-AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
-AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm
-
-ifeq ($(ARCH_X86_64),yes)
-AV1_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/av1_quantize_ssse3_x86_64.asm
-endif
-
-AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
-AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/hybrid_fwd_txfm_avx2.c
-
-AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/av1_highbd_quantize_sse4.c
-
-AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
-
-AV1_CX_SRCS-yes += encoder/wedge_utils.c
-AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/wedge_utils_sse2.c
-
-AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/error_intrin_avx2.c
-
-ifneq ($(CONFIG_HIGHBITDEPTH),yes)
-AV1_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/error_neon.c
-endif
-AV1_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/quantize_neon.c
-
-AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/error_msa.c
-AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct4x4_msa.c
-AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct8x8_msa.c
-AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct16x16_msa.c
-AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct_msa.h
-AV1_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
-
-ifeq ($(CONFIG_GLOBAL_MOTION),yes)
-AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/corner_match_sse4.c
-endif
-
-AV1_CX_SRCS-yes := $(filter-out $(AV1_CX_SRCS_REMOVE-yes),$(AV1_CX_SRCS-yes))
diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c
index 0f6c1c4d7..9d5414c1e 100644
--- a/third_party/aom/av1/av1_cx_iface.c
+++ b/third_party/aom/av1/av1_cx_iface.c
@@ -11,37 +11,33 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
 #include "aom/aom_encoder.h"
 #include "aom_ports/aom_once.h"
 #include "aom_ports/system_state.h"
 #include "aom/internal/aom_codec_internal.h"
-#include "./aom_version.h"
 #include "av1/encoder/encoder.h"
 #include "aom/aomcx.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/av1_iface_common.h"
+#include "av1/encoder/bitstream.h"
+#include "aom_ports/mem_ops.h"
 
 #define MAG_SIZE (4)
-#define MAX_INDEX_SIZE (256)
+#define MAX_NUM_ENHANCEMENT_LAYERS 3
 
 struct av1_extracfg {
   int cpu_used;  // available cpu percentage in 1/16
+  int dev_sf;
   unsigned int enable_auto_alt_ref;
-#if CONFIG_EXT_REFS
   unsigned int enable_auto_bwd_ref;
-#endif  // CONFIG_EXT_REFS
   unsigned int noise_sensitivity;
   unsigned int sharpness;
   unsigned int static_thresh;
   unsigned int tile_columns;  // log2 number of tile columns
   unsigned int tile_rows;     // log2 number of tile rows
-#if CONFIG_DEPENDENT_HORZTILES
-  unsigned int dependent_horz_tiles;
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-  unsigned int loop_filter_across_tiles_enabled;
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
   unsigned int arnr_max_frames;
   unsigned int arnr_strength;
   unsigned int min_gf_interval;
@@ -52,107 +48,118 @@ struct av1_extracfg {
   unsigned int rc_max_inter_bitrate_pct;
   unsigned int gf_cbr_boost_pct;
   unsigned int lossless;
-#if CONFIG_AOM_QM
+  unsigned int enable_cdef;
+  unsigned int enable_restoration;
+  unsigned int disable_trellis_quant;
   unsigned int enable_qm;
+  unsigned int qm_y;
+  unsigned int qm_u;
+  unsigned int qm_v;
   unsigned int qm_min;
   unsigned int qm_max;
-#endif
 #if CONFIG_DIST_8X8
   unsigned int enable_dist_8x8;
 #endif
   unsigned int num_tg;
   unsigned int mtu_size;
-#if CONFIG_TEMPMV_SIGNALING
-  unsigned int disable_tempmv;
-#endif
+
+  aom_timing_info_type_t timing_info_type;
   unsigned int frame_parallel_decoding_mode;
+  int use_dual_filter;
   AQ_MODE aq_mode;
-#if CONFIG_EXT_DELTA_Q
   DELTAQ_MODE deltaq_mode;
-#endif
   unsigned int frame_periodic_boost;
   aom_bit_depth_t bit_depth;
   aom_tune_content content;
-  aom_color_space_t color_space;
-  aom_transfer_function_t transfer_function;
+  aom_color_primaries_t color_primaries;
+  aom_transfer_characteristics_t transfer_characteristics;
+  aom_matrix_coefficients_t matrix_coefficients;
   aom_chroma_sample_position_t chroma_sample_position;
   int color_range;
   int render_width;
   int render_height;
   aom_superblock_size_t superblock_size;
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-  int ans_window_size_log2;
-#endif
-#if CONFIG_EXT_TILE
   unsigned int single_tile_decoding;
-#endif  // CONFIG_EXT_TILE
+  int error_resilient_mode;
+  int s_frame_mode;
 
+  int film_grain_test_vector;
+  const char *film_grain_table_filename;
   unsigned int motion_vector_unit_test;
+  unsigned int cdf_update_mode;
+  int enable_order_hint;
+  int enable_jnt_comp;
+  int enable_ref_frame_mvs;  // sequence level
+  int allow_ref_frame_mvs;   // frame level
+  int enable_warped_motion;  // sequence level
+  int allow_warped_motion;   // frame level
+  int enable_superres;
 };
 
 static struct av1_extracfg default_extra_cfg = {
-  0,  // cpu_used
-  1,  // enable_auto_alt_ref
-#if CONFIG_EXT_REFS
-  0,    // enable_auto_bwd_ref
-#endif  // CONFIG_EXT_REFS
-  0,    // noise_sensitivity
-  0,    // sharpness
-  0,    // static_thresh
-  0,    // tile_columns
-  0,    // tile_rows
-#if CONFIG_DEPENDENT_HORZTILES
-  0,  // Dependent Horizontal tiles
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-  1,              // loop_filter_across_tiles_enabled
-#endif            // CONFIG_LOOPFILTERING_ACROSS_TILES
-  7,              // arnr_max_frames
-  5,              // arnr_strength
-  0,              // min_gf_interval; 0 -> default decision
-  0,              // max_gf_interval; 0 -> default decision
-  AOM_TUNE_PSNR,  // tuning
-  10,             // cq_level
-  0,              // rc_max_intra_bitrate_pct
-  0,              // rc_max_inter_bitrate_pct
-  0,              // gf_cbr_boost_pct
-  0,              // lossless
-#if CONFIG_AOM_QM
+  0,                 // cpu_used
+  0,                 // dev_sf
+  1,                 // enable_auto_alt_ref
+  0,                 // enable_auto_bwd_ref
+  0,                 // noise_sensitivity
+  0,                 // sharpness
+  0,                 // static_thresh
+  0,                 // tile_columns
+  0,                 // tile_rows
+  7,                 // arnr_max_frames
+  5,                 // arnr_strength
+  0,                 // min_gf_interval; 0 -> default decision
+  0,                 // max_gf_interval; 0 -> default decision
+  AOM_TUNE_PSNR,     // tuning
+  10,                // cq_level
+  0,                 // rc_max_intra_bitrate_pct
+  0,                 // rc_max_inter_bitrate_pct
+  0,                 // gf_cbr_boost_pct
+  0,                 // lossless
+  1,                 // enable_cdef
+  1,                 // enable_restoration
+  0,                 // disable_trellis_quant
   0,                 // enable_qm
+  DEFAULT_QM_Y,      // qm_y
+  DEFAULT_QM_U,      // qm_u
+  DEFAULT_QM_V,      // qm_v
   DEFAULT_QM_FIRST,  // qm_min
   DEFAULT_QM_LAST,   // qm_max
-#endif
 #if CONFIG_DIST_8X8
   0,
 #endif
-  1,  // max number of tile groups
-  0,  // mtu_size
-#if CONFIG_TEMPMV_SIGNALING
-  0,  // disable temporal mv prediction
-#endif
-  1,      // frame_parallel_decoding_mode
-  NO_AQ,  // aq_mode
-#if CONFIG_EXT_DELTA_Q
-  NO_DELTA_Q,  // deltaq_mode
-#endif
-  CONFIG_XIPHRC,                // frame_periodic_delta_q
+  1,                            // max number of tile groups
+  0,                            // mtu_size
+  AOM_TIMING_UNSPECIFIED,       // No picture timing signaling in bitstream
+  1,                            // frame_parallel_decoding_mode
+  1,                            // enable dual filter
+  NO_AQ,                        // aq_mode
+  NO_DELTA_Q,                   // deltaq_mode
+  0,                            // frame_periodic_delta_q
   AOM_BITS_8,                   // Bit depth
   AOM_CONTENT_DEFAULT,          // content
-  AOM_CS_UNKNOWN,               // color space
-  AOM_TF_UNKNOWN,               // transfer function
+  AOM_CICP_CP_UNSPECIFIED,      // CICP color space
+  AOM_CICP_TC_UNSPECIFIED,      // CICP transfer characteristics
+  AOM_CICP_MC_UNSPECIFIED,      // CICP matrix coefficients
   AOM_CSP_UNKNOWN,              // chroma sample position
   0,                            // color range
   0,                            // render width
   0,                            // render height
   AOM_SUPERBLOCK_SIZE_DYNAMIC,  // superblock_size
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-  23,  // ans_window_size_log2
-#endif
-#if CONFIG_EXT_TILE
-  0,    // Single tile decoding is off by default.
-#endif  // CONFIG_EXT_TILE
-
-  0,  // motion_vector_unit_test
+  0,                            // Single tile decoding is off by default.
+  0,                            // error_resilient_mode off by default.
+  0,                            // s_frame_mode off by default.
+  0,                            // film_grain_test_vector
+  0,                            // film_grain_table_filename
+  0,                            // motion_vector_unit_test
+  1,                            // CDF update mode
+  1,                            // frame order hint
+  1,                            // jnt_comp
+  1,                            // enable_ref_frame_mvs sequence level
+  1,                            // allow ref_frame_mvs frame level
+  1,                            // enable_warped_motion at sequence level
+  1,                            // allow_warped_motion at frame level
+  1,                            // superres
 };
 
 struct aom_codec_alg_priv {
@@ -204,11 +211,6 @@ static aom_codec_err_t update_error_state(
     if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]"); \
   } while (0)
 
-#define RANGE_CHECK_LO(p, memb, lo)                                     \
-  do {                                                                  \
-    if (!((p)->memb >= (lo))) ERROR(#memb " out of range [" #lo "..]"); \
-  } while (0)
-
 #define RANGE_CHECK_BOOL(p, memb)                                     \
   do {                                                                \
     if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \
@@ -221,15 +223,13 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
   RANGE_CHECK(cfg, g_h, 1, 65535);  // 16 bits available
   RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
   RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
-  RANGE_CHECK_HI(cfg, g_profile, 3);
+  RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1);
 
   RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
   RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
   RANGE_CHECK_BOOL(extra_cfg, lossless);
   RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1);
-#if CONFIG_EXT_DELTA_Q
   RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTAQ_MODE_COUNT - 1);
-#endif
   RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
   RANGE_CHECK_HI(cfg, g_threads, 64);
   RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
@@ -255,7 +255,6 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
               SCALE_NUMERATOR << 1);
   RANGE_CHECK(cfg, rc_resize_kf_denominator, SCALE_NUMERATOR,
               SCALE_NUMERATOR << 1);
-#if CONFIG_FRAME_SUPERRES
   RANGE_CHECK_HI(cfg, rc_superres_mode, SUPERRES_MODES - 1);
   RANGE_CHECK(cfg, rc_superres_denominator, SCALE_NUMERATOR,
               SCALE_NUMERATOR << 1);
@@ -263,7 +262,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
               SCALE_NUMERATOR << 1);
   RANGE_CHECK(cfg, rc_superres_qthresh, 1, 63);
   RANGE_CHECK(cfg, rc_superres_kf_qthresh, 1, 63);
-#endif  // CONFIG_FRAME_SUPERRES
+  RANGE_CHECK_HI(extra_cfg, cdf_update_mode, 2);
 
   // AV1 does not support a lower bound on the keyframe interval in
   // automatic keyframe placement mode.
@@ -275,53 +274,25 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
 
   RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2);
   RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2);
-#if CONFIG_EXT_REFS
   RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
-#endif  // CONFIG_EXT_REFS
   RANGE_CHECK(extra_cfg, cpu_used, 0, 8);
+  RANGE_CHECK(extra_cfg, dev_sf, 0, UINT8_MAX);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
   RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
               AOM_SUPERBLOCK_SIZE_DYNAMIC);
-#if CONFIG_EXT_TILE
   RANGE_CHECK_HI(cfg, large_scale_tile, 1);
   RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1);
 
-  if (cfg->large_scale_tile) {
-// TODO(any): Waring. If CONFIG_EXT_TILE is true, tile_columns really
-// means tile_width, and tile_rows really means tile_hight. The interface
-// should be sanitized.
-#if CONFIG_EXT_PARTITION
-    if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64) {
-      if (extra_cfg->tile_columns != 0)
-        RANGE_CHECK(extra_cfg, tile_columns, 1, 32);
-      if (extra_cfg->tile_rows != 0) RANGE_CHECK(extra_cfg, tile_rows, 1, 32);
-    } else {
-#endif  // CONFIG_EXT_PARTITION
-      if (extra_cfg->tile_columns != 0)
-        RANGE_CHECK(extra_cfg, tile_columns, 1, 64);
-      if (extra_cfg->tile_rows != 0) RANGE_CHECK(extra_cfg, tile_rows, 1, 64);
-#if CONFIG_EXT_PARTITION
-    }
-#endif  // CONFIG_EXT_PARTITION
-  } else {
-#endif  // CONFIG_EXT_TILE
-#if CONFIG_MAX_TILE
-    RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
-    RANGE_CHECK_HI(extra_cfg, tile_rows, 6);
-#else   // CONFIG_MAX_TILE
   RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
-  RANGE_CHECK_HI(extra_cfg, tile_rows, 2);
-#endif  // CONFIG_MAX_TILE
-#if CONFIG_EXT_TILE
-  }
-#endif  // CONFIG_EXT_TILE
+  RANGE_CHECK_HI(extra_cfg, tile_rows, 6);
+
+  RANGE_CHECK_HI(cfg, monochrome, 1);
+
+  if (cfg->large_scale_tile && extra_cfg->aq_mode)
+    ERROR(
+        "Adaptive quantization are not supported in large scale tile "
+        "coding.");
 
-#if CONFIG_DEPENDENT_HORZTILES
-  RANGE_CHECK_HI(extra_cfg, dependent_horz_tiles, 1);
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-  RANGE_CHECK_HI(extra_cfg, loop_filter_across_tiles_enabled, 1);
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
   RANGE_CHECK_HI(extra_cfg, sharpness, 7);
   RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15);
   RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
@@ -334,25 +305,14 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
   if (extra_cfg->tuning == AOM_TUNE_SSIM)
     ERROR("Option --tune=ssim is not currently supported in AV1.");
 
-// TODO(anybody) : remove this flag when PVQ supports pallete coding tool
-#if CONFIG_PVQ
-  if (extra_cfg->content == AOM_CONTENT_SCREEN)
-    ERROR(
-        "Option --tune-content=screen is not currently supported when PVQ is "
-        "enabled.");
-#endif  // CONFIG_PVQ
-
   if (cfg->g_pass == AOM_RC_LAST_PASS) {
-#if !CONFIG_XIPHRC
     const size_t packet_sz = sizeof(FIRSTPASS_STATS);
     const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
     const FIRSTPASS_STATS *stats;
-#endif
 
     if (cfg->rc_twopass_stats_in.buf == NULL)
       ERROR("rc_twopass_stats_in.buf not set.");
 
-#if !CONFIG_XIPHRC
     if (cfg->rc_twopass_stats_in.sz % packet_sz)
       ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
 
@@ -364,37 +324,46 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
 
     if ((int)(stats->count + 0.5) != n_packets - 1)
       ERROR("rc_twopass_stats_in missing EOS stats packet");
-#endif
   }
 
-#if !CONFIG_HIGHBITDEPTH
-  if (cfg->g_profile > (unsigned int)PROFILE_1) {
-    ERROR("Profile > 1 not supported in this build configuration");
-  }
-#endif
   if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
-      cfg->g_bit_depth > AOM_BITS_8) {
-    ERROR("Codec high bit-depth not supported in profile < 2");
-  }
-  if (cfg->g_profile <= (unsigned int)PROFILE_1 && cfg->g_input_bit_depth > 8) {
-    ERROR("Source high bit-depth not supported in profile < 2");
+      cfg->g_bit_depth > AOM_BITS_10) {
+    ERROR("Codec bit-depth 12 not supported in profile < 2");
   }
-  if (cfg->g_profile > (unsigned int)PROFILE_1 &&
-      cfg->g_bit_depth == AOM_BITS_8) {
-    ERROR("Codec bit-depth 8 not supported in profile > 1");
+  if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+      cfg->g_input_bit_depth > 10) {
+    ERROR("Source bit-depth 12 not supported in profile < 2");
   }
-#if CONFIG_COLORSPACE_HEADERS
-  RANGE_CHECK(extra_cfg, color_space, AOM_CS_UNKNOWN, AOM_CS_ICTCP);
-  RANGE_CHECK(extra_cfg, transfer_function, AOM_TF_UNKNOWN, AOM_TF_HLG);
-  RANGE_CHECK(extra_cfg, chroma_sample_position, AOM_CSP_UNKNOWN,
-              AOM_CSP_COLOCATED);
+
+  RANGE_CHECK(extra_cfg, color_primaries, AOM_CICP_CP_BT_709,
+              AOM_CICP_CP_EBU_3213);  // Need to check range more precisely to
+                                      // check for reserved values?
+  RANGE_CHECK(extra_cfg, transfer_characteristics, AOM_CICP_TC_BT_709,
+              AOM_CICP_TC_HLG);
+  RANGE_CHECK(extra_cfg, matrix_coefficients, AOM_CICP_MC_IDENTITY,
+              AOM_CICP_MC_ICTCP);
+  RANGE_CHECK(extra_cfg, color_range, 0, 1);
+
+#if CONFIG_DIST_8X8
+  RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_DAALA_DIST);
 #else
-  RANGE_CHECK(extra_cfg, color_space, AOM_CS_UNKNOWN, AOM_CS_SRGB);
+  RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_SSIM);
 #endif
-  RANGE_CHECK(extra_cfg, color_range, 0, 1);
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-  RANGE_CHECK(extra_cfg, ans_window_size_log2, 8, 23);
+
+  RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED,
+              AOM_TIMING_DEC_MODEL);
+
+  RANGE_CHECK(extra_cfg, film_grain_test_vector, 0, 16);
+
+  if (extra_cfg->lossless) {
+    if (extra_cfg->aq_mode != 0)
+      ERROR("Only --aq_mode=0 can be used with --lossless=1.");
+#if CONFIG_DIST_8X8
+    if (extra_cfg->enable_dist_8x8)
+      ERROR("dist-8x8 cannot be used with lossless compression.");
 #endif
+  }
+
   return AOM_CODEC_OK;
 }
 
@@ -404,23 +373,17 @@ static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx,
     case AOM_IMG_FMT_YV12:
     case AOM_IMG_FMT_I420:
     case AOM_IMG_FMT_I42016: break;
-    case AOM_IMG_FMT_I422:
     case AOM_IMG_FMT_I444:
-    case AOM_IMG_FMT_I440:
-      if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) {
-        ERROR(
-            "Invalid image format. I422, I444, I440 images are "
-            "not supported in profile.");
+    case AOM_IMG_FMT_I44416:
+      if (ctx->cfg.g_profile == (unsigned int)PROFILE_0 &&
+          !ctx->cfg.monochrome) {
+        ERROR("Invalid image format. I444 images not supported in profile.");
       }
       break;
+    case AOM_IMG_FMT_I422:
     case AOM_IMG_FMT_I42216:
-    case AOM_IMG_FMT_I44416:
-    case AOM_IMG_FMT_I44016:
-      if (ctx->cfg.g_profile != (unsigned int)PROFILE_1 &&
-          ctx->cfg.g_profile != (unsigned int)PROFILE_3) {
-        ERROR(
-            "Invalid image format. 16-bit I422, I444, I440 images are "
-            "not supported in profile.");
+      if (ctx->cfg.g_profile != (unsigned int)PROFILE_2) {
+        ERROR("Invalid image format. I422 images not supported in profile.");
       }
       break;
     default:
@@ -442,31 +405,74 @@ static int get_image_bps(const aom_image_t *img) {
     case AOM_IMG_FMT_I420: return 12;
     case AOM_IMG_FMT_I422: return 16;
     case AOM_IMG_FMT_I444: return 24;
-    case AOM_IMG_FMT_I440: return 16;
     case AOM_IMG_FMT_I42016: return 24;
     case AOM_IMG_FMT_I42216: return 32;
     case AOM_IMG_FMT_I44416: return 48;
-    case AOM_IMG_FMT_I44016: return 32;
     default: assert(0 && "Invalid image format"); break;
   }
   return 0;
 }
 
+// Set appropriate options to disable frame super-resolution.
+static void disable_superres(AV1EncoderConfig *const oxcf) {
+  oxcf->superres_mode = SUPERRES_NONE;
+  oxcf->superres_scale_denominator = SCALE_NUMERATOR;
+  oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR;
+  oxcf->superres_qthresh = 255;
+  oxcf->superres_kf_qthresh = 255;
+}
+
 static aom_codec_err_t set_encoder_config(
     AV1EncoderConfig *oxcf, const aom_codec_enc_cfg_t *cfg,
     const struct av1_extracfg *extra_cfg) {
   const int is_vbr = cfg->rc_end_usage == AOM_VBR;
   oxcf->profile = cfg->g_profile;
+  oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled;
   oxcf->max_threads = (int)cfg->g_threads;
   oxcf->width = cfg->g_w;
   oxcf->height = cfg->g_h;
+  oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width;
+  oxcf->forced_max_frame_height = cfg->g_forced_max_frame_height;
   oxcf->bit_depth = cfg->g_bit_depth;
   oxcf->input_bit_depth = cfg->g_input_bit_depth;
   // guess a frame rate if out of whack, use 30
   oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
-  if (oxcf->init_framerate > 180) oxcf->init_framerate = 30;
-
+  if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL ||
+      extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
+    oxcf->timing_info_present = 1;
+    oxcf->timing_info.num_units_in_display_tick = cfg->g_timebase.num;
+    oxcf->timing_info.time_scale = cfg->g_timebase.den;
+    oxcf->timing_info.num_ticks_per_picture = 1;
+  } else {
+    oxcf->timing_info_present = 0;
+  }
+  if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL) {
+    oxcf->timing_info.equal_picture_interval = 1;
+    oxcf->decoder_model_info_present_flag = 0;
+    oxcf->display_model_info_present_flag = 1;
+  } else if (extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
+    //    if( extra_cfg->arnr_strength > 0 )
+    //    {
+    //      printf("Only --arnr-strength=0 can currently be used with
+    //      --timing-info=model."); return AOM_CODEC_INVALID_PARAM;
+    //    }
+    //    if( extra_cfg->enable_superres)
+    //    {
+    //      printf("Only --superres-mode=0 can currently be used with
+    //      --timing-info=model."); return AOM_CODEC_INVALID_PARAM;
+    //    }
+    oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num;
+    oxcf->timing_info.equal_picture_interval = 0;
+    oxcf->decoder_model_info_present_flag = 1;
+    oxcf->buffer_removal_delay_present = 1;
+    oxcf->display_model_info_present_flag = 1;
+  }
+  if (oxcf->init_framerate > 180) {
+    oxcf->init_framerate = 30;
+    oxcf->timing_info_present = 0;
+  }
   oxcf->mode = GOOD;
+  oxcf->cfg = &cfg->cfg;
 
   switch (cfg->g_pass) {
     case AOM_RC_ONE_PASS: oxcf->pass = 0; break;
@@ -491,11 +497,15 @@ static aom_codec_err_t set_encoder_config(
   oxcf->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
   oxcf->fixed_q = -1;
 
-#if CONFIG_AOM_QM
+  oxcf->enable_cdef = extra_cfg->enable_cdef;
+  oxcf->enable_restoration = extra_cfg->enable_restoration;
+  oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant;
   oxcf->using_qm = extra_cfg->enable_qm;
+  oxcf->qm_y = extra_cfg->qm_y;
+  oxcf->qm_u = extra_cfg->qm_u;
+  oxcf->qm_v = extra_cfg->qm_v;
   oxcf->qm_minlevel = extra_cfg->qm_min;
   oxcf->qm_maxlevel = extra_cfg->qm_max;
-#endif
 #if CONFIG_DIST_8X8
   oxcf->using_dist_8x8 = extra_cfg->enable_dist_8x8;
   if (extra_cfg->tuning == AOM_TUNE_CDEF_DIST ||
@@ -503,15 +513,16 @@ static aom_codec_err_t set_encoder_config(
     oxcf->using_dist_8x8 = 1;
 #endif
   oxcf->num_tile_groups = extra_cfg->num_tg;
-#if CONFIG_EXT_TILE
   // In large-scale tile encoding mode, num_tile_groups is always 1.
   if (cfg->large_scale_tile) oxcf->num_tile_groups = 1;
-#endif  // CONFIG_EXT_TILE
   oxcf->mtu = extra_cfg->mtu_size;
 
-#if CONFIG_TEMPMV_SIGNALING
-  oxcf->disable_tempmv = extra_cfg->disable_tempmv;
-#endif
+  // FIXME(debargha): Should this be:
+  // oxcf->allow_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs &
+  //                             extra_cfg->enable_order_hint ?
+  // Disallow using temporal MVs while large_scale_tile = 1.
+  oxcf->allow_ref_frame_mvs =
+      extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile;
   oxcf->under_shoot_pct = cfg->rc_undershoot_pct;
   oxcf->over_shoot_pct = cfg->rc_overshoot_pct;
 
@@ -523,26 +534,26 @@ static aom_codec_err_t set_encoder_config(
       oxcf->resize_kf_scale_denominator == SCALE_NUMERATOR)
     oxcf->resize_mode = RESIZE_NONE;
 
-#if CONFIG_FRAME_SUPERRES
-  oxcf->superres_mode = (SUPERRES_MODE)cfg->rc_superres_mode;
-  oxcf->superres_scale_denominator = (uint8_t)cfg->rc_superres_denominator;
-  oxcf->superres_kf_scale_denominator =
-      (uint8_t)cfg->rc_superres_kf_denominator;
-  oxcf->superres_qthresh =
-      extra_cfg->lossless ? 255
-                          : av1_quantizer_to_qindex(cfg->rc_superres_qthresh);
-  oxcf->superres_kf_qthresh =
-      extra_cfg->lossless
-          ? 255
-          : av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh);
-  if (oxcf->superres_mode == SUPERRES_FIXED &&
-      oxcf->superres_scale_denominator == SCALE_NUMERATOR &&
-      oxcf->superres_kf_scale_denominator == SCALE_NUMERATOR)
-    oxcf->superres_mode = SUPERRES_NONE;
-  if (oxcf->superres_mode == SUPERRES_QTHRESH &&
-      oxcf->superres_qthresh == 255 && oxcf->superres_kf_qthresh == 255)
-    oxcf->superres_mode = SUPERRES_NONE;
-#endif  // CONFIG_FRAME_SUPERRES
+  if (extra_cfg->lossless || cfg->large_scale_tile) {
+    disable_superres(oxcf);
+  } else {
+    oxcf->superres_mode = (SUPERRES_MODE)cfg->rc_superres_mode;
+    oxcf->superres_scale_denominator = (uint8_t)cfg->rc_superres_denominator;
+    oxcf->superres_kf_scale_denominator =
+        (uint8_t)cfg->rc_superres_kf_denominator;
+    oxcf->superres_qthresh = av1_quantizer_to_qindex(cfg->rc_superres_qthresh);
+    oxcf->superres_kf_qthresh =
+        av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh);
+    if (oxcf->superres_mode == SUPERRES_FIXED &&
+        oxcf->superres_scale_denominator == SCALE_NUMERATOR &&
+        oxcf->superres_kf_scale_denominator == SCALE_NUMERATOR) {
+      disable_superres(oxcf);
+    }
+    if (oxcf->superres_mode == SUPERRES_QTHRESH &&
+        oxcf->superres_qthresh == 255 && oxcf->superres_kf_qthresh == 255) {
+      disable_superres(oxcf);
+    }
+  }
 
   oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
   oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
@@ -558,12 +569,13 @@ static aom_codec_err_t set_encoder_config(
       cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist;
 
   oxcf->key_freq = cfg->kf_max_dist;
-
+  oxcf->sframe_dist = cfg->sframe_dist;
+  oxcf->sframe_mode = cfg->sframe_mode;
+  oxcf->sframe_enabled = cfg->sframe_dist != 0;
   oxcf->speed = extra_cfg->cpu_used;
+  oxcf->dev_sf = extra_cfg->dev_sf;
   oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
-#if CONFIG_EXT_REFS
   oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
-#endif  // CONFIG_EXT_REFS
   oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
   oxcf->sharpness = extra_cfg->sharpness;
 
@@ -573,64 +585,68 @@ static aom_codec_err_t set_encoder_config(
   oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
 #endif
 
-  oxcf->color_space = extra_cfg->color_space;
-
-#if CONFIG_COLORSPACE_HEADERS
-  oxcf->transfer_function = extra_cfg->transfer_function;
+  oxcf->color_primaries = extra_cfg->color_primaries;
+  oxcf->transfer_characteristics = extra_cfg->transfer_characteristics;
+  oxcf->matrix_coefficients = extra_cfg->matrix_coefficients;
   oxcf->chroma_sample_position = extra_cfg->chroma_sample_position;
-#else
-  if (extra_cfg->transfer_function != AOM_TF_UNKNOWN)
-    return AOM_CODEC_UNSUP_FEATURE;
-  if (extra_cfg->chroma_sample_position != AOM_CSP_UNKNOWN)
-    return AOM_CODEC_UNSUP_FEATURE;
-#endif
 
   oxcf->color_range = extra_cfg->color_range;
   oxcf->render_width = extra_cfg->render_width;
   oxcf->render_height = extra_cfg->render_height;
   oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
+  // Adjust g_lag_in_frames down if not needed
+  oxcf->lag_in_frames =
+      AOMMIN(MAX_GF_INTERVAL + oxcf->arnr_max_frames / 2, oxcf->lag_in_frames);
   oxcf->arnr_strength = extra_cfg->arnr_strength;
   oxcf->min_gf_interval = extra_cfg->min_gf_interval;
   oxcf->max_gf_interval = extra_cfg->max_gf_interval;
 
   oxcf->tuning = extra_cfg->tuning;
   oxcf->content = extra_cfg->content;
-
-#if CONFIG_EXT_PARTITION
+  oxcf->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode;
   oxcf->superblock_size = extra_cfg->superblock_size;
-#endif  // CONFIG_EXT_PARTITION
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-  oxcf->ans_window_size_log2 = extra_cfg->ans_window_size_log2;
-#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-
-#if CONFIG_EXT_TILE
+  if (cfg->large_scale_tile) {
+    oxcf->film_grain_test_vector = 0;
+    oxcf->film_grain_table_filename = NULL;
+  } else {
+    oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector;
+    oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename;
+  }
   oxcf->large_scale_tile = cfg->large_scale_tile;
   oxcf->single_tile_decoding =
       (oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0;
   if (oxcf->large_scale_tile) {
-#if CONFIG_EXT_PARTITION
-    const unsigned int max =
-        extra_cfg->superblock_size == AOM_SUPERBLOCK_SIZE_64X64 ? 64 : 32;
-#else
-    const unsigned int max = 64;
-#endif  // CONFIG_EXT_PARTITION
-    // If tile size is not set, set it to the default value.
-    const unsigned int tc =
-        (!extra_cfg->tile_columns) ? UINT_MAX : extra_cfg->tile_columns;
-    const unsigned int tr =
-        (!extra_cfg->tile_rows) ? UINT_MAX : extra_cfg->tile_rows;
-
-    oxcf->tile_columns = AOMMIN(tc, max);
-    oxcf->tile_rows = AOMMIN(tr, max);
-  } else {
-#endif  // CONFIG_EXT_TILE
-    oxcf->tile_columns = extra_cfg->tile_columns;
-    oxcf->tile_rows = extra_cfg->tile_rows;
-#if CONFIG_EXT_TILE
+    // The superblock_size can only be AOM_SUPERBLOCK_SIZE_64X64 or
+    // AOM_SUPERBLOCK_SIZE_128X128 while oxcf->large_scale_tile = 1. If
+    // superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to
+    // AOM_SUPERBLOCK_SIZE_64X64(default value in large_scale_tile).
+    if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64 &&
+        extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_128X128)
+      oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
+  }
+
+  oxcf->tile_columns = extra_cfg->tile_columns;
+  oxcf->tile_rows = extra_cfg->tile_rows;
+
+  oxcf->monochrome = cfg->monochrome;
+  oxcf->full_still_picture_hdr = cfg->full_still_picture_hdr;
+  oxcf->enable_dual_filter = extra_cfg->use_dual_filter;
+  oxcf->enable_order_hint = extra_cfg->enable_order_hint;
+  oxcf->enable_jnt_comp =
+      extra_cfg->enable_jnt_comp & extra_cfg->enable_order_hint;
+  oxcf->enable_ref_frame_mvs =
+      extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint;
+
+  oxcf->enable_warped_motion = extra_cfg->enable_warped_motion;
+  oxcf->allow_warped_motion =
+      extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion;
+
+  oxcf->enable_superres =
+      (oxcf->superres_mode != SUPERRES_NONE) && extra_cfg->enable_superres;
+  if (!oxcf->enable_superres) {
+    disable_superres(oxcf);
   }
-#endif  // CONFIG_EXT_TILE
 
-#if CONFIG_MAX_TILE
   oxcf->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS);
   oxcf->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS);
   for (int i = 0; i < oxcf->tile_width_count; i++) {
@@ -639,25 +655,28 @@ static aom_codec_err_t set_encoder_config(
   for (int i = 0; i < oxcf->tile_height_count; i++) {
     oxcf->tile_heights[i] = AOMMAX(cfg->tile_heights[i], 1);
   }
-#endif
-#if CONFIG_DEPENDENT_HORZTILES
-  oxcf->dependent_horz_tiles =
-#if CONFIG_EXT_TILE
-      (cfg->large_scale_tile) ? 0 :
-#endif  // CONFIG_EXT_TILE
-                              extra_cfg->dependent_horz_tiles;
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-  oxcf->loop_filter_across_tiles_enabled =
-      extra_cfg->loop_filter_across_tiles_enabled;
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
-  oxcf->error_resilient_mode = cfg->g_error_resilient;
+  oxcf->error_resilient_mode =
+      cfg->g_error_resilient | extra_cfg->error_resilient_mode;
+  oxcf->s_frame_mode = extra_cfg->s_frame_mode;
   oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
+  if (cfg->g_pass == AOM_RC_LAST_PASS) {
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+    oxcf->limit = n_packets - 1;
+  } else {
+    oxcf->limit = cfg->g_limit;
+  }
+
+  if (oxcf->limit == 1) {
+    // still picture mode, display model and timing is meaningless
+    oxcf->display_model_info_present_flag = 0;
+    oxcf->timing_info_present = 0;
+  }
 
   oxcf->aq_mode = extra_cfg->aq_mode;
-#if CONFIG_EXT_DELTA_Q
   oxcf->deltaq_mode = extra_cfg->deltaq_mode;
-#endif
+
+  oxcf->save_as_annexb = cfg->save_as_annexb;
 
   oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
   oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
@@ -734,6 +753,12 @@ static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_devsf(aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.dev_sf = CAST(AOME_SET_DEVSF, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx,
                                                     va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -741,14 +766,12 @@ static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-#if CONFIG_EXT_REFS
 static aom_codec_err_t ctrl_set_enable_auto_bwd_ref(aom_codec_alg_priv_t *ctx,
                                                     va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
   extra_cfg.enable_auto_bwd_ref = CAST(AOME_SET_ENABLEAUTOBWDREF, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
-#endif  // CONFIG_EXT_REFS
 
 static aom_codec_err_t ctrl_set_noise_sensitivity(aom_codec_alg_priv_t *ctx,
                                                   va_list args) {
@@ -785,24 +808,6 @@ static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-#if CONFIG_DEPENDENT_HORZTILES
-static aom_codec_err_t ctrl_set_tile_dependent_rows(aom_codec_alg_priv_t *ctx,
-                                                    va_list args) {
-  struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.dependent_horz_tiles = CAST(AV1E_SET_TILE_DEPENDENT_ROWS, args);
-  return update_extra_cfg(ctx, &extra_cfg);
-}
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-static aom_codec_err_t ctrl_set_tile_loopfilter(aom_codec_alg_priv_t *ctx,
-                                                va_list args) {
-  struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.loop_filter_across_tiles_enabled =
-      CAST(AV1E_SET_TILE_LOOPFILTER, args);
-  return update_extra_cfg(ctx, &extra_cfg);
-}
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
-
 static aom_codec_err_t ctrl_set_arnr_max_frames(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -861,14 +866,48 @@ static aom_codec_err_t ctrl_set_lossless(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-#if CONFIG_AOM_QM
+static aom_codec_err_t ctrl_set_enable_cdef(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_cdef = CAST(AV1E_SET_ENABLE_CDEF, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_restoration = CAST(AV1E_SET_ENABLE_RESTORATION, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_disable_trellis_quant(aom_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.disable_trellis_quant = CAST(AV1E_SET_DISABLE_TRELLIS_QUANT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_qm(aom_codec_alg_priv_t *ctx,
                                           va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
   extra_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
-
+static aom_codec_err_t ctrl_set_qm_y(aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.qm_y = CAST(AV1E_SET_QM_Y, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+static aom_codec_err_t ctrl_set_qm_u(aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.qm_u = CAST(AV1E_SET_QM_U, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+static aom_codec_err_t ctrl_set_qm_v(aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.qm_v = CAST(AV1E_SET_QM_V, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
 static aom_codec_err_t ctrl_set_qm_min(aom_codec_alg_priv_t *ctx,
                                        va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -882,7 +921,6 @@ static aom_codec_err_t ctrl_set_qm_max(aom_codec_alg_priv_t *ctx,
   extra_cfg.qm_max = CAST(AV1E_SET_QM_MAX, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
-#endif
 #if CONFIG_DIST_8X8
 static aom_codec_err_t ctrl_set_enable_dist_8x8(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
@@ -903,14 +941,83 @@ static aom_codec_err_t ctrl_set_mtu(aom_codec_alg_priv_t *ctx, va_list args) {
   extra_cfg.mtu_size = CAST(AV1E_SET_MTU, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
-#if CONFIG_TEMPMV_SIGNALING
-static aom_codec_err_t ctrl_set_disable_tempmv(aom_codec_alg_priv_t *ctx,
-                                               va_list args) {
+static aom_codec_err_t ctrl_set_timing_info_type(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.disable_tempmv = CAST(AV1E_SET_DISABLE_TEMPMV, args);
+  extra_cfg.timing_info_type = CAST(AV1E_SET_TIMING_INFO_TYPE, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
-#endif
+
+static aom_codec_err_t ctrl_set_enable_df(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.use_dual_filter = CAST(AV1E_SET_ENABLE_DF, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_order_hint(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_order_hint = CAST(AV1E_SET_ENABLE_ORDER_HINT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_jnt_comp(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_jnt_comp = CAST(AV1E_SET_ENABLE_JNT_COMP, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_ref_frame_mvs(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_ref_frame_mvs = CAST(AV1E_SET_ENABLE_REF_FRAME_MVS, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_allow_ref_frame_mvs(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.allow_ref_frame_mvs = CAST(AV1E_SET_ALLOW_REF_FRAME_MVS, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_warped_motion = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_allow_warped_motion(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.allow_warped_motion = CAST(AV1E_SET_ALLOW_WARPED_MOTION, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_superres = CAST(AV1E_SET_ENABLE_SUPERRES, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_error_resilient_mode(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.error_resilient_mode = CAST(AV1E_SET_ERROR_RESILIENT_MODE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_s_frame_mode(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.s_frame_mode = CAST(AV1E_SET_S_FRAME_MODE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode(
     aom_codec_alg_priv_t *ctx, va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -919,14 +1026,12 @@ static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode(
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-#if CONFIG_EXT_TILE
 static aom_codec_err_t ctrl_set_single_tile_decoding(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
   extra_cfg.single_tile_decoding = CAST(AV1E_SET_SINGLE_TILE_DECODING, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
-#endif  // CONFIG_EXT_TILE
 
 static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx,
                                         va_list args) {
@@ -935,14 +1040,28 @@ static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-#if CONFIG_EXT_DELTA_Q
+static aom_codec_err_t ctrl_set_film_grain_test_vector(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.film_grain_test_vector =
+      CAST(AV1E_SET_FILM_GRAIN_TEST_VECTOR, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.film_grain_table_filename = CAST(AV1E_SET_FILM_GRAIN_TABLE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx,
                                             va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
   extra_cfg.deltaq_mode = CAST(AV1E_SET_DELTAQ_MODE, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
-#endif
+
 static aom_codec_err_t ctrl_set_min_gf_interval(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1006,10 +1125,8 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx,
 
     if (res == AOM_CODEC_OK) {
       set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
-#if CONFIG_HIGHBITDEPTH
       priv->oxcf.use_highbitdepth =
           (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
-#endif
       priv->cpi = av1_create_compressor(&priv->oxcf, priv->buffer_pool);
       if (priv->cpi == NULL)
         res = AOM_CODEC_MEM_ERROR;
@@ -1032,108 +1149,6 @@ static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) {
   return AOM_CODEC_OK;
 }
 
-static void pick_quickcompress_mode(aom_codec_alg_priv_t *ctx,
-                                    unsigned long deadline) {
-  MODE new_mode = GOOD;
-
-  switch (ctx->cfg.g_pass) {
-    case AOM_RC_ONE_PASS:
-      switch (deadline) {
-        default: new_mode = GOOD; break;
-      }
-      break;
-    case AOM_RC_FIRST_PASS: break;
-    case AOM_RC_LAST_PASS: new_mode = GOOD;
-  }
-
-  if (ctx->oxcf.mode != new_mode) {
-    ctx->oxcf.mode = new_mode;
-    av1_change_config(ctx->cpi, &ctx->oxcf);
-  }
-}
-
-// Turn on to test if supplemental superframe data breaks decoding
-#define TEST_SUPPLEMENTAL_SUPERFRAME_DATA 0
-
-static int write_superframe_index(aom_codec_alg_priv_t *ctx) {
-  uint8_t marker = 0xc0;
-  size_t max_frame_sz = 0;
-
-  assert(ctx->pending_frame_count);
-  assert(ctx->pending_frame_count <= 8);
-
-  // Add the number of frames to the marker byte
-  marker |= ctx->pending_frame_count - 1;
-  for (int i = 0; i < ctx->pending_frame_count - 1; i++) {
-    const size_t frame_sz = ctx->pending_frame_sizes[i] - 1;
-    max_frame_sz = AOMMAX(frame_sz, max_frame_sz);
-  }
-
-  // Choose the magnitude
-  int mag;
-  unsigned int mask;
-  for (mag = 0, mask = 0xff; mag < MAG_SIZE; mag++) {
-    if (max_frame_sz <= mask) break;
-    mask <<= 8;
-    mask |= 0xff;
-  }
-  marker |= mag << 3;
-
-  // Write the index
-  uint8_t buffer[MAX_INDEX_SIZE];
-  uint8_t *x = buffer;
-
-  if (TEST_SUPPLEMENTAL_SUPERFRAME_DATA) {
-    uint8_t marker_test = 0xc0;
-    int mag_test = 2;     // 1 - 4
-    int frames_test = 4;  // 1 - 8
-    marker_test |= frames_test - 1;
-    marker_test |= (mag_test - 1) << 3;
-    *x++ = marker_test;
-    for (int i = 0; i < mag_test * frames_test; ++i)
-      *x++ = 0;  // fill up with arbitrary data
-    *x++ = marker_test;
-    printf("Added supplemental superframe data\n");
-  }
-
-  *x++ = marker;
-  for (int i = 0; i < ctx->pending_frame_count - 1; i++) {
-    assert(ctx->pending_frame_sizes[i] > 0);
-    unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i] - 1;
-    for (int j = 0; j <= mag; j++) {
-      *x++ = this_sz & 0xff;
-      this_sz >>= 8;
-    }
-  }
-  *x++ = marker;
-
-  const size_t index_sz = x - buffer;
-  assert(index_sz < MAX_INDEX_SIZE);
-  assert(ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz);
-
-  // move the frame to make room for the index
-  memmove(ctx->pending_cx_data + index_sz, ctx->pending_cx_data,
-          ctx->pending_cx_data_sz);
-  memcpy(ctx->pending_cx_data, buffer, index_sz);
-  ctx->pending_cx_data_sz += index_sz;
-
-  return (int)index_sz;
-}
-
-// av1 uses 10,000,000 ticks/second as time stamp
-#define TICKS_PER_SEC 10000000LL
-
-static int64_t timebase_units_to_ticks(const aom_rational_t *timebase,
-                                       int64_t n) {
-  return n * TICKS_PER_SEC * timebase->num / timebase->den;
-}
-
-static int64_t ticks_to_timebase_units(const aom_rational_t *timebase,
-                                       int64_t n) {
-  const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
-  return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
-}
-
 static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi,
                                                    unsigned int lib_flags) {
   aom_codec_frame_flags_t flags = lib_flags << 16;
@@ -1149,8 +1164,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
                                       const aom_image_t *img,
                                       aom_codec_pts_t pts,
                                       unsigned long duration,
-                                      aom_enc_frame_flags_t enc_flags,
-                                      unsigned long deadline) {
+                                      aom_enc_frame_flags_t enc_flags) {
   const size_t kMinCompressedSize = 8192;
   volatile aom_codec_err_t res = AOM_CODEC_OK;
   AV1_COMP *const cpi = ctx->cpi;
@@ -1163,17 +1177,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
     // TODO(jzern) the checks related to cpi's validity should be treated as a
     // failure condition, encoder setup is done fully in init() currently.
     if (res == AOM_CODEC_OK) {
-#if CONFIG_EXT_REFS
       size_t data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
                        ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) * get_image_bps(img);
-#else
-      // There's no codec control for multiple alt-refs so check the encoder
-      // instance for its status to determine the compressed data size.
-      size_t data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
-                       ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) *
-                       get_image_bps(img) / 8 *
-                       (cpi->multi_arf_allowed ? 8 : 2);
-#endif  // CONFIG_EXT_REFS
       if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize;
       if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
         ctx->cx_data_sz = data_sz;
@@ -1186,18 +1191,15 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
     }
   }
 
-  pick_quickcompress_mode(ctx, deadline);
+  if (ctx->oxcf.mode != GOOD) {
+    ctx->oxcf.mode = GOOD;
+    av1_change_config(ctx->cpi, &ctx->oxcf);
+  }
+
   aom_codec_pkt_list_init(&ctx->pkt_list);
 
   volatile aom_enc_frame_flags_t flags = enc_flags;
 
-  // Handle Flags
-  if (((flags & AOM_EFLAG_NO_UPD_GF) && (flags & AOM_EFLAG_FORCE_GF)) ||
-      ((flags & AOM_EFLAG_NO_UPD_ARF) && (flags & AOM_EFLAG_FORCE_ARF))) {
-    ctx->base.err_detail = "Conflicting flags.";
-    return AOM_CODEC_INVALID_PARAM;
-  }
-
   if (setjmp(cpi->common.error.jmp)) {
     cpi->common.error.setjmp = 0;
     res = update_error_state(ctx, &cpi->common.error);
@@ -1206,6 +1208,9 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
   }
   cpi->common.error.setjmp = 1;
 
+  // Note(yunqing): While applying encoding flags, always start from enabling
+  // all, and then modifying according to the flags. Previous frame's flags are
+  // overwritten.
   av1_apply_encoding_flags(cpi, flags);
 
   // Handle fixed keyframe intervals
@@ -1267,18 +1272,66 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
            !is_frame_visible &&
            -1 != av1_get_compressed_data(cpi, &lib_flags, &frame_size, cx_data,
                                          &dst_time_stamp, &dst_end_time_stamp,
-                                         !img)) {
-#if CONFIG_REFERENCE_BUFFER
+                                         !img, timebase)) {
       if (cpi->common.seq_params.frame_id_numbers_present_flag) {
-        if (cpi->common.invalid_delta_frame_id_minus1) {
-          ctx->base.err_detail = "Invalid delta_frame_id_minus1";
+        if (cpi->common.invalid_delta_frame_id_minus_1) {
+          ctx->base.err_detail = "Invalid delta_frame_id_minus_1";
           return AOM_CODEC_ERROR;
         }
       }
-#endif  // CONFIG_REFERENCE_BUFFER
+      cpi->seq_params_locked = 1;
       if (frame_size) {
         if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data;
 
+        const int write_temporal_delimiter =
+            !cpi->common.spatial_layer_id && !ctx->pending_frame_count;
+
+        if (write_temporal_delimiter) {
+          uint32_t obu_header_size = 1;
+          const uint32_t obu_payload_size = 0;
+          const size_t length_field_size =
+              aom_uleb_size_in_bytes(obu_payload_size);
+
+          if (ctx->pending_cx_data) {
+            const size_t move_offset = length_field_size + 1;
+            memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
+                    frame_size);
+          }
+          const uint32_t obu_header_offset = 0;
+          obu_header_size = write_obu_header(
+              OBU_TEMPORAL_DELIMITER, 0,
+              (uint8_t *)(ctx->pending_cx_data + obu_header_offset));
+
+          // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
+          if (write_uleb_obu_size(obu_header_size, obu_payload_size,
+                                  ctx->pending_cx_data) != AOM_CODEC_OK) {
+            return AOM_CODEC_ERROR;
+          }
+
+          frame_size += obu_header_size + obu_payload_size + length_field_size;
+        }
+
+        if (ctx->oxcf.save_as_annexb) {
+          size_t curr_frame_size = frame_size;
+          if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) !=
+              AOM_CODEC_OK) {
+            return AOM_CODEC_ERROR;
+          }
+          frame_size = curr_frame_size;
+
+          // B_PRIME (add frame size)
+          const size_t length_field_size = aom_uleb_size_in_bytes(frame_size);
+          if (ctx->pending_cx_data) {
+            const size_t move_offset = length_field_size;
+            memmove(cx_data + move_offset, cx_data, frame_size);
+          }
+          if (write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
+              AOM_CODEC_OK) {
+            return AOM_CODEC_ERROR;
+          }
+          frame_size += length_field_size;
+        }
+
         ctx->pending_frame_sizes[ctx->pending_frame_count++] = frame_size;
         ctx->pending_cx_data_sz += frame_size;
 
@@ -1291,23 +1344,31 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
       }
     }
     if (is_frame_visible) {
-      // insert superframe index if needed
-      if (ctx->pending_frame_count > 1) {
-#if CONFIG_DEBUG
-        assert(index_size >= write_superframe_index(ctx));
-#else
-        write_superframe_index(ctx);
-#endif
-      }
-
       // Add the frame packet to the list of returned packets.
       aom_codec_cx_pkt_t pkt;
 
+      if (ctx->oxcf.save_as_annexb) {
+        //  B_PRIME (add TU size)
+        size_t tu_size = ctx->pending_cx_data_sz;
+        const size_t length_field_size = aom_uleb_size_in_bytes(tu_size);
+        if (ctx->pending_cx_data) {
+          const size_t move_offset = length_field_size;
+          memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
+                  tu_size);
+        }
+        if (write_uleb_obu_size(0, (uint32_t)tu_size, ctx->pending_cx_data) !=
+            AOM_CODEC_OK) {
+          return AOM_CODEC_ERROR;
+        }
+        ctx->pending_cx_data_sz += length_field_size;
+      }
+
       pkt.kind = AOM_CODEC_CX_FRAME_PKT;
 
       pkt.data.frame.buf = ctx->pending_cx_data;
       pkt.data.frame.sz = ctx->pending_cx_data_sz;
       pkt.data.frame.partition_id = -1;
+      pkt.data.frame.vis_frame_size = frame_size;
 
       pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp);
       pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
@@ -1394,6 +1455,25 @@ static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
   }
 }
 
+static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  aom_image_t *const new_img = va_arg(args, aom_image_t *);
+
+  if (new_img != NULL) {
+    YV12_BUFFER_CONFIG new_frame;
+
+    if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+      YV12_BUFFER_CONFIG sd;
+      image2yuvconfig(new_img, &sd);
+      return av1_copy_new_frame_enc(&ctx->cpi->common, &new_frame, &sd);
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
 static aom_codec_err_t ctrl_set_previewpp(aom_codec_alg_priv_t *ctx,
                                           va_list args) {
   (void)ctx;
@@ -1473,6 +1553,24 @@ static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx,
   }
 }
 
+static aom_codec_err_t ctrl_set_spatial_layer_id(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  const int spatial_layer_id = va_arg(args, int);
+  if (spatial_layer_id > MAX_NUM_ENHANCEMENT_LAYERS)
+    return AOM_CODEC_INVALID_PARAM;
+  ctx->cpi->common.spatial_layer_id = spatial_layer_id;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  const int number_spatial_layers = va_arg(args, int);
+  if (number_spatial_layers > MAX_NUM_ENHANCEMENT_LAYERS)
+    return AOM_CODEC_INVALID_PARAM;
+  ctx->cpi->common.number_spatial_layers = number_spatial_layers;
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx,
                                              va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1480,38 +1578,41 @@ static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-static aom_codec_err_t ctrl_set_color_space(aom_codec_alg_priv_t *ctx,
-                                            va_list args) {
+static aom_codec_err_t ctrl_set_cdf_update_mode(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.color_space = CAST(AV1E_SET_COLOR_SPACE, args);
+  extra_cfg.cdf_update_mode = CAST(AV1E_SET_CDF_UPDATE_MODE, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-static aom_codec_err_t ctrl_set_transfer_function(aom_codec_alg_priv_t *ctx,
-                                                  va_list args) {
-#if CONFIG_COLORSPACE_HEADERS
+static aom_codec_err_t ctrl_set_color_primaries(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.transfer_function = CAST(AV1E_SET_TRANSFER_FUNCTION, args);
+  extra_cfg.color_primaries = CAST(AV1E_SET_COLOR_PRIMARIES, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_transfer_characteristics(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.transfer_characteristics =
+      CAST(AV1E_SET_TRANSFER_CHARACTERISTICS, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_matrix_coefficients(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.matrix_coefficients = CAST(AV1E_SET_MATRIX_COEFFICIENTS, args);
   return update_extra_cfg(ctx, &extra_cfg);
-#else
-  (void)ctx;
-  (void)args;
-  return AOM_CODEC_UNSUP_FEATURE;
-#endif
 }
 
 static aom_codec_err_t ctrl_set_chroma_sample_position(
     aom_codec_alg_priv_t *ctx, va_list args) {
-#if CONFIG_COLORSPACE_HEADERS
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
   extra_cfg.chroma_sample_position =
       CAST(AV1E_SET_CHROMA_SAMPLE_POSITION, args);
   return update_extra_cfg(ctx, &extra_cfg);
-#else
-  (void)ctx;
-  (void)args;
-  return AOM_CODEC_UNSUP_FEATURE;
-#endif
 }
 
 static aom_codec_err_t ctrl_set_color_range(aom_codec_alg_priv_t *ctx,
@@ -1537,15 +1638,6 @@ static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-static aom_codec_err_t ctrl_set_ans_window_size_log2(aom_codec_alg_priv_t *ctx,
-                                                     va_list args) {
-  struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.ans_window_size_log2 = CAST(AV1E_SET_ANS_WINDOW_SIZE_LOG2, args);
-  return update_extra_cfg(ctx, &extra_cfg);
-}
-#endif
-
 static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1_COPY_REFERENCE, ctrl_copy_reference },
   { AOME_USE_REFERENCE, ctrl_use_reference },
@@ -1556,51 +1648,58 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AOME_SET_ROI_MAP, ctrl_set_roi_map },
   { AOME_SET_ACTIVEMAP, ctrl_set_active_map },
   { AOME_SET_SCALEMODE, ctrl_set_scale_mode },
+  { AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id },
   { AOME_SET_CPUUSED, ctrl_set_cpuused },
+  { AOME_SET_DEVSF, ctrl_set_devsf },
   { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref },
-#if CONFIG_EXT_REFS
   { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref },
-#endif  // CONFIG_EXT_REFS
   { AOME_SET_SHARPNESS, ctrl_set_sharpness },
   { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
   { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
   { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
-#if CONFIG_DEPENDENT_HORZTILES
-  { AV1E_SET_TILE_DEPENDENT_ROWS, ctrl_set_tile_dependent_rows },
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-  { AV1E_SET_TILE_LOOPFILTER, ctrl_set_tile_loopfilter },
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
   { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
   { AOME_SET_ARNR_STRENGTH, ctrl_set_arnr_strength },
   { AOME_SET_TUNING, ctrl_set_tuning },
   { AOME_SET_CQ_LEVEL, ctrl_set_cq_level },
   { AOME_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct },
+  { AOME_SET_NUMBER_SPATIAL_LAYERS, ctrl_set_number_spatial_layers },
   { AV1E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct },
   { AV1E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
   { AV1E_SET_LOSSLESS, ctrl_set_lossless },
-#if CONFIG_AOM_QM
+  { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef },
+  { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration },
+  { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant },
   { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm },
+  { AV1E_SET_QM_Y, ctrl_set_qm_y },
+  { AV1E_SET_QM_U, ctrl_set_qm_u },
+  { AV1E_SET_QM_V, ctrl_set_qm_v },
   { AV1E_SET_QM_MIN, ctrl_set_qm_min },
   { AV1E_SET_QM_MAX, ctrl_set_qm_max },
-#endif
 #if CONFIG_DIST_8X8
   { AV1E_SET_ENABLE_DIST_8X8, ctrl_set_enable_dist_8x8 },
 #endif
   { AV1E_SET_NUM_TG, ctrl_set_num_tg },
   { AV1E_SET_MTU, ctrl_set_mtu },
-#if CONFIG_TEMPMV_SIGNALING
-  { AV1E_SET_DISABLE_TEMPMV, ctrl_set_disable_tempmv },
-#endif
+  { AV1E_SET_TIMING_INFO_TYPE, ctrl_set_timing_info_type },
   { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode },
+  { AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode },
+  { AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode },
+  { AV1E_SET_ENABLE_DF, ctrl_set_enable_df },
+  { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
+  { AV1E_SET_ENABLE_JNT_COMP, ctrl_set_enable_jnt_comp },
+  { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs },
+  { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs },
+  { AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion },
+  { AV1E_SET_ALLOW_WARPED_MOTION, ctrl_set_allow_warped_motion },
+  { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres },
   { AV1E_SET_AQ_MODE, ctrl_set_aq_mode },
-#if CONFIG_EXT_DELTA_Q
   { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
-#endif
   { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
   { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
-  { AV1E_SET_COLOR_SPACE, ctrl_set_color_space },
-  { AV1E_SET_TRANSFER_FUNCTION, ctrl_set_transfer_function },
+  { AV1E_SET_CDF_UPDATE_MODE, ctrl_set_cdf_update_mode },
+  { AV1E_SET_COLOR_PRIMARIES, ctrl_set_color_primaries },
+  { AV1E_SET_TRANSFER_CHARACTERISTICS, ctrl_set_transfer_characteristics },
+  { AV1E_SET_MATRIX_COEFFICIENTS, ctrl_set_matrix_coefficients },
   { AV1E_SET_CHROMA_SAMPLE_POSITION, ctrl_set_chroma_sample_position },
   { AV1E_SET_COLOR_RANGE, ctrl_set_color_range },
   { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity },
@@ -1608,12 +1707,9 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval },
   { AV1E_SET_RENDER_SIZE, ctrl_set_render_size },
   { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-  { AV1E_SET_ANS_WINDOW_SIZE_LOG2, ctrl_set_ans_window_size_log2 },
-#endif
-#if CONFIG_EXT_TILE
   { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
-#endif  // CONFIG_EXT_TILE
+  { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
+  { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
   { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
 
   // Getters
@@ -1622,6 +1718,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { AV1_GET_REFERENCE, ctrl_get_reference },
   { AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
   { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+  { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
 
   { -1, NULL },
 };
@@ -1636,6 +1733,9 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
 
         320,         // g_width
         240,         // g_height
+        0,           // g_limit
+        0,           // g_forced_max_frame_width
+        0,           // g_forced_max_frame_height
         AOM_BITS_8,  // g_bit_depth
         8,           // g_input_bit_depth
 
@@ -1645,7 +1745,7 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
 
         AOM_RC_ONE_PASS,  // g_pass
 
-        17,  // g_lag_in_frames
+        19,  // g_lag_in_frames
 
         0,                // rc_dropframe_thresh
         RESIZE_NONE,      // rc_resize_mode
@@ -1676,14 +1776,21 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
         2000,  // rc_two_pass_vbrmax_section
 
         // keyframing settings (kf)
+        0,            // fwd_kf_enabled
         AOM_KF_AUTO,  // g_kfmode
         0,            // kf_min_dist
         9999,         // kf_max_dist
+        0,            // sframe_dist
+        1,            // sframe_mode
         0,            // large_scale_tile
+        0,            // monochrome
+        0,            // full_still_picture_hdr
+        0,            // save_as_annexb
         0,            // tile_width_count
         0,            // tile_height_count
         { 0 },        // tile_widths
         { 0 },        // tile_heights
+        { 1 },        // config file
     } },
 };
 
@@ -1693,13 +1800,11 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
 CODEC_INTERFACE(aom_codec_av1_cx) = {
   "AOMedia Project AV1 Encoder" VERSION_STRING,
   AOM_CODEC_INTERNAL_ABI_VERSION,
-#if CONFIG_HIGHBITDEPTH
-  AOM_CODEC_CAP_HIGHBITDEPTH |
-#endif
-      AOM_CODEC_CAP_ENCODER | AOM_CODEC_CAP_PSNR,  // aom_codec_caps_t
-  encoder_init,                                    // aom_codec_init_fn_t
-  encoder_destroy,                                 // aom_codec_destroy_fn_t
-  encoder_ctrl_maps,                               // aom_codec_ctrl_fn_map_t
+  AOM_CODEC_CAP_HIGHBITDEPTH | AOM_CODEC_CAP_ENCODER |
+      AOM_CODEC_CAP_PSNR,  // aom_codec_caps_t
+  encoder_init,            // aom_codec_init_fn_t
+  encoder_destroy,         // aom_codec_destroy_fn_t
+  encoder_ctrl_maps,       // aom_codec_ctrl_fn_map_t
   {
       // NOLINT
       NULL,  // aom_codec_peek_si_fn_t
diff --git a/third_party/aom/av1/av1_dx.mk b/third_party/aom/av1/av1_dx.mk
deleted file mode 100644
index 6f113c3c6..000000000
--- a/third_party/aom/av1/av1_dx.mk
+++ /dev/null
@@ -1,67 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-AV1_DX_EXPORTS += exports_dec
-
-AV1_DX_SRCS-yes += $(AV1_COMMON_SRCS-yes)
-AV1_DX_SRCS-no  += $(AV1_COMMON_SRCS-no)
-AV1_DX_SRCS_REMOVE-yes += $(AV1_COMMON_SRCS_REMOVE-yes)
-AV1_DX_SRCS_REMOVE-no  += $(AV1_COMMON_SRCS_REMOVE-no)
-
-AV1_DX_SRCS-yes += av1_dx_iface.c
-
-AV1_DX_SRCS-yes += decoder/decodemv.c
-AV1_DX_SRCS-yes += decoder/decodeframe.c
-AV1_DX_SRCS-yes += decoder/decodeframe.h
-AV1_DX_SRCS-yes += decoder/detokenize.c
-AV1_DX_SRCS-yes += decoder/decodemv.h
-AV1_DX_SRCS-$(CONFIG_LV_MAP) += decoder/decodetxb.c
-AV1_DX_SRCS-$(CONFIG_LV_MAP) += decoder/decodetxb.h
-AV1_DX_SRCS-yes += decoder/detokenize.h
-AV1_DX_SRCS-yes += decoder/dthread.c
-AV1_DX_SRCS-yes += decoder/dthread.h
-AV1_DX_SRCS-yes += decoder/decoder.c
-AV1_DX_SRCS-yes += decoder/decoder.h
-AV1_DX_SRCS-yes += decoder/dsubexp.c
-AV1_DX_SRCS-yes += decoder/dsubexp.h
-AV1_DX_SRCS-yes += decoder/symbolrate.h
-
-ifeq ($(CONFIG_ACCOUNTING),yes)
-AV1_DX_SRCS-yes += decoder/accounting.h
-AV1_DX_SRCS-yes += decoder/accounting.c
-endif
-
-ifeq ($(CONFIG_INSPECTION),yes)
-AV1_DX_SRCS-yes += decoder/inspection.c
-AV1_DX_SRCS-yes += decoder/inspection.h
-endif
-
-ifeq ($(CONFIG_PVQ),yes)
-# PVQ from daala
-AV1_DX_SRCS-yes += decoder/pvq_decoder.c
-AV1_DX_SRCS-yes += decoder/pvq_decoder.h
-AV1_DX_SRCS-yes += decoder/decint.h
-AV1_DX_SRCS-yes += decoder/generic_decoder.c
-AV1_DX_SRCS-yes += decoder/laplace_decoder.c
-AV1_DX_SRCS-yes += encoder/hybrid_fwd_txfm.c
-AV1_DX_SRCS-yes += encoder/hybrid_fwd_txfm.h
-
-AV1_DX_SRCS-yes += encoder/dct.c
-AV1_DX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
-AV1_DX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
-
-AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct4x4_msa.c
-AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct8x8_msa.c
-AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct16x16_msa.c
-AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct_msa.h
-endif
-
-AV1_DX_SRCS-yes := $(filter-out $(AV1_DX_SRCS_REMOVE-yes),$(AV1_DX_SRCS-yes))
diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c
index c2f433d38..db338f7e3 100644
--- a/third_party/aom/av1/av1_dx_iface.c
+++ b/third_party/aom/av1/av1_dx_iface.c
@@ -12,14 +12,15 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./aom_config.h"
-#include "./aom_version.h"
+#include "config/aom_config.h"
+#include "config/aom_version.h"
 
 #include "aom/internal/aom_codec_internal.h"
 #include "aom/aomdx.h"
 #include "aom/aom_decoder.h"
 #include "aom_dsp/bitreader_buffer.h"
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem_ops.h"
 #include "aom_util/aom_thread.h"
 
 #include "av1/common/alloccommon.h"
@@ -28,26 +29,16 @@
 
 #include "av1/decoder/decoder.h"
 #include "av1/decoder/decodeframe.h"
+#include "av1/decoder/obu.h"
 
 #include "av1/av1_iface_common.h"
 
-// This limit is due to framebuffer numbers.
-// TODO(hkuang): Remove this limit after implementing ondemand framebuffers.
-#define FRAME_CACHE_SIZE 6  // Cache maximum 6 decoded frames.
-
-typedef struct cache_frame {
-  int fb_idx;
-  aom_image_t img;
-} cache_frame;
-
 struct aom_codec_alg_priv {
   aom_codec_priv_t base;
   aom_codec_dec_cfg_t cfg;
   aom_codec_stream_info_t si;
   int postproc_cfg_set;
   aom_postproc_cfg_t postproc_cfg;
-  aom_decrypt_cb decrypt_cb;
-  void *decrypt_state;
   aom_image_t img;
   int img_avail;
   int flushed;
@@ -57,19 +48,20 @@ struct aom_codec_alg_priv {
   int skip_loop_filter;
   int decode_tile_row;
   int decode_tile_col;
+  unsigned int tile_mode;
+  unsigned int ext_tile_debug;
+  EXTERNAL_REFERENCES ext_refs;
+  unsigned int is_annexb;
+  int operating_point;
+  int output_all_layers;
 
-  // Frame parallel related.
-  int frame_parallel_decode;  // frame-based threading.
   AVxWorker *frame_workers;
   int num_frame_workers;
   int next_submit_worker_id;
   int last_submit_worker_id;
   int next_output_worker_id;
   int available_threads;
-  cache_frame frame_cache[FRAME_CACHE_SIZE];
-  int frame_cache_write;
-  int frame_cache_read;
-  int num_cache_frames;
+  aom_image_t *image_with_grain;
   int need_resync;  // wait for key/intra-only frame
   // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
   BufferPool *buffer_pool;
@@ -100,18 +92,16 @@ static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx,
     ctx->priv = (aom_codec_priv_t *)priv;
     ctx->priv->init_flags = ctx->init_flags;
     priv->flushed = 0;
-    // Only do frame parallel decode when threads > 1.
-    priv->frame_parallel_decode =
-        (ctx->config.dec && (ctx->config.dec->threads > 1) &&
-         (ctx->init_flags & AOM_CODEC_USE_FRAME_THREADING))
-            ? 1
-            : 0;
+
     // TODO(tdaede): this should not be exposed to the API
     priv->cfg.allow_lowbitdepth = CONFIG_LOWBITDEPTH;
     if (ctx->config.dec) {
       priv->cfg = *ctx->config.dec;
       ctx->config.dec = &priv->cfg;
+      // default values
+      priv->cfg.cfg.ext_partition = 1;
     }
+    priv->image_with_grain = NULL;
   }
 
   return AOM_CODEC_OK;
@@ -125,10 +115,10 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       aom_get_worker_interface()->end(worker);
+      aom_free(frame_worker_data->pbi->common.tpl_mvs);
+      frame_worker_data->pbi->common.tpl_mvs = NULL;
       av1_remove_common(&frame_worker_data->pbi->common);
-#if CONFIG_LOOP_RESTORATION
       av1_free_restoration_buffers(&frame_worker_data->pbi->common);
-#endif  // CONFIG_LOOP_RESTORATION
       av1_decoder_remove(frame_worker_data->pbi);
       aom_free(frame_worker_data->scratch_buffer);
 #if CONFIG_MULTITHREAD
@@ -149,176 +139,143 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
 
   aom_free(ctx->frame_workers);
   aom_free(ctx->buffer_pool);
+  if (ctx->image_with_grain) aom_img_free(ctx->image_with_grain);
   aom_free(ctx);
   return AOM_CODEC_OK;
 }
 
-#if !CONFIG_OBU
-static int parse_bitdepth_colorspace_sampling(BITSTREAM_PROFILE profile,
-                                              struct aom_read_bit_buffer *rb) {
-  aom_color_space_t color_space;
-#if CONFIG_COLORSPACE_HEADERS
-  int subsampling_x = 0;
-  int subsampling_y = 0;
-#endif
-
-  if (profile >= PROFILE_2) rb->bit_offset += 1;  // Bit-depth 10 or 12.
-#if CONFIG_COLORSPACE_HEADERS
-  color_space = (aom_color_space_t)aom_rb_read_literal(rb, 5);
-  rb->bit_offset += 5;  // Transfer function
-#else
-  color_space = (aom_color_space_t)aom_rb_read_literal(rb, 3);
-#endif
-  if (color_space != AOM_CS_SRGB) {
-    rb->bit_offset += 1;  // [16,235] (including xvycc) vs [0,255] range.
+// Parses the operating points (including operating_point_idc, seq_level_idx,
+// and seq_tier) and then sets si->number_spatial_layers and
+// si->number_temporal_layers based on operating_point_idc[0].
+static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb,
+                                              int is_reduced_header,
+                                              aom_codec_stream_info_t *si) {
+  int operating_point_idc0 = 0;
 
-    if (profile == PROFILE_1 || profile == PROFILE_3) {
-#if CONFIG_COLORSPACE_HEADERS
-      subsampling_x = aom_rb_read_bit(rb);
-      subsampling_y = aom_rb_read_bit(rb);
-#else
-      rb->bit_offset += 2;  // subsampling x/y.
-#endif
-      rb->bit_offset += 1;  // unused.
-#if CONFIG_COLORSPACE_HEADERS
-    } else {
-      subsampling_x = 1;
-      subsampling_y = 1;
-    }
-    if (subsampling_x == 1 && subsampling_y == 1) {
-      rb->bit_offset += 2;
-    }
-#else
-    }
-#endif
+  if (is_reduced_header) {
+    aom_rb_read_literal(rb, LEVEL_BITS);  // level
   } else {
-    if (profile == PROFILE_1 || profile == PROFILE_3) {
-      rb->bit_offset += 1;  // unused
-    } else {
-      // RGB is only available in version 1.
-      return 0;
+    const uint8_t operating_points_cnt_minus_1 =
+        aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
+    for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) {
+      int operating_point_idc;
+      operating_point_idc = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
+      if (i == 0) operating_point_idc0 = operating_point_idc;
+      int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);  // level
+      if (seq_level_idx > 7) aom_rb_read_bit(rb);               // tier
     }
   }
-  return 1;
-}
-#endif
-
-static aom_codec_err_t decoder_peek_si_internal(
-    const uint8_t *data, unsigned int data_sz, aom_codec_stream_info_t *si,
-    int *is_intra_only, aom_decrypt_cb decrypt_cb, void *decrypt_state) {
-  int intra_only_flag = 0;
-  uint8_t clear_buffer[9];
 
-  if (data + data_sz <= data) return AOM_CODEC_INVALID_PARAM;
-
-  si->is_kf = 0;
-  si->w = si->h = 0;
-
-  if (decrypt_cb) {
-    data_sz = AOMMIN(sizeof(clear_buffer), data_sz);
-    decrypt_cb(decrypt_state, data, clear_buffer, data_sz);
-    data = clear_buffer;
+  if (aom_get_num_layers_from_operating_point_idc(
+          operating_point_idc0, &si->number_spatial_layers,
+          &si->number_temporal_layers) != AOM_CODEC_OK) {
+    return AOM_CODEC_ERROR;
   }
 
-  // skip a potential superframe index
-  {
-    uint32_t frame_sizes[8];
-    int frame_count;
-    int index_size = 0;
-    aom_codec_err_t res = av1_parse_superframe_index(
-        data, data_sz, frame_sizes, &frame_count, &index_size, NULL, NULL);
-    if (res != AOM_CODEC_OK) return res;
+  return AOM_CODEC_OK;
+}
 
-    data += index_size;
-    data_sz -= index_size;
-#if CONFIG_OBU
-    if (data + data_sz <= data) return AOM_CODEC_INVALID_PARAM;
-#endif
+static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
+                                                size_t data_sz,
+                                                aom_codec_stream_info_t *si,
+                                                int *is_intra_only) {
+  int intra_only_flag = 0;
+  int got_sequence_header = 0;
+  int found_keyframe = 0;
+
+  if (data + data_sz <= data || data_sz < 1) return AOM_CODEC_INVALID_PARAM;
+
+  si->w = 0;
+  si->h = 0;
+  si->is_kf = 0;  // is_kf indicates whether the current packet contains a RAP
+
+  ObuHeader obu_header;
+  memset(&obu_header, 0, sizeof(obu_header));
+  size_t payload_size = 0;
+  size_t bytes_read = 0;
+  int reduced_still_picture_hdr = 0;
+  aom_codec_err_t status = aom_read_obu_header_and_size(
+      data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+  if (status != AOM_CODEC_OK) return status;
+
+  // If the first OBU is a temporal delimiter, skip over it and look at the next
+  // OBU in the bitstream
+  if (obu_header.type == OBU_TEMPORAL_DELIMITER) {
+    // Skip any associated payload (there shouldn't be one, but just in case)
+    if (data_sz < bytes_read + payload_size) return AOM_CODEC_CORRUPT_FRAME;
+    data += bytes_read + payload_size;
+    data_sz -= bytes_read + payload_size;
+
+    status = aom_read_obu_header_and_size(
+        data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+    if (status != AOM_CODEC_OK) return status;
   }
-
-  {
-#if CONFIG_OBU
-    // Proper fix needed
-    si->is_kf = 1;
-    intra_only_flag = 1;
-    si->h = 1;
-#else
-    int show_frame;
-    int error_resilient;
-    struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
-    const int frame_marker = aom_rb_read_literal(&rb, 2);
-    const BITSTREAM_PROFILE profile = av1_read_profile(&rb);
-#if CONFIG_EXT_TILE
-    unsigned int large_scale_tile;
-#endif  // CONFIG_EXT_TILE
-
-    if (frame_marker != AOM_FRAME_MARKER) return AOM_CODEC_UNSUP_BITSTREAM;
-
-    if (profile >= MAX_PROFILES) return AOM_CODEC_UNSUP_BITSTREAM;
-
-    if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
-      return AOM_CODEC_UNSUP_BITSTREAM;
-
-#if CONFIG_EXT_TILE
-    large_scale_tile = aom_rb_read_literal(&rb, 1);
-#endif  // CONFIG_EXT_TILE
-
-    if (aom_rb_read_bit(&rb)) {     // show an existing frame
-      aom_rb_read_literal(&rb, 3);  // Frame buffer to show.
-      return AOM_CODEC_OK;
-    }
-
-    if (data_sz <= 8) return AOM_CODEC_UNSUP_BITSTREAM;
-
-    si->is_kf = !aom_rb_read_bit(&rb);
-    show_frame = aom_rb_read_bit(&rb);
-    if (!si->is_kf) {
-      if (!show_frame) intra_only_flag = show_frame ? 0 : aom_rb_read_bit(&rb);
-    }
-    error_resilient = aom_rb_read_bit(&rb);
-#if CONFIG_REFERENCE_BUFFER
-    SequenceHeader seq_params = { 0, 0, 0 };
-    if (si->is_kf) {
-      /* TODO: Move outside frame loop or inside key-frame branch */
-      read_sequence_header(&seq_params, &rb);
-#if CONFIG_EXT_TILE
-      if (large_scale_tile) seq_params.frame_id_numbers_present_flag = 0;
-#endif  // CONFIG_EXT_TILE
-    }
-#endif  // CONFIG_REFERENCE_BUFFER
-#if CONFIG_REFERENCE_BUFFER
-    if (seq_params.frame_id_numbers_present_flag) {
-      int frame_id_len;
-      frame_id_len = seq_params.frame_id_length_minus7 + 7;
-      aom_rb_read_literal(&rb, frame_id_len);
-    }
-#endif  // CONFIG_REFERENCE_BUFFER
-    if (si->is_kf) {
-      if (!parse_bitdepth_colorspace_sampling(profile, &rb))
+  while (1) {
+    data += bytes_read;
+    data_sz -= bytes_read;
+    const uint8_t *payload_start = data;
+    // Check that the selected OBU is a sequence header
+    if (obu_header.type == OBU_SEQUENCE_HEADER) {
+      // Sanity check on sequence header size
+      if (data_sz < 2) return AOM_CODEC_CORRUPT_FRAME;
+      // Read a few values from the sequence header payload
+      struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+
+      av1_read_profile(&rb);  // profile
+      const int still_picture = aom_rb_read_bit(&rb);
+      reduced_still_picture_hdr = aom_rb_read_bit(&rb);
+
+      if (!still_picture && reduced_still_picture_hdr) {
         return AOM_CODEC_UNSUP_BITSTREAM;
-      av1_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
-    } else {
-      rb.bit_offset += error_resilient ? 0 : 2;  // reset_frame_context
+      }
 
-      if (intra_only_flag) {
-        if (profile > PROFILE_0) {
-          if (!parse_bitdepth_colorspace_sampling(profile, &rb))
-            return AOM_CODEC_UNSUP_BITSTREAM;
+      if (parse_operating_points(&rb, reduced_still_picture_hdr, si) !=
+          AOM_CODEC_OK) {
+        return AOM_CODEC_ERROR;
+      }
+
+      int num_bits_width = aom_rb_read_literal(&rb, 4) + 1;
+      int num_bits_height = aom_rb_read_literal(&rb, 4) + 1;
+      int max_frame_width = aom_rb_read_literal(&rb, num_bits_width) + 1;
+      int max_frame_height = aom_rb_read_literal(&rb, num_bits_height) + 1;
+      si->w = max_frame_width;
+      si->h = max_frame_height;
+      got_sequence_header = 1;
+    } else if (obu_header.type == OBU_FRAME_HEADER ||
+               obu_header.type == OBU_FRAME) {
+      if (got_sequence_header && reduced_still_picture_hdr) {
+        found_keyframe = 1;
+        break;
+      } else {
+        // make sure we have enough bits to get the frame type out
+        if (data_sz < 1) return AOM_CODEC_CORRUPT_FRAME;
+        struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+        const int show_existing_frame = aom_rb_read_bit(&rb);
+        if (!show_existing_frame) {
+          const FRAME_TYPE frame_type = (FRAME_TYPE)aom_rb_read_literal(&rb, 2);
+          if (frame_type == KEY_FRAME) {
+            found_keyframe = 1;
+            break;  // Stop here as no further OBUs will change the outcome.
+          }
         }
-        rb.bit_offset += REF_FRAMES;  // refresh_frame_flags
-        av1_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
       }
     }
-#endif  // CONFIG_OBU
+    // skip past any unread OBU header data
+    data = payload_start + payload_size;
+    data_sz -= payload_size;
+    if (data_sz <= 0) break;  // exit if we're out of OBUs
+    status = aom_read_obu_header_and_size(
+        data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+    if (status != AOM_CODEC_OK) return status;
   }
+  if (got_sequence_header && found_keyframe) si->is_kf = 1;
   if (is_intra_only != NULL) *is_intra_only = intra_only_flag;
   return AOM_CODEC_OK;
 }
 
-static aom_codec_err_t decoder_peek_si(const uint8_t *data,
-                                       unsigned int data_sz,
+static aom_codec_err_t decoder_peek_si(const uint8_t *data, size_t data_sz,
                                        aom_codec_stream_info_t *si) {
-  return decoder_peek_si_internal(data, data_sz, si, NULL, NULL, NULL);
+  return decoder_peek_si_internal(data, data_sz, si, NULL);
 }
 
 static aom_codec_err_t decoder_get_si(aom_codec_alg_priv_t *ctx,
@@ -386,25 +343,7 @@ static int frame_worker_hook(void *arg1, void *arg2) {
       frame_worker_data->pbi, frame_worker_data->data_size, &data);
   frame_worker_data->data_end = data;
 
-  if (frame_worker_data->pbi->common.frame_parallel_decode) {
-    // In frame parallel decoding, a worker thread must successfully decode all
-    // the compressed data.
-    if (frame_worker_data->result != 0 ||
-        frame_worker_data->data + frame_worker_data->data_size - 1 > data) {
-      AVxWorker *const worker = frame_worker_data->pbi->frame_worker_owner;
-      BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool;
-      // Signal all the other threads that are waiting for this frame.
-      av1_frameworker_lock_stats(worker);
-      frame_worker_data->frame_context_ready = 1;
-      lock_buffer_pool(pool);
-      frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
-      unlock_buffer_pool(pool);
-      frame_worker_data->pbi->need_resync = 1;
-      av1_frameworker_signal_stats(worker);
-      av1_frameworker_unlock_stats(worker);
-      return 0;
-    }
-  } else if (frame_worker_data->result != 0) {
+  if (frame_worker_data->result != 0) {
     // Check decode result in serial decode.
     frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
     frame_worker_data->pbi->need_resync = 1;
@@ -420,12 +359,8 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
   ctx->next_submit_worker_id = 0;
   ctx->last_submit_worker_id = 0;
   ctx->next_output_worker_id = 0;
-  ctx->frame_cache_read = 0;
-  ctx->frame_cache_write = 0;
-  ctx->num_cache_frames = 0;
   ctx->need_resync = 1;
-  ctx->num_frame_workers =
-      (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads : 1;
+  ctx->num_frame_workers = 1;
   if (ctx->num_frame_workers > MAX_DECODE_THREADS)
     ctx->num_frame_workers = MAX_DECODE_THREADS;
   ctx->available_threads = ctx->num_frame_workers;
@@ -463,6 +398,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
       set_error_detail(ctx, "Failed to allocate frame_worker_data");
       return AOM_CODEC_MEM_ERROR;
     }
+    frame_worker_data->pbi->common.options = &ctx->cfg.cfg;
     frame_worker_data->pbi->frame_worker_owner = worker;
     frame_worker_data->worker_id = i;
     frame_worker_data->scratch_buffer = NULL;
@@ -484,12 +420,16 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
 
     // If decoding in serial mode, FrameWorker thread could create tile worker
     // thread or loopfilter thread.
-    frame_worker_data->pbi->max_threads =
-        (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
-
+    frame_worker_data->pbi->max_threads = ctx->cfg.threads;
     frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
-    frame_worker_data->pbi->common.frame_parallel_decode =
-        ctx->frame_parallel_decode;
+    frame_worker_data->pbi->common.large_scale_tile = ctx->tile_mode;
+    frame_worker_data->pbi->common.is_annexb = ctx->is_annexb;
+    frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
+    frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
+    frame_worker_data->pbi->operating_point = ctx->operating_point;
+    frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
+    frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+
     worker->hook = (AVxWorkerHook)frame_worker_hook;
     if (!winterface->reset(worker)) {
       set_error_detail(ctx, "Frame Worker thread creation failed");
@@ -516,137 +456,82 @@ static INLINE void check_resync(aom_codec_alg_priv_t *const ctx,
 }
 
 static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
-                                  const uint8_t **data, unsigned int data_sz,
-                                  void *user_priv, int64_t deadline) {
+                                  const uint8_t **data, size_t data_sz,
+                                  void *user_priv) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  (void)deadline;
 
   // Determine the stream parameters. Note that we rely on peek_si to
   // validate that we have a buffer that does not wrap around the top
   // of the heap.
   if (!ctx->si.h) {
     int is_intra_only = 0;
+    ctx->si.is_annexb = ctx->is_annexb;
     const aom_codec_err_t res =
-        decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only,
-                                 ctx->decrypt_cb, ctx->decrypt_state);
+        decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only);
     if (res != AOM_CODEC_OK) return res;
 
     if (!ctx->si.is_kf && !is_intra_only) return AOM_CODEC_ERROR;
   }
 
-  if (!ctx->frame_parallel_decode) {
-    AVxWorker *const worker = ctx->frame_workers;
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    frame_worker_data->data = *data;
-    frame_worker_data->data_size = data_sz;
-    frame_worker_data->user_priv = user_priv;
-    frame_worker_data->received_frame = 1;
-
-    // Set these even if already initialized.  The caller may have changed the
-    // decrypt config between frames.
-    frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
-    frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
+  AVxWorker *const worker = ctx->frame_workers;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  frame_worker_data->data = *data;
+  frame_worker_data->data_size = data_sz;
+  frame_worker_data->user_priv = user_priv;
+  frame_worker_data->received_frame = 1;
+
 #if CONFIG_INSPECTION
-    frame_worker_data->pbi->inspect_cb = ctx->inspect_cb;
-    frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx;
+  frame_worker_data->pbi->inspect_cb = ctx->inspect_cb;
+  frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx;
 #endif
 
-#if CONFIG_EXT_TILE
-    frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
-    frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
-#endif  // CONFIG_EXT_TILE
+  frame_worker_data->pbi->common.large_scale_tile = ctx->tile_mode;
+  frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
+  frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
+  frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+  frame_worker_data->pbi->ext_refs = ctx->ext_refs;
 
-    worker->had_error = 0;
-    winterface->execute(worker);
+  frame_worker_data->pbi->common.is_annexb = ctx->is_annexb;
 
-    // Update data pointer after decode.
-    *data = frame_worker_data->data_end;
+  worker->had_error = 0;
+  winterface->execute(worker);
 
-    if (worker->had_error)
-      return update_error_state(ctx, &frame_worker_data->pbi->common.error);
+  // Update data pointer after decode.
+  *data = frame_worker_data->data_end;
 
-    check_resync(ctx, frame_worker_data->pbi);
-  } else {
-    AVxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    // Copy context from last worker thread to next worker thread.
-    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
-      av1_frameworker_copy_context(
-          &ctx->frame_workers[ctx->next_submit_worker_id],
-          &ctx->frame_workers[ctx->last_submit_worker_id]);
-
-    frame_worker_data->pbi->ready_for_new_data = 0;
-    // Copy the compressed data into worker's internal buffer.
-    // TODO(hkuang): Will all the workers allocate the same size
-    // as the size of the first intra frame be better? This will
-    // avoid too many deallocate and allocate.
-    if (frame_worker_data->scratch_buffer_size < data_sz) {
-      aom_free(frame_worker_data->scratch_buffer);
-      frame_worker_data->scratch_buffer = (uint8_t *)aom_malloc(data_sz);
-      if (frame_worker_data->scratch_buffer == NULL) {
-        set_error_detail(ctx, "Failed to reallocate scratch buffer");
-        return AOM_CODEC_MEM_ERROR;
-      }
-      frame_worker_data->scratch_buffer_size = data_sz;
-    }
-    frame_worker_data->data_size = data_sz;
-    memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
-
-    frame_worker_data->frame_decoded = 0;
-    frame_worker_data->frame_context_ready = 0;
-    frame_worker_data->received_frame = 1;
-    frame_worker_data->data = frame_worker_data->scratch_buffer;
-    frame_worker_data->user_priv = user_priv;
-
-    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
-      ctx->last_submit_worker_id =
-          (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
-
-    ctx->next_submit_worker_id =
-        (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
-    --ctx->available_threads;
-    worker->had_error = 0;
-    winterface->launch(worker);
-  }
-
-  return AOM_CODEC_OK;
-}
-
-static void wait_worker_and_cache_frame(aom_codec_alg_priv_t *ctx) {
-  YV12_BUFFER_CONFIG sd;
-  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
-  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-  ctx->next_output_worker_id =
-      (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
-  // TODO(hkuang): Add worker error handling here.
-  winterface->sync(worker);
-  frame_worker_data->received_frame = 0;
-  ++ctx->available_threads;
+  if (worker->had_error)
+    return update_error_state(ctx, &frame_worker_data->pbi->common.error);
 
   check_resync(ctx, frame_worker_data->pbi);
 
-  if (av1_get_raw_frame(frame_worker_data->pbi, &sd) == 0) {
-    AV1_COMMON *const cm = &frame_worker_data->pbi->common;
-    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
-    ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
-    yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
-                    frame_worker_data->user_priv);
-    ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
-        frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
-    ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
-    ++ctx->num_cache_frames;
-  }
+  return AOM_CODEC_OK;
 }
 
 static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
-                                      const uint8_t *data, unsigned int data_sz,
-                                      void *user_priv, long deadline) {
+                                      const uint8_t *data, size_t data_sz,
+                                      void *user_priv) {
   const uint8_t *data_start = data;
-  const uint8_t *const data_end = data + data_sz;
-  aom_codec_err_t res;
-  uint32_t frame_sizes[8];
-  int frame_count;
+  const uint8_t *data_end = data + data_sz;
+  aom_codec_err_t res = AOM_CODEC_OK;
+
+  // Release any pending output frames from the previous decoder call.
+  // We need to do this even if the decoder is being flushed
+  if (ctx->frame_workers) {
+    BufferPool *const pool = ctx->buffer_pool;
+    RefCntBuffer *const frame_bufs = pool->frame_bufs;
+    lock_buffer_pool(pool);
+    for (int i = 0; i < ctx->num_frame_workers; ++i) {
+      AVxWorker *const worker = &ctx->frame_workers[i];
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      struct AV1Decoder *pbi = frame_worker_data->pbi;
+      for (size_t j = 0; j < pbi->num_output_frames; j++) {
+        decrease_ref_count((int)pbi->output_frame_index[j], frame_bufs, pool);
+      }
+      pbi->num_output_frames = 0;
+    }
+    unlock_buffer_pool(ctx->buffer_pool);
+  }
 
   if (data == NULL && data_sz == 0) {
     ctx->flushed = 1;
@@ -662,142 +547,91 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
     if (res != AOM_CODEC_OK) return res;
   }
 
-  int index_size = 0;
-  res = av1_parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
-                                   &index_size, ctx->decrypt_cb,
-                                   ctx->decrypt_state);
-  if (res != AOM_CODEC_OK) return res;
-
-  data_start += index_size;
-
-  if (ctx->frame_parallel_decode) {
-    // Decode in frame parallel mode. When decoding in this mode, the frame
-    // passed to the decoder must be either a normal frame or a superframe with
-    // superframe index so the decoder could get each frame's start position
-    // in the superframe.
-    if (frame_count > 0) {
-      int i;
-
-      for (i = 0; i < frame_count; ++i) {
-        const uint8_t *data_start_copy = data_start;
-        const uint32_t frame_size = frame_sizes[i];
-        if (data_start < data ||
-            frame_size > (uint32_t)(data_end - data_start)) {
-          set_error_detail(ctx, "Invalid frame size in index");
-          return AOM_CODEC_CORRUPT_FRAME;
-        }
-
-        if (ctx->available_threads == 0) {
-          // No more threads for decoding. Wait until the next output worker
-          // finishes decoding. Then copy the decoded frame into cache.
-          if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
-            wait_worker_and_cache_frame(ctx);
-          } else {
-            // TODO(hkuang): Add unit test to test this path.
-            set_error_detail(ctx, "Frame output cache is full.");
-            return AOM_CODEC_ERROR;
-          }
-        }
+  if (ctx->is_annexb) {
+    // read the size of this temporal unit
+    size_t length_of_size;
+    uint64_t temporal_unit_size;
+    if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size,
+                        &length_of_size) != 0) {
+      return AOM_CODEC_CORRUPT_FRAME;
+    }
+    data_start += length_of_size;
+    if (temporal_unit_size > (size_t)(data_end - data_start))
+      return AOM_CODEC_CORRUPT_FRAME;
+    data_end = data_start + temporal_unit_size;
+  }
 
-        res =
-            decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline);
-        if (res != AOM_CODEC_OK) return res;
-        data_start += frame_size;
+  // Decode in serial mode.
+  while (data_start < data_end) {
+    uint64_t frame_size;
+    if (ctx->is_annexb) {
+      // read the size of this frame unit
+      size_t length_of_size;
+      if (aom_uleb_decode(data_start, (size_t)(data_end - data_start),
+                          &frame_size, &length_of_size) != 0) {
+        return AOM_CODEC_CORRUPT_FRAME;
       }
+      data_start += length_of_size;
+      if (frame_size > (size_t)(data_end - data_start))
+        return AOM_CODEC_CORRUPT_FRAME;
     } else {
-      if (ctx->available_threads == 0) {
-        // No more threads for decoding. Wait until the next output worker
-        // finishes decoding. Then copy the decoded frame into cache.
-        if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
-          wait_worker_and_cache_frame(ctx);
-        } else {
-          // TODO(hkuang): Add unit test to test this path.
-          set_error_detail(ctx, "Frame output cache is full.");
-          return AOM_CODEC_ERROR;
-        }
-      }
-
-      res = decode_one(ctx, &data, data_sz, user_priv, deadline);
-      if (res != AOM_CODEC_OK) return res;
+      frame_size = (uint64_t)(data_end - data_start);
     }
-  } else {
-    // Decode in serial mode.
-    if (frame_count > 0) {
-      int i;
-
-      for (i = 0; i < frame_count; ++i) {
-        const uint8_t *data_start_copy = data_start;
-        const uint32_t frame_size = frame_sizes[i];
-        if (data_start < data ||
-            frame_size > (uint32_t)(data_end - data_start)) {
-          set_error_detail(ctx, "Invalid frame size in index");
-          return AOM_CODEC_CORRUPT_FRAME;
-        }
 
-        res =
-            decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline);
-        if (res != AOM_CODEC_OK) return res;
+    res = decode_one(ctx, &data_start, (size_t)frame_size, user_priv);
+    if (res != AOM_CODEC_OK) return res;
 
-        data_start += frame_size;
-      }
-    } else {
-      while (data_start < data_end) {
-        const uint32_t frame_size = (uint32_t)(data_end - data_start);
-        res = decode_one(ctx, &data_start, frame_size, user_priv, deadline);
-        if (res != AOM_CODEC_OK) return res;
-
-        // Account for suboptimal termination by the encoder.
-        while (data_start < data_end) {
-          const uint8_t marker =
-              read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start);
-          if (marker) break;
-          ++data_start;
-        }
-      }
+    // Allow extra zero bytes after the frame end
+    while (data_start < data_end) {
+      const uint8_t marker = data_start[0];
+      if (marker) break;
+      ++data_start;
     }
   }
 
   return res;
 }
 
-static void release_last_output_frame(aom_codec_alg_priv_t *ctx) {
-  RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs;
-  // Decrease reference count of last output frame in frame parallel mode.
-  if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
-    BufferPool *const pool = ctx->buffer_pool;
-    lock_buffer_pool(pool);
-    decrease_ref_count(ctx->last_show_frame, frame_bufs, pool);
-    unlock_buffer_pool(pool);
+aom_image_t *add_grain_if_needed(aom_image_t *img, aom_image_t *grain_img_buf,
+                                 aom_film_grain_t *grain_params) {
+  if (!grain_params->apply_grain) return img;
+
+  if (grain_img_buf &&
+      (img->d_w != grain_img_buf->d_w || img->d_h != grain_img_buf->d_h ||
+       img->fmt != grain_img_buf->fmt || !(img->d_h % 2) || !(img->d_w % 2))) {
+    aom_img_free(grain_img_buf);
+    grain_img_buf = NULL;
   }
+  if (!grain_img_buf) {
+    int w_even = img->d_w % 2 ? img->d_w + 1 : img->d_w;
+    int h_even = img->d_h % 2 ? img->d_h + 1 : img->d_h;
+    grain_img_buf = aom_img_alloc(NULL, img->fmt, w_even, h_even, 16);
+    grain_img_buf->bit_depth = img->bit_depth;
+  }
+
+  av1_add_film_grain(grain_params, img, grain_img_buf);
+
+  return grain_img_buf;
 }
 
 static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
                                       aom_codec_iter_t *iter) {
   aom_image_t *img = NULL;
 
-  // Only return frame when all the cpu are busy or
-  // application fluhsed the decoder in frame parallel decode.
-  if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
-      !ctx->flushed) {
+  if (!iter) {
     return NULL;
   }
 
-  // Output the frames in the cache first.
-  if (ctx->num_cache_frames > 0) {
-    release_last_output_frame(ctx);
-    ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
-    if (ctx->need_resync) return NULL;
-    img = &ctx->frame_cache[ctx->frame_cache_read].img;
-    ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
-    --ctx->num_cache_frames;
-    return img;
-  }
+  // To avoid having to allocate any extra storage, treat 'iter' as
+  // simply a pointer to an integer index
+  uintptr_t *index = (uintptr_t *)iter;
 
-  // iter acts as a flip flop, so an image is only returned on the first
-  // call to get_frame.
-  if (*iter == NULL && ctx->frame_workers != NULL) {
+  if (ctx->frame_workers != NULL) {
     do {
-      YV12_BUFFER_CONFIG sd;
+      YV12_BUFFER_CONFIG *sd;
+      // NOTE(david.barker): This code does not support multiple worker threads
+      // yet. We should probably move the iteration over threads into *iter
+      // instead of using ctx->next_output_worker_id.
       const AVxWorkerInterface *const winterface = aom_get_worker_interface();
       AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
       FrameWorkerData *const frame_worker_data =
@@ -812,50 +646,64 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
           frame_worker_data->received_frame = 0;
           check_resync(ctx, frame_worker_data->pbi);
         }
-        if (av1_get_raw_frame(frame_worker_data->pbi, &sd) == 0) {
-          AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+        aom_film_grain_t *grain_params;
+        if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
+                              &grain_params) == 0) {
+          *index += 1;  // Advance the iterator to point to the next image
+
+          AV1Decoder *const pbi = frame_worker_data->pbi;
+          AV1_COMMON *const cm = &pbi->common;
           RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
-          release_last_output_frame(ctx);
-          ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
+          ctx->last_show_frame = cm->new_fb_idx;
           if (ctx->need_resync) return NULL;
-          yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
+          yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
+
+          if (!pbi->ext_tile_debug && cm->large_scale_tile) {
+            img = &ctx->img;
+            img->img_data = pbi->tile_list_output;
+            img->sz = pbi->tile_list_size;
+            return img;
+          }
 
-#if CONFIG_EXT_TILE
-          if (cm->single_tile_decoding &&
-              frame_worker_data->pbi->dec_tile_row >= 0) {
-            const int tile_row =
-                AOMMIN(frame_worker_data->pbi->dec_tile_row, cm->tile_rows - 1);
+          const int num_planes = av1_num_planes(cm);
+          if (pbi->ext_tile_debug && cm->single_tile_decoding &&
+              pbi->dec_tile_row >= 0) {
+            const int tile_row = AOMMIN(pbi->dec_tile_row, cm->tile_rows - 1);
             const int mi_row = tile_row * cm->tile_height;
             const int ssy = ctx->img.y_chroma_shift;
             int plane;
             ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
-            for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-              ctx->img.planes[plane] +=
-                  mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
+            if (num_planes > 1) {
+              for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+                ctx->img.planes[plane] +=
+                    mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
+              }
             }
             ctx->img.d_h =
                 AOMMIN(cm->tile_height, cm->mi_rows - mi_row) * MI_SIZE;
           }
 
-          if (cm->single_tile_decoding &&
-              frame_worker_data->pbi->dec_tile_col >= 0) {
-            const int tile_col =
-                AOMMIN(frame_worker_data->pbi->dec_tile_col, cm->tile_cols - 1);
+          if (pbi->ext_tile_debug && cm->single_tile_decoding &&
+              pbi->dec_tile_col >= 0) {
+            const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1);
             const int mi_col = tile_col * cm->tile_width;
             const int ssx = ctx->img.x_chroma_shift;
             int plane;
             ctx->img.planes[0] += mi_col * MI_SIZE;
-            for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-              ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx);
+            if (num_planes > 1) {
+              for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+                ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx);
+              }
             }
             ctx->img.d_w =
                 AOMMIN(cm->tile_width, cm->mi_cols - mi_col) * MI_SIZE;
           }
-#endif  // CONFIG_EXT_TILE
 
           ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
           img = &ctx->img;
-          return img;
+          img->temporal_id = cm->temporal_layer_id;
+          img->spatial_id = cm->spatial_layer_id;
+          return add_grain_if_needed(img, ctx->image_with_grain, grain_params);
         }
       } else {
         // Decoding failed. Release the worker thread.
@@ -890,12 +738,6 @@ static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
                                           va_list args) {
   av1_ref_frame_t *const data = va_arg(args, av1_ref_frame_t *);
 
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return AOM_CODEC_INCAPABLE;
-  }
-
   if (data) {
     av1_ref_frame_t *const frame = data;
     YV12_BUFFER_CONFIG sd;
@@ -903,7 +745,7 @@ static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
     return av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx,
-                                 &sd);
+                                 frame->use_external_ref, &sd);
   } else {
     return AOM_CODEC_INVALID_PARAM;
   }
@@ -912,13 +754,6 @@ static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
 static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
                                            va_list args) {
   const av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
-
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return AOM_CODEC_INCAPABLE;
-  }
-
   if (frame) {
     YV12_BUFFER_CONFIG sd;
     AVxWorker *const worker = ctx->frame_workers;
@@ -933,13 +768,6 @@ static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
 static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
                                           va_list args) {
   av1_ref_frame_t *data = va_arg(args, av1_ref_frame_t *);
-
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return AOM_CODEC_INCAPABLE;
-  }
-
   if (data) {
     YV12_BUFFER_CONFIG *fb;
     AVxWorker *const worker = ctx->frame_workers;
@@ -956,13 +784,6 @@ static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
 static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   aom_image_t *new_img = va_arg(args, aom_image_t *);
-
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return AOM_CODEC_INCAPABLE;
-  }
-
   if (new_img) {
     YV12_BUFFER_CONFIG new_frame;
     AVxWorker *const worker = ctx->frame_workers;
@@ -979,6 +800,27 @@ static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
   }
 }
 
+static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  aom_image_t *img = va_arg(args, aom_image_t *);
+  if (img) {
+    YV12_BUFFER_CONFIG new_frame;
+    AVxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+
+    if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) {
+      YV12_BUFFER_CONFIG sd;
+      image2yuvconfig(img, &sd);
+      return av1_copy_new_frame_dec(&frame_worker_data->pbi->common, &new_frame,
+                                    &sd);
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
 static aom_codec_err_t ctrl_set_postproc(aom_codec_alg_priv_t *ctx,
                                          va_list args) {
   (void)ctx;
@@ -997,12 +839,6 @@ static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx,
                                                  va_list args) {
   int *const update_info = va_arg(args, int *);
 
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return AOM_CODEC_INCAPABLE;
-  }
-
   if (update_info) {
     if (ctx->frame_workers) {
       AVxWorker *const worker = ctx->frame_workers;
@@ -1036,9 +872,9 @@ static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx,
       AVxWorker *const worker = ctx->frame_workers;
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
-      RefCntBuffer *const frame_bufs =
-          frame_worker_data->pbi->common.buffer_pool->frame_bufs;
-      if (frame_worker_data->pbi->common.frame_to_show == NULL)
+      AV1Decoder *const pbi = frame_worker_data->pbi;
+      RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs;
+      if (pbi->seen_frame_header && pbi->num_output_frames == 0)
         return AOM_CODEC_ERROR;
       if (ctx->last_show_frame >= 0)
         *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
@@ -1055,12 +891,6 @@ static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx,
                                            va_list args) {
   int *const frame_size = va_arg(args, int *);
 
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return AOM_CODEC_INCAPABLE;
-  }
-
   if (frame_size) {
     if (ctx->frame_workers) {
       AVxWorker *const worker = ctx->frame_workers;
@@ -1078,15 +908,69 @@ static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx,
   return AOM_CODEC_INVALID_PARAM;
 }
 
-static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx,
+static aom_codec_err_t ctrl_get_frame_header_info(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  aom_tile_data *const frame_header_info = va_arg(args, aom_tile_data *);
+
+  if (frame_header_info) {
+    if (ctx->frame_workers) {
+      AVxWorker *const worker = ctx->frame_workers;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      frame_header_info->coded_tile_data_size = pbi->obu_size_hdr.size;
+      frame_header_info->coded_tile_data = pbi->obu_size_hdr.data;
+      frame_header_info->extra_size = pbi->frame_header_size;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_tile_data(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  aom_tile_data *const tile_data = va_arg(args, aom_tile_data *);
+
+  if (tile_data) {
+    if (ctx->frame_workers) {
+      AVxWorker *const worker = ctx->frame_workers;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      tile_data->coded_tile_data_size =
+          pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size;
+      tile_data->coded_tile_data =
+          pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_set_ext_ref_ptr(aom_codec_alg_priv_t *ctx,
                                             va_list args) {
-  int *const render_size = va_arg(args, int *);
+  av1_ext_ref_frame_t *const data = va_arg(args, av1_ext_ref_frame_t *);
 
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return AOM_CODEC_INCAPABLE;
+  if (data) {
+    av1_ext_ref_frame_t *const ext_frames = data;
+    ctx->ext_refs.num = ext_frames->num;
+    for (int i = 0; i < ctx->ext_refs.num; i++) {
+      image2yuvconfig(ext_frames->img++, &ctx->ext_refs.refs[i]);
+    }
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
   }
+}
+
+static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const render_size = va_arg(args, int *);
 
   if (render_size) {
     if (ctx->frame_workers) {
@@ -1131,14 +1015,6 @@ static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx,
   return AOM_CODEC_OK;
 }
 
-static aom_codec_err_t ctrl_set_decryptor(aom_codec_alg_priv_t *ctx,
-                                          va_list args) {
-  aom_decrypt_init *init = va_arg(args, aom_decrypt_init *);
-  ctx->decrypt_cb = init ? init->decrypt_cb : NULL;
-  ctx->decrypt_state = init ? init->decrypt_state : NULL;
-  return AOM_CODEC_OK;
-}
-
 static aom_codec_err_t ctrl_set_byte_alignment(aom_codec_alg_priv_t *ctx,
                                                va_list args) {
   const int legacy_byte_alignment = 0;
@@ -1204,6 +1080,30 @@ static aom_codec_err_t ctrl_set_decode_tile_col(aom_codec_alg_priv_t *ctx,
   return AOM_CODEC_OK;
 }
 
+static aom_codec_err_t ctrl_set_tile_mode(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  ctx->tile_mode = va_arg(args, unsigned int);
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_is_annexb(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  ctx->is_annexb = va_arg(args, unsigned int);
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_operating_point(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  ctx->operating_point = va_arg(args, int);
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_output_all_layers(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  ctx->output_all_layers = va_arg(args, int);
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx,
                                                     va_list args) {
 #if !CONFIG_INSPECTION
@@ -1218,6 +1118,12 @@ static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx,
 #endif
 }
 
+static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  ctx->ext_tile_debug = va_arg(args, int);
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { AV1_COPY_REFERENCE, ctrl_copy_reference },
 
@@ -1229,12 +1135,17 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { AOM_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options },
   { AOM_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options },
   { AV1_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order },
-  { AOMD_SET_DECRYPTOR, ctrl_set_decryptor },
   { AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
   { AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
   { AV1_SET_DECODE_TILE_ROW, ctrl_set_decode_tile_row },
   { AV1_SET_DECODE_TILE_COL, ctrl_set_decode_tile_col },
+  { AV1_SET_TILE_MODE, ctrl_set_tile_mode },
+  { AV1D_SET_IS_ANNEXB, ctrl_set_is_annexb },
+  { AV1D_SET_OPERATING_POINT, ctrl_set_operating_point },
+  { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers },
   { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback },
+  { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug },
+  { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr },
 
   // Getters
   { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted },
@@ -1245,7 +1156,10 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size },
   { AV1_GET_ACCOUNTING, ctrl_get_accounting },
   { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+  { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
   { AV1_GET_REFERENCE, ctrl_get_reference },
+  { AV1D_GET_FRAME_HEADER_INFO, ctrl_get_frame_header_info },
+  { AV1D_GET_TILE_DATA, ctrl_get_tile_data },
 
   { -1, NULL },
 };
diff --git a/third_party/aom/av1/av1_iface_common.h b/third_party/aom/av1/av1_iface_common.h
index 6c9a2a6cb..c03892b73 100644
--- a/third_party/aom/av1/av1_iface_common.h
+++ b/third_party/aom/av1/av1_iface_common.h
@@ -15,10 +15,11 @@
 
 static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
                             void *user_priv) {
-  /** aom_img_wrap() doesn't allow specifying independent strides for
-    * the Y, U, and V planes, nor other alignment adjustments that
-    * might be representable by a YV12_BUFFER_CONFIG, so we just
-    * initialize all the fields.*/
+  /* aom_img_wrap() doesn't allow specifying independent strides for
+   * the Y, U, and V planes, nor other alignment adjustments that
+   * might be representable by a YV12_BUFFER_CONFIG, so we just
+   * initialize all the fields.
+   */
   int bps;
   if (!yv12->subsampling_y) {
     if (!yv12->subsampling_x) {
@@ -29,23 +30,18 @@ static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
       bps = 16;
     }
   } else {
-    if (!yv12->subsampling_x) {
-      img->fmt = AOM_IMG_FMT_I440;
-      bps = 16;
-    } else {
-      img->fmt = AOM_IMG_FMT_I420;
-      bps = 12;
-    }
+    img->fmt = AOM_IMG_FMT_I420;
+    bps = 12;
   }
-  img->cs = yv12->color_space;
-#if CONFIG_COLORSPACE_HEADERS
-  img->tf = yv12->transfer_function;
+  img->cp = yv12->color_primaries;
+  img->tc = yv12->transfer_characteristics;
+  img->mc = yv12->matrix_coefficients;
+  img->monochrome = yv12->monochrome;
   img->csp = yv12->chroma_sample_position;
-#endif
   img->range = yv12->color_range;
   img->bit_depth = 8;
-  img->w = yv12->y_stride;
-  img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * AOM_BORDER_IN_PIXELS, 3);
+  img->w = yv12->y_width;
+  img->h = yv12->y_height;
   img->d_w = yv12->y_crop_width;
   img->d_h = yv12->y_crop_height;
   img->r_w = yv12->render_width;
@@ -60,7 +56,6 @@ static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
   img->stride[AOM_PLANE_U] = yv12->uv_stride;
   img->stride[AOM_PLANE_V] = yv12->uv_stride;
   img->stride[AOM_PLANE_ALPHA] = yv12->y_stride;
-#if CONFIG_HIGHBITDEPTH
   if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
     // aom_image_t uses byte strides and a pointer to the first byte
     // of the image.
@@ -75,7 +70,6 @@ static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
     img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride;
     img->stride[AOM_PLANE_ALPHA] = 2 * yv12->y_stride;
   }
-#endif  // CONFIG_HIGHBITDEPTH
   img->bps = bps;
   img->user_priv = user_priv;
   img->img_data = yv12->buffer_alloc;
@@ -93,8 +87,8 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
   yv12->y_crop_height = img->d_h;
   yv12->render_width = img->r_w;
   yv12->render_height = img->r_h;
-  yv12->y_width = img->d_w;
-  yv12->y_height = img->d_h;
+  yv12->y_width = img->w;
+  yv12->y_height = img->h;
 
   yv12->uv_width =
       img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2 : yv12->y_width;
@@ -105,14 +99,13 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
 
   yv12->y_stride = img->stride[AOM_PLANE_Y];
   yv12->uv_stride = img->stride[AOM_PLANE_U];
-  yv12->color_space = img->cs;
-#if CONFIG_COLORSPACE_HEADERS
-  yv12->transfer_function = img->tf;
+  yv12->color_primaries = img->cp;
+  yv12->transfer_characteristics = img->tc;
+  yv12->matrix_coefficients = img->mc;
+  yv12->monochrome = img->monochrome;
   yv12->chroma_sample_position = img->csp;
-#endif
   yv12->color_range = img->range;
 
-#if CONFIG_HIGHBITDEPTH
   if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
     // In aom_image_t
     //     planes point to uint8 address of start of data
@@ -134,9 +127,6 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
     yv12->flags = 0;
   }
   yv12->border = (yv12->y_stride - img->w) / 2;
-#else
-  yv12->border = (img->stride[AOM_PLANE_Y] - img->w) / 2;
-#endif  // CONFIG_HIGHBITDEPTH
   yv12->subsampling_x = img->x_chroma_shift;
   yv12->subsampling_y = img->y_chroma_shift;
   return AOM_CODEC_OK;
diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c
index fd635686f..49902cc7d 100644
--- a/third_party/aom/av1/common/alloccommon.c
+++ b/third_party/aom/av1/common/alloccommon.c
@@ -10,7 +10,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_mem/aom_mem.h"
 
 #include "av1/common/alloccommon.h"
@@ -25,16 +26,43 @@ int av1_get_MBs(int width, int height) {
   const int mi_cols = aligned_width >> MI_SIZE_LOG2;
   const int mi_rows = aligned_height >> MI_SIZE_LOG2;
 
-#if CONFIG_CB4X4
   const int mb_cols = (mi_cols + 2) >> 2;
   const int mb_rows = (mi_rows + 2) >> 2;
-#else
-  const int mb_cols = (mi_cols + 1) >> 1;
-  const int mb_rows = (mi_rows + 1) >> 1;
-#endif
   return mb_rows * mb_cols;
 }
 
+#if LOOP_FILTER_BITMASK
+static int alloc_loop_filter_mask(AV1_COMMON *cm) {
+  aom_free(cm->lf.lfm);
+  cm->lf.lfm = NULL;
+
+  // Each lfm holds bit masks for all the 4x4 blocks in a max
+  // 64x64 (128x128 for ext_partitions) region.  The stride
+  // and rows are rounded up / truncated to a multiple of 16
+  // (32 for ext_partition).
+  cm->lf.lfm_stride = (cm->mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2;
+  cm->lf.lfm_num = ((cm->mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) *
+                   cm->lf.lfm_stride;
+  cm->lf.lfm =
+      (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm));
+  if (!cm->lf.lfm) return 1;
+
+  unsigned int i;
+  for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]);
+
+  return 0;
+}
+
+static void free_loop_filter_mask(AV1_COMMON *cm) {
+  if (cm->lf.lfm == NULL) return;
+
+  aom_free(cm->lf.lfm);
+  cm->lf.lfm = NULL;
+  cm->lf.lfm_num = 0;
+  cm->lf.lfm_stride = 0;
+}
+#endif
+
 void av1_set_mb_mi(AV1_COMMON *cm, int width, int height) {
   // Ensure that the decoded width and height are both multiples of
   // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
@@ -48,79 +76,13 @@ void av1_set_mb_mi(AV1_COMMON *cm, int width, int height) {
   cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
   cm->mi_stride = calc_mi_size(cm->mi_cols);
 
-#if CONFIG_CB4X4
   cm->mb_cols = (cm->mi_cols + 2) >> 2;
   cm->mb_rows = (cm->mi_rows + 2) >> 2;
-#else
-  cm->mb_cols = (cm->mi_cols + 1) >> 1;
-  cm->mb_rows = (cm->mi_rows + 1) >> 1;
-#endif
   cm->MBs = cm->mb_rows * cm->mb_cols;
-}
-
-static int alloc_seg_map(AV1_COMMON *cm, int seg_map_size) {
-  int i;
-
-  for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
-    cm->seg_map_array[i] = (uint8_t *)aom_calloc(seg_map_size, 1);
-    if (cm->seg_map_array[i] == NULL) return 1;
-  }
-  cm->seg_map_alloc_size = seg_map_size;
-
-  // Init the index.
-  cm->seg_map_idx = 0;
-  cm->prev_seg_map_idx = 1;
-
-  cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
-  if (!cm->frame_parallel_decode)
-    cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
-
-  return 0;
-}
-
-static void free_seg_map(AV1_COMMON *cm) {
-  int i;
 
-  for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
-    aom_free(cm->seg_map_array[i]);
-    cm->seg_map_array[i] = NULL;
-  }
-
-  cm->current_frame_seg_map = NULL;
-
-  if (!cm->frame_parallel_decode) {
-    cm->last_frame_seg_map = NULL;
-  }
-  cm->seg_map_alloc_size = 0;
-}
-
-static void free_scratch_buffers(AV1_COMMON *cm) {
-  (void)cm;
-#if CONFIG_NCOBMC && CONFIG_NCOBMC_ADAPT_WEIGHT
-  for (int i = 0; i < 4; ++i) {
-    if (cm->ncobmcaw_buf[i]) {
-      aom_free(cm->ncobmcaw_buf[i]);
-      cm->ncobmcaw_buf[i] = NULL;
-    }
-  }
-#endif  // CONFIG_NCOBMC && CONFIG_NCOBMC_ADAPT_WEIGHT
-}
-
-static int alloc_scratch_buffers(AV1_COMMON *cm) {
-  (void)cm;
-#if CONFIG_NCOBMC && CONFIG_NCOBMC_ADAPT_WEIGHT
-  // If not allocated already, allocate
-  if (!cm->ncobmcaw_buf[0] && !cm->ncobmcaw_buf[1] && !cm->ncobmcaw_buf[2] &&
-      !cm->ncobmcaw_buf[3]) {
-    for (int i = 0; i < 4; ++i) {
-      CHECK_MEM_ERROR(
-          cm, cm->ncobmcaw_buf[i],
-          (uint8_t *)aom_memalign(
-              16, (1 + CONFIG_HIGHBITDEPTH) * MAX_MB_PLANE * MAX_SB_SQUARE));
-    }
-  }
-#endif  // CONFIG_NCOBMC && CONFIG_NCOBMC_ADAPT_WEIGHT
-  return 0;
+#if LOOP_FILTER_BITMASK
+  alloc_loop_filter_mask(cm);
+#endif
 }
 
 void av1_free_ref_frame_buffers(BufferPool *pool) {
@@ -134,97 +96,179 @@ void av1_free_ref_frame_buffers(BufferPool *pool) {
     }
     aom_free(pool->frame_bufs[i].mvs);
     pool->frame_bufs[i].mvs = NULL;
-#if CONFIG_MFMV
-    aom_free(pool->frame_bufs[i].tpl_mvs);
-    pool->frame_bufs[i].tpl_mvs = NULL;
-#endif
+    aom_free(pool->frame_bufs[i].seg_map);
+    pool->frame_bufs[i].seg_map = NULL;
     aom_free_frame_buffer(&pool->frame_bufs[i].buf);
-#if CONFIG_HASH_ME
-    av1_hash_table_destroy(&pool->frame_bufs[i].hash_table);
-#endif
   }
 }
 
-#if CONFIG_LOOP_RESTORATION
-// Assumes cm->rst_info[p].restoration_tilesize is already initialized
+// Assumes cm->rst_info[p].restoration_unit_size is already initialized
 void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
-  int p;
-#if CONFIG_FRAME_SUPERRES
-  int width = cm->superres_upscaled_width;
-  int height = cm->superres_upscaled_height;
-#else
-  int width = cm->width;
-  int height = cm->height;
-#endif  // CONFIG_FRAME_SUPERRES
-  av1_alloc_restoration_struct(cm, &cm->rst_info[0], width, height);
-  for (p = 1; p < MAX_MB_PLANE; ++p)
-    av1_alloc_restoration_struct(cm, &cm->rst_info[p],
-                                 ROUND_POWER_OF_TWO(width, cm->subsampling_x),
-                                 ROUND_POWER_OF_TWO(height, cm->subsampling_y));
-  aom_free(cm->rst_internal.tmpbuf);
-  CHECK_MEM_ERROR(cm, cm->rst_internal.tmpbuf,
-                  (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
-
-#if CONFIG_STRIPED_LOOP_RESTORATION
-  // Allocate internal storage for the loop restoration stripe boundary lines
-  for (p = 0; p < MAX_MB_PLANE; ++p) {
-    int w = p == 0 ? width : ROUND_POWER_OF_TWO(width, cm->subsampling_x);
-    int align_bits = 5;  // align for efficiency
-    int stride = ALIGN_POWER_OF_TWO(w, align_bits);
-    int num_stripes = (height + 63) / 64;
-    // for each processing stripe: 2 lines above, 2 below
-    int buf_size = num_stripes * 2 * stride;
-    uint8_t *above_buf, *below_buf;
-
-    aom_free(cm->rst_internal.stripe_boundary_above[p]);
-    aom_free(cm->rst_internal.stripe_boundary_below[p]);
-
-#if CONFIG_HIGHBITDEPTH
-    if (cm->use_highbitdepth) buf_size = buf_size * 2;
-#endif
-    CHECK_MEM_ERROR(cm, above_buf,
-                    (uint8_t *)aom_memalign(1 << align_bits, buf_size));
-    CHECK_MEM_ERROR(cm, below_buf,
-                    (uint8_t *)aom_memalign(1 << align_bits, buf_size));
-    cm->rst_internal.stripe_boundary_above[p] = above_buf;
-    cm->rst_internal.stripe_boundary_below[p] = below_buf;
-    cm->rst_internal.stripe_boundary_stride[p] = stride;
+  const int num_planes = av1_num_planes(cm);
+  for (int p = 0; p < num_planes; ++p)
+    av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
+
+  if (cm->rst_tmpbuf == NULL) {
+    CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
+                    (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+  }
+
+  if (cm->rlbs == NULL) {
+    CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers)));
+  }
+
+  // For striped loop restoration, we divide each row of tiles into "stripes",
+  // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
+  // luma pixels to match the output from CDEF. We will need to store 2 *
+  // RESTORATION_CTX_VERT lines of data for each stripe, and also need to be
+  // able to quickly answer the question "Where is the <n>'th stripe for tile
+  // row <m>?" To make that efficient, we generate the rst_last_stripe array.
+  int num_stripes = 0;
+  for (int i = 0; i < cm->tile_rows; ++i) {
+    TileInfo tile_info;
+    av1_tile_set_row(&tile_info, cm, i);
+    const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start;
+    const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
+    const int tile_stripes = (ext_h + 63) / 64;
+    num_stripes += tile_stripes;
+    cm->rst_end_stripe[i] = num_stripes;
+  }
+
+  // Now we need to allocate enough space to store the line buffers for the
+  // stripes
+  const int frame_w = cm->superres_upscaled_width;
+  const int use_highbd = cm->use_highbitdepth ? 1 : 0;
+
+  for (int p = 0; p < num_planes; ++p) {
+    const int is_uv = p > 0;
+    const int ss_x = is_uv && cm->subsampling_x;
+    const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
+    const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
+    const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
+                         << use_highbd;
+    RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
+
+    if (buf_size != boundaries->stripe_boundary_size ||
+        boundaries->stripe_boundary_above == NULL ||
+        boundaries->stripe_boundary_below == NULL) {
+      aom_free(boundaries->stripe_boundary_above);
+      aom_free(boundaries->stripe_boundary_below);
+
+      CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above,
+                      (uint8_t *)aom_memalign(32, buf_size));
+      CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below,
+                      (uint8_t *)aom_memalign(32, buf_size));
+
+      boundaries->stripe_boundary_size = buf_size;
+    }
+    boundaries->stripe_boundary_stride = stride;
   }
-#endif  // CONFIG_STRIPED_LOOP_RESTORATION
 }
 
 void av1_free_restoration_buffers(AV1_COMMON *cm) {
   int p;
   for (p = 0; p < MAX_MB_PLANE; ++p)
     av1_free_restoration_struct(&cm->rst_info[p]);
-  aom_free(cm->rst_internal.tmpbuf);
-  cm->rst_internal.tmpbuf = NULL;
+  aom_free(cm->rst_tmpbuf);
+  cm->rst_tmpbuf = NULL;
+  aom_free(cm->rlbs);
+  cm->rlbs = NULL;
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
+    aom_free(boundaries->stripe_boundary_above);
+    aom_free(boundaries->stripe_boundary_below);
+    boundaries->stripe_boundary_above = NULL;
+    boundaries->stripe_boundary_below = NULL;
+  }
+
+  aom_free_frame_buffer(&cm->rst_frame);
 }
-#endif  // CONFIG_LOOP_RESTORATION
 
-void av1_free_context_buffers(AV1_COMMON *cm) {
+void av1_free_above_context_buffers(AV1_COMMON *cm,
+                                    int num_free_above_contexts) {
   int i;
-  cm->free_mi(cm);
-  free_seg_map(cm);
-  free_scratch_buffers(cm);
-  for (i = 0; i < MAX_MB_PLANE; i++) {
+  const int num_planes = cm->num_allocated_above_context_planes;
+
+  for (int tile_row = 0; tile_row < num_free_above_contexts; tile_row++) {
+    for (i = 0; i < num_planes; i++) {
+      aom_free(cm->above_context[i][tile_row]);
+      cm->above_context[i][tile_row] = NULL;
+    }
+    aom_free(cm->above_seg_context[tile_row]);
+    cm->above_seg_context[tile_row] = NULL;
+
+    aom_free(cm->above_txfm_context[tile_row]);
+    cm->above_txfm_context[tile_row] = NULL;
+  }
+  for (i = 0; i < num_planes; i++) {
     aom_free(cm->above_context[i]);
     cm->above_context[i] = NULL;
   }
   aom_free(cm->above_seg_context);
   cm->above_seg_context = NULL;
-  cm->above_context_alloc_cols = 0;
-#if CONFIG_VAR_TX
+
   aom_free(cm->above_txfm_context);
   cm->above_txfm_context = NULL;
 
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    aom_free(cm->top_txfm_context[i]);
-    cm->top_txfm_context[i] = NULL;
-  }
+  cm->num_allocated_above_contexts = 0;
+  cm->num_allocated_above_context_mi_col = 0;
+  cm->num_allocated_above_context_planes = 0;
+}
+
+void av1_free_context_buffers(AV1_COMMON *cm) {
+  cm->free_mi(cm);
+
+  av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
+
+#if LOOP_FILTER_BITMASK
+  free_loop_filter_mask(cm);
 #endif
 }
 
+int av1_alloc_above_context_buffers(AV1_COMMON *cm,
+                                    int num_alloc_above_contexts) {
+  const int num_planes = av1_num_planes(cm);
+  int plane_idx;
+  const int aligned_mi_cols =
+      ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+
+  // Allocate above context buffers
+  cm->num_allocated_above_contexts = num_alloc_above_contexts;
+  cm->num_allocated_above_context_mi_col = aligned_mi_cols;
+  cm->num_allocated_above_context_planes = num_planes;
+  for (plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+    cm->above_context[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc(
+        num_alloc_above_contexts, sizeof(cm->above_context[0]));
+    if (!cm->above_context[plane_idx]) return 1;
+  }
+
+  cm->above_seg_context = (PARTITION_CONTEXT **)aom_calloc(
+      num_alloc_above_contexts, sizeof(cm->above_seg_context));
+  if (!cm->above_seg_context) return 1;
+
+  cm->above_txfm_context = (TXFM_CONTEXT **)aom_calloc(
+      num_alloc_above_contexts, sizeof(cm->above_txfm_context));
+  if (!cm->above_txfm_context) return 1;
+
+  for (int tile_row = 0; tile_row < num_alloc_above_contexts; tile_row++) {
+    for (plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+      cm->above_context[plane_idx][tile_row] = (ENTROPY_CONTEXT *)aom_calloc(
+          aligned_mi_cols, sizeof(*cm->above_context[0][tile_row]));
+      if (!cm->above_context[plane_idx][tile_row]) return 1;
+    }
+
+    cm->above_seg_context[tile_row] = (PARTITION_CONTEXT *)aom_calloc(
+        aligned_mi_cols, sizeof(*cm->above_seg_context[tile_row]));
+    if (!cm->above_seg_context[tile_row]) return 1;
+
+    cm->above_txfm_context[tile_row] = (TXFM_CONTEXT *)aom_calloc(
+        aligned_mi_cols, sizeof(*cm->above_txfm_context[tile_row]));
+    if (!cm->above_txfm_context[tile_row]) return 1;
+  }
+
+  return 0;
+}
+
 int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) {
   int new_mi_size;
 
@@ -235,52 +279,6 @@ int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) {
     if (cm->alloc_mi(cm, new_mi_size)) goto fail;
   }
 
-  if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
-    // Create the segmentation map structure and set to 0.
-    free_seg_map(cm);
-    if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail;
-  }
-  if (alloc_scratch_buffers(cm)) goto fail;
-
-  if (cm->above_context_alloc_cols < cm->mi_cols) {
-    // TODO(geza.lore): These are bigger than they need to be.
-    // cm->tile_width would be enough but it complicates indexing a
-    // little elsewhere.
-    const int aligned_mi_cols =
-        ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
-    int i;
-
-    for (i = 0; i < MAX_MB_PLANE; i++) {
-      aom_free(cm->above_context[i]);
-      cm->above_context[i] = (ENTROPY_CONTEXT *)aom_calloc(
-          aligned_mi_cols << (MI_SIZE_LOG2 - tx_size_wide_log2[0]),
-          sizeof(*cm->above_context[0]));
-      if (!cm->above_context[i]) goto fail;
-    }
-
-    aom_free(cm->above_seg_context);
-    cm->above_seg_context = (PARTITION_CONTEXT *)aom_calloc(
-        aligned_mi_cols, sizeof(*cm->above_seg_context));
-    if (!cm->above_seg_context) goto fail;
-
-#if CONFIG_VAR_TX
-    aom_free(cm->above_txfm_context);
-    cm->above_txfm_context = (TXFM_CONTEXT *)aom_calloc(
-        aligned_mi_cols << TX_UNIT_WIDE_LOG2, sizeof(*cm->above_txfm_context));
-    if (!cm->above_txfm_context) goto fail;
-
-    for (i = 0; i < MAX_MB_PLANE; ++i) {
-      aom_free(cm->top_txfm_context[i]);
-      cm->top_txfm_context[i] =
-          (TXFM_CONTEXT *)aom_calloc(aligned_mi_cols << TX_UNIT_WIDE_LOG2,
-                                     sizeof(*cm->top_txfm_context[0]));
-      if (!cm->top_txfm_context[i]) goto fail;
-    }
-#endif
-
-    cm->above_context_alloc_cols = aligned_mi_cols;
-  }
-
   return 0;
 
 fail:
@@ -299,18 +297,4 @@ void av1_remove_common(AV1_COMMON *cm) {
   cm->frame_contexts = NULL;
 }
 
-void av1_init_context_buffers(AV1_COMMON *cm) {
-  cm->setup_mi(cm);
-  if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
-    memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
-}
-
-void av1_swap_current_and_last_seg_map(AV1_COMMON *cm) {
-  // Swap indices.
-  const int tmp = cm->seg_map_idx;
-  cm->seg_map_idx = cm->prev_seg_map_idx;
-  cm->prev_seg_map_idx = tmp;
-
-  cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
-  cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
-}
+void av1_init_context_buffers(AV1_COMMON *cm) { cm->setup_mi(cm); }
diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h
index 0d420f825..dbcb5b947 100644
--- a/third_party/aom/av1/common/alloccommon.h
+++ b/third_party/aom/av1/common/alloccommon.h
@@ -23,15 +23,17 @@ struct BufferPool;
 
 void av1_remove_common(struct AV1Common *cm);
 
+int av1_alloc_above_context_buffers(struct AV1Common *cm,
+                                    int num_alloc_above_contexts);
+void av1_free_above_context_buffers(struct AV1Common *cm,
+                                    int num_free_above_contexts);
 int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height);
 void av1_init_context_buffers(struct AV1Common *cm);
 void av1_free_context_buffers(struct AV1Common *cm);
 
 void av1_free_ref_frame_buffers(struct BufferPool *pool);
-#if CONFIG_LOOP_RESTORATION
 void av1_alloc_restoration_buffers(struct AV1Common *cm);
 void av1_free_restoration_buffers(struct AV1Common *cm);
-#endif  // CONFIG_LOOP_RESTORATION
 
 int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height);
 void av1_free_state_buffers(struct AV1Common *cm);
@@ -39,8 +41,6 @@ void av1_free_state_buffers(struct AV1Common *cm);
 void av1_set_mb_mi(struct AV1Common *cm, int width, int height);
 int av1_get_MBs(int width, int height);
 
-void av1_swap_current_and_last_seg_map(struct AV1Common *cm);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/arm/av1_txfm_neon.c b/third_party/aom/av1/common/arm/av1_txfm_neon.c
new file mode 100644
index 000000000..de3c54724
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_txfm_neon.c
@@ -0,0 +1,28 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+
+void av1_round_shift_array_neon(int32_t *arr, int size, int bit) {
+  assert(!(size % 4));
+  if (!bit) return;
+  const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit));
+  for (int i = 0; i < size; i += 4) {
+    int32x4_t tmp_q_s32 = vld1q_s32(arr);
+    tmp_q_s32 = vrshlq_s32(tmp_q_s32, dup_bits_n_32x4);
+    vst1q_s32(arr, tmp_q_s32);
+    arr += 4;
+  }
+}
diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
new file mode 100644
index 000000000..0d8233744
--- /dev/null
+++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
@@ -0,0 +1,134 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "config/aom_dsp_rtcd.h"
+
+void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
+                              const uint8_t *src0, uint32_t src0_stride,
+                              const uint8_t *src1, uint32_t src1_stride,
+                              const uint8_t *mask, int w, int h) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 2);
+  assert(w >= 2);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+  uint8x8_t tmp0, tmp1;
+  uint8x16_t res_q;
+  uint16x8_t res, res_low, res_high;
+  uint32x2_t tmp0_32, tmp1_32;
+  uint16x4_t tmp0_16, tmp1_16;
+  const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64);
+
+  if (w >= 16) {
+    const uint8x16_t vdup_64_q = vdupq_n_u8((uint8_t)64);
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 16) {
+        __builtin_prefetch(src0);
+        __builtin_prefetch(src1);
+        const uint8x16_t tmp0_q = vld1q_u8(src0);
+        const uint8x16_t tmp1_q = vld1q_u8(src1);
+        const uint8x16_t m_q = vld1q_u8(mask);
+        const uint8x16_t max_minus_m_q = vsubq_u8(vdup_64_q, m_q);
+        res_low = vmull_u8(vget_low_u8(m_q), vget_low_u8(tmp0_q));
+        res_low =
+            vmlal_u8(res_low, vget_low_u8(max_minus_m_q), vget_low_u8(tmp1_q));
+        res_high = vmull_u8(vget_high_u8(m_q), vget_high_u8(tmp0_q));
+        res_high = vmlal_u8(res_high, vget_high_u8(max_minus_m_q),
+                            vget_high_u8(tmp1_q));
+        res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS),
+                            vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS));
+        vst1q_u8(dst, res_q);
+        src0 += 16;
+        src1 += 16;
+        dst += 16;
+        mask += 16;
+      }
+      src0 += src0_stride - w;
+      src1 += src1_stride - w;
+      dst += dst_stride - w;
+      mask -= w;
+    }
+  } else if (w == 8) {
+    const uint8x8_t m = vld1_u8(mask);
+    const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
+    for (int i = 0; i < h; ++i) {
+      __builtin_prefetch(src0);
+      __builtin_prefetch(src1);
+      tmp0 = vld1_u8(src0);
+      tmp1 = vld1_u8(src1);
+      res = vmull_u8(m, tmp0);
+      res = vmlal_u8(res, max_minus_m, tmp1);
+      vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    }
+  } else if (w == 4) {
+    const uint8x8_t m = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)mask));
+    const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
+    for (int i = 0; i < h; i += 2) {
+      __builtin_prefetch(src0 + 0 * src0_stride);
+      __builtin_prefetch(src0 + 1 * src0_stride);
+      __builtin_prefetch(src1 + 0 * src1_stride);
+      __builtin_prefetch(src1 + 1 * src1_stride);
+      load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32);
+      tmp0 = vreinterpret_u8_u32(tmp0_32);
+      load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32);
+      tmp1 = vreinterpret_u8_u32(tmp1_32);
+      res = vmull_u8(m, tmp0);
+      res = vmlal_u8(res, max_minus_m, tmp1);
+      vst1_lane_u32(
+          (uint32_t *)(dst + (0 * dst_stride)),
+          vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
+      vst1_lane_u32(
+          (uint32_t *)(dst + (1 * dst_stride)),
+          vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+      src0 += (2 * src0_stride);
+      src1 += (2 * src1_stride);
+      dst += (2 * dst_stride);
+    }
+  } else if (w == 2) {
+    const uint8x8_t m = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask));
+    const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
+    for (int i = 0; i < h; i += 2) {
+      __builtin_prefetch(src0 + 0 * src0_stride);
+      __builtin_prefetch(src0 + 1 * src0_stride);
+      __builtin_prefetch(src1 + 0 * src1_stride);
+      __builtin_prefetch(src1 + 1 * src1_stride);
+      load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16);
+      tmp0 = vreinterpret_u8_u16(tmp0_16);
+      load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16);
+      tmp1 = vreinterpret_u8_u16(tmp1_16);
+      res = vmull_u8(m, tmp0);
+      res = vmlal_u8(res, max_minus_m, tmp1);
+      vst1_lane_u16(
+          (uint16_t *)(dst + (0 * dst_stride)),
+          vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
+      vst1_lane_u16(
+          (uint16_t *)(dst + (1 * dst_stride)),
+          vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+      src0 += (2 * src0_stride);
+      src1 += (2 * src1_stride);
+      dst += (2 * dst_stride);
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
new file mode 100644
index 000000000..33b06b767
--- /dev/null
+++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
@@ -0,0 +1,141 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "config/aom_dsp_rtcd.h"
+
+void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
+                              const uint8_t *src0, uint32_t src0_stride,
+                              const uint8_t *src1, uint32_t src1_stride,
+                              const uint8_t *mask, int w, int h) {
+  uint8x8_t tmp0, tmp1;
+  uint8x16_t tmp0_q, tmp1_q, res_q;
+  uint16x8_t res, res_low, res_high;
+  uint32x2_t tmp0_32, tmp1_32;
+  uint16x4_t tmp0_16, tmp1_16;
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 2);
+  assert(w >= 2);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (w >= 16) {
+    for (int i = 0; i < h; ++i) {
+      const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]);
+      const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]);
+      for (int j = 0; j < w; j += 16) {
+        __builtin_prefetch(src0);
+        __builtin_prefetch(src1);
+        tmp0_q = vld1q_u8(src0);
+        tmp1_q = vld1q_u8(src1);
+        res_low = vmull_u8(m, vget_low_u8(tmp0_q));
+        res_low = vmlal_u8(res_low, max_minus_m, vget_low_u8(tmp1_q));
+        res_high = vmull_u8(m, vget_high_u8(tmp0_q));
+        res_high = vmlal_u8(res_high, max_minus_m, vget_high_u8(tmp1_q));
+        res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS),
+                            vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS));
+        vst1q_u8(dst, res_q);
+        src0 += 16;
+        src1 += 16;
+        dst += 16;
+      }
+      src0 += src0_stride - w;
+      src1 += src1_stride - w;
+      dst += dst_stride - w;
+    }
+  } else if (w == 8) {
+    for (int i = 0; i < h; ++i) {
+      __builtin_prefetch(src0);
+      __builtin_prefetch(src1);
+      const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]);
+      const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]);
+      tmp0 = vld1_u8(src0);
+      tmp1 = vld1_u8(src1);
+      res = vmull_u8(m, tmp0);
+      res = vmlal_u8(res, max_minus_m, tmp1);
+      vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS));
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    }
+  } else if (w == 4) {
+    for (int i = 0; i < h; i += 2) {
+      __builtin_prefetch(src0 + 0 * src0_stride);
+      __builtin_prefetch(src0 + 1 * src0_stride);
+      __builtin_prefetch(src1 + 0 * src1_stride);
+      __builtin_prefetch(src1 + 1 * src1_stride);
+      const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[i]);
+      const uint16x4_t m2 = vdup_n_u16((uint16_t)mask[i + 1]);
+      const uint8x8_t m = vmovn_u16(vcombine_u16(m1, m2));
+      const uint16x4_t max_minus_m1 = vdup_n_u16(64 - (uint16_t)mask[i]);
+      const uint16x4_t max_minus_m2 = vdup_n_u16(64 - (uint16_t)mask[i + 1]);
+      const uint8x8_t max_minus_m =
+          vmovn_u16(vcombine_u16(max_minus_m1, max_minus_m2));
+      load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32);
+      tmp0 = vreinterpret_u8_u32(tmp0_32);
+      load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32);
+      tmp1 = vreinterpret_u8_u32(tmp1_32);
+      res = vmull_u8(m, tmp0);
+      res = vmlal_u8(res, max_minus_m, tmp1);
+      vst1_lane_u32(
+          (uint32_t *)(dst + (0 * dst_stride)),
+          vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
+      vst1_lane_u32(
+          (uint32_t *)(dst + (1 * dst_stride)),
+          vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+      src0 += (2 * src0_stride);
+      src1 += (2 * src1_stride);
+      dst += (2 * dst_stride);
+    }
+  } else if (w == 2) {
+    for (int i = 0; i < h; i += 2) {
+      __builtin_prefetch(src0 + 0 * src0_stride);
+      __builtin_prefetch(src0 + 1 * src0_stride);
+      __builtin_prefetch(src1 + 0 * src1_stride);
+      __builtin_prefetch(src1 + 1 * src1_stride);
+      const uint8x8_t m1 = vdup_n_u8(mask[i]);
+      const uint8x8_t m2 = vdup_n_u8(mask[i + 1]);
+      const uint16x4x2_t m_trn =
+          vtrn_u16(vreinterpret_u16_u8(m1), vreinterpret_u16_u8(m2));
+      const uint8x8_t m = vreinterpret_u8_u16(m_trn.val[0]);
+      const uint8x8_t max_minus_m1 = vdup_n_u8(64 - mask[i]);
+      const uint8x8_t max_minus_m2 = vdup_n_u8(64 - mask[i + 1]);
+      const uint16x4x2_t max_minus_m_trn = vtrn_u16(
+          vreinterpret_u16_u8(max_minus_m1), vreinterpret_u16_u8(max_minus_m2));
+      const uint8x8_t max_minus_m = vreinterpret_u8_u16(max_minus_m_trn.val[0]);
+      load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16);
+      tmp0 = vreinterpret_u8_u16(tmp0_16);
+      load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16);
+      tmp1 = vreinterpret_u8_u16(tmp1_16);
+      res = vmull_u8(m, tmp0);
+      res = vmlal_u8(res, max_minus_m, tmp1);
+      vst1_lane_u16(
+          (uint16_t *)(dst + (0 * dst_stride)),
+          vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
+      vst1_lane_u16(
+          (uint16_t *)(dst + (1 * dst_stride)),
+          vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+      src0 += (2 * src0_stride);
+      src1 += (2 * src1_stride);
+      dst += (2 * dst_stride);
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c
new file mode 100644
index 000000000..d731b6a66
--- /dev/null
+++ b/third_party/aom/av1/common/arm/cfl_neon.c
@@ -0,0 +1,584 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+static INLINE void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset,
+                                 int16x8_t sub) {
+  vst1q_s16(dst + offset,
+            vsubq_s16(vreinterpretq_s16_u16(vld1q_u16(src + offset)), sub));
+}
+
+static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) {
+  return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset));
+}
+
+// Load half of a vector and duplicated in other half
+static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) {
+  return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr));
+}
+
+// Store half of a vector.
+static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) {
+  *((uint32_t *)ptr) = vreinterpret_u32_u16(val)[0];
+}
+
+// Store half of a vector.
+static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) {
+  *((uint32_t *)ptr) = vreinterpret_u32_u8(val)[0];
+}
+
+static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  const int luma_stride = input_stride << 1;
+  do {
+    if (width == 4) {
+      const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
+      const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride));
+      vsth_u16(pred_buf_q3, vshl_n_u16(sum, 1));
+    } else if (width == 8) {
+      const uint16x4_t top = vpaddl_u8(vld1_u8(input));
+      const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride));
+      vst1_u16(pred_buf_q3, vshl_n_u16(sum, 1));
+    } else if (width == 16) {
+      const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
+      const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride));
+      vst1q_u16(pred_buf_q3, vshlq_n_u16(sum, 1));
+    } else {
+      const uint8x8x4_t top = vld4_u8(input);
+      const uint8x8x4_t bot = vld4_u8(input + input_stride);
+      // equivalent to a vpaddlq_u8 (because vld4q interleaves)
+      const uint16x8_t top_0 = vaddl_u8(top.val[0], top.val[1]);
+      // equivalent to a vpaddlq_u8 (because vld4q interleaves)
+      const uint16x8_t bot_0 = vaddl_u8(bot.val[0], bot.val[1]);
+      // equivalent to a vpaddlq_u8 (because vld4q interleaves)
+      const uint16x8_t top_1 = vaddl_u8(top.val[2], top.val[3]);
+      // equivalent to a vpaddlq_u8 (because vld4q interleaves)
+      const uint16x8_t bot_1 = vaddl_u8(bot.val[2], bot.val[3]);
+      uint16x8x2_t sum;
+      sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1);
+      sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1);
+      vst2q_u16(pred_buf_q3, sum);
+    }
+    input += luma_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    if (width == 4) {
+      const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
+      vsth_u16(pred_buf_q3, vshl_n_u16(top, 2));
+    } else if (width == 8) {
+      const uint16x4_t top = vpaddl_u8(vld1_u8(input));
+      vst1_u16(pred_buf_q3, vshl_n_u16(top, 2));
+    } else if (width == 16) {
+      const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
+      vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 2));
+    } else {
+      const uint8x8x4_t top = vld4_u8(input);
+      uint16x8x2_t sum;
+      // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4q interleaves)
+      sum.val[0] = vshlq_n_u16(vaddl_u8(top.val[0], top.val[1]), 2);
+      sum.val[1] = vshlq_n_u16(vaddl_u8(top.val[2], top.val[3]), 2);
+      vst2q_u16(pred_buf_q3, sum);
+    }
+    input += input_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    if (width == 4) {
+      const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3);
+      vst1_u16(pred_buf_q3, vget_low_u16(top));
+    } else if (width == 8) {
+      const uint16x8_t top = vshll_n_u8(vld1_u8(input), 3);
+      vst1q_u16(pred_buf_q3, top);
+    } else {
+      const uint8x16_t top = vld1q_u8(input);
+      vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(top), 3));
+      vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(top), 3));
+      if (width == 32) {
+        const uint8x16_t next_top = vld1q_u8(input + 16);
+        vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(next_top), 3));
+        vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(next_top), 3));
+      }
+    }
+    input += input_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+#if __ARM_ARCH <= 7
+uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+  return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
+                      vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
+}
+#endif
+
+static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  const int luma_stride = input_stride << 1;
+  do {
+    if (width == 4) {
+      const uint16x4_t top = vld1_u16(input);
+      const uint16x4_t bot = vld1_u16(input + input_stride);
+      const uint16x4_t sum = vadd_u16(top, bot);
+      const uint16x4_t hsum = vpadd_u16(sum, sum);
+      vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 1));
+    } else if (width < 32) {
+      const uint16x8_t top = vld1q_u16(input);
+      const uint16x8_t bot = vld1q_u16(input + input_stride);
+      const uint16x8_t sum = vaddq_u16(top, bot);
+      if (width == 8) {
+        const uint16x4_t hsum = vget_low_u16(vpaddq_u16(sum, sum));
+        vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 1));
+      } else {
+        const uint16x8_t top_1 = vld1q_u16(input + 8);
+        const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride);
+        const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1);
+        const uint16x8_t hsum = vpaddq_u16(sum, sum_1);
+        vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 1));
+      }
+    } else {
+      const uint16x8x4_t top = vld4q_u16(input);
+      const uint16x8x4_t bot = vld4q_u16(input + input_stride);
+      // equivalent to a vpaddq_u16 (because vld4q interleaves)
+      const uint16x8_t top_0 = vaddq_u16(top.val[0], top.val[1]);
+      // equivalent to a vpaddq_u16 (because vld4q interleaves)
+      const uint16x8_t bot_0 = vaddq_u16(bot.val[0], bot.val[1]);
+      // equivalent to a vpaddq_u16 (because vld4q interleaves)
+      const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]);
+      // equivalent to a vpaddq_u16 (because vld4q interleaves)
+      const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]);
+      uint16x8x2_t sum;
+      sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1);
+      sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1);
+      vst2q_u16(pred_buf_q3, sum);
+    }
+    input += luma_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+static void cfl_luma_subsampling_422_hbd_neon(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    if (width == 4) {
+      const uint16x4_t top = vld1_u16(input);
+      const uint16x4_t hsum = vpadd_u16(top, top);
+      vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 2));
+    } else if (width == 8) {
+      const uint16x4x2_t top = vld2_u16(input);
+      // equivalent to a vpadd_u16 (because vld2 interleaves)
+      const uint16x4_t hsum = vadd_u16(top.val[0], top.val[1]);
+      vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 2));
+    } else if (width == 16) {
+      const uint16x8x2_t top = vld2q_u16(input);
+      // equivalent to a vpaddq_u16 (because vld2q interleaves)
+      const uint16x8_t hsum = vaddq_u16(top.val[0], top.val[1]);
+      vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 2));
+    } else {
+      const uint16x8x4_t top = vld4q_u16(input);
+      // equivalent to a vpaddq_u16 (because vld4q interleaves)
+      const uint16x8_t hsum_0 = vaddq_u16(top.val[0], top.val[1]);
+      // equivalent to a vpaddq_u16 (because vld4q interleaves)
+      const uint16x8_t hsum_1 = vaddq_u16(top.val[2], top.val[3]);
+      uint16x8x2_t result = { { vshlq_n_u16(hsum_0, 2),
+                                vshlq_n_u16(hsum_1, 2) } };
+      vst2q_u16(pred_buf_q3, result);
+    }
+    input += input_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    if (width == 4) {
+      const uint16x4_t top = vld1_u16(input);
+      vst1_u16(pred_buf_q3, vshl_n_u16(top, 3));
+    } else if (width == 8) {
+      const uint16x8_t top = vld1q_u16(input);
+      vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 3));
+    } else if (width == 16) {
+      uint16x8x2_t top = vld2q_u16(input);
+      top.val[0] = vshlq_n_u16(top.val[0], 3);
+      top.val[1] = vshlq_n_u16(top.val[1], 3);
+      vst2q_u16(pred_buf_q3, top);
+    } else {
+      uint16x8x4_t top = vld4q_u16(input);
+      top.val[0] = vshlq_n_u16(top.val[0], 3);
+      top.val[1] = vshlq_n_u16(top.val[1], 3);
+      top.val[2] = vshlq_n_u16(top.val[2], 3);
+      top.val[3] = vshlq_n_u16(top.val[3], 3);
+      vst4q_u16(pred_buf_q3, top);
+    }
+    input += input_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION(neon)
+
+static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst,
+                                         int width, int height,
+                                         int round_offset,
+                                         const int num_pel_log2) {
+  const uint16_t *const end = src + height * CFL_BUF_LINE;
+
+  // Round offset is not needed, because NEON will handle the rounding.
+  (void)round_offset;
+
+  // To optimize the use of the CPU pipeline, we process 4 rows per iteration
+  const int step = 4 * CFL_BUF_LINE;
+
+  // At this stage, the prediction buffer contains scaled reconstructed luma
+  // pixels, which are positive integer and only require 15 bits. By using
+  // unsigned integer for the sum, we can do one addition operation inside 16
+  // bits (8 lanes) before having to convert to 32 bits (4 lanes).
+  const uint16_t *sum_buf = src;
+  uint32x4_t sum_32x4 = { 0, 0, 0, 0 };
+  do {
+    // For all widths, we load, add and combine the data so it fits in 4 lanes.
+    if (width == 4) {
+      const uint16x4_t a0 =
+          vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE));
+      const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE),
+                                     vld1_u16(sum_buf + 3 * CFL_BUF_LINE));
+      sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1));
+    } else if (width == 8) {
+      const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE);
+      const uint16x8_t a1 =
+          vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE);
+      sum_32x4 = vpadalq_u16(sum_32x4, a0);
+      sum_32x4 = vpadalq_u16(sum_32x4, a1);
+    } else {
+      const uint16x8_t row0 = vldaddq_u16(sum_buf, 8);
+      const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8);
+      const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8);
+      const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8);
+      sum_32x4 = vpadalq_u16(sum_32x4, row0);
+      sum_32x4 = vpadalq_u16(sum_32x4, row1);
+      sum_32x4 = vpadalq_u16(sum_32x4, row2);
+      sum_32x4 = vpadalq_u16(sum_32x4, row3);
+
+      if (width == 32) {
+        const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8);
+        const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8);
+        const uint16x8_t row2_1 =
+            vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8);
+        const uint16x8_t row3_1 =
+            vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8);
+
+        sum_32x4 = vpadalq_u16(sum_32x4, row0_1);
+        sum_32x4 = vpadalq_u16(sum_32x4, row1_1);
+        sum_32x4 = vpadalq_u16(sum_32x4, row2_1);
+        sum_32x4 = vpadalq_u16(sum_32x4, row3_1);
+      }
+    }
+    sum_buf += step;
+  } while (sum_buf < end);
+
+  // Permute and add in such a way that each lane contains the block sum.
+  // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
+#if __ARM_ARCH >= 8
+  sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
+  sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
+#else
+  uint32x4_t flip =
+      vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4));
+  sum_32x4 = vaddq_u32(sum_32x4, flip);
+  sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4));
+#endif
+
+  // Computing the average could be done using scalars, but getting off the NEON
+  // engine introduces latency, so we use vqrshrn.
+  int16x4_t avg_16x4;
+  // Constant propagation makes for some ugly code.
+  switch (num_pel_log2) {
+    case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break;
+    case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break;
+    case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break;
+    case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break;
+    case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break;
+    case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break;
+    case 10:
+      avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10));
+      break;
+    default: assert(0);
+  }
+
+  if (width == 4) {
+    do {
+      vst1_s16(dst, vsub_s16(vreinterpret_s16_u16(vld1_u16(src)), avg_16x4));
+      src += CFL_BUF_LINE;
+      dst += CFL_BUF_LINE;
+    } while (src < end);
+  } else {
+    const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4);
+    do {
+      vldsubstq_s16(dst, src, 0, avg_16x8);
+      vldsubstq_s16(dst, src, CFL_BUF_LINE, avg_16x8);
+      vldsubstq_s16(dst, src, 2 * CFL_BUF_LINE, avg_16x8);
+      vldsubstq_s16(dst, src, 3 * CFL_BUF_LINE, avg_16x8);
+
+      if (width > 8) {
+        vldsubstq_s16(dst, src, 8, avg_16x8);
+        vldsubstq_s16(dst, src, 8 + CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 8 + 2 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 8 + 3 * CFL_BUF_LINE, avg_16x8);
+      }
+      if (width == 32) {
+        vldsubstq_s16(dst, src, 16, avg_16x8);
+        vldsubstq_s16(dst, src, 16 + CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 16 + 2 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 16 + 3 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 24, avg_16x8);
+        vldsubstq_s16(dst, src, 24 + CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 24 + 2 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 24 + 3 * CFL_BUF_LINE, avg_16x8);
+      }
+      src += step;
+      dst += step;
+    } while (src < end);
+  }
+}
+
+CFL_SUB_AVG_FN(neon)
+
+// Saturating negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+// Notes:
+//   * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
+//   practice, as scaled_luma is the multiplication of two absolute values.
+//   * In the Intel equivalent, elements in a are zeroed out when the
+//   corresponding elements in b are zero. Because vsign is used twice in a
+//   row, with b in the first call becoming a in the second call, there's no
+//   impact from not zeroing out.
+static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) {
+  const int16x4_t mask = vshr_n_s16(b, 15);
+  return veor_s16(vadd_s16(a, mask), mask);
+}
+
+// Saturating negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+// Notes:
+//   * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
+//   practice, as scaled_luma is the multiplication of two absolute values.
+//   * In the Intel equivalent, elements in a are zeroed out when the
+//   corresponding elements in b are zero. Because vsignq is used twice in a
+//   row, with b in the first call becoming a in the second call, there's no
+//   impact from not zeroing out.
+static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) {
+  const int16x8_t mask = vshrq_n_s16(b, 15);
+  return veorq_s16(vaddq_s16(a, mask), mask);
+}
+
+static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3,
+                                   int16x4_t alpha_sign, int abs_alpha_q12,
+                                   int16x4_t dc) {
+  const int16x4_t ac_q3 = vld1_s16(pred_buf_q3);
+  const int16x4_t ac_sign = veor_s16(alpha_sign, ac_q3);
+  int16x4_t scaled_luma = vqrdmulh_n_s16(vabs_s16(ac_q3), abs_alpha_q12);
+  return vadd_s16(vsign_s16(scaled_luma, ac_sign), dc);
+}
+
+static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3,
+                                   int16x8_t alpha_sign, int abs_alpha_q12,
+                                   int16x8_t dc) {
+  const int16x8_t ac_q3 = vld1q_s16(pred_buf_q3);
+  const int16x8_t ac_sign = veorq_s16(alpha_sign, ac_q3);
+  int16x8_t scaled_luma = vqrdmulhq_n_s16(vabsq_s16(ac_q3), abs_alpha_q12);
+  return vaddq_s16(vsignq_s16(scaled_luma, ac_sign), dc);
+}
+
+static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
+                                      int16x8_t alpha_sign, int abs_alpha_q12,
+                                      int16x8_t dc) {
+  // vld2q_s16 interleaves, which is not useful for prediction. vst1q_s16_x2
+  // does not interleave, but is not currently available in the compilier used
+  // by the AOM build system.
+  const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3);
+  const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
+  const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
+  const int16x8_t scaled_luma_0 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12);
+  const int16x8_t scaled_luma_1 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12);
+  int16x8x2_t result;
+  result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc);
+  result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc);
+  return result;
+}
+
+static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3,
+                                      int16x8_t alpha_sign, int abs_alpha_q12,
+                                      int16x8_t dc) {
+  // vld4q_s16 interleaves, which is not useful for prediction. vst1q_s16_x4
+  // does not interleave, but is not currently available in the compilier used
+  // by the AOM build system.
+  const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3);
+  const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
+  const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
+  const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]);
+  const int16x8_t ac_sign_3 = veorq_s16(alpha_sign, ac_q3.val[3]);
+  const int16x8_t scaled_luma_0 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12);
+  const int16x8_t scaled_luma_1 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12);
+  const int16x8_t scaled_luma_2 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[2]), abs_alpha_q12);
+  const int16x8_t scaled_luma_3 =
+      vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[3]), abs_alpha_q12);
+  int16x8x4_t result;
+  result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc);
+  result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc);
+  result.val[2] = vaddq_s16(vsignq_s16(scaled_luma_2, ac_sign_2), dc);
+  result.val[3] = vaddq_s16(vsignq_s16(scaled_luma_3, ac_sign_3), dc);
+  return result;
+}
+
+static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
+                                        uint8_t *dst, int dst_stride,
+                                        int alpha_q3, int width, int height) {
+  const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
+  const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
+  if (width == 4) {
+    const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
+    const int16x4_t dc = vdup_n_s16(*dst);
+    do {
+      const int16x4_t pred =
+          predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+      vsth_u8(dst, vqmovun_s16(vcombine_s16(pred, pred)));
+      dst += dst_stride;
+    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+  } else {
+    const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
+    const int16x8_t dc = vdupq_n_s16(*dst);
+    do {
+      if (width == 8) {
+        vst1_u8(dst, vqmovun_s16(predict_w8(pred_buf_q3, alpha_sign,
+                                            abs_alpha_q12, dc)));
+      } else if (width == 16) {
+        const int16x8x2_t pred =
+            predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+        const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]),
+                                       vqmovun_s16(pred.val[1]) } };
+        vst2_u8(dst, predun);
+      } else {
+        const int16x8x4_t pred =
+            predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+        const uint8x8x4_t predun = {
+          { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]),
+            vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) }
+        };
+        vst4_u8(dst, predun);
+      }
+      dst += dst_stride;
+    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+  }
+}
+
+CFL_PREDICT_FN(neon, lbd)
+
+static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) {
+  return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0)));
+}
+
+static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) {
+  return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0)));
+}
+
+static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) {
+  uint16x8x2_t result;
+  result.val[0] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0)));
+  result.val[1] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0)));
+  return result;
+}
+
+static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) {
+  uint16x8x4_t result;
+  result.val[0] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0)));
+  result.val[1] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0)));
+  result.val[2] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[2], max), vdupq_n_s16(0)));
+  result.val[3] = vreinterpretq_u16_s16(
+      vmaxq_s16(vminq_s16(a.val[3], max), vdupq_n_s16(0)));
+  return result;
+}
+
+static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3,
+                                        uint16_t *dst, int dst_stride,
+                                        int alpha_q3, int bd, int width,
+                                        int height) {
+  const int max = (1 << bd) - 1;
+  const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
+  const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
+  if (width == 4) {
+    const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
+    const int16x4_t dc = vdup_n_s16(*dst);
+    const int16x4_t max_16x4 = vdup_n_s16(max);
+    do {
+      const int16x4_t scaled_luma =
+          predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+      vst1_u16(dst, clamp_s16(scaled_luma, max_16x4));
+      dst += dst_stride;
+    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+  } else {
+    const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
+    const int16x8_t dc = vdupq_n_s16(*dst);
+    const int16x8_t max_16x8 = vdupq_n_s16(max);
+    do {
+      if (width == 8) {
+        const int16x8_t pred =
+            predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+        vst1q_u16(dst, clampq_s16(pred, max_16x8));
+      } else if (width == 16) {
+        const int16x8x2_t pred =
+            predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+        vst2q_u16(dst, clamp2q_s16(pred, max_16x8));
+      } else {
+        const int16x8x4_t pred =
+            predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+        vst4q_u16(dst, clamp4q_s16(pred, max_16x8));
+      }
+      dst += dst_stride;
+    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+  }
+}
+
+CFL_PREDICT_FN(neon, hbd)
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
new file mode 100644
index 000000000..86a25e109
--- /dev/null
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -0,0 +1,1134 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
+                                      const int16x4_t s2, const int16x4_t s3,
+                                      const int16x4_t s4, const int16x4_t s5,
+                                      const int16x4_t s6, const int16x4_t s7,
+                                      const int16_t *filter) {
+  int16x4_t sum;
+
+  sum = vmul_n_s16(s0, filter[0]);
+  sum = vmla_n_s16(sum, s1, filter[1]);
+  sum = vmla_n_s16(sum, s2, filter[2]);
+  sum = vmla_n_s16(sum, s5, filter[5]);
+  sum = vmla_n_s16(sum, s6, filter[6]);
+  sum = vmla_n_s16(sum, s7, filter[7]);
+  /* filter[3] can take a max value of 128. So the max value of the result :
+   * 128*255 + sum > 16 bits
+   */
+  sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
+  sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+
+  return sum;
+}
+
+static INLINE uint8x8_t convolve8_horiz_8x8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
+    const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
+  int16x8_t sum;
+
+  sum = vmulq_n_s16(s0, filter[0]);
+  sum = vmlaq_n_s16(sum, s1, filter[1]);
+  sum = vmlaq_n_s16(sum, s2, filter[2]);
+  sum = vmlaq_n_s16(sum, s5, filter[5]);
+  sum = vmlaq_n_s16(sum, s6, filter[6]);
+  sum = vmlaq_n_s16(sum, s7, filter[7]);
+  /* filter[3] can take a max value of 128. So the max value of the result :
+   * 128*255 + sum > 16 bits
+   */
+  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
+  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
+
+  sum = vqrshlq_s16(sum, shift_round_0);
+  sum = vqrshlq_s16(sum, shift_by_bits);
+
+  return vqmovun_s16(sum);
+}
+
+static INLINE uint8x8_t convolve8_vert_8x4(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
+  int16x8_t sum;
+
+  sum = vmulq_n_s16(s0, filter[0]);
+  sum = vmlaq_n_s16(sum, s1, filter[1]);
+  sum = vmlaq_n_s16(sum, s2, filter[2]);
+  sum = vmlaq_n_s16(sum, s5, filter[5]);
+  sum = vmlaq_n_s16(sum, s6, filter[6]);
+  sum = vmlaq_n_s16(sum, s7, filter[7]);
+  /* filter[3] can take a max value of 128. So the max value of the result :
+   * 128*255 + sum > 16 bits
+   */
+  sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
+  sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
+
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+static INLINE uint16x4_t convolve8_vert_4x4_s32(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+    const int32x4_t round_shift_vec, const int32x4_t offset_const,
+    const int32x4_t sub_const_vec) {
+  int32x4_t sum0;
+  uint16x4_t res;
+  const int32x4_t zero = vdupq_n_s32(0);
+
+  sum0 = vmull_n_s16(s0, y_filter[0]);
+  sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
+  sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
+  sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
+  sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
+  sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
+  sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
+  sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
+
+  sum0 = vaddq_s32(sum0, offset_const);
+  sum0 = vqrshlq_s32(sum0, round_shift_vec);
+  sum0 = vsubq_s32(sum0, sub_const_vec);
+  sum0 = vmaxq_s32(sum0, zero);
+
+  res = vmovn_u32(vreinterpretq_u32_s32(sum0));
+
+  return res;
+}
+
+static INLINE uint8x8_t convolve8_vert_8x4_s32(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
+    const int32x4_t round_shift_vec, const int32x4_t offset_const,
+    const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
+  int32x4_t sum0, sum1;
+  uint16x8_t res;
+  const int32x4_t zero = vdupq_n_s32(0);
+
+  sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);
+
+  sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);
+
+  sum0 = vaddq_s32(sum0, offset_const);
+  sum1 = vaddq_s32(sum1, offset_const);
+  sum0 = vqrshlq_s32(sum0, round_shift_vec);
+  sum1 = vqrshlq_s32(sum1, round_shift_vec);
+  sum0 = vsubq_s32(sum0, sub_const_vec);
+  sum1 = vsubq_s32(sum1, sub_const_vec);
+  sum0 = vmaxq_s32(sum0, zero);
+  sum1 = vmaxq_s32(sum1, zero);
+  res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
+                     vqmovn_u32(vreinterpretq_u32_s32(sum1)));
+
+  res = vqrshlq_u16(res, vec_round_bits);
+
+  return vqmovn_u16(res);
+}
+
+void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            InterpFilterParams *filter_params_x,
+                            InterpFilterParams *filter_params_y,
+                            const int subpel_x_q4, const int subpel_y_q4,
+                            ConvolveParams *conv_params) {
+  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+  const int8_t bits = FILTER_BITS - conv_params->round_0;
+
+  (void)subpel_y_q4;
+  (void)conv_params;
+  (void)filter_params_y;
+
+  uint8x8_t t0, t1, t2, t3;
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+  const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
+  const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
+
+  src -= horiz_offset;
+
+  if (h == 4) {
+    uint8x8_t d01, d23;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+    int16x8_t d01_temp, d23_temp;
+
+    __builtin_prefetch(src + 0 * src_stride);
+    __builtin_prefetch(src + 1 * src_stride);
+    __builtin_prefetch(src + 2 * src_stride);
+    __builtin_prefetch(src + 3 * src_stride);
+
+    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+    transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    __builtin_prefetch(dst + 0 * dst_stride);
+    __builtin_prefetch(dst + 1 * dst_stride);
+    __builtin_prefetch(dst + 2 * dst_stride);
+    __builtin_prefetch(dst + 3 * dst_stride);
+    src += 7;
+
+    do {
+      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
+
+      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
+
+      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
+
+      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
+
+      d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0);
+      d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);
+
+      d01_temp = vqrshlq_s16(d01_temp, shift_by_bits);
+      d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);
+
+      d01 = vqmovun_s16(d01_temp);
+      d23 = vqmovun_s16(d23_temp);
+
+      transpose_u8_4x4(&d01, &d23);
+
+      if (w != 2) {
+        vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),  // 00 01 02 03
+                      vreinterpret_u32_u8(d01), 0);
+        vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),  // 10 11 12 13
+                      vreinterpret_u32_u8(d23), 0);
+        vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),  // 20 21 22 23
+                      vreinterpret_u32_u8(d01), 1);
+        vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),  // 30 31 32 33
+                      vreinterpret_u32_u8(d23), 1);
+      } else {
+        vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride),  // 00 01
+                      vreinterpret_u16_u8(d01), 0);
+        vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride),  // 10 11
+                      vreinterpret_u16_u8(d23), 0);
+        vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride),  // 20 21
+                      vreinterpret_u16_u8(d01), 2);
+        vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride),  // 30 31
+                      vreinterpret_u16_u8(d23), 2);
+      }
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      src += 4;
+      dst += 4;
+      w -= 4;
+    } while (w > 0);
+  } else {
+    int width;
+    const uint8_t *s;
+    uint8x8_t t4, t5, t6, t7;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+    if (w <= 4) {
+      do {
+        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+                    &t7);
+        src += 8 * src_stride;
+        __builtin_prefetch(dst + 0 * dst_stride);
+        __builtin_prefetch(dst + 1 * dst_stride);
+        __builtin_prefetch(dst + 2 * dst_stride);
+        __builtin_prefetch(dst + 3 * dst_stride);
+        __builtin_prefetch(dst + 4 * dst_stride);
+        __builtin_prefetch(dst + 5 * dst_stride);
+        __builtin_prefetch(dst + 6 * dst_stride);
+        __builtin_prefetch(dst + 7 * dst_stride);
+
+        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+        __builtin_prefetch(src + 0 * src_stride);
+        __builtin_prefetch(src + 1 * src_stride);
+        __builtin_prefetch(src + 2 * src_stride);
+        __builtin_prefetch(src + 3 * src_stride);
+        __builtin_prefetch(src + 4 * src_stride);
+        __builtin_prefetch(src + 5 * src_stride);
+        __builtin_prefetch(src + 6 * src_stride);
+        __builtin_prefetch(src + 7 * src_stride);
+        t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                 shift_round_0, shift_by_bits);
+        t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+                                 shift_round_0, shift_by_bits);
+        t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+                                 shift_round_0, shift_by_bits);
+        t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+                                 shift_round_0, shift_by_bits);
+
+        transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+        if ((w == 4) && (h > 4)) {
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+                        0);  // 00 01 02 03
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+                        0);  // 10 11 12 13
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
+                        0);  // 20 21 22 23
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
+                        0);  // 30 31 32 33
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+                        1);  // 40 41 42 43
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+                        1);  // 50 51 52 53
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
+                        1);  // 60 61 62 63
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
+                        1);  // 70 71 72 73
+          dst += dst_stride;
+        } else if ((w == 4) && (h == 2)) {
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+                        0);  // 00 01 02 03
+          dst += dst_stride;
+          vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+                        0);  // 10 11 12 13
+          dst += dst_stride;
+        } else if ((w == 2) && (h > 4)) {
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0);  // 20 21
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0);  // 30 31
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2);  // 40 41
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2);  // 50 51
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2);  // 60 61
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2);  // 70 71
+          dst += dst_stride;
+        } else if ((w == 2) && (h == 2)) {
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0);  // 00 01
+          dst += dst_stride;
+          vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0);  // 10 11
+          dst += dst_stride;
+        }
+        h -= 8;
+      } while (h > 0);
+    } else {
+      uint8_t *d;
+      int16x8_t s11, s12, s13, s14;
+
+      do {
+        __builtin_prefetch(src + 0 * src_stride);
+        __builtin_prefetch(src + 1 * src_stride);
+        __builtin_prefetch(src + 2 * src_stride);
+        __builtin_prefetch(src + 3 * src_stride);
+        __builtin_prefetch(src + 4 * src_stride);
+        __builtin_prefetch(src + 5 * src_stride);
+        __builtin_prefetch(src + 6 * src_stride);
+        __builtin_prefetch(src + 7 * src_stride);
+        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+        width = w;
+        s = src + 7;
+        d = dst;
+        __builtin_prefetch(dst + 0 * dst_stride);
+        __builtin_prefetch(dst + 1 * dst_stride);
+        __builtin_prefetch(dst + 2 * dst_stride);
+        __builtin_prefetch(dst + 3 * dst_stride);
+        __builtin_prefetch(dst + 4 * dst_stride);
+        __builtin_prefetch(dst + 5 * dst_stride);
+        __builtin_prefetch(dst + 6 * dst_stride);
+        __builtin_prefetch(dst + 7 * dst_stride);
+
+        do {
+          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+          t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+                                   shift_round_0, shift_by_bits);
+
+          t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
+                                   x_filter, shift_round_0, shift_by_bits);
+
+          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          if (h != 2) {
+            store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
+          } else {
+            store_row2_u8_8x8(d, dst_stride, t0, t1);
+          }
+          s0 = s8;
+          s1 = s9;
+          s2 = s10;
+          s3 = s11;
+          s4 = s12;
+          s5 = s13;
+          s6 = s14;
+          s += 8;
+          d += 8;
+          width -= 8;
+        } while (width > 0);
+        src += 8 * src_stride;
+        dst += 8 * dst_stride;
+        h -= 8;
+      } while (h > 0);
+    }
+  }
+}
+
+void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            InterpFilterParams *filter_params_x,
+                            InterpFilterParams *filter_params_y,
+                            const int subpel_x_q4, const int subpel_y_q4,
+                            ConvolveParams *conv_params) {
+  const int vert_offset = filter_params_y->taps / 2 - 1;
+
+  src -= vert_offset * src_stride;
+
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  (void)conv_params;
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+  if (w <= 4) {
+    uint8x8_t d01, d23;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+
+    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+    src += src_stride;
+
+    do {
+      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+      src += src_stride;
+      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+      src += src_stride;
+      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+      src += src_stride;
+      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+      src += src_stride;
+
+      __builtin_prefetch(dst + 0 * dst_stride);
+      __builtin_prefetch(dst + 1 * dst_stride);
+      __builtin_prefetch(dst + 2 * dst_stride);
+      __builtin_prefetch(dst + 3 * dst_stride);
+      __builtin_prefetch(src + 0 * src_stride);
+      __builtin_prefetch(src + 1 * src_stride);
+      __builtin_prefetch(src + 2 * src_stride);
+      __builtin_prefetch(src + 3 * src_stride);
+      d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+      d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+      d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+      d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      if ((w == 4) && (h != 2)) {
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+                      0);  // 00 01 02 03
+        dst += dst_stride;
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+                      1);  // 10 11 12 13
+        dst += dst_stride;
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
+                      0);  // 20 21 22 23
+        dst += dst_stride;
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
+                      1);  // 30 31 32 33
+        dst += dst_stride;
+      } else if ((w == 4) && (h == 2)) {
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+                      0);  // 00 01 02 03
+        dst += dst_stride;
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+                      1);  // 10 11 12 13
+        dst += dst_stride;
+      } else if ((w == 2) && (h != 2)) {
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
+        dst += dst_stride;
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
+        dst += dst_stride;
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0);  // 20 21
+        dst += dst_stride;
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2);  // 30 31
+        dst += dst_stride;
+      } else if ((w == 2) && (h == 2)) {
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);  // 00 01
+        dst += dst_stride;
+        vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2);  // 10 11
+        dst += dst_stride;
+      }
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    int height;
+    const uint8_t *s;
+    uint8_t *d;
+    uint8x8_t t0, t1, t2, t3;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+    do {
+      __builtin_prefetch(src + 0 * src_stride);
+      __builtin_prefetch(src + 1 * src_stride);
+      __builtin_prefetch(src + 2 * src_stride);
+      __builtin_prefetch(src + 3 * src_stride);
+      __builtin_prefetch(src + 4 * src_stride);
+      __builtin_prefetch(src + 5 * src_stride);
+      __builtin_prefetch(src + 6 * src_stride);
+      s = src;
+      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+      s += src_stride;
+      d = dst;
+      height = h;
+
+      do {
+        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+        s += src_stride;
+        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+        s += src_stride;
+        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+        s += src_stride;
+        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+        s += src_stride;
+
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+        __builtin_prefetch(s + 0 * src_stride);
+        __builtin_prefetch(s + 1 * src_stride);
+        __builtin_prefetch(s + 2 * src_stride);
+        __builtin_prefetch(s + 3 * src_stride);
+        t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+        t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+        t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+        t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+        if (h != 2) {
+          vst1_u8(d, t0);
+          d += dst_stride;
+          vst1_u8(d, t1);
+          d += dst_stride;
+          vst1_u8(d, t2);
+          d += dst_stride;
+          vst1_u8(d, t3);
+          d += dst_stride;
+        } else {
+          vst1_u8(d, t0);
+          d += dst_stride;
+          vst1_u8(d, t1);
+          d += dst_stride;
+        }
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        height -= 4;
+      } while (height > 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             InterpFilterParams *filter_params_x,
+                             InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
+  int im_dst_stride;
+  int width, height;
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+  const int bd = 8;
+  const int im_h = h + filter_params_y->taps - 1;
+  const int im_stride = MAX_SB_SIZE;
+  const int vert_offset = filter_params_y->taps / 2 - 1;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+
+  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+  const uint8_t *s;
+  int16_t *dst_ptr;
+
+  dst_ptr = im_block;
+  im_dst_stride = im_stride;
+  height = im_h;
+  width = w;
+
+  const int16_t round_bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+  int16_t x_filter_tmp[8];
+  int16x8_t filter_x_coef = vld1q_s16(x_filter);
+
+  // filter coeffs are even, so downshifting by 1 to reduce intermediate
+  // precision requirements.
+  filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
+  vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+
+  assert(conv_params->round_0 > 0);
+
+  if (w <= 4) {
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+
+    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
+    const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
+
+    do {
+      s = src_ptr;
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+      s += 7;
+
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                             horiz_const, shift_round_0);
+      d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+                             horiz_const, shift_round_0);
+      d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+                             horiz_const, shift_round_0);
+      d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+                             horiz_const, shift_round_0);
+
+      transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+      if (w == 4) {
+        vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
+        vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
+        vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
+        vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
+      } else if (w == 2) {
+        vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
+                      vreinterpret_u32_s16(d0), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
+                      vreinterpret_u32_s16(d1), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
+                      vreinterpret_u32_s16(d2), 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
+                      vreinterpret_u32_s16(d3), 0);
+      }
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * im_dst_stride;
+      height -= 4;
+    } while (height > 0);
+  } else {
+    int16_t *d_tmp;
+    int16x8_t s11, s12, s13, s14;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+
+    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
+    const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
+
+    do {
+      __builtin_prefetch(src_ptr + 0 * src_stride);
+      __builtin_prefetch(src_ptr + 1 * src_stride);
+      __builtin_prefetch(src_ptr + 2 * src_stride);
+      __builtin_prefetch(src_ptr + 3 * src_stride);
+      __builtin_prefetch(src_ptr + 4 * src_stride);
+      __builtin_prefetch(src_ptr + 5 * src_stride);
+      __builtin_prefetch(src_ptr + 6 * src_stride);
+      __builtin_prefetch(src_ptr + 7 * src_stride);
+
+      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      width = w;
+      s = src_ptr + 7;
+      d_tmp = dst_ptr;
+
+      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
+      __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
+
+      do {
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+                                 x_filter_tmp, horiz_const, shift_round_0);
+        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+                                 x_filter_tmp, horiz_const, shift_round_0);
+        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+                                 x_filter_tmp, horiz_const, shift_round_0);
+
+        transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+                          &res7);
+
+        store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
+                      res6, res7);
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += 8 * src_stride;
+      dst_ptr += 8 * im_dst_stride;
+      height -= 8;
+    } while (height > 0);
+  }
+
+  // vertical
+  {
+    uint8_t *dst_u8_ptr, *d_u8;
+    int16_t *v_src_ptr, *v_s;
+
+    const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+                              (1 << (offset_bits - conv_params->round_1 - 1));
+    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+    const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+    const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+    const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
+
+    src_stride = im_stride;
+    v_src_ptr = im_block;
+    dst_u8_ptr = dst;
+
+    height = h;
+    width = w;
+
+    if (width <= 4) {
+      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+      uint16x4_t d0, d1, d2, d3;
+      uint16x8_t dd0, dd1;
+      uint8x8_t d01, d23;
+
+      d_u8 = dst_u8_ptr;
+      v_s = v_src_ptr;
+
+      __builtin_prefetch(v_s + 0 * im_stride);
+      __builtin_prefetch(v_s + 1 * im_stride);
+      __builtin_prefetch(v_s + 2 * im_stride);
+      __builtin_prefetch(v_s + 3 * im_stride);
+      __builtin_prefetch(v_s + 4 * im_stride);
+      __builtin_prefetch(v_s + 5 * im_stride);
+      __builtin_prefetch(v_s + 6 * im_stride);
+      __builtin_prefetch(v_s + 7 * im_stride);
+
+      load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+      v_s += (7 * im_stride);
+
+      do {
+        load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+        v_s += (im_stride << 2);
+
+        __builtin_prefetch(d_u8 + 0 * dst_stride);
+        __builtin_prefetch(d_u8 + 1 * dst_stride);
+        __builtin_prefetch(d_u8 + 2 * dst_stride);
+        __builtin_prefetch(d_u8 + 3 * dst_stride);
+
+        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                    round_shift_vec, offset_const,
+                                    sub_const_vec);
+        d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                    round_shift_vec, offset_const,
+                                    sub_const_vec);
+        d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                    round_shift_vec, offset_const,
+                                    sub_const_vec);
+        d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                    round_shift_vec, offset_const,
+                                    sub_const_vec);
+
+        dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
+        dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
+
+        d01 = vqmovn_u16(dd0);
+        d23 = vqmovn_u16(dd1);
+
+        if ((w == 4) && (h != 2)) {
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                        0);  // 00 01 02 03
+          d_u8 += dst_stride;
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                        1);  // 10 11 12 13
+          d_u8 += dst_stride;
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+                        0);  // 20 21 22 23
+          d_u8 += dst_stride;
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+                        1);  // 30 31 32 33
+          d_u8 += dst_stride;
+        } else if ((w == 2) && (h != 2)) {
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                        0);  // 00 01
+          d_u8 += dst_stride;
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                        2);  // 10 11
+          d_u8 += dst_stride;
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+                        0);  // 20 21
+          d_u8 += dst_stride;
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+                        2);  // 30 31
+          d_u8 += dst_stride;
+        } else if ((w == 4) && (h == 2)) {
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                        0);  // 00 01 02 03
+          d_u8 += dst_stride;
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                        1);  // 10 11 12 13
+          d_u8 += dst_stride;
+        } else if ((w == 2) && (h == 2)) {
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                        0);  // 00 01
+          d_u8 += dst_stride;
+          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                        2);  // 10 11
+          d_u8 += dst_stride;
+        }
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        height -= 4;
+      } while (height > 0);
+    } else {
+      // if width is a multiple of 8 & height is a multiple of 4
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+      uint8x8_t res0, res1, res2, res3;
+
+      do {
+        __builtin_prefetch(v_src_ptr + 0 * im_stride);
+        __builtin_prefetch(v_src_ptr + 1 * im_stride);
+        __builtin_prefetch(v_src_ptr + 2 * im_stride);
+        __builtin_prefetch(v_src_ptr + 3 * im_stride);
+        __builtin_prefetch(v_src_ptr + 4 * im_stride);
+        __builtin_prefetch(v_src_ptr + 5 * im_stride);
+        __builtin_prefetch(v_src_ptr + 6 * im_stride);
+        __builtin_prefetch(v_src_ptr + 7 * im_stride);
+
+        v_s = v_src_ptr;
+        load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+        v_s += (7 * im_stride);
+
+        d_u8 = dst_u8_ptr;
+        height = h;
+
+        do {
+          load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+          v_s += (im_stride << 2);
+
+          __builtin_prefetch(d_u8 + 4 * dst_stride);
+          __builtin_prefetch(d_u8 + 5 * dst_stride);
+          __builtin_prefetch(d_u8 + 6 * dst_stride);
+          __builtin_prefetch(d_u8 + 7 * dst_stride);
+
+          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+                                        y_filter, round_shift_vec, offset_const,
+                                        sub_const_vec, vec_round_bits);
+          res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
+                                        y_filter, round_shift_vec, offset_const,
+                                        sub_const_vec, vec_round_bits);
+          res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
+                                        y_filter, round_shift_vec, offset_const,
+                                        sub_const_vec, vec_round_bits);
+          res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
+                                        y_filter, round_shift_vec, offset_const,
+                                        sub_const_vec, vec_round_bits);
+
+          if (h != 2) {
+            vst1_u8(d_u8, res0);
+            d_u8 += dst_stride;
+            vst1_u8(d_u8, res1);
+            d_u8 += dst_stride;
+            vst1_u8(d_u8, res2);
+            d_u8 += dst_stride;
+            vst1_u8(d_u8, res3);
+            d_u8 += dst_stride;
+          } else {
+            vst1_u8(d_u8, res0);
+            d_u8 += dst_stride;
+            vst1_u8(d_u8, res1);
+            d_u8 += dst_stride;
+          }
+          s0 = s4;
+          s1 = s5;
+          s2 = s6;
+          s3 = s7;
+          s4 = s8;
+          s5 = s9;
+          s6 = s10;
+          height -= 4;
+        } while (height > 0);
+        v_src_ptr += 8;
+        dst_u8_ptr += 8;
+        w -= 8;
+      } while (w > 0);
+    }
+  }
+}
+void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  InterpFilterParams *filter_params_x,
+                                  InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+
+  const uint8_t *src1;
+  uint8_t *dst1;
+  int y;
+
+  if (!(w & 0x0F)) {
+    for (y = 0; y < h; ++y) {
+      src1 = src;
+      dst1 = dst;
+      for (int x = 0; x < (w >> 4); ++x) {
+        vst1q_u8(dst1, vld1q_u8(src1));
+        src1 += 16;
+        dst1 += 16;
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (!(w & 0x07)) {
+    for (y = 0; y < h; ++y) {
+      vst1_u8(dst, vld1_u8(src));
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (!(w & 0x03)) {
+    for (y = 0; y < h; ++y) {
+      vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (!(w & 0x01)) {
+    for (y = 0; y < h; ++y) {
+      vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h
new file mode 100644
index 000000000..47c93d645
--- /dev/null
+++ b/third_party/aom/av1/common/arm/convolve_neon.h
@@ -0,0 +1,228 @@
+/*
+ *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#define AV1_COMMON_ARM_CONVOLVE_NEON_H_
+
+#include <arm_neon.h>
+
+#define HORIZ_EXTRA_ROWS ((SUBPEL_TAPS + 7) & ~0x07)
+
+static INLINE uint8x8_t wiener_convolve8_vert_4x8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, int16_t *filter_y, const int bd,
+    const int round1_bits) {
+  int16x8_t ss0, ss1, ss2;
+  int32x4_t sum0, sum1;
+  uint16x4_t tmp0, tmp1;
+  uint16x8_t tmp;
+  uint8x8_t res;
+
+  const int32_t round_const = (1 << (bd + round1_bits - 1));
+  const int32x4_t round_bits = vdupq_n_s32(-round1_bits);
+  const int32x4_t zero = vdupq_n_s32(0);
+  const int32x4_t round_vec = vdupq_n_s32(round_const);
+
+  ss0 = vaddq_s16(s0, s6);
+  ss1 = vaddq_s16(s1, s5);
+  ss2 = vaddq_s16(s2, s4);
+
+  sum0 = vmull_n_s16(vget_low_s16(ss0), filter_y[0]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(ss1), filter_y[1]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(ss2), filter_y[2]);
+  sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), filter_y[3]);
+
+  sum1 = vmull_n_s16(vget_high_s16(ss0), filter_y[0]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(ss1), filter_y[1]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(ss2), filter_y[2]);
+  sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), filter_y[3]);
+
+  sum0 = vsubq_s32(sum0, round_vec);
+  sum1 = vsubq_s32(sum1, round_vec);
+
+  /* right shift & rounding */
+  sum0 = vrshlq_s32(sum0, round_bits);
+  sum1 = vrshlq_s32(sum1, round_bits);
+
+  sum0 = vmaxq_s32(sum0, zero);
+  sum1 = vmaxq_s32(sum1, zero);
+
+  /* from int32x4_t to uint8x8_t */
+  tmp0 = vqmovn_u32(vreinterpretq_u32_s32(sum0));
+  tmp1 = vqmovn_u32(vreinterpretq_u32_s32(sum1));
+  tmp = vcombine_u16(tmp0, tmp1);
+  res = vqmovn_u16(tmp);
+
+  return res;
+}
+
+static INLINE uint16x8_t wiener_convolve8_horiz_8x8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, int16_t *filter_x, const int bd,
+    const int round0_bits) {
+  int16x8_t sum;
+  uint16x8_t res;
+  int32x4_t sum_0, sum_1;
+  int32x4_t s3_0, s3_1;
+  const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
+  const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits));
+
+  /* for the purpose of right shift by { conv_params->round_0 } */
+  const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
+
+  const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
+  const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+
+  sum = vmulq_n_s16(s0, filter_x[0]);
+  sum = vmlaq_n_s16(sum, s1, filter_x[1]);
+  sum = vmlaq_n_s16(sum, s2, filter_x[2]);
+
+  /* sum from 16x8 to 2 32x4 registers */
+  sum_0 = vmovl_s16(vget_low_s16(sum));
+  sum_1 = vmovl_s16(vget_high_s16(sum));
+
+  /* s[3]*128 -- and filter coef max can be 128
+   *  then max value possible = 128*128*255 exceeding 16 bit
+   */
+
+  s3_0 = vmull_n_s16(vget_low_s16(s3), filter_x[3]);
+  s3_1 = vmull_n_s16(vget_high_s16(s3), filter_x[3]);
+  sum_0 = vaddq_s32(sum_0, s3_0);
+  sum_1 = vaddq_s32(sum_1, s3_1);
+
+  /* Add the constant value */
+  sum_0 = vaddq_s32(sum_0, round_vec_0);
+  sum_1 = vaddq_s32(sum_1, round_vec_0);
+
+  /* right shift & rounding & saturating */
+  sum_0 = vqrshlq_s32(sum_0, round_bits);
+  sum_1 = vqrshlq_s32(sum_1, round_bits);
+
+  /* Clipping to max value */
+  sum_0 = vminq_s32(sum_0, round_vec_1);
+  sum_1 = vminq_s32(sum_1, round_vec_1);
+
+  res = vcombine_u16(vqmovun_s32(sum_0), vqmovun_s32(sum_1));
+  return res;
+}
+
+static INLINE uint16x4_t wiener_convolve8_horiz_4x8(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, int16_t *filter_x, const int bd,
+    const int round0_bits) {
+  uint16x4_t res;
+  int32x4_t sum_0, s3_0;
+  int16x4_t sum, temp0, temp1, temp2;
+
+  const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
+  const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits));
+  const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
+  const int32x4_t zero = vdupq_n_s32(0);
+  const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
+  const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+
+  temp0 = vadd_s16(s0, s6);
+  temp1 = vadd_s16(s1, s5);
+  temp2 = vadd_s16(s2, s4);
+
+  sum = vmul_n_s16(temp0, filter_x[0]);
+  sum = vmla_n_s16(sum, temp1, filter_x[1]);
+  sum = vmla_n_s16(sum, temp2, filter_x[2]);
+  sum_0 = vmovl_s16(sum);
+
+  /* s[3]*128 -- and filter coff max can be 128.
+   * then max value possible = 128*128*255 Therefore, 32 bits are required to
+   * hold the result.
+   */
+  s3_0 = vmull_n_s16(s3, filter_x[3]);
+  sum_0 = vaddq_s32(sum_0, s3_0);
+
+  sum_0 = vaddq_s32(sum_0, round_vec_0);
+  sum_0 = vrshlq_s32(sum_0, round_bits);
+
+  sum_0 = vmaxq_s32(sum_0, zero);
+  sum_0 = vminq_s32(sum_0, round_vec_1);
+  res = vqmovun_s32(sum_0);
+  return res;
+}
+
+static INLINE int16x8_t
+convolve8_8x8_s16(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                  const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                  const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
+                  const int16x8_t horiz_const, const int16x8_t shift_round_0) {
+  int16x8_t sum;
+  int16x8_t res;
+
+  sum = horiz_const;
+  sum = vmlaq_n_s16(sum, s0, filter[0]);
+  sum = vmlaq_n_s16(sum, s1, filter[1]);
+  sum = vmlaq_n_s16(sum, s2, filter[2]);
+  sum = vmlaq_n_s16(sum, s3, filter[3]);
+  sum = vmlaq_n_s16(sum, s4, filter[4]);
+  sum = vmlaq_n_s16(sum, s5, filter[5]);
+  sum = vmlaq_n_s16(sum, s6, filter[6]);
+  sum = vmlaq_n_s16(sum, s7, filter[7]);
+
+  res = vqrshlq_s16(sum, shift_round_0);
+
+  return res;
+}
+
+static INLINE int16x4_t
+convolve8_4x4_s16(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                  const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                  const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+                  const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+  int16x4_t sum;
+  sum = horiz_const;
+  sum = vmla_n_s16(sum, s0, filter[0]);
+  sum = vmla_n_s16(sum, s1, filter[1]);
+  sum = vmla_n_s16(sum, s2, filter[2]);
+  sum = vmla_n_s16(sum, s3, filter[3]);
+  sum = vmla_n_s16(sum, s4, filter[4]);
+  sum = vmla_n_s16(sum, s5, filter[5]);
+  sum = vmla_n_s16(sum, s6, filter[6]);
+  sum = vmla_n_s16(sum, s7, filter[7]);
+
+  sum = vqrshl_s16(sum, shift_round_0);
+
+  return sum;
+}
+
+static INLINE uint16x4_t convolve8_4x4_s32(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+    const int32x4_t round_shift_vec, const int32x4_t offset_const) {
+  int32x4_t sum0;
+  uint16x4_t res;
+  const int32x4_t zero = vdupq_n_s32(0);
+
+  sum0 = vmull_n_s16(s0, y_filter[0]);
+  sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
+  sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
+  sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
+  sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
+  sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
+  sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
+  sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
+
+  sum0 = vaddq_s32(sum0, offset_const);
+  sum0 = vqrshlq_s32(sum0, round_shift_vec);
+  sum0 = vmaxq_s32(sum0, zero);
+  res = vmovn_u32(vreinterpretq_u32_s32(sum0));
+
+  return res;
+}
+
+#endif  // AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/intrapred_neon.c b/third_party/aom/av1/common/arm/intrapred_neon.c
new file mode 100644
index 000000000..799355553
--- /dev/null
+++ b/third_party/aom/av1/common/arm/intrapred_neon.c
@@ -0,0 +1,79 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void highbd_dc_predictor_neon(uint16_t *dst, ptrdiff_t stride,
+                                            int bw, const uint16_t *above,
+                                            const uint16_t *left) {
+  assert(bw >= 4);
+  assert(IS_POWER_OF_TWO(bw));
+  int expected_dc, sum = 0;
+  const int count = bw * 2;
+  uint32x4_t sum_q = vdupq_n_u32(0);
+  uint32x2_t sum_d;
+  uint16_t *dst_1;
+  if (bw >= 8) {
+    for (int i = 0; i < bw; i += 8) {
+      sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
+      sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
+      above += 8;
+      left += 8;
+    }
+    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
+    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
+    expected_dc = (sum + (count >> 1)) / count;
+    const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
+    for (int r = 0; r < bw; r++) {
+      dst_1 = dst;
+      for (int i = 0; i < bw; i += 8) {
+        vst1q_u16(dst_1, dc);
+        dst_1 += 8;
+      }
+      dst += stride;
+    }
+  } else {  // 4x4
+    sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
+    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
+    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
+    expected_dc = (sum + (count >> 1)) / count;
+    const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
+    for (int r = 0; r < bw; r++) {
+      vst1_u16(dst, dc);
+      dst += stride;
+    }
+  }
+}
+
+#define intra_pred_highbd_sized(type, width)                         \
+  void aom_highbd_##type##_predictor_##width##x##width##_neon(       \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,        \
+      const uint16_t *left, int bd) {                                \
+    (void)bd;                                                        \
+    highbd_##type##_predictor_neon(dst, stride, width, above, left); \
+  }
+
+#define intra_pred_square(type)      \
+  intra_pred_highbd_sized(type, 4);  \
+  intra_pred_highbd_sized(type, 8);  \
+  intra_pred_highbd_sized(type, 16); \
+  intra_pred_highbd_sized(type, 32); \
+  intra_pred_highbd_sized(type, 64);
+
+intra_pred_square(dc);
+
+#undef intra_pred_square
diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
new file mode 100644
index 000000000..992be4a9e
--- /dev/null
+++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
@@ -0,0 +1,1326 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+static INLINE void compute_avg_4x4(
+    uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3,
+    uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
+    const uint16_t fwd_offset, const uint16_t bck_offset,
+    const int16x4_t sub_const_vec, const int16_t round_bits,
+    const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+  int16x4_t tmp0, tmp1, tmp2, tmp3;
+  uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
+  uint32x4_t sum0, sum1, sum2, sum3;
+
+  int32x4_t dst0, dst1, dst2, dst3;
+  int16x8_t tmp4, tmp5;
+  const int16x8_t zero = vdupq_n_s16(0);
+
+  if (use_jnt_comp_avg) {
+    const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
+    const int32x4_t const_vec = vmovl_s16(sub_const_vec);
+
+    sum0 = vmull_n_u16(res0, fwd_offset);
+    sum0 = vmlal_n_u16(sum0, d0, bck_offset);
+    sum1 = vmull_n_u16(res1, fwd_offset);
+    sum1 = vmlal_n_u16(sum1, d1, bck_offset);
+    sum2 = vmull_n_u16(res2, fwd_offset);
+    sum2 = vmlal_n_u16(sum2, d2, bck_offset);
+    sum3 = vmull_n_u16(res3, fwd_offset);
+    sum3 = vmlal_n_u16(sum3, d3, bck_offset);
+
+    sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+    sum1 = vshrq_n_u32(sum1, DIST_PRECISION_BITS);
+    sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+    sum3 = vshrq_n_u32(sum3, DIST_PRECISION_BITS);
+
+    dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), const_vec);
+    dst1 = vsubq_s32(vreinterpretq_s32_u32(sum1), const_vec);
+    dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), const_vec);
+    dst3 = vsubq_s32(vreinterpretq_s32_u32(sum3), const_vec);
+
+    dst0 = vqrshlq_s32(dst0, round_bits_vec);
+    dst1 = vqrshlq_s32(dst1, round_bits_vec);
+    dst2 = vqrshlq_s32(dst2, round_bits_vec);
+    dst3 = vqrshlq_s32(dst3, round_bits_vec);
+
+    tmp0 = vqmovn_s32(dst0);
+    tmp1 = vqmovn_s32(dst1);
+    tmp2 = vqmovn_s32(dst2);
+    tmp3 = vqmovn_s32(dst3);
+    tmp4 = vcombine_s16(tmp0, tmp1);
+    tmp5 = vcombine_s16(tmp2, tmp3);
+    tmp4 = vmaxq_s16(tmp4, zero);
+    tmp5 = vmaxq_s16(tmp5, zero);
+
+    *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4));
+    *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5));
+  } else {
+    const int16x4_t round_bits_vec = vdup_n_s16(-round_bits);
+    tmp_u0 = vhadd_u16(res0, d0);
+    tmp_u1 = vhadd_u16(res1, d1);
+    tmp_u2 = vhadd_u16(res2, d2);
+    tmp_u3 = vhadd_u16(res3, d3);
+
+    tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec);
+    tmp1 = vsub_s16(vreinterpret_s16_u16(tmp_u1), sub_const_vec);
+    tmp2 = vsub_s16(vreinterpret_s16_u16(tmp_u2), sub_const_vec);
+    tmp3 = vsub_s16(vreinterpret_s16_u16(tmp_u3), sub_const_vec);
+
+    tmp0 = vqrshl_s16(tmp0, round_bits_vec);
+    tmp1 = vqrshl_s16(tmp1, round_bits_vec);
+    tmp2 = vqrshl_s16(tmp2, round_bits_vec);
+    tmp3 = vqrshl_s16(tmp3, round_bits_vec);
+
+    tmp4 = vcombine_s16(tmp0, tmp1);
+    tmp5 = vcombine_s16(tmp2, tmp3);
+    tmp4 = vmaxq_s16(tmp4, zero);
+    tmp5 = vmaxq_s16(tmp5, zero);
+
+    *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4));
+    *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5));
+  }
+}
+
+static INLINE void compute_avg_8x4(
+    uint16x8_t res0, uint16x8_t res1, uint16x8_t res2, uint16x8_t res3,
+    uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
+    const uint16_t fwd_offset, const uint16_t bck_offset,
+    const int16x4_t sub_const, const int16_t round_bits,
+    const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
+    uint8x8_t *t3) {
+  int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int16x8_t f0, f1, f2, f3;
+  uint32x4_t sum0, sum1, sum2, sum3;
+  uint32x4_t sum4, sum5, sum6, sum7;
+  int32x4_t dst0, dst1, dst2, dst3;
+  int32x4_t dst4, dst5, dst6, dst7;
+  uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
+  const int16x8_t zero = vdupq_n_s16(0);
+
+  if (use_jnt_comp_avg) {
+    const int32x4_t sub_const_vec = vmovl_s16(sub_const);
+    const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
+
+    sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset);
+    sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset);
+    sum1 = vmull_n_u16(vget_low_u16(res1), fwd_offset);
+    sum1 = vmlal_n_u16(sum1, vget_low_u16(d1), bck_offset);
+    sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+    sum1 = vshrq_n_u32(sum1, DIST_PRECISION_BITS);
+
+    sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset);
+    sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset);
+    sum3 = vmull_n_u16(vget_high_u16(res1), fwd_offset);
+    sum3 = vmlal_n_u16(sum3, vget_high_u16(d1), bck_offset);
+    sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+    sum3 = vshrq_n_u32(sum3, DIST_PRECISION_BITS);
+
+    sum4 = vmull_n_u16(vget_low_u16(res2), fwd_offset);
+    sum4 = vmlal_n_u16(sum4, vget_low_u16(d2), bck_offset);
+    sum5 = vmull_n_u16(vget_low_u16(res3), fwd_offset);
+    sum5 = vmlal_n_u16(sum5, vget_low_u16(d3), bck_offset);
+    sum4 = vshrq_n_u32(sum4, DIST_PRECISION_BITS);
+    sum5 = vshrq_n_u32(sum5, DIST_PRECISION_BITS);
+
+    sum6 = vmull_n_u16(vget_high_u16(res2), fwd_offset);
+    sum6 = vmlal_n_u16(sum6, vget_high_u16(d2), bck_offset);
+    sum7 = vmull_n_u16(vget_high_u16(res3), fwd_offset);
+    sum7 = vmlal_n_u16(sum7, vget_high_u16(d3), bck_offset);
+    sum6 = vshrq_n_u32(sum6, DIST_PRECISION_BITS);
+    sum7 = vshrq_n_u32(sum7, DIST_PRECISION_BITS);
+
+    dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec);
+    dst1 = vsubq_s32(vreinterpretq_s32_u32(sum1), sub_const_vec);
+    dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec);
+    dst3 = vsubq_s32(vreinterpretq_s32_u32(sum3), sub_const_vec);
+    dst4 = vsubq_s32(vreinterpretq_s32_u32(sum4), sub_const_vec);
+    dst5 = vsubq_s32(vreinterpretq_s32_u32(sum5), sub_const_vec);
+    dst6 = vsubq_s32(vreinterpretq_s32_u32(sum6), sub_const_vec);
+    dst7 = vsubq_s32(vreinterpretq_s32_u32(sum7), sub_const_vec);
+
+    dst0 = vqrshlq_s32(dst0, round_bits_vec);
+    dst1 = vqrshlq_s32(dst1, round_bits_vec);
+    dst2 = vqrshlq_s32(dst2, round_bits_vec);
+    dst3 = vqrshlq_s32(dst3, round_bits_vec);
+    dst4 = vqrshlq_s32(dst4, round_bits_vec);
+    dst5 = vqrshlq_s32(dst5, round_bits_vec);
+    dst6 = vqrshlq_s32(dst6, round_bits_vec);
+    dst7 = vqrshlq_s32(dst7, round_bits_vec);
+
+    tmp0 = vqmovn_s32(dst0);
+    tmp1 = vqmovn_s32(dst1);
+    tmp2 = vqmovn_s32(dst2);
+    tmp3 = vqmovn_s32(dst3);
+    tmp4 = vqmovn_s32(dst4);
+    tmp5 = vqmovn_s32(dst5);
+    tmp6 = vqmovn_s32(dst6);
+    tmp7 = vqmovn_s32(dst7);
+
+    f0 = vcombine_s16(tmp0, tmp2);
+    f1 = vcombine_s16(tmp1, tmp3);
+    f2 = vcombine_s16(tmp4, tmp6);
+    f3 = vcombine_s16(tmp5, tmp7);
+
+    f0 = vmaxq_s16(f0, zero);
+    f1 = vmaxq_s16(f1, zero);
+    f2 = vmaxq_s16(f2, zero);
+    f3 = vmaxq_s16(f3, zero);
+
+    *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0));
+    *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1));
+    *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2));
+    *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3));
+
+  } else {
+    const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
+    const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits);
+
+    tmp_u0 = vhaddq_u16(res0, d0);
+    tmp_u1 = vhaddq_u16(res1, d1);
+    tmp_u2 = vhaddq_u16(res2, d2);
+    tmp_u3 = vhaddq_u16(res3, d3);
+
+    f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec);
+    f1 = vsubq_s16(vreinterpretq_s16_u16(tmp_u1), sub_const_vec);
+    f2 = vsubq_s16(vreinterpretq_s16_u16(tmp_u2), sub_const_vec);
+    f3 = vsubq_s16(vreinterpretq_s16_u16(tmp_u3), sub_const_vec);
+
+    f0 = vqrshlq_s16(f0, round_bits_vec);
+    f1 = vqrshlq_s16(f1, round_bits_vec);
+    f2 = vqrshlq_s16(f2, round_bits_vec);
+    f3 = vqrshlq_s16(f3, round_bits_vec);
+
+    f0 = vmaxq_s16(f0, zero);
+    f1 = vmaxq_s16(f1, zero);
+    f2 = vmaxq_s16(f2, zero);
+    f3 = vmaxq_s16(f3, zero);
+
+    *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0));
+    *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1));
+    *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2));
+    *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3));
+  }
+}
+
+static INLINE void jnt_convolve_2d_horiz_neon(
+    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+    int16_t *x_filter_tmp, const int im_h, int w, const int round_0) {
+  const int bd = 8;
+  const uint8_t *s;
+  int16_t *dst_ptr;
+  int dst_stride;
+  int width, height;
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+  dst_ptr = im_block;
+  dst_stride = im_stride;
+  height = im_h;
+  width = w;
+
+  if (w == 4) {
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+    int16x8_t tt0, tt1, tt2, tt3;
+
+    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
+    const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
+
+    do {
+      s = src;
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      s0 = vget_low_s16(tt0);
+      s1 = vget_low_s16(tt1);
+      s2 = vget_low_s16(tt2);
+      s3 = vget_low_s16(tt3);
+      s4 = vget_high_s16(tt0);
+      s5 = vget_high_s16(tt1);
+      s6 = vget_high_s16(tt2);
+      __builtin_prefetch(dst_ptr + 0 * dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * dst_stride);
+      s += 7;
+
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      s7 = vget_low_s16(tt0);
+      s8 = vget_low_s16(tt1);
+      s9 = vget_low_s16(tt2);
+      s10 = vget_low_s16(tt3);
+
+      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                             horiz_const, shift_round_0);
+      d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+                             horiz_const, shift_round_0);
+      d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+                             horiz_const, shift_round_0);
+      d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+                             horiz_const, shift_round_0);
+
+      transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+
+      vst1_s16((dst_ptr + 0 * dst_stride), d0);
+      vst1_s16((dst_ptr + 1 * dst_stride), d1);
+      vst1_s16((dst_ptr + 2 * dst_stride), d2);
+      vst1_s16((dst_ptr + 3 * dst_stride), d3);
+
+      src += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 0);
+  } else {
+    int16_t *d_tmp;
+    int16x8_t s11, s12, s13, s14;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+
+    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
+    const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
+
+    do {
+      __builtin_prefetch(src + 0 * src_stride);
+      __builtin_prefetch(src + 1 * src_stride);
+      __builtin_prefetch(src + 2 * src_stride);
+      __builtin_prefetch(src + 3 * src_stride);
+      __builtin_prefetch(src + 4 * src_stride);
+      __builtin_prefetch(src + 5 * src_stride);
+      __builtin_prefetch(src + 6 * src_stride);
+      __builtin_prefetch(src + 7 * src_stride);
+      load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      width = w;
+      s = src + 7;
+      d_tmp = dst_ptr;
+      __builtin_prefetch(dst_ptr + 0 * dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * dst_stride);
+      __builtin_prefetch(dst_ptr + 4 * dst_stride);
+      __builtin_prefetch(dst_ptr + 5 * dst_stride);
+      __builtin_prefetch(dst_ptr + 6 * dst_stride);
+      __builtin_prefetch(dst_ptr + 7 * dst_stride);
+
+      do {
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+                                 horiz_const, shift_round_0);
+        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+                                 x_filter_tmp, horiz_const, shift_round_0);
+        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+                                 x_filter_tmp, horiz_const, shift_round_0);
+        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+                                 x_filter_tmp, horiz_const, shift_round_0);
+
+        transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+                          &res7);
+
+        store_s16_8x8(d_tmp, dst_stride, res0, res1, res2, res3, res4, res5,
+                      res6, res7);
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 8 * src_stride;
+      dst_ptr += 8 * dst_stride;
+      height -= 8;
+    } while (height > 0);
+  }
+}
+
+static INLINE void jnt_convolve_2d_vert_neon(
+    int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
+    ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) {
+  uint8_t *dst_u8_ptr, *d_u8;
+  CONV_BUF_TYPE *dst_ptr, *dst;
+  int16_t *src_ptr, *s;
+  uint16_t *d;
+
+  const int bd = 8;
+  int height;
+  int dst_stride = conv_params->dst_stride;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int16_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+                            (1 << (offset_bits - conv_params->round_1 - 1));
+
+  const int16_t round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+  const int32x4_t offset_const = vdupq_n_s32(1 << offset);
+  const int16x4_t sub_const_vec = vdup_n_s16(sub_const);
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+  int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  uint16x4_t res4, res5, res6, res7;
+  uint16x4_t d0, d1, d2, d3;
+  uint8x8_t t0, t1;
+
+  dst = conv_params->dst;
+  src_ptr = im_block;
+  dst_u8_ptr = dst8;
+  dst_ptr = dst;
+  height = h;
+
+  do {
+    d = dst_ptr;
+    d_u8 = dst_u8_ptr;
+    s = src_ptr;
+    height = h;
+
+    __builtin_prefetch(s + 0 * im_stride);
+    __builtin_prefetch(s + 1 * im_stride);
+    __builtin_prefetch(s + 2 * im_stride);
+    __builtin_prefetch(s + 3 * im_stride);
+    __builtin_prefetch(s + 4 * im_stride);
+    __builtin_prefetch(s + 5 * im_stride);
+    __builtin_prefetch(s + 6 * im_stride);
+    __builtin_prefetch(s + 7 * im_stride);
+
+    load_s16_4x8(s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+    s += (7 * im_stride);
+
+    do {
+      load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10);
+      s += (im_stride << 2);
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+
+      __builtin_prefetch(d_u8 + 4 * dst8_stride);
+      __builtin_prefetch(d_u8 + 5 * dst8_stride);
+      __builtin_prefetch(d_u8 + 6 * dst8_stride);
+      __builtin_prefetch(d_u8 + 7 * dst8_stride);
+
+      d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                             round_shift_vec, offset_const);
+      d1 = convolve8_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                             round_shift_vec, offset_const);
+      d2 = convolve8_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                             round_shift_vec, offset_const);
+      d3 = convolve8_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                             round_shift_vec, offset_const);
+
+      if (do_average) {
+        load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7);
+        d += (dst_stride << 2);
+
+        compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset,
+                        bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg,
+                        &t0, &t1);
+
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+                      0);  // 00 01 02 03
+        d_u8 += dst8_stride;
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+                      1);  // 10 11 12 13
+        d_u8 += dst8_stride;
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
+                      0);  // 20 21 22 23
+        d_u8 += dst8_stride;
+        vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
+                      1);  // 30 31 32 33
+        d_u8 += dst8_stride;
+
+      } else {
+        store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+        d += (dst_stride << 2);
+      }
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      height -= 4;
+    } while (height > 0);
+    src_ptr += 4;
+    dst_ptr += 4;
+    dst_u8_ptr += 4;
+    w -= 4;
+  } while (w > 0);
+}
+
+void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
+                              int dst8_stride, int w, int h,
+                              InterpFilterParams *filter_params_x,
+                              InterpFilterParams *filter_params_y,
+                              const int subpel_x_q4, const int subpel_y_q4,
+                              ConvolveParams *conv_params) {
+  assert(!(w % 4));
+  assert(!(h % 4));
+
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+  const int im_h = h + filter_params_y->taps - 1;
+  const int im_stride = MAX_SB_SIZE;
+  const int vert_offset = filter_params_y->taps / 2 - 1;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const int round_0 = conv_params->round_0 - 1;
+  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+  int16_t x_filter_tmp[8];
+  int16x8_t filter_x_coef = vld1q_s16(x_filter);
+
+  // filter coeffs are even, so downshifting by 1 to reduce intermediate
+  // precision requirements.
+  filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
+  vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+
+  jnt_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
+                             x_filter_tmp, im_h, w, round_0);
+
+  jnt_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, conv_params,
+                            y_filter, h, w);
+}
+
+void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
+                                   uint8_t *dst8, int dst8_stride, int w, int h,
+                                   InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params) {
+  uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
+      tmp_shift3;
+  uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3;
+  uint16x4_t tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7;
+  const uint8_t *src1, *src2;
+  uint8_t *dst8_1;
+  CONV_BUF_TYPE *dst = conv_params->dst, *dst_1, *dst_2;
+  const int dst_stride = conv_params->dst_stride;
+  int x, y;
+  const int16_t bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int16x4_t sub_const_vec = vdup_n_s16((int16_t)round_offset);
+  const uint16x8_t dup_round_offset16x8 = vdupq_n_u16((uint16_t)round_offset);
+  const int16x4_t dup_bits16x4 = vdup_n_s16(bits);
+  const int16x8_t dup_bits16x8 = vdupq_n_s16(bits);
+
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+
+  if (!(w & 0x07)) {
+    for (y = 0; y < (h >> 2); ++y) {
+      src1 = src;
+      dst8_1 = dst8;
+      dst_1 = dst;
+      for (x = 0; x < (w >> 3); ++x) {
+        src2 = src1;
+        load_u8_8x4(src2, src_stride, &res0_8, &res1_8, &res2_8, &res3_8);
+
+        res_q0 = vaddq_u16(vshlq_u16(vmovl_u8(res0_8), dup_bits16x8),
+                           dup_round_offset16x8);
+        res_q1 = vaddq_u16(vshlq_u16(vmovl_u8(res1_8), dup_bits16x8),
+                           dup_round_offset16x8);
+        res_q2 = vaddq_u16(vshlq_u16(vmovl_u8(res2_8), dup_bits16x8),
+                           dup_round_offset16x8);
+        res_q3 = vaddq_u16(vshlq_u16(vmovl_u8(res3_8), dup_bits16x8),
+                           dup_round_offset16x8);
+
+        if (conv_params->do_average) {
+          dst_2 = dst_1;
+          load_u16_8x4(dst_2, dst_stride, &tmp_q0, &tmp_q1, &tmp_q2, &tmp_q3);
+
+          compute_avg_8x4(tmp_q0, tmp_q1, tmp_q2, tmp_q3, res_q0, res_q1,
+                          res_q2, res_q3, conv_params->fwd_offset,
+                          conv_params->bck_offset, sub_const_vec, bits,
+                          conv_params->use_jnt_comp_avg, &tmp_shift0,
+                          &tmp_shift1, &tmp_shift2, &tmp_shift3);
+
+          vst1_u8(dst8_1 + (0 * dst8_stride), tmp_shift0);
+          vst1_u8(dst8_1 + (1 * dst8_stride), tmp_shift1);
+          vst1_u8(dst8_1 + (2 * dst8_stride), tmp_shift2);
+          vst1_u8(dst8_1 + (3 * dst8_stride), tmp_shift3);
+
+        } else {
+          vst1q_u16(dst_1 + (0 * dst_stride), res_q0);
+          vst1q_u16(dst_1 + (1 * dst_stride), res_q1);
+          vst1q_u16(dst_1 + (2 * dst_stride), res_q2);
+          vst1q_u16(dst_1 + (3 * dst_stride), res_q3);
+        }
+        src1 = src1 + 8;
+        dst_1 = dst_1 + 8;
+        dst8_1 = dst8_1 + 8;
+      }
+      src += src_stride * 4;
+      dst8 += dst8_stride * 4;
+      dst += dst_stride * 4;
+    }
+  } else if (!(w & 0x03)) {
+    for (y = 0; y < (h >> 2); ++y) {
+      src1 = src;
+      dst8_1 = dst8;
+      dst_1 = dst;
+
+      load_u8_8x4(src1, src_stride, &res0_8, &res1_8, &res2_8, &res3_8);
+
+      res4 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res0_8)), dup_bits16x4),
+                      vreinterpret_u16_s16(sub_const_vec));
+      res5 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res1_8)), dup_bits16x4),
+                      vreinterpret_u16_s16(sub_const_vec));
+      res6 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res2_8)), dup_bits16x4),
+                      vreinterpret_u16_s16(sub_const_vec));
+      res7 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res3_8)), dup_bits16x4),
+                      vreinterpret_u16_s16(sub_const_vec));
+      if (conv_params->do_average) {
+        load_u16_4x4(dst_1, dst_stride, &tmp4, &tmp5, &tmp6, &tmp7);
+
+        compute_avg_4x4(tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7,
+                        conv_params->fwd_offset, conv_params->bck_offset,
+                        sub_const_vec, bits, conv_params->use_jnt_comp_avg,
+                        &tmp_shift0, &tmp_shift1);
+
+        vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 0);
+        dst8_1 += dst8_stride;
+        vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 1);
+        dst8_1 += dst8_stride;
+        vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift1), 0);
+        dst8_1 += dst8_stride;
+        vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift1), 1);
+
+      } else {
+        vst1_u16(dst_1, res4);
+        dst_1 += dst_stride;
+        vst1_u16(dst_1, res5);
+        dst_1 += dst_stride;
+        vst1_u16(dst_1, res6);
+        dst_1 += dst_stride;
+        vst1_u16(dst_1, res7);
+      }
+      src += src_stride * 4;
+      dst += dst_stride * 4;
+      dst8 += dst8_stride * 4;
+    }
+  }
+}
+
+void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
+                             int dst8_stride, int w, int h,
+                             InterpFilterParams *filter_params_x,
+                             InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
+  assert(!(w % 4));
+  assert(!(h % 4));
+
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const int bits = FILTER_BITS - conv_params->round_1;
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+
+  // horizontal filter
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+  const uint8_t *src_ptr = src - horiz_offset;
+
+  int16_t x_filter_tmp[8];
+  int16x8_t filter_x_coef = vld1q_s16(x_filter);
+
+  // filter coeffs are even, so downshifting by 1 to reduce intermediate
+  // precision requirements.
+  filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
+  vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+
+  const uint8_t *s;
+  uint8_t *d_u8;
+  uint8_t *dst_u8_ptr;
+  CONV_BUF_TYPE *d, *dst_ptr;
+  int width, height;
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+  s = src_ptr;
+  dst_ptr = dst;
+  dst_u8_ptr = dst8;
+  width = w;
+  height = h;
+
+  if ((w == 4) || (h == 4)) {
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+    int16x8_t tt0, tt1, tt2, tt3;
+    uint16x4_t res4, res5, res6, res7;
+    uint32x2_t tu0, tu1;
+    int16x8_t u0, u1;
+    const int16x4_t zero = vdup_n_s16(0);
+    const int16x4_t round_offset_vec = vdup_n_s16(round_offset);
+    const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1);
+    const int16x4_t horiz_const = vdup_n_s16(bits);
+    do {
+      s = src_ptr;
+      d = dst_ptr;
+      d_u8 = dst_u8_ptr;
+      width = w;
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      s0 = vget_low_s16(tt0);
+      s1 = vget_low_s16(tt1);
+      s2 = vget_low_s16(tt2);
+      s3 = vget_low_s16(tt3);
+      s4 = vget_high_s16(tt0);
+      s5 = vget_high_s16(tt1);
+      s6 = vget_high_s16(tt2);
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+      s += 7;
+      do {
+        load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
+        t0 = vreinterpret_u8_u32(tu0);
+        t1 = vreinterpret_u8_u32(tu1);
+
+        transpose_u8_4x4(&t0, &t1);
+        u0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        u1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+
+        s7 = vget_low_s16(u0);
+        s8 = vget_low_s16(u1);
+        s9 = vget_high_s16(u0);
+        s10 = vget_high_s16(u1);
+
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                               zero, shift_round_0);
+        d0 = vrshl_s16(d0, horiz_const);
+        d0 = vadd_s16(d0, round_offset_vec);
+        d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+                               zero, shift_round_0);
+        d1 = vrshl_s16(d1, horiz_const);
+        d1 = vadd_s16(d1, round_offset_vec);
+        d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+                               zero, shift_round_0);
+        d2 = vrshl_s16(d2, horiz_const);
+        d2 = vadd_s16(d2, round_offset_vec);
+        d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+                               zero, shift_round_0);
+        d3 = vrshl_s16(d3, horiz_const);
+        d3 = vadd_s16(d3, round_offset_vec);
+
+        transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+
+        if (conv_params->do_average) {
+          __builtin_prefetch(d + 0 * dst_stride);
+          __builtin_prefetch(d + 1 * dst_stride);
+          __builtin_prefetch(d + 2 * dst_stride);
+          __builtin_prefetch(d + 3 * dst_stride);
+
+          __builtin_prefetch(d_u8 + 0 * dst8_stride);
+          __builtin_prefetch(d_u8 + 1 * dst8_stride);
+          __builtin_prefetch(d_u8 + 2 * dst8_stride);
+          __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+          load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7);
+
+          compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
+                          vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+                          vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
+                          round_offset_vec, round_bits, use_jnt_comp_avg, &t0,
+                          &t1);
+
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+                        0);  // 00 01 02 03
+          vst1_lane_u32((uint32_t *)(d_u8 + dst8_stride),
+                        vreinterpret_u32_u8(t0),
+                        1);  // 10 11 12 13
+          vst1_lane_u32((uint32_t *)(d_u8 + 2 * dst8_stride),
+                        vreinterpret_u32_u8(t1),
+                        0);  // 20 21 22 23
+          vst1_lane_u32((uint32_t *)(d_u8 + 3 * dst8_stride),
+                        vreinterpret_u32_u8(t1),
+                        1);  // 30 31 32 33
+        } else {
+          store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0),
+                        vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+                        vreinterpret_u16_s16(d3));
+        }
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+
+        s += 4;
+        width -= 4;
+        d += 4;
+        d_u8 += 4;
+      } while (width > 0);
+      src_ptr += (src_stride << 2);
+      dst_ptr += (dst_stride << 2);
+      dst_u8_ptr += (dst8_stride << 2);
+      height -= 4;
+    } while (height > 0);
+  } else {
+    CONV_BUF_TYPE *d_tmp;
+    uint8_t *d_u8_tmp;
+    int16x8_t s11, s12, s13, s14;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+    uint16x8_t res8, res9, res10, res11;
+
+    const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
+    const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+    const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1);
+    const int16x8_t horiz_const = vdupq_n_s16(bits);
+    const int16x8_t zero = vdupq_n_s16(0);
+
+    d = dst_ptr = dst;
+    d_u8 = dst_u8_ptr = dst8;
+    do {
+      __builtin_prefetch(src_ptr + 0 * src_stride);
+      __builtin_prefetch(src_ptr + 1 * src_stride);
+      __builtin_prefetch(src_ptr + 2 * src_stride);
+      __builtin_prefetch(src_ptr + 3 * src_stride);
+      __builtin_prefetch(src_ptr + 4 * src_stride);
+      __builtin_prefetch(src_ptr + 5 * src_stride);
+      __builtin_prefetch(src_ptr + 6 * src_stride);
+      __builtin_prefetch(src_ptr + 7 * src_stride);
+      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      width = w;
+      s = src_ptr + 7;
+      d = dst_ptr;
+      d_u8_tmp = dst_u8_ptr;
+
+      __builtin_prefetch(dst_ptr + 0 * dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * dst_stride);
+      __builtin_prefetch(dst_ptr + 4 * dst_stride);
+      __builtin_prefetch(dst_ptr + 5 * dst_stride);
+      __builtin_prefetch(dst_ptr + 6 * dst_stride);
+      __builtin_prefetch(dst_ptr + 7 * dst_stride);
+
+      do {
+        d_u8 = d_u8_tmp;
+        d_tmp = d;
+
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                                 zero, shift_round_0);
+
+        res0 = vrshlq_s16(res0, horiz_const);
+        res0 = vaddq_s16(res0, round_offset128);
+
+        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+                                 zero, shift_round_0);
+        res1 = vrshlq_s16(res1, horiz_const);
+        res1 = vaddq_s16(res1, round_offset128);
+        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+                                 zero, shift_round_0);
+        res2 = vrshlq_s16(res2, horiz_const);
+        res2 = vaddq_s16(res2, round_offset128);
+        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+                                 zero, shift_round_0);
+        res3 = vrshlq_s16(res3, horiz_const);
+        res3 = vaddq_s16(res3, round_offset128);
+        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+                                 zero, shift_round_0);
+        res4 = vrshlq_s16(res4, horiz_const);
+        res4 = vaddq_s16(res4, round_offset128);
+        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+                                 x_filter_tmp, zero, shift_round_0);
+        res5 = vrshlq_s16(res5, horiz_const);
+        res5 = vaddq_s16(res5, round_offset128);
+        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+                                 x_filter_tmp, zero, shift_round_0);
+        res6 = vrshlq_s16(res6, horiz_const);
+        res6 = vaddq_s16(res6, round_offset128);
+        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+                                 x_filter_tmp, zero, shift_round_0);
+        res7 = vrshlq_s16(res7, horiz_const);
+        res7 = vaddq_s16(res7, round_offset128);
+
+        transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+                          &res7);
+
+        if (conv_params->do_average) {
+          load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
+          d_tmp += (dst_stride << 2);
+
+          compute_avg_8x4(
+              res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+              vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+              vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+              round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+          store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+          d_u8 += (dst8_stride << 2);
+
+          load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
+          d_tmp += (dst_stride << 2);
+
+          compute_avg_8x4(
+              res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+              vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+              vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+              round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+          store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+          d_u8 += (dst8_stride << 2);
+        } else {
+          store_u16_8x8(
+              d_tmp, dst_stride, vreinterpretq_u16_s16(res0),
+              vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+              vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4),
+              vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+              vreinterpretq_u16_s16(res7));
+          d_tmp += (dst_stride << 3);
+        }
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8;
+        d += 8;
+        width -= 8;
+        d_u8_tmp += 8;
+      } while (width > 0);
+      src_ptr += 8 * src_stride;
+      dst_ptr += 8 * dst_stride;
+      dst_u8_ptr += 8 * dst8_stride;
+      height -= 8;
+    } while (height > 0);
+  }
+}
+
+void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
+                             int dst8_stride, int w, int h,
+                             InterpFilterParams *filter_params_x,
+                             InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
+  assert(!(w % 4));
+  assert(!(h % 4));
+
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  const int vert_offset = filter_params_y->taps / 2 - 1;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int shift_value = (conv_params->round_1 - 1 - bits);
+
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+
+  // vertical filter
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+  const uint8_t *src_ptr = src - (vert_offset * src_stride);
+
+  int16_t y_filter_tmp[8];
+  int16x8_t filter_y_coef = vld1q_s16(y_filter);
+
+  // filter coeffs are even, so downshifting by 1 to reduce intermediate
+  // precision requirements.
+  filter_y_coef = vshrq_n_s16(filter_y_coef, 1);
+  vst1q_s16(&y_filter_tmp[0], filter_y_coef);
+
+  const uint8_t *s;
+  uint8_t *d_u8;
+  uint8_t *dst_u8_ptr;
+  CONV_BUF_TYPE *d, *dst_ptr;
+  int width, height;
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+  s = src_ptr;
+  dst_ptr = dst;
+  dst_u8_ptr = dst8;
+  width = w;
+  height = h;
+
+  // used to get rid of multiplication = (vertical filter output sum) *
+  // (1<<bits).
+  assert((conv_params->round_1 - 2) >= bits);
+
+  if ((w == 4) || (h == 4)) {
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+    uint16x4_t res4, res5, res6, res7;
+    uint32x2_t tu0, tu1, tu2, tu3;
+    int16x8_t u0, u1, u2, u3;
+
+    const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+    const int16x4_t shift_vec = vdup_n_s16(-shift_value);
+    const int16x4_t zero = vdup_n_s16(0);
+
+    do {
+      s = src_ptr;
+      d = dst_ptr;
+      d_u8 = dst_u8_ptr;
+      height = h;
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      load_unaligned_u8_4x8(s, src_stride, &tu0, &tu1, &tu2, &tu3);
+
+      u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+      u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+      u2 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu2)));
+      u3 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu3)));
+
+      s0 = vget_low_s16(u0);
+      s1 = vget_high_s16(u0);
+      s2 = vget_low_s16(u1);
+      s3 = vget_high_s16(u1);
+      s4 = vget_low_s16(u2);
+      s5 = vget_high_s16(u2);
+      s6 = vget_low_s16(u3);
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+
+      s += (7 * src_stride);
+      do {
+        load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
+
+        u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+        u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+
+        s7 = vget_low_s16(u0);
+        s8 = vget_high_s16(u0);
+        s9 = vget_low_s16(u1);
+        s10 = vget_high_s16(u1);
+
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+                               zero, shift_vec);
+        d0 = vadd_s16(d0, round_offset64);
+        d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
+                               zero, shift_vec);
+        d1 = vadd_s16(d1, round_offset64);
+        d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
+                               zero, shift_vec);
+        d2 = vadd_s16(d2, round_offset64);
+        d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
+                               zero, shift_vec);
+        d3 = vadd_s16(d3, round_offset64);
+
+        if (conv_params->do_average) {
+          __builtin_prefetch(d + 0 * dst_stride);
+          __builtin_prefetch(d + 1 * dst_stride);
+          __builtin_prefetch(d + 2 * dst_stride);
+          __builtin_prefetch(d + 3 * dst_stride);
+
+          __builtin_prefetch(d_u8 + 0 * dst8_stride);
+          __builtin_prefetch(d_u8 + 1 * dst8_stride);
+          __builtin_prefetch(d_u8 + 2 * dst8_stride);
+          __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+          load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7);
+          d += (dst_stride << 2);
+
+          compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
+                          vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+                          vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
+                          round_offset64, round_bits, use_jnt_comp_avg, &t0,
+                          &t1);
+
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+                        0);  // 00 01 02 03
+          d_u8 += dst8_stride;
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+                        1);  // 10 11 12 13
+          d_u8 += dst8_stride;
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
+                        0);  // 20 21 22 23
+          d_u8 += dst8_stride;
+          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
+                        1);  // 30 31 32 33
+          d_u8 += dst8_stride;
+        } else {
+          store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0),
+                        vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+                        vreinterpret_u16_s16(d3));
+          d += (dst_stride << 2);
+        }
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+
+        s += (src_stride << 2);
+        height -= 4;
+      } while (height > 0);
+      src_ptr += 4;
+      dst_ptr += 4;
+      dst_u8_ptr += 4;
+      width -= 4;
+    } while (width > 0);
+  } else {
+    CONV_BUF_TYPE *d_tmp;
+    int16x8_t s11, s12, s13, s14;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+    uint16x8_t res8, res9, res10, res11;
+    const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
+    const int16x8_t shift_vec = vdupq_n_s16(-shift_value);
+    const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+    const int16x8_t zero = vdupq_n_s16(0);
+
+    dst_ptr = dst;
+    dst_u8_ptr = dst8;
+    do {
+      __builtin_prefetch(src_ptr + 0 * src_stride);
+      __builtin_prefetch(src_ptr + 1 * src_stride);
+      __builtin_prefetch(src_ptr + 2 * src_stride);
+      __builtin_prefetch(src_ptr + 3 * src_stride);
+      __builtin_prefetch(src_ptr + 4 * src_stride);
+      __builtin_prefetch(src_ptr + 5 * src_stride);
+      __builtin_prefetch(src_ptr + 6 * src_stride);
+      __builtin_prefetch(src_ptr + 7 * src_stride);
+      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      height = h;
+      s = src_ptr + (7 * src_stride);
+      d_tmp = dst_ptr;
+      d_u8 = dst_u8_ptr;
+
+      do {
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        __builtin_prefetch(dst_ptr + 0 * dst_stride);
+        __builtin_prefetch(dst_ptr + 1 * dst_stride);
+        __builtin_prefetch(dst_ptr + 2 * dst_stride);
+        __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+                                 zero, shift_vec);
+        res0 = vaddq_s16(res0, round_offset128);
+        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
+                                 zero, shift_vec);
+        res1 = vaddq_s16(res1, round_offset128);
+        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
+                                 zero, shift_vec);
+        res2 = vaddq_s16(res2, round_offset128);
+        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
+                                 zero, shift_vec);
+        res3 = vaddq_s16(res3, round_offset128);
+        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter_tmp,
+                                 zero, shift_vec);
+        res4 = vaddq_s16(res4, round_offset128);
+        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+                                 y_filter_tmp, zero, shift_vec);
+        res5 = vaddq_s16(res5, round_offset128);
+        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+                                 y_filter_tmp, zero, shift_vec);
+        res6 = vaddq_s16(res6, round_offset128);
+        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+                                 y_filter_tmp, zero, shift_vec);
+        res7 = vaddq_s16(res7, round_offset128);
+
+        if (conv_params->do_average) {
+          __builtin_prefetch(d_tmp + 0 * dst8_stride);
+          __builtin_prefetch(d_tmp + 1 * dst8_stride);
+          __builtin_prefetch(d_tmp + 2 * dst8_stride);
+          __builtin_prefetch(d_tmp + 3 * dst8_stride);
+
+          load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
+          d_tmp += (dst_stride << 2);
+
+          compute_avg_8x4(
+              res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+              vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+              vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+              round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+          store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+          d_u8 += (dst8_stride << 2);
+
+          load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
+          d_tmp += (dst_stride << 2);
+
+          compute_avg_8x4(
+              res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+              vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+              vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+              round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+          store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+          d_u8 += (dst8_stride << 2);
+        } else {
+          store_u16_8x8(
+              d_tmp, dst_stride, vreinterpretq_u16_s16(res0),
+              vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+              vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4),
+              vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+              vreinterpretq_u16_s16(res7));
+          d_tmp += (dst_stride << 3);
+        }
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += (8 * src_stride);
+        height -= 8;
+      } while (height > 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      dst_u8_ptr += 8;
+      width -= 8;
+    } while (width > 0);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h
new file mode 100644
index 000000000..214b14bf7
--- /dev/null
+++ b/third_party/aom/av1/common/arm/mem_neon.h
@@ -0,0 +1,401 @@
+/*
+ *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_COMMON_ARM_MEM_NEON_H_
+#define AV1_COMMON_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <string.h>
+
+static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
+                                     const uint8x8_t s1) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+}
+
+static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6, uint8x8_t *const s7) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+  s += p;
+  *s4 = vld1_u8(s);
+  s += p;
+  *s5 = vld1_u8(s);
+  s += p;
+  *s6 = vld1_u8(s);
+  s += p;
+  *s7 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+  s += p;
+  *s3 = vld1q_u8(s);
+}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+}
+
+static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x4_t *const s0, uint16x4_t *const s1,
+                                uint16x4_t *const s2, uint16x4_t *const s3) {
+  *s0 = vld1_u16(s);
+  s += p;
+  *s1 = vld1_u16(s);
+  s += p;
+  *s2 = vld1_u16(s);
+  s += p;
+  *s3 = vld1_u16(s);
+  s += p;
+}
+
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *const s0, uint16x8_t *const s1,
+                                uint16x8_t *const s2, uint16x8_t *const s3) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+  s += p;
+  *s3 = vld1q_u16(s);
+  s += p;
+}
+
+static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3,
+                                int16x4_t *const s4, int16x4_t *const s5,
+                                int16x4_t *const s6, int16x4_t *const s7) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 = vld1_s16(s);
+  s += p;
+  *s5 = vld1_s16(s);
+  s += p;
+  *s6 = vld1_s16(s);
+  s += p;
+  *s7 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+}
+
+static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+                                const uint8x8_t s1, const uint8x8_t s2,
+                                const uint8x8_t s3, const uint8x8_t s4,
+                                const uint8x8_t s5, const uint8x8_t s6,
+                                const uint8x8_t s7) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+  vst1_u8(s, s2);
+  s += p;
+  vst1_u8(s, s3);
+  s += p;
+  vst1_u8(s, s4);
+  s += p;
+  vst1_u8(s, s5);
+  s += p;
+  vst1_u8(s, s6);
+  s += p;
+  vst1_u8(s, s7);
+}
+
+static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+                                const uint8x8_t s1, const uint8x8_t s2,
+                                const uint8x8_t s3) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+  vst1_u8(s, s2);
+  s += p;
+  vst1_u8(s, s3);
+}
+
+static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
+                                 const uint8x16_t s1, const uint8x16_t s2,
+                                 const uint8x16_t s3) {
+  vst1q_u8(s, s0);
+  s += p;
+  vst1q_u8(s, s1);
+  s += p;
+  vst1q_u8(s, s2);
+  s += p;
+  vst1q_u8(s, s3);
+}
+
+static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x8_t s0, const uint16x8_t s1,
+                                 const uint16x8_t s2, const uint16x8_t s3,
+                                 const uint16x8_t s4, const uint16x8_t s5,
+                                 const uint16x8_t s6, const uint16x8_t s7) {
+  vst1q_u16(s, s0);
+  s += dst_stride;
+  vst1q_u16(s, s1);
+  s += dst_stride;
+  vst1q_u16(s, s2);
+  s += dst_stride;
+  vst1q_u16(s, s3);
+  s += dst_stride;
+  vst1q_u16(s, s4);
+  s += dst_stride;
+  vst1q_u16(s, s5);
+  s += dst_stride;
+  vst1q_u16(s, s6);
+  s += dst_stride;
+  vst1q_u16(s, s7);
+}
+
+static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x4_t s0, const uint16x4_t s1,
+                                 const uint16x4_t s2, const uint16x4_t s3) {
+  vst1_u16(s, s0);
+  s += dst_stride;
+  vst1_u16(s, s1);
+  s += dst_stride;
+  vst1_u16(s, s2);
+  s += dst_stride;
+  vst1_u16(s, s3);
+}
+
+static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x8_t s0, const uint16x8_t s1,
+                                 const uint16x8_t s2, const uint16x8_t s3) {
+  vst1q_u16(s, s0);
+  s += dst_stride;
+  vst1q_u16(s, s1);
+  s += dst_stride;
+  vst1q_u16(s, s2);
+  s += dst_stride;
+  vst1q_u16(s, s3);
+}
+
+static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x8_t s0, const int16x8_t s1,
+                                 const int16x8_t s2, const int16x8_t s3,
+                                 const int16x8_t s4, const int16x8_t s5,
+                                 const int16x8_t s6, const int16x8_t s7) {
+  vst1q_s16(s, s0);
+  s += dst_stride;
+  vst1q_s16(s, s1);
+  s += dst_stride;
+  vst1q_s16(s, s2);
+  s += dst_stride;
+  vst1q_s16(s, s3);
+  s += dst_stride;
+  vst1q_s16(s, s4);
+  s += dst_stride;
+  vst1q_s16(s, s5);
+  s += dst_stride;
+  vst1q_s16(s, s6);
+  s += dst_stride;
+  vst1q_s16(s, s7);
+}
+
+static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2, int16x8_t *const s3,
+                                int16x8_t *const s4, int16x8_t *const s5,
+                                int16x8_t *const s6, int16x8_t *const s7) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+  s += p;
+  *s7 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2, int16x8_t *const s3) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+}
+
+static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
+                                         uint32x2_t *tu0, uint32x2_t *tu1,
+                                         uint32x2_t *tu2, uint32x2_t *tu3) {
+  uint32_t a;
+
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu0 = vset_lane_u32(a, *tu0, 0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu0 = vset_lane_u32(a, *tu0, 1);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu1 = vset_lane_u32(a, *tu1, 0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu1 = vset_lane_u32(a, *tu1, 1);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu2 = vset_lane_u32(a, *tu2, 0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu2 = vset_lane_u32(a, *tu2, 1);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu3 = vset_lane_u32(a, *tu3, 0);
+  memcpy(&a, buf, 4);
+  *tu3 = vset_lane_u32(a, *tu3, 1);
+}
+
+static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
+                                         uint32x2_t *tu0, uint32x2_t *tu1) {
+  uint32_t a;
+
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu0 = vset_lane_u32(a, *tu0, 0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu0 = vset_lane_u32(a, *tu0, 1);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu1 = vset_lane_u32(a, *tu1, 0);
+  memcpy(&a, buf, 4);
+  *tu1 = vset_lane_u32(a, *tu1, 1);
+}
+
+static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
+                                         uint32x2_t *tu0) {
+  uint32_t a;
+
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu0 = vset_lane_u32(a, *tu0, 0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  *tu0 = vset_lane_u32(a, *tu0, 1);
+}
+
+static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
+                                         uint16x4_t *tu0) {
+  uint16_t a;
+
+  memcpy(&a, buf, 2);
+  buf += stride;
+  *tu0 = vset_lane_u16(a, *tu0, 0);
+  memcpy(&a, buf, 2);
+  buf += stride;
+  *tu0 = vset_lane_u16(a, *tu0, 1);
+}
+
+static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3,
+                                uint8x16_t *const s4, uint8x16_t *const s5,
+                                uint8x16_t *const s6, uint8x16_t *const s7) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+  s += p;
+  *s3 = vld1q_u8(s);
+  s += p;
+  *s4 = vld1q_u8(s);
+  s += p;
+  *s5 = vld1q_u8(s);
+  s += p;
+  *s6 = vld1q_u8(s);
+  s += p;
+  *s7 = vld1q_u8(s);
+}
+
+static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+  s += p;
+  *s3 = vld1q_u8(s);
+}
+
+static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
+                                          uint64x2_t *tu0, uint64x2_t *tu1) {
+  uint64_t a;
+
+  memcpy(&a, buf, 8);
+  buf += stride;
+  *tu0 = vsetq_lane_u64(a, *tu0, 0);
+  memcpy(&a, buf, 8);
+  buf += stride;
+  *tu0 = vsetq_lane_u64(a, *tu0, 1);
+  memcpy(&a, buf, 8);
+  buf += stride;
+  *tu1 = vsetq_lane_u64(a, *tu1, 0);
+  memcpy(&a, buf, 8);
+  *tu1 = vsetq_lane_u64(a, *tu1, 1);
+}
+
+#endif  // AV1_COMMON_ARM_MEM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/neon/iht4x4_add_neon.c b/third_party/aom/av1/common/arm/neon/iht4x4_add_neon.c
deleted file mode 100644
index b29228e43..000000000
--- a/third_party/aom/av1/common/arm/neon/iht4x4_add_neon.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
-#include "aom_dsp/txfm_common.h"
-#include "av1/common/common.h"
-
-static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
-  int32x4_t q8s32, q9s32;
-  int16x4x2_t d0x2s16, d1x2s16;
-  int32x4x2_t q0x2s32;
-
-  d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
-  d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
-
-  q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
-  q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
-  q0x2s32 = vtrnq_s32(q8s32, q9s32);
-
-  *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
-  *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
-  return;
-}
-
-static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16,
-                                             int16x4_t *d2s16) {
-  *d0s16 = vdup_n_s16((int16_t)cospi_8_64);
-  *d1s16 = vdup_n_s16((int16_t)cospi_16_64);
-  *d2s16 = vdup_n_s16((int16_t)cospi_24_64);
-  return;
-}
-
-static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16,
-                                           int16x4_t *d5s16, int16x8_t *q3s16) {
-  *d3s16 = vdup_n_s16((int16_t)sinpi_1_9);
-  *d4s16 = vdup_n_s16((int16_t)sinpi_2_9);
-  *q3s16 = vdupq_n_s16((int16_t)sinpi_3_9);
-  *d5s16 = vdup_n_s16((int16_t)sinpi_4_9);
-  return;
-}
-
-static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
-                              int16x4_t *d2s16, int16x8_t *q8s16,
-                              int16x8_t *q9s16) {
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
-  int16x4_t d26s16, d27s16, d28s16, d29s16;
-  int32x4_t q10s32, q13s32, q14s32, q15s32;
-  int16x8_t q13s16, q14s16;
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-
-  d23s16 = vadd_s16(d16s16, d18s16);
-  d24s16 = vsub_s16(d16s16, d18s16);
-
-  q15s32 = vmull_s16(d17s16, *d2s16);
-  q10s32 = vmull_s16(d17s16, *d0s16);
-  q13s32 = vmull_s16(d23s16, *d1s16);
-  q14s32 = vmull_s16(d24s16, *d1s16);
-  q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
-  q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
-
-  d26s16 = vqrshrn_n_s32(q13s32, 14);
-  d27s16 = vqrshrn_n_s32(q14s32, 14);
-  d29s16 = vqrshrn_n_s32(q15s32, 14);
-  d28s16 = vqrshrn_n_s32(q10s32, 14);
-
-  q13s16 = vcombine_s16(d26s16, d27s16);
-  q14s16 = vcombine_s16(d28s16, d29s16);
-  *q8s16 = vaddq_s16(q13s16, q14s16);
-  *q9s16 = vsubq_s16(q13s16, q14s16);
-  *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16));  // vswp
-  return;
-}
-
-static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
-                               int16x4_t *d5s16, int16x8_t *q3s16,
-                               int16x8_t *q8s16, int16x8_t *q9s16) {
-  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
-  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
-
-  d6s16 = vget_low_s16(*q3s16);
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-
-  q10s32 = vmull_s16(*d3s16, d16s16);
-  q11s32 = vmull_s16(*d4s16, d16s16);
-  q12s32 = vmull_s16(d6s16, d17s16);
-  q13s32 = vmull_s16(*d5s16, d18s16);
-  q14s32 = vmull_s16(*d3s16, d18s16);
-  q15s32 = vmovl_s16(d16s16);
-  q15s32 = vaddw_s16(q15s32, d19s16);
-  q8s32 = vmull_s16(*d4s16, d19s16);
-  q15s32 = vsubw_s16(q15s32, d18s16);
-  q9s32 = vmull_s16(*d5s16, d19s16);
-
-  q10s32 = vaddq_s32(q10s32, q13s32);
-  q10s32 = vaddq_s32(q10s32, q8s32);
-  q11s32 = vsubq_s32(q11s32, q14s32);
-  q8s32 = vdupq_n_s32((int32_t)sinpi_3_9);
-  q11s32 = vsubq_s32(q11s32, q9s32);
-  q15s32 = vmulq_s32(q15s32, q8s32);
-
-  q13s32 = vaddq_s32(q10s32, q12s32);
-  q10s32 = vaddq_s32(q10s32, q11s32);
-  q14s32 = vaddq_s32(q11s32, q12s32);
-  q10s32 = vsubq_s32(q10s32, q12s32);
-
-  d16s16 = vqrshrn_n_s32(q13s32, 14);
-  d17s16 = vqrshrn_n_s32(q14s32, 14);
-  d18s16 = vqrshrn_n_s32(q15s32, 14);
-  d19s16 = vqrshrn_n_s32(q10s32, 14);
-
-  *q8s16 = vcombine_s16(d16s16, d17s16);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-  return;
-}
-
-void av1_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
-                            int dest_stride, const TxfmParam *txfm_param) {
-  uint8x8_t d26u8, d27u8;
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
-  uint32x2_t d26u32, d27u32;
-  int16x8_t q3s16, q8s16, q9s16;
-  uint16x8_t q8u16, q9u16;
-
-  d26u32 = d27u32 = vdup_n_u32(0);
-
-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-
-  TRANSPOSE4X4(&q8s16, &q9s16);
-
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  switch (tx_type) {
-    case DCT_DCT:  // idct_idct is not supported. Fall back to C
-      av1_iht4x4_16_add_c(input, dest, dest_stride, txfm_param);
-      return;
-      break;
-    case ADST_DCT:  // iadst_idct
-      // generate constants
-      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
-      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
-
-      // first transform rows
-      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
-
-      // transpose the matrix
-      TRANSPOSE4X4(&q8s16, &q9s16);
-
-      // then transform columns
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
-      break;
-    case DCT_ADST:  // idct_iadst
-      // generate constantsyy
-      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
-      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
-
-      // first transform rows
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
-
-      // transpose the matrix
-      TRANSPOSE4X4(&q8s16, &q9s16);
-
-      // then transform columns
-      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
-      break;
-    case ADST_ADST:  // iadst_iadst
-      // generate constants
-      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
-
-      // first transform rows
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
-
-      // transpose the matrix
-      TRANSPOSE4X4(&q8s16, &q9s16);
-
-      // then transform columns
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
-      break;
-    default:  // iadst_idct
-      assert(0);
-      break;
-  }
-
-  q8s16 = vrshrq_n_s16(q8s16, 4);
-  q9s16 = vrshrq_n_s16(q9s16, 4);
-
-  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
-  dest += dest_stride;
-  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
-  dest += dest_stride;
-  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
-  dest += dest_stride;
-  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
-
-  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
-  dest -= dest_stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
-  dest -= dest_stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
-  dest -= dest_stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
-  return;
-}
diff --git a/third_party/aom/av1/common/arm/neon/iht8x8_add_neon.c b/third_party/aom/av1/common/arm/neon/iht8x8_add_neon.c
deleted file mode 100644
index 4cd43a99d..000000000
--- a/third_party/aom/av1/common/arm/neon/iht8x8_add_neon.c
+++ /dev/null
@@ -1,594 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
-#include "aom_dsp/txfm_common.h"
-#include "av1/common/common.h"
-
-static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
-                                int16x8_t *q10s16, int16x8_t *q11s16,
-                                int16x8_t *q12s16, int16x8_t *q13s16,
-                                int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
-  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
-  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
-  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
-  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
-  *q12s16 = vcombine_s16(d17s16, d25s16);
-  *q13s16 = vcombine_s16(d19s16, d27s16);
-  *q14s16 = vcombine_s16(d21s16, d29s16);
-  *q15s16 = vcombine_s16(d23s16, d31s16);
-
-  q0x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
-  q1x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
-  q2x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
-  q3x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
-
-  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
-                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
-  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
-                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
-  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
-                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
-  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
-                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
-
-  *q8s16 = q0x2s16.val[0];
-  *q9s16 = q0x2s16.val[1];
-  *q10s16 = q1x2s16.val[0];
-  *q11s16 = q1x2s16.val[1];
-  *q12s16 = q2x2s16.val[0];
-  *q13s16 = q2x2s16.val[1];
-  *q14s16 = q3x2s16.val[0];
-  *q15s16 = q3x2s16.val[1];
-  return;
-}
-
-static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
-                              int16x8_t *q10s16, int16x8_t *q11s16,
-                              int16x8_t *q12s16, int16x8_t *q13s16,
-                              int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-
-  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
-  d1s16 = vdup_n_s16((int16_t)cospi_4_64);
-  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
-  d3s16 = vdup_n_s16((int16_t)cospi_20_64);
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  q2s32 = vmull_s16(d18s16, d0s16);
-  q3s32 = vmull_s16(d19s16, d0s16);
-  q5s32 = vmull_s16(d26s16, d2s16);
-  q6s32 = vmull_s16(d27s16, d2s16);
-
-  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
-  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
-  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
-
-  d8s16 = vqrshrn_n_s32(q2s32, 14);
-  d9s16 = vqrshrn_n_s32(q3s32, 14);
-  d10s16 = vqrshrn_n_s32(q5s32, 14);
-  d11s16 = vqrshrn_n_s32(q6s32, 14);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  q2s32 = vmull_s16(d18s16, d1s16);
-  q3s32 = vmull_s16(d19s16, d1s16);
-  q9s32 = vmull_s16(d26s16, d3s16);
-  q13s32 = vmull_s16(d27s16, d3s16);
-
-  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
-  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
-  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
-  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
-
-  d14s16 = vqrshrn_n_s32(q2s32, 14);
-  d15s16 = vqrshrn_n_s32(q3s32, 14);
-  d12s16 = vqrshrn_n_s32(q9s32, 14);
-  d13s16 = vqrshrn_n_s32(q13s32, 14);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-  q7s16 = vcombine_s16(d14s16, d15s16);
-
-  d0s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q2s32 = vmull_s16(d16s16, d0s16);
-  q3s32 = vmull_s16(d17s16, d0s16);
-  q13s32 = vmull_s16(d16s16, d0s16);
-  q15s32 = vmull_s16(d17s16, d0s16);
-
-  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
-  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
-  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
-  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
-
-  d0s16 = vdup_n_s16((int16_t)cospi_24_64);
-  d1s16 = vdup_n_s16((int16_t)cospi_8_64);
-
-  d18s16 = vqrshrn_n_s32(q2s32, 14);
-  d19s16 = vqrshrn_n_s32(q3s32, 14);
-  d22s16 = vqrshrn_n_s32(q13s32, 14);
-  d23s16 = vqrshrn_n_s32(q15s32, 14);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-  *q11s16 = vcombine_s16(d22s16, d23s16);
-
-  q2s32 = vmull_s16(d20s16, d0s16);
-  q3s32 = vmull_s16(d21s16, d0s16);
-  q8s32 = vmull_s16(d20s16, d1s16);
-  q12s32 = vmull_s16(d21s16, d1s16);
-
-  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
-  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
-  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
-
-  d26s16 = vqrshrn_n_s32(q2s32, 14);
-  d27s16 = vqrshrn_n_s32(q3s32, 14);
-  d30s16 = vqrshrn_n_s32(q8s32, 14);
-  d31s16 = vqrshrn_n_s32(q12s32, 14);
-  *q13s16 = vcombine_s16(d26s16, d27s16);
-  *q15s16 = vcombine_s16(d30s16, d31s16);
-
-  q0s16 = vaddq_s16(*q9s16, *q15s16);
-  q1s16 = vaddq_s16(*q11s16, *q13s16);
-  q2s16 = vsubq_s16(*q11s16, *q13s16);
-  q3s16 = vsubq_s16(*q9s16, *q15s16);
-
-  *q13s16 = vsubq_s16(q4s16, q5s16);
-  q4s16 = vaddq_s16(q4s16, q5s16);
-  *q14s16 = vsubq_s16(q7s16, q6s16);
-  q7s16 = vaddq_s16(q7s16, q6s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-
-  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q9s32 = vmull_s16(d28s16, d16s16);
-  q10s32 = vmull_s16(d29s16, d16s16);
-  q11s32 = vmull_s16(d28s16, d16s16);
-  q12s32 = vmull_s16(d29s16, d16s16);
-
-  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
-  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
-  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
-  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
-  d10s16 = vqrshrn_n_s32(q9s32, 14);
-  d11s16 = vqrshrn_n_s32(q10s32, 14);
-  d12s16 = vqrshrn_n_s32(q11s32, 14);
-  d13s16 = vqrshrn_n_s32(q12s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  *q8s16 = vaddq_s16(q0s16, q7s16);
-  *q9s16 = vaddq_s16(q1s16, q6s16);
-  *q10s16 = vaddq_s16(q2s16, q5s16);
-  *q11s16 = vaddq_s16(q3s16, q4s16);
-  *q12s16 = vsubq_s16(q3s16, q4s16);
-  *q13s16 = vsubq_s16(q2s16, q5s16);
-  *q14s16 = vsubq_s16(q1s16, q6s16);
-  *q15s16 = vsubq_s16(q0s16, q7s16);
-  return;
-}
-
-static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
-                               int16x8_t *q10s16, int16x8_t *q11s16,
-                               int16x8_t *q12s16, int16x8_t *q13s16,
-                               int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int16x8_t q2s16, q4s16, q5s16, q6s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
-  int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  d14s16 = vdup_n_s16((int16_t)cospi_2_64);
-  d15s16 = vdup_n_s16((int16_t)cospi_30_64);
-
-  q1s32 = vmull_s16(d30s16, d14s16);
-  q2s32 = vmull_s16(d31s16, d14s16);
-  q3s32 = vmull_s16(d30s16, d15s16);
-  q4s32 = vmull_s16(d31s16, d15s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_18_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_14_64);
-
-  q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
-  q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
-  q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
-  q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
-
-  q5s32 = vmull_s16(d22s16, d30s16);
-  q6s32 = vmull_s16(d23s16, d30s16);
-  q7s32 = vmull_s16(d22s16, d31s16);
-  q8s32 = vmull_s16(d23s16, d31s16);
-
-  q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
-  q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
-  q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
-  q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
-
-  q11s32 = vaddq_s32(q1s32, q5s32);
-  q12s32 = vaddq_s32(q2s32, q6s32);
-  q1s32 = vsubq_s32(q1s32, q5s32);
-  q2s32 = vsubq_s32(q2s32, q6s32);
-
-  d22s16 = vqrshrn_n_s32(q11s32, 14);
-  d23s16 = vqrshrn_n_s32(q12s32, 14);
-  *q11s16 = vcombine_s16(d22s16, d23s16);
-
-  q12s32 = vaddq_s32(q3s32, q7s32);
-  q15s32 = vaddq_s32(q4s32, q8s32);
-  q3s32 = vsubq_s32(q3s32, q7s32);
-  q4s32 = vsubq_s32(q4s32, q8s32);
-
-  d2s16 = vqrshrn_n_s32(q1s32, 14);
-  d3s16 = vqrshrn_n_s32(q2s32, 14);
-  d24s16 = vqrshrn_n_s32(q12s32, 14);
-  d25s16 = vqrshrn_n_s32(q15s32, 14);
-  d6s16 = vqrshrn_n_s32(q3s32, 14);
-  d7s16 = vqrshrn_n_s32(q4s32, 14);
-  *q12s16 = vcombine_s16(d24s16, d25s16);
-
-  d0s16 = vdup_n_s16((int16_t)cospi_10_64);
-  d1s16 = vdup_n_s16((int16_t)cospi_22_64);
-  q4s32 = vmull_s16(d26s16, d0s16);
-  q5s32 = vmull_s16(d27s16, d0s16);
-  q2s32 = vmull_s16(d26s16, d1s16);
-  q6s32 = vmull_s16(d27s16, d1s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_26_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_6_64);
-
-  q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
-  q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
-  q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
-  q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
-
-  q0s32 = vmull_s16(d18s16, d30s16);
-  q13s32 = vmull_s16(d19s16, d30s16);
-
-  q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
-  q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
-
-  q10s32 = vmull_s16(d18s16, d31s16);
-  q9s32 = vmull_s16(d19s16, d31s16);
-
-  q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
-  q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
-
-  q14s32 = vaddq_s32(q2s32, q10s32);
-  q15s32 = vaddq_s32(q6s32, q9s32);
-  q2s32 = vsubq_s32(q2s32, q10s32);
-  q6s32 = vsubq_s32(q6s32, q9s32);
-
-  d28s16 = vqrshrn_n_s32(q14s32, 14);
-  d29s16 = vqrshrn_n_s32(q15s32, 14);
-  d4s16 = vqrshrn_n_s32(q2s32, 14);
-  d5s16 = vqrshrn_n_s32(q6s32, 14);
-  *q14s16 = vcombine_s16(d28s16, d29s16);
-
-  q9s32 = vaddq_s32(q4s32, q0s32);
-  q10s32 = vaddq_s32(q5s32, q13s32);
-  q4s32 = vsubq_s32(q4s32, q0s32);
-  q5s32 = vsubq_s32(q5s32, q13s32);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_24_64);
-
-  d18s16 = vqrshrn_n_s32(q9s32, 14);
-  d19s16 = vqrshrn_n_s32(q10s32, 14);
-  d8s16 = vqrshrn_n_s32(q4s32, 14);
-  d9s16 = vqrshrn_n_s32(q5s32, 14);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-
-  q5s32 = vmull_s16(d2s16, d30s16);
-  q6s32 = vmull_s16(d3s16, d30s16);
-  q7s32 = vmull_s16(d2s16, d31s16);
-  q0s32 = vmull_s16(d3s16, d31s16);
-
-  q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
-  q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
-  q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
-  q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
-
-  q1s32 = vmull_s16(d4s16, d30s16);
-  q3s32 = vmull_s16(d5s16, d30s16);
-  q10s32 = vmull_s16(d4s16, d31s16);
-  q2s32 = vmull_s16(d5s16, d31s16);
-
-  q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
-  q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
-  q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
-  q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
-
-  *q8s16 = vaddq_s16(*q11s16, *q9s16);
-  *q11s16 = vsubq_s16(*q11s16, *q9s16);
-  q4s16 = vaddq_s16(*q12s16, *q14s16);
-  *q12s16 = vsubq_s16(*q12s16, *q14s16);
-
-  q14s32 = vaddq_s32(q5s32, q1s32);
-  q15s32 = vaddq_s32(q6s32, q3s32);
-  q5s32 = vsubq_s32(q5s32, q1s32);
-  q6s32 = vsubq_s32(q6s32, q3s32);
-
-  d18s16 = vqrshrn_n_s32(q14s32, 14);
-  d19s16 = vqrshrn_n_s32(q15s32, 14);
-  d10s16 = vqrshrn_n_s32(q5s32, 14);
-  d11s16 = vqrshrn_n_s32(q6s32, 14);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-
-  q1s32 = vaddq_s32(q7s32, q10s32);
-  q3s32 = vaddq_s32(q0s32, q2s32);
-  q7s32 = vsubq_s32(q7s32, q10s32);
-  q0s32 = vsubq_s32(q0s32, q2s32);
-
-  d28s16 = vqrshrn_n_s32(q1s32, 14);
-  d29s16 = vqrshrn_n_s32(q3s32, 14);
-  d14s16 = vqrshrn_n_s32(q7s32, 14);
-  d15s16 = vqrshrn_n_s32(q0s32, 14);
-  *q14s16 = vcombine_s16(d28s16, d29s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  q2s32 = vmull_s16(d22s16, d30s16);
-  q3s32 = vmull_s16(d23s16, d30s16);
-  q13s32 = vmull_s16(d22s16, d30s16);
-  q1s32 = vmull_s16(d23s16, d30s16);
-
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
-  q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
-  q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
-  q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
-
-  d4s16 = vqrshrn_n_s32(q2s32, 14);
-  d5s16 = vqrshrn_n_s32(q3s32, 14);
-  d24s16 = vqrshrn_n_s32(q13s32, 14);
-  d25s16 = vqrshrn_n_s32(q1s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  *q12s16 = vcombine_s16(d24s16, d25s16);
-
-  q13s32 = vmull_s16(d10s16, d30s16);
-  q1s32 = vmull_s16(d11s16, d30s16);
-  q11s32 = vmull_s16(d10s16, d30s16);
-  q0s32 = vmull_s16(d11s16, d30s16);
-
-  q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
-  q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
-  q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
-  q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
-
-  d20s16 = vqrshrn_n_s32(q13s32, 14);
-  d21s16 = vqrshrn_n_s32(q1s32, 14);
-  d12s16 = vqrshrn_n_s32(q11s32, 14);
-  d13s16 = vqrshrn_n_s32(q0s32, 14);
-  *q10s16 = vcombine_s16(d20s16, d21s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  q5s16 = vdupq_n_s16(0);
-
-  *q9s16 = vsubq_s16(q5s16, *q9s16);
-  *q11s16 = vsubq_s16(q5s16, q2s16);
-  *q13s16 = vsubq_s16(q5s16, q6s16);
-  *q15s16 = vsubq_s16(q5s16, q4s16);
-  return;
-}
-
-void av1_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
-                            int dest_stride, const TxfmParam *txfm_param) {
-  int i;
-  uint8_t *d1, *d2;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8;
-  uint64x1_t d0u64, d1u64, d2u64, d3u64;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-  q10s16 = vld1q_s16(input + 8 * 2);
-  q11s16 = vld1q_s16(input + 8 * 3);
-  q12s16 = vld1q_s16(input + 8 * 4);
-  q13s16 = vld1q_s16(input + 8 * 5);
-  q14s16 = vld1q_s16(input + 8 * 6);
-  q15s16 = vld1q_s16(input + 8 * 7);
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  switch (tx_type) {
-    case DCT_DCT:  // idct_idct is not supported. Fall back to C
-      av1_iht8x8_64_add_c(input, dest, dest_stride, txfm_param);
-      return;
-      break;
-    case ADST_DCT:  // iadst_idct
-      // generate IDCT constants
-      // GENERATE_IDCT_CONSTANTS
-
-      // first transform rows
-      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                 &q15s16);
-
-      // transpose the matrix
-      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                   &q15s16);
-
-      // generate IADST constants
-      // GENERATE_IADST_CONSTANTS
-
-      // then transform columns
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
-      break;
-    case DCT_ADST:  // idct_iadst
-      // generate IADST constants
-      // GENERATE_IADST_CONSTANTS
-
-      // first transform rows
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
-
-      // transpose the matrix
-      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                   &q15s16);
-
-      // generate IDCT constants
-      // GENERATE_IDCT_CONSTANTS
-
-      // then transform columns
-      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                 &q15s16);
-      break;
-    case ADST_ADST:  // iadst_iadst
-      // generate IADST constants
-      // GENERATE_IADST_CONSTANTS
-
-      // first transform rows
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
-
-      // transpose the matrix
-      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                   &q15s16);
-
-      // then transform columns
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
-      break;
-    default:  // iadst_idct
-      assert(0);
-      break;
-  }
-
-  q8s16 = vrshrq_n_s16(q8s16, 5);
-  q9s16 = vrshrq_n_s16(q9s16, 5);
-  q10s16 = vrshrq_n_s16(q10s16, 5);
-  q11s16 = vrshrq_n_s16(q11s16, 5);
-  q12s16 = vrshrq_n_s16(q12s16, 5);
-  q13s16 = vrshrq_n_s16(q13s16, 5);
-  q14s16 = vrshrq_n_s16(q14s16, 5);
-  q15s16 = vrshrq_n_s16(q15s16, 5);
-
-  for (d1 = d2 = dest, i = 0; i < 2; i++) {
-    if (i != 0) {
-      q8s16 = q12s16;
-      q9s16 = q13s16;
-      q10s16 = q14s16;
-      q11s16 = q15s16;
-    }
-
-    d0u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d1u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d2u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d3u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
-    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
-    q10u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
-    q11u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
-    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-    d2 += dest_stride;
-  }
-  return;
-}
diff --git a/third_party/aom/av1/common/arm/reconinter_neon.c b/third_party/aom/av1/common/arm/reconinter_neon.c
new file mode 100644
index 000000000..44e064195
--- /dev/null
+++ b/third_party/aom/av1/common/arm/reconinter_neon.c
@@ -0,0 +1,86 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
+
+void av1_build_compound_diffwtd_mask_d16_neon(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  assert(h >= 4);
+  assert(w >= 4);
+  assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
+  const int round =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+  uint16x8_t diff_q, tmp0, tmp1;
+  uint8x8_t diff_d, diff_select;
+  const CONV_BUF_TYPE *src0_1, *src1_1;
+  const int16x8_t dup_round = vdupq_n_s16((int16_t)(-round));
+  const uint8x8_t dup_38 = vdup_n_u8(38);
+  const uint8x8_t dup_64 = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
+  if (mask_type == DIFFWTD_38) {
+    diff_select = vdup_n_u8(255);
+  } else {
+    diff_select = vdup_n_u8(0);
+  }
+  if (w >= 8) {
+    for (int i = 0; i < h; ++i) {
+      src0_1 = src0;
+      src1_1 = src1;
+      for (int j = 0; j < w; j += 8) {
+        __builtin_prefetch(src0_1);
+        __builtin_prefetch(src1_1);
+        diff_q = vabdq_u16(vld1q_u16(src0_1), vld1q_u16(src1_1));
+        diff_q = vrshlq_u16(diff_q, dup_round);
+        diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
+        diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
+        diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
+        vst1_u8(mask, diff_d);
+        src0_1 += 8;
+        src1_1 += 8;
+        mask += 8;
+      }
+      src0 += src0_stride;
+      src1 += src1_stride;
+    }
+  } else if (w == 4) {
+    for (int i = 0; i < h; i += 2) {
+      src0_1 = src0;
+      src1_1 = src1;
+      __builtin_prefetch(src0_1 + 0 * src0_stride);
+      __builtin_prefetch(src0_1 + 1 * src0_stride);
+      __builtin_prefetch(src1_1 + 0 * src1_stride);
+      __builtin_prefetch(src1_1 + 1 * src1_stride);
+      tmp0 = vcombine_u16(vld1_u16(src0_1 + (0 * src0_stride)),
+                          vld1_u16(src0_1 + (1 * src0_stride)));
+      tmp1 = vcombine_u16(vld1_u16(src1_1 + (0 * src1_stride)),
+                          vld1_u16(src1_1 + (1 * src1_stride)));
+      diff_q = vabdq_u16(tmp0, tmp1);
+      diff_q = vrshlq_u16(diff_q, dup_round);
+      diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
+      diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
+      diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
+      vst1_u8(mask, diff_d);
+      src0 += src0_stride * 2;
+      src1 += src1_stride * 2;
+      mask += w * 2;
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h
new file mode 100644
index 000000000..53727bb43
--- /dev/null
+++ b/third_party/aom/av1/common/arm/transpose_neon.h
@@ -0,0 +1,422 @@
+/*
+ *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#define AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
+                                    uint8x8_t *a6, uint8x8_t *a7) {
+  // Swap 8 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // a4: 40 41 42 43 44 45 46 47
+  // a5: 50 51 52 53 54 55 56 57
+  // a6: 60 61 62 63 64 65 66 67
+  // a7: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
+  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
+  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
+  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
+
+  const uint8x16x2_t b0 =
+      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
+  const uint8x16x2_t b1 =
+      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
+  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
+  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
+  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
+
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+
+  // Unzip 32 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
+  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
+  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
+  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c1.val[0]));
+  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c1.val[1]));
+
+  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+                                    uint8x8_t *a3) {
+  // Swap 8 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+
+  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+
+  const uint16x4x2_t c0 =
+      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+  const uint16x4x2_t c1 =
+      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+  *a0 = vreinterpret_u8_u16(c0.val[0]);
+  *a1 = vreinterpret_u8_u16(c1.val[0]);
+  *a2 = vreinterpret_u8_u16(c0.val[1]);
+  *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03  10 11 12 13
+  // a1: 20 21 22 23  30 31 32 33
+  // to:
+  // b0.val[0]: 00 01 20 21  10 11 30 31
+  // b0.val[1]: 02 03 22 23  12 13 32 33
+
+  const uint16x4x2_t b0 =
+      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 01 20 21  02 03 22 23
+  // c0.val[1]: 10 11 30 31  12 13 32 33
+
+  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+                                   vreinterpret_u32_u16(b0.val[1]));
+
+  // Swap 8 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30  02 12 22 32
+  // d0.val[1]: 01 11 21 31  03 13 23 33
+
+  const uint8x8x2_t d0 =
+      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));
+
+  *a0 = d0.val[0];
+  *a1 = d0.val[1];
+}
+
+static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+                                    uint8x8_t *a3, const uint8x8_t a4,
+                                    const uint8x8_t a5, const uint8x8_t a6,
+                                    const uint8x8_t a7) {
+  // Swap 32 bit elements. Goes from:
+  // a0: 00 01 02 03 XX XX XX XX
+  // a1: 10 11 12 13 XX XX XX XX
+  // a2: 20 21 22 23 XX XX XX XX
+  // a3; 30 31 32 33 XX XX XX XX
+  // a4: 40 41 42 43 XX XX XX XX
+  // a5: 50 51 52 53 XX XX XX XX
+  // a6: 60 61 62 63 XX XX XX XX
+  // a7: 70 71 72 73 XX XX XX XX
+  // to:
+  // b0.val[0]: 00 01 02 03 40 41 42 43
+  // b1.val[0]: 10 11 12 13 50 51 52 53
+  // b2.val[0]: 20 21 22 23 60 61 62 63
+  // b3.val[0]: 30 31 32 33 70 71 72 73
+
+  const uint32x2x2_t b0 =
+      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
+  const uint32x2x2_t b1 =
+      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
+  const uint32x2x2_t b2 =
+      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
+  const uint32x2x2_t b3 =
+      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 01 20 21 40 41 60 61
+  // c0.val[1]: 02 03 22 23 42 43 62 63
+  // c1.val[0]: 10 11 30 31 50 51 70 71
+  // c1.val[1]: 12 13 32 33 52 53 72 73
+
+  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
+                                   vreinterpret_u16_u32(b2.val[0]));
+  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
+                                   vreinterpret_u16_u32(b3.val[0]));
+
+  // Swap 8 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70
+  // d0.val[1]: 01 11 21 31 41 51 61 71
+  // d1.val[0]: 02 12 22 32 42 52 62 72
+  // d1.val[1]: 03 13 23 33 43 53 63 73
+
+  const uint8x8x2_t d0 =
+      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
+  const uint8x8x2_t d1 =
+      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
+
+  *a0 = d0.val[0];
+  *a1 = d0.val[1];
+  *a2 = d1.val[0];
+  *a3 = d1.val[1];
+}
+
+static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
+                                     uint16x4_t *a2, uint16x4_t *a3,
+                                     uint16x4_t *a4, uint16x4_t *a5,
+                                     uint16x4_t *a6, uint16x4_t *a7,
+                                     uint16x8_t *o0, uint16x8_t *o1,
+                                     uint16x8_t *o2, uint16x8_t *o3) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // a4: 40 41 42 43
+  // a5: 50 51 52 53
+  // a6: 60 61 62 63
+  // a7: 70 71 72 73
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+  // b2.val[0]: 40 50 42 52
+  // b2.val[1]: 41 51 43 53
+  // b3.val[0]: 60 70 62 72
+  // b3.val[1]: 61 71 63 73
+
+  uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
+  uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
+  uint16x4x2_t b2 = vtrn_u16(*a4, *a5);
+  uint16x4x2_t b3 = vtrn_u16(*a6, *a7);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 02 12 22 32
+  // c1.val[0]: 01 11 21 31
+  // c1.val[1]: 03 13 23 33
+  // c2.val[0]: 40 50 60 70
+  // c2.val[1]: 42 52 62 72
+  // c3.val[0]: 41 51 61 71
+  // c3.val[1]: 43 53 63 73
+
+  uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+                             vreinterpret_u32_u16(b1.val[0]));
+  uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
+                             vreinterpret_u32_u16(b1.val[1]));
+  uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
+                             vreinterpret_u32_u16(b3.val[0]));
+  uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
+                             vreinterpret_u32_u16(b3.val[1]));
+
+  // Swap 64 bit elements resulting in:
+  // o0: 00 10 20 30 40 50 60 70
+  // o1: 01 11 21 31 41 51 61 71
+  // o2: 02 12 22 32 42 52 62 72
+  // o3: 03 13 23 33 43 53 63 73
+
+  *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
+                     vreinterpret_u16_u32(c2.val[0]));
+  *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
+                     vreinterpret_u16_u32(c3.val[0]));
+  *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
+                     vreinterpret_u16_u32(c2.val[1]));
+  *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
+                     vreinterpret_u16_u32(c3.val[1]));
+}
+
+static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
+                                     uint16x8_t *a2, uint16x8_t *a3,
+                                     uint16x8_t *a4, uint16x8_t *a5,
+                                     uint16x8_t *a6, uint16x8_t *a7) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // a4: 40 41 42 43 44 45 46 47
+  // a5: 50 51 52 53 54 55 56 57
+  // a6: 60 61 62 63 64 65 66 67
+  // a7: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  // b2.val[0]: 40 50 42 52 44 54 46 56
+  // b2.val[1]: 41 51 43 53 45 55 47 57
+  // b3.val[0]: 60 70 62 72 64 74 66 76
+  // b3.val[1]: 61 71 63 73 65 75 67 77
+
+  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
+  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  // c2.val[0]: 40 50 60 70 44 54 64 74
+  // c2.val[1]: 42 52 62 72 46 56 66 76
+  // c3.val[0]: 41 51 61 71 45 55 65 75
+  // c3.val[1]: 43 53 63 73 47 57 67 77
+
+  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+                                    vreinterpretq_u32_u16(b1.val[1]));
+  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+                                    vreinterpretq_u32_u16(b3.val[0]));
+  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+                                    vreinterpretq_u32_u16(b3.val[1]));
+
+  *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])),
+                     vget_low_u16(vreinterpretq_u16_u32(c2.val[0])));
+  *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])),
+                     vget_high_u16(vreinterpretq_u16_u32(c2.val[0])));
+
+  *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])),
+                     vget_low_u16(vreinterpretq_u16_u32(c2.val[1])));
+  *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])),
+                     vget_high_u16(vreinterpretq_u16_u32(c2.val[1])));
+
+  *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])),
+                     vget_low_u16(vreinterpretq_u16_u32(c3.val[0])));
+  *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])),
+                     vget_high_u16(vreinterpretq_u16_u32(c3.val[0])));
+
+  *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])),
+                     vget_low_u16(vreinterpretq_u16_u32(c3.val[1])));
+  *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])),
+                     vget_high_u16(vreinterpretq_u16_u32(c3.val[1])));
+}
+
+static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
+                                     int16x8_t *a2, int16x8_t *a3,
+                                     int16x8_t *a4, int16x8_t *a5,
+                                     int16x8_t *a6, int16x8_t *a7) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // a4: 40 41 42 43 44 45 46 47
+  // a5: 50 51 52 53 54 55 56 57
+  // a6: 60 61 62 63 64 65 66 67
+  // a7: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  // b2.val[0]: 40 50 42 52 44 54 46 56
+  // b2.val[1]: 41 51 43 53 45 55 47 57
+  // b3.val[0]: 60 70 62 72 64 74 66 76
+  // b3.val[1]: 61 71 63 73 65 75 67 77
+
+  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
+  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
+  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
+  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  // c2.val[0]: 40 50 60 70 44 54 64 74
+  // c2.val[1]: 42 52 62 72 46 56 66 76
+  // c3.val[0]: 41 51 61 71 45 55 65 75
+  // c3.val[1]: 43 53 63 73 47 57 67 77
+
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+                                   vreinterpretq_s32_s16(b3.val[0]));
+  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+                                   vreinterpretq_s32_s16(b3.val[1]));
+
+  *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])),
+                     vget_low_s16(vreinterpretq_s16_s32(c2.val[0])));
+  *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])),
+                     vget_high_s16(vreinterpretq_s16_s32(c2.val[0])));
+
+  *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])),
+                     vget_low_s16(vreinterpretq_s16_s32(c2.val[1])));
+  *a6 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])),
+                     vget_high_s16(vreinterpretq_s16_s32(c2.val[1])));
+
+  *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])),
+                     vget_low_s16(vreinterpretq_s16_s32(c3.val[0])));
+  *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])),
+                     vget_high_s16(vreinterpretq_s16_s32(c3.val[0])));
+
+  *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])),
+                     vget_low_s16(vreinterpretq_s16_s32(c3.val[1])));
+  *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])),
+                     vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
+}
+
+static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
+                                      int16x4_t *a2, int16x4_t *a3) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+
+  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 02 12 22 32
+  // c1.val[0]: 01 11 21 31
+  // c1.val[1]: 03 13 23 33
+
+  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+                                  vreinterpret_s32_s16(b1.val[0]));
+  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+                                  vreinterpret_s32_s16(b1.val[1]));
+
+  *a0 = vreinterpret_s16_s32(c0.val[0]);
+  *a1 = vreinterpret_s16_s32(c1.val[0]);
+  *a2 = vreinterpret_s16_s32(c0.val[1]);
+  *a3 = vreinterpret_s16_s32(c1.val[1]);
+}
+
+#endif  // AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
new file mode 100644
index 000000000..72fbed4d4
--- /dev/null
+++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+/* Wiener filter 2D
+   Apply horizontal filter and store in a temporary buffer. When applying
+   vertical filter, overwrite the original pixel values.
+ */
+
+void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h,
+                                      const ConvolveParams *conv_params) {
+  uint16_t *d_tmp;
+  uint8_t *d;
+  const uint8_t *src_ptr, *s_tmp;
+  uint16_t *dst_ptr;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  int width, height;
+  const int bd = 8;
+  const int intermediate_height = h + SUBPEL_TAPS - 1;
+  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+  int16_t filter_x_tmp[7], filter_y_tmp[7];
+
+  DECLARE_ALIGNED(16, uint16_t,
+                  temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w % 8));
+
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+
+  assert(filter_x[7] == 0);
+  assert(filter_y[7] == 0);
+
+  /* assumption of horizontal filtering output will not exceed 15 bit.
+     ((bd) + 1 + FILTER_BITS - conv_params->round_0) <= 15
+     16 - conv_params->round_0 <= 15 -- (conv_params->round_0) >= 1
+   */
+  assert((conv_params->round_0) >= 1);
+
+  memcpy(&filter_x_tmp[0], filter_x, sizeof(*filter_x) * FILTER_BITS);
+  memcpy(&filter_y_tmp[0], filter_y, sizeof(*filter_y) * FILTER_BITS);
+
+  filter_x_tmp[3] += (1 << FILTER_BITS);
+  filter_y_tmp[3] += (1 << FILTER_BITS);
+
+  s_tmp = src - center_tap * src_stride - center_tap;
+  dst_ptr = temp;
+  src_ptr = s_tmp;
+  height = intermediate_height;
+
+  /* if height is a multiple of 8 */
+  if (!(h & 7)) {
+    int16x8_t res0, res1, res2, res3;
+    uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+    uint8x8_t t8, t9, t10, t11, t12, t13, t14;
+
+    do {
+      const uint8_t *s;
+
+      __builtin_prefetch(src_ptr + 0 * src_stride);
+      __builtin_prefetch(src_ptr + 1 * src_stride);
+      __builtin_prefetch(src_ptr + 2 * src_stride);
+      __builtin_prefetch(src_ptr + 3 * src_stride);
+      __builtin_prefetch(src_ptr + 4 * src_stride);
+      __builtin_prefetch(src_ptr + 5 * src_stride);
+      __builtin_prefetch(src_ptr + 6 * src_stride);
+      __builtin_prefetch(src_ptr + 7 * src_stride);
+
+      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+      s = src_ptr + 7;
+      d_tmp = dst_ptr;
+      width = w;
+
+      __builtin_prefetch(dst_ptr + 0 * dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * dst_stride);
+      __builtin_prefetch(dst_ptr + 4 * dst_stride);
+      __builtin_prefetch(dst_ptr + 5 * dst_stride);
+      __builtin_prefetch(dst_ptr + 6 * dst_stride);
+      __builtin_prefetch(dst_ptr + 7 * dst_stride);
+
+      do {
+        load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
+        transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t1, t7));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t6));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t3, t5));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        res5 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t2, t8));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t3, t7));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t6));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        res6 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t3, t9));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t4, t8));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t5, t7));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        res7 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t4, t10));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t5, t9));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t6, t8));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t7));
+        res8 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t5, t11));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t6, t10));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t7, t9));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t8));
+        res9 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t6, t12));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t7, t11));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t8, t10));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t9));
+        res10 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                           bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t7, t13));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t8, t12));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t9, t11));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t10));
+        res11 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                           bd, conv_params->round_0);
+
+        transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
+                          &res11);
+        store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9,
+                      res10, res11);
+
+        t0 = t8;
+        t1 = t9;
+        t2 = t10;
+        t3 = t11;
+        t4 = t12;
+        t5 = t13;
+        t6 = t14;
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += 8 * src_stride;
+      dst_ptr += 8 * MAX_SB_SIZE;
+      height -= 8;
+    } while (height > 0);
+  } else {
+    /*if height is a multiple of 4*/
+    int16x8_t tt0, tt1, tt2, tt3;
+    const uint8_t *s;
+    uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
+    uint16x8_t d0, d1, d2, d3;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+    int16x4_t s11, s12, s13, s14;
+    uint8x8_t t0, t1, t2, t3;
+
+    do {
+      __builtin_prefetch(src_ptr + 0 * src_stride);
+      __builtin_prefetch(src_ptr + 1 * src_stride);
+      __builtin_prefetch(src_ptr + 2 * src_stride);
+      __builtin_prefetch(src_ptr + 3 * src_stride);
+
+      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
+      transpose_u8_8x4(&t0, &t1, &t2,
+                       &t3); /*first 8 pixels of 4 rows transposed-- 4x8*/
+
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+      s0 = vget_low_s16(tt0);  /*pa0 pb0 pc0 pd0 -- pixel_a0*/
+      s1 = vget_low_s16(tt1);  /*pa1 pb1 pc1 pd1 */
+      s2 = vget_low_s16(tt2);  /*pa2 pb2 pc2 pd2 */
+      s3 = vget_low_s16(tt3);  /*pa3 pb3 pc3 pd3 */
+      s4 = vget_high_s16(tt0); /*pa4 pb4 pc4 pd4 */
+      s5 = vget_high_s16(tt1); /*pa5 pb5 pc5 pd5 */
+      s6 = vget_high_s16(tt2); /*pa6 pb6 pc6 pd6 */
+
+      __builtin_prefetch(dst_ptr + 0 * dst_stride);
+      __builtin_prefetch(dst_ptr + 1 * dst_stride);
+      __builtin_prefetch(dst_ptr + 2 * dst_stride);
+      __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+      s = src_ptr + 7;
+      d_tmp = dst_ptr;
+      width = w;
+
+      do {
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
+        transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+        s7 = vget_low_s16(tt0); /*pa7  pb7  pc7  pd7  */ /*4x8*/
+        s8 = vget_low_s16(tt1);   /*pa8  pb8  pc8  pd8  */
+        s9 = vget_low_s16(tt2);   /*pa9  pb9  pc9  pd9  */
+        s10 = vget_low_s16(tt3);  /*pa10 pb10 pc10 pd10 */
+        s11 = vget_high_s16(tt0); /*pa11 pb11 pc11 pd11 */
+        s12 = vget_high_s16(tt1); /*pa12 pb12 pc12 pd12 */
+        s13 = vget_high_s16(tt2); /*pa13 pb13 pc13 pd13 */
+        s14 = vget_high_s16(tt3); /*pa14 pb14 pc14 pd14 */
+
+        res0 = wiener_convolve8_horiz_4x8(
+            s0, s1, s2, s3, s4, s5, s6, filter_x_tmp, bd, conv_params->round_0);
+        res1 = wiener_convolve8_horiz_4x8(
+            s1, s2, s3, s4, s5, s6, s7, filter_x_tmp, bd, conv_params->round_0);
+        res2 = wiener_convolve8_horiz_4x8(
+            s2, s3, s4, s5, s6, s7, s8, filter_x_tmp, bd, conv_params->round_0);
+        res3 = wiener_convolve8_horiz_4x8(
+            s3, s4, s5, s6, s7, s8, s9, filter_x_tmp, bd, conv_params->round_0);
+        res4 =
+            wiener_convolve8_horiz_4x8(s4, s5, s6, s7, s8, s9, s10,
+                                       filter_x_tmp, bd, conv_params->round_0);
+        res5 =
+            wiener_convolve8_horiz_4x8(s5, s6, s7, s8, s9, s10, s11,
+                                       filter_x_tmp, bd, conv_params->round_0);
+        res6 =
+            wiener_convolve8_horiz_4x8(s6, s7, s8, s9, s10, s11, s12,
+                                       filter_x_tmp, bd, conv_params->round_0);
+        res7 =
+            wiener_convolve8_horiz_4x8(s7, s8, s9, s10, s11, s12, s13,
+                                       filter_x_tmp, bd, conv_params->round_0);
+
+        transpose_u16_4x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+                          &res7, &d0, &d1, &d2, &d3);
+
+        store_u16_8x4(d_tmp, MAX_SB_SIZE, d0, d1, d2, d3);
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * MAX_SB_SIZE;
+      height -= 4;
+    } while (height > 0);
+  }
+
+  {
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+    uint8x8_t t0, t1, t2, t3;
+    int16_t *src_tmp_ptr, *s;
+    uint8_t *dst_tmp_ptr;
+    height = h;
+    width = w;
+    src_tmp_ptr = (int16_t *)temp;
+    dst_tmp_ptr = dst;
+    src_stride = MAX_SB_SIZE;
+
+    do {
+      s = src_tmp_ptr;
+      s0 = vld1q_s16(s);
+      s += src_stride;
+      s1 = vld1q_s16(s);
+      s += src_stride;
+      s2 = vld1q_s16(s);
+      s += src_stride;
+      s3 = vld1q_s16(s);
+      s += src_stride;
+      s4 = vld1q_s16(s);
+      s += src_stride;
+      s5 = vld1q_s16(s);
+      s += src_stride;
+      s6 = vld1q_s16(s);
+      s += src_stride;
+      d = dst_tmp_ptr;
+      height = h;
+
+      do {
+        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
+        __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
+        __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride);
+
+        s7 = vld1q_s16(s);
+        s += src_stride;
+        s8 = vld1q_s16(s);
+        s += src_stride;
+        s9 = vld1q_s16(s);
+        s += src_stride;
+        s10 = vld1q_s16(s);
+        s += src_stride;
+
+        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
+                                       bd, conv_params->round_1);
+        t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp,
+                                       bd, conv_params->round_1);
+        t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp,
+                                       bd, conv_params->round_1);
+        t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp,
+                                       bd, conv_params->round_1);
+
+        vst1_u8(d, t0);
+        d += dst_stride;
+        vst1_u8(d, t1);
+        d += dst_stride;
+        vst1_u8(d, t2);
+        d += dst_stride;
+        vst1_u8(d, t3);
+        d += dst_stride;
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        height -= 4;
+      } while (height > 3);
+
+      if (height != 0) {
+        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
+
+        do {
+          s7 = vld1q_s16(s);
+          s += src_stride;
+
+          t0 =
+              wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6,
+                                        filter_y_tmp, bd, conv_params->round_1);
+          vst1_u8(d, t0);
+          d += dst_stride;
+
+          s0 = s1;
+          s1 = s2;
+          s2 = s3;
+          s3 = s4;
+          s4 = s5;
+          s5 = s6;
+          s6 = s7;
+          height -= 1;
+        } while (height > 0);
+      }
+
+      src_tmp_ptr += 8;
+      dst_tmp_ptr += 8;
+
+      w -= 8;
+    } while (w > 0);
+  }
+}
diff --git a/third_party/aom/av1/common/av1_fwd_txfm1d.h b/third_party/aom/av1/common/av1_fwd_txfm1d.h
deleted file mode 100644
index f880239f7..000000000
--- a/third_party/aom/av1/common/av1_fwd_txfm1d.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_FWD_TXFM1D_H_
-#define AV1_FWD_TXFM1D_H_
-
-#include "av1/common/av1_txfm.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
-                   const int8_t *stage_range);
-void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
-                   const int8_t *stage_range);
-void av1_fdct16_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fdct32_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range);
-#if CONFIG_TX64X64
-void av1_fdct64_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range);
-#endif  // CONFIG_TX64X64
-
-void av1_fadst4_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fadst8_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fadst16_new(const int32_t *input, int32_t *output,
-                     const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fadst32_new(const int32_t *input, int32_t *output,
-                     const int8_t *cos_bit, const int8_t *stage_range);
-#if CONFIG_EXT_TX
-void av1_fidentity4_c(const int32_t *input, int32_t *output,
-                      const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fidentity8_c(const int32_t *input, int32_t *output,
-                      const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fidentity16_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fidentity32_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range);
-#if CONFIG_TX64X64
-void av1_fidentity64_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range);
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // AV1_FWD_TXFM1D_H_
diff --git a/third_party/aom/av1/common/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/common/av1_fwd_txfm1d_cfg.h
deleted file mode 100644
index f2ed93151..000000000
--- a/third_party/aom/av1/common/av1_fwd_txfm1d_cfg.h
+++ /dev/null
@@ -1,363 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_FWD_TXFM2D_CFG_H_
-#define AV1_FWD_TXFM2D_CFG_H_
-#include "av1/common/enums.h"
-#include "av1/common/av1_fwd_txfm1d.h"
-
-//  ---------------- 4x4 1D constants -----------------------
-// shift
-static const int8_t fwd_shift_4[3] = { 2, 0, 0 };
-
-// stage range
-static const int8_t fwd_stage_range_col_dct_4[4] = { 0, 1, 2, 2 };
-static const int8_t fwd_stage_range_row_dct_4[4] = { 2, 3, 3, 3 };
-static const int8_t fwd_stage_range_col_adst_4[6] = { 0, 0, 1, 2, 2, 2 };
-static const int8_t fwd_stage_range_row_adst_4[6] = { 2, 2, 2, 3, 3, 3 };
-static const int8_t fwd_stage_range_idx_4[1] = { 0 };
-
-// cos bit
-static const int8_t fwd_cos_bit_col_dct_4[4] = { 13, 13, 13, 13 };
-static const int8_t fwd_cos_bit_row_dct_4[4] = { 13, 13, 13, 13 };
-static const int8_t fwd_cos_bit_col_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
-static const int8_t fwd_cos_bit_row_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
-
-//  ---------------- 8x8 1D constants -----------------------
-// shift
-static const int8_t fwd_shift_8[3] = { 2, -1, 0 };
-
-// stage range
-static const int8_t fwd_stage_range_col_dct_8[6] = { 0, 1, 2, 3, 3, 3 };
-static const int8_t fwd_stage_range_row_dct_8[6] = { 3, 4, 5, 5, 5, 5 };
-static const int8_t fwd_stage_range_col_adst_8[8] = { 0, 0, 1, 2, 2, 3, 3, 3 };
-static const int8_t fwd_stage_range_row_adst_8[8] = { 3, 3, 3, 4, 4, 5, 5, 5 };
-static const int8_t fwd_stage_range_idx_8[1] = { 0 };
-
-// cos bit
-static const int8_t fwd_cos_bit_col_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
-static const int8_t fwd_cos_bit_row_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
-static const int8_t fwd_cos_bit_col_adst_8[8] = {
-  13, 13, 13, 13, 13, 13, 13, 13
-};
-static const int8_t fwd_cos_bit_row_adst_8[8] = {
-  13, 13, 13, 13, 13, 13, 13, 13
-};
-
-//  ---------------- 16x16 1D constants -----------------------
-// shift
-static const int8_t fwd_shift_16[3] = { 2, -2, 0 };
-
-// stage range
-static const int8_t fwd_stage_range_col_dct_16[8] = { 0, 1, 2, 3, 4, 4, 4, 4 };
-static const int8_t fwd_stage_range_row_dct_16[8] = { 4, 5, 6, 7, 7, 7, 7, 7 };
-static const int8_t fwd_stage_range_col_adst_16[10] = { 0, 0, 1, 2, 2,
-                                                        3, 3, 4, 4, 4 };
-static const int8_t fwd_stage_range_row_adst_16[10] = {
-  4, 4, 4, 5, 5, 6, 6, 7, 7, 7,
-};
-static const int8_t fwd_stage_range_idx_16[1] = { 0 };
-
-// cos bit
-static const int8_t fwd_cos_bit_col_dct_16[8] = {
-  13, 13, 13, 13, 13, 13, 13, 13
-};
-static const int8_t fwd_cos_bit_row_dct_16[8] = {
-  12, 12, 12, 12, 12, 12, 12, 12
-};
-static const int8_t fwd_cos_bit_col_adst_16[10] = { 13, 13, 13, 13, 13,
-                                                    13, 13, 13, 13, 13 };
-static const int8_t fwd_cos_bit_row_adst_16[10] = { 12, 12, 12, 12, 12,
-                                                    12, 12, 12, 12, 12 };
-
-//  ---------------- 32x32 1D constants -----------------------
-// shift
-static const int8_t fwd_shift_32[3] = { 2, -4, 0 };
-
-// stage range
-static const int8_t fwd_stage_range_col_dct_32[10] = { 0, 1, 2, 3, 4,
-                                                       5, 5, 5, 5, 5 };
-static const int8_t fwd_stage_range_row_dct_32[10] = { 5, 6, 7, 8, 9,
-                                                       9, 9, 9, 9, 9 };
-static const int8_t fwd_stage_range_col_adst_32[12] = { 0, 0, 1, 2, 2, 3,
-                                                        3, 4, 4, 5, 5, 5 };
-static const int8_t fwd_stage_range_row_adst_32[12] = { 5, 5, 5, 6, 6, 7,
-                                                        7, 8, 8, 9, 9, 9 };
-static const int8_t fwd_stage_range_idx_32[1] = { 0 };
-
-// cos bit
-static const int8_t fwd_cos_bit_col_dct_32[10] = { 12, 12, 12, 12, 12,
-                                                   12, 12, 12, 12, 12 };
-static const int8_t fwd_cos_bit_row_dct_32[10] = { 12, 12, 12, 12, 12,
-                                                   12, 12, 12, 12, 12 };
-static const int8_t fwd_cos_bit_col_adst_32[12] = { 12, 12, 12, 12, 12, 12,
-                                                    12, 12, 12, 12, 12, 12 };
-static const int8_t fwd_cos_bit_row_adst_32[12] = { 12, 12, 12, 12, 12, 12,
-                                                    12, 12, 12, 12, 12, 12 };
-
-//  ---------------- 64x64 1D constants -----------------------
-// shift
-static const int8_t fwd_shift_64[3] = { 0, -2, -2 };
-
-// stage range
-static const int8_t fwd_stage_range_col_dct_64[12] = { 0, 1, 2, 3, 4, 5,
-                                                       6, 6, 6, 6, 6, 6 };
-static const int8_t fwd_stage_range_row_dct_64[12] = { 6,  7,  8,  9,  10, 11,
-                                                       11, 11, 11, 11, 11, 11 };
-static const int8_t fwd_stage_range_idx_64[1] = { 0 };
-
-// cos bit
-static const int8_t fwd_cos_bit_col_dct_64[12] = { 15, 15, 15, 15, 15, 14,
-                                                   13, 13, 13, 13, 13, 13 };
-static const int8_t fwd_cos_bit_row_dct_64[12] = { 15, 14, 13, 12, 11, 10,
-                                                   10, 10, 10, 10, 10, 10 };
-
-//  ---------------- row config fwd_dct_4 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_dct_4 = {
-  4,  // .txfm_size
-  4,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_4,                // .shift
-  fwd_stage_range_row_dct_4,  // .stage_range
-  fwd_cos_bit_row_dct_4,      // .cos_bit
-  TXFM_TYPE_DCT4              // .txfm_type
-};
-
-//  ---------------- row config fwd_dct_8 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_dct_8 = {
-  8,  // .txfm_size
-  6,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_8,                // .shift
-  fwd_stage_range_row_dct_8,  // .stage_range
-  fwd_cos_bit_row_dct_8,      // .cos_bit_
-  TXFM_TYPE_DCT8              // .txfm_type
-};
-//  ---------------- row config fwd_dct_16 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_dct_16 = {
-  16,  // .txfm_size
-  8,   // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_16,                // .shift
-  fwd_stage_range_row_dct_16,  // .stage_range
-  fwd_cos_bit_row_dct_16,      // .cos_bit
-  TXFM_TYPE_DCT16              // .txfm_type
-};
-
-//  ---------------- row config fwd_dct_32 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_dct_32 = {
-  32,  // .txfm_size
-  10,  // .stage_num
-  // 1,  // .log_scale
-  fwd_shift_32,                // .shift
-  fwd_stage_range_row_dct_32,  // .stage_range
-  fwd_cos_bit_row_dct_32,      // .cos_bit_row
-  TXFM_TYPE_DCT32              // .txfm_type
-};
-
-//  ---------------- row config fwd_dct_64 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_dct_64 = {
-  64,                          // .txfm_size
-  12,                          // .stage_num
-  fwd_shift_64,                // .shift
-  fwd_stage_range_row_dct_64,  // .stage_range
-  fwd_cos_bit_row_dct_64,      // .cos_bit
-  TXFM_TYPE_DCT64,             // .txfm_type_col
-};
-
-//  ---------------- row config fwd_adst_4 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_adst_4 = {
-  4,  // .txfm_size
-  6,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_4,                 // .shift
-  fwd_stage_range_row_adst_4,  // .stage_range
-  fwd_cos_bit_row_adst_4,      // .cos_bit
-  TXFM_TYPE_ADST4,             // .txfm_type
-};
-
-//  ---------------- row config fwd_adst_8 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_adst_8 = {
-  8,  // .txfm_size
-  8,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_8,                 // .shift
-  fwd_stage_range_row_adst_8,  // .stage_range
-  fwd_cos_bit_row_adst_8,      // .cos_bit
-  TXFM_TYPE_ADST8,             // .txfm_type_col
-};
-
-//  ---------------- row config fwd_adst_16 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_adst_16 = {
-  16,  // .txfm_size
-  10,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_16,                 // .shift
-  fwd_stage_range_row_adst_16,  // .stage_range
-  fwd_cos_bit_row_adst_16,      // .cos_bit
-  TXFM_TYPE_ADST16,             // .txfm_type
-};
-
-//  ---------------- row config fwd_adst_32 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_row_cfg_adst_32 = {
-  32,  // .txfm_size
-  12,  // .stage_num
-  // 1,  // .log_scale
-  fwd_shift_32,                 // .shift
-  fwd_stage_range_row_adst_32,  // .stage_range
-  fwd_cos_bit_row_adst_32,      // .cos_bit
-  TXFM_TYPE_ADST32,             // .txfm_type
-};
-
-//  ---------------- col config fwd_dct_4 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_dct_4 = {
-  4,  // .txfm_size
-  4,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_4,                // .shift
-  fwd_stage_range_col_dct_4,  // .stage_range
-  fwd_cos_bit_col_dct_4,      // .cos_bit
-  TXFM_TYPE_DCT4              // .txfm_type
-};
-
-//  ---------------- col config fwd_dct_8 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_dct_8 = {
-  8,  // .txfm_size
-  6,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_8,                // .shift
-  fwd_stage_range_col_dct_8,  // .stage_range
-  fwd_cos_bit_col_dct_8,      // .cos_bit_
-  TXFM_TYPE_DCT8              // .txfm_type
-};
-//  ---------------- col config fwd_dct_16 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_dct_16 = {
-  16,  // .txfm_size
-  8,   // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_16,                // .shift
-  fwd_stage_range_col_dct_16,  // .stage_range
-  fwd_cos_bit_col_dct_16,      // .cos_bit
-  TXFM_TYPE_DCT16              // .txfm_type
-};
-
-//  ---------------- col config fwd_dct_32 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_dct_32 = {
-  32,  // .txfm_size
-  10,  // .stage_num
-  // 1,  // .log_scale
-  fwd_shift_32,                // .shift
-  fwd_stage_range_col_dct_32,  // .stage_range
-  fwd_cos_bit_col_dct_32,      // .cos_bit_col
-  TXFM_TYPE_DCT32              // .txfm_type
-};
-
-//  ---------------- col config fwd_dct_64 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_dct_64 = {
-  64,                          // .txfm_size
-  12,                          // .stage_num
-  fwd_shift_64,                // .shift
-  fwd_stage_range_col_dct_64,  // .stage_range
-  fwd_cos_bit_col_dct_64,      // .cos_bit
-  TXFM_TYPE_DCT64,             // .txfm_type_col
-};
-
-//  ---------------- col config fwd_adst_4 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_adst_4 = {
-  4,  // .txfm_size
-  6,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_4,                 // .shift
-  fwd_stage_range_col_adst_4,  // .stage_range
-  fwd_cos_bit_col_adst_4,      // .cos_bit
-  TXFM_TYPE_ADST4,             // .txfm_type
-};
-
-//  ---------------- col config fwd_adst_8 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_adst_8 = {
-  8,  // .txfm_size
-  8,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_8,                 // .shift
-  fwd_stage_range_col_adst_8,  // .stage_range
-  fwd_cos_bit_col_adst_8,      // .cos_bit
-  TXFM_TYPE_ADST8,             // .txfm_type_col
-};
-
-//  ---------------- col config fwd_adst_16 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_adst_16 = {
-  16,  // .txfm_size
-  10,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_16,                 // .shift
-  fwd_stage_range_col_adst_16,  // .stage_range
-  fwd_cos_bit_col_adst_16,      // .cos_bit
-  TXFM_TYPE_ADST16,             // .txfm_type
-};
-
-//  ---------------- col config fwd_adst_32 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_col_cfg_adst_32 = {
-  32,  // .txfm_size
-  12,  // .stage_num
-  // 1,  // .log_scale
-  fwd_shift_32,                 // .shift
-  fwd_stage_range_col_adst_32,  // .stage_range
-  fwd_cos_bit_col_adst_32,      // .cos_bit
-  TXFM_TYPE_ADST32,             // .txfm_type
-};
-
-#if CONFIG_EXT_TX
-// identity does not need to differentiate between row and col
-//  ---------------- row/col config fwd_identity_4 ----------
-static const TXFM_1D_CFG fwd_txfm_1d_cfg_identity_4 = {
-  4,  // .txfm_size
-  1,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_4,            // .shift
-  fwd_stage_range_idx_4,  // .stage_range
-  NULL,                   // .cos_bit
-  TXFM_TYPE_IDENTITY4,    // .txfm_type
-};
-
-//  ---------------- row/col config fwd_identity_8 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_cfg_identity_8 = {
-  8,  // .txfm_size
-  1,  // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_8,            // .shift
-  fwd_stage_range_idx_8,  // .stage_range
-  NULL,                   // .cos_bit
-  TXFM_TYPE_IDENTITY8,    // .txfm_type
-};
-
-//  ---------------- row/col config fwd_identity_16 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_cfg_identity_16 = {
-  16,  // .txfm_size
-  1,   // .stage_num
-  // 0,  // .log_scale
-  fwd_shift_16,            // .shift
-  fwd_stage_range_idx_16,  // .stage_range
-  NULL,                    // .cos_bit
-  TXFM_TYPE_IDENTITY16,    // .txfm_type
-};
-
-//  ---------------- row/col config fwd_identity_32 ----------------
-static const TXFM_1D_CFG fwd_txfm_1d_cfg_identity_32 = {
-  32,  // .txfm_size
-  1,   // .stage_num
-  // 1,  // .log_scale
-  fwd_shift_32,            // .shift
-  fwd_stage_range_idx_32,  // .stage_range
-  NULL,                    // .cos_bit
-  TXFM_TYPE_IDENTITY32,    // .txfm_type
-};
-#endif  // CONFIG_EXT_TX
-#endif  // AV1_FWD_TXFM2D_CFG_H_
diff --git a/third_party/aom/av1/common/av1_fwd_txfm2d.c b/third_party/aom/av1/common/av1_fwd_txfm2d.c
deleted file mode 100644
index 740c63322..000000000
--- a/third_party/aom/av1/common/av1_fwd_txfm2d.c
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "./av1_rtcd.h"
-#include "aom_dsp/txfm_common.h"
-#include "av1/common/enums.h"
-#include "av1/common/av1_fwd_txfm1d.h"
-#include "av1/common/av1_fwd_txfm1d_cfg.h"
-#include "av1/common/av1_txfm.h"
-
-static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
-  switch (txfm_type) {
-    case TXFM_TYPE_DCT4: return av1_fdct4_new;
-    case TXFM_TYPE_DCT8: return av1_fdct8_new;
-    case TXFM_TYPE_DCT16: return av1_fdct16_new;
-    case TXFM_TYPE_DCT32: return av1_fdct32_new;
-#if CONFIG_TX64X64
-    case TXFM_TYPE_DCT64: return av1_fdct64_new;
-#endif  // CONFIG_TX64X64
-    case TXFM_TYPE_ADST4: return av1_fadst4_new;
-    case TXFM_TYPE_ADST8: return av1_fadst8_new;
-    case TXFM_TYPE_ADST16: return av1_fadst16_new;
-    case TXFM_TYPE_ADST32: return av1_fadst32_new;
-#if CONFIG_EXT_TX
-    case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c;
-    case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c;
-    case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c;
-    case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c;
-#if CONFIG_TX64X64
-    case TXFM_TYPE_IDENTITY64: return av1_fidentity64_c;
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
-    default: assert(0); return NULL;
-  }
-}
-
-void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
-                             const TXFM_2D_FLIP_CFG *cfg, int bd) {
-  // Note when assigning txfm_size_col, we use the txfm_size from the
-  // row configuration and vice versa. This is intentionally done to
-  // accurately perform rectangular transforms. When the transform is
-  // rectangular, the number of columns will be the same as the
-  // txfm_size stored in the row cfg struct. It will make no difference
-  // for square transforms.
-  const int txfm_size_col = cfg->row_cfg->txfm_size;
-  const int txfm_size_row = cfg->col_cfg->txfm_size;
-  // Take the shift from the larger dimension in the rectangular case.
-  const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift
-                                                        : cfg->col_cfg->shift;
-  // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
-  for (int i = 0; i < cfg->col_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) {
-    stage_range_col[i] = cfg->col_cfg->stage_range[i] + shift[0] + bd + 1;
-  }
-
-  // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
-  for (int i = 0; i < cfg->row_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) {
-    stage_range_row[i] =
-        cfg->row_cfg->stage_range[i] + shift[0] + shift[1] + bd + 1;
-  }
-}
-
-static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
-                                const int stride, const TXFM_2D_FLIP_CFG *cfg,
-                                int32_t *buf, int bd) {
-  int c, r;
-  // Note when assigning txfm_size_col, we use the txfm_size from the
-  // row configuration and vice versa. This is intentionally done to
-  // accurately perform rectangular transforms. When the transform is
-  // rectangular, the number of columns will be the same as the
-  // txfm_size stored in the row cfg struct. It will make no difference
-  // for square transforms.
-  const int txfm_size_col = cfg->row_cfg->txfm_size;
-  const int txfm_size_row = cfg->col_cfg->txfm_size;
-  // Take the shift from the larger dimension in the rectangular case.
-  const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift
-                                                        : cfg->col_cfg->shift;
-  int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
-  int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
-  assert(cfg->col_cfg->stage_num <= MAX_TXFM_STAGE_NUM);
-  assert(cfg->row_cfg->stage_num <= MAX_TXFM_STAGE_NUM);
-  av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd);
-
-  const int8_t *cos_bit_col = cfg->col_cfg->cos_bit;
-  const int8_t *cos_bit_row = cfg->row_cfg->cos_bit;
-  const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->col_cfg->txfm_type);
-  const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->row_cfg->txfm_type);
-
-  // use output buffer as temp buffer
-  int32_t *temp_in = output;
-  int32_t *temp_out = output + txfm_size_row;
-
-  // Columns
-  for (c = 0; c < txfm_size_col; ++c) {
-    if (cfg->ud_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c];
-    } else {
-      for (r = 0; r < txfm_size_row; ++r)
-        // flip upside down
-        temp_in[r] = input[(txfm_size_row - r - 1) * stride + c];
-    }
-    round_shift_array(temp_in, txfm_size_row, -shift[0]);
-    // Multiply everything by Sqrt2 on the larger dimension if the
-    // transform is rectangular
-    if (txfm_size_col > txfm_size_row) {
-      for (r = 0; r < txfm_size_row; ++r)
-        temp_in[r] = (int32_t)fdct_round_shift(temp_in[r] * Sqrt2);
-    }
-    txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
-    round_shift_array(temp_out, txfm_size_row, -shift[1]);
-    if (cfg->lr_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r)
-        buf[r * txfm_size_col + c] = temp_out[r];
-    } else {
-      for (r = 0; r < txfm_size_row; ++r)
-        // flip from left to right
-        buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
-    }
-  }
-
-  // Rows
-  for (r = 0; r < txfm_size_row; ++r) {
-    // Multiply everything by Sqrt2 on the larger dimension if the
-    // transform is rectangular
-    if (txfm_size_row > txfm_size_col) {
-      for (c = 0; c < txfm_size_col; ++c)
-        buf[r * txfm_size_col + c] =
-            (int32_t)fdct_round_shift(buf[r * txfm_size_col + c] * Sqrt2);
-    }
-    txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col,
-                  cos_bit_row, stage_range_row);
-    round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]);
-  }
-}
-
-void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride,
-                          TX_TYPE tx_type, int bd) {
-#if CONFIG_TXMG
-  int32_t txfm_buf[4 * 8];
-  int16_t rinput[4 * 8];
-  TX_SIZE tx_size = TX_4X8;
-  TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
-  TX_TYPE rtx_type = av1_rotate_tx_type(tx_type);
-  int w = tx_size_wide[tx_size];
-  int h = tx_size_high[tx_size];
-  int rw = h;
-  int rh = w;
-  transpose_int16(rinput, rw, input, stride, w, h);
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(rtx_type, rtx_size);
-  fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd);
-  transpose_int32(output, w, txfm_buf, rw, rw, rh);
-#else
-  int32_t txfm_buf[4 * 8];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_4X8);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-#endif
-}
-
-void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride,
-                          TX_TYPE tx_type, int bd) {
-  int32_t txfm_buf[8 * 4];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_8X4);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-}
-
-void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride,
-                           TX_TYPE tx_type, int bd) {
-#if CONFIG_TXMG
-  int32_t txfm_buf[8 * 16];
-  int16_t rinput[8 * 16];
-  TX_SIZE tx_size = TX_8X16;
-  TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
-  TX_TYPE rtx_type = av1_rotate_tx_type(tx_type);
-  int w = tx_size_wide[tx_size];
-  int h = tx_size_high[tx_size];
-  int rw = h;
-  int rh = w;
-  transpose_int16(rinput, rw, input, stride, w, h);
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(rtx_type, rtx_size);
-  fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd);
-  transpose_int32(output, w, txfm_buf, rw, rw, rh);
-#else
-  int32_t txfm_buf[8 * 16];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_8X16);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-#endif
-}
-
-void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride,
-                           TX_TYPE tx_type, int bd) {
-  int32_t txfm_buf[16 * 8];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_16X8);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-}
-
-void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride,
-                            TX_TYPE tx_type, int bd) {
-#if CONFIG_TXMG
-  int32_t txfm_buf[16 * 32];
-  int16_t rinput[16 * 32];
-  TX_SIZE tx_size = TX_16X32;
-  TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
-  TX_TYPE rtx_type = av1_rotate_tx_type(tx_type);
-  int w = tx_size_wide[tx_size];
-  int h = tx_size_high[tx_size];
-  int rw = h;
-  int rh = w;
-  transpose_int16(rinput, rw, input, stride, w, h);
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(rtx_type, rtx_size);
-  fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd);
-  transpose_int32(output, w, txfm_buf, rw, rw, rh);
-#else
-  int32_t txfm_buf[16 * 32];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_16X32);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-#endif
-}
-
-void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
-                            TX_TYPE tx_type, int bd) {
-  int32_t txfm_buf[32 * 16];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X16);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-}
-
-void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride,
-                          TX_TYPE tx_type, int bd) {
-  int32_t txfm_buf[4 * 4];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_4X4);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-}
-
-void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride,
-                          TX_TYPE tx_type, int bd) {
-  int32_t txfm_buf[8 * 8];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_8X8);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-}
-
-void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride,
-                            TX_TYPE tx_type, int bd) {
-  int32_t txfm_buf[16 * 16];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_16X16);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-}
-
-void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride,
-                            TX_TYPE tx_type, int bd) {
-  int32_t txfm_buf[32 * 32];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-}
-
-#if CONFIG_TX64X64
-void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
-                            TX_TYPE tx_type, int bd) {
-  int32_t txfm_buf[64 * 64];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-}
-
-void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
-                            TX_TYPE tx_type, int bd) {
-  int32_t txfm_buf[32 * 64];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_32x64_cfg(tx_type);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-}
-
-void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
-                            TX_TYPE tx_type, int bd) {
-  int32_t txfm_buf[64 * 32];
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x32_cfg(tx_type);
-  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
-}
-#endif  // CONFIG_TX64X64
-
-static const TXFM_1D_CFG *fwd_txfm_col_cfg_ls[TX_TYPES_1D][TX_SIZES] = {
-  // DCT
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &fwd_txfm_1d_col_cfg_dct_4, &fwd_txfm_1d_col_cfg_dct_8,
-      &fwd_txfm_1d_col_cfg_dct_16, &fwd_txfm_1d_col_cfg_dct_32 },
-  // ADST
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &fwd_txfm_1d_col_cfg_adst_4, &fwd_txfm_1d_col_cfg_adst_8,
-      &fwd_txfm_1d_col_cfg_adst_16, &fwd_txfm_1d_col_cfg_adst_32 },
-#if CONFIG_EXT_TX
-  // FLIPADST
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &fwd_txfm_1d_col_cfg_adst_4, &fwd_txfm_1d_col_cfg_adst_8,
-      &fwd_txfm_1d_col_cfg_adst_16, &fwd_txfm_1d_col_cfg_adst_32 },
-  // IDENTITY
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &fwd_txfm_1d_cfg_identity_4, &fwd_txfm_1d_cfg_identity_8,
-      &fwd_txfm_1d_cfg_identity_16, &fwd_txfm_1d_cfg_identity_32 },
-#endif  // CONFIG_EXT_TX
-};
-
-static const TXFM_1D_CFG *fwd_txfm_row_cfg_ls[TX_TYPES_1D][TX_SIZES] = {
-  // DCT
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &fwd_txfm_1d_row_cfg_dct_4, &fwd_txfm_1d_row_cfg_dct_8,
-      &fwd_txfm_1d_row_cfg_dct_16, &fwd_txfm_1d_row_cfg_dct_32 },
-  // ADST
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &fwd_txfm_1d_row_cfg_adst_4, &fwd_txfm_1d_row_cfg_adst_8,
-      &fwd_txfm_1d_row_cfg_adst_16, &fwd_txfm_1d_row_cfg_adst_32 },
-#if CONFIG_EXT_TX
-  // FLIPADST
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &fwd_txfm_1d_row_cfg_adst_4, &fwd_txfm_1d_row_cfg_adst_8,
-      &fwd_txfm_1d_row_cfg_adst_16, &fwd_txfm_1d_row_cfg_adst_32 },
-  // IDENTITY
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &fwd_txfm_1d_cfg_identity_4, &fwd_txfm_1d_cfg_identity_8,
-      &fwd_txfm_1d_cfg_identity_16, &fwd_txfm_1d_cfg_identity_32 },
-#endif  // CONFIG_EXT_TX
-};
-
-TXFM_2D_FLIP_CFG av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size) {
-  TXFM_2D_FLIP_CFG cfg;
-  set_flip_cfg(tx_type, &cfg);
-  const TX_TYPE_1D tx_type_col = vtx_tab[tx_type];
-  const TX_TYPE_1D tx_type_row = htx_tab[tx_type];
-  const TX_SIZE tx_size_col = txsize_vert_map[tx_size];
-  const TX_SIZE tx_size_row = txsize_horz_map[tx_size];
-  cfg.col_cfg = fwd_txfm_col_cfg_ls[tx_type_col][tx_size_col];
-  cfg.row_cfg = fwd_txfm_row_cfg_ls[tx_type_row][tx_size_row];
-  return cfg;
-}
-
-#if CONFIG_TX64X64
-TXFM_2D_FLIP_CFG av1_get_fwd_txfm_32x64_cfg(TX_TYPE tx_type) {
-  TXFM_2D_FLIP_CFG cfg;
-  const TX_TYPE_1D tx_type_row = htx_tab[tx_type];
-  const TX_SIZE tx_size_row = txsize_horz_map[TX_32X64];
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg.col_cfg = &fwd_txfm_1d_col_cfg_dct_64;
-      cfg.row_cfg = fwd_txfm_row_cfg_ls[tx_type_row][tx_size_row];
-      cfg.ud_flip = 0;
-      cfg.lr_flip = 0;
-      break;
-    default: assert(0);
-  }
-  return cfg;
-}
-
-TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x32_cfg(TX_TYPE tx_type) {
-  TXFM_2D_FLIP_CFG cfg;
-  const TX_TYPE_1D tx_type_col = vtx_tab[tx_type];
-  const TX_SIZE tx_size_col = txsize_vert_map[TX_64X32];
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg.col_cfg = fwd_txfm_col_cfg_ls[tx_type_col][tx_size_col];
-      cfg.row_cfg = &fwd_txfm_1d_row_cfg_dct_64;
-      cfg.ud_flip = 0;
-      cfg.lr_flip = 0;
-      break;
-    default: assert(0);
-  }
-  return cfg;
-}
-
-TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x64_cfg(TX_TYPE tx_type) {
-  TXFM_2D_FLIP_CFG cfg;
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg.col_cfg = &fwd_txfm_1d_col_cfg_dct_64;
-      cfg.row_cfg = &fwd_txfm_1d_row_cfg_dct_64;
-      cfg.ud_flip = 0;
-      cfg.lr_flip = 0;
-      break;
-    default:
-      cfg.ud_flip = 0;
-      cfg.lr_flip = 0;
-      assert(0);
-  }
-  return cfg;
-}
-#endif  // CONFIG_TX64X64
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c
index 51f4b6362..8514dc64c 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d.c
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.c
@@ -10,28 +10,28 @@
  */
 
 #include <stdlib.h>
-#include "aom_dsp/inv_txfm.h"
 #include "av1/common/av1_inv_txfm1d.h"
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
 
-void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf,
-                      int32_t size, int8_t bit) {
-  const int64_t maxValue = (1LL << (bit - 1)) - 1;
-  const int64_t minValue = -(1LL << (bit - 1));
+static void range_check_buf(int32_t stage, const int32_t *input,
+                            const int32_t *buf, int32_t size, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  const int64_t max_value = (1LL << (bit - 1)) - 1;
+  const int64_t min_value = -(1LL << (bit - 1));
 
   int in_range = 1;
 
   for (int i = 0; i < size; ++i) {
-    if (buf[i] < minValue || buf[i] > maxValue) {
+    if (buf[i] < min_value || buf[i] > max_value) {
       in_range = 0;
     }
   }
 
   if (!in_range) {
     fprintf(stderr, "Error: coeffs contain out-of-range values\n");
+    fprintf(stderr, "size: %d\n", size);
     fprintf(stderr, "stage: %d\n", stage);
-    fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", minValue,
-            maxValue);
+    fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
+            max_value);
 
     fprintf(stderr, "coeffs: ");
 
@@ -53,81 +53,73 @@ void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf,
   }
 
   assert(in_range);
-}
-
-#define range_check(stage, input, buf, size, bit) \
-  range_check_func(stage, input, buf, size, bit)
 #else
-#define range_check(stage, input, buf, size, bit) \
-  {                                               \
-    (void)stage;                                  \
-    (void)input;                                  \
-    (void)buf;                                    \
-    (void)size;                                   \
-    (void)bit;                                    \
-  }
+  (void)stage;
+  (void)input;
+  (void)buf;
+  (void)size;
+  (void)bit;
 #endif
+}
 
 // TODO(angiebird): Make 1-d txfm functions static
-void av1_idct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+//
+
+void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
+  assert(output != input);
   const int32_t size = 4;
-  const int32_t *cospi;
+  const int32_t *cospi = cospi_arr(cos_bit);
 
   int32_t stage = 0;
   int32_t *bf0, *bf1;
   int32_t step[4];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
-  assert(output != input);
   bf1 = output;
   bf1[0] = input[0];
   bf1[1] = input[2];
   bf1[2] = input[1];
   bf1[3] = input[3];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[3];
-  bf1[1] = bf0[1] + bf0[2];
-  bf1[2] = bf0[1] - bf0[2];
-  bf1[3] = bf0[0] - bf0[3];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
 }
 
-void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
+  assert(output != input);
   const int32_t size = 8;
-  const int32_t *cospi;
+  const int32_t *cospi = cospi_arr(cos_bit);
 
   int32_t stage = 0;
   int32_t *bf0, *bf1;
   int32_t step[8];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
-  assert(output != input);
   bf1 = output;
   bf1[0] = input[0];
   bf1[1] = input[4];
@@ -137,83 +129,78 @@ void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
   bf1[5] = input[5];
   bf1[6] = input[3];
   bf1[7] = input[7];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
-  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
-  bf1[4] = bf0[4] + bf0[5];
-  bf1[5] = bf0[4] - bf0[5];
-  bf1[6] = -bf0[6] + bf0[7];
-  bf1[7] = bf0[6] + bf0[7];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = bf0[0] + bf0[3];
-  bf1[1] = bf0[1] + bf0[2];
-  bf1[2] = bf0[1] - bf0[2];
-  bf1[3] = bf0[0] - bf0[3];
+  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
   bf1[4] = bf0[4];
-  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   bf1[7] = bf0[7];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[7];
-  bf1[1] = bf0[1] + bf0[6];
-  bf1[2] = bf0[2] + bf0[5];
-  bf1[3] = bf0[3] + bf0[4];
-  bf1[4] = bf0[3] - bf0[4];
-  bf1[5] = bf0[2] - bf0[5];
-  bf1[6] = bf0[1] - bf0[6];
-  bf1[7] = bf0[0] - bf0[7];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
 }
 
-void av1_idct16_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
+  assert(output != input);
   const int32_t size = 16;
-  const int32_t *cospi;
+  const int32_t *cospi = cospi_arr(cos_bit);
 
   int32_t stage = 0;
   int32_t *bf0, *bf1;
   int32_t step[16];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
-  assert(output != input);
   bf1 = output;
   bf1[0] = input[0];
   bf1[1] = input[8];
@@ -231,11 +218,10 @@ void av1_idct16_new(const int32_t *input, int32_t *output,
   bf1[13] = input[11];
   bf1[14] = input[7];
   bf1[15] = input[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
@@ -246,146 +232,140 @@ void av1_idct16_new(const int32_t *input, int32_t *output,
   bf1[5] = bf0[5];
   bf1[6] = bf0[6];
   bf1[7] = bf0[7];
-  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
-  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
-  bf1[8] = bf0[8] + bf0[9];
-  bf1[9] = bf0[8] - bf0[9];
-  bf1[10] = -bf0[10] + bf0[11];
-  bf1[11] = bf0[10] + bf0[11];
-  bf1[12] = bf0[12] + bf0[13];
-  bf1[13] = bf0[12] - bf0[13];
-  bf1[14] = -bf0[14] + bf0[15];
-  bf1[15] = bf0[14] + bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+  bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+  bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+  bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
-  bf1[4] = bf0[4] + bf0[5];
-  bf1[5] = bf0[4] - bf0[5];
-  bf1[6] = -bf0[6] + bf0[7];
-  bf1[7] = bf0[6] + bf0[7];
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
   bf1[8] = bf0[8];
-  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
   bf1[11] = bf0[11];
   bf1[12] = bf0[12];
-  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
   bf1[15] = bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[3];
-  bf1[1] = bf0[1] + bf0[2];
-  bf1[2] = bf0[1] - bf0[2];
-  bf1[3] = bf0[0] - bf0[3];
+  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
   bf1[4] = bf0[4];
-  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   bf1[7] = bf0[7];
-  bf1[8] = bf0[8] + bf0[11];
-  bf1[9] = bf0[9] + bf0[10];
-  bf1[10] = bf0[9] - bf0[10];
-  bf1[11] = bf0[8] - bf0[11];
-  bf1[12] = -bf0[12] + bf0[15];
-  bf1[13] = -bf0[13] + bf0[14];
-  bf1[14] = bf0[13] + bf0[14];
-  bf1[15] = bf0[12] + bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+  bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+  bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = bf0[0] + bf0[7];
-  bf1[1] = bf0[1] + bf0[6];
-  bf1[2] = bf0[2] + bf0[5];
-  bf1[3] = bf0[3] + bf0[4];
-  bf1[4] = bf0[3] - bf0[4];
-  bf1[5] = bf0[2] - bf0[5];
-  bf1[6] = bf0[1] - bf0[6];
-  bf1[7] = bf0[0] - bf0[7];
+  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
   bf1[8] = bf0[8];
   bf1[9] = bf0[9];
-  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[15];
-  bf1[1] = bf0[1] + bf0[14];
-  bf1[2] = bf0[2] + bf0[13];
-  bf1[3] = bf0[3] + bf0[12];
-  bf1[4] = bf0[4] + bf0[11];
-  bf1[5] = bf0[5] + bf0[10];
-  bf1[6] = bf0[6] + bf0[9];
-  bf1[7] = bf0[7] + bf0[8];
-  bf1[8] = bf0[7] - bf0[8];
-  bf1[9] = bf0[6] - bf0[9];
-  bf1[10] = bf0[5] - bf0[10];
-  bf1[11] = bf0[4] - bf0[11];
-  bf1[12] = bf0[3] - bf0[12];
-  bf1[13] = bf0[2] - bf0[13];
-  bf1[14] = bf0[1] - bf0[14];
-  bf1[15] = bf0[0] - bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+  bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
 }
 
-void av1_idct32_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
+  assert(output != input);
   const int32_t size = 32;
-  const int32_t *cospi;
+  const int32_t *cospi = cospi_arr(cos_bit);
 
   int32_t stage = 0;
   int32_t *bf0, *bf1;
   int32_t step[32];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
-  assert(output != input);
   bf1 = output;
   bf1[0] = input[0];
   bf1[1] = input[16];
@@ -419,11 +399,10 @@ void av1_idct32_new(const int32_t *input, int32_t *output,
   bf1[29] = input[23];
   bf1[30] = input[15];
   bf1[31] = input[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
@@ -442,27 +421,26 @@ void av1_idct32_new(const int32_t *input, int32_t *output,
   bf1[13] = bf0[13];
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
-  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
-  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
-  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
-  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
-  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
-  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
-  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
-  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
-  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
+  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
+  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
+  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
+  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
+  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
+  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
+  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
+  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
+  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
+  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
+  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
+  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
+  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
+  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
+  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0];
@@ -473,572 +451,506 @@ void av1_idct32_new(const int32_t *input, int32_t *output,
   bf1[5] = bf0[5];
   bf1[6] = bf0[6];
   bf1[7] = bf0[7];
-  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
-  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
-  bf1[16] = bf0[16] + bf0[17];
-  bf1[17] = bf0[16] - bf0[17];
-  bf1[18] = -bf0[18] + bf0[19];
-  bf1[19] = bf0[18] + bf0[19];
-  bf1[20] = bf0[20] + bf0[21];
-  bf1[21] = bf0[20] - bf0[21];
-  bf1[22] = -bf0[22] + bf0[23];
-  bf1[23] = bf0[22] + bf0[23];
-  bf1[24] = bf0[24] + bf0[25];
-  bf1[25] = bf0[24] - bf0[25];
-  bf1[26] = -bf0[26] + bf0[27];
-  bf1[27] = bf0[26] + bf0[27];
-  bf1[28] = bf0[28] + bf0[29];
-  bf1[29] = bf0[28] - bf0[29];
-  bf1[30] = -bf0[30] + bf0[31];
-  bf1[31] = bf0[30] + bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+  bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
+  bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
+  bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
+  bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
+  bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
+  bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
+  bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
+  bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
+  bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
+  bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
+  bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
+  bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
+  bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
+  bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
+  bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
+  bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
-  bf1[8] = bf0[8] + bf0[9];
-  bf1[9] = bf0[8] - bf0[9];
-  bf1[10] = -bf0[10] + bf0[11];
-  bf1[11] = bf0[10] + bf0[11];
-  bf1[12] = bf0[12] + bf0[13];
-  bf1[13] = bf0[12] - bf0[13];
-  bf1[14] = -bf0[14] + bf0[15];
-  bf1[15] = bf0[14] + bf0[15];
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+  bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+  bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+  bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
   bf1[16] = bf0[16];
-  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
-  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
   bf1[19] = bf0[19];
   bf1[20] = bf0[20];
-  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
   bf1[23] = bf0[23];
   bf1[24] = bf0[24];
-  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
+  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
+  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
   bf1[27] = bf0[27];
   bf1[28] = bf0[28];
-  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
-  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
+  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
+  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
   bf1[31] = bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
-  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
-  bf1[4] = bf0[4] + bf0[5];
-  bf1[5] = bf0[4] - bf0[5];
-  bf1[6] = -bf0[6] + bf0[7];
-  bf1[7] = bf0[6] + bf0[7];
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
   bf1[8] = bf0[8];
-  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
   bf1[11] = bf0[11];
   bf1[12] = bf0[12];
-  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
   bf1[15] = bf0[15];
-  bf1[16] = bf0[16] + bf0[19];
-  bf1[17] = bf0[17] + bf0[18];
-  bf1[18] = bf0[17] - bf0[18];
-  bf1[19] = bf0[16] - bf0[19];
-  bf1[20] = -bf0[20] + bf0[23];
-  bf1[21] = -bf0[21] + bf0[22];
-  bf1[22] = bf0[21] + bf0[22];
-  bf1[23] = bf0[20] + bf0[23];
-  bf1[24] = bf0[24] + bf0[27];
-  bf1[25] = bf0[25] + bf0[26];
-  bf1[26] = bf0[25] - bf0[26];
-  bf1[27] = bf0[24] - bf0[27];
-  bf1[28] = -bf0[28] + bf0[31];
-  bf1[29] = -bf0[29] + bf0[30];
-  bf1[30] = bf0[29] + bf0[30];
-  bf1[31] = bf0[28] + bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
+  bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
+  bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
+  bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
+  bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
+  bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
+  bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
+  bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
+  bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
+  bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
+  bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
+  bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
+  bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
+  bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
+  bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
+  bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = bf0[0] + bf0[3];
-  bf1[1] = bf0[1] + bf0[2];
-  bf1[2] = bf0[1] - bf0[2];
-  bf1[3] = bf0[0] - bf0[3];
+  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
   bf1[4] = bf0[4];
-  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   bf1[7] = bf0[7];
-  bf1[8] = bf0[8] + bf0[11];
-  bf1[9] = bf0[9] + bf0[10];
-  bf1[10] = bf0[9] - bf0[10];
-  bf1[11] = bf0[8] - bf0[11];
-  bf1[12] = -bf0[12] + bf0[15];
-  bf1[13] = -bf0[13] + bf0[14];
-  bf1[14] = bf0[13] + bf0[14];
-  bf1[15] = bf0[12] + bf0[15];
+  bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+  bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+  bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
   bf1[16] = bf0[16];
   bf1[17] = bf0[17];
-  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
-  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
-  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
   bf1[22] = bf0[22];
   bf1[23] = bf0[23];
   bf1[24] = bf0[24];
   bf1[25] = bf0[25];
-  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
-  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
-  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
+  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
+  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
+  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
+  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[7];
-  bf1[1] = bf0[1] + bf0[6];
-  bf1[2] = bf0[2] + bf0[5];
-  bf1[3] = bf0[3] + bf0[4];
-  bf1[4] = bf0[3] - bf0[4];
-  bf1[5] = bf0[2] - bf0[5];
-  bf1[6] = bf0[1] - bf0[6];
-  bf1[7] = bf0[0] - bf0[7];
+  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
   bf1[8] = bf0[8];
   bf1[9] = bf0[9];
-  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
-  bf1[16] = bf0[16] + bf0[23];
-  bf1[17] = bf0[17] + bf0[22];
-  bf1[18] = bf0[18] + bf0[21];
-  bf1[19] = bf0[19] + bf0[20];
-  bf1[20] = bf0[19] - bf0[20];
-  bf1[21] = bf0[18] - bf0[21];
-  bf1[22] = bf0[17] - bf0[22];
-  bf1[23] = bf0[16] - bf0[23];
-  bf1[24] = -bf0[24] + bf0[31];
-  bf1[25] = -bf0[25] + bf0[30];
-  bf1[26] = -bf0[26] + bf0[29];
-  bf1[27] = -bf0[27] + bf0[28];
-  bf1[28] = bf0[27] + bf0[28];
-  bf1[29] = bf0[26] + bf0[29];
-  bf1[30] = bf0[25] + bf0[30];
-  bf1[31] = bf0[24] + bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
+  bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
+  bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
+  bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
+  bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
+  bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
+  bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
+  bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
+  bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
+  bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
+  bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
+  bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
+  bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
+  bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
+  bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
+  bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = bf0[0] + bf0[15];
-  bf1[1] = bf0[1] + bf0[14];
-  bf1[2] = bf0[2] + bf0[13];
-  bf1[3] = bf0[3] + bf0[12];
-  bf1[4] = bf0[4] + bf0[11];
-  bf1[5] = bf0[5] + bf0[10];
-  bf1[6] = bf0[6] + bf0[9];
-  bf1[7] = bf0[7] + bf0[8];
-  bf1[8] = bf0[7] - bf0[8];
-  bf1[9] = bf0[6] - bf0[9];
-  bf1[10] = bf0[5] - bf0[10];
-  bf1[11] = bf0[4] - bf0[11];
-  bf1[12] = bf0[3] - bf0[12];
-  bf1[13] = bf0[2] - bf0[13];
-  bf1[14] = bf0[1] - bf0[14];
-  bf1[15] = bf0[0] - bf0[15];
+  bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+  bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
   bf1[16] = bf0[16];
   bf1[17] = bf0[17];
   bf1[18] = bf0[18];
   bf1[19] = bf0[19];
-  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
-  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
-  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
   bf1[28] = bf0[28];
   bf1[29] = bf0[29];
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[31];
-  bf1[1] = bf0[1] + bf0[30];
-  bf1[2] = bf0[2] + bf0[29];
-  bf1[3] = bf0[3] + bf0[28];
-  bf1[4] = bf0[4] + bf0[27];
-  bf1[5] = bf0[5] + bf0[26];
-  bf1[6] = bf0[6] + bf0[25];
-  bf1[7] = bf0[7] + bf0[24];
-  bf1[8] = bf0[8] + bf0[23];
-  bf1[9] = bf0[9] + bf0[22];
-  bf1[10] = bf0[10] + bf0[21];
-  bf1[11] = bf0[11] + bf0[20];
-  bf1[12] = bf0[12] + bf0[19];
-  bf1[13] = bf0[13] + bf0[18];
-  bf1[14] = bf0[14] + bf0[17];
-  bf1[15] = bf0[15] + bf0[16];
-  bf1[16] = bf0[15] - bf0[16];
-  bf1[17] = bf0[14] - bf0[17];
-  bf1[18] = bf0[13] - bf0[18];
-  bf1[19] = bf0[12] - bf0[19];
-  bf1[20] = bf0[11] - bf0[20];
-  bf1[21] = bf0[10] - bf0[21];
-  bf1[22] = bf0[9] - bf0[22];
-  bf1[23] = bf0[8] - bf0[23];
-  bf1[24] = bf0[7] - bf0[24];
-  bf1[25] = bf0[6] - bf0[25];
-  bf1[26] = bf0[5] - bf0[26];
-  bf1[27] = bf0[4] - bf0[27];
-  bf1[28] = bf0[3] - bf0[28];
-  bf1[29] = bf0[2] - bf0[29];
-  bf1[30] = bf0[1] - bf0[30];
-  bf1[31] = bf0[0] - bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
+  bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
+  bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
+  bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
+  bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
+  bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
+  bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
+  bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
+  bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
+  bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
+  bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
+  bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
+  bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
+  bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
+  bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
+  bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
+  bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
+  bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
 }
 
-void av1_iadst4_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range) {
-  const int32_t size = 4;
-  const int32_t *cospi;
+void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
+  int bit = cos_bit;
+  const int32_t *sinpi = sinpi_arr(bit);
+  int32_t s0, s1, s2, s3, s4, s5, s6, s7;
 
-  int32_t stage = 0;
-  int32_t *bf0, *bf1;
-  int32_t step[4];
+  int32_t x0 = input[0];
+  int32_t x1 = input[1];
+  int32_t x2 = input[2];
+  int32_t x3 = input[3];
 
-  // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
 
-  // stage 1;
-  stage++;
-  assert(output != input);
-  bf1 = output;
-  bf1[0] = input[0];
-  bf1[1] = -input[3];
-  bf1[2] = -input[1];
-  bf1[3] = input[2];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  assert(sinpi[1] + sinpi[2] == sinpi[4]);
+
+  // stage 1
+  s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit);
+  s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit);
+  s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit);
+  s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit);
+  s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit);
+  s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit);
+  s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit);
 
   // stage 2
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  // NOTICE: (x0 - x2) here may use one extra bit compared to the
+  // opt_range_row/col specified in av1_gen_inv_stage_range()
+  s7 = range_check_value((x0 - x2) + x3, stage_range[2]);
 
   // stage 3
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0] + bf0[2];
-  bf1[1] = bf0[1] + bf0[3];
-  bf1[2] = bf0[0] - bf0[2];
-  bf1[3] = bf0[1] - bf0[3];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  s0 = range_check_value(s0 + s3, stage_range[3] + bit);
+  s1 = range_check_value(s1 - s4, stage_range[3] + bit);
+  s3 = range_check_value(s2, stage_range[3] + bit);
+  s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit);
 
   // stage 4
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(cospi[56], bf0[0], -cospi[8], bf0[1], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[24], bf0[2], -cospi[40], bf0[3], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  s0 = range_check_value(s0 + s5, stage_range[4] + bit);
+  s1 = range_check_value(s1 - s6, stage_range[4] + bit);
 
   // stage 5
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[1];
-  bf1[1] = bf0[2];
-  bf1[2] = bf0[3];
-  bf1[3] = bf0[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  x0 = range_check_value(s0 + s3, stage_range[5] + bit);
+  x1 = range_check_value(s1 + s3, stage_range[5] + bit);
+  x2 = range_check_value(s2, stage_range[5] + bit);
+  x3 = range_check_value(s0 + s1, stage_range[5] + bit);
+
+  // stage 6
+  x3 = range_check_value(x3 - s3, stage_range[6] + bit);
+
+  output[0] = round_shift(x0, bit);
+  output[1] = round_shift(x1, bit);
+  output[2] = round_shift(x2, bit);
+  output[3] = round_shift(x3, bit);
+  range_check_buf(6, input, output, 4, stage_range[6]);
 }
 
-void av1_iadst8_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
+  assert(output != input);
   const int32_t size = 8;
-  const int32_t *cospi;
+  const int32_t *cospi = cospi_arr(cos_bit);
 
   int32_t stage = 0;
   int32_t *bf0, *bf1;
   int32_t step[8];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
-  assert(output != input);
   bf1 = output;
-  bf1[0] = input[0];
-  bf1[1] = -input[7];
-  bf1[2] = -input[3];
-  bf1[3] = input[4];
-  bf1[4] = -input[1];
-  bf1[5] = input[6];
-  bf1[6] = input[2];
-  bf1[7] = -input[5];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = input[7];
+  bf1[1] = input[0];
+  bf1[2] = input[5];
+  bf1[3] = input[2];
+  bf1[4] = input[3];
+  bf1[5] = input[4];
+  bf1[6] = input[1];
+  bf1[7] = input[6];
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
-  bf1[4] = bf0[4];
-  bf1[5] = bf0[5];
-  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
+  bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
+  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
+  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
+  bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
+  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[2];
-  bf1[1] = bf0[1] + bf0[3];
-  bf1[2] = bf0[0] - bf0[2];
-  bf1[3] = bf0[1] - bf0[3];
-  bf1[4] = bf0[4] + bf0[6];
-  bf1[5] = bf0[5] + bf0[7];
-  bf1[6] = bf0[4] - bf0[6];
-  bf1[7] = bf0[5] - bf0[7];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
-  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[4];
-  bf1[1] = bf0[1] + bf0[5];
-  bf1[2] = bf0[2] + bf0[6];
-  bf1[3] = bf0[3] + bf0[7];
-  bf1[4] = bf0[0] - bf0[4];
-  bf1[5] = bf0[1] - bf0[5];
-  bf1[6] = bf0[2] - bf0[6];
-  bf1[7] = bf0[3] - bf0[7];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit[stage]);
-  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[1];
-  bf1[1] = bf0[6];
-  bf1[2] = bf0[3];
-  bf1[3] = bf0[4];
-  bf1[4] = bf0[5];
-  bf1[5] = bf0[2];
-  bf1[6] = bf0[7];
-  bf1[7] = bf0[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = bf0[0];
+  bf1[1] = -bf0[4];
+  bf1[2] = bf0[6];
+  bf1[3] = -bf0[2];
+  bf1[4] = bf0[3];
+  bf1[5] = -bf0[7];
+  bf1[6] = bf0[5];
+  bf1[7] = -bf0[1];
 }
 
-void av1_iadst16_new(const int32_t *input, int32_t *output,
-                     const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                     const int8_t *stage_range) {
+  assert(output != input);
   const int32_t size = 16;
-  const int32_t *cospi;
+  const int32_t *cospi = cospi_arr(cos_bit);
 
   int32_t stage = 0;
   int32_t *bf0, *bf1;
   int32_t step[16];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
-  assert(output != input);
   bf1 = output;
-  bf1[0] = input[0];
-  bf1[1] = -input[15];
-  bf1[2] = -input[7];
-  bf1[3] = input[8];
-  bf1[4] = -input[3];
-  bf1[5] = input[12];
-  bf1[6] = input[4];
-  bf1[7] = -input[11];
-  bf1[8] = -input[1];
-  bf1[9] = input[14];
-  bf1[10] = input[6];
-  bf1[11] = -input[9];
-  bf1[12] = input[2];
-  bf1[13] = -input[13];
-  bf1[14] = -input[5];
-  bf1[15] = input[10];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = input[15];
+  bf1[1] = input[0];
+  bf1[2] = input[13];
+  bf1[3] = input[2];
+  bf1[4] = input[11];
+  bf1[5] = input[4];
+  bf1[6] = input[9];
+  bf1[7] = input[6];
+  bf1[8] = input[7];
+  bf1[9] = input[8];
+  bf1[10] = input[5];
+  bf1[11] = input[10];
+  bf1[12] = input[3];
+  bf1[13] = input[12];
+  bf1[14] = input[1];
+  bf1[15] = input[14];
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
-  bf1[4] = bf0[4];
-  bf1[5] = bf0[5];
-  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
-  bf1[8] = bf0[8];
-  bf1[9] = bf0[9];
-  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
-  bf1[12] = bf0[12];
-  bf1[13] = bf0[13];
-  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
+  bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
+  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
+  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
+  bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
+  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
+  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
+  bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
+  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
+  bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
+  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
+  bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
+  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
+  bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[2];
-  bf1[1] = bf0[1] + bf0[3];
-  bf1[2] = bf0[0] - bf0[2];
-  bf1[3] = bf0[1] - bf0[3];
-  bf1[4] = bf0[4] + bf0[6];
-  bf1[5] = bf0[5] + bf0[7];
-  bf1[6] = bf0[4] - bf0[6];
-  bf1[7] = bf0[5] - bf0[7];
-  bf1[8] = bf0[8] + bf0[10];
-  bf1[9] = bf0[9] + bf0[11];
-  bf1[10] = bf0[8] - bf0[10];
-  bf1[11] = bf0[9] - bf0[11];
-  bf1[12] = bf0[12] + bf0[14];
-  bf1[13] = bf0[13] + bf0[15];
-  bf1[14] = bf0[12] - bf0[14];
-  bf1[15] = bf0[13] - bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]);
+  bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = bf0[2];
-  bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
-  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
-  bf1[8] = bf0[8];
-  bf1[9] = bf0[9];
-  bf1[10] = bf0[10];
-  bf1[11] = bf0[11];
-  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 5
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0] + bf0[4];
-  bf1[1] = bf0[1] + bf0[5];
-  bf1[2] = bf0[2] + bf0[6];
-  bf1[3] = bf0[3] + bf0[7];
-  bf1[4] = bf0[0] - bf0[4];
-  bf1[5] = bf0[1] - bf0[5];
-  bf1[6] = bf0[2] - bf0[6];
-  bf1[7] = bf0[3] - bf0[7];
-  bf1[8] = bf0[8] + bf0[12];
-  bf1[9] = bf0[9] + bf0[13];
-  bf1[10] = bf0[10] + bf0[14];
-  bf1[11] = bf0[11] + bf0[15];
-  bf1[12] = bf0[8] - bf0[12];
-  bf1[13] = bf0[9] - bf0[13];
-  bf1[14] = bf0[10] - bf0[14];
-  bf1[15] = bf0[11] - bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 6
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
@@ -1049,579 +961,173 @@ void av1_iadst16_new(const int32_t *input, int32_t *output,
   bf1[5] = bf0[5];
   bf1[6] = bf0[6];
   bf1[7] = bf0[7];
-  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
-  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
-  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 7
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0] + bf0[8];
-  bf1[1] = bf0[1] + bf0[9];
-  bf1[2] = bf0[2] + bf0[10];
-  bf1[3] = bf0[3] + bf0[11];
-  bf1[4] = bf0[4] + bf0[12];
-  bf1[5] = bf0[5] + bf0[13];
-  bf1[6] = bf0[6] + bf0[14];
-  bf1[7] = bf0[7] + bf0[15];
-  bf1[8] = bf0[0] - bf0[8];
-  bf1[9] = bf0[1] - bf0[9];
-  bf1[10] = bf0[2] - bf0[10];
-  bf1[11] = bf0[3] - bf0[11];
-  bf1[12] = bf0[4] - bf0[12];
-  bf1[13] = bf0[5] - bf0[13];
-  bf1[14] = bf0[6] - bf0[14];
-  bf1[15] = bf0[7] - bf0[15];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 8
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit[stage]);
-  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit[stage]);
-  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
-  bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 9
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[1];
-  bf1[1] = bf0[14];
-  bf1[2] = bf0[3];
-  bf1[3] = bf0[12];
-  bf1[4] = bf0[5];
-  bf1[5] = bf0[10];
-  bf1[6] = bf0[7];
-  bf1[7] = bf0[8];
-  bf1[8] = bf0[9];
-  bf1[9] = bf0[6];
-  bf1[10] = bf0[11];
-  bf1[11] = bf0[4];
-  bf1[12] = bf0[13];
-  bf1[13] = bf0[2];
-  bf1[14] = bf0[15];
-  bf1[15] = bf0[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-}
-
-void av1_iadst32_new(const int32_t *input, int32_t *output,
-                     const int8_t *cos_bit, const int8_t *stage_range) {
-  const int32_t size = 32;
-  const int32_t *cospi;
-
-  int32_t stage = 0;
-  int32_t *bf0, *bf1;
-  int32_t step[32];
-
-  // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
-
-  // stage 1;
-  stage++;
-  assert(output != input);
-  bf1 = output;
-  bf1[0] = input[0];
-  bf1[1] = -input[31];
-  bf1[2] = -input[15];
-  bf1[3] = input[16];
-  bf1[4] = -input[7];
-  bf1[5] = input[24];
-  bf1[6] = input[8];
-  bf1[7] = -input[23];
-  bf1[8] = -input[3];
-  bf1[9] = input[28];
-  bf1[10] = input[12];
-  bf1[11] = -input[19];
-  bf1[12] = input[4];
-  bf1[13] = -input[27];
-  bf1[14] = -input[11];
-  bf1[15] = input[20];
-  bf1[16] = -input[1];
-  bf1[17] = input[30];
-  bf1[18] = input[14];
-  bf1[19] = -input[17];
-  bf1[20] = input[6];
-  bf1[21] = -input[25];
-  bf1[22] = -input[9];
-  bf1[23] = input[22];
-  bf1[24] = input[2];
-  bf1[25] = -input[29];
-  bf1[26] = -input[13];
-  bf1[27] = input[18];
-  bf1[28] = -input[5];
-  bf1[29] = input[26];
-  bf1[30] = input[10];
-  bf1[31] = -input[21];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
+  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
+  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
+  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
+  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
+  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
+  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
+  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
-  // stage 2
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
-  bf1[4] = bf0[4];
-  bf1[5] = bf0[5];
-  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
-  bf1[8] = bf0[8];
-  bf1[9] = bf0[9];
-  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
-  bf1[12] = bf0[12];
-  bf1[13] = bf0[13];
-  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
-  bf1[16] = bf0[16];
-  bf1[17] = bf0[17];
-  bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
-  bf1[19] = half_btf(cospi[32], bf0[18], -cospi[32], bf0[19], cos_bit[stage]);
-  bf1[20] = bf0[20];
-  bf1[21] = bf0[21];
-  bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
-  bf1[23] = half_btf(cospi[32], bf0[22], -cospi[32], bf0[23], cos_bit[stage]);
-  bf1[24] = bf0[24];
-  bf1[25] = bf0[25];
-  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[32], bf0[26], -cospi[32], bf0[27], cos_bit[stage]);
-  bf1[28] = bf0[28];
-  bf1[29] = bf0[29];
-  bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[32], bf0[30], -cospi[32], bf0[31], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 3
+  // stage 5
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[2];
-  bf1[1] = bf0[1] + bf0[3];
-  bf1[2] = bf0[0] - bf0[2];
-  bf1[3] = bf0[1] - bf0[3];
-  bf1[4] = bf0[4] + bf0[6];
-  bf1[5] = bf0[5] + bf0[7];
-  bf1[6] = bf0[4] - bf0[6];
-  bf1[7] = bf0[5] - bf0[7];
-  bf1[8] = bf0[8] + bf0[10];
-  bf1[9] = bf0[9] + bf0[11];
-  bf1[10] = bf0[8] - bf0[10];
-  bf1[11] = bf0[9] - bf0[11];
-  bf1[12] = bf0[12] + bf0[14];
-  bf1[13] = bf0[13] + bf0[15];
-  bf1[14] = bf0[12] - bf0[14];
-  bf1[15] = bf0[13] - bf0[15];
-  bf1[16] = bf0[16] + bf0[18];
-  bf1[17] = bf0[17] + bf0[19];
-  bf1[18] = bf0[16] - bf0[18];
-  bf1[19] = bf0[17] - bf0[19];
-  bf1[20] = bf0[20] + bf0[22];
-  bf1[21] = bf0[21] + bf0[23];
-  bf1[22] = bf0[20] - bf0[22];
-  bf1[23] = bf0[21] - bf0[23];
-  bf1[24] = bf0[24] + bf0[26];
-  bf1[25] = bf0[25] + bf0[27];
-  bf1[26] = bf0[24] - bf0[26];
-  bf1[27] = bf0[25] - bf0[27];
-  bf1[28] = bf0[28] + bf0[30];
-  bf1[29] = bf0[29] + bf0[31];
-  bf1[30] = bf0[28] - bf0[30];
-  bf1[31] = bf0[29] - bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
+  bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
-  // stage 4
+  // stage 6
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
-  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
   bf1[8] = bf0[8];
   bf1[9] = bf0[9];
   bf1[10] = bf0[10];
   bf1[11] = bf0[11];
-  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
-  bf1[16] = bf0[16];
-  bf1[17] = bf0[17];
-  bf1[18] = bf0[18];
-  bf1[19] = bf0[19];
-  bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
-  bf1[21] = half_btf(cospi[48], bf0[20], -cospi[16], bf0[21], cos_bit[stage]);
-  bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
-  bf1[23] = half_btf(cospi[16], bf0[22], cospi[48], bf0[23], cos_bit[stage]);
-  bf1[24] = bf0[24];
-  bf1[25] = bf0[25];
-  bf1[26] = bf0[26];
-  bf1[27] = bf0[27];
-  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[48], bf0[28], -cospi[16], bf0[29], cos_bit[stage]);
-  bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[16], bf0[30], cospi[48], bf0[31], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 5
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0] + bf0[4];
-  bf1[1] = bf0[1] + bf0[5];
-  bf1[2] = bf0[2] + bf0[6];
-  bf1[3] = bf0[3] + bf0[7];
-  bf1[4] = bf0[0] - bf0[4];
-  bf1[5] = bf0[1] - bf0[5];
-  bf1[6] = bf0[2] - bf0[6];
-  bf1[7] = bf0[3] - bf0[7];
-  bf1[8] = bf0[8] + bf0[12];
-  bf1[9] = bf0[9] + bf0[13];
-  bf1[10] = bf0[10] + bf0[14];
-  bf1[11] = bf0[11] + bf0[15];
-  bf1[12] = bf0[8] - bf0[12];
-  bf1[13] = bf0[9] - bf0[13];
-  bf1[14] = bf0[10] - bf0[14];
-  bf1[15] = bf0[11] - bf0[15];
-  bf1[16] = bf0[16] + bf0[20];
-  bf1[17] = bf0[17] + bf0[21];
-  bf1[18] = bf0[18] + bf0[22];
-  bf1[19] = bf0[19] + bf0[23];
-  bf1[20] = bf0[16] - bf0[20];
-  bf1[21] = bf0[17] - bf0[21];
-  bf1[22] = bf0[18] - bf0[22];
-  bf1[23] = bf0[19] - bf0[23];
-  bf1[24] = bf0[24] + bf0[28];
-  bf1[25] = bf0[25] + bf0[29];
-  bf1[26] = bf0[26] + bf0[30];
-  bf1[27] = bf0[27] + bf0[31];
-  bf1[28] = bf0[24] - bf0[28];
-  bf1[29] = bf0[25] - bf0[29];
-  bf1[30] = bf0[26] - bf0[30];
-  bf1[31] = bf0[27] - bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 6
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = bf0[2];
-  bf1[3] = bf0[3];
-  bf1[4] = bf0[4];
-  bf1[5] = bf0[5];
-  bf1[6] = bf0[6];
-  bf1[7] = bf0[7];
-  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
-  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
-  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
-  bf1[16] = bf0[16];
-  bf1[17] = bf0[17];
-  bf1[18] = bf0[18];
-  bf1[19] = bf0[19];
-  bf1[20] = bf0[20];
-  bf1[21] = bf0[21];
-  bf1[22] = bf0[22];
-  bf1[23] = bf0[23];
-  bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[56], bf0[24], -cospi[8], bf0[25], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[24], bf0[26], -cospi[40], bf0[27], cos_bit[stage]);
-  bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[8], bf0[28], cospi[56], bf0[29], cos_bit[stage]);
-  bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[40], bf0[30], cospi[24], bf0[31], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
+  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
+  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
+  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[8];
-  bf1[1] = bf0[1] + bf0[9];
-  bf1[2] = bf0[2] + bf0[10];
-  bf1[3] = bf0[3] + bf0[11];
-  bf1[4] = bf0[4] + bf0[12];
-  bf1[5] = bf0[5] + bf0[13];
-  bf1[6] = bf0[6] + bf0[14];
-  bf1[7] = bf0[7] + bf0[15];
-  bf1[8] = bf0[0] - bf0[8];
-  bf1[9] = bf0[1] - bf0[9];
-  bf1[10] = bf0[2] - bf0[10];
-  bf1[11] = bf0[3] - bf0[11];
-  bf1[12] = bf0[4] - bf0[12];
-  bf1[13] = bf0[5] - bf0[13];
-  bf1[14] = bf0[6] - bf0[14];
-  bf1[15] = bf0[7] - bf0[15];
-  bf1[16] = bf0[16] + bf0[24];
-  bf1[17] = bf0[17] + bf0[25];
-  bf1[18] = bf0[18] + bf0[26];
-  bf1[19] = bf0[19] + bf0[27];
-  bf1[20] = bf0[20] + bf0[28];
-  bf1[21] = bf0[21] + bf0[29];
-  bf1[22] = bf0[22] + bf0[30];
-  bf1[23] = bf0[23] + bf0[31];
-  bf1[24] = bf0[16] - bf0[24];
-  bf1[25] = bf0[17] - bf0[25];
-  bf1[26] = bf0[18] - bf0[26];
-  bf1[27] = bf0[19] - bf0[27];
-  bf1[28] = bf0[20] - bf0[28];
-  bf1[29] = bf0[21] - bf0[29];
-  bf1[30] = bf0[22] - bf0[30];
-  bf1[31] = bf0[23] - bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
+  bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
-  bf1[2] = bf0[2];
-  bf1[3] = bf0[3];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
   bf1[4] = bf0[4];
   bf1[5] = bf0[5];
-  bf1[6] = bf0[6];
-  bf1[7] = bf0[7];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
   bf1[8] = bf0[8];
   bf1[9] = bf0[9];
-  bf1[10] = bf0[10];
-  bf1[11] = bf0[11];
+  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
+  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
   bf1[12] = bf0[12];
   bf1[13] = bf0[13];
-  bf1[14] = bf0[14];
-  bf1[15] = bf0[15];
-  bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
-  bf1[17] = half_btf(cospi[60], bf0[16], -cospi[4], bf0[17], cos_bit[stage]);
-  bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
-  bf1[19] = half_btf(cospi[44], bf0[18], -cospi[20], bf0[19], cos_bit[stage]);
-  bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
-  bf1[21] = half_btf(cospi[28], bf0[20], -cospi[36], bf0[21], cos_bit[stage]);
-  bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
-  bf1[23] = half_btf(cospi[12], bf0[22], -cospi[52], bf0[23], cos_bit[stage]);
-  bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[4], bf0[24], cospi[60], bf0[25], cos_bit[stage]);
-  bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[20], bf0[26], cospi[44], bf0[27], cos_bit[stage]);
-  bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[36], bf0[28], cospi[28], bf0[29], cos_bit[stage]);
-  bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[52], bf0[30], cospi[12], bf0[31], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
+  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[16];
-  bf1[1] = bf0[1] + bf0[17];
-  bf1[2] = bf0[2] + bf0[18];
-  bf1[3] = bf0[3] + bf0[19];
-  bf1[4] = bf0[4] + bf0[20];
-  bf1[5] = bf0[5] + bf0[21];
-  bf1[6] = bf0[6] + bf0[22];
-  bf1[7] = bf0[7] + bf0[23];
-  bf1[8] = bf0[8] + bf0[24];
-  bf1[9] = bf0[9] + bf0[25];
-  bf1[10] = bf0[10] + bf0[26];
-  bf1[11] = bf0[11] + bf0[27];
-  bf1[12] = bf0[12] + bf0[28];
-  bf1[13] = bf0[13] + bf0[29];
-  bf1[14] = bf0[14] + bf0[30];
-  bf1[15] = bf0[15] + bf0[31];
-  bf1[16] = bf0[0] - bf0[16];
-  bf1[17] = bf0[1] - bf0[17];
-  bf1[18] = bf0[2] - bf0[18];
-  bf1[19] = bf0[3] - bf0[19];
-  bf1[20] = bf0[4] - bf0[20];
-  bf1[21] = bf0[5] - bf0[21];
-  bf1[22] = bf0[6] - bf0[22];
-  bf1[23] = bf0[7] - bf0[23];
-  bf1[24] = bf0[8] - bf0[24];
-  bf1[25] = bf0[9] - bf0[25];
-  bf1[26] = bf0[10] - bf0[26];
-  bf1[27] = bf0[11] - bf0[27];
-  bf1[28] = bf0[12] - bf0[28];
-  bf1[29] = bf0[13] - bf0[29];
-  bf1[30] = bf0[14] - bf0[30];
-  bf1[31] = bf0[15] - bf0[31];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 10
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(cospi[63], bf0[0], -cospi[1], bf0[1], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[59], bf0[2], -cospi[5], bf0[3], cos_bit[stage]);
-  bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[55], bf0[4], -cospi[9], bf0[5], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[51], bf0[6], -cospi[13], bf0[7], cos_bit[stage]);
-  bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
-  bf1[9] = half_btf(cospi[47], bf0[8], -cospi[17], bf0[9], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[43], bf0[10], -cospi[21], bf0[11], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[39], bf0[12], -cospi[25], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[35], bf0[14], -cospi[29], bf0[15], cos_bit[stage]);
-  bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
-  bf1[17] = half_btf(cospi[31], bf0[16], -cospi[33], bf0[17], cos_bit[stage]);
-  bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
-  bf1[19] = half_btf(cospi[27], bf0[18], -cospi[37], bf0[19], cos_bit[stage]);
-  bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
-  bf1[21] = half_btf(cospi[23], bf0[20], -cospi[41], bf0[21], cos_bit[stage]);
-  bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
-  bf1[23] = half_btf(cospi[19], bf0[22], -cospi[45], bf0[23], cos_bit[stage]);
-  bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[15], bf0[24], -cospi[49], bf0[25], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[11], bf0[26], -cospi[53], bf0[27], cos_bit[stage]);
-  bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[7], bf0[28], -cospi[57], bf0[29], cos_bit[stage]);
-  bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[3], bf0[30], -cospi[61], bf0[31], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 11
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[1];
-  bf1[1] = bf0[30];
-  bf1[2] = bf0[3];
-  bf1[3] = bf0[28];
-  bf1[4] = bf0[5];
-  bf1[5] = bf0[26];
-  bf1[6] = bf0[7];
-  bf1[7] = bf0[24];
-  bf1[8] = bf0[9];
-  bf1[9] = bf0[22];
-  bf1[10] = bf0[11];
-  bf1[11] = bf0[20];
-  bf1[12] = bf0[13];
-  bf1[13] = bf0[18];
-  bf1[14] = bf0[15];
-  bf1[15] = bf0[16];
-  bf1[16] = bf0[17];
-  bf1[17] = bf0[14];
-  bf1[18] = bf0[19];
-  bf1[19] = bf0[12];
-  bf1[20] = bf0[21];
-  bf1[21] = bf0[10];
-  bf1[22] = bf0[23];
-  bf1[23] = bf0[8];
-  bf1[24] = bf0[25];
-  bf1[25] = bf0[6];
-  bf1[26] = bf0[27];
-  bf1[27] = bf0[4];
-  bf1[28] = bf0[29];
-  bf1[29] = bf0[2];
-  bf1[30] = bf0[31];
-  bf1[31] = bf0[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = bf0[0];
+  bf1[1] = -bf0[8];
+  bf1[2] = bf0[12];
+  bf1[3] = -bf0[4];
+  bf1[4] = bf0[6];
+  bf1[5] = -bf0[14];
+  bf1[6] = bf0[10];
+  bf1[7] = -bf0[2];
+  bf1[8] = bf0[3];
+  bf1[9] = -bf0[11];
+  bf1[10] = bf0[15];
+  bf1[11] = -bf0[7];
+  bf1[12] = bf0[5];
+  bf1[13] = -bf0[13];
+  bf1[14] = bf0[9];
+  bf1[15] = -bf0[1];
 }
 
-#if CONFIG_EXT_TX
-void av1_iidentity4_c(const int32_t *input, int32_t *output,
-                      const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
   (void)cos_bit;
-  for (int i = 0; i < 4; ++i)
-    output[i] = (int32_t)dct_const_round_shift(input[i] * Sqrt2);
-  range_check(0, input, output, 4, stage_range[0]);
+  (void)stage_range;
+  for (int i = 0; i < 4; ++i) {
+    output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits);
+  }
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
 }
 
-void av1_iidentity8_c(const int32_t *input, int32_t *output,
-                      const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
   (void)cos_bit;
-  for (int i = 0; i < 8; ++i) output[i] = input[i] * 2;
-  range_check(0, input, output, 8, stage_range[0]);
+  (void)stage_range;
+  for (int i = 0; i < 8; ++i) output[i] = (int32_t)((int64_t)input[i] * 2);
 }
 
-void av1_iidentity16_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
   (void)cos_bit;
+  (void)stage_range;
   for (int i = 0; i < 16; ++i)
-    output[i] = (int32_t)dct_const_round_shift(input[i] * 2 * Sqrt2);
-  range_check(0, input, output, 16, stage_range[0]);
-}
-
-void av1_iidentity32_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range) {
-  (void)cos_bit;
-  for (int i = 0; i < 32; ++i) output[i] = input[i] * 4;
-  range_check(0, input, output, 32, stage_range[0]);
+    output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits);
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
 }
 
-#if CONFIG_TX64X64
-void av1_iidentity64_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
   (void)cos_bit;
-  for (int i = 0; i < 64; ++i)
-    output[i] = (int32_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
-  range_check(0, input, output, 64, stage_range[0]);
+  (void)stage_range;
+  for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4);
 }
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
 
-#if CONFIG_TX64X64
-void av1_idct64_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
+  assert(output != input);
   const int32_t size = 64;
-  const int32_t *cospi;
+  const int32_t *cospi = cospi_arr(cos_bit);
 
   int32_t stage = 0;
   int32_t *bf0, *bf1;
   int32_t step[64];
 
   // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
 
   // stage 1;
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  assert(output != input);
   bf1 = output;
   bf1[0] = input[0];
   bf1[1] = input[32];
@@ -1687,11 +1193,10 @@ void av1_idct64_new(const int32_t *input, int32_t *output,
   bf1[61] = input[47];
   bf1[62] = input[31];
   bf1[63] = input[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
@@ -1726,43 +1231,42 @@ void av1_idct64_new(const int32_t *input, int32_t *output,
   bf1[29] = bf0[29];
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit[stage]);
-  bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit[stage]);
-  bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit[stage]);
-  bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit[stage]);
-  bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit[stage]);
-  bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit[stage]);
-  bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit[stage]);
-  bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit[stage]);
-  bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit[stage]);
-  bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit[stage]);
-  bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit[stage]);
-  bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit[stage]);
-  bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit[stage]);
-  bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit[stage]);
-  bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit[stage]);
-  bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit[stage]);
-  bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit[stage]);
-  bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit[stage]);
-  bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit[stage]);
-  bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit[stage]);
-  bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit[stage]);
-  bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit[stage]);
-  bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit[stage]);
-  bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit[stage]);
-  bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit[stage]);
-  bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit[stage]);
-  bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit[stage]);
-  bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit[stage]);
-  bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit[stage]);
-  bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit[stage]);
-  bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit[stage]);
-  bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit);
+  bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit);
+  bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit);
+  bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit);
+  bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit);
+  bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit);
+  bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit);
+  bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit);
+  bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit);
+  bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit);
+  bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit);
+  bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit);
+  bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit);
+  bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit);
+  bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit);
+  bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit);
+  bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit);
+  bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit);
+  bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit);
+  bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit);
+  bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit);
+  bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit);
+  bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit);
+  bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit);
+  bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit);
+  bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit);
+  bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit);
+  bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit);
+  bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit);
+  bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
+  bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
+  bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0];
@@ -1781,59 +1285,58 @@ void av1_idct64_new(const int32_t *input, int32_t *output,
   bf1[13] = bf0[13];
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
-  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
-  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
-  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
-  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
-  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
-  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
-  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
-  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
-  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
-  bf1[32] = bf0[32] + bf0[33];
-  bf1[33] = bf0[32] - bf0[33];
-  bf1[34] = -bf0[34] + bf0[35];
-  bf1[35] = bf0[34] + bf0[35];
-  bf1[36] = bf0[36] + bf0[37];
-  bf1[37] = bf0[36] - bf0[37];
-  bf1[38] = -bf0[38] + bf0[39];
-  bf1[39] = bf0[38] + bf0[39];
-  bf1[40] = bf0[40] + bf0[41];
-  bf1[41] = bf0[40] - bf0[41];
-  bf1[42] = -bf0[42] + bf0[43];
-  bf1[43] = bf0[42] + bf0[43];
-  bf1[44] = bf0[44] + bf0[45];
-  bf1[45] = bf0[44] - bf0[45];
-  bf1[46] = -bf0[46] + bf0[47];
-  bf1[47] = bf0[46] + bf0[47];
-  bf1[48] = bf0[48] + bf0[49];
-  bf1[49] = bf0[48] - bf0[49];
-  bf1[50] = -bf0[50] + bf0[51];
-  bf1[51] = bf0[50] + bf0[51];
-  bf1[52] = bf0[52] + bf0[53];
-  bf1[53] = bf0[52] - bf0[53];
-  bf1[54] = -bf0[54] + bf0[55];
-  bf1[55] = bf0[54] + bf0[55];
-  bf1[56] = bf0[56] + bf0[57];
-  bf1[57] = bf0[56] - bf0[57];
-  bf1[58] = -bf0[58] + bf0[59];
-  bf1[59] = bf0[58] + bf0[59];
-  bf1[60] = bf0[60] + bf0[61];
-  bf1[61] = bf0[60] - bf0[61];
-  bf1[62] = -bf0[62] + bf0[63];
-  bf1[63] = bf0[62] + bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
+  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
+  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
+  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
+  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
+  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
+  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
+  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
+  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
+  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
+  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
+  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
+  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
+  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
+  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
+  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
+  bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]);
+  bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]);
+  bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]);
+  bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]);
+  bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]);
+  bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]);
+  bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]);
+  bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]);
+  bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]);
+  bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]);
+  bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]);
+  bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]);
+  bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]);
+  bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]);
+  bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]);
+  bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]);
+  bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]);
+  bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]);
+  bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]);
+  bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]);
+  bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]);
+  bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]);
+  bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]);
+  bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]);
+  bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]);
+  bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]);
+  bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]);
+  bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]);
+  bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]);
+  bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
+  bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
+  bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
@@ -1844,326 +1347,322 @@ void av1_idct64_new(const int32_t *input, int32_t *output,
   bf1[5] = bf0[5];
   bf1[6] = bf0[6];
   bf1[7] = bf0[7];
-  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
-  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
-  bf1[16] = bf0[16] + bf0[17];
-  bf1[17] = bf0[16] - bf0[17];
-  bf1[18] = -bf0[18] + bf0[19];
-  bf1[19] = bf0[18] + bf0[19];
-  bf1[20] = bf0[20] + bf0[21];
-  bf1[21] = bf0[20] - bf0[21];
-  bf1[22] = -bf0[22] + bf0[23];
-  bf1[23] = bf0[22] + bf0[23];
-  bf1[24] = bf0[24] + bf0[25];
-  bf1[25] = bf0[24] - bf0[25];
-  bf1[26] = -bf0[26] + bf0[27];
-  bf1[27] = bf0[26] + bf0[27];
-  bf1[28] = bf0[28] + bf0[29];
-  bf1[29] = bf0[28] - bf0[29];
-  bf1[30] = -bf0[30] + bf0[31];
-  bf1[31] = bf0[30] + bf0[31];
+  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+  bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
+  bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
+  bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
+  bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
+  bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
+  bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
+  bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
+  bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
+  bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
+  bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
+  bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
+  bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
+  bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
+  bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
+  bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
+  bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
   bf1[32] = bf0[32];
-  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
-  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
+  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
   bf1[35] = bf0[35];
   bf1[36] = bf0[36];
-  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
-  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
+  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
   bf1[39] = bf0[39];
   bf1[40] = bf0[40];
-  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
-  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
+  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
   bf1[43] = bf0[43];
   bf1[44] = bf0[44];
-  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
-  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
+  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
   bf1[47] = bf0[47];
   bf1[48] = bf0[48];
-  bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit[stage]);
-  bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit[stage]);
+  bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit);
+  bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit);
   bf1[51] = bf0[51];
   bf1[52] = bf0[52];
-  bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit[stage]);
-  bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit[stage]);
+  bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit);
+  bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit);
   bf1[55] = bf0[55];
   bf1[56] = bf0[56];
-  bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit[stage]);
-  bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit[stage]);
+  bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit);
+  bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit);
   bf1[59] = bf0[59];
   bf1[60] = bf0[60];
-  bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit[stage]);
-  bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit[stage]);
+  bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
+  bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
   bf1[63] = bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
-  bf1[8] = bf0[8] + bf0[9];
-  bf1[9] = bf0[8] - bf0[9];
-  bf1[10] = -bf0[10] + bf0[11];
-  bf1[11] = bf0[10] + bf0[11];
-  bf1[12] = bf0[12] + bf0[13];
-  bf1[13] = bf0[12] - bf0[13];
-  bf1[14] = -bf0[14] + bf0[15];
-  bf1[15] = bf0[14] + bf0[15];
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+  bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+  bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+  bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
   bf1[16] = bf0[16];
-  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
-  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
   bf1[19] = bf0[19];
   bf1[20] = bf0[20];
-  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
   bf1[23] = bf0[23];
   bf1[24] = bf0[24];
-  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
+  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
+  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
   bf1[27] = bf0[27];
   bf1[28] = bf0[28];
-  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
-  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
+  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
+  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
   bf1[31] = bf0[31];
-  bf1[32] = bf0[32] + bf0[35];
-  bf1[33] = bf0[33] + bf0[34];
-  bf1[34] = bf0[33] - bf0[34];
-  bf1[35] = bf0[32] - bf0[35];
-  bf1[36] = -bf0[36] + bf0[39];
-  bf1[37] = -bf0[37] + bf0[38];
-  bf1[38] = bf0[37] + bf0[38];
-  bf1[39] = bf0[36] + bf0[39];
-  bf1[40] = bf0[40] + bf0[43];
-  bf1[41] = bf0[41] + bf0[42];
-  bf1[42] = bf0[41] - bf0[42];
-  bf1[43] = bf0[40] - bf0[43];
-  bf1[44] = -bf0[44] + bf0[47];
-  bf1[45] = -bf0[45] + bf0[46];
-  bf1[46] = bf0[45] + bf0[46];
-  bf1[47] = bf0[44] + bf0[47];
-  bf1[48] = bf0[48] + bf0[51];
-  bf1[49] = bf0[49] + bf0[50];
-  bf1[50] = bf0[49] - bf0[50];
-  bf1[51] = bf0[48] - bf0[51];
-  bf1[52] = -bf0[52] + bf0[55];
-  bf1[53] = -bf0[53] + bf0[54];
-  bf1[54] = bf0[53] + bf0[54];
-  bf1[55] = bf0[52] + bf0[55];
-  bf1[56] = bf0[56] + bf0[59];
-  bf1[57] = bf0[57] + bf0[58];
-  bf1[58] = bf0[57] - bf0[58];
-  bf1[59] = bf0[56] - bf0[59];
-  bf1[60] = -bf0[60] + bf0[63];
-  bf1[61] = -bf0[61] + bf0[62];
-  bf1[62] = bf0[61] + bf0[62];
-  bf1[63] = bf0[60] + bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]);
+  bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]);
+  bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]);
+  bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]);
+  bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]);
+  bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]);
+  bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]);
+  bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]);
+  bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]);
+  bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]);
+  bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]);
+  bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]);
+  bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]);
+  bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]);
+  bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]);
+  bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]);
+  bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]);
+  bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]);
+  bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]);
+  bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]);
+  bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]);
+  bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]);
+  bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]);
+  bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]);
+  bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]);
+  bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]);
+  bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]);
+  bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]);
+  bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]);
+  bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
+  bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
+  bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
-  bf1[4] = bf0[4] + bf0[5];
-  bf1[5] = bf0[4] - bf0[5];
-  bf1[6] = -bf0[6] + bf0[7];
-  bf1[7] = bf0[6] + bf0[7];
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
   bf1[8] = bf0[8];
-  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
   bf1[11] = bf0[11];
   bf1[12] = bf0[12];
-  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
   bf1[15] = bf0[15];
-  bf1[16] = bf0[16] + bf0[19];
-  bf1[17] = bf0[17] + bf0[18];
-  bf1[18] = bf0[17] - bf0[18];
-  bf1[19] = bf0[16] - bf0[19];
-  bf1[20] = -bf0[20] + bf0[23];
-  bf1[21] = -bf0[21] + bf0[22];
-  bf1[22] = bf0[21] + bf0[22];
-  bf1[23] = bf0[20] + bf0[23];
-  bf1[24] = bf0[24] + bf0[27];
-  bf1[25] = bf0[25] + bf0[26];
-  bf1[26] = bf0[25] - bf0[26];
-  bf1[27] = bf0[24] - bf0[27];
-  bf1[28] = -bf0[28] + bf0[31];
-  bf1[29] = -bf0[29] + bf0[30];
-  bf1[30] = bf0[29] + bf0[30];
-  bf1[31] = bf0[28] + bf0[31];
+  bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
+  bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
+  bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
+  bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
+  bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
+  bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
+  bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
+  bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
+  bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
+  bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
+  bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
+  bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
+  bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
+  bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
+  bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
+  bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
   bf1[32] = bf0[32];
   bf1[33] = bf0[33];
-  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
-  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
-  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
-  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
+  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
+  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
+  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
   bf1[38] = bf0[38];
   bf1[39] = bf0[39];
   bf1[40] = bf0[40];
   bf1[41] = bf0[41];
-  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
-  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
-  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
-  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
+  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
+  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
+  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
   bf1[46] = bf0[46];
   bf1[47] = bf0[47];
   bf1[48] = bf0[48];
   bf1[49] = bf0[49];
-  bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit[stage]);
-  bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit[stage]);
-  bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit[stage]);
-  bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit[stage]);
+  bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit);
+  bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit);
+  bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit);
+  bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit);
   bf1[54] = bf0[54];
   bf1[55] = bf0[55];
   bf1[56] = bf0[56];
   bf1[57] = bf0[57];
-  bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit[stage]);
-  bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit[stage]);
-  bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit[stage]);
-  bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit[stage]);
+  bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit);
+  bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit);
+  bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit);
+  bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[3];
-  bf1[1] = bf0[1] + bf0[2];
-  bf1[2] = bf0[1] - bf0[2];
-  bf1[3] = bf0[0] - bf0[3];
+  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
   bf1[4] = bf0[4];
-  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
   bf1[7] = bf0[7];
-  bf1[8] = bf0[8] + bf0[11];
-  bf1[9] = bf0[9] + bf0[10];
-  bf1[10] = bf0[9] - bf0[10];
-  bf1[11] = bf0[8] - bf0[11];
-  bf1[12] = -bf0[12] + bf0[15];
-  bf1[13] = -bf0[13] + bf0[14];
-  bf1[14] = bf0[13] + bf0[14];
-  bf1[15] = bf0[12] + bf0[15];
+  bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+  bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+  bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
   bf1[16] = bf0[16];
   bf1[17] = bf0[17];
-  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
-  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
-  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
   bf1[22] = bf0[22];
   bf1[23] = bf0[23];
   bf1[24] = bf0[24];
   bf1[25] = bf0[25];
-  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
-  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
-  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
+  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
+  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
+  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
+  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  bf1[32] = bf0[32] + bf0[39];
-  bf1[33] = bf0[33] + bf0[38];
-  bf1[34] = bf0[34] + bf0[37];
-  bf1[35] = bf0[35] + bf0[36];
-  bf1[36] = bf0[35] - bf0[36];
-  bf1[37] = bf0[34] - bf0[37];
-  bf1[38] = bf0[33] - bf0[38];
-  bf1[39] = bf0[32] - bf0[39];
-  bf1[40] = -bf0[40] + bf0[47];
-  bf1[41] = -bf0[41] + bf0[46];
-  bf1[42] = -bf0[42] + bf0[45];
-  bf1[43] = -bf0[43] + bf0[44];
-  bf1[44] = bf0[43] + bf0[44];
-  bf1[45] = bf0[42] + bf0[45];
-  bf1[46] = bf0[41] + bf0[46];
-  bf1[47] = bf0[40] + bf0[47];
-  bf1[48] = bf0[48] + bf0[55];
-  bf1[49] = bf0[49] + bf0[54];
-  bf1[50] = bf0[50] + bf0[53];
-  bf1[51] = bf0[51] + bf0[52];
-  bf1[52] = bf0[51] - bf0[52];
-  bf1[53] = bf0[50] - bf0[53];
-  bf1[54] = bf0[49] - bf0[54];
-  bf1[55] = bf0[48] - bf0[55];
-  bf1[56] = -bf0[56] + bf0[63];
-  bf1[57] = -bf0[57] + bf0[62];
-  bf1[58] = -bf0[58] + bf0[61];
-  bf1[59] = -bf0[59] + bf0[60];
-  bf1[60] = bf0[59] + bf0[60];
-  bf1[61] = bf0[58] + bf0[61];
-  bf1[62] = bf0[57] + bf0[62];
-  bf1[63] = bf0[56] + bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]);
+  bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]);
+  bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]);
+  bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]);
+  bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]);
+  bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]);
+  bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]);
+  bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]);
+  bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]);
+  bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]);
+  bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]);
+  bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]);
+  bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]);
+  bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]);
+  bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]);
+  bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]);
+  bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]);
+  bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]);
+  bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]);
+  bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]);
+  bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]);
+  bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]);
+  bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]);
+  bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]);
+  bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]);
+  bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]);
+  bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]);
+  bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]);
+  bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]);
+  bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
+  bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
+  bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 8
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = bf0[0] + bf0[7];
-  bf1[1] = bf0[1] + bf0[6];
-  bf1[2] = bf0[2] + bf0[5];
-  bf1[3] = bf0[3] + bf0[4];
-  bf1[4] = bf0[3] - bf0[4];
-  bf1[5] = bf0[2] - bf0[5];
-  bf1[6] = bf0[1] - bf0[6];
-  bf1[7] = bf0[0] - bf0[7];
+  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
   bf1[8] = bf0[8];
   bf1[9] = bf0[9];
-  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
-  bf1[16] = bf0[16] + bf0[23];
-  bf1[17] = bf0[17] + bf0[22];
-  bf1[18] = bf0[18] + bf0[21];
-  bf1[19] = bf0[19] + bf0[20];
-  bf1[20] = bf0[19] - bf0[20];
-  bf1[21] = bf0[18] - bf0[21];
-  bf1[22] = bf0[17] - bf0[22];
-  bf1[23] = bf0[16] - bf0[23];
-  bf1[24] = -bf0[24] + bf0[31];
-  bf1[25] = -bf0[25] + bf0[30];
-  bf1[26] = -bf0[26] + bf0[29];
-  bf1[27] = -bf0[27] + bf0[28];
-  bf1[28] = bf0[27] + bf0[28];
-  bf1[29] = bf0[26] + bf0[29];
-  bf1[30] = bf0[25] + bf0[30];
-  bf1[31] = bf0[24] + bf0[31];
+  bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
+  bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
+  bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
+  bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
+  bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
+  bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
+  bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
+  bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
+  bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
+  bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
+  bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
+  bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
+  bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
+  bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
+  bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
+  bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
   bf1[32] = bf0[32];
   bf1[33] = bf0[33];
   bf1[34] = bf0[34];
   bf1[35] = bf0[35];
-  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
-  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
-  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
-  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
-  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
-  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
-  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
-  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
+  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
+  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
+  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
+  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
+  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
+  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
+  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
   bf1[44] = bf0[44];
   bf1[45] = bf0[45];
   bf1[46] = bf0[46];
@@ -2172,128 +1671,126 @@ void av1_idct64_new(const int32_t *input, int32_t *output,
   bf1[49] = bf0[49];
   bf1[50] = bf0[50];
   bf1[51] = bf0[51];
-  bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit[stage]);
-  bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit[stage]);
-  bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit[stage]);
-  bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit[stage]);
-  bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit[stage]);
-  bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit[stage]);
-  bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit[stage]);
-  bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit[stage]);
+  bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit);
+  bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit);
+  bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit);
+  bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit);
+  bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit);
+  bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit);
+  bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit);
+  bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit);
   bf1[60] = bf0[60];
   bf1[61] = bf0[61];
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[15];
-  bf1[1] = bf0[1] + bf0[14];
-  bf1[2] = bf0[2] + bf0[13];
-  bf1[3] = bf0[3] + bf0[12];
-  bf1[4] = bf0[4] + bf0[11];
-  bf1[5] = bf0[5] + bf0[10];
-  bf1[6] = bf0[6] + bf0[9];
-  bf1[7] = bf0[7] + bf0[8];
-  bf1[8] = bf0[7] - bf0[8];
-  bf1[9] = bf0[6] - bf0[9];
-  bf1[10] = bf0[5] - bf0[10];
-  bf1[11] = bf0[4] - bf0[11];
-  bf1[12] = bf0[3] - bf0[12];
-  bf1[13] = bf0[2] - bf0[13];
-  bf1[14] = bf0[1] - bf0[14];
-  bf1[15] = bf0[0] - bf0[15];
+  bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+  bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
   bf1[16] = bf0[16];
   bf1[17] = bf0[17];
   bf1[18] = bf0[18];
   bf1[19] = bf0[19];
-  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
-  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
-  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
   bf1[28] = bf0[28];
   bf1[29] = bf0[29];
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  bf1[32] = bf0[32] + bf0[47];
-  bf1[33] = bf0[33] + bf0[46];
-  bf1[34] = bf0[34] + bf0[45];
-  bf1[35] = bf0[35] + bf0[44];
-  bf1[36] = bf0[36] + bf0[43];
-  bf1[37] = bf0[37] + bf0[42];
-  bf1[38] = bf0[38] + bf0[41];
-  bf1[39] = bf0[39] + bf0[40];
-  bf1[40] = bf0[39] - bf0[40];
-  bf1[41] = bf0[38] - bf0[41];
-  bf1[42] = bf0[37] - bf0[42];
-  bf1[43] = bf0[36] - bf0[43];
-  bf1[44] = bf0[35] - bf0[44];
-  bf1[45] = bf0[34] - bf0[45];
-  bf1[46] = bf0[33] - bf0[46];
-  bf1[47] = bf0[32] - bf0[47];
-  bf1[48] = -bf0[48] + bf0[63];
-  bf1[49] = -bf0[49] + bf0[62];
-  bf1[50] = -bf0[50] + bf0[61];
-  bf1[51] = -bf0[51] + bf0[60];
-  bf1[52] = -bf0[52] + bf0[59];
-  bf1[53] = -bf0[53] + bf0[58];
-  bf1[54] = -bf0[54] + bf0[57];
-  bf1[55] = -bf0[55] + bf0[56];
-  bf1[56] = bf0[55] + bf0[56];
-  bf1[57] = bf0[54] + bf0[57];
-  bf1[58] = bf0[53] + bf0[58];
-  bf1[59] = bf0[52] + bf0[59];
-  bf1[60] = bf0[51] + bf0[60];
-  bf1[61] = bf0[50] + bf0[61];
-  bf1[62] = bf0[49] + bf0[62];
-  bf1[63] = bf0[48] + bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]);
+  bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]);
+  bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]);
+  bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]);
+  bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]);
+  bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]);
+  bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]);
+  bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]);
+  bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]);
+  bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]);
+  bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]);
+  bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]);
+  bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]);
+  bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]);
+  bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]);
+  bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]);
+  bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]);
+  bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]);
+  bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]);
+  bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]);
+  bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]);
+  bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]);
+  bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]);
+  bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]);
+  bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]);
+  bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]);
+  bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]);
+  bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]);
+  bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]);
+  bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
+  bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
+  bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 10
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = output;
   bf1 = step;
-  bf1[0] = bf0[0] + bf0[31];
-  bf1[1] = bf0[1] + bf0[30];
-  bf1[2] = bf0[2] + bf0[29];
-  bf1[3] = bf0[3] + bf0[28];
-  bf1[4] = bf0[4] + bf0[27];
-  bf1[5] = bf0[5] + bf0[26];
-  bf1[6] = bf0[6] + bf0[25];
-  bf1[7] = bf0[7] + bf0[24];
-  bf1[8] = bf0[8] + bf0[23];
-  bf1[9] = bf0[9] + bf0[22];
-  bf1[10] = bf0[10] + bf0[21];
-  bf1[11] = bf0[11] + bf0[20];
-  bf1[12] = bf0[12] + bf0[19];
-  bf1[13] = bf0[13] + bf0[18];
-  bf1[14] = bf0[14] + bf0[17];
-  bf1[15] = bf0[15] + bf0[16];
-  bf1[16] = bf0[15] - bf0[16];
-  bf1[17] = bf0[14] - bf0[17];
-  bf1[18] = bf0[13] - bf0[18];
-  bf1[19] = bf0[12] - bf0[19];
-  bf1[20] = bf0[11] - bf0[20];
-  bf1[21] = bf0[10] - bf0[21];
-  bf1[22] = bf0[9] - bf0[22];
-  bf1[23] = bf0[8] - bf0[23];
-  bf1[24] = bf0[7] - bf0[24];
-  bf1[25] = bf0[6] - bf0[25];
-  bf1[26] = bf0[5] - bf0[26];
-  bf1[27] = bf0[4] - bf0[27];
-  bf1[28] = bf0[3] - bf0[28];
-  bf1[29] = bf0[2] - bf0[29];
-  bf1[30] = bf0[1] - bf0[30];
-  bf1[31] = bf0[0] - bf0[31];
+  bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
+  bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
+  bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
+  bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
+  bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
+  bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
+  bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
+  bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
+  bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
+  bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
+  bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
+  bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
+  bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
+  bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
+  bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
+  bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
+  bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
+  bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
   bf1[32] = bf0[32];
   bf1[33] = bf0[33];
   bf1[34] = bf0[34];
@@ -2302,22 +1799,22 @@ void av1_idct64_new(const int32_t *input, int32_t *output,
   bf1[37] = bf0[37];
   bf1[38] = bf0[38];
   bf1[39] = bf0[39];
-  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
-  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
-  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
-  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
-  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
-  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
-  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
-  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
-  bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
-  bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
-  bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
-  bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
-  bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
-  bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
-  bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
-  bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+  bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+  bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+  bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+  bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+  bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+  bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+  bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+  bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
   bf1[56] = bf0[56];
   bf1[57] = bf0[57];
   bf1[58] = bf0[58];
@@ -2326,77 +1823,74 @@ void av1_idct64_new(const int32_t *input, int32_t *output,
   bf1[61] = bf0[61];
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  range_check_buf(stage, input, bf1, size, stage_range[stage]);
 
   // stage 11
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[63];
-  bf1[1] = bf0[1] + bf0[62];
-  bf1[2] = bf0[2] + bf0[61];
-  bf1[3] = bf0[3] + bf0[60];
-  bf1[4] = bf0[4] + bf0[59];
-  bf1[5] = bf0[5] + bf0[58];
-  bf1[6] = bf0[6] + bf0[57];
-  bf1[7] = bf0[7] + bf0[56];
-  bf1[8] = bf0[8] + bf0[55];
-  bf1[9] = bf0[9] + bf0[54];
-  bf1[10] = bf0[10] + bf0[53];
-  bf1[11] = bf0[11] + bf0[52];
-  bf1[12] = bf0[12] + bf0[51];
-  bf1[13] = bf0[13] + bf0[50];
-  bf1[14] = bf0[14] + bf0[49];
-  bf1[15] = bf0[15] + bf0[48];
-  bf1[16] = bf0[16] + bf0[47];
-  bf1[17] = bf0[17] + bf0[46];
-  bf1[18] = bf0[18] + bf0[45];
-  bf1[19] = bf0[19] + bf0[44];
-  bf1[20] = bf0[20] + bf0[43];
-  bf1[21] = bf0[21] + bf0[42];
-  bf1[22] = bf0[22] + bf0[41];
-  bf1[23] = bf0[23] + bf0[40];
-  bf1[24] = bf0[24] + bf0[39];
-  bf1[25] = bf0[25] + bf0[38];
-  bf1[26] = bf0[26] + bf0[37];
-  bf1[27] = bf0[27] + bf0[36];
-  bf1[28] = bf0[28] + bf0[35];
-  bf1[29] = bf0[29] + bf0[34];
-  bf1[30] = bf0[30] + bf0[33];
-  bf1[31] = bf0[31] + bf0[32];
-  bf1[32] = bf0[31] - bf0[32];
-  bf1[33] = bf0[30] - bf0[33];
-  bf1[34] = bf0[29] - bf0[34];
-  bf1[35] = bf0[28] - bf0[35];
-  bf1[36] = bf0[27] - bf0[36];
-  bf1[37] = bf0[26] - bf0[37];
-  bf1[38] = bf0[25] - bf0[38];
-  bf1[39] = bf0[24] - bf0[39];
-  bf1[40] = bf0[23] - bf0[40];
-  bf1[41] = bf0[22] - bf0[41];
-  bf1[42] = bf0[21] - bf0[42];
-  bf1[43] = bf0[20] - bf0[43];
-  bf1[44] = bf0[19] - bf0[44];
-  bf1[45] = bf0[18] - bf0[45];
-  bf1[46] = bf0[17] - bf0[46];
-  bf1[47] = bf0[16] - bf0[47];
-  bf1[48] = bf0[15] - bf0[48];
-  bf1[49] = bf0[14] - bf0[49];
-  bf1[50] = bf0[13] - bf0[50];
-  bf1[51] = bf0[12] - bf0[51];
-  bf1[52] = bf0[11] - bf0[52];
-  bf1[53] = bf0[10] - bf0[53];
-  bf1[54] = bf0[9] - bf0[54];
-  bf1[55] = bf0[8] - bf0[55];
-  bf1[56] = bf0[7] - bf0[56];
-  bf1[57] = bf0[6] - bf0[57];
-  bf1[58] = bf0[5] - bf0[58];
-  bf1[59] = bf0[4] - bf0[59];
-  bf1[60] = bf0[3] - bf0[60];
-  bf1[61] = bf0[2] - bf0[61];
-  bf1[62] = bf0[1] - bf0[62];
-  bf1[63] = bf0[0] - bf0[63];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]);
+  bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]);
+  bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]);
+  bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]);
+  bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]);
+  bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]);
+  bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]);
+  bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]);
+  bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]);
+  bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]);
+  bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]);
+  bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]);
+  bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]);
+  bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]);
+  bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]);
+  bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]);
+  bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]);
+  bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]);
+  bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]);
+  bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]);
+  bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]);
+  bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]);
+  bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]);
+  bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]);
+  bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]);
+  bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]);
+  bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]);
+  bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]);
+  bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]);
+  bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]);
+  bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]);
+  bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]);
+  bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]);
+  bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]);
+  bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]);
+  bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]);
+  bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]);
+  bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]);
+  bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]);
+  bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]);
+  bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]);
+  bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]);
+  bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]);
+  bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]);
+  bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]);
+  bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]);
+  bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]);
+  bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]);
+  bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]);
+  bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]);
+  bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]);
+  bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]);
+  bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]);
+  bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]);
+  bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]);
+  bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]);
+  bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]);
+  bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]);
+  bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]);
+  bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]);
+  bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]);
 }
-#endif  // CONFIG_TX64X64
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h
index 8996f7c9d..64a1a921c 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d.h
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.h
@@ -18,41 +18,41 @@
 extern "C" {
 #endif
 
-void av1_idct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+static INLINE int32_t clamp_value(int32_t value, int8_t bit) {
+  if (bit <= 0) return value;  // Do nothing for invalid clamp bit.
+  const int64_t max_value = (1LL << (bit - 1)) - 1;
+  const int64_t min_value = -(1LL << (bit - 1));
+  return (int32_t)clamp64(value, min_value, max_value);
+}
+
+static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) {
+  for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit);
+}
+
+void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range);
-void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range);
-void av1_idct16_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range);
-void av1_idct32_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range);
-#if CONFIG_TX64X64
-void av1_idct64_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range);
-#endif  // CONFIG_TX64X64
-
-void av1_iadst4_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iadst8_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iadst16_new(const int32_t *input, int32_t *output,
-                     const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iadst32_new(const int32_t *input, int32_t *output,
-                     const int8_t *cos_bit, const int8_t *stage_range);
-#if CONFIG_EXT_TX
-void av1_iidentity4_c(const int32_t *input, int32_t *output,
-                      const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iidentity8_c(const int32_t *input, int32_t *output,
-                      const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iidentity16_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iidentity32_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range);
-#if CONFIG_TX64X64
-void av1_iidentity64_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range);
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
+void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range);
+void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range);
+void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range);
+void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range);
+void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range);
+void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                     const int8_t *stage_range);
+void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range);
+void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range);
+void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range);
+void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range);
 
 #ifdef __cplusplus
 }
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
index 8bcf84e05..4c600f756 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
+++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
@@ -14,358 +14,34 @@
 #include "av1/common/av1_inv_txfm1d.h"
 
 // sum of fwd_shift_##
-#if CONFIG_CHROMA_2X2
-#if CONFIG_TX64X64
-static const int8_t fwd_shift_sum[TX_SIZES] = { 3, 2, 1, 0, -2, -4 };
-#else   // CONFIG_TX64X64
-static const int8_t fwd_shift_sum[TX_SIZES] = { 3, 2, 1, 0, -2 };
-#endif  // CONFIG_TX64X64
-#else   // CONFIG_CHROMA_2X2
-#if CONFIG_TX64X64
-static const int8_t fwd_shift_sum[TX_SIZES] = { 2, 1, 0, -2, -4 };
-#else  // CONFIG_TX64X64
-static const int8_t fwd_shift_sum[TX_SIZES] = { 2, 1, 0, -2 };
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_CHROMA_2X2
+static const int8_t inv_start_range[TX_SIZES_ALL] = {
+  5,  // 4x4 transform
+  6,  // 8x8 transform
+  7,  // 16x16 transform
+  7,  // 32x32 transform
+  7,  // 64x64 transform
+  5,  // 4x8 transform
+  5,  // 8x4 transform
+  6,  // 8x16 transform
+  6,  // 16x8 transform
+  6,  // 16x32 transform
+  6,  // 32x16 transform
+  6,  // 32x64 transform
+  6,  // 64x32 transform
+  6,  // 4x16 transform
+  6,  // 16x4 transform
+  7,  // 8x32 transform
+  7,  // 32x8 transform
+  7,  // 16x64 transform
+  7,  // 64x16 transform
+};
+
+extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL];
+
+// Values in both inv_cos_bit_col and inv_cos_bit_row are always 12
+// for each valid row and col combination
+#define INV_COS_BIT 12
+extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/];
+extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/];
 
-//  ---------------- 4x4 1D config -----------------------
-// shift
-static const int8_t inv_shift_4[2] = { 0, -4 };
-
-// stage range
-static const int8_t inv_stage_range_col_dct_4[4] = { 3, 3, 2, 2 };
-static const int8_t inv_stage_range_row_dct_4[4] = { 3, 3, 3, 3 };
-static const int8_t inv_stage_range_col_adst_4[6] = { 3, 3, 3, 3, 2, 2 };
-static const int8_t inv_stage_range_row_adst_4[6] = { 3, 3, 3, 3, 3, 3 };
-static const int8_t inv_stage_range_idx_4[1] = { 0 };
-
-// cos bit
-static const int8_t inv_cos_bit_col_dct_4[4] = { 13, 13, 13, 13 };
-static const int8_t inv_cos_bit_row_dct_4[4] = { 13, 13, 13, 13 };
-static const int8_t inv_cos_bit_col_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
-static const int8_t inv_cos_bit_row_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
-
-//  ---------------- 8x8 1D constants -----------------------
-// shift
-static const int8_t inv_shift_8[2] = { 0, -5 };
-
-// stage range
-static const int8_t inv_stage_range_col_dct_8[6] = { 5, 5, 5, 5, 4, 4 };
-static const int8_t inv_stage_range_row_dct_8[6] = { 5, 5, 5, 5, 5, 5 };
-static const int8_t inv_stage_range_col_adst_8[8] = { 5, 5, 5, 5, 5, 5, 4, 4 };
-static const int8_t inv_stage_range_row_adst_8[8] = { 5, 5, 5, 5, 5, 5, 5, 5 };
-static const int8_t inv_stage_range_idx_8[1] = { 0 };
-
-// cos bit
-static const int8_t inv_cos_bit_col_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
-static const int8_t inv_cos_bit_row_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
-static const int8_t inv_cos_bit_col_adst_8[8] = {
-  13, 13, 13, 13, 13, 13, 13, 13
-};
-static const int8_t inv_cos_bit_row_adst_8[8] = {
-  13, 13, 13, 13, 13, 13, 13, 13
-};
-
-//  ---------------- 16x16 1D constants -----------------------
-// shift
-static const int8_t inv_shift_16[2] = { -1, -5 };
-
-// stage range
-static const int8_t inv_stage_range_col_dct_16[8] = { 7, 7, 7, 7, 7, 7, 6, 6 };
-static const int8_t inv_stage_range_row_dct_16[8] = { 7, 7, 7, 7, 7, 7, 7, 7 };
-static const int8_t inv_stage_range_col_adst_16[10] = { 7, 7, 7, 7, 7,
-                                                        7, 7, 7, 6, 6 };
-static const int8_t inv_stage_range_row_adst_16[10] = { 7, 7, 7, 7, 7,
-                                                        7, 7, 7, 7, 7 };
-static const int8_t inv_stage_range_idx_16[1] = { 0 };
-
-// cos bit
-static const int8_t inv_cos_bit_col_dct_16[8] = {
-  13, 13, 13, 13, 13, 13, 13, 13
-};
-static const int8_t inv_cos_bit_row_dct_16[8] = {
-  12, 12, 12, 12, 12, 12, 12, 12
-};
-static const int8_t inv_cos_bit_col_adst_16[10] = { 13, 13, 13, 13, 13,
-                                                    13, 13, 13, 13, 13 };
-static const int8_t inv_cos_bit_row_adst_16[10] = { 12, 12, 12, 12, 12,
-                                                    12, 12, 12, 12, 12 };
-
-//  ---------------- 32x32 1D constants -----------------------
-// shift
-static const int8_t inv_shift_32[2] = { -1, -5 };
-
-// stage range
-static const int8_t inv_stage_range_col_dct_32[10] = { 9, 9, 9, 9, 9,
-                                                       9, 9, 9, 8, 8 };
-static const int8_t inv_stage_range_row_dct_32[10] = { 9, 9, 9, 9, 9,
-                                                       9, 9, 9, 9, 9 };
-static const int8_t inv_stage_range_col_adst_32[12] = { 9, 9, 9, 9, 9, 9,
-                                                        9, 9, 9, 9, 8, 8 };
-static const int8_t inv_stage_range_row_adst_32[12] = { 9, 9, 9, 9, 9, 9,
-                                                        9, 9, 9, 9, 9, 9 };
-static const int8_t inv_stage_range_idx_32[1] = { 0 };
-
-// cos bit
-static const int8_t inv_cos_bit_col_dct_32[10] = { 13, 13, 13, 13, 13,
-                                                   13, 13, 13, 13, 13 };
-static const int8_t inv_cos_bit_row_dct_32[10] = { 12, 12, 12, 12, 12,
-                                                   12, 12, 12, 12, 12 };
-static const int8_t inv_cos_bit_col_adst_32[12] = { 13, 13, 13, 13, 13, 13,
-                                                    13, 13, 13, 13, 13, 13 };
-static const int8_t inv_cos_bit_row_adst_32[12] = { 12, 12, 12, 12, 12, 12,
-                                                    12, 12, 12, 12, 12, 12 };
-
-//  ---------------- 64x64 1D constants -----------------------
-// shift
-static const int8_t inv_shift_64[2] = { -1, -5 };
-
-// stage range
-static const int8_t inv_stage_range_col_dct_64[12] = { 11, 11, 11, 11, 11, 11,
-                                                       11, 11, 11, 11, 10, 10 };
-static const int8_t inv_stage_range_row_dct_64[12] = { 11, 11, 11, 11, 11, 11,
-                                                       11, 11, 11, 11, 11, 11 };
-
-static const int8_t inv_stage_range_idx_64[1] = { 0 };
-
-// cos bit
-static const int8_t inv_cos_bit_col_dct_64[12] = { 13, 13, 13, 13, 13, 13,
-                                                   13, 13, 13, 13, 13, 13 };
-static const int8_t inv_cos_bit_row_dct_64[12] = { 12, 12, 12, 12, 12, 12,
-                                                   12, 12, 12, 12, 12, 12 };
-
-//  ---------------- row config inv_dct_4 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_4 = {
-  4,                          // .txfm_size
-  4,                          // .stage_num
-  inv_shift_4,                // .shift
-  inv_stage_range_row_dct_4,  // .stage_range
-  inv_cos_bit_row_dct_4,      // .cos_bit
-  TXFM_TYPE_DCT4              // .txfm_type
-};
-
-//  ---------------- row config inv_dct_8 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_8 = {
-  8,                          // .txfm_size
-  6,                          // .stage_num
-  inv_shift_8,                // .shift
-  inv_stage_range_row_dct_8,  // .stage_range
-  inv_cos_bit_row_dct_8,      // .cos_bit_
-  TXFM_TYPE_DCT8              // .txfm_type
-};
-//  ---------------- row config inv_dct_16 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_16 = {
-  16,                          // .txfm_size
-  8,                           // .stage_num
-  inv_shift_16,                // .shift
-  inv_stage_range_row_dct_16,  // .stage_range
-  inv_cos_bit_row_dct_16,      // .cos_bit
-  TXFM_TYPE_DCT16              // .txfm_type
-};
-
-//  ---------------- row config inv_dct_32 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_32 = {
-  32,                          // .txfm_size
-  10,                          // .stage_num
-  inv_shift_32,                // .shift
-  inv_stage_range_row_dct_32,  // .stage_range
-  inv_cos_bit_row_dct_32,      // .cos_bit_row
-  TXFM_TYPE_DCT32              // .txfm_type
-};
-
-#if CONFIG_TX64X64
-//  ---------------- row config inv_dct_64 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_64 = {
-  64,                          // .txfm_size
-  12,                          // .stage_num
-  inv_shift_64,                // .shift
-  inv_stage_range_row_dct_64,  // .stage_range
-  inv_cos_bit_row_dct_64,      // .cos_bit
-  TXFM_TYPE_DCT64,             // .txfm_type_col
-};
-#endif  // CONFIG_TX64X64
-
-//  ---------------- row config inv_adst_4 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_row_cfg_adst_4 = {
-  4,                           // .txfm_size
-  6,                           // .stage_num
-  inv_shift_4,                 // .shift
-  inv_stage_range_row_adst_4,  // .stage_range
-  inv_cos_bit_row_adst_4,      // .cos_bit
-  TXFM_TYPE_ADST4,             // .txfm_type
-};
-
-//  ---------------- row config inv_adst_8 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_row_cfg_adst_8 = {
-  8,                           // .txfm_size
-  8,                           // .stage_num
-  inv_shift_8,                 // .shift
-  inv_stage_range_row_adst_8,  // .stage_range
-  inv_cos_bit_row_adst_8,      // .cos_bit
-  TXFM_TYPE_ADST8,             // .txfm_type_col
-};
-
-//  ---------------- row config inv_adst_16 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_row_cfg_adst_16 = {
-  16,                           // .txfm_size
-  10,                           // .stage_num
-  inv_shift_16,                 // .shift
-  inv_stage_range_row_adst_16,  // .stage_range
-  inv_cos_bit_row_adst_16,      // .cos_bit
-  TXFM_TYPE_ADST16,             // .txfm_type
-};
-
-//  ---------------- row config inv_adst_32 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_row_cfg_adst_32 = {
-  32,                           // .txfm_size
-  12,                           // .stage_num
-  inv_shift_32,                 // .shift
-  inv_stage_range_row_adst_32,  // .stage_range
-  inv_cos_bit_row_adst_32,      // .cos_bit
-  TXFM_TYPE_ADST32,             // .txfm_type
-};
-
-//  ---------------- col config inv_dct_4 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_4 = {
-  4,                          // .txfm_size
-  4,                          // .stage_num
-  inv_shift_4,                // .shift
-  inv_stage_range_col_dct_4,  // .stage_range
-  inv_cos_bit_col_dct_4,      // .cos_bit
-  TXFM_TYPE_DCT4              // .txfm_type
-};
-
-//  ---------------- col config inv_dct_8 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_8 = {
-  8,                          // .txfm_size
-  6,                          // .stage_num
-  inv_shift_8,                // .shift
-  inv_stage_range_col_dct_8,  // .stage_range
-  inv_cos_bit_col_dct_8,      // .cos_bit_
-  TXFM_TYPE_DCT8              // .txfm_type
-};
-//  ---------------- col config inv_dct_16 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_16 = {
-  16,                          // .txfm_size
-  8,                           // .stage_num
-  inv_shift_16,                // .shift
-  inv_stage_range_col_dct_16,  // .stage_range
-  inv_cos_bit_col_dct_16,      // .cos_bit
-  TXFM_TYPE_DCT16              // .txfm_type
-};
-
-//  ---------------- col config inv_dct_32 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_32 = {
-  32,                          // .txfm_size
-  10,                          // .stage_num
-  inv_shift_32,                // .shift
-  inv_stage_range_col_dct_32,  // .stage_range
-  inv_cos_bit_col_dct_32,      // .cos_bit_col
-  TXFM_TYPE_DCT32              // .txfm_type
-};
-
-//  ---------------- col config inv_dct_64 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_64 = {
-  64,                          // .txfm_size
-  12,                          // .stage_num
-  inv_shift_64,                // .shift
-  inv_stage_range_col_dct_64,  // .stage_range
-  inv_cos_bit_col_dct_64,      // .cos_bit
-  TXFM_TYPE_DCT64,             // .txfm_type_col
-};
-
-//  ---------------- col config inv_adst_4 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_col_cfg_adst_4 = {
-  4,                           // .txfm_size
-  6,                           // .stage_num
-  inv_shift_4,                 // .shift
-  inv_stage_range_col_adst_4,  // .stage_range
-  inv_cos_bit_col_adst_4,      // .cos_bit
-  TXFM_TYPE_ADST4,             // .txfm_type
-};
-
-//  ---------------- col config inv_adst_8 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_col_cfg_adst_8 = {
-  8,                           // .txfm_size
-  8,                           // .stage_num
-  inv_shift_8,                 // .shift
-  inv_stage_range_col_adst_8,  // .stage_range
-  inv_cos_bit_col_adst_8,      // .cos_bit
-  TXFM_TYPE_ADST8,             // .txfm_type_col
-};
-
-//  ---------------- col config inv_adst_16 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_col_cfg_adst_16 = {
-  16,                           // .txfm_size
-  10,                           // .stage_num
-  inv_shift_16,                 // .shift
-  inv_stage_range_col_adst_16,  // .stage_range
-  inv_cos_bit_col_adst_16,      // .cos_bit
-  TXFM_TYPE_ADST16,             // .txfm_type
-};
-
-//  ---------------- col config inv_adst_32 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_col_cfg_adst_32 = {
-  32,                           // .txfm_size
-  12,                           // .stage_num
-  inv_shift_32,                 // .shift
-  inv_stage_range_col_adst_32,  // .stage_range
-  inv_cos_bit_col_adst_32,      // .cos_bit
-  TXFM_TYPE_ADST32,             // .txfm_type
-};
-
-#if CONFIG_EXT_TX
-// identity does not need to differentiate between row and col
-//  ---------------- row/col config inv_identity_4 ----------
-static const TXFM_1D_CFG inv_txfm_1d_cfg_identity_4 = {
-  4,                      // .txfm_size
-  1,                      // .stage_num
-  inv_shift_4,            // .shift
-  inv_stage_range_idx_4,  // .stage_range
-  NULL,                   // .cos_bit
-  TXFM_TYPE_IDENTITY4,    // .txfm_type
-};
-
-//  ---------------- row/col config inv_identity_8 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_cfg_identity_8 = {
-  8,                      // .txfm_size
-  1,                      // .stage_num
-  inv_shift_8,            // .shift
-  inv_stage_range_idx_8,  // .stage_range
-  NULL,                   // .cos_bit
-  TXFM_TYPE_IDENTITY8,    // .txfm_type
-};
-
-//  ---------------- row/col config inv_identity_16 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_cfg_identity_16 = {
-  16,                      // .txfm_size
-  1,                       // .stage_num
-  inv_shift_16,            // .shift
-  inv_stage_range_idx_16,  // .stage_range
-  NULL,                    // .cos_bit
-  TXFM_TYPE_IDENTITY16,    // .txfm_type
-};
-
-//  ---------------- row/col config inv_identity_32 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_cfg_identity_32 = {
-  32,                      // .txfm_size
-  1,                       // .stage_num
-  inv_shift_32,            // .shift
-  inv_stage_range_idx_32,  // .stage_range
-  NULL,                    // .cos_bit
-  TXFM_TYPE_IDENTITY32,    // .txfm_type
-};
-
-#if CONFIG_TX64X64
-//  ---------------- row/col config inv_identity_32 ----------------
-static const TXFM_1D_CFG inv_txfm_1d_cfg_identity_64 = {
-  64,                      // .txfm_size
-  1,                       // .stage_num
-  inv_shift_64,            // .shift
-  inv_stage_range_idx_64,  // .stage_range
-  NULL,                    // .cos_bit
-  TXFM_TYPE_IDENTITY64,    // .txfm_type
-};
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
 #endif  // AV1_INV_TXFM2D_CFG_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm2d.c b/third_party/aom/av1/common/av1_inv_txfm2d.c
index 031d11b40..4e6944314 100644
--- a/third_party/aom/av1/common/av1_inv_txfm2d.c
+++ b/third_party/aom/av1/common/av1_inv_txfm2d.c
@@ -9,218 +9,252 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./av1_rtcd.h"
-#include "aom_dsp/inv_txfm.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "av1/common/enums.h"
 #include "av1/common/av1_txfm.h"
 #include "av1/common/av1_inv_txfm1d.h"
 #include "av1/common/av1_inv_txfm1d_cfg.h"
 
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+     0.5 shifts per pixel. */
+  int i;
+  tran_low_t output[16];
+  tran_low_t a1, b1, c1, d1, e1;
+  const tran_low_t *ip = input;
+  tran_low_t *op = output;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] >> UNIT_QUANT_SHIFT;
+    c1 = ip[1] >> UNIT_QUANT_SHIFT;
+    d1 = ip[2] >> UNIT_QUANT_SHIFT;
+    b1 = ip[3] >> UNIT_QUANT_SHIFT;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+
+    op[0] = a1;
+    op[1] = b1;
+    op[2] = c1;
+    op[3] = d1;
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+
+    range_check_value(a1, bd + 1);
+    range_check_value(b1, bd + 1);
+    range_check_value(c1, bd + 1);
+    range_check_value(d1, bd + 1);
+
+    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
+    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
+    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
+    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
+
+    ip++;
+    dest++;
+  }
+}
+
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+                                int dest_stride, int bd) {
+  int i;
+  tran_low_t a1, e1;
+  tran_low_t tmp[4];
+  const tran_low_t *ip = in;
+  tran_low_t *op = tmp;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  (void)bd;
+
+  a1 = ip[0] >> UNIT_QUANT_SHIFT;
+  e1 = a1 >> 1;
+  a1 -= e1;
+  op[0] = a1;
+  op[1] = op[2] = op[3] = e1;
+
+  ip = tmp;
+  for (i = 0; i < 4; i++) {
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] =
+        highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
+    dest[dest_stride * 1] =
+        highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
+    dest[dest_stride * 2] =
+        highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
+    dest[dest_stride * 3] =
+        highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
+    ip++;
+    dest++;
+  }
+}
+
 static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
   switch (txfm_type) {
     case TXFM_TYPE_DCT4: return av1_idct4_new;
     case TXFM_TYPE_DCT8: return av1_idct8_new;
     case TXFM_TYPE_DCT16: return av1_idct16_new;
     case TXFM_TYPE_DCT32: return av1_idct32_new;
-#if CONFIG_TX64X64
     case TXFM_TYPE_DCT64: return av1_idct64_new;
-#endif  // CONFIG_TX64X64
     case TXFM_TYPE_ADST4: return av1_iadst4_new;
     case TXFM_TYPE_ADST8: return av1_iadst8_new;
     case TXFM_TYPE_ADST16: return av1_iadst16_new;
-    case TXFM_TYPE_ADST32: return av1_iadst32_new;
-#if CONFIG_EXT_TX
     case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c;
     case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c;
     case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c;
     case TXFM_TYPE_IDENTITY32: return av1_iidentity32_c;
-#if CONFIG_TX64X64
-    case TXFM_TYPE_IDENTITY64: return av1_iidentity64_c;
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
     default: assert(0); return NULL;
   }
 }
 
-static const TXFM_1D_CFG *inv_txfm_col_cfg_ls[TX_TYPES_1D][TX_SIZES] = {
-  // DCT
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &inv_txfm_1d_col_cfg_dct_4, &inv_txfm_1d_col_cfg_dct_8,
-      &inv_txfm_1d_col_cfg_dct_16, &inv_txfm_1d_col_cfg_dct_32,
-#if CONFIG_TX64X64
-      &inv_txfm_1d_col_cfg_dct_64
-#endif  // CONFIG_TX64X64
-  },
-  // ADST
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &inv_txfm_1d_col_cfg_adst_4, &inv_txfm_1d_col_cfg_adst_8,
-      &inv_txfm_1d_col_cfg_adst_16, &inv_txfm_1d_col_cfg_adst_32,
-#if CONFIG_TX64X64
-      NULL
-#endif  // CONFIG_TX64X64
-  },
-#if CONFIG_EXT_TX
-  // FLIPADST
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &inv_txfm_1d_col_cfg_adst_4, &inv_txfm_1d_col_cfg_adst_8,
-      &inv_txfm_1d_col_cfg_adst_16, &inv_txfm_1d_col_cfg_adst_32,
-#if CONFIG_TX64X64
-      NULL
-#endif  // CONFIG_TX64X64
-  },
-  // IDENTITY
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &inv_txfm_1d_cfg_identity_4, &inv_txfm_1d_cfg_identity_8,
-      &inv_txfm_1d_cfg_identity_16, &inv_txfm_1d_cfg_identity_32,
-#if CONFIG_TX64X64
-      &inv_txfm_1d_cfg_identity_64
-#endif  // CONFIG_TX64X64
-  },
-#endif  // CONFIG_EXT_TX
-};
+static const int8_t inv_shift_4x4[2] = { 0, -4 };
+static const int8_t inv_shift_8x8[2] = { -1, -4 };
+static const int8_t inv_shift_16x16[2] = { -2, -4 };
+static const int8_t inv_shift_32x32[2] = { -2, -4 };
+static const int8_t inv_shift_64x64[2] = { -2, -4 };
+static const int8_t inv_shift_4x8[2] = { 0, -4 };
+static const int8_t inv_shift_8x4[2] = { 0, -4 };
+static const int8_t inv_shift_8x16[2] = { -1, -4 };
+static const int8_t inv_shift_16x8[2] = { -1, -4 };
+static const int8_t inv_shift_16x32[2] = { -1, -4 };
+static const int8_t inv_shift_32x16[2] = { -1, -4 };
+static const int8_t inv_shift_32x64[2] = { -1, -4 };
+static const int8_t inv_shift_64x32[2] = { -1, -4 };
+static const int8_t inv_shift_4x16[2] = { -1, -4 };
+static const int8_t inv_shift_16x4[2] = { -1, -4 };
+static const int8_t inv_shift_8x32[2] = { -2, -4 };
+static const int8_t inv_shift_32x8[2] = { -2, -4 };
+static const int8_t inv_shift_16x64[2] = { -2, -4 };
+static const int8_t inv_shift_64x16[2] = { -2, -4 };
 
-static const TXFM_1D_CFG *inv_txfm_row_cfg_ls[TX_TYPES_1D][TX_SIZES] = {
-  // DCT
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &inv_txfm_1d_row_cfg_dct_4, &inv_txfm_1d_row_cfg_dct_8,
-      &inv_txfm_1d_row_cfg_dct_16, &inv_txfm_1d_row_cfg_dct_32,
-#if CONFIG_TX64X64
-      &inv_txfm_1d_row_cfg_dct_64,
-#endif  // CONFIG_TX64X64
-  },
-  // ADST
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &inv_txfm_1d_row_cfg_adst_4, &inv_txfm_1d_row_cfg_adst_8,
-      &inv_txfm_1d_row_cfg_adst_16, &inv_txfm_1d_row_cfg_adst_32,
-#if CONFIG_TX64X64
-      NULL
-#endif  // CONFIG_TX64X64
-  },
-#if CONFIG_EXT_TX
-  // FLIPADST
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &inv_txfm_1d_row_cfg_adst_4, &inv_txfm_1d_row_cfg_adst_8,
-      &inv_txfm_1d_row_cfg_adst_16, &inv_txfm_1d_row_cfg_adst_32,
-#if CONFIG_TX64X64
-      NULL
-#endif  // CONFIG_TX64X64
-  },
-  // IDENTITY
-  {
-#if CONFIG_CHROMA_2X2
-      NULL,
-#endif
-      &inv_txfm_1d_cfg_identity_4, &inv_txfm_1d_cfg_identity_8,
-      &inv_txfm_1d_cfg_identity_16, &inv_txfm_1d_cfg_identity_32,
-#if CONFIG_TX64X64
-      &inv_txfm_1d_cfg_identity_64
-#endif  // CONFIG_TX64X64
-  },
-#endif  // CONFIG_EXT_TX
+const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = {
+  inv_shift_4x4,   inv_shift_8x8,   inv_shift_16x16, inv_shift_32x32,
+  inv_shift_64x64, inv_shift_4x8,   inv_shift_8x4,   inv_shift_8x16,
+  inv_shift_16x8,  inv_shift_16x32, inv_shift_32x16, inv_shift_32x64,
+  inv_shift_64x32, inv_shift_4x16,  inv_shift_16x4,  inv_shift_8x32,
+  inv_shift_32x8,  inv_shift_16x64, inv_shift_64x16,
 };
 
-TXFM_2D_FLIP_CFG av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size) {
-  TXFM_2D_FLIP_CFG cfg;
-  set_flip_cfg(tx_type, &cfg);
-  const TX_TYPE_1D tx_type_col = vtx_tab[tx_type];
-  const TX_TYPE_1D tx_type_row = htx_tab[tx_type];
-  const TX_SIZE tx_size_col = txsize_vert_map[tx_size];
-  const TX_SIZE tx_size_row = txsize_horz_map[tx_size];
-  cfg.col_cfg = inv_txfm_col_cfg_ls[tx_type_col][tx_size_col];
-  cfg.row_cfg = inv_txfm_row_cfg_ls[tx_type_row][tx_size_row];
-  return cfg;
-}
+/* clang-format off */
+const int8_t inv_cos_bit_col[MAX_TXWH_IDX]      // txw_idx
+                            [MAX_TXWH_IDX] = {  // txh_idx
+    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0,           0 },
+    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0 },
+    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+    {           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+    {           0,           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
+  };
 
-#if CONFIG_TX64X64
-TXFM_2D_FLIP_CFG av1_get_inv_txfm_64x64_cfg(TX_TYPE tx_type) {
-  TXFM_2D_FLIP_CFG cfg = { 0, 0, NULL, NULL };
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg.col_cfg = &inv_txfm_1d_col_cfg_dct_64;
-      cfg.row_cfg = &inv_txfm_1d_row_cfg_dct_64;
-      set_flip_cfg(tx_type, &cfg);
-      break;
-    default: assert(0);
-  }
-  return cfg;
-}
+const int8_t inv_cos_bit_row[MAX_TXWH_IDX]      // txw_idx
+                            [MAX_TXWH_IDX] = {  // txh_idx
+    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0,           0 },
+    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT,           0 },
+    { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+    {           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+    {           0,           0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
+  };
+/* clang-format on */
 
-TXFM_2D_FLIP_CFG av1_get_inv_txfm_32x64_cfg(int tx_type) {
-  TXFM_2D_FLIP_CFG cfg = { 0, 0, NULL, NULL };
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg.col_cfg = &inv_txfm_1d_col_cfg_dct_64;
-      cfg.row_cfg = &inv_txfm_1d_row_cfg_dct_32;
-      set_flip_cfg(tx_type, &cfg);
-      break;
-    default: assert(0);
-  }
-  return cfg;
-}
+const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 };
 
-TXFM_2D_FLIP_CFG av1_get_inv_txfm_64x32_cfg(int tx_type) {
-  TXFM_2D_FLIP_CFG cfg = { 0, 0, NULL, NULL };
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg.col_cfg = &inv_txfm_1d_col_cfg_dct_32;
-      cfg.row_cfg = &inv_txfm_1d_row_cfg_dct_64;
-      set_flip_cfg(tx_type, &cfg);
-      break;
-    default: assert(0);
+void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+                          TXFM_2D_FLIP_CFG *cfg) {
+  assert(cfg != NULL);
+  cfg->tx_size = tx_size;
+  set_flip_cfg(tx_type, cfg);
+  av1_zero(cfg->stage_range_col);
+  av1_zero(cfg->stage_range_row);
+  set_flip_cfg(tx_type, cfg);
+  const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+  const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+  cfg->shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
+  if (cfg->txfm_type_col == TXFM_TYPE_ADST4) {
+    memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range));
+  }
+  cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
+  if (cfg->txfm_type_row == TXFM_TYPE_ADST4) {
+    memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range));
   }
-  return cfg;
+  cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
+  cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
 }
-#endif  // CONFIG_TX64X64
 
 void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
-                             const TXFM_2D_FLIP_CFG *cfg, int8_t fwd_shift,
+                             const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size,
                              int bd) {
-  // Note when assigning txfm_size_col, we use the txfm_size from the
-  // row configuration and vice versa. This is intentionally done to
-  // accurately perform rectangular transforms. When the transform is
-  // rectangular, the number of columns will be the same as the
-  // txfm_size stored in the row cfg struct. It will make no difference
-  // for square transforms.
-  const int txfm_size_col = cfg->row_cfg->txfm_size;
-  const int txfm_size_row = cfg->col_cfg->txfm_size;
-  // Take the shift from the larger dimension in the rectangular case.
-  const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift
-                                                        : cfg->col_cfg->shift;
+  const int fwd_shift = inv_start_range[tx_size];
+  const int8_t *shift = cfg->shift;
+  int8_t opt_range_row, opt_range_col;
+  if (bd == 8) {
+    opt_range_row = 16;
+    opt_range_col = 16;
+  } else if (bd == 10) {
+    opt_range_row = 18;
+    opt_range_col = 16;
+  } else {
+    assert(bd == 12);
+    opt_range_row = 20;
+    opt_range_col = 18;
+  }
   // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
-  for (int i = 0; i < cfg->row_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) {
-    stage_range_row[i] = cfg->row_cfg->stage_range[i] + fwd_shift + bd + 1;
+  for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+    int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1;
+    (void)real_range_row;
+    if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) {
+      // the adst4 may use 1 extra bit on top of opt_range_row at stage 1
+      // so opt_range_col >= real_range_col will not hold
+      stage_range_row[i] = opt_range_row;
+    } else {
+      assert(opt_range_row >= real_range_row);
+      stage_range_row[i] = opt_range_row;
+    }
   }
   // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
-  for (int i = 0; i < cfg->col_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) {
-    stage_range_col[i] =
-        cfg->col_cfg->stage_range[i] + fwd_shift + shift[0] + bd + 1;
+  for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
+    int real_range_col =
+        cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1;
+    (void)real_range_col;
+    if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) {
+      // the adst4 may use 1 extra bit on top of opt_range_row at stage 1
+      // so opt_range_col >= real_range_col will not hold
+      stage_range_col[i] = opt_range_col;
+    } else {
+      assert(opt_range_col >= real_range_col);
+      stage_range_col[i] = opt_range_col;
+    }
   }
 }
 
 static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
                                     int stride, TXFM_2D_FLIP_CFG *cfg,
-                                    int32_t *txfm_buf, int8_t fwd_shift,
+                                    int32_t *txfm_buf, TX_SIZE tx_size,
                                     int bd) {
   // Note when assigning txfm_size_col, we use the txfm_size from the
   // row configuration and vice versa. This is intentionally done to
@@ -228,39 +262,48 @@ static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
   // rectangular, the number of columns will be the same as the
   // txfm_size stored in the row cfg struct. It will make no difference
   // for square transforms.
-  const int txfm_size_col = cfg->row_cfg->txfm_size;
-  const int txfm_size_row = cfg->col_cfg->txfm_size;
+  const int txfm_size_col = tx_size_wide[cfg->tx_size];
+  const int txfm_size_row = tx_size_high[cfg->tx_size];
   // Take the shift from the larger dimension in the rectangular case.
-  const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift
-                                                        : cfg->col_cfg->shift;
+  const int8_t *shift = cfg->shift;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
   int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
   int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
-  assert(cfg->row_cfg->stage_num <= MAX_TXFM_STAGE_NUM);
-  assert(cfg->col_cfg->stage_num <= MAX_TXFM_STAGE_NUM);
-  av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, fwd_shift, bd);
+  assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
+  assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
+  av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd);
 
-  const int8_t *cos_bit_col = cfg->col_cfg->cos_bit;
-  const int8_t *cos_bit_row = cfg->row_cfg->cos_bit;
-  const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->col_cfg->txfm_type);
-  const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->row_cfg->txfm_type);
+  const int8_t cos_bit_col = cfg->cos_bit_col;
+  const int8_t cos_bit_row = cfg->cos_bit_row;
+  const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row);
 
-  // txfm_buf's length is  txfm_size_row * txfm_size_col + 2 * txfm_size_row
+  // txfm_buf's length is  txfm_size_row * txfm_size_col + 2 *
+  // AOMMAX(txfm_size_row, txfm_size_col)
   // it is used for intermediate data buffering
+  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
   int32_t *temp_in = txfm_buf;
-  int32_t *temp_out = temp_in + txfm_size_row;
-  int32_t *buf = temp_out + txfm_size_row;
+  int32_t *temp_out = temp_in + buf_offset;
+  int32_t *buf = temp_out + buf_offset;
   int32_t *buf_ptr = buf;
   int c, r;
 
   // Rows
   for (r = 0; r < txfm_size_row; ++r) {
-    txfm_func_row(input, buf_ptr, cos_bit_row, stage_range_row);
-    round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
-    // Multiply everything by Sqrt2 if the transform is rectangular
-    if (txfm_size_row != txfm_size_col) {
-      for (c = 0; c < txfm_size_col; ++c)
-        buf_ptr[c] = (int32_t)dct_const_round_shift(buf_ptr[c] * Sqrt2);
+    if (abs(rect_type) == 1) {
+      for (c = 0; c < txfm_size_col; ++c) {
+        temp_in[c] = round_shift((int64_t)input[c] * NewInvSqrt2, NewSqrt2Bits);
+      }
+      clamp_buf(temp_in, txfm_size_col, bd + 8);
+      txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row);
+    } else {
+      for (c = 0; c < txfm_size_col; ++c) {
+        temp_in[c] = input[c];
+      }
+      clamp_buf(temp_in, txfm_size_col, bd + 8);
+      txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row);
     }
+    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
     input += txfm_size_col;
     buf_ptr += txfm_size_col;
   }
@@ -275,8 +318,9 @@ static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
       for (r = 0; r < txfm_size_row; ++r)
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
+    clamp_buf(temp_in, txfm_size_row, AOMMAX(bd + 6, 16));
     txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
-    round_shift_array(temp_out, txfm_size_row, -shift[1]);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
     if (cfg->ud_flip == 0) {
       for (r = 0; r < txfm_size_row; ++r) {
         output[r * stride + c] =
@@ -296,156 +340,166 @@ static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output,
                                          int stride, int32_t *txfm_buf,
                                          TX_TYPE tx_type, TX_SIZE tx_size,
                                          int bd) {
-  TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_cfg(tx_type, tx_size);
-  TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
-  inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf,
-                   fwd_shift_sum[tx_size_sqr], bd);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg);
+  // Forward shift sum uses larger square size, to be consistent with what
+  // av1_gen_inv_stage_range() does for inverse shifts.
+  inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, tx_size, bd);
 }
 
 void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
-  int txfm_buf[4 * 8 + 8 + 8];
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
   inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd);
 }
 
 void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
-#if CONFIG_TXMG
-  int txfm_buf[8 * 4 + 8 + 8];
-  int32_t rinput[8 * 4];
-  uint16_t routput[8 * 4];
-  TX_SIZE tx_size = TX_8X4;
-  TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
-  TX_TYPE rtx_type = av1_rotate_tx_type(tx_type);
-  int w = tx_size_wide[tx_size];
-  int h = tx_size_high[tx_size];
-  int rw = h;
-  int rh = w;
-  transpose_int32(rinput, rw, input, w, w, h);
-  transpose_uint16(routput, rw, output, stride, w, h);
-  inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
-  transpose_uint16(output, stride, routput, rw, rw, rh);
-#else
-  int txfm_buf[8 * 4 + 4 + 4];
+  DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
   inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd);
-#endif
 }
 
 void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
-  int txfm_buf[8 * 16 + 16 + 16];
+  DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]);
   inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd);
 }
 
 void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output,
                                int stride, TX_TYPE tx_type, int bd) {
-#if CONFIG_TXMG
-  int txfm_buf[16 * 8 + 16 + 16];
-  int32_t rinput[16 * 8];
-  uint16_t routput[16 * 8];
-  TX_SIZE tx_size = TX_16X8;
-  TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
-  TX_TYPE rtx_type = av1_rotate_tx_type(tx_type);
-  int w = tx_size_wide[tx_size];
-  int h = tx_size_high[tx_size];
-  int rw = h;
-  int rh = w;
-  transpose_int32(rinput, rw, input, w, w, h);
-  transpose_uint16(routput, rw, output, stride, w, h);
-  inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
-  transpose_uint16(output, stride, routput, rw, rw, rh);
-#else
-  int txfm_buf[16 * 8 + 8 + 8];
+  DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]);
   inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd);
-#endif
 }
 
 void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output,
                                 int stride, TX_TYPE tx_type, int bd) {
-  int txfm_buf[16 * 32 + 32 + 32];
+  DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]);
   inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd);
 }
 
 void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output,
                                 int stride, TX_TYPE tx_type, int bd) {
-#if CONFIG_TXMG
-  int txfm_buf[32 * 16 + 32 + 32];
-  int32_t rinput[32 * 16];
-  uint16_t routput[32 * 16];
-  TX_SIZE tx_size = TX_32X16;
-  TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
-  TX_TYPE rtx_type = av1_rotate_tx_type(tx_type);
-  int w = tx_size_wide[tx_size];
-  int h = tx_size_high[tx_size];
-  int rw = h;
-  int rh = w;
-  transpose_int32(rinput, rw, input, w, w, h);
-  transpose_uint16(routput, rw, output, stride, w, h);
-  inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
-  transpose_uint16(output, stride, routput, rw, rw, rh);
-#else
-  int txfm_buf[32 * 16 + 16 + 16];
+  DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]);
   inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd);
-#endif
 }
 
 void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
-  int txfm_buf[4 * 4 + 4 + 4];
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]);
   inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd);
 }
 
 void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
-  int txfm_buf[8 * 8 + 8 + 8];
+  DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 + 8 + 8]);
   inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd);
 }
 
 void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output,
                                 int stride, TX_TYPE tx_type, int bd) {
-  int txfm_buf[16 * 16 + 16 + 16];
+  DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]);
   inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd);
 }
 
 void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output,
                                 int stride, TX_TYPE tx_type, int bd) {
-  int txfm_buf[32 * 32 + 32 + 32];
+  DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
   inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd);
 }
 
-#if CONFIG_TX64X64
 void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
                                 int stride, TX_TYPE tx_type, int bd) {
-  int txfm_buf[64 * 64 + 64 + 64];
-  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_64X64, bd);
+  // TODO(urvang): Can the same array be reused, instead of using a new array?
+  // Remap 32x32 input into a modified 64x64 by:
+  // - Copying over these values in top-left 32x32 locations.
+  // - Setting the rest of the locations to 0.
+  int32_t mod_input[64 * 64];
+  for (int row = 0; row < 32; ++row) {
+    memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+    memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+  }
+  memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input));
+  DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
+  inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64,
+                        bd);
 }
 
 void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output,
                                 int stride, TX_TYPE tx_type, int bd) {
-#if CONFIG_TXMG
-  int txfm_buf[64 * 32 + 64 + 64];
-  int32_t rinput[64 * 32];
-  uint16_t routput[64 * 32];
-  TX_SIZE tx_size = TX_64X32;
-  TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
-  TX_TYPE rtx_type = av1_rotate_tx_type(tx_type);
-  int w = tx_size_wide[tx_size];
-  int h = tx_size_high[tx_size];
-  int rw = h;
-  int rh = w;
-  transpose_int32(rinput, rw, input, w, w, h);
-  transpose_uint16(routput, rw, output, stride, w, h);
-  inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
-  transpose_uint16(output, stride, routput, rw, rw, rh);
-#else
-  int txfm_buf[64 * 32 + 64 + 64];
-  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_64X32, bd);
-#endif
+  // Remap 32x32 input into a modified 64x32 by:
+  // - Copying over these values in top-left 32x32 locations.
+  // - Setting the rest of the locations to 0.
+  int32_t mod_input[64 * 32];
+  for (int row = 0; row < 32; ++row) {
+    memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+    memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+  }
+  DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);
+  inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32,
+                        bd);
 }
 
 void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output,
                                 int stride, TX_TYPE tx_type, int bd) {
-  int txfm_buf[64 * 32 + 64 + 64];
-  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X64, bd);
+  // Remap 32x32 input into a modified 32x64 input by:
+  // - Copying over these values in top-left 32x32 locations.
+  // - Setting the rest of the locations to 0.
+  int32_t mod_input[32 * 64];
+  memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input));
+  memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input));
+  DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);
+  inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64,
+                        bd);
+}
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  // Remap 16x32 input into a modified 16x64 input by:
+  // - Copying over these values in top-left 16x32 locations.
+  // - Setting the rest of the locations to 0.
+  int32_t mod_input[16 * 64];
+  memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input));
+  memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input));
+  DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);
+  inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64,
+                        bd);
+}
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  // Remap 32x16 input into a modified 64x16 by:
+  // - Copying over these values in top-left 32x16 locations.
+  // - Setting the rest of the locations to 0.
+  int32_t mod_input[64 * 16];
+  for (int row = 0; row < 16; ++row) {
+    memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+    memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+  }
+  DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);
+  inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16,
+                        bd);
+}
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
+  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd);
+}
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
+  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd);
+}
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);
+  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd);
+}
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);
+  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd);
 }
-#endif  // CONFIG_TX64X64
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
index 95f7a8687..738290fad 100644
--- a/third_party/aom/av1/common/av1_loopfilter.c
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -11,8 +11,9 @@
 
 #include <math.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
@@ -21,590 +22,211 @@
 #include "av1/common/reconinter.h"
 #include "av1/common/seg_common.h"
 
-#if CONFIG_LOOPFILTER_LEVEL
 static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = {
   { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H },
   { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U },
   { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
 };
 
-#if CONFIG_EXT_DELTA_Q
 static const int delta_lf_id_lut[MAX_MB_PLANE][2] = {
   { 0, 1 }, { 2, 2 }, { 3, 3 }
 };
-#endif  // CONFIG_EXT_DELTA_Q
-#endif  // CONFIG_LOOPFILTER_LEVEL
-
-#if CONFIG_LPF_DIRECT
-static void pick_filter_pixel_left(uint8_t *const src, uint8_t *const line,
-                                   int *const orig_pos, int length, int row,
-                                   int col, int width, int height, int pitch,
-                                   int pivot, int direct) {
-  int i;
-  int pos = row * pitch + col;
-
-  for (i = 0; i < length; ++i) {
-    int dy = 0;
-    switch (direct) {
-      case VERT_HORZ: dy = 0; break;
-      case DEGREE_45: dy = 1; break;
-      case DEGREE_135: dy = -1; break;
-    }
-    col -= 1;
-    row += dy;
-    if (col >= 0 && col < width && row >= 0 && row < height) {
-      pos = row * pitch + col;
-      line[pivot - 1 - i] = src[pos];
-      orig_pos[pivot - 1 - i] = pos;
-    }
-  }
-}
 
-static void pick_filter_pixel_right(uint8_t *const src, uint8_t *const line,
-                                    int *const orig_pos, int length, int row,
-                                    int col, int width, int height, int pitch,
-                                    int pivot, int direct) {
-  int i;
-  int pos = row * pitch + col;
+typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;
 
-  line[pivot] = src[pos];
-  orig_pos[pivot] = pos;
+static const int mode_lf_lut[] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // INTRA_MODES
+  1, 1, 0, 1,                             // INTER_MODES (GLOBALMV == 0)
+  1, 1, 1, 1, 1, 1, 0, 1  // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0)
+};
 
-  for (i = 1; i < length; ++i) {
-    int dy = 0;
-    switch (direct) {
-      case VERT_HORZ: dy = 0; break;
-      case DEGREE_45: dy = -1; break;
-      case DEGREE_135: dy = 1; break;
-    }
-    col += 1;
-    row += dy;
-    if (col >= 0 && col < width && row >= 0 && row < height) {
-      pos = row * pitch + col;
-      line[pivot + i] = src[pos];
-      orig_pos[pivot + i] = pos;
-    }
-  }
-}
+#if LOOP_FILTER_BITMASK
+// 256 bit masks (64x64 / 4x4) for left transform size for Y plane.
+// We use 4 uint64_t to represent the 256 bit.
+// Each 1 represents a position where we should apply a loop filter
+// across the left border of an 4x4 block boundary.
+//
+// In the case of TX_8x8->  ( in low order byte first we end up with
+// a mask that looks like this (-- and | are used for better view)
+//
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    -----------------
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//    10101010|10101010
+//
+// A loopfilter should be applied to every other 4x4 horizontally.
+// TODO(chengchen): make these tables static
+const FilterMask left_txform_mask[TX_SIZES] = {
+  { { 0xffffffffffffffffULL,  // TX_4X4,
+      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
 
-static void pick_filter_pixel_above(uint8_t *const src, uint8_t *const line,
-                                    int *const orig_pos, int length, int row,
-                                    int col, int width, int height, int pitch,
-                                    int pivot, int direct) {
-  int i;
-  int pos = row * pitch + col;
-
-  for (i = 0; i < length; ++i) {
-    int dx = 0;
-    switch (direct) {
-      case VERT_HORZ: dx = 0; break;
-      case DEGREE_45: dx = 1; break;
-      case DEGREE_135: dx = -1; break;
-    }
-    col += dx;
-    row -= 1;
-    if (col >= 0 && col < width && row >= 0 && row < height) {
-      pos = row * pitch + col;
-      line[pivot - 1 - i] = src[pos];
-      orig_pos[pivot - 1 - i] = pos;
-    }
-  }
-}
+  { { 0x5555555555555555ULL,  // TX_8X8,
+      0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL } },
 
-static void pick_filter_pixel_bot(uint8_t *const src, uint8_t *const line,
-                                  int *const orig_pos, int length, int row,
-                                  int col, int width, int height, int pitch,
-                                  int pivot, int direct) {
-  int i;
-  int pos = row * pitch + col;
+  { { 0x1111111111111111ULL,  // TX_16X16,
+      0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL } },
 
-  line[pivot] = src[pos];
-  orig_pos[pivot] = pos;
+  { { 0x0101010101010101ULL,  // TX_32X32,
+      0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL } },
 
-  for (i = 1; i < length; ++i) {
-    int dx = 0;
-    switch (direct) {
-      case VERT_HORZ: dx = 0; break;
-      case DEGREE_45: dx = -1; break;
-      case DEGREE_135: dx = 1; break;
-    }
-    col += dx;
-    row += 1;
-    if (col >= 0 && col < width && row >= 0 && row < height) {
-      pos = row * pitch + col;
-      line[pivot + i] = src[pos];
-      orig_pos[pivot + i] = pos;
-    }
-  }
-}
+  { { 0x0001000100010001ULL,  // TX_64X64,
+      0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
+};
 
-static void pick_filter_block_vert(uint8_t *const src, uint8_t *const block,
-                                   int *const orig_pos, int length, int row,
-                                   int col, int width, int height, int pitch,
-                                   int pivot, int line_length, int unit,
-                                   int direct) {
-  int i;
-  for (i = 0; i < 8 * unit; ++i) {
-    pick_filter_pixel_left(src, block + i * line_length,
-                           orig_pos + i * line_length, length, row + i, col,
-                           width, height, pitch, pivot, direct);
-    pick_filter_pixel_right(src, block + i * line_length,
-                            orig_pos + i * line_length, length, row + i, col,
-                            width, height, pitch, pivot, direct);
-  }
-}
+// 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
+// We use 4 uint64_t to represent the 256 bit.
+// Each 1 represents a position where we should apply a loop filter
+// across the top border of an 4x4 block boundary.
+//
+// In the case of TX_8x8->  ( in low order byte first we end up with
+// a mask that looks like this
+//
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//    -----------------
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//    11111111|11111111
+//    00000000|00000000
+//
+// A loopfilter should be applied to every other 4x4 horizontally.
+const FilterMask above_txform_mask[TX_SIZES] = {
+  { { 0xffffffffffffffffULL,  // TX_4X4
+      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
 
-static void pick_filter_block_horz(uint8_t *const src, uint8_t *const block,
-                                   int *const orig_pos, int length, int row,
-                                   int col, int width, int height, int pitch,
-                                   int pivot, int line_length, int unit,
-                                   int direct) {
-  int i, j;
-  int num = 8 * unit;
-  for (i = 0; i < num; ++i) {
-    pick_filter_pixel_above(src, block + i * line_length,
-                            orig_pos + i * line_length, length, row, col + i,
-                            width, height, pitch, pivot, direct);
-    pick_filter_pixel_bot(src, block + i * line_length,
-                          orig_pos + i * line_length, length, row, col + i,
-                          width, height, pitch, pivot, direct);
-  }
+  { { 0x0000ffff0000ffffULL,  // TX_8X8
+      0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL } },
 
-  // rearrange block
-  // TODO(chengchen): make it in-place or a stand alone function
-  uint8_t tmp_block[256];
-  int tmp_pos[256];
-  for (i = 0; i < 256; ++i) {
-    tmp_block[i] = 0;
-    tmp_pos[i] = -1;
-  }
-  for (i = 0; i < num; ++i) {
-    for (j = 0; j < line_length; ++j) {
-      tmp_block[j * line_length + i] = block[i * line_length + j];
-      tmp_pos[j * line_length + i] = orig_pos[i * line_length + j];
-    }
-  }
-  for (i = 0; i < 256; ++i) {
-    block[i] = tmp_block[i];
-    orig_pos[i] = tmp_pos[i];
-  }
-}
+  { { 0x000000000000ffffULL,  // TX_16X16
+      0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL } },
 
-static int compute_block_grad(uint8_t *const src, int length, int row, int col,
-                              int width, int height, int pitch, int unit,
-                              int vert_or_horz, int direct) {
-  int i, j;
-  int r0, c0, pos0, r1 = 0, c1 = 0, pos1;
-  int sum_grad = 0;
-  for (i = 0; i < 8 * unit; ++i) {
-    // vert_or_horz: 0 vertical edge, 1 horizontal edge
-    r0 = vert_or_horz ? row : row + i;
-    c0 = vert_or_horz ? col + i : col;
-    pos0 = r0 * pitch + c0;
-
-    for (j = 0; j < length; ++j) {
-      if (vert_or_horz == 0) {
-        switch (direct) {
-          case VERT_HORZ: r1 = r0; break;
-          case DEGREE_45: r1 = r0 + 1; break;
-          case DEGREE_135: r1 = r0 - 1; break;
-        }
-        c1 = c0 - 1;
-      } else {
-        r1 = r0 - 1;
-        switch (direct) {
-          case VERT_HORZ: c1 = c0; break;
-          case DEGREE_45: c1 = c0 + 1; break;
-          case DEGREE_135: c1 = c0 - 1; break;
-        }
-      }
-      pos1 = r1 * pitch + c1;
+  { { 0x000000000000ffffULL,  // TX_32X32
+      0x0000000000000000ULL, 0x000000000000ffffULL, 0x0000000000000000ULL } },
 
-      if (r0 >= 0 && r0 < height && c0 >= 0 && c0 < width && r1 >= 0 &&
-          r1 < height && c1 >= 0 && c1 < width) {
-        sum_grad += abs(src[pos1] - src[pos0]);
-      } else {
-        sum_grad += 255;  // penalize unreachable boundary
-      }
-      r0 = r1;
-      c0 = c1;
-      pos0 = pos1;
-    }
+  { { 0x000000000000ffffULL,  // TX_64X64
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+};
 
-    r0 = vert_or_horz ? row : row + i;
-    c0 = vert_or_horz ? col + i : col;
-    pos0 = r0 * pitch + c0;
+// 64 bit mask to shift and set for each prediction size. A bit is set for
+// each 4x4 block that would be in the top left most block of the given block
+// size in the 64x64 block.
+const FilterMask size_mask_y[BLOCK_SIZES_ALL] = {
+  { { 0x0000000000000001ULL,  // BLOCK_4X4
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-    for (j = 0; j < length - 1; ++j) {
-      if (vert_or_horz == 0) {
-        switch (direct) {
-          case VERT_HORZ: r1 = r0; break;
-          case DEGREE_45: r1 = r0 - 1; break;
-          case DEGREE_135: r1 = r0 + 1; break;
-        }
-        c1 = c0 + 1;
-      } else {
-        r1 = r0 + 1;
-        switch (direct) {
-          case VERT_HORZ: c1 = c0; break;
-          case DEGREE_45: c1 = c0 - 1; break;
-          case DEGREE_135: c1 = c0 + 1; break;
-        }
-      }
-      pos1 = r1 * pitch + c1;
+  { { 0x0000000000010001ULL,  // BLOCK_4X8
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-      if (r0 >= 0 && r0 < height && c0 >= 0 && c0 < width && r1 >= 0 &&
-          r1 < height && c1 >= 0 && c1 < width) {
-        sum_grad += abs(src[pos1] - src[pos0]);
-      } else {
-        sum_grad += 255;  // penalize unreachable boundary
-      }
-      r0 = r1;
-      c0 = c1;
-      pos0 = pos1;
-    }
-  }
+  { { 0x0000000000000003ULL,  // BLOCK_8X4
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-  return sum_grad;
-}
+  { { 0x0000000000030003ULL,  // BLOCK_8X8
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-static int pick_min_grad_direct(uint8_t *const src, int length, int row,
-                                int col, int width, int height, int pitch,
-                                int unit, int vert_or_horz) {
-  int direct = VERT_HORZ;
-  int min_grad = INT_MAX, sum_grad = 0;
-
-  int degree;
-  for (degree = 0; degree < FILTER_DEGREES; ++degree) {
-    // compute abs gradient along each line for the filter block
-    sum_grad = compute_block_grad(src, length, row, col, width, height, pitch,
-                                  unit, vert_or_horz, degree);
-    if (sum_grad < min_grad) {
-      min_grad = sum_grad;
-      direct = degree;
-    }
-  }
+  { { 0x0003000300030003ULL,  // BLOCK_8X16
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-  return direct;
-}
-#endif  // CONFIG_LPF_DIRECT
+  { { 0x00000000000f000fULL,  // BLOCK_16X8
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-#define PARALLEL_DEBLOCKING_15TAPLUMAONLY 1
-#define PARALLEL_DEBLOCKING_DISABLE_15TAP 0
-#if CONFIG_DEBLOCK_13TAP
-#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 1
-#else
-#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 0
-#endif
+  { { 0x000f000f000f000fULL,  // BLOCK_16X16
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
-extern void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                                 const uint8_t *limit, const uint8_t *thresh);
+  { { 0x000f000f000f000fULL,  // BLOCK_16X32
+      0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-extern void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh);
+  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X16
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-extern void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p,
-                                          const uint8_t *blimit,
-                                          const uint8_t *limit,
-                                          const uint8_t *thresh, int bd);
+  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X32
+      0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-extern void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch,
-                                        const uint8_t *blimit,
-                                        const uint8_t *limit,
-                                        const uint8_t *thresh, int bd);
-#endif
+  { { 0x00ff00ff00ff00ffULL,  // BLOCK_32X64
+      0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL } },
 
-// 64 bit masks for left transform size. Each 1 represents a position where
-// we should apply a loop filter across the left border of an 8x8 block
-// boundary.
-//
-// In the case of TX_16X16->  ( in low order byte first we end up with
-// a mask that looks like this
-//
-//    10101010
-//    10101010
-//    10101010
-//    10101010
-//    10101010
-//    10101010
-//    10101010
-//    10101010
-//
-// A loopfilter should be applied to every other 8x8 horizontally.
-static const uint64_t left_64x64_txform_mask[TX_SIZES] = {
-#if CONFIG_CHROMA_2X2
-  0xffffffffffffffffULL,  // TX_2X2
-#endif
-  0xffffffffffffffffULL,  // TX_4X4
-  0xffffffffffffffffULL,  // TX_8x8
-  0x5555555555555555ULL,  // TX_16x16
-  0x1111111111111111ULL,  // TX_32x32
-#if CONFIG_TX64X64
-  0x0101010101010101ULL,  // TX_64x64
-#endif                    // CONFIG_TX64X64
-};
+  { { 0xffffffffffffffffULL,  // BLOCK_64X32
+      0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-// 64 bit masks for above transform size. Each 1 represents a position where
-// we should apply a loop filter across the top border of an 8x8 block
-// boundary.
-//
-// In the case of TX_32x32 ->  ( in low order byte first we end up with
-// a mask that looks like this
-//
-//    11111111
-//    00000000
-//    00000000
-//    00000000
-//    11111111
-//    00000000
-//    00000000
-//    00000000
-//
-// A loopfilter should be applied to every other 4 the row vertically.
-static const uint64_t above_64x64_txform_mask[TX_SIZES] = {
-#if CONFIG_CHROMA_2X2
-  0xffffffffffffffffULL,  // TX_4X4
-#endif
-  0xffffffffffffffffULL,  // TX_4X4
-  0xffffffffffffffffULL,  // TX_8x8
-  0x00ff00ff00ff00ffULL,  // TX_16x16
-  0x000000ff000000ffULL,  // TX_32x32
-#if CONFIG_TX64X64
-  0x00000000000000ffULL,  // TX_64x64
-#endif                    // CONFIG_TX64X64
-};
+  { { 0xffffffffffffffffULL,  // BLOCK_64X64
+      0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
+  // Y plane max coding block size is 128x128, but the codec divides it
+  // into 4 64x64 blocks.
+  // BLOCK_64X128
+  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
+  // BLOCK_128X64
+  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
+  // BLOCK_128X128
+  { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
 
-// 64 bit masks for prediction sizes (left). Each 1 represents a position
-// where left border of an 8x8 block. These are aligned to the right most
-// appropriate bit, and then shifted into place.
-//
-// In the case of TX_16x32 ->  ( low order byte first ) we end up with
-// a mask that looks like this :
-//
-//  10000000
-//  10000000
-//  10000000
-//  10000000
-//  00000000
-//  00000000
-//  00000000
-//  00000000
-static const uint64_t left_prediction_mask[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  0x0000000000000001ULL,  // BLOCK_2X2,
-  0x0000000000000001ULL,  // BLOCK_2X4,
-  0x0000000000000001ULL,  // BLOCK_4X2,
-#endif
-  0x0000000000000001ULL,  // BLOCK_4X4,
-  0x0000000000000001ULL,  // BLOCK_4X8,
-  0x0000000000000001ULL,  // BLOCK_8X4,
-  0x0000000000000001ULL,  // BLOCK_8X8,
-  0x0000000000000101ULL,  // BLOCK_8X16,
-  0x0000000000000001ULL,  // BLOCK_16X8,
-  0x0000000000000101ULL,  // BLOCK_16X16,
-  0x0000000001010101ULL,  // BLOCK_16X32,
-  0x0000000000000101ULL,  // BLOCK_32X16,
-  0x0000000001010101ULL,  // BLOCK_32X32,
-  0x0101010101010101ULL,  // BLOCK_32X64,
-  0x0000000001010101ULL,  // BLOCK_64X32,
-  0x0101010101010101ULL,  // BLOCK_64X64,
-  0x0000000000000101ULL,  // BLOCK_4X16,
-  0x0000000000000001ULL,  // BLOCK_16X4,
-  0x0000000001010101ULL,  // BLOCK_8X32,
-  0x0000000000000001ULL,  // BLOCK_32X8,
-  0x0101010101010101ULL,  // BLOCK_16X64,
-  0x0000000000000101ULL,  // BLOCK_64X16
-};
+  { { 0x0001000100010001ULL,  // BLOCK_4X16
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-// 64 bit mask to shift and set for each prediction size.
-static const uint64_t above_prediction_mask[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  0x0000000000000001ULL,  // BLOCK_2X2
-  0x0000000000000001ULL,  // BLOCK_2X4
-  0x0000000000000001ULL,  // BLOCK_4X2
-#endif
-  0x0000000000000001ULL,  // BLOCK_4X4
-  0x0000000000000001ULL,  // BLOCK_4X8
-  0x0000000000000001ULL,  // BLOCK_8X4
-  0x0000000000000001ULL,  // BLOCK_8X8
-  0x0000000000000001ULL,  // BLOCK_8X16,
-  0x0000000000000003ULL,  // BLOCK_16X8
-  0x0000000000000003ULL,  // BLOCK_16X16
-  0x0000000000000003ULL,  // BLOCK_16X32,
-  0x000000000000000fULL,  // BLOCK_32X16,
-  0x000000000000000fULL,  // BLOCK_32X32,
-  0x000000000000000fULL,  // BLOCK_32X64,
-  0x00000000000000ffULL,  // BLOCK_64X32,
-  0x00000000000000ffULL,  // BLOCK_64X64,
-  0x0000000000000001ULL,  // BLOCK_4X16,
-  0x0000000000000003ULL,  // BLOCK_16X4,
-  0x0000000000000001ULL,  // BLOCK_8X32,
-  0x000000000000000fULL,  // BLOCK_32X8,
-  0x0000000000000003ULL,  // BLOCK_16X64,
-  0x00000000000000ffULL,  // BLOCK_64X16
-};
-// 64 bit mask to shift and set for each prediction size. A bit is set for
-// each 8x8 block that would be in the top left most block of the given block
-// size in the 64x64 block.
-static const uint64_t size_mask[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  0x0000000000000001ULL,  // BLOCK_2X2
-  0x0000000000000001ULL,  // BLOCK_2X4
-  0x0000000000000001ULL,  // BLOCK_4X2
-#endif
-  0x0000000000000001ULL,  // BLOCK_4X4
-  0x0000000000000001ULL,  // BLOCK_4X8
-  0x0000000000000001ULL,  // BLOCK_8X4
-  0x0000000000000001ULL,  // BLOCK_8X8
-  0x0000000000000101ULL,  // BLOCK_8X16,
-  0x0000000000000003ULL,  // BLOCK_16X8
-  0x0000000000000303ULL,  // BLOCK_16X16
-  0x0000000003030303ULL,  // BLOCK_16X32,
-  0x0000000000000f0fULL,  // BLOCK_32X16,
-  0x000000000f0f0f0fULL,  // BLOCK_32X32,
-  0x0f0f0f0f0f0f0f0fULL,  // BLOCK_32X64,
-  0x00000000ffffffffULL,  // BLOCK_64X32,
-  0xffffffffffffffffULL,  // BLOCK_64X64,
-  0x0000000000000101ULL,  // BLOCK_4X16,
-  0x0000000000000003ULL,  // BLOCK_16X4,
-  0x0000000001010101ULL,  // BLOCK_8X32,
-  0x000000000000000fULL,  // BLOCK_32X8,
-  0x0303030303030303ULL,  // BLOCK_16X64,
-  0x000000000000ffffULL,  // BLOCK_64X16
-};
+  { { 0x000000000000000fULL,  // BLOCK_16X4
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-// These are used for masking the left and above 32x32 borders.
-static const uint64_t left_border = 0x1111111111111111ULL;
-static const uint64_t above_border = 0x000000ff000000ffULL;
+  { { 0x0003000300030003ULL,  // BLOCK_8X32
+      0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-// 16 bit masks for uv transform sizes.
-static const uint16_t left_64x64_txform_mask_uv[TX_SIZES] = {
-#if CONFIG_CHROMA_2X2
-  0xffff,  // TX_2X2
-#endif
-  0xffff,  // TX_4X4
-  0xffff,  // TX_8x8
-  0x5555,  // TX_16x16
-  0x1111,  // TX_32x32
-#if CONFIG_TX64X64
-  0x0101,  // TX_64x64, never used
-#endif     // CONFIG_TX64X64
-};
+  { { 0x0000000000ff00ffULL,  // BLOCK_32X8
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
 
-static const uint16_t above_64x64_txform_mask_uv[TX_SIZES] = {
-#if CONFIG_CHROMA_2X2
-  0xffff,  // TX_2X2
-#endif
-  0xffff,  // TX_4X4
-  0xffff,  // TX_8x8
-  0x0f0f,  // TX_16x16
-  0x000f,  // TX_32x32
-#if CONFIG_TX64X64
-  0x0003,  // TX_64x64, never used
-#endif     // CONFIG_TX64X64
-};
+  { { 0x000f000f000f000fULL,  // BLOCK_16X64
+      0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL } },
 
-// 16 bit left mask to shift and set for each uv prediction size.
-static const uint16_t left_prediction_mask_uv[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  0x0001,  // BLOCK_2X2,
-  0x0001,  // BLOCK_2X4,
-  0x0001,  // BLOCK_4X2,
-#endif
-  0x0001,  // BLOCK_4X4,
-  0x0001,  // BLOCK_4X8,
-  0x0001,  // BLOCK_8X4,
-  0x0001,  // BLOCK_8X8,
-  0x0001,  // BLOCK_8X16,
-  0x0001,  // BLOCK_16X8,
-  0x0001,  // BLOCK_16X16,
-  0x0011,  // BLOCK_16X32,
-  0x0001,  // BLOCK_32X16,
-  0x0011,  // BLOCK_32X32,
-  0x1111,  // BLOCK_32X64
-  0x0011,  // BLOCK_64X32,
-  0x1111,  // BLOCK_64X64,
-  0x0001,  // BLOCK_4X16,
-  0x0001,  // BLOCK_16X4,
-  0x0011,  // BLOCK_8X32,
-  0x0001,  // BLOCK_32X8,
-  0x1111,  // BLOCK_16X64,
-  0x0001,  // BLOCK_64X16,
+  { { 0xffffffffffffffffULL,  // BLOCK_64X16
+      0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }
 };
 
-// 16 bit above mask to shift and set for uv each prediction size.
-static const uint16_t above_prediction_mask_uv[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  0x0001,  // BLOCK_2X2
-  0x0001,  // BLOCK_2X4
-  0x0001,  // BLOCK_4X2
-#endif
-  0x0001,  // BLOCK_4X4
-  0x0001,  // BLOCK_4X8
-  0x0001,  // BLOCK_8X4
-  0x0001,  // BLOCK_8X8
-  0x0001,  // BLOCK_8X16,
-  0x0001,  // BLOCK_16X8
-  0x0001,  // BLOCK_16X16
-  0x0001,  // BLOCK_16X32,
-  0x0003,  // BLOCK_32X16,
-  0x0003,  // BLOCK_32X32,
-  0x0003,  // BLOCK_32X64,
-  0x000f,  // BLOCK_64X32,
-  0x000f,  // BLOCK_64X64,
-  0x0001,  // BLOCK_4X16,
-  0x0001,  // BLOCK_16X4,
-  0x0001,  // BLOCK_8X32,
-  0x0003,  // BLOCK_32X8,
-  0x0001,  // BLOCK_16X64,
-  0x000f,  // BLOCK_64X16
-};
+LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row,
+                                     int mi_col) {
+  if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
+      (mi_col << MI_SIZE_LOG2) >= cm->width)
+    return NULL;
+  assert(cm->lf.lfm != NULL);
+  const int row = mi_row >> MIN_MIB_SIZE_LOG2;  // 64x64
+  const int col = mi_col >> MIN_MIB_SIZE_LOG2;
+  return &cm->lf.lfm[row * cm->lf.lfm_stride + col];
+}
 
-// 64 bit mask to shift and set for each uv prediction size
-static const uint16_t size_mask_uv[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  0x0001,  // BLOCK_2X2
-  0x0001,  // BLOCK_2X4
-  0x0001,  // BLOCK_4X2
-#endif
-  0x0001,  // BLOCK_4X4
-  0x0001,  // BLOCK_4X8
-  0x0001,  // BLOCK_8X4
-  0x0001,  // BLOCK_8X8
-  0x0001,  // BLOCK_8X16,
-  0x0001,  // BLOCK_16X8
-  0x0001,  // BLOCK_16X16
-  0x0011,  // BLOCK_16X32,
-  0x0003,  // BLOCK_32X16,
-  0x0033,  // BLOCK_32X32,
-  0x3333,  // BLOCK_32X64,
-  0x00ff,  // BLOCK_64X32,
-  0xffff,  // BLOCK_64X64,
-  0x0001,  // BLOCK_4X16,
-  0x0001,  // BLOCK_16X4,
-  0x0011,  // BLOCK_8X32,
-  0x0003,  // BLOCK_32X8,
-  0x1111,  // BLOCK_16X64,
-  0x000f,  // BLOCK_64X16
-};
-static const uint16_t left_border_uv = 0x1111;
-static const uint16_t above_border_uv = 0x000f;
+typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
+                        const uint8_t *limit, const uint8_t *thresh);
 
-static const int mode_lf_lut[] = {
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // INTRA_MODES
-  0,
-#if CONFIG_SMOOTH_HV
-  0, 0,
-#endif         // CONFIG_SMOOTH_HV
-  1, 1, 0, 1,  // INTER_MODES (ZEROMV == 0)
-#if CONFIG_COMPOUND_SINGLEREF
-  // 1, 1, 1, 1, 1,       // INTER_SINGLEREF_COMP_MODES
-  // NOTE(zoeliu): Remove SR_NEAREST_NEWMV
-  1, 1, 1, 1,             // INTER_SINGLEREF_COMP_MODES
-#endif                    // CONFIG_COMPOUND_SINGLEREF
-  1, 1, 1, 1, 1, 1, 0, 1  // INTER_COMPOUND_MODES (ZERO_ZEROMV == 0)
-};
+typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
+                            const uint8_t *limit0, const uint8_t *thresh0,
+                            const uint8_t *blimit1, const uint8_t *limit1,
+                            const uint8_t *thresh1);
+
+typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh, int bd);
+
+typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1, int bd);
+#endif  // LOOP_FILTER_BITMASK
 
 static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
   int lvl;
@@ -626,64 +248,36 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
            SIMD_WIDTH);
   }
 }
-#if CONFIG_EXT_DELTA_Q
 static uint8_t get_filter_level(const AV1_COMMON *cm,
                                 const loop_filter_info_n *lfi_n,
-#if CONFIG_LOOPFILTER_LEVEL
                                 const int dir_idx, int plane,
-#endif
-#if CONFIG_LPF_SB
-                                int mi_row, int mi_col,
-#endif
                                 const MB_MODE_INFO *mbmi) {
-#if CONFIG_LPF_SB
-  return cm->mi[mi_row * cm->mi_stride + mi_col].mbmi.filt_lvl;
-#endif
-
-#if CONFIG_SUPERTX
-  const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx);
-  assert(
-      IMPLIES(supertx_enabled(mbmi), mbmi->segment_id_supertx != MAX_SEGMENTS));
-  assert(IMPLIES(supertx_enabled(mbmi),
-                 mbmi->segment_id_supertx <= mbmi->segment_id));
-#else
   const int segment_id = mbmi->segment_id;
-#endif  // CONFIG_SUPERTX
   if (cm->delta_lf_present_flag) {
-#if CONFIG_LOOPFILTER_LEVEL
     int delta_lf;
     if (cm->delta_lf_multi) {
       const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
-      delta_lf = mbmi->curr_delta_lf[delta_lf_idx];
+      delta_lf = mbmi->delta_lf[delta_lf_idx];
     } else {
-      delta_lf = mbmi->current_delta_lf_from_base;
+      delta_lf = mbmi->delta_lf_from_base;
     }
-    int lvl_seg =
-        clamp(delta_lf + cm->lf.filter_level[dir_idx], 0, MAX_LOOP_FILTER);
-#else
-    int lvl_seg = clamp(mbmi->current_delta_lf_from_base + cm->lf.filter_level,
-                        0, MAX_LOOP_FILTER);
-#endif
-    const int scale = 1 << (lvl_seg >> 5);
-#if CONFIG_LOOPFILTER_LEVEL
+    int base_level;
+    if (plane == 0)
+      base_level = cm->lf.filter_level[dir_idx];
+    else if (plane == 1)
+      base_level = cm->lf.filter_level_u;
+    else
+      base_level = cm->lf.filter_level_v;
+    int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER);
     assert(plane >= 0 && plane <= 2);
     const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx];
     if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) {
       const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id);
-      lvl_seg =
-          clamp(cm->seg.abs_delta == SEGMENT_ABSDATA ? data : lvl_seg + data, 0,
-                MAX_LOOP_FILTER);
-    }
-#else
-    if (segfeature_active(&cm->seg, segment_id, SEG_LVL_ALT_LF)) {
-      const int data = get_segdata(&cm->seg, segment_id, SEG_LVL_ALT_LF);
-      lvl_seg =
-          clamp(cm->seg.abs_delta == SEGMENT_ABSDATA ? data : lvl_seg + data, 0,
-                MAX_LOOP_FILTER);
+      lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
     }
-#endif  // CONFIG_LOOPFILTER_LEVEL
 
     if (cm->lf.mode_ref_delta_enabled) {
+      const int scale = 1 << (lvl_seg >> 5);
       lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale;
       if (mbmi->ref_frame[0] > INTRA_FRAME)
         lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale;
@@ -691,29 +285,10 @@ static uint8_t get_filter_level(const AV1_COMMON *cm,
     }
     return lvl_seg;
   } else {
-#if CONFIG_LOOPFILTER_LEVEL
-    return lfi_n
-        ->lvl[segment_id][dir_idx][mbmi->ref_frame[0]][mode_lf_lut[mbmi->mode]];
-#else
-    return lfi_n->lvl[segment_id][mbmi->ref_frame[0]][mode_lf_lut[mbmi->mode]];
-#endif
+    return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]]
+                     [mode_lf_lut[mbmi->mode]];
   }
 }
-#else
-static uint8_t get_filter_level(const loop_filter_info_n *lfi_n,
-                                const MB_MODE_INFO *mbmi) {
-#if CONFIG_SUPERTX
-  const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx);
-  assert(
-      IMPLIES(supertx_enabled(mbmi), mbmi->segment_id_supertx != MAX_SEGMENTS));
-  assert(IMPLIES(supertx_enabled(mbmi),
-                 mbmi->segment_id_supertx <= mbmi->segment_id));
-#else
-  const int segment_id = mbmi->segment_id;
-#endif  // CONFIG_SUPERTX
-  return lfi_n->lvl[segment_id][mbmi->ref_frame[0]][mode_lf_lut[mbmi->mode]];
-}
-#endif
 
 void av1_loop_filter_init(AV1_COMMON *cm) {
   assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut));
@@ -721,172 +296,728 @@ void av1_loop_filter_init(AV1_COMMON *cm) {
   struct loopfilter *lf = &cm->lf;
   int lvl;
 
+  lf->combine_vert_horz_lf = 1;
+
   // init limits for given sharpness
   update_sharpness(lfi, lf->sharpness_level);
-  lf->last_sharpness_level = lf->sharpness_level;
 
   // init hev threshold const vectors
   for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
     memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
 }
 
-#if CONFIG_LPF_SB
-void av1_loop_filter_sb_level_init(AV1_COMMON *cm, int mi_row, int mi_col,
-                                   int lvl) {
-  const int mi_row_start = AOMMAX(0, mi_row - FILT_BOUNDARY_MI_OFFSET);
-  const int mi_col_start = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET);
-  const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
-  const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
-  const int mi_row_end = AOMMIN(mi_row_range, cm->mi_rows);
-  const int mi_col_end = AOMMIN(mi_col_range, cm->mi_cols);
-
-  int row, col;
-  for (row = mi_row_start; row < mi_row_end; ++row) {
-    for (col = mi_col_start; col < mi_col_end; ++col) {
-      // Note: can't use cm->mi_grid_visible. Because for each partition,
-      // all visible pointers will point to the first of the partition.
-      cm->mi[row * cm->mi_stride + col].mbmi.filt_lvl = lvl;
-    }
-  }
-}
-#endif  // CONFIG_LPF_SB
-
-void av1_loop_filter_frame_init(AV1_COMMON *cm, int default_filt_lvl,
-                                int default_filt_lvl_r
-#if CONFIG_LOOPFILTER_LEVEL
-                                ,
-                                int plane
-#endif
-                                ) {
+// Update the loop filter for the current frame.
+// This should be called before loop_filter_rows(),
+// av1_loop_filter_frame() calls this function directly.
+void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
+                                int plane_end) {
+  int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE];
+  int plane;
   int seg_id;
   // n_shift is the multiplier for lf_deltas
   // the multiplier is 1 for when filter_lvl is between 0 and 31;
   // 2 when filter_lvl is between 32 and 63
-  int scale = 1 << (default_filt_lvl >> 5);
   loop_filter_info_n *const lfi = &cm->lf_info;
   struct loopfilter *const lf = &cm->lf;
   const struct segmentation *const seg = &cm->seg;
 
-  // update limits if sharpness has changed
-  if (lf->last_sharpness_level != lf->sharpness_level) {
-    update_sharpness(lfi, lf->sharpness_level);
-    lf->last_sharpness_level = lf->sharpness_level;
-  }
+  // update sharpness limits
+  update_sharpness(lfi, lf->sharpness_level);
+
+  filt_lvl[0] = cm->lf.filter_level[0];
+  filt_lvl[1] = cm->lf.filter_level_u;
+  filt_lvl[2] = cm->lf.filter_level_v;
+
+  filt_lvl_r[0] = cm->lf.filter_level[1];
+  filt_lvl_r[1] = cm->lf.filter_level_u;
+  filt_lvl_r[2] = cm->lf.filter_level_v;
+
+  for (plane = plane_start; plane < plane_end; plane++) {
+    if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])
+      break;
+    else if (plane == 1 && !filt_lvl[1])
+      continue;
+    else if (plane == 2 && !filt_lvl[2])
+      continue;
 
-  for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
-    for (int dir = 0; dir < 2; ++dir) {
-      int lvl_seg = (dir == 0) ? default_filt_lvl : default_filt_lvl_r;
-#if CONFIG_LOOPFILTER_LEVEL
-      assert(plane >= 0 && plane <= 2);
-      const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
-      if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
-        const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id);
-        lvl_seg = clamp(
-            seg->abs_delta == SEGMENT_ABSDATA ? data : default_filt_lvl + data,
-            0, MAX_LOOP_FILTER);
+    for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
+      for (int dir = 0; dir < 2; ++dir) {
+        int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
+        assert(plane >= 0 && plane <= 2);
+        const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
+        if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
+          const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id);
+          lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
+        }
+
+        if (!lf->mode_ref_delta_enabled) {
+          // we could get rid of this if we assume that deltas are set to
+          // zero when not in use; encoder always uses deltas
+          memset(lfi->lvl[plane][seg_id][dir], lvl_seg,
+                 sizeof(lfi->lvl[plane][seg_id][dir]));
+        } else {
+          int ref, mode;
+          const int scale = 1 << (lvl_seg >> 5);
+          const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
+          lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] =
+              clamp(intra_lvl, 0, MAX_LOOP_FILTER);
+
+          for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) {
+            for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+              const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
+                                    lf->mode_deltas[mode] * scale;
+              lfi->lvl[plane][seg_id][dir][ref][mode] =
+                  clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+            }
+          }
+        }
       }
+    }
+  }
+
+#if LOOP_FILTER_BITMASK
+  memset(lf->neighbor_sb_lpf_info.tx_size_y_above, TX_64X64,
+         sizeof(TX_SIZE) * MI_SIZE_64X64);
+  memset(lf->neighbor_sb_lpf_info.tx_size_y_left, TX_64X64,
+         sizeof(TX_SIZE) * MI_SIZE_64X64);
+  memset(lf->neighbor_sb_lpf_info.tx_size_uv_above, TX_64X64,
+         sizeof(TX_SIZE) * MI_SIZE_64X64);
+  memset(lf->neighbor_sb_lpf_info.tx_size_uv_left, TX_64X64,
+         sizeof(TX_SIZE) * MI_SIZE_64X64);
+  memset(lf->neighbor_sb_lpf_info.y_level_above, 0,
+         sizeof(uint8_t) * MI_SIZE_64X64);
+  memset(lf->neighbor_sb_lpf_info.y_level_left, 0,
+         sizeof(uint8_t) * MI_SIZE_64X64);
+  memset(lf->neighbor_sb_lpf_info.u_level_above, 0,
+         sizeof(uint8_t) * MI_SIZE_64X64);
+  memset(lf->neighbor_sb_lpf_info.u_level_left, 0,
+         sizeof(uint8_t) * MI_SIZE_64X64);
+  memset(lf->neighbor_sb_lpf_info.v_level_above, 0,
+         sizeof(uint8_t) * MI_SIZE_64X64);
+  memset(lf->neighbor_sb_lpf_info.v_level_left, 0,
+         sizeof(uint8_t) * MI_SIZE_64X64);
+  memset(lf->neighbor_sb_lpf_info.skip, 0, sizeof(uint8_t) * MI_SIZE_64X64);
+#endif  // LOOP_FILTER_BITMASK
+}
+
+#if LOOP_FILTER_BITMASK
+// A 64x64 tx block requires 256 bits to represent each 4x4 tx block.
+// Every 4 rows is represented by one uint64_t mask. Hence,
+// there are 4 uint64_t bitmask[4] to represent the 64x64 block.
+//
+// Given a location by (mi_col, mi_row), This function returns the index
+// 0, 1, 2, 3 to select which bitmask[] to use, and the shift value.
+//
+// For example, mi_row is the offset of pixels in mi size (4),
+// (mi_row / 4) returns which uint64_t.
+// After locating which uint64_t, mi_row % 4 is the
+// row offset, and each row has 16 = 1 << stride_log2 4x4 units.
+// Therefore, shift = (row << stride_log2) + mi_col;
+static int get_index_shift(int mi_col, int mi_row, int *index) {
+  // *index = mi_row >> 2;
+  // rows = mi_row % 4;
+  // stride_log2 = 4;
+  // shift = (rows << stride_log2) + mi_col;
+  *index = mi_row >> 2;
+  return ((mi_row & 3) << 4) | mi_col;
+}
+
+static void check_mask(const FilterMask *lfm) {
+#ifndef NDEBUG
+  for (int i = 0; i < 4; ++i) {
+    assert(!(lfm[TX_4X4].bits[i] & lfm[TX_8X8].bits[i]));
+    assert(!(lfm[TX_4X4].bits[i] & lfm[TX_16X16].bits[i]));
+    assert(!(lfm[TX_4X4].bits[i] & lfm[TX_32X32].bits[i]));
+    assert(!(lfm[TX_4X4].bits[i] & lfm[TX_64X64].bits[i]));
+    assert(!(lfm[TX_8X8].bits[i] & lfm[TX_16X16].bits[i]));
+    assert(!(lfm[TX_8X8].bits[i] & lfm[TX_32X32].bits[i]));
+    assert(!(lfm[TX_8X8].bits[i] & lfm[TX_64X64].bits[i]));
+    assert(!(lfm[TX_16X16].bits[i] & lfm[TX_32X32].bits[i]));
+    assert(!(lfm[TX_16X16].bits[i] & lfm[TX_64X64].bits[i]));
+    assert(!(lfm[TX_32X32].bits[i] & lfm[TX_64X64].bits[i]));
+  }
 #else
-      if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
-        const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
-        lvl_seg = clamp(
-            seg->abs_delta == SEGMENT_ABSDATA ? data : default_filt_lvl + data,
-            0, MAX_LOOP_FILTER);
+  (void)lfm;
+#endif
+}
+
+static void check_loop_filter_masks(const LoopFilterMask *lfm, int plane) {
+  if (plane == 0) {
+    // Assert if we try to apply 2 different loop filters at the same
+    // position.
+    check_mask(lfm->left_y);
+    check_mask(lfm->above_y);
+  } else if (plane == 1) {
+    check_mask(lfm->left_u);
+    check_mask(lfm->above_u);
+  } else {
+    check_mask(lfm->left_v);
+    check_mask(lfm->above_v);
+  }
+}
+
+static void update_masks(EDGE_DIR dir, int plane, uint64_t *mask,
+                         TX_SIZE sqr_tx_size, LoopFilterMask *lfm) {
+  if (dir == VERT_EDGE) {
+    switch (plane) {
+      case 0:
+        for (int i = 0; i < 4; ++i) lfm->left_y[sqr_tx_size].bits[i] |= mask[i];
+        break;
+      case 1:
+        for (int i = 0; i < 4; ++i) lfm->left_u[sqr_tx_size].bits[i] |= mask[i];
+        break;
+      case 2:
+        for (int i = 0; i < 4; ++i) lfm->left_v[sqr_tx_size].bits[i] |= mask[i];
+        break;
+      default: assert(plane <= 2);
+    }
+  } else {
+    switch (plane) {
+      case 0:
+        for (int i = 0; i < 4; ++i)
+          lfm->above_y[sqr_tx_size].bits[i] |= mask[i];
+        break;
+      case 1:
+        for (int i = 0; i < 4; ++i)
+          lfm->above_u[sqr_tx_size].bits[i] |= mask[i];
+        break;
+      case 2:
+        for (int i = 0; i < 4; ++i)
+          lfm->above_v[sqr_tx_size].bits[i] |= mask[i];
+        break;
+      default: assert(plane <= 2);
+    }
+  }
+}
+
+static int is_frame_boundary(AV1_COMMON *const cm, int plane, int mi_row,
+                             int mi_col, int ssx, int ssy, EDGE_DIR dir) {
+  if (plane && (ssx || ssy)) {
+    if (ssx && ssy) {  // format 420
+      if ((mi_row << MI_SIZE_LOG2) > cm->height ||
+          (mi_col << MI_SIZE_LOG2) > cm->width)
+        return 1;
+    } else if (ssx) {  // format 422
+      if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
+          (mi_col << MI_SIZE_LOG2) > cm->width)
+        return 1;
+    }
+  } else {
+    if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
+        (mi_col << MI_SIZE_LOG2) >= cm->width)
+      return 1;
+  }
+
+  int row_or_col;
+  if (plane == 0) {
+    row_or_col = dir == VERT_EDGE ? mi_col : mi_row;
+  } else {
+    // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block.
+    // So if mi_col == 1, it is actually the frame boundary.
+    if (dir == VERT_EDGE) {
+      row_or_col = ssx ? (mi_col & 0x0FFFFFFE) : mi_col;
+    } else {
+      row_or_col = ssy ? (mi_row & 0x0FFFFFFE) : mi_row;
+    }
+  }
+  return row_or_col == 0;
+}
+
+static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
+                        int ssx, int ssy, TX_SIZE tx_size) {
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  const int x = (mi_col << (MI_SIZE_LOG2 - ssx));
+  const int y = (mi_row << (MI_SIZE_LOG2 - ssy));
+  // decide whether current vertical/horizontal edge needs loop filtering
+  for (EDGE_DIR dir = VERT_EDGE; dir <= HORZ_EDGE; ++dir) {
+    // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block.
+    mi_row |= ssy;
+    mi_col |= ssx;
+
+    MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
+    const MB_MODE_INFO *const mbmi = mi[0];
+    const int curr_skip = mbmi->skip && is_inter_block(mbmi);
+    const BLOCK_SIZE bsize = mbmi->sb_type;
+    const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
+    const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
+    const uint8_t level = get_filter_level(cm, &cm->lf_info, dir, plane, mbmi);
+    const int prediction_masks = dir == VERT_EDGE
+                                     ? block_size_wide[plane_bsize] - 1
+                                     : block_size_high[plane_bsize] - 1;
+    const int is_coding_block_border =
+        dir == VERT_EDGE ? !(x & prediction_masks) : !(y & prediction_masks);
+
+    // TODO(chengchen): step can be optimized.
+    const int row_step = mi_size_high[TX_4X4] << ssy;
+    const int col_step = mi_size_wide[TX_4X4] << ssx;
+    const int mi_height =
+        dir == VERT_EDGE ? tx_size_high_unit[tx_size] << ssy : row_step;
+    const int mi_width =
+        dir == VERT_EDGE ? col_step : tx_size_wide_unit[tx_size] << ssx;
+
+    // assign filter levels
+    for (int r = mi_row; r < mi_row + mi_height; r += row_step) {
+      for (int c = mi_col; c < mi_col + mi_width; c += col_step) {
+        // do not filter frame boundary
+        // Note: when chroma planes' size are half of luma plane,
+        // chroma plane mi corresponds to even position.
+        // If frame size is not even, we still need to filter this chroma
+        // position. Therefore the boundary condition check needs to be
+        // separated to two cases.
+        if (plane && (ssx || ssy)) {
+          if (ssx && ssy) {  // format 420
+            if ((r << MI_SIZE_LOG2) > cm->height ||
+                (c << MI_SIZE_LOG2) > cm->width)
+              continue;
+          } else if (ssx) {  // format 422
+            if ((r << MI_SIZE_LOG2) >= cm->height ||
+                (c << MI_SIZE_LOG2) > cm->width)
+              continue;
+          }
+        } else {
+          if ((r << MI_SIZE_LOG2) >= cm->height ||
+              (c << MI_SIZE_LOG2) >= cm->width)
+            continue;
+        }
+
+        const int row = r % MI_SIZE_64X64;
+        const int col = c % MI_SIZE_64X64;
+        if (plane == 0) {
+          if (dir == VERT_EDGE)
+            lfm->lfl_y_ver[row][col] = level;
+          else
+            lfm->lfl_y_hor[row][col] = level;
+        } else if (plane == 1) {
+          if (dir == VERT_EDGE)
+            lfm->lfl_u_ver[row][col] = level;
+          else
+            lfm->lfl_u_hor[row][col] = level;
+        } else {
+          if (dir == VERT_EDGE)
+            lfm->lfl_v_ver[row][col] = level;
+          else
+            lfm->lfl_v_hor[row][col] = level;
+        }
       }
-#endif  // CONFIG_LOOPFILTER_LEVEL
+    }
 
-      if (!lf->mode_ref_delta_enabled) {
-// we could get rid of this if we assume that deltas are set to
-// zero when not in use; encoder always uses deltas
-#if CONFIG_LOOPFILTER_LEVEL
-        memset(lfi->lvl[seg_id][dir], lvl_seg, sizeof(lfi->lvl[seg_id][dir]));
-#else
-        memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
-#endif  // CONFIG_LOOPFILTER_LEVEL
+    for (int r = mi_row; r < mi_row + mi_height; r += row_step) {
+      for (int c = mi_col; c < mi_col + mi_width; c += col_step) {
+        // do not filter frame boundary
+        if (is_frame_boundary(cm, plane, r, c, ssx, ssy, dir)) continue;
+
+        uint64_t mask[4] = { 0 };
+        const int prev_row = dir == VERT_EDGE ? r : r - (1 << ssy);
+        const int prev_col = dir == VERT_EDGE ? c - (1 << ssx) : c;
+        MB_MODE_INFO **mi_prev =
+            cm->mi_grid_visible + prev_row * cm->mi_stride + prev_col;
+        const MB_MODE_INFO *const mbmi_prev = mi_prev[0];
+        const int prev_skip = mbmi_prev->skip && is_inter_block(mbmi_prev);
+        const uint8_t level_prev =
+            get_filter_level(cm, &cm->lf_info, dir, plane, mbmi_prev);
+        const int is_edge =
+            (level || level_prev) &&
+            (!curr_skip || !prev_skip || is_coding_block_border);
+
+        if (is_edge) {
+          const TX_SIZE prev_tx_size =
+              plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy)
+                    : mbmi_prev->tx_size;
+          const TX_SIZE min_tx_size =
+              (dir == VERT_EDGE) ? AOMMIN(txsize_horz_map[tx_size],
+                                          txsize_horz_map[prev_tx_size])
+                                 : AOMMIN(txsize_vert_map[tx_size],
+                                          txsize_vert_map[prev_tx_size]);
+          assert(min_tx_size < TX_SIZES);
+          const int row = r % MI_SIZE_64X64;
+          const int col = c % MI_SIZE_64X64;
+          int index = 0;
+          const int shift = get_index_shift(col, row, &index);
+          assert(index < 4 && index >= 0);
+          mask[index] |= ((uint64_t)1 << shift);
+          // set mask on corresponding bit
+          update_masks(dir, plane, mask, min_tx_size, lfm);
+        }
+      }
+    }
+  }
+}
+
+static void setup_tx_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
+                                int blk_row, int blk_col,
+                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                int plane, int ssx, int ssy) {
+  blk_row <<= ssy;
+  blk_col <<= ssx;
+  if (((mi_row + blk_row) << MI_SIZE_LOG2) >= cm->height ||
+      ((mi_col + blk_col) << MI_SIZE_LOG2) >= cm->width)
+    return;
+
+  // U/V plane, tx_size is always the largest size
+  if (plane) {
+    assert(tx_size_wide[tx_size] <= 32 && tx_size_high[tx_size] <= 32);
+    setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy,
+                tx_size);
+    return;
+  }
+
+  MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
+  const MB_MODE_INFO *const mbmi = mi[0];
+  // For Y plane:
+  // If intra block, tx size is univariant.
+  // If inter block, tx size follows inter_tx_size.
+  TX_SIZE plane_tx_size = tx_size;
+  const int is_inter = is_inter_block(mbmi);
+
+  if (plane == 0) {
+    if (is_inter) {
+      if (mbmi->skip) {
+        // TODO(chengchen): change av1_get_transform_size() to be consistant.
+        // plane_tx_size = get_max_rect_tx_size(plane_bsize);
+        plane_tx_size = mbmi->tx_size;
       } else {
-        int ref, mode;
-#if CONFIG_LOOPFILTER_LEVEL
-        scale = 1 << (lvl_seg >> 5);
-
-        const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
-        lfi->lvl[seg_id][dir][INTRA_FRAME][0] =
-            clamp(intra_lvl, 0, MAX_LOOP_FILTER);
-
-        for (ref = LAST_FRAME; ref < TOTAL_REFS_PER_FRAME; ++ref) {
-          for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
-            const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
-                                  lf->mode_deltas[mode] * scale;
-            lfi->lvl[seg_id][dir][ref][mode] =
-                clamp(inter_lvl, 0, MAX_LOOP_FILTER);
-          }
+        plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+            plane_bsize, blk_row, blk_col)];
+      }
+    } else {
+      MB_MODE_INFO **mi_this = cm->mi_grid_visible +
+                               (mi_row + blk_row) * cm->mi_stride + mi_col +
+                               blk_col;
+      const MB_MODE_INFO *const mbmi_this = mi_this[0];
+      plane_tx_size = mbmi_this->tx_size;
+    }
+  }
+
+  assert(txsize_to_bsize[plane_tx_size] <= plane_bsize);
+
+  if (plane || plane_tx_size == tx_size) {
+    setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy,
+                tx_size);
+  } else {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+        setup_tx_block_mask(cm, mi_row, mi_col, offsetr, offsetc, plane_bsize,
+                            sub_txs, plane, ssx, ssy);
+      }
+    }
+  }
+}
+
+static void setup_fix_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
+                                 int plane, int ssx, int ssy) {
+  MB_MODE_INFO **mi =
+      cm->mi_grid_visible + (mi_row | ssy) * cm->mi_stride + (mi_col | ssx);
+  const MB_MODE_INFO *const mbmi = mi[0];
+
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
+  const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
+
+  const int block_width = mi_size_wide[plane_bsize];
+  const int block_height = mi_size_high[plane_bsize];
+
+  TX_SIZE max_txsize = max_txsize_rect_lookup[plane_bsize];
+  // The decoder is designed so that it can process 64x64 luma pixels at a
+  // time. If this is a chroma plane with subsampling and bsize corresponds to
+  // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. That
+  // mustn't be used for the subsampled plane (because it would be bigger than
+  // a 64x64 luma block) so we round down to TX_32X32.
+  if (plane && txsize_sqr_up_map[max_txsize] == TX_64X64) {
+    if (max_txsize == TX_16X64)
+      max_txsize = TX_16X32;
+    else if (max_txsize == TX_64X16)
+      max_txsize = TX_32X16;
+    else
+      max_txsize = TX_32X32;
+  }
+
+  const BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize];
+  const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+  const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
+  const BLOCK_SIZE max_unit_bsize = ss_size_lookup[BLOCK_64X64][ssx][ssy];
+  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
+  mu_blocks_wide = AOMMIN(block_width, mu_blocks_wide);
+  mu_blocks_high = AOMMIN(block_height, mu_blocks_high);
+
+  // Y: Largest tx_size is 64x64, while superblock size can be 128x128.
+  // Here we ensure that setup_tx_block_mask process at most a 64x64 block.
+  // U/V: largest tx size is 32x32.
+  for (int idy = 0; idy < block_height; idy += mu_blocks_high) {
+    for (int idx = 0; idx < block_width; idx += mu_blocks_wide) {
+      const int unit_height = AOMMIN(mu_blocks_high + idy, block_height);
+      const int unit_width = AOMMIN(mu_blocks_wide + idx, block_width);
+      for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
+        for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
+          setup_tx_block_mask(cm, mi_row, mi_col, blk_row, blk_col, plane_bsize,
+                              max_txsize, plane, ssx, ssy);
         }
-#else
-        (void)default_filt_lvl_r;
-        const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
-        lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
-
-        for (ref = LAST_FRAME; ref < TOTAL_REFS_PER_FRAME; ++ref) {
-          for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
-            const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
-                                  lf->mode_deltas[mode] * scale;
-            lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+      }
+    }
+  }
+}
+
+static void setup_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize, int plane, int ssx, int ssy) {
+  if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
+      (mi_col << MI_SIZE_LOG2) >= cm->width)
+    return;
+
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  const int hbs = mi_size_wide[bsize] / 2;
+  const int quarter_step = mi_size_wide[bsize] / 4;
+  const int allow_sub8x8 = (ssx || ssy) ? bsize > BLOCK_8X8 : 1;
+  const int has_next_row =
+      (((mi_row + hbs) << MI_SIZE_LOG2) < cm->height) & allow_sub8x8;
+  const int has_next_col =
+      (((mi_col + hbs) << MI_SIZE_LOG2) < cm->width) & allow_sub8x8;
+  int i;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      break;
+    case PARTITION_HORZ:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+      break;
+    case PARTITION_VERT:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_col)
+        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+      break;
+    case PARTITION_SPLIT:
+      setup_block_mask(cm, mi_row, mi_col, subsize, plane, ssx, ssy);
+      if (has_next_col)
+        setup_block_mask(cm, mi_row, mi_col + hbs, subsize, plane, ssx, ssy);
+      if (has_next_row)
+        setup_block_mask(cm, mi_row + hbs, mi_col, subsize, plane, ssx, ssy);
+      if (has_next_col & has_next_row)
+        setup_block_mask(cm, mi_row + hbs, mi_col + hbs, subsize, plane, ssx,
+                         ssy);
+      break;
+    case PARTITION_HORZ_A:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_col)
+        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+      if (has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+      break;
+    case PARTITION_HORZ_B:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+      if (has_next_col & has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
+      break;
+    case PARTITION_VERT_A:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+      if (has_next_col)
+        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+      break;
+    case PARTITION_VERT_B:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_col)
+        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+      if (has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
+      break;
+    case PARTITION_HORZ_4:
+      for (i = 0; i < 4; ++i) {
+        int this_mi_row = mi_row + i * quarter_step;
+        if (i > 0 && (this_mi_row << MI_SIZE_LOG2) >= cm->height) break;
+        // chroma plane filter the odd location
+        if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;
+
+        setup_fix_block_mask(cm, this_mi_row, mi_col, plane, ssx, ssy);
+      }
+      break;
+    case PARTITION_VERT_4:
+      for (i = 0; i < 4; ++i) {
+        int this_mi_col = mi_col + i * quarter_step;
+        if (i > 0 && this_mi_col >= cm->mi_cols) break;
+        // chroma plane filter the odd location
+        if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;
+
+        setup_fix_block_mask(cm, mi_row, this_mi_col, plane, ssx, ssy);
+      }
+      break;
+    default: assert(0);
+  }
+}
+
+// TODO(chengchen): if lossless, do not need to setup mask. But when
+// segments enabled, each segment has different lossless settings.
+void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
+                       int subsampling_x, int subsampling_y, int row_end,
+                       int col_end) {
+  const int num_64x64 = cm->seq_params.mib_size >> MIN_MIB_SIZE_LOG2;
+  for (int y = 0; y < num_64x64; ++y) {
+    for (int x = 0; x < num_64x64; ++x) {
+      const int row = mi_row + y * MI_SIZE_64X64;
+      const int col = mi_col + x * MI_SIZE_64X64;
+      if (row >= row_end || col >= col_end) continue;
+      if ((row << MI_SIZE_LOG2) >= cm->height ||
+          (col << MI_SIZE_LOG2) >= cm->width)
+        continue;
+
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
+      if (lfm == NULL) return;
+
+      // init mask to zero
+      if (plane == 0) {
+        av1_zero(lfm->left_y);
+        av1_zero(lfm->above_y);
+        av1_zero(lfm->lfl_y_ver);
+        av1_zero(lfm->lfl_y_hor);
+      } else if (plane == 1) {
+        av1_zero(lfm->left_u);
+        av1_zero(lfm->above_u);
+        av1_zero(lfm->lfl_u_ver);
+        av1_zero(lfm->lfl_u_hor);
+      } else {
+        av1_zero(lfm->left_v);
+        av1_zero(lfm->above_v);
+        av1_zero(lfm->lfl_v_ver);
+        av1_zero(lfm->lfl_v_hor);
+      }
+    }
+  }
+
+  // set up bitmask for each superblock
+  setup_block_mask(cm, mi_row, mi_col, cm->seq_params.sb_size, plane,
+                   subsampling_x, subsampling_y);
+
+  for (int y = 0; y < num_64x64; ++y) {
+    for (int x = 0; x < num_64x64; ++x) {
+      const int row = mi_row + y * MI_SIZE_64X64;
+      const int col = mi_col + x * MI_SIZE_64X64;
+      if (row >= row_end || col >= col_end) continue;
+      if ((row << MI_SIZE_LOG2) >= cm->height ||
+          (col << MI_SIZE_LOG2) >= cm->width)
+        continue;
+
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
+      if (lfm == NULL) return;
+
+      // check if the mask is valid
+      check_loop_filter_masks(lfm, plane);
+
+      {
+        // Let 16x16 hold 32x32 (Y/U/V) and 64x64(Y only).
+        // Even tx size is greater, we only apply max length filter, which
+        // is 16.
+        if (plane == 0) {
+          for (int j = 0; j < 4; ++j) {
+            lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_32X32].bits[j];
+            lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_64X64].bits[j];
+            lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_32X32].bits[j];
+            lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_64X64].bits[j];
+
+            // set 32x32 and 64x64 to 0
+            lfm->left_y[TX_32X32].bits[j] = 0;
+            lfm->left_y[TX_64X64].bits[j] = 0;
+            lfm->above_y[TX_32X32].bits[j] = 0;
+            lfm->above_y[TX_64X64].bits[j] = 0;
+          }
+        } else if (plane == 1) {
+          for (int j = 0; j < 4; ++j) {
+            lfm->left_u[TX_16X16].bits[j] |= lfm->left_u[TX_32X32].bits[j];
+            lfm->above_u[TX_16X16].bits[j] |= lfm->above_u[TX_32X32].bits[j];
+
+            // set 32x32 to 0
+            lfm->left_u[TX_32X32].bits[j] = 0;
+            lfm->above_u[TX_32X32].bits[j] = 0;
+          }
+        } else {
+          for (int j = 0; j < 4; ++j) {
+            lfm->left_v[TX_16X16].bits[j] |= lfm->left_v[TX_32X32].bits[j];
+            lfm->above_v[TX_16X16].bits[j] |= lfm->above_v[TX_32X32].bits[j];
+
+            // set 32x32 to 0
+            lfm->left_v[TX_32X32].bits[j] = 0;
+            lfm->above_v[TX_32X32].bits[j] = 0;
           }
         }
-#endif
       }
+
+      // check if the mask is valid
+      check_loop_filter_masks(lfm, plane);
     }
   }
 }
 
-static void filter_selectively_vert_row2(int subsampling_factor, uint8_t *s,
-                                         int pitch, unsigned int mask_16x16_l,
-                                         unsigned int mask_8x8_l,
-                                         unsigned int mask_4x4_l,
-                                         unsigned int mask_4x4_int_l,
-                                         const loop_filter_info_n *lfi_n,
-                                         const uint8_t *lfl) {
-  const int mask_shift = subsampling_factor ? 4 : 8;
-  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
-  const int lfl_forward = subsampling_factor ? 4 : 8;
-
-  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
-  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
-  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
-  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
-  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
-  unsigned int mask;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
-              mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
-       mask; mask >>= 1) {
+static void filter_selectively_vert_row2(
+    int subsampling_factor, uint8_t *s, int pitch, int plane,
+    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
+  uint64_t mask;
+  const int step = 1 << subsampling_factor;
+
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+              mask_8x8_1 | mask_4x4_1;
+       mask; mask >>= step) {
     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
 
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        // chroma plane filters less pixels introduced in deblock_13tap
+        // experiment
+        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
+
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          aom_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                   lfi0->hev_thr);
+          if (plane) {
+            // TODO(any): add aom_lpf_vertical_6_dual for chroma plane.
+            aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+            aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                               lfi1->hev_thr);
+          } else {
+            // TODO(any): add dual function simd function. Current sse2 code
+            // just called aom_lpf_vertical_14_sse2 twice.
+            aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                     lfi1->hev_thr);
+          }
         } else if (mask_16x16_0 & 1) {
-          aom_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
         } else {
-          aom_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                              lfi1->hev_thr);
+          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                       lfi1->hev_thr);
         }
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
+        // chroma plane filters less pixels introduced in deblock_13tap
+        // experiment
+        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
+
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
+          if (plane) {
+            aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+            aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                               lfi1->hev_thr);
+          } else {
+            aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                    lfi1->hev_thr);
+          }
         } else if (mask_8x8_0 & 1) {
-          aom_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
         } else {
-          aom_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
+          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                       lfi1->hev_thr);
         }
       }
 
@@ -898,90 +1029,86 @@ static void filter_selectively_vert_row2(int subsampling_factor, uint8_t *s,
         } else if (mask_4x4_0 & 1) {
           aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
         } else {
-          aom_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
-        }
-      }
-
-      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
-        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          aom_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_int_0 & 1) {
-          aom_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                             lfi0->hev_thr);
-        } else {
-          aom_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
+          aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr);
         }
       }
     }
 
-    s += 8;
-    lfl += 1;
-    mask_16x16_0 >>= 1;
-    mask_8x8_0 >>= 1;
-    mask_4x4_0 >>= 1;
-    mask_4x4_int_0 >>= 1;
-    mask_16x16_1 >>= 1;
-    mask_8x8_1 >>= 1;
-    mask_4x4_1 >>= 1;
-    mask_4x4_int_1 >>= 1;
+    s += 4;
+    lfl += step;
+    lfl2 += step;
+    mask_16x16_0 >>= step;
+    mask_8x8_0 >>= step;
+    mask_4x4_0 >>= step;
+    mask_16x16_1 >>= step;
+    mask_8x8_1 >>= step;
+    mask_4x4_1 >>= step;
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 static void highbd_filter_selectively_vert_row2(
-    int subsampling_factor, uint16_t *s, int pitch, unsigned int mask_16x16_l,
-    unsigned int mask_8x8_l, unsigned int mask_4x4_l,
-    unsigned int mask_4x4_int_l, const loop_filter_info_n *lfi_n,
-    const uint8_t *lfl, int bd) {
-  const int mask_shift = subsampling_factor ? 4 : 8;
-  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
-  const int lfl_forward = subsampling_factor ? 4 : 8;
-
-  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
-  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
-  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
-  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
-  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
-  unsigned int mask;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
-              mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
-       mask; mask >>= 1) {
+    int subsampling_factor, uint16_t *s, int pitch, int plane,
+    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) {
+  uint64_t mask;
+  const int step = 1 << subsampling_factor;
+
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+              mask_8x8_1 | mask_4x4_1;
+       mask; mask >>= step) {
     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
 
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        // chroma plane filters less pixels introduced in deblock_13tap
+        // experiment
+        HbdLpfFunc highbd_lpf_vertical =
+            plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
+
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          aom_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                          lfi0->hev_thr, bd);
+          if (plane) {
+            aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
+                                      lfi0->hev_thr, bd);
+            aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
+                                      lfi1->lim, lfi1->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                            lfi0->hev_thr, lfi1->mblim,
+                                            lfi1->lim, lfi1->hev_thr, bd);
+          }
         } else if (mask_16x16_0 & 1) {
-          aom_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr, bd);
+          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+                              bd);
         } else {
-          aom_highbd_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim,
-                                     lfi1->lim, lfi1->hev_thr, bd);
+          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                              lfi1->hev_thr, bd);
         }
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
+        HbdLpfFunc highbd_lpf_vertical =
+            plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
+
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
+          if (plane) {
+            aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
+                                      lfi0->hev_thr, bd);
+            aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
+                                      lfi1->lim, lfi1->hev_thr, bd);
+          } else {
+            aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                           lfi0->hev_thr, lfi1->mblim,
+                                           lfi1->lim, lfi1->hev_thr, bd);
+          }
         } else if (mask_8x8_0 & 1) {
-          aom_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
+          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+                              bd);
         } else {
-          aom_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
+          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                              lfi1->hev_thr, bd);
         }
       }
 
@@ -994,1925 +1121,396 @@ static void highbd_filter_selectively_vert_row2(
           aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, bd);
         } else {
-          aom_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
-        }
-      }
-
-      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
-        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          aom_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_int_0 & 1) {
-          aom_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
-        } else {
-          aom_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
+          aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
                                     lfi1->lim, lfi1->hev_thr, bd);
         }
       }
     }
 
-    s += 8;
-    lfl += 1;
-    mask_16x16_0 >>= 1;
-    mask_8x8_0 >>= 1;
-    mask_4x4_0 >>= 1;
-    mask_4x4_int_0 >>= 1;
-    mask_16x16_1 >>= 1;
-    mask_8x8_1 >>= 1;
-    mask_4x4_1 >>= 1;
-    mask_4x4_int_1 >>= 1;
+    s += 4;
+    lfl += step;
+    lfl2 += step;
+    mask_16x16_0 >>= step;
+    mask_8x8_0 >>= step;
+    mask_4x4_0 >>= step;
+    mask_16x16_1 >>= step;
+    mask_8x8_1 >>= step;
+    mask_4x4_1 >>= step;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
-
-static void filter_selectively_horiz(
-    uint8_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8,
-    unsigned int mask_4x4, unsigned int mask_4x4_int,
-    const loop_filter_info_n *lfi_n, const uint8_t *lfl
-#if CONFIG_LPF_DIRECT
-    ,
-    uint8_t *const src, int mi_row, int mi_col, int idx_r, int col_step,
-    int width, int height, int ss_x, int ss_y
-#endif
-    ) {
-  unsigned int mask;
+
+static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
+                                     int subsampling, uint64_t mask_16x16,
+                                     uint64_t mask_8x8, uint64_t mask_4x4,
+                                     const loop_filter_info_n *lfi_n,
+                                     const uint8_t *lfl) {
+  uint64_t mask;
   int count;
-#if CONFIG_LPF_DIRECT
-  // scale for u, v plane
-  width >>= ss_x;
-  height >>= ss_y;
-  int idx_c = 0;
-#endif
+  const int step = 1 << subsampling;
+  const unsigned int two_block_mask = subsampling ? 5 : 3;
 
-  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask;
-       mask >>= count) {
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
     const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    // Next block's thresholds.
+    const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step);
 
     count = 1;
     if (mask & 1) {
-#if CONFIG_LPF_DIRECT
-      int i;
-      const int line_length = 16;
-      const int pivot = 8;
-      const int above_filt_len = mask_16x16 & 1 ? 8 : 4;
-      const int bot_filt_len = mask_16x16 & 1 ? 8 : 4;
-      uint8_t block[256];  // line_length * size_of(BLOCK_8X8) * two_blocks
-      int orig_pos[256];
-      int direct;
-
-      assert(above_filt_len == bot_filt_len);
-      (void)bot_filt_len;
-      for (i = 0; i < 256; ++i) {
-        block[i] = 0;
-        orig_pos[i] = -1;
-      }
-
-      // actual position for current pixel
-      const int row = (mi_row + idx_r) * MI_SIZE >> ss_y;
-      const int col = (mi_col + idx_c) * MI_SIZE >> ss_x;
-
-      // Next block's thresholds.
-      const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
-
       if (mask_16x16 & 1) {
-        if ((mask_16x16 & 3) == 3) {
-          // Could use asymmetric length in the future
-          direct = pick_min_grad_direct(src, above_filt_len, row, col, width,
-                                        height, pitch, 2, 1);
-
-          pick_filter_block_horz(src, block, orig_pos, above_filt_len, row, col,
-                                 width, height, pitch, pivot, line_length, 2,
-                                 direct);
-
-          aom_lpf_horizontal_edge_16(block + pivot * line_length, line_length,
-                                     lfi->mblim, lfi->lim, lfi->hev_thr);
-          count = 2;
-        } else {
-          direct = pick_min_grad_direct(src, above_filt_len, row, col, width,
-                                        height, pitch, 1, 1);
-
-          pick_filter_block_horz(src, block, orig_pos, above_filt_len, row, col,
-                                 width, height, pitch, pivot, line_length, 1,
-                                 direct);
-
-          aom_lpf_horizontal_edge_8(block + pivot * line_length, line_length,
-                                    lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-
-        for (i = 0; i < 256; ++i)
-          if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-      } else if (mask_8x8 & 1) {
-        if ((mask_8x8 & 3) == 3) {
-          count = 2;
-          direct = pick_min_grad_direct(src, above_filt_len, row, col, width,
-                                        height, pitch, 2, 1);
-
-          pick_filter_block_horz(src, block, orig_pos, above_filt_len, row, col,
-                                 width, height, pitch, pivot, line_length, 2,
-                                 direct);
-
-          aom_lpf_horizontal_8_dual(block + pivot * line_length, line_length,
-                                    lfi->mblim, lfi->lim, lfi->hev_thr,
-                                    lfin->mblim, lfin->lim, lfin->hev_thr);
-
-          for (i = 0; i < 256; ++i)
-            if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-
-          if ((mask_4x4_int & 3) == 3) {
-            for (i = 0; i < 256; ++i) {
-              block[i] = 0;
-              orig_pos[i] = -1;
-            }
-
-            direct = pick_min_grad_direct(src, 4, row, col, width, height,
-                                          pitch, 2, 1);
-
-            pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, width,
-                                   height, pitch, pivot, line_length, 2,
-                                   direct);
-
-            aom_lpf_horizontal_4_dual(block + pivot * line_length, line_length,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr,
-                                      lfin->mblim, lfin->lim, lfin->hev_thr);
-
-            for (i = 0; i < 256; ++i)
-              if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-          } else {
-            for (i = 0; i < 256; ++i) {
-              block[i] = 0;
-              orig_pos[i] = -1;
-            }
-
-            if (mask_4x4_int & 1) {
-              direct = pick_min_grad_direct(src, 4, row, col, width, height,
-                                            pitch, 1, 1);
-
-              pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col,
-                                     width, height, pitch, pivot, line_length,
-                                     1, direct);
-
-              aom_lpf_horizontal_4(block + pivot * line_length, line_length,
-                                   lfi->mblim, lfi->lim, lfi->hev_thr);
-            } else if (mask_4x4_int & 2) {
-              direct = pick_min_grad_direct(src, 4, row, col, width, height,
-                                            pitch, 1, 1);
-
-              pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col + 8,
-                                     width, height, pitch, pivot, line_length,
-                                     1, direct);
-
-              aom_lpf_horizontal_4(block + pivot * line_length, line_length,
-                                   lfin->mblim, lfin->lim, lfin->hev_thr);
-            }
-
-            for (i = 0; i < 256; ++i)
-              if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-          }
-        } else {
-          direct = pick_min_grad_direct(src, above_filt_len, row, col, width,
-                                        height, pitch, 1, 1);
-
-          pick_filter_block_horz(src, block, orig_pos, above_filt_len, row, col,
-                                 width, height, pitch, pivot, line_length, 1,
-                                 direct);
-
-          aom_lpf_horizontal_8(block + pivot * line_length, line_length,
-                               lfi->mblim, lfi->lim, lfi->hev_thr);
-
-          for (i = 0; i < 256; ++i)
-            if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-
-          if (mask_4x4_int & 1) {
-            for (i = 0; i < 256; ++i) {
-              block[i] = 0;
-              orig_pos[i] = -1;
-            }
-            direct = pick_min_grad_direct(src, 4, row, col, width, height,
-                                          pitch, 1, 1);
-
-            pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, width,
-                                   height, pitch, pivot, line_length, 1,
-                                   direct);
-
-            aom_lpf_horizontal_4(block + pivot * line_length, line_length,
-                                 lfi->mblim, lfi->lim, lfi->hev_thr);
-
-            for (i = 0; i < 256; ++i)
-              if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-          }
-        }
-      } else if (mask_4x4 & 1) {
-        if ((mask_4x4 & 3) == 3) {
-          count = 2;
-          direct = pick_min_grad_direct(src, 4, row, col, width, height, pitch,
-                                        2, 1);
-
-          pick_filter_block_horz(src, block, orig_pos, 4, row, col, width,
-                                 height, pitch, pivot, line_length, 2, direct);
-
-          aom_lpf_horizontal_4_dual(block + pivot * line_length, line_length,
-                                    lfi->mblim, lfi->lim, lfi->hev_thr,
-                                    lfin->mblim, lfin->lim, lfin->hev_thr);
-
-          for (i = 0; i < 256; ++i)
-            if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-
-          if ((mask_4x4_int & 3) == 3) {
-            for (i = 0; i < 256; ++i) {
-              block[i] = 0;
-              orig_pos[i] = -1;
-            }
-
-            direct = pick_min_grad_direct(src, 4, row, col, width, height,
-                                          pitch, 2, 1);
-
-            pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, width,
-                                   height, pitch, pivot, line_length, 2,
-                                   direct);
-
-            aom_lpf_horizontal_4_dual(block + pivot * line_length, line_length,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr,
-                                      lfin->mblim, lfin->lim, lfin->hev_thr);
-
-            for (i = 0; i < 256; ++i)
-              if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-          } else {
-            for (i = 0; i < 256; ++i) {
-              block[i] = 0;
-              orig_pos[i] = -1;
-            }
-
-            if (mask_4x4_int & 1) {
-              direct = pick_min_grad_direct(src, 4, row, col, width, height,
-                                            pitch, 1, 1);
-
-              pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col,
-                                     width, height, pitch, pivot, line_length,
-                                     1, direct);
-
-              aom_lpf_horizontal_4(block + pivot * line_length, line_length,
-                                   lfi->mblim, lfi->lim, lfi->hev_thr);
-            } else if (mask_4x4_int & 2) {
-              direct = pick_min_grad_direct(src, 4, row, col, width, height,
-                                            pitch, 1, 1);
-
-              pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col + 8,
-                                     width, height, pitch, pivot, line_length,
-                                     1, direct);
-
-              aom_lpf_horizontal_4(block + pivot * line_length, line_length,
-                                   lfin->mblim, lfin->lim, lfin->hev_thr);
-            }
-
-            for (i = 0; i < 256; ++i)
-              if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-          }
-        } else {
-          direct = pick_min_grad_direct(src, above_filt_len, row, col, width,
-                                        height, pitch, 1, 1);
-
-          pick_filter_block_horz(src, block, orig_pos, above_filt_len, row, col,
-                                 width, height, pitch, pivot, line_length, 1,
-                                 direct);
-
-          aom_lpf_horizontal_4(block + pivot * line_length, line_length,
-                               lfi->mblim, lfi->lim, lfi->hev_thr);
-
-          for (i = 0; i < 256; ++i)
-            if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-
-          if (mask_4x4_int & 1) {
-            for (i = 0; i < 256; ++i) {
-              block[i] = 0;
-              orig_pos[i] = -1;
-            }
-            direct = pick_min_grad_direct(src, above_filt_len, row, col, width,
-                                          height, pitch, 1, 1);
-
-            pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, width,
-                                   height, pitch, pivot, line_length, 1,
-                                   direct);
-
-            aom_lpf_horizontal_4(block + pivot * line_length, line_length,
-                                 lfi->mblim, lfi->lim, lfi->hev_thr);
-
-            for (i = 0; i < 256; ++i)
-              if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-          }
-        }
-      } else if (mask_4x4_int & 1) {
-        direct =
-            pick_min_grad_direct(src, 4, row, col, width, height, pitch, 1, 1);
-
-        pick_filter_block_horz(src, block, orig_pos, 4, row + 4, col, width,
-                               height, pitch, pivot, line_length, 1, direct);
-
-        aom_lpf_horizontal_4(block + pivot * line_length, line_length,
-                             lfi->mblim, lfi->lim, lfi->hev_thr);
-
-        for (i = 0; i < 256; ++i)
-          if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-      }
-#else   // CONFIG_LPF_DIRECT
-      if (mask_16x16 & 1) {
-        if ((mask_16x16 & 3) == 3) {
-          aom_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
+        // chroma plane filters less pixels introduced in deblock_13tap
+        // experiment
+        LpfFunc lpf_horizontal =
+            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
+
+        if ((mask_16x16 & two_block_mask) == two_block_mask) {
+          /*
+          aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
                                      lfi->hev_thr);
+          */
+
+          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+          lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr);
           count = 2;
         } else {
-          aom_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr);
+          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
         }
       } else if (mask_8x8 & 1) {
-        if ((mask_8x8 & 3) == 3) {
-          // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+        // chroma plane filters less pixels introduced in deblock_13tap
+        // experiment
+        LpfFunc lpf_horizontal =
+            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
 
+        if ((mask_8x8 & two_block_mask) == two_block_mask) {
+          /*
           aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
+          */
 
-          if ((mask_4x4_int & 3) == 3) {
-            aom_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
-                                      lfi->lim, lfi->hev_thr, lfin->mblim,
-                                      lfin->lim, lfin->hev_thr);
-          } else {
-            if (mask_4x4_int & 1)
-              aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                   lfi->hev_thr);
-            else if (mask_4x4_int & 2)
-              aom_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                   lfin->lim, lfin->hev_thr);
-          }
+          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+          lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr);
           count = 2;
         } else {
-          aom_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-
-          if (mask_4x4_int & 1)
-            aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                 lfi->hev_thr);
+          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
         }
       } else if (mask_4x4 & 1) {
-        if ((mask_4x4 & 3) == 3) {
-          // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
-
+        if ((mask_4x4 & two_block_mask) == two_block_mask) {
+          /*
           aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
-
-          if ((mask_4x4_int & 3) == 3) {
-            aom_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
-                                      lfi->lim, lfi->hev_thr, lfin->mblim,
-                                      lfin->lim, lfin->hev_thr);
-          } else {
-            if (mask_4x4_int & 1)
-              aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                   lfi->hev_thr);
-            else if (mask_4x4_int & 2)
-              aom_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                   lfin->lim, lfin->hev_thr);
-          }
+          */
+          aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+          aom_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim,
+                               lfin->hev_thr);
           count = 2;
         } else {
           aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-
-          if (mask_4x4_int & 1)
-            aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                 lfi->hev_thr);
         }
-      } else if (mask_4x4_int & 1) {
-        aom_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                             lfi->hev_thr);
       }
-#endif  // CONFIG_LPF_DIRECT
     }
-#if CONFIG_LPF_DIRECT
-    idx_c += col_step * count;
-#endif
-    s += 8 * count;
-    lfl += count;
-    mask_16x16 >>= count;
-    mask_8x8 >>= count;
-    mask_4x4 >>= count;
-    mask_4x4_int >>= count;
+
+    s += 4 * count;
+    lfl += step * count;
+    mask_16x16 >>= step * count;
+    mask_8x8 >>= step * count;
+    mask_4x4 >>= step * count;
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 static void highbd_filter_selectively_horiz(
-    uint16_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8,
-    unsigned int mask_4x4, unsigned int mask_4x4_int,
-    const loop_filter_info_n *lfi_n, const uint8_t *lfl, int bd) {
-  unsigned int mask;
+    uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16,
+    uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n,
+    uint8_t *lfl, int bd) {
+  uint64_t mask;
   int count;
+  const int step = 1 << subsampling;
+  const unsigned int two_block_mask = subsampling ? 5 : 3;
 
-  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask;
-       mask >>= count) {
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
     const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    // Next block's thresholds.
+    const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step);
 
     count = 1;
     if (mask & 1) {
       if (mask_16x16 & 1) {
-        if ((mask_16x16 & 3) == 3) {
-          aom_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
+        HbdLpfFunc highbd_lpf_horizontal =
+            plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
+
+        if ((mask_16x16 & two_block_mask) == two_block_mask) {
+          /*
+          aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
                                             lfi->hev_thr, bd);
+          */
+
+          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+                                bd);
+          highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim,
+                                lfin->hev_thr, bd);
           count = 2;
         } else {
-          aom_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,
-                                           lfi->hev_thr, bd);
+          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+                                bd);
         }
       } else if (mask_8x8 & 1) {
-        if ((mask_8x8 & 3) == 3) {
-          // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+        HbdLpfFunc highbd_lpf_horizontal =
+            plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
 
+        if ((mask_8x8 & two_block_mask) == two_block_mask) {
+          /*
           aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
                                            lfin->hev_thr, bd);
-
-          if ((mask_4x4_int & 3) == 3) {
-            aom_highbd_lpf_horizontal_4_dual(
-                s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                lfin->mblim, lfin->lim, lfin->hev_thr, bd);
-          } else {
-            if (mask_4x4_int & 1) {
-              aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                          lfi->lim, lfi->hev_thr, bd);
-            } else if (mask_4x4_int & 2) {
-              aom_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                          lfin->lim, lfin->hev_thr, bd);
-            }
-          }
+          */
+          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+                                bd);
+          highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim,
+                                lfin->hev_thr, bd);
           count = 2;
         } else {
-          aom_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, bd);
-
-          if (mask_4x4_int & 1) {
-            aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                        lfi->lim, lfi->hev_thr, bd);
-          }
+          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+                                bd);
         }
       } else if (mask_4x4 & 1) {
-        if ((mask_4x4 & 3) == 3) {
-          // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
-
+        if ((mask_4x4 & two_block_mask) == two_block_mask) {
+          /*
           aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
                                            lfin->hev_thr, bd);
-          if ((mask_4x4_int & 3) == 3) {
-            aom_highbd_lpf_horizontal_4_dual(
-                s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                lfin->mblim, lfin->lim, lfin->hev_thr, bd);
-          } else {
-            if (mask_4x4_int & 1) {
-              aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                          lfi->lim, lfi->hev_thr, bd);
-            } else if (mask_4x4_int & 2) {
-              aom_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                          lfin->lim, lfin->hev_thr, bd);
-            }
-          }
+          */
+          aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, bd);
+          aom_highbd_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim,
+                                      lfin->hev_thr, bd);
           count = 2;
         } else {
           aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
                                       lfi->hev_thr, bd);
-
-          if (mask_4x4_int & 1) {
-            aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                        lfi->lim, lfi->hev_thr, bd);
-          }
         }
-      } else if (mask_4x4_int & 1) {
-        aom_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, bd);
       }
     }
-    s += 8 * count;
-    lfl += count;
-    mask_16x16 >>= count;
-    mask_8x8 >>= count;
-    mask_4x4 >>= count;
-    mask_4x4_int >>= count;
-  }
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-// This function ors into the current lfm structure, where to do loop
-// filters for the specific mi we are looking at. It uses information
-// including the block_size_type (32x16, 32x32, etc.), the transform size,
-// whether there were any coefficients encoded, and the loop filter strength
-// block we are currently looking at. Shift is used to position the
-// 1's we produce.
-// TODO(JBB) Need another function for different resolution color..
-static void build_masks(AV1_COMMON *const cm,
-                        const loop_filter_info_n *const lfi_n,
-                        const MODE_INFO *mi, const int shift_y,
-                        const int shift_uv, LOOP_FILTER_MASK *lfm) {
-  const MB_MODE_INFO *mbmi = &mi->mbmi;
-  const BLOCK_SIZE block_size = mbmi->sb_type;
-  // TODO(debargha): Check if masks can be setup correctly when
-  // rectangular transfroms are used with the EXT_TX expt.
-  const TX_SIZE tx_size_y = txsize_sqr_map[mbmi->tx_size];
-  const TX_SIZE tx_size_y_left = txsize_horz_map[mbmi->tx_size];
-  const TX_SIZE tx_size_y_above = txsize_vert_map[mbmi->tx_size];
-  const TX_SIZE tx_size_uv =
-      txsize_sqr_map[uv_txsize_lookup[block_size][mbmi->tx_size][1][1]];
-  const TX_SIZE tx_size_uv_left =
-      txsize_horz_map[uv_txsize_lookup[block_size][mbmi->tx_size][1][1]];
-  const TX_SIZE tx_size_uv_above =
-      txsize_vert_map[uv_txsize_lookup[block_size][mbmi->tx_size][1][1]];
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-  const int filter_level = get_filter_level(cm, lfi_n, 0, 0, mbmi);
-#else
-#if CONFIG_LPF_SB
-  const int filter_level = get_filter_level(cm, lfi_n, 0, 0, mbmi);
-#else
-  const int filter_level = get_filter_level(cm, lfi_n, mbmi);
-#endif  // CONFIG_LPF_SB
-#endif
-#else
-  const int filter_level = get_filter_level(lfi_n, mbmi);
-  (void)cm;
-#endif
-  uint64_t *const left_y = &lfm->left_y[tx_size_y_left];
-  uint64_t *const above_y = &lfm->above_y[tx_size_y_above];
-  uint64_t *const int_4x4_y = &lfm->int_4x4_y;
-  uint16_t *const left_uv = &lfm->left_uv[tx_size_uv_left];
-  uint16_t *const above_uv = &lfm->above_uv[tx_size_uv_above];
-  uint16_t *const int_4x4_uv = &lfm->left_int_4x4_uv;
-  int i;
-
-  // If filter level is 0 we don't loop filter.
-  if (!filter_level) {
-    return;
-  } else {
-    const int w = num_8x8_blocks_wide_lookup[block_size];
-    const int h = num_8x8_blocks_high_lookup[block_size];
-    const int row = (shift_y >> MAX_MIB_SIZE_LOG2);
-    const int col = shift_y - (row << MAX_MIB_SIZE_LOG2);
-
-    for (i = 0; i < h; i++) memset(&lfm->lfl_y[row + i][col], filter_level, w);
-  }
-
-  // These set 1 in the current block size for the block size edges.
-  // For instance if the block size is 32x16, we'll set:
-  //    above =   1111
-  //              0000
-  //    and
-  //    left  =   1000
-  //          =   1000
-  // NOTE : In this example the low bit is left most ( 1000 ) is stored as
-  //        1,  not 8...
-  //
-  // U and V set things on a 16 bit scale.
-  //
-  *above_y |= above_prediction_mask[block_size] << shift_y;
-  *above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
-  *left_y |= left_prediction_mask[block_size] << shift_y;
-  *left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
-
-  // If the block has no coefficients and is not intra we skip applying
-  // the loop filter on block edges.
-  if (mbmi->skip && is_inter_block(mbmi)) return;
-
-  // Here we are adding a mask for the transform size. The transform
-  // size mask is set to be correct for a 64x64 prediction block size. We
-  // mask to match the size of the block we are working on and then shift it
-  // into place..
-  *above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y_above])
-              << shift_y;
-  *above_uv |=
-      (size_mask_uv[block_size] & above_64x64_txform_mask_uv[tx_size_uv_above])
-      << shift_uv;
-
-  *left_y |= (size_mask[block_size] & left_64x64_txform_mask[tx_size_y_left])
-             << shift_y;
-  *left_uv |=
-      (size_mask_uv[block_size] & left_64x64_txform_mask_uv[tx_size_uv_left])
-      << shift_uv;
-
-  // Here we are trying to determine what to do with the internal 4x4 block
-  // boundaries.  These differ from the 4x4 boundaries on the outside edge of
-  // an 8x8 in that the internal ones can be skipped and don't depend on
-  // the prediction block size.
-  if (tx_size_y == TX_4X4)
-    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
-
-  if (tx_size_uv == TX_4X4)
-    *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
-}
-
-// This function does the same thing as the one above with the exception that
-// it only affects the y masks. It exists because for blocks < 16x16 in size,
-// we only update u and v masks on the first block.
-static void build_y_mask(AV1_COMMON *const cm,
-                         const loop_filter_info_n *const lfi_n,
-                         const MODE_INFO *mi, const int shift_y,
-#if CONFIG_SUPERTX
-                         int supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                         LOOP_FILTER_MASK *lfm) {
-  const MB_MODE_INFO *mbmi = &mi->mbmi;
-  const TX_SIZE tx_size_y = txsize_sqr_map[mbmi->tx_size];
-  const TX_SIZE tx_size_y_left = txsize_horz_map[mbmi->tx_size];
-  const TX_SIZE tx_size_y_above = txsize_vert_map[mbmi->tx_size];
-#if CONFIG_SUPERTX
-  const BLOCK_SIZE block_size =
-      supertx_enabled ? (BLOCK_SIZE)(3 * tx_size_y) : mbmi->sb_type;
-#else
-  const BLOCK_SIZE block_size = mbmi->sb_type;
-#endif
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-  const int filter_level = get_filter_level(cm, lfi_n, 0, 0, mbmi);
-#else
-#if CONFIG_LPF_SB
-  const int filter_level = get_filter_level(cm, lfi_n, 0, 0, mbmi);
-#else
-  const int filter_level = get_filter_level(cm, lfi_n, mbmi);
-#endif  // CONFIG_LPF_SB
-#endif
-#else
-  const int filter_level = get_filter_level(lfi_n, mbmi);
-  (void)cm;
-#endif
-  uint64_t *const left_y = &lfm->left_y[tx_size_y_left];
-  uint64_t *const above_y = &lfm->above_y[tx_size_y_above];
-  uint64_t *const int_4x4_y = &lfm->int_4x4_y;
-  int i;
-
-  if (!filter_level) {
-    return;
-  } else {
-    const int w = num_8x8_blocks_wide_lookup[block_size];
-    const int h = num_8x8_blocks_high_lookup[block_size];
-    const int row = (shift_y >> MAX_MIB_SIZE_LOG2);
-    const int col = shift_y - (row << MAX_MIB_SIZE_LOG2);
 
-    for (i = 0; i < h; i++) memset(&lfm->lfl_y[row + i][col], filter_level, w);
+    s += 4 * count;
+    lfl += step * count;
+    mask_16x16 >>= step * count;
+    mask_8x8 >>= step * count;
+    mask_4x4 >>= step * count;
   }
-
-  *above_y |= above_prediction_mask[block_size] << shift_y;
-  *left_y |= left_prediction_mask[block_size] << shift_y;
-
-  if (mbmi->skip && is_inter_block(mbmi)) return;
-
-  *above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y_above])
-              << shift_y;
-
-  *left_y |= (size_mask[block_size] & left_64x64_txform_mask[tx_size_y_left])
-             << shift_y;
-
-  if (tx_size_y == TX_4X4)
-    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
 }
 
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-// This function update the bit masks for the entire 64x64 region represented
-// by mi_row, mi_col. In case one of the edge is a tile boundary, loop filtering
-// for that edge is disabled. This function only check the tile boundary info
-// for the top left corner mi to determine the boundary information for the
-// top and left edge of the whole super block
-static void update_tile_boundary_filter_mask(AV1_COMMON *const cm,
-                                             const int mi_row, const int mi_col,
-                                             LOOP_FILTER_MASK *lfm) {
-  int i;
-  MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride + mi_col;
-
-  if (mi->mbmi.boundary_info & TILE_LEFT_BOUNDARY) {
-    for (i = 0; i <= TX_32X32; i++) {
-      lfm->left_y[i] &= 0xfefefefefefefefeULL;
-      lfm->left_uv[i] &= 0xeeee;
-    }
-  }
-
-  if (mi->mbmi.boundary_info & TILE_ABOVE_BOUNDARY) {
-    for (i = 0; i <= TX_32X32; i++) {
-      lfm->above_y[i] &= 0xffffffffffffff00ULL;
-      lfm->above_uv[i] &= 0xfff0;
-    }
-  }
-}
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
-
-// This function sets up the bit masks for the entire 64x64 region represented
-// by mi_row, mi_col.
-// TODO(JBB): This function only works for yv12.
-void av1_setup_mask(AV1_COMMON *const cm, const int mi_row, const int mi_col,
-                    MODE_INFO **mi, const int mode_info_stride,
-                    LOOP_FILTER_MASK *lfm) {
-#if CONFIG_EXT_PARTITION
-  assert(0 && "Not yet updated");
-#endif  // CONFIG_EXT_PARTITION
-  int idx_32, idx_16, idx_8;
-  const loop_filter_info_n *const lfi_n = &cm->lf_info;
-  MODE_INFO **mip = mi;
-  MODE_INFO **mip2 = mi;
-
-  // These are offsets to the next mi in the 64x64 block. It is what gets
-  // added to the mi ptr as we go through each loop. It helps us to avoid
-  // setting up special row and column counters for each index. The last step
-  // brings us out back to the starting position.
-  const int offset_32[] = { 4, (mode_info_stride << 2) - 4, 4,
-                            -(mode_info_stride << 2) - 4 };
-  const int offset_16[] = { 2, (mode_info_stride << 1) - 2, 2,
-                            -(mode_info_stride << 1) - 2 };
-  const int offset[] = { 1, mode_info_stride - 1, 1, -mode_info_stride - 1 };
-
-  // Following variables represent shifts to position the current block
-  // mask over the appropriate block. A shift of 36 to the left will move
-  // the bits for the final 32 by 32 block in the 64x64 up 4 rows and left
-  // 4 rows to the appropriate spot.
-  const int shift_32_y[] = { 0, 4, 32, 36 };
-  const int shift_16_y[] = { 0, 2, 16, 18 };
-  const int shift_8_y[] = { 0, 1, 8, 9 };
-  const int shift_32_uv[] = { 0, 2, 8, 10 };
-  const int shift_16_uv[] = { 0, 1, 4, 5 };
-  int i;
-  const int max_rows = AOMMIN(cm->mi_rows - mi_row, MAX_MIB_SIZE);
-  const int max_cols = AOMMIN(cm->mi_cols - mi_col, MAX_MIB_SIZE);
-
-  av1_zero(*lfm);
-  assert(mip[0] != NULL);
-
-  // TODO(jimbankoski): Try moving most of the following code into decode
-  // loop and storing lfm in the mbmi structure so that we don't have to go
-  // through the recursive loop structure multiple times.
-  switch (mip[0]->mbmi.sb_type) {
-    case BLOCK_64X64: build_masks(cm, lfi_n, mip[0], 0, 0, lfm); break;
-    case BLOCK_64X32: build_masks(cm, lfi_n, mip[0], 0, 0, lfm);
-#if CONFIG_SUPERTX && CONFIG_TX64X64
-      if (supertx_enabled(&mip[0]->mbmi)) break;
-#endif  // CONFIG_SUPERTX && CONFIG_TX64X64
-      mip2 = mip + mode_info_stride * 4;
-      if (4 >= max_rows) break;
-      build_masks(cm, lfi_n, mip2[0], 32, 8, lfm);
-      break;
-    case BLOCK_32X64: build_masks(cm, lfi_n, mip[0], 0, 0, lfm);
-#if CONFIG_SUPERTX && CONFIG_TX64X64
-      if (supertx_enabled(&mip[0]->mbmi)) break;
-#endif  // CONFIG_SUPERTX && CONFIG_TX64X64
-      mip2 = mip + 4;
-      if (4 >= max_cols) break;
-      build_masks(cm, lfi_n, mip2[0], 4, 2, lfm);
-      break;
-    default:
-#if CONFIG_SUPERTX && CONFIG_TX64X64
-      if (mip[0]->mbmi.tx_size == TX_64X64) {
-        build_masks(cm, lfi_n, mip[0], 0, 0, lfm);
-      } else {
-#endif  // CONFIG_SUPERTX && CONFIG_TX64X64
-        for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
-          const int shift_y_32 = shift_32_y[idx_32];
-          const int shift_uv_32 = shift_32_uv[idx_32];
-          const int mi_32_col_offset = ((idx_32 & 1) << 2);
-          const int mi_32_row_offset = ((idx_32 >> 1) << 2);
-          if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
-            continue;
-          switch (mip[0]->mbmi.sb_type) {
-            case BLOCK_32X32:
-              build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
-              break;
-            case BLOCK_32X16:
-              build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
-#if CONFIG_SUPERTX
-              if (supertx_enabled(&mip[0]->mbmi)) break;
-#endif
-              if (mi_32_row_offset + 2 >= max_rows) continue;
-              mip2 = mip + mode_info_stride * 2;
-              build_masks(cm, lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4,
-                          lfm);
-              break;
-            case BLOCK_16X32:
-              build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
-#if CONFIG_SUPERTX
-              if (supertx_enabled(&mip[0]->mbmi)) break;
-#endif
-              if (mi_32_col_offset + 2 >= max_cols) continue;
-              mip2 = mip + 2;
-              build_masks(cm, lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1,
-                          lfm);
-              break;
-            default:
-#if CONFIG_SUPERTX
-              if (mip[0]->mbmi.tx_size == TX_32X32) {
-                build_masks(cm, lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
-                break;
-              }
-#endif
-              for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
-                const int shift_y_32_16 = shift_y_32 + shift_16_y[idx_16];
-                const int shift_uv_32_16 = shift_uv_32 + shift_16_uv[idx_16];
-                const int mi_16_col_offset =
-                    mi_32_col_offset + ((idx_16 & 1) << 1);
-                const int mi_16_row_offset =
-                    mi_32_row_offset + ((idx_16 >> 1) << 1);
-
-                if (mi_16_col_offset >= max_cols ||
-                    mi_16_row_offset >= max_rows)
-                  continue;
-
-                switch (mip[0]->mbmi.sb_type) {
-                  case BLOCK_16X16:
-                    build_masks(cm, lfi_n, mip[0], shift_y_32_16,
-                                shift_uv_32_16, lfm);
-                    break;
-                  case BLOCK_16X8:
-#if CONFIG_SUPERTX
-                    if (supertx_enabled(&mip[0]->mbmi)) break;
-#endif
-                    build_masks(cm, lfi_n, mip[0], shift_y_32_16,
-                                shift_uv_32_16, lfm);
-                    if (mi_16_row_offset + 1 >= max_rows) continue;
-                    mip2 = mip + mode_info_stride;
-                    build_y_mask(cm, lfi_n, mip2[0], shift_y_32_16 + 8,
-#if CONFIG_SUPERTX
-                                 0,
-#endif
-                                 lfm);
-                    break;
-                  case BLOCK_8X16:
-#if CONFIG_SUPERTX
-                    if (supertx_enabled(&mip[0]->mbmi)) break;
-#endif
-                    build_masks(cm, lfi_n, mip[0], shift_y_32_16,
-                                shift_uv_32_16, lfm);
-                    if (mi_16_col_offset + 1 >= max_cols) continue;
-                    mip2 = mip + 1;
-                    build_y_mask(cm, lfi_n, mip2[0], shift_y_32_16 + 1,
-#if CONFIG_SUPERTX
-                                 0,
-#endif
-                                 lfm);
-                    break;
-                  default: {
-                    const int shift_y_32_16_8_zero =
-                        shift_y_32_16 + shift_8_y[0];
-#if CONFIG_SUPERTX
-                    if (mip[0]->mbmi.tx_size == TX_16X16) {
-                      build_masks(cm, lfi_n, mip[0], shift_y_32_16_8_zero,
-                                  shift_uv_32_16, lfm);
-                      break;
-                    }
-#endif
-                    build_masks(cm, lfi_n, mip[0], shift_y_32_16_8_zero,
-                                shift_uv_32_16, lfm);
-                    mip += offset[0];
-                    for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
-                      const int shift_y_32_16_8 =
-                          shift_y_32_16 + shift_8_y[idx_8];
-                      const int mi_8_col_offset =
-                          mi_16_col_offset + ((idx_8 & 1));
-                      const int mi_8_row_offset =
-                          mi_16_row_offset + ((idx_8 >> 1));
-
-                      if (mi_8_col_offset >= max_cols ||
-                          mi_8_row_offset >= max_rows)
-                        continue;
-                      build_y_mask(cm, lfi_n, mip[0], shift_y_32_16_8,
-#if CONFIG_SUPERTX
-                                   supertx_enabled(&mip[0]->mbmi),
-#endif
-                                   lfm);
-                    }
-                    break;
-                  }
-                }
-              }
-              break;
-          }
+static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf,
+                           uint8_t *dst_buf, int ref_stride, int dst_stride,
+                           int start, int end) {
+  return 0;
+
+  start <<= MI_SIZE_LOG2;
+  end <<= MI_SIZE_LOG2;
+  uint8_t *ref0 = ref_buf;
+  uint8_t *dst0 = dst_buf;
+  if (cm->use_highbitdepth) {
+    const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf);
+    const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf);
+    for (int j = 0; j < 4; ++j) {
+      for (int i = start; i < end; ++i)
+        if (ref16[i] != dst16[i]) {
+          ref_buf = ref0;
+          dst_buf = dst0;
+          return i + 1;
         }
-#if CONFIG_SUPERTX && CONFIG_TX64X64
-      }
-#endif  // CONFIG_SUPERTX && CONFIG_TX64X64
-      break;
-  }
-  // The largest loopfilter we have is 16x16 so we use the 16x16 mask
-  // for 32x32 transforms also.
-  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
-  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
-  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
-  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
-
-  // We do at least 8 tap filter on every 32x32 even if the transform size
-  // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and
-  // remove it from the 4x4.
-  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
-  lfm->left_y[TX_4X4] &= ~left_border;
-  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
-  lfm->above_y[TX_4X4] &= ~above_border;
-  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
-  lfm->left_uv[TX_4X4] &= ~left_border_uv;
-  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
-  lfm->above_uv[TX_4X4] &= ~above_border_uv;
-
-  // We do some special edge handling.
-  if (mi_row + MAX_MIB_SIZE > cm->mi_rows) {
-    const uint64_t rows = cm->mi_rows - mi_row;
-
-    // Each pixel inside the border gets a 1,
-    const uint64_t mask_y = (((uint64_t)1 << (rows << MAX_MIB_SIZE_LOG2)) - 1);
-    const uint16_t mask_uv =
-        (((uint16_t)1 << (((rows + 1) >> 1) << (MAX_MIB_SIZE_LOG2 - 1))) - 1);
-
-    // Remove values completely outside our border.
-    for (i = 0; i < TX_32X32; i++) {
-      lfm->left_y[i] &= mask_y;
-      lfm->above_y[i] &= mask_y;
-      lfm->left_uv[i] &= mask_uv;
-      lfm->above_uv[i] &= mask_uv;
-    }
-    lfm->int_4x4_y &= mask_y;
-    lfm->above_int_4x4_uv = lfm->left_int_4x4_uv & mask_uv;
-
-    // We don't apply a wide loop filter on the last uv block row. If set
-    // apply the shorter one instead.
-    if (rows == 1) {
-      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
-      lfm->above_uv[TX_16X16] = 0;
-    }
-    if (rows == 5) {
-      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
-      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
+      ref16 += ref_stride;
+      dst16 += dst_stride;
     }
   } else {
-    lfm->above_int_4x4_uv = lfm->left_int_4x4_uv;
-  }
-
-  if (mi_col + MAX_MIB_SIZE > cm->mi_cols) {
-    const uint64_t columns = cm->mi_cols - mi_col;
-
-    // Each pixel inside the border gets a 1, the multiply copies the border
-    // to where we need it.
-    const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101ULL;
-    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
-
-    // Internal edges are not applied on the last column of the image so
-    // we mask 1 more for the internal edges
-    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
-
-    // Remove the bits outside the image edge.
-    for (i = 0; i < TX_32X32; i++) {
-      lfm->left_y[i] &= mask_y;
-      lfm->above_y[i] &= mask_y;
-      lfm->left_uv[i] &= mask_uv;
-      lfm->above_uv[i] &= mask_uv;
-    }
-    lfm->int_4x4_y &= mask_y;
-    lfm->left_int_4x4_uv &= mask_uv_int;
-
-    // We don't apply a wide loop filter on the last uv column. If set
-    // apply the shorter one instead.
-    if (columns == 1) {
-      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
-      lfm->left_uv[TX_16X16] = 0;
-    }
-    if (columns == 5) {
-      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
-      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
-    }
-  }
-  // We don't apply a loop filter on the first column in the image, mask that
-  // out.
-  if (mi_col == 0) {
-    for (i = 0; i < TX_32X32; i++) {
-      lfm->left_y[i] &= 0xfefefefefefefefeULL;
-      lfm->left_uv[i] &= 0xeeee;
-    }
-  }
-
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-  if (av1_disable_loopfilter_on_tile_boundary(cm)) {
-    update_tile_boundary_filter_mask(cm, mi_row, mi_col, lfm);
-  }
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
-
-  // Assert if we try to apply 2 different loop filters at the same position.
-  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
-  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
-  assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
-  assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
-  assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_8X8]));
-  assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
-  assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
-  assert(!(lfm->left_int_4x4_uv & lfm->left_uv[TX_16X16]));
-  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
-  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
-  assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
-  assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
-  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
-  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
-  assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
-  assert(!(lfm->above_int_4x4_uv & lfm->above_uv[TX_16X16]));
-}
-
-static void filter_selectively_vert(
-    uint8_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8,
-    unsigned int mask_4x4, unsigned int mask_4x4_int,
-    const loop_filter_info_n *lfi_n, const uint8_t *lfl
-#if CONFIG_LPF_DIRECT
-    ,
-    uint8_t *const src, int mi_row, int mi_col, int idx_r, int col_step,
-    int width, int height, int ss_x, int ss_y
-#endif
-    ) {
-  unsigned int mask;
-#if CONFIG_LPF_DIRECT
-  // scale for u, v plane
-  width >>= ss_x;
-  height >>= ss_y;
-  int idx_c = 0;
-#endif
-
-  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask;
-       mask >>= 1) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-
-#if CONFIG_LPF_DIRECT
-    int i;
-    const int pivot = 8;
-    const int left_filt_len = mask_16x16 & 1 ? 8 : 4;
-    const int right_filt_len = mask_16x16 & 1 ? 8 : 4;
-    const int line_length = 16;
-    uint8_t block[128];
-    int orig_pos[128];
-
-    // actual position for current pixel
-    const int row = (mi_row + idx_r) * MI_SIZE >> ss_y;
-    const int col = (mi_col + idx_c) * MI_SIZE >> ss_x;
-
-    // Could use asymmetric length in the future
-    assert(left_filt_len == right_filt_len);
-    (void)right_filt_len;
-
-    if ((mask_16x16 & 1) || (mask_8x8 & 1) || (mask_4x4 & 1)) {
-      for (i = 0; i < 128; ++i) {
-        block[i] = 0;
-        orig_pos[i] = -1;
-      }
-
-      const int direct = pick_min_grad_direct(src, left_filt_len, row, col,
-                                              width, height, pitch, 1, 0);
-
-      pick_filter_block_vert(src, block, orig_pos, left_filt_len, row, col,
-                             width, height, pitch, pivot, line_length, 1,
-                             direct);
-
-      // apply filtering
-      if (mask_16x16 & 1) {
-        aom_lpf_vertical_16(block + pivot, line_length, lfi->mblim, lfi->lim,
-                            lfi->hev_thr);
-      } else if (mask_8x8 & 1) {
-        aom_lpf_vertical_8(block + pivot, line_length, lfi->mblim, lfi->lim,
-                           lfi->hev_thr);
-      } else if (mask_4x4 & 1) {
-        aom_lpf_vertical_4(block + pivot, line_length, lfi->mblim, lfi->lim,
-                           lfi->hev_thr);
-      }
-
-      for (i = 0; i < 128; ++i)
-        if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-    }
-
-    // filter inner 4x4
-    if (mask_4x4_int & 1) {
-      for (i = 0; i < 128; ++i) {
-        block[i] = 0;
-        orig_pos[i] = -1;
-      }
-
-      const int direct = pick_min_grad_direct(src, 4, row, col + 4, width,
-                                              height, pitch, 1, 0);
-
-      pick_filter_block_vert(src, block, orig_pos, 4, row, col + 4, width,
-                             height, pitch, pivot, line_length, 1, direct);
-
-      aom_lpf_vertical_4(block + pivot, line_length, lfi->mblim, lfi->lim,
-                         lfi->hev_thr);
-
-      for (i = 0; i < 128; ++i)
-        if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-    }
-#else
-    if (mask & 1) {
-      if (mask_16x16 & 1) {
-        aom_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-      } else if (mask_8x8 & 1) {
-        aom_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-      } else if (mask_4x4 & 1) {
-        aom_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-      }
-    }
-    if (mask_4x4_int & 1)
-      aom_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-#endif  // CONFIG_LPF_DIRECT
-#if CONFIG_LPF_DIRECT
-    idx_c += col_step;
-#endif
-    s += 8;
-    lfl += 1;
-    mask_16x16 >>= 1;
-    mask_8x8 >>= 1;
-    mask_4x4 >>= 1;
-    mask_4x4_int >>= 1;
-  }
-}
-
-#if CONFIG_HIGHBITDEPTH
-static void highbd_filter_selectively_vert(
-    uint16_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8,
-    unsigned int mask_4x4, unsigned int mask_4x4_int,
-    const loop_filter_info_n *lfi_n, const uint8_t *lfl, int bd) {
-  unsigned int mask;
-
-  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask;
-       mask >>= 1) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-
-    if (mask & 1) {
-      if (mask_16x16 & 1) {
-        aom_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                   bd);
-      } else if (mask_8x8 & 1) {
-        aom_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                  bd);
-      } else if (mask_4x4 & 1) {
-        aom_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                  bd);
-      }
-    }
-    if (mask_4x4_int & 1)
-      aom_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, bd);
-    s += 8;
-    lfl += 1;
-    mask_16x16 >>= 1;
-    mask_8x8 >>= 1;
-    mask_4x4 >>= 1;
-    mask_4x4_int >>= 1;
-  }
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-typedef struct {
-  unsigned int m16x16;
-  unsigned int m8x8;
-  unsigned int m4x4;
-} FilterMasks;
-
-// Get filter level and masks for the given row index 'idx_r'. (Only used for
-// the non420 case).
-// Note: 'row_masks_ptr' and/or 'col_masks_ptr' can be passed NULL.
-static void get_filter_level_and_masks_non420(
-    AV1_COMMON *const cm, const struct macroblockd_plane *const plane, int pl,
-    MODE_INFO **mib, int mi_row, int mi_col, int idx_r, uint8_t *const lfl_r,
-    unsigned int *const mask_4x4_int_r_ptr,
-    unsigned int *const mask_4x4_int_c_ptr, FilterMasks *const row_masks_ptr,
-    FilterMasks *const col_masks_ptr) {
-  const int ss_x = plane->subsampling_x;
-  const int ss_y = plane->subsampling_y;
-  const int col_step = mi_size_wide[BLOCK_8X8] << ss_x;
-  FilterMasks row_masks, col_masks;
-  memset(&row_masks, 0, sizeof(row_masks));
-  memset(&col_masks, 0, sizeof(col_masks));
-  unsigned int mask_4x4_int_r = 0, mask_4x4_int_c = 0;
-  const int r = idx_r >> mi_height_log2_lookup[BLOCK_8X8];
-
-  // Determine the vertical edges that need filtering
-  int idx_c;
-  for (idx_c = 0; idx_c < cm->mib_size && mi_col + idx_c < cm->mi_cols;
-       idx_c += col_step) {
-    const MODE_INFO *mi = mib[idx_r * cm->mi_stride + idx_c];
-    const MB_MODE_INFO *mbmi = &mi[0].mbmi;
-    const BLOCK_SIZE sb_type = mbmi->sb_type;
-    const int skip_this = mbmi->skip && is_inter_block(mbmi);
-    // Map index to 8x8 unit
-    const int c = idx_c >> mi_width_log2_lookup[BLOCK_8X8];
-
-    const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1);
-    const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1);
-
-    // left edge of current unit is block/partition edge -> no skip
-    const int block_edge_left =
-        (num_4x4_blocks_wide_lookup[sb_type] > 1) ? !blk_col : 1;
-    const int skip_this_c = skip_this && !block_edge_left;
-    // top edge of current unit is block/partition edge -> no skip
-    const int block_edge_above =
-        (num_4x4_blocks_high_lookup[sb_type] > 1) ? !blk_row : 1;
-    const int skip_this_r = skip_this && !block_edge_above;
-
-    TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
-                          ? av1_get_uv_tx_size(mbmi, plane)
-                          : mbmi->tx_size;
-
-    const int skip_border_4x4_c =
-        ss_x && mi_col + idx_c >= cm->mi_cols - mi_size_wide[BLOCK_8X8];
-    const int skip_border_4x4_r =
-        ss_y && mi_row + idx_r >= cm->mi_rows - mi_size_high[BLOCK_8X8];
-
-    int tx_size_mask = 0;
-    const int c_step = (c >> ss_x);
-    const int r_step = (r >> ss_y);
-    const int col_mask = 1 << c_step;
-
-#if CONFIG_VAR_TX
-    if (is_inter_block(mbmi) && !mbmi->skip) {
-      const int tx_row_idx =
-          (blk_row * mi_size_high[BLOCK_8X8] << TX_UNIT_HIGH_LOG2) >> 1;
-      const int tx_col_idx =
-          (blk_col * mi_size_wide[BLOCK_8X8] << TX_UNIT_WIDE_LOG2) >> 1;
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      const BLOCK_SIZE bsize =
-          AOMMAX(BLOCK_4X4, get_plane_block_size(mbmi->sb_type, plane));
-#else
-      const BLOCK_SIZE bsize = get_plane_block_size(mbmi->sb_type, plane);
-#endif
-      const TX_SIZE mb_tx_size = mbmi->inter_tx_size[tx_row_idx][tx_col_idx];
-      tx_size = (plane->plane_type == PLANE_TYPE_UV)
-                    ? uv_txsize_lookup[bsize][mb_tx_size][0][0]
-                    : mb_tx_size;
-    }
-#endif
-
-// Filter level can vary per MI
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-    if (!(lfl_r[c_step] = get_filter_level(cm, &cm->lf_info, 0, 0, mbmi)))
-      continue;
-#else
-#if CONFIG_LPF_SB
-    if (!(lfl_r[c_step] =
-              get_filter_level(cm, &cm->lf_info, mi_row, mi_col, mbmi)))
-      continue;
-#else
-    if (!(lfl_r[c_step] = get_filter_level(cm, &cm->lf_info, mbmi))) continue;
-#endif  // CONFIG_LPF_SB
-#endif
-#else
-    if (!(lfl_r[c_step] = get_filter_level(&cm->lf_info, mbmi))) continue;
-#endif
-
-#if CONFIG_VAR_TX
-    TX_SIZE tx_size_horz_edge, tx_size_vert_edge;
-
-    // filt_len_vert_edge is the length of deblocking filter for a vertical edge
-    // The filter direction of a vertical edge is horizontal.
-    // Thus, filt_len_vert_edge is determined as the minimum width of the two
-    // transform block sizes on the left and right (current block) side of edge
-    const int filt_len_vert_edge = AOMMIN(
-        tx_size_wide[tx_size],
-        tx_size_wide[cm->left_txfm_context[pl][((mi_row + idx_r) & MAX_MIB_MASK)
-                                               << TX_UNIT_HIGH_LOG2]]);
-
-    // filt_len_horz_edge is the len of deblocking filter for a horizontal edge
-    // The filter direction of a horizontal edge is vertical.
-    // Thus, filt_len_horz_edge is determined as the minimum height of the two
-    // transform block sizes on the top and bottom (current block) side of edge
-    const int filt_len_horz_edge =
-        AOMMIN(tx_size_high[tx_size],
-               tx_size_high[cm->top_txfm_context[pl][(mi_col + idx_c)
-                                                     << TX_UNIT_WIDE_LOG2]]);
-
-    // transform width/height of current block
-    const int tx_wide_cur = tx_size_wide[tx_size];
-    const int tx_high_cur = tx_size_high[tx_size];
-
-    // tx_size_vert_edge is square transform size for a vertical deblocking edge
-    // It determines the type of filter applied to the vertical edge
-    // Similarly, tx_size_horz_edge is for a horizontal deblocking edge
-    tx_size_vert_edge = get_sqr_tx_size(filt_len_vert_edge);
-    tx_size_horz_edge = get_sqr_tx_size(filt_len_horz_edge);
-
-    memset(cm->top_txfm_context[pl] + ((mi_col + idx_c) << TX_UNIT_WIDE_LOG2),
-           tx_size, mi_size_wide[BLOCK_8X8] << TX_UNIT_WIDE_LOG2);
-    memset(cm->left_txfm_context[pl] +
-               (((mi_row + idx_r) & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2),
-           tx_size, mi_size_high[BLOCK_8X8] << TX_UNIT_HIGH_LOG2);
-#else
-    // The length (or equally the square tx size) of deblocking filter is only
-    // determined by
-    // a) current block's width for a vertical deblocking edge
-    // b) current block's height for a horizontal deblocking edge
-    TX_SIZE tx_size_vert_edge = txsize_horz_map[tx_size];
-    TX_SIZE tx_size_horz_edge = txsize_vert_map[tx_size];
-    (void)pl;
-#endif  // CONFIG_VAR_TX
-
-    if (tx_size_vert_edge == TX_32X32)
-      tx_size_mask = 3;
-    else if (tx_size_vert_edge == TX_16X16)
-      tx_size_mask = 1;
-    else
-      tx_size_mask = 0;
-
-    // Build masks based on the transform size of each block
-    // handle vertical mask
-    if (tx_size_vert_edge == TX_32X32) {
-      if (!skip_this_c && (c_step & tx_size_mask) == 0) {
-        if (!skip_border_4x4_c)
-          col_masks.m16x16 |= col_mask;
-        else
-          col_masks.m8x8 |= col_mask;
-      }
-    } else if (tx_size_vert_edge == TX_16X16) {
-      if (!skip_this_c && (c_step & tx_size_mask) == 0) {
-        if (!skip_border_4x4_c)
-          col_masks.m16x16 |= col_mask;
-        else
-          col_masks.m8x8 |= col_mask;
-      }
-    } else {
-      // force 8x8 filtering on 32x32 boundaries
-      if (!skip_this_c && (c_step & tx_size_mask) == 0) {
-        if (tx_size_vert_edge == TX_8X8 || (c_step & 3) == 0)
-          col_masks.m8x8 |= col_mask;
-        else
-          col_masks.m4x4 |= col_mask;
-      }
-
-#if CONFIG_VAR_TX
-      if (!skip_this && tx_wide_cur < 8 && !skip_border_4x4_c &&
-          (c_step & tx_size_mask) == 0)
-#else
-      if (!skip_this && tx_size_vert_edge < TX_8X8 && !skip_border_4x4_c &&
-          (c_step & tx_size_mask) == 0)
-#endif  // CONFIG_VAR_TX
-        mask_4x4_int_c |= col_mask;
-    }
-
-    if (tx_size_horz_edge == TX_32X32)
-      tx_size_mask = 3;
-    else if (tx_size_horz_edge == TX_16X16)
-      tx_size_mask = 1;
-    else
-      tx_size_mask = 0;
-
-    // set horizontal mask
-    if (tx_size_horz_edge == TX_32X32) {
-      if (!skip_this_r && (r_step & tx_size_mask) == 0) {
-        if (!skip_border_4x4_r)
-          row_masks.m16x16 |= col_mask;
-        else
-          row_masks.m8x8 |= col_mask;
-      }
-    } else if (tx_size_horz_edge == TX_16X16) {
-      if (!skip_this_r && (r_step & tx_size_mask) == 0) {
-        if (!skip_border_4x4_r)
-          row_masks.m16x16 |= col_mask;
-        else
-          row_masks.m8x8 |= col_mask;
-      }
-    } else {
-      // force 8x8 filtering on 32x32 boundaries
-      if (!skip_this_r && (r_step & tx_size_mask) == 0) {
-        if (tx_size_horz_edge == TX_8X8 || (r_step & 3) == 0)
-          row_masks.m8x8 |= col_mask;
-        else
-          row_masks.m4x4 |= col_mask;
-      }
-
-#if CONFIG_VAR_TX
-      if (!skip_this && tx_high_cur < 8 && !skip_border_4x4_r &&
-          (r_step & tx_size_mask) == 0)
-#else
-      if (!skip_this && tx_size_horz_edge < TX_8X8 && !skip_border_4x4_r &&
-          (r_step & tx_size_mask) == 0)
-#endif  // CONFIG_VAR_TX
-        mask_4x4_int_r |= col_mask;
-    }
-  }
-
-  if (row_masks_ptr) *row_masks_ptr = row_masks;
-  if (col_masks_ptr) *col_masks_ptr = col_masks;
-  if (mask_4x4_int_c_ptr) *mask_4x4_int_c_ptr = mask_4x4_int_c;
-  if (mask_4x4_int_r_ptr) *mask_4x4_int_r_ptr = mask_4x4_int_r;
-}
-
-void av1_filter_block_plane_non420_ver(AV1_COMMON *const cm,
-                                       struct macroblockd_plane *plane,
-                                       MODE_INFO **mib, int mi_row, int mi_col,
-                                       int pl) {
-  const int ss_y = plane->subsampling_y;
-  const int row_step = mi_size_high[BLOCK_8X8] << ss_y;
-#if CONFIG_LPF_DIRECT
-  const int ss_x = plane->subsampling_x;
-  const int col_step = mi_size_wide[BLOCK_8X8] << ss_x;
-#endif
-  struct buf_2d *const dst = &plane->dst;
-  uint8_t *const dst0 = dst->buf;
-  uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE] = { { 0 } };
-
-  int idx_r;
-  for (idx_r = 0; idx_r < cm->mib_size && mi_row + idx_r < cm->mi_rows;
-       idx_r += row_step) {
-    unsigned int mask_4x4_int;
-    FilterMasks col_masks;
-    const int r = idx_r >> mi_height_log2_lookup[BLOCK_8X8];
-    get_filter_level_and_masks_non420(cm, plane, pl, mib, mi_row, mi_col, idx_r,
-                                      &lfl[r][0], NULL, &mask_4x4_int, NULL,
-                                      &col_masks);
-
-    // Disable filtering on the leftmost column or tile boundary
-    unsigned int border_mask = ~(mi_col == 0 ? 1 : 0);
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-    MODE_INFO *const mi = cm->mi + (mi_row + idx_r) * cm->mi_stride + mi_col;
-    if (av1_disable_loopfilter_on_tile_boundary(cm) &&
-        ((mi->mbmi.boundary_info & TILE_LEFT_BOUNDARY) != 0)) {
-      border_mask = 0xfffffffe;
-    }
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
-
-#if CONFIG_HIGHBITDEPTH
-    if (cm->use_highbitdepth)
-      highbd_filter_selectively_vert(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
-          col_masks.m16x16 & border_mask, col_masks.m8x8 & border_mask,
-          col_masks.m4x4 & border_mask, mask_4x4_int, &cm->lf_info, &lfl[r][0],
-          (int)cm->bit_depth);
-    else
-#endif  // CONFIG_HIGHBITDEPTH
-      filter_selectively_vert(
-          dst->buf, dst->stride, col_masks.m16x16 & border_mask,
-          col_masks.m8x8 & border_mask, col_masks.m4x4 & border_mask,
-          mask_4x4_int, &cm->lf_info, &lfl[r][0]
-#if CONFIG_LPF_DIRECT
-          ,
-          dst->buf0, mi_row, mi_col, idx_r, col_step, cm->width, cm->height,
-          ss_x, ss_y
-#endif  // CONFIG_LPF_DIRECT
-          );
-    dst->buf += 8 * dst->stride;
-  }
-
-  // Now do horizontal pass
-  dst->buf = dst0;
-}
-
-void av1_filter_block_plane_non420_hor(AV1_COMMON *const cm,
-                                       struct macroblockd_plane *plane,
-                                       MODE_INFO **mib, int mi_row, int mi_col,
-                                       int pl) {
-  const int ss_y = plane->subsampling_y;
-  const int row_step = mi_size_high[BLOCK_8X8] << ss_y;
-#if CONFIG_LPF_DIRECT
-  const int ss_x = plane->subsampling_x;
-  const int col_step = mi_size_wide[BLOCK_8X8] << ss_x;
-#endif
-  struct buf_2d *const dst = &plane->dst;
-  uint8_t *const dst0 = dst->buf;
-  uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE] = { { 0 } };
-
-  int idx_r;
-  for (idx_r = 0; idx_r < cm->mib_size && mi_row + idx_r < cm->mi_rows;
-       idx_r += row_step) {
-    unsigned int mask_4x4_int;
-    FilterMasks row_masks;
-    const int r = idx_r >> mi_height_log2_lookup[BLOCK_8X8];
-    get_filter_level_and_masks_non420(cm, plane, pl, mib, mi_row, mi_col, idx_r,
-                                      &lfl[r][0], &mask_4x4_int, NULL,
-                                      &row_masks, NULL);
-
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-    // Disable filtering on the abovemost row or tile boundary
-    const MODE_INFO *mi = cm->mi + (mi_row + idx_r) * cm->mi_stride + mi_col;
-    if ((av1_disable_loopfilter_on_tile_boundary(cm) &&
-         (mi->mbmi.boundary_info & TILE_ABOVE_BOUNDARY)) ||
-        (mi_row + idx_r == 0))
-      memset(&row_masks, 0, sizeof(row_masks));
-#else
-    if (mi_row + idx_r == 0) memset(&row_masks, 0, sizeof(row_masks));
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
-
-#if CONFIG_HIGHBITDEPTH
-    if (cm->use_highbitdepth)
-      highbd_filter_selectively_horiz(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, row_masks.m16x16,
-          row_masks.m8x8, row_masks.m4x4, mask_4x4_int, &cm->lf_info,
-          &lfl[r][0], (int)cm->bit_depth);
-    else
-#endif  // CONFIG_HIGHBITDEPTH
-      filter_selectively_horiz(dst->buf, dst->stride, row_masks.m16x16,
-                               row_masks.m8x8, row_masks.m4x4, mask_4x4_int,
-                               &cm->lf_info, &lfl[r][0]
-#if CONFIG_LPF_DIRECT
-                               ,
-                               dst->buf0, mi_row, mi_col, idx_r, col_step,
-                               cm->width, cm->height, ss_x, ss_y
-#endif  // CONFIG_LPF_DIRECT
-                               );
-    dst->buf += 8 * dst->stride;
-  }
-  dst->buf = dst0;
-}
-
-void av1_filter_block_plane_ss00_ver(AV1_COMMON *const cm,
-                                     struct macroblockd_plane *const plane,
-                                     int mi_row, LOOP_FILTER_MASK *lfm) {
-  struct buf_2d *const dst = &plane->dst;
-  uint8_t *const dst0 = dst->buf;
-  int r;
-  uint64_t mask_16x16 = lfm->left_y[TX_16X16];
-  uint64_t mask_8x8 = lfm->left_y[TX_8X8];
-  uint64_t mask_4x4 = lfm->left_y[TX_4X4];
-  uint64_t mask_4x4_int = lfm->int_4x4_y;
-
-  assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
-
-  // Vertical pass: do 2 rows at one time
-  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
-    unsigned int mask_16x16_l = mask_16x16 & 0xffff;
-    unsigned int mask_8x8_l = mask_8x8 & 0xffff;
-    unsigned int mask_4x4_l = mask_4x4 & 0xffff;
-    unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
-
-// Disable filtering on the leftmost column.
-#if CONFIG_HIGHBITDEPTH
-    if (cm->use_highbitdepth)
-      highbd_filter_selectively_vert_row2(
-          plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
-          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_y[r][0], (int)cm->bit_depth);
-    else
-#endif  // CONFIG_HIGHBITDEPTH
-      filter_selectively_vert_row2(
-          plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
-          mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r][0]);
-
-    dst->buf += 2 * MI_SIZE * dst->stride;
-    mask_16x16 >>= 2 * MI_SIZE;
-    mask_8x8 >>= 2 * MI_SIZE;
-    mask_4x4 >>= 2 * MI_SIZE;
-    mask_4x4_int >>= 2 * MI_SIZE;
-  }
-
-  // Horizontal pass
-  dst->buf = dst0;
-}
-
-void av1_filter_block_plane_ss00_hor(AV1_COMMON *const cm,
-                                     struct macroblockd_plane *const plane,
-                                     int mi_row, LOOP_FILTER_MASK *lfm) {
-  struct buf_2d *const dst = &plane->dst;
-  uint8_t *const dst0 = dst->buf;
-  int r;
-  uint64_t mask_16x16 = lfm->above_y[TX_16X16];
-  uint64_t mask_8x8 = lfm->above_y[TX_8X8];
-  uint64_t mask_4x4 = lfm->above_y[TX_4X4];
-  uint64_t mask_4x4_int = lfm->int_4x4_y;
-
-  assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
-
-  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r++) {
-    unsigned int mask_16x16_r;
-    unsigned int mask_8x8_r;
-    unsigned int mask_4x4_r;
-
-    if (mi_row + r == 0) {
-      mask_16x16_r = 0;
-      mask_8x8_r = 0;
-      mask_4x4_r = 0;
-    } else {
-      mask_16x16_r = mask_16x16 & 0xff;
-      mask_8x8_r = mask_8x8 & 0xff;
-      mask_4x4_r = mask_4x4 & 0xff;
+    for (int j = 0; j < 4; ++j) {
+      for (int i = start; i < end; ++i)
+        if (ref_buf[i] != dst_buf[i]) {
+          ref_buf = ref0;
+          dst_buf = dst0;
+          return i + 1;
+        }
+      ref_buf += ref_stride;
+      dst_buf += dst_stride;
     }
-
-#if CONFIG_HIGHBITDEPTH
-    if (cm->use_highbitdepth)
-      highbd_filter_selectively_horiz(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
-          mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r][0],
-          (int)cm->bit_depth);
-    else
-#endif  // CONFIG_HIGHBITDEPTH
-#if !CONFIG_LPF_DIRECT
-      filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                               mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                               &lfm->lfl_y[r][0]);
-#endif  // CONFIG_LPF_DIRECT
-
-    dst->buf += MI_SIZE * dst->stride;
-    mask_16x16 >>= MI_SIZE;
-    mask_8x8 >>= MI_SIZE;
-    mask_4x4 >>= MI_SIZE;
-    mask_4x4_int >>= MI_SIZE;
   }
-  // restore the buf pointer in case there is additional filter pass.
-  dst->buf = dst0;
+  ref_buf = ref0;
+  dst_buf = dst0;
+  return 0;
 }
 
-void av1_filter_block_plane_ss11_ver(AV1_COMMON *const cm,
-                                     struct macroblockd_plane *const plane,
-                                     int mi_row, LOOP_FILTER_MASK *lfm) {
-  struct buf_2d *const dst = &plane->dst;
-  uint8_t *const dst0 = dst->buf;
+void av1_filter_block_plane_ver(AV1_COMMON *const cm,
+                                struct macroblockd_plane *const plane_ptr,
+                                int pl, int mi_row, int mi_col) {
+  struct buf_2d *const dst = &plane_ptr->dst;
   int r, c;
+  const int ssx = plane_ptr->subsampling_x;
+  const int ssy = plane_ptr->subsampling_y;
+  const int mask_cutoff = 0xffff;
+  const int single_step = 1 << ssy;
+  const int r_step = 2 << ssy;
+  uint64_t mask_16x16 = 0;
+  uint64_t mask_8x8 = 0;
+  uint64_t mask_4x4 = 0;
+  uint8_t *lfl;
+  uint8_t *lfl2;
+
+  // filter two rows at a time
+  for (r = 0; r < cm->seq_params.mib_size &&
+              ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
+       r += r_step) {
+    for (c = 0; c < cm->seq_params.mib_size &&
+                ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
+         c += MI_SIZE_64X64) {
+      dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
+      assert(lfm);
+      const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
+      const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
+      int index = 0;
+      const int shift = get_index_shift(col, row, &index);
+      // current and next row should belong to the same mask_idx and index
+      // next row's shift
+      const int row_next = row + single_step;
+      int index_next = 0;
+      const int shift_next = get_index_shift(col, row_next, &index_next);
+      switch (pl) {
+        case 0:
+          mask_16x16 = lfm->left_y[TX_16X16].bits[index];
+          mask_8x8 = lfm->left_y[TX_8X8].bits[index];
+          mask_4x4 = lfm->left_y[TX_4X4].bits[index];
+          lfl = &lfm->lfl_y_ver[row][col];
+          lfl2 = &lfm->lfl_y_ver[row_next][col];
+          break;
+        case 1:
+          mask_16x16 = lfm->left_u[TX_16X16].bits[index];
+          mask_8x8 = lfm->left_u[TX_8X8].bits[index];
+          mask_4x4 = lfm->left_u[TX_4X4].bits[index];
+          lfl = &lfm->lfl_u_ver[row][col];
+          lfl2 = &lfm->lfl_u_ver[row_next][col];
+          break;
+        case 2:
+          mask_16x16 = lfm->left_v[TX_16X16].bits[index];
+          mask_8x8 = lfm->left_v[TX_8X8].bits[index];
+          mask_4x4 = lfm->left_v[TX_4X4].bits[index];
+          lfl = &lfm->lfl_v_ver[row][col];
+          lfl2 = &lfm->lfl_v_ver[row_next][col];
+          break;
+        default: assert(pl >= 0 && pl <= 2); return;
+      }
+      uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
+      uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
+      uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
+      uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
+      uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
+      uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
 
-  uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
-  uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
-  uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
-  uint16_t mask_4x4_int = lfm->left_int_4x4_uv;
-
-  assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
-  assert(plane->plane_type == PLANE_TYPE_UV);
-  memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv));
-
-  // Vertical pass: do 2 rows at one time
-  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
-    for (c = 0; c < (cm->mib_size >> 1); c++) {
-      lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1];
-      lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1];
-    }
-
-    {
-      unsigned int mask_16x16_l = mask_16x16 & 0xff;
-      unsigned int mask_8x8_l = mask_8x8 & 0xff;
-      unsigned int mask_4x4_l = mask_4x4 & 0xff;
-      unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
-
-// Disable filtering on the leftmost column.
-#if CONFIG_HIGHBITDEPTH
       if (cm->use_highbitdepth)
         highbd_filter_selectively_vert_row2(
-            plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
-            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r >> 1][0], (int)cm->bit_depth);
+            ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+            mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+            &cm->lf_info, lfl, lfl2, (int)cm->bit_depth);
       else
-#endif  // CONFIG_HIGHBITDEPTH
-        filter_selectively_vert_row2(plane->subsampling_x, dst->buf,
-                                     dst->stride, mask_16x16_l, mask_8x8_l,
-                                     mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-                                     &lfm->lfl_uv[r >> 1][0]);
-
-      dst->buf += 2 * MI_SIZE * dst->stride;
-      mask_16x16 >>= MI_SIZE;
-      mask_8x8 >>= MI_SIZE;
-      mask_4x4 >>= MI_SIZE;
-      mask_4x4_int >>= MI_SIZE;
+        filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
+                                     mask_16x16_0, mask_8x8_0, mask_4x4_0,
+                                     mask_16x16_1, mask_8x8_1, mask_4x4_1,
+                                     &cm->lf_info, lfl, lfl2);
+      dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
     }
+    dst->buf += 2 * MI_SIZE * dst->stride;
   }
-
-  // Horizontal pass
-  dst->buf = dst0;
 }
 
-void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm,
-                                     struct macroblockd_plane *const plane,
-                                     int mi_row, LOOP_FILTER_MASK *lfm) {
-  struct buf_2d *const dst = &plane->dst;
-  uint8_t *const dst0 = dst->buf;
+void av1_filter_block_plane_hor(AV1_COMMON *const cm,
+                                struct macroblockd_plane *const plane_ptr,
+                                int pl, int mi_row, int mi_col) {
+  struct buf_2d *const dst = &plane_ptr->dst;
   int r, c;
-  uint64_t mask_16x16 = lfm->above_uv[TX_16X16];
-  uint64_t mask_8x8 = lfm->above_uv[TX_8X8];
-  uint64_t mask_4x4 = lfm->above_uv[TX_4X4];
-  uint64_t mask_4x4_int = lfm->above_int_4x4_uv;
-
-  assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
-  memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv));
-
-  // re-porpulate the filter level for uv, same as the code for vertical
-  // filter in av1_filter_block_plane_ss11_ver
-  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
-    for (c = 0; c < (cm->mib_size >> 1); c++) {
-      lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1];
-      lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1];
-    }
-  }
+  const int ssx = plane_ptr->subsampling_x;
+  const int ssy = plane_ptr->subsampling_y;
+  const int mask_cutoff = 0xffff;
+  const int r_step = 1 << ssy;
+  uint64_t mask_16x16 = 0;
+  uint64_t mask_8x8 = 0;
+  uint64_t mask_4x4 = 0;
+  uint8_t *lfl;
+
+  for (r = 0; r < cm->seq_params.mib_size &&
+              ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
+       r += r_step) {
+    for (c = 0; c < cm->seq_params.mib_size &&
+                ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
+         c += MI_SIZE_64X64) {
+      if (mi_row + r == 0) continue;
+
+      dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
+      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
+      assert(lfm);
+      const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
+      const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
+      int index = 0;
+      const int shift = get_index_shift(col, row, &index);
+      switch (pl) {
+        case 0:
+          mask_16x16 = lfm->above_y[TX_16X16].bits[index];
+          mask_8x8 = lfm->above_y[TX_8X8].bits[index];
+          mask_4x4 = lfm->above_y[TX_4X4].bits[index];
+          lfl = &lfm->lfl_y_hor[row][col];
+          break;
+        case 1:
+          mask_16x16 = lfm->above_u[TX_16X16].bits[index];
+          mask_8x8 = lfm->above_u[TX_8X8].bits[index];
+          mask_4x4 = lfm->above_u[TX_4X4].bits[index];
+          lfl = &lfm->lfl_u_hor[row][col];
+          break;
+        case 2:
+          mask_16x16 = lfm->above_v[TX_16X16].bits[index];
+          mask_8x8 = lfm->above_v[TX_8X8].bits[index];
+          mask_4x4 = lfm->above_v[TX_4X4].bits[index];
+          lfl = &lfm->lfl_v_hor[row][col];
+          break;
+        default: assert(pl >= 0 && pl <= 2); return;
+      }
+      mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
+      mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
+      mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
 
-  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
-    const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
-    const unsigned int mask_4x4_int_r =
-        skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
-    unsigned int mask_16x16_r;
-    unsigned int mask_8x8_r;
-    unsigned int mask_4x4_r;
-
-    if (mi_row + r == 0) {
-      mask_16x16_r = 0;
-      mask_8x8_r = 0;
-      mask_4x4_r = 0;
-    } else {
-      mask_16x16_r = mask_16x16 & 0xf;
-      mask_8x8_r = mask_8x8 & 0xf;
-      mask_4x4_r = mask_4x4 & 0xf;
+      if (cm->use_highbitdepth)
+        highbd_filter_selectively_horiz(
+            CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
+            mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->bit_depth);
+      else
+        filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+                                 mask_8x8, mask_4x4, &cm->lf_info, lfl);
+      dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
     }
-
-#if CONFIG_HIGHBITDEPTH
-    if (cm->use_highbitdepth)
-      highbd_filter_selectively_horiz(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
-          mask_4x4_r, mask_4x4_int_r, &cm->lf_info, &lfm->lfl_uv[r >> 1][0],
-          (int)cm->bit_depth);
-    else
-#endif  // CONFIG_HIGHBITDEPTH
-#if !CONFIG_LPF_DIRECT
-      filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                               mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                               &lfm->lfl_uv[r >> 1][0]);
-#endif  // CONFIG_LPF_DIRECT
-
     dst->buf += MI_SIZE * dst->stride;
-    mask_16x16 >>= MI_SIZE / 2;
-    mask_8x8 >>= MI_SIZE / 2;
-    mask_4x4 >>= MI_SIZE / 2;
-    mask_4x4_int >>= MI_SIZE / 2;
   }
-  // restore the buf pointer in case there is additional filter pass.
-  dst->buf = dst0;
 }
-
-#if CONFIG_PARALLEL_DEBLOCKING
-typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;
-static const uint32_t av1_prediction_masks[NUM_EDGE_DIRS][BLOCK_SIZES_ALL] = {
-  // mask for vertical edges filtering
-  {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      2 - 1,   // BLOCK_2X2
-      2 - 1,   // BLOCK_2X4
-      4 - 1,   // BLOCK_4X2
-#endif         // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      4 - 1,   // BLOCK_4X4
-      4 - 1,   // BLOCK_4X8
-      8 - 1,   // BLOCK_8X4
-      8 - 1,   // BLOCK_8X8
-      8 - 1,   // BLOCK_8X16
-      16 - 1,  // BLOCK_16X8
-      16 - 1,  // BLOCK_16X16
-      16 - 1,  // BLOCK_16X32
-      32 - 1,  // BLOCK_32X16
-      32 - 1,  // BLOCK_32X32
-      32 - 1,  // BLOCK_32X64
-      64 - 1,  // BLOCK_64X32
-      64 - 1,  // BLOCK_64X64
-#if CONFIG_EXT_PARTITION
-      64 - 1,   // BLOCK_64X128
-      128 - 1,  // BLOCK_128X64
-      128 - 1,  // BLOCK_128X128
-#endif          // CONFIG_EXT_PARTITION
-      4 - 1,    // BLOCK_4X16,
-      16 - 1,   // BLOCK_16X4,
-      8 - 1,    // BLOCK_8X32,
-      32 - 1,   // BLOCK_32X8,
-      16 - 1,   // BLOCK_16X64,
-      64 - 1,   // BLOCK_64X16
-#if CONFIG_EXT_PARTITION
-      32 - 1,   // BLOCK_32X128
-      128 - 1,  // BLOCK_128X32
-#endif          // CONFIG_EXT_PARTITION
-  },
-  // mask for horizontal edges filtering
-  {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      2 - 1,   // BLOCK_2X2
-      4 - 1,   // BLOCK_2X4
-      2 - 1,   // BLOCK_4X2
-#endif         // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      4 - 1,   // BLOCK_4X4
-      8 - 1,   // BLOCK_4X8
-      4 - 1,   // BLOCK_8X4
-      8 - 1,   // BLOCK_8X8
-      16 - 1,  // BLOCK_8X16
-      8 - 1,   // BLOCK_16X8
-      16 - 1,  // BLOCK_16X16
-      32 - 1,  // BLOCK_16X32
-      16 - 1,  // BLOCK_32X16
-      32 - 1,  // BLOCK_32X32
-      64 - 1,  // BLOCK_32X64
-      32 - 1,  // BLOCK_64X32
-      64 - 1,  // BLOCK_64X64
-#if CONFIG_EXT_PARTITION
-      128 - 1,  // BLOCK_64X128
-      64 - 1,   // BLOCK_128X64
-      128 - 1,  // BLOCK_128X128
-#endif          // CONFIG_EXT_PARTITION
-      16 - 1,   // BLOCK_4X16,
-      4 - 1,    // BLOCK_16X4,
-      32 - 1,   // BLOCK_8X32,
-      8 - 1,    // BLOCK_32X8,
-      64 - 1,   // BLOCK_16X64,
-      16 - 1,   // BLOCK_64X16
-#if CONFIG_EXT_PARTITION
-      128 - 1,  // BLOCK_32X128
-      32 - 1,   // BLOCK_128X32
-#endif          // CONFIG_EXT_PARTITION
-  },
-};
-
-static const uint32_t av1_transform_masks[NUM_EDGE_DIRS][TX_SIZES_ALL] = {
-  {
-#if CONFIG_CHROMA_2X2
-      2 - 1,  // TX_2X2
-#endif
-      4 - 1,   // TX_4X4
-      8 - 1,   // TX_8X8
-      16 - 1,  // TX_16X16
-      32 - 1,  // TX_32X32
-#if CONFIG_TX64X64
-      64 - 1,  // TX_64X64
-#endif         // CONFIG_TX64X64
-      4 - 1,   // TX_4X8
-      8 - 1,   // TX_8X4
-      8 - 1,   // TX_8X16
-      16 - 1,  // TX_16X8
-      16 - 1,  // TX_16X32
-      32 - 1,  // TX_32X16
-#if CONFIG_TX64X64
-      32 - 1,  // TX_32X64
-      64 - 1,  // TX_64X32
-#endif         // CONFIG_TX64X64
-      4 - 1,   // TX_4X16
-      16 - 1,  // TX_16X4
-      8 - 1,   // TX_8X32
-      32 - 1   // TX_32X8
-  },
-  {
-#if CONFIG_CHROMA_2X2
-      2 - 1,  // TX_2X2
-#endif
-      4 - 1,   // TX_4X4
-      8 - 1,   // TX_8X8
-      16 - 1,  // TX_16X16
-      32 - 1,  // TX_32X32
-#if CONFIG_TX64X64
-      64 - 1,  // TX_64X64
-#endif         // CONFIG_TX64X64
-      8 - 1,   // TX_4X8
-      4 - 1,   // TX_8X4
-      16 - 1,  // TX_8X16
-      8 - 1,   // TX_16X8
-      32 - 1,  // TX_16X32
-      16 - 1,  // TX_32X16
-#if CONFIG_TX64X64
-      64 - 1,  // TX_32X64
-      32 - 1,  // TX_64X32
-#endif         // CONFIG_TX64X64
-      16 - 1,  // TX_4X16
-      4 - 1,   // TX_16X4
-      32 - 1,  // TX_8X32
-      8 - 1    // TX_32X8
-  }
-};
-
-static TX_SIZE av1_get_transform_size(const MODE_INFO *const mi,
-                                      const EDGE_DIR edge_dir, const int mi_row,
-                                      const int mi_col, const int plane,
-                                      const struct macroblockd_plane *plane_ptr,
-                                      const uint32_t scale_horz,
-                                      const uint32_t scale_vert) {
-  const MB_MODE_INFO *mbmi = &mi->mbmi;
-  TX_SIZE tx_size = (plane == AOM_PLANE_Y)
-                        ? mbmi->tx_size
-                        : av1_get_uv_tx_size(mbmi, plane_ptr);
+#endif  // LOOP_FILTER_BITMASK
+
+static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
+                                  const MB_MODE_INFO *const mbmi,
+                                  const EDGE_DIR edge_dir, const int mi_row,
+                                  const int mi_col, const int plane,
+                                  const struct macroblockd_plane *plane_ptr) {
+  assert(mbmi != NULL);
+  if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4;
+
+  TX_SIZE tx_size =
+      (plane == AOM_PLANE_Y)
+          ? mbmi->tx_size
+          : av1_get_max_uv_txsize(mbmi->sb_type, plane_ptr->subsampling_x,
+                                  plane_ptr->subsampling_y);
   assert(tx_size < TX_SIZES_ALL);
-
-#if CONFIG_VAR_TX
-  // mi_row and mi_col is the absolute position of the MI block.
-  // idx_c and idx_r is the relative offset of the MI within the super block
-  // c and r is the relative offset of the 8x8 block within the supert block
-  // blk_row and block_col is the relative offset of the current 8x8 block
-  // within the current partition.
-  const int idx_c = mi_col & MAX_MIB_MASK;
-  const int idx_r = mi_row & MAX_MIB_MASK;
-  const int c = idx_c >> mi_width_log2_lookup[BLOCK_8X8];
-  const int r = idx_r >> mi_height_log2_lookup[BLOCK_8X8];
-  const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
-  const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1);
-  const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1);
-
-  if (is_inter_block(mbmi) && !mbmi->skip) {
-    const int tx_row_idx =
-        (blk_row * mi_size_high[BLOCK_8X8] << TX_UNIT_HIGH_LOG2) >> 1;
-    const int tx_col_idx =
-        (blk_col * mi_size_wide[BLOCK_8X8] << TX_UNIT_WIDE_LOG2) >> 1;
-
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    const BLOCK_SIZE bsize =
-        AOMMAX(BLOCK_4X4, ss_size_lookup[sb_type][scale_horz][scale_vert]);
-#else
-    const BLOCK_SIZE bsize = ss_size_lookup[sb_type][scale_horz][scale_vert];
-#endif
-    const TX_SIZE mb_tx_size = mbmi->inter_tx_size[tx_row_idx][tx_col_idx];
-
+  if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip) {
+    const BLOCK_SIZE sb_type = mbmi->sb_type;
+    const int blk_row = mi_row & (mi_size_high[sb_type] - 1);
+    const int blk_col = mi_col & (mi_size_wide[sb_type] - 1);
+    const TX_SIZE mb_tx_size =
+        mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)];
     assert(mb_tx_size < TX_SIZES_ALL);
-
-    tx_size = (plane == AOM_PLANE_Y)
-                  ? mb_tx_size
-                  : uv_txsize_lookup[bsize][mb_tx_size][0][0];
-    assert(tx_size < TX_SIZES_ALL);
+    tx_size = mb_tx_size;
   }
-#else
-  (void)mi_row;
-  (void)mi_col;
-  (void)scale_horz;
-  (void)scale_vert;
-#endif  // CONFIG_VAR_TX
 
   // since in case of chrominance or non-square transorm need to convert
   // transform size into transform size in particular direction.
@@ -2926,111 +1524,84 @@ static TX_SIZE av1_get_transform_size(const MODE_INFO *const mi,
 typedef struct AV1_DEBLOCKING_PARAMETERS {
   // length of the filter applied to the outer edge
   uint32_t filter_length;
-  // length of the filter applied to the inner edge
-  uint32_t filter_length_internal;
   // deblocking limits
   const uint8_t *lim;
   const uint8_t *mblim;
   const uint8_t *hev_thr;
 } AV1_DEBLOCKING_PARAMETERS;
 
-static void set_lpf_parameters(
+// Return TX_SIZE from get_transform_size(), so it is plane and direction
+// awared
+static TX_SIZE set_lpf_parameters(
     AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step,
-    const AV1_COMMON *const cm, const EDGE_DIR edge_dir, const uint32_t x,
-    const uint32_t y, const int plane,
-    const struct macroblockd_plane *const plane_ptr) {
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y,
+    const int plane, const struct macroblockd_plane *const plane_ptr) {
   // reset to initial values
   params->filter_length = 0;
-  params->filter_length_internal = 0;
 
   // no deblocking is required
   const uint32_t width = plane_ptr->dst.width;
   const uint32_t height = plane_ptr->dst.height;
   if ((width <= x) || (height <= y)) {
-    return;
+    // just return the smallest transform unit size
+    return TX_4X4;
   }
 
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
-  const int mi_row = (y << scale_vert) >> MI_SIZE_LOG2;
-  const int mi_col = (x << scale_horz) >> MI_SIZE_LOG2;
-  MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
-  const MB_MODE_INFO *mbmi = &mi[0]->mbmi;
+  // for sub8x8 block, chroma prediction mode is obtained from the bottom/right
+  // mi structure of the co-located 8x8 luma block. so for chroma plane, mi_row
+  // and mi_col should map to the bottom/right mi structure, i.e, both mi_row
+  // and mi_col should be odd number for chroma plane.
+  const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2);
+  const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2);
+  MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
+  const MB_MODE_INFO *mbmi = mi[0];
+  // If current mbmi is not correctly setup, return an invalid value to stop
+  // filtering. One example is that if this tile is not coded, then its mbmi
+  // it not set up.
+  if (mbmi == NULL) return TX_INVALID;
+
+  const TX_SIZE ts =
+      get_transform_size(xd, mi[0], edge_dir, mi_row, mi_col, plane, plane_ptr);
 
   {
-    const TX_SIZE ts =
-        av1_get_transform_size(mi[0], edge_dir, mi_row, mi_col, plane,
-                               plane_ptr, scale_horz, scale_vert);
-
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-    const uint32_t curr_level =
-        get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
-#else
-#if CONFIG_LPF_SB
-    const uint32_t curr_level =
-        get_filter_level(cm, &cm->lf_info, mi_row, mi_col, mbmi);
-#else
-    const uint32_t curr_level = get_filter_level(cm, &cm->lf_info, mbmi);
-#endif  // CONFIG_LPF_SB
-#endif
-#else
-    const uint32_t curr_level = get_filter_level(&cm->lf_info, mbmi);
-#endif  // CONFIG_EXT_DELTA_Q
-
-    const int curr_skipped = mbmi->skip && is_inter_block(mbmi);
     const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y);
-    uint32_t level = curr_level;
+    const uint32_t transform_masks =
+        edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
+    const int32_t tu_edge = (coord & transform_masks) ? (0) : (1);
+
+    if (!tu_edge) return ts;
+
     // prepare outer edge parameters. deblock the edge if it's an edge of a TU
-    if (coord) {
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-      MODE_INFO *const mi_bound = cm->mi + mi_row * cm->mi_stride + mi_col;
-      if (!av1_disable_loopfilter_on_tile_boundary(cm) ||
-          ((VERT_EDGE == edge_dir) &&
-           (0 == (mi_bound->mbmi.boundary_info & TILE_LEFT_BOUNDARY))) ||
-          ((HORZ_EDGE == edge_dir) &&
-           (0 == (mi_bound->mbmi.boundary_info & TILE_ABOVE_BOUNDARY))))
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
-      {
-        const int32_t tu_edge =
-            (coord & av1_transform_masks[edge_dir][ts]) ? (0) : (1);
-        if (tu_edge) {
-          const MODE_INFO *const mi_prev = *(mi - mode_step);
+    {
+      const uint32_t curr_level =
+          get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
+      const int curr_skipped = mbmi->skip && is_inter_block(mbmi);
+      uint32_t level = curr_level;
+      if (coord) {
+        {
+          const MB_MODE_INFO *const mi_prev = *(mi - mode_step);
+          if (mi_prev == NULL) return TX_INVALID;
           const int pv_row =
               (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert));
           const int pv_col =
               (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col);
-          const TX_SIZE pv_ts =
-              av1_get_transform_size(mi_prev, edge_dir, pv_row, pv_col, plane,
-                                     plane_ptr, scale_horz, scale_vert);
-
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-          const uint32_t pv_lvl = get_filter_level(cm, &cm->lf_info, edge_dir,
-                                                   plane, &mi_prev->mbmi);
-#else
-#if CONFIG_LPF_SB
-          const uint32_t pv_lvl = get_filter_level(cm, &cm->lf_info, pv_row,
-                                                   pv_col, &mi_prev->mbmi);
-#else
-          const uint32_t pv_lvl =
-              get_filter_level(cm, &cm->lf_info, &mi_prev->mbmi);
-#endif  // CONFIG_LPF_SB
-#endif
-#else
+          const TX_SIZE pv_ts = get_transform_size(
+              xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr);
+
           const uint32_t pv_lvl =
-              get_filter_level(&cm->lf_info, &mi_prev->mbmi);
-#endif  // CONFIG_EXT_DELTA_Q
-
-          const int pv_skip =
-              mi_prev->mbmi.skip && is_inter_block(&mi_prev->mbmi);
-          const int32_t pu_edge =
-              (coord &
-               av1_prediction_masks[edge_dir]
-                                   [ss_size_lookup[mbmi->sb_type][scale_horz]
-                                                  [scale_vert]])
-                  ? (0)
-                  : (1);
+              get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
+
+          const int pv_skip = mi_prev->skip && is_inter_block(mi_prev);
+          const BLOCK_SIZE bsize =
+              get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x,
+                                   plane_ptr->subsampling_y);
+          const int prediction_masks = edge_dir == VERT_EDGE
+                                           ? block_size_wide[bsize] - 1
+                                           : block_size_high[bsize] - 1;
+          const int32_t pu_edge = !(coord & prediction_masks);
           // if the current and the previous blocks are skipped,
           // deblock the edge if the edge belongs to a PU's edge only.
           if ((curr_level || pv_lvl) &&
@@ -3039,41 +1610,26 @@ static void set_lpf_parameters(
             if (TX_4X4 >= min_ts) {
               params->filter_length = 4;
             } else if (TX_8X8 == min_ts) {
-              params->filter_length = 8;
+              if (plane != 0)
+                params->filter_length = 6;
+              else
+                params->filter_length = 8;
             } else {
-              params->filter_length = 16;
-#if PARALLEL_DEBLOCKING_15TAPLUMAONLY
+              params->filter_length = 14;
               // No wide filtering for chroma plane
               if (plane != 0) {
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
                 params->filter_length = 6;
-#else
-                params->filter_length = 8;
-#endif
               }
-#endif
             }
 
-#if PARALLEL_DEBLOCKING_DISABLE_15TAP
-            params->filter_length = (TX_4X4 >= AOMMIN(ts, pv_ts)) ? (4) : (8);
-#endif  // PARALLEL_DEBLOCKING_DISABLE_15TAP
-
             // update the level if the current block is skipped,
             // but the previous one is not
             level = (curr_level) ? (curr_level) : (pv_lvl);
           }
         }
       }
-
-#if !CONFIG_CB4X4
-      // prepare internal edge parameters
-      if (curr_level && !curr_skipped) {
-        params->filter_length_internal = (TX_4X4 >= ts) ? (4) : (0);
-      }
-#endif
-
       // prepare common parameters
-      if (params->filter_length || params->filter_length_internal) {
+      if (params->filter_length) {
         const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
         params->lim = limits->lim;
         params->mblim = limits->mblim;
@@ -3081,654 +1637,278 @@ static void set_lpf_parameters(
       }
     }
   }
+
+  return ts;
 }
 
-static void av1_filter_block_plane_vert(
-    const AV1_COMMON *const cm, const int plane,
-    const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
-    const uint32_t mi_col) {
-  const int col_step = MI_SIZE >> MI_SIZE_LOG2;
+void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
+                                 const MACROBLOCKD *const xd, const int plane,
+                                 const MACROBLOCKD_PLANE *const plane_ptr,
+                                 const uint32_t mi_row, const uint32_t mi_col) {
   const int row_step = MI_SIZE >> MI_SIZE_LOG2;
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint8_t *const dst_ptr = plane_ptr->dst.buf;
   const int dst_stride = plane_ptr->dst.stride;
-#if CONFIG_LPF_SB
-  int y_range = mi_row ? MAX_MIB_SIZE : MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET;
-  y_range = AOMMIN(y_range, cm->mi_rows);
-  y_range >>= scale_vert;
-
-  int x_range = mi_col ? MAX_MIB_SIZE : MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET;
-  x_range = AOMMIN(x_range, cm->mi_cols);
-  x_range >>= scale_horz;
-#else
   const int y_range = (MAX_MIB_SIZE >> scale_vert);
   const int x_range = (MAX_MIB_SIZE >> scale_horz);
-#endif  // CONFIG_LPF_SB
   for (int y = 0; y < y_range; y += row_step) {
     uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
-    for (int x = 0; x < x_range; x += col_step) {
+    for (int x = 0; x < x_range;) {
       // inner loop always filter vertical edges in a MI block. If MI size
       // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
       // If 4x4 trasnform is used, it will then filter the internal edge
       //  aligned with a 4x4 block
       const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
       const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
+      uint32_t advance_units;
+      TX_SIZE tx_size;
       AV1_DEBLOCKING_PARAMETERS params;
       memset(&params, 0, sizeof(params));
 
-      set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, VERT_EDGE,
-                         curr_x, curr_y, plane, plane_ptr);
-
-#if CONFIG_LPF_DIRECT
-      uint8_t *const src = plane_ptr->dst.buf0;
-      const int width = cm->width >> scale_horz;
-      const int height = cm->height >> scale_vert;
-      const int pivot = 8;
-      const int line_length = 16;
-      uint8_t block[128];
-      int orig_pos[128];
-      const int vert_or_horz = 0;  // 0: vertical
-      const int unit = 1;
-      int i;
-      for (i = 0; i < 128; ++i) {
-        block[i] = 0;
-        orig_pos[i] = -1;
+      tx_size =
+          set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
+                             VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
+      if (tx_size == TX_INVALID) {
+        params.filter_length = 0;
+        tx_size = TX_4X4;
       }
 
-      if (params.filter_length) {
-        const int filt_len = params.filter_length == 16 ? 8 : 4;
-        const int direct =
-            pick_min_grad_direct(src, filt_len, curr_y, curr_x, width, height,
-                                 dst_stride, unit, vert_or_horz);
-
-        pick_filter_block_vert(src, block, orig_pos, filt_len, curr_y, curr_x,
-                               width, height, dst_stride, pivot, line_length,
-                               unit, direct);
-        uint8_t *const filt_start = block + pivot;
-        switch (params.filter_length) {
-          // apply 4-tap filtering
-          case 4:
-#if CONFIG_HIGHBITDEPTH
-            if (cm->use_highbitdepth)
-              aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(filt_start),
-                                        line_length, params.mblim, params.lim,
-                                        params.hev_thr, cm->bit_depth);
-            else
-#endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_vertical_4(filt_start, line_length, params.mblim,
-                                 params.lim, params.hev_thr);
-            break;
-          // apply 8-tap filtering
-          case 8:
-#if CONFIG_HIGHBITDEPTH
-            if (cm->use_highbitdepth)
-              aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(filt_start),
-                                        line_length, params.mblim, params.lim,
-                                        params.hev_thr, cm->bit_depth);
-            else
-#endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_vertical_8(filt_start, line_length, params.mblim,
-                                 params.lim, params.hev_thr);
-            break;
-          // apply 16-tap filtering
-          case 16:
-#if CONFIG_HIGHBITDEPTH
-            if (cm->use_highbitdepth)
-              aom_highbd_lpf_vertical_16(CONVERT_TO_SHORTPTR(filt_start),
-                                         line_length, params.mblim, params.lim,
-                                         params.hev_thr, cm->bit_depth);
-            else
-#endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_vertical_16(filt_start, line_length, params.mblim,
-                                  params.lim, params.hev_thr);
-            break;
-          // no filtering
-          default: break;
-        }
-
-        for (i = 0; i < 128; ++i) {
-          if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-        }
-      }
-
-      if (params.filter_length_internal) {
-        for (i = 0; i < 128; ++i) {
-          block[i] = 0;
-          orig_pos[i] = -1;
-        }
-
-        const int direct =
-            pick_min_grad_direct(src, 4, curr_y, curr_x + 4, width, height,
-                                 dst_stride, unit, vert_or_horz);
-
-        pick_filter_block_vert(src, block, orig_pos, 4, curr_y, curr_x + 4,
-                               width, height, dst_stride, pivot, line_length,
-                               unit, direct);
-
-        uint8_t *const filt_start = block + pivot;
-#if CONFIG_HIGHBITDEPTH
-        if (cm->use_highbitdepth)
-          aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(filt_start),
-                                    line_length, params.mblim, params.lim,
-                                    params.hev_thr, cm->bit_depth);
-        else
-#endif  // CONFIG_HIGHBITDEPTH
-          aom_lpf_vertical_4(filt_start, line_length, params.mblim, params.lim,
-                             params.hev_thr);
-
-        for (i = 0; i < 128; ++i) {
-          if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-        }
-      }
-#else  // !CONFIG_LPF_DIRECT
       switch (params.filter_length) {
         // apply 4-tap filtering
         case 4:
-#if CONFIG_HIGHBITDEPTH
           if (cm->use_highbitdepth)
             aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride,
                                       params.mblim, params.lim, params.hev_thr,
                                       cm->bit_depth);
           else
-#endif  // CONFIG_HIGHBITDEPTH
             aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
                                params.hev_thr);
           break;
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
         case 6:  // apply 6-tap filter for chroma plane only
           assert(plane != 0);
-#if CONFIG_HIGHBITDEPTH
           if (cm->use_highbitdepth)
-            aom_highbd_lpf_vertical_6_c(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                        params.mblim, params.lim,
-                                        params.hev_thr, cm->bit_depth);
+            aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride,
+                                      params.mblim, params.lim, params.hev_thr,
+                                      cm->bit_depth);
           else
-#endif  // CONFIG_HIGHBITDEPTH
-            aom_lpf_vertical_6_c(p, dst_stride, params.mblim, params.lim,
-                                 params.hev_thr);
+            aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
+                               params.hev_thr);
           break;
-#endif
         // apply 8-tap filtering
         case 8:
-#if CONFIG_HIGHBITDEPTH
           if (cm->use_highbitdepth)
             aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride,
                                       params.mblim, params.lim, params.hev_thr,
                                       cm->bit_depth);
           else
-#endif  // CONFIG_HIGHBITDEPTH
             aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
                                params.hev_thr);
           break;
-        // apply 16-tap filtering
-        case 16:
-#if CONFIG_HIGHBITDEPTH
+        // apply 14-tap filtering
+        case 14:
           if (cm->use_highbitdepth)
-#if CONFIG_DEBLOCK_13TAP
-            // TODO(olah): Remove _c once SIMD for 13-tap is available
-            aom_highbd_lpf_vertical_16_c(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                         params.mblim, params.lim,
-                                         params.hev_thr, cm->bit_depth);
-#else
-            aom_highbd_lpf_vertical_16(CONVERT_TO_SHORTPTR(p), dst_stride,
+            aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride,
                                        params.mblim, params.lim, params.hev_thr,
                                        cm->bit_depth);
-#endif
           else
-#endif  // CONFIG_HIGHBITDEPTH
-#if CONFIG_DEBLOCK_13TAP
-            aom_lpf_vertical_16_c(p, dst_stride, params.mblim, params.lim,
-                                  params.hev_thr);
-#else
-          aom_lpf_vertical_16(p, dst_stride, params.mblim, params.lim,
-                              params.hev_thr);
-#endif
+            aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
+                                params.hev_thr);
           break;
         // no filtering
         default: break;
       }
-      // process the internal edge
-      if (params.filter_length_internal) {
-#if CONFIG_HIGHBITDEPTH
-        if (cm->use_highbitdepth)
-          aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p + 4), dst_stride,
-                                    params.mblim, params.lim, params.hev_thr,
-                                    cm->bit_depth);
-        else
-#endif  // CONFIG_HIGHBITDEPTH
-          aom_lpf_vertical_4(p + 4, dst_stride, params.mblim, params.lim,
-                             params.hev_thr);
-      }
-#endif  // CONFIG_LPF_DIRECT
       // advance the destination pointer
-      p += MI_SIZE;
+      advance_units = tx_size_wide_unit[tx_size];
+      x += advance_units;
+      p += advance_units * MI_SIZE;
     }
   }
 }
 
-static void av1_filter_block_plane_horz(
-    const AV1_COMMON *const cm, const int plane,
-    const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
-    const uint32_t mi_col) {
+void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
+                                 const MACROBLOCKD *const xd, const int plane,
+                                 const MACROBLOCKD_PLANE *const plane_ptr,
+                                 const uint32_t mi_row, const uint32_t mi_col) {
   const int col_step = MI_SIZE >> MI_SIZE_LOG2;
-  const int row_step = MI_SIZE >> MI_SIZE_LOG2;
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint8_t *const dst_ptr = plane_ptr->dst.buf;
   const int dst_stride = plane_ptr->dst.stride;
-#if CONFIG_LPF_SB
-  int y_range = mi_row ? MAX_MIB_SIZE : MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET;
-  y_range = AOMMIN(y_range, cm->mi_rows);
-  y_range >>= scale_vert;
-
-  int x_range = mi_col ? MAX_MIB_SIZE : MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET;
-  x_range = AOMMIN(x_range, cm->mi_cols);
-  x_range >>= scale_horz;
-#else
   const int y_range = (MAX_MIB_SIZE >> scale_vert);
   const int x_range = (MAX_MIB_SIZE >> scale_horz);
-#endif  // CONFIG_LPF_SB
-  for (int y = 0; y < y_range; y += row_step) {
-    uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
-    for (int x = 0; x < x_range; x += col_step) {
+  for (int x = 0; x < x_range; x += col_step) {
+    uint8_t *p = dst_ptr + x * MI_SIZE;
+    for (int y = 0; y < y_range;) {
       // inner loop always filter vertical edges in a MI block. If MI size
       // is 8x8, it will first filter the vertical edge aligned with a 8x8
       // block. If 4x4 trasnform is used, it will then filter the internal
       // edge aligned with a 4x4 block
       const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
       const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
+      uint32_t advance_units;
+      TX_SIZE tx_size;
       AV1_DEBLOCKING_PARAMETERS params;
       memset(&params, 0, sizeof(params));
 
-      set_lpf_parameters(&params, (cm->mi_stride << scale_vert), cm, HORZ_EDGE,
-                         curr_x, curr_y, plane, plane_ptr);
-
-#if CONFIG_LPF_DIRECT
-      uint8_t *const src = plane_ptr->dst.buf0;
-      const int width = cm->width >> scale_horz;
-      const int height = cm->height >> scale_vert;
-      const int pivot = 8;
-      const int line_length = 16;
-      uint8_t block[256];
-      int orig_pos[256];
-      const int vert_or_horz = 1;  // 1: horizontal
-      const int unit = 1;
-      int i;
-      for (i = 0; i < 256; ++i) {
-        block[i] = 0;
-        orig_pos[i] = -1;
-      }
-
-      if (params.filter_length) {
-        const int filt_len = params.filter_length == 16 ? 8 : 4;
-        const int direct =
-            pick_min_grad_direct(src, filt_len, curr_y, curr_x, width, height,
-                                 dst_stride, unit, vert_or_horz);
-
-        pick_filter_block_horz(src, block, orig_pos, filt_len, curr_y, curr_x,
-                               width, height, dst_stride, pivot, line_length,
-                               unit, direct);
-        uint8_t *const filt_start = block + pivot * line_length;
-        switch (params.filter_length) {
-          // apply 4-tap filtering
-          case 4:
-#if CONFIG_HIGHBITDEPTH
-            if (cm->use_highbitdepth)
-              aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(filt_start),
-                                          line_length, params.mblim, params.lim,
-                                          params.hev_thr, cm->bit_depth);
-            else
-#endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_horizontal_4(filt_start, line_length, params.mblim,
-                                   params.lim, params.hev_thr);
-            break;
-          // apply 8-tap filtering
-          case 8:
-#if CONFIG_HIGHBITDEPTH
-            if (cm->use_highbitdepth)
-              aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(filt_start),
-                                          line_length, params.mblim, params.lim,
-                                          params.hev_thr, cm->bit_depth);
-            else
-#endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_horizontal_8(filt_start, line_length, params.mblim,
-                                   params.lim, params.hev_thr);
-            break;
-          // apply 16-tap filtering
-          case 16:
-#if CONFIG_HIGHBITDEPTH
-            if (cm->use_highbitdepth)
-              aom_highbd_lpf_horizontal_edge_16(
-                  CONVERT_TO_SHORTPTR(filt_start), line_length, params.mblim,
-                  params.lim, params.hev_thr, cm->bit_depth);
-            else
-#endif  // CONFIG_HIGHBITDEPTH
-              aom_lpf_horizontal_edge_16(filt_start, line_length, params.mblim,
-                                         params.lim, params.hev_thr);
-            break;
-          // no filtering
-          default: break;
-        }
-
-        for (i = 0; i < 256; ++i) {
-          if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-        }
+      tx_size =
+          set_lpf_parameters(&params, (cm->mi_stride << scale_vert), cm, xd,
+                             HORZ_EDGE, curr_x, curr_y, plane, plane_ptr);
+      if (tx_size == TX_INVALID) {
+        params.filter_length = 0;
+        tx_size = TX_4X4;
       }
-      if (params.filter_length_internal) {
-        for (i = 0; i < 256; ++i) {
-          block[i] = 0;
-          orig_pos[i] = -1;
-        }
 
-        const int direct =
-            pick_min_grad_direct(src, 4, curr_y + 4, curr_x, width, height,
-                                 dst_stride, unit, vert_or_horz);
-
-        pick_filter_block_horz(src, block, orig_pos, 4, curr_y + 4, curr_x,
-                               width, height, dst_stride, pivot, line_length,
-                               unit, direct);
-
-        uint8_t *const filt_start = block + pivot * line_length;
-#if CONFIG_HIGHBITDEPTH
-        if (cm->use_highbitdepth)
-          aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(filt_start),
-                                      line_length, params.mblim, params.lim,
-                                      params.hev_thr, cm->bit_depth);
-        else
-#endif  // CONFIG_HIGHBITDEPTH
-          aom_lpf_horizontal_4(filt_start, line_length, params.mblim,
-                               params.lim, params.hev_thr);
-
-        for (i = 0; i < 256; ++i) {
-          if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
-        }
-      }
-#else  // !CONFIG_LPF_DIRECT
       switch (params.filter_length) {
         // apply 4-tap filtering
         case 4:
-#if CONFIG_HIGHBITDEPTH
           if (cm->use_highbitdepth)
             aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride,
                                         params.mblim, params.lim,
                                         params.hev_thr, cm->bit_depth);
           else
-#endif  // CONFIG_HIGHBITDEPTH
             aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
                                  params.hev_thr);
           break;
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
         // apply 6-tap filtering
-        case 6: assert(plane != 0);
-#if CONFIG_HIGHBITDEPTH
+        case 6:
+          assert(plane != 0);
           if (cm->use_highbitdepth)
-            aom_highbd_lpf_horizontal_6_c(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                          params.mblim, params.lim,
-                                          params.hev_thr, cm->bit_depth);
+            aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride,
+                                        params.mblim, params.lim,
+                                        params.hev_thr, cm->bit_depth);
           else
-#endif  // CONFIG_HIGHBITDEPTH
-            aom_lpf_horizontal_6_c(p, dst_stride, params.mblim, params.lim,
-                                   params.hev_thr);
+            aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
+                                 params.hev_thr);
           break;
-#endif
         // apply 8-tap filtering
         case 8:
-#if CONFIG_HIGHBITDEPTH
           if (cm->use_highbitdepth)
             aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride,
                                         params.mblim, params.lim,
                                         params.hev_thr, cm->bit_depth);
           else
-#endif  // CONFIG_HIGHBITDEPTH
             aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
                                  params.hev_thr);
           break;
-        // apply 16-tap filtering
-        case 16:
-#if CONFIG_HIGHBITDEPTH
+        // apply 14-tap filtering
+        case 14:
           if (cm->use_highbitdepth)
-#if CONFIG_DEBLOCK_13TAP
-            // TODO(olah): Remove _c once SIMD for 13-tap is available
-            aom_highbd_lpf_horizontal_edge_16_c(
-                CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim,
-                params.hev_thr, cm->bit_depth);
-#else
-            aom_highbd_lpf_horizontal_edge_16(
-                CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim,
-                params.hev_thr, cm->bit_depth);
-#endif
+            aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride,
+                                         params.mblim, params.lim,
+                                         params.hev_thr, cm->bit_depth);
           else
-#endif  // CONFIG_HIGHBITDEPTH
-#if CONFIG_DEBLOCK_13TAP
-            aom_lpf_horizontal_edge_16_c(p, dst_stride, params.mblim,
-                                         params.lim, params.hev_thr);
-#else
-          aom_lpf_horizontal_edge_16(p, dst_stride, params.mblim, params.lim,
-                                     params.hev_thr);
-#endif
+            aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
+                                  params.hev_thr);
           break;
         // no filtering
         default: break;
       }
-      // process the internal edge
-      if (params.filter_length_internal) {
-#if CONFIG_HIGHBITDEPTH
-        if (cm->use_highbitdepth)
-          aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p + 4 * dst_stride),
-                                      dst_stride, params.mblim, params.lim,
-                                      params.hev_thr, cm->bit_depth);
-        else
-#endif  // CONFIG_HIGHBITDEPTH
-          aom_lpf_horizontal_4(p + 4 * dst_stride, dst_stride, params.mblim,
-                               params.lim, params.hev_thr);
-      }
-#endif  // CONFIG_LPF_DIRECT
+
       // advance the destination pointer
-      p += MI_SIZE;
+      advance_units = tx_size_high_unit[tx_size];
+      y += advance_units;
+      p += advance_units * dst_stride * MI_SIZE;
     }
   }
 }
-#endif  // CONFIG_PARALLEL_DEBLOCKING
 
-void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
-                          struct macroblockd_plane *planes, int start, int stop,
-#if CONFIG_LPF_SB
-                          int col_start, int col_end,
-#endif
-                          int y_only) {
-#if CONFIG_LOOPFILTER_LEVEL
-  // y_only no longer has its original meaning.
-  // Here it means which plane to filter
-  // when y_only = {0, 1, 2}, it means we are searching for filter level for
-  // Y/U/V plane individually.
-  const int plane_start = y_only;
-  const int plane_end = plane_start + 1;
-#else
-  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  const int plane_start = 0;
-  const int plane_end = num_planes;
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#if !CONFIG_LPF_SB
+static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
+                             MACROBLOCKD *xd, int start, int stop,
+                             int plane_start, int plane_end) {
+  struct macroblockd_plane *pd = xd->plane;
   const int col_start = 0;
   const int col_end = cm->mi_cols;
-#endif  // CONFIG_LPF_SB
   int mi_row, mi_col;
   int plane;
 
-#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \
-    CONFIG_CB4X4
-
-#if !CONFIG_PARALLEL_DEBLOCKING
-#if CONFIG_VAR_TX
-  for (int i = 0; i < MAX_MB_PLANE; ++i)
-    memset(cm->top_txfm_context[i], TX_32X32, cm->mi_cols << TX_UNIT_WIDE_LOG2);
-#endif  // CONFIG_VAR_TX
-  for (mi_row = start; mi_row < stop; mi_row += cm->mib_size) {
-    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
-#if CONFIG_VAR_TX
-    for (int i = 0; i < MAX_MB_PLANE; ++i)
-      memset(cm->left_txfm_context[i], TX_32X32,
-             MAX_MIB_SIZE << TX_UNIT_HIGH_LOG2);
-#endif  // CONFIG_VAR_TX
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) {
-      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
-
-      for (plane = plane_start; plane < plane_end; ++plane) {
-        av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
-                                          mi_row, mi_col, plane);
-        av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
-                                          mi_row, mi_col, plane);
-      }
-    }
-  }
-#else
-
-  // filter all vertical edges in every 64x64 super block
-  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-    for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
-      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
-      for (plane = plane_start; plane < plane_end; ++plane) {
-        av1_filter_block_plane_vert(cm, plane, &planes[plane], mi_row, mi_col);
-      }
-    }
-  }
+  for (plane = plane_start; plane < plane_end; plane++) {
+    if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
+      break;
+    else if (plane == 1 && !(cm->lf.filter_level_u))
+      continue;
+    else if (plane == 2 && !(cm->lf.filter_level_v))
+      continue;
 
-  // filter all horizontal edges in every 64x64 super block
-  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-    for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
-      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
-      for (plane = plane_start; plane < plane_end; ++plane) {
-        av1_filter_block_plane_horz(cm, plane, &planes[plane], mi_row, mi_col);
+#if LOOP_FILTER_BITMASK
+    // filter all vertical edges every superblock (could be 128x128 or 64x64)
+    for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) {
+      for (mi_col = col_start; mi_col < col_end;
+           mi_col += cm->seq_params.mib_size) {
+        av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+                             mi_col, plane, plane + 1);
+
+        av1_setup_bitmask(cm, mi_row, mi_col, plane, pd[plane].subsampling_x,
+                          pd[plane].subsampling_y, stop, col_end);
+        av1_filter_block_plane_ver(cm, &pd[plane], plane, mi_row, mi_col);
       }
     }
-  }
-#endif  // CONFIG_PARALLEL_DEBLOCKING
 
-#else  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+    // filter all horizontal edges every superblock
+    for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) {
+      for (mi_col = col_start; mi_col < col_end;
+           mi_col += cm->seq_params.mib_size) {
+        av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+                             mi_col, plane, plane + 1);
 
-#if CONFIG_PARALLEL_DEBLOCKING
-  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
-      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
-      // filter all vertical edges in every 64x64 super block
-      for (plane = plane_start; plane < plane_end; plane += 1) {
-        av1_filter_block_plane_vert(cm, plane, &planes[plane], mi_row, mi_col);
+        av1_filter_block_plane_hor(cm, &pd[plane], plane, mi_row, mi_col);
       }
     }
-  }
-  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
-      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
-      // filter all horizontal edges in every 64x64 super block
-      for (plane = plane_start; plane < plane_end; plane += 1) {
-        av1_filter_block_plane_horz(cm, plane, &planes[plane], mi_row, mi_col);
+#else
+    if (cm->lf.combine_vert_horz_lf) {
+      // filter all vertical and horizontal edges in every 128x128 super block
+      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+        for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
+          // filter vertical edges
+          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+                               mi_col, plane, plane + 1);
+          av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
+                                      mi_col);
+          // filter horizontal edges
+          if (mi_col - MAX_MIB_SIZE >= 0) {
+            av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer,
+                                 mi_row, mi_col - MAX_MIB_SIZE, plane,
+                                 plane + 1);
+            av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+                                        mi_col - MAX_MIB_SIZE);
+          }
+        }
+        // filter horizontal edges
+        av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+                             mi_col - MAX_MIB_SIZE, plane, plane + 1);
+        av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+                                    mi_col - MAX_MIB_SIZE);
       }
-    }
-  }
-#else   // CONFIG_PARALLEL_DEBLOCKING
-  enum lf_path path;
-  LOOP_FILTER_MASK lfm;
-
-  if (y_only)
-    path = LF_PATH_444;
-  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
-    path = LF_PATH_420;
-  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
-    path = LF_PATH_444;
-  else
-    path = LF_PATH_SLOW;
-
-  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
-      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
-
-      // TODO(JBB): Make setup_mask work for non 420.
-      av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
-
-      av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
-      av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
-      for (plane = 1; plane < num_planes; ++plane) {
-        switch (path) {
-          case LF_PATH_420:
-            av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
-            av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_444:
-            av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
-            av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_SLOW:
-            av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
-                                              mi_row, mi_col, plane);
-            av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
-                                              mi_row, mi_col, plane);
-
-            break;
+    } else {
+      // filter all vertical edges in every 128x128 super block
+      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+        for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
+          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+                               mi_col, plane, plane + 1);
+          av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
+                                      mi_col);
+        }
+      }
+
+      // filter all horizontal edges in every 128x128 super block
+      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+        for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
+          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+                               mi_col, plane, plane + 1);
+          av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+                                      mi_col);
         }
       }
     }
+#endif  // LOOP_FILTER_BITMASK
   }
-#endif  // CONFIG_PARALLEL_DEBLOCKING
-#endif  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
 }
 
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                           MACROBLOCKD *xd, int frame_filter_level,
-#if CONFIG_LOOPFILTER_LEVEL
-                           int frame_filter_level_r,
-#endif
-                           int y_only, int partial_frame
-#if CONFIG_LPF_SB
-                           ,
-                           int mi_row, int mi_col
-#endif
-                           ) {
+                           MACROBLOCKD *xd, int plane_start, int plane_end,
+                           int partial_frame) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-  int orig_filter_level[2] = { cm->lf.filter_level[0], cm->lf.filter_level[1] };
-#else
-  int orig_filter_level = cm->lf.filter_level;
-#endif
-#endif
 
-#if CONFIG_LPF_SB
-  if (partial_frame && !frame_filter_level) return;
-#else
-#if CONFIG_LOOPFILTER_LEVEL
-  if (!frame_filter_level && !frame_filter_level_r) return;
-#else
-  if (!frame_filter_level) return;
-#endif
-#endif  // CONFIG_LPF_SB
-#if CONFIG_LPF_SB
-  int start_mi_col;
-  int end_mi_col;
-
-  // In the experiment of deblocking filtering per superblock.
-  // When partial_frame is 1, it indicates we are searching for the best filter
-  // level for current superblock. We reuse frame_filter_level as filter level
-  // for superblock, no longer for the whole frame.
-  // When partial_frame is 0, it's in the actual filtering stage for the frame
-  if (partial_frame) {
-    start_mi_row = AOMMAX(0, mi_row - FILT_BOUNDARY_MI_OFFSET);
-    start_mi_col = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET);
-    const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
-    const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
-    end_mi_row = AOMMIN(mi_row_range, cm->mi_rows);
-    end_mi_col = AOMMIN(mi_col_range, cm->mi_cols);
-
-    av1_loop_filter_sb_level_init(cm, mi_row, mi_col, frame_filter_level);
-  } else {
-    start_mi_row = 0;
-    mi_rows_to_filter = cm->mi_rows;
-    end_mi_row = start_mi_row + mi_rows_to_filter;
-    start_mi_col = 0;
-    end_mi_col = cm->mi_cols;
-  }
-#else
   start_mi_row = 0;
   mi_rows_to_filter = cm->mi_rows;
   if (partial_frame && cm->mi_rows > 8) {
@@ -3737,61 +1917,7 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
     mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
   }
   end_mi_row = start_mi_row + mi_rows_to_filter;
-#if CONFIG_LOOPFILTER_LEVEL
-  // TODO(chengchen): refactor the code such that y_only has its matching
-  // meaning. Now it means the plane to be filtered in this experiment.
-  av1_loop_filter_frame_init(cm, frame_filter_level, frame_filter_level_r,
-                             y_only);
-#else
-  av1_loop_filter_frame_init(cm, frame_filter_level, frame_filter_level);
-#endif
-#endif  // CONFIG_LPF_SB
-
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-  cm->lf.filter_level[0] = frame_filter_level;
-  cm->lf.filter_level[1] = frame_filter_level_r;
-#else
-  cm->lf.filter_level = frame_filter_level;
-#endif
-#endif
-
-#if CONFIG_LPF_SB
-  av1_loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row,
-                       start_mi_col, end_mi_col, y_only);
-#else
-  av1_loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only);
-#endif  // CONFIG_LPF_SB
-
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-  cm->lf.filter_level[0] = orig_filter_level[0];
-  cm->lf.filter_level[1] = orig_filter_level[1];
-#else
-  cm->lf.filter_level = orig_filter_level;
-#endif
-#endif
-}
-
-void av1_loop_filter_data_reset(LFWorkerData *lf_data,
-                                YV12_BUFFER_CONFIG *frame_buffer,
-                                struct AV1Common *cm,
-                                const struct macroblockd_plane *planes) {
-  lf_data->frame_buffer = frame_buffer;
-  lf_data->cm = cm;
-  lf_data->start = 0;
-  lf_data->stop = 0;
-  lf_data->y_only = 0;
-  memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
-}
-
-int av1_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
-  (void)unused;
-#if !CONFIG_LPF_SB
-  av1_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                       lf_data->start, lf_data->stop, lf_data->y_only);
-#else
-  (void)lf_data;
-#endif  // CONFIG_LPF_SB
-  return 1;
+  av1_loop_filter_frame_init(cm, plane_start, plane_end);
+  loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
+                   plane_end);
 }
diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h
index ee32c368c..c35c3b2dc 100644
--- a/third_party/aom/av1/common/av1_loopfilter.h
+++ b/third_party/aom/av1/common/av1_loopfilter.h
@@ -12,9 +12,9 @@
 #ifndef AV1_COMMON_LOOPFILTER_H_
 #define AV1_COMMON_LOOPFILTER_H_
 
-#include "aom_ports/mem.h"
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
+#include "aom_ports/mem.h"
 #include "av1/common/blockd.h"
 #include "av1/common/seg_common.h"
 
@@ -27,37 +27,111 @@ extern "C" {
 
 #define SIMD_WIDTH 16
 
-#define MAX_MODE_LF_DELTAS 2
-
 enum lf_path {
   LF_PATH_420,
   LF_PATH_444,
   LF_PATH_SLOW,
 };
 
+#if LOOP_FILTER_BITMASK
+typedef struct {
+  uint64_t bits[4];
+} FilterMask;
+
+// This structure holds bit masks for all 4x4 blocks in a 64x64 region.
+// Each 1 bit represents a position in which we want to apply the loop filter.
+// For Y plane, 4x4 in 64x64 requires 16x16 = 256 bit, therefore we use 4
+// uint64_t; For U, V plane, for 420 format, plane size is 32x32, thus we use
+// a uint64_t to represent bitmask.
+// Left_ entries refer to whether we apply a filter on the border to the
+// left of the block.   Above_ entries refer to whether or not to apply a
+// filter on the above border.
+// Since each transform is accompanied by a potentially different type of
+// loop filter there is a different entry in the array for each transform size.
+typedef struct {
+  FilterMask left_y[TX_SIZES];
+  FilterMask above_y[TX_SIZES];
+  FilterMask left_u[TX_SIZES];
+  FilterMask above_u[TX_SIZES];
+  FilterMask left_v[TX_SIZES];
+  FilterMask above_v[TX_SIZES];
+
+  // Y plane vertical edge and horizontal edge filter level
+  uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64];
+  uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+
+  // U plane vertical edge and horizontal edge filter level
+  uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64];
+  uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+
+  // V plane vertical edge and horizontal edge filter level
+  uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64];
+  uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+} LoopFilterMask;
+
+// To determine whether to apply loop filtering at one transform block edge,
+// we need information of the neighboring transform block. Specifically,
+// in determining a vertical edge, we need the information of the tx block
+// to its left. For a horizontal edge, we need info of the tx block above it.
+// Thus, we need to record info of right column and bottom row of tx blocks.
+// We record the information of the neighboring superblock, when bitmask
+// building for a superblock is finished. And it will be used for next
+// superblock bitmask building.
+// Information includes:
+// ------------------------------------------------------------
+//                    MI_SIZE_64X64
+// Y  tx_size above |--------------|
+// Y  tx_size left  |--------------|
+// UV tx_size above |--------------|
+// UV tx_size left  |--------------|
+// Y level above    |--------------|
+// Y level left     |--------------|
+// U level above    |--------------|
+// U level left     |--------------|
+// V level above    |--------------|
+// V level left     |--------------|
+// skip             |--------------|
+// ------------------------------------------------------------
+typedef struct {
+  TX_SIZE tx_size_y_above[MI_SIZE_64X64];
+  TX_SIZE tx_size_y_left[MI_SIZE_64X64];
+  TX_SIZE tx_size_uv_above[MI_SIZE_64X64];
+  TX_SIZE tx_size_uv_left[MI_SIZE_64X64];
+  uint8_t y_level_above[MI_SIZE_64X64];
+  uint8_t y_level_left[MI_SIZE_64X64];
+  uint8_t u_level_above[MI_SIZE_64X64];
+  uint8_t u_level_left[MI_SIZE_64X64];
+  uint8_t v_level_above[MI_SIZE_64X64];
+  uint8_t v_level_left[MI_SIZE_64X64];
+  uint8_t skip[MI_SIZE_64X64];
+} LpfSuperblockInfo;
+#endif  // LOOP_FILTER_BITMASK
+
 struct loopfilter {
-#if CONFIG_LOOPFILTER_LEVEL
   int filter_level[2];
   int filter_level_u;
   int filter_level_v;
-#else
-  int filter_level;
-#endif
 
   int sharpness_level;
-  int last_sharpness_level;
 
   uint8_t mode_ref_delta_enabled;
   uint8_t mode_ref_delta_update;
 
-  // 0 = Intra, Last, Last2+Last3(CONFIG_EXT_REFS),
-  // GF, BRF(CONFIG_EXT_REFS), ARF2(CONFIG_EXT_REFS), ARF
-  int8_t ref_deltas[TOTAL_REFS_PER_FRAME];
-  int8_t last_ref_deltas[TOTAL_REFS_PER_FRAME];
+  // 0 = Intra, Last, Last2+Last3,
+  // GF, BRF, ARF2, ARF
+  int8_t ref_deltas[REF_FRAMES];
 
   // 0 = ZERO_MV, MV
   int8_t mode_deltas[MAX_MODE_LF_DELTAS];
-  int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
+
+  int combine_vert_horz_lf;
+
+#if LOOP_FILTER_BITMASK
+  LoopFilterMask *lfm;
+  size_t lfm_num;
+  int lfm_stride;
+  LpfSuperblockInfo neighbor_sb_lpf_info;
+#endif  // LOOP_FILTER_BITMASK
 };
 
 // Need to align this structure so when it is declared and
@@ -70,127 +144,56 @@ typedef struct {
 
 typedef struct {
   loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
-#if CONFIG_LOOPFILTER_LEVEL
-  uint8_t lvl[MAX_SEGMENTS][2][TOTAL_REFS_PER_FRAME][MAX_MODE_LF_DELTAS];
-#else
-  uint8_t lvl[MAX_SEGMENTS][TOTAL_REFS_PER_FRAME][MAX_MODE_LF_DELTAS];
-#endif
+  uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS];
 } loop_filter_info_n;
 
-// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
-// Each 1 bit represents a position in which we want to apply the loop filter.
-// Left_ entries refer to whether we apply a filter on the border to the
-// left of the block.   Above_ entries refer to whether or not to apply a
-// filter on the above border.   Int_ entries refer to whether or not to
-// apply borders on the 4x4 edges within the 8x8 block that each bit
-// represents.
-// Since each transform is accompanied by a potentially different type of
-// loop filter there is a different entry in the array for each transform size.
-typedef struct {
-  uint64_t left_y[TX_SIZES];
-  uint64_t above_y[TX_SIZES];
-  uint64_t int_4x4_y;
-  uint16_t left_uv[TX_SIZES];
-  uint16_t above_uv[TX_SIZES];
-  uint16_t left_int_4x4_uv;
-  uint16_t above_int_4x4_uv;
-  uint8_t lfl_y[MAX_MIB_SIZE][MAX_MIB_SIZE];
-  uint8_t lfl_uv[MAX_MIB_SIZE / 2][MAX_MIB_SIZE / 2];
-} LOOP_FILTER_MASK;
-
 /* assorted loopfilter functions which get used elsewhere */
 struct AV1Common;
 struct macroblockd;
 struct AV1LfSyncData;
 
-// This function sets up the bit masks for the entire 64x64 region represented
-// by mi_row, mi_col.
-void av1_setup_mask(struct AV1Common *const cm, const int mi_row,
-                    const int mi_col, MODE_INFO **mi_8x8,
-                    const int mode_info_stride, LOOP_FILTER_MASK *lfm);
-
-void av1_filter_block_plane_ss00_ver(struct AV1Common *const cm,
-                                     struct macroblockd_plane *const plane,
-                                     int mi_row, LOOP_FILTER_MASK *lfm);
-void av1_filter_block_plane_ss00_hor(struct AV1Common *const cm,
-                                     struct macroblockd_plane *const plane,
-                                     int mi_row, LOOP_FILTER_MASK *lfm);
-void av1_filter_block_plane_ss11_ver(struct AV1Common *const cm,
-                                     struct macroblockd_plane *const plane,
-                                     int mi_row, LOOP_FILTER_MASK *lfm);
-void av1_filter_block_plane_ss11_hor(struct AV1Common *const cm,
-                                     struct macroblockd_plane *const plane,
-                                     int mi_row, LOOP_FILTER_MASK *lfm);
-
-void av1_filter_block_plane_non420_ver(struct AV1Common *const cm,
-                                       struct macroblockd_plane *plane,
-                                       MODE_INFO **mi_8x8, int mi_row,
-                                       int mi_col, int pl);
-void av1_filter_block_plane_non420_hor(struct AV1Common *const cm,
-                                       struct macroblockd_plane *plane,
-                                       MODE_INFO **mi_8x8, int mi_row,
-                                       int mi_col, int pl);
-
 void av1_loop_filter_init(struct AV1Common *cm);
 
-// Update the loop filter for the current frame.
-// This should be called before av1_loop_filter_rows(),
-// av1_loop_filter_frame()
-// calls this function directly.
-void av1_loop_filter_frame_init(struct AV1Common *cm, int default_filt_lvl,
-                                int default_filt_lvl_r
-#if CONFIG_LOOPFILTER_LEVEL
-                                ,
-                                int plane
-#endif
-                                );
+void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
+                                int plane_end);
 
-#if CONFIG_LPF_SB
-void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
-                           struct macroblockd *mbd, int filter_level,
-                           int y_only, int partial_frame, int mi_row,
-                           int mi_col);
-
-// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
-void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
-                          struct AV1Common *cm,
-                          struct macroblockd_plane *planes, int start, int stop,
-                          int col_start, int col_end, int y_only);
-
-void av1_loop_filter_sb_level_init(struct AV1Common *cm, int mi_row, int mi_col,
-                                   int lvl);
-#else
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
-                           struct macroblockd *mbd, int filter_level,
-#if CONFIG_LOOPFILTER_LEVEL
-                           int filter_level_r,
-#endif
-                           int y_only, int partial_frame);
+                           struct macroblockd *mbd, int plane_start,
+                           int plane_end, int partial_frame);
+
+void av1_filter_block_plane_vert(const struct AV1Common *const cm,
+                                 const MACROBLOCKD *const xd, const int plane,
+                                 const MACROBLOCKD_PLANE *const plane_ptr,
+                                 const uint32_t mi_row, const uint32_t mi_col);
 
-// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
-void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
-                          struct AV1Common *cm,
-                          struct macroblockd_plane *planes, int start, int stop,
-                          int y_only);
-#endif  // CONFIG_LPF_SB
+void av1_filter_block_plane_horz(const struct AV1Common *const cm,
+                                 const MACROBLOCKD *const xd, const int plane,
+                                 const MACROBLOCKD_PLANE *const plane_ptr,
+                                 const uint32_t mi_row, const uint32_t mi_col);
 
 typedef struct LoopFilterWorkerData {
   YV12_BUFFER_CONFIG *frame_buffer;
   struct AV1Common *cm;
   struct macroblockd_plane planes[MAX_MB_PLANE];
-
-  int start;
-  int stop;
-  int y_only;
+  // TODO(Ranjit): When the filter functions are modified to use xd->lossless
+  // add lossless as a member here.
+  MACROBLOCKD *xd;
 } LFWorkerData;
 
-void av1_loop_filter_data_reset(LFWorkerData *lf_data,
-                                YV12_BUFFER_CONFIG *frame_buffer,
-                                struct AV1Common *cm,
-                                const struct macroblockd_plane *planes);
+#if LOOP_FILTER_BITMASK
+void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col,
+                       int plane, int subsampling_x, int subsampling_y,
+                       int row_end, int col_end);
+
+void av1_filter_block_plane_ver(struct AV1Common *const cm,
+                                struct macroblockd_plane *const plane_ptr,
+                                int pl, int mi_row, int mi_col);
+
+void av1_filter_block_plane_hor(struct AV1Common *const cm,
+                                struct macroblockd_plane *const plane, int pl,
+                                int mi_row, int mi_col);
+#endif
 
-// Operates on the rows described by 'lf_data'.
-int av1_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c
index f9ccd1979..38e26bee1 100644
--- a/third_party/aom/av1/common/av1_rtcd.c
+++ b/third_party/aom/av1/common/av1_rtcd.c
@@ -8,9 +8,11 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #define RTCD_C
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom_ports/aom_once.h"
 
 void av1_rtcd() {
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
index 203426e59..6aa925515 100755
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
 sub av1_common_forward_decls() {
 print <<EOF
 /*
@@ -13,6 +23,7 @@ print <<EOF
 #include "av1/common/convolve.h"
 #include "av1/common/av1_txfm.h"
 #include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
 
 struct macroblockd;
 
@@ -21,9 +32,22 @@ struct macroblock;
 struct txfm_param;
 struct aom_variance_vtable;
 struct search_site_config;
-struct mv;
-union int_mv;
 struct yv12_buffer_config;
+
+/* Function pointers return by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+                                     uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+                                     uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+                                   int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+                                   int dst_stride, int alpha_q3, int bd);
 EOF
 }
 forward_decls qw/av1_common_forward_decls/;
@@ -38,233 +62,55 @@ if ($opts{arch} eq "x86_64") {
   $avx2_x86_64 = 'avx2';
 }
 
-#
-# 10/12-tap convolution filters
-#
-add_proto qw/void av1_lowbd_convolve_init/, "void";
-specialize qw/av1_lowbd_convolve_init ssse3/;
-
-add_proto qw/void av1_convolve_horiz/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params";
-specialize qw/av1_convolve_horiz ssse3/;
-
-add_proto qw/void av1_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, ConvolveParams *conv_params";
-specialize qw/av1_convolve_vert ssse3/;
-
-if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void av1_highbd_convolve_init/, "void";
-  specialize qw/av1_highbd_convolve_init sse4_1/;
-  add_proto qw/void av1_highbd_convolve_horiz/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
-  specialize qw/av1_highbd_convolve_horiz sse4_1/;
-  add_proto qw/void av1_highbd_convolve_vert/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
-  specialize qw/av1_highbd_convolve_vert sse4_1/;
-}
-
-#
-# Inverse dct
-#
-if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht4x4_16_add sse2/;
-
-  add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht4x8_32_add sse2/;
-
-  add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht8x4_32_add sse2/;
-
-  add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht8x16_128_add sse2/;
-
-  add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht16x8_128_add sse2/;
-
-  add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht16x32_512_add sse2/;
-
-  add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht32x16_512_add sse2/;
-
-  add_proto qw/void av1_iht4x16_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn";
+specialize qw/av1_convolve_horiz_rs sse4_1/;
 
-  add_proto qw/void av1_iht16x4_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd";
+specialize qw/av1_highbd_convolve_horiz_rs sse4_1/;
 
-  add_proto qw/void av1_iht8x32_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+add_proto qw/void av1_wiener_convolve_add_src/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params";
 
-  add_proto qw/void av1_iht32x8_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps";
 
-  add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht8x8_64_add sse2/;
+specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
+specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
+specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
 
-  add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
-  specialize qw/av1_iht16x16_256_add sse2 avx2/;
+# directional intra predictor functions
+add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
+add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
+add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
 
-  add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
-} else {
-  add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_DCT4") ne "yes") {
-    specialize qw/av1_iht4x4_16_add sse2 neon/;
-  }
-
-  add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht4x8_32_add sse2/;
-
-  add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht8x4_32_add sse2/;
-
-  add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht8x16_128_add sse2/;
-
-  add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht16x8_128_add sse2/;
-
-  add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht16x32_512_add sse2/;
-
-  add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  specialize qw/av1_iht32x16_512_add sse2/;
-
-  add_proto qw/void av1_iht4x16_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-
-  add_proto qw/void av1_iht16x4_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-
-  add_proto qw/void av1_iht8x32_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-
-  add_proto qw/void av1_iht32x8_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-
-  add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_DCT8") ne "yes") {
-    specialize qw/av1_iht8x8_64_add sse2 neon/;
-  }
-
-  add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_DCT16") ne "yes") {
-    specialize qw/av1_iht16x16_256_add sse2 avx2/;
-  }
-
-  add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
-
-  if (aom_config("CONFIG_EXT_TX") ne "yes") {
-    if (aom_config("CONFIG_DAALA_DCT4") ne "yes") {
-      specialize qw/av1_iht4x4_16_add msa/;
-    }
-    if (aom_config("CONFIG_DAALA_DCT8") ne "yes") {
-      specialize qw/av1_iht8x8_64_add msa/;
-    }
-    if (aom_config("CONFIG_DAALA_DCT16") ne "yes") {
-      specialize qw/av1_iht16x16_256_add msa/;
-    }
-  }
-}
-
-add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
-
-if (aom_config("CONFIG_TX64X64") eq "yes") {
-  add_proto qw/void av1_iht64x64_4096_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
-  add_proto qw/void av1_iht32x64_2048_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
-  add_proto qw/void av1_iht64x32_2048_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
-}
-
-if (aom_config("CONFIG_NEW_QUANT") eq "yes") {
-  add_proto qw/void quantize_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
-
-  add_proto qw/void quantize_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
-
-  add_proto qw/void quantize_32x32_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
-
-  add_proto qw/void quantize_32x32_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
-
-  if (aom_config("CONFIG_TX64X64") eq "yes") {
-    add_proto qw/void quantize_64x64_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
-
-    add_proto qw/void quantize_64x64_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
-  }
-}
 
 # FILTER_INTRA predictor functions
-if (aom_config("CONFIG_FILTER_INTRA") eq "yes") {
-  add_proto qw/void av1_dc_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_v_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_h_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_d45_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_d135_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_d117_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_d153_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_d207_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_d63_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_tm_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  # High bitdepth functions
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void av1_highbd_dc_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-    add_proto qw/void av1_highbd_v_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-    add_proto qw/void av1_highbd_h_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-    add_proto qw/void av1_highbd_d45_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-    add_proto qw/void av1_highbd_d135_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-    add_proto qw/void av1_highbd_d117_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-    add_proto qw/void av1_highbd_d153_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-    add_proto qw/void av1_highbd_d207_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-    add_proto qw/void av1_highbd_d63_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-    add_proto qw/void av1_highbd_tm_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-  }
-}
+add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
+specialize qw/av1_filter_intra_predictor sse4_1/;
 
 # High bitdepth functions
-if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-  #
-  # Sub Pixel Filters
-  #
-  add_proto qw/void av1_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-
-  add_proto qw/void av1_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-
-  add_proto qw/void av1_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/av1_highbd_convolve8/, "$sse2_x86_64";
-
-  add_proto qw/void av1_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/av1_highbd_convolve8_horiz/, "$sse2_x86_64";
-
-  add_proto qw/void av1_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
 
-  add_proto qw/void av1_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/av1_highbd_convolve8_avg/, "$sse2_x86_64";
-
-  add_proto qw/void av1_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/av1_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
-
-  add_proto qw/void av1_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/av1_highbd_convolve8_avg_vert/, "$sse2_x86_64";
-
-  #
-  # dct
-  #
-  add_proto qw/void av1_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-
-  add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-
-  add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-
-  add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-
-  add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-
-  add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
-
-  add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+#
+# Sub Pixel Filters
+#
+add_proto qw/void av1_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
 
-  add_proto qw/void av1_highbd_iht4x16_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+add_proto qw/void av1_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
 
-  add_proto qw/void av1_highbd_iht16x4_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+add_proto qw/void av1_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/av1_highbd_convolve8/, "$sse2_x86_64";
 
-  add_proto qw/void av1_highbd_iht8x32_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+add_proto qw/void av1_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/av1_highbd_convolve8_horiz/, "$sse2_x86_64";
 
-  add_proto qw/void av1_highbd_iht32x8_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+add_proto qw/void av1_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
 
-  add_proto qw/void av1_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, const struct txfm_param *param";
+#inv txfm
+add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_inv_txfm_add ssse3 avx2/;
 
-  add_proto qw/void av1_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, const struct txfm_param *param";
-}
+add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 
-#inv txfm
 add_proto qw/void av1_inv_txfm2d_add_4x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_8x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_8x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -272,227 +118,128 @@ add_proto qw/void av1_inv_txfm2d_add_16x8/, "const int32_t *input, uint16_t *out
 add_proto qw/void av1_inv_txfm2d_add_16x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_32x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_DCT4") ne "yes") {
-  specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
-}
+specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
 add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_DCT8") ne "yes") {
-  specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
-}
+specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
 add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_DCT16") ne "yes") {
-  specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/;
-}
+specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/;
 add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-if (aom_config("CONFIG_DAALA_DCT32") ne "yes") {
-  specialize qw/av1_inv_txfm2d_add_32x32 avx2/;
-}
-if (aom_config("CONFIG_TX64X64") eq "yes") {
-  add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-  add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-  add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-}
+specialize qw/av1_inv_txfm2d_add_32x32 avx2/;
 
-#
-# Encoder functions below this point.
-#
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-
-  # ENCODEMB INVOKE
-
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    # the transform coefficients are held in 32-bit
-    # values, so the assembler code for  av1_block_error can no longer be used.
-    add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-    specialize qw/av1_block_error avx2/;
-
-    add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/av1_quantize_fp sse2 avx2/;
-
-    add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/av1_quantize_fp_32x32 avx2/;
-
-    if (aom_config("CONFIG_TX64X64") eq "yes") {
-      add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    }
-  } else {
-    add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-    specialize qw/av1_block_error sse2 avx2 msa/;
-
-    add_proto qw/int64_t av1_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
-    specialize qw/av1_block_error_fp neon sse2/;
-
-    add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/av1_quantize_fp neon sse2 avx2/, "$ssse3_x86_64";
-
-    add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/av1_quantize_fp_32x32 avx2/, "$ssse3_x86_64";
-
-    if (aom_config("CONFIG_TX64X64") eq "yes") {
-      add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    }
-
-  }
-
-  # fdct functions
-
-  add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_DCT4") ne "yes") {
-    specialize qw/av1_fht4x4 sse2/;
-  }
+add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_64x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 
-  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-
-  add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_DCT8") ne "yes") {
-    specialize qw/av1_fht8x8 sse2/;
-  }
-
-  add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_DCT16") ne "yes") {
-    specialize qw/av1_fht16x16 sse2 avx2/;
-  }
+specialize qw/av1_inv_txfm2d_add_64x64 sse4_1/;
 
-  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  if (aom_config("CONFIG_DAALA_DCT32") ne "yes") {
-    specialize qw/av1_fht32x32 sse2 avx2/;
-  }
+add_proto qw/void av1_inv_txfm2d_add_4x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 
-  if (aom_config("CONFIG_TX64X64") eq "yes") {
-    add_proto qw/void av1_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-    add_proto qw/void av1_fht64x32/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-    add_proto qw/void av1_fht32x64/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  }
+# directional intra predictor functions
+add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
+add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
+add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
 
-  add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  specialize qw/av1_fht4x8 sse2/;
+# build compound seg mask functions
+add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
+specialize qw/av1_build_compound_diffwtd_mask sse4_1/;
 
-  add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  specialize qw/av1_fht8x4 sse2/;
+add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
+specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
 
-  add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  specialize qw/av1_fht8x16 sse2/;
+add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
+specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 neon/;
 
-  add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  specialize qw/av1_fht16x8 sse2/;
-
-  add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  specialize qw/av1_fht16x32 sse2/;
+#
+# Encoder functions below this point.
+#
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
-  add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  specialize qw/av1_fht32x16 sse2/;
+  # ENCODEMB INVOKE
 
-  add_proto qw/void av1_fht4x16/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
+  # the transform coefficients are held in 32-bit
+  # values, so the assembler code for  av1_block_error can no longer be used.
+  add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/av1_block_error avx2/;
 
-  add_proto qw/void av1_fht16x4/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
+  add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/av1_quantize_fp sse2 avx2/;
 
-  add_proto qw/void av1_fht8x32/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
+  add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/av1_quantize_fp_32x32 avx2/;
 
-  add_proto qw/void av1_fht32x8/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
+  add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/av1_quantize_fp_64x64 avx2/;
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") ne "yes") {
-    if (aom_config("CONFIG_EXT_TX") ne "yes") {
-      if (aom_config("CONFIG_DAALA_DCT4") ne "yes") {
-        specialize qw/av1_fht4x4 msa/;
-      }
-      if (aom_config("CONFIG_DAALA_DCT8") ne "yes") {
-        specialize qw/av1_fht8x8 msa/;
-      }
-      if (aom_config("CONFIG_DAALA_DCT16") ne "yes") {
-        specialize qw/av1_fht16x16 msa/;
-      }
-    }
-  }
+  # fdct functions
 
-  add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bsx, int bsy, TX_TYPE tx_type";
+  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
 
   #fwd txfm
+  add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
+  specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1/;
+
   add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
   add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  if (aom_config("CONFIG_DAALA_DCT4") ne "yes") {
-    specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
-  }
+  specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  if (aom_config("CONFIG_DAALA_DCT8") ne "yes") {
-    specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
-  }
+  specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  if (aom_config("CONFIG_DAALA_DCT16") ne "yes") {
-    specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
-  }
+  specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
   add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  if (aom_config("CONFIG_DAALA_DCT32") ne "yes") {
-    specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
-  }
-
-  if (aom_config("CONFIG_TX64X64") eq "yes") {
-    add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-    add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-    add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
-  }
+  specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
+
+  add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+  add_proto qw/void av1_fwd_txfm2d_64x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+
   #
   # Motion search
   #
-  add_proto qw/int av1_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv";
-  specialize qw/av1_full_search_sad sse3 sse4_1/;
-  $av1_full_search_sad_sse3=av1_full_search_sadx3;
-  $av1_full_search_sad_sse4_1=av1_full_search_sadx8;
+  add_proto qw/int av1_diamond_search_sad/, "struct macroblock *x, const struct search_site_config *cfg,  MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
 
-  add_proto qw/int av1_diamond_search_sad/, "struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv";
-
-  add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv";
+  add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
 
   add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
   specialize qw/av1_temporal_filter_apply sse2 msa/;
 
-  if (aom_config("CONFIG_AOM_QM") eq "yes") {
-    add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
-  } else {
-    add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
-  }
-
-  if (aom_config("CONFIG_LGT_FROM_PRED") eq "yes") {
-    add_proto qw/void flgt2d_from_pred/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-  }
-
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-
-    # ENCODEMB INVOKE
-    if (aom_config("CONFIG_NEW_QUANT") eq "yes") {
-      add_proto qw/void highbd_quantize_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+  add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
 
-      add_proto qw/void highbd_quantize_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
-
-      add_proto qw/void highbd_quantize_32x32_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
-
-      add_proto qw/void highbd_quantize_32x32_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
-
-      if (aom_config("CONFIG_TX64X64") eq "yes") {
-        add_proto qw/void highbd_quantize_64x64_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
-
-        add_proto qw/void highbd_quantize_64x64_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
-      }
-    }
-
-    add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-    specialize qw/av1_highbd_block_error sse2/;
+  # ENCODEMB INVOKE
 
-    add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+  add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+  specialize qw/av1_highbd_block_error sse2/;
 
-  }
+  add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
 
-  add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+  add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
   specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
 
   add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
 
   # End av1_high encoder functions
 
+  # txb
+  add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
+  specialize qw/av1_get_nz_map_contexts sse2/;
+  add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
+  specialize qw/av1_txb_init_levels sse4_1/;
+
   add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
   specialize qw/av1_wedge_sse_from_residuals sse2/;
   add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
@@ -500,179 +247,132 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
   specialize qw/av1_wedge_compute_delta_squares sse2/;
 
-}
-# end encoder functions
-
-# If PVQ is enabled, fwd transforms are required by decoder
-if (aom_config("CONFIG_PVQ") eq "yes") {
-  # fdct functions
-
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-    specialize qw/av1_fht4x4 sse2/;
-
-    add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-    specialize qw/av1_fht8x8 sse2/;
-
-    add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-    specialize qw/av1_fht16x16 sse2/;
-
-    add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fwht4x4 sse2/;
-  } else {
-    add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-    specialize qw/av1_fht4x4 sse2 msa/;
-
-    add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-    specialize qw/av1_fht8x8 sse2 msa/;
-
-    add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, struct txfm_param *param";
-    specialize qw/av1_fht16x16 sse2 msa/;
-
-    add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_fwht4x4 msa sse2/;
-  }
+  # hash
+  add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
+  specialize qw/av1_get_crc32c_value sse4_2/;
 
 }
+# end encoder functions
 
 # Deringing Functions
 
-if (aom_config("CONFIG_CDEF") eq "yes") {
-  add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
-  if (aom_config("CONFIG_CDEF_SINGLEPASS") ne "yes") {
-    add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
-    add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
-    add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
-    add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
-    add_proto qw/void cdef_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
-    add_proto qw/void cdef_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
-    add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
-    add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
-    add_proto qw/void copy_8x8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
-    add_proto qw/void copy_4x4_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
-  } else {
-    add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max";
-  }
-
-  add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
-  add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
-
-  # VS compiling for 32 bit targets does not support vector types in
-  # structs as arguments, which makes the v256 type of the intrinsics
-  # hard to support, so optimizations for this target are disabled.
-  if ($opts{config} !~ /libs-x86-win32-vs.*/) {
-    if (aom_config("CONFIG_CDEF_SINGLEPASS") eq "yes") {
-      specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
-      specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
-      specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
-      specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
-    } else {
-      specialize qw/cdef_find_dir sse2 ssse3 sse4_1 neon/;
-      specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
-      specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
-      specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
-      specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
-      specialize qw/cdef_find_dir sse2 ssse3 sse4_1 neon/;
-      specialize qw/cdef_direction_4x4 sse2 ssse3 sse4_1 neon/;
-      specialize qw/cdef_direction_8x8 sse2 ssse3 sse4_1 neon/;
-
-      specialize qw/copy_8x8_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
-      specialize qw/copy_4x4_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
-      specialize qw/copy_8x8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
-      specialize qw/copy_4x4_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
-      specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 neon/;
-      specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
-    }
-  }
-}
+add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
+add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift";
 
-# PVQ Functions
+add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
+add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
 
-if (aom_config("CONFIG_PVQ") eq "yes") {
-  add_proto qw/double pvq_search_rdo_double/, "const od_val16 *xcoeff, int n, int k, int *ypulse, double g2, double pvq_norm_lambda, int prev_k";
-  specialize qw/pvq_search_rdo_double sse4_1/;
+# VS compiling for 32 bit targets does not support vector types in
+# structs as arguments, which makes the v256 type of the intrinsics
+# hard to support, so optimizations for this target are disabled.
+if ($opts{config} !~ /libs-x86-win32-vs.*/) {
+  specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+  specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
 }
 
 # WARPED_MOTION / GLOBAL_MOTION functions
 
-if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") ||
-    (aom_config("CONFIG_GLOBAL_MOTION") eq "yes")) {
-  add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-  specialize qw/av1_warp_affine sse2 ssse3/;
+add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+specialize qw/av1_warp_affine sse4_1/;
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-    specialize qw/av1_highbd_warp_affine ssse3/;
-  }
-}
+add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+specialize qw/av1_highbd_warp_affine sse4_1/;
 
-if (aom_config("CONFIG_GLOBAL_MOTION") eq "yes" &&
-    aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/double compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
   specialize qw/compute_cross_correlation sse4_1/;
 }
 
 # LOOP_RESTORATION functions
 
-if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
-  add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
-  specialize qw/apply_selfguided_restoration sse4_1/;
+add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
+specialize qw/apply_selfguided_restoration sse4_1 avx2/;
 
-  add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
-  specialize qw/av1_selfguided_restoration sse4_1/;
+add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
+                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+                                  int sgr_params_idx, int bit_depth, int highbd";
+specialize qw/av1_selfguided_restoration sse4_1 avx2/;
 
-  add_proto qw/void av1_highpass_filter/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
-  specialize qw/av1_highpass_filter sse4_1/;
+# CONVOLVE_ROUND/COMPOUND_ROUND functions
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
-    specialize qw/apply_selfguided_restoration_highbd sse4_1/;
+add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+
+  add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
+  add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
+
+  specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
+  specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/;
+  specialize qw/av1_convolve_x_sr sse2 avx2 neon/;
+  specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
+  specialize qw/av1_convolve_2d_scale sse4_1/;
+  specialize qw/av1_jnt_convolve_2d ssse3 avx2 neon/;
+  specialize qw/av1_jnt_convolve_2d_copy sse2 avx2 neon/;
+  specialize qw/av1_jnt_convolve_x sse2 avx2 neon/;
+  specialize qw/av1_jnt_convolve_y sse2 avx2 neon/;
+  specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/;
+  specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/;
+  specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/;
+  specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/;
+  specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
+  specialize qw/av1_highbd_jnt_convolve_2d sse4_1 avx2/;
+  specialize qw/av1_highbd_jnt_convolve_x sse4_1 avx2/;
+  specialize qw/av1_highbd_jnt_convolve_y sse4_1 avx2/;
+  specialize qw/av1_highbd_jnt_convolve_2d_copy sse4_1 avx2/;
 
-    add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps";
-    specialize qw/av1_selfguided_restoration_highbd sse4_1/;
+# INTRA_EDGE functions
+add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
+specialize qw/av1_filter_intra_edge sse4_1/;
+add_proto qw/void av1_upsample_intra_edge/, "uint8_t *p, int sz";
+specialize qw/av1_upsample_intra_edge sse4_1/;
 
-    add_proto qw/void av1_highpass_filter_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
-    specialize qw/av1_highpass_filter_highbd sse4_1/;
-  }
-}
+add_proto qw/void av1_filter_intra_edge_high/, "uint16_t *p, int sz, int strength";
+specialize qw/av1_filter_intra_edge_high sse4_1/;
+add_proto qw/void av1_upsample_intra_edge_high/, "uint16_t *p, int sz, int bd";
+specialize qw/av1_upsample_intra_edge_high sse4_1/;
 
-# CONVOLVE_ROUND/COMPOUND_ROUND functions
+# CFL
+add_proto qw/cfl_subtract_average_fn get_subtract_average_fn/, "TX_SIZE tx_size";
+specialize qw/get_subtract_average_fn sse2 avx2 neon vsx/;
 
-if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") {
-  add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-  specialize qw/av1_convolve_2d sse2/;
-  add_proto qw/void av1_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits";
-  specialize qw/av1_convolve_rounding avx2/;
-
-  add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
-  if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") {
-    specialize qw/av1_convolve_2d_scale sse4_1/;
-  }
-
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-    specialize qw/av1_highbd_convolve_2d ssse3/;
-    add_proto qw/void av1_highbd_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd";
-    specialize qw/av1_highbd_convolve_rounding avx2/;
-
-    add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
-    if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") {
-        specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
-    }
-  }
-}
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/;
 
-# INTRA_EDGE functions
-if (aom_config("CONFIG_INTRA_EDGE") eq "yes") {
-  add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
-  specialize qw/av1_filter_intra_edge sse4_1/;
-  add_proto qw/void av1_upsample_intra_edge/, "uint8_t *p, int sz";
-  specialize qw/av1_upsample_intra_edge sse4_1/;
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void av1_filter_intra_edge_high/, "uint16_t *p, int sz, int strength";
-    specialize qw/av1_filter_intra_edge_high sse4_1/;
-    add_proto qw/void av1_upsample_intra_edge_high/, "uint16_t *p, int sz, int bd";
-    specialize qw/av1_upsample_intra_edge_high sse4_1/;
-  }
-}
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_422_lbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_444_lbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
+specialize qw/get_predict_lbd_fn ssse3 avx2 neon/;
+
+add_proto qw/cfl_predict_hbd_fn get_predict_hbd_fn/, "TX_SIZE tx_size";
+specialize qw/get_predict_hbd_fn ssse3 avx2 neon/;
 
+1;
diff --git a/third_party/aom/av1/common/av1_txfm.c b/third_party/aom/av1/common/av1_txfm.c
new file mode 100644
index 000000000..1e6654121
--- /dev/null
+++ b/third_party/aom/av1/common/av1_txfm.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+
+// av1_cospi_arr[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i)));
+const int32_t av1_cospi_arr_data[7][64] = {
+  { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980,
+    972,  964,  955,  946,  936,  926,  915,  903,  891,  878, 865, 851, 837,
+    822,  807,  792,  775,  759,  742,  724,  706,  688,  669, 650, 630, 610,
+    590,  569,  548,  526,  505,  483,  460,  438,  415,  392, 369, 345, 321,
+    297,  273,  249,  224,  200,  175,  150,  125,  100,  75,  50,  25 },
+  { 2048, 2047, 2046, 2042, 2038, 2033, 2026, 2018, 2009, 1998, 1987,
+    1974, 1960, 1945, 1928, 1911, 1892, 1872, 1851, 1829, 1806, 1782,
+    1757, 1730, 1703, 1674, 1645, 1615, 1583, 1551, 1517, 1483, 1448,
+    1412, 1375, 1338, 1299, 1260, 1220, 1179, 1138, 1096, 1053, 1009,
+    965,  921,  876,  830,  784,  737,  690,  642,  595,  546,  498,
+    449,  400,  350,  301,  251,  201,  151,  100,  50 },
+  { 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
+    3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
+    3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
+    2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
+    1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
+    897,  799,  700,  601,  501,  401,  301,  201,  101 },
+  { 8192, 8190, 8182, 8170, 8153, 8130, 8103, 8071, 8035, 7993, 7946,
+    7895, 7839, 7779, 7713, 7643, 7568, 7489, 7405, 7317, 7225, 7128,
+    7027, 6921, 6811, 6698, 6580, 6458, 6333, 6203, 6070, 5933, 5793,
+    5649, 5501, 5351, 5197, 5040, 4880, 4717, 4551, 4383, 4212, 4038,
+    3862, 3683, 3503, 3320, 3135, 2948, 2760, 2570, 2378, 2185, 1990,
+    1795, 1598, 1401, 1202, 1003, 803,  603,  402,  201 },
+  { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143, 16069, 15986, 15893,
+    15791, 15679, 15557, 15426, 15286, 15137, 14978, 14811, 14635, 14449, 14256,
+    14053, 13842, 13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866, 11585,
+    11297, 11003, 10702, 10394, 10080, 9760,  9434,  9102,  8765,  8423,  8076,
+    7723,  7366,  7005,  6639,  6270,  5897,  5520,  5139,  4756,  4370,  3981,
+    3590,  3196,  2801,  2404,  2006,  1606,  1205,  804,   402 },
+  { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286, 32138, 31972, 31786,
+    31581, 31357, 31114, 30853, 30572, 30274, 29957, 29622, 29269, 28899, 28511,
+    28106, 27684, 27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732, 23170,
+    22595, 22006, 21403, 20788, 20160, 19520, 18868, 18205, 17531, 16846, 16151,
+    15447, 14733, 14010, 13279, 12540, 11793, 11039, 10279, 9512,  8740,  7962,
+    7180,  6393,  5602,  4808,  4011,  3212,  2411,  1608,  804 },
+  { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571, 64277, 63944, 63572,
+    63162, 62714, 62228, 61705, 61145, 60547, 59914, 59244, 58538, 57798, 57022,
+    56212, 55368, 54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464, 46341,
+    45190, 44011, 42806, 41576, 40320, 39040, 37736, 36410, 35062, 33692, 32303,
+    30893, 29466, 28020, 26558, 25080, 23586, 22078, 20557, 19024, 17479, 15924,
+    14359, 12785, 11204, 9616,  8022,  6424,  4821,  3216,  1608 }
+};
+
+// av1_sinpi_arr_data[i][j] = (int)round((sqrt(2) * sin(j*Pi/9) * 2 / 3) * (1
+// << (cos_bit_min + i))) modified so that elements j=1,2 sum to element j=4.
+const int32_t av1_sinpi_arr_data[7][5] = {
+  { 0, 330, 621, 836, 951 },        { 0, 660, 1241, 1672, 1901 },
+  { 0, 1321, 2482, 3344, 3803 },    { 0, 2642, 4964, 6689, 7606 },
+  { 0, 5283, 9929, 13377, 15212 },  { 0, 10566, 19858, 26755, 30424 },
+  { 0, 21133, 39716, 53510, 60849 }
+};
+
+void av1_round_shift_array_c(int32_t *arr, int size, int bit) {
+  int i;
+  if (bit == 0) {
+    return;
+  } else {
+    if (bit > 0) {
+      for (i = 0; i < size; i++) {
+        arr[i] = round_shift(arr[i], bit);
+      }
+    } else {
+      for (i = 0; i < size; i++) {
+        arr[i] = (int32_t)clamp64(((int64_t)1 << (-bit)) * arr[i], INT32_MIN,
+                                  INT32_MAX);
+      }
+    }
+  }
+}
+
+const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D] = {
+  { TXFM_TYPE_DCT4, TXFM_TYPE_ADST4, TXFM_TYPE_ADST4, TXFM_TYPE_IDENTITY4 },
+  { TXFM_TYPE_DCT8, TXFM_TYPE_ADST8, TXFM_TYPE_ADST8, TXFM_TYPE_IDENTITY8 },
+  { TXFM_TYPE_DCT16, TXFM_TYPE_ADST16, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY16 },
+  { TXFM_TYPE_DCT32, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID,
+    TXFM_TYPE_IDENTITY32 },
+  { TXFM_TYPE_DCT64, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID }
+};
+
+const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = {
+  4,   // TXFM_TYPE_DCT4
+  6,   // TXFM_TYPE_DCT8
+  8,   // TXFM_TYPE_DCT16
+  10,  // TXFM_TYPE_DCT32
+  12,  // TXFM_TYPE_DCT64
+  7,   // TXFM_TYPE_ADST4
+  8,   // TXFM_TYPE_ADST8
+  10,  // TXFM_TYPE_ADST16
+  1,   // TXFM_TYPE_IDENTITY4
+  1,   // TXFM_TYPE_IDENTITY8
+  1,   // TXFM_TYPE_IDENTITY16
+  1,   // TXFM_TYPE_IDENTITY32
+};
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
index bd365de59..5db3233f5 100644
--- a/third_party/aom/av1/common/av1_txfm.h
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -16,6 +16,8 @@
 #include <math.h>
 #include <stdio.h>
 
+#include "config/aom_config.h"
+
 #include "av1/common/enums.h"
 #include "av1/common/blockd.h"
 #include "aom/aom_integer.h"
@@ -25,100 +27,73 @@
 extern "C" {
 #endif
 
+#if !defined(DO_RANGE_CHECK_CLAMP)
+#define DO_RANGE_CHECK_CLAMP 0
+#endif
+
+extern const int32_t av1_cospi_arr_data[7][64];
+extern const int32_t av1_sinpi_arr_data[7][5];
+
 #define MAX_TXFM_STAGE_NUM 12
 
 static const int cos_bit_min = 10;
 static const int cos_bit_max = 16;
 
-// cospi_arr[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i)));
-static const int32_t cospi_arr_data[7][64] = {
-  { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980,
-    972,  964,  955,  946,  936,  926,  915,  903,  891,  878, 865, 851, 837,
-    822,  807,  792,  775,  759,  742,  724,  706,  688,  669, 650, 630, 610,
-    590,  569,  548,  526,  505,  483,  460,  438,  415,  392, 369, 345, 321,
-    297,  273,  249,  224,  200,  175,  150,  125,  100,  75,  50,  25 },
-  { 2048, 2047, 2046, 2042, 2038, 2033, 2026, 2018, 2009, 1998, 1987,
-    1974, 1960, 1945, 1928, 1911, 1892, 1872, 1851, 1829, 1806, 1782,
-    1757, 1730, 1703, 1674, 1645, 1615, 1583, 1551, 1517, 1483, 1448,
-    1412, 1375, 1338, 1299, 1260, 1220, 1179, 1138, 1096, 1053, 1009,
-    965,  921,  876,  830,  784,  737,  690,  642,  595,  546,  498,
-    449,  400,  350,  301,  251,  201,  151,  100,  50 },
-  { 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
-    3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
-    3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
-    2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
-    1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
-    897,  799,  700,  601,  501,  401,  301,  201,  101 },
-  { 8192, 8190, 8182, 8170, 8153, 8130, 8103, 8071, 8035, 7993, 7946,
-    7895, 7839, 7779, 7713, 7643, 7568, 7489, 7405, 7317, 7225, 7128,
-    7027, 6921, 6811, 6698, 6580, 6458, 6333, 6203, 6070, 5933, 5793,
-    5649, 5501, 5351, 5197, 5040, 4880, 4717, 4551, 4383, 4212, 4038,
-    3862, 3683, 3503, 3320, 3135, 2948, 2760, 2570, 2378, 2185, 1990,
-    1795, 1598, 1401, 1202, 1003, 803,  603,  402,  201 },
-  { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143, 16069, 15986, 15893,
-    15791, 15679, 15557, 15426, 15286, 15137, 14978, 14811, 14635, 14449, 14256,
-    14053, 13842, 13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866, 11585,
-    11297, 11003, 10702, 10394, 10080, 9760,  9434,  9102,  8765,  8423,  8076,
-    7723,  7366,  7005,  6639,  6270,  5897,  5520,  5139,  4756,  4370,  3981,
-    3590,  3196,  2801,  2404,  2006,  1606,  1205,  804,   402 },
-  { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286, 32138, 31972, 31786,
-    31581, 31357, 31114, 30853, 30572, 30274, 29957, 29622, 29269, 28899, 28511,
-    28106, 27684, 27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732, 23170,
-    22595, 22006, 21403, 20788, 20160, 19520, 18868, 18205, 17531, 16846, 16151,
-    15447, 14733, 14010, 13279, 12540, 11793, 11039, 10279, 9512,  8740,  7962,
-    7180,  6393,  5602,  4808,  4011,  3212,  2411,  1608,  804 },
-  { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571, 64277, 63944, 63572,
-    63162, 62714, 62228, 61705, 61145, 60547, 59914, 59244, 58538, 57798, 57022,
-    56212, 55368, 54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464, 46341,
-    45190, 44011, 42806, 41576, 40320, 39040, 37736, 36410, 35062, 33692, 32303,
-    30893, 29466, 28020, 26558, 25080, 23586, 22078, 20557, 19024, 17479, 15924,
-    14359, 12785, 11204, 9616,  8022,  6424,  4821,  3216,  1608 }
-};
+static const int NewSqrt2Bits = 12;
+// 2^12 * sqrt(2)
+static const int32_t NewSqrt2 = 5793;
+// 2^12 / sqrt(2)
+static const int32_t NewInvSqrt2 = 2896;
 
 static INLINE const int32_t *cospi_arr(int n) {
-  return cospi_arr_data[n - cos_bit_min];
+  return av1_cospi_arr_data[n - cos_bit_min];
 }
 
-static INLINE int32_t round_shift(int32_t value, int bit) {
-  assert(bit >= 1);
-  return (value + (1 << (bit - 1))) >> bit;
+static INLINE const int32_t *sinpi_arr(int n) {
+  return av1_sinpi_arr_data[n - cos_bit_min];
 }
 
-static INLINE void round_shift_array(int32_t *arr, int size, int bit) {
-  int i;
-  if (bit == 0) {
-    return;
-  } else {
-    if (bit > 0) {
-      for (i = 0; i < size; i++) {
-        arr[i] = round_shift(arr[i], bit);
-      }
-    } else {
-      for (i = 0; i < size; i++) {
-        arr[i] = arr[i] * (1 << (-bit));
-      }
-    }
+static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  const int64_t max_value = (1LL << (bit - 1)) - 1;
+  const int64_t min_value = -(1LL << (bit - 1));
+  if (value < min_value || value > max_value) {
+    fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit);
+    assert(0);
   }
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+#if DO_RANGE_CHECK_CLAMP
+  bit = AOMMIN(bit, 31);
+  return clamp(value, (1 << (bit - 1)) - 1, -(1 << (bit - 1)));
+#endif  // DO_RANGE_CHECK_CLAMP
+  (void)bit;
+  return value;
+}
+
+static INLINE int32_t round_shift(int64_t value, int bit) {
+  assert(bit >= 1);
+  return (int32_t)((value + (1ll << (bit - 1))) >> bit);
 }
 
 static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
                                int bit) {
-  int32_t result_32 = w0 * in0 + w1 * in1;
+  int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1);
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
-  int64_t result_64 = (int64_t)w0 * (int64_t)in0 + (int64_t)w1 * (int64_t)in1;
-  if (result_64 < INT32_MIN || result_64 > INT32_MAX) {
-    printf("%s %d overflow result_32: %d result_64: %" PRId64
-           " w0: %d in0: %d w1: %d in1: "
-           "%d\n",
-           __FILE__, __LINE__, result_32, result_64, w0, in0, w1, in1);
-    assert(0 && "half_btf overflow");
-  }
+  assert(result_64 >= INT32_MIN && result_64 <= INT32_MAX);
 #endif
-  return round_shift(result_32, bit);
+  return round_shift(result_64, bit);
 }
 
-typedef void (*TxfmFunc)(const int32_t *input, int32_t *output,
-                         const int8_t *cos_bit, const int8_t *stage_range);
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+                                             int bd) {
+  return clip_pixel_highbd(dest + (int)trans, bd);
+}
+
+typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit,
+                         const int8_t *stage_range);
+
+typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride,
+                              TX_TYPE tx_type, int bd);
 
 typedef enum TXFM_TYPE {
   TXFM_TYPE_DCT4,
@@ -129,88 +104,82 @@ typedef enum TXFM_TYPE {
   TXFM_TYPE_ADST4,
   TXFM_TYPE_ADST8,
   TXFM_TYPE_ADST16,
-  TXFM_TYPE_ADST32,
   TXFM_TYPE_IDENTITY4,
   TXFM_TYPE_IDENTITY8,
   TXFM_TYPE_IDENTITY16,
   TXFM_TYPE_IDENTITY32,
-  TXFM_TYPE_IDENTITY64,
+  TXFM_TYPES,
+  TXFM_TYPE_INVALID,
 } TXFM_TYPE;
 
-typedef struct TXFM_1D_CFG {
-  const int txfm_size;
-  const int stage_num;
-
-  const int8_t *shift;
-  const int8_t *stage_range;
-  const int8_t *cos_bit;
-  const TXFM_TYPE txfm_type;
-} TXFM_1D_CFG;
-
 typedef struct TXFM_2D_FLIP_CFG {
+  TX_SIZE tx_size;
   int ud_flip;  // flip upside down
   int lr_flip;  // flip left to right
-  const TXFM_1D_CFG *col_cfg;
-  const TXFM_1D_CFG *row_cfg;
+  const int8_t *shift;
+  int8_t cos_bit_col;
+  int8_t cos_bit_row;
+  int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+  int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+  TXFM_TYPE txfm_type_col;
+  TXFM_TYPE txfm_type_row;
+  int stage_num_col;
+  int stage_num_row;
 } TXFM_2D_FLIP_CFG;
 
-static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
+static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
-      cfg->ud_flip = 0;
-      cfg->lr_flip = 0;
+      *ud_flip = 0;
+      *lr_flip = 0;
       break;
-#if CONFIG_EXT_TX
     case IDTX:
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
-      cfg->ud_flip = 0;
-      cfg->lr_flip = 0;
+      *ud_flip = 0;
+      *lr_flip = 0;
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST:
-      cfg->ud_flip = 1;
-      cfg->lr_flip = 0;
+      *ud_flip = 1;
+      *lr_flip = 0;
       break;
     case DCT_FLIPADST:
     case ADST_FLIPADST:
     case H_FLIPADST:
-      cfg->ud_flip = 0;
-      cfg->lr_flip = 1;
+      *ud_flip = 0;
+      *lr_flip = 1;
       break;
     case FLIPADST_FLIPADST:
-      cfg->ud_flip = 1;
-      cfg->lr_flip = 1;
+      *ud_flip = 1;
+      *lr_flip = 1;
       break;
-#endif  // CONFIG_EXT_TX
     default:
-      cfg->ud_flip = 0;
-      cfg->lr_flip = 0;
+      *ud_flip = 0;
+      *lr_flip = 0;
       assert(0);
   }
 }
 
-#if CONFIG_TXMG
+static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
+  get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip);
+}
+
 static INLINE TX_SIZE av1_rotate_tx_size(TX_SIZE tx_size) {
   switch (tx_size) {
-#if CONFIG_CHROMA_2X2
-    case TX_2X2: return TX_2X2;
-#endif
     case TX_4X4: return TX_4X4;
     case TX_8X8: return TX_8X8;
     case TX_16X16: return TX_16X16;
     case TX_32X32: return TX_32X32;
-#if CONFIG_TX64X64
     case TX_64X64: return TX_64X64;
     case TX_32X64: return TX_64X32;
     case TX_64X32: return TX_32X64;
-#endif
     case TX_4X8: return TX_8X4;
     case TX_8X4: return TX_4X8;
     case TX_8X16: return TX_16X8;
@@ -221,6 +190,8 @@ static INLINE TX_SIZE av1_rotate_tx_size(TX_SIZE tx_size) {
     case TX_16X4: return TX_4X16;
     case TX_8X32: return TX_32X8;
     case TX_32X8: return TX_8X32;
+    case TX_16X64: return TX_64X16;
+    case TX_64X16: return TX_16X64;
     default: assert(0); return TX_INVALID;
   }
 }
@@ -231,7 +202,6 @@ static INLINE TX_TYPE av1_rotate_tx_type(TX_TYPE tx_type) {
     case ADST_DCT: return DCT_ADST;
     case DCT_ADST: return ADST_DCT;
     case ADST_ADST: return ADST_ADST;
-#if CONFIG_EXT_TX
     case FLIPADST_DCT: return DCT_FLIPADST;
     case DCT_FLIPADST: return FLIPADST_DCT;
     case FLIPADST_FLIPADST: return FLIPADST_FLIPADST;
@@ -244,123 +214,46 @@ static INLINE TX_TYPE av1_rotate_tx_type(TX_TYPE tx_type) {
     case H_ADST: return V_ADST;
     case V_FLIPADST: return H_FLIPADST;
     case H_FLIPADST: return V_FLIPADST;
-#endif  // CONFIG_EXT_TX
-#if CONFIG_MRC_TX
-    case MRC_DCT: return MRC_DCT;
-#endif  // CONFIG_MRC_TX
     default: assert(0); return TX_TYPES;
   }
 }
-#endif  // CONFIG_TXMG
-
-#if CONFIG_MRC_TX
-static INLINE int get_mrc_diff_mask_inter(const int16_t *diff, int diff_stride,
-                                          uint8_t *mask, int mask_stride,
-                                          int width, int height) {
-  // placeholder mask generation function
-  assert(SIGNAL_MRC_MASK_INTER);
-  int n_masked_vals = 0;
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; ++j) {
-      mask[i * mask_stride + j] = diff[i * diff_stride + j] > 100 ? 1 : 0;
-      n_masked_vals += mask[i * mask_stride + j];
-    }
-  }
-  return n_masked_vals;
-}
-
-static INLINE int get_mrc_pred_mask_inter(const uint8_t *pred, int pred_stride,
-                                          uint8_t *mask, int mask_stride,
-                                          int width, int height) {
-  // placeholder mask generation function
-  int n_masked_vals = 0;
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; ++j) {
-      mask[i * mask_stride + j] = pred[i * pred_stride + j] > 100 ? 1 : 0;
-      n_masked_vals += mask[i * mask_stride + j];
-    }
-  }
-  return n_masked_vals;
-}
-
-static INLINE int get_mrc_diff_mask_intra(const int16_t *diff, int diff_stride,
-                                          uint8_t *mask, int mask_stride,
-                                          int width, int height) {
-  // placeholder mask generation function
-  assert(SIGNAL_MRC_MASK_INTRA);
-  int n_masked_vals = 0;
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; ++j) {
-      mask[i * mask_stride + j] = diff[i * diff_stride + j] > 100 ? 1 : 0;
-      n_masked_vals += mask[i * mask_stride + j];
-    }
-  }
-  return n_masked_vals;
-}
 
-static INLINE int get_mrc_pred_mask_intra(const uint8_t *pred, int pred_stride,
-                                          uint8_t *mask, int mask_stride,
-                                          int width, int height) {
-  // placeholder mask generation function
-  int n_masked_vals = 0;
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; ++j) {
-      mask[i * mask_stride + j] = pred[i * pred_stride + j] > 100 ? 1 : 0;
-      n_masked_vals += mask[i * mask_stride + j];
-    }
-  }
-  return n_masked_vals;
-}
-
-static INLINE int get_mrc_diff_mask(const int16_t *diff, int diff_stride,
-                                    uint8_t *mask, int mask_stride, int width,
-                                    int height, int is_inter) {
-  if (is_inter) {
-    assert(USE_MRC_INTER && "MRC invalid for inter blocks");
-    assert(SIGNAL_MRC_MASK_INTER);
-    return get_mrc_diff_mask_inter(diff, diff_stride, mask, mask_stride, width,
-                                   height);
+// Utility function that returns the log of the ratio of the col and row
+// sizes.
+static INLINE int get_rect_tx_log_ratio(int col, int row) {
+  if (col == row) return 0;
+  if (col > row) {
+    if (col == row * 2) return 1;
+    if (col == row * 4) return 2;
+    assert(0 && "Unsupported transform size");
   } else {
-    assert(USE_MRC_INTRA && "MRC invalid for intra blocks");
-    assert(SIGNAL_MRC_MASK_INTRA);
-    return get_mrc_diff_mask_intra(diff, diff_stride, mask, mask_stride, width,
-                                   height);
+    if (row == col * 2) return -1;
+    if (row == col * 4) return -2;
+    assert(0 && "Unsupported transform size");
   }
+  return 0;  // Invalid
 }
 
-static INLINE int get_mrc_pred_mask(const uint8_t *pred, int pred_stride,
-                                    uint8_t *mask, int mask_stride, int width,
-                                    int height, int is_inter) {
-  if (is_inter) {
-    assert(USE_MRC_INTER && "MRC invalid for inter blocks");
-    return get_mrc_pred_mask_inter(pred, pred_stride, mask, mask_stride, width,
-                                   height);
-  } else {
-    assert(USE_MRC_INTRA && "MRC invalid for intra blocks");
-    return get_mrc_pred_mask_intra(pred, pred_stride, mask, mask_stride, width,
-                                   height);
-  }
-}
-
-static INLINE int is_valid_mrc_mask(int n_masked_vals, int width, int height) {
-  return !(n_masked_vals == 0 || n_masked_vals == (width * height));
-}
-#endif  // CONFIG_MRC_TX
-
 void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
                              const TXFM_2D_FLIP_CFG *cfg, int bd);
 
 void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
-                             const TXFM_2D_FLIP_CFG *cfg, int8_t fwd_shift,
+                             const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size,
                              int bd);
 
-TXFM_2D_FLIP_CFG av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size);
-#if CONFIG_TX64X64
-TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x64_cfg(TX_TYPE tx_type);
-TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x32_cfg(TX_TYPE tx_type);
-TXFM_2D_FLIP_CFG av1_get_fwd_txfm_32x64_cfg(TX_TYPE tx_type);
-#endif  // CONFIG_TX64X64
-TXFM_2D_FLIP_CFG av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size);
+void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+                          TXFM_2D_FLIP_CFG *cfg);
+void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+                          TXFM_2D_FLIP_CFG *cfg);
+extern const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D];
+extern const int8_t av1_txfm_stage_num_list[TXFM_TYPES];
+static INLINE int get_txw_idx(TX_SIZE tx_size) {
+  return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
+}
+static INLINE int get_txh_idx(TX_SIZE tx_size) {
+  return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
+}
+#define MAX_TXWH_IDX 5
 #ifdef __cplusplus
 }
 #endif  // __cplusplus
diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c
index 7bada8bb1..86b4b5d6c 100644
--- a/third_party/aom/av1/common/blockd.c
+++ b/third_party/aom/av1/common/blockd.c
@@ -16,109 +16,17 @@
 #include "av1/common/blockd.h"
 #include "av1/common/onyxc_int.h"
 
-PREDICTION_MODE av1_left_block_mode(const MODE_INFO *cur_mi,
-                                    const MODE_INFO *left_mi, int b) {
-  if (b == 0 || b == 2) {
-    if (!left_mi || is_inter_block(&left_mi->mbmi)) return DC_PRED;
-
-    return get_y_mode(left_mi, b + 1);
-  } else {
-    assert(b == 1 || b == 3);
-    return cur_mi->bmi[b - 1].as_mode;
-  }
-}
-
-PREDICTION_MODE av1_above_block_mode(const MODE_INFO *cur_mi,
-                                     const MODE_INFO *above_mi, int b) {
-  if (b == 0 || b == 1) {
-    if (!above_mi || is_inter_block(&above_mi->mbmi)) return DC_PRED;
-
-    return get_y_mode(above_mi, b + 2);
-  } else {
-    assert(b == 2 || b == 3);
-    return cur_mi->bmi[b - 2].as_mode;
-  }
+PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) {
+  if (!left_mi) return DC_PRED;
+  assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi));
+  return left_mi->mode;
 }
 
-#if CONFIG_COEF_INTERLEAVE
-void av1_foreach_transformed_block_interleave(
-    const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
-    foreach_transformed_block_visitor visit, void *arg) {
-  const struct macroblockd_plane *const pd_y = &xd->plane[0];
-  const struct macroblockd_plane *const pd_c = &xd->plane[1];
-  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-
-  const TX_SIZE tx_log2_y = mbmi->tx_size;
-  const TX_SIZE tx_log2_c = av1_get_uv_tx_size(mbmi, pd_c);
-  const int tx_sz_y = (1 << tx_log2_y);
-  const int tx_sz_c = (1 << tx_log2_c);
-
-  const BLOCK_SIZE plane_bsize_y = get_plane_block_size(bsize, pd_y);
-  const BLOCK_SIZE plane_bsize_c = get_plane_block_size(bsize, pd_c);
-
-  const int num_4x4_w_y = num_4x4_blocks_wide_lookup[plane_bsize_y];
-  const int num_4x4_w_c = num_4x4_blocks_wide_lookup[plane_bsize_c];
-  const int num_4x4_h_y = num_4x4_blocks_high_lookup[plane_bsize_y];
-  const int num_4x4_h_c = num_4x4_blocks_high_lookup[plane_bsize_c];
-
-  const int step_y = 1 << (tx_log2_y << 1);
-  const int step_c = 1 << (tx_log2_c << 1);
-
-  const int max_4x4_w_y =
-      get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge, pd_y->subsampling_x);
-  const int max_4x4_h_y =
-      get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge, pd_y->subsampling_y);
-
-  const int extra_step_y = ((num_4x4_w_y - max_4x4_w_y) >> tx_log2_y) * step_y;
-
-  const int max_4x4_w_c =
-      get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge, pd_c->subsampling_x);
-  const int max_4x4_h_c =
-      get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge, pd_c->subsampling_y);
-
-  const int extra_step_c = ((num_4x4_w_c - max_4x4_w_c) >> tx_log2_c) * step_c;
-
-  // The max_4x4_w/h may be smaller than tx_sz under some corner cases,
-  // i.e. when the SB is splitted by tile boundaries.
-  const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y;
-  const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y;
-  const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c;
-  const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c;
-  const int tu_num_c = tu_num_w_c * tu_num_h_c;
-
-  int tu_idx_c = 0;
-  int offset_y, row_y, col_y;
-  int offset_c, row_c, col_c;
-
-  for (row_y = 0; row_y < tu_num_h_y; row_y++) {
-    for (col_y = 0; col_y < tu_num_w_y; col_y++) {
-      // luma
-      offset_y = (row_y * tu_num_w_y + col_y) * step_y + row_y * extra_step_y;
-      visit(0, offset_y, row_y * tx_sz_y, col_y * tx_sz_y, plane_bsize_y,
-            tx_log2_y, arg);
-      // chroma
-      if (tu_idx_c < tu_num_c) {
-        row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
-        col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
-        offset_c = tu_idx_c * step_c + (tu_idx_c / tu_num_w_c) * extra_step_c;
-        visit(1, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
-        visit(2, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
-        tu_idx_c++;
-      }
-    }
-  }
-
-  // In 422 case, it's possible that Chroma has more TUs than Luma
-  while (tu_idx_c < tu_num_c) {
-    row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
-    col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
-    offset_c = tu_idx_c * step_c + row_c * extra_step_c;
-    visit(1, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
-    visit(2, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
-    tu_idx_c++;
-  }
+PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
+  if (!above_mi) return DC_PRED;
+  assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi));
+  return above_mi->mode;
 }
-#endif
 
 void av1_foreach_transformed_block_in_plane(
     const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
@@ -128,12 +36,8 @@ void av1_foreach_transformed_block_in_plane(
   // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
   // transform size varies per plane, look it up in a common way.
   const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-#if CONFIG_CHROMA_SUB8X8
   const BLOCK_SIZE plane_bsize =
-      AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#else
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#endif
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
   const uint8_t txw_unit = tx_size_wide_unit[tx_size];
   const uint8_t txh_unit = tx_size_high_unit[tx_size];
   const int step = txw_unit * txh_unit;
@@ -147,7 +51,8 @@ void av1_foreach_transformed_block_in_plane(
 
   int blk_row, blk_col;
 
-  const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd);
+  const BLOCK_SIZE max_unit_bsize =
+      get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
   int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
   int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
   mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
@@ -170,95 +75,60 @@ void av1_foreach_transformed_block_in_plane(
   }
 }
 
-#if CONFIG_LV_MAP
 void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
                                    foreach_transformed_block_visitor visit,
-                                   void *arg) {
-  int plane;
-
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-#if CONFIG_CB4X4
+                                   void *arg, const int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
     if (!is_chroma_reference(mi_row, mi_col, bsize,
                              xd->plane[plane].subsampling_x,
                              xd->plane[plane].subsampling_y))
       continue;
-#else
-    (void)mi_row;
-    (void)mi_col;
-#endif
     av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
   }
 }
-#endif
 
-#if !CONFIG_PVQ || CONFIG_VAR_TX
 void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
-                      int plane, TX_SIZE tx_size, int has_eob, int aoff,
-                      int loff) {
+                      int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                      int has_eob, int aoff, int loff) {
   ENTROPY_CONTEXT *const a = pd->above_context + aoff;
   ENTROPY_CONTEXT *const l = pd->left_context + loff;
   const int txs_wide = tx_size_wide_unit[tx_size];
   const int txs_high = tx_size_high_unit[tx_size];
-#if CONFIG_CB4X4
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-#else
-  const BLOCK_SIZE bsize = AOMMAX(xd->mi[0]->mbmi.sb_type, BLOCK_8X8);
-#endif
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
 
   // above
   if (has_eob && xd->mb_to_right_edge < 0) {
-    int i;
     const int blocks_wide = max_block_wide(xd, plane_bsize, plane);
-    int above_contexts = txs_wide;
-    if (above_contexts + aoff > blocks_wide)
-      above_contexts = blocks_wide - aoff;
-
-    for (i = 0; i < above_contexts; ++i) a[i] = has_eob;
-    for (i = above_contexts; i < txs_wide; ++i) a[i] = 0;
+    const int above_contexts = AOMMIN(txs_wide, blocks_wide - aoff);
+    memset(a, has_eob, sizeof(*a) * above_contexts);
+    memset(a + above_contexts, 0, sizeof(*a) * (txs_wide - above_contexts));
   } else {
-    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * txs_wide);
+    memset(a, has_eob, sizeof(*a) * txs_wide);
   }
 
   // left
   if (has_eob && xd->mb_to_bottom_edge < 0) {
-    int i;
     const int blocks_high = max_block_high(xd, plane_bsize, plane);
-    int left_contexts = txs_high;
-    if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff;
-
-    for (i = 0; i < left_contexts; ++i) l[i] = has_eob;
-    for (i = left_contexts; i < txs_high; ++i) l[i] = 0;
+    const int left_contexts = AOMMIN(txs_high, blocks_high - loff);
+    memset(l, has_eob, sizeof(*l) * left_contexts);
+    memset(l + left_contexts, 0, sizeof(*l) * (txs_high - left_contexts));
   } else {
-    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * txs_high);
+    memset(l, has_eob, sizeof(*l) * txs_high);
   }
 }
-#endif
-
 void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
-                            BLOCK_SIZE bsize) {
+                            BLOCK_SIZE bsize, const int num_planes) {
   int i;
   int nplanes;
-#if CONFIG_CB4X4
   int chroma_ref;
   chroma_ref =
       is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
                           xd->plane[1].subsampling_y);
-  nplanes = 1 + (MAX_MB_PLANE - 1) * chroma_ref;
-#else
-  (void)mi_row;
-  (void)mi_col;
-  nplanes = MAX_MB_PLANE;
-#endif
+  nplanes = 1 + (num_planes - 1) * chroma_ref;
   for (i = 0; i < nplanes; i++) {
     struct macroblockd_plane *const pd = &xd->plane[i];
-#if CONFIG_CHROMA_SUB8X8
     const BLOCK_SIZE plane_bsize =
-        AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#else
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#endif
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
     const int txs_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
     const int txs_high = block_size_high[plane_bsize] >> tx_size_high_log2[0];
     memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide);
@@ -266,38 +136,61 @@ void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
   }
 }
 
-void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) {
+void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) {
+  xd->delta_lf_from_base = 0;
+  const int frame_lf_count =
+      num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+  for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) xd->delta_lf[lf_id] = 0;
+}
+
+void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) {
+  for (int p = 0; p < num_planes; ++p) {
+    set_default_wiener(xd->wiener_info + p);
+    set_default_sgrproj(xd->sgrproj_info + p);
+  }
+}
+
+void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
+                            const int num_planes) {
   int i;
 
-  for (i = 0; i < MAX_MB_PLANE; i++) {
+  for (i = 0; i < num_planes; i++) {
     xd->plane[i].plane_type = get_plane_type(i);
     xd->plane[i].subsampling_x = i ? ss_x : 0;
     xd->plane[i].subsampling_y = i ? ss_y : 0;
   }
 }
 
-#if CONFIG_EXT_INTRA
 const int16_t dr_intra_derivative[90] = {
-  1,    14666, 7330, 4884, 3660, 2926, 2435, 2084, 1821, 1616, 1451, 1317, 1204,
-  1108, 1026,  955,  892,  837,  787,  743,  703,  666,  633,  603,  574,  548,
-  524,  502,   481,  461,  443,  426,  409,  394,  379,  365,  352,  339,  327,
-  316,  305,   294,  284,  274,  265,  256,  247,  238,  230,  222,  214,  207,
-  200,  192,   185,  179,  172,  166,  159,  153,  147,  141,  136,  130,  124,
-  119,  113,   108,  103,  98,   93,   88,   83,   78,   73,   68,   63,   59,
-  54,   49,    45,   40,   35,   31,   26,   22,   17,   13,   8,    4,
+  // More evenly spread out angles and limited to 10-bit
+  // Values that are 0 will never be used
+  //                    Approx angle
+  0,    0, 0,        //
+  1023, 0, 0,        // 3, ...
+  547,  0, 0,        // 6, ...
+  372,  0, 0, 0, 0,  // 9, ...
+  273,  0, 0,        // 14, ...
+  215,  0, 0,        // 17, ...
+  178,  0, 0,        // 20, ...
+  151,  0, 0,        // 23, ... (113 & 203 are base angles)
+  132,  0, 0,        // 26, ...
+  116,  0, 0,        // 29, ...
+  102,  0, 0, 0,     // 32, ...
+  90,   0, 0,        // 36, ...
+  80,   0, 0,        // 39, ...
+  71,   0, 0,        // 42, ...
+  64,   0, 0,        // 45, ... (45 & 135 are base angles)
+  57,   0, 0,        // 48, ...
+  51,   0, 0,        // 51, ...
+  45,   0, 0, 0,     // 54, ...
+  40,   0, 0,        // 58, ...
+  35,   0, 0,        // 61, ...
+  31,   0, 0,        // 64, ...
+  27,   0, 0,        // 67, ... (67 & 157 are base angles)
+  23,   0, 0,        // 70, ...
+  19,   0, 0,        // 73, ...
+  15,   0, 0, 0, 0,  // 76, ...
+  11,   0, 0,        // 81, ...
+  7,    0, 0,        // 84, ...
+  3,    0, 0,        // 87, ...
 };
-
-#if CONFIG_INTRA_INTERP
-int av1_is_intra_filter_switchable(int angle) {
-  assert(angle > 0 && angle < 270);
-  if (angle % 45 == 0) return 0;
-  if (angle > 90 && angle < 180) {
-    return 1;
-  } else {
-    return ((angle < 90 ? dr_intra_derivative[angle]
-                        : dr_intra_derivative[270 - angle]) &
-            0xFF) > 0;
-  }
-}
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
index 01a449a1c..3e8d1d6c6 100644
--- a/third_party/aom/av1/common/blockd.h
+++ b/third_party/aom/av1/common/blockd.h
@@ -12,7 +12,7 @@
 #ifndef AV1_COMMON_BLOCKD_H_
 #define AV1_COMMON_BLOCKD_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
@@ -26,104 +26,40 @@
 #include "av1/common/scale.h"
 #include "av1/common/seg_common.h"
 #include "av1/common/tile_common.h"
-#if CONFIG_PVQ
-#include "av1/common/pvq.h"
-#include "av1/common/pvq_state.h"
-#include "av1/decoder/decint.h"
-#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#if (CONFIG_CHROMA_SUB8X8 || CONFIG_CHROMA_2X2)
-#define SUB8X8_COMP_REF 0
-#else
-#define SUB8X8_COMP_REF 1
-#endif
+#define USE_B_QUANT_NO_TRELLIS 1
 
 #define MAX_MB_PLANE 3
 
-#if CONFIG_COMPOUND_SEGMENT
-// Set COMPOUND_SEGMENT_TYPE to one of the three
-// 0: Uniform
-// 1: Difference weighted
-#define COMPOUND_SEGMENT_TYPE 1
-#define MAX_SEG_MASK_BITS 1
+#define MAX_DIFFWTD_MASK_BITS 1
 
-// SEG_MASK_TYPES should not surpass 1 << MAX_SEG_MASK_BITS
+// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
 typedef enum {
-#if COMPOUND_SEGMENT_TYPE == 0
-  UNIFORM_45 = 0,
-  UNIFORM_45_INV,
-#elif COMPOUND_SEGMENT_TYPE == 1
   DIFFWTD_38 = 0,
   DIFFWTD_38_INV,
-#endif  // COMPOUND_SEGMENT_TYPE
-  SEG_MASK_TYPES,
-} SEG_MASK_TYPE;
-
-#endif  // CONFIG_COMPOUND_SEGMENT
+  DIFFWTD_MASK_TYPES,
+} DIFFWTD_MASK_TYPE;
 
 typedef enum {
   KEY_FRAME = 0,
   INTER_FRAME = 1,
-#if CONFIG_OBU
   INTRA_ONLY_FRAME = 2,  // replaces intra-only
   S_FRAME = 3,
-#endif
   FRAME_TYPES,
 } FRAME_TYPE;
 
 static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) {
-  (void)bsize;
-#if SUB8X8_COMP_REF
-  return 1;
-#else
   return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
-#endif  // SUB8X8_COMP_REF
 }
 
 static INLINE int is_inter_mode(PREDICTION_MODE mode) {
   return mode >= NEARESTMV && mode <= NEW_NEWMV;
 }
 
-#if CONFIG_PVQ
-typedef struct PVQ_INFO {
-  int theta[PVQ_MAX_PARTITIONS];
-  int qg[PVQ_MAX_PARTITIONS];
-  int k[PVQ_MAX_PARTITIONS];
-  od_coeff y[OD_TXSIZE_MAX * OD_TXSIZE_MAX];
-  int nb_bands;
-  int off[PVQ_MAX_PARTITIONS];
-  int size[PVQ_MAX_PARTITIONS];
-  int skip_rest;
-  int skip_dir;
-  int bs;  // log of the block size minus two,
-           // i.e. equivalent to aom's TX_SIZE
-  // Block skip info, indicating whether DC/AC, is coded.
-  PVQ_SKIP_TYPE ac_dc_coded;  // bit0: DC coded, bit1 : AC coded (1 means coded)
-  tran_low_t dq_dc_residue;
-} PVQ_INFO;
-
-typedef struct PVQ_QUEUE {
-  PVQ_INFO *buf;  // buffer for pvq info, stored in encoding order
-  int curr_pos;   // curr position to write PVQ_INFO
-  int buf_len;    // allocated buffer length
-  int last_pos;   // last written position of PVQ_INFO in a tile
-} PVQ_QUEUE;
-#endif
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-typedef struct superblock_mi_boundaries {
-  int mi_row_begin;
-  int mi_col_begin;
-  int mi_row_end;
-  int mi_col_end;
-} SB_MI_BD;
-
-typedef struct { int16_t KERNEL[4][MAX_SB_SIZE][MAX_SB_SIZE]; } NCOBMC_KERNELS;
-#endif
-
 typedef struct {
   uint8_t *plane[MAX_MB_PLANE];
   int stride[MAX_MB_PLANE];
@@ -135,14 +71,6 @@ static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
 static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
   return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV;
 }
-#if CONFIG_COMPOUND_SINGLEREF
-static INLINE int is_inter_singleref_comp_mode(PREDICTION_MODE mode) {
-  return mode >= SR_NEAREST_NEARMV && mode <= SR_NEW_NEWMV;
-}
-static INLINE int is_inter_anyref_comp_mode(PREDICTION_MODE mode) {
-  return is_inter_compound_mode(mode) || is_inter_singleref_comp_mode(mode);
-}
-#endif  // CONFIG_COMPOUND_SINGLEREF
 
 static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
   static PREDICTION_MODE lut[] = {
@@ -151,42 +79,29 @@ static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
     MB_MODE_COUNT,  // H_PRED
     MB_MODE_COUNT,  // D45_PRED
     MB_MODE_COUNT,  // D135_PRED
-    MB_MODE_COUNT,  // D117_PRED
-    MB_MODE_COUNT,  // D153_PRED
-    MB_MODE_COUNT,  // D207_PRED
-    MB_MODE_COUNT,  // D63_PRED
+    MB_MODE_COUNT,  // D113_PRED
+    MB_MODE_COUNT,  // D157_PRED
+    MB_MODE_COUNT,  // D203_PRED
+    MB_MODE_COUNT,  // D67_PRED
     MB_MODE_COUNT,  // SMOOTH_PRED
-#if CONFIG_SMOOTH_HV
     MB_MODE_COUNT,  // SMOOTH_V_PRED
     MB_MODE_COUNT,  // SMOOTH_H_PRED
-#endif              // CONFIG_SMOOTH_HV
-    MB_MODE_COUNT,  // TM_PRED
+    MB_MODE_COUNT,  // PAETH_PRED
     MB_MODE_COUNT,  // NEARESTMV
     MB_MODE_COUNT,  // NEARMV
-    MB_MODE_COUNT,  // ZEROMV
+    MB_MODE_COUNT,  // GLOBALMV
     MB_MODE_COUNT,  // NEWMV
-#if CONFIG_COMPOUND_SINGLEREF
-    NEARESTMV,  // SR_NEAREST_NEARMV
-    // NEARESTMV,  // SR_NEAREST_NEWMV
-    NEARMV,     // SR_NEAR_NEWMV
-    ZEROMV,     // SR_ZERO_NEWMV
-    NEWMV,      // SR_NEW_NEWMV
-#endif          // CONFIG_COMPOUND_SINGLEREF
-    NEARESTMV,  // NEAREST_NEARESTMV
-    NEARMV,     // NEAR_NEARMV
-    NEARESTMV,  // NEAREST_NEWMV
-    NEWMV,      // NEW_NEARESTMV
-    NEARMV,     // NEAR_NEWMV
-    NEWMV,      // NEW_NEARMV
-    ZEROMV,     // ZERO_ZEROMV
-    NEWMV,      // NEW_NEWMV
+    NEARESTMV,      // NEAREST_NEARESTMV
+    NEARMV,         // NEAR_NEARMV
+    NEARESTMV,      // NEAREST_NEWMV
+    NEWMV,          // NEW_NEARESTMV
+    NEARMV,         // NEAR_NEWMV
+    NEWMV,          // NEW_NEARMV
+    GLOBALMV,       // GLOBAL_GLOBALMV
+    NEWMV,          // NEW_NEWMV
   };
   assert(NELEMENTS(lut) == MB_MODE_COUNT);
-#if CONFIG_COMPOUND_SINGLEREF
-  assert(is_inter_anyref_comp_mode(mode));
-#else   // !CONFIG_COMPOUND_SINGLEREF
   assert(is_inter_compound_mode(mode));
-#endif  // CONFIG_COMPOUND_SINGLEREF
   return lut[mode];
 }
 
@@ -197,94 +112,54 @@ static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) {
     MB_MODE_COUNT,  // H_PRED
     MB_MODE_COUNT,  // D45_PRED
     MB_MODE_COUNT,  // D135_PRED
-    MB_MODE_COUNT,  // D117_PRED
-    MB_MODE_COUNT,  // D153_PRED
-    MB_MODE_COUNT,  // D207_PRED
-    MB_MODE_COUNT,  // D63_PRED
+    MB_MODE_COUNT,  // D113_PRED
+    MB_MODE_COUNT,  // D157_PRED
+    MB_MODE_COUNT,  // D203_PRED
+    MB_MODE_COUNT,  // D67_PRED
     MB_MODE_COUNT,  // SMOOTH_PRED
-#if CONFIG_SMOOTH_HV
     MB_MODE_COUNT,  // SMOOTH_V_PRED
     MB_MODE_COUNT,  // SMOOTH_H_PRED
-#endif              // CONFIG_SMOOTH_HV
-    MB_MODE_COUNT,  // TM_PRED
+    MB_MODE_COUNT,  // PAETH_PRED
     MB_MODE_COUNT,  // NEARESTMV
     MB_MODE_COUNT,  // NEARMV
-    MB_MODE_COUNT,  // ZEROMV
+    MB_MODE_COUNT,  // GLOBALMV
     MB_MODE_COUNT,  // NEWMV
-#if CONFIG_COMPOUND_SINGLEREF
-    NEARMV,  // SR_NEAREST_NEARMV
-    // NEWMV,      // SR_NEAREST_NEWMV
-    NEWMV,      // SR_NEAR_NEWMV
-    NEWMV,      // SR_ZERO_NEWMV
-    NEWMV,      // SR_NEW_NEWMV
-#endif          // CONFIG_COMPOUND_SINGLEREF
-    NEARESTMV,  // NEAREST_NEARESTMV
-    NEARMV,     // NEAR_NEARMV
-    NEWMV,      // NEAREST_NEWMV
-    NEARESTMV,  // NEW_NEARESTMV
-    NEWMV,      // NEAR_NEWMV
-    NEARMV,     // NEW_NEARMV
-    ZEROMV,     // ZERO_ZEROMV
-    NEWMV,      // NEW_NEWMV
+    NEARESTMV,      // NEAREST_NEARESTMV
+    NEARMV,         // NEAR_NEARMV
+    NEWMV,          // NEAREST_NEWMV
+    NEARESTMV,      // NEW_NEARESTMV
+    NEWMV,          // NEAR_NEWMV
+    NEARMV,         // NEW_NEARMV
+    GLOBALMV,       // GLOBAL_GLOBALMV
+    NEWMV,          // NEW_NEWMV
   };
   assert(NELEMENTS(lut) == MB_MODE_COUNT);
-#if CONFIG_COMPOUND_SINGLEREF
-  assert(is_inter_anyref_comp_mode(mode));
-#else   // !CONFIG_COMPOUND_SINGLEREF
   assert(is_inter_compound_mode(mode));
-#endif  // CONFIG_COMPOUND_SINGLEREF
   return lut[mode];
 }
 
 static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) {
   return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAR_NEWMV ||
-#if CONFIG_COMPOUND_SINGLEREF
-          mode == SR_NEAREST_NEARMV || mode == SR_NEAR_NEWMV ||
-#endif  // CONFIG_COMPOUND_SINGLEREF
           mode == NEW_NEARMV);
 }
 
 static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
   return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV ||
-#if CONFIG_COMPOUND_SINGLEREF
-          /* mode == SR_NEAREST_NEWMV || */ mode == SR_NEAR_NEWMV ||
-          mode == SR_ZERO_NEWMV || mode == SR_NEW_NEWMV ||
-#endif  // CONFIG_COMPOUND_SINGLEREF
           mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV);
 }
 
 static INLINE int use_masked_motion_search(COMPOUND_TYPE type) {
-#if CONFIG_WEDGE
   return (type == COMPOUND_WEDGE);
-#else
-  (void)type;
-  return 0;
-#endif
 }
 
 static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
-#if CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
-  return (type == COMPOUND_WEDGE || type == COMPOUND_SEG);
-#elif !CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
-  return (type == COMPOUND_WEDGE);
-#elif CONFIG_COMPOUND_SEGMENT && !CONFIG_WEDGE
-  return (type == COMPOUND_SEG);
-#endif  // CONFIG_COMPOUND_SEGMENT
-  (void)type;
-  return 0;
+  return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD);
 }
 
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
 
-typedef struct {
-  PREDICTION_MODE as_mode;
-  int_mv as_mv[2];  // first, second inter predictor motion vectors
-  int_mv pred_mv[2];
-  int_mv ref_mv[2];
-} b_mode_info;
-
 typedef int8_t MV_REFERENCE_FRAME;
 
 typedef struct {
@@ -294,19 +169,17 @@ typedef struct {
   uint16_t palette_colors[3 * PALETTE_MAX_SIZE];
 } PALETTE_MODE_INFO;
 
-#if CONFIG_FILTER_INTRA
-#define USE_3TAP_INTRA_FILTER 1  // 0: 4-tap; 1: 3-tap
 typedef struct {
-  // 1: an ext intra mode is used; 0: otherwise.
-  uint8_t use_filter_intra_mode[PLANE_TYPES];
-  FILTER_INTRA_MODE filter_intra_mode[PLANE_TYPES];
+  uint8_t use_filter_intra;
+  FILTER_INTRA_MODE filter_intra_mode;
 } FILTER_INTRA_MODE_INFO;
-#endif  // CONFIG_FILTER_INTRA
 
-#if CONFIG_VAR_TX
+static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = {
+  DC_PRED, V_PRED, H_PRED, D157_PRED, DC_PRED
+};
+
 #if CONFIG_RD_DEBUG
-#define TXB_COEFF_COST_MAP_SIZE (2 * MAX_MIB_SIZE)
-#endif
+#define TXB_COEFF_COST_MAP_SIZE (MAX_MIB_SIZE)
 #endif
 
 typedef struct RD_STATS {
@@ -325,213 +198,122 @@ typedef struct RD_STATS {
   uint8_t invalid_rate;
 #if CONFIG_RD_DEBUG
   int txb_coeff_cost[MAX_MB_PLANE];
-#if CONFIG_VAR_TX
   int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
                         [TXB_COEFF_COST_MAP_SIZE];
-#endif  // CONFIG_VAR_TX
 #endif  // CONFIG_RD_DEBUG
 } RD_STATS;
 
 // This struct is used to group function args that are commonly
 // sent together in functions related to interinter compound modes
 typedef struct {
-#if CONFIG_WEDGE
   int wedge_index;
   int wedge_sign;
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-  SEG_MASK_TYPE mask_type;
+  DIFFWTD_MASK_TYPE mask_type;
   uint8_t *seg_mask;
-#endif  // CONFIG_COMPOUND_SEGMENT
-  COMPOUND_TYPE interinter_compound_type;
+  COMPOUND_TYPE type;
 } INTERINTER_COMPOUND_DATA;
 
-// This structure now relates to 8x8 block regions.
+#define INTER_TX_SIZE_BUF_LEN 16
+#define TXK_TYPE_BUF_LEN 64
+// This structure now relates to 4x4 block regions.
 typedef struct MB_MODE_INFO {
   // Common for both INTER and INTRA blocks
   BLOCK_SIZE sb_type;
   PREDICTION_MODE mode;
   TX_SIZE tx_size;
-#if CONFIG_VAR_TX
-  // TODO(jingning): This effectively assigned a separate entry for each
-  // 8x8 block. Apparently it takes much more space than needed.
-  TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
-  TX_SIZE min_tx_size;
-#endif
+  uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
   int8_t skip;
+  int8_t skip_mode;
   int8_t segment_id;
-#if CONFIG_SUPERTX
-  // Minimum of all segment IDs under the current supertx block.
-  int8_t segment_id_supertx;
-#endif                      // CONFIG_SUPERTX
   int8_t seg_id_predicted;  // valid only when temporal_update is enabled
 
-#if CONFIG_MRC_TX
-  int valid_mrc_mask;
-#endif  // CONFIG_MRC_TX
-
   // Only for INTRA blocks
   UV_PREDICTION_MODE uv_mode;
 
   PALETTE_MODE_INFO palette_mode_info;
-#if CONFIG_INTRABC
   uint8_t use_intrabc;
-#endif  // CONFIG_INTRABC
 
   // Only for INTER blocks
   InterpFilters interp_filters;
   MV_REFERENCE_FRAME ref_frame[2];
-  TX_TYPE tx_type;
-#if CONFIG_TXK_SEL
-  TX_TYPE txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
-#endif
-#if CONFIG_LGT_FROM_PRED
-  int use_lgt;
-#endif
 
-#if CONFIG_FILTER_INTRA
+  TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
+
   FILTER_INTRA_MODE_INFO filter_intra_mode_info;
-#endif  // CONFIG_FILTER_INTRA
-#if CONFIG_EXT_INTRA
+
   // The actual prediction angle is the base angle + (angle_delta * step).
-  int8_t angle_delta[2];
-#if CONFIG_INTRA_INTERP
-  // To-Do (huisu): this may be replaced by interp_filter
-  INTRA_FILTER intra_filter;
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-
-#if CONFIG_INTERINTRA
+  int8_t angle_delta[PLANE_TYPES];
+
   // interintra members
   INTERINTRA_MODE interintra_mode;
-#endif
   // TODO(debargha): Consolidate these flags
   int use_wedge_interintra;
   int interintra_wedge_index;
   int interintra_wedge_sign;
   // interinter members
-  COMPOUND_TYPE interinter_compound_type;
-#if CONFIG_WEDGE
-  int wedge_index;
-  int wedge_sign;
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-  SEG_MASK_TYPE mask_type;
-#endif  // CONFIG_COMPOUND_SEGMENT
+  INTERINTER_COMPOUND_DATA interinter_comp;
   MOTION_MODE motion_mode;
-#if CONFIG_MOTION_VAR
   int overlappable_neighbors[2];
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  // Applying different weighting kernels in ncobmc
-  // In current implementation, interpolation modes only defined for squared
-  // blocks. A rectangular block is divided into two squared blocks and each
-  // squared block has an interpolation mode.
-  NCOBMC_MODE ncobmc_mode[2];
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR
   int_mv mv[2];
-  int_mv pred_mv[2];
   uint8_t ref_mv_idx;
-#if CONFIG_EXT_PARTITION_TYPES
   PARTITION_TYPE partition;
-#endif
-#if CONFIG_NEW_QUANT
-  int dq_off_index;
-  int send_dq_bit;
-#endif  // CONFIG_NEW_QUANT
   /* deringing gain *per-superblock* */
   int8_t cdef_strength;
-  int current_q_index;
-#if CONFIG_EXT_DELTA_Q
-  int current_delta_lf_from_base;
-#if CONFIG_LOOPFILTER_LEVEL
-  int curr_delta_lf[FRAME_LF_COUNT];
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif
+  int current_qindex;
+  int delta_lf_from_base;
+  int delta_lf[FRAME_LF_COUNT];
 #if CONFIG_RD_DEBUG
   RD_STATS rd_stats;
   int mi_row;
   int mi_col;
 #endif
-#if CONFIG_WARPED_MOTION
   int num_proj_ref[2];
   WarpedMotionParams wm_params[2];
-#endif  // CONFIG_WARPED_MOTION
 
-#if CONFIG_CFL
   // Index of the alpha Cb and alpha Cr combination
   int cfl_alpha_idx;
   // Joint sign of alpha Cb and alpha Cr
   int cfl_alpha_signs;
-#endif
 
-  BOUNDARY_TYPE boundary_info;
-#if CONFIG_LPF_SB
-  uint8_t filt_lvl;
-  int reuse_sb_lvl;
-  int sign;
-  int delta;
-#endif
+  int compound_idx;
+  int comp_group_idx;
 } MB_MODE_INFO;
 
-typedef struct MODE_INFO {
-  MB_MODE_INFO mbmi;
-  b_mode_info bmi[4];
-} MODE_INFO;
-
-#if CONFIG_INTRABC
 static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
   return mbmi->use_intrabc;
 }
-#endif
-
-static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
-#if CONFIG_CB4X4
-  (void)block;
-  return mi->mbmi.mode;
-#else
-  return mi->mbmi.sb_type < BLOCK_8X8 ? mi->bmi[block].as_mode : mi->mbmi.mode;
-#endif
-}
 
-#if CONFIG_CFL
 static INLINE PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
-  static const PREDICTION_MODE uv2y[UV_INTRA_MODES] = {
-    DC_PRED,      // UV_DC_PRED
-    V_PRED,       // UV_V_PRED
-    H_PRED,       // UV_H_PRED
-    D45_PRED,     // UV_D45_PRED
-    D135_PRED,    // UV_D135_PRED
-    D117_PRED,    // UV_D117_PRED
-    D153_PRED,    // UV_D153_PRED
-    D207_PRED,    // UV_D207_PRED
-    D63_PRED,     // UV_D63_PRED
-    SMOOTH_PRED,  // UV_SMOOTH_PRED
-#if CONFIG_SMOOTH_HV
+  assert(mode < UV_INTRA_MODES);
+  static const PREDICTION_MODE uv2y[] = {
+    DC_PRED,        // UV_DC_PRED
+    V_PRED,         // UV_V_PRED
+    H_PRED,         // UV_H_PRED
+    D45_PRED,       // UV_D45_PRED
+    D135_PRED,      // UV_D135_PRED
+    D113_PRED,      // UV_D113_PRED
+    D157_PRED,      // UV_D157_PRED
+    D203_PRED,      // UV_D203_PRED
+    D67_PRED,       // UV_D67_PRED
+    SMOOTH_PRED,    // UV_SMOOTH_PRED
     SMOOTH_V_PRED,  // UV_SMOOTH_V_PRED
     SMOOTH_H_PRED,  // UV_SMOOTH_H_PRED
-#endif              // CONFIG_SMOOTH_HV
-    TM_PRED,        // UV_TM_PRED
-    DC_PRED,        // CFL_PRED
+    PAETH_PRED,     // UV_PAETH_PRED
+    DC_PRED,        // UV_CFL_PRED
+    INTRA_INVALID,  // UV_INTRA_MODES
+    INTRA_INVALID,  // UV_MODE_INVALID
   };
   return uv2y[mode];
 }
-#else
-static INLINE PREDICTION_MODE get_uv_mode(PREDICTION_MODE mode) { return mode; }
-#endif  // CONFIG_CFL
 
 static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
-#if CONFIG_INTRABC
-  if (is_intrabc_block(mbmi)) return 1;
-#endif
-  return mbmi->ref_frame[0] > INTRA_FRAME;
+  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
 }
 
 static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
   return mbmi->ref_frame[1] > INTRA_FRAME;
 }
 
-#if CONFIG_EXT_COMP_REFS
 static INLINE int has_uni_comp_refs(const MB_MODE_INFO *mbmi) {
   return has_second_ref(mbmi) && (!((mbmi->ref_frame[0] >= BWDREF_FRAME) ^
                                     (mbmi->ref_frame[1] >= BWDREF_FRAME)));
@@ -539,48 +321,60 @@ static INLINE int has_uni_comp_refs(const MB_MODE_INFO *mbmi) {
 
 static INLINE MV_REFERENCE_FRAME comp_ref0(int ref_idx) {
   static const MV_REFERENCE_FRAME lut[] = {
-    LAST_FRAME,    // LAST_LAST2_FRAMES,
-    LAST_FRAME,    // LAST_LAST3_FRAMES,
-    LAST_FRAME,    // LAST_GOLDEN_FRAMES,
-    BWDREF_FRAME,  // BWDREF_ALTREF_FRAMES,
+    LAST_FRAME,     // LAST_LAST2_FRAMES,
+    LAST_FRAME,     // LAST_LAST3_FRAMES,
+    LAST_FRAME,     // LAST_GOLDEN_FRAMES,
+    BWDREF_FRAME,   // BWDREF_ALTREF_FRAMES,
+    LAST2_FRAME,    // LAST2_LAST3_FRAMES
+    LAST2_FRAME,    // LAST2_GOLDEN_FRAMES,
+    LAST3_FRAME,    // LAST3_GOLDEN_FRAMES,
+    BWDREF_FRAME,   // BWDREF_ALTREF2_FRAMES,
+    ALTREF2_FRAME,  // ALTREF2_ALTREF_FRAMES,
   };
-  assert(NELEMENTS(lut) == UNIDIR_COMP_REFS);
+  assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
   return lut[ref_idx];
 }
 
 static INLINE MV_REFERENCE_FRAME comp_ref1(int ref_idx) {
   static const MV_REFERENCE_FRAME lut[] = {
-    LAST2_FRAME,   // LAST_LAST2_FRAMES,
-    LAST3_FRAME,   // LAST_LAST3_FRAMES,
-    GOLDEN_FRAME,  // LAST_GOLDEN_FRAMES,
-    ALTREF_FRAME,  // BWDREF_ALTREF_FRAMES,
+    LAST2_FRAME,    // LAST_LAST2_FRAMES,
+    LAST3_FRAME,    // LAST_LAST3_FRAMES,
+    GOLDEN_FRAME,   // LAST_GOLDEN_FRAMES,
+    ALTREF_FRAME,   // BWDREF_ALTREF_FRAMES,
+    LAST3_FRAME,    // LAST2_LAST3_FRAMES
+    GOLDEN_FRAME,   // LAST2_GOLDEN_FRAMES,
+    GOLDEN_FRAME,   // LAST3_GOLDEN_FRAMES,
+    ALTREF2_FRAME,  // BWDREF_ALTREF2_FRAMES,
+    ALTREF_FRAME,   // ALTREF2_ALTREF_FRAMES,
   };
-  assert(NELEMENTS(lut) == UNIDIR_COMP_REFS);
+  assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
   return lut[ref_idx];
 }
-#endif  // CONFIG_EXT_COMP_REFS
 
-PREDICTION_MODE av1_left_block_mode(const MODE_INFO *cur_mi,
-                                    const MODE_INFO *left_mi, int b);
+PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi);
 
-PREDICTION_MODE av1_above_block_mode(const MODE_INFO *cur_mi,
-                                     const MODE_INFO *above_mi, int b);
+PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi);
 
-#if CONFIG_GLOBAL_MOTION
-static INLINE int is_global_mv_block(const MODE_INFO *mi, int block,
+static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi,
                                      TransformationType type) {
-  PREDICTION_MODE mode = get_y_mode(mi, block);
-#if GLOBAL_SUB8X8_USED
-  const int block_size_allowed = 1;
-#else
-  const BLOCK_SIZE bsize = mi->mbmi.sb_type;
+  const PREDICTION_MODE mode = mbmi->mode;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
   const int block_size_allowed =
       AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
-#endif  // GLOBAL_SUB8X8_USED
-  return (mode == ZEROMV || mode == ZERO_ZEROMV) && type > TRANSLATION &&
+  return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
          block_size_allowed;
 }
-#endif  // CONFIG_GLOBAL_MOTION
+
+#if CONFIG_MISMATCH_DEBUG
+static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
+                                   int mi_row, int tx_blk_col, int tx_blk_row,
+                                   int subsampling_x, int subsampling_y) {
+  *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) +
+             (tx_blk_col << tx_size_wide_log2[0]);
+  *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) +
+             (tx_blk_row << tx_size_high_log2[0]);
+}
+#endif
 
 enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
 
@@ -592,8 +386,22 @@ struct buf_2d {
   int stride;
 };
 
+typedef struct eob_info {
+  uint16_t eob;
+  uint16_t max_scan_line;
+} eob_info;
+
+typedef struct {
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]);
+  eob_info eob_data[MAX_MB_PLANE]
+                   [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+  DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
+} CB_BUFFER;
+
 typedef struct macroblockd_plane {
   tran_low_t *dqcoeff;
+  tran_low_t *dqcoeff_block;
+  eob_info *eob_data;
   PLANE_TYPE plane_type;
   int subsampling_x;
   int subsampling_y;
@@ -601,56 +409,36 @@ typedef struct macroblockd_plane {
   struct buf_2d pre[2];
   ENTROPY_CONTEXT *above_context;
   ENTROPY_CONTEXT *left_context;
-  int16_t seg_dequant[MAX_SEGMENTS][2];
-#if CONFIG_NEW_QUANT
-  dequant_val_type_nuq seg_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES]
-                                      [COEF_BANDS];
-#endif
+
+  // The dequantizers below are true dequntizers used only in the
+  // dequantization process.  They have the same coefficient
+  // shift/scale as TX.
+  int16_t seg_dequant_QTX[MAX_SEGMENTS][2];
   uint8_t *color_index_map;
 
-  // number of 4x4s in current block
-  uint16_t n4_w, n4_h;
-  // log2 of n4_w, n4_h
-  uint8_t n4_wl, n4_hl;
   // block size in pixels
   uint8_t width, height;
 
-#if CONFIG_AOM_QM
-  qm_val_t *seg_iqmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL];
-  qm_val_t *seg_qmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL];
-#endif
-  // encoder
-  const int16_t *dequant;
-#if CONFIG_NEW_QUANT
-  const dequant_val_type_nuq *dequant_val_nuq[QUANT_PROFILES];
-#endif  // CONFIG_NEW_QUANT
-
-#if CONFIG_PVQ || CONFIG_DIST_8X8
-  DECLARE_ALIGNED(16, int16_t, pred[MAX_SB_SQUARE]);
-#endif
-#if CONFIG_PVQ
-  // PVQ: forward transformed predicted image, a reference for PVQ.
-  tran_low_t *pvq_ref_coeff;
-#endif
+  qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+
+  // the 'dequantizers' below are not literal dequantizer values.
+  // They're used by encoder RDO to generate ad-hoc lambda values.
+  // They use a hardwired Q3 coeff shift and do not necessarily match
+  // the TX scale in use.
+  const int16_t *dequant_Q3;
 } MACROBLOCKD_PLANE;
 
 #define BLOCK_OFFSET(x, i) \
   ((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0])))
 
 typedef struct RefBuffer {
-  int idx;
+  int idx;      // frame buf idx
+  int map_idx;  // frame map idx
   YV12_BUFFER_CONFIG *buf;
   struct scale_factors sf;
-#if CONFIG_VAR_REFS
-  int is_valid;
-#endif  // CONFIG_VAR_REFS
 } RefBuffer;
 
-#if CONFIG_ADAPT_SCAN
-typedef int16_t EobThresholdMD[TX_TYPES][EOB_THRESHOLD_NUM];
-#endif
-
-#if CONFIG_LOOP_RESTORATION
 typedef struct {
   DECLARE_ALIGNED(16, InterpKernel, vfilter);
   DECLARE_ALIGNED(16, InterpKernel, hfilter);
@@ -660,77 +448,75 @@ typedef struct {
   int ep;
   int xqd[2];
 } SgrprojInfo;
-#endif  // CONFIG_LOOP_RESTORATION
 
-#if CONFIG_CFL
-#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+#if CONFIG_DEBUG
 #define CFL_SUB8X8_VAL_MI_SIZE (4)
 #define CFL_SUB8X8_VAL_MI_SQUARE \
   (CFL_SUB8X8_VAL_MI_SIZE * CFL_SUB8X8_VAL_MI_SIZE)
-#endif  // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+#endif  // CONFIG_DEBUG
+#define CFL_MAX_BLOCK_SIZE (BLOCK_32X32)
+#define CFL_BUF_LINE (32)
+#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
+#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
+#define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE)
 typedef struct cfl_ctx {
-  // The CfL prediction buffer is used in two steps:
-  //   1. Stores Q3 reconstructed luma pixels
-  //      (only Q2 is required, but Q3 is used to avoid shifts)
-  //   2. Stores Q3 AC contributions (step1 - tx block avg)
-  int16_t pred_buf_q3[MAX_SB_SQUARE];
+  // Q3 reconstructed luma pixels (only Q2 is required, but Q3 is used to avoid
+  // shifts)
+  uint16_t recon_buf_q3[CFL_BUF_SQUARE];
+  // Q3 AC contributions (reconstructed luma pixels - tx block avg)
+  int16_t ac_buf_q3[CFL_BUF_SQUARE];
+
+  // Cache the DC_PRED when performing RDO, so it does not have to be recomputed
+  // for every scaling parameter
+  int dc_pred_is_cached[CFL_PRED_PLANES];
+  // The DC_PRED cache is disable when decoding
+  int use_dc_pred_cache;
+  // Only cache the first row of the DC_PRED
+  int16_t dc_pred_cache[CFL_PRED_PLANES][CFL_BUF_LINE];
 
   // Height and width currently used in the CfL prediction buffer.
   int buf_height, buf_width;
 
-  // Height and width of the chroma prediction block currently associated with
-  // this context
-  int uv_height, uv_width;
-
   int are_parameters_computed;
 
   // Chroma subsampling
   int subsampling_x, subsampling_y;
 
-  // Block level DC_PRED for each chromatic plane
-  int dc_pred[CFL_PRED_PLANES];
-
   int mi_row, mi_col;
 
   // Whether the reconstructed luma pixels need to be stored
   int store_y;
 
-#if CONFIG_CB4X4
+#if CONFIG_DEBUG
+  int rate;
+#endif  // CONFIG_DEBUG
+
   int is_chroma_reference;
-#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-  // The prediction used for sub8x8 blocks originates from multiple luma blocks,
-  // this array is used to validate that cfl_store() is called only once for
-  // each luma block
-  uint8_t sub8x8_val[CFL_SUB8X8_VAL_MI_SQUARE];
-#endif  // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-#endif  // CONFIG_CB4X4
 } CFL_CTX;
-#endif  // CONFIG_CFL
+
+typedef struct jnt_comp_params {
+  int use_jnt_comp_avg;
+  int fwd_offset;
+  int bck_offset;
+} JNT_COMP_PARAMS;
 
 typedef struct macroblockd {
   struct macroblockd_plane plane[MAX_MB_PLANE];
-  uint8_t bmode_blocks_wl;
-  uint8_t bmode_blocks_hl;
 
-  FRAME_COUNTS *counts;
   TileInfo tile;
 
   int mi_stride;
 
-  MODE_INFO **mi;
-  MODE_INFO *left_mi;
-  MODE_INFO *above_mi;
+  MB_MODE_INFO **mi;
   MB_MODE_INFO *left_mbmi;
   MB_MODE_INFO *above_mbmi;
+  MB_MODE_INFO *chroma_left_mbmi;
+  MB_MODE_INFO *chroma_above_mbmi;
 
   int up_available;
   int left_available;
-#if CONFIG_CHROMA_SUB8X8
   int chroma_up_available;
   int chroma_left_available;
-#endif
-
-  const aom_prob (*partition_probs)[PARTITION_TYPES - 1];
 
   /* Distance of MB away from frame edges in subpixels (1/8th pixel)  */
   int mb_to_left_edge;
@@ -738,40 +524,24 @@ typedef struct macroblockd {
   int mb_to_top_edge;
   int mb_to_bottom_edge;
 
-  FRAME_CONTEXT *fc;
-
   /* pointers to reference frames */
   const RefBuffer *block_refs[2];
 
   /* pointer to current frame */
   const YV12_BUFFER_CONFIG *cur_buf;
 
-#if CONFIG_INTRABC
-  /* Scale of the current frame with respect to itself */
-  struct scale_factors sf_identity;
-#endif
-
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
-  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][MAX_MIB_SIZE];
 
   PARTITION_CONTEXT *above_seg_context;
   PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE];
 
-#if CONFIG_VAR_TX
   TXFM_CONTEXT *above_txfm_context;
   TXFM_CONTEXT *left_txfm_context;
-  TXFM_CONTEXT left_txfm_context_buffer[2 * MAX_MIB_SIZE];
-
-  TX_SIZE max_tx_size;
-#if CONFIG_SUPERTX
-  TX_SIZE supertx_size;
-#endif
-#endif
+  TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
 
-#if CONFIG_LOOP_RESTORATION
   WienerInfo wiener_info[MAX_MB_PLANE];
   SgrprojInfo sgrproj_info[MAX_MB_PLANE];
-#endif  // CONFIG_LOOP_RESTORATION
 
   // block dimension in the unit of mode_info.
   uint8_t n8_w, n8_h;
@@ -780,9 +550,10 @@ typedef struct macroblockd {
   CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
   uint8_t is_sec_rect;
 
-#if CONFIG_PVQ
-  daala_dec_ctx daala_dec;
-#endif
+  // Counts of each reference frame in the above and left neighboring blocks.
+  // NOTE: Take into account both single and comp references.
+  uint8_t neighbors_ref_counts[REF_FRAMES];
+
   FRAME_CONTEXT *tile_ctx;
   /* Bit depth: 8, 10, 12 */
   int bd;
@@ -790,27 +561,19 @@ typedef struct macroblockd {
   int qindex[MAX_SEGMENTS];
   int lossless[MAX_SEGMENTS];
   int corrupted;
-#if CONFIG_AMVR
-  int cur_frame_mv_precision_level;
-// same with that in AV1_COMMON
-#endif
+  int cur_frame_force_integer_mv;
+  // same with that in AV1_COMMON
   struct aom_internal_error_info *error_info;
-#if CONFIG_GLOBAL_MOTION
-  WarpedMotionParams *global_motion;
-#endif  // CONFIG_GLOBAL_MOTION
-  int prev_qindex;
+  const WarpedMotionParams *global_motion;
   int delta_qindex;
   int current_qindex;
-#if CONFIG_EXT_DELTA_Q
   // Since actual frame level loop filtering level value is not available
   // at the beginning of the tile (only available during actual filtering)
   // at encoder side.we record the delta_lf (against the frame level loop
   // filtering level) and code the delta between previous superblock's delta
   // lf and current delta lf. It is equivalent to the delta between previous
   // superblock's actual lf and current lf.
-  int prev_delta_lf_from_base;
-  int current_delta_lf_from_base;
-#if CONFIG_LOOPFILTER_LEVEL
+  int delta_lf_from_base;
   // For this experiment, we have four frame filter levels for different plane
   // and direction. So, to support the per superblock update, we need to add
   // a few more params as below.
@@ -824,420 +587,151 @@ typedef struct macroblockd {
   // SEG_LVL_ALT_LF_Y_H = 2;
   // SEG_LVL_ALT_LF_U   = 3;
   // SEG_LVL_ALT_LF_V   = 4;
-  int prev_delta_lf[FRAME_LF_COUNT];
-  int curr_delta_lf[FRAME_LF_COUNT];
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif
-#if CONFIG_ADAPT_SCAN
-  const EobThresholdMD *eob_threshold_md;
-#endif
+  int delta_lf[FRAME_LF_COUNT];
+  int cdef_preset[4];
 
-#if CONFIG_COMPOUND_SEGMENT
   DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
-#endif  // CONFIG_COMPOUND_SEGMENT
+  uint8_t *mc_buf[2];
+  CFL_CTX cfl;
 
-#if CONFIG_MRC_TX
-  uint8_t *mrc_mask;
-#endif  // CONFIG_MRC_TX
+  JNT_COMP_PARAMS jcp_param;
 
-#if CONFIG_CFL
-  CFL_CTX *cfl;
-#endif
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  uint8_t *ncobmc_pred_buf[MAX_MB_PLANE];
-  int ncobmc_pred_buf_stride[MAX_MB_PLANE];
-  SB_MI_BD sb_mi_bd;
-#endif
+  uint16_t cb_offset[MAX_MB_PLANE];
+  uint16_t txb_offset[MAX_MB_PLANE];
+  uint16_t color_index_map_offset[2];
 } MACROBLOCKD;
 
 static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
   return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
 }
 
-static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
-                                     PARTITION_TYPE partition) {
-  if (partition == PARTITION_INVALID)
-    return BLOCK_INVALID;
-  else
-    return subsize_lookup[partition][bsize];
-}
-
-static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = {
-  DCT_DCT,    // DC
-  ADST_DCT,   // V
-  DCT_ADST,   // H
-  DCT_DCT,    // D45
-  ADST_ADST,  // D135
-  ADST_DCT,   // D117
-  DCT_ADST,   // D153
-  DCT_ADST,   // D207
-  ADST_DCT,   // D63
-  ADST_ADST,  // SMOOTH
-#if CONFIG_SMOOTH_HV
-  ADST_DCT,   // SMOOTH_V
-  DCT_ADST,   // SMOOTH_H
-#endif        // CONFIG_SMOOTH_HV
-  ADST_ADST,  // TM
-};
+static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
+  switch (bsize) {
+    case BLOCK_4X4: return 0;
+    case BLOCK_8X8: return 1;
+    case BLOCK_16X16: return 2;
+    case BLOCK_32X32: return 3;
+    case BLOCK_64X64: return 4;
+    case BLOCK_128X128: return 5;
+    default: return SQR_BLOCK_SIZES;
+  }
+}
 
-#if CONFIG_SUPERTX
-static INLINE int supertx_enabled(const MB_MODE_INFO *mbmi) {
-  TX_SIZE max_tx_size = txsize_sqr_map[mbmi->tx_size];
-  return tx_size_wide[max_tx_size] >
-         AOMMIN(block_size_wide[mbmi->sb_type], block_size_high[mbmi->sb_type]);
+// Note: the input block size should be square.
+// Otherwise it's considered invalid.
+static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
+                                               PARTITION_TYPE partition) {
+  if (partition == PARTITION_INVALID) {
+    return BLOCK_INVALID;
+  } else {
+    const int sqr_bsize_idx = get_sqr_bsize_idx(bsize);
+    return sqr_bsize_idx >= SQR_BLOCK_SIZES
+               ? BLOCK_INVALID
+               : subsize_lookup[partition][sqr_bsize_idx];
+  }
 }
-#endif  // CONFIG_SUPERTX
 
-#define USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 1
+static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi,
+                                     PLANE_TYPE plane_type) {
+  static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = {
+    DCT_DCT,    // DC
+    ADST_DCT,   // V
+    DCT_ADST,   // H
+    DCT_DCT,    // D45
+    ADST_ADST,  // D135
+    ADST_DCT,   // D117
+    DCT_ADST,   // D153
+    DCT_ADST,   // D207
+    ADST_DCT,   // D63
+    ADST_ADST,  // SMOOTH
+    ADST_DCT,   // SMOOTH_V
+    DCT_ADST,   // SMOOTH_H
+    ADST_ADST,  // PAETH
+  };
+  const PREDICTION_MODE mode =
+      (plane_type == PLANE_TYPE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
+  assert(mode < INTRA_MODES);
+  return _intra_mode_to_tx_type[mode];
+}
 
-#if CONFIG_RECT_TX
 static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; }
-#endif  // CONFIG_RECT_TX
 
 static INLINE int block_signals_txsize(BLOCK_SIZE bsize) {
-#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
   return bsize > BLOCK_4X4;
-#else
-  return bsize >= BLOCK_8X8;
-#endif
 }
 
-#if CONFIG_MRC_TX
-#define USE_MRC_INTRA 0
-#define USE_MRC_INTER 1
-#define SIGNAL_MRC_MASK_INTRA (USE_MRC_INTRA && 0)
-#define SIGNAL_MRC_MASK_INTER (USE_MRC_INTER && 1)
-#define SIGNAL_ANY_MRC_MASK (SIGNAL_MRC_MASK_INTRA || SIGNAL_MRC_MASK_INTER)
-#endif  // CONFIG_MRC_TX
-
-#if CONFIG_EXT_TX
-#define ALLOW_INTRA_EXT_TX 1
-
 // Number of transform types in each set type
 static const int av1_num_ext_tx_set[EXT_TX_SET_TYPES] = {
-  1, 2,
-#if CONFIG_MRC_TX
-  2, 3,
-#endif  // CONFIG_MRC_TX
-  5, 7, 12, 16,
-};
-
-static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
-                                                      EXT_TX_SETS_INTER)] = {
-  {
-      // Intra
-      EXT_TX_SET_DCTONLY, EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX,
-#if CONFIG_MRC_TX
-      EXT_TX_SET_MRC_DCT,
-#endif  // CONFIG_MRC_TX
-  },
-  {
-      // Inter
-      EXT_TX_SET_DCTONLY, EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT,
-      EXT_TX_SET_DCT_IDTX,
-#if CONFIG_MRC_TX
-      EXT_TX_SET_MRC_DCT_IDTX,
-#endif  // CONFIG_MRC_TX
-  }
+  1, 2, 5, 7, 12, 16,
 };
 
-#if CONFIG_MRC_TX
 static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = {
-  {
-      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-  },
-  {
-      1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
-  },
-  {
-      1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
-  },
-  {
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
-  },
-  {
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
-  },
+  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+  { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+  { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
 };
-#else   // CONFIG_MRC_TX
-static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = {
-  {
-      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
-  },
-  {
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-  },
-  {
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  },
-};
-#endif  // CONFIG_MRC_TX
 
-static INLINE TxSetType get_ext_tx_set_type(TX_SIZE tx_size, BLOCK_SIZE bs,
-                                            int is_inter, int use_reduced_set) {
+static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
+                                                int use_reduced_set) {
   const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
-  const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
-#if CONFIG_CB4X4 && USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
-  (void)bs;
   if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
-#else
-  if (tx_size_sqr_up > TX_32X32 || bs < BLOCK_8X8) return EXT_TX_SET_DCTONLY;
-#endif
-  if (use_reduced_set)
-    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
-#if CONFIG_MRC_TX
-  if (tx_size == TX_32X32) {
-    if (is_inter && USE_MRC_INTER)
-      return EXT_TX_SET_MRC_DCT_IDTX;
-    else if (!is_inter && USE_MRC_INTRA)
-      return EXT_TX_SET_MRC_DCT;
-  }
-#endif  // CONFIG_MRC_TX
   if (tx_size_sqr_up == TX_32X32)
     return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
-  if (is_inter)
+  if (use_reduced_set)
+    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
+  const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
+  if (is_inter) {
     return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT
                                     : EXT_TX_SET_ALL16);
-  else
+  } else {
     return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT4_IDTX
                                     : EXT_TX_SET_DTT4_IDTX_1DDCT);
+  }
 }
 
 // Maps tx set types to the indices.
 static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = {
-  {
-      // Intra
-      0, -1,
-#if CONFIG_MRC_TX
-      3, -1,
-#endif  // CONFIG_MRC_TX
-      2, 1, -1, -1,
-  },
-  {
-      // Inter
-      0, 3,
-#if CONFIG_MRC_TX
-      -1, 4,
-#endif  // CONFIG_MRC_TX
-      -1, -1, 2, 1,
-  },
+  { // Intra
+    0, -1, 2, 1, -1, -1 },
+  { // Inter
+    0, 3, -1, -1, 2, 1 },
 };
 
-static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs, int is_inter,
+static INLINE int get_ext_tx_set(TX_SIZE tx_size, int is_inter,
                                  int use_reduced_set) {
   const TxSetType set_type =
-      get_ext_tx_set_type(tx_size, bs, is_inter, use_reduced_set);
+      av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
   return ext_tx_set_index[is_inter][set_type];
 }
 
-static INLINE int get_ext_tx_types(TX_SIZE tx_size, BLOCK_SIZE bs, int is_inter,
+static INLINE int get_ext_tx_types(TX_SIZE tx_size, int is_inter,
                                    int use_reduced_set) {
   const int set_type =
-      get_ext_tx_set_type(tx_size, bs, is_inter, use_reduced_set);
+      av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
   return av1_num_ext_tx_set[set_type];
 }
 
-#if CONFIG_LGT_FROM_PRED
-static INLINE int is_lgt_allowed(PREDICTION_MODE mode, TX_SIZE tx_size) {
-  if (!LGT_FROM_PRED_INTRA && !is_inter_mode(mode)) return 0;
-  if (!LGT_FROM_PRED_INTER && is_inter_mode(mode)) return 0;
-
-  switch (mode) {
-    case D45_PRED:
-    case D63_PRED:
-    case D117_PRED:
-    case V_PRED:
-#if CONFIG_SMOOTH_HV
-    case SMOOTH_V_PRED:
-#endif
-      return tx_size_wide[tx_size] <= 8;
-    case D135_PRED:
-    case D153_PRED:
-    case D207_PRED:
-    case H_PRED:
-#if CONFIG_SMOOTH_HV
-    case SMOOTH_H_PRED:
-#endif
-      return tx_size_high[tx_size] <= 8;
-    case DC_PRED:
-    case SMOOTH_PRED: return 0;
-    case TM_PRED:
-    default: return tx_size_wide[tx_size] <= 8 || tx_size_high[tx_size] <= 8;
-  }
-}
-#endif  // CONFIG_LGT_FROM_PRED
-
-#if CONFIG_RECT_TX
-static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
-  static const char LUT[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    0,  // BLOCK_2X2
-    0,  // BLOCK_2X4
-    0,  // BLOCK_4X2
-#endif
-    0,  // BLOCK_4X4
-    1,  // BLOCK_4X8
-    1,  // BLOCK_8X4
-    0,  // BLOCK_8X8
-    1,  // BLOCK_8X16
-    1,  // BLOCK_16X8
-    0,  // BLOCK_16X16
-    1,  // BLOCK_16X32
-    1,  // BLOCK_32X16
-    0,  // BLOCK_32X32
-    1,  // BLOCK_32X64
-    1,  // BLOCK_64X32
-    0,  // BLOCK_64X64
-#if CONFIG_EXT_PARTITION
-    0,  // BLOCK_64X128
-    0,  // BLOCK_128X64
-    0,  // BLOCK_128X128
-#endif  // CONFIG_EXT_PARTITION
-    0,  // BLOCK_4X16
-    0,  // BLOCK_16X4
-    0,  // BLOCK_8X32
-    0,  // BLOCK_32X8
-    0,  // BLOCK_16X64
-    0,  // BLOCK_64X16
-#if CONFIG_EXT_PARTITION
-    0,  // BLOCK_32X128
-    0,  // BLOCK_128X32
-#endif  // CONFIG_EXT_PARTITION
-  };
-
-  return LUT[bsize];
-}
+#define TXSIZEMAX(t1, t2) (tx_size_2d[(t1)] >= tx_size_2d[(t2)] ? (t1) : (t2))
+#define TXSIZEMIN(t1, t2) (tx_size_2d[(t1)] <= tx_size_2d[(t2)] ? (t1) : (t2))
 
-static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd,
-                                     const MB_MODE_INFO *mbmi) {
-  return is_rect_tx_allowed_bsize(mbmi->sb_type) &&
-         !xd->lossless[mbmi->segment_id];
-}
-#endif  // CONFIG_RECT_TX
-#endif  // CONFIG_EXT_TX
-
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-static INLINE int is_quarter_tx_allowed_bsize(BLOCK_SIZE bsize) {
-  static const char LUT_QTTX[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    0,  // BLOCK_2X2
-    0,  // BLOCK_2X4
-    0,  // BLOCK_4X2
-#endif
-    0,  // BLOCK_4X4
-    0,  // BLOCK_4X8
-    0,  // BLOCK_8X4
-    0,  // BLOCK_8X8
-    1,  // BLOCK_8X16
-    1,  // BLOCK_16X8
-    0,  // BLOCK_16X16
-    0,  // BLOCK_16X32
-    0,  // BLOCK_32X16
-    0,  // BLOCK_32X32
-    0,  // BLOCK_32X64
-    0,  // BLOCK_64X32
-    0,  // BLOCK_64X64
-#if CONFIG_EXT_PARTITION
-    0,  // BLOCK_64X128
-    0,  // BLOCK_128X64
-    0,  // BLOCK_128X128
-#endif  // CONFIG_EXT_PARTITION
-    0,  // BLOCK_4X16
-    0,  // BLOCK_16X4
-    0,  // BLOCK_8X32
-    0,  // BLOCK_32X8
-    0,  // BLOCK_16X64
-    0,  // BLOCK_64X16
-#if CONFIG_EXT_PARTITION
-    0,  // BLOCK_32X128
-    0,  // BLOCK_128X32
-#endif  // CONFIG_EXT_PARTITION
-  };
-
-  return LUT_QTTX[bsize];
-}
-
-static INLINE int is_quarter_tx_allowed(const MACROBLOCKD *xd,
-                                        const MB_MODE_INFO *mbmi,
-                                        int is_inter) {
-  return is_quarter_tx_allowed_bsize(mbmi->sb_type) && is_inter &&
-         !xd->lossless[mbmi->segment_id];
-}
-#endif
-
-static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode,
-                                           int is_inter) {
+static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) {
   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
-#if (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
   const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize];
-#else
-  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
-#endif  // (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
-  (void)is_inter;
-#if CONFIG_VAR_TX && CONFIG_RECT_TX
-#if CONFIG_CB4X4
   if (bsize == BLOCK_4X4)
     return AOMMIN(max_txsize_lookup[bsize], largest_tx_size);
-#else
-  if (bsize < BLOCK_8X8)
-    return AOMMIN(max_txsize_lookup[bsize], largest_tx_size);
-#endif
   if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size)
     return max_rect_tx_size;
   else
     return largest_tx_size;
-#elif CONFIG_EXT_TX && CONFIG_RECT_TX
-  if (txsize_sqr_up_map[max_rect_tx_size] <= largest_tx_size) {
-    return max_rect_tx_size;
-  } else {
-    return largest_tx_size;
-  }
-#else
-  return AOMMIN(max_tx_size, largest_tx_size);
-#endif  // CONFIG_VAR_TX && CONFIG_RECT_TX
 }
 
-#if CONFIG_EXT_INTRA
-#define MAX_ANGLE_DELTA 3
-#define ANGLE_STEP 3
 extern const int16_t dr_intra_derivative[90];
 static const uint8_t mode_to_angle_map[] = {
-  0, 90, 180, 45, 135, 111, 157, 203, 67, 0, 0,
-#if CONFIG_SMOOTH_HV
-  0, 0,
-#endif  // CONFIG_SMOOTH_HV
+  0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0,
 };
-#if CONFIG_INTRA_INTERP
-// Returns whether filter selection is needed for a given
-// intra prediction angle.
-int av1_is_intra_filter_switchable(int angle);
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-
-#if CONFIG_DCT_ONLY
-#define FIXED_TX_TYPE 1
-#else
-#define FIXED_TX_TYPE 0
-#endif
 
 // Converts block_index for given transform size to index of the block in raster
 // order.
@@ -1261,168 +755,182 @@ static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size,
 }
 
 static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
-                                          const MACROBLOCKD *xd, int block_idx,
+                                          const MACROBLOCKD *xd,
                                           TX_SIZE tx_size) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
 
-  if (CONFIG_DCT_ONLY || is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
+  if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
       xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
     return DCT_DCT;
 
-  return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y
-                                           ? get_y_mode(xd->mi[0], block_idx)
-                                           : get_uv_mode(mbmi->uv_mode)];
+  return intra_mode_to_tx_type(mbmi, plane_type);
+}
+
+static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
+                                              int subsampling_x,
+                                              int subsampling_y) {
+  if (bsize == BLOCK_INVALID) return BLOCK_INVALID;
+  return ss_size_lookup[bsize][subsampling_x][subsampling_y];
+}
+
+static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row,
+                                         int blk_col) {
+  TX_SIZE txs = max_txsize_rect_lookup[bsize];
+  for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level)
+    txs = sub_tx_size_map[txs];
+  const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+  const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+  const int bw_log2 = mi_size_wide_log2[bsize];
+  const int stride_log2 = bw_log2 - tx_w_log2;
+  const int index =
+      ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2);
+  assert(index < INTER_TX_SIZE_BUF_LEN);
+  return index;
+}
+
+static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row,
+                                         int blk_col) {
+  TX_SIZE txs = max_txsize_rect_lookup[bsize];
+  for (int level = 0; level < MAX_VARTX_DEPTH; ++level)
+    txs = sub_tx_size_map[txs];
+  const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+  const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+  const int bw_uint_log2 = mi_size_wide_log2[bsize];
+  const int stride_log2 = bw_uint_log2 - tx_w_log2;
+  const int index =
+      ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2);
+  assert(index < TXK_TYPE_BUF_LEN);
+  return index;
+}
+
+static INLINE void update_txk_array(TX_TYPE *txk_type, BLOCK_SIZE bsize,
+                                    int blk_row, int blk_col, TX_SIZE tx_size,
+                                    TX_TYPE tx_type) {
+  const int txk_type_idx = av1_get_txk_type_index(bsize, blk_row, blk_col);
+  txk_type[txk_type_idx] = tx_type;
+
+  const int txw = tx_size_wide_unit[tx_size];
+  const int txh = tx_size_high_unit[tx_size];
+  // The 16x16 unit is due to the constraint from tx_64x64 which sets the
+  // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block
+  // size, the constraint takes effect in 32x16 / 16x32 size too. To solve
+  // the intricacy, cover all the 16x16 units inside a 64 level transform.
+  if (txw == tx_size_wide_unit[TX_64X64] ||
+      txh == tx_size_high_unit[TX_64X64]) {
+    const int tx_unit = tx_size_wide_unit[TX_16X16];
+    for (int idy = 0; idy < txh; idy += tx_unit) {
+      for (int idx = 0; idx < txw; idx += tx_unit) {
+        const int this_index =
+            av1_get_txk_type_index(bsize, blk_row + idy, blk_col + idx);
+        txk_type[this_index] = tx_type;
+      }
+    }
+  }
 }
 
 static INLINE TX_TYPE av1_get_tx_type(PLANE_TYPE plane_type,
                                       const MACROBLOCKD *xd, int blk_row,
-                                      int blk_col, int block, TX_SIZE tx_size) {
-  const MODE_INFO *const mi = xd->mi[0];
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
-  (void)blk_row;
-  (void)blk_col;
-#if CONFIG_INTRABC && (!CONFIG_EXT_TX || CONFIG_TXK_SEL)
-  // TODO(aconverse@google.com): Handle INTRABC + EXT_TX + TXK_SEL
-  if (is_intrabc_block(mbmi)) return DCT_DCT;
-#endif  // CONFIG_INTRABC && (!CONFIG_EXT_TX || CONFIG_TXK_SEL)
-
-#if CONFIG_TXK_SEL
+                                      int blk_col, TX_SIZE tx_size,
+                                      int reduced_tx_set) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const struct macroblockd_plane *const pd = &xd->plane[plane_type];
+  const TxSetType tx_set_type =
+      av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
+
   TX_TYPE tx_type;
-  if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] >= TX_32X32) {
+  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
     tx_type = DCT_DCT;
   } else {
-    if (plane_type == PLANE_TYPE_Y)
-      tx_type = mbmi->txk_type[(blk_row << 4) + blk_col];
-    else if (is_inter_block(mbmi))
-      tx_type = mbmi->txk_type[(blk_row << 5) + (blk_col << 1)];
-    else
-      tx_type = intra_mode_to_tx_type_context[mbmi->uv_mode];
-  }
-  assert(tx_type >= DCT_DCT && tx_type < TX_TYPES);
-  return tx_type;
-#endif  // CONFIG_TXK_SEL
-
-#if FIXED_TX_TYPE
-  const int block_raster_idx = av1_block_index_to_raster_order(tx_size, block);
-  return get_default_tx_type(plane_type, xd, block_raster_idx, tx_size);
-#endif  // FIXED_TX_TYPE
-
-#if CONFIG_EXT_TX
-#if CONFIG_MRC_TX
-  if (mbmi->tx_type == MRC_DCT) {
-    assert(((is_inter_block(mbmi) && USE_MRC_INTER) ||
-            (!is_inter_block(mbmi) && USE_MRC_INTRA)) &&
-           "INVALID BLOCK TYPE FOR MRC_DCT");
     if (plane_type == PLANE_TYPE_Y) {
-      assert(tx_size == TX_32X32);
-      return mbmi->tx_type;
+      const int txk_type_idx =
+          av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+      tx_type = mbmi->txk_type[txk_type_idx];
+    } else if (is_inter_block(mbmi)) {
+      // scale back to y plane's coordinate
+      blk_row <<= pd->subsampling_y;
+      blk_col <<= pd->subsampling_x;
+      const int txk_type_idx =
+          av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+      tx_type = mbmi->txk_type[txk_type_idx];
+    } else {
+      // In intra mode, uv planes don't share the same prediction mode as y
+      // plane, so the tx_type should not be shared
+      tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV);
     }
-    return DCT_DCT;
   }
-#endif  // CONFIG_MRC_TX
-  if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] > TX_32X32 ||
-      (txsize_sqr_map[tx_size] >= TX_32X32 && !is_inter_block(mbmi)))
-    return DCT_DCT;
-  if (mbmi->sb_type >= BLOCK_8X8 || CONFIG_CB4X4) {
-    if (plane_type == PLANE_TYPE_Y) {
-#if !ALLOW_INTRA_EXT_TX
-      if (is_inter_block(mbmi))
-#endif  // ALLOW_INTRA_EXT_TX
-        return mbmi->tx_type;
-    }
+  assert(tx_type < TX_TYPES);
+  if (!av1_ext_tx_used[tx_set_type][tx_type]) return DCT_DCT;
+  return tx_type;
+}
 
-    if (is_inter_block(mbmi)) {
-// UV Inter only
-#if CONFIG_CHROMA_2X2
-      if (tx_size < TX_4X4) return DCT_DCT;
-#endif
-      return (mbmi->tx_type == IDTX && txsize_sqr_map[tx_size] >= TX_32X32)
-                 ? DCT_DCT
-                 : mbmi->tx_type;
-    }
-  }
+void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
+                            const int num_planes);
 
-#if CONFIG_CB4X4
-  (void)block;
-#if CONFIG_CHROMA_2X2
-  if (tx_size < TX_4X4)
-    return DCT_DCT;
-  else
-#endif  // CONFIG_CHROMA_2X2
-    return intra_mode_to_tx_type_context[get_uv_mode(mbmi->uv_mode)];
-#else   // CONFIG_CB4X4
-  // Sub8x8-Inter/Intra OR UV-Intra
-  if (is_inter_block(mbmi)) {  // Sub8x8-Inter
-    return DCT_DCT;
-  } else {  // Sub8x8 Intra OR UV-Intra
-    const int block_raster_idx =
-        av1_block_index_to_raster_order(tx_size, block);
-    return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y
-                                             ? get_y_mode(mi, block_raster_idx)
-                                             : get_uv_mode(mbmi->uv_mode)];
-  }
-#endif  // CONFIG_CB4X4
-#else   // CONFIG_EXT_TX
-  (void)block;
-#if CONFIG_MRC_TX
-  if (mbmi->tx_type == MRC_DCT) {
-    if (plane_type == PLANE_TYPE_Y && !xd->lossless[mbmi->segment_id]) {
-      assert(tx_size == TX_32X32);
-      return mbmi->tx_type;
-    }
-    return DCT_DCT;
+static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) {
+  TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+  int depth = 0;
+  while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) {
+    depth++;
+    tx_size = sub_tx_size_map[tx_size];
   }
-#endif  // CONFIG_MRC_TX
-  if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] ||
-      txsize_sqr_map[tx_size] >= TX_32X32)
-    return DCT_DCT;
-  return mbmi->tx_type;
-#endif  // CONFIG_EXT_TX
+  return depth;
 }
 
-void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
-
-static INLINE int tx_size_to_depth(TX_SIZE tx_size) {
-  return (int)(tx_size - TX_SIZE_LUMA_MIN);
+static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) {
+  TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+  assert(tx_size != TX_4X4);
+  int depth = 0;
+  while (tx_size != TX_4X4) {
+    depth++;
+    tx_size = sub_tx_size_map[tx_size];
+    assert(depth < 10);
+  }
+  assert(depth <= MAX_TX_CATS);
+  return depth - 1;
 }
 
-static INLINE TX_SIZE depth_to_tx_size(int depth) {
-  return (TX_SIZE)(depth + TX_SIZE_LUMA_MIN);
+static INLINE TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) {
+  TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+  TX_SIZE tx_size = max_tx_size;
+  for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size];
+  return tx_size;
 }
 
-static INLINE TX_SIZE av1_get_uv_tx_size(const MB_MODE_INFO *mbmi,
-                                         const struct macroblockd_plane *pd) {
-#if CONFIG_CHROMA_2X2
-  assert(mbmi->tx_size > TX_2X2);
-#endif  // CONFIG_CHROMA_2X2
-
-#if CONFIG_SUPERTX
-  if (supertx_enabled(mbmi))
-    return uvsupertx_size_lookup[txsize_sqr_map[mbmi->tx_size]]
-                                [pd->subsampling_x][pd->subsampling_y];
-#endif  // CONFIG_SUPERTX
+static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
+  switch (tx_size) {
+    case TX_64X64:
+    case TX_64X32:
+    case TX_32X64: return TX_32X32;
+    case TX_64X16: return TX_32X16;
+    case TX_16X64: return TX_16X32;
+    default: return tx_size;
+  }
+}
 
-  const TX_SIZE uv_txsize =
-      uv_txsize_lookup[mbmi->sb_type][mbmi->tx_size][pd->subsampling_x]
-                      [pd->subsampling_y];
-  assert(uv_txsize != TX_INVALID);
-  return uv_txsize;
+static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x,
+                                            int subsampling_y) {
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, subsampling_x, subsampling_y);
+  assert(plane_bsize < BLOCK_SIZES_ALL);
+  const TX_SIZE uv_tx = max_txsize_rect_lookup[plane_bsize];
+  return av1_get_adjusted_tx_size(uv_tx);
 }
 
 static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) {
-  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  if (xd->lossless[mbmi->segment_id]) return TX_4X4;
   if (plane == 0) return mbmi->tx_size;
   const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
-  return av1_get_uv_tx_size(mbmi, pd);
-}
-
-static INLINE BLOCK_SIZE
-get_plane_block_size(BLOCK_SIZE bsize, const struct macroblockd_plane *pd) {
-  return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
+  return av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                               pd->subsampling_y);
 }
 
 void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
-                            BLOCK_SIZE bsize);
+                            BLOCK_SIZE bsize, const int num_planes);
+
+void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes);
+
+void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes);
 
 typedef void (*foreach_transformed_block_visitor)(int plane, int block,
                                                   int blk_row, int blk_col,
@@ -1433,54 +941,31 @@ void av1_foreach_transformed_block_in_plane(
     const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
     foreach_transformed_block_visitor visit, void *arg);
 
-#if CONFIG_LV_MAP
 void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
                                    foreach_transformed_block_visitor visit,
-                                   void *arg);
-#endif
-
-#if CONFIG_COEF_INTERLEAVE
-static INLINE int get_max_4x4_size(int num_4x4, int mb_to_edge,
-                                   int subsampling) {
-  return num_4x4 + (mb_to_edge >= 0 ? 0 : mb_to_edge >> (5 + subsampling));
-}
-
-void av1_foreach_transformed_block_interleave(
-    const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
-    foreach_transformed_block_visitor visit, void *arg);
-#endif
+                                   void *arg, const int num_planes);
 
 void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
-                      int plane, TX_SIZE tx_size, int has_eob, int aoff,
-                      int loff);
+                      int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                      int has_eob, int aoff, int loff);
+
+#define MAX_INTERINTRA_SB_SQUARE 32 * 32
+static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) {
+  return (mbmi->ref_frame[0] > INTRA_FRAME &&
+          mbmi->ref_frame[1] == INTRA_FRAME);
+}
 
 static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
-#if CONFIG_INTERINTRA
-  // TODO(debargha): Should this be bsize < BLOCK_LARGEST?
-  return (bsize >= BLOCK_8X8) && (bsize < BLOCK_64X64);
-#else
-  (void)bsize;
-  return 0;
-#endif  // CONFIG_INTERINTRA
+  return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32);
 }
 
 static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
-#if CONFIG_INTERINTRA
   return (mode >= NEARESTMV) && (mode <= NEWMV);
-#else
-  (void)mode;
-  return 0;
-#endif  // CONFIG_INTERINTRA
 }
 
 static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
-#if CONFIG_INTERINTRA
   return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
-#else
-  (void)rf;
-  return 0;
-#endif  // CONFIG_INTERINTRA
 }
 
 static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
@@ -1501,54 +986,30 @@ static INLINE int is_interintra_allowed_bsize_group(int group) {
 }
 
 static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) {
-  return (mbmi->ref_frame[1] == INTRA_FRAME) && is_interintra_allowed(mbmi);
-}
-
-#if CONFIG_VAR_TX
-static INLINE int get_vartx_max_txsize(const MB_MODE_INFO *const mbmi,
-                                       BLOCK_SIZE bsize, int subsampled) {
-#if CONFIG_CB4X4
-  (void)mbmi;
-  TX_SIZE max_txsize = max_txsize_rect_lookup[bsize];
-#else
-  TX_SIZE max_txsize = mbmi->sb_type < BLOCK_8X8
-                           ? max_txsize_rect_lookup[mbmi->sb_type]
-                           : max_txsize_rect_lookup[bsize];
-#endif  // CONFIG_C4X4
-
-#if CONFIG_EXT_PARTITION && CONFIG_TX64X64
-  // The decoder is designed so that it can process 64x64 luma pixels at a
-  // time. If this is a chroma plane with subsampling and bsize corresponds to
-  // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. That
-  // mustn't be used for the subsampled plane (because it would be bigger than
-  // a 64x64 luma block) so we round down to TX_32X32.
-  if (subsampled && max_txsize == TX_64X64) max_txsize = TX_32X32;
-#else
-  (void)subsampled;
-#endif
+  return mbmi->ref_frame[0] > INTRA_FRAME &&
+         mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi);
+}
 
-  return max_txsize;
+static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                       int plane) {
+  if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
+  const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize];
+  if (plane == 0) return max_txsize;            // luma
+  return av1_get_adjusted_tx_size(max_txsize);  // chroma
 }
-#endif  // CONFIG_VAR_TX
 
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
   return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
 }
 
 static INLINE int is_motion_variation_allowed_compound(
     const MB_MODE_INFO *mbmi) {
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi) && !is_inter_singleref_comp_mode(mbmi->mode))
-#else
   if (!has_second_ref(mbmi))
-#endif  // CONFIG_COMPOUND_SINGLEREF
     return 1;
   else
     return 0;
 }
 
-#if CONFIG_MOTION_VAR
 // input: log2 of length, 0(4), 1(8), ...
 static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 };
 
@@ -1556,102 +1017,53 @@ static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) {
   return !(mbmi->overlappable_neighbors[0] == 0 &&
            mbmi->overlappable_neighbors[1] == 0);
 }
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-static INLINE NCOBMC_MODE ncobmc_mode_allowed_bsize(BLOCK_SIZE bsize) {
-  if (bsize < BLOCK_8X8 || bsize >= BLOCK_64X64)
-    return NO_OVERLAP;
-  else
-    return MAX_NCOBMC_MODES;
-}
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR
 
-static INLINE MOTION_MODE motion_mode_allowed(
-#if CONFIG_GLOBAL_MOTION
-    int block, const WarpedMotionParams *gm_params,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-    const MACROBLOCKD *xd,
-#endif
-    const MODE_INFO *mi) {
-  const MB_MODE_INFO *mbmi = &mi->mbmi;
-#if CONFIG_AMVR
-  if (xd->cur_frame_mv_precision_level == 0) {
-#endif
-#if CONFIG_GLOBAL_MOTION
+static INLINE MOTION_MODE
+motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
+                    const MB_MODE_INFO *mbmi, int allow_warped_motion) {
+  if (xd->cur_frame_force_integer_mv == 0) {
     const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype;
-    if (is_global_mv_block(mi, block, gm_type)) return SIMPLE_TRANSLATION;
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_AMVR
+    if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION;
   }
-#endif
   if (is_motion_variation_allowed_bsize(mbmi->sb_type) &&
       is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME &&
       is_motion_variation_allowed_compound(mbmi)) {
-#if CONFIG_MOTION_VAR
     if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
-#endif
-#if CONFIG_WARPED_MOTION
-    if (!has_second_ref(mbmi) && mbmi->num_proj_ref[0] >= 1 &&
-        !av1_is_scaled(&(xd->block_refs[0]->sf))) {
-#if CONFIG_AMVR
-      if (xd->cur_frame_mv_precision_level) {
+    assert(!has_second_ref(mbmi));
+    if (mbmi->num_proj_ref[0] >= 1 &&
+        (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) {
+      if (xd->cur_frame_force_integer_mv) {
         return OBMC_CAUSAL;
       }
-#endif
       return WARPED_CAUSAL;
     }
-
-#endif  // CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-    if (ncobmc_mode_allowed_bsize(mbmi->sb_type) < NO_OVERLAP)
-      return NCOBMC_ADAPT_WEIGHT;
-    else
-#endif
-      return OBMC_CAUSAL;
-#else
-    return SIMPLE_TRANSLATION;
-#endif  // CONFIG_MOTION_VAR
+    return OBMC_CAUSAL;
   } else {
     return SIMPLE_TRANSLATION;
   }
 }
 
 static INLINE void assert_motion_mode_valid(MOTION_MODE mode,
-#if CONFIG_GLOBAL_MOTION
-                                            int block,
                                             const WarpedMotionParams *gm_params,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
                                             const MACROBLOCKD *xd,
-#endif
-                                            const MODE_INFO *mi) {
-  const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
-#if CONFIG_GLOBAL_MOTION
-      block, gm_params,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-      xd,
-#endif
-      mi);
+                                            const MB_MODE_INFO *mbmi,
+                                            int allow_warped_motion) {
+  const MOTION_MODE last_motion_mode_allowed =
+      motion_mode_allowed(gm_params, xd, mbmi, allow_warped_motion);
 
   // Check that the input mode is not illegal
   if (last_motion_mode_allowed < mode)
     assert(0 && "Illegal motion mode selected");
 }
 
-#if CONFIG_MOTION_VAR
 static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
   return (is_inter_block(mbmi));
 }
-#endif  // CONFIG_MOTION_VAR
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
 static INLINE int av1_allow_palette(int allow_screen_content_tools,
                                     BLOCK_SIZE sb_type) {
-  return allow_screen_content_tools && sb_type >= BLOCK_8X8 &&
-         sb_type <= BLOCK_LARGEST;
+  return allow_screen_content_tools && block_size_wide[sb_type] <= 64 &&
+         block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8;
 }
 
 // Returns sub-sampled dimensions of the given block.
@@ -1677,10 +1089,21 @@ static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane,
   assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0));
   assert(block_width >= block_cols);
   assert(block_height >= block_rows);
-  if (width) *width = block_width >> pd->subsampling_x;
-  if (height) *height = block_height >> pd->subsampling_y;
-  if (rows_within_bounds) *rows_within_bounds = block_rows >> pd->subsampling_y;
-  if (cols_within_bounds) *cols_within_bounds = block_cols >> pd->subsampling_x;
+  const int plane_block_width = block_width >> pd->subsampling_x;
+  const int plane_block_height = block_height >> pd->subsampling_y;
+  // Special handling for chroma sub8x8.
+  const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4;
+  const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4;
+  if (width) *width = plane_block_width + 2 * is_chroma_sub8_x;
+  if (height) *height = plane_block_height + 2 * is_chroma_sub8_y;
+  if (rows_within_bounds) {
+    *rows_within_bounds =
+        (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y;
+  }
+  if (cols_within_bounds) {
+    *cols_within_bounds =
+        (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x;
+  }
 }
 
 /* clang-format off */
@@ -1701,39 +1124,22 @@ typedef struct {
   ColorCost color_cost;
 } Av1ColorMapParam;
 
-#if CONFIG_GLOBAL_MOTION
-static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd) {
-  const MODE_INFO *mi = xd->mi[0];
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd,
+                                            const MB_MODE_INFO *mbmi) {
   int ref;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
 
-  // First check if all modes are ZEROMV
-  if (mbmi->sb_type >= BLOCK_8X8 || unify_bsize) {
-    if (mbmi->mode != ZEROMV && mbmi->mode != ZERO_ZEROMV) return 0;
-  } else {
-    if ((mi->bmi[0].as_mode != ZEROMV && mi->bmi[0].as_mode != ZERO_ZEROMV) ||
-        (mi->bmi[1].as_mode != ZEROMV && mi->bmi[1].as_mode != ZERO_ZEROMV) ||
-        (mi->bmi[2].as_mode != ZEROMV && mi->bmi[2].as_mode != ZERO_ZEROMV) ||
-        (mi->bmi[3].as_mode != ZEROMV && mi->bmi[3].as_mode != ZERO_ZEROMV))
-      return 0;
-  }
+  // First check if all modes are GLOBALMV
+  if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0;
 
-#if !GLOBAL_SUB8X8_USED
-  if (mbmi->sb_type < BLOCK_8X8) return 0;
-#endif
+  if (AOMMIN(mi_size_wide[mbmi->sb_type], mi_size_high[mbmi->sb_type]) < 2)
+    return 0;
 
   // Now check if all global motion is non translational
   for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
-    if (xd->global_motion[mbmi->ref_frame[ref]].wmtype <= TRANSLATION) return 0;
+    if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION) return 0;
   }
   return 1;
 }
-#endif  // CONFIG_GLOBAL_MOTION
 
 static INLINE PLANE_TYPE get_plane_type(int plane) {
   return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
@@ -1771,6 +1177,16 @@ static INLINE void transpose_int32(int32_t *dst, int dst_stride,
     for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
 }
 
+static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
+  if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) {
+    return 1024;
+  }
+  if (tx_size == TX_16X64 || tx_size == TX_64X16) {
+    return 512;
+  }
+  return tx_size_2d[tx_size];
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c
index 397a14845..c9b974900 100644
--- a/third_party/aom/av1/common/cdef.c
+++ b/third_party/aom/av1/common/cdef.c
@@ -13,7 +13,8 @@
 #include <math.h>
 #include <string.h>
 
-#include "./aom_scale_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "av1/common/cdef.h"
 #include "av1/common/cdef_block.h"
@@ -21,7 +22,6 @@
 #include "av1/common/reconinter.h"
 
 int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) {
-  int r, c;
   int maxc, maxr;
   int skip = 1;
   maxc = cm->mi_cols - mi_col;
@@ -30,38 +30,40 @@ int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) {
   maxr = AOMMIN(maxr, MI_SIZE_64X64);
   maxc = AOMMIN(maxc, MI_SIZE_64X64);
 
-  for (r = 0; r < maxr; r++) {
-    for (c = 0; c < maxc; c++) {
-      skip = skip &&
-             cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]
-                 ->mbmi.skip;
+  for (int r = 0; r < maxr; r++) {
+    for (int c = 0; c < maxc; c++) {
+      skip =
+          skip &&
+          cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]->skip;
     }
   }
   return skip;
 }
 
-static int is_8x8_block_skip(MODE_INFO **grid, int mi_row, int mi_col,
+static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col,
                              int mi_stride) {
   int is_skip = 1;
   for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r)
     for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c)
-      is_skip &= grid[(mi_row + r) * mi_stride + (mi_col + c)]->mbmi.skip;
+      is_skip &= grid[(mi_row + r) * mi_stride + (mi_col + c)]->skip;
 
   return is_skip;
 }
 
 int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
-                         cdef_list *dlist, int filter_skip) {
-  int r, c;
-  int maxc, maxr;
-  MODE_INFO **grid;
-  int count = 0;
-  grid = cm->mi_grid_visible;
-  maxc = cm->mi_cols - mi_col;
-  maxr = cm->mi_rows - mi_row;
+                         cdef_list *dlist, BLOCK_SIZE bs) {
+  MB_MODE_INFO **grid = cm->mi_grid_visible;
+  int maxc = cm->mi_cols - mi_col;
+  int maxr = cm->mi_rows - mi_row;
 
-  maxr = AOMMIN(maxr, MI_SIZE_64X64);
-  maxc = AOMMIN(maxc, MI_SIZE_64X64);
+  if (bs == BLOCK_128X128 || bs == BLOCK_128X64)
+    maxc = AOMMIN(maxc, MI_SIZE_128X128);
+  else
+    maxc = AOMMIN(maxc, MI_SIZE_64X64);
+  if (bs == BLOCK_128X128 || bs == BLOCK_64X128)
+    maxr = AOMMIN(maxr, MI_SIZE_128X128);
+  else
+    maxr = AOMMIN(maxr, MI_SIZE_64X64);
 
   const int r_step = mi_size_high[BLOCK_8X8];
   const int c_step = mi_size_wide[BLOCK_8X8];
@@ -71,36 +73,25 @@ int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
   assert(r_step == 1 || r_step == 2);
   assert(c_step == 1 || c_step == 2);
 
-  if (filter_skip) {
-    for (r = 0; r < maxr; r += r_step) {
-      for (c = 0; c < maxc; c += c_step) {
+  int count = 0;
+
+  for (int r = 0; r < maxr; r += r_step) {
+    for (int c = 0; c < maxc; c += c_step) {
+      if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) {
         dlist[count].by = r >> r_shift;
         dlist[count].bx = c >> c_shift;
-        dlist[count].skip =
-            is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride);
+        dlist[count].skip = 0;
         count++;
       }
     }
-  } else {
-    for (r = 0; r < maxr; r += r_step) {
-      for (c = 0; c < maxc; c += c_step) {
-        if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) {
-          dlist[count].by = r >> r_shift;
-          dlist[count].bx = c >> c_shift;
-          dlist[count].skip = 0;
-          count++;
-        }
-      }
-    }
   }
   return count;
 }
 
 void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
                                 int sstride, int v, int h) {
-  int i, j;
-  for (i = 0; i < v; i++) {
-    for (j = 0; j < h; j++) {
+  for (int i = 0; i < v; i++) {
+    for (int j = 0; j < h; j++) {
       dst[i * dstride + j] = src[i * sstride + j];
     }
   }
@@ -109,36 +100,30 @@ void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
 void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
                                  const uint16_t *src, int sstride, int v,
                                  int h) {
-  int i, j;
-  for (i = 0; i < v; i++) {
-    for (j = 0; j < h; j++) {
+  for (int i = 0; i < v; i++) {
+    for (int j = 0; j < h; j++) {
       dst[i * dstride + j] = src[i * sstride + j];
     }
   }
 }
 
-static void copy_sb8_16(UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
+static void copy_sb8_16(AOM_UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
                         const uint8_t *src, int src_voffset, int src_hoffset,
                         int sstride, int vsize, int hsize) {
-#if CONFIG_HIGHBITDEPTH
   if (cm->use_highbitdepth) {
     const uint16_t *base =
         &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
     copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
   } else {
-#endif
     const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
     copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
-#if CONFIG_HIGHBITDEPTH
   }
-#endif
 }
 
 static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h,
                              uint16_t x) {
-  int i, j;
-  for (i = 0; i < v; i++) {
-    for (j = 0; j < h; j++) {
+  for (int i = 0; i < v; i++) {
+    for (int j = 0; j < h; j++) {
       dst[i * dstride + j] = x;
     }
   }
@@ -146,9 +131,8 @@ static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h,
 
 static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src,
                              int sstride, int v, int h) {
-  int i, j;
-  for (i = 0; i < v; i++) {
-    for (j = 0; j < h; j++) {
+  for (int i = 0; i < v; i++) {
+    for (int j = 0; j < h; j++) {
       dst[i * dstride + j] = src[i * sstride + j];
     }
   }
@@ -156,9 +140,8 @@ static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src,
 
 void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                     MACROBLOCKD *xd) {
-  int fbr, fbc;
-  int nhfb, nvfb;
-  uint16_t src[CDEF_INBUF_SIZE];
+  const int num_planes = av1_num_planes(cm);
+  DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]);
   uint16_t *linebuf[3];
   uint16_t *colbuf[3];
   cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
@@ -166,48 +149,42 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
   int cdef_count;
   int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
   int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
-  int stride;
   int mi_wide_l2[3];
   int mi_high_l2[3];
   int xdec[3];
   int ydec[3];
-  int pli;
-  int cdef_left;
   int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
-  int nplanes = MAX_MB_PLANE;
-  int chroma_cdef = xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
-                    xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
-  nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0);
+  const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
+                       num_planes);
   row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
   memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2);
   prev_row_cdef = row_cdef + 1;
   curr_row_cdef = prev_row_cdef + nhfb + 2;
-  for (pli = 0; pli < nplanes; pli++) {
+  for (int pli = 0; pli < num_planes; pli++) {
     xdec[pli] = xd->plane[pli].subsampling_x;
     ydec[pli] = xd->plane[pli].subsampling_y;
     mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
     mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
-    if (xdec[pli] != ydec[pli]) nplanes = 1;
   }
-  stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
-  for (pli = 0; pli < nplanes; pli++) {
+  const int stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
+  for (int pli = 0; pli < num_planes; pli++) {
     linebuf[pli] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride);
     colbuf[pli] =
         aom_malloc(sizeof(*colbuf) *
                    ((CDEF_BLOCKSIZE << mi_high_l2[pli]) + 2 * CDEF_VBORDER) *
                    CDEF_HBORDER);
   }
-  for (fbr = 0; fbr < nvfb; fbr++) {
-    for (pli = 0; pli < nplanes; pli++) {
+  for (int fbr = 0; fbr < nvfb; fbr++) {
+    for (int pli = 0; pli < num_planes; pli++) {
       const int block_height =
           (MI_SIZE_64X64 << mi_high_l2[pli]) + 2 * CDEF_VBORDER;
       fill_rect(colbuf[pli], CDEF_HBORDER, block_height, CDEF_HBORDER,
                 CDEF_VERY_LARGE);
     }
-    cdef_left = 1;
-    for (fbc = 0; fbc < nhfb; fbc++) {
+    int cdef_left = 1;
+    for (int fbc = 0; fbc < nhfb; fbc++) {
       int level, sec_strength;
       int uv_level, uv_sec_strength;
       int nhb, nvb;
@@ -217,38 +194,43 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                               MI_SIZE_64X64 * fbc] == NULL ||
           cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
                               MI_SIZE_64X64 * fbc]
-                  ->mbmi.cdef_strength == -1) {
+                  ->cdef_strength == -1) {
         cdef_left = 0;
         continue;
       }
       if (!cdef_left) cstart = -CDEF_HBORDER;
       nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
       nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
-      int tile_top, tile_left, tile_bottom, tile_right;
-      int mi_idx = MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc;
-      MODE_INFO *const mi_tl = cm->mi + mi_idx;
-      BOUNDARY_TYPE boundary_tl = mi_tl->mbmi.boundary_info;
-      tile_top = boundary_tl & TILE_ABOVE_BOUNDARY;
-      tile_left = boundary_tl & TILE_LEFT_BOUNDARY;
+      int frame_top, frame_left, frame_bottom, frame_right;
+
+      int mi_row = MI_SIZE_64X64 * fbr;
+      int mi_col = MI_SIZE_64X64 * fbc;
+      // for the current filter block, it's top left corner mi structure (mi_tl)
+      // is first accessed to check whether the top and left boundaries are
+      // frame boundaries. Then bottom-left and top-right mi structures are
+      // accessed to check whether the bottom and right boundaries
+      // (respectively) are frame boundaries.
+      //
+      // Note that we can't just check the bottom-right mi structure - eg. if
+      // we're at the right-hand edge of the frame but not the bottom, then
+      // the bottom-right mi is NULL but the bottom-left is not.
+      frame_top = (mi_row == 0) ? 1 : 0;
+      frame_left = (mi_col == 0) ? 1 : 0;
 
-      if (fbr != nvfb - 1 &&
-          (&cm->mi[mi_idx + (MI_SIZE_64X64 - 1) * cm->mi_stride]))
-        tile_bottom = cm->mi[mi_idx + (MI_SIZE_64X64 - 1) * cm->mi_stride]
-                          .mbmi.boundary_info &
-                      TILE_BOTTOM_BOUNDARY;
+      if (fbr != nvfb - 1)
+        frame_bottom = (mi_row + MI_SIZE_64X64 == cm->mi_rows) ? 1 : 0;
       else
-        tile_bottom = 1;
+        frame_bottom = 1;
 
-      if (fbc != nhfb - 1 && (&cm->mi[mi_idx + MI_SIZE_64X64 - 1]))
-        tile_right = cm->mi[mi_idx + MI_SIZE_64X64 - 1].mbmi.boundary_info &
-                     TILE_RIGHT_BOUNDARY;
+      if (fbc != nhfb - 1)
+        frame_right = (mi_col + MI_SIZE_64X64 == cm->mi_cols) ? 1 : 0;
       else
-        tile_right = 1;
+        frame_right = 1;
 
       const int mbmi_cdef_strength =
           cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
                               MI_SIZE_64X64 * fbc]
-              ->mbmi.cdef_strength;
+              ->cdef_strength;
       level = cm->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
       sec_strength =
           cm->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
@@ -259,23 +241,15 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
       uv_sec_strength += uv_sec_strength == 3;
       if ((level == 0 && sec_strength == 0 && uv_level == 0 &&
            uv_sec_strength == 0) ||
-          (cdef_count = sb_compute_cdef_list(
-               cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist,
-#if CONFIG_CDEF_SINGLEPASS
-               (level & 1) || (uv_level & 1))) == 0)
-#else
-                 get_filter_skip(level) || get_filter_skip(uv_level))) == 0)
-#endif
-      {
+          (cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64,
+                                             fbc * MI_SIZE_64X64, dlist,
+                                             BLOCK_64X64)) == 0) {
         cdef_left = 0;
         continue;
       }
 
       curr_row_cdef[fbc] = 1;
-      for (pli = 0; pli < nplanes; pli++) {
-#if !CONFIG_CDEF_SINGLEPASS
-        uint16_t dst[CDEF_BLOCKSIZE * CDEF_BLOCKSIZE];
-#endif
+      for (int pli = 0; pli < num_planes; pli++) {
         int coffset;
         int rend, cend;
         int pri_damping = cm->cdef_pri_damping;
@@ -284,10 +258,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
         int vsize = nvb << mi_high_l2[pli];
 
         if (pli) {
-          if (chroma_cdef)
-            level = uv_level;
-          else
-            level = 0;
+          level = uv_level;
           sec_strength = uv_sec_strength;
         }
 
@@ -375,81 +346,57 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
             (MI_SIZE_64X64 << mi_high_l2[pli]) * (fbr + 1) - CDEF_VBORDER,
             coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize);
 
-        if (tile_top) {
+        if (frame_top) {
           fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER,
                     CDEF_VERY_LARGE);
         }
-        if (tile_left) {
+        if (frame_left) {
           fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
                     CDEF_VERY_LARGE);
         }
-        if (tile_bottom) {
+        if (frame_bottom) {
           fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
                     CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
         }
-        if (tile_right) {
+        if (frame_right) {
           fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
                     vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
         }
-#if CONFIG_HIGHBITDEPTH
+
         if (cm->use_highbitdepth) {
           cdef_filter_fb(
-#if CONFIG_CDEF_SINGLEPASS
               NULL,
-              &CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
-#else
-              (uint8_t *)&CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
-#endif
-                  [xd->plane[pli].dst.stride *
-                       (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
-                   (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
-#if CONFIG_CDEF_SINGLEPASS
+              &CONVERT_TO_SHORTPTR(
+                  xd->plane[pli]
+                      .dst.buf)[xd->plane[pli].dst.stride *
+                                    (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
+                                (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
               xd->plane[pli].dst.stride,
-#else
-              xd->plane[pli].dst.stride, dst,
-#endif
               &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
               ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
-#if CONFIG_CDEF_SINGLEPASS
               sec_strength, pri_damping, sec_damping, coeff_shift);
-#else
-              sec_strength, sec_damping, pri_damping, coeff_shift, 0, 1);
-#endif
         } else {
-#endif
           cdef_filter_fb(
               &xd->plane[pli]
                    .dst.buf[xd->plane[pli].dst.stride *
                                 (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
                             (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
-#if CONFIG_CDEF_SINGLEPASS
               NULL, xd->plane[pli].dst.stride,
-#else
-              xd->plane[pli].dst.stride, dst,
-#endif
               &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
               ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
-#if CONFIG_CDEF_SINGLEPASS
               sec_strength, pri_damping, sec_damping, coeff_shift);
-#else
-              sec_strength, sec_damping, pri_damping, coeff_shift, 0, 0);
-#endif
-
-#if CONFIG_HIGHBITDEPTH
         }
-#endif
       }
       cdef_left = 1;
     }
     {
-      unsigned char *tmp;
-      tmp = prev_row_cdef;
+      unsigned char *tmp = prev_row_cdef;
       prev_row_cdef = curr_row_cdef;
       curr_row_cdef = tmp;
     }
   }
   aom_free(row_cdef);
-  for (pli = 0; pli < nplanes; pli++) {
+  for (int pli = 0; pli < num_planes; pli++) {
     aom_free(linebuf[pli]);
     aom_free(colbuf[pli]);
   }
diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h
index 9de24bf92..092230de9 100644
--- a/third_party/aom/av1/common/cdef.h
+++ b/third_party/aom/av1/common/cdef.h
@@ -11,12 +11,13 @@
 #ifndef AV1_COMMON_CDEF_H_
 #define AV1_COMMON_CDEF_H_
 
-#define CDEF_STRENGTH_BITS 7
+#define CDEF_STRENGTH_BITS 6
 
-#define CDEF_PRI_STRENGTHS 32
+#define CDEF_PRI_STRENGTHS 16
 #define CDEF_SEC_STRENGTHS 4
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 #include "av1/common/cdef_block.h"
@@ -38,7 +39,7 @@ extern "C" {
 
 int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col);
 int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
-                         cdef_list *dlist, int filter_skip);
+                         cdef_list *dlist, BLOCK_SIZE bsize);
 void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd);
 
 void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
diff --git a/third_party/aom/av1/common/cdef_block.c b/third_party/aom/av1/common/cdef_block.c
index aaa32c950..df1de89be 100644
--- a/third_party/aom/av1/common/cdef_block.c
+++ b/third_party/aom/av1/common/cdef_block.c
@@ -12,28 +12,13 @@
 #include <math.h>
 #include <stdlib.h>
 
-#ifdef HAVE_CONFIG_H
-#include "./config.h"
-#endif
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
-#include "./cdef.h"
+#include "av1/common/cdef.h"
 
 /* Generated from gen_filter_tables.c. */
-#if !CONFIG_CDEF_SINGLEPASS || CDEF_FULL
-const int cdef_directions[8][3] = {
-  { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2, -3 * CDEF_BSTRIDE + 3 },
-  { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2, -1 * CDEF_BSTRIDE + 3 },
-  { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2, 0 * CDEF_BSTRIDE + 3 },
-  { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2, 1 * CDEF_BSTRIDE + 3 },
-  { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2, 3 * CDEF_BSTRIDE + 3 },
-  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1, 3 * CDEF_BSTRIDE + 1 },
-  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0, 3 * CDEF_BSTRIDE + 0 },
-  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1, 3 * CDEF_BSTRIDE - 1 }
-};
-#else
-const int cdef_directions[8][2] = {
+DECLARE_ALIGNED(16, const int, cdef_directions[8][2]) = {
   { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
   { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
   { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 },
@@ -43,7 +28,6 @@ const int cdef_directions[8][2] = {
   { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
   { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }
 };
-#endif
 
 /* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
    The search minimizes the weighted variance along all the lines in a
@@ -123,65 +107,38 @@ int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var,
   return best_dir;
 }
 
-#if CONFIG_CDEF_SINGLEPASS
-#if CDEF_FULL
-const int cdef_pri_taps[2][3] = { { 3, 2, 1 }, { 2, 2, 2 } };
-const int cdef_sec_taps[2][2] = { { 3, 1 }, { 3, 1 } };
-#else
 const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
 const int cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
-#endif
 
 /* Smooth in the direction detected. */
-#if CDEF_CAP
-void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
-                         const uint16_t *in, int pri_strength, int sec_strength,
-                         int dir, int pri_damping, int sec_damping, int bsize,
-                         UNUSED int max_unused)
-#else
 void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
                          const uint16_t *in, int pri_strength, int sec_strength,
                          int dir, int pri_damping, int sec_damping, int bsize,
-                         int max)
-#endif
-{
+                         AOM_UNUSED int max_unused, int coeff_shift) {
   int i, j, k;
   const int s = CDEF_BSTRIDE;
-  const int *pri_taps = cdef_pri_taps[pri_strength & 1];
-  const int *sec_taps = cdef_sec_taps[pri_strength & 1];
-  for (i = 0; i < 4 << (bsize == BLOCK_8X8); i++) {
-    for (j = 0; j < 4 << (bsize == BLOCK_8X8); j++) {
+  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+  for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) {
+    for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) {
       int16_t sum = 0;
       int16_t y;
       int16_t x = in[i * s + j];
-#if CDEF_CAP
       int max = x;
       int min = x;
-#endif
-#if CDEF_FULL
-      for (k = 0; k < 3; k++)
-#else
-      for (k = 0; k < 2; k++)
-#endif
-      {
+      for (k = 0; k < 2; k++) {
         int16_t p0 = in[i * s + j + cdef_directions[dir][k]];
         int16_t p1 = in[i * s + j - cdef_directions[dir][k]];
         sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping);
         sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping);
-#if CDEF_CAP
         if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max);
         if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max);
         min = AOMMIN(p0, min);
         min = AOMMIN(p1, min);
-#endif
-#if CDEF_FULL
-        if (k == 2) continue;
-#endif
         int16_t s0 = in[i * s + j + cdef_directions[(dir + 2) & 7][k]];
         int16_t s1 = in[i * s + j - cdef_directions[(dir + 2) & 7][k]];
         int16_t s2 = in[i * s + j + cdef_directions[(dir + 6) & 7][k]];
         int16_t s3 = in[i * s + j - cdef_directions[(dir + 6) & 7][k]];
-#if CDEF_CAP
         if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max);
         if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max);
         if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max);
@@ -190,17 +147,12 @@ void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
         min = AOMMIN(s1, min);
         min = AOMMIN(s2, min);
         min = AOMMIN(s3, min);
-#endif
         sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping);
         sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping);
         sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping);
         sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping);
       }
-#if CDEF_CAP
       y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max);
-#else
-      y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), 0, max);
-#endif
       if (dst8)
         dst8[i * dstride + j] = (uint8_t)y;
       else
@@ -209,67 +161,6 @@ void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
   }
 }
 
-#else
-
-/* Smooth in the direction detected. */
-void cdef_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in,
-                          int threshold, int dir, int damping) {
-  int i;
-  int j;
-  int k;
-  static const int taps[3] = { 3, 2, 1 };
-  for (i = 0; i < 8; i++) {
-    for (j = 0; j < 8; j++) {
-      int16_t sum;
-      int16_t xx;
-      int16_t yy;
-      xx = in[i * CDEF_BSTRIDE + j];
-      sum = 0;
-      for (k = 0; k < 3; k++) {
-        int16_t p0;
-        int16_t p1;
-        p0 = in[i * CDEF_BSTRIDE + j + cdef_directions[dir][k]] - xx;
-        p1 = in[i * CDEF_BSTRIDE + j - cdef_directions[dir][k]] - xx;
-        sum += taps[k] * constrain(p0, threshold, damping);
-        sum += taps[k] * constrain(p1, threshold, damping);
-      }
-      sum = (sum + 8) >> 4;
-      yy = xx + sum;
-      y[i * ystride + j] = yy;
-    }
-  }
-}
-
-/* Smooth in the direction detected. */
-void cdef_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in,
-                          int threshold, int dir, int damping) {
-  int i;
-  int j;
-  int k;
-  static const int taps[2] = { 4, 1 };
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      int16_t sum;
-      int16_t xx;
-      int16_t yy;
-      xx = in[i * CDEF_BSTRIDE + j];
-      sum = 0;
-      for (k = 0; k < 2; k++) {
-        int16_t p0;
-        int16_t p1;
-        p0 = in[i * CDEF_BSTRIDE + j + cdef_directions[dir][k]] - xx;
-        p1 = in[i * CDEF_BSTRIDE + j - cdef_directions[dir][k]] - xx;
-        sum += taps[k] * constrain(p0, threshold, damping);
-        sum += taps[k] * constrain(p1, threshold, damping);
-      }
-      sum = (sum + 8) >> 4;
-      yy = xx + sum;
-      y[i * ystride + j] = yy;
-    }
-  }
-}
-#endif
-
 /* Compute the primary filter strength for an 8x8 block based on the
    directional variance difference. A high variance difference means
    that we have a highly directional pattern (e.g. a high contrast
@@ -282,172 +173,26 @@ static INLINE int adjust_strength(int strength, int32_t var) {
   return var ? (strength * (4 + i) + 8) >> 4 : 0;
 }
 
-#if !CONFIG_CDEF_SINGLEPASS
-void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
-                               int sstride) {
-  int i, j;
-  for (i = 0; i < 8; i++)
-    for (j = 0; j < 8; j++) dst[i * dstride + j] = src[i * sstride + j];
-}
-
-void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
-                               int sstride) {
-  int i, j;
-  for (i = 0; i < 4; i++)
-    for (j = 0; j < 4; j++) dst[i * dstride + j] = src[i * sstride + j];
-}
-
-static void copy_block_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
-                                      cdef_list *dlist, int cdef_count,
-                                      int bsize) {
-  int bi, bx, by;
-
-  if (bsize == BLOCK_8X8) {
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      copy_8x8_16bit_to_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
-                              &src[bi << (3 + 3)], 8);
-    }
-  } else if (bsize == BLOCK_4X8) {
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      copy_4x4_16bit_to_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
-                              &src[bi << (3 + 2)], 4);
-      copy_4x4_16bit_to_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)],
-                              dstride, &src[(bi << (3 + 2)) + 4 * 4], 4);
-    }
-  } else if (bsize == BLOCK_8X4) {
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
-                              &src[bi << (2 + 3)], 8);
-      copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4],
-                              dstride, &src[(bi << (2 + 3)) + 4], 8);
-    }
-  } else {
-    assert(bsize == BLOCK_4X4);
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
-                              &src[bi << (2 + 2)], 4);
-    }
-  }
-}
-
-void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src,
-                              int sstride) {
-  int i, j;
-  for (i = 0; i < 8; i++)
-    for (j = 0; j < 8; j++)
-      dst[i * dstride + j] = (uint8_t)src[i * sstride + j];
-}
-
-void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src,
-                              int sstride) {
-  int i, j;
-  for (i = 0; i < 4; i++)
-    for (j = 0; j < 4; j++)
-      dst[i * dstride + j] = (uint8_t)src[i * sstride + j];
-}
-
-static void copy_block_16bit_to_8bit(uint8_t *dst, int dstride,
-                                     const uint16_t *src, cdef_list *dlist,
-                                     int cdef_count, int bsize) {
-  int bi, bx, by;
-  if (bsize == BLOCK_8X8) {
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      copy_8x8_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
-                             &src[bi << (3 + 3)], 8);
-    }
-  } else if (bsize == BLOCK_4X8) {
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      copy_4x4_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
-                             &src[bi << (3 + 2)], 4);
-      copy_4x4_16bit_to_8bit(&dst[((by << 3) + 4) * dstride + (bx << 2)],
-                             dstride, &src[(bi << (3 + 2)) + 4 * 4], 4);
-    }
-  } else if (bsize == BLOCK_8X4) {
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
-                             &src[bi << (2 + 3)], 8);
-      copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 3) + 4], dstride,
-                             &src[(bi << (2 + 3)) + 4], 8);
-    }
-  } else {
-    assert(bsize == BLOCK_4X4);
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
-                             &src[bi << (2 * 2)], 4);
-    }
-  }
-}
-
-int get_filter_skip(int level) {
-  int filter_skip = level & 1;
-  if (level == 1) filter_skip = 0;
-  return filter_skip;
-}
-
-void cdef_filter_fb(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in,
-                    int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
-                    int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
-                    cdef_list *dlist, int cdef_count, int level,
-                    int sec_strength, int sec_damping, int pri_damping,
-                    int coeff_shift, int skip_dering, int hbd) {
-#else
-
 void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
                     int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
                     int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
                     cdef_list *dlist, int cdef_count, int level,
                     int sec_strength, int pri_damping, int sec_damping,
                     int coeff_shift) {
-#endif
   int bi;
   int bx;
   int by;
   int bsize, bsizex, bsizey;
 
-#if CONFIG_CDEF_SINGLEPASS
-  int pri_strength = (level >> 1) << coeff_shift;
-  int filter_skip = level & 1;
-  if (!pri_strength && !sec_strength && filter_skip) {
-    pri_strength = 19 << coeff_shift;
-    sec_strength = 7 << coeff_shift;
-  }
-#else
-  int threshold = (level >> 1) << coeff_shift;
-  int filter_skip = get_filter_skip(level);
-  if (level == 1) threshold = 31 << coeff_shift;
-
-  cdef_direction_func cdef_direction[] = { cdef_direction_4x4,
-                                           cdef_direction_8x8 };
-#endif
+  int pri_strength = level << coeff_shift;
+  sec_strength <<= coeff_shift;
   sec_damping += coeff_shift - (pli != AOM_PLANE_Y);
   pri_damping += coeff_shift - (pli != AOM_PLANE_Y);
   bsize =
       ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
   bsizex = 3 - xdec;
   bsizey = 3 - ydec;
-#if CONFIG_CDEF_SINGLEPASS
-  if (dirinit && pri_strength == 0 && sec_strength == 0)
-#else
-  if (!skip_dering)
-#endif
-  {
-#if CONFIG_CDEF_SINGLEPASS
+  if (dirinit && pri_strength == 0 && sec_strength == 0) {
     // If we're here, both primary and secondary strengths are 0, and
     // we still haven't written anything to y[] yet, so we just copy
     // the input to y[]. This is necessary only for av1_cdef_search()
@@ -455,97 +200,16 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
     for (bi = 0; bi < cdef_count; bi++) {
       by = dlist[bi].by;
       bx = dlist[bi].bx;
-#else
-    if (pli == 0) {
-      if (!dirinit || !*dirinit) {
-        for (bi = 0; bi < cdef_count; bi++) {
-          by = dlist[bi].by;
-          bx = dlist[bi].bx;
-          dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx],
-                                      CDEF_BSTRIDE, &var[by][bx], coeff_shift);
-        }
-        if (dirinit) *dirinit = 1;
-      }
-    }
-    // Only run dering for non-zero threshold (which is always the case for
-    // 4:2:2 or 4:4:0). If we don't dering, we still need to eventually write
-    // something out in y[] later.
-    if (threshold != 0) {
-      assert(bsize == BLOCK_8X8 || bsize == BLOCK_4X4);
-      for (bi = 0; bi < cdef_count; bi++) {
-        int t = !filter_skip && dlist[bi].skip ? 0 : threshold;
-        by = dlist[bi].by;
-        bx = dlist[bi].bx;
-        (cdef_direction[bsize == BLOCK_8X8])(
-            &y[bi << (bsizex + bsizey)], 1 << bsizex,
-            &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
-            pli ? t : adjust_strength(t, var[by][bx]), dir[by][bx],
-            pri_damping);
-      }
-    }
-  }
-
-  if (sec_strength) {
-    if (threshold && !skip_dering)
-      copy_block_16bit_to_16bit(in, CDEF_BSTRIDE, y, dlist, cdef_count, bsize);
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      int py = by << bsizey;
-      int px = bx << bsizex;
-
-      if (!filter_skip && dlist[bi].skip) continue;
-      if (!dst || hbd) {
-        // 16 bit destination if high bitdepth or 8 bit destination not given
-        (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd
-                                                        : aom_clpf_hblock_hbd)(
-            dst ? (uint16_t *)dst + py * dstride + px
-                : &y[bi << (bsizex + bsizey)],
-            in + py * CDEF_BSTRIDE + px, dst && hbd ? dstride : 1 << bsizex,
-            CDEF_BSTRIDE, 1 << bsizex, 1 << bsizey, sec_strength << coeff_shift,
-            sec_damping);
-      } else {
-        // Do clpf and write the result to an 8 bit destination
-        (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block
-                                                        : aom_clpf_hblock)(
-            dst + py * dstride + px, in + py * CDEF_BSTRIDE + px, dstride,
-            CDEF_BSTRIDE, 1 << bsizex, 1 << bsizey, sec_strength << coeff_shift,
-            sec_damping);
-      }
-    }
-  } else if (threshold != 0) {
-    // No clpf, so copy instead
-    if (hbd) {
-      copy_block_16bit_to_16bit((uint16_t *)dst, dstride, y, dlist, cdef_count,
-                                bsize);
-    } else {
-      copy_block_16bit_to_8bit(dst, dstride, y, dlist, cdef_count, bsize);
-    }
-  } else if (dirinit) {
-    // If we're here, both dering and clpf are off, and we still haven't written
-    // anything to y[] yet, so we just copy the input to y[]. This is necessary
-    // only for av1_cdef_search() and only av1_cdef_search() sets dirinit.
-    for (bi = 0; bi < cdef_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-#endif
       int iy, ix;
       // TODO(stemidts/jmvalin): SIMD optimisations
       for (iy = 0; iy < 1 << bsizey; iy++)
         for (ix = 0; ix < 1 << bsizex; ix++)
-#if CONFIG_CDEF_SINGLEPASS
           dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
-#else
-          y[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
-#endif
               in[((by << bsizey) + iy) * CDEF_BSTRIDE + (bx << bsizex) + ix];
     }
-#if CONFIG_CDEF_SINGLEPASS
     return;
-#endif
   }
 
-#if CONFIG_CDEF_SINGLEPASS
   if (pli == 0) {
     if (!dirinit || !*dirinit) {
       for (bi = 0; bi < cdef_count; bi++) {
@@ -557,19 +221,28 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
       if (dirinit) *dirinit = 1;
     }
   }
+  if (pli == 1 && xdec != ydec) {
+    for (bi = 0; bi < cdef_count; bi++) {
+      static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 };
+      static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 };
+      by = dlist[bi].by;
+      bx = dlist[bi].bx;
+      dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
+    }
+  }
 
-  assert(bsize == BLOCK_8X8 || bsize == BLOCK_4X4);
   for (bi = 0; bi < cdef_count; bi++) {
-    int t = !filter_skip && dlist[bi].skip ? 0 : pri_strength;
-    int s = !filter_skip && dlist[bi].skip ? 0 : sec_strength;
+    int t = dlist[bi].skip ? 0 : pri_strength;
+    int s = dlist[bi].skip ? 0 : sec_strength;
     by = dlist[bi].by;
     bx = dlist[bi].bx;
     if (dst8)
-      cdef_filter_block(
-          &dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL, dstride,
-          &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
-          (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
-          pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1);
+      cdef_filter_block(&dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL,
+                        dstride,
+                        &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+                        (pli ? t : adjust_strength(t, var[by][bx])), s,
+                        t ? dir[by][bx] : 0, pri_damping, sec_damping, bsize,
+                        (256 << coeff_shift) - 1, coeff_shift);
     else
       cdef_filter_block(
           NULL,
@@ -578,7 +251,7 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
           dirinit ? 1 << bsizex : dstride,
           &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
           (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
-          pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1);
+          pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1,
+          coeff_shift);
   }
-#endif
 }
diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h
index bf277faad..81c6da077 100644
--- a/third_party/aom/av1/common/cdef_block.h
+++ b/third_party/aom/av1/common/cdef_block.h
@@ -12,43 +12,28 @@
 #if !defined(_CDEF_BLOCK_H)
 #define _CDEF_BLOCK_H (1)
 
-#include "./odintrin.h"
+#include "av1/common/odintrin.h"
 
 #define CDEF_BLOCKSIZE 64
 #define CDEF_BLOCKSIZE_LOG2 6
-#define CDEF_NBLOCKS (CDEF_BLOCKSIZE / 8)
-#if CONFIG_CDEF_SINGLEPASS
+#define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8)
 #define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2)
-#endif
 
 /* We need to buffer three vertical lines. */
 #define CDEF_VBORDER (3)
 /* We only need to buffer three horizontal pixels too, but let's align to
    16 bytes (8 x 16 bits) to make vectorization easier. */
 #define CDEF_HBORDER (8)
-#define CDEF_BSTRIDE ALIGN_POWER_OF_TWO(CDEF_BLOCKSIZE + 2 * CDEF_HBORDER, 3)
+#define CDEF_BSTRIDE \
+  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
 
 #define CDEF_VERY_LARGE (30000)
-#define CDEF_INBUF_SIZE (CDEF_BSTRIDE * (CDEF_BLOCKSIZE + 2 * CDEF_VBORDER))
-
-#if CONFIG_CDEF_SINGLEPASS
-// Filter configuration
-#define CDEF_CAP 1   // 1 = Cap change to largest diff
-#define CDEF_FULL 0  // 1 = 7x7 filter, 0 = 5x5 filter
+#define CDEF_INBUF_SIZE \
+  (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
 
-#if CDEF_FULL
-extern const int cdef_pri_taps[2][3];
-extern const int cdef_sec_taps[2][2];
-extern const int cdef_directions[8][3];
-#else
 extern const int cdef_pri_taps[2][2];
 extern const int cdef_sec_taps[2][2];
-extern const int cdef_directions[8][2];
-#endif
-
-#else  // CONFIG_CDEF_SINGLEPASS
-extern const int cdef_directions[8][3];
-#endif
+DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]);
 
 typedef struct {
   uint8_t by;
@@ -56,35 +41,19 @@ typedef struct {
   uint8_t skip;
 } cdef_list;
 
-#if CONFIG_CDEF_SINGLEPASS
 typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16,
                                        int dstride, const uint16_t *in,
                                        int pri_strength, int sec_strength,
                                        int dir, int pri_damping,
-                                       int sec_damping, int bsize, int max);
+                                       int sec_damping, int bsize, int max,
+                                       int coeff_shift);
 void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
                               cdef_list *dlist, int cdef_count, int bsize);
-#else
-typedef void (*cdef_direction_func)(uint16_t *y, int ystride,
-                                    const uint16_t *in, int threshold, int dir,
-                                    int damping);
 
-int get_filter_skip(int level);
-#endif
-
-#if CONFIG_CDEF_SINGLEPASS
 void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
                     int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
                     int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
                     cdef_list *dlist, int cdef_count, int level,
                     int sec_strength, int pri_damping, int sec_damping,
                     int coeff_shift);
-#else
-void cdef_filter_fb(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in,
-                    int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
-                    int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
-                    cdef_list *dlist, int cdef_count, int level,
-                    int sec_strength, int sec_damping, int pri_damping,
-                    int coeff_shift, int skip_dering, int hbd);
-#endif
 #endif
diff --git a/third_party/aom/av1/common/cdef_block_avx2.c b/third_party/aom/av1/common/cdef_block_avx2.c
index 5e48045c0..e2b85b3e2 100644
--- a/third_party/aom/av1/common/cdef_block_avx2.c
+++ b/third_party/aom/av1/common/cdef_block_avx2.c
@@ -11,4 +11,4 @@
 
 #include "aom_dsp/aom_simd.h"
 #define SIMD_FUNC(name) name##_avx2
-#include "./cdef_block_simd.h"
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_neon.c b/third_party/aom/av1/common/cdef_block_neon.c
index 030b32531..2d6bc65e3 100644
--- a/third_party/aom/av1/common/cdef_block_neon.c
+++ b/third_party/aom/av1/common/cdef_block_neon.c
@@ -11,4 +11,4 @@
 
 #include "aom_dsp/aom_simd.h"
 #define SIMD_FUNC(name) name##_neon
-#include "./cdef_block_simd.h"
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h
index aa7d3c3ca..d24a7c0fa 100644
--- a/third_party/aom/av1/common/cdef_block_simd.h
+++ b/third_party/aom/av1/common/cdef_block_simd.h
@@ -9,8 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./av1_rtcd.h"
-#include "./cdef_block.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cdef_block.h"
 
 /* partial A is a 16-bit vector of the form:
    [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
@@ -167,39 +168,22 @@ int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
         v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
   }
 
-#if defined(__SSE4_1__)
   /* Compute "mostly vertical" directions. */
-  __m128i dir47 = compute_directions(lines, cost + 4);
+  v128 dir47 = compute_directions(lines, cost + 4);
 
   array_reverse_transpose_8x8(lines, lines);
 
   /* Compute "mostly horizontal" directions. */
-  __m128i dir03 = compute_directions(lines, cost);
-
-  __m128i max = _mm_max_epi32(dir03, dir47);
-  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
-  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
-  best_cost = _mm_cvtsi128_si32(max);
-  __m128i t =
-      _mm_packs_epi32(_mm_cmpeq_epi32(max, dir03), _mm_cmpeq_epi32(max, dir47));
-  best_dir = _mm_movemask_epi8(_mm_packs_epi16(t, t));
+  v128 dir03 = compute_directions(lines, cost);
+
+  v128 max = v128_max_s32(dir03, dir47);
+  max = v128_max_s32(max, v128_align(max, max, 8));
+  max = v128_max_s32(max, v128_align(max, max, 4));
+  best_cost = v128_low_u32(max);
+  v128 t =
+      v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03));
+  best_dir = v128_movemask_8(v128_pack_s16_s8(t, t));
   best_dir = get_msb(best_dir ^ (best_dir - 1));  // Count trailing zeros
-#else
-  /* Compute "mostly vertical" directions. */
-  compute_directions(lines, cost + 4);
-
-  array_reverse_transpose_8x8(lines, lines);
-
-  /* Compute "mostly horizontal" directions. */
-  compute_directions(lines, cost);
-
-  for (i = 0; i < 8; i++) {
-    if (cost[i] > best_cost) {
-      best_cost = cost[i];
-      best_dir = i;
-    }
-  }
-#endif
 
   /* Difference between the optimal variance and the variance along the
      orthogonal direction. Again, the sum(x^2) terms cancel out. */
@@ -211,17 +195,16 @@ int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
 }
 
 // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
-SIMD_INLINE v128 constrain16(v128 a, v128 b, unsigned int threshold,
+SIMD_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
                              unsigned int adjdamp) {
-  v128 diff = v128_sub_16(a, b);
-  const v128 sign = v128_shr_n_s16(diff, 15);
-  diff = v128_abs_s16(diff);
-  const v128 s =
-      v128_ssub_u16(v128_dup_16(threshold), v128_shr_u16(diff, adjdamp));
-  return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);
+  v256 diff = v256_sub_16(a, b);
+  const v256 sign = v256_shr_n_s16(diff, 15);
+  diff = v256_abs_s16(diff);
+  const v256 s =
+      v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp));
+  return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign);
 }
 
-#if CONFIG_CDEF_SINGLEPASS
 // sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
 SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
                            unsigned int adjdamp) {
@@ -236,37 +219,24 @@ SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
       sign);
 }
 
-#if CDEF_CAP
-void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
-                                        const uint16_t *in, int pri_strength,
-                                        int sec_strength, int dir,
-                                        int pri_damping, int sec_damping,
-                                        UNUSED int max_unused)
-#else
 void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
                                         const uint16_t *in, int pri_strength,
                                         int sec_strength, int dir,
                                         int pri_damping, int sec_damping,
-                                        int max)
-#endif
-{
+                                        AOM_UNUSED int max_unused,
+                                        int coeff_shift) {
   v128 p0, p1, p2, p3;
   v256 sum, row, tap, res;
-#if CDEF_CAP
   v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
-#endif
   int po1 = cdef_directions[dir][0];
   int po2 = cdef_directions[dir][1];
-#if CDEF_FULL
-  int po3 = cdef_directions[dir][2];
-#endif
   int s1o1 = cdef_directions[(dir + 2) & 7][0];
   int s1o2 = cdef_directions[(dir + 2) & 7][1];
   int s2o1 = cdef_directions[(dir + 6) & 7][0];
   int s2o2 = cdef_directions[(dir + 6) & 7][1];
 
-  const int *pri_taps = cdef_pri_taps[pri_strength & 1];
-  const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
 
   if (pri_strength)
     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
@@ -278,9 +248,7 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
                       v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
                       v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
                       v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
-#if CDEF_CAP
   max = min = row;
-#endif
 
   if (pri_strength) {
     // Primary near taps
@@ -288,19 +256,15 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p0 = constrain(tap, row, pri_strength, pri_damping);
     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p1 = constrain(tap, row, pri_strength, pri_damping);
 
     // sum += pri_taps[0] * (p0 + p1)
@@ -313,52 +277,21 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p0 = constrain(tap, row, pri_strength, pri_damping);
     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p1 = constrain(tap, row, pri_strength, pri_damping);
 
     // sum += pri_taps[1] * (p0 + p1)
     sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
                                          v256_from_v128(v128_ziphi_8(p0, p1),
                                                         v128_ziplo_8(p0, p1))));
-
-#if CDEF_FULL
-    // Primary extra taps
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po3]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po3]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po3]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po3]));
-#if CDEF_CAP
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-#endif
-    p0 = constrain(tap, row, pri_strength, pri_damping);
-    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po3]),
-                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po3]),
-                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po3]),
-                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po3]));
-#if CDEF_CAP
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-#endif
-    p1 = constrain(tap, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[2] * (p0 + p1)
-    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[2]),
-                                         v256_from_v128(v128_ziphi_8(p0, p1),
-                                                        v128_ziplo_8(p0, p1))));
-#endif
   }
 
   if (sec_strength) {
@@ -367,37 +300,29 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p0 = constrain(tap, row, sec_strength, sec_damping);
     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p1 = constrain(tap, row, sec_strength, sec_damping);
     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p2 = constrain(tap, row, sec_strength, sec_damping);
     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p3 = constrain(tap, row, sec_strength, sec_damping);
 
     // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
@@ -412,37 +337,29 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p0 = constrain(tap, row, sec_strength, sec_damping);
     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p1 = constrain(tap, row, sec_strength, sec_damping);
     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p2 = constrain(tap, row, sec_strength, sec_damping);
     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p3 = constrain(tap, row, sec_strength, sec_damping);
 
     // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
@@ -459,11 +376,7 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
   res = v256_add_16(sum, v256_dup_16(8));
   res = v256_shr_n_s16(res, 4);
   res = v256_add_16(row, res);
-#if CDEF_CAP
   res = v256_min_s16(v256_max_s16(res, min), max);
-#else
-  res = v256_min_s16(v256_max_s16(res, v256_zero()), v256_dup_16(max));
-#endif
   res = v256_pack_s16_u8(res, res);
 
   p0 = v256_low_v128(res);
@@ -473,38 +386,25 @@ void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
   u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0)));
 }
 
-#if CDEF_CAP
 void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
                                         const uint16_t *in, int pri_strength,
                                         int sec_strength, int dir,
                                         int pri_damping, int sec_damping,
-                                        UNUSED int max_unused)
-#else
-void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
-                                        const uint16_t *in, int pri_strength,
-                                        int sec_strength, int dir,
-                                        int pri_damping, int sec_damping,
-                                        int max)
-#endif
-{
+                                        AOM_UNUSED int max_unused,
+                                        int coeff_shift) {
   int i;
   v128 p0, p1, p2, p3;
   v256 sum, row, res, tap;
-#if CDEF_CAP
   v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
-#endif
   int po1 = cdef_directions[dir][0];
   int po2 = cdef_directions[dir][1];
-#if CDEF_FULL
-  int po3 = cdef_directions[dir][2];
-#endif
   int s1o1 = cdef_directions[(dir + 2) & 7][0];
   int s1o2 = cdef_directions[(dir + 2) & 7][1];
   int s2o1 = cdef_directions[(dir + 6) & 7][0];
   int s2o2 = cdef_directions[(dir + 6) & 7][1];
 
-  const int *pri_taps = cdef_pri_taps[pri_strength & 1];
-  const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
 
   if (pri_strength)
     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
@@ -515,25 +415,19 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
     row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
                          v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
 
-#if CDEF_CAP
     max = min = row;
-#endif
     // Primary near taps
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p0 = constrain(tap, row, pri_strength, pri_damping);
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p1 = constrain(tap, row, pri_strength, pri_damping);
 
     // sum += pri_taps[0] * (p0 + p1)
@@ -545,18 +439,14 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p0 = constrain(tap, row, pri_strength, pri_damping);
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p1 = constrain(tap, row, pri_strength, pri_damping);
 
     // sum += pri_taps[1] * (p0 + p1)
@@ -564,63 +454,30 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
                                          v256_from_v128(v128_ziphi_8(p0, p1),
                                                         v128_ziplo_8(p0, p1))));
 
-#if CDEF_FULL
-    // Primary extra taps
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po3]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po3]));
-#if CDEF_CAP
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-#endif
-    p0 = constrain(tap, row, pri_strength, pri_damping);
-    tap =
-        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po3]),
-                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po3]));
-#if CDEF_CAP
-    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
-    min = v256_min_s16(min, tap);
-#endif
-    p1 = constrain(tap, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[2] * (p0 + p1)
-    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[2]),
-                                         v256_from_v128(v128_ziphi_8(p0, p1),
-                                                        v128_ziplo_8(p0, p1))));
-#endif
-
     // Secondary near taps
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p0 = constrain(tap, row, sec_strength, sec_damping);
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p1 = constrain(tap, row, sec_strength, sec_damping);
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p2 = constrain(tap, row, sec_strength, sec_damping);
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p3 = constrain(tap, row, sec_strength, sec_damping);
 
     // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
@@ -634,34 +491,26 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p0 = constrain(tap, row, sec_strength, sec_damping);
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p1 = constrain(tap, row, sec_strength, sec_damping);
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p2 = constrain(tap, row, sec_strength, sec_damping);
     tap =
         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
-#if CDEF_CAP
     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
     min = v256_min_s16(min, tap);
-#endif
     p3 = constrain(tap, row, sec_strength, sec_damping);
 
     // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
@@ -676,11 +525,7 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
     res = v256_add_16(sum, v256_dup_16(8));
     res = v256_shr_n_s16(res, 4);
     res = v256_add_16(row, res);
-#if CDEF_CAP
     res = v256_min_s16(v256_max_s16(res, min), max);
-#else
-    res = v256_min_s16(v256_max_s16(res, v256_zero()), v256_dup_16(max));
-#endif
     res = v256_pack_s16_u8(res, res);
 
     p0 = v256_low_v128(res);
@@ -689,499 +534,355 @@ void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
   }
 }
 
-#if CDEF_CAP
 void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
                                          const uint16_t *in, int pri_strength,
                                          int sec_strength, int dir,
                                          int pri_damping, int sec_damping,
-                                         UNUSED int max_unused)
-#else
-void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
-                                         const uint16_t *in, int pri_strength,
-                                         int sec_strength, int dir,
-                                         int pri_damping, int sec_damping,
-                                         int max)
-#endif
-{
+                                         AOM_UNUSED int max_unused,
+                                         int coeff_shift) {
   int i;
-  v128 p0, p1, p2, p3, sum, row, res;
-#if CDEF_CAP
-  v128 max, min, large = v128_dup_16(CDEF_VERY_LARGE);
-#endif
+  v256 p0, p1, p2, p3, sum, row, res;
+  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
   int po1 = cdef_directions[dir][0];
   int po2 = cdef_directions[dir][1];
-#if CDEF_FULL
-  int po3 = cdef_directions[dir][2];
-#endif
   int s1o1 = cdef_directions[(dir + 2) & 7][0];
   int s1o2 = cdef_directions[(dir + 2) & 7][1];
   int s2o1 = cdef_directions[(dir + 6) & 7][0];
   int s2o2 = cdef_directions[(dir + 6) & 7][1];
 
-  const int *pri_taps = cdef_pri_taps[pri_strength & 1];
-  const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
 
   if (pri_strength)
     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
   if (sec_strength)
     sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
-  for (i = 0; i < 4; i += 2) {
-    sum = v128_zero();
-    row = v128_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]),
-                        v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
-#if CDEF_CAP
+  for (i = 0; i < 4; i += 4) {
+    sum = v256_zero();
+    row = v256_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]),
+                        v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]),
+                        v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]),
+                        v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE]));
     min = max = row;
-#endif
 
     // Primary near taps
-    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
-    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
-#if CDEF_CAP
+    p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1]));
+    p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1]));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
-                     v128_andn(p1, v128_cmpeq_16(p1, large)));
-    min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
+        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+                     v256_andn(p1, v256_cmpeq_16(p1, large)));
+    min = v256_min_s16(v256_min_s16(min, p0), p1);
     p0 = constrain16(p0, row, pri_strength, pri_damping);
     p1 = constrain16(p1, row, pri_strength, pri_damping);
 
     // sum += pri_taps[0] * (p0 + p1)
-    sum = v128_add_16(
-        sum, v128_mullo_s16(v128_dup_16(pri_taps[0]), v128_add_16(p0, p1)));
+    sum = v256_add_16(
+        sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
 
     // Primary far taps
-    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
-    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
-#if CDEF_CAP
+    p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2]));
+    p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2]));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
-                     v128_andn(p1, v128_cmpeq_16(p1, large)));
-    min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
+        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+                     v256_andn(p1, v256_cmpeq_16(p1, large)));
+    min = v256_min_s16(v256_min_s16(min, p0), p1);
     p0 = constrain16(p0, row, pri_strength, pri_damping);
     p1 = constrain16(p1, row, pri_strength, pri_damping);
 
     // sum += pri_taps[1] * (p0 + p1)
-    sum = v128_add_16(
-        sum, v128_mullo_s16(v128_dup_16(pri_taps[1]), v128_add_16(p0, p1)));
-
-#if CDEF_FULL
-    // Primary extra taps
-    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po3]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po3]));
-    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po3]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po3]));
-#if CDEF_CAP
-    max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
-                     v128_andn(p1, v128_cmpeq_16(p1, large)));
-    min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
-    p0 = constrain16(p0, row, pri_strength, pri_damping);
-    p1 = constrain16(p1, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[2] * (p0 + p1)
-    sum = v128_add_16(
-        sum, v128_mullo_s16(v128_dup_16(pri_taps[2]), v128_add_16(p0, p1)));
-#endif
+    sum = v256_add_16(
+        sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
 
     // Secondary near taps
-    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
-    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
-    p2 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
-    p3 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
-#if CDEF_CAP
+    p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1]));
+    p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1]));
+    p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1]));
+    p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1]));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
-                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+                     v256_andn(p1, v256_cmpeq_16(p1, large)));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
-                     v128_andn(p3, v128_cmpeq_16(p3, large)));
-    min = v128_min_s16(
-        v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
-#endif
+        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+                     v256_andn(p3, v256_cmpeq_16(p3, large)));
+    min = v256_min_s16(
+        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
     p0 = constrain16(p0, row, sec_strength, sec_damping);
     p1 = constrain16(p1, row, sec_strength, sec_damping);
     p2 = constrain16(p2, row, sec_strength, sec_damping);
     p3 = constrain16(p3, row, sec_strength, sec_damping);
 
     // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
-    sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[0]),
-                                          v128_add_16(v128_add_16(p0, p1),
-                                                      v128_add_16(p2, p3))));
+    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
+                                          v256_add_16(v256_add_16(p0, p1),
+                                                      v256_add_16(p2, p3))));
 
     // Secondary far taps
-    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
-    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
-    p2 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
-    p3 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
-#if CDEF_CAP
+    p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2]));
+    p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2]));
+    p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2]));
+    p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]),
+                       v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]),
+                       v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2]));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
-                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+                     v256_andn(p1, v256_cmpeq_16(p1, large)));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
-                     v128_andn(p3, v128_cmpeq_16(p3, large)));
-    min = v128_min_s16(
-        v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
-#endif
+        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+                     v256_andn(p3, v256_cmpeq_16(p3, large)));
+    min = v256_min_s16(
+        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
     p0 = constrain16(p0, row, sec_strength, sec_damping);
     p1 = constrain16(p1, row, sec_strength, sec_damping);
     p2 = constrain16(p2, row, sec_strength, sec_damping);
     p3 = constrain16(p3, row, sec_strength, sec_damping);
 
     // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
-    sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[1]),
-                                          v128_add_16(v128_add_16(p0, p1),
-                                                      v128_add_16(p2, p3))));
+    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
+                                          v256_add_16(v256_add_16(p0, p1),
+                                                      v256_add_16(p2, p3))));
 
     // res = row + ((sum - (sum < 0) + 8) >> 4)
-    sum = v128_add_16(sum, v128_cmplt_s16(sum, v128_zero()));
-    res = v128_add_16(sum, v128_dup_16(8));
-    res = v128_shr_n_s16(res, 4);
-    res = v128_add_16(row, res);
-#if CDEF_CAP
-    res = v128_min_s16(v128_max_s16(res, min), max);
-#else
-    res = v128_min_s16(v128_max_s16(res, v128_zero()), v128_dup_16(max));
-#endif
-    v64_store_aligned(&dst[i * dstride], v128_high_v64(res));
-    v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(res));
+    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+    res = v256_add_16(sum, v256_dup_16(8));
+    res = v256_shr_n_s16(res, 4);
+    res = v256_add_16(row, res);
+    res = v256_min_s16(v256_max_s16(res, min), max);
+
+    v64_store_aligned(&dst[i * dstride], v128_high_v64(v256_high_v128(res)));
+    v64_store_aligned(&dst[(i + 1) * dstride],
+                      v128_low_v64(v256_high_v128(res)));
+    v64_store_aligned(&dst[(i + 2) * dstride],
+                      v128_high_v64(v256_low_v128(res)));
+    v64_store_aligned(&dst[(i + 3) * dstride],
+                      v128_low_v64(v256_low_v128(res)));
   }
 }
 
-#if CDEF_CAP
 void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
                                          const uint16_t *in, int pri_strength,
                                          int sec_strength, int dir,
                                          int pri_damping, int sec_damping,
-                                         UNUSED int max_unused)
-#else
-void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
-                                         const uint16_t *in, int pri_strength,
-                                         int sec_strength, int dir,
-                                         int pri_damping, int sec_damping,
-                                         int max)
-#endif
-{
+                                         AOM_UNUSED int max_unused,
+                                         int coeff_shift) {
   int i;
-  v128 sum, p0, p1, p2, p3, row, res;
-#if CDEF_CAP
-  v128 max, min, large = v128_dup_16(CDEF_VERY_LARGE);
-#endif
+  v256 sum, p0, p1, p2, p3, row, res;
+  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
   int po1 = cdef_directions[dir][0];
   int po2 = cdef_directions[dir][1];
-#if CDEF_FULL
-  int po3 = cdef_directions[dir][2];
-#endif
   int s1o1 = cdef_directions[(dir + 2) & 7][0];
   int s1o2 = cdef_directions[(dir + 2) & 7][1];
   int s2o1 = cdef_directions[(dir + 6) & 7][0];
   int s2o2 = cdef_directions[(dir + 6) & 7][1];
 
-  const int *pri_taps = cdef_pri_taps[pri_strength & 1];
-  const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
 
   if (pri_strength)
     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
   if (sec_strength)
     sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
 
-  for (i = 0; i < 8; i++) {
-    sum = v128_zero();
-    row = v128_load_aligned(&in[i * CDEF_BSTRIDE]);
+  for (i = 0; i < 8; i += 2) {
+    sum = v256_zero();
+    row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
+                         v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
 
-#if CDEF_CAP
     min = max = row;
-#endif
     // Primary near taps
-    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]);
-    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]);
-#if CDEF_CAP
+    p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
+    p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
-                     v128_andn(p1, v128_cmpeq_16(p1, large)));
-    min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
+        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+                     v256_andn(p1, v256_cmpeq_16(p1, large)));
+    min = v256_min_s16(v256_min_s16(min, p0), p1);
     p0 = constrain16(p0, row, pri_strength, pri_damping);
     p1 = constrain16(p1, row, pri_strength, pri_damping);
 
     // sum += pri_taps[0] * (p0 + p1)
-    sum = v128_add_16(
-        sum, v128_mullo_s16(v128_dup_16(pri_taps[0]), v128_add_16(p0, p1)));
+    sum = v256_add_16(
+        sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
 
     // Primary far taps
-    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]);
-    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]);
-#if CDEF_CAP
+    p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
+    p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
-                     v128_andn(p1, v128_cmpeq_16(p1, large)));
-    min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
+        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+                     v256_andn(p1, v256_cmpeq_16(p1, large)));
+    min = v256_min_s16(v256_min_s16(min, p0), p1);
     p0 = constrain16(p0, row, pri_strength, pri_damping);
     p1 = constrain16(p1, row, pri_strength, pri_damping);
 
     // sum += pri_taps[1] * (p0 + p1)
-    sum = v128_add_16(
-        sum, v128_mullo_s16(v128_dup_16(pri_taps[1]), v128_add_16(p0, p1)));
-
-#if CDEF_FULL
-    // Primary extra taps
-    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po3]);
-    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po3]);
-#if CDEF_CAP
-    max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
-                     v128_andn(p1, v128_cmpeq_16(p1, large)));
-    min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
-    p0 = constrain16(p0, row, pri_strength, pri_damping);
-    p1 = constrain16(p1, row, pri_strength, pri_damping);
-
-    // sum += pri_taps[2] * (p0 + p1)
-    sum = v128_add_16(
-        sum, v128_mullo_s16(v128_dup_16(pri_taps[2]), v128_add_16(p0, p1)));
-#endif
+    sum = v256_add_16(
+        sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
 
     // Secondary near taps
-    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]);
-    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]);
-    p2 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]);
-    p3 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]);
-#if CDEF_CAP
+    p0 =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
+    p1 =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
+    p2 =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
+    p3 =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
-                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+                     v256_andn(p1, v256_cmpeq_16(p1, large)));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
-                     v128_andn(p3, v128_cmpeq_16(p3, large)));
-    min = v128_min_s16(
-        v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
-#endif
+        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+                     v256_andn(p3, v256_cmpeq_16(p3, large)));
+    min = v256_min_s16(
+        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
     p0 = constrain16(p0, row, sec_strength, sec_damping);
     p1 = constrain16(p1, row, sec_strength, sec_damping);
     p2 = constrain16(p2, row, sec_strength, sec_damping);
     p3 = constrain16(p3, row, sec_strength, sec_damping);
 
     // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
-    sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[0]),
-                                          v128_add_16(v128_add_16(p0, p1),
-                                                      v128_add_16(p2, p3))));
+    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
+                                          v256_add_16(v256_add_16(p0, p1),
+                                                      v256_add_16(p2, p3))));
 
     // Secondary far taps
-    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]);
-    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]);
-    p2 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]);
-    p3 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]);
-#if CDEF_CAP
+    p0 =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
+    p1 =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
+    p2 =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
+    p3 =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
-                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+                     v256_andn(p1, v256_cmpeq_16(p1, large)));
     max =
-        v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
-                     v128_andn(p3, v128_cmpeq_16(p3, large)));
-    min = v128_min_s16(
-        v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
-#endif
+        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+                     v256_andn(p3, v256_cmpeq_16(p3, large)));
+    min = v256_min_s16(
+        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
     p0 = constrain16(p0, row, sec_strength, sec_damping);
     p1 = constrain16(p1, row, sec_strength, sec_damping);
     p2 = constrain16(p2, row, sec_strength, sec_damping);
     p3 = constrain16(p3, row, sec_strength, sec_damping);
 
     // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
-    sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[1]),
-                                          v128_add_16(v128_add_16(p0, p1),
-                                                      v128_add_16(p2, p3))));
+    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
+                                          v256_add_16(v256_add_16(p0, p1),
+                                                      v256_add_16(p2, p3))));
 
     // res = row + ((sum - (sum < 0) + 8) >> 4)
-    sum = v128_add_16(sum, v128_cmplt_s16(sum, v128_zero()));
-    res = v128_add_16(sum, v128_dup_16(8));
-    res = v128_shr_n_s16(res, 4);
-    res = v128_add_16(row, res);
-#if CDEF_CAP
-    res = v128_min_s16(v128_max_s16(res, min), max);
-#else
-    res = v128_min_s16(v128_max_s16(res, v128_zero()), v128_dup_16(max));
-#endif
-    v128_store_unaligned(&dst[i * dstride], res);
+    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+    res = v256_add_16(sum, v256_dup_16(8));
+    res = v256_shr_n_s16(res, 4);
+    res = v256_add_16(row, res);
+    res = v256_min_s16(v256_max_s16(res, min), max);
+    v128_store_unaligned(&dst[i * dstride], v256_high_v128(res));
+    v128_store_unaligned(&dst[(i + 1) * dstride], v256_low_v128(res));
   }
 }
 
 void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride,
                                   const uint16_t *in, int pri_strength,
                                   int sec_strength, int dir, int pri_damping,
-                                  int sec_damping, int bsize, int max) {
-  if (dst8)
-    (bsize == BLOCK_8X8 ? SIMD_FUNC(cdef_filter_block_8x8_8)
-                        : SIMD_FUNC(cdef_filter_block_4x4_8))(
-        dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
-        sec_damping, max);
-  else
-    (bsize == BLOCK_8X8 ? SIMD_FUNC(cdef_filter_block_8x8_16)
-                        : SIMD_FUNC(cdef_filter_block_4x4_16))(
-        dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
-        sec_damping, max);
-}
-
-#else
-
-void SIMD_FUNC(cdef_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in,
-                                   int threshold, int dir, int damping) {
-  int i;
-  v128 p0, p1, sum, row, res;
-  int o1 = cdef_directions[dir][0];
-  int o2 = cdef_directions[dir][1];
-
-  if (threshold) damping -= get_msb(threshold);
-  for (i = 0; i < 4; i += 2) {
-    sum = v128_zero();
-    row = v128_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]),
-                        v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
-
-    // p0 = constrain16(in[i*CDEF_BSTRIDE + offset], row, threshold, damping)
-    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + o1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + o1]));
-    p0 = constrain16(p0, row, threshold, damping);
-
-    // p1 = constrain16(in[i*CDEF_BSTRIDE - offset], row, threshold, damping)
-    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - o1]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - o1]));
-    p1 = constrain16(p1, row, threshold, damping);
-
-    // sum += 4 * (p0 + p1)
-    sum = v128_add_16(sum, v128_shl_n_16(v128_add_16(p0, p1), 2));
-
-    // p0 = constrain16(in[i*CDEF_BSTRIDE + offset], row, threshold, damping)
-    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + o2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + o2]));
-    p0 = constrain16(p0, row, threshold, damping);
-
-    // p1 = constrain16(in[i*CDEF_BSTRIDE - offset], row, threshold, damping)
-    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - o2]),
-                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - o2]));
-    p1 = constrain16(p1, row, threshold, damping);
-
-    // sum += 1 * (p0 + p1)
-    sum = v128_add_16(sum, v128_add_16(p0, p1));
-
-    // res = row + ((sum + 8) >> 4)
-    res = v128_add_16(sum, v128_dup_16(8));
-    res = v128_shr_n_s16(res, 4);
-    res = v128_add_16(row, res);
-    v64_store_aligned(&y[i * ystride], v128_high_v64(res));
-    v64_store_aligned(&y[(i + 1) * ystride], v128_low_v64(res));
-  }
-}
-
-void SIMD_FUNC(cdef_direction_8x8)(uint16_t *y, int ystride, const uint16_t *in,
-                                   int threshold, int dir, int damping) {
-  int i;
-  v128 sum, p0, p1, row, res;
-  int o1 = cdef_directions[dir][0];
-  int o2 = cdef_directions[dir][1];
-  int o3 = cdef_directions[dir][2];
-
-  if (threshold) damping -= get_msb(threshold);
-  for (i = 0; i < 8; i++) {
-    sum = v128_zero();
-    row = v128_load_aligned(&in[i * CDEF_BSTRIDE]);
-
-    // p0 = constrain16(in[i*CDEF_BSTRIDE + offset], row, threshold, damping)
-    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + o1]);
-    p0 = constrain16(p0, row, threshold, damping);
-
-    // p1 = constrain16(in[i*CDEF_BSTRIDE - offset], row, threshold, damping)
-    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - o1]);
-    p1 = constrain16(p1, row, threshold, damping);
-
-    // sum += 3 * (p0 + p1)
-    p0 = v128_add_16(p0, p1);
-    p0 = v128_add_16(p0, v128_shl_n_16(p0, 1));
-    sum = v128_add_16(sum, p0);
-
-    // p0 = constrain16(in[i*CDEF_BSTRIDE + offset], row, threshold, damping)
-    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + o2]);
-    p0 = constrain16(p0, row, threshold, damping);
-
-    // p1 = constrain16(in[i*CDEF_BSTRIDE - offset], row, threshold, damping)
-    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - o2]);
-    p1 = constrain16(p1, row, threshold, damping);
-
-    // sum += 2 * (p0 + p1)
-    p0 = v128_shl_n_16(v128_add_16(p0, p1), 1);
-    sum = v128_add_16(sum, p0);
-
-    // p0 = constrain16(in[i*CDEF_BSTRIDE + offset], row, threshold, damping)
-    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + o3]);
-    p0 = constrain16(p0, row, threshold, damping);
-
-    // p1 = constrain16(in[i*CDEF_BSTRIDE - offset], row, threshold, damping)
-    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - o3]);
-    p1 = constrain16(p1, row, threshold, damping);
-
-    // sum += (p0 + p1)
-    p0 = v128_add_16(p0, p1);
-    sum = v128_add_16(sum, p0);
-
-    // res = row + ((sum + 8) >> 4)
-    res = v128_add_16(sum, v128_dup_16(8));
-    res = v128_shr_n_s16(res, 4);
-    res = v128_add_16(row, res);
-    v128_store_unaligned(&y[i * ystride], res);
-  }
-}
-
-void SIMD_FUNC(copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride,
-                                       const uint16_t *src, int sstride) {
-  int i;
-  for (i = 0; i < 8; i++) {
-    v128 row = v128_load_unaligned(&src[i * sstride]);
-    row = v128_pack_s16_u8(row, row);
-    v64_store_unaligned(&dst[i * dstride], v128_low_v64(row));
-  }
-}
-
-void SIMD_FUNC(copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride,
-                                       const uint16_t *src, int sstride) {
-  int i;
-  for (i = 0; i < 4; i++) {
-    v128 row = v128_load_unaligned(&src[i * sstride]);
-    row = v128_pack_s16_u8(row, row);
-    u32_store_unaligned(&dst[i * dstride], v128_low_u32(row));
-  }
-}
-
-void SIMD_FUNC(copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride,
-                                        const uint16_t *src, int sstride) {
-  int i;
-  for (i = 0; i < 8; i++) {
-    v128 row = v128_load_unaligned(&src[i * sstride]);
-    v128_store_unaligned(&dst[i * dstride], row);
-  }
-}
-
-void SIMD_FUNC(copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride,
-                                        const uint16_t *src, int sstride) {
-  int i;
-  for (i = 0; i < 4; i++) {
-    v64 row = v64_load_unaligned(&src[i * sstride]);
-    v64_store_unaligned(&dst[i * dstride], row);
+                                  int sec_damping, int bsize, int max,
+                                  int coeff_shift) {
+  if (dst8) {
+    if (bsize == BLOCK_8X8) {
+      SIMD_FUNC(cdef_filter_block_8x8_8)
+      (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+       sec_damping, max, coeff_shift);
+    } else if (bsize == BLOCK_4X8) {
+      SIMD_FUNC(cdef_filter_block_4x4_8)
+      (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+       sec_damping, max, coeff_shift);
+      SIMD_FUNC(cdef_filter_block_4x4_8)
+      (dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
+       sec_strength, dir, pri_damping, sec_damping, max, coeff_shift);
+    } else if (bsize == BLOCK_8X4) {
+      SIMD_FUNC(cdef_filter_block_4x4_8)
+      (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+       sec_damping, max, coeff_shift);
+      SIMD_FUNC(cdef_filter_block_4x4_8)
+      (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
+       sec_damping, max, coeff_shift);
+    } else {
+      SIMD_FUNC(cdef_filter_block_4x4_8)
+      (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+       sec_damping, max, coeff_shift);
+    }
+  } else {
+    if (bsize == BLOCK_8X8) {
+      SIMD_FUNC(cdef_filter_block_8x8_16)
+      (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+       sec_damping, max, coeff_shift);
+    } else if (bsize == BLOCK_4X8) {
+      SIMD_FUNC(cdef_filter_block_4x4_16)
+      (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+       sec_damping, max, coeff_shift);
+      SIMD_FUNC(cdef_filter_block_4x4_16)
+      (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
+       sec_strength, dir, pri_damping, sec_damping, max, coeff_shift);
+    } else if (bsize == BLOCK_8X4) {
+      SIMD_FUNC(cdef_filter_block_4x4_16)
+      (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+       sec_damping, max, coeff_shift);
+      SIMD_FUNC(cdef_filter_block_4x4_16)
+      (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
+       sec_damping, max, coeff_shift);
+    } else {
+      assert(bsize == BLOCK_4X4);
+      SIMD_FUNC(cdef_filter_block_4x4_16)
+      (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+       sec_damping, max, coeff_shift);
+    }
   }
 }
-#endif
 
 void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
                                          const uint8_t *src, int sstride, int v,
diff --git a/third_party/aom/av1/common/cdef_block_sse2.c b/third_party/aom/av1/common/cdef_block_sse2.c
index f3de763fa..73f115d17 100644
--- a/third_party/aom/av1/common/cdef_block_sse2.c
+++ b/third_party/aom/av1/common/cdef_block_sse2.c
@@ -11,4 +11,4 @@
 
 #include "aom_dsp/aom_simd.h"
 #define SIMD_FUNC(name) name##_sse2
-#include "./cdef_block_simd.h"
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_sse4.c b/third_party/aom/av1/common/cdef_block_sse4.c
index 27e9ff32e..349329af6 100644
--- a/third_party/aom/av1/common/cdef_block_sse4.c
+++ b/third_party/aom/av1/common/cdef_block_sse4.c
@@ -11,4 +11,4 @@
 
 #include "aom_dsp/aom_simd.h"
 #define SIMD_FUNC(name) name##_sse4_1
-#include "./cdef_block_simd.h"
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_ssse3.c b/third_party/aom/av1/common/cdef_block_ssse3.c
index 863522199..3a93b150f 100644
--- a/third_party/aom/av1/common/cdef_block_ssse3.c
+++ b/third_party/aom/av1/common/cdef_block_ssse3.c
@@ -11,4 +11,4 @@
 
 #include "aom_dsp/aom_simd.h"
 #define SIMD_FUNC(name) name##_ssse3
-#include "./cdef_block_simd.h"
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c
index f9acfcbc9..ee19f0bcf 100644
--- a/third_party/aom/av1/common/cfl.c
+++ b/third_party/aom/av1/common/cfl.c
@@ -13,20 +13,77 @@
 #include "av1/common/common_data.h"
 #include "av1/common/onyxc_int.h"
 
+#include "config/av1_rtcd.h"
+
 void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) {
-  if (!((cm->subsampling_x == 0 && cm->subsampling_y == 0) ||
-        (cm->subsampling_x == 1 && cm->subsampling_y == 1))) {
+  assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
+  assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
+  if (!(cm->subsampling_x == 0 && cm->subsampling_y == 0) &&
+      !(cm->subsampling_x == 1 && cm->subsampling_y == 1) &&
+      !(cm->subsampling_x == 1 && cm->subsampling_y == 0)) {
     aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                       "Only 4:4:4 and 4:2:0 are currently supported by CfL");
+                       "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported by "
+                       "CfL, %d %d subsampling is not supported.\n",
+                       cm->subsampling_x, cm->subsampling_y);
   }
-  memset(&cfl->pred_buf_q3, 0, sizeof(cfl->pred_buf_q3));
+  memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3));
+  memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3));
   cfl->subsampling_x = cm->subsampling_x;
   cfl->subsampling_y = cm->subsampling_y;
   cfl->are_parameters_computed = 0;
   cfl->store_y = 0;
-#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-  cfl_clear_sub8x8_val(cfl);
-#endif  // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+  // The DC_PRED cache is disabled by default and is only enabled in
+  // cfl_rd_pick_alpha
+  cfl->use_dc_pred_cache = 0;
+  cfl->dc_pred_is_cached[CFL_PRED_U] = 0;
+  cfl->dc_pred_is_cached[CFL_PRED_V] = 0;
+}
+
+void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
+                       CFL_PRED_TYPE pred_plane, int width) {
+  assert(pred_plane < CFL_PRED_PLANES);
+  assert(width <= CFL_BUF_LINE);
+
+  if (get_bitdepth_data_path_index(xd)) {
+    uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input);
+    memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1);
+    return;
+  }
+
+  memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width);
+}
+
+static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst,
+                                 int dst_stride, int width, int height) {
+  for (int j = 0; j < height; j++) {
+    memcpy(dst, dc_pred_cache, width);
+    dst += dst_stride;
+  }
+}
+
+static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst,
+                                 int dst_stride, int width, int height) {
+  const size_t num_bytes = width << 1;
+  for (int j = 0; j < height; j++) {
+    memcpy(dst, dc_pred_cache, num_bytes);
+    dst += dst_stride;
+  }
+}
+void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+                      TX_SIZE tx_size, CFL_PRED_TYPE pred_plane) {
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  assert(pred_plane < CFL_PRED_PLANES);
+  assert(width <= CFL_BUF_LINE);
+  assert(height <= CFL_BUF_LINE);
+  if (get_bitdepth_data_path_index(xd)) {
+    uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
+    cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride,
+                         width, height);
+    return;
+  }
+  cfl_load_dc_pred_lbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride,
+                       width, height);
 }
 
 // Due to frame boundary issues, it is possible that the total area covered by
@@ -38,217 +95,54 @@ static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) {
 
   if (diff_width > 0) {
     const int min_height = height - diff_height;
-    int16_t *pred_buf_q3 = cfl->pred_buf_q3 + (width - diff_width);
+    uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width);
     for (int j = 0; j < min_height; j++) {
-      const int last_pixel = pred_buf_q3[-1];
+      const uint16_t last_pixel = recon_buf_q3[-1];
+      assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
       for (int i = 0; i < diff_width; i++) {
-        pred_buf_q3[i] = last_pixel;
+        recon_buf_q3[i] = last_pixel;
       }
-      pred_buf_q3 += MAX_SB_SIZE;
+      recon_buf_q3 += CFL_BUF_LINE;
     }
     cfl->buf_width = width;
   }
   if (diff_height > 0) {
-    int16_t *pred_buf_q3 =
-        cfl->pred_buf_q3 + ((height - diff_height) * MAX_SB_SIZE);
+    uint16_t *recon_buf_q3 =
+        cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE);
     for (int j = 0; j < diff_height; j++) {
-      const int16_t *last_row_q3 = pred_buf_q3 - MAX_SB_SIZE;
+      const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE;
+      assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
       for (int i = 0; i < width; i++) {
-        pred_buf_q3[i] = last_row_q3[i];
+        recon_buf_q3[i] = last_row_q3[i];
       }
-      pred_buf_q3 += MAX_SB_SIZE;
+      recon_buf_q3 += CFL_BUF_LINE;
     }
     cfl->buf_height = height;
   }
 }
 
-static void sum_above_row_lbd(const uint8_t *above_u, const uint8_t *above_v,
-                              int width, int *out_sum_u, int *out_sum_v) {
-  int sum_u = 0;
-  int sum_v = 0;
-  for (int i = 0; i < width; i++) {
-    sum_u += above_u[i];
-    sum_v += above_v[i];
-  }
-  *out_sum_u += sum_u;
-  *out_sum_v += sum_v;
-}
-#if CONFIG_HIGHBITDEPTH
-static void sum_above_row_hbd(const uint16_t *above_u, const uint16_t *above_v,
-                              int width, int *out_sum_u, int *out_sum_v) {
-  int sum_u = 0;
-  int sum_v = 0;
-  for (int i = 0; i < width; i++) {
-    sum_u += above_u[i];
-    sum_v += above_v[i];
-  }
-  *out_sum_u += sum_u;
-  *out_sum_v += sum_v;
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-static void sum_above_row(const MACROBLOCKD *xd, int width, int *out_sum_u,
-                          int *out_sum_v) {
-  const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U];
-  const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V];
-#if CONFIG_HIGHBITDEPTH
-  if (get_bitdepth_data_path_index(xd)) {
-    const uint16_t *above_u_16 =
-        CONVERT_TO_SHORTPTR(pd_u->dst.buf) - pd_u->dst.stride;
-    const uint16_t *above_v_16 =
-        CONVERT_TO_SHORTPTR(pd_v->dst.buf) - pd_v->dst.stride;
-    sum_above_row_hbd(above_u_16, above_v_16, width, out_sum_u, out_sum_v);
-    return;
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-  const uint8_t *above_u = pd_u->dst.buf - pd_u->dst.stride;
-  const uint8_t *above_v = pd_v->dst.buf - pd_v->dst.stride;
-  sum_above_row_lbd(above_u, above_v, width, out_sum_u, out_sum_v);
-}
-
-static void sum_left_col_lbd(const uint8_t *left_u, int u_stride,
-                             const uint8_t *left_v, int v_stride, int height,
-                             int *out_sum_u, int *out_sum_v) {
-  int sum_u = 0;
-  int sum_v = 0;
-  for (int i = 0; i < height; i++) {
-    sum_u += left_u[i * u_stride];
-    sum_v += left_v[i * v_stride];
-  }
-  *out_sum_u += sum_u;
-  *out_sum_v += sum_v;
-}
-#if CONFIG_HIGHBITDEPTH
-static void sum_left_col_hbd(const uint16_t *left_u, int u_stride,
-                             const uint16_t *left_v, int v_stride, int height,
-                             int *out_sum_u, int *out_sum_v) {
-  int sum_u = 0;
-  int sum_v = 0;
-  for (int i = 0; i < height; i++) {
-    sum_u += left_u[i * u_stride];
-    sum_v += left_v[i * v_stride];
-  }
-  *out_sum_u += sum_u;
-  *out_sum_v += sum_v;
-}
-#endif  // CONFIG_HIGHBITDEPTH
-static void sum_left_col(const MACROBLOCKD *xd, int height, int *out_sum_u,
-                         int *out_sum_v) {
-  const struct macroblockd_plane *const pd_u = &xd->plane[AOM_PLANE_U];
-  const struct macroblockd_plane *const pd_v = &xd->plane[AOM_PLANE_V];
-
-#if CONFIG_HIGHBITDEPTH
-  if (get_bitdepth_data_path_index(xd)) {
-    const uint16_t *left_u_16 = CONVERT_TO_SHORTPTR(pd_u->dst.buf) - 1;
-    const uint16_t *left_v_16 = CONVERT_TO_SHORTPTR(pd_v->dst.buf) - 1;
-    sum_left_col_hbd(left_u_16, pd_u->dst.stride, left_v_16, pd_v->dst.stride,
-                     height, out_sum_u, out_sum_v);
-    return;
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-  const uint8_t *left_u = pd_u->dst.buf - 1;
-  const uint8_t *left_v = pd_v->dst.buf - 1;
-  sum_left_col_lbd(left_u, pd_u->dst.stride, left_v, pd_v->dst.stride, height,
-                   out_sum_u, out_sum_v);
-}
-
-// CfL computes its own block-level DC_PRED. This is required to compute both
-// alpha_cb and alpha_cr before the prediction are computed.
-static void cfl_dc_pred(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) {
-  CFL_CTX *const cfl = xd->cfl;
-
-  // Compute DC_PRED until block boundary. We can't assume the neighbor will use
-  // the same transform size.
-  const int width = max_block_wide(xd, plane_bsize, AOM_PLANE_U)
-                    << tx_size_wide_log2[0];
-  const int height = max_block_high(xd, plane_bsize, AOM_PLANE_U)
-                     << tx_size_high_log2[0];
-  // Number of pixel on the top and left borders.
-  const int num_pel = width + height;
-
-  int sum_u = 0;
-  int sum_v = 0;
-
-// Match behavior of build_intra_predictors_high (reconintra.c) at superblock
-// boundaries:
-// base-1 base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
-// base+1   A      B  ..     Y      Z
-// base+1   C      D  ..     W      X
-// base+1   E      F  ..     U      V
-// base+1   G      H  ..     S      T      T      T      T      T
-// ..
-
-#if CONFIG_CHROMA_SUB8X8
-  if (xd->chroma_up_available && xd->mb_to_right_edge >= 0) {
-#else
-  if (xd->up_available && xd->mb_to_right_edge >= 0) {
-#endif
-    sum_above_row(xd, width, &sum_u, &sum_v);
-  } else {
-    const int base = 128 << (xd->bd - 8);
-    sum_u = width * (base - 1);
-    sum_v = width * (base - 1);
-  }
-
-#if CONFIG_CHROMA_SUB8X8
-  if (xd->chroma_left_available && xd->mb_to_bottom_edge >= 0) {
-#else
-  if (xd->left_available && xd->mb_to_bottom_edge >= 0) {
-#endif
-    sum_left_col(xd, height, &sum_u, &sum_v);
-  } else {
-    const int base = 128 << (xd->bd - 8);
-    sum_u += height * (base + 1);
-    sum_v += height * (base + 1);
+static void subtract_average_c(const uint16_t *src, int16_t *dst, int width,
+                               int height, int round_offset, int num_pel_log2) {
+  int sum = round_offset;
+  const uint16_t *recon = src;
+  for (int j = 0; j < height; j++) {
+    for (int i = 0; i < width; i++) {
+      sum += recon[i];
+    }
+    recon += CFL_BUF_LINE;
   }
-
-  // TODO(ltrudeau) Because of max_block_wide and max_block_high, num_pel will
-  // not be a power of two. So these divisions will have to use a lookup table.
-  cfl->dc_pred[CFL_PRED_U] = (sum_u + (num_pel >> 1)) / num_pel;
-  cfl->dc_pred[CFL_PRED_V] = (sum_v + (num_pel >> 1)) / num_pel;
-}
-
-static void cfl_subtract_averages(CFL_CTX *cfl, TX_SIZE tx_size) {
-  const int width = cfl->uv_width;
-  const int height = cfl->uv_height;
-  const int tx_height = tx_size_high[tx_size];
-  const int tx_width = tx_size_wide[tx_size];
-  const int block_row_stride = MAX_SB_SIZE << tx_size_high_log2[tx_size];
-  const int num_pel_log2 =
-      (tx_size_high_log2[tx_size] + tx_size_wide_log2[tx_size]);
-
-  int16_t *pred_buf_q3 = cfl->pred_buf_q3;
-
-  cfl_pad(cfl, width, height);
-
-  for (int b_j = 0; b_j < height; b_j += tx_height) {
-    for (int b_i = 0; b_i < width; b_i += tx_width) {
-      int sum_q3 = 0;
-      int16_t *tx_pred_buf_q3 = pred_buf_q3;
-      for (int t_j = 0; t_j < tx_height; t_j++) {
-        for (int t_i = b_i; t_i < b_i + tx_width; t_i++) {
-          sum_q3 += tx_pred_buf_q3[t_i];
-        }
-        tx_pred_buf_q3 += MAX_SB_SIZE;
-      }
-      int avg_q3 = (sum_q3 + (1 << (num_pel_log2 - 1))) >> num_pel_log2;
-      // Loss is never more than 1/2 (in Q3)
-      assert(fabs((double)avg_q3 - (sum_q3 / ((double)(1 << num_pel_log2)))) <=
-             0.5);
-
-      tx_pred_buf_q3 = pred_buf_q3;
-      for (int t_j = 0; t_j < tx_height; t_j++) {
-        for (int t_i = b_i; t_i < b_i + tx_width; t_i++) {
-          tx_pred_buf_q3[t_i] -= avg_q3;
-        }
-
-        tx_pred_buf_q3 += MAX_SB_SIZE;
-      }
+  const int avg = sum >> num_pel_log2;
+  for (int j = 0; j < height; j++) {
+    for (int i = 0; i < width; i++) {
+      dst[i] = src[i] - avg;
     }
-    pred_buf_q3 += block_row_stride;
+    src += CFL_BUF_LINE;
+    dst += CFL_BUF_LINE;
   }
 }
 
+CFL_SUB_AVG_FN(c)
+
 static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign,
                                    CFL_PRED_TYPE pred_type) {
   const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign)
@@ -259,159 +153,218 @@ static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign,
   return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1;
 }
 
-static void cfl_build_prediction_lbd(const int16_t *pred_buf_q3, uint8_t *dst,
-                                     int dst_stride, int width, int height,
-                                     int alpha_q3, int dc_pred) {
+static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst,
+                                     int dst_stride, int alpha_q3, int width,
+                                     int height) {
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
-      dst[i] =
-          clip_pixel(get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dc_pred);
+      dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i]);
     }
     dst += dst_stride;
-    pred_buf_q3 += MAX_SB_SIZE;
+    ac_buf_q3 += CFL_BUF_LINE;
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
-static void cfl_build_prediction_hbd(const int16_t *pred_buf_q3, uint16_t *dst,
-                                     int dst_stride, int width, int height,
-                                     int alpha_q3, int dc_pred, int bit_depth) {
+// Null function used for invalid tx_sizes
+void cfl_predict_lbd_null(const int16_t *ac_buf_q3, uint8_t *dst,
+                          int dst_stride, int alpha_q3) {
+  (void)ac_buf_q3;
+  (void)dst;
+  (void)dst_stride;
+  (void)alpha_q3;
+  assert(0);
+}
+
+CFL_PREDICT_FN(c, lbd)
+
+void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,
+                       int alpha_q3, int bit_depth, int width, int height) {
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
       dst[i] = clip_pixel_highbd(
-          get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dc_pred, bit_depth);
+          get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i], bit_depth);
     }
     dst += dst_stride;
-    pred_buf_q3 += MAX_SB_SIZE;
+    ac_buf_q3 += CFL_BUF_LINE;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
-static void cfl_build_prediction(const int16_t *pred_buf_q3, uint8_t *dst,
-                                 int dst_stride, int width, int height,
-                                 int alpha_q3, int dc_pred, int use_hbd,
-                                 int bit_depth) {
-#if CONFIG_HIGHBITDEPTH
-  if (use_hbd) {
-    uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
-    cfl_build_prediction_hbd(pred_buf_q3, dst_16, dst_stride, width, height,
-                             alpha_q3, dc_pred, bit_depth);
-    return;
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-  (void)use_hbd;
-  (void)bit_depth;
-  cfl_build_prediction_lbd(pred_buf_q3, dst, dst_stride, width, height,
-                           alpha_q3, dc_pred);
+// Null function used for invalid tx_sizes
+void cfl_predict_hbd_null(const int16_t *ac_buf_q3, uint16_t *dst,
+                          int dst_stride, int alpha_q3, int bd) {
+  (void)ac_buf_q3;
+  (void)dst;
+  (void)dst_stride;
+  (void)alpha_q3;
+  (void)bd;
+  assert(0);
+}
+
+CFL_PREDICT_FN(c, hbd)
+
+static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
+  CFL_CTX *const cfl = &xd->cfl;
+  // Do not call cfl_compute_parameters multiple time on the same values.
+  assert(cfl->are_parameters_computed == 0);
+
+  cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]);
+  get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3);
+  cfl->are_parameters_computed = 1;
 }
 
 void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
-                       int row, int col, TX_SIZE tx_size, int plane) {
-  CFL_CTX *const cfl = xd->cfl;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+                       TX_SIZE tx_size, int plane) {
+  CFL_CTX *const cfl = &xd->cfl;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  assert(is_cfl_allowed(xd));
 
-  // CfL parameters must be computed before prediction can be done.
-  assert(cfl->are_parameters_computed == 1);
+  if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size);
 
-  const int16_t *pred_buf_q3 =
-      cfl->pred_buf_q3 + ((row * MAX_SB_SIZE + col) << tx_size_wide_log2[0]);
   const int alpha_q3 =
       cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
+  assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
+         CFL_BUF_SQUARE);
+  if (get_bitdepth_data_path_index(xd)) {
+    uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
+    get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3,
+                                xd->bd);
+    return;
+  }
+  get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3);
+}
 
-  cfl_build_prediction(pred_buf_q3, dst, dst_stride, tx_size_wide[tx_size],
-                       tx_size_high[tx_size], alpha_q3, cfl->dc_pred[plane - 1],
-                       get_bitdepth_data_path_index(xd), xd->bd);
+// Null function used for invalid tx_sizes
+void cfl_subsample_lbd_null(const uint8_t *input, int input_stride,
+                            uint16_t *output_q3) {
+  (void)input;
+  (void)input_stride;
+  (void)output_q3;
+  assert(0);
 }
 
-static void cfl_luma_subsampling_420_lbd(const uint8_t *input, int input_stride,
-                                         int16_t *output_q3, int width,
-                                         int height) {
-  for (int j = 0; j < height; j++) {
-    for (int i = 0; i < width; i++) {
-      int top = i << 1;
-      int bot = top + input_stride;
-      output_q3[i] = (input[top] + input[top + 1] + input[bot] + input[bot + 1])
-                     << 1;
+// Null function used for invalid tx_sizes
+void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
+                            uint16_t *output_q3) {
+  (void)input;
+  (void)input_stride;
+  (void)output_q3;
+  assert(0);
+}
+
+static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  for (int j = 0; j < height; j += 2) {
+    for (int i = 0; i < width; i += 2) {
+      const int bot = i + input_stride;
+      output_q3[i >> 1] =
+          (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1;
     }
     input += input_stride << 1;
-    output_q3 += MAX_SB_SIZE;
+    output_q3 += CFL_BUF_LINE;
   }
 }
 
-static void cfl_luma_subsampling_444_lbd(const uint8_t *input, int input_stride,
-                                         int16_t *output_q3, int width,
-                                         int height) {
+static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
   for (int j = 0; j < height; j++) {
-    for (int i = 0; i < width; i++) {
-      output_q3[i] = input[i] << 3;
+    for (int i = 0; i < width; i += 2) {
+      output_q3[i >> 1] = (input[i] + input[i + 1]) << 2;
     }
     input += input_stride;
-    output_q3 += MAX_SB_SIZE;
+    output_q3 += CFL_BUF_LINE;
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
-static void cfl_luma_subsampling_420_hbd(const uint16_t *input,
-                                         int input_stride, int16_t *output_q3,
-                                         int width, int height) {
+static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
-      int top = i << 1;
-      int bot = top + input_stride;
-      output_q3[i] = (input[top] + input[top + 1] + input[bot] + input[bot + 1])
-                     << 1;
+      output_q3[i] = input[i] << 3;
+    }
+    input += input_stride;
+    output_q3 += CFL_BUF_LINE;
+  }
+}
+
+static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  for (int j = 0; j < height; j += 2) {
+    for (int i = 0; i < width; i += 2) {
+      const int bot = i + input_stride;
+      output_q3[i >> 1] =
+          (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1;
     }
     input += input_stride << 1;
-    output_q3 += MAX_SB_SIZE;
+    output_q3 += CFL_BUF_LINE;
   }
 }
 
-static void cfl_luma_subsampling_444_hbd(const uint16_t *input,
-                                         int input_stride, int16_t *output_q3,
-                                         int width, int height) {
+static void cfl_luma_subsampling_422_hbd_c(const uint16_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+  for (int j = 0; j < height; j++) {
+    for (int i = 0; i < width; i += 2) {
+      output_q3[i >> 1] = (input[i] + input[i + 1]) << 2;
+    }
+    input += input_stride;
+    output_q3 += CFL_BUF_LINE;
+  }
+}
+
+static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
       output_q3[i] = input[i] << 3;
     }
     input += input_stride;
-    output_q3 += MAX_SB_SIZE;
+    output_q3 += CFL_BUF_LINE;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
-static void cfl_luma_subsampling_420(const uint8_t *input, int input_stride,
-                                     int16_t *output_q3, int width, int height,
-                                     int use_hbd) {
-#if CONFIG_HIGHBITDEPTH
-  if (use_hbd) {
-    const uint16_t *input_16 = CONVERT_TO_SHORTPTR(input);
-    cfl_luma_subsampling_420_hbd(input_16, input_stride, output_q3, width,
-                                 height);
-    return;
+CFL_GET_SUBSAMPLE_FUNCTION(c)
+
+static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size,
+                                                       int sub_x, int sub_y) {
+  if (sub_x == 1) {
+    if (sub_y == 1) {
+      return cfl_get_luma_subsampling_420_hbd(tx_size);
+    }
+    return cfl_get_luma_subsampling_422_hbd(tx_size);
   }
-#endif  // CONFIG_HIGHBITDEPTH
-  (void)use_hbd;
-  cfl_luma_subsampling_420_lbd(input, input_stride, output_q3, width, height);
+  return cfl_get_luma_subsampling_444_hbd(tx_size);
 }
 
-static void cfl_luma_subsampling_444(const uint8_t *input, int input_stride,
-                                     int16_t *output_q3, int width, int height,
-                                     int use_hbd) {
-#if CONFIG_HIGHBITDEPTH
-  if (use_hbd) {
-    uint16_t *input_16 = CONVERT_TO_SHORTPTR(input);
-    cfl_luma_subsampling_444_hbd(input_16, input_stride, output_q3, width,
-                                 height);
-    return;
+static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size,
+                                                       int sub_x, int sub_y) {
+  if (sub_x == 1) {
+    if (sub_y == 1) {
+      return cfl_get_luma_subsampling_420_lbd(tx_size);
+    }
+    return cfl_get_luma_subsampling_422_lbd(tx_size);
   }
-#endif  // CONFIG_HIGHBITDEPTH
-  (void)use_hbd;
-  cfl_luma_subsampling_444_lbd(input, input_stride, output_q3, width, height);
+  return cfl_get_luma_subsampling_444_lbd(tx_size);
 }
 
-static INLINE void cfl_store(CFL_CTX *cfl, const uint8_t *input,
-                             int input_stride, int row, int col, int width,
-                             int height, int use_hbd) {
+static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride,
+                      int row, int col, TX_SIZE tx_size, int use_hbd) {
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
   const int tx_off_log2 = tx_size_wide_log2[0];
   const int sub_x = cfl->subsampling_x;
   const int sub_y = cfl->subsampling_y;
@@ -435,26 +388,22 @@ static INLINE void cfl_store(CFL_CTX *cfl, const uint8_t *input,
   }
 
   // Check that we will remain inside the pixel buffer.
-  assert(store_row + store_height <= MAX_SB_SIZE);
-  assert(store_col + store_width <= MAX_SB_SIZE);
+  assert(store_row + store_height <= CFL_BUF_LINE);
+  assert(store_col + store_width <= CFL_BUF_LINE);
 
   // Store the input into the CfL pixel buffer
-  int16_t *pred_buf_q3 =
-      cfl->pred_buf_q3 + (store_row * MAX_SB_SIZE + store_col);
-
-  if (sub_y == 0 && sub_x == 0) {
-    cfl_luma_subsampling_444(input, input_stride, pred_buf_q3, store_width,
-                             store_height, use_hbd);
-  } else if (sub_y == 1 && sub_x == 1) {
-    cfl_luma_subsampling_420(input, input_stride, pred_buf_q3, store_width,
-                             store_height, use_hbd);
+  uint16_t *recon_buf_q3 =
+      cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
+
+  if (use_hbd) {
+    cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
+                                               input_stride, recon_buf_q3);
   } else {
-    // TODO(ltrudeau) add support for 4:2:2
-    assert(0);  // Unsupported chroma subsampling
+    cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride,
+                                               recon_buf_q3);
   }
 }
 
-#if CONFIG_CHROMA_SUB8X8
 // Adjust the row and column of blocks smaller than 8X8, as chroma-referenced
 // and non-chroma-referenced blocks are stored together in the CfL buffer.
 static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int *row_out,
@@ -471,99 +420,36 @@ static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int *row_out,
     (*col_out)++;
   }
 }
-#if CONFIG_DEBUG
-static INLINE void sub8x8_set_val(CFL_CTX *cfl, int row, int col, int val_high,
-                                  int val_wide) {
-  for (int val_r = 0; val_r < val_high; val_r++) {
-    assert(row + val_r < CFL_SUB8X8_VAL_MI_SIZE);
-    int row_off = (row + val_r) * CFL_SUB8X8_VAL_MI_SIZE;
-    for (int val_c = 0; val_c < val_wide; val_c++) {
-      assert(col + val_c < CFL_SUB8X8_VAL_MI_SIZE);
-      assert(cfl->sub8x8_val[row_off + col + val_c] == 0);
-      cfl->sub8x8_val[row_off + col + val_c]++;
-    }
-  }
-}
-#endif  // CONFIG_DEBUG
-#endif  // CONFIG_CHROMA_SUB8X8
 
 void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
                   BLOCK_SIZE bsize) {
-  CFL_CTX *const cfl = xd->cfl;
+  CFL_CTX *const cfl = &xd->cfl;
   struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
   uint8_t *dst =
       &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
-  (void)bsize;
-#if CONFIG_CHROMA_SUB8X8
 
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
     // Only dimensions of size 4 can have an odd offset.
     assert(!((col & 1) && tx_size_wide[tx_size] != 4));
     assert(!((row & 1) && tx_size_high[tx_size] != 4));
     sub8x8_adjust_offset(cfl, &row, &col);
-#if CONFIG_DEBUG
-    sub8x8_set_val(cfl, row, col, tx_size_high_unit[tx_size],
-                   tx_size_wide_unit[tx_size]);
-#endif  // CONFIG_DEBUG
   }
-#endif
-  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size_wide[tx_size],
-            tx_size_high[tx_size], get_bitdepth_data_path_index(xd));
+  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size,
+            get_bitdepth_data_path_index(xd));
 }
 
 void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
-  CFL_CTX *const cfl = xd->cfl;
+  CFL_CTX *const cfl = &xd->cfl;
   struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
   int row = 0;
   int col = 0;
-#if CONFIG_CHROMA_SUB8X8
-  bsize = AOMMAX(BLOCK_4X4, bsize);
+
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
     sub8x8_adjust_offset(cfl, &row, &col);
-#if CONFIG_DEBUG
-    sub8x8_set_val(cfl, row, col, mi_size_high[bsize], mi_size_wide[bsize]);
-#endif  // CONFIG_DEBUG
   }
-#endif  // CONFIG_CHROMA_SUB8X8
   const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
   const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
-  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, width, height,
+  tx_size = get_tx_size(width, height);
+  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
             get_bitdepth_data_path_index(xd));
 }
-
-void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
-  CFL_CTX *const cfl = xd->cfl;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-
-  // Do not call cfl_compute_parameters multiple time on the same values.
-  assert(cfl->are_parameters_computed == 0);
-
-#if CONFIG_CHROMA_SUB8X8
-  const BLOCK_SIZE plane_bsize = AOMMAX(
-      BLOCK_4X4, get_plane_block_size(mbmi->sb_type, &xd->plane[AOM_PLANE_U]));
-#if CONFIG_DEBUG
-  if (mbmi->sb_type < BLOCK_8X8) {
-    for (int val_r = 0; val_r < mi_size_high[mbmi->sb_type]; val_r++) {
-      for (int val_c = 0; val_c < mi_size_wide[mbmi->sb_type]; val_c++) {
-        assert(cfl->sub8x8_val[val_r * CFL_SUB8X8_VAL_MI_SIZE + val_c] == 1);
-      }
-    }
-    cfl_clear_sub8x8_val(cfl);
-  }
-#endif  // CONFIG_DEBUG
-#else
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(mbmi->sb_type, &xd->plane[AOM_PLANE_U]);
-#endif
-  // AOM_PLANE_U is used, but both planes will have the same sizes.
-  cfl->uv_width = max_intra_block_width(xd, plane_bsize, AOM_PLANE_U, tx_size);
-  cfl->uv_height =
-      max_intra_block_height(xd, plane_bsize, AOM_PLANE_U, tx_size);
-
-  assert(cfl->buf_width <= cfl->uv_width);
-  assert(cfl->buf_height <= cfl->uv_height);
-
-  cfl_dc_pred(xd, plane_bsize);
-  cfl_subtract_averages(cfl, tx_size);
-  cfl->are_parameters_computed = 1;
-}
diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h
index 4ac0b401c..bc9fbce1b 100644
--- a/third_party/aom/av1/common/cfl.h
+++ b/third_party/aom/av1/common/cfl.h
@@ -13,20 +13,290 @@
 #define AV1_COMMON_CFL_H_
 
 #include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+
+// Can we use CfL for the current block?
+static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  assert(bsize < BLOCK_SIZES_ALL);
+  if (xd->lossless[mbmi->segment_id]) {
+    // In lossless, CfL is available when the partition size is equal to the
+    // transform size.
+    const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
+    const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
+    const int plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+    return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4);
+  }
+  // Spec: CfL is available to luma partitions lesser than or equal to 32x32
+  return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 &&
+                            block_size_high[bsize] <= 32);
+}
+
+// Do we need to save the luma pixels from the current block,
+// for a possible future CfL prediction?
+static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm,
+                                                  const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+
+  if (cm->seq_params.monochrome) return CFL_DISALLOWED;
+
+  if (!xd->cfl.is_chroma_reference) {
+    // For non-chroma-reference blocks, we should always store the luma pixels,
+    // in case the corresponding chroma-reference block uses CfL.
+    // Note that this can only happen for block sizes which are <8 on
+    // their shortest side, as otherwise they would be chroma reference
+    // blocks.
+    return CFL_ALLOWED;
+  }
+
+  // If this block has chroma information, we know whether we're
+  // actually going to perform a CfL prediction
+  return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) &&
+                            mbmi->uv_mode == UV_CFL_PRED);
+}
 
 static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) {
   int scaled_luma_q6 = alpha_q3 * pred_buf_q3;
   return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6);
 }
 
+static INLINE CFL_PRED_TYPE get_cfl_pred_type(PLANE_TYPE plane) {
+  assert(plane > 0);
+  return (CFL_PRED_TYPE)(plane - 1);
+}
+
 void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
-                       int row, int col, TX_SIZE tx_size, int plane);
+                       TX_SIZE tx_size, int plane);
 
 void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
 
 void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
                   BLOCK_SIZE bsize);
 
-void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size);
+void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
+                       CFL_PRED_TYPE pred_plane, int width);
+
+void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+                      TX_SIZE tx_size, CFL_PRED_TYPE pred_plane);
+
+// Null function used for invalid tx_sizes
+void cfl_subsample_lbd_null(const uint8_t *input, int input_stride,
+                            uint16_t *output_q3);
+
+// Null function used for invalid tx_sizes
+void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
+                            uint16_t *output_q3);
+
+// Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth.
+#define CFL_lbd_TYPE uint8_t *cfl_type
+#define CFL_hbd_TYPE uint16_t *cfl_type
+
+// Declare a size-specific wrapper for the size-generic function. The compiler
+// will inline the size generic function in here, the advantage is that the size
+// will be constant allowing for loop unrolling and other constant propagated
+// goodness.
+#define CFL_SUBSAMPLE(arch, sub, bd, width, height)                       \
+  void subsample_##bd##_##sub##_##width##x##height##_##arch(              \
+      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
+    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
+                                               output_q3, width, height); \
+  }
+
+// Declare size-specific wrappers for all valid CfL sizes.
+#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd)                            \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 4)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 8)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 16)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 32)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 8)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 4)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 16)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 8)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 32)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 16)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 16)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 4)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 32)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 8)                                     \
+  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \
+      TX_SIZE tx_size) {                                                  \
+    CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd)                           \
+    return subfn_##sub[tx_size];                                          \
+  }
+
+// Declare an architecture-specific array of function pointers for size-specific
+// wrappers.
+#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd)                       \
+  static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {      \
+    subsample_##bd##_##sub##_4x4_##arch,   /* 4x4 */                      \
+    subsample_##bd##_##sub##_8x8_##arch,   /* 8x8 */                      \
+    subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */                    \
+    subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */                    \
+    cfl_subsample_##bd##_null,             /* 64x64 (invalid CFL size) */ \
+    subsample_##bd##_##sub##_4x8_##arch,   /* 4x8 */                      \
+    subsample_##bd##_##sub##_8x4_##arch,   /* 8x4 */                      \
+    subsample_##bd##_##sub##_8x16_##arch,  /* 8x16 */                     \
+    subsample_##bd##_##sub##_16x8_##arch,  /* 16x8 */                     \
+    subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */                    \
+    subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */                    \
+    cfl_subsample_##bd##_null,             /* 32x64 (invalid CFL size) */ \
+    cfl_subsample_##bd##_null,             /* 64x32 (invalid CFL size) */ \
+    subsample_##bd##_##sub##_4x16_##arch,  /* 4x16  */                    \
+    subsample_##bd##_##sub##_16x4_##arch,  /* 16x4  */                    \
+    subsample_##bd##_##sub##_8x32_##arch,  /* 8x32  */                    \
+    subsample_##bd##_##sub##_32x8_##arch,  /* 32x8  */                    \
+    cfl_subsample_##bd##_null,             /* 16x64 (invalid CFL size) */ \
+    cfl_subsample_##bd##_null,             /* 64x16 (invalid CFL size) */ \
+  };
+
+// The RTCD script does not support passing in an array, so we wrap it in this
+// function.
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch)  \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
+
+// Null function used for invalid tx_sizes
+static INLINE void cfl_subtract_average_null(const uint16_t *src,
+                                             int16_t *dst) {
+  (void)dst;
+  (void)src;
+  assert(0);
+}
+
+// Declare a size-specific wrapper for the size-generic function. The compiler
+// will inline the size generic function in here, the advantage is that the size
+// will be constant allowing for loop unrolling and other constant propagated
+// goodness.
+#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2)   \
+  void subtract_average_##width##x##height##_##arch(const uint16_t *src, \
+                                                    int16_t *dst) {      \
+    subtract_average_##arch(src, dst, width, height, round_offset,       \
+                            num_pel_log2);                               \
+  }
+
+// Declare size-specific wrappers for all valid CfL sizes.
+#define CFL_SUB_AVG_FN(arch)                                                \
+  CFL_SUB_AVG_X(arch, 4, 4, 8, 4)                                           \
+  CFL_SUB_AVG_X(arch, 4, 8, 16, 5)                                          \
+  CFL_SUB_AVG_X(arch, 4, 16, 32, 6)                                         \
+  CFL_SUB_AVG_X(arch, 8, 4, 16, 5)                                          \
+  CFL_SUB_AVG_X(arch, 8, 8, 32, 6)                                          \
+  CFL_SUB_AVG_X(arch, 8, 16, 64, 7)                                         \
+  CFL_SUB_AVG_X(arch, 8, 32, 128, 8)                                        \
+  CFL_SUB_AVG_X(arch, 16, 4, 32, 6)                                         \
+  CFL_SUB_AVG_X(arch, 16, 8, 64, 7)                                         \
+  CFL_SUB_AVG_X(arch, 16, 16, 128, 8)                                       \
+  CFL_SUB_AVG_X(arch, 16, 32, 256, 9)                                       \
+  CFL_SUB_AVG_X(arch, 32, 8, 128, 8)                                        \
+  CFL_SUB_AVG_X(arch, 32, 16, 256, 9)                                       \
+  CFL_SUB_AVG_X(arch, 32, 32, 512, 10)                                      \
+  cfl_subtract_average_fn get_subtract_average_fn_##arch(TX_SIZE tx_size) { \
+    static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {          \
+      subtract_average_4x4_##arch,   /* 4x4 */                              \
+      subtract_average_8x8_##arch,   /* 8x8 */                              \
+      subtract_average_16x16_##arch, /* 16x16 */                            \
+      subtract_average_32x32_##arch, /* 32x32 */                            \
+      cfl_subtract_average_null,     /* 64x64 (invalid CFL size) */         \
+      subtract_average_4x8_##arch,   /* 4x8 */                              \
+      subtract_average_8x4_##arch,   /* 8x4 */                              \
+      subtract_average_8x16_##arch,  /* 8x16 */                             \
+      subtract_average_16x8_##arch,  /* 16x8 */                             \
+      subtract_average_16x32_##arch, /* 16x32 */                            \
+      subtract_average_32x16_##arch, /* 32x16 */                            \
+      cfl_subtract_average_null,     /* 32x64 (invalid CFL size) */         \
+      cfl_subtract_average_null,     /* 64x32 (invalid CFL size) */         \
+      subtract_average_4x16_##arch,  /* 4x16 (invalid CFL size) */          \
+      subtract_average_16x4_##arch,  /* 16x4 (invalid CFL size) */          \
+      subtract_average_8x32_##arch,  /* 8x32 (invalid CFL size) */          \
+      subtract_average_32x8_##arch,  /* 32x8 (invalid CFL size) */          \
+      cfl_subtract_average_null,     /* 16x64 (invalid CFL size) */         \
+      cfl_subtract_average_null,     /* 64x16 (invalid CFL size) */         \
+    };                                                                      \
+    /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */   \
+    /* index the function pointer array out of bounds. */                   \
+    return sub_avg[tx_size % TX_SIZES_ALL];                                 \
+  }
+
+// For VSX SIMD optimization, the C versions of width == 4 subtract are
+// faster than the VSX. As such, the VSX code calls the C versions.
+void subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
+void subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
+void subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
+
+#define CFL_PREDICT_lbd(arch, width, height)                                 \
+  void predict_lbd_##width##x##height##_##arch(const int16_t *pred_buf_q3,   \
+                                               uint8_t *dst, int dst_stride, \
+                                               int alpha_q3) {               \
+    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,    \
+                           height);                                          \
+  }
+
+#define CFL_PREDICT_hbd(arch, width, height)                                  \
+  void predict_hbd_##width##x##height##_##arch(const int16_t *pred_buf_q3,    \
+                                               uint16_t *dst, int dst_stride, \
+                                               int alpha_q3, int bd) {        \
+    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \
+                           height);                                           \
+  }
+
+// This wrapper exists because clang format does not like calling macros with
+// lowercase letters.
+#define CFL_PREDICT_X(arch, width, height, bd) \
+  CFL_PREDICT_##bd(arch, width, height)
+
+// Null function used for invalid tx_sizes
+void cfl_predict_lbd_null(const int16_t *pred_buf_q3, uint8_t *dst,
+                          int dst_stride, int alpha_q3);
+
+// Null function used for invalid tx_sizes
+void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst,
+                          int dst_stride, int alpha_q3, int bd);
+
+#define CFL_PREDICT_FN(arch, bd)                                          \
+  CFL_PREDICT_X(arch, 4, 4, bd)                                           \
+  CFL_PREDICT_X(arch, 4, 8, bd)                                           \
+  CFL_PREDICT_X(arch, 4, 16, bd)                                          \
+  CFL_PREDICT_X(arch, 8, 4, bd)                                           \
+  CFL_PREDICT_X(arch, 8, 8, bd)                                           \
+  CFL_PREDICT_X(arch, 8, 16, bd)                                          \
+  CFL_PREDICT_X(arch, 8, 32, bd)                                          \
+  CFL_PREDICT_X(arch, 16, 4, bd)                                          \
+  CFL_PREDICT_X(arch, 16, 8, bd)                                          \
+  CFL_PREDICT_X(arch, 16, 16, bd)                                         \
+  CFL_PREDICT_X(arch, 16, 32, bd)                                         \
+  CFL_PREDICT_X(arch, 32, 8, bd)                                          \
+  CFL_PREDICT_X(arch, 32, 16, bd)                                         \
+  CFL_PREDICT_X(arch, 32, 32, bd)                                         \
+  cfl_predict_##bd##_fn get_predict_##bd##_fn_##arch(TX_SIZE tx_size) {   \
+    static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = {             \
+      predict_##bd##_4x4_##arch,   /* 4x4 */                              \
+      predict_##bd##_8x8_##arch,   /* 8x8 */                              \
+      predict_##bd##_16x16_##arch, /* 16x16 */                            \
+      predict_##bd##_32x32_##arch, /* 32x32 */                            \
+      cfl_predict_##bd##_null,     /* 64x64 (invalid CFL size) */         \
+      predict_##bd##_4x8_##arch,   /* 4x8 */                              \
+      predict_##bd##_8x4_##arch,   /* 8x4 */                              \
+      predict_##bd##_8x16_##arch,  /* 8x16 */                             \
+      predict_##bd##_16x8_##arch,  /* 16x8 */                             \
+      predict_##bd##_16x32_##arch, /* 16x32 */                            \
+      predict_##bd##_32x16_##arch, /* 32x16 */                            \
+      cfl_predict_##bd##_null,     /* 32x64 (invalid CFL size) */         \
+      cfl_predict_##bd##_null,     /* 64x32 (invalid CFL size) */         \
+      predict_##bd##_4x16_##arch,  /* 4x16  */                            \
+      predict_##bd##_16x4_##arch,  /* 16x4  */                            \
+      predict_##bd##_8x32_##arch,  /* 8x32  */                            \
+      predict_##bd##_32x8_##arch,  /* 32x8  */                            \
+      cfl_predict_##bd##_null,     /* 16x64 (invalid CFL size) */         \
+      cfl_predict_##bd##_null,     /* 64x16 (invalid CFL size) */         \
+    };                                                                    \
+    /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+    /* index the function pointer array out of bounds. */                 \
+    return pred[tx_size % TX_SIZES_ALL];                                  \
+  }
 
 #endif  // AV1_COMMON_CFL_H_
diff --git a/third_party/aom/av1/common/clpf.c b/third_party/aom/av1/common/clpf.c
deleted file mode 100644
index d643236aa..000000000
--- a/third_party/aom/av1/common/clpf.c
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "./cdef.h"
-#include "aom/aom_image.h"
-#include "aom_dsp/aom_dsp_common.h"
-
-static int clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
-                       int H, int s, unsigned int dmp) {
-  int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) +
-              1 * constrain(C - X, s, dmp) + 3 * constrain(D - X, s, dmp) +
-              3 * constrain(E - X, s, dmp) + 1 * constrain(F - X, s, dmp) +
-              3 * constrain(G - X, s, dmp) + 1 * constrain(H - X, s, dmp);
-  return (8 + delta - (delta < 0)) >> 4;
-}
-
-static int clpf_hsample(int X, int A, int B, int C, int D, int s,
-                        unsigned int dmp) {
-  int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) +
-              3 * constrain(C - X, s, dmp) + 1 * constrain(D - X, s, dmp);
-  return (4 + delta - (delta < 0)) >> 3;
-}
-
-void aom_clpf_block_c(uint8_t *dst, const uint16_t *src, int dstride,
-                      int sstride, int sizex, int sizey, unsigned int strength,
-                      unsigned int damping) {
-  int x, y;
-
-  for (y = 0; y < sizey; y++) {
-    for (x = 0; x < sizex; x++) {
-      const int X = src[y * sstride + x];
-      const int A = src[(y - 2) * sstride + x];
-      const int B = src[(y - 1) * sstride + x];
-      const int C = src[y * sstride + x - 2];
-      const int D = src[y * sstride + x - 1];
-      const int E = src[y * sstride + x + 1];
-      const int F = src[y * sstride + x + 2];
-      const int G = src[(y + 1) * sstride + x];
-      const int H = src[(y + 2) * sstride + x];
-      const int delta =
-          clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping);
-      dst[y * dstride + x] = X + delta;
-    }
-  }
-}
-
-// Identical to aom_clpf_block_c() apart from "dst".
-void aom_clpf_block_hbd_c(uint16_t *dst, const uint16_t *src, int dstride,
-                          int sstride, int sizex, int sizey,
-                          unsigned int strength, unsigned int damping) {
-  int x, y;
-
-  for (y = 0; y < sizey; y++) {
-    for (x = 0; x < sizex; x++) {
-      const int X = src[y * sstride + x];
-      const int A = src[(y - 2) * sstride + x];
-      const int B = src[(y - 1) * sstride + x];
-      const int C = src[y * sstride + x - 2];
-      const int D = src[y * sstride + x - 1];
-      const int E = src[y * sstride + x + 1];
-      const int F = src[y * sstride + x + 2];
-      const int G = src[(y + 1) * sstride + x];
-      const int H = src[(y + 2) * sstride + x];
-      const int delta =
-          clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping);
-      dst[y * dstride + x] = X + delta;
-    }
-  }
-}
-
-// Vertically restricted filter
-void aom_clpf_hblock_c(uint8_t *dst, const uint16_t *src, int dstride,
-                       int sstride, int sizex, int sizey, unsigned int strength,
-                       unsigned int damping) {
-  int x, y;
-
-  for (y = 0; y < sizey; y++) {
-    for (x = 0; x < sizex; x++) {
-      const int X = src[y * sstride + x];
-      const int A = src[y * sstride + x - 2];
-      const int B = src[y * sstride + x - 1];
-      const int C = src[y * sstride + x + 1];
-      const int D = src[y * sstride + x + 2];
-      const int delta = clpf_hsample(X, A, B, C, D, strength, damping);
-      dst[y * dstride + x] = X + delta;
-    }
-  }
-}
-
-void aom_clpf_hblock_hbd_c(uint16_t *dst, const uint16_t *src, int dstride,
-                           int sstride, int sizex, int sizey,
-                           unsigned int strength, unsigned int damping) {
-  int x, y;
-
-  for (y = 0; y < sizey; y++) {
-    for (x = 0; x < sizex; x++) {
-      const int X = src[y * sstride + x];
-      const int A = src[y * sstride + x - 2];
-      const int B = src[y * sstride + x - 1];
-      const int C = src[y * sstride + x + 1];
-      const int D = src[y * sstride + x + 2];
-      const int delta = clpf_hsample(X, A, B, C, D, strength, damping);
-      dst[y * dstride + x] = X + delta;
-    }
-  }
-}
diff --git a/third_party/aom/av1/common/clpf_neon.c b/third_party/aom/av1/common/clpf_neon.c
deleted file mode 100644
index f1a004c2c..000000000
--- a/third_party/aom/av1/common/clpf_neon.c
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_neon
-#include "./clpf_simd.h"
diff --git a/third_party/aom/av1/common/clpf_simd.h b/third_party/aom/av1/common/clpf_simd.h
deleted file mode 100644
index c7ffc569a..000000000
--- a/third_party/aom/av1/common/clpf_simd.h
+++ /dev/null
@@ -1,456 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "aom_ports/bitops.h"
-#include "aom_ports/mem.h"
-
-// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
-SIMD_INLINE v128 constrain16(v128 a, v128 b, unsigned int threshold,
-                             unsigned int adjdamp) {
-  v128 diff = v128_sub_16(a, b);
-  const v128 sign = v128_shr_n_s16(diff, 15);
-  diff = v128_abs_s16(diff);
-  const v128 s =
-      v128_ssub_u16(v128_dup_16(threshold), v128_shr_u16(diff, adjdamp));
-  return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);
-}
-
-// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
-SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
-                           unsigned int adjdamp) {
-  const v256 diff16 = v256_sub_16(a, b);
-  v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
-  const v128 sign = v128_cmplt_s8(diff, v128_zero());
-  diff = v128_abs_s8(diff);
-  return v128_xor(
-      v128_add_8(sign,
-                 v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
-                                                v128_shr_u8(diff, adjdamp)))),
-      sign);
-}
-
-// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
-//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
-//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
-//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
-SIMD_INLINE v128 calc_delta(v256 x, v256 a, v256 b, v256 c, v256 d, v256 e,
-                            v256 f, v256 g, v256 h, unsigned int s,
-                            unsigned int dmp) {
-  const v128 bdeg =
-      v128_add_8(v128_add_8(constrain(b, x, s, dmp), constrain(d, x, s, dmp)),
-                 v128_add_8(constrain(e, x, s, dmp), constrain(g, x, s, dmp)));
-  const v128 delta = v128_add_8(
-      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(c, x, s, dmp)),
-                 v128_add_8(constrain(f, x, s, dmp), constrain(h, x, s, dmp))),
-      v128_add_8(v128_add_8(bdeg, bdeg), bdeg));
-  return v128_add_8(
-      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
-      v128_shr_s8(
-          v128_add_8(v128_dup_8(8),
-                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
-          4));
-}
-
-// delta = 1/8 * constrain(a, x, s, d) + 3/8 * constrain(b, x, s, d) +
-//         3/8 * constrain(c, x, s, d) + 1/8 * constrain(d, x, s, d) +
-SIMD_INLINE v128 calc_hdelta(v256 x, v256 a, v256 b, v256 c, v256 d,
-                             unsigned int s, unsigned int dmp) {
-  const v128 bc = v128_add_8(constrain(b, x, s, dmp), constrain(c, x, s, dmp));
-  const v128 delta =
-      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(d, x, s, dmp)),
-                 v128_add_8(v128_add_8(bc, bc), bc));
-  return v128_add_8(
-      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
-      v128_shr_s8(
-          v128_add_8(v128_dup_8(4),
-                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
-          3));
-}
-
-// Process blocks of width 8, two lines at a time, 8 bit.
-static void SIMD_FUNC(clpf_block8)(uint8_t *dst, const uint16_t *src,
-                                   int dstride, int sstride, int sizey,
-                                   unsigned int strength,
-                                   unsigned int adjdamp) {
-  int y;
-
-  for (y = 0; y < sizey; y += 2) {
-    const v128 l1 = v128_load_aligned(src);
-    const v128 l2 = v128_load_aligned(src + sstride);
-    const v128 l3 = v128_load_aligned(src - sstride);
-    const v128 l4 = v128_load_aligned(src + 2 * sstride);
-    const v256 a = v256_from_v128(v128_load_aligned(src - 2 * sstride), l3);
-    const v256 b = v256_from_v128(l3, l1);
-    const v256 g = v256_from_v128(l2, l4);
-    const v256 h = v256_from_v128(l4, v128_load_aligned(src + 3 * sstride));
-    const v256 c = v256_from_v128(v128_load_unaligned(src - 2),
-                                  v128_load_unaligned(src - 2 + sstride));
-    const v256 d = v256_from_v128(v128_load_unaligned(src - 1),
-                                  v128_load_unaligned(src - 1 + sstride));
-    const v256 e = v256_from_v128(v128_load_unaligned(src + 1),
-                                  v128_load_unaligned(src + 1 + sstride));
-    const v256 f = v256_from_v128(v128_load_unaligned(src + 2),
-                                  v128_load_unaligned(src + 2 + sstride));
-    const v128 o = calc_delta(v256_from_v128(l1, l2), a, b, c, d, e, f, g, h,
-                              strength, adjdamp);
-
-    v64_store_aligned(dst, v128_high_v64(o));
-    v64_store_aligned(dst + dstride, v128_low_v64(o));
-    src += sstride * 2;
-    dst += dstride * 2;
-  }
-}
-
-// Process blocks of width 4, four lines at a time, 8 bit.
-static void SIMD_FUNC(clpf_block4)(uint8_t *dst, const uint16_t *src,
-                                   int dstride, int sstride, int sizey,
-                                   unsigned int strength,
-                                   unsigned int adjdamp) {
-  int y;
-
-  for (y = 0; y < sizey; y += 4) {
-    const v64 l0 = v64_load_aligned(src - 2 * sstride);
-    const v64 l1 = v64_load_aligned(src - sstride);
-    const v64 l2 = v64_load_aligned(src);
-    const v64 l3 = v64_load_aligned(src + sstride);
-    const v64 l4 = v64_load_aligned(src + 2 * sstride);
-    const v64 l5 = v64_load_aligned(src + 3 * sstride);
-    const v64 l6 = v64_load_aligned(src + 4 * sstride);
-    const v64 l7 = v64_load_aligned(src + 5 * sstride);
-    const v128 o =
-        calc_delta(v256_from_v64(l2, l3, l4, l5), v256_from_v64(l0, l1, l2, l3),
-                   v256_from_v64(l1, l2, l3, l4),
-                   v256_from_v64(v64_load_unaligned(src - 2),
-                                 v64_load_unaligned(src + sstride - 2),
-                                 v64_load_unaligned(src + 2 * sstride - 2),
-                                 v64_load_unaligned(src + 3 * sstride - 2)),
-                   v256_from_v64(v64_load_unaligned(src - 1),
-                                 v64_load_unaligned(src + sstride - 1),
-                                 v64_load_unaligned(src + 2 * sstride - 1),
-                                 v64_load_unaligned(src + 3 * sstride - 1)),
-                   v256_from_v64(v64_load_unaligned(src + 1),
-                                 v64_load_unaligned(src + sstride + 1),
-                                 v64_load_unaligned(src + 2 * sstride + 1),
-                                 v64_load_unaligned(src + 3 * sstride + 1)),
-                   v256_from_v64(v64_load_unaligned(src + 2),
-                                 v64_load_unaligned(src + sstride + 2),
-                                 v64_load_unaligned(src + 2 * sstride + 2),
-                                 v64_load_unaligned(src + 3 * sstride + 2)),
-                   v256_from_v64(l3, l4, l5, l6), v256_from_v64(l4, l5, l6, l7),
-                   strength, adjdamp);
-
-    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
-    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
-    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
-    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
-
-    dst += 4 * dstride;
-    src += 4 * sstride;
-  }
-}
-
-static void SIMD_FUNC(clpf_hblock8)(uint8_t *dst, const uint16_t *src,
-                                    int dstride, int sstride, int sizey,
-                                    unsigned int strength,
-                                    unsigned int adjdamp) {
-  int y;
-
-  for (y = 0; y < sizey; y += 2) {
-    const v256 x = v256_from_v128(v128_load_aligned(src),
-                                  v128_load_aligned(src + sstride));
-    const v256 a = v256_from_v128(v128_load_unaligned(src - 2),
-                                  v128_load_unaligned(src - 2 + sstride));
-    const v256 b = v256_from_v128(v128_load_unaligned(src - 1),
-                                  v128_load_unaligned(src - 1 + sstride));
-    const v256 c = v256_from_v128(v128_load_unaligned(src + 1),
-                                  v128_load_unaligned(src + 1 + sstride));
-    const v256 d = v256_from_v128(v128_load_unaligned(src + 2),
-                                  v128_load_unaligned(src + 2 + sstride));
-    const v128 o = calc_hdelta(x, a, b, c, d, strength, adjdamp);
-
-    v64_store_aligned(dst, v128_high_v64(o));
-    v64_store_aligned(dst + dstride, v128_low_v64(o));
-    src += sstride * 2;
-    dst += dstride * 2;
-  }
-}
-
-// Process blocks of width 4, four lines at a time, 8 bit.
-static void SIMD_FUNC(clpf_hblock4)(uint8_t *dst, const uint16_t *src,
-                                    int dstride, int sstride, int sizey,
-                                    unsigned int strength,
-                                    unsigned int adjdamp) {
-  int y;
-
-  for (y = 0; y < sizey; y += 4) {
-    const v256 a = v256_from_v64(v64_load_unaligned(src - 2),
-                                 v64_load_unaligned(src + sstride - 2),
-                                 v64_load_unaligned(src + 2 * sstride - 2),
-                                 v64_load_unaligned(src + 3 * sstride - 2));
-    const v256 b = v256_from_v64(v64_load_unaligned(src - 1),
-                                 v64_load_unaligned(src + sstride - 1),
-                                 v64_load_unaligned(src + 2 * sstride - 1),
-                                 v64_load_unaligned(src + 3 * sstride - 1));
-    const v256 c = v256_from_v64(v64_load_unaligned(src + 1),
-                                 v64_load_unaligned(src + sstride + 1),
-                                 v64_load_unaligned(src + 2 * sstride + 1),
-                                 v64_load_unaligned(src + 3 * sstride + 1));
-    const v256 d = v256_from_v64(v64_load_unaligned(src + 2),
-                                 v64_load_unaligned(src + sstride + 2),
-                                 v64_load_unaligned(src + 2 * sstride + 2),
-                                 v64_load_unaligned(src + 3 * sstride + 2));
-
-    const v128 o = calc_hdelta(
-        v256_from_v64(v64_load_aligned(src), v64_load_aligned(src + sstride),
-                      v64_load_aligned(src + 2 * sstride),
-                      v64_load_aligned(src + 3 * sstride)),
-        a, b, c, d, strength, adjdamp);
-
-    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
-    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
-    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
-    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
-
-    dst += 4 * dstride;
-    src += 4 * sstride;
-  }
-}
-
-void SIMD_FUNC(aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride,
-                               int sstride, int sizex, int sizey,
-                               unsigned int strength, unsigned int dmp) {
-  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
-    // Fallback to C for odd sizes:
-    // * block widths not 4 or 8
-    // * block heights not a multiple of 4 if the block width is 4
-    aom_clpf_block_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
-  } else {
-    (sizex == 4 ? SIMD_FUNC(clpf_block4) : SIMD_FUNC(clpf_block8))(
-        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
-  }
-}
-
-void SIMD_FUNC(aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride,
-                                int sstride, int sizex, int sizey,
-                                unsigned int strength, unsigned int dmp) {
-  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
-    // Fallback to C for odd sizes:
-    // * block widths not 4 or 8
-    // * block heights not a multiple of 4 if the block width is 4
-    aom_clpf_hblock_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
-  } else {
-    (sizex == 4 ? SIMD_FUNC(clpf_hblock4) : SIMD_FUNC(clpf_hblock8))(
-        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
-  }
-}
-
-// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
-//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
-//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
-//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
-SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
-                                v128 f, v128 g, v128 h, unsigned int s,
-                                unsigned int dmp) {
-  const v128 bdeg = v128_add_16(
-      v128_add_16(constrain16(b, x, s, dmp), constrain16(d, x, s, dmp)),
-      v128_add_16(constrain16(e, x, s, dmp), constrain16(g, x, s, dmp)));
-  const v128 delta = v128_add_16(
-      v128_add_16(
-          v128_add_16(constrain16(a, x, s, dmp), constrain16(c, x, s, dmp)),
-          v128_add_16(constrain16(f, x, s, dmp), constrain16(h, x, s, dmp))),
-      v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
-  return v128_add_16(
-      x,
-      v128_shr_s16(
-          v128_add_16(v128_dup_16(8),
-                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
-          4));
-}
-
-static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
-                            v128 f, v128 g, v128 h, uint16_t *dst,
-                            unsigned int s, unsigned int dmp, int dstride) {
-  o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);
-  v64_store_aligned(dst, v128_high_v64(o));
-  v64_store_aligned(dst + dstride, v128_low_v64(o));
-}
-
-static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
-                            v128 f, v128 g, v128 h, uint16_t *dst,
-                            unsigned int s, unsigned int adjdamp) {
-  v128_store_aligned(dst,
-                     calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, adjdamp));
-}
-
-// delta = 1/16 * constrain(a, x, s, dmp) + 3/16 * constrain(b, x, s, dmp) +
-//         3/16 * constrain(c, x, s, dmp) + 1/16 * constrain(d, x, s, dmp)
-SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
-                                 unsigned int s, unsigned int dmp) {
-  const v128 bc =
-      v128_add_16(constrain16(b, x, s, dmp), constrain16(c, x, s, dmp));
-  const v128 delta = v128_add_16(
-      v128_add_16(constrain16(a, x, s, dmp), constrain16(d, x, s, dmp)),
-      v128_add_16(v128_add_16(bc, bc), bc));
-  return v128_add_16(
-      x,
-      v128_shr_s16(
-          v128_add_16(v128_dup_16(4),
-                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
-          3));
-}
-
-static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d,
-                             uint16_t *dst, unsigned int s,
-                             unsigned int adjdamp, int dstride) {
-  o = calc_hdelta_hbd(o, a, b, c, d, s, adjdamp);
-  v64_store_aligned(dst, v128_high_v64(o));
-  v64_store_aligned(dst + dstride, v128_low_v64(o));
-}
-
-static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d,
-                             uint16_t *dst, unsigned int s,
-                             unsigned int adjdamp) {
-  v128_store_aligned(dst, calc_hdelta_hbd(o, a, b, c, d, s, adjdamp));
-}
-
-// Process blocks of width 4, two lines at time.
-static void SIMD_FUNC(clpf_block_hbd4)(uint16_t *dst, const uint16_t *src,
-                                       int dstride, int sstride, int sizey,
-                                       unsigned int strength,
-                                       unsigned int adjdamp) {
-  int y;
-
-  for (y = 0; y < sizey; y += 2) {
-    const v64 l1 = v64_load_aligned(src);
-    const v64 l2 = v64_load_aligned(src + sstride);
-    const v64 l3 = v64_load_aligned(src - sstride);
-    const v64 l4 = v64_load_aligned(src + 2 * sstride);
-    const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3);
-    const v128 b = v128_from_v64(l3, l1);
-    const v128 g = v128_from_v64(l2, l4);
-    const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride));
-    const v128 c = v128_from_v64(v64_load_unaligned(src - 2),
-                                 v64_load_unaligned(src - 2 + sstride));
-    const v128 d = v128_from_v64(v64_load_unaligned(src - 1),
-                                 v64_load_unaligned(src - 1 + sstride));
-    const v128 e = v128_from_v64(v64_load_unaligned(src + 1),
-                                 v64_load_unaligned(src + 1 + sstride));
-    const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
-                                 v64_load_unaligned(src + 2 + sstride));
-
-    calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst,
-                    strength, adjdamp, dstride);
-    src += sstride * 2;
-    dst += dstride * 2;
-  }
-}
-
-// The most simple case.  Start here if you need to understand the functions.
-static void SIMD_FUNC(clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
-                                      int dstride, int sstride, int sizey,
-                                      unsigned int strength,
-                                      unsigned int adjdamp) {
-  int y;
-
-  for (y = 0; y < sizey; y++) {
-    const v128 o = v128_load_aligned(src);
-    const v128 a = v128_load_aligned(src - 2 * sstride);
-    const v128 b = v128_load_aligned(src - 1 * sstride);
-    const v128 g = v128_load_aligned(src + sstride);
-    const v128 h = v128_load_aligned(src + 2 * sstride);
-    const v128 c = v128_load_unaligned(src - 2);
-    const v128 d = v128_load_unaligned(src - 1);
-    const v128 e = v128_load_unaligned(src + 1);
-    const v128 f = v128_load_unaligned(src + 2);
-
-    calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, adjdamp);
-    src += sstride;
-    dst += dstride;
-  }
-}
-
-// Process blocks of width 4, horizontal filter, two lines at time.
-static void SIMD_FUNC(clpf_hblock_hbd4)(uint16_t *dst, const uint16_t *src,
-                                        int dstride, int sstride, int sizey,
-                                        unsigned int strength,
-                                        unsigned int adjdamp) {
-  int y;
-
-  for (y = 0; y < sizey; y += 2) {
-    const v128 a = v128_from_v64(v64_load_unaligned(src - 2),
-                                 v64_load_unaligned(src - 2 + sstride));
-    const v128 b = v128_from_v64(v64_load_unaligned(src - 1),
-                                 v64_load_unaligned(src - 1 + sstride));
-    const v128 c = v128_from_v64(v64_load_unaligned(src + 1),
-                                 v64_load_unaligned(src + 1 + sstride));
-    const v128 d = v128_from_v64(v64_load_unaligned(src + 2),
-                                 v64_load_unaligned(src + 2 + sstride));
-
-    calc_hdelta_hbd4(v128_from_v64(v64_load_unaligned(src),
-                                   v64_load_unaligned(src + sstride)),
-                     a, b, c, d, dst, strength, adjdamp, dstride);
-    src += sstride * 2;
-    dst += dstride * 2;
-  }
-}
-
-// Process blocks of width 8, horizontal filter, two lines at time.
-static void SIMD_FUNC(clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
-                                       int dstride, int sstride, int sizey,
-                                       unsigned int strength,
-                                       unsigned int adjdamp) {
-  int y;
-
-  for (y = 0; y < sizey; y++) {
-    const v128 o = v128_load_aligned(src);
-    const v128 a = v128_load_unaligned(src - 2);
-    const v128 b = v128_load_unaligned(src - 1);
-    const v128 c = v128_load_unaligned(src + 1);
-    const v128 d = v128_load_unaligned(src + 2);
-
-    calc_hdelta_hbd8(o, a, b, c, d, dst, strength, adjdamp);
-    src += sstride;
-    dst += dstride;
-  }
-}
-
-void SIMD_FUNC(aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
-                                   int dstride, int sstride, int sizex,
-                                   int sizey, unsigned int strength,
-                                   unsigned int dmp) {
-  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
-    // Fallback to C for odd sizes:
-    // * block width not 4 or 8
-    // * block heights not a multiple of 2 if the block width is 4
-    aom_clpf_block_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
-                         dmp);
-  } else {
-    (sizex == 4 ? SIMD_FUNC(clpf_block_hbd4) : SIMD_FUNC(clpf_block_hbd))(
-        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
-  }
-}
-
-void SIMD_FUNC(aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
-                                    int dstride, int sstride, int sizex,
-                                    int sizey, unsigned int strength,
-                                    unsigned int dmp) {
-  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
-    // Fallback to C for odd sizes:
-    // * block width not 4 or 8
-    // * block heights not a multiple of 2 if the block width is 4
-    aom_clpf_hblock_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
-                          dmp);
-  } else {
-    (sizex == 4 ? SIMD_FUNC(clpf_hblock_hbd4) : SIMD_FUNC(clpf_hblock_hbd))(
-        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
-  }
-}
diff --git a/third_party/aom/av1/common/clpf_sse2.c b/third_party/aom/av1/common/clpf_sse2.c
deleted file mode 100644
index e29c2ab7e..000000000
--- a/third_party/aom/av1/common/clpf_sse2.c
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_sse2
-#include "./clpf_simd.h"
diff --git a/third_party/aom/av1/common/clpf_ssse3.c b/third_party/aom/av1/common/clpf_ssse3.c
deleted file mode 100644
index d7ed8dec5..000000000
--- a/third_party/aom/av1/common/clpf_ssse3.c
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_ssse3
-#include "./clpf_simd.h"
diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h
index 8611b776f..72c6d3a1e 100644
--- a/third_party/aom/av1/common/common.h
+++ b/third_party/aom/av1/common/common.h
@@ -20,6 +20,7 @@
 #include "aom_mem/aom_mem.h"
 #include "aom/aom_integer.h"
 #include "aom_ports/bitops.h"
+#include "config/aom_config.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -53,6 +54,8 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
 
 #define AOM_FRAME_MARKER 0x2
 
+#define AV1_MIN_TILE_SIZE_BYTES 1
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h
index 1a74fe76e..f521f10bf 100644
--- a/third_party/aom/av1/common/common_data.h
+++ b/third_party/aom/av1/common/common_data.h
@@ -20,600 +20,78 @@
 extern "C" {
 #endif
 
-#if CONFIG_EXT_PARTITION
-#define IF_EXT_PARTITION(...) __VA_ARGS__,
-#else
-#define IF_EXT_PARTITION(...)
-#endif
-
-// Log 2 conversion lookup tables for block width and height
-static const uint8_t b_width_log2_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  0, 0,
-  0,
-#endif
-  0, 0,
-  1, 1,
-  1, 2,
-  2, 2,
-  3, 3,
-  3, 4,
-  4, IF_EXT_PARTITION(4, 5, 5) 0,
-  2, 1,
-  3, 2,
-  4, IF_EXT_PARTITION(3, 5)
-};
-static const uint8_t b_height_log2_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  0, 0,
-  0,
-#endif
-  0, 1,
-  0, 1,
-  2, 1,
-  2, 3,
-  2, 3,
-  4, 3,
-  4, IF_EXT_PARTITION(5, 4, 5) 2,
-  0, 3,
-  1, 4,
-  2, IF_EXT_PARTITION(5, 3)
+// Log 2 conversion lookup tables in units of mode info(4x4).
+static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = {
+  0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4
 };
-// Log 2 conversion lookup tables for modeinfo width and height
-static const uint8_t mi_width_log2_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CB4X4
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  0, 0,
-  0,
-#endif
-  0, 0,
-  1, 1,
-  1, 2,
-  2, 2,
-  3, 3,
-  3, 4,
-  4, IF_EXT_PARTITION(4, 5, 5) 0,
-  2, 1,
-  3, 2,
-  4, IF_EXT_PARTITION(3, 5)
-#else  // CONFIG_CB4X4
-  0, 0,
-  0, 0,
-  0, 1,
-  1, 1,
-  2, 2,
-  2, 3,
-  3, IF_EXT_PARTITION(3, 4, 4) 0,
-  1, 0,
-  2, 1,
-  3, IF_EXT_PARTITION(2, 4)
-#endif
-};
-static const uint8_t mi_height_log2_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CB4X4
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  0, 0,
-  0,
-#endif
-  0, 1,
-  0, 1,
-  2, 1,
-  2, 3,
-  2, 3,
-  4, 3,
-  4, IF_EXT_PARTITION(5, 4, 5) 2,
-  0, 3,
-  1, 4,
-  2, IF_EXT_PARTITION(5, 3)
-#else  // CONFIG_CB4X4
-  0, 0,
-  0, 0,
-  1, 0,
-  1, 2,
-  1, 2,
-  3, 2,
-  3, IF_EXT_PARTITION(4, 3, 4) 1,
-  0, 2,
-  0, 3,
-  1, IF_EXT_PARTITION(2, 4)
-#endif
+static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = {
+  0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2
 };
 
-/* clang-format off */
 static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = {
-#if CONFIG_CB4X4
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  1, 1, 1,
-#endif
-  1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16,
-  IF_EXT_PARTITION(16, 32, 32)  1, 4, 2, 8, 4, 16, IF_EXT_PARTITION(8, 32)
-#else  // CONFIG_CB4X4
-  1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, IF_EXT_PARTITION(8, 16, 16) 1, 2, 1, 4,
-  2, 8, IF_EXT_PARTITION(4, 16)
-#endif
+  1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16
 };
+
 static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = {
-#if CONFIG_CB4X4
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  1, 1, 1,
-#endif
-  1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16,
-  IF_EXT_PARTITION(32, 16, 32)  4, 1, 8, 2, 16, 4, IF_EXT_PARTITION(32, 8)
-#else  // CONFIG_CB4X4
-  1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, IF_EXT_PARTITION(16, 8, 16) 2, 1, 4, 1,
-  8, 2, IF_EXT_PARTITION(16, 4)
-#endif
+  1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4
 };
-/* clang-format on */
 
 // Width/height lookup tables in units of various block sizes
 static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  2,  2,
-  4,
-#endif
-  4,  4,
-  8,  8,
-  8,  16,
-  16, 16,
-  32, 32,
-  32, 64,
-  64, IF_EXT_PARTITION(64, 128, 128) 4,
-  16, 8,
-  32, 16,
-  64, IF_EXT_PARTITION(32, 128)
+  4,  4,  8,  8,   8,   16, 16, 16, 32, 32, 32,
+  64, 64, 64, 128, 128, 4,  16, 8,  32, 16, 64
 };
 
 static const uint8_t block_size_high[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  2,  4,
-  2,
-#endif
-  4,  8,
-  4,  8,
-  16, 8,
-  16, 32,
-  16, 32,
-  64, 32,
-  64, IF_EXT_PARTITION(128, 64, 128) 16,
-  4,  32,
-  8,  64,
-  16, IF_EXT_PARTITION(128, 32)
-};
-
-static const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  1,  1,
-  1,
-#endif
-  1,  1,
-  2,  2,
-  2,  4,
-  4,  4,
-  8,  8,
-  8,  16,
-  16, IF_EXT_PARTITION(16, 32, 32) 1,
-  4,  2,
-  8,  4,
-  16, IF_EXT_PARTITION(8, 32)
-};
-static const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  1,  1,
-  1,
-#endif
-  1,  2,
-  1,  2,
-  4,  2,
-  4,  8,
-  4,  8,
-  16, 8,
-  16, IF_EXT_PARTITION(32, 16, 32) 4,
-  1,  8,
-  2,  16,
-  4,  IF_EXT_PARTITION(32, 8)
-};
-static const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  1, 1,
-  1,
-#endif
-  1, 1,
-  1, 1,
-  1, 2,
-  2, 2,
-  4, 4,
-  4, 8,
-  8, IF_EXT_PARTITION(8, 16, 16) 1,
-  2, 1,
-  4, 2,
-  8, IF_EXT_PARTITION(4, 16)
-};
-static const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  1, 1,
-  1,
-#endif
-  1, 1,
-  1, 1,
-  2, 1,
-  2, 4,
-  2, 4,
-  8, 4,
-  8, IF_EXT_PARTITION(16, 8, 16) 2,
-  1, 4,
-  1, 8,
-  2, IF_EXT_PARTITION(16, 4)
-};
-static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  1, 1,
-  1,
-#endif
-  1, 1,
-  1, 1,
-  1, 1,
-  1, 1,
-  2, 2,
-  2, 4,
-  4, IF_EXT_PARTITION(4, 8, 8) 1,
-  1, 1,
-  2, 2,
-  4, IF_EXT_PARTITION(2, 8)
-};
-static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  1, 1,
-  1,
-#endif
-  1, 1,
-  1, 1,
-  1, 1,
-  1, 2,
-  1, 2,
-  4, 2,
-  4, IF_EXT_PARTITION(8, 4, 8) 1,
-  1, 2,
-  1, 4,
-  2, IF_EXT_PARTITION(8, 2)
+  4,  8,  4,   8,  16,  8,  16, 32, 16, 32, 64,
+  32, 64, 128, 64, 128, 16, 4,  32, 8,  64, 16
 };
 
 // AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize)))
 static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  0, 0,
-  0,
-#endif
-  0, 0,
-  0, 1,
-  1, 1,
-  2, 2,
-  2, 3,
-  3, 3,
-  3, IF_EXT_PARTITION(3, 3, 3) 0,
-  0, 1,
-  1, 2,
-  2, IF_EXT_PARTITION(3, 3)
+  0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
 };
 
 static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  2,  3,
-  3,
-#endif
-  4,  5,
-  5,  6,
-  7,  7,
-  8,  9,
-  9,  10,
-  11, 11,
-  12, IF_EXT_PARTITION(13, 13, 14) 6,
-  6,  8,
-  8,  10,
-  10, IF_EXT_PARTITION(12, 12)
+  4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10
 };
 
 /* clang-format off */
-#if CONFIG_EXT_PARTITION_TYPES
-static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][BLOCK_SIZES_ALL] =
-#else
-static const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES_ALL] =
-#endif  // CONFIG_EXT_PARTITION_TYPES
-{
+static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = {
   {     // PARTITION_NONE
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    // 2X2,        2X4,           4X2,
-    BLOCK_2X2,     BLOCK_2X4,     BLOCK_4X2,
-#endif
-    //                            4X4
-                                  BLOCK_4X4,
-    // 4X8,        8X4,           8X8
-    BLOCK_4X8,     BLOCK_8X4,     BLOCK_8X8,
-    // 8X16,       16X8,          16X16
-    BLOCK_8X16,    BLOCK_16X8,    BLOCK_16X16,
-    // 16X32,      32X16,         32X32
-    BLOCK_16X32,   BLOCK_32X16,   BLOCK_32X32,
-    // 32X64,      64X32,         64X64
-    BLOCK_32X64,   BLOCK_64X32,   BLOCK_64X64,
-#if CONFIG_EXT_PARTITION
-    // 64x128,     128x64,        128x128
-    BLOCK_64X128,  BLOCK_128X64,  BLOCK_128X128,
-#endif  // CONFIG_EXT_PARTITION
-    // 4X16,       16X4,          8X32
-    BLOCK_4X16,    BLOCK_16X4,    BLOCK_8X32,
-    // 32X8,       16X64,         64X16
-    BLOCK_32X8,    BLOCK_16X64,   BLOCK_64X16,
-#if CONFIG_EXT_PARTITION
-    // 32x128,     128x32
-    BLOCK_32X128,  BLOCK_128X32
-#endif  // CONFIG_EXT_PARTITION
+    BLOCK_4X4, BLOCK_8X8, BLOCK_16X16,
+    BLOCK_32X32, BLOCK_64X64, BLOCK_128X128
   }, {  // PARTITION_HORZ
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    // 2X2,        2X4,           4X2,
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    //                            4X4
-                                  BLOCK_4X2,
-#else
-    //                            4X4
-                                  BLOCK_INVALID,
-#endif
-    // 4X8,        8X4,           8X8
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
-    // 8X16,       16X8,          16X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
-    // 16X32,      32X16,         32X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
-    // 32X64,      64X32,         64X64
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
-#if CONFIG_EXT_PARTITION
-    // 64x128,     128x64,        128x128
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
-#endif  // CONFIG_EXT_PARTITION
-    // 4X16,       16X4,          8X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    // 32X8,       16X64,         64X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#if CONFIG_EXT_PARTITION
-    // 32x128,     128x32
-    BLOCK_INVALID, BLOCK_INVALID
-#endif  // CONFIG_EXT_PARTITION
+    BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+    BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
   }, {  // PARTITION_VERT
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    // 2X2,        2X4,           4X2,
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    //                            4X4
-                                  BLOCK_2X4,
-#else
-    //                            4X4
-                                  BLOCK_INVALID,
-#endif
-    // 4X8,        8X4,           8X8
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
-    // 8X16,       16X8,          16X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
-    // 16X32,      32X16,         32X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
-    // 32X64,      64X32,         64X64
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
-#if CONFIG_EXT_PARTITION
-    // 64x128,     128x64,        128x128
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
-#endif  // CONFIG_EXT_PARTITION
-    // 4X16,       16X4,          8X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    // 32X8,       16X64,         64X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#if CONFIG_EXT_PARTITION
-    // 32x128,     128x32
-    BLOCK_INVALID, BLOCK_INVALID
-#endif  // CONFIG_EXT_PARTITION
+    BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+    BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
   }, {  // PARTITION_SPLIT
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    // 2X2,        2X4,           4X2,
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#endif
-    //                            4X4
-                                  BLOCK_INVALID,
-    // 4X8,        8X4,           8X8
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
-    // 8X16,       16X8,          16X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8,
-    // 16X32,      32X16,         32X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
-    // 32X64,      64X32,         64X64
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32,
-#if CONFIG_EXT_PARTITION
-    // 64x128,     128x64,        128x128
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
-#endif  // CONFIG_EXT_PARTITION
-    // 4X16,       16X4,          8X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    // 32X8,       16X64,         64X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#if CONFIG_EXT_PARTITION
-    // 32x128,     128x32
-    BLOCK_INVALID, BLOCK_INVALID
-#endif  // CONFIG_EXT_PARTITION
-#if CONFIG_EXT_PARTITION_TYPES
+    BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8,
+    BLOCK_16X16, BLOCK_32X32, BLOCK_64X64
   }, {  // PARTITION_HORZ_A
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    // 2X2,        2X4,           4X2,
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#endif
-    //                            4X4
-                                  BLOCK_INVALID,
-    // 4X8,        8X4,           8X8
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
-    // 8X16,       16X8,          16X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
-    // 16X32,      32X16,         32X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
-    // 32X64,      64X32,         64X64
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
-#if CONFIG_EXT_PARTITION
-    // 64x128,     128x64,        128x128
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
-#endif  // CONFIG_EXT_PARTITION
-    // 4X16,       16X4,          8X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    // 32X8,       16X64,         64X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#if CONFIG_EXT_PARTITION
-    // 32x128,     128x32
-    BLOCK_INVALID, BLOCK_INVALID
-#endif  // CONFIG_EXT_PARTITION
+    BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+    BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
   }, {  // PARTITION_HORZ_B
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    // 2X2,        2X4,           4X2,
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#endif
-    //                            4X4
-                                  BLOCK_INVALID,
-    // 4X8,        8X4,           8X8
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
-    // 8X16,       16X8,          16X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
-    // 16X32,      32X16,         32X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
-    // 32X64,      64X32,         64X64
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
-#if CONFIG_EXT_PARTITION
-    // 64x128,     128x64,        128x128
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
-#endif  // CONFIG_EXT_PARTITION
-    // 4X16,       16X4,          8X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    // 32X8,       16X64,         64X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#if CONFIG_EXT_PARTITION
-    // 32x128,     128x32
-    BLOCK_INVALID, BLOCK_INVALID
-#endif  // CONFIG_EXT_PARTITION
+    BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+    BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
   }, {  // PARTITION_VERT_A
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    // 2X2,        2X4,           4X2,
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#endif
-    //                            4X4
-                                  BLOCK_INVALID,
-    // 4X8,        8X4,           8X8
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
-    // 8X16,       16X8,          16X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
-    // 16X32,      32X16,         32X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
-    // 32X64,      64X32,         64X64
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
-#if CONFIG_EXT_PARTITION
-    // 64x128,     128x64,        128x128
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
-#endif  // CONFIG_EXT_PARTITION
-    // 4X16,       16X4,          8X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    // 32X8,       16X64,         64X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#if CONFIG_EXT_PARTITION
-    // 32x128,     128x32
-    BLOCK_INVALID, BLOCK_INVALID
-#endif  // CONFIG_EXT_PARTITION
+    BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+    BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
   }, {  // PARTITION_VERT_B
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    // 2X2,        2X4,           4X2,
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#endif
-    //                            4X4
-                                  BLOCK_INVALID,
-    // 4X8,        8X4,           8X8
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
-    // 8X16,       16X8,          16X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
-    // 16X32,      32X16,         32X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
-    // 32X64,      64X32,         64X64
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
-#if CONFIG_EXT_PARTITION
-    // 64x128,     128x64,        128x128
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
-#endif  // CONFIG_EXT_PARTITION
-    // 4X16,       16X4,          8X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    // 32X8,       16X64,         64X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#if CONFIG_EXT_PARTITION
-    // 32x128,     128x32
-    BLOCK_INVALID, BLOCK_INVALID
-#endif  // CONFIG_EXT_PARTITION
+    BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+    BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
   }, {  // PARTITION_HORZ_4
-#if CONFIG_CB4X4
-    // 2X2,        2X4,           4X2,
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    //                            4X4
-                                  BLOCK_INVALID,
-#else
-    //                            4X4
-                                  BLOCK_INVALID,
-#endif
-    // 4X8,        8X4,           8X8
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    // 8X16,       16X8,          16X16
     BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4,
-    // 16X32,      32X16,         32X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X8,
-    // 32X64,      64X32,         64X64
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X16,
-#if CONFIG_EXT_PARTITION
-    // 64x128,     128x64,        128x128
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X32,
-#endif  // CONFIG_EXT_PARTITION
-    // 4X16,       16X4,          8X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    // 32X8,       16X64,         64X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#if CONFIG_EXT_PARTITION
-    // 32x128,     128x32
-    BLOCK_INVALID, BLOCK_INVALID
-#endif  // CONFIG_EXT_PARTITION
+    BLOCK_32X8, BLOCK_64X16, BLOCK_INVALID
   }, {  // PARTITION_VERT_4
-#if CONFIG_CB4X4
-    // 2X2,        2X4,           4X2,
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    //                            4X4
-                                  BLOCK_INVALID,
-#else
-    //                            4X4
-                                  BLOCK_INVALID,
-#endif
-    // 4X8,        8X4,           8X8
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    // 8X16,       16X8,          16X16
     BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
-    // 16X32,      32X16,         32X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X32,
-    // 32X64,      64X32,         64X64
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X64,
-#if CONFIG_EXT_PARTITION
-    // 64x128,     128x64,        128x128
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X128,
-#endif  // CONFIG_EXT_PARTITION
-    // 4X16,       16X4,          8X32
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    // 32X8,       16X64,         64X16
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-#if CONFIG_EXT_PARTITION
-    // 32x128,     128x32
-    BLOCK_INVALID, BLOCK_INVALID
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
+    BLOCK_8X32, BLOCK_16X64, BLOCK_INVALID
   }
 };
 
 static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = {
-  // 2X2,    2X4,      4X2,
-#if CONFIG_CHROMA_2X2
-  TX_2X2,    TX_2X2,   TX_2X2,
-#elif CONFIG_CHROMA_SUB8X8
-  TX_4X4,    TX_4X4,   TX_4X4,
-#endif
   //                   4X4
                        TX_4X4,
   // 4X8,    8X4,      8X8
@@ -624,1436 +102,291 @@ static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = {
   TX_16X16,  TX_16X16, TX_32X32,
   // 32X64,  64X32,
   TX_32X32,  TX_32X32,
-#if CONFIG_TX64X64
   // 64X64
   TX_64X64,
-#if CONFIG_EXT_PARTITION
   // 64x128, 128x64,   128x128
   TX_64X64,  TX_64X64, TX_64X64,
-#endif  // CONFIG_EXT_PARTITION
-#else
-  // 64X64
-  TX_32X32,
-#if CONFIG_EXT_PARTITION
-  // 64x128, 128x64,   128x128
-  TX_32X32,  TX_32X32, TX_32X32,
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_TX64X64
   // 4x16,   16x4,     8x32
   TX_4X4,    TX_4X4,   TX_8X8,
   // 32x8,   16x64     64x16
-  TX_8X8,    TX_16X16, TX_16X16,
-#if CONFIG_EXT_PARTITION
-  // 32x128  128x32
-  TX_32X32,  TX_32X32
-#endif  // CONFIG_EXT_PARTITION
+  TX_8X8,    TX_16X16, TX_16X16
 };
 
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
 static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES_ALL] = {
-  // 2X2,    2X4,      4X2,
-#if CONFIG_CHROMA_2X2
-  TX_2X2,    TX_2X2,   TX_2X2,
-#elif CONFIG_CHROMA_SUB8X8
-  TX_4X4,    TX_4X4,   TX_4X4,
-#endif  // CONFIG_CHROMA_SUB8X8
-  //                   4X4
-                       TX_4X4,
-  // 4X8,    8X4,      8X8
-  TX_4X8,    TX_8X4,   TX_8X8,
-  // 8X16,   16X8,     16X16
-  TX_8X16,   TX_16X8,  TX_16X16,
-  // 16X32,  32X16,    32X32
-  TX_16X32,  TX_32X16, TX_32X32,
-#if CONFIG_TX64X64
-  // 32X64,  64X32,
-  TX_32X64,  TX_64X32,
-  // 64X64
-  TX_64X64,
-#if CONFIG_EXT_PARTITION
-  // 64x128, 128x64,   128x128
-  TX_64X64,  TX_64X64, TX_64X64,
-#endif  // CONFIG_EXT_PARTITION
-#else
-  // 32X64,  64X32,
-  TX_32X32,  TX_32X32,
-  // 64X64
-  TX_32X32,
-#if CONFIG_EXT_PARTITION
-  // 64x128, 128x64,   128x128
-  TX_32X32,  TX_32X32, TX_32X32,
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_TX64X64
-#if CONFIG_RECT_TX_EXT
-  // 4x16,   16x4,     8x32
-  TX_4X16,   TX_16X4,  TX_8X32,
-  // 32x8
-  TX_32X8,
-#else
-  // 4x16,   16x4,     8x32
-  TX_4X8,    TX_8X4,   TX_8X16,
-  // 32x8
-  TX_16X8,
-#endif
-  // 16x64,  64x16
-  TX_16X32,  TX_32X16,
-#if CONFIG_EXT_PARTITION
-  // 32x128  128x32
-  TX_32X32,  TX_32X32
-#endif  // CONFIG_EXT_PARTITION
-};
-
-#if CONFIG_RECT_TX_EXT
-static const TX_SIZE quarter_txsize_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  //     2X2,        2X4,        4X2,
-  TX_INVALID, TX_INVALID, TX_INVALID,
-#endif
-  //                             4x4,
-                          TX_INVALID,
-  //     4x8,        8x4,        8x8,
-  TX_INVALID, TX_INVALID, TX_INVALID,
-  // 8x16, 16x8, 16x16,
-  TX_4X16, TX_16X4, TX_INVALID,
-  // 16x32, 32x16, 32x32,
-  TX_8X32, TX_32X8, TX_INVALID,
-  // 32x64, 64x32, 64x64
-  TX_INVALID, TX_INVALID, TX_INVALID,
-#if CONFIG_EXT_PARTITION
-  // 64x128, 128x64, 128x128
-  TX_INVALID, TX_INVALID, TX_INVALID,
-#endif
-  // 4x16,    16x4,       8x32
-  TX_4X16,    TX_16X4,    TX_8X32,
-  // 32x8     16x64       64x16
-  TX_32X8,    TX_INVALID, TX_INVALID,
-#if CONFIG_EXT_PARTITION
-  // 32x128   128x32
-  TX_INVALID, TX_INVALID
-#endif  // CONFIG_EXT_PARTITION
+      // 4X4
+      TX_4X4,
+      // 4X8,    8X4,      8X8
+      TX_4X8,    TX_8X4,   TX_8X8,
+      // 8X16,   16X8,     16X16
+      TX_8X16,   TX_16X8,  TX_16X16,
+      // 16X32,  32X16,    32X32
+      TX_16X32,  TX_32X16, TX_32X32,
+      // 32X64,  64X32,
+      TX_32X64,  TX_64X32,
+      // 64X64
+      TX_64X64,
+      // 64x128, 128x64,   128x128
+      TX_64X64,  TX_64X64, TX_64X64,
+      // 4x16,   16x4,
+      TX_4X16,   TX_16X4,
+      // 8x32,   32x8
+      TX_8X32,   TX_32X8,
+      // 16x64,  64x16
+      TX_16X64,  TX_64X16
 };
-#endif
-#else
-#define max_txsize_rect_lookup max_txsize_lookup
-#endif  // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
 
 static const TX_TYPE_1D vtx_tab[TX_TYPES] = {
   DCT_1D,      ADST_1D, DCT_1D,      ADST_1D,
-#if CONFIG_EXT_TX
   FLIPADST_1D, DCT_1D,  FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D,
   DCT_1D,      IDTX_1D, ADST_1D,     IDTX_1D, FLIPADST_1D, IDTX_1D,
-#endif  // CONFIG_EXT_TX
 };
 
 static const TX_TYPE_1D htx_tab[TX_TYPES] = {
   DCT_1D,  DCT_1D,      ADST_1D,     ADST_1D,
-#if CONFIG_EXT_TX
   DCT_1D,  FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D,
   IDTX_1D, DCT_1D,      IDTX_1D,     ADST_1D,     IDTX_1D, FLIPADST_1D,
-#endif  // CONFIG_EXT_TX
-};
-
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-// Same as "max_txsize_lookup[bsize] - TX_8X8", except for rectangular
-// block which may use a rectangular transform, in which  case it is
-// "(max_txsize_lookup[bsize] + 1) - TX_8X8", invalid for bsize < 8X8
-static const int32_t intra_tx_size_cat_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CB4X4
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  // 2X2,             2X4,                4X2,
-  INT32_MIN,          INT32_MIN,          INT32_MIN,
-#endif
-  //                                      4X4,
-                                          INT32_MIN,
-  // 4X8,             8X4,                8X8,
-  TX_8X8 - TX_8X8,    TX_8X8 - TX_8X8,    TX_8X8 - TX_8X8,
-#else  // CONFIG_CB4X4
-  //                                      4X4
-                                          INT32_MIN,
-  // 4X8,             8X4,                8X8
-  INT32_MIN,          INT32_MIN,          TX_8X8 - TX_8X8,
-#endif  // CONFIG_CB4X4
-  // 8X16,            16X8,               16X16
-  TX_16X16 - TX_8X8,  TX_16X16 - TX_8X8,  TX_16X16 - TX_8X8,
-  // 16X32,           32X16,              32X32
-  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,
-#if CONFIG_TX64X64
-  // 32X64,           64X32,
-  TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,
-  // 64X64
-  TX_64X64 - TX_8X8,
-#if CONFIG_EXT_PARTITION
-  // 64x128,          128x64,             128x128
-  TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,
-#endif  // CONFIG_EXT_PARTITION
-#else
-  // 32X64,           64X32,
-  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,
-  // 64X64
-  TX_32X32 - TX_8X8,
-#if CONFIG_EXT_PARTITION
-  // 64x128,          128x64,             128x128
-  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_TX64X64
-  // TODO(david.barker): Change these if we support rectangular transforms
-  // for 4:1 shaped partitions
-  // 4x16,            16x4,               8x32
-  TX_8X8 - TX_8X8,    TX_8X8 - TX_8X8,    TX_8X8 - TX_8X8,
-  // 32x8,            16x64,              64x16
-  TX_8X8 - TX_8X8,    TX_16X16 - TX_8X8,  TX_16X16 - TX_8X8,
-#if CONFIG_EXT_PARTITION
-  // 32x128,          128x32
-  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8
-#endif  // CONFIG_EXT_PARTITION
-};
-#else
-// Same as "max_txsize_lookup[bsize] - TX_8X8", invalid for bsize < 8X8
-static const int32_t intra_tx_size_cat_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  // 2X2,             2X4,                4X2,
-  INT32_MIN,          INT32_MIN,          INT32_MIN,
-#endif
-  //                                      4X4
-                                          INT32_MIN,
-  // 4X8,             8X4,                8X8
-  INT32_MIN,          INT32_MIN,          TX_8X8 - TX_8X8,
-  // 8X16,            16X8,               16X16
-  TX_8X8 - TX_8X8,    TX_8X8 - TX_8X8,    TX_16X16 - TX_8X8,
-  // 16X32,           32X16,              32X32
-  TX_16X16 - TX_8X8,  TX_16X16 - TX_8X8,  TX_32X32 - TX_8X8,
-#if CONFIG_TX64X64
-  // 32X64,           64X32,
-  TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,
-  // 64X64
-  TX_64X64 - TX_8X8,
-#if CONFIG_EXT_PARTITION
-  // 64x128,          128x64,             128x128
-  TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,
-#endif  // CONFIG_EXT_PARTITION
-#else
-  // 32X64,           64X32,
-  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,
-  // 64X64
-  TX_32X32 - TX_8X8,
-#if CONFIG_EXT_PARTITION
-  // 64x128,          128x64,             128x128
-  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_TX64X64
-  // 4x16,            16x4,               8x32
-  TX_8X8 - TX_8X8,    TX_8X8 - TX_8X8,    TX_8X8 - TX_8X8,
-  // 32x8             16x64,              64x16
-  TX_8X8 - TX_8X8,    TX_16X16 - TX_8X8,  TX_16X16 - TX_8X8,
-#if CONFIG_EXT_PARTITION
-  // 32x128,          128x32
-  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8
-#endif  // CONFIG_EXT_PARTITION
 };
-#endif  // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
 
-#define inter_tx_size_cat_lookup intra_tx_size_cat_lookup
+#define TXSIZE_CAT_INVALID (-1)
 
 /* clang-format on */
 
 static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  TX_2X2,  // TX_2X2
-#endif
   TX_4X4,    // TX_4X4
   TX_4X4,    // TX_8X8
   TX_8X8,    // TX_16X16
   TX_16X16,  // TX_32X32
-#if CONFIG_TX64X64
   TX_32X32,  // TX_64X64
-#endif       // CONFIG_TX64X64
   TX_4X4,    // TX_4X8
   TX_4X4,    // TX_8X4
   TX_8X8,    // TX_8X16
   TX_8X8,    // TX_16X8
   TX_16X16,  // TX_16X32
   TX_16X16,  // TX_32X16
-#if CONFIG_TX64X64
   TX_32X32,  // TX_32X64
   TX_32X32,  // TX_64X32
-#endif       // CONFIG_TX64X64
-  TX_4X4,    // TX_4X16
-  TX_4X4,    // TX_16X4
-  TX_8X8,    // TX_8X32
-  TX_8X8,    // TX_32X8
+  TX_4X8,    // TX_4X16
+  TX_8X4,    // TX_16X4
+  TX_8X16,   // TX_8X32
+  TX_16X8,   // TX_32X8
+  TX_16X32,  // TX_16X64
+  TX_32X16,  // TX_64X16
 };
 
 static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  TX_2X2,  // TX_2X2
-#endif
   TX_4X4,    // TX_4X4
   TX_8X8,    // TX_8X8
   TX_16X16,  // TX_16X16
   TX_32X32,  // TX_32X32
-#if CONFIG_TX64X64
   TX_64X64,  // TX_64X64
-#endif       // CONFIG_TX64X64
   TX_4X4,    // TX_4X8
   TX_8X8,    // TX_8X4
   TX_8X8,    // TX_8X16
   TX_16X16,  // TX_16X8
   TX_16X16,  // TX_16X32
   TX_32X32,  // TX_32X16
-#if CONFIG_TX64X64
   TX_32X32,  // TX_32X64
   TX_64X64,  // TX_64X32
-#endif       // CONFIG_TX64X64
   TX_4X4,    // TX_4X16
   TX_16X16,  // TX_16X4
   TX_8X8,    // TX_8X32
   TX_32X32,  // TX_32X8
+  TX_16X16,  // TX_16X64
+  TX_64X64,  // TX_64X16
 };
 
 static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  TX_2X2,  // TX_2X2
-#endif
   TX_4X4,    // TX_4X4
   TX_8X8,    // TX_8X8
   TX_16X16,  // TX_16X16
   TX_32X32,  // TX_32X32
-#if CONFIG_TX64X64
   TX_64X64,  // TX_64X64
-#endif       // CONFIG_TX64X64
   TX_8X8,    // TX_4X8
   TX_4X4,    // TX_8X4
   TX_16X16,  // TX_8X16
   TX_8X8,    // TX_16X8
   TX_32X32,  // TX_16X32
   TX_16X16,  // TX_32X16
-#if CONFIG_TX64X64
   TX_64X64,  // TX_32X64
   TX_32X32,  // TX_64X32
-#endif       // CONFIG_TX64X64
   TX_16X16,  // TX_4X16
   TX_4X4,    // TX_16X4
   TX_32X32,  // TX_8X32
   TX_8X8,    // TX_32X8
+  TX_64X64,  // TX_16X64
+  TX_16X16,  // TX_64X16
 };
 
-#if CONFIG_CHROMA_2X2
-#define TX_SIZE_W_MIN 2
-#else
 #define TX_SIZE_W_MIN 4
-#endif
 
 // Transform block width in pixels
 static const int tx_size_wide[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  2,
-#endif
-  4,  8,  16, 32,
-#if CONFIG_TX64X64
-  64,
-#endif  // CONFIG_TX64X64
-  4,  8,  8,  16, 16, 32,
-#if CONFIG_TX64X64
-  32, 64,
-#endif  // CONFIG_TX64X64
-  4,  16, 8,  32
+  4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64,
 };
 
-#if CONFIG_CHROMA_2X2
-#define TX_SIZE_H_MIN 2
-#else
 #define TX_SIZE_H_MIN 4
-#endif
 
 // Transform block height in pixels
 static const int tx_size_high[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  2,
-#endif
-  4,  8,  16, 32,
-#if CONFIG_TX64X64
-  64,
-#endif  // CONFIG_TX64X64
-  8,  4,  16, 8,  32, 16,
-#if CONFIG_TX64X64
-  64, 32,
-#endif  // CONFIG_TX64X64
-  16, 4,  32, 8
+  4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16,
 };
 
 // Transform block width in unit
 static const int tx_size_wide_unit[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  1,  2,  4, 8, 16,
-#if CONFIG_TX64X64
-  32,
-#endif  // CONFIG_TX64X64
-  2,  4,  4, 8, 8,  16,
-#if CONFIG_TX64X64
-  16, 32,
-#endif  // CONFIG_TX64X64
-  2,  8,  4, 16
-#else  // CONFIG_CHROMA_2X2
-  1,  2,  4, 8,
-#if CONFIG_TX64X64
-  16,
-#endif  // CONFIG_TX64X64
-  1,  2,  2, 4, 4, 8,
-#if CONFIG_TX64X64
-  8,  16,
-#endif  // CONFIG_TX64X64
-  1,  4,  2, 8
-#endif  // CONFIG_CHROMA_2X2
+  1, 2, 4, 8, 16, 1, 2, 2, 4, 4, 8, 8, 16, 1, 4, 2, 8, 4, 16,
 };
 
 // Transform block height in unit
 static const int tx_size_high_unit[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  1,  2,  4,  8, 16,
-#if CONFIG_TX64X64
-  32,
-#endif  // CONFIG_TX64X64
-  4,  2,  8,  4, 16, 8,
-#if CONFIG_TX64X64
-  32, 16,
-#endif  // CONFIG_TX64X64
-  8,  2,  16, 4
-#else  // CONFIG_CHROMA_2X2
-  1,  2, 4, 8,
-#if CONFIG_TX64X64
-  16,
-#endif  // CONFIG_TX64X64
-  2,  1, 4, 2, 8, 4,
-#if CONFIG_TX64X64
-  16, 8,
-#endif  // CONFIG_TX64X64
-  4,  1, 8, 2
-#endif  // CONFIG_CHROMA_2X2
+  1, 2, 4, 8, 16, 2, 1, 4, 2, 8, 4, 16, 8, 4, 1, 8, 2, 16, 4,
 };
 
 // Transform block width in log2
 static const int tx_size_wide_log2[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  1,
-#endif
-  2, 3, 4, 5,
-#if CONFIG_TX64X64
-  6,
-#endif  // CONFIG_TX64X64
-  2, 3, 3, 4, 4, 5,
-#if CONFIG_TX64X64
-  5, 6,
-#endif  // CONFIG_TX64X64
-  2, 4, 3, 5
+  2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6,
 };
 
 // Transform block height in log2
 static const int tx_size_high_log2[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  1,
-#endif
-  2, 3, 4, 5,
-#if CONFIG_TX64X64
-  6,
-#endif  // CONFIG_TX64X64
-  3, 2, 4, 3, 5, 4,
-#if CONFIG_TX64X64
-  6, 5,
-#endif  // CONFIG_TX64X64
-  4, 2, 5, 3
+  2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4,
 };
 
-#define TX_UNIT_WIDE_LOG2 (MI_SIZE_LOG2 - tx_size_wide_log2[0])
-#define TX_UNIT_HIGH_LOG2 (MI_SIZE_LOG2 - tx_size_high_log2[0])
-
-static const int tx_size_2d[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  4,
-#endif
-  16,   64,   256, 1024,
-#if CONFIG_TX64X64
-  4096,
-#endif  // CONFIG_TX64X64
-  32,   32,   128, 128,  512, 512,
-#if CONFIG_TX64X64
-  2048, 2048,
-#endif  // CONFIG_TX64X64
-  64,   64,   256, 256
+static const int tx_size_2d[TX_SIZES_ALL + 1] = {
+  16,  64,   256,  1024, 4096, 32,  32,  128,  128,  512,
+  512, 2048, 2048, 64,   64,   256, 256, 1024, 1024,
 };
 
 static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  BLOCK_2X2,  // TX_2X2
-#endif
   BLOCK_4X4,    // TX_4X4
   BLOCK_8X8,    // TX_8X8
   BLOCK_16X16,  // TX_16X16
   BLOCK_32X32,  // TX_32X32
-#if CONFIG_TX64X64
   BLOCK_64X64,  // TX_64X64
-#endif          // CONFIG_TX64X64
   BLOCK_4X8,    // TX_4X8
   BLOCK_8X4,    // TX_8X4
   BLOCK_8X16,   // TX_8X16
   BLOCK_16X8,   // TX_16X8
   BLOCK_16X32,  // TX_16X32
   BLOCK_32X16,  // TX_32X16
-#if CONFIG_TX64X64
   BLOCK_32X64,  // TX_32X64
   BLOCK_64X32,  // TX_64X32
-#endif          // CONFIG_TX64X64
   BLOCK_4X16,   // TX_4X16
   BLOCK_16X4,   // TX_16X4
   BLOCK_8X32,   // TX_8X32
   BLOCK_32X8,   // TX_32X8
+  BLOCK_16X64,  // TX_16X64
+  BLOCK_64X16,  // TX_64X16
 };
 
 static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  TX_2X2,  // TX_2X2
-#endif
   TX_4X4,    // TX_4X4
   TX_8X8,    // TX_8X8
   TX_16X16,  // TX_16X16
   TX_32X32,  // TX_32X32
-#if CONFIG_TX64X64
   TX_64X64,  // TX_64X64
-#endif       // CONFIG_TX64X64
   TX_4X4,    // TX_4X8
   TX_4X4,    // TX_8X4
   TX_8X8,    // TX_8X16
   TX_8X8,    // TX_16X8
   TX_16X16,  // TX_16X32
   TX_16X16,  // TX_32X16
-#if CONFIG_TX64X64
   TX_32X32,  // TX_32X64
   TX_32X32,  // TX_64X32
-#endif       // CONFIG_TX64X64
   TX_4X4,    // TX_4X16
   TX_4X4,    // TX_16X4
   TX_8X8,    // TX_8X32
   TX_8X8,    // TX_32X8
+  TX_16X16,  // TX_16X64
+  TX_16X16,  // TX_64X16
 };
 
 static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  TX_2X2,  // TX_2X2
-#endif
   TX_4X4,    // TX_4X4
   TX_8X8,    // TX_8X8
   TX_16X16,  // TX_16X16
   TX_32X32,  // TX_32X32
-#if CONFIG_TX64X64
   TX_64X64,  // TX_64X64
-#endif       // CONFIG_TX64X64
   TX_8X8,    // TX_4X8
   TX_8X8,    // TX_8X4
   TX_16X16,  // TX_8X16
   TX_16X16,  // TX_16X8
   TX_32X32,  // TX_16X32
   TX_32X32,  // TX_32X16
-#if CONFIG_TX64X64
   TX_64X64,  // TX_32X64
   TX_64X64,  // TX_64X32
-#endif       // CONFIG_TX64X64
   TX_16X16,  // TX_4X16
   TX_16X16,  // TX_16X4
   TX_32X32,  // TX_8X32
   TX_32X32,  // TX_32X8
+  TX_64X64,  // TX_16X64
+  TX_64X64,  // TX_64X16
+};
+
+static const int8_t txsize_log2_minus4[TX_SIZES_ALL] = {
+  0,  // TX_4X4
+  2,  // TX_8X8
+  4,  // TX_16X16
+  6,  // TX_32X32
+  6,  // TX_64X64
+  1,  // TX_4X8
+  1,  // TX_8X4
+  3,  // TX_8X16
+  3,  // TX_16X8
+  5,  // TX_16X32
+  5,  // TX_32X16
+  6,  // TX_32X64
+  6,  // TX_64X32
+  2,  // TX_4X16
+  2,  // TX_16X4
+  4,  // TX_8X32
+  4,  // TX_32X8
+  5,  // TX_16X64
+  5,  // TX_64X16
 };
 
 /* clang-format off */
 static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
   TX_4X4,    // ONLY_4X4
-  TX_8X8,    // ALLOW_8X8
-  TX_16X16,  // ALLOW_16X16
-  TX_32X32,  // ALLOW_32X32
-#if CONFIG_TX64X64
-  TX_64X64,  // ALLOW_64X64
+  TX_64X64,  // TX_MODE_LARGEST
   TX_64X64,  // TX_MODE_SELECT
-#else
-  TX_32X32,  // TX_MODE_SELECT
-#endif  // CONFIG_TX64X64
 };
 /* clang-format on */
 
 static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = {
-//  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
-//  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  { { BLOCK_2X2, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } },
-  { { BLOCK_2X4, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } },
-  { { BLOCK_4X2, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } },
-  { { BLOCK_4X4, BLOCK_4X2 }, { BLOCK_2X4, BLOCK_2X2 } },
-  { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_2X4 } },
-  { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X2 } },
-#elif CONFIG_CB4X4
+  //  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
+  //  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
   { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
-  { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } },
-  { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } },
-#else
-  { { BLOCK_4X4, BLOCK_INVALID }, { BLOCK_INVALID, BLOCK_INVALID } },
-  { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_INVALID } },
-  { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_INVALID } },
-#endif
+  { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
+  { { BLOCK_8X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
   { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
-  { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
-  { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } },
+  { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_4X16, BLOCK_4X8 } },
+  { { BLOCK_16X8, BLOCK_16X4 }, { BLOCK_8X8, BLOCK_8X4 } },
   { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
-  { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } },
-  { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } },
+  { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_8X32, BLOCK_8X16 } },
+  { { BLOCK_32X16, BLOCK_32X8 }, { BLOCK_16X16, BLOCK_16X8 } },
   { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
-  { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } },
-  { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } },
+  { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_16X64, BLOCK_16X32 } },
+  { { BLOCK_64X32, BLOCK_64X16 }, { BLOCK_32X32, BLOCK_32X16 } },
   { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
-#if CONFIG_EXT_PARTITION
   { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
   { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
   { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
-#endif  // CONFIG_EXT_PARTITION
-  { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
-  { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } },
+  { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_4X16, BLOCK_4X8 } },
+  { { BLOCK_16X4, BLOCK_16X4 }, { BLOCK_8X4, BLOCK_8X4 } },
   { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
   { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
   { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
-  { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } },
-#if CONFIG_EXT_PARTITION
-  { { BLOCK_32X128, BLOCK_32X64 }, { BLOCK_INVALID, BLOCK_16X64 } },
-  { { BLOCK_128X32, BLOCK_INVALID }, { BLOCK_64X32, BLOCK_64X16 } },
-#endif  // CONFIG_EXT_PARTITION
+  { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
 };
 
-static const TX_SIZE uv_txsize_lookup[BLOCK_SIZES_ALL][TX_SIZES_ALL][2][2] = {
-//  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
-//  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
-#if CONFIG_CHROMA_2X2
-  {
-      // BLOCK_2X2
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#if CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#endif  // CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#if CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#endif  // CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-  },
-  {
-      // BLOCK_2X4
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#if CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#endif  // CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#if CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#endif  // CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-  },
-  {
-      // BLOCK_4X2
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#if CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#endif  // CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#if CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#endif  // CONFIG_TX64X64
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-  },
-#elif CONFIG_CHROMA_SUB8X8
-  {
-      // BLOCK_2x2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-  },
-  {
-      // BLOCK_2X4
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-  },
-  {
-      // BLOCK_4X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-  },
-#endif
-  {
-// BLOCK_4X4
-#if CONFIG_CHROMA_2X2
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_4X4, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#else
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-  },
-  {
-// BLOCK_4X8
-#if CONFIG_CHROMA_2X2
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_4X4, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#else
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-#if CONFIG_CHROMA_2X2
-      { { TX_4X8, TX_4X4 }, { TX_2X2, TX_2X2 } },  // used
-#else
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },  // used
-#endif
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-  },
-  {
-// BLOCK_8X4
-#if CONFIG_CHROMA_2X2
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_4X4, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#else
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_CHROMA_2X2
-      { { TX_8X4, TX_2X2 }, { TX_4X4, TX_2X2 } },  // used
-#else
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },  // used
-#endif
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-  },
-  {
-// BLOCK_8X8
-#if CONFIG_CHROMA_2X2
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } },
-      { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
-      { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
-      { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
-      { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } },
-      { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
-      { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
-  },
-  {
-// BLOCK_8X16
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_8X16, TX_8X8 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } },  // used
-      { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } },
-#if CONFIG_TX64X64
-      { { TX_8X16, TX_8X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_4X16, TX_4X8 } },
-      { { TX_8X8, TX_8X8 }, { TX_4X8, TX_4X8 } },
-  },
-  {
-// BLOCK_16X8
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-      { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-      { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_16X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } },
-      { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } },  // used
-      { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } },
-      { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } },
-#if CONFIG_TX64X64
-      { { TX_16X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-      { { TX_16X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } },
-      { { TX_16X4, TX_16X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X8, TX_8X4 }, { TX_8X8, TX_8X4 } },
-      { { TX_16X8, TX_16X4 }, { TX_8X8, TX_8X4 } },
-  },
-  {
-// BLOCK_16X16
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
-#if CONFIG_TX64X64
-      { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_16X16, TX_16X8 }, { TX_8X16, TX_8X8 } },
-#if CONFIG_TX64X64
-      { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } },
-      { { TX_16X4, TX_16X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } },
-  },
-  {
-// BLOCK_16X32
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
-#if CONFIG_TX64X64
-      { { TX_16X32, TX_16X16 }, { TX_8X8, TX_8X8 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X32, TX_16X16 }, { TX_8X16, TX_8X16 } },  // used
-      { { TX_16X32, TX_16X16 }, { TX_8X16, TX_8X16 } },
-#if CONFIG_TX64X64
-      { { TX_16X32, TX_16X16 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X32, TX_16X16 }, { TX_8X8, TX_8X8 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X16 }, { TX_4X16, TX_4X16 } },
-      { { TX_16X4, TX_16X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X32, TX_8X16 }, { TX_8X32, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } },
-  },
-  {
-// BLOCK_32X16
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
-      { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
-#if CONFIG_TX64X64
-      { { TX_32X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
-      { { TX_32X16, TX_16X8 }, { TX_16X16, TX_16X8 } },
-      { { TX_32X16, TX_16X8 }, { TX_16X16, TX_16X8 } },  // used
-#if CONFIG_TX64X64
-      { { TX_32X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
-      { { TX_32X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } },
-      { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } },
-  },
-  {
-// BLOCK_32X32
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
-      { { TX_16X32, TX_16X16 }, { TX_16X32, TX_16X16 } },
-      { { TX_32X16, TX_32X16 }, { TX_16X16, TX_16X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } },
-      { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } },
-  },
-  {
-// BLOCK_32X64
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
-      { { TX_16X32, TX_16X32 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X16, TX_32X16 }, { TX_16X16, TX_16X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X64, TX_32X32 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } },
-      { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } },
-  },
-  {
-// BLOCK_64X32
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
-      { { TX_16X32, TX_16X16 }, { TX_16X32, TX_16X16 } },
-      { { TX_32X16, TX_16X16 }, { TX_32X16, TX_16X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
-      { { TX_64X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } },
-      { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } },
-  },
-  {
-// BLOCK_64X64
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
-#if CONFIG_TX64X64
-      { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
-      { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
-      { { TX_32X16, TX_32X16 }, { TX_32X16, TX_16X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X64, TX_32X32 }, { TX_16X16, TX_16X16 } },
-      { { TX_64X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } },
-      { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } },
-  },
-#if CONFIG_EXT_PARTITION
-  {
-// BLOCK_64X128
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
-#if CONFIG_TX64X64
-      { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
-      { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
-      { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X64, TX_32X32 }, { TX_16X16, TX_16X16 } },
-      { { TX_64X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
-#endif  // CONFIG_TX64X64
-      { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
-      { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
-      { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
-      { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
-  },
-  {
-// BLOCK_128X64
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
-#if CONFIG_TX64X64
-      { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
-      { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
-      { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X64, TX_32X32 }, { TX_16X16, TX_16X16 } },
-      { { TX_64X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } },
-      { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } },
-  },
-  {
-// BLOCK_128X128
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
-#if CONFIG_TX64X64
-      { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
-      { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
-      { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X64, TX_32X32 }, { TX_16X16, TX_16X16 } },
-      { { TX_64X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } },
-      { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } },
-  },
-#endif  // CONFIG_EXT_PARTITION
-  {
-// BLOCK_4X16
-#if CONFIG_CHROMA_2X2
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_4X4, TX_4X4 }, { TX_2X2, TX_2X2 } },
-#else
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X16, TX_4X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X8, TX_4X8 }, { TX_4X4, TX_4X4 } },
-  },
-  {
-// BLOCK_16X4
-#if CONFIG_CHROMA_2X2
-      { { TX_2X2, TX_2X2 }, { TX_2X2, TX_2X2 } },
-      { { TX_4X4, TX_2X2 }, { TX_4X4, TX_2X2 } },
-#else
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_16X4, TX_4X4 }, { TX_8X4, TX_4X4 } },
-      { { TX_8X4, TX_4X4 }, { TX_8X4, TX_4X4 } },
-      { { TX_16X4, TX_4X4 }, { TX_8X4, TX_4X4 } },
-  },
-  {
-// BLOCK_8X32
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X8, TX_8X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X16, TX_8X16 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X16, TX_8X16 }, { TX_4X8, TX_4X8 } },
-#if CONFIG_TX64X64
-      { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X16, TX_4X16 }, { TX_4X16, TX_4X16 } },
-      { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X32, TX_8X16 }, { TX_4X16, TX_4X16 } },
-      { { TX_8X8, TX_8X8 }, { TX_4X8, TX_4X8 } },
-  },
-  {
-// BLOCK_32X8
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif  // CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-      { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-      { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-#if CONFIG_TX64X64
-      { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X8, TX_8X4 }, { TX_8X8, TX_8X4 } },
-      { { TX_16X8, TX_8X4 }, { TX_16X8, TX_8X4 } },
-      { { TX_16X8, TX_8X4 }, { TX_16X8, TX_8X4 } },
-      { { TX_16X8, TX_8X4 }, { TX_16X8, TX_8X4 } },
-#if CONFIG_TX64X64
-      { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-      { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
-#endif  // CONFIG_TX64X64
-      { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } },
-      { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } },
-      { { TX_8X8, TX_8X4 }, { TX_8X8, TX_8X4 } },
-      { { TX_32X8, TX_16X4 }, { TX_16X8, TX_16X4 } },
-  },
-  {
-// BLOCK_16X64
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
-#if CONFIG_TX64X64
-      { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
-#endif
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X32, TX_16X32 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X16, TX_16X16 }, { TX_8X16, TX_8X16 } },
-#if CONFIG_TX64X64
-      { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
-#endif
-      { { TX_4X16, TX_4X16 }, { TX_4X16, TX_4X16 } },
-      { { TX_16X4, TX_16X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X32, TX_8X32 }, { TX_8X32, TX_8X32 } },
-      { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } },
-  },
-  {
-// BLOCK_64X16
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
-      { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
-#if CONFIG_TX64X64
-      { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
-#endif
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
-      { { TX_16X16, TX_16X8 }, { TX_16X16, TX_16X8 } },
-      { { TX_32X16, TX_16X8 }, { TX_32X16, TX_16X8 } },
-#if CONFIG_TX64X64
-      { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
-      { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
-#endif
-      { { TX_4X16, TX_4X8 }, { TX_4X16, TX_4X8 } },
-      { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } },
-      { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
-      { { TX_32X8, TX_32X8 }, { TX_32X8, TX_32X8 } },
-  },
-#if CONFIG_EXT_PARTITION
-  {
-// BLOCK_32X128
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } },
-#endif
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
-      { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
-      { { TX_32X16, TX_32X16 }, { TX_16X16, TX_16X16 } },
-      { { TX_4X16, TX_4X16 }, { TX_4X16, TX_4X16 } },
-      { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } },
-      { { TX_8X32, TX_8X32 }, { TX_8X32, TX_8X32 } },
-      { { TX_32X8, TX_32X8 }, { TX_16X8, TX_16X8 } },
-  },
-  {
-// BLOCK_128X32
-#if CONFIG_CHROMA_2X2
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-      { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
-      { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
-      { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
-#if CONFIG_TX64X64
-      { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
-#endif
-      { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
-      { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
-      { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
-      { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
-      { { TX_16X32, TX_16X16 }, { TX_16X32, TX_16X16 } },
-      { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } },
-      { { TX_4X16, TX_4X16 }, { TX_4X16, TX_4X16 } },
-      { { TX_16X4, TX_16X4 }, { TX_16X4, TX_16X4 } },
-      { { TX_8X32, TX_8X16 }, { TX_8X32, TX_8X16 } },
-      { { TX_32X8, TX_32X8 }, { TX_32X8, TX_32X8 } },
-  },
-#endif
-};
-
-// Generates 4 bit field in which each bit set to 1 represents
-// a blocksize partition  1111 means we split 64x64, 32x32, 16x16
-// and 8x8.  1000 means we just split the 64x64 to 32x32
+// Generates 5 bit field in which each bit set to 1 represents
+// a blocksize partition  11111 means we split 128x128, 64x64, 32x32, 16x16
+// and 8x8.  10000 means we just split the 128x128 to 64x64
 /* clang-format off */
 static const struct {
   PARTITION_CONTEXT above;
   PARTITION_CONTEXT left;
 } partition_context_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_EXT_PARTITION
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  { 31, 31 },  // 2X2   - {0b11111, 0b11111}
-  { 31, 31 },  // 2X4   - {0b11111, 0b11111}
-  { 31, 31 },  // 4X2   - {0b11111, 0b11111}
-#endif
   { 31, 31 },  // 4X4   - {0b11111, 0b11111}
   { 31, 30 },  // 4X8   - {0b11111, 0b11110}
   { 30, 31 },  // 8X4   - {0b11110, 0b11111}
@@ -2070,131 +403,29 @@ static const struct {
   { 16, 0 },   // 64X128- {0b10000, 0b00000}
   { 0, 16 },   // 128X64- {0b00000, 0b10000}
   { 0, 0 },    // 128X128-{0b00000, 0b00000}
-
   { 31, 28 },  // 4X16  - {0b11111, 0b11100}
   { 28, 31 },  // 16X4  - {0b11100, 0b11111}
   { 30, 24 },  // 8X32  - {0b11110, 0b11000}
   { 24, 30 },  // 32X8  - {0b11000, 0b11110}
   { 28, 16 },  // 16X64 - {0b11100, 0b10000}
   { 16, 28 },  // 64X16 - {0b10000, 0b11100}
-  { 24, 0 },   // 32X128- {0b11000, 0b00000}
-  { 0, 24 },   // 128X32- {0b00000, 0b11000}
-#else
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  { 15, 15 },  // 2X2   - {0b1111, 0b1111}
-  { 15, 15 },  // 2X4   - {0b1111, 0b1111}
-  { 15, 15 },  // 4X2   - {0b1111, 0b1111}
-#endif
-  { 15, 15 },  // 4X4   - {0b1111, 0b1111}
-  { 15, 14 },  // 4X8   - {0b1111, 0b1110}
-  { 14, 15 },  // 8X4   - {0b1110, 0b1111}
-  { 14, 14 },  // 8X8   - {0b1110, 0b1110}
-  { 14, 12 },  // 8X16  - {0b1110, 0b1100}
-  { 12, 14 },  // 16X8  - {0b1100, 0b1110}
-  { 12, 12 },  // 16X16 - {0b1100, 0b1100}
-  { 12, 8 },   // 16X32 - {0b1100, 0b1000}
-  { 8, 12 },   // 32X16 - {0b1000, 0b1100}
-  { 8, 8 },    // 32X32 - {0b1000, 0b1000}
-  { 8, 0 },    // 32X64 - {0b1000, 0b0000}
-  { 0, 8 },    // 64X32 - {0b0000, 0b1000}
-  { 0, 0 },    // 64X64 - {0b0000, 0b0000}
-
-  { 15, 12 },  // 4X16 - {0b1111, 0b1100}
-  { 12, 15 },  // 16X4 - {0b1100, 0b1111}
-  { 8, 14 },   // 8X32 - {0b1110, 0b1000}
-  { 14, 8 },   // 32X8 - {0b1000, 0b1110}
-  { 12, 0 },   // 16X64- {0b1100, 0b0000}
-  { 0, 12 },   // 64X16- {0b0000, 0b1100}
-#endif  // CONFIG_EXT_PARTITION
 };
 /* clang-format on */
 
-#if CONFIG_KF_CTX
 static const int intra_mode_context[INTRA_MODES] = {
-  0, 1, 2, 3, 4, 4, 4, 4, 3, 0,
-#if CONFIG_SMOOTH_HV
-  1, 2,
-#endif
-  0,
-};
-#endif
-
-#if CONFIG_SUPERTX
-static const TX_SIZE uvsupertx_size_lookup[TX_SIZES][2][2] = {
-//  ss_x == 0 ss_x == 0   ss_x == 1 ss_x == 1
-//  ss_y == 0 ss_y == 1   ss_y == 0 ss_y == 1
-#if CONFIG_CHROMA_2X2
-  { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-#endif
-  { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
-  { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
-  { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
-  { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } },
-#if CONFIG_TX64X64
-  { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } },
-#endif  // CONFIG_TX64X64
-};
-
-#if CONFIG_EXT_PARTITION_TYPES
-static const int partition_supertx_context_lookup[EXT_PARTITION_TYPES] = {
-  -1, 0, 0, 1, 0, 0, 0, 0, 0, 0
+  0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0,
 };
 
-#else
-static const int partition_supertx_context_lookup[PARTITION_TYPES] = { -1, 0, 0,
-                                                                       1 };
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#endif  // CONFIG_SUPERTX
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-// NCOBMC_ADAPT_INTRPL only supports block size >= BLOCK_8X8 and <= BLOCK_64X64
-static const ADAPT_OVERLAP_BLOCK adapt_overlap_block_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  ADAPT_OVERLAP_BLOCK_INVALID,  // BLOCK_2X2
-  ADAPT_OVERLAP_BLOCK_INVALID,  // BLOCK_2X4
-  ADAPT_OVERLAP_BLOCK_INVALID,  // BLOCK_4X2
-#endif
-  ADAPT_OVERLAP_BLOCK_INVALID,  // BLOCK_4X4
-  ADAPT_OVERLAP_BLOCK_INVALID,  // BLOCK_4X8
-  ADAPT_OVERLAP_BLOCK_INVALID,  // BLOCK_8X4
-
-  // the rest of the block sizes round to the largest squared block less than
-  // the given block size
-  ADAPT_OVERLAP_BLOCK_8X8, ADAPT_OVERLAP_BLOCK_8X8, ADAPT_OVERLAP_BLOCK_8X8,
-  ADAPT_OVERLAP_BLOCK_16X16, ADAPT_OVERLAP_BLOCK_16X16,
-  ADAPT_OVERLAP_BLOCK_16X16, ADAPT_OVERLAP_BLOCK_32X32,
-  ADAPT_OVERLAP_BLOCK_32X32, ADAPT_OVERLAP_BLOCK_32X32,
-  ADAPT_OVERLAP_BLOCK_64X64,
-#if CONFIG_EXT_PARTITION
-  ADAPT_OVERLAP_BLOCK_INVALID, ADAPT_OVERLAP_BLOCK_INVALID,
-  ADAPT_OVERLAP_BLOCK_INVALID,
-#endif  // CONFIG_EXT_PARTITION
-  ADAPT_OVERLAP_BLOCK_INVALID, ADAPT_OVERLAP_BLOCK_INVALID,
-  ADAPT_OVERLAP_BLOCK_INVALID, ADAPT_OVERLAP_BLOCK_INVALID,
-  ADAPT_OVERLAP_BLOCK_INVALID, ADAPT_OVERLAP_BLOCK_INVALID,
-#if CONFIG_EXT_PARTITION
-  ADAPT_OVERLAP_BLOCK_INVALID, ADAPT_OVERLAP_BLOCK_INVALID
-#endif  // CONFIG_EXT_PARTITION
+// Note: this is also used in unit tests. So whenever one changes the table,
+// the unit tests need to be changed accordingly.
+static const int quant_dist_weight[4][2] = {
+  { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE }
 };
-
-static const BLOCK_SIZE bsize_2_sqr_bsize[BLOCK_SIZES] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  BLOCK_2X2,   BLOCK_2X2,   BLOCK_2X2,
-#endif
-  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,   BLOCK_8X8,
-  BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_32X32,
-  BLOCK_32X32, BLOCK_32X32, BLOCK_64X64,
-#if CONFIG_EXT_PARTITION
-  BLOCK_64X64, BLOCK_64X64,
-#endif
+static const int quant_dist_lookup_table[2][4][2] = {
+  { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } },
+  { { 7, 9 }, { 5, 11 }, { 4, 12 }, { 3, 13 } },
 };
 
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-
-#if CONFIG_ADAPT_SCAN
-#define EOB_THRESHOLD_NUM 2
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
index 5476f59a6..d57f44f8b 100644
--- a/third_party/aom/av1/common/convolve.c
+++ b/third_party/aom/av1/common/convolve.c
@@ -12,76 +12,60 @@
 #include <assert.h>
 #include <string.h>
 
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "av1/common/blockd.h"
 #include "av1/common/convolve.h"
 #include "av1/common/filter.h"
 #include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 
-#define MAX_BLOCK_WIDTH (MAX_SB_SIZE)
-#define MAX_BLOCK_HEIGHT (MAX_SB_SIZE)
-#define MAX_STEP (32)
-
-void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                          int dst_stride, int w, int h,
-                          const InterpFilterParams filter_params,
-                          const int subpel_x_q4, int x_step_q4,
-                          ConvolveParams *conv_params) {
-  int x, y;
-  int filter_size = filter_params.taps;
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  src -= filter_size / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = subpel_x_q4;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-          filter_params, x_q4 & SUBPEL_MASK);
-      int k, sum = 0;
-      for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
-
-      sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      if (conv_params->do_average)
-        dst[x] = ROUND_POWER_OF_TWO(dst[x] + sum, 1);
-      else
-        dst[x] = sum;
-
-      x_q4 += x_step_q4;
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             const int16_t *x_filters, int x0_qn,
+                             int x_step_qn) {
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y) {
+    int x_qn = x0_qn;
+    for (int x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
+      const int x_filter_idx =
+          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+      assert(x_filter_idx <= RS_SUBPEL_MASK);
+      const int16_t *const x_filter =
+          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
+      int sum = 0;
+      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      x_qn += x_step_qn;
     }
     src += src_stride;
     dst += dst_stride;
   }
 }
 
-void av1_convolve_horiz_scale(const uint8_t *src, int src_stride, uint8_t *dst,
-                              int dst_stride, int w, int h,
-                              const InterpFilterParams filter_params,
-                              const int subpel_x_qn, int x_step_qn,
-                              ConvolveParams *conv_params) {
-  int x, y;
-  int filter_size = filter_params.taps;
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  src -= filter_size / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_qn = subpel_x_qn;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *x_filter =
-          av1_get_interp_filter_subpel_kernel(filter_params, x_filter_idx);
-      int k, sum = 0;
-      for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
-
-      sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      if (conv_params->do_average)
-        dst[x] = ROUND_POWER_OF_TWO(dst[x] + sum, 1);
-      else
-        dst[x] = sum;
-
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    const int16_t *x_filters, int x0_qn,
+                                    int x_step_qn, int bd) {
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y) {
+    int x_qn = x0_qn;
+    for (int x = 0; x < w; ++x) {
+      const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
+      const int x_filter_idx =
+          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+      assert(x_filter_idx <= RS_SUBPEL_MASK);
+      const int16_t *const x_filter =
+          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
+      int sum = 0;
+      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
       x_qn += x_step_qn;
     }
     src += src_stride;
@@ -89,417 +73,358 @@ void av1_convolve_horiz_scale(const uint8_t *src, int src_stride, uint8_t *dst,
   }
 }
 
-void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                         int dst_stride, int w, int h,
-                         const InterpFilterParams filter_params,
-                         const int subpel_y_q4, int y_step_q4,
-                         ConvolveParams *conv_params) {
-  int x, y;
-  int filter_size = filter_params.taps;
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  src -= src_stride * (filter_size / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    int y_q4 = subpel_y_q4;
-    for (y = 0; y < h; ++y) {
-      const uint8_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-          filter_params, y_q4 & SUBPEL_MASK);
-      int k, sum = 0;
-      for (k = 0; k < filter_size; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-
-      sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      if (conv_params->do_average)
-        dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + sum, 1);
-      else
-        dst[y * dst_stride] = sum;
-
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-void av1_convolve_vert_scale(const uint8_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h,
-                             const InterpFilterParams filter_params,
-                             const int subpel_y_qn, int y_step_qn,
-                             ConvolveParams *conv_params) {
-  int x, y;
-  int filter_size = filter_params.taps;
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  src -= src_stride * (filter_size / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    int y_qn = subpel_y_qn;
-    for (y = 0; y < h; ++y) {
-      const uint8_t *const src_y =
-          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
-      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(y_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *y_filter =
-          av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
-      int k, sum = 0;
-      for (k = 0; k < filter_size; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-
-      sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      if (conv_params->do_average)
-        dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + sum, 1);
-      else
-        dst[y * dst_stride] = sum;
-
-      y_qn += y_step_qn;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_copy(const uint8_t *src, int src_stride, uint8_t *dst,
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                           int dst_stride, int w, int h,
+                          InterpFilterParams *filter_params_x,
+                          InterpFilterParams *filter_params_y,
+                          const int subpel_x_q4, const int subpel_y_q4,
                           ConvolveParams *conv_params) {
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  if (conv_params->do_average == 0) {
-    int r;
-    for (r = 0; r < h; ++r) {
-      memcpy(dst, src, w);
-      src += src_stride;
-      dst += dst_stride;
-    }
-  } else {
-    int r, c;
-    for (r = 0; r < h; ++r) {
-      for (c = 0; c < w; ++c) {
-        dst[c] = clip_pixel(ROUND_POWER_OF_TWO(dst[c] + src[c], 1));
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = w;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bd = 8;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+
+  // horizontal filter
+  const uint8_t *src_horiz = src - fo_vert * src_stride;
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  for (int y = 0; y < im_h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (int k = 0; k < filter_params_x->taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
       }
-      src += src_stride;
-      dst += dst_stride;
+      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+      im_block[y * im_stride + x] =
+          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     }
   }
-}
 
-void av1_convolve_horiz_facade(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, int w, int h,
-                               const InterpFilterParams filter_params,
-                               const int subpel_x_q4, int x_step_q4,
-                               ConvolveParams *conv_params) {
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  if (filter_params.taps == SUBPEL_TAPS) {
-    const int16_t *filter_x =
-        av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
-    if (conv_params->do_average == 0)
-      aom_convolve8_horiz(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                          NULL, -1, w, h);
-    else
-      aom_convolve8_avg_horiz(src, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, NULL, -1, w, h);
-  } else {
-    av1_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
-                       subpel_x_q4, x_step_q4, conv_params);
+  // vertical filter
+  int16_t *src_vert = im_block + fo_vert * im_stride;
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
+        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+      }
+      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+      int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                    ((1 << (offset_bits - conv_params->round_1)) +
+                     (1 << (offset_bits - conv_params->round_1 - 1)));
+      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
+    }
   }
 }
 
-void av1_convolve_horiz_facade_c(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride, int w, int h,
-                                 const InterpFilterParams filter_params,
-                                 const int subpel_x_q4, int x_step_q4,
-                                 ConvolveParams *conv_params) {
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  if (filter_params.taps == SUBPEL_TAPS) {
-    const int16_t *filter_x =
-        av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
-    if (conv_params->do_average == 0)
-      aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, NULL, -1, w, h);
-    else
-      aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                x_step_q4, NULL, -1, w, h);
-  } else {
-    av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
-                         subpel_x_q4, x_step_q4, conv_params);
-  }
-}
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                         int dst_stride, int w, int h,
+                         InterpFilterParams *filter_params_x,
+                         InterpFilterParams *filter_params_y,
+                         const int subpel_x_q4, const int subpel_y_q4,
+                         ConvolveParams *conv_params) {
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  (void)conv_params;
 
-void av1_convolve_horiz_facade_scale(const uint8_t *src, int src_stride,
-                                     uint8_t *dst, int dst_stride, int w, int h,
-                                     const InterpFilterParams filter_params,
-                                     const int subpel_x_qn, int x_step_qn,
-                                     ConvolveParams *conv_params) {
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  if (filter_params.taps == SUBPEL_TAPS) {
-    const int16_t *filter_x = av1_get_interp_filter_subpel_kernel(
-        filter_params, subpel_x_qn >> SCALE_EXTRA_BITS);
-    if (conv_params->do_average == 0)
-      aom_convolve8_horiz_scale(src, src_stride, dst, dst_stride, filter_x,
-                                subpel_x_qn, x_step_qn, NULL, 0, -1, w, h);
-    else
-      aom_convolve8_avg_horiz_scale(src, src_stride, dst, dst_stride, filter_x,
-                                    subpel_x_qn, x_step_qn, NULL, 0, -1, w, h);
-  } else {
-    av1_convolve_horiz_scale(src, src_stride, dst, dst_stride, w, h,
-                             filter_params, subpel_x_qn, x_step_qn,
-                             conv_params);
-  }
-}
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
 
-void av1_convolve_vert_facade(const uint8_t *src, int src_stride, uint8_t *dst,
-                              int dst_stride, int w, int h,
-                              const InterpFilterParams filter_params,
-                              const int subpel_y_q4, int y_step_q4,
-                              ConvolveParams *conv_params) {
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  if (filter_params.taps == SUBPEL_TAPS) {
-    const int16_t *filter_y =
-        av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
-    if (conv_params->do_average == 0) {
-      aom_convolve8_vert(src, src_stride, dst, dst_stride, NULL, -1, filter_y,
-                         y_step_q4, w, h);
-    } else {
-      aom_convolve8_avg_vert(src, src_stride, dst, dst_stride, NULL, -1,
-                             filter_y, y_step_q4, w, h);
+  // vertical filter
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t res = 0;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
+        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+      }
+      dst[y * dst_stride + x] =
+          clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
     }
-  } else {
-    av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
-                      subpel_y_q4, y_step_q4, conv_params);
   }
 }
 
-void av1_convolve_vert_facade_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride, int w, int h,
-                                const InterpFilterParams filter_params,
-                                const int subpel_y_q4, int y_step_q4,
-                                ConvolveParams *conv_params) {
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  if (filter_params.taps == SUBPEL_TAPS) {
-    const int16_t *filter_y =
-        av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
-    if (conv_params->do_average == 0) {
-      aom_convolve8_vert_c(src, src_stride, dst, dst_stride, NULL, -1, filter_y,
-                           y_step_q4, w, h);
-    } else {
-      aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, NULL, -1,
-                               filter_y, y_step_q4, w, h);
-    }
-  } else {
-    av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
-                        subpel_y_q4, y_step_q4, conv_params);
-  }
-}
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                         int dst_stride, int w, int h,
+                         InterpFilterParams *filter_params_x,
+                         InterpFilterParams *filter_params_y,
+                         const int subpel_x_q4, const int subpel_y_q4,
+                         ConvolveParams *conv_params) {
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+  (void)conv_params;
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
 
-void av1_convolve_vert_facade_scale(const uint8_t *src, int src_stride,
-                                    uint8_t *dst, int dst_stride, int w, int h,
-                                    const InterpFilterParams filter_params,
-                                    const int subpel_y_qn, int y_step_qn,
-                                    ConvolveParams *conv_params) {
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  if (filter_params.taps == SUBPEL_TAPS) {
-    const int16_t *filter_y = av1_get_interp_filter_subpel_kernel(
-        filter_params, subpel_y_qn >> SCALE_EXTRA_BITS);
-    if (conv_params->do_average == 0) {
-      aom_convolve8_vert_scale(src, src_stride, dst, dst_stride, NULL, 0, -1,
-                               filter_y, subpel_y_qn, y_step_qn, w, h);
-    } else {
-      aom_convolve8_avg_vert_scale(src, src_stride, dst, dst_stride, NULL, 0,
-                                   -1, filter_y, subpel_y_qn, y_step_qn, w, h);
+  // horizontal filter
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t res = 0;
+      for (int k = 0; k < filter_params_x->taps; ++k) {
+        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+      }
+      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
+      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
     }
-  } else {
-    av1_convolve_vert_scale(src, src_stride, dst, dst_stride, w, h,
-                            filter_params, subpel_y_qn, y_step_qn, conv_params);
   }
 }
 
-#if CONFIG_CONVOLVE_ROUND
-void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h, int bits) {
-  int r, c;
-  for (r = 0; r < h; ++r) {
-    for (c = 0; c < w; ++c) {
-      dst[r * dst_stride + c] =
-          clip_pixel(ROUND_POWER_OF_TWO(src[r * src_stride + c], bits));
-    }
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, int w, int h,
+                               InterpFilterParams *filter_params_x,
+                               InterpFilterParams *filter_params_y,
+                               const int subpel_x_q4, const int subpel_y_q4,
+                               ConvolveParams *conv_params) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+
+  for (int y = 0; y < h; ++y) {
+    memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
   }
 }
 
-#if CONFIG_COMPOUND_ROUND
-void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
-                       int dst_stride, int w, int h,
-                       InterpFilterParams *filter_params_x,
-                       InterpFilterParams *filter_params_y,
-                       const int subpel_x_q4, const int subpel_y_q4,
-                       ConvolveParams *conv_params) {
-  int x, y, k;
-  uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
+                           int dst8_stride, int w, int h,
+                           InterpFilterParams *filter_params_x,
+                           InterpFilterParams *filter_params_y,
+                           const int subpel_x_q4, const int subpel_y_q4,
+                           ConvolveParams *conv_params) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
   int im_h = h + filter_params_y->taps - 1;
   int im_stride = w;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bd = 8;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
 
   // horizontal filter
   const uint8_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
       *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
-  for (y = 0; y < im_h; ++y) {
-    for (x = 0; x < w; ++x) {
-      int32_t sum = 0;
-      for (k = 0; k < filter_params_x->taps; ++k) {
+  for (int y = 0; y < im_h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (int k = 0; k < filter_params_x->taps; ++k) {
         sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
       }
+      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
       im_block[y * im_stride + x] =
-          clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0));
+          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     }
   }
 
   // vertical filter
-  uint8_t *src_vert = im_block + fo_vert * im_stride;
+  int16_t *src_vert = im_block + fo_vert * im_stride;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
       *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      CONV_BUF_TYPE sum = 0;
-      for (k = 0; k < filter_params_y->taps; ++k) {
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
       }
+      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
+      if (conv_params->do_average) {
+        int32_t tmp = dst[y * dst_stride + x];
+        if (conv_params->use_jnt_comp_avg) {
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          tmp = tmp >> DIST_PRECISION_BITS;
+        } else {
+          tmp += res;
+          tmp = tmp >> 1;
+        }
+        tmp -= (1 << (offset_bits - conv_params->round_1)) +
+               (1 << (offset_bits - conv_params->round_1 - 1));
+        dst8[y * dst8_stride + x] =
+            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
+      } else {
         dst[y * dst_stride + x] = res;
+      }
     }
   }
 }
 
-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
-                             CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
-                             InterpFilterParams *filter_params_x,
-                             InterpFilterParams *filter_params_y,
-                             const int subpel_x_qn, const int x_step_qn,
-                             const int subpel_y_qn, const int y_step_qn,
-                             ConvolveParams *conv_params) {
-  int x, y, k;
-  uint8_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
-  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
-             filter_params_y->taps;
-  int im_stride = w;
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
+                          int dst8_stride, int w, int h,
+                          InterpFilterParams *filter_params_x,
+                          InterpFilterParams *filter_params_y,
+                          const int subpel_x_q4, const int subpel_y_q4,
+                          ConvolveParams *conv_params) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
   const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-
-  // horizontal filter
-  const uint8_t *src_horiz = src - fo_vert * src_stride;
-  for (y = 0; y < im_h; ++y) {
-    int x_qn = subpel_x_qn;
-    for (x = 0; x < w; ++x, x_qn += x_step_qn) {
-      const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *x_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
-      int sum = 0;
-      for (k = 0; k < filter_params_x->taps; ++k)
-        sum += x_filter[k] * src_x[k - fo_horiz];
-      im_block[y * im_stride + x] =
-          clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0));
-    }
-    src_horiz += src_stride;
-  }
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  (void)filter_params_x;
+  (void)subpel_x_q4;
 
   // vertical filter
-  const uint8_t *src_vert = im_block + fo_vert * im_stride;
-  for (x = 0; x < w; ++x) {
-    int y_qn = subpel_y_qn;
-    for (y = 0; y < h; ++y, y_qn += y_step_qn) {
-      const uint8_t *const src_y =
-          &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
-      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(y_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *y_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
-      CONV_BUF_TYPE sum = 0;
-      for (k = 0; k < filter_params_y->taps; ++k) {
-        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t res = 0;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
+        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
       }
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
+      res *= (1 << bits);
+      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
+
+      if (conv_params->do_average) {
+        int32_t tmp = dst[y * dst_stride + x];
+        if (conv_params->use_jnt_comp_avg) {
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          tmp = tmp >> DIST_PRECISION_BITS;
+        } else {
+          tmp += res;
+          tmp = tmp >> 1;
+        }
+        tmp -= round_offset;
+        dst8[y * dst8_stride + x] =
+            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
+      } else {
         dst[y * dst_stride + x] = res;
+      }
     }
-    src_vert++;
   }
 }
 
-#else
-
-/* When convolve-round is enabled and compound-round is disabled, we use a
-   high-precision convolve filter.
-   Note: For notes on hardware implementations, including the required
-   bit widths for various intermediate values, see the comments above
-   av1_warp_affine_c.
-*/
-void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
-                       int dst_stride, int w, int h,
-                       InterpFilterParams *filter_params_x,
-                       InterpFilterParams *filter_params_y,
-                       const int subpel_x_q4, const int subpel_y_q4,
-                       ConvolveParams *conv_params) {
-  int x, y, k;
-  int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
-  int im_h = h + filter_params_y->taps - 1;
-  int im_stride = w;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
+                          int dst8_stride, int w, int h,
+                          InterpFilterParams *filter_params_x,
+                          InterpFilterParams *filter_params_y,
+                          const int subpel_x_q4, const int subpel_y_q4,
+                          ConvolveParams *conv_params) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bits = FILTER_BITS - conv_params->round_1;
   const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  (void)filter_params_y;
+  (void)subpel_y_q4;
 
   // horizontal filter
-  const uint8_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
       *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
-  for (y = 0; y < im_h; ++y) {
-    for (x = 0; x < w; ++x) {
-      int32_t sum = (1 << (bd + FILTER_BITS - 1));
-      for (k = 0; k < filter_params_x->taps; ++k) {
-        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t res = 0;
+      for (int k = 0; k < filter_params_x->taps; ++k) {
+        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+      }
+      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
+      res += round_offset;
+
+      if (conv_params->do_average) {
+        int32_t tmp = dst[y * dst_stride + x];
+        if (conv_params->use_jnt_comp_avg) {
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          tmp = tmp >> DIST_PRECISION_BITS;
+        } else {
+          tmp += res;
+          tmp = tmp >> 1;
+        }
+        tmp -= round_offset;
+        dst8[y * dst8_stride + x] =
+            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
+      } else {
+        dst[y * dst_stride + x] = res;
       }
-      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
-      im_block[y * im_stride + x] =
-          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     }
   }
+}
 
-  // vertical filter
-  int32_t *src_vert = im_block + fo_vert * im_stride;
-  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
+                                uint8_t *dst8, int dst8_stride, int w, int h,
+                                InterpFilterParams *filter_params_x,
+                                InterpFilterParams *filter_params_y,
+                                const int subpel_x_q4, const int subpel_y_q4,
+                                ConvolveParams *conv_params) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const int bd = 8;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      CONV_BUF_TYPE sum = 1 << offset_bits;
-      for (k = 0; k < filter_params_y->taps; ++k) {
-        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
-      }
-      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
-                          ((1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1)));
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
+      res += round_offset;
+
+      if (conv_params->do_average) {
+        int32_t tmp = dst[y * dst_stride + x];
+        if (conv_params->use_jnt_comp_avg) {
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          tmp = tmp >> DIST_PRECISION_BITS;
+        } else {
+          tmp += res;
+          tmp = tmp >> 1;
+        }
+        tmp -= round_offset;
+        dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+      } else {
         dst[y * dst_stride + x] = res;
+      }
     }
   }
 }
 
-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
-                             CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
+                             int dst8_stride, int w, int h,
                              InterpFilterParams *filter_params_x,
                              InterpFilterParams *filter_params_y,
                              const int subpel_x_qn, const int x_step_qn,
                              const int subpel_y_qn, const int y_step_qn,
                              ConvolveParams *conv_params) {
-  int x, y, k;
-  int32_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
   int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
              filter_params_y->taps;
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int dst16_stride = conv_params->dst_stride;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  assert(bits >= 0);
   int im_stride = w;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -507,245 +432,255 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
 
   // horizontal filter
   const uint8_t *src_horiz = src - fo_vert * src_stride;
-  for (y = 0; y < im_h; ++y) {
+  for (int y = 0; y < im_h; ++y) {
     int x_qn = subpel_x_qn;
-    for (x = 0; x < w; ++x, x_qn += x_step_qn) {
+    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
       const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
       const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
       assert(x_filter_idx < SUBPEL_SHIFTS);
       const int16_t *x_filter =
           av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
-      for (k = 0; k < filter_params_x->taps; ++k) {
+      for (int k = 0; k < filter_params_x->taps; ++k) {
         sum += x_filter[k] * src_x[k - fo_horiz];
       }
       assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
       im_block[y * im_stride + x] =
-          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     }
     src_horiz += src_stride;
   }
 
   // vertical filter
-  int32_t *src_vert = im_block + fo_vert * im_stride;
+  int16_t *src_vert = im_block + fo_vert * im_stride;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  for (x = 0; x < w; ++x) {
+  for (int x = 0; x < w; ++x) {
     int y_qn = subpel_y_qn;
-    for (y = 0; y < h; ++y, y_qn += y_step_qn) {
-      const int32_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
+    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
       const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
       assert(y_filter_idx < SUBPEL_SHIFTS);
       const int16_t *y_filter =
           av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
-      CONV_BUF_TYPE sum = 1 << offset_bits;
-      for (k = 0; k < filter_params_y->taps; ++k) {
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
         sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
       }
       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
-                          ((1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1)));
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
-        dst[y * dst_stride + x] = res;
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          int32_t tmp = dst16[y * dst16_stride + x];
+          if (conv_params->use_jnt_comp_avg) {
+            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+            tmp = tmp >> DIST_PRECISION_BITS;
+          } else {
+            tmp += res;
+            tmp = tmp >> 1;
+          }
+          /* Subtract round offset and convolve round */
+          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+                       (1 << (offset_bits - conv_params->round_1 - 1)));
+          dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+        } else {
+          dst16[y * dst16_stride + x] = res;
+        }
+      } else {
+        /* Subtract round offset and convolve round */
+        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+                             (1 << (offset_bits - conv_params->round_1 - 1)));
+        dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+      }
     }
     src_vert++;
   }
 }
-#endif  // CONFIG_COMPOUND_ROUND
+
+static void convolve_2d_scale_wrapper(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, InterpFilterParams *filter_params_x,
+    InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+    ConvolveParams *conv_params) {
+  if (conv_params->is_compound) {
+    assert(conv_params->dst != NULL);
+  }
+  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
+                        y_step_qn, conv_params);
+}
 
 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             InterpFilters interp_filters, const int subpel_x_q4,
                             int x_step_q4, const int subpel_y_q4, int y_step_q4,
-                            int scaled, ConvolveParams *conv_params) {
+                            int scaled, ConvolveParams *conv_params,
+                            const struct scale_factors *sf) {
   (void)x_step_q4;
   (void)y_step_q4;
   (void)dst;
   (void)dst_stride;
 
   InterpFilterParams filter_params_x, filter_params_y;
-  av1_get_convolve_filter_params(interp_filters, 1, &filter_params_x,
-                                 &filter_params_y);
-
-  if (filter_params_y.taps < filter_params_x.taps) {
-    uint8_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
-                   (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
-    int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
-    CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
-    int tr_dst_stride = MAX_SB_SIZE;
-    int fo_vert = filter_params_y.taps / 2 - 1;
-    int fo_horiz = filter_params_x.taps / 2 - 1;
-
-    transpose_uint8(tr_src, tr_src_stride,
-                    src - fo_vert * src_stride - fo_horiz, src_stride,
-                    w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
-    transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
-                    conv_params->dst_stride, w, h);
-
-    // horizontal and vertical parameters are swapped because of the transpose
-    if (scaled)
-      av1_convolve_2d_scale(tr_src + fo_horiz * tr_src_stride + fo_vert,
-                            tr_src_stride, tr_dst, tr_dst_stride, h, w,
-                            &filter_params_y, &filter_params_x, subpel_y_q4,
-                            y_step_q4, subpel_x_q4, x_step_q4, conv_params);
-    else
-      av1_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
-                      tr_src_stride, tr_dst, tr_dst_stride, h, w,
-                      &filter_params_y, &filter_params_x, subpel_y_q4,
-                      subpel_x_q4, conv_params);
-    transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
-                    tr_dst_stride, h, w);
-  } else {
-    if (scaled)
-      av1_convolve_2d_scale(src, src_stride, conv_params->dst,
-                            conv_params->dst_stride, w, h, &filter_params_x,
-                            &filter_params_y, subpel_x_q4, x_step_q4,
-                            subpel_y_q4, y_step_q4, conv_params);
-    else
-      av1_convolve_2d(src, src_stride, conv_params->dst,
-                      conv_params->dst_stride, w, h, &filter_params_x,
-                      &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
-  }
+  av1_get_convolve_filter_params(interp_filters, &filter_params_x,
+                                 &filter_params_y, w, h);
+
+  if (scaled)
+    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
+                              &filter_params_x, &filter_params_y, subpel_x_q4,
+                              x_step_q4, subpel_y_q4, y_step_q4, conv_params);
+  else
+    sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
+        src, src_stride, dst, dst_stride, w, h, &filter_params_x,
+        &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
 }
 
-#if CONFIG_HIGHBITDEPTH
-void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride,
-                                    uint8_t *dst8, int dst_stride, int w, int h,
-                                    int bits, int bd) {
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  int r, c;
-  for (r = 0; r < h; ++r) {
-    for (c = 0; c < w; ++c) {
-      dst[r * dst_stride + c] = clip_pixel_highbd(
-          ROUND_POWER_OF_TWO(src[r * src_stride + c], bits), bd);
-    }
+void av1_highbd_convolve_2d_copy_sr_c(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, InterpFilterParams *filter_params_x,
+    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+  (void)bd;
+
+  for (int y = 0; y < h; ++y) {
+    memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
   }
 }
 
-#if CONFIG_COMPOUND_ROUND
-void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
-                              CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
-                              InterpFilterParams *filter_params_x,
-                              InterpFilterParams *filter_params_y,
-                              const int subpel_x_q4, const int subpel_y_q4,
-                              ConvolveParams *conv_params, int bd) {
-  int x, y, k;
-  uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
-  int im_h = h + filter_params_y->taps - 1;
-  int im_stride = w;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
+                                uint16_t *dst, int dst_stride, int w, int h,
+                                InterpFilterParams *filter_params_x,
+                                InterpFilterParams *filter_params_y,
+                                const int subpel_x_q4, const int subpel_y_q4,
+                                ConvolveParams *conv_params, int bd) {
   const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
 
   // horizontal filter
-  const uint16_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
       *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
-  for (y = 0; y < im_h; ++y) {
-    for (x = 0; x < w; ++x) {
-      int32_t sum = 0;
-      for (k = 0; k < filter_params_x->taps; ++k) {
-        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t res = 0;
+      for (int k = 0; k < filter_params_x->taps; ++k) {
+        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
       }
-      im_block[y * im_stride + x] =
-          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, conv_params->round_0), bd);
+      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
+      dst[y * dst_stride + x] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
     }
   }
+}
 
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
+                                uint16_t *dst, int dst_stride, int w, int h,
+                                InterpFilterParams *filter_params_x,
+                                InterpFilterParams *filter_params_y,
+                                const int subpel_x_q4, const int subpel_y_q4,
+                                ConvolveParams *conv_params, int bd) {
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  (void)conv_params;
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
   // vertical filter
-  uint16_t *src_vert = im_block + fo_vert * im_stride;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
       *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      CONV_BUF_TYPE sum = 0;
-      for (k = 0; k < filter_params_y->taps; ++k) {
-        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t res = 0;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
+        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
       }
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
-        dst[y * dst_stride + x] = res;
+      dst[y * dst_stride + x] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
     }
   }
 }
 
-void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
-                                    CONV_BUF_TYPE *dst, int dst_stride, int w,
-                                    int h, InterpFilterParams *filter_params_x,
-                                    InterpFilterParams *filter_params_y,
-                                    const int subpel_x_qn, const int x_step_qn,
-                                    const int subpel_y_qn, const int y_step_qn,
-                                    ConvolveParams *conv_params, int bd) {
-  int x, y, k;
-  uint16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
-  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
-             filter_params_y->taps;
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
+                                 uint16_t *dst, int dst_stride, int w, int h,
+                                 InterpFilterParams *filter_params_x,
+                                 InterpFilterParams *filter_params_y,
+                                 const int subpel_x_q4, const int subpel_y_q4,
+                                 ConvolveParams *conv_params, int bd) {
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  int im_h = h + filter_params_y->taps - 1;
   int im_stride = w;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
-  (void)bd;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  assert(bits >= 0);
 
   // horizontal filter
   const uint16_t *src_horiz = src - fo_vert * src_stride;
-  for (y = 0; y < im_h; ++y) {
-    int x_qn = subpel_x_qn;
-    for (x = 0; x < w; ++x, x_qn += x_step_qn) {
-      const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *x_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
-      int sum = 0;
-      for (k = 0; k < filter_params_x->taps; ++k)
-        sum += x_filter[k] * src_x[k - fo_horiz];
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  for (int y = 0; y < im_h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (int k = 0; k < filter_params_x->taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+      }
+      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
       im_block[y * im_stride + x] =
-          clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0));
+          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     }
-    src_horiz += src_stride;
   }
 
   // vertical filter
-  uint16_t *src_vert = im_block + fo_vert * im_stride;
-  for (x = 0; x < w; ++x) {
-    int y_qn = subpel_y_qn;
-    for (y = 0; y < h; ++y, y_qn += y_step_qn) {
-      const uint16_t *const src_y =
-          &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
-      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(y_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *y_filter =
-          av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
-      CONV_BUF_TYPE sum = 0;
-      for (k = 0; k < filter_params_y->taps; ++k) {
-        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
+  int16_t *src_vert = im_block + fo_vert * im_stride;
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
+        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
       }
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
-        dst[y * dst_stride + x] = res;
+      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+      int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                    ((1 << (offset_bits - conv_params->round_1)) +
+                     (1 << (offset_bits - conv_params->round_1 - 1)));
+      dst[y * dst_stride + x] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
     }
-    src_vert++;
   }
 }
 
-#else
-
-void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
-                              CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
-                              InterpFilterParams *filter_params_x,
-                              InterpFilterParams *filter_params_y,
-                              const int subpel_x_q4, const int subpel_y_q4,
-                              ConvolveParams *conv_params, int bd) {
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
+                                  uint16_t *dst16, int dst16_stride, int w,
+                                  int h, InterpFilterParams *filter_params_x,
+                                  InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params, int bd) {
   int x, y, k;
-  int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
   int im_h = h + filter_params_y->taps - 1;
   int im_stride = w;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  assert(round_bits >= 0);
 
   // horizontal filter
   const uint16_t *src_horiz = src - fo_vert * src_stride;
@@ -760,439 +695,367 @@ void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
       assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
       (void)bd;
       im_block[y * im_stride + x] =
-          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     }
   }
 
   // vertical filter
-  int32_t *src_vert = im_block + fo_vert * im_stride;
+  int16_t *src_vert = im_block + fo_vert * im_stride;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
       *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
   for (y = 0; y < h; ++y) {
     for (x = 0; x < w; ++x) {
-      CONV_BUF_TYPE sum = 1 << offset_bits;
+      int32_t sum = 1 << offset_bits;
       for (k = 0; k < filter_params_y->taps; ++k) {
         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
       }
       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
-                          ((1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1)));
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (conv_params->do_average) {
+        int32_t tmp = dst[y * dst_stride + x];
+        if (conv_params->use_jnt_comp_avg) {
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          tmp = tmp >> DIST_PRECISION_BITS;
+        } else {
+          tmp += res;
+          tmp = tmp >> 1;
+        }
+        tmp -= (1 << (offset_bits - conv_params->round_1)) +
+               (1 << (offset_bits - conv_params->round_1 - 1));
+        dst16[y * dst16_stride + x] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+      } else {
+        dst[y * dst_stride + x] = res;
+      }
+    }
+  }
+}
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
+                                 uint16_t *dst16, int dst16_stride, int w,
+                                 int h, InterpFilterParams *filter_params_x,
+                                 InterpFilterParams *filter_params_y,
+                                 const int subpel_x_q4, const int subpel_y_q4,
+                                 ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bits = FILTER_BITS - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  assert(round_bits >= 0);
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+  assert(bits >= 0);
+  // horizontal filter
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t res = 0;
+      for (int k = 0; k < filter_params_x->taps; ++k) {
+        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+      }
+      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
+      res += round_offset;
+
+      if (conv_params->do_average) {
+        int32_t tmp = dst[y * dst_stride + x];
+        if (conv_params->use_jnt_comp_avg) {
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          tmp = tmp >> DIST_PRECISION_BITS;
+        } else {
+          tmp += res;
+          tmp = tmp >> 1;
+        }
+        tmp -= round_offset;
+        dst16[y * dst16_stride + x] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+      } else {
+        dst[y * dst_stride + x] = res;
+      }
+    }
+  }
+}
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
+                                 uint16_t *dst16, int dst16_stride, int w,
+                                 int h, InterpFilterParams *filter_params_x,
+                                 InterpFilterParams *filter_params_y,
+                                 const int subpel_x_q4, const int subpel_y_q4,
+                                 ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  assert(round_bits >= 0);
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  assert(bits >= 0);
+  // vertical filter
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t res = 0;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
+        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+      }
+      res *= (1 << bits);
+      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
+
+      if (conv_params->do_average) {
+        int32_t tmp = dst[y * dst_stride + x];
+        if (conv_params->use_jnt_comp_avg) {
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          tmp = tmp >> DIST_PRECISION_BITS;
+        } else {
+          tmp += res;
+          tmp = tmp >> 1;
+        }
+        tmp -= round_offset;
+        dst16[y * dst16_stride + x] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+      } else {
         dst[y * dst_stride + x] = res;
+      }
+    }
+  }
+}
+
+void av1_highbd_jnt_convolve_2d_copy_c(
+    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
+    int w, int h, InterpFilterParams *filter_params_x,
+    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  assert(bits >= 0);
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
+      res += round_offset;
+      if (conv_params->do_average) {
+        int32_t tmp = dst[y * dst_stride + x];
+        if (conv_params->use_jnt_comp_avg) {
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          tmp = tmp >> DIST_PRECISION_BITS;
+        } else {
+          tmp += res;
+          tmp = tmp >> 1;
+        }
+        tmp -= round_offset;
+        dst16[y * dst16_stride + x] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+      } else {
+        dst[y * dst_stride + x] = res;
+      }
     }
   }
 }
 
 void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
-                                    CONV_BUF_TYPE *dst, int dst_stride, int w,
-                                    int h, InterpFilterParams *filter_params_x,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    InterpFilterParams *filter_params_x,
                                     InterpFilterParams *filter_params_y,
                                     const int subpel_x_qn, const int x_step_qn,
                                     const int subpel_y_qn, const int y_step_qn,
                                     ConvolveParams *conv_params, int bd) {
-  int x, y, k;
-  int32_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
   int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
              filter_params_y->taps;
   int im_stride = w;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
-
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int dst16_stride = conv_params->dst_stride;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  assert(bits >= 0);
   // horizontal filter
   const uint16_t *src_horiz = src - fo_vert * src_stride;
-  for (y = 0; y < im_h; ++y) {
+  for (int y = 0; y < im_h; ++y) {
     int x_qn = subpel_x_qn;
-    for (x = 0; x < w; ++x, x_qn += x_step_qn) {
+    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
       const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
       const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
       assert(x_filter_idx < SUBPEL_SHIFTS);
       const int16_t *x_filter =
           av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
       int32_t sum = (1 << (bd + FILTER_BITS - 1));
-      for (k = 0; k < filter_params_x->taps; ++k) {
+      for (int k = 0; k < filter_params_x->taps; ++k) {
         sum += x_filter[k] * src_x[k - fo_horiz];
       }
       assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
       im_block[y * im_stride + x] =
-          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     }
     src_horiz += src_stride;
   }
 
   // vertical filter
-  int32_t *src_vert = im_block + fo_vert * im_stride;
+  int16_t *src_vert = im_block + fo_vert * im_stride;
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  for (x = 0; x < w; ++x) {
+  for (int x = 0; x < w; ++x) {
     int y_qn = subpel_y_qn;
-    for (y = 0; y < h; ++y, y_qn += y_step_qn) {
-      const int32_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
+    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
       const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
       assert(y_filter_idx < SUBPEL_SHIFTS);
       const int16_t *y_filter =
           av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
-      CONV_BUF_TYPE sum = 1 << offset_bits;
-      for (k = 0; k < filter_params_y->taps; ++k) {
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
         sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
       }
       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
-                          ((1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1)));
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
-        dst[y * dst_stride + x] = res;
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          int32_t tmp = dst16[y * dst16_stride + x];
+          if (conv_params->use_jnt_comp_avg) {
+            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+            tmp = tmp >> DIST_PRECISION_BITS;
+          } else {
+            tmp += res;
+            tmp = tmp >> 1;
+          }
+          /* Subtract round offset and convolve round */
+          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+                       (1 << (offset_bits - conv_params->round_1 - 1)));
+          dst[y * dst_stride + x] =
+              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+        } else {
+          dst16[y * dst16_stride + x] = res;
+        }
+      } else {
+        /* Subtract round offset and convolve round */
+        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+                             (1 << (offset_bits - conv_params->round_1 - 1)));
+        dst[y * dst_stride + x] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+      }
     }
     src_vert++;
   }
 }
-#endif  // CONFIG_COMPOUND_ROUND
 
 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
-                                   uint8_t *dst, int dst_stride, int w, int h,
+                                   uint8_t *dst8, int dst_stride, int w, int h,
                                    InterpFilters interp_filters,
                                    const int subpel_x_q4, int x_step_q4,
                                    const int subpel_y_q4, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   int bd) {
+                                   const struct scale_factors *sf, int bd) {
   (void)x_step_q4;
   (void)y_step_q4;
-  (void)dst;
   (void)dst_stride;
 
-  InterpFilterParams filter_params_x, filter_params_y;
-  av1_get_convolve_filter_params(interp_filters, 1, &filter_params_x,
-                                 &filter_params_y);
-
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  if (filter_params_y.taps < filter_params_x.taps) {
-    uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
-                    (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
-    int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
-    CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
-    int tr_dst_stride = MAX_SB_SIZE;
-    int fo_vert = filter_params_y.taps / 2 - 1;
-    int fo_horiz = filter_params_x.taps / 2 - 1;
-
-    transpose_uint16(
-        tr_src, tr_src_stride, src - fo_vert * src_stride - fo_horiz,
-        src_stride, w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
-    transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
-                    conv_params->dst_stride, w, h);
-
-    // horizontal and vertical parameters are swapped because of the transpose
-    if (scaled)
-      av1_highbd_convolve_2d_scale(
-          tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
-          tr_dst_stride, h, w, &filter_params_y, &filter_params_x, subpel_y_q4,
-          y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
-    else
-      av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
-                             tr_src_stride, tr_dst, tr_dst_stride, h, w,
-                             &filter_params_y, &filter_params_x, subpel_y_q4,
-                             subpel_x_q4, conv_params, bd);
-    transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
-                    tr_dst_stride, h, w);
-  } else {
-    if (scaled)
-      av1_highbd_convolve_2d_scale(
-          src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
-          &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
-          subpel_y_q4, y_step_q4, conv_params, bd);
-    else
-      av1_highbd_convolve_2d(src, src_stride, conv_params->dst,
-                             conv_params->dst_stride, w, h, &filter_params_x,
-                             &filter_params_y, subpel_x_q4, subpel_y_q4,
-                             conv_params, bd);
-  }
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-#endif  // CONFIG_CONVOLVE_ROUND
-
-typedef void (*ConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h,
-                             const InterpFilterParams filter_params,
-                             const int subpel_q4, int step_q4,
-                             ConvolveParams *conv_params);
-
-static void convolve_helper(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilters interp_filters,
-                            const int subpel_x_q4, int x_step_q4,
-                            const int subpel_y_q4, int y_step_q4,
-                            ConvolveParams *conv_params,
-                            ConvolveFunc convolve_horiz,
-                            ConvolveFunc convolve_vert) {
-  int ignore_horiz = x_step_q4 == SUBPEL_SHIFTS && subpel_x_q4 == 0;
-  int ignore_vert = y_step_q4 == SUBPEL_SHIFTS && subpel_y_q4 == 0;
-
   InterpFilterParams filter_params_x, filter_params_y;
-  av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
-                                 &filter_params_y);
-
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-
-  assert(w <= MAX_BLOCK_WIDTH);
-  assert(h <= MAX_BLOCK_HEIGHT);
-  assert(y_step_q4 <= MAX_STEP);
-  assert(x_step_q4 <= MAX_STEP);
-
-  if (ignore_horiz && ignore_vert) {
-    convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params);
-  } else if (ignore_vert) {
-    assert(filter_params_x.taps <= MAX_FILTER_TAP);
-    convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params_x,
-                   subpel_x_q4, x_step_q4, conv_params);
-  } else if (ignore_horiz) {
-    assert(filter_params_y.taps <= MAX_FILTER_TAP);
-    convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params_y,
-                  subpel_y_q4, y_step_q4, conv_params);
-  } else {
-    // temp's size is set to a 256 aligned value to facilitate SIMD
-    // implementation. The value is greater than (maximum possible intermediate
-    // height or width) * MAX_SB_SIZE
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
-    int filter_size;
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-    av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y);
-
-    // we do filter with fewer taps first to reduce hardware implementation
-    // complexity
-    if (filter_params_y.taps < filter_params_x.taps) {
-      int intermediate_width;
-      int temp_stride = max_intermediate_size;
-      ConvolveParams temp_conv_params;
-      temp_conv_params.ref = 0;
-      temp_conv_params.do_average = 0;
-      temp_conv_params.round = CONVOLVE_OPT_ROUND;
-      filter_size = filter_params_x.taps;
-      intermediate_width =
-          (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size;
-      assert(intermediate_width <= max_intermediate_size);
-
-      assert(filter_params_y.taps <= MAX_FILTER_TAP);
-
-      convolve_vert(src - (filter_size / 2 - 1), src_stride, temp, temp_stride,
-                    intermediate_width, h, filter_params_y, subpel_y_q4,
-                    y_step_q4, &temp_conv_params);
-
-      assert(filter_params_x.taps <= MAX_FILTER_TAP);
-      convolve_horiz(temp + (filter_size / 2 - 1), temp_stride, dst, dst_stride,
-                     w, h, filter_params_x, subpel_x_q4, x_step_q4,
-                     conv_params);
-    } else
-#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-    {
-      int intermediate_height;
-      int temp_stride = MAX_SB_SIZE;
-      ConvolveParams temp_conv_params;
-      temp_conv_params.ref = 0;
-      temp_conv_params.do_average = 0;
-      temp_conv_params.round = CONVOLVE_OPT_ROUND;
-      filter_size = filter_params_y.taps;
-      intermediate_height =
-          (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
-      assert(intermediate_height <= max_intermediate_size);
-      (void)max_intermediate_size;
-
-      assert(filter_params_x.taps <= MAX_FILTER_TAP);
-
-      convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, temp,
-                     temp_stride, w, intermediate_height, filter_params_x,
-                     subpel_x_q4, x_step_q4, &temp_conv_params);
-
-      assert(filter_params_y.taps <= MAX_FILTER_TAP);
-
-      convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
-                    dst, dst_stride, w, h, filter_params_y, subpel_y_q4,
-                    y_step_q4, conv_params);
-    }
-  }
-}
+  av1_get_convolve_filter_params(interp_filters, &filter_params_x,
+                                 &filter_params_y, w, h);
 
-static void convolve_scale_helper(const uint8_t *src, int src_stride,
-                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  const InterpFilters interp_filters,
-                                  const int subpel_x_qn, int x_step_qn,
-                                  const int subpel_y_qn, int y_step_qn,
-                                  ConvolveParams *conv_params,
-                                  ConvolveFunc convolve_horiz,
-                                  ConvolveFunc convolve_vert) {
-  int ignore_horiz = x_step_qn == SCALE_SUBPEL_SHIFTS && subpel_x_qn == 0;
-  int ignore_vert = y_step_qn == SCALE_SUBPEL_SHIFTS && subpel_y_qn == 0;
-
-  InterpFilterParams filter_params_x, filter_params_y;
-  av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
-                                 &filter_params_y);
-
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-
-  assert(w <= MAX_BLOCK_WIDTH);
-  assert(h <= MAX_BLOCK_HEIGHT);
-  assert(y_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));
-  assert(x_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));
-
-  if (ignore_horiz && ignore_vert) {
-    convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params);
-  } else if (ignore_vert) {
-    assert(filter_params_x.taps <= MAX_FILTER_TAP);
-    convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params_x,
-                   subpel_x_qn, x_step_qn, conv_params);
-  } else if (ignore_horiz) {
-    assert(filter_params_y.taps <= MAX_FILTER_TAP);
-    convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params_y,
-                  subpel_y_qn, y_step_qn, conv_params);
-  } else {
-    // temp's size is set to a 256 aligned value to facilitate SIMD
-    // implementation. The value is greater than (maximum possible intermediate
-    // height or width) * MAX_SB_SIZE
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
-    int filter_size;
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-    av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y);
-
-    // we do filter with fewer taps first to reduce hardware implementation
-    // complexity
-    if (filter_params_y.taps < filter_params_x.taps) {
-      int intermediate_width;
-      int temp_stride = max_intermediate_size;
-      ConvolveParams temp_conv_params;
-      temp_conv_params.ref = 0;
-      temp_conv_params.do_average = 0;
-      temp_conv_params.round = CONVOLVE_OPT_ROUND;
-      filter_size = filter_params_x.taps;
-      intermediate_width =
-          (((w - 1) * x_step_qn + subpel_x_qn) >> SCALE_SUBPEL_BITS) +
-          filter_size;
-      assert(intermediate_width <= max_intermediate_size);
-
-      assert(filter_params_y.taps <= MAX_FILTER_TAP);
-
-      convolve_vert(src - (filter_size / 2 - 1), src_stride, temp, temp_stride,
-                    intermediate_width, h, filter_params_y, subpel_y_qn,
-                    y_step_qn, &temp_conv_params);
-
-      assert(filter_params_x.taps <= MAX_FILTER_TAP);
-      convolve_horiz(temp + (filter_size / 2 - 1), temp_stride, dst, dst_stride,
-                     w, h, filter_params_x, subpel_x_qn, x_step_qn,
-                     conv_params);
-    } else {
-#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-      int intermediate_height;
-      int temp_stride = MAX_SB_SIZE;
-      ConvolveParams temp_conv_params;
-      temp_conv_params.ref = 0;
-      temp_conv_params.do_average = 0;
-      temp_conv_params.round = CONVOLVE_OPT_ROUND;
-      filter_size = filter_params_y.taps;
-      intermediate_height =
-          (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
-          filter_size;
-      assert(intermediate_height <= max_intermediate_size);
-      (void)max_intermediate_size;
-
-      assert(filter_params_x.taps <= MAX_FILTER_TAP);
-
-      convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, temp,
-                     temp_stride, w, intermediate_height, filter_params_x,
-                     subpel_x_qn, x_step_qn, &temp_conv_params);
-
-      assert(filter_params_y.taps <= MAX_FILTER_TAP);
-
-      convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
-                    dst, dst_stride, w, h, filter_params_y, subpel_y_qn,
-                    y_step_qn, conv_params);
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
+  if (scaled) {
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+    if (conv_params->is_compound) {
+      assert(conv_params->dst != NULL);
     }
-#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-  }
-}
+    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
+                                 &filter_params_x, &filter_params_y,
+                                 subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4,
+                                 conv_params, bd);
+  } else {
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
 
-void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
-                  int dst_stride, int w, int h, InterpFilters interp_filters,
-                  const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
-                  int y_step_q4, ConvolveParams *conv_params) {
-  convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filters,
-                  subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params,
-                  av1_convolve_horiz_facade, av1_convolve_vert_facade);
+    sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 !=
+                                          0][conv_params->is_compound](
+        src, src_stride, dst, dst_stride, w, h, &filter_params_x,
+        &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
+  }
 }
 
-void av1_convolve_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                    int dst_stride, int w, int h, InterpFilters interp_filters,
-                    const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
-                    int y_step_q4, ConvolveParams *conv_params) {
-  convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filters,
-                  subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params,
-                  av1_convolve_horiz_facade_c, av1_convolve_vert_facade_c);
+// Note: Fixed size intermediate buffers, place limits on parameters
+// of some functions. 2d filtering proceeds in 2 steps:
+//   (1) Interpolate horizontally into an intermediate buffer, temp.
+//   (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (135):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 128x128 pixels.
+// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
+//   original frame (in 1/16th pixel units).
+// --Must round-up because block may be located at sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
+#define WIENER_MAX_EXT_SIZE 263
+
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
 }
 
-void av1_convolve_scale(const uint8_t *src, int src_stride, uint8_t *dst,
-                        int dst_stride, int w, int h,
-                        InterpFilters interp_filters, const int subpel_x_qn,
-                        int x_step_qn, const int subpel_y_qn, int y_step_qn,
-                        ConvolveParams *conv_params) {
-  convolve_scale_helper(src, src_stride, dst, dst_stride, w, h, interp_filters,
-                        subpel_x_qn, x_step_qn, subpel_y_qn, y_step_qn,
-                        conv_params, av1_convolve_horiz_facade_scale,
-                        av1_convolve_vert_facade_scale);
+static INLINE int highbd_horz_scalar_product(const uint16_t *a,
+                                             const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
 }
 
-void av1_lowbd_convolve_init_c(void) {
-  // A placeholder for SIMD initialization
-  return;
+static INLINE int highbd_vert_scalar_product(const uint16_t *a,
+                                             ptrdiff_t a_stride,
+                                             const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+  return sum;
 }
 
-void av1_highbd_convolve_init_c(void) {
-  // A placeholder for SIMD initialization
-  return;
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
+  // TODO(agrange) Modify to make independent of table alignment.
+  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
 }
 
-void av1_convolve_init(AV1_COMMON *cm) {
-#if CONFIG_HIGHBITDEPTH
-  if (cm->use_highbitdepth)
-    av1_highbd_convolve_init();
-  else
-    av1_lowbd_convolve_init();
-#else
-  (void)cm;
-  av1_lowbd_convolve_init();
-#endif
-  return;
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+  return (int)((const InterpKernel *)(intptr_t)f - base);
 }
 
-#if CONFIG_HIGHBITDEPTH
-void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride,
-                                 uint16_t *dst, int dst_stride, int w, int h,
-                                 const InterpFilterParams filter_params,
-                                 const int subpel_x_q4, int x_step_q4, int avg,
-                                 int bd) {
-  int x, y;
-  int filter_size = filter_params.taps;
-  src -= filter_size / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = subpel_x_q4;
-    for (x = 0; x < w; ++x) {
-      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-          filter_params, x_q4 & SUBPEL_MASK);
-      int k, sum = 0;
-      for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
-      if (avg)
-        dst[x] = ROUND_POWER_OF_TWO(
-            dst[x] +
-                clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
-            1);
-      else
-        dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
+                                       uint16_t *dst, ptrdiff_t dst_stride,
+                                       const InterpKernel *x_filters, int x0_q4,
+                                       int x_step_q4, int w, int h,
+                                       int round0_bits) {
+  const int bd = 8;
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (int x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
+                           (1 << (bd + FILTER_BITS - 1));
+      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
+      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
+                               WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
       x_q4 += x_step_q4;
     }
     src += src_stride;
@@ -1200,66 +1063,25 @@ void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride,
   }
 }
 
-void av1_highbd_convolve_horiz_scale(const uint16_t *src, int src_stride,
-                                     uint16_t *dst, int dst_stride, int w,
-                                     int h,
-                                     const InterpFilterParams filter_params,
-                                     const int subpel_x_qn, int x_step_qn,
-                                     int avg, int bd) {
-  int x, y;
-  int filter_size = filter_params.taps;
-  src -= filter_size / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_qn = subpel_x_qn;
-    for (x = 0; x < w; ++x) {
-      const uint16_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *x_filter =
-          av1_get_interp_filter_subpel_kernel(filter_params, x_filter_idx);
-      int k, sum = 0;
-      for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
-      if (avg)
-        dst[x] = ROUND_POWER_OF_TWO(
-            dst[x] +
-                clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
-            1);
-      else
-        dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
-      x_qn += x_step_qn;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride,
-                                uint16_t *dst, int dst_stride, int w, int h,
-                                const InterpFilterParams filter_params,
-                                const int subpel_y_q4, int y_step_q4, int avg,
-                                int bd) {
-  int x, y;
-  int filter_size = filter_params.taps;
-  src -= src_stride * (filter_size / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_q4 = subpel_y_q4;
-    for (y = 0; y < h; ++y) {
-      const uint16_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-          filter_params, y_q4 & SUBPEL_MASK);
-      int k, sum = 0;
-      for (k = 0; k < filter_size; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      if (avg) {
-        dst[y * dst_stride] = ROUND_POWER_OF_TWO(
-            dst[y * dst_stride] +
-                clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
-            1);
-      } else {
-        dst[y * dst_stride] =
-            clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
-      }
+static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel *y_filters, int y0_q4,
+                                      int y_step_q4, int w, int h,
+                                      int round1_bits) {
+  const int bd = 8;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (int x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (int y = 0; y < h; ++y) {
+      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      const int rounding =
+          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
+          (1 << (bd + round1_bits - 1));
+      const int sum =
+          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
+      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
       y_q4 += y_step_q4;
     }
     ++src;
@@ -1267,325 +1089,111 @@ void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride,
   }
 }
 
-void av1_highbd_convolve_vert_scale(const uint16_t *src, int src_stride,
-                                    uint16_t *dst, int dst_stride, int w, int h,
-                                    const InterpFilterParams filter_params,
-                                    const int subpel_y_qn, int y_step_qn,
-                                    int avg, int bd) {
-  int x, y;
-  int filter_size = filter_params.taps;
-  src -= src_stride * (filter_size / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_qn = subpel_y_qn;
-    for (y = 0; y < h; ++y) {
-      const uint16_t *const src_y =
-          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
-      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(y_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *y_filter =
-          av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
-      int k, sum = 0;
-      for (k = 0; k < filter_size; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      if (avg) {
-        dst[y * dst_stride] = ROUND_POWER_OF_TWO(
-            dst[y * dst_stride] +
-                clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
-            1);
-      } else {
-        dst[y * dst_stride] =
-            clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
-      }
-      y_qn += y_step_qn;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void highbd_convolve_copy(const uint16_t *src, int src_stride,
-                                 uint16_t *dst, int dst_stride, int w, int h,
-                                 int avg, int bd) {
-  if (avg == 0) {
-    int r;
-    for (r = 0; r < h; ++r) {
-      memcpy(dst, src, w * sizeof(*src));
-      src += src_stride;
-      dst += dst_stride;
-    }
-  } else {
-    int r, c;
-    for (r = 0; r < h; ++r) {
-      for (c = 0; c < w; ++c) {
-        dst[c] = clip_pixel_highbd(ROUND_POWER_OF_TWO(dst[c] + src[c], 1), bd);
-      }
-      src += src_stride;
-      dst += dst_stride;
-    }
-  }
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h,
+                                   const ConvolveParams *conv_params) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                             src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
+                             x_step_q4, w, intermediate_height,
+                             conv_params->round_0);
+  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+                            MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
+                            y_step_q4, w, h, conv_params->round_1);
 }
 
-void av1_highbd_convolve_horiz_facade(const uint8_t *src8, int src_stride,
-                                      uint8_t *dst8, int dst_stride, int w,
-                                      int h,
-                                      const InterpFilterParams filter_params,
-                                      const int subpel_x_q4, int x_step_q4,
-                                      int avg, int bd) {
+static void highbd_convolve_add_src_horiz_hip(
+    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
+    int x_step_q4, int w, int h, int round0_bits, int bd) {
+  const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  if (filter_params.taps == SUBPEL_TAPS) {
-    const int16_t *filter_x =
-        av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
-    if (avg == 0)
-      aom_highbd_convolve8_horiz(src8, src_stride, dst8, dst_stride, filter_x,
-                                 x_step_q4, NULL, -1, w, h, bd);
-    else
-      aom_highbd_convolve8_avg_horiz(src8, src_stride, dst8, dst_stride,
-                                     filter_x, x_step_q4, NULL, -1, w, h, bd);
-  } else {
-    av1_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h,
-                              filter_params, subpel_x_q4, x_step_q4, avg, bd);
-  }
-}
-
-void av1_highbd_convolve_horiz_facade_scale(
-    const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w,
-    int h, const InterpFilterParams filter_params, const int subpel_x_qn,
-    int x_step_qn, int avg, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  // TODO(debargha): Add special functions for filter_params.taps == SUBPEL_TAPS
-  // as in the function above.
-  av1_highbd_convolve_horiz_scale(src, src_stride, dst, dst_stride, w, h,
-                                  filter_params, subpel_x_qn, x_step_qn, avg,
-                                  bd);
-}
-
-void av1_highbd_convolve_vert_facade(const uint8_t *src8, int src_stride,
-                                     uint8_t *dst8, int dst_stride, int w,
-                                     int h,
-                                     const InterpFilterParams filter_params,
-                                     const int subpel_y_q4, int y_step_q4,
-                                     int avg, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-
-  if (filter_params.taps == SUBPEL_TAPS) {
-    const int16_t *filter_y =
-        av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
-    if (avg == 0) {
-      aom_highbd_convolve8_vert(src8, src_stride, dst8, dst_stride, NULL, -1,
-                                filter_y, y_step_q4, w, h, bd);
-    } else {
-      aom_highbd_convolve8_avg_vert(src8, src_stride, dst8, dst_stride, NULL,
-                                    -1, filter_y, y_step_q4, w, h, bd);
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (int x = 0; x < w; ++x) {
+      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
+                           (1 << (bd + FILTER_BITS - 1));
+      const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
+      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
+                               extraprec_clamp_limit - 1);
+      x_q4 += x_step_q4;
     }
-  } else {
-    av1_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h,
-                             filter_params, subpel_y_q4, y_step_q4, avg, bd);
+    src += src_stride;
+    dst += dst_stride;
   }
 }
 
-void av1_highbd_convolve_vert_facade_scale(
-    const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w,
-    int h, const InterpFilterParams filter_params, const int subpel_y_qn,
-    int y_step_qn, int avg, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  // TODO(debargha): Add special functions for filter_params.taps == SUBPEL_TAPS
-  // as in the function above.
-  av1_highbd_convolve_vert_scale(src, src_stride, dst, dst_stride, w, h,
-                                 filter_params, subpel_y_qn, y_step_qn, avg,
-                                 bd);
-}
-
-void av1_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
-                         int dst_stride, int w, int h,
-                         InterpFilters interp_filters, const int subpel_x_q4,
-                         int x_step_q4, const int subpel_y_q4, int y_step_q4,
-                         int ref_idx, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+static void highbd_convolve_add_src_vert_hip(
+    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
+    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
+    int y_step_q4, int w, int h, int round1_bits, int bd) {
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  int ignore_horiz = x_step_q4 == SUBPEL_SHIFTS && subpel_x_q4 == 0;
-  int ignore_vert = y_step_q4 == SUBPEL_SHIFTS && subpel_y_q4 == 0;
-
-  assert(w <= MAX_BLOCK_WIDTH);
-  assert(h <= MAX_BLOCK_HEIGHT);
-  assert(y_step_q4 <= MAX_STEP);
-  assert(x_step_q4 <= MAX_STEP);
-
-  if (ignore_horiz && ignore_vert) {
-    highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx, bd);
-    return;
-  }
-
-  InterpFilterParams filter_params_x, filter_params_y;
-  av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
-                                 &filter_params_y);
-
-  if (ignore_vert) {
-    av1_highbd_convolve_horiz_facade(src8, src_stride, dst8, dst_stride, w, h,
-                                     filter_params_x, subpel_x_q4, x_step_q4,
-                                     ref_idx, bd);
-  } else if (ignore_horiz) {
-    av1_highbd_convolve_vert_facade(src8, src_stride, dst8, dst_stride, w, h,
-                                    filter_params_y, subpel_y_q4, y_step_q4,
-                                    ref_idx, bd);
-  } else {
-    // temp's size is set to a 256 aligned value to facilitate SIMD
-    // implementation. The value is greater than (maximum possible intermediate
-    // height or width) * MAX_SB_SIZE
-    DECLARE_ALIGNED(16, uint16_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    uint8_t *temp8 = CONVERT_TO_BYTEPTR(temp);
-    int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
-    int filter_size;
-
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-    av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y);
-
-    if (filter_params_y.taps < filter_params_x.taps) {
-      int intermediate_width;
-      int temp_stride = max_intermediate_size;
-      filter_size = filter_params_x.taps;
-      intermediate_width =
-          (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size;
-      assert(intermediate_width <= max_intermediate_size);
-
-      assert(filter_params_y.taps <= MAX_FILTER_TAP);
-
-      av1_highbd_convolve_vert_facade(src8 - (filter_size / 2 - 1), src_stride,
-                                      temp8, temp_stride, intermediate_width, h,
-                                      filter_params_y, subpel_y_q4, y_step_q4,
-                                      0, bd);
-
-      assert(filter_params_x.taps <= MAX_FILTER_TAP);
-
-      av1_highbd_convolve_horiz_facade(
-          temp8 + (filter_size / 2 - 1), temp_stride, dst8, dst_stride, w, h,
-          filter_params_x, subpel_x_q4, x_step_q4, ref_idx, bd);
-    } else
-#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-    {
-      int intermediate_height;
-      int temp_stride = MAX_SB_SIZE;
-      filter_size = filter_params_y.taps;
-
-      intermediate_height =
-          (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
-      assert(intermediate_height <= max_intermediate_size);
-      (void)max_intermediate_size;
-
-      av1_highbd_convolve_horiz_facade(
-          src8 - src_stride * (filter_size / 2 - 1), src_stride, temp8,
-          temp_stride, w, intermediate_height, filter_params_x, subpel_x_q4,
-          x_step_q4, 0, bd);
-
-      filter_size = filter_params_y.taps;
-      assert(filter_params_y.taps <= MAX_FILTER_TAP);
-
-      av1_highbd_convolve_vert_facade(
-          temp8 + temp_stride * (filter_size / 2 - 1), temp_stride, dst8,
-          dst_stride, w, h, filter_params_y, subpel_y_q4, y_step_q4, ref_idx,
-          bd);
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (int x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (int y = 0; y < h; ++y) {
+      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      const int rounding =
+          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
+          (1 << (bd + round1_bits - 1));
+      const int sum =
+          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
+      dst[y * dst_stride] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
+      y_q4 += y_step_q4;
     }
+    ++src;
+    ++dst;
   }
 }
 
-void av1_highbd_convolve_scale(const uint8_t *src8, int src_stride,
-                               uint8_t *dst8, int dst_stride, int w, int h,
-                               InterpFilters interp_filters,
-                               const int subpel_x_qn, int x_step_qn,
-                               const int subpel_y_qn, int y_step_qn,
-                               int ref_idx, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  int ignore_horiz = x_step_qn == SCALE_SUBPEL_SHIFTS && subpel_x_qn == 0;
-  int ignore_vert = y_step_qn == SCALE_SUBPEL_SHIFTS && subpel_y_qn == 0;
-
-  assert(w <= MAX_BLOCK_WIDTH);
-  assert(h <= MAX_BLOCK_HEIGHT);
-  assert(y_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));
-  assert(x_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));
-
-  if (ignore_horiz && ignore_vert) {
-    highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx, bd);
-    return;
-  }
-
-  InterpFilterParams filter_params_x, filter_params_y;
-  av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
-                                 &filter_params_y);
-
-  if (ignore_vert) {
-    av1_highbd_convolve_horiz_facade_scale(src8, src_stride, dst8, dst_stride,
-                                           w, h, filter_params_x, subpel_x_qn,
-                                           x_step_qn, ref_idx, bd);
-  } else if (ignore_horiz) {
-    av1_highbd_convolve_vert_facade_scale(src8, src_stride, dst8, dst_stride, w,
-                                          h, filter_params_y, subpel_y_qn,
-                                          y_step_qn, ref_idx, bd);
-  } else {
-    // temp's size is set to a 256 aligned value to facilitate SIMD
-    // implementation. The value is greater than (maximum possible intermediate
-    // height or width) * MAX_SB_SIZE
-    DECLARE_ALIGNED(16, uint16_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    uint8_t *temp8 = CONVERT_TO_BYTEPTR(temp);
-    int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
-    int filter_size;
-
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-    av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y);
-
-    if (filter_params_y.taps < filter_params_x.taps) {
-      int intermediate_width;
-      int temp_stride = max_intermediate_size;
-      filter_size = filter_params_x.taps;
-      intermediate_width =
-          (((w - 1) * x_step_qn + subpel_x_qn) >> SCALE_SUBPEL_BITS) +
-          filter_size;
-      assert(intermediate_width <= max_intermediate_size);
-
-      assert(filter_params_y.taps <= MAX_FILTER_TAP);
-
-      av1_highbd_convolve_vert_facade_scale(
-          src8 - (filter_size / 2 - 1), src_stride, temp8, temp_stride,
-          intermediate_width, h, filter_params_y, subpel_y_qn, y_step_qn, 0,
-          bd);
-
-      assert(filter_params_x.taps <= MAX_FILTER_TAP);
-
-      av1_highbd_convolve_horiz_facade_scale(
-          temp8 + (filter_size / 2 - 1), temp_stride, dst8, dst_stride, w, h,
-          filter_params_x, subpel_x_qn, x_step_qn, ref_idx, bd);
-    } else {
-#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-      int intermediate_height;
-      int temp_stride = MAX_SB_SIZE;
-      filter_size = filter_params_y.taps;
-      intermediate_height =
-          (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
-          filter_size;
-      assert(intermediate_height <= max_intermediate_size);
-      (void)max_intermediate_size;
-
-      av1_highbd_convolve_horiz_facade_scale(
-          src8 - src_stride * (filter_size / 2 - 1), src_stride, temp8,
-          temp_stride, w, intermediate_height, filter_params_x, subpel_x_qn,
-          x_step_qn, 0, bd);
-
-      filter_size = filter_params_y.taps;
-      assert(filter_params_y.taps <= MAX_FILTER_TAP);
-
-      av1_highbd_convolve_vert_facade_scale(
-          temp8 + temp_stride * (filter_size / 2 - 1), temp_stride, dst8,
-          dst_stride, w, h, filter_params_y, subpel_y_qn, y_step_qn, ref_idx,
-          bd);
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-    }
-#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-  }
+void av1_highbd_wiener_convolve_add_src_c(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h,
+    const ConvolveParams *conv_params, int bd) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+
+  highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                                    src_stride, temp, MAX_SB_SIZE, filters_x,
+                                    x0_q4, x_step_q4, w, intermediate_height,
+                                    conv_params->round_0, bd);
+  highbd_convolve_add_src_vert_hip(
+      temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
+      filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
index c43f649e0..1b2c2d0d5 100644
--- a/third_party/aom/av1/common/convolve.h
+++ b/third_party/aom/av1/common/convolve.h
@@ -17,140 +17,119 @@
 extern "C" {
 #endif
 
-typedef enum CONVOLVE_OPT {
-  // indicate the results in dst buf is rounded by FILTER_BITS or not
-  CONVOLVE_OPT_ROUND,
-  CONVOLVE_OPT_NO_ROUND,
-} CONVOLVE_OPT;
-
-typedef int32_t CONV_BUF_TYPE;
-
+typedef uint16_t CONV_BUF_TYPE;
 typedef struct ConvolveParams {
   int ref;
   int do_average;
-  CONVOLVE_OPT round;
   CONV_BUF_TYPE *dst;
   int dst_stride;
   int round_0;
   int round_1;
   int plane;
-  int do_post_rounding;
+  int is_compound;
+  int use_jnt_comp_avg;
+  int fwd_offset;
+  int bck_offset;
 } ConvolveParams;
 
-static INLINE ConvolveParams get_conv_params(int ref, int do_average,
-                                             int plane) {
-  ConvolveParams conv_params;
-  conv_params.ref = ref;
-  conv_params.do_average = do_average;
-  conv_params.round = CONVOLVE_OPT_ROUND;
-  conv_params.plane = plane;
-  conv_params.do_post_rounding = 0;
-  return conv_params;
-}
-
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-static INLINE void av1_convolve_filter_params_fixup_1212(
-    const InterpFilterParams *params_x, InterpFilterParams *params_y) {
-  if (params_x->interp_filter == MULTITAP_SHARP &&
-      params_y->interp_filter == MULTITAP_SHARP) {
-    // Avoid two directions both using 12-tap filter.
-    // This will reduce hardware implementation cost.
-    *params_y = av1_get_interp_filter_params(EIGHTTAP_SHARP);
-  }
-}
-#endif
-
-static INLINE void av1_get_convolve_filter_params(
-    InterpFilters interp_filters, int avoid_1212, InterpFilterParams *params_x,
-    InterpFilterParams *params_y) {
-#if CONFIG_DUAL_FILTER
+#define ROUND0_BITS 3
+#define COMPOUND_ROUND1_BITS 7
+#define WIENER_ROUND0_BITS 3
+
+#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0))
+
+typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  InterpFilterParams *filter_params_x,
+                                  InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params);
+
+typedef void (*aom_highbd_convolve_fn_t)(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, InterpFilterParams *filter_params_x,
+    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters,
+                                                  InterpFilterParams *params_x,
+                                                  InterpFilterParams *params_y,
+                                                  int w, int h) {
   InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
   InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
-#else
-  InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 0);
-  InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
-#endif
-
-  *params_x = av1_get_interp_filter_params(filter_x);
-  *params_y = av1_get_interp_filter_params(filter_y);
-
-  if (avoid_1212) {
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-    convolve_filter_params_fixup_1212(params_x, params_y);
-#endif
-  }
+  *params_x = av1_get_interp_filter_params_with_block_size(filter_x, w);
+  *params_y = av1_get_interp_filter_params_with_block_size(filter_y, h);
 }
 
 struct AV1Common;
-void av1_convolve_init(struct AV1Common *cm);
+struct scale_factors;
 
-#if CONFIG_CONVOLVE_ROUND
 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             InterpFilters interp_filters, const int subpel_x_q4,
                             int x_step_q4, const int subpel_y_q4, int y_step_q4,
-                            int scaled, ConvolveParams *conv_params);
+                            int scaled, ConvolveParams *conv_params,
+                            const struct scale_factors *sf);
 
 static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
-                                                      int plane, int32_t *dst,
-                                                      int dst_stride) {
+                                                      int plane,
+                                                      CONV_BUF_TYPE *dst,
+                                                      int dst_stride,
+                                                      int is_compound, int bd) {
   ConvolveParams conv_params;
   conv_params.ref = ref;
   conv_params.do_average = do_average;
-  conv_params.round = CONVOLVE_OPT_NO_ROUND;
-#if CONFIG_COMPOUND_ROUND
-  conv_params.round_0 = FILTER_BITS;
-#else
-  conv_params.round_0 = 5;
-#endif
-  conv_params.round_1 = 0;
+  assert(IMPLIES(do_average, is_compound));
+  conv_params.is_compound = is_compound;
+  conv_params.round_0 = ROUND0_BITS;
+  conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
+                                    : 2 * FILTER_BITS - conv_params.round_0;
+  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
+  assert(IMPLIES(bd < 12, intbufrange <= 16));
+  if (intbufrange > 16) {
+    conv_params.round_0 += intbufrange - 16;
+    if (!is_compound) conv_params.round_1 -= intbufrange - 16;
+  }
+  // TODO(yunqing): The following dst should only be valid while
+  // is_compound = 1;
   conv_params.dst = dst;
   conv_params.dst_stride = dst_stride;
   conv_params.plane = plane;
-  conv_params.do_post_rounding = 0;
   return conv_params;
 }
 
-#if CONFIG_HIGHBITDEPTH
+static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane,
+                                             int bd) {
+  return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd);
+}
+
+static INLINE ConvolveParams get_conv_params_wiener(int bd) {
+  ConvolveParams conv_params;
+  (void)bd;
+  conv_params.ref = 0;
+  conv_params.do_average = 0;
+  conv_params.is_compound = 0;
+  conv_params.round_0 = WIENER_ROUND0_BITS;
+  conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
+  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
+  assert(IMPLIES(bd < 12, intbufrange <= 16));
+  if (intbufrange > 16) {
+    conv_params.round_0 += intbufrange - 16;
+    conv_params.round_1 -= intbufrange - 16;
+  }
+  conv_params.dst = NULL;
+  conv_params.dst_stride = 0;
+  conv_params.plane = 0;
+  return conv_params;
+}
+
 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    InterpFilters interp_filters,
                                    const int subpel_x_q4, int x_step_q4,
                                    const int subpel_y_q4, int y_step_q4,
                                    int scaled, ConvolveParams *conv_params,
-                                   int bd);
-#endif
-#endif  // CONFIG_CONVOLVE_ROUND
-
-void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
-                  int dst_stride, int w, int h, InterpFilters interp_filters,
-                  const int subpel_x, int xstep, const int subpel_y, int ystep,
-                  ConvolveParams *conv_params);
-
-void av1_convolve_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                    int dst_stride, int w, int h, InterpFilters interp_filters,
-                    const int subpel_x, int xstep, const int subpel_y,
-                    int ystep, ConvolveParams *conv_params);
-
-void av1_convolve_scale(const uint8_t *src, int src_stride, uint8_t *dst,
-                        int dst_stride, int w, int h,
-                        InterpFilters interp_filters, const int subpel_x,
-                        int xstep, const int subpel_y, int ystep,
-                        ConvolveParams *conv_params);
-
-#if CONFIG_HIGHBITDEPTH
-void av1_highbd_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
-                         int dst_stride, int w, int h,
-                         InterpFilters interp_filters, const int subpel_x,
-                         int xstep, const int subpel_y, int ystep, int avg,
-                         int bd);
-
-void av1_highbd_convolve_scale(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, int w, int h,
-                               InterpFilters interp_filters, const int subpel_x,
-                               int xstep, const int subpel_y, int ystep,
-                               int avg, int bd);
-#endif  // CONFIG_HIGHBITDEPTH
+                                   const struct scale_factors *sf, int bd);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/common/daala_tx.c b/third_party/aom/av1/common/daala_tx.c
deleted file mode 100644
index e5b2372e3..000000000
--- a/third_party/aom/av1/common/daala_tx.c
+++ /dev/null
@@ -1,4331 +0,0 @@
-#include "av1/common/daala_tx.h"
-#include "av1/common/odintrin.h"
-
-/* clang-format off */
-
-# define OD_DCT_RSHIFT(_a, _b) OD_UNBIASED_RSHIFT32(_a, _b)
-
-/* TODO: Daala DCT overflow checks need to be ported as a later test */
-# if defined(OD_DCT_CHECK_OVERFLOW)
-# else
-#  define OD_DCT_OVERFLOW_CHECK(val, scale, offset, idx)
-# endif
-
-#define OD_FDCT_2(p0, p1) \
-  /* Embedded 2-point orthonormal Type-II fDCT. */ \
-  do { \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 13573, 16384, 100); \
-    p0 -= (p1*13573 + 16384) >> 15; \
-    /* 5793/8192 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(p0, 5793, 4096, 101); \
-    p1 += (p0*5793 + 4096) >> 13; \
-    /* 3393/8192 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 3393, 4096, 102); \
-    p0 -= (p1*3393 + 4096) >> 13; \
-  } \
-  while (0)
-
-#define OD_IDCT_2(p0, p1) \
-  /* Embedded 2-point orthonormal Type-II iDCT. */ \
-  do { \
-    /* 3393/8192 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    p0 += (p1*3393 + 4096) >> 13; \
-    /* 5793/8192 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    p1 -= (p0*5793 + 4096) >> 13; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    p0 += (p1*13573 + 16384) >> 15; \
-  } \
-  while (0)
-
-#define OD_FDCT_2_ASYM(p0, p1, p1h) \
-  /* Embedded 2-point asymmetric Type-II fDCT. */ \
-  do { \
-    p0 += p1h; \
-    p1 = p0 - p1; \
-  } \
-  while (0)
-
-#define OD_IDCT_2_ASYM(p0, p1, p1h) \
-  /* Embedded 2-point asymmetric Type-II iDCT. */ \
-  do { \
-    p1 = p0 - p1; \
-    p1h = OD_DCT_RSHIFT(p1, 1); \
-    p0 -= p1h; \
-  } \
-  while (0)
-
-#define OD_FDST_2(p0, p1) \
-  /* Embedded 2-point orthonormal Type-IV fDST. */ \
-  do { \
-    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 10947, 8192, 103); \
-    p0 -= (p1*10947 + 8192) >> 14; \
-    /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(p0, 473, 256, 104); \
-    p1 += (p0*473 + 256) >> 9; \
-    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 10947, 8192, 105); \
-    p0 -= (p1*10947 + 8192) >> 14; \
-  } \
-  while (0)
-
-#define OD_IDST_2(p0, p1) \
-  /* Embedded 2-point orthonormal Type-IV iDST. */ \
-  do { \
-    /* 10947/16384 ~= Tan[3*Pi/16]) ~= 0.668178637919299 */ \
-    p0 += (p1*10947 + 8192) >> 14; \
-    /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    p1 -= (p0*473 + 256) >> 9; \
-    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    p0 += (p1*10947 + 8192) >> 14; \
-  } \
-  while (0)
-
-#define OD_FDST_2_ASYM(p0, p1) \
-  /* Embedded 2-point asymmetric Type-IV fDST. */ \
-  do { \
-    /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 11507, 8192, 187); \
-    p0 -= (p1*11507 + 8192) >> 14; \
-    /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \
-    OD_DCT_OVERFLOW_CHECK(p0, 669, 512, 188); \
-    p1 += (p0*669 + 512) >> 10; \
-    /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \
-    OD_DCT_OVERFLOW_CHECK(p1, 4573, 2048, 189); \
-    p0 -= (p1*4573 + 2048) >> 12; \
-  } \
-  while (0)
-
-#define OD_IDST_2_ASYM(p0, p1) \
-  /* Embedded 2-point asymmetric Type-IV iDST. */ \
-  do { \
-    /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \
-    p0 += (p1*4573 + 2048) >> 12; \
-    /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \
-    p1 -= (p0*669 + 512) >> 10; \
-    /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \
-    p0 += (p1*11507 + 8192) >> 14; \
-  } \
-  while (0)
-
-#define OD_FDCT_4(q0, q2, q1, q3) \
-  /* Embedded 4-point orthonormal Type-II fDCT. */ \
-  do { \
-    int q2h; \
-    int q3h; \
-    q3 = q0 - q3; \
-    q3h = OD_DCT_RSHIFT(q3, 1); \
-    q0 -= q3h; \
-    q2 += q1; \
-    q2h = OD_DCT_RSHIFT(q2, 1); \
-    q1 = q2h - q1; \
-    OD_FDCT_2_ASYM(q0, q2, q2h); \
-    OD_FDST_2_ASYM(q3, q1); \
-  } \
-  while (0)
-
-#define OD_IDCT_4(q0, q2, q1, q3) \
-  /* Embedded 4-point orthonormal Type-II iDCT. */ \
-  do { \
-    int q1h; \
-    int q3h; \
-    OD_IDST_2_ASYM(q3, q2); \
-    OD_IDCT_2_ASYM(q0, q1, q1h); \
-    q3h = OD_DCT_RSHIFT(q3, 1); \
-    q0 += q3h; \
-    q3 = q0 - q3; \
-    q2 = q1h - q2; \
-    q1 -= q2; \
-  } \
-  while (0)
-
-#define OD_FDCT_4_ASYM(q0, q2, q2h, q1, q3, q3h) \
-  /* Embedded 4-point asymmetric Type-II fDCT. */ \
-  do { \
-    q0 += q3h; \
-    q3 = q0 - q3; \
-    q1 = q2h - q1; \
-    q2 = q1 - q2; \
-    OD_FDCT_2(q0, q2); \
-    OD_FDST_2(q3, q1); \
-  } \
-  while (0)
-
-#define OD_IDCT_4_ASYM(q0, q2, q1, q1h, q3, q3h) \
-  /* Embedded 4-point asymmetric Type-II iDCT. */ \
-  do { \
-    OD_IDST_2(q3, q2); \
-    OD_IDCT_2(q0, q1); \
-    q1 = q2 - q1; \
-    q1h = OD_DCT_RSHIFT(q1, 1); \
-    q2 = q1h - q2; \
-    q3 = q0 - q3; \
-    q3h = OD_DCT_RSHIFT(q3, 1); \
-    q0 -= q3h; \
-  } \
-  while (0)
-
-#define OD_FDST_4(q0, q2, q1, q3) \
-  /* Embedded 4-point orthonormal Type-IV fDST. */ \
-  do { \
-    int q0h; \
-    int q1h; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(q1, 13573, 16384, 190); \
-    q2 += (q1*13573 + 16384) >> 15; \
-    /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(q2, 5793, 4096, 191); \
-    q1 -= (q2*5793 + 4096) >> 13; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(q1, 3393, 4096, 192); \
-    q2 += (q1*3393 + 4096) >> 13; \
-    q0 += q2; \
-    q0h = OD_DCT_RSHIFT(q0, 1); \
-    q2 = q0h - q2; \
-    q1 += q3; \
-    q1h = OD_DCT_RSHIFT(q1, 1); \
-    q3 -= q1h; \
-    /* 537/1024 ~= (1/Sqrt[2] - Cos[3*Pi/16]/2)/Sin[3*Pi/16] ~=
-        0.524455699240090 */ \
-    OD_DCT_OVERFLOW_CHECK(q1, 537, 512, 193); \
-    q2 -= (q1*537 + 512) >> 10; \
-    /* 1609/2048 ~= Sqrt[2]*Sin[3*Pi/16] ~= 0.785694958387102 */ \
-    OD_DCT_OVERFLOW_CHECK(q2, 1609, 1024, 194); \
-    q1 += (q2*1609 + 1024) >> 11; \
-    /* 7335/32768 ~= (1/Sqrt[2] - Cos[3*Pi/16])/Sin[3*Pi/16] ~=
-        0.223847182092655 */ \
-    OD_DCT_OVERFLOW_CHECK(q1, 7335, 16384, 195); \
-    q2 += (q1*7335 + 16384) >> 15; \
-    /* 5091/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16]/2)/Sin[7*Pi/16] ~=
-        0.6215036383171189 */ \
-    OD_DCT_OVERFLOW_CHECK(q0, 5091, 4096, 196); \
-    q3 += (q0*5091 + 4096) >> 13; \
-    /* 5681/4096 ~= Sqrt[2]*Sin[7*Pi/16] ~= 1.38703984532215 */ \
-    OD_DCT_OVERFLOW_CHECK(q3, 5681, 2048, 197); \
-    q0 -= (q3*5681 + 2048) >> 12; \
-    /* 4277/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16])/Sin[7*Pi/16] ~=
-        0.52204745462729 */ \
-    OD_DCT_OVERFLOW_CHECK(q0, 4277, 4096, 198); \
-    q3 += (q0*4277 + 4096) >> 13; \
-  } \
-  while (0)
-
-#define OD_IDST_4(q0, q2, q1, q3) \
-  /* Embedded 4-point orthonormal Type-IV iDST. */ \
-  do { \
-    int q0h; \
-    int q2h; \
-    /* 4277/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16])/Sin[7*Pi/16] ~=
-        0.52204745462729 */ \
-    q3 -= (q0*4277 + 4096) >> 13; \
-    /* 5681/4096 ~= Sqrt[2]*Sin[7*Pi/16] ~= 1.38703984532215 */ \
-    q0 += (q3*5681 + 2048) >> 12; \
-    /* 5091/8192 ~= (1/Sqrt[2] - Cos[7*Pi/16]/2)/Sin[7*Pi/16] ~=
-        0.6215036383171189 */ \
-    q3 -= (q0*5091 + 4096) >> 13; \
-    /* 7335/32768 ~= (1/Sqrt[2] - Cos[3*Pi/16])/Sin[3*Pi/16] ~=
-        0.223847182092655 */ \
-    q1 -= (q2*7335 + 16384) >> 15; \
-    /* 1609/2048 ~= Sqrt[2]*Sin[3*Pi/16] ~= 0.785694958387102 */ \
-    q2 -= (q1*1609 + 1024) >> 11; \
-    /* 537/1024 ~= (1/Sqrt[2] - Cos[3*Pi/16]/2)/Sin[3*Pi/16] ~=
-        0.524455699240090 */ \
-    q1 += (q2*537 + 512) >> 10; \
-    q2h = OD_DCT_RSHIFT(q2, 1); \
-    q3 += q2h; \
-    q2 -= q3; \
-    q0h = OD_DCT_RSHIFT(q0, 1); \
-    q1 = q0h - q1; \
-    q0 -= q1; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    q1 -= (q2*3393 + 4096) >> 13; \
-    /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    q2 += (q1*5793 + 4096) >> 13; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    q1 -= (q2*13573 + 16384) >> 15; \
-  } \
-  while (0)
-
-#define OD_FDST_4_ASYM(t0, t0h, t2, t1, t3) \
-  /* Embedded 4-point asymmetric Type-IV fDST. */ \
-  do { \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 7489, 4096, 106); \
-    t2 -= (t1*7489 + 4096) >> 13; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 11585, 8192, 107); \
-    t1 += (t2*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 19195, 16384, 108); \
-    t2 += (t1*19195 + 16384) >> 15; \
-    t3 += OD_DCT_RSHIFT(t2, 1); \
-    t2 -= t3; \
-    t1 = t0h - t1; \
-    t0 -= t1; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 6723, 4096, 109); \
-    t3 += (t0*6723 + 4096) >> 13; \
-    /* 8035/8192 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 8035, 4096, 110); \
-    t0 -= (t3*8035 + 4096) >> 13; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 6723, 4096, 111); \
-    t3 += (t0*6723 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 8757, 8192, 112); \
-    t2 += (t1*8757 + 8192) >> 14; \
-    /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 6811, 4096, 113); \
-    t1 -= (t2*6811 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 8757, 8192, 114); \
-    t2 += (t1*8757 + 8192) >> 14; \
-  } \
-  while (0)
-
-#define OD_IDST_4_ASYM(t0, t0h, t2, t1, t3) \
-  /* Embedded 4-point asymmetric Type-IV iDST. */ \
-  do { \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    t1 -= (t2*8757 + 8192) >> 14; \
-    /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    t2 += (t1*6811 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    t1 -= (t2*8757 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    t3 -= (t0*6723 + 4096) >> 13; \
-    /* 8035/8192 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    t0 += (t3*8035 + 4096) >> 13; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    t3 -= (t0*6723 + 4096) >> 13; \
-    t0 += t2; \
-    t0h = OD_DCT_RSHIFT(t0, 1); \
-    t2 = t0h - t2; \
-    t1 += t3; \
-    t3 -= OD_DCT_RSHIFT(t1, 1); \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    t1 -= (t2*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    t2 -= (t1*11585 + 8192) >> 14; \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    t1 += (t2*7489 + 4096) >> 13; \
-  } \
-  while (0)
-
-#define OD_FDCT_8(r0, r4, r2, r6, r1, r5, r3, r7) \
-  /* Embedded 8-point orthonormal Type-II fDCT. */ \
-  do { \
-    int r4h; \
-    int r5h; \
-    int r6h; \
-    int r7h; \
-    r7 = r0 - r7; \
-    r7h = OD_DCT_RSHIFT(r7, 1); \
-    r0 -= r7h; \
-    r6 += r1; \
-    r6h = OD_DCT_RSHIFT(r6, 1); \
-    r1 = r6h - r1; \
-    r5 = r2 - r5; \
-    r5h = OD_DCT_RSHIFT(r5, 1); \
-    r2 -= r5h; \
-    r4 += r3; \
-    r4h = OD_DCT_RSHIFT(r4, 1); \
-    r3 = r4h - r3; \
-    OD_FDCT_4_ASYM(r0, r4, r4h, r2, r6, r6h); \
-    OD_FDST_4_ASYM(r7, r7h, r3, r5, r1); \
-  } \
-  while (0)
-
-#define OD_IDCT_8(r0, r4, r2, r6, r1, r5, r3, r7) \
-  /* Embedded 8-point orthonormal Type-II iDCT. */ \
-  do { \
-    int r1h; \
-    int r3h; \
-    int r5h; \
-    int r7h; \
-    OD_IDST_4_ASYM(r7, r7h, r5, r6, r4); \
-    OD_IDCT_4_ASYM(r0, r2, r1, r1h, r3, r3h); \
-    r0 += r7h; \
-    r7 = r0 - r7; \
-    r6 = r1h - r6; \
-    r1 -= r6; \
-    r5h = OD_DCT_RSHIFT(r5, 1); \
-    r2 += r5h; \
-    r5 = r2 - r5; \
-    r4 = r3h - r4; \
-    r3 -= r4; \
-  } \
-  while (0)
-
-#define OD_FDCT_8_ASYM(r0, r4, r4h, r2, r6, r6h, r1, r5, r5h, r3, r7, r7h) \
-  /* Embedded 8-point asymmetric Type-II fDCT. */ \
-  do { \
-    r0 += r7h; \
-    r7 = r0 - r7; \
-    r1 = r6h - r1; \
-    r6 -= r1; \
-    r2 += r5h; \
-    r5 = r2 - r5; \
-    r3 = r4h - r3; \
-    r4 -= r3; \
-    OD_FDCT_4(r0, r4, r2, r6); \
-    OD_FDST_4(r7, r3, r5, r1); \
-  } \
-  while (0)
-
-#define OD_IDCT_8_ASYM(r0, r4, r2, r6, r1, r1h, r5, r5h, r3, r3h, r7, r7h) \
-  /* Embedded 8-point asymmetric Type-II iDCT. */ \
-  do { \
-    OD_IDST_4(r7, r5, r6, r4); \
-    OD_IDCT_4(r0, r2, r1, r3); \
-    r7 = r0 - r7; \
-    r7h = OD_DCT_RSHIFT(r7, 1); \
-    r0 -= r7h; \
-    r1 += r6; \
-    r1h = OD_DCT_RSHIFT(r1, 1); \
-    r6 = r1h - r6; \
-    r5 = r2 - r5; \
-    r5h = OD_DCT_RSHIFT(r5, 1); \
-    r2 -= r5h; \
-    r3 += r4; \
-    r3h = OD_DCT_RSHIFT(r3, 1); \
-    r4 = r3h - r4; \
-  } \
-  while (0)
-
-#define OD_FDST_8(t0, t4, t2, t6, t1, t5, t3, t7)  \
-  /* Embedded 8-point orthonormal Type-IV fDST. */ \
-  do { \
-    int t0h; \
-    int t2h; \
-    int t5h; \
-    int t7h; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 13573, 16384, 115); \
-    t6 -= (t1*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 11585, 8192, 116); \
-    t1 += (t6*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 13573, 16384, 117); \
-    t6 -= (t1*13573 + 16384) >> 15; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 21895, 16384, 118); \
-    t5 -= (t2*21895 + 16384) >> 15; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 15137, 8192, 119); \
-    t2 += (t5*15137 + 8192) >> 14; \
-    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 10947, 8192, 120); \
-    t5 -= (t2*10947 + 8192) >> 14; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 3259, 8192, 121); \
-    t4 -= (t3*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 3135, 4096, 122); \
-    t3 += (t4*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 3259, 8192, 123); \
-    t4 -= (t3*3259 + 8192) >> 14; \
-    t7 += t1; \
-    t7h = OD_DCT_RSHIFT(t7, 1); \
-    t1 -= t7h; \
-    t2 = t3 - t2; \
-    t2h = OD_DCT_RSHIFT(t2, 1); \
-    t3 -= t2h; \
-    t0 -= t6; \
-    t0h = OD_DCT_RSHIFT(t0, 1); \
-    t6 += t0h; \
-    t5 = t4 - t5; \
-    t5h = OD_DCT_RSHIFT(t5, 1); \
-    t4 -= t5h; \
-    t1 += t5h; \
-    t5 = t1 - t5; \
-    t4 += t0h; \
-    t0 -= t4; \
-    t6 -= t2h; \
-    t2 += t6; \
-    t3 -= t7h; \
-    t7 += t3; \
-    /* TODO: Can we move this into another operation */ \
-    t7 = -t7; \
-    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 7425, 4096, 124); \
-    t0 -= (t7*7425 + 4096) >> 13; \
-    /* 8153/8192 ~= Sin[15*Pi/32] ~= 0.995184726672197 */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 8153, 4096, 125); \
-    t7 += (t0*8153 + 4096) >> 13; \
-    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 7425, 4096, 126); \
-    t0 -= (t7*7425 + 4096) >> 13; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 4861, 16384, 127); \
-    t6 -= (t1*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.290284677254462 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 1189, 2048, 128); \
-    t1 += (t6*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 4861, 16384, 129); \
-    t6 -= (t1*4861 + 16384) >> 15; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 2455, 2048, 130); \
-    t2 -= (t5*2455 + 2048) >> 12; \
-    /* 7225/8192 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 7225, 4096, 131); \
-    t5 += (t2*7225 + 4096) >> 13; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 2455, 2048, 132); \
-    t2 -= (t5*2455 + 2048) >> 12; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 11725, 16384, 133); \
-    t4 -= (t3*11725 + 16384) >> 15; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.634393284163645 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 5197, 4096, 134); \
-    t3 += (t4*5197 + 4096) >> 13; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 11725, 16384, 135); \
-    t4 -= (t3*11725 + 16384) >> 15; \
-  } \
-  while (0)
-
-#define OD_IDST_8(t0, t4, t2, t6, t1, t5, t3, t7) \
-  /* Embedded 8-point orthonormal Type-IV iDST. */ \
-  do { \
-    int t0h; \
-    int t2h; \
-    int t5h_; \
-    int t7h_; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
-    t1 += (t6*11725 + 16384) >> 15; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.634393284163645 */ \
-    t6 -= (t1*5197 + 4096) >> 13; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.357805721314524 */ \
-    t1 += (t6*11725 + 16384) >> 15; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
-    t2 += (t5*2455 + 2048) >> 12; \
-    /* 7225/8192 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
-    t5 -= (t2*7225 + 4096) >> 13; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.599376933681924 */ \
-    t2 += (t5*2455 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
-    t3 += (t4*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.290284677254462 */ \
-    t4 -= (t3*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.148335987538347 */ \
-    t3 += (t4*4861 + 16384) >> 15; \
-    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
-    t0 += (t7*7425 + 4096) >> 13; \
-    /* 8153/8192 ~= Sin[15*Pi/32] ~= 0.995184726672197 */ \
-    t7 -= (t0*8153 + 4096) >> 13; \
-    /* 7425/8192 ~= Tan[15*Pi/64] ~= 0.906347169019147 */ \
-    t0 += (t7*7425 + 4096) >> 13; \
-    /* TODO: Can we move this into another operation */ \
-    t7 = -t7; \
-    t7 -= t6; \
-    t7h_ = OD_DCT_RSHIFT(t7, 1); \
-    t6 += t7h_; \
-    t2 -= t3; \
-    t2h = OD_DCT_RSHIFT(t2, 1); \
-    t3 += t2h; \
-    t0 += t1; \
-    t0h = OD_DCT_RSHIFT(t0, 1); \
-    t1 -= t0h; \
-    t5 = t4 - t5; \
-    t5h_ = OD_DCT_RSHIFT(t5, 1); \
-    t4 -= t5h_; \
-    t1 += t5h_; \
-    t5 = t1 - t5; \
-    t3 -= t0h; \
-    t0 += t3; \
-    t6 += t2h; \
-    t2 = t6 - t2; \
-    t4 += t7h_; \
-    t7 -= t4; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    t1 += (t6*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    t6 -= (t1*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    t1 += (t6*3259 + 8192) >> 14; \
-    /* 10947/16384 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    t5 += (t2*10947 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t2 -= (t5*15137 + 8192) >> 14; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    t5 += (t2*21895 + 16384) >> 15; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    t3 += (t4*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    t4 -= (t3*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    t3 += (t4*13573 + 16384) >> 15; \
-  } \
-  while (0)
-
-/* Rewrite this so that t0h can be passed in. */
-#define OD_FDST_8_ASYM(t0, t4, t2, t6, t1, t5, t3, t7) \
-  /* Embedded 8-point asymmetric Type-IV fDST. */ \
-  do { \
-    int t0h; \
-    int t2h; \
-    int t5h; \
-    int t7h; \
-    /* 1035/2048 ~= (Sqrt[2] - Cos[7*Pi/32])/(2*Sin[7*Pi/32]) */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 1035, 1024, 199); \
-    t6 += (t1*1035 + 1024) >> 11; \
-    /* 3675/4096 ~= Sqrt[2]*Sin[7*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 3675, 2048, 200); \
-    t1 -= (t6*3675 + 2048) >> 12; \
-    /* 851/8192 ~= (Cos[7*Pi/32] - 1/Sqrt[2])/Sin[7*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 851, 4096, 201); \
-    t6 -= (t1*851 + 4096) >> 13; \
-    /* 4379/8192 ~= (Sqrt[2] - Sin[5*Pi/32])/(2*Cos[5*Pi/32]) */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 4379, 4096, 202); \
-    t5 += (t2*4379 + 4096) >> 13; \
-    /* 10217/8192 ~= Sqrt[2]*Cos[5*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 10217, 4096, 203); \
-    t2 -= (t5*10217 + 4096) >> 13; \
-    /* 4379/16384 ~= (1/Sqrt[2] - Sin[5*Pi/32])/Cos[5*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 4379, 8192, 204); \
-    t5 += (t2*4379 + 8192) >> 14; \
-    /* 12905/16384 ~= (Sqrt[2] - Cos[3*Pi/32])/(2*Sin[3*Pi/32]) */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 12905, 8192, 205); \
-    t4 += (t3*12905 + 8192) >> 14; \
-    /* 3363/8192 ~= Sqrt[2]*Sin[3*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 3363, 4096, 206); \
-    t3 -= (t4*3363 + 4096) >> 13; \
-    /* 3525/4096 ~= (Cos[3*Pi/32] - 1/Sqrt[2])/Sin[3*Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 3525, 2048, 207); \
-    t4 -= (t3*3525 + 2048) >> 12; \
-    /* 5417/8192 ~= (Sqrt[2] - Sin[Pi/32])/(2*Cos[Pi/32]) */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 5417, 4096, 208); \
-    t7 += (t0*5417 + 4096) >> 13; \
-    /* 5765/4096 ~= Sqrt[2]*Cos[Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 5765, 2048, 209); \
-    t0 -= (t7*5765 + 2048) >> 12; \
-    /* 2507/4096 ~= (1/Sqrt[2] - Sin[Pi/32])/Cos[Pi/32] */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 2507, 2048, 210); \
-    t7 += (t0*2507 + 2048) >> 12; \
-    t0 += t1; \
-    t0h = OD_DCT_RSHIFT(t0, 1); \
-    t1 -= t0h; \
-    t2 -= t3; \
-    t2h = OD_DCT_RSHIFT(t2, 1); \
-    t3 += t2h; \
-    t5 -= t4; \
-    t5h = OD_DCT_RSHIFT(t5, 1); \
-    t4 += t5h; \
-    t7 += t6; \
-    t7h = OD_DCT_RSHIFT(t7, 1); \
-    t6 = t7h - t6; \
-    t4 = t7h - t4; \
-    t7 -= t4; \
-    t1 += t5h; \
-    t5 = t1 - t5; \
-    t6 += t2h; \
-    t2 = t6 - t2; \
-    t3 -= t0h; \
-    t0 += t3; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 3259, 8192, 211); \
-    t1 += (t6*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 3135, 4096, 212); \
-    t6 -= (t1*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 3259, 8192, 213); \
-    t1 += (t6*3259 + 8192) >> 14; \
-    /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 2737, 2048, 214); \
-    t5 += (t2*2737 + 2048) >> 12; \
-    /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 473, 256, 215); \
-    t2 -= (t5*473 + 256) >> 9; \
-    /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 2737, 2048, 216); \
-    t5 += (t2*2737 + 2048) >> 12; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 3393, 4096, 217); \
-    t3 += (t4*3393 + 4096) >> 13; \
-    /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 5793, 4096, 218); \
-    t4 -= (t3*5793 + 4096) >> 13; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 3393, 4096, 219); \
-    t3 += (t4*3393 + 4096) >> 13; \
-  } \
-  while (0)
-
-#define OD_IDST_8_ASYM(t0, t4, t2, t6, t1, t5, t3, t7) \
-  /* Embedded 8-point asymmetric Type-IV iDST. */ \
-  do { \
-    int t0h; \
-    int t2h; \
-    int t5h__; \
-    int t7h__; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    t6 -= (t1*3393 + 4096) >> 13; \
-    /* 5793/8192 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    t1 += (t6*5793 + 4096) >> 13; \
-    /* 3393/8192 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    t6 -= (t1*3393 + 4096) >> 13; \
-    /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    t5 -= (t2*2737 + 2048) >> 12; \
-    /* 473/512 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t2 += (t5*473 + 256) >> 9; \
-    /* 2737/4096 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    t5 -= (t2*2737 + 2048) >> 12; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    t4 -= (t3*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    t3 += (t4*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    t4 -= (t3*3259 + 8192) >> 14; \
-    t0 -= t6; \
-    t0h = OD_DCT_RSHIFT(t0, 1); \
-    t6 += t0h; \
-    t2 = t3 - t2; \
-    t2h = OD_DCT_RSHIFT(t2, 1); \
-    t3 -= t2h; \
-    t5 = t4 - t5; \
-    t5h__ = OD_DCT_RSHIFT(t5, 1); \
-    t4 -= t5h__; \
-    t7 += t1; \
-    t7h__ = OD_DCT_RSHIFT(t7, 1); \
-    t1 = t7h__ - t1; \
-    t3 = t7h__ - t3; \
-    t7 -= t3; \
-    t1 -= t5h__; \
-    t5 += t1; \
-    t6 -= t2h; \
-    t2 += t6; \
-    t4 += t0h; \
-    t0 -= t4; \
-    /* 2507/4096 ~= (1/Sqrt[2] - Sin[Pi/32])/Cos[Pi/32] */ \
-    t7 -= (t0*2507 + 2048) >> 12; \
-    /* 5765/4096 ~= Sqrt[2]*Cos[Pi/32] */ \
-    t0 += (t7*5765 + 2048) >> 12; \
-    /* 5417/8192 ~= (Sqrt[2] - Sin[Pi/32])/(2*Cos[Pi/32]) */ \
-    t7 -= (t0*5417 + 4096) >> 13; \
-    /* 3525/4096 ~= (Cos[3*Pi/32] - 1/Sqrt[2])/Sin[3*Pi/32] */ \
-    t1 += (t6*3525 + 2048) >> 12; \
-    /* 3363/8192 ~= Sqrt[2]*Sin[3*Pi/32] */ \
-    t6 += (t1*3363 + 4096) >> 13; \
-    /* 12905/16384 ~= (1/Sqrt[2] - Cos[3*Pi/32]/1)/Sin[3*Pi/32] */ \
-    t1 -= (t6*12905 + 8192) >> 14; \
-    /* 4379/16384 ~= (1/Sqrt[2] - Sin[5*Pi/32])/Cos[5*Pi/32] */ \
-    t5 -= (t2*4379 + 8192) >> 14; \
-    /* 10217/8192 ~= Sqrt[2]*Cos[5*Pi/32] */ \
-    t2 += (t5*10217 + 4096) >> 13; \
-    /* 4379/8192 ~= (Sqrt[2] - Sin[5*Pi/32])/(2*Cos[5*Pi/32]) */ \
-    t5 -= (t2*4379 + 4096) >> 13; \
-    /* 851/8192 ~= (Cos[7*Pi/32] - 1/Sqrt[2])/Sin[7*Pi/32] */ \
-    t3 += (t4*851 + 4096) >> 13; \
-    /* 3675/4096 ~= Sqrt[2]*Sin[7*Pi/32] */ \
-    t4 += (t3*3675 + 2048) >> 12; \
-    /* 1035/2048 ~= (Sqrt[2] - Cos[7*Pi/32])/(2*Sin[7*Pi/32]) */ \
-    t3 -= (t4*1035 + 1024) >> 11; \
-  } \
-  while (0)
-
-#define OD_FDCT_16(s0, s8, s4, sc, s2, sa, s6, se, \
-  s1, s9, s5, sd, s3, sb, s7, sf) \
-  /* Embedded 16-point orthonormal Type-II fDCT. */ \
-  do { \
-    int s8h; \
-    int sah; \
-    int sch; \
-    int seh; \
-    int sfh; \
-    sf = s0 - sf; \
-    sfh = OD_DCT_RSHIFT(sf, 1); \
-    s0 -= sfh; \
-    se += s1; \
-    seh = OD_DCT_RSHIFT(se, 1); \
-    s1 = seh - s1; \
-    sd = s2 - sd; \
-    s2 -= OD_DCT_RSHIFT(sd, 1); \
-    sc += s3; \
-    sch = OD_DCT_RSHIFT(sc, 1); \
-    s3 = sch - s3; \
-    sb = s4 - sb; \
-    s4 -= OD_DCT_RSHIFT(sb, 1); \
-    sa += s5; \
-    sah = OD_DCT_RSHIFT(sa, 1); \
-    s5 = sah - s5; \
-    s9 = s6 - s9; \
-    s6 -= OD_DCT_RSHIFT(s9, 1); \
-    s8 += s7; \
-    s8h = OD_DCT_RSHIFT(s8, 1); \
-    s7 = s8h - s7; \
-    OD_FDCT_8_ASYM(s0, s8, s8h, s4, sc, sch, s2, sa, sah, s6, se, seh); \
-    OD_FDST_8_ASYM(sf, s7, sb, s3, sd, s5, s9, s1); \
-  } \
-  while (0)
-
-#define OD_IDCT_16(s0, s8, s4, sc, s2, sa, s6, se, \
-  s1, s9, s5, sd, s3, sb, s7, sf) \
-  /* Embedded 16-point orthonormal Type-II iDCT. */ \
-  do { \
-    int s1h; \
-    int s3h; \
-    int s5h; \
-    int s7h; \
-    int sfh; \
-    OD_IDST_8_ASYM(sf, sb, sd, s9, se, sa, sc, s8); \
-    OD_IDCT_8_ASYM(s0, s4, s2, s6, s1, s1h, s5, s5h, s3, s3h, s7, s7h); \
-    sfh = OD_DCT_RSHIFT(sf, 1); \
-    s0 += sfh; \
-    sf = s0 - sf; \
-    se = s1h - se; \
-    s1 -= se; \
-    s2 += OD_DCT_RSHIFT(sd, 1); \
-    sd = s2 - sd; \
-    sc = s3h - sc; \
-    s3 -= sc; \
-    s4 += OD_DCT_RSHIFT(sb, 1); \
-    sb = s4 - sb; \
-    sa = s5h - sa; \
-    s5 -= sa; \
-    s6 += OD_DCT_RSHIFT(s9, 1); \
-    s9 = s6 - s9; \
-    s8 = s7h - s8; \
-    s7 -= s8; \
-  } \
-  while (0)
-
-#define OD_FDCT_16_ASYM(t0, t8, t8h, t4, tc, tch, t2, ta, tah, t6, te, teh, \
-  t1, t9, t9h, t5, td, tdh, t3, tb, tbh, t7, tf, tfh) \
-  /* Embedded 16-point asymmetric Type-II fDCT. */ \
-  do { \
-    t0 += tfh; \
-    tf = t0 - tf; \
-    t1 -= teh; \
-    te += t1; \
-    t2 += tdh; \
-    td = t2 - td; \
-    t3 -= tch; \
-    tc += t3; \
-    t4 += tbh; \
-    tb = t4 - tb; \
-    t5 -= tah; \
-    ta += t5; \
-    t6 += t9h; \
-    t9 = t6 - t9; \
-    t7 -= t8h; \
-    t8 += t7; \
-    OD_FDCT_8(t0, t8, t4, tc, t2, ta, t6, te); \
-    OD_FDST_8(tf, t7, tb, t3, td, t5, t9, t1); \
-  } \
-  while (0)
-
-#define OD_IDCT_16_ASYM(t0, t8, t4, tc, t2, ta, t6, te, \
-  t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh) \
-  /* Embedded 16-point asymmetric Type-II iDCT. */ \
-  do { \
-    OD_IDST_8(tf, tb, td, t9, te, ta, tc, t8); \
-    OD_IDCT_8(t0, t4, t2, t6, t1, t5, t3, t7); \
-    t1 -= te; \
-    t1h = OD_DCT_RSHIFT(t1, 1); \
-    te += t1h; \
-    t9 = t6 - t9; \
-    t9h = OD_DCT_RSHIFT(t9, 1); \
-    t6 -= t9h; \
-    t5 -= ta; \
-    t5h = OD_DCT_RSHIFT(t5, 1); \
-    ta += t5h; \
-    td = t2 - td; \
-    tdh = OD_DCT_RSHIFT(td, 1); \
-    t2 -= tdh; \
-    t3 -= tc; \
-    t3h = OD_DCT_RSHIFT(t3, 1); \
-    tc += t3h; \
-    tb = t4 - tb; \
-    tbh = OD_DCT_RSHIFT(tb, 1); \
-    t4 -= tbh; \
-    t7 -= t8; \
-    t7h = OD_DCT_RSHIFT(t7, 1); \
-    t8 += t7h; \
-    tf = t0 - tf; \
-    tfh = OD_DCT_RSHIFT(tf, 1); \
-    t0 -= tfh; \
-  } \
-  while (0)
-
-#define OD_FDST_16(s0, s8, s4, sc, s2, sa, s6, se, \
-  s1, s9, s5, sd, s3, sb, s7, sf) \
-  /* Embedded 16-point orthonormal Type-IV fDST. */ \
-  do { \
-    int s0h; \
-    int s2h; \
-    int sdh; \
-    int sfh; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 13573, 16384, 220); \
-    s1 += (se*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(s1, 11585, 8192, 221); \
-    se -= (s1*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 13573, 16384, 222); \
-    s1 += (se*13573 + 16384) >> 15; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(s2, 21895, 16384, 223); \
-    sd += (s2*21895 + 16384) >> 15; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(sd, 15137, 16384, 224); \
-    s2 -= (sd*15137 + 8192) >> 14; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    OD_DCT_OVERFLOW_CHECK(s2, 21895, 16384, 225); \
-    sd += (s2*21895 + 16384) >> 15; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 3259, 8192, 226); \
-    sc += (s3*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    OD_DCT_OVERFLOW_CHECK(sc, 3135, 4096, 227); \
-    s3 -= (sc*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 3259, 8192, 228); \
-    sc += (s3*3259 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 13573, 16384, 229); \
-    sa += (s5*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(sa, 11585, 8192, 230); \
-    s5 -= (sa*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[Pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 13573, 16384, 231); \
-    sa += (s5*13573 + 16384) >> 15; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 13573, 16384, 232); \
-    s6 += (s9*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    OD_DCT_OVERFLOW_CHECK(s6, 11585, 8192, 233); \
-    s9 -= (s6*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 13573, 16384, 234); \
-    s6 += (s9*13573 + 16384) >> 15; \
-    sf += se; \
-    sfh = OD_DCT_RSHIFT(sf, 1); \
-    se = sfh - se; \
-    s0 += s1; \
-    s0h = OD_DCT_RSHIFT(s0, 1); \
-    s1 = s0h - s1; \
-    s2 = s3 - s2; \
-    s2h = OD_DCT_RSHIFT(s2, 1); \
-    s3 -= s2h; \
-    sd -= sc; \
-    sdh = OD_DCT_RSHIFT(sd, 1); \
-    sc += sdh; \
-    sa = s4 - sa; \
-    s4 -= OD_DCT_RSHIFT(sa, 1); \
-    s5 += sb; \
-    sb = OD_DCT_RSHIFT(s5, 1) - sb; \
-    s8 += s6; \
-    s6 -= OD_DCT_RSHIFT(s8, 1); \
-    s7 = s9 - s7; \
-    s9 -= OD_DCT_RSHIFT(s7, 1); \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(sb, 6723, 4096, 235); \
-    s4 += (sb*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    OD_DCT_OVERFLOW_CHECK(s4, 16069, 8192, 236); \
-    sb -= (s4*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(sb, 6723, 4096, 237); \
-    s4 += (sb*6723 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 8757, 8192, 238); \
-    sa += (s5*8757 + 8192) >> 14; \
-    /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    OD_DCT_OVERFLOW_CHECK(sa, 6811, 4096, 239); \
-    s5 -= (sa*6811 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 8757, 8192, 240); \
-    sa += (s5*8757 + 8192) >> 14; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 2485, 4096, 241); \
-    s6 += (s9*2485 + 4096) >> 13; \
-    /* 4551/8192 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    OD_DCT_OVERFLOW_CHECK(s6, 4551, 4096, 242); \
-    s9 -= (s6*4551 + 4096) >> 13; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 2485, 4096, 243); \
-    s6 += (s9*2485 + 4096) >> 13; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    OD_DCT_OVERFLOW_CHECK(s8, 3227, 16384, 244); \
-    s7 += (s8*3227 + 16384) >> 15; \
-    /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
-    OD_DCT_OVERFLOW_CHECK(s7, 6393, 16384, 245); \
-    s8 -= (s7*6393 + 16384) >> 15; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    OD_DCT_OVERFLOW_CHECK(s8, 3227, 16384, 246); \
-    s7 += (s8*3227 + 16384) >> 15; \
-    s1 -= s2h; \
-    s2 += s1; \
-    se += sdh; \
-    sd = se - sd; \
-    s3 += sfh; \
-    sf -= s3; \
-    sc = s0h - sc; \
-    s0 -= sc; \
-    sb += OD_DCT_RSHIFT(s8, 1); \
-    s8 = sb - s8; \
-    s4 += OD_DCT_RSHIFT(s7, 1); \
-    s7 -= s4; \
-    s6 += OD_DCT_RSHIFT(s5, 1); \
-    s5 = s6 - s5; \
-    s9 -= OD_DCT_RSHIFT(sa, 1); \
-    sa += s9; \
-    s8 += s0; \
-    s0 -= OD_DCT_RSHIFT(s8, 1); \
-    sf += s7; \
-    s7 = OD_DCT_RSHIFT(sf, 1) - s7; \
-    s1 -= s6; \
-    s6 += OD_DCT_RSHIFT(s1, 1); \
-    s9 += se; \
-    se = OD_DCT_RSHIFT(s9, 1) - se; \
-    s2 += sa; \
-    sa = OD_DCT_RSHIFT(s2, 1) - sa; \
-    s5 += sd; \
-    sd -= OD_DCT_RSHIFT(s5, 1); \
-    s4 = sc - s4; \
-    sc -= OD_DCT_RSHIFT(s4, 1); \
-    s3 -= sb; \
-    sb += OD_DCT_RSHIFT(s3, 1); \
-    /* 2799/4096 ~= (1/Sqrt[2] - Cos[31*Pi/64]/2)/Sin[31*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(sf, 2799, 2048, 247); \
-    s0 -= (sf*2799 + 2048) >> 12; \
-    /* 2893/2048 ~= Sqrt[2]*Sin[31*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s0, 2893, 1024, 248); \
-    sf += (s0*2893 + 1024) >> 11; \
-    /* 5397/8192 ~= (Cos[Pi/4] - Cos[31*Pi/64])/Sin[31*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(sf, 5397, 4096, 249); \
-    s0 -= (sf*5397 + 4096) >> 13; \
-    /* 41/64 ~= (1/Sqrt[2] - Cos[29*Pi/64]/2)/Sin[29*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s1, 41, 32, 250); \
-    se += (s1*41 + 32) >> 6; \
-    /* 2865/2048 ~= Sqrt[2]*Sin[29*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(se, 2865, 1024, 251); \
-    s1 -= (se*2865 + 1024) >> 11; \
-    /* 4641/8192 ~= (1/Sqrt[2] - Cos[29*Pi/64])/Sin[29*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s1, 4641, 4096, 252); \
-    se += (s1*4641 + 4096) >> 13; \
-    /* 2473/4096 ~= (1/Sqrt[2] - Cos[27*Pi/64]/2)/Sin[27*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s2, 2473, 2048, 253); \
-    sd += (s2*2473 + 2048) >> 12; \
-    /* 5619/4096 ~= Sqrt[2]*Sin[27*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(sd, 5619, 2048, 254); \
-    s2 -= (sd*5619 + 2048) >> 12; \
-    /* 7839/16384 ~= (1/Sqrt[2] - Cos[27*Pi/64])/Sin[27*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s2, 7839, 8192, 255); \
-    sd += (s2*7839 + 8192) >> 14; \
-    /* 5747/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64]/2)/Sin[7*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 5747, 4096, 256); \
-    sc -= (s3*5747 + 4096) >> 13; \
-    /* 3903/8192 ~= Sqrt[2]*Sin[7*Pi/64] ~= */ \
-    OD_DCT_OVERFLOW_CHECK(sc, 3903, 4096, 257); \
-    s3 += (sc*3903 + 4096) >> 13; \
-    /* 5701/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64])/Sin[7*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s3, 5701, 4096, 258); \
-    sc += (s3*5701 + 4096) >> 13; \
-    /* 4471/8192 ~= (1/Sqrt[2] - Cos[23*Pi/64]/2)/Sin[23*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s4, 4471, 4096, 259); \
-    sb += (s4*4471 + 4096) >> 13; \
-    /* 1309/1024 ~= Sqrt[2]*Sin[23*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(sb, 1309, 512, 260); \
-    s4 -= (sb*1309 + 512) >> 10; \
-    /* 5067/16384 ~= (1/Sqrt[2] - Cos[23*Pi/64])/Sin[23*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s4, 5067, 8192, 261); \
-    sb += (s4*5067 + 8192) >> 14; \
-    /* 2217/4096 ~= (1/Sqrt[2] - Cos[11*Pi/64]/2)/Sin[11*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 2217, 2048, 262); \
-    sa -= (s5*2217 + 2048) >> 12; \
-    /* 1489/2048 ~= Sqrt[2]*Sin[11*Pi/64] ~= 0.72705107329128 */ \
-    OD_DCT_OVERFLOW_CHECK(sa, 1489, 1024, 263); \
-    s5 += (sa*1489 + 1024) >> 11; \
-    /* 75/256 ~= (1/Sqrt[2] - Cos[11*Pi/64])/Sin[11*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s5, 75, 128, 264); \
-    sa += (s5*75 + 128) >> 8; \
-    /* 2087/4096 ~= (1/Sqrt[2] - Cos[19*Pi/64]/2)/Sin[19*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 2087, 2048, 265); \
-    s6 -= (s9*2087 + 2048) >> 12; \
-    /* 4653/4096 ~= Sqrt[2]*Sin[19*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s6, 4653, 2048, 266); \
-    s9 += (s6*4653 + 2048) >> 12; \
-    /* 4545/32768 ~= (1/Sqrt[2] - Cos[19*Pi/64])/Sin[19*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s9, 4545, 16384, 267); \
-    s6 -= (s9*4545 + 16384) >> 15; \
-    /* 2053/4096 ~= (1/Sqrt[2] - Cos[15*Pi/64]/2)/Sin[15*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s8, 2053, 2048, 268); \
-    s7 += (s8*2053 + 2048) >> 12; \
-    /* 1945/2048 ~= Sqrt[2]*Sin[15*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s7, 1945, 1024, 269); \
-    s8 -= (s7*1945 + 1024) >> 11; \
-    /* 1651/32768 ~= (1/Sqrt[2] - Cos[15*Pi/64])/Sin[15*Pi/64] */ \
-    OD_DCT_OVERFLOW_CHECK(s8, 1651, 16384, 270); \
-    s7 -= (s8*1651 + 16384) >> 15; \
-  } \
-  while (0)
-
-#define OD_IDST_16(s0, s8, s4, sc, s2, sa, s6, se, \
-  s1, s9, s5, sd, s3, sb, s7, sf) \
-  /* Embedded 16-point orthonormal Type-IV iDST. */ \
-  do { \
-    int s0h; \
-    int s4h; \
-    int sbh; \
-    int sfh; \
-    /* 1651/32768 ~= (1/Sqrt[2] - Cos[15*Pi/64])/Sin[15*Pi/64] */ \
-    se += (s1*1651 + 16384) >> 15; \
-    /* 1945/2048 ~= Sqrt[2]*Sin[15*Pi/64] */ \
-    s1 += (se*1945 + 1024) >> 11; \
-    /* 2053/4096 ~= (1/Sqrt[2] - Cos[15*Pi/64]/2)/Sin[15*Pi/64] */ \
-    se -= (s1*2053 + 2048) >> 12; \
-    /* 4545/32768 ~= (1/Sqrt[2] - Cos[19*Pi/64])/Sin[19*Pi/64] */ \
-    s6 += (s9*4545 + 16384) >> 15; \
-    /* 4653/32768 ~= Sqrt[2]*Sin[19*Pi/64] */ \
-    s9 -= (s6*4653 + 2048) >> 12; \
-    /* 2087/4096 ~= (1/Sqrt[2] - Cos[19*Pi/64]/2)/Sin[19*Pi/64] */ \
-    s6 += (s9*2087 + 2048) >> 12; \
-    /* 75/256 ~= (1/Sqrt[2] - Cos[11*Pi/64])/Sin[11*Pi/64] */ \
-    s5 -= (sa*75 + 128) >> 8; \
-    /* 1489/2048 ~= Sqrt[2]*Sin[11*Pi/64] */ \
-    sa -= (s5*1489 + 1024) >> 11; \
-    /* 2217/4096 ~= (1/Sqrt[2] - Cos[11*Pi/64]/2)/Sin[11*Pi/64] */ \
-    s5 += (sa*2217 + 2048) >> 12; \
-    /* 5067/16384 ~= (1/Sqrt[2] - Cos[23*Pi/64])/Sin[23*Pi/64] */ \
-    sd -= (s2*5067 + 8192) >> 14; \
-    /* 1309/1024 ~= Sqrt[2]*Sin[23*Pi/64] */ \
-    s2 += (sd*1309 + 512) >> 10; \
-    /* 4471/8192 ~= (1/Sqrt[2] - Cos[23*Pi/64]/2)/Sin[23*Pi/64] */ \
-    sd -= (s2*4471 + 4096) >> 13; \
-    /* 5701/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64])/Sin[7*Pi/64] */  \
-    s3 -= (sc*5701 + 4096) >> 13; \
-    /* 3903/8192 ~= Sqrt[2]*Sin[7*Pi/64] */ \
-    sc -= (s3*3903 + 4096) >> 13; \
-    /* 5747/8192 ~= (1/Sqrt[2] - Cos[7*Pi/64]/2)/Sin[7*Pi/64] */ \
-    s3 += (sc*5747 + 4096) >> 13; \
-    /* 7839/16384 ~= (1/Sqrt[2] - Cos[27*Pi/64])/Sin[27*Pi/64] */ \
-    sb -= (s4*7839 + 8192) >> 14; \
-    /* 5619/4096 ~= Sqrt[2]*Sin[27*Pi/64] */ \
-    s4 += (sb*5619 + 2048) >> 12; \
-    /* 2473/4096 ~= (1/Sqrt[2] - Cos[27*Pi/64]/2)/Sin[27*Pi/64] */ \
-    sb -= (s4*2473 + 2048) >> 12; \
-    /* 4641/8192 ~= (1/Sqrt[2] - Cos[29*Pi/64])/Sin[29*Pi/64] */ \
-    s7 -= (s8*4641 + 4096) >> 13; \
-    /* 2865/2048 ~= Sqrt[2]*Sin[29*Pi/64] */ \
-    s8 += (s7*2865 + 1024) >> 11; \
-    /* 41/64 ~= (1/Sqrt[2] - Cos[29*Pi/64]/2)/Sin[29*Pi/64] */ \
-    s7 -= (s8*41 + 32) >> 6; \
-    /* 5397/8192 ~= (Cos[Pi/4] - Cos[31*Pi/64])/Sin[31*Pi/64] */ \
-    s0 += (sf*5397 + 4096) >> 13; \
-    /* 2893/2048 ~= Sqrt[2]*Sin[31*Pi/64] */ \
-    sf -= (s0*2893 + 1024) >> 11; \
-    /* 2799/4096 ~= (1/Sqrt[2] - Cos[31*Pi/64]/2)/Sin[31*Pi/64] */ \
-    s0 += (sf*2799 + 2048) >> 12; \
-    sd -= OD_DCT_RSHIFT(sc, 1); \
-    sc += sd; \
-    s3 += OD_DCT_RSHIFT(s2, 1); \
-    s2 = s3 - s2; \
-    sb += OD_DCT_RSHIFT(sa, 1); \
-    sa -= sb; \
-    s5 = OD_DCT_RSHIFT(s4, 1) - s5; \
-    s4 -= s5; \
-    s7 = OD_DCT_RSHIFT(s9, 1) - s7; \
-    s9 -= s7; \
-    s6 -= OD_DCT_RSHIFT(s8, 1); \
-    s8 += s6; \
-    se = OD_DCT_RSHIFT(sf, 1) - se; \
-    sf -= se; \
-    s0 += OD_DCT_RSHIFT(s1, 1); \
-    s1 -= s0; \
-    s5 -= s9; \
-    s9 += OD_DCT_RSHIFT(s5, 1); \
-    sa = s6 - sa; \
-    s6 -= OD_DCT_RSHIFT(sa, 1); \
-    se += s2; \
-    s2 -= OD_DCT_RSHIFT(se, 1); \
-    s1 = sd - s1; \
-    sd -= OD_DCT_RSHIFT(s1, 1); \
-    s0 += s3; \
-    s0h = OD_DCT_RSHIFT(s0, 1); \
-    s3 = s0h - s3; \
-    sf += sc; \
-    sfh = OD_DCT_RSHIFT(sf, 1); \
-    sc -= sfh; \
-    sb = s7 - sb; \
-    sbh = OD_DCT_RSHIFT(sb, 1); \
-    s7 -= sbh; \
-    s4 -= s8; \
-    s4h = OD_DCT_RSHIFT(s4, 1); \
-    s8 += s4h; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    se -= (s1*3227 + 16384) >> 15; \
-    /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
-    s1 += (se*6393 + 16384) >> 15; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    se -= (s1*3227 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    s6 -= (s9*2485 + 4096) >> 13; \
-    /* 4551/8192 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    s9 += (s6*4551 + 4096) >> 13; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    s6 -= (s9*2485 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    s5 -= (sa*8757 + 8192) >> 14; \
-    /* 6811/8192 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    sa += (s5*6811 + 4096) >> 13; \
-    /* 8757/16384 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \
-    s5 -= (sa*8757 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
-    s2 -= (sd*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    sd += (s2*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    s2 -= (sd*6723 + 4096) >> 13; \
-    s9 += OD_DCT_RSHIFT(se, 1); \
-    se = s9 - se; \
-    s6 += OD_DCT_RSHIFT(s1, 1); \
-    s1 -= s6; \
-    sd = OD_DCT_RSHIFT(sa, 1) - sd; \
-    sa -= sd; \
-    s2 += OD_DCT_RSHIFT(s5, 1); \
-    s5 = s2 - s5; \
-    s3 -= sbh; \
-    sb += s3; \
-    sc += s4h; \
-    s4 = sc - s4; \
-    s8 = s0h - s8; \
-    s0 -= s8; \
-    s7 = sfh - s7; \
-    sf -= s7; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s6 -= (s9*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    s9 += (s6*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s6 -= (s9*13573 + 16384) >> 15; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s5 -= (sa*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    sa += (s5*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s5 -= (sa*13573 + 16384) >> 15; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    s3 -= (sc*3259 + 8192) >> 14; \
-    /* 3135/8192 ~= Sin[Pi/8] ~= 0.382683432365090 */ \
-    sc += (s3*3135 + 4096) >> 13; \
-    /* 3259/16384 ~= Tan[Pi/16] ~= 0.198912367379658 */ \
-    s3 -= (sc*3259 + 8192) >> 14; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    sb -= (s4*21895 + 16384) >> 15; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    s4 += (sb*15137 + 8192) >> 14; \
-    /* 21895/32768 ~= Tan[3*Pi/16] ~= 0.668178637919299 */ \
-    sb -= (s4*21895 + 16384) >> 15; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s8 -= (s7*13573 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[pi/4] ~= 0.707106781186547 */ \
-    s7 += (s8*11585 + 8192) >> 14; \
-    /* 13573/32768 ~= Tan[pi/8] ~= 0.414213562373095 */ \
-    s8 -= (s7*13573 + 16384) >> 15; \
-  } \
-  while (0)
-
-/* TODO: rewrite this to match OD_FDST_16. */
-#define OD_FDST_16_ASYM(t0, t0h, t8, t4, t4h, tc, t2, ta, t6, te, \
-  t1, t9, t5, td, t3, tb, t7, t7h, tf) \
-  /* Embedded 16-point asymmetric Type-IV fDST. */ \
-  do { \
-    int t2h; \
-    int t3h; \
-    int t6h; \
-    int t8h; \
-    int t9h; \
-    int tch; \
-    int tdh; \
-    /* TODO: Can we move these into another operation */ \
-    t8 = -t8; \
-    t9 = -t9; \
-    ta = -ta; \
-    tb = -tb; \
-    td = -td; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 13573, 8192, 136); \
-    t1 -= (te*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 11585, 16384, 137); \
-    te += (t1*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 13573, 8192, 138); \
-    t1 -= (te*13573 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 4161, 8192, 139); \
-    t2 += (td*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 15137, 8192, 140); \
-    td -= (t2*15137 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 14341, 8192, 141); \
-    t2 += (td*14341 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 14341, 8192, 142); \
-    tc -= (t3*14341 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 15137, 8192, 143); \
-    t3 += (tc*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 4161, 8192, 144); \
-    tc -= (t3*4161 + 8192) >> 14; \
-    te = t0h - te; \
-    t0 -= te; \
-    tf = OD_DCT_RSHIFT(t1, 1) - tf; \
-    t1 -= tf; \
-    /* TODO: Can we move this into another operation */ \
-    tc = -tc; \
-    t2 = OD_DCT_RSHIFT(tc, 1) - t2; \
-    tc -= t2; \
-    t3 = OD_DCT_RSHIFT(td, 1) - t3; \
-    td = t3 - td; \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 7489, 4096, 145); \
-    t9 -= (t6*7489 + 4096) >> 13; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 11585, 8192, 146); \
-    t6 += (t9*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 19195, 16384, 147); \
-    t9 += (t6*19195 + 16384) >> 15; \
-    t8 += OD_DCT_RSHIFT(t9, 1); \
-    t9 -= t8; \
-    t6 = t7h - t6; \
-    t7 -= t6; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 6723, 4096, 148); \
-    t8 += (t7*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 16069, 8192, 149); \
-    t7 -= (t8*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 6723, 4096, 150); \
-    t8 += (t7*6723 + 4096) >> 13; \
-    /* 17515/32768 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 17515, 16384, 151); \
-    t9 += (t6*17515 + 16384) >> 15; \
-    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 13623, 8192, 152); \
-    t6 -= (t9*13623 + 8192) >> 14; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 17515, 16384, 153); \
-    t9 += (t6*17515 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 13573, 8192, 154); \
-    t5 += (ta*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 11585, 16384, 155); \
-    ta -= (t5*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 13573, 8192, 156); \
-    t5 += (ta*13573 + 8192) >> 14; \
-    tb += OD_DCT_RSHIFT(t5, 1); \
-    t5 = tb - t5; \
-    ta += t4h; \
-    t4 -= ta; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 2485, 4096, 157); \
-    ta += (t5*2485 + 4096) >> 13; \
-    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 18205, 16384, 158); \
-    t5 -= (ta*18205 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 2485, 4096, 159); \
-    ta += (t5*2485 + 4096) >> 13; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 6723, 4096, 160); \
-    tb -= (t4*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 16069, 8192, 161); \
-    t4 += (tb*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 6723, 4096, 162); \
-    tb -= (t4*6723 + 4096) >> 13; \
-    /* TODO: Can we move this into another operation */ \
-    t5 = -t5; \
-    tc -= tf; \
-    tch = OD_DCT_RSHIFT(tc, 1); \
-    tf += tch; \
-    t3 += t0; \
-    t3h = OD_DCT_RSHIFT(t3, 1); \
-    t0 -= t3h; \
-    td -= t1; \
-    tdh = OD_DCT_RSHIFT(td, 1); \
-    t1 += tdh; \
-    t2 += te; \
-    t2h = OD_DCT_RSHIFT(t2, 1); \
-    te -= t2h; \
-    t8 += t4; \
-    t8h = OD_DCT_RSHIFT(t8, 1); \
-    t4 = t8h - t4; \
-    t7 = tb - t7; \
-    t7h = OD_DCT_RSHIFT(t7, 1); \
-    tb = t7h - tb; \
-    t6 -= ta; \
-    t6h = OD_DCT_RSHIFT(t6, 1); \
-    ta += t6h; \
-    t9 = t5 - t9; \
-    t9h = OD_DCT_RSHIFT(t9, 1); \
-    t5 -= t9h; \
-    t0 -= t7h; \
-    t7 += t0; \
-    tf += t8h; \
-    t8 -= tf; \
-    te -= t6h; \
-    t6 += te; \
-    t1 += t9h; \
-    t9 -= t1; \
-    tb -= tch; \
-    tc += tb; \
-    t4 += t3h; \
-    t3 -= t4; \
-    ta -= tdh; \
-    td += ta; \
-    t5 = t2h - t5; \
-    t2 -= t5; \
-    /* TODO: Can we move these into another operation */ \
-    t8 = -t8; \
-    t9 = -t9; \
-    ta = -ta; \
-    tb = -tb; \
-    tc = -tc; \
-    td = -td; \
-    tf = -tf; \
-    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
-    OD_DCT_OVERFLOW_CHECK(tf, 7799, 4096, 163); \
-    t0 -= (tf*7799 + 4096) >> 13; \
-    /* 4091/4096 ~= Sin[31*Pi/64] ~= 0.998795456205172 */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 4091, 2048, 164); \
-    tf += (t0*4091 + 2048) >> 12; \
-    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
-    OD_DCT_OVERFLOW_CHECK(tf, 7799, 4096, 165); \
-    t0 -= (tf*7799 + 4096) >> 13; \
-    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 2417, 16384, 166); \
-    t1 += (te*2417 + 16384) >> 15; \
-    /* 601/4096 ~= Sin[3*Pi/64] ~= 0.146730474455362 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 601, 2048, 167); \
-    te -= (t1*601 + 2048) >> 12; \
-    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 2417, 16384, 168); \
-    t1 += (te*2417 + 16384) >> 15; \
-    /* 14525/32768 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 14525, 16384, 169); \
-    t7 -= (t8*14525 + 16384) >> 15; \
-    /* 3035/4096 ~= Sin[17*Pi/64] ~= 0.740951125354959 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 3035, 2048, 170); \
-    t8 += (t7*3035 + 2048) >> 12; \
-    /* 7263/16384 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 7263, 8192, 171); \
-    t7 -= (t8*7263 + 8192) >> 14; \
-    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 6393, 4096, 172); \
-    t2 -= (td*6393 + 4096) >> 13; \
-    /* 3973/4096 ~= Sin[27*Pi/64] ~= 0.970031253194544 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 3973, 2048, 173); \
-    td += (t2*3973 + 2048) >> 12; \
-    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 6393, 4096, 174); \
-    t2 -= (td*6393 + 4096) >> 13; \
-    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 9281, 8192, 175); \
-    t5 -= (ta*9281 + 8192) >> 14; \
-    /* 7027/8192 ~= Sin[21*Pi/64] ~= 0.857728610000272 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 7027, 4096, 176); \
-    ta += (t5*7027 + 4096) >> 13; \
-    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 9281, 8192, 177); \
-    t5 -= (ta*9281 + 8192) >> 14; \
-    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 11539, 8192, 178); \
-    t3 -= (tc*11539 + 8192) >> 14; \
-    /* 7713/8192 ~= Sin[25*Pi/64] ~= 0.941544065183021 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 7713, 4096, 179); \
-    tc += (t3*7713 + 4096) >> 13; \
-    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 11539, 8192, 180); \
-    t3 -= (tc*11539 + 8192) >> 14; \
-    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 10375, 8192, 181); \
-    t4 -= (tb*10375 + 8192) >> 14; \
-    /* 7405/8192 ~= Sin[23*Pi/64] ~= 0.903989293123443 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 7405, 4096, 182); \
-    tb += (t4*7405 + 4096) >> 13; \
-    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 10375, 8192, 183); \
-    t4 -= (tb*10375 + 8192) >> 14; \
-    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 8247, 8192, 184); \
-    t6 -= (t9*8247 + 8192) >> 14; \
-    /* 1645/2048 ~= Sin[19*Pi/64] ~= 0.803207531480645 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 1645, 1024, 185); \
-    t9 += (t6*1645 + 1024) >> 11; \
-    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 8247, 8192, 186); \
-    t6 -= (t9*8247 + 8192) >> 14; \
-  } \
-  while (0)
-
-#define OD_IDST_16_ASYM(t0, t0h, t8, t4, tc, t2, t2h, ta, t6, te, teh, \
-  t1, t9, t5, td, t3, tb, t7, tf) \
-  /* Embedded 16-point asymmetric Type-IV iDST. */ \
-  do { \
-    int t1h_; \
-    int t3h_; \
-    int t4h; \
-    int t6h; \
-    int t9h_; \
-    int tbh_; \
-    int tch; \
-    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
-    t6 += (t9*8247 + 8192) >> 14; \
-    /* 1645/2048 ~= Sin[19*Pi/64] ~= 0.803207531480645 */ \
-    t9 -= (t6*1645 + 1024) >> 11; \
-    /* 8247/16384 ~= Tan[19*Pi/128] ~= 0.503357699799294 */ \
-    t6 += (t9*8247 + 8192) >> 14; \
-    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
-    t2 += (td*10375 + 8192) >> 14; \
-    /* 7405/8192 ~= Sin[23*Pi/64] ~= 0.903989293123443 */ \
-    td -= (t2*7405 + 4096) >> 13; \
-    /* 10375/16384 ~= Tan[23*Pi/128] ~= 0.633243016177569 */ \
-    t2 += (td*10375 + 8192) >> 14; \
-    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
-    tc += (t3*11539 + 8192) >> 14; \
-    /* 7713/8192 ~= Sin[25*Pi/64] ~= 0.941544065183021 */ \
-    t3 -= (tc*7713 + 4096) >> 13; \
-    /* 11539/16384 ~= Tan[25*Pi/128] ~= 0.704279460865044 */ \
-    tc += (t3*11539 + 8192) >> 14; \
-    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
-    ta += (t5*9281 + 8192) >> 14; \
-    /* 7027/8192 ~= Sin[21*Pi/64] ~= 0.857728610000272 */ \
-    t5 -= (ta*7027 + 4096) >> 13; \
-    /* 9281/16384 ~= Tan[21*Pi/128] ~= 0.566493002730344 */ \
-    ta += (t5*9281 + 8192) >> 14; \
-    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
-    t4 += (tb*6393 + 4096) >> 13; \
-    /* 3973/4096 ~= Sin[27*Pi/64] ~= 0.970031253194544 */ \
-    tb -= (t4*3973 + 2048) >> 12; \
-    /* 6393/8192 ~= Tan[27*Pi/128] ~= 0.780407659653944 */ \
-    t4 += (tb*6393 + 4096) >> 13; \
-    /* 7263/16384 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
-    te += (t1*7263 + 8192) >> 14; \
-    /* 3035/4096 ~= Sin[17*Pi/64] ~= 0.740951125354959 */ \
-    t1 -= (te*3035 + 2048) >> 12; \
-    /* 14525/32768 ~= Tan[17*Pi/128] ~= 0.443269513890864 */ \
-    te += (t1*14525 + 16384) >> 15; \
-    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
-    t8 -= (t7*2417 + 16384) >> 15; \
-    /* 601/4096 ~= Sin[3*Pi/64] ~= 0.146730474455362 */ \
-    t7 += (t8*601 + 2048) >> 12; \
-    /* 2417/32768 ~= Tan[3*Pi/128] ~= 0.0737644315224493 */ \
-    t8 -= (t7*2417 + 16384) >> 15; \
-    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
-    t0 += (tf*7799 + 4096) >> 13; \
-    /* 4091/4096 ~= Sin[31*Pi/64] ~= 0.998795456205172 */ \
-    tf -= (t0*4091 + 2048) >> 12; \
-    /* 7799/8192 ~= Tan[31*Pi/128] ~= 0.952079146700925 */ \
-    t0 += (tf*7799 + 4096) >> 13; \
-    /* TODO: Can we move these into another operation */ \
-    t1 = -t1; \
-    t3 = -t3; \
-    t5 = -t5; \
-    t9 = -t9; \
-    tb = -tb; \
-    td = -td; \
-    tf = -tf; \
-    t4 += ta; \
-    t4h = OD_DCT_RSHIFT(t4, 1); \
-    ta = t4h - ta; \
-    tb -= t5; \
-    tbh_ = OD_DCT_RSHIFT(tb, 1); \
-    t5 += tbh_; \
-    tc += t2; \
-    tch = OD_DCT_RSHIFT(tc, 1); \
-    t2 -= tch; \
-    t3 -= td; \
-    t3h_ = OD_DCT_RSHIFT(t3, 1); \
-    td += t3h_; \
-    t9 += t8; \
-    t9h_ = OD_DCT_RSHIFT(t9, 1); \
-    t8 -= t9h_; \
-    t6 -= t7; \
-    t6h = OD_DCT_RSHIFT(t6, 1); \
-    t7 += t6h; \
-    t1 += tf; \
-    t1h_ = OD_DCT_RSHIFT(t1, 1); \
-    tf -= t1h_; \
-    te -= t0; \
-    teh = OD_DCT_RSHIFT(te, 1); \
-    t0 += teh; \
-    ta += t9h_; \
-    t9 = ta - t9; \
-    t5 -= t6h; \
-    t6 += t5; \
-    td = teh - td; \
-    te = td - te; \
-    t2 = t1h_ - t2; \
-    t1 -= t2; \
-    t7 += t4h; \
-    t4 -= t7; \
-    t8 -= tbh_; \
-    tb += t8; \
-    t0 += tch; \
-    tc -= t0; \
-    tf -= t3h_; \
-    t3 += tf; \
-    /* TODO: Can we move this into another operation */ \
-    ta = -ta; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    td += (t2*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    t2 -= (td*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.820678790828660 */ \
-    td += (t2*6723 + 4096) >> 13; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    t5 -= (ta*2485 + 4096) >> 13; \
-    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    ta += (t5*18205 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    t5 -= (ta*2485 + 4096) >> 13; \
-    t2 += t5; \
-    t2h = OD_DCT_RSHIFT(t2, 1); \
-    t5 -= t2h; \
-    ta = td - ta; \
-    td -= OD_DCT_RSHIFT(ta, 1); \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    ta -= (t5*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    t5 += (ta*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    ta -= (t5*13573 + 8192) >> 14; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.534511135950792 */ \
-    t9 -= (t6*17515 + 16384) >> 15; \
-    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.831469612302545 */ \
-    t6 += (t9*13623 + 8192) >> 14; \
-    /* 17515/32768 ~= Tan[5*Pi/32]) ~= 0.534511135950792 */ \
-    t9 -= (t6*17515 + 16384) >> 15; \
-    /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
-    t1 -= (te*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.980785280403230 */ \
-    te += (t1*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32]) ~= 0.820678790828660 */ \
-    t1 -= (te*6723 + 4096) >> 13; \
-    te += t6; \
-    teh = OD_DCT_RSHIFT(te, 1); \
-    t6 = teh - t6; \
-    t9 += t1; \
-    t1 -= OD_DCT_RSHIFT(t9, 1); \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    t9 -= (t6*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    t6 -= (t9*11585 + 8192) >> 14; \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    t9 += (t6*7489 + 4096) >> 13; \
-    tb = tc - tb; \
-    tc = OD_DCT_RSHIFT(tb, 1) - tc; \
-    t3 += t4; \
-    t4 = OD_DCT_RSHIFT(t3, 1) - t4; \
-    /* TODO: Can we move this into another operation */ \
-    t3 = -t3; \
-    t8 += tf; \
-    tf = OD_DCT_RSHIFT(t8, 1) - tf; \
-    t0 += t7; \
-    t0h = OD_DCT_RSHIFT(t0, 1); \
-    t7 = t0h - t7; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    t3 += (tc*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    tc -= (t3*15137 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    t3 += (tc*14341 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    t4 -= (tb*14341 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    tb += (t4*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    t4 -= (tb*4161 + 8192) >> 14; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    t8 += (t7*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    t7 -= (t8*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    t8 += (t7*13573 + 8192) >> 14; \
-    /* TODO: Can we move these into another operation */ \
-    t1 = -t1; \
-    t5 = -t5; \
-    t9 = -t9; \
-    tb = -tb; \
-    td = -td; \
-  } \
-  while (0)
-
-#define OD_FDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
-  te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
-  /* Embedded 32-point orthonormal Type-II fDCT. */ \
-  do { \
-    int tgh; \
-    int thh; \
-    int tih; \
-    int tkh; \
-    int tmh; \
-    int tnh; \
-    int toh; \
-    int tqh; \
-    int tsh; \
-    int tuh; \
-    int tvh; \
-    tv = t0 - tv; \
-    tvh = OD_DCT_RSHIFT(tv, 1); \
-    t0 -= tvh; \
-    tu += t1; \
-    tuh = OD_DCT_RSHIFT(tu, 1); \
-    t1 = tuh - t1; \
-    tt = t2 - tt; \
-    t2 -= OD_DCT_RSHIFT(tt, 1); \
-    ts += t3; \
-    tsh = OD_DCT_RSHIFT(ts, 1); \
-    t3 = tsh - t3; \
-    tr = t4 - tr; \
-    t4 -= OD_DCT_RSHIFT(tr, 1); \
-    tq += t5; \
-    tqh = OD_DCT_RSHIFT(tq, 1); \
-    t5 = tqh - t5; \
-    tp = t6 - tp; \
-    t6 -= OD_DCT_RSHIFT(tp, 1); \
-    to += t7; \
-    toh = OD_DCT_RSHIFT(to, 1); \
-    t7 = toh - t7; \
-    tn = t8 - tn; \
-    tnh = OD_DCT_RSHIFT(tn, 1); \
-    t8 -= tnh; \
-    tm += t9; \
-    tmh = OD_DCT_RSHIFT(tm, 1); \
-    t9 = tmh - t9; \
-    tl = ta - tl; \
-    ta -= OD_DCT_RSHIFT(tl, 1); \
-    tk += tb; \
-    tkh = OD_DCT_RSHIFT(tk, 1); \
-    tb = tkh - tb; \
-    tj = tc - tj; \
-    tc -= OD_DCT_RSHIFT(tj, 1); \
-    ti += td; \
-    tih = OD_DCT_RSHIFT(ti, 1); \
-    td = tih - td; \
-    th = te - th; \
-    thh = OD_DCT_RSHIFT(th, 1); \
-    te -= thh; \
-    tg += tf; \
-    tgh = OD_DCT_RSHIFT(tg, 1); \
-    tf = tgh - tf; \
-    OD_FDCT_16_ASYM(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \
-     t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh); \
-    OD_FDST_16_ASYM(tv, tvh, tf, tn, tnh, t7, tr, tb, tj, t3, \
-     tt, td, tl, t5, tp, t9, th, thh, t1); \
-  } \
-  while (0)
-
-#define OD_IDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, \
-  te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
-  /* Embedded 32-point orthonormal Type-II iDCT. */ \
-  do { \
-    int t1h; \
-    int t3h; \
-    int t5h; \
-    int t7h; \
-    int t9h; \
-    int tbh; \
-    int tdh; \
-    int tfh; \
-    int thh; \
-    int tth; \
-    int tvh; \
-    OD_IDST_16_ASYM(tv, tvh, tn, tr, tj, tt, tth, tl, tp, th, thh, \
-     tu, tm, tq, ti, ts, tk, to, tg); \
-    OD_IDCT_16_ASYM(t0, t8, t4, tc, t2, ta, t6, te, \
-     t1, t1h, t9, t9h, t5, t5h, td, tdh, t3, t3h, tb, tbh, t7, t7h, tf, tfh); \
-    tu = t1h - tu; \
-    t1 -= tu; \
-    te += thh; \
-    th = te - th; \
-    tm = t9h - tm; \
-    t9 -= tm; \
-    t6 += OD_DCT_RSHIFT(tp, 1); \
-    tp = t6 - tp; \
-    tq = t5h - tq; \
-    t5 -= tq; \
-    ta += OD_DCT_RSHIFT(tl, 1); \
-    tl = ta - tl; \
-    ti = tdh - ti; \
-    td -= ti; \
-    t2 += tth; \
-    tt = t2 - tt; \
-    ts = t3h - ts; \
-    t3 -= ts; \
-    tc += OD_DCT_RSHIFT(tj, 1); \
-    tj = tc - tj; \
-    tk = tbh - tk; \
-    tb -= tk; \
-    t4 += OD_DCT_RSHIFT(tr, 1); \
-    tr = t4 - tr; \
-    to = t7h - to; \
-    t7 -= to; \
-    t8 += OD_DCT_RSHIFT(tn, 1); \
-    tn = t8 - tn; \
-    tg = tfh - tg; \
-    tf -= tg; \
-    t0 += tvh; \
-    tv = t0 - tv; \
-  } \
-  while (0)
-
-#if CONFIG_TX64X64
-#define OD_FDCT_32_ASYM(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \
-  t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh, t1, th, thh, \
-  t9, tp, tph, t5, tl, tlh, td, tt, tth, t3, tj, tjh, tb, tr, trh, \
-  t7, tn, tnh, tf, tv, tvh) \
-  /* Embedded 32-point asymmetric Type-II fDCT. */ \
-  do { \
-    t0 += tvh; \
-    tv = t0 - tv; \
-    t1 = tuh - t1; \
-    tu -= t1; \
-    t2 += tth; \
-    tt = t2 - tt; \
-    t3 = tsh - t3; \
-    ts -= t3; \
-    t4 += trh; \
-    tr = t4 - tr; \
-    t5 = tqh - t5; \
-    tq -= t5; \
-    t6 += tph; \
-    tp = t6 - tp; \
-    t7 = toh - t7; \
-    to -= t7; \
-    t8 += tnh; \
-    tn = t8 - tn; \
-    t9 = tmh - t9; \
-    tm -= t9; \
-    ta += tlh; \
-    tl = ta - tl; \
-    tb = tkh - tb; \
-    tk -= tb; \
-    tc += tjh; \
-    tj = tc - tj; \
-    td = tih - td; \
-    ti -= td; \
-    te += thh; \
-    th = te - th; \
-    tf = tgh - tf; \
-    tg -= tf; \
-    OD_FDCT_16(t0, tg, t8, to, t4, tk, tc, ts, \
-     t2, ti, ta, tq, t6, tm, te, tu); \
-    OD_FDST_16(tv, tf, tn, t7, tr, tb, tj, t3, \
-     tt, td, tl, t5, tp, t9, th, t1); \
-  } \
-  while (0)
-
-#define OD_IDCT_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, \
-  t6, tm, te, tu, t1, t1h, th, thh, t9, t9h, tp, tph, t5, t5h, tl, tlh, \
-  td, tdh, tt, tth, t3, t3h, tj, tjh, tb, tbh, tr, trh, t7, t7h, tn, tnh, \
-  tf, tfh, tv, tvh) \
-  /* Embedded 32-point asymmetric Type-II iDCT. */ \
-  do { \
-    OD_IDST_16(tv, tn, tr, tj, tt, tl, tp, th, \
-     tu, tm, tq, ti, ts, tk, to, tg); \
-    OD_IDCT_16(t0, t8, t4, tc, t2, ta, t6, te, \
-     t1, t9, t5, td, t3, tb, t7, tf); \
-    tv = t0 - tv; \
-    tvh = OD_DCT_RSHIFT(tv, 1); \
-    t0 -= tvh; \
-    t1 += tu; \
-    t1h = OD_DCT_RSHIFT(t1, 1); \
-    tu = t1h - tu; \
-    tt = t2 - tt; \
-    tth = OD_DCT_RSHIFT(tt, 1); \
-    t2 -= tth; \
-    t3 += ts; \
-    t3h = OD_DCT_RSHIFT(t3, 1); \
-    ts = t3h - ts; \
-    tr = t4 - tr; \
-    trh = OD_DCT_RSHIFT(tr, 1); \
-    t4 -= trh; \
-    t5 += tq; \
-    t5h = OD_DCT_RSHIFT(t5, 1); \
-    tq = t5h - tq; \
-    tp = t6 - tp; \
-    tph = OD_DCT_RSHIFT(tp, 1); \
-    t6 -= tph; \
-    t7 += to; \
-    t7h = OD_DCT_RSHIFT(t7, 1); \
-    to = t7h - to; \
-    tn = t8 - tn; \
-    tnh = OD_DCT_RSHIFT(tn, 1); \
-    t8 -= tnh; \
-    t9 += tm; \
-    t9h = OD_DCT_RSHIFT(t9, 1); \
-    tm = t9h - tm; \
-    tl = ta - tl; \
-    tlh = OD_DCT_RSHIFT(tl, 1); \
-    ta -= tlh; \
-    tb += tk; \
-    tbh = OD_DCT_RSHIFT(tb, 1); \
-    tk = tbh - tk; \
-    tj = tc - tj; \
-    tjh = OD_DCT_RSHIFT(tj, 1); \
-    tc -= tjh; \
-    td += ti; \
-    tdh = OD_DCT_RSHIFT(td, 1); \
-    ti = tdh - ti; \
-    th = te - th; \
-    thh = OD_DCT_RSHIFT(th, 1); \
-    te -= thh; \
-    tf += tg; \
-    tfh = OD_DCT_RSHIFT(tf, 1); \
-    tg = tfh - tg; \
-  } \
-  while (0)
-
-#define OD_FDST_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
-  tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
-  /* Embedded 32-point asymmetric Type-IV fDST. */ \
-  do { \
-    int t0h; \
-    int t1h; \
-    int t4h; \
-    int t5h; \
-    int tqh; \
-    int trh; \
-    int tuh; \
-    int tvh; \
-    \
-    tu = -tu; \
-    \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 271); \
-    t5 -= (tq*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 11585, 16384, 272); \
-    tq += (t5*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 273); \
-    t5 -= (tq*13573 + 8192) >> 14; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 29957, 16384, 274); \
-    tp += (t6*29957 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(tp, 11585, 8192, 275); \
-    t6 -= (tp*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 19195, 16384, 276); \
-    tp -= (t6*19195 + 16384) >> 15; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 29957, 16384, 277); \
-    tu += (t1*29957 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(tu, 11585, 8192, 278); \
-    t1 -= (tu*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 19195, 16384, 279); \
-    tu -= (t1*19195 + 16384) >> 15; \
-    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 28681, 16384, 280); \
-    tt += (t2*28681 + 16384) >> 15; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(tt, 15137, 8192, 281); \
-    t2 -= (tt*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 4161, 8192, 282); \
-    tt += (t2*4161 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(ts, 4161, 8192, 283); \
-    t3 += (ts*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 15137, 8192, 284); \
-    ts -= (t3*15137 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(ts, 14341, 8192, 285); \
-    t3 += (ts*14341 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 19195, 16384, 286); \
-    t9 -= (tm*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 11585, 8192, 287); \
-    tm -= (t9*11585 + 8192) >> 14; \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 7489, 4096, 288); \
-    t9 += (tm*7489 + 4096) >> 13; \
-    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
-    OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 289); \
-    ta += (tl*3259 + 4096) >> 13; \
-    /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 3135, 8192, 290); \
-    tl -= (ta*3135 + 8192) >> 14; \
-    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
-    OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 291); \
-    ta += (tl*3259 + 4096) >> 13; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(tk, 4161, 8192, 292); \
-    tb += (tk*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 15137, 8192, 293); \
-    tk -= (tb*15137 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(tk, 14341, 8192, 294); \
-    tb += (tk*14341 + 8192) >> 14; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 29957, 16384, 295); \
-    th += (te*29957 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    OD_DCT_OVERFLOW_CHECK(th, 11585, 8192, 296); \
-    te -= (th*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 19195, 16384, 297); \
-    th -= (te*19195 + 16384) >> 15; \
-    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 28681, 16384, 298); \
-    tj += (tc*28681 + 16384) >> 15; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(tj, 15137, 8192, 299); \
-    tc -= (tj*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 4161, 8192, 300); \
-    tj += (tc*4161 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    OD_DCT_OVERFLOW_CHECK(ti, 4161, 8192, 301); \
-    td += (ti*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 15137, 8192, 302); \
-    ti -= (td*15137 + 8192) >> 14; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    OD_DCT_OVERFLOW_CHECK(ti, 14341, 8192, 303); \
-    td += (ti*14341 + 8192) >> 14; \
-    \
-    t1 = -t1; \
-    t2 = -t2; \
-    t3 = -t3; \
-    td = -td; \
-    tg = -tg; \
-    to = -to; \
-    ts = -ts; \
-    \
-    tr -= OD_DCT_RSHIFT(t5, 1); \
-    t5 += tr; \
-    tq -= OD_DCT_RSHIFT(t4, 1); /* pass */ \
-    t4 += tq; \
-    t6 -= OD_DCT_RSHIFT(t7, 1); \
-    t7 += t6; \
-    to -= OD_DCT_RSHIFT(tp, 1); /* pass */ \
-    tp += to; \
-    t1 += OD_DCT_RSHIFT(t0, 1); /* pass */ \
-    t0 -= t1; \
-    tv -= OD_DCT_RSHIFT(tu, 1); \
-    tu += tv; \
-    t3 -= OD_DCT_RSHIFT(tt, 1); \
-    tt += t3; \
-    t2 += OD_DCT_RSHIFT(ts, 1); \
-    ts -= t2; \
-    t9 -= OD_DCT_RSHIFT(t8, 1); /* pass */ \
-    t8 += t9; \
-    tn += OD_DCT_RSHIFT(tm, 1); \
-    tm -= tn; \
-    tb += OD_DCT_RSHIFT(ta, 1); \
-    ta -= tb; \
-    tl -= OD_DCT_RSHIFT(tk, 1); \
-    tk += tl; \
-    te -= OD_DCT_RSHIFT(tf, 1); /* pass */ \
-    tf += te; \
-    tg -= OD_DCT_RSHIFT(th, 1); \
-    th += tg; \
-    tc -= OD_DCT_RSHIFT(ti, 1); \
-    ti += tc; \
-    td += OD_DCT_RSHIFT(tj, 1); \
-    tj -= td; \
-    \
-    t4 = -t4; \
-    \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
-    OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 304); \
-    t4 += (tr*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 16069, 8192, 305); \
-    tr -= (t4*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
-    OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 306); \
-    t4 += (tr*6723 + 4096) >> 13; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 307); \
-    t5 += (tq*17515 + 16384) >> 15; \
-    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 13623, 8192, 308); \
-    tq -= (t5*13623 + 8192) >> 14; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 309); \
-    t5 += (tq*17515 + 16384) >> 15; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 310); \
-    t7 += (to*3227 + 16384) >> 15; \
-    /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 6393, 16384, 311); \
-    to -= (t7*6393 + 16384) >> 15; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 312); \
-    t7 += (to*3227 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 313); \
-    t6 += (tp*2485 + 4096) >> 13; \
-    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 18205, 16384, 314); \
-    tp -= (t6*18205 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 315); \
-    t6 += (tp*2485 + 4096) >> 13; \
-    \
-    t5 = -t5; \
-    \
-    tr += to; \
-    trh = OD_DCT_RSHIFT(tr, 1); \
-    to -= trh; \
-    t4 += t7; \
-    t4h = OD_DCT_RSHIFT(t4, 1); \
-    t7 -= t4h; \
-    t5 += tp; \
-    t5h = OD_DCT_RSHIFT(t5, 1); \
-    tp -= t5h; \
-    tq += t6; \
-    tqh = OD_DCT_RSHIFT(tq, 1); \
-    t6 -= tqh; \
-    t0 -= t3; \
-    t0h = OD_DCT_RSHIFT(t0, 1); \
-    t3 += t0h; \
-    tv -= ts; \
-    tvh = OD_DCT_RSHIFT(tv, 1); \
-    ts += tvh; \
-    tu += tt; \
-    tuh = OD_DCT_RSHIFT(tu, 1); \
-    tt -= tuh; \
-    t1 -= t2; \
-    t1h = OD_DCT_RSHIFT(t1, 1); \
-    t2 += t1h; \
-    t8 += tb; \
-    tb -= OD_DCT_RSHIFT(t8, 1); \
-    tn += tk; \
-    tk -= OD_DCT_RSHIFT(tn, 1); \
-    t9 += tl; \
-    tl -= OD_DCT_RSHIFT(t9, 1); \
-    tm -= ta; \
-    ta += OD_DCT_RSHIFT(tm, 1); \
-    tc -= tf; \
-    tf += OD_DCT_RSHIFT(tc, 1); \
-    tj += tg; \
-    tg -= OD_DCT_RSHIFT(tj, 1); \
-    td -= te; \
-    te += OD_DCT_RSHIFT(td, 1); \
-    ti += th; \
-    th -= OD_DCT_RSHIFT(ti, 1); \
-    \
-    t9 = -t9; \
-    tl = -tl; \
-    \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 316); \
-    t8 += (tn*805 + 8192) >> 14; \
-    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 803, 4096, 317); \
-    tn -= (t8*803 + 4096) >> 13; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 318); \
-    t8 += (tn*805 + 8192) >> 14; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 319); \
-    tk += (tb*11725 + 16384) >> 15; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
-    OD_DCT_OVERFLOW_CHECK(tk, 5197, 4096, 320); \
-    tb -= (tk*5197 + 4096) >> 13; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 321); \
-    tk += (tb*11725 + 16384) >> 15; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
-    OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 322); \
-    ta += (tl*2455 + 2048) >> 12; \
-    /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 14449, 8192, 323); \
-    tl -= (ta*14449 + 8192) >> 14; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
-    OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 324); \
-    ta += (tl*2455 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 325); \
-    t9 += (tm*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 1189, 2048, 326); \
-    tm -= (t9*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 327); \
-    t9 += (tm*4861 + 16384) >> 15; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 328); \
-    tf += (tg*805 + 8192) >> 14; \
-    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
-    OD_DCT_OVERFLOW_CHECK(tf, 803, 4096, 329); \
-    tg -= (tf*803 + 4096) >> 13; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 330); \
-    tf += (tg*805 + 8192) >> 14; \
-    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 331); \
-    tc += (tj*2931 + 4096) >> 13; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 5197, 4096, 332); \
-    tj -= (tc*5197 + 4096) >> 13; \
-    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 333); \
-    tc += (tj*2931 + 4096) >> 13; \
-    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
-    OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 334); \
-    td += (ti*513 + 1024) >> 11; \
-    /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \
-    OD_DCT_OVERFLOW_CHECK(td, 7723, 8192, 335); \
-    ti -= (td*7723 + 8192) >> 14; \
-    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
-    OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 336); \
-    td += (ti*513 + 1024) >> 11; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 337); \
-    te += (th*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
-    OD_DCT_OVERFLOW_CHECK(te, 1189, 2048, 338); \
-    th -= (te*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 339); \
-    te += (th*4861 + 16384) >> 15; \
-    \
-    ta = -ta; \
-    tb = -tb; \
-    \
-    tt += t5h; \
-    t5 -= tt; \
-    t2 -= tqh; \
-    tq += t2; \
-    tp += t1h; \
-    t1 -= tp; \
-    t6 -= tuh; \
-    tu += t6; \
-    t7 += tvh; \
-    tv -= t7; \
-    to += t0h; \
-    t0 -= to; \
-    t3 -= t4h; \
-    t4 += t3; \
-    ts += trh; \
-    tr -= ts; \
-    tf -= OD_DCT_RSHIFT(tn, 1); \
-    tn += tf; \
-    tg -= OD_DCT_RSHIFT(t8, 1); \
-    t8 += tg; \
-    tk += OD_DCT_RSHIFT(tc, 1); \
-    tc -= tk; \
-    tb += OD_DCT_RSHIFT(tj, 1); \
-    tj -= tb; \
-    ta += OD_DCT_RSHIFT(ti, 1); \
-    ti -= ta; \
-    tl += OD_DCT_RSHIFT(td, 1); \
-    td -= tl; \
-    te -= OD_DCT_RSHIFT(tm, 1); \
-    tm += te; \
-    th -= OD_DCT_RSHIFT(t9, 1); \
-    t9 += th; \
-    ta -= t5; \
-    t5 += OD_DCT_RSHIFT(ta, 1); \
-    tq -= tl; \
-    tl += OD_DCT_RSHIFT(tq, 1); \
-    t2 -= ti; \
-    ti += OD_DCT_RSHIFT(t2, 1); \
-    td -= tt; \
-    tt += OD_DCT_RSHIFT(td, 1); \
-    tm += tp; \
-    tp -= OD_DCT_RSHIFT(tm, 1); \
-    t6 += t9; \
-    t9 -= OD_DCT_RSHIFT(t6, 1); \
-    te -= tu; \
-    tu += OD_DCT_RSHIFT(te, 1); \
-    t1 -= th; \
-    th += OD_DCT_RSHIFT(t1, 1); \
-    t0 -= tg; \
-    tg += OD_DCT_RSHIFT(t0, 1); \
-    tf += tv; \
-    tv -= OD_DCT_RSHIFT(tf, 1); \
-    t8 -= t7; \
-    t7 += OD_DCT_RSHIFT(t8, 1); \
-    to -= tn; \
-    tn += OD_DCT_RSHIFT(to, 1); \
-    t4 -= tk; \
-    tk += OD_DCT_RSHIFT(t4, 1); \
-    tb -= tr; \
-    tr += OD_DCT_RSHIFT(tb, 1); \
-    t3 -= tj; \
-    tj += OD_DCT_RSHIFT(t3, 1); \
-    tc -= ts; \
-    ts += OD_DCT_RSHIFT(tc, 1); \
-    \
-    tr = -tr; \
-    ts = -ts; \
-    tt = -tt; \
-    tu = -tu; \
-    \
-    /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 2847, 2048, 340); \
-    tv += (t0*2847 + 2048) >> 12; \
-    /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */  \
-    OD_DCT_OVERFLOW_CHECK(tv, 5791, 2048, 341); \
-    t0 -= (tv*5791 + 2048) >> 12; \
-    /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t0, 5593, 4096, 342); \
-    tv += (t0*5593 + 4096) >> 13; \
-    /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tf, 4099, 4096, 343); \
-    tg -= (tf*4099 + 4096) >> 13; \
-    /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tg, 1997, 1024, 344); \
-    tf += (tg*1997 + 1024) >> 11; \
-    /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tf, 815, 16384, 345); \
-    tg += (tf*815 + 16384) >> 15; \
-    /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 2527, 2048, 346); \
-    tn -= (t8*2527 + 2048) >> 12; \
-    /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tn, 4695, 4096, 347); \
-    t8 += (tn*4695 + 4096) >> 13; \
-    /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t8, 4187, 4096, 348); \
-    tn += (t8*4187 + 4096) >> 13; \
-    /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(to, 5477, 4096, 349); \
-    t7 += (to*5477 + 4096) >> 13; \
-    /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t7, 4169, 4096, 350); \
-    to -= (t7*4169 + 4096) >> 13; \
-    /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(to, 2571, 2048, 351); \
-    t7 -= (to*2571 + 2048) >> 12; \
-    /* 5331/8192 ~= (1/Sqrt[2] - Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 5331, 4096, 352); \
-    tt += (t2*5331 + 4096) >> 13; \
-    /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tt, 5749, 2048, 353); \
-    t2 -= (tt*5749 + 2048) >> 12; \
-    /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t2, 2413, 2048, 354); \
-    tt += (t2*2413 + 2048) >> 12; \
-    /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(td, 4167, 4096, 355); \
-    ti -= (td*4167 + 4096) >> 13; \
-    /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(ti, 891, 512, 356); \
-    td += (ti*891 + 512) >> 10; \
-    /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(td, 4327, 16384, 357); \
-    ti += (td*4327 + 16384) >> 15; \
-    /* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 2261, 2048, 358); \
-    tl -= (ta*2261 + 2048) >> 12; \
-    /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tl, 2855, 2048, 359); \
-    ta += (tl*2855 + 2048) >> 12; \
-    /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(ta, 5417, 8192, 360); \
-    tl += (ta*5417 + 8192) >> 14; \
-    /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 3459, 2048, 361); \
-    t5 += (tq*3459 + 2048) >> 12; \
-    /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t5, 1545, 2048, 362); \
-    tq -= (t5*1545 + 2048) >> 12; \
-    /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tq, 1971, 1024, 363); \
-    t5 -= (tq*1971 + 1024) >> 11; \
-    /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 323, 256, 364); \
-    ts += (t3*323 + 256) >> 9; \
-    /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(ts, 5707, 2048, 365); \
-    t3 -= (ts*5707 + 2048) >> 12; \
-    /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t3, 2229, 2048, 366); \
-    ts += (t3*2229 + 2048) >> 12; \
-    /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 1061, 1024, 367); \
-    tj -= (tc*1061 + 1024) >> 11; \
-    /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tj, 6671, 4096, 368); \
-    tc += (tj*6671 + 4096) >> 13; \
-    /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tc, 6287, 16384, 369); \
-    tj += (tc*6287 + 16384) >> 15; \
-    /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 4359, 4096, 370); \
-    tk -= (tb*4359 + 4096) >> 13; \
-    /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tk, 3099, 2048, 371); \
-    tb += (tk*3099 + 2048) >> 12; \
-    /* -2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tb, 2109, 4096, 372); \
-    tk += (tb*2109 + 4096) >> 13; \
-    /* 5017/8192 ~= (1/Sqrt[2] - Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 5017, 4096, 373); \
-    tr += (t4*5017 + 4096) >> 13; \
-    /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tr, 1413, 512, 374); \
-    t4 -= (tr*1413 + 512) >> 10; \
-    /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t4, 8195, 8192, 375); \
-    tr += (t4*8195 + 8192) >> 14; \
-    /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 2373, 2048, 376); \
-    t9 += (tm*2373 + 2048) >> 12; \
-    /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t9, 5209, 4096, 377); \
-    tm -= (t9*5209 + 4096) >> 13; \
-    /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tm, 3391, 4096, 378); \
-    t9 -= (tm*3391 + 4096) >> 13; \
-    /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 1517, 1024, 379); \
-    tp -= (t6*1517 + 1024) >> 11; \
-    /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tp, 1817, 2048, 380); \
-    t6 += (tp*1817 + 2048) >> 12; \
-    /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t6, 6331, 4096, 381); \
-    tp += (t6*6331 + 4096) >> 13; \
-    /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(te, 515, 512, 382); \
-    th -= (te*515 + 512) >> 10; \
-    /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(th, 7567, 4096, 383); \
-    te += (th*7567 + 4096) >> 13; \
-    /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(te, 2513, 16384, 384); \
-    th += (te*2513 + 16384) >> 15; \
-    /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 2753, 2048, 385); \
-    tu += (t1*2753 + 2048) >> 12; \
-    /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(tu, 5777, 2048, 386); \
-    t1 -= (tu*5777 + 2048) >> 12; \
-    /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \
-    OD_DCT_OVERFLOW_CHECK(t1, 1301, 1024, 387); \
-    tu += (t1*1301 + 1024) >> 11; \
-  } \
-  while (0)
-
-#define OD_IDST_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \
-  tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \
-  /* Embedded 32-point asymmetric Type-IV iDST. */ \
-  do { \
-    int t0h; \
-    int t4h; \
-    int tbh; \
-    int tfh; \
-    int tgh; \
-    int tkh; \
-    int trh; \
-    int tvh; \
-    /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \
-    tf -= (tg*1301 + 1024) >> 11; \
-    /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \
-    tg += (tf*5777 + 2048) >> 12; \
-    /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \
-    tf -= (tg*2753 + 2048) >> 12; \
-    /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \
-    th -= (te*2513 + 16384) >> 15; \
-    /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \
-    te -= (th*7567 + 4096) >> 13; \
-    /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \
-    th += (te*515 + 512) >> 10; \
-    /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \
-    tj -= (tc*6331 + 4096) >> 13; \
-    /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \
-    tc -= (tj*1817 + 2048) >> 12; \
-    /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \
-    tj += (tc*1517 + 1024) >> 11; \
-    /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \
-    ti += (td*3391 + 4096) >> 13; \
-    /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \
-    td += (ti*5209 + 4096) >> 13; \
-    /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \
-    ti -= (td*2373 + 2048) >> 12; \
-    /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \
-    tr -= (t4*8195 + 8192) >> 14; \
-    /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \
-    t4 += (tr*1413 + 512) >> 10; \
-    /* 5017/8192 ~= (1/Sqrt[2] - Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \
-    tr -= (t4*5017 + 4096) >> 13; \
-    /* -2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \
-    t5 -= (tq*2109 + 4096) >> 13; \
-    /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \
-    tq -= (t5*3099 + 2048) >> 12; \
-    /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \
-    t5 += (tq*4359 + 4096) >> 13; \
-    /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \
-    tp -= (t6*6287 + 16384) >> 15; \
-    /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \
-    t6 -= (tp*6671 + 4096) >> 13; \
-    /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \
-    tp += (t6*1061 + 1024) >> 11; \
-    /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \
-    t7 -= (to*2229 + 2048) >> 12; \
-    /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \
-    to += (t7*5707 + 2048) >> 12; \
-    /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \
-    t7 -= (to*323 + 256) >> 9; \
-    /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \
-    tk += (tb*1971 + 1024) >> 11; \
-    /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \
-    tb += (tk*1545 + 2048) >> 12; \
-    /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \
-    tk -= (tb*3459 + 2048) >> 12; \
-    /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \
-    tl -= (ta*5417 + 8192) >> 14; \
-    /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \
-    ta -= (tl*2855 + 2048) >> 12; \
-    /* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \
-    tl += (ta*2261 + 2048) >> 12; \
-    /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \
-    t9 -= (tm*4327 + 16384) >> 15; \
-    /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \
-    tm -= (t9*891 + 512) >> 10; \
-    /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \
-    t9 += (tm*4167 + 4096) >> 13; \
-    /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \
-    tn -= (t8*2413 + 2048) >> 12; \
-    /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \
-    t8 += (tn*5749 + 2048) >> 12; \
-    /* 5331/8192 ~= (1/Sqrt[2] - Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \
-    tn -= (t8*5331 + 4096) >> 13; \
-    /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \
-    ts += (t3*2571 + 2048) >> 12; \
-    /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \
-    t3 += (ts*4169 + 4096) >> 13; \
-    /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \
-    ts -= (t3*5477 + 4096) >> 13; \
-    /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \
-    tt -= (t2*4187 + 4096) >> 13; \
-    /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \
-    t2 -= (tt*4695 + 4096) >> 13; \
-    /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \
-    tt += (t2*2527 + 2048) >> 12; \
-    /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \
-    t1 -= (tu*815 + 16384) >> 15; \
-    /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \
-    tu -= (t1*1997 + 1024) >> 11; \
-    /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \
-    t1 += (tu*4099 + 4096) >> 13; \
-    /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \
-    tv -= (t0*5593 + 4096) >> 13; \
-    /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */ \
-    t0 += (tv*5791 + 2048) >> 12; \
-    /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \
-    tv -= (t0*2847 + 2048) >> 12; \
-    \
-    t7 = -t7; \
-    tf = -tf; \
-    tn = -tn; \
-    tr = -tr; \
-    \
-    t7 -= OD_DCT_RSHIFT(t6, 1); \
-    t6 += t7; \
-    tp -= OD_DCT_RSHIFT(to, 1); \
-    to += tp; \
-    tr -= OD_DCT_RSHIFT(tq, 1); \
-    tq += tr; \
-    t5 -= OD_DCT_RSHIFT(t4, 1); \
-    t4 += t5; \
-    tt -= OD_DCT_RSHIFT(t3, 1); \
-    t3 += tt; \
-    ts -= OD_DCT_RSHIFT(t2, 1); \
-    t2 += ts; \
-    tv += OD_DCT_RSHIFT(tu, 1); \
-    tu -= tv; \
-    t1 -= OD_DCT_RSHIFT(t0, 1); \
-    t0 += t1; \
-    th -= OD_DCT_RSHIFT(tg, 1); \
-    tg += th; \
-    tf -= OD_DCT_RSHIFT(te, 1); \
-    te += tf; \
-    ti += OD_DCT_RSHIFT(tc, 1); \
-    tc -= ti; \
-    tj += OD_DCT_RSHIFT(td, 1); \
-    td -= tj; \
-    tn -= OD_DCT_RSHIFT(tm, 1); \
-    tm += tn; \
-    t9 -= OD_DCT_RSHIFT(t8, 1); \
-    t8 += t9; \
-    tl -= OD_DCT_RSHIFT(tb, 1); \
-    tb += tl; \
-    tk -= OD_DCT_RSHIFT(ta, 1); \
-    ta += tk; \
-    \
-    ti -= th; \
-    th += OD_DCT_RSHIFT(ti, 1); \
-    td -= te; \
-    te += OD_DCT_RSHIFT(td, 1); \
-    tm += tl; \
-    tl -= OD_DCT_RSHIFT(tm, 1); \
-    t9 += ta; \
-    ta -= OD_DCT_RSHIFT(t9, 1); \
-    tp += tq; \
-    tq -= OD_DCT_RSHIFT(tp, 1); \
-    t6 += t5; \
-    t5 -= OD_DCT_RSHIFT(t6, 1); \
-    t2 -= t1; \
-    t1 += OD_DCT_RSHIFT(t2, 1); \
-    tt -= tu; \
-    tu += OD_DCT_RSHIFT(tt, 1); \
-    tr += t7; \
-    trh = OD_DCT_RSHIFT(tr, 1); \
-    t7 -= trh; \
-    t4 -= to; \
-    t4h = OD_DCT_RSHIFT(t4, 1); \
-    to += t4h; \
-    t0 += t3; \
-    t0h = OD_DCT_RSHIFT(t0, 1); \
-    t3 -= t0h; \
-    tv += ts; \
-    tvh = OD_DCT_RSHIFT(tv, 1); \
-    ts -= tvh; \
-    tf -= tc; \
-    tfh = OD_DCT_RSHIFT(tf, 1); \
-    tc += tfh; \
-    tg += tj; \
-    tgh = OD_DCT_RSHIFT(tg, 1); \
-    tj -= tgh; \
-    tb -= t8; \
-    tbh = OD_DCT_RSHIFT(tb, 1); \
-    t8 += tbh; \
-    tk += tn; \
-    tkh = OD_DCT_RSHIFT(tk, 1); \
-    tn -= tkh; \
-    \
-    ta = -ta; \
-    tq = -tq; \
-    \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    te -= (th*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
-    th += (te*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    te -= (th*4861 + 16384) >> 15; \
-    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
-    tm -= (t9*513 + 1024) >> 11; \
-    /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \
-    t9 += (tm*7723 + 8192) >> 14; \
-    /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \
-    tm -= (t9*513 + 1024) >> 11; \
-    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    t6 -= (tp*2931 + 4096) >> 13; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
-    tp += (t6*5197 + 4096) >> 13; \
-    /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    t6 -= (tp*2931 + 4096) >> 13; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    tu -= (t1*805 + 8192) >> 14; \
-    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
-    t1 += (tu*803 + 4096) >> 13; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    tu -= (t1*805 + 8192) >> 14; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    ti -= (td*4861 + 16384) >> 15; \
-    /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \
-    td += (ti*1189 + 2048) >> 12; \
-    /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \
-    ti -= (td*4861 + 16384) >> 15; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
-    ta -= (tl*2455 + 2048) >> 12; \
-    /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \
-    tl += (ta*14449 + 8192) >> 14; \
-    /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \
-    ta -= (tl*2455 + 2048) >> 12; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    t5 -= (tq*11725 + 16384) >> 15; \
-    /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \
-    tq += (t5*5197 + 4096) >> 13; \
-    /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \
-    t5 -= (tq*11725 + 16384) >> 15; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    t2 -= (tt*805 + 8192) >> 14; \
-    /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \
-    tt += (t2*803 + 4096) >> 13; \
-    /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \
-    t2 -= (tt*805 + 8192) >> 14; \
-    \
-    tl = -tl; \
-    ti = -ti; \
-    \
-    th += OD_DCT_RSHIFT(t9, 1); \
-    t9 -= th; \
-    te -= OD_DCT_RSHIFT(tm, 1); \
-    tm += te; \
-    t1 += OD_DCT_RSHIFT(tp, 1); \
-    tp -= t1; \
-    tu -= OD_DCT_RSHIFT(t6, 1); \
-    t6 += tu; \
-    ta -= OD_DCT_RSHIFT(td, 1); \
-    td += ta; \
-    tl += OD_DCT_RSHIFT(ti, 1); \
-    ti -= tl; \
-    t5 += OD_DCT_RSHIFT(tt, 1); \
-    tt -= t5; \
-    tq += OD_DCT_RSHIFT(t2, 1); \
-    t2 -= tq; \
-    \
-    t8 -= tgh; \
-    tg += t8; \
-    tn += tfh; \
-    tf -= tn; \
-    t7 -= tvh; \
-    tv += t7; \
-    to -= t0h; \
-    t0 += to; \
-    tc += tbh; \
-    tb -= tc; \
-    tj += tkh; \
-    tk -= tj; \
-    ts += t4h; \
-    t4 -= ts; \
-    t3 += trh; \
-    tr -= t3; \
-    \
-    tk = -tk; \
-    \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    tc -= (tj*2485 + 4096) >> 13; \
-    /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \
-    tj += (tc*18205 + 16384) >> 15; \
-    /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \
-    tc -= (tj*2485 + 4096) >> 13; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    ts -= (t3*3227 + 16384) >> 15; \
-    /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \
-    t3 += (ts*6393 + 16384) >> 15; \
-    /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \
-    ts -= (t3*3227 + 16384) >> 15; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
-    tk -= (tb*17515 + 16384) >> 15; \
-    /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \
-    tb += (tk*13623 + 8192) >> 14; \
-    /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \
-    tk -= (tb*17515 + 16384) >> 15; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
-    t4 -= (tr*6723 + 4096) >> 13; \
-    /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \
-    tr += (t4*16069 + 8192) >> 14; \
-    /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \
-    t4 -= (tr*6723 + 4096) >> 13; \
-    \
-    t4 = -t4; \
-    \
-    tp += tm; \
-    tm -= OD_DCT_RSHIFT(tp, 1); \
-    t9 -= t6; \
-    t6 += OD_DCT_RSHIFT(t9, 1); \
-    th -= t1; \
-    t1 += OD_DCT_RSHIFT(th, 1); \
-    tu -= te; \
-    te += OD_DCT_RSHIFT(tu, 1); /* pass */ \
-    t5 -= tl; \
-    tl += OD_DCT_RSHIFT(t5, 1); \
-    ta += tq; \
-    tq -= OD_DCT_RSHIFT(ta, 1); \
-    td += tt; \
-    tt -= OD_DCT_RSHIFT(td, 1); \
-    t2 -= ti; \
-    ti += OD_DCT_RSHIFT(t2, 1); /* pass */ \
-    t7 += t8; \
-    t8 -= OD_DCT_RSHIFT(t7, 1); \
-    tn -= to; \
-    to += OD_DCT_RSHIFT(tn, 1); \
-    tf -= tv; \
-    tv += OD_DCT_RSHIFT(tf, 1); \
-    t0 += tg; \
-    tg -= OD_DCT_RSHIFT(t0, 1); /* pass */ \
-    tj -= t3; \
-    t3 += OD_DCT_RSHIFT(tj, 1); /* pass */ \
-    ts -= tc; \
-    tc += OD_DCT_RSHIFT(ts, 1); \
-    t4 -= tb; \
-    tb += OD_DCT_RSHIFT(t4, 1); /* pass */ \
-    tk -= tr; \
-    tr += OD_DCT_RSHIFT(tk, 1); \
-    \
-    t1 = -t1; \
-    t3 = -t3; \
-    t7 = -t7; \
-    t8 = -t8; \
-    tg = -tg; \
-    tm = -tm; \
-    to = -to; \
-    \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    tm -= (t9*14341 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t9 += (tm*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    tm -= (t9*4161 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    tp -= (t6*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t6 += (tp*15137 + 8192) >> 14; \
-    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    tp -= (t6*28681 + 16384) >> 15; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    th += (te*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    te += (th*11585 + 8192) >> 14; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    th -= (te*29957 + 16384) >> 15; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    tq -= (t5*14341 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t5 += (tq*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    tq -= (t5*4161 + 8192) >> 14; \
-    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
-    ta -= (tl*3259 + 4096) >> 13; \
-    /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \
-    tl += (ta*3135 + 8192) >> 14; \
-    /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \
-    ta -= (tl*3259 + 4096) >> 13; \
-    /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    ti -= (td*7489 + 4096) >> 13; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    td += (ti*11585 + 8192) >> 14; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    ti += (td*19195 + 16384) >> 15; \
-    /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    to -= (t7*14341 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t7 += (to*15137 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    to -= (t7*4161 + 8192) >> 14; \
-    /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \
-    tn -= (t8*4161 + 8192) >> 14; \
-    /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \
-    t8 += (tn*15137 + 8192) >> 14; \
-    /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \
-    tn -= (t8*28681 + 16384) >> 15; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    tf += (tg*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    tg += (tf*11585 + 8192) >> 14; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    tf -= (tg*29957 + 16384) >> 15; \
-    /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \
-    tj += (tc*19195 + 16384) >> 15; \
-    /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \
-    tc += (tj*11585 + 8192) >> 14; \
-    /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \
-    tj -= (tc*29957 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    tk += (tb*13573 + 8192) >> 14; \
-    /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \
-    tb -= (tk*11585 + 16384) >> 15; \
-    /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \
-    tk += (tb*13573 + 8192) >> 14; \
-    \
-    tf = -tf; \
-    \
-  } \
-  while (0)
-
-#define OD_FDCT_64(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \
-  us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \
-  ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \
-  ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \
-  /* Embedded 64-point orthonormal Type-II fDCT. */ \
-  do { \
-    int uwh; \
-    int uxh; \
-    int uyh; \
-    int uzh; \
-    int uAh; \
-    int uBh; \
-    int uCh; \
-    int uDh; \
-    int uEh; \
-    int uFh; \
-    int uGh; \
-    int uHh; \
-    int uIh; \
-    int uJh; \
-    int uKh; \
-    int uLh; \
-    int uMh; \
-    int uNh; \
-    int uOh; \
-    int uPh; \
-    int uQh; \
-    int uRh; \
-    int uSh; \
-    int uTh; \
-    int uUh; \
-    int uVh; \
-    int uWh; \
-    int uXh; \
-    int uYh; \
-    int uZh; \
-    int u_h; \
-    int uh_; \
-    u = u0 - u; \
-    uh_ = OD_DCT_RSHIFT(u, 1); \
-    u0 -= uh_; \
-    u_ += u1; \
-    u_h = OD_DCT_RSHIFT(u_, 1); \
-    u1 = u_h - u1; \
-    uZ = u2 - uZ; \
-    uZh = OD_DCT_RSHIFT(uZ, 1); \
-    u2 -= uZh; \
-    uY += u3; \
-    uYh = OD_DCT_RSHIFT(uY, 1); \
-    u3 = uYh - u3; \
-    uX = u4 - uX; \
-    uXh = OD_DCT_RSHIFT(uX, 1); \
-    u4 -= uXh; \
-    uW += u5; \
-    uWh = OD_DCT_RSHIFT(uW, 1); \
-    u5 = uWh - u5; \
-    uV = u6 - uV; \
-    uVh = OD_DCT_RSHIFT(uV, 1); \
-    u6 -= uVh; \
-    uU += u7; \
-    uUh = OD_DCT_RSHIFT(uU, 1); \
-    u7 = uUh - u7; \
-    uT = u8 - uT; \
-    uTh = OD_DCT_RSHIFT(uT, 1); \
-    u8 -= uTh; \
-    uS += u9; \
-    uSh = OD_DCT_RSHIFT(uS, 1); \
-    u9 = uSh - u9; \
-    uR = ua - uR; \
-    uRh = OD_DCT_RSHIFT(uR, 1); \
-    ua -= uRh; \
-    uQ += ub; \
-    uQh = OD_DCT_RSHIFT(uQ, 1); \
-    ub = uQh - ub; \
-    uP = uc - uP; \
-    uPh = OD_DCT_RSHIFT(uP, 1); \
-    uc -= uPh; \
-    uO += ud; \
-    uOh = OD_DCT_RSHIFT(uO, 1); \
-    ud = uOh - ud; \
-    uN = ue - uN; \
-    uNh = OD_DCT_RSHIFT(uN, 1); \
-    ue -= uNh; \
-    uM += uf; \
-    uMh = OD_DCT_RSHIFT(uM, 1); \
-    uf = uMh - uf; \
-    uL = ug - uL; \
-    uLh = OD_DCT_RSHIFT(uL, 1); \
-    ug -= uLh; \
-    uK += uh; \
-    uKh = OD_DCT_RSHIFT(uK, 1); \
-    uh = uKh - uh; \
-    uJ = ui - uJ; \
-    uJh = OD_DCT_RSHIFT(uJ, 1); \
-    ui -= uJh; \
-    uI += uj; \
-    uIh = OD_DCT_RSHIFT(uI, 1); \
-    uj = uIh - uj; \
-    uH = uk - uH; \
-    uHh = OD_DCT_RSHIFT(uH, 1); \
-    uk -= uHh; \
-    uG += ul; \
-    uGh = OD_DCT_RSHIFT(uG, 1); \
-    ul = uGh - ul; \
-    uF = um - uF; \
-    uFh = OD_DCT_RSHIFT(uF, 1); \
-    um -= uFh; \
-    uE += un; \
-    uEh = OD_DCT_RSHIFT(uE, 1); \
-    un = uEh - un; \
-    uD = uo - uD; \
-    uDh = OD_DCT_RSHIFT(uD, 1); \
-    uo -= uDh; \
-    uC += up; \
-    uCh = OD_DCT_RSHIFT(uC, 1); \
-    up = uCh - up; \
-    uB = uq - uB; \
-    uBh = OD_DCT_RSHIFT(uB, 1); \
-    uq -= uBh; \
-    uA += ur; \
-    uAh = OD_DCT_RSHIFT(uA, 1); \
-    ur = uAh - ur; \
-    uz = us - uz; \
-    uzh = OD_DCT_RSHIFT(uz, 1); \
-    us -= uzh; \
-    uy += ut; \
-    uyh = OD_DCT_RSHIFT(uy, 1); \
-    ut = uyh - ut; \
-    ux = uu - ux; \
-    uxh = OD_DCT_RSHIFT(ux, 1); \
-    uu -= uxh; \
-    uw += uv; \
-    uwh = OD_DCT_RSHIFT(uw, 1); \
-    uv = uwh - uv; \
-    OD_FDCT_32_ASYM(u0, uw, uwh, ug, uM, uMh, u8, uE, uEh, uo, uU, uUh, \
-      u4, uA, uAh, uk, uQ, uQh, uc, uI, uIh, us, uY, uYh, u2, uy, uyh, \
-      ui, uO, uOh, ua, uG, uGh, uq, uW, uWh, u6, uC, uCh, um, uS, uSh, \
-      ue, uK, uKh, uu, u_, u_h); \
-    OD_FDST_32_ASYM(u, uv, uL, uf, uT, un, uD, u7, uX, ur, uH, ub, uP, uj, \
-      uz, u3, uZ, ut, uJ, ud, uR, ul, uB, u5, uV, up, uF, u9, uN, uh, ux, u1); \
-  } \
-  while (0)
-
-#define OD_IDCT_64(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \
-  us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \
-  ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \
-  ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \
-  /* Embedded 64-point orthonormal Type-II fDCT. */ \
-  do { \
-    int u1h; \
-    int u3h; \
-    int u5h; \
-    int u7h; \
-    int u9h; \
-    int ubh; \
-    int udh; \
-    int ufh; \
-    int uhh; \
-    int ujh; \
-    int ulh; \
-    int unh; \
-    int uph; \
-    int urh; \
-    int uth; \
-    int uvh; \
-    int uxh; \
-    int uzh; \
-    int uBh; \
-    int uDh; \
-    int uFh; \
-    int uHh; \
-    int uJh; \
-    int uLh; \
-    int uNh; \
-    int uPh; \
-    int uRh; \
-    int uTh; \
-    int uVh; \
-    int uXh; \
-    int uZh; \
-    int uh_; \
-    OD_IDST_32_ASYM(u, uL, uT, uD, uX, uH, uP, uz, uZ, uJ, uR, uB, uV, uF, \
-      uN, ux, u_, uK, uS, uC, uW, uG, uO, uy, uY, uI, uQ, uA, uU, uE, uM, uw); \
-    OD_IDCT_32_ASYM(u0, ug, u8, uo, u4, uk, uc, us, u2, ui, ua, uq, u6, um, \
-      ue, uu, u1, u1h, uh, uhh, u9, u9h, up, uph, u5, u5h, ul, ulh, ud, udh, \
-      ut, uth, u3, u3h, uj, ujh, ub, ubh, ur, urh, u7, u7h, un, unh, uf, ufh, \
-      uv, uvh); \
-    uh_ = OD_DCT_RSHIFT(u, 1); \
-    u0 += uh_; \
-    u = u0 - u; \
-    u_ = u1h - u_; \
-    u1 -= u_; \
-    uZh = OD_DCT_RSHIFT(uZ, 1); \
-    u2 += uZh; \
-    uZ = u2 - uZ; \
-    uY = u3h - uY; \
-    u3 -= uY; \
-    uXh = OD_DCT_RSHIFT(uX, 1); \
-    u4 += uXh; \
-    uX = u4 - uX; \
-    uW = u5h - uW; \
-    u5 -= uW; \
-    uVh = OD_DCT_RSHIFT(uV, 1); \
-    u6 += uVh; \
-    uV = u6 - uV; \
-    uU = u7h - uU; \
-    u7 -= uU; \
-    uTh = OD_DCT_RSHIFT(uT, 1); \
-    u8 += uTh; \
-    uT = u8 - uT; \
-    uS = u9h - uS; \
-    u9 -= uS; \
-    uRh = OD_DCT_RSHIFT(uR, 1); \
-    ua += uRh; \
-    uR = ua - uR; \
-    uQ = ubh - uQ; \
-    ub -= uQ; \
-    uPh = OD_DCT_RSHIFT(uP, 1); \
-    uc += uPh; \
-    uP = uc - uP; \
-    uO = udh - uO; \
-    ud -= uO; \
-    uNh = OD_DCT_RSHIFT(uN, 1); \
-    ue += uNh; \
-    uN = ue - uN; \
-    uM = ufh - uM; \
-    uf -= uM; \
-    uLh = OD_DCT_RSHIFT(uL, 1); \
-    ug += uLh; \
-    uL = ug - uL; \
-    uK = uhh - uK; \
-    uh -= uK; \
-    uJh = OD_DCT_RSHIFT(uJ, 1); \
-    ui += uJh; \
-    uJ = ui - uJ; \
-    uI = ujh - uI; \
-    uj -= uI; \
-    uHh = OD_DCT_RSHIFT(uH, 1); \
-    uk += uHh; \
-    uH = uk - uH; \
-    uG = ulh - uG; \
-    ul -= uG; \
-    uFh = OD_DCT_RSHIFT(uF, 1); \
-    um += uFh; \
-    uF = um - uF; \
-    uE = unh - uE; \
-    un -= uE; \
-    uDh = OD_DCT_RSHIFT(uD, 1); \
-    uo += uDh; \
-    uD = uo - uD; \
-    uC = uph - uC; \
-    up -= uC; \
-    uBh = OD_DCT_RSHIFT(uB, 1); \
-    uq += uBh; \
-    uB = uq - uB; \
-    uA = urh - uA; \
-    ur -= uA; \
-    uzh = OD_DCT_RSHIFT(uz, 1); \
-    us += uzh; \
-    uz = us - uz; \
-    uy = uth - uy; \
-    ut -= uy; \
-    uxh = OD_DCT_RSHIFT(ux, 1); \
-    uu += uxh; \
-    ux = uu - ux; \
-    uw = uvh - uw; \
-    uv -= uw; \
-  } while (0)
-#endif
-
-void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride) {
-  int q0;
-  int q1;
-  int q2;
-  int q3;
-  q0 = x[0*xstride];
-  q2 = x[1*xstride];
-  q1 = x[2*xstride];
-  q3 = x[3*xstride];
-  OD_FDCT_4(q0, q2, q1, q3);
-  y[0] = (od_coeff)q0;
-  y[1] = (od_coeff)q1;
-  y[2] = (od_coeff)q2;
-  y[3] = (od_coeff)q3;
-}
-
-void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]) {
-  int q0;
-  int q1;
-  int q2;
-  int q3;
-  q0 = y[0];
-  q2 = y[1];
-  q1 = y[2];
-  q3 = y[3];
-  OD_IDCT_4(q0, q2, q1, q3);
-  x[0*xstride] = q0;
-  x[1*xstride] = q1;
-  x[2*xstride] = q2;
-  x[3*xstride] = q3;
-}
-
-void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride) {
-  int q0;
-  int q1;
-  int q2;
-  int q3;
-  q0 = x[3*xstride];
-  q2 = x[2*xstride];
-  q1 = x[1*xstride];
-  q3 = x[0*xstride];
-  OD_FDST_4(q0, q2, q1, q3);
-  y[0] = (od_coeff)q3;
-  y[1] = (od_coeff)q2;
-  y[2] = (od_coeff)q1;
-  y[3] = (od_coeff)q0;
-}
-
-void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]) {
-  int q0;
-  int q1;
-  int q2;
-  int q3;
-  q0 = y[3];
-  q2 = y[2];
-  q1 = y[1];
-  q3 = y[0];
-  OD_IDST_4(q0, q2, q1, q3);
-  x[0*xstride] = q3;
-  x[1*xstride] = q2;
-  x[2*xstride] = q1;
-  x[3*xstride] = q0;
-}
-
-void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride) {
-  int r0;
-  int r1;
-  int r2;
-  int r3;
-  int r4;
-  int r5;
-  int r6;
-  int r7;
-  r0 = x[0*xstride];
-  r4 = x[1*xstride];
-  r2 = x[2*xstride];
-  r6 = x[3*xstride];
-  r1 = x[4*xstride];
-  r5 = x[5*xstride];
-  r3 = x[6*xstride];
-  r7 = x[7*xstride];
-  OD_FDCT_8(r0, r4, r2, r6, r1, r5, r3, r7);
-  y[0] = (od_coeff)r0;
-  y[1] = (od_coeff)r1;
-  y[2] = (od_coeff)r2;
-  y[3] = (od_coeff)r3;
-  y[4] = (od_coeff)r4;
-  y[5] = (od_coeff)r5;
-  y[6] = (od_coeff)r6;
-  y[7] = (od_coeff)r7;
-}
-
-void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]) {
-  int r0;
-  int r1;
-  int r2;
-  int r3;
-  int r4;
-  int r5;
-  int r6;
-  int r7;
-  r0 = y[0];
-  r4 = y[1];
-  r2 = y[2];
-  r6 = y[3];
-  r1 = y[4];
-  r5 = y[5];
-  r3 = y[6];
-  r7 = y[7];
-  OD_IDCT_8(r0, r4, r2, r6, r1, r5, r3, r7);
-  x[0*xstride] = (od_coeff)r0;
-  x[1*xstride] = (od_coeff)r1;
-  x[2*xstride] = (od_coeff)r2;
-  x[3*xstride] = (od_coeff)r3;
-  x[4*xstride] = (od_coeff)r4;
-  x[5*xstride] = (od_coeff)r5;
-  x[6*xstride] = (od_coeff)r6;
-  x[7*xstride] = (od_coeff)r7;
-}
-
-void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride) {
-  int r0;
-  int r1;
-  int r2;
-  int r3;
-  int r4;
-  int r5;
-  int r6;
-  int r7;
-  r0 = x[0*xstride];
-  r4 = x[1*xstride];
-  r2 = x[2*xstride];
-  r6 = x[3*xstride];
-  r1 = x[4*xstride];
-  r5 = x[5*xstride];
-  r3 = x[6*xstride];
-  r7 = x[7*xstride];
-  OD_FDST_8(r0, r4, r2, r6, r1, r5, r3, r7);
-  y[0] = (od_coeff)r0;
-  y[1] = (od_coeff)r1;
-  y[2] = (od_coeff)r2;
-  y[3] = (od_coeff)r3;
-  y[4] = (od_coeff)r4;
-  y[5] = (od_coeff)r5;
-  y[6] = (od_coeff)r6;
-  y[7] = (od_coeff)r7;
-}
-
-void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]) {
-  int r0;
-  int r1;
-  int r2;
-  int r3;
-  int r4;
-  int r5;
-  int r6;
-  int r7;
-  r0 = y[0];
-  r4 = y[1];
-  r2 = y[2];
-  r6 = y[3];
-  r1 = y[4];
-  r5 = y[5];
-  r3 = y[6];
-  r7 = y[7];
-  OD_IDST_8(r0, r4, r2, r6, r1, r5, r3, r7);
-  x[0*xstride] = (od_coeff)r0;
-  x[1*xstride] = (od_coeff)r1;
-  x[2*xstride] = (od_coeff)r2;
-  x[3*xstride] = (od_coeff)r3;
-  x[4*xstride] = (od_coeff)r4;
-  x[5*xstride] = (od_coeff)r5;
-  x[6*xstride] = (od_coeff)r6;
-  x[7*xstride] = (od_coeff)r7;
-}
-
-void od_bin_fdct16(od_coeff y[16], const od_coeff *x, int xstride) {
-  int s0;
-  int s1;
-  int s2;
-  int s3;
-  int s4;
-  int s5;
-  int s6;
-  int s7;
-  int s8;
-  int s9;
-  int sa;
-  int sb;
-  int sc;
-  int sd;
-  int se;
-  int sf;
-  s0 = x[0*xstride];
-  s8 = x[1*xstride];
-  s4 = x[2*xstride];
-  sc = x[3*xstride];
-  s2 = x[4*xstride];
-  sa = x[5*xstride];
-  s6 = x[6*xstride];
-  se = x[7*xstride];
-  s1 = x[8*xstride];
-  s9 = x[9*xstride];
-  s5 = x[10*xstride];
-  sd = x[11*xstride];
-  s3 = x[12*xstride];
-  sb = x[13*xstride];
-  s7 = x[14*xstride];
-  sf = x[15*xstride];
-  OD_FDCT_16(s0, s8, s4, sc, s2, sa, s6, se, s1, s9, s5, sd, s3, sb, s7, sf);
-  y[0] = (od_coeff)s0;
-  y[1] = (od_coeff)s1;
-  y[2] = (od_coeff)s2;
-  y[3] = (od_coeff)s3;
-  y[4] = (od_coeff)s4;
-  y[5] = (od_coeff)s5;
-  y[6] = (od_coeff)s6;
-  y[7] = (od_coeff)s7;
-  y[8] = (od_coeff)s8;
-  y[9] = (od_coeff)s9;
-  y[10] = (od_coeff)sa;
-  y[11] = (od_coeff)sb;
-  y[12] = (od_coeff)sc;
-  y[13] = (od_coeff)sd;
-  y[14] = (od_coeff)se;
-  y[15] = (od_coeff)sf;
-}
-
-void od_bin_idct16(od_coeff *x, int xstride, const od_coeff y[16]) {
-  int s0;
-  int s1;
-  int s2;
-  int s3;
-  int s4;
-  int s5;
-  int s6;
-  int s7;
-  int s8;
-  int s9;
-  int sa;
-  int sb;
-  int sc;
-  int sd;
-  int se;
-  int sf;
-  s0 = y[0];
-  s8 = y[1];
-  s4 = y[2];
-  sc = y[3];
-  s2 = y[4];
-  sa = y[5];
-  s6 = y[6];
-  se = y[7];
-  s1 = y[8];
-  s9 = y[9];
-  s5 = y[10];
-  sd = y[11];
-  s3 = y[12];
-  sb = y[13];
-  s7 = y[14];
-  sf = y[15];
-  OD_IDCT_16(s0, s8, s4, sc, s2, sa, s6, se, s1, s9, s5, sd, s3, sb, s7, sf);
-  x[0*xstride] = (od_coeff)s0;
-  x[1*xstride] = (od_coeff)s1;
-  x[2*xstride] = (od_coeff)s2;
-  x[3*xstride] = (od_coeff)s3;
-  x[4*xstride] = (od_coeff)s4;
-  x[5*xstride] = (od_coeff)s5;
-  x[6*xstride] = (od_coeff)s6;
-  x[7*xstride] = (od_coeff)s7;
-  x[8*xstride] = (od_coeff)s8;
-  x[9*xstride] = (od_coeff)s9;
-  x[10*xstride] = (od_coeff)sa;
-  x[11*xstride] = (od_coeff)sb;
-  x[12*xstride] = (od_coeff)sc;
-  x[13*xstride] = (od_coeff)sd;
-  x[14*xstride] = (od_coeff)se;
-  x[15*xstride] = (od_coeff)sf;
-}
-
-void od_bin_fdst16(od_coeff y[16], const od_coeff *x, int xstride) {
-  int s0;
-  int s1;
-  int s2;
-  int s3;
-  int s4;
-  int s5;
-  int s6;
-  int s7;
-  int s8;
-  int s9;
-  int sa;
-  int sb;
-  int sc;
-  int sd;
-  int se;
-  int sf;
-  s0 = x[15*xstride];
-  s8 = x[14*xstride];
-  s4 = x[13*xstride];
-  sc = x[12*xstride];
-  s2 = x[11*xstride];
-  sa = x[10*xstride];
-  s6 = x[9*xstride];
-  se = x[8*xstride];
-  s1 = x[7*xstride];
-  s9 = x[6*xstride];
-  s5 = x[5*xstride];
-  sd = x[4*xstride];
-  s3 = x[3*xstride];
-  sb = x[2*xstride];
-  s7 = x[1*xstride];
-  sf = x[0*xstride];
-  OD_FDST_16(s0, s8, s4, sc, s2, sa, s6, se, s1, s9, s5, sd, s3, sb, s7, sf);
-  y[0] = (od_coeff)sf;
-  y[1] = (od_coeff)se;
-  y[2] = (od_coeff)sd;
-  y[3] = (od_coeff)sc;
-  y[4] = (od_coeff)sb;
-  y[5] = (od_coeff)sa;
-  y[6] = (od_coeff)s9;
-  y[7] = (od_coeff)s8;
-  y[8] = (od_coeff)s7;
-  y[9] = (od_coeff)s6;
-  y[10] = (od_coeff)s5;
-  y[11] = (od_coeff)s4;
-  y[12] = (od_coeff)s3;
-  y[13] = (od_coeff)s2;
-  y[14] = (od_coeff)s1;
-  y[15] = (od_coeff)s0;
-}
-
-void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]) {
-  int s0;
-  int s1;
-  int s2;
-  int s3;
-  int s4;
-  int s5;
-  int s6;
-  int s7;
-  int s8;
-  int s9;
-  int sa;
-  int sb;
-  int sc;
-  int sd;
-  int se;
-  int sf;
-  s0 = y[15];
-  s8 = y[14];
-  s4 = y[13];
-  sc = y[12];
-  s2 = y[11];
-  sa = y[10];
-  s6 = y[9];
-  se = y[8];
-  s1 = y[7];
-  s9 = y[6];
-  s5 = y[5];
-  sd = y[4];
-  s3 = y[3];
-  sb = y[2];
-  s7 = y[1];
-  sf = y[0];
-  OD_IDST_16(s0, s8, s4, sc, s2, sa, s6, se, s1, s9, s5, sd, s3, sb, s7, sf);
-  x[0*xstride] = (od_coeff)sf;
-  x[1*xstride] = (od_coeff)se;
-  x[2*xstride] = (od_coeff)sd;
-  x[3*xstride] = (od_coeff)sc;
-  x[4*xstride] = (od_coeff)sb;
-  x[5*xstride] = (od_coeff)sa;
-  x[6*xstride] = (od_coeff)s9;
-  x[7*xstride] = (od_coeff)s8;
-  x[8*xstride] = (od_coeff)s7;
-  x[9*xstride] = (od_coeff)s6;
-  x[10*xstride] = (od_coeff)s5;
-  x[11*xstride] = (od_coeff)s4;
-  x[12*xstride] = (od_coeff)s3;
-  x[13*xstride] = (od_coeff)s2;
-  x[14*xstride] = (od_coeff)s1;
-  x[15*xstride] = (od_coeff)s0;
-}
-
-void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride) {
-  /*215 adds, 38 shifts, 87 "muls".*/
-  int t0;
-  int t1;
-  int t2;
-  int t3;
-  int t4;
-  int t5;
-  int t6;
-  int t7;
-  int t8;
-  int t9;
-  int ta;
-  int tb;
-  int tc;
-  int td;
-  int te;
-  int tf;
-  int tg;
-  int th;
-  int ti;
-  int tj;
-  int tk;
-  int tl;
-  int tm;
-  int tn;
-  int to;
-  int tp;
-  int tq;
-  int tr;
-  int ts;
-  int tt;
-  int tu;
-  int tv;
-  t0 = x[0*xstride];
-  tg = x[1*xstride];
-  t8 = x[2*xstride];
-  to = x[3*xstride];
-  t4 = x[4*xstride];
-  tk = x[5*xstride];
-  tc = x[6*xstride];
-  ts = x[7*xstride];
-  t2 = x[8*xstride];
-  ti = x[9*xstride];
-  ta = x[10*xstride];
-  tq = x[11*xstride];
-  t6 = x[12*xstride];
-  tm = x[13*xstride];
-  te = x[14*xstride];
-  tu = x[15*xstride];
-  t1 = x[16*xstride];
-  th = x[17*xstride];
-  t9 = x[18*xstride];
-  tp = x[19*xstride];
-  t5 = x[20*xstride];
-  tl = x[21*xstride];
-  td = x[22*xstride];
-  tt = x[23*xstride];
-  t3 = x[24*xstride];
-  tj = x[25*xstride];
-  tb = x[26*xstride];
-  tr = x[27*xstride];
-  t7 = x[28*xstride];
-  tn = x[29*xstride];
-  tf = x[30*xstride];
-  tv = x[31*xstride];
-  OD_FDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
-    t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);
-  y[0] = (od_coeff)t0;
-  y[1] = (od_coeff)t1;
-  y[2] = (od_coeff)t2;
-  y[3] = (od_coeff)t3;
-  y[4] = (od_coeff)t4;
-  y[5] = (od_coeff)t5;
-  y[6] = (od_coeff)t6;
-  y[7] = (od_coeff)t7;
-  y[8] = (od_coeff)t8;
-  y[9] = (od_coeff)t9;
-  y[10] = (od_coeff)ta;
-  y[11] = (od_coeff)tb;
-  y[12] = (od_coeff)tc;
-  y[13] = (od_coeff)td;
-  y[14] = (od_coeff)te;
-  y[15] = (od_coeff)tf;
-  y[16] = (od_coeff)tg;
-  y[17] = (od_coeff)th;
-  y[18] = (od_coeff)ti;
-  y[19] = (od_coeff)tj;
-  y[20] = (od_coeff)tk;
-  y[21] = (od_coeff)tl;
-  y[22] = (od_coeff)tm;
-  y[23] = (od_coeff)tn;
-  y[24] = (od_coeff)to;
-  y[25] = (od_coeff)tp;
-  y[26] = (od_coeff)tq;
-  y[27] = (od_coeff)tr;
-  y[28] = (od_coeff)ts;
-  y[29] = (od_coeff)tt;
-  y[30] = (od_coeff)tu;
-  y[31] = (od_coeff)tv;
-}
-
-void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]) {
-  int t0;
-  int t1;
-  int t2;
-  int t3;
-  int t4;
-  int t5;
-  int t6;
-  int t7;
-  int t8;
-  int t9;
-  int ta;
-  int tb;
-  int tc;
-  int td;
-  int te;
-  int tf;
-  int tg;
-  int th;
-  int ti;
-  int tj;
-  int tk;
-  int tl;
-  int tm;
-  int tn;
-  int to;
-  int tp;
-  int tq;
-  int tr;
-  int ts;
-  int tt;
-  int tu;
-  int tv;
-  t0 = y[0];
-  tg = y[1];
-  t8 = y[2];
-  to = y[3];
-  t4 = y[4];
-  tk = y[5];
-  tc = y[6];
-  ts = y[7];
-  t2 = y[8];
-  ti = y[9];
-  ta = y[10];
-  tq = y[11];
-  t6 = y[12];
-  tm = y[13];
-  te = y[14];
-  tu = y[15];
-  t1 = y[16];
-  th = y[17];
-  t9 = y[18];
-  tp = y[19];
-  t5 = y[20];
-  tl = y[21];
-  td = y[22];
-  tt = y[23];
-  t3 = y[24];
-  tj = y[25];
-  tb = y[26];
-  tr = y[27];
-  t7 = y[28];
-  tn = y[29];
-  tf = y[30];
-  tv = y[31];
-  OD_IDCT_32(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, tm, te, tu,
-    t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv);
-  x[0*xstride] = (od_coeff)t0;
-  x[1*xstride] = (od_coeff)t1;
-  x[2*xstride] = (od_coeff)t2;
-  x[3*xstride] = (od_coeff)t3;
-  x[4*xstride] = (od_coeff)t4;
-  x[5*xstride] = (od_coeff)t5;
-  x[6*xstride] = (od_coeff)t6;
-  x[7*xstride] = (od_coeff)t7;
-  x[8*xstride] = (od_coeff)t8;
-  x[9*xstride] = (od_coeff)t9;
-  x[10*xstride] = (od_coeff)ta;
-  x[11*xstride] = (od_coeff)tb;
-  x[12*xstride] = (od_coeff)tc;
-  x[13*xstride] = (od_coeff)td;
-  x[14*xstride] = (od_coeff)te;
-  x[15*xstride] = (od_coeff)tf;
-  x[16*xstride] = (od_coeff)tg;
-  x[17*xstride] = (od_coeff)th;
-  x[18*xstride] = (od_coeff)ti;
-  x[19*xstride] = (od_coeff)tj;
-  x[20*xstride] = (od_coeff)tk;
-  x[21*xstride] = (od_coeff)tl;
-  x[22*xstride] = (od_coeff)tm;
-  x[23*xstride] = (od_coeff)tn;
-  x[24*xstride] = (od_coeff)to;
-  x[25*xstride] = (od_coeff)tp;
-  x[26*xstride] = (od_coeff)tq;
-  x[27*xstride] = (od_coeff)tr;
-  x[28*xstride] = (od_coeff)ts;
-  x[29*xstride] = (od_coeff)tt;
-  x[30*xstride] = (od_coeff)tu;
-  x[31*xstride] = (od_coeff)tv;
-}
-
-#if CONFIG_TX64X64
-void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride) {
-  int t0;
-  int t1;
-  int t2;
-  int t3;
-  int t4;
-  int t5;
-  int t6;
-  int t7;
-  int t8;
-  int t9;
-  int ta;
-  int tb;
-  int tc;
-  int td;
-  int te;
-  int tf;
-  int tg;
-  int th;
-  int ti;
-  int tj;
-  int tk;
-  int tl;
-  int tm;
-  int tn;
-  int to;
-  int tp;
-  int tq;
-  int tr;
-  int ts;
-  int tt;
-  int tu;
-  int tv;
-  int tw;
-  int tx;
-  int ty;
-  int tz;
-  int tA;
-  int tB;
-  int tC;
-  int tD;
-  int tE;
-  int tF;
-  int tG;
-  int tH;
-  int tI;
-  int tJ;
-  int tK;
-  int tL;
-  int tM;
-  int tN;
-  int tO;
-  int tP;
-  int tQ;
-  int tR;
-  int tS;
-  int tT;
-  int tU;
-  int tV;
-  int tW;
-  int tX;
-  int tY;
-  int tZ;
-  int t_;
-  int t;
-  t0 = x[0*xstride];
-  tw = x[1*xstride];
-  tg = x[2*xstride];
-  tM = x[3*xstride];
-  t8 = x[4*xstride];
-  tE = x[5*xstride];
-  to = x[6*xstride];
-  tU = x[7*xstride];
-  t4 = x[8*xstride];
-  tA = x[9*xstride];
-  tk = x[10*xstride];
-  tQ = x[11*xstride];
-  tc = x[12*xstride];
-  tI = x[13*xstride];
-  ts = x[14*xstride];
-  tY = x[15*xstride];
-  t2 = x[16*xstride];
-  ty = x[17*xstride];
-  ti = x[18*xstride];
-  tO = x[19*xstride];
-  ta = x[20*xstride];
-  tG = x[21*xstride];
-  tq = x[22*xstride];
-  tW = x[23*xstride];
-  t6 = x[24*xstride];
-  tC = x[25*xstride];
-  tm = x[26*xstride];
-  tS = x[27*xstride];
-  te = x[28*xstride];
-  tK = x[29*xstride];
-  tu = x[30*xstride];
-  t_ = x[31*xstride];
-  t1 = x[32*xstride];
-  tx = x[33*xstride];
-  th = x[34*xstride];
-  tN = x[35*xstride];
-  t9 = x[36*xstride];
-  tF = x[37*xstride];
-  tp = x[38*xstride];
-  tV = x[39*xstride];
-  t5 = x[40*xstride];
-  tB = x[41*xstride];
-  tl = x[42*xstride];
-  tR = x[43*xstride];
-  td = x[44*xstride];
-  tJ = x[45*xstride];
-  tt = x[46*xstride];
-  tZ = x[47*xstride];
-  t3 = x[48*xstride];
-  tz = x[49*xstride];
-  tj = x[50*xstride];
-  tP = x[51*xstride];
-  tb = x[52*xstride];
-  tH = x[53*xstride];
-  tr = x[54*xstride];
-  tX = x[55*xstride];
-  t7 = x[56*xstride];
-  tD = x[57*xstride];
-  tn = x[58*xstride];
-  tT = x[59*xstride];
-  tf = x[60*xstride];
-  tL = x[61*xstride];
-  tv = x[62*xstride];
-  t = x[63*xstride];
-  OD_FDCT_64(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY,
-    t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx,
-    th, tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP,
-    tb, tH, tr, tX, t7, tD, tn, tT, tf, tL, tv, t);
-  y[0] = (od_coeff)t0;
-  y[1] = (od_coeff)t1;
-  y[2] = (od_coeff)t2;
-  y[3] = (od_coeff)t3;
-  y[4] = (od_coeff)t4;
-  y[5] = (od_coeff)t5;
-  y[6] = (od_coeff)t6;
-  y[7] = (od_coeff)t7;
-  y[8] = (od_coeff)t8;
-  y[9] = (od_coeff)t9;
-  y[10] = (od_coeff)ta;
-  y[11] = (od_coeff)tb;
-  y[12] = (od_coeff)tc;
-  y[13] = (od_coeff)td;
-  y[14] = (od_coeff)te;
-  y[15] = (od_coeff)tf;
-  y[16] = (od_coeff)tg;
-  y[17] = (od_coeff)th;
-  y[18] = (od_coeff)ti;
-  y[19] = (od_coeff)tj;
-  y[20] = (od_coeff)tk;
-  y[21] = (od_coeff)tl;
-  y[22] = (od_coeff)tm;
-  y[23] = (od_coeff)tn;
-  y[24] = (od_coeff)to;
-  y[25] = (od_coeff)tp;
-  y[26] = (od_coeff)tq;
-  y[27] = (od_coeff)tr;
-  y[28] = (od_coeff)ts;
-  y[29] = (od_coeff)tt;
-  y[30] = (od_coeff)tu;
-  y[31] = (od_coeff)tv;
-  y[32] = (od_coeff)tw;
-  y[33] = (od_coeff)tx;
-  y[34] = (od_coeff)ty;
-  y[35] = (od_coeff)tz;
-  y[36] = (od_coeff)tA;
-  y[37] = (od_coeff)tB;
-  y[38] = (od_coeff)tC;
-  y[39] = (od_coeff)tD;
-  y[40] = (od_coeff)tE;
-  y[41] = (od_coeff)tF;
-  y[41] = (od_coeff)tF;
-  y[42] = (od_coeff)tG;
-  y[43] = (od_coeff)tH;
-  y[44] = (od_coeff)tI;
-  y[45] = (od_coeff)tJ;
-  y[46] = (od_coeff)tK;
-  y[47] = (od_coeff)tL;
-  y[48] = (od_coeff)tM;
-  y[49] = (od_coeff)tN;
-  y[50] = (od_coeff)tO;
-  y[51] = (od_coeff)tP;
-  y[52] = (od_coeff)tQ;
-  y[53] = (od_coeff)tR;
-  y[54] = (od_coeff)tS;
-  y[55] = (od_coeff)tT;
-  y[56] = (od_coeff)tU;
-  y[57] = (od_coeff)tV;
-  y[58] = (od_coeff)tW;
-  y[59] = (od_coeff)tX;
-  y[60] = (od_coeff)tY;
-  y[61] = (od_coeff)tZ;
-  y[62] = (od_coeff)t_;
-  y[63] = (od_coeff)t;
-}
-
-void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]) {
-  int t0;
-  int t1;
-  int t2;
-  int t3;
-  int t4;
-  int t5;
-  int t6;
-  int t7;
-  int t8;
-  int t9;
-  int ta;
-  int tb;
-  int tc;
-  int td;
-  int te;
-  int tf;
-  int tg;
-  int th;
-  int ti;
-  int tj;
-  int tk;
-  int tl;
-  int tm;
-  int tn;
-  int to;
-  int tp;
-  int tq;
-  int tr;
-  int ts;
-  int tt;
-  int tu;
-  int tv;
-  int tw;
-  int tx;
-  int ty;
-  int tz;
-  int tA;
-  int tB;
-  int tC;
-  int tD;
-  int tE;
-  int tF;
-  int tG;
-  int tH;
-  int tI;
-  int tJ;
-  int tK;
-  int tL;
-  int tM;
-  int tN;
-  int tO;
-  int tP;
-  int tQ;
-  int tR;
-  int tS;
-  int tT;
-  int tU;
-  int tV;
-  int tW;
-  int tX;
-  int tY;
-  int tZ;
-  int t_;
-  int t;
-  t0 = y[0];
-  tw = y[1];
-  tg = y[2];
-  tM = y[3];
-  t8 = y[4];
-  tE = y[5];
-  to = y[6];
-  tU = y[7];
-  t4 = y[8];
-  tA = y[9];
-  tk = y[10];
-  tQ = y[11];
-  tc = y[12];
-  tI = y[13];
-  ts = y[14];
-  tY = y[15];
-  t2 = y[16];
-  ty = y[17];
-  ti = y[18];
-  tO = y[19];
-  ta = y[20];
-  tG = y[21];
-  tq = y[22];
-  tW = y[23];
-  t6 = y[24];
-  tC = y[25];
-  tm = y[26];
-  tS = y[27];
-  te = y[28];
-  tK = y[29];
-  tu = y[30];
-  t_ = y[31];
-  t1 = y[32];
-  tx = y[33];
-  th = y[34];
-  tN = y[35];
-  t9 = y[36];
-  tF = y[37];
-  tp = y[38];
-  tV = y[39];
-  t5 = y[40];
-  tB = y[41];
-  tl = y[42];
-  tR = y[43];
-  td = y[44];
-  tJ = y[45];
-  tt = y[46];
-  tZ = y[47];
-  t3 = y[48];
-  tz = y[49];
-  tj = y[50];
-  tP = y[51];
-  tb = y[52];
-  tH = y[53];
-  tr = y[54];
-  tX = y[55];
-  t7 = y[56];
-  tD = y[57];
-  tn = y[58];
-  tT = y[59];
-  tf = y[60];
-  tL = y[61];
-  tv = y[62];
-  t = y[63];
-  OD_IDCT_64(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY,
-    t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx,
-    th, tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP,
-    tb, tH, tr, tX, t7, tD, tn, tT, tf, tL, tv, t);
-  x[0*xstride] = (od_coeff)t0;
-  x[1*xstride] = (od_coeff)t1;
-  x[2*xstride] = (od_coeff)t2;
-  x[3*xstride] = (od_coeff)t3;
-  x[4*xstride] = (od_coeff)t4;
-  x[5*xstride] = (od_coeff)t5;
-  x[6*xstride] = (od_coeff)t6;
-  x[7*xstride] = (od_coeff)t7;
-  x[8*xstride] = (od_coeff)t8;
-  x[9*xstride] = (od_coeff)t9;
-  x[10*xstride] = (od_coeff)ta;
-  x[11*xstride] = (od_coeff)tb;
-  x[12*xstride] = (od_coeff)tc;
-  x[13*xstride] = (od_coeff)td;
-  x[14*xstride] = (od_coeff)te;
-  x[15*xstride] = (od_coeff)tf;
-  x[16*xstride] = (od_coeff)tg;
-  x[17*xstride] = (od_coeff)th;
-  x[18*xstride] = (od_coeff)ti;
-  x[19*xstride] = (od_coeff)tj;
-  x[20*xstride] = (od_coeff)tk;
-  x[21*xstride] = (od_coeff)tl;
-  x[22*xstride] = (od_coeff)tm;
-  x[23*xstride] = (od_coeff)tn;
-  x[24*xstride] = (od_coeff)to;
-  x[25*xstride] = (od_coeff)tp;
-  x[26*xstride] = (od_coeff)tq;
-  x[27*xstride] = (od_coeff)tr;
-  x[28*xstride] = (od_coeff)ts;
-  x[29*xstride] = (od_coeff)tt;
-  x[30*xstride] = (od_coeff)tu;
-  x[31*xstride] = (od_coeff)tv;
-  x[32*xstride] = (od_coeff)tw;
-  x[33*xstride] = (od_coeff)tx;
-  x[34*xstride] = (od_coeff)ty;
-  x[35*xstride] = (od_coeff)tz;
-  x[36*xstride] = (od_coeff)tA;
-  x[37*xstride] = (od_coeff)tB;
-  x[38*xstride] = (od_coeff)tC;
-  x[39*xstride] = (od_coeff)tD;
-  x[40*xstride] = (od_coeff)tE;
-  x[41*xstride] = (od_coeff)tF;
-  x[41*xstride] = (od_coeff)tF;
-  x[42*xstride] = (od_coeff)tG;
-  x[43*xstride] = (od_coeff)tH;
-  x[44*xstride] = (od_coeff)tI;
-  x[45*xstride] = (od_coeff)tJ;
-  x[46*xstride] = (od_coeff)tK;
-  x[47*xstride] = (od_coeff)tL;
-  x[48*xstride] = (od_coeff)tM;
-  x[49*xstride] = (od_coeff)tN;
-  x[50*xstride] = (od_coeff)tO;
-  x[51*xstride] = (od_coeff)tP;
-  x[52*xstride] = (od_coeff)tQ;
-  x[53*xstride] = (od_coeff)tR;
-  x[54*xstride] = (od_coeff)tS;
-  x[55*xstride] = (od_coeff)tT;
-  x[56*xstride] = (od_coeff)tU;
-  x[57*xstride] = (od_coeff)tV;
-  x[58*xstride] = (od_coeff)tW;
-  x[59*xstride] = (od_coeff)tX;
-  x[60*xstride] = (od_coeff)tY;
-  x[61*xstride] = (od_coeff)tZ;
-  x[62*xstride] = (od_coeff)t_;
-  x[63*xstride] = (od_coeff)t;
-}
-#endif
-
-void daala_fdct4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[4];
-  od_coeff y[4];
-  for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct4(y, x, 1);
-  for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[4];
-  od_coeff y[4];
-  for (i = 0; i < 4; i++) y[i] = input[i];
-  od_bin_idct4(x, 1, y);
-  for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_fdst4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[4];
-  od_coeff y[4];
-  for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdst4(y, x, 1);
-  for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idst4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[4];
-  od_coeff y[4];
-  for (i = 0; i < 4; i++) y[i] = input[i];
-  od_bin_idst4(x, 1, y);
-  for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_idtx4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 4; i++) output[i] = input[i];
-}
-
-void daala_fdct8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct8(y, x, 1);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i];
-  od_bin_idct8(x, 1, y);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_fdst8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdst8(y, x, 1);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idst8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i];
-  od_bin_idst8(x, 1, y);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_idtx8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 8; i++) output[i] = input[i];
-}
-
-void daala_fdct16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[16];
-  od_coeff y[16];
-  for (i = 0; i < 16; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct16(y, x, 1);
-  for (i = 0; i < 16; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[16];
-  od_coeff y[16];
-  for (i = 0; i < 16; i++) y[i] = (od_coeff)input[i];
-  od_bin_idct16(x, 1, y);
-  for (i = 0; i < 16; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_fdst16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[16];
-  od_coeff y[16];
-  for (i = 0; i < 16; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdst16(y, x, 1);
-  for (i = 0; i < 16; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idst16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[16];
-  od_coeff y[16];
-  for (i = 0; i < 16; i++) y[i] = (od_coeff)input[i];
-  od_bin_idst16(x, 1, y);
-  for (i = 0; i < 16; i++) output[i] = (tran_low_t)x[i];
-}
-
-void daala_idtx16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 16; i++) output[i] = input[i];
-}
-
-void daala_fdct32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[32];
-  od_coeff y[32];
-  for (i = 0; i < 32; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct32(y, x, 1);
-  for (i = 0; i < 32; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[32];
-  od_coeff y[32];
-  for (i = 0; i < 32; i++) y[i] = (od_coeff)input[i];
-  od_bin_idct32(x, 1, y);
-  for (i = 0; i < 32; i++) output[i] = (tran_low_t)x[i];
-}
-
-/* Preserve the "half-right" transform behavior. */
-void daala_fdst32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[16];
-  for (i = 0; i < 16; ++i) {
-    output[16 + i] = input[i];
-  }
-  for (i = 0; i < 16; ++i) {
-    inputhalf[i] = input[i + 16];
-  }
-  daala_fdct16(inputhalf, output);
-}
-
-/* Preserve the "half-right" transform behavior. */
-void daala_idst32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[16];
-  for (i = 0; i < 16; ++i) {
-    inputhalf[i] = input[i];
-  }
-  for (i = 0; i < 16; ++i) {
-    output[i] = input[16 + i];
-  }
-  daala_idct16(inputhalf, output + 16);
-}
-
-void daala_idtx32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 32; i++) output[i] = input[i];
-}
-
-#if CONFIG_TX64X64
-void daala_fdct64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[64];
-  od_coeff y[64];
-  for (i = 0; i < 64; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct64(y, x, 1);
-  for (i = 0; i < 64; i++) output[i] = (tran_low_t)y[i];
-}
-
-void daala_idct64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[64];
-  od_coeff y[64];
-  for (i = 0; i < 64; i++) y[i] = (od_coeff)input[i];
-  od_bin_idct64(x, 1, y);
-  for (i = 0; i < 64; i++) output[i] = (tran_low_t)x[i];
-}
-
-/* Preserve the "half-right" transform behavior. */
-void daala_fdst64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[32];
-  for (i = 0; i < 32; ++i) {
-    output[32 + i] = input[i];
-  }
-  for (i = 0; i < 32; ++i) {
-    inputhalf[i] = input[i + 32];
-  }
-  daala_fdct32(inputhalf, output);
-}
-
-/* Preserve the "half-right" transform behavior. */
-void daala_idst64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[32];
-  for (i = 0; i < 32; ++i) {
-    inputhalf[i] = input[i];
-  }
-  for (i = 0; i < 32; ++i) {
-    output[i] = input[32 + i];
-  }
-  daala_idct32(inputhalf, output + 32);
-}
-
-void daala_idtx64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 64; i++) output[i] = input[i];
-}
-#endif
diff --git a/third_party/aom/av1/common/daala_tx.h b/third_party/aom/av1/common/daala_tx.h
deleted file mode 100644
index 7145b66a2..000000000
--- a/third_party/aom/av1/common/daala_tx.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef AOM_DSP_DAALA_TX_H_
-#define AOM_DSP_DAALA_TX_H_
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "av1/common/odintrin.h"
-
-void daala_fdct4(const tran_low_t *input, tran_low_t *output);
-void daala_idct4(const tran_low_t *input, tran_low_t *output);
-void daala_fdst4(const tran_low_t *input, tran_low_t *output);
-void daala_idst4(const tran_low_t *input, tran_low_t *output);
-void daala_idtx4(const tran_low_t *input, tran_low_t *output);
-void daala_fdct8(const tran_low_t *input, tran_low_t *output);
-void daala_idct8(const tran_low_t *input, tran_low_t *output);
-void daala_fdst8(const tran_low_t *input, tran_low_t *output);
-void daala_idst8(const tran_low_t *input, tran_low_t *output);
-void daala_idtx8(const tran_low_t *input, tran_low_t *output);
-void daala_fdct16(const tran_low_t *input, tran_low_t *output);
-void daala_idct16(const tran_low_t *input, tran_low_t *output);
-void daala_fdst16(const tran_low_t *input, tran_low_t *output);
-void daala_idst16(const tran_low_t *input, tran_low_t *output);
-void daala_idtx16(const tran_low_t *input, tran_low_t *output);
-void daala_fdct32(const tran_low_t *input, tran_low_t *output);
-void daala_idct32(const tran_low_t *input, tran_low_t *output);
-void daala_fdst32(const tran_low_t *input, tran_low_t *output);
-void daala_idst32(const tran_low_t *input, tran_low_t *output);
-void daala_idtx32(const tran_low_t *input, tran_low_t *output);
-#if CONFIG_TX64X64
-void daala_fdct64(const tran_low_t *input, tran_low_t *output);
-void daala_idct64(const tran_low_t *input, tran_low_t *output);
-void daala_fdst64(const tran_low_t *input, tran_low_t *output);
-void daala_idst64(const tran_low_t *input, tran_low_t *output);
-void daala_idtx64(const tran_low_t *input, tran_low_t *output);
-#endif
-
-void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride);
-void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]);
-void od_bin_fdst4(od_coeff y[4], const od_coeff *x, int xstride);
-void od_bin_idst4(od_coeff *x, int xstride, const od_coeff y[4]);
-void od_bin_fdct8(od_coeff y[8], const od_coeff *x, int xstride);
-void od_bin_idct8(od_coeff *x, int xstride, const od_coeff y[8]);
-void od_bin_fdst8(od_coeff y[8], const od_coeff *x, int xstride);
-void od_bin_idst8(od_coeff *x, int xstride, const od_coeff y[8]);
-void od_bin_fdct16(od_coeff y[16], const od_coeff *x, int xstride);
-void od_bin_idct16(od_coeff *x, int xstride, const od_coeff y[16]);
-void od_bin_fdst16(od_coeff y[16], const od_coeff *x, int xstride);
-void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]);
-void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride);
-void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]);
-#if CONFIG_TX64X64
-void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride);
-void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]);
-#endif
-#endif
diff --git a/third_party/aom/av1/common/debugmodes.c b/third_party/aom/av1/common/debugmodes.c
index 91f33d4e3..868f341b5 100644
--- a/third_party/aom/av1/common/debugmodes.c
+++ b/third_party/aom/av1/common/debugmodes.c
@@ -27,7 +27,7 @@ static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
 static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
                           size_t member_offset) {
   int mi_row, mi_col;
-  MODE_INFO **mi = cm->mi_grid_visible;
+  MB_MODE_INFO **mi = cm->mi_grid_visible;
   int rows = cm->mi_rows;
   int cols = cm->mi_cols;
   char prefix = descriptor[0];
@@ -36,8 +36,7 @@ static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(file, "%c ", prefix);
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(file, "%2d ",
-              *((char *)((char *)(&mi[0]->mbmi) + member_offset)));
+      fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset)));
       mi++;
     }
     fprintf(file, "\n");
@@ -50,7 +49,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
   int mi_row;
   int mi_col;
   FILE *mvs = fopen(file, "a");
-  MODE_INFO **mi = cm->mi_grid_visible;
+  MB_MODE_INFO **mi = cm->mi_grid_visible;
   int rows = cm->mi_rows;
   int cols = cm->mi_cols;
 
@@ -65,7 +64,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(mvs, "S ");
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%2d ", mi[0]->mbmi.skip);
+      fprintf(mvs, "%2d ", mi[0]->skip);
       mi++;
     }
     fprintf(mvs, "\n");
@@ -79,8 +78,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(mvs, "V ");
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%4d:%4d ", mi[0]->mbmi.mv[0].as_mv.row,
-              mi[0]->mbmi.mv[0].as_mv.col);
+      fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col);
       mi++;
     }
     fprintf(mvs, "\n");
@@ -90,3 +88,20 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
 
   fclose(mvs);
 }
+
+void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
+                                         const char *filename) {
+  FILE *hdrFile = fopen(filename, "w");
+  fwrite(data, size, sizeof(uint8_t), hdrFile);
+  fclose(hdrFile);
+}
+
+void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename) {
+  FILE *fcFile = fopen(filename, "w");
+  const uint16_t *fcp = (uint16_t *)fc;
+  const unsigned int n_contexts = sizeof(FRAME_CONTEXT) / sizeof(uint16_t);
+  unsigned int i;
+
+  for (i = 0; i < n_contexts; ++i) fprintf(fcFile, "%d ", *fcp++);
+  fclose(fcFile);
+}
diff --git a/third_party/aom/av1/common/entropy.c b/third_party/aom/av1/common/entropy.c
index 17a8f1356..4f95ef69b 100644
--- a/third_party/aom/av1/common/entropy.c
+++ b/third_party/aom/av1/common/entropy.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
 #include "av1/common/blockd.h"
@@ -17,2442 +18,161 @@
 #include "av1/common/entropymode.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/common/scan.h"
-#if CONFIG_Q_ADAPT_PROBS
 #include "av1/common/token_cdfs.h"
-#endif  // CONFIG_Q_ADAPT_PROBS
-#if CONFIG_LV_MAP
 #include "av1/common/txb_common.h"
-#endif
-
-// Unconstrained Node Tree
-/* clang-format off */
-const aom_tree_index av1_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
-  2, 6,                                // 0 = LOW_VAL
-  -TWO_TOKEN, 4,                       // 1 = TWO
-  -THREE_TOKEN, -FOUR_TOKEN,           // 2 = THREE
-  8, 10,                               // 3 = HIGH_LOW
-  -CATEGORY1_TOKEN, -CATEGORY2_TOKEN,  // 4 = CAT_ONE
-  12, 14,                              // 5 = CAT_THREEFOUR
-  -CATEGORY3_TOKEN, -CATEGORY4_TOKEN,  // 6 = CAT_THREE
-  -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 7 = CAT_FIVE
-};
-/* clang-format on */
-
-#if CONFIG_NEW_MULTISYMBOL
-/* Extra bits coded from LSB to MSB */
-const aom_cdf_prob av1_cat1_cdf0[CDF_SIZE(2)] = { AOM_ICDF(20352),
-                                                  AOM_ICDF(32768), 0 };
-const aom_cdf_prob *av1_cat1_cdf[] = { av1_cat1_cdf0 };
-
-const aom_cdf_prob av1_cat2_cdf0[CDF_SIZE(4)] = {
-  AOM_ICDF(11963), AOM_ICDF(21121), AOM_ICDF(27719), AOM_ICDF(32768), 0
-};
-const aom_cdf_prob *av1_cat2_cdf[] = { av1_cat2_cdf0 };
-const aom_cdf_prob av1_cat3_cdf0[CDF_SIZE(8)] = {
-  AOM_ICDF(7001),  AOM_ICDF(12802), AOM_ICDF(17911),
-  AOM_ICDF(22144), AOM_ICDF(25503), AOM_ICDF(28286),
-  AOM_ICDF(30737), AOM_ICDF(32768), 0
-};
-const aom_cdf_prob *av1_cat3_cdf[] = { av1_cat3_cdf0 };
-
-const aom_cdf_prob av1_cat4_cdf0[CDF_SIZE(16)] = { AOM_ICDF(3934),
-                                                   AOM_ICDF(7460),
-                                                   AOM_ICDF(10719),
-                                                   AOM_ICDF(13640),
-                                                   AOM_ICDF(16203),
-                                                   AOM_ICDF(18500),
-                                                   AOM_ICDF(20624),
-                                                   AOM_ICDF(22528),
-                                                   AOM_ICDF(24316),
-                                                   AOM_ICDF(25919),
-                                                   AOM_ICDF(27401),
-                                                   AOM_ICDF(28729),
-                                                   AOM_ICDF(29894),
-                                                   AOM_ICDF(30938),
-                                                   AOM_ICDF(31903),
-                                                   AOM_ICDF(32768),
-                                                   0 };
-const aom_cdf_prob *av1_cat4_cdf[] = { av1_cat4_cdf0 };
-
-const aom_cdf_prob av1_cat5_cdf0[CDF_SIZE(16)] = { AOM_ICDF(2942),
-                                                   AOM_ICDF(5794),
-                                                   AOM_ICDF(8473),
-                                                   AOM_ICDF(11069),
-                                                   AOM_ICDF(13469),
-                                                   AOM_ICDF(15795),
-                                                   AOM_ICDF(17980),
-                                                   AOM_ICDF(20097),
-                                                   AOM_ICDF(21952),
-                                                   AOM_ICDF(23750),
-                                                   AOM_ICDF(25439),
-                                                   AOM_ICDF(27076),
-                                                   AOM_ICDF(28589),
-                                                   AOM_ICDF(30056),
-                                                   AOM_ICDF(31434),
-                                                   AOM_ICDF(32768),
-                                                   0 };
-const aom_cdf_prob av1_cat5_cdf1[CDF_SIZE(2)] = { AOM_ICDF(23040),
-                                                  AOM_ICDF(32768), 0 };
-const aom_cdf_prob *av1_cat5_cdf[] = { av1_cat5_cdf0, av1_cat5_cdf1 };
-
-const aom_cdf_prob av1_cat6_cdf0[CDF_SIZE(16)] = {
-  AOM_ICDF(2382),  AOM_ICDF(4727),  AOM_ICDF(7036),  AOM_ICDF(9309),
-  AOM_ICDF(11512), AOM_ICDF(13681), AOM_ICDF(15816), AOM_ICDF(17918),
-  AOM_ICDF(19892), AOM_ICDF(21835), AOM_ICDF(23748), AOM_ICDF(25632),
-  AOM_ICDF(27458), AOM_ICDF(29255), AOM_ICDF(31024), AOM_ICDF(32768)
-};
-const aom_cdf_prob av1_cat6_cdf1[CDF_SIZE(16)] = {
-  AOM_ICDF(9314),  AOM_ICDF(15584), AOM_ICDF(19741), AOM_ICDF(22540),
-  AOM_ICDF(25391), AOM_ICDF(27310), AOM_ICDF(28583), AOM_ICDF(29440),
-  AOM_ICDF(30493), AOM_ICDF(31202), AOM_ICDF(31672), AOM_ICDF(31988),
-  AOM_ICDF(32310), AOM_ICDF(32527), AOM_ICDF(32671), AOM_ICDF(32768)
-};
-const aom_cdf_prob av1_cat6_cdf2[CDF_SIZE(16)] = {
-  AOM_ICDF(29548), AOM_ICDF(31129), AOM_ICDF(31960), AOM_ICDF(32004),
-  AOM_ICDF(32473), AOM_ICDF(32498), AOM_ICDF(32511), AOM_ICDF(32512),
-  AOM_ICDF(32745), AOM_ICDF(32757), AOM_ICDF(32763), AOM_ICDF(32764),
-  AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)
-};
-const aom_cdf_prob av1_cat6_cdf3[CDF_SIZE(16)] = {
-  AOM_ICDF(32006), AOM_ICDF(32258), AOM_ICDF(32510), AOM_ICDF(32512),
-  AOM_ICDF(32638), AOM_ICDF(32639), AOM_ICDF(32640), AOM_ICDF(32641),
-  AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-  AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)
-};
-const aom_cdf_prob av1_cat6_cdf4[CDF_SIZE(4)] = {
-  AOM_ICDF(32513), AOM_ICDF(32641), AOM_ICDF(32767), AOM_ICDF(32768)
-};
-const aom_cdf_prob *av1_cat6_cdf[] = {
-  av1_cat6_cdf0, av1_cat6_cdf1, av1_cat6_cdf2, av1_cat6_cdf3, av1_cat6_cdf4
-};
-#endif
-/* Extra bits coded from MSB to LSB */
-const aom_prob av1_cat1_prob[] = { 159 };
-const aom_prob av1_cat2_prob[] = { 165, 145 };
-const aom_prob av1_cat3_prob[] = { 173, 148, 140 };
-const aom_prob av1_cat4_prob[] = { 176, 155, 140, 135 };
-const aom_prob av1_cat5_prob[] = { 180, 157, 141, 134, 130 };
-const aom_prob av1_cat6_prob[] = {
-  255, 255, 255, 255, 254, 254, 254, 252, 249,
-  243, 230, 196, 177, 153, 140, 133, 130, 129
-};
-
-const uint16_t band_count_table[TX_SIZES_ALL][8] = {
-#if CONFIG_CHROMA_2X2
-  { 1, 2, 2, 3, 0, 0, 0 },
-#endif
-  { 1, 2, 3, 4, 3, 16 - 13, 0 },    { 1, 2, 3, 4, 11, 64 - 21, 0 },
-  { 1, 2, 3, 4, 11, 256 - 21, 0 },  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
-#if CONFIG_TX64X64
-  { 1, 2, 3, 4, 11, 4096 - 21, 0 },
-#endif  // CONFIG_TX64X64
-  { 1, 2, 3, 4, 8, 32 - 18, 0 },    { 1, 2, 3, 4, 8, 32 - 18, 0 },
-  { 1, 2, 3, 4, 11, 128 - 21, 0 },  { 1, 2, 3, 4, 11, 128 - 21, 0 },
-  { 1, 2, 3, 4, 11, 512 - 21, 0 },  { 1, 2, 3, 4, 11, 512 - 21, 0 },
-#if CONFIG_TX64X64
-  { 1, 2, 3, 4, 11, 2048 - 21, 0 }, { 1, 2, 3, 4, 11, 2048 - 21, 0 },
-#endif  // CONFIG_TX64X64
-  { 1, 2, 3, 4, 11, 64 - 21, 0 },   { 1, 2, 3, 4, 11, 64 - 21, 0 },
-  { 1, 2, 3, 4, 11, 256 - 21, 0 },  { 1, 2, 3, 4, 11, 256 - 21, 0 },
-};
-
-const uint16_t band_cum_count_table[TX_SIZES_ALL][8] = {
-#if CONFIG_CHROMA_2X2
-  { 0, 1, 3, 6, 10, 13, 16, 0 },
-#endif
-  { 0, 1, 3, 6, 10, 13, 16, 0 },   { 0, 1, 3, 6, 10, 21, 64, 0 },
-  { 0, 1, 3, 6, 10, 21, 256, 0 },  { 0, 1, 3, 6, 10, 21, 1024, 0 },
-#if CONFIG_TX64X64
-  { 0, 1, 3, 6, 10, 21, 4096, 0 },
-#endif  // CONFIG_TX64X64
-  { 0, 1, 3, 6, 10, 18, 32, 0 },   { 0, 1, 3, 6, 10, 18, 32, 0 },
-  { 0, 1, 3, 6, 10, 21, 128, 0 },  { 0, 1, 3, 6, 10, 21, 128, 0 },
-  { 0, 1, 3, 6, 10, 21, 512, 0 },  { 0, 1, 3, 6, 10, 21, 512, 0 },
-#if CONFIG_TX64X64
-  { 0, 1, 3, 6, 10, 21, 2048, 0 }, { 0, 1, 3, 6, 10, 21, 2048, 0 },
-#endif  // CONFIG_TX64X64
-  { 0, 1, 3, 6, 10, 21, 64, 0 },   { 0, 1, 3, 6, 10, 21, 64, 0 },
-  { 0, 1, 3, 6, 10, 21, 256, 0 },  { 0, 1, 3, 6, 10, 21, 256, 0 },
-};
 
-const uint8_t av1_coefband_trans_8x8plus[MAX_TX_SQUARE] = {
-  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
-  // beyond MAXBAND_INDEX+1 all values are filled as 5
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-#if CONFIG_TX64X64
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5
-#endif  // CONFIG_TX64X64
-};
-
-const uint8_t av1_coefband_trans_4x8_8x4[32] = {
-  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
-  4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-};
-
-const uint8_t av1_coefband_trans_4x4[16] = {
-  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
-};
-
-const uint8_t av1_pt_energy_class[ENTROPY_TOKENS] = { 0, 1, 2, 3, 3, 4,
-                                                      4, 5, 5, 5, 5, 5 };
-
-// Model obtained from a 2-sided zero-centered distribution derived
-// from a Pareto distribution. The cdf of the distribution is:
-// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
-//
-// For a given beta and a given probablity of the 1-node, the alpha
-// is first solved, and then the {alpha, beta} pair is used to generate
-// the probabilities for the rest of the nodes.
-
-// beta = 8
-
-// Every odd line in this table can be generated from the even lines
-// by averaging :
-// av1_pareto8_full[l][node] = (av1_pareto8_full[l-1][node] +
-//                              av1_pareto8_full[l+1][node] ) >> 1;
-// Values for tokens ONE_TOKEN through CATEGORY6_TOKEN included here.
-const aom_prob av1_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
-  { 3, 86, 128, 6, 86, 23, 88, 29 },
-  { 6, 86, 128, 11, 87, 42, 91, 52 },
-  { 9, 86, 129, 17, 88, 61, 94, 76 },
-  { 12, 86, 129, 22, 88, 77, 97, 93 },
-  { 15, 87, 129, 28, 89, 93, 100, 110 },
-  { 17, 87, 129, 33, 90, 105, 103, 123 },
-  { 20, 88, 130, 38, 91, 118, 106, 136 },
-  { 23, 88, 130, 43, 91, 128, 108, 146 },
-  { 26, 89, 131, 48, 92, 139, 111, 156 },
-  { 28, 89, 131, 53, 93, 147, 114, 163 },
-  { 31, 90, 131, 58, 94, 156, 117, 171 },
-  { 34, 90, 131, 62, 94, 163, 119, 177 },
-  { 37, 90, 132, 66, 95, 171, 122, 184 },
-  { 39, 90, 132, 70, 96, 177, 124, 189 },
-  { 42, 91, 132, 75, 97, 183, 127, 194 },
-  { 44, 91, 132, 79, 97, 188, 129, 198 },
-  { 47, 92, 133, 83, 98, 193, 132, 202 },
-  { 49, 92, 133, 86, 99, 197, 134, 205 },
-  { 52, 93, 133, 90, 100, 201, 137, 208 },
-  { 54, 93, 133, 94, 100, 204, 139, 211 },
-  { 57, 94, 134, 98, 101, 208, 142, 214 },
-  { 59, 94, 134, 101, 102, 211, 144, 216 },
-  { 62, 94, 135, 105, 103, 214, 146, 218 },
-  { 64, 94, 135, 108, 103, 216, 148, 220 },
-  { 66, 95, 135, 111, 104, 219, 151, 222 },
-  { 68, 95, 135, 114, 105, 221, 153, 223 },
-  { 71, 96, 136, 117, 106, 224, 155, 225 },
-  { 73, 96, 136, 120, 106, 225, 157, 226 },
-  { 76, 97, 136, 123, 107, 227, 159, 228 },
-  { 78, 97, 136, 126, 108, 229, 160, 229 },
-  { 80, 98, 137, 129, 109, 231, 162, 231 },
-  { 82, 98, 137, 131, 109, 232, 164, 232 },
-  { 84, 98, 138, 134, 110, 234, 166, 233 },
-  { 86, 98, 138, 137, 111, 235, 168, 234 },
-  { 89, 99, 138, 140, 112, 236, 170, 235 },
-  { 91, 99, 138, 142, 112, 237, 171, 235 },
-  { 93, 100, 139, 145, 113, 238, 173, 236 },
-  { 95, 100, 139, 147, 114, 239, 174, 237 },
-  { 97, 101, 140, 149, 115, 240, 176, 238 },
-  { 99, 101, 140, 151, 115, 241, 177, 238 },
-  { 101, 102, 140, 154, 116, 242, 179, 239 },
-  { 103, 102, 140, 156, 117, 242, 180, 239 },
-  { 105, 103, 141, 158, 118, 243, 182, 240 },
-  { 107, 103, 141, 160, 118, 243, 183, 240 },
-  { 109, 104, 141, 162, 119, 244, 185, 241 },
-  { 111, 104, 141, 164, 119, 244, 186, 241 },
-  { 113, 104, 142, 166, 120, 245, 187, 242 },
-  { 114, 104, 142, 168, 121, 245, 188, 242 },
-  { 116, 105, 143, 170, 122, 246, 190, 243 },
-  { 118, 105, 143, 171, 122, 246, 191, 243 },
-  { 120, 106, 143, 173, 123, 247, 192, 244 },
-  { 121, 106, 143, 175, 124, 247, 193, 244 },
-  { 123, 107, 144, 177, 125, 248, 195, 244 },
-  { 125, 107, 144, 178, 125, 248, 196, 244 },
-  { 127, 108, 145, 180, 126, 249, 197, 245 },
-  { 128, 108, 145, 181, 127, 249, 198, 245 },
-  { 130, 109, 145, 183, 128, 249, 199, 245 },
-  { 132, 109, 145, 184, 128, 249, 200, 245 },
-  { 134, 110, 146, 186, 129, 250, 201, 246 },
-  { 135, 110, 146, 187, 130, 250, 202, 246 },
-  { 137, 111, 147, 189, 131, 251, 203, 246 },
-  { 138, 111, 147, 190, 131, 251, 204, 246 },
-  { 140, 112, 147, 192, 132, 251, 205, 247 },
-  { 141, 112, 147, 193, 132, 251, 206, 247 },
-  { 143, 113, 148, 194, 133, 251, 207, 247 },
-  { 144, 113, 148, 195, 134, 251, 207, 247 },
-  { 146, 114, 149, 197, 135, 252, 208, 248 },
-  { 147, 114, 149, 198, 135, 252, 209, 248 },
-  { 149, 115, 149, 199, 136, 252, 210, 248 },
-  { 150, 115, 149, 200, 137, 252, 210, 248 },
-  { 152, 115, 150, 201, 138, 252, 211, 248 },
-  { 153, 115, 150, 202, 138, 252, 212, 248 },
-  { 155, 116, 151, 204, 139, 253, 213, 249 },
-  { 156, 116, 151, 205, 139, 253, 213, 249 },
-  { 158, 117, 151, 206, 140, 253, 214, 249 },
-  { 159, 117, 151, 207, 141, 253, 215, 249 },
-  { 161, 118, 152, 208, 142, 253, 216, 249 },
-  { 162, 118, 152, 209, 142, 253, 216, 249 },
-  { 163, 119, 153, 210, 143, 253, 217, 249 },
-  { 164, 119, 153, 211, 143, 253, 217, 249 },
-  { 166, 120, 153, 212, 144, 254, 218, 250 },
-  { 167, 120, 153, 212, 145, 254, 219, 250 },
-  { 168, 121, 154, 213, 146, 254, 220, 250 },
-  { 169, 121, 154, 214, 146, 254, 220, 250 },
-  { 171, 122, 155, 215, 147, 254, 221, 250 },
-  { 172, 122, 155, 216, 147, 254, 221, 250 },
-  { 173, 123, 155, 217, 148, 254, 222, 250 },
-  { 174, 123, 155, 217, 149, 254, 222, 250 },
-  { 176, 124, 156, 218, 150, 254, 223, 250 },
-  { 177, 124, 156, 219, 150, 254, 223, 250 },
-  { 178, 125, 157, 220, 151, 254, 224, 251 },
-  { 179, 125, 157, 220, 151, 254, 224, 251 },
-  { 180, 126, 157, 221, 152, 254, 225, 251 },
-  { 181, 126, 157, 221, 152, 254, 225, 251 },
-  { 183, 127, 158, 222, 153, 254, 226, 251 },
-  { 184, 127, 158, 223, 154, 254, 226, 251 },
-  { 185, 128, 159, 224, 155, 255, 227, 251 },
-  { 186, 128, 159, 224, 155, 255, 227, 251 },
-  { 187, 129, 160, 225, 156, 255, 228, 251 },
-  { 188, 130, 160, 225, 156, 255, 228, 251 },
-  { 189, 131, 160, 226, 157, 255, 228, 251 },
-  { 190, 131, 160, 226, 158, 255, 228, 251 },
-  { 191, 132, 161, 227, 159, 255, 229, 251 },
-  { 192, 132, 161, 227, 159, 255, 229, 251 },
-  { 193, 133, 162, 228, 160, 255, 230, 252 },
-  { 194, 133, 162, 229, 160, 255, 230, 252 },
-  { 195, 134, 163, 230, 161, 255, 231, 252 },
-  { 196, 134, 163, 230, 161, 255, 231, 252 },
-  { 197, 135, 163, 231, 162, 255, 231, 252 },
-  { 198, 135, 163, 231, 162, 255, 231, 252 },
-  { 199, 136, 164, 232, 163, 255, 232, 252 },
-  { 200, 136, 164, 232, 164, 255, 232, 252 },
-  { 201, 137, 165, 233, 165, 255, 233, 252 },
-  { 201, 137, 165, 233, 165, 255, 233, 252 },
-  { 202, 138, 166, 233, 166, 255, 233, 252 },
-  { 203, 138, 166, 233, 166, 255, 233, 252 },
-  { 204, 139, 166, 234, 167, 255, 234, 252 },
-  { 205, 139, 166, 234, 167, 255, 234, 252 },
-  { 206, 140, 167, 235, 168, 255, 235, 252 },
-  { 206, 140, 167, 235, 168, 255, 235, 252 },
-  { 207, 141, 168, 236, 169, 255, 235, 252 },
-  { 208, 141, 168, 236, 170, 255, 235, 252 },
-  { 209, 142, 169, 237, 171, 255, 236, 252 },
-  { 209, 143, 169, 237, 171, 255, 236, 252 },
-  { 210, 144, 169, 237, 172, 255, 236, 252 },
-  { 211, 144, 169, 237, 172, 255, 236, 252 },
-  { 212, 145, 170, 238, 173, 255, 237, 252 },
-  { 213, 145, 170, 238, 173, 255, 237, 252 },
-  { 214, 146, 171, 239, 174, 255, 237, 253 },
-  { 214, 146, 171, 239, 174, 255, 237, 253 },
-  { 215, 147, 172, 240, 175, 255, 238, 253 },
-  { 215, 147, 172, 240, 175, 255, 238, 253 },
-  { 216, 148, 173, 240, 176, 255, 238, 253 },
-  { 217, 148, 173, 240, 176, 255, 238, 253 },
-  { 218, 149, 173, 241, 177, 255, 239, 253 },
-  { 218, 149, 173, 241, 178, 255, 239, 253 },
-  { 219, 150, 174, 241, 179, 255, 239, 253 },
-  { 219, 151, 174, 241, 179, 255, 239, 253 },
-  { 220, 152, 175, 242, 180, 255, 240, 253 },
-  { 221, 152, 175, 242, 180, 255, 240, 253 },
-  { 222, 153, 176, 242, 181, 255, 240, 253 },
-  { 222, 153, 176, 242, 181, 255, 240, 253 },
-  { 223, 154, 177, 243, 182, 255, 240, 253 },
-  { 223, 154, 177, 243, 182, 255, 240, 253 },
-  { 224, 155, 178, 244, 183, 255, 241, 253 },
-  { 224, 155, 178, 244, 183, 255, 241, 253 },
-  { 225, 156, 178, 244, 184, 255, 241, 253 },
-  { 225, 157, 178, 244, 184, 255, 241, 253 },
-  { 226, 158, 179, 244, 185, 255, 242, 253 },
-  { 227, 158, 179, 244, 185, 255, 242, 253 },
-  { 228, 159, 180, 245, 186, 255, 242, 253 },
-  { 228, 159, 180, 245, 186, 255, 242, 253 },
-  { 229, 160, 181, 245, 187, 255, 242, 253 },
-  { 229, 160, 181, 245, 187, 255, 242, 253 },
-  { 230, 161, 182, 246, 188, 255, 243, 253 },
-  { 230, 162, 182, 246, 188, 255, 243, 253 },
-  { 231, 163, 183, 246, 189, 255, 243, 253 },
-  { 231, 163, 183, 246, 189, 255, 243, 253 },
-  { 232, 164, 184, 247, 190, 255, 243, 253 },
-  { 232, 164, 184, 247, 190, 255, 243, 253 },
-  { 233, 165, 185, 247, 191, 255, 244, 253 },
-  { 233, 165, 185, 247, 191, 255, 244, 253 },
-  { 234, 166, 185, 247, 192, 255, 244, 253 },
-  { 234, 167, 185, 247, 192, 255, 244, 253 },
-  { 235, 168, 186, 248, 193, 255, 244, 253 },
-  { 235, 168, 186, 248, 193, 255, 244, 253 },
-  { 236, 169, 187, 248, 194, 255, 244, 253 },
-  { 236, 169, 187, 248, 194, 255, 244, 253 },
-  { 236, 170, 188, 248, 195, 255, 245, 253 },
-  { 236, 170, 188, 248, 195, 255, 245, 253 },
-  { 237, 171, 189, 249, 196, 255, 245, 254 },
-  { 237, 172, 189, 249, 196, 255, 245, 254 },
-  { 238, 173, 190, 249, 197, 255, 245, 254 },
-  { 238, 173, 190, 249, 197, 255, 245, 254 },
-  { 239, 174, 191, 249, 198, 255, 245, 254 },
-  { 239, 174, 191, 249, 198, 255, 245, 254 },
-  { 240, 175, 192, 249, 199, 255, 246, 254 },
-  { 240, 176, 192, 249, 199, 255, 246, 254 },
-  { 240, 177, 193, 250, 200, 255, 246, 254 },
-  { 240, 177, 193, 250, 200, 255, 246, 254 },
-  { 241, 178, 194, 250, 201, 255, 246, 254 },
-  { 241, 178, 194, 250, 201, 255, 246, 254 },
-  { 242, 179, 195, 250, 202, 255, 246, 254 },
-  { 242, 180, 195, 250, 202, 255, 246, 254 },
-  { 242, 181, 196, 250, 203, 255, 247, 254 },
-  { 242, 181, 196, 250, 203, 255, 247, 254 },
-  { 243, 182, 197, 251, 204, 255, 247, 254 },
-  { 243, 183, 197, 251, 204, 255, 247, 254 },
-  { 244, 184, 198, 251, 205, 255, 247, 254 },
-  { 244, 184, 198, 251, 205, 255, 247, 254 },
-  { 244, 185, 199, 251, 206, 255, 247, 254 },
-  { 244, 185, 199, 251, 206, 255, 247, 254 },
-  { 245, 186, 200, 251, 207, 255, 247, 254 },
-  { 245, 187, 200, 251, 207, 255, 247, 254 },
-  { 246, 188, 201, 252, 207, 255, 248, 254 },
-  { 246, 188, 201, 252, 207, 255, 248, 254 },
-  { 246, 189, 202, 252, 208, 255, 248, 254 },
-  { 246, 190, 202, 252, 208, 255, 248, 254 },
-  { 247, 191, 203, 252, 209, 255, 248, 254 },
-  { 247, 191, 203, 252, 209, 255, 248, 254 },
-  { 247, 192, 204, 252, 210, 255, 248, 254 },
-  { 247, 193, 204, 252, 210, 255, 248, 254 },
-  { 248, 194, 205, 252, 211, 255, 248, 254 },
-  { 248, 194, 205, 252, 211, 255, 248, 254 },
-  { 248, 195, 206, 252, 212, 255, 249, 254 },
-  { 248, 196, 206, 252, 212, 255, 249, 254 },
-  { 249, 197, 207, 253, 213, 255, 249, 254 },
-  { 249, 197, 207, 253, 213, 255, 249, 254 },
-  { 249, 198, 208, 253, 214, 255, 249, 254 },
-  { 249, 199, 209, 253, 214, 255, 249, 254 },
-  { 250, 200, 210, 253, 215, 255, 249, 254 },
-  { 250, 200, 210, 253, 215, 255, 249, 254 },
-  { 250, 201, 211, 253, 215, 255, 249, 254 },
-  { 250, 202, 211, 253, 215, 255, 249, 254 },
-  { 250, 203, 212, 253, 216, 255, 249, 254 },
-  { 250, 203, 212, 253, 216, 255, 249, 254 },
-  { 251, 204, 213, 253, 217, 255, 250, 254 },
-  { 251, 205, 213, 253, 217, 255, 250, 254 },
-  { 251, 206, 214, 254, 218, 255, 250, 254 },
-  { 251, 206, 215, 254, 218, 255, 250, 254 },
-  { 252, 207, 216, 254, 219, 255, 250, 254 },
-  { 252, 208, 216, 254, 219, 255, 250, 254 },
-  { 252, 209, 217, 254, 220, 255, 250, 254 },
-  { 252, 210, 217, 254, 220, 255, 250, 254 },
-  { 252, 211, 218, 254, 221, 255, 250, 254 },
-  { 252, 212, 218, 254, 221, 255, 250, 254 },
-  { 253, 213, 219, 254, 222, 255, 250, 254 },
-  { 253, 213, 220, 254, 222, 255, 250, 254 },
-  { 253, 214, 221, 254, 223, 255, 250, 254 },
-  { 253, 215, 221, 254, 223, 255, 250, 254 },
-  { 253, 216, 222, 254, 224, 255, 251, 254 },
-  { 253, 217, 223, 254, 224, 255, 251, 254 },
-  { 253, 218, 224, 254, 225, 255, 251, 254 },
-  { 253, 219, 224, 254, 225, 255, 251, 254 },
-  { 254, 220, 225, 254, 225, 255, 251, 254 },
-  { 254, 221, 226, 254, 225, 255, 251, 254 },
-  { 254, 222, 227, 255, 226, 255, 251, 254 },
-  { 254, 223, 227, 255, 226, 255, 251, 254 },
-  { 254, 224, 228, 255, 227, 255, 251, 254 },
-  { 254, 225, 229, 255, 227, 255, 251, 254 },
-  { 254, 226, 230, 255, 228, 255, 251, 254 },
-  { 254, 227, 230, 255, 229, 255, 251, 254 },
-  { 255, 228, 231, 255, 230, 255, 251, 254 },
-  { 255, 229, 232, 255, 230, 255, 251, 254 },
-  { 255, 230, 233, 255, 231, 255, 252, 254 },
-  { 255, 231, 234, 255, 231, 255, 252, 254 },
-  { 255, 232, 235, 255, 232, 255, 252, 254 },
-  { 255, 233, 236, 255, 232, 255, 252, 254 },
-  { 255, 235, 237, 255, 233, 255, 252, 254 },
-  { 255, 236, 238, 255, 234, 255, 252, 254 },
-  { 255, 238, 240, 255, 235, 255, 252, 255 },
-  { 255, 239, 241, 255, 235, 255, 252, 254 },
-  { 255, 241, 243, 255, 236, 255, 252, 254 },
-  { 255, 243, 245, 255, 237, 255, 252, 254 },
-  { 255, 246, 247, 255, 239, 255, 253, 255 },
-};
-
-// Model obtained from a 2-sided zero-centered distribution derived
-// from a Pareto distribution. The cdf of the distribution is:
-// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
-//
-// For a given beta and a given probability of the 1-node, the alpha
-// is first solved, and then the {alpha, beta} pair is used to generate
-// the probabilities for the rest of the nodes.
-//
-// The full source code of the generating program is available in:
-// tools/gen_constrained_tokenset.py
-//
-// Values for tokens TWO_TOKEN through CATEGORY6_TOKEN included
-// in the table here : the ONE_TOKEN probability is
-// removed and the probabilities rescaled.
-//
-// ZERO_TOKEN and ONE_TOKEN are coded as one CDF,
-// and EOB_TOKEN is coded as flags outside this coder.
-const aom_cdf_prob av1_pareto8_tail_probs[COEFF_PROB_MODELS][TAIL_NODES] = {
-  { 128, 127, 127, 252, 497, 969, 1839, 3318, 25511 },
-  { 256, 254, 251, 496, 966, 1834, 3308, 5408, 19995 },
-  { 383, 378, 373, 732, 1408, 2605, 4470, 6646, 15773 },
-  { 511, 502, 493, 961, 1824, 3289, 5373, 7298, 12517 },
-  { 638, 625, 611, 1182, 2215, 3894, 6064, 7548, 9991 },
-  { 766, 746, 726, 1396, 2582, 4428, 6578, 7529, 8017 },
-  { 893, 866, 839, 1603, 2927, 4896, 6945, 7332, 6467 },
-  { 1020, 984, 950, 1803, 3250, 5305, 7191, 7022, 5243 },
-  { 1147, 1102, 1059, 1996, 3552, 5659, 7338, 6646, 4269 },
-  { 1274, 1218, 1166, 2183, 3835, 5963, 7403, 6234, 3492 },
-  { 1400, 1334, 1270, 2363, 4099, 6223, 7401, 5809, 2869 },
-  { 1527, 1447, 1372, 2537, 4345, 6442, 7346, 5386, 2366 },
-  { 1654, 1560, 1473, 2704, 4574, 6624, 7247, 4973, 1959 },
-  { 1780, 1672, 1571, 2866, 4787, 6771, 7114, 4579, 1628 },
-  { 1906, 1782, 1667, 3022, 4984, 6889, 6954, 4206, 1358 },
-  { 2032, 1891, 1762, 3172, 5167, 6979, 6773, 3856, 1136 },
-  { 2158, 2000, 1854, 3316, 5335, 7044, 6577, 3530, 954 },
-  { 2284, 2106, 1944, 3455, 5490, 7087, 6370, 3229, 803 },
-  { 2410, 2212, 2032, 3588, 5632, 7109, 6155, 2951, 679 },
-  { 2535, 2317, 2119, 3717, 5761, 7113, 5936, 2695, 575 },
-  { 2661, 2420, 2203, 3840, 5880, 7101, 5714, 2461, 488 },
-  { 2786, 2522, 2286, 3958, 5987, 7074, 5493, 2246, 416 },
-  { 2911, 2624, 2367, 4072, 6083, 7033, 5273, 2050, 355 },
-  { 3037, 2724, 2446, 4180, 6170, 6981, 5055, 1871, 304 },
-  { 3162, 2822, 2523, 4284, 6247, 6919, 4842, 1708, 261 },
-  { 3286, 2920, 2599, 4384, 6315, 6848, 4633, 1559, 224 },
-  { 3411, 3017, 2672, 4478, 6374, 6768, 4430, 1424, 194 },
-  { 3536, 3112, 2745, 4569, 6426, 6681, 4232, 1300, 167 },
-  { 3660, 3207, 2815, 4656, 6469, 6588, 4040, 1188, 145 },
-  { 3785, 3300, 2883, 4738, 6505, 6490, 3855, 1086, 126 },
-  { 3909, 3392, 2950, 4817, 6534, 6387, 3677, 993, 109 },
-  { 4033, 3483, 3015, 4891, 6557, 6281, 3505, 908, 95 },
-  { 4157, 3573, 3079, 4962, 6573, 6170, 3340, 831, 83 },
-  { 4281, 3662, 3141, 5029, 6584, 6058, 3181, 760, 72 },
-  { 4405, 3750, 3201, 5093, 6588, 5943, 3029, 696, 63 },
-  { 4529, 3837, 3260, 5152, 6587, 5826, 2883, 638, 56 },
-  { 4652, 3922, 3317, 5209, 6582, 5709, 2744, 584, 49 },
-  { 4775, 4007, 3373, 5262, 6572, 5590, 2610, 536, 43 },
-  { 4899, 4090, 3427, 5312, 6557, 5470, 2483, 492, 38 },
-  { 5022, 4173, 3480, 5359, 6538, 5351, 2361, 451, 33 },
-  { 5145, 4254, 3531, 5403, 6515, 5231, 2246, 414, 29 },
-  { 5268, 4334, 3581, 5443, 6489, 5112, 2135, 380, 26 },
-  { 5391, 4414, 3629, 5481, 6458, 4993, 2029, 350, 23 },
-  { 5514, 4492, 3676, 5515, 6425, 4875, 1929, 321, 21 },
-  { 5637, 4569, 3721, 5548, 6388, 4758, 1833, 296, 18 },
-  { 5759, 4645, 3766, 5577, 6349, 4642, 1742, 272, 16 },
-  { 5881, 4720, 3808, 5604, 6307, 4528, 1656, 250, 14 },
-  { 6004, 4794, 3849, 5628, 6262, 4414, 1573, 231, 13 },
-  { 6126, 4867, 3890, 5649, 6215, 4302, 1495, 213, 11 },
-  { 6248, 4939, 3928, 5669, 6166, 4192, 1420, 196, 10 },
-  { 6370, 5010, 3966, 5686, 6114, 4083, 1349, 181, 9 },
-  { 6492, 5080, 4002, 5700, 6061, 3976, 1282, 167, 8 },
-  { 6614, 5149, 4037, 5712, 6006, 3871, 1218, 154, 7 },
-  { 6735, 5217, 4070, 5723, 5950, 3767, 1157, 142, 7 },
-  { 6857, 5284, 4103, 5731, 5891, 3666, 1099, 131, 6 },
-  { 6978, 5351, 4134, 5737, 5832, 3566, 1044, 121, 5 },
-  { 7099, 5415, 4164, 5741, 5771, 3469, 992, 112, 5 },
-  { 7221, 5479, 4192, 5743, 5709, 3373, 943, 104, 4 },
-  { 7342, 5542, 4220, 5743, 5646, 3279, 896, 96, 4 },
-  { 7462, 5604, 4246, 5742, 5583, 3187, 851, 89, 4 },
-  { 7584, 5665, 4272, 5739, 5518, 3097, 808, 82, 3 },
-  { 7704, 5725, 4296, 5734, 5453, 3009, 768, 76, 3 },
-  { 7825, 5784, 4318, 5727, 5386, 2924, 730, 71, 3 },
-  { 7945, 5843, 4341, 5719, 5320, 2840, 693, 65, 2 },
-  { 8066, 5900, 4361, 5709, 5252, 2758, 659, 61, 2 },
-  { 8186, 5956, 4381, 5698, 5185, 2678, 626, 56, 2 },
-  { 8306, 6011, 4400, 5685, 5117, 2600, 595, 52, 2 },
-  { 8426, 6066, 4418, 5671, 5049, 2523, 565, 48, 2 },
-  { 8547, 6119, 4434, 5655, 4981, 2449, 537, 45, 1 },
-  { 8666, 6171, 4450, 5638, 4912, 2377, 511, 42, 1 },
-  { 8786, 6223, 4465, 5620, 4843, 2306, 485, 39, 1 },
-  { 8906, 6274, 4478, 5600, 4775, 2237, 461, 36, 1 },
-  { 9025, 6323, 4491, 5580, 4706, 2170, 438, 34, 1 },
-  { 9144, 6372, 4503, 5558, 4637, 2105, 417, 31, 1 },
-  { 9264, 6420, 4514, 5535, 4568, 2041, 396, 29, 1 },
-  { 9383, 6467, 4524, 5511, 4500, 1979, 376, 27, 1 },
-  { 9502, 6513, 4532, 5486, 4432, 1919, 358, 25, 1 },
-  { 9621, 6558, 4541, 5460, 4364, 1860, 340, 23, 1 },
-  { 9740, 6602, 4548, 5433, 4296, 1803, 323, 22, 1 },
-  { 9859, 6645, 4554, 5405, 4229, 1748, 307, 20, 1 },
-  { 9978, 6688, 4559, 5376, 4161, 1694, 292, 19, 1 },
-  { 10096, 6729, 4564, 5347, 4094, 1641, 278, 18, 1 },
-  { 10215, 6770, 4568, 5316, 4028, 1590, 264, 16, 1 },
-  { 10333, 6809, 4571, 5285, 3962, 1541, 251, 15, 1 },
-  { 10452, 6848, 4573, 5253, 3896, 1492, 239, 14, 1 },
-  { 10570, 6886, 4574, 5220, 3831, 1446, 227, 13, 1 },
-  { 10688, 6923, 4575, 5186, 3767, 1400, 216, 12, 1 },
-  { 10806, 6959, 4575, 5152, 3702, 1356, 205, 12, 1 },
-  { 10924, 6994, 4574, 5117, 3639, 1313, 195, 11, 1 },
-  { 11041, 7029, 4572, 5082, 3576, 1271, 186, 10, 1 },
-  { 11159, 7062, 4570, 5046, 3513, 1231, 177, 9, 1 },
-  { 11277, 7095, 4566, 5009, 3451, 1192, 168, 9, 1 },
-  { 11394, 7127, 4563, 4972, 3390, 1153, 160, 8, 1 },
-  { 11512, 7158, 4558, 4934, 3329, 1116, 152, 8, 1 },
-  { 11629, 7188, 4553, 4896, 3269, 1080, 145, 7, 1 },
-  { 11746, 7217, 4547, 4857, 3210, 1045, 138, 7, 1 },
-  { 11864, 7245, 4540, 4818, 3151, 1012, 131, 6, 1 },
-  { 11980, 7273, 4533, 4779, 3093, 979, 124, 6, 1 },
-  { 12097, 7300, 4525, 4739, 3035, 947, 118, 6, 1 },
-  { 12215, 7326, 4516, 4698, 2978, 916, 113, 5, 1 },
-  { 12331, 7351, 4507, 4658, 2922, 886, 107, 5, 1 },
-  { 12448, 7375, 4497, 4617, 2866, 857, 102, 5, 1 },
-  { 12564, 7398, 4487, 4576, 2812, 829, 97, 4, 1 },
-  { 12681, 7421, 4476, 4534, 2757, 802, 92, 4, 1 },
-  { 12797, 7443, 4464, 4492, 2704, 775, 88, 4, 1 },
-  { 12914, 7464, 4452, 4450, 2651, 749, 84, 3, 1 },
-  { 13030, 7484, 4439, 4408, 2599, 725, 79, 3, 1 },
-  { 13147, 7503, 4426, 4365, 2547, 700, 76, 3, 1 },
-  { 13262, 7522, 4412, 4322, 2497, 677, 72, 3, 1 },
-  { 13378, 7539, 4398, 4280, 2447, 654, 68, 3, 1 },
-  { 13494, 7556, 4383, 4237, 2397, 632, 65, 3, 1 },
-  { 13610, 7573, 4368, 4193, 2348, 611, 62, 2, 1 },
-  { 13726, 7588, 4352, 4150, 2300, 590, 59, 2, 1 },
-  { 13841, 7602, 4335, 4107, 2253, 571, 56, 2, 1 },
-  { 13957, 7616, 4318, 4063, 2207, 551, 53, 2, 1 },
-  { 14072, 7629, 4301, 4019, 2161, 532, 51, 2, 1 },
-  { 14188, 7641, 4283, 3976, 2115, 514, 48, 2, 1 },
-  { 14302, 7652, 4265, 3932, 2071, 497, 46, 2, 1 },
-  { 14418, 7663, 4246, 3888, 2027, 480, 44, 1, 1 },
-  { 14533, 7673, 4227, 3844, 1984, 463, 42, 1, 1 },
-  { 14649, 7682, 4207, 3800, 1941, 447, 40, 1, 1 },
-  { 14763, 7690, 4187, 3757, 1899, 432, 38, 1, 1 },
-  { 14878, 7698, 4166, 3713, 1858, 417, 36, 1, 1 },
-  { 14993, 7705, 4146, 3669, 1817, 402, 34, 1, 1 },
-  { 15109, 7711, 4124, 3625, 1777, 388, 32, 1, 1 },
-  { 15223, 7715, 4103, 3581, 1738, 375, 31, 1, 1 },
-  { 15337, 7720, 4081, 3538, 1699, 362, 29, 1, 1 },
-  { 15452, 7724, 4058, 3494, 1661, 349, 28, 1, 1 },
-  { 15567, 7727, 4035, 3450, 1624, 337, 26, 1, 1 },
-  { 15681, 7729, 4012, 3407, 1587, 325, 25, 1, 1 },
-  { 15795, 7730, 3989, 3364, 1551, 313, 24, 1, 1 },
-  { 15909, 7731, 3965, 3320, 1516, 302, 23, 1, 1 },
-  { 16024, 7731, 3940, 3277, 1481, 291, 22, 1, 1 },
-  { 16138, 7730, 3916, 3234, 1446, 281, 21, 1, 1 },
-  { 16252, 7728, 3891, 3191, 1413, 271, 20, 1, 1 },
-  { 16366, 7726, 3866, 3148, 1380, 261, 19, 1, 1 },
-  { 16480, 7723, 3840, 3106, 1347, 252, 18, 1, 1 },
-  { 16594, 7720, 3814, 3063, 1315, 243, 17, 1, 1 },
-  { 16708, 7715, 3788, 3021, 1284, 234, 16, 1, 1 },
-  { 16822, 7710, 3762, 2979, 1253, 225, 15, 1, 1 },
-  { 16936, 7704, 3735, 2937, 1223, 217, 14, 1, 1 },
-  { 17050, 7697, 3708, 2895, 1193, 209, 14, 1, 1 },
-  { 17162, 7690, 3681, 2854, 1164, 202, 13, 1, 1 },
-  { 17276, 7682, 3654, 2812, 1136, 194, 12, 1, 1 },
-  { 17389, 7673, 3626, 2771, 1108, 187, 12, 1, 1 },
-  { 17504, 7663, 3598, 2730, 1080, 180, 11, 1, 1 },
-  { 17617, 7653, 3570, 2689, 1053, 173, 11, 1, 1 },
-  { 17730, 7642, 3541, 2649, 1027, 167, 10, 1, 1 },
-  { 17843, 7630, 3513, 2608, 1001, 161, 10, 1, 1 },
-  { 17957, 7618, 3484, 2569, 975, 154, 9, 1, 1 },
-  { 18069, 7605, 3455, 2529, 950, 149, 9, 1, 1 },
-  { 18183, 7591, 3426, 2489, 926, 143, 8, 1, 1 },
-  { 18296, 7576, 3396, 2450, 902, 138, 8, 1, 1 },
-  { 18410, 7562, 3366, 2411, 878, 132, 7, 1, 1 },
-  { 18523, 7545, 3337, 2372, 855, 127, 7, 1, 1 },
-  { 18636, 7529, 3306, 2333, 833, 122, 7, 1, 1 },
-  { 18749, 7511, 3276, 2295, 811, 118, 6, 1, 1 },
-  { 18862, 7493, 3246, 2257, 789, 113, 6, 1, 1 },
-  { 18975, 7474, 3215, 2219, 768, 109, 6, 1, 1 },
-  { 19088, 7455, 3185, 2182, 747, 104, 5, 1, 1 },
-  { 19201, 7435, 3154, 2144, 727, 100, 5, 1, 1 },
-  { 19314, 7414, 3123, 2107, 707, 96, 5, 1, 1 },
-  { 19427, 7392, 3092, 2071, 687, 92, 5, 1, 1 },
-  { 19541, 7370, 3060, 2034, 668, 89, 4, 1, 1 },
-  { 19654, 7347, 3029, 1998, 649, 85, 4, 1, 1 },
-  { 19766, 7323, 2997, 1963, 631, 82, 4, 1, 1 },
-  { 19878, 7299, 2966, 1927, 613, 79, 4, 1, 1 },
-  { 19991, 7274, 2934, 1892, 596, 75, 4, 1, 1 },
-  { 20105, 7248, 2902, 1857, 579, 72, 3, 1, 1 },
-  { 20218, 7222, 2870, 1822, 562, 69, 3, 1, 1 },
-  { 20331, 7195, 2838, 1788, 545, 66, 3, 1, 1 },
-  { 20443, 7167, 2806, 1754, 529, 64, 3, 1, 1 },
-  { 20556, 7138, 2774, 1720, 514, 61, 3, 1, 1 },
-  { 20670, 7109, 2741, 1687, 498, 58, 3, 1, 1 },
-  { 20783, 7079, 2709, 1654, 483, 56, 2, 1, 1 },
-  { 20895, 7049, 2676, 1621, 469, 54, 2, 1, 1 },
-  { 21008, 7017, 2644, 1589, 455, 51, 2, 1, 1 },
-  { 21121, 6985, 2611, 1557, 441, 49, 2, 1, 1 },
-  { 21234, 6953, 2578, 1525, 427, 47, 2, 1, 1 },
-  { 21347, 6919, 2545, 1494, 414, 45, 2, 1, 1 },
-  { 21460, 6885, 2513, 1462, 401, 43, 2, 1, 1 },
-  { 21573, 6850, 2480, 1432, 388, 41, 2, 1, 1 },
-  { 21687, 6815, 2447, 1401, 375, 39, 2, 1, 1 },
-  { 21801, 6778, 2414, 1371, 363, 38, 1, 1, 1 },
-  { 21914, 6741, 2381, 1341, 352, 36, 1, 1, 1 },
-  { 22028, 6704, 2348, 1311, 340, 34, 1, 1, 1 },
-  { 22141, 6665, 2315, 1282, 329, 33, 1, 1, 1 },
-  { 22255, 6626, 2282, 1253, 318, 31, 1, 1, 1 },
-  { 22368, 6586, 2249, 1225, 307, 30, 1, 1, 1 },
-  { 22482, 6546, 2216, 1196, 297, 28, 1, 1, 1 },
-  { 22595, 6505, 2183, 1169, 286, 27, 1, 1, 1 },
-  { 22709, 6463, 2149, 1141, 277, 26, 1, 1, 1 },
-  { 22823, 6420, 2116, 1114, 267, 25, 1, 1, 1 },
-  { 22938, 6377, 2083, 1087, 257, 23, 1, 1, 1 },
-  { 23053, 6332, 2050, 1060, 248, 22, 1, 1, 1 },
-  { 23167, 6287, 2017, 1034, 239, 21, 1, 1, 1 },
-  { 23280, 6242, 1984, 1008, 231, 20, 1, 1, 1 },
-  { 23396, 6195, 1951, 982, 222, 19, 1, 1, 1 },
-  { 23510, 6148, 1918, 957, 214, 18, 1, 1, 1 },
-  { 23625, 6100, 1885, 932, 206, 17, 1, 1, 1 },
-  { 23741, 6051, 1852, 907, 198, 16, 1, 1, 1 },
-  { 23855, 6002, 1819, 883, 190, 16, 1, 1, 1 },
-  { 23971, 5951, 1786, 859, 183, 15, 1, 1, 1 },
-  { 24087, 5900, 1753, 835, 176, 14, 1, 1, 1 },
-  { 24203, 5848, 1720, 812, 169, 13, 1, 1, 1 },
-  { 24318, 5796, 1687, 789, 162, 13, 1, 1, 1 },
-  { 24435, 5742, 1655, 766, 155, 12, 1, 1, 1 },
-  { 24552, 5688, 1622, 743, 149, 11, 1, 1, 1 },
-  { 24669, 5632, 1589, 721, 143, 11, 1, 1, 1 },
-  { 24786, 5576, 1557, 699, 137, 10, 1, 1, 1 },
-  { 24903, 5519, 1524, 678, 131, 10, 1, 1, 1 },
-  { 25021, 5462, 1491, 657, 125, 9, 1, 1, 1 },
-  { 25139, 5403, 1459, 636, 120, 8, 1, 1, 1 },
-  { 25258, 5343, 1427, 615, 114, 8, 1, 1, 1 },
-  { 25376, 5283, 1394, 595, 109, 8, 1, 1, 1 },
-  { 25496, 5221, 1362, 575, 104, 7, 1, 1, 1 },
-  { 25614, 5159, 1330, 556, 99, 7, 1, 1, 1 },
-  { 25735, 5096, 1298, 536, 94, 6, 1, 1, 1 },
-  { 25856, 5031, 1265, 517, 90, 6, 1, 1, 1 },
-  { 25977, 4966, 1233, 499, 85, 5, 1, 1, 1 },
-  { 26098, 4899, 1202, 480, 81, 5, 1, 1, 1 },
-  { 26220, 4831, 1170, 462, 77, 5, 1, 1, 1 },
-  { 26343, 4763, 1138, 444, 73, 4, 1, 1, 1 },
-  { 26466, 4693, 1106, 427, 69, 4, 1, 1, 1 },
-  { 26589, 4622, 1075, 410, 65, 4, 1, 1, 1 },
-  { 26713, 4550, 1043, 393, 62, 4, 1, 1, 1 },
-  { 26840, 4476, 1012, 376, 58, 3, 1, 1, 1 },
-  { 26966, 4401, 980, 360, 55, 3, 1, 1, 1 },
-  { 27092, 4325, 949, 344, 52, 3, 1, 1, 1 },
-  { 27220, 4248, 918, 328, 48, 3, 1, 1, 1 },
-  { 27350, 4169, 886, 313, 45, 2, 1, 1, 1 },
-  { 27480, 4088, 855, 298, 42, 2, 1, 1, 1 },
-  { 27610, 4006, 824, 283, 40, 2, 1, 1, 1 },
-  { 27743, 3922, 793, 268, 37, 2, 1, 1, 1 },
-  { 27876, 3837, 762, 254, 34, 2, 1, 1, 1 },
-  { 28011, 3749, 731, 240, 32, 2, 1, 1, 1 },
-  { 28147, 3659, 701, 227, 30, 1, 1, 1, 1 },
-  { 28286, 3568, 670, 213, 27, 1, 1, 1, 1 },
-  { 28426, 3474, 639, 200, 25, 1, 1, 1, 1 },
-  { 28569, 3377, 608, 187, 23, 1, 1, 1, 1 },
-  { 28714, 3278, 577, 174, 21, 1, 1, 1, 1 },
-  { 28860, 3176, 547, 162, 19, 1, 1, 1, 1 },
-  { 29010, 3071, 516, 150, 17, 1, 1, 1, 1 },
-  { 29163, 2962, 485, 138, 16, 1, 1, 1, 1 },
-  { 29320, 2849, 454, 127, 14, 1, 1, 1, 1 },
-  { 29483, 2731, 423, 115, 12, 1, 1, 1, 1 },
-  { 29650, 2608, 391, 104, 11, 1, 1, 1, 1 },
-  { 29823, 2479, 360, 93, 9, 1, 1, 1, 1 },
-  { 30002, 2343, 328, 83, 8, 1, 1, 1, 1 },
-  { 30192, 2198, 295, 72, 7, 1, 1, 1, 1 },
-  { 30393, 2041, 262, 62, 6, 1, 1, 1, 1 },
-  { 30612, 1869, 227, 52, 4, 1, 1, 1, 1 },
-  { 30853, 1676, 191, 41, 3, 1, 1, 1, 1 },
-  { 31131, 1448, 152, 31, 2, 1, 1, 1, 1 },
-  { 31486, 1150, 107, 20, 1, 1, 1, 1, 1 },
-};
-
-#if !CONFIG_Q_ADAPT_PROBS
-static const coeff_cdf_model default_coef_head_cdf_4x4[PLANE_TYPES] = {
-  {     // Y plane
-    {   // Intra
-      { // Band 0
-        { AOM_ICDF(25024), AOM_ICDF(25863), AOM_ICDF(27361), AOM_ICDF(29796),
-          AOM_ICDF(30374), AOM_ICDF(32768) },
-        { AOM_ICDF(10816), AOM_ICDF(14127), AOM_ICDF(17116), AOM_ICDF(23516),
-          AOM_ICDF(24999), AOM_ICDF(32768) },
-        { AOM_ICDF(1088), AOM_ICDF(6358), AOM_ICDF(8428), AOM_ICDF(16648),
-          AOM_ICDF(18276), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(14529), AOM_ICDF(18769), AOM_ICDF(29100), AOM_ICDF(29634),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12993), AOM_ICDF(17117), AOM_ICDF(28404), AOM_ICDF(28988),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11201), AOM_ICDF(14084), AOM_ICDF(25818), AOM_ICDF(26504),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9793), AOM_ICDF(11267), AOM_ICDF(21775), AOM_ICDF(22451),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7105), AOM_ICDF(7562), AOM_ICDF(15777), AOM_ICDF(16225),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3905), AOM_ICDF(3966), AOM_ICDF(8359), AOM_ICDF(8526),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(20033), AOM_ICDF(23643), AOM_ICDF(31102), AOM_ICDF(31374),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16321), AOM_ICDF(20350), AOM_ICDF(30167), AOM_ICDF(30546),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12993), AOM_ICDF(15512), AOM_ICDF(26859), AOM_ICDF(27396),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10305), AOM_ICDF(11659), AOM_ICDF(21669), AOM_ICDF(22330),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7361), AOM_ICDF(7819), AOM_ICDF(15450), AOM_ICDF(15940),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3521), AOM_ICDF(3580), AOM_ICDF(7805), AOM_ICDF(7976),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(21057), AOM_ICDF(25460), AOM_ICDF(31740), AOM_ICDF(31952),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16449), AOM_ICDF(21173), AOM_ICDF(30761), AOM_ICDF(31092),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11841), AOM_ICDF(14615), AOM_ICDF(26188), AOM_ICDF(26824),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7745), AOM_ICDF(8991), AOM_ICDF(18937), AOM_ICDF(19707),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4417), AOM_ICDF(4706), AOM_ICDF(10342), AOM_ICDF(10890),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7617), AOM_ICDF(8392), AOM_ICDF(17295), AOM_ICDF(17915),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(20417), AOM_ICDF(26452), AOM_ICDF(32166), AOM_ICDF(32321),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15809), AOM_ICDF(21634), AOM_ICDF(30947), AOM_ICDF(31298),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10049), AOM_ICDF(12176), AOM_ICDF(23495), AOM_ICDF(24229),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5953), AOM_ICDF(6731), AOM_ICDF(16166), AOM_ICDF(16798),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6081), AOM_ICDF(6188), AOM_ICDF(8114), AOM_ICDF(8764),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2113), AOM_ICDF(2291), AOM_ICDF(4448), AOM_ICDF(5527),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(9153), AOM_ICDF(25905), AOM_ICDF(31431), AOM_ICDF(31934),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9025), AOM_ICDF(23345), AOM_ICDF(30033), AOM_ICDF(30965),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5953), AOM_ICDF(13835), AOM_ICDF(22032), AOM_ICDF(24664),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6337), AOM_ICDF(11435), AOM_ICDF(18366), AOM_ICDF(21418),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3137), AOM_ICDF(4871), AOM_ICDF(8519), AOM_ICDF(12426),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(1857), AOM_ICDF(2727), AOM_ICDF(5540), AOM_ICDF(8757),
-          AOM_ICDF(32768) } } },
-    {   // Intra
-      { // Band 0
-        { AOM_ICDF(24512), AOM_ICDF(26673), AOM_ICDF(28962), AOM_ICDF(31929),
-          AOM_ICDF(32126), AOM_ICDF(32768) },
-        { AOM_ICDF(15936), AOM_ICDF(21711), AOM_ICDF(25569), AOM_ICDF(30899),
-          AOM_ICDF(31305), AOM_ICDF(32768) },
-        { AOM_ICDF(3264), AOM_ICDF(14756), AOM_ICDF(20107), AOM_ICDF(29407),
-          AOM_ICDF(30032), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(21313), AOM_ICDF(26020), AOM_ICDF(32523), AOM_ICDF(32575),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(18369), AOM_ICDF(24215), AOM_ICDF(32291), AOM_ICDF(32391),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15297), AOM_ICDF(19637), AOM_ICDF(30414), AOM_ICDF(30752),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11713), AOM_ICDF(14040), AOM_ICDF(25408), AOM_ICDF(26033),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9537), AOM_ICDF(10173), AOM_ICDF(18839), AOM_ICDF(19315),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9025), AOM_ICDF(9093), AOM_ICDF(13987), AOM_ICDF(14115),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(22721), AOM_ICDF(27599), AOM_ICDF(32592), AOM_ICDF(32636),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(19009), AOM_ICDF(24676), AOM_ICDF(32258), AOM_ICDF(32367),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12737), AOM_ICDF(16769), AOM_ICDF(28739), AOM_ICDF(29247),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8769), AOM_ICDF(10956), AOM_ICDF(21941), AOM_ICDF(22840),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6721), AOM_ICDF(7678), AOM_ICDF(15319), AOM_ICDF(16290),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4417), AOM_ICDF(4430), AOM_ICDF(4583), AOM_ICDF(5712),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(22849), AOM_ICDF(28333), AOM_ICDF(32633), AOM_ICDF(32671),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(18497), AOM_ICDF(24619), AOM_ICDF(32184), AOM_ICDF(32315),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11841), AOM_ICDF(14640), AOM_ICDF(27251), AOM_ICDF(27752),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8385), AOM_ICDF(10154), AOM_ICDF(18339), AOM_ICDF(19621),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(6977), AOM_ICDF(13787), AOM_ICDF(15289),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(20417), AOM_ICDF(28167), AOM_ICDF(32552), AOM_ICDF(32621),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16833), AOM_ICDF(23968), AOM_ICDF(31991), AOM_ICDF(32174),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10433), AOM_ICDF(13387), AOM_ICDF(26356), AOM_ICDF(26951),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5057), AOM_ICDF(6823), AOM_ICDF(18967), AOM_ICDF(19843),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(6479), AOM_ICDF(11672), AOM_ICDF(13052),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2241), AOM_ICDF(2265), AOM_ICDF(6355), AOM_ICDF(6432),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(12097), AOM_ICDF(28717), AOM_ICDF(32406), AOM_ICDF(32555),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10433), AOM_ICDF(26113), AOM_ICDF(31504), AOM_ICDF(31975),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5825), AOM_ICDF(14284), AOM_ICDF(21349), AOM_ICDF(24461),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4545), AOM_ICDF(8454), AOM_ICDF(12648), AOM_ICDF(17501),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(7173), AOM_ICDF(15272), AOM_ICDF(19322),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377),
-          AOM_ICDF(32768) } } } },
-  {     // UV plane
-    {   // Inter
-      { // Band 0
-        { AOM_ICDF(27456), AOM_ICDF(28244), AOM_ICDF(31289), AOM_ICDF(32358),
-          AOM_ICDF(32534), AOM_ICDF(32768) },
-        { AOM_ICDF(16960), AOM_ICDF(21207), AOM_ICDF(26511), AOM_ICDF(30539),
-          AOM_ICDF(31190), AOM_ICDF(32768) },
-        { AOM_ICDF(5440), AOM_ICDF(13412), AOM_ICDF(18469), AOM_ICDF(26423),
-          AOM_ICDF(27669), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(17857), AOM_ICDF(26327), AOM_ICDF(31983), AOM_ICDF(32219),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16065), AOM_ICDF(24198), AOM_ICDF(31431), AOM_ICDF(31785),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12865), AOM_ICDF(18011), AOM_ICDF(28454), AOM_ICDF(29166),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9665), AOM_ICDF(12501), AOM_ICDF(24331), AOM_ICDF(25147),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2753), AOM_ICDF(3121), AOM_ICDF(12661), AOM_ICDF(13034),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4033), AOM_ICDF(4140), AOM_ICDF(11834), AOM_ICDF(11977),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(21185), AOM_ICDF(28338), AOM_ICDF(32249), AOM_ICDF(32417),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(18497), AOM_ICDF(25227), AOM_ICDF(31905), AOM_ICDF(32122),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12097), AOM_ICDF(16516), AOM_ICDF(28610), AOM_ICDF(29166),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9281), AOM_ICDF(11157), AOM_ICDF(21438), AOM_ICDF(22312),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(6566), AOM_ICDF(15585), AOM_ICDF(16340),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9409), AOM_ICDF(9659), AOM_ICDF(11827), AOM_ICDF(12911),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(22337), AOM_ICDF(29459), AOM_ICDF(32382), AOM_ICDF(32519),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16961), AOM_ICDF(25262), AOM_ICDF(31874), AOM_ICDF(32123),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12353), AOM_ICDF(17748), AOM_ICDF(29300), AOM_ICDF(29852),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9025), AOM_ICDF(11528), AOM_ICDF(24468), AOM_ICDF(25141),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6209), AOM_ICDF(6565), AOM_ICDF(15806), AOM_ICDF(16121),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2497), AOM_ICDF(2524), AOM_ICDF(7050), AOM_ICDF(7125),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(20417), AOM_ICDF(29779), AOM_ICDF(32552), AOM_ICDF(32636),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15553), AOM_ICDF(26420), AOM_ICDF(32063), AOM_ICDF(32295),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9665), AOM_ICDF(17946), AOM_ICDF(29385), AOM_ICDF(30096),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5569), AOM_ICDF(10207), AOM_ICDF(22410), AOM_ICDF(23836),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2369), AOM_ICDF(2395), AOM_ICDF(6822), AOM_ICDF(6898),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(10177), AOM_ICDF(30567), AOM_ICDF(32725), AOM_ICDF(32745),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9537), AOM_ICDF(28243), AOM_ICDF(32179), AOM_ICDF(32423),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13377), AOM_ICDF(23187), AOM_ICDF(29322), AOM_ICDF(30382),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13121), AOM_ICDF(21346), AOM_ICDF(29507), AOM_ICDF(30326),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4417), AOM_ICDF(4939), AOM_ICDF(15104), AOM_ICDF(15535),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2625), AOM_ICDF(2680), AOM_ICDF(8218), AOM_ICDF(8338),
-          AOM_ICDF(32768) } } },
-    {   // Inter
-      { // Band 0
-        { AOM_ICDF(29376), AOM_ICDF(30098), AOM_ICDF(32421), AOM_ICDF(32766),
-          AOM_ICDF(32767), AOM_ICDF(32768) },
-        { AOM_ICDF(18368), AOM_ICDF(22916), AOM_ICDF(30116), AOM_ICDF(32541),
-          AOM_ICDF(32650), AOM_ICDF(32768) },
-        { AOM_ICDF(5952), AOM_ICDF(16505), AOM_ICDF(25955), AOM_ICDF(32163),
-          AOM_ICDF(32365), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(19649), AOM_ICDF(30160), AOM_ICDF(32743), AOM_ICDF(32753),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(18881), AOM_ICDF(28724), AOM_ICDF(32688), AOM_ICDF(32717),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16833), AOM_ICDF(23053), AOM_ICDF(31244), AOM_ICDF(31573),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(14657), AOM_ICDF(17714), AOM_ICDF(26083), AOM_ICDF(26978),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(14657), AOM_ICDF(16618), AOM_ICDF(24597), AOM_ICDF(25403),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4289), AOM_ICDF(4326), AOM_ICDF(10686), AOM_ICDF(10751),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(21953), AOM_ICDF(30956), AOM_ICDF(32748), AOM_ICDF(32757),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(20929), AOM_ICDF(29412), AOM_ICDF(32700), AOM_ICDF(32725),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13377), AOM_ICDF(21495), AOM_ICDF(31216), AOM_ICDF(31569),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9153), AOM_ICDF(15097), AOM_ICDF(28295), AOM_ICDF(28990),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5313), AOM_ICDF(5363), AOM_ICDF(13839), AOM_ICDF(13894),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(20289), AOM_ICDF(31164), AOM_ICDF(32745), AOM_ICDF(32755),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17601), AOM_ICDF(29635), AOM_ICDF(32739), AOM_ICDF(32751),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(18241), AOM_ICDF(24284), AOM_ICDF(32116), AOM_ICDF(32258),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(32705), AOM_ICDF(32706), AOM_ICDF(32739), AOM_ICDF(32740),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(5750), AOM_ICDF(14739), AOM_ICDF(14792),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2881), AOM_ICDF(2913), AOM_ICDF(8427), AOM_ICDF(8498),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(19009), AOM_ICDF(31481), AOM_ICDF(32742), AOM_ICDF(32754),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15809), AOM_ICDF(30521), AOM_ICDF(32736), AOM_ICDF(32750),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16449), AOM_ICDF(32705), AOM_ICDF(32737), AOM_ICDF(32753),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7873), AOM_ICDF(8039), AOM_ICDF(19981), AOM_ICDF(20068),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5313), AOM_ICDF(5366), AOM_ICDF(14376), AOM_ICDF(14430),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(11841), AOM_ICDF(32116), AOM_ICDF(32728), AOM_ICDF(32748),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12353), AOM_ICDF(32132), AOM_ICDF(32729), AOM_ICDF(32748),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7489), AOM_ICDF(12435), AOM_ICDF(25708), AOM_ICDF(26666),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(7486), AOM_ICDF(20238), AOM_ICDF(21009),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4929), AOM_ICDF(5579), AOM_ICDF(16402), AOM_ICDF(16866),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533),
-          AOM_ICDF(32768) } } } }
-};
-static const coeff_cdf_model default_coef_head_cdf_8x8[PLANE_TYPES] = {
-  {     // Y plane
-    {   // Intra
-      { // Band 0
-        { AOM_ICDF(16064), AOM_ICDF(18127), AOM_ICDF(22153), AOM_ICDF(27289),
-          AOM_ICDF(28507), AOM_ICDF(32768) },
-        { AOM_ICDF(6720), AOM_ICDF(10545), AOM_ICDF(13491), AOM_ICDF(20948),
-          AOM_ICDF(22631), AOM_ICDF(32768) },
-        { AOM_ICDF(832), AOM_ICDF(5270), AOM_ICDF(5918), AOM_ICDF(12645),
-          AOM_ICDF(13532), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(14017), AOM_ICDF(16139), AOM_ICDF(26799), AOM_ICDF(27295),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12737), AOM_ICDF(15136), AOM_ICDF(26235), AOM_ICDF(26816),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10817), AOM_ICDF(12445), AOM_ICDF(23637), AOM_ICDF(24217),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8897), AOM_ICDF(9702), AOM_ICDF(20040), AOM_ICDF(20500),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5953), AOM_ICDF(6156), AOM_ICDF(13966), AOM_ICDF(14205),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2497), AOM_ICDF(2519), AOM_ICDF(6222), AOM_ICDF(6300),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(19777), AOM_ICDF(21403), AOM_ICDF(30054), AOM_ICDF(30269),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16193), AOM_ICDF(17913), AOM_ICDF(28593), AOM_ICDF(28883),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12609), AOM_ICDF(13572), AOM_ICDF(25248), AOM_ICDF(25534),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9665), AOM_ICDF(10118), AOM_ICDF(20721), AOM_ICDF(20968),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6849), AOM_ICDF(7028), AOM_ICDF(15202), AOM_ICDF(15391),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3009), AOM_ICDF(3036), AOM_ICDF(7601), AOM_ICDF(7675),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(22593), AOM_ICDF(23915), AOM_ICDF(31159), AOM_ICDF(31283),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17345), AOM_ICDF(18690), AOM_ICDF(29425), AOM_ICDF(29611),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11969), AOM_ICDF(12540), AOM_ICDF(24685), AOM_ICDF(24867),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8129), AOM_ICDF(8355), AOM_ICDF(18668), AOM_ICDF(18819),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4673), AOM_ICDF(4714), AOM_ICDF(11752), AOM_ICDF(11814),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(1857), AOM_ICDF(1876), AOM_ICDF(5057), AOM_ICDF(5138),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(24513), AOM_ICDF(25718), AOM_ICDF(31947), AOM_ICDF(32014),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(18881), AOM_ICDF(20029), AOM_ICDF(30409), AOM_ICDF(30527),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12481), AOM_ICDF(12953), AOM_ICDF(25201), AOM_ICDF(25341),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8385), AOM_ICDF(8528), AOM_ICDF(18815), AOM_ICDF(18910),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4289), AOM_ICDF(4327), AOM_ICDF(10797), AOM_ICDF(10861),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(1857), AOM_ICDF(1872), AOM_ICDF(4332), AOM_ICDF(4415),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(26049), AOM_ICDF(27752), AOM_ICDF(32415), AOM_ICDF(32462),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(20417), AOM_ICDF(22100), AOM_ICDF(31056), AOM_ICDF(31192),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12481), AOM_ICDF(13075), AOM_ICDF(24646), AOM_ICDF(24844),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7489), AOM_ICDF(7696), AOM_ICDF(17117), AOM_ICDF(17285),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3777), AOM_ICDF(3814), AOM_ICDF(10062), AOM_ICDF(10129),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(1473), AOM_ICDF(1486), AOM_ICDF(3735), AOM_ICDF(3820),
-          AOM_ICDF(32768) } } },
-    {   // Intra
-      { // Band 0
-        { AOM_ICDF(25920), AOM_ICDF(27743), AOM_ICDF(29455), AOM_ICDF(32147),
-          AOM_ICDF(32280), AOM_ICDF(32768) },
-        { AOM_ICDF(13888), AOM_ICDF(19845), AOM_ICDF(23350), AOM_ICDF(30219),
-          AOM_ICDF(30660), AOM_ICDF(32768) },
-        { AOM_ICDF(2368), AOM_ICDF(12781), AOM_ICDF(16196), AOM_ICDF(27232),
-          AOM_ICDF(27894), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(21697), AOM_ICDF(24758), AOM_ICDF(32358), AOM_ICDF(32417),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(20289), AOM_ICDF(23960), AOM_ICDF(32111), AOM_ICDF(32213),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17345), AOM_ICDF(19966), AOM_ICDF(30630), AOM_ICDF(30841),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(14529), AOM_ICDF(16070), AOM_ICDF(27461), AOM_ICDF(27777),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9793), AOM_ICDF(10613), AOM_ICDF(21146), AOM_ICDF(21566),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6977), AOM_ICDF(7162), AOM_ICDF(15591), AOM_ICDF(15776),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(23617), AOM_ICDF(26783), AOM_ICDF(32572), AOM_ICDF(32607),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(20801), AOM_ICDF(24292), AOM_ICDF(32185), AOM_ICDF(32275),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15169), AOM_ICDF(17905), AOM_ICDF(29916), AOM_ICDF(30181),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10945), AOM_ICDF(12972), AOM_ICDF(25565), AOM_ICDF(26064),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6849), AOM_ICDF(8334), AOM_ICDF(18543), AOM_ICDF(19446),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3649), AOM_ICDF(4346), AOM_ICDF(12351), AOM_ICDF(13169),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(25281), AOM_ICDF(28440), AOM_ICDF(32667), AOM_ICDF(32689),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(22081), AOM_ICDF(25694), AOM_ICDF(32414), AOM_ICDF(32476),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15297), AOM_ICDF(18341), AOM_ICDF(30141), AOM_ICDF(30410),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10305), AOM_ICDF(12381), AOM_ICDF(24477), AOM_ICDF(25084),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(6673), AOM_ICDF(16325), AOM_ICDF(17080),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2369), AOM_ICDF(2393), AOM_ICDF(6466), AOM_ICDF(6543),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(25921), AOM_ICDF(29445), AOM_ICDF(32729), AOM_ICDF(32739),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(22465), AOM_ICDF(26834), AOM_ICDF(32588), AOM_ICDF(32627),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16449), AOM_ICDF(20062), AOM_ICDF(31016), AOM_ICDF(31233),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11073), AOM_ICDF(13165), AOM_ICDF(25353), AOM_ICDF(25896),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11713), AOM_ICDF(13837), AOM_ICDF(20144), AOM_ICDF(21734),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2241), AOM_ICDF(2265), AOM_ICDF(6355), AOM_ICDF(6432),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(26177), AOM_ICDF(29403), AOM_ICDF(32705), AOM_ICDF(32721),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(22337), AOM_ICDF(26344), AOM_ICDF(32545), AOM_ICDF(32589),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(19009), AOM_ICDF(21527), AOM_ICDF(31775), AOM_ICDF(31873),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11585), AOM_ICDF(12685), AOM_ICDF(22632), AOM_ICDF(23137),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8257), AOM_ICDF(8305), AOM_ICDF(16444), AOM_ICDF(16492),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377),
-          AOM_ICDF(32768) } } } },
-  {     // UV plane
-    {   // Inter
-      { // Band 0
-        { AOM_ICDF(27200), AOM_ICDF(27981), AOM_ICDF(31389), AOM_ICDF(32444),
-          AOM_ICDF(32592), AOM_ICDF(32768) },
-        { AOM_ICDF(14528), AOM_ICDF(19068), AOM_ICDF(24887), AOM_ICDF(29901),
-          AOM_ICDF(30688), AOM_ICDF(32768) },
-        { AOM_ICDF(3776), AOM_ICDF(11778), AOM_ICDF(14700), AOM_ICDF(23745),
-          AOM_ICDF(24854), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(20289), AOM_ICDF(25202), AOM_ICDF(31672), AOM_ICDF(31909),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(18369), AOM_ICDF(23493), AOM_ICDF(31166), AOM_ICDF(31487),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15425), AOM_ICDF(18619), AOM_ICDF(28941), AOM_ICDF(29393),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10945), AOM_ICDF(12535), AOM_ICDF(24287), AOM_ICDF(24792),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6465), AOM_ICDF(6810), AOM_ICDF(15764), AOM_ICDF(16080),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2113), AOM_ICDF(2137), AOM_ICDF(6125), AOM_ICDF(6203),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(23745), AOM_ICDF(27041), AOM_ICDF(31976), AOM_ICDF(32135),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(19521), AOM_ICDF(22766), AOM_ICDF(31139), AOM_ICDF(31367),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(14273), AOM_ICDF(15834), AOM_ICDF(27820), AOM_ICDF(28105),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9537), AOM_ICDF(10445), AOM_ICDF(22106), AOM_ICDF(22491),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7233), AOM_ICDF(7386), AOM_ICDF(15961), AOM_ICDF(16109),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2369), AOM_ICDF(2401), AOM_ICDF(7891), AOM_ICDF(7964),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(26305), AOM_ICDF(28703), AOM_ICDF(32352), AOM_ICDF(32435),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(20673), AOM_ICDF(23490), AOM_ICDF(31517), AOM_ICDF(31680),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(14017), AOM_ICDF(15251), AOM_ICDF(27458), AOM_ICDF(27702),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10945), AOM_ICDF(11374), AOM_ICDF(22496), AOM_ICDF(22687),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9153), AOM_ICDF(9435), AOM_ICDF(22299), AOM_ICDF(22411),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(269), AOM_ICDF(13236), AOM_ICDF(13293),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(27713), AOM_ICDF(29770), AOM_ICDF(32522), AOM_ICDF(32575),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21569), AOM_ICDF(24342), AOM_ICDF(31785), AOM_ICDF(31919),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15297), AOM_ICDF(16497), AOM_ICDF(28367), AOM_ICDF(28569),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17601), AOM_ICDF(17828), AOM_ICDF(24444), AOM_ICDF(24582),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6977), AOM_ICDF(7035), AOM_ICDF(16901), AOM_ICDF(16947),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(384), AOM_ICDF(32706), AOM_ICDF(32707),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(28737), AOM_ICDF(30879), AOM_ICDF(32667), AOM_ICDF(32695),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(22593), AOM_ICDF(26241), AOM_ICDF(32073), AOM_ICDF(32207),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16577), AOM_ICDF(19148), AOM_ICDF(28436), AOM_ICDF(28906),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12993), AOM_ICDF(14005), AOM_ICDF(23151), AOM_ICDF(23630),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7617), AOM_ICDF(9188), AOM_ICDF(22797), AOM_ICDF(23313),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2625), AOM_ICDF(2680), AOM_ICDF(8218), AOM_ICDF(8338),
-          AOM_ICDF(32768) } } },
-    {   // Inter
-      { // Band 0
-        { AOM_ICDF(28864), AOM_ICDF(29988), AOM_ICDF(32423), AOM_ICDF(32766),
-          AOM_ICDF(32767), AOM_ICDF(32768) },
-        { AOM_ICDF(18496), AOM_ICDF(24572), AOM_ICDF(30167), AOM_ICDF(32687),
-          AOM_ICDF(32737), AOM_ICDF(32768) },
-        { AOM_ICDF(5440), AOM_ICDF(19618), AOM_ICDF(25332), AOM_ICDF(32393),
-          AOM_ICDF(32491), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(23745), AOM_ICDF(29427), AOM_ICDF(32751), AOM_ICDF(32757),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(23745), AOM_ICDF(28704), AOM_ICDF(32716), AOM_ICDF(32731),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(23105), AOM_ICDF(27943), AOM_ICDF(32524), AOM_ICDF(32587),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21057), AOM_ICDF(24773), AOM_ICDF(29589), AOM_ICDF(30282),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12609), AOM_ICDF(14823), AOM_ICDF(23831), AOM_ICDF(24713),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(25025), AOM_ICDF(30203), AOM_ICDF(32754), AOM_ICDF(32759),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(23617), AOM_ICDF(28361), AOM_ICDF(32715), AOM_ICDF(32729),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17985), AOM_ICDF(21562), AOM_ICDF(31354), AOM_ICDF(31543),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12353), AOM_ICDF(18915), AOM_ICDF(28742), AOM_ICDF(29548),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(289), AOM_ICDF(16545), AOM_ICDF(16593),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(26433), AOM_ICDF(30892), AOM_ICDF(32757), AOM_ICDF(32761),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(24513), AOM_ICDF(29274), AOM_ICDF(32721), AOM_ICDF(32735),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(20161), AOM_ICDF(24040), AOM_ICDF(32055), AOM_ICDF(32171),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21953), AOM_ICDF(24678), AOM_ICDF(27382), AOM_ICDF(28734),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(5750), AOM_ICDF(14739), AOM_ICDF(14792),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2881), AOM_ICDF(2913), AOM_ICDF(8427), AOM_ICDF(8498),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(27457), AOM_ICDF(31485), AOM_ICDF(32759), AOM_ICDF(32763),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(24129), AOM_ICDF(29502), AOM_ICDF(32752), AOM_ICDF(32757),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(19009), AOM_ICDF(25452), AOM_ICDF(32473), AOM_ICDF(32544),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(32705), AOM_ICDF(32706), AOM_ICDF(32737), AOM_ICDF(32738),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5313), AOM_ICDF(5366), AOM_ICDF(14376), AOM_ICDF(14430),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(27841), AOM_ICDF(32288), AOM_ICDF(32759), AOM_ICDF(32764),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(19137), AOM_ICDF(30271), AOM_ICDF(32742), AOM_ICDF(32753),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(18625), AOM_ICDF(27739), AOM_ICDF(29979), AOM_ICDF(31099),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(7486), AOM_ICDF(20238), AOM_ICDF(21009),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4929), AOM_ICDF(5579), AOM_ICDF(16402), AOM_ICDF(16866),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533),
-          AOM_ICDF(32768) } } } }
-};
-static const coeff_cdf_model default_coef_head_cdf_16x16[PLANE_TYPES] = {
-  {     // Y plane
-    {   // Intra
-      { // Band 0
-        { AOM_ICDF(960), AOM_ICDF(4882), AOM_ICDF(9467), AOM_ICDF(17710),
-          AOM_ICDF(20412), AOM_ICDF(32768) },
-        { AOM_ICDF(704), AOM_ICDF(4657), AOM_ICDF(6561), AOM_ICDF(14507),
-          AOM_ICDF(16279), AOM_ICDF(32768) },
-        { AOM_ICDF(192), AOM_ICDF(3443), AOM_ICDF(3759), AOM_ICDF(9011),
-          AOM_ICDF(9685), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(12481), AOM_ICDF(13958), AOM_ICDF(24487), AOM_ICDF(24997),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11457), AOM_ICDF(13075), AOM_ICDF(23820), AOM_ICDF(24406),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9793), AOM_ICDF(11127), AOM_ICDF(21775), AOM_ICDF(22387),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7745), AOM_ICDF(8457), AOM_ICDF(18155), AOM_ICDF(18655),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5441), AOM_ICDF(5668), AOM_ICDF(13180), AOM_ICDF(13467),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2497), AOM_ICDF(2520), AOM_ICDF(6340), AOM_ICDF(6417),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(19521), AOM_ICDF(20572), AOM_ICDF(28965), AOM_ICDF(29177),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15425), AOM_ICDF(16741), AOM_ICDF(27247), AOM_ICDF(27554),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11969), AOM_ICDF(12690), AOM_ICDF(23872), AOM_ICDF(24141),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9281), AOM_ICDF(9678), AOM_ICDF(19970), AOM_ICDF(20207),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6081), AOM_ICDF(6266), AOM_ICDF(14682), AOM_ICDF(14876),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2753), AOM_ICDF(2779), AOM_ICDF(7150), AOM_ICDF(7225),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(22337), AOM_ICDF(23293), AOM_ICDF(30630), AOM_ICDF(30753),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16321), AOM_ICDF(17427), AOM_ICDF(28368), AOM_ICDF(28570),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11457), AOM_ICDF(11907), AOM_ICDF(23570), AOM_ICDF(23741),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7233), AOM_ICDF(7331), AOM_ICDF(17258), AOM_ICDF(17334),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4033), AOM_ICDF(4070), AOM_ICDF(10375), AOM_ICDF(10441),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(1601), AOM_ICDF(1619), AOM_ICDF(4706), AOM_ICDF(4788),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(24769), AOM_ICDF(25536), AOM_ICDF(31660), AOM_ICDF(31722),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(18113), AOM_ICDF(18886), AOM_ICDF(29420), AOM_ICDF(29534),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11201), AOM_ICDF(11412), AOM_ICDF(23207), AOM_ICDF(23291),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6977), AOM_ICDF(7033), AOM_ICDF(16599), AOM_ICDF(16646),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4033), AOM_ICDF(4070), AOM_ICDF(10375), AOM_ICDF(10441),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(1601), AOM_ICDF(1620), AOM_ICDF(4827), AOM_ICDF(4909),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(28353), AOM_ICDF(28831), AOM_ICDF(32502), AOM_ICDF(32517),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21441), AOM_ICDF(21869), AOM_ICDF(30977), AOM_ICDF(31017),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11969), AOM_ICDF(12088), AOM_ICDF(24116), AOM_ICDF(24158),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7489), AOM_ICDF(7547), AOM_ICDF(17413), AOM_ICDF(17458),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4545), AOM_ICDF(4585), AOM_ICDF(11325), AOM_ICDF(11388),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2113), AOM_ICDF(2133), AOM_ICDF(5526), AOM_ICDF(5606),
-          AOM_ICDF(32768) } } },
-    {   // Intra
-      { // Band 0
-        { AOM_ICDF(2496), AOM_ICDF(8717), AOM_ICDF(17280), AOM_ICDF(28922),
-          AOM_ICDF(29751), AOM_ICDF(32768) },
-        { AOM_ICDF(2496), AOM_ICDF(9665), AOM_ICDF(15235), AOM_ICDF(26542),
-          AOM_ICDF(27580), AOM_ICDF(32768) },
-        { AOM_ICDF(448), AOM_ICDF(9240), AOM_ICDF(11886), AOM_ICDF(24124),
-          AOM_ICDF(24898), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(21057), AOM_ICDF(22896), AOM_ICDF(31877), AOM_ICDF(31953),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(20673), AOM_ICDF(23151), AOM_ICDF(31706), AOM_ICDF(31825),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(18753), AOM_ICDF(20519), AOM_ICDF(30497), AOM_ICDF(30668),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15425), AOM_ICDF(16608), AOM_ICDF(27789), AOM_ICDF(28027),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10305), AOM_ICDF(10977), AOM_ICDF(21405), AOM_ICDF(21749),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3649), AOM_ICDF(3812), AOM_ICDF(11213), AOM_ICDF(11445),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(24001), AOM_ICDF(25899), AOM_ICDF(32307), AOM_ICDF(32360),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(20929), AOM_ICDF(22941), AOM_ICDF(31775), AOM_ICDF(31867),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15169), AOM_ICDF(16734), AOM_ICDF(29228), AOM_ICDF(29425),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10561), AOM_ICDF(12047), AOM_ICDF(24918), AOM_ICDF(25324),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6977), AOM_ICDF(7929), AOM_ICDF(18311), AOM_ICDF(18918),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3649), AOM_ICDF(3760), AOM_ICDF(9962), AOM_ICDF(10162),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(25793), AOM_ICDF(27526), AOM_ICDF(32565), AOM_ICDF(32591),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21825), AOM_ICDF(23885), AOM_ICDF(32064), AOM_ICDF(32135),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15041), AOM_ICDF(16286), AOM_ICDF(29203), AOM_ICDF(29360),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10433), AOM_ICDF(11058), AOM_ICDF(24349), AOM_ICDF(24538),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5569), AOM_ICDF(6016), AOM_ICDF(16460), AOM_ICDF(16794),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(26433), AOM_ICDF(28398), AOM_ICDF(32682), AOM_ICDF(32696),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(22977), AOM_ICDF(25086), AOM_ICDF(32367), AOM_ICDF(32412),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16577), AOM_ICDF(17928), AOM_ICDF(30144), AOM_ICDF(30275),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12481), AOM_ICDF(13352), AOM_ICDF(25993), AOM_ICDF(26211),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7745), AOM_ICDF(8069), AOM_ICDF(20501), AOM_ICDF(20657),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(27841), AOM_ICDF(29700), AOM_ICDF(32721), AOM_ICDF(32730),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(23873), AOM_ICDF(26202), AOM_ICDF(32578), AOM_ICDF(32604),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17729), AOM_ICDF(19046), AOM_ICDF(30448), AOM_ICDF(30568),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13505), AOM_ICDF(14508), AOM_ICDF(26034), AOM_ICDF(26304),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10049), AOM_ICDF(10494), AOM_ICDF(19945), AOM_ICDF(20233),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377),
-          AOM_ICDF(32768) } } } },
-  {     // UV plane
-    {   // Inter
-      { // Band 0
-        { AOM_ICDF(27072), AOM_ICDF(27916), AOM_ICDF(31095), AOM_ICDF(32400),
-          AOM_ICDF(32553), AOM_ICDF(32768) },
-        { AOM_ICDF(12352), AOM_ICDF(16792), AOM_ICDF(22516), AOM_ICDF(28853),
-          AOM_ICDF(29797), AOM_ICDF(32768) },
-        { AOM_ICDF(2880), AOM_ICDF(9023), AOM_ICDF(11126), AOM_ICDF(20602),
-          AOM_ICDF(21713), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(20161), AOM_ICDF(24785), AOM_ICDF(31070), AOM_ICDF(31430),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17985), AOM_ICDF(22773), AOM_ICDF(30430), AOM_ICDF(30880),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15937), AOM_ICDF(18802), AOM_ICDF(28265), AOM_ICDF(28788),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11841), AOM_ICDF(13587), AOM_ICDF(24798), AOM_ICDF(25335),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8769), AOM_ICDF(9160), AOM_ICDF(19316), AOM_ICDF(19566),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5313), AOM_ICDF(5357), AOM_ICDF(12874), AOM_ICDF(12932),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(24129), AOM_ICDF(26501), AOM_ICDF(31672), AOM_ICDF(31844),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(19649), AOM_ICDF(21553), AOM_ICDF(30130), AOM_ICDF(30370),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11713), AOM_ICDF(13134), AOM_ICDF(25983), AOM_ICDF(26321),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9409), AOM_ICDF(9948), AOM_ICDF(21408), AOM_ICDF(21663),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5569), AOM_ICDF(5757), AOM_ICDF(14335), AOM_ICDF(14533),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2241), AOM_ICDF(2305), AOM_ICDF(13152), AOM_ICDF(13209),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(26817), AOM_ICDF(28135), AOM_ICDF(32130), AOM_ICDF(32209),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(20161), AOM_ICDF(21412), AOM_ICDF(30331), AOM_ICDF(30481),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13377), AOM_ICDF(13798), AOM_ICDF(26065), AOM_ICDF(26176),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8129), AOM_ICDF(8290), AOM_ICDF(19920), AOM_ICDF(20008),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(5751), AOM_ICDF(14950), AOM_ICDF(15002),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5569), AOM_ICDF(5601), AOM_ICDF(11041), AOM_ICDF(11105),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(28225), AOM_ICDF(29079), AOM_ICDF(32387), AOM_ICDF(32426),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21185), AOM_ICDF(22046), AOM_ICDF(30982), AOM_ICDF(31061),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13377), AOM_ICDF(13595), AOM_ICDF(25762), AOM_ICDF(25824),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8001), AOM_ICDF(8123), AOM_ICDF(20530), AOM_ICDF(20590),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4289), AOM_ICDF(4322), AOM_ICDF(9907), AOM_ICDF(9974),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3393), AOM_ICDF(3412), AOM_ICDF(6663), AOM_ICDF(6739),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(30529), AOM_ICDF(31014), AOM_ICDF(32651), AOM_ICDF(32664),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(23489), AOM_ICDF(24268), AOM_ICDF(31627), AOM_ICDF(31682),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(14017), AOM_ICDF(14239), AOM_ICDF(26653), AOM_ICDF(26707),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11201), AOM_ICDF(11317), AOM_ICDF(23122), AOM_ICDF(23169),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6721), AOM_ICDF(6768), AOM_ICDF(14810), AOM_ICDF(14863),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6593), AOM_ICDF(6632), AOM_ICDF(13188), AOM_ICDF(13245),
-          AOM_ICDF(32768) } } },
-    {   // Inter
-      { // Band 0
-        { AOM_ICDF(29888), AOM_ICDF(30492), AOM_ICDF(32500), AOM_ICDF(32766),
-          AOM_ICDF(32767), AOM_ICDF(32768) },
-        { AOM_ICDF(18752), AOM_ICDF(23235), AOM_ICDF(29846), AOM_ICDF(32214),
-          AOM_ICDF(32442), AOM_ICDF(32768) },
-        { AOM_ICDF(5568), AOM_ICDF(17762), AOM_ICDF(25039), AOM_ICDF(31213),
-          AOM_ICDF(31651), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(26433), AOM_ICDF(29681), AOM_ICDF(32757), AOM_ICDF(32760),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(24769), AOM_ICDF(28761), AOM_ICDF(32722), AOM_ICDF(32734),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(22209), AOM_ICDF(26975), AOM_ICDF(32418), AOM_ICDF(32500),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16321), AOM_ICDF(21333), AOM_ICDF(28368), AOM_ICDF(29283),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12865), AOM_ICDF(14775), AOM_ICDF(22545), AOM_ICDF(23553),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12353), AOM_ICDF(12354), AOM_ICDF(12473), AOM_ICDF(12532),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(27457), AOM_ICDF(30005), AOM_ICDF(32738), AOM_ICDF(32745),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(24897), AOM_ICDF(27541), AOM_ICDF(32723), AOM_ICDF(32731),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15297), AOM_ICDF(19106), AOM_ICDF(30414), AOM_ICDF(30711),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6593), AOM_ICDF(8826), AOM_ICDF(19732), AOM_ICDF(20840),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4161), AOM_ICDF(4233), AOM_ICDF(16509), AOM_ICDF(16557),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(28609), AOM_ICDF(30482), AOM_ICDF(32761), AOM_ICDF(32763),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(25665), AOM_ICDF(27830), AOM_ICDF(32727), AOM_ICDF(32733),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21057), AOM_ICDF(23803), AOM_ICDF(30367), AOM_ICDF(30721),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10945), AOM_ICDF(21878), AOM_ICDF(32726), AOM_ICDF(32737),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(5750), AOM_ICDF(14739), AOM_ICDF(14792),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2881), AOM_ICDF(2913), AOM_ICDF(8427), AOM_ICDF(8498),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(28993), AOM_ICDF(30944), AOM_ICDF(32762), AOM_ICDF(32764),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(26561), AOM_ICDF(28695), AOM_ICDF(32733), AOM_ICDF(32739),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17985), AOM_ICDF(19028), AOM_ICDF(31008), AOM_ICDF(31079),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7873), AOM_ICDF(8039), AOM_ICDF(19981), AOM_ICDF(20068),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5313), AOM_ICDF(5366), AOM_ICDF(14376), AOM_ICDF(14430),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(30273), AOM_ICDF(32029), AOM_ICDF(32764), AOM_ICDF(32766),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(28609), AOM_ICDF(30847), AOM_ICDF(32745), AOM_ICDF(32751),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21313), AOM_ICDF(24377), AOM_ICDF(31986), AOM_ICDF(32098),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(32705), AOM_ICDF(32709), AOM_ICDF(32739), AOM_ICDF(32741),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4929), AOM_ICDF(5579), AOM_ICDF(16402), AOM_ICDF(16866),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533),
-          AOM_ICDF(32768) } } } }
-};
-static const coeff_cdf_model default_coef_head_cdf_32x32[PLANE_TYPES] = {
-  {     // Y plane
-    {   // Intra
-      { // Band 0
-        { AOM_ICDF(2240), AOM_ICDF(5407), AOM_ICDF(18304), AOM_ICDF(25601),
-          AOM_ICDF(27911), AOM_ICDF(32768) },
-        { AOM_ICDF(960), AOM_ICDF(4633), AOM_ICDF(8197), AOM_ICDF(16254),
-          AOM_ICDF(18796), AOM_ICDF(32768) },
-        { AOM_ICDF(192), AOM_ICDF(3061), AOM_ICDF(3557), AOM_ICDF(8701),
-          AOM_ICDF(9762), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(11969), AOM_ICDF(15846), AOM_ICDF(25660), AOM_ICDF(26667),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11713), AOM_ICDF(15794), AOM_ICDF(25737), AOM_ICDF(26760),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9281), AOM_ICDF(12675), AOM_ICDF(23181), AOM_ICDF(24351),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7105), AOM_ICDF(8757), AOM_ICDF(18383), AOM_ICDF(19437),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4289), AOM_ICDF(4579), AOM_ICDF(11353), AOM_ICDF(11792),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(1857), AOM_ICDF(1874), AOM_ICDF(4695), AOM_ICDF(4777),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(20929), AOM_ICDF(22297), AOM_ICDF(29370), AOM_ICDF(29646),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17473), AOM_ICDF(18985), AOM_ICDF(28079), AOM_ICDF(28413),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13121), AOM_ICDF(14064), AOM_ICDF(24902), AOM_ICDF(25217),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9793), AOM_ICDF(10214), AOM_ICDF(20069), AOM_ICDF(20329),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5825), AOM_ICDF(5987), AOM_ICDF(13350), AOM_ICDF(13559),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2241), AOM_ICDF(2260), AOM_ICDF(5520), AOM_ICDF(5600),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(25921), AOM_ICDF(26891), AOM_ICDF(31632), AOM_ICDF(31729),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(18241), AOM_ICDF(19463), AOM_ICDF(29222), AOM_ICDF(29419),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11585), AOM_ICDF(12065), AOM_ICDF(23294), AOM_ICDF(23488),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6593), AOM_ICDF(6686), AOM_ICDF(16153), AOM_ICDF(16234),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3137), AOM_ICDF(3170), AOM_ICDF(8751), AOM_ICDF(8821),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(1345), AOM_ICDF(1359), AOM_ICDF(3739), AOM_ICDF(3824),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(27713), AOM_ICDF(28504), AOM_ICDF(32068), AOM_ICDF(32132),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(19265), AOM_ICDF(20354), AOM_ICDF(29789), AOM_ICDF(29943),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11201), AOM_ICDF(11538), AOM_ICDF(22701), AOM_ICDF(22848),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6337), AOM_ICDF(6424), AOM_ICDF(15268), AOM_ICDF(15353),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3649), AOM_ICDF(3681), AOM_ICDF(9052), AOM_ICDF(9121),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(1601), AOM_ICDF(1618), AOM_ICDF(4584), AOM_ICDF(4667),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(30913), AOM_ICDF(31044), AOM_ICDF(32635), AOM_ICDF(32640),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(22081), AOM_ICDF(22261), AOM_ICDF(30452), AOM_ICDF(30477),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10561), AOM_ICDF(10625), AOM_ICDF(21535), AOM_ICDF(21568),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6081), AOM_ICDF(6130), AOM_ICDF(14369), AOM_ICDF(14423),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3777), AOM_ICDF(3809), AOM_ICDF(9156), AOM_ICDF(9225),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(1857), AOM_ICDF(1875), AOM_ICDF(4936), AOM_ICDF(5018),
-          AOM_ICDF(32768) } } },
-    {   // Intra
-      { // Band 0
-        { AOM_ICDF(4672), AOM_ICDF(6927), AOM_ICDF(23534), AOM_ICDF(29846),
-          AOM_ICDF(30928), AOM_ICDF(32768) },
-        { AOM_ICDF(3776), AOM_ICDF(6784), AOM_ICDF(18075), AOM_ICDF(25863),
-          AOM_ICDF(27926), AOM_ICDF(32768) },
-        { AOM_ICDF(1344), AOM_ICDF(5588), AOM_ICDF(12166), AOM_ICDF(20966),
-          AOM_ICDF(23504), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(19393), AOM_ICDF(22016), AOM_ICDF(31280), AOM_ICDF(31444),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21185), AOM_ICDF(24329), AOM_ICDF(31706), AOM_ICDF(31865),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(20673), AOM_ICDF(23240), AOM_ICDF(31186), AOM_ICDF(31379),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17857), AOM_ICDF(20035), AOM_ICDF(29594), AOM_ICDF(29889),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13633), AOM_ICDF(14929), AOM_ICDF(24883), AOM_ICDF(25337),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7873), AOM_ICDF(8416), AOM_ICDF(17452), AOM_ICDF(17886),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(25665), AOM_ICDF(27145), AOM_ICDF(32256), AOM_ICDF(32314),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21057), AOM_ICDF(22826), AOM_ICDF(31465), AOM_ICDF(31576),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13633), AOM_ICDF(14885), AOM_ICDF(27873), AOM_ICDF(28088),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8769), AOM_ICDF(9515), AOM_ICDF(21941), AOM_ICDF(22248),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(6209), AOM_ICDF(6594), AOM_ICDF(15598), AOM_ICDF(15950),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(1985), AOM_ICDF(2014), AOM_ICDF(6855), AOM_ICDF(6931),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(26817), AOM_ICDF(27824), AOM_ICDF(32362), AOM_ICDF(32399),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21185), AOM_ICDF(22321), AOM_ICDF(31389), AOM_ICDF(31466),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13761), AOM_ICDF(14154), AOM_ICDF(27163), AOM_ICDF(27245),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8897), AOM_ICDF(9011), AOM_ICDF(20600), AOM_ICDF(20659),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4673), AOM_ICDF(4774), AOM_ICDF(15044), AOM_ICDF(15131),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(28865), AOM_ICDF(29687), AOM_ICDF(32655), AOM_ICDF(32667),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(23233), AOM_ICDF(24218), AOM_ICDF(32080), AOM_ICDF(32118),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15041), AOM_ICDF(15444), AOM_ICDF(28787), AOM_ICDF(28845),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9921), AOM_ICDF(10248), AOM_ICDF(22818), AOM_ICDF(22944),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7745), AOM_ICDF(7866), AOM_ICDF(16591), AOM_ICDF(16702),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(31169), AOM_ICDF(31559), AOM_ICDF(32741), AOM_ICDF(32744),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(24769), AOM_ICDF(25583), AOM_ICDF(32347), AOM_ICDF(32370),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15937), AOM_ICDF(16169), AOM_ICDF(29120), AOM_ICDF(29152),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7489), AOM_ICDF(7578), AOM_ICDF(22647), AOM_ICDF(22677),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7617), AOM_ICDF(7689), AOM_ICDF(19849), AOM_ICDF(19887),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2113), AOM_ICDF(2183), AOM_ICDF(7202), AOM_ICDF(7377),
-          AOM_ICDF(32768) } } } },
-  {     // UV plane
-    {   // Inter
-      { // Band 0
-        { AOM_ICDF(23232), AOM_ICDF(24301), AOM_ICDF(30231), AOM_ICDF(31582),
-          AOM_ICDF(32091), AOM_ICDF(32768) },
-        { AOM_ICDF(7872), AOM_ICDF(11041), AOM_ICDF(22542), AOM_ICDF(27086),
-          AOM_ICDF(29145), AOM_ICDF(32768) },
-        { AOM_ICDF(1344), AOM_ICDF(3989), AOM_ICDF(18125), AOM_ICDF(25340),
-          AOM_ICDF(27820), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(15937), AOM_ICDF(29000), AOM_ICDF(32210), AOM_ICDF(32434),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(12353), AOM_ICDF(26626), AOM_ICDF(31533), AOM_ICDF(31993),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11457), AOM_ICDF(29187), AOM_ICDF(30896), AOM_ICDF(31750),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(21278), AOM_ICDF(28169), AOM_ICDF(29764),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(7489), AOM_ICDF(8855), AOM_ICDF(13365), AOM_ICDF(15620),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4289), AOM_ICDF(4833), AOM_ICDF(8572), AOM_ICDF(10108),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(25025), AOM_ICDF(30783), AOM_ICDF(32603), AOM_ICDF(32666),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(24385), AOM_ICDF(29586), AOM_ICDF(31803), AOM_ICDF(32142),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(22337), AOM_ICDF(23002), AOM_ICDF(27573), AOM_ICDF(27903),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10945), AOM_ICDF(12336), AOM_ICDF(21900), AOM_ICDF(22590),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(8257), AOM_ICDF(8830), AOM_ICDF(19986), AOM_ICDF(20298),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10945), AOM_ICDF(10990), AOM_ICDF(18660), AOM_ICDF(18701),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(29761), AOM_ICDF(31473), AOM_ICDF(32693), AOM_ICDF(32715),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(20417), AOM_ICDF(24512), AOM_ICDF(31394), AOM_ICDF(31650),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(11713), AOM_ICDF(13283), AOM_ICDF(25819), AOM_ICDF(26206),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13121), AOM_ICDF(14099), AOM_ICDF(21909), AOM_ICDF(22514),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(248), AOM_ICDF(9546), AOM_ICDF(9614),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2497), AOM_ICDF(2524), AOM_ICDF(7050), AOM_ICDF(7125),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(30657), AOM_ICDF(31885), AOM_ICDF(32691), AOM_ICDF(32715),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(19393), AOM_ICDF(26050), AOM_ICDF(31698), AOM_ICDF(31988),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(15809), AOM_ICDF(15863), AOM_ICDF(24985), AOM_ICDF(25008),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(23489), AOM_ICDF(28138), AOM_ICDF(32751), AOM_ICDF(32756),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16449), AOM_ICDF(16450), AOM_ICDF(16545), AOM_ICDF(16593),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2369), AOM_ICDF(2395), AOM_ICDF(6822), AOM_ICDF(6898),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(32705), AOM_ICDF(32744), AOM_ICDF(32766), AOM_ICDF(32767),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21953), AOM_ICDF(24962), AOM_ICDF(32156), AOM_ICDF(32246),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(13121), AOM_ICDF(15358), AOM_ICDF(26284), AOM_ICDF(26835),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(7417), AOM_ICDF(20132), AOM_ICDF(20885),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4417), AOM_ICDF(4939), AOM_ICDF(15104), AOM_ICDF(15535),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2625), AOM_ICDF(2680), AOM_ICDF(8218), AOM_ICDF(8338),
-          AOM_ICDF(32768) } } },
-    {   // Inter
-      { // Band 0
-        { AOM_ICDF(25280), AOM_ICDF(25678), AOM_ICDF(32446), AOM_ICDF(32622),
-          AOM_ICDF(32724), AOM_ICDF(32768) },
-        { AOM_ICDF(10560), AOM_ICDF(11822), AOM_ICDF(28682), AOM_ICDF(29919),
-          AOM_ICDF(31276), AOM_ICDF(32768) },
-        { AOM_ICDF(3264), AOM_ICDF(5170), AOM_ICDF(21779), AOM_ICDF(24026),
-          AOM_ICDF(27905), AOM_ICDF(32768) } },
-      { // Band 1
-        { AOM_ICDF(24257), AOM_ICDF(30554), AOM_ICDF(32719), AOM_ICDF(32738),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17217), AOM_ICDF(27413), AOM_ICDF(32617), AOM_ICDF(32667),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(22977), AOM_ICDF(27600), AOM_ICDF(32482), AOM_ICDF(32552),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16833), AOM_ICDF(24360), AOM_ICDF(30746), AOM_ICDF(31293),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(17089), AOM_ICDF(20060), AOM_ICDF(28880), AOM_ICDF(29370),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(10945), AOM_ICDF(11009), AOM_ICDF(21900), AOM_ICDF(21932),
-          AOM_ICDF(32768) } },
-      { // Band 2
-        { AOM_ICDF(27201), AOM_ICDF(30217), AOM_ICDF(32736), AOM_ICDF(32745),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(22721), AOM_ICDF(27676), AOM_ICDF(32749), AOM_ICDF(32754),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5057), AOM_ICDF(12431), AOM_ICDF(25246), AOM_ICDF(26620),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(321), AOM_ICDF(22016), AOM_ICDF(22048),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5313), AOM_ICDF(5363), AOM_ICDF(13839), AOM_ICDF(13894),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2625), AOM_ICDF(2652), AOM_ICDF(7276), AOM_ICDF(7351),
-          AOM_ICDF(32768) } },
-      { // Band 3
-        { AOM_ICDF(27713), AOM_ICDF(30739), AOM_ICDF(32759), AOM_ICDF(32762),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(26177), AOM_ICDF(30430), AOM_ICDF(32756), AOM_ICDF(32760),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(384), AOM_ICDF(32706), AOM_ICDF(32707),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(9409), AOM_ICDF(9528), AOM_ICDF(21591), AOM_ICDF(21646),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2881), AOM_ICDF(2913), AOM_ICDF(8427), AOM_ICDF(8498),
-          AOM_ICDF(32768) } },
-      { // Band 4
-        { AOM_ICDF(28993), AOM_ICDF(31156), AOM_ICDF(32747), AOM_ICDF(32753),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(25153), AOM_ICDF(28701), AOM_ICDF(32754), AOM_ICDF(32758),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(16449), AOM_ICDF(16544), AOM_ICDF(32737), AOM_ICDF(32738),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(321), AOM_ICDF(22016), AOM_ICDF(22048),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(193), AOM_ICDF(194), AOM_ICDF(384), AOM_ICDF(479),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(2753), AOM_ICDF(2789), AOM_ICDF(8909), AOM_ICDF(8979),
-          AOM_ICDF(32768) } },
-      { // Band 5
-        { AOM_ICDF(30785), AOM_ICDF(32088), AOM_ICDF(32765), AOM_ICDF(32766),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(22977), AOM_ICDF(26623), AOM_ICDF(32750), AOM_ICDF(32754),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(21953), AOM_ICDF(21954), AOM_ICDF(22017), AOM_ICDF(22049),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(5697), AOM_ICDF(7486), AOM_ICDF(20238), AOM_ICDF(21009),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(4929), AOM_ICDF(5579), AOM_ICDF(16402), AOM_ICDF(16866),
-          AOM_ICDF(32768) },
-        { AOM_ICDF(3009), AOM_ICDF(3246), AOM_ICDF(10158), AOM_ICDF(10533),
-          AOM_ICDF(32768) } } } }
-};
-
-/* clang-format on */
-#endif  // !CONFIG_Q_ADAPT_PROBS
-
-static void build_tail_cdfs(aom_cdf_prob cdf_tail[CDF_SIZE(ENTROPY_TOKENS)],
-                            aom_cdf_prob cdf_head[CDF_SIZE(ENTROPY_TOKENS)],
-                            int band_zero) {
-  int probNZ, prob1, prob_idx, i;
-  int phead[HEAD_TOKENS + 1], sum;
-  const int is_dc = !!band_zero;
-  aom_cdf_prob prev_cdf;
-  prev_cdf = 0;
-  for (i = 0; i < HEAD_TOKENS + is_dc; ++i) {
-    phead[i] = AOM_ICDF(cdf_head[i]) - prev_cdf;
-    prev_cdf = AOM_ICDF(cdf_head[i]);
-  }
-  // Do the tail
-  probNZ = CDF_PROB_TOP - phead[ZERO_TOKEN + is_dc] - (is_dc ? phead[0] : 0);
-  prob1 = phead[is_dc + ONE_TOKEN_EOB] + phead[is_dc + ONE_TOKEN_NEOB];
-  prob_idx =
-      AOMMIN(COEFF_PROB_MODELS - 1, AOMMAX(0, ((256 * prob1) / probNZ) - 1));
-
-  sum = 0;
-  for (i = 0; i < TAIL_TOKENS; ++i) {
-    sum += av1_pareto8_tail_probs[prob_idx][i];
-    cdf_tail[i] = AOM_ICDF(sum);
-  }
-}
-
-#if !CONFIG_Q_ADAPT_PROBS
-// FIXME. Optimize for TX_2X2 and TX_64X64.
-static void av1_default_coef_cdfs(FRAME_CONTEXT *fc) {
-#if CONFIG_CHROMA_2X2
-  av1_copy(fc->coef_head_cdfs[TX_2X2], default_coef_head_cdf_4x4);
-#endif  // CONFIG_CHROMA_2X2
-  av1_copy(fc->coef_head_cdfs[TX_4X4], default_coef_head_cdf_4x4);
-  av1_copy(fc->coef_head_cdfs[TX_8X8], default_coef_head_cdf_8x8);
-  av1_copy(fc->coef_head_cdfs[TX_16X16], default_coef_head_cdf_16x16);
-  av1_copy(fc->coef_head_cdfs[TX_32X32], default_coef_head_cdf_32x32);
-#if CONFIG_TX64X64
-  av1_copy(fc->coef_head_cdfs[TX_64X64], default_coef_head_cdf_32x32);
-#endif  // CONFIG_TX64X64
-}
-#endif  // !CONFIG_Q_ADAPT_PROBS
-
-void av1_coef_pareto_cdfs(FRAME_CONTEXT *fc) {
-  /* Build the tail based on a Pareto distribution */
-  TX_SIZE t;
-  int i, j, k, l;
-  for (t = 0; t < TX_SIZES; ++t)
-    for (i = 0; i < PLANE_TYPES; ++i)
-      for (j = 0; j < REF_TYPES; ++j)
-        for (k = 0; k < COEF_BANDS; ++k)
-          for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
-            build_tail_cdfs(fc->coef_tail_cdfs[t][i][j][k][l],
-                            fc->coef_head_cdfs[t][i][j][k][l], k == 0);
+static int get_q_ctx(int q) {
+  if (q <= 20) return 0;
+  if (q <= 60) return 1;
+  if (q <= 120) return 2;
+  return 3;
 }
 
 void av1_default_coef_probs(AV1_COMMON *cm) {
-#if CONFIG_Q_ADAPT_PROBS
-  const int index = AOMMIN(TOKEN_CDF_Q_CTXS - 1, cm->base_qindex / 64);
-#if CONFIG_CHROMA_2X2
-  av1_copy(cm->fc->coef_head_cdfs[TX_2X2],
-           (*av1_default_qctx_coef_cdfs[index])[TX_4X4]);
-#endif  // CONFIG_CHROMA_2X2
-  av1_copy(cm->fc->coef_head_cdfs[TX_4X4],
-           (*av1_default_qctx_coef_cdfs[index])[TX_4X4]);
-  av1_copy(cm->fc->coef_head_cdfs[TX_8X8],
-           (*av1_default_qctx_coef_cdfs[index])[TX_8X8]);
-  av1_copy(cm->fc->coef_head_cdfs[TX_16X16],
-           (*av1_default_qctx_coef_cdfs[index])[TX_16X16]);
-  av1_copy(cm->fc->coef_head_cdfs[TX_32X32],
-           (*av1_default_qctx_coef_cdfs[index])[TX_32X32]);
-#if CONFIG_TX64X64
-  av1_copy(cm->fc->coef_head_cdfs[TX_64X64],
-           (*av1_default_qctx_coef_cdfs[index])[TX_32X32]);
-#endif  // CONFIG_TX64X64
-#else
-  /* Load the head tokens */
-  av1_default_coef_cdfs(cm->fc);
-#endif  // CONFIG_Q_ADAPT_PROBS
-  av1_coef_pareto_cdfs(cm->fc);
+  const int index = get_q_ctx(cm->base_qindex);
+#if CONFIG_ENTROPY_STATS
+  cm->coef_cdf_category = index;
+#endif
+
+  av1_copy(cm->fc->txb_skip_cdf, av1_default_txb_skip_cdfs[index]);
+  av1_copy(cm->fc->eob_extra_cdf, av1_default_eob_extra_cdfs[index]);
+  av1_copy(cm->fc->dc_sign_cdf, av1_default_dc_sign_cdfs[index]);
+  av1_copy(cm->fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[index]);
+  av1_copy(cm->fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[index]);
+  av1_copy(cm->fc->coeff_base_eob_cdf,
+           av1_default_coeff_base_eob_multi_cdfs[index]);
+  av1_copy(cm->fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[index]);
+  av1_copy(cm->fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[index]);
+  av1_copy(cm->fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[index]);
+  av1_copy(cm->fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[index]);
+  av1_copy(cm->fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[index]);
+  av1_copy(cm->fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[index]);
+  av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]);
 }
 
-#if CONFIG_LV_MAP
-void av1_adapt_coef_probs(AV1_COMMON *cm) {
-  unsigned int count_sat, update_factor;
-  if (!frame_is_intra_only(cm) && cm->last_frame_type == KEY_FRAME) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */
-    count_sat = COEF_COUNT_SAT_AFTER_KEY;
-  } else {
-    update_factor = COEF_MAX_UPDATE_FACTOR;
-    count_sat = COEF_COUNT_SAT;
+static void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs,
+                                     int cdf_stride, int nsymbs) {
+  for (int i = 0; i < num_cdfs; i++) {
+    cdf_ptr[i * cdf_stride + nsymbs] = 0;
   }
-  av1_adapt_txb_probs(cm, count_sat, update_factor);
 }
-#endif  // CONFIG_LV_MAP
 
-static void av1_average_cdf(aom_cdf_prob *cdf_ptr[], aom_cdf_prob *fc_cdf_ptr,
-                            int cdf_size, const int num_tiles) {
-  int i;
-  for (i = 0; i < cdf_size;) {
-    do {
-      int sum = 0;
-      int j;
-      assert(i < cdf_size);
-      for (j = 0; j < num_tiles; ++j) sum += AOM_ICDF(cdf_ptr[j][i]);
-      fc_cdf_ptr[i] = AOM_ICDF(sum / num_tiles);
-    } while (fc_cdf_ptr[i++] != AOM_ICDF(CDF_PROB_TOP));
-    // Zero symbol counts for the next frame
-    assert(i < cdf_size);
-    fc_cdf_ptr[i++] = 0;
-    // Skip trailing zeros until the start of the next CDF.
-    for (; i < cdf_size && fc_cdf_ptr[i] == 0; ++i) {
-    }
+#define RESET_CDF_COUNTER(cname, nsymbs) \
+  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
+
+#define RESET_CDF_COUNTER_STRIDE(cname, nsymbs, cdf_stride)          \
+  do {                                                               \
+    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
+    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
+    int num_cdfs = array_size / cdf_stride;                          \
+    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
+  } while (0)
+
+static void reset_nmv_counter(nmv_context *nmv) {
+  RESET_CDF_COUNTER(nmv->joints_cdf, 4);
+  for (int i = 0; i < 2; i++) {
+    RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES);
+    RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE);
+    RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE);
+    RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2);
+    RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2);
+    RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2);
+    RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE);
+    RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2);
   }
 }
 
-#define AVERAGE_TILE_CDFS(cname)                            \
-  for (i = 0; i < num_tiles; ++i)                           \
-    cdf_ptr[i] = (aom_cdf_prob *)&ec_ctxs[i]->cname;        \
-  fc_cdf_ptr = (aom_cdf_prob *)&fc->cname;                  \
-  cdf_size = (int)sizeof(fc->cname) / sizeof(aom_cdf_prob); \
-  av1_average_cdf(cdf_ptr, fc_cdf_ptr, cdf_size, num_tiles);
-
-void av1_average_tile_coef_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[],
-                                aom_cdf_prob *cdf_ptr[], int num_tiles) {
-  int i, cdf_size;
-
-  aom_cdf_prob *fc_cdf_ptr;
-
-#if CONFIG_LV_MAP
-  AVERAGE_TILE_CDFS(txb_skip_cdf)
-  AVERAGE_TILE_CDFS(nz_map_cdf)
-  AVERAGE_TILE_CDFS(eob_flag_cdf)
-  AVERAGE_TILE_CDFS(dc_sign_cdf)
-  AVERAGE_TILE_CDFS(coeff_base_cdf)
-  AVERAGE_TILE_CDFS(coeff_lps_cdf)
-#if BR_NODE
-  AVERAGE_TILE_CDFS(coeff_br_cdf)
-#endif
-#if CONFIG_CTX1D
-  AVERAGE_TILE_CDFS(eob_mode_cdf)
-  AVERAGE_TILE_CDFS(empty_line_cdf)
-  AVERAGE_TILE_CDFS(hv_eob_cdf)
-#endif
-#else
-  AVERAGE_TILE_CDFS(coef_head_cdfs)
-  AVERAGE_TILE_CDFS(coef_tail_cdfs)
-#endif
-}
-
-void av1_average_tile_mv_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[],
-                              aom_cdf_prob *cdf_ptr[], int num_tiles) {
-  int i, k, cdf_size;
-
-  aom_cdf_prob *fc_cdf_ptr;
-
-  int j;
-  for (j = 0; j < NMV_CONTEXTS; ++j) {
-    AVERAGE_TILE_CDFS(nmvc[j].joint_cdf)
-
-    for (k = 0; k < 2; ++k) {
-      AVERAGE_TILE_CDFS(nmvc[j].comps[k].class_cdf)
-      AVERAGE_TILE_CDFS(nmvc[j].comps[k].class0_fp_cdf)
-      AVERAGE_TILE_CDFS(nmvc[j].comps[k].fp_cdf)
-#if CONFIG_NEW_MULTISYMBOL
-      AVERAGE_TILE_CDFS(nmvc[j].comps[k].hp_cdf)
-      AVERAGE_TILE_CDFS(nmvc[j].comps[k].class0_hp_cdf)
-      AVERAGE_TILE_CDFS(nmvc[j].comps[k].class0_cdf)
-      AVERAGE_TILE_CDFS(nmvc[j].comps[k].bits_cdf)
-#endif
-    }
+void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) {
+  RESET_CDF_COUNTER(fc->txb_skip_cdf, 2);
+  RESET_CDF_COUNTER(fc->eob_extra_cdf, 2);
+  RESET_CDF_COUNTER(fc->dc_sign_cdf, 2);
+  RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5);
+  RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6);
+  RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7);
+  RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8);
+  RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9);
+  RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10);
+  RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11);
+  RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3);
+  RESET_CDF_COUNTER(fc->coeff_base_cdf, 4);
+  RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE);
+  RESET_CDF_COUNTER(fc->newmv_cdf, 2);
+  RESET_CDF_COUNTER(fc->zeromv_cdf, 2);
+  RESET_CDF_COUNTER(fc->refmv_cdf, 2);
+  RESET_CDF_COUNTER(fc->drl_cdf, 2);
+  RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
+  RESET_CDF_COUNTER(fc->compound_type_cdf, COMPOUND_TYPES - 1);
+  RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16);
+  RESET_CDF_COUNTER(fc->interintra_cdf, 2);
+  RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2);
+  RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES);
+  RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES);
+  RESET_CDF_COUNTER(fc->obmc_cdf, 2);
+  RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES);
+  RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES);
+  for (int j = 0; j < PALETTE_SIZES; j++) {
+    int nsymbs = j + PALETTE_MIN_SIZE;
+    RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs,
+                             CDF_SIZE(PALETTE_COLORS));
+    RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs,
+                             CDF_SIZE(PALETTE_COLORS));
   }
-}
-
-void av1_average_tile_intra_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[],
-                                 aom_cdf_prob *cdf_ptr[], int num_tiles) {
-  int i, cdf_size;
-
-  aom_cdf_prob *fc_cdf_ptr;
-
-  AVERAGE_TILE_CDFS(tx_size_cdf)
-
-  AVERAGE_TILE_CDFS(intra_ext_tx_cdf)
-  AVERAGE_TILE_CDFS(inter_ext_tx_cdf)
-
-  AVERAGE_TILE_CDFS(seg.tree_cdf)
-#if CONFIG_NEW_MULTISYMBOL
-  AVERAGE_TILE_CDFS(seg.pred_cdf)
-#endif
-  AVERAGE_TILE_CDFS(uv_mode_cdf)
-
-#if CONFIG_CFL
-  AVERAGE_TILE_CDFS(cfl_sign_cdf)
-  AVERAGE_TILE_CDFS(cfl_alpha_cdf)
-#endif
-
-  AVERAGE_TILE_CDFS(partition_cdf)
-
-  AVERAGE_TILE_CDFS(delta_q_cdf)
-#if CONFIG_EXT_DELTA_Q
-  AVERAGE_TILE_CDFS(delta_lf_cdf)
-#endif
-#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-  AVERAGE_TILE_CDFS(intra_filter_cdf)
-#endif  // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-
-#if CONFIG_NEW_MULTISYMBOL
-  AVERAGE_TILE_CDFS(skip_cdfs)
-#if CONFIG_VAR_TX
-  AVERAGE_TILE_CDFS(txfm_partition_cdf)
-#endif
-#endif  // CONFIG_NEW_MULTISYMBOL
-  AVERAGE_TILE_CDFS(palette_y_size_cdf)
-  AVERAGE_TILE_CDFS(palette_uv_size_cdf)
-  AVERAGE_TILE_CDFS(palette_y_color_index_cdf)
-  AVERAGE_TILE_CDFS(palette_uv_color_index_cdf)
-#if CONFIG_MRC_TX
-  AVERAGE_TILE_CDFS(mrc_mask_intra_cdf)
-#endif  // CONFIG_MRC_TX
-#if CONFIG_NEW_MULTISYMBOL
-  AVERAGE_TILE_CDFS(palette_y_mode_cdf)
-  AVERAGE_TILE_CDFS(palette_uv_mode_cdf)
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-  AVERAGE_TILE_CDFS(quarter_tx_size_cdf)
-#endif
-#endif
-#if CONFIG_LPF_SB
-  AVERAGE_TILE_CDFS(lpf_reuse_cdf);
-  AVERAGE_TILE_CDFS(lpf_delta_cdf);
-  AVERAGE_TILE_CDFS(lpf_sign_cdf);
-#endif  // CONFIG_LPF_SB
-}
-
-void av1_average_tile_inter_cdfs(AV1_COMMON *cm, FRAME_CONTEXT *fc,
-                                 FRAME_CONTEXT *ec_ctxs[],
-                                 aom_cdf_prob *cdf_ptr[], int num_tiles) {
-  int i, cdf_size;
-
-  aom_cdf_prob *fc_cdf_ptr;
-
-#if CONFIG_NEW_MULTISYMBOL
-  AVERAGE_TILE_CDFS(comp_inter_cdf)
-#if CONFIG_EXT_REFS
-  AVERAGE_TILE_CDFS(comp_ref_cdf)
-  AVERAGE_TILE_CDFS(comp_bwdref_cdf)
-#endif
-#endif
-
-#if CONFIG_NEW_MULTISYMBOL
-  AVERAGE_TILE_CDFS(single_ref_cdf)
-
-  AVERAGE_TILE_CDFS(newmv_cdf)
-  AVERAGE_TILE_CDFS(zeromv_cdf)
-  AVERAGE_TILE_CDFS(refmv_cdf)
-  AVERAGE_TILE_CDFS(drl_cdf)
-#if CONFIG_EXT_COMP_REFS
-  AVERAGE_TILE_CDFS(uni_comp_ref_cdf)
-  AVERAGE_TILE_CDFS(comp_ref_type_cdf)
-#endif
-#endif
-
-  // FIXME: cdfs not defined for super_tx
-
-  AVERAGE_TILE_CDFS(inter_compound_mode_cdf)
-
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-  AVERAGE_TILE_CDFS(compound_type_cdf)
-#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-#if CONFIG_COMPOUND_SINGLEREF
-  AVERAGE_TILE_CDFS(inter_singleref_comp_mode_cdf)
-#endif
-
-#if CONFIG_INTERINTRA
-#if CONFIG_NEW_MULTISYMBOL
-  AVERAGE_TILE_CDFS(interintra_cdf)
-  AVERAGE_TILE_CDFS(wedge_interintra_cdf)
-#endif
-  AVERAGE_TILE_CDFS(interintra_mode_cdf)
-#endif
-
-  /* NB: kf_y_cdf is discarded after use, so no need
-     for backwards update */
-  AVERAGE_TILE_CDFS(y_mode_cdf)
-
-  if (cm->interp_filter == SWITCHABLE) {
-    AVERAGE_TILE_CDFS(switchable_interp_cdf)
-  }
-#if CONFIG_NEW_MULTISYMBOL
-  AVERAGE_TILE_CDFS(intra_inter_cdf)
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  AVERAGE_TILE_CDFS(motion_mode_cdf)
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-  AVERAGE_TILE_CDFS(obmc_cdf)
-#endif
-#endif
-#endif
-#if CONFIG_MRC_TX
-  AVERAGE_TILE_CDFS(mrc_mask_inter_cdf)
-#endif  // CONFIG_MRC_TX
-#if CONFIG_LPF_SB
-  AVERAGE_TILE_CDFS(lpf_reuse_cdf);
-  AVERAGE_TILE_CDFS(lpf_delta_cdf);
-  AVERAGE_TILE_CDFS(lpf_sign_cdf);
-#endif  // CONFIG_LPF_SB
-}
-
-#if CONFIG_PVQ
-// Averaging PVQ's expected values for symbol coding
-static void av1_average_pvq_ex(int *cxt_ptr[], int *fc_cxt_ptr, int cxt_size,
-                               const int num_tiles) {
-  int i, j;
-  for (i = 0; i < cxt_size; ++i) {
-    int sum = 0;
-    for (j = 0; j < num_tiles; ++j) sum += cxt_ptr[j][i];
-    fc_cxt_ptr[i] = sum / num_tiles;
-  }
-}
-
-#define AVERAGE_TILE_PVQ_EX(cname)                                        \
-  for (i = 0; i < num_tiles; ++i) cxt_ptr[i] = (int *)&ec_ctxs[i]->cname; \
-  fc_cxt_ptr = (int *)&fc->cname;                                         \
-  cxt_size = (int)sizeof(fc->cname) / sizeof(int);                        \
-  av1_average_pvq_ex(cxt_ptr, fc_cxt_ptr, cxt_size, num_tiles);
-
-void av1_default_pvq_probs(AV1_COMMON *cm) {
-  od_adapt_ctx *adapt = &cm->fc->pvq_context;
-
-  // Init with flat probabilities.
-  od_adapt_ctx_reset(adapt, 0);
-
-  // TODO(yushin): Prepare offline cdf and context table for PVQ,
-  // i.e. od_adapt_ctx, then load them from table,
-  // for example od_adapt_ctx default_pvq_context.
-  // Then do sth like this:
-  // av1_copy(cm->fc->pvq_context, default_pvq_context);
-}
-
-void av1_average_tile_pvq_cdfs(FRAME_CONTEXT *fc, FRAME_CONTEXT *ec_ctxs[],
-                               const int num_tiles) {
-  int i, j, cdf_size, cxt_size;
-
-  aom_cdf_prob *cdf_ptr[MAX_TILE_ROWS * MAX_TILE_COLS];
-  aom_cdf_prob *fc_cdf_ptr;
-  int *cxt_ptr[MAX_TILE_ROWS * MAX_TILE_COLS];
-  int *fc_cxt_ptr;
-
-  AVERAGE_TILE_PVQ_EX(pvq_context.ex_dc)
-  AVERAGE_TILE_PVQ_EX(pvq_context.ex_g)
-
-  for (j = 0; j < OD_NPLANES_MAX; j++) {
-    AVERAGE_TILE_CDFS(pvq_context.model_dc[j].cdf)
+  RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2);
+  RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2);
+  RESET_CDF_COUNTER(fc->comp_inter_cdf, 2);
+  RESET_CDF_COUNTER(fc->single_ref_cdf, 2);
+  RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2);
+  RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2);
+  RESET_CDF_COUNTER(fc->comp_ref_cdf, 2);
+  RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2);
+  RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2);
+  RESET_CDF_COUNTER(fc->compound_index_cdf, 2);
+  RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2);
+  RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2);
+  RESET_CDF_COUNTER(fc->skip_cdfs, 2);
+  RESET_CDF_COUNTER(fc->intra_inter_cdf, 2);
+  reset_nmv_counter(&fc->nmvc);
+  reset_nmv_counter(&fc->ndvc);
+  RESET_CDF_COUNTER(fc->intrabc_cdf, 2);
+  RESET_CDF_COUNTER(fc->seg.tree_cdf, MAX_SEGMENTS);
+  RESET_CDF_COUNTER(fc->seg.pred_cdf, 2);
+  RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
+  RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2);
+  RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES);
+  RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES);
+  RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2);
+  RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2);
+  RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES);
+  RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1,
+                           CDF_SIZE(UV_INTRA_MODES));
+  RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES);
+  for (int i = 0; i < PARTITION_CONTEXTS; i++) {
+    if (i < 4) {
+      RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10));
+    } else if (i < 16) {
+      RESET_CDF_COUNTER(fc->partition_cdf[i], 10);
+    } else {
+      RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10));
+    }
   }
-
-  AVERAGE_TILE_CDFS(pvq_context.skip_cdf)
-
-  AVERAGE_TILE_PVQ_EX(pvq_context.pvq.pvq_codeword_ctx.pvq_adapt)
-  AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_codeword_ctx.pvq_k1_cdf)
-  AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_codeword_ctx.pvq_split_cdf)
-
-  for (j = 0; j < 3; j++) {
-    AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_param_model[j].cdf)
+  RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS);
+  RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES);
+  RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1);
+  RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH,
+                           CDF_SIZE(MAX_TX_DEPTH + 1));
+  RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1);
+  RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1);
+  RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1);
+  RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1);
+  RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1);
+  for (int i = 0; i < FRAME_LF_COUNT; i++) {
+    RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1);
   }
-
-  AVERAGE_TILE_PVQ_EX(pvq_context.pvq.pvq_ext)
-  AVERAGE_TILE_PVQ_EX(pvq_context.pvq.pvq_exg)
-  AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_gaintheta_cdf)
-  AVERAGE_TILE_CDFS(pvq_context.pvq.pvq_skip_dir_cdf)
+  RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES));
+  RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES));
+  RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES));
+  RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES));
+  RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES));
+  RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS);
+  RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE);
 }
-#endif  // CONFIG_PVQ
diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h
index 679aae837..ef944c5a0 100644
--- a/third_party/aom/av1/common/entropy.h
+++ b/third_party/aom/av1/common/entropy.h
@@ -12,7 +12,8 @@
 #ifndef AV1_COMMON_ENTROPY_H_
 #define AV1_COMMON_ENTROPY_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/prob.h"
 
@@ -24,82 +25,35 @@
 extern "C" {
 #endif
 
-#define DIFF_UPDATE_PROB 252
-#define GROUP_DIFF_UPDATE_PROB 252
-
-#if CONFIG_Q_ADAPT_PROBS
 #define TOKEN_CDF_Q_CTXS 4
-#endif  // CONFIG_Q_ADAPT_PROBS
-
-// Coefficient token alphabet
-#define ZERO_TOKEN 0        // 0     Extra Bits 0+0
-#define ONE_TOKEN 1         // 1     Extra Bits 0+1
-#define TWO_TOKEN 2         // 2     Extra Bits 0+1
-#define THREE_TOKEN 3       // 3     Extra Bits 0+1
-#define FOUR_TOKEN 4        // 4     Extra Bits 0+1
-#define CATEGORY1_TOKEN 5   // 5-6   Extra Bits 1+1
-#define CATEGORY2_TOKEN 6   // 7-10  Extra Bits 2+1
-#define CATEGORY3_TOKEN 7   // 11-18 Extra Bits 3+1
-#define CATEGORY4_TOKEN 8   // 19-34 Extra Bits 4+1
-#define CATEGORY5_TOKEN 9   // 35-66 Extra Bits 5+1
-#define CATEGORY6_TOKEN 10  // 67+   Extra Bits 14+1
-#define EOB_TOKEN 11        // EOB   Extra Bits 0+0
-#define NO_EOB 0            // Not an end-of-block
-#define EARLY_EOB 1         // End of block before the last position
-#define LAST_EOB 2          // End of block in the last position (implicit)
-#define BLOCK_Z_TOKEN 255   // block zero
-#define HEAD_TOKENS 5
-#define TAIL_TOKENS 9
-#define ONE_TOKEN_EOB 1
-#define ONE_TOKEN_NEOB 2
-#define TWO_TOKEN_PLUS_EOB 3
-#define TWO_TOKEN_PLUS_NEOB 4
-#define ENTROPY_TOKENS 12
-
-#define ENTROPY_NODES 11
 
-#if CONFIG_LV_MAP
 #define TXB_SKIP_CONTEXTS 13
 
-#if CONFIG_CTX1D
-#define EOB_COEF_CONTEXTS_2D 25
-#define EOB_COEF_CONTEXTS_1D 25
-#define EOB_COEF_CONTEXTS \
-  (EOB_COEF_CONTEXTS_2D + EOB_COEF_CONTEXTS_1D + EOB_COEF_CONTEXTS_1D)
-#else  // CONFIG_CTX1D
-#define EOB_COEF_CONTEXTS 25
-#endif  // CONFIG_CTX1D
+#define EOB_COEF_CONTEXTS 9
 
-#if CONFIG_EXT_TX
-#define SIG_COEF_CONTEXTS_2D 16
+#define SIG_COEF_CONTEXTS_2D 26
 #define SIG_COEF_CONTEXTS_1D 16
-#define SIG_COEF_CONTEXTS \
-  (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D + SIG_COEF_CONTEXTS_1D)
-#else  // CONFIG_EXT_TX
-#define SIG_COEF_CONTEXTS_2D 16
-#define SIG_COEF_CONTEXTS 16
-#endif  // CONFIG_EXT_TX
-#define COEFF_BASE_CONTEXTS 42
+#define SIG_COEF_CONTEXTS_EOB 4
+#define SIG_COEF_CONTEXTS (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D)
+
+#define COEFF_BASE_CONTEXTS (SIG_COEF_CONTEXTS)
 #define DC_SIGN_CONTEXTS 3
 
 #define BR_TMP_OFFSET 12
 #define BR_REF_CAT 4
-#define LEVEL_CONTEXTS (BR_TMP_OFFSET * BR_REF_CAT)
+#define LEVEL_CONTEXTS 21
 
 #define NUM_BASE_LEVELS 2
-#define COEFF_BASE_RANGE (16 - NUM_BASE_LEVELS)
-#define BASE_RANGE_SETS 3
+
+#define BR_CDF_SIZE (4)
+#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
 
 #define COEFF_CONTEXT_BITS 6
 #define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
+#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1)
 
 #define BASE_CONTEXT_POSITION_NUM 12
 
-#if CONFIG_CTX1D
-#define EMPTY_LINE_CONTEXTS 5
-#define HV_EOB_CONTEXTS 24
-#endif  // CONFIG_CTX1D
-
 typedef enum TX_CLASS {
   TX_CLASS_2D = 0,
   TX_CLASS_HORIZ = 1,
@@ -107,161 +61,21 @@ typedef enum TX_CLASS {
   TX_CLASSES = 3,
 } TX_CLASS;
 
-#endif
-
-DECLARE_ALIGNED(16, extern const uint8_t, av1_pt_energy_class[ENTROPY_TOKENS]);
-
-#define CAT1_MIN_VAL 5
-#define CAT2_MIN_VAL 7
-#define CAT3_MIN_VAL 11
-#define CAT4_MIN_VAL 19
-#define CAT5_MIN_VAL 35
-#define CAT6_MIN_VAL 67
-
-// Extra bit probabilities.
-DECLARE_ALIGNED(16, extern const uint8_t, av1_cat1_prob[1]);
-DECLARE_ALIGNED(16, extern const uint8_t, av1_cat2_prob[2]);
-DECLARE_ALIGNED(16, extern const uint8_t, av1_cat3_prob[3]);
-DECLARE_ALIGNED(16, extern const uint8_t, av1_cat4_prob[4]);
-DECLARE_ALIGNED(16, extern const uint8_t, av1_cat5_prob[5]);
-DECLARE_ALIGNED(16, extern const uint8_t, av1_cat6_prob[18]);
-#if CONFIG_NEW_MULTISYMBOL
-extern const aom_cdf_prob *av1_cat1_cdf[];
-extern const aom_cdf_prob *av1_cat2_cdf[];
-extern const aom_cdf_prob *av1_cat3_cdf[];
-extern const aom_cdf_prob *av1_cat4_cdf[];
-extern const aom_cdf_prob *av1_cat5_cdf[];
-extern const aom_cdf_prob *av1_cat6_cdf[];
-#endif
-
-#define EOB_MODEL_TOKEN 3
-
-typedef struct {
-#if CONFIG_NEW_MULTISYMBOL
-  const aom_cdf_prob **cdf;
-#else
-  const aom_prob *prob;
-#endif
-  int len;
-  int base_val;
-  const int16_t *cost;
-} av1_extra_bit;
-
-// indexed by token value
-extern const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS];
-
-static INLINE int av1_get_cat6_extrabits_size(TX_SIZE tx_size,
-                                              aom_bit_depth_t bit_depth) {
-  tx_size = txsize_sqr_up_map[tx_size];
-#if CONFIG_TX64X64
-  // TODO(debargha): Does TX_64X64 require an additional extrabit?
-  if (tx_size > TX_32X32) tx_size = TX_32X32;
-#endif
-#if CONFIG_CHROMA_2X2
-  int tx_offset = (tx_size < TX_4X4) ? 0 : (int)(tx_size - TX_4X4);
-#else
-  int tx_offset = (int)(tx_size - TX_4X4);
-#endif
-  int bits = (int)bit_depth + 3 + tx_offset;
-#if CONFIG_NEW_MULTISYMBOL
-  // Round up
-  bits = AOMMIN((int)sizeof(av1_cat6_prob), ((bits + 3) & ~3));
-#endif
-  assert(bits <= (int)sizeof(av1_cat6_prob));
-  return bits;
-}
-
 #define DCT_MAX_VALUE 16384
-#if CONFIG_HIGHBITDEPTH
 #define DCT_MAX_VALUE_HIGH10 65536
 #define DCT_MAX_VALUE_HIGH12 262144
-#endif  // CONFIG_HIGHBITDEPTH
-
-/* Coefficients are predicted via a 3-dimensional probability table. */
 
+/* Coefficients are predicted via a 3-dimensional probability table indexed on
+ * REF_TYPES, COEF_BANDS and COEF_CONTEXTS. */
 #define REF_TYPES 2  // intra=0, inter=1
 
-/* Middle dimension reflects the coefficient position within the transform. */
-#define COEF_BANDS 6
-
-/* Inside dimension is measure of nearby complexity, that reflects the energy
-   of nearby coefficients are nonzero.  For the first coefficient (DC, unless
-   block type is 0), we look at the (already encoded) blocks above and to the
-   left of the current block.  The context index is then the number (0,1,or 2)
-   of these blocks having nonzero coefficients.
-   After decoding a coefficient, the measure is determined by the size of the
-   most recently decoded coefficient.
-   Note that the intuitive meaning of this measure changes as coefficients
-   are decoded, e.g., prior to the first token, a zero means that my neighbors
-   are empty while, after the first token, because of the use of end-of-block,
-   a zero means we just decoded a zero and hence guarantees that a non-zero
-   coefficient will appear later in this block.  However, this shift
-   in meaning is perfectly OK because our context depends also on the
-   coefficient band (and since zigzag positions 0, 1, and 2 are in
-   distinct bands). */
-
-#define COEFF_CONTEXTS 6
-#define COEFF_CONTEXTS0 3  // for band 0
-#define BAND_COEFF_CONTEXTS(band) \
-  ((band) == 0 ? COEFF_CONTEXTS0 : COEFF_CONTEXTS)
-
-#define SUBEXP_PARAM 4   /* Subexponential code parameter */
-#define MODULUS_PARAM 13 /* Modulus parameter */
-
 struct AV1Common;
 struct frame_contexts;
+void av1_reset_cdf_symbol_counters(struct frame_contexts *fc);
 void av1_default_coef_probs(struct AV1Common *cm);
-#if CONFIG_LV_MAP
-void av1_adapt_coef_probs(struct AV1Common *cm);
-#endif  // CONFIG_LV_MAP
-
-// This is the index in the scan order beyond which all coefficients for
-// 8x8 transform and above are in the top band.
-// This macro is currently unused but may be used by certain implementations
-#define MAXBAND_INDEX 21
 
-DECLARE_ALIGNED(16, extern const uint8_t,
-                av1_coefband_trans_8x8plus[MAX_TX_SQUARE]);
-DECLARE_ALIGNED(16, extern const uint8_t, av1_coefband_trans_4x8_8x4[32]);
-DECLARE_ALIGNED(16, extern const uint8_t, av1_coefband_trans_4x4[16]);
-
-DECLARE_ALIGNED(16, extern const uint16_t, band_count_table[TX_SIZES_ALL][8]);
-DECLARE_ALIGNED(16, extern const uint16_t,
-                band_cum_count_table[TX_SIZES_ALL][8]);
-
-static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_4X4: return av1_coefband_trans_4x4;
-    case TX_8X4:
-    case TX_4X8: return av1_coefband_trans_4x8_8x4;
-    default: return av1_coefband_trans_8x8plus;
-  }
-}
-
-// 128 lists of probabilities are stored for the following ONE node probs:
-// 1, 3, 5, 7, ..., 253, 255
-// In between probabilities are interpolated linearly
-
-#define COEFF_PROB_MODELS 255
-
-#define UNCONSTRAINED_NODES 3
-
-#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
-#define TAIL_NODES (MODEL_NODES + 1)
-extern const aom_tree_index av1_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
-extern const aom_prob av1_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
-
-typedef aom_cdf_prob coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
-                                    [CDF_SIZE(ENTROPY_TOKENS)];
-extern const aom_cdf_prob av1_pareto8_token_probs[COEFF_PROB_MODELS]
-                                                 [ENTROPY_TOKENS - 2];
-extern const aom_cdf_prob av1_pareto8_tail_probs[COEFF_PROB_MODELS]
-                                                [ENTROPY_TOKENS - 3];
 struct frame_contexts;
 
-void av1_coef_head_cdfs(struct frame_contexts *fc);
-void av1_coef_pareto_cdfs(struct frame_contexts *fc);
-
 typedef char ENTROPY_CONTEXT;
 
 static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
@@ -273,93 +87,6 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
                                       const ENTROPY_CONTEXT *l) {
   ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
 
-#if CONFIG_CHROMA_2X2
-  switch (tx_size) {
-    case TX_2X2:
-      above_ec = a[0] != 0;
-      left_ec = l[0] != 0;
-      break;
-    case TX_4X4:
-      above_ec = !!*(const uint16_t *)a;
-      left_ec = !!*(const uint16_t *)l;
-      break;
-    case TX_4X8:
-      above_ec = !!*(const uint16_t *)a;
-      left_ec = !!*(const uint32_t *)l;
-      break;
-    case TX_8X4:
-      above_ec = !!*(const uint32_t *)a;
-      left_ec = !!*(const uint16_t *)l;
-      break;
-    case TX_8X8:
-      above_ec = !!*(const uint32_t *)a;
-      left_ec = !!*(const uint32_t *)l;
-      break;
-    case TX_8X16:
-      above_ec = !!*(const uint32_t *)a;
-      left_ec = !!*(const uint64_t *)l;
-      break;
-    case TX_16X8:
-      above_ec = !!*(const uint64_t *)a;
-      left_ec = !!*(const uint32_t *)l;
-      break;
-    case TX_16X16:
-      above_ec = !!*(const uint64_t *)a;
-      left_ec = !!*(const uint64_t *)l;
-      break;
-    case TX_16X32:
-      above_ec = !!*(const uint64_t *)a;
-      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
-      break;
-    case TX_32X16:
-      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
-      left_ec = !!*(const uint64_t *)l;
-      break;
-    case TX_32X32:
-      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
-      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
-      break;
-#if CONFIG_TX64X64
-    case TX_64X64:
-      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8) |
-                    *(const uint64_t *)(a + 16) | *(const uint64_t *)(a + 24));
-      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8) |
-                   *(const uint64_t *)(l + 16) | *(const uint64_t *)(l + 24));
-      break;
-    case TX_32X64:
-      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
-      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8) |
-                   *(const uint64_t *)(l + 16) | *(const uint64_t *)(l + 24));
-      break;
-    case TX_64X32:
-      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8) |
-                    *(const uint64_t *)(a + 16) | *(const uint64_t *)(a + 24));
-      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
-      break;
-#endif  // CONFIG_TX64X64
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    case TX_4X16:
-      above_ec = !!*(const uint16_t *)a;
-      left_ec = !!*(const uint64_t *)l;
-      break;
-    case TX_16X4:
-      above_ec = !!*(const uint64_t *)a;
-      left_ec = !!*(const uint16_t *)l;
-      break;
-    case TX_8X32:
-      above_ec = !!*(const uint32_t *)a;
-      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
-      break;
-    case TX_32X8:
-      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
-      left_ec = !!*(const uint32_t *)l;
-      break;
-#endif
-    default: assert(0 && "Invalid transform size."); break;
-  }
-  return combine_entropy_contexts(above_ec, left_ec);
-#endif  // CONFIG_CHROMA_2X2
-
   switch (tx_size) {
     case TX_4X4:
       above_ec = a[0] != 0;
@@ -401,7 +128,6 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
       above_ec = !!*(const uint64_t *)a;
       left_ec = !!*(const uint64_t *)l;
       break;
-#if CONFIG_TX64X64
     case TX_64X64:
       above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
       left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
@@ -414,8 +140,6 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
       above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
       left_ec = !!*(const uint64_t *)l;
       break;
-#endif  // CONFIG_TX64X64
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
     case TX_4X16:
       above_ec = a[0] != 0;
       left_ec = !!*(const uint32_t *)l;
@@ -432,55 +156,24 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
       above_ec = !!*(const uint64_t *)a;
       left_ec = !!*(const uint16_t *)l;
       break;
-#endif
+    case TX_16X64:
+      above_ec = !!*(const uint32_t *)a;
+      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
+      break;
+    case TX_64X16:
+      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
+      left_ec = !!*(const uint32_t *)l;
+      break;
     default: assert(0 && "Invalid transform size."); break;
   }
   return combine_entropy_contexts(above_ec, left_ec);
 }
 
-#define COEF_COUNT_SAT 24
-#define COEF_MAX_UPDATE_FACTOR 112
-#define COEF_COUNT_SAT_AFTER_KEY 24
-#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
-
-#if CONFIG_ADAPT_SCAN
-#define ADAPT_SCAN_PROB_PRECISION 10
-// 1/8 update rate
-#define ADAPT_SCAN_UPDATE_LOG_RATE 3
-#define ADAPT_SCAN_UPDATE_RATE \
-  (1 << (ADAPT_SCAN_PROB_PRECISION - ADAPT_SCAN_UPDATE_LOG_RATE))
-#endif
-
-static INLINE aom_prob av1_merge_probs(aom_prob pre_prob,
-                                       const unsigned int ct[2],
-                                       unsigned int count_sat,
-                                       unsigned int max_update_factor) {
-  return merge_probs(pre_prob, ct, count_sat, max_update_factor);
-}
-
-static INLINE aom_prob av1_mode_mv_merge_probs(aom_prob pre_prob,
-                                               const unsigned int ct[2]) {
-  return mode_mv_merge_probs(pre_prob, ct);
+static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) {
+  return (TX_SIZE)((txsize_sqr_map[txsize] + txsize_sqr_up_map[txsize] + 1) >>
+                   1);
 }
 
-void av1_average_tile_coef_cdfs(struct frame_contexts *fc,
-                                struct frame_contexts *ec_ctxs[],
-                                aom_cdf_prob *cdf_ptrs[], int num_tiles);
-void av1_average_tile_mv_cdfs(struct frame_contexts *fc,
-                              struct frame_contexts *ec_ctxs[],
-                              aom_cdf_prob *cdf_ptrs[], int num_tiles);
-void av1_average_tile_intra_cdfs(struct frame_contexts *fc,
-                                 struct frame_contexts *ec_ctxs[],
-                                 aom_cdf_prob *cdf_ptrs[], int num_tiles);
-void av1_average_tile_inter_cdfs(struct AV1Common *cm,
-                                 struct frame_contexts *fc,
-                                 struct frame_contexts *ec_ctxs[],
-                                 aom_cdf_prob *cdf_ptrs[], int num_tiles);
-#if CONFIG_PVQ
-void av1_default_pvq_probs(struct AV1Common *cm);
-void av1_average_tile_pvq_cdfs(struct frame_contexts *fc,
-                               struct frame_contexts *ec_ctxs[], int num_tiles);
-#endif  // CONFIG_PVQ
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/entropymode.c b/third_party/aom/av1/common/entropymode.c
index 207f1e245..41dc30ddb 100644
--- a/third_party/aom/av1/common/entropymode.c
+++ b/third_party/aom/av1/common/entropymode.c
@@ -15,2466 +15,880 @@
 #include "av1/common/scan.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/common/seg_common.h"
-#if CONFIG_LV_MAP
 #include "av1/common/txb_common.h"
-#endif
 
-#if CONFIG_LV_MAP
-#include "av1/common/txb_common.h"
-const aom_prob default_txb_skip[TX_SIZES][TXB_SKIP_CONTEXTS] = {
-#if CONFIG_CHROMA_2X2
-  { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 },
-#endif
-  { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 },
-  { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 },
-  { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 },
-  { 252, 71, 126, 184, 178, 218, 251, 49, 133, 221, 27, 92, 197 },
-};
-const aom_prob default_dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS] = {
-  { 125, 102, 147 }, { 119, 101, 135 },
+static const aom_cdf_prob
+    default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE(
+        INTRA_MODES)] = {
+      { { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244,
+                    24189, 28165, 29093, 30466) },
+        { AOM_CDF13(12016, 18066, 19516, 20303, 20719, 21444, 21888, 23032,
+                    24434, 28658, 30172, 31409) },
+        { AOM_CDF13(10052, 10771, 22296, 22788, 23055, 23239, 24133, 25620,
+                    26160, 29336, 29929, 31567) },
+        { AOM_CDF13(14091, 15406, 16442, 18808, 19136, 19546, 19998, 22096,
+                    24746, 29585, 30958, 32462) },
+        { AOM_CDF13(12122, 13265, 15603, 16501, 18609, 20033, 22391, 25583,
+                    26437, 30261, 31073, 32475) } },
+      { { AOM_CDF13(10023, 19585, 20848, 21440, 21832, 22760, 23089, 24023,
+                    25381, 29014, 30482, 31436) },
+        { AOM_CDF13(5983, 24099, 24560, 24886, 25066, 25795, 25913, 26423,
+                    27610, 29905, 31276, 31794) },
+        { AOM_CDF13(7444, 12781, 20177, 20728, 21077, 21607, 22170, 23405,
+                    24469, 27915, 29090, 30492) },
+        { AOM_CDF13(8537, 14689, 15432, 17087, 17408, 18172, 18408, 19825,
+                    24649, 29153, 31096, 32210) },
+        { AOM_CDF13(7543, 14231, 15496, 16195, 17905, 20717, 21984, 24516,
+                    26001, 29675, 30981, 31994) } },
+      { { AOM_CDF13(12613, 13591, 21383, 22004, 22312, 22577, 23401, 25055,
+                    25729, 29538, 30305, 32077) },
+        { AOM_CDF13(9687, 13470, 18506, 19230, 19604, 20147, 20695, 22062,
+                    23219, 27743, 29211, 30907) },
+        { AOM_CDF13(6183, 6505, 26024, 26252, 26366, 26434, 27082, 28354, 28555,
+                    30467, 30794, 32086) },
+        { AOM_CDF13(10718, 11734, 14954, 17224, 17565, 17924, 18561, 21523,
+                    23878, 28975, 30287, 32252) },
+        { AOM_CDF13(9194, 9858, 16501, 17263, 18424, 19171, 21563, 25961, 26561,
+                    30072, 30737, 32463) } },
+      { { AOM_CDF13(12602, 14399, 15488, 18381, 18778, 19315, 19724, 21419,
+                    25060, 29696, 30917, 32409) },
+        { AOM_CDF13(8203, 13821, 14524, 17105, 17439, 18131, 18404, 19468,
+                    25225, 29485, 31158, 32342) },
+        { AOM_CDF13(8451, 9731, 15004, 17643, 18012, 18425, 19070, 21538, 24605,
+                    29118, 30078, 32018) },
+        { AOM_CDF13(7714, 9048, 9516, 16667, 16817, 16994, 17153, 18767, 26743,
+                    30389, 31536, 32528) },
+        { AOM_CDF13(8843, 10280, 11496, 15317, 16652, 17943, 19108, 22718,
+                    25769, 29953, 30983, 32485) } },
+      { { AOM_CDF13(12578, 13671, 15979, 16834, 19075, 20913, 22989, 25449,
+                    26219, 30214, 31150, 32477) },
+        { AOM_CDF13(9563, 13626, 15080, 15892, 17756, 20863, 22207, 24236,
+                    25380, 29653, 31143, 32277) },
+        { AOM_CDF13(8356, 8901, 17616, 18256, 19350, 20106, 22598, 25947, 26466,
+                    29900, 30523, 32261) },
+        { AOM_CDF13(10835, 11815, 13124, 16042, 17018, 18039, 18947, 22753,
+                    24615, 29489, 30883, 32482) },
+        { AOM_CDF13(7618, 8288, 9859, 10509, 15386, 18657, 22903, 28776, 29180,
+                    31355, 31802, 32593) } }
+    };
+
+static const aom_cdf_prob default_angle_delta_cdf[DIRECTIONAL_MODES][CDF_SIZE(
+    2 * MAX_ANGLE_DELTA + 1)] = {
+  { AOM_CDF7(2180, 5032, 7567, 22776, 26989, 30217) },
+  { AOM_CDF7(2301, 5608, 8801, 23487, 26974, 30330) },
+  { AOM_CDF7(3780, 11018, 13699, 19354, 23083, 31286) },
+  { AOM_CDF7(4581, 11226, 15147, 17138, 21834, 28397) },
+  { AOM_CDF7(1737, 10927, 14509, 19588, 22745, 28823) },
+  { AOM_CDF7(2664, 10176, 12485, 17650, 21600, 30495) },
+  { AOM_CDF7(2240, 11096, 15453, 20341, 22561, 28917) },
+  { AOM_CDF7(3605, 10428, 12459, 17676, 21244, 30655) }
 };
 
-const aom_prob default_coeff_base
-    [TX_SIZES][PLANE_TYPES][NUM_BASE_LEVELS][COEFF_BASE_CONTEXTS] = {
-#if CONFIG_CHROMA_2X2
-      { // TX_2X2
-        {
-            { 73,  128, 131, 204, 165, 226, 169, 236, 18,  128, 51,
-              153, 97,  179, 123, 201, 145, 226, 20,  128, 59,  153,
-              107, 181, 129, 201, 142, 226, 3,   128, 19,  99,  46,
-              135, 92,  166, 129, 190, 157, 217, 128, 128 },
-
-            { 128, 128, 178, 218, 192, 236, 186, 243, 55,  128, 110,
-              183, 151, 205, 168, 221, 180, 238, 65,  128, 116, 178,
-              157, 206, 172, 222, 183, 238, 24,  128, 65,  127, 104,
-              164, 154, 195, 187, 216, 205, 230, 128, 128 },
-        },
-        {
-            { 73,  128, 131, 204, 165, 226, 169, 236, 18,  128, 51,
-              153, 97,  179, 123, 201, 145, 226, 20,  128, 59,  153,
-              107, 181, 129, 201, 142, 226, 3,   128, 19,  99,  46,
-              135, 92,  166, 129, 190, 157, 217, 128, 128 },
+static const aom_cdf_prob default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
+    INTRA_MODES)] = { { AOM_CDF13(22801, 23489, 24293, 24756, 25601, 26123,
+                                  26606, 27418, 27945, 29228, 29685, 30349) },
+                      { AOM_CDF13(18673, 19845, 22631, 23318, 23950, 24649,
+                                  25527, 27364, 28152, 29701, 29984, 30852) },
+                      { AOM_CDF13(19770, 20979, 23396, 23939, 24241, 24654,
+                                  25136, 27073, 27830, 29360, 29730, 30659) },
+                      { AOM_CDF13(20155, 21301, 22838, 23178, 23261, 23533,
+                                  23703, 24804, 25352, 26575, 27016, 28049) } };
 
-            { 128, 128, 178, 218, 192, 236, 186, 243, 55,  128, 110,
-              183, 151, 205, 168, 221, 180, 238, 65,  128, 116, 178,
-              157, 206, 172, 222, 183, 238, 24,  128, 65,  127, 104,
-              164, 154, 195, 187, 216, 205, 230, 128, 128 },
-        } },
-#endif
-      { // TX_4X4
-        {
-            // PLANE_Y
-            { 73,  128, 131, 204, 165, 226, 169, 236, 18,  128, 51,
-              153, 97,  179, 123, 201, 145, 226, 20,  128, 59,  153,
-              107, 181, 129, 201, 142, 226, 3,   128, 19,  99,  46,
-              135, 92,  166, 129, 190, 157, 217, 128, 128 },
-
-            { 128, 128, 178, 218, 192, 236, 186, 243, 55,  128, 110,
-              183, 151, 205, 168, 221, 180, 238, 65,  128, 116, 178,
-              157, 206, 172, 222, 183, 238, 24,  128, 65,  127, 104,
-              164, 154, 195, 187, 216, 205, 230, 128, 128 },
-        },
-        {
-            // PLANE_UV
-            { 47,  128, 100, 176, 140, 207, 150, 223, 11,  128, 35,
-              133, 79,  165, 115, 186, 129, 210, 8,   128, 30,  114,
-              80,  159, 116, 187, 146, 214, 2,   128, 9,   59,  28,
-              86,  71,  131, 117, 165, 149, 188, 128, 128 },
+static const aom_cdf_prob
+    default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES][CDF_SIZE(
+        UV_INTRA_MODES)] = {
+      { { AOM_CDF13(22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923,
+                    28244, 30059, 30941, 31961) },
+        { AOM_CDF13(9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824,
+                    28359, 29505, 29800, 31796) },
+        { AOM_CDF13(9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854,
+                    30764, 31777, 32029) },
+        { AOM_CDF13(13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148,
+                    28577, 30612, 31355, 32493) },
+        { AOM_CDF13(9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243,
+                    31101, 31744, 32363) },
+        { AOM_CDF13(11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458,
+                    29711, 31161, 31441, 32550) },
+        { AOM_CDF13(14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200,
+                    30245, 31837, 32342, 32667) },
+        { AOM_CDF13(12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128,
+                    29267, 30643, 31961, 32461) },
+        { AOM_CDF13(12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273,
+                    28443, 30388, 30767, 32416) },
+        { AOM_CDF13(19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719,
+                    23174, 28861, 30379, 32175) },
+        { AOM_CDF13(18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119,
+                    23527, 27053, 31397, 32148) },
+        { AOM_CDF13(17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907,
+                    22482, 25896, 26541, 31819) },
+        { AOM_CDF13(12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166,
+                    15255, 15753, 16039, 16606) } },
+      { { AOM_CDF14(10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656,
+                    15986, 20086, 20995, 22455, 24212) },
+        { AOM_CDF14(4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451,
+                    22099, 24228, 24693, 27032, 29472) },
+        { AOM_CDF14(5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774,
+                    23138, 24256, 24703, 26679) },
+        { AOM_CDF14(6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371,
+                    21520, 22206, 23389, 24182) },
+        { AOM_CDF14(4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411,
+                    24911, 25380, 26027, 26376) },
+        { AOM_CDF14(5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981,
+                    24780, 25386, 26517, 27176) },
+        { AOM_CDF14(4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803,
+                    23188, 23763, 24455, 24940) },
+        { AOM_CDF14(6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059,
+                    22336, 23204, 23964, 24793) },
+        { AOM_CDF14(5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898,
+                    22494, 23139, 24764, 25989) },
+        { AOM_CDF14(10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004,
+                    15534, 20714, 21789, 23443, 24861) },
+        { AOM_CDF14(10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235,
+                    15902, 20102, 22696, 23774, 25838) },
+        { AOM_CDF14(10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163,
+                    15636, 19676, 20474, 23519, 25208) },
+        { AOM_CDF14(3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248,
+                    9875, 10521, 29048) } }
+    };
+
+static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(
+    EXT_PARTITION_TYPES)] = {
+  { AOM_CDF4(19132, 25510, 30392) },
+  { AOM_CDF4(13928, 19855, 28540) },
+  { AOM_CDF4(12522, 23679, 28629) },
+  { AOM_CDF4(9896, 18783, 25853) },
+  { AOM_CDF10(15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902) },
+  { AOM_CDF10(7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834) },
+  { AOM_CDF10(5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117) },
+  { AOM_CDF10(2662, 6362, 8614, 20860, 23053, 24778, 26436, 27829, 31171) },
+  { AOM_CDF10(18462, 20920, 23124, 27647, 28227, 29049, 29519, 30178, 31544) },
+  { AOM_CDF10(7689, 9060, 12056, 24992, 25660, 26182, 26951, 28041, 29052) },
+  { AOM_CDF10(6015, 9009, 10062, 24544, 25409, 26545, 27071, 27526, 32047) },
+  { AOM_CDF10(1394, 2208, 2796, 28614, 29061, 29466, 29840, 30185, 31899) },
+  { AOM_CDF10(20137, 21547, 23078, 29566, 29837, 30261, 30524, 30892, 31724) },
+  { AOM_CDF10(6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104) },
+  { AOM_CDF10(5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238) },
+  { AOM_CDF10(870, 1212, 1487, 31198, 31394, 31574, 31743, 31881, 32332) },
+  { AOM_CDF8(27899, 28219, 28529, 32484, 32539, 32619, 32639) },
+  { AOM_CDF8(6607, 6990, 8268, 32060, 32219, 32338, 32371) },
+  { AOM_CDF8(5429, 6676, 7122, 32027, 32227, 32531, 32582) },
+  { AOM_CDF8(711, 966, 1172, 32448, 32538, 32617, 32664) },
+};
 
-            { 83,  128, 152, 205, 168, 227, 192, 238, 42,  128, 92,
-              169, 138, 193, 165, 209, 128, 206, 36,  128, 86,  159,
-              141, 198, 181, 213, 102, 223, 18,  128, 50,  132, 90,
-              144, 141, 169, 180, 191, 128, 217, 128, 128 },
-        } },
+static const aom_cdf_prob default_intra_ext_tx_cdf
+    [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = {
       {
-          // TX_8X8
           {
-              // PLANE_Y
-              { 82,  128, 143, 203, 177, 225, 186, 237, 7,   128, 37,
-                109, 78,  151, 110, 182, 139, 213, 25,  128, 51,  115,
-                86,  146, 111, 175, 125, 205, 3,   128, 12,  55,  32,
-                78,  63,  111, 96,  148, 123, 185, 146, 206 },
-
-              { 136, 128, 182, 220, 201, 236, 205, 243, 46,  128, 101,
-                164, 147, 194, 170, 218, 177, 234, 62,  128, 104, 146,
-                143, 183, 165, 207, 183, 228, 30,  128, 60,  95,  95,
-                128, 135, 163, 166, 196, 175, 219, 192, 231 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
           },
           {
-              // PLANE_UV
-              { 47,  128, 112, 189, 164, 202, 163, 218, 8,   128, 32,
-                110, 68,  151, 102, 179, 134, 195, 5,   128, 22,  76,
-                54,  103, 80,  146, 101, 182, 1,   128, 5,   39,  17,
-                53,  46,  93,  79,  127, 112, 161, 64,  195 },
-
-              { 90,  128, 156, 210, 183, 225, 128, 236, 39,  128, 98,
-                164, 146, 201, 209, 219, 171, 208, 32,  128, 68,  123,
-                119, 169, 154, 184, 128, 213, 15,  128, 38,  111, 83,
-                112, 120, 163, 180, 170, 154, 213, 128, 205 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+          },
+          {
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+          },
+          {
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
+              { 0 },
           },
       },
-
       {
-          // TX_16X16
           {
-              // PLANE_Y
-              { 96,  128, 169, 218, 208, 233, 187, 244, 10,  128, 34,
-                101, 82,  153, 113, 184, 137, 212, 6,   128, 34,  104,
-                81,  145, 109, 176, 147, 202, 1,   128, 3,   43,  15,
-                53,  43,  89,  79,  129, 108, 168, 110, 194 },
-
-              { 156, 128, 206, 232, 218, 240, 128, 251, 39,  128, 108,
-                161, 156, 202, 187, 216, 179, 234, 40,  128, 103, 152,
-                144, 185, 159, 208, 205, 227, 14,  128, 39,  84,  76,
-                110, 121, 151, 157, 187, 201, 206, 64,  216 },
+              { AOM_CDF7(1535, 8035, 9461, 12751, 23467, 27825) },
+              { AOM_CDF7(564, 3335, 9709, 10870, 18143, 28094) },
+              { AOM_CDF7(672, 3247, 3676, 11982, 19415, 23127) },
+              { AOM_CDF7(5279, 13885, 15487, 18044, 23527, 30252) },
+              { AOM_CDF7(4423, 6074, 7985, 10416, 25693, 29298) },
+              { AOM_CDF7(1486, 4241, 9460, 10662, 16456, 27694) },
+              { AOM_CDF7(439, 2838, 3522, 6737, 18058, 23754) },
+              { AOM_CDF7(1190, 4233, 4855, 11670, 20281, 24377) },
+              { AOM_CDF7(1045, 4312, 8647, 10159, 18644, 29335) },
+              { AOM_CDF7(202, 3734, 4747, 7298, 17127, 24016) },
+              { AOM_CDF7(447, 4312, 6819, 8884, 16010, 23858) },
+              { AOM_CDF7(277, 4369, 5255, 8905, 16465, 22271) },
+              { AOM_CDF7(3409, 5436, 10599, 15599, 19687, 24040) },
           },
           {
-              // PLANE_UV
-              { 42, 128, 139, 211, 180, 230, 199, 238, 3,   128, 32,
-                96, 69,  145, 102, 186, 117, 212, 4,   128, 25,  72,
-                55, 111, 81,  159, 116, 198, 1,   128, 4,   22,  16,
-                34, 35,  68,  63,  116, 89,  165, 102, 199 },
-
-              { 135, 128, 193, 227, 182, 239, 128, 246, 42,  128, 115,
-                156, 146, 203, 188, 216, 128, 229, 32,  128, 82,  127,
-                120, 178, 165, 203, 213, 229, 11,  128, 32,  73,  79,
-                111, 129, 158, 162, 187, 156, 209, 85,  222 },
+              { AOM_CDF7(1870, 13742, 14530, 16498, 23770, 27698) },
+              { AOM_CDF7(326, 8796, 14632, 15079, 19272, 27486) },
+              { AOM_CDF7(484, 7576, 7712, 14443, 19159, 22591) },
+              { AOM_CDF7(1126, 15340, 15895, 17023, 20896, 30279) },
+              { AOM_CDF7(655, 4854, 5249, 5913, 22099, 27138) },
+              { AOM_CDF7(1299, 6458, 8885, 9290, 14851, 25497) },
+              { AOM_CDF7(311, 5295, 5552, 6885, 16107, 22672) },
+              { AOM_CDF7(883, 8059, 8270, 11258, 17289, 21549) },
+              { AOM_CDF7(741, 7580, 9318, 10345, 16688, 29046) },
+              { AOM_CDF7(110, 7406, 7915, 9195, 16041, 23329) },
+              { AOM_CDF7(363, 7974, 9357, 10673, 15629, 24474) },
+              { AOM_CDF7(153, 7647, 8112, 9936, 15307, 19996) },
+              { AOM_CDF7(3511, 6332, 11165, 15335, 19323, 23594) },
+          },
+          {
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+          },
+          {
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+              { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
           },
       },
-
       {
-          // TX_32X32
           {
-              // PLANE_Y
-              { 97,  128, 163, 232, 191, 246, 219, 252, 3,   128, 41,
-                108, 91,  147, 104, 183, 118, 225, 6,   128, 45,  91,
-                83,  125, 92,  160, 99,  215, 1,   128, 11,  36,  28,
-                46,  43,  59,  57,  86,  73,  145, 91,  210 },
-
-              { 127, 128, 201, 239, 247, 248, 128, 254, 40,  128, 103,
-                152, 158, 199, 186, 225, 181, 242, 38,  128, 92,  112,
-                146, 189, 162, 217, 112, 239, 17,  128, 30,  47,  63,
-                89,  113, 146, 147, 187, 168, 217, 150, 233 },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
           },
           {
-              // PLANE_UV
-              { 65,  128, 155, 223, 166, 235, 154, 244, 15,  128, 57,
-                154, 110, 199, 159, 224, 149, 239, 9,   128, 57,  140,
-                97,  185, 148, 218, 176, 236, 1,   128, 3,   43,  19,
-                42,  64,  98,  117, 167, 154, 199, 128, 158 },
-
-              { 130, 128, 189, 231, 171, 247, 128, 246, 63,  128, 132,
-                222, 186, 224, 199, 244, 128, 247, 55,  128, 113, 211,
-                164, 230, 225, 243, 128, 239, 7,   128, 31,  102, 106,
-                138, 147, 183, 171, 223, 171, 224, 128, 128 },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+          },
+          {
+              { AOM_CDF5(1127, 12814, 22772, 27483) },
+              { AOM_CDF5(145, 6761, 11980, 26667) },
+              { AOM_CDF5(362, 5887, 11678, 16725) },
+              { AOM_CDF5(385, 15213, 18587, 30693) },
+              { AOM_CDF5(25, 2914, 23134, 27903) },
+              { AOM_CDF5(60, 4470, 11749, 23991) },
+              { AOM_CDF5(37, 3332, 14511, 21448) },
+              { AOM_CDF5(157, 6320, 13036, 17439) },
+              { AOM_CDF5(119, 6719, 12906, 29396) },
+              { AOM_CDF5(47, 5537, 12576, 21499) },
+              { AOM_CDF5(269, 6076, 11258, 23115) },
+              { AOM_CDF5(83, 5615, 12001, 17228) },
+              { AOM_CDF5(1968, 5556, 12023, 18547) },
+          },
+          {
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
+              { AOM_CDF5(6554, 13107, 19661, 26214) },
           },
       },
     };
 
-const aom_prob default_nz_map[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] = {
-#if CONFIG_EXT_TX
-#if CONFIG_CHROMA_2X2
-  { { 56,  137, 82,  136, 83,  187, 124, 65,
-      215, 118, 155, 97,  160, 111, 71,  55,
-
-      142, 156, 91,  226, 107, 231, 146, 65,
-      105, 91,  232, 97,  185, 121, 90,  74,
-
-      153, 195, 123, 154, 106, 196, 143, 67,
-      232, 125, 121, 105, 159, 113, 88,  66 },
-    { 50,  124, 89,  135, 116, 189, 150, 81,
-      202, 126, 130, 107, 149, 110, 85,  67,
-
-      139, 174, 112, 200, 94,  206, 146, 71,
-      163, 164, 212, 99,  177, 143, 125, 85,
-
-      151, 181, 126, 168, 135, 186, 143, 94,
-      207, 129, 142, 135, 145, 112, 98,  81 } },
-#endif
-  { { 56,  137, 82,  136, 83,  187, 124, 65,
-      215, 118, 155, 97,  160, 111, 71,  55,
-
-      142, 156, 91,  226, 107, 231, 146, 65,
-      105, 91,  232, 97,  185, 121, 90,  74,
-
-      153, 195, 123, 154, 106, 196, 143, 67,
-      232, 125, 121, 105, 159, 113, 88,  66 },
-    { 50,  124, 89,  135, 116, 189, 150, 81,
-      202, 126, 130, 107, 149, 110, 85,  67,
-
-      139, 174, 112, 200, 94,  206, 146, 71,
-      163, 164, 212, 99,  177, 143, 125, 85,
-
-      151, 181, 126, 168, 135, 186, 143, 94,
-      207, 129, 142, 135, 145, 112, 98,  81 } },
-  { { 57,  156, 91,  162, 99,  212, 149, 81,
-      223, 128, 182, 121, 216, 163, 119, 94,
-
-      139, 183, 100, 206, 98,  242, 163, 79,
-      200, 127, 234, 112, 230, 169, 115, 90,
-
-      156, 190, 130, 172, 117, 209, 163, 80,
-      217, 145, 182, 135, 204, 163, 120, 88 },
-    { 48,  133, 102, 143, 119, 190, 170, 109,
-      197, 127, 176, 137, 214, 168, 130, 119,
-
-      139, 185, 129, 210, 84,  237, 177, 75,
-      182, 165, 216, 121, 206, 177, 147, 102,
-
-      159, 192, 153, 182, 139, 203, 160, 125,
-      193, 161, 176, 142, 173, 145, 131, 114 } },
-  { { 33,  148, 81,  149, 84,  219, 152, 76,
-      229, 127, 205, 120, 234, 170, 123, 88,
-
-      134, 197, 101, 213, 91,  244, 169, 85,
-      220, 141, 234, 123, 242, 183, 130, 94,
-
-      141, 184, 121, 173, 98,  213, 156, 85,
-      204, 156, 197, 119, 212, 174, 127, 92 },
-    { 14,  75,  45,  98,  83,  197, 150, 90,
-      235, 124, 242, 155, 246, 187, 143, 103,
-
-      78,  185, 111, 255, 116, 255, 224, 171,
-      185, 157, 255, 85,  219, 122, 128, 128,
-
-      117, 187, 102, 181, 132, 233, 197, 93,
-      207, 135, 191, 107, 222, 175, 130, 47 } },
-  {
-      { 14,  79,  44,  86,  59,  178, 124, 63,
-        244, 106, 233, 117, 252, 185, 132, 92,
-
-        85,  225, 47,  236, 103, 255, 190, 116,
-        235, 114, 247, 123, 250, 174, 122, 110,
-
-        109, 197, 78,  177, 76,  242, 148, 68,
-        236, 123, 231, 103, 247, 171, 122, 91 },
-      { 11,  40,  27,  92,  78,  183, 171, 70,
-        216, 74,  251, 146, 252, 213, 171, 148,
-
-        85,  225, 47,  236, 103, 255, 190, 116,
-        235, 114, 247, 123, 250, 174, 122, 110,
-
-        109, 197, 78,  177, 76,  242, 148, 68,
-        236, 123, 231, 103, 247, 171, 122, 91 },
-  },
-#else  // CONFIG_EXT_TX
-#if CONFIG_CHROMA_2X2
-  {
-      {
-          34, 103, 61, 106, 62, 160, 112, 54, 173, 121, 157, 92, 157, 129, 94,
-          65,
-      },
-
-      {
-          52, 124, 84, 136, 107, 197, 161, 82, 183, 151, 153, 140, 152, 134,
-          109, 81,
-      },
-  },
-#endif
-  {
-      {
-          34, 103, 61, 106, 62, 160, 112, 54, 173, 121, 157, 92, 157, 129, 94,
-          65,
-      },
-
-      {
-          52, 124, 84, 136, 107, 197, 161, 82, 183, 151, 153, 140, 152, 134,
-          109, 81,
-      },
-  },
-  {
-      {
-          34, 127, 74, 124, 74, 204, 153, 76, 226, 162, 207, 126, 227, 192, 149,
-          108,
-      },
-
-      {
-          43, 136, 115, 158, 130, 212, 187, 112, 231, 180, 202, 164, 236, 204,
-          168, 139,
-      },
-  },
-  {
+static const aom_cdf_prob
+    default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE(
+        TX_TYPES)] = {
       {
-          25, 117, 70, 120, 77, 215, 171, 102, 234, 156, 235, 155, 247, 220,
-          176, 127,
+          { 0 },
+          { 0 },
+          { 0 },
+          { 0 },
       },
-
       {
-          24, 88, 49, 100, 62, 202, 148, 62, 237, 178, 233, 168, 244, 198, 162,
-          127,
+          { AOM_CDF16(4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504,
+                      22848, 23934, 25474, 27727, 28915, 30631) },
+          { AOM_CDF16(1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674,
+                      20408, 22517, 25010, 27116, 28856, 30749) },
+          { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                      20480, 22528, 24576, 26624, 28672, 30720) },
+          { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                      20480, 22528, 24576, 26624, 28672, 30720) },
       },
-  },
-  {
       {
-          11, 54, 17, 69, 26, 128, 125, 56, 232, 130, 237, 121, 250, 168, 134,
-          114,
+          { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+                      24576, 27307, 30037) },
+          { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+                      24576, 27307, 30037) },
+          { AOM_CDF12(770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595,
+                      28526, 30529) },
+          { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+                      24576, 27307, 30037) },
       },
-
       {
-          21, 52, 32, 95, 64, 171, 152, 70, 247, 159, 252, 177, 252, 221, 192,
-          143,
+          { AOM_CDF2(16384) },
+          { AOM_CDF2(4167) },
+          { AOM_CDF2(1998) },
+          { AOM_CDF2(748) },
       },
-  },
-#endif  // CONFIG_EXT_TX
-};
-
-#if CONFIG_CTX1D
-const aom_prob default_eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] = {
-#if CONFIG_CHROMA_2X2
-  { { 220, 225, 220, 216, 233, 225, 189, 178, 222, 199, 164, 112, 207,
-      171, 115, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      147, 125, 104, 36,  117, 107, 26,  128, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      156, 124, 128, 128, 146, 68,  128, 128, 131, 17,  128, 128, 64,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-
-    { 146, 150, 142, 144, 178, 167, 131, 116, 150, 123, 107, 63,  119,
-      89,  74,  128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      117, 127, 105, 69,  53,  56,  30,  128, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      128, 86,  128, 128, 140, 72,  128, 128, 120, 44,  128, 128, 80,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } },
-  { { 237, 242, 242, 219, 192, 246, 246, 243, 233, 184, 155, 234, 217,
-      188, 152, 195, 167, 114, 89,  128, 128, 128, 128, 128, 128,
-
-      180, 173, 154, 133, 112, 147, 145, 142, 102, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      198, 173, 130, 200, 128, 208, 182, 160, 106, 171, 128, 144, 128,
-      128, 128, 124, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-
-    { 140, 170, 162, 111, 94,  182, 195, 165, 153, 110, 81,  178, 169,
-      158, 83,  133, 85,  85,  38,  128, 128, 128, 128, 128, 128,
-
-      112, 127, 107, 87,  31,  57,  49,  128, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      160, 143, 99,  126, 128, 164, 133, 126, 59,  71,  128, 138, 128,
-      128, 128, 99,  128, 128, 128, 128, 128, 128, 128, 128, 128 } },
-#endif
-  { { 220, 225, 220, 216, 233, 225, 189, 178, 222, 199, 164, 112, 207,
-      171, 115, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      147, 125, 104, 36,  117, 107, 26,  128, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      156, 124, 128, 128, 146, 68,  128, 128, 131, 17,  128, 128, 64,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-
-    { 146, 150, 142, 144, 178, 167, 131, 116, 150, 123, 107, 63,  119,
-      89,  74,  128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      117, 127, 105, 69,  53,  56,  30,  128, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      128, 86,  128, 128, 140, 72,  128, 128, 120, 44,  128, 128, 80,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } },
-  { { 237, 242, 242, 219, 192, 246, 246, 243, 233, 184, 155, 234, 217,
-      188, 152, 195, 167, 114, 89,  128, 128, 128, 128, 128, 128,
-
-      180, 173, 154, 133, 112, 147, 145, 142, 102, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      198, 173, 130, 200, 128, 208, 182, 160, 106, 171, 128, 144, 128,
-      128, 128, 124, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-
-    { 140, 170, 162, 111, 94,  182, 195, 165, 153, 110, 81,  178, 169,
-      158, 83,  133, 85,  85,  38,  128, 128, 128, 128, 128, 128,
-
-      112, 127, 107, 87,  31,  57,  49,  128, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      160, 143, 99,  126, 128, 164, 133, 126, 59,  71,  128, 138, 128,
-      128, 128, 99,  128, 128, 128, 128, 128, 128, 128, 128, 128 } },
-  { { 229, 241, 243, 245, 247, 247, 251, 248, 235, 210, 247, 235, 208,
-      166, 245, 247, 244, 182, 236, 229, 180, 136, 128, 128, 128,
-
-      191, 197, 96,  70,  199, 128, 128, 191, 174, 117, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      211, 183, 215, 188, 138, 209, 136, 128, 170, 128, 191, 128, 161,
-      128, 182, 128, 128, 128, 164, 128, 128, 128, 128, 128, 128 },
-
-    { 106, 153, 182, 191, 186, 202, 211, 203, 166, 147, 205, 205, 195,
-      128, 206, 212, 182, 109, 192, 154, 139, 79,  128, 128, 128,
-
-      112, 133, 128, 255, 128, 128, 128, 130, 154, 98,  128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      144, 185, 169, 199, 85,  183, 128, 128, 64,  128, 146, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } },
-  { { 169, 203, 224, 222, 220, 228, 229, 223, 234, 247, 242, 230, 222,
-      238, 246, 234, 196, 245, 249, 245, 192, 240, 235, 199, 161,
-
-      176, 148, 158, 77,  178, 128, 128, 158, 128, 128, 196, 208, 155,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-
-      232, 187, 191, 221, 116, 217, 154, 128, 203, 128, 128, 192, 128,
-      201, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-
-    { 133, 182, 215, 204, 176, 220, 182, 168, 187, 197, 181, 145, 75,
-      164, 136, 51,  57,  156, 128, 128, 128, 85,  128, 128, 128,
-
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+    };
 
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } },
-};
-#else  // CONFIG_CTX1D
-const aom_prob default_eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] = {
-#if CONFIG_CHROMA_2X2
-  {
-      { 229, 236, 231, 222, 239, 236, 214, 201, 236, 226, 195, 134, 228,
-        210, 150, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 182, 186, 172, 176, 207, 213, 152, 122, 187, 171, 131, 65, 170,
-        134, 101, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-  },
-#endif
-  {
-      { 229, 236, 231, 222, 239, 236, 214, 201, 236, 226, 195, 134, 228,
-        210, 150, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 182, 186, 172, 176, 207, 213, 152, 122, 187, 171, 131, 65, 170,
-        134, 101, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-  },
-  {
-      { 225, 234, 244, 236, 205, 242, 246, 247, 246, 234, 191, 242, 237,
-        215, 142, 224, 206, 142, 73,  128, 128, 128, 128, 128, 128 },
-      { 154, 171, 187, 175, 62,  199, 202, 206, 215, 200, 111, 197, 199,
-        174, 100, 135, 105, 104, 45,  128, 128, 128, 128, 128, 128 },
-  },
-  {
-      { 180, 213, 216, 229, 233, 232, 240, 235, 220, 178, 239, 238, 225,
-        187, 229, 214, 226, 200, 183, 141, 158, 179, 128, 128, 128 },
-      { 190, 225, 234, 248, 249, 248, 253, 251, 232, 110, 254, 252, 236,
-        57,  253, 248, 232, 85,  244, 189, 112, 64,  128, 128, 128 },
-  },
-  {
-      { 248, 224, 246, 244, 239, 245, 251, 246, 251, 255, 255, 255, 249,
-        255, 255, 255, 229, 255, 255, 255, 228, 255, 255, 247, 137 },
-      { 204, 207, 233, 215, 193, 228, 239, 221, 227, 250, 236, 207, 135,
-        236, 186, 182, 57,  209, 140, 128, 85,  184, 110, 128, 128 },
-  },
+static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = {
+  AOM_CDF8(1418, 2123, 13340, 18405, 26972, 28343, 32294)
 };
-#endif  // CONFIG_CTX1D
 
-const aom_prob default_coeff_lps[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] = {
-#if CONFIG_CHROMA_2X2
-  { { 96,  128, 86,  122, 128, 84,  125, 128, 88,  99,  126, 128,
-      135, 159, 99,  130, 134, 100, 128, 144, 70,  97,  128, 139,
-      157, 168, 127, 148, 162, 121, 149, 157, 118, 127, 143, 157,
-      178, 186, 168, 171, 183, 165, 169, 180, 180, 169, 166, 177 },
-    { 81,  128, 72,  95,  128, 64,  98,  128, 42,  66,  101, 128,
-      129, 163, 97,  122, 130, 91,  119, 141, 70,  94,  118, 166,
-      157, 168, 117, 143, 151, 111, 144, 154, 76,  113, 128, 158,
-      177, 185, 165, 167, 179, 155, 166, 179, 110, 137, 115, 165 } },
-#endif
-  { { 96,  128, 86,  122, 128, 84,  125, 128, 88,  99,  126, 128,
-      135, 159, 99,  130, 134, 100, 128, 144, 70,  97,  128, 139,
-      157, 168, 127, 148, 162, 121, 149, 157, 118, 127, 143, 157,
-      178, 186, 168, 171, 183, 165, 169, 180, 180, 169, 166, 177 },
-    { 81,  128, 72,  95,  128, 64,  98,  128, 42,  66,  101, 128,
-      129, 163, 97,  122, 130, 91,  119, 141, 70,  94,  118, 166,
-      157, 168, 117, 143, 151, 111, 144, 154, 76,  113, 128, 158,
-      177, 185, 165, 167, 179, 155, 166, 179, 110, 137, 115, 165 } },
-  { { 102, 128, 79,  125, 128, 74,  121, 128, 61,  98,  128, 128,
-      141, 164, 96,  132, 150, 90,  128, 153, 62,  100, 128, 153,
-      162, 172, 120, 146, 162, 113, 142, 154, 96,  113, 138, 155,
-      181, 188, 151, 170, 179, 147, 167, 181, 158, 157, 163, 176 },
-    { 103, 128, 80,  116, 128, 66,  94,  128, 35,  65,  109, 128,
-      134, 163, 104, 137, 154, 92,  128, 104, 58,  94,  129, 132,
-      156, 173, 137, 149, 165, 104, 143, 143, 112, 101, 133, 159,
-      176, 186, 134, 172, 175, 155, 169, 177, 255, 107, 137, 168 } },
-  { { 125, 128, 85,  157, 128, 82,  155, 128, 42,  83,  116, 128,
-      155, 174, 101, 144, 155, 93,  140, 155, 57,  92,  124, 149,
-      173, 178, 114, 148, 161, 111, 145, 161, 77,  101, 131, 153,
-      190, 191, 140, 169, 183, 140, 169, 179, 108, 122, 150, 171 },
-    { 136, 128, 108, 163, 128, 96,  140, 128, 48,  90,  85,  128,
-      144, 164, 113, 158, 179, 107, 159, 128, 43,  75,  133, 160,
-      157, 184, 144, 160, 189, 154, 152, 184, 128, 124, 137, 140,
-      188, 196, 148, 170, 178, 128, 177, 159, 128, 179, 135, 135 } },
-  { { 133, 128, 110, 153, 128, 101, 157, 128, 49,  91,  134, 128,
-      151, 168, 129, 158, 162, 112, 154, 168, 63,  99,  130, 158,
-      171, 178, 128, 160, 173, 111, 155, 171, 86,  108, 143, 159,
-      194, 196, 162, 177, 185, 123, 172, 181, 101, 132, 156, 178 },
-    { 133, 128, 129, 144, 128, 116, 135, 128, 43,  101, 100, 128,
-      140, 163, 158, 173, 205, 128, 165, 171, 128, 128, 210, 163,
-      172, 184, 192, 176, 201, 183, 177, 190, 128, 192, 199, 144,
-      192, 192, 1,   196, 192, 255, 171, 178, 255, 128, 171, 179 } }
-};
-#if BR_NODE
-const aom_prob
-    default_coeff_br[TX_SIZES][PLANE_TYPES][BASE_RANGE_SETS][LEVEL_CONTEXTS] = {
-#if CONFIG_CHROMA_2X2
-      { { { 62,  128, 54,  116, 128, 51,  97,  128, 59,  68,  107, 128,
-            119, 158, 68,  115, 131, 65,  112, 138, 34,  71,  118, 137,
-            171, 184, 110, 152, 178, 105, 146, 172, 89,  111, 145, 173,
-            214, 226, 201, 198, 214, 196, 193, 210, 239, 196, 186, 202 },
-          { 41,  128, 58,  52,  128, 51,  61,  128, 92,  54,  48,  128,
-            67,  113, 36,  55,  75,  30,  56,  72,  12,  25,  50,  79,
-            94,  131, 37,  75,  108, 42,  78,  103, 5,   31,  67,  103,
-            172, 192, 131, 135, 167, 129, 136, 165, 149, 144, 120, 149 },
-          { 35, 128, 74, 50, 128, 63, 59, 128, 87,  74,  38, 128,
-            32, 53,  23, 34, 50,  18, 30, 41,  15,  13,  18, 18,
-            52, 74,  18, 29, 36,  18, 31, 47,  51,  9,   15, 27,
-            96, 134, 85, 70, 93,  96, 79, 100, 108, 100, 55, 65 } },
-        { { 52,  128, 35,  79,  128, 29,  66,  128, 12,  30, 57,  128,
-            113, 156, 64,  107, 172, 54,  103, 145, 23,  57, 96,  110,
-            165, 184, 95,  138, 166, 95,  141, 184, 55,  80, 133, 165,
-            212, 222, 134, 175, 206, 158, 177, 197, 102, 61, 154, 190 },
-          { 36,  128, 18, 26,  128, 15, 29,  128, 4, 6,  30, 128,
-            63,  113, 25, 44,  66,  22, 40,  67,  9, 14, 34, 55,
-            90,  125, 26, 66,  82,  29, 73,  88,  1, 26, 34, 67,
-            158, 179, 70, 121, 134, 69, 111, 129, 1, 85, 54, 105 },
-          { 24, 128, 8,  31, 128, 15, 16, 128, 1,   1, 1,  128,
-            32, 39,  16, 18, 43,  5,  17, 13,  1,   1, 22, 1,
-            37, 65,  26, 20, 28,  16, 15, 24,  128, 1, 1,  1,
-            83, 107, 57, 56, 74,  34, 29, 73,  128, 1, 37, 47 } } },
-#endif
-      { { { 62,  128, 54,  116, 128, 51,  97,  128, 59,  68,  107, 128,
-            119, 158, 68,  115, 131, 65,  112, 138, 34,  71,  118, 137,
-            171, 184, 110, 152, 178, 105, 146, 172, 89,  111, 145, 173,
-            214, 226, 201, 198, 214, 196, 193, 210, 239, 196, 186, 202 },
-          { 41,  128, 58,  52,  128, 51,  61,  128, 92,  54,  48,  128,
-            67,  113, 36,  55,  75,  30,  56,  72,  12,  25,  50,  79,
-            94,  131, 37,  75,  108, 42,  78,  103, 5,   31,  67,  103,
-            172, 192, 131, 135, 167, 129, 136, 165, 149, 144, 120, 149 },
-          { 35, 128, 74, 50, 128, 63, 59, 128, 87,  74,  38, 128,
-            32, 53,  23, 34, 50,  18, 30, 41,  15,  13,  18, 18,
-            52, 74,  18, 29, 36,  18, 31, 47,  51,  9,   15, 27,
-            96, 134, 85, 70, 93,  96, 79, 100, 108, 100, 55, 65 } },
-        { { 52,  128, 35,  79,  128, 29,  66,  128, 12,  30, 57,  128,
-            113, 156, 64,  107, 172, 54,  103, 145, 23,  57, 96,  110,
-            165, 184, 95,  138, 166, 95,  141, 184, 55,  80, 133, 165,
-            212, 222, 134, 175, 206, 158, 177, 197, 102, 61, 154, 190 },
-          { 36,  128, 18, 26,  128, 15, 29,  128, 4, 6,  30, 128,
-            63,  113, 25, 44,  66,  22, 40,  67,  9, 14, 34, 55,
-            90,  125, 26, 66,  82,  29, 73,  88,  1, 26, 34, 67,
-            158, 179, 70, 121, 134, 69, 111, 129, 1, 85, 54, 105 },
-          { 24, 128, 8,  31, 128, 15, 16, 128, 1,   1, 1,  128,
-            32, 39,  16, 18, 43,  5,  17, 13,  1,   1, 22, 1,
-            37, 65,  26, 20, 28,  16, 15, 24,  128, 1, 1,  1,
-            83, 107, 57, 56, 74,  34, 29, 73,  128, 1, 37, 47 } } },
-      { { { 72,  128, 45,  113, 128, 38,  100, 128, 26,  63,  112, 128,
-            134, 177, 65,  121, 148, 57,  111, 143, 27,  68,  116, 152,
-            181, 198, 98,  148, 173, 84,  136, 168, 53,  89,  134, 170,
-            218, 230, 173, 194, 216, 160, 188, 213, 199, 177, 183, 204 },
-          { 54,  128, 34,  55,  128, 32,  53,  128, 66,  45,  54,  128,
-            81,  128, 33,  59,  102, 26,  55,  80,  7,   23,  49,  91,
-            116, 145, 36,  79,  107, 35,  73,  102, 12,  28,  57,  95,
-            170, 201, 102, 133, 173, 105, 127, 173, 166, 132, 114, 149 },
-          { 40,  128, 25, 30, 128, 21, 31, 128, 24, 17, 24, 128,
-            51,  67,  19, 28, 40,  17, 25, 42,  15, 13, 19, 19,
-            61,  77,  19, 30, 48,  13, 33, 50,  11, 15, 21, 30,
-            103, 147, 37, 69, 111, 37, 66, 105, 18, 18, 36, 76 } },
-        { { 74,  128, 42,  99,  128, 32,  57,  128, 9,  28, 76,  128,
-            115, 187, 70,  118, 120, 52,  109, 128, 19, 60, 93,  100,
-            178, 197, 119, 147, 179, 92,  137, 178, 37, 87, 110, 158,
-            216, 227, 169, 186, 201, 128, 178, 204, 1,  96, 155, 217 },
-          { 59,  128, 26, 34,  128, 11, 20,  128, 7,   8, 24, 128,
-            73,  125, 38, 74,  96,  23, 61,  79,  15,  9, 23, 110,
-            96,  151, 49, 79,  164, 22, 70,  65,  1,   1, 9,  69,
-            156, 196, 73, 105, 181, 17, 126, 155, 128, 1, 90, 111 },
-          { 42, 128, 10, 11, 128, 13, 1,  128, 1,   1,   1, 128,
-            55, 63,  13, 17, 85,  1,  16, 64,  1,   1,   1, 1,
-            62, 58,  32, 21, 53,  1,  37, 91,  128, 128, 1, 1,
-            81, 133, 51, 48, 79,  1,  25, 81,  128, 128, 1, 54 } } },
-      { { { 103, 128, 52,  163, 128, 46,  155, 128, 12, 45,  97,  128,
-            162, 196, 69,  140, 170, 60,  130, 158, 21, 58,  109, 150,
-            205, 214, 93,  149, 178, 79,  143, 179, 38, 71,  120, 159,
-            231, 240, 150, 192, 218, 140, 188, 220, 84, 112, 159, 196 },
-          { 93,  128, 42, 143, 128, 41, 132, 128, 6,  15, 40, 128,
-            113, 172, 39, 99,  113, 33, 91,  94,  5,  15, 42, 83,
-            148, 172, 37, 91,  130, 28, 81,  121, 9,  20, 47, 87,
-            201, 223, 75, 139, 183, 77, 132, 176, 23, 41, 82, 147 },
-          { 92,  128, 45, 123, 128, 28, 88, 128, 1,  8,  20, 128,
-            85,  94,  39, 95,  83,  33, 81, 61,  4,  5,  17, 25,
-            84,  109, 17, 59,  76,  11, 46, 62,  1,  4,  13, 35,
-            139, 184, 25, 86,  129, 25, 71, 123, 26, 13, 31, 84 } },
-        { { 123, 128, 82,  169, 128, 62,  139, 128, 1,   28,  77,  128,
-            139, 167, 92,  170, 146, 76,  149, 255, 19,  68,  160, 73,
-            190, 209, 171, 165, 218, 57,  152, 209, 128, 61,  122, 164,
-            237, 240, 146, 210, 227, 128, 224, 220, 128, 128, 196, 199 },
-          { 130, 128, 52,  141, 128, 32,  101, 128, 128, 1,  85,  128,
-            94,  155, 71,  121, 255, 30,  116, 85,  1,   8,  58,  255,
-            105, 169, 110, 101, 132, 1,   77,  142, 128, 1,  54,  96,
-            166, 214, 224, 154, 198, 255, 153, 230, 128, 85, 100, 146 },
-          { 103, 128, 26, 83, 128, 20,  47,  128, 128, 128, 1,  128,
-            91,  90,  19, 76, 128, 1,   42,  1,   128, 255, 64, 128,
-            74,  77,  1,  72, 68,  128, 13,  77,  128, 128, 64, 1,
-            71,  147, 37, 99, 171, 1,   104, 151, 128, 1,   1,  96 } } },
-      { { { 113, 128, 79,  165, 128, 69,  149, 128, 14, 55,  116, 128,
-            163, 202, 104, 169, 205, 82,  159, 180, 22, 64,  121, 165,
-            207, 216, 113, 177, 215, 95,  166, 195, 35, 77,  132, 179,
-            241, 244, 173, 207, 233, 128, 202, 227, 92, 121, 169, 209 },
-          { 114, 128, 67, 136, 128, 54, 132, 128, 6,  26, 62,  128,
-            85,  129, 85, 146, 173, 64, 129, 140, 7,  19, 65,  92,
-            139, 169, 42, 147, 186, 40, 129, 170, 18, 18, 65,  117,
-            213, 230, 74, 172, 213, 69, 165, 196, 1,  40, 103, 170 },
-          { 101, 128, 61, 134, 128, 52, 97,  128, 1,   14, 26, 128,
-            79,  72,  71, 135, 152, 56, 114, 117, 1,   10, 24, 58,
-            64,  66,  60, 133, 148, 16, 126, 123, 1,   32, 26, 56,
-            143, 197, 51, 141, 176, 59, 132, 162, 128, 17, 47, 106 } },
-        { { 115, 128, 112, 135, 128, 89,  130, 128, 15,  49,  89,  128,
-            143, 238, 154, 203, 255, 138, 172, 255, 1,   98,  196, 255,
-            185, 203, 255, 211, 255, 192, 217, 235, 128, 128, 171, 255,
-            233, 233, 255, 247, 255, 1,   239, 245, 1,   128, 255, 255 },
-          { 75,  128, 76,  118, 128, 35,  74,  128, 1,   13,  23,  128,
-            63,  138, 114, 164, 140, 91,  128, 128, 128, 1,   138, 64,
-            96,  128, 255, 175, 236, 85,  166, 209, 128, 1,   128, 146,
-            196, 217, 1,   204, 206, 128, 212, 221, 128, 128, 128, 219 },
-          { 49,  128, 36,  62,  128, 37,  56, 128, 128, 1,   1,   128,
-            45,  37,  68,  102, 128, 90,  56, 1,   128, 128, 37,  1,
-            26,  27,  128, 126, 128, 255, 63, 142, 128, 128, 1,   1,
-            125, 159, 128, 173, 212, 128, 85, 189, 128, 128, 255, 171 } } }
-    };
-#endif  // BR_NODE
-#if CONFIG_CTX1D
-static const aom_prob default_eob_mode[TX_SIZES][PLANE_TYPES][TX_CLASSES] = {
-#if CONFIG_CHROMA_2X2
-  { { 128, 176, 157 }, { 128, 222, 198 } },
-#endif
-  { { 128, 176, 157 }, { 128, 222, 198 } },
-  { { 128, 35, 56 }, { 128, 203, 225 } },
-  { { 128, 55, 136 }, { 128, 230, 253 } },
-  { { 128, 101, 188 }, { 128, 128, 128 } }
-};
-static const aom_prob default_empty_line[TX_SIZES][PLANE_TYPES][TX_CLASSES]
-                                        [EMPTY_LINE_CONTEXTS] = {
-#if CONFIG_CHROMA_2X2
-                                          { { { 128, 128, 128, 128, 128 },
-                                              { 142, 153, 211, 205, 128 },
-                                              { 162, 142, 203, 197, 128 } },
-                                            { { 128, 128, 128, 128, 128 },
-                                              { 133, 116, 178, 123, 128 },
-                                              { 139, 109, 159, 115, 128 } } },
-#endif
-                                          { { { 128, 128, 128, 128, 128 },
-                                              { 142, 153, 211, 205, 128 },
-                                              { 162, 142, 203, 197, 128 } },
-                                            { { 128, 128, 128, 128, 128 },
-                                              { 133, 116, 178, 123, 128 },
-                                              { 139, 109, 159, 115, 128 } } },
-                                          { { { 128, 128, 128, 128, 128 },
-                                              { 185, 130, 183, 204, 227 },
-                                              { 171, 81, 177, 200, 221 } },
-                                            { { 128, 128, 128, 128, 128 },
-                                              { 180, 127, 175, 189, 213 },
-                                              { 120, 74, 129, 134, 156 } } },
-                                          { { { 128, 128, 128, 128, 128 },
-                                              { 202, 82, 183, 214, 248 },
-                                              { 144, 41, 163, 185, 203 } },
-                                            { { 128, 128, 128, 128, 128 },
-                                              { 151, 93, 171, 224, 160 },
-                                              { 128, 51, 171, 128, 1 } } },
-                                          { { { 128, 128, 128, 128, 128 },
-                                              { 154, 48, 174, 210, 233 },
-                                              { 123, 16, 148, 189, 197 } },
-                                            { { 128, 128, 128, 128, 128 },
-                                              { 128, 128, 128, 128, 128 },
-                                              { 128, 128, 128, 128, 128 } } }
-                                        };
-static const aom_prob
-    default_hv_eob[TX_SIZES][PLANE_TYPES][TX_CLASSES][HV_EOB_CONTEXTS] = {
-#if CONFIG_CHROMA_2X2
-      { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-          { 151, 173, 114, 128, 128, 128, 128, 128, 128, 162, 198, 128,
-            128, 128, 128, 128, 182, 198, 109, 128, 128, 128, 128, 128 },
-          { 152, 173, 119, 128, 128, 128, 128, 128, 128, 164, 193, 128,
-            128, 128, 128, 128, 198, 209, 121, 128, 128, 128, 128, 128 } },
-        { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-          { 123, 143, 70,  128, 128, 128, 128, 128, 128, 127, 154, 128,
-            128, 128, 128, 128, 176, 148, 36,  128, 128, 128, 128, 128 },
-          { 132, 152, 73,  128, 128, 128, 128, 128, 128, 127, 159, 128,
-            128, 128, 128, 128, 186, 181, 48,  128, 128, 128, 128, 128 } } },
-#endif
-      { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-          { 151, 173, 114, 128, 128, 128, 128, 128, 128, 162, 198, 128,
-            128, 128, 128, 128, 182, 198, 109, 128, 128, 128, 128, 128 },
-          { 152, 173, 119, 128, 128, 128, 128, 128, 128, 164, 193, 128,
-            128, 128, 128, 128, 198, 209, 121, 128, 128, 128, 128, 128 } },
-        { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-          { 123, 143, 70,  128, 128, 128, 128, 128, 128, 127, 154, 128,
-            128, 128, 128, 128, 176, 148, 36,  128, 128, 128, 128, 128 },
-          { 132, 152, 73,  128, 128, 128, 128, 128, 128, 127, 159, 128,
-            128, 128, 128, 128, 186, 181, 48,  128, 128, 128, 128, 128 } } },
-      { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-          { 109, 105, 78,  44,  128, 128, 128, 128, 128, 146, 185, 221,
-            128, 128, 128, 128, 199, 188, 134, 69,  128, 128, 128, 128 },
-          { 124, 127, 115, 82,  128, 128, 128, 128, 128, 162, 198, 224,
-            128, 128, 128, 128, 206, 214, 177, 135, 128, 128, 128, 128 } },
-        { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-          { 95,  102, 65,  14,  128, 128, 128, 128, 128, 132, 164, 199,
-            128, 128, 128, 128, 162, 163, 66,  27,  128, 128, 128, 128 },
-          { 83,  141, 97,  38,  128, 128, 128, 128, 128, 154, 132, 184,
-            128, 128, 128, 128, 194, 218, 112, 63,  128, 128, 128, 128 } } },
-      { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-          { 117, 107, 86,  61,  51,  104, 128, 128, 128, 160, 198, 238,
-            252, 251, 128, 128, 221, 223, 209, 186, 99,  81,  128, 128 },
-          { 118, 122, 121, 100, 91,  97,  128, 128, 128, 168, 190, 214,
-            233, 235, 128, 128, 197, 216, 177, 165, 147, 126, 128, 128 } },
-        { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-          { 109, 102, 63,  51,  255, 85,  128, 128, 128, 163, 131, 175,
-            128, 128, 128, 128, 183, 102, 40,  1,   128, 128, 128, 128 },
-          { 255, 255, 1,   1,   128, 1, 128, 128, 128, 1,   128, 128,
-            128, 128, 128, 128, 255, 1, 128, 128, 128, 128, 128, 128 } } },
-      { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-          { 114, 108, 83,  61,  53,  28,  77,  177, 128, 161, 187, 218,
-            240, 237, 228, 234, 200, 207, 167, 136, 98,  78,  183, 128 },
-          { 117, 138, 116, 77,  75,  85,  26,  1,   128, 197, 162, 200,
-            184, 212, 225, 236, 189, 225, 168, 124, 144, 171, 128, 128 } },
-        { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-          { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-          { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-            128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } } }
+static const aom_cdf_prob
+    default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = {
+      { AOM_CDF16(7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700,
+                  32704, 32708, 32712, 32716, 32720, 32724) },
+      { AOM_CDF16(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, 32620,
+                  32647, 32668, 32672, 32676, 32680, 32684) },
+      { AOM_CDF16(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, 32673,
+                  32677, 32681, 32685, 32689, 32693, 32697) },
+      { AOM_CDF16(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, 32708,
+                  32712, 32716, 32720, 32724, 32728, 32732) },
+      { AOM_CDF16(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, 32394,
+                  32464, 32516, 32560, 32576, 32593, 32622) },
+      { AOM_CDF16(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144,
+                  32413, 32520, 32594, 32622, 32656, 32660) }
     };
-#endif  // CONFIG_CTX1D
-#endif  // CONFIG_LV_MAP
 
-#if CONFIG_EXT_PARTITION_TYPES
-static const aom_prob
-    default_partition_probs[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1] = {
-      // 8x8 -> 4x4
-      { 199, 122, 141, 128, 128, 128, 255, 128, 255 },  // a/l both not split
-      { 147, 63, 159, 128, 128, 128, 255, 128, 255 },   // a split, l not split
-      { 148, 133, 118, 128, 128, 128, 255, 128, 255 },  // l split, a not split
-      { 121, 104, 114, 128, 128, 128, 255, 128, 255 },  // a/l both split
-      // 16x16 -> 8x8
-      { 174, 73, 87, 128, 128, 128, 255, 128, 255 },  // a/l both not split
-      { 92, 41, 83, 128, 128, 128, 255, 128, 255 },   // a split, l not split
-      { 82, 99, 50, 128, 128, 128, 255, 128, 255 },   // l split, a not split
-      { 53, 39, 39, 128, 128, 128, 255, 128, 255 },   // a/l both split
-      // 32x32 -> 16x16
-      { 177, 58, 59, 128, 128, 85, 128, 85, 128 },  // a/l both not split
-      { 68, 26, 63, 128, 128, 85, 128, 85, 128 },   // a split, l not split
-      { 52, 79, 25, 128, 128, 85, 128, 85, 128 },   // l split, a not split
-      { 17, 14, 12, 128, 128, 85, 128, 85, 128 },   // a/l both split
-      // 64x64 -> 32x32
-      { 222, 34, 30, 128, 128, 85, 128, 85, 128 },  // a/l both not split
-      { 72, 16, 44, 128, 128, 85, 128, 85, 128 },   // a split, l not split
-      { 58, 32, 12, 128, 128, 85, 128, 85, 128 },   // l split, a not split
-      { 10, 7, 6, 128, 128, 85, 128, 85, 128 },     // a/l both split
-#if CONFIG_EXT_PARTITION
-      // 128x128 -> 64x64
-      { 222, 34, 30, 128, 128, 128, 255, 128, 255 },  // a/l both not split
-      { 72, 16, 44, 128, 128, 128, 255, 128, 255 },   // a split, l not split
-      { 58, 32, 12, 128, 128, 128, 255, 128, 255 },   // l split, a not split
-      { 10, 7, 6, 128, 128, 128, 255, 128, 255 },     // a/l both split
-#endif                                                // CONFIG_EXT_PARTITION
-#if CONFIG_UNPOISON_PARTITION_CTX
-      { 0, 0, 141, 0, 0, 0, 0, 0, 0 },  // 8x8 -> 4x4
-      { 0, 0, 87, 0, 0, 0, 0, 0, 0 },   // 16x16 -> 8x8
-      { 0, 0, 59, 0, 0, 0, 0, 0, 0 },   // 32x32 -> 16x16
-      { 0, 0, 30, 0, 0, 0, 0, 0, 0 },   // 64x64 -> 32x32
-#if CONFIG_EXT_PARTITION
-      { 0, 0, 30, 0, 0, 0, 0, 0, 0 },   // 128x128 -> 64x64
-#endif                                  // CONFIG_EXT_PARTITION
-      { 0, 122, 0, 0, 0, 0, 0, 0, 0 },  // 8x8 -> 4x4
-      { 0, 73, 0, 0, 0, 0, 0, 0, 0 },   // 16x16 -> 8x8
-      { 0, 58, 0, 0, 0, 0, 0, 0, 0 },   // 32x32 -> 16x16
-      { 0, 34, 0, 0, 0, 0, 0, 0, 0 },   // 64x64 -> 32x32
-#if CONFIG_EXT_PARTITION
-      { 0, 34, 0, 0, 0, 0, 0, 0, 0 },  // 128x128 -> 64x64
-#endif                                 // CONFIG_EXT_PARTITION
-#endif                                 // CONFIG_UNPOISON_PARTITION_CTX
-    };
-#else
-static const aom_prob
-    default_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = {
-      // 8x8 -> 4x4
-      { 199, 122, 141 },  // a/l both not split
-      { 147, 63, 159 },   // a split, l not split
-      { 148, 133, 118 },  // l split, a not split
-      { 121, 104, 114 },  // a/l both split
-      // 16x16 -> 8x8
-      { 174, 73, 87 },  // a/l both not split
-      { 92, 41, 83 },   // a split, l not split
-      { 82, 99, 50 },   // l split, a not split
-      { 53, 39, 39 },   // a/l both split
-      // 32x32 -> 16x16
-      { 177, 58, 59 },  // a/l both not split
-      { 68, 26, 63 },   // a split, l not split
-      { 52, 79, 25 },   // l split, a not split
-      { 17, 14, 12 },   // a/l both split
-      // 64x64 -> 32x32
-      { 222, 34, 30 },  // a/l both not split
-      { 72, 16, 44 },   // a split, l not split
-      { 58, 32, 12 },   // l split, a not split
-      { 10, 7, 6 },     // a/l both split
-#if CONFIG_EXT_PARTITION
-      // 128x128 -> 64x64
-      { 222, 34, 30 },  // a/l both not split
-      { 72, 16, 44 },   // a split, l not split
-      { 58, 32, 12 },   // l split, a not split
-      { 10, 7, 6 },     // a/l both split
-#endif  // CONFIG_EXT_PARTITION
-#if CONFIG_UNPOISON_PARTITION_CTX
-      { 0, 0, 141 },    // 8x8 -> 4x4
-      { 0, 0, 87 },     // 16x16 -> 8x8
-      { 0, 0, 59 },     // 32x32 -> 16x16
-      { 0, 0, 30 },     // 64x64 -> 32x32
-#if CONFIG_EXT_PARTITION
-      { 0, 0, 30 },     // 128x128 -> 64x64
-#endif  // CONFIG_EXT_PARTITION
-      { 0, 122, 0 },    // 8x8 -> 4x4
-      { 0, 73, 0 },     // 16x16 -> 8x8
-      { 0, 58, 0 },     // 32x32 -> 16x16
-      { 0, 34, 0 },     // 64x64 -> 32x32
-#if CONFIG_EXT_PARTITION
-      { 0, 34, 0 },     // 128x128 -> 64x64
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_UNPOISON_PARTITION_CTX
+static const aom_cdf_prob
+    default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE(
+        SWITCHABLE_FILTERS)] = {
+      { AOM_CDF3(31935, 32720) }, { AOM_CDF3(5568, 32719) },
+      { AOM_CDF3(422, 2938) },    { AOM_CDF3(28244, 32608) },
+      { AOM_CDF3(31206, 31953) }, { AOM_CDF3(4862, 32121) },
+      { AOM_CDF3(770, 1152) },    { AOM_CDF3(20889, 25637) },
+      { AOM_CDF3(31910, 32724) }, { AOM_CDF3(4120, 32712) },
+      { AOM_CDF3(305, 2247) },    { AOM_CDF3(27403, 32636) },
+      { AOM_CDF3(31022, 32009) }, { AOM_CDF3(2963, 32093) },
+      { AOM_CDF3(601, 943) },     { AOM_CDF3(14969, 21398) }
     };
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
-static const aom_prob default_newmv_prob[NEWMV_MODE_CONTEXTS] = {
-  155, 116, 94, 32, 96, 56, 30,
-};
-
-static const aom_prob default_zeromv_prob[ZEROMV_MODE_CONTEXTS] = {
-  45, 13,
-};
+static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)] =
+    { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) },
+      { AOM_CDF2(8386) },  { AOM_CDF2(12222) }, { AOM_CDF2(4676) } };
 
-static const aom_prob default_refmv_prob[REFMV_MODE_CONTEXTS] = {
-  178, 212, 135, 244, 203, 122, 128, 128, 128,
-};
+static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(
+    2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } };
 
-static const aom_prob default_drl_prob[DRL_MODE_CONTEXTS] = {
-  119, 128, 189, 134, 128,
-};
-#if CONFIG_NEW_MULTISYMBOL
-static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)] =
-    { { AOM_ICDF(128 * 155), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 116), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 94), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 32), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 96), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 56), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 30), AOM_ICDF(32768), 0 } };
-static const aom_cdf_prob default_zeromv_cdf[ZEROMV_MODE_CONTEXTS][CDF_SIZE(
-    2)] = { { AOM_ICDF(128 * 45), AOM_ICDF(32768), 0 },
-            { AOM_ICDF(128 * 13), AOM_ICDF(32768), 0 } };
 static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)] =
-    { { AOM_ICDF(128 * 178), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 212), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 135), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 244), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 203), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 122), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 } };
+    { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) },
+      { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } };
+
 static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = {
-  { AOM_ICDF(128 * 119), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(128 * 189), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(128 * 134), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }
+  { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) }
 };
-#endif
-
-static const aom_prob default_inter_compound_mode_probs
-    [INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES - 1] = {
-      { 154, 167, 233, 165, 143, 170, 167 },  // 0 = both zero mv
-      { 75, 168, 237, 155, 135, 176, 172 },   // 1 = 1 zero + 1 predicted
-      { 7, 173, 227, 128, 153, 188, 189 },    // 2 = two predicted mvs
-      { 8, 120, 214, 113, 154, 178, 174 },    // 3 = 1 pred/zero, 1 new
-      { 4, 85, 194, 94, 155, 173, 167 },      // 4 = two new mvs
-      { 23, 89, 180, 73, 157, 151, 155 },     // 5 = one intra neighbour
-      { 27, 49, 152, 91, 134, 153, 142 },     // 6 = two intra neighbours
-    };
 
 static const aom_cdf_prob
     default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE(
         INTER_COMPOUND_MODES)] = {
-      { AOM_ICDF(19712), AOM_ICDF(28229), AOM_ICDF(30892), AOM_ICDF(31437),
-        AOM_ICDF(31712), AOM_ICDF(32135), AOM_ICDF(32360), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(9600), AOM_ICDF(24804), AOM_ICDF(29268), AOM_ICDF(30323),
-        AOM_ICDF(30802), AOM_ICDF(31726), AOM_ICDF(32177), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(896), AOM_ICDF(22434), AOM_ICDF(27015), AOM_ICDF(29026),
-        AOM_ICDF(29753), AOM_ICDF(31114), AOM_ICDF(31597), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(1024), AOM_ICDF(15904), AOM_ICDF(22127), AOM_ICDF(25421),
-        AOM_ICDF(26864), AOM_ICDF(28996), AOM_ICDF(30001), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(512), AOM_ICDF(11222), AOM_ICDF(17217), AOM_ICDF(21445),
-        AOM_ICDF(23473), AOM_ICDF(26133), AOM_ICDF(27550), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(2944), AOM_ICDF(13313), AOM_ICDF(17214), AOM_ICDF(20751),
-        AOM_ICDF(23211), AOM_ICDF(25500), AOM_ICDF(26992), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(3456), AOM_ICDF(9067), AOM_ICDF(14069), AOM_ICDF(16907),
-        AOM_ICDF(18817), AOM_ICDF(21214), AOM_ICDF(23139), AOM_ICDF(32768), 0 }
-    };
-
-#if CONFIG_COMPOUND_SINGLEREF
-// TODO(zoeliu): Default values to be further adjusted based on the collected
-//               stats.
-/*
-static const aom_prob default_inter_singleref_comp_mode_probs
-    [INTER_MODE_CONTEXTS][INTER_SINGLEREF_COMP_MODES - 1] = {
-      { 2, 173, 68, 180 },   // 0 = both zero mv
-      { 7, 145, 160, 180 },  // 1 = 1 zero + 1 predicted
-      { 7, 166, 126, 180 },  // 2 = two predicted mvs
-      { 7, 94, 132, 180 },   // 3 = 1 pred/zero, 1 new
-      { 8, 64, 64, 180 },    // 4 = two new mvs
-      { 17, 81, 52, 180 },   // 5 = one intra neighbour
-      { 25, 29, 50, 180 },   // 6 = two intra neighbours
-    };*/
-static const aom_prob default_inter_singleref_comp_mode_probs
-    [INTER_MODE_CONTEXTS][INTER_SINGLEREF_COMP_MODES - 1] = {
-      { 2, 173, 68 },   // 0 = both zero mv
-      { 7, 145, 160 },  // 1 = 1 zero + 1 predicted
-      { 7, 166, 126 },  // 2 = two predicted mvs
-      { 7, 94, 132 },   // 3 = 1 pred/zero, 1 new
-      { 8, 64, 64 },    // 4 = two new mvs
-      { 17, 81, 52 },   // 5 = one intra neighbour
-      { 25, 29, 50 },   // 6 = two intra neighbours
-    };
-
-static const aom_cdf_prob
-    default_inter_singleref_comp_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE(
-        INTER_SINGLEREF_COMP_MODES)] = {
-      { AOM_ICDF(21971), AOM_ICDF(24771), AOM_ICDF(25027), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(18053), AOM_ICDF(26690), AOM_ICDF(27586), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(20667), AOM_ICDF(26182), AOM_ICDF(27078), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(11703), AOM_ICDF(22103), AOM_ICDF(22999), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(7936), AOM_ICDF(13888), AOM_ICDF(14912), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(9679), AOM_ICDF(13927), AOM_ICDF(16103), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(3349), AOM_ICDF(8470), AOM_ICDF(11670), AOM_ICDF(32768), 0 }
+      { AOM_CDF8(7760, 13823, 15808, 17641, 19156, 20666, 26891) },
+      { AOM_CDF8(10730, 19452, 21145, 22749, 24039, 25131, 28724) },
+      { AOM_CDF8(10664, 20221, 21588, 22906, 24295, 25387, 28436) },
+      { AOM_CDF8(13298, 16984, 20471, 24182, 25067, 25736, 26422) },
+      { AOM_CDF8(18904, 23325, 25242, 27432, 27898, 28258, 30758) },
+      { AOM_CDF8(10725, 17454, 20124, 22820, 24195, 25168, 26046) },
+      { AOM_CDF8(17125, 24273, 25814, 27492, 28214, 28704, 30592) },
+      { AOM_CDF8(13046, 23214, 24505, 25942, 27435, 28442, 29330) }
     };
-#endif  // CONFIG_COMPOUND_SINGLEREF
 
-#if CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
-static const aom_prob
-    default_compound_type_probs[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { 128, 128 }, { 128, 128 }, { 128, 128 },
-#endif
-      { 128, 128 }, { 255, 128 }, { 255, 128 }, { 66, 51 },   { 72, 35 },
-      { 79, 29 },   { 71, 18 },   { 81, 29 },   { 81, 26 },   { 69, 19 },
-      { 104, 1 },   { 99, 1 },    { 75, 1 },
-#if CONFIG_EXT_PARTITION
-      { 255, 1 },   { 255, 1 },   { 255, 1 },
-#endif  // CONFIG_EXT_PARTITION
-      { 208, 128 }, { 208, 128 }, { 208, 128 }, { 208, 128 }, { 208, 1 },
-      { 208, 1 },
-#if CONFIG_EXT_PARTITION
-      { 208, 1 },   { 208, 1 }
-#endif  // CONFIG_EXT_PARTITION
-    };
-#elif !CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
-static const aom_prob
-    default_compound_type_probs[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { 255 }, { 255 }, { 255 },
-#endif
-      { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 216 },
-      { 216 }, { 216 }, { 224 }, { 224 }, { 240 }, { 240 },
-#if CONFIG_EXT_PARTITION
-      { 255 }, { 255 }, { 255 },
-#endif  // CONFIG_EXT_PARTITION
-      { 208 }, { 208 }, { 208 }, { 208 }, { 255 }, { 255 },
-#if CONFIG_EXT_PARTITION
-      { 255 }, { 255 }
-#endif  // CONFIG_EXT_PARTITION
-    };
-#elif CONFIG_COMPOUND_SEGMENT && !CONFIG_WEDGE
-static const aom_prob
-    default_compound_type_probs[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { 255 }, { 255 }, { 255 },
-#endif
-      { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 216 },
-      { 216 }, { 216 }, { 224 }, { 224 }, { 240 }, { 240 },
-#if CONFIG_EXT_PARTITION
-      { 255 }, { 255 }, { 255 },
-#endif  // CONFIG_EXT_PARTITION
-      { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 },
-#if CONFIG_EXT_PARTITION
-      { 208 }, { 208 }
-#endif  // CONFIG_EXT_PARTITION
-    };
-#else
-static const aom_prob default_compound_type_probs[BLOCK_SIZES_ALL]
-                                                 [COMPOUND_TYPES - 1];
-#endif  // CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
-
-#if CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
-static const aom_cdf_prob
-    default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES)] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32704), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32704), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8448), AOM_ICDF(13293), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(9216), AOM_ICDF(12436), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10112), AOM_ICDF(12679), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(9088), AOM_ICDF(10753), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10368), AOM_ICDF(12906), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10368), AOM_ICDF(12643), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8832), AOM_ICDF(10609), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(13312), AOM_ICDF(13388), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(12672), AOM_ICDF(12751), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(9600), AOM_ICDF(9691), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(32640), AOM_ICDF(32641), AOM_ICDF(32768), 0 },  // 255, 1
-      { AOM_ICDF(32640), AOM_ICDF(32641), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32641), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(26648), AOM_ICDF(32768), 0 },  // 208, 1
-      { AOM_ICDF(26624), AOM_ICDF(26648), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(26624), AOM_ICDF(26648), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(26648), AOM_ICDF(32768), 0 },
-#endif
-    };
-#elif !CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
-static const aom_cdf_prob
-    default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES)] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },  // 255
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },  // 208
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(27648), AOM_ICDF(32768), 0 },  // 216
-      { AOM_ICDF(27648), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(27648), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(28672), AOM_ICDF(32768), 0 },  // 224
-      { AOM_ICDF(28672), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(30720), AOM_ICDF(32768), 0 },  // 240
-      { AOM_ICDF(30720), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },  // 255
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-    };
-#elif CONFIG_COMPOUND_SEGMENT && !CONFIG_WEDGE
-static const aom_cdf_prob
-    default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES)] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },  // 255
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },  // 208
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(27648), AOM_ICDF(32768), 0 },  // 216
-      { AOM_ICDF(27648), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(27648), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(28672), AOM_ICDF(32768), 0 },  // 224
-      { AOM_ICDF(28672), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(30720), AOM_ICDF(32768), 0 },  // 240
-      { AOM_ICDF(30720), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },  // 255
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },  // 208
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26624), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-    };
-#endif  // CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
-
-#if CONFIG_INTERINTRA
-static const aom_prob default_interintra_prob[BLOCK_SIZE_GROUPS] = {
-  128, 226, 244, 254,
-};
-#if CONFIG_NEW_MULTISYMBOL
 static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
-    2)] = { { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-            { AOM_ICDF(226 * 128), AOM_ICDF(32768), 0 },
-            { AOM_ICDF(244 * 128), AOM_ICDF(32768), 0 },
-            { AOM_ICDF(254 * 128), AOM_ICDF(32768), 0 } };
-#endif
+    2)] = { { AOM_CDF2(16384) },
+            { AOM_CDF2(26887) },
+            { AOM_CDF2(27597) },
+            { AOM_CDF2(30237) } };
 
-static const aom_prob
-    default_interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1] = {
-      { 128, 128, 128 },  // block_size < 8x8
-      { 24, 34, 119 },    // block_size < 16x16
-      { 38, 33, 95 },     // block_size < 32x32
-      { 51, 21, 110 },    // block_size >= 32x32
-    };
 static const aom_cdf_prob
-    default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
-        INTERINTRA_MODES)] = {
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(3072), AOM_ICDF(7016), AOM_ICDF(18987), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4864), AOM_ICDF(8461), AOM_ICDF(17481), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(6528), AOM_ICDF(8681), AOM_ICDF(19031), AOM_ICDF(32768), 0 }
-    };
-
-static const aom_prob default_wedge_interintra_prob[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  128, 128, 128,
-#endif
-  128, 128, 128, 194, 213, 217, 222, 224, 226, 220, 128, 128, 128,
-#if CONFIG_EXT_PARTITION
-  255, 255, 255,
-#endif  // CONFIG_EXT_PARTITION
-  208, 208, 208, 208, 255, 255,
-#if CONFIG_EXT_PARTITION
-  255, 255
-#endif  // CONFIG_EXT_PARTITION
-};
+    default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTERINTRA_MODES)] =
+        { { AOM_CDF4(8192, 16384, 24576) },
+          { AOM_CDF4(1875, 11082, 27332) },
+          { AOM_CDF4(2473, 9996, 26388) },
+          { AOM_CDF4(4238, 11537, 25926) } };
 
-#if CONFIG_NEW_MULTISYMBOL
 static const aom_cdf_prob
     default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(194 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(213 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(217 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(222 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(224 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(226 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(220 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-      { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
+      { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+      { AOM_CDF2(20036) }, { AOM_CDF2(24957) }, { AOM_CDF2(26704) },
+      { AOM_CDF2(27530) }, { AOM_CDF2(29564) }, { AOM_CDF2(29444) },
+      { AOM_CDF2(26872) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+      { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+      { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+      { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+      { AOM_CDF2(16384) }
     };
-#endif  // CONFIG_NEW_MULTISYMBOL
 
-#endif  // CONFIG_INTERINTRA
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-#ifdef TWO_MODE
-const aom_tree_index av1_ncobmc_mode_tree[TREE_SIZE(MAX_NCOBMC_MODES)] = {
-  -NCOBMC_MODE_0, -NCOBMC_MODE_1
-};
-#else
-const aom_tree_index av1_ncobmc_mode_tree[TREE_SIZE(MAX_NCOBMC_MODES)] = {
-  -NCOBMC_MODE_0, 2,
-  -NCOBMC_MODE_1, 4,
-  -NCOBMC_MODE_2, 6,
-  -NCOBMC_MODE_3, 8,
-  -NCOBMC_MODE_4, 10,
-  -NCOBMC_MODE_5, 12,
-  -NCOBMC_MODE_6, -NCOBMC_MODE_7
-};
-#endif  // TWO_MODE
-
-// TODO(weitinglin): find default prob
-//                   right now setting the first mode with probability 1/255,
-//                   the last eight modes with equal probabilities
-static const aom_prob
-    default_ncobmc_mode_prob[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES - 1] = {
-#ifdef TWO_MODE
-      { 127 }, { 127 }, { 127 }, { 127 }
-#else
-      { 32, 36, 43, 51, 64, 85, 128 },  // 8x8
-      { 32, 36, 43, 51, 64, 85, 128 },  // 16X16
-      { 32, 36, 43, 51, 64, 85, 128 },  // 32X32
-      { 32, 36, 43, 51, 64, 85, 128 }   // 64X64
-#endif  // TWO_MODE
-    };
-static const aom_cdf_prob
-    default_ncobmc_mode_cdf[ADAPT_OVERLAP_BLOCKS][CDF_SIZE(MAX_NCOBMC_MODES)] =
-#ifdef TWO_MODE
-        { { AOM_ICDF(16256), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(16256), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(16256), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(16256), AOM_ICDF(32768), 0 } };
-#else
-        { { AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(12288), AOM_ICDF(16384),
-            AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(12288), AOM_ICDF(16384),
-            AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(12288), AOM_ICDF(16384),
-            AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(12288), AOM_ICDF(16384),
-            AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(32768),
-            0 } };
-#endif  // TWO_MODEE
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-// Change this section appropriately once warped motion is supported
-#if CONFIG_MOTION_VAR && !CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
-  -SIMPLE_TRANSLATION, 2, -OBMC_CAUSAL, -NCOBMC_ADAPT_WEIGHT,
-};
-static const aom_prob
-    default_motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES - 1] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { 255, 255 },
-      { 255, 255 },
-      { 255, 255 },
-#endif
-      { 255, 255 },
-      { 255, 255 },
-      { 255, 255 },
-      /** Only these nine block sizes allow ncobmc_adapt_weight **/
-      { 45, 207 },
-      { 42, 211 },
-      { 34, 207 },
-      { 181, 123 },
-      { 129, 141 },
-      { 15, 209 },
-      { 231, 122 },
-      { 195, 190 },
-      { 168, 190 },
-      /** ----------------------------------------------------- **/
-      { 244, 255 },
-#if CONFIG_EXT_PARTITION
-      { 252, 255 },
-      { 252, 255 },
-      { 252, 255 },
-#endif  // CONFIG_EXT_PARTITION
-      { 255, 200 },
-      { 255, 200 },
-      { 255, 200 },
-      { 255, 200 },
-#if CONFIG_EXT_PARTITION
-      { 252, 255 },
-      { 252, 200 },
-      { 252, 200 },
-#endif  // CONFIG_EXT_PARTITION
-    };
 static const aom_cdf_prob
-    default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 },
-#endif
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0, 0 },
-      /** Only these seven block sizes allow ncobmc_adapt_weight **/
-      { AOM_ICDF(5702), AOM_ICDF(27555), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(5408), AOM_ICDF(27964), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4330), AOM_ICDF(27298), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(23107), AOM_ICDF(27760), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16490), AOM_ICDF(25461), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(1959), AOM_ICDF(27153), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(29530), AOM_ICDF(31073), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(25057), AOM_ICDF(30840), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(21588), AOM_ICDF(29940), AOM_ICDF(32768), 0 },
-      /** ----------------------------------------------------- **/
-      { AOM_ICDF(244 * 128), AOM_ICDF(32768), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(32256), AOM_ICDF(32768), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32256), AOM_ICDF(32768), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32256), AOM_ICDF(32768), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 }
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-#endif
-    };
-#else  // CONFIG_NCOBMC_ADAPT_WEIGHT
-const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
-  -SIMPLE_TRANSLATION, -OBMC_CAUSAL
-};
-
-static const aom_prob
-    default_motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES - 1] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { 255 }, { 255 }, { 255 },
-#endif
-      { 255 }, { 255 }, { 255 }, { 151 }, { 153 }, { 144 }, { 178 },
-      { 165 }, { 160 }, { 207 }, { 195 }, { 168 }, { 244 },
-#if CONFIG_EXT_PARTITION
-      { 252 }, { 252 }, { 252 },
-#endif  // CONFIG_EXT_PARTITION
-      { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 },
-#if CONFIG_EXT_PARTITION
-      { 208 }, { 208 }
-#endif  // CONFIG_EXT_PARTITION
-    };
+    default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)] = {
+      { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+      { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) },
+      { AOM_CDF2(9770) },  { AOM_CDF2(9100) },  { AOM_CDF2(8233) },
+      { AOM_CDF2(6172) },  { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+      { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+      { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+      { AOM_CDF2(11820) }, { AOM_CDF2(7701) },  { AOM_CDF2(16384) },
+      { AOM_CDF2(16384) }
+    };
+
+static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)] =
+    { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, 20359,
+                  22362, 24127, 25702, 27752, 29450, 31171) },
+      { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, 17367,
+                  18452, 19422, 22839, 26127, 29629) },
+      { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, 21332,
+                  24520, 27470, 29456, 30529, 31656) },
+      { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, 19163,
+                  20961, 22884, 24471, 26719, 28714, 30877) },
+      { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, 16730,
+                  18114, 19313, 22521, 26012, 29550) },
+      { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, 17270,
+                  20533, 23434, 25972, 27944, 29570, 31416) },
+      { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, 20638,
+                  22038, 23963, 25311, 26988, 28766, 31012) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, 24284,
+                  24985, 25684, 27259, 28883, 30911) },
+      { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, 25057,
+                  27251, 29173, 30089, 30960, 31933) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) },
+      { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+                  20480, 22528, 24576, 26624, 28672, 30720) } };
+
+static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
+    MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) },
+                       { AOM_CDF3(10923, 21845) }, { AOM_CDF3(7651, 24760) },
+                       { AOM_CDF3(4738, 24765) },  { AOM_CDF3(5391, 25528) },
+                       { AOM_CDF3(19419, 26810) }, { AOM_CDF3(5123, 23606) },
+                       { AOM_CDF3(11606, 24308) }, { AOM_CDF3(26260, 29116) },
+                       { AOM_CDF3(20360, 28062) }, { AOM_CDF3(21679, 26830) },
+                       { AOM_CDF3(29516, 30701) }, { AOM_CDF3(28898, 30397) },
+                       { AOM_CDF3(30878, 31335) }, { AOM_CDF3(32507, 32558) },
+                       { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) },
+                       { AOM_CDF3(28799, 31390) }, { AOM_CDF3(26431, 30774) },
+                       { AOM_CDF3(28973, 31594) }, { AOM_CDF3(29742, 31203) } };
 
-static const aom_cdf_prob
-    default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(151 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(153 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(144 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(178 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(165 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(160 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(207 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(195 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(168 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(244 * 128), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-    };
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#elif !CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-
-const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
-  -SIMPLE_TRANSLATION, -WARPED_CAUSAL
-};
-
-static const aom_prob
-    default_motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES - 1] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { 255 }, { 255 }, { 255 },
-#endif
-      { 255 }, { 255 }, { 255 }, { 151 }, { 153 }, { 144 }, { 178 },
-      { 165 }, { 160 }, { 207 }, { 195 }, { 168 }, { 244 },
-#if CONFIG_EXT_PARTITION
-      { 252 }, { 252 }, { 252 },
-#endif  // CONFIG_EXT_PARTITION
-      { 208 }, { 208 }, { 208 }, { 208 }, { 208 }, { 208 },
-#if CONFIG_EXT_PARTITION
-      { 252 }, { 252 }
-#endif  // CONFIG_EXT_PARTITION
-    };
-
-static const aom_cdf_prob
-    default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(151 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(153 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(144 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(178 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(165 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(160 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(207 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(195 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(168 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(244 * 128), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(255 * 128), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-    };
-
-#elif CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
-  -SIMPLE_TRANSLATION, 2, -OBMC_CAUSAL, 4, -NCOBMC_ADAPT_WEIGHT, -WARPED_CAUSAL
-};
-
-static const aom_prob default_motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES -
-                                                                1] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  { 128, 128, 255 }, { 128, 128, 128 }, { 128, 128, 128 },
-#endif
-  { 128, 128, 128 }, { 128, 128, 128 }, { 128, 128, 128 }, { 62, 115, 128 },
-  { 39, 131, 128 },  { 39, 132, 128 },  { 118, 94, 128 },  { 77, 125, 128 },
-  { 100, 121, 128 }, { 190, 66, 128 },  { 207, 102, 128 }, { 197, 100, 128 },
-  { 239, 76, 128 },
-#if CONFIG_EXT_PARTITION
-  { 252, 200, 128 }, { 252, 200, 128 }, { 252, 200, 128 },
-#endif  // CONFIG_EXT_PARTITION
-  { 208, 200, 128 }, { 208, 200, 128 }, { 208, 200, 128 }, { 208, 200, 128 }
-};
-static const aom_cdf_prob
-    default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      /** Only these nine block sizes allow ncobmc_adapt_weight **/
-      { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      /***********************************************************/
-      { AOM_ICDF(30592), AOM_ICDF(31238), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32767), AOM_ICDF(32768), 0 }
-    };
-
-const aom_tree_index av1_ncobmc_tree[TREE_SIZE(OBMC_FAMILY_MODES)] = {
-  -SIMPLE_TRANSLATION, 2, -OBMC_CAUSAL, -NCOBMC_ADAPT_WEIGHT
-};
-
-static const aom_prob
-    default_ncobmc_prob[BLOCK_SIZES_ALL][OBMC_FAMILY_MODES - 1] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { 128, 255 }, { 128, 255 }, { 128, 255 },
-#endif
-      { 128, 255 }, { 128, 255 }, { 128, 255 }, { 45, 255 },  { 79, 255 },
-      { 75, 255 },  { 130, 255 }, { 141, 255 }, { 144, 255 }, { 208, 255 },
-      { 201, 255 }, { 186, 255 }, { 231, 255 },
-#if CONFIG_EXT_PARTITION
-      { 252, 255 }, { 252, 255 }, { 252, 255 },
-#endif  // CONFIG_EXT_PARTITION
-      { 208, 255 }, { 208, 255 }, { 208, 255 }, { 208, 255 }
-    };
-
-static const aom_cdf_prob
-    default_ncobmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(OBMC_FAMILY_MODES)] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      /** Only these nine block sizes allow ncobmc_adapt_weight **/
-      { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10922), AOM_ICDF(21845), AOM_ICDF(32768), 0 },
-      /***********************************************************/
-      { AOM_ICDF(231 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(252 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(252 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(252 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-      { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(208 * 128), AOM_ICDF(32767), AOM_ICDF(32768), 0 }
-    };
-#else
-const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
-  -SIMPLE_TRANSLATION, 2, -OBMC_CAUSAL, -WARPED_CAUSAL,
-};
-
-static const aom_prob
-    default_motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES - 1] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { 128, 128 }, { 128, 128 }, { 128, 128 },
-#endif
-      { 128, 128 }, { 128, 128 }, { 128, 128 }, { 62, 115 },  { 39, 131 },
-      { 39, 132 },  { 118, 94 },  { 77, 125 },  { 100, 121 }, { 190, 66 },
-      { 207, 102 }, { 197, 100 }, { 239, 76 },
-#if CONFIG_EXT_PARTITION
-      { 252, 200 }, { 252, 200 }, { 252, 200 },
-#endif  // CONFIG_EXT_PARTITION
-      { 208, 200 }, { 208, 200 }, { 208, 200 }, { 208, 200 }, { 208, 200 },
-      { 208, 200 },
-#if CONFIG_EXT_PARTITION
-      { 252, 200 }, { 252, 200 }
-#endif  // CONFIG_EXT_PARTITION
-    };
-static const aom_cdf_prob
-    default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(7936), AOM_ICDF(19091), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4991), AOM_ICDF(19205), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4992), AOM_ICDF(19314), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(15104), AOM_ICDF(21590), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(9855), AOM_ICDF(21043), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(12800), AOM_ICDF(22238), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(24320), AOM_ICDF(26498), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26496), AOM_ICDF(28995), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(25216), AOM_ICDF(28166), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(30592), AOM_ICDF(31238), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32640), AOM_ICDF(32740), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32256), AOM_ICDF(32656), AOM_ICDF(32768), 0 },
-#endif
-    };
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-// Probability for the case that only 1 additional motion mode is allowed
-static const aom_prob default_obmc_prob[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  128, 128, 128,
-#endif
-  128, 128, 128, 45,  79,  75,  130, 141, 144, 208, 201, 186, 231,
-#if CONFIG_EXT_PARTITION
-  252, 252, 252,
-#endif  // CONFIG_EXT_PARTITION
-  208, 208, 208, 208, 208, 208,
-#if CONFIG_EXT_PARTITION
-  252, 252
-#endif  // CONFIG_EXT_PARTITION
-};
-
-#if CONFIG_NEW_MULTISYMBOL || CONFIG_NCOBMC_ADAPT_WEIGHT
 static const aom_cdf_prob default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-#endif
-  { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(45 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(79 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(75 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(130 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(141 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(144 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(201 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(186 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(231 * 128), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-  { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-  { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-  { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(252 * 128), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_EXT_PARTITION
-};
-#endif  // CONFIG_NEW_MULTISYMBOL
-#endif
-
-static const aom_prob default_delta_q_probs[DELTA_Q_PROBS] = { 220, 220, 220 };
-static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = {
-  AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0
-};
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-static const aom_prob
-    default_delta_lf_multi_probs[FRAME_LF_COUNT][DELTA_LF_PROBS] = {
-      { 220, 220, 220 }, { 220, 220, 220 }, { 220, 220, 220 }, { 220, 220, 220 }
-    };
-static const aom_cdf_prob
-    default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)] = {
-      { AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0 }
-    };
-#endif  // CONFIG_LOOPFILTER_LEVEL
-static const aom_prob default_delta_lf_probs[DELTA_LF_PROBS] = { 220, 220,
-                                                                 220 };
-static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = {
-  AOM_ICDF(28160), AOM_ICDF(32120), AOM_ICDF(32677), AOM_ICDF(32768), 0
-};
-#endif
-
-/* clang-format off */
-#if CONFIG_INTERINTRA
-const aom_tree_index av1_interintra_mode_tree[TREE_SIZE(INTERINTRA_MODES)] = {
-  -II_DC_PRED, 2,        /* 0 = II_DC_NODE     */
-  -II_SMOOTH_PRED, 4,    /* 1 = II_SMOOTH_PRED */
-  -II_V_PRED, -II_H_PRED /* 2 = II_V_NODE      */
-};
-#endif  // CONFIG_INTERINTRA
-
-const aom_tree_index av1_inter_compound_mode_tree
-    [TREE_SIZE(INTER_COMPOUND_MODES)] = {
-  -INTER_COMPOUND_OFFSET(ZERO_ZEROMV), 2,
-  -INTER_COMPOUND_OFFSET(NEAREST_NEARESTMV), 4,
-  6, -INTER_COMPOUND_OFFSET(NEW_NEWMV),
-  -INTER_COMPOUND_OFFSET(NEAR_NEARMV), 8,
-  10, 12,
-  -INTER_COMPOUND_OFFSET(NEAREST_NEWMV), -INTER_COMPOUND_OFFSET(NEW_NEARESTMV),
-  -INTER_COMPOUND_OFFSET(NEAR_NEWMV), -INTER_COMPOUND_OFFSET(NEW_NEARMV)
-};
-
-#if CONFIG_COMPOUND_SINGLEREF
-// TODO(zoeliu): To redesign the tree structure once the number of mode changes.
-/*
-const aom_tree_index av1_inter_singleref_comp_mode_tree
-    [TREE_SIZE(INTER_SINGLEREF_COMP_MODES)] = {
-  -INTER_SINGLEREF_COMP_OFFSET(SR_ZERO_NEWMV), 2,
-  -INTER_SINGLEREF_COMP_OFFSET(SR_NEAREST_NEARMV), 4,
-  6, -INTER_SINGLEREF_COMP_OFFSET(SR_NEW_NEWMV),
-  -INTER_SINGLEREF_COMP_OFFSET(SR_NEAREST_NEWMV),
-  -INTER_SINGLEREF_COMP_OFFSET(SR_NEAR_NEWMV)
-};*/
-
-const aom_tree_index av1_inter_singleref_comp_mode_tree
-    [TREE_SIZE(INTER_SINGLEREF_COMP_MODES)] = {
-  -INTER_SINGLEREF_COMP_OFFSET(SR_ZERO_NEWMV), 2,
-  -INTER_SINGLEREF_COMP_OFFSET(SR_NEAREST_NEARMV), 4,
-  -INTER_SINGLEREF_COMP_OFFSET(SR_NEAR_NEWMV),
-      -INTER_SINGLEREF_COMP_OFFSET(SR_NEW_NEWMV)
-};
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-#if CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
-const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = {
-  -COMPOUND_AVERAGE, 2, -COMPOUND_WEDGE, -COMPOUND_SEG
-};
-#elif !CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
-const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = {
-  -COMPOUND_AVERAGE, -COMPOUND_WEDGE
-};
-#elif CONFIG_COMPOUND_SEGMENT && !CONFIG_WEDGE
-const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = {
-  -COMPOUND_AVERAGE, -COMPOUND_SEG
-};
-#else
-const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)] = {};
-#endif  // CONFIG_COMPOUND_SEGMENT && CONFIG_WEDGE
-/* clang-format on */
-
-const aom_tree_index av1_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
-  -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT
-};
-
-#if CONFIG_EXT_PARTITION_TYPES
-/* clang-format off */
-const aom_tree_index av1_ext_partition_tree[TREE_SIZE(EXT_PARTITION_TYPES)] = {
-  -PARTITION_NONE, 2,
-  6, 4,
-  8, -PARTITION_SPLIT,
-  -PARTITION_HORZ, 10,
-  -PARTITION_VERT, 14,
-
-  -PARTITION_HORZ_A, 12,
-  -PARTITION_HORZ_B, -PARTITION_HORZ_4,
-
-  -PARTITION_VERT_A, 16,
-  -PARTITION_VERT_B, -PARTITION_VERT_4
-};
-/* clang-format on */
-#endif  // CONFIG_EXT_PARTITION_TYPES
-
-static const aom_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
-  6, 97, 151, 205,
+  { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+  { AOM_CDF2(10437) }, { AOM_CDF2(9371) },  { AOM_CDF2(9301) },
+  { AOM_CDF2(17432) }, { AOM_CDF2(14423) }, { AOM_CDF2(15142) },
+  { AOM_CDF2(25817) }, { AOM_CDF2(22823) }, { AOM_CDF2(22083) },
+  { AOM_CDF2(30128) }, { AOM_CDF2(31014) }, { AOM_CDF2(31560) },
+  { AOM_CDF2(32638) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+  { AOM_CDF2(23664) }, { AOM_CDF2(20901) }, { AOM_CDF2(24008) },
+  { AOM_CDF2(26879) }
 };
 
-#if CONFIG_NEW_MULTISYMBOL
-static const aom_cdf_prob
-    default_intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)] = {
-      { AOM_ICDF(768), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(12416), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(19328), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(26240), AOM_ICDF(32768), 0 }
-    };
-#endif
+static const aom_cdf_prob default_intra_inter_cdf[INTRA_INTER_CONTEXTS]
+                                                 [CDF_SIZE(2)] = {
+                                                   { AOM_CDF2(806) },
+                                                   { AOM_CDF2(16662) },
+                                                   { AOM_CDF2(20186) },
+                                                   { AOM_CDF2(26538) }
+                                                 };
 
-static const aom_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
-  190, 156, 91, 77, 22
-};
-
-#if CONFIG_NEW_MULTISYMBOL
 static const aom_cdf_prob default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(
-    2)] = { { AOM_ICDF(24290), AOM_ICDF(32768), 0 },
-            { AOM_ICDF(19956), AOM_ICDF(32768), 0 },
-            { AOM_ICDF(11641), AOM_ICDF(32768), 0 },
-            { AOM_ICDF(9804), AOM_ICDF(32768), 0 },
-            { AOM_ICDF(2842), AOM_ICDF(32768), 0 } };
-#endif  // CONFIG_NEW_MULTISYMBOL
-
-#if CONFIG_EXT_COMP_REFS
-static const aom_prob default_comp_ref_type_p[COMP_REF_TYPE_CONTEXTS] = {
-  8, 20, 78, 91, 194
-};
-static const aom_prob
-    default_uni_comp_ref_p[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] = {
-      { 88, 30, 28 }, { 218, 97, 105 }, { 254, 180, 196 }
-    };
+    2)] = { { AOM_CDF2(26828) },
+            { AOM_CDF2(24035) },
+            { AOM_CDF2(12031) },
+            { AOM_CDF2(10640) },
+            { AOM_CDF2(2901) } };
+
+static const aom_cdf_prob default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS]
+                                                   [CDF_SIZE(2)] = {
+                                                     { AOM_CDF2(1198) },
+                                                     { AOM_CDF2(2070) },
+                                                     { AOM_CDF2(9166) },
+                                                     { AOM_CDF2(7499) },
+                                                     { AOM_CDF2(22475) }
+                                                   };
 
-#if CONFIG_NEW_MULTISYMBOL
 static const aom_cdf_prob
-    default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)] = {
-      { AOM_ICDF(8 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(20 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(78 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(91 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(194 * 128), AOM_ICDF(32768), 0 }
-    };
-static const aom_cdf_prob
-    default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
-                            [CDF_SIZE(2)] = {
-                              { { AOM_ICDF(88 * 128), AOM_ICDF(32768), 0 },
-                                { AOM_ICDF(30 * 128), AOM_ICDF(32768), 0 },
-                                { AOM_ICDF(28 * 128), AOM_ICDF(32768), 0 } },
-                              { { AOM_ICDF(218 * 128), AOM_ICDF(32768), 0 },
-                                { AOM_ICDF(97 * 128), AOM_ICDF(32768), 0 },
-                                { AOM_ICDF(105 * 128), AOM_ICDF(32768), 0 } },
-                              { { AOM_ICDF(254 * 128), AOM_ICDF(32768), 0 },
-                                { AOM_ICDF(180 * 128), AOM_ICDF(32768), 0 },
-                                { AOM_ICDF(196 * 128), AOM_ICDF(32768), 0 } }
-                            };
-#endif  // CONFIG_NEW_MULTISYMBOL
-#endif  // CONFIG_EXT_COMP_REFS
-
-#if CONFIG_EXT_REFS
-static const aom_prob default_comp_ref_p[REF_CONTEXTS][FWD_REFS - 1] = {
-  { 28, 10, 8 },
-  { 77, 27, 26 },
-  { 127, 62, 56 },
-  { 186, 126, 160 },
-  { 236, 143, 172 }
-};
-
-static const aom_prob default_comp_bwdref_p[REF_CONTEXTS][BWD_REFS - 1] = {
-  { 22, 13 }, { 140, 124 }, { 241, 239 }, { 128, 128 }, { 128, 128 }
-};
+    default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS -
+                                                    1][CDF_SIZE(2)] = {
+      { { AOM_CDF2(5284) }, { AOM_CDF2(3865) }, { AOM_CDF2(3128) } },
+      { { AOM_CDF2(23152) }, { AOM_CDF2(14173) }, { AOM_CDF2(15270) } },
+      { { AOM_CDF2(31774) }, { AOM_CDF2(25120) }, { AOM_CDF2(26710) } }
+    };
+
+static const aom_cdf_prob default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1]
+                                                [CDF_SIZE(2)] = {
+                                                  { { AOM_CDF2(4897) },
+                                                    { AOM_CDF2(1555) },
+                                                    { AOM_CDF2(4236) },
+                                                    { AOM_CDF2(8650) },
+                                                    { AOM_CDF2(904) },
+                                                    { AOM_CDF2(1444) } },
+                                                  { { AOM_CDF2(16973) },
+                                                    { AOM_CDF2(16751) },
+                                                    { AOM_CDF2(19647) },
+                                                    { AOM_CDF2(24773) },
+                                                    { AOM_CDF2(11014) },
+                                                    { AOM_CDF2(15087) } },
+                                                  { { AOM_CDF2(29744) },
+                                                    { AOM_CDF2(30279) },
+                                                    { AOM_CDF2(31194) },
+                                                    { AOM_CDF2(31895) },
+                                                    { AOM_CDF2(26875) },
+                                                    { AOM_CDF2(30304) } }
+                                                };
 
-#if CONFIG_NEW_MULTISYMBOL
 static const aom_cdf_prob
     default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)] = {
-      { { AOM_ICDF(3556), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(1217), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(988), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(9857), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(3394), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(3303), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(16237), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(7946), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(7195), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(23826), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(16124), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(20536), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(30195), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(18344), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(21980), AOM_ICDF(32768), 0 } }
+      { { AOM_CDF2(4946) }, { AOM_CDF2(9468) }, { AOM_CDF2(1503) } },
+      { { AOM_CDF2(19891) }, { AOM_CDF2(22441) }, { AOM_CDF2(15160) } },
+      { { AOM_CDF2(30731) }, { AOM_CDF2(31059) }, { AOM_CDF2(27544) } }
     };
 
 static const aom_cdf_prob
     default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)] = {
-      { { AOM_ICDF(2762), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(1614), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(17976), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(15912), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(30894), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(30639), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(32768), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(32768), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(32768), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(32768), AOM_ICDF(32768), 0 } }
+      { { AOM_CDF2(2235) }, { AOM_CDF2(1423) } },
+      { { AOM_CDF2(17182) }, { AOM_CDF2(15175) } },
+      { { AOM_CDF2(30606) }, { AOM_CDF2(30489) } }
     };
-#endif  // CONFIG_NEW_MULTISYMBOL
 
-#else  // !CONFIG_EXT_REFS
-
-static const aom_prob default_comp_ref_p[REF_CONTEXTS][COMP_REFS - 1] = {
-  { 43 }, { 100 }, { 137 }, { 212 }, { 229 },
-};
-#if CONFIG_NEW_MULTISYMBOL
 static const aom_cdf_prob
-    default_comp_ref_cdf[REF_CONTEXTS][COMP_REFS - 1][CDF_SIZE(2)] = {
-      { { AOM_ICDF(43 * 128), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(100 * 128), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(137 * 128), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(212 * 128), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(229 * 128), AOM_ICDF(32768), 0 } }
+    default_palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = {
+      { AOM_CDF7(7952, 13000, 18149, 21478, 25527, 29241) },
+      { AOM_CDF7(7139, 11421, 16195, 19544, 23666, 28073) },
+      { AOM_CDF7(7788, 12741, 17325, 20500, 24315, 28530) },
+      { AOM_CDF7(8271, 14064, 18246, 21564, 25071, 28533) },
+      { AOM_CDF7(12725, 19180, 21863, 24839, 27535, 30120) },
+      { AOM_CDF7(9711, 14888, 16923, 21052, 25661, 27875) },
+      { AOM_CDF7(14940, 20797, 21678, 24186, 27033, 28999) }
     };
-#endif  // CONFIG_NEW_MULTISYMBOL
-#endif  // CONFIG_EXT_REFS
 
-static const aom_prob default_single_ref_p[REF_CONTEXTS][SINGLE_REFS - 1] = {
-#if CONFIG_EXT_REFS
-  { 36, 16, 32, 57, 11, 14 },
-  { 68, 128, 73, 128, 49, 124 },
-  { 136, 236, 127, 170, 81, 238 },
-  { 128, 128, 191, 211, 115, 128 },
-  { 224, 128, 230, 242, 208, 128 }
-#else   // !CONFIG_EXT_REFS
-  { 31, 25 }, { 72, 80 }, { 147, 148 }, { 197, 191 }, { 235, 247 },
-#endif  // CONFIG_EXT_REFS
-};
-
-#if CONFIG_NEW_MULTISYMBOL
 static const aom_cdf_prob
-    default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)] = {
-#if CONFIG_EXT_REFS
-      { { AOM_ICDF(4623), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(2110), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(4132), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(7309), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(1392), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(1781), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(8659), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(16372), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(9371), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(16322), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(6216), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(15834), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(17353), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(30182), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(16300), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(21702), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(10365), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(30486), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(32768), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(32768), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(24426), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(26972), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(14760), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(32768), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(28634), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(32768), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(29425), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(30969), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(26676), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(32768), AOM_ICDF(32768), 0 } }
-#else   // !CONFIG_EXT_REFS
-      { { AOM_ICDF(31 * 128), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(25 * 128), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(72 * 128), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(80 * 128), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(147 * 128), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(148 * 128), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(197 * 128), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(191 * 128), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(235 * 128), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(247 * 128), AOM_ICDF(32768), 0 } }
-#endif  // CONFIG_EXT_REFS
+    default_palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = {
+      { AOM_CDF7(8713, 19979, 27128, 29609, 31331, 32272) },
+      { AOM_CDF7(5839, 15573, 23581, 26947, 29848, 31700) },
+      { AOM_CDF7(4426, 11260, 17999, 21483, 25863, 29430) },
+      { AOM_CDF7(3228, 9464, 14993, 18089, 22523, 27420) },
+      { AOM_CDF7(3768, 8886, 13091, 17852, 22495, 27207) },
+      { AOM_CDF7(2464, 8451, 12861, 21632, 25525, 28555) },
+      { AOM_CDF7(1269, 5435, 10433, 18963, 21700, 25865) }
+    };
+
+static const aom_cdf_prob default_palette_y_mode_cdf
+    [PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][CDF_SIZE(2)] = {
+      { { AOM_CDF2(31676) }, { AOM_CDF2(3419) }, { AOM_CDF2(1261) } },
+      { { AOM_CDF2(31912) }, { AOM_CDF2(2859) }, { AOM_CDF2(980) } },
+      { { AOM_CDF2(31823) }, { AOM_CDF2(3400) }, { AOM_CDF2(781) } },
+      { { AOM_CDF2(32030) }, { AOM_CDF2(3561) }, { AOM_CDF2(904) } },
+      { { AOM_CDF2(32309) }, { AOM_CDF2(7337) }, { AOM_CDF2(1462) } },
+      { { AOM_CDF2(32265) }, { AOM_CDF2(4015) }, { AOM_CDF2(1521) } },
+      { { AOM_CDF2(32450) }, { AOM_CDF2(7946) }, { AOM_CDF2(129) } }
     };
-#endif  // CONFIG_NEW_MULTISYMBOL
-
-#if CONFIG_COMPOUND_SINGLEREF
-// TODO(zoeliu): Default values to be further adjusted based on the collected
-//               stats.
-static const aom_prob default_comp_inter_mode_p[COMP_INTER_MODE_CONTEXTS] = {
-  40, 110, 160, 220
-};
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-// TODO(huisu): tune these cdfs
-const aom_cdf_prob
-    default_palette_y_size_cdf[PALETTE_BLOCK_SIZES][CDF_SIZE(PALETTE_SIZES)] = {
-      { AOM_ICDF(12288), AOM_ICDF(19408), AOM_ICDF(24627), AOM_ICDF(26662),
-        AOM_ICDF(28499), AOM_ICDF(30667), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(2815), AOM_ICDF(4570), AOM_ICDF(9416), AOM_ICDF(10875),
-        AOM_ICDF(13782), AOM_ICDF(19863), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(3839), AOM_ICDF(5986), AOM_ICDF(11949), AOM_ICDF(13413),
-        AOM_ICDF(16286), AOM_ICDF(21823), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(12032), AOM_ICDF(14948), AOM_ICDF(22187), AOM_ICDF(23138),
-        AOM_ICDF(24756), AOM_ICDF(27635), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(14847), AOM_ICDF(20167), AOM_ICDF(25433), AOM_ICDF(26751),
-        AOM_ICDF(28278), AOM_ICDF(30119), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(14336), AOM_ICDF(20240), AOM_ICDF(24840), AOM_ICDF(26079),
-        AOM_ICDF(27908), AOM_ICDF(30034), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(18816), AOM_ICDF(25574), AOM_ICDF(29030), AOM_ICDF(29877),
-        AOM_ICDF(30656), AOM_ICDF(31506), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(23039), AOM_ICDF(27333), AOM_ICDF(30220), AOM_ICDF(30708),
-        AOM_ICDF(31070), AOM_ICDF(31826), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(13696), AOM_ICDF(18911), AOM_ICDF(23620), AOM_ICDF(25371),
-        AOM_ICDF(29821), AOM_ICDF(31617), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(12543), AOM_ICDF(20838), AOM_ICDF(27455), AOM_ICDF(28762),
-        AOM_ICDF(29763), AOM_ICDF(31546), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(12543), AOM_ICDF(20838), AOM_ICDF(27455), AOM_ICDF(28762),
-        AOM_ICDF(29763), AOM_ICDF(31546), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(12543), AOM_ICDF(20838), AOM_ICDF(27455), AOM_ICDF(28762),
-        AOM_ICDF(29763), AOM_ICDF(31546), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(12543), AOM_ICDF(20838), AOM_ICDF(27455), AOM_ICDF(28762),
-        AOM_ICDF(29763), AOM_ICDF(31546), AOM_ICDF(32768), 0 },
-#endif
-    };
-
-const aom_cdf_prob default_palette_uv_size_cdf[PALETTE_BLOCK_SIZES][CDF_SIZE(
-    PALETTE_SIZES)] = {
-  { AOM_ICDF(20480), AOM_ICDF(29888), AOM_ICDF(32453), AOM_ICDF(32715),
-    AOM_ICDF(32751), AOM_ICDF(32766), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(11135), AOM_ICDF(23641), AOM_ICDF(31056), AOM_ICDF(31998),
-    AOM_ICDF(32496), AOM_ICDF(32668), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(9216), AOM_ICDF(23108), AOM_ICDF(30806), AOM_ICDF(31871),
-    AOM_ICDF(32414), AOM_ICDF(32637), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(9984), AOM_ICDF(21999), AOM_ICDF(29192), AOM_ICDF(30645),
-    AOM_ICDF(31640), AOM_ICDF(32402), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(7552), AOM_ICDF(16614), AOM_ICDF(24880), AOM_ICDF(27283),
-    AOM_ICDF(29254), AOM_ICDF(31203), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(9600), AOM_ICDF(20279), AOM_ICDF(27548), AOM_ICDF(29261),
-    AOM_ICDF(30494), AOM_ICDF(31631), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(11391), AOM_ICDF(18656), AOM_ICDF(23727), AOM_ICDF(26058),
-    AOM_ICDF(27788), AOM_ICDF(30278), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(8576), AOM_ICDF(13585), AOM_ICDF(17632), AOM_ICDF(20884),
-    AOM_ICDF(23948), AOM_ICDF(27152), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(15360), AOM_ICDF(24200), AOM_ICDF(26978), AOM_ICDF(30846),
-    AOM_ICDF(31409), AOM_ICDF(32545), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(9216), AOM_ICDF(14276), AOM_ICDF(19043), AOM_ICDF(22689),
-    AOM_ICDF(25799), AOM_ICDF(28712), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-  { AOM_ICDF(9216), AOM_ICDF(14276), AOM_ICDF(19043), AOM_ICDF(22689),
-    AOM_ICDF(25799), AOM_ICDF(28712), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(9216), AOM_ICDF(14276), AOM_ICDF(19043), AOM_ICDF(22689),
-    AOM_ICDF(25799), AOM_ICDF(28712), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(9216), AOM_ICDF(14276), AOM_ICDF(19043), AOM_ICDF(22689),
-    AOM_ICDF(25799), AOM_ICDF(28712), AOM_ICDF(32768), 0 },
-#endif
-};
-
-// When palette mode is enabled, following probability tables indicate the
-// probabilities to code the "is_palette" bit (i.e. the bit that indicates
-// if this block uses palette mode or DC_PRED mode).
-const aom_prob av1_default_palette_y_mode_prob
-    [PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS] = {
-      { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 },
-      { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 },
-      { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 },
-      { 240, 180, 100 },
-#if CONFIG_EXT_PARTITION
-      { 240, 180, 100 }, { 240, 180, 100 }, { 240, 180, 100 },
-#endif  // CONFIG_EXT_PARTITION
-    };
-
-const aom_prob av1_default_palette_uv_mode_prob[PALETTE_UV_MODE_CONTEXTS] = {
-  253, 229
-};
-
-#if CONFIG_NEW_MULTISYMBOL
-const aom_cdf_prob
-    default_palette_y_mode_cdf[PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS]
-                              [CDF_SIZE(2)] = {
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-#if CONFIG_EXT_PARTITION
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-                                { { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 180), AOM_ICDF(32768), 0 },
-                                  { AOM_ICDF(128 * 100), AOM_ICDF(32768), 0 } },
-#endif  // CONFIG_EXT_PARTITION
-                              };
 
-const aom_cdf_prob
+static const aom_cdf_prob
     default_palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)] = {
-      { AOM_ICDF(128 * 253), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 229), AOM_ICDF(32768), 0 }
+      { AOM_CDF2(32461) }, { AOM_CDF2(21488) }
     };
 
-#endif
-
-const aom_cdf_prob default_palette_y_color_index_cdf
+static const aom_cdf_prob default_palette_y_color_index_cdf
     [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = {
       {
-          { AOM_ICDF(29568), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(16384), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(8832), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(28672), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(31872), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
+          { AOM_CDF2(28710) },
+          { AOM_CDF2(16384) },
+          { AOM_CDF2(10553) },
+          { AOM_CDF2(27036) },
+          { AOM_CDF2(31603) },
       },
       {
-          { AOM_ICDF(28032), AOM_ICDF(30326), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(11647), AOM_ICDF(27405), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(4352), AOM_ICDF(30659), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(23552), AOM_ICDF(27800), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(32256), AOM_ICDF(32504), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
+          { AOM_CDF3(27877, 30490) },
+          { AOM_CDF3(11532, 25697) },
+          { AOM_CDF3(6544, 30234) },
+          { AOM_CDF3(23018, 28072) },
+          { AOM_CDF3(31915, 32385) },
       },
       {
-          { AOM_ICDF(26112), AOM_ICDF(28374), AOM_ICDF(30039), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(9472), AOM_ICDF(22576), AOM_ICDF(27712), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(6656), AOM_ICDF(26138), AOM_ICDF(29608), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(19328), AOM_ICDF(23791), AOM_ICDF(28946), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(31744), AOM_ICDF(31984), AOM_ICDF(32336), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
+          { AOM_CDF4(25572, 28046, 30045) },
+          { AOM_CDF4(9478, 21590, 27256) },
+          { AOM_CDF4(7248, 26837, 29824) },
+          { AOM_CDF4(19167, 24486, 28349) },
+          { AOM_CDF4(31400, 31825, 32250) },
       },
       {
-          { AOM_ICDF(27904), AOM_ICDF(29215), AOM_ICDF(30075), AOM_ICDF(31190),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(9728), AOM_ICDF(22598), AOM_ICDF(26134), AOM_ICDF(29425),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(2688), AOM_ICDF(30066), AOM_ICDF(31058), AOM_ICDF(31933),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(22015), AOM_ICDF(25039), AOM_ICDF(27726), AOM_ICDF(29932),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(32383), AOM_ICDF(32482), AOM_ICDF(32554), AOM_ICDF(32660),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
+          { AOM_CDF5(24779, 26955, 28576, 30282) },
+          { AOM_CDF5(8669, 20364, 24073, 28093) },
+          { AOM_CDF5(4255, 27565, 29377, 31067) },
+          { AOM_CDF5(19864, 23674, 26716, 29530) },
+          { AOM_CDF5(31646, 31893, 32147, 32426) },
       },
       {
-          { AOM_ICDF(24319), AOM_ICDF(26299), AOM_ICDF(27486), AOM_ICDF(28600),
-            AOM_ICDF(29804), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(7935), AOM_ICDF(18217), AOM_ICDF(21116), AOM_ICDF(25440),
-            AOM_ICDF(28589), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(6656), AOM_ICDF(25016), AOM_ICDF(27105), AOM_ICDF(28698),
-            AOM_ICDF(30399), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(19967), AOM_ICDF(24117), AOM_ICDF(26550), AOM_ICDF(28566),
-            AOM_ICDF(30224), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(31359), AOM_ICDF(31607), AOM_ICDF(31775), AOM_ICDF(31977),
-            AOM_ICDF(32258), AOM_ICDF(32768), 0, 0, 0 },
+          { AOM_CDF6(23132, 25407, 26970, 28435, 30073) },
+          { AOM_CDF6(7443, 17242, 20717, 24762, 27982) },
+          { AOM_CDF6(6300, 24862, 26944, 28784, 30671) },
+          { AOM_CDF6(18916, 22895, 25267, 27435, 29652) },
+          { AOM_CDF6(31270, 31550, 31808, 32059, 32353) },
       },
       {
-          { AOM_ICDF(26368), AOM_ICDF(27768), AOM_ICDF(28588), AOM_ICDF(29274),
-            AOM_ICDF(29997), AOM_ICDF(30917), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(8960), AOM_ICDF(18260), AOM_ICDF(20810), AOM_ICDF(23986),
-            AOM_ICDF(26627), AOM_ICDF(28882), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(7295), AOM_ICDF(24111), AOM_ICDF(25836), AOM_ICDF(27515),
-            AOM_ICDF(29033), AOM_ICDF(30769), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(22016), AOM_ICDF(25208), AOM_ICDF(27305), AOM_ICDF(28159),
-            AOM_ICDF(29221), AOM_ICDF(30274), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(31744), AOM_ICDF(31932), AOM_ICDF(32050), AOM_ICDF(32199),
-            AOM_ICDF(32335), AOM_ICDF(32521), AOM_ICDF(32768), 0, 0 },
+          { AOM_CDF7(23105, 25199, 26464, 27684, 28931, 30318) },
+          { AOM_CDF7(6950, 15447, 18952, 22681, 25567, 28563) },
+          { AOM_CDF7(7560, 23474, 25490, 27203, 28921, 30708) },
+          { AOM_CDF7(18544, 22373, 24457, 26195, 28119, 30045) },
+          { AOM_CDF7(31198, 31451, 31670, 31882, 32123, 32391) },
       },
       {
-          { AOM_ICDF(26624), AOM_ICDF(27872), AOM_ICDF(28599), AOM_ICDF(29153),
-            AOM_ICDF(29633), AOM_ICDF(30172), AOM_ICDF(30841), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(6655), AOM_ICDF(17569), AOM_ICDF(19587), AOM_ICDF(23345),
-            AOM_ICDF(25884), AOM_ICDF(28088), AOM_ICDF(29678), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(3584), AOM_ICDF(27296), AOM_ICDF(28429), AOM_ICDF(29158),
-            AOM_ICDF(30032), AOM_ICDF(30780), AOM_ICDF(31572), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(23551), AOM_ICDF(25855), AOM_ICDF(27070), AOM_ICDF(27893),
-            AOM_ICDF(28597), AOM_ICDF(29721), AOM_ICDF(30970), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(32128), AOM_ICDF(32173), AOM_ICDF(32245), AOM_ICDF(32337),
-            AOM_ICDF(32416), AOM_ICDF(32500), AOM_ICDF(32609), AOM_ICDF(32768),
-            0 },
+          { AOM_CDF8(21689, 23883, 25163, 26352, 27506, 28827, 30195) },
+          { AOM_CDF8(6892, 15385, 17840, 21606, 24287, 26753, 29204) },
+          { AOM_CDF8(5651, 23182, 25042, 26518, 27982, 29392, 30900) },
+          { AOM_CDF8(19349, 22578, 24418, 25994, 27524, 29031, 30448) },
+          { AOM_CDF8(31028, 31270, 31504, 31705, 31927, 32153, 32392) },
       },
     };
 
-const aom_cdf_prob default_palette_uv_color_index_cdf
+static const aom_cdf_prob default_palette_uv_color_index_cdf
     [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = {
       {
-          { AOM_ICDF(29824), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(16384), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(8832), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(30720), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(31744), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
+          { AOM_CDF2(29089) },
+          { AOM_CDF2(16384) },
+          { AOM_CDF2(8713) },
+          { AOM_CDF2(29257) },
+          { AOM_CDF2(31610) },
       },
       {
-          { AOM_ICDF(27648), AOM_ICDF(30208), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(14080), AOM_ICDF(26563), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(5120), AOM_ICDF(30932), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(24448), AOM_ICDF(27828), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(31616), AOM_ICDF(32219), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
+          { AOM_CDF3(25257, 29145) },
+          { AOM_CDF3(12287, 27293) },
+          { AOM_CDF3(7033, 27960) },
+          { AOM_CDF3(20145, 25405) },
+          { AOM_CDF3(30608, 31639) },
       },
       {
-          { AOM_ICDF(25856), AOM_ICDF(28259), AOM_ICDF(30584), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(11520), AOM_ICDF(22476), AOM_ICDF(27944), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(8064), AOM_ICDF(26882), AOM_ICDF(30308), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(19455), AOM_ICDF(23823), AOM_ICDF(29134), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(30848), AOM_ICDF(31501), AOM_ICDF(32174), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
+          { AOM_CDF4(24210, 27175, 29903) },
+          { AOM_CDF4(9888, 22386, 27214) },
+          { AOM_CDF4(5901, 26053, 29293) },
+          { AOM_CDF4(18318, 22152, 28333) },
+          { AOM_CDF4(30459, 31136, 31926) },
       },
       {
-          { AOM_ICDF(26751), AOM_ICDF(28020), AOM_ICDF(29541), AOM_ICDF(31230),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(12032), AOM_ICDF(26045), AOM_ICDF(30772), AOM_ICDF(31497),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(1280), AOM_ICDF(32153), AOM_ICDF(32458), AOM_ICDF(32560),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(23424), AOM_ICDF(24154), AOM_ICDF(29201), AOM_ICDF(29856),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(32256), AOM_ICDF(32402), AOM_ICDF(32561), AOM_ICDF(32682),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
+          { AOM_CDF5(22980, 25479, 27781, 29986) },
+          { AOM_CDF5(8413, 21408, 24859, 28874) },
+          { AOM_CDF5(2257, 29449, 30594, 31598) },
+          { AOM_CDF5(19189, 21202, 25915, 28620) },
+          { AOM_CDF5(31844, 32044, 32281, 32518) },
       },
       {
-          { AOM_ICDF(24576), AOM_ICDF(26720), AOM_ICDF(28114), AOM_ICDF(28950),
-            AOM_ICDF(31694), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(7551), AOM_ICDF(16613), AOM_ICDF(20462), AOM_ICDF(25269),
-            AOM_ICDF(29077), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(6272), AOM_ICDF(23039), AOM_ICDF(25623), AOM_ICDF(28163),
-            AOM_ICDF(30861), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(17024), AOM_ICDF(18808), AOM_ICDF(20771), AOM_ICDF(27941),
-            AOM_ICDF(29845), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(31616), AOM_ICDF(31936), AOM_ICDF(32079), AOM_ICDF(32321),
-            AOM_ICDF(32546), AOM_ICDF(32768), 0, 0, 0 },
+          { AOM_CDF6(22217, 24567, 26637, 28683, 30548) },
+          { AOM_CDF6(7307, 16406, 19636, 24632, 28424) },
+          { AOM_CDF6(4441, 25064, 26879, 28942, 30919) },
+          { AOM_CDF6(17210, 20528, 23319, 26750, 29582) },
+          { AOM_CDF6(30674, 30953, 31396, 31735, 32207) },
       },
       {
-          { AOM_ICDF(23296), AOM_ICDF(25590), AOM_ICDF(27833), AOM_ICDF(29337),
-            AOM_ICDF(29954), AOM_ICDF(31229), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(7552), AOM_ICDF(13659), AOM_ICDF(16570), AOM_ICDF(21695),
-            AOM_ICDF(24506), AOM_ICDF(27701), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(6911), AOM_ICDF(24788), AOM_ICDF(26284), AOM_ICDF(27753),
-            AOM_ICDF(29575), AOM_ICDF(30872), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(17535), AOM_ICDF(22236), AOM_ICDF(24457), AOM_ICDF(26242),
-            AOM_ICDF(27363), AOM_ICDF(30191), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(30592), AOM_ICDF(31289), AOM_ICDF(31745), AOM_ICDF(31921),
-            AOM_ICDF(32149), AOM_ICDF(32321), AOM_ICDF(32768), 0, 0 },
+          { AOM_CDF7(21239, 23168, 25044, 26962, 28705, 30506) },
+          { AOM_CDF7(6545, 15012, 18004, 21817, 25503, 28701) },
+          { AOM_CDF7(3448, 26295, 27437, 28704, 30126, 31442) },
+          { AOM_CDF7(15889, 18323, 21704, 24698, 26976, 29690) },
+          { AOM_CDF7(30988, 31204, 31479, 31734, 31983, 32325) },
       },
       {
-          { AOM_ICDF(22016), AOM_ICDF(24242), AOM_ICDF(25141), AOM_ICDF(27137),
-            AOM_ICDF(27797), AOM_ICDF(29331), AOM_ICDF(30848), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(8063), AOM_ICDF(13564), AOM_ICDF(16940), AOM_ICDF(21948),
-            AOM_ICDF(24568), AOM_ICDF(25689), AOM_ICDF(26989), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(6528), AOM_ICDF(27028), AOM_ICDF(27835), AOM_ICDF(28741),
-            AOM_ICDF(30031), AOM_ICDF(31795), AOM_ICDF(32285), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(18047), AOM_ICDF(23797), AOM_ICDF(25444), AOM_ICDF(26274),
-            AOM_ICDF(27111), AOM_ICDF(27929), AOM_ICDF(30367), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(30208), AOM_ICDF(30628), AOM_ICDF(31046), AOM_ICDF(31658),
-            AOM_ICDF(31762), AOM_ICDF(32367), AOM_ICDF(32469), AOM_ICDF(32768),
-            0 },
-      }
-    };
-#if CONFIG_MRC_TX
-// TODO(sarahparker) Tune these cdfs
-const aom_cdf_prob default_mrc_mask_intra_cdf
-    [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = {
-      {
-          { AOM_ICDF(29568), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(16384), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(8832), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(28672), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(31872), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-      },
-      {
-          { AOM_ICDF(28032), AOM_ICDF(30326), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(11647), AOM_ICDF(27405), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(4352), AOM_ICDF(30659), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(23552), AOM_ICDF(27800), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(32256), AOM_ICDF(32504), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-      },
-      {
-          { AOM_ICDF(26112), AOM_ICDF(28374), AOM_ICDF(30039), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(9472), AOM_ICDF(22576), AOM_ICDF(27712), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(6656), AOM_ICDF(26138), AOM_ICDF(29608), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(19328), AOM_ICDF(23791), AOM_ICDF(28946), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(31744), AOM_ICDF(31984), AOM_ICDF(32336), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-      },
-      {
-          { AOM_ICDF(27904), AOM_ICDF(29215), AOM_ICDF(30075), AOM_ICDF(31190),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(9728), AOM_ICDF(22598), AOM_ICDF(26134), AOM_ICDF(29425),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(2688), AOM_ICDF(30066), AOM_ICDF(31058), AOM_ICDF(31933),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(22015), AOM_ICDF(25039), AOM_ICDF(27726), AOM_ICDF(29932),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(32383), AOM_ICDF(32482), AOM_ICDF(32554), AOM_ICDF(32660),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-      },
-      {
-          { AOM_ICDF(24319), AOM_ICDF(26299), AOM_ICDF(27486), AOM_ICDF(28600),
-            AOM_ICDF(29804), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(7935), AOM_ICDF(18217), AOM_ICDF(21116), AOM_ICDF(25440),
-            AOM_ICDF(28589), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(6656), AOM_ICDF(25016), AOM_ICDF(27105), AOM_ICDF(28698),
-            AOM_ICDF(30399), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(19967), AOM_ICDF(24117), AOM_ICDF(26550), AOM_ICDF(28566),
-            AOM_ICDF(30224), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(31359), AOM_ICDF(31607), AOM_ICDF(31775), AOM_ICDF(31977),
-            AOM_ICDF(32258), AOM_ICDF(32768), 0, 0, 0 },
-      },
-      {
-          { AOM_ICDF(26368), AOM_ICDF(27768), AOM_ICDF(28588), AOM_ICDF(29274),
-            AOM_ICDF(29997), AOM_ICDF(30917), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(8960), AOM_ICDF(18260), AOM_ICDF(20810), AOM_ICDF(23986),
-            AOM_ICDF(26627), AOM_ICDF(28882), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(7295), AOM_ICDF(24111), AOM_ICDF(25836), AOM_ICDF(27515),
-            AOM_ICDF(29033), AOM_ICDF(30769), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(22016), AOM_ICDF(25208), AOM_ICDF(27305), AOM_ICDF(28159),
-            AOM_ICDF(29221), AOM_ICDF(30274), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(31744), AOM_ICDF(31932), AOM_ICDF(32050), AOM_ICDF(32199),
-            AOM_ICDF(32335), AOM_ICDF(32521), AOM_ICDF(32768), 0, 0 },
-      },
-      {
-          { AOM_ICDF(26624), AOM_ICDF(27872), AOM_ICDF(28599), AOM_ICDF(29153),
-            AOM_ICDF(29633), AOM_ICDF(30172), AOM_ICDF(30841), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(6655), AOM_ICDF(17569), AOM_ICDF(19587), AOM_ICDF(23345),
-            AOM_ICDF(25884), AOM_ICDF(28088), AOM_ICDF(29678), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(3584), AOM_ICDF(27296), AOM_ICDF(28429), AOM_ICDF(29158),
-            AOM_ICDF(30032), AOM_ICDF(30780), AOM_ICDF(31572), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(23551), AOM_ICDF(25855), AOM_ICDF(27070), AOM_ICDF(27893),
-            AOM_ICDF(28597), AOM_ICDF(29721), AOM_ICDF(30970), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(32128), AOM_ICDF(32173), AOM_ICDF(32245), AOM_ICDF(32337),
-            AOM_ICDF(32416), AOM_ICDF(32500), AOM_ICDF(32609), AOM_ICDF(32768),
-            0 },
+          { AOM_CDF8(21442, 23288, 24758, 26246, 27649, 28980, 30563) },
+          { AOM_CDF8(5863, 14933, 17552, 20668, 23683, 26411, 29273) },
+          { AOM_CDF8(3415, 25810, 26877, 27990, 29223, 30394, 31618) },
+          { AOM_CDF8(17965, 20084, 22232, 23974, 26274, 28402, 30390) },
+          { AOM_CDF8(31190, 31329, 31516, 31679, 31825, 32026, 32322) },
       },
     };
 
-const aom_cdf_prob default_mrc_mask_inter_cdf
-    [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = {
-      {
-          { AOM_ICDF(29568), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(16384), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(8832), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(28672), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-          { AOM_ICDF(31872), AOM_ICDF(32768), 0, 0, 0, 0, 0, 0, 0 },
-      },
-      {
-          { AOM_ICDF(28032), AOM_ICDF(30326), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(11647), AOM_ICDF(27405), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(4352), AOM_ICDF(30659), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(23552), AOM_ICDF(27800), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-          { AOM_ICDF(32256), AOM_ICDF(32504), AOM_ICDF(32768), 0, 0, 0, 0, 0,
-            0 },
-      },
-      {
-          { AOM_ICDF(26112), AOM_ICDF(28374), AOM_ICDF(30039), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(9472), AOM_ICDF(22576), AOM_ICDF(27712), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(6656), AOM_ICDF(26138), AOM_ICDF(29608), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(19328), AOM_ICDF(23791), AOM_ICDF(28946), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-          { AOM_ICDF(31744), AOM_ICDF(31984), AOM_ICDF(32336), AOM_ICDF(32768),
-            0, 0, 0, 0, 0 },
-      },
-      {
-          { AOM_ICDF(27904), AOM_ICDF(29215), AOM_ICDF(30075), AOM_ICDF(31190),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(9728), AOM_ICDF(22598), AOM_ICDF(26134), AOM_ICDF(29425),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(2688), AOM_ICDF(30066), AOM_ICDF(31058), AOM_ICDF(31933),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(22015), AOM_ICDF(25039), AOM_ICDF(27726), AOM_ICDF(29932),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-          { AOM_ICDF(32383), AOM_ICDF(32482), AOM_ICDF(32554), AOM_ICDF(32660),
-            AOM_ICDF(32768), 0, 0, 0, 0 },
-      },
+static const aom_cdf_prob
+    default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = {
+      { AOM_CDF2(28581) }, { AOM_CDF2(23846) }, { AOM_CDF2(20847) },
+      { AOM_CDF2(24315) }, { AOM_CDF2(18196) }, { AOM_CDF2(12133) },
+      { AOM_CDF2(18791) }, { AOM_CDF2(10887) }, { AOM_CDF2(11005) },
+      { AOM_CDF2(27179) }, { AOM_CDF2(20004) }, { AOM_CDF2(11281) },
+      { AOM_CDF2(26549) }, { AOM_CDF2(19308) }, { AOM_CDF2(14224) },
+      { AOM_CDF2(28015) }, { AOM_CDF2(21546) }, { AOM_CDF2(14400) },
+      { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) }
+    };
+
+static const aom_cdf_prob default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = {
+  { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) }
+};
+
+static const aom_cdf_prob default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(
+    2)] = { { AOM_CDF2(32621) }, { AOM_CDF2(20708) }, { AOM_CDF2(8127) } };
+
+static const aom_cdf_prob
+    default_compound_idx_cdfs[COMP_INDEX_CONTEXTS][CDF_SIZE(2)] = {
+      { AOM_CDF2(18244) }, { AOM_CDF2(12865) }, { AOM_CDF2(7053) },
+      { AOM_CDF2(13259) }, { AOM_CDF2(9334) },  { AOM_CDF2(4644) }
+    };
+
+static const aom_cdf_prob
+    default_comp_group_idx_cdfs[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)] = {
+      { AOM_CDF2(26607) }, { AOM_CDF2(22891) }, { AOM_CDF2(18840) },
+      { AOM_CDF2(24594) }, { AOM_CDF2(19934) }, { AOM_CDF2(22674) }
+    };
+
+static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+    30531) };
+
+static const aom_cdf_prob default_filter_intra_mode_cdf[CDF_SIZE(
+    FILTER_INTRA_MODES)] = { AOM_CDF5(8949, 12776, 17211, 29558) };
+
+static const aom_cdf_prob default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(
+    2)] = { { AOM_CDF2(4621) },  { AOM_CDF2(6743) },  { AOM_CDF2(5893) },
+            { AOM_CDF2(7866) },  { AOM_CDF2(12551) }, { AOM_CDF2(9394) },
+            { AOM_CDF2(12408) }, { AOM_CDF2(14301) }, { AOM_CDF2(12756) },
+            { AOM_CDF2(22343) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+            { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+            { AOM_CDF2(16384) }, { AOM_CDF2(12770) }, { AOM_CDF2(10368) },
+            { AOM_CDF2(20229) }, { AOM_CDF2(18101) }, { AOM_CDF2(16384) },
+            { AOM_CDF2(16384) } };
+
+static const aom_cdf_prob default_switchable_restore_cdf[CDF_SIZE(
+    RESTORE_SWITCHABLE_TYPES)] = { AOM_CDF3(9413, 22581) };
+
+static const aom_cdf_prob default_wiener_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+    11570) };
+
+static const aom_cdf_prob default_sgrproj_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+    16855) };
+
+static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = {
+  AOM_CDF4(28160, 32120, 32677)
+};
+
+static const aom_cdf_prob default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(
+    DELTA_LF_PROBS + 1)] = { { AOM_CDF4(28160, 32120, 32677) },
+                             { AOM_CDF4(28160, 32120, 32677) },
+                             { AOM_CDF4(28160, 32120, 32677) },
+                             { AOM_CDF4(28160, 32120, 32677) } };
+static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = {
+  AOM_CDF4(28160, 32120, 32677)
+};
+
+// FIXME(someone) need real defaults here
+static const aom_cdf_prob default_seg_tree_cdf[CDF_SIZE(MAX_SEGMENTS)] = {
+  AOM_CDF8(4096, 8192, 12288, 16384, 20480, 24576, 28672)
+};
+
+static const aom_cdf_prob
+    default_segment_pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)] = {
+      { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }
+    };
+
+static const aom_cdf_prob
+    default_spatial_pred_seg_tree_cdf[SPATIAL_PREDICTION_PROBS][CDF_SIZE(
+        MAX_SEGMENTS)] = {
       {
-          { AOM_ICDF(24319), AOM_ICDF(26299), AOM_ICDF(27486), AOM_ICDF(28600),
-            AOM_ICDF(29804), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(7935), AOM_ICDF(18217), AOM_ICDF(21116), AOM_ICDF(25440),
-            AOM_ICDF(28589), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(6656), AOM_ICDF(25016), AOM_ICDF(27105), AOM_ICDF(28698),
-            AOM_ICDF(30399), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(19967), AOM_ICDF(24117), AOM_ICDF(26550), AOM_ICDF(28566),
-            AOM_ICDF(30224), AOM_ICDF(32768), 0, 0, 0 },
-          { AOM_ICDF(31359), AOM_ICDF(31607), AOM_ICDF(31775), AOM_ICDF(31977),
-            AOM_ICDF(32258), AOM_ICDF(32768), 0, 0, 0 },
+          AOM_CDF8(5622, 7893, 16093, 18233, 27809, 28373, 32533),
       },
       {
-          { AOM_ICDF(26368), AOM_ICDF(27768), AOM_ICDF(28588), AOM_ICDF(29274),
-            AOM_ICDF(29997), AOM_ICDF(30917), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(8960), AOM_ICDF(18260), AOM_ICDF(20810), AOM_ICDF(23986),
-            AOM_ICDF(26627), AOM_ICDF(28882), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(7295), AOM_ICDF(24111), AOM_ICDF(25836), AOM_ICDF(27515),
-            AOM_ICDF(29033), AOM_ICDF(30769), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(22016), AOM_ICDF(25208), AOM_ICDF(27305), AOM_ICDF(28159),
-            AOM_ICDF(29221), AOM_ICDF(30274), AOM_ICDF(32768), 0, 0 },
-          { AOM_ICDF(31744), AOM_ICDF(31932), AOM_ICDF(32050), AOM_ICDF(32199),
-            AOM_ICDF(32335), AOM_ICDF(32521), AOM_ICDF(32768), 0, 0 },
+          AOM_CDF8(14274, 18230, 22557, 24935, 29980, 30851, 32344),
       },
       {
-          { AOM_ICDF(26624), AOM_ICDF(27872), AOM_ICDF(28599), AOM_ICDF(29153),
-            AOM_ICDF(29633), AOM_ICDF(30172), AOM_ICDF(30841), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(6655), AOM_ICDF(17569), AOM_ICDF(19587), AOM_ICDF(23345),
-            AOM_ICDF(25884), AOM_ICDF(28088), AOM_ICDF(29678), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(3584), AOM_ICDF(27296), AOM_ICDF(28429), AOM_ICDF(29158),
-            AOM_ICDF(30032), AOM_ICDF(30780), AOM_ICDF(31572), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(23551), AOM_ICDF(25855), AOM_ICDF(27070), AOM_ICDF(27893),
-            AOM_ICDF(28597), AOM_ICDF(29721), AOM_ICDF(30970), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(32128), AOM_ICDF(32173), AOM_ICDF(32245), AOM_ICDF(32337),
-            AOM_ICDF(32416), AOM_ICDF(32500), AOM_ICDF(32609), AOM_ICDF(32768),
-            0 },
+          AOM_CDF8(27527, 28487, 28723, 28890, 32397, 32647, 32679),
       },
     };
-#endif  // CONFIG_MRC_TX
 
-#if CONFIG_INTRABC
-static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = {
-  AOM_ICDF(192 * 128), AOM_ICDF(32768), 0,
-};
-#endif  // CONFIG_INTRABC
+static const aom_cdf_prob default_tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS]
+                                             [CDF_SIZE(MAX_TX_DEPTH + 1)] = {
+                                               { { AOM_CDF2(19968) },
+                                                 { AOM_CDF2(19968) },
+                                                 { AOM_CDF2(24320) } },
+                                               { { AOM_CDF3(12272, 30172) },
+                                                 { AOM_CDF3(12272, 30172) },
+                                                 { AOM_CDF3(18677, 30848) } },
+                                               { { AOM_CDF3(12986, 15180) },
+                                                 { AOM_CDF3(12986, 15180) },
+                                                 { AOM_CDF3(24302, 25602) } },
+                                               { { AOM_CDF3(5782, 11475) },
+                                                 { AOM_CDF3(5782, 11475) },
+                                                 { AOM_CDF3(16803, 22759) } },
+                                             };
 
 #define MAX_COLOR_CONTEXT_HASH 8
 // Negative values are invalid
@@ -2482,68 +896,43 @@ static const int palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH +
                                                     1] = { -1, -1, 0, -1, -1,
                                                            4,  3,  2, 1 };
 
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-static const aom_prob default_quarter_tx_size_prob = 192;
-#if CONFIG_NEW_MULTISYMBOL
-static const aom_cdf_prob default_quarter_tx_size_cdf[CDF_SIZE(2)] = {
-  AOM_ICDF(192 * 128), AOM_ICDF(32768), 0
-};
-#endif
-#endif
-
-#if CONFIG_LOOP_RESTORATION
-const aom_tree_index
-    av1_switchable_restore_tree[TREE_SIZE(RESTORE_SWITCHABLE_TYPES)] = {
-      -RESTORE_NONE, 2, -RESTORE_WIENER, -RESTORE_SGRPROJ,
-    };
-
-static const aom_prob
-    default_switchable_restore_prob[RESTORE_SWITCHABLE_TYPES - 1] = {
-      32, 128,
-    };
-#endif  // CONFIG_LOOP_RESTORATION
-
 #define NUM_PALETTE_NEIGHBORS 3  // left, top-left and top.
 int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
                                         int r, int c, int palette_size,
                                         uint8_t *color_order, int *color_idx) {
-  int i;
-  // The +10 below should not be needed. But we get a warning "array subscript
-  // is above array bounds [-Werror=array-bounds]" without it, possibly due to
-  // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
-  int scores[PALETTE_MAX_SIZE + 10];
-  const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 };
-  const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
-  int color_index_ctx_hash;
-  int color_index_ctx;
-  int color_neighbors[NUM_PALETTE_NEIGHBORS];
-  int inverse_color_order[PALETTE_MAX_SIZE];
   assert(palette_size <= PALETTE_MAX_SIZE);
   assert(r > 0 || c > 0);
 
   // Get color indices of neighbors.
+  int color_neighbors[NUM_PALETTE_NEIGHBORS];
   color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1;
   color_neighbors[1] =
       (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1;
   color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1;
 
-  for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
-    color_order[i] = i;
-    inverse_color_order[i] = i;
-  }
-  memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0]));
+  // The +10 below should not be needed. But we get a warning "array subscript
+  // is above array bounds [-Werror=array-bounds]" without it, possibly due to
+  // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
+  int scores[PALETTE_MAX_SIZE + 10] = { 0 };
+  int i;
+  static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 };
   for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
     if (color_neighbors[i] >= 0) {
       scores[color_neighbors[i]] += weights[i];
     }
   }
 
+  int inverse_color_order[PALETTE_MAX_SIZE];
+  for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
+    color_order[i] = i;
+    inverse_color_order[i] = i;
+  }
+
   // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small).
   for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
     int max = scores[i];
     int max_idx = i;
-    int j;
-    for (j = i + 1; j < palette_size; ++j) {
+    for (int j = i + 1; j < palette_size; ++j) {
       if (scores[j] > max) {
         max = scores[j];
         max_idx = j;
@@ -2554,8 +943,7 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
       // from 'i' to 'max_idx - 1' by 1.
       const int max_score = scores[max_idx];
       const uint8_t max_color_order = color_order[max_idx];
-      int k;
-      for (k = max_idx; k > i; --k) {
+      for (int k = max_idx; k > i; --k) {
         scores[k] = scores[k - 1];
         color_order[k] = color_order[k - 1];
         inverse_color_order[color_order[k]] = k;
@@ -2566,8 +954,12 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
     }
   }
 
+  if (color_idx != NULL)
+    *color_idx = inverse_color_order[color_map[r * stride + c]];
+
   // Get hash value of context.
-  color_index_ctx_hash = 0;
+  int color_index_ctx_hash = 0;
+  static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
   for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
     color_index_ctx_hash += scores[i] * hash_multipliers[i];
   }
@@ -2575,3643 +967,137 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
   assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
 
   // Lookup context from hash.
-  color_index_ctx = palette_color_index_context_lookup[color_index_ctx_hash];
+  const int color_index_ctx =
+      palette_color_index_context_lookup[color_index_ctx_hash];
   assert(color_index_ctx >= 0);
   assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
-
-  if (color_idx != NULL) {
-    *color_idx = inverse_color_order[color_map[r * stride + c]];
-  }
   return color_index_ctx;
 }
 #undef NUM_PALETTE_NEIGHBORS
 #undef MAX_COLOR_CONTEXT_HASH
 
-#if CONFIG_VAR_TX
-static const aom_prob default_txfm_partition_probs[TXFM_PARTITION_CONTEXTS] = {
-#if CONFIG_TX64X64
-  249, 240, 223, 249, 229, 177, 250, 243, 208, 226, 187,
-  145, 236, 204, 150, 183, 149, 125, 181, 146, 113, 128
-#else
-  250, 231, 212, 241, 166, 66, 241, 230, 135, 243, 154, 64, 248, 161, 63, 128
-#endif  // CONFIG_TX64X64
-};
-#if CONFIG_NEW_MULTISYMBOL
-static const aom_cdf_prob
-    default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = {
-#if CONFIG_TX64X64
-      { AOM_ICDF(249 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(240 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(223 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(249 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(229 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(177 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(250 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(243 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(208 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(226 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(187 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(145 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(236 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(204 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(150 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(183 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(149 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(125 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(181 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(146 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(113 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 }
-#else
-      { AOM_ICDF(250 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(231 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(212 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(241 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(166 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(66 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(241 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(230 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(135 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(243 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(154 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(64 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(248 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(161 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(63 * 128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_TX64X64
-    };
-#endif  // CONFIG_NEW_MULTISYMBOL
-#endif  // CONFIG_VAR_TX
-
-static const aom_prob default_skip_probs[SKIP_CONTEXTS] = { 192, 128, 64 };
-#if CONFIG_NEW_MULTISYMBOL
-static const aom_cdf_prob default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = {
-  { AOM_ICDF(24576), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(16384), AOM_ICDF(32768), 0 },
-  { AOM_ICDF(8192), AOM_ICDF(32768), 0 }
-};
-#endif
-
-#if CONFIG_LGT_FROM_PRED
-static const aom_prob default_intra_lgt_prob[LGT_SIZES][INTRA_MODES] = {
-  { 255, 208, 208, 180, 230, 208, 194, 214, 220, 255,
-#if CONFIG_SMOOTH_HV
-    220, 220,
-#endif
-    230 },
-  { 255, 192, 216, 180, 180, 180, 180, 200, 200, 255,
-#if CONFIG_SMOOTH_HV
-    220, 220,
-#endif
-    222 },
-};
-
-static const aom_prob default_inter_lgt_prob[LGT_SIZES] = { 230, 230 };
-#endif  // CONFIG_LGT_FROM_PRED
-
-#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-static const aom_prob
-    default_intra_filter_probs[INTRA_FILTERS + 1][INTRA_FILTERS - 1] = {
-      { 98, 63, 60 }, { 98, 82, 80 }, { 94, 65, 103 },
-      { 49, 25, 24 }, { 72, 38, 50 },
-    };
-const aom_tree_index av1_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)] = {
-  -INTRA_FILTER_LINEAR,      2, -INTRA_FILTER_8TAP, 4, -INTRA_FILTER_8TAP_SHARP,
-  -INTRA_FILTER_8TAP_SMOOTH,
-};
-int av1_intra_filter_ind[INTRA_FILTERS];
-int av1_intra_filter_inv[INTRA_FILTERS];
-#endif  // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-
-#if CONFIG_FILTER_INTRA
-static const aom_prob default_filter_intra_probs[2] = { 230, 230 };
-#endif  // CONFIG_FILTER_INTRA
-
-#if CONFIG_SUPERTX
-static const aom_prob
-    default_supertx_prob[PARTITION_SUPERTX_CONTEXTS][TX_SIZES] = {
-#if CONFIG_CHROMA_2X2
-#if CONFIG_TX64X64
-      { 1, 1, 160, 160, 170, 180 }, { 1, 1, 200, 200, 210, 220 },
-#else
-      { 1, 1, 160, 160, 170 }, { 1, 1, 200, 200, 210 },
-#endif  // CONFIG_TX64X64
-#else
-#if CONFIG_TX64X64
-      { 1, 160, 160, 170, 180 }, { 1, 200, 200, 210, 220 },
-#else
-      { 1, 160, 160, 170 }, { 1, 200, 200, 210 },
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_CHROMA_2X2
-    };
-#endif  // CONFIG_SUPERTX
-
-// FIXME(someone) need real defaults here
-static const aom_prob default_segment_tree_probs[SEG_TREE_PROBS] = {
-  128, 128, 128, 128, 128, 128, 128
-};
-// clang-format off
-static const aom_prob default_segment_pred_probs[PREDICTION_PROBS] = {
-  128, 128, 128
-};
-#if CONFIG_NEW_MULTISYMBOL
-static const aom_cdf_prob
-    default_segment_pred_cdf[PREDICTION_PROBS][CDF_SIZE(2)] = {
-  { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0},
-  { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0},
-  { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0}
-};
-#endif
-// clang-format on
-#if CONFIG_DUAL_FILTER
-#if USE_EXTRA_FILTER
-static const aom_cdf_prob
-    default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE(
-        SWITCHABLE_FILTERS)] = {
-      { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(31760), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(11050), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(28488), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(31760), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(11050), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(28488), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(31760), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(11050), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(28488), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(30080), AOM_ICDF(31088), AOM_ICDF(31760), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4608), AOM_ICDF(9620), AOM_ICDF(11050), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4352), AOM_ICDF(5240), AOM_ICDF(31880), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(19072), AOM_ICDF(23352), AOM_ICDF(28488), AOM_ICDF(32768), 0 },
-    };
-#else   // USE_EXTRA_FILTER
-static const aom_cdf_prob
-    default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE(
-        SWITCHABLE_FILTERS)] = {
-      { AOM_ICDF(32256), AOM_ICDF(32654), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(2816), AOM_ICDF(32651), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(512), AOM_ICDF(764), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(30464), AOM_ICDF(31778), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32384), AOM_ICDF(32483), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(3072), AOM_ICDF(32652), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(256), AOM_ICDF(383), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(25344), AOM_ICDF(26533), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(32000), AOM_ICDF(32531), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(2048), AOM_ICDF(32648), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(384), AOM_ICDF(890), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(28928), AOM_ICDF(31358), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(31616), AOM_ICDF(31787), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4224), AOM_ICDF(32433), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(128), AOM_ICDF(256), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(17408), AOM_ICDF(18248), AOM_ICDF(32768), 0 }
-    };
-#endif  // USE_EXTRA_FILTER
-#else   // CONFIG_DUAL_FILTER
-static const aom_cdf_prob
-    default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE(
-        SWITCHABLE_FILTERS)] = {
-      { AOM_ICDF(30080), AOM_ICDF(31781), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4608), AOM_ICDF(32658), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4352), AOM_ICDF(4685), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(19072), AOM_ICDF(26776), AOM_ICDF(32768), 0 },
-    };
-#endif  // CONFIG_DUAL_FILTER
-
-static const aom_cdf_prob default_seg_tree_cdf[CDF_SIZE(MAX_SEGMENTS)] = {
-  AOM_ICDF(4096),  AOM_ICDF(8192),  AOM_ICDF(12288),
-  AOM_ICDF(16384), AOM_ICDF(20480), AOM_ICDF(24576),
-  AOM_ICDF(28672), AOM_ICDF(32768), 0
-};
-
-static const aom_cdf_prob
-    default_tx_size_cdf[MAX_TX_DEPTH][TX_SIZE_CONTEXTS][CDF_SIZE(MAX_TX_DEPTH +
-                                                                 1)] = {
-      { { AOM_ICDF(12800), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(8448), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(2560), AOM_ICDF(20496), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(1920), AOM_ICDF(14091), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(384), AOM_ICDF(17588), AOM_ICDF(19782), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(640), AOM_ICDF(7166), AOM_ICDF(8466), AOM_ICDF(32768), 0 } },
-#if CONFIG_TX64X64
-      { { AOM_ICDF(128), AOM_ICDF(8288), AOM_ICDF(21293), AOM_ICDF(26986),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(128), AOM_ICDF(4208), AOM_ICDF(10009), AOM_ICDF(15965),
-          AOM_ICDF(32768), 0 } },
-#endif
-    };
-
-#if CONFIG_SMOOTH_HV
-static const aom_cdf_prob
-    default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)] = {
-      {
-          AOM_ICDF(7168), AOM_ICDF(10680), AOM_ICDF(13913), AOM_ICDF(16928),
-          AOM_ICDF(20294), AOM_ICDF(22790), AOM_ICDF(24706), AOM_ICDF(26275),
-          AOM_ICDF(28139), AOM_ICDF(29751), AOM_ICDF(30563), AOM_ICDF(31468),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11776), AOM_ICDF(13823), AOM_ICDF(15307), AOM_ICDF(15725),
-          AOM_ICDF(16638), AOM_ICDF(17406), AOM_ICDF(17994), AOM_ICDF(18814),
-          AOM_ICDF(19634), AOM_ICDF(21513), AOM_ICDF(22198), AOM_ICDF(22928),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(14720), AOM_ICDF(16459), AOM_ICDF(18091), AOM_ICDF(18299),
-          AOM_ICDF(18757), AOM_ICDF(19125), AOM_ICDF(19423), AOM_ICDF(19924),
-          AOM_ICDF(20504), AOM_ICDF(22922), AOM_ICDF(24063), AOM_ICDF(25577),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(18944), AOM_ICDF(19925), AOM_ICDF(20908), AOM_ICDF(20998),
-          AOM_ICDF(21017), AOM_ICDF(21072), AOM_ICDF(21084), AOM_ICDF(21121),
-          AOM_ICDF(21159), AOM_ICDF(22064), AOM_ICDF(22820), AOM_ICDF(24290),
-          AOM_ICDF(32768), 0,
-      },
-    };
-
-#if CONFIG_CFL
-static const aom_cdf_prob
-    default_uv_mode_cdf[INTRA_MODES][CDF_SIZE(UV_INTRA_MODES)] = {
-      { AOM_ICDF(18377), AOM_ICDF(18815), AOM_ICDF(19743), AOM_ICDF(20178),
-        AOM_ICDF(20560), AOM_ICDF(20889), AOM_ICDF(21359), AOM_ICDF(22098),
-        AOM_ICDF(22481), AOM_ICDF(24563), AOM_ICDF(25781), AOM_ICDF(26662),
-        AOM_ICDF(28396), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(5350), AOM_ICDF(16837), AOM_ICDF(17066), AOM_ICDF(17360),
-        AOM_ICDF(17692), AOM_ICDF(18778), AOM_ICDF(18969), AOM_ICDF(19206),
-        AOM_ICDF(20291), AOM_ICDF(22367), AOM_ICDF(23212), AOM_ICDF(24670),
-        AOM_ICDF(27912), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(6671), AOM_ICDF(6759), AOM_ICDF(17812), AOM_ICDF(17998),
-        AOM_ICDF(18260), AOM_ICDF(18384), AOM_ICDF(19408), AOM_ICDF(20667),
-        AOM_ICDF(20806), AOM_ICDF(22760), AOM_ICDF(24142), AOM_ICDF(24875),
-        AOM_ICDF(28072), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(7461), AOM_ICDF(8082), AOM_ICDF(8515), AOM_ICDF(15013),
-        AOM_ICDF(15583), AOM_ICDF(16098), AOM_ICDF(16522), AOM_ICDF(18519),
-        AOM_ICDF(20348), AOM_ICDF(22954), AOM_ICDF(24130), AOM_ICDF(25342),
-        AOM_ICDF(26548), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(3694), AOM_ICDF(4403), AOM_ICDF(5370), AOM_ICDF(5854),
-        AOM_ICDF(17841), AOM_ICDF(19639), AOM_ICDF(21625), AOM_ICDF(22224),
-        AOM_ICDF(22651), AOM_ICDF(24613), AOM_ICDF(25399), AOM_ICDF(26143),
-        AOM_ICDF(26599), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(3700), AOM_ICDF(5651), AOM_ICDF(6112), AOM_ICDF(6541),
-        AOM_ICDF(8929), AOM_ICDF(20623), AOM_ICDF(21213), AOM_ICDF(21640),
-        AOM_ICDF(22214), AOM_ICDF(24306), AOM_ICDF(25412), AOM_ICDF(26406),
-        AOM_ICDF(27249), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4649), AOM_ICDF(4947), AOM_ICDF(7128), AOM_ICDF(7432),
-        AOM_ICDF(9439), AOM_ICDF(9903), AOM_ICDF(21163), AOM_ICDF(21774),
-        AOM_ICDF(22056), AOM_ICDF(24426), AOM_ICDF(25403), AOM_ICDF(26324),
-        AOM_ICDF(27128), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(7208), AOM_ICDF(7375), AOM_ICDF(8779), AOM_ICDF(9683),
-        AOM_ICDF(10072), AOM_ICDF(10284), AOM_ICDF(10796), AOM_ICDF(19786),
-        AOM_ICDF(20152), AOM_ICDF(22955), AOM_ICDF(24246), AOM_ICDF(25165),
-        AOM_ICDF(26589), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(5897), AOM_ICDF(7283), AOM_ICDF(7555), AOM_ICDF(8910),
-        AOM_ICDF(9391), AOM_ICDF(9937), AOM_ICDF(10276), AOM_ICDF(11044),
-        AOM_ICDF(19841), AOM_ICDF(22620), AOM_ICDF(23784), AOM_ICDF(25060),
-        AOM_ICDF(26418), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(12171), AOM_ICDF(12718), AOM_ICDF(13885), AOM_ICDF(14348),
-        AOM_ICDF(14925), AOM_ICDF(15394), AOM_ICDF(16108), AOM_ICDF(17075),
-        AOM_ICDF(17583), AOM_ICDF(21996), AOM_ICDF(23614), AOM_ICDF(25048),
-        AOM_ICDF(27011), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10192), AOM_ICDF(11222), AOM_ICDF(12318), AOM_ICDF(12877),
-        AOM_ICDF(13533), AOM_ICDF(14184), AOM_ICDF(14866), AOM_ICDF(15879),
-        AOM_ICDF(16650), AOM_ICDF(20419), AOM_ICDF(23265), AOM_ICDF(24295),
-        AOM_ICDF(26596), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10776), AOM_ICDF(11387), AOM_ICDF(12899), AOM_ICDF(13471),
-        AOM_ICDF(14088), AOM_ICDF(14575), AOM_ICDF(15366), AOM_ICDF(16456),
-        AOM_ICDF(17040), AOM_ICDF(20815), AOM_ICDF(22009), AOM_ICDF(24448),
-        AOM_ICDF(26492), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4015), AOM_ICDF(6473), AOM_ICDF(9853), AOM_ICDF(10285),
-        AOM_ICDF(10655), AOM_ICDF(11032), AOM_ICDF(11431), AOM_ICDF(12199),
-        AOM_ICDF(12738), AOM_ICDF(14760), AOM_ICDF(16121), AOM_ICDF(17263),
-        AOM_ICDF(28612), AOM_ICDF(32768), 0 },
-    };
-#else
-static const aom_cdf_prob
-    default_uv_mode_cdf[INTRA_MODES][CDF_SIZE(UV_INTRA_MODES)] = {
-      {
-          AOM_ICDF(23552), AOM_ICDF(25936), AOM_ICDF(28623), AOM_ICDF(29033),
-          AOM_ICDF(29395), AOM_ICDF(29892), AOM_ICDF(30252), AOM_ICDF(30905),
-          AOM_ICDF(31370), AOM_ICDF(31980), AOM_ICDF(32293), AOM_ICDF(32660),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2944), AOM_ICDF(26431), AOM_ICDF(27553), AOM_ICDF(27746),
-          AOM_ICDF(28022), AOM_ICDF(29080), AOM_ICDF(29204), AOM_ICDF(29377),
-          AOM_ICDF(30264), AOM_ICDF(31206), AOM_ICDF(31613), AOM_ICDF(32418),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4352), AOM_ICDF(5120), AOM_ICDF(27952), AOM_ICDF(28117),
-          AOM_ICDF(28473), AOM_ICDF(28759), AOM_ICDF(29563), AOM_ICDF(30864),
-          AOM_ICDF(31051), AOM_ICDF(31694), AOM_ICDF(32073), AOM_ICDF(32435),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(17664), AOM_ICDF(20288), AOM_ICDF(21839), AOM_ICDF(26072),
-          AOM_ICDF(26420), AOM_ICDF(26972), AOM_ICDF(27240), AOM_ICDF(28565),
-          AOM_ICDF(30914), AOM_ICDF(31694), AOM_ICDF(32083), AOM_ICDF(32591),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(16640), AOM_ICDF(18390), AOM_ICDF(20233), AOM_ICDF(20557),
-          AOM_ICDF(25162), AOM_ICDF(27789), AOM_ICDF(29397), AOM_ICDF(29895),
-          AOM_ICDF(30369), AOM_ICDF(31497), AOM_ICDF(32025), AOM_ICDF(32642),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(13952), AOM_ICDF(17947), AOM_ICDF(18918), AOM_ICDF(19206),
-          AOM_ICDF(21131), AOM_ICDF(30668), AOM_ICDF(31061), AOM_ICDF(31317),
-          AOM_ICDF(31838), AOM_ICDF(32137), AOM_ICDF(32342), AOM_ICDF(32547),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(15872), AOM_ICDF(16990), AOM_ICDF(21479), AOM_ICDF(21732),
-          AOM_ICDF(24134), AOM_ICDF(24854), AOM_ICDF(30296), AOM_ICDF(30887),
-          AOM_ICDF(31163), AOM_ICDF(31902), AOM_ICDF(32218), AOM_ICDF(32702),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(16256), AOM_ICDF(17280), AOM_ICDF(23081), AOM_ICDF(24039),
-          AOM_ICDF(24457), AOM_ICDF(24838), AOM_ICDF(25346), AOM_ICDF(30329),
-          AOM_ICDF(30908), AOM_ICDF(31746), AOM_ICDF(32206), AOM_ICDF(32639),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(14720), AOM_ICDF(19249), AOM_ICDF(20501), AOM_ICDF(22079),
-          AOM_ICDF(22439), AOM_ICDF(23218), AOM_ICDF(23463), AOM_ICDF(24107),
-          AOM_ICDF(30308), AOM_ICDF(31379), AOM_ICDF(31866), AOM_ICDF(32556),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(16768), AOM_ICDF(19967), AOM_ICDF(22374), AOM_ICDF(22976),
-          AOM_ICDF(23836), AOM_ICDF(24050), AOM_ICDF(24642), AOM_ICDF(25760),
-          AOM_ICDF(26653), AOM_ICDF(29585), AOM_ICDF(30937), AOM_ICDF(32518),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(16768), AOM_ICDF(20751), AOM_ICDF(23026), AOM_ICDF(23591),
-          AOM_ICDF(24299), AOM_ICDF(24516), AOM_ICDF(24981), AOM_ICDF(25876),
-          AOM_ICDF(26806), AOM_ICDF(29520), AOM_ICDF(31286), AOM_ICDF(32455),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(17536), AOM_ICDF(20055), AOM_ICDF(22965), AOM_ICDF(23507),
-          AOM_ICDF(24210), AOM_ICDF(24398), AOM_ICDF(25098), AOM_ICDF(26366),
-          AOM_ICDF(27033), AOM_ICDF(29674), AOM_ICDF(30689), AOM_ICDF(32530),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(17536), AOM_ICDF(22753), AOM_ICDF(27126), AOM_ICDF(27353),
-          AOM_ICDF(27571), AOM_ICDF(28139), AOM_ICDF(28505), AOM_ICDF(29198),
-          AOM_ICDF(29886), AOM_ICDF(30801), AOM_ICDF(31335), AOM_ICDF(32054),
-          AOM_ICDF(32768), 0,
-      },
-    };
-#endif  // CONFIG_CFL
-#else   // !CONFIG_SMOOTH_HV
-static const aom_cdf_prob
-    default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)] = {
-      {
-          AOM_ICDF(11264), AOM_ICDF(14965), AOM_ICDF(19742), AOM_ICDF(21904),
-          AOM_ICDF(24115), AOM_ICDF(25678), AOM_ICDF(27210), AOM_ICDF(28705),
-          AOM_ICDF(29782), AOM_ICDF(31424), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9600), AOM_ICDF(13747), AOM_ICDF(18569), AOM_ICDF(20091),
-          AOM_ICDF(21925), AOM_ICDF(23082), AOM_ICDF(24404), AOM_ICDF(26285),
-          AOM_ICDF(27574), AOM_ICDF(30415), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9344), AOM_ICDF(14319), AOM_ICDF(19567), AOM_ICDF(20476),
-          AOM_ICDF(21791), AOM_ICDF(22529), AOM_ICDF(23393), AOM_ICDF(24881),
-          AOM_ICDF(26012), AOM_ICDF(30572), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(12288), AOM_ICDF(15866), AOM_ICDF(21186), AOM_ICDF(21722),
-          AOM_ICDF(22209), AOM_ICDF(22564), AOM_ICDF(22966), AOM_ICDF(24025),
-          AOM_ICDF(24716), AOM_ICDF(30608), AOM_ICDF(32768), 0,
-      },
-    };
-
-static const aom_cdf_prob
-    default_uv_mode_cdf[INTRA_MODES][CDF_SIZE(UV_INTRA_MODES)] = {
-      {
-          AOM_ICDF(25472), AOM_ICDF(27697), AOM_ICDF(30693), AOM_ICDF(30916),
-          AOM_ICDF(31125), AOM_ICDF(31406), AOM_ICDF(31679), AOM_ICDF(32085),
-          AOM_ICDF(32334), AOM_ICDF(32682), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2176), AOM_ICDF(28142), AOM_ICDF(29335), AOM_ICDF(29504),
-          AOM_ICDF(29762), AOM_ICDF(30642), AOM_ICDF(30741), AOM_ICDF(30902),
-          AOM_ICDF(31683), AOM_ICDF(32529), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3328), AOM_ICDF(3901), AOM_ICDF(30984), AOM_ICDF(31068),
-          AOM_ICDF(31241), AOM_ICDF(31389), AOM_ICDF(31697), AOM_ICDF(32379),
-          AOM_ICDF(32483), AOM_ICDF(32653), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(23424), AOM_ICDF(25805), AOM_ICDF(27721), AOM_ICDF(29432),
-          AOM_ICDF(29659), AOM_ICDF(30032), AOM_ICDF(30282), AOM_ICDF(31192),
-          AOM_ICDF(32259), AOM_ICDF(32658), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(22784), AOM_ICDF(24177), AOM_ICDF(26209), AOM_ICDF(26476),
-          AOM_ICDF(28679), AOM_ICDF(29698), AOM_ICDF(30786), AOM_ICDF(31257),
-          AOM_ICDF(31596), AOM_ICDF(32690), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(19712), AOM_ICDF(22988), AOM_ICDF(24275), AOM_ICDF(24520),
-          AOM_ICDF(25823), AOM_ICDF(31469), AOM_ICDF(31880), AOM_ICDF(32189),
-          AOM_ICDF(32614), AOM_ICDF(32615), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(21376), AOM_ICDF(22085), AOM_ICDF(27643), AOM_ICDF(27799),
-          AOM_ICDF(28596), AOM_ICDF(28890), AOM_ICDF(31767), AOM_ICDF(32255),
-          AOM_ICDF(32405), AOM_ICDF(32723), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(19712), AOM_ICDF(20623), AOM_ICDF(28408), AOM_ICDF(28766),
-          AOM_ICDF(29070), AOM_ICDF(29355), AOM_ICDF(29759), AOM_ICDF(32034),
-          AOM_ICDF(32306), AOM_ICDF(32666), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(19584), AOM_ICDF(23437), AOM_ICDF(25295), AOM_ICDF(26200),
-          AOM_ICDF(26612), AOM_ICDF(27372), AOM_ICDF(27704), AOM_ICDF(28319),
-          AOM_ICDF(31664), AOM_ICDF(32562), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(20864), AOM_ICDF(23989), AOM_ICDF(26168), AOM_ICDF(26591),
-          AOM_ICDF(27345), AOM_ICDF(27348), AOM_ICDF(27809), AOM_ICDF(28575),
-          AOM_ICDF(29132), AOM_ICDF(32628), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(23680), AOM_ICDF(27582), AOM_ICDF(30197), AOM_ICDF(30312),
-          AOM_ICDF(30464), AOM_ICDF(30995), AOM_ICDF(31208), AOM_ICDF(31574),
-          AOM_ICDF(31985), AOM_ICDF(32519), AOM_ICDF(32768), 0,
-      },
-    };
-#endif  // CONFIG_SMOOTH_HV
-
-#if CONFIG_EXT_PARTITION_TYPES
-static const aom_cdf_prob
-    default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)] = {
-      // 8x8 -> 4x4 only supports the four legacy partition types
-      { AOM_ICDF(25472), AOM_ICDF(28949), AOM_ICDF(31052), AOM_ICDF(32768), 0,
-        0, 0, 0, 0, 0, 0 },
-      { AOM_ICDF(18816), AOM_ICDF(22250), AOM_ICDF(28783), AOM_ICDF(32768), 0,
-        0, 0, 0, 0, 0, 0 },
-      { AOM_ICDF(18944), AOM_ICDF(26126), AOM_ICDF(29188), AOM_ICDF(32768), 0,
-        0, 0, 0, 0, 0, 0 },
-      { AOM_ICDF(15488), AOM_ICDF(22508), AOM_ICDF(27077), AOM_ICDF(32768), 0,
-        0, 0, 0, 0, 0, 0 },
-      // 16x16 -> 8x8
-      { AOM_ICDF(22272), AOM_ICDF(23768), AOM_ICDF(25043), AOM_ICDF(29996),
-        AOM_ICDF(30495), AOM_ICDF(30994), AOM_ICDF(31419), AOM_ICDF(31844),
-        AOM_ICDF(32343), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(11776), AOM_ICDF(13457), AOM_ICDF(16315), AOM_ICDF(28229),
-        AOM_ICDF(28789), AOM_ICDF(29349), AOM_ICDF(30302), AOM_ICDF(31255),
-        AOM_ICDF(31816), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10496), AOM_ICDF(14802), AOM_ICDF(16136), AOM_ICDF(27127),
-        AOM_ICDF(28563), AOM_ICDF(29999), AOM_ICDF(30444), AOM_ICDF(30889),
-        AOM_ICDF(32324), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(6784), AOM_ICDF(8763), AOM_ICDF(10440), AOM_ICDF(29110),
-        AOM_ICDF(29770), AOM_ICDF(30430), AOM_ICDF(30989), AOM_ICDF(31548),
-        AOM_ICDF(32208), AOM_ICDF(32768), 0 },
-      // 32x32 -> 16x16
-      { AOM_ICDF(22656), AOM_ICDF(23801), AOM_ICDF(24702), AOM_ICDF(30721),
-        AOM_ICDF(31103), AOM_ICDF(31485), AOM_ICDF(31785), AOM_ICDF(32085),
-        AOM_ICDF(32467), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8704), AOM_ICDF(9926), AOM_ICDF(12586), AOM_ICDF(28885),
-        AOM_ICDF(29292), AOM_ICDF(29699), AOM_ICDF(30586), AOM_ICDF(31473),
-        AOM_ICDF(31881), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(6656), AOM_ICDF(10685), AOM_ICDF(11566), AOM_ICDF(27857),
-        AOM_ICDF(29200), AOM_ICDF(30543), AOM_ICDF(30837), AOM_ICDF(31131),
-        AOM_ICDF(32474), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(2176), AOM_ICDF(3012), AOM_ICDF(3690), AOM_ICDF(31253),
-        AOM_ICDF(31532), AOM_ICDF(31811), AOM_ICDF(32037), AOM_ICDF(32263),
-        AOM_ICDF(32542), AOM_ICDF(32768), 0 },
-      // 64x64 -> 32x32
-      { AOM_ICDF(28416), AOM_ICDF(28705), AOM_ICDF(28926), AOM_ICDF(32258),
-        AOM_ICDF(32354), AOM_ICDF(32450), AOM_ICDF(32523), AOM_ICDF(32596),
-        AOM_ICDF(32693), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(9216), AOM_ICDF(9952), AOM_ICDF(11849), AOM_ICDF(30134),
-        AOM_ICDF(30379), AOM_ICDF(30624), AOM_ICDF(31256), AOM_ICDF(31888),
-        AOM_ICDF(32134), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(7424), AOM_ICDF(9008), AOM_ICDF(9528), AOM_ICDF(30664),
-        AOM_ICDF(31192), AOM_ICDF(31720), AOM_ICDF(31893), AOM_ICDF(32066),
-        AOM_ICDF(32594), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(1280), AOM_ICDF(1710), AOM_ICDF(2069), AOM_ICDF(31978),
-        AOM_ICDF(32121), AOM_ICDF(32264), AOM_ICDF(32383), AOM_ICDF(32502),
-        AOM_ICDF(32647), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      // 128x128 -> 64x64
-      { AOM_ICDF(28416), AOM_ICDF(28705), AOM_ICDF(28926), AOM_ICDF(32258),
-        AOM_ICDF(32354), AOM_ICDF(32450), AOM_ICDF(32523), AOM_ICDF(32596),
-        AOM_ICDF(32693), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(9216), AOM_ICDF(9952), AOM_ICDF(11849), AOM_ICDF(30134),
-        AOM_ICDF(30379), AOM_ICDF(30624), AOM_ICDF(31256), AOM_ICDF(31888),
-        AOM_ICDF(32134), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(7424), AOM_ICDF(9008), AOM_ICDF(9528), AOM_ICDF(30664),
-        AOM_ICDF(31192), AOM_ICDF(31720), AOM_ICDF(31893), AOM_ICDF(32066),
-        AOM_ICDF(32594), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(1280), AOM_ICDF(1710), AOM_ICDF(2069), AOM_ICDF(31978),
-        AOM_ICDF(32121), AOM_ICDF(32264), AOM_ICDF(32383), AOM_ICDF(32502),
-        AOM_ICDF(32647), AOM_ICDF(32768), 0 },
-#endif
-    };
-#else
-static const aom_cdf_prob
-    default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(PARTITION_TYPES)] = {
-      { AOM_ICDF(25472), AOM_ICDF(28949), AOM_ICDF(31052), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(18816), AOM_ICDF(22250), AOM_ICDF(28783), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(18944), AOM_ICDF(26126), AOM_ICDF(29188), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(15488), AOM_ICDF(22508), AOM_ICDF(27077), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(22272), AOM_ICDF(25265), AOM_ICDF(27815), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(11776), AOM_ICDF(15138), AOM_ICDF(20854), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(10496), AOM_ICDF(19109), AOM_ICDF(21777), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(6784), AOM_ICDF(10743), AOM_ICDF(14098), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(22656), AOM_ICDF(24947), AOM_ICDF(26749), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(8704), AOM_ICDF(11148), AOM_ICDF(16469), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(6656), AOM_ICDF(14714), AOM_ICDF(16477), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(2176), AOM_ICDF(3849), AOM_ICDF(5205), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(28416), AOM_ICDF(28994), AOM_ICDF(29436), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(9216), AOM_ICDF(10688), AOM_ICDF(14483), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(7424), AOM_ICDF(10592), AOM_ICDF(11632), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(1280), AOM_ICDF(2141), AOM_ICDF(2859), AOM_ICDF(32768), 0 },
-#if CONFIG_EXT_PARTITION
-      { AOM_ICDF(28416), AOM_ICDF(28994), AOM_ICDF(29436), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(9216), AOM_ICDF(10688), AOM_ICDF(14483), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(7424), AOM_ICDF(10592), AOM_ICDF(11632), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(1280), AOM_ICDF(2141), AOM_ICDF(2859), AOM_ICDF(32768), 0 },
-#endif
-    };
-#endif
-
-#if CONFIG_EXT_TX
-static const aom_cdf_prob default_intra_ext_tx_cdf
-    [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = {
-      {
-// FIXME: unused zero positions, from uncoded trivial transform set
-#if CONFIG_CHROMA_2X2
-          {
-              { 0 },
-          },
-#endif
-          {
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-#if CONFIG_SMOOTH_HV
-              { 0 },
-              { 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { 0 },
-          },
-          {
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-#if CONFIG_SMOOTH_HV
-              { 0 },
-              { 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { 0 },
-          },
-          {
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-#if CONFIG_SMOOTH_HV
-              { 0 },
-              { 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { 0 },
-          },
-          {
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-              { 0 },
-#if CONFIG_SMOOTH_HV
-              { 0 },
-              { 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { 0 },
-          },
-      },
-      {
-          {
-              { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048),
-                AOM_ICDF(29296), AOM_ICDF(30164), AOM_ICDF(31466),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284),
-                AOM_ICDF(26717), AOM_ICDF(28230), AOM_ICDF(30499),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860),
-                AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
-                AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
-                AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-          },
-          {
-              { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048),
-                AOM_ICDF(29296), AOM_ICDF(30164), AOM_ICDF(31466),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284),
-                AOM_ICDF(26717), AOM_ICDF(28230), AOM_ICDF(30499),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860),
-                AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
-                AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
-                AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-          },
-          {
-              { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048),
-                AOM_ICDF(29296), AOM_ICDF(30164), AOM_ICDF(31466),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284),
-                AOM_ICDF(26717), AOM_ICDF(28230), AOM_ICDF(30499),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860),
-                AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
-                AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
-                AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-          },
-          {
-              { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29048),
-                AOM_ICDF(29296), AOM_ICDF(30164), AOM_ICDF(31466),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(26284),
-                AOM_ICDF(26717), AOM_ICDF(28230), AOM_ICDF(30499),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(3938), AOM_ICDF(5860),
-                AOM_ICDF(29404), AOM_ICDF(31086), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(27118), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
-                AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(5900), AOM_ICDF(7691),
-                AOM_ICDF(15528), AOM_ICDF(27380), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(8660),
-                AOM_ICDF(10167), AOM_ICDF(15817), AOM_ICDF(32768), 0 },
-          },
-      },
-      {
-          {
-              { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29792),
-                AOM_ICDF(31280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581),
-                AOM_ICDF(30174), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924),
-                AOM_ICDF(30846), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065),
-                AOM_ICDF(26611), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065),
-                AOM_ICDF(26611), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-          },
-          {
-              { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29792),
-                AOM_ICDF(31280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581),
-                AOM_ICDF(30174), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924),
-                AOM_ICDF(30846), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065),
-                AOM_ICDF(26611), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065),
-                AOM_ICDF(26611), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-          },
-          {
-              { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29792),
-                AOM_ICDF(31280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581),
-                AOM_ICDF(30174), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924),
-                AOM_ICDF(30846), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065),
-                AOM_ICDF(26611), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065),
-                AOM_ICDF(26611), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-          },
-          {
-              { AOM_ICDF(1024), AOM_ICDF(28800), AOM_ICDF(29792),
-                AOM_ICDF(31280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(25852), AOM_ICDF(27581),
-                AOM_ICDF(30174), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(28924),
-                AOM_ICDF(30846), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(26310),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065),
-                AOM_ICDF(26611), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(4109), AOM_ICDF(13065),
-                AOM_ICDF(26611), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(5216), AOM_ICDF(6938), AOM_ICDF(13396),
-                AOM_ICDF(32768), 0 },
-          },
-      },
-#if CONFIG_MRC_TX
-      {
-          {
-              { AOM_ICDF(1024), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-          },
-          {
-              { AOM_ICDF(1024), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-          },
-          {
-              { AOM_ICDF(1024), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-          },
-          {
-              { AOM_ICDF(1024), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1152), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1024), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-#if CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-#endif  // CONFIG_SMOOTH_HV
-              { AOM_ICDF(1280), AOM_ICDF(32768), 0 },
-          },
-      }
-#endif  // CONFIG_MRC_TX
-    };
-static const aom_cdf_prob
-    default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE(
-        TX_TYPES)] = {
-      {
-#if CONFIG_CHROMA_2X2
-          { 0 },
-#endif
-          { 0 },
-          { 0 },
-          { 0 },
-          { 0 } },
-      {
-#if CONFIG_CHROMA_2X2
-          { 0 },
-#endif
-          { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277),
-            AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717),
-            AOM_ICDF(19225), AOM_ICDF(21733), AOM_ICDF(24241), AOM_ICDF(26749),
-            AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277),
-            AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717),
-            AOM_ICDF(19225), AOM_ICDF(21733), AOM_ICDF(24241), AOM_ICDF(26749),
-            AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277),
-            AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717),
-            AOM_ICDF(19225), AOM_ICDF(21733), AOM_ICDF(24241), AOM_ICDF(26749),
-            AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(1280), AOM_ICDF(1453), AOM_ICDF(1626), AOM_ICDF(2277),
-            AOM_ICDF(2929), AOM_ICDF(3580), AOM_ICDF(4232), AOM_ICDF(16717),
-            AOM_ICDF(19225), AOM_ICDF(21733), AOM_ICDF(24241), AOM_ICDF(26749),
-            AOM_ICDF(28253), AOM_ICDF(29758), AOM_ICDF(31263), AOM_ICDF(32768),
-            0 } },
-      {
-#if CONFIG_CHROMA_2X2
-          { 0 },
-#endif
-          { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132),
-            AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904),
-            AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132),
-            AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904),
-            AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132),
-            AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904),
-            AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(1280), AOM_ICDF(3125), AOM_ICDF(4970), AOM_ICDF(17132),
-            AOM_ICDF(19575), AOM_ICDF(22018), AOM_ICDF(24461), AOM_ICDF(26904),
-            AOM_ICDF(28370), AOM_ICDF(29836), AOM_ICDF(31302), AOM_ICDF(32768),
-            0 } },
-      {
-#if CONFIG_CHROMA_2X2
-          { 0 },
-#endif
-          { AOM_ICDF(1536), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(1536), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(1536), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(1536), AOM_ICDF(32768), 0 } },
-#if CONFIG_MRC_TX
-      {
-#if CONFIG_CHROMA_2X2
-          { 0 },
-#endif
-          { AOM_ICDF(30080), AOM_ICDF(31781), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(4608), AOM_ICDF(32658), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(4352), AOM_ICDF(4685), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(19072), AOM_ICDF(26776), AOM_ICDF(32768), 0 } },
-#endif  // CONFIG_MRC_TX
-    };
-#else
-#if CONFIG_MRC_TX
-static const aom_cdf_prob
-    default_intra_ext_tx_cdf[EXT_TX_SIZES][TX_TYPES][CDF_SIZE(TX_TYPES)] = {
-#if CONFIG_CHROMA_2X2
-      { { AOM_ICDF(30720), AOM_ICDF(31104), AOM_ICDF(31400), AOM_ICDF(32084),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(31764),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(1642),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(1642),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(1280), AOM_ICDF(31760), AOM_ICDF(32264),
-          AOM_ICDF(32768), 0 } },
-#endif
-      { { AOM_ICDF(30720), AOM_ICDF(31104), AOM_ICDF(31400), AOM_ICDF(32084),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(31764),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(1642),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(582), AOM_ICDF(638), AOM_ICDF(1642),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(1280), AOM_ICDF(31760), AOM_ICDF(32264),
-          AOM_ICDF(32768), 0 } },
-
-      { { AOM_ICDF(31232), AOM_ICDF(31488), AOM_ICDF(31742), AOM_ICDF(32255),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(1024), AOM_ICDF(1152), AOM_ICDF(1272), AOM_ICDF(31784),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(1024), AOM_ICDF(1152), AOM_ICDF(1272), AOM_ICDF(2256),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(1024), AOM_ICDF(1052), AOM_ICDF(1272), AOM_ICDF(2256),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(1024), AOM_ICDF(1792), AOM_ICDF(31776), AOM_ICDF(32272),
-          AOM_ICDF(32768), 0 } },
-
-      { { AOM_ICDF(31744), AOM_ICDF(31940), AOM_ICDF(32084), AOM_ICDF(32426),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(2048), AOM_ICDF(2176), AOM_ICDF(2528), AOM_ICDF(31823),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(2048), AOM_ICDF(2176), AOM_ICDF(2528), AOM_ICDF(3473),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(2048), AOM_ICDF(2176), AOM_ICDF(2528), AOM_ICDF(3473),
-          AOM_ICDF(32768), 0 },
-        { AOM_ICDF(2048), AOM_ICDF(28160), AOM_ICDF(31808), AOM_ICDF(32288),
-          AOM_ICDF(32768), 0 } },
-    };
-
-static const aom_cdf_prob
-    default_inter_ext_tx_cdf[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)] = {
-#if CONFIG_CHROMA_2X2
-      { AOM_ICDF(20480), AOM_ICDF(23040), AOM_ICDF(24560), AOM_ICDF(28664),
-        AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(20480), AOM_ICDF(23040), AOM_ICDF(24560), AOM_ICDF(28664),
-        AOM_ICDF(32768), 0 },
-      { AOM_ICDF(22528), AOM_ICDF(24320), AOM_ICDF(25928), AOM_ICDF(29348),
-        AOM_ICDF(32768), 0 },
-      { AOM_ICDF(24576), AOM_ICDF(25600), AOM_ICDF(27296), AOM_ICDF(30032),
-        AOM_ICDF(32768), 0 },
-    };
-#else  // CONFIG_MRC_TX
-static const aom_cdf_prob
-    default_intra_ext_tx_cdf[EXT_TX_SIZES][TX_TYPES][CDF_SIZE(TX_TYPES)] = {
-#if CONFIG_CHROMA_2X2
-      { { AOM_ICDF(30720), AOM_ICDF(31400), AOM_ICDF(32084), AOM_ICDF(32768),
-          0 },
-        { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(31764), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(1642), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(31760), AOM_ICDF(32264), AOM_ICDF(32768),
-          0 } },
-#endif
-      { { AOM_ICDF(30720), AOM_ICDF(31400), AOM_ICDF(32084), AOM_ICDF(32768),
-          0 },
-        { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(31764), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(638), AOM_ICDF(1642), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(512), AOM_ICDF(31760), AOM_ICDF(32264), AOM_ICDF(32768),
-          0 } },
-
-      { { AOM_ICDF(31232), AOM_ICDF(31742), AOM_ICDF(32255), AOM_ICDF(32768),
-          0 },
-        { AOM_ICDF(1024), AOM_ICDF(1272), AOM_ICDF(31784), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(1024), AOM_ICDF(1272), AOM_ICDF(2256), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(1024), AOM_ICDF(31776), AOM_ICDF(32272), AOM_ICDF(32768),
-          0 } },
-      { { AOM_ICDF(31744), AOM_ICDF(32084), AOM_ICDF(32426), AOM_ICDF(32768),
-          0 },
-        { AOM_ICDF(2048), AOM_ICDF(2528), AOM_ICDF(31823), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(2048), AOM_ICDF(2528), AOM_ICDF(3473), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(2048), AOM_ICDF(31808), AOM_ICDF(32288), AOM_ICDF(32768),
-          0 } },
-    };
-
-static const aom_cdf_prob
-    default_inter_ext_tx_cdf[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)] = {
-#if CONFIG_CHROMA_2X2
-      { AOM_ICDF(20480), AOM_ICDF(24560), AOM_ICDF(28664), AOM_ICDF(32768), 0 },
-#endif
-      { AOM_ICDF(20480), AOM_ICDF(24560), AOM_ICDF(28664), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(22528), AOM_ICDF(25928), AOM_ICDF(29348), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(24576), AOM_ICDF(27296), AOM_ICDF(30032), AOM_ICDF(32768), 0 },
-    };
-#endif  // CONFIG_MRC_TX
-#endif  // !CONFIG_EXT_TX
-
-#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-static const aom_cdf_prob
-    default_intra_filter_cdf[INTRA_FILTERS + 1][CDF_SIZE(INTRA_FILTERS)] = {
-      { AOM_ICDF(12544), AOM_ICDF(17521), AOM_ICDF(21095), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(12544), AOM_ICDF(19022), AOM_ICDF(23318), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(12032), AOM_ICDF(17297), AOM_ICDF(23522), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(6272), AOM_ICDF(8860), AOM_ICDF(11101), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(9216), AOM_ICDF(12712), AOM_ICDF(16629), AOM_ICDF(32768), 0 },
-    };
-#endif  // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-
-#if CONFIG_CFL
-static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = {
-  AOM_ICDF(1892),  AOM_ICDF(2229),  AOM_ICDF(11464),
-  AOM_ICDF(14116), AOM_ICDF(25661), AOM_ICDF(26409),
-  AOM_ICDF(32508), AOM_ICDF(32768), 0
-};
-
-static const aom_cdf_prob
-    default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = {
-      { AOM_ICDF(16215), AOM_ICDF(27740), AOM_ICDF(31726), AOM_ICDF(32606),
-        AOM_ICDF(32736), AOM_ICDF(32751), AOM_ICDF(32757), AOM_ICDF(32759),
-        AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-        AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(15213), AOM_ICDF(24615), AOM_ICDF(29704), AOM_ICDF(31974),
-        AOM_ICDF(32545), AOM_ICDF(32673), AOM_ICDF(32713), AOM_ICDF(32746),
-        AOM_ICDF(32753), AOM_ICDF(32756), AOM_ICDF(32758), AOM_ICDF(32761),
-        AOM_ICDF(32763), AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(13250), AOM_ICDF(24677), AOM_ICDF(29113), AOM_ICDF(31666),
-        AOM_ICDF(32408), AOM_ICDF(32578), AOM_ICDF(32628), AOM_ICDF(32711),
-        AOM_ICDF(32730), AOM_ICDF(32738), AOM_ICDF(32744), AOM_ICDF(32749),
-        AOM_ICDF(32752), AOM_ICDF(32756), AOM_ICDF(32759), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(24593), AOM_ICDF(30787), AOM_ICDF(32062), AOM_ICDF(32495),
-        AOM_ICDF(32656), AOM_ICDF(32707), AOM_ICDF(32735), AOM_ICDF(32747),
-        AOM_ICDF(32752), AOM_ICDF(32757), AOM_ICDF(32760), AOM_ICDF(32763),
-        AOM_ICDF(32764), AOM_ICDF(32765), AOM_ICDF(32767), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(19883), AOM_ICDF(27419), AOM_ICDF(30100), AOM_ICDF(31392),
-        AOM_ICDF(31896), AOM_ICDF(32184), AOM_ICDF(32299), AOM_ICDF(32511),
-        AOM_ICDF(32568), AOM_ICDF(32602), AOM_ICDF(32628), AOM_ICDF(32664),
-        AOM_ICDF(32680), AOM_ICDF(32691), AOM_ICDF(32708), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(15939), AOM_ICDF(24151), AOM_ICDF(27754), AOM_ICDF(29680),
-        AOM_ICDF(30651), AOM_ICDF(31267), AOM_ICDF(31527), AOM_ICDF(31868),
-        AOM_ICDF(32001), AOM_ICDF(32090), AOM_ICDF(32181), AOM_ICDF(32284),
-        AOM_ICDF(32314), AOM_ICDF(32366), AOM_ICDF(32486), AOM_ICDF(32768), 0 }
-    };
-#endif
-
-#if CONFIG_KF_CTX
-// TODO(jingning): This initial models are copied directly from the entries
-// from the original table. The copied indexes are (0, 0), (0, 1), .. (4, 4).
-// It is possible to re-train this model and bring back the 0.14% loss in CIF
-// set key frame coding. This reduction in context model does not change the
-// key frame coding stats for mid and high resolution sets.
-const aom_cdf_prob
-    default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE(
-        INTRA_MODES)] = {
-      {
-          {
-              AOM_ICDF(14208), AOM_ICDF(17049), AOM_ICDF(20482),
-              AOM_ICDF(21400), AOM_ICDF(22520), AOM_ICDF(23261),
-              AOM_ICDF(23963), AOM_ICDF(25010), AOM_ICDF(25828),
-              AOM_ICDF(28398), AOM_ICDF(29394), AOM_ICDF(30738),
-              AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(10496), AOM_ICDF(18295), AOM_ICDF(19872),
-              AOM_ICDF(20945), AOM_ICDF(21933), AOM_ICDF(22818),
-              AOM_ICDF(23334), AOM_ICDF(24033), AOM_ICDF(24996),
-              AOM_ICDF(27652), AOM_ICDF(29060), AOM_ICDF(30071),
-              AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(5120), AOM_ICDF(6461), AOM_ICDF(19840), AOM_ICDF(20310),
-              AOM_ICDF(21151), AOM_ICDF(21506), AOM_ICDF(22535),
-              AOM_ICDF(23900), AOM_ICDF(24281), AOM_ICDF(26958),
-              AOM_ICDF(27680), AOM_ICDF(29636), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(12544), AOM_ICDF(15177), AOM_ICDF(17666),
-              AOM_ICDF(19855), AOM_ICDF(21147), AOM_ICDF(22017),
-              AOM_ICDF(22797), AOM_ICDF(24514), AOM_ICDF(25779),
-              AOM_ICDF(28716), AOM_ICDF(29772), AOM_ICDF(31267),
-              AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(7552), AOM_ICDF(9909), AOM_ICDF(11908), AOM_ICDF(13141),
-              AOM_ICDF(18765), AOM_ICDF(22029), AOM_ICDF(23872),
-              AOM_ICDF(24920), AOM_ICDF(25674), AOM_ICDF(29031),
-              AOM_ICDF(30244), AOM_ICDF(31684), AOM_ICDF(32768), 0,
-          },
-      },
-      {
-          {
-              AOM_ICDF(3968), AOM_ICDF(17613), AOM_ICDF(19125), AOM_ICDF(19550),
-              AOM_ICDF(20305), AOM_ICDF(21908), AOM_ICDF(22274),
-              AOM_ICDF(22719), AOM_ICDF(23959), AOM_ICDF(26970),
-              AOM_ICDF(29013), AOM_ICDF(29843), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(3072), AOM_ICDF(21231), AOM_ICDF(21863), AOM_ICDF(22306),
-              AOM_ICDF(22674), AOM_ICDF(23414), AOM_ICDF(23517),
-              AOM_ICDF(23798), AOM_ICDF(24770), AOM_ICDF(27032),
-              AOM_ICDF(29016), AOM_ICDF(29636), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(2560), AOM_ICDF(9825), AOM_ICDF(15681), AOM_ICDF(16370),
-              AOM_ICDF(17054), AOM_ICDF(17687), AOM_ICDF(18236),
-              AOM_ICDF(19273), AOM_ICDF(20311), AOM_ICDF(24863),
-              AOM_ICDF(26825), AOM_ICDF(28756), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(6912), AOM_ICDF(15140), AOM_ICDF(16485), AOM_ICDF(18364),
-              AOM_ICDF(19181), AOM_ICDF(20394), AOM_ICDF(20663),
-              AOM_ICDF(22098), AOM_ICDF(23936), AOM_ICDF(27555),
-              AOM_ICDF(29704), AOM_ICDF(30849), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(2944), AOM_ICDF(13101), AOM_ICDF(14006), AOM_ICDF(14974),
-              AOM_ICDF(17818), AOM_ICDF(21093), AOM_ICDF(21930),
-              AOM_ICDF(22566), AOM_ICDF(24137), AOM_ICDF(27732),
-              AOM_ICDF(29814), AOM_ICDF(30904), AOM_ICDF(32768), 0,
-          },
-      },
-      {
-          {
-              AOM_ICDF(11392), AOM_ICDF(12961), AOM_ICDF(20901),
-              AOM_ICDF(21544), AOM_ICDF(22490), AOM_ICDF(22928),
-              AOM_ICDF(23888), AOM_ICDF(25214), AOM_ICDF(25777),
-              AOM_ICDF(28256), AOM_ICDF(29102), AOM_ICDF(30513),
-              AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(8064), AOM_ICDF(13595), AOM_ICDF(18888), AOM_ICDF(19616),
-              AOM_ICDF(20765), AOM_ICDF(21454), AOM_ICDF(21990),
-              AOM_ICDF(23103), AOM_ICDF(23980), AOM_ICDF(26772),
-              AOM_ICDF(28070), AOM_ICDF(29197), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(4352), AOM_ICDF(5059), AOM_ICDF(21705), AOM_ICDF(22099),
-              AOM_ICDF(22703), AOM_ICDF(22846), AOM_ICDF(23679),
-              AOM_ICDF(25469), AOM_ICDF(25728), AOM_ICDF(27919),
-              AOM_ICDF(28484), AOM_ICDF(30215), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(10752), AOM_ICDF(12277), AOM_ICDF(16471),
-              AOM_ICDF(18276), AOM_ICDF(19443), AOM_ICDF(19917),
-              AOM_ICDF(21158), AOM_ICDF(23881), AOM_ICDF(24892),
-              AOM_ICDF(27709), AOM_ICDF(28771), AOM_ICDF(30274),
-              AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(8320), AOM_ICDF(10000), AOM_ICDF(14147), AOM_ICDF(15330),
-              AOM_ICDF(19197), AOM_ICDF(20923), AOM_ICDF(22954),
-              AOM_ICDF(24541), AOM_ICDF(25285), AOM_ICDF(28407),
-              AOM_ICDF(29431), AOM_ICDF(30953), AOM_ICDF(32768), 0,
-          },
-      },
-      {
-          {
-              AOM_ICDF(10240), AOM_ICDF(12819), AOM_ICDF(15545),
-              AOM_ICDF(18248), AOM_ICDF(19779), AOM_ICDF(20932),
-              AOM_ICDF(21899), AOM_ICDF(23377), AOM_ICDF(25448),
-              AOM_ICDF(28730), AOM_ICDF(29936), AOM_ICDF(31536),
-              AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(7552), AOM_ICDF(15309), AOM_ICDF(16645), AOM_ICDF(19760),
-              AOM_ICDF(20653), AOM_ICDF(21650), AOM_ICDF(22221),
-              AOM_ICDF(23273), AOM_ICDF(25509), AOM_ICDF(28683),
-              AOM_ICDF(30153), AOM_ICDF(31192), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(5248), AOM_ICDF(6840), AOM_ICDF(16129), AOM_ICDF(17940),
-              AOM_ICDF(19069), AOM_ICDF(19660), AOM_ICDF(20588),
-              AOM_ICDF(22760), AOM_ICDF(23927), AOM_ICDF(27538),
-              AOM_ICDF(28397), AOM_ICDF(30725), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(11008), AOM_ICDF(11903), AOM_ICDF(13794),
-              AOM_ICDF(21320), AOM_ICDF(21931), AOM_ICDF(22310),
-              AOM_ICDF(22546), AOM_ICDF(25375), AOM_ICDF(27347),
-              AOM_ICDF(29800), AOM_ICDF(30761), AOM_ICDF(31833),
-              AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(6272), AOM_ICDF(8678), AOM_ICDF(10313), AOM_ICDF(13073),
-              AOM_ICDF(16823), AOM_ICDF(19980), AOM_ICDF(21520),
-              AOM_ICDF(23242), AOM_ICDF(25344), AOM_ICDF(28797),
-              AOM_ICDF(30405), AOM_ICDF(31940), AOM_ICDF(32768), 0,
-          },
-      },
-      {
-          {
-              AOM_ICDF(7296), AOM_ICDF(9304), AOM_ICDF(11772), AOM_ICDF(12529),
-              AOM_ICDF(18014), AOM_ICDF(20418), AOM_ICDF(23076),
-              AOM_ICDF(24662), AOM_ICDF(25549), AOM_ICDF(29074),
-              AOM_ICDF(30392), AOM_ICDF(31773), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(7168), AOM_ICDF(11687), AOM_ICDF(13541), AOM_ICDF(14431),
-              AOM_ICDF(18214), AOM_ICDF(20761), AOM_ICDF(22269),
-              AOM_ICDF(23320), AOM_ICDF(24633), AOM_ICDF(28339),
-              AOM_ICDF(30193), AOM_ICDF(31268), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(3584), AOM_ICDF(4428), AOM_ICDF(13496), AOM_ICDF(14189),
-              AOM_ICDF(17372), AOM_ICDF(18617), AOM_ICDF(20609),
-              AOM_ICDF(22615), AOM_ICDF(23270), AOM_ICDF(27280),
-              AOM_ICDF(28305), AOM_ICDF(30602), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(7424), AOM_ICDF(8834), AOM_ICDF(10499), AOM_ICDF(14357),
-              AOM_ICDF(17671), AOM_ICDF(19150), AOM_ICDF(20460),
-              AOM_ICDF(23235), AOM_ICDF(24391), AOM_ICDF(28351),
-              AOM_ICDF(29843), AOM_ICDF(31481), AOM_ICDF(32768), 0,
-          },
-          {
-              AOM_ICDF(4480), AOM_ICDF(5888), AOM_ICDF(7093), AOM_ICDF(7902),
-              AOM_ICDF(18290), AOM_ICDF(22123), AOM_ICDF(24511),
-              AOM_ICDF(25532), AOM_ICDF(26360), AOM_ICDF(29653),
-              AOM_ICDF(30954), AOM_ICDF(32215), AOM_ICDF(32768), 0,
-          },
-      },
-    };
-#else
-const aom_cdf_prob default_kf_y_mode_cdf[INTRA_MODES][INTRA_MODES][CDF_SIZE(
-    INTRA_MODES)] = {
-#if CONFIG_SMOOTH_HV
-  {
-      {
-          AOM_ICDF(14208), AOM_ICDF(17049), AOM_ICDF(20482), AOM_ICDF(21400),
-          AOM_ICDF(22520), AOM_ICDF(23261), AOM_ICDF(23963), AOM_ICDF(25010),
-          AOM_ICDF(25828), AOM_ICDF(28398), AOM_ICDF(29394), AOM_ICDF(30738),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10496), AOM_ICDF(18295), AOM_ICDF(19872), AOM_ICDF(20945),
-          AOM_ICDF(21933), AOM_ICDF(22818), AOM_ICDF(23334), AOM_ICDF(24033),
-          AOM_ICDF(24996), AOM_ICDF(27652), AOM_ICDF(29060), AOM_ICDF(30071),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5120), AOM_ICDF(6461), AOM_ICDF(19840), AOM_ICDF(20310),
-          AOM_ICDF(21151), AOM_ICDF(21506), AOM_ICDF(22535), AOM_ICDF(23900),
-          AOM_ICDF(24281), AOM_ICDF(26958), AOM_ICDF(27680), AOM_ICDF(29636),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(12544), AOM_ICDF(15177), AOM_ICDF(17666), AOM_ICDF(19855),
-          AOM_ICDF(21147), AOM_ICDF(22017), AOM_ICDF(22797), AOM_ICDF(24514),
-          AOM_ICDF(25779), AOM_ICDF(28716), AOM_ICDF(29772), AOM_ICDF(31267),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7552), AOM_ICDF(9909), AOM_ICDF(11908), AOM_ICDF(13141),
-          AOM_ICDF(18765), AOM_ICDF(22029), AOM_ICDF(23872), AOM_ICDF(24920),
-          AOM_ICDF(25674), AOM_ICDF(29031), AOM_ICDF(30244), AOM_ICDF(31684),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11008), AOM_ICDF(15004), AOM_ICDF(16534), AOM_ICDF(18158),
-          AOM_ICDF(21515), AOM_ICDF(26668), AOM_ICDF(27834), AOM_ICDF(28735),
-          AOM_ICDF(30471), AOM_ICDF(30839), AOM_ICDF(30969), AOM_ICDF(31068),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6272), AOM_ICDF(7963), AOM_ICDF(11944), AOM_ICDF(12780),
-          AOM_ICDF(17944), AOM_ICDF(19198), AOM_ICDF(24071), AOM_ICDF(25295),
-          AOM_ICDF(25834), AOM_ICDF(29014), AOM_ICDF(29949), AOM_ICDF(31733),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8192), AOM_ICDF(10189), AOM_ICDF(14596), AOM_ICDF(15680),
-          AOM_ICDF(17143), AOM_ICDF(17909), AOM_ICDF(19201), AOM_ICDF(23711),
-          AOM_ICDF(24503), AOM_ICDF(28207), AOM_ICDF(29338), AOM_ICDF(31424),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10752), AOM_ICDF(13199), AOM_ICDF(15048), AOM_ICDF(17151),
-          AOM_ICDF(18445), AOM_ICDF(19604), AOM_ICDF(20363), AOM_ICDF(21782),
-          AOM_ICDF(24311), AOM_ICDF(28026), AOM_ICDF(29517), AOM_ICDF(30962),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7424), AOM_ICDF(10301), AOM_ICDF(13245), AOM_ICDF(14307),
-          AOM_ICDF(16021), AOM_ICDF(16257), AOM_ICDF(17265), AOM_ICDF(18739),
-          AOM_ICDF(20080), AOM_ICDF(26066), AOM_ICDF(28325), AOM_ICDF(31184),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6528), AOM_ICDF(10893), AOM_ICDF(13773), AOM_ICDF(14824),
-          AOM_ICDF(16540), AOM_ICDF(16926), AOM_ICDF(17748), AOM_ICDF(18876),
-          AOM_ICDF(20396), AOM_ICDF(25974), AOM_ICDF(28795), AOM_ICDF(30820),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8704), AOM_ICDF(11005), AOM_ICDF(14320), AOM_ICDF(15349),
-          AOM_ICDF(16746), AOM_ICDF(16884), AOM_ICDF(17887), AOM_ICDF(19304),
-          AOM_ICDF(20265), AOM_ICDF(26115), AOM_ICDF(27672), AOM_ICDF(31358),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6272), AOM_ICDF(9504), AOM_ICDF(15437), AOM_ICDF(16399),
-          AOM_ICDF(17355), AOM_ICDF(17948), AOM_ICDF(18814), AOM_ICDF(20270),
-          AOM_ICDF(21134), AOM_ICDF(23690), AOM_ICDF(24759), AOM_ICDF(26454),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(3968), AOM_ICDF(17613), AOM_ICDF(19125), AOM_ICDF(19550),
-          AOM_ICDF(20305), AOM_ICDF(21908), AOM_ICDF(22274), AOM_ICDF(22719),
-          AOM_ICDF(23959), AOM_ICDF(26970), AOM_ICDF(29013), AOM_ICDF(29843),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3072), AOM_ICDF(21231), AOM_ICDF(21863), AOM_ICDF(22306),
-          AOM_ICDF(22674), AOM_ICDF(23414), AOM_ICDF(23517), AOM_ICDF(23798),
-          AOM_ICDF(24770), AOM_ICDF(27032), AOM_ICDF(29016), AOM_ICDF(29636),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2560), AOM_ICDF(9825), AOM_ICDF(15681), AOM_ICDF(16370),
-          AOM_ICDF(17054), AOM_ICDF(17687), AOM_ICDF(18236), AOM_ICDF(19273),
-          AOM_ICDF(20311), AOM_ICDF(24863), AOM_ICDF(26825), AOM_ICDF(28756),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6912), AOM_ICDF(15140), AOM_ICDF(16485), AOM_ICDF(18364),
-          AOM_ICDF(19181), AOM_ICDF(20394), AOM_ICDF(20663), AOM_ICDF(22098),
-          AOM_ICDF(23936), AOM_ICDF(27555), AOM_ICDF(29704), AOM_ICDF(30849),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2944), AOM_ICDF(13101), AOM_ICDF(14006), AOM_ICDF(14974),
-          AOM_ICDF(17818), AOM_ICDF(21093), AOM_ICDF(21930), AOM_ICDF(22566),
-          AOM_ICDF(24137), AOM_ICDF(27732), AOM_ICDF(29814), AOM_ICDF(30904),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4352), AOM_ICDF(17824), AOM_ICDF(18715), AOM_ICDF(19632),
-          AOM_ICDF(21519), AOM_ICDF(26341), AOM_ICDF(26922), AOM_ICDF(27575),
-          AOM_ICDF(29863), AOM_ICDF(30432), AOM_ICDF(30769), AOM_ICDF(30881),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2944), AOM_ICDF(11971), AOM_ICDF(13509), AOM_ICDF(14295),
-          AOM_ICDF(17202), AOM_ICDF(19005), AOM_ICDF(21605), AOM_ICDF(22458),
-          AOM_ICDF(23839), AOM_ICDF(27774), AOM_ICDF(29492), AOM_ICDF(30787),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4224), AOM_ICDF(13072), AOM_ICDF(15288), AOM_ICDF(16406),
-          AOM_ICDF(17285), AOM_ICDF(18362), AOM_ICDF(19003), AOM_ICDF(21378),
-          AOM_ICDF(22942), AOM_ICDF(27093), AOM_ICDF(29381), AOM_ICDF(30872),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5120), AOM_ICDF(15375), AOM_ICDF(16188), AOM_ICDF(17415),
-          AOM_ICDF(18183), AOM_ICDF(19756), AOM_ICDF(20030), AOM_ICDF(20883),
-          AOM_ICDF(23935), AOM_ICDF(27428), AOM_ICDF(29627), AOM_ICDF(30608),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2816), AOM_ICDF(14999), AOM_ICDF(16352), AOM_ICDF(16969),
-          AOM_ICDF(17836), AOM_ICDF(18125), AOM_ICDF(18514), AOM_ICDF(19181),
-          AOM_ICDF(20650), AOM_ICDF(25773), AOM_ICDF(29172), AOM_ICDF(30662),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2560), AOM_ICDF(16158), AOM_ICDF(17320), AOM_ICDF(17839),
-          AOM_ICDF(18545), AOM_ICDF(18848), AOM_ICDF(19130), AOM_ICDF(19599),
-          AOM_ICDF(20863), AOM_ICDF(25449), AOM_ICDF(29304), AOM_ICDF(30408),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3328), AOM_ICDF(15146), AOM_ICDF(16880), AOM_ICDF(17523),
-          AOM_ICDF(18340), AOM_ICDF(18563), AOM_ICDF(18896), AOM_ICDF(19582),
-          AOM_ICDF(20944), AOM_ICDF(25914), AOM_ICDF(28759), AOM_ICDF(30583),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2560), AOM_ICDF(16618), AOM_ICDF(18460), AOM_ICDF(19207),
-          AOM_ICDF(19654), AOM_ICDF(20276), AOM_ICDF(20529), AOM_ICDF(21179),
-          AOM_ICDF(22355), AOM_ICDF(25423), AOM_ICDF(27696), AOM_ICDF(28638),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(11392), AOM_ICDF(12961), AOM_ICDF(20901), AOM_ICDF(21544),
-          AOM_ICDF(22490), AOM_ICDF(22928), AOM_ICDF(23888), AOM_ICDF(25214),
-          AOM_ICDF(25777), AOM_ICDF(28256), AOM_ICDF(29102), AOM_ICDF(30513),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8064), AOM_ICDF(13595), AOM_ICDF(18888), AOM_ICDF(19616),
-          AOM_ICDF(20765), AOM_ICDF(21454), AOM_ICDF(21990), AOM_ICDF(23103),
-          AOM_ICDF(23980), AOM_ICDF(26772), AOM_ICDF(28070), AOM_ICDF(29197),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4352), AOM_ICDF(5059), AOM_ICDF(21705), AOM_ICDF(22099),
-          AOM_ICDF(22703), AOM_ICDF(22846), AOM_ICDF(23679), AOM_ICDF(25469),
-          AOM_ICDF(25728), AOM_ICDF(27919), AOM_ICDF(28484), AOM_ICDF(30215),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10752), AOM_ICDF(12277), AOM_ICDF(16471), AOM_ICDF(18276),
-          AOM_ICDF(19443), AOM_ICDF(19917), AOM_ICDF(21158), AOM_ICDF(23881),
-          AOM_ICDF(24892), AOM_ICDF(27709), AOM_ICDF(28771), AOM_ICDF(30274),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8320), AOM_ICDF(10000), AOM_ICDF(14147), AOM_ICDF(15330),
-          AOM_ICDF(19197), AOM_ICDF(20923), AOM_ICDF(22954), AOM_ICDF(24541),
-          AOM_ICDF(25285), AOM_ICDF(28407), AOM_ICDF(29431), AOM_ICDF(30953),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11264), AOM_ICDF(14751), AOM_ICDF(18517), AOM_ICDF(20285),
-          AOM_ICDF(23172), AOM_ICDF(25970), AOM_ICDF(27312), AOM_ICDF(28684),
-          AOM_ICDF(29803), AOM_ICDF(30242), AOM_ICDF(30412), AOM_ICDF(30668),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6528), AOM_ICDF(7509), AOM_ICDF(14190), AOM_ICDF(14953),
-          AOM_ICDF(17905), AOM_ICDF(18452), AOM_ICDF(23074), AOM_ICDF(24910),
-          AOM_ICDF(25374), AOM_ICDF(28605), AOM_ICDF(29542), AOM_ICDF(31640),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6784), AOM_ICDF(7644), AOM_ICDF(15953), AOM_ICDF(17055),
-          AOM_ICDF(17945), AOM_ICDF(18242), AOM_ICDF(19351), AOM_ICDF(24705),
-          AOM_ICDF(25365), AOM_ICDF(28466), AOM_ICDF(29334), AOM_ICDF(31245),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8192), AOM_ICDF(9802), AOM_ICDF(14519), AOM_ICDF(15740),
-          AOM_ICDF(17351), AOM_ICDF(18084), AOM_ICDF(18962), AOM_ICDF(20908),
-          AOM_ICDF(22937), AOM_ICDF(26847), AOM_ICDF(28284), AOM_ICDF(29888),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5888), AOM_ICDF(7534), AOM_ICDF(14635), AOM_ICDF(15436),
-          AOM_ICDF(16710), AOM_ICDF(16830), AOM_ICDF(18000), AOM_ICDF(19760),
-          AOM_ICDF(20571), AOM_ICDF(25777), AOM_ICDF(27649), AOM_ICDF(30668),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5248), AOM_ICDF(7364), AOM_ICDF(14858), AOM_ICDF(15545),
-          AOM_ICDF(16861), AOM_ICDF(17016), AOM_ICDF(17859), AOM_ICDF(19384),
-          AOM_ICDF(20237), AOM_ICDF(25239), AOM_ICDF(27715), AOM_ICDF(29865),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6656), AOM_ICDF(7989), AOM_ICDF(15472), AOM_ICDF(16265),
-          AOM_ICDF(17271), AOM_ICDF(17334), AOM_ICDF(18563), AOM_ICDF(20327),
-          AOM_ICDF(20916), AOM_ICDF(26173), AOM_ICDF(27350), AOM_ICDF(31034),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4480), AOM_ICDF(6411), AOM_ICDF(17828), AOM_ICDF(18611),
-          AOM_ICDF(19399), AOM_ICDF(19684), AOM_ICDF(20504), AOM_ICDF(21782),
-          AOM_ICDF(22335), AOM_ICDF(25286), AOM_ICDF(26352), AOM_ICDF(28016),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(10240), AOM_ICDF(12819), AOM_ICDF(15545), AOM_ICDF(18248),
-          AOM_ICDF(19779), AOM_ICDF(20932), AOM_ICDF(21899), AOM_ICDF(23377),
-          AOM_ICDF(25448), AOM_ICDF(28730), AOM_ICDF(29936), AOM_ICDF(31536),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7552), AOM_ICDF(15309), AOM_ICDF(16645), AOM_ICDF(19760),
-          AOM_ICDF(20653), AOM_ICDF(21650), AOM_ICDF(22221), AOM_ICDF(23273),
-          AOM_ICDF(25509), AOM_ICDF(28683), AOM_ICDF(30153), AOM_ICDF(31192),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5248), AOM_ICDF(6840), AOM_ICDF(16129), AOM_ICDF(17940),
-          AOM_ICDF(19069), AOM_ICDF(19660), AOM_ICDF(20588), AOM_ICDF(22760),
-          AOM_ICDF(23927), AOM_ICDF(27538), AOM_ICDF(28397), AOM_ICDF(30725),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11008), AOM_ICDF(11903), AOM_ICDF(13794), AOM_ICDF(21320),
-          AOM_ICDF(21931), AOM_ICDF(22310), AOM_ICDF(22546), AOM_ICDF(25375),
-          AOM_ICDF(27347), AOM_ICDF(29800), AOM_ICDF(30761), AOM_ICDF(31833),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6272), AOM_ICDF(8678), AOM_ICDF(10313), AOM_ICDF(13073),
-          AOM_ICDF(16823), AOM_ICDF(19980), AOM_ICDF(21520), AOM_ICDF(23242),
-          AOM_ICDF(25344), AOM_ICDF(28797), AOM_ICDF(30405), AOM_ICDF(31940),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7424), AOM_ICDF(10835), AOM_ICDF(12653), AOM_ICDF(16345),
-          AOM_ICDF(19574), AOM_ICDF(24868), AOM_ICDF(25937), AOM_ICDF(27299),
-          AOM_ICDF(31104), AOM_ICDF(31448), AOM_ICDF(31580), AOM_ICDF(31679),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4992), AOM_ICDF(6458), AOM_ICDF(9945), AOM_ICDF(11961),
-          AOM_ICDF(16136), AOM_ICDF(17677), AOM_ICDF(20946), AOM_ICDF(23042),
-          AOM_ICDF(24475), AOM_ICDF(28304), AOM_ICDF(29748), AOM_ICDF(31791),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9600), AOM_ICDF(11879), AOM_ICDF(14703), AOM_ICDF(17653),
-          AOM_ICDF(19176), AOM_ICDF(20185), AOM_ICDF(20880), AOM_ICDF(25194),
-          AOM_ICDF(26584), AOM_ICDF(29655), AOM_ICDF(30430), AOM_ICDF(32044),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9856), AOM_ICDF(11385), AOM_ICDF(13457), AOM_ICDF(18705),
-          AOM_ICDF(19577), AOM_ICDF(20266), AOM_ICDF(20746), AOM_ICDF(22207),
-          AOM_ICDF(26724), AOM_ICDF(29431), AOM_ICDF(30645), AOM_ICDF(31604),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6272), AOM_ICDF(9318), AOM_ICDF(11569), AOM_ICDF(14812),
-          AOM_ICDF(16351), AOM_ICDF(16619), AOM_ICDF(17537), AOM_ICDF(19596),
-          AOM_ICDF(22025), AOM_ICDF(27384), AOM_ICDF(29277), AOM_ICDF(31422),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5888), AOM_ICDF(9348), AOM_ICDF(11416), AOM_ICDF(14690),
-          AOM_ICDF(16254), AOM_ICDF(16633), AOM_ICDF(17457), AOM_ICDF(19031),
-          AOM_ICDF(21875), AOM_ICDF(27080), AOM_ICDF(29442), AOM_ICDF(31193),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6912), AOM_ICDF(9329), AOM_ICDF(12218), AOM_ICDF(15177),
-          AOM_ICDF(16806), AOM_ICDF(16998), AOM_ICDF(17991), AOM_ICDF(20005),
-          AOM_ICDF(21952), AOM_ICDF(27108), AOM_ICDF(28867), AOM_ICDF(31657),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5120), AOM_ICDF(9098), AOM_ICDF(13132), AOM_ICDF(17701),
-          AOM_ICDF(18739), AOM_ICDF(19534), AOM_ICDF(20415), AOM_ICDF(22136),
-          AOM_ICDF(24213), AOM_ICDF(27199), AOM_ICDF(28504), AOM_ICDF(29960),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(7296), AOM_ICDF(9304), AOM_ICDF(11772), AOM_ICDF(12529),
-          AOM_ICDF(18014), AOM_ICDF(20418), AOM_ICDF(23076), AOM_ICDF(24662),
-          AOM_ICDF(25549), AOM_ICDF(29074), AOM_ICDF(30392), AOM_ICDF(31773),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7168), AOM_ICDF(11687), AOM_ICDF(13541), AOM_ICDF(14431),
-          AOM_ICDF(18214), AOM_ICDF(20761), AOM_ICDF(22269), AOM_ICDF(23320),
-          AOM_ICDF(24633), AOM_ICDF(28339), AOM_ICDF(30193), AOM_ICDF(31268),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3584), AOM_ICDF(4428), AOM_ICDF(13496), AOM_ICDF(14189),
-          AOM_ICDF(17372), AOM_ICDF(18617), AOM_ICDF(20609), AOM_ICDF(22615),
-          AOM_ICDF(23270), AOM_ICDF(27280), AOM_ICDF(28305), AOM_ICDF(30602),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7424), AOM_ICDF(8834), AOM_ICDF(10499), AOM_ICDF(14357),
-          AOM_ICDF(17671), AOM_ICDF(19150), AOM_ICDF(20460), AOM_ICDF(23235),
-          AOM_ICDF(24391), AOM_ICDF(28351), AOM_ICDF(29843), AOM_ICDF(31481),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4480), AOM_ICDF(5888), AOM_ICDF(7093), AOM_ICDF(7902),
-          AOM_ICDF(18290), AOM_ICDF(22123), AOM_ICDF(24511), AOM_ICDF(25532),
-          AOM_ICDF(26360), AOM_ICDF(29653), AOM_ICDF(30954), AOM_ICDF(32215),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7296), AOM_ICDF(10176), AOM_ICDF(11780), AOM_ICDF(12824),
-          AOM_ICDF(19608), AOM_ICDF(25882), AOM_ICDF(28520), AOM_ICDF(29445),
-          AOM_ICDF(31106), AOM_ICDF(31573), AOM_ICDF(31775), AOM_ICDF(31872),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3840), AOM_ICDF(4833), AOM_ICDF(7551), AOM_ICDF(8449),
-          AOM_ICDF(16668), AOM_ICDF(18614), AOM_ICDF(23952), AOM_ICDF(25668),
-          AOM_ICDF(26721), AOM_ICDF(29888), AOM_ICDF(30697), AOM_ICDF(32090),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6528), AOM_ICDF(8011), AOM_ICDF(11083), AOM_ICDF(12427),
-          AOM_ICDF(16188), AOM_ICDF(17548), AOM_ICDF(19625), AOM_ICDF(23787),
-          AOM_ICDF(24792), AOM_ICDF(28649), AOM_ICDF(29872), AOM_ICDF(31845),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7168), AOM_ICDF(9170), AOM_ICDF(10655), AOM_ICDF(12439),
-          AOM_ICDF(15550), AOM_ICDF(18128), AOM_ICDF(19565), AOM_ICDF(21412),
-          AOM_ICDF(23355), AOM_ICDF(28007), AOM_ICDF(30080), AOM_ICDF(31568),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5504), AOM_ICDF(7636), AOM_ICDF(10174), AOM_ICDF(11056),
-          AOM_ICDF(15562), AOM_ICDF(16252), AOM_ICDF(17931), AOM_ICDF(19598),
-          AOM_ICDF(20967), AOM_ICDF(26845), AOM_ICDF(29149), AOM_ICDF(31490),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5248), AOM_ICDF(7810), AOM_ICDF(10004), AOM_ICDF(11015),
-          AOM_ICDF(15359), AOM_ICDF(16310), AOM_ICDF(17834), AOM_ICDF(19185),
-          AOM_ICDF(20903), AOM_ICDF(26728), AOM_ICDF(29585), AOM_ICDF(31478),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5376), AOM_ICDF(7322), AOM_ICDF(10592), AOM_ICDF(11694),
-          AOM_ICDF(15586), AOM_ICDF(16103), AOM_ICDF(17999), AOM_ICDF(19740),
-          AOM_ICDF(20950), AOM_ICDF(26894), AOM_ICDF(28912), AOM_ICDF(31591),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4608), AOM_ICDF(7276), AOM_ICDF(12153), AOM_ICDF(13388),
-          AOM_ICDF(16091), AOM_ICDF(17970), AOM_ICDF(19548), AOM_ICDF(21175),
-          AOM_ICDF(22481), AOM_ICDF(26543), AOM_ICDF(28212), AOM_ICDF(29908),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(6656), AOM_ICDF(12225), AOM_ICDF(14441), AOM_ICDF(15158),
-          AOM_ICDF(19600), AOM_ICDF(27127), AOM_ICDF(28221), AOM_ICDF(29186),
-          AOM_ICDF(30439), AOM_ICDF(30913), AOM_ICDF(31135), AOM_ICDF(31238),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6400), AOM_ICDF(14608), AOM_ICDF(15920), AOM_ICDF(16643),
-          AOM_ICDF(20149), AOM_ICDF(27328), AOM_ICDF(27896), AOM_ICDF(28672),
-          AOM_ICDF(30227), AOM_ICDF(30778), AOM_ICDF(31053), AOM_ICDF(31120),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3840), AOM_ICDF(6925), AOM_ICDF(14671), AOM_ICDF(15709),
-          AOM_ICDF(19830), AOM_ICDF(24216), AOM_ICDF(25507), AOM_ICDF(27459),
-          AOM_ICDF(28552), AOM_ICDF(29569), AOM_ICDF(29808), AOM_ICDF(30169),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9600), AOM_ICDF(13604), AOM_ICDF(15202), AOM_ICDF(17530),
-          AOM_ICDF(20878), AOM_ICDF(24279), AOM_ICDF(25278), AOM_ICDF(28255),
-          AOM_ICDF(30651), AOM_ICDF(31170), AOM_ICDF(31343), AOM_ICDF(31410),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4608), AOM_ICDF(8535), AOM_ICDF(9588), AOM_ICDF(10740),
-          AOM_ICDF(18673), AOM_ICDF(27664), AOM_ICDF(28826), AOM_ICDF(29828),
-          AOM_ICDF(31081), AOM_ICDF(31503), AOM_ICDF(31680), AOM_ICDF(31778),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4864), AOM_ICDF(10543), AOM_ICDF(11313), AOM_ICDF(12197),
-          AOM_ICDF(16785), AOM_ICDF(27858), AOM_ICDF(28556), AOM_ICDF(29480),
-          AOM_ICDF(30892), AOM_ICDF(31486), AOM_ICDF(31722), AOM_ICDF(31787),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3968), AOM_ICDF(7492), AOM_ICDF(10283), AOM_ICDF(11318),
-          AOM_ICDF(18486), AOM_ICDF(24061), AOM_ICDF(26761), AOM_ICDF(28456),
-          AOM_ICDF(30126), AOM_ICDF(30872), AOM_ICDF(31088), AOM_ICDF(31305),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6016), AOM_ICDF(10246), AOM_ICDF(12999), AOM_ICDF(15083),
-          AOM_ICDF(18769), AOM_ICDF(22398), AOM_ICDF(23584), AOM_ICDF(27098),
-          AOM_ICDF(29574), AOM_ICDF(30609), AOM_ICDF(30898), AOM_ICDF(31200),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7808), AOM_ICDF(13404), AOM_ICDF(14723), AOM_ICDF(16413),
-          AOM_ICDF(20186), AOM_ICDF(24739), AOM_ICDF(25407), AOM_ICDF(27106),
-          AOM_ICDF(29929), AOM_ICDF(30507), AOM_ICDF(30827), AOM_ICDF(30915),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2816), AOM_ICDF(6530), AOM_ICDF(8123), AOM_ICDF(9240),
-          AOM_ICDF(12536), AOM_ICDF(17593), AOM_ICDF(18754), AOM_ICDF(20319),
-          AOM_ICDF(22070), AOM_ICDF(27037), AOM_ICDF(29332), AOM_ICDF(30779),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2432), AOM_ICDF(6577), AOM_ICDF(8010), AOM_ICDF(9215),
-          AOM_ICDF(12657), AOM_ICDF(18898), AOM_ICDF(19588), AOM_ICDF(20953),
-          AOM_ICDF(22766), AOM_ICDF(27231), AOM_ICDF(29927), AOM_ICDF(31109),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3200), AOM_ICDF(6974), AOM_ICDF(9162), AOM_ICDF(10450),
-          AOM_ICDF(13818), AOM_ICDF(17757), AOM_ICDF(19119), AOM_ICDF(20842),
-          AOM_ICDF(22269), AOM_ICDF(27170), AOM_ICDF(29271), AOM_ICDF(30804),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4480), AOM_ICDF(10689), AOM_ICDF(15307), AOM_ICDF(16589),
-          AOM_ICDF(19738), AOM_ICDF(24416), AOM_ICDF(25332), AOM_ICDF(26541),
-          AOM_ICDF(28634), AOM_ICDF(29711), AOM_ICDF(29913), AOM_ICDF(30116),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(9600), AOM_ICDF(11066), AOM_ICDF(15832), AOM_ICDF(16515),
-          AOM_ICDF(18844), AOM_ICDF(19883), AOM_ICDF(24302), AOM_ICDF(25759),
-          AOM_ICDF(26358), AOM_ICDF(29290), AOM_ICDF(30262), AOM_ICDF(31682),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8832), AOM_ICDF(12814), AOM_ICDF(16171), AOM_ICDF(17041),
-          AOM_ICDF(19066), AOM_ICDF(20145), AOM_ICDF(22933), AOM_ICDF(24074),
-          AOM_ICDF(25006), AOM_ICDF(28115), AOM_ICDF(29722), AOM_ICDF(30991),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3840), AOM_ICDF(4486), AOM_ICDF(15821), AOM_ICDF(16330),
-          AOM_ICDF(18461), AOM_ICDF(18879), AOM_ICDF(22436), AOM_ICDF(25051),
-          AOM_ICDF(25443), AOM_ICDF(28637), AOM_ICDF(29396), AOM_ICDF(31412),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9856), AOM_ICDF(10863), AOM_ICDF(14050), AOM_ICDF(15920),
-          AOM_ICDF(18783), AOM_ICDF(19531), AOM_ICDF(22502), AOM_ICDF(24577),
-          AOM_ICDF(25361), AOM_ICDF(28559), AOM_ICDF(29600), AOM_ICDF(31336),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6528), AOM_ICDF(7620), AOM_ICDF(10182), AOM_ICDF(11199),
-          AOM_ICDF(17281), AOM_ICDF(19946), AOM_ICDF(23885), AOM_ICDF(25333),
-          AOM_ICDF(26130), AOM_ICDF(29425), AOM_ICDF(30332), AOM_ICDF(31948),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9728), AOM_ICDF(11821), AOM_ICDF(13954), AOM_ICDF(15233),
-          AOM_ICDF(19855), AOM_ICDF(24478), AOM_ICDF(28675), AOM_ICDF(29878),
-          AOM_ICDF(31238), AOM_ICDF(31741), AOM_ICDF(31874), AOM_ICDF(32048),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5120), AOM_ICDF(5753), AOM_ICDF(9673), AOM_ICDF(10149),
-          AOM_ICDF(14343), AOM_ICDF(15190), AOM_ICDF(24967), AOM_ICDF(26378),
-          AOM_ICDF(26841), AOM_ICDF(29749), AOM_ICDF(30527), AOM_ICDF(32120),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5888), AOM_ICDF(6606), AOM_ICDF(11498), AOM_ICDF(12538),
-          AOM_ICDF(14737), AOM_ICDF(15425), AOM_ICDF(19549), AOM_ICDF(24047),
-          AOM_ICDF(24765), AOM_ICDF(28711), AOM_ICDF(29822), AOM_ICDF(32138),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10368), AOM_ICDF(11757), AOM_ICDF(14126), AOM_ICDF(15474),
-          AOM_ICDF(18311), AOM_ICDF(19358), AOM_ICDF(21539), AOM_ICDF(23451),
-          AOM_ICDF(25034), AOM_ICDF(28791), AOM_ICDF(30035), AOM_ICDF(31280),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6016), AOM_ICDF(7623), AOM_ICDF(11378), AOM_ICDF(12248),
-          AOM_ICDF(15171), AOM_ICDF(15459), AOM_ICDF(18958), AOM_ICDF(20875),
-          AOM_ICDF(21955), AOM_ICDF(27411), AOM_ICDF(29196), AOM_ICDF(31723),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5760), AOM_ICDF(7469), AOM_ICDF(11399), AOM_ICDF(12323),
-          AOM_ICDF(15165), AOM_ICDF(15528), AOM_ICDF(18804), AOM_ICDF(20769),
-          AOM_ICDF(21767), AOM_ICDF(27129), AOM_ICDF(29435), AOM_ICDF(31502),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7040), AOM_ICDF(8295), AOM_ICDF(12298), AOM_ICDF(13035),
-          AOM_ICDF(15194), AOM_ICDF(15357), AOM_ICDF(18976), AOM_ICDF(21100),
-          AOM_ICDF(21805), AOM_ICDF(26978), AOM_ICDF(28342), AOM_ICDF(31763),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5632), AOM_ICDF(7465), AOM_ICDF(14220), AOM_ICDF(15035),
-          AOM_ICDF(17014), AOM_ICDF(18105), AOM_ICDF(21111), AOM_ICDF(23027),
-          AOM_ICDF(23934), AOM_ICDF(27207), AOM_ICDF(28293), AOM_ICDF(30330),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(11008), AOM_ICDF(13089), AOM_ICDF(17144), AOM_ICDF(18425),
-          AOM_ICDF(19954), AOM_ICDF(20624), AOM_ICDF(21658), AOM_ICDF(24229),
-          AOM_ICDF(25290), AOM_ICDF(28803), AOM_ICDF(29938), AOM_ICDF(31493),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9088), AOM_ICDF(14218), AOM_ICDF(16378), AOM_ICDF(17699),
-          AOM_ICDF(18935), AOM_ICDF(19928), AOM_ICDF(20524), AOM_ICDF(22781),
-          AOM_ICDF(24155), AOM_ICDF(27523), AOM_ICDF(29068), AOM_ICDF(30270),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6144), AOM_ICDF(7194), AOM_ICDF(17912), AOM_ICDF(18991),
-          AOM_ICDF(19879), AOM_ICDF(20151), AOM_ICDF(21170), AOM_ICDF(23938),
-          AOM_ICDF(24712), AOM_ICDF(27763), AOM_ICDF(28556), AOM_ICDF(30584),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10496), AOM_ICDF(11614), AOM_ICDF(13652), AOM_ICDF(16928),
-          AOM_ICDF(18425), AOM_ICDF(18967), AOM_ICDF(19724), AOM_ICDF(23817),
-          AOM_ICDF(25594), AOM_ICDF(28685), AOM_ICDF(29734), AOM_ICDF(30941),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7296), AOM_ICDF(8915), AOM_ICDF(11163), AOM_ICDF(13821),
-          AOM_ICDF(16951), AOM_ICDF(18507), AOM_ICDF(20180), AOM_ICDF(22423),
-          AOM_ICDF(24017), AOM_ICDF(28294), AOM_ICDF(29614), AOM_ICDF(31673),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9728), AOM_ICDF(13441), AOM_ICDF(15858), AOM_ICDF(18860),
-          AOM_ICDF(21713), AOM_ICDF(24478), AOM_ICDF(25995), AOM_ICDF(28233),
-          AOM_ICDF(30347), AOM_ICDF(30853), AOM_ICDF(31081), AOM_ICDF(31328),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6400), AOM_ICDF(7480), AOM_ICDF(11482), AOM_ICDF(13206),
-          AOM_ICDF(16199), AOM_ICDF(16908), AOM_ICDF(20436), AOM_ICDF(23507),
-          AOM_ICDF(24650), AOM_ICDF(28360), AOM_ICDF(29438), AOM_ICDF(31532),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9856), AOM_ICDF(10979), AOM_ICDF(13430), AOM_ICDF(15195),
-          AOM_ICDF(15957), AOM_ICDF(16350), AOM_ICDF(16871), AOM_ICDF(26198),
-          AOM_ICDF(26991), AOM_ICDF(29612), AOM_ICDF(30438), AOM_ICDF(31962),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8960), AOM_ICDF(10529), AOM_ICDF(12640), AOM_ICDF(15350),
-          AOM_ICDF(16987), AOM_ICDF(17859), AOM_ICDF(18590), AOM_ICDF(21400),
-          AOM_ICDF(23812), AOM_ICDF(28188), AOM_ICDF(29589), AOM_ICDF(31280),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7168), AOM_ICDF(8877), AOM_ICDF(12393), AOM_ICDF(14015),
-          AOM_ICDF(15655), AOM_ICDF(15794), AOM_ICDF(16814), AOM_ICDF(19923),
-          AOM_ICDF(21086), AOM_ICDF(26723), AOM_ICDF(28669), AOM_ICDF(31468),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6400), AOM_ICDF(8900), AOM_ICDF(12241), AOM_ICDF(13828),
-          AOM_ICDF(15513), AOM_ICDF(15671), AOM_ICDF(16500), AOM_ICDF(19257),
-          AOM_ICDF(20456), AOM_ICDF(25984), AOM_ICDF(28658), AOM_ICDF(31017),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7296), AOM_ICDF(8820), AOM_ICDF(12885), AOM_ICDF(14441),
-          AOM_ICDF(15813), AOM_ICDF(15911), AOM_ICDF(16954), AOM_ICDF(20026),
-          AOM_ICDF(20950), AOM_ICDF(26563), AOM_ICDF(28140), AOM_ICDF(31673),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6272), AOM_ICDF(8455), AOM_ICDF(13328), AOM_ICDF(15907),
-          AOM_ICDF(17026), AOM_ICDF(17464), AOM_ICDF(18267), AOM_ICDF(21436),
-          AOM_ICDF(22712), AOM_ICDF(26403), AOM_ICDF(27660), AOM_ICDF(29559),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(6784), AOM_ICDF(11216), AOM_ICDF(13269), AOM_ICDF(15677),
-          AOM_ICDF(16931), AOM_ICDF(18445), AOM_ICDF(19097), AOM_ICDF(20082),
-          AOM_ICDF(24298), AOM_ICDF(28236), AOM_ICDF(30118), AOM_ICDF(31448),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5760), AOM_ICDF(13240), AOM_ICDF(14110), AOM_ICDF(16966),
-          AOM_ICDF(17743), AOM_ICDF(18916), AOM_ICDF(19281), AOM_ICDF(19848),
-          AOM_ICDF(25552), AOM_ICDF(28646), AOM_ICDF(30444), AOM_ICDF(31291),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4352), AOM_ICDF(6870), AOM_ICDF(14660), AOM_ICDF(16597),
-          AOM_ICDF(17361), AOM_ICDF(18126), AOM_ICDF(18852), AOM_ICDF(20765),
-          AOM_ICDF(23526), AOM_ICDF(27670), AOM_ICDF(29096), AOM_ICDF(31214),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9472), AOM_ICDF(11736), AOM_ICDF(13172), AOM_ICDF(18192),
-          AOM_ICDF(19070), AOM_ICDF(19651), AOM_ICDF(19991), AOM_ICDF(21793),
-          AOM_ICDF(26005), AOM_ICDF(29291), AOM_ICDF(30500), AOM_ICDF(31767),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4480), AOM_ICDF(7252), AOM_ICDF(8651), AOM_ICDF(12379),
-          AOM_ICDF(14936), AOM_ICDF(17493), AOM_ICDF(18326), AOM_ICDF(19527),
-          AOM_ICDF(23655), AOM_ICDF(28031), AOM_ICDF(29960), AOM_ICDF(31773),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6016), AOM_ICDF(11561), AOM_ICDF(12864), AOM_ICDF(15793),
-          AOM_ICDF(18765), AOM_ICDF(23040), AOM_ICDF(23640), AOM_ICDF(24415),
-          AOM_ICDF(31040), AOM_ICDF(31473), AOM_ICDF(31740), AOM_ICDF(31827),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4480), AOM_ICDF(6825), AOM_ICDF(8810), AOM_ICDF(11269),
-          AOM_ICDF(14257), AOM_ICDF(15716), AOM_ICDF(18397), AOM_ICDF(20006),
-          AOM_ICDF(24020), AOM_ICDF(28230), AOM_ICDF(29780), AOM_ICDF(31773),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6912), AOM_ICDF(9466), AOM_ICDF(11717), AOM_ICDF(15159),
-          AOM_ICDF(16237), AOM_ICDF(17145), AOM_ICDF(17814), AOM_ICDF(21258),
-          AOM_ICDF(24754), AOM_ICDF(28864), AOM_ICDF(30313), AOM_ICDF(32061),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7680), AOM_ICDF(10517), AOM_ICDF(11381), AOM_ICDF(16202),
-          AOM_ICDF(16809), AOM_ICDF(17425), AOM_ICDF(17774), AOM_ICDF(18764),
-          AOM_ICDF(26842), AOM_ICDF(29600), AOM_ICDF(31073), AOM_ICDF(31886),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4992), AOM_ICDF(8626), AOM_ICDF(10531), AOM_ICDF(13103),
-          AOM_ICDF(14495), AOM_ICDF(14784), AOM_ICDF(15365), AOM_ICDF(16657),
-          AOM_ICDF(21051), AOM_ICDF(27011), AOM_ICDF(29685), AOM_ICDF(31574),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4736), AOM_ICDF(9433), AOM_ICDF(10981), AOM_ICDF(13494),
-          AOM_ICDF(14644), AOM_ICDF(15043), AOM_ICDF(15396), AOM_ICDF(16378),
-          AOM_ICDF(21506), AOM_ICDF(26869), AOM_ICDF(29824), AOM_ICDF(31454),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5760), AOM_ICDF(9526), AOM_ICDF(11905), AOM_ICDF(14476),
-          AOM_ICDF(15722), AOM_ICDF(16103), AOM_ICDF(16768), AOM_ICDF(18070),
-          AOM_ICDF(21630), AOM_ICDF(27401), AOM_ICDF(29592), AOM_ICDF(31818),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4480), AOM_ICDF(9947), AOM_ICDF(12386), AOM_ICDF(15909),
-          AOM_ICDF(16496), AOM_ICDF(17397), AOM_ICDF(17866), AOM_ICDF(18927),
-          AOM_ICDF(24408), AOM_ICDF(27750), AOM_ICDF(29614), AOM_ICDF(30889),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(7424), AOM_ICDF(10538), AOM_ICDF(14098), AOM_ICDF(14891),
-          AOM_ICDF(16486), AOM_ICDF(16756), AOM_ICDF(17607), AOM_ICDF(18952),
-          AOM_ICDF(20168), AOM_ICDF(26275), AOM_ICDF(28617), AOM_ICDF(31580),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5376), AOM_ICDF(13070), AOM_ICDF(14969), AOM_ICDF(15848),
-          AOM_ICDF(17197), AOM_ICDF(17447), AOM_ICDF(17954), AOM_ICDF(18747),
-          AOM_ICDF(20137), AOM_ICDF(25628), AOM_ICDF(28753), AOM_ICDF(30628),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3584), AOM_ICDF(5287), AOM_ICDF(16141), AOM_ICDF(16840),
-          AOM_ICDF(17670), AOM_ICDF(17760), AOM_ICDF(18532), AOM_ICDF(20387),
-          AOM_ICDF(21102), AOM_ICDF(26118), AOM_ICDF(27535), AOM_ICDF(30830),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7424), AOM_ICDF(9594), AOM_ICDF(11770), AOM_ICDF(14505),
-          AOM_ICDF(16234), AOM_ICDF(16365), AOM_ICDF(17201), AOM_ICDF(20286),
-          AOM_ICDF(22128), AOM_ICDF(27371), AOM_ICDF(29426), AOM_ICDF(31580),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5632), AOM_ICDF(8393), AOM_ICDF(10566), AOM_ICDF(11917),
-          AOM_ICDF(16025), AOM_ICDF(16697), AOM_ICDF(18123), AOM_ICDF(19541),
-          AOM_ICDF(21135), AOM_ICDF(27059), AOM_ICDF(29325), AOM_ICDF(31814),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3840), AOM_ICDF(7916), AOM_ICDF(9526), AOM_ICDF(11010),
-          AOM_ICDF(14114), AOM_ICDF(18169), AOM_ICDF(19510), AOM_ICDF(21031),
-          AOM_ICDF(23083), AOM_ICDF(27769), AOM_ICDF(29782), AOM_ICDF(31299),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5376), AOM_ICDF(7338), AOM_ICDF(10657), AOM_ICDF(11699),
-          AOM_ICDF(14780), AOM_ICDF(15070), AOM_ICDF(18291), AOM_ICDF(20170),
-          AOM_ICDF(21347), AOM_ICDF(26985), AOM_ICDF(28811), AOM_ICDF(31805),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5632), AOM_ICDF(7669), AOM_ICDF(11558), AOM_ICDF(12653),
-          AOM_ICDF(13962), AOM_ICDF(14116), AOM_ICDF(15074), AOM_ICDF(19886),
-          AOM_ICDF(21123), AOM_ICDF(26953), AOM_ICDF(28755), AOM_ICDF(31708),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6528), AOM_ICDF(9739), AOM_ICDF(11612), AOM_ICDF(13211),
-          AOM_ICDF(14992), AOM_ICDF(15237), AOM_ICDF(16016), AOM_ICDF(17677),
-          AOM_ICDF(20588), AOM_ICDF(26647), AOM_ICDF(29116), AOM_ICDF(31435),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5376), AOM_ICDF(8346), AOM_ICDF(11022), AOM_ICDF(11976),
-          AOM_ICDF(13541), AOM_ICDF(13749), AOM_ICDF(14520), AOM_ICDF(16173),
-          AOM_ICDF(17567), AOM_ICDF(25182), AOM_ICDF(28111), AOM_ICDF(31591),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4736), AOM_ICDF(8894), AOM_ICDF(11294), AOM_ICDF(12220),
-          AOM_ICDF(13753), AOM_ICDF(14029), AOM_ICDF(14645), AOM_ICDF(16065),
-          AOM_ICDF(17621), AOM_ICDF(24911), AOM_ICDF(28655), AOM_ICDF(31344),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5760), AOM_ICDF(8193), AOM_ICDF(11667), AOM_ICDF(12461),
-          AOM_ICDF(13880), AOM_ICDF(14040), AOM_ICDF(14946), AOM_ICDF(16537),
-          AOM_ICDF(17642), AOM_ICDF(25117), AOM_ICDF(27333), AOM_ICDF(31713),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4096), AOM_ICDF(8479), AOM_ICDF(13751), AOM_ICDF(14813),
-          AOM_ICDF(15994), AOM_ICDF(16157), AOM_ICDF(16905), AOM_ICDF(18314),
-          AOM_ICDF(19575), AOM_ICDF(25132), AOM_ICDF(27445), AOM_ICDF(30192),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(7936), AOM_ICDF(12263), AOM_ICDF(15558), AOM_ICDF(16331),
-          AOM_ICDF(17779), AOM_ICDF(18148), AOM_ICDF(18810), AOM_ICDF(19794),
-          AOM_ICDF(21046), AOM_ICDF(26644), AOM_ICDF(29417), AOM_ICDF(31507),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5376), AOM_ICDF(15025), AOM_ICDF(16457), AOM_ICDF(17074),
-          AOM_ICDF(18079), AOM_ICDF(18299), AOM_ICDF(18648), AOM_ICDF(19240),
-          AOM_ICDF(20612), AOM_ICDF(25687), AOM_ICDF(29392), AOM_ICDF(30842),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3840), AOM_ICDF(6037), AOM_ICDF(17465), AOM_ICDF(18089),
-          AOM_ICDF(18869), AOM_ICDF(18953), AOM_ICDF(19688), AOM_ICDF(21223),
-          AOM_ICDF(21816), AOM_ICDF(26562), AOM_ICDF(28195), AOM_ICDF(30621),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8448), AOM_ICDF(11255), AOM_ICDF(13307), AOM_ICDF(15676),
-          AOM_ICDF(17392), AOM_ICDF(17603), AOM_ICDF(18268), AOM_ICDF(20783),
-          AOM_ICDF(22646), AOM_ICDF(27628), AOM_ICDF(29737), AOM_ICDF(31628),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5760), AOM_ICDF(9119), AOM_ICDF(11015), AOM_ICDF(12269),
-          AOM_ICDF(16280), AOM_ICDF(17023), AOM_ICDF(18282), AOM_ICDF(19418),
-          AOM_ICDF(21325), AOM_ICDF(27309), AOM_ICDF(30004), AOM_ICDF(31818),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3968), AOM_ICDF(9094), AOM_ICDF(10606), AOM_ICDF(12007),
-          AOM_ICDF(14218), AOM_ICDF(18911), AOM_ICDF(20089), AOM_ICDF(20924),
-          AOM_ICDF(23587), AOM_ICDF(27808), AOM_ICDF(30253), AOM_ICDF(31305),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6016), AOM_ICDF(8627), AOM_ICDF(11201), AOM_ICDF(12200),
-          AOM_ICDF(15305), AOM_ICDF(15671), AOM_ICDF(18639), AOM_ICDF(20185),
-          AOM_ICDF(21627), AOM_ICDF(26990), AOM_ICDF(29449), AOM_ICDF(31723),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6272), AOM_ICDF(8768), AOM_ICDF(12320), AOM_ICDF(13296),
-          AOM_ICDF(14643), AOM_ICDF(14970), AOM_ICDF(15760), AOM_ICDF(20545),
-          AOM_ICDF(21863), AOM_ICDF(27473), AOM_ICDF(29535), AOM_ICDF(31836),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6912), AOM_ICDF(10905), AOM_ICDF(12656), AOM_ICDF(14084),
-          AOM_ICDF(15705), AOM_ICDF(16069), AOM_ICDF(16674), AOM_ICDF(17779),
-          AOM_ICDF(21041), AOM_ICDF(26586), AOM_ICDF(29539), AOM_ICDF(31253),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5248), AOM_ICDF(9672), AOM_ICDF(12113), AOM_ICDF(12871),
-          AOM_ICDF(14423), AOM_ICDF(14710), AOM_ICDF(15376), AOM_ICDF(16708),
-          AOM_ICDF(18092), AOM_ICDF(25260), AOM_ICDF(28991), AOM_ICDF(31585),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4736), AOM_ICDF(10789), AOM_ICDF(13029), AOM_ICDF(13750),
-          AOM_ICDF(15040), AOM_ICDF(15385), AOM_ICDF(15840), AOM_ICDF(16887),
-          AOM_ICDF(18393), AOM_ICDF(25230), AOM_ICDF(29558), AOM_ICDF(31454),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6016), AOM_ICDF(9916), AOM_ICDF(12938), AOM_ICDF(13741),
-          AOM_ICDF(15030), AOM_ICDF(15297), AOM_ICDF(16116), AOM_ICDF(17333),
-          AOM_ICDF(18672), AOM_ICDF(25954), AOM_ICDF(28498), AOM_ICDF(31618),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4608), AOM_ICDF(10266), AOM_ICDF(15450), AOM_ICDF(16299),
-          AOM_ICDF(17114), AOM_ICDF(17288), AOM_ICDF(17775), AOM_ICDF(18835),
-          AOM_ICDF(20227), AOM_ICDF(25199), AOM_ICDF(28098), AOM_ICDF(30018),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(7296), AOM_ICDF(9951), AOM_ICDF(14124), AOM_ICDF(14806),
-          AOM_ICDF(16181), AOM_ICDF(16377), AOM_ICDF(17485), AOM_ICDF(19069),
-          AOM_ICDF(20078), AOM_ICDF(26051), AOM_ICDF(27777), AOM_ICDF(31574),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5376), AOM_ICDF(13823), AOM_ICDF(15889), AOM_ICDF(16620),
-          AOM_ICDF(17709), AOM_ICDF(17881), AOM_ICDF(18327), AOM_ICDF(19140),
-          AOM_ICDF(20374), AOM_ICDF(25685), AOM_ICDF(28160), AOM_ICDF(30521),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3200), AOM_ICDF(4602), AOM_ICDF(16404), AOM_ICDF(17042),
-          AOM_ICDF(17780), AOM_ICDF(17829), AOM_ICDF(18706), AOM_ICDF(20608),
-          AOM_ICDF(21115), AOM_ICDF(25884), AOM_ICDF(26960), AOM_ICDF(30804),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7040), AOM_ICDF(9444), AOM_ICDF(11770), AOM_ICDF(14321),
-          AOM_ICDF(15951), AOM_ICDF(16074), AOM_ICDF(17033), AOM_ICDF(20352),
-          AOM_ICDF(22301), AOM_ICDF(27567), AOM_ICDF(29151), AOM_ICDF(31662),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6016), AOM_ICDF(8316), AOM_ICDF(10849), AOM_ICDF(12136),
-          AOM_ICDF(15860), AOM_ICDF(16430), AOM_ICDF(17935), AOM_ICDF(19659),
-          AOM_ICDF(21083), AOM_ICDF(26968), AOM_ICDF(28839), AOM_ICDF(31618),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3840), AOM_ICDF(7472), AOM_ICDF(9436), AOM_ICDF(11038),
-          AOM_ICDF(13625), AOM_ICDF(17596), AOM_ICDF(18959), AOM_ICDF(20543),
-          AOM_ICDF(22879), AOM_ICDF(27487), AOM_ICDF(29351), AOM_ICDF(31186),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5376), AOM_ICDF(7117), AOM_ICDF(11424), AOM_ICDF(12381),
-          AOM_ICDF(14823), AOM_ICDF(15053), AOM_ICDF(18656), AOM_ICDF(20818),
-          AOM_ICDF(21722), AOM_ICDF(27042), AOM_ICDF(28233), AOM_ICDF(31591),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5760), AOM_ICDF(7281), AOM_ICDF(11910), AOM_ICDF(12912),
-          AOM_ICDF(14229), AOM_ICDF(14391), AOM_ICDF(15474), AOM_ICDF(20113),
-          AOM_ICDF(21128), AOM_ICDF(26627), AOM_ICDF(28077), AOM_ICDF(31713),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6656), AOM_ICDF(9452), AOM_ICDF(11526), AOM_ICDF(13288),
-          AOM_ICDF(14861), AOM_ICDF(15062), AOM_ICDF(15909), AOM_ICDF(17695),
-          AOM_ICDF(20429), AOM_ICDF(26225), AOM_ICDF(28603), AOM_ICDF(31340),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5376), AOM_ICDF(7722), AOM_ICDF(10921), AOM_ICDF(11813),
-          AOM_ICDF(13222), AOM_ICDF(13348), AOM_ICDF(14211), AOM_ICDF(15976),
-          AOM_ICDF(17110), AOM_ICDF(24634), AOM_ICDF(27176), AOM_ICDF(31484),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4736), AOM_ICDF(8226), AOM_ICDF(11137), AOM_ICDF(11988),
-          AOM_ICDF(13518), AOM_ICDF(13706), AOM_ICDF(14332), AOM_ICDF(16016),
-          AOM_ICDF(17301), AOM_ICDF(24641), AOM_ICDF(27704), AOM_ICDF(31016),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5760), AOM_ICDF(7592), AOM_ICDF(11880), AOM_ICDF(12612),
-          AOM_ICDF(13738), AOM_ICDF(13813), AOM_ICDF(14681), AOM_ICDF(16392),
-          AOM_ICDF(17306), AOM_ICDF(24619), AOM_ICDF(26334), AOM_ICDF(31818),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4096), AOM_ICDF(8524), AOM_ICDF(14316), AOM_ICDF(15392),
-          AOM_ICDF(16295), AOM_ICDF(16433), AOM_ICDF(17197), AOM_ICDF(18718),
-          AOM_ICDF(19924), AOM_ICDF(25123), AOM_ICDF(26953), AOM_ICDF(29856),
-          AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(6528), AOM_ICDF(13383), AOM_ICDF(17642), AOM_ICDF(18342),
-          AOM_ICDF(19224), AOM_ICDF(20209), AOM_ICDF(20899), AOM_ICDF(21944),
-          AOM_ICDF(23137), AOM_ICDF(25966), AOM_ICDF(27429), AOM_ICDF(28463),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4480), AOM_ICDF(16901), AOM_ICDF(18876), AOM_ICDF(19560),
-          AOM_ICDF(20257), AOM_ICDF(20912), AOM_ICDF(21169), AOM_ICDF(21959),
-          AOM_ICDF(23036), AOM_ICDF(25781), AOM_ICDF(27676), AOM_ICDF(28569),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2688), AOM_ICDF(5337), AOM_ICDF(18178), AOM_ICDF(18829),
-          AOM_ICDF(19344), AOM_ICDF(19628), AOM_ICDF(20267), AOM_ICDF(22135),
-          AOM_ICDF(22671), AOM_ICDF(25817), AOM_ICDF(26914), AOM_ICDF(28773),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8192), AOM_ICDF(11378), AOM_ICDF(14742), AOM_ICDF(17269),
-          AOM_ICDF(18230), AOM_ICDF(19001), AOM_ICDF(19655), AOM_ICDF(22949),
-          AOM_ICDF(24337), AOM_ICDF(28025), AOM_ICDF(29503), AOM_ICDF(30848),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5120), AOM_ICDF(10133), AOM_ICDF(13144), AOM_ICDF(14374),
-          AOM_ICDF(17020), AOM_ICDF(18920), AOM_ICDF(20235), AOM_ICDF(21677),
-          AOM_ICDF(23142), AOM_ICDF(27131), AOM_ICDF(28671), AOM_ICDF(30284),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7424), AOM_ICDF(15588), AOM_ICDF(18431), AOM_ICDF(19723),
-          AOM_ICDF(21455), AOM_ICDF(24705), AOM_ICDF(25461), AOM_ICDF(26753),
-          AOM_ICDF(28923), AOM_ICDF(29475), AOM_ICDF(29729), AOM_ICDF(29897),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4224), AOM_ICDF(8689), AOM_ICDF(13024), AOM_ICDF(13658),
-          AOM_ICDF(16637), AOM_ICDF(17307), AOM_ICDF(20836), AOM_ICDF(22665),
-          AOM_ICDF(23673), AOM_ICDF(27015), AOM_ICDF(28310), AOM_ICDF(30203),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5120), AOM_ICDF(7896), AOM_ICDF(13618), AOM_ICDF(14900),
-          AOM_ICDF(15708), AOM_ICDF(16153), AOM_ICDF(16997), AOM_ICDF(23625),
-          AOM_ICDF(24466), AOM_ICDF(27719), AOM_ICDF(28892), AOM_ICDF(30500),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5760), AOM_ICDF(11305), AOM_ICDF(13669), AOM_ICDF(15462),
-          AOM_ICDF(16564), AOM_ICDF(17683), AOM_ICDF(18252), AOM_ICDF(20073),
-          AOM_ICDF(22917), AOM_ICDF(27005), AOM_ICDF(28923), AOM_ICDF(30236),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4224), AOM_ICDF(9510), AOM_ICDF(13787), AOM_ICDF(14587),
-          AOM_ICDF(15753), AOM_ICDF(15925), AOM_ICDF(16513), AOM_ICDF(18193),
-          AOM_ICDF(19490), AOM_ICDF(24944), AOM_ICDF(27482), AOM_ICDF(29757),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3840), AOM_ICDF(10052), AOM_ICDF(14106), AOM_ICDF(14887),
-          AOM_ICDF(15827), AOM_ICDF(15996), AOM_ICDF(16522), AOM_ICDF(17939),
-          AOM_ICDF(19204), AOM_ICDF(24508), AOM_ICDF(27661), AOM_ICDF(29491),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(4736), AOM_ICDF(9676), AOM_ICDF(14492), AOM_ICDF(15163),
-          AOM_ICDF(16179), AOM_ICDF(16390), AOM_ICDF(17133), AOM_ICDF(18905),
-          AOM_ICDF(19864), AOM_ICDF(25185), AOM_ICDF(27191), AOM_ICDF(30030),
-          AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(3584), AOM_ICDF(9370), AOM_ICDF(14746), AOM_ICDF(15820),
-          AOM_ICDF(16708), AOM_ICDF(17224), AOM_ICDF(17718), AOM_ICDF(19329),
-          AOM_ICDF(20405), AOM_ICDF(23541), AOM_ICDF(25258), AOM_ICDF(26726),
-          AOM_ICDF(32768), 0,
-      },
-  },
-#else
-  {
-      {
-          AOM_ICDF(15488), AOM_ICDF(18706), AOM_ICDF(22561), AOM_ICDF(23619),
-          AOM_ICDF(24954), AOM_ICDF(25782), AOM_ICDF(26710), AOM_ICDF(27861),
-          AOM_ICDF(28656), AOM_ICDF(30743), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11648), AOM_ICDF(18744), AOM_ICDF(20846), AOM_ICDF(22100),
-          AOM_ICDF(23332), AOM_ICDF(24337), AOM_ICDF(25093), AOM_ICDF(26104),
-          AOM_ICDF(27097), AOM_ICDF(29633), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8448), AOM_ICDF(10732), AOM_ICDF(22507), AOM_ICDF(23254),
-          AOM_ICDF(24382), AOM_ICDF(24876), AOM_ICDF(25827), AOM_ICDF(27488),
-          AOM_ICDF(28040), AOM_ICDF(30108), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(13568), AOM_ICDF(16981), AOM_ICDF(19885), AOM_ICDF(22014),
-          AOM_ICDF(23543), AOM_ICDF(24658), AOM_ICDF(25641), AOM_ICDF(27378),
-          AOM_ICDF(28625), AOM_ICDF(31043), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9600), AOM_ICDF(12225), AOM_ICDF(14408), AOM_ICDF(16033),
-          AOM_ICDF(19544), AOM_ICDF(22318), AOM_ICDF(23960), AOM_ICDF(25617),
-          AOM_ICDF(26522), AOM_ICDF(30596), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(12160), AOM_ICDF(15078), AOM_ICDF(16990), AOM_ICDF(18964),
-          AOM_ICDF(22725), AOM_ICDF(25793), AOM_ICDF(27133), AOM_ICDF(28447),
-          AOM_ICDF(30831), AOM_ICDF(30836), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9088), AOM_ICDF(11274), AOM_ICDF(15818), AOM_ICDF(16940),
-          AOM_ICDF(21178), AOM_ICDF(22338), AOM_ICDF(26171), AOM_ICDF(27754),
-          AOM_ICDF(28503), AOM_ICDF(31473), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10880), AOM_ICDF(13846), AOM_ICDF(18649), AOM_ICDF(20252),
-          AOM_ICDF(22157), AOM_ICDF(22992), AOM_ICDF(24396), AOM_ICDF(27581),
-          AOM_ICDF(28501), AOM_ICDF(31400), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11008), AOM_ICDF(13462), AOM_ICDF(15747), AOM_ICDF(18378),
-          AOM_ICDF(20085), AOM_ICDF(21663), AOM_ICDF(22766), AOM_ICDF(24635),
-          AOM_ICDF(27476), AOM_ICDF(30643), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10112), AOM_ICDF(13147), AOM_ICDF(16135), AOM_ICDF(17577),
-          AOM_ICDF(19681), AOM_ICDF(19689), AOM_ICDF(20856), AOM_ICDF(22374),
-          AOM_ICDF(24454), AOM_ICDF(30555), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8704), AOM_ICDF(12176), AOM_ICDF(17582), AOM_ICDF(18905),
-          AOM_ICDF(19994), AOM_ICDF(20669), AOM_ICDF(21635), AOM_ICDF(23564),
-          AOM_ICDF(24741), AOM_ICDF(27222), AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(8448), AOM_ICDF(18738), AOM_ICDF(21694), AOM_ICDF(22413),
-          AOM_ICDF(23358), AOM_ICDF(24675), AOM_ICDF(25193), AOM_ICDF(26119),
-          AOM_ICDF(27310), AOM_ICDF(30773), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6656), AOM_ICDF(22027), AOM_ICDF(23242), AOM_ICDF(23986),
-          AOM_ICDF(24529), AOM_ICDF(25363), AOM_ICDF(25646), AOM_ICDF(26087),
-          AOM_ICDF(27130), AOM_ICDF(30218), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7168), AOM_ICDF(13862), AOM_ICDF(21137), AOM_ICDF(22124),
-          AOM_ICDF(23036), AOM_ICDF(23803), AOM_ICDF(24458), AOM_ICDF(26390),
-          AOM_ICDF(27342), AOM_ICDF(30968), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9600), AOM_ICDF(17409), AOM_ICDF(19830), AOM_ICDF(21521),
-          AOM_ICDF(22580), AOM_ICDF(23726), AOM_ICDF(24377), AOM_ICDF(25679),
-          AOM_ICDF(27269), AOM_ICDF(30867), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6912), AOM_ICDF(15832), AOM_ICDF(17559), AOM_ICDF(18777),
-          AOM_ICDF(20425), AOM_ICDF(22719), AOM_ICDF(23447), AOM_ICDF(24952),
-          AOM_ICDF(26527), AOM_ICDF(30950), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7808), AOM_ICDF(18730), AOM_ICDF(20143), AOM_ICDF(21445),
-          AOM_ICDF(23347), AOM_ICDF(26267), AOM_ICDF(27229), AOM_ICDF(28315),
-          AOM_ICDF(30911), AOM_ICDF(30915), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6784), AOM_ICDF(14299), AOM_ICDF(17264), AOM_ICDF(18505),
-          AOM_ICDF(20765), AOM_ICDF(22440), AOM_ICDF(24331), AOM_ICDF(26038),
-          AOM_ICDF(27481), AOM_ICDF(31448), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8832), AOM_ICDF(15726), AOM_ICDF(19455), AOM_ICDF(20668),
-          AOM_ICDF(21607), AOM_ICDF(22655), AOM_ICDF(23384), AOM_ICDF(26356),
-          AOM_ICDF(27697), AOM_ICDF(31459), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8192), AOM_ICDF(17385), AOM_ICDF(18866), AOM_ICDF(20120),
-          AOM_ICDF(21273), AOM_ICDF(22853), AOM_ICDF(23470), AOM_ICDF(24881),
-          AOM_ICDF(27216), AOM_ICDF(31040), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6656), AOM_ICDF(16341), AOM_ICDF(18497), AOM_ICDF(19439),
-          AOM_ICDF(20706), AOM_ICDF(20711), AOM_ICDF(21234), AOM_ICDF(22307),
-          AOM_ICDF(23950), AOM_ICDF(30728), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6400), AOM_ICDF(17625), AOM_ICDF(20326), AOM_ICDF(21821),
-          AOM_ICDF(22568), AOM_ICDF(23415), AOM_ICDF(23854), AOM_ICDF(24896),
-          AOM_ICDF(26171), AOM_ICDF(29575), AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(12032), AOM_ICDF(14259), AOM_ICDF(22597), AOM_ICDF(23443),
-          AOM_ICDF(24581), AOM_ICDF(25079), AOM_ICDF(26399), AOM_ICDF(27862),
-          AOM_ICDF(28509), AOM_ICDF(30419), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9216), AOM_ICDF(14883), AOM_ICDF(20941), AOM_ICDF(21958),
-          AOM_ICDF(23597), AOM_ICDF(24328), AOM_ICDF(25208), AOM_ICDF(26590),
-          AOM_ICDF(27377), AOM_ICDF(29364), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6784), AOM_ICDF(8088), AOM_ICDF(24407), AOM_ICDF(25006),
-          AOM_ICDF(25777), AOM_ICDF(25950), AOM_ICDF(26882), AOM_ICDF(28811),
-          AOM_ICDF(29159), AOM_ICDF(30636), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11904), AOM_ICDF(14425), AOM_ICDF(18729), AOM_ICDF(20730),
-          AOM_ICDF(21998), AOM_ICDF(22686), AOM_ICDF(23856), AOM_ICDF(26580),
-          AOM_ICDF(27613), AOM_ICDF(29834), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10752), AOM_ICDF(12784), AOM_ICDF(16305), AOM_ICDF(17624),
-          AOM_ICDF(20320), AOM_ICDF(22450), AOM_ICDF(24380), AOM_ICDF(26773),
-          AOM_ICDF(27837), AOM_ICDF(30016), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10496), AOM_ICDF(14090), AOM_ICDF(18314), AOM_ICDF(20621),
-          AOM_ICDF(23539), AOM_ICDF(25261), AOM_ICDF(26953), AOM_ICDF(28692),
-          AOM_ICDF(30064), AOM_ICDF(30071), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8448), AOM_ICDF(10229), AOM_ICDF(16542), AOM_ICDF(17725),
-          AOM_ICDF(21504), AOM_ICDF(22332), AOM_ICDF(26006), AOM_ICDF(27895),
-          AOM_ICDF(28487), AOM_ICDF(31248), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9728), AOM_ICDF(11162), AOM_ICDF(19379), AOM_ICDF(20981),
-          AOM_ICDF(22356), AOM_ICDF(22926), AOM_ICDF(24318), AOM_ICDF(28364),
-          AOM_ICDF(29020), AOM_ICDF(31328), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9216), AOM_ICDF(10861), AOM_ICDF(14850), AOM_ICDF(16471),
-          AOM_ICDF(18611), AOM_ICDF(19674), AOM_ICDF(21009), AOM_ICDF(23454),
-          AOM_ICDF(26078), AOM_ICDF(29272), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7808), AOM_ICDF(10132), AOM_ICDF(17327), AOM_ICDF(18472),
-          AOM_ICDF(20126), AOM_ICDF(20132), AOM_ICDF(21599), AOM_ICDF(23338),
-          AOM_ICDF(24514), AOM_ICDF(29843), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6784), AOM_ICDF(9210), AOM_ICDF(19309), AOM_ICDF(20715),
-          AOM_ICDF(21833), AOM_ICDF(22262), AOM_ICDF(23353), AOM_ICDF(24942),
-          AOM_ICDF(25800), AOM_ICDF(28200), AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(12288), AOM_ICDF(15040), AOM_ICDF(18401), AOM_ICDF(21071),
-          AOM_ICDF(22800), AOM_ICDF(23945), AOM_ICDF(25274), AOM_ICDF(26939),
-          AOM_ICDF(28554), AOM_ICDF(31328), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9344), AOM_ICDF(17170), AOM_ICDF(19325), AOM_ICDF(22119),
-          AOM_ICDF(23284), AOM_ICDF(24378), AOM_ICDF(24911), AOM_ICDF(26095),
-          AOM_ICDF(27781), AOM_ICDF(31121), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9344), AOM_ICDF(11650), AOM_ICDF(19788), AOM_ICDF(21928),
-          AOM_ICDF(22916), AOM_ICDF(23571), AOM_ICDF(24362), AOM_ICDF(26633),
-          AOM_ICDF(27946), AOM_ICDF(31212), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(12928), AOM_ICDF(14428), AOM_ICDF(17080), AOM_ICDF(20882),
-          AOM_ICDF(22104), AOM_ICDF(23149), AOM_ICDF(23715), AOM_ICDF(27167),
-          AOM_ICDF(28932), AOM_ICDF(31218), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9088), AOM_ICDF(11962), AOM_ICDF(13849), AOM_ICDF(16880),
-          AOM_ICDF(19818), AOM_ICDF(21895), AOM_ICDF(23000), AOM_ICDF(25923),
-          AOM_ICDF(27961), AOM_ICDF(31380), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10240), AOM_ICDF(13336), AOM_ICDF(15505), AOM_ICDF(18844),
-          AOM_ICDF(21646), AOM_ICDF(24723), AOM_ICDF(25832), AOM_ICDF(27802),
-          AOM_ICDF(31088), AOM_ICDF(31096), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8704), AOM_ICDF(10683), AOM_ICDF(14446), AOM_ICDF(17035),
-          AOM_ICDF(20211), AOM_ICDF(21577), AOM_ICDF(24370), AOM_ICDF(26477),
-          AOM_ICDF(28223), AOM_ICDF(31734), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(12928), AOM_ICDF(17358), AOM_ICDF(19982), AOM_ICDF(22123),
-          AOM_ICDF(23335), AOM_ICDF(23948), AOM_ICDF(24890), AOM_ICDF(28884),
-          AOM_ICDF(30197), AOM_ICDF(32148), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10496), AOM_ICDF(12429), AOM_ICDF(16401), AOM_ICDF(20493),
-          AOM_ICDF(21471), AOM_ICDF(22433), AOM_ICDF(23162), AOM_ICDF(24686),
-          AOM_ICDF(29027), AOM_ICDF(31115), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8448), AOM_ICDF(12157), AOM_ICDF(14796), AOM_ICDF(17676),
-          AOM_ICDF(19754), AOM_ICDF(19762), AOM_ICDF(20641), AOM_ICDF(23274),
-          AOM_ICDF(25569), AOM_ICDF(31058), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7296), AOM_ICDF(11083), AOM_ICDF(15313), AOM_ICDF(20550),
-          AOM_ICDF(21783), AOM_ICDF(22727), AOM_ICDF(23461), AOM_ICDF(25072),
-          AOM_ICDF(27195), AOM_ICDF(30380), AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(10880), AOM_ICDF(13214), AOM_ICDF(15829), AOM_ICDF(16866),
-          AOM_ICDF(20613), AOM_ICDF(22316), AOM_ICDF(24539), AOM_ICDF(27077),
-          AOM_ICDF(28116), AOM_ICDF(31485), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9984), AOM_ICDF(13868), AOM_ICDF(16397), AOM_ICDF(17486),
-          AOM_ICDF(20011), AOM_ICDF(22071), AOM_ICDF(23357), AOM_ICDF(24990),
-          AOM_ICDF(26336), AOM_ICDF(30276), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7168), AOM_ICDF(8637), AOM_ICDF(17963), AOM_ICDF(18813),
-          AOM_ICDF(21065), AOM_ICDF(22052), AOM_ICDF(23502), AOM_ICDF(25702),
-          AOM_ICDF(26745), AOM_ICDF(30668), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8960), AOM_ICDF(10682), AOM_ICDF(12496), AOM_ICDF(18240),
-          AOM_ICDF(20500), AOM_ICDF(21585), AOM_ICDF(23387), AOM_ICDF(25795),
-          AOM_ICDF(27119), AOM_ICDF(31001), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9856), AOM_ICDF(12056), AOM_ICDF(13722), AOM_ICDF(15196),
-          AOM_ICDF(19276), AOM_ICDF(21891), AOM_ICDF(23643), AOM_ICDF(25538),
-          AOM_ICDF(26854), AOM_ICDF(31515), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9984), AOM_ICDF(12963), AOM_ICDF(14960), AOM_ICDF(16734),
-          AOM_ICDF(21279), AOM_ICDF(25616), AOM_ICDF(27638), AOM_ICDF(28950),
-          AOM_ICDF(31161), AOM_ICDF(31166), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7168), AOM_ICDF(8604), AOM_ICDF(12044), AOM_ICDF(13632),
-          AOM_ICDF(18931), AOM_ICDF(20553), AOM_ICDF(23452), AOM_ICDF(25800),
-          AOM_ICDF(27754), AOM_ICDF(31668), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11520), AOM_ICDF(13372), AOM_ICDF(16642), AOM_ICDF(18137),
-          AOM_ICDF(20232), AOM_ICDF(21510), AOM_ICDF(23052), AOM_ICDF(26792),
-          AOM_ICDF(27974), AOM_ICDF(31274), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10240), AOM_ICDF(12483), AOM_ICDF(14364), AOM_ICDF(16168),
-          AOM_ICDF(18668), AOM_ICDF(20707), AOM_ICDF(22158), AOM_ICDF(24410),
-          AOM_ICDF(26370), AOM_ICDF(30744), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8064), AOM_ICDF(10798), AOM_ICDF(13829), AOM_ICDF(15128),
-          AOM_ICDF(19136), AOM_ICDF(19152), AOM_ICDF(21057), AOM_ICDF(22583),
-          AOM_ICDF(24513), AOM_ICDF(30645), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8448), AOM_ICDF(11025), AOM_ICDF(16073), AOM_ICDF(17603),
-          AOM_ICDF(20094), AOM_ICDF(21468), AOM_ICDF(22971), AOM_ICDF(24628),
-          AOM_ICDF(26015), AOM_ICDF(29728), AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(10368), AOM_ICDF(15372), AOM_ICDF(18442), AOM_ICDF(19576),
-          AOM_ICDF(22674), AOM_ICDF(27128), AOM_ICDF(28232), AOM_ICDF(29624),
-          AOM_ICDF(31363), AOM_ICDF(31368), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9472), AOM_ICDF(16687), AOM_ICDF(18957), AOM_ICDF(20272),
-          AOM_ICDF(22852), AOM_ICDF(27082), AOM_ICDF(27839), AOM_ICDF(28995),
-          AOM_ICDF(30943), AOM_ICDF(30948), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8064), AOM_ICDF(12334), AOM_ICDF(19197), AOM_ICDF(20956),
-          AOM_ICDF(24804), AOM_ICDF(26553), AOM_ICDF(27556), AOM_ICDF(29877),
-          AOM_ICDF(31311), AOM_ICDF(31320), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8960), AOM_ICDF(14083), AOM_ICDF(16058), AOM_ICDF(19129),
-          AOM_ICDF(21136), AOM_ICDF(23635), AOM_ICDF(24870), AOM_ICDF(27577),
-          AOM_ICDF(31176), AOM_ICDF(31187), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9984), AOM_ICDF(14208), AOM_ICDF(15589), AOM_ICDF(17640),
-          AOM_ICDF(22080), AOM_ICDF(26660), AOM_ICDF(27947), AOM_ICDF(29400),
-          AOM_ICDF(31605), AOM_ICDF(31611), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9216), AOM_ICDF(15167), AOM_ICDF(16263), AOM_ICDF(17767),
-          AOM_ICDF(21531), AOM_ICDF(26689), AOM_ICDF(27607), AOM_ICDF(28880),
-          AOM_ICDF(31291), AOM_ICDF(31296), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8448), AOM_ICDF(12756), AOM_ICDF(15781), AOM_ICDF(17279),
-          AOM_ICDF(21198), AOM_ICDF(24057), AOM_ICDF(26171), AOM_ICDF(29200),
-          AOM_ICDF(31901), AOM_ICDF(31913), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9984), AOM_ICDF(15074), AOM_ICDF(18244), AOM_ICDF(19878),
-          AOM_ICDF(22246), AOM_ICDF(24436), AOM_ICDF(25560), AOM_ICDF(28991),
-          AOM_ICDF(31687), AOM_ICDF(31700), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10496), AOM_ICDF(15128), AOM_ICDF(17012), AOM_ICDF(18989),
-          AOM_ICDF(21294), AOM_ICDF(25011), AOM_ICDF(25999), AOM_ICDF(27784),
-          AOM_ICDF(30934), AOM_ICDF(30941), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2944), AOM_ICDF(5875), AOM_ICDF(8846), AOM_ICDF(11817),
-          AOM_ICDF(14806), AOM_ICDF(17795), AOM_ICDF(20769), AOM_ICDF(23761),
-          AOM_ICDF(26747), AOM_ICDF(29739), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7424), AOM_ICDF(12915), AOM_ICDF(17544), AOM_ICDF(19392),
-          AOM_ICDF(23074), AOM_ICDF(25635), AOM_ICDF(26431), AOM_ICDF(28241),
-          AOM_ICDF(30088), AOM_ICDF(30095), AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(11648), AOM_ICDF(13565), AOM_ICDF(18996), AOM_ICDF(19908),
-          AOM_ICDF(21897), AOM_ICDF(22852), AOM_ICDF(26656), AOM_ICDF(28172),
-          AOM_ICDF(28995), AOM_ICDF(31283), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10240), AOM_ICDF(14255), AOM_ICDF(18109), AOM_ICDF(19716),
-          AOM_ICDF(21521), AOM_ICDF(22859), AOM_ICDF(24613), AOM_ICDF(26161),
-          AOM_ICDF(27279), AOM_ICDF(30392), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6784), AOM_ICDF(7848), AOM_ICDF(18820), AOM_ICDF(19447),
-          AOM_ICDF(22335), AOM_ICDF(22733), AOM_ICDF(25112), AOM_ICDF(28427),
-          AOM_ICDF(29013), AOM_ICDF(31550), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11904), AOM_ICDF(13581), AOM_ICDF(17695), AOM_ICDF(19311),
-          AOM_ICDF(21698), AOM_ICDF(22562), AOM_ICDF(24391), AOM_ICDF(26559),
-          AOM_ICDF(27779), AOM_ICDF(30567), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10624), AOM_ICDF(12334), AOM_ICDF(14643), AOM_ICDF(16255),
-          AOM_ICDF(20783), AOM_ICDF(22767), AOM_ICDF(24929), AOM_ICDF(26876),
-          AOM_ICDF(27998), AOM_ICDF(31470), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(12032), AOM_ICDF(14415), AOM_ICDF(16715), AOM_ICDF(18712),
-          AOM_ICDF(21557), AOM_ICDF(25332), AOM_ICDF(27840), AOM_ICDF(29663),
-          AOM_ICDF(31708), AOM_ICDF(31715), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9728), AOM_ICDF(10683), AOM_ICDF(13955), AOM_ICDF(14786),
-          AOM_ICDF(18481), AOM_ICDF(19492), AOM_ICDF(26749), AOM_ICDF(28483),
-          AOM_ICDF(29116), AOM_ICDF(31958), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8960), AOM_ICDF(10032), AOM_ICDF(15755), AOM_ICDF(16949),
-          AOM_ICDF(19144), AOM_ICDF(19744), AOM_ICDF(22082), AOM_ICDF(27608),
-          AOM_ICDF(28411), AOM_ICDF(31838), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(14592), AOM_ICDF(15937), AOM_ICDF(18518), AOM_ICDF(19566),
-          AOM_ICDF(21817), AOM_ICDF(23102), AOM_ICDF(24436), AOM_ICDF(26651),
-          AOM_ICDF(28100), AOM_ICDF(30993), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8960), AOM_ICDF(10791), AOM_ICDF(14718), AOM_ICDF(16094),
-          AOM_ICDF(18560), AOM_ICDF(18570), AOM_ICDF(22120), AOM_ICDF(24188),
-          AOM_ICDF(25677), AOM_ICDF(31280), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11136), AOM_ICDF(13058), AOM_ICDF(19006), AOM_ICDF(20135),
-          AOM_ICDF(21463), AOM_ICDF(22159), AOM_ICDF(24042), AOM_ICDF(26348),
-          AOM_ICDF(27367), AOM_ICDF(30064), AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(12544), AOM_ICDF(15384), AOM_ICDF(20327), AOM_ICDF(21555),
-          AOM_ICDF(23456), AOM_ICDF(24144), AOM_ICDF(25421), AOM_ICDF(27884),
-          AOM_ICDF(28875), AOM_ICDF(31188), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10368), AOM_ICDF(15009), AOM_ICDF(17631), AOM_ICDF(18970),
-          AOM_ICDF(20691), AOM_ICDF(21850), AOM_ICDF(22749), AOM_ICDF(25280),
-          AOM_ICDF(26570), AOM_ICDF(29530), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9088), AOM_ICDF(10956), AOM_ICDF(21554), AOM_ICDF(22698),
-          AOM_ICDF(23666), AOM_ICDF(24052), AOM_ICDF(25122), AOM_ICDF(27792),
-          AOM_ICDF(28612), AOM_ICDF(30825), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11520), AOM_ICDF(12888), AOM_ICDF(16374), AOM_ICDF(19132),
-          AOM_ICDF(21186), AOM_ICDF(21843), AOM_ICDF(22902), AOM_ICDF(26440),
-          AOM_ICDF(27928), AOM_ICDF(29946), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9984), AOM_ICDF(12199), AOM_ICDF(14625), AOM_ICDF(17321),
-          AOM_ICDF(20195), AOM_ICDF(21574), AOM_ICDF(23010), AOM_ICDF(25688),
-          AOM_ICDF(27600), AOM_ICDF(30988), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10112), AOM_ICDF(13705), AOM_ICDF(16847), AOM_ICDF(19242),
-          AOM_ICDF(22011), AOM_ICDF(24064), AOM_ICDF(26481), AOM_ICDF(29125),
-          AOM_ICDF(30545), AOM_ICDF(30555), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9344), AOM_ICDF(10994), AOM_ICDF(15018), AOM_ICDF(16915),
-          AOM_ICDF(20471), AOM_ICDF(21334), AOM_ICDF(24577), AOM_ICDF(27472),
-          AOM_ICDF(28592), AOM_ICDF(31578), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(12928), AOM_ICDF(14540), AOM_ICDF(18022), AOM_ICDF(19481),
-          AOM_ICDF(21028), AOM_ICDF(21825), AOM_ICDF(22728), AOM_ICDF(28191),
-          AOM_ICDF(29154), AOM_ICDF(31683), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10368), AOM_ICDF(12160), AOM_ICDF(14900), AOM_ICDF(17161),
-          AOM_ICDF(19379), AOM_ICDF(20521), AOM_ICDF(21747), AOM_ICDF(24534),
-          AOM_ICDF(26677), AOM_ICDF(30318), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8960), AOM_ICDF(11488), AOM_ICDF(16197), AOM_ICDF(18030),
-          AOM_ICDF(20010), AOM_ICDF(20018), AOM_ICDF(21347), AOM_ICDF(23948),
-          AOM_ICDF(25016), AOM_ICDF(30536), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7808), AOM_ICDF(10310), AOM_ICDF(15420), AOM_ICDF(18961),
-          AOM_ICDF(20114), AOM_ICDF(20772), AOM_ICDF(21721), AOM_ICDF(24599),
-          AOM_ICDF(26237), AOM_ICDF(29160), AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(9856), AOM_ICDF(13764), AOM_ICDF(16995), AOM_ICDF(19540),
-          AOM_ICDF(20802), AOM_ICDF(22302), AOM_ICDF(23113), AOM_ICDF(24519),
-          AOM_ICDF(27717), AOM_ICDF(31604), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8704), AOM_ICDF(15725), AOM_ICDF(17309), AOM_ICDF(20296),
-          AOM_ICDF(21257), AOM_ICDF(22573), AOM_ICDF(23165), AOM_ICDF(23893),
-          AOM_ICDF(27755), AOM_ICDF(31170), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7936), AOM_ICDF(11343), AOM_ICDF(19355), AOM_ICDF(21223),
-          AOM_ICDF(22121), AOM_ICDF(22978), AOM_ICDF(23703), AOM_ICDF(26079),
-          AOM_ICDF(27978), AOM_ICDF(31507), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11264), AOM_ICDF(14823), AOM_ICDF(17314), AOM_ICDF(20715),
-          AOM_ICDF(21999), AOM_ICDF(22982), AOM_ICDF(23728), AOM_ICDF(25229),
-          AOM_ICDF(28593), AOM_ICDF(31508), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8704), AOM_ICDF(11788), AOM_ICDF(13666), AOM_ICDF(16523),
-          AOM_ICDF(18630), AOM_ICDF(20579), AOM_ICDF(21574), AOM_ICDF(23335),
-          AOM_ICDF(26298), AOM_ICDF(31264), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9088), AOM_ICDF(14031), AOM_ICDF(15766), AOM_ICDF(18533),
-          AOM_ICDF(21457), AOM_ICDF(24078), AOM_ICDF(24973), AOM_ICDF(26102),
-          AOM_ICDF(31284), AOM_ICDF(31288), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7040), AOM_ICDF(9648), AOM_ICDF(12140), AOM_ICDF(14601),
-          AOM_ICDF(16742), AOM_ICDF(18070), AOM_ICDF(21154), AOM_ICDF(23582),
-          AOM_ICDF(27647), AOM_ICDF(31763), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(10240), AOM_ICDF(13466), AOM_ICDF(16837), AOM_ICDF(19351),
-          AOM_ICDF(20636), AOM_ICDF(21620), AOM_ICDF(22474), AOM_ICDF(25815),
-          AOM_ICDF(28364), AOM_ICDF(31976), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(11008), AOM_ICDF(13682), AOM_ICDF(15127), AOM_ICDF(18779),
-          AOM_ICDF(19841), AOM_ICDF(20792), AOM_ICDF(21954), AOM_ICDF(23365),
-          AOM_ICDF(29100), AOM_ICDF(31748), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7168), AOM_ICDF(12260), AOM_ICDF(15037), AOM_ICDF(17152),
-          AOM_ICDF(18730), AOM_ICDF(18736), AOM_ICDF(19436), AOM_ICDF(20484),
-          AOM_ICDF(24465), AOM_ICDF(30868), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6784), AOM_ICDF(12469), AOM_ICDF(15422), AOM_ICDF(19291),
-          AOM_ICDF(20301), AOM_ICDF(21344), AOM_ICDF(21894), AOM_ICDF(23415),
-          AOM_ICDF(27696), AOM_ICDF(31042), AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(10112), AOM_ICDF(13929), AOM_ICDF(17880), AOM_ICDF(18857),
-          AOM_ICDF(20955), AOM_ICDF(20963), AOM_ICDF(21974), AOM_ICDF(23273),
-          AOM_ICDF(24734), AOM_ICDF(31352), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8064), AOM_ICDF(15826), AOM_ICDF(17929), AOM_ICDF(19017),
-          AOM_ICDF(21016), AOM_ICDF(21024), AOM_ICDF(21687), AOM_ICDF(22701),
-          AOM_ICDF(24242), AOM_ICDF(30645), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6528), AOM_ICDF(9196), AOM_ICDF(20118), AOM_ICDF(21101),
-          AOM_ICDF(22227), AOM_ICDF(22231), AOM_ICDF(22997), AOM_ICDF(25070),
-          AOM_ICDF(25919), AOM_ICDF(30923), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9600), AOM_ICDF(13218), AOM_ICDF(15898), AOM_ICDF(17780),
-          AOM_ICDF(19991), AOM_ICDF(20000), AOM_ICDF(21196), AOM_ICDF(23912),
-          AOM_ICDF(26044), AOM_ICDF(31139), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8960), AOM_ICDF(12037), AOM_ICDF(14178), AOM_ICDF(15681),
-          AOM_ICDF(20126), AOM_ICDF(20143), AOM_ICDF(21435), AOM_ICDF(23083),
-          AOM_ICDF(24675), AOM_ICDF(31466), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(2944), AOM_ICDF(5875), AOM_ICDF(8846), AOM_ICDF(11817),
-          AOM_ICDF(14806), AOM_ICDF(17795), AOM_ICDF(20769), AOM_ICDF(23761),
-          AOM_ICDF(26747), AOM_ICDF(29739), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9472), AOM_ICDF(12334), AOM_ICDF(15469), AOM_ICDF(16848),
-          AOM_ICDF(19972), AOM_ICDF(19984), AOM_ICDF(22292), AOM_ICDF(24384),
-          AOM_ICDF(25891), AOM_ICDF(31676), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8448), AOM_ICDF(11176), AOM_ICDF(15497), AOM_ICDF(16676),
-          AOM_ICDF(18528), AOM_ICDF(18535), AOM_ICDF(19595), AOM_ICDF(24334),
-          AOM_ICDF(25725), AOM_ICDF(31723), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8704), AOM_ICDF(12141), AOM_ICDF(14313), AOM_ICDF(15828),
-          AOM_ICDF(18358), AOM_ICDF(18368), AOM_ICDF(19469), AOM_ICDF(21089),
-          AOM_ICDF(24027), AOM_ICDF(30700), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(7680), AOM_ICDF(11689), AOM_ICDF(14556), AOM_ICDF(15548),
-          AOM_ICDF(17878), AOM_ICDF(17887), AOM_ICDF(18873), AOM_ICDF(20512),
-          AOM_ICDF(22152), AOM_ICDF(31004), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6656), AOM_ICDF(11476), AOM_ICDF(16600), AOM_ICDF(18052),
-          AOM_ICDF(19683), AOM_ICDF(19689), AOM_ICDF(20509), AOM_ICDF(22077),
-          AOM_ICDF(23496), AOM_ICDF(29504), AOM_ICDF(32768), 0,
-      },
-  },
-  {
-      {
-          AOM_ICDF(9728), AOM_ICDF(14651), AOM_ICDF(19394), AOM_ICDF(20550),
-          AOM_ICDF(21680), AOM_ICDF(22479), AOM_ICDF(23516), AOM_ICDF(24952),
-          AOM_ICDF(26183), AOM_ICDF(28538), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8832), AOM_ICDF(18693), AOM_ICDF(20913), AOM_ICDF(21933),
-          AOM_ICDF(22956), AOM_ICDF(23831), AOM_ICDF(24341), AOM_ICDF(25317),
-          AOM_ICDF(26434), AOM_ICDF(29028), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(5888), AOM_ICDF(8413), AOM_ICDF(20542), AOM_ICDF(21609),
-          AOM_ICDF(22437), AOM_ICDF(22864), AOM_ICDF(23663), AOM_ICDF(26329),
-          AOM_ICDF(26900), AOM_ICDF(29828), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9984), AOM_ICDF(13134), AOM_ICDF(16328), AOM_ICDF(18267),
-          AOM_ICDF(19814), AOM_ICDF(21461), AOM_ICDF(22393), AOM_ICDF(24944),
-          AOM_ICDF(26320), AOM_ICDF(29653), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8448), AOM_ICDF(12425), AOM_ICDF(15474), AOM_ICDF(17031),
-          AOM_ICDF(19216), AOM_ICDF(20889), AOM_ICDF(23077), AOM_ICDF(25108),
-          AOM_ICDF(26548), AOM_ICDF(30108), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9856), AOM_ICDF(15675), AOM_ICDF(19169), AOM_ICDF(20837),
-          AOM_ICDF(22638), AOM_ICDF(24556), AOM_ICDF(25438), AOM_ICDF(27114),
-          AOM_ICDF(29449), AOM_ICDF(29456), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6784), AOM_ICDF(10294), AOM_ICDF(14542), AOM_ICDF(15724),
-          AOM_ICDF(19109), AOM_ICDF(19972), AOM_ICDF(24084), AOM_ICDF(26329),
-          AOM_ICDF(27637), AOM_ICDF(30433), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(8320), AOM_ICDF(10873), AOM_ICDF(17095), AOM_ICDF(18466),
-          AOM_ICDF(19674), AOM_ICDF(20129), AOM_ICDF(21230), AOM_ICDF(27562),
-          AOM_ICDF(28568), AOM_ICDF(30858), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(9088), AOM_ICDF(13196), AOM_ICDF(15898), AOM_ICDF(17566),
-          AOM_ICDF(19210), AOM_ICDF(20354), AOM_ICDF(21186), AOM_ICDF(23647),
-          AOM_ICDF(26235), AOM_ICDF(30548), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6912), AOM_ICDF(11512), AOM_ICDF(16390), AOM_ICDF(17479),
-          AOM_ICDF(19065), AOM_ICDF(19071), AOM_ICDF(19740), AOM_ICDF(21715),
-          AOM_ICDF(23208), AOM_ICDF(29132), AOM_ICDF(32768), 0,
-      },
-      {
-          AOM_ICDF(6656), AOM_ICDF(11485), AOM_ICDF(16060), AOM_ICDF(17734),
-          AOM_ICDF(19099), AOM_ICDF(19814), AOM_ICDF(21018), AOM_ICDF(23053),
-          AOM_ICDF(24333), AOM_ICDF(27260), AOM_ICDF(32768), 0,
-      },
-  },
-#endif  // CONFIG_SMOOTH_HV
-};
-#endif  // CONFIG_KF_CTX
-
-#if CONFIG_LPF_SB
-static const aom_cdf_prob default_lpf_reuse_cdf[LPF_REUSE_CONTEXT][CDF_SIZE(
-    2)] = { { AOM_ICDF(8192), AOM_ICDF(32768), 0 },
-            { AOM_ICDF(4096), AOM_ICDF(32768), 0 } };
-
-static const aom_cdf_prob
-    default_lpf_delta_cdf[LPF_DELTA_CONTEXT][CDF_SIZE(DELTA_RANGE)] = {
-      { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560),
-        AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560),
-        AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560),
-        AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560),
-        AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560),
-        AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560),
-        AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560),
-        AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 },
-      { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560),
-        AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(32768), 0 }
-    };
-
-static const aom_cdf_prob
-    default_lpf_sign_cdf[LPF_REUSE_CONTEXT][LPF_SIGN_CONTEXT][CDF_SIZE(2)] = {
-      { { AOM_ICDF(6554), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(26214), AOM_ICDF(32768), 0 } },
-      { { AOM_ICDF(16384), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(16384), AOM_ICDF(32768), 0 } }
-    };
-#endif  // CONFIG_LPF_SB
-
 static void init_mode_probs(FRAME_CONTEXT *fc) {
-  av1_copy(fc->partition_prob, default_partition_probs);
-  av1_copy(fc->intra_inter_prob, default_intra_inter_p);
-  av1_copy(fc->comp_inter_prob, default_comp_inter_p);
   av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf);
   av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf);
   av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf);
   av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf);
   av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf);
-#if CONFIG_MRC_TX
-  av1_copy(fc->mrc_mask_inter_cdf, default_mrc_mask_inter_cdf);
-  av1_copy(fc->mrc_mask_intra_cdf, default_mrc_mask_intra_cdf);
-#endif  // CONFIG_MRC_TX
-#if CONFIG_NEW_MULTISYMBOL
+  av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf);
   av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf);
-#endif  // CONFIG_NEW_MULTISYMBOL
-#if CONFIG_EXT_COMP_REFS
-  av1_copy(fc->comp_ref_type_prob, default_comp_ref_type_p);
-  av1_copy(fc->uni_comp_ref_prob, default_uni_comp_ref_p);
-#if CONFIG_NEW_MULTISYMBOL
   av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf);
   av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf);
-#endif  // CONFIG_NEW_MULTISYMBOL
-#endif  // CONFIG_EXT_COMP_REFS
-  av1_copy(fc->comp_ref_prob, default_comp_ref_p);
-#if CONFIG_NEW_MULTISYMBOL
   av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf);
   av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf);
   av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf);
-#endif
-#if CONFIG_LV_MAP
-  av1_copy(fc->txb_skip, default_txb_skip);
-  av1_copy(fc->nz_map, default_nz_map);
-  av1_copy(fc->eob_flag, default_eob_flag);
-  av1_copy(fc->dc_sign, default_dc_sign);
-  av1_copy(fc->coeff_base, default_coeff_base);
-  av1_copy(fc->coeff_lps, default_coeff_lps);
-#if BR_NODE
-  av1_copy(fc->coeff_br, default_coeff_br);
-#endif
-#if CONFIG_CTX1D
-  av1_copy(fc->eob_mode, default_eob_mode);
-  av1_copy(fc->empty_line, default_empty_line);
-  av1_copy(fc->hv_eob, default_hv_eob);
-#endif  // CONFIG_CTX1D
-
-#if LV_MAP_PROB
-  av1_init_txb_probs(fc);
-#endif  // LV_MAP_PROB
-#endif
-#if CONFIG_EXT_REFS
-  av1_copy(fc->comp_bwdref_prob, default_comp_bwdref_p);
-#if CONFIG_NEW_MULTISYMBOL
   av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf);
-#endif
-#endif  // CONFIG_EXT_REFS
-  av1_copy(fc->single_ref_prob, default_single_ref_p);
-#if CONFIG_NEW_MULTISYMBOL
   av1_copy(fc->single_ref_cdf, default_single_ref_cdf);
-#endif
-#if CONFIG_COMPOUND_SINGLEREF
-  av1_copy(fc->comp_inter_mode_prob, default_comp_inter_mode_p);
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-  fc->quarter_tx_size_prob = default_quarter_tx_size_prob;
-#if CONFIG_NEW_MULTISYMBOL
-  av1_copy(fc->quarter_tx_size_cdf, default_quarter_tx_size_cdf);
-#endif  // CONFIG_NEW_MULTISYMBOL
-#endif
-#if CONFIG_VAR_TX
-  av1_copy(fc->txfm_partition_prob, default_txfm_partition_probs);
-#if CONFIG_NEW_MULTISYMBOL
   av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf);
-#endif
-#endif
-  av1_copy(fc->skip_probs, default_skip_probs);
-  av1_copy(fc->newmv_prob, default_newmv_prob);
-  av1_copy(fc->zeromv_prob, default_zeromv_prob);
-  av1_copy(fc->refmv_prob, default_refmv_prob);
-  av1_copy(fc->drl_prob, default_drl_prob);
-#if CONFIG_NEW_MULTISYMBOL
+  av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs);
+  av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs);
   av1_copy(fc->newmv_cdf, default_newmv_cdf);
   av1_copy(fc->zeromv_cdf, default_zeromv_cdf);
   av1_copy(fc->refmv_cdf, default_refmv_cdf);
   av1_copy(fc->drl_cdf, default_drl_cdf);
-#endif
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  av1_copy(fc->motion_mode_prob, default_motion_mode_prob);
   av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf);
-#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR
-  av1_copy(fc->ncobmc_mode_prob, default_ncobmc_mode_prob);
-  av1_copy(fc->ncobmc_mode_cdf, default_ncobmc_mode_cdf);
-#endif
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-  av1_copy(fc->obmc_prob, default_obmc_prob);
-#if CONFIG_NEW_MULTISYMBOL || CONFIG_NCOBMC_ADAPT_WEIGHT
   av1_copy(fc->obmc_cdf, default_obmc_cdf);
-#endif
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  av1_copy(fc->ncobmc_prob, default_ncobmc_prob);
-  av1_copy(fc->ncobmc_cdf, default_ncobmc_cdf);
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  av1_copy(fc->inter_compound_mode_probs, default_inter_compound_mode_probs);
   av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf);
-#if CONFIG_COMPOUND_SINGLEREF
-  av1_copy(fc->inter_singleref_comp_mode_probs,
-           default_inter_singleref_comp_mode_probs);
-  av1_copy(fc->inter_singleref_comp_mode_cdf,
-           default_inter_singleref_comp_mode_cdf);
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  av1_copy(fc->compound_type_prob, default_compound_type_probs);
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
   av1_copy(fc->compound_type_cdf, default_compound_type_cdf);
-#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-#if CONFIG_INTERINTRA
-  av1_copy(fc->interintra_prob, default_interintra_prob);
-  av1_copy(fc->wedge_interintra_prob, default_wedge_interintra_prob);
-#if CONFIG_NEW_MULTISYMBOL
+  av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf);
   av1_copy(fc->interintra_cdf, default_interintra_cdf);
   av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf);
-#endif  // CONFIG_NEW_MULTISYMBOL
-  av1_copy(fc->interintra_mode_prob, default_interintra_mode_prob);
   av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf);
-#endif  // CONFIG_INTERINTRA
-#if CONFIG_SUPERTX
-  av1_copy(fc->supertx_prob, default_supertx_prob);
-#endif  // CONFIG_SUPERTX
-  av1_copy(fc->seg.tree_probs, default_segment_tree_probs);
-  av1_copy(fc->seg.pred_probs, default_segment_pred_probs);
-#if CONFIG_NEW_MULTISYMBOL
   av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf);
-#endif
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-  av1_copy(fc->intra_filter_probs, default_intra_filter_probs);
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  av1_copy(fc->filter_intra_probs, default_filter_intra_probs);
-#endif  // CONFIG_FILTER_INTRA
-#if CONFIG_LGT_FROM_PRED
-  av1_copy(fc->intra_lgt_prob, default_intra_lgt_prob);
-  av1_copy(fc->inter_lgt_prob, default_inter_lgt_prob);
-#endif  // CONFIG_LGT_FROM_PRED
-#if CONFIG_LOOP_RESTORATION
-  av1_copy(fc->switchable_restore_prob, default_switchable_restore_prob);
-#endif  // CONFIG_LOOP_RESTORATION
+  av1_copy(fc->seg.tree_cdf, default_seg_tree_cdf);
+  av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs);
+  av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf);
+  av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf);
+  av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf);
+  av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf);
   av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf);
   av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf);
   av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf);
   av1_copy(fc->partition_cdf, default_partition_cdf);
   av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf);
   av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf);
-#if CONFIG_NEW_MULTISYMBOL
+  av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs);
   av1_copy(fc->skip_cdfs, default_skip_cdfs);
   av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf);
-#endif
-#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-  av1_copy(fc->intra_filter_cdf, default_intra_filter_cdf);
-#endif  // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-  av1_copy(fc->seg.tree_cdf, default_seg_tree_cdf);
+  for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++)
+    av1_copy(fc->seg.spatial_pred_seg_cdf[i],
+             default_spatial_pred_seg_tree_cdf[i]);
   av1_copy(fc->tx_size_cdf, default_tx_size_cdf);
-  av1_copy(fc->delta_q_prob, default_delta_q_probs);
   av1_copy(fc->delta_q_cdf, default_delta_q_cdf);
-#if CONFIG_EXT_DELTA_Q
-  av1_copy(fc->delta_lf_prob, default_delta_lf_probs);
   av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf);
-#if CONFIG_LOOPFILTER_LEVEL
   av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf);
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif
-#if CONFIG_CFL
   av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf);
   av1_copy(fc->cfl_alpha_cdf, default_cfl_alpha_cdf);
-#endif
-#if CONFIG_INTRABC
   av1_copy(fc->intrabc_cdf, default_intrabc_cdf);
-#endif
-#if CONFIG_LPF_SB
-  av1_copy(fc->lpf_reuse_cdf, default_lpf_reuse_cdf);
-  av1_copy(fc->lpf_delta_cdf, default_lpf_delta_cdf);
-  av1_copy(fc->lpf_sign_cdf, default_lpf_sign_cdf);
-#endif  // CONFIG_LPF_SB
 }
 
-void av1_adapt_inter_frame_probs(AV1_COMMON *cm) {
-  int i, j;
-  FRAME_CONTEXT *fc = cm->fc;
-  const FRAME_CONTEXT *pre_fc = cm->pre_fc;
-  const FRAME_COUNTS *counts = &cm->counts;
-
-  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-    fc->intra_inter_prob[i] = av1_mode_mv_merge_probs(
-        pre_fc->intra_inter_prob[i], counts->intra_inter[i]);
-
-  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-    fc->comp_inter_prob[i] = av1_mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
-                                                     counts->comp_inter[i]);
-
-#if CONFIG_EXT_COMP_REFS
-  for (i = 0; i < COMP_REF_TYPE_CONTEXTS; i++)
-    fc->comp_ref_type_prob[i] = av1_mode_mv_merge_probs(
-        pre_fc->comp_ref_type_prob[i], counts->comp_ref_type[i]);
-
-  for (i = 0; i < UNI_COMP_REF_CONTEXTS; i++)
-    for (j = 0; j < (UNIDIR_COMP_REFS - 1); j++)
-      fc->uni_comp_ref_prob[i][j] = av1_mode_mv_merge_probs(
-          pre_fc->uni_comp_ref_prob[i][j], counts->uni_comp_ref[i][j]);
-#endif  // CONFIG_EXT_COMP_REFS
-
-#if CONFIG_EXT_REFS
-  for (i = 0; i < REF_CONTEXTS; i++)
-    for (j = 0; j < (FWD_REFS - 1); j++)
-      fc->comp_ref_prob[i][j] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i][j],
-                                                    counts->comp_ref[i][j]);
-  for (i = 0; i < REF_CONTEXTS; i++)
-    for (j = 0; j < (BWD_REFS - 1); j++)
-      fc->comp_bwdref_prob[i][j] = mode_mv_merge_probs(
-          pre_fc->comp_bwdref_prob[i][j], counts->comp_bwdref[i][j]);
-#else
-  for (i = 0; i < REF_CONTEXTS; i++)
-    for (j = 0; j < (COMP_REFS - 1); j++)
-      fc->comp_ref_prob[i][j] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i][j],
-                                                    counts->comp_ref[i][j]);
-#endif  // CONFIG_EXT_REFS
-
-  for (i = 0; i < REF_CONTEXTS; i++)
-    for (j = 0; j < (SINGLE_REFS - 1); j++)
-      fc->single_ref_prob[i][j] = av1_mode_mv_merge_probs(
-          pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
-
-#if CONFIG_COMPOUND_SINGLEREF
-  for (i = 0; i < COMP_INTER_MODE_CONTEXTS; i++)
-    fc->comp_inter_mode_prob[i] = av1_mode_mv_merge_probs(
-        pre_fc->comp_inter_mode_prob[i], counts->comp_inter_mode[i]);
-
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-  for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
-    fc->newmv_prob[i] =
-        av1_mode_mv_merge_probs(pre_fc->newmv_prob[i], counts->newmv_mode[i]);
-  for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
-    fc->zeromv_prob[i] =
-        av1_mode_mv_merge_probs(pre_fc->zeromv_prob[i], counts->zeromv_mode[i]);
-  for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
-    fc->refmv_prob[i] =
-        av1_mode_mv_merge_probs(pre_fc->refmv_prob[i], counts->refmv_mode[i]);
-
-  for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    fc->drl_prob[i] =
-        av1_mode_mv_merge_probs(pre_fc->drl_prob[i], counts->drl_mode[i]);
-
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; ++i)
-    aom_tree_merge_probs(av1_motion_mode_tree, pre_fc->motion_mode_prob[i],
-                         counts->motion_mode[i], fc->motion_mode_prob[i]);
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  for (i = 0; i < ADAPT_OVERLAP_BLOCKS; ++i)
-    aom_tree_merge_probs(av1_ncobmc_mode_tree, pre_fc->ncobmc_mode_prob[i],
-                         counts->ncobmc_mode[i], fc->ncobmc_mode_prob[i]);
-#if CONFIG_WARPED_MOTION
-  for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; ++i)
-    aom_tree_merge_probs(av1_ncobmc_tree, pre_fc->ncobmc_prob[i],
-                         counts->ncobmc[i], fc->ncobmc_prob[i]);
-#endif
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-  for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; ++i)
-    fc->obmc_prob[i] =
-        av1_mode_mv_merge_probs(pre_fc->obmc_prob[i], counts->obmc[i]);
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
-#if CONFIG_SUPERTX
-  for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
-    for (j = TX_8X8; j < TX_SIZES; ++j) {
-      fc->supertx_prob[i][j] = av1_mode_mv_merge_probs(
-          pre_fc->supertx_prob[i][j], counts->supertx[i][j]);
-    }
-  }
-#endif  // CONFIG_SUPERTX
-
-  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
-    aom_tree_merge_probs(
-        av1_inter_compound_mode_tree, pre_fc->inter_compound_mode_probs[i],
-        counts->inter_compound_mode[i], fc->inter_compound_mode_probs[i]);
-#if CONFIG_COMPOUND_SINGLEREF
-  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
-    aom_tree_merge_probs(av1_inter_singleref_comp_mode_tree,
-                         pre_fc->inter_singleref_comp_mode_probs[i],
-                         counts->inter_singleref_comp_mode[i],
-                         fc->inter_singleref_comp_mode_probs[i]);
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_INTERINTRA
-  if (cm->allow_interintra_compound) {
-    for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
-      if (is_interintra_allowed_bsize_group(i))
-        fc->interintra_prob[i] = av1_mode_mv_merge_probs(
-            pre_fc->interintra_prob[i], counts->interintra[i]);
-    }
-    for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
-      aom_tree_merge_probs(
-          av1_interintra_mode_tree, pre_fc->interintra_mode_prob[i],
-          counts->interintra_mode[i], fc->interintra_mode_prob[i]);
-    }
-#if CONFIG_WEDGE
-    for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
-      if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
-        fc->wedge_interintra_prob[i] = av1_mode_mv_merge_probs(
-            pre_fc->wedge_interintra_prob[i], counts->wedge_interintra[i]);
-    }
-#endif  // CONFIG_WEDGE
-  }
-#endif  // CONFIG_INTERINTRA
-
-#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-  if (cm->allow_masked_compound) {
-    for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
-      aom_tree_merge_probs(
-          av1_compound_type_tree, pre_fc->compound_type_prob[i],
-          counts->compound_interinter[i], fc->compound_type_prob[i]);
-    }
-  }
-#endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+void av1_set_default_ref_deltas(int8_t *ref_deltas) {
+  assert(ref_deltas != NULL);
+
+  ref_deltas[INTRA_FRAME] = 1;
+  ref_deltas[LAST_FRAME] = 0;
+  ref_deltas[LAST2_FRAME] = ref_deltas[LAST_FRAME];
+  ref_deltas[LAST3_FRAME] = ref_deltas[LAST_FRAME];
+  ref_deltas[BWDREF_FRAME] = ref_deltas[LAST_FRAME];
+  ref_deltas[GOLDEN_FRAME] = -1;
+  ref_deltas[ALTREF2_FRAME] = -1;
+  ref_deltas[ALTREF_FRAME] = -1;
 }
 
-void av1_adapt_intra_frame_probs(AV1_COMMON *cm) {
-  int i;
-  FRAME_CONTEXT *fc = cm->fc;
-  const FRAME_CONTEXT *pre_fc = cm->pre_fc;
-  const FRAME_COUNTS *counts = &cm->counts;
-
-  if (cm->tx_mode == TX_MODE_SELECT) {
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    fc->quarter_tx_size_prob = av1_mode_mv_merge_probs(
-        pre_fc->quarter_tx_size_prob, counts->quarter_tx_size);
-#endif
-  }
-
-#if CONFIG_VAR_TX
-  if (cm->tx_mode == TX_MODE_SELECT) {
-    for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i)
-      fc->txfm_partition_prob[i] = av1_mode_mv_merge_probs(
-          pre_fc->txfm_partition_prob[i], counts->txfm_partition[i]);
-  }
-#endif
-
-  for (i = 0; i < SKIP_CONTEXTS; ++i)
-    fc->skip_probs[i] =
-        av1_mode_mv_merge_probs(pre_fc->skip_probs[i], counts->skip[i]);
-
-#if CONFIG_LGT_FROM_PRED
-  int j;
-  if (LGT_FROM_PRED_INTRA) {
-    for (i = TX_4X4; i < LGT_SIZES; ++i) {
-      for (j = 0; j < INTRA_MODES; ++j)
-        fc->intra_lgt_prob[i][j] = av1_mode_mv_merge_probs(
-            pre_fc->intra_lgt_prob[i][j], counts->intra_lgt[i][j]);
-    }
-  }
-  if (LGT_FROM_PRED_INTER) {
-    for (i = TX_4X4; i < LGT_SIZES; ++i) {
-      fc->inter_lgt_prob[i] = av1_mode_mv_merge_probs(pre_fc->inter_lgt_prob[i],
-                                                      counts->inter_lgt[i]);
-    }
-  }
-#endif  // CONFIG_LGT_FROM_PRED
-
-  if (cm->seg.temporal_update) {
-    for (i = 0; i < PREDICTION_PROBS; i++)
-      fc->seg.pred_probs[i] = av1_mode_mv_merge_probs(pre_fc->seg.pred_probs[i],
-                                                      counts->seg.pred[i]);
-
-    aom_tree_merge_probs(av1_segment_tree, pre_fc->seg.tree_probs,
-                         counts->seg.tree_mispred, fc->seg.tree_probs);
-  } else {
-    aom_tree_merge_probs(av1_segment_tree, pre_fc->seg.tree_probs,
-                         counts->seg.tree_total, fc->seg.tree_probs);
-  }
+void av1_set_default_mode_deltas(int8_t *mode_deltas) {
+  assert(mode_deltas != NULL);
 
-#if CONFIG_EXT_PARTITION_TYPES
-  for (i = 0; i < PARTITION_PLOFFSET; ++i)
-    aom_tree_merge_probs(av1_partition_tree, pre_fc->partition_prob[i],
-                         counts->partition[i], fc->partition_prob[i]);
-  for (; i < PARTITION_CONTEXTS_PRIMARY; ++i)
-    aom_tree_merge_probs(av1_ext_partition_tree, pre_fc->partition_prob[i],
-                         counts->partition[i], fc->partition_prob[i]);
-#else
-  for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i) {
-    aom_tree_merge_probs(av1_partition_tree, pre_fc->partition_prob[i],
-                         counts->partition[i], fc->partition_prob[i]);
-  }
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_UNPOISON_PARTITION_CTX
-  for (i = PARTITION_CONTEXTS_PRIMARY;
-       i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) {
-    unsigned int ct[2] = { counts->partition[i][PARTITION_VERT],
-                           counts->partition[i][PARTITION_SPLIT] };
-    assert(counts->partition[i][PARTITION_NONE] == 0);
-    assert(counts->partition[i][PARTITION_HORZ] == 0);
-    assert(fc->partition_prob[i][PARTITION_NONE] == 0);
-    assert(fc->partition_prob[i][PARTITION_HORZ] == 0);
-    fc->partition_prob[i][PARTITION_VERT] =
-        av1_mode_mv_merge_probs(pre_fc->partition_prob[i][PARTITION_VERT], ct);
-  }
-  for (i = PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES;
-       i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) {
-    unsigned int ct[2] = { counts->partition[i][PARTITION_HORZ],
-                           counts->partition[i][PARTITION_SPLIT] };
-    assert(counts->partition[i][PARTITION_NONE] == 0);
-    assert(counts->partition[i][PARTITION_VERT] == 0);
-    assert(fc->partition_prob[i][PARTITION_NONE] == 0);
-    assert(fc->partition_prob[i][PARTITION_VERT] == 0);
-    fc->partition_prob[i][PARTITION_HORZ] =
-        av1_mode_mv_merge_probs(pre_fc->partition_prob[i][PARTITION_HORZ], ct);
-  }
-#endif
-  for (i = 0; i < DELTA_Q_PROBS; ++i)
-    fc->delta_q_prob[i] =
-        mode_mv_merge_probs(pre_fc->delta_q_prob[i], counts->delta_q[i]);
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-  for (i = 0; i < FRAME_LF_COUNT; ++i)
-    for (int j = 0; j < DELTA_LF_PROBS; ++j)
-      fc->delta_lf_multi_prob[i][j] = mode_mv_merge_probs(
-          pre_fc->delta_lf_multi_prob[i][j], counts->delta_lf_multi[i][j]);
-#endif  // CONFIG_LOOPFILTER_LEVEL
-  for (i = 0; i < DELTA_LF_PROBS; ++i)
-    fc->delta_lf_prob[i] =
-        mode_mv_merge_probs(pre_fc->delta_lf_prob[i], counts->delta_lf[i]);
-#endif  // CONFIG_EXT_DELTA_Q
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-  for (i = 0; i < INTRA_FILTERS + 1; ++i) {
-    aom_tree_merge_probs(av1_intra_filter_tree, pre_fc->intra_filter_probs[i],
-                         counts->intra_filter[i], fc->intra_filter_probs[i]);
-  }
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  for (i = 0; i < PLANE_TYPES; ++i) {
-    fc->filter_intra_probs[i] = av1_mode_mv_merge_probs(
-        pre_fc->filter_intra_probs[i], counts->filter_intra[i]);
-  }
-#endif  // CONFIG_FILTER_INTRA
+  mode_deltas[0] = 0;
+  mode_deltas[1] = 0;
 }
 
 static void set_default_lf_deltas(struct loopfilter *lf) {
   lf->mode_ref_delta_enabled = 1;
   lf->mode_ref_delta_update = 1;
 
-  lf->ref_deltas[INTRA_FRAME] = 1;
-  lf->ref_deltas[LAST_FRAME] = 0;
-#if CONFIG_EXT_REFS
-  lf->ref_deltas[LAST2_FRAME] = lf->ref_deltas[LAST_FRAME];
-  lf->ref_deltas[LAST3_FRAME] = lf->ref_deltas[LAST_FRAME];
-  lf->ref_deltas[BWDREF_FRAME] = lf->ref_deltas[LAST_FRAME];
-#endif  // CONFIG_EXT_REFS
-  lf->ref_deltas[GOLDEN_FRAME] = -1;
-#if CONFIG_EXT_REFS
-  lf->ref_deltas[ALTREF2_FRAME] = -1;
-#endif  // CONFIG_EXT_REFS
-  lf->ref_deltas[ALTREF_FRAME] = -1;
-
-  lf->mode_deltas[0] = 0;
-  lf->mode_deltas[1] = 0;
+  av1_set_default_ref_deltas(lf->ref_deltas);
+  av1_set_default_mode_deltas(lf->mode_deltas);
+}
 
-  av1_copy(lf->last_ref_deltas, lf->ref_deltas);
-  av1_copy(lf->last_mode_deltas, lf->mode_deltas);
+void av1_setup_frame_contexts(AV1_COMMON *cm) {
+  // Store the frame context into a special slot (not associated with any
+  // reference buffer), so that we can set up cm->pre_fc correctly later
+  // This function must ONLY be called when cm->fc has been initialized with
+  // default probs, either by av1_setup_past_independence or after manually
+  // initializing them
+  cm->frame_contexts[FRAME_CONTEXT_DEFAULTS] = *cm->fc;
+  if (cm->large_scale_tile) {
+    for (int i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
+  }
 }
 
 void av1_setup_past_independence(AV1_COMMON *cm) {
   // Reset the segment feature data to the default stats:
   // Features disabled, 0, with delta coding (Default state).
-  struct loopfilter *const lf = &cm->lf;
-
-  int i;
   av1_clearall_segfeatures(&cm->seg);
-  cm->seg.abs_delta = SEGMENT_DELTADATA;
 
-  if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
-    memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+  cm->current_frame_seg_map = cm->cur_frame->seg_map;
 
   if (cm->current_frame_seg_map)
     memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
 
-  // Reset the mode ref deltas for loop filter
-  av1_zero(lf->last_ref_deltas);
-  av1_zero(lf->last_mode_deltas);
-  set_default_lf_deltas(lf);
-
-  // To force update of the sharpness
-  lf->last_sharpness_level = -1;
+  // reset mode ref deltas
+  av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
+  av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
+  set_default_lf_deltas(&cm->lf);
 
   av1_default_coef_probs(cm);
   init_mode_probs(cm->fc);
   av1_init_mv_probs(cm);
-#if CONFIG_LV_MAP
   av1_init_lv_map(cm);
-#endif
-#if CONFIG_PVQ
-  av1_default_pvq_probs(cm);
-#endif  // CONFIG_PVQ
-#if CONFIG_ADAPT_SCAN
-  av1_init_scan_order(cm);
-#endif
-  av1_convolve_init(cm);
   cm->fc->initialized = 1;
-
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  if (cm->frame_type == KEY_FRAME) {
-    // Reset all frame contexts, as all reference frames will be lost.
-    for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
-  }
-#else
-  if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
-      cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
-    // Reset all frame contexts.
-    for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
-  } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-    // Reset the frame context of the first specified ref frame.
-    if (cm->frame_refs[0].idx >= 0) {
-      cm->frame_contexts[cm->frame_refs[0].idx] = *cm->fc;
-    }
-#else
-    // Reset only the frame context specified in the frame header.
-    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
-#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  }
-#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
+  av1_setup_frame_contexts(cm);
 
   // prev_mip will only be allocated in encoder.
-  if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode)
+  if (frame_is_intra_only(cm) && cm->prev_mip)
     memset(cm->prev_mip, 0,
-           cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  cm->frame_context_idx = 0;
-#endif  // !CONFIG_NO_FRAME_CONTEXT_SIGNALING
+           cm->mi_stride * cm->mi_rows * sizeof(*cm->prev_mip));
 }
diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h
index 3452241b0..0bd2e20a1 100644
--- a/third_party/aom/av1/common/entropymode.h
+++ b/third_party/aom/av1/common/entropymode.h
@@ -18,25 +18,16 @@
 #include "av1/common/seg_common.h"
 #include "aom_dsp/aom_filter.h"
 
-#if CONFIG_PVQ
-#include "av1/common/pvq.h"
-#include "av1/common/pvq_state.h"
-#include "av1/common/generic_code.h"
-#endif  // CONFIG_PVQ
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #define BLOCK_SIZE_GROUPS 4
 
-#define TX_SIZE_CONTEXTS 2
+#define TX_SIZE_CONTEXTS 3
 
 #define INTER_OFFSET(mode) ((mode)-NEARESTMV)
-#if CONFIG_COMPOUND_SINGLEREF
-#define INTER_SINGLEREF_COMP_OFFSET(mode) ((mode)-SR_NEAREST_NEARMV)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#define INTER_COMPOUND_OFFSET(mode) ((mode)-NEAREST_NEARESTMV)
+#define INTER_COMPOUND_OFFSET(mode) (uint8_t)((mode)-NEAREST_NEARESTMV)
 
 // Number of possible contexts for a color index.
 // As can be seen from av1_get_palette_color_index_context(), the possible
@@ -44,14 +35,6 @@ extern "C" {
 // a value from 0 to 4 using 'palette_color_index_context_lookup' table.
 #define PALETTE_COLOR_INDEX_CONTEXTS 5
 
-// Maximum number of colors in a palette.
-#define PALETTE_MAX_SIZE 8
-// Minimum number of colors in a palette.
-#define PALETTE_MIN_SIZE 2
-
-// Palette mode is available for block sizes >= 8x8.
-#define PALETTE_BLOCK_SIZES (BLOCK_LARGEST - BLOCK_8X8 + 1)
-
 // Palette Y mode context for a block is determined by number of neighboring
 // blocks (top and/or left) using a palette for Y plane. So, possible Y mode'
 // context values are:
@@ -66,11 +49,14 @@ extern "C" {
 // 1 if this block uses palette for Y plane (i.e. Y palette size > 0).
 #define PALETTE_UV_MODE_CONTEXTS 2
 
-#define PALETTE_MAX_BLOCK_SIZE (64 * 64)
+// Map the number of pixels in a block size to a context
+//   64(BLOCK_8X8, BLOCK_4x16, BLOCK_16X4)  -> 0
+//  128(BLOCK_8X16, BLOCK_16x8)             -> 1
+//   ...
+// 4096(BLOCK_64X64)                        -> 6
+#define PALATTE_BSIZE_CTXS 7
 
-#if CONFIG_KF_CTX
 #define KF_MODE_CONTEXTS 5
-#endif
 
 struct AV1Common;
 
@@ -80,643 +66,128 @@ typedef struct {
   const int16_t *neighbors;
 } SCAN_ORDER;
 
-struct seg_counts {
-  unsigned int tree_total[MAX_SEGMENTS];
-  unsigned int tree_mispred[MAX_SEGMENTS];
-  unsigned int pred[PREDICTION_PROBS][2];
-};
-
 typedef struct frame_contexts {
-  aom_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
-  aom_prob uv_mode_prob[INTRA_MODES][UV_INTRA_MODES - 1];
-#if CONFIG_EXT_PARTITION_TYPES
-  aom_prob partition_prob[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1];
-#else
-  aom_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
-#endif
-  coeff_cdf_model coef_tail_cdfs[TX_SIZES][PLANE_TYPES];
-  coeff_cdf_model coef_head_cdfs[TX_SIZES][PLANE_TYPES];
-#if CONFIG_ADAPT_SCAN
-// TODO(angiebird): try aom_prob
-#if CONFIG_CHROMA_2X2
-  uint32_t non_zero_prob_2x2[TX_TYPES][4];
-#endif
-  uint32_t non_zero_prob_4X4[TX_TYPES][16];
-  uint32_t non_zero_prob_8X8[TX_TYPES][64];
-  uint32_t non_zero_prob_16X16[TX_TYPES][256];
-  uint32_t non_zero_prob_32X32[TX_TYPES][1024];
-
-  uint32_t non_zero_prob_4X8[TX_TYPES][32];
-  uint32_t non_zero_prob_8X4[TX_TYPES][32];
-  uint32_t non_zero_prob_16X8[TX_TYPES][128];
-  uint32_t non_zero_prob_8X16[TX_TYPES][128];
-  uint32_t non_zero_prob_32X16[TX_TYPES][512];
-  uint32_t non_zero_prob_16X32[TX_TYPES][512];
-
-#if CONFIG_CHROMA_2X2
-  DECLARE_ALIGNED(16, int16_t, scan_2x2[TX_TYPES][4]);
-#endif
-  DECLARE_ALIGNED(16, int16_t, scan_4X4[TX_TYPES][16]);
-  DECLARE_ALIGNED(16, int16_t, scan_8X8[TX_TYPES][64]);
-  DECLARE_ALIGNED(16, int16_t, scan_16X16[TX_TYPES][256]);
-  DECLARE_ALIGNED(16, int16_t, scan_32X32[TX_TYPES][1024]);
-
-  DECLARE_ALIGNED(16, int16_t, scan_4X8[TX_TYPES][32]);
-  DECLARE_ALIGNED(16, int16_t, scan_8X4[TX_TYPES][32]);
-  DECLARE_ALIGNED(16, int16_t, scan_8X16[TX_TYPES][128]);
-  DECLARE_ALIGNED(16, int16_t, scan_16X8[TX_TYPES][128]);
-  DECLARE_ALIGNED(16, int16_t, scan_16X32[TX_TYPES][512]);
-  DECLARE_ALIGNED(16, int16_t, scan_32X16[TX_TYPES][512]);
-
-#if CONFIG_CHROMA_2X2
-  DECLARE_ALIGNED(16, int16_t, iscan_2x2[TX_TYPES][4]);
-#endif
-  DECLARE_ALIGNED(16, int16_t, iscan_4X4[TX_TYPES][16]);
-  DECLARE_ALIGNED(16, int16_t, iscan_8X8[TX_TYPES][64]);
-  DECLARE_ALIGNED(16, int16_t, iscan_16X16[TX_TYPES][256]);
-  DECLARE_ALIGNED(16, int16_t, iscan_32X32[TX_TYPES][1024]);
-
-  DECLARE_ALIGNED(16, int16_t, iscan_4X8[TX_TYPES][32]);
-  DECLARE_ALIGNED(16, int16_t, iscan_8X4[TX_TYPES][32]);
-  DECLARE_ALIGNED(16, int16_t, iscan_8X16[TX_TYPES][128]);
-  DECLARE_ALIGNED(16, int16_t, iscan_16X8[TX_TYPES][128]);
-  DECLARE_ALIGNED(16, int16_t, iscan_16X32[TX_TYPES][512]);
-  DECLARE_ALIGNED(16, int16_t, iscan_32X16[TX_TYPES][512]);
-
-#if CONFIG_CHROMA_2X2
-  int16_t nb_2x2[TX_TYPES][(4 + 1) * 2];
-#endif
-  int16_t nb_4X4[TX_TYPES][(16 + 1) * 2];
-  int16_t nb_8X8[TX_TYPES][(64 + 1) * 2];
-  int16_t nb_16X16[TX_TYPES][(256 + 1) * 2];
-  int16_t nb_32X32[TX_TYPES][(1024 + 1) * 2];
-
-  int16_t nb_4X8[TX_TYPES][(32 + 1) * 2];
-  int16_t nb_8X4[TX_TYPES][(32 + 1) * 2];
-  int16_t nb_8X16[TX_TYPES][(128 + 1) * 2];
-  int16_t nb_16X8[TX_TYPES][(128 + 1) * 2];
-  int16_t nb_16X32[TX_TYPES][(512 + 1) * 2];
-  int16_t nb_32X16[TX_TYPES][(512 + 1) * 2];
-
-  SCAN_ORDER sc[TX_SIZES_ALL][TX_TYPES];
-
-  int16_t eob_threshold[TX_SIZES_ALL][TX_TYPES][EOB_THRESHOLD_NUM];
-#endif  // CONFIG_ADAPT_SCAN
-
-#if CONFIG_LV_MAP
-  aom_prob txb_skip[TX_SIZES][TXB_SKIP_CONTEXTS];
-  aom_prob nz_map[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS];
-  aom_prob eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS];
-  aom_prob dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS];
-  aom_prob coeff_base[TX_SIZES][PLANE_TYPES][NUM_BASE_LEVELS]
-                     [COEFF_BASE_CONTEXTS];
-  aom_prob coeff_lps[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS];
-#if BR_NODE
-  aom_prob coeff_br[TX_SIZES][PLANE_TYPES][BASE_RANGE_SETS][LEVEL_CONTEXTS];
-#endif
-#if CONFIG_CTX1D
-  aom_prob eob_mode[TX_SIZES][PLANE_TYPES][TX_CLASSES];
-  aom_prob empty_line[TX_SIZES][PLANE_TYPES][TX_CLASSES][EMPTY_LINE_CONTEXTS];
-  aom_prob hv_eob[TX_SIZES][PLANE_TYPES][TX_CLASSES][HV_EOB_CONTEXTS];
-#endif  // CONFIG_CTX1D
-
-#if LV_MAP_PROB
   aom_cdf_prob txb_skip_cdf[TX_SIZES][TXB_SKIP_CONTEXTS][CDF_SIZE(2)];
-  aom_cdf_prob nz_map_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]
-                         [CDF_SIZE(2)];
-  aom_cdf_prob eob_flag_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]
-                           [CDF_SIZE(2)];
-  aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)];
-  aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][NUM_BASE_LEVELS]
-                             [COEFF_BASE_CONTEXTS][CDF_SIZE(2)];
-  aom_cdf_prob coeff_lps_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]
+  aom_cdf_prob eob_extra_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]
                             [CDF_SIZE(2)];
-#if BR_NODE
-  aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][BASE_RANGE_SETS]
-                           [LEVEL_CONTEXTS][CDF_SIZE(2)];
-#endif
-#if CONFIG_CTX1D
-  aom_cdf_prob eob_mode_cdf[TX_SIZES][PLANE_TYPES][TX_CLASSES][CDF_SIZE(2)];
-  aom_cdf_prob empty_line_cdf[TX_SIZES][PLANE_TYPES][TX_CLASSES]
-                             [EMPTY_LINE_CONTEXTS][CDF_SIZE(2)];
-  aom_cdf_prob hv_eob_cdf[TX_SIZES][PLANE_TYPES][TX_CLASSES][HV_EOB_CONTEXTS]
-                         [CDF_SIZE(2)];
-#endif  // CONFIG_CTX1D
-#endif  // LV_MAP_PROB
-#endif
+  aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)];
+  aom_cdf_prob eob_flag_cdf16[PLANE_TYPES][2][CDF_SIZE(5)];
+  aom_cdf_prob eob_flag_cdf32[PLANE_TYPES][2][CDF_SIZE(6)];
+  aom_cdf_prob eob_flag_cdf64[PLANE_TYPES][2][CDF_SIZE(7)];
+  aom_cdf_prob eob_flag_cdf128[PLANE_TYPES][2][CDF_SIZE(8)];
+  aom_cdf_prob eob_flag_cdf256[PLANE_TYPES][2][CDF_SIZE(9)];
+  aom_cdf_prob eob_flag_cdf512[PLANE_TYPES][2][CDF_SIZE(10)];
+  aom_cdf_prob eob_flag_cdf1024[PLANE_TYPES][2][CDF_SIZE(11)];
+  aom_cdf_prob coeff_base_eob_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB]
+                                 [CDF_SIZE(3)];
+  aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]
+                             [CDF_SIZE(4)];
+  aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]
+                           [CDF_SIZE(BR_CDF_SIZE)];
 
-  aom_prob newmv_prob[NEWMV_MODE_CONTEXTS];
-  aom_prob zeromv_prob[ZEROMV_MODE_CONTEXTS];
-  aom_prob refmv_prob[REFMV_MODE_CONTEXTS];
-  aom_prob drl_prob[DRL_MODE_CONTEXTS];
-#if CONFIG_NEW_MULTISYMBOL
   aom_cdf_prob newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)];
-  aom_cdf_prob zeromv_cdf[ZEROMV_MODE_CONTEXTS][CDF_SIZE(2)];
+  aom_cdf_prob zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)];
-#endif
 
-  aom_prob inter_compound_mode_probs[INTER_MODE_CONTEXTS]
-                                    [INTER_COMPOUND_MODES - 1];
   aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS]
                                       [CDF_SIZE(INTER_COMPOUND_MODES)];
-#if CONFIG_COMPOUND_SINGLEREF
-  aom_prob inter_singleref_comp_mode_probs[INTER_MODE_CONTEXTS]
-                                          [INTER_SINGLEREF_COMP_MODES - 1];
-  aom_cdf_prob inter_singleref_comp_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE(
-      INTER_SINGLEREF_COMP_MODES)];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  aom_prob compound_type_prob[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1];
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-  aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES)];
-#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-#if CONFIG_INTERINTRA
-  aom_prob interintra_prob[BLOCK_SIZE_GROUPS];
-  aom_prob wedge_interintra_prob[BLOCK_SIZES_ALL];
-  aom_prob interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1];
-#if CONFIG_NEW_MULTISYMBOL
+  aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)];
+  aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)];
   aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)];
   aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)];
-#endif
   aom_cdf_prob interintra_mode_cdf[BLOCK_SIZE_GROUPS]
                                   [CDF_SIZE(INTERINTRA_MODES)];
-#endif  // CONFIG_INTERINTRA
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  aom_prob motion_mode_prob[BLOCK_SIZES_ALL][MOTION_MODES - 1];
   aom_cdf_prob motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)];
-#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR
-  aom_prob ncobmc_mode_prob[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES - 1];
-  aom_cdf_prob ncobmc_mode_cdf[ADAPT_OVERLAP_BLOCKS]
-                              [CDF_SIZE(MAX_NCOBMC_MODES)];
-#endif
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  aom_prob ncobmc_prob[BLOCK_SIZES_ALL][OBMC_FAMILY_MODES - 1];
-  aom_cdf_prob ncobmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(OBMC_FAMILY_MODES)];
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-  aom_prob obmc_prob[BLOCK_SIZES_ALL];
-#if CONFIG_NEW_MULTISYMBOL || CONFIG_NCOBMC_ADAPT_WEIGHT
   aom_cdf_prob obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)];
-#endif  // CONFIG_NEW_MULTISYMBOL
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  aom_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
-  aom_prob comp_inter_prob[COMP_INTER_CONTEXTS];
-  aom_cdf_prob palette_y_size_cdf[PALETTE_BLOCK_SIZES][CDF_SIZE(PALETTE_SIZES)];
-  aom_cdf_prob palette_uv_size_cdf[PALETTE_BLOCK_SIZES]
-                                  [CDF_SIZE(PALETTE_SIZES)];
+  aom_cdf_prob palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)];
+  aom_cdf_prob palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)];
   aom_cdf_prob palette_y_color_index_cdf[PALETTE_SIZES]
                                         [PALETTE_COLOR_INDEX_CONTEXTS]
                                         [CDF_SIZE(PALETTE_COLORS)];
   aom_cdf_prob palette_uv_color_index_cdf[PALETTE_SIZES]
                                          [PALETTE_COLOR_INDEX_CONTEXTS]
                                          [CDF_SIZE(PALETTE_COLORS)];
-#if CONFIG_MRC_TX
-  aom_cdf_prob mrc_mask_inter_cdf[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
-                                 [CDF_SIZE(PALETTE_COLORS)];
-  aom_cdf_prob mrc_mask_intra_cdf[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
-                                 [CDF_SIZE(PALETTE_COLORS)];
-#endif  // CONFIG_MRC_TX
-#if CONFIG_NEW_MULTISYMBOL
-  aom_cdf_prob palette_y_mode_cdf[PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS]
+  aom_cdf_prob palette_y_mode_cdf[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS]
                                  [CDF_SIZE(2)];
   aom_cdf_prob palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)];
-#endif
-#if CONFIG_EXT_COMP_REFS
-  aom_prob comp_ref_type_prob[COMP_REF_TYPE_CONTEXTS];
-  aom_prob uni_comp_ref_prob[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1];
-#if CONFIG_NEW_MULTISYMBOL
   aom_cdf_prob comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
                                [CDF_SIZE(2)];
-#endif  // CONFIG_NEW_MULTISYMBOL
-#endif  // CONFIG_EXT_COMP_REFS
-  aom_prob single_ref_prob[REF_CONTEXTS][SINGLE_REFS - 1];
-#if CONFIG_EXT_REFS
-  aom_prob comp_ref_prob[REF_CONTEXTS][FWD_REFS - 1];
-  aom_prob comp_bwdref_prob[REF_CONTEXTS][BWD_REFS - 1];
-#else
-  aom_prob comp_ref_prob[REF_CONTEXTS][COMP_REFS - 1];
-#endif  // CONFIG_EXT_REFS
-#if CONFIG_NEW_MULTISYMBOL
-#if CONFIG_EXT_REFS
   aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)];
   aom_cdf_prob comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)];
-#else
-  aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][COMP_REFS - 1][CDF_SIZE(2)];
-#endif  // CONFIG_EXT_REFS
-#endif
-#if CONFIG_COMPOUND_SINGLEREF
-  aom_prob comp_inter_mode_prob[COMP_INTER_MODE_CONTEXTS];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-  aom_prob quarter_tx_size_prob;
-#if CONFIG_NEW_MULTISYMBOL
-  aom_cdf_prob quarter_tx_size_cdf[CDF_SIZE(2)];
-#endif
-#endif
-#if CONFIG_VAR_TX
-  aom_prob txfm_partition_prob[TXFM_PARTITION_CONTEXTS];
-#if CONFIG_NEW_MULTISYMBOL
   aom_cdf_prob txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)];
-#endif
-#endif  // CONFIG_VAR_TX
-  aom_prob skip_probs[SKIP_CONTEXTS];
-#if CONFIG_NEW_MULTISYMBOL
+  aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)];
+  aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)];
+  aom_cdf_prob skip_mode_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
   aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)];
-#endif
-  nmv_context nmvc[NMV_CONTEXTS];
-#if CONFIG_INTRABC
+  nmv_context nmvc;
   nmv_context ndvc;
   aom_cdf_prob intrabc_cdf[CDF_SIZE(2)];
-#endif
-  int initialized;
-#if CONFIG_SUPERTX
-  aom_prob supertx_prob[PARTITION_SUPERTX_CONTEXTS][TX_SIZES];
-#endif  // CONFIG_SUPERTX
   struct segmentation_probs seg;
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-  aom_prob intra_filter_probs[INTRA_FILTERS + 1][INTRA_FILTERS - 1];
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  aom_prob filter_intra_probs[PLANE_TYPES];
-#endif  // CONFIG_FILTER_INTRA
-#if CONFIG_LOOP_RESTORATION
-  aom_prob switchable_restore_prob[RESTORE_SWITCHABLE_TYPES - 1];
-#endif  // CONFIG_LOOP_RESTORATION
+  aom_cdf_prob filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)];
+  aom_cdf_prob filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)];
+  aom_cdf_prob switchable_restore_cdf[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)];
+  aom_cdf_prob wiener_restore_cdf[CDF_SIZE(2)];
+  aom_cdf_prob sgrproj_restore_cdf[CDF_SIZE(2)];
   aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)];
-  aom_cdf_prob uv_mode_cdf[INTRA_MODES][CDF_SIZE(UV_INTRA_MODES)];
-#if CONFIG_EXT_PARTITION_TYPES
+  aom_cdf_prob uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES]
+                          [CDF_SIZE(UV_INTRA_MODES)];
   aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)];
-#else
-  aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(PARTITION_TYPES)];
-#endif
   aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS]
                                     [CDF_SIZE(SWITCHABLE_FILTERS)];
-/* kf_y_cdf is discarded after use, so does not require persistent storage.
-   However, we keep it with the other CDFs in this struct since it needs to
-   be copied to each tile to support parallelism just like the others.
-*/
-#if CONFIG_KF_CTX
+  /* kf_y_cdf is discarded after use, so does not require persistent storage.
+     However, we keep it with the other CDFs in this struct since it needs to
+     be copied to each tile to support parallelism just like the others.
+  */
   aom_cdf_prob kf_y_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS]
                        [CDF_SIZE(INTRA_MODES)];
-#else
-  aom_cdf_prob kf_y_cdf[INTRA_MODES][INTRA_MODES][CDF_SIZE(INTRA_MODES)];
-#endif
-  aom_cdf_prob tx_size_cdf[MAX_TX_DEPTH][TX_SIZE_CONTEXTS]
+
+  aom_cdf_prob angle_delta_cdf[DIRECTIONAL_MODES]
+                              [CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)];
+
+  aom_cdf_prob tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS]
                           [CDF_SIZE(MAX_TX_DEPTH + 1)];
   aom_cdf_prob delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)];
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
   aom_cdf_prob delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)];
-#endif  // CONFIG_LOOPFILTER_LEVEL
   aom_cdf_prob delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)];
-#endif
-#if CONFIG_EXT_TX
   aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
                                [CDF_SIZE(TX_TYPES)];
   aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES]
                                [CDF_SIZE(TX_TYPES)];
-#else
-  aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SIZES][TX_TYPES][CDF_SIZE(TX_TYPES)];
-  aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)];
-#endif  // CONFIG_EXT_TX
-#if CONFIG_LGT_FROM_PRED
-  aom_prob intra_lgt_prob[LGT_SIZES][INTRA_MODES];
-  aom_prob inter_lgt_prob[LGT_SIZES];
-#endif  // CONFIG_LGT_FROM_PRED
-#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-  aom_cdf_prob intra_filter_cdf[INTRA_FILTERS + 1][CDF_SIZE(INTRA_FILTERS)];
-#endif  // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-  aom_prob delta_q_prob[DELTA_Q_PROBS];
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-  aom_prob delta_lf_multi_prob[FRAME_LF_COUNT][DELTA_LF_PROBS];
-#endif  // CONFIG_LOOPFILTER_LEVEL
-  aom_prob delta_lf_prob[DELTA_LF_PROBS];
-#endif
-#if CONFIG_PVQ
-  // TODO(any): If PVQ is enabled, most of coefficient related cdf,
-  // such as coef_cdfs[], coef_tail_cdfs[], and coef_heaf_cdfs[] can be removed.
-  od_adapt_ctx pvq_context;
-#endif  // CONFIG_PVQ
-#if CONFIG_CFL
   aom_cdf_prob cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)];
   aom_cdf_prob cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)];
-#endif
-#if CONFIG_LPF_SB
-  aom_cdf_prob lpf_reuse_cdf[LPF_REUSE_CONTEXT][CDF_SIZE(2)];
-  aom_cdf_prob lpf_delta_cdf[LPF_DELTA_CONTEXT][CDF_SIZE(DELTA_RANGE)];
-  aom_cdf_prob lpf_sign_cdf[LPF_REUSE_CONTEXT][LPF_SIGN_CONTEXT][CDF_SIZE(2)];
-#endif  // CONFIG_LPF_SB
+  int initialized;
 } FRAME_CONTEXT;
 
-typedef struct FRAME_COUNTS {
-// Note: This structure should only contain 'unsigned int' fields, or
-// aggregates built solely from 'unsigned int' fields/elements
-#if CONFIG_ENTROPY_STATS
-  unsigned int kf_y_mode[INTRA_MODES][INTRA_MODES][INTRA_MODES];
-  unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
-  unsigned int uv_mode[INTRA_MODES][UV_INTRA_MODES];
-#endif  // CONFIG_ENTROPY_STATS
-#if CONFIG_EXT_PARTITION_TYPES
-  unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
-#else
-  unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
-#endif
-  unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
-                                [SWITCHABLE_FILTERS];
-#if CONFIG_ADAPT_SCAN
-#if CONFIG_CHROMA_2X2
-  unsigned int non_zero_count_2x2[TX_TYPES][4];
-#endif  // CONFIG_CHROMA_2X2
-  unsigned int non_zero_count_4X4[TX_TYPES][16];
-  unsigned int non_zero_count_8X8[TX_TYPES][64];
-  unsigned int non_zero_count_16X16[TX_TYPES][256];
-  unsigned int non_zero_count_32X32[TX_TYPES][1024];
-
-  unsigned int non_zero_count_4x8[TX_TYPES][32];
-  unsigned int non_zero_count_8x4[TX_TYPES][32];
-  unsigned int non_zero_count_8x16[TX_TYPES][128];
-  unsigned int non_zero_count_16x8[TX_TYPES][128];
-  unsigned int non_zero_count_16x32[TX_TYPES][512];
-  unsigned int non_zero_count_32x16[TX_TYPES][512];
-
-  unsigned int txb_count[TX_SIZES_ALL][TX_TYPES];
-#endif  // CONFIG_ADAPT_SCAN
-
-#if CONFIG_LV_MAP
-  unsigned int txb_skip[TX_SIZES][TXB_SKIP_CONTEXTS][2];
-  unsigned int nz_map[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS][2];
-  unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2];
-  unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2];
-  unsigned int coeff_base[TX_SIZES][PLANE_TYPES][NUM_BASE_LEVELS]
-                         [COEFF_BASE_CONTEXTS][2];
-  unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS][2];
-  unsigned int coeff_br[TX_SIZES][PLANE_TYPES][BASE_RANGE_SETS][LEVEL_CONTEXTS]
-                       [2];
-#if CONFIG_CTX1D
-  unsigned int eob_mode[TX_SIZES][PLANE_TYPES][TX_CLASSES][2];
-  unsigned int empty_line[TX_SIZES][PLANE_TYPES][TX_CLASSES]
-                         [EMPTY_LINE_CONTEXTS][2];
-  unsigned int hv_eob[TX_SIZES][PLANE_TYPES][TX_CLASSES][HV_EOB_CONTEXTS][2];
-#endif  // CONFIG_CTX1D
-#endif  // CONFIG_LV_MAP
-
-#if CONFIG_SYMBOLRATE
-  unsigned int coeff_num[2];   // 0: zero coeff 1: non-zero coeff
-  unsigned int symbol_num[2];  // 0: entropy symbol 1: non-entropy symbol
-#endif
-
-  unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
-  unsigned int zeromv_mode[ZEROMV_MODE_CONTEXTS][2];
-  unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
-  unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
-
-  unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
-#if CONFIG_COMPOUND_SINGLEREF
-  unsigned int inter_singleref_comp_mode[INTER_MODE_CONTEXTS]
-                                        [INTER_SINGLEREF_COMP_MODES];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_INTERINTRA
-  unsigned int interintra[BLOCK_SIZE_GROUPS][2];
-  unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
-  unsigned int wedge_interintra[BLOCK_SIZES_ALL][2];
-#endif  // CONFIG_INTERINTRA
-  unsigned int compound_interinter[BLOCK_SIZES_ALL][COMPOUND_TYPES];
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES];
-#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR
-  unsigned int ncobmc_mode[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES];
-#endif
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  unsigned int ncobmc[BLOCK_SIZES_ALL][OBMC_FAMILY_MODES];
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-  unsigned int obmc[BLOCK_SIZES_ALL][2];
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
-  unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
-#if CONFIG_EXT_COMP_REFS
-  unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2];
-  unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2];
-#endif  // CONFIG_EXT_COMP_REFS
-  unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2];
-#if CONFIG_EXT_REFS
-  unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2];
-  unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2];
-#else
-  unsigned int comp_ref[REF_CONTEXTS][COMP_REFS - 1][2];
-#endif  // CONFIG_EXT_REFS
-#if CONFIG_COMPOUND_SINGLEREF
-  unsigned int comp_inter_mode[COMP_INTER_MODE_CONTEXTS][2];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  // TODO(urvang): Only needed for !CONFIG_VAR_TX case. So can be removed when
-  // CONFIG_VAR_TX flag is removed.
-  unsigned int tx_size[MAX_TX_DEPTH][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-  unsigned int quarter_tx_size[2];
-#endif
-#if CONFIG_VAR_TX
-  unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
-#endif
-  unsigned int skip[SKIP_CONTEXTS][2];
-  nmv_context_counts mv[NMV_CONTEXTS];
-#if CONFIG_INTRABC
-  unsigned int intrabc[2];
-  nmv_context_counts dv;
-#endif
-#if CONFIG_LGT_FROM_PRED
-  unsigned int intra_lgt[LGT_SIZES][INTRA_MODES][2];
-  unsigned int inter_lgt[LGT_SIZES][2];
-#endif  // CONFIG_LGT_FROM_PRED
-  unsigned int delta_q[DELTA_Q_PROBS][2];
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
-  unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2];
-#endif  // CONFIG_LOOPFILTER_LEVEL
-  unsigned int delta_lf[DELTA_LF_PROBS][2];
-#endif
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-  unsigned int tx_size_implied[TX_SIZES][TX_SIZES];
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
-#if CONFIG_ENTROPY_STATS
-#if CONFIG_EXT_TX
-  unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
-  unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
-                           [TX_TYPES];
-#else
-  unsigned int intra_ext_tx[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
-  unsigned int inter_ext_tx[EXT_TX_SIZES][TX_TYPES];
-#endif  // CONFIG_EXT_TX
-#endif  // CONFIG_ENTROPY_STATS
-#if CONFIG_SUPERTX
-  unsigned int supertx[PARTITION_SUPERTX_CONTEXTS][TX_SIZES][2];
-  unsigned int supertx_size[TX_SIZES];
-#endif  // CONFIG_SUPERTX
-  struct seg_counts seg;
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-  unsigned int intra_filter[INTRA_FILTERS + 1][INTRA_FILTERS];
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  unsigned int filter_intra[PLANE_TYPES][2];
-#endif  // CONFIG_FILTER_INTRA
-#if CONFIG_LPF_SB
-  unsigned int lpf_reuse[LPF_REUSE_CONTEXT][2];
-  unsigned int lpf_delta[LPF_DELTA_CONTEXT][DELTA_RANGE];
-  unsigned int lpf_sign[LPF_SIGN_CONTEXT][2];
-#endif  // CONFIG_LPF_SB
-} FRAME_COUNTS;
-
-#if CONFIG_KF_CTX
-extern const aom_cdf_prob default_kf_y_mode_cdf[KF_MODE_CONTEXTS]
-                                               [KF_MODE_CONTEXTS]
-                                               [CDF_SIZE(INTRA_MODES)];
-#else
-extern const aom_cdf_prob default_kf_y_mode_cdf[INTRA_MODES][INTRA_MODES]
-                                               [CDF_SIZE(INTRA_MODES)];
-#endif
-
-extern const aom_prob av1_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES]
-                                                     [PALETTE_Y_MODE_CONTEXTS];
-extern const aom_prob
-    av1_default_palette_uv_mode_prob[PALETTE_UV_MODE_CONTEXTS];
-
-#if CONFIG_EXT_TX
 static const int av1_ext_tx_ind[EXT_TX_SET_TYPES][TX_TYPES] = {
-  {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  },
-#if CONFIG_MRC_TX
-  {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-  },
-  {
-      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
-  },
-#endif  // CONFIG_MRC_TX
-  {
-      1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0,
-  },
-  {
-      3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0,
-  },
-  {
-      7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6,
-  },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0 },
+  { 3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0 },
+  { 7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6 },
 };
 
 static const int av1_ext_tx_inv[EXT_TX_SET_TYPES][TX_TYPES] = {
-  {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  },
-#if CONFIG_MRC_TX
-  {
-      0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      9, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  },
-#endif  // CONFIG_MRC_TX
-  {
-      9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  },
-  {
-      9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0,
-  },
-  {
-      9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 4, 5, 3, 6, 7, 8,
-  },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0 },
+  { 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 4, 5, 3, 6, 7, 8 },
 };
-#else
-#if CONFIG_MRC_TX
-static const int av1_ext_tx_ind[TX_TYPES] = {
-  0, 3, 4, 2, 1,
-};
-static const int av1_ext_tx_inv[TX_TYPES] = {
-  0, 4, 3, 1, 2,
-};
-#else
-static const int av1_ext_tx_ind[TX_TYPES] = {
-  0, 2, 3, 1,
-};
-static const int av1_ext_tx_inv[TX_TYPES] = {
-  0, 3, 1, 2,
-};
-#endif  // CONFIG_MRC_TX
-#endif  // CONFIG_EXT_TX
-
-#if CONFIG_INTERINTRA
-extern const aom_tree_index
-    av1_interintra_mode_tree[TREE_SIZE(INTERINTRA_MODES)];
-#endif
-extern const aom_tree_index
-    av1_inter_compound_mode_tree[TREE_SIZE(INTER_COMPOUND_MODES)];
-#if CONFIG_COMPOUND_SINGLEREF
-extern const aom_tree_index
-    av1_inter_singleref_comp_mode_tree[TREE_SIZE(INTER_SINGLEREF_COMP_MODES)];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-extern const aom_tree_index av1_compound_type_tree[TREE_SIZE(COMPOUND_TYPES)];
-extern const aom_tree_index av1_partition_tree[TREE_SIZE(PARTITION_TYPES)];
-#if CONFIG_EXT_PARTITION_TYPES
-extern const aom_tree_index
-    av1_ext_partition_tree[TREE_SIZE(EXT_PARTITION_TYPES)];
-#endif
-extern const aom_tree_index
-    av1_palette_color_index_tree[PALETTE_SIZES][TREE_SIZE(PALETTE_COLORS)];
-#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-extern const aom_tree_index av1_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)];
-#endif  // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-#if CONFIG_EXT_TX
-extern const aom_tree_index av1_ext_tx_tree[EXT_TX_SET_TYPES]
-                                           [TREE_SIZE(TX_TYPES)];
-#else
-extern const aom_tree_index av1_ext_tx_tree[TREE_SIZE(TX_TYPES)];
-#endif  // CONFIG_EXT_TX
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-extern const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)];
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-extern const aom_tree_index av1_ncobmc_mode_tree[TREE_SIZE(MAX_NCOBMC_MODES)];
-#if CONFIG_WARPED_MOTION
-extern const aom_tree_index av1_ncobmc_tree[TREE_SIZE(OBMC_FAMILY_MODES)];
-#endif  // CONFIG_WARPED_MOTION
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#if CONFIG_LOOP_RESTORATION
-#define RESTORE_NONE_SGRPROJ_PROB 64
-#define RESTORE_NONE_BILATERAL_PROB 16
-#define RESTORE_NONE_WIENER_PROB 64
-#define RESTORE_NONE_DOMAINTXFMRF_PROB 64
-extern const aom_tree_index
-    av1_switchable_restore_tree[TREE_SIZE(RESTORE_SWITCHABLE_TYPES)];
-#endif  // CONFIG_LOOP_RESTORATION
 
+void av1_set_default_ref_deltas(int8_t *ref_deltas);
+void av1_set_default_mode_deltas(int8_t *mode_deltas);
+void av1_setup_frame_contexts(struct AV1Common *cm);
 void av1_setup_past_independence(struct AV1Common *cm);
 
-void av1_adapt_intra_frame_probs(struct AV1Common *cm);
-void av1_adapt_inter_frame_probs(struct AV1Common *cm);
-
 static INLINE int av1_ceil_log2(int n) {
+  if (n < 2) return 0;
   int i = 1, p = 2;
   while (p < n) {
     i++;
diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c
index 2d0191366..446aa433c 100644
--- a/third_party/aom/av1/common/entropymv.c
+++ b/third_party/aom/av1/common/entropymv.c
@@ -12,100 +12,51 @@
 #include "av1/common/onyxc_int.h"
 #include "av1/common/entropymv.h"
 
-// Integer pel reference mv threshold for use of high-precision 1/8 mv
-#define COMPANDED_MVREF_THRESH 8
-
-const aom_tree_index av1_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
-  -MV_JOINT_ZERO, 2, -MV_JOINT_HNZVZ, 4, -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
-};
-
-/* clang-format off */
-const aom_tree_index av1_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
-  -MV_CLASS_0, 2,
-  -MV_CLASS_1, 4,
-  6, 8,
-  -MV_CLASS_2, -MV_CLASS_3,
-  10, 12,
-  -MV_CLASS_4, -MV_CLASS_5,
-  -MV_CLASS_6, 14,
-  16, 18,
-  -MV_CLASS_7, -MV_CLASS_8,
-  -MV_CLASS_9, -MV_CLASS_10,
-};
-/* clang-format on */
-
-const aom_tree_index av1_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
-  -0, -1,
-};
-
-const aom_tree_index av1_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2,  -1,
-                                                               4,  -2, -3 };
-
 static const nmv_context default_nmv_context = {
-  { 32, 64, 96 },  // joints
-  { AOM_ICDF(4096), AOM_ICDF(11264), AOM_ICDF(19328), AOM_ICDF(32768),
-    0 },  // joint_cdf
+  { AOM_CDF4(4096, 11264, 19328) },  // joints_cdf
   { {
         // Vertical component
-        128,                                                   // sign
-        { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 },  // class
-        { AOM_ICDF(28672), AOM_ICDF(30976), AOM_ICDF(31858), AOM_ICDF(32320),
-          AOM_ICDF(32551), AOM_ICDF(32656), AOM_ICDF(32740), AOM_ICDF(32757),
-          AOM_ICDF(32762), AOM_ICDF(32767), AOM_ICDF(32768), 0 },  // class_cdf
-        { 216 },                                                   // class0
-        { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 },      // bits
-        { { 128, 128, 64 }, { 96, 112, 64 } },                     // class0_fp
-        { 64, 96, 64 },                                            // fp
-        { { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(26624), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(12288), AOM_ICDF(21248), AOM_ICDF(24128), AOM_ICDF(32768),
-            0 } },  // class0_fp_cdf
-        { AOM_ICDF(8192), AOM_ICDF(17408), AOM_ICDF(21248), AOM_ICDF(32768),
-          0 },  // fp_cdf
-        160,    // class0_hp bit
-        128,    // hp
-#if CONFIG_NEW_MULTISYMBOL
-        { AOM_ICDF(160 * 128), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(216 * 128), AOM_ICDF(32768), 0 },
-        { { AOM_ICDF(128 * 196), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(128 * 198), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(128 * 208), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(128 * 224), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(128 * 245), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 } },  // bits_cdf
-#endif
+        { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757,
+                    32762, 32767) },  // class_cdf // fp
+        { { AOM_CDF4(16384, 24576, 26624) },
+          { AOM_CDF4(12288, 21248, 24128) } },  // class0_fp_cdf
+        { AOM_CDF4(8192, 17408, 21248) },       // fp_cdf
+        { AOM_CDF2(128 * 128) },                // sign_cdf
+        { AOM_CDF2(160 * 128) },                // class0_hp_cdf
+        { AOM_CDF2(128 * 128) },                // hp_cdf
+        { AOM_CDF2(216 * 128) },                // class0_cdf
+        { { AOM_CDF2(128 * 136) },
+          { AOM_CDF2(128 * 140) },
+          { AOM_CDF2(128 * 148) },
+          { AOM_CDF2(128 * 160) },
+          { AOM_CDF2(128 * 176) },
+          { AOM_CDF2(128 * 192) },
+          { AOM_CDF2(128 * 224) },
+          { AOM_CDF2(128 * 234) },
+          { AOM_CDF2(128 * 234) },
+          { AOM_CDF2(128 * 240) } },  // bits_cdf
     },
     {
         // Horizontal component
-        128,                                                   // sign
-        { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 },  // class
-        { AOM_ICDF(28672), AOM_ICDF(30976), AOM_ICDF(31858), AOM_ICDF(32320),
-          AOM_ICDF(32551), AOM_ICDF(32656), AOM_ICDF(32740), AOM_ICDF(32757),
-          AOM_ICDF(32762), AOM_ICDF(32767), AOM_ICDF(32768), 0 },  // class_cdf
-        { 208 },                                                   // class0
-        { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 },      // bits
-        { { 128, 128, 64 }, { 96, 112, 64 } },                     // class0_fp
-        { 64, 96, 64 },                                            // fp
-        { { AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(26624), AOM_ICDF(32768),
-            0 },
-          { AOM_ICDF(12288), AOM_ICDF(21248), AOM_ICDF(24128), AOM_ICDF(32768),
-            0 } },  // class0_fp_cdf
-        { AOM_ICDF(8192), AOM_ICDF(17408), AOM_ICDF(21248), AOM_ICDF(32768),
-          0 },  // fp_cdf
-        160,    // class0_hp bit
-        128,    // hp
-#if CONFIG_NEW_MULTISYMBOL
-        { AOM_ICDF(160 * 128), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(128 * 128), AOM_ICDF(32768), 0 },
-        { AOM_ICDF(216 * 128), AOM_ICDF(32768), 0 },
-        { { AOM_ICDF(128 * 196), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(128 * 198), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(128 * 208), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(128 * 224), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(128 * 245), AOM_ICDF(32768), 0 },
-          { AOM_ICDF(128 * 240), AOM_ICDF(32768), 0 } },  // bits_cdf
-#endif
+        { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757,
+                    32762, 32767) },  // class_cdf // fp
+        { { AOM_CDF4(16384, 24576, 26624) },
+          { AOM_CDF4(12288, 21248, 24128) } },  // class0_fp_cdf
+        { AOM_CDF4(8192, 17408, 21248) },       // fp_cdf
+        { AOM_CDF2(128 * 128) },                // sign_cdf
+        { AOM_CDF2(160 * 128) },                // class0_hp_cdf
+        { AOM_CDF2(128 * 128) },                // hp_cdf
+        { AOM_CDF2(216 * 128) },                // class0_cdf
+        { { AOM_CDF2(128 * 136) },
+          { AOM_CDF2(128 * 140) },
+          { AOM_CDF2(128 * 148) },
+          { AOM_CDF2(128 * 160) },
+          { AOM_CDF2(128 * 176) },
+          { AOM_CDF2(128 * 192) },
+          { AOM_CDF2(128 * 224) },
+          { AOM_CDF2(128 * 234) },
+          { AOM_CDF2(128 * 234) },
+          { AOM_CDF2(128 * 240) } },  // bits_cdf
     } },
 };
 
@@ -164,104 +115,8 @@ MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) {
   return c;
 }
 
-static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr,
-                             MvSubpelPrecision precision) {
-  int s, z, c, o, d, e, f;
-  assert(v != 0); /* should not be zero */
-  s = v < 0;
-  comp_counts->sign[s] += incr;
-  z = (s ? -v : v) - 1; /* magnitude - 1 */
-
-  c = av1_get_mv_class(z, &o);
-  comp_counts->classes[c] += incr;
-
-  d = (o >> 3);     /* int mv data */
-  f = (o >> 1) & 3; /* fractional pel mv data */
-  e = (o & 1);      /* high precision mv data */
-
-  if (c == MV_CLASS_0) {
-    comp_counts->class0[d] += incr;
-#if CONFIG_INTRABC || CONFIG_AMVR
-    if (precision > MV_SUBPEL_NONE)
-#endif
-      comp_counts->class0_fp[d][f] += incr;
-    if (precision > MV_SUBPEL_LOW_PRECISION) comp_counts->class0_hp[e] += incr;
-  } else {
-    int i;
-    int b = c + CLASS0_BITS - 1;  // number of bits
-    for (i = 0; i < b; ++i) comp_counts->bits[i][((d >> i) & 1)] += incr;
-#if CONFIG_INTRABC || CONFIG_AMVR
-    if (precision > MV_SUBPEL_NONE)
-#endif
-      comp_counts->fp[f] += incr;
-    if (precision > MV_SUBPEL_LOW_PRECISION) comp_counts->hp[e] += incr;
-  }
-}
-
-void av1_inc_mv(const MV *mv, nmv_context_counts *counts,
-                MvSubpelPrecision precision) {
-  if (counts != NULL) {
-    const MV_JOINT_TYPE j = av1_get_mv_joint(mv);
-    ++counts->joints[j];
-
-    if (mv_joint_vertical(j))
-      inc_mv_component(mv->row, &counts->comps[0], 1, precision);
-
-    if (mv_joint_horizontal(j))
-      inc_mv_component(mv->col, &counts->comps[1], 1, precision);
-  }
-}
-
-void av1_adapt_mv_probs(AV1_COMMON *cm, int allow_hp) {
-  int i, j;
-  int idx;
-  for (idx = 0; idx < NMV_CONTEXTS; ++idx) {
-    nmv_context *nmvc = &cm->fc->nmvc[idx];
-    const nmv_context *pre_nmvc = &cm->pre_fc->nmvc[idx];
-    const nmv_context_counts *counts = &cm->counts.mv[idx];
-    aom_tree_merge_probs(av1_mv_joint_tree, pre_nmvc->joints, counts->joints,
-                         nmvc->joints);
-    for (i = 0; i < 2; ++i) {
-      nmv_component *comp = &nmvc->comps[i];
-      const nmv_component *pre_comp = &pre_nmvc->comps[i];
-      const nmv_component_counts *c = &counts->comps[i];
-
-      comp->sign = av1_mode_mv_merge_probs(pre_comp->sign, c->sign);
-      aom_tree_merge_probs(av1_mv_class_tree, pre_comp->classes, c->classes,
-                           comp->classes);
-      aom_tree_merge_probs(av1_mv_class0_tree, pre_comp->class0, c->class0,
-                           comp->class0);
-
-      for (j = 0; j < MV_OFFSET_BITS; ++j)
-        comp->bits[j] = av1_mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
-#if CONFIG_AMVR
-      if (cm->cur_frame_mv_precision_level == 0) {
-#endif
-        for (j = 0; j < CLASS0_SIZE; ++j)
-          aom_tree_merge_probs(av1_mv_fp_tree, pre_comp->class0_fp[j],
-                               c->class0_fp[j], comp->class0_fp[j]);
-
-        aom_tree_merge_probs(av1_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
-
-        if (allow_hp) {
-          comp->class0_hp =
-              av1_mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
-          comp->hp = av1_mode_mv_merge_probs(pre_comp->hp, c->hp);
-        }
-#if CONFIG_AMVR
-      }
-#endif
-    }
-  }
-}
-
 void av1_init_mv_probs(AV1_COMMON *cm) {
-  int i;
-  for (i = 0; i < NMV_CONTEXTS; ++i) {
-    // NB: this sets CDFs too
-    cm->fc->nmvc[i] = default_nmv_context;
-  }
-#if CONFIG_INTRABC
+  // NB: this sets CDFs too
+  cm->fc->nmvc = default_nmv_context;
   cm->fc->ndvc = default_nmv_context;
-#endif  // CONFIG_INTRABC
 }
diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h
index 9ce089f7d..02ca7b66b 100644
--- a/third_party/aom/av1/common/entropymv.h
+++ b/third_party/aom/av1/common/entropymv.h
@@ -12,7 +12,7 @@
 #ifndef AV1_COMMON_ENTROPYMV_H_
 #define AV1_COMMON_ENTROPYMV_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "aom_dsp/prob.h"
 
@@ -26,8 +26,6 @@ struct AV1Common;
 
 void av1_init_mv_probs(struct AV1Common *cm);
 
-void av1_adapt_mv_probs(struct AV1Common *cm, int usehp);
-
 #define MV_UPDATE_PROB 252
 
 /* Symbols for coding which components are zero jointly */
@@ -66,9 +64,7 @@ typedef enum {
 #define CLASS0_BITS 1 /* bits at integer precision for class 0 */
 #define CLASS0_SIZE (1 << CLASS0_BITS)
 #define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
-#if CONFIG_NEW_MULTISYMBOL
 #define MV_BITS_CONTEXTS 6
-#endif
 #define MV_FP_SIZE 4
 
 #define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2)
@@ -76,37 +72,22 @@ typedef enum {
 #define MV_VALS ((MV_MAX << 1) + 1)
 
 #define MV_IN_USE_BITS 14
-#define MV_UPP ((1 << MV_IN_USE_BITS) - 1)
+#define MV_UPP (1 << MV_IN_USE_BITS)
 #define MV_LOW (-(1 << MV_IN_USE_BITS))
 
-extern const aom_tree_index av1_mv_joint_tree[];
-extern const aom_tree_index av1_mv_class_tree[];
-extern const aom_tree_index av1_mv_class0_tree[];
-extern const aom_tree_index av1_mv_fp_tree[];
-
 typedef struct {
-  aom_prob sign;
-  aom_prob classes[MV_CLASSES - 1];
-  aom_cdf_prob class_cdf[CDF_SIZE(MV_CLASSES)];
-  aom_prob class0[CLASS0_SIZE - 1];
-  aom_prob bits[MV_OFFSET_BITS];
-  aom_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
-  aom_prob fp[MV_FP_SIZE - 1];
+  aom_cdf_prob classes_cdf[CDF_SIZE(MV_CLASSES)];
   aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][CDF_SIZE(MV_FP_SIZE)];
   aom_cdf_prob fp_cdf[CDF_SIZE(MV_FP_SIZE)];
-  aom_prob class0_hp;
-  aom_prob hp;
-#if CONFIG_NEW_MULTISYMBOL
+  aom_cdf_prob sign_cdf[CDF_SIZE(2)];
   aom_cdf_prob class0_hp_cdf[CDF_SIZE(2)];
   aom_cdf_prob hp_cdf[CDF_SIZE(2)];
   aom_cdf_prob class0_cdf[CDF_SIZE(CLASS0_SIZE)];
-  aom_cdf_prob bits_cdf[MV_BITS_CONTEXTS][CDF_SIZE(2)];
-#endif
+  aom_cdf_prob bits_cdf[MV_OFFSET_BITS][CDF_SIZE(2)];
 } nmv_component;
 
 typedef struct {
-  aom_prob joints[MV_JOINTS - 1];
-  aom_cdf_prob joint_cdf[CDF_SIZE(MV_JOINTS)];
+  aom_cdf_prob joints_cdf[CDF_SIZE(MV_JOINTS)];
   nmv_component comps[2];
 } nmv_context;
 
@@ -120,33 +101,12 @@ static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
 
 MV_CLASS_TYPE av1_get_mv_class(int z, int *offset);
 
-typedef struct {
-  unsigned int sign[2];
-  unsigned int classes[MV_CLASSES];
-  unsigned int class0[CLASS0_SIZE];
-  unsigned int bits[MV_OFFSET_BITS][2];
-  unsigned int class0_fp[CLASS0_SIZE][MV_FP_SIZE];
-  unsigned int fp[MV_FP_SIZE];
-  unsigned int class0_hp[2];
-  unsigned int hp[2];
-} nmv_component_counts;
-
-typedef struct {
-  unsigned int joints[MV_JOINTS];
-  nmv_component_counts comps[2];
-} nmv_context_counts;
-
 typedef enum {
-#if CONFIG_INTRABC || CONFIG_AMVR
   MV_SUBPEL_NONE = -1,
-#endif
   MV_SUBPEL_LOW_PRECISION = 0,
   MV_SUBPEL_HIGH_PRECISION,
 } MvSubpelPrecision;
 
-void av1_inc_mv(const MV *mv, nmv_context_counts *mvctx,
-                MvSubpelPrecision precision);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
index e8c4003cc..a37ee9f24 100644
--- a/third_party/aom/av1/common/enums.h
+++ b/third_party/aom/av1/common/enums.h
@@ -12,7 +12,8 @@
 #ifndef AV1_COMMON_ENUMS_H_
 #define AV1_COMMON_ENUMS_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_codec.h"
 #include "aom/aom_integer.h"
 
@@ -22,22 +23,8 @@ extern "C" {
 
 #undef MAX_SB_SIZE
 
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-#define TWO_MODE
-#endif
-
-#if CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT
-#define NC_MODE_INFO 1
-#else
-#define NC_MODE_INFO 0
-#endif
-
 // Max superblock size
-#if CONFIG_EXT_PARTITION
 #define MAX_SB_SIZE_LOG2 7
-#else
-#define MAX_SB_SIZE_LOG2 6
-#endif  // CONFIG_EXT_PARTITION
 #define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
 #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
 
@@ -45,11 +32,7 @@ extern "C" {
 #define MIN_SB_SIZE_LOG2 6
 
 // Pixels per Mode Info (MI) unit
-#if CONFIG_CB4X4
 #define MI_SIZE_LOG2 2
-#else
-#define MI_SIZE_LOG2 3
-#endif
 #define MI_SIZE (1 << MI_SIZE_LOG2)
 
 // MI-units per max superblock (MI Block - MIB)
@@ -63,73 +46,78 @@ extern "C" {
 #define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
 
 // Maximum number of tile rows and tile columns
-#if CONFIG_EXT_TILE
-#define MAX_TILE_ROWS 1024
-#define MAX_TILE_COLS 1024
-#else
-#if CONFIG_MAX_TILE
 #define MAX_TILE_ROWS 64
 #define MAX_TILE_COLS 64
-#else
-#define MAX_TILE_ROWS 4
-#define MAX_TILE_COLS 64
-#endif
-#endif  // CONFIG_EXT_TILE
 
-#if CONFIG_VAR_TX
 #define MAX_VARTX_DEPTH 2
-#define SQR_VARTX_DEPTH_INIT 0
-#define RECT_VARTX_DEPTH_INIT 0
-#endif
 
 #define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
+#define MI_SIZE_128X128 (128 >> MI_SIZE_LOG2)
+
+#define MAX_PALETTE_SQUARE (64 * 64)
+// Maximum number of colors in a palette.
+#define PALETTE_MAX_SIZE 8
+// Minimum number of colors in a palette.
+#define PALETTE_MIN_SIZE 2
+
+#define FRAME_OFFSET_BITS 5
+#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
+
+#define REF_FRAMES_LOG2 3
+#define REF_FRAMES (1 << REF_FRAMES_LOG2)
+
+// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
+// in parallel, 3 for scaled references on the encoder.
+// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
+// of framebuffers.
+// TODO(jkoleszar): These 3 extra references could probably come from the
+// normal reference pool.
+#define FRAME_BUFFERS (REF_FRAMES + 7)
 
-#if CONFIG_LOOPFILTER_LEVEL
 // 4 frame filter levels: y plane vertical, y plane horizontal,
 // u plane, and v plane
 #define FRAME_LF_COUNT 4
 #define DEFAULT_DELTA_LF_MULTI 0
-#endif  // CONFIG_LOOPFILTER_LEVEL
-
-#if CONFIG_LPF_SB
-#define LPF_DELTA_BITS 3
-#define LPF_STEP 2
-#define DELTA_RANGE (1 << LPF_DELTA_BITS)
-#define MAX_LPF_OFFSET (LPF_STEP * ((1 << LPF_DELTA_BITS) - 1))
-
-#define LPF_REUSE_CONTEXT 2
-#define LPF_DELTA_CONTEXT DELTA_RANGE
-#define LPF_SIGN_CONTEXT 2
-
-// Half of maximum loop filter length (15-tap)
-#define FILT_BOUNDARY_OFFSET 8
-#define FILT_BOUNDARY_MI_OFFSET (FILT_BOUNDARY_OFFSET >> MI_SIZE_LOG2)
-#endif  // CONFIG_LPF_SB
-
-// Bitstream profiles indicated by 2-3 bits in the uncompressed header.
-// 00: Profile 0.  8-bit 4:2:0 only.
-// 10: Profile 1.  8-bit 4:4:4, 4:2:2, and 4:4:0.
-// 01: Profile 2.  10-bit and 12-bit color only, with 4:2:0 sampling.
-// 110: Profile 3. 10-bit and 12-bit color only, with 4:2:2/4:4:4/4:4:0
-//                 sampling.
-// 111: Undefined profile.
+#define MAX_MODE_LF_DELTAS 2
+
+#define DIST_PRECISION_BITS 4
+#define DIST_PRECISION (1 << DIST_PRECISION_BITS)  // 16
+
+// TODO(chengchen): Temporal flag serve as experimental flag for WIP
+// bitmask construction.
+// Shall be removed when bitmask code is completely checkedin
+#define LOOP_FILTER_BITMASK 0
+
+#define PROFILE_BITS 3
+// The following three profiles are currently defined.
+// Profile 0.  8-bit and 10-bit 4:2:0 and 4:0:0 only.
+// Profile 1.  8-bit and 10-bit 4:4:4
+// Profile 2.  8-bit and 10-bit 4:2:2
+//            12-bit  4:0:0, 4:2:2 and 4:4:4
+// Since we have three bits for the profiles, it can be extended later.
 typedef enum BITSTREAM_PROFILE {
   PROFILE_0,
   PROFILE_1,
   PROFILE_2,
-  PROFILE_3,
-  MAX_PROFILES
+  MAX_PROFILES,
 } BITSTREAM_PROFILE;
 
+#define LEVEL_MAJOR_BITS 3
+#define LEVEL_MINOR_BITS 2
+#define LEVEL_BITS (LEVEL_MAJOR_BITS + LEVEL_MINOR_BITS)
+
+#define LEVEL_MAJOR_MIN 2
+#define LEVEL_MAJOR_MAX ((1 << LEVEL_MAJOR_BITS) - 1 + LEVEL_MAJOR_MIN)
+#define LEVEL_MINOR_MIN 0
+#define LEVEL_MINOR_MAX ((1 << LEVEL_MINOR_BITS) - 1)
+
+#define OP_POINTS_CNT_MINUS_1_BITS 5
+#define OP_POINTS_IDC_BITS 12
+
 // Note: Some enums use the attribute 'packed' to use smallest possible integer
 // type, so that we can save memory when they are used in structs/arrays.
 
 typedef enum ATTRIBUTE_PACKED {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  BLOCK_2X2,
-  BLOCK_2X4,
-  BLOCK_4X2,
-#endif
   BLOCK_4X4,
   BLOCK_4X8,
   BLOCK_8X4,
@@ -143,33 +131,29 @@ typedef enum ATTRIBUTE_PACKED {
   BLOCK_32X64,
   BLOCK_64X32,
   BLOCK_64X64,
-#if CONFIG_EXT_PARTITION
   BLOCK_64X128,
   BLOCK_128X64,
   BLOCK_128X128,
-#endif  // CONFIG_EXT_PARTITION
   BLOCK_4X16,
   BLOCK_16X4,
   BLOCK_8X32,
   BLOCK_32X8,
   BLOCK_16X64,
   BLOCK_64X16,
-#if CONFIG_EXT_PARTITION
-  BLOCK_32X128,
-  BLOCK_128X32,
-#endif  // CONFIG_EXT_PARTITION
   BLOCK_SIZES_ALL,
   BLOCK_SIZES = BLOCK_4X16,
   BLOCK_INVALID = 255,
   BLOCK_LARGEST = (BLOCK_SIZES - 1)
 } BLOCK_SIZE;
 
-typedef enum {
+// 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
+#define SQR_BLOCK_SIZES 6
+
+typedef enum ATTRIBUTE_PACKED {
   PARTITION_NONE,
   PARTITION_HORZ,
   PARTITION_VERT,
   PARTITION_SPLIT,
-#if CONFIG_EXT_PARTITION_TYPES
   PARTITION_HORZ_A,  // HORZ split and the top partition is split again
   PARTITION_HORZ_B,  // HORZ split and the bottom partition is split again
   PARTITION_VERT_A,  // VERT split and the left partition is split again
@@ -177,134 +161,104 @@ typedef enum {
   PARTITION_HORZ_4,  // 4:1 horizontal partition
   PARTITION_VERT_4,  // 4:1 vertical partition
   EXT_PARTITION_TYPES,
-#endif  // CONFIG_EXT_PARTITION_TYPES
   PARTITION_TYPES = PARTITION_SPLIT + 1,
   PARTITION_INVALID = 255
 } PARTITION_TYPE;
 
 typedef char PARTITION_CONTEXT;
 #define PARTITION_PLOFFSET 4  // number of probability models per block size
-#define PARTITION_BLOCK_SIZES (4 + CONFIG_EXT_PARTITION)
-#define PARTITION_CONTEXTS_PRIMARY (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
-#if CONFIG_UNPOISON_PARTITION_CTX
-#define INVALID_PARTITION_CTX (-1)
-#define PARTITION_CONTEXTS \
-  (PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES)
-#else
-#define PARTITION_CONTEXTS PARTITION_CONTEXTS_PRIMARY
-#endif
+#define PARTITION_BLOCK_SIZES 5
+#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
 
 // block transform size
+#if defined(_MSC_VER)
+typedef uint8_t TX_SIZE;
+enum ATTRIBUTE_PACKED {
+#else
 typedef enum ATTRIBUTE_PACKED {
-#if CONFIG_CHROMA_2X2
-  TX_2X2,  // 2x2 transform
 #endif
-  TX_4X4,    // 4x4 transform
-  TX_8X8,    // 8x8 transform
-  TX_16X16,  // 16x16 transform
-  TX_32X32,  // 32x32 transform
-#if CONFIG_TX64X64
-  TX_64X64,  // 64x64 transform
-#endif       // CONFIG_TX64X64
-  TX_4X8,    // 4x8 transform
-  TX_8X4,    // 8x4 transform
-  TX_8X16,   // 8x16 transform
-  TX_16X8,   // 16x8 transform
-  TX_16X32,  // 16x32 transform
-  TX_32X16,  // 32x16 transform
-#if CONFIG_TX64X64
+  TX_4X4,             // 4x4 transform
+  TX_8X8,             // 8x8 transform
+  TX_16X16,           // 16x16 transform
+  TX_32X32,           // 32x32 transform
+  TX_64X64,           // 64x64 transform
+  TX_4X8,             // 4x8 transform
+  TX_8X4,             // 8x4 transform
+  TX_8X16,            // 8x16 transform
+  TX_16X8,            // 16x8 transform
+  TX_16X32,           // 16x32 transform
+  TX_32X16,           // 32x16 transform
   TX_32X64,           // 32x64 transform
   TX_64X32,           // 64x32 transform
-#endif                // CONFIG_TX64X64
   TX_4X16,            // 4x16 transform
   TX_16X4,            // 16x4 transform
   TX_8X32,            // 8x32 transform
   TX_32X8,            // 32x8 transform
+  TX_16X64,           // 16x64 transform
+  TX_64X16,           // 64x16 transform
   TX_SIZES_ALL,       // Includes rectangular transforms
   TX_SIZES = TX_4X8,  // Does NOT include rectangular transforms
-  TX_INVALID = 255    // Invalid transform size
+  TX_SIZES_LARGEST = TX_64X64,
+  TX_INVALID = 255  // Invalid transform size
+#if defined(_MSC_VER)
+};
+#else
 } TX_SIZE;
+#endif
 
 #define TX_SIZE_LUMA_MIN (TX_4X4)
 /* We don't need to code a transform size unless the allowed size is at least
    one more than the minimum. */
 #define TX_SIZE_CTX_MIN (TX_SIZE_LUMA_MIN + 1)
 
-#define MAX_TX_DEPTH (TX_SIZES - TX_SIZE_CTX_MIN)
+// Maximum tx_size categories
+#define MAX_TX_CATS (TX_SIZES - TX_SIZE_CTX_MIN)
+#define MAX_TX_DEPTH 2
 
-#if CONFIG_CTX1D
-#define MAX_HVTX_SIZE (1 << 5)
-#endif  // CONFIG_CTX1D
-
-#define MAX_TX_SIZE_LOG2 (5 + CONFIG_TX64X64)
+#define MAX_TX_SIZE_LOG2 (6)
 #define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
 #define MIN_TX_SIZE_LOG2 2
 #define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2)
 #define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE)
 
+// Pad 4 extra columns to remove horizontal availability check.
+#define TX_PAD_HOR_LOG2 2
+#define TX_PAD_HOR 4
+// Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability
+// check.
+#define TX_PAD_TOP 2
+#define TX_PAD_BOTTOM 4
+#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
+// Pad 16 extra bytes to avoid reading overflow in SIMD optimization.
+#define TX_PAD_END 16
+#define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END)
+
 // Number of maxium size transform blocks in the maximum size superblock
 #define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
 #define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
 
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-typedef enum ATTRIBUTE_PACKED {
-  NCOBMC_MODE_0,
-  NCOBMC_MODE_1,
-  NCOBMC_MODE_2,
-  NCOBMC_MODE_3,
-  NCOBMC_MODE_4,
-  NCOBMC_MODE_5,
-  NCOBMC_MODE_6,
-  NCOBMC_MODE_7,
-  ALL_NCOBMC_MODES,
-#ifdef TWO_MODE
-  MAX_NCOBMC_MODES = NCOBMC_MODE_1 + 1,
-#else
-  MAX_NCOBMC_MODES = ALL_NCOBMC_MODES,
-#endif
-  NO_OVERLAP = MAX_NCOBMC_MODES + 1
-} NCOBMC_MODE;
-
-typedef enum {
-  ADAPT_OVERLAP_BLOCK_8X8,
-  ADAPT_OVERLAP_BLOCK_16X16,
-  ADAPT_OVERLAP_BLOCK_32X32,
-  ADAPT_OVERLAP_BLOCK_64X64,
-  ADAPT_OVERLAP_BLOCKS,
-  ADAPT_OVERLAP_BLOCK_INVALID = 255
-} ADAPT_OVERLAP_BLOCK;
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-
 // frame transform mode
-typedef enum {
-  ONLY_4X4,     // only 4x4 transform used
-  ALLOW_8X8,    // allow block transform size up to 8x8
-  ALLOW_16X16,  // allow block transform size up to 16x16
-  ALLOW_32X32,  // allow block transform size up to 32x32
-#if CONFIG_TX64X64
-  ALLOW_64X64,  // allow block transform size up to 64x64
-#endif
-  TX_MODE_SELECT,  // transform specified for each block
+typedef enum ATTRIBUTE_PACKED {
+  ONLY_4X4,         // use only 4x4 transform
+  TX_MODE_LARGEST,  // transform size is the largest possible for pu size
+  TX_MODE_SELECT,   // transform specified for each block
   TX_MODES,
 } TX_MODE;
 
 // 1D tx types
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   DCT_1D,
   ADST_1D,
   FLIPADST_1D,
   IDTX_1D,
-  // TODO(sarahparker) need to eventually put something here for the
-  // mrc experiment to make this work with the ext-tx pruning functions
   TX_TYPES_1D,
 } TX_TYPE_1D;
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   DCT_DCT,    // DCT  in both horizontal and vertical
   ADST_DCT,   // ADST in vertical, DCT in horizontal
   DCT_ADST,   // DCT  in vertical, ADST in horizontal
   ADST_ADST,  // ADST in both directions
-#if CONFIG_EXT_TX
   FLIPADST_DCT,
   DCT_FLIPADST,
   FLIPADST_FLIPADST,
@@ -317,25 +271,26 @@ typedef enum {
   H_ADST,
   V_FLIPADST,
   H_FLIPADST,
-#endif  // CONFIG_EXT_TX
-#if CONFIG_MRC_TX
-  MRC_DCT,  // DCT in both directions with mrc based bitmask
-#endif      // CONFIG_MRC_TX
   TX_TYPES,
 } TX_TYPE;
 
-#if CONFIG_EXT_TX
 typedef enum {
+  REG_REG,
+  REG_SMOOTH,
+  REG_SHARP,
+  SMOOTH_REG,
+  SMOOTH_SMOOTH,
+  SMOOTH_SHARP,
+  SHARP_REG,
+  SHARP_SMOOTH,
+  SHARP_SHARP,
+} DUAL_FILTER_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
   // DCT only
   EXT_TX_SET_DCTONLY,
   // DCT + Identity only
   EXT_TX_SET_DCT_IDTX,
-#if CONFIG_MRC_TX
-  // DCT + MRC_DCT
-  EXT_TX_SET_MRC_DCT,
-  // DCT + MRC_DCT + IDTX
-  EXT_TX_SET_MRC_DCT_IDTX,
-#endif  // CONFIG_MRC_TX
   // Discrete Trig transforms w/o flip (4) + Identity (1)
   EXT_TX_SET_DTT4_IDTX,
   // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2)
@@ -348,45 +303,13 @@ typedef enum {
 } TxSetType;
 
 #define IS_2D_TRANSFORM(tx_type) (tx_type < IDTX)
-#else
-#define IS_2D_TRANSFORM(tx_type) 1
-#endif
 
-typedef enum {
-  TILE_LEFT_BOUNDARY = 1,
-  TILE_RIGHT_BOUNDARY = 2,
-  TILE_ABOVE_BOUNDARY = 4,
-  TILE_BOTTOM_BOUNDARY = 8,
-  FRAME_LEFT_BOUNDARY = 16,
-  FRAME_RIGHT_BOUNDARY = 32,
-  FRAME_ABOVE_BOUNDARY = 64,
-  FRAME_BOTTOM_BOUNDARY = 128,
-} BOUNDARY_TYPE;
-
-#if CONFIG_EXT_TX
-#if CONFIG_CHROMA_2X2
-#define EXT_TX_SIZES 5  // number of sizes that use extended transforms
-#else
-#define EXT_TX_SIZES 4  // number of sizes that use extended transforms
-#endif                  // CONFIG_CHROMA_2X2
-#if CONFIG_MRC_TX
-#define EXT_TX_SETS_INTER 5  // Sets of transform selections for INTER
-#define EXT_TX_SETS_INTRA 4  // Sets of transform selections for INTRA
-#else                        // CONFIG_MRC_TX
+#define EXT_TX_SIZES 4       // number of sizes that use extended transforms
 #define EXT_TX_SETS_INTER 4  // Sets of transform selections for INTER
 #define EXT_TX_SETS_INTRA 3  // Sets of transform selections for INTRA
-#endif                       // CONFIG_MRC_TX
-#else
-#if CONFIG_CHROMA_2X2
-#define EXT_TX_SIZES 4  // number of sizes that use extended transforms
-#else
-#define EXT_TX_SIZES 3  // number of sizes that use extended transforms
-#endif
-#endif  // CONFIG_EXT_TX
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   AOM_LAST_FLAG = 1 << 0,
-#if CONFIG_EXT_REFS
   AOM_LAST2_FLAG = 1 << 1,
   AOM_LAST3_FLAG = 1 << 2,
   AOM_GOLD_FLAG = 1 << 3,
@@ -394,43 +317,45 @@ typedef enum {
   AOM_ALT2_FLAG = 1 << 5,
   AOM_ALT_FLAG = 1 << 6,
   AOM_REFFRAME_ALL = (1 << 7) - 1
-#else   // !CONFIG_EXT_REFS
-  AOM_GOLD_FLAG = 1 << 1,
-  AOM_ALT_FLAG = 1 << 2,
-  AOM_REFFRAME_ALL = (1 << 3) - 1
-#endif  // CONFIG_EXT_REFS
 } AOM_REFFRAME;
 
-#if CONFIG_EXT_COMP_REFS
-#define USE_UNI_COMP_REFS 1
-
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   UNIDIR_COMP_REFERENCE,
   BIDIR_COMP_REFERENCE,
   COMP_REFERENCE_TYPES,
 } COMP_REFERENCE_TYPE;
-#else  // !CONFIG_EXT_COMP_REFS
-#define USE_UNI_COMP_REFS 0
-#endif  // CONFIG_EXT_COMP_REFS
 
-typedef enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } PLANE_TYPE;
+typedef enum ATTRIBUTE_PACKED {
+  PLANE_TYPE_Y,
+  PLANE_TYPE_UV,
+  PLANE_TYPES
+} PLANE_TYPE;
 
-#if CONFIG_CFL
 #define CFL_ALPHABET_SIZE_LOG2 4
 #define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
 #define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1)
 #define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2)
 #define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1))
 
-typedef enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } CFL_PRED_TYPE;
+typedef enum ATTRIBUTE_PACKED {
+  CFL_PRED_U,
+  CFL_PRED_V,
+  CFL_PRED_PLANES
+} CFL_PRED_TYPE;
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   CFL_SIGN_ZERO,
   CFL_SIGN_NEG,
   CFL_SIGN_POS,
   CFL_SIGNS
 } CFL_SIGN_TYPE;
 
+typedef enum ATTRIBUTE_PACKED {
+  CFL_DISALLOWED,
+  CFL_ALLOWED,
+  CFL_ALLOWED_TYPES
+} CFL_ALLOWED_TYPE;
+
 // CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid
 #define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1)
 // CFL_SIGN_U is equivalent to (js + 1) / 3 for js in 0 to 8
@@ -445,17 +370,13 @@ typedef enum {
 // Also, the contexts are symmetric under swapping the planes.
 #define CFL_CONTEXT_V(js) \
   (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
-#endif
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   PALETTE_MAP,
-#if CONFIG_MRC_TX
-  MRC_MAP,
-#endif  // CONFIG_MRC_TX
   COLOR_MAP_TYPES,
 } COLOR_MAP_TYPE;
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   TWO_COLORS,
   THREE_COLORS,
   FOUR_COLORS,
@@ -466,7 +387,7 @@ typedef enum {
   PALETTE_SIZES
 } PALETTE_SIZE;
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   PALETTE_COLOR_ONE,
   PALETTE_COLOR_TWO,
   PALETTE_COLOR_THREE,
@@ -478,36 +399,26 @@ typedef enum {
   PALETTE_COLORS
 } PALETTE_COLOR;
 
-// Note: All directional predictors must be between V_PRED and D63_PRED (both
+// Note: All directional predictors must be between V_PRED and D67_PRED (both
 // inclusive).
 typedef enum ATTRIBUTE_PACKED {
-  DC_PRED,      // Average of above and left pixels
-  V_PRED,       // Vertical
-  H_PRED,       // Horizontal
-  D45_PRED,     // Directional 45  deg = round(arctan(1/1) * 180/pi)
-  D135_PRED,    // Directional 135 deg = 180 - 45
-  D117_PRED,    // Directional 117 deg = 180 - 63
-  D153_PRED,    // Directional 153 deg = 180 - 27
-  D207_PRED,    // Directional 207 deg = 180 + 27
-  D63_PRED,     // Directional 63  deg = round(arctan(2/1) * 180/pi)
-  SMOOTH_PRED,  // Combination of horizontal and vertical interpolation
-#if CONFIG_SMOOTH_HV
+  DC_PRED,        // Average of above and left pixels
+  V_PRED,         // Vertical
+  H_PRED,         // Horizontal
+  D45_PRED,       // Directional 45  degree
+  D135_PRED,      // Directional 135 degree
+  D113_PRED,      // Directional 113 degree
+  D157_PRED,      // Directional 157 degree
+  D203_PRED,      // Directional 203 degree
+  D67_PRED,       // Directional 67  degree
+  SMOOTH_PRED,    // Combination of horizontal and vertical interpolation
   SMOOTH_V_PRED,  // Vertical interpolation
   SMOOTH_H_PRED,  // Horizontal interpolation
-#endif            // CONFIG_SMOOTH_HV
-  TM_PRED,        // True-motion
+  PAETH_PRED,     // Predict from the direction of smallest gradient
   NEARESTMV,
   NEARMV,
-  ZEROMV,
+  GLOBALMV,
   NEWMV,
-#if CONFIG_COMPOUND_SINGLEREF
-  // Single ref compound modes
-  SR_NEAREST_NEARMV,
-  // SR_NEAREST_NEWMV,
-  SR_NEAR_NEWMV,
-  SR_ZERO_NEWMV,
-  SR_NEW_NEWMV,
-#endif  // CONFIG_COMPOUND_SINGLEREF
   // Compound ref compound modes
   NEAREST_NEARESTMV,
   NEAR_NEARMV,
@@ -515,175 +426,131 @@ typedef enum ATTRIBUTE_PACKED {
   NEW_NEARESTMV,
   NEAR_NEWMV,
   NEW_NEARMV,
-  ZERO_ZEROMV,
+  GLOBAL_GLOBALMV,
   NEW_NEWMV,
   MB_MODE_COUNT,
-  INTRA_MODES = TM_PRED + 1,     // TM_PRED has to be the last intra mode.
+  INTRA_MODE_START = DC_PRED,
+  INTRA_MODE_END = NEARESTMV,
+  INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START,
+  SINGLE_INTER_MODE_START = NEARESTMV,
+  SINGLE_INTER_MODE_END = NEAREST_NEARESTMV,
+  SINGLE_INTER_MODE_NUM = SINGLE_INTER_MODE_END - SINGLE_INTER_MODE_START,
+  COMP_INTER_MODE_START = NEAREST_NEARESTMV,
+  COMP_INTER_MODE_END = MB_MODE_COUNT,
+  COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START,
+  INTRA_MODES = PAETH_PRED + 1,  // PAETH_PRED has to be the last intra mode.
   INTRA_INVALID = MB_MODE_COUNT  // For uv_mode in inter blocks
 } PREDICTION_MODE;
 
-#if CONFIG_CFL
 // TODO(ltrudeau) Do we really want to pack this?
 // TODO(ltrudeau) Do we match with PREDICTION_MODE?
 typedef enum ATTRIBUTE_PACKED {
-  UV_DC_PRED,      // Average of above and left pixels
-  UV_V_PRED,       // Vertical
-  UV_H_PRED,       // Horizontal
-  UV_D45_PRED,     // Directional 45  deg = round(arctan(1/1) * 180/pi)
-  UV_D135_PRED,    // Directional 135 deg = 180 - 45
-  UV_D117_PRED,    // Directional 117 deg = 180 - 63
-  UV_D153_PRED,    // Directional 153 deg = 180 - 27
-  UV_D207_PRED,    // Directional 207 deg = 180 + 27
-  UV_D63_PRED,     // Directional 63  deg = round(arctan(2/1) * 180/pi)
-  UV_SMOOTH_PRED,  // Combination of horizontal and vertical interpolation
-#if CONFIG_SMOOTH_HV
+  UV_DC_PRED,        // Average of above and left pixels
+  UV_V_PRED,         // Vertical
+  UV_H_PRED,         // Horizontal
+  UV_D45_PRED,       // Directional 45  degree
+  UV_D135_PRED,      // Directional 135 degree
+  UV_D113_PRED,      // Directional 113 degree
+  UV_D157_PRED,      // Directional 157 degree
+  UV_D203_PRED,      // Directional 203 degree
+  UV_D67_PRED,       // Directional 67  degree
+  UV_SMOOTH_PRED,    // Combination of horizontal and vertical interpolation
   UV_SMOOTH_V_PRED,  // Vertical interpolation
   UV_SMOOTH_H_PRED,  // Horizontal interpolation
-#endif               // CONFIG_SMOOTH_HV
-  UV_TM_PRED,        // True-motion
+  UV_PAETH_PRED,     // Predict from the direction of smallest gradient
   UV_CFL_PRED,       // Chroma-from-Luma
   UV_INTRA_MODES,
   UV_MODE_INVALID,  // For uv_mode in inter blocks
 } UV_PREDICTION_MODE;
-#else
-#define UV_INTRA_MODES (INTRA_MODES)
-#define UV_PREDICTION_MODE PREDICTION_MODE
-#define UV_DC_PRED (DC_PRED)
-#define UV_MODE_INVALID (INTRA_INVALID)
-#endif  // CONFIG_CFL
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   SIMPLE_TRANSLATION,
-#if CONFIG_MOTION_VAR
-  OBMC_CAUSAL,  // 2-sided OBMC
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  NCOBMC_ADAPT_WEIGHT,
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR
-#if CONFIG_WARPED_MOTION
+  OBMC_CAUSAL,    // 2-sided OBMC
   WARPED_CAUSAL,  // 2-sided WARPED
-#endif            // CONFIG_WARPED_MOTION
   MOTION_MODES
-#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
-  ,
-  OBMC_FAMILY_MODES = NCOBMC_ADAPT_WEIGHT + 1
-#endif
 } MOTION_MODE;
 
-#if CONFIG_INTERINTRA
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   II_DC_PRED,
   II_V_PRED,
   II_H_PRED,
   II_SMOOTH_PRED,
   INTERINTRA_MODES
 } INTERINTRA_MODE;
-#endif
 
 typedef enum {
   COMPOUND_AVERAGE,
-#if CONFIG_WEDGE
   COMPOUND_WEDGE,
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-  COMPOUND_SEG,
-#endif  // CONFIG_COMPOUND_SEGMENT
+  COMPOUND_DIFFWTD,
   COMPOUND_TYPES,
 } COMPOUND_TYPE;
 
-// TODO(huisu): Consider adding FILTER_SMOOTH_PRED to "FILTER_INTRA_MODE".
-#if CONFIG_FILTER_INTRA
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   FILTER_DC_PRED,
   FILTER_V_PRED,
   FILTER_H_PRED,
-  FILTER_D45_PRED,
-  FILTER_D135_PRED,
-  FILTER_D117_PRED,
-  FILTER_D153_PRED,
-  FILTER_D207_PRED,
-  FILTER_D63_PRED,
-  FILTER_TM_PRED,
+  FILTER_D157_PRED,
+  FILTER_PAETH_PRED,
   FILTER_INTRA_MODES,
 } FILTER_INTRA_MODE;
-#endif  // CONFIG_FILTER_INTRA
 
-#if CONFIG_EXT_INTRA
 #define DIRECTIONAL_MODES 8
-#endif  // CONFIG_EXT_INTRA
+#define MAX_ANGLE_DELTA 3
+#define ANGLE_STEP 3
 
 #define INTER_MODES (1 + NEWMV - NEARESTMV)
 
-#if CONFIG_COMPOUND_SINGLEREF
-#define INTER_SINGLEREF_COMP_MODES (1 + SR_NEW_NEWMV - SR_NEAREST_NEARMV)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
 #define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV)
 
 #define SKIP_CONTEXTS 3
+#define SKIP_MODE_CONTEXTS 3
+
+#define COMP_INDEX_CONTEXTS 6
+#define COMP_GROUP_IDX_CONTEXTS 6
 
 #define NMV_CONTEXTS 3
 
-#define NEWMV_MODE_CONTEXTS 7
-#define ZEROMV_MODE_CONTEXTS 2
-#define REFMV_MODE_CONTEXTS 9
-#define DRL_MODE_CONTEXTS 5
+#define NEWMV_MODE_CONTEXTS 6
+#define GLOBALMV_MODE_CONTEXTS 2
+#define REFMV_MODE_CONTEXTS 6
+#define DRL_MODE_CONTEXTS 3
 
-#define ZEROMV_OFFSET 3
+#define GLOBALMV_OFFSET 3
 #define REFMV_OFFSET 4
 
-#define NEWMV_CTX_MASK ((1 << ZEROMV_OFFSET) - 1)
-#define ZEROMV_CTX_MASK ((1 << (REFMV_OFFSET - ZEROMV_OFFSET)) - 1)
+#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1)
+#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1)
 #define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
 
-#define ALL_ZERO_FLAG_OFFSET 8
-#define SKIP_NEARESTMV_OFFSET 9
-#define SKIP_NEARMV_OFFSET 10
-#define SKIP_NEARESTMV_SUB8X8_OFFSET 11
+#define COMP_NEWMV_CTXS 5
+#define INTER_MODE_CONTEXTS 8
 
-#define INTER_MODE_CONTEXTS 7
 #define DELTA_Q_SMALL 3
 #define DELTA_Q_PROBS (DELTA_Q_SMALL)
 #define DEFAULT_DELTA_Q_RES 4
-#if CONFIG_EXT_DELTA_Q
 #define DELTA_LF_SMALL 3
 #define DELTA_LF_PROBS (DELTA_LF_SMALL)
 #define DEFAULT_DELTA_LF_RES 2
-#endif
 
 /* Segment Feature Masks */
 #define MAX_MV_REF_CANDIDATES 2
 
-#define MAX_REF_MV_STACK_SIZE 16
-#if CONFIG_EXT_PARTITION
+#define MAX_REF_MV_STACK_SIZE 8
 #define REF_CAT_LEVEL 640
-#else
-#define REF_CAT_LEVEL 255
-#endif  // CONFIG_EXT_PARTITION
 
 #define INTRA_INTER_CONTEXTS 4
 #define COMP_INTER_CONTEXTS 5
-#define REF_CONTEXTS 5
+#define REF_CONTEXTS 3
 
-#if CONFIG_EXT_COMP_REFS
 #define COMP_REF_TYPE_CONTEXTS 5
 #define UNI_COMP_REF_CONTEXTS 3
-#endif  // CONFIG_EXT_COMP_REFS
-
-#if CONFIG_COMPOUND_SINGLEREF
-#define COMP_INTER_MODE_CONTEXTS 4
-#endif  // CONFIG_COMPOUND_SINGLEREF
 
-#if CONFIG_VAR_TX
-#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 2)
+#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3)
 typedef uint8_t TXFM_CONTEXT;
-#endif
 
 #define NONE_FRAME -1
 #define INTRA_FRAME 0
 #define LAST_FRAME 1
-
-#if CONFIG_EXT_REFS
 #define LAST2_FRAME 2
 #define LAST3_FRAME 3
 #define GOLDEN_FRAME 4
@@ -691,94 +558,55 @@ typedef uint8_t TXFM_CONTEXT;
 #define ALTREF2_FRAME 6
 #define ALTREF_FRAME 7
 #define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
-#else  // !CONFIG_EXT_REFS
-#define GOLDEN_FRAME 2
-#define ALTREF_FRAME 3
-#endif  // CONFIG_EXT_REFS
 
 #define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
-#define TOTAL_REFS_PER_FRAME (ALTREF_FRAME - INTRA_FRAME + 1)
 
 #define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1)
 #define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
-#if CONFIG_EXT_REFS
 #define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1)
 #define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
-#else
-#define BWD_REFS 1
-#define BWD_RF_OFFSET(ref) (ref - ALTREF_FRAME)
-#endif  // CONFIG_EXT_REFS
 
 #define SINGLE_REFS (FWD_REFS + BWD_REFS)
-#if CONFIG_EXT_COMP_REFS
-typedef enum {
-  LAST_LAST2_FRAMES,     // { LAST_FRAME, LAST2_FRAME }
-  LAST_LAST3_FRAMES,     // { LAST_FRAME, LAST3_FRAME }
-  LAST_GOLDEN_FRAMES,    // { LAST_FRAME, GOLDEN_FRAME }
-  BWDREF_ALTREF_FRAMES,  // { BWDREF_FRAME, ALTREF_FRAME }
-  UNIDIR_COMP_REFS
+
+typedef enum ATTRIBUTE_PACKED {
+  LAST_LAST2_FRAMES,      // { LAST_FRAME, LAST2_FRAME }
+  LAST_LAST3_FRAMES,      // { LAST_FRAME, LAST3_FRAME }
+  LAST_GOLDEN_FRAMES,     // { LAST_FRAME, GOLDEN_FRAME }
+  BWDREF_ALTREF_FRAMES,   // { BWDREF_FRAME, ALTREF_FRAME }
+  LAST2_LAST3_FRAMES,     // { LAST2_FRAME, LAST3_FRAME }
+  LAST2_GOLDEN_FRAMES,    // { LAST2_FRAME, GOLDEN_FRAME }
+  LAST3_GOLDEN_FRAMES,    // { LAST3_FRAME, GOLDEN_FRAME }
+  BWDREF_ALTREF2_FRAMES,  // { BWDREF_FRAME, ALTREF2_FRAME }
+  ALTREF2_ALTREF_FRAMES,  // { ALTREF2_FRAME, ALTREF_FRAME }
+  TOTAL_UNIDIR_COMP_REFS,
+  // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs
+  //       that are explicitly signaled.
+  UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1,
 } UNIDIR_COMP_REF;
-#define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS)
-#else  // !CONFIG_EXT_COMP_REFS
-#define COMP_REFS (FWD_REFS * BWD_REFS)
-#endif  // CONFIG_EXT_COMP_REFS
 
-#define MODE_CTX_REF_FRAMES (TOTAL_REFS_PER_FRAME + COMP_REFS)
+#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS)
 
-#if CONFIG_SUPERTX
-#define PARTITION_SUPERTX_CONTEXTS 2
-#define MAX_SUPERTX_BLOCK_SIZE BLOCK_32X32
-#endif  // CONFIG_SUPERTX
+#define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS)
 
-#if CONFIG_LOOP_RESTORATION
-typedef enum {
+// NOTE: A limited number of unidirectional reference pairs can be signalled for
+//       compound prediction. The use of skip mode, on the other hand, makes it
+//       possible to have a reference pair not listed for explicit signaling.
+#define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS)
+
+typedef enum ATTRIBUTE_PACKED {
   RESTORE_NONE,
   RESTORE_WIENER,
   RESTORE_SGRPROJ,
   RESTORE_SWITCHABLE,
   RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
-  RESTORE_TYPES,
+  RESTORE_TYPES = 4,
 } RestorationType;
-#endif  // CONFIG_LOOP_RESTORATION
 
-#if CONFIG_FRAME_SUPERRES
 #define SUPERRES_SCALE_BITS 3
-#define SUPERRES_SCALE_DENOMINATOR_MIN 8
-#endif  // CONFIG_FRAME_SUPERRES
-
-#if CONFIG_LPF_DIRECT
-typedef enum {
-  VERT_HORZ,
-  DEGREE_30,
-  DEGREE_45,
-  DEGREE_60,
-  DEGREE_120,
-  DEGREE_135,
-  DEGREE_150,
-  FILTER_DEGREES,
-} FILTER_DEGREE;
-#endif  // CONFIG_LPF_DIRECT
-
-#if CONFIG_OBU
-// R19
-typedef enum {
-  OBU_SEQUENCE_HEADER = 1,
-  OBU_TD = 2,
-  OBU_FRAME_HEADER = 3,
-  OBU_TILE_GROUP = 4,
-  OBU_METADATA = 5,
-  OBU_PADDING = 15,
-} OBU_TYPE;
-#endif
+#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
 
-#if CONFIG_LGT_FROM_PRED
-#define LGT_SIZES 2
-// Note: at least one of LGT_FROM_PRED_INTRA and LGT_FROM_PRED_INTER must be 1
-#define LGT_FROM_PRED_INTRA 1
-#define LGT_FROM_PRED_INTER 1
-// LGT_SL_INTRA: LGTs with a mode-dependent first self-loop and a break point
-#define LGT_SL_INTRA 0
-#endif  // CONFIG_LGT_FROM_PRED
+// In large_scale_tile coding, external references are used.
+#define MAX_EXTERNAL_REFERENCES 128
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/common/filter.c b/third_party/aom/av1/common/filter.c
index 135132316..a7e67ea4a 100644
--- a/third_party/aom/av1/common/filter.c
+++ b/third_party/aom/av1/common/filter.c
@@ -25,153 +25,6 @@ DECLARE_ALIGNED(256, static const InterpKernel,
   { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
 };
 
-#if USE_TEMPORALFILTER_12TAP
-DECLARE_ALIGNED(16, static const int16_t,
-                sub_pel_filters_temporalfilter_12[SUBPEL_SHIFTS][12]) = {
-  // intfilt 0.8
-  { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 },
-  { 0, 1, -1, 3, -7, 127, 8, -4, 2, -1, 0, 0 },
-  { 0, 1, -3, 5, -12, 124, 18, -8, 4, -2, 1, 0 },
-  { -1, 2, -4, 8, -17, 120, 28, -11, 6, -3, 1, -1 },
-  { -1, 2, -4, 10, -21, 114, 38, -15, 8, -4, 2, -1 },
-  { -1, 3, -5, 11, -23, 107, 49, -18, 9, -5, 2, -1 },
-  { -1, 3, -6, 12, -25, 99, 60, -21, 11, -6, 3, -1 },
-  { -1, 3, -6, 12, -25, 90, 70, -23, 12, -6, 3, -1 },
-  { -1, 3, -6, 12, -24, 80, 80, -24, 12, -6, 3, -1 },
-  { -1, 3, -6, 12, -23, 70, 90, -25, 12, -6, 3, -1 },
-  { -1, 3, -6, 11, -21, 60, 99, -25, 12, -6, 3, -1 },
-  { -1, 2, -5, 9, -18, 49, 107, -23, 11, -5, 3, -1 },
-  { -1, 2, -4, 8, -15, 38, 114, -21, 10, -4, 2, -1 },
-  { -1, 1, -3, 6, -11, 28, 120, -17, 8, -4, 2, -1 },
-  { 0, 1, -2, 4, -8, 18, 124, -12, 5, -3, 1, 0 },
-  { 0, 0, -1, 2, -4, 8, 127, -7, 3, -1, 1, 0 },
-};
-#endif  // USE_TEMPORALFILTER_12TAP
-
-#if USE_EXTRA_FILTER
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },      { 0, 2, -6, 126, 8, -2, 0, 0 },
-  { 0, 2, -10, 122, 18, -4, 0, 0 },  { 0, 2, -12, 116, 28, -8, 2, 0 },
-  { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
-  { 0, 2, -16, 94, 58, -12, 2, 0 },  { 0, 2, -14, 84, 66, -12, 2, 0 },
-  { 0, 2, -14, 76, 76, -14, 2, 0 },  { 0, 2, -12, 66, 84, -14, 2, 0 },
-  { 0, 2, -12, 58, 94, -16, 2, 0 },  { 0, 2, -12, 48, 102, -14, 2, 0 },
-  { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
-  { 0, 0, -4, 18, 122, -10, 2, 0 },  { 0, 0, -2, 8, 126, -6, 2, 0 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_regular_uv[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },      { 0, 2, -6, 126, 8, -2, 0, 0 },
-  { 0, 2, -10, 122, 18, -4, 0, 0 },  { 0, 2, -12, 116, 28, -8, 2, 0 },
-  { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
-  { 0, 2, -16, 94, 58, -12, 2, 0 },  { 0, 2, -14, 84, 66, -12, 2, 0 },
-  { 0, 2, -14, 76, 76, -14, 2, 0 },  { 0, 2, -12, 66, 84, -14, 2, 0 },
-  { 0, 2, -12, 58, 94, -16, 2, 0 },  { 0, 2, -12, 48, 102, -14, 2, 0 },
-  { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
-  { 0, 0, -4, 18, 122, -10, 2, 0 },  { 0, 0, -2, 8, 126, -6, 2, 0 }
-};
-
-#if USE_12TAP_FILTER
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
-  // intfilt 0.8
-  { 0, 0, 0, 128, 0, 0, 0, 0 },         { -1, 2, -6, 127, 9, -4, 2, -1 },
-  { -2, 5, -12, 124, 18, -7, 4, -2 },   { -2, 7, -16, 119, 28, -11, 5, -2 },
-  { -3, 8, -19, 114, 38, -14, 7, -3 },  { -3, 9, -22, 107, 49, -17, 8, -3 },
-  { -4, 10, -23, 99, 60, -20, 10, -4 }, { -4, 11, -23, 90, 70, -22, 10, -4 },
-  { -4, 11, -23, 80, 80, -23, 11, -4 }, { -4, 10, -22, 70, 90, -23, 11, -4 },
-  { -4, 10, -20, 60, 99, -23, 10, -4 }, { -3, 8, -17, 49, 107, -22, 9, -3 },
-  { -3, 7, -14, 38, 114, -19, 8, -3 },  { -2, 5, -11, 28, 119, -16, 7, -2 },
-  { -2, 4, -7, 18, 124, -12, 5, -2 },   { -1, 2, -4, 9, 127, -6, 2, -1 },
-};
-
-DECLARE_ALIGNED(256, static const int16_t,
-                sub_pel_filters_10sharp[SUBPEL_SHIFTS][12]) = {
-  // intfilt 0.85
-  { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 },
-  { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 },
-  { 0, 1, -3, 6, -13, 124, 18, -8, 4, -2, 1, 0 },
-  { 0, 2, -4, 8, -18, 120, 28, -12, 6, -4, 2, 0 },
-  { 0, 2, -5, 10, -21, 114, 38, -15, 8, -5, 2, 0 },
-  { 0, 3, -6, 11, -24, 107, 49, -19, 10, -6, 3, 0 },
-  { 0, 3, -7, 12, -25, 99, 59, -21, 11, -6, 3, 0 },
-  { 0, 3, -7, 12, -25, 90, 70, -23, 12, -7, 3, 0 },
-  { 0, 3, -7, 12, -25, 81, 81, -25, 12, -7, 3, 0 },
-  { 0, 3, -7, 12, -23, 70, 90, -25, 12, -7, 3, 0 },
-  { 0, 3, -6, 11, -21, 59, 99, -25, 12, -7, 3, 0 },
-  { 0, 3, -6, 10, -19, 49, 107, -24, 11, -6, 3, 0 },
-  { 0, 2, -5, 8, -15, 38, 114, -21, 10, -5, 2, 0 },
-  { 0, 2, -4, 6, -12, 28, 120, -18, 8, -4, 2, 0 },
-  { 0, 1, -2, 4, -8, 18, 124, -13, 6, -3, 1, 0 },
-  { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 },
-};
-#else
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },         { -2, 2, -6, 126, 8, -2, 2, 0 },
-  { -2, 6, -12, 124, 16, -6, 4, -2 },   { -2, 8, -18, 120, 26, -10, 6, -2 },
-  { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
-  { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
-  { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
-  { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
-  { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
-  { -2, 4, -6, 16, 124, -12, 6, -2 },   { 0, 2, -2, 8, 126, -6, 2, -2 }
-};
-#endif
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8smooth2[SUBPEL_SHIFTS]) = {
-  // freqmultiplier = 0.2
-  { 0, 0, 0, 128, 0, 0, 0, 0 },   { 0, 9, 30, 44, 32, 11, 2, 0 },
-  { 0, 8, 28, 44, 34, 12, 2, 0 }, { 0, 7, 27, 44, 35, 13, 2, 0 },
-  { 0, 6, 26, 43, 37, 14, 2, 0 }, { 0, 5, 24, 43, 38, 16, 2, 0 },
-  { 0, 5, 23, 42, 38, 17, 3, 0 }, { 0, 4, 21, 41, 40, 19, 3, 0 },
-  { 0, 4, 20, 40, 40, 20, 4, 0 }, { 0, 3, 19, 40, 41, 21, 4, 0 },
-  { 0, 3, 17, 38, 42, 23, 5, 0 }, { 0, 2, 16, 38, 43, 24, 5, 0 },
-  { 0, 2, 14, 37, 43, 26, 6, 0 }, { 0, 2, 13, 35, 44, 27, 7, 0 },
-  { 0, 2, 12, 34, 44, 28, 8, 0 }, { 0, 2, 11, 32, 44, 30, 9, 0 },
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_smooth2_uv[SUBPEL_SHIFTS]) = {
-  // freqmultiplier = 0.2
-  { 0, 0, 0, 128, 0, 0, 0, 0 },   { 0, 9, 30, 44, 32, 11, 2, 0 },
-  { 0, 8, 28, 44, 34, 12, 2, 0 }, { 0, 7, 27, 44, 35, 13, 2, 0 },
-  { 0, 6, 26, 43, 37, 14, 2, 0 }, { 0, 5, 24, 43, 38, 16, 2, 0 },
-  { 0, 5, 23, 42, 38, 17, 3, 0 }, { 0, 4, 21, 41, 40, 19, 3, 0 },
-  { 0, 4, 20, 40, 40, 20, 4, 0 }, { 0, 3, 19, 40, 41, 21, 4, 0 },
-  { 0, 3, 17, 38, 42, 23, 5, 0 }, { 0, 2, 16, 38, 43, 24, 5, 0 },
-  { 0, 2, 14, 37, 43, 26, 6, 0 }, { 0, 2, 13, 35, 44, 27, 7, 0 },
-  { 0, 2, 12, 34, 44, 28, 8, 0 }, { 0, 2, 11, 32, 44, 30, 9, 0 },
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 2, 28, 62, 34, 2, 0, 0 },
-  { 0, 0, 26, 62, 36, 4, 0, 0 },    { 0, 0, 22, 62, 40, 4, 0, 0 },
-  { 0, 0, 20, 60, 42, 6, 0, 0 },    { 0, 0, 18, 58, 44, 8, 0, 0 },
-  { 0, 0, 16, 56, 46, 10, 0, 0 },   { 0, -2, 16, 54, 48, 12, 0, 0 },
-  { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
-  { 0, 0, 10, 46, 56, 16, 0, 0 },   { 0, 0, 8, 44, 58, 18, 0, 0 },
-  { 0, 0, 6, 42, 60, 20, 0, 0 },    { 0, 0, 4, 40, 62, 22, 0, 0 },
-  { 0, 0, 4, 36, 62, 26, 0, 0 },    { 0, 0, 2, 34, 62, 28, 2, 0 }
-};
-
-DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_smooth_uv[SUBPEL_SHIFTS]) = {
-  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 2, 28, 62, 34, 2, 0, 0 },
-  { 0, 0, 26, 62, 36, 4, 0, 0 },    { 0, 0, 22, 62, 40, 4, 0, 0 },
-  { 0, 0, 20, 60, 42, 6, 0, 0 },    { 0, 0, 18, 58, 44, 8, 0, 0 },
-  { 0, 0, 16, 56, 46, 10, 0, 0 },   { 0, -2, 16, 54, 48, 12, 0, 0 },
-  { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
-  { 0, 0, 10, 46, 56, 16, 0, 0 },   { 0, 0, 8, 44, 58, 18, 0, 0 },
-  { 0, 0, 6, 42, 60, 20, 0, 0 },    { 0, 0, 4, 40, 62, 22, 0, 0 },
-  { 0, 0, 4, 36, 62, 26, 0, 0 },    { 0, 0, 2, 34, 62, 28, 2, 0 }
-};
-#else   // USE_EXTRA_FILTER
-
 DECLARE_ALIGNED(256, static const InterpKernel,
                 sub_pel_filters_8[SUBPEL_SHIFTS]) = {
   { 0, 0, 0, 128, 0, 0, 0, 0 },      { 0, 2, -6, 126, 8, -2, 0, 0 },
@@ -207,49 +60,7 @@ DECLARE_ALIGNED(256, static const InterpKernel,
   { 0, 0, 6, 42, 60, 20, 0, 0 },    { 0, 0, 4, 40, 62, 22, 0, 0 },
   { 0, 0, 4, 36, 62, 26, 0, 0 },    { 0, 0, 2, 34, 62, 28, 2, 0 }
 };
-#endif  // USE_EXTRA_FILTER
 
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-const InterpKernel *av1_intra_filter_kernels[INTRA_FILTERS] = {
-  bilinear_filters,         // INTRA_FILTER_LINEAR
-  sub_pel_filters_8,        // INTRA_FILTER_8TAP
-  sub_pel_filters_8sharp,   // INTRA_FILTER_8TAP_SHARP
-  sub_pel_filters_8smooth,  // INTRA_FILTER_8TAP_SMOOTH
-};
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-
-#if USE_EXTRA_FILTER
-static const InterpFilterParams
-    av1_interp_filter_params_list[SWITCHABLE_FILTERS + EXTRA_FILTERS] = {
-      { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        EIGHTTAP_REGULAR },
-      { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        EIGHTTAP_SMOOTH },
-#if USE_12TAP_FILTER
-      { (const int16_t *)sub_pel_filters_10sharp, 12, SUBPEL_SHIFTS,
-        MULTITAP_SHARP },
-#else
-      { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        EIGHTTAP_SHARP },
-#endif
-      { (const int16_t *)sub_pel_filters_8smooth2, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        EIGHTTAP_SMOOTH2 },
-      { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        BILINEAR },
-      { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        EIGHTTAP_SHARP },
-      { (const int16_t *)sub_pel_filters_regular_uv, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        FILTER_REGULAR_UV },
-      { (const int16_t *)sub_pel_filters_smooth_uv, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        FILTER_SMOOTH_UV },
-      { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        FILTER_SHARP_UV },
-      { (const int16_t *)sub_pel_filters_smooth2_uv, SUBPEL_TAPS, SUBPEL_SHIFTS,
-        FILTER_SMOOTH2_UV },
-    };
-#else
 static const InterpFilterParams
     av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
       { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
@@ -261,62 +72,49 @@ static const InterpFilterParams
       { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
         BILINEAR }
     };
-#endif  // USE_EXTRA_FILTER
 
-#if USE_TEMPORALFILTER_12TAP
-static const InterpFilterParams av1_interp_temporalfilter_12tap = {
-  (const int16_t *)sub_pel_filters_temporalfilter_12, 12, SUBPEL_SHIFTS,
-  TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_4[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 0, -4, 126, 8, -2, 0, 0 },
+  { 0, 0, -8, 122, 18, -4, 0, 0 },  { 0, 0, -10, 116, 28, -6, 0, 0 },
+  { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
+  { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
+  { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
+  { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
+  { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
+  { 0, 0, -4, 18, 122, -8, 0, 0 },  { 0, 0, -2, 8, 126, -4, 0, 0 }
 };
-#endif  // USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128, 0, 0, 0, 0 },   { 0, 0, 30, 62, 34, 2, 0, 0 },
+  { 0, 0, 26, 62, 36, 4, 0, 0 },  { 0, 0, 22, 62, 40, 4, 0, 0 },
+  { 0, 0, 20, 60, 42, 6, 0, 0 },  { 0, 0, 18, 58, 44, 8, 0, 0 },
+  { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
+  { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
+  { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+  { 0, 0, 6, 42, 60, 20, 0, 0 },  { 0, 0, 4, 40, 62, 22, 0, 0 },
+  { 0, 0, 4, 36, 62, 26, 0, 0 },  { 0, 0, 2, 34, 62, 30, 0, 0 }
+};
+
+static const InterpFilterParams av1_interp_4tap[2] = {
+  { (const int16_t *)sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+    EIGHTTAP_REGULAR },
+  { (const int16_t *)sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+    EIGHTTAP_SMOOTH },
+};
+
+InterpFilterParams av1_get_interp_filter_params_with_block_size(
+    const InterpFilter interp_filter, const int w) {
+  if (w <= 4 &&
+      (interp_filter == MULTITAP_SHARP || interp_filter == EIGHTTAP_REGULAR))
+    return av1_interp_4tap[0];
+  else if (w <= 4 && interp_filter == EIGHTTAP_SMOOTH)
+    return av1_interp_4tap[1];
 
-InterpFilterParams av1_get_interp_filter_params(
-    const InterpFilter interp_filter) {
-#if USE_TEMPORALFILTER_12TAP
-  if (interp_filter == TEMPORALFILTER_12TAP)
-    return av1_interp_temporalfilter_12tap;
-#endif  // USE_TEMPORALFILTER_12TAP
   return av1_interp_filter_params_list[interp_filter];
 }
 
 const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter) {
-#if USE_TEMPORALFILTER_12TAP
-  if (interp_filter == TEMPORALFILTER_12TAP)
-    return av1_interp_temporalfilter_12tap.filter_ptr;
-#endif  // USE_TEMPORALFILTER_12TAP
   return (const int16_t *)av1_interp_filter_params_list[interp_filter]
       .filter_ptr;
 }
-
-#if CONFIG_DUAL_FILTER
-InterpFilter av1_get_plane_interp_filter(InterpFilter interp_filter,
-                                         int plane) {
-#if USE_TEMPORALFILTER_12TAP
-#if USE_EXTRA_FILTER
-  assert(interp_filter <= EIGHTTAP_SHARP ||
-         interp_filter == TEMPORALFILTER_12TAP);
-#else   // USE_EXTRA_FILTER
-  assert(interp_filter <= SWITCHABLE_FILTERS ||
-         interp_filter == TEMPORALFILTER_12TAP);
-#endif  // USE_EXTRA_FILTER
-#else
-  assert(interp_filter <= EIGHTTAP_SHARP);
-#endif
-#if USE_EXTRA_FILTER
-  if (plane == 0) {
-    return interp_filter;
-  } else {
-    switch (interp_filter) {
-      case EIGHTTAP_REGULAR: return FILTER_REGULAR_UV;
-      case EIGHTTAP_SMOOTH: return FILTER_SMOOTH_UV;
-      case MULTITAP_SHARP: return FILTER_SHARP_UV;
-      case EIGHTTAP_SMOOTH2: return FILTER_SMOOTH2_UV;
-      default: return interp_filter;
-    }
-  }
-#else   // USE_EXTRA_FILTER
-  (void)plane;
-  return interp_filter;
-#endif  // USE_EXTRA_FILTER
-}
-#endif
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
index 343e87560..0c24ad9d0 100644
--- a/third_party/aom/av1/common/filter.h
+++ b/third_party/aom/av1/common/filter.h
@@ -14,7 +14,8 @@
 
 #include <assert.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_ports/mem.h"
@@ -23,34 +24,17 @@
 extern "C" {
 #endif
 
-#define USE_TEMPORALFILTER_12TAP 1
-#define MAX_FILTER_TAP 12
-
-#define USE_12TAP_FILTER 0
-#define USE_EXTRA_FILTER 0
+#define MAX_FILTER_TAP 8
 
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
   EIGHTTAP_REGULAR,
   EIGHTTAP_SMOOTH,
-#if USE_EXTRA_FILTER
-  EIGHTTAP_SMOOTH2,
-#endif  // USE_EXTRA_FILTER
   MULTITAP_SHARP,
   BILINEAR,
-#if USE_EXTRA_FILTER
-  EIGHTTAP_SHARP,
-  FILTER_REGULAR_UV,
-  FILTER_SMOOTH_UV,
-  FILTER_SHARP_UV,
-  FILTER_SMOOTH2_UV,
-#endif  // USE_EXTRA_FILTER
   INTERP_FILTERS_ALL,
   SWITCHABLE_FILTERS = BILINEAR,
   SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
   EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
-#if USE_TEMPORALFILTER_12TAP
-  TEMPORALFILTER_12TAP = SWITCHABLE_FILTERS + EXTRA_FILTERS,
-#endif
 } InterpFilter;
 
 // With CONFIG_DUAL_FILTER, pack two InterpFilter's into a uint32_t: since
@@ -59,73 +43,34 @@ typedef enum {
 // setting a (pair of) filters.
 //
 // Without CONFIG_DUAL_FILTER,
-#if CONFIG_DUAL_FILTER
 typedef uint32_t InterpFilters;
 static INLINE InterpFilter av1_extract_interp_filter(InterpFilters filters,
                                                      int x_filter) {
-  return (InterpFilter)((filters >> (x_filter ? 16 : 0)) & 0xffff);
+  return (InterpFilter)((filters >> (x_filter ? 16 : 0)) & 0xf);
 }
 
 static INLINE InterpFilters av1_make_interp_filters(InterpFilter y_filter,
                                                     InterpFilter x_filter) {
-  uint16_t y16 = y_filter & 0xffff;
-  uint16_t x16 = x_filter & 0xffff;
+  uint16_t y16 = y_filter & 0xf;
+  uint16_t x16 = x_filter & 0xf;
   return y16 | ((uint32_t)x16 << 16);
 }
 
 static INLINE InterpFilters av1_broadcast_interp_filter(InterpFilter filter) {
   return av1_make_interp_filters(filter, filter);
 }
-#else
-typedef InterpFilter InterpFilters;
-static INLINE InterpFilter av1_extract_interp_filter(InterpFilters filters,
-                                                     int x_filter) {
-#ifdef NDEBUG
-  (void)x_filter;
-#endif
-  assert(!x_filter);
-  return filters;
-}
-
-static INLINE InterpFilters av1_broadcast_interp_filter(InterpFilter filter) {
-  return filter;
-}
-#endif
 
 static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
   return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter;
 }
 
-#if USE_EXTRA_FILTER
-#define LOG_SWITCHABLE_FILTERS \
-  3 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
-#else
 #define LOG_SWITCHABLE_FILTERS \
   2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
-#endif
 
-#if CONFIG_DUAL_FILTER
 #define MAX_SUBPEL_TAPS 12
 #define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
 #define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
 #define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
-#else  // CONFIG_DUAL_FILTER
-#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
-#endif  // CONFIG_DUAL_FILTER
-
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-typedef enum {
-  INTRA_FILTER_LINEAR,
-  INTRA_FILTER_8TAP,
-  INTRA_FILTER_8TAP_SHARP,
-  INTRA_FILTER_8TAP_SMOOTH,
-  INTRA_FILTERS,
-} INTRA_FILTER;
-
-extern const InterpKernel *av1_intra_filter_kernels[INTRA_FILTERS];
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
 
 typedef struct InterpFilterParams {
   const int16_t *filter_ptr;
@@ -134,26 +79,16 @@ typedef struct InterpFilterParams {
   InterpFilter interp_filter;
 } InterpFilterParams;
 
-InterpFilterParams av1_get_interp_filter_params(
-    const InterpFilter interp_filter);
-
 const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter);
 
+InterpFilterParams av1_get_interp_filter_params_with_block_size(
+    const InterpFilter interp_filter, const int w);
+
 static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
     const InterpFilterParams filter_params, const int subpel) {
   return filter_params.filter_ptr + filter_params.taps * subpel;
 }
 
-static INLINE int av1_is_interpolating_filter(
-    const InterpFilter interp_filter) {
-  const InterpFilterParams ip = av1_get_interp_filter_params(interp_filter);
-  return (ip.filter_ptr[ip.taps / 2 - 1] == 128);
-}
-
-#if CONFIG_DUAL_FILTER
-InterpFilter av1_get_plane_interp_filter(InterpFilter interp_filter, int plane);
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c
index 0b6b78e3d..502ccd27d 100644
--- a/third_party/aom/av1/common/frame_buffers.c
+++ b/third_party/aom/av1/common/frame_buffers.c
@@ -75,5 +75,6 @@ int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) {
   InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
   (void)cb_priv;
   if (int_fb) int_fb->in_use = 0;
+  fb->priv = NULL;
   return 0;
 }
diff --git a/third_party/aom/av1/common/generic_code.c b/third_party/aom/av1/common/generic_code.c
deleted file mode 100644
index 7285decc9..000000000
--- a/third_party/aom/av1/common/generic_code.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include "generic_code.h"
-
-void aom_cdf_init_q15_1D(uint16_t *cdf, int nsyms, int cdf_size) {
-  int i;
-  for (i = 0; i < nsyms; i++)
-    cdf[i] = AOM_ICDF((i + 1)*CDF_PROB_TOP/nsyms);
-
-  cdf[cdf_size - 1] = 0;
-}
-
-/** Adapts a Q15 cdf after encoding/decoding a symbol. */
-void aom_cdf_adapt_q15(int val, uint16_t *cdf, int n, int *count, int rate) {
-  int i;
-  *count = OD_MINI(*count + 1, 1 << rate);
-  OD_ASSERT(AOM_ICDF(cdf[n - 1]) == 32768);
-  if (*count >= 1 << rate) {
-    /* Steady-state adaptation based on a simple IIR with dyadic rate. */
-    for (i = 0; i < n; i++) {
-      int tmp;
-      /* When (i < val), we want the adjustment ((cdf[i] - tmp) >> rate) to be
-         positive so long as (cdf[i] > i + 1), and 0 when (cdf[i] == i + 1),
-         to ensure we don't drive any probabilities to 0. Replacing cdf[i] with
-         (i + 2) and solving ((i + 2 - tmp) >> rate == 1) for tmp produces
-         tmp == i + 2 - (1 << rate). Using this value of tmp with
-         cdf[i] == i + 1 instead gives an adjustment of 0 as desired.
-
-         When (i >= val), we want ((cdf[i] - tmp) >> rate) to be negative so
-         long as cdf[i] < 32768 - (n - 1 - i), and 0 when
-         cdf[i] == 32768 - (n - 1 - i), again to ensure we don't drive any
-         probabilities to 0. Since right-shifting any negative value is still
-         negative, we can solve (32768 - (n - 1 - i) - tmp == 0) for tmp,
-         producing tmp = 32769 - n + i. Using this value of tmp with smaller
-         values of cdf[i] instead gives negative adjustments, as desired.
-
-         Combining the two cases gives the expression below. These could be
-         stored in a lookup table indexed by n and rate to avoid the
-         arithmetic. */
-      tmp = 2 - (1<<rate) + i + (32767 + (1<<rate) - n)*(i >= val);
-      cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i]) - ((AOM_ICDF(cdf[i]) - tmp) >> rate));
-    }
-  }
-  else {
-    int alpha;
-    /* Initial adaptation for the first symbols. The adaptation rate is
-       computed to be equivalent to what od_{en,de}code_cdf_adapt() does
-       when the initial cdf is set to increment/4. */
-    alpha = 4*32768/(n + 4**count);
-    for (i = 0; i < n; i++) {
-      int tmp;
-      tmp = (32768 - n)*(i >= val) + i + 1;
-      cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i])
-          - (((AOM_ICDF(cdf[i]) - tmp)*alpha) >> 15));
-    }
-  }
-  OD_ASSERT(AOM_ICDF(cdf[n - 1]) == 32768);
-}
-
-/** Takes the base-2 log of E(x) in Q1.
- *
- * @param [in] ExQ16 expectation of x in Q16
- *
- * @retval 2*log2(ExQ16/2^16)
- */
-int log_ex(int ex_q16) {
-  int lg;
-  int lg_q1;
-  int odd;
-  lg = OD_ILOG(ex_q16);
-  if (lg < 15) {
-    odd = ex_q16*ex_q16 > 2 << 2*lg;
-  }
-  else {
-    int tmp;
-    tmp = ex_q16 >> (lg - 8);
-    odd = tmp*tmp > (1 << 15);
-  }
-  lg_q1 = OD_MAXI(0, 2*lg - 33 + odd);
-  return lg_q1;
-}
-
-/** Updates the probability model based on the encoded/decoded value
- *
- * @param [in,out] model generic prob model
- * @param [in,out] ExQ16 expectation of x
- * @param [in]     x     variable encoded/decoded (used for ExQ16)
- * @param [in]     xs    variable x after shift (used for the model)
- * @param [in]     id    id of the icdf to adapt
- * @param [in]     integration integration period of ExQ16 (leaky average over
- * 1<<integration samples)
- */
-void generic_model_update(int *ex_q16, int x, int integration) {
-  /* We could have saturated ExQ16 directly, but this is safe and simpler */
-  x = OD_MINI(x, 32767);
-  OD_IIR_DIADIC(*ex_q16, x << 16, integration);
-}
diff --git a/third_party/aom/av1/common/generic_code.h b/third_party/aom/av1/common/generic_code.h
deleted file mode 100644
index e1620ee8e..000000000
--- a/third_party/aom/av1/common/generic_code.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#if !defined(_generic_code_H)
-# define _generic_code_H
-
-# include "aom_dsp/bitreader.h"
-# include "aom_dsp/bitwriter.h"
-
-# define GENERIC_TABLES 12
-
-#define generic_decode(r, model, ex_q16, integration, ACCT_STR_NAME) \
-  generic_decode_(r, model, ex_q16, integration ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_decode_cdf_adapt_q15(r, cdf, n, count, rate, ACCT_STR_NAME) \
-  aom_decode_cdf_adapt_q15_(r, cdf, n, count, rate ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_decode_cdf_adapt(r, cdf, n, increment, ACCT_STR_NAME) \
-  aom_decode_cdf_adapt_(r, cdf, n, increment ACCT_STR_ARG(ACCT_STR_NAME))
-
-typedef struct {
-  /** cdf for multiple expectations of x */
-  uint16_t cdf[GENERIC_TABLES][CDF_SIZE(16)];
-} generic_encoder;
-
-#define OD_IIR_DIADIC(y, x, shift) ((y) += ((x) - (y)) >> (shift))
-
-void generic_model_init(generic_encoder *model);
-
-/* Initialize a CDF for use by aom_write_symbol_pvq()/aom_read_symbol_pvq().
-   This is used for CDFs whose size might not match the declared array size.
-   The only real requirement is that the first value of every CDF be zero.
-   Then aom_cdf_init_q15_1D() will be called with the real size the first time
-   the CDF is used. */
-#define OD_CDFS_INIT_DYNAMIC(cdf) (memset(cdf, 0, sizeof(cdf)))
-
-// WARNING: DO NOT USE this init function,
-// if the size of cdf is different from what is declared by code.
-#define OD_CDFS_INIT_Q15(cdfs) \
-  { int n_cdfs = sizeof(cdfs)/sizeof(cdfs[0]); \
-    int cdf_size = sizeof(cdfs[0])/sizeof(cdfs[0][0]); \
-    int nsyms = cdf_size - 1; \
-    int i_; \
-    for (i_ = 0; i_ < n_cdfs; i_++) \
-      aom_cdf_init_q15_1D(cdfs[i_], nsyms, cdf_size); \
-  }
-
-void aom_cdf_init(uint16_t *cdf, int ncdfs, int nsyms, int val, int first);
-
-void aom_cdf_init_q15_1D(uint16_t *cdf, int nsyms, int cdf_size);
-
-void aom_cdf_adapt_q15(int val, uint16_t *cdf, int n, int *count, int rate);
-
-void aom_encode_cdf_adapt_q15(aom_writer *w, int val, uint16_t *cdf, int n,
- int *count, int rate);
-
-void generic_encode(aom_writer *w, generic_encoder *model, int x,
- int *ex_q16, int integration);
-double generic_encode_cost(generic_encoder *model, int x, int *ex_q16);
-
-double od_encode_cdf_cost(int val, uint16_t *cdf, int n);
-
-int aom_decode_cdf_adapt_q15_(aom_reader *r, uint16_t *cdf, int n,
- int *count, int rate ACCT_STR_PARAM);
-
-int generic_decode_(aom_reader *r, generic_encoder *model,
- int *ex_q16, int integration ACCT_STR_PARAM);
-
-int log_ex(int ex_q16);
-
-void generic_model_update(int *ex_q16, int x, int integration);
-
-#endif
diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c
index 53c2ba1f0..bc758eb57 100644
--- a/third_party/aom/av1/common/idct.c
+++ b/third_party/aom/av1/common/idct.c
@@ -11,2624 +11,34 @@
 
 #include <math.h>
 
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
-#include "aom_dsp/inv_txfm.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom_ports/mem.h"
 #include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
 #include "av1/common/blockd.h"
 #include "av1/common/enums.h"
 #include "av1/common/idct.h"
-#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
-    CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
-#include "av1/common/daala_tx.h"
-#endif
 
 int av1_get_tx_scale(const TX_SIZE tx_size) {
   const int pels = tx_size_2d[tx_size];
-  return (pels > 256) + (pels > 1024) + (pels > 4096);
+  // Largest possible pels is 4096 (64x64).
+  return (pels > 256) + (pels > 1024);
 }
 
 // NOTE: The implementation of all inverses need to be aware of the fact
 // that input and output could be the same buffer.
 
-#if CONFIG_EXT_TX
-static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 4; ++i) {
-    output[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
-  }
-}
-
-static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 8; ++i) {
-    output[i] = input[i] * 2;
-  }
-}
-
-static void iidtx16_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 2 * Sqrt2);
-  }
-}
-
-static void iidtx32_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 32; ++i) {
-    output[i] = input[i] * 4;
-  }
-}
-
-#if CONFIG_TX64X64 && !CONFIG_DAALA_DCT64
-static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 64; ++i) {
-    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
-  }
-}
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
-
-// For use in lieu of ADST
-static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[16];
-  // Multiply input by sqrt(2)
-  for (i = 0; i < 16; ++i) {
-    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
-  }
-  for (i = 0; i < 16; ++i) {
-    output[i] = input[16 + i] * 4;
-  }
-  aom_idct16_c(inputhalf, output + 16);
-  // Note overall scaling factor is 4 times orthogonal
-}
-
-#if CONFIG_TX64X64 && !CONFIG_DAALA_DCT64
-static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
-  int32_t in[64], out[64];
-  int i;
-  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
-  av1_idct64_new(in, out, inv_cos_bit_col_dct_64, inv_stage_range_col_dct_64);
-  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
-}
-
-static void idct64_row_c(const tran_low_t *input, tran_low_t *output) {
-  int32_t in[64], out[64];
-  int i;
-  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
-  av1_idct64_new(in, out, inv_cos_bit_row_dct_64, inv_stage_range_row_dct_64);
-  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
-}
-
-// For use in lieu of ADST
-static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[32];
-  // Multiply input by sqrt(2)
-  for (i = 0; i < 32; ++i) {
-    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
-  }
-  for (i = 0; i < 32; ++i) {
-    output[i] = (tran_low_t)dct_const_round_shift(input[32 + i] * 4 * Sqrt2);
-  }
-  aom_idct32_c(inputhalf, output + 32);
-  // Note overall scaling factor is 4 * sqrt(2)  times orthogonal
-}
-#endif  // CONFIG_TX64X64
-
-// Inverse identity transform and add.
-#if CONFIG_EXT_TX
-static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                           int bsx, int bsy, TX_TYPE tx_type) {
-  int r, c;
-  const int pels = bsx * bsy;
-  const int shift = 3 - ((pels > 256) + (pels > 1024));
-  if (tx_type == IDTX) {
-    for (r = 0; r < bsy; ++r) {
-      for (c = 0; c < bsx; ++c)
-        dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
-      dest += stride;
-      input += bsx;
-    }
-  }
-}
-#endif  // CONFIG_EXT_TX
-
-#define FLIPUD_PTR(dest, stride, size)       \
-  do {                                       \
-    (dest) = (dest) + ((size)-1) * (stride); \
-    (stride) = -(stride);                    \
-  } while (0)
-
-#if CONFIG_EXT_TX
-static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src,
-                               int *sstride, TX_TYPE tx_type, int sizey,
-                               int sizex) {
-  // Note that the transpose of src will be added to dst. In order to LR
-  // flip the addends (in dst coordinates), we UD flip the src. To UD flip
-  // the addends, we UD flip the dst.
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-    case IDTX:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST: break;
-    case FLIPADST_DCT:
-    case FLIPADST_ADST:
-    case V_FLIPADST:
-      // flip UD
-      FLIPUD_PTR(*dst, *dstride, sizey);
-      break;
-    case DCT_FLIPADST:
-    case ADST_FLIPADST:
-    case H_FLIPADST:
-      // flip LR
-      FLIPUD_PTR(*src, *sstride, sizex);
-      break;
-    case FLIPADST_FLIPADST:
-      // flip UD
-      FLIPUD_PTR(*dst, *dstride, sizey);
-      // flip LR
-      FLIPUD_PTR(*src, *sstride, sizex);
-      break;
-    default: assert(0); break;
-  }
-}
-#endif  // CONFIG_EXT_TX
-
-#if CONFIG_HIGHBITDEPTH
-#if CONFIG_EXT_TX && CONFIG_TX64X64
-static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
-                                  int stride, int bsx, int bsy, TX_TYPE tx_type,
-                                  int bd) {
-  int r, c;
-  const int pels = bsx * bsy;
-  const int shift = 3 - ((pels > 256) + (pels > 1024));
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  if (tx_type == IDTX) {
-    for (r = 0; r < bsy; ++r) {
-      for (c = 0; c < bsx; ++c)
-        dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
-      dest += stride;
-      input += bsx;
-    }
-  }
-}
-#endif  // CONFIG_EXT_TX && CONFIG_TX64X64
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_LGT || CONFIG_LGT_FROM_PRED
-void ilgt4(const tran_low_t *input, tran_low_t *output,
-           const tran_high_t *lgtmtx) {
-  if (!lgtmtx) assert(0);
-#if CONFIG_LGT_FROM_PRED
-  // For DCT/ADST, use butterfly implementations
-  if (lgtmtx[0] == DCT4) {
-    aom_idct4_c(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST4) {
-    aom_iadst4_c(input, output);
-    return;
-  }
-#endif  // CONFIG_LGT_FROM_PRED
-
-  // evaluate s[j] = sum of all lgtmtx[j]*input[i] over i=1,...,4
-  tran_high_t s[4] = { 0 };
-  for (int i = 0; i < 4; ++i)
-    for (int j = 0; j < 4; ++j) s[j] += lgtmtx[i * 4 + j] * input[i];
-
-  for (int i = 0; i < 4; ++i) output[i] = WRAPLOW(dct_const_round_shift(s[i]));
-}
-
-void ilgt8(const tran_low_t *input, tran_low_t *output,
-           const tran_high_t *lgtmtx) {
-  if (!lgtmtx) assert(0);
-#if CONFIG_LGT_FROM_PRED
-  // For DCT/ADST, use butterfly implementations
-  if (lgtmtx[0] == DCT8) {
-    aom_idct8_c(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST8) {
-    aom_iadst8_c(input, output);
-    return;
-  }
-#endif  // CONFIG_LGT_FROM_PRED
-
-  // evaluate s[j] = sum of all lgtmtx[j]*input[i] over i=1,...,8
-  tran_high_t s[8] = { 0 };
-  for (int i = 0; i < 8; ++i)
-    for (int j = 0; j < 8; ++j) s[j] += lgtmtx[i * 8 + j] * input[i];
-
-  for (int i = 0; i < 8; ++i) output[i] = WRAPLOW(dct_const_round_shift(s[i]));
-}
-#endif  // CONFIG_LGT || CONFIG_LGT_FROM_PRED
-
-#if CONFIG_LGT
-// get_lgt4 and get_lgt8 return 1 and pick a lgt matrix if LGT is chosen to
-// apply. Otherwise they return 0
-int get_lgt4(const TxfmParam *txfm_param, int is_col,
-             const tran_high_t **lgtmtx) {
-  if (is_col && (vtx_tab[txfm_param->tx_type] == ADST_1D ||
-                 vtx_tab[txfm_param->tx_type] == FLIPADST_1D)) {
-    lgtmtx[0] = txfm_param->is_inter ? &lgt4_170[0][0] : &lgt4_140[0][0];
-    return 1;
-  } else if (!is_col && (htx_tab[txfm_param->tx_type] == ADST_1D ||
-                         htx_tab[txfm_param->tx_type] == FLIPADST_1D)) {
-    lgtmtx[0] = txfm_param->is_inter ? &lgt4_170[0][0] : &lgt4_140[0][0];
-    return 1;
-  }
-  lgtmtx[0] = NULL;
-  return 0;
-}
-
-int get_lgt8(const TxfmParam *txfm_param, int is_col,
-             const tran_high_t **lgtmtx) {
-  if (is_col && (vtx_tab[txfm_param->tx_type] == ADST_1D ||
-                 vtx_tab[txfm_param->tx_type] == FLIPADST_1D)) {
-    lgtmtx[0] = txfm_param->is_inter ? &lgt8_170[0][0] : &lgt8_150[0][0];
-    return 1;
-  } else if (!is_col && (htx_tab[txfm_param->tx_type] == ADST_1D ||
-                         htx_tab[txfm_param->tx_type] == FLIPADST_1D)) {
-    lgtmtx[0] = txfm_param->is_inter ? &lgt8_170[0][0] : &lgt8_150[0][0];
-    return 1;
-  }
-  lgtmtx[0] = NULL;
-  return 0;
-}
-#endif  // CONFIG_LGT
-
-#if CONFIG_LGT_FROM_PRED
-void ilgt16up(const tran_low_t *input, tran_low_t *output,
-              const tran_high_t *lgtmtx) {
-  if (lgtmtx[0] == DCT16) {
-    aom_idct16_c(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST16) {
-    aom_iadst16_c(input, output);
-    return;
-  } else if (lgtmtx[0] == DCT32) {
-    aom_idct32_c(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST32) {
-    ihalfright32_c(input, output);
-    return;
-  } else {
-    assert(0);
-  }
-}
-
-void get_discontinuity_1d(uint8_t *arr, int n, int *idx_max_diff) {
-  *idx_max_diff = -1;
-
-  int temp = 0, max_diff = 0, min_diff = INT_MAX;
-  for (int i = 1; i < n; ++i) {
-    temp = abs(arr[i] - arr[i - 1]);
-    if (temp > max_diff) {
-      max_diff = temp;
-      *idx_max_diff = i;
-    }
-    if (temp < min_diff) min_diff = temp;
-  }
-}
-
-void get_discontinuity_2d(uint8_t *dst, int stride, int n, int is_col,
-                          int *idx_max_diff, int ntx) {
-  *idx_max_diff = -1;
-
-  int diff = 0, temp = 0, max_diff = 0, min_diff = INT_MAX;
-  for (int i = 1; i < n; ++i) {
-    temp = 0;
-    for (int j = 0; j < ntx; ++j) {
-      if (is_col)  // vertical diff
-        diff = dst[i * stride + j] - dst[(i - 1) * stride + j];
-      else  // horizontal diff
-        diff = dst[j * stride + i] - dst[j * stride + i - 1];
-      temp += diff * diff;
-    }
-    // temp/w is the i-th avg square diff
-    if (temp > max_diff) {
-      max_diff = temp;
-      *idx_max_diff = i;
-    }
-    if (temp < min_diff) min_diff = temp;
-  }
-}
-
-int idx_selfloop_wrt_mode(PREDICTION_MODE mode, int is_col) {
-  // 0: no self-loop
-  // 1: small self-loop
-  // 2: medium self-loop
-  // 3: large self-loop
-  switch (mode) {
-    case DC_PRED:
-    case SMOOTH_PRED:
-      // predition is good for both directions: large SLs for row and col
-      return 3;
-    case TM_PRED: return 0;
-#if CONFIG_SMOOTH_HV
-    case SMOOTH_H_PRED:
-#endif
-    case H_PRED:
-      // prediction is good for H direction: large SL for row only
-      return is_col ? 0 : 3;
-#if CONFIG_SMOOTH_HV
-    case SMOOTH_V_PRED:
-#endif
-    case V_PRED:
-      // prediction is good for V direction: large SL for col only
-      return is_col ? 3 : 0;
-#if LGT_SL_INTRA
-    // directional mode: choose SL based on the direction
-    case D45_PRED: return is_col ? 2 : 0;
-    case D63_PRED: return is_col ? 3 : 0;
-    case D117_PRED: return is_col ? 3 : 1;
-    case D135_PRED: return 2;
-    case D153_PRED: return is_col ? 1 : 3;
-    case D207_PRED: return is_col ? 0 : 3;
-#else
-    case D45_PRED:
-    case D63_PRED:
-    case D117_PRED: return is_col ? 3 : 0;
-    case D135_PRED:
-    case D153_PRED:
-    case D207_PRED: return is_col ? 0 : 3;
-#endif
-    // inter: no SL
-    default: return 0;
-  }
-}
-
-void get_lgt4_from_pred(const TxfmParam *txfm_param, int is_col,
-                        const tran_high_t **lgtmtx, int ntx) {
-  PREDICTION_MODE mode = txfm_param->mode;
-  int stride = txfm_param->stride;
-  uint8_t *dst = txfm_param->dst;
-  int bp = -1;
-  uint8_t arr[4];
-
-  // Each lgt4mtx_arr[k][i] corresponds to a line graph with a self-loop on
-  // the first node, and possibly a weak edge within the line graph. i is
-  // the index of the weak edge (between the i-th and (i+1)-th pixels, i=0
-  // means no weak edge). k corresponds to the first self-loop's weight
-  const tran_high_t *lgt4mtx_arr[4][4] = {
-    { &lgt4_000[0][0], &lgt4_000w1[0][0], &lgt4_000w2[0][0],
-      &lgt4_000w3[0][0] },
-    { &lgt4_060[0][0], &lgt4_060_000w1[0][0], &lgt4_060_000w2[0][0],
-      &lgt4_060_000w3[0][0] },
-    { &lgt4_100[0][0], &lgt4_100_000w1[0][0], &lgt4_100_000w2[0][0],
-      &lgt4_100_000w3[0][0] },
-    { &lgt4_150[0][0], &lgt4_150_000w1[0][0], &lgt4_150_000w2[0][0],
-      &lgt4_150_000w3[0][0] },
-  };
-
-  // initialize to DCT or some LGTs, and then change later if necessary
-  int idx_sl = idx_selfloop_wrt_mode(mode, is_col);
-  lgtmtx[0] = lgt4mtx_arr[idx_sl][0];
-
-  // find the break point and replace the line graph by the one with a
-  // break point
-  if (mode == DC_PRED || mode == SMOOTH_PRED) {
-    // Do not use break point, since 1) is_left_available and is_top_available
-    // in DC_PRED are not known by txfm_param for now, so accessing
-    // both boundaries anyway may cause a mismatch 2) DC prediciton
-    // typically yields very smooth residues so having the break point
-    // does not usually improve the RD result.
-    return;
-  } else if (mode == TM_PRED) {
-    // TM_PRED: use both 1D top boundary and 1D left boundary
-    if (is_col)
-      for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride];
-    else
-      for (int i = 0; i < 4; ++i) arr[i] = dst[i];
-    get_discontinuity_1d(&arr[0], 4, &bp);
-  } else if (mode == V_PRED) {
-    // V_PRED: use 1D top boundary only
-    if (is_col) return;
-    for (int i = 0; i < 4; ++i) arr[i] = dst[i];
-    get_discontinuity_1d(&arr[0], 4, &bp);
-  } else if (mode == H_PRED) {
-    // H_PRED: use 1D left boundary only
-    if (!is_col) return;
-    for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride];
-    get_discontinuity_1d(&arr[0], 4, &bp);
-#if CONFIG_SMOOTH_HV
-  } else if (mode == SMOOTH_V_PRED) {
-    if (is_col) return;
-    for (int i = 0; i < 4; ++i) arr[i] = dst[-stride + i];
-    get_discontinuity_1d(&arr[0], 4, &bp);
-  } else if (mode == SMOOTH_H_PRED) {
-    if (!is_col) return;
-    for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride - 1];
-    get_discontinuity_1d(&arr[0], 4, &bp);
-#endif
-  } else if (mode == D45_PRED || mode == D63_PRED || mode == D117_PRED) {
-    // directional modes closer to vertical (maybe include D135 later)
-    if (!is_col) get_discontinuity_2d(dst, stride, 4, 0, &bp, ntx);
-  } else if (mode == D135_PRED || mode == D153_PRED || mode == D207_PRED) {
-    // directional modes closer to horizontal
-    if (is_col) get_discontinuity_2d(dst, stride, 4, 1, &bp, ntx);
-  } else if (mode > TM_PRED) {
-    // inter
-    get_discontinuity_2d(dst, stride, 4, is_col, &bp, ntx);
-  }
-
-#if LGT_SL_INTRA
-  if (bp != -1) lgtmtx[0] = lgt4mtx_arr[idx_sl][bp];
-#else
-  if (bp != -1) lgtmtx[0] = lgt4mtx_arr[0][bp];
-#endif
-}
-
-void get_lgt8_from_pred(const TxfmParam *txfm_param, int is_col,
-                        const tran_high_t **lgtmtx, int ntx) {
-  PREDICTION_MODE mode = txfm_param->mode;
-  int stride = txfm_param->stride;
-  uint8_t *dst = txfm_param->dst;
-  int bp = -1;
-  uint8_t arr[8];
-
-  const tran_high_t *lgt8mtx_arr[4][8] = {
-    { &lgt8_000[0][0], &lgt8_000w1[0][0], &lgt8_000w2[0][0], &lgt8_000w3[0][0],
-      &lgt8_000w4[0][0], &lgt8_000w5[0][0], &lgt8_000w6[0][0],
-      &lgt8_000w7[0][0] },
-    { &lgt8_060[0][0], &lgt8_060_000w1[0][0], &lgt8_060_000w2[0][0],
-      &lgt8_060_000w3[0][0], &lgt8_060_000w4[0][0], &lgt8_060_000w5[0][0],
-      &lgt8_060_000w6[0][0], &lgt8_060_000w7[0][0] },
-    { &lgt8_100[0][0], &lgt8_100_000w1[0][0], &lgt8_100_000w2[0][0],
-      &lgt8_100_000w3[0][0], &lgt8_100_000w4[0][0], &lgt8_100_000w5[0][0],
-      &lgt8_100_000w6[0][0], &lgt8_100_000w7[0][0] },
-    { &lgt8_150[0][0], &lgt8_150_000w1[0][0], &lgt8_150_000w2[0][0],
-      &lgt8_150_000w3[0][0], &lgt8_150_000w4[0][0], &lgt8_150_000w5[0][0],
-      &lgt8_150_000w6[0][0], &lgt8_150_000w7[0][0] },
-  };
-
-  int idx_sl = idx_selfloop_wrt_mode(mode, is_col);
-  lgtmtx[0] = lgt8mtx_arr[idx_sl][0];
-
-  if (mode == DC_PRED || mode == SMOOTH_PRED) {
-    return;
-  } else if (mode == TM_PRED) {
-    if (is_col)
-      for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride];
-    else
-      for (int i = 0; i < 8; ++i) arr[i] = dst[i];
-    get_discontinuity_1d(&arr[0], 8, &bp);
-  } else if (mode == V_PRED) {
-    if (is_col) return;
-    for (int i = 0; i < 8; ++i) arr[i] = dst[i];
-    get_discontinuity_1d(&arr[0], 8, &bp);
-  } else if (mode == H_PRED) {
-    if (!is_col) return;
-    for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride];
-    get_discontinuity_1d(&arr[0], 8, &bp);
-#if CONFIG_SMOOTH_HV
-  } else if (mode == SMOOTH_V_PRED) {
-    if (is_col) return;
-    for (int i = 0; i < 8; ++i) arr[i] = dst[-stride + i];
-    get_discontinuity_1d(&arr[0], 8, &bp);
-  } else if (mode == SMOOTH_H_PRED) {
-    if (!is_col) return;
-    for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride - 1];
-    get_discontinuity_1d(&arr[0], 8, &bp);
-#endif
-  } else if (mode == D45_PRED || mode == D63_PRED || mode == D117_PRED) {
-    if (!is_col) get_discontinuity_2d(dst, stride, 8, 0, &bp, ntx);
-  } else if (mode == D135_PRED || mode == D153_PRED || mode == D207_PRED) {
-    if (is_col) get_discontinuity_2d(dst, stride, 8, 1, &bp, ntx);
-  } else if (mode > TM_PRED) {
-    get_discontinuity_2d(dst, stride, 8, is_col, &bp, ntx);
-  }
-
-#if LGT_SL_INTRA
-  if (bp != -1) lgtmtx[0] = lgt8mtx_arr[idx_sl][bp];
-#else
-  if (bp != -1) lgtmtx[0] = lgt8mtx_arr[0][bp];
-#endif
-}
-
-// Since LGTs with length >8 are not implemented now, the following function
-// will just call DCT or ADST
-void get_lgt16up_from_pred(const TxfmParam *txfm_param, int is_col,
-                           const tran_high_t **lgtmtx, int ntx) {
-  int tx_length = is_col ? tx_size_high[txfm_param->tx_size]
-                         : tx_size_wide[txfm_param->tx_size];
-  assert(tx_length == 16 || tx_length == 32);
-  PREDICTION_MODE mode = txfm_param->mode;
-
-  (void)ntx;
-  const tran_high_t *dctmtx =
-      tx_length == 16 ? &lgt16_000[0][0] : &lgt32_000[0][0];
-  const tran_high_t *adstmtx =
-      tx_length == 16 ? &lgt16_200[0][0] : &lgt32_200[0][0];
-
-  switch (mode) {
-    case DC_PRED:
-    case TM_PRED:
-    case SMOOTH_PRED:
-      // prediction from both top and left -> ADST
-      lgtmtx[0] = adstmtx;
-      break;
-    case V_PRED:
-    case D45_PRED:
-    case D63_PRED:
-    case D117_PRED:
-#if CONFIG_SMOOTH_HV
-    case SMOOTH_V_PRED:
-#endif
-      // prediction from the top more than from the left -> ADST
-      lgtmtx[0] = is_col ? adstmtx : dctmtx;
-      break;
-    case H_PRED:
-    case D135_PRED:
-    case D153_PRED:
-    case D207_PRED:
-#if CONFIG_SMOOTH_HV
-    case SMOOTH_H_PRED:
-#endif
-      // prediction from the left more than from the top -> DCT
-      lgtmtx[0] = is_col ? dctmtx : adstmtx;
-      break;
-    default: lgtmtx[0] = dctmtx; break;
-  }
-}
-
-typedef void (*IlgtFunc)(const tran_low_t *input, tran_low_t *output,
-                         const tran_high_t *lgtmtx);
-
-static IlgtFunc ilgt_func[4] = { ilgt4, ilgt8, ilgt16up, ilgt16up };
-
-typedef void (*GetLgtFunc)(const TxfmParam *txfm_param, int is_col,
-                           const tran_high_t **lgtmtx, int ntx);
-
-static GetLgtFunc get_lgt_func[4] = { get_lgt4_from_pred, get_lgt8_from_pred,
-                                      get_lgt16up_from_pred,
-                                      get_lgt16up_from_pred };
-
-// this inline function corresponds to the up scaling before the transpose
-// operation in the av1_iht* functions
-static INLINE tran_low_t inv_upscale_wrt_txsize(const tran_high_t val,
-                                                const TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_4X4:
-    case TX_8X8:
-    case TX_4X16:
-    case TX_16X4:
-    case TX_8X32:
-    case TX_32X8: return (tran_low_t)val;
-    case TX_4X8:
-    case TX_8X4:
-    case TX_8X16:
-    case TX_16X8: return (tran_low_t)dct_const_round_shift(val * Sqrt2);
-    default: assert(0); break;
-  }
-  return 0;
-}
-
-// This inline function corresponds to the bit shift before summing with the
-// destination in the av1_iht* functions
-static INLINE tran_low_t inv_downscale_wrt_txsize(const tran_low_t val,
-                                                  const TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_4X4: return ROUND_POWER_OF_TWO(val, 4);
-    case TX_4X8:
-    case TX_8X4:
-    case TX_8X8:
-    case TX_4X16:
-    case TX_16X4: return ROUND_POWER_OF_TWO(val, 5);
-    case TX_8X16:
-    case TX_16X8:
-    case TX_8X32:
-    case TX_32X8: return ROUND_POWER_OF_TWO(val, 6);
-    default: assert(0); break;
-  }
-  return 0;
-}
-
-void ilgt2d_from_pred_add(const tran_low_t *input, uint8_t *dest, int stride,
-                          const TxfmParam *txfm_param) {
-  const TX_SIZE tx_size = txfm_param->tx_size;
-  const int w = tx_size_wide[tx_size];
-  const int h = tx_size_high[tx_size];
-  const int wlog2 = tx_size_wide_log2[tx_size];
-  const int hlog2 = tx_size_high_log2[tx_size];
-  assert(w <= 8 || h <= 8);
-
-  int i, j;
-  // largest 1D size allowed for LGT: 32
-  // largest 2D size allowed for LGT: 8x32=256
-  tran_low_t tmp[256], out[256], temp1d[32];
-  const tran_high_t *lgtmtx_col[1];
-  const tran_high_t *lgtmtx_row[1];
-  get_lgt_func[hlog2 - 2](txfm_param, 1, lgtmtx_col, w);
-  get_lgt_func[wlog2 - 2](txfm_param, 0, lgtmtx_row, h);
-
-// for inverse transform, to be consistent with av1_iht functions, we always
-// apply row transforms first and column transforms second, but both
-// row-first and column-first versions are implemented here for future
-// tests (use different lgtmtx_col[i], and choose row or column tx first
-// depending on transforms).
-#if 1
-  // inverse column transforms
-  for (i = 0; i < w; ++i) {
-    // transpose
-    for (j = 0; j < h; ++j) tmp[i * h + j] = input[j * w + i];
-    ilgt_func[hlog2 - 2](&tmp[i * h], temp1d, lgtmtx_col[0]);
-    // upscale, and store in place
-    for (j = 0; j < h; ++j)
-      tmp[i * h + j] = inv_upscale_wrt_txsize(temp1d[j], tx_size);
-  }
-  // inverse row transforms
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) temp1d[j] = tmp[j * h + i];
-    ilgt_func[wlog2 - 2](temp1d, &out[i * w], lgtmtx_row[0]);
-  }
-  // downscale + sum with the destination
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      int d = i * stride + j;
-      int s = i * w + j;
-      dest[d] =
-          clip_pixel_add(dest[d], inv_downscale_wrt_txsize(out[s], tx_size));
-    }
-  }
-#else
-  // inverse row transforms
-  for (i = 0; i < h; ++i) {
-    ilgt_func[wlog2 - 2](input, temp1d, lgtmtx_row[0]);
-    // upscale and transpose (tmp[j*h+i] <--> tmp[j][i])
-    for (j = 0; j < w; ++j)
-      tmp[j * h + i] = inv_upscale_wrt_txsize(temp1d[j], tx_size);
-    input += w;
-  }
-  // inverse column transforms
-  for (i = 0; i < w; ++i)
-    ilgt_func[hlog2 - 2](&tmp[i * h], &out[i * h], lgtmtx_col[0]);
-  // here, out[] is the transpose of 2D block of transform coefficients
-
-  // downscale + transform + sum with dest
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      int d = i * stride + j;
-      int s = j * h + i;
-      dest[d] =
-          clip_pixel_add(dest[d], inv_downscale_wrt_txsize(out[s], tx_size));
-    }
-  }
-#endif
-}
-#endif  // CONFIG_LGT_FROM_PRED
-
-void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                         const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if !CONFIG_DAALA_DCT4
-  if (tx_type == DCT_DCT) {
-    aom_idct4x4_16_add(input, dest, stride);
-    return;
-  }
-#endif
-  static const transform_2d IHT_4[] = {
-#if CONFIG_DAALA_DCT4
-    { daala_idct4, daala_idct4 },  // DCT_DCT  = 0
-    { daala_idst4, daala_idct4 },  // ADST_DCT = 1
-    { daala_idct4, daala_idst4 },  // DCT_ADST = 2
-    { daala_idst4, daala_idst4 },  // ADST_ADST = 3
-#if CONFIG_EXT_TX
-    { daala_idst4, daala_idct4 },  // FLIPADST_DCT
-    { daala_idct4, daala_idst4 },  // DCT_FLIPADST
-    { daala_idst4, daala_idst4 },  // FLIPADST_FLIPADST
-    { daala_idst4, daala_idst4 },  // ADST_FLIPADST
-    { daala_idst4, daala_idst4 },  // FLIPADST_ADST
-    { daala_idtx4, daala_idtx4 },  // IDTX
-    { daala_idct4, daala_idtx4 },  // V_DCT
-    { daala_idtx4, daala_idct4 },  // H_DCT
-    { daala_idst4, daala_idtx4 },  // V_ADST
-    { daala_idtx4, daala_idst4 },  // H_ADST
-    { daala_idst4, daala_idtx4 },  // V_FLIPADST
-    { daala_idtx4, daala_idst4 },  // H_FLIPADST
-#endif
-#else
-    { aom_idct4_c, aom_idct4_c },    // DCT_DCT  = 0
-    { aom_iadst4_c, aom_idct4_c },   // ADST_DCT = 1
-    { aom_idct4_c, aom_iadst4_c },   // DCT_ADST = 2
-    { aom_iadst4_c, aom_iadst4_c },  // ADST_ADST = 3
-#if CONFIG_EXT_TX
-    { aom_iadst4_c, aom_idct4_c },   // FLIPADST_DCT
-    { aom_idct4_c, aom_iadst4_c },   // DCT_FLIPADST
-    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_FLIPADST
-    { aom_iadst4_c, aom_iadst4_c },  // ADST_FLIPADST
-    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_ADST
-    { iidtx4_c, iidtx4_c },          // IDTX
-    { aom_idct4_c, iidtx4_c },       // V_DCT
-    { iidtx4_c, aom_idct4_c },       // H_DCT
-    { aom_iadst4_c, iidtx4_c },      // V_ADST
-    { iidtx4_c, aom_iadst4_c },      // H_ADST
-    { aom_iadst4_c, iidtx4_c },      // V_FLIPADST
-    { iidtx4_c, aom_iadst4_c },      // H_FLIPADST
-#endif
-#endif
-  };
-
-  int i, j;
-  tran_low_t tmp[4][4];
-  tran_low_t out[4][4];
-  tran_low_t *outp = &out[0][0];
-  int outstride = 4;
-
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
-  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // inverse transform row vectors
-  for (i = 0; i < 4; ++i) {
-#if CONFIG_DAALA_DCT4
-    tran_low_t temp_in[4];
-    for (j = 0; j < 4; j++) temp_in[j] = input[j] * 2;
-    IHT_4[tx_type].rows(temp_in, out[i]);
-#else
-#if CONFIG_LGT
-    if (use_lgt_row)
-      ilgt4(input, out[i], lgtmtx_row[0]);
-    else
-#endif
-      IHT_4[tx_type].rows(input, out[i]);
-#endif
-    input += 4;
-  }
-
-  // transpose
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      tmp[j][i] = out[i][j];
-    }
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < 4; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_col)
-      ilgt4(tmp[i], out[i], lgtmtx_col[0]);
-    else
-#endif
-      IHT_4[tx_type].cols(tmp[i], out[i]);
-  }
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 4);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-#if CONFIG_DAALA_DCT4
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#endif
-    }
-  }
-}
-
-void av1_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                         const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_4x8[] = {
-    { aom_idct8_c, aom_idct4_c },    // DCT_DCT
-    { aom_iadst8_c, aom_idct4_c },   // ADST_DCT
-    { aom_idct8_c, aom_iadst4_c },   // DCT_ADST
-    { aom_iadst8_c, aom_iadst4_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { aom_iadst8_c, aom_idct4_c },   // FLIPADST_DCT
-    { aom_idct8_c, aom_iadst4_c },   // DCT_FLIPADST
-    { aom_iadst8_c, aom_iadst4_c },  // FLIPADST_FLIPADST
-    { aom_iadst8_c, aom_iadst4_c },  // ADST_FLIPADST
-    { aom_iadst8_c, aom_iadst4_c },  // FLIPADST_ADST
-    { iidtx8_c, iidtx4_c },          // IDTX
-    { aom_idct8_c, iidtx4_c },       // V_DCT
-    { iidtx8_c, aom_idct4_c },       // H_DCT
-    { aom_iadst8_c, iidtx4_c },      // V_ADST
-    { iidtx8_c, aom_iadst4_c },      // H_ADST
-    { aom_iadst8_c, iidtx4_c },      // V_FLIPADST
-    { iidtx8_c, aom_iadst4_c },      // H_FLIPADST
-#endif
-  };
-
-  const int n = 4;
-  const int n2 = 8;
-  int i, j;
-  tran_low_t out[4][8], tmp[4][8], outtmp[4];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n2;
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
-  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n2; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_row)
-      ilgt4(input, outtmp, lgtmtx_row[0]);
-    else
-#endif
-      IHT_4x8[tx_type].rows(input, outtmp);
-    for (j = 0; j < n; ++j)
-      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-    input += n;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_col)
-      ilgt8(tmp[i], out[i], lgtmtx_col[0]);
-    else
-#endif
-      IHT_4x8[tx_type].cols(tmp[i], out[i]);
-  }
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-    }
-  }
-}
-
-void av1_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                         const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_8x4[] = {
-    { aom_idct4_c, aom_idct8_c },    // DCT_DCT
-    { aom_iadst4_c, aom_idct8_c },   // ADST_DCT
-    { aom_idct4_c, aom_iadst8_c },   // DCT_ADST
-    { aom_iadst4_c, aom_iadst8_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { aom_iadst4_c, aom_idct8_c },   // FLIPADST_DCT
-    { aom_idct4_c, aom_iadst8_c },   // DCT_FLIPADST
-    { aom_iadst4_c, aom_iadst8_c },  // FLIPADST_FLIPADST
-    { aom_iadst4_c, aom_iadst8_c },  // ADST_FLIPADST
-    { aom_iadst4_c, aom_iadst8_c },  // FLIPADST_ADST
-    { iidtx4_c, iidtx8_c },          // IDTX
-    { aom_idct4_c, iidtx8_c },       // V_DCT
-    { iidtx4_c, aom_idct8_c },       // H_DCT
-    { aom_iadst4_c, iidtx8_c },      // V_ADST
-    { iidtx4_c, aom_iadst8_c },      // H_ADST
-    { aom_iadst4_c, iidtx8_c },      // V_FLIPADST
-    { iidtx4_c, aom_iadst8_c },      // H_FLIPADST
-#endif
-  };
-
-  const int n = 4;
-  const int n2 = 8;
-
-  int i, j;
-  tran_low_t out[8][4], tmp[8][4], outtmp[8];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n;
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
-  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_row)
-      ilgt8(input, outtmp, lgtmtx_row[0]);
-    else
-#endif
-      IHT_8x4[tx_type].rows(input, outtmp);
-    for (j = 0; j < n2; ++j)
-      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-    input += n2;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n2; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_col)
-      ilgt4(tmp[i], out[i], lgtmtx_col[0]);
-    else
-#endif
-      IHT_8x4[tx_type].cols(tmp[i], out[i]);
-  }
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-    }
-  }
-}
-
-void av1_iht4x16_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                          const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_4x16[] = {
-    { aom_idct16_c, aom_idct4_c },    // DCT_DCT
-    { aom_iadst16_c, aom_idct4_c },   // ADST_DCT
-    { aom_idct16_c, aom_iadst4_c },   // DCT_ADST
-    { aom_iadst16_c, aom_iadst4_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { aom_iadst16_c, aom_idct4_c },   // FLIPADST_DCT
-    { aom_idct16_c, aom_iadst4_c },   // DCT_FLIPADST
-    { aom_iadst16_c, aom_iadst4_c },  // FLIPADST_FLIPADST
-    { aom_iadst16_c, aom_iadst4_c },  // ADST_FLIPADST
-    { aom_iadst16_c, aom_iadst4_c },  // FLIPADST_ADST
-    { iidtx16_c, iidtx4_c },          // IDTX
-    { aom_idct16_c, iidtx4_c },       // V_DCT
-    { iidtx16_c, aom_idct4_c },       // H_DCT
-    { aom_iadst16_c, iidtx4_c },      // V_ADST
-    { iidtx16_c, aom_iadst4_c },      // H_ADST
-    { aom_iadst16_c, iidtx4_c },      // V_FLIPADST
-    { iidtx16_c, aom_iadst4_c },      // H_FLIPADST
-#endif
-  };
-
-  const int n = 4;
-  const int n4 = 16;
-  int i, j;
-  tran_low_t out[4][16], tmp[4][16], outtmp[4];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n4;
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n4; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_row)
-      ilgt4(input, outtmp, lgtmtx_row[0]);
-    else
-#endif
-      IHT_4x16[tx_type].rows(input, outtmp);
-    for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
-    input += n;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n; ++i) {
-    IHT_4x16[tx_type].cols(tmp[i], out[i]);
-  }
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n4, n);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n4; ++i) {
-    for (j = 0; j < n; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-    }
-  }
-}
-
-void av1_iht16x4_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                          const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_16x4[] = {
-    { aom_idct4_c, aom_idct16_c },    // DCT_DCT
-    { aom_iadst4_c, aom_idct16_c },   // ADST_DCT
-    { aom_idct4_c, aom_iadst16_c },   // DCT_ADST
-    { aom_iadst4_c, aom_iadst16_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { aom_iadst4_c, aom_idct16_c },   // FLIPADST_DCT
-    { aom_idct4_c, aom_iadst16_c },   // DCT_FLIPADST
-    { aom_iadst4_c, aom_iadst16_c },  // FLIPADST_FLIPADST
-    { aom_iadst4_c, aom_iadst16_c },  // ADST_FLIPADST
-    { aom_iadst4_c, aom_iadst16_c },  // FLIPADST_ADST
-    { iidtx4_c, iidtx16_c },          // IDTX
-    { aom_idct4_c, iidtx16_c },       // V_DCT
-    { iidtx4_c, aom_idct16_c },       // H_DCT
-    { aom_iadst4_c, iidtx16_c },      // V_ADST
-    { iidtx4_c, aom_iadst16_c },      // H_ADST
-    { aom_iadst4_c, iidtx16_c },      // V_FLIPADST
-    { iidtx4_c, aom_iadst16_c },      // H_FLIPADST
-#endif
-  };
-
-  const int n = 4;
-  const int n4 = 16;
-
-  int i, j;
-  tran_low_t out[16][4], tmp[16][4], outtmp[16];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n;
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
-#endif
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n; ++i) {
-    IHT_16x4[tx_type].rows(input, outtmp);
-    for (j = 0; j < n4; ++j) tmp[j][i] = outtmp[j];
-    input += n4;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n4; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_col)
-      ilgt4(tmp[i], out[i], lgtmtx_col[0]);
-    else
-#endif
-      IHT_16x4[tx_type].cols(tmp[i], out[i]);
-  }
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n4);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n4; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-    }
-  }
-}
-
-void av1_iht8x16_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                           const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_8x16[] = {
-    { aom_idct16_c, aom_idct8_c },    // DCT_DCT
-    { aom_iadst16_c, aom_idct8_c },   // ADST_DCT
-    { aom_idct16_c, aom_iadst8_c },   // DCT_ADST
-    { aom_iadst16_c, aom_iadst8_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { aom_iadst16_c, aom_idct8_c },   // FLIPADST_DCT
-    { aom_idct16_c, aom_iadst8_c },   // DCT_FLIPADST
-    { aom_iadst16_c, aom_iadst8_c },  // FLIPADST_FLIPADST
-    { aom_iadst16_c, aom_iadst8_c },  // ADST_FLIPADST
-    { aom_iadst16_c, aom_iadst8_c },  // FLIPADST_ADST
-    { iidtx16_c, iidtx8_c },          // IDTX
-    { aom_idct16_c, iidtx8_c },       // V_DCT
-    { iidtx16_c, aom_idct8_c },       // H_DCT
-    { aom_iadst16_c, iidtx8_c },      // V_ADST
-    { iidtx16_c, aom_iadst8_c },      // H_ADST
-    { aom_iadst16_c, iidtx8_c },      // V_FLIPADST
-    { iidtx16_c, aom_iadst8_c },      // H_FLIPADST
-#endif
-  };
-
-  const int n = 8;
-  const int n2 = 16;
-  int i, j;
-  tran_low_t out[8][16], tmp[8][16], outtmp[8];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n2;
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n2; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_row)
-      ilgt8(input, outtmp, lgtmtx_row[0]);
-    else
-#endif
-      IHT_8x16[tx_type].rows(input, outtmp);
-    for (j = 0; j < n; ++j)
-      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-    input += n;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n; ++i) {
-    IHT_8x16[tx_type].cols(tmp[i], out[i]);
-  }
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-    }
-  }
-}
-
-void av1_iht16x8_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                           const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_16x8[] = {
-    { aom_idct8_c, aom_idct16_c },    // DCT_DCT
-    { aom_iadst8_c, aom_idct16_c },   // ADST_DCT
-    { aom_idct8_c, aom_iadst16_c },   // DCT_ADST
-    { aom_iadst8_c, aom_iadst16_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { aom_iadst8_c, aom_idct16_c },   // FLIPADST_DCT
-    { aom_idct8_c, aom_iadst16_c },   // DCT_FLIPADST
-    { aom_iadst8_c, aom_iadst16_c },  // FLIPADST_FLIPADST
-    { aom_iadst8_c, aom_iadst16_c },  // ADST_FLIPADST
-    { aom_iadst8_c, aom_iadst16_c },  // FLIPADST_ADST
-    { iidtx8_c, iidtx16_c },          // IDTX
-    { aom_idct8_c, iidtx16_c },       // V_DCT
-    { iidtx8_c, aom_idct16_c },       // H_DCT
-    { aom_iadst8_c, iidtx16_c },      // V_ADST
-    { iidtx8_c, aom_iadst16_c },      // H_ADST
-    { aom_iadst8_c, iidtx16_c },      // V_FLIPADST
-    { iidtx8_c, aom_iadst16_c },      // H_FLIPADST
-#endif
-  };
-
-  const int n = 8;
-  const int n2 = 16;
-
-  int i, j;
-  tran_low_t out[16][8], tmp[16][8], outtmp[16];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n;
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
-#endif
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n; ++i) {
-    IHT_16x8[tx_type].rows(input, outtmp);
-    for (j = 0; j < n2; ++j)
-      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-    input += n2;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n2; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_col)
-      ilgt8(tmp[i], out[i], lgtmtx_col[0]);
-    else
-#endif
-      IHT_16x8[tx_type].cols(tmp[i], out[i]);
-  }
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-    }
-  }
-}
-
-void av1_iht8x32_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                           const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_8x32[] = {
-    { aom_idct32_c, aom_idct8_c },     // DCT_DCT
-    { ihalfright32_c, aom_idct8_c },   // ADST_DCT
-    { aom_idct32_c, aom_iadst8_c },    // DCT_ADST
-    { ihalfright32_c, aom_iadst8_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { ihalfright32_c, aom_idct8_c },   // FLIPADST_DCT
-    { aom_idct32_c, aom_iadst8_c },    // DCT_FLIPADST
-    { ihalfright32_c, aom_iadst8_c },  // FLIPADST_FLIPADST
-    { ihalfright32_c, aom_iadst8_c },  // ADST_FLIPADST
-    { ihalfright32_c, aom_iadst8_c },  // FLIPADST_ADST
-    { iidtx32_c, iidtx8_c },           // IDTX
-    { aom_idct32_c, iidtx8_c },        // V_DCT
-    { iidtx32_c, aom_idct8_c },        // H_DCT
-    { ihalfright32_c, iidtx8_c },      // V_ADST
-    { iidtx32_c, aom_iadst8_c },       // H_ADST
-    { ihalfright32_c, iidtx8_c },      // V_FLIPADST
-    { iidtx32_c, aom_iadst8_c },       // H_FLIPADST
-#endif
-  };
-
-  const int n = 8;
-  const int n4 = 32;
-  int i, j;
-  tran_low_t out[8][32], tmp[8][32], outtmp[8];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n4;
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n4; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_row)
-      ilgt8(input, outtmp, lgtmtx_row[0]);
-    else
-#endif
-      IHT_8x32[tx_type].rows(input, outtmp);
-    for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
-    input += n;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n; ++i) {
-    IHT_8x32[tx_type].cols(tmp[i], out[i]);
-  }
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n4, n);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n4; ++i) {
-    for (j = 0; j < n; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-    }
-  }
-}
-
-void av1_iht32x8_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                           const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_32x8[] = {
-    { aom_idct8_c, aom_idct32_c },     // DCT_DCT
-    { aom_iadst8_c, aom_idct32_c },    // ADST_DCT
-    { aom_idct8_c, ihalfright32_c },   // DCT_ADST
-    { aom_iadst8_c, ihalfright32_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { aom_iadst8_c, aom_idct32_c },    // FLIPADST_DCT
-    { aom_idct8_c, ihalfright32_c },   // DCT_FLIPADST
-    { aom_iadst8_c, ihalfright32_c },  // FLIPADST_FLIPADST
-    { aom_iadst8_c, ihalfright32_c },  // ADST_FLIPADST
-    { aom_iadst8_c, ihalfright32_c },  // FLIPADST_ADST
-    { iidtx8_c, iidtx32_c },           // IDTX
-    { aom_idct8_c, iidtx32_c },        // V_DCT
-    { iidtx8_c, aom_idct32_c },        // H_DCT
-    { aom_iadst8_c, iidtx32_c },       // V_ADST
-    { iidtx8_c, ihalfright32_c },      // H_ADST
-    { aom_iadst8_c, iidtx32_c },       // V_FLIPADST
-    { iidtx8_c, ihalfright32_c },      // H_FLIPADST
-#endif
-  };
-
-  const int n = 8;
-  const int n4 = 32;
-
-  int i, j;
-  tran_low_t out[32][8], tmp[32][8], outtmp[32];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n;
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
-#endif
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n; ++i) {
-    IHT_32x8[tx_type].rows(input, outtmp);
-    for (j = 0; j < n4; ++j) tmp[j][i] = outtmp[j];
-    input += n4;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n4; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_col)
-      ilgt8(tmp[i], out[i], lgtmtx_col[0]);
-    else
-#endif
-      IHT_32x8[tx_type].cols(tmp[i], out[i]);
-  }
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n4);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n4; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-    }
-  }
-}
-
-void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                            const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_16x32[] = {
-    { aom_idct32_c, aom_idct16_c },     // DCT_DCT
-    { ihalfright32_c, aom_idct16_c },   // ADST_DCT
-    { aom_idct32_c, aom_iadst16_c },    // DCT_ADST
-    { ihalfright32_c, aom_iadst16_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { ihalfright32_c, aom_idct16_c },   // FLIPADST_DCT
-    { aom_idct32_c, aom_iadst16_c },    // DCT_FLIPADST
-    { ihalfright32_c, aom_iadst16_c },  // FLIPADST_FLIPADST
-    { ihalfright32_c, aom_iadst16_c },  // ADST_FLIPADST
-    { ihalfright32_c, aom_iadst16_c },  // FLIPADST_ADST
-    { iidtx32_c, iidtx16_c },           // IDTX
-    { aom_idct32_c, iidtx16_c },        // V_DCT
-    { iidtx32_c, aom_idct16_c },        // H_DCT
-    { ihalfright32_c, iidtx16_c },      // V_ADST
-    { iidtx32_c, aom_iadst16_c },       // H_ADST
-    { ihalfright32_c, iidtx16_c },      // V_FLIPADST
-    { iidtx32_c, aom_iadst16_c },       // H_FLIPADST
-#endif
-  };
-
-  const int n = 16;
-  const int n2 = 32;
-  int i, j;
-  tran_low_t out[16][32], tmp[16][32], outtmp[16];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n2;
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n2; ++i) {
-    IHT_16x32[tx_type].rows(input, outtmp);
-    for (j = 0; j < n; ++j)
-      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-    input += n;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n; ++i) IHT_16x32[tx_type].cols(tmp[i], out[i]);
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-    }
-  }
-}
-
-void av1_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                            const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_32x16[] = {
-    { aom_idct16_c, aom_idct32_c },     // DCT_DCT
-    { aom_iadst16_c, aom_idct32_c },    // ADST_DCT
-    { aom_idct16_c, ihalfright32_c },   // DCT_ADST
-    { aom_iadst16_c, ihalfright32_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { aom_iadst16_c, aom_idct32_c },    // FLIPADST_DCT
-    { aom_idct16_c, ihalfright32_c },   // DCT_FLIPADST
-    { aom_iadst16_c, ihalfright32_c },  // FLIPADST_FLIPADST
-    { aom_iadst16_c, ihalfright32_c },  // ADST_FLIPADST
-    { aom_iadst16_c, ihalfright32_c },  // FLIPADST_ADST
-    { iidtx16_c, iidtx32_c },           // IDTX
-    { aom_idct16_c, iidtx32_c },        // V_DCT
-    { iidtx16_c, aom_idct32_c },        // H_DCT
-    { aom_iadst16_c, iidtx32_c },       // V_ADST
-    { iidtx16_c, ihalfright32_c },      // H_ADST
-    { aom_iadst16_c, iidtx32_c },       // V_FLIPADST
-    { iidtx16_c, ihalfright32_c },      // H_FLIPADST
-#endif
-  };
-  const int n = 16;
-  const int n2 = 32;
-
-  int i, j;
-  tran_low_t out[32][16], tmp[32][16], outtmp[32];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n;
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n; ++i) {
-    IHT_32x16[tx_type].rows(input, outtmp);
-    for (j = 0; j < n2; ++j)
-      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
-    input += n2;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n2; ++i) IHT_32x16[tx_type].cols(tmp[i], out[i]);
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-    }
-  }
-}
-
-void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                         const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_8[] = {
-#if CONFIG_DAALA_DCT8
-    { daala_idct8, daala_idct8 },  // DCT_DCT  = 0
-    { daala_idst8, daala_idct8 },  // ADST_DCT = 1
-    { daala_idct8, daala_idst8 },  // DCT_ADST = 2
-    { daala_idst8, daala_idst8 },  // ADST_ADST = 3
-#if CONFIG_EXT_TX
-    { daala_idst8, daala_idct8 },  // FLIPADST_DCT
-    { daala_idct8, daala_idst8 },  // DCT_FLIPADST
-    { daala_idst8, daala_idst8 },  // FLIPADST_FLIPADST
-    { daala_idst8, daala_idst8 },  // ADST_FLIPADST
-    { daala_idst8, daala_idst8 },  // FLIPADST_ADST
-    { daala_idtx8, daala_idtx8 },  // IDTX
-    { daala_idct8, daala_idtx8 },  // V_DCT
-    { daala_idtx8, daala_idct8 },  // H_DCT
-    { daala_idst8, daala_idtx8 },  // V_ADST
-    { daala_idtx8, daala_idst8 },  // H_ADST
-    { daala_idst8, daala_idtx8 },  // V_FLIPADST
-    { daala_idtx8, daala_idst8 },  // H_FLIPADST
-#endif
-#else
-    { aom_idct8_c, aom_idct8_c },    // DCT_DCT  = 0
-    { aom_iadst8_c, aom_idct8_c },   // ADST_DCT = 1
-    { aom_idct8_c, aom_iadst8_c },   // DCT_ADST = 2
-    { aom_iadst8_c, aom_iadst8_c },  // ADST_ADST = 3
-#if CONFIG_EXT_TX
-    { aom_iadst8_c, aom_idct8_c },   // FLIPADST_DCT
-    { aom_idct8_c, aom_iadst8_c },   // DCT_FLIPADST
-    { aom_iadst8_c, aom_iadst8_c },  // FLIPADST_FLIPADST
-    { aom_iadst8_c, aom_iadst8_c },  // ADST_FLIPADST
-    { aom_iadst8_c, aom_iadst8_c },  // FLIPADST_ADST
-    { iidtx8_c, iidtx8_c },          // IDTX
-    { aom_idct8_c, iidtx8_c },       // V_DCT
-    { iidtx8_c, aom_idct8_c },       // H_DCT
-    { aom_iadst8_c, iidtx8_c },      // V_ADST
-    { iidtx8_c, aom_iadst8_c },      // H_ADST
-    { aom_iadst8_c, iidtx8_c },      // V_FLIPADST
-    { iidtx8_c, aom_iadst8_c },      // H_FLIPADST
-#endif
-#endif
-  };
-
-  int i, j;
-  tran_low_t tmp[8][8];
-  tran_low_t out[8][8];
-  tran_low_t *outp = &out[0][0];
-  int outstride = 8;
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
-  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // inverse transform row vectors
-  for (i = 0; i < 8; ++i) {
-#if CONFIG_DAALA_DCT8
-    tran_low_t temp_in[8];
-    for (j = 0; j < 8; j++) temp_in[j] = input[j] * 2;
-    IHT_8[tx_type].rows(temp_in, out[i]);
-#else
-#if CONFIG_LGT
-    if (use_lgt_row)
-      ilgt8(input, out[i], lgtmtx_row[0]);
-    else
-#endif
-      IHT_8[tx_type].rows(input, out[i]);
-#endif
-    input += 8;
-  }
-
-  // transpose
-  for (i = 0; i < 8; i++) {
-    for (j = 0; j < 8; j++) {
-      tmp[j][i] = out[i][j];
-    }
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < 8; ++i) {
-#if CONFIG_LGT
-    if (use_lgt_col)
-      ilgt8(tmp[i], out[i], lgtmtx_col[0]);
-    else
-#endif
-      IHT_8[tx_type].cols(tmp[i], out[i]);
-  }
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8, 8);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-#if CONFIG_DAALA_DCT8
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
-    }
-  }
-}
-
-void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                            const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_16[] = {
-#if CONFIG_DAALA_DCT16
-    { daala_idct16, daala_idct16 },  // DCT_DCT  = 0
-    { daala_idst16, daala_idct16 },  // ADST_DCT = 1
-    { daala_idct16, daala_idst16 },  // DCT_ADST = 2
-    { daala_idst16, daala_idst16 },  // ADST_ADST = 3
-#if CONFIG_EXT_TX
-    { daala_idst16, daala_idct16 },  // FLIPADST_DCT
-    { daala_idct16, daala_idst16 },  // DCT_FLIPADST
-    { daala_idst16, daala_idst16 },  // FLIPADST_FLIPADST
-    { daala_idst16, daala_idst16 },  // ADST_FLIPADST
-    { daala_idst16, daala_idst16 },  // FLIPADST_ADST
-    { daala_idtx16, daala_idtx16 },  // IDTX
-    { daala_idct16, daala_idtx16 },  // V_DCT
-    { daala_idtx16, daala_idct16 },  // H_DCT
-    { daala_idst16, daala_idtx16 },  // V_ADST
-    { daala_idtx16, daala_idst16 },  // H_ADST
-    { daala_idst16, daala_idtx16 },  // V_FLIPADST
-    { daala_idtx16, daala_idst16 },  // H_FLIPADST
-#endif
-#else
-    { aom_idct16_c, aom_idct16_c },    // DCT_DCT  = 0
-    { aom_iadst16_c, aom_idct16_c },   // ADST_DCT = 1
-    { aom_idct16_c, aom_iadst16_c },   // DCT_ADST = 2
-    { aom_iadst16_c, aom_iadst16_c },  // ADST_ADST = 3
-#if CONFIG_EXT_TX
-    { aom_iadst16_c, aom_idct16_c },   // FLIPADST_DCT
-    { aom_idct16_c, aom_iadst16_c },   // DCT_FLIPADST
-    { aom_iadst16_c, aom_iadst16_c },  // FLIPADST_FLIPADST
-    { aom_iadst16_c, aom_iadst16_c },  // ADST_FLIPADST
-    { aom_iadst16_c, aom_iadst16_c },  // FLIPADST_ADST
-    { iidtx16_c, iidtx16_c },          // IDTX
-    { aom_idct16_c, iidtx16_c },       // V_DCT
-    { iidtx16_c, aom_idct16_c },       // H_DCT
-    { aom_iadst16_c, iidtx16_c },      // V_ADST
-    { iidtx16_c, aom_iadst16_c },      // H_ADST
-    { aom_iadst16_c, iidtx16_c },      // V_FLIPADST
-    { iidtx16_c, aom_iadst16_c },      // H_FLIPADST
-#endif
-#endif
-  };
-
-  int i, j;
-  tran_low_t tmp[16][16];
-  tran_low_t out[16][16];
-  tran_low_t *outp = &out[0][0];
-  int outstride = 16;
-
-  // inverse transform row vectors
-  for (i = 0; i < 16; ++i) {
-#if CONFIG_DAALA_DCT16
-    tran_low_t temp_in[16];
-    for (j = 0; j < 16; j++) temp_in[j] = input[j] * 2;
-    IHT_16[tx_type].rows(temp_in, out[i]);
-#else
-    IHT_16[tx_type].rows(input, out[i]);
-#endif
-    input += 16;
-  }
-
-  // transpose
-  for (i = 0; i < 16; i++) {
-    for (j = 0; j < 16; j++) {
-      tmp[j][i] = out[i][j];
-    }
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < 16; ++i) IHT_16[tx_type].cols(tmp[i], out[i]);
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16, 16);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-#if CONFIG_DAALA_DCT16
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
-#else
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
-    }
-  }
-}
-
-#if CONFIG_EXT_TX || CONFIG_DAALA_DCT32
-void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                             const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_32[] = {
-#if CONFIG_DAALA_DCT32
-    { daala_idct32, daala_idct32 },  // DCT_DCT
-#if CONFIG_EXT_TX
-    { daala_idst32, daala_idct32 },  // ADST_DCT
-    { daala_idct32, daala_idst32 },  // DCT_ADST
-    { daala_idst32, daala_idst32 },  // ADST_ADST
-    { daala_idst32, daala_idct32 },  // FLIPADST_DCT
-    { daala_idct32, daala_idst32 },  // DCT_FLIPADST
-    { daala_idst32, daala_idst32 },  // FLIPADST_FLIPADST
-    { daala_idst32, daala_idst32 },  // ADST_FLIPADST
-    { daala_idst32, daala_idst32 },  // FLIPADST_ADST
-    { daala_idtx32, daala_idtx32 },  // IDTX
-    { daala_idct32, daala_idtx32 },  // V_DCT
-    { daala_idtx32, daala_idct32 },  // H_DCT
-    { daala_idst32, daala_idtx32 },  // V_ADST
-    { daala_idtx32, daala_idst32 },  // H_ADST
-    { daala_idst32, daala_idtx32 },  // V_FLIPADST
-    { daala_idtx32, daala_idst32 },  // H_FLIPADST
-#endif
-#else
-    { aom_idct32_c, aom_idct32_c },      // DCT_DCT
-#if CONFIG_EXT_TX
-    { ihalfright32_c, aom_idct32_c },    // ADST_DCT
-    { aom_idct32_c, ihalfright32_c },    // DCT_ADST
-    { ihalfright32_c, ihalfright32_c },  // ADST_ADST
-    { ihalfright32_c, aom_idct32_c },    // FLIPADST_DCT
-    { aom_idct32_c, ihalfright32_c },    // DCT_FLIPADST
-    { ihalfright32_c, ihalfright32_c },  // FLIPADST_FLIPADST
-    { ihalfright32_c, ihalfright32_c },  // ADST_FLIPADST
-    { ihalfright32_c, ihalfright32_c },  // FLIPADST_ADST
-    { iidtx32_c, iidtx32_c },            // IDTX
-    { aom_idct32_c, iidtx32_c },         // V_DCT
-    { iidtx32_c, aom_idct32_c },         // H_DCT
-    { ihalfright32_c, iidtx32_c },       // V_ADST
-    { iidtx32_c, ihalfright32_c },       // H_ADST
-    { ihalfright32_c, iidtx32_c },       // V_FLIPADST
-    { iidtx32_c, ihalfright32_c },       // H_FLIPADST
-#endif
-#endif
-  };
-
-  int i, j;
-  tran_low_t tmp[32][32];
-  tran_low_t out[32][32];
-  tran_low_t *outp = &out[0][0];
-  int outstride = 32;
-
-  // inverse transform row vectors
-  for (i = 0; i < 32; ++i) {
-#if CONFIG_DAALA_DCT32
-    tran_low_t temp_in[32];
-    for (j = 0; j < 32; j++) temp_in[j] = input[j] * 2;
-    IHT_32[tx_type].rows(temp_in, out[i]);
-#else
-    IHT_32[tx_type].rows(input, out[i]);
-#endif
-    input += 32;
-  }
-
-  // transpose
-  for (i = 0; i < 32; i++) {
-    for (j = 0; j < 32; j++) {
-#if CONFIG_DAALA_DCT32
-      tmp[j][i] = out[i][j] * 4;
-#else
-      tmp[j][i] = out[i][j];
-#endif
-    }
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < 32; ++i) IHT_32[tx_type].cols(tmp[i], out[i]);
-
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32, 32);
-
-  // Sum with the destination
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-#if CONFIG_DAALA_DCT32
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#else
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
-#endif
-    }
-  }
-}
-#endif  // CONFIG_EXT_TX || CONFIG_DAALA_DCT32
-
-#if CONFIG_TX64X64
-void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                             const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_64[] = {
-#if CONFIG_DAALA_DCT64
-    { daala_idct64, daala_idct64 },  // DCT_DCT
-    { daala_idst64, daala_idct64 },  // ADST_DCT
-    { daala_idct64, daala_idst64 },  // DCT_ADST
-    { daala_idst64, daala_idst64 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { daala_idst64, daala_idct64 },  // FLIPADST_DCT
-    { daala_idct64, daala_idst64 },  // DCT_FLIPADST
-    { daala_idst64, daala_idst64 },  // FLIPADST_FLIPADST
-    { daala_idst64, daala_idst64 },  // ADST_FLIPADST
-    { daala_idst64, daala_idst64 },  // FLIPADST_ADST
-    { daala_idtx64, daala_idtx64 },  // IDTX
-    { daala_idct64, daala_idtx64 },  // V_DCT
-    { daala_idtx64, daala_idct64 },  // H_DCT
-    { daala_idst64, daala_idtx64 },  // V_ADST
-    { daala_idtx64, daala_idst64 },  // H_ADST
-    { daala_idst64, daala_idtx64 },  // V_FLIPADST
-    { daala_idtx64, daala_idst64 },  // H_FLIPADST
-#endif
-#else
-    { idct64_col_c, idct64_row_c },      // DCT_DCT
-    { ihalfright64_c, idct64_row_c },    // ADST_DCT
-    { idct64_col_c, ihalfright64_c },    // DCT_ADST
-    { ihalfright64_c, ihalfright64_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { ihalfright64_c, idct64_row_c },    // FLIPADST_DCT
-    { idct64_col_c, ihalfright64_c },    // DCT_FLIPADST
-    { ihalfright64_c, ihalfright64_c },  // FLIPADST_FLIPADST
-    { ihalfright64_c, ihalfright64_c },  // ADST_FLIPADST
-    { ihalfright64_c, ihalfright64_c },  // FLIPADST_ADST
-    { iidtx64_c, iidtx64_c },            // IDTX
-    { idct64_col_c, iidtx64_c },         // V_DCT
-    { iidtx64_c, idct64_row_c },         // H_DCT
-    { ihalfright64_c, iidtx64_c },       // V_ADST
-    { iidtx64_c, ihalfright64_c },       // H_ADST
-    { ihalfright64_c, iidtx64_c },       // V_FLIPADST
-    { iidtx64_c, ihalfright64_c },       // H_FLIPADST
-#endif
-#endif
-  };
-
-  int i, j;
-  tran_low_t tmp[64][64];
-  tran_low_t out[64][64];
-  tran_low_t *outp = &out[0][0];
-  int outstride = 64;
-
-  // inverse transform row vectors
-  for (i = 0; i < 64; ++i) {
-#if CONFIG_DAALA_DCT64
-    tran_low_t temp_in[64];
-    for (j = 0; j < 64; j++) temp_in[j] = input[j] * 2;
-    IHT_64[tx_type].rows(temp_in, out[i]);
-// Do not rescale intermediate for Daala
-#else
-    IHT_64[tx_type].rows(input, out[i]);
-    for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
-#endif
-    input += 64;
-  }
-
-  // transpose
-  for (i = 0; i < 64; i++) {
-    for (j = 0; j < 64; j++) {
-      tmp[j][i] = out[i][j];
-    }
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < 64; ++i) IHT_64[tx_type].cols(tmp[i], out[i]);
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
-#endif  // CONFIG_EXT_TX
-
-  // Sum with the destination
-  for (i = 0; i < 64; ++i) {
-    for (j = 0; j < 64; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-#if CONFIG_DAALA_DCT64
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 2));
-#else
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-#endif
-    }
-  }
-}
-
-void av1_iht64x32_2048_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                             const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_64x32[] = {
-    { aom_idct32_c, idct64_row_c },      // DCT_DCT
-    { ihalfright32_c, idct64_row_c },    // ADST_DCT
-    { aom_idct32_c, ihalfright64_c },    // DCT_ADST
-    { ihalfright32_c, ihalfright64_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { ihalfright32_c, idct64_row_c },    // FLIPADST_DCT
-    { aom_idct32_c, ihalfright64_c },    // DCT_FLIPADST
-    { ihalfright32_c, ihalfright64_c },  // FLIPADST_FLIPADST
-    { ihalfright32_c, ihalfright64_c },  // ADST_FLIPADST
-    { ihalfright32_c, ihalfright64_c },  // FLIPADST_ADST
-    { iidtx32_c, iidtx64_c },            // IDTX
-    { aom_idct32_c, iidtx64_c },         // V_DCT
-    { iidtx32_c, idct64_row_c },         // H_DCT
-    { ihalfright32_c, iidtx64_c },       // V_ADST
-    { iidtx32_c, ihalfright64_c },       // H_ADST
-    { ihalfright32_c, iidtx64_c },       // V_FLIPADST
-    { iidtx32_c, ihalfright64_c },       // H_FLIPADST
-#endif
-  };
-  const int n = 32;
-  const int n2 = 64;
-
-  int i, j;
-  tran_low_t out[64][32], tmp[64][32], outtmp[64];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n;
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n; ++i) {
-    IHT_64x32[tx_type].rows(input, outtmp);
-    for (j = 0; j < n2; ++j)
-      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
-    input += n2;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n2; ++i) IHT_64x32[tx_type].cols(tmp[i], out[i]);
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-    }
-  }
-}
-
-void av1_iht32x64_2048_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                             const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d IHT_32x64[] = {
-    { idct64_col_c, aom_idct32_c },      // DCT_DCT
-    { ihalfright64_c, aom_idct32_c },    // ADST_DCT
-    { idct64_col_c, ihalfright32_c },    // DCT_ADST
-    { ihalfright64_c, ihalfright32_c },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { ihalfright64_c, aom_idct32_c },    // FLIPADST_DCT
-    { idct64_col_c, ihalfright32_c },    // DCT_FLIPADST
-    { ihalfright64_c, ihalfright32_c },  // FLIPADST_FLIPADST
-    { ihalfright64_c, ihalfright32_c },  // ADST_FLIPADST
-    { ihalfright64_c, ihalfright32_c },  // FLIPADST_ADST
-    { iidtx64_c, iidtx32_c },            // IDTX
-    { idct64_col_c, iidtx32_c },         // V_DCT
-    { iidtx64_c, aom_idct32_c },         // H_DCT
-    { ihalfright64_c, iidtx32_c },       // V_ADST
-    { iidtx64_c, ihalfright32_c },       // H_ADST
-    { ihalfright64_c, iidtx32_c },       // V_FLIPADST
-    { iidtx64_c, ihalfright32_c },       // H_FLIPADST
-#endif
-  };
-
-  const int n = 32;
-  const int n2 = 64;
-  int i, j;
-  tran_low_t out[32][64], tmp[32][64], outtmp[32];
-  tran_low_t *outp = &out[0][0];
-  int outstride = n2;
-
-  // inverse transform row vectors and transpose
-  for (i = 0; i < n2; ++i) {
-    IHT_32x64[tx_type].rows(input, outtmp);
-    for (j = 0; j < n; ++j)
-      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
-    input += n;
-  }
-
-  // inverse transform column vectors
-  for (i = 0; i < n; ++i) IHT_32x64[tx_type].cols(tmp[i], out[i]);
-
-#if CONFIG_EXT_TX
-  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
-#endif
-
-  // Sum with the destination
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j) {
-      int d = i * stride + j;
-      int s = j * outstride + i;
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
-    }
-  }
-}
-
-#endif  // CONFIG_TX64X64
-
 // idct
-void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
-                     const TxfmParam *txfm_param) {
-  const int eob = txfm_param->eob;
-  if (eob > 1)
-    av1_iht4x4_16_add(input, dest, stride, txfm_param);
-  else
-    aom_idct4x4_1_add(input, dest, stride);
-}
-
-void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
-                     const TxfmParam *txfm_param) {
-  const int eob = txfm_param->eob;
-  if (eob > 1)
-    aom_iwht4x4_16_add(input, dest, stride);
-  else
-    aom_iwht4x4_1_add(input, dest, stride);
-}
-
-#if !CONFIG_DAALA_DCT8
-static void idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
-                        const TxfmParam *txfm_param) {
-// If dc is 1, then input[0] is the reconstructed value, do not need
-// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
-
-// The calculation can be simplified if there are not many non-zero dct
-// coefficients. Use eobs to decide what to do.
-// TODO(yunqingwang): "eobs = 1" case is also handled in av1_short_idct8x8_c.
-// Combine that with code here.
-#if CONFIG_ADAPT_SCAN
-  const int16_t half = txfm_param->eob_threshold[0];
-#else
-  const int16_t half = 12;
-#endif
-
-  const int eob = txfm_param->eob;
-  if (eob == 1)
-    // DC only DCT coefficient
-    aom_idct8x8_1_add(input, dest, stride);
-  else if (eob <= half)
-    aom_idct8x8_12_add(input, dest, stride);
-  else
-    aom_idct8x8_64_add(input, dest, stride);
-}
-#endif
-
-#if !CONFIG_DAALA_DCT16
-static void idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
-                          const TxfmParam *txfm_param) {
-// The calculation can be simplified if there are not many non-zero dct
-// coefficients. Use eobs to separate different cases.
-#if CONFIG_ADAPT_SCAN
-  const int16_t half = txfm_param->eob_threshold[0];
-  const int16_t quarter = txfm_param->eob_threshold[1];
-#else
-  const int16_t half = 38;
-  const int16_t quarter = 10;
-#endif
-
-  const int eob = txfm_param->eob;
-  if (eob == 1) /* DC only DCT coefficient. */
-    aom_idct16x16_1_add(input, dest, stride);
-  else if (eob <= quarter)
-    aom_idct16x16_10_add(input, dest, stride);
-  else if (eob <= half)
-    aom_idct16x16_38_add(input, dest, stride);
-  else
-    aom_idct16x16_256_add(input, dest, stride);
-}
-#endif
-
-#if CONFIG_MRC_TX
-static void imrc32x32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                            const TxfmParam *txfm_param) {
-#if CONFIG_ADAPT_SCAN
-  const int16_t half = txfm_param->eob_threshold[0];
-  const int16_t quarter = txfm_param->eob_threshold[1];
-#else
-  const int16_t half = 135;
-  const int16_t quarter = 34;
-#endif
-
-  const int eob = txfm_param->eob;
-  int n_masked_vals = 0;
-  uint8_t *mask;
-  uint8_t mask_tmp[32 * 32];
-  if (eob == 1) {
-    aom_idct32x32_1_add_c(input, dest, stride);
-  } else {
-    if ((txfm_param->is_inter && SIGNAL_MRC_MASK_INTER) ||
-        (!txfm_param->is_inter && SIGNAL_MRC_MASK_INTRA)) {
-      mask = txfm_param->mask;
-    } else {
-      n_masked_vals =
-          get_mrc_pred_mask(txfm_param->dst, txfm_param->stride, mask_tmp, 32,
-                            32, 32, txfm_param->is_inter);
-      if (!is_valid_mrc_mask(n_masked_vals, 32, 32))
-        assert(0 && "Invalid MRC mask");
-      mask = mask_tmp;
-    }
-    if (eob <= quarter)
-      // non-zero coeff only in upper-left 8x8
-      aom_imrc32x32_34_add_c(input, dest, stride, mask);
-    else if (eob <= half)
-      // non-zero coeff only in upper-left 16x16
-      aom_imrc32x32_135_add_c(input, dest, stride, mask);
-    else
-      aom_imrc32x32_1024_add_c(input, dest, stride, mask);
-  }
-}
-#endif  // CONFIG_MRC_TX
-
-#if !CONFIG_DAALA_DCT32
-static void idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
-                          const TxfmParam *txfm_param) {
-#if CONFIG_ADAPT_SCAN
-  const int16_t half = txfm_param->eob_threshold[0];
-  const int16_t quarter = txfm_param->eob_threshold[1];
-#else
-  const int16_t half = 135;
-  const int16_t quarter = 34;
-#endif
-
-  const int eob = txfm_param->eob;
-  if (eob == 1)
-    aom_idct32x32_1_add(input, dest, stride);
-  else if (eob <= quarter)
-    // non-zero coeff only in upper-left 8x8
-    aom_idct32x32_34_add(input, dest, stride);
-  else if (eob <= half)
-    // non-zero coeff only in upper-left 16x16
-    aom_idct32x32_135_add(input, dest, stride);
-  else
-    aom_idct32x32_1024_add(input, dest, stride);
-}
-#endif
-
-#if CONFIG_TX64X64 && !CONFIG_DAALA_DCT64
-static void idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
-                          const TxfmParam *txfm_param) {
-  (void)txfm_param;
-  av1_iht64x64_4096_add(input, dest, stride, txfm_param);
-}
-#endif  // CONFIG_TX64X64 && !CONFIG_DAALA_DCT64
-
-#if CONFIG_CHROMA_2X2
-static void inv_txfm_add_2x2(const tran_low_t *input, uint8_t *dest, int stride,
-                             const TxfmParam *txfm_param) {
-  tran_high_t a1 = input[0] >> UNIT_QUANT_SHIFT;
-  tran_high_t b1 = input[1] >> UNIT_QUANT_SHIFT;
-  tran_high_t c1 = input[2] >> UNIT_QUANT_SHIFT;
-  tran_high_t d1 = input[3] >> UNIT_QUANT_SHIFT;
-
-  tran_high_t a2 = a1 + c1;
-  tran_high_t b2 = b1 + d1;
-  tran_high_t c2 = a1 - c1;
-  tran_high_t d2 = b1 - d1;
-
-  (void)txfm_param;
-
-  a1 = (a2 + b2) >> 2;
-  b1 = (a2 - b2) >> 2;
-  c1 = (c2 + d2) >> 2;
-  d1 = (c2 - d2) >> 2;
-
-  dest[0] = clip_pixel_add(dest[0], WRAPLOW(a1));
-  dest[1] = clip_pixel_add(dest[1], WRAPLOW(b1));
-  dest[stride] = clip_pixel_add(dest[stride], WRAPLOW(c1));
-  dest[stride + 1] = clip_pixel_add(dest[stride + 1], WRAPLOW(d1));
-}
-#endif
-
-static void inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride,
-                             const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  if (txfm_param->lossless) {
-    assert(tx_type == DCT_DCT);
-    av1_iwht4x4_add(input, dest, stride, txfm_param);
-    return;
-  }
-
-  switch (tx_type) {
-#if !CONFIG_DAALA_DCT4
-    case DCT_DCT: av1_idct4x4_add(input, dest, stride, txfm_param); break;
-#else
-    case DCT_DCT:
-#endif
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_LGT || CONFIG_DAALA_DCT4
-      // LGT only exists in C verson
-      av1_iht4x4_16_add_c(input, dest, stride, txfm_param);
-      break;
-#else
-      av1_iht4x4_16_add(input, dest, stride, txfm_param);
-      break;
-#endif
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-#if CONFIG_LGT || CONFIG_DAALA_DCT4
-      av1_iht4x4_16_add_c(input, dest, stride, txfm_param);
-      break;
-#else
-      av1_iht4x4_16_add(input, dest, stride, txfm_param);
-      break;
-#endif
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-      // Use C version since DST only exists in C code
-      av1_iht4x4_16_add_c(input, dest, stride, txfm_param);
-      break;
-    case IDTX: inv_idtx_add_c(input, dest, stride, 4, 4, tx_type); break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-}
-
-static void inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
-                             const TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_iht4x8_32_add_c(input, dest, stride, txfm_param);
-#else
-  av1_iht4x8_32_add(input, dest, stride, txfm_param);
-#endif
-}
-
-static void inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, int stride,
-                             const TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_iht8x4_32_add_c(input, dest, stride, txfm_param);
-#else
-  av1_iht8x4_32_add(input, dest, stride, txfm_param);
-#endif
-}
-
-// These will be used by the masked-tx experiment in the future.
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-static void inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
-                              int stride, const TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_iht4x16_64_add_c(input, dest, stride, txfm_param);
-#else
-  av1_iht4x16_64_add(input, dest, stride, txfm_param);
-#endif
-}
-
-static void inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
-                              int stride, const TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_iht16x4_64_add_c(input, dest, stride, txfm_param);
-#else
-  av1_iht16x4_64_add(input, dest, stride, txfm_param);
-#endif
-}
-
-static void inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
-                              int stride, const TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_iht8x32_256_add_c(input, dest, stride, txfm_param);
-#else
-  av1_iht8x32_256_add(input, dest, stride, txfm_param);
-#endif
-}
-
-static void inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
-                              int stride, const TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_iht32x8_256_add_c(input, dest, stride, txfm_param);
-#else
-  av1_iht32x8_256_add(input, dest, stride, txfm_param);
-#endif
-}
-#endif
-
-static void inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
-                              int stride, const TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_iht8x16_128_add_c(input, dest, stride, txfm_param);
-#else
-  av1_iht8x16_128_add(input, dest, stride, txfm_param);
-#endif
-}
-
-static void inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
-                              int stride, const TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_iht16x8_128_add_c(input, dest, stride, txfm_param);
-#else
-  av1_iht16x8_128_add(input, dest, stride, txfm_param);
-#endif
-}
-
-static void inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
-                               int stride, const TxfmParam *txfm_param) {
-  av1_iht16x32_512_add(input, dest, stride, txfm_param);
-}
-
-static void inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
-                               int stride, const TxfmParam *txfm_param) {
-  av1_iht32x16_512_add(input, dest, stride, txfm_param);
-}
-
-#if CONFIG_TX64X64
-static void inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
-                               int stride, const TxfmParam *txfm_param) {
-  av1_iht32x64_2048_add(input, dest, stride, txfm_param);
-}
-
-static void inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
-                               int stride, const TxfmParam *txfm_param) {
-  av1_iht64x32_2048_add(input, dest, stride, txfm_param);
-}
-#endif  // CONFIG_TX64X64
-
-static void inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride,
-                             const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  switch (tx_type) {
-#if !CONFIG_DAALA_DCT8
-    case DCT_DCT: idct8x8_add(input, dest, stride, txfm_param); break;
-#else
-    case DCT_DCT:
-#endif
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_LGT || CONFIG_DAALA_DCT8
-      av1_iht8x8_64_add_c(input, dest, stride, txfm_param);
-      break;
-#else
-      av1_iht8x8_64_add(input, dest, stride, txfm_param);
-      break;
-#endif
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-#if CONFIG_LGT || CONFIG_DAALA_DCT8
-      av1_iht8x8_64_add_c(input, dest, stride, txfm_param);
-      break;
-#else
-      av1_iht8x8_64_add(input, dest, stride, txfm_param);
-      break;
-#endif
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-      // Use C version since DST only exists in C code
-      av1_iht8x8_64_add_c(input, dest, stride, txfm_param);
-      break;
-    case IDTX: inv_idtx_add_c(input, dest, stride, 8, 8, tx_type); break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-}
-
-static void inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
-                               int stride, const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  switch (tx_type) {
-#if !CONFIG_DAALA_DCT16
-    case DCT_DCT: idct16x16_add(input, dest, stride, txfm_param); break;
-#else
-    case DCT_DCT:
-#endif
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_DAALA_DCT16
-      av1_iht16x16_256_add_c(input, dest, stride, txfm_param);
-#else
-      av1_iht16x16_256_add(input, dest, stride, txfm_param);
-#endif  // CONFIG_DAALA_DCT16
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-#if CONFIG_DAALA_DCT16
-      av1_iht16x16_256_add_c(input, dest, stride, txfm_param);
-#else
-      av1_iht16x16_256_add(input, dest, stride, txfm_param);
-#endif  // CONFIG_DAALA_DCT16
-      break;
-    case IDTX: inv_idtx_add_c(input, dest, stride, 16, 16, tx_type); break;
-#endif  // CONFIG_EXT_TX
-#if CONFIG_MRC_TX
-    case MRC_DCT: assert(0 && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-    default: assert(0); break;
-  }
-}
-
-static void inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
-                               int stride, const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  switch (tx_type) {
-#if !CONFIG_DAALA_DCT32
-    case DCT_DCT: idct32x32_add(input, dest, stride, txfm_param); break;
-#else
-    case DCT_DCT:
-      av1_iht32x32_1024_add_c(input, dest, stride, txfm_param);
-      break;
-#endif
-#if CONFIG_EXT_TX
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-      av1_iht32x32_1024_add_c(input, dest, stride, txfm_param);
-      break;
-    case IDTX: inv_idtx_add_c(input, dest, stride, 32, 32, tx_type); break;
-#endif  // CONFIG_EXT_TX
-#if CONFIG_MRC_TX
-    case MRC_DCT: imrc32x32_add_c(input, dest, stride, txfm_param); break;
-#endif  // CONFIG_MRC_TX
-    default: assert(0); break;
-  }
-}
-
-#if CONFIG_TX64X64
-static void inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
-                               int stride, const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  assert(tx_type == DCT_DCT);
-  switch (tx_type) {
-#if !CONFIG_DAALA_DCT64
-    case DCT_DCT: idct64x64_add(input, dest, stride, txfm_param); break;
-#else
-    case DCT_DCT:
-#endif
-#if CONFIG_EXT_TX
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-      av1_iht64x64_4096_add_c(input, dest, stride, txfm_param);
-      break;
-    case IDTX: inv_idtx_add_c(input, dest, stride, 64, 64, tx_type); break;
-#endif  // CONFIG_EXT_TX
-#if CONFIG_MRC_TX
-    case MRC_DCT: assert(0 && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-    default: assert(0); break;
-  }
-}
-#endif  // CONFIG_TX64X64
-
-void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
-                            int eob, int bd) {
+static void highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest,
+                               int stride, int eob, int bd) {
   if (eob > 1)
-    aom_highbd_iwht4x4_16_add(input, dest, stride, bd);
+    av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
   else
-    aom_highbd_iwht4x4_1_add(input, dest, stride, bd);
+    av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
 }
 
-#if CONFIG_CHROMA_2X2
-static void highbd_inv_txfm_add_2x2(const tran_low_t *input, uint8_t *dest,
-                                    int stride, const TxfmParam *txfm_param) {
-  int eob = txfm_param->eob;
-  int bd = txfm_param->bd;
-  int lossless = txfm_param->lossless;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  tran_high_t a1 = input[0] >> UNIT_QUANT_SHIFT;
-  tran_high_t b1 = input[1] >> UNIT_QUANT_SHIFT;
-  tran_high_t c1 = input[2] >> UNIT_QUANT_SHIFT;
-  tran_high_t d1 = input[3] >> UNIT_QUANT_SHIFT;
-
-  tran_high_t a2 = a1 + c1;
-  tran_high_t b2 = b1 + d1;
-  tran_high_t c2 = a1 - c1;
-  tran_high_t d2 = b1 - d1;
-
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
-
-  (void)tx_type;
-  (void)lossless;
-  (void)eob;
-
-  a1 = (a2 + b2) >> 2;
-  b1 = (a2 - b2) >> 2;
-  c1 = (c2 + d2) >> 2;
-  d1 = (c2 - d2) >> 2;
-
-  dst[0] = highbd_clip_pixel_add(dst[0], a1, bd);
-  dst[1] = highbd_clip_pixel_add(dst[1], b1, bd);
-  dst[stride] = highbd_clip_pixel_add(dst[stride], c1, bd);
-  dst[stride + 1] = highbd_clip_pixel_add(dst[stride + 1], d1, bd);
-}
-#endif
-
 static const int32_t *cast_to_int32(const tran_low_t *input) {
   assert(sizeof(int32_t) == sizeof(tran_low_t));
   return (const int32_t *)input;
@@ -2636,6 +46,7 @@ static const int32_t *cast_to_int32(const tran_low_t *input) {
 
 void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
                                  int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   int eob = txfm_param->eob;
   int bd = txfm_param->bd;
   int lossless = txfm_param->lossless;
@@ -2643,27 +54,12 @@ void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
   const TX_TYPE tx_type = txfm_param->tx_type;
   if (lossless) {
     assert(tx_type == DCT_DCT);
-    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+    highbd_iwht4x4_add(input, dest, stride, eob, bd);
     return;
   }
   switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                             bd);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-      av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                             bd);
-      break;
-    // use the c version for anything including identity for now
+    // Assembly version doesn't support some transform types, so use C version
+    // for those.
     case V_DCT:
     case H_DCT:
     case V_ADST:
@@ -2674,68 +70,112 @@ void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
       av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
       break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
+    default:
+      av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                             bd);
+      break;
   }
 }
 
-void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
-                                 int stride, const TxfmParam *txfm_param) {
+static void highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_4x8(src, CONVERT_TO_SHORTPTR(dest), stride,
+                         txfm_param->tx_type, txfm_param->bd);
 }
 
-void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
-                                 int stride, const TxfmParam *txfm_param) {
+static void highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                           txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_8x4(src, CONVERT_TO_SHORTPTR(dest), stride,
+                         txfm_param->tx_type, txfm_param->bd);
 }
 
 static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
                                      int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                            txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_8x16(src, CONVERT_TO_SHORTPTR(dest), stride,
+                          txfm_param->tx_type, txfm_param->bd);
 }
 
 static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
                                      int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                            txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_16x8(src, CONVERT_TO_SHORTPTR(dest), stride,
+                          txfm_param->tx_type, txfm_param->bd);
 }
 
 static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                             txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_16x32(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
 }
 
 static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                             txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_32x16(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
+}
+
+static void highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_16x4(src, CONVERT_TO_SHORTPTR(dest), stride,
+                          txfm_param->tx_type, txfm_param->bd);
+}
+
+static void highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_4x16(src, CONVERT_TO_SHORTPTR(dest), stride,
+                          txfm_param->tx_type, txfm_param->bd);
+}
+
+static void highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_32x8(src, CONVERT_TO_SHORTPTR(dest), stride,
+                          txfm_param->tx_type, txfm_param->bd);
+}
+
+static void highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_8x32(src, CONVERT_TO_SHORTPTR(dest), stride,
+                          txfm_param->tx_type, txfm_param->bd);
 }
 
-#if CONFIG_TX64X64
 static void highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                             txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_32x64(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
 }
 
 static void highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
   const int32_t *src = cast_to_int32(input);
-  av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                             txfm_param->tx_type, txfm_param->bd);
+  av1_inv_txfm2d_add_64x32(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
+}
+
+static void highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_16x64(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
+}
+
+static void highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_64x16(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
 }
-#endif  // CONFIG_TX64X64
 
 static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                                     int stride, const TxfmParam *txfm_param) {
@@ -2743,23 +183,8 @@ static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
   switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                             bd);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-      av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                             bd);
-      break;
-    // use the c version for anything including identity for now
+    // Assembly version doesn't support some transform types, so use C version
+    // for those.
     case V_DCT:
     case H_DCT:
     case V_ADST:
@@ -2770,8 +195,10 @@ static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
       av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
       break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0);
+    default:
+      av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                             bd);
+      break;
   }
 }
 
@@ -2781,23 +208,8 @@ static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
   switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-      av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
-                               bd);
-      break;
-    // use the c version for anything including identity for now
+    // Assembly version doesn't support some transform types, so use C version
+    // for those.
     case V_DCT:
     case H_DCT:
     case V_ADST:
@@ -2808,14 +220,16 @@ static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
       av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, bd);
       break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0);
+    default:
+      av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                               bd);
+      break;
   }
 }
 
 static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
+  const int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
   switch (tx_type) {
@@ -2823,26 +237,8 @@ static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
       av1_inv_txfm2d_add_32x32(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
       break;
-
-    // The optimised version only supports DCT_DCT, so force use of
-    // the C version for all other transform types.
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
+    // Assembly version doesn't support IDTX, so use C version for it.
     case IDTX:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-#endif  // CONFIG_EXT_TX
       av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, bd);
       break;
@@ -2851,225 +247,34 @@ static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
   }
 }
 
-#if CONFIG_TX64X64
 static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
                                       int stride, const TxfmParam *txfm_param) {
-  int bd = txfm_param->bd;
+  const int bd = txfm_param->bd;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int32_t *src = cast_to_int32(input);
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, DCT_DCT,
-                               bd);
-      break;
-#if CONFIG_EXT_TX
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-      // TODO(sarahparker)
-      // I've deleted the 64x64 implementations that existed in lieu
-      // of adst, flipadst and identity for simplicity but will bring back
-      // in a later change. This shouldn't impact performance since
-      // DCT_DCT is the only extended type currently allowed for 64x64,
-      // as dictated by get_ext_tx_set_type in blockd.h.
-      av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
-                                 DCT_DCT, bd);
-      break;
-    case IDTX:
-      highbd_inv_idtx_add_c(input, dest, stride, 64, 64, tx_type, bd);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-}
-#endif  // CONFIG_TX64X64
-
-void av1_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
-                      TxfmParam *txfm_param) {
-  const TX_SIZE tx_size = txfm_param->tx_size;
-#if CONFIG_LGT_FROM_PRED
-  if (txfm_param->use_lgt) {
-    assert(is_lgt_allowed(txfm_param->mode, tx_size));
-    ilgt2d_from_pred_add(input, dest, stride, txfm_param);
-    return;
-  }
-#endif  // CONFIG_LGT_FROM_PRED
-  switch (tx_size) {
-#if CONFIG_TX64X64
-    case TX_64X64: inv_txfm_add_64x64(input, dest, stride, txfm_param); break;
-#endif  // CONFIG_TX64X64
-    case TX_32X32: inv_txfm_add_32x32(input, dest, stride, txfm_param); break;
-    case TX_16X16: inv_txfm_add_16x16(input, dest, stride, txfm_param); break;
-    case TX_8X8: inv_txfm_add_8x8(input, dest, stride, txfm_param); break;
-    case TX_4X8: inv_txfm_add_4x8(input, dest, stride, txfm_param); break;
-    case TX_8X4: inv_txfm_add_8x4(input, dest, stride, txfm_param); break;
-    case TX_8X16: inv_txfm_add_8x16(input, dest, stride, txfm_param); break;
-    case TX_16X8: inv_txfm_add_16x8(input, dest, stride, txfm_param); break;
-    case TX_16X32: inv_txfm_add_16x32(input, dest, stride, txfm_param); break;
-    case TX_32X16: inv_txfm_add_32x16(input, dest, stride, txfm_param); break;
-#if CONFIG_TX64X64
-    case TX_64X32: inv_txfm_add_64x32(input, dest, stride, txfm_param); break;
-    case TX_32X64: inv_txfm_add_32x64(input, dest, stride, txfm_param); break;
-#endif  // CONFIG_TX64X64
-    case TX_4X4:
-      // this is like av1_short_idct4x4 but has a special case around eob<=1
-      // which is significant (not just an optimization) for the lossless
-      // case.
-      inv_txfm_add_4x4(input, dest, stride, txfm_param);
-      break;
-#if CONFIG_CHROMA_2X2
-    case TX_2X2: inv_txfm_add_2x2(input, dest, stride, txfm_param); break;
-#endif
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    case TX_32X8: inv_txfm_add_32x8(input, dest, stride, txfm_param); break;
-    case TX_8X32: inv_txfm_add_8x32(input, dest, stride, txfm_param); break;
-    case TX_16X4: inv_txfm_add_16x4(input, dest, stride, txfm_param); break;
-    case TX_4X16: inv_txfm_add_4x16(input, dest, stride, txfm_param); break;
-#endif
-    default: assert(0 && "Invalid transform size"); break;
-  }
+  assert(tx_type == DCT_DCT);
+  av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
 }
 
-static void init_txfm_param(const MACROBLOCKD *xd, TX_SIZE tx_size,
-                            TX_TYPE tx_type, int eob, TxfmParam *txfm_param) {
+static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+                            TX_TYPE tx_type, int eob, int reduced_tx_set,
+                            TxfmParam *txfm_param) {
+  (void)plane;
   txfm_param->tx_type = tx_type;
   txfm_param->tx_size = tx_size;
   txfm_param->eob = eob;
-  txfm_param->lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+  txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id];
   txfm_param->bd = xd->bd;
-#if CONFIG_LGT
-  txfm_param->is_inter = is_inter_block(&xd->mi[0]->mbmi);
-#endif
-#if CONFIG_LGT_FROM_PRED
-  txfm_param->use_lgt = xd->mi[0]->mbmi.use_lgt;
-#endif
-#if CONFIG_ADAPT_SCAN
-  txfm_param->eob_threshold =
-      (const int16_t *)&xd->eob_threshold_md[tx_size][tx_type][0];
-#endif
-}
-
-#if !CONFIG_TXMG
-typedef void (*InvTxfmFunc)(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
-                            TxfmParam *txfm_param);
-
-static InvTxfmFunc inv_txfm_func[2] = { av1_inv_txfm_add,
-                                        av1_highbd_inv_txfm_add };
-#endif
-
-void av1_inverse_transform_block(const MACROBLOCKD *xd,
-                                 const tran_low_t *dqcoeff,
-#if CONFIG_LGT_FROM_PRED
-                                 PREDICTION_MODE mode,
-#endif
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                 uint8_t *mrc_mask,
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                 TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
-                                 int stride, int eob) {
-  if (!eob) return;
-#if CONFIG_PVQ
-  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
-  const int txb_width = block_size_wide[tx_bsize];
-  const int txb_height = block_size_high[tx_bsize];
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (int r = 0; r < txb_height; r++)
-      for (int c = 0; c < txb_width; c++)
-        CONVERT_TO_SHORTPTR(dst)[r * stride + c] = 0;
-  } else {
-    for (int r = 0; r < txb_height; r++)
-      for (int c = 0; c < txb_width; c++) dst[r * stride + c] = 0;
-  }
-#endif  // CONFIG_PVQ
-  TxfmParam txfm_param;
-  init_txfm_param(xd, tx_size, tx_type, eob, &txfm_param);
-#if CONFIG_LGT || CONFIG_MRC_TX
-  txfm_param.is_inter = is_inter_block(&xd->mi[0]->mbmi);
-#endif  // CONFIG_LGT || CONFIG_MRC_TX
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-  txfm_param.mask = mrc_mask;
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-#if CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX
-  txfm_param.dst = dst;
-  txfm_param.stride = stride;
-#if CONFIG_LGT_FROM_PRED
-  txfm_param.mode = mode;
-#endif  // CONFIG_LGT_FROM_PRED
-#endif  // CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX
-
-  const int is_hbd = get_bitdepth_data_path_index(xd);
-#if CONFIG_TXMG
-  if (is_hbd) {
-    av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
-  } else {
-    DECLARE_ALIGNED(16, uint16_t, tmp[MAX_TX_SQUARE]);
-    int tmp_stride = MAX_TX_SIZE;
-    int w = tx_size_wide[tx_size];
-    int h = tx_size_high[tx_size];
-    for (int r = 0; r < h; ++r) {
-      for (int c = 0; c < w; ++c) {
-        tmp[r * tmp_stride + c] = dst[r * stride + c];
-      }
-    }
-
-    av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
-                            &txfm_param);
-
-    for (int r = 0; r < h; ++r) {
-      for (int c = 0; c < w; ++c) {
-        dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c];
-      }
-    }
-  }
-#else  // CONFIG_TXMG
-  inv_txfm_func[is_hbd](dqcoeff, dst, stride, &txfm_param);
-#endif  // CONFIG_TXMG
-}
-
-void av1_inverse_transform_block_facade(MACROBLOCKD *xd, int plane, int block,
-                                        int blk_row, int blk_col, int eob) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-  uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-  const int dst_stride = pd->dst.stride;
-  uint8_t *dst =
-      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-  av1_inverse_transform_block(xd, dqcoeff,
-#if CONFIG_LGT_FROM_PRED
-                              xd->mi[0]->mbmi.mode,
-#endif  // CONFIG_LGT_FROM_PRED
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                              mrc_mask,
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                              tx_type, tx_size, dst, dst_stride, eob);
+  txfm_param->is_hbd = get_bitdepth_data_path_index(xd);
+  txfm_param->tx_set_type = av1_get_ext_tx_set_type(
+      txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
 }
 
-void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
-                             TxfmParam *txfm_param) {
+static void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest,
+                                int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
-#if CONFIG_TX64X64
-    case TX_64X64:
-      highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param);
-      break;
-#endif  // CONFIG_TX64X64
     case TX_32X32:
       highbd_inv_txfm_add_32x32(input, dest, stride, txfm_param);
       break;
@@ -3080,10 +285,10 @@ void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
       highbd_inv_txfm_add_8x8(input, dest, stride, txfm_param);
       break;
     case TX_4X8:
-      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+      highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
       break;
     case TX_8X4:
-      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+      highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
       break;
     case TX_8X16:
       highbd_inv_txfm_add_8x16(input, dest, stride, txfm_param);
@@ -3097,25 +302,81 @@ void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
     case TX_32X16:
       highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
       break;
-#if CONFIG_TX64X64
-    case TX_64X32:
-      highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+    case TX_64X64:
+      highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param);
       break;
     case TX_32X64:
       highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
       break;
-#endif  // CONFIG_TX64X64
+    case TX_64X32:
+      highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+      break;
+    case TX_16X64:
+      highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
+      break;
+    case TX_64X16:
+      highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
+      break;
     case TX_4X4:
       // this is like av1_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
       // case.
       av1_highbd_inv_txfm_add_4x4(input, dest, stride, txfm_param);
       break;
-#if CONFIG_CHROMA_2X2
-    case TX_2X2:
-      highbd_inv_txfm_add_2x2(input, dest, stride, txfm_param);
+    case TX_16X4:
+      highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+      break;
+    case TX_4X16:
+      highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+      break;
+    case TX_8X32:
+      highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X8:
+      highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
       break;
-#endif
     default: assert(0 && "Invalid transform size"); break;
   }
 }
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                        const TxfmParam *txfm_param) {
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);
+  int tmp_stride = MAX_TX_SIZE;
+  int w = tx_size_wide[tx_size];
+  int h = tx_size_high[tx_size];
+  for (int r = 0; r < h; ++r) {
+    for (int c = 0; c < w; ++c) {
+      tmp[r * tmp_stride + c] = dst[r * stride + c];
+    }
+  }
+
+  highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, txfm_param);
+
+  for (int r = 0; r < h; ++r) {
+    for (int c = 0; c < w; ++c) {
+      dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c];
+    }
+  }
+}
+
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+                                 const tran_low_t *dqcoeff, int plane,
+                                 TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
+                                 int stride, int eob, int reduced_tx_set) {
+  if (!eob) return;
+
+  assert(eob <= av1_get_max_eob(tx_size));
+
+  TxfmParam txfm_param;
+  init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set,
+                  &txfm_param);
+  assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
+
+  if (txfm_param.is_hbd) {
+    highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+  } else {
+    av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+  }
+}
diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h
index e4e4ad671..50032a167 100644
--- a/third_party/aom/av1/common/idct.h
+++ b/third_party/aom/av1/common/idct.h
@@ -12,15 +12,12 @@
 #ifndef AV1_COMMON_IDCT_H_
 #define AV1_COMMON_IDCT_H_
 
-#include <assert.h>
+#include "config/aom_config.h"
 
-#include "./aom_config.h"
 #include "av1/common/blockd.h"
 #include "av1/common/common.h"
 #include "av1/common/enums.h"
-#include "aom_dsp/inv_txfm.h"
 #include "aom_dsp/txfm_common.h"
-#include "aom_ports/mem.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,64 +29,16 @@ typedef struct {
   transform_1d cols, rows;  // vertical and horizontal
 } transform_2d;
 
-#if CONFIG_LGT
-int get_lgt4(const TxfmParam *txfm_param, int is_col,
-             const tran_high_t **lgtmtx);
-int get_lgt8(const TxfmParam *txfm_param, int is_col,
-             const tran_high_t **lgtmtx);
-#endif  // CONFIG_LGT
-
-#if CONFIG_LGT_FROM_PRED
-void get_lgt4_from_pred(const TxfmParam *txfm_param, int is_col,
-                        const tran_high_t **lgtmtx, int ntx);
-void get_lgt8_from_pred(const TxfmParam *txfm_param, int is_col,
-                        const tran_high_t **lgtmtx, int ntx);
-void get_lgt16up_from_pred(const TxfmParam *txfm_param, int is_col,
-                           const tran_high_t **lgtmtx, int ntx);
-#endif  // CONFIG_LGT_FROM_PRED
-
-#if CONFIG_HIGHBITDEPTH
-typedef void (*highbd_transform_1d)(const tran_low_t *, tran_low_t *, int bd);
-
-typedef struct {
-  highbd_transform_1d cols, rows;  // vertical and horizontal
-} highbd_transform_2d;
-#endif  // CONFIG_HIGHBITDEPTH
-
 #define MAX_TX_SCALE 1
 int av1_get_tx_scale(const TX_SIZE tx_size);
 
-void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
-                     const TxfmParam *txfm_param);
-void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
-                     const TxfmParam *txfm_param);
-
-void av1_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
-                      TxfmParam *txfm_param);
 void av1_inverse_transform_block(const MACROBLOCKD *xd,
-                                 const tran_low_t *dqcoeff,
-#if CONFIG_LGT_FROM_PRED
-                                 PREDICTION_MODE mode,
-#endif
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                 uint8_t *mrc_mask,
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                                 const tran_low_t *dqcoeff, int plane,
                                  TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
-                                 int stride, int eob);
-void av1_inverse_transform_block_facade(MACROBLOCKD *xd, int plane, int block,
-                                        int blk_row, int blk_col, int eob);
+                                 int stride, int eob, int reduced_tx_set);
 
-void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
-                            int eob, int bd);
 void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
                                  int stride, const TxfmParam *param);
-void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
-                                 int stride, const TxfmParam *param);
-void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
-                                 int stride, const TxfmParam *param);
-void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
-                             TxfmParam *txfm_param);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/laplace_tables.c b/third_party/aom/av1/common/laplace_tables.c
deleted file mode 100644
index ab8784895..000000000
--- a/third_party/aom/av1/common/laplace_tables.c
+++ /dev/null
@@ -1,657 +0,0 @@
-/* This file is auto-generated using "gen_laplace_tables 128 7" */
-
-/* clang-format off */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include "aom_dsp/prob.h"
-#include "pvq.h"
-
-const uint16_t EXP_CDF_TABLE[128][16] = {
-  {AOM_ICDF(32753), AOM_ICDF(32754), AOM_ICDF(32755), AOM_ICDF(32756),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(32499), AOM_ICDF(32753), AOM_ICDF(32755), AOM_ICDF(32756),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(32243), AOM_ICDF(32747), AOM_ICDF(32755), AOM_ICDF(32756),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(31987), AOM_ICDF(32737), AOM_ICDF(32755), AOM_ICDF(32756),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(31732), AOM_ICDF(32724), AOM_ICDF(32755), AOM_ICDF(32756),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(31476), AOM_ICDF(32706), AOM_ICDF(32754), AOM_ICDF(32756),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(31220), AOM_ICDF(32684), AOM_ICDF(32753), AOM_ICDF(32756),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(30964), AOM_ICDF(32658), AOM_ICDF(32751), AOM_ICDF(32756),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(30708), AOM_ICDF(32628), AOM_ICDF(32748), AOM_ICDF(32756),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(30452), AOM_ICDF(32594), AOM_ICDF(32745), AOM_ICDF(32756),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(30198), AOM_ICDF(32558), AOM_ICDF(32742), AOM_ICDF(32756),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(29941), AOM_ICDF(32515), AOM_ICDF(32736), AOM_ICDF(32755),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(29686), AOM_ICDF(32470), AOM_ICDF(32731), AOM_ICDF(32755),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(29429), AOM_ICDF(32419), AOM_ICDF(32723), AOM_ICDF(32754),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(29174), AOM_ICDF(32366), AOM_ICDF(32715), AOM_ICDF(32753),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(28918), AOM_ICDF(32308), AOM_ICDF(32705), AOM_ICDF(32752),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(28662), AOM_ICDF(32246), AOM_ICDF(32694), AOM_ICDF(32750),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(28406), AOM_ICDF(32180), AOM_ICDF(32681), AOM_ICDF(32748),
-    AOM_ICDF(32757), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(28150), AOM_ICDF(32110), AOM_ICDF(32667), AOM_ICDF(32745),
-    AOM_ICDF(32756), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(27894), AOM_ICDF(32036), AOM_ICDF(32651), AOM_ICDF(32742),
-    AOM_ICDF(32756), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(27639), AOM_ICDF(31959), AOM_ICDF(32634), AOM_ICDF(32739),
-    AOM_ICDF(32755), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(27383), AOM_ICDF(31877), AOM_ICDF(32614), AOM_ICDF(32735),
-    AOM_ICDF(32755), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(27126), AOM_ICDF(31790), AOM_ICDF(32592), AOM_ICDF(32730),
-    AOM_ICDF(32754), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(26871), AOM_ICDF(31701), AOM_ICDF(32569), AOM_ICDF(32725),
-    AOM_ICDF(32753), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(26615), AOM_ICDF(31607), AOM_ICDF(32543), AOM_ICDF(32719),
-    AOM_ICDF(32752), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(26361), AOM_ICDF(31511), AOM_ICDF(32517), AOM_ICDF(32713),
-    AOM_ICDF(32751), AOM_ICDF(32758), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(26104), AOM_ICDF(31408), AOM_ICDF(32485), AOM_ICDF(32704),
-    AOM_ICDF(32748), AOM_ICDF(32757), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(25848), AOM_ICDF(31302), AOM_ICDF(32452), AOM_ICDF(32695),
-    AOM_ICDF(32746), AOM_ICDF(32757), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(25591), AOM_ICDF(31191), AOM_ICDF(32416), AOM_ICDF(32684),
-    AOM_ICDF(32743), AOM_ICDF(32756), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(25336), AOM_ICDF(31078), AOM_ICDF(32379), AOM_ICDF(32674),
-    AOM_ICDF(32741), AOM_ICDF(32756), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(25080), AOM_ICDF(30960), AOM_ICDF(32338), AOM_ICDF(32661),
-    AOM_ICDF(32737), AOM_ICDF(32755), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(24824), AOM_ICDF(30838), AOM_ICDF(32295), AOM_ICDF(32648),
-    AOM_ICDF(32733), AOM_ICDF(32754), AOM_ICDF(32759), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(24568), AOM_ICDF(30712), AOM_ICDF(32248), AOM_ICDF(32632),
-    AOM_ICDF(32728), AOM_ICDF(32752), AOM_ICDF(32758), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(24313), AOM_ICDF(30583), AOM_ICDF(32199), AOM_ICDF(32616),
-    AOM_ICDF(32723), AOM_ICDF(32751), AOM_ICDF(32758), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(24057), AOM_ICDF(30449), AOM_ICDF(32147), AOM_ICDF(32598),
-    AOM_ICDF(32718), AOM_ICDF(32750), AOM_ICDF(32758), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(23801), AOM_ICDF(30311), AOM_ICDF(32091), AOM_ICDF(32578),
-    AOM_ICDF(32711), AOM_ICDF(32747), AOM_ICDF(32757), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(23546), AOM_ICDF(30170), AOM_ICDF(32033), AOM_ICDF(32557),
-    AOM_ICDF(32704), AOM_ICDF(32745), AOM_ICDF(32757), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(23288), AOM_ICDF(30022), AOM_ICDF(31969), AOM_ICDF(32532),
-    AOM_ICDF(32695), AOM_ICDF(32742), AOM_ICDF(32756), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(23033), AOM_ICDF(29873), AOM_ICDF(31904), AOM_ICDF(32507),
-    AOM_ICDF(32686), AOM_ICDF(32739), AOM_ICDF(32755), AOM_ICDF(32760),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(22778), AOM_ICDF(29720), AOM_ICDF(31835), AOM_ICDF(32479),
-    AOM_ICDF(32675), AOM_ICDF(32735), AOM_ICDF(32753), AOM_ICDF(32759),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(22521), AOM_ICDF(29561), AOM_ICDF(31761), AOM_ICDF(32449),
-    AOM_ICDF(32664), AOM_ICDF(32731), AOM_ICDF(32752), AOM_ICDF(32759),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(22267), AOM_ICDF(29401), AOM_ICDF(31686), AOM_ICDF(32418),
-    AOM_ICDF(32652), AOM_ICDF(32727), AOM_ICDF(32751), AOM_ICDF(32759),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(22011), AOM_ICDF(29235), AOM_ICDF(31605), AOM_ICDF(32383),
-    AOM_ICDF(32638), AOM_ICDF(32722), AOM_ICDF(32749), AOM_ICDF(32758),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(21754), AOM_ICDF(29064), AOM_ICDF(31520), AOM_ICDF(32345),
-    AOM_ICDF(32622), AOM_ICDF(32715), AOM_ICDF(32746), AOM_ICDF(32757),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(21501), AOM_ICDF(28893), AOM_ICDF(31434), AOM_ICDF(32307),
-    AOM_ICDF(32607), AOM_ICDF(32710), AOM_ICDF(32745), AOM_ICDF(32757),
-    AOM_ICDF(32761), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(21243), AOM_ICDF(28713), AOM_ICDF(31339), AOM_ICDF(32262),
-    AOM_ICDF(32587), AOM_ICDF(32701), AOM_ICDF(32741), AOM_ICDF(32755),
-    AOM_ICDF(32760), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(20988), AOM_ICDF(28532), AOM_ICDF(31243), AOM_ICDF(32217),
-    AOM_ICDF(32567), AOM_ICDF(32693), AOM_ICDF(32738), AOM_ICDF(32754),
-    AOM_ICDF(32760), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(20730), AOM_ICDF(28344), AOM_ICDF(31140), AOM_ICDF(32167),
-    AOM_ICDF(32544), AOM_ICDF(32682), AOM_ICDF(32733), AOM_ICDF(32752),
-    AOM_ICDF(32759), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(20476), AOM_ICDF(28156), AOM_ICDF(31036), AOM_ICDF(32116),
-    AOM_ICDF(32521), AOM_ICDF(32673), AOM_ICDF(32730), AOM_ICDF(32751),
-    AOM_ICDF(32759), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(20220), AOM_ICDF(27962), AOM_ICDF(30926), AOM_ICDF(32061),
-    AOM_ICDF(32495), AOM_ICDF(32661), AOM_ICDF(32725), AOM_ICDF(32749),
-    AOM_ICDF(32758), AOM_ICDF(32762), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(19963), AOM_ICDF(27763), AOM_ICDF(30810), AOM_ICDF(32000),
-    AOM_ICDF(32465), AOM_ICDF(32647), AOM_ICDF(32718), AOM_ICDF(32746),
-    AOM_ICDF(32757), AOM_ICDF(32761), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(19708), AOM_ICDF(27562), AOM_ICDF(30691), AOM_ICDF(31938),
-    AOM_ICDF(32435), AOM_ICDF(32633), AOM_ICDF(32712), AOM_ICDF(32743),
-    AOM_ICDF(32756), AOM_ICDF(32761), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(19454), AOM_ICDF(27358), AOM_ICDF(30569), AOM_ICDF(31873),
-    AOM_ICDF(32403), AOM_ICDF(32618), AOM_ICDF(32705), AOM_ICDF(32741),
-    AOM_ICDF(32755), AOM_ICDF(32761), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(19196), AOM_ICDF(27146), AOM_ICDF(30438), AOM_ICDF(31801),
-    AOM_ICDF(32365), AOM_ICDF(32599), AOM_ICDF(32696), AOM_ICDF(32736),
-    AOM_ICDF(32753), AOM_ICDF(32760), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(18942), AOM_ICDF(26934), AOM_ICDF(30306), AOM_ICDF(31728),
-    AOM_ICDF(32328), AOM_ICDF(32581), AOM_ICDF(32688), AOM_ICDF(32733),
-    AOM_ICDF(32752), AOM_ICDF(32760), AOM_ICDF(32763), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(18684), AOM_ICDF(26714), AOM_ICDF(30164), AOM_ICDF(31647),
-    AOM_ICDF(32284), AOM_ICDF(32558), AOM_ICDF(32676), AOM_ICDF(32727),
-    AOM_ICDF(32749), AOM_ICDF(32758), AOM_ICDF(32762), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(18429), AOM_ICDF(26493), AOM_ICDF(30021), AOM_ICDF(31565),
-    AOM_ICDF(32240), AOM_ICDF(32535), AOM_ICDF(32664), AOM_ICDF(32721),
-    AOM_ICDF(32746), AOM_ICDF(32757), AOM_ICDF(32762), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(18174), AOM_ICDF(26268), AOM_ICDF(29872), AOM_ICDF(31477),
-    AOM_ICDF(32192), AOM_ICDF(32510), AOM_ICDF(32652), AOM_ICDF(32715),
-    AOM_ICDF(32743), AOM_ICDF(32756), AOM_ICDF(32762), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(17920), AOM_ICDF(26040), AOM_ICDF(29719), AOM_ICDF(31386),
-    AOM_ICDF(32141), AOM_ICDF(32483), AOM_ICDF(32638), AOM_ICDF(32708),
-    AOM_ICDF(32740), AOM_ICDF(32754), AOM_ICDF(32761), AOM_ICDF(32764),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(17661), AOM_ICDF(25803), AOM_ICDF(29556), AOM_ICDF(31286),
-    AOM_ICDF(32083), AOM_ICDF(32451), AOM_ICDF(32620), AOM_ICDF(32698),
-    AOM_ICDF(32734), AOM_ICDF(32751), AOM_ICDF(32759), AOM_ICDF(32763),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(17406), AOM_ICDF(25566), AOM_ICDF(29391), AOM_ICDF(31184),
-    AOM_ICDF(32024), AOM_ICDF(32418), AOM_ICDF(32603), AOM_ICDF(32690),
-    AOM_ICDF(32731), AOM_ICDF(32750), AOM_ICDF(32759), AOM_ICDF(32763),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(17151), AOM_ICDF(25325), AOM_ICDF(29220), AOM_ICDF(31076),
-    AOM_ICDF(31961), AOM_ICDF(32383), AOM_ICDF(32584), AOM_ICDF(32680),
-    AOM_ICDF(32726), AOM_ICDF(32748), AOM_ICDF(32758), AOM_ICDF(32763),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(16896), AOM_ICDF(25080), AOM_ICDF(29044), AOM_ICDF(30964),
-    AOM_ICDF(31894), AOM_ICDF(32344), AOM_ICDF(32562), AOM_ICDF(32668),
-    AOM_ICDF(32719), AOM_ICDF(32744), AOM_ICDF(32756), AOM_ICDF(32762),
-    AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(16639), AOM_ICDF(24829), AOM_ICDF(28860), AOM_ICDF(30844),
-    AOM_ICDF(31821), AOM_ICDF(32302), AOM_ICDF(32539), AOM_ICDF(32655),
-    AOM_ICDF(32712), AOM_ICDF(32740), AOM_ICDF(32754), AOM_ICDF(32761),
-    AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(28672), AOM_ICDF(30720),
-    AOM_ICDF(31744), AOM_ICDF(32256), AOM_ICDF(32512), AOM_ICDF(32640),
-    AOM_ICDF(32704), AOM_ICDF(32736), AOM_ICDF(32752), AOM_ICDF(32760),
-    AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(16130), AOM_ICDF(24320), AOM_ICDF(28479), AOM_ICDF(30591),
-    AOM_ICDF(31663), AOM_ICDF(32208), AOM_ICDF(32485), AOM_ICDF(32625),
-    AOM_ICDF(32696), AOM_ICDF(32732), AOM_ICDF(32750), AOM_ICDF(32759),
-    AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32767), AOM_ICDF(32768)},
-  {AOM_ICDF(15872), AOM_ICDF(24056), AOM_ICDF(28276), AOM_ICDF(30452),
-    AOM_ICDF(31574), AOM_ICDF(32152), AOM_ICDF(32450), AOM_ICDF(32604),
-    AOM_ICDF(32683), AOM_ICDF(32724), AOM_ICDF(32745), AOM_ICDF(32756),
-    AOM_ICDF(32762), AOM_ICDF(32765), AOM_ICDF(32766), AOM_ICDF(32768)},
-  {AOM_ICDF(15615), AOM_ICDF(23789), AOM_ICDF(28068), AOM_ICDF(30308),
-    AOM_ICDF(31480), AOM_ICDF(32094), AOM_ICDF(32415), AOM_ICDF(32583),
-    AOM_ICDF(32671), AOM_ICDF(32717), AOM_ICDF(32741), AOM_ICDF(32754),
-    AOM_ICDF(32761), AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32768)},
-  {AOM_ICDF(15361), AOM_ICDF(23521), AOM_ICDF(27856), AOM_ICDF(30159),
-    AOM_ICDF(31382), AOM_ICDF(32032), AOM_ICDF(32377), AOM_ICDF(32560),
-    AOM_ICDF(32657), AOM_ICDF(32709), AOM_ICDF(32737), AOM_ICDF(32752),
-    AOM_ICDF(32760), AOM_ICDF(32764), AOM_ICDF(32766), AOM_ICDF(32768)},
-  {AOM_ICDF(15103), AOM_ICDF(23245), AOM_ICDF(27634), AOM_ICDF(30000),
-    AOM_ICDF(31275), AOM_ICDF(31963), AOM_ICDF(32334), AOM_ICDF(32534),
-    AOM_ICDF(32642), AOM_ICDF(32700), AOM_ICDF(32731), AOM_ICDF(32748),
-    AOM_ICDF(32757), AOM_ICDF(32762), AOM_ICDF(32765), AOM_ICDF(32768)},
-  {AOM_ICDF(14848), AOM_ICDF(22968), AOM_ICDF(27409), AOM_ICDF(29837),
-    AOM_ICDF(31165), AOM_ICDF(31891), AOM_ICDF(32288), AOM_ICDF(32505),
-    AOM_ICDF(32624), AOM_ICDF(32689), AOM_ICDF(32725), AOM_ICDF(32744),
-    AOM_ICDF(32755), AOM_ICDF(32761), AOM_ICDF(32764), AOM_ICDF(32768)},
-  {AOM_ICDF(14592), AOM_ICDF(22686), AOM_ICDF(27176), AOM_ICDF(29666),
-    AOM_ICDF(31047), AOM_ICDF(31813), AOM_ICDF(32238), AOM_ICDF(32474),
-    AOM_ICDF(32605), AOM_ICDF(32678), AOM_ICDF(32718), AOM_ICDF(32740),
-    AOM_ICDF(32752), AOM_ICDF(32759), AOM_ICDF(32763), AOM_ICDF(32768)},
-  {AOM_ICDF(14336), AOM_ICDF(22400), AOM_ICDF(26936), AOM_ICDF(29488),
-    AOM_ICDF(30923), AOM_ICDF(31730), AOM_ICDF(32184), AOM_ICDF(32439),
-    AOM_ICDF(32583), AOM_ICDF(32664), AOM_ICDF(32709), AOM_ICDF(32735),
-    AOM_ICDF(32749), AOM_ICDF(32757), AOM_ICDF(32762), AOM_ICDF(32768)},
-  {AOM_ICDF(14079), AOM_ICDF(22109), AOM_ICDF(26689), AOM_ICDF(29301),
-    AOM_ICDF(30791), AOM_ICDF(31641), AOM_ICDF(32125), AOM_ICDF(32401),
-    AOM_ICDF(32559), AOM_ICDF(32649), AOM_ICDF(32700), AOM_ICDF(32729),
-    AOM_ICDF(32746), AOM_ICDF(32756), AOM_ICDF(32761), AOM_ICDF(32768)},
-  {AOM_ICDF(13825), AOM_ICDF(21817), AOM_ICDF(26437), AOM_ICDF(29108),
-    AOM_ICDF(30652), AOM_ICDF(31545), AOM_ICDF(32061), AOM_ICDF(32359),
-    AOM_ICDF(32532), AOM_ICDF(32632), AOM_ICDF(32690), AOM_ICDF(32723),
-    AOM_ICDF(32742), AOM_ICDF(32753), AOM_ICDF(32759), AOM_ICDF(32768)},
-  {AOM_ICDF(13568), AOM_ICDF(21518), AOM_ICDF(26176), AOM_ICDF(28905),
-    AOM_ICDF(30504), AOM_ICDF(31441), AOM_ICDF(31990), AOM_ICDF(32312),
-    AOM_ICDF(32501), AOM_ICDF(32611), AOM_ICDF(32676), AOM_ICDF(32714),
-    AOM_ICDF(32736), AOM_ICDF(32749), AOM_ICDF(32757), AOM_ICDF(32768)},
-  {AOM_ICDF(13314), AOM_ICDF(21218), AOM_ICDF(25911), AOM_ICDF(28697),
-    AOM_ICDF(30351), AOM_ICDF(31333), AOM_ICDF(31916), AOM_ICDF(32262),
-    AOM_ICDF(32468), AOM_ICDF(32590), AOM_ICDF(32662), AOM_ICDF(32705),
-    AOM_ICDF(32731), AOM_ICDF(32746), AOM_ICDF(32755), AOM_ICDF(32768)},
-  {AOM_ICDF(13054), AOM_ICDF(20908), AOM_ICDF(25633), AOM_ICDF(28475),
-    AOM_ICDF(30185), AOM_ICDF(31214), AOM_ICDF(31833), AOM_ICDF(32205),
-    AOM_ICDF(32429), AOM_ICDF(32564), AOM_ICDF(32645), AOM_ICDF(32694),
-    AOM_ICDF(32723), AOM_ICDF(32741), AOM_ICDF(32752), AOM_ICDF(32768)},
-  {AOM_ICDF(12803), AOM_ICDF(20603), AOM_ICDF(25356), AOM_ICDF(28252),
-    AOM_ICDF(30017), AOM_ICDF(31093), AOM_ICDF(31748), AOM_ICDF(32147),
-    AOM_ICDF(32390), AOM_ICDF(32538), AOM_ICDF(32628), AOM_ICDF(32683),
-    AOM_ICDF(32717), AOM_ICDF(32737), AOM_ICDF(32749), AOM_ICDF(32768)},
-  {AOM_ICDF(12544), AOM_ICDF(20286), AOM_ICDF(25064), AOM_ICDF(28013),
-    AOM_ICDF(29833), AOM_ICDF(30956), AOM_ICDF(31649), AOM_ICDF(32077),
-    AOM_ICDF(32341), AOM_ICDF(32504), AOM_ICDF(32605), AOM_ICDF(32667),
-    AOM_ICDF(32705), AOM_ICDF(32729), AOM_ICDF(32744), AOM_ICDF(32768)},
-  {AOM_ICDF(12288), AOM_ICDF(19968), AOM_ICDF(24768), AOM_ICDF(27768),
-    AOM_ICDF(29643), AOM_ICDF(30815), AOM_ICDF(31547), AOM_ICDF(32005),
-    AOM_ICDF(32291), AOM_ICDF(32470), AOM_ICDF(32582), AOM_ICDF(32652),
-    AOM_ICDF(32696), AOM_ICDF(32723), AOM_ICDF(32740), AOM_ICDF(32768)},
-  {AOM_ICDF(12033), AOM_ICDF(19647), AOM_ICDF(24465), AOM_ICDF(27514),
-    AOM_ICDF(29443), AOM_ICDF(30664), AOM_ICDF(31437), AOM_ICDF(31926),
-    AOM_ICDF(32235), AOM_ICDF(32431), AOM_ICDF(32555), AOM_ICDF(32633),
-    AOM_ICDF(32683), AOM_ICDF(32714), AOM_ICDF(32734), AOM_ICDF(32768)},
-  {AOM_ICDF(11777), AOM_ICDF(19321), AOM_ICDF(24154), AOM_ICDF(27250),
-    AOM_ICDF(29233), AOM_ICDF(30504), AOM_ICDF(31318), AOM_ICDF(31839),
-    AOM_ICDF(32173), AOM_ICDF(32387), AOM_ICDF(32524), AOM_ICDF(32612),
-    AOM_ICDF(32668), AOM_ICDF(32704), AOM_ICDF(32727), AOM_ICDF(32768)},
-  {AOM_ICDF(11521), AOM_ICDF(18991), AOM_ICDF(23835), AOM_ICDF(26976),
-    AOM_ICDF(29013), AOM_ICDF(30334), AOM_ICDF(31190), AOM_ICDF(31745),
-    AOM_ICDF(32105), AOM_ICDF(32338), AOM_ICDF(32489), AOM_ICDF(32587),
-    AOM_ICDF(32651), AOM_ICDF(32692), AOM_ICDF(32719), AOM_ICDF(32768)},
-  {AOM_ICDF(11265), AOM_ICDF(18657), AOM_ICDF(23508), AOM_ICDF(26691),
-    AOM_ICDF(28780), AOM_ICDF(30151), AOM_ICDF(31051), AOM_ICDF(31641),
-    AOM_ICDF(32028), AOM_ICDF(32282), AOM_ICDF(32449), AOM_ICDF(32559),
-    AOM_ICDF(32631), AOM_ICDF(32678), AOM_ICDF(32709), AOM_ICDF(32768)},
-  {AOM_ICDF(11006), AOM_ICDF(18316), AOM_ICDF(23170), AOM_ICDF(26394),
-    AOM_ICDF(28535), AOM_ICDF(29957), AOM_ICDF(30901), AOM_ICDF(31528),
-    AOM_ICDF(31944), AOM_ICDF(32220), AOM_ICDF(32404), AOM_ICDF(32526),
-    AOM_ICDF(32607), AOM_ICDF(32661), AOM_ICDF(32697), AOM_ICDF(32768)},
-  {AOM_ICDF(10752), AOM_ICDF(17976), AOM_ICDF(22830), AOM_ICDF(26091),
-    AOM_ICDF(28282), AOM_ICDF(29754), AOM_ICDF(30743), AOM_ICDF(31408),
-    AOM_ICDF(31854), AOM_ICDF(32154), AOM_ICDF(32356), AOM_ICDF(32491),
-    AOM_ICDF(32582), AOM_ICDF(32643), AOM_ICDF(32684), AOM_ICDF(32768)},
-  {AOM_ICDF(10496), AOM_ICDF(17630), AOM_ICDF(22479), AOM_ICDF(25775),
-    AOM_ICDF(28015), AOM_ICDF(29538), AOM_ICDF(30573), AOM_ICDF(31276),
-    AOM_ICDF(31754), AOM_ICDF(32079), AOM_ICDF(32300), AOM_ICDF(32450),
-    AOM_ICDF(32552), AOM_ICDF(32621), AOM_ICDF(32668), AOM_ICDF(32768)},
-  {AOM_ICDF(10240), AOM_ICDF(17280), AOM_ICDF(22120), AOM_ICDF(25448),
-    AOM_ICDF(27736), AOM_ICDF(29309), AOM_ICDF(30390), AOM_ICDF(31133),
-    AOM_ICDF(31644), AOM_ICDF(31995), AOM_ICDF(32237), AOM_ICDF(32403),
-    AOM_ICDF(32517), AOM_ICDF(32595), AOM_ICDF(32649), AOM_ICDF(32768)},
-  { AOM_ICDF(9984), AOM_ICDF(16926), AOM_ICDF(21753), AOM_ICDF(25109),
-    AOM_ICDF(27443), AOM_ICDF(29066), AOM_ICDF(30194), AOM_ICDF(30978),
-    AOM_ICDF(31523), AOM_ICDF(31902), AOM_ICDF(32166), AOM_ICDF(32349),
-    AOM_ICDF(32476), AOM_ICDF(32565), AOM_ICDF(32627), AOM_ICDF(32768)},
-  { AOM_ICDF(9728), AOM_ICDF(16568), AOM_ICDF(21377), AOM_ICDF(24759),
-    AOM_ICDF(27137), AOM_ICDF(28809), AOM_ICDF(29984), AOM_ICDF(30811),
-    AOM_ICDF(31392), AOM_ICDF(31801), AOM_ICDF(32088), AOM_ICDF(32290),
-    AOM_ICDF(32432), AOM_ICDF(32532), AOM_ICDF(32602), AOM_ICDF(32768)},
-  { AOM_ICDF(9474), AOM_ICDF(16208), AOM_ICDF(20995), AOM_ICDF(24399),
-    AOM_ICDF(26819), AOM_ICDF(28539), AOM_ICDF(29762), AOM_ICDF(30631),
-    AOM_ICDF(31249), AOM_ICDF(31688), AOM_ICDF(32000), AOM_ICDF(32222),
-    AOM_ICDF(32380), AOM_ICDF(32492), AOM_ICDF(32572), AOM_ICDF(32768)},
-  { AOM_ICDF(9216), AOM_ICDF(15840), AOM_ICDF(20601), AOM_ICDF(24023),
-    AOM_ICDF(26483), AOM_ICDF(28251), AOM_ICDF(29522), AOM_ICDF(30435),
-    AOM_ICDF(31091), AOM_ICDF(31563), AOM_ICDF(31902), AOM_ICDF(32146),
-    AOM_ICDF(32321), AOM_ICDF(32447), AOM_ICDF(32537), AOM_ICDF(32768)},
-  { AOM_ICDF(8959), AOM_ICDF(15469), AOM_ICDF(20199), AOM_ICDF(23636),
-    AOM_ICDF(26133), AOM_ICDF(27947), AOM_ICDF(29265), AOM_ICDF(30223),
-    AOM_ICDF(30919), AOM_ICDF(31425), AOM_ICDF(31792), AOM_ICDF(32059),
-    AOM_ICDF(32253), AOM_ICDF(32394), AOM_ICDF(32496), AOM_ICDF(32768)},
-  { AOM_ICDF(8705), AOM_ICDF(15097), AOM_ICDF(19791), AOM_ICDF(23238),
-    AOM_ICDF(25770), AOM_ICDF(27629), AOM_ICDF(28994), AOM_ICDF(29997),
-    AOM_ICDF(30733), AOM_ICDF(31274), AOM_ICDF(31671), AOM_ICDF(31963),
-    AOM_ICDF(32177), AOM_ICDF(32334), AOM_ICDF(32449), AOM_ICDF(32768)},
-  { AOM_ICDF(8449), AOM_ICDF(14719), AOM_ICDF(19373), AOM_ICDF(22827),
-    AOM_ICDF(25390), AOM_ICDF(27292), AOM_ICDF(28704), AOM_ICDF(29752),
-    AOM_ICDF(30530), AOM_ICDF(31107), AOM_ICDF(31535), AOM_ICDF(31853),
-    AOM_ICDF(32089), AOM_ICDF(32264), AOM_ICDF(32394), AOM_ICDF(32768)},
-  { AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(18944), AOM_ICDF(22400),
-    AOM_ICDF(24992), AOM_ICDF(26936), AOM_ICDF(28394), AOM_ICDF(29488),
-    AOM_ICDF(30308), AOM_ICDF(30923), AOM_ICDF(31384), AOM_ICDF(31730),
-    AOM_ICDF(31989), AOM_ICDF(32184), AOM_ICDF(32330), AOM_ICDF(32768)},
-  { AOM_ICDF(7936), AOM_ICDF(13950), AOM_ICDF(18507), AOM_ICDF(21961),
-    AOM_ICDF(24578), AOM_ICDF(26561), AOM_ICDF(28064), AOM_ICDF(29203),
-    AOM_ICDF(30066), AOM_ICDF(30720), AOM_ICDF(31216), AOM_ICDF(31592),
-    AOM_ICDF(31877), AOM_ICDF(32093), AOM_ICDF(32256), AOM_ICDF(32768)},
-  { AOM_ICDF(7678), AOM_ICDF(13558), AOM_ICDF(18060), AOM_ICDF(21507),
-    AOM_ICDF(24146), AOM_ICDF(26166), AOM_ICDF(27713), AOM_ICDF(28897),
-    AOM_ICDF(29804), AOM_ICDF(30498), AOM_ICDF(31030), AOM_ICDF(31437),
-    AOM_ICDF(31749), AOM_ICDF(31988), AOM_ICDF(32171), AOM_ICDF(32768)},
-  { AOM_ICDF(7423), AOM_ICDF(13165), AOM_ICDF(17606), AOM_ICDF(21041),
-    AOM_ICDF(23698), AOM_ICDF(25753), AOM_ICDF(27342), AOM_ICDF(28571),
-    AOM_ICDF(29522), AOM_ICDF(30257), AOM_ICDF(30826), AOM_ICDF(31266),
-    AOM_ICDF(31606), AOM_ICDF(31869), AOM_ICDF(32073), AOM_ICDF(32768)},
-  { AOM_ICDF(7168), AOM_ICDF(12768), AOM_ICDF(17143), AOM_ICDF(20561),
-    AOM_ICDF(23231), AOM_ICDF(25317), AOM_ICDF(26947), AOM_ICDF(28220),
-    AOM_ICDF(29215), AOM_ICDF(29992), AOM_ICDF(30599), AOM_ICDF(31073),
-    AOM_ICDF(31444), AOM_ICDF(31734), AOM_ICDF(31960), AOM_ICDF(32768)},
-  { AOM_ICDF(6911), AOM_ICDF(12365), AOM_ICDF(16669), AOM_ICDF(20065),
-    AOM_ICDF(22744), AOM_ICDF(24858), AOM_ICDF(26526), AOM_ICDF(27842),
-    AOM_ICDF(28881), AOM_ICDF(29701), AOM_ICDF(30348), AOM_ICDF(30858),
-    AOM_ICDF(31261), AOM_ICDF(31579), AOM_ICDF(31830), AOM_ICDF(32768)},
-  { AOM_ICDF(6657), AOM_ICDF(11961), AOM_ICDF(16188), AOM_ICDF(19556),
-    AOM_ICDF(22240), AOM_ICDF(24379), AOM_ICDF(26083), AOM_ICDF(27441),
-    AOM_ICDF(28523), AOM_ICDF(29385), AOM_ICDF(30072), AOM_ICDF(30620),
-    AOM_ICDF(31056), AOM_ICDF(31404), AOM_ICDF(31681), AOM_ICDF(32768)},
-  { AOM_ICDF(6400), AOM_ICDF(11550), AOM_ICDF(15694), AOM_ICDF(19029),
-    AOM_ICDF(21712), AOM_ICDF(23871), AOM_ICDF(25609), AOM_ICDF(27007),
-    AOM_ICDF(28132), AOM_ICDF(29037), AOM_ICDF(29766), AOM_ICDF(30352),
-    AOM_ICDF(30824), AOM_ICDF(31204), AOM_ICDF(31509), AOM_ICDF(32768)},
-  { AOM_ICDF(6142), AOM_ICDF(11134), AOM_ICDF(15190), AOM_ICDF(18486),
-    AOM_ICDF(21164), AOM_ICDF(23340), AOM_ICDF(25108), AOM_ICDF(26544),
-    AOM_ICDF(27711), AOM_ICDF(28659), AOM_ICDF(29429), AOM_ICDF(30055),
-    AOM_ICDF(30564), AOM_ICDF(30977), AOM_ICDF(31313), AOM_ICDF(32768)},
-  { AOM_ICDF(5890), AOM_ICDF(10720), AOM_ICDF(14682), AOM_ICDF(17932),
-    AOM_ICDF(20598), AOM_ICDF(22785), AOM_ICDF(24579), AOM_ICDF(26051),
-    AOM_ICDF(27258), AOM_ICDF(28248), AOM_ICDF(29060), AOM_ICDF(29726),
-    AOM_ICDF(30273), AOM_ICDF(30721), AOM_ICDF(31089), AOM_ICDF(32768)},
-  { AOM_ICDF(5631), AOM_ICDF(10295), AOM_ICDF(14157), AOM_ICDF(17356),
-    AOM_ICDF(20005), AOM_ICDF(22199), AOM_ICDF(24016), AOM_ICDF(25520),
-    AOM_ICDF(26766), AOM_ICDF(27798), AOM_ICDF(28652), AOM_ICDF(29359),
-    AOM_ICDF(29945), AOM_ICDF(30430), AOM_ICDF(30832), AOM_ICDF(32768)},
-  { AOM_ICDF(5377), AOM_ICDF(9871), AOM_ICDF(13628), AOM_ICDF(16768),
-    AOM_ICDF(19393), AOM_ICDF(21587), AOM_ICDF(23421), AOM_ICDF(24954),
-    AOM_ICDF(26236), AOM_ICDF(27308), AOM_ICDF(28204), AOM_ICDF(28953),
-    AOM_ICDF(29579), AOM_ICDF(30102), AOM_ICDF(30539), AOM_ICDF(32768)},
-  { AOM_ICDF(5121), AOM_ICDF(9441), AOM_ICDF(13086), AOM_ICDF(16161),
-    AOM_ICDF(18756), AOM_ICDF(20945), AOM_ICDF(22792), AOM_ICDF(24351),
-    AOM_ICDF(25666), AOM_ICDF(26776), AOM_ICDF(27712), AOM_ICDF(28502),
-    AOM_ICDF(29169), AOM_ICDF(29731), AOM_ICDF(30206), AOM_ICDF(32768)},
-  { AOM_ICDF(4865), AOM_ICDF(9007), AOM_ICDF(12534), AOM_ICDF(15538),
-    AOM_ICDF(18096), AOM_ICDF(20274), AOM_ICDF(22129), AOM_ICDF(23708),
-    AOM_ICDF(25053), AOM_ICDF(26198), AOM_ICDF(27173), AOM_ICDF(28004),
-    AOM_ICDF(28711), AOM_ICDF(29313), AOM_ICDF(29826), AOM_ICDF(32768)},
-  { AOM_ICDF(4608), AOM_ICDF(8568), AOM_ICDF(11971), AOM_ICDF(14896),
-    AOM_ICDF(17409), AOM_ICDF(19569), AOM_ICDF(21425), AOM_ICDF(23020),
-    AOM_ICDF(24391), AOM_ICDF(25569), AOM_ICDF(26581), AOM_ICDF(27451),
-    AOM_ICDF(28199), AOM_ICDF(28842), AOM_ICDF(29394), AOM_ICDF(32768)},
-  { AOM_ICDF(4351), AOM_ICDF(8125), AOM_ICDF(11398), AOM_ICDF(14236),
-    AOM_ICDF(16697), AOM_ICDF(18831), AOM_ICDF(20682), AOM_ICDF(22287),
-    AOM_ICDF(23679), AOM_ICDF(24886), AOM_ICDF(25933), AOM_ICDF(26841),
-    AOM_ICDF(27628), AOM_ICDF(28311), AOM_ICDF(28903), AOM_ICDF(32768)},
-  { AOM_ICDF(4096), AOM_ICDF(7680), AOM_ICDF(10816), AOM_ICDF(13560),
-    AOM_ICDF(15961), AOM_ICDF(18062), AOM_ICDF(19900), AOM_ICDF(21508),
-    AOM_ICDF(22915), AOM_ICDF(24146), AOM_ICDF(25224), AOM_ICDF(26167),
-    AOM_ICDF(26992), AOM_ICDF(27714), AOM_ICDF(28346), AOM_ICDF(32768)},
-  { AOM_ICDF(3840), AOM_ICDF(7230), AOM_ICDF(10223), AOM_ICDF(12865),
-    AOM_ICDF(15197), AOM_ICDF(17256), AOM_ICDF(19074), AOM_ICDF(20679),
-    AOM_ICDF(22096), AOM_ICDF(23347), AOM_ICDF(24451), AOM_ICDF(25426),
-    AOM_ICDF(26287), AOM_ICDF(27047), AOM_ICDF(27718), AOM_ICDF(32768)},
-  { AOM_ICDF(3584), AOM_ICDF(6776), AOM_ICDF(9619), AOM_ICDF(12151),
-    AOM_ICDF(14406), AOM_ICDF(16414), AOM_ICDF(18203), AOM_ICDF(19796),
-    AOM_ICDF(21215), AOM_ICDF(22479), AOM_ICDF(23604), AOM_ICDF(24606),
-    AOM_ICDF(25499), AOM_ICDF(26294), AOM_ICDF(27002), AOM_ICDF(32768)},
-  { AOM_ICDF(3328), AOM_ICDF(6318), AOM_ICDF(9004), AOM_ICDF(11417),
-    AOM_ICDF(13585), AOM_ICDF(15533), AOM_ICDF(17283), AOM_ICDF(18856),
-    AOM_ICDF(20269), AOM_ICDF(21538), AOM_ICDF(22678), AOM_ICDF(23703),
-    AOM_ICDF(24624), AOM_ICDF(25451), AOM_ICDF(26194), AOM_ICDF(32768)},
-  { AOM_ICDF(3072), AOM_ICDF(5856), AOM_ICDF(8379), AOM_ICDF(10665),
-    AOM_ICDF(12737), AOM_ICDF(14615), AOM_ICDF(16317), AOM_ICDF(17859),
-    AOM_ICDF(19257), AOM_ICDF(20524), AOM_ICDF(21672), AOM_ICDF(22712),
-    AOM_ICDF(23655), AOM_ICDF(24509), AOM_ICDF(25283), AOM_ICDF(32768)},
-  { AOM_ICDF(2816), AOM_ICDF(5390), AOM_ICDF(7743), AOM_ICDF(9894),
-    AOM_ICDF(11860), AOM_ICDF(13657), AOM_ICDF(15299), AOM_ICDF(16800),
-    AOM_ICDF(18172), AOM_ICDF(19426), AOM_ICDF(20573), AOM_ICDF(21621),
-    AOM_ICDF(22579), AOM_ICDF(23455), AOM_ICDF(24255), AOM_ICDF(32768)},
-  { AOM_ICDF(2560), AOM_ICDF(4920), AOM_ICDF(7096), AOM_ICDF(9102),
-    AOM_ICDF(10951), AOM_ICDF(12656), AOM_ICDF(14227), AOM_ICDF(15676),
-    AOM_ICDF(17011), AOM_ICDF(18242), AOM_ICDF(19377), AOM_ICDF(20423),
-    AOM_ICDF(21388), AOM_ICDF(22277), AOM_ICDF(23097), AOM_ICDF(32768)},
-  { AOM_ICDF(2304), AOM_ICDF(4446), AOM_ICDF(6437), AOM_ICDF(8288),
-    AOM_ICDF(10009), AOM_ICDF(11609), AOM_ICDF(13097), AOM_ICDF(14480),
-    AOM_ICDF(15766), AOM_ICDF(16961), AOM_ICDF(18072), AOM_ICDF(19105),
-    AOM_ICDF(20066), AOM_ICDF(20959), AOM_ICDF(21789), AOM_ICDF(32768)},
-  { AOM_ICDF(2048), AOM_ICDF(3968), AOM_ICDF(5768), AOM_ICDF(7456),
-    AOM_ICDF(9038), AOM_ICDF(10521), AOM_ICDF(11911), AOM_ICDF(13215),
-    AOM_ICDF(14437), AOM_ICDF(15583), AOM_ICDF(16657), AOM_ICDF(17664),
-    AOM_ICDF(18608), AOM_ICDF(19493), AOM_ICDF(20323), AOM_ICDF(32768)},
-  { AOM_ICDF(1792), AOM_ICDF(3486), AOM_ICDF(5087), AOM_ICDF(6601),
-    AOM_ICDF(8032), AOM_ICDF(9385), AOM_ICDF(10664), AOM_ICDF(11873),
-    AOM_ICDF(13016), AOM_ICDF(14096), AOM_ICDF(15117), AOM_ICDF(16082),
-    AOM_ICDF(16995), AOM_ICDF(17858), AOM_ICDF(18673), AOM_ICDF(32768)},
-  { AOM_ICDF(1536), AOM_ICDF(3000), AOM_ICDF(4395), AOM_ICDF(5725),
-    AOM_ICDF(6993), AOM_ICDF(8201), AOM_ICDF(9353), AOM_ICDF(10451),
-    AOM_ICDF(11497), AOM_ICDF(12494), AOM_ICDF(13444), AOM_ICDF(14350),
-    AOM_ICDF(15213), AOM_ICDF(16036), AOM_ICDF(16820), AOM_ICDF(32768)},
-  { AOM_ICDF(1280), AOM_ICDF(2510), AOM_ICDF(3692), AOM_ICDF(4828),
-    AOM_ICDF(5919), AOM_ICDF(6968), AOM_ICDF(7976), AOM_ICDF(8944),
-    AOM_ICDF(9875), AOM_ICDF(10769), AOM_ICDF(11628), AOM_ICDF(12454),
-    AOM_ICDF(13248), AOM_ICDF(14011), AOM_ICDF(14744), AOM_ICDF(32768)},
-  { AOM_ICDF(1024), AOM_ICDF(2016), AOM_ICDF(2977), AOM_ICDF(3908),
-    AOM_ICDF(4810), AOM_ICDF(5684), AOM_ICDF(6530), AOM_ICDF(7350),
-    AOM_ICDF(8144), AOM_ICDF(8913), AOM_ICDF(9658), AOM_ICDF(10380),
-    AOM_ICDF(11080), AOM_ICDF(11758), AOM_ICDF(12415), AOM_ICDF(32768)},
-  {  AOM_ICDF(768), AOM_ICDF(1518), AOM_ICDF(2250), AOM_ICDF(2965),
-    AOM_ICDF(3663), AOM_ICDF(4345), AOM_ICDF(5011), AOM_ICDF(5662),
-    AOM_ICDF(6297), AOM_ICDF(6917), AOM_ICDF(7523), AOM_ICDF(8115),
-    AOM_ICDF(8693), AOM_ICDF(9257), AOM_ICDF(9808), AOM_ICDF(32768)},
-  {  AOM_ICDF(512), AOM_ICDF(1016), AOM_ICDF(1512), AOM_ICDF(2000),
-    AOM_ICDF(2481), AOM_ICDF(2954), AOM_ICDF(3420), AOM_ICDF(3879),
-    AOM_ICDF(4330), AOM_ICDF(4774), AOM_ICDF(5211), AOM_ICDF(5642),
-    AOM_ICDF(6066), AOM_ICDF(6483), AOM_ICDF(6894), AOM_ICDF(32768)},
-  {  AOM_ICDF(256),  AOM_ICDF(510),  AOM_ICDF(762), AOM_ICDF(1012),
-    AOM_ICDF(1260), AOM_ICDF(1506), AOM_ICDF(1750), AOM_ICDF(1992),
-    AOM_ICDF(2232), AOM_ICDF(2471), AOM_ICDF(2708), AOM_ICDF(2943),
-    AOM_ICDF(3176), AOM_ICDF(3407), AOM_ICDF(3636), AOM_ICDF(32768)},
-};
-
-
-const uint16_t LAPLACE_OFFSET[128] = {
-  0,
-  29871,
-  28672,
-  27751,
-  26975,
-  26291,
-  25673,
-  25105,
-  24576,
-  24079,
-  23609,
-  23162,
-  22734,
-  22325,
-  21931,
-  21550,
-  21182,
-  20826,
-  20480,
-  20143,
-  19815,
-  19495,
-  19183,
-  18877,
-  18579,
-  18286,
-  17999,
-  17718,
-  17442,
-  17170,
-  16904,
-  16642,
-  16384,
-  16129,
-  15879,
-  15633,
-  15390,
-  15150,
-  14913,
-  14680,
-  14450,
-  14222,
-  13997,
-  13775,
-  13556,
-  13338,
-  13124,
-  12911,
-  12701,
-  12493,
-  12288,
-  12084,
-  11882,
-  11682,
-  11484,
-  11288,
-  11094,
-  10901,
-  10710,
-  10521,
-  10333,
-  10147,
-  9962,
-  9779,
-  9597,
-  9417,
-  9238,
-  9060,
-  8884,
-  8709,
-  8535,
-  8363,
-  8192,
-  8021,
-  7853,
-  7685,
-  7518,
-  7352,
-  7188,
-  7025,
-  6862,
-  6701,
-  6540,
-  6381,
-  6222,
-  6065,
-  5908,
-  5753,
-  5598,
-  5444,
-  5291,
-  5138,
-  4987,
-  4837,
-  4687,
-  4538,
-  4390,
-  4242,
-  4096,
-  3950,
-  3804,
-  3660,
-  3516,
-  3373,
-  3231,
-  3089,
-  2948,
-  2808,
-  2668,
-  2529,
-  2391,
-  2253,
-  2116,
-  1979,
-  1843,
-  1708,
-  1573,
-  1439,
-  1306,
-  1172,
-  1040,
-  908,
-  777,
-  646,
-  516,
-  386,
-  257,
-  128,
-};
diff --git a/third_party/aom/av1/common/mips/msa/av1_idct16x16_msa.c b/third_party/aom/av1/common/mips/msa/av1_idct16x16_msa.c
deleted file mode 100644
index ff461b914..000000000
--- a/third_party/aom/av1/common/mips/msa/av1_idct16x16_msa.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/common/enums.h"
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void av1_iht16x16_256_add_msa(const int16_t *input, uint8_t *dst,
-                              int32_t dst_stride, TxfmParam *txfm_param) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
-  int16_t *out_ptr = &out[0];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  switch (tx_type) {
-    case DCT_DCT:
-      /* transform rows */
-      for (i = 0; i < 2; ++i) {
-        /* process 16 * 8 block */
-        aom_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
-      }
-
-      /* transform columns */
-      for (i = 0; i < 2; ++i) {
-        /* process 8 * 16 block */
-        aom_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
-                                         dst_stride);
-      }
-      break;
-    case ADST_DCT:
-      /* transform rows */
-      for (i = 0; i < 2; ++i) {
-        /* process 16 * 8 block */
-        aom_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
-      }
-
-      /* transform columns */
-      for (i = 0; i < 2; ++i) {
-        aom_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
-                                          (dst + (i << 3)), dst_stride);
-      }
-      break;
-    case DCT_ADST:
-      /* transform rows */
-      for (i = 0; i < 2; ++i) {
-        /* process 16 * 8 block */
-        aom_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
-      }
-
-      /* transform columns */
-      for (i = 0; i < 2; ++i) {
-        /* process 8 * 16 block */
-        aom_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
-                                         dst_stride);
-      }
-      break;
-    case ADST_ADST:
-      /* transform rows */
-      for (i = 0; i < 2; ++i) {
-        /* process 16 * 8 block */
-        aom_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
-      }
-
-      /* transform columns */
-      for (i = 0; i < 2; ++i) {
-        aom_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
-                                          (dst + (i << 3)), dst_stride);
-      }
-      break;
-    default: assert(0); break;
-  }
-}
diff --git a/third_party/aom/av1/common/mips/msa/av1_idct4x4_msa.c b/third_party/aom/av1/common/mips/msa/av1_idct4x4_msa.c
deleted file mode 100644
index 37f7fd77b..000000000
--- a/third_party/aom/av1/common/mips/msa/av1_idct4x4_msa.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/common/enums.h"
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void av1_iht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
-                           int32_t dst_stride, TxfmParam *txfm_param) {
-  v8i16 in0, in1, in2, in3;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  /* load vector elements of 4x4 block */
-  LD4x4_SH(input, in0, in1, in2, in3);
-  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-
-  switch (tx_type) {
-    case DCT_DCT:
-      /* DCT in horizontal */
-      AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-      /* DCT in vertical */
-      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-      AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-      break;
-    case ADST_DCT:
-      /* DCT in horizontal */
-      AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-      /* ADST in vertical */
-      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-      AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-      break;
-    case DCT_ADST:
-      /* ADST in horizontal */
-      AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-      /* DCT in vertical */
-      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-      AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-      break;
-    case ADST_ADST:
-      /* ADST in horizontal */
-      AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-      /* ADST in vertical */
-      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-      AOM_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-      break;
-    default: assert(0); break;
-  }
-
-  /* final rounding (add 2^3, divide by 2^4) and shift */
-  SRARI_H4_SH(in0, in1, in2, in3, 4);
-  /* add block and store 4x4 */
-  ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
-}
diff --git a/third_party/aom/av1/common/mips/msa/av1_idct8x8_msa.c b/third_party/aom/av1/common/mips/msa/av1_idct8x8_msa.c
deleted file mode 100644
index 7410f7b98..000000000
--- a/third_party/aom/av1/common/mips/msa/av1_idct8x8_msa.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/common/enums.h"
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void av1_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst,
-                           int32_t dst_stride, TxfmParam *txfm_param) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  /* load vector elements of 8x8 block */
-  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-
-  switch (tx_type) {
-    case DCT_DCT:
-      /* DCT in horizontal */
-      AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-      /* DCT in vertical */
-      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
-                         in3, in4, in5, in6, in7);
-      AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-      break;
-    case ADST_DCT:
-      /* DCT in horizontal */
-      AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-      /* ADST in vertical */
-      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
-                         in3, in4, in5, in6, in7);
-      AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      break;
-    case DCT_ADST:
-      /* ADST in horizontal */
-      AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      /* DCT in vertical */
-      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
-                         in3, in4, in5, in6, in7);
-      AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-      break;
-    case ADST_ADST:
-      /* ADST in horizontal */
-      AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      /* ADST in vertical */
-      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
-                         in3, in4, in5, in6, in7);
-      AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      break;
-    default: assert(0); break;
-  }
-
-  /* final rounding (add 2^4, divide by 2^5) and shift */
-  SRARI_H4_SH(in0, in1, in2, in3, 5);
-  SRARI_H4_SH(in4, in5, in6, in7, 5);
-
-  /* add block and store 8x8 */
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
-  dst += (4 * dst_stride);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
-}
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
index 65f0f7eda..a6227f18f 100644
--- a/third_party/aom/av1/common/mv.h
+++ b/third_party/aom/av1/common/mv.h
@@ -27,6 +27,8 @@ typedef struct mv {
   int16_t col;
 } MV;
 
+static const MV kZeroMv = { 0, 0 };
+
 typedef union int_mv {
   uint32_t as_int;
   MV as_mv;
@@ -37,11 +39,6 @@ typedef struct mv32 {
   int32_t col;
 } MV32;
 
-#if CONFIG_WARPED_MOTION
-#define WARPED_MOTION_SORT_SAMPLES 1
-#endif  // CONFIG_WARPED_MOTION
-
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
 // Bits of precision used for the model
 #define WARPEDMODEL_PREC_BITS 16
 #define WARPEDMODEL_ROW3HOMO_PREC_BITS 16
@@ -54,19 +51,8 @@ typedef struct mv32 {
 #define WARPEDPIXEL_PREC_BITS 6
 #define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
 
-// Taps for ntap filter
-#define WARPEDPIXEL_FILTER_TAPS 6
-
-// Precision of filter taps
-#define WARPEDPIXEL_FILTER_BITS 7
-
 #define WARP_PARAM_REDUCE_BITS 6
 
-// Precision bits reduction after horizontal shear
-#define HORSHEAR_REDUCE_PREC_BITS 5
-#define VERSHEAR_REDUCE_PREC_BITS \
-  (2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)
-
 #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
 
 /* clang-format off */
@@ -75,10 +61,7 @@ typedef enum {
   TRANSLATION = 1,   // translational motion 2-parameter
   ROTZOOM = 2,       // simplified affine with rotation + zoom only, 4-parameter
   AFFINE = 3,        // affine, 6-parameter
-  HORTRAPEZOID = 4,  // constrained homography, hor trapezoid, 6-parameter
-  VERTRAPEZOID = 5,  // constrained homography, ver trapezoid, 6-parameter
-  HOMOGRAPHY = 6,    // homography, 8-parameter
-  TRANS_TYPES = 7,
+  TRANS_TYPES,
 } TransformationType;
 /* clang-format on */
 
@@ -90,24 +73,13 @@ typedef enum {
 // GLOBAL_TRANS_TYPES 7 - up to full homography
 #define GLOBAL_TRANS_TYPES 4
 
-#if GLOBAL_TRANS_TYPES > 4
-// First bit indicates whether using identity or not
-// GLOBAL_TYPE_BITS=ceiling(log2(GLOBAL_TRANS_TYPES-1)) is the
-// number of bits needed to cover the remaining possibilities
-#define GLOBAL_TYPE_BITS (get_msb(2 * GLOBAL_TRANS_TYPES - 3))
-#endif  // GLOBAL_TRANS_TYPES > 4
-
 typedef struct {
-#if CONFIG_GLOBAL_MOTION
   int global_warp_allowed;
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
   int local_warp_allowed;
-#endif  // CONFIG_WARPED_MOTION
 } WarpTypesAllowed;
 
 // number of parameters used by each transformation in TransformationTypes
-static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6, 6, 6, 8 };
+static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
 
 // The order of values in the wmmat matrix below is best described
 // by the homography:
@@ -118,6 +90,7 @@ typedef struct {
   TransformationType wmtype;
   int32_t wmmat[8];
   int16_t alpha, beta, gamma, delta;
+  int8_t invalid;
 } WarpedMotionParams;
 
 /* clang-format off */
@@ -125,12 +98,11 @@ static const WarpedMotionParams default_warp_params = {
   IDENTITY,
   { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0,
     0 },
-  0, 0, 0, 0
+  0, 0, 0, 0,
+  0,
 };
 /* clang-format on */
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
 
-#if CONFIG_GLOBAL_MOTION
 // The following constants describe the various precisions
 // of different parameters in the global motion experiment.
 //
@@ -187,9 +159,6 @@ static const WarpedMotionParams default_warp_params = {
 #define GM_ALPHA_MIN -GM_ALPHA_MAX
 #define GM_ROW3HOMO_MIN -GM_ROW3HOMO_MAX
 
-// Use global motion parameters for sub8x8 blocks
-#define GLOBAL_SUB8X8_USED 0
-
 static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) {
   const int bw = block_size_wide[bs];
   return mi_col * MI_SIZE + bw / 2 - 1;
@@ -206,7 +175,6 @@ static INLINE int convert_to_trans_prec(int allow_hp, int coor) {
   else
     return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2;
 }
-#if CONFIG_AMVR
 static INLINE void integer_mv_precision(MV *mv) {
   int mod = (mv->row % 8);
   if (mod != 0) {
@@ -232,7 +200,6 @@ static INLINE void integer_mv_precision(MV *mv) {
     }
   }
 }
-#endif
 // Convert a global motion vector into a motion vector at the centre of the
 // given block.
 //
@@ -242,14 +209,15 @@ static INLINE void integer_mv_precision(MV *mv) {
 // represents an integer)
 static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
                                           int allow_hp, BLOCK_SIZE bsize,
-                                          int mi_col, int mi_row, int block_idx
-#if CONFIG_AMVR
-                                          ,
-                                          int is_integer
-#endif
-                                          ) {
-  const int unify_bsize = CONFIG_CB4X4;
+                                          int mi_col, int mi_row,
+                                          int is_integer) {
   int_mv res;
+
+  if (gm->wmtype == IDENTITY) {
+    res.as_int = 0;
+    return res;
+  }
+
   const int32_t *mat = gm->wmmat;
   int x, y, tx, ty;
 
@@ -265,65 +233,37 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
     res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF;
     res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF;
     assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp));
-#if CONFIG_AMVR
     if (is_integer) {
       integer_mv_precision(&res.as_mv);
     }
-#endif
     return res;
   }
 
-  if (bsize >= BLOCK_8X8 || unify_bsize) {
-    x = block_center_x(mi_col, bsize);
-    y = block_center_y(mi_row, bsize);
-  } else {
-    x = block_center_x(mi_col, bsize);
-    y = block_center_y(mi_row, bsize);
-    x += (block_idx & 1) * MI_SIZE / 2;
-    y += (block_idx & 2) * MI_SIZE / 4;
-  }
+  x = block_center_x(mi_col, bsize);
+  y = block_center_y(mi_row, bsize);
 
   if (gm->wmtype == ROTZOOM) {
     assert(gm->wmmat[5] == gm->wmmat[2]);
     assert(gm->wmmat[4] == -gm->wmmat[3]);
   }
-  if (gm->wmtype > AFFINE) {
-    int xc = (int)((int64_t)mat[2] * x + (int64_t)mat[3] * y + mat[0]);
-    int yc = (int)((int64_t)mat[4] * x + (int64_t)mat[5] * y + mat[1]);
-    const int Z = (int)((int64_t)mat[6] * x + (int64_t)mat[7] * y +
-                        (1 << WARPEDMODEL_ROW3HOMO_PREC_BITS));
-    xc *= 1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS - WARPEDMODEL_PREC_BITS);
-    yc *= 1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS - WARPEDMODEL_PREC_BITS);
-    xc = (int)(xc > 0 ? ((int64_t)xc + Z / 2) / Z : ((int64_t)xc - Z / 2) / Z);
-    yc = (int)(yc > 0 ? ((int64_t)yc + Z / 2) / Z : ((int64_t)yc - Z / 2) / Z);
-    tx = convert_to_trans_prec(allow_hp, xc) - (x << 3);
-    ty = convert_to_trans_prec(allow_hp, yc) - (y << 3);
-  } else {
-    const int xc =
-        (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0];
-    const int yc =
-        mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1];
-    tx = convert_to_trans_prec(allow_hp, xc);
-    ty = convert_to_trans_prec(allow_hp, yc);
-  }
+
+  const int xc =
+      (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0];
+  const int yc =
+      mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1];
+  tx = convert_to_trans_prec(allow_hp, xc);
+  ty = convert_to_trans_prec(allow_hp, yc);
 
   res.as_mv.row = ty;
   res.as_mv.col = tx;
 
-#if CONFIG_AMVR
   if (is_integer) {
     integer_mv_precision(&res.as_mv);
   }
-#endif
   return res;
 }
 
 static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) {
-  if (gm->wmmat[6] != 0 || gm->wmmat[7] != 0) {
-    if (!gm->wmmat[6] && !gm->wmmat[4]) return HORTRAPEZOID;
-    if (!gm->wmmat[7] && !gm->wmmat[3]) return VERTRAPEZOID;
-    return HOMOGRAPHY;
-  }
   if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] &&
       gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) {
     return ((!gm->wmmat[1] && !gm->wmmat[0]) ? IDENTITY : TRANSLATION);
@@ -333,12 +273,10 @@ static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) {
   else
     return AFFINE;
 }
-#endif  // CONFIG_GLOBAL_MOTION
 
 typedef struct candidate_mv {
   int_mv this_mv;
   int_mv comp_mv;
-  uint8_t pred_diff[2];
   int weight;
 } CANDIDATE_MV;
 
diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c
index 891396e9b..6939df335 100644
--- a/third_party/aom/av1/common/mvref_common.c
+++ b/third_party/aom/av1/common/mvref_common.c
@@ -9,68 +9,72 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <stdlib.h>
+
 #include "av1/common/mvref_common.h"
-#if CONFIG_WARPED_MOTION
 #include "av1/common/warped_motion.h"
-#endif  // CONFIG_WARPED_MOTION
 
-#if CONFIG_GLOBAL_MOTION
-#define USE_CUR_GM_REFMV 1
-#endif  // CONFIG_GLOBAL_MOTION
+// Although we assign 32 bit integers, all the values are strictly under 14
+// bits.
+static int div_mult[32] = { 0,    16384, 8192, 5461, 4096, 3276, 2730, 2340,
+                            2048, 1820,  1638, 1489, 1365, 1260, 1170, 1092,
+                            1024, 963,   910,  862,  819,  780,  744,  712,
+                            682,  655,   630,  606,  585,  564,  546,  528 };
+
+// TODO(jingning): Consider the use of lookup table for (num / den)
+// altogether.
+static void get_mv_projection(MV *output, MV ref, int num, int den) {
+  den = AOMMIN(den, MAX_FRAME_DISTANCE);
+  num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE)
+                : AOMMAX(num, -MAX_FRAME_DISTANCE);
+  int mv_row = ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
+  int mv_col = ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
+  const int clamp_max = MV_UPP - 1;
+  const int clamp_min = MV_LOW + 1;
+  output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max);
+  output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max);
+}
 
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MODE_INFO *mi, int mi_row,
-                        int mi_col, int x_mis, int y_mis) {
-#if CONFIG_TMV
+void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi,
+                        int mi_row, int mi_col, int x_mis, int y_mis) {
   const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
-  MV_REF *frame_mvs = cm->cur_frame->mvs +
-                      ((mi_row & 0xfffe) >> 1) * frame_mvs_stride +
-                      ((mi_col & 0xfffe) >> 1);
+  MV_REF *frame_mvs =
+      cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
   x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
   y_mis = ROUND_POWER_OF_TWO(y_mis, 1);
-#else
-  const int frame_mvs_stride = cm->mi_cols;
-  MV_REF *frame_mvs = cm->cur_frame->mvs +
-                      (mi_row & 0xfffe) * frame_mvs_stride + (mi_col & 0xfffe);
-  x_mis = AOMMAX(x_mis, 2);
-  y_mis = AOMMAX(y_mis, 2);
-#endif  // CONFIG_TMV
   int w, h;
 
   for (h = 0; h < y_mis; h++) {
-    MV_REF *const frame_mv = frame_mvs + h * frame_mvs_stride;
+    MV_REF *mv = frame_mvs;
     for (w = 0; w < x_mis; w++) {
-      MV_REF *const mv = frame_mv + w;
-      mv->ref_frame[0] = mi->mbmi.ref_frame[0];
-      mv->ref_frame[1] = mi->mbmi.ref_frame[1];
-      mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
-      mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
-      // (TODO:yunqing) The following 2 lines won't be used and can be removed.
-      mv->pred_mv[0].as_int = mi->mbmi.pred_mv[0].as_int;
-      mv->pred_mv[1].as_int = mi->mbmi.pred_mv[1].as_int;
+      mv->ref_frame = NONE_FRAME;
+      mv->mv.as_int = 0;
+
+      for (int idx = 0; idx < 2; ++idx) {
+        MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx];
+        if (ref_frame > INTRA_FRAME) {
+          int8_t ref_idx = cm->ref_frame_side[ref_frame];
+          if (ref_idx) continue;
+          if ((abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT) ||
+              (abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT))
+            continue;
+          mv->ref_frame = ref_frame;
+          mv->mv.as_int = mi->mv[idx].as_int;
+        }
+      }
+      mv++;
     }
+    frame_mvs += frame_mvs_stride;
   }
 }
 
-static uint8_t add_ref_mv_candidate(
-    const MODE_INFO *const candidate_mi, const MB_MODE_INFO *const candidate,
-    const MV_REFERENCE_FRAME rf[2], uint8_t *refmv_count,
-    CANDIDATE_MV *ref_mv_stack, const int use_hp, int len, int block,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-    int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-    int col, int weight
-#if CONFIG_AMVR
-    ,
-    int is_integer
-#endif
-    ) {
+static void add_ref_mv_candidate(
+    const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2],
+    uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count,
+    CANDIDATE_MV *ref_mv_stack, int_mv *gm_mv_candidates,
+    const WarpedMotionParams *gm_params, int col, int weight) {
+  if (!is_inter_block(candidate)) return;  // for intrabc
   int index = 0, ref;
-  int newmv_count = 0;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
   assert(weight % 2 == 0);
 
   if (rf[1] == NONE_FRAME) {
@@ -78,60 +82,24 @@ static uint8_t add_ref_mv_candidate(
     for (ref = 0; ref < 2; ++ref) {
       if (candidate->ref_frame[ref] == rf[0]) {
         int_mv this_refmv;
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-        if (is_global_mv_block(candidate_mi, block, gm_params[rf[0]].wmtype))
+        if (is_global_mv_block(candidate, gm_params[rf[0]].wmtype))
           this_refmv = gm_mv_candidates[0];
         else
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-          this_refmv = get_sub_block_mv(candidate_mi, ref, col, block);
-#if CONFIG_AMVR
-        lower_mv_precision(&this_refmv.as_mv, use_hp, is_integer);
-#else
-        lower_mv_precision(&this_refmv.as_mv, use_hp);
-#endif  // CONFIG_AMVR
+          this_refmv = get_sub_block_mv(candidate, ref, col);
 
         for (index = 0; index < *refmv_count; ++index)
           if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break;
 
-        if (index < *refmv_count) ref_mv_stack[index].weight += weight * len;
+        if (index < *refmv_count) ref_mv_stack[index].weight += weight;
 
         // Add a new item to the list.
-        if (index == *refmv_count) {
+        if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
           ref_mv_stack[index].this_mv = this_refmv;
-          ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx(
-              get_sub_block_pred_mv(candidate_mi, ref, col, block), this_refmv);
-          ref_mv_stack[index].weight = weight * len;
+          ref_mv_stack[index].weight = weight;
           ++(*refmv_count);
-
-          if (candidate->mode == NEWMV) ++newmv_count;
-        }
-
-        if (candidate_mi->mbmi.sb_type < BLOCK_8X8 && block >= 0 &&
-            !unify_bsize) {
-          int alt_block = 3 - block;
-          this_refmv = get_sub_block_mv(candidate_mi, ref, col, alt_block);
-#if CONFIG_AMVR
-          lower_mv_precision(&this_refmv.as_mv, use_hp, is_integer);
-#else
-          lower_mv_precision(&this_refmv.as_mv, use_hp);
-#endif
-          for (index = 0; index < *refmv_count; ++index)
-            if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break;
-
-          if (index < *refmv_count) ref_mv_stack[index].weight += len;
-
-          // Add a new item to the list.
-          if (index == *refmv_count) {
-            ref_mv_stack[index].this_mv = this_refmv;
-            ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx(
-                get_sub_block_pred_mv(candidate_mi, ref, col, alt_block),
-                this_refmv);
-            ref_mv_stack[index].weight = len;
-            ++(*refmv_count);
-
-            if (candidate->mode == NEWMV) ++newmv_count;
-          }
         }
+        if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
+        ++*ref_match_count;
       }
     }
   } else {
@@ -140,17 +108,10 @@ static uint8_t add_ref_mv_candidate(
       int_mv this_refmv[2];
 
       for (ref = 0; ref < 2; ++ref) {
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-        if (is_global_mv_block(candidate_mi, block, gm_params[rf[ref]].wmtype))
+        if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype))
           this_refmv[ref] = gm_mv_candidates[ref];
         else
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-          this_refmv[ref] = get_sub_block_mv(candidate_mi, ref, col, block);
-#if CONFIG_AMVR
-        lower_mv_precision(&this_refmv[ref].as_mv, use_hp, is_integer);
-#else
-        lower_mv_precision(&this_refmv[ref].as_mv, use_hp);
-#endif
+          this_refmv[ref] = get_sub_block_mv(candidate, ref, col);
       }
 
       for (index = 0; index < *refmv_count; ++index)
@@ -158,94 +119,46 @@ static uint8_t add_ref_mv_candidate(
             (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int))
           break;
 
-      if (index < *refmv_count) ref_mv_stack[index].weight += weight * len;
+      if (index < *refmv_count) ref_mv_stack[index].weight += weight;
 
       // Add a new item to the list.
-      if (index == *refmv_count) {
+      if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
         ref_mv_stack[index].this_mv = this_refmv[0];
         ref_mv_stack[index].comp_mv = this_refmv[1];
-        ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx(
-            get_sub_block_pred_mv(candidate_mi, 0, col, block), this_refmv[0]);
-        ref_mv_stack[index].pred_diff[1] = av1_get_pred_diff_ctx(
-            get_sub_block_pred_mv(candidate_mi, 1, col, block), this_refmv[1]);
-        ref_mv_stack[index].weight = weight * len;
+        ref_mv_stack[index].weight = weight;
         ++(*refmv_count);
-
-        if (candidate->mode == NEW_NEWMV) ++newmv_count;
-      }
-
-      if (candidate_mi->mbmi.sb_type < BLOCK_8X8 && block >= 0 &&
-          !unify_bsize) {
-        int alt_block = 3 - block;
-        this_refmv[0] = get_sub_block_mv(candidate_mi, 0, col, alt_block);
-        this_refmv[1] = get_sub_block_mv(candidate_mi, 1, col, alt_block);
-
-        for (ref = 0; ref < 2; ++ref) {
-#if CONFIG_AMVR
-          lower_mv_precision(&this_refmv[ref].as_mv, use_hp, is_integer);
-#else
-          lower_mv_precision(&this_refmv[ref].as_mv, use_hp);
-#endif
-        }
-        for (index = 0; index < *refmv_count; ++index)
-          if (ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int &&
-              ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)
-            break;
-
-        if (index < *refmv_count) ref_mv_stack[index].weight += len;
-
-        // Add a new item to the list.
-        if (index == *refmv_count) {
-          ref_mv_stack[index].this_mv = this_refmv[0];
-          ref_mv_stack[index].comp_mv = this_refmv[1];
-          ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx(
-              get_sub_block_pred_mv(candidate_mi, 0, col, block),
-              this_refmv[0]);
-          ref_mv_stack[index].pred_diff[0] = av1_get_pred_diff_ctx(
-              get_sub_block_pred_mv(candidate_mi, 1, col, block),
-              this_refmv[1]);
-          ref_mv_stack[index].weight = len;
-          ++(*refmv_count);
-
-          if (candidate->mode == NEW_NEWMV) ++newmv_count;
-        }
       }
+      if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
+      ++*ref_match_count;
     }
   }
-  return newmv_count;
 }
 
-static uint8_t scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                             const int mi_col, int block,
-                             const MV_REFERENCE_FRAME rf[2], int row_offset,
-                             CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                             int_mv *gm_mv_candidates,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                             int max_row_offset, int *processed_rows) {
-  const int end_mi = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                          int mi_row, int mi_col,
+                          const MV_REFERENCE_FRAME rf[2], int row_offset,
+                          CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
+                          uint8_t *ref_match_count, uint8_t *newmv_count,
+                          int_mv *gm_mv_candidates, int max_row_offset,
+                          int *processed_rows) {
+  int end_mi = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+  end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
   const int n8_w_8 = mi_size_wide[BLOCK_8X8];
   const int n8_w_16 = mi_size_wide[BLOCK_16X16];
   int i;
-  uint8_t newmv_count = 0;
   int col_offset = 0;
-#if CONFIG_CB4X4
   const int shift = 0;
   // TODO(jingning): Revisit this part after cb4x4 is stable.
   if (abs(row_offset) > 1) {
     col_offset = 1;
-    if (mi_col & 0x01 && xd->n8_w < n8_w_8) --col_offset;
+    if ((mi_col & 0x01) && xd->n8_w < n8_w_8) --col_offset;
   }
   const int use_step_16 = (xd->n8_w >= 16);
-#else
-  const int shift = 1;
-  const int use_step_16 = (xd->n8_w >= 8);
-#endif
-  MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
-
-  for (i = 0; i < end_mi && *refmv_count < MAX_REF_MV_STACK_SIZE;) {
-    const MODE_INFO *const candidate_mi = candidate_mi0[col_offset + i];
-    const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+  MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
+  (void)mi_row;
+
+  for (i = 0; i < end_mi;) {
+    const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
     const int candidate_bsize = candidate->sb_type;
     const int n8_w = mi_size_wide[candidate_bsize];
     int len = AOMMIN(xd->n8_w, n8_w);
@@ -264,60 +177,38 @@ static uint8_t scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
       *processed_rows = inc - row_offset - 1;
     }
 
-#if CONFIG_AMVR
-    newmv_count += add_ref_mv_candidate(
-        candidate_mi, candidate, rf, refmv_count, ref_mv_stack,
-        cm->allow_high_precision_mv, len, block,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-        gm_mv_candidates, cm->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-        col_offset + i, weight, cm->cur_frame_mv_precision_level);
-#else
-    newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf,
-                                        refmv_count, ref_mv_stack,
-                                        cm->allow_high_precision_mv, len, block,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                                        gm_mv_candidates, cm->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                                        col_offset + i, weight);
-#endif
+    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+                         newmv_count, ref_mv_stack, gm_mv_candidates,
+                         cm->global_motion, col_offset + i, len * weight);
 
     i += len;
   }
-
-  return newmv_count;
 }
 
-static uint8_t scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                             const int mi_row, int block,
-                             const MV_REFERENCE_FRAME rf[2], int col_offset,
-                             CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                             int_mv *gm_mv_candidates,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                             int max_col_offset, int *processed_cols) {
-  const int end_mi = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                          int mi_row, int mi_col,
+                          const MV_REFERENCE_FRAME rf[2], int col_offset,
+                          CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
+                          uint8_t *ref_match_count, uint8_t *newmv_count,
+                          int_mv *gm_mv_candidates, int max_col_offset,
+                          int *processed_cols) {
+  int end_mi = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+  end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
   const int n8_h_8 = mi_size_high[BLOCK_8X8];
   const int n8_h_16 = mi_size_high[BLOCK_16X16];
   int i;
-  uint8_t newmv_count = 0;
   int row_offset = 0;
-#if CONFIG_CB4X4
   const int shift = 0;
   if (abs(col_offset) > 1) {
     row_offset = 1;
-    if (mi_row & 0x01 && xd->n8_h < n8_h_8) --row_offset;
+    if ((mi_row & 0x01) && xd->n8_h < n8_h_8) --row_offset;
   }
   const int use_step_16 = (xd->n8_h >= 16);
-#else
-  const int shift = 1;
-  const int use_step_16 = (xd->n8_h >= 8);
-#endif
+  (void)mi_col;
 
-  for (i = 0; i < end_mi && *refmv_count < MAX_REF_MV_STACK_SIZE;) {
-    const MODE_INFO *const candidate_mi =
+  for (i = 0; i < end_mi;) {
+    const MB_MODE_INFO *const candidate =
         xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
-    const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
     const int candidate_bsize = candidate->sb_type;
     const int n8_h = mi_size_high[candidate_bsize];
     int len = AOMMIN(xd->n8_h, n8_h);
@@ -336,79 +227,46 @@ static uint8_t scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
       *processed_cols = inc - col_offset - 1;
     }
 
-#if CONFIG_AMVR
-    newmv_count += add_ref_mv_candidate(
-        candidate_mi, candidate, rf, refmv_count, ref_mv_stack,
-        cm->allow_high_precision_mv, len, block,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-        gm_mv_candidates, cm->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-        col_offset, weight, cm->cur_frame_mv_precision_level);
-#else
-    newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf,
-                                        refmv_count, ref_mv_stack,
-                                        cm->allow_high_precision_mv, len, block,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                                        gm_mv_candidates, cm->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                                        col_offset, weight);
-#endif
+    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+                         newmv_count, ref_mv_stack, gm_mv_candidates,
+                         cm->global_motion, col_offset, len * weight);
+
     i += len;
   }
-
-  return newmv_count;
 }
 
-static uint8_t scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                             const int mi_row, const int mi_col, int block,
-                             const MV_REFERENCE_FRAME rf[2], int row_offset,
-                             int col_offset, CANDIDATE_MV *ref_mv_stack,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                             int_mv *gm_mv_candidates,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                             uint8_t *refmv_count) {
+static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                          const int mi_row, const int mi_col,
+                          const MV_REFERENCE_FRAME rf[2], int row_offset,
+                          int col_offset, CANDIDATE_MV *ref_mv_stack,
+                          uint8_t *ref_match_count, uint8_t *newmv_count,
+                          int_mv *gm_mv_candidates,
+                          uint8_t refmv_count[MODE_CTX_REF_FRAMES]) {
   const TileInfo *const tile = &xd->tile;
   POSITION mi_pos;
-  uint8_t newmv_count = 0;
 
   mi_pos.row = row_offset;
   mi_pos.col = col_offset;
 
-  if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, &mi_pos) &&
-      *refmv_count < MAX_REF_MV_STACK_SIZE) {
-    const MODE_INFO *const candidate_mi =
+  if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) {
+    const MB_MODE_INFO *const candidate =
         xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
-    const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
     const int len = mi_size_wide[BLOCK_8X8];
 
-#if CONFIG_AMVR
-    newmv_count += add_ref_mv_candidate(
-        candidate_mi, candidate, rf, refmv_count, ref_mv_stack,
-        cm->allow_high_precision_mv, len, block,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-        gm_mv_candidates, cm->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-        mi_pos.col, 2, cm->cur_frame_mv_precision_level);
-#else
-    newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf,
-                                        refmv_count, ref_mv_stack,
-                                        cm->allow_high_precision_mv, len, block,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                                        gm_mv_candidates, cm->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                                        mi_pos.col, 2);
-#endif
+    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+                         newmv_count, ref_mv_stack, gm_mv_candidates,
+                         cm->global_motion, mi_pos.col, 2 * len);
   }  // Analyze a single 8x8 block motion information.
-
-  return newmv_count;
 }
 
 static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                          int mi_row, int mi_col, int bs) {
-  const int sb_mi_size = mi_size_wide[cm->sb_size];
+  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
   const int mask_row = mi_row & (sb_mi_size - 1);
   const int mask_col = mi_col & (sb_mi_size - 1);
 
+  if (bs > mi_size_wide[BLOCK_64X64]) return 0;
+
   // In a split partition all apart from the bottom right has a top right
   int has_tr = !((mask_row & bs) && (mask_col & bs));
 
@@ -440,22 +298,20 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   if (xd->n8_w > xd->n8_h)
     if (xd->is_sec_rect) has_tr = 0;
 
-#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB
   // The bottom left square of a Vertical A (in the old format) does
   // not have a top right as it is decoded before the right hand
   // rectangle of the partition
-  if (xd->mi[0]->mbmi.partition == PARTITION_VERT_A)
-    if ((mask_row & bs) && !(mask_col & bs)) has_tr = 0;
-#endif  // CONFIG_EXT_PARTITION_TYPES
+  if (xd->mi[0]->partition == PARTITION_VERT_A) {
+    if (xd->n8_w == xd->n8_h)
+      if (mask_row & bs) has_tr = 0;
+  }
 
   return has_tr;
 }
 
-#if CONFIG_MFMV
-static int check_sb_border(const AV1_COMMON *cm, const int mi_row,
-                           const int mi_col, const int row_offset,
-                           const int col_offset) {
-  const int sb_mi_size = mi_size_wide[cm->sb_size];
+static int check_sb_border(const int mi_row, const int mi_col,
+                           const int row_offset, const int col_offset) {
+  const int sb_mi_size = mi_size_wide[BLOCK_64X64];
   const int row = mi_row & (sb_mi_size - 1);
   const int col = mi_col & (sb_mi_size - 1);
 
@@ -466,513 +322,307 @@ static int check_sb_border(const AV1_COMMON *cm, const int mi_row,
   return 1;
 }
 
-static int add_tpl_ref_mv(const AV1_COMMON *cm,
-                          const MV_REF *prev_frame_mvs_base,
-                          const MACROBLOCKD *xd, int mi_row, int mi_col,
-                          MV_REFERENCE_FRAME ref_frame, int blk_row,
-                          int blk_col, uint8_t *refmv_count,
-                          CANDIDATE_MV *ref_mv_stack, int16_t *mode_context) {
-  (void)prev_frame_mvs_base;
+static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                          int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame,
+                          int blk_row, int blk_col, int_mv *gm_mv_candidates,
+                          uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+                          CANDIDATE_MV ref_mv_stacks[][MAX_REF_MV_STACK_SIZE],
+                          int16_t *mode_context) {
   POSITION mi_pos;
   int idx;
-  int coll_blk_count = 0;
   const int weight_unit = 1;  // mi_size_wide[BLOCK_8X8];
 
-#if CONFIG_MV_COMPRESS
   mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
   mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
-#else
-  mi_pos.row = blk_row;
-  mi_pos.col = blk_col;
-#endif
 
-  if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, cm, &mi_pos))
-    return coll_blk_count;
+  if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) return 0;
 
-  const TPL_MV_REF *prev_frame_mvs = cm->cur_frame->tpl_mvs +
-                                     (mi_row + mi_pos.row) * cm->mi_stride +
-                                     (mi_col + mi_pos.col);
+  const TPL_MV_REF *prev_frame_mvs =
+      cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) +
+      ((mi_col + mi_pos.col) >> 1);
 
   MV_REFERENCE_FRAME rf[2];
   av1_set_ref_frame(rf, ref_frame);
 
   if (rf[1] == NONE_FRAME) {
-    for (int i = 0; i < MFMV_STACK_SIZE; ++i) {
-      if (prev_frame_mvs->mfmv[ref_frame - LAST_FRAME][i].as_int !=
-          INVALID_MV) {
-        int_mv this_refmv = prev_frame_mvs->mfmv[ref_frame - LAST_FRAME][i];
-        lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv);
-
-        if (blk_row == 0 && blk_col == 0)
-          if (abs(this_refmv.as_mv.row) >= 16 ||
-              abs(this_refmv.as_mv.col) >= 16)
-            mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
-
-        for (idx = 0; idx < *refmv_count; ++idx)
-          if (abs(this_refmv.as_mv.row - ref_mv_stack[idx].this_mv.as_mv.row) <
-                  4 &&
-              abs(this_refmv.as_mv.col - ref_mv_stack[idx].this_mv.as_mv.col) <
-                  4)
-            break;
-
-        if (idx < *refmv_count) ref_mv_stack[idx].weight += 2 * weight_unit;
-
-        if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
-          ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
-          // TODO(jingning): Hard coded context number. Need to make it better
-          // sense.
-          ref_mv_stack[idx].pred_diff[0] = 1;
-          ref_mv_stack[idx].weight = 2 * weight_unit;
-          ++(*refmv_count);
-        }
+    int cur_frame_index = cm->cur_frame->cur_frame_offset;
+    int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
+    int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
+    int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index);
+    CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[rf[0]];
+
+    if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
+      int_mv this_refmv;
+
+      get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+                        cur_offset_0, prev_frame_mvs->ref_frame_offset);
+      lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
+                         cm->cur_frame_force_integer_mv);
+
+      if (blk_row == 0 && blk_col == 0)
+        if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+            abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16)
+          mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+
+      for (idx = 0; idx < refmv_count[rf[0]]; ++idx)
+        if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
 
-        ++coll_blk_count;
+      if (idx < refmv_count[rf[0]]) ref_mv_stack[idx].weight += 2 * weight_unit;
+
+      if (idx == refmv_count[rf[0]] &&
+          refmv_count[rf[0]] < MAX_REF_MV_STACK_SIZE) {
+        ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+        ref_mv_stack[idx].weight = 2 * weight_unit;
+        ++(refmv_count[rf[0]]);
       }
+      return 1;
     }
   } else {
     // Process compound inter mode
-    for (int i = 0; i < MFMV_STACK_SIZE; ++i) {
-      if (prev_frame_mvs->mfmv[rf[0] - LAST_FRAME][i].as_int != INVALID_MV &&
-          prev_frame_mvs->mfmv[rf[1] - LAST_FRAME][i].as_int != INVALID_MV) {
-        int_mv this_refmv = prev_frame_mvs->mfmv[rf[0] - LAST_FRAME][i];
-        int_mv comp_refmv = prev_frame_mvs->mfmv[rf[1] - LAST_FRAME][i];
-        lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv);
-        lower_mv_precision(&comp_refmv.as_mv, cm->allow_high_precision_mv);
-
-        if (blk_row == 0 && blk_col == 0)
-          if (abs(this_refmv.as_mv.row) >= 16 ||
-              abs(this_refmv.as_mv.col) >= 16 ||
-              abs(comp_refmv.as_mv.row) >= 16 ||
-              abs(comp_refmv.as_mv.col) >= 16)
-            mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
-
-        for (idx = 0; idx < *refmv_count; ++idx)
-          if (abs(this_refmv.as_mv.row - ref_mv_stack[idx].this_mv.as_mv.row) <
-                  4 &&
-              abs(this_refmv.as_mv.col - ref_mv_stack[idx].this_mv.as_mv.col) <
-                  4 &&
-              abs(comp_refmv.as_mv.row - ref_mv_stack[idx].comp_mv.as_mv.row) <
-                  4 &&
-              abs(comp_refmv.as_mv.col - ref_mv_stack[idx].comp_mv.as_mv.col) <
-                  4)
-            break;
-
-        if (idx < *refmv_count) ref_mv_stack[idx].weight += 2 * weight_unit;
-
-        if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
-          ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
-          ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
-          // TODO(jingning): Hard coded context number. Need to make it better
-          // sense.
-          ref_mv_stack[idx].pred_diff[0] = 1;
-          ref_mv_stack[idx].pred_diff[1] = 1;
-          ref_mv_stack[idx].weight = 2 * weight_unit;
-          ++(*refmv_count);
-        }
+    int cur_frame_index = cm->cur_frame->cur_frame_offset;
+    int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
+    int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
+
+    int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index);
+    int buf_idx_1 = cm->frame_refs[FWD_RF_OFFSET(rf[1])].idx;
+    int frame1_index = cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset;
+    int cur_offset_1 = get_relative_dist(cm, cur_frame_index, frame1_index);
+    CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[ref_frame];
+
+    if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
+      int_mv this_refmv;
+      int_mv comp_refmv;
+      get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+                        cur_offset_0, prev_frame_mvs->ref_frame_offset);
+      get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+                        cur_offset_1, prev_frame_mvs->ref_frame_offset);
 
-        ++coll_blk_count;
-      }
-    }
-  }
-
-  return coll_blk_count;
-}
-#else
-static int add_col_ref_mv(const AV1_COMMON *cm,
-                          const MV_REF *prev_frame_mvs_base,
-                          int prev_frame_mvs_stride, const MACROBLOCKD *xd,
-                          int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame,
-                          int blk_row, int blk_col, uint8_t *refmv_count,
-                          CANDIDATE_MV *ref_mv_stack, int16_t *mode_context) {
-#if CONFIG_TMV
-  const MV_REF *prev_frame_mvs = prev_frame_mvs_base +
-                                 ((blk_row + 1) >> 1) * prev_frame_mvs_stride +
-                                 ((blk_col + 1) >> 1);
-#else
-  const MV_REF *prev_frame_mvs =
-      prev_frame_mvs_base + blk_row * prev_frame_mvs_stride + blk_col;
-#endif
-  POSITION mi_pos;
-  int ref, idx;
-  int coll_blk_count = 0;
-  const int weight_unit = mi_size_wide[BLOCK_8X8];
-
-#if CONFIG_TMV
-  mi_pos.row = blk_row;
-  mi_pos.col = blk_col;
-#else
-#if CONFIG_MV_COMPRESS
-  mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
-  mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
-#else
-  mi_pos.row = blk_row;
-  mi_pos.col = blk_col;
-#endif
-#endif  // CONFIG_TMV
-
-  if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, cm, &mi_pos))
-    return coll_blk_count;
-  for (ref = 0; ref < 2; ++ref) {
-    if (prev_frame_mvs->ref_frame[ref] == ref_frame) {
-      int_mv this_refmv = prev_frame_mvs->mv[ref];
-#if CONFIG_AMVR
       lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
-                         cm->cur_frame_mv_precision_level);
-#else
-      lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv);
-#endif
+                         cm->cur_frame_force_integer_mv);
+      lower_mv_precision(&comp_refmv.as_mv, cm->allow_high_precision_mv,
+                         cm->cur_frame_force_integer_mv);
 
-#if CONFIG_OPT_REF_MV
       if (blk_row == 0 && blk_col == 0)
-#endif
-      {
-        if (abs(this_refmv.as_mv.row) >= 16 || abs(this_refmv.as_mv.col) >= 16)
-          mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
-      }
-
-      for (idx = 0; idx < *refmv_count; ++idx)
-        if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
+        if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+            abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 ||
+            abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 ||
+            abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16)
+          mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+
+      for (idx = 0; idx < refmv_count[ref_frame]; ++idx)
+        if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int &&
+            comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int)
+          break;
 
-      if (idx < *refmv_count) ref_mv_stack[idx].weight += 2 * weight_unit;
+      if (idx < refmv_count[ref_frame])
+        ref_mv_stack[idx].weight += 2 * weight_unit;
 
-      if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+      if (idx == refmv_count[ref_frame] &&
+          refmv_count[ref_frame] < MAX_REF_MV_STACK_SIZE) {
         ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
-        ref_mv_stack[idx].pred_diff[0] =
-            av1_get_pred_diff_ctx(prev_frame_mvs->pred_mv[ref], this_refmv);
+        ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
         ref_mv_stack[idx].weight = 2 * weight_unit;
-        ++(*refmv_count);
+        ++(refmv_count[ref_frame]);
       }
-
-      ++coll_blk_count;
+      return 1;
     }
   }
-
-  return coll_blk_count;
+  return 0;
 }
-#endif
-
-static void setup_ref_mv_list(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                              MV_REFERENCE_FRAME ref_frame,
-                              uint8_t *refmv_count, CANDIDATE_MV *ref_mv_stack,
-                              int_mv *mv_ref_list,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                              int_mv *gm_mv_candidates,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                              int block, int mi_row, int mi_col,
-                              int16_t *mode_context) {
-  int idx, nearest_refmv_count = 0;
-  uint8_t newmv_count = 0;
-  CANDIDATE_MV tmp_mv;
-  int len, nr_len;
-
-#if CONFIG_TMV
-  const int prev_frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
-  const int tmi_row = mi_row & 0xfffe;
-  const int tmi_col = mi_col & 0xfffe;
-  const MV_REF *const prev_frame_mvs_base =
-      cm->use_prev_frame_mvs
-          ? cm->prev_frame->mvs + (tmi_row >> 1) * prev_frame_mvs_stride +
-                (tmi_col >> 1)
-          : NULL;
-#else
-  const int prev_frame_mvs_stride = cm->mi_cols;
-#if CONFIG_MV_COMPRESS
-  const MV_REF *const prev_frame_mvs_base =
-      cm->use_prev_frame_mvs
-          ? cm->prev_frame->mvs +
-                (((mi_row >> 1) << 1) + 1) * prev_frame_mvs_stride +
-                ((mi_col >> 1) << 1) + 1
-          : NULL;
-#else
-  const MV_REF *const prev_frame_mvs_base =
-      cm->use_prev_frame_mvs
-          ? cm->prev_frame->mvs + mi_row * prev_frame_mvs_stride + mi_col
-          : NULL;
-#endif
-#endif  // CONFIG_TMV
 
+static void setup_ref_mv_list(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame,
+    uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+    CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+    int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
+    int mi_row, int mi_col, int16_t *mode_context) {
   const int bs = AOMMAX(xd->n8_w, xd->n8_h);
   const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
   MV_REFERENCE_FRAME rf[2];
 
   const TileInfo *const tile = &xd->tile;
   int max_row_offset = 0, max_col_offset = 0;
-#if CONFIG_CB4X4
   const int row_adj = (xd->n8_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
   const int col_adj = (xd->n8_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
-#endif
   int processed_rows = 0;
   int processed_cols = 0;
-  int row_offset, col_offset;
 
   av1_set_ref_frame(rf, ref_frame);
   mode_context[ref_frame] = 0;
-  *refmv_count = 0;
+  refmv_count[ref_frame] = 0;
 
   // Find valid maximum row/col offset.
   if (xd->up_available) {
-#if CONFIG_CB4X4
-    max_row_offset = -(MVREF_ROWS << 1) + row_adj;
-#else
-    max_row_offset = -MVREF_ROWS;
-#endif
+    max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
+
+    if (xd->n8_h < mi_size_high[BLOCK_8X8])
+      max_row_offset = -(2 << 1) + row_adj;
+
     max_row_offset =
-        find_valid_row_offset(tile, mi_row, cm->mi_rows, cm, max_row_offset);
+        find_valid_row_offset(tile, mi_row, cm->mi_rows, max_row_offset);
   }
 
   if (xd->left_available) {
-#if CONFIG_CB4X4
-    max_col_offset = -(MVREF_COLS << 1) + col_adj;
-#else
-    max_col_offset = -MVREF_COLS;
-#endif
+    max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
+
+    if (xd->n8_w < mi_size_wide[BLOCK_8X8])
+      max_col_offset = -(2 << 1) + col_adj;
+
     max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
   }
 
+  uint8_t col_match_count = 0;
+  uint8_t row_match_count = 0;
+  uint8_t newmv_count = 0;
+
   // Scan the first above row mode info. row_offset = -1;
   if (abs(max_row_offset) >= 1)
-    newmv_count +=
-        scan_row_mbmi(cm, xd, mi_col, block, rf, -1, ref_mv_stack, refmv_count,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                      gm_mv_candidates,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                      max_row_offset, &processed_rows);
+    scan_row_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
+                  &refmv_count[ref_frame], &row_match_count, &newmv_count,
+                  gm_mv_candidates, max_row_offset, &processed_rows);
   // Scan the first left column mode info. col_offset = -1;
   if (abs(max_col_offset) >= 1)
-    newmv_count +=
-        scan_col_mbmi(cm, xd, mi_row, block, rf, -1, ref_mv_stack, refmv_count,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                      gm_mv_candidates,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                      max_col_offset, &processed_cols);
+    scan_col_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
+                  &refmv_count[ref_frame], &col_match_count, &newmv_count,
+                  gm_mv_candidates, max_col_offset, &processed_cols);
   // Check top-right boundary
   if (has_tr)
-    newmv_count += scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1,
-                                 xd->n8_w, ref_mv_stack,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                                 gm_mv_candidates,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                                 refmv_count);
-
-  nearest_refmv_count = *refmv_count;
-
-  for (idx = 0; idx < nearest_refmv_count; ++idx)
-    ref_mv_stack[idx].weight += REF_CAT_LEVEL;
-
-#if CONFIG_MFMV
-  int blk_row, blk_col;
-  int coll_blk_count = 0;
-  int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n8_h);
-  int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n8_w);
-
-  int tpl_sample_pos[9][2] = {
-    { -2, hoffset }, { 0, hoffset },  { voffset, hoffset },
-    { voffset, 0 },  { voffset, -2 }, { voffset, -4 },
-    { -4, hoffset }, { voffset, 4 },  { 2, hoffset + 4 },
-  };
-  int i;
-
-  for (blk_row = 0; blk_row < xd->n8_h; blk_row += mi_size_high[BLOCK_8X8]) {
-    for (blk_col = 0; blk_col < xd->n8_w; blk_col += mi_size_wide[BLOCK_8X8]) {
-      // (TODO: yunqing) prev_frame_mvs_base is not used here, tpl_mvs is used.
-      // Can be modified the same way.
-      int is_available = add_tpl_ref_mv(
-          cm, prev_frame_mvs_base, xd, mi_row, mi_col, ref_frame, blk_row,
-          blk_col, refmv_count, ref_mv_stack, mode_context);
-      if (blk_row == 0 && blk_col == 0) coll_blk_count = is_available;
+    scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n8_w,
+                  ref_mv_stack[ref_frame], &row_match_count, &newmv_count,
+                  gm_mv_candidates, &refmv_count[ref_frame]);
+
+  uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
+  uint8_t nearest_refmv_count = refmv_count[ref_frame];
+
+  // TODO(yunqing): for comp_search, do it for all 3 cases.
+  for (int idx = 0; idx < nearest_refmv_count; ++idx)
+    ref_mv_stack[ref_frame][idx].weight += REF_CAT_LEVEL;
+
+  if (cm->allow_ref_frame_mvs) {
+    int is_available = 0;
+    const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n8_h);
+    const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n8_w);
+    const int blk_row_end = AOMMIN(xd->n8_h, mi_size_high[BLOCK_64X64]);
+    const int blk_col_end = AOMMIN(xd->n8_w, mi_size_wide[BLOCK_64X64]);
+
+    const int tpl_sample_pos[3][2] = {
+      { voffset, -2 },
+      { voffset, hoffset },
+      { voffset - 2, hoffset },
+    };
+    const int allow_extension = (xd->n8_h >= mi_size_high[BLOCK_8X8]) &&
+                                (xd->n8_h < mi_size_high[BLOCK_64X64]) &&
+                                (xd->n8_w >= mi_size_wide[BLOCK_8X8]) &&
+                                (xd->n8_w < mi_size_wide[BLOCK_64X64]);
+
+    int step_h = (xd->n8_h >= mi_size_high[BLOCK_64X64])
+                     ? mi_size_high[BLOCK_16X16]
+                     : mi_size_high[BLOCK_8X8];
+    int step_w = (xd->n8_w >= mi_size_wide[BLOCK_64X64])
+                     ? mi_size_wide[BLOCK_16X16]
+                     : mi_size_wide[BLOCK_8X8];
+
+    for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) {
+      for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
+        int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row,
+                                 blk_col, gm_mv_candidates, refmv_count,
+                                 ref_mv_stack, mode_context);
+        if (blk_row == 0 && blk_col == 0) is_available = ret;
+      }
     }
-  }
 
-  if (coll_blk_count == 0) mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
+    if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
 
-  for (i = 0; i < 9; ++i) {
-    blk_row = tpl_sample_pos[i][0];
-    blk_col = tpl_sample_pos[i][1];
+    for (int i = 0; i < 3 && allow_extension; ++i) {
+      const int blk_row = tpl_sample_pos[i][0];
+      const int blk_col = tpl_sample_pos[i][1];
 
-    if (!check_sb_border(cm, mi_row, mi_col, blk_row, blk_col)) continue;
-    // (TODO: yunqing) prev_frame_mvs_base is not used here, tpl_mvs is used.
-    // Can be modified the same way.
-    coll_blk_count += add_tpl_ref_mv(cm, prev_frame_mvs_base, xd, mi_row,
-                                     mi_col, ref_frame, blk_row, blk_col,
-                                     refmv_count, ref_mv_stack, mode_context);
-  }
-#else
-#if CONFIG_TEMPMV_SIGNALING
-  if (cm->use_prev_frame_mvs && rf[1] == NONE_FRAME)
-#else
-  if (prev_frame_mvs_base && cm->show_frame && cm->last_show_frame &&
-      rf[1] == NONE_FRAME)
-#endif
-  {
-    int blk_row, blk_col;
-    int coll_blk_count = 0;
-#if CONFIG_CB4X4
-    const int mi_step = (xd->n8_w == 1 || xd->n8_h == 1)
-                            ? mi_size_wide[BLOCK_8X8]
-                            : mi_size_wide[BLOCK_16X16];
-#else
-    const int mi_step = mi_size_wide[BLOCK_16X16];
-#endif
-
-#if CONFIG_TPL_MV
-    // Modified sample positions to be consistent with frame_mvs
-    // spatial resolution.
-    int tpl_sample_pos[5][2] = { { -1, xd->n8_w },
-                                 { 0, xd->n8_w },
-                                 { xd->n8_h, xd->n8_w },
-                                 { xd->n8_h, 0 },
-                                 { xd->n8_h, -1 } };
-    int i;
-#endif
-
-    for (blk_row = 0; blk_row < xd->n8_h; blk_row += mi_step) {
-      for (blk_col = 0; blk_col < xd->n8_w; blk_col += mi_step) {
-#if CONFIG_TMV
-        int is_available =
-            add_col_ref_mv(cm, prev_frame_mvs_base, prev_frame_mvs_stride, xd,
-                           tmi_row, tmi_col, ref_frame, blk_row, blk_col,
-                           refmv_count, ref_mv_stack, mode_context);
-#else
-        int is_available =
-            add_col_ref_mv(cm, prev_frame_mvs_base, prev_frame_mvs_stride, xd,
-                           mi_row, mi_col, ref_frame, blk_row, blk_col,
-                           refmv_count, ref_mv_stack, mode_context);
-#endif  // CONFIG_TMV
-#if CONFIG_OPT_REF_MV
-        if (blk_row == 0 && blk_col == 0) coll_blk_count = is_available;
-#else
-        coll_blk_count += is_available;
-#endif
-      }
-    }
-
-#if CONFIG_TPL_MV
-    for (i = 0; i < 5; ++i) {
-      blk_row = tpl_sample_pos[i][0];
-      blk_col = tpl_sample_pos[i][1];
-#if CONFIG_TMV
-      coll_blk_count += add_col_ref_mv(
-          cm, prev_frame_mvs_base, prev_frame_mvs_stride, xd, tmi_row, tmi_col,
-          ref_frame, blk_row, blk_col, refmv_count, ref_mv_stack, mode_context);
-#else
-      coll_blk_count += add_col_ref_mv(
-          cm, prev_frame_mvs_base, prev_frame_mvs_stride, xd, mi_row, mi_col,
-          ref_frame, blk_row, blk_col, refmv_count, ref_mv_stack, mode_context);
-#endif  // CONFIG_TMV
+      if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue;
+      add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col,
+                     gm_mv_candidates, refmv_count, ref_mv_stack, mode_context);
     }
-#endif
-
-    if (coll_blk_count == 0) mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
-  } else {
-    mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
   }
-#endif
+
+  uint8_t dummy_newmv_count = 0;
 
   // Scan the second outer area.
-  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, -1, ref_mv_stack,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                gm_mv_candidates,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                refmv_count);
-  for (idx = 2; idx <= MVREF_ROWS; ++idx) {
-#if CONFIG_CB4X4
-    row_offset = -(idx << 1) + 1 + row_adj;
-    col_offset = -(idx << 1) + 1 + col_adj;
-#else
-    row_offset = -idx;
-    col_offset = -idx;
-#endif
+  scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack[ref_frame],
+                &row_match_count, &dummy_newmv_count, gm_mv_candidates,
+                &refmv_count[ref_frame]);
+
+  for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) {
+    const int row_offset = -(idx << 1) + 1 + row_adj;
+    const int col_offset = -(idx << 1) + 1 + col_adj;
 
     if (abs(row_offset) <= abs(max_row_offset) &&
         abs(row_offset) > processed_rows)
-      scan_row_mbmi(cm, xd, mi_col, block, rf, row_offset, ref_mv_stack,
-                    refmv_count,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                    gm_mv_candidates,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
+      scan_row_mbmi(cm, xd, mi_row, mi_col, rf, row_offset,
+                    ref_mv_stack[ref_frame], &refmv_count[ref_frame],
+                    &row_match_count, &dummy_newmv_count, gm_mv_candidates,
                     max_row_offset, &processed_rows);
 
     if (abs(col_offset) <= abs(max_col_offset) &&
         abs(col_offset) > processed_cols)
-      scan_col_mbmi(cm, xd, mi_row, block, rf, col_offset, ref_mv_stack,
-                    refmv_count,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                    gm_mv_candidates,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
+      scan_col_mbmi(cm, xd, mi_row, mi_col, rf, col_offset,
+                    ref_mv_stack[ref_frame], &refmv_count[ref_frame],
+                    &col_match_count, &dummy_newmv_count, gm_mv_candidates,
                     max_col_offset, &processed_cols);
   }
 
-#if CONFIG_CB4X4
-  col_offset = -(MVREF_COLS << 1) + 1 + col_adj;
-#else
-  col_offset = -MVREF_COLS;
-#endif
-  if (abs(col_offset) <= abs(max_col_offset) &&
-      abs(col_offset) > processed_cols)
-    scan_col_mbmi(cm, xd, mi_row, block, rf, col_offset, ref_mv_stack,
-                  refmv_count,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                  gm_mv_candidates,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                  max_col_offset, &processed_cols);
-
-  switch (nearest_refmv_count) {
-    case 0: mode_context[ref_frame] |= 0;
-#if !CONFIG_OPT_REF_MV
-      if (*refmv_count >= 1) mode_context[ref_frame] |= 1;
-      if (*refmv_count == 1)
+  uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
+
+  switch (nearest_match) {
+    case 0:
+      mode_context[ref_frame] |= 0;
+      if (ref_match_count >= 1) mode_context[ref_frame] |= 1;
+      if (ref_match_count == 1)
         mode_context[ref_frame] |= (1 << REFMV_OFFSET);
-      else if (*refmv_count >= 2)
+      else if (ref_match_count >= 2)
         mode_context[ref_frame] |= (2 << REFMV_OFFSET);
-#endif
       break;
-    case 1: mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;
-#if CONFIG_OPT_REF_MV
-      mode_context[ref_frame] |= (3 << REFMV_OFFSET);
-#else
-      if (*refmv_count == 1)
+    case 1:
+      mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;
+      if (ref_match_count == 1)
         mode_context[ref_frame] |= (3 << REFMV_OFFSET);
-      else if (*refmv_count >= 2)
+      else if (ref_match_count >= 2)
         mode_context[ref_frame] |= (4 << REFMV_OFFSET);
-#endif
       break;
-
     case 2:
     default:
-      if (newmv_count >= 2)
+      if (newmv_count >= 1)
         mode_context[ref_frame] |= 4;
-      else if (newmv_count == 1)
-        mode_context[ref_frame] |= 5;
       else
-        mode_context[ref_frame] |= 6;
+        mode_context[ref_frame] |= 5;
 
       mode_context[ref_frame] |= (5 << REFMV_OFFSET);
       break;
   }
 
   // Rank the likelihood and assign nearest and near mvs.
-  len = nearest_refmv_count;
+  int len = nearest_refmv_count;
   while (len > 0) {
-    nr_len = 0;
-    for (idx = 1; idx < len; ++idx) {
-      if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) {
-        tmp_mv = ref_mv_stack[idx - 1];
-        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
-        ref_mv_stack[idx] = tmp_mv;
+    int nr_len = 0;
+    for (int idx = 1; idx < len; ++idx) {
+      if (ref_mv_stack[ref_frame][idx - 1].weight <
+          ref_mv_stack[ref_frame][idx].weight) {
+        CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
+        ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
+        ref_mv_stack[ref_frame][idx] = tmp_mv;
         nr_len = idx;
       }
     }
     len = nr_len;
   }
 
-  len = *refmv_count;
+  len = refmv_count[ref_frame];
   while (len > nearest_refmv_count) {
-    nr_len = nearest_refmv_count;
-    for (idx = nearest_refmv_count + 1; idx < len; ++idx) {
-      if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) {
-        tmp_mv = ref_mv_stack[idx - 1];
-        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
-        ref_mv_stack[idx] = tmp_mv;
+    int nr_len = nearest_refmv_count;
+    for (int idx = nearest_refmv_count + 1; idx < len; ++idx) {
+      if (ref_mv_stack[ref_frame][idx - 1].weight <
+          ref_mv_stack[ref_frame][idx].weight) {
+        CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
+        ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
+        ref_mv_stack[ref_frame][idx] = tmp_mv;
         nr_len = idx;
       }
     }
@@ -980,595 +630,324 @@ static void setup_ref_mv_list(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   }
 
   if (rf[1] > NONE_FRAME) {
-    for (idx = 0; idx < *refmv_count; ++idx) {
-      clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                   xd->n8_h << MI_SIZE_LOG2, xd);
-      clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                   xd->n8_h << MI_SIZE_LOG2, xd);
-    }
-  } else {
-    for (idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count); ++idx) {
-      mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
-      clamp_mv_ref(&mv_ref_list[idx].as_mv, xd->n8_w << MI_SIZE_LOG2,
-                   xd->n8_h << MI_SIZE_LOG2, xd);
-    }
-  }
-}
+    // TODO(jingning, yunqing): Refactor and consolidate the compound and
+    // single reference frame modes. Reduce unnecessary redundancy.
+    if (refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES) {
+      int_mv ref_id[2][2], ref_diff[2][2];
+      int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
+
+      int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
+      mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
+      int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
+      mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
+      int mi_size = AOMMIN(mi_width, mi_height);
+
+      for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
+        const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
+        const int candidate_bsize = candidate->sb_type;
+
+        for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+          MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
+
+          for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
+            if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
+              ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
+              ++ref_id_count[cmp_idx];
+            } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
+              int_mv this_mv = candidate->mv[rf_idx];
+              if (cm->ref_frame_sign_bias[can_rf] !=
+                  cm->ref_frame_sign_bias[rf[cmp_idx]]) {
+                this_mv.as_mv.row = -this_mv.as_mv.row;
+                this_mv.as_mv.col = -this_mv.as_mv.col;
+              }
+              ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
+              ++ref_diff_count[cmp_idx];
+            }
+          }
+        }
+        idx += mi_size_wide[candidate_bsize];
+      }
 
-// This function searches the neighbourhood of a given MB/SB
-// to try and find candidate reference vectors.
-static void find_mv_refs_idx(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                             MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                             int_mv *mv_ref_list, int block, int mi_row,
-                             int mi_col, find_mv_refs_sync sync,
-                             void *const data, int16_t *mode_context,
-                             int_mv zeromv) {
-  const int *ref_sign_bias = cm->ref_frame_sign_bias;
-  const int sb_mi_size = mi_size_wide[cm->sb_size];
-  int i, refmv_count = 0;
-  int different_ref_found = 0;
-  int context_counter = 0;
-
-#if CONFIG_TMV
-  int tmi_row = mi_row & 0xfffe;
-  int tmi_col = mi_col & 0xfffe;
-  POSITION mi_pos = { 0, 0 };
-  int inside = is_inside(&xd->tile, tmi_col, tmi_row, cm->mi_rows, cm, &mi_pos);
-  const MV_REF *const prev_frame_mvs =
-      cm->use_prev_frame_mvs && inside
-          ? cm->prev_frame->mvs + (tmi_row >> 1) * ((cm->mi_cols + 1) >> 1) +
-                (tmi_col >> 1)
-          : NULL;
-#else
-#if CONFIG_MV_COMPRESS
-  const TileInfo *const tile_ = &xd->tile;
-  int mi_row_end = tile_->mi_row_end;
-  int mi_col_end = tile_->mi_col_end;
-  const MV_REF *const prev_frame_mvs =
-      cm->use_prev_frame_mvs
-          ? cm->prev_frame->mvs +
-                AOMMIN(((mi_row >> 1) << 1) + 1 + (((xd->n8_h - 1) >> 1) << 1),
-                       mi_row_end - 1) *
-                    cm->mi_cols +
-                AOMMIN(((mi_col >> 1) << 1) + 1 + (((xd->n8_w - 1) >> 1) << 1),
-                       mi_col_end - 1)
-          : NULL;
-#else
-  const MV_REF *const prev_frame_mvs =
-      cm->use_prev_frame_mvs
-          ? cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col
-          : NULL;
-#endif
-#endif  // CONFIG_TMV
-
-#if CONFIG_INTRABC
-  assert(IMPLIES(ref_frame == INTRA_FRAME, cm->use_prev_frame_mvs == 0));
-#endif
-  const TileInfo *const tile = &xd->tile;
-  const BLOCK_SIZE bsize = mi->mbmi.sb_type;
-  const int bw = block_size_wide[AOMMAX(bsize, BLOCK_8X8)];
-  const int bh = block_size_high[AOMMAX(bsize, BLOCK_8X8)];
-  POSITION mv_ref_search[MVREF_NEIGHBOURS];
-  const int num_8x8_blocks_wide = num_8x8_blocks_wide_lookup[bsize];
-  const int num_8x8_blocks_high = num_8x8_blocks_high_lookup[bsize];
-  mv_ref_search[0].row = num_8x8_blocks_high - 1;
-  mv_ref_search[0].col = -1;
-  mv_ref_search[1].row = -1;
-  mv_ref_search[1].col = num_8x8_blocks_wide - 1;
-  mv_ref_search[2].row = -1;
-  mv_ref_search[2].col = (num_8x8_blocks_wide - 1) >> 1;
-  mv_ref_search[3].row = (num_8x8_blocks_high - 1) >> 1;
-  mv_ref_search[3].col = -1;
-  mv_ref_search[4].row = -1;
-  mv_ref_search[4].col = -1;
-#if CONFIG_EXT_PARTITION_TYPES
-  if (num_8x8_blocks_wide == num_8x8_blocks_high) {
-    mv_ref_search[5].row = -1;
-    mv_ref_search[5].col = 0;
-    mv_ref_search[6].row = 0;
-    mv_ref_search[6].col = -1;
-  } else {
-    mv_ref_search[5].row = -1;
-    mv_ref_search[5].col = num_8x8_blocks_wide;
-    mv_ref_search[6].row = num_8x8_blocks_high;
-    mv_ref_search[6].col = -1;
-  }
-#else
-  mv_ref_search[5].row = -1;
-  mv_ref_search[5].col = num_8x8_blocks_wide;
-  mv_ref_search[6].row = num_8x8_blocks_high;
-  mv_ref_search[6].col = -1;
-#endif  // CONFIG_EXT_PARTITION_TYPES
-  mv_ref_search[7].row = -1;
-  mv_ref_search[7].col = -3;
-  mv_ref_search[8].row = num_8x8_blocks_high - 1;
-  mv_ref_search[8].col = -3;
-
-#if CONFIG_CB4X4
-  for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
-    mv_ref_search[i].row *= 2;
-    mv_ref_search[i].col *= 2;
-  }
-#endif  // CONFIG_CB4X4
-
-  // The nearest 2 blocks are treated differently
-  // if the size < 8x8 we get the mv from the bmi substructure,
-  // and we also need to keep a mode count.
-  for (i = 0; i < 2; ++i) {
-    const POSITION *const mv_ref = &mv_ref_search[i];
-    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) {
-      const MODE_INFO *const candidate_mi =
-          xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
-      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
-      // Keep counts for entropy encoding.
-      context_counter += mode_2_counter[candidate->mode];
-      different_ref_found = 1;
-
-      if (candidate->ref_frame[0] == ref_frame)
-        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
-                        refmv_count, mv_ref_list, bw, bh, xd, Done);
-      else if (candidate->ref_frame[1] == ref_frame)
-        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
-                        refmv_count, mv_ref_list, bw, bh, xd, Done);
-    }
-  }
+      for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
+        const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
+        const int candidate_bsize = candidate->sb_type;
+
+        for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+          MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
+
+          for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
+            if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
+              ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
+              ++ref_id_count[cmp_idx];
+            } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
+              int_mv this_mv = candidate->mv[rf_idx];
+              if (cm->ref_frame_sign_bias[can_rf] !=
+                  cm->ref_frame_sign_bias[rf[cmp_idx]]) {
+                this_mv.as_mv.row = -this_mv.as_mv.row;
+                this_mv.as_mv.col = -this_mv.as_mv.col;
+              }
+              ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
+              ++ref_diff_count[cmp_idx];
+            }
+          }
+        }
+        idx += mi_size_high[candidate_bsize];
+      }
 
-  // Check the rest of the neighbors in much the same way
-  // as before except we don't need to keep track of sub blocks or
-  // mode counts.
-  for (; i < MVREF_NEIGHBOURS; ++i) {
-    const POSITION *const mv_ref = &mv_ref_search[i];
-    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) {
-      const MB_MODE_INFO *const candidate =
-          !xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]
-              ? NULL
-              : &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
-      if (candidate == NULL) continue;
-      if ((mi_row & (sb_mi_size - 1)) + mv_ref->row >= sb_mi_size ||
-          (mi_col & (sb_mi_size - 1)) + mv_ref->col >= sb_mi_size)
-        continue;
-      different_ref_found = 1;
-
-      if (candidate->ref_frame[0] == ref_frame)
-        ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, bw, bh, xd,
-                        Done);
-      else if (candidate->ref_frame[1] == ref_frame)
-        ADD_MV_REF_LIST(candidate->mv[1], refmv_count, mv_ref_list, bw, bh, xd,
-                        Done);
+      // Build up the compound mv predictor
+      int_mv comp_list[3][2];
+
+      for (int idx = 0; idx < 2; ++idx) {
+        int comp_idx = 0;
+        for (int list_idx = 0; list_idx < ref_id_count[idx] && comp_idx < 2;
+             ++list_idx, ++comp_idx)
+          comp_list[comp_idx][idx] = ref_id[idx][list_idx];
+        for (int list_idx = 0; list_idx < ref_diff_count[idx] && comp_idx < 2;
+             ++list_idx, ++comp_idx)
+          comp_list[comp_idx][idx] = ref_diff[idx][list_idx];
+        for (; comp_idx < 3; ++comp_idx)
+          comp_list[comp_idx][idx] = gm_mv_candidates[idx];
+      }
+
+      if (refmv_count[ref_frame]) {
+        assert(refmv_count[ref_frame] == 1);
+        if (comp_list[0][0].as_int ==
+                ref_mv_stack[ref_frame][0].this_mv.as_int &&
+            comp_list[0][1].as_int ==
+                ref_mv_stack[ref_frame][0].comp_mv.as_int) {
+          ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
+              comp_list[1][0];
+          ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
+              comp_list[1][1];
+        } else {
+          ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
+              comp_list[0][0];
+          ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
+              comp_list[0][1];
+        }
+        ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
+        ++refmv_count[ref_frame];
+      } else {
+        for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) {
+          ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
+              comp_list[idx][0];
+          ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
+              comp_list[idx][1];
+          ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
+          ++refmv_count[ref_frame];
+        }
+      }
     }
-  }
 
-// TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
-// on windows platform. The sync here is unncessary if use_perv_frame_mvs
-// is 0. But after removing it, there will be hang in the unit test on windows
-// due to several threads waiting for a thread's signal.
-#if defined(_WIN32) && !HAVE_PTHREAD_H
-  if (cm->frame_parallel_decode && sync != NULL) {
-    sync(data, mi_row);
-  }
-#endif
+    assert(refmv_count[ref_frame] >= 2);
 
-  // Check the last frame's mode and mv info.
-  if (cm->use_prev_frame_mvs) {
-    // Synchronize here for frame parallel decode if sync function is provided.
-    if (cm->frame_parallel_decode && sync != NULL) {
-      sync(data, mi_row);
+    for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
+      clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
+                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+      clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv,
+                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
     }
+  } else {
+    // Handle single reference frame extension
+    int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
+    mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
+    int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
+    mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
+    int mi_size = AOMMIN(mi_width, mi_height);
+
+    for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
+                      refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
+      const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
+      const int candidate_bsize = candidate->sb_type;
+
+      // TODO(jingning): Refactor the following code.
+      for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+        if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
+          int_mv this_mv = candidate->mv[rf_idx];
+          if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
+              cm->ref_frame_sign_bias[ref_frame]) {
+            this_mv.as_mv.row = -this_mv.as_mv.row;
+            this_mv.as_mv.col = -this_mv.as_mv.col;
+          }
+          int stack_idx;
+          for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
+            int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
+            if (this_mv.as_int == stack_mv.as_int) break;
+          }
 
-    if (prev_frame_mvs->ref_frame[0] == ref_frame) {
-      ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, bw, bh,
-                      xd, Done);
-    } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
-      ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, bw, bh,
-                      xd, Done);
-    }
-  }
+          if (stack_idx == refmv_count[ref_frame]) {
+            ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
 
-  // Since we couldn't find 2 mvs from the same reference frame
-  // go back through the neighbors and find motion vectors from
-  // different reference frames.
-  if (different_ref_found) {
-    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
-      const POSITION *mv_ref = &mv_ref_search[i];
-      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) {
-        const MB_MODE_INFO *const candidate =
-            !xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]
-                ? NULL
-                : &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
-        if (candidate == NULL) continue;
-        if ((mi_row & (sb_mi_size - 1)) + mv_ref->row >= sb_mi_size ||
-            (mi_col & (sb_mi_size - 1)) + mv_ref->col >= sb_mi_size)
-          continue;
-
-        // If the candidate is INTRA we don't want to consider its mv.
-        IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
-                                 refmv_count, mv_ref_list, bw, bh, xd, Done);
+            // TODO(jingning): Set an arbitrary small number here. The weight
+            // doesn't matter as long as it is properly initialized.
+            ref_mv_stack[ref_frame][stack_idx].weight = 2;
+            ++refmv_count[ref_frame];
+          }
+        }
       }
+      idx += mi_size_wide[candidate_bsize];
     }
-  }
 
-  // Since we still don't have a candidate we'll try the last frame.
-  if (cm->use_prev_frame_mvs) {
-    if (prev_frame_mvs->ref_frame[0] != ref_frame &&
-        prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
-      int_mv mv = prev_frame_mvs->mv[0];
-      if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
-          ref_sign_bias[ref_frame]) {
-        mv.as_mv.row *= -1;
-        mv.as_mv.col *= -1;
+    for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
+                      refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
+      const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
+      const int candidate_bsize = candidate->sb_type;
+
+      // TODO(jingning): Refactor the following code.
+      for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+        if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
+          int_mv this_mv = candidate->mv[rf_idx];
+          if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
+              cm->ref_frame_sign_bias[ref_frame]) {
+            this_mv.as_mv.row = -this_mv.as_mv.row;
+            this_mv.as_mv.col = -this_mv.as_mv.col;
+          }
+          int stack_idx;
+          for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
+            int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
+            if (this_mv.as_int == stack_mv.as_int) break;
+          }
+
+          if (stack_idx == refmv_count[ref_frame]) {
+            ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
+
+            // TODO(jingning): Set an arbitrary small number here. The weight
+            // doesn't matter as long as it is properly initialized.
+            ref_mv_stack[ref_frame][stack_idx].weight = 2;
+            ++refmv_count[ref_frame];
+          }
+        }
       }
-      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done);
+      idx += mi_size_high[candidate_bsize];
     }
 
-    if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
-        prev_frame_mvs->ref_frame[1] != ref_frame) {
-      int_mv mv = prev_frame_mvs->mv[1];
-      if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
-          ref_sign_bias[ref_frame]) {
-        mv.as_mv.row *= -1;
-        mv.as_mv.col *= -1;
-      }
-      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done);
+    for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
+      clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
+                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
     }
-  }
 
-Done:
-  if (mode_context)
-    mode_context[ref_frame] = counter_to_context[context_counter];
-  for (i = refmv_count; i < MAX_MV_REF_CANDIDATES; ++i)
-    mv_ref_list[i].as_int = zeromv.as_int;
-}
+    if (mv_ref_list != NULL) {
+      for (int idx = refmv_count[ref_frame]; idx < MAX_MV_REF_CANDIDATES; ++idx)
+        mv_ref_list[rf[0]][idx].as_int = gm_mv_candidates[0].as_int;
 
-// This function keeps a mode count for a given MB/SB
-void av1_update_mv_context(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                           MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                           int_mv *mv_ref_list, int block, int mi_row,
-                           int mi_col, int16_t *mode_context) {
-  int i, refmv_count = 0;
-  int context_counter = 0;
-  const int bw = block_size_wide[mi->mbmi.sb_type];
-  const int bh = block_size_high[mi->mbmi.sb_type];
-  const TileInfo *const tile = &xd->tile;
-  POSITION mv_ref_search[2];
-  const int num_8x8_blocks_wide = mi_size_wide[mi->mbmi.sb_type];
-  const int num_8x8_blocks_high = mi_size_high[mi->mbmi.sb_type];
-
-  mv_ref_search[0].row = num_8x8_blocks_high - 1;
-  mv_ref_search[0].col = -1;
-  mv_ref_search[1].row = -1;
-  mv_ref_search[1].col = num_8x8_blocks_wide - 1;
-
-  // Blank the reference vector list
-  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
-
-  // The nearest 2 blocks are examined only.
-  // If the size < 8x8, we get the mv from the bmi substructure;
-  for (i = 0; i < 2; ++i) {
-    const POSITION *const mv_ref = &mv_ref_search[i];
-    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, mv_ref)) {
-      const MODE_INFO *const candidate_mi =
-          xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
-      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
-
-      // Keep counts for entropy encoding.
-      context_counter += mode_2_counter[candidate->mode];
-
-      if (candidate->ref_frame[0] == ref_frame) {
-        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
-                        refmv_count, mv_ref_list, bw, bh, xd, Done);
-      } else if (candidate->ref_frame[1] == ref_frame) {
-        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
-                        refmv_count, mv_ref_list, bw, bh, xd, Done);
+      for (int idx = 0;
+           idx < AOMMIN(MAX_MV_REF_CANDIDATES, refmv_count[ref_frame]); ++idx) {
+        mv_ref_list[rf[0]][idx].as_int =
+            ref_mv_stack[ref_frame][idx].this_mv.as_int;
       }
     }
   }
-
-Done:
-
-  if (mode_context)
-    mode_context[ref_frame] = counter_to_context[context_counter];
 }
 
 void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                      uint8_t *ref_mv_count, CANDIDATE_MV *ref_mv_stack,
-                      int16_t *compound_mode_context, int_mv *mv_ref_list,
-                      int mi_row, int mi_col, find_mv_refs_sync sync,
-                      void *const data, int16_t *mode_context) {
+                      MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+                      CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+                      int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+                      int_mv *global_mvs, int mi_row, int mi_col,
+                      int16_t *mode_context) {
   int_mv zeromv[2];
-#if CONFIG_GLOBAL_MOTION
-  BLOCK_SIZE bsize = mi->mbmi.sb_type;
-#endif  // CONFIG_GLOBAL_MOTION
-  int idx, all_zero = 1;
-#if CONFIG_GLOBAL_MOTION
+  BLOCK_SIZE bsize = mi->sb_type;
   MV_REFERENCE_FRAME rf[2];
-#endif  // CONFIG_GLOBAL_MOTION
-
-  av1_update_mv_context(cm, xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col,
-                        compound_mode_context);
-
-#if CONFIG_GLOBAL_MOTION
-  if (!CONFIG_INTRABC || ref_frame != INTRA_FRAME) {
-    av1_set_ref_frame(rf, ref_frame);
-    zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[rf[0]],
-                                            cm->allow_high_precision_mv, bsize,
-                                            mi_col, mi_row, 0
-#if CONFIG_AMVR
-                                            ,
-                                            cm->cur_frame_mv_precision_level
-#endif
-                                            )
-                           .as_int;
+  av1_set_ref_frame(rf, ref_frame);
+
+  if (ref_frame < REF_FRAMES) {
+    if (ref_frame != INTRA_FRAME) {
+      global_mvs[ref_frame] = gm_get_motion_vector(
+          &cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize,
+          mi_col, mi_row, cm->cur_frame_force_integer_mv);
+    } else {
+      global_mvs[ref_frame].as_int = INVALID_MV;
+    }
+  }
+
+  if (ref_frame != INTRA_FRAME) {
+    zeromv[0].as_int =
+        gm_get_motion_vector(&cm->global_motion[rf[0]],
+                             cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+                             cm->cur_frame_force_integer_mv)
+            .as_int;
     zeromv[1].as_int =
         (rf[1] != NONE_FRAME)
             ? gm_get_motion_vector(&cm->global_motion[rf[1]],
                                    cm->allow_high_precision_mv, bsize, mi_col,
-                                   mi_row, 0
-#if CONFIG_AMVR
-                                   ,
-                                   cm->cur_frame_mv_precision_level
-#endif
-                                   )
+                                   mi_row, cm->cur_frame_force_integer_mv)
                   .as_int
             : 0;
   } else {
     zeromv[0].as_int = zeromv[1].as_int = 0;
   }
-#else
-  zeromv[0].as_int = zeromv[1].as_int = 0;
-#endif  // CONFIG_GLOBAL_MOTION
-
-  if (ref_frame <= ALTREF_FRAME)
-    find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col,
-                     sync, data, mode_context, zeromv[0]);
 
   setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack, mv_ref_list,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                    zeromv,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                    -1, mi_row, mi_col, mode_context);
-  /* Note: If global motion is enabled, then we want to set the ALL_ZERO flag
-     iff all of the MVs we could generate with NEARMV/NEARESTMV are equivalent
-     to the global motion vector.
-     Note: For the following to work properly, the encoder can't throw away
-     any global motion models after calling this function, even if they are
-     unused. Instead we rely on the recode loop: If any non-IDENTITY model
-     is unused, the whole frame will be re-encoded without it.
-     The problem is that, otherwise, we can end up in the following situation:
-     * Encoder has a global motion model with nonzero translational part,
-       and all candidate MVs are zero. So the ALL_ZERO flag is unset.
-     * Encoder throws away global motion because it is never used.
-     * Decoder sees that there is no global motion and all candidate MVs are
-       zero, so sets the ALL_ZERO flag.
-     * This leads to an encode/decode mismatch.
-  */
-  for (idx = 0; idx < AOMMIN(3, *ref_mv_count); ++idx) {
-    if (ref_mv_stack[idx].this_mv.as_int != zeromv[0].as_int) all_zero = 0;
-    if (ref_frame > ALTREF_FRAME)
-      if (ref_mv_stack[idx].comp_mv.as_int != zeromv[1].as_int) all_zero = 0;
-  }
-  if (*ref_mv_count < 2 && ref_frame <= ALTREF_FRAME) {
-    for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx)
-      if (mv_ref_list[idx].as_int != zeromv[0].as_int) all_zero = 0;
-  }
-
-#if !CONFIG_OPT_REF_MV
-  if (all_zero) mode_context[ref_frame] |= (1 << ALL_ZERO_FLAG_OFFSET);
-#else
-  (void)all_zero;
-#endif
+                    zeromv, mi_row, mi_col, mode_context);
 }
 
 void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
-                           int_mv *near_mv
-#if CONFIG_AMVR
-                           ,
-                           int is_integer
-#endif
-                           ) {
+                           int_mv *near_mv, int is_integer) {
   int i;
   // Make sure all the candidates are properly clamped etc
   for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-#if CONFIG_AMVR
     lower_mv_precision(&mvlist[i].as_mv, allow_hp, is_integer);
-#else
-    lower_mv_precision(&mvlist[i].as_mv, allow_hp);
-#endif
   }
   *nearest_mv = mvlist[0];
   *near_mv = mvlist[1];
 }
 
-void av1_append_sub8x8_mvs_for_idx(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   int block, int ref, int mi_row, int mi_col,
-                                   CANDIDATE_MV *ref_mv_stack,
-                                   uint8_t *ref_mv_count, int_mv *mv_list,
-                                   int_mv *nearest_mv, int_mv *near_mv) {
-  MODE_INFO *const mi = xd->mi[0];
-  b_mode_info *bmi = mi->bmi;
-  int n;
-  int_mv zeromv;
-  CANDIDATE_MV tmp_mv;
-  uint8_t idx;
-  uint8_t above_count = 0, left_count = 0;
-  MV_REFERENCE_FRAME rf[2] = { mi->mbmi.ref_frame[ref], NONE_FRAME };
-  *ref_mv_count = 0;
-
-  assert(MAX_MV_REF_CANDIDATES == 2);
-
-#if CONFIG_GLOBAL_MOTION
-  zeromv.as_int = gm_get_motion_vector(&cm->global_motion[rf[0]],
-                                       cm->allow_high_precision_mv,
-                                       mi->mbmi.sb_type, mi_col, mi_row, block
-#if CONFIG_AMVR
-                                       ,
-                                       cm->cur_frame_mv_precision_level
-#endif
-                                       )
-                      .as_int;
-#else
-  zeromv.as_int = 0;
-#endif
-  find_mv_refs_idx(cm, xd, mi, mi->mbmi.ref_frame[ref], mv_list, block, mi_row,
-                   mi_col, NULL, NULL, NULL, zeromv);
-
-  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, 0, ref_mv_stack,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                &zeromv,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                ref_mv_count);
-  above_count = *ref_mv_count;
-
-  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, 0, -1, ref_mv_stack,
-#if CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                &zeromv,
-#endif  // CONFIG_GLOBAL_MOTION && USE_CUR_GM_REFMV
-                ref_mv_count);
-  left_count = *ref_mv_count - above_count;
-
-  if (above_count > 1 && left_count > 0) {
-    tmp_mv = ref_mv_stack[1];
-    ref_mv_stack[1] = ref_mv_stack[above_count];
-    ref_mv_stack[above_count] = tmp_mv;
-  }
-
-  for (idx = 0; idx < *ref_mv_count; ++idx)
-    clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                 xd->n8_h << MI_SIZE_LOG2, xd);
-
-  for (idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *ref_mv_count); ++idx)
-    mv_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
-
-  near_mv->as_int = 0;
-  switch (block) {
-    case 0:
-      nearest_mv->as_int = mv_list[0].as_int;
-      near_mv->as_int = mv_list[1].as_int;
-      break;
-    case 1:
-    case 2:
-      nearest_mv->as_int = bmi[0].as_mv[ref].as_int;
-      for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n)
-        if (nearest_mv->as_int != mv_list[n].as_int) {
-          near_mv->as_int = mv_list[n].as_int;
-          break;
-        }
-      break;
-    case 3: {
-      int_mv candidates[2 + MAX_MV_REF_CANDIDATES];
-      candidates[0] = bmi[1].as_mv[ref];
-      candidates[1] = bmi[0].as_mv[ref];
-      candidates[2] = mv_list[0];
-      candidates[3] = mv_list[1];
-
-      nearest_mv->as_int = bmi[2].as_mv[ref].as_int;
-      for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n)
-        if (nearest_mv->as_int != candidates[n].as_int) {
-          near_mv->as_int = candidates[n].as_int;
-          break;
-        }
-      break;
-    }
-    default: assert(0 && "Invalid block index.");
-  }
-}
-
-#if CONFIG_FRAME_MARKER
 void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
   cm->cur_frame->cur_frame_offset = cm->frame_offset;
-  int alt_buf_idx = cm->frame_refs[ALTREF_FRAME - LAST_FRAME].idx;
-  int lst_buf_idx = cm->frame_refs[LAST_FRAME - LAST_FRAME].idx;
-  int gld_buf_idx = cm->frame_refs[GOLDEN_FRAME - LAST_FRAME].idx;
-
-#if CONFIG_EXT_REFS
-  int lst2_buf_idx = cm->frame_refs[LAST2_FRAME - LAST_FRAME].idx;
-  int lst3_buf_idx = cm->frame_refs[LAST3_FRAME - LAST_FRAME].idx;
-  int bwd_buf_idx = cm->frame_refs[BWDREF_FRAME - LAST_FRAME].idx;
-  int alt2_buf_idx = cm->frame_refs[ALTREF2_FRAME - LAST_FRAME].idx;
-#endif
-
-  if (alt_buf_idx >= 0)
-    cm->cur_frame->alt_frame_offset =
-        cm->buffer_pool->frame_bufs[alt_buf_idx].cur_frame_offset;
-
-  if (lst_buf_idx >= 0)
-    cm->cur_frame->lst_frame_offset =
-        cm->buffer_pool->frame_bufs[lst_buf_idx].cur_frame_offset;
-
-  if (gld_buf_idx >= 0)
-    cm->cur_frame->gld_frame_offset =
-        cm->buffer_pool->frame_bufs[gld_buf_idx].cur_frame_offset;
-
-#if CONFIG_EXT_REFS
-  if (lst2_buf_idx >= 0)
-    cm->cur_frame->lst2_frame_offset =
-        cm->buffer_pool->frame_bufs[lst2_buf_idx].cur_frame_offset;
-
-  if (lst3_buf_idx >= 0)
-    cm->cur_frame->lst3_frame_offset =
-        cm->buffer_pool->frame_bufs[lst3_buf_idx].cur_frame_offset;
-
-  if (bwd_buf_idx >= 0)
-    cm->cur_frame->bwd_frame_offset =
-        cm->buffer_pool->frame_bufs[bwd_buf_idx].cur_frame_offset;
-
-  if (alt2_buf_idx >= 0)
-    cm->cur_frame->alt2_frame_offset =
-        cm->buffer_pool->frame_bufs[alt2_buf_idx].cur_frame_offset;
-#endif
+
+  MV_REFERENCE_FRAME ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
+    if (buf_idx >= 0)
+      cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME] =
+          cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+  }
 }
 
-#if CONFIG_FRAME_SIGN_BIAS
 void av1_setup_frame_sign_bias(AV1_COMMON *cm) {
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
-    if (buf_idx != INVALID_IDX) {
+    if (cm->seq_params.enable_order_hint && buf_idx != INVALID_IDX) {
       const int ref_frame_offset =
           cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
       cm->ref_frame_sign_bias[ref_frame] =
-          (ref_frame_offset <= (int)cm->frame_offset) ? 0 : 1;
+          (get_relative_dist(cm, ref_frame_offset, (int)cm->frame_offset) <= 0)
+              ? 0
+              : 1;
     } else {
       cm->ref_frame_sign_bias[ref_frame] = 0;
     }
   }
 }
-#endif  // CONFIG_FRAME_SIGN_BIAS
-#endif  // CONFIG_FRAME_MARKER
-
-#if CONFIG_MFMV
-// Although we assign 32 bit integers, all the values are strictly under 14
-// bits.
-static int div_mult[32] = {
-  0,    16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
-  1489, 1365,  1260, 1170, 1092, 1024, 963,  910,  862,  819,  780,
-  744,  712,   682,  655,  630,  606,  585,  564,  546,  528,
-};
-
-// TODO(jingning): Consider the use of lookup table for (num / den)
-// altogether.
-static void get_mv_projection(MV *output, MV ref, int num, int den) {
-  output->row =
-      (int16_t)(ROUND_POWER_OF_TWO(ref.row * num * div_mult[den], 14));
-  output->col =
-      (int16_t)(ROUND_POWER_OF_TWO(ref.col * num * div_mult[den], 14));
-}
 
 #define MAX_OFFSET_WIDTH 64
-#define MAX_OFFSET_HEIGHT 32
+#define MAX_OFFSET_HEIGHT 0
 
 static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
                               int blk_col, MV mv, int sign_bias) {
-  if ((abs(mv.row) >> 3) > MAX_OFFSET_HEIGHT ||
-      (abs(mv.col) >> 3) > MAX_OFFSET_WIDTH)
-    return 0;
+  const int base_blk_row = (blk_row >> 3) << 3;
+  const int base_blk_col = (blk_col >> 3) << 3;
+
+  const int row_offset = (mv.row >= 0) ? (mv.row >> (4 + MI_SIZE_LOG2))
+                                       : -((-mv.row) >> (4 + MI_SIZE_LOG2));
+
+  const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2))
+                                       : -((-mv.col) >> (4 + MI_SIZE_LOG2));
 
-  int row = (sign_bias == 1) ? blk_row - (mv.row >> (3 + MI_SIZE_LOG2))
-                             : blk_row + (mv.row >> (3 + MI_SIZE_LOG2));
-  int col = (sign_bias == 1) ? blk_col - (mv.col >> (3 + MI_SIZE_LOG2))
-                             : blk_col + (mv.col >> (3 + MI_SIZE_LOG2));
+  int row = (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
+  int col = (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
 
-  if (row < 0 || row >= cm->mi_rows || col < 0 || col >= cm->mi_cols) return 0;
+  if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 ||
+      col >= (cm->mi_cols >> 1))
+    return 0;
+
+  if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) ||
+      row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) ||
+      col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) ||
+      col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3))
+    return 0;
 
   *mi_r = row;
   *mi_c = col;
@@ -1576,504 +955,209 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
   return 1;
 }
 
-static uint32_t mv_sign_reverse(int_mv ref) {
-  int_mv this_mv;
-  this_mv.as_mv.row = -ref.as_mv.row;
-  this_mv.as_mv.col = -ref.as_mv.col;
+static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame,
+                                   int dir) {
+  TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
+  int ref_offset[REF_FRAMES] = { 0 };
 
-  return this_mv.as_int;
-}
+  (void)dir;
 
-void av1_setup_motion_field(AV1_COMMON *cm) {
+  int ref_frame_idx = cm->frame_refs[FWD_RF_OFFSET(ref_frame)].idx;
+  if (ref_frame_idx < 0) return 0;
+
+  if (cm->buffer_pool->frame_bufs[ref_frame_idx].intra_only) return 0;
+
+  if (cm->buffer_pool->frame_bufs[ref_frame_idx].mi_rows != cm->mi_rows ||
+      cm->buffer_pool->frame_bufs[ref_frame_idx].mi_cols != cm->mi_cols)
+    return 0;
+
+  int ref_frame_index =
+      cm->buffer_pool->frame_bufs[ref_frame_idx].cur_frame_offset;
+  unsigned int *ref_rf_idx =
+      &cm->buffer_pool->frame_bufs[ref_frame_idx].ref_frame_offset[0];
   int cur_frame_index = cm->cur_frame->cur_frame_offset;
-  int lst_frame_index = 0, alt_frame_index = 0, gld_frame_index = 0;
-#if CONFIG_EXT_REFS
-  int lst2_frame_index = 0, lst3_frame_index = 0;
-  int bwd_frame_index = 0, alt2_frame_index = 0;
-#endif
-  TPL_MV_REF *tpl_mvs_base = cm->cur_frame->tpl_mvs;
-
-  for (int ref_frame = 0; ref_frame < INTER_REFS_PER_FRAME; ++ref_frame) {
-    int size = (cm->mi_rows + 16) * cm->mi_stride;
-    for (int idx = 0; idx < size; ++idx) {
-      for (int i = 0; i < MFMV_STACK_SIZE; ++i)
-        tpl_mvs_base[idx].mfmv[ref_frame][i].as_int = INVALID_MV;
-    }
+  int ref_to_cur = get_relative_dist(cm, ref_frame_index, cur_frame_index);
+
+  for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
+    ref_offset[rf] =
+        get_relative_dist(cm, ref_frame_index, ref_rf_idx[rf - LAST_FRAME]);
   }
 
-  int alt_buf_idx = cm->frame_refs[ALTREF_FRAME - LAST_FRAME].idx;
-  int lst_buf_idx = cm->frame_refs[LAST_FRAME - LAST_FRAME].idx;
-  int gld_buf_idx = cm->frame_refs[GOLDEN_FRAME - LAST_FRAME].idx;
-#if CONFIG_EXT_REFS
-  int lst2_buf_idx = cm->frame_refs[LAST2_FRAME - LAST_FRAME].idx;
-  int lst3_buf_idx = cm->frame_refs[LAST3_FRAME - LAST_FRAME].idx;
-  int bwd_buf_idx = cm->frame_refs[BWDREF_FRAME - LAST_FRAME].idx;
-  int alt2_buf_idx = cm->frame_refs[ALTREF2_FRAME - LAST_FRAME].idx;
-#endif
-
-  if (alt_buf_idx >= 0)
-    alt_frame_index = cm->buffer_pool->frame_bufs[alt_buf_idx].cur_frame_offset;
-
-  if (lst_buf_idx >= 0)
-    lst_frame_index = cm->buffer_pool->frame_bufs[lst_buf_idx].cur_frame_offset;
-
-  if (gld_buf_idx >= 0)
-    gld_frame_index = cm->buffer_pool->frame_bufs[gld_buf_idx].cur_frame_offset;
-
-#if CONFIG_EXT_REFS
-  if (lst2_buf_idx >= 0)
-    lst2_frame_index =
-        cm->buffer_pool->frame_bufs[lst2_buf_idx].cur_frame_offset;
-
-  if (lst3_buf_idx >= 0)
-    lst3_frame_index =
-        cm->buffer_pool->frame_bufs[lst3_buf_idx].cur_frame_offset;
-
-  if (bwd_buf_idx >= 0)
-    bwd_frame_index = cm->buffer_pool->frame_bufs[bwd_buf_idx].cur_frame_offset;
-
-  if (alt2_buf_idx >= 0)
-    alt2_frame_index =
-        cm->buffer_pool->frame_bufs[alt2_buf_idx].cur_frame_offset;
-#endif
-
-  if (alt_frame_index < cur_frame_index) return;
-
-  // ======================
-  // Process last frame
-  // ======================
-  if (lst_buf_idx >= 0) {
-    MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[lst_buf_idx].mvs;
-    const int lst_frame_idx =
-        cm->buffer_pool->frame_bufs[lst_buf_idx].lst_frame_offset;
-    const int alt_frame_idx =
-        cm->buffer_pool->frame_bufs[lst_buf_idx].alt_frame_offset;
-    const int gld_frame_idx =
-        cm->buffer_pool->frame_bufs[lst_buf_idx].gld_frame_offset;
-#if CONFIG_EXT_REFS
-    const int lst2_frame_idx =
-        cm->buffer_pool->frame_bufs[lst_buf_idx].lst2_frame_offset;
-    const int lst3_frame_idx =
-        cm->buffer_pool->frame_bufs[lst_buf_idx].lst3_frame_offset;
-    const int bwd_frame_idx =
-        cm->buffer_pool->frame_bufs[lst_buf_idx].bwd_frame_offset;
-    const int alt2_frame_idx =
-        cm->buffer_pool->frame_bufs[lst_buf_idx].alt2_frame_offset;
-#endif
-
-    int alt_offset = AOMMAX(1, alt_frame_idx - lst_frame_index);
-    int lst_offset = AOMMAX(1, lst_frame_index - lst_frame_idx);
-    int gld_offset = AOMMAX(1, lst_frame_index - gld_frame_idx);
-    int cur_to_lst = cur_frame_index - lst_frame_index;
-    int cur_to_alt = alt_frame_index - cur_frame_index;
-    int cur_to_gld = cur_frame_index - gld_frame_index;
-
-#if CONFIG_EXT_REFS
-    int bwd_offset = AOMMAX(1, bwd_frame_idx - lst_frame_index);
-    int alt2_offset = AOMMAX(1, alt2_frame_idx - lst_frame_index);
-    int lst2_offset = AOMMAX(1, lst_frame_index - lst2_frame_idx);
-    int lst3_offset = AOMMAX(1, lst_frame_index - lst3_frame_idx);
-    int cur_to_lst2 = cur_frame_index - lst2_frame_index;
-    int cur_to_lst3 = cur_frame_index - lst3_frame_index;
-    int cur_to_bwd = bwd_frame_index - cur_frame_index;
-    int cur_to_alt2 = alt2_frame_index - cur_frame_index;
-#endif
-
-    const int is_lst_overlay = (alt_frame_idx == gld_frame_index);
-    // clang-format off
-    const int ref_frame_offset_buffer[TOTAL_REFS_PER_FRAME] = {
-#if CONFIG_EXT_REFS
-        0, lst_offset, lst2_offset, lst3_offset, gld_offset,
-        bwd_offset, alt2_offset, alt_offset
-#else
-        0, lst_offset, gld_offset, alt_offset
-#endif
-    };
-    // clang-format on
+  if (dir == 2) ref_to_cur = -ref_to_cur;
+
+  MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[ref_frame_idx].mvs;
+  const int mvs_rows = (cm->mi_rows + 1) >> 1;
+  const int mvs_cols = (cm->mi_cols + 1) >> 1;
 
-    for (int blk_row = 0; blk_row < cm->mi_rows && !is_lst_overlay; ++blk_row) {
-      for (int blk_col = 0; blk_col < cm->mi_cols; ++blk_col) {
-        MV_REF *mv_ref = &mv_ref_base[blk_row * cm->mi_cols + blk_col];
-        MV fwd_mv = mv_ref->mv[0].as_mv;
-        MV_REFERENCE_FRAME ref_frame[2] = { mv_ref->ref_frame[0],
-                                            mv_ref->ref_frame[1] };
+  for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) {
+    for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) {
+      MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col];
+      MV fwd_mv = mv_ref->mv.as_mv;
 
-        // Derive  motion vectors toward last reference frame.
-        if (ref_frame[0] <= GOLDEN_FRAME && ref_frame[0] > INTRA_FRAME) {
-          int_mv this_mv;
-          int mi_r, mi_c;
+      if (mv_ref->ref_frame > INTRA_FRAME) {
+        int_mv this_mv;
+        int mi_r, mi_c;
+        const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
 
-          const int ref_frame_offset = ref_frame_offset_buffer[ref_frame[0]];
+        int pos_valid = abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
+                        ref_frame_offset > 0 &&
+                        abs(ref_to_cur) <= MAX_FRAME_DISTANCE;
 
-          get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst,
+        if (pos_valid) {
+          get_mv_projection(&this_mv.as_mv, fwd_mv, ref_to_cur,
                             ref_frame_offset);
-          int pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
-                                             this_mv.as_mv, 1);
-
-          if (pos_valid) {
-            int mi_offset = mi_r * cm->mi_stride + mi_c;
-            tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(LAST_FRAME)][0].as_int =
-                this_mv.as_int;
-
-#if CONFIG_EXT_REFS
-            get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst2,
-                              ref_frame_offset);
-            tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(LAST2_FRAME)][0].as_int =
-                this_mv.as_int;
-
-            get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst3,
-                              ref_frame_offset);
-            tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(LAST3_FRAME)][0].as_int =
-                this_mv.as_int;
-#endif
-            get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_gld,
-                              ref_frame_offset);
-            tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(GOLDEN_FRAME)]
-                                        [0].as_int = this_mv.as_int;
-          }
+          pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
+                                         this_mv.as_mv, dir >> 1);
         }
 
-        for (int idx = 0; idx < 2; ++idx) {
-          if (ref_frame[idx] <= GOLDEN_FRAME) continue;
-
-          int_mv this_mv;
-          int mi_r, mi_c;
-          fwd_mv = mv_ref->mv[idx].as_mv;
+        if (pos_valid) {
+          int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
 
-          const int ref_frame_offset = ref_frame_offset_buffer[ref_frame[idx]];
-
-          get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst,
-                            ref_frame_offset);
-          int pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
-                                             this_mv.as_mv, 0);
-
-          if (pos_valid) {
-            int mi_offset = mi_r * cm->mi_stride + mi_c;
-            get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_alt,
-                              ref_frame_offset);
-            tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(ALTREF_FRAME)]
-                                        [0].as_int = this_mv.as_int;
-
-#if CONFIG_EXT_REFS
-            get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_bwd,
-                              ref_frame_offset);
-            tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(BWDREF_FRAME)]
-                                        [0].as_int = this_mv.as_int;
-            get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_alt2,
-                              ref_frame_offset);
-            tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(ALTREF2_FRAME)]
-                                        [0].as_int = this_mv.as_int;
-#endif
-          }
+          tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
+          tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
+          tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset;
         }
       }
     }
   }
 
-  // =======================
-  // Process ARF frame
-  // =======================
-  if (alt_buf_idx >= 0) {
-    MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[alt_buf_idx].mvs;
-    const int lst_frame_idx =
-        cm->buffer_pool->frame_bufs[alt_buf_idx].lst_frame_offset;
-    const int gld_frame_idx =
-        cm->buffer_pool->frame_bufs[alt_buf_idx].gld_frame_offset;
-#if CONFIG_EXT_REFS
-    const int lst2_frame_idx =
-        cm->buffer_pool->frame_bufs[alt_buf_idx].lst2_frame_offset;
-    const int lst3_frame_idx =
-        cm->buffer_pool->frame_bufs[alt_buf_idx].lst3_frame_offset;
-    const int bwd_frame_idx =
-        cm->buffer_pool->frame_bufs[alt_buf_idx].bwd_frame_offset;
-    const int alt2_frame_idx =
-        cm->buffer_pool->frame_bufs[alt_buf_idx].alt2_frame_offset;
-#endif
-
-    int lst_offset = AOMMAX(1, alt_frame_index - lst_frame_idx);
-    int gld_offset = AOMMAX(1, alt_frame_index - gld_frame_idx);
-    int cur_to_alt = alt_frame_index - cur_frame_index;
-    int cur_to_lst = cur_frame_index - lst_frame_index;
-    int cur_to_gld = cur_frame_index - gld_frame_index;
-#if CONFIG_EXT_REFS
-    int bwd_offset = AOMMAX(1, alt_frame_index - bwd_frame_idx);
-    int alt2_offset = AOMMAX(1, alt_frame_index - alt2_frame_idx);
-    int lst2_offset = AOMMAX(1, alt_frame_index - lst2_frame_idx);
-    int lst3_offset = AOMMAX(1, alt_frame_index - lst3_frame_idx);
-    int cur_to_lst2 = cur_frame_index - lst2_frame_index;
-    int cur_to_lst3 = cur_frame_index - lst3_frame_index;
-    int cur_to_bwd = bwd_frame_index - cur_frame_index;
-    int cur_to_alt2 = alt2_frame_index - cur_frame_index;
-#endif
-    const int ref_stamp = FWD_RF_OFFSET(ALTREF_FRAME);
-    // clang-format off
-    const int ref_frame_offset_buffer[TOTAL_REFS_PER_FRAME] = {
-#if CONFIG_EXT_REFS
-        0, lst_offset, lst2_offset, lst3_offset, gld_offset,
-        bwd_offset, alt2_offset, 0,
-#else
-        0, lst_offset, gld_offset, 0,
-#endif
-    };
-    // clang-format on
+  return 1;
+}
 
-    for (int blk_row = 0; blk_row < cm->mi_rows; ++blk_row) {
-      for (int blk_col = 0; blk_col < cm->mi_cols; ++blk_col) {
-        MV_REF *mv_ref = &mv_ref_base[blk_row * cm->mi_cols + blk_col];
-        MV fwd_mv = mv_ref->mv[0].as_mv;
-        MV_REFERENCE_FRAME ref_frame[2] = { mv_ref->ref_frame[0],
-                                            mv_ref->ref_frame[1] };
+void av1_setup_motion_field(AV1_COMMON *cm) {
+  memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side));
+  if (!cm->seq_params.enable_order_hint) return;
+
+  TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
+  int size = ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
+  for (int idx = 0; idx < size; ++idx) {
+    tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV;
+    tpl_mvs_base[idx].ref_frame_offset = 0;
+  }
 
-        const int ref_frame_offset = ref_frame_offset_buffer[ref_frame[0]];
+  const int cur_order_hint = cm->cur_frame->cur_frame_offset;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 
-        if (ref_frame[0] <= GOLDEN_FRAME && ref_frame[0] > INTRA_FRAME) {
-          int_mv this_mv;
-          int mi_r, mi_c;
+  int ref_buf_idx[INTER_REFS_PER_FRAME];
+  int ref_order_hint[INTER_REFS_PER_FRAME];
 
-          get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_alt,
-                            ref_frame_offset);
-          int pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
-                                             this_mv.as_mv, 0);
-
-          if (pos_valid) {
-            int mi_offset = mi_r * cm->mi_stride + mi_c;
-            tpl_mvs_base[mi_offset]
-                .mfmv[FWD_RF_OFFSET(ALTREF_FRAME)][ref_stamp]
-                .as_int = mv_sign_reverse(this_mv);
-
-            get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst,
-                              ref_frame_offset);
-            tpl_mvs_base[mi_offset]
-                .mfmv[FWD_RF_OFFSET(LAST_FRAME)][ref_stamp]
-                .as_int = this_mv.as_int;
-
-#if CONFIG_EXT_REFS
-            get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_bwd,
-                              ref_frame_offset);
-            tpl_mvs_base[mi_offset]
-                .mfmv[FWD_RF_OFFSET(BWDREF_FRAME)][ref_stamp]
-                .as_int = mv_sign_reverse(this_mv);
-
-            get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_alt2,
-                              ref_frame_offset);
-            tpl_mvs_base[mi_offset]
-                .mfmv[FWD_RF_OFFSET(ALTREF2_FRAME)][ref_stamp]
-                .as_int = mv_sign_reverse(this_mv);
-
-            if (ref_frame[0] >= LAST2_FRAME) {
-              get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst2,
-                                ref_frame_offset);
-              tpl_mvs_base[mi_offset]
-                  .mfmv[FWD_RF_OFFSET(LAST2_FRAME)][ref_stamp]
-                  .as_int = this_mv.as_int;
-            }
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    const int ref_idx = ref_frame - LAST_FRAME;
+    const int buf_idx = cm->frame_refs[ref_idx].idx;
+    int order_hint = 0;
 
-            if (ref_frame[0] >= LAST3_FRAME) {
-              get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst3,
-                                ref_frame_offset);
-              tpl_mvs_base[mi_offset]
-                  .mfmv[FWD_RF_OFFSET(LAST3_FRAME)][ref_stamp]
-                  .as_int = this_mv.as_int;
-            }
-#endif
-            if (ref_frame[0] >= GOLDEN_FRAME) {
-              get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_gld,
-                                ref_frame_offset);
-              tpl_mvs_base[mi_offset]
-                  .mfmv[FWD_RF_OFFSET(GOLDEN_FRAME)][ref_stamp]
-                  .as_int = this_mv.as_int;
-            }
-          }
-        }
-      }
-    }
-  }
+    if (buf_idx >= 0) order_hint = frame_bufs[buf_idx].cur_frame_offset;
 
-// ==========================================
-// Process BWD reference frame
-// ==========================================
-#if CONFIG_EXT_REFS
-  if (bwd_buf_idx >= 0) {
-    MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[bwd_buf_idx].mvs;
-    const int lst_frame_idx =
-        cm->buffer_pool->frame_bufs[bwd_buf_idx].lst_frame_offset;
-    const int gld_frame_idx =
-        cm->buffer_pool->frame_bufs[bwd_buf_idx].gld_frame_offset;
-    const int lst2_frame_idx =
-        cm->buffer_pool->frame_bufs[bwd_buf_idx].lst2_frame_offset;
-    const int lst3_frame_idx =
-        cm->buffer_pool->frame_bufs[bwd_buf_idx].lst3_frame_offset;
-    const int bwd_frame_idx =
-        cm->buffer_pool->frame_bufs[bwd_buf_idx].bwd_frame_offset;
-    const int alt2_frame_idx =
-        cm->buffer_pool->frame_bufs[bwd_buf_idx].alt2_frame_offset;
-    const int alt_frame_idx =
-        cm->buffer_pool->frame_bufs[bwd_buf_idx].alt_frame_offset;
-
-    int lst_offset = AOMMAX(1, bwd_frame_index - lst_frame_idx);
-    int gld_offset = AOMMAX(1, bwd_frame_index - gld_frame_idx);
-    int cur_to_lst = cur_frame_index - lst_frame_index;
-
-    int lst2_offset = AOMMAX(1, bwd_frame_index - lst2_frame_idx);
-    int lst3_offset = AOMMAX(1, bwd_frame_index - lst3_frame_idx);
-    int bwd_offset = AOMMAX(1, bwd_frame_idx - bwd_frame_index);
-    int alt2_offset = AOMMAX(1, alt2_frame_idx - bwd_frame_index);
-    int alt_offset = AOMMAX(1, alt_frame_idx - bwd_frame_index);
-    int cur_to_lst2 = cur_frame_index - lst2_frame_index;
-    int cur_to_lst3 = cur_frame_index - lst3_frame_index;
-    int cur_to_gld = cur_frame_index - gld_frame_index;
-    int cur_to_bwd = bwd_frame_index - cur_frame_index;
-
-    const int ref_stamp = FWD_RF_OFFSET(BWDREF_FRAME);
-    const int ref_frame_offset_buffer[TOTAL_REFS_PER_FRAME] = {
-      0,          lst_offset, lst2_offset, lst3_offset,
-      gld_offset, bwd_offset, alt2_offset, alt_offset,
-    };
+    ref_buf_idx[ref_idx] = buf_idx;
+    ref_order_hint[ref_idx] = order_hint;
 
-    for (int blk_row = 0; blk_row < cm->mi_rows; ++blk_row) {
-      for (int blk_col = 0; blk_col < cm->mi_cols; ++blk_col) {
-        MV_REF *mv_ref = &mv_ref_base[blk_row * cm->mi_cols + blk_col];
-        MV fwd_mv = mv_ref->mv[0].as_mv;
-        MV_REFERENCE_FRAME ref_frame[2] = { mv_ref->ref_frame[0],
-                                            mv_ref->ref_frame[1] };
+    if (get_relative_dist(cm, order_hint, cur_order_hint) > 0)
+      cm->ref_frame_side[ref_frame] = 1;
+    else if (order_hint == cur_order_hint)
+      cm->ref_frame_side[ref_frame] = -1;
+  }
 
-        if (ref_frame[0] <= GOLDEN_FRAME && ref_frame[0] > INTRA_FRAME) {
-          const int ref_frame_offset = ref_frame_offset_buffer[ref_frame[0]];
-          int_mv this_mv;
-          int mi_r, mi_c;
+  int ref_stamp = MFMV_STACK_SIZE - 1;
 
-          get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_bwd,
-                            ref_frame_offset);
-          int pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
-                                             this_mv.as_mv, 0);
-
-          if (pos_valid) {
-            int mi_offset = mi_r * cm->mi_stride + mi_c;
-
-            tpl_mvs_base[mi_offset]
-                .mfmv[FWD_RF_OFFSET(BWDREF_FRAME)][ref_stamp]
-                .as_int = mv_sign_reverse(this_mv);
-
-            // Project the motion vector onto last reference frame
-            get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst,
-                              ref_frame_offset);
-            tpl_mvs_base[mi_offset]
-                .mfmv[FWD_RF_OFFSET(LAST_FRAME)][ref_stamp]
-                .as_int = this_mv.as_int;
-
-            if (ref_frame[0] >= LAST2_FRAME) {
-              get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst2,
-                                ref_frame_offset);
-              tpl_mvs_base[mi_offset]
-                  .mfmv[FWD_RF_OFFSET(LAST2_FRAME)][ref_stamp]
-                  .as_int = this_mv.as_int;
-            }
+  if (ref_buf_idx[LAST_FRAME - LAST_FRAME] >= 0) {
+    const int alt_of_lst_order_hint =
+        frame_bufs[ref_buf_idx[LAST_FRAME - LAST_FRAME]]
+            .ref_frame_offset[ALTREF_FRAME - LAST_FRAME];
 
-            if (ref_frame[0] >= LAST3_FRAME) {
-              get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_lst3,
-                                ref_frame_offset);
-              tpl_mvs_base[mi_offset]
-                  .mfmv[FWD_RF_OFFSET(LAST3_FRAME)][ref_stamp]
-                  .as_int = this_mv.as_int;
-            }
+    const int is_lst_overlay =
+        (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]);
+    if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2);
+    --ref_stamp;
+  }
 
-            if (ref_frame[0] >= GOLDEN_FRAME) {
-              get_mv_projection(&this_mv.as_mv, fwd_mv, cur_to_gld,
-                                ref_frame_offset);
-              tpl_mvs_base[mi_offset]
-                  .mfmv[FWD_RF_OFFSET(GOLDEN_FRAME)][ref_stamp]
-                  .as_int = this_mv.as_int;
-            }
-          }
-        }
-      }
-    }
+  if (get_relative_dist(cm, ref_order_hint[BWDREF_FRAME - LAST_FRAME],
+                        cur_order_hint) > 0) {
+    if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp;
+  }
+
+  if (get_relative_dist(cm, ref_order_hint[ALTREF2_FRAME - LAST_FRAME],
+                        cur_order_hint) > 0) {
+    if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp;
   }
-#endif
+
+  if (get_relative_dist(cm, ref_order_hint[ALTREF_FRAME - LAST_FRAME],
+                        cur_order_hint) > 0 &&
+      ref_stamp >= 0)
+    if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp;
+
+  if (ref_stamp >= 0 && ref_buf_idx[LAST2_FRAME - LAST_FRAME] >= 0)
+    if (motion_field_projection(cm, LAST2_FRAME, 2)) --ref_stamp;
 }
-#endif  // CONFIG_MFMV
 
-#if CONFIG_WARPED_MOTION
-#if WARPED_MOTION_SORT_SAMPLES
 static INLINE void record_samples(MB_MODE_INFO *mbmi, int *pts, int *pts_inref,
-                                  int *pts_mv, int global_offset_r,
-                                  int global_offset_c, int row_offset,
-                                  int sign_r, int col_offset, int sign_c) {
+                                  int row_offset, int sign_r, int col_offset,
+                                  int sign_c) {
   int bw = block_size_wide[mbmi->sb_type];
   int bh = block_size_high[mbmi->sb_type];
-  int cr_offset = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1;
-  int cc_offset = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1;
-  int x = cc_offset + global_offset_c;
-  int y = cr_offset + global_offset_r;
+  int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1;
+  int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1;
 
   pts[0] = (x * 8);
   pts[1] = (y * 8);
   pts_inref[0] = (x * 8) + mbmi->mv[0].as_mv.col;
   pts_inref[1] = (y * 8) + mbmi->mv[0].as_mv.row;
-  pts_mv[0] = mbmi->mv[0].as_mv.col;
-  pts_mv[1] = mbmi->mv[0].as_mv.row;
 }
 
-// Only sort pts and pts_inref, and pts_mv is not sorted.
-#define TRIM_THR 16
-int sortSamples(int *pts_mv, MV *mv, int *pts, int *pts_inref, int len) {
+// Select samples according to the motion vector difference.
+int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) {
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int thresh = clamp(AOMMAX(bw, bh), 16, 112);
   int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 };
-  int i, j, k;
-  int ret = len;
-
-  for (i = 0; i < len; ++i)
-    pts_mvd[i] =
-        abs(pts_mv[2 * i] - mv->col) + abs(pts_mv[2 * i + 1] - mv->row);
-
-  for (i = 1; i <= len - 1; ++i) {
-    for (j = 0; j < i; ++j) {
-      if (pts_mvd[j] > pts_mvd[i]) {
-        int temp, tempi, tempj, ptempi, ptempj;
-
-        temp = pts_mvd[i];
-        tempi = pts[2 * i];
-        tempj = pts[2 * i + 1];
-        ptempi = pts_inref[2 * i];
-        ptempj = pts_inref[2 * i + 1];
-
-        for (k = i; k > j; k--) {
-          pts_mvd[k] = pts_mvd[k - 1];
-          pts[2 * k] = pts[2 * (k - 1)];
-          pts[2 * k + 1] = pts[2 * (k - 1) + 1];
-          pts_inref[2 * k] = pts_inref[2 * (k - 1)];
-          pts_inref[2 * k + 1] = pts_inref[2 * (k - 1) + 1];
-        }
-
-        pts_mvd[j] = temp;
-        pts[2 * j] = tempi;
-        pts[2 * j + 1] = tempj;
-        pts_inref[2 * j] = ptempi;
-        pts_inref[2 * j + 1] = ptempj;
-        break;
-      }
-    }
+  int i, j, k, l = len;
+  int ret = 0;
+  assert(len <= LEAST_SQUARES_SAMPLES_MAX);
+
+  // Obtain the motion vector difference.
+  for (i = 0; i < len; ++i) {
+    pts_mvd[i] = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) +
+                 abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row);
+
+    if (pts_mvd[i] > thresh)
+      pts_mvd[i] = -1;
+    else
+      ret++;
   }
 
-  for (i = len - 1; i >= 1; i--) {
-    int low = (i == 1) ? 1 : AOMMAX((pts_mvd[i - 1] - pts_mvd[0]) / (i - 1), 1);
-
-    if ((pts_mvd[i] - pts_mvd[i - 1]) >= TRIM_THR * low) ret = i;
+  // Keep at least 1 sample.
+  if (!ret) return 1;
+
+  i = 0;
+  j = l - 1;
+  for (k = 0; k < l - ret; k++) {
+    while (pts_mvd[i] != -1) i++;
+    while (pts_mvd[j] == -1) j--;
+    assert(i != j);
+    if (i > j) break;
+
+    // Replace the discarded samples;
+    pts_mvd[i] = pts_mvd[j];
+    pts[2 * i] = pts[2 * j];
+    pts[2 * i + 1] = pts[2 * j + 1];
+    pts_inref[2 * i] = pts_inref[2 * j];
+    pts_inref[2 * i + 1] = pts_inref[2 * j + 1];
+    i++;
+    j--;
   }
 
-  if (ret > LEAST_SQUARES_SAMPLES_MAX) ret = LEAST_SQUARES_SAMPLES_MAX;
   return ret;
 }
 
 // Note: Samples returned are at 1/8-pel precision
+// Sample are the neighbor block center point's coordinates relative to the
+// left-top pixel of current block.
 int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
-                int *pts, int *pts_inref, int *pts_mv) {
-  MB_MODE_INFO *const mbmi0 = &(xd->mi[0]->mbmi);
+                int *pts, int *pts_inref) {
+  MB_MODE_INFO *const mbmi0 = xd->mi[0];
   int ref_frame = mbmi0->ref_frame[0];
   int up_available = xd->up_available;
   int left_available = xd->left_available;
   int i, mi_step = 1, np = 0;
-  int global_offset_c = mi_col * MI_SIZE;
-  int global_offset_r = mi_row * MI_SIZE;
 
   const TileInfo *const tile = &xd->tile;
   int do_tl = 1;
@@ -2082,8 +1166,7 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
   // scan the nearest above rows
   if (up_available) {
     int mi_row_offset = -1;
-    MODE_INFO *mi = xd->mi[mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *mbmi = &mi->mbmi;
+    MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride];
     uint8_t n8_w = mi_size_wide[mbmi->sb_type];
 
     if (xd->n8_w <= n8_w) {
@@ -2094,42 +1177,38 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
       if (col_offset + n8_w > xd->n8_w) do_tr = 0;
 
       if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
-        record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
-                       global_offset_c, 0, -1, col_offset, 1);
+        record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
         pts += 2;
         pts_inref += 2;
-        pts_mv += 2;
         np++;
+        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
     } else {
       // Handle "current block width > above block width" case.
       for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
         int mi_col_offset = i;
-        mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-        mbmi = &mi->mbmi;
+        mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
         n8_w = mi_size_wide[mbmi->sb_type];
         mi_step = AOMMIN(xd->n8_w, n8_w);
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
-          record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
-                         global_offset_c, 0, -1, i, 1);
+          record_samples(mbmi, pts, pts_inref, 0, -1, i, 1);
           pts += 2;
           pts_inref += 2;
-          pts_mv += 2;
           np++;
+          if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
         }
       }
     }
   }
-  assert(2 * np <= SAMPLES_ARRAY_SIZE);
+  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 
   // scan the nearest left columns
   if (left_available) {
     int mi_col_offset = -1;
 
-    MODE_INFO *mi = xd->mi[mi_col_offset];
-    MB_MODE_INFO *mbmi = &mi->mbmi;
+    MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
     uint8_t n8_h = mi_size_high[mbmi->sb_type];
 
     if (xd->n8_h <= n8_h) {
@@ -2139,182 +1218,329 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
       if (row_offset < 0) do_tl = 0;
 
       if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
-        record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
-                       global_offset_c, row_offset, 1, 0, -1);
+        record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1);
         pts += 2;
         pts_inref += 2;
-        pts_mv += 2;
         np++;
+        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
     } else {
       // Handle "current block height > above block height" case.
       for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
         int mi_row_offset = i;
-        mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-        mbmi = &mi->mbmi;
+        mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
         n8_h = mi_size_high[mbmi->sb_type];
         mi_step = AOMMIN(xd->n8_h, n8_h);
 
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
-          record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
-                         global_offset_c, i, 1, 0, -1);
+          record_samples(mbmi, pts, pts_inref, i, 1, 0, -1);
           pts += 2;
           pts_inref += 2;
-          pts_mv += 2;
           np++;
+          if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
         }
       }
     }
   }
-  assert(2 * np <= SAMPLES_ARRAY_SIZE);
+  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 
   // Top-left block
   if (do_tl && left_available && up_available) {
     int mi_row_offset = -1;
     int mi_col_offset = -1;
 
-    MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *mbmi = &mi->mbmi;
+    MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
 
     if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
-      record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
-                     global_offset_c, 0, -1, 0, -1);
+      record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1);
       pts += 2;
       pts_inref += 2;
-      pts_mv += 2;
       np++;
+      if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
     }
   }
-  assert(2 * np <= SAMPLES_ARRAY_SIZE);
+  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 
   // Top-right block
   if (do_tr &&
       has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n8_w, xd->n8_h))) {
     POSITION trb_pos = { -1, xd->n8_w };
 
-    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, cm, &trb_pos)) {
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &trb_pos)) {
       int mi_row_offset = -1;
       int mi_col_offset = xd->n8_w;
 
-      MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-      MB_MODE_INFO *mbmi = &mi->mbmi;
+      MB_MODE_INFO *mbmi =
+          xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
 
       if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
-        record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
-                       global_offset_c, 0, -1, xd->n8_w, 1);
+        record_samples(mbmi, pts, pts_inref, 0, -1, xd->n8_w, 1);
         np++;
+        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
     }
   }
-  assert(2 * np <= SAMPLES_ARRAY_SIZE);
+  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 
   return np;
 }
-#else
-void calc_projection_samples(MB_MODE_INFO *const mbmi, int x, int y,
-                             int *pts_inref) {
-  pts_inref[0] = (x * 8) + mbmi->mv[0].as_mv.col;
-  pts_inref[1] = (y * 8) + mbmi->mv[0].as_mv.row;
+
+void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
+  cm->is_skip_mode_allowed = 0;
+  cm->ref_frame_idx_0 = cm->ref_frame_idx_1 = INVALID_IDX;
+
+  if (!cm->seq_params.enable_order_hint || frame_is_intra_only(cm) ||
+      cm->reference_mode == SINGLE_REFERENCE)
+    return;
+
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  const int cur_frame_offset = cm->frame_offset;
+  int ref_frame_offset[2] = { -1, INT_MAX };
+  int ref_idx[2] = { INVALID_IDX, INVALID_IDX };
+
+  // Identify the nearest forward and backward references.
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    const int buf_idx = cm->frame_refs[i].idx;
+    if (buf_idx == INVALID_IDX) continue;
+
+    const int ref_offset = frame_bufs[buf_idx].cur_frame_offset;
+    if (get_relative_dist(cm, ref_offset, cur_frame_offset) < 0) {
+      // Forward reference
+      if (ref_frame_offset[0] == -1 ||
+          get_relative_dist(cm, ref_offset, ref_frame_offset[0]) > 0) {
+        ref_frame_offset[0] = ref_offset;
+        ref_idx[0] = i;
+      }
+    } else if (get_relative_dist(cm, ref_offset, cur_frame_offset) > 0) {
+      // Backward reference
+      if (ref_frame_offset[1] == INT_MAX ||
+          get_relative_dist(cm, ref_offset, ref_frame_offset[1]) < 0) {
+        ref_frame_offset[1] = ref_offset;
+        ref_idx[1] = i;
+      }
+    }
+  }
+
+  if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) {
+    // == Bi-directional prediction ==
+    cm->is_skip_mode_allowed = 1;
+    cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+    cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+  } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) {
+    // == Forward prediction only ==
+    // Identify the second nearest forward reference.
+    ref_frame_offset[1] = -1;
+    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+      const int buf_idx = cm->frame_refs[i].idx;
+      if (buf_idx == INVALID_IDX) continue;
+
+      const int ref_offset = frame_bufs[buf_idx].cur_frame_offset;
+      if ((ref_frame_offset[0] != -1 &&
+           get_relative_dist(cm, ref_offset, ref_frame_offset[0]) < 0) &&
+          (ref_frame_offset[1] == -1 ||
+           get_relative_dist(cm, ref_offset, ref_frame_offset[1]) > 0)) {
+        // Second closest forward reference
+        ref_frame_offset[1] = ref_offset;
+        ref_idx[1] = i;
+      }
+    }
+    if (ref_frame_offset[1] != -1) {
+      cm->is_skip_mode_allowed = 1;
+      cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+      cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+    }
+  }
 }
 
-// Note: Samples returned are at 1/8-pel precision
-int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
-                int *pts, int *pts_inref) {
-  MB_MODE_INFO *const mbmi0 = &(xd->mi[0]->mbmi);
-  int ref_frame = mbmi0->ref_frame[0];
-  int up_available = xd->up_available;
-  int left_available = xd->left_available;
-  int i, mi_step, np = 0;
-  int global_offset_c = mi_col * MI_SIZE;
-  int global_offset_r = mi_row * MI_SIZE;
+typedef struct {
+  int map_idx;   // frame map index
+  int buf_idx;   // frame buffer index
+  int sort_idx;  // index based on the offset to be used for sorting
+} REF_FRAME_INFO;
+
+static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
+  const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a;
+  const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b;
+
+  if (info_a->sort_idx < info_b->sort_idx) return -1;
+  if (info_a->sort_idx > info_b->sort_idx) return 1;
+  return (info_a->map_idx < info_b->map_idx)
+             ? -1
+             : ((info_a->map_idx > info_b->map_idx) ? 1 : 0);
+}
 
-  // scan the above row
-  if (up_available) {
-    for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
-      int mi_row_offset = -1;
-      int mi_col_offset = i;
+static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx,
+                               REF_FRAME_INFO *ref_info) {
+  assert(frame_idx >= 0 && frame_idx <= INTER_REFS_PER_FRAME);
 
-      MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-      MB_MODE_INFO *mbmi = &mi->mbmi;
+  const int buf_idx = ref_info->buf_idx;
 
-      mi_step = AOMMIN(xd->n8_w, mi_size_wide[mbmi->sb_type]);
+  cm->frame_refs[frame_idx].idx = buf_idx;
+  cm->frame_refs[frame_idx].buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+  cm->frame_refs[frame_idx].map_idx = ref_info->map_idx;
+}
 
-      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
-        int bw = block_size_wide[mbmi->sb_type];
-        int bh = block_size_high[mbmi->sb_type];
-        int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
-        int cc_offset = i * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1;
-        int x = cc_offset + global_offset_c;
-        int y = cr_offset + global_offset_r;
-
-        pts[0] = (x * 8);
-        pts[1] = (y * 8);
-        calc_projection_samples(mbmi, x, y, pts_inref);
-        pts += 2;
-        pts_inref += 2;
-        np++;
-        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
-      }
+void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
+                        int gld_map_idx) {
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = pool->frame_bufs;
+
+  int lst_frame_sort_idx = -1;
+  int gld_frame_sort_idx = -1;
+
+  assert(cm->seq_params.enable_order_hint);
+  assert(cm->seq_params.order_hint_bits_minus_1 >= 0);
+  const int cur_frame_offset = (int)cm->frame_offset;
+  const int cur_frame_sort_idx = 1 << cm->seq_params.order_hint_bits_minus_1;
+
+  REF_FRAME_INFO ref_frame_info[REF_FRAMES];
+  int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 };
+
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    const int map_idx = i;
+
+    ref_frame_info[i].map_idx = map_idx;
+    ref_frame_info[i].sort_idx = -1;
+
+    const int buf_idx = cm->ref_frame_map[map_idx];
+    ref_frame_info[i].buf_idx = buf_idx;
+
+    if (buf_idx < 0 || buf_idx >= FRAME_BUFFERS) continue;
+    // TODO(zoeliu@google.com): To verify the checking on ref_count.
+    if (frame_bufs[buf_idx].ref_count <= 0) continue;
+
+    const int offset = (int)frame_bufs[buf_idx].cur_frame_offset;
+    ref_frame_info[i].sort_idx =
+        (offset == -1) ? -1
+                       : cur_frame_sort_idx +
+                             get_relative_dist(cm, offset, cur_frame_offset);
+    assert(ref_frame_info[i].sort_idx >= -1);
+
+    if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx;
+    if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx;
+  }
+
+  // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference
+  // frames.
+  if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Inter frame requests a look-ahead frame as LAST");
+  }
+  if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Inter frame requests a look-ahead frame as GOLDEN");
+  }
+
+  // Sort ref frames based on their frame_offset values.
+  qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO),
+        compare_ref_frame_info);
+
+  // Identify forward and backward reference frames.
+  // Forward  reference: offset < cur_frame_offset
+  // Backward reference: offset >= cur_frame_offset
+  int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1;
+
+  for (int i = 0; i < REF_FRAMES; i++) {
+    if (ref_frame_info[i].sort_idx == -1) {
+      fwd_start_idx++;
+      continue;
+    }
+
+    if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) {
+      fwd_end_idx = i - 1;
+      break;
     }
   }
-  assert(2 * np <= SAMPLES_ARRAY_SIZE);
 
-  // scan the left column
-  if (left_available) {
-    for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
-      int mi_row_offset = i;
-      int mi_col_offset = -1;
+  int bwd_start_idx = fwd_end_idx + 1;
+  int bwd_end_idx = REF_FRAMES - 1;
 
-      MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-      MB_MODE_INFO *mbmi = &mi->mbmi;
+  // === Backward Reference Frames ===
 
-      mi_step = AOMMIN(xd->n8_h, mi_size_high[mbmi->sb_type]);
+  // == ALTREF_FRAME ==
+  if (bwd_start_idx <= bwd_end_idx) {
+    set_ref_frame_info(cm, ALTREF_FRAME - LAST_FRAME,
+                       &ref_frame_info[bwd_end_idx]);
+    ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1;
+    bwd_end_idx--;
+  }
 
-      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
-        int bw = block_size_wide[mbmi->sb_type];
-        int bh = block_size_high[mbmi->sb_type];
-        int cr_offset = i * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1;
-        int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
-        int x = cc_offset + global_offset_c;
-        int y = cr_offset + global_offset_r;
-
-        pts[0] = (x * 8);
-        pts[1] = (y * 8);
-        calc_projection_samples(mbmi, x, y, pts_inref);
-        pts += 2;
-        pts_inref += 2;
-        np++;
-        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
-      }
+  // == BWDREF_FRAME ==
+  if (bwd_start_idx <= bwd_end_idx) {
+    set_ref_frame_info(cm, BWDREF_FRAME - LAST_FRAME,
+                       &ref_frame_info[bwd_start_idx]);
+    ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1;
+    bwd_start_idx++;
+  }
+
+  // == ALTREF2_FRAME ==
+  if (bwd_start_idx <= bwd_end_idx) {
+    set_ref_frame_info(cm, ALTREF2_FRAME - LAST_FRAME,
+                       &ref_frame_info[bwd_start_idx]);
+    ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1;
+  }
+
+  // === Forward Reference Frames ===
+
+  for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) {
+    // == LAST_FRAME ==
+    if (ref_frame_info[i].map_idx == lst_map_idx) {
+      set_ref_frame_info(cm, LAST_FRAME - LAST_FRAME, &ref_frame_info[i]);
+      ref_flag_list[LAST_FRAME - LAST_FRAME] = 1;
+    }
+
+    // == GOLDEN_FRAME ==
+    if (ref_frame_info[i].map_idx == gld_map_idx) {
+      set_ref_frame_info(cm, GOLDEN_FRAME - LAST_FRAME, &ref_frame_info[i]);
+      ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1;
     }
   }
-  assert(2 * np <= SAMPLES_ARRAY_SIZE);
 
-  if (left_available && up_available) {
-    int mi_row_offset = -1;
-    int mi_col_offset = -1;
+  assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 &&
+         ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1);
 
-    MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *mbmi = &mi->mbmi;
+  // == LAST2_FRAME ==
+  // == LAST3_FRAME ==
+  // == BWDREF_FRAME ==
+  // == ALTREF2_FRAME ==
+  // == ALTREF_FRAME ==
 
-    if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
-      int bw = block_size_wide[mbmi->sb_type];
-      int bh = block_size_high[mbmi->sb_type];
-      int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
-      int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
-      int x = cc_offset + global_offset_c;
-      int y = cr_offset + global_offset_r;
-
-      pts[0] = (x * 8);
-      pts[1] = (y * 8);
-      calc_projection_samples(mbmi, x, y, pts_inref);
-      np++;
+  // Set up the reference frames in the anti-chronological order.
+  static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = {
+    LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME
+  };
+
+  int ref_idx;
+  for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
+    const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
+
+    if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
+
+    while (fwd_start_idx <= fwd_end_idx &&
+           (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx ||
+            ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) {
+      fwd_end_idx--;
     }
+    if (fwd_start_idx > fwd_end_idx) break;
+
+    set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+                       &ref_frame_info[fwd_end_idx]);
+    ref_flag_list[ref_frame - LAST_FRAME] = 1;
+
+    fwd_end_idx--;
   }
-  assert(2 * np <= SAMPLES_ARRAY_SIZE);
 
-  return np;
+  // Assign all the remaining frame(s), if any, to the earliest reference frame.
+  for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
+    const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
+    if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
+    set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+                       &ref_frame_info[fwd_start_idx]);
+    ref_flag_list[ref_frame - LAST_FRAME] = 1;
+  }
+
+  for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+    assert(ref_flag_list[i] == 1);
+  }
 }
-#endif  // WARPED_MOTION_SORT_SAMPLES
-#endif  // CONFIG_WARPED_MOTION
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
index 348887e43..716b4a247 100644
--- a/third_party/aom/av1/common/mvref_common.h
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -18,103 +18,36 @@
 extern "C" {
 #endif
 
-#define MVREF_NEIGHBOURS 9
-#define MVREF_ROWS 3
-#define MVREF_COLS 4
+#define MVREF_ROW_COLS 3
+
+// Set the upper limit of the motion vector component magnitude.
+// This would make a motion vector fit in 26 bits. Plus 3 bits for the
+// reference frame index. A tuple of motion vector can hence be stored within
+// 32 bit range for efficient load/store operations.
+#define REFMVS_LIMIT ((1 << 12) - 1)
 
 typedef struct position {
   int row;
   int col;
 } POSITION;
 
-typedef enum {
-  BOTH_ZERO = 0,
-  ZERO_PLUS_PREDICTED = 1,
-  BOTH_PREDICTED = 2,
-  NEW_PLUS_NON_INTRA = 3,
-  BOTH_NEW = 4,
-  INTRA_PLUS_NON_INTRA = 5,
-  BOTH_INTRA = 6,
-  INVALID_CASE = 9
-} motion_vector_context;
-
-// This is used to figure out a context for the ref blocks. The code flattens
-// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
-// adding 9 for each intra block, 3 for each zero mv and 1 for each new
-// motion vector. This single number is then converted into a context
-// with a single lookup ( counter_to_context ).
-static const int mode_2_counter[] = {
-  9,  // DC_PRED
-  9,  // V_PRED
-  9,  // H_PRED
-  9,  // D45_PRED
-  9,  // D135_PRED
-  9,  // D117_PRED
-  9,  // D153_PRED
-  9,  // D207_PRED
-  9,  // D63_PRED
-  9,  // SMOOTH_PRED
-#if CONFIG_SMOOTH_HV
-  9,    // SMOOTH_V_PRED
-  9,    // SMOOTH_H_PRED
-#endif  // CONFIG_SMOOTH_HV
-  9,    // TM_PRED
-  0,    // NEARESTMV
-  0,    // NEARMV
-  3,    // ZEROMV
-  1,    // NEWMV
-#if CONFIG_COMPOUND_SINGLEREF
-  0,    // SR_NEAREST_NEARMV
-        //  1,    // SR_NEAREST_NEWMV
-  1,    // SR_NEAR_NEWMV
-  3,    // SR_ZERO_NEWMV
-  1,    // SR_NEW_NEWMV
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  0,    // NEAREST_NEARESTMV
-  0,    // NEAR_NEARMV
-  1,    // NEAREST_NEWMV
-  1,    // NEW_NEARESTMV
-  1,    // NEAR_NEWMV
-  1,    // NEW_NEARMV
-  3,    // ZERO_ZEROMV
-  1,    // NEW_NEWMV
-};
+// clamp_mv_ref
+#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
 
-// There are 3^3 different combinations of 3 counts that can be either 0,1 or
-// 2. However the actual count can never be greater than 2 so the highest
-// counter we need is 18. 9 is an invalid counter that's never used.
-static const int counter_to_context[19] = {
-  BOTH_PREDICTED,        // 0
-  NEW_PLUS_NON_INTRA,    // 1
-  BOTH_NEW,              // 2
-  ZERO_PLUS_PREDICTED,   // 3
-  NEW_PLUS_NON_INTRA,    // 4
-  INVALID_CASE,          // 5
-  BOTH_ZERO,             // 6
-  INVALID_CASE,          // 7
-  INVALID_CASE,          // 8
-  INTRA_PLUS_NON_INTRA,  // 9
-  INTRA_PLUS_NON_INTRA,  // 10
-  INVALID_CASE,          // 11
-  INTRA_PLUS_NON_INTRA,  // 12
-  INVALID_CASE,          // 13
-  INVALID_CASE,          // 14
-  INVALID_CASE,          // 15
-  INVALID_CASE,          // 16
-  INVALID_CASE,          // 17
-  BOTH_INTRA             // 18
-};
+static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) {
+  if (!cm->seq_params.enable_order_hint) return 0;
 
-static const int idx_n_column_to_subblock[4][2] = {
-  { 1, 2 }, { 1, 3 }, { 3, 2 }, { 3, 3 }
-};
+  const int bits = cm->seq_params.order_hint_bits_minus_1 + 1;
 
-// clamp_mv_ref
-#if CONFIG_EXT_PARTITION
-#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
-#else
-#define MV_BORDER (8 << 3)  // Allow 8 pels in 1/8th pel units
-#endif                      // CONFIG_EXT_PARTITION
+  assert(bits >= 1);
+  assert(a >= 0 && a < (1 << bits));
+  assert(b >= 0 && b < (1 << bits));
+
+  int diff = a - b;
+  int m = 1 << (bits - 1);
+  diff = (diff & (m - 1)) - (diff & m);
+  return diff;
+}
 
 static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
   clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
@@ -125,19 +58,16 @@ static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
 
 // This function returns either the appropriate sub block or block's mv
 // on whether the block_size < 8x8 and we have check_sub_blocks set.
-static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
-                                      int search_col, int block_idx) {
+static INLINE int_mv get_sub_block_mv(const MB_MODE_INFO *candidate,
+                                      int which_mv, int search_col) {
   (void)search_col;
-  (void)block_idx;
-  return candidate->mbmi.mv[which_mv];
+  return candidate->mv[which_mv];
 }
 
-static INLINE int_mv get_sub_block_pred_mv(const MODE_INFO *candidate,
-                                           int which_mv, int search_col,
-                                           int block_idx) {
+static INLINE int_mv get_sub_block_pred_mv(const MB_MODE_INFO *candidate,
+                                           int which_mv, int search_col) {
   (void)search_col;
-  (void)block_idx;
-  return candidate->mbmi.mv[which_mv];
+  return candidate->mv[which_mv];
 }
 
 // Performs mv sign inversion if indicated by the reference frame combination.
@@ -152,48 +82,11 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
   return mv;
 }
 
-#define CLIP_IN_ADD(mv, bw, bh, xd) clamp_mv_ref(mv, bw, bh, xd)
-
-// This macro is used to add a motion vector mv_ref list if it isn't
-// already in the list.  If it's the second motion vector it will also
-// skip all additional processing and jump to done!
-#define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done)      \
-  do {                                                                       \
-    (mv_ref_list)[(refmv_count)] = (mv);                                     \
-    CLIP_IN_ADD(&(mv_ref_list)[(refmv_count)].as_mv, (bw), (bh), (xd));      \
-    if (refmv_count && (mv_ref_list)[1].as_int != (mv_ref_list)[0].as_int) { \
-      (refmv_count) = 2;                                                     \
-      goto Done;                                                             \
-    }                                                                        \
-    (refmv_count) = 1;                                                       \
-  } while (0)
-
-// If either reference frame is different, not INTRA, and they
-// are different from each other scale and add the mv to our list.
-#define IF_DIFF_REF_FRAME_ADD_MV(mbmi, ref_frame, ref_sign_bias, refmv_count, \
-                                 mv_ref_list, bw, bh, xd, Done)               \
-  do {                                                                        \
-    if (is_inter_block(mbmi)) {                                               \
-      if ((mbmi)->ref_frame[0] != ref_frame)                                  \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias),        \
-                        refmv_count, mv_ref_list, bw, bh, xd, Done);          \
-      if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != ref_frame)          \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias),        \
-                        refmv_count, mv_ref_list, bw, bh, xd, Done);          \
-    }                                                                         \
-  } while (0)
-
 // Checks that the given mi_row, mi_col and search point
 // are inside the borders of the tile.
 static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
-                            int mi_rows, const AV1_COMMON *cm,
-                            const POSITION *mi_pos) {
-#if CONFIG_DEPENDENT_HORZTILES
-  const int dependent_horz_tile_flag = cm->dependent_horz_tiles;
-#else
+                            int mi_rows, const POSITION *mi_pos) {
   const int dependent_horz_tile_flag = 0;
-  (void)cm;
-#endif
   if (dependent_horz_tile_flag && !tile->tg_horz_boundary) {
     return !(mi_row + mi_pos->row < 0 ||
              mi_col + mi_pos->col < tile->mi_col_start ||
@@ -208,14 +101,8 @@ static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
 }
 
 static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row,
-                                        int mi_rows, const AV1_COMMON *cm,
-                                        int row_offset) {
-#if CONFIG_DEPENDENT_HORZTILES
-  const int dependent_horz_tile_flag = cm->dependent_horz_tiles;
-#else
+                                        int mi_rows, int row_offset) {
   const int dependent_horz_tile_flag = 0;
-  (void)cm;
-#endif
   if (dependent_horz_tile_flag && !tile->tg_horz_boundary)
     return clamp(row_offset, -mi_row, mi_rows - mi_row - 1);
   else
@@ -229,87 +116,49 @@ static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col,
                tile->mi_col_end - mi_col - 1);
 }
 
-static INLINE void lower_mv_precision(MV *mv, int allow_hp
-#if CONFIG_AMVR
-                                      ,
-                                      int is_integer
-#endif
-                                      ) {
-#if CONFIG_AMVR
+static INLINE void lower_mv_precision(MV *mv, int allow_hp, int is_integer) {
   if (is_integer) {
     integer_mv_precision(mv);
   } else {
-#endif
     if (!allow_hp) {
       if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
       if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
     }
-#if CONFIG_AMVR
   }
-#endif
-}
-
-static INLINE uint8_t av1_get_pred_diff_ctx(const int_mv pred_mv,
-                                            const int_mv this_mv) {
-  if (abs(this_mv.as_mv.row - pred_mv.as_mv.row) <= 4 &&
-      abs(this_mv.as_mv.col - pred_mv.as_mv.col) <= 4)
-    return 2;
-  else
-    return 1;
-}
-
-static INLINE int av1_nmv_ctx(const uint8_t ref_mv_count,
-                              const CANDIDATE_MV *ref_mv_stack, int ref,
-                              int ref_mv_idx) {
-  if (ref_mv_stack[ref_mv_idx].weight >= REF_CAT_LEVEL && ref_mv_count > 0)
-    return ref_mv_stack[ref_mv_idx].pred_diff[ref];
-
-  return 0;
 }
 
-#if CONFIG_EXT_COMP_REFS
-static INLINE int8_t av1_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) {
+static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) {
   // Single ref pred
   if (rf[1] <= INTRA_FRAME) return -1;
 
   // Bi-directional comp ref pred
   if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1;
 
-  for (int8_t ref_idx = 0; ref_idx < UNIDIR_COMP_REFS; ++ref_idx) {
+  for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) {
     if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx))
       return ref_idx;
   }
   return -1;
 }
-#endif  // CONFIG_EXT_COMP_REFS
 
 static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
   if (rf[1] > INTRA_FRAME) {
-#if CONFIG_EXT_COMP_REFS
-    int8_t uni_comp_ref_idx = av1_uni_comp_ref_idx(rf);
-#if !USE_UNI_COMP_REFS
-    // NOTE: uni-directional comp refs disabled
-    assert(uni_comp_ref_idx < 0);
-#endif  // !USE_UNI_COMP_REFS
+    const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf);
     if (uni_comp_ref_idx >= 0) {
-      assert((TOTAL_REFS_PER_FRAME + FWD_REFS * BWD_REFS + uni_comp_ref_idx) <
+      assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx) <
              MODE_CTX_REF_FRAMES);
-      return TOTAL_REFS_PER_FRAME + FWD_REFS * BWD_REFS + uni_comp_ref_idx;
+      return REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx;
     } else {
-#endif  // CONFIG_EXT_COMP_REFS
-      return TOTAL_REFS_PER_FRAME + FWD_RF_OFFSET(rf[0]) +
+      return REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
              BWD_RF_OFFSET(rf[1]) * FWD_REFS;
-#if CONFIG_EXT_COMP_REFS
     }
-#endif  // CONFIG_EXT_COMP_REFS
   }
 
   return rf[0];
 }
 
 // clang-format off
-static MV_REFERENCE_FRAME ref_frame_map[COMP_REFS][2] = {
-#if CONFIG_EXT_REFS
+static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = {
   { LAST_FRAME, BWDREF_FRAME },  { LAST2_FRAME, BWDREF_FRAME },
   { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME },
 
@@ -317,58 +166,51 @@ static MV_REFERENCE_FRAME ref_frame_map[COMP_REFS][2] = {
   { LAST3_FRAME, ALTREF2_FRAME }, { GOLDEN_FRAME, ALTREF2_FRAME },
 
   { LAST_FRAME, ALTREF_FRAME },  { LAST2_FRAME, ALTREF_FRAME },
-  { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }
-
-  // TODO(zoeliu): Temporarily disable uni-directional comp refs
-#if CONFIG_EXT_COMP_REFS
-  , { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME },
-  { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }
-  // TODO(zoeliu): When ALTREF2 is enabled, we may add:
-  //               {BWDREF_FRAME, ALTREF2_FRAME}
-#endif  // CONFIG_EXT_COMP_REFS
-#else  // !CONFIG_EXT_REFS
-  { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }
-#endif  // CONFIG_EXT_REFS
+  { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME },
+
+  { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME },
+  { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME },
+
+  // NOTE: Following reference frame pairs are not supported to be explicitly
+  //       signalled, but they are possibly chosen by the use of skip_mode,
+  //       which may use the most recent one-sided reference frame pair.
+  { LAST2_FRAME, LAST3_FRAME }, { LAST2_FRAME, GOLDEN_FRAME },
+  { LAST3_FRAME, GOLDEN_FRAME }, {BWDREF_FRAME, ALTREF2_FRAME},
+  { ALTREF2_FRAME, ALTREF_FRAME }
 };
 // clang-format on
 
 static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf,
                                      int8_t ref_frame_type) {
-  if (ref_frame_type >= TOTAL_REFS_PER_FRAME) {
-    rf[0] = ref_frame_map[ref_frame_type - TOTAL_REFS_PER_FRAME][0];
-    rf[1] = ref_frame_map[ref_frame_type - TOTAL_REFS_PER_FRAME][1];
+  if (ref_frame_type >= REF_FRAMES) {
+    rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0];
+    rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1];
   } else {
     rf[0] = ref_frame_type;
     rf[1] = NONE_FRAME;
-#if CONFIG_INTRABC
     assert(ref_frame_type > NONE_FRAME);
-#else
-    assert(ref_frame_type > INTRA_FRAME);
-#endif
-    assert(ref_frame_type < TOTAL_REFS_PER_FRAME);
   }
 }
 
+static uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = {
+  { 0, 1, 1, 1, 1 },
+  { 1, 2, 3, 4, 4 },
+  { 4, 4, 5, 6, 7 },
+};
+
 static INLINE int16_t av1_mode_context_analyzer(
-    const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf,
-    BLOCK_SIZE bsize, int block) {
-  int16_t mode_ctx = 0;
-  int8_t ref_frame_type = av1_ref_frame_type(rf);
-
-  if (block >= 0) {
-    mode_ctx = mode_context[rf[0]] & 0x00ff;
-#if !CONFIG_CB4X4
-    if (block > 0 && bsize < BLOCK_8X8 && bsize > BLOCK_4X4)
-      mode_ctx |= (1 << SKIP_NEARESTMV_SUB8X8_OFFSET);
-#else
-    (void)block;
-    (void)bsize;
-#endif
+    const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) {
+  const int8_t ref_frame = av1_ref_frame_type(rf);
 
-    return mode_ctx;
-  }
+  if (rf[1] <= INTRA_FRAME) return mode_context[ref_frame];
+
+  const int16_t newmv_ctx = mode_context[ref_frame] & NEWMV_CTX_MASK;
+  const int16_t refmv_ctx =
+      (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK;
 
-  return mode_context[ref_frame_type];
+  const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN(
+      newmv_ctx, COMP_NEWMV_CTXS - 1)];
+  return comp_ctx;
 }
 
 static INLINE uint8_t av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
@@ -379,92 +221,99 @@ static INLINE uint8_t av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
 
   if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
       ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
-    return 2;
+    return 1;
 
   if (ref_mv_stack[ref_idx].weight < REF_CAT_LEVEL &&
       ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
-    return 3;
+    return 2;
 
   return 0;
 }
 
-#if CONFIG_FRAME_MARKER
 void av1_setup_frame_buf_refs(AV1_COMMON *cm);
-#if CONFIG_FRAME_SIGN_BIAS
 void av1_setup_frame_sign_bias(AV1_COMMON *cm);
-#endif  // CONFIG_FRAME_SIGN_BIAS
-#if CONFIG_MFMV
+void av1_setup_skip_mode_allowed(AV1_COMMON *cm);
 void av1_setup_motion_field(AV1_COMMON *cm);
-#endif  // CONFIG_MFMV
-#endif  // CONFIG_FRAME_MARKER
+void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, int gld_map_idx);
+
+static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
+  av1_zero(xd->neighbors_ref_counts);
+
+  uint8_t *const ref_counts = xd->neighbors_ref_counts;
+
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Above neighbor
+  if (above_in_image && is_inter_block(above_mbmi)) {
+    ref_counts[above_mbmi->ref_frame[0]]++;
+    if (has_second_ref(above_mbmi)) {
+      ref_counts[above_mbmi->ref_frame[1]]++;
+    }
+  }
+
+  // Left neighbor
+  if (left_in_image && is_inter_block(left_mbmi)) {
+    ref_counts[left_mbmi->ref_frame[0]]++;
+    if (has_second_ref(left_mbmi)) {
+      ref_counts[left_mbmi->ref_frame[1]]++;
+    }
+  }
+}
 
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MODE_INFO *mi, int mi_row,
-                        int mi_col, int x_mis, int y_mis);
+void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi,
+                        int mi_row, int mi_col, int x_mis, int y_mis);
 
-typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
 void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                      uint8_t *ref_mv_count, CANDIDATE_MV *ref_mv_stack,
-                      int16_t *compound_mode_context, int_mv *mv_ref_list,
-                      int mi_row, int mi_col, find_mv_refs_sync sync,
-                      void *const data, int16_t *mode_context);
+                      MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+                      CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+                      int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+                      int_mv *global_mvs, int mi_row, int mi_col,
+                      int16_t *mode_context);
 
 // check a list of motion vectors by sad score using a number rows of pixels
 // above and a number cols of pixels in the left to select the one with best
 // score to use as ref motion vector
-#if CONFIG_AMVR
 void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
                            int_mv *near_mv, int is_integer);
-#else
-void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
-                           int_mv *near_mv);
-#endif
 
-void av1_append_sub8x8_mvs_for_idx(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   int block, int ref, int mi_row, int mi_col,
-                                   CANDIDATE_MV *ref_mv_stack,
-                                   uint8_t *ref_mv_count, int_mv *mv_list,
-                                   int_mv *nearest_mv, int_mv *near_mv);
-
-// This function keeps a mode count for a given MB/SB
-void av1_update_mv_context(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                           MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                           int_mv *mv_ref_list, int block, int mi_row,
-                           int mi_col, int16_t *mode_context);
-
-#if CONFIG_WARPED_MOTION
-#if WARPED_MOTION_SORT_SAMPLES
-int sortSamples(int *pts_mv, MV *mv, int *pts, int *pts_inref, int len);
-int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
-                int *pts, int *pts_inref, int *pts_mv);
-#else
+int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize);
 int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
                 int *pts, int *pts_inref);
-#endif  // WARPED_MOTION_SORT_SAMPLES
-#endif  // CONFIG_WARPED_MOTION
 
-#if CONFIG_INTRABC
-static INLINE void av1_find_ref_dv(int_mv *ref_dv, int mi_row, int mi_col) {
-  // TODO(aconverse@google.com): Handle tiles and such
+#define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
+#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
+#define USE_WAVE_FRONT 1  // Use only top left area of frame for reference.
+
+static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
+                                   int mib_size, int mi_row, int mi_col) {
   (void)mi_col;
-  if (mi_row < MAX_MIB_SIZE) {
+  if (mi_row - mib_size < tile->mi_row_start) {
     ref_dv->as_mv.row = 0;
-    ref_dv->as_mv.col = -MI_SIZE * MAX_MIB_SIZE;
+    ref_dv->as_mv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
   } else {
-    ref_dv->as_mv.row = -MI_SIZE * MAX_MIB_SIZE;
+    ref_dv->as_mv.row = -MI_SIZE * mib_size;
     ref_dv->as_mv.col = 0;
   }
+  ref_dv->as_mv.row *= 8;
+  ref_dv->as_mv.col *= 8;
 }
 
-static INLINE int is_dv_valid(const MV dv, const TileInfo *const tile,
-                              int mi_row, int mi_col, BLOCK_SIZE bsize) {
+static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
+                                  const MACROBLOCKD *xd, int mi_row, int mi_col,
+                                  BLOCK_SIZE bsize, int mib_size_log2) {
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const int SCALE_PX_TO_MV = 8;
   // Disallow subpixel for now
   // SUBPEL_MASK is not the correct scale
-  if ((dv.row & (SCALE_PX_TO_MV - 1) || dv.col & (SCALE_PX_TO_MV - 1)))
+  if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1))))
     return 0;
+
+  const TileInfo *const tile = &xd->tile;
   // Is the source top-left inside the current tile?
   const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row;
   const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV;
@@ -479,20 +328,44 @@ static INLINE int is_dv_valid(const MV dv, const TileInfo *const tile,
   const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col;
   const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV;
   if (src_right_edge > tile_right_edge) return 0;
-  // Is the bottom right within an already coded SB?
-  const int active_sb_top_edge =
-      (mi_row & ~MAX_MIB_MASK) * MI_SIZE * SCALE_PX_TO_MV;
-  const int active_sb_bottom_edge =
-      ((mi_row & ~MAX_MIB_MASK) + MAX_MIB_SIZE) * MI_SIZE * SCALE_PX_TO_MV;
-  const int active_sb_left_edge =
-      (mi_col & ~MAX_MIB_MASK) * MI_SIZE * SCALE_PX_TO_MV;
-  if (src_bottom_edge > active_sb_bottom_edge) return 0;
-  if (src_bottom_edge > active_sb_top_edge &&
-      src_right_edge > active_sb_left_edge)
+
+  // Special case for sub 8x8 chroma cases, to prevent referring to chroma
+  // pixels outside current tile.
+  for (int plane = 1; plane < av1_num_planes(cm); ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                            pd->subsampling_y)) {
+      if (bw < 8 && pd->subsampling_x)
+        if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0;
+      if (bh < 8 && pd->subsampling_y)
+        if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0;
+    }
+  }
+
+  // Is the bottom right within an already coded SB? Also consider additional
+  // constraints to facilitate HW decoder.
+  const int max_mib_size = 1 << mib_size_log2;
+  const int active_sb_row = mi_row >> mib_size_log2;
+  const int active_sb64_col = (mi_col * MI_SIZE) >> 6;
+  const int sb_size = max_mib_size * MI_SIZE;
+  const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size;
+  const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6;
+  const int total_sb64_per_row =
+      ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1;
+  const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col;
+  const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col;
+  if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0;
+
+#if USE_WAVE_FRONT
+  const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
+  const int wf_offset = gradient * (active_sb_row - src_sb_row);
+  if (src_sb_row > active_sb_row ||
+      src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
     return 0;
+#endif
+
   return 1;
 }
-#endif  // CONFIG_INTRABC
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/common/ncobmc_kernels.c b/third_party/aom/av1/common/ncobmc_kernels.c
deleted file mode 100644
index af951398b..000000000
--- a/third_party/aom/av1/common/ncobmc_kernels.c
+++ /dev/null
@@ -1,1181 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "av1/common/ncobmc_kernels.h"
-
-// The kernels are only used in the experiment "ncobmc-adapt-weight", which
-// blends four predictions to form a final prediction for an inter-block
-// The indices of the default kernels correspond to
-// 1. the index of the size of the kernels (ADAPT_OVERLAP_BLOCKS )
-// 2. the interpolation modes (NCOBMC_MODE)
-// 3. the prediction the kernels applies to
-
-static int16_t default_ncobmc_krnl_0_0_0[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 5684, 3601, 1367, 364, 1509, 2313, 4007, 5080 },
-  { 3728, 2486, 827, 196, 1434, 2034, 2868, 3000 },
-  { 1643, 1465, 726, 208, 890, 1386, 1242, 1293 },
-  { 794, 723, 277, -237, 206, 487, 749, 896 },
-  { 1176, 730, 286, 136, 281, 262, 724, 953 },
-  { 2086, 1958, 783, 539, 751, 984, 1143, 1491 },
-  { 2665, 2520, 1402, 1037, 939, 1223, 1593, 1937 },
-  { 3451, 3172, 2350, 1291, 1069, 1916, 2672, 3223 }
-};
-static int16_t default_ncobmc_krnl_0_0_1[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 5541, 8123, 10470, 11908, 11291, 10382, 8800, 6446 },
-  { 3338, 5536, 7249, 8080, 7671, 6428, 5280, 3900 },
-  { 1732, 3087, 3842, 4325, 4034, 2929, 2318, 1800 },
-  { 744, 1217, 1559, 2215, 1957, 1352, 707, 322 },
-  { 685, 1082, 1792, 2300, 1975, 1350, 738, 671 },
-  { 1168, 2336, 3303, 3965, 3790, 3098, 2909, 2141 },
-  { 3005, 4370, 5806, 6716, 6282, 5553, 4782, 3453 },
-  { 4748, 6650, 7779, 9010, 9208, 8184, 6987, 5197 }
-};
-static int16_t default_ncobmc_krnl_0_0_2[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 6026, 4784, 2400, 1250, 1002, 2371, 3320, 5285 },
-  { 8638, 6094, 3257, 1498, 1297, 3145, 5252, 7625 },
-  { 10859, 7249, 3868, 1871, 1813, 3569, 6577, 8858 },
-  { 11432, 8123, 4216, 1786, 2477, 4370, 6669, 9366 },
-  { 11894, 8466, 4870, 1917, 2479, 4656, 7057, 9383 },
-  { 11109, 7432, 3924, 1288, 2018, 3946, 6660, 9877 },
-  { 10138, 6548, 2830, 461, 2087, 3810, 6170, 9255 },
-  { 8613, 5163, 1658, 279, 1694, 3082, 4807, 7897 }
-};
-static int16_t default_ncobmc_krnl_0_0_3[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { -833, -80, 2193, 2907, 2623, 1359, 298, -383 },
-  { 705, 2300, 5090, 6649, 6024, 4820, 3020, 1892 },
-  { 2189, 4625, 7990, 10015, 9679, 8539, 6284, 4464 },
-  { 3445, 6356, 10371, 12660, 11773, 10205, 8287, 5828 },
-  { 2664, 6149, 9483, 12064, 11681, 10156, 7908, 5409 },
-  { 2040, 4690, 8405, 10631, 9862, 8396, 5711, 2909 },
-  { 626, 2993, 6387, 8212, 7123, 5840, 3877, 1788 },
-  { -402, 1431, 4636, 5850, 4461, 3246, 1964, 122 }
-};
-static int16_t default_ncobmc_krnl_0_1_0[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 1465, 553, -76, 10, 635, 756, 1843, 3144 },
-  { 687, 117, -404, -300, 238, 280, 696, 1415 },
-  { 49, -38, -224, -241, -135, -209, -237, 382 },
-  { 48, 37, -266, -273, -235, -137, -208, -94 },
-  { 555, -3, -132, -172, -98, 93, 347, 313 },
-  { 887, 256, -266, -307, 304, 222, -98, 82 },
-  { 1701, 816, 454, 501, 119, 230, 450, 551 },
-  { 2732, 1502, 1174, 540, 323, 709, 1002, 936 }
-};
-static int16_t default_ncobmc_krnl_0_1_1[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 7707, 10467, 11036, 10942, 10165, 9420, 8728, 5835 },
-  { 3167, 5146, 5854, 5771, 4914, 4684, 4357, 3154 },
-  { 900, 1646, 2412, 2014, 1974, 1986, 1776, 1005 },
-  { -198, -179, 488, 737, 866, 784, 828, 236 },
-  { -469, 32, 402, 574, 738, 495, 242, -187 },
-  { 186, 1078, 1378, 1480, 1226, 1506, 1656, 745 },
-  { 1531, 2525, 3139, 3367, 3535, 3519, 3095, 2171 },
-  { 3152, 5453, 6176, 7089, 7310, 6879, 6483, 4916 }
-};
-static int16_t default_ncobmc_krnl_0_1_2[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 7378, 3775, 1232, 453, 133, 936, 1688, 4950 },
-  { 10336, 5944, 2400, 1175, 168, 954, 2894, 6436 },
-  { 11176, 6145, 2051, 829, 543, 1193, 3403, 6517 },
-  { 10387, 6062, 2036, 646, 507, 1077, 2998, 6029 },
-  { 10768, 6277, 2226, 677, 321, 982, 2845, 6378 },
-  { 10072, 5808, 1937, 873, 372, 1396, 3498, 7298 },
-  { 8951, 4724, 1216, 104, 656, 1830, 3722, 7558 },
-  { 7447, 3372, 468, -135, 99, 1177, 2980, 7260 }
-};
-static int16_t default_ncobmc_krnl_0_1_3[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { -229, 1545, 4135, 4921, 5405, 5226, 4081, 2418 },
-  { 2120, 5121, 8485, 9692, 11018, 10406, 8380, 5338 },
-  { 4205, 8593, 12099, 13717, 13936, 13366, 11402, 8436 },
-  { 6068, 10382, 14047, 15190, 15155, 14577, 12684, 10145 },
-  { 5458, 10012, 13836, 15243, 15361, 14752, 12876, 9818 },
-  { 5153, 9162, 13256, 14256, 14385, 13170, 11245, 8186 },
-  { 4140, 8257, 11521, 12362, 12028, 10762, 9062, 6053 },
-  { 2966, 5975, 8490, 8807, 8561, 7529, 5836, 3204 }
-};
-static int16_t default_ncobmc_krnl_1_0_0[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 4414, 2642, 2518, 1763, 1089, 644, 355, 254, -234, 454, 399, 228, 525, 785,
-    558, 919 },
-  { 2989, 3035, 2685, 1476, 1872, 768, 820, -309, -107, 273, 87, 286, 499, 638,
-    929, 1025 },
-  { 1779, 1672, 1713, 645, 953, 151, 617, 79, -91, 185, 18, 307, 794, 681, 484,
-    521 },
-  { 1429, 1571, 1893, 1493, 949, 288, -232, -248, -152, 179, -50, 74, 107, 329,
-    539, 822 },
-  { 1444, 852, 1022, 688, 850, 205, 135, -629, 334, 96, 106, 337, 259, 300, 150,
-    680 },
-  { 962, 367, 329, 921, 591, -79, 146, 201, 296, 179, -190, 143, 46, -107, 215,
-    853 },
-  { 915, 865, 463, 169, 498, -390, 12, 202, 225, 490, 410, 483, 52, 99, 293,
-    569 },
-  { 728, -135, 241, 383, 288, -69, 33, 421, 523, 506, 376, 58, 143, -4, 151,
-    218 },
-  { 337, 65, 255, 282, 173, 267, 237, 15, 38, 114, 253, 110, 32, 227, 92, -48 },
-  { 317, 115, 295, 231, 380, 435, 331, -97, 392, 393, 51, 59, 276, 41, -33,
-    46 },
-  { 31, -14, 86, 250, -36, -214, 210, -79, -117, 401, 193, 440, 171, 200, 8,
-    112 },
-  { 46, 19, 165, -6, 75, 180, 266, -98, 76, 276, 6, 29, 230, -49, 177, 168 },
-  { 104, -243, -121, 295, -8, 180, 16, -44, 232, 315, 176, 10, 0, -95, -154,
-    141 },
-  { 248, 201, 361, 430, -20, -45, 209, -44, 222, 540, 527, 297, 240, 625, 531,
-    409 },
-  { 91, 37, 193, 177, 233, 210, -299, 120, 327, 214, 293, 189, 86, 123, 206,
-    356 },
-  { 501, 779, 295, 199, 148, 81, -31, 70, 211, 309, 300, 110, 227, 30, 242,
-    261 }
-};
-static int16_t default_ncobmc_krnl_1_0_1[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 6603, 7905, 7762, 8159, 8426, 10334, 10792, 10984, 12097, 10534, 11216,
-    10624, 9514, 8715, 8672, 8846 },
-  { 5897, 6238, 6272, 7323, 7162, 8091, 9465, 9845, 9929, 9747, 10562, 10737,
-    9059, 7651, 7330, 7314 },
-  { 5572, 6017, 5568, 7112, 6617, 6501, 7334, 8494, 8900, 8826, 9852, 8034,
-    6956, 7691, 7513, 6106 },
-  { 4564, 3877, 4682, 4586, 5135, 5795, 7968, 7859, 7720, 6548, 6306, 5639,
-    6357, 6514, 6493, 5609 },
-  { 4142, 4154, 3332, 4193, 3873, 4977, 4685, 5787, 5707, 5300, 5854, 4720,
-    5452, 5642, 4810, 4250 },
-  { 2993, 3176, 3012, 2637, 2664, 4336, 4207, 3687, 4627, 4487, 4847, 4120,
-    4079, 3931, 3730, 3205 },
-  { 2479, 2268, 1858, 1737, 2266, 2806, 2919, 3017, 3231, 2964, 3181, 3423,
-    3096, 3025, 2684, 2353 },
-  { 1969, 2001, 1997, 1959, 1323, 1565, 1963, 1351, 1957, 1711, 2093, 2057,
-    1762, 1926, 1118, 1367 },
-  { 1473, 816, 655, 1628, 1252, 1764, 1723, 1675, 2559, 3029, 1951, 2160, 2305,
-    2299, 1688, 1048 },
-  { 3073, 1667, 1324, 1360, 1562, 1774, 2154, 2740, 3281, 3434, 3258, 4095,
-    2823, 2443, 2894, 2449 },
-  { 3813, 2830, 3352, 2125, 2627, 2974, 3847, 3720, 4592, 4846, 4787, 5066,
-    4598, 4229, 4032, 3478 },
-  { 3415, 2733, 3827, 3637, 3381, 3743, 3768, 4732, 5055, 5445, 5870, 5937,
-    5734, 5980, 5010, 4954 },
-  { 4878, 3604, 5532, 4558, 4210, 4880, 4847, 5771, 5136, 6486, 7096, 6426,
-    5765, 6824, 6094, 5753 },
-  { 6076, 5817, 5318, 5268, 5784, 5482, 6453, 6582, 6803, 7077, 8113, 8173,
-    8329, 7653, 6448, 6476 },
-  { 7549, 5758, 5554, 6383, 7113, 7664, 7123, 6712, 8539, 8997, 9047, 8794,
-    8700, 8760, 7600, 7995 },
-  { 7698, 7133, 7048, 7498, 7821, 8401, 9152, 8647, 8934, 8874, 8595, 8789,
-    8828, 8766, 9019, 8783 }
-};
-static int16_t default_ncobmc_krnl_1_0_2[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 5573, 5972, 5705, 5033, 5738, 3189, 2009, 1476, 2057, 2178, 1869, 2927,
-    3305, 4036, 4017, 5328 },
-  { 7539, 7568, 7302, 5564, 4410, 3954, 2153, 2693, 622, 1831, 1753, 1636, 3552,
-    4374, 4319, 6015 },
-  { 8753, 7544, 6620, 5710, 6142, 5819, 2731, 2898, 1702, 1487, 2249, 1688,
-    4110, 4483, 5108, 5621 },
-  { 9273, 7922, 6245, 6310, 6442, 5272, 3068, 2649, 1599, 2693, 3219, 4501,
-    4588, 4310, 5647, 6894 },
-  { 9697, 8245, 7267, 6551, 5199, 4626, 3466, 3256, 2099, 3125, 3608, 4297,
-    3944, 5468, 6056, 7545 },
-  { 8831, 8583, 7466, 6937, 6056, 5482, 3407, 3324, 1802, 3128, 3078, 4560,
-    4560, 5901, 6131, 7612 },
-  { 9556, 7457, 6602, 7342, 5370, 4431, 3573, 3339, 1668, 3172, 3779, 4564,
-    5744, 7244, 8522, 8407 },
-  { 10238, 8809, 7064, 6643, 4885, 4246, 2737, 2684, 2501, 3443, 3761, 6174,
-    5891, 6882, 7585, 8821 },
-  { 10151, 10001, 8289, 6859, 6054, 4903, 3809, 3540, 2644, 3424, 3542, 4649,
-    5389, 5384, 6733, 8360 },
-  { 9635, 9516, 7609, 7438, 6181, 4529, 4140, 3439, 2568, 3338, 3789, 5195,
-    5510, 6181, 7566, 8512 },
-  { 9988, 8848, 6807, 6731, 6139, 5355, 3797, 4097, 3364, 3319, 4230, 5136,
-    5581, 6125, 7748, 8229 },
-  { 10252, 9244, 7204, 7260, 6478, 6040, 4659, 3920, 2869, 3263, 4068, 5475,
-    5714, 7183, 7153, 8318 },
-  { 9682, 9366, 7096, 6059, 6036, 4463, 3898, 3477, 2065, 2704, 4434, 5167,
-    5502, 6743, 8002, 7443 },
-  { 9252, 8211, 6399, 6747, 6498, 5626, 4016, 3880, 2172, 2557, 3576, 4270,
-    4968, 5115, 6840, 7550 },
-  { 8753, 8157, 7097, 6500, 5779, 5174, 4190, 2645, 2380, 3239, 4155, 5263,
-    5437, 5337, 5663, 6667 },
-  { 9680, 7710, 6921, 5657, 4863, 3990, 3485, 2172, 2620, 3003, 3328, 4112,
-    4806, 6020, 6833, 7212 }
-};
-static int16_t default_ncobmc_krnl_1_0_3[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { -219, -121, 416, 1445, 1150, 2238, 3251, 3695, 2496, 3247, 2925, 2639, 3064,
-    2869, 3167, 1320 },
-  { -68, -450, 130, 2039, 2952, 3584, 3966, 4171, 5961, 4555, 3995, 3732, 3281,
-    3731, 3827, 2052 },
-  { 262, 1161, 2497, 2934, 2690, 3939, 5735, 4938, 5906, 5924, 4296, 6388, 4553,
-    3551, 3297, 4164 },
-  { 1091, 3025, 3566, 4005, 3874, 5040, 5600, 6151, 7241, 6990, 6924, 6186,
-    5356, 5256, 3726, 3083 },
-  { 1079, 3140, 4769, 4958, 6480, 6589, 8111, 7988, 8255, 7879, 6838, 7052,
-    6751, 5005, 5393, 3931 },
-  { 3566, 4255, 5572, 5909, 7098, 6653, 8641, 9199, 9689, 8617, 8673, 7591,
-    7733, 6676, 6324, 4737 },
-  { 3411, 5802, 7481, 7149, 8259, 9553, 9900, 9854, 11285, 9779, 9040, 7939,
-    7515, 6037, 4902, 5075 },
-  { 3417, 5718, 7095, 7425, 9913, 10666, 11679, 11951, 11429, 10749, 10173,
-    8116, 8610, 7605, 7548, 5992 },
-  { 4408, 5515, 7201, 7627, 8922, 9470, 10636, 11166, 11159, 9844, 10673, 9502,
-    8693, 8503, 7905, 7046 },
-  { 3340, 5097, 7171, 7366, 8273, 9660, 9784, 10332, 10155, 9232, 9301, 7056,
-    7798, 7746, 5981, 5402 },
-  { 2531, 4732, 6148, 7284, 7672, 8287, 8551, 8672, 8567, 7846, 7199, 5757,
-    6057, 5863, 4613, 4578 },
-  { 2646, 4394, 5195, 5511, 6471, 6443, 7713, 7854, 8408, 7427, 6461, 4968,
-    4731, 3294, 4066, 2960 },
-  { 1692, 3664, 3881, 5480, 6162, 6871, 7635, 7198, 8963, 6891, 4694, 4801,
-    5141, 2932, 2459, 3060 },
-  { 769, 2144, 4310, 3945, 4125, 5329, 5712, 5975, 7200, 6220, 4179, 3662, 2868,
-    3007, 2579, 1958 },
-  { -45, 2434, 3549, 3335, 3273, 3357, 5394, 6931, 5159, 3956, 2912, 2164, 2187,
-    2187, 2935, 1388 },
-  { -1514, 786, 2135, 3045, 3561, 3922, 3800, 5515, 4650, 4225, 4169, 3387,
-    2539, 1590, 317, 161 }
-};
-static int16_t default_ncobmc_krnl_1_1_0[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 2375, 1912, 1469, 213, 933, -507, -173, -369, -333, 187, -128, 427, 999,
-    1166, 1515, 2728 },
-  { 1857, 1085, 817, 454, 598, 479, 53, -218, -611, 240, 76, 31, 284, 1347,
-    1738, 1317 },
-  { 1911, 531, 453, 89, 639, -361, -331, -605, -162, 63, -154, 259, 446, 390,
-    708, 1113 },
-  { 818, 1304, 871, 665, 1006, -114, -405, -407, 331, 203, 304, 506, 476, 1053,
-    1155, 879 },
-  { 1054, 874, 714, -162, 624, -144, -306, -541, 30, -281, 296, 812, 418, 858,
-    755, 252 },
-  { 967, 388, 354, 878, 31, -691, -244, -307, 425, 281, 0, -50, 110, -107, 279,
-    255 },
-  { 152, -53, 156, 266, 192, -864, -236, -110, 397, 484, -129, 14, 22, 44, -90,
-    278 },
-  { 203, -54, 103, -142, -598, -741, -546, -26, 545, 253, -43, -234, -391, -504,
-    -158, -143 },
-  { 387, 275, 136, 69, -289, -9, -210, -364, 39, 3, 4, 61, -66, -102, -94,
-    -215 },
-  { 195, 213, 433, 158, 128, -131, -203, -266, -132, -285, -301, -367, -315,
-    -249, -144, -9 },
-  { 600, 145, 418, 277, 156, -118, 85, -20, 119, 260, 41, 72, -85, 316, -97,
-    -41 },
-  { 682, 610, 356, 880, 527, 272, 90, 92, -124, 314, -204, -339, -590, -384,
-    -248, -192 },
-  { 999, 423, 208, 752, 623, 409, 91, -57, -3, -124, 148, 255, -7, 112, -128,
-    -144 },
-  { 1007, 710, 609, 766, 264, -163, 324, 291, 219, -61, 24, 507, 74, 109, 127,
-    629 },
-  { 2211, 878, 853, 462, 86, 203, -71, 122, -36, 131, 308, 267, 210, 369, 50,
-    -96 },
-  { 1810, 1630, 1123, 645, 610, 217, -93, -37, -220, -341, -250, -110, 135, 0,
-    112, 93 }
-};
-static int16_t default_ncobmc_krnl_1_1_1[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 5824, 7106, 8063, 8929, 8632, 9731, 10164, 11047, 11088, 10239, 10606, 8987,
-    8411, 7117, 6115, 5322 },
-  { 4980, 6239, 7135, 7245, 7418, 8526, 9440, 9438, 8119, 8336, 7993, 8057,
-    6686, 5210, 4193, 4841 },
-  { 2436, 4500, 5019, 5908, 5578, 7270, 7391, 7974, 7281, 6871, 6705, 6327,
-    4867, 4521, 4286, 3605 },
-  { 2298, 3501, 4714, 4692, 4835, 5278, 5830, 4968, 4435, 4824, 4373, 4085,
-    3825, 2657, 2539, 2557 },
-  { 1643, 2741, 2604, 2664, 1877, 3334, 2995, 3162, 3367, 3104, 3356, 2827,
-    3577, 2359, 1755, 2140 },
-  { 742, 1397, 1315, 1332, 1864, 3032, 2472, 2253, 1692, 2071, 2260, 2426, 1951,
-    1610, 1189, 1275 },
-  { 482, 869, 598, 288, 769, 1490, 1284, 1692, 883, 1061, 1259, 1239, 1118, 585,
-    219, 571 },
-  { 178, 278, 580, 915, 717, 873, 1012, 721, 52, 348, 624, 540, 691, 102, -108,
-    383 },
-  { -718, -648, -223, -520, -1000, -754, -688, -639, -528, -414, -440, -365,
-    -268, -546, -672, -332 },
-  { -256, -226, -395, -158, -311, -325, -66, 87, 533, 705, 261, 344, 484, 692,
-    155, 507 },
-  { 204, 448, 131, -571, 889, 712, 626, 349, 261, 578, 240, 1012, 849, 900, 889,
-    977 },
-  { 132, 1395, 1847, 1181, 845, 1203, 1920, 2068, 2141, 2071, 1834, 2191, 2130,
-    2522, 1537, 1326 },
-  { 140, 1278, 2440, 2063, 1581, 2204, 2781, 2532, 1677, 2426, 2538, 2210, 1568,
-    2564, 2394, 1945 },
-  { 2943, 3776, 3833, 3310, 3900, 4118, 4161, 3571, 4059, 4143, 4145, 4273,
-    4034, 3940, 3720, 2418 },
-  { 3437, 3906, 4106, 4294, 5303, 5257, 4956, 4027, 5935, 5373, 4102, 4853,
-    5331, 5251, 3964, 4748 },
-  { 5493, 5799, 5966, 6535, 7015, 7397, 8011, 6526, 5832, 6257, 6247, 7097,
-    6499, 6272, 5963, 5593 }
-};
-static int16_t default_ncobmc_krnl_1_1_2[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 6049, 4906, 3617, 2960, 2187, 1950, 556, 497, 688, 355, 503, 1054, 1170,
-    1641, 2343, 4226 },
-  { 7459, 6408, 4326, 3635, 2042, 1565, 492, 572, 746, 338, 719, 797, 2540,
-    2283, 2301, 4089 },
-  { 8025, 6914, 5072, 4249, 2793, 1910, 430, 1137, -150, 451, 1061, 872, 1515,
-    2805, 3823, 4550 },
-  { 9615, 6936, 5226, 3388, 2611, 2061, 801, 1003, -537, 72, 736, 1347, 2215,
-    3509, 4262, 5097 },
-  { 9677, 6521, 5633, 5223, 2996, 2449, 1300, 1136, 160, 918, 488, 801, 2306,
-    3781, 4818, 6441 },
-  { 9988, 7509, 6019, 4950, 3376, 2777, 1427, 1395, -118, 310, 393, 1626, 3387,
-    3649, 4737, 7431 },
-  { 10542, 7745, 5192, 4494, 1637, 1960, 1212, 1056, -309, 383, 1166, 2107,
-    4048, 4030, 7206, 7851 },
-  { 9350, 7480, 4343, 3589, 1748, 1687, 1057, 898, 592, 776, 680, 1960, 3804,
-    4598, 5688, 7834 },
-  { 8769, 7236, 5518, 4182, 2776, 2412, 915, 1370, 789, 561, 520, 1146, 3139,
-    4730, 5542, 7514 },
-  { 9580, 7116, 5910, 4623, 3085, 2450, 1703, 745, 419, 600, 1077, 1208, 3256,
-    4261, 5611, 6709 },
-  { 9725, 7053, 5594, 4217, 2573, 1834, 562, 512, 496, 356, 883, 1360, 3323,
-    4866, 5632, 7594 },
-  { 10110, 7367, 5622, 3858, 3720, 2398, 1075, 1687, 616, 461, 1082, 1786, 2570,
-    4242, 5731, 8319 },
-  { 9416, 7582, 6054, 3915, 3283, 2035, 1335, 1138, 317, 92, 253, 483, 1715,
-    3597, 5613, 8103 },
-  { 8693, 6881, 4626, 3505, 2663, 1949, 751, 792, -343, 55, 303, 460, 2027,
-    3584, 6230, 8704 },
-  { 7368, 6609, 5087, 3861, 2790, 1746, 1487, 518, 497, -165, 439, 904, 2514,
-    3735, 6082, 6914 },
-  { 7004, 5321, 3472, 2621, 1221, 999, 1172, 377, 850, 864, 866, 647, 2574,
-    3977, 6416, 7777 }
-};
-static int16_t default_ncobmc_krnl_1_1_3[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 2085, 2421, 3201, 4245, 4593, 5179, 5800, 5172, 4904, 5558, 5357, 5889,
-    5769, 6415, 6377, 4080 },
-  { 2031, 2607, 4062, 5018, 6279, 5766, 6373, 6562, 8085, 7434, 7557, 7449,
-    6834, 7509, 8119, 6106 },
-  { 3960, 4394, 5800, 6108, 7339, 7531, 8876, 7849, 9371, 8973, 8753, 8896,
-    9525, 8636, 7540, 7092 },
-  { 3599, 4610, 5527, 7597, 7898, 9121, 10115, 10783, 12123, 11248, 10928,
-    10406, 9827, 9129, 8401, 7814 },
-  { 3953, 6203, 7382, 8619, 10852, 10722, 12369, 12580, 12777, 12605, 12198,
-    11899, 10047, 9350, 9018, 7521 },
-  { 4615, 7038, 8644, 9190, 11073, 11216, 12685, 13003, 14345, 13679, 13689,
-    12344, 10902, 11188, 10148, 7399 },
-  { 5141, 7775, 10402, 11309, 13751, 13759, 14094, 13720, 15371, 14418, 14061,
-    12988, 11166, 11692, 9019, 7665 },
-  { 6591, 8644, 11320, 11985, 14476, 14526, 14816, 14745, 15159, 14966, 15071,
-    14071, 12238, 12154, 10931, 8266 },
-  { 7897, 9483, 10910, 12615, 14865, 14701, 16336, 15966, 16036, 16200, 16266,
-    15506, 13546, 12270, 11580, 9377 },
-  { 6808, 9239, 10394, 11719, 13438, 14348, 14923, 15789, 15519, 15341, 15316,
-    15166, 12927, 11656, 10736, 9138 },
-  { 5796, 8696, 10198, 12417, 12722, 13926, 15077, 15506, 15468, 15155, 15184,
-    13906, 12262, 10270, 9924, 7815 },
-  { 5386, 6960, 8500, 10429, 11262, 12474, 13263, 12505, 13713, 13502, 13632,
-    12702, 12233, 9964, 9329, 6889 },
-  { 5768, 7049, 7630, 9626, 10868, 11697, 12128, 12718, 14351, 13953, 13402,
-    13389, 13063, 10072, 8470, 6445 },
-  { 3665, 4962, 7272, 8760, 9507, 10431, 11095, 11676, 12400, 12216, 11874,
-    11099, 10214, 8725, 6279, 4598 },
-  { 3293, 4948, 6288, 7711, 8156, 9140, 9976, 11683, 9946, 11003, 11496, 10325,
-    8287, 6988, 6251, 4796 },
-  { 2010, 3599, 5789, 6548, 7490, 7725, 7264, 9488, 9893, 9573, 9487, 8725,
-    7145, 6110, 3858, 2891 }
-};
-static int16_t default_ncobmc_krnl_2_0_0[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 3437, 3490, 4578, 2066,  1672, 1354, 1502, 2345, 2273, -600, 52,
-    272,  484,  2214, -1553, -197, 165,  278,  306,  384,  73,   415,
-    -213, 357,  497,  288,   714,  6,    -82,  -118, 170,  181 },
-  { 2505,  3488, 306,   3011,  2631, 181,  636,  2608, 1663, -964, 594,
-    -1455, 1057, -1198, -1647, 187,  404,  412,  177,  -32,  269,  -24,
-    148,   233,  -290,  -359,  -178, -164, -362, -19,  -408, 106 },
-  { 2588, 3528, 3391, 3134, 1812, 2387, -34, -298, -13,  -955, 40,
-    -475, 1243, 283,  -247, -484, 200,  -46, 36,   -642, -386, -438,
-    34,   295,  93,   -528, -13,  412,  -8,  41,   -457, 28 },
-  { 796, 3353, 435,  3473,  458,  1851, 519,  1061, 259,  942,  416,
-    195, 390,  -151, -1141, -710, 716,  -401, 33,   -771, -759, -336,
-    88,  -124, -139, -372,  -223, -505, -164, -100, -512, -465 },
-  { 3233,  3990, 2698, -107,  -448, 297, 331, -13, -530, -383, -464,
-    -1530, 715,  -899, -1978, -879, 43,  93,  -77, -138, -425, -97,
-    -167,  -348, -460, -95,   280,  -45, 235, 172, -357, -200 },
-  { 868,   4162,  1417,  487,  -1446, -355, 392, -159, 202,  704,  -814,
-    -3095, -1052, -1482, -745, -1403, -199, -27, -38,  -387, -208, 20,
-    -64,   -130,  -265,  81,   -20,   238,  49,  121,  -137, 495 },
-  { 2774, 3478, 2072, 1229, 819,  1359, 106,  -222, -1445, -1559, 924,
-    -98,  44,   -347, 455,  -862, -318, -288, -31,  281,   -144,  -107,
-    148,  103,  -171, -239, -134, 25,   125,  108,  -142,  -129 },
-  { 610,  990,  -703,  1003,  437,  -275, -179, -233, -2041, -445, -1145,
-    -488, 335,  -2684, -1339, -294, -176, -195, -36,  -65,   -276, 10,
-    -111, -277, -134,  -222,  -51,  31,   -369, -279, -105,  69 },
-  { 420,  2773, 375,   -372, 489,  989,  -900, 1075, 182,  119,  -529,
-    -470, -504, -2225, 225,  101,  -264, -417, -253, -459, -317, -205,
-    -528, -7,   -43,   -268, -116, -857, -608, -208, -216, 220 },
-  { 2969, 1927, -314,  -476, 402,   -637, -838, 835,  1229, 1200, 135,
-    -299, -324, -2136, 340,  -1563, -309, -98,  -408, -137, -154, 668,
-    101,  -90,  245,   112,  -51,   -37,  -525, -254, -244, -126 },
-  { 1404, -258, 2333,  2019,  309,   -29,  -2468, 18,   -494, 70,  -260,
-    245,  515,  -1984, -1759, -1003, -504, 104,   472,  197,  -38, 265,
-    378,  6,    50,    -183,  -204,  -17,  -383,  -318, -396, 142 },
-  { 807,  637,  712,   1237,  -971, -176, -1160, -210, -2072, -782, -959,
-    -372, -590, -1159, -1017, -889, -750, -399,  -98,  -15,   2,    -172,
-    -48,  -488, -628,  -12,   -25,  136,  229,   -200, -212,  -472 },
-  { -1464, 333,  -1978, -1394, -281, -1820, -124, 385,  97,   -297, -1497,
-    -3,    -916, -660,  -949,  -504, 117,   11,   86,   88,   2,    219,
-    333,   -120, -224,  71,    237,  -507,  13,   -381, -207, -113 },
-  { 1100, -717,  -1827, -1908, -1030, -1562, 404,  794,  4,    -682, -1306,
-    -612, -1197, 8,     -131,  525,   159,   -345, -91,  9,    -222, -482,
-    -69,  482,   593,   -32,   -239,  -408,  -522, -692, -126, 712 },
-  { -798, -735, -174, -1695, 819,   -737, -15, -426, -750, 876, 34,
-    -622, 448,  -71,  -950,  -2094, 74,   170, 18,   57,   156, 443,
-    -85,  -374, -416, -537,  -348,  -126, 62,  -381, 399,  -53 },
-  { -552, -1352, 536,  -1,    -322, -1094, -428, 309,  -142, -752, 354,
-    900,  473,   -137, -1263, -370, -731,  -864, -30,  -101, 354,  -321,
-    -523, 377,   9,    -415,  -87,  -145,  -154, -286, 100,  23 },
-  { 44,  607,  316,  -268, -246, -497, 267, 154, 160, 717,  324,
-    240, -130, -218, -107, -252, -64,  4,   113, -57, -162, 123,
-    -5,  143,  -312, -66,  -230, -33,  -57, 60,  153, 85 },
-  { 158,  14,  -307, -240, -85, -416, 304, -402, -461, -221, 193,
-    -123, 384, -142, 48,   -77, -378, 36,  -56,  20,   2,    -240,
-    -88,  -1,  -185, 87,   6,   94,   -22, 82,   191,  194 },
-  { 417,  259,  -85,  -170, -45,  -151, -402, 136, 28,   -40, 101,
-    224,  -337, 97,   98,   51,   -401, 95,   -77, -153, 357, -99,
-    -473, -142, -289, -80,  -349, -76,  -87,  97,  40,   198 },
-  { -236, 62,  -104, -8,  98,  68,  128, 116, 47,  54,  -121,
-    -150, -20, -120, 196, -80, 37,  290, 231, 247, 131, -113,
-    -126, -87, 65,   250, 260, 102, -68, 234, 76,  -87 },
-  { 245, 486, 38,   -10,  -135, 106, 217,  -187, -200, 96,   20,
-    117, -40, -97,  68,   -139, 276, 8,    -55,  -53,  -187, -20,
-    -41, 1,   -145, -246, -106, -45, -145, -353, 185,  -35 },
-  { 448,  -172, -496, -63, -84, -106, 151,  9,   -143, -180, -38,
-    -276, -223, 269,  100, 38,  -236, -66,  124, -59,  475,  -78,
-    -407, -20,  -119, -19, 162, -4,   -226, 101, 247,  78 },
-  { -348, -156, -324, -260, -173, 0,   -41,  63,  235,  -114, 109,
-    -362, -96,  279,  -277, 36,   394, 394,  240, 30,   -88,  209,
-    29,   176,  59,   -20,  -244, 50,  -104, 192, -157, 48 },
-  { -376, -176, 269, -426, -159, -108, -18,  -163, 93,  130, -222,
-    -40,  539,  176, 164,  -62,  -709, -354, 502,  664, 243, -414,
-    -51,  192,  33,  54,   -10,  -57,  -141, -3,   144, 71 },
-  { -137, -636, 627,  6,    -129, -159, -45, -150, -15,  402, 207,
-    20,   202,  1,    -203, 88,   183,  62,  -76,  120,  418, -196,
-    -104, -154, -433, -338, -73,  1,    -79, -14,  -200, 84 },
-  { 184, -334, 175,  114,  -274, -60, -429, 176,  36,   373, 468,
-    134, 110,  -11,  -201, -94,  352, 109,  115,  91,   187, -83,
-    21,  0,    -154, -180, 288,  0,   -61,  -197, -246, 42 },
-  { -143, 26,   190,  -110, -335, -385, -357, 27,   103,  -66, -96,
-    -189, -337, -150, 129,  -104, -176, -418, -216, -118, 28,  126,
-    -112, -130, 110,  17,   141,  111,  -82,  238,  22,   -50 },
-  { 104, -95, 48,   -239, -40, -148, -327, 244,  323,  -102, 244,
-    151, 113, -150, -74,  223, -81,  -328, -178, 140,  -233, -165,
-    182, 514, 216,  -129, -8,  141,  -81,  451,  -110, -71 },
-  { -116, 84,   -228, 177, 318, 62,   134, -3,   239,  14,  338,
-    278,  -439, -254, 3,   -82, -210, -62, -236, -124, 5,   -60,
-    112,  -18,  -115, -31, 5,   -65,  278, 4,    -19,  -130 },
-  { 236, -64,  -147, -519, 147,  -27, 71,  -567, -133, 24, -199,
-    229, -107, 126,  -141, -148, -35, -34, 68,   230,  8,  72,
-    40,  -148, 203,  97,   84,   107, 32,  17,   -58,  -18 },
-  { -43,  -408, -101, 120, 118, 168,  -170, -233, -323, -120, -339,
-    80,   -294, -151, 85,  52,  -420, 79,   -162, -233, -237, -47,
-    -131, -53,  -199, 14,  85,  -80,  93,   -150, -15,  318 },
-  { -106, 107,  -6,   189, 53,  -109, 22,  -474, -335, -102, -279,
-    -321, -66,  186,  -65, -13, 61,   167, 43,   -159, -57,  -13,
-    37,   -125, -137, 132, 161, -156, -27, -276, -89,  15 }
-};
-static int16_t default_ncobmc_krnl_2_0_1[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 5401, 5987, 4279, 6550, 4858, 4986,  5733,  7172,  8194, 7631, 7549,
-    6971, 9288, 7485, 8583, 9244, 12058, 11530, 10461, 8453, 8304, 11724,
-    8999, 9457, 5018, 6922, 8375, 7860,  7915,  6921,  7703, 8963 },
-  { 2308, 2670,  5018,  5298, 3883, 6449,  4267,  4119, 9252, 10082, 7844,
-    7414, 9050,  9261,  8739, 7808, 10974, 10279, 8627, 8840, 9203,  9406,
-    9360, 10574, 10156, 7673, 6238, 8876,  6800,  6423, 6931, 8589 },
-  { 6608,  4325, 3372, 5227, 6182, 3670, 5595, 5758, 8575, 8025, 8251,
-    10711, 5449, 6965, 5443, 7178, 9099, 8842, 7132, 7830, 5795, 9882,
-    8939,  8323, 7507, 7248, 8750, 6786, 6940, 4942, 7125, 6399 },
-  { 3977, 3060, 4962, 7094, 7211, 6388, 6256, 3960, 7672, 7814, 7711,
-    7237, 7088, 7232, 5716, 6040, 9565, 6643, 8113, 7841, 9849, 10144,
-    8297, 7676, 6792, 8447, 7805, 5475, 5499, 4728, 5379, 7645 },
-  { 4598, 4391, 3660, 6284, 6694, 8302, 5610,  5341, 7466, 6298, 6406,
-    7734, 5743, 5155, 5257, 6958, 9035, 11566, 9636, 7825, 8147, 9427,
-    6612, 5526, 7635, 7259, 7696, 7853, 5505,  6744, 9265, 5394 },
-  { 5980, 2356, 2746, 5955, 4045, 4283, 5117, 3799, 5386, 5594, 7671,
-    6984, 6232, 6028, 3101, 3391, 5757, 9530, 7408, 6206, 5512, 7867,
-    5144, 8011, 6690, 6994, 4877, 5063, 6175, 5205, 1965, 859 },
-  { 2619, 4096, 4225, 4712, 5637, 6418, 6649, 3904, 5463, 5102, 4785,
-    4100, 5127, 3858, 3419, 5301, 6002, 7649, 8260, 6241, 4168, 4551,
-    6153, 5016, 7113, 7845, 5201, 5455, 5069, 2335, 3311, 5194 },
-  { 1278, 4942, 4441, 3456, 3791, 5620, 5275, 2243, 5080, 4619, 5834,
-    4859, 4320, 5092, 1481, 846,  4969, 4835, 3646, 5940, 5736, 5862,
-    3628, 5918, 5865, 4945, 4385, 4699, 4342, 5415, 8383, 4711 },
-  { 3855, 1678, 2560, 4631, 2765, 1444, 1449, 1895, 4494, 5706, 4813,
-    4882, 3532, 2264, 3222, 5444, 4097, 5236, 5036, 3713, 6547, 4371,
-    5311, 2363, 5113, 6290, 3743, 5343, 5369, 2813, 2486, 1647 },
-  { -651, 1098, 2116, 3495, 2289, 1836, 4507, 4057, 5225, 4553, 2631,
-    2791, 2984, 3605, 3416, 3611, 4358, 4719, 3450, 4146, 3973, 3263,
-    3826, 5881, 6402, 4584, 4396, 3689, 2020, 1960, 2100, 4304 },
-  { -622, 1848, 379,  112,  -1474, 1013, 6023, 260,  1035, 1984, 3811,
-    2362, 1394, 2546, 3347, 2472,  1865, 755,  2251, 1139, 1933, 2252,
-    1163, 3003, 4091, 4792, 3801,  3517, 4247, 3798, 5216, 4543 },
-  { 1342, 2229, 1014, 1212, 260,  432,  1975, 99,   2798, 818,  2455,
-    3858, 2231, 3773, 136,  857,  2171, 815,  1966, 1825, 1711, 964,
-    2142, 2514, 5367, 3539, 3241, 3116, 3982, 3839, 3553, 3535 },
-  { 1800, 27,   321,  111,  1003, 528,  254,  979,  2444, 2413, 3807,
-    961,  1961, 1173, 2156, 3935, 259,  263,  1815, 1979, 1218, 2393,
-    3738, 1109, 4444, 3726, 3647, 3428, 2966, 4602, 4903, 5851 },
-  { 1340, 753,  317,  1318, 738,  1880,  -500, -691, 1108, 38,   412,
-    890,  494,  291,  -131, 759,  -111,  221,  -95,  2575, 3099, 3223,
-    3140, 3156, 3952, 1942, 2615, -2313, 2991, 6367, 5744, 4528 },
-  { 752,  490,  1255, 2396, 14,   3819, 1319,  1239, 3491, 2464, 3243,
-    3083, 392,  1273, 1712, -226, -931, -2130, 710,  864,  385,  265,
-    1431, 1796, 3063, 3531, 3879, 3986, 3503,  4045, 2539, 3489 },
-  { 1943, 170,  358,  1884, 2344, 1566, 92,   1721, 1381, 1115, 723,
-    1670, 2294, 1497, 1697, 973,  1286, 2306, 381,  2582, 2551, 3852,
-    2481, 3432, 2273, 3079, 2076, 3014, 3365, 3906, 2241, 2250 },
-  { 1741, -705, 595,  956, 2038, 793,  1518, 148,   -524, -881, -487,
-    711,  720,  773,  431, 2181, -435, -841, -1106, -552, 434,  -2007,
-    -41,  -234, -960, -23, 394,  -655, 792,  934,   1495, 1947 },
-  { 2086, 1360,  97,   1352, -95,  1800, -729, -916, -152, 956,  196,
-    1746, -1973, -690, 472,  1788, -28,  385,  781,  589,  -320, 1167,
-    -484, 66,    1136, 1038, 1741, 888,  3056, 2114, 3495, 1297 },
-  { 1900, 1373, 983,  3718, 1409,  2096, 932,  -604,  -1370, 1153, 109,
-    58,   104,  2851, 602,  -2071, 252,  -888, 1428,  2724,  1344, 1567,
-    563,  1902, 1370, 519,  -294,  393,  1153, -1032, 2129,  335 },
-  { 2652, 2620,  3178,  2344,  2466, 2241, 1145, -101, -635, 306, -1036,
-    638,  -2606, -1921, -1098, -328, -324, 2598, 1092, 1832, 493, 2507,
-    1152, 1461,  -796,  2126,  -742, 1182, 2078, 1549, 2665, 2366 },
-  { 1080, 798,  1934, 568,  1218, 3206, 155, 1844, 2313, 3509, 1090,
-    650,  1166, 2515, 1846, 1025, 259,  720, 1587, 3010, 4955, 6457,
-    2952, 2764, -396, 1937, 1563, 673,  828, 4062, 2711, 1548 },
-  { 871,  657,  2761, 1756, 2349, 198,   -1003, -1105, -1181, -69,  146,
-    3201, -27,  1493, 13,   291,  -2260, -468,  1178,  928,   2665, 3887,
-    3140, 1334, 1969, 2687, 544,  3842,  2885,  733,   3419,  1963 },
-  { 1491, 1698, 302,  2127, 1256, 907,  1607, 1833, 2061, -536, 988,
-    4380, 2723, -195, 962,  1769, 2466, 1735, 2707, -369, -713, 1599,
-    3031, 2924, 2023, 2045, 5259, 1733, 3517, 4274, 440,  412 },
-  { 2163, 1,    167,  1755, 5694, 3272, 739,  4235, 6123,  3811, 4611,
-    5800, 2424, 2409, 1458, 2152, 104,  115,  466,  -998,  -806, 2824,
-    4473, 2511, 4878, 3258, 5014, 3559, 1003, 2074, -2091, 1403 },
-  { 964,  1051, -1527, 1266, 3883, 2349, 1054, 1972,  1929, -249, 3796,
-    2861, 1542, 449,   539,  1942, -16,  58,   2080,  56,   1106, 4248,
-    580,  2540, 3095,  4536, 152,  354,  4067, -2246, 1505, 1981 },
-  { 1081, 1440, 324,  736,  2839, 2597, 3712, 2282, 3717, 2483,  1247,
-    4456, 3604, 3415, 2487, 3715, 2073, 2928, 2372, 828,  -2700, 2054,
-    4315, -125, 1777, 2211, 2992, 7336, 4216, 3571, 2657, 6780 },
-  { 1997, 2104, 1255, 1942, 1335, 1450, 3567, 1447, 3812, 6083, 5233,
-    4484, 3536, 3564, 3290, 4062, 2589, 2816, 3971, 4406, 3481, 2664,
-    1245, 1759, 3353, 1036, 2054, 1299, 2263, 4010, 4171, 3972 },
-  { 1519, 4826, -750, 988,  1338, 2999, 212,  3858, 5202, 5306,  5717,
-    3066, 2629, 6461, 6043, 6637, 8388, 7252, 4890, 4161, -1056, 4615,
-    2538, 5633, 3389, 6439, 2985, 7148, 5149, 4509, 8001, 8863 },
-  { 1047, 876,  2713, 3913, 2232, 1084, 1702, 2626, 1983,  3744, 2044,
-    3690, 2087, 4497, 2656, 5592, 6247, 4584, 4218, 6097,  6884, 6277,
-    2412, 5097, 7400, 2789, 6089, 6157, 7247, 9712, 11393, 5627 },
-  { 2876, 4288, 2443, 3081, 1569, 1823, 1050, 2325,  2558, 2591, 4223,
-    6300, 4237, 4354, 4411, 7502, 4175, 3350, 4208,  1100, 6473, 6664,
-    5460, 4207, 5297, 8047, 6850, 6496, 7866, 10375, 7455, 2868 },
-  { 3282, 5838, 6486, 6479, 3474, 4665, 3790, 2882,  5116, 4457, 4649,
-    4208, 4520, 7271, 4363, 7125, 8799, 6540, 10158, 5716, 6794, 5762,
-    6462, 8561, 2742, 7002, 9454, 8451, 8560, 7973,  7759, 6679 },
-  { 5957, 7221, 5126, 7057, 5824, 4274,  5374,  6023, 7549, 6239, 7666,
-    6368, 4014, 5338, 7150, 9793, 10608, 9838,  6748, 9691, 5465, 4631,
-    7964, 7692, 8173, 9362, 8989, 11677, 10282, 9960, 6666, 9276 }
-};
-static int16_t default_ncobmc_krnl_2_0_2[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 7499, 5941, 5384,  4566, 4006, 3634, 2288, 4112, 2127,  3001, 2639,
-    1927, 467,  -1639, 1484, 1143, 66,   -316, 626,  1721,  1208, 193,
-    1591, 3903, 8472,  3945, 1882, 4378, 6453, 8972, 11867, 10110 },
-  { 7919, 6226, 8601, 3825, 4644, 4380, 3957, 2964, 1316, 3586, 2268,
-    2802, 2193, 1427, 1479, 1353, -55,  373,  271,  979,  526,  1827,
-    2463, 1938, 3963, 4851, 5040, 4192, 3731, 4522, 8903, 6733 },
-  { 6373, 4994, 6414, 4822, 4923, 4881, 4383, 6117, 3342, 5068, 2353,
-    2370, 2231, 758,  1768, 1338, 742,  1498, 454,  1453, 1466, -213,
-    177,  1223, 512,  5366, 2462, 4667, 5671, 5039, 6065, 6874 },
-  { 9299, 8698, 12939, 6170, 7063, 3147, 3256, 3492, 2696, 4498, 3705,
-    3176, 2797, 1099,  2852, 1331, 527,  1272, -388, 1619, 110,  -406,
-    390,  3801, 4468,  3193, 2944, 7284, 7144, 4560, 6320, 8073 },
-  { 5937, 4572, 5212, 6678, 5291, 2561, 2752, 4892, 2713, 5203, 4202,
-    1527, -470, 2424, 2850, 1217, 401,  587,  191,  1122, 1314, 1854,
-    3860, 4579, 2455, 5427, 1614, 5037, 5073, 5074, 3101, 7734 },
-  { 7035, 5229, 7515, 6523, 7587, 5653, 5311, 4945, 4097, 4237, 2836,
-    2667, 1959, 4095, 1669, 1484, 57,   467,  1028, 642,  2843, 2782,
-    3604, -825, 1592, 4305, 2202, 4432, 4683, 3867, 3520, 9281 },
-  { 7248, 3787, 4243, 4710, 3288, 1975, 2766, 4057, 1506, 2644, 1436,
-    818,  1150, 2159, 787,  920,  98,   137,  1065, 306,  3880, 537,
-    3871, 1060, 3821, 3395, 2484, 3532, 4072, 3339, 2638, 3982 },
-  { 8810, 5802, 5538, 4090,  3659, 3742, 3818, 6827, 6474, 4756, 4093,
-    3735, 4063, 4586, -1945, 470,  328,  -163, 958,  511,  2541, 3057,
-    2972, 4349, 4754, 5115,  5847, 6843, 7299, 6652, 5891, 5655 },
-  { 9091, 5007, 6438, 4749, 5610, 3664, 6151, 5188, 3686, 2005, 2670,
-    -245, 1788, 3724, 2626, 679,  -52,  -839, -145, 356,  3488, 1970,
-    1988, 2126, 1099, 2578, 5401, 6965, 4908, 5526, 6748, 5968 },
-  { 6412, 7516, 8029, 8748, 6742, 7509, 6552, 4078, 4300, 5066, 4786,
-    3270, 4270, 3875, 2319, 4282, 1640, -843, -439, 427,  1587, 520,
-    -28,  2251, 3358, 3049, 4407, 7286, 8994, 7802, 5924, 6824 },
-  { 8467, 6838, 3934, 2952, 7200, 5407, 4593, 5882, 3353, 3865, 1870,
-    1535, 2130, 4121, 3527, 1799, -637, -937, 513,  247,  169,  607,
-    2947, 3530, 3717, 6082, 9703, 6867, 2729, 6292, 3084, 4879 },
-  { 9934, 8638, 7508, 6894, 7343, 5306, 6208, 6136, 5240, 7136, 3958,
-    1811, 3171, 1064, 2246, 882,  1681, 727,  1694, 769,  1700, 1370,
-    1901, 5812, 3852, 6468, 5875, 5416, 6007, 3348, 3600, 6661 },
-  { 10978, 9383, 9741, 10746, 5208, 8469, 4608, 5824, 4424, 3460, 3841,
-    4037,  3687, 1582, 3784,  988,  1974, 1292, 2272, 2128, 2210, 2888,
-    -967,  5864, 5568, 4693,  3796, 6361, 4816, 2697, 4559, 6437 },
-  { 8329, 9809, 8672, 9375, 7503, 5775, 3454, 4596, 5093, 5033, 4021,
-    2860, 2833, 2782, 3056, -617, 1644, 1759, 2434, 2570, 3312, 3807,
-    3518, 3521, 1126, 2830, 3378, 4432, 3261, 5211, 4073, 10050 },
-  { 9992, 8148, 7951, 7194, 5624, 5032, 3296, 2981, 5388, 3910, 2274,
-    1436, 1425, 1053, 2111, 2806, 1606, 1446, 1681, -211, 1877, 1541,
-    1700, 2736, 2088, 2551, 1045, 2977, 2632, 1719, 4896, 5378 },
-  { 9403, 8846, 8061, 7478, 5269, 6655, 6312, 4110, 3529, 5802, 3108,
-    3246, 1943, 909,  2436, 1678, 1513, 1243, 797,  213,  3888, 4015,
-    2775, 2082, 2395, 2792, 2136, 2475, 1657, 2156, 1878, 2587 },
-  { 9499, 9075, 5426, 6962, 8206, 8057, 3968, 5184, 2759, 2277, 2744,
-    3531, 2518, 367,  1075, 2118, 900,  901,  2964, 3641, 5282, 2186,
-    2416, 2312, 2366, 2149, 1024, 1912, 1119, 220,  401,  727 },
-  { 7615, 8271, 8148, 7699, 7063, 7658, 5473, 7497, 7302, 5841, 4165,
-    3092, 734,  2215, 3316, 2226, 1197, 1236, 2996, 5007, 2872, 3460,
-    2371, 1898, 1917, 1442, 853,  1412, 700,  620,  317,  1237 },
-  { 8331, 8530, 8633, 7185, 6863, 9076, 5328,  5045, 5378, 4004, 4089,
-    1469, 1341, -333, 2689, 1982, 115,  -1158, 383,  1548, 1118, 2864,
-    3154, 1803, 2079, 1676, 1450, 1165, 967,   795,  136,  1184 },
-  { 8763, 9102, 6716, 8961, 5448, 6366, 3438, 5722, 5374, 5651, 5422,
-    1728, 1751, 2444, 1024, 1118, 424,  2288, 3655, 2719, 2254, 1313,
-    3476, 1983, 1975, 1502, 1172, 2333, 937,  594,  122,  149 },
-  { 8146, 9931, 7629, 8882, 6328, 7491, 5646, 5494, 7238, 7355, 4478,
-    2019, 2646, 3486, 4193, 1121, 562,  1823, 2787, 1720, 2228, 3627,
-    4470, 3351, 2439, 2214, 1926, 2118, 1771, 767,  353,  1062 },
-  { 10816, 9814, 10917, 7424, 8207, 9717, 8537, 8728, 7356, 7376, 7246,
-    3223,  1981, 277,   1282, 951,  515,  222,  1392, 789,  4372, 2112,
-    4083,  2706, 3234,  2414, 2655, 1407, 702,  1369, 121,  676 },
-  { 11362, 10078, 7520, 7828, 10705, 7300, 7358,  6559, 8337, 7569, 5067,
-    3465,  2417,  1956, 2165, 759,   -106, -1282, 1822, 3225, 4767, 5619,
-    4119,  3383,  3877, 2702, 2410,  2459, 1441,  1392, 945,  216 },
-  { 10112, 8115, 3762, 5107, 7443, 7676, 7498, 7380, 6235, 7523, 6246,
-    3574,  2749, 3853, 303,  1558, 1896, 1107, 462,  2172, 2388, 4222,
-    2000,  1688, 3560, 2297, 1593, 3679, 3628, 1507, 1549, -188 },
-  { 7794, 6437, 6605, 5381, 6404, 4410, 6677, 4233, 4949, 3000, 2812,
-    3756, 1805, 2877, 2098, 1737, 1809, 1427, 378,  2031, 2115, 5006,
-    3159, 3602, 6343, 3503, 3356, 5971, 3138, 3932, 1028, 699 },
-  { 6757, 7738, 6538, 8248, 6959, 6557, 5264, 3092, 3765, 1895, 1865,
-    901,  2485, 2217, 1699, 1946, 3573, 1501, 2141, 2177, 180,  1003,
-    1816, 4793, 2112, 4560, 3820, 2881, 4376, 2091, 681,  623 },
-  { 9057, 8917, 7385, 7072, 6324, 5492, 5283, 5053, 5785, 4277, 3322,
-    1267, 1946, 1894, 3701, 472,  1658, 1154, 777,  2193, 2349, 3611,
-    3129, 3719, 1781, 5389, 3418, 2463, 3734, 3644, 3365, 2247 },
-  { 9444, 9439, 8598, 9152, 6982,  8451, 8279, 6129, 5172, 3730, 2416,
-    2483, 2775, 1913, 1041, -1110, -392, 1068, 556,  598,  4171, 2377,
-    1870, 1906, 5449, 5413, 2589,  3564, 6473, 6692, 3140, 2665 },
-  { 10567, 10001, 8225, 8289, 6898, 6856, 3920, 4547, 4297, 1456, 2348,
-    1526,  2343,  2863, 1429, 312,  57,   930,  1619, 1189, 596,  1815,
-    2589,  3141,  1662, 3349, 1311, 4091, 4596, 7321, 5911, 6965 },
-  { 9593, 9214, 9132, 8273, 8030, 8135, 5179,  5564,  4052, 4155, 4052,
-    2249, 2178, 1680, 439,  822,  -378, -1210, -1149, 3709, 2830, 747,
-    2987, 5873, 795,  5124, 4233, 3887, 5573,  5312,  7258, 11014 },
-  { 8373, 8033, 8934, 7880, 7434, 6144, 7528, 5163, 2591,  4301, 2489,
-    4137, 1295, 760,  703,  805,  -308, -320, 2205, -1113, 362,  581,
-    2567, 689,  5949, 2652, 1996, 2138, 7469, 4835, 8058,  11132 },
-  { 8586, 6026, 7656, 7201, 8141, 7249, 5995, 4896, 3152,  4255, 1711,
-    3498, 3933, 1852, 1444, 715,  -104, -695, 4021, 3937,  6478, 1755,
-    935,  384,  1002, 2595, 3359, 4532, 7103, 5192, 12241, 14373 }
-};
-static int16_t default_ncobmc_krnl_2_0_3[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { -18,  921,  2116, 3151, 5822, 6391, 6844, 2748, 3794,  6358, 6115,
-    7194, 6145, 8324, 7847, 6181, 4052, 4867, 4967, 5823,  6786, 4035,
-    5989, 2636, 2376, 5222, 5409, 4121, 2105, 626,  -3363, -2857 },
-  { 3594, 3991, 2433, 4231, 5187, 5335, 7496, 6672, 4132, 3625, 5649,
-    7621, 4052, 6868, 7772, 7010, 5041, 5311, 7273, 6593, 6376, 5150,
-    4421, 3618, 2523, 4188, 5275, 3469, 6209, 5459, 953,  947 },
-  { 786,  3510, 3161, 3162, 3435, 5439, 6415, 4784, 4467, 4232, 5708,
-    3775, 7437, 8362, 9398, 8331, 6300, 6049, 8740, 7748, 9508, 7139,
-    7232, 6528, 8257, 4296, 5180, 4497, 3755, 6329, 3620, 3050 },
-  { 2273, 1239, -1997, -385, 1641, 4987, 6332, 7869, 5742, 3115, 4523,
-    5739, 6076, 8184,  8936, 9733, 5577, 8872, 8635, 7679, 7192, 6961,
-    7586, 5022, 5256,  5107, 5842, 4127, 3898, 7191, 5184, 1097 },
-  { 2576, 3444,  4787, 3494,  4843, 5213, 7669, 6154, 6713, 5224, 6221,
-    8653, 10387, 9676, 10219, 9062, 6899, 4115, 6617, 7548, 7319, 5169,
-    6051, 6609,  6735, 3759,  6779, 3520, 5518, 4355, 4386, 3459 },
-  { 2457, 4623, 4686, 3390,  6167,  6776,  5546, 7755, 6678,  5831, 6667,
-    9797, 9222, 7728, 12319, 12899, 10764, 6383, 7947, 9907,  8225, 5677,
-    7690, 9312, 8324, 4971,  9288,  6616,  5448, 7180, 11014, 5709 },
-  { 3687,  5015,  5834,  5702,  6619,  6602,  6844, 8607,  10828, 10170, 9206,
-    11527, 10057, 10677, 11683, 11009, 10585, 8869, 7057,  9542,  8465,  11391,
-    6180,  10182, 5594,  5353,  8810,  7358,  7118, 10591, 10569, 7318 },
-  { 5659, 4619, 7090, 7819,  8483,  7258,  7446,  7530,  6847, 7424, 7586,
-    8261, 7644, 9373, 18173, 15351, 11259, 11899, 11787, 9977, 8370, 7422,
-    9853, 6375, 5873, 6503,  6194,  4792,  5082,  4563,  2192, 5942 },
-  { 3004,  6927,  6994,  7359,  7505,  10247, 9661,  8199,  7979,  8529, 9388,
-    12192, 11555, 12591, 10308, 10143, 12579, 12379, 11700, 12735, 6629, 10209,
-    9592,  11878, 10187, 7755,  7344,  4922,  6699,  8240,  7341,  8532 },
-  { 7590,  5795, 6512,  4587,  6933,  7660,  6141,  7410,  5605,  5542,  8790,
-    10597, 9438, 10999, 10270, 10028, 10678, 12591, 13767, 11933, 10966, 11898,
-    12452, 8305, 6352,  8621,  7598,  5409,  5869,  6860,  8606,  5371 },
-  { 7095,  7927,  9729,  11290, 10321, 9966,  8226,  10211, 12468, 10459, 10959,
-    12232, 12326, 11686, 11247, 13106, 15660, 16448, 13119, 14772, 14295, 13233,
-    11880, 9805,  8498,  5650,  3043,  5995,  9756,  6592,  8450,  6801 },
-  { 4251,  4844,  7130,  7033,  9742,  10794, 9341,  10350, 10410, 9188,  10907,
-    11059, 11547, 12685, 14995, 15511, 13256, 15229, 12788, 13792, 12937, 14179,
-    12355, 8519,  7767,  6376,  7293,  7706,  6134,  9392,  9423,  6656 },
-  { 5032,  6597,  8267,  6875,  10431, 9182,  11606, 9174,  9394,  10754, 10214,
-    11384, 11633, 14256, 11377, 11933, 13999, 14801, 12182, 12170, 12927, 10856,
-    13248, 9493,  6586,  7871,  8697,  7094,  8561,  9451,  7116,  4183 },
-  { 5550,  6479,  9188,  7562,  9126,  10236, 12984, 11667, 10146, 11981, 13257,
-    13227, 14228, 13278, 13571, 15730, 14696, 14740, 14122, 11230, 10186, 9795,
-    9766,  9187,  10707, 11612, 10594, 14651, 10618, 5465,  6640,  1085 },
-  { 6402,  8472,  7318,  8449,  9884,  8237,  11776, 12579, 8248,  9119,  10813,
-    12464, 14087, 14122, 13487, 15884, 15630, 16883, 13968, 15663, 13943, 14099,
-    13309, 12222, 11647, 10827, 11813, 9543,  10171, 10991, 8523,  7564 },
-  { 5558,  8716,  7398,  7003,  9081,  9234,  10389, 10222, 11602, 10189, 12165,
-    10551, 11676, 14110, 13499, 14107, 14297, 13673, 15239, 13669, 9564,  8809,
-    11609, 10482, 11688, 10885, 12257, 11025, 11490, 10586, 12134, 11499 },
-  { 5054,  7370,  10001, 8690,  6346,  7990,  10600, 10877, 13977, 14230, 13786,
-    11880, 13256, 15455, 14951, 12311, 15970, 16289, 14385, 13318, 10806, 16058,
-    14004, 14150, 15275, 14285, 15169, 15124, 14484, 15130, 14320, 13627 },
-  { 6472,  6714,  8422,  7520,  9468,  7309,  11310, 10173, 9680,  9775,  11809,
-    11641, 17217, 14973, 12511, 12431, 15565, 14706, 12653, 10736, 13799, 11984,
-    14576, 14406, 13494, 13775, 13748, 13952, 12627, 13551, 12343, 13637 },
-  { 5691,  6196,  6840,  5618,  8130,  5337,  10502, 11764, 12309, 11243, 12058,
-    14603, 15254, 13730, 12988, 16426, 16398, 18336, 14653, 12258, 13528, 12015,
-    13122, 12816, 13238, 14265, 15564, 14875, 14346, 16501, 14057, 14664 },
-  { 5142,  4576,  6578,  5068,  8343,  7665,  11649, 10611, 11541, 10331, 12078,
-    14129, 17221, 15930, 16224, 15649, 16231, 11200, 11389, 11572, 13476, 12629,
-    11861, 13013, 15114, 12486, 15663, 12735, 13401, 13979, 13507, 13952 },
-  { 6851,  5162,  6778,  6922,  8951,  5567,  10360, 9216,  7036,  5410, 10771,
-    13577, 12588, 10477, 10248, 14359, 15261, 13795, 12048, 11716, 9361, 6278,
-    8997,  10237, 14438, 12459, 12976, 13600, 13892, 11879, 13127, 13802 },
-  { 4195,  6070,  3151,  7247,  5889,  6549,  8672,  8715,  10338, 9229, 9026,
-    10246, 14651, 14345, 15001, 15116, 18364, 16684, 13657, 14718, 8840, 10437,
-    9581,  12367, 11264, 11291, 13002, 11111, 13027, 14172, 12590, 13651 },
-  { 3818, 4756,  8879,  6693,  4570,  8158,  7459,  7913,  5727,  9446,  10204,
-    8887, 11326, 14337, 13524, 13813, 13628, 15506, 11578, 13470, 12391, 8927,
-    9166, 9882,  10411, 11665, 8963,  12141, 11521, 10521, 15132, 15679 },
-  { 4425, 8428,  12163, 9947,  3396,  5526,  8133,  4898,  3913,  4891,  5711,
-    7034, 10657, 9932,  14435, 12716, 15058, 15501, 14937, 14530, 14536, 9746,
-    9923, 11968, 7869,  10734, 9735,  9164,  11842, 12786, 16768, 15073 },
-  { 7712,  9515,  10650, 9707,  6201,  9752,  8700,  10334, 9503,  13202, 9555,
-    9748,  12814, 13027, 13920, 12593, 14370, 14808, 13965, 14154, 12735, 7319,
-    12721, 10395, 7361,  8678,  12937, 10057, 9234,  14695, 14044, 13613 },
-  { 8309,  7528,  9323,  7254,  6829,  7276,  7831,  10824, 8851,  11605, 12763,
-    10865, 10153, 10736, 12379, 10799, 10370, 11817, 11734, 13290, 18692, 13378,
-    10209, 11690, 12616, 9779,  9257,  6142,  7818,  10903, 13276, 8893 },
-  { 5420,  5315,  7529,  7453, 9027,  9825,  7865,  9813,  6673, 6090,  7914,
-    10790, 11205, 11064, 9239, 11947, 12306, 12802, 11856, 9896, 10502, 9968,
-    12099, 11011, 11103, 9920, 10747, 12477, 10458, 8485,  8805, 10199 },
-  { 5275,  2169,  8448, 6454, 8077,  5060, 8189, 6133,  5673,  7424,  7993,
-    10659, 10836, 8138, 9347, 10570, 8447, 8359, 11071, 11453, 13480, 9521,
-    11755, 8294,  7308, 4637, 10781, 5515, 4843, 4737,  5330,  4893 },
-  { 4846,  5401,  5671, 3987,  6910,  8363,  10605, 9189,  9832, 11154, 11632,
-    10874, 12377, 9266, 12273, 10543, 10287, 10912, 10745, 9206, 8851,  8327,
-    11242, 8123,  7431, 10266, 8947,  6186,  4259,  -682,  -920, 3901 },
-  { 3634, 2920,  4925,  5515,  6626, 6450,  10063, 9047,  9880,  9577, 8277,
-    7582, 10044, 10186, 11630, 8182, 12589, 14249, 13236, 11328, 7042, 8880,
-    7868, 6442,  10067, 3096,  5190, 5874,  2890,  668,   1718,  2480 },
-  { 4732, 2901,  1056, 1878,  5356, 5406, 5212,  8538, 8974,  7742, 9588,
-    7933, 10867, 8487, 11203, 8392, 8301, 10070, 4166, 11993, 9436, 10071,
-    7464, 7158,  7848, 6669,  4825, 5838, 236,   3720, 562,   -1751 },
-  { 1899, 3004, 3605, 1918, 2347, 4957, 5010, 5918, 6020,  5972, 7291,
-    6820, 8455, 8985, 7833, 5877, 5796, 7048, 5548, 2886,  4467, 10008,
-    7443, 8399, 7314, 4277, 3852, 296,  -983, 1487, -2474, -7290 }
-};
-static int16_t default_ncobmc_krnl_2_1_0[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 4140, 3361, 5678, 1978,  3443, 3453, 2905, 2131, 4836, 2612, 1530,
-    -831, -257, 584,  -1193, -391, 107,  -47,  32,   125,  282,  684,
-    161,  23,   -22,  -95,   555,  -405, 569,  -268, -92,  105 },
-  { 4680, 4183, 4456, 4730, 4264, 4681, 2310, 2034, 3081, 2493, 2012,
-    1397, 1521, -881, -976, -668, -606, -768, -273, 256,  -4,   -290,
-    64,   -55,  -444, -989, -316, -496, 206,  -169, -158, -87 },
-  { 3199, 3846,  3775, 632,  2359, 3492, 3355, 53,   -1201, 145,  263,
-    -93,  -1435, 415,  -844, 954,  -241, -483, -165, -191,  -561, -185,
-    -300, -258,  -154, -654, 308,  -64,  -36,  -150, 95,    146 },
-  { 680,  2863, 889,  1721, 3444, 2472,  -27,  2458, 816,  -186, 123,
-    3214, 2029, 2485, -631, 323,  1030,  -275, 196,  -532, -537, 153,
-    274,  61,   -453, -283, -533, -1062, -145, -388, 158,  0 },
-  { 1962, 4004, 1406, -535, 1315, 2669, 2522, 654, 3394, 4205, 2731,
-    -40,  -118, 599,  -511, 618,  162,  840,  43,  253,  -59,  222,
-    64,   -21,  -671, -179, 241,  283,  902,  226, 305,  -204 },
-  { 516,  1205, 3201, -5,   1479, 945,  2129, -628, 3181, 900, 1439,
-    1128, 799,  -158, -431, 347,  -118, 527,  389,  268,  -73, 2,
-    534,  133,  -287, -19,  561,  329,  394,  -120, 38,   -461 },
-  { 2130, 2022, 1966, 210, 447,  402,  1249, 1677, 2353, 1113, 1723,
-    1300, 2060, -144, 420, 2008, -417, -74,  -197, 135,  217,  310,
-    152,  339,  -99,  -81, 279,  44,   54,   -160, -82,  4 },
-  { 2134, -1849, -990, -93,  1932, 2119, 2954, -371, -1021, -831, 1662,
-    1330, 1634,  246,  -777, 852,  130,  -67,  191,  -316,  -429, -240,
-    -147, -198,  92,   -15,  310,  141,  -10,  146,  35,    85 },
-  { 2763, 4779, 994, 1054, 2625, 2031, 1784, -161, 1142, 1052, 2300,
-    2462, 1943, 516, 816,  27,   18,   171,  158,  -311, -636, 20,
-    -463, -235, 145, 339,  240,  -354, -110, 41,   404,  353 },
-  { 3625, 3557, 2333, 950,  2020, 2445, 2562, 1506, 2571, 1559, 4781,
-    2030, 1325, 2507, 2045, 1896, -526, -22,  -272, -143, -189, 17,
-    10,   405,  143,  414,  -95,  -229, -215, 0,    -347, 83 },
-  { 2808, 1062, 1502, 411, 1139, 998, 1577, 1233, 1637, 998,  1846,
-    2487, 3868, 2225, 533, -51,  -6,  -180, -30,  186,  -175, 247,
-    352,  57,   83,   290, 330,  160, 165,  354,  -465, 131 },
-  { 2809, 2966, 2929, 1435, 2875, 1948, 130,  1168, 252,  1276, 2838,
-    3507, 3001, 1410, 312,  1941, -336, -431, -190, -194, -130, -336,
-    238,  75,   -472, -189, 123,  61,   -583, 147,  305,  200 },
-  { -23,  2306, 2169, 33,   1848, 1832, 2721, 49,  1435, 585, 1036,
-    2116, 1658, 1011, 815,  920,  101,  108,  262, 299,  283, 357,
-    268,  141,  -71,  -285, 205,  142,  -71,  224, 252,  156 },
-  { 1447, 2625, 4643, 2096, -847, -154, 2876, 1050, 104,  -873, -327,
-    146,  -596, 622,  -337, 1317, -61,  9,    -201, 110,  90,   644,
-    337,  204,  155,  278,  320,  -306, -504, 357,  -108, 132 },
-  { -16, 2815, 1344, -2044, 2236, -549, 586,  409, 30,  152,  1588,
-    243, -115, 291,  -30,   -170, -96,  -10,  433, 205, -134, 17,
-    528, -16,  -22,  -198,  -43,  -143, -224, 270, 153, 37 },
-  { 1478, 829,  628, 1055, 1323, -406, -282, -12,  418,  40,  -795,
-    -286, -627, -41, -448, 454,  -267, -258, -129, -57,  -44, -406,
-    -260, -67,  134, -196, -236, -125, 35,   -62,  -137, -5 },
-  { 220,  26,  -380, -257, -90,  -453, -196, -56,  -193, 37,   131,
-    151,  -88, -695, 66,   -113, -200, -144, 132,  -48,  -244, -207,
-    -178, 268, -107, -1,   69,   337,  -84,  -197, 87,   119 },
-  { 7,    3,   -85,  -185, 334,  -86, -69, 152, -320, -239, 587,
-    415,  246, 290,  -146, -134, -9,  -69, -66, -148, -41,  -206,
-    -148, 283, -144, -287, -73,  93,  -23, 247, 398,  174 },
-  { 46,  -256, -114, -61,  -532, 103,  32,   -223, 24,   -20,  132,
-    339, 61,   -381, -711, -160, -200, -334, 78,   173,  -281, -139,
-    -27, 134,  -120, 96,   110,  -251, -114, -32,  -299, -183 },
-  { -193, 28,  -134, 200,  155,  -316, -363, 285,  268, 665, 233,
-    -127, 436, -20,  -536, -163, 51,   -40,  162,  78,  -27, 192,
-    -34,  -40, -17,  -205, 203,  106,  -62,  -211, -84, 60 },
-  { -440, 312, -195, 221,  251, -388, -116, -252, -101, 92,  -244,
-    -694, -27, 198,  -3,   255, -257, -17,  0,    143,  -20, 48,
-    -68,  110, -130, -340, 136, -45,  -138, 251,  -111, -2 },
-  { 325,  219,  -68,  215,  -177, -206, 14,   108,  -291, 211, 92,
-    -62,  -166, -218, -158, -220, -279, 199,  113,  -263, 271, 153,
-    -433, -16,  19,   -322, -28,  258,  -295, -300, -285, -123 },
-  { -345, 543,  356, -541, -726, -205, -332, -397, -10, -132, 232,
-    132,  308,  324, 229,  79,   -151, 161,  143,  -40, -144, -464,
-    32,   -364, -11, -99,  -285, 61,   -258, 182,  -28, 107 },
-  { -55, 70,   -78,  -269, -709, -52,  351,  94,   80,  268, 249,
-    -56, 189,  -191, -60,  -88,  15,   -205, 111,  -62, 21,  85,
-    77,  -107, -35,  -13,  -107, -472, -546, -197, 5,   115 },
-  { -363, -297, 246,  -84, -419, -230, 283,  -128, 34,   -27, 112,
-    125,  166,  163,  176, -422, 14,   -238, -80,  -153, 313, -366,
-    -208, -54,  -260, 48,  -176, 21,   -91,  -295, -270, 40 },
-  { 85,   242,  107,  -41,  -283, -390, -105, 360, 181,  -720, -582,
-    27,   -96,  -350, -217, -189, -135, -12,  280, 86,   3,    25,
-    -126, -213, -384, 41,   -15,  101,  -68,  143, -211, 86 },
-  { -183, 13,  274,  -46, -86,  -633, 181,  -232, -90, -106, -22,
-    332,  -12, -16,  -30, 87,   5,    46,   37,   -99, 27,   292,
-    -74,  -94, -237, -16, -145, 76,   -106, 227,  -52, 168 },
-  { 40,  -258, -140, -6,   203,  146,  -64, -88, -183, 221,  62,
-    67,  114,  -216, -307, -560, -197, -46, 149, -126, -120, -316,
-    -36, -227, -200, 115,  -41,  -51,  97,  123, -47,  103 },
-  { -51, 44,  -99,  -230, -156, -46, -145, -412, -56,  48, -239,
-    222, 83,  -339, -196, -64,  175, 149,  -140, -316, 6,  -62,
-    -27, -56, -21,  -269, 229,  -7,  122,  -18,  -129, 86 },
-  { -372, 106, 18,  172,  364,  19,  -245, -73,  -124, 164, -9,
-    14,   214, -67, -217, -175, -45, 119,  -194, 36,   18,  -83,
-    126,  196, 112, -297, -102, 104, -74,  -152, 19,   199 },
-  { 314,  81,  -49,  -188, 48,  -82, -4,   107, -221, -4,  207,
-    -245, 197, -37,  -185, -50, -56, -214, 100, -231, -31, -2,
-    21,   -53, -215, -77,  168, -23, 82,   5,   155,  169 },
-  { 258, 188, -27,  -27,  165,  29,  -17,  100, -27, -80, -80,
-    196, 23,  -391, -533, -171, 84,  -137, 0,   14,  251, 99,
-    35,  88,  -28,  1,    144,  -96, -235, 176, 103, -85 }
-};
-static int16_t default_ncobmc_krnl_2_1_1[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 5724, 6155, 5101, 6937, 3616, 3940, 3066, 5662, 7104, 5021, 4979,
-    5907, 4968, 7085, 6582, 7719, 9143, 4128, 6447, 4879, 7061, 11362,
-    7837, 9965, 7152, 6477, 6581, 5803, 1819, 5309, 8559, 10776 },
-  { 1775, 3231, 4026, 2629, 4438, 6309, 5114, 2895, 5657, 6541, 6734,
-    5994, 7468, 4555, 9911, 5200, 5402, 1698, 4298, 6112, 6417, 6691,
-    4816, 6195, 4139, 5856, 3358, 1993, 1542, 661,  1660, 4762 },
-  { 1953, 726,  336,  2519, 4189, -753, 2993, 4957, 5850, 4298, 3651,
-    5353, 3255, 5491, 7815, 3406, 3928, 2987, 4148, 4276, 3530, 8058,
-    5079, 5821, 4622, 3354, 3146, 2460, 489,  1550, 1587, 1399 },
-  { -801, 328,  103,  886,  1381, 2280, 4320, 2452, 1215, 6261, 2206,
-    4849, 4488, 3829, 6128, 5213, 1739, 3173, 4425, 4567, 5845, 5197,
-    5910, 6147, 4260, 3730, 4240, 5420, 307,  672,  963,  3278 },
-  { -1721, -2596, -155, 3029, 3428, 2390, 2321, 3757, 1383, -1283, -1621,
-    1418,  2475,  4188, 5570, 3575, 799,  4017, 2856, 1426, 2012,  2722,
-    3669,  4104,  3800, 4116, 3275, 3739, 326,  95,   2421, 3075 },
-  { -551, -927, -520, 2944, 2518, -722, -215, 1875, 137,  2182, 2761,
-    159,  762,  3693, 1681, 2600, 880,  3273, 4470, 5007, 4272, 3074,
-    2474, 4254, 6828, 4219, 3671, 2407, 1044, 129,  -478, 2814 },
-  { -2686, -1229, 1372, 4761, 4668, 1462, 509,  2727, 930,  2438, 3542,
-    1456,  1961,  541,  1063, 1426, 3603, 2873, 2412, 2999, 2101, 3739,
-    2385,  5494,  5444, 5655, 5034, 381,  321,  90,   2585, 4160 },
-  { -4203, 479,  1122, 2688, 2124, 942,  -2136, -1643, -491, 2581, -2155,
-    -2375, 559,  582,  2202, 2081, 3774, 3330,  1101,  894,  3410, 3691,
-    2509,  5195, 6226, 5471, 5022, 2525, 778,   1212,  2736, 3350 },
-  { -2415, -2903, 4719, 5860, 4006, 2692, 4035, 4143, 2498, 4377, 2058,
-    488,   1429,  3199, -11,  2009, 2087, 2903, 155,  522,  4521, 2221,
-    2310,  3124,  2870, 1941, 3262, 2258, 1515, 2257, 1584, 1048 },
-  { -1469, -2652, -561,  2135, 389,  -522, -589, 447,  -847, 268,  -1641,
-    -1540, -1513, -1334, -599, -581, 2848, 2828, 1416, 2157, 2198, 925,
-    2421,  1437,  1963,  369,  2195, -548, 2051, 868,  824,  2683 },
-  { -2620, -3631, -4548, -885, 629, 523,  -528, -2178, -1743, 1644, 353,
-    -2687, -3041, -1722, 283,  178, 1594, 1190, 968,   -386,  2305, 1317,
-    245,   1443,  968,   800,  471, 521,  1564, 669,   903,   243 },
-  { -1791, -3282, -4140, -1753, -1006, -374, 1027,  -176,  -1477, -891, 191,
-    -912,  497,   96,    359,   1045,  1467, 172,   1303,  2510,  3516, 3671,
-    789,   -807,  2670,  1483,  547,   -521, -1219, -1856, 1008,  1053 },
-  { -1427, -2698, -3949, -436, 801,  -614, -1548, 523,  -176, -683, 423,
-    -871,  820,   -2279, -143, 375,  768,  2306,  5249, 1302, -338, -396,
-    -1590, -608,  1469,  2344, -187, -693, 599,   -661, -458, 160 },
-  { -3491, -3877, -2952, 1252, 767,   -3037, -3638, 188, 587,  710,  1416,
-    1176,  -319,  -473,  1873, -1997, 725,   596,   -94, 1875, 2992, -519,
-    -139,  1938,  1025,  521,  760,   1090,  3648,  392, 564,  902 },
-  { -2186, -3264, -1742, 2634, -36,  -51,  -1253, -314, -908, -459, -1701,
-    -1437, -991,  84,    1265, -964, 402,  1454,  -772, -927, 1765, 1543,
-    484,   2346,  3310,  1887, 1754, 3058, 1474,  728,  -466, -1646 },
-  { -1826, -332, 48,   744,  -618, -97, -165, -155, -908,  -143, 1285,
-    1739,  1185, 885,  1134, -531, -15, -526, 543,  1438,  2026, 3022,
-    558,   1827, -139, 1792, 2022, 769, 2400, 444,  -1572, 598 },
-  { 165,  -357, 15,  666, 1315, 1155, 376,  -7,  991,  213,  1687,
-    -34,  452,  352, 203, 1605, 1484, -498, 581, 533,  467,  1744,
-    1315, 874,  82,  900, 1437, -692, -417, 456, -271, -1132 },
-  { 646, 210,   320,  1208, 145,  971,   396, -448, 557, 1876, -1791,
-    913, -1288, -452, 1015, 925,  -1197, -49, -285, 442, 1093, -410,
-    125, 519,   -52,  513,  1497, -1337, 298, -402, 820, 732 },
-  { -796, 627, -1017, 2972, 4463, 2331, 1387, 1496, 1796, 1608, 1681,
-    -877, 881, -160,  -581, -433, 949,  471,  307,  140,  -946, -597,
-    247,  650, 1143,  694,  10,   -682, 890,  409,  617,  810 },
-  { 1653, 4435,  2388,  294,  2578, 1229, 1072, 1871, 465,  1650, 1524,
-    -430, -1195, -3427, -116, 1117, 217,  967,  -254, 259,  -55,  1425,
-    1583, -1261, -1773, 1232, 2886, 646,  1346, 1518, 2090, -837 },
-  { 2020, 728,   2038,  316, 5725, 4193, 890,  1490, 584,  2705, 694,
-    -892, 34,    2041,  972, 332,  -295, -218, -756, 2193, 1672, 1440,
-    2310, -2136, -2204, 399, -753, 743,  3155, 2521, 3534, 166 },
-  { 824,  1664, 991,  853,  700,  -80,   148, -908, -194, -620, 1053,
-    -368, 1616, 1250, 1449, 3140, -1065, 286, 2226, -590, -570, -1131,
-    477,  -61,  -708, 519,  586,  1148,  898, 1653, 4697, 1581 },
-  { 2014, 1921, -210, 556,  686,  -561, -1239, -1345, -664,  -138, -215,
-    -343, 1019, 1294, 519,  -179, 212,  -299,  -2160, -1450, -329, 293,
-    691,  162,  -645, 1079, 2005, 1466, 1127,  2263,  730,   179 },
-  { 5629, 4670, 597,  2030, 3873, 3698, 54,   2714, 62,   352,   2177,
-    908,  1306, 1504, 1464, -288, -106, -69,  -179, -900, -1340, -4,
-    877,  487,  2606, 358,  2055, 1131, 1421, 931,  -477, 1173 },
-  { 757,  -493, 1510, 2513, 4514, 4649, -478, 2069, 124, -1186, 2855,
-    1906, 1420, 1738, 19,   1916, 1195, -519, 32,   512, 230,   528,
-    43,   -263, 1314, 1350, 137,  -256, 939,  256,  168, -201 },
-  { 663, 947,  699,  3239, 4730, 5279, 1739, 1659, 2774,  -1660, -1677,
-    185, 3745, 1319, 2347, 477,  364,  531,  608,  -520,  -783,  -123,
-    -59, -345, 1202, 1766, 88,   883,  654,  1399, -1082, 658 },
-  { 4534, 5694, 5332, 4909, 4828, 4761, 7376, 3834, 2327, 4737, 7135,
-    5306, 6337, 5240, 5578, 4321, 2107, -205, 1387, 597,  1112, 904,
-    1567, 610,  461,  371,  250,  602,  358,  1807, -617, -59 },
-  { 6124, 8363, 9624, 5674, 7043, 4437, 3846, 3121, 3477, 2818, 5445,
-    3618, 5067, 3996, 5759, 7185, 2150, 785,  1581, 2084, 3321, 4828,
-    -545, 510,  2309, 2501, 1594, 2028, 528,  113,  248,  550 },
-  { 8154,  9890, 6292, 6421, 8295, 4403, 7503, 5496, 7256, 3699, 2845,
-    3725,  5365, 5905, 7170, 2903, 733,  4614, 3856, 4346, 7099, -902,
-    -1492, 1703, 2321, 1842, 3488, 1690, 982,  524,  -467, -687 },
-  { 5338, 10331, 7754, 7014, 3581, 5660, 5471, 5420, 3976, 2548, 6486,
-    9144, 6584,  5442, 6795, 4845, 5182, 2855, 8246, 3660, 5417, 1845,
-    1803, 288,   1434, 639,  1404, 2752, 923,  1055, 741,  -984 },
-  { 4457, 7110, 5195, 5959, 6818, 8562, 5548, 2071, 5544, 8734, 7080,
-    4737, 9481, 7672, 8374, 7638, 4204, 3562, 3758, 3598, 5016, 2863,
-    3927, 5001, 4677, 4444, 2481, 1773, 2525, 3142, 4840, 3965 },
-  { 1134, 3249, 4702, 5483, 4471, 7234, 7281, 6240, 5891, 7577, 3826,
-    5886, 4798, 7117, 6319, 7264, 4115, 5613, 4674, 4999, 4518, 2501,
-    6830, 4913, 2356, 789,  1926, 2190, 1914, 1434, 987,  1761 }
-};
-static int16_t default_ncobmc_krnl_2_1_2[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 6131, 7769, 6548, 6297, 4967, 4708, 3127, 5937, 697,  748,  1850,
-    2290, 2945, -80,  216,  377,  318,  1009, 2112, 2962, -886, 849,
-    510,  4160, 2257, 2875, 4589, 5345, 7363, 5350, 6815, 1644 },
-  { 6949, 8044, 7295, 7318, 3142, 2084, 1819, 3048, 1654, 1831, 1344,
-    3344, 2065, 2889, -88,  3746, 696,  1143, 232,  1444, 1587, 4125,
-    3991, 3840, 5642, 4933, 3560, 6540, 5865, 6663, 6729, 5520 },
-  { 7816, 4894, 7089, 7533, 4271, 6814, 1972, 3845,  3755, 3498, 3571,
-    1884, 3171, 1843, 70,   2358, 2622, 1241, 143,   2657, 3804, 2968,
-    1781, 262,  2864, 4345, 1302, 5434, 7815, 10560, 9211, 8202 },
-  { 10656, 7490, 8639, 7975, 4318, 7432, 6148,  3321, 3776, 2781, 3544,
-    246,   2350, 793,  1600, 1266, 2372, -1382, -983, 1926, 493,  447,
-    2275,  3510, 4789, 3766, 878,  2353, 3314,  6282, 5853, 3709 },
-  { 11083, 7270, 6211, 6170, 4927, 4198, 3939, 4605, 1734, 2009, 2950,
-    546,   722,  99,   550,  597,  2350, 41,   1314, 1148, -183, 1143,
-    5392,  3550, 3102, 1161, -556, 1700, 7598, 8412, 6019, 9654 },
-  { 10358, 7350, 6589, 5975, 3587, 6201, 4603, 3974, 2262, 886,  1815,
-    1899,  1642, 2894, 1557, 228,  1625, 1879, 838,  182,  919,  1168,
-    3272,  1155, 889,  2292, 128,  4478, 5205, 7668, 8767, 10921 },
-  { 8569, 4702, 5397, 5147, 2577, 4301, 2139, 1630, 721,  1721, -218,
-    1595, 275,  1133, 1051, -777, 1556, -245, 972,  106,  2205, 385,
-    1410, 366,  3348, 2139, -164, 3111, 2656, 5036, 6021, 4847 },
-  { 7654, 5535, 5975, 4580, 3005, 5483, 4637, 5560, 6252, 4946, 4508,
-    3600, 1824, 1528, 338,  131,  1290, 309,  344,  3110, 3607, 2484,
-    1062, 1267, 1426, -860, 1155, 6137, 2415, 5482, 6846, 4916 },
-  { 8060,  5296,  4396, 2040, 867,  1189, 3555, 3397, 3438, 664,  -1931,
-    -1938, -1414, 1317, 762,  -312, -655, -801, -243, 2795, 1663, 1314,
-    1478,  2856,  562,  1075, 3211, 7482, 2988, 3880, 4156, 3289 },
-  { 8146, 7596, 7056,  7622, 5755, 7181, 7862, 4736, 4932, 3146, 1043,
-    -422, -813, -2152, 1444, 441,  3599, 395,  2173, 755,  4245, 3047,
-    1545, 1062, 1159,  1621, 209,  6521, 7385, 7730, 6511, 8959 },
-  { 9567,  8044, 7535, 6969, 3284, 4284, 4734, 4758, 5177, 2342, 230,
-    -1852, -839, -769, 222,  255,  -315, -16,  1101, -28,  3561, 2004,
-    -260,  789,  1856, 1960, 4962, 4207, 2425, 8406, 6771, 7796 },
-  { 8019,  7612,  8357,  5521, 4711,  3374, 4391, 7093, 5013, 3608, 238,
-    -1564, -1662, -1373, -198, -1045, 100,  2694, 1251, 489,  2110, 1670,
-    188,   -1362, 953,   2340, 3361,  3595, 6405, 7676, 1634, 7730 },
-  { 10177, 6488, 5822, 5121, 2615,  2725, 3372, 4849, 2232, 2548, 2841,
-    874,   895,  307,  1293, -150,  411,  -981, -815, -24,  936,  -2339,
-    254,   3019, 5892, 4302, -2171, 6747, 7198, 5638, 4832, 9538 },
-  { 7260,  9945, 2818, 1106, 6179, 6331, 5106, 1814, 5997, 4045, 1456,
-    -230,  297,  1045, 1918, -126, 752,  1014, 999,  -506, 198,  -732,
-    -1900, 139,  749,  3999, 5614, 5241, 6339, 8316, 3673, 7681 },
-  { 11101, 6954, 7475,  5729, 4242, 6118, 4569, 2348, 5307, 3762, 2933,
-    -1610, 988,  -1178, -104, -151, -507, 491,  -906, 1236, 3075, 1525,
-    1631,  2901, 2758,  1303, 1578, 6405, 3807, 7189, 8468, 9262 },
-  { 6835, 4602, 5501, 5568, 4338, 6143, 4304, 3557, 3258, 3797, 1242,
-    968,  1683, -251, 1218, 301,  1257, 1924, 985,  1251, 3051, 433,
-    1756, 167,  -660, 3884, 3450, 7202, 6544, 5184, 7556, 9366 },
-  { 5991, 6762, 3854, 4856, 6714, 5701, 4072, 2489, 422,  -365, 1488,
-    1660, 725,  1157, -778, 654,  313,  -18,  3162, 3065, 2925, 2391,
-    827,  5547, 461,  2487, 1492, 5810, 7042, 5284, 3995, 6870 },
-  { 6435, 8283, 4732, 5896, 5599, 4229, 4798, 3309, 3128, 941,  2565,
-    394,  257,  2477, 721,  1494, 3161, 1409, 1306, 2534, 1261, 2719,
-    756,  4388, 570,  5416, 3719, 6067, 4092, 2565, 6299, 10504 },
-  { 6042, 7417, 5391, 4671, 3245, 7547,  3777,  3203, 2044, 583,  2083,
-    1971, 1721, 1948, -169, 1197, -1141, -480,  2155, 1033, 1313, 268,
-    1857, 4493, 3083, 2005, 5347, 4397,  10144, 4828, 6622, 9817 },
-  { 7202, 5045, 6601, 6937, 3704, 5796, 5061, 3575, 2383, 1389, 3111,
-    1751, 1603, 2813, 174,  706,  -569, 2620, 1735, 1418, 1871, -1542,
-    168,  2156, 5107, 6329, 4968, 7018, 6279, 6864, 5898, 9157 },
-  { 5722, 5683, 4189, 4814, 2883, 5508, 5100, 1625, 2169, 3680, 1884,
-    2109, 462,  1145, 334,  515,  191,  441,  1058, 917,  1528, -96,
-    1843, 5395, 4498, 5681, 4193, 5196, 8356, 5303, 7262, 10141 },
-  { 5879, 5779,  7257, 3873, 6911, 6238, 5672,  3583, 3261, 3048, 2536,
-    -310, -1046, -69,  -660, 417,  -719, -2058, 1740, 888,  2746, 1367,
-    1668, 1090,  1830, 1153, 5047, 7336, 3380,  7160, 4422, 9401 },
-  { 7809, 7945, 8385, 8535, 7803, 3953, 5065, 3185,  2013,  1659, 1648,
-    769,  292,  -135, 114,  -579, 713,  1407, -1181, 1569,  3525, 5630,
-    219,  3518, 3739, 3432, 7282, 6357, 619,  5779,  10116, 6448 },
-  { 9496,  7224, 5342, 5960, 5092,  4225, 4353, 3995, 3631, 1662, 1413,
-    762,   534,  126,  -551, -1025, 2327, 602,  -452, 1285, 2103, 2579,
-    -1369, 2724, 6353, 3925, 4631,  9139, 4974, 6630, 7755, 4125 },
-  { 5226, 7729, 5768,  5815, 4531, 2948, 3029,  2603, 2549, 1366, 119,
-    405,  21,   -1831, -327, -287, -415, -1317, -214, 3017, 1586, 2436,
-    868,  1094, 290,   668,  2117, 756,  1228,  2700, 5743, 8052 },
-  { 6262, 5531, 4454, 4616, 3913, 2022, 4240, 2241, 4201, 2506, 1810,
-    628,  -496, -779, -471, 394,  756,  1666, -445, 490,  575,  -478,
-    894,  1182, 822,  626,  1782, 1781, 5333, 5482, 1760, 8187 },
-  { 6488,  6875,  4960, 6837,  4564, 1871, 390,  2940, 4330, 1634, 131,
-    -1102, -1451, -928, -1067, -419, -614, -2,   1017, 1066, 1051, 917,
-    1097,  844,   465,  513,   2377, 1031, 3548, 5088, 4516, 10564 },
-  { 6497, 6047,  5649, 7156, 4974, 3683, 2875, 4421, 1502, 1244, 668,
-    -30,  -1465, -59,  -399, -721, 954,  -281, -2,   664,  1039, 814,
-    758,  1911,  319,  4247, 1848, 1606, 2536, 2189, 1372, 7759 },
-  { 5994, 5659,  6777, 6693, 4758, 2986, 1463, 1186, 2116, -166, 499,
-    73,   -1151, -164, 279,  -895, -169, 339,  1194, 1772, 752,  1649,
-    1696, -2615, 1581, 1740, 1789, 1832, 1899, 510,  2135, 7149 },
-  { 9107,  4250, 5418, 4334,  613,   2618, 3395, 4809, 1724, 873, -78,
-    -1146, -431, -547, -1104, -1128, -6,   -290, 945,  794,  564, 1670,
-    737,   4540, 1574, 6285,  2596,  2859, 1191, 1428, 5614, 8419 },
-  { 5905, 4490, 6470,  3636, 2119,  1731, 3532, 2461, 2391, 473,  176,
-    -562, 389,  -1300, -916, -1436, 371,  567,  1038, 866,  59,   195,
-    679,  -721, 2994,  3260, 1813,  1589, 850,  1982, 7410, 11546 },
-  { 7265, 8775, 6672, 6657, 6182, 3732, 3222, 4564, 2644, 790,  924,
-    -596, 628,  -681, -57,  -236, 103,  364,  603,  1420, 309,  787,
-    1257, 770,  2453, 3401, 1175, 434,  792,  4019, 8792, 11773 }
-};
-static int16_t default_ncobmc_krnl_2_1_3[MAX_SB_SIZE][MAX_SB_SIZE] = {
-  { 391,  -894, -939, 1155,  4362, 4297, 7296,  2684, 3758, 8010, 8044,
-    9041, 8748, 8816, 10796, 8701, 6840, 11306, 7814, 8456, 9952, 3511,
-    7870, 2227, 7018, 7148,  4672, 5660, 6657,  6007, 1098, 3866 },
-  { 2970, 945,  619,  1701, 4540, 3326,  7140,  8401,  6001, 5524, 6311,
-    5657, 5333, 9833, 7547, 8127, 10894, 14326, 12130, 8591, 8408, 5873,
-    7524, 6398, 7054, 6594, 9788, 8347,  8784,  9253,  8154, 6170 },
-  { 3423, 6928,  5192, 5699, 5575,  6852,  8083,  7546,  8019, 8464, 8910,
-    9251, 11401, 8637, 9356, 9671,  10065, 12652, 12275, 9662, 9627, 5550,
-    9836, 10565, 9075, 9350, 11656, 8549,  8120,  4437,  5501, 6658 },
-  { 5859, 5714, 6766, 5830, 7266,  4208,  5956,  8173,  10615, 7557,  10533,
-    8101, 7530, 9292, 9312, 9603,  11268, 14896, 12761, 10435, 10584, 10602,
-    7945, 6677, 7798, 9184, 11805, 9688,  12921, 9831,  9425,  9409 },
-  { 5068,  7732,  8953,  7750,  6739,  7145,  7635,  7400,  9896,  11465, 12344,
-    14483, 13309, 11497, 10778, 11614, 13096, 11519, 12197, 13573, 14652, 12324,
-    7270,  8764,  10162, 11289, 13446, 10681, 7564,  7663,  7650,  3879 },
-  { 6073,  8775,  7134, 7485,  8815,  9982,  9893,  11182, 10807, 12415, 10385,
-    13211, 13198, 9974, 13590, 13229, 14029, 10733, 10710, 10950, 11286, 12150,
-    10133, 10858, 8958, 9903,  12033, 9177,  9756,  8710,  8055,  3108 },
-  { 8368,  10916, 7650,  6261,  8713,  10236, 12507, 10373, 12385, 11135, 11343,
-    12039, 12114, 14871, 13861, 13742, 11649, 13839, 13207, 13160, 11863, 11950,
-    12423, 10188, 7712,  8705,  11270, 12864, 13370, 11422, 7881,  7390 },
-  { 10805, 12233, 10301, 9238,  9352,  7871,  10959, 12870, 11641, 9692, 12373,
-    13839, 12380, 14055, 14653, 13348, 11227, 12844, 14769, 12714, 9815, 10484,
-    12966, 10123, 8644,  11791, 9911,  7598,  13225, 9539,  6774,  8055 },
-  { 7987,  9257,  6281,  7446,  8911,  10506, 7039,  9031,  9319,  10294, 13979,
-    15391, 14445, 11372, 14852, 14690, 14954, 14129, 16319, 13385, 10855, 12837,
-    13065, 10647, 12815, 13043, 9686,  7003,  12028, 10211, 10237, 11699 },
-  { 6073,  7893,  7571,  5698,  8244,  7305,  6581,  9719,  9746,  11432, 12215,
-    16346, 17408, 17379, 13508, 14637, 10471, 13204, 13089, 13632, 10135, 12397,
-    12431, 13511, 13140, 13999, 14081, 10639, 7173,  7807,  9433,  4659 },
-  { 6634,  10941, 11920, 9920,  11356, 10608, 10624, 12593, 11330, 11413, 13971,
-    18455, 16400, 16654, 15373, 16023, 15144, 15413, 14357, 16626, 10718, 12841,
-    16053, 14104, 13496, 13334, 10605, 11490, 12221, 6956,  9178,  8213 },
-  { 7366,  9121,  9253,  11198, 9839,  11458, 10864, 8319,  12656, 12437, 13128,
-    15378, 14565, 16278, 15940, 14457, 15156, 13972, 14035, 13587, 10888, 11376,
-    15176, 18483, 13236, 12754, 12347, 13247, 11785, 10432, 13455, 7419 },
-  { 7665,  10318, 12372, 11702, 11166, 12470, 11859, 10983, 12921, 13947, 12106,
-    14300, 13037, 17367, 14444, 15259, 15107, 14974, 11715, 14835, 15525, 18775,
-    17479, 13835, 9101,  10034, 18554, 10201, 8666,  11181, 11767, 6530 },
-  { 11169, 7696,  11879, 11938, 10302, 13271, 12067, 13360, 9715,  12528, 13879,
-    15312, 17012, 15194, 12951, 17211, 14989, 14796, 15695, 14942, 13140, 17003,
-    18104, 14131, 14490, 11607, 9697,  10346, 6890,  7337,  12248, 7668 },
-  { 7494,  9902,  9327,  10081, 9955,  10895, 12521, 13971, 11975, 12950, 13579,
-    19214, 16537, 17208, 15292, 17698, 16633, 14485, 17676, 15920, 11698, 13314,
-    13747, 11163, 10360, 13396, 13119, 7073,  11331, 8217,  8258,  8754 },
-  { 9934,  11319, 10239, 9047,  11387, 10784, 12566, 13038, 13663, 12717, 14675,
-    14008, 14178, 15820, 14510, 16181, 15440, 15283, 15009, 13767, 11372, 13359,
-    14352, 14480, 17066, 10914, 11175, 8554,  7428,  10827, 10561, 6443 },
-  { 10016, 9986,  12912, 11133, 8475,  9995,  12150, 14006, 15182, 16531, 13117,
-    14634, 15313, 15598, 16928, 14269, 14814, 17080, 12532, 12849, 13261, 12479,
-    14442, 9716,  15960, 13029, 13398, 10927, 9854,  10849, 12580, 10547 },
-  { 9295,  7913,  11422, 9455,  10319, 11278, 11274, 13394, 13038, 13821, 15044,
-    14686, 17187, 14091, 14823, 14137, 14455, 15111, 15447, 13582, 14076, 14295,
-    15643, 11185, 16015, 10747, 11235, 11551, 12009, 13990, 8881,  5003 },
-  { 11095, 8615,  12138, 8821,  9239,  6419,  11207, 11937, 12556, 14236, 12501,
-    14976, 13740, 15006, 17876, 15826, 16800, 16761, 13880, 15072, 16296, 16857,
-    14333, 11125, 12310, 13605, 10932, 12928, 5472,  11185, 9435,  5957 },
-  { 7725,  6887,  7535,  8957,  9967,  9700,  10640, 10680, 13275, 12682, 11517,
-    15207, 15552, 17018, 16856, 14725, 16692, 12845, 14748, 14656, 14606, 16310,
-    14672, 15510, 13069, 9039,  8315,  8606,  8826,  8214,  8487,  7999 },
-  { 9071,  9686,  10375, 11046, 7539,  7106,  10540, 13531, 13747, 9927,  14071,
-    15876, 15935, 13026, 15104, 15296, 16773, 16198, 16098, 13165, 13227, 15002,
-    12319, 13015, 14240, 10673, 12818, 10497, 5016,  8298,  5706,  6088 },
-  { 9366,  8741,  8215,  11450, 8961,  10464, 10575, 13631, 13635, 13752, 12735,
-    17169, 16010, 15438, 15786, 13083, 18481, 17990, 12316, 16370, 13953, 16000,
-    14693, 15392, 15242, 15049, 10809, 7658,  12399, 7866,  7570,  5544 },
-  { 6903,  5972,  7864,  7864,  8655,  13231, 12904, 14949, 15064, 15007, 14738,
-    15847, 14769, 14910, 15543, 17103, 15630, 15115, 19594, 16319, 13352, 10936,
-    15453, 13064, 13305, 12008, 7408,  8514,  14898, 8171,  5583,  9657 },
-  { 1309,  4431,  10551, 8701,  8152,  8547,  11642, 9601,  12635, 14116, 12560,
-    14796, 14370, 14959, 15558, 17801, 14148, 16067, 16927, 16084, 15633, 13749,
-    16805, 13274, 7467,  12136, 9815,  6584,  10514, 9020,  9109,  10981 },
-  { 10778, 9464,  8877,  8157,  7779,  9056,  13584, 11871, 13714, 16259, 13305,
-    13956, 14785, 16328, 16541, 15199, 15586, 18478, 16668, 13019, 14279, 13814,
-    15684, 15613, 15050, 14345, 14327, 15869, 14316, 13744, 10738, 8497 },
-  { 9411,  9691,  11139, 8582,  8038,  9492,  10534, 12154, 9249,  16286, 16839,
-    15572, 13252, 16207, 14760, 15743, 15428, 14223, 15971, 16378, 16607, 16993,
-    15698, 15766, 14771, 13969, 14551, 13631, 10451, 9360,  15908, 7460 },
-  { 5565,  3814,  5832,  4698,  7091,  10412, 8442,  9852,  9831,  10137, 9167,
-    11864, 11520, 12092, 11930, 12431, 14914, 16568, 13978, 14847, 14215, 14290,
-    13812, 15033, 15711, 15541, 13908, 14681, 12577, 9266,  12542, 5718 },
-  { 3740,  2245,  1259,  3575,  4190,  8150,  9742,  8948,  11592, 12108, 10225,
-    12748, 12684, 12687, 11339, 10475, 13481, 15937, 14669, 13780, 12167, 11074,
-    16225, 14201, 13966, 9544,  12974, 12797, 13248, 13990, 14819, 7995 },
-  { 2296,  817,   3435,  3505,  3507,  9072,  7580,  10139, 7087,  12821, 13297,
-    12396, 12113, 10999, 9149,  14466, 15677, 11290, 11487, 10612, 8552,  15725,
-    16233, 17367, 12511, 13088, 10898, 12875, 13386, 15384, 14845, 9849 },
-  { 2320,  1714,  3209,  4858,  11853, 8126,  7775,  6246,  10834, 12812, 9996,
-    8379,  10020, 11558, 10914, 12851, 11272, 13723, 7409,  11919, 10393, 12987,
-    13756, 11382, 13258, 9754,  12513, 10697, 14356, 14065, 10023, 8748 },
-  { 5715,  4721,  4773,  6968, 7426,  6196,  7322,  11771, 8704,  7198,  8944,
-    12478, 6336,  10064, 9132, 10252, 11884, 12483, 11504, 12168, 11346, 13354,
-    11779, 12178, 8942,  8770, 11937, 13047, 12938, 11277, 4002,  710 },
-  { 7743,  4184,  5058,  4276,  5576,  5393,  5919,  5500,  7881, 8102,  11726,
-    10912, 10943, 10344, 10654, 9537,  12118, 10565, 11112, 9964, 11328, 13005,
-    8273,  10626, 11596, 12198, 13157, 13884, 13912, 10737, 6497, 2938 }
-};
-
-void get_default_ncobmc_kernels(AV1_COMMON *cm) {
-  av1_copy(cm->ncobmc_kernels[0][0].KERNEL[0], default_ncobmc_krnl_0_0_0);
-  av1_copy(cm->ncobmc_kernels[0][0].KERNEL[1], default_ncobmc_krnl_0_0_1);
-  av1_copy(cm->ncobmc_kernels[0][0].KERNEL[2], default_ncobmc_krnl_0_0_2);
-  av1_copy(cm->ncobmc_kernels[0][0].KERNEL[3], default_ncobmc_krnl_0_0_3);
-  av1_copy(cm->ncobmc_kernels[0][1].KERNEL[0], default_ncobmc_krnl_0_1_0);
-  av1_copy(cm->ncobmc_kernels[0][1].KERNEL[1], default_ncobmc_krnl_0_1_1);
-  av1_copy(cm->ncobmc_kernels[0][1].KERNEL[2], default_ncobmc_krnl_0_1_2);
-  av1_copy(cm->ncobmc_kernels[0][1].KERNEL[3], default_ncobmc_krnl_0_1_3);
-  av1_copy(cm->ncobmc_kernels[1][0].KERNEL[0], default_ncobmc_krnl_1_0_0);
-  av1_copy(cm->ncobmc_kernels[1][0].KERNEL[1], default_ncobmc_krnl_1_0_1);
-  av1_copy(cm->ncobmc_kernels[1][0].KERNEL[2], default_ncobmc_krnl_1_0_2);
-  av1_copy(cm->ncobmc_kernels[1][0].KERNEL[3], default_ncobmc_krnl_1_0_3);
-  av1_copy(cm->ncobmc_kernels[1][1].KERNEL[0], default_ncobmc_krnl_1_1_0);
-  av1_copy(cm->ncobmc_kernels[1][1].KERNEL[1], default_ncobmc_krnl_1_1_1);
-  av1_copy(cm->ncobmc_kernels[1][1].KERNEL[2], default_ncobmc_krnl_1_1_2);
-  av1_copy(cm->ncobmc_kernels[1][1].KERNEL[3], default_ncobmc_krnl_1_1_3);
-  av1_copy(cm->ncobmc_kernels[2][0].KERNEL[0], default_ncobmc_krnl_2_0_0);
-  av1_copy(cm->ncobmc_kernels[2][0].KERNEL[1], default_ncobmc_krnl_2_0_1);
-  av1_copy(cm->ncobmc_kernels[2][0].KERNEL[2], default_ncobmc_krnl_2_0_2);
-  av1_copy(cm->ncobmc_kernels[2][0].KERNEL[3], default_ncobmc_krnl_2_0_3);
-  av1_copy(cm->ncobmc_kernels[2][1].KERNEL[0], default_ncobmc_krnl_2_1_0);
-  av1_copy(cm->ncobmc_kernels[2][1].KERNEL[1], default_ncobmc_krnl_2_1_1);
-  av1_copy(cm->ncobmc_kernels[2][1].KERNEL[2], default_ncobmc_krnl_2_1_2);
-  av1_copy(cm->ncobmc_kernels[2][1].KERNEL[3], default_ncobmc_krnl_2_1_3);
-}
diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h
index f3940490f..3918c82c6 100644
--- a/third_party/aom/av1/common/obmc.h
+++ b/third_party/aom/av1/common/obmc.h
@@ -12,31 +12,31 @@
 #ifndef AV1_COMMON_OBMC_H_
 #define AV1_COMMON_OBMC_H_
 
-#if CONFIG_MOTION_VAR
 typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos,
-                                          uint8_t nb_mi_size, MODE_INFO *nb_mi,
-                                          void *fun_ctxt);
+                                          uint8_t nb_mi_size,
+                                          MB_MODE_INFO *nb_mi, void *fun_ctxt,
+                                          const int num_planes);
 
 static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
                                                  MACROBLOCKD *xd, int mi_col,
                                                  int nb_max,
                                                  overlappable_nb_visitor_t fun,
                                                  void *fun_ctxt) {
+  const int num_planes = av1_num_planes(cm);
   if (!xd->up_available) return;
 
   int nb_count = 0;
 
   // prev_row_mi points into the mi array, starting at the beginning of the
   // previous row.
-  MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
+  MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
   const int end_col = AOMMIN(mi_col + xd->n8_w, cm->mi_cols);
   uint8_t mi_step;
   for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
        above_mi_col += mi_step) {
-    MODE_INFO **above_mi = prev_row_mi + above_mi_col;
-    mi_step = AOMMIN(mi_size_wide[above_mi[0]->mbmi.sb_type],
-                     mi_size_wide[BLOCK_64X64]);
-#if CONFIG_CHROMA_SUB8X8
+    MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
+    mi_step =
+        AOMMIN(mi_size_wide[above_mi[0]->sb_type], mi_size_wide[BLOCK_64X64]);
     // If we're considering a block with width 4, it should be treated as
     // half of a pair of blocks with chroma information in the second. Move
     // above_mi_col back to the start of the pair if needed, set above_mbmi
@@ -47,12 +47,10 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
       above_mi = prev_row_mi + above_mi_col + 1;
       mi_step = 2;
     }
-#endif  // CONFIG_CHROMA_SUB8X8
-    MB_MODE_INFO *above_mbmi = &above_mi[0]->mbmi;
-    if (is_neighbor_overlappable(above_mbmi)) {
+    if (is_neighbor_overlappable(*above_mi)) {
       ++nb_count;
       fun(xd, above_mi_col - mi_col, AOMMIN(xd->n8_w, mi_step), *above_mi,
-          fun_ctxt);
+          fun_ctxt, num_planes);
     }
   }
 }
@@ -62,35 +60,32 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
                                                 int nb_max,
                                                 overlappable_nb_visitor_t fun,
                                                 void *fun_ctxt) {
+  const int num_planes = av1_num_planes(cm);
   if (!xd->left_available) return;
 
   int nb_count = 0;
 
   // prev_col_mi points into the mi array, starting at the top of the
   // previous column
-  MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
+  MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
   const int end_row = AOMMIN(mi_row + xd->n8_h, cm->mi_rows);
   uint8_t mi_step;
   for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
        left_mi_row += mi_step) {
-    MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
-    mi_step = AOMMIN(mi_size_high[left_mi[0]->mbmi.sb_type],
-                     mi_size_high[BLOCK_64X64]);
-#if CONFIG_CHROMA_SUB8X8
+    MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
+    mi_step =
+        AOMMIN(mi_size_high[left_mi[0]->sb_type], mi_size_high[BLOCK_64X64]);
     if (mi_step == 1) {
       left_mi_row &= ~1;
       left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
       mi_step = 2;
     }
-#endif  // CONFIG_CHROMA_SUB8X8
-    MB_MODE_INFO *left_mbmi = &left_mi[0]->mbmi;
-    if (is_neighbor_overlappable(left_mbmi)) {
+    if (is_neighbor_overlappable(*left_mi)) {
       ++nb_count;
       fun(xd, left_mi_row - mi_row, AOMMIN(xd->n8_h, mi_step), *left_mi,
-          fun_ctxt);
+          fun_ctxt, num_planes);
     }
   }
 }
 
-#endif  // CONFIG_MOTION_VAR
 #endif  // AV1_COMMON_OBMC_H_
diff --git a/third_party/aom/av1/common/odintrin.c b/third_party/aom/av1/common/odintrin.c
index 868efacc9..7584b2e52 100644
--- a/third_party/aom/av1/common/odintrin.c
+++ b/third_party/aom/av1/common/odintrin.c
@@ -13,16 +13,6 @@
 
 #include "av1/common/odintrin.h"
 
-#if defined(OD_ENABLE_ASSERTIONS)
-# include <stdio.h>
-
-void od_fatal_impl(const char *_str, const char *_file, int _line) {
-  fprintf(stderr, "Fatal (internal) error in %s, line %d: %s\n",
-   _file, _line, _str);
-  abort();
-}
-#endif
-
 /*Constants for use with OD_DIVU_SMALL().
   See \cite{Rob05} for details on computing these constants.
   @INPROCEEDINGS{Rob05,
diff --git a/third_party/aom/av1/common/odintrin.h b/third_party/aom/av1/common/odintrin.h
index a50c456c1..e87c5a0bf 100644
--- a/third_party/aom/av1/common/odintrin.h
+++ b/third_party/aom/av1/common/odintrin.h
@@ -14,10 +14,6 @@
 #ifndef AV1_COMMON_ODINTRIN_H_
 #define AV1_COMMON_ODINTRIN_H_
 
-#if defined(_MSC_VER)
-# define _USE_MATH_DEFINES
-#endif
-#include <math.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -30,71 +26,8 @@
 extern "C" {
 #endif
 
-# if !defined(M_PI)
-#  define M_PI      (3.1415926535897932384626433832795)
-# endif
-
-# if !defined(M_SQRT2)
-#  define M_SQRT2 (1.41421356237309504880168872420970)
-# endif
-
-# if !defined(M_SQRT1_2)
-#  define M_SQRT1_2 (0.70710678118654752440084436210485)
-# endif
-
-# if !defined(M_LOG2E)
-#  define M_LOG2E (1.4426950408889634073599246810019)
-# endif
-
-# if !defined(M_LN2)
-#  define M_LN2 (0.69314718055994530941723212145818)
-# endif
-
-/*Smallest blocks are 4x4*/
-#define OD_LOG_BSIZE0 (2)
-/*There are 5 block sizes total (4x4, 8x8, 16x16, 32x32 and 64x64).*/
-#define OD_NBSIZES (5)
-
-/*There are 4 transform sizes total in AV1 (4x4, 8x8, 16x16 and 32x32).*/
-#define OD_TXSIZES TX_SIZES
-/*The log of the maximum length of the side of a transform.*/
-#define OD_LOG_TXSIZE_MAX (OD_LOG_BSIZE0 + OD_TXSIZES - 1)
-/*The maximum length of the side of a transform.*/
-#define OD_TXSIZE_MAX (1 << OD_LOG_TXSIZE_MAX)
-
-/**The maximum number of color planes allowed in a single frame.*/
-# define OD_NPLANES_MAX (3)
-
-# define OD_COEFF_SHIFT (4)
-
-# define OD_DISABLE_CFL (1)
-# define OD_DISABLE_FILTER (1)
-
-#if !defined(NDEBUG)
-# define OD_ENABLE_ASSERTIONS (1)
-#endif
-
-# define OD_LOG(a)
-# define OD_LOG_PARTIAL(a)
-
-/*Possible block sizes, note that OD_BLOCK_NXN = log2(N) - 2.*/
-#define OD_BLOCK_4X4 (0)
-#define OD_BLOCK_8X8 (1)
-#define OD_BLOCK_16X16 (2)
-#define OD_BLOCK_32X32 (3)
-#define OD_BLOCK_SIZES (OD_BLOCK_32X32 + 1)
-
-# define OD_LIMIT_BSIZE_MIN (OD_BLOCK_4X4)
-# define OD_LIMIT_BSIZE_MAX (OD_BLOCK_32X32)
-
 typedef int od_coeff;
 
-/*This is the strength reduced version of ((_a)/(1 << (_b))).
-  This will not work for _b == 0, however currently this is only used for
-   b == 1 anyway.*/
-# define OD_UNBIASED_RSHIFT32(_a, _b) \
-  (((int32_t)(((uint32_t)(_a) >> (32 - (_b))) + (_a))) >> (_b))
-
 #define OD_DIVU_DMAX (1024)
 
 extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
@@ -116,14 +49,6 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
 #define OD_CLZ0 (1)
 #define OD_CLZ(x) (-get_msb(x))
 #define OD_ILOG_NZ(x) (OD_CLZ0 - OD_CLZ(x))
-/*Note that __builtin_clz is not defined when x == 0, according to the gcc
-   documentation (and that of the x86 BSR instruction that implements it), so
-   we have to special-case it.
-  We define a special version of the macro to use when x can be zero.*/
-#define OD_ILOG(x) ((x) ? OD_ILOG_NZ(x) : 0)
-
-#define OD_LOG2(x) (M_LOG2E*log(x))
-#define OD_EXP2(x) (exp(M_LN2*(x)))
 
 /*Enable special features for gcc and compatible compilers.*/
 #if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
@@ -146,36 +71,6 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
 #define OD_ARG_NONNULL(x)
 #endif
 
-#if defined(OD_ENABLE_ASSERTIONS)
-#if OD_GNUC_PREREQ(2, 5, 0)
-__attribute__((noreturn))
-#endif
-void od_fatal_impl(const char *_str, const char *_file, int _line);
-
-#define OD_FATAL(_str) (od_fatal_impl(_str, __FILE__, __LINE__))
-
-#define OD_ASSERT(_cond)                     \
-  do {                                       \
-    if (!(_cond)) {                          \
-      OD_FATAL("assertion failed: " #_cond); \
-    }                                        \
-  } while (0)
-
-#define OD_ASSERT2(_cond, _message)                        \
-  do {                                                     \
-    if (!(_cond)) {                                        \
-      OD_FATAL("assertion failed: " #_cond "\n" _message); \
-    }                                                      \
-  } while (0)
-
-#define OD_ALWAYS_TRUE(_cond) OD_ASSERT(_cond)
-
-#else
-#define OD_ASSERT(_cond)
-#define OD_ASSERT2(_cond, _message)
-#define OD_ALWAYS_TRUE(_cond) ((void)(_cond))
-#endif
-
 /** Copy n elements of memory from src to dst. The 0* term provides
     compile-time type checking  */
 #if !defined(OVERRIDE_OD_COPY)
@@ -190,85 +85,10 @@ void od_fatal_impl(const char *_str, const char *_file, int _line);
  (memmove((dst), (src), sizeof(*(dst))*(n) + 0*((dst) - (src)) ))
 #endif
 
-/** Linkage will break without this if using a C++ compiler, and will issue
- * warnings without this for a C compiler*/
-#if defined(__cplusplus)
-# define OD_EXTERN extern
-#else
-# define OD_EXTERN
-#endif
-
-/** Set n elements of dst to zero */
-#if !defined(OVERRIDE_OD_CLEAR)
-# define OD_CLEAR(dst, n) (memset((dst), 0, sizeof(*(dst))*(n)))
-#endif
-
-/** Silence unused parameter/variable warnings */
-# define OD_UNUSED(expr) (void)(expr)
-
-#if defined(OD_FLOAT_PVQ)
-typedef double od_val16;
-typedef double od_val32;
-# define OD_QCONST32(x, bits) (x)
-# define OD_ROUND16(x) (x)
-# define OD_ROUND32(x) (x)
-# define OD_SHL(x, shift) (x)
-# define OD_SHR(x, shift) (x)
-# define OD_SHR_ROUND(x, shift) (x)
-# define OD_ABS(x) (fabs(x))
-# define OD_MULT16_16(a, b) ((a)*(b))
-# define OD_MULT16_32_Q16(a, b) ((a)*(b))
-#else
-typedef int16_t od_val16;
-typedef int32_t od_val32;
-/** Compile-time conversion of float constant to 32-bit value */
-# define OD_QCONST32(x, bits) ((od_val32)(.5 + (x)*(((od_val32)1) << (bits))))
-# define OD_ROUND16(x) (int16_t)(floor(.5 + (x)))
-# define OD_ROUND32(x) (int32_t)(floor(.5 + (x)))
-/*Shift x left by shift*/
-# define OD_SHL(a, shift) ((int32_t)((uint32_t)(a) << (shift)))
-/*Shift x right by shift (without rounding)*/
-# define OD_SHR(x, shift) \
-  ((int32_t)((x) >> (shift)))
-/*Shift x right by shift (with rounding)*/
-# define OD_SHR_ROUND(x, shift) \
-  ((int32_t)(((x) + (1 << (shift) >> 1)) >> (shift)))
-/*Shift x right by shift (without rounding) or left by -shift if shift
-  is negative.*/
-# define OD_VSHR(x, shift) \
-  (((shift) > 0) ? OD_SHR(x, shift) : OD_SHL(x, -(shift)))
-/*Shift x right by shift (with rounding) or left by -shift if shift
-  is negative.*/
-# define OD_VSHR_ROUND(x, shift) \
-  (((shift) > 0) ? OD_SHR_ROUND(x, shift) : OD_SHL(x, -(shift)))
-# define OD_ABS(x) (abs(x))
-/* (od_val32)(od_val16) gives TI compiler a hint that it's 16x16->32 multiply */
-/** 16x16 multiplication where the result fits in 32 bits */
-# define OD_MULT16_16(a, b) \
- (((od_val32)(od_val16)(a))*((od_val32)(od_val16)(b)))
-/* Multiplies 16-bit a by 32-bit b and keeps bits [16:47]. */
-# define OD_MULT16_32_Q16(a, b) ((int16_t)(a)*(int64_t)(int32_t)(b) >> 16)
-/*16x16 multiplication where the result fits in 16 bits, without rounding.*/
-# define OD_MULT16_16_Q15(a, b) \
-  (((int16_t)(a)*((int32_t)(int16_t)(b))) >> 15)
-/*16x16 multiplication where the result fits in 16 bits, without rounding.*/
-# define OD_MULT16_16_Q16(a, b) \
-  ((((int16_t)(a))*((int32_t)(int16_t)(b))) >> 16)
-#endif
-
 /*All of these macros should expect floats as arguments.*/
-/*These two should compile as a single SSE instruction.*/
-# define OD_MINF(a, b) ((a) < (b) ? (a) : (b))
-# define OD_MAXF(a, b) ((a) > (b) ? (a) : (b))
-
-# define OD_DIV_R0(x, y) (((x) + OD_FLIPSIGNI((((y) + 1) >> 1) - 1, (x)))/(y))
-
 # define OD_SIGNMASK(a) (-((a) < 0))
 # define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
 
-# define OD_MULT16_16_Q15(a, b) \
-  (((int16_t)(a)*((int32_t)(int16_t)(b))) >> 15)
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h
index 2396ce2f3..fa5f02e52 100644
--- a/third_party/aom/av1/common/onyxc_int.h
+++ b/third_party/aom/av1/common/onyxc_int.h
@@ -12,76 +12,72 @@
 #ifndef AV1_COMMON_ONYXC_INT_H_
 #define AV1_COMMON_ONYXC_INT_H_
 
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
 #include "aom/internal/aom_codec_internal.h"
 #include "aom_util/aom_thread.h"
-#if CONFIG_ANS
-#include "aom_dsp/ans.h"
-#endif
 #include "av1/common/alloccommon.h"
 #include "av1/common/av1_loopfilter.h"
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
 #include "av1/common/entropymv.h"
+#include "av1/common/enums.h"
 #include "av1/common/frame_buffers.h"
 #include "av1/common/mv.h"
 #include "av1/common/quant_common.h"
-#if CONFIG_LOOP_RESTORATION
 #include "av1/common/restoration.h"
-#endif  // CONFIG_LOOP_RESTORATION
 #include "av1/common/tile_common.h"
+#include "av1/common/timing.h"
 #include "av1/common/odintrin.h"
-#if CONFIG_PVQ
-#include "av1/common/pvq.h"
-#endif
-#if CONFIG_CFL
-#include "av1/common/cfl.h"
-#endif
-#if CONFIG_HASH_ME
-// TODO(youzhou@microsoft.com): Encoder only. Move it out of common
 #include "av1/encoder/hash_motion.h"
-#endif
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_dsp/grain_table.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define CDEF_MAX_STRENGTHS 16
+#if defined(__clang__) && defined(__has_warning)
+#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
+#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]]  // NOLINT
+#endif
+#elif defined(__GNUC__) && __GNUC__ >= 7
+#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough))  // NOLINT
+#endif
 
-#define REF_FRAMES_LOG2 3
-#define REF_FRAMES (1 << REF_FRAMES_LOG2)
+#ifndef AOM_FALLTHROUGH_INTENDED
+#define AOM_FALLTHROUGH_INTENDED \
+  do {                           \
+  } while (0)
+#endif
 
-// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
-// in parallel, 3 for scaled references on the encoder.
-// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
-// of framebuffers.
-// TODO(jkoleszar): These 3 extra references could probably come from the
-// normal reference pool.
-#define FRAME_BUFFERS (REF_FRAMES + 7)
+#define CDEF_MAX_STRENGTHS 16
 
-#if CONFIG_REFERENCE_BUFFER
 /* Constant values while waiting for the sequence header */
-#define FRAME_ID_NUMBERS_PRESENT_FLAG 1
-#define FRAME_ID_LENGTH_MINUS7 8         // Allows frame id up to 2^15-1
-#define DELTA_FRAME_ID_LENGTH_MINUS2 12  // Allows frame id deltas up to 2^14-1
-#endif                                   // CONFIG_REFERENCE_BUFFER
+#define FRAME_ID_LENGTH 15
+#define DELTA_FRAME_ID_LENGTH 14
 
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
 #define FRAME_CONTEXTS (FRAME_BUFFERS + 1)
 // Extra frame context which is always kept at default values
 #define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1)
-#else
+#define PRIMARY_REF_BITS 3
+#define PRIMARY_REF_NONE 7
 
-#if CONFIG_EXT_REFS
-#define FRAME_CONTEXTS_LOG2 3
-#else
-#define FRAME_CONTEXTS_LOG2 2
-#endif
+#define NUM_PING_PONG_BUFFERS 2
 
-#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
-#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
+#define MAX_NUM_TEMPORAL_LAYERS 8
+#define MAX_NUM_SPATIAL_LAYERS 4
+/* clang-format off */
+// clang-format seems to think this is a pointer dereference and not a
+// multiplication.
+#define MAX_NUM_OPERATING_POINTS \
+  MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS
+/* clang-format on*/
 
-#define NUM_PING_PONG_BUFFERS 2
+// TODO(jingning): Turning this on to set up transform coefficient
+// processing timer.
+#define TXCOEFF_TIMER 0
+#define TXCOEFF_COST_TIMER 0
 
 typedef enum {
   SINGLE_REFERENCE = 0,
@@ -90,20 +86,11 @@ typedef enum {
   REFERENCE_MODES = 3,
 } REFERENCE_MODE;
 
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-typedef enum {
-  RESET_FRAME_CONTEXT_NONE = 0,
-  RESET_FRAME_CONTEXT_CURRENT = 1,
-  RESET_FRAME_CONTEXT_ALL = 2,
-} RESET_FRAME_CONTEXT_MODE;
-#endif
-
 typedef enum {
   /**
-   * Update frame context to values resulting from forward probability
-   * updates signaled in the frame header
+   * Frame context updates are disabled
    */
-  REFRESH_FRAME_CONTEXT_FORWARD,
+  REFRESH_FRAME_CONTEXT_DISABLED,
   /**
    * Update frame context to values resulting from backward probability
    * updates based on entropy/counts in the decoded frame
@@ -111,57 +98,41 @@ typedef enum {
   REFRESH_FRAME_CONTEXT_BACKWARD,
 } REFRESH_FRAME_CONTEXT_MODE;
 
-#if CONFIG_MFMV
-#define MFMV_STACK_SIZE INTER_REFS_PER_FRAME
-
+#define MFMV_STACK_SIZE 3
 typedef struct {
-  int_mv mfmv[INTER_REFS_PER_FRAME][MFMV_STACK_SIZE];
+  int_mv mfmv0;
+  uint8_t ref_frame_offset;
 } TPL_MV_REF;
-#endif
 
 typedef struct {
-  int_mv mv[2];
-  int_mv pred_mv[2];
-  MV_REFERENCE_FRAME ref_frame[2];
+  int_mv mv;
+  MV_REFERENCE_FRAME ref_frame;
 } MV_REF;
 
 typedef struct {
   int ref_count;
 
-#if CONFIG_FRAME_MARKER
-  int cur_frame_offset;
-  int lst_frame_offset;
-  int alt_frame_offset;
-  int gld_frame_offset;
-#if CONFIG_EXT_REFS
-  int lst2_frame_offset;
-  int lst3_frame_offset;
-  int bwd_frame_offset;
-  int alt2_frame_offset;
-#endif
-#endif  // CONFIG_FRAME_MARKER
+  unsigned int cur_frame_offset;
+  unsigned int ref_frame_offset[INTER_REFS_PER_FRAME];
 
-#if CONFIG_MFMV
-  TPL_MV_REF *tpl_mvs;
-#endif
   MV_REF *mvs;
+  uint8_t *seg_map;
+  struct segmentation seg;
   int mi_rows;
   int mi_cols;
   // Width and height give the size of the buffer (before any upscaling, unlike
   // the sizes that can be derived from the buf structure)
   int width;
   int height;
-#if CONFIG_GLOBAL_MOTION
-  WarpedMotionParams global_motion[TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_GLOBAL_MOTION
+  WarpedMotionParams global_motion[REF_FRAMES];
+  int showable_frame;  // frame can be used as show existing frame in future
+  int film_grain_params_present;
+  aom_film_grain_t film_grain_params;
   aom_codec_frame_buffer_t raw_frame_buffer;
   YV12_BUFFER_CONFIG buf;
-#if CONFIG_HASH_ME
   hash_table hash_table;
-#endif
-#if CONFIG_TEMPMV_SIGNALING
   uint8_t intra_only;
-#endif
+  FRAME_TYPE frame_type;
   // The Following variables will only be used in frame parallel decode.
 
   // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
@@ -173,6 +144,12 @@ typedef struct {
   // when the frame is fully decoded.
   int row;
   int col;
+
+  // Inter frame reference frame delta for loop filter
+  int8_t ref_deltas[REF_FRAMES];
+
+  // 0 = ZERO_MV, MV
+  int8_t mode_deltas[MAX_MODE_LF_DELTAS];
 } RefCntBuffer;
 
 typedef struct BufferPool {
@@ -195,28 +172,77 @@ typedef struct BufferPool {
   InternalFrameBufferList int_frame_buffers;
 } BufferPool;
 
-#if CONFIG_LV_MAP
 typedef struct {
-  int base_ctx_table[2 /*row*/][2 /*col*/][2 /*sig_map*/]
+  int base_ctx_table[2 /*row*/][2 /*col*/][3 /*sig_map*/]
                     [BASE_CONTEXT_POSITION_NUM + 1];
 } LV_MAP_CTX_TABLE;
-typedef int BASE_CTX_TABLE[2 /*col*/][2 /*sig_map*/]
+typedef int BASE_CTX_TABLE[2 /*col*/][3 /*sig_map*/]
                           [BASE_CONTEXT_POSITION_NUM + 1];
-#endif
 
-#if CONFIG_REFERENCE_BUFFER
+typedef struct BitstreamLevel {
+  uint8_t major;
+  uint8_t minor;
+} BitstreamLevel;
+
 /* Initial version of sequence header structure */
 typedef struct SequenceHeader {
+  int num_bits_width;
+  int num_bits_height;
+  int max_frame_width;
+  int max_frame_height;
   int frame_id_numbers_present_flag;
-  int frame_id_length_minus7;
-  int delta_frame_id_length_minus2;
+  int frame_id_length;
+  int delta_frame_id_length;
+  BLOCK_SIZE sb_size;  // Size of the superblock used for this frame
+  int mib_size;        // Size of the superblock in units of MI blocks
+  int mib_size_log2;   // Log 2 of above.
+  int order_hint_bits_minus_1;
+  int force_screen_content_tools;  // 0 - force off
+                                   // 1 - force on
+                                   // 2 - adaptive
+  int force_integer_mv;            // 0 - Not to force. MV can be in 1/4 or 1/8
+                                   // 1 - force to integer
+                                   // 2 - adaptive
+  int still_picture;               // Video is a single frame still picture
+  int reduced_still_picture_hdr;   // Use reduced header for still picture
+  int monochrome;                  // Monochorme video
+  int enable_filter_intra;         // enables/disables filterintra
+  int enable_intra_edge_filter;    // enables/disables corner/edge/upsampling
+  int enable_interintra_compound;  // enables/disables interintra_compound
+  int enable_masked_compound;      // enables/disables masked compound
+  int enable_dual_filter;          // 0 - disable dual interpolation filter
+                                   // 1 - enable vert/horiz filter selection
+  int enable_order_hint;           // 0 - disable order hint, and related tools
+                                   // jnt_comp, ref_frame_mvs, frame_sign_bias
+                                   // if 0, enable_jnt_comp and
+                                   // enable_ref_frame_mvs must be set zs 0.
+  int enable_jnt_comp;             // 0 - disable joint compound modes
+                                   // 1 - enable it
+  int enable_ref_frame_mvs;        // 0 - disable ref frame mvs
+                                   // 1 - enable it
+  int enable_warped_motion;        // 0 - disable warped motion for sequence
+                                   // 1 - enable it for the sequence
+  int enable_superres;     // 0 - Disable superres for the sequence, and disable
+                           //     transmitting per-frame superres enabled flag.
+                           // 1 - Enable superres for the sequence, and also
+                           //     enable per-frame flag to denote if superres is
+                           //     enabled for that frame.
+  int enable_cdef;         // To turn on/off CDEF
+  int enable_restoration;  // To turn on/off loop restoration
+  int operating_points_cnt_minus_1;
+  int operating_point_idc[MAX_NUM_OPERATING_POINTS];
+  int display_model_info_present_flag;
+  int decoder_model_info_present_flag;
+  BitstreamLevel level[MAX_NUM_OPERATING_POINTS];
+  uint8_t tier[MAX_NUM_OPERATING_POINTS];  // seq_tier in the spec. One bit: 0
+                                           // or 1.
 } SequenceHeader;
-#endif  // CONFIG_REFERENCE_BUFFER
 
 typedef struct AV1Common {
   struct aom_internal_error_info error;
-  aom_color_space_t color_space;
-  aom_transfer_function_t transfer_function;
+  aom_color_primaries_t color_primaries;
+  aom_transfer_characteristics_t transfer_characteristics;
+  aom_matrix_coefficients_t matrix_coefficients;
   aom_chroma_sample_position_t chroma_sample_position;
   int color_range;
   int width;
@@ -225,6 +251,14 @@ typedef struct AV1Common {
   int render_height;
   int last_width;
   int last_height;
+  int timing_info_present;
+  aom_timing_info_t timing_info;
+  int buffer_removal_delay_present;
+  aom_dec_model_info_t buffer_model;
+  aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
+  aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
+  int tu_presentation_delay_flag;
+  int64_t tu_presentation_delay;
 
   // TODO(jkoleszar): this implies chroma ss right now, but could vary per
   // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to
@@ -232,10 +266,15 @@ typedef struct AV1Common {
   int subsampling_x;
   int subsampling_y;
 
-#if CONFIG_HIGHBITDEPTH
+  int largest_tile_id;
+  size_t largest_tile_size;
+  int context_update_tile_id;
+
+  // Scale of the current frame with respect to itself.
+  struct scale_factors sf_identity;
+
   // Marks if we need to use 16bit frame buffers (1: yes, 0: no).
   int use_highbitdepth;
-#endif
   YV12_BUFFER_CONFIG *frame_to_show;
   RefCntBuffer *prev_frame;
 
@@ -253,6 +292,10 @@ typedef struct AV1Common {
 
   // Each Inter frame can reference INTER_REFS_PER_FRAME buffers
   RefBuffer frame_refs[INTER_REFS_PER_FRAME];
+  int is_skip_mode_allowed;
+  int skip_mode_flag;
+  int ref_frame_idx_0;
+  int ref_frame_idx_1;
 
   int new_fb_idx;
 
@@ -260,39 +303,26 @@ typedef struct AV1Common {
   FRAME_TYPE frame_type;
 
   int show_frame;
+  int showable_frame;  // frame can be used as show existing frame in future
   int last_show_frame;
   int show_existing_frame;
-#if CONFIG_EXT_REFS
   // Flag for a frame used as a reference - not written to the bitstream
   int is_reference_frame;
-#endif  // CONFIG_EXT_REFS
+  int reset_decoder_state;
 
   // Flag signaling that the frame is encoded using only INTRA modes.
   uint8_t intra_only;
   uint8_t last_intra_only;
-
+  uint8_t disable_cdf_update;
   int allow_high_precision_mv;
-#if CONFIG_AMVR
-  int seq_mv_precision_level;        // 0 the default in AOM, 1 only integer, 2
-                                     // adaptive
-  int cur_frame_mv_precision_level;  // 0 the default in AOM, 1 only integer
-#endif
+  int cur_frame_force_integer_mv;  // 0 the default in AOM, 1 only integer
 
   int allow_screen_content_tools;
-#if CONFIG_INTERINTRA
-  int allow_interintra_compound;
-#endif  // CONFIG_INTERINTRA
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-  int allow_masked_compound;
-#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  // Flag signaling which frame contexts should be reset to default values.
-  RESET_FRAME_CONTEXT_MODE reset_frame_context;
-#endif
+  int allow_intrabc;
+  int allow_warped_motion;
 
   // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
-  // MODE_INFO (8-pixel) units.
+  // MB_MODE_INFO (8-pixel) units.
   int MBs;
   int mb_rows, mi_rows;
   int mb_cols, mi_cols;
@@ -301,119 +331,120 @@ typedef struct AV1Common {
   /* profile settings */
   TX_MODE tx_mode;
 
+#if CONFIG_ENTROPY_STATS
+  int coef_cdf_category;
+#endif
+
   int base_qindex;
   int y_dc_delta_q;
-  int uv_dc_delta_q;
-  int uv_ac_delta_q;
-  int16_t y_dequant[MAX_SEGMENTS][2];
-  int16_t uv_dequant[MAX_SEGMENTS][2];
+  int u_dc_delta_q;
+  int v_dc_delta_q;
+  int u_ac_delta_q;
+  int v_ac_delta_q;
+
+  int separate_uv_delta_q;
+
+  // The dequantizers below are true dequntizers used only in the
+  // dequantization process.  They have the same coefficient
+  // shift/scale as TX.
+  int16_t y_dequant_QTX[MAX_SEGMENTS][2];
+  int16_t u_dequant_QTX[MAX_SEGMENTS][2];
+  int16_t v_dequant_QTX[MAX_SEGMENTS][2];
 
-#if CONFIG_AOM_QM
   // Global quant matrix tables
-  qm_val_t *giqmatrix[NUM_QM_LEVELS][2][2][TX_SIZES_ALL];
-  qm_val_t *gqmatrix[NUM_QM_LEVELS][2][2][TX_SIZES_ALL];
+  const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+  const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
 
   // Local quant matrix tables for each frame
-  qm_val_t *y_iqmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL];
-  qm_val_t *uv_iqmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL];
-  // Encoder
-  qm_val_t *y_qmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL];
-  qm_val_t *uv_qmatrix[MAX_SEGMENTS][2][TX_SIZES_ALL];
+  const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
 
+  // Encoder
   int using_qmatrix;
+  int qm_y;
+  int qm_u;
+  int qm_v;
   int min_qmlevel;
   int max_qmlevel;
-#endif
-#if CONFIG_NEW_QUANT
-  dequant_val_type_nuq y_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS];
-  dequant_val_type_nuq uv_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS];
-#endif
 
-  /* We allocate a MODE_INFO struct for each macroblock, together with
+  /* We allocate a MB_MODE_INFO struct for each macroblock, together with
      an extra row on top and column on the left to simplify prediction. */
   int mi_alloc_size;
-  MODE_INFO *mip; /* Base of allocated array */
-  MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
+  MB_MODE_INFO *mip; /* Base of allocated array */
+  MB_MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
 
   // TODO(agrange): Move prev_mi into encoder structure.
   // prev_mip and prev_mi will only be allocated in encoder.
-  MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
-  MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
+  MB_MODE_INFO *prev_mip; /* MB_MODE_INFO array 'mip' from last decoded frame */
+  MB_MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
 
   // Separate mi functions between encoder and decoder.
   int (*alloc_mi)(struct AV1Common *cm, int mi_size);
   void (*free_mi)(struct AV1Common *cm);
   void (*setup_mi)(struct AV1Common *cm);
 
-  // Grid of pointers to 8x8 MODE_INFO structs.  Any 8x8 not in the visible
+  // Grid of pointers to 8x8 MB_MODE_INFO structs.  Any 8x8 not in the visible
   // area will be NULL.
-  MODE_INFO **mi_grid_base;
-  MODE_INFO **mi_grid_visible;
-  MODE_INFO **prev_mi_grid_base;
-  MODE_INFO **prev_mi_grid_visible;
-
-  // Whether to use previous frame's motion vectors for prediction.
-  int use_prev_frame_mvs;
+  MB_MODE_INFO **mi_grid_base;
+  MB_MODE_INFO **mi_grid_visible;
+  MB_MODE_INFO **prev_mi_grid_base;
+  MB_MODE_INFO **prev_mi_grid_visible;
 
-  // Persistent mb segment id map used in prediction.
-  int seg_map_idx;
-  int prev_seg_map_idx;
+  // Whether to use previous frames' motion vectors for prediction.
+  int allow_ref_frame_mvs;
 
-  uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS];
   uint8_t *last_frame_seg_map;
   uint8_t *current_frame_seg_map;
   int seg_map_alloc_size;
 
   InterpFilter interp_filter;
 
+  int switchable_motion_mode;
+
   loop_filter_info_n lf_info;
-#if CONFIG_FRAME_SUPERRES
   // The denominator of the superres scale; the numerator is fixed.
   uint8_t superres_scale_denominator;
   int superres_upscaled_width;
   int superres_upscaled_height;
-#endif  // CONFIG_FRAME_SUPERRES
-#if CONFIG_LOOP_RESTORATION
   RestorationInfo rst_info[MAX_MB_PLANE];
-  RestorationInternal rst_internal;
-#endif  // CONFIG_LOOP_RESTORATION
+
+  // rst_end_stripe[i] is one more than the index of the bottom stripe
+  // for tile row i.
+  int rst_end_stripe[MAX_TILE_ROWS];
+
+  // Pointer to a scratch buffer used by self-guided restoration
+  int32_t *rst_tmpbuf;
+  RestorationLineBuffers *rlbs;
+
+  // Output of loop restoration
+  YV12_BUFFER_CONFIG rst_frame;
 
   // Flag signaling how frame contexts should be updated at the end of
   // a frame decode
   REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
 
-  int ref_frame_sign_bias[TOTAL_REFS_PER_FRAME]; /* Two state 0, 1 */
+  int ref_frame_sign_bias[REF_FRAMES]; /* Two state 0, 1 */
 
   struct loopfilter lf;
   struct segmentation seg;
-  int all_lossless;
-  int frame_parallel_decode;  // frame-based threading.
+  int coded_lossless;  // frame is fully lossless at the coded resolution.
+  int all_lossless;    // frame is fully lossless at the upscaled resolution.
 
-#if CONFIG_EXT_TX
   int reduced_tx_set_used;
-#endif  // CONFIG_EXT_TX
 
-// Context probabilities for reference frame prediction
-#if CONFIG_EXT_REFS
+  // Context probabilities for reference frame prediction
   MV_REFERENCE_FRAME comp_fwd_ref[FWD_REFS];
   MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS];
-#else
-  MV_REFERENCE_FRAME comp_fixed_ref;
-  MV_REFERENCE_FRAME comp_var_ref[COMP_REFS];
-#endif  // CONFIG_EXT_REFS
   REFERENCE_MODE reference_mode;
 
   FRAME_CONTEXT *fc;              /* this frame entropy */
   FRAME_CONTEXT *frame_contexts;  // FRAME_CONTEXTS
-  FRAME_CONTEXT *pre_fc;          // Context referenced in this frame
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
   unsigned int frame_context_idx; /* Context to use/update */
-#endif
-  FRAME_COUNTS counts;
+  int fb_of_context_type[REF_FRAMES];
+  int primary_ref_frame;
 
-#if CONFIG_FRAME_MARKER
   unsigned int frame_offset;
-#endif
 
   unsigned int current_video_frame;
   BITSTREAM_PROFILE profile;
@@ -423,44 +454,27 @@ typedef struct AV1Common {
   aom_bit_depth_t dequant_bit_depth;  // bit_depth of current dequantizer
 
   int error_resilient_mode;
+  int force_primary_ref_none;
 
   int tile_cols, tile_rows;
   int last_tile_cols, last_tile_rows;
 
-#if CONFIG_MAX_TILE
+  int max_tile_width_sb;
   int min_log2_tile_cols;
   int max_log2_tile_cols;
   int max_log2_tile_rows;
   int min_log2_tile_rows;
   int min_log2_tiles;
-  int max_tile_width_sb;
   int max_tile_height_sb;
   int uniform_tile_spacing_flag;
   int log2_tile_cols;                        // only valid for uniform tiles
   int log2_tile_rows;                        // only valid for uniform tiles
   int tile_col_start_sb[MAX_TILE_COLS + 1];  // valid for 0 <= i <= tile_cols
   int tile_row_start_sb[MAX_TILE_ROWS + 1];  // valid for 0 <= i <= tile_rows
-#if CONFIG_DEPENDENT_HORZTILES
-  int tile_row_independent[MAX_TILE_ROWS];  // valid for 0 <= i <  tile_rows
-#endif
-#else
-  int log2_tile_cols, log2_tile_rows;  // Used in non-large_scale_tile_coding.
-  int tile_width, tile_height;         // In MI units
-#endif  // CONFIG_MAX_TILE
+  int tile_width, tile_height;               // In MI units
 
-#if CONFIG_EXT_TILE
   unsigned int large_scale_tile;
   unsigned int single_tile_decoding;
-#endif  // CONFIG_EXT_TILE
-
-#if CONFIG_DEPENDENT_HORZTILES
-  int dependent_horz_tiles;
-  int tile_group_start_row[MAX_TILE_ROWS][MAX_TILE_COLS];
-  int tile_group_start_col[MAX_TILE_ROWS][MAX_TILE_COLS];
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-  int loop_filter_across_tiles_enabled;
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
 
   int byte_alignment;
   int skip_loop_filter;
@@ -476,74 +490,65 @@ typedef struct AV1Common {
   // External BufferPool passed from outside.
   BufferPool *buffer_pool;
 
-  PARTITION_CONTEXT *above_seg_context;
-  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
-#if CONFIG_VAR_TX
-  TXFM_CONTEXT *above_txfm_context;
-  TXFM_CONTEXT *top_txfm_context[MAX_MB_PLANE];
-  TXFM_CONTEXT left_txfm_context[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
-#endif
-  int above_context_alloc_cols;
-
-  // scratch memory for intraonly/keyframe forward updates from default tables
-  // - this is intentionally not placed in FRAME_CONTEXT since it's reset upon
-  // each keyframe and not used afterwards
-  aom_prob kf_y_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1];
-#if CONFIG_GLOBAL_MOTION
-  WarpedMotionParams global_motion[TOTAL_REFS_PER_FRAME];
-#endif
-
-  BLOCK_SIZE sb_size;  // Size of the superblock used for this frame
-  int mib_size;        // Size of the superblock in units of MI blocks
-  int mib_size_log2;   // Log 2 of above.
-#if CONFIG_CDEF
+  PARTITION_CONTEXT **above_seg_context;
+  ENTROPY_CONTEXT **above_context[MAX_MB_PLANE];
+  TXFM_CONTEXT **above_txfm_context;
+  WarpedMotionParams global_motion[REF_FRAMES];
+  aom_film_grain_table_t *film_grain_table;
+  int film_grain_params_present;
+  aom_film_grain_t film_grain_params;
   int cdef_pri_damping;
   int cdef_sec_damping;
   int nb_cdef_strengths;
   int cdef_strengths[CDEF_MAX_STRENGTHS];
   int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
   int cdef_bits;
-#endif
 
   int delta_q_present_flag;
   // Resolution of delta quant
   int delta_q_res;
-#if CONFIG_EXT_DELTA_Q
   int delta_lf_present_flag;
   // Resolution of delta lf level
   int delta_lf_res;
-#if CONFIG_LOOPFILTER_LEVEL
   // This is a flag for number of deltas of loop filter level
   // 0: use 1 delta, for y_vertical, y_horizontal, u, and v
   // 1: use separate deltas for each filter level
   int delta_lf_multi;
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif
   int num_tg;
-#if CONFIG_REFERENCE_BUFFER
   SequenceHeader seq_params;
   int current_frame_id;
   int ref_frame_id[REF_FRAMES];
   int valid_for_referencing[REF_FRAMES];
-  int refresh_mask;
-  int invalid_delta_frame_id_minus1;
-#endif  // CONFIG_REFERENCE_BUFFER
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-  int ans_window_size_log2;
-#endif
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  NCOBMC_KERNELS ncobmc_kernels[ADAPT_OVERLAP_BLOCKS][ALL_NCOBMC_MODES];
-  uint8_t *ncobmcaw_buf[4];
-#endif
-#if CONFIG_LV_MAP
+  int invalid_delta_frame_id_minus_1;
   LV_MAP_CTX_TABLE coeff_ctx_table;
+  TPL_MV_REF *tpl_mvs;
+  int tpl_mvs_mem_size;
+  // TODO(jingning): This can be combined with sign_bias later.
+  int8_t ref_frame_side[REF_FRAMES];
+
+  int is_annexb;
+
+  int frame_refs_short_signaling;
+  int temporal_layer_id;
+  int spatial_layer_id;
+  unsigned int number_temporal_layers;
+  unsigned int number_spatial_layers;
+  int num_allocated_above_context_mi_col;
+  int num_allocated_above_contexts;
+  int num_allocated_above_context_planes;
+
+#if TXCOEFF_TIMER
+  int64_t cum_txcoeff_timer;
+  int64_t txcoeff_timer;
+  int txb_count;
 #endif
-#if CONFIG_LPF_SB
-  int final_lpf_encode;
-#endif
-#if CONFIG_ADAPT_SCAN
-  int use_adapt_scan;
+
+#if TXCOEFF_COST_TIMER
+  int64_t cum_txcoeff_cost_timer;
+  int64_t txcoeff_cost_timer;
+  int64_t txcoeff_cost_count;
 #endif
+  const cfg_options_t *options;
 } AV1_COMMON;
 
 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
@@ -585,6 +590,17 @@ static INLINE int get_free_fb(AV1_COMMON *cm) {
     if (frame_bufs[i].ref_count == 0) break;
 
   if (i != FRAME_BUFFERS) {
+    if (frame_bufs[i].buf.use_external_refernce_buffers) {
+      // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the
+      // external reference buffers. Restore the buffer pointers to point to the
+      // internally allocated memory.
+      YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf;
+      ybf->y_buffer = ybf->store_buf_adr[0];
+      ybf->u_buffer = ybf->store_buf_adr[1];
+      ybf->v_buffer = ybf->store_buf_adr[2];
+      ybf->use_external_refernce_buffers = 0;
+    }
+
     frame_bufs[i].ref_count = 1;
   } else {
     // Reset i to be INVALID_IDX to indicate no free buffer found.
@@ -606,270 +622,236 @@ static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
   bufs[new_idx].ref_count++;
 }
 
-#if CONFIG_TEMPMV_SIGNALING
-// Returns 1 if this frame might use mvs from some previous frame. This
-// function doesn't consider whether prev_frame is actually suitable (see
-// frame_can_use_prev_frame_mvs for that)
-static INLINE int frame_might_use_prev_frame_mvs(const AV1_COMMON *cm) {
-  return !cm->error_resilient_mode && !cm->intra_only;
+static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
+  return cm->frame_type == KEY_FRAME || cm->intra_only;
+}
+
+static INLINE int frame_is_sframe(const AV1_COMMON *cm) {
+  return cm->frame_type == S_FRAME;
 }
 
-// Returns 1 if this frame really can use MVs from some previous frame.
-static INLINE int frame_can_use_prev_frame_mvs(const AV1_COMMON *cm) {
-  return (frame_might_use_prev_frame_mvs(cm) && cm->last_show_frame &&
-          cm->prev_frame && !cm->prev_frame->intra_only &&
-          cm->width == cm->prev_frame->width &&
-          cm->height == cm->prev_frame->height);
+static INLINE RefCntBuffer *get_prev_frame(const AV1_COMMON *const cm) {
+  if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
+      cm->frame_refs[cm->primary_ref_frame].idx == INVALID_IDX) {
+    return NULL;
+  } else {
+    return &cm->buffer_pool
+                ->frame_bufs[cm->frame_refs[cm->primary_ref_frame].idx];
+  }
+}
+
+// Returns 1 if this frame might allow mvs from some reference frame.
+static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) {
+  return !cm->error_resilient_mode && cm->seq_params.enable_ref_frame_mvs &&
+         cm->seq_params.enable_order_hint && !frame_is_intra_only(cm);
+}
+
+// Returns 1 if this frame might use warped_motion
+static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) {
+  return !cm->error_resilient_mode && !frame_is_intra_only(cm) &&
+         cm->seq_params.enable_warped_motion;
 }
-#endif
 
 static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
-  if (buf->mvs == NULL || buf->mi_rows < cm->mi_rows ||
-      buf->mi_cols < cm->mi_cols) {
+  const int buf_rows = buf->mi_rows;
+  const int buf_cols = buf->mi_cols;
+
+  if (buf->mvs == NULL || buf_rows != cm->mi_rows || buf_cols != cm->mi_cols) {
     aom_free(buf->mvs);
     buf->mi_rows = cm->mi_rows;
     buf->mi_cols = cm->mi_cols;
-#if CONFIG_TMV
     CHECK_MEM_ERROR(cm, buf->mvs,
                     (MV_REF *)aom_calloc(
                         ((cm->mi_rows + 1) >> 1) * ((cm->mi_cols + 1) >> 1),
                         sizeof(*buf->mvs)));
-#else
-    CHECK_MEM_ERROR(
-        cm, buf->mvs,
-        (MV_REF *)aom_calloc(cm->mi_rows * cm->mi_cols, sizeof(*buf->mvs)));
-#endif  // CONFIG_TMV
-
-#if CONFIG_MFMV
-    aom_free(buf->tpl_mvs);
-    CHECK_MEM_ERROR(
-        cm, buf->tpl_mvs,
-        (TPL_MV_REF *)aom_calloc((cm->mi_rows + MAX_MIB_SIZE) * cm->mi_stride,
-                                 sizeof(*buf->tpl_mvs)));
-#endif
+    aom_free(buf->seg_map);
+    CHECK_MEM_ERROR(cm, buf->seg_map,
+                    (uint8_t *)aom_calloc(cm->mi_rows * cm->mi_cols,
+                                          sizeof(*buf->seg_map)));
   }
-}
 
-#if CONFIG_VAR_REFS
-#define LAST_IS_VALID(cm) ((cm)->frame_refs[LAST_FRAME - 1].is_valid)
-#define LAST2_IS_VALID(cm) ((cm)->frame_refs[LAST2_FRAME - 1].is_valid)
-#define LAST3_IS_VALID(cm) ((cm)->frame_refs[LAST3_FRAME - 1].is_valid)
-#define GOLDEN_IS_VALID(cm) ((cm)->frame_refs[GOLDEN_FRAME - 1].is_valid)
-#define BWDREF_IS_VALID(cm) ((cm)->frame_refs[BWDREF_FRAME - 1].is_valid)
-#define ALTREF2_IS_VALID(cm) ((cm)->frame_refs[ALTREF2_FRAME - 1].is_valid)
-#define ALTREF_IS_VALID(cm) ((cm)->frame_refs[ALTREF_FRAME - 1].is_valid)
-
-#define L_OR_L2(cm) (LAST_IS_VALID(cm) || LAST2_IS_VALID(cm))
-#define L_AND_L2(cm) (LAST_IS_VALID(cm) && LAST2_IS_VALID(cm))
-#define L_AND_L3(cm) (LAST_IS_VALID(cm) && LAST3_IS_VALID(cm))
-#define L_AND_G(cm) (LAST_IS_VALID(cm) && GOLDEN_IS_VALID(cm))
-
-#define L3_OR_G(cm) (LAST3_IS_VALID(cm) || GOLDEN_IS_VALID(cm))
-#define L3_AND_G(cm) (LAST3_IS_VALID(cm) && GOLDEN_IS_VALID(cm))
-
-#define BWD_OR_ALT2(cm) (BWDREF_IS_VALID(cm) || ALTREF2_IS_VALID(cm))
-#define BWD_AND_ALT2(cm) (BWDREF_IS_VALID(cm) && ALTREF2_IS_VALID(cm))
-#define BWD_OR_ALT(cm) (BWDREF_IS_VALID(cm) || ALTREF_IS_VALID(cm))
-#define BWD_AND_ALT(cm) (BWDREF_IS_VALID(cm) && ALTREF_IS_VALID(cm))
-#endif  // CONFIG_VAR_REFS
+  const int mem_size =
+      ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
+  int realloc = cm->tpl_mvs == NULL;
+  if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size;
+
+  if (realloc) {
+    aom_free(cm->tpl_mvs);
+    CHECK_MEM_ERROR(cm, cm->tpl_mvs,
+                    (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs)));
+    cm->tpl_mvs_mem_size = mem_size;
+  }
+}
 
 static INLINE int mi_cols_aligned_to_sb(const AV1_COMMON *cm) {
-  return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->mib_size_log2);
+  return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
 }
 
 static INLINE int mi_rows_aligned_to_sb(const AV1_COMMON *cm) {
-  return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->mib_size_log2);
+  return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
 }
 
-static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
-  return cm->frame_type == KEY_FRAME || cm->intra_only;
+void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm);
+
+static INLINE int av1_num_planes(const AV1_COMMON *cm) {
+  return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE;
 }
 
-#if CONFIG_CFL
-#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-static INLINE void cfl_clear_sub8x8_val(CFL_CTX *cfl) {
-  memset(cfl->sub8x8_val, 0, sizeof(cfl->sub8x8_val));
+static INLINE void av1_init_above_context(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                          const int tile_row) {
+  const int num_planes = av1_num_planes(cm);
+  for (int i = 0; i < num_planes; ++i) {
+    xd->above_context[i] = cm->above_context[i][tile_row];
+  }
+  xd->above_seg_context = cm->above_seg_context[tile_row];
+  xd->above_txfm_context = cm->above_txfm_context[tile_row];
 }
-#endif  // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm);
-#endif  // CONFIG_CFL
 
 static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
-#if CONFIG_PVQ
-                                        tran_low_t *pvq_ref_coeff,
-#endif
-#if CONFIG_CFL
-                                        CFL_CTX *cfl,
-#endif
                                         tran_low_t *dqcoeff) {
-  for (int i = 0; i < MAX_MB_PLANE; ++i) {
+  const int num_planes = av1_num_planes(cm);
+  for (int i = 0; i < num_planes; ++i) {
     xd->plane[i].dqcoeff = dqcoeff;
-#if CONFIG_PVQ
-    xd->plane[i].pvq_ref_coeff = pvq_ref_coeff;
-#endif
-    xd->above_context[i] = cm->above_context[i];
+
     if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
-      memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant));
-#if CONFIG_AOM_QM
+      memcpy(xd->plane[i].seg_dequant_QTX, cm->y_dequant_QTX,
+             sizeof(cm->y_dequant_QTX));
       memcpy(xd->plane[i].seg_iqmatrix, cm->y_iqmatrix, sizeof(cm->y_iqmatrix));
-#endif
 
-#if CONFIG_NEW_QUANT
-      memcpy(xd->plane[i].seg_dequant_nuq, cm->y_dequant_nuq,
-             sizeof(cm->y_dequant_nuq));
-#endif
     } else {
-      memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant));
-#if CONFIG_AOM_QM
-      memcpy(xd->plane[i].seg_iqmatrix, cm->uv_iqmatrix,
-             sizeof(cm->uv_iqmatrix));
-#endif
-#if CONFIG_NEW_QUANT
-      memcpy(xd->plane[i].seg_dequant_nuq, cm->uv_dequant_nuq,
-             sizeof(cm->uv_dequant_nuq));
-#endif
+      if (i == AOM_PLANE_U) {
+        memcpy(xd->plane[i].seg_dequant_QTX, cm->u_dequant_QTX,
+               sizeof(cm->u_dequant_QTX));
+        memcpy(xd->plane[i].seg_iqmatrix, cm->u_iqmatrix,
+               sizeof(cm->u_iqmatrix));
+      } else {
+        memcpy(xd->plane[i].seg_dequant_QTX, cm->v_dequant_QTX,
+               sizeof(cm->v_dequant_QTX));
+        memcpy(xd->plane[i].seg_iqmatrix, cm->v_iqmatrix,
+               sizeof(cm->v_iqmatrix));
+      }
     }
   }
-  xd->fc = cm->fc;
-  xd->above_seg_context = cm->above_seg_context;
-#if CONFIG_VAR_TX
-  xd->above_txfm_context = cm->above_txfm_context;
-#endif
-#if CONFIG_CFL
-  cfl_init(cfl, cm);
-  xd->cfl = cfl;
-#endif
   xd->mi_stride = cm->mi_stride;
   xd->error_info = &cm->error;
+  cfl_init(&xd->cfl, cm);
 }
 
-static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
+static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    const int num_planes) {
   int i;
   int row_offset = mi_row;
   int col_offset = mi_col;
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
+  for (i = 0; i < num_planes; ++i) {
     struct macroblockd_plane *const pd = &xd->plane[i];
-#if CONFIG_CHROMA_SUB8X8
     // Offset the buffer pointer
-    const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
     if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
       row_offset = mi_row - 1;
     if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
       col_offset = mi_col - 1;
-#endif
-    int above_idx = col_offset << (MI_SIZE_LOG2 - tx_size_wide_log2[0]);
-    int left_idx = (row_offset & MAX_MIB_MASK)
-                   << (MI_SIZE_LOG2 - tx_size_high_log2[0]);
+    int above_idx = col_offset;
+    int left_idx = row_offset & MAX_MIB_MASK;
     pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x];
     pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y];
   }
 }
 
 static INLINE int calc_mi_size(int len) {
-  // len is in mi units.
-  return len + MAX_MIB_SIZE;
+  // len is in mi units. Align to a multiple of SBs.
+  return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2);
 }
 
-static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh) {
+static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh,
+                                const int num_planes) {
   int i;
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x;
-    xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y;
-
+  for (i = 0; i < num_planes; i++) {
     xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x;
     xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y;
 
-#if !CONFIG_CHROMA_2X2
     xd->plane[i].width = AOMMAX(xd->plane[i].width, 4);
     xd->plane[i].height = AOMMAX(xd->plane[i].height, 4);
-#endif
   }
 }
 
 static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
                                   int mi_row, int bh, int mi_col, int bw,
-#if CONFIG_DEPENDENT_HORZTILES
-                                  int dependent_horz_tile_flag,
-#endif  // CONFIG_DEPENDENT_HORZTILES
                                   int mi_rows, int mi_cols) {
   xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
   xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
   xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
   xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
 
-#if CONFIG_DEPENDENT_HORZTILES
-  if (dependent_horz_tile_flag) {
-    xd->up_available = (mi_row > tile->mi_row_start) || !tile->tg_horz_boundary;
-  } else {
-#endif  // CONFIG_DEPENDENT_HORZTILES
-    // Are edges available for intra prediction?
-    xd->up_available = (mi_row > tile->mi_row_start);
-#if CONFIG_DEPENDENT_HORZTILES
-  }
-#endif  // CONFIG_DEPENDENT_HORZTILES
+  // Are edges available for intra prediction?
+  xd->up_available = (mi_row > tile->mi_row_start);
+
+  const int ss_x = xd->plane[1].subsampling_x;
+  const int ss_y = xd->plane[1].subsampling_y;
 
   xd->left_available = (mi_col > tile->mi_col_start);
-#if CONFIG_CHROMA_SUB8X8
   xd->chroma_up_available = xd->up_available;
   xd->chroma_left_available = xd->left_available;
-  if (xd->plane[1].subsampling_x && bw < mi_size_wide[BLOCK_8X8])
+  if (ss_x && bw < mi_size_wide[BLOCK_8X8])
     xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
-  if (xd->plane[1].subsampling_y && bh < mi_size_high[BLOCK_8X8])
+  if (ss_y && bh < mi_size_high[BLOCK_8X8])
     xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
-#endif
   if (xd->up_available) {
-    xd->above_mi = xd->mi[-xd->mi_stride];
-    // above_mi may be NULL in encoder's first pass.
-    xd->above_mbmi = xd->above_mi ? &xd->above_mi->mbmi : NULL;
+    xd->above_mbmi = xd->mi[-xd->mi_stride];
   } else {
-    xd->above_mi = NULL;
     xd->above_mbmi = NULL;
   }
 
   if (xd->left_available) {
-    xd->left_mi = xd->mi[-1];
-    // left_mi may be NULL in encoder's first pass.
-    xd->left_mbmi = xd->left_mi ? &xd->left_mi->mbmi : NULL;
+    xd->left_mbmi = xd->mi[-1];
   } else {
-    xd->left_mi = NULL;
     xd->left_mbmi = NULL;
   }
 
+  const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
+                         ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
+  if (chroma_ref) {
+    // To help calculate the "above" and "left" chroma blocks, note that the
+    // current block may cover multiple luma blocks (eg, if partitioned into
+    // 4x4 luma blocks).
+    // First, find the top-left-most luma block covered by this chroma block
+    MB_MODE_INFO **base_mi =
+        &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)];
+
+    // Then, we consider the luma region covered by the left or above 4x4 chroma
+    // prediction. We want to point to the chroma reference block in that
+    // region, which is the bottom-right-most mi unit.
+    // This leads to the following offsets:
+    MB_MODE_INFO *chroma_above_mi =
+        xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
+    xd->chroma_above_mbmi = chroma_above_mi;
+
+    MB_MODE_INFO *chroma_left_mi =
+        xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
+    xd->chroma_left_mbmi = chroma_left_mi;
+  }
+
   xd->n8_h = bh;
   xd->n8_w = bw;
   xd->is_sec_rect = 0;
-  if (xd->n8_w < xd->n8_h)
-    if (mi_col & (xd->n8_h - 1)) xd->is_sec_rect = 1;
+  if (xd->n8_w < xd->n8_h) {
+    // Only mark is_sec_rect as 1 for the last block.
+    // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
+    // For other partitions, it would be (0, 1).
+    if (!((mi_col + xd->n8_w) & (xd->n8_h - 1))) xd->is_sec_rect = 1;
+  }
 
   if (xd->n8_w > xd->n8_h)
     if (mi_row & (xd->n8_w - 1)) xd->is_sec_rect = 1;
 }
 
-static INLINE const aom_prob *get_y_mode_probs(const AV1_COMMON *cm,
-                                               const MODE_INFO *mi,
-                                               const MODE_INFO *above_mi,
-                                               const MODE_INFO *left_mi,
-                                               int block) {
-  const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, block);
-  const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, block);
-  return cm->kf_y_prob[above][left];
-}
-
 static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
-                                           const MODE_INFO *mi,
-                                           const MODE_INFO *above_mi,
-                                           const MODE_INFO *left_mi,
-                                           int block) {
-  const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, block);
-  const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, block);
-
-#if CONFIG_KF_CTX
-  int above_ctx = intra_mode_context[above];
-  int left_ctx = intra_mode_context[left];
+                                           const MB_MODE_INFO *above_mi,
+                                           const MB_MODE_INFO *left_mi) {
+  const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+  const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+  const int above_ctx = intra_mode_context[above];
+  const int left_ctx = intra_mode_context[left];
   return tile_ctx->kf_y_cdf[above_ctx][left_ctx];
-#else
-  return tile_ctx->kf_y_cdf[above][left];
-#endif
 }
 
 static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
@@ -879,130 +861,117 @@ static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
   PARTITION_CONTEXT *const left_ctx =
       xd->left_seg_context + (mi_row & MAX_MIB_MASK);
 
-#if CONFIG_EXT_PARTITION_TYPES
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
   memset(above_ctx, partition_context_lookup[subsize].above, bw);
   memset(left_ctx, partition_context_lookup[subsize].left, bh);
-#else
-  // num_4x4_blocks_wide_lookup[bsize] / 2
-  const int bs = mi_size_wide[bsize];
-
-  // update the partition context at the end notes. set partition bits
-  // of block sizes larger than the current one to be one, and partition
-  // bits of smaller block sizes to be zero.
-  memset(above_ctx, partition_context_lookup[subsize].above, bs);
-  memset(left_ctx, partition_context_lookup[subsize].left, bs);
-#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
-#if CONFIG_CB4X4
 static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int subsampling_x, int subsampling_y) {
-#if CONFIG_CHROMA_2X2
-  return 1;
-#endif
-
-#if CONFIG_CHROMA_SUB8X8
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
-
   int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) &&
                 ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x);
-
-  return ref_pos;
-#else
-  int ref_pos = !(((mi_row & 0x01) && subsampling_y) ||
-                  ((mi_col & 0x01) && subsampling_x));
-
-  if (bsize >= BLOCK_8X8) ref_pos = 1;
-
   return ref_pos;
-#endif
-}
-
-#if CONFIG_SUPERTX
-static INLINE int need_handle_chroma_sub8x8(BLOCK_SIZE bsize, int subsampling_x,
-                                            int subsampling_y) {
-  const int bw = mi_size_wide[bsize];
-  const int bh = mi_size_high[bsize];
-
-  if (bsize >= BLOCK_8X8 ||
-      ((!(bh & 0x01) || !subsampling_y) && (!(bw & 0x01) || !subsampling_x)))
-    return 0;
-  else
-    return 1;
 }
-#endif
 
 static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
                                             int subsampling_y) {
   BLOCK_SIZE bs = bsize;
-
-  if (bs < BLOCK_8X8) {
-    if (subsampling_x == 1 && subsampling_y == 1)
-      bs = BLOCK_8X8;
-    else if (subsampling_x == 1)
-      bs = BLOCK_8X4;
-    else if (subsampling_y == 1)
-      bs = BLOCK_4X8;
+  switch (bsize) {
+    case BLOCK_4X4:
+      if (subsampling_x == 1 && subsampling_y == 1)
+        bs = BLOCK_8X8;
+      else if (subsampling_x == 1)
+        bs = BLOCK_8X4;
+      else if (subsampling_y == 1)
+        bs = BLOCK_4X8;
+      break;
+    case BLOCK_4X8:
+      if (subsampling_x == 1 && subsampling_y == 1)
+        bs = BLOCK_8X8;
+      else if (subsampling_x == 1)
+        bs = BLOCK_8X8;
+      else if (subsampling_y == 1)
+        bs = BLOCK_4X8;
+      break;
+    case BLOCK_8X4:
+      if (subsampling_x == 1 && subsampling_y == 1)
+        bs = BLOCK_8X8;
+      else if (subsampling_x == 1)
+        bs = BLOCK_8X4;
+      else if (subsampling_y == 1)
+        bs = BLOCK_8X8;
+      break;
+    case BLOCK_4X16:
+      if (subsampling_x == 1 && subsampling_y == 1)
+        bs = BLOCK_8X16;
+      else if (subsampling_x == 1)
+        bs = BLOCK_8X16;
+      else if (subsampling_y == 1)
+        bs = BLOCK_4X16;
+      break;
+    case BLOCK_16X4:
+      if (subsampling_x == 1 && subsampling_y == 1)
+        bs = BLOCK_16X8;
+      else if (subsampling_x == 1)
+        bs = BLOCK_16X4;
+      else if (subsampling_y == 1)
+        bs = BLOCK_16X8;
+      break;
+    default: break;
   }
-
   return bs;
 }
-#endif
 
 static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf,
                                             size_t element) {
   assert(cdf != NULL);
-#if !CONFIG_ANS
   return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element];
-#else
-  return cdf[element] - (element > 0 ? cdf[element - 1] : 0);
-#endif
 }
 
 static INLINE void partition_gather_horz_alike(aom_cdf_prob *out,
-                                               const aom_cdf_prob *const in) {
+                                               const aom_cdf_prob *const in,
+                                               BLOCK_SIZE bsize) {
+  (void)bsize;
   out[0] = CDF_PROB_TOP;
   out[0] -= cdf_element_prob(in, PARTITION_HORZ);
   out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
-#if CONFIG_EXT_PARTITION_TYPES
   out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
   out[0] -= cdf_element_prob(in, PARTITION_HORZ_B);
   out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
-#endif
+  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4);
   out[0] = AOM_ICDF(out[0]);
   out[1] = AOM_ICDF(CDF_PROB_TOP);
 }
 
 static INLINE void partition_gather_vert_alike(aom_cdf_prob *out,
-                                               const aom_cdf_prob *const in) {
+                                               const aom_cdf_prob *const in,
+                                               BLOCK_SIZE bsize) {
+  (void)bsize;
   out[0] = CDF_PROB_TOP;
   out[0] -= cdf_element_prob(in, PARTITION_VERT);
   out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
-#if CONFIG_EXT_PARTITION_TYPES
   out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
   out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
   out[0] -= cdf_element_prob(in, PARTITION_VERT_B);
-#endif
+  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4);
   out[0] = AOM_ICDF(out[0]);
   out[1] = AOM_ICDF(CDF_PROB_TOP);
 }
 
-#if CONFIG_EXT_PARTITION_TYPES
 static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
                                                 int mi_col, BLOCK_SIZE subsize,
                                                 BLOCK_SIZE bsize,
                                                 PARTITION_TYPE partition) {
   if (bsize >= BLOCK_8X8) {
-#if !CONFIG_EXT_PARTITION_TYPES_AB
     const int hbs = mi_size_wide[bsize] / 2;
-    BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
-#endif
+    BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
     switch (partition) {
       case PARTITION_SPLIT:
         if (bsize != BLOCK_8X8) break;
+        AOM_FALLTHROUGH_INTENDED;
       case PARTITION_NONE:
       case PARTITION_HORZ:
       case PARTITION_VERT:
@@ -1010,30 +979,6 @@ static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
       case PARTITION_VERT_4:
         update_partition_context(xd, mi_row, mi_col, subsize, bsize);
         break;
-#if CONFIG_EXT_PARTITION_TYPES_AB
-      case PARTITION_HORZ_A:
-        update_partition_context(xd, mi_row, mi_col,
-                                 get_subsize(bsize, PARTITION_HORZ_4), subsize);
-        update_partition_context(xd, mi_row + mi_size_high[bsize] / 2, mi_col,
-                                 subsize, subsize);
-        break;
-      case PARTITION_HORZ_B:
-        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
-        update_partition_context(xd, mi_row + mi_size_high[bsize] / 2, mi_col,
-                                 get_subsize(bsize, PARTITION_HORZ_4), subsize);
-        break;
-      case PARTITION_VERT_A:
-        update_partition_context(xd, mi_row, mi_col,
-                                 get_subsize(bsize, PARTITION_VERT_4), subsize);
-        update_partition_context(xd, mi_row, mi_col + mi_size_wide[bsize] / 2,
-                                 subsize, subsize);
-        break;
-      case PARTITION_VERT_B:
-        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
-        update_partition_context(xd, mi_row, mi_col + mi_size_wide[bsize] / 2,
-                                 get_subsize(bsize, PARTITION_VERT_4), subsize);
-        break;
-#else
       case PARTITION_HORZ_A:
         update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
         update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
@@ -1050,41 +995,35 @@ static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
         update_partition_context(xd, mi_row, mi_col, subsize, subsize);
         update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
         break;
-#endif
       default: assert(0 && "Invalid partition type");
     }
   }
 }
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row,
-                                          int mi_col,
-#if CONFIG_UNPOISON_PARTITION_CTX
-                                          int has_rows, int has_cols,
-#endif
-                                          BLOCK_SIZE bsize) {
+                                          int mi_col, BLOCK_SIZE bsize) {
   const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
   const PARTITION_CONTEXT *left_ctx =
       xd->left_seg_context + (mi_row & MAX_MIB_MASK);
   // Minimum partition point is 8x8. Offset the bsl accordingly.
-  const int bsl = mi_width_log2_lookup[bsize] - mi_width_log2_lookup[BLOCK_8X8];
+  const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
   int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
 
-  assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
+  assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]);
   assert(bsl >= 0);
 
-#if CONFIG_UNPOISON_PARTITION_CTX
-  if (has_rows && has_cols)
-    return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
-  else if (has_rows && !has_cols)
-    return PARTITION_CONTEXTS_PRIMARY + bsl;
-  else if (!has_rows && has_cols)
-    return PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES + bsl;
-  else
-    return INVALID_PARTITION_CTX;  // Bogus context, forced SPLIT
-#else
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
-#endif
+}
+
+// Return the number of elements in the partition CDF when
+// partitioning the (square) block with luma block size of bsize.
+static INLINE int partition_cdf_length(BLOCK_SIZE bsize) {
+  if (bsize <= BLOCK_8X8)
+    return PARTITION_TYPES;
+  else if (bsize == BLOCK_128X128)
+    return EXT_PARTITION_TYPES - 2;
+  else
+    return EXT_PARTITION_TYPES;
 }
 
 static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
@@ -1107,11 +1046,10 @@ static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
   if (xd->mb_to_bottom_edge < 0)
     max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
 
-  // Scale the width in the transform block unit.
-  return max_blocks_high >> tx_size_wide_log2[0];
+  // Scale the height in the transform block unit.
+  return max_blocks_high >> tx_size_high_log2[0];
 }
 
-#if CONFIG_CFL
 static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
                                         BLOCK_SIZE plane_bsize, int plane,
                                         TX_SIZE tx_size) {
@@ -1127,36 +1065,43 @@ static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
                               << tx_size_high_log2[0];
   return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
 }
-#endif  // CONFIG_CFL
 
 static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
-                                          int mi_col_start, int mi_col_end) {
+  int mi_col_start, int mi_col_end, const int tile_row) {
+  const int num_planes = av1_num_planes(cm);
   const int width = mi_col_end - mi_col_start;
-  const int aligned_width = ALIGN_POWER_OF_TWO(width, cm->mib_size_log2);
+  const int aligned_width =
+    ALIGN_POWER_OF_TWO(width, cm->seq_params.mib_size_log2);
 
-  const int offset_y = mi_col_start << (MI_SIZE_LOG2 - tx_size_wide_log2[0]);
-  const int width_y = aligned_width << (MI_SIZE_LOG2 - tx_size_wide_log2[0]);
+  const int offset_y = mi_col_start;
+  const int width_y = aligned_width;
   const int offset_uv = offset_y >> cm->subsampling_x;
   const int width_uv = width_y >> cm->subsampling_x;
 
-  av1_zero_array(cm->above_context[0] + offset_y, width_y);
-  av1_zero_array(cm->above_context[1] + offset_uv, width_uv);
-  av1_zero_array(cm->above_context[2] + offset_uv, width_uv);
+  av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y);
+  if (num_planes > 1) {
+    if (cm->above_context[1][tile_row] && cm->above_context[2][tile_row]) {
+      av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv);
+      av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv);
+    } else {
+      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid value of planes");
+    }
+  }
 
-  av1_zero_array(cm->above_seg_context + mi_col_start, aligned_width);
+  av1_zero_array(cm->above_seg_context[tile_row] + mi_col_start, aligned_width);
 
-#if CONFIG_VAR_TX
-  av1_zero_array(cm->above_txfm_context + (mi_col_start << TX_UNIT_WIDE_LOG2),
-                 aligned_width << TX_UNIT_WIDE_LOG2);
-#endif  // CONFIG_VAR_TX
+  memset(cm->above_txfm_context[tile_row] + mi_col_start,
+    tx_size_wide[TX_SIZES_LARGEST],
+    aligned_width * sizeof(TXFM_CONTEXT));
 }
 
 static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
   av1_zero(xd->left_context);
   av1_zero(xd->left_seg_context);
-#if CONFIG_VAR_TX
-  av1_zero(xd->left_txfm_context_buffer);
-#endif
+
+  memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST],
+         sizeof(xd->left_txfm_context_buffer));
 }
 
 // Disable array-bounds checks as the TX_SIZE enum contains values larger than
@@ -1166,15 +1111,11 @@ static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
 #if defined(__GNUC__) && __GNUC__ >= 4
 #pragma GCC diagnostic ignored "-Warray-bounds"
 #endif
-static INLINE TX_SIZE get_min_tx_size(TX_SIZE tx_size) {
-  assert(tx_size < TX_SIZES_ALL);
-  return txsize_sqr_map[tx_size];
-}
+
 #if defined(__GNUC__) && __GNUC__ >= 4
 #pragma GCC diagnostic warning "-Warray-bounds"
 #endif
 
-#if CONFIG_VAR_TX
 static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
   int i;
   for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
@@ -1190,16 +1131,16 @@ static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h, int skip,
     bh = n8_h * MI_SIZE;
   }
 
-  set_txfm_ctx(xd->above_txfm_context, bw, n8_w << TX_UNIT_WIDE_LOG2);
-  set_txfm_ctx(xd->left_txfm_context, bh, n8_h << TX_UNIT_HIGH_LOG2);
+  set_txfm_ctx(xd->above_txfm_context, bw, n8_w);
+  set_txfm_ctx(xd->left_txfm_context, bh, n8_h);
 }
 
 static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
                                          TXFM_CONTEXT *left_ctx,
                                          TX_SIZE tx_size, TX_SIZE txb_size) {
   BLOCK_SIZE bsize = txsize_to_bsize[txb_size];
-  int bh = mi_size_high[bsize] << TX_UNIT_HIGH_LOG2;
-  int bw = mi_size_wide[bsize] << TX_UNIT_WIDE_LOG2;
+  int bh = mi_size_high[bsize];
+  int bw = mi_size_wide[bsize];
   uint8_t txw = tx_size_wide[tx_size];
   uint8_t txh = tx_size_high[tx_size];
   int i;
@@ -1209,16 +1150,8 @@ static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
 
 static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) {
   switch (tx_dim) {
-#if CONFIG_EXT_PARTITION
     case 128:
-#endif  // CONFIG_EXT_PARTITION
-    case 64:
-#if CONFIG_TX64X64
-      return TX_64X64;
-#else
-      return TX_32X32;
-#endif  // CONFIG_TX64X64
-      break;
+    case 64: return TX_64X64; break;
     case 32: return TX_32X32; break;
     case 16: return TX_16X16; break;
     case 8: return TX_8X8; break;
@@ -1226,6 +1159,45 @@ static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) {
   }
 }
 
+static INLINE TX_SIZE get_tx_size(int width, int height) {
+  if (width == height) {
+    return get_sqr_tx_size(width);
+  }
+  if (width < height) {
+    if (width + width == height) {
+      switch (width) {
+        case 4: return TX_4X8; break;
+        case 8: return TX_8X16; break;
+        case 16: return TX_16X32; break;
+        case 32: return TX_32X64; break;
+      }
+    } else {
+      switch (width) {
+        case 4: return TX_4X16; break;
+        case 8: return TX_8X32; break;
+        case 16: return TX_16X64; break;
+      }
+    }
+  } else {
+    if (height + height == width) {
+      switch (height) {
+        case 4: return TX_8X4; break;
+        case 8: return TX_16X8; break;
+        case 16: return TX_32X16; break;
+        case 32: return TX_64X32; break;
+      }
+    } else {
+      switch (height) {
+        case 4: return TX_16X4; break;
+        case 8: return TX_32X8; break;
+        case 16: return TX_64X16; break;
+      }
+    }
+  }
+  assert(0);
+  return TX_4X4;
+}
+
 static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
                                          TXFM_CONTEXT *left_ctx,
                                          BLOCK_SIZE bsize, TX_SIZE tx_size) {
@@ -1233,7 +1205,7 @@ static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
   const uint8_t txh = tx_size_high[tx_size];
   const int above = *above_ctx < txw;
   const int left = *left_ctx < txh;
-  int category = TXFM_PARTITION_CONTEXTS - 1;
+  int category = TXFM_PARTITION_CONTEXTS;
 
   // dummy return, not used by others.
   if (tx_size <= TX_4X4) return 0;
@@ -1242,13 +1214,13 @@ static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
       get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize]));
 
   if (max_tx_size >= TX_8X8) {
-    category = (tx_size != max_tx_size && max_tx_size > TX_8X8) +
-               (TX_SIZES - 1 - max_tx_size) * 2;
+    category =
+        (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) +
+        (TX_SIZES - 1 - max_tx_size) * 2;
   }
-  if (category == TXFM_PARTITION_CONTEXTS - 1) return category;
+  assert(category != TXFM_PARTITION_CONTEXTS);
   return category * 3 + above + left;
 }
-#endif
 
 // Compute the next partition in the direction of the sb_type stored in the mi
 // array, starting with bsize.
@@ -1258,8 +1230,8 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return PARTITION_INVALID;
 
   const int offset = mi_row * cm->mi_stride + mi_col;
-  MODE_INFO **mi = cm->mi_grid_visible + offset;
-  const BLOCK_SIZE subsize = mi[0]->mbmi.sb_type;
+  MB_MODE_INFO **mi = cm->mi_grid_visible + offset;
+  const BLOCK_SIZE subsize = mi[0]->sb_type;
 
   if (subsize == bsize) return PARTITION_NONE;
 
@@ -1268,25 +1240,14 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
   const int sshigh = mi_size_high[subsize];
   const int sswide = mi_size_wide[subsize];
 
-#if CONFIG_EXT_PARTITION_TYPES
   if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < cm->mi_rows &&
       mi_col + bhigh / 2 < cm->mi_cols) {
     // In this case, the block might be using an extended partition
     // type.
-    const MB_MODE_INFO *const mbmi_right = &mi[bwide / 2]->mbmi;
-    const MB_MODE_INFO *const mbmi_below = &mi[bhigh / 2 * cm->mi_stride]->mbmi;
+    const MB_MODE_INFO *const mbmi_right = mi[bwide / 2];
+    const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * cm->mi_stride];
 
     if (sswide == bwide) {
-#if CONFIG_EXT_PARTITION_TYPES_AB
-      // Smaller height but same width. Is PARTITION_HORZ, PARTITION_HORZ_4,
-      // PARTITION_HORZ_A or PARTITION_HORZ_B.
-      if (sshigh * 2 == bhigh)
-        return (mbmi_below->sb_type == subsize) ? PARTITION_HORZ
-                                                : PARTITION_HORZ_B;
-      assert(sshigh * 4 == bhigh);
-      return (mbmi_below->sb_type == subsize) ? PARTITION_HORZ_4
-                                              : PARTITION_HORZ_A;
-#else
       // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
       // PARTITION_HORZ_B. To distinguish the latter two, check if the lower
       // half was split.
@@ -1297,18 +1258,7 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
         return PARTITION_HORZ;
       else
         return PARTITION_HORZ_B;
-#endif
     } else if (sshigh == bhigh) {
-#if CONFIG_EXT_PARTITION_TYPES_AB
-      // Smaller width but same height. Is PARTITION_VERT, PARTITION_VERT_4,
-      // PARTITION_VERT_A or PARTITION_VERT_B.
-      if (sswide * 2 == bwide)
-        return (mbmi_right->sb_type == subsize) ? PARTITION_VERT
-                                                : PARTITION_VERT_B;
-      assert(sswide * 4 == bwide);
-      return (mbmi_right->sb_type == subsize) ? PARTITION_VERT_4
-                                              : PARTITION_VERT_A;
-#else
       // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or
       // PARTITION_VERT_B. To distinguish the latter two, check if the right
       // half was split.
@@ -1319,9 +1269,7 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
         return PARTITION_VERT;
       else
         return PARTITION_VERT_B;
-#endif
     } else {
-#if !CONFIG_EXT_PARTITION_TYPES_AB
       // Smaller width and smaller height. Might be PARTITION_SPLIT or could be
       // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both
       // dimensions, we immediately know this is a split (which will recurse to
@@ -1333,12 +1281,10 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
 
       if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A;
       if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A;
-#endif
 
       return PARTITION_SPLIT;
     }
   }
-#endif
   const int vert_split = sswide < bwide;
   const int horz_split = sshigh < bhigh;
   const int split_idx = (vert_split << 1) | horz_split;
@@ -1352,49 +1298,46 @@ static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
 }
 
 static INLINE void set_use_reference_buffer(AV1_COMMON *const cm, int use) {
-#if CONFIG_REFERENCE_BUFFER
   cm->seq_params.frame_id_numbers_present_flag = use;
-#else
-  (void)cm;
-  (void)use;
-#endif
 }
 
-static INLINE void set_sb_size(AV1_COMMON *const cm, BLOCK_SIZE sb_size) {
-  cm->sb_size = sb_size;
-  cm->mib_size = mi_size_wide[cm->sb_size];
-#if CONFIG_CB4X4
-  cm->mib_size_log2 = b_width_log2_lookup[cm->sb_size];
-#else
-  cm->mib_size_log2 = mi_width_log2_lookup[cm->sb_size];
-#endif
+static INLINE void set_sb_size(SequenceHeader *const seq_params,
+                               BLOCK_SIZE sb_size) {
+  seq_params->sb_size = sb_size;
+  seq_params->mib_size = mi_size_wide[seq_params->sb_size];
+  seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size];
 }
 
-static INLINE int all_lossless(const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  int i;
-  int all_lossless = 1;
+// Returns true if the frame is fully lossless at the coded resolution.
+// Note: If super-resolution is used, such a frame will still NOT be lossless at
+// the upscaled resolution.
+static INLINE int is_coded_lossless(const AV1_COMMON *cm,
+                                    const MACROBLOCKD *xd) {
+  int coded_lossless = 1;
   if (cm->seg.enabled) {
-    for (i = 0; i < MAX_SEGMENTS; ++i) {
+    for (int i = 0; i < MAX_SEGMENTS; ++i) {
       if (!xd->lossless[i]) {
-        all_lossless = 0;
+        coded_lossless = 0;
         break;
       }
     }
   } else {
-    all_lossless = xd->lossless[0];
+    coded_lossless = xd->lossless[0];
   }
-  return all_lossless;
+  return coded_lossless;
 }
 
-static INLINE int use_compressed_header(const AV1_COMMON *cm) {
-  (void)cm;
-#if CONFIG_RESTRICT_COMPRESSED_HDR && CONFIG_NEW_MULTISYMBOL
-  return 0;
-#elif CONFIG_RESTRICT_COMPRESSED_HDR
-  return cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD;
-#else
-  return 1;
-#endif  // CONFIG_RESTRICT_COMPRESSED_HDR && CONFIG_NEW_MULTISYMBOL
+static INLINE int is_valid_seq_level_idx(uint8_t seq_level_idx) {
+  return seq_level_idx < 24 || seq_level_idx == 31;
+}
+
+static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) {
+  assert(bl.major >= LEVEL_MAJOR_MIN && bl.major <= LEVEL_MAJOR_MAX);
+  // Since bl.minor is unsigned a comparison will return a warning:
+  // comparison is always true due to limited range of data type
+  assert(LEVEL_MINOR_MIN == 0);
+  assert(bl.minor <= LEVEL_MINOR_MAX);
+  return ((bl.major - LEVEL_MAJOR_MIN) << LEVEL_MINOR_BITS) + bl.minor;
 }
 
 #ifdef __cplusplus
diff --git a/third_party/aom/av1/common/partition.c b/third_party/aom/av1/common/partition.c
deleted file mode 100644
index 634a9edd5..000000000
--- a/third_party/aom/av1/common/partition.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include "enums.h"
-#include "odintrin.h"
-#include "partition.h"
-#include "zigzag.h"
-
-OD_EXTERN const index_pair *OD_ZIGZAG4[4] = {
-  OD_ZIGZAG4_DCT_DCT,
-  OD_ZIGZAG4_ADST_DCT,
-  OD_ZIGZAG4_DCT_ADST,
-  OD_ZIGZAG4_ADST_ADST
-};
-
-OD_EXTERN const index_pair *OD_ZIGZAG8[4] = {
-  OD_ZIGZAG8_DCT_DCT,
-  OD_ZIGZAG8_ADST_DCT,
-  OD_ZIGZAG8_DCT_ADST,
-  OD_ZIGZAG8_ADST_ADST
-};
-
-OD_EXTERN const index_pair *OD_ZIGZAG16[4] = {
-  OD_ZIGZAG16_DCT_DCT,
-  OD_ZIGZAG16_ADST_DCT,
-  OD_ZIGZAG16_DCT_ADST,
-  OD_ZIGZAG16_ADST_ADST
-};
-
-OD_EXTERN const index_pair *OD_ZIGZAG32[4] = {
-  OD_ZIGZAG32_DCT_DCT,
-  OD_ZIGZAG32_DCT_DCT,
-  OD_ZIGZAG32_DCT_DCT,
-  OD_ZIGZAG32_DCT_DCT
-};
-
-/* The tables below specify how coefficient blocks are translated to
-   and from PVQ partition coding scan order for 4x4, 8x8 and 16x16 */
-
-static const int OD_LAYOUT32_OFFSETS[4] = { 0, 128, 256, 768 };
-const band_layout OD_LAYOUT32 = {
-  OD_ZIGZAG32,
-  32,
-  3,
-  OD_LAYOUT32_OFFSETS
-};
-
-static const int OD_LAYOUT16_OFFSETS[4] = { 0, 32, 64, 192 };
-const band_layout OD_LAYOUT16 = {
-  OD_ZIGZAG16,
-  16,
-  3,
-  OD_LAYOUT16_OFFSETS
-};
-
-const int OD_LAYOUT8_OFFSETS[4] = { 0, 8, 16, 48 };
-const band_layout OD_LAYOUT8 = {
-  OD_ZIGZAG8,
-  8,
-  3,
-  OD_LAYOUT8_OFFSETS
-};
-
-static const int OD_LAYOUT4_OFFSETS[2] = { 0, 15 };
-const band_layout OD_LAYOUT4 = {
-  OD_ZIGZAG4,
-  4,
-  1,
-  OD_LAYOUT4_OFFSETS
-};
-
-/* First element is the number of bands, followed by the list all the band
-  boundaries. */
-static const int OD_BAND_OFFSETS4[] = {1, 1, 16};
-static const int OD_BAND_OFFSETS8[] = {4, 1, 16, 24, 32, 64};
-static const int OD_BAND_OFFSETS16[] = {7, 1, 16, 24, 32, 64, 96, 128, 256};
-static const int OD_BAND_OFFSETS32[] = {10, 1, 16, 24, 32, 64, 96, 128, 256,
- 384, 512, 1024};
-static const int OD_BAND_OFFSETS64[] = {13, 1, 16, 24, 32, 64, 96, 128, 256,
- 384, 512, 1024, 1536, 2048, 4096};
-
-const int *const OD_BAND_OFFSETS[OD_TXSIZES + 1] = {
-  OD_BAND_OFFSETS4,
-  OD_BAND_OFFSETS8,
-  OD_BAND_OFFSETS16,
-  OD_BAND_OFFSETS32,
-  OD_BAND_OFFSETS64
-};
-
-/** Perform a single stage of conversion from a coefficient block in
- * raster order into coding scan order
- *
- * @param [in]     layout  scan order specification
- * @param [out]    dst     destination vector
- * @param [in]     src     source coefficient block
- * @param [int]    int     source vector row stride
- */
-static void od_band_from_raster(const band_layout *layout, tran_low_t *dst,
- const tran_low_t *src, int stride, TX_TYPE tx_type) {
-  int i;
-  int len;
-  len = layout->band_offsets[layout->nb_bands];
-  for (i = 0; i < len; i++) {
-    dst[i] = src[layout->dst_table[tx_type][i][1]*stride + layout->dst_table[tx_type][i][0]];
-  }
-}
-
-/** Perform a single stage of conversion from a vector in coding scan
-    order back into a coefficient block in raster order
- *
- * @param [in]     layout  scan order specification
- * @param [out]    dst     destination coefficient block
- * @param [in]     src     source vector
- * @param [int]    stride  destination vector row stride
- */
-static void od_raster_from_band(const band_layout *layout, tran_low_t *dst,
- int stride, TX_TYPE tx_type, const tran_low_t *src) {
-  int i;
-  int len;
-  len = layout->band_offsets[layout->nb_bands];
-  for (i = 0; i < len; i++) {
-    dst[layout->dst_table[tx_type][i][1]*stride + layout->dst_table[tx_type][i][0]] = src[i];
-  }
-}
-
-static const band_layout *const OD_LAYOUTS[] = {&OD_LAYOUT4, &OD_LAYOUT8,
- &OD_LAYOUT16, &OD_LAYOUT32};
-
-/** Converts a coefficient block in raster order into a vector in
- * coding scan order with the PVQ partitions laid out one after
- * another.  This works in stages; the 4x4 conversion is applied to
- * the coefficients nearest DC, then the 8x8 applied to the 8x8 block
- * nearest DC that was not already coded by 4x4, then 16x16 following
- * the same pattern.
- *
- * @param [out]    dst        destination vector
- * @param [in]     n          block size (along one side)
- * @param [in]     ty_type    transfrom type
- * @param [in]     src        source coefficient block
- * @param [in]     stride     source vector row stride
- */
-void od_raster_to_coding_order(tran_low_t *dst, int n, TX_TYPE ty_type,
- const tran_low_t *src, int stride) {
-  int bs;
-  /* dst + 1 because DC is not included for 4x4 blocks. */
-  od_band_from_raster(OD_LAYOUTS[0], dst + 1, src, stride, ty_type);
-  for (bs = 1; bs < OD_TXSIZES; bs++) {
-    int size;
-    int offset;
-    /* Length of block size > 4. */
-    size = 1 << (OD_LOG_BSIZE0 + bs);
-    /* Offset is the size of the previous block squared. */
-    offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs);
-    if (n >= size) {
-      /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */
-      od_band_from_raster(OD_LAYOUTS[bs], dst + offset, src, stride, ty_type);
-    }
-  }
-  dst[0] = src[0];
-}
-
-/** Converts a vector in coding scan order witht he PVQ partitions
- * laid out one after another into a coefficient block in raster
- * order. This works in stages in the reverse order of raster->scan
- * order; the 16x16 conversion is applied to the coefficients that
- * don't appear in an 8x8 block, then the 8x8 applied to the 8x8 block
- * sans the 4x4 block it contains, then 4x4 is converted sans DC.
- *
- * @param [out]    dst        destination coefficient block
- * @param [in]     stride     destination vector row stride
- * @param [in]     src        source vector
- * @param [in]     n          block size (along one side)
- */
-void od_coding_order_to_raster(tran_low_t *dst, int stride, TX_TYPE ty_type,
- const tran_low_t *src, int n) {
-  int bs;
-  /* src + 1 because DC is not included for 4x4 blocks. */
-  od_raster_from_band(OD_LAYOUTS[0], dst, stride, ty_type, src + 1);
-  for (bs = 1; bs < OD_TXSIZES; bs++) {
-    int size;
-    int offset;
-    /* Length of block size > 4 */
-    size = 1 << (OD_LOG_BSIZE0 + bs);
-    /* Offset is the size of the previous block squared. */
-    offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs);
-    if (n >= size) {
-      /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */
-      od_raster_from_band(OD_LAYOUTS[bs], dst, stride, ty_type, src + offset);
-    }
-  }
-  dst[0] = src[0];
-}
-
-/** Perform a single stage of conversion from a coefficient block in
- * raster order into coding scan order
- *
- * @param [in]     layout  scan order specification
- * @param [out]    dst     destination vector
- * @param [in]     src     source coefficient block
- * @param [int]    int     source vector row stride
- */
-static void od_band_from_raster_16(const band_layout *layout, int16_t *dst,
- const int16_t *src, int stride) {
-  int i;
-  int len;
-  len = layout->band_offsets[layout->nb_bands];
-  for (i = 0; i < len; i++) {
-    dst[i] = src[layout->dst_table[DCT_DCT][i][1]*stride + layout->dst_table[DCT_DCT][i][0]];
-  }
-}
-
-/** Converts a coefficient block in raster order into a vector in
- * coding scan order with the PVQ partitions laid out one after
- * another.  This works in stages; the 4x4 conversion is applied to
- * the coefficients nearest DC, then the 8x8 applied to the 8x8 block
- * nearest DC that was not already coded by 4x4, then 16x16 following
- * the same pattern.
- *
- * @param [out]    dst        destination vector
- * @param [in]     n          block size (along one side)
- * @param [in]     src        source coefficient block
- * @param [in]     stride     source vector row stride
- */
-void od_raster_to_coding_order_16(int16_t *dst, int n, const int16_t *src,
- int stride) {
-  int bs;
-  /* dst + 1 because DC is not included for 4x4 blocks. */
-  od_band_from_raster_16(OD_LAYOUTS[0], dst + 1, src, stride);
-  for (bs = 1; bs < OD_TXSIZES; bs++) {
-    int size;
-    int offset;
-    /* Length of block size > 4. */
-    size = 1 << (OD_LOG_BSIZE0 + bs);
-    /* Offset is the size of the previous block squared. */
-    offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs);
-    if (n >= size) {
-      /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */
-      od_band_from_raster_16(OD_LAYOUTS[bs], dst + offset, src, stride);
-    }
-  }
-  dst[0] = src[0];
-}
diff --git a/third_party/aom/av1/common/partition.h b/third_party/aom/av1/common/partition.h
deleted file mode 100644
index bd308f94f..000000000
--- a/third_party/aom/av1/common/partition.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#if !defined(_partition_H)
-# define _partition_H
-
-#include "av1/common/enums.h"
-#include "odintrin.h"
-
-typedef unsigned char index_pair[2];
-
-typedef struct {
-  const index_pair **const dst_table;
-  int size;
-  int nb_bands;
-  const int *const band_offsets;
-} band_layout;
-
-extern const int *const OD_BAND_OFFSETS[OD_TXSIZES + 1];
-
-void od_raster_to_coding_order(tran_low_t *dst, int n,  TX_TYPE ty_type,
- const tran_low_t *src, int stride);
-
-void od_coding_order_to_raster(tran_low_t *dst, int stride,  TX_TYPE ty_type,
- const tran_low_t *src, int n);
-
-void od_raster_to_coding_order_16(int16_t *dst, int n, const int16_t *src,
- int stride);
-
-#endif
diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c
new file mode 100644
index 000000000..58933a7b3
--- /dev/null
+++ b/third_party/aom/av1/common/ppc/cfl_ppc.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <altivec.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#define OFF_0 0
+#define OFF_1 16
+#define OFF_2 32
+#define OFF_3 48
+#define CFL_BUF_LINE_BYTES 64
+#define CFL_LINE_1 64
+#define CFL_LINE_2 128
+#define CFL_LINE_3 192
+
+typedef vector int8_t int8x16_t;
+typedef vector uint8_t uint8x16_t;
+typedef vector int16_t int16x8_t;
+typedef vector uint16_t uint16x8_t;
+typedef vector int32_t int32x4_t;
+typedef vector uint32_t uint32x4_t;
+typedef vector uint64_t uint64x2_t;
+
+static INLINE void subtract_average_vsx(int16_t *pred_buf, int width,
+                                        int height, int round_offset,
+                                        int num_pel_log2) {
+  const int16_t *end = pred_buf + height * CFL_BUF_LINE;
+  const int16_t *sum_buf = pred_buf;
+  const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
+  const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+                               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
+  const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+                               0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B };
+
+  int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset };
+  int32x4_t sum_32x4_1 = { 0, 0, 0, 0 };
+  do {
+    sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0);
+    sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    if (width >= 16) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    }
+    if (width == 32) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1);
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    }
+  } while ((sum_buf += (CFL_BUF_LINE * 2)) < end);
+  int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1);
+
+  const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64);
+  sum_32x4 = vec_add(sum_32x4, perm_64);
+  const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32);
+  sum_32x4 = vec_add(sum_32x4, perm_32);
+  const int32x4_t avg = vec_sr(sum_32x4, div_shift);
+  const int16x8_t vec_avg = vec_pack(avg, avg);
+  do {
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, pred_buf), vec_avg), OFF_0, pred_buf);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, pred_buf), vec_avg),
+               OFF_0 + CFL_BUF_LINE_BYTES, pred_buf);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, pred_buf), vec_avg),
+               OFF_0 + CFL_LINE_2, pred_buf);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, pred_buf), vec_avg),
+               OFF_0 + CFL_LINE_3, pred_buf);
+    if (width >= 16) {
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, pred_buf), vec_avg), OFF_1,
+                 pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, pred_buf), vec_avg),
+                 OFF_1 + CFL_LINE_1, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, pred_buf), vec_avg),
+                 OFF_1 + CFL_LINE_2, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, pred_buf), vec_avg),
+                 OFF_1 + CFL_LINE_3, pred_buf);
+    }
+    if (width == 32) {
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, pred_buf), vec_avg), OFF_2,
+                 pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, pred_buf), vec_avg),
+                 OFF_2 + CFL_LINE_1, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, pred_buf), vec_avg),
+                 OFF_2 + CFL_LINE_2, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, pred_buf), vec_avg),
+                 OFF_2 + CFL_LINE_3, pred_buf);
+
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, pred_buf), vec_avg), OFF_3,
+                 pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, pred_buf), vec_avg),
+                 OFF_3 + CFL_LINE_1, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, pred_buf), vec_avg),
+                 OFF_3 + CFL_LINE_2, pred_buf);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, pred_buf), vec_avg),
+                 OFF_3 + CFL_LINE_3, pred_buf);
+    }
+  } while ((pred_buf += CFL_BUF_LINE * 4) < end);
+}
+
+// Declare wrappers for VSX sizes
+CFL_SUB_AVG_X(vsx, 8, 4, 16, 5)
+CFL_SUB_AVG_X(vsx, 8, 8, 32, 6)
+CFL_SUB_AVG_X(vsx, 8, 16, 64, 7)
+CFL_SUB_AVG_X(vsx, 8, 32, 128, 8)
+CFL_SUB_AVG_X(vsx, 16, 4, 32, 6)
+CFL_SUB_AVG_X(vsx, 16, 8, 64, 7)
+CFL_SUB_AVG_X(vsx, 16, 16, 128, 8)
+CFL_SUB_AVG_X(vsx, 16, 32, 256, 9)
+CFL_SUB_AVG_X(vsx, 32, 8, 128, 8)
+CFL_SUB_AVG_X(vsx, 32, 16, 256, 9)
+CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)
+
+// Based on observation, for small blocks VSX does not outperform C (no 64bit
+// load and store intrinsics). So we call the C code for block widths 4.
+cfl_subtract_average_fn get_subtract_average_fn_vsx(TX_SIZE tx_size) {
+  static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
+    subtract_average_4x4_c,     /* 4x4 */
+    subtract_average_8x8_vsx,   /* 8x8 */
+    subtract_average_16x16_vsx, /* 16x16 */
+    subtract_average_32x32_vsx, /* 32x32 */
+    cfl_subtract_average_null,  /* 64x64 (invalid CFL size) */
+    subtract_average_4x8_c,     /* 4x8 */
+    subtract_average_8x4_vsx,   /* 8x4 */
+    subtract_average_8x16_vsx,  /* 8x16 */
+    subtract_average_16x8_vsx,  /* 16x8 */
+    subtract_average_16x32_vsx, /* 16x32 */
+    subtract_average_32x16_vsx, /* 32x16 */
+    cfl_subtract_average_null,  /* 32x64 (invalid CFL size) */
+    cfl_subtract_average_null,  /* 64x32 (invalid CFL size) */
+    subtract_average_4x16_c,    /* 4x16 */
+    subtract_average_16x4_vsx,  /* 16x4 */
+    subtract_average_8x32_vsx,  /* 8x32 */
+    subtract_average_32x8_vsx,  /* 32x8 */
+    cfl_subtract_average_null,  /* 16x64 (invalid CFL size) */
+    cfl_subtract_average_null,  /* 64x16 (invalid CFL size) */
+  };
+  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
+  // index the function pointer array out of bounds.
+  return sub_avg[tx_size % TX_SIZES_ALL];
+}
diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c
index 51fd0389e..d77739d85 100644
--- a/third_party/aom/av1/common/pred_common.c
+++ b/third_party/aom/av1/common/pred_common.c
@@ -12,30 +12,23 @@
 #include "av1/common/common.h"
 #include "av1/common/pred_common.h"
 #include "av1/common/reconinter.h"
-#if CONFIG_EXT_INTRA
 #include "av1/common/reconintra.h"
-#endif  // CONFIG_EXT_INTRA
 #include "av1/common/seg_common.h"
 
 // Returns a context number for the given MB prediction signal
-#if CONFIG_DUAL_FILTER
-static InterpFilter get_ref_filter_type(const MODE_INFO *mi,
+static InterpFilter get_ref_filter_type(const MB_MODE_INFO *ref_mbmi,
                                         const MACROBLOCKD *xd, int dir,
                                         MV_REFERENCE_FRAME ref_frame) {
-  const MB_MODE_INFO *ref_mbmi = &mi->mbmi;
-  int use_subpel[2] = {
-    has_subpel_mv_component(mi, xd, dir),
-    has_subpel_mv_component(mi, xd, dir + 2),
-  };
-
-  return (((ref_mbmi->ref_frame[0] == ref_frame && use_subpel[0]) ||
-           (ref_mbmi->ref_frame[1] == ref_frame && use_subpel[1]))
+  (void)xd;
+
+  return ((ref_mbmi->ref_frame[0] == ref_frame ||
+           ref_mbmi->ref_frame[1] == ref_frame)
               ? av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01)
               : SWITCHABLE_FILTERS);
 }
 
 int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   const int ctx_offset =
       (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
   MV_REFERENCE_FRAME ref_frame =
@@ -69,132 +62,57 @@ int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
 
   return filter_type_ctx;
 }
-#else
-int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
-  // Note:
-  // The mode info data structure has a one element border above and to the
-  // left of the entries corresponding to real macroblocks.
-  // The prediction flags in these dummy entries are initialized to 0.
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int left_type =
-      xd->left_available && is_inter_block(left_mbmi)
-          ? av1_extract_interp_filter(left_mbmi->interp_filters, 0)
-          : SWITCHABLE_FILTERS;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const int above_type =
-      xd->up_available && is_inter_block(above_mbmi)
-          ? av1_extract_interp_filter(above_mbmi->interp_filters, 0)
-          : SWITCHABLE_FILTERS;
-
-  if (left_type == above_type) {
-    return left_type;
-  } else if (left_type == SWITCHABLE_FILTERS) {
-    assert(above_type != SWITCHABLE_FILTERS);
-    return above_type;
-  } else if (above_type == SWITCHABLE_FILTERS) {
-    assert(left_type != SWITCHABLE_FILTERS);
-    return left_type;
-  } else {
-    return SWITCHABLE_FILTERS;
-  }
-}
-#endif
-
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-// Obtain the reference filter type from the above/left neighbor blocks.
-static INTRA_FILTER get_ref_intra_filter(const MB_MODE_INFO *ref_mbmi) {
-  INTRA_FILTER ref_type = INTRA_FILTERS;
-
-  if (ref_mbmi->sb_type >= BLOCK_8X8) {
-    const PREDICTION_MODE mode = ref_mbmi->mode;
-    if (is_inter_block(ref_mbmi)) {
-      switch (av1_extract_interp_filter(ref_mbmi->interp_filters, 0)) {
-        case EIGHTTAP_REGULAR: ref_type = INTRA_FILTER_8TAP; break;
-        case EIGHTTAP_SMOOTH: ref_type = INTRA_FILTER_8TAP_SMOOTH; break;
-        case MULTITAP_SHARP: ref_type = INTRA_FILTER_8TAP_SHARP; break;
-        case BILINEAR: ref_type = INTRA_FILTERS; break;
-        default: break;
-      }
-    } else {
-      if (av1_is_directional_mode(mode, ref_mbmi->sb_type)) {
-        const int p_angle =
-            mode_to_angle_map[mode] + ref_mbmi->angle_delta[0] * ANGLE_STEP;
-        if (av1_is_intra_filter_switchable(p_angle)) {
-          ref_type = ref_mbmi->intra_filter;
-        }
-      }
-    }
-  }
-  return ref_type;
-}
-
-int av1_get_pred_context_intra_interp(const MACROBLOCKD *xd) {
-  int left_type = INTRA_FILTERS, above_type = INTRA_FILTERS;
 
-  if (xd->left_available) left_type = get_ref_intra_filter(xd->left_mbmi);
+static void palette_add_to_cache(uint16_t *cache, int *n, uint16_t val) {
+  // Do not add an already existing value
+  if (*n > 0 && val == cache[*n - 1]) return;
 
-  if (xd->up_available) above_type = get_ref_intra_filter(xd->above_mbmi);
-
-  if (left_type == above_type)
-    return left_type;
-  else if (left_type == INTRA_FILTERS && above_type != INTRA_FILTERS)
-    return above_type;
-  else if (left_type != INTRA_FILTERS && above_type == INTRA_FILTERS)
-    return left_type;
-  else
-    return INTRA_FILTERS;
+  cache[(*n)++] = val;
 }
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
 
-#if CONFIG_PALETTE_DELTA_ENCODING
 int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane,
                           uint16_t *cache) {
   const int row = -xd->mb_to_top_edge >> 3;
   // Do not refer to above SB row when on SB boundary.
-  const MODE_INFO *const above_mi =
-      (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mi : NULL;
-  const MODE_INFO *const left_mi = xd->left_mi;
+  const MB_MODE_INFO *const above_mi =
+      (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mbmi : NULL;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
   int above_n = 0, left_n = 0;
-  if (above_mi)
-    above_n = above_mi->mbmi.palette_mode_info.palette_size[plane != 0];
-  if (left_mi)
-    left_n = left_mi->mbmi.palette_mode_info.palette_size[plane != 0];
+  if (above_mi) above_n = above_mi->palette_mode_info.palette_size[plane != 0];
+  if (left_mi) left_n = left_mi->palette_mode_info.palette_size[plane != 0];
   if (above_n == 0 && left_n == 0) return 0;
   int above_idx = plane * PALETTE_MAX_SIZE;
   int left_idx = plane * PALETTE_MAX_SIZE;
   int n = 0;
   const uint16_t *above_colors =
-      above_mi ? above_mi->mbmi.palette_mode_info.palette_colors : NULL;
+      above_mi ? above_mi->palette_mode_info.palette_colors : NULL;
   const uint16_t *left_colors =
-      left_mi ? left_mi->mbmi.palette_mode_info.palette_colors : NULL;
+      left_mi ? left_mi->palette_mode_info.palette_colors : NULL;
   // Merge the sorted lists of base colors from above and left to get
   // combined sorted color cache.
   while (above_n > 0 && left_n > 0) {
     uint16_t v_above = above_colors[above_idx];
     uint16_t v_left = left_colors[left_idx];
     if (v_left < v_above) {
-      if (n == 0 || v_left != cache[n - 1]) cache[n++] = v_left;
+      palette_add_to_cache(cache, &n, v_left);
       ++left_idx, --left_n;
     } else {
-      if (n == 0 || v_above != cache[n - 1]) cache[n++] = v_above;
+      palette_add_to_cache(cache, &n, v_above);
       ++above_idx, --above_n;
       if (v_left == v_above) ++left_idx, --left_n;
     }
   }
   while (above_n-- > 0) {
     uint16_t val = above_colors[above_idx++];
-    if (n == 0 || val != cache[n - 1]) cache[n++] = val;
+    palette_add_to_cache(cache, &n, val);
   }
   while (left_n-- > 0) {
     uint16_t val = left_colors[left_idx++];
-    if (n == 0 || val != cache[n - 1]) cache[n++] = val;
+    palette_add_to_cache(cache, &n, val);
   }
   assert(n <= 2 * PALETTE_MAX_SIZE);
   return n;
 }
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 
 // The mode info data structure has a one element border above and to the
 // left of the entries corresponding to real macroblocks.
@@ -220,65 +138,17 @@ int av1_get_intra_inter_context(const MACROBLOCKD *xd) {
   }
 }
 
-#if CONFIG_COMPOUND_SINGLEREF
-// The compound/single mode info data structure has one element border above and
-// to the left of the entries corresponding to real macroblocks.
-// The prediction flags in these dummy entries are initialized to 0.
-int av1_get_inter_mode_context(const MACROBLOCKD *xd) {
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
-
-  if (has_above && has_left) {  // both edges available
-    const int above_inter_comp_mode =
-        is_inter_anyref_comp_mode(above_mbmi->mode);
-    const int left_inter_comp_mode = is_inter_anyref_comp_mode(left_mbmi->mode);
-    if (above_inter_comp_mode && left_inter_comp_mode)
-      return 0;
-    else if (above_inter_comp_mode || left_inter_comp_mode)
-      return 1;
-    else if (!is_inter_block(above_mbmi) && !is_inter_block(left_mbmi))
-      return 2;
-    else
-      return 3;
-  } else if (has_above || has_left) {  // one edge available
-    const MB_MODE_INFO *const edge_mbmi = has_above ? above_mbmi : left_mbmi;
-    if (is_inter_anyref_comp_mode(edge_mbmi->mode))
-      return 1;
-    else if (!is_inter_block(edge_mbmi))
-      return 2;
-    else
-      return 3;
-  } else {  // no edge available
-    return 2;
-  }
-}
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-#if CONFIG_EXT_REFS
 #define CHECK_BACKWARD_REFS(ref_frame) \
   (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
 #define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
-#else
-#define IS_BACKWARD_REF_FRAME(ref_frame) ((ref_frame) == cm->comp_fixed_ref)
-#endif  // CONFIG_EXT_REFS
-
-#define CHECK_GOLDEN_OR_LAST3(ref_frame) \
-  (((ref_frame) == GOLDEN_FRAME) || ((ref_frame) == LAST3_FRAME))
 
-int av1_get_reference_mode_context(const AV1_COMMON *cm,
-                                   const MACROBLOCKD *xd) {
+int av1_get_reference_mode_context(const MACROBLOCKD *xd) {
   int ctx;
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
   const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
   const int has_above = xd->up_available;
   const int has_left = xd->left_available;
 
-#if CONFIG_EXT_REFS
-  (void)cm;
-#endif  // CONFIG_EXT_REFS
-
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
@@ -314,9 +184,6 @@ int av1_get_reference_mode_context(const AV1_COMMON *cm,
   return ctx;
 }
 
-#if CONFIG_EXT_COMP_REFS
-// TODO(zoeliu): To try on the design of 3 contexts, instead of 5:
-//               COMP_REF_TYPE_CONTEXTS = 3
 int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) {
   int pred_context;
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
@@ -344,9 +211,8 @@ int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) {
       const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0];
 
       if (a_sg && l_sg) {  // single/single
-        pred_context =
-            1 +
-            2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^ IS_BACKWARD_REF_FRAME(frfl)));
+        pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^
+                                  IS_BACKWARD_REF_FRAME(frfl)));
       } else if (l_sg || a_sg) {  // single/comp
         const int uni_rfc =
             a_sg ? has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi);
@@ -397,44 +263,16 @@ int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) {
 // 3 contexts: Voting is used to compare the count of forward references with
 //             that of backward references from the spatial neighbors.
 int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int above_in_image = xd->up_available;
-  const int left_in_image = xd->left_available;
+  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
 
   // Count of forward references (L, L2, L3, or G)
-  int frf_count = 0;
+  const int frf_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] +
+                        ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
   // Count of backward references (B or A)
-  int brf_count = 0;
-
-  if (above_in_image && is_inter_block(above_mbmi)) {
-    if (above_mbmi->ref_frame[0] <= GOLDEN_FRAME)
-      ++frf_count;
-    else
-      ++brf_count;
-    if (has_second_ref(above_mbmi)) {
-      if (above_mbmi->ref_frame[1] <= GOLDEN_FRAME)
-        ++frf_count;
-      else
-        ++brf_count;
-    }
-  }
-
-  if (left_in_image && is_inter_block(left_mbmi)) {
-    if (left_mbmi->ref_frame[0] <= GOLDEN_FRAME)
-      ++frf_count;
-    else
-      ++brf_count;
-    if (has_second_ref(left_mbmi)) {
-      if (left_mbmi->ref_frame[1] <= GOLDEN_FRAME)
-        ++frf_count;
-      else
-        ++brf_count;
-    }
-  }
+  const int brf_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] +
+                        ref_counts[ALTREF_FRAME];
 
-  pred_context =
+  const int pred_context =
       (frf_count == brf_count) ? 1 : ((frf_count < brf_count) ? 0 : 2);
 
   assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
@@ -450,50 +288,17 @@ int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) {
 // 3 contexts: Voting is used to compare the count of LAST2_FRAME with the
 //             total count of LAST3/GOLDEN from the spatial neighbors.
 int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int above_in_image = xd->up_available;
-  const int left_in_image = xd->left_available;
+  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
 
   // Count of LAST2
-  int last2_count = 0;
+  const int last2_count = ref_counts[LAST2_FRAME];
   // Count of LAST3 or GOLDEN
-  int last3_or_gld_count = 0;
-
-  if (above_in_image && is_inter_block(above_mbmi)) {
-    last2_count = (above_mbmi->ref_frame[0] == LAST2_FRAME) ? last2_count + 1
-                                                            : last2_count;
-    last3_or_gld_count = CHECK_GOLDEN_OR_LAST3(above_mbmi->ref_frame[0])
-                             ? last3_or_gld_count + 1
-                             : last3_or_gld_count;
-    if (has_second_ref(above_mbmi)) {
-      last2_count = (above_mbmi->ref_frame[1] == LAST2_FRAME) ? last2_count + 1
-                                                              : last2_count;
-      last3_or_gld_count = CHECK_GOLDEN_OR_LAST3(above_mbmi->ref_frame[1])
-                               ? last3_or_gld_count + 1
-                               : last3_or_gld_count;
-    }
-  }
+  const int last3_or_gld_count =
+      ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
 
-  if (left_in_image && is_inter_block(left_mbmi)) {
-    last2_count = (left_mbmi->ref_frame[0] == LAST2_FRAME) ? last2_count + 1
-                                                           : last2_count;
-    last3_or_gld_count = CHECK_GOLDEN_OR_LAST3(left_mbmi->ref_frame[0])
-                             ? last3_or_gld_count + 1
-                             : last3_or_gld_count;
-    if (has_second_ref(left_mbmi)) {
-      last2_count = (left_mbmi->ref_frame[1] == LAST2_FRAME) ? last2_count + 1
-                                                             : last2_count;
-      last3_or_gld_count = CHECK_GOLDEN_OR_LAST3(left_mbmi->ref_frame[1])
-                               ? last3_or_gld_count + 1
-                               : last3_or_gld_count;
-    }
-  }
-
-  pred_context = (last2_count == last3_or_gld_count)
-                     ? 1
-                     : ((last2_count < last3_or_gld_count) ? 0 : 2);
+  const int pred_context = (last2_count == last3_or_gld_count)
+                               ? 1
+                               : ((last2_count < last3_or_gld_count) ? 0 : 2);
 
   assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
   return pred_context;
@@ -508,415 +313,83 @@ int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) {
 // 3 contexts: Voting is used to compare the count of LAST3_FRAME with the
 //             total count of GOLDEN_FRAME from the spatial neighbors.
 int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int above_in_image = xd->up_available;
-  const int left_in_image = xd->left_available;
+  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
 
   // Count of LAST3
-  int last3_count = 0;
+  const int last3_count = ref_counts[LAST3_FRAME];
   // Count of GOLDEN
-  int gld_count = 0;
-
-  if (above_in_image && is_inter_block(above_mbmi)) {
-    last3_count = (above_mbmi->ref_frame[0] == LAST3_FRAME) ? last3_count + 1
-                                                            : last3_count;
-    gld_count =
-        (above_mbmi->ref_frame[0] == GOLDEN_FRAME) ? gld_count + 1 : gld_count;
-    if (has_second_ref(above_mbmi)) {
-      last3_count = (above_mbmi->ref_frame[1] == LAST3_FRAME) ? last3_count + 1
-                                                              : last3_count;
-      gld_count = (above_mbmi->ref_frame[1] == GOLDEN_FRAME) ? gld_count + 1
-                                                             : gld_count;
-    }
-  }
+  const int gld_count = ref_counts[GOLDEN_FRAME];
 
-  if (left_in_image && is_inter_block(left_mbmi)) {
-    last3_count = (left_mbmi->ref_frame[0] == LAST3_FRAME) ? last3_count + 1
-                                                           : last3_count;
-    gld_count =
-        (left_mbmi->ref_frame[0] == GOLDEN_FRAME) ? gld_count + 1 : gld_count;
-    if (has_second_ref(left_mbmi)) {
-      last3_count = (left_mbmi->ref_frame[1] == LAST3_FRAME) ? last3_count + 1
-                                                             : last3_count;
-      gld_count =
-          (left_mbmi->ref_frame[1] == GOLDEN_FRAME) ? gld_count + 1 : gld_count;
-    }
-  }
-
-  pred_context =
+  const int pred_context =
       (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2);
 
   assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
   return pred_context;
 }
-#endif  // CONFIG_EXT_COMP_REFS
-
-#if CONFIG_EXT_REFS
 
-// TODO(zoeliu): Future work will be conducted to optimize the context design
-//               for the coding of the reference frames.
-
-#define CHECK_LAST_OR_LAST2(ref_frame) \
-  ((ref_frame == LAST_FRAME) || (ref_frame == LAST2_FRAME))
-
-// Returns a context number for the given MB prediction signal
-// Signal the first reference frame for a compound mode be either
-// GOLDEN/LAST3, or LAST/LAST2.
+// == Common context functions for both comp and single ref ==
 //
-// NOTE(zoeliu): The probability of ref_frame[0] is either
-//               GOLDEN_FRAME or LAST3_FRAME.
-int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm,
-                                    const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int above_in_image = xd->up_available;
-  const int left_in_image = xd->left_available;
+// Obtain contexts to signal a reference frame to be either LAST/LAST2 or
+// LAST3/GOLDEN.
+static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) {
+  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
 
-// Note:
-// The mode info data structure has a one element border above and to the
-// left of the entries correpsonding to real macroblocks.
-// The prediction flags in these dummy entries are initialised to 0.
-#if CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS
-  // Code seems to assume that signbias of cm->comp_bwd_ref[0] is always 1
-  const int bwd_ref_sign_idx = 1;
-#else
-  const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
-#endif  // CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS
-  const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
-
-  (void)cm;
+  // Count of LAST + LAST2
+  const int last_last2_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME];
+  // Count of LAST3 + GOLDEN
+  const int last3_gld_count =
+      ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
 
-  if (above_in_image && left_in_image) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
-
-    if (above_intra && left_intra) {  // intra/intra (2)
-      pred_context = 2;
-    } else if (above_intra || left_intra) {  // intra/inter
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-
-      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
-        pred_context =
-            1 + 2 * (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]));
-      else  // comp pred (1/3)
-        pred_context = 1 +
-                       2 * (!CHECK_GOLDEN_OR_LAST3(
-                               edge_mbmi->ref_frame[fwd_ref_sign_idx]));
-    } else {  // inter/inter
-      const int l_sg = !has_second_ref(left_mbmi);
-      const int a_sg = !has_second_ref(above_mbmi);
-      const MV_REFERENCE_FRAME frfa =
-          a_sg ? above_mbmi->ref_frame[0]
-               : above_mbmi->ref_frame[fwd_ref_sign_idx];
-      const MV_REFERENCE_FRAME frfl =
-          l_sg ? left_mbmi->ref_frame[0]
-               : left_mbmi->ref_frame[fwd_ref_sign_idx];
-
-      if (frfa == frfl && CHECK_GOLDEN_OR_LAST3(frfa)) {
-        pred_context = 0;
-      } else if (l_sg && a_sg) {  // single/single
-        if ((CHECK_BACKWARD_REFS(frfa) && CHECK_LAST_OR_LAST2(frfl)) ||
-            (CHECK_BACKWARD_REFS(frfl) && CHECK_LAST_OR_LAST2(frfa))) {
-          pred_context = 4;
-        } else if (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl)) {
-          pred_context = 1;
-        } else {
-          pred_context = 3;
-        }
-      } else if (l_sg || a_sg) {  // single/comp
-        const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl;
-        const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl;
-
-        if (CHECK_GOLDEN_OR_LAST3(frfc) && !CHECK_GOLDEN_OR_LAST3(rfs))
-          pred_context = 1;
-        else if (CHECK_GOLDEN_OR_LAST3(rfs) && !CHECK_GOLDEN_OR_LAST3(frfc))
-          pred_context = 2;
-        else
-          pred_context = 4;
-      } else {  // comp/comp
-        if ((CHECK_LAST_OR_LAST2(frfa) && CHECK_LAST_OR_LAST2(frfl))) {
-          pred_context = 4;
-        } else {
-// NOTE(zoeliu): Following assert may be removed once confirmed.
-#if !USE_UNI_COMP_REFS
-          // TODO(zoeliu): To further study the UNIDIR scenario
-          assert(CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl));
-#endif  // !USE_UNI_COMP_REFS
-          pred_context = 2;
-        }
-      }
-    }
-  } else if (above_in_image || left_in_image) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
-
-    if (!is_inter_block(edge_mbmi)) {
-      pred_context = 2;
-    } else {
-      if (has_second_ref(edge_mbmi))
-        pred_context =
-            4 *
-            (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[fwd_ref_sign_idx]));
-      else
-        pred_context = 3 * (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]));
-    }
-  } else {  // no edges available (2)
-    pred_context = 2;
-  }
+  const int pred_context = (last_last2_count == last3_gld_count)
+                               ? 1
+                               : ((last_last2_count < last3_gld_count) ? 0 : 2);
 
   assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
-
   return pred_context;
 }
 
-// Returns a context number for the given MB prediction signal
-// Signal the first reference frame for a compound mode be LAST,
-// conditioning on that it is known either LAST/LAST2.
-//
-// NOTE(zoeliu): The probability of ref_frame[0] is LAST_FRAME,
-// conditioning on it is either LAST_FRAME or LAST2_FRAME.
-int av1_get_pred_context_comp_ref_p1(const AV1_COMMON *cm,
-                                     const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int above_in_image = xd->up_available;
-  const int left_in_image = xd->left_available;
-
-// Note:
-// The mode info data structure has a one element border above and to the
-// left of the entries correpsonding to real macroblocks.
-// The prediction flags in these dummy entries are initialised to 0.
-#if CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS
-  // Code seems to assume that signbias of cm->comp_bwd_ref[0] is always 1
-  const int bwd_ref_sign_idx = 1;
-#else
-  const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
-#endif  //  CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS
-  const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
-
-  (void)cm;
-
-  if (above_in_image && left_in_image) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
-
-    if (above_intra && left_intra) {  // intra/intra (2)
-      pred_context = 2;
-    } else if (above_intra || left_intra) {  // intra/inter
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-
-      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
-        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != LAST_FRAME);
-      else  // comp pred (1/3)
-        pred_context =
-            1 + 2 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != LAST_FRAME);
-    } else {  // inter/inter
-      const int l_sg = !has_second_ref(left_mbmi);
-      const int a_sg = !has_second_ref(above_mbmi);
-      const MV_REFERENCE_FRAME frfa =
-          a_sg ? above_mbmi->ref_frame[0]
-               : above_mbmi->ref_frame[fwd_ref_sign_idx];
-      const MV_REFERENCE_FRAME frfl =
-          l_sg ? left_mbmi->ref_frame[0]
-               : left_mbmi->ref_frame[fwd_ref_sign_idx];
-
-      if (frfa == frfl && frfa == LAST_FRAME)
-        pred_context = 0;
-      else if (l_sg && a_sg) {  // single/single
-        if (frfa == LAST_FRAME || frfl == LAST_FRAME)
-          pred_context = 1;
-        else if (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl))
-          pred_context = 2 + (frfa != frfl);
-        else if (frfa == frfl ||
-                 (CHECK_BACKWARD_REFS(frfa) && CHECK_BACKWARD_REFS(frfl)))
-          pred_context = 3;
-        else
-          pred_context = 4;
-      } else if (l_sg || a_sg) {  // single/comp
-        const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl;
-        const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl;
+// Obtain contexts to signal a reference frame to be either LAST or LAST2.
+static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) {
+  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
 
-        if (frfc == LAST_FRAME && rfs != LAST_FRAME)
-          pred_context = 1;
-        else if (rfs == LAST_FRAME && frfc != LAST_FRAME)
-          pred_context = 2;
-        else
-          pred_context =
-              3 + (frfc == LAST2_FRAME || CHECK_GOLDEN_OR_LAST3(rfs));
-      } else {  // comp/comp
-        if (frfa == LAST_FRAME || frfl == LAST_FRAME)
-          pred_context = 2;
-        else
-          pred_context =
-              3 + (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl));
-      }
-    }
-  } else if (above_in_image || left_in_image) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+  // Count of LAST
+  const int last_count = ref_counts[LAST_FRAME];
+  // Count of LAST2
+  const int last2_count = ref_counts[LAST2_FRAME];
 
-    if (!is_inter_block(edge_mbmi)) {
-      pred_context = 2;
-    } else {
-      if (has_second_ref(edge_mbmi)) {
-        pred_context =
-            4 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != LAST_FRAME);
-      } else {
-        if (edge_mbmi->ref_frame[0] == LAST_FRAME)
-          pred_context = 0;
-        else
-          pred_context = 2 + CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]);
-      }
-    }
-  } else {  // no edges available (2)
-    pred_context = 2;
-  }
+  const int pred_context =
+      (last_count == last2_count) ? 1 : ((last_count < last2_count) ? 0 : 2);
 
   assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
-
   return pred_context;
 }
 
-// Returns a context number for the given MB prediction signal
-// Signal the first reference frame for a compound mode be GOLDEN,
-// conditioning on that it is known either GOLDEN or LAST3.
-//
-// NOTE(zoeliu): The probability of ref_frame[0] is GOLDEN_FRAME,
-// conditioning on it is either GOLDEN or LAST3.
-int av1_get_pred_context_comp_ref_p2(const AV1_COMMON *cm,
-                                     const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int above_in_image = xd->up_available;
-  const int left_in_image = xd->left_available;
-
-// Note:
-// The mode info data structure has a one element border above and to the
-// left of the entries correpsonding to real macroblocks.
-// The prediction flags in these dummy entries are initialised to 0.
-#if CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS
-  const int bwd_ref_sign_idx = 1;
-#else
-  const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
-#endif  // CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS
-  const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
-
-  (void)cm;
+// Obtain contexts to signal a reference frame to be either LAST3 or GOLDEN.
+static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) {
+  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
 
-  if (above_in_image && left_in_image) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
-
-    if (above_intra && left_intra) {  // intra/intra (2)
-      pred_context = 2;
-    } else if (above_intra || left_intra) {  // intra/inter
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-
-      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
-        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != GOLDEN_FRAME);
-      else  // comp pred (1/3)
-        pred_context =
-            1 + 2 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != GOLDEN_FRAME);
-    } else {  // inter/inter
-      const int l_sg = !has_second_ref(left_mbmi);
-      const int a_sg = !has_second_ref(above_mbmi);
-      const MV_REFERENCE_FRAME frfa =
-          a_sg ? above_mbmi->ref_frame[0]
-               : above_mbmi->ref_frame[fwd_ref_sign_idx];
-      const MV_REFERENCE_FRAME frfl =
-          l_sg ? left_mbmi->ref_frame[0]
-               : left_mbmi->ref_frame[fwd_ref_sign_idx];
-
-      if (frfa == frfl && frfa == GOLDEN_FRAME)
-        pred_context = 0;
-      else if (l_sg && a_sg) {  // single/single
-        if (frfa == GOLDEN_FRAME || frfl == GOLDEN_FRAME)
-          pred_context = 1;
-        else if (CHECK_LAST_OR_LAST2(frfa) || CHECK_LAST_OR_LAST2(frfl))
-          pred_context = 2 + (frfa != frfl);
-        else if (frfa == frfl ||
-                 (CHECK_BACKWARD_REFS(frfa) && CHECK_BACKWARD_REFS(frfl)))
-          pred_context = 3;
-        else
-          pred_context = 4;
-      } else if (l_sg || a_sg) {  // single/comp
-        const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl;
-        const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl;
-
-        if (frfc == GOLDEN_FRAME && rfs != GOLDEN_FRAME)
-          pred_context = 1;
-        else if (rfs == GOLDEN_FRAME && frfc != GOLDEN_FRAME)
-          pred_context = 2;
-        else
-          pred_context = 3 + (frfc == LAST3_FRAME || CHECK_LAST_OR_LAST2(rfs));
-      } else {  // comp/comp
-        if (frfa == GOLDEN_FRAME || frfl == GOLDEN_FRAME)
-          pred_context = 2;
-        else
-          pred_context =
-              3 + (CHECK_LAST_OR_LAST2(frfa) || CHECK_LAST_OR_LAST2(frfl));
-      }
-    }
-  } else if (above_in_image || left_in_image) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+  // Count of LAST3
+  const int last3_count = ref_counts[LAST3_FRAME];
+  // Count of GOLDEN
+  const int gld_count = ref_counts[GOLDEN_FRAME];
 
-    if (!is_inter_block(edge_mbmi)) {
-      pred_context = 2;
-    } else {
-      if (has_second_ref(edge_mbmi)) {
-        pred_context =
-            4 * (edge_mbmi->ref_frame[fwd_ref_sign_idx] != GOLDEN_FRAME);
-      } else {
-        if (edge_mbmi->ref_frame[0] == GOLDEN_FRAME)
-          pred_context = 0;
-        else
-          pred_context = 2 + CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
-      }
-    }
-  } else {  // no edges available (2)
-    pred_context = 2;
-  }
+  const int pred_context =
+      (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2);
 
   assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
-
   return pred_context;
 }
 
 // Obtain contexts to signal a reference frame be either BWDREF/ALTREF2, or
 // ALTREF.
-int av1_get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) {
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int above_in_image = xd->up_available;
-  const int left_in_image = xd->left_available;
+static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) {
+  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
 
   // Counts of BWDREF, ALTREF2, or ALTREF frames (B, A2, or A)
-  int bwdref_counts[ALTREF_FRAME - BWDREF_FRAME + 1] = { 0 };
-
-  if (above_in_image && is_inter_block(above_mbmi)) {
-    if (above_mbmi->ref_frame[0] >= BWDREF_FRAME)
-      ++bwdref_counts[above_mbmi->ref_frame[0] - BWDREF_FRAME];
-    if (has_second_ref(above_mbmi)) {
-      if (above_mbmi->ref_frame[1] >= BWDREF_FRAME)
-        ++bwdref_counts[above_mbmi->ref_frame[1] - BWDREF_FRAME];
-    }
-  }
+  const int brfarf2_count =
+      ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME];
+  const int arf_count = ref_counts[ALTREF_FRAME];
 
-  if (left_in_image && is_inter_block(left_mbmi)) {
-    if (left_mbmi->ref_frame[0] >= BWDREF_FRAME)
-      ++bwdref_counts[left_mbmi->ref_frame[0] - BWDREF_FRAME];
-    if (has_second_ref(left_mbmi)) {
-      if (left_mbmi->ref_frame[1] >= BWDREF_FRAME)
-        ++bwdref_counts[left_mbmi->ref_frame[1] - BWDREF_FRAME];
-    }
-  }
-
-  const int brfarf2_count = bwdref_counts[BWDREF_FRAME - BWDREF_FRAME] +
-                            bwdref_counts[ALTREF2_FRAME - BWDREF_FRAME];
-  const int arf_count = bwdref_counts[ALTREF_FRAME - BWDREF_FRAME];
   const int pred_context =
       (brfarf2_count == arf_count) ? 1 : ((brfarf2_count < arf_count) ? 0 : 2);
 
@@ -925,42 +398,13 @@ int av1_get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) {
 }
 
 // Obtain contexts to signal a reference frame be either BWDREF or ALTREF2.
-int av1_get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) {
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int above_in_image = xd->up_available;
-  const int left_in_image = xd->left_available;
+static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) {
+  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
 
   // Count of BWDREF frames (B)
-  int brf_count = 0;
+  const int brf_count = ref_counts[BWDREF_FRAME];
   // Count of ALTREF2 frames (A2)
-  int arf2_count = 0;
-
-  if (above_in_image && is_inter_block(above_mbmi)) {
-    if (above_mbmi->ref_frame[0] == BWDREF_FRAME)
-      ++brf_count;
-    else if (above_mbmi->ref_frame[0] == ALTREF2_FRAME)
-      ++arf2_count;
-    if (has_second_ref(above_mbmi)) {
-      if (above_mbmi->ref_frame[1] == BWDREF_FRAME)
-        ++brf_count;
-      else if (above_mbmi->ref_frame[1] == ALTREF2_FRAME)
-        ++arf2_count;
-    }
-  }
-
-  if (left_in_image && is_inter_block(left_mbmi)) {
-    if (left_mbmi->ref_frame[0] == BWDREF_FRAME)
-      ++brf_count;
-    else if (left_mbmi->ref_frame[0] == ALTREF2_FRAME)
-      ++arf2_count;
-    if (has_second_ref(left_mbmi)) {
-      if (left_mbmi->ref_frame[1] == BWDREF_FRAME)
-        ++brf_count;
-      else if (left_mbmi->ref_frame[1] == ALTREF2_FRAME)
-        ++arf2_count;
-    }
-  }
+  const int arf2_count = ref_counts[ALTREF2_FRAME];
 
   const int pred_context =
       (brf_count == arf2_count) ? 1 : ((brf_count < arf2_count) ? 0 : 2);
@@ -969,168 +413,57 @@ int av1_get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) {
   return pred_context;
 }
 
-// Signal the 2nd reference frame for a compound mode be either
-// ALTREF, or ALTREF2/BWDREF.
-int av1_get_pred_context_comp_bwdref_p(const AV1_COMMON *cm,
-                                       const MACROBLOCKD *xd) {
-  (void)cm;
-  return av1_get_pred_context_brfarf2_or_arf(xd);
+// == Context functions for comp ref ==
+//
+// Returns a context number for the given MB prediction signal
+// Signal the first reference frame for a compound mode be either
+// GOLDEN/LAST3, or LAST/LAST2.
+int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) {
+  return get_pred_context_ll2_or_l3gld(xd);
 }
 
-// Signal the 2nd reference frame for a compound mode be either
-// ALTREF2 or BWDREF.
-int av1_get_pred_context_comp_bwdref_p1(const AV1_COMMON *cm,
-                                        const MACROBLOCKD *xd) {
-  (void)cm;
-  return av1_get_pred_context_brf_or_arf2(xd);
+// Returns a context number for the given MB prediction signal
+// Signal the first reference frame for a compound mode be LAST,
+// conditioning on that it is known either LAST/LAST2.
+int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) {
+  return get_pred_context_last_or_last2(xd);
 }
 
-#else  // !CONFIG_EXT_REFS
-
 // Returns a context number for the given MB prediction signal
-int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm,
-                                    const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int above_in_image = xd->up_available;
-  const int left_in_image = xd->left_available;
-
-  // Note:
-  // The mode info data structure has a one element border above and to the
-  // left of the entries corresponding to real macroblocks.
-  // The prediction flags in these dummy entries are initialized to 0.
-  const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
-  const int var_ref_idx = !fix_ref_idx;
-
-  if (above_in_image && left_in_image) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
-
-    if (above_intra && left_intra) {  // intra/intra (2)
-      pred_context = 2;
-    } else if (above_intra || left_intra) {  // intra/inter
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-
-      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
-        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
-      else  // comp pred (1/3)
-        pred_context =
-            1 + 2 * (edge_mbmi->ref_frame[var_ref_idx] != cm->comp_var_ref[1]);
-    } else {  // inter/inter
-      const int l_sg = !has_second_ref(left_mbmi);
-      const int a_sg = !has_second_ref(above_mbmi);
-      const MV_REFERENCE_FRAME vrfa =
-          a_sg ? above_mbmi->ref_frame[0] : above_mbmi->ref_frame[var_ref_idx];
-      const MV_REFERENCE_FRAME vrfl =
-          l_sg ? left_mbmi->ref_frame[0] : left_mbmi->ref_frame[var_ref_idx];
-
-      if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) {
-        pred_context = 0;
-      } else if (l_sg && a_sg) {  // single/single
-        if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) ||
-            (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0]))
-          pred_context = 4;
-        else if (vrfa == vrfl)
-          pred_context = 3;
-        else
-          pred_context = 1;
-      } else if (l_sg || a_sg) {  // single/comp
-        const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
-        const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
-        if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1])
-          pred_context = 1;
-        else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1])
-          pred_context = 2;
-        else
-          pred_context = 4;
-      } else if (vrfa == vrfl) {  // comp/comp
-        pred_context = 4;
-      } else {
-        pred_context = 2;
-      }
-    }
-  } else if (above_in_image || left_in_image) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
-
-    if (!is_inter_block(edge_mbmi)) {
-      pred_context = 2;
-    } else {
-      if (has_second_ref(edge_mbmi))
-        pred_context =
-            4 * (edge_mbmi->ref_frame[var_ref_idx] != cm->comp_var_ref[1]);
-      else
-        pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
-    }
-  } else {  // no edges available (2)
-    pred_context = 2;
-  }
-  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
-
-  return pred_context;
+// Signal the first reference frame for a compound mode be GOLDEN,
+// conditioning on that it is known either GOLDEN or LAST3.
+int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) {
+  return get_pred_context_last3_or_gld(xd);
 }
 
-#endif  // CONFIG_EXT_REFS
+// Signal the 2nd reference frame for a compound mode be either
+// ALTREF, or ALTREF2/BWDREF.
+int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) {
+  return get_pred_context_brfarf2_or_arf(xd);
+}
 
-#if CONFIG_EXT_REFS
+// Signal the 2nd reference frame for a compound mode be either
+// ALTREF2 or BWDREF.
+int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) {
+  return get_pred_context_brf_or_arf2(xd);
+}
 
+// == Context functions for single ref ==
+//
 // For the bit to signal whether the single reference is a forward reference
 // frame or a backward reference frame.
 int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
-
-  // Note:
-  // The mode info data structure has a one element border above and to the
-  // left of the entries correpsonding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
-  if (has_above && has_left) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
-
-    if (above_intra && left_intra) {  // intra/intra
-      pred_context = 2;
-    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-
-      if (!has_second_ref(edge_mbmi))  // single
-        pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]));
-      else  // comp
-        pred_context = 2;
-    } else {  // inter/inter
-      const int above_has_second = has_second_ref(above_mbmi);
-      const int left_has_second = has_second_ref(left_mbmi);
+  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
 
-      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+  // Count of forward reference frames
+  const int fwd_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] +
+                        ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
+  // Count of backward reference frames
+  const int bwd_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] +
+                        ref_counts[ALTREF_FRAME];
 
-      if (above_has_second && left_has_second) {  // comp/comp
-        pred_context = 2;
-      } else if (above_has_second || left_has_second) {  // single/comp
-        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
-
-        pred_context = (!CHECK_BACKWARD_REFS(rfs)) ? 4 : 1;
-      } else {  // single/single
-        pred_context = 2 * (!CHECK_BACKWARD_REFS(above0)) +
-                       2 * (!CHECK_BACKWARD_REFS(left0));
-      }
-    }
-  } else if (has_above || has_left) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
-    if (!is_inter_block(edge_mbmi)) {  // intra
-      pred_context = 2;
-    } else {                           // inter
-      if (!has_second_ref(edge_mbmi))  // single
-        pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]));
-      else  // comp
-        pred_context = 2;
-    }
-  } else {  // no edges available
-    pred_context = 2;
-  }
+  const int pred_context =
+      (fwd_count == bwd_count) ? 1 : ((fwd_count < bwd_count) ? 0 : 2);
 
   assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
   return pred_context;
@@ -1140,445 +473,29 @@ int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
 // non-ALTREF backward reference frame, knowing that it shall be either of
 // these 2 choices.
 int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
-  return av1_get_pred_context_brfarf2_or_arf(xd);
+  return get_pred_context_brfarf2_or_arf(xd);
 }
 
 // For the bit to signal whether the single reference is LAST3/GOLDEN or
 // LAST2/LAST, knowing that it shall be either of these 2 choices.
 int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
-
-  // Note:
-  // The mode info data structure has a one element border above and to the
-  // left of the entries correpsonding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
-  if (has_above && has_left) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
-
-    if (above_intra && left_intra) {  // intra/intra
-      pred_context = 2;
-    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-      if (!has_second_ref(edge_mbmi)) {  // single
-        if (CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]))
-          pred_context = 3;
-        else
-          pred_context = 4 * CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
-      } else {  // comp
-        pred_context = 1 +
-                       2 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
-                            CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
-      }
-    } else {  // inter/inter
-      const int above_has_second = has_second_ref(above_mbmi);
-      const int left_has_second = has_second_ref(left_mbmi);
-      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
-      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
-
-      if (above_has_second && left_has_second) {  // comp/comp
-        if (above0 == left0 && above1 == left1)
-          pred_context =
-              3 * (CHECK_LAST_OR_LAST2(above0) || CHECK_LAST_OR_LAST2(above1) ||
-                   CHECK_LAST_OR_LAST2(left0) || CHECK_LAST_OR_LAST2(left1));
-        else
-          pred_context = 2;
-      } else if (above_has_second || left_has_second) {  // single/comp
-        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
-
-        if (CHECK_LAST_OR_LAST2(rfs))
-          pred_context =
-              3 + (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2));
-        else if (CHECK_GOLDEN_OR_LAST3(rfs))
-          pred_context =
-              (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2));
-        else
-          pred_context =
-              1 + 2 * (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2));
-      } else {  // single/single
-        if (CHECK_BACKWARD_REFS(above0) && CHECK_BACKWARD_REFS(left0)) {
-          pred_context = 2 + (above0 == left0);
-        } else if (CHECK_BACKWARD_REFS(above0) || CHECK_BACKWARD_REFS(left0)) {
-          const MV_REFERENCE_FRAME edge0 =
-              CHECK_BACKWARD_REFS(above0) ? left0 : above0;
-          pred_context = 4 * CHECK_LAST_OR_LAST2(edge0);
-        } else {
-          pred_context =
-              2 * CHECK_LAST_OR_LAST2(above0) + 2 * CHECK_LAST_OR_LAST2(left0);
-        }
-      }
-    }
-  } else if (has_above || has_left) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
-
-    if (!is_inter_block(edge_mbmi) ||
-        (CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) &&
-         !has_second_ref(edge_mbmi)))
-      pred_context = 2;
-    else if (!has_second_ref(edge_mbmi))  // single
-      pred_context = 4 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]));
-    else  // comp
-      pred_context = 3 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
-                          CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
-  } else {  // no edges available (2)
-    pred_context = 2;
-  }
-
-  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
-  return pred_context;
+  return get_pred_context_ll2_or_l3gld(xd);
 }
 
 // For the bit to signal whether the single reference is LAST2_FRAME or
 // LAST_FRAME, knowing that it shall be either of these 2 choices.
-//
-// NOTE(zoeliu): The probability of ref_frame[0] is LAST2_FRAME, conditioning
-// on it is either LAST2_FRAME/LAST_FRAME.
 int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
-
-  // Note:
-  // The mode info data structure has a one element border above and to the
-  // left of the entries correpsonding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
-  if (has_above && has_left) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
-
-    if (above_intra && left_intra) {  // intra/intra
-      pred_context = 2;
-    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-      if (!has_second_ref(edge_mbmi)) {  // single
-        if (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]))
-          pred_context = 3;
-        else
-          pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
-      } else {  // comp
-        pred_context = 1 +
-                       2 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
-                            edge_mbmi->ref_frame[1] == LAST_FRAME);
-      }
-    } else {  // inter/inter
-      const int above_has_second = has_second_ref(above_mbmi);
-      const int left_has_second = has_second_ref(left_mbmi);
-      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
-      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
-
-      if (above_has_second && left_has_second) {  // comp/comp
-        if (above0 == left0 && above1 == left1)
-          pred_context = 3 * (above0 == LAST_FRAME || above1 == LAST_FRAME ||
-                              left0 == LAST_FRAME || left1 == LAST_FRAME);
-        else
-          pred_context = 2;
-      } else if (above_has_second || left_has_second) {  // single/comp
-        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
-
-        if (rfs == LAST_FRAME)
-          pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
-        else if (rfs == LAST2_FRAME)
-          pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
-        else
-          pred_context = 1 + 2 * (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
-      } else {  // single/single
-        if (!CHECK_LAST_OR_LAST2(above0) && !CHECK_LAST_OR_LAST2(left0)) {
-          pred_context = 2 + (above0 == left0);
-        } else if (!CHECK_LAST_OR_LAST2(above0) ||
-                   !CHECK_LAST_OR_LAST2(left0)) {
-          const MV_REFERENCE_FRAME edge0 =
-              !CHECK_LAST_OR_LAST2(above0) ? left0 : above0;
-          pred_context = 4 * (edge0 == LAST_FRAME);
-        } else {
-          pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME);
-        }
-      }
-    }
-  } else if (has_above || has_left) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
-
-    if (!is_inter_block(edge_mbmi) ||
-        (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) &&
-         !has_second_ref(edge_mbmi)))
-      pred_context = 2;
-    else if (!has_second_ref(edge_mbmi))  // single
-      pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
-    else  // comp
-      pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
-                          edge_mbmi->ref_frame[1] == LAST_FRAME);
-  } else {  // no edges available (2)
-    pred_context = 2;
-  }
-
-  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
-  return pred_context;
+  return get_pred_context_last_or_last2(xd);
 }
 
 // For the bit to signal whether the single reference is GOLDEN_FRAME or
 // LAST3_FRAME, knowing that it shall be either of these 2 choices.
-//
-// NOTE(zoeliu): The probability of ref_frame[0] is GOLDEN_FRAME, conditioning
-// on it is either GOLDEN_FRAME/LAST3_FRAME.
 int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
-
-  // Note:
-  // The mode info data structure has a one element border above and to the
-  // left of the entries correpsonding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
-  if (has_above && has_left) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
-
-    if (above_intra && left_intra) {  // intra/intra
-      pred_context = 2;
-    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-      if (!has_second_ref(edge_mbmi)) {  // single
-        if (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]))
-          pred_context = 3;
-        else
-          pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
-      } else {  // comp
-        pred_context = 1 +
-                       2 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
-                            edge_mbmi->ref_frame[1] == LAST3_FRAME);
-      }
-    } else {  // inter/inter
-      const int above_has_second = has_second_ref(above_mbmi);
-      const int left_has_second = has_second_ref(left_mbmi);
-      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
-      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
-
-      if (above_has_second && left_has_second) {  // comp/comp
-        if (above0 == left0 && above1 == left1)
-          pred_context = 3 * (above0 == LAST3_FRAME || above1 == LAST3_FRAME ||
-                              left0 == LAST3_FRAME || left1 == LAST3_FRAME);
-        else
-          pred_context = 2;
-      } else if (above_has_second || left_has_second) {  // single/comp
-        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
-
-        if (rfs == LAST3_FRAME)
-          pred_context = 3 + (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
-        else if (rfs == GOLDEN_FRAME)
-          pred_context = (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
-        else
-          pred_context = 1 + 2 * (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
-      } else {  // single/single
-        if (!CHECK_GOLDEN_OR_LAST3(above0) && !CHECK_GOLDEN_OR_LAST3(left0)) {
-          pred_context = 2 + (above0 == left0);
-        } else if (!CHECK_GOLDEN_OR_LAST3(above0) ||
-                   !CHECK_GOLDEN_OR_LAST3(left0)) {
-          const MV_REFERENCE_FRAME edge0 =
-              !CHECK_GOLDEN_OR_LAST3(above0) ? left0 : above0;
-          pred_context = 4 * (edge0 == LAST3_FRAME);
-        } else {
-          pred_context =
-              2 * (above0 == LAST3_FRAME) + 2 * (left0 == LAST3_FRAME);
-        }
-      }
-    }
-  } else if (has_above || has_left) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
-
-    if (!is_inter_block(edge_mbmi) ||
-        (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]) &&
-         !has_second_ref(edge_mbmi)))
-      pred_context = 2;
-    else if (!has_second_ref(edge_mbmi))  // single
-      pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
-    else  // comp
-      pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
-                          edge_mbmi->ref_frame[1] == LAST3_FRAME);
-  } else {  // no edges available (2)
-    pred_context = 2;
-  }
-
-  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
-  return pred_context;
+  return get_pred_context_last3_or_gld(xd);
 }
 
 // For the bit to signal whether the single reference is ALTREF2_FRAME or
 // BWDREF_FRAME, knowing that it shall be either of these 2 choices.
 int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) {
-  return av1_get_pred_context_brf_or_arf2(xd);
-}
-
-#else  // !CONFIG_EXT_REFS
-
-int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
-  // Note:
-  // The mode info data structure has a one element border above and to the
-  // left of the entries corresponding to real macroblocks.
-  // The prediction flags in these dummy entries are initialized to 0.
-  if (has_above && has_left) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
-
-    if (above_intra && left_intra) {  // intra/intra
-      pred_context = 2;
-    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-      if (!has_second_ref(edge_mbmi))
-        pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
-      else
-        pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
-                            edge_mbmi->ref_frame[1] == LAST_FRAME);
-    } else {  // inter/inter
-      const int above_has_second = has_second_ref(above_mbmi);
-      const int left_has_second = has_second_ref(left_mbmi);
-      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
-      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
-
-      if (above_has_second && left_has_second) {
-        pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME ||
-                            left0 == LAST_FRAME || left1 == LAST_FRAME);
-      } else if (above_has_second || left_has_second) {
-        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
-
-        if (rfs == LAST_FRAME)
-          pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
-        else
-          pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
-      } else {
-        pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME);
-      }
-    }
-  } else if (has_above || has_left) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
-    if (!is_inter_block(edge_mbmi)) {  // intra
-      pred_context = 2;
-    } else {  // inter
-      if (!has_second_ref(edge_mbmi))
-        pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
-      else
-        pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
-                            edge_mbmi->ref_frame[1] == LAST_FRAME);
-    }
-  } else {  // no edges available
-    pred_context = 2;
-  }
-
-  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
-  return pred_context;
+  return get_pred_context_brf_or_arf2(xd);
 }
-
-int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
-  int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
-
-  // Note:
-  // The mode info data structure has a one element border above and to the
-  // left of the entries corresponding to real macroblocks.
-  // The prediction flags in these dummy entries are initialized to 0.
-  if (has_above && has_left) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
-
-    if (above_intra && left_intra) {  // intra/intra
-      pred_context = 2;
-    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-      if (!has_second_ref(edge_mbmi)) {
-        if (edge_mbmi->ref_frame[0] == LAST_FRAME)
-          pred_context = 3;
-        else
-          pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
-      } else {
-        pred_context = 1 +
-                       2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
-                            edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
-      }
-    } else {  // inter/inter
-      const int above_has_second = has_second_ref(above_mbmi);
-      const int left_has_second = has_second_ref(left_mbmi);
-      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
-      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
-
-      if (above_has_second && left_has_second) {
-        if (above0 == left0 && above1 == left1)
-          pred_context =
-              3 * (above0 == GOLDEN_FRAME || above1 == GOLDEN_FRAME ||
-                   left0 == GOLDEN_FRAME || left1 == GOLDEN_FRAME);
-        else
-          pred_context = 2;
-      } else if (above_has_second || left_has_second) {
-        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
-
-        if (rfs == GOLDEN_FRAME)
-          pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
-        else if (rfs != GOLDEN_FRAME && rfs != LAST_FRAME)
-          pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
-        else
-          pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
-      } else {
-        if (above0 == LAST_FRAME && left0 == LAST_FRAME) {
-          pred_context = 3;
-        } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) {
-          const MV_REFERENCE_FRAME edge0 =
-              (above0 == LAST_FRAME) ? left0 : above0;
-          pred_context = 4 * (edge0 == GOLDEN_FRAME);
-        } else {
-          pred_context =
-              2 * (above0 == GOLDEN_FRAME) + 2 * (left0 == GOLDEN_FRAME);
-        }
-      }
-    }
-  } else if (has_above || has_left) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
-
-    if (!is_inter_block(edge_mbmi) ||
-        (edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi)))
-      pred_context = 2;
-    else if (!has_second_ref(edge_mbmi))
-      pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
-    else
-      pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
-                          edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
-  } else {  // no edges available (2)
-    pred_context = 2;
-  }
-  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
-  return pred_context;
-}
-
-#endif  // CONFIG_EXT_REFS
diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h
index db4618a59..6a835c467 100644
--- a/third_party/aom/av1/common/pred_common.h
+++ b/third_party/aom/av1/common/pred_common.h
@@ -13,6 +13,7 @@
 #define AV1_COMMON_PRED_COMMON_H_
 
 #include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
 #include "av1/common/onyxc_int.h"
 #include "aom_dsp/aom_dsp_common.h"
 
@@ -39,115 +40,174 @@ static INLINE int get_segment_id(const AV1_COMMON *const cm,
   return segment_id;
 }
 
+static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
+                                           const MACROBLOCKD *const xd,
+                                           int mi_row, int mi_col,
+                                           int *cdf_index) {
+  int prev_ul = -1;  // top left segment_id
+  int prev_l = -1;   // left segment_id
+  int prev_u = -1;   // top segment_id
+  if ((xd->up_available) && (xd->left_available)) {
+    prev_ul = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
+                             mi_row - 1, mi_col - 1);
+  }
+  if (xd->up_available) {
+    prev_u = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
+                            mi_row - 1, mi_col - 0);
+  }
+  if (xd->left_available) {
+    prev_l = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
+                            mi_row - 0, mi_col - 1);
+  }
+
+  // Pick CDF index based on number of matching/out-of-bounds segment IDs.
+  if (prev_ul < 0 || prev_u < 0 || prev_l < 0) /* Edge case */
+    *cdf_index = 0;
+  else if ((prev_ul == prev_u) && (prev_ul == prev_l))
+    *cdf_index = 2;
+  else if ((prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l))
+    *cdf_index = 1;
+  else
+    *cdf_index = 0;
+
+  // If 2 or more are identical returns that as predictor, otherwise prev_l.
+  if (prev_u == -1)  // edge case
+    return prev_l == -1 ? 0 : prev_l;
+  if (prev_l == -1)  // edge case
+    return prev_u;
+  return (prev_ul == prev_u) ? prev_u : prev_l;
+}
+
 static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
-  const MODE_INFO *const above_mi = xd->above_mi;
-  const MODE_INFO *const left_mi = xd->left_mi;
-  const int above_sip =
-      (above_mi != NULL) ? above_mi->mbmi.seg_id_predicted : 0;
-  const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0;
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0;
+  const int left_sip = (left_mi != NULL) ? left_mi->seg_id_predicted : 0;
 
   return above_sip + left_sip;
 }
 
-static INLINE aom_prob av1_get_pred_prob_seg_id(
-    const struct segmentation_probs *segp, const MACROBLOCKD *xd) {
-  return segp->pred_probs[av1_get_pred_context_seg_id(xd)];
+static INLINE int get_comp_index_context(const AV1_COMMON *cm,
+                                         const MACROBLOCKD *xd) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx;
+  int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx;
+  int bck_frame_index = 0, fwd_frame_index = 0;
+  int cur_frame_index = cm->cur_frame->cur_frame_offset;
+
+  if (bck_idx >= 0)
+    bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset;
+
+  if (fwd_idx >= 0)
+    fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset;
+  int fwd = abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index));
+  int bck = abs(get_relative_dist(cm, cur_frame_index, bck_frame_index));
+
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+
+  int above_ctx = 0, left_ctx = 0;
+  const int offset = (fwd == bck);
+
+  if (above_mi) {
+    if (has_second_ref(above_mi))
+      above_ctx = above_mi->compound_idx;
+    else if (above_mi->ref_frame[0] == ALTREF_FRAME)
+      above_ctx = 1;
+  }
+
+  if (left_mi) {
+    if (has_second_ref(left_mi))
+      left_ctx = left_mi->compound_idx;
+    else if (left_mi->ref_frame[0] == ALTREF_FRAME)
+      left_ctx = 1;
+  }
+
+  return above_ctx + left_ctx + 3 * offset;
+}
+
+static INLINE int get_comp_group_idx_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  int above_ctx = 0, left_ctx = 0;
+
+  if (above_mi) {
+    if (has_second_ref(above_mi))
+      above_ctx = above_mi->comp_group_idx;
+    else if (above_mi->ref_frame[0] == ALTREF_FRAME)
+      above_ctx = 3;
+  }
+  if (left_mi) {
+    if (has_second_ref(left_mi))
+      left_ctx = left_mi->comp_group_idx;
+    else if (left_mi->ref_frame[0] == ALTREF_FRAME)
+      left_ctx = 3;
+  }
+
+  return AOMMIN(5, above_ctx + left_ctx);
 }
 
-#if CONFIG_NEW_MULTISYMBOL
 static INLINE aom_cdf_prob *av1_get_pred_cdf_seg_id(
     struct segmentation_probs *segp, const MACROBLOCKD *xd) {
   return segp->pred_cdf[av1_get_pred_context_seg_id(xd)];
 }
-#endif
 
-static INLINE int av1_get_skip_context(const MACROBLOCKD *xd) {
-  const MODE_INFO *const above_mi = xd->above_mi;
-  const MODE_INFO *const left_mi = xd->left_mi;
-  const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0;
-  const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip : 0;
-  return above_skip + left_skip;
+static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  const int above_skip_mode = above_mi ? above_mi->skip_mode : 0;
+  const int left_skip_mode = left_mi ? left_mi->skip_mode : 0;
+  return above_skip_mode + left_skip_mode;
 }
 
-static INLINE aom_prob av1_get_skip_prob(const AV1_COMMON *cm,
-                                         const MACROBLOCKD *xd) {
-  return cm->fc->skip_probs[av1_get_skip_context(xd)];
+static INLINE int av1_get_skip_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  const int above_skip = above_mi ? above_mi->skip : 0;
+  const int left_skip = left_mi ? left_mi->skip : 0;
+  return above_skip + left_skip;
 }
 
-#if CONFIG_DUAL_FILTER
 int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir);
-#else
-int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
-#endif
-
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-int av1_get_pred_context_intra_interp(const MACROBLOCKD *xd);
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
 
-#if CONFIG_PALETTE_DELTA_ENCODING
 // Get a list of palette base colors that are used in the above and left blocks,
 // referred to as "color cache". The return value is the number of colors in the
 // cache (<= 2 * PALETTE_MAX_SIZE). The color values are stored in "cache"
 // in ascending order.
 int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane,
                           uint16_t *cache);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 
-int av1_get_intra_inter_context(const MACROBLOCKD *xd);
+static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) {
+  return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8];
+}
 
-static INLINE aom_prob av1_get_intra_inter_prob(const AV1_COMMON *cm,
-                                                const MACROBLOCKD *xd) {
-  return cm->fc->intra_inter_prob[av1_get_intra_inter_context(xd)];
+static INLINE int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  int ctx = 0;
+  if (above_mi) ctx += (above_mi->palette_mode_info.palette_size[0] > 0);
+  if (left_mi) ctx += (left_mi->palette_mode_info.palette_size[0] > 0);
+  return ctx;
 }
 
-int av1_get_reference_mode_context(const AV1_COMMON *cm, const MACROBLOCKD *xd);
+int av1_get_intra_inter_context(const MACROBLOCKD *xd);
 
-static INLINE aom_prob av1_get_reference_mode_prob(const AV1_COMMON *cm,
-                                                   const MACROBLOCKD *xd) {
-  return cm->fc->comp_inter_prob[av1_get_reference_mode_context(cm, xd)];
-}
-#if CONFIG_NEW_MULTISYMBOL
-static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const AV1_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
-  return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(cm, xd)];
+int av1_get_reference_mode_context(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) {
+  return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)];
 }
-#endif
 
-#if CONFIG_EXT_COMP_REFS
 int av1_get_comp_reference_type_context(const MACROBLOCKD *xd);
 
-static INLINE aom_prob av1_get_comp_reference_type_prob(const AV1_COMMON *cm,
-                                                        const MACROBLOCKD *xd) {
-  return cm->fc->comp_ref_type_prob[av1_get_comp_reference_type_context(xd)];
-}
+// == Uni-directional contexts ==
 
 int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd);
 
-static INLINE aom_prob av1_get_pred_prob_uni_comp_ref_p(const AV1_COMMON *cm,
-                                                        const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd);
-  return cm->fc->uni_comp_ref_prob[pred_context][0];
-}
-
 int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd);
 
-static INLINE aom_prob
-av1_get_pred_prob_uni_comp_ref_p1(const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd);
-  return cm->fc->uni_comp_ref_prob[pred_context][1];
-}
-
 int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd);
 
-static INLINE aom_prob
-av1_get_pred_prob_uni_comp_ref_p2(const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd);
-  return cm->fc->uni_comp_ref_prob[pred_context][2];
-}
-
-#if CONFIG_NEW_MULTISYMBOL
 static INLINE aom_cdf_prob *av1_get_comp_reference_type_cdf(
     const MACROBLOCKD *xd) {
   const int pred_context = av1_get_comp_reference_type_context(xd);
@@ -171,211 +231,126 @@ static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2(
   const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd);
   return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2];
 }
-#endif  // CONFIG_NEW_MULTISYMBOL
-#endif  // CONFIG_EXT_COMP_REFS
 
-int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm,
-                                    const MACROBLOCKD *xd);
+// == Bi-directional contexts ==
 
-#if CONFIG_NEW_MULTISYMBOL
-static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const AV1_COMMON *cm,
-                                                        const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_comp_ref_p(cm, xd);
-  return xd->tile_ctx->comp_ref_cdf[pred_context][0];
-}
-#endif
+int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd);
 
-static INLINE aom_prob av1_get_pred_prob_comp_ref_p(const AV1_COMMON *cm,
-                                                    const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_comp_ref_p(cm, xd);
-  return cm->fc->comp_ref_prob[pred_context][0];
-}
+int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd);
 
-#if CONFIG_EXT_REFS
-int av1_get_pred_context_comp_ref_p1(const AV1_COMMON *cm,
-                                     const MACROBLOCKD *xd);
+int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd);
 
-#if CONFIG_NEW_MULTISYMBOL
-static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_comp_ref_p1(cm, xd);
-  return xd->tile_ctx->comp_ref_cdf[pred_context][1];
-}
-#endif  // CONFIG_NEW_MULTISYMBOL
+int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd);
 
-static INLINE aom_prob av1_get_pred_prob_comp_ref_p1(const AV1_COMMON *cm,
-                                                     const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_comp_ref_p1(cm, xd);
-  return cm->fc->comp_ref_prob[pred_context][1];
+int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) {
+  const int pred_context = av1_get_pred_context_comp_ref_p(xd);
+  return xd->tile_ctx->comp_ref_cdf[pred_context][0];
 }
 
-int av1_get_pred_context_comp_ref_p2(const AV1_COMMON *cm,
-                                     const MACROBLOCKD *xd);
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1(
+    const MACROBLOCKD *xd) {
+  const int pred_context = av1_get_pred_context_comp_ref_p1(xd);
+  return xd->tile_ctx->comp_ref_cdf[pred_context][1];
+}
 
-#if CONFIG_NEW_MULTISYMBOL
 static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_comp_ref_p2(cm, xd);
+    const MACROBLOCKD *xd) {
+  const int pred_context = av1_get_pred_context_comp_ref_p2(xd);
   return xd->tile_ctx->comp_ref_cdf[pred_context][2];
 }
-#endif  // CONFIG_NEW_MULTISYMBOL
 
-static INLINE aom_prob av1_get_pred_prob_comp_ref_p2(const AV1_COMMON *cm,
-                                                     const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_comp_ref_p2(cm, xd);
-  return cm->fc->comp_ref_prob[pred_context][2];
-}
-
-int av1_get_pred_context_comp_bwdref_p(const AV1_COMMON *cm,
-                                       const MACROBLOCKD *xd);
-
-#if CONFIG_NEW_MULTISYMBOL
 static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_comp_bwdref_p(cm, xd);
+    const MACROBLOCKD *xd) {
+  const int pred_context = av1_get_pred_context_comp_bwdref_p(xd);
   return xd->tile_ctx->comp_bwdref_cdf[pred_context][0];
 }
-#endif  // CONFIG_NEW_MULTISYMBOL
 
-static INLINE aom_prob av1_get_pred_prob_comp_bwdref_p(const AV1_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_comp_bwdref_p(cm, xd);
-  return cm->fc->comp_bwdref_prob[pred_context][0];
-}
-
-int av1_get_pred_context_comp_bwdref_p1(const AV1_COMMON *cm,
-                                        const MACROBLOCKD *xd);
-
-#if CONFIG_NEW_MULTISYMBOL
 static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_comp_bwdref_p1(cm, xd);
+    const MACROBLOCKD *xd) {
+  const int pred_context = av1_get_pred_context_comp_bwdref_p1(xd);
   return xd->tile_ctx->comp_bwdref_cdf[pred_context][1];
 }
-#endif  // CONFIG_NEW_MULTISYMBOL
 
-static INLINE aom_prob av1_get_pred_prob_comp_bwdref_p1(const AV1_COMMON *cm,
-                                                        const MACROBLOCKD *xd) {
-  const int pred_context = av1_get_pred_context_comp_bwdref_p1(cm, xd);
-  return cm->fc->comp_bwdref_prob[pred_context][1];
-}
-#endif  // CONFIG_EXT_REFS
+// == Single contexts ==
 
 int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
 
-static INLINE aom_prob av1_get_pred_prob_single_ref_p1(const AV1_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
-  return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p1(xd)][0];
-}
-
 int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
 
-static INLINE aom_prob av1_get_pred_prob_single_ref_p2(const AV1_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
-  return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p2(xd)][1];
-}
-
-#if CONFIG_EXT_REFS
 int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd);
 
-static INLINE aom_prob av1_get_pred_prob_single_ref_p3(const AV1_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
-  return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p3(xd)][2];
-}
-
 int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd);
 
-static INLINE aom_prob av1_get_pred_prob_single_ref_p4(const AV1_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
-  return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p4(xd)][3];
-}
-
 int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd);
 
-static INLINE aom_prob av1_get_pred_prob_single_ref_p5(const AV1_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
-  return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p5(xd)][4];
-}
-
 int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd);
 
-static INLINE aom_prob av1_get_pred_prob_single_ref_p6(const AV1_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
-  return cm->fc->single_ref_prob[av1_get_pred_context_single_ref_p6(xd)][5];
-}
-#endif  // CONFIG_EXT_REFS
-
-#if CONFIG_NEW_MULTISYMBOL
 static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p1(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  (void)cm;
+    const MACROBLOCKD *xd) {
   return xd->tile_ctx
       ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0];
 }
 static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p2(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  (void)cm;
+    const MACROBLOCKD *xd) {
   return xd->tile_ctx
       ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1];
 }
-#if CONFIG_EXT_REFS
 static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p3(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  (void)cm;
+    const MACROBLOCKD *xd) {
   return xd->tile_ctx
       ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2];
 }
 static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p4(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  (void)cm;
+    const MACROBLOCKD *xd) {
   return xd->tile_ctx
       ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3];
 }
 static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p5(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  (void)cm;
+    const MACROBLOCKD *xd) {
   return xd->tile_ctx
       ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4];
 }
 static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p6(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd) {
-  (void)cm;
+    const MACROBLOCKD *xd) {
   return xd->tile_ctx
       ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5];
 }
-#endif  // CONFIG_EXT_REFS
-#endif  // CONFIG_NEW_MULTISYMBOL
-
-#if CONFIG_COMPOUND_SINGLEREF
-int av1_get_inter_mode_context(const MACROBLOCKD *xd);
-
-static INLINE aom_prob av1_get_inter_mode_prob(const AV1_COMMON *cm,
-                                               const MACROBLOCKD *xd) {
-  return cm->fc->comp_inter_mode_prob[av1_get_inter_mode_context(xd)];
-}
-#endif  // CONFIG_COMPOUND_SINGLEREF
 
 // Returns a context number for the given MB prediction signal
 // The mode info data structure has a one element border above and to the
 // left of the entries corresponding to real blocks.
 // The prediction flags in these dummy entries are initialized to 0.
 static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
-  const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type];
+  const MB_MODE_INFO *mbmi = xd->mi[0];
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
   const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->sb_type];
+  const int max_tx_wide = tx_size_wide[max_tx_size];
+  const int max_tx_high = tx_size_high[max_tx_size];
   const int has_above = xd->up_available;
   const int has_left = xd->left_available;
-  int above_ctx = (has_above && !above_mbmi->skip)
-                      ? (int)txsize_sqr_map[above_mbmi->tx_size]
-                      : max_tx_size;
-  int left_ctx = (has_left && !left_mbmi->skip)
-                     ? (int)txsize_sqr_map[left_mbmi->tx_size]
-                     : max_tx_size;
-
-  if (!has_left) left_ctx = above_ctx;
 
-  if (!has_above) above_ctx = left_ctx;
-  return (above_ctx + left_ctx) > max_tx_size + TX_SIZE_LUMA_MIN;
+  int above = xd->above_txfm_context[0] >= max_tx_wide;
+  int left = xd->left_txfm_context[0] >= max_tx_high;
+
+  if (has_above)
+    if (is_inter_block(above_mbmi))
+      above = block_size_wide[above_mbmi->sb_type] >= max_tx_wide;
+
+  if (has_left)
+    if (is_inter_block(left_mbmi))
+      left = block_size_high[left_mbmi->sb_type] >= max_tx_high;
+
+  if (has_above && has_left)
+    return (above + left);
+  else if (has_above)
+    return above;
+  else if (has_left)
+    return left;
+  else
+    return 0;
 }
 
 #ifdef __cplusplus
diff --git a/third_party/aom/av1/common/pvq.c b/third_party/aom/av1/common/pvq.c
deleted file mode 100644
index 221c90c04..000000000
--- a/third_party/aom/av1/common/pvq.c
+++ /dev/null
@@ -1,1007 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include "odintrin.h"
-#include "partition.h"
-#include "pvq.h"
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-/* Imported from encode.c in daala */
-/* These are the PVQ equivalent of quantization matrices, except that
-   the values are per-band. */
-#define OD_MASKING_DISABLED 0
-#define OD_MASKING_ENABLED 1
-
-const unsigned char OD_LUMA_QM_Q4[2][OD_QM_SIZE] = {
-/* Flat quantization for PSNR. The DC component isn't 16 because the DC
-   magnitude compensation is done here for inter (Haar DC doesn't need it).
-   Masking disabled: */
- {
-  16, 16,
-  16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16, 16, 16
- },
-/* The non-flat AC coefficients compensate for the non-linear scaling caused
-   by activity masking. The values are currently hand-tuned so that the rate
-   of each band remains roughly constant when enabling activity masking
-   on intra.
-   Masking enabled: */
- {
-  16, 16,
-  16, 18, 28, 32,
-  16, 14, 20, 20, 28, 32,
-  16, 11, 14, 14, 17, 17, 22, 28
- }
-};
-
-const unsigned char OD_CHROMA_QM_Q4[2][OD_QM_SIZE] = {
-/* Chroma quantization is different because of the reduced lapping.
-   FIXME: Use the same matrix as luma for 4:4:4.
-   Masking disabled: */
- {
-  16, 16,
-  16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16, 16, 16
- },
-/* The AC part is flat for chroma because it has no activity masking.
-   Masking enabled: */
- {
-  16, 16,
-  16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16,
-  16, 16, 16, 16, 16, 16, 16, 16
- }
-};
-
-/* No interpolation, always use od_flat_qm_q4, but use a different scale for
-   each plane.
-   FIXME: Add interpolation and properly tune chroma. */
-const od_qm_entry OD_DEFAULT_QMS[2][2][OD_NPLANES_MAX] = {
-  /* Masking disabled */
-  { { { 4, 256, OD_LUMA_QM_Q4[OD_MASKING_DISABLED] },
-      { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_DISABLED] },
-      { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_DISABLED] } },
-    { { 0, 0, NULL},
-      { 0, 0, NULL},
-      { 0, 0, NULL} } },
-  /* Masking enabled */
-  { { { 4, 256, OD_LUMA_QM_Q4[OD_MASKING_ENABLED] },
-      { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_ENABLED] },
-      { 4, 256, OD_CHROMA_QM_Q4[OD_MASKING_ENABLED] } },
-    { { 0, 0, NULL},
-      { 0, 0, NULL},
-      { 0, 0, NULL} } }
-};
-
-/* Constants for the beta parameter, which controls how activity masking is
-   used.
-   beta = 1 / (1 - alpha), so when beta is 1, alpha is 0 and activity
-   masking is disabled. When beta is 1.5, activity masking is used. Note that
-   activity masking is neither used for 4x4 blocks nor for chroma. */
-#define OD_BETA(b) OD_QCONST32(b, OD_BETA_SHIFT)
-static const od_val16 OD_PVQ_BETA4_LUMA[1] = {OD_BETA(1.)};
-static const od_val16 OD_PVQ_BETA8_LUMA[4] = {OD_BETA(1.), OD_BETA(1.),
- OD_BETA(1.), OD_BETA(1.)};
-static const od_val16 OD_PVQ_BETA16_LUMA[7] = {OD_BETA(1.), OD_BETA(1.),
- OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.)};
-static const od_val16 OD_PVQ_BETA32_LUMA[10] = {OD_BETA(1.), OD_BETA(1.),
- OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.),
- OD_BETA(1.), OD_BETA(1.)};
-
-static const od_val16 OD_PVQ_BETA4_LUMA_MASKING[1] = {OD_BETA(1.)};
-static const od_val16 OD_PVQ_BETA8_LUMA_MASKING[4] = {OD_BETA(1.5),
- OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5)};
-static const od_val16 OD_PVQ_BETA16_LUMA_MASKING[7] = {OD_BETA(1.5),
- OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5),
- OD_BETA(1.5)};
-static const od_val16 OD_PVQ_BETA32_LUMA_MASKING[10] = {OD_BETA(1.5),
- OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5),
- OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5)};
-
-static const od_val16 OD_PVQ_BETA4_CHROMA[1] = {OD_BETA(1.)};
-static const od_val16 OD_PVQ_BETA8_CHROMA[4] = {OD_BETA(1.), OD_BETA(1.),
- OD_BETA(1.), OD_BETA(1.)};
-static const od_val16 OD_PVQ_BETA16_CHROMA[7] = {OD_BETA(1.), OD_BETA(1.),
- OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.)};
-static const od_val16 OD_PVQ_BETA32_CHROMA[10] = {OD_BETA(1.), OD_BETA(1.),
- OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.),
- OD_BETA(1.), OD_BETA(1.)};
-
-const od_val16 *const OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_TXSIZES + 1] = {
- {{OD_PVQ_BETA4_LUMA, OD_PVQ_BETA8_LUMA,
-   OD_PVQ_BETA16_LUMA, OD_PVQ_BETA32_LUMA},
-  {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA,
-   OD_PVQ_BETA16_CHROMA, OD_PVQ_BETA32_CHROMA},
-  {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA,
-   OD_PVQ_BETA16_CHROMA, OD_PVQ_BETA32_CHROMA}},
- {{OD_PVQ_BETA4_LUMA_MASKING, OD_PVQ_BETA8_LUMA_MASKING,
-   OD_PVQ_BETA16_LUMA_MASKING, OD_PVQ_BETA32_LUMA_MASKING},
-  {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA,
-   OD_PVQ_BETA16_CHROMA, OD_PVQ_BETA32_CHROMA},
-  {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA,
-   OD_PVQ_BETA16_CHROMA, OD_PVQ_BETA32_CHROMA}}
-};
-
-
-void od_interp_qm(unsigned char *out, int q, const od_qm_entry *entry1,
-  const od_qm_entry *entry2) {
-  int i;
-  if (entry2 == NULL || entry2->qm_q4 == NULL
-   || q < entry1->interp_q << OD_COEFF_SHIFT) {
-    /* Use entry1. */
-    for (i = 0; i < OD_QM_SIZE; i++) {
-      out[i] = OD_MINI(255, entry1->qm_q4[i]*entry1->scale_q8 >> 8);
-    }
-  }
-  else if (entry1 == NULL || entry1->qm_q4 == NULL
-   || q > entry2->interp_q << OD_COEFF_SHIFT) {
-    /* Use entry2. */
-    for (i = 0; i < OD_QM_SIZE; i++) {
-      out[i] = OD_MINI(255, entry2->qm_q4[i]*entry2->scale_q8 >> 8);
-    }
-  }
-  else {
-    /* Interpolate between entry1 and entry2. The interpolation is linear
-       in terms of log(q) vs log(m*scale). Considering that we're ultimately
-       multiplying the result it makes sense, but we haven't tried other
-       interpolation methods. */
-    double x;
-    const unsigned char *m1;
-    const unsigned char *m2;
-    int q1;
-    int q2;
-    m1 = entry1->qm_q4;
-    m2 = entry2->qm_q4;
-    q1 = entry1->interp_q << OD_COEFF_SHIFT;
-    q2 = entry2->interp_q << OD_COEFF_SHIFT;
-    x = (log(q)-log(q1))/(log(q2)-log(q1));
-    for (i = 0; i < OD_QM_SIZE; i++) {
-      out[i] = OD_MINI(255, (int)floor(.5 + (1./256)*exp(
-       x*log(m2[i]*entry2->scale_q8) + (1 - x)*log(m1[i]*entry1->scale_q8))));
-    }
-  }
-}
-
-void od_adapt_pvq_ctx_reset(od_pvq_adapt_ctx *state, int is_keyframe) {
-  od_pvq_codeword_ctx *ctx;
-  int i;
-  int pli;
-  int bs;
-  ctx = &state->pvq_codeword_ctx;
-  OD_CDFS_INIT_DYNAMIC(state->pvq_param_model[0].cdf);
-  OD_CDFS_INIT_DYNAMIC(state->pvq_param_model[1].cdf);
-  OD_CDFS_INIT_DYNAMIC(state->pvq_param_model[2].cdf);
-  for (i = 0; i < 2*OD_TXSIZES; i++) {
-    ctx->pvq_adapt[4*i + OD_ADAPT_K_Q8] = 384;
-    ctx->pvq_adapt[4*i + OD_ADAPT_SUM_EX_Q8] = 256;
-    ctx->pvq_adapt[4*i + OD_ADAPT_COUNT_Q8] = 104;
-    ctx->pvq_adapt[4*i + OD_ADAPT_COUNT_EX_Q8] = 128;
-  }
-  OD_CDFS_INIT_DYNAMIC(ctx->pvq_k1_cdf);
-  for (pli = 0; pli < OD_NPLANES_MAX; pli++) {
-    for (bs = 0; bs < OD_TXSIZES; bs++)
-    for (i = 0; i < PVQ_MAX_PARTITIONS; i++) {
-      state->pvq_exg[pli][bs][i] = 2 << 16;
-    }
-  }
-  for (i = 0; i < OD_TXSIZES*PVQ_MAX_PARTITIONS; i++) {
-    state->pvq_ext[i] = is_keyframe ? 24576 : 2 << 16;
-  }
-  OD_CDFS_INIT_DYNAMIC(state->pvq_gaintheta_cdf);
-  OD_CDFS_INIT_Q15(state->pvq_skip_dir_cdf);
-  OD_CDFS_INIT_DYNAMIC(ctx->pvq_split_cdf);
-}
-
-/* QMs are arranged from smallest to largest blocksizes, first for
-   blocks with decimation=0, followed by blocks with decimation=1.*/
-int od_qm_offset(int bs, int xydec)
-{
-    return xydec*OD_QM_STRIDE + OD_QM_OFFSET(bs);
-}
-
-#if defined(OD_FLOAT_PVQ)
-#define OD_DEFAULT_MAG 1.0
-#else
-#define OD_DEFAULT_MAG OD_QM_SCALE
-#endif
-
-/* Initialize the quantization matrix. */
-// Note: When hybrid transform and corresponding scan order is used by PVQ,
-// we don't need seperate qm and qm_inv for each transform type,
-// because AOM does not do magnitude compensation (i.e. simplay x16 for all coeffs).
-void od_init_qm(int16_t *x, int16_t *x_inv, const int *qm) {
-  int i;
-  int j;
-  int16_t y[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
-  int16_t y_inv[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
-  int16_t *x1;
-  int16_t *x1_inv;
-  int off;
-  int bs;
-  int xydec;
-  for (bs = 0; bs < OD_TXSIZES; bs++) {
-    for (xydec = 0; xydec < 2; xydec++) {
-      off = od_qm_offset(bs, xydec);
-      x1 = x + off;
-      x1_inv = x_inv + off;
-      for (i = 0; i < 4 << bs; i++) {
-        for (j = 0; j < 4 << bs; j++) {
-          /*This will ultimately be clamped to fit in 16 bits.*/
-          od_val32 mag;
-          int16_t ytmp;
-          mag = OD_DEFAULT_MAG;
-          if (i != 0 || j != 0) {
-#if defined(OD_FLOAT_PVQ)
-            mag /= 0.0625*qm[(i << 1 >> bs)*8 + (j << 1 >> bs)];
-#else
-            int qmv;
-            qmv = qm[(i << 1 >> bs)*8 + (j << 1 >> bs)];
-            mag *= 16;
-            mag = (mag + (qmv >> 1))/qmv;
-#endif
-            OD_ASSERT(mag > 0.0);
-          }
-          /*Convert to fit in 16 bits.*/
-#if defined(OD_FLOAT_PVQ)
-          y[i*(4 << bs) + j] = (int16_t)OD_MINI(OD_QM_SCALE_MAX,
-           (int32_t)floor(.5 + mag*OD_QM_SCALE));
-          y_inv[i*(4 << bs) + j] = (int16_t)floor(.5
-           + OD_QM_SCALE*OD_QM_INV_SCALE/(double)y[i*(4 << bs) + j]);
-#else
-          y[i*(4 << bs) + j] = (int16_t)OD_MINI(OD_QM_SCALE_MAX, mag);
-          ytmp = y[i*(4 << bs) + j];
-          y_inv[i*(4 << bs) + j] = (int16_t)((OD_QM_SCALE*OD_QM_INV_SCALE
-           + (ytmp >> 1))/ytmp);
-#endif
-        }
-      }
-      od_raster_to_coding_order_16(x1, 4 << bs, y, 4 << bs);
-      od_raster_to_coding_order_16(x1_inv, 4 << bs, y_inv, 4 << bs);
-    }
-  }
-}
-
-/* Maps each possible size (n) in the split k-tokenizer to a different value.
-   Possible values of n are:
-   2, 3, 4, 7, 8, 14, 15, 16, 31, 32, 63, 64, 127, 128
-   Since we don't care about the order (even in the bit-stream) the simplest
-   ordering (implemented here) is:
-   14, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 */
-int od_pvq_size_ctx(int n) {
-  int logn;
-  int odd;
-  logn = OD_ILOG(n - 1);
-  odd = n & 1;
-  return 2*logn - 1 - odd - 7*(n == 14);
-}
-
-/* Maps a length n to a context for the (k=1, n<=16) coder, with a special
-   case when n is the original length (orig_length=1) of the vector (i.e. we
-   haven't split it yet). For orig_length=0, we use the same mapping as
-   od_pvq_size_ctx() up to n=16. When orig_length=1, we map lengths
-   7, 8, 14, 15 to contexts 8 to 11. */
-int od_pvq_k1_ctx(int n, int orig_length) {
-  if (orig_length) return 8 + 2*(n > 8) + (n & 1);
-  else return od_pvq_size_ctx(n);
-}
-
-/* Indexing for the packed quantization matrices. */
-int od_qm_get_index(int bs, int band) {
-  /* The -band/3 term is due to the fact that we force corresponding horizontal
-     and vertical bands to have the same quantization. */
-  OD_ASSERT(bs >= 0 && bs < OD_TXSIZES);
-  return bs*(bs + 1) + band - band/3;
-}
-
-#if !defined(OD_FLOAT_PVQ)
-/*See celt/mathops.c in Opus and tools/cos_search.c.*/
-static int16_t od_pvq_cos_pi_2(int16_t x)
-{
-  int16_t x2;
-  x2 = OD_MULT16_16_Q15(x, x);
-  return OD_MINI(32767, (1073758164 - x*x + x2*(-7654 + OD_MULT16_16_Q16(x2,
-   16573 + OD_MULT16_16_Q16(-2529, x2)))) >> 15);
-}
-#endif
-
-/*Approximates cos(x) for -pi < x < pi.
-  Input is in OD_THETA_SCALE.*/
-od_val16 od_pvq_cos(od_val32 x) {
-#if defined(OD_FLOAT_PVQ)
-  return cos(x);
-#else
-  /*Wrap x around by masking, since cos is periodic.*/
-  x = x & 0x0001ffff;
-  if (x > (1 << 16)) {
-    x = (1 << 17) - x;
-  }
-  if (x & 0x00007fff) {
-    if (x < (1 << 15)) {
-       return od_pvq_cos_pi_2((int16_t)x);
-    }
-    else {
-      return -od_pvq_cos_pi_2((int16_t)(65536 - x));
-    }
-  }
-  else {
-    if (x & 0x0000ffff) {
-      return 0;
-    }
-    else if (x & 0x0001ffff) {
-      return -32767;
-    }
-    else {
-      return 32767;
-    }
-  }
-#endif
-}
-
-/*Approximates sin(x) for 0 <= x < pi.
-  Input is in OD_THETA_SCALE.*/
-od_val16 od_pvq_sin(od_val32 x) {
-#if defined(OD_FLOAT_PVQ)
-  return sin(x);
-#else
-  return od_pvq_cos(32768 - x);
-#endif
-}
-
-#if !defined(OD_FLOAT_PVQ)
-/* Computes an upper-bound on the number of bits required to store the L2 norm
-   of a vector (excluding sign). */
-int od_vector_log_mag(const od_coeff *x, int n) {
-  int i;
-  int32_t sum;
-  sum = 0;
-  for (i = 0; i < n; i++) {
-    int16_t tmp;
-    tmp = x[i] >> 8;
-    sum += tmp*(int32_t)tmp;
-  }
-  /* We add one full bit (instead of rounding OD_ILOG() up) for safety because
-     the >> 8 above causes the sum to be slightly underestimated. */
-  return 8 + 1 + OD_ILOG(n + sum)/2;
-}
-#endif
-
-/** Computes Householder reflection that aligns the reference r to the
- *  dimension in r with the greatest absolute value. The reflection
- *  vector is returned in r.
- *
- * @param [in,out]  r      reference vector to be reflected, reflection
- *                         also returned in r
- * @param [in]      n      number of dimensions in r
- * @param [in]      gr     gain of reference vector
- * @param [out]     sign   sign of reflection
- * @return                 dimension number to which reflection aligns
- **/
-int od_compute_householder(od_val16 *r, int n, od_val32 gr, int *sign,
- int shift) {
-  int m;
-  int i;
-  int s;
-  od_val16 maxr;
-  OD_UNUSED(shift);
-  /* Pick component with largest magnitude. Not strictly
-   * necessary, but it helps numerical stability */
-  m = 0;
-  maxr = 0;
-  for (i = 0; i < n; i++) {
-    if (OD_ABS(r[i]) > maxr) {
-      maxr = OD_ABS(r[i]);
-      m = i;
-    }
-  }
-  s = r[m] > 0 ? 1 : -1;
-  /* This turns r into a Householder reflection vector that would reflect
-   * the original r[] to e_m */
-  r[m] += OD_SHR_ROUND(gr*s, shift);
-  *sign = s;
-  return m;
-}
-
-#if !defined(OD_FLOAT_PVQ)
-#define OD_RCP_INSHIFT 15
-#define OD_RCP_OUTSHIFT 14
-static od_val16 od_rcp(od_val16 x)
-{
-  int i;
-  od_val16 n;
-  od_val16 r;
-  i = OD_ILOG(x) - 1;
-  /*n is Q15 with range [0,1).*/
-  n = OD_VSHR_ROUND(x, i - OD_RCP_INSHIFT) - (1 << OD_RCP_INSHIFT);
-  /*Start with a linear approximation:
-    r = 1.8823529411764706-0.9411764705882353*n.
-    The coefficients and the result are Q14 in the range [15420,30840].*/
-  r = 30840 + OD_MULT16_16_Q15(-15420, n);
-  /*Perform two Newton iterations:
-    r -= r*((r*n)-1.Q15)
-       = r*((r*n)+(r-1.Q15)).*/
-  r = r - OD_MULT16_16_Q15(r, (OD_MULT16_16_Q15(r, n) + r - 32768));
-  /*We subtract an extra 1 in the second iteration to avoid overflow; it also
-     neatly compensates for truncation error in the rest of the process.*/
-  r = r - (1 + OD_MULT16_16_Q15(r, OD_MULT16_16_Q15(r, n) + r - 32768));
-  /*r is now the Q15 solution to 2/(n+1), with a maximum relative error
-     of 7.05346E-5, a (relative) RMSE of 2.14418E-5, and a peak absolute
-     error of 1.24665/32768.*/
-  return OD_VSHR_ROUND(r, i - OD_RCP_OUTSHIFT);
-}
-#endif
-
-/** Applies Householder reflection from compute_householder(). The
- * reflection is its own inverse.
- *
- * @param [out]     out    reflected vector
- * @param [in]      x      vector to be reflected
- * @param [in]      r      reflection
- * @param [in]      n      number of dimensions in x,r
- */
-void od_apply_householder(od_val16 *out, const od_val16 *x, const od_val16 *r,
- int n) {
-  int i;
-  od_val32 proj;
-  od_val16 proj_1;
-  od_val32 l2r;
-#if !defined(OD_FLOAT_PVQ)
-  od_val16 proj_norm;
-  od_val16 l2r_norm;
-  od_val16 rcp;
-  int proj_shift;
-  int l2r_shift;
-  int outshift;
-#endif
-  /*FIXME: Can we get l2r and/or l2r_shift from an earlier computation?*/
-  l2r = 0;
-  for (i = 0; i < n; i++) {
-    l2r += OD_MULT16_16(r[i], r[i]);
-  }
-  /* Apply Householder reflection */
-  proj = 0;
-  for (i = 0; i < n; i++) {
-    proj += OD_MULT16_16(r[i], x[i]);
-  }
-#if defined(OD_FLOAT_PVQ)
-  proj_1 = proj*2./(1e-100 + l2r);
-  for (i = 0; i < n; i++) {
-    out[i] = x[i] - r[i]*proj_1;
-  }
-#else
-  /*l2r_norm is [0.5, 1.0[ in Q15.*/
-  l2r_shift = (OD_ILOG(l2r) - 1) - 14;
-  l2r_norm = OD_VSHR_ROUND(l2r, l2r_shift);
-  rcp = od_rcp(l2r_norm);
-  proj_shift = (OD_ILOG(abs(proj)) - 1) - 14;
-  /*proj_norm is [0.5, 1.0[ in Q15.*/
-  proj_norm = OD_VSHR_ROUND(proj, proj_shift);
-  proj_1 = OD_MULT16_16_Q15(proj_norm, rcp);
-  /*The proj*2. in the float code becomes -1 in the final outshift.
-    The sign of l2r_shift is positive since we're taking the reciprocal of
-     l2r_norm and this is a right shift.*/
-  outshift = OD_MINI(30, OD_RCP_OUTSHIFT - proj_shift - 1 + l2r_shift);
-  if (outshift >= 0) {
-    for (i = 0; i < n; i++) {
-      int32_t tmp;
-      tmp = OD_MULT16_16(r[i], proj_1);
-      tmp = OD_SHR_ROUND(tmp, outshift);
-      out[i] = x[i] - tmp;
-    }
-  }
-  else {
-    /*FIXME: Can we make this case impossible?
-      Right now, if r[] is all zeros except for 1, 2, or 3 ones, and
-       if x[] is all zeros except for large values at the same position as the
-       ones in r[], then we can end up with a shift of -1.*/
-    for (i = 0; i < n; i++) {
-      int32_t tmp;
-      tmp = OD_MULT16_16(r[i], proj_1);
-      tmp = OD_SHL(tmp, -outshift);
-      out[i] = x[i] - tmp;
-    }
-  }
-#endif
-}
-
-#if !defined(OD_FLOAT_PVQ)
-static od_val16 od_beta_rcp(od_val16 beta){
-  if (beta == OD_BETA(1.))
-    return OD_BETA(1.);
-  else if (beta == OD_BETA(1.5))
-    return OD_BETA(1./1.5);
-  else {
-    od_val16 rcp_beta;
-    /*Shift by 1 less, transposing beta to range [.5, .75] and thus < 32768.*/
-    rcp_beta = od_rcp(beta << (OD_RCP_INSHIFT - 1 - OD_BETA_SHIFT));
-    return OD_SHR_ROUND(rcp_beta, OD_RCP_OUTSHIFT + 1 - OD_BETA_SHIFT);
-  }
-}
-
-#define OD_EXP2_INSHIFT 15
-#define OD_EXP2_FRACSHIFT 15
-#define OD_EXP2_OUTSHIFT 15
-static const int32_t OD_EXP2_C[5] = {32768, 22709, 7913, 1704, 443};
-/*Output is [1.0, 2.0) in Q(OD_EXP2_FRACSHIFT).
-  It does not include the integer offset, which is added in od_exp2 after the
-   final shift).*/
-static int32_t od_exp2_frac(int32_t x)
-{
-  return OD_MULT16_16_Q15(x, (OD_EXP2_C[1] + OD_MULT16_16_Q15(x,
-   (OD_EXP2_C[2] + OD_MULT16_16_Q15(x, (OD_EXP2_C[3]
-   + OD_MULT16_16_Q15(x, OD_EXP2_C[4])))))));
-}
-
-/** Base-2 exponential approximation (2^x) with Q15 input and output.*/
-static int32_t od_exp2(int32_t x)
-{
-  int integer;
-  int32_t frac;
-  integer = x >> OD_EXP2_INSHIFT;
-  if (integer > 14)
-    return 0x7f000000;
-  else if (integer < -15)
-    return 0;
-  frac = od_exp2_frac(x - OD_SHL(integer, OD_EXP2_INSHIFT));
-  return OD_VSHR_ROUND(OD_EXP2_C[0] + frac, -integer) + 1;
-}
-
-#define OD_LOG2_INSHIFT 15
-#define OD_LOG2_OUTSHIFT 15
-#define OD_LOG2_INSCALE_1 (1./(1 << OD_LOG2_INSHIFT))
-#define OD_LOG2_OUTSCALE (1 << OD_LOG2_OUTSHIFT)
-static int16_t od_log2(int16_t x)
-{
-  return x + OD_MULT16_16_Q15(x, (14482 + OD_MULT16_16_Q15(x, (-23234
-   + OD_MULT16_16_Q15(x, (13643 + OD_MULT16_16_Q15(x, (-6403
-   + OD_MULT16_16_Q15(x, 1515)))))))));
-}
-
-static int32_t od_pow(int32_t x, od_val16 beta)
-{
-  int16_t t;
-  int xshift;
-  int log2_x;
-  od_val32 logr;
-  /*FIXME: this conditional is to avoid doing log2(0).*/
-  if (x == 0)
-    return 0;
-  log2_x = (OD_ILOG(x) - 1);
-  xshift = log2_x - OD_LOG2_INSHIFT;
-  /*t should be in range [0.0, 1.0[ in Q(OD_LOG2_INSHIFT).*/
-  t = OD_VSHR(x, xshift) - (1 << OD_LOG2_INSHIFT);
-  /*log2(g/OD_COMPAND_SCALE) = log2(x) - OD_COMPAND_SHIFT in
-     Q(OD_LOG2_OUTSHIFT).*/
-  logr = od_log2(t) + (log2_x - OD_COMPAND_SHIFT)*OD_LOG2_OUTSCALE;
-  logr = (od_val32)OD_MULT16_32_QBETA(beta, logr);
-  return od_exp2(logr);
-}
-#endif
-
-/** Gain companding: raises gain to the power 1/beta for activity masking.
- *
- * @param [in]  g     real (uncompanded) gain
- * @param [in]  q0    uncompanded quality parameter
- * @param [in]  beta  activity masking beta param (exponent)
- * @return            g^(1/beta)
- */
-static od_val32 od_gain_compand(od_val32 g, int q0, od_val16 beta) {
-#if defined(OD_FLOAT_PVQ)
-  if (beta == 1) return OD_CGAIN_SCALE*g/(double)q0;
-  else {
-    return OD_CGAIN_SCALE*OD_COMPAND_SCALE*pow(g*OD_COMPAND_SCALE_1,
-     1./beta)/(double)q0;
-  }
-#else
-  if (beta == OD_BETA(1)) return (OD_CGAIN_SCALE*g + (q0 >> 1))/q0;
-  else {
-    int32_t expr;
-    expr = od_pow(g, od_beta_rcp(beta));
-    expr <<= OD_CGAIN_SHIFT + OD_COMPAND_SHIFT - OD_EXP2_OUTSHIFT;
-    return (expr + (q0 >> 1))/q0;
-  }
-#endif
-}
-
-#if !defined(OD_FLOAT_PVQ)
-#define OD_SQRT_INSHIFT 16
-#define OD_SQRT_OUTSHIFT 15
-static int16_t od_rsqrt_norm(int16_t x);
-
-static int16_t od_sqrt_norm(int32_t x)
-{
-  OD_ASSERT(x < 65536);
-  return OD_MINI(OD_SHR_ROUND(x*od_rsqrt_norm(x), OD_SQRT_OUTSHIFT), 32767);
-}
-
-static int16_t od_sqrt(int32_t x, int *sqrt_shift)
-{
-  int k;
-  int s;
-  int32_t t;
-  if (x == 0) {
-    *sqrt_shift = 0;
-     return 0;
-  }
-  OD_ASSERT(x < (1 << 30));
-  k = ((OD_ILOG(x) - 1) >> 1);
-  /*t is x in the range [0.25, 1) in QINSHIFT, or x*2^(-s).
-    Shift by log2(x) - log2(0.25*(1 << INSHIFT)) to ensure 0.25 lower bound.*/
-  s = 2*k - (OD_SQRT_INSHIFT - 2);
-  t = OD_VSHR(x, s);
-  /*We want to express od_sqrt() in terms of od_sqrt_norm(), which is
-     defined as (2^OUTSHIFT)*sqrt(t*(2^-INSHIFT)) with t=x*(2^-s).
-    This simplifies to 2^(OUTSHIFT-(INSHIFT/2)-(s/2))*sqrt(x), so the caller
-     needs to shift right by OUTSHIFT - INSHIFT/2 - s/2.*/
-  *sqrt_shift = OD_SQRT_OUTSHIFT - ((s + OD_SQRT_INSHIFT) >> 1);
-  return od_sqrt_norm(t);
-}
-#endif
-
-/** Gain expanding: raises gain to the power beta for activity masking.
- *
- * @param [in]  cg    companded gain
- * @param [in]  q0    uncompanded quality parameter
- * @param [in]  beta  activity masking beta param (exponent)
- * @return            g^beta
- */
-od_val32 od_gain_expand(od_val32 cg0, int q0, od_val16 beta) {
-  if (beta == OD_BETA(1)) {
-    /*The multiply fits into 28 bits because the expanded gain has a range from
-       0 to 2^20.*/
-    return OD_SHR_ROUND(cg0*q0, OD_CGAIN_SHIFT);
-  }
-  else if (beta == OD_BETA(1.5)) {
-#if defined(OD_FLOAT_PVQ)
-    double cg;
-    cg = cg0*OD_CGAIN_SCALE_1;
-    cg *= q0*OD_COMPAND_SCALE_1;
-    return OD_COMPAND_SCALE*cg*sqrt(cg);
-#else
-    int32_t irt;
-    int64_t tmp;
-    int sqrt_inshift;
-    int sqrt_outshift;
-    /*cg0 is in Q(OD_CGAIN_SHIFT) and we need to divide it by
-       2^OD_COMPAND_SHIFT.*/
-    irt = od_sqrt(cg0*q0, &sqrt_outshift);
-    sqrt_inshift = (OD_CGAIN_SHIFT + OD_COMPAND_SHIFT) >> 1;
-    /*tmp is in Q(OD_CGAIN_SHIFT + OD_COMPAND_SHIFT).*/
-    tmp = cg0*q0*(int64_t)irt;
-    /*Expanded gain must be in Q(OD_COMPAND_SHIFT), thus OD_COMPAND_SHIFT is
-       not included here.*/
-    return OD_MAXI(1,
-        OD_VSHR_ROUND(tmp, OD_CGAIN_SHIFT + sqrt_outshift + sqrt_inshift));
-#endif
-  }
-  else {
-#if defined(OD_FLOAT_PVQ)
-    /*Expanded gain must be in Q(OD_COMPAND_SHIFT), hence the multiply by
-       OD_COMPAND_SCALE.*/
-    double cg;
-    cg = cg0*OD_CGAIN_SCALE_1;
-    return OD_COMPAND_SCALE*pow(cg*q0*OD_COMPAND_SCALE_1, beta);
-#else
-    int32_t expr;
-    int32_t cg;
-    cg = OD_SHR_ROUND(cg0*q0, OD_CGAIN_SHIFT);
-    expr = od_pow(cg, beta);
-    /*Expanded gain must be in Q(OD_COMPAND_SHIFT), hence the subtraction by
-       OD_COMPAND_SHIFT.*/
-    return OD_MAXI(1, OD_SHR_ROUND(expr, OD_EXP2_OUTSHIFT - OD_COMPAND_SHIFT));
-#endif
-  }
-}
-
-/** Computes the raw and quantized/companded gain of a given input
- * vector
- *
- * @param [in]      x      vector of input data
- * @param [in]      n      number of elements in vector x
- * @param [in]      q0     quantizer
- * @param [out]     g      raw gain
- * @param [in]      beta   activity masking beta param
- * @param [in]      bshift shift to be applied to raw gain
- * @return                 quantized/companded gain
- */
-od_val32 od_pvq_compute_gain(const od_val16 *x, int n, int q0, od_val32 *g,
- od_val16 beta, int bshift) {
-  int i;
-  od_val32 acc;
-#if !defined(OD_FLOAT_PVQ)
-  od_val32 irt;
-  int sqrt_shift;
-#else
-  OD_UNUSED(bshift);
-#endif
-  acc = 0;
-  for (i = 0; i < n; i++) {
-    acc += x[i]*(od_val32)x[i];
-  }
-#if defined(OD_FLOAT_PVQ)
-  *g = sqrt(acc);
-#else
-  irt = od_sqrt(acc, &sqrt_shift);
-  *g = OD_VSHR_ROUND(irt, sqrt_shift - bshift);
-#endif
-  /* Normalize gain by quantization step size and apply companding
-     (if ACTIVITY != 1). */
-  return od_gain_compand(*g, q0, beta);
-}
-
-/** Compute theta quantization range from quantized/companded gain
- *
- * @param [in]      qcg    quantized companded gain value
- * @param [in]      beta   activity masking beta param
- * @return                 max theta value
- */
-int od_pvq_compute_max_theta(od_val32 qcg, od_val16 beta){
-  /* Set angular resolution (in ra) to match the encoded gain */
-#if defined(OD_FLOAT_PVQ)
-  int ts = (int)floor(.5 + qcg*OD_CGAIN_SCALE_1*M_PI/(2*beta));
-#else
-  int ts = OD_SHR_ROUND(qcg*OD_MULT16_16_QBETA(OD_QCONST32(M_PI/2,
-   OD_CGAIN_SHIFT), od_beta_rcp(beta)), OD_CGAIN_SHIFT*2);
-#endif
-  /* Special case for low gains -- will need to be tuned anyway */
-  if (qcg < OD_QCONST32(1.4, OD_CGAIN_SHIFT)) ts = 1;
-  return ts;
-}
-
-/** Decode quantized theta value from coded value
- *
- * @param [in]      t          quantized companded gain value
- * @param [in]      max_theta  maximum theta value
- * @return                     decoded theta value
- */
-od_val32 od_pvq_compute_theta(int t, int max_theta) {
-  if (max_theta != 0) {
-#if defined(OD_FLOAT_PVQ)
-    return OD_MINI(t, max_theta - 1)*.5*M_PI/max_theta;
-#else
-    return (OD_MAX_THETA_SCALE*OD_MINI(t, max_theta - 1)
-     + (max_theta >> 1))/max_theta;
-#endif
-  }
-  else return 0;
-}
-
-#define OD_SQRT_TBL_SHIFT (10)
-
-#define OD_ITHETA_SHIFT 15
-/** Compute the number of pulses used for PVQ encoding a vector from
- * available metrics (encode and decode side)
- *
- * @param [in]      qcg        quantized companded gain value
- * @param [in]      itheta     quantized PVQ error angle theta
- * @param [in]      noref      indicates present or lack of reference
- *                             (prediction)
- * @param [in]      n          number of elements to be coded
- * @param [in]      beta       activity masking beta param
- * @return                     number of pulses to use for coding
- */
-int od_pvq_compute_k(od_val32 qcg, int itheta, int noref, int n,
-    od_val16 beta) {
-#if !defined(OD_FLOAT_PVQ)
-  /*Lookup table for sqrt(n+3/2) and sqrt(n+2/2) in Q10.
-    Real max values are 32792 and 32784, but clamped to stay within 16 bits.
-    Update with tools/gen_sqrt_tbl if needed.*/
-  static const od_val16 od_sqrt_table[2][13] = {
-   {0, 0, 0, 0, 2290, 2985, 4222, 0, 8256, 0, 16416, 0, 32767},
-   {0, 0, 0, 0, 2401, 3072, 4284, 0, 8287, 0, 16432, 0, 32767}};
-#endif
-  if (noref) {
-    if (qcg == 0) return 0;
-    if (n == 15 && qcg == OD_CGAIN_SCALE && beta > OD_BETA(1.25)) {
-      return 1;
-    }
-    else {
-#if defined(OD_FLOAT_PVQ)
-      return OD_MAXI(1, (int)floor(.5 + (qcg*OD_CGAIN_SCALE_1 - .2)*
-       sqrt((n + 3)/2)/beta));
-#else
-      od_val16 rt;
-      OD_ASSERT(OD_ILOG(n + 1) < 13);
-      rt = od_sqrt_table[1][OD_ILOG(n + 1)];
-      /*FIXME: get rid of 64-bit mul.*/
-      return OD_MAXI(1, OD_SHR_ROUND((int64_t)((qcg
-       - (int64_t)OD_QCONST32(.2, OD_CGAIN_SHIFT))*
-       OD_MULT16_16_QBETA(od_beta_rcp(beta), rt)), OD_CGAIN_SHIFT
-       + OD_SQRT_TBL_SHIFT));
-#endif
-    }
-  }
-  else {
-    if (itheta == 0) return 0;
-    /* Sets K according to gain and theta, based on the high-rate
-       PVQ distortion curves (see PVQ document). Low-rate will have to be
-       perceptually tuned anyway. We subtract 0.2 from the radius as an
-       approximation for the fact that the coefficients aren't identically
-       distributed within a band so at low gain the number of dimensions that
-       are likely to have a pulse is less than n. */
-#if defined(OD_FLOAT_PVQ)
-    return OD_MAXI(1, (int)floor(.5 + (itheta - .2)*sqrt((n + 2)/2)));
-#else
-    od_val16 rt;
-    OD_ASSERT(OD_ILOG(n + 1) < 13);
-    rt = od_sqrt_table[0][OD_ILOG(n + 1)];
-    /*FIXME: get rid of 64-bit mul.*/
-    return OD_MAXI(1, OD_VSHR_ROUND(((OD_SHL(itheta, OD_ITHETA_SHIFT)
-     - OD_QCONST32(.2, OD_ITHETA_SHIFT)))*(int64_t)rt,
-     OD_SQRT_TBL_SHIFT + OD_ITHETA_SHIFT));
-#endif
-  }
-}
-
-#if !defined(OD_FLOAT_PVQ)
-#define OD_RSQRT_INSHIFT 16
-#define OD_RSQRT_OUTSHIFT 14
-/** Reciprocal sqrt approximation where the input is in the range [0.25,1) in
-     Q16 and the output is in the range (1.0, 2.0] in Q14).
-    Error is always within +/1 of round(1/sqrt(t))*/
-static int16_t od_rsqrt_norm(int16_t t)
-{
-  int16_t n;
-  int32_t r;
-  int32_t r2;
-  int32_t ry;
-  int32_t y;
-  int32_t ret;
-  /* Range of n is [-16384,32767] ([-0.5,1) in Q15).*/
-  n = t - 32768;
-  OD_ASSERT(n >= -16384);
-  /*Get a rough initial guess for the root.
-    The optimal minimax quadratic approximation (using relative error) is
-     r = 1.437799046117536+n*(-0.823394375837328+n*0.4096419668459485).
-    Coefficients here, and the final result r, are Q14.*/
-  r = (23565 + OD_MULT16_16_Q15(n, (-13481 + OD_MULT16_16_Q15(n, 6711))));
-  /*We want y = t*r*r-1 in Q15, but t is 32-bit Q16 and r is Q14.
-    We can compute the result from n and r using Q15 multiplies with some
-     adjustment, carefully done to avoid overflow.*/
-  r2 = r*r;
-  y = (((r2 >> 15)*n + r2) >> 12) - 131077;
-  ry = r*y;
-  /*Apply a 2nd-order Householder iteration: r += r*y*(y*0.375-0.5).
-    This yields the Q14 reciprocal square root of the Q16 t, with a maximum
-     relative error of 1.04956E-4, a (relative) RMSE of 2.80979E-5, and a peak
-     absolute error of 2.26591/16384.*/
-  ret = r + ((((ry >> 16)*(3*y) >> 3) - ry) >> 18);
-  OD_ASSERT(ret >= 16384 && ret < 32768);
-  return (int16_t)ret;
-}
-
-static int16_t od_rsqrt(int32_t x, int *rsqrt_shift)
-{
-   int k;
-   int s;
-   int16_t t;
-   k = (OD_ILOG(x) - 1) >> 1;
-  /*t is x in the range [0.25, 1) in QINSHIFT, or x*2^(-s).
-    Shift by log2(x) - log2(0.25*(1 << INSHIFT)) to ensure 0.25 lower bound.*/
-   s = 2*k - (OD_RSQRT_INSHIFT - 2);
-   t = OD_VSHR(x, s);
-   /*We want to express od_rsqrt() in terms of od_rsqrt_norm(), which is
-      defined as (2^OUTSHIFT)/sqrt(t*(2^-INSHIFT)) with t=x*(2^-s).
-     This simplifies to 2^(OUTSHIFT+(INSHIFT/2)+(s/2))/sqrt(x), so the caller
-      needs to shift right by OUTSHIFT + INSHIFT/2 + s/2.*/
-   *rsqrt_shift = OD_RSQRT_OUTSHIFT + ((s + OD_RSQRT_INSHIFT) >> 1);
-   return od_rsqrt_norm(t);
-}
-#endif
-
-/** Synthesizes one parition of coefficient values from a PVQ-encoded
- * vector.  This 'partial' version is called by the encode loop where
- * the Householder reflection has already been computed and there's no
- * need to recompute it.
- *
- * @param [out]     xcoeff  output coefficient partition (x in math doc)
- * @param [in]      ypulse  PVQ-encoded values (y in the math doc); in
- *                          the noref case, this vector has n entries,
- *                          in the reference case it contains n-1 entries
- *                          (the m-th entry is not included)
- * @param [in]      r       reference vector (prediction)
- * @param [in]      n       number of elements in this partition
- * @param [in]      noref   indicates presence or lack of prediction
- * @param [in]      g       decoded quantized vector gain
- * @param [in]      theta   decoded theta (prediction error)
- * @param [in]      m       alignment dimension of Householder reflection
- * @param [in]      s       sign of Householder reflection
- * @param [in]      qm_inv  inverse of the QM with magnitude compensation
- */
-void od_pvq_synthesis_partial(od_coeff *xcoeff, const od_coeff *ypulse,
- const od_val16 *r16, int n, int noref, od_val32 g, od_val32 theta, int m, int s,
- const int16_t *qm_inv) {
-  int i;
-  int yy;
-  od_val32 scale;
-  int nn;
-#if !defined(OD_FLOAT_PVQ)
-  int gshift;
-  int qshift;
-#endif
-  OD_ASSERT(g != 0);
-  nn = n-(!noref); /* when noref==0, vector in is sized n-1 */
-  yy = 0;
-  for (i = 0; i < nn; i++)
-    yy += ypulse[i]*(int32_t)ypulse[i];
-#if !defined(OD_FLOAT_PVQ)
-  /* Shift required for the magnitude of the pre-qm synthesis to be guaranteed
-     to fit in 16 bits. In practice, the range will be 8192-16384 after scaling
-     most of the time. */
-  gshift = OD_MAXI(0, OD_ILOG(g) - 14);
-#endif
-  /*scale is g/sqrt(yy) in Q(16-gshift) so that x[]*scale has a norm that fits
-     in 16 bits.*/
-  if (yy == 0) scale = 0;
-#if defined(OD_FLOAT_PVQ)
-  else {
-    scale = g/sqrt(yy);
-  }
-#else
-  else {
-    int rsqrt_shift;
-    int16_t rsqrt;
-    /*FIXME: should be < int64_t*/
-    int64_t tmp;
-    rsqrt = od_rsqrt(yy, &rsqrt_shift);
-    tmp = rsqrt*(int64_t)g;
-    scale = OD_VSHR_ROUND(tmp, rsqrt_shift + gshift - 16);
-  }
-  /* Shift to apply after multiplying by the inverse QM, taking into account
-     gshift. */
-  qshift = OD_QM_INV_SHIFT - gshift;
-#endif
-  if (noref) {
-    for (i = 0; i < n; i++) {
-      od_val32 x;
-      /* This multiply doesn't round, so it introduces some bias.
-         It would be nice (but not critical) to fix this. */
-      x = (od_val32)OD_MULT16_32_Q16(ypulse[i], scale);
-#if defined(OD_FLOAT_PVQ)
-      xcoeff[i] = (od_coeff)floor(.5
-       + x*(qm_inv[i]*OD_QM_INV_SCALE_1));
-#else
-      xcoeff[i] = OD_SHR_ROUND(x*qm_inv[i], qshift);
-#endif
-    }
-  }
-  else{
-    od_val16 x[MAXN];
-    scale = OD_ROUND32(scale*OD_TRIG_SCALE_1*od_pvq_sin(theta));
-    /* The following multiply doesn't round, but it's probably OK since
-       the Householder reflection is likely to undo most of the resulting
-       bias. */
-    for (i = 0; i < m; i++)
-      x[i] = OD_MULT16_32_Q16(ypulse[i], scale);
-    x[m] = OD_ROUND16(-s*(OD_SHR_ROUND(g, gshift))*OD_TRIG_SCALE_1*
-     od_pvq_cos(theta));
-    for (i = m; i < nn; i++)
-      x[i+1] = OD_MULT16_32_Q16(ypulse[i], scale);
-    od_apply_householder(x, x, r16, n);
-    for (i = 0; i < n; i++) {
-#if defined(OD_FLOAT_PVQ)
-      xcoeff[i] = (od_coeff)floor(.5 + (x[i]*(qm_inv[i]*OD_QM_INV_SCALE_1)));
-#else
-      xcoeff[i] = OD_SHR_ROUND(x[i]*qm_inv[i], qshift);
-#endif
-    }
-  }
-}
diff --git a/third_party/aom/av1/common/pvq.h b/third_party/aom/av1/common/pvq.h
deleted file mode 100644
index 4adf22f02..000000000
--- a/third_party/aom/av1/common/pvq.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#if !defined(_pvq_H)
-# define _pvq_H (1)
-# include "generic_code.h"
-# include "odintrin.h"
-
-extern const uint16_t EXP_CDF_TABLE[][16];
-extern const uint16_t LAPLACE_OFFSET[];
-
-#define AV1_PVQ_ENABLE_ACTIVITY_MASKING (0)
-
-# define PVQ_MAX_PARTITIONS (1 + 3*(OD_TXSIZES-1))
-
-# define OD_NOREF_ADAPT_SPEED (4)
-/* Normalized lambda for PVQ quantizer. Since we normalize the gain by q, the
-   distortion is normalized by q^2 and lambda does not need the q^2 factor.
-   At high rate, this would be log(2)/6, but we're using a slightly more
-   aggressive value, closer to:
-   Li, Xiang, et al. "Laplace distribution based Lagrangian rate distortion
-   optimization for hybrid video coding." Circuits and Systems for Video
-   Technology, IEEE Transactions on 19.2 (2009): 193-205.
-   */
-# define OD_PVQ_LAMBDA (.1146)
-
-#define OD_PVQ_SKIP_ZERO 1
-#define OD_PVQ_SKIP_COPY 2
-
-/* Maximum size for coding a PVQ band. */
-#define OD_MAX_PVQ_SIZE (1024)
-
-#if defined(OD_FLOAT_PVQ)
-#define OD_QM_SHIFT (15)
-#else
-#define OD_QM_SHIFT (11)
-#endif
-#define OD_QM_SCALE (1 << OD_QM_SHIFT)
-#if defined(OD_FLOAT_PVQ)
-#define OD_QM_SCALE_1 (1./OD_QM_SCALE)
-#endif
-#define OD_QM_SCALE_MAX 32767
-#define OD_QM_INV_SHIFT (12)
-#define OD_QM_INV_SCALE (1 << OD_QM_INV_SHIFT)
-#if defined(OD_FLOAT_PVQ)
-#define OD_QM_INV_SCALE_1 (1./OD_QM_INV_SCALE)
-#endif
-#define OD_QM_OFFSET(bs) ((((1 << 2*bs) - 1) << 2*OD_LOG_BSIZE0)/3)
-#define OD_QM_STRIDE (OD_QM_OFFSET(OD_TXSIZES))
-#define OD_QM_BUFFER_SIZE (2*OD_QM_STRIDE)
-
-#if !defined(OD_FLOAT_PVQ)
-#define OD_THETA_SHIFT (15)
-#define OD_THETA_SCALE ((1 << OD_THETA_SHIFT)*2./M_PI)
-#define OD_MAX_THETA_SCALE (1 << OD_THETA_SHIFT)
-#define OD_TRIG_SCALE (32768)
-#define OD_BETA_SHIFT (12)
-#define OD_BETA_SCALE_1 (1./(1 << OD_BETA_SHIFT))
-/*Multiplies 16-bit a by 32-bit b and keeps bits [16:64-OD_BETA_SHIFT-1].*/
-#define OD_MULT16_32_QBETA(a, b) \
- ((int16_t)(a)*(int64_t)(int32_t)(b) >> OD_BETA_SHIFT)
-# define OD_MULT16_16_QBETA(a, b) \
-  ((((int16_t)(a))*((int32_t)(int16_t)(b))) >> OD_BETA_SHIFT)
-#define OD_CGAIN_SHIFT (8)
-#define OD_CGAIN_SCALE (1 << OD_CGAIN_SHIFT)
-#else
-#define OD_BETA_SCALE_1 (1.)
-#define OD_THETA_SCALE (1)
-#define OD_TRIG_SCALE (1)
-#define OD_CGAIN_SCALE (1)
-#endif
-#define OD_THETA_SCALE_1 (1./OD_THETA_SCALE)
-#define OD_TRIG_SCALE_1 (1./OD_TRIG_SCALE)
-#define OD_CGAIN_SCALE_1 (1./OD_CGAIN_SCALE)
-#define OD_CGAIN_SCALE_2 (OD_CGAIN_SCALE_1*OD_CGAIN_SCALE_1)
-
-/* Largest PVQ partition is half the coefficients of largest block size. */
-#define MAXN (OD_TXSIZE_MAX*OD_TXSIZE_MAX/2)
-
-#define OD_COMPAND_SHIFT (8 + OD_COEFF_SHIFT)
-#define OD_COMPAND_SCALE (1 << OD_COMPAND_SHIFT)
-#define OD_COMPAND_SCALE_1 (1./OD_COMPAND_SCALE)
-
-#define OD_QM_SIZE (OD_TXSIZES*(OD_TXSIZES + 1))
-
-#define OD_FLAT_QM 0
-#define OD_HVS_QM  1
-
-# define OD_NSB_ADAPT_CTXS (4)
-
-# define OD_ADAPT_K_Q8        0
-# define OD_ADAPT_SUM_EX_Q8   1
-# define OD_ADAPT_COUNT_Q8    2
-# define OD_ADAPT_COUNT_EX_Q8 3
-
-# define OD_ADAPT_NO_VALUE (-2147483647-1)
-
-typedef enum {
-  PVQ_SKIP = 0x0,
-  DC_CODED = 0x1,
-  AC_CODED = 0x2,
-  AC_DC_CODED = 0x3,
-} PVQ_SKIP_TYPE;
-
-typedef struct od_pvq_adapt_ctx  od_pvq_adapt_ctx;
-typedef struct od_pvq_codeword_ctx od_pvq_codeword_ctx;
-
-struct od_pvq_codeword_ctx {
-  int                 pvq_adapt[2*OD_TXSIZES*OD_NSB_ADAPT_CTXS];
-  /* CDFs are size 16 despite the fact that we're using less than that. */
-  uint16_t            pvq_k1_cdf[12][CDF_SIZE(16)];
-  uint16_t            pvq_split_cdf[22*7][CDF_SIZE(8)];
-};
-
-struct od_pvq_adapt_ctx {
-  od_pvq_codeword_ctx pvq_codeword_ctx;
-  generic_encoder     pvq_param_model[3];
-  int                 pvq_ext[OD_TXSIZES*PVQ_MAX_PARTITIONS];
-  int                 pvq_exg[OD_NPLANES_MAX][OD_TXSIZES][PVQ_MAX_PARTITIONS];
-  uint16_t pvq_gaintheta_cdf[2*OD_TXSIZES*PVQ_MAX_PARTITIONS][CDF_SIZE(16)];
-  uint16_t pvq_skip_dir_cdf[2*(OD_TXSIZES-1)][CDF_SIZE(7)];
-};
-
-typedef struct od_qm_entry {
-  int interp_q;
-  int scale_q8;
-  const unsigned char *qm_q4;
-} od_qm_entry;
-
-extern const od_qm_entry OD_DEFAULT_QMS[2][2][OD_NPLANES_MAX];
-
-void od_adapt_pvq_ctx_reset(od_pvq_adapt_ctx *state, int is_keyframe);
-int od_pvq_size_ctx(int n);
-int od_pvq_k1_ctx(int n, int orig_size);
-
-od_val16 od_pvq_sin(od_val32 x);
-od_val16 od_pvq_cos(od_val32 x);
-#if !defined(OD_FLOAT_PVQ)
-int od_vector_log_mag(const od_coeff *x, int n);
-#endif
-
-void od_interp_qm(unsigned char *out, int q, const od_qm_entry *entry1,
-                  const od_qm_entry *entry2);
-
-int od_qm_get_index(int bs, int band);
-
-extern const od_val16 *const OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_TXSIZES + 1];
-
-void od_init_qm(int16_t *x, int16_t *x_inv, const int *qm);
-int od_compute_householder(od_val16 *r, int n, od_val32 gr, int *sign,
- int shift);
-void od_apply_householder(od_val16 *out, const od_val16 *x, const od_val16 *r,
- int n);
-void od_pvq_synthesis_partial(od_coeff *xcoeff, const od_coeff *ypulse,
-                                  const od_val16 *r, int n,
-                                  int noref, od_val32 g,
-                                  od_val32 theta, int m, int s,
-                                  const int16_t *qm_inv);
-od_val32 od_gain_expand(od_val32 cg, int q0, od_val16 beta);
-od_val32 od_pvq_compute_gain(const od_val16 *x, int n, int q0, od_val32 *g,
- od_val16 beta, int bshift);
-int od_pvq_compute_max_theta(od_val32 qcg, od_val16 beta);
-od_val32 od_pvq_compute_theta(int t, int max_theta);
-int od_pvq_compute_k(od_val32 qcg, int itheta, int noref, int n, od_val16 beta);
-
-int od_vector_is_null(const od_coeff *x, int len);
-int od_qm_offset(int bs, int xydec);
-
-#endif
diff --git a/third_party/aom/av1/common/pvq_state.c b/third_party/aom/av1/common/pvq_state.c
deleted file mode 100644
index 197b9b3a8..000000000
--- a/third_party/aom/av1/common/pvq_state.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "av1/common/pvq_state.h"
-#include "av1/common/odintrin.h"
-
-void od_adapt_ctx_reset(od_adapt_ctx *adapt, int is_keyframe) {
-  int pli;
-  od_adapt_pvq_ctx_reset(&adapt->pvq, is_keyframe);
-  OD_CDFS_INIT_Q15(adapt->skip_cdf);
-  for (pli = 0; pli < OD_NPLANES_MAX; pli++) {
-    int i;
-    OD_CDFS_INIT_DYNAMIC(adapt->model_dc[pli].cdf);
-    for (i = 0; i < OD_TXSIZES; i++) {
-      int j;
-      adapt->ex_g[pli][i] = 8;
-      for (j = 0; j < 3; j++) {
-        adapt->ex_dc[pli][i][j] = pli > 0 ? 8 : 32768;
-      }
-    }
-  }
-}
-
-void od_init_skipped_coeffs(int16_t *d, int16_t *pred, int is_keyframe, int bo,
-                            int n, int w) {
-  int i;
-  int j;
-  if (is_keyframe) {
-    for (i = 0; i < n; i++) {
-      for (j = 0; j < n; j++) {
-        /* skip DC */
-        if (i || j) d[bo + i * w + j] = 0;
-      }
-    }
-  } else {
-    for (i = 0; i < n; i++) {
-      for (j = 0; j < n; j++) {
-        d[bo + i * w + j] = pred[i * n + j];
-      }
-    }
-  }
-}
diff --git a/third_party/aom/av1/common/pvq_state.h b/third_party/aom/av1/common/pvq_state.h
deleted file mode 100644
index 84d454e70..000000000
--- a/third_party/aom/av1/common/pvq_state.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#if !defined(_state_H)
-# define _state_H (1)
-
-typedef struct od_state     od_state;
-typedef struct od_adapt_ctx od_adapt_ctx;
-
-# include "generic_code.h"
-# include "odintrin.h"
-# include "pvq.h"
-
-/*Adaptation speed of scalar Laplace encoding.*/
-# define OD_SCALAR_ADAPT_SPEED (4)
-
-struct od_adapt_ctx {
-  /* Support for PVQ encode/decode */
-  od_pvq_adapt_ctx pvq;
-
-  generic_encoder model_dc[OD_NPLANES_MAX];
-
-  int ex_dc[OD_NPLANES_MAX][OD_TXSIZES][3];
-  int ex_g[OD_NPLANES_MAX][OD_TXSIZES];
-
-  /* Joint skip flag for DC and AC */
-  uint16_t skip_cdf[OD_TXSIZES*2][CDF_SIZE(4)];
-};
-
-struct od_state {
-  od_adapt_ctx *adapt;
-  unsigned char pvq_qm_q4[OD_NPLANES_MAX][OD_QM_SIZE];
-  /* Quantization matrices and their inverses. */
-  int16_t qm[OD_QM_BUFFER_SIZE];
-  int16_t qm_inv[OD_QM_BUFFER_SIZE];
-};
-
-void od_adapt_ctx_reset(od_adapt_ctx *state, int is_keyframe);
-void od_init_skipped_coeffs(int16_t *d, int16_t *pred, int is_keyframe,
- int bo, int n, int w);
-
-#endif
diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c
index ea7140cdc..84575d74b 100644
--- a/third_party/aom/av1/common/quant_common.c
+++ b/third_party/aom/av1/common/quant_common.c
@@ -16,111 +16,7 @@
 #include "av1/common/seg_common.h"
 #include "av1/common/blockd.h"
 
-#if CONFIG_NEW_QUANT
-// Bin widths expressed as a fraction over 128 of the quant stepsize,
-// for the quantization bins 0-4.
-// So a value x indicates the bin is actually factor x/128 of the
-// nominal quantization step.  For the zero bin, the width is only
-// for one side of zero, so the actual width is twice that.
-//
-// Functions with nuq correspond to "non uniform quantization"
-// TODO(sarahparker, debargha): Optimize these tables
-
-typedef struct {
-  uint8_t knots[NUQ_KNOTS];  // offsets
-  uint8_t doff;              // dequantization
-} qprofile_type;
-
-static const qprofile_type nuq[QUANT_PROFILES][COEF_BANDS] = {
-  {
-      // lossless
-      { { 64, 128, 128 }, 0 },  // dc, band 0
-      { { 64, 128, 128 }, 0 },  // band 1
-      { { 64, 128, 128 }, 0 },  // band 2
-      { { 64, 128, 128 }, 0 },  // band 3
-      { { 64, 128, 128 }, 0 },  // band 4
-      { { 64, 128, 128 }, 0 },  // band 5
-  },
-  {
-      { { 64, 128, 128 }, 4 },   // dc, band 0
-      { { 64, 128, 128 }, 6 },   // band 1
-      { { 64, 128, 128 }, 8 },   // band 2
-      { { 64, 128, 128 }, 10 },  // band 3
-      { { 72, 128, 128 }, 12 },  // band 4
-      { { 80, 128, 128 }, 14 }   // band 5
-  },
-  {
-      { { 64, 128, 128 }, 6 },   // dc, band 0
-      { { 64, 128, 128 }, 8 },   // band 1
-      { { 64, 128, 128 }, 10 },  // band 2
-      { { 64, 128, 128 }, 12 },  // band 3
-      { { 72, 128, 128 }, 14 },  // band 4
-      { { 80, 128, 128 }, 16 }   // band 5
-  },
-  {
-      { { 64, 128, 128 }, 8 },   // dc, band 0
-      { { 64, 128, 128 }, 10 },  // band 1
-      { { 64, 128, 128 }, 12 },  // band 2
-      { { 72, 128, 128 }, 14 },  // band 3
-      { { 76, 128, 128 }, 16 },  // band 4
-      { { 80, 128, 128 }, 18 }   // band 5
-  }
-};
-
-static const uint8_t *get_nuq_knots(int band, int q_profile) {
-  return nuq[q_profile][band].knots;
-}
-
-static INLINE int16_t quant_to_doff_fixed(int band, int q_profile) {
-  return nuq[q_profile][band].doff;
-}
-
-// get cumulative bins
-static INLINE void get_cuml_bins_nuq(int q, int band, tran_low_t *cuml_bins,
-                                     int q_profile) {
-  const uint8_t *knots = get_nuq_knots(band, q_profile);
-  int16_t cuml_knots[NUQ_KNOTS];
-  int i;
-  cuml_knots[0] = knots[0];
-  for (i = 1; i < NUQ_KNOTS; ++i) cuml_knots[i] = cuml_knots[i - 1] + knots[i];
-  for (i = 0; i < NUQ_KNOTS; ++i)
-    cuml_bins[i] = ROUND_POWER_OF_TWO(cuml_knots[i] * q, 7);
-}
-
-void av1_get_dequant_val_nuq(int q, int band, tran_low_t *dq,
-                             tran_low_t *cuml_bins, int q_profile) {
-  const uint8_t *knots = get_nuq_knots(band, q_profile);
-  tran_low_t cuml_bins_[NUQ_KNOTS], *cuml_bins_ptr;
-  tran_low_t doff;
-  int i;
-  cuml_bins_ptr = (cuml_bins ? cuml_bins : cuml_bins_);
-  get_cuml_bins_nuq(q, band, cuml_bins_ptr, q_profile);
-  dq[0] = 0;
-  for (i = 1; i < NUQ_KNOTS; ++i) {
-    doff = quant_to_doff_fixed(band, q_profile);
-    doff = ROUND_POWER_OF_TWO(doff * knots[i], 7);
-    dq[i] =
-        cuml_bins_ptr[i - 1] + ROUND_POWER_OF_TWO((knots[i] - doff * 2) * q, 8);
-  }
-  doff = quant_to_doff_fixed(band, q_profile);
-  dq[NUQ_KNOTS] =
-      cuml_bins_ptr[NUQ_KNOTS - 1] + ROUND_POWER_OF_TWO((64 - doff) * q, 7);
-}
-
-tran_low_t av1_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq) {
-  if (v <= NUQ_KNOTS)
-    return dq[v];
-  else
-    return dq[NUQ_KNOTS] + (v - NUQ_KNOTS) * q;
-}
-
-tran_low_t av1_dequant_coeff_nuq(int v, int q, const tran_low_t *dq) {
-  tran_low_t dqmag = av1_dequant_abscoeff_nuq(abs(v), q, dq);
-  return (v < 0 ? -dqmag : dqmag);
-}
-#endif  // CONFIG_NEW_QUANT
-
-static const int16_t dc_qlookup[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = {
   4,    8,    8,    9,    10,  11,  12,  12,  13,  14,  15,   16,   17,   18,
   19,   19,   20,   21,   22,  23,  24,  25,  26,  26,  27,   28,   29,   30,
   31,   32,   32,   33,   34,  35,  36,  37,  38,  38,  39,   40,   41,   42,
@@ -142,8 +38,7 @@ static const int16_t dc_qlookup[QINDEX_RANGE] = {
   1184, 1232, 1282, 1336,
 };
 
-#if CONFIG_HIGHBITDEPTH
-static const int16_t dc_qlookup_10[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = {
   4,    9,    10,   13,   15,   17,   20,   22,   25,   28,   31,   34,   37,
   40,   43,   47,   50,   53,   57,   60,   64,   68,   71,   75,   78,   82,
   86,   90,   93,   97,   101,  105,  109,  113,  116,  120,  124,  128,  132,
@@ -166,7 +61,7 @@ static const int16_t dc_qlookup_10[QINDEX_RANGE] = {
   3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347,
 };
 
-static const int16_t dc_qlookup_12[QINDEX_RANGE] = {
+static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = {
   4,     12,    18,    25,    33,    41,    50,    60,    70,    80,    91,
   103,   115,   127,   140,   153,   166,   180,   194,   208,   222,   237,
   251,   266,   281,   296,   312,   327,   343,   358,   374,   390,   405,
@@ -192,9 +87,8 @@ static const int16_t dc_qlookup_12[QINDEX_RANGE] = {
   13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949,
   19718, 20521, 21387,
 };
-#endif
 
-static const int16_t ac_qlookup[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = {
   4,    8,    9,    10,   11,   12,   13,   14,   15,   16,   17,   18,   19,
   20,   21,   22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
   33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,   44,   45,
@@ -217,8 +111,7 @@ static const int16_t ac_qlookup[QINDEX_RANGE] = {
   1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
 };
 
-#if CONFIG_HIGHBITDEPTH
-static const int16_t ac_qlookup_10[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = {
   4,    9,    11,   13,   16,   18,   21,   24,   27,   30,   33,   37,   40,
   44,   48,   51,   55,   59,   63,   67,   71,   75,   79,   83,   88,   92,
   96,   100,  105,  109,  114,  118,  122,  127,  131,  136,  140,  145,  149,
@@ -241,7 +134,7 @@ static const int16_t ac_qlookup_10[QINDEX_RANGE] = {
   6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312,
 };
 
-static const int16_t ac_qlookup_12[QINDEX_RANGE] = {
+static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = {
   4,     13,    19,    27,    35,    44,    54,    64,    75,    87,    99,
   112,   126,   139,   154,   168,   183,   199,   214,   230,   247,   263,
   280,   297,   314,   331,   349,   366,   384,   402,   420,   438,   456,
@@ -267,64 +160,88 @@ static const int16_t ac_qlookup_12[QINDEX_RANGE] = {
   22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599,
   28143, 28687, 29247,
 };
-#endif
 
-int16_t av1_dc_quant(int qindex, int delta, aom_bit_depth_t bit_depth) {
-#if CONFIG_HIGHBITDEPTH
+// Coefficient scaling and quantization with AV1 TX are tailored to
+// the AV1 TX transforms.  Regardless of the bit-depth of the input,
+// the transform stages scale the coefficient values up by a factor of
+// 8 (3 bits) over the scale of the pixel values.  Thus, for 8-bit
+// input, the coefficients have effectively 11 bits of scale depth
+// (8+3), 10-bit input pixels result in 13-bit coefficient depth
+// (10+3) and 12-bit pixels yield 15-bit (12+3) coefficient depth.
+// All quantizers are built using this invariant of x8, 3-bit scaling,
+// thus the Q3 suffix.
+
+// A partial exception to this rule is large transforms; to avoid
+// overflow, TX blocks with > 256 pels (>16x16) are scaled only
+// 4-times unity (2 bits) over the pixel depth, and TX blocks with
+// over 1024 pixels (>32x32) are scaled up only 2x unity (1 bit).
+// This descaling is found via av1_tx_get_scale().  Thus, 16x32, 32x16
+// and 32x32 transforms actually return Q2 coefficients, and 32x64,
+// 64x32 and 64x64 transforms return Q1 coefficients.  However, the
+// quantizers are de-scaled down on-the-fly by the same amount
+// (av1_tx_get_scale()) during quantization, and as such the
+// dequantized/decoded coefficients, even for large TX blocks, are always
+// effectively Q3. Meanwhile, quantized/coded coefficients are Q0
+// because Qn quantizers are applied to Qn tx coefficients.
+
+// Note that encoder decision making (which uses the quantizer to
+// generate several bespoke lamdas for RDO and other heuristics)
+// expects quantizers to be larger for higher-bitdepth input.  In
+// addition, the minimum allowable quantizer is 4; smaller values will
+// underflow to 0 in the actual quantization routines.
+
+int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
   switch (bit_depth) {
-    case AOM_BITS_8: return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
-    case AOM_BITS_10: return dc_qlookup_10[clamp(qindex + delta, 0, MAXQ)];
-    case AOM_BITS_12: return dc_qlookup_12[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_8: return dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_10: return dc_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_12: return dc_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)];
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
   }
-#else
-  (void)bit_depth;
-  return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
-#endif
 }
 
-int16_t av1_ac_quant(int qindex, int delta, aom_bit_depth_t bit_depth) {
-#if CONFIG_HIGHBITDEPTH
+int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
   switch (bit_depth) {
-    case AOM_BITS_8: return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
-    case AOM_BITS_10: return ac_qlookup_10[clamp(qindex + delta, 0, MAXQ)];
-    case AOM_BITS_12: return ac_qlookup_12[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_8: return ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_10: return ac_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)];
+    case AOM_BITS_12: return ac_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)];
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
   }
-#else
-  (void)bit_depth;
-  return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
-#endif
 }
 
-int16_t av1_qindex_from_ac(int ac, aom_bit_depth_t bit_depth) {
+// In AV1 TX, the coefficients are always scaled up a factor of 8 (3
+// bits), so QTX == Q3.
+
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  return av1_dc_quant_Q3(qindex, delta, bit_depth);
+}
+
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  return av1_ac_quant_Q3(qindex, delta, bit_depth);
+}
+
+int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth) {
   int i;
-  const int16_t *tab = ac_qlookup;
-  ac *= 4;
-#if CONFIG_HIGHBITDEPTH
+  const int16_t *tab = ac_qlookup_Q3;
   switch (bit_depth) {
     case AOM_BITS_10: {
-      tab = ac_qlookup_10;
-      ac *= 4;
+      tab = ac_qlookup_10_Q3;
       break;
     }
     case AOM_BITS_12: {
-      tab = ac_qlookup_12;
-      ac *= 16;
+      tab = ac_qlookup_12_Q3;
       break;
     }
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
   }
-#endif
   (void)bit_depth;
   for (i = 0; i < QINDEX_RANGE; i++) {
-    if (ac <= tab[i]) return i;
+    if (ac_Q3 <= tab[i]) return i;
   }
   return QINDEX_RANGE - 1;
 }
@@ -333,55 +250,47 @@ int av1_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex) {
   if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
     const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
-    const int seg_qindex =
-        seg->abs_delta == SEGMENT_ABSDATA ? data : base_qindex + data;
+    const int seg_qindex = base_qindex + data;
     return clamp(seg_qindex, 0, MAXQ);
   } else {
     return base_qindex;
   }
 }
 
-#if CONFIG_AOM_QM
-qm_val_t *aom_iqmatrix(AV1_COMMON *cm, int qmlevel, int is_chroma,
-                       TX_SIZE tx_size, int is_intra) {
-  return &cm->giqmatrix[qmlevel][!!is_chroma][!!is_intra][tx_size][0];
+const qm_val_t *av1_iqmatrix(AV1_COMMON *cm, int qmlevel, int plane,
+                             TX_SIZE tx_size) {
+  return &cm->giqmatrix[qmlevel][plane][tx_size][0];
 }
-qm_val_t *aom_qmatrix(AV1_COMMON *cm, int qmlevel, int is_chroma,
-                      TX_SIZE tx_size, int is_intra) {
-  return &cm->gqmatrix[qmlevel][!!is_chroma][!!is_intra][tx_size][0];
+const qm_val_t *av1_qmatrix(AV1_COMMON *cm, int qmlevel, int plane,
+                            TX_SIZE tx_size) {
+  return &cm->gqmatrix[qmlevel][plane][tx_size][0];
 }
 
-#if CONFIG_CHROMA_2X2
-#define QM_TOTAL_SIZE 3348
-#else
 #define QM_TOTAL_SIZE 3344
-#endif
-static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE];
-static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE];
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE];
+static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE];
 
-void aom_qm_init(AV1_COMMON *cm) {
-  int q, c, f, t;
+void av1_qm_init(AV1_COMMON *cm) {
+  const int num_planes = av1_num_planes(cm);
+  int q, c, t;
   int current;
   for (q = 0; q < NUM_QM_LEVELS; ++q) {
-    for (c = 0; c < 2; ++c) {
-      for (f = 0; f < 2; ++f) {
-        current = 0;
-        for (t = 0; t < TX_SIZES_ALL; ++t) {
-          const int size = tx_size_2d[t];
-          // Don't use QM for sizes > 32x32
-          if (q == NUM_QM_LEVELS - 1 || size > 1024) {
-            cm->gqmatrix[q][c][f][t] = NULL;
-            cm->giqmatrix[q][c][f][t] = NULL;
-          } else {
-            assert(current + size <= QM_TOTAL_SIZE);
-            cm->gqmatrix[q][c][f][t] = &wt_matrix_ref[AOMMIN(
-                NUM_QM_LEVELS - 1, f == 0 ? q + DEFAULT_QM_INTER_OFFSET : q)][c]
-                                                     [current];
-            cm->giqmatrix[q][c][f][t] = &iwt_matrix_ref[AOMMIN(
-                NUM_QM_LEVELS - 1, f == 0 ? q + DEFAULT_QM_INTER_OFFSET : q)][c]
-                                                       [current];
-            current += size;
-          }
+    for (c = 0; c < num_planes; ++c) {
+      current = 0;
+      for (t = 0; t < TX_SIZES_ALL; ++t) {
+        const int size = tx_size_2d[t];
+        const int qm_tx_size = av1_get_adjusted_tx_size(t);
+        if (q == NUM_QM_LEVELS - 1) {
+          cm->gqmatrix[q][c][t] = NULL;
+          cm->giqmatrix[q][c][t] = NULL;
+        } else if (t != qm_tx_size) {  // Reuse matrices for 'qm_tx_size'
+          cm->gqmatrix[q][c][t] = cm->gqmatrix[q][c][qm_tx_size];
+          cm->giqmatrix[q][c][t] = cm->giqmatrix[q][c][qm_tx_size];
+        } else {
+          assert(current + size <= QM_TOTAL_SIZE);
+          cm->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current];
+          cm->giqmatrix[q][c][t] = &iwt_matrix_ref[q][c >= 1][current];
+          current += size;
         }
       }
     }
@@ -399,13 +308,9 @@ void aom_qm_init(AV1_COMMON *cm) {
    frequency domain according to different nominal viewing
    distances.
  */
-static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
+static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        43, 86, 86, 166,
-#endif
         /* Size 4x4 */
         32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150, 200,
         /* Size 8x8 */
@@ -632,10 +537,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146,
         152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        50, 62, 62, 100,
-#endif
         /* Size 4x4 */
         35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109,
         /* Size 8x8 */
@@ -848,10 +749,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        39, 82, 82, 155,
-#endif
         /* Size 4x4 */
         32, 41, 69, 92, 41, 63, 88, 103, 69, 88, 127, 140, 92, 103, 140, 184,
         /* Size 8x8 */
@@ -1076,10 +973,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161,
         171, 174, 179, 181, 188, 188, 190 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        48, 60, 60, 97,
-#endif
         /* Size 4x4 */
         33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105,
         /* Size 8x8 */
@@ -1291,10 +1184,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        39, 76, 76, 140,
-#endif
         /* Size 4x4 */
         32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169,
         /* Size 8x8 */
@@ -1515,10 +1404,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163,
         168, 169, 175, 175, 176 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        48, 58, 58, 91,
-#endif
         /* Size 4x4 */
         32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101,
         /* Size 8x8 */
@@ -1730,10 +1615,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        36, 71, 71, 134,
-#endif
         /* Size 4x4 */
         32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156,
         /* Size 8x8 */
@@ -1953,10 +1834,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163,
         163, 163 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        47, 55, 55, 89,
-#endif
         /* Size 4x4 */
         32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97,
         /* Size 8x8 */
@@ -2168,10 +2045,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        35, 63, 63, 117,
-#endif
         /* Size 4x4 */
         32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140,
         /* Size 8x8 */
@@ -2387,10 +2260,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151,
         152 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        47, 52, 52, 82,
-#endif
         /* Size 4x4 */
         32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91,
         /* Size 8x8 */
@@ -2601,10 +2470,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        35, 58, 58, 105,
-#endif
         /* Size 4x4 */
         32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134,
         /* Size 8x8 */
@@ -2817,10 +2682,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118,
         118, 125, 125, 133, 133, 136, 136, 141 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        47, 50, 50, 76,
-#endif
         /* Size 4x4 */
         32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89,
         /* Size 8x8 */
@@ -3031,10 +2892,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        34, 52, 52, 89,
-#endif
         /* Size 4x4 */
         32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108,
         /* Size 8x8 */
@@ -3246,10 +3103,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103,
         105, 108, 112, 114, 119, 119, 127, 127 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        41, 48, 48, 69,
-#endif
         /* Size 4x4 */
         31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78,
         /* Size 8x8 */
@@ -3460,10 +3313,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        33, 47, 47, 75,
-#endif
         /* Size 4x4 */
         32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92,
         /* Size 8x8 */
@@ -3673,10 +3522,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98,
         100, 105, 105, 109 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        39, 47, 47, 63,
-#endif
         /* Size 4x4 */
         31, 41, 46, 51, 41, 48, 48, 51, 46, 48, 58, 62, 51, 51, 62, 71,
         /* Size 8x8 */
@@ -3887,10 +3732,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        33, 42, 42, 64,
-#endif
         /* Size 4x4 */
         32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81,
         /* Size 8x8 */
@@ -4099,10 +3940,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81,
         82, 83, 87, 87 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        38, 45, 45, 59,
-#endif
         /* Size 4x4 */
         31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66,
         /* Size 8x8 */
@@ -4313,10 +4150,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 38, 38, 54,
-#endif
         /* Size 4x4 */
         32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65,
         /* Size 8x8 */
@@ -4525,10 +4358,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67,
         69, 70, 70, 73 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        33, 45, 45, 54,
-#endif
         /* Size 4x4 */
         31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59,
         /* Size 8x8 */
@@ -4739,10 +4568,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 34, 34, 48,
-#endif
         /* Size 4x4 */
         32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54,
         /* Size 8x8 */
@@ -4951,10 +4776,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58,
         58, 60, 63, 63 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 46, 46, 53,
-#endif
         /* Size 4x4 */
         31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54,
         /* Size 8x8 */
@@ -5165,10 +4986,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 33, 33, 39,
-#endif
         /* Size 4x4 */
         32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46,
         /* Size 8x8 */
@@ -5377,10 +5194,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47,
         48, 48, 48, 49 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        31, 42, 42, 48,
-#endif
         /* Size 4x4 */
         31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52,
         /* Size 8x8 */
@@ -5591,10 +5404,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 32, 32, 35,
-#endif
         /* Size 4x4 */
         31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35,
         /* Size 8x8 */
@@ -5803,10 +5612,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36,
         36, 37, 38, 38 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        31, 38, 38, 46,
-#endif
         /* Size 4x4 */
         31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47,
         /* Size 8x8 */
@@ -6017,10 +5822,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 32, 32, 33,
-#endif
         /* Size 4x4 */
         31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
         /* Size 8x8 */
@@ -6229,10 +6030,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
         34, 34, 34, 34 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        31, 33, 33, 36,
-#endif
         /* Size 4x4 */
         31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39,
         /* Size 8x8 */
@@ -6443,10 +6240,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        31, 31, 31, 32,
-#endif
         /* Size 4x4 */
         31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
         /* Size 8x8 */
@@ -6655,10 +6448,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
         32, 32, 32, 32 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        31, 31, 31, 31,
-#endif
         /* Size 4x4 */
         31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
         /* Size 8x8 */
@@ -6869,10 +6658,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 32, 32, 32,
-#endif
         /* Size 4x4 */
         32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
         /* Size 8x8 */
@@ -7081,10 +6866,6 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
         32, 32, 32, 32 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 32, 32, 32,
-#endif
         /* Size 4x4 */
         32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
         /* Size 8x8 */
@@ -7295,13 +7076,9 @@ static uint16_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
 };
 
-static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        24, 12, 12, 6,
-#endif
         /* Size 4x4 */
         32, 24, 14, 11, 24, 15, 11, 9, 14, 11, 7, 7, 11, 9, 7, 5,
         /* Size 8x8 */
@@ -7494,10 +7271,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5,
         5, 5 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        20, 17, 17, 10,
-#endif
         /* Size 4x4 */
         29, 22, 18, 16, 22, 17, 15, 14, 18, 15, 11, 11, 16, 14, 11, 9,
         /* Size 8x8 */
@@ -7708,10 +7481,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        26, 12, 12, 7,
-#endif
         /* Size 4x4 */
         32, 25, 15, 11, 25, 16, 12, 10, 15, 12, 8, 7, 11, 10, 7, 6,
         /* Size 8x8 */
@@ -7907,10 +7676,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6,
         6, 6, 6, 6, 5, 5, 5 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        21, 17, 17, 11,
-#endif
         /* Size 4x4 */
         31, 23, 18, 16, 23, 18, 16, 15, 18, 16, 12, 12, 16, 15, 12, 10,
         /* Size 8x8 */
@@ -8121,10 +7886,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        26, 13, 13, 7,
-#endif
         /* Size 4x4 */
         32, 27, 16, 12, 27, 18, 13, 11, 16, 13, 9, 8, 12, 11, 8, 6,
         /* Size 8x8 */
@@ -8321,10 +8082,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         12, 12, 12, 12, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7,
         7, 7, 6, 6, 6, 6, 6, 6, 6 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        21, 18, 18, 11,
-#endif
         /* Size 4x4 */
         32, 23, 19, 16, 23, 19, 17, 15, 19, 17, 13, 12, 16, 15, 12, 10,
         /* Size 8x8 */
@@ -8535,10 +8292,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        28, 14, 14, 8,
-#endif
         /* Size 4x4 */
         32, 28, 18, 13, 28, 19, 14, 11, 18, 14, 10, 8, 13, 11, 8, 7,
         /* Size 8x8 */
@@ -8735,10 +8488,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         11, 12, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9,
         9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        22, 19, 19, 12,
-#endif
         /* Size 4x4 */
         32, 23, 20, 17, 23, 19, 17, 16, 20, 17, 14, 13, 17, 16, 13, 11,
         /* Size 8x8 */
@@ -8949,10 +8698,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        29, 16, 16, 9,
-#endif
         /* Size 4x4 */
         32, 30, 19, 14, 30, 21, 16, 13, 19, 16, 11, 9, 14, 13, 9, 7,
         /* Size 8x8 */
@@ -9152,10 +8897,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         7, 7, 7, 8, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 11,
         11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        22, 20, 20, 12,
-#endif
         /* Size 4x4 */
         32, 22, 21, 18, 22, 19, 19, 17, 21, 19, 15, 13, 18, 17, 13, 11,
         /* Size 8x8 */
@@ -9366,10 +9107,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        29, 18, 18, 10,
-#endif
         /* Size 4x4 */
         32, 30, 21, 14, 30, 21, 17, 13, 21, 17, 12, 10, 14, 13, 10, 8,
         /* Size 8x8 */
@@ -9571,10 +9308,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         10, 9, 9, 9, 9, 8, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13,
         13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        22, 20, 20, 13,
-#endif
         /* Size 4x4 */
         32, 22, 22, 18, 22, 19, 19, 17, 22, 19, 16, 14, 18, 17, 14, 12,
         /* Size 8x8 */
@@ -9785,10 +9518,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        30, 20, 20, 12,
-#endif
         /* Size 4x4 */
         32, 31, 23, 17, 31, 26, 20, 16, 23, 20, 14, 12, 17, 16, 12, 9,
         /* Size 8x8 */
@@ -9997,10 +9726,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
         8, 8 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        25, 21, 21, 15,
-#endif
         /* Size 4x4 */
         33, 24, 22, 19, 24, 21, 20, 19, 22, 20, 17, 15, 19, 19, 15, 13,
         /* Size 8x8 */
@@ -10211,10 +9936,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        31, 22, 22, 14,
-#endif
         /* Size 4x4 */
         32, 31, 24, 19, 31, 27, 22, 18, 24, 22, 16, 14, 19, 18, 14, 11,
         /* Size 8x8 */
@@ -10423,10 +10144,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10,
         10, 10, 10, 9 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        26, 22, 22, 16,
-#endif
         /* Size 4x4 */
         33, 25, 22, 20, 25, 21, 21, 20, 22, 21, 18, 17, 20, 20, 17, 14,
         /* Size 8x8 */
@@ -10637,10 +10354,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        31, 24, 24, 16,
-#endif
         /* Size 4x4 */
         32, 32, 27, 20, 32, 29, 26, 21, 27, 26, 19, 16, 20, 21, 16, 13,
         /* Size 8x8 */
@@ -10849,10 +10562,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13,
         12, 12, 12, 12 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        27, 23, 23, 17,
-#endif
         /* Size 4x4 */
         33, 27, 22, 21, 27, 22, 22, 22, 22, 22, 19, 18, 21, 22, 18, 16,
         /* Size 8x8 */
@@ -11063,10 +10772,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 27, 27, 19,
-#endif
         /* Size 4x4 */
         32, 32, 29, 24, 32, 30, 28, 24, 29, 28, 21, 19, 24, 24, 19, 16,
         /* Size 8x8 */
@@ -11275,10 +10980,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15,
         15, 15, 15, 14 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        31, 23, 23, 19,
-#endif
         /* Size 4x4 */
         33, 28, 22, 22, 28, 23, 22, 23, 22, 22, 19, 19, 22, 23, 19, 17,
         /* Size 8x8 */
@@ -11489,10 +11190,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 30, 30, 21,
-#endif
         /* Size 4x4 */
         32, 32, 30, 27, 32, 31, 29, 26, 30, 29, 26, 23, 27, 26, 23, 19,
         /* Size 8x8 */
@@ -11701,10 +11398,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18,
         18, 17, 16, 16 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 22, 22, 19,
-#endif
         /* Size 4x4 */
         33, 30, 24, 22, 30, 26, 23, 22, 24, 23, 21, 21, 22, 22, 21, 19,
         /* Size 8x8 */
@@ -11915,10 +11608,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 31, 31, 26,
-#endif
         /* Size 4x4 */
         32, 32, 32, 29, 32, 32, 31, 29, 32, 31, 29, 27, 29, 29, 27, 22,
         /* Size 8x8 */
@@ -12127,10 +11816,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22,
         21, 21, 21, 21 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        33, 24, 24, 21,
-#endif
         /* Size 4x4 */
         33, 32, 27, 22, 32, 30, 25, 22, 27, 25, 22, 22, 22, 22, 22, 20,
         /* Size 8x8 */
@@ -12341,10 +12026,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 32, 32, 29,
-#endif
         /* Size 4x4 */
         33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31, 30, 32, 31, 30, 29,
         /* Size 8x8 */
@@ -12553,10 +12234,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28,
         28, 28, 27, 27 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        33, 27, 27, 22,
-#endif
         /* Size 4x4 */
         33, 33, 30, 27, 33, 32, 29, 26, 30, 29, 26, 24, 27, 26, 24, 22,
         /* Size 8x8 */
@@ -12767,10 +12444,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 32, 32, 31,
-#endif
         /* Size 4x4 */
         33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31,
         /* Size 8x8 */
@@ -12979,10 +12652,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30,
         30, 30, 30, 30 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        33, 31, 31, 28,
-#endif
         /* Size 4x4 */
         33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 32, 29, 30, 29, 29, 26,
         /* Size 8x8 */
@@ -13193,10 +12862,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        33, 33, 33, 32,
-#endif
         /* Size 4x4 */
         33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
         /* Size 8x8 */
@@ -13405,10 +13070,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
         32, 32, 32, 32 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        33, 33, 33, 33,
-#endif
         /* Size 4x4 */
         33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
         /* Size 8x8 */
@@ -13619,10 +13280,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
   },
   {
       { /* Luma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 32, 32, 32,
-#endif
         /* Size 4x4 */
         32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
         /* Size 8x8 */
@@ -13831,10 +13488,6 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
         32, 32, 32, 32 },
       { /* Chroma */
-#if CONFIG_CHROMA_2X2
-        /* Size 2x2 */
-        32, 32, 32, 32,
-#endif
         /* Size 4x4 */
         32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
         /* Size 8x8 */
@@ -14044,63 +13697,3 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
         32, 32, 32, 32 },
   },
 };
-#endif
-
-#if CONFIG_PVQ
-/* Quantization matrices for 8x8. For other block sizes, we currently just do
-   resampling. */
-/* Flat quantization, i.e. optimize for PSNR. */
-const int OD_QM8_Q4_FLAT[] = { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
-                               16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
-                               16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
-                               16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
-                               16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
-                               16, 16, 16, 16, 16, 16, 16, 16, 16 };
-#if 0
-/* M1: MPEG2 matrix for inter (which has a dead zone). */
-const int OD_QM8_Q4[] = {
-  16, 17, 18, 19, 20, 21, 22, 23,
-  17, 18, 19, 20, 21, 22, 23, 24,
-  18, 19, 20, 21, 22, 23, 24, 25,
-  19, 20, 21, 22, 23, 24, 26, 27,
-  20, 21, 22, 23, 25, 26, 27, 28,
-  21, 22, 23, 24, 26, 27, 28, 30,
-  22, 23, 24, 26, 27, 28, 30, 31,
-  23, 24, 25, 27, 28, 30, 31, 33};
-#endif
-#if 0
-/* M2: MPEG2 matrix for intra (no dead zone). */
-const int OD_QM8_Q4[] = {
-  16, 16, 19, 22, 22, 26, 26, 27,
-  16, 16, 22, 22, 26, 27, 27, 29,
-  19, 22, 26, 26, 27, 29, 29, 35,
-  22, 24, 27, 27, 29, 32, 34, 38,
-  26, 27, 29, 29, 32, 35, 38, 46,
-  27, 29, 34, 34, 35, 40, 46, 56,
-  29, 34, 34, 37, 40, 48, 56, 69,
-  34, 37, 38, 40, 48, 58, 69, 83
-};
-#endif
-#if 0
-/* M3: Taken from dump_psnrhvs. */
-const int OD_QM8_Q4[] = {
-  16, 16, 17, 20, 24, 29, 36, 42,
-  16, 17, 17, 19, 22, 26, 31, 37,
-  17, 17, 21, 23, 26, 30, 34, 40,
-  20, 19, 23, 28, 31, 35, 39, 45,
-  24, 22, 26, 31, 36, 41, 46, 51,
-  29, 26, 30, 35, 41, 47, 52, 58,
-  36, 31, 34, 39, 46, 52, 59, 66,
-  42, 37, 40, 45, 51, 58, 66, 73
-};
-#endif
-#if 1
-/* M4: a compromise equal to .5*(M3 + .5*(M2+transpose(M2))) */
-const int OD_QM8_Q4_HVS[] = { 16, 16, 18, 21, 24, 28, 32, 36, 16, 17, 20,
-                              21, 24, 27, 31, 35, 18, 20, 24, 25, 27, 31,
-                              33, 38, 21, 21, 25, 28, 30, 34, 37, 42, 24,
-                              24, 27, 30, 34, 38, 43, 49, 28, 27, 31, 34,
-                              38, 44, 50, 58, 32, 31, 33, 37, 43, 50, 58,
-                              68, 36, 35, 38, 42, 49, 58, 68, 78 };
-#endif
-#endif
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
index 92843fe4d..f9681036d 100644
--- a/third_party/aom/av1/common/quant_common.h
+++ b/third_party/aom/av1/common/quant_common.h
@@ -25,82 +25,37 @@ extern "C" {
 #define MAXQ 255
 #define QINDEX_RANGE (MAXQ - MINQ + 1)
 #define QINDEX_BITS 8
-#if CONFIG_AOM_QM
 // Total number of QM sets stored
 #define QM_LEVEL_BITS 4
 #define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
 /* Range of QMS is between first and last value, with offset applied to inter
  * blocks*/
+#define DEFAULT_QM_Y 10
+#define DEFAULT_QM_U 11
+#define DEFAULT_QM_V 12
 #define DEFAULT_QM_FIRST 5
 #define DEFAULT_QM_LAST 9
-#define DEFAULT_QM_INTER_OFFSET 0
-#endif
 
 struct AV1Common;
 
-int16_t av1_dc_quant(int qindex, int delta, aom_bit_depth_t bit_depth);
-int16_t av1_ac_quant(int qindex, int delta, aom_bit_depth_t bit_depth);
-int16_t av1_qindex_from_ac(int ac, aom_bit_depth_t bit_depth);
+int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_qindex_from_ac_Q3(int ac_Q3, aom_bit_depth_t bit_depth);
 
 int av1_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex);
-#if CONFIG_AOM_QM
 // Reduce the large number of quantizers to a smaller number of levels for which
 // different matrices may be defined
 static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
   return first + (qindex * (last + 1 - first)) / QINDEX_RANGE;
 }
-void aom_qm_init(struct AV1Common *cm);
-qm_val_t *aom_iqmatrix(struct AV1Common *cm, int qindex, int comp,
-                       TX_SIZE tx_size, int is_intra);
-qm_val_t *aom_qmatrix(struct AV1Common *cm, int qindex, int comp,
-                      TX_SIZE tx_size, int is_intra);
-#endif
-
-#if CONFIG_NEW_QUANT
-
-#define QUANT_PROFILES 4
-#define QUANT_RANGES 2
-#define NUQ_KNOTS 3
-
-typedef tran_low_t dequant_val_type_nuq[NUQ_KNOTS + 1];
-typedef tran_low_t cuml_bins_type_nuq[NUQ_KNOTS];
-void av1_get_dequant_val_nuq(int q, int band, tran_low_t *dq,
-                             tran_low_t *cuml_bins, int dq_off_index);
-tran_low_t av1_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq);
-tran_low_t av1_dequant_coeff_nuq(int v, int q, const tran_low_t *dq);
-
-static INLINE int qindex_to_qrange(int qindex) {
-  return (qindex < 140 ? 1 : 0);
-}
-
-static INLINE int get_dq_profile_from_ctx(int qindex, int q_ctx, int is_inter,
-                                          PLANE_TYPE plane_type) {
-  // intra/inter, Y/UV, ctx, qrange
-  static const int
-      def_dq_profile_lookup[REF_TYPES][PLANE_TYPES][COEFF_CONTEXTS0]
-                           [QUANT_RANGES] = {
-                             {
-                                 // intra
-                                 { { 2, 1 }, { 2, 1 }, { 2, 1 } },  // Y
-                                 { { 3, 1 }, { 3, 1 }, { 3, 1 } },  // UV
-                             },
-                             {
-                                 // inter
-                                 { { 3, 1 }, { 2, 1 }, { 2, 1 } },  // Y
-                                 { { 3, 1 }, { 3, 1 }, { 3, 1 } },  // UV
-                             },
-                           };
-  if (!qindex) return 0;  // lossless
-  return def_dq_profile_lookup[is_inter][plane_type][q_ctx]
-                              [qindex_to_qrange(qindex)];
-}
-#endif  // CONFIG_NEW_QUANT
-
-#if CONFIG_PVQ
-extern const int OD_QM8_Q4_FLAT[];
-extern const int OD_QM8_Q4_HVS[];
-#endif
+void av1_qm_init(struct AV1Common *cm);
+const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qindex, int comp,
+                             TX_SIZE tx_size);
+const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp,
+                            TX_SIZE tx_size);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
index a1a22a0af..b6ac436fb 100644
--- a/third_party/aom/av1/common/reconinter.c
+++ b/third_party/aom/av1/common/reconinter.c
@@ -13,208 +13,157 @@
 #include <stdio.h>
 #include <limits.h>
 
-#include "./aom_scale_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_integer.h"
 #include "aom_dsp/blend.h"
 
 #include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/reconintra.h"
-#if CONFIG_MOTION_VAR
 #include "av1/common/onyxc_int.h"
 #include "av1/common/obmc.h"
-#endif  // CONFIG_MOTION_VAR
 
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+#define USE_PRECOMPUTED_WEDGE_MASK 1
+#define USE_PRECOMPUTED_WEDGE_SIGN 1
+
 // This function will determine whether or not to create a warped
-// prediction and return the appropriate motion model depending
-// on the configuration. Behavior will change with different
-// combinations of GLOBAL_MOTION, WARPED_MOTION and MOTION_VAR.
-static INLINE int allow_warp(const MODE_INFO *const mi,
-                             const WarpTypesAllowed *const warp_types,
-#if CONFIG_GLOBAL_MOTION
-                             const WarpedMotionParams *const gm_params,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_MOTION_VAR
-                             int build_for_obmc,
-#endif  // CONFIG_MOTION_VAR
-                             WarpedMotionParams *final_warp_params) {
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
-  *final_warp_params = default_warp_params;
-
-// Only global motion configured
-#if CONFIG_GLOBAL_MOTION && !CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR
-  (void)mbmi;
-  if (warp_types->global_warp_allowed) {
-    memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
-    return 1;
-  }
-#endif  // CONFIG_GLOBAL_MOTION && !CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR
+// prediction.
+int av1_allow_warp(const MB_MODE_INFO *const mbmi,
+                   const WarpTypesAllowed *const warp_types,
+                   const WarpedMotionParams *const gm_params,
+                   int build_for_obmc, int x_scale, int y_scale,
+                   WarpedMotionParams *final_warp_params) {
+  if (x_scale != SCALE_SUBPEL_SHIFTS || y_scale != SCALE_SUBPEL_SHIFTS)
+    return 0;
 
-// Only warped motion configured
-#if CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION && !CONFIG_MOTION_VAR
-  if (warp_types->local_warp_allowed) {
-    memcpy(final_warp_params, &mbmi->wm_params[0], sizeof(*final_warp_params));
-    return 1;
-  }
-#endif  // CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION && !CONFIG_MOTION_VAR
-
-// Warped and global motion configured
-#if CONFIG_GLOBAL_MOTION && CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR
-  // When both are enabled, warped will take priority. The global parameters
-  // will only be used to compute projection samples to find the warped model.
-  // Note that when a block chooses global, it will not be possible to
-  // select WARPED_CAUSAL.
-  if (warp_types->local_warp_allowed) {
-    memcpy(final_warp_params, &mbmi->wm_params[0], sizeof(*final_warp_params));
-    return 1;
-  } else if (warp_types->global_warp_allowed) {
-    memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
-    return 1;
-  }
-#endif  // CONFIG_GLOBAL_MOTION && CONFIG_WARPED_MOTION && !CONFIG_MOTION_VAR
-
-// Motion var and global motion configured
-#if CONFIG_GLOBAL_MOTION && CONFIG_MOTION_VAR && !CONFIG_WARPED_MOTION
-  // We warp if either case is true:
-  //   1.) We are predicting a block which uses global motion
-  //   2.) We are predicting a neighboring block of a block using OBMC,
-  //       the neighboring block uses global motion, and we have enabled
-  //       WARP_GM_NEIGHBORS_WITH_OBMC
-  (void)mbmi;
-  if (warp_types->global_warp_allowed &&
-      (WARP_GM_NEIGHBORS_WITH_OBMC || !build_for_obmc)) {
-    memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
-    return 1;
-  }
-#endif  // CONFIG_GLOBAL_MOTION && CONFIG_MOTION_VAR && !CONFIG_WARPED_MOTION
-
-// Motion var and warped motion configured
-#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && !CONFIG_GLOBAL_MOTION
-  // We warp if either case is true:
-  //   1.) We are predicting a block with motion mode WARPED_CAUSAL
-  //   2.) We are predicting a neighboring block of a block using OBMC,
-  //       the neighboring block has mode WARPED_CAUSAL, and we have enabled
-  //       WARP_WM_NEIGHBORS_WITH_OBMC
-  if (warp_types->local_warp_allowed) {
-    if ((build_for_obmc && WARP_WM_NEIGHBORS_WITH_OBMC) || (!build_for_obmc)) {
-      memcpy(final_warp_params, &mbmi->wm_params[0],
-             sizeof(*final_warp_params));
-      return 1;
-    }
-  }
-#endif  // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && !CONFIG_GLOBAL_MOTION
+  if (final_warp_params != NULL) *final_warp_params = default_warp_params;
+
+  if (build_for_obmc) return 0;
 
-// Motion var, warped motion and global motion all configured
-#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && CONFIG_GLOBAL_MOTION
-  if (warp_types->local_warp_allowed) {
-    if ((build_for_obmc && WARP_WM_NEIGHBORS_WITH_OBMC) || (!build_for_obmc)) {
+  if (warp_types->local_warp_allowed && !mbmi->wm_params[0].invalid) {
+    if (final_warp_params != NULL)
       memcpy(final_warp_params, &mbmi->wm_params[0],
              sizeof(*final_warp_params));
-      return 1;
-    }
-  } else if (warp_types->global_warp_allowed &&
-             (WARP_GM_NEIGHBORS_WITH_OBMC || !build_for_obmc)) {
-    memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
+    return 1;
+  } else if (warp_types->global_warp_allowed && !gm_params->invalid) {
+    if (final_warp_params != NULL)
+      memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
     return 1;
   }
-#endif  // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR && CONFIG_GLOBAL_MOTION
 
   return 0;
 }
-#endif  // CONFIG_GLOBAL_MOTION ||CONFIG_WARPED_MOTION
 
-static INLINE void av1_make_inter_predictor(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const int subpel_x, const int subpel_y, const struct scale_factors *sf,
-    int w, int h, ConvolveParams *conv_params, InterpFilters interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-    const WarpTypesAllowed *warp_types, int p_col, int p_row, int plane,
-    int ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR
-    const MODE_INFO *mi, int build_for_obmc,
-#endif
-    int xs, int ys, const MACROBLOCKD *xd) {
-  (void)xd;
-
-#if !CONFIG_MOTION_VAR
-  const MODE_INFO *mi = xd->mi[0];
-  (void)mi;
-#endif  // CONFIG_MOTION_VAR
-
-// Make sure the selected motion mode is valid for this configuration
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  assert_motion_mode_valid(mi->mbmi.motion_mode,
-#if CONFIG_GLOBAL_MOTION
-                           0, xd->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-                           xd,
-#endif
-                           mi);
-#endif  // CONFIG MOTION_VAR || CONFIG_WARPED_MOTION
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+                              int dst_stride, const SubpelParams *subpel_params,
+                              const struct scale_factors *sf, int w, int h,
+                              ConvolveParams *conv_params,
+                              InterpFilters interp_filters,
+                              const WarpTypesAllowed *warp_types, int p_col,
+                              int p_row, int plane, int ref,
+                              const MB_MODE_INFO *mi, int build_for_obmc,
+                              const MACROBLOCKD *xd, int can_use_previous) {
+  // Make sure the selected motion mode is valid for this configuration
+  assert_motion_mode_valid(mi->motion_mode, xd->global_motion, xd, mi,
+                           can_use_previous);
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
 
-#if CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
   WarpedMotionParams final_warp_params;
-  const int do_warp = allow_warp(
-      mi, warp_types,
-#if CONFIG_GLOBAL_MOTION
-#if CONFIG_COMPOUND_SINGLEREF
-      // TODO(zoeliu): To further check the single
-      // ref comp mode to work together with
-      //               global motion.
-      has_second_ref(&mi->mbmi) ? &xd->global_motion[mi->mbmi.ref_frame[ref]]
-                                : &xd->global_motion[mi->mbmi.ref_frame[0]],
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-      &xd->global_motion[mi->mbmi.ref_frame[ref]],
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_MOTION_VAR
-      build_for_obmc,
-#endif  // CONFIG_MOTION_VAR
-      &final_warp_params);
-  if (do_warp
-#if CONFIG_AMVR
-      && xd->cur_frame_mv_precision_level == 0
-#endif
-      ) {
+  const int do_warp =
+      (w >= 8 && h >= 8 &&
+       av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]],
+                      build_for_obmc, subpel_params->xs, subpel_params->ys,
+                      &final_warp_params));
+  if (do_warp && xd->cur_frame_force_integer_mv == 0) {
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     const struct buf_2d *const pre_buf = &pd->pre[ref];
     av1_warp_plane(&final_warp_params,
-#if CONFIG_HIGHBITDEPTH
                    xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
-#endif  // CONFIG_HIGHBITDEPTH
                    pre_buf->buf0, pre_buf->width, pre_buf->height,
                    pre_buf->stride, dst, p_col, p_row, w, h, dst_stride,
-                   pd->subsampling_x, pd->subsampling_y, xs, ys, conv_params);
-    return;
-  }
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
-                           sf, w, h, conv_params, interp_filters, xs, ys,
-                           xd->bd);
-    return;
+                   pd->subsampling_x, pd->subsampling_y, conv_params);
+  } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
+                           w, h, conv_params, interp_filters, xd->bd);
+  } else {
+    inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h,
+                    conv_params, interp_filters);
   }
-#endif  // CONFIG_HIGHBITDEPTH
-  inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, sf, w,
-                  h, conv_params, interp_filters, xs, ys);
 }
 
-#define NSMOOTHERS 1
+#if USE_PRECOMPUTED_WEDGE_MASK
+static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  6,  18,
+  37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_oblique_even[MASK_MASTER_SIZE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  4,  11, 27,
+  46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  7,  21,
+  43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
 
-// [smoother][negative][direction]
-DECLARE_ALIGNED(16, static uint8_t,
-                wedge_mask_obl[NSMOOTHERS][2][WEDGE_DIRECTIONS]
-                              [MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
+static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) {
+  if (shift >= 0) {
+    memcpy(dst + shift, src, width - shift);
+    memset(dst, src[0], shift);
+  } else {
+    shift = -shift;
+    memcpy(dst, src + shift, width - shift);
+    memset(dst + width - shift, src[width - 1], shift);
+  }
+}
+#endif  // USE_PRECOMPUTED_WEDGE_MASK
 
+#if USE_PRECOMPUTED_WEDGE_SIGN
+/* clang-format off */
+DECLARE_ALIGNED(16, static uint8_t,
+                wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+};
+/* clang-format on */
+#else
 DECLARE_ALIGNED(16, static uint8_t,
                 wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]);
+#endif  // USE_PRECOMPUTED_WEDGE_SIGN
+
+// [negative][direction]
+DECLARE_ALIGNED(
+    16, static uint8_t,
+    wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
 
 // 4 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound
 // on the sum of all mask sizes up to an including MAX_WEDGE_SQUARE.
@@ -223,88 +172,6 @@ DECLARE_ALIGNED(16, static uint8_t,
 
 static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2];
 
-// Some unused wedge codebooks left temporarily to facilitate experiments.
-// To be removed when settled.
-/*
-static wedge_code_type wedge_codebook_8_hgtw[8] = {
-  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
-  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
-  { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
-  { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
-};
-
-static wedge_code_type wedge_codebook_8_hltw[8] = {
-  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
-  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
-  { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
-  { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
-};
-
-static wedge_code_type wedge_codebook_8_heqw[8] = {
-  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
-  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
-  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
-  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 6, 4 },
-};
-
-static const wedge_code_type wedge_codebook_32_hgtw[32] = {
-  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
-  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
-  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
-  { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
-  { WEDGE_OBLIQUE27, 4, 1 },  { WEDGE_OBLIQUE27, 4, 2 },
-  { WEDGE_OBLIQUE27, 4, 3 },  { WEDGE_OBLIQUE27, 4, 5 },
-  { WEDGE_OBLIQUE27, 4, 6 },  { WEDGE_OBLIQUE27, 4, 7 },
-  { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
-  { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
-  { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
-  { WEDGE_OBLIQUE63, 1, 4 },  { WEDGE_OBLIQUE63, 2, 4 },
-  { WEDGE_OBLIQUE63, 3, 4 },  { WEDGE_OBLIQUE63, 5, 4 },
-  { WEDGE_OBLIQUE63, 6, 4 },  { WEDGE_OBLIQUE63, 7, 4 },
-  { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
-  { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
-  { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
-};
-
-static const wedge_code_type wedge_codebook_32_hltw[32] = {
-  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
-  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
-  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 4, 4 },
-  { WEDGE_VERTICAL, 6, 4 },   { WEDGE_HORIZONTAL, 4, 4 },
-  { WEDGE_OBLIQUE27, 4, 1 },  { WEDGE_OBLIQUE27, 4, 2 },
-  { WEDGE_OBLIQUE27, 4, 3 },  { WEDGE_OBLIQUE27, 4, 5 },
-  { WEDGE_OBLIQUE27, 4, 6 },  { WEDGE_OBLIQUE27, 4, 7 },
-  { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
-  { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
-  { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
-  { WEDGE_OBLIQUE63, 1, 4 },  { WEDGE_OBLIQUE63, 2, 4 },
-  { WEDGE_OBLIQUE63, 3, 4 },  { WEDGE_OBLIQUE63, 5, 4 },
-  { WEDGE_OBLIQUE63, 6, 4 },  { WEDGE_OBLIQUE63, 7, 4 },
-  { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
-  { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
-  { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
-};
-
-static const wedge_code_type wedge_codebook_32_heqw[32] = {
-  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
-  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
-  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
-  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 6, 4 },
-  { WEDGE_OBLIQUE27, 4, 1 },  { WEDGE_OBLIQUE27, 4, 2 },
-  { WEDGE_OBLIQUE27, 4, 3 },  { WEDGE_OBLIQUE27, 4, 5 },
-  { WEDGE_OBLIQUE27, 4, 6 },  { WEDGE_OBLIQUE27, 4, 7 },
-  { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
-  { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
-  { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
-  { WEDGE_OBLIQUE63, 1, 4 },  { WEDGE_OBLIQUE63, 2, 4 },
-  { WEDGE_OBLIQUE63, 3, 4 },  { WEDGE_OBLIQUE63, 5, 4 },
-  { WEDGE_OBLIQUE63, 6, 4 },  { WEDGE_OBLIQUE63, 7, 4 },
-  { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
-  { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
-  { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
-};
-*/
-
 static const wedge_code_type wedge_codebook_16_hgtw[16] = {
   { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
   { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
@@ -339,78 +206,37 @@ static const wedge_code_type wedge_codebook_16_heqw[16] = {
 };
 
 const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  { 0, NULL, NULL, 0, NULL },
-  { 0, NULL, NULL, 0, NULL },
-  { 0, NULL, NULL, 0, NULL },
-#endif  // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  { 0, NULL, NULL, 0, NULL },
-  { 0, NULL, NULL, 0, NULL },
-  { 0, NULL, NULL, 0, NULL },
-#if CONFIG_WEDGE
-  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], 0,
+  { 0, NULL, NULL, NULL },
+  { 0, NULL, NULL, NULL },
+  { 0, NULL, NULL, NULL },
+  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8],
     wedge_masks[BLOCK_8X8] },
-  { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], 0,
+  { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16],
     wedge_masks[BLOCK_8X16] },
-  { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], 0,
+  { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8],
     wedge_masks[BLOCK_16X8] },
-  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], 0,
+  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16],
     wedge_masks[BLOCK_16X16] },
-  { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], 0,
+  { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32],
     wedge_masks[BLOCK_16X32] },
-  { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], 0,
+  { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16],
     wedge_masks[BLOCK_32X16] },
-  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], 0,
+  { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32],
     wedge_masks[BLOCK_32X32] },
-#else
-  { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], 0,
-    wedge_masks[BLOCK_8X8] },
-  { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], 0,
-    wedge_masks[BLOCK_8X16] },
-  { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], 0,
-    wedge_masks[BLOCK_16X8] },
-  { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], 0,
-    wedge_masks[BLOCK_16X16] },
-  { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], 0,
-    wedge_masks[BLOCK_16X32] },
-  { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], 0,
-    wedge_masks[BLOCK_32X16] },
-  { 0, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], 0,
-    wedge_masks[BLOCK_32X32] },
-#endif  // CONFIG_WEDGE
-  { 0, NULL, NULL, 0, NULL },
-  { 0, NULL, NULL, 0, NULL },
-  { 0, NULL, NULL, 0, NULL },
-#if CONFIG_EXT_PARTITION
-  { 0, NULL, NULL, 0, NULL },
-  { 0, NULL, NULL, 0, NULL },
-  { 0, NULL, NULL, 0, NULL },
-#endif  // CONFIG_EXT_PARTITION
-#if CONFIG_WEDGE
-  { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_4X16], 0,
-    wedge_masks[BLOCK_4X16] },
-  { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X4], 0,
-    wedge_masks[BLOCK_16X4] },
-  { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], 0,
-    wedge_masks[BLOCK_8X32] },
-  { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], 0,
-    wedge_masks[BLOCK_32X8] },
-#else
-  { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_4X16], 0,
-    wedge_masks[BLOCK_4X16] },
-  { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X4], 0,
-    wedge_masks[BLOCK_16X4] },
-  { 0, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], 0,
+  { 0, NULL, NULL, NULL },
+  { 0, NULL, NULL, NULL },
+  { 0, NULL, NULL, NULL },
+  { 0, NULL, NULL, NULL },
+  { 0, NULL, NULL, NULL },
+  { 0, NULL, NULL, NULL },
+  { 0, NULL, NULL, NULL },
+  { 0, NULL, NULL, NULL },
+  { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32],
     wedge_masks[BLOCK_8X32] },
-  { 0, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], 0,
+  { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8],
     wedge_masks[BLOCK_32X8] },
-#endif  // CONFIG_WEDGE
-  { 0, NULL, NULL, 0, NULL },
-  { 0, NULL, NULL, 0, NULL },
-#if CONFIG_EXT_PARTITION
-  { 0, NULL, NULL, 0, NULL },
-  { 0, NULL, NULL, 0, NULL },
-#endif  // CONFIG_EXT_PARTITION
+  { 0, NULL, NULL, NULL },
+  { 0, NULL, NULL, NULL },
 };
 
 static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
@@ -420,7 +246,6 @@ static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
   const int bw = block_size_wide[sb_type];
   const wedge_code_type *a =
       wedge_params_lookup[sb_type].codebook + wedge_index;
-  const int smoother = wedge_params_lookup[sb_type].smoother;
   int woff, hoff;
   const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index];
 
@@ -428,339 +253,231 @@ static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
          wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
   woff = (a->x_offset * bw) >> 3;
   hoff = (a->y_offset * bh) >> 3;
-  master = wedge_mask_obl[smoother][neg ^ wsignflip][a->direction] +
+  master = wedge_mask_obl[neg ^ wsignflip][a->direction] +
            MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
            MASK_MASTER_SIZE / 2 - woff;
   return master;
 }
 
-const uint8_t *av1_get_soft_mask(int wedge_index, int wedge_sign,
-                                 BLOCK_SIZE sb_type, int offset_x,
-                                 int offset_y) {
-  const uint8_t *mask =
-      get_wedge_mask_inplace(wedge_index, wedge_sign, sb_type);
-  if (mask) mask -= (offset_x + offset_y * MASK_MASTER_STRIDE);
-  return mask;
-}
-
-#if CONFIG_COMPOUND_SEGMENT
-static uint8_t *invert_mask(uint8_t *mask_inv_buffer, const uint8_t *const mask,
-                            int h, int w, int stride) {
-  int i, j;
-
-  for (i = 0; i < h; ++i)
-    for (j = 0; j < w; ++j) {
-      mask_inv_buffer[i * stride + j] =
-          AOM_BLEND_A64_MAX_ALPHA - mask[i * stride + j];
-    }
-  return mask_inv_buffer;
-}
-#endif  // CONFIG_COMPOUND_SEGMENT
-
-const uint8_t *av1_get_compound_type_mask_inverse(
-    const INTERINTER_COMPOUND_DATA *const comp_data,
-#if CONFIG_COMPOUND_SEGMENT
-    uint8_t *mask_buffer, int h, int w, int stride,
-#endif
-    BLOCK_SIZE sb_type) {
-  assert(is_masked_compound_type(comp_data->interinter_compound_type));
-  (void)sb_type;
-  switch (comp_data->interinter_compound_type) {
-#if CONFIG_WEDGE
-    case COMPOUND_WEDGE:
-      return av1_get_contiguous_soft_mask(comp_data->wedge_index,
-                                          !comp_data->wedge_sign, sb_type);
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-    case COMPOUND_SEG:
-      return invert_mask(mask_buffer, comp_data->seg_mask, h, w, stride);
-#endif  // CONFIG_COMPOUND_SEGMENT
-    default: assert(0); return NULL;
-  }
-}
-
 const uint8_t *av1_get_compound_type_mask(
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) {
-  assert(is_masked_compound_type(comp_data->interinter_compound_type));
+  assert(is_masked_compound_type(comp_data->type));
   (void)sb_type;
-  switch (comp_data->interinter_compound_type) {
-#if CONFIG_WEDGE
+  switch (comp_data->type) {
     case COMPOUND_WEDGE:
       return av1_get_contiguous_soft_mask(comp_data->wedge_index,
                                           comp_data->wedge_sign, sb_type);
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-    case COMPOUND_SEG: return comp_data->seg_mask;
-#endif  // CONFIG_COMPOUND_SEGMENT
+    case COMPOUND_DIFFWTD: return comp_data->seg_mask;
     default: assert(0); return NULL;
   }
 }
 
-#if CONFIG_COMPOUND_SEGMENT
-#if COMPOUND_SEGMENT_TYPE == 0
-static void uniform_mask(uint8_t *mask, int which_inverse, BLOCK_SIZE sb_type,
-                         int h, int w, int mask_val) {
-  int i, j;
-  int block_stride = block_size_wide[sb_type];
-  for (i = 0; i < h; ++i)
-    for (j = 0; j < w; ++j) {
-      mask[i * block_stride + j] =
-          which_inverse ? AOM_BLEND_A64_MAX_ALPHA - mask_val : mask_val;
-    }
-}
-
-void build_compound_seg_mask(uint8_t *mask, SEG_MASK_TYPE mask_type,
-                             const uint8_t *src0, int src0_stride,
-                             const uint8_t *src1, int src1_stride,
-                             BLOCK_SIZE sb_type, int h, int w) {
-  (void)src0;
-  (void)src1;
-  (void)src0_stride;
-  (void)src1_stride;
-  switch (mask_type) {
-    case UNIFORM_45: uniform_mask(mask, 0, sb_type, h, w, 45); break;
-    case UNIFORM_45_INV: uniform_mask(mask, 1, sb_type, h, w, 45); break;
-    default: assert(0);
-  }
-}
-
-#if CONFIG_HIGHBITDEPTH
-void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type,
-                                    const uint8_t *src0, int src0_stride,
-                                    const uint8_t *src1, int src1_stride,
-                                    BLOCK_SIZE sb_type, int h, int w, int bd) {
-  (void)src0;
-  (void)src1;
-  (void)src0_stride;
-  (void)src1_stride;
-  (void)bd;
-  switch (mask_type) {
-    case UNIFORM_45: uniform_mask(mask, 0, sb_type, h, w, 45); break;
-    case UNIFORM_45_INV: uniform_mask(mask, 1, sb_type, h, w, 45); break;
-    default: assert(0);
-  }
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-#elif COMPOUND_SEGMENT_TYPE == 1
-#define DIFF_FACTOR 16
-
-#if CONFIG_CONVOLVE_ROUND
-static void diffwtd_mask_d32(uint8_t *mask, int which_inverse, int mask_base,
-                             const int32_t *src0, int src0_stride,
-                             const int32_t *src1, int src1_stride,
-                             BLOCK_SIZE sb_type, int h, int w,
-                             ConvolveParams *conv_params, int bd) {
+static void diffwtd_mask_d16(uint8_t *mask, int which_inverse, int mask_base,
+                             const CONV_BUF_TYPE *src0, int src0_stride,
+                             const CONV_BUF_TYPE *src1, int src1_stride, int h,
+                             int w, ConvolveParams *conv_params, int bd) {
   int round =
       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
   int i, j, m, diff;
-  int block_stride = block_size_wide[sb_type];
   for (i = 0; i < h; ++i) {
     for (j = 0; j < w; ++j) {
       diff = abs(src0[i * src0_stride + j] - src1[i * src1_stride + j]);
       diff = ROUND_POWER_OF_TWO(diff, round);
       m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
-      mask[i * block_stride + j] =
-          which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
+      mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
     }
   }
 }
 
-static void build_compound_seg_mask_d32(uint8_t *mask, SEG_MASK_TYPE mask_type,
-                                        const int32_t *src0, int src0_stride,
-                                        const int32_t *src1, int src1_stride,
-                                        BLOCK_SIZE sb_type, int h, int w,
-                                        ConvolveParams *conv_params, int bd) {
+void av1_build_compound_diffwtd_mask_d16_c(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
   switch (mask_type) {
     case DIFFWTD_38:
-      diffwtd_mask_d32(mask, 0, 38, src0, src0_stride, src1, src1_stride,
-                       sb_type, h, w, conv_params, bd);
+      diffwtd_mask_d16(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w,
+                       conv_params, bd);
       break;
     case DIFFWTD_38_INV:
-      diffwtd_mask_d32(mask, 1, 38, src0, src0_stride, src1, src1_stride,
-                       sb_type, h, w, conv_params, bd);
+      diffwtd_mask_d16(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w,
+                       conv_params, bd);
       break;
     default: assert(0);
   }
 }
-#endif
 
 static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base,
                          const uint8_t *src0, int src0_stride,
-                         const uint8_t *src1, int src1_stride,
-                         BLOCK_SIZE sb_type, int h, int w) {
+                         const uint8_t *src1, int src1_stride, int h, int w) {
   int i, j, m, diff;
-  int block_stride = block_size_wide[sb_type];
   for (i = 0; i < h; ++i) {
     for (j = 0; j < w; ++j) {
       diff =
           abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
       m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
-      mask[i * block_stride + j] =
-          which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
+      mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
     }
   }
 }
 
-void build_compound_seg_mask(uint8_t *mask, SEG_MASK_TYPE mask_type,
-                             const uint8_t *src0, int src0_stride,
-                             const uint8_t *src1, int src1_stride,
-                             BLOCK_SIZE sb_type, int h, int w) {
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask,
+                                       DIFFWTD_MASK_TYPE mask_type,
+                                       const uint8_t *src0, int src0_stride,
+                                       const uint8_t *src1, int src1_stride,
+                                       int h, int w) {
   switch (mask_type) {
     case DIFFWTD_38:
-      diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, sb_type,
-                   h, w);
+      diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w);
       break;
     case DIFFWTD_38_INV:
-      diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, sb_type,
-                   h, w);
+      diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w);
       break;
     default: assert(0);
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
-static void diffwtd_mask_highbd(uint8_t *mask, int which_inverse, int mask_base,
-                                const uint16_t *src0, int src0_stride,
-                                const uint16_t *src1, int src1_stride,
-                                BLOCK_SIZE sb_type, int h, int w, int bd) {
-  int i, j, m, diff;
-  int block_stride = block_size_wide[sb_type];
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      diff = abs((int)src0[i * src0_stride + j] -
-                 (int)src1[i * src1_stride + j]) >>
-             (bd - 8);
-      m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
-      mask[i * block_stride + j] =
-          which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
+static AOM_FORCE_INLINE void diffwtd_mask_highbd(
+    uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0,
+    int src0_stride, const uint16_t *src1, int src1_stride, int h, int w,
+    const unsigned int bd) {
+  assert(bd >= 8);
+  if (bd == 8) {
+    if (which_inverse) {
+      for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+          int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
+          unsigned int m = negative_to_zero(mask_base + diff);
+          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+          mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
+        }
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += w;
+      }
+    } else {
+      for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+          int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
+          unsigned int m = negative_to_zero(mask_base + diff);
+          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+          mask[j] = m;
+        }
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += w;
+      }
+    }
+  } else {
+    const unsigned int bd_shift = bd - 8;
+    if (which_inverse) {
+      for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+          int diff =
+              (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
+          unsigned int m = negative_to_zero(mask_base + diff);
+          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+          mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
+        }
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += w;
+      }
+    } else {
+      for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+          int diff =
+              (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
+          unsigned int m = negative_to_zero(mask_base + diff);
+          m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+          mask[j] = m;
+        }
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += w;
+      }
     }
   }
 }
 
-void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type,
-                                    const uint8_t *src0, int src0_stride,
-                                    const uint8_t *src1, int src1_stride,
-                                    BLOCK_SIZE sb_type, int h, int w, int bd) {
+void av1_build_compound_diffwtd_mask_highbd_c(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+    int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+    int bd) {
   switch (mask_type) {
     case DIFFWTD_38:
       diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0), src0_stride,
-                          CONVERT_TO_SHORTPTR(src1), src1_stride, sb_type, h, w,
-                          bd);
+                          CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd);
       break;
     case DIFFWTD_38_INV:
       diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0), src0_stride,
-                          CONVERT_TO_SHORTPTR(src1), src1_stride, sb_type, h, w,
-                          bd);
+                          CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd);
       break;
     default: assert(0);
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // COMPOUND_SEGMENT_TYPE
-#endif  // CONFIG_COMPOUND_SEGMENT
-
-#if MASK_MASTER_SIZE == 64
-static const uint8_t wedge_master_oblique_odd[NSMOOTHERS][MASK_MASTER_SIZE] = {
-  {
-      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  6,  18,
-      37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-      64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-  }
-};
-static const uint8_t wedge_master_oblique_even[NSMOOTHERS][MASK_MASTER_SIZE] = {
-  {
-      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  4,  11, 27,
-      46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-      64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-  }
-};
-static const uint8_t wedge_master_vertical[NSMOOTHERS][MASK_MASTER_SIZE] = { {
-    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  7,  21,
-    43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-} };
-
-static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) {
-  if (shift >= 0) {
-    memcpy(dst + shift, src, width - shift);
-    memset(dst, src[0], shift);
-  } else {
-    shift = -shift;
-    memcpy(dst, src + shift, width - shift);
-    memset(dst + width - shift, src[width - 1], shift);
-  }
-}
-#else
-static const double smoother_param[NSMOOTHERS] = { 3.0 };
-#endif  // MASK_MASTER_SIZE == 64
 
 static void init_wedge_master_masks() {
-  int i, j, s;
+  int i, j;
   const int w = MASK_MASTER_SIZE;
   const int h = MASK_MASTER_SIZE;
   const int stride = MASK_MASTER_STRIDE;
-  for (s = 0; s < NSMOOTHERS; s++) {
 // Note: index [0] stores the masters, and [1] its complement.
-#if MASK_MASTER_SIZE == 64
-    // Generate prototype by shifting the masters
-    int shift = h / 4;
-    for (i = 0; i < h; i += 2) {
-      shift_copy(wedge_master_oblique_even[s],
-                 &wedge_mask_obl[s][0][WEDGE_OBLIQUE63][i * stride], shift,
-                 MASK_MASTER_SIZE);
-      shift--;
-      shift_copy(wedge_master_oblique_odd[s],
-                 &wedge_mask_obl[s][0][WEDGE_OBLIQUE63][(i + 1) * stride],
-                 shift, MASK_MASTER_SIZE);
-      memcpy(&wedge_mask_obl[s][0][WEDGE_VERTICAL][i * stride],
-             wedge_master_vertical[s],
-             MASK_MASTER_SIZE * sizeof(wedge_master_vertical[s][0]));
-      memcpy(&wedge_mask_obl[s][0][WEDGE_VERTICAL][(i + 1) * stride],
-             wedge_master_vertical[s],
-             MASK_MASTER_SIZE * sizeof(wedge_master_vertical[s][0]));
-    }
+#if USE_PRECOMPUTED_WEDGE_MASK
+  // Generate prototype by shifting the masters
+  int shift = h / 4;
+  for (i = 0; i < h; i += 2) {
+    shift_copy(wedge_master_oblique_even,
+               &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift,
+               MASK_MASTER_SIZE);
+    shift--;
+    shift_copy(wedge_master_oblique_odd,
+               &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift,
+               MASK_MASTER_SIZE);
+    memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride],
+           wedge_master_vertical,
+           MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
+    memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride],
+           wedge_master_vertical,
+           MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
+  }
 #else
-    const int a[2] = { 2, 1 };
-    const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
-    for (i = 0; i < h; i++) {
-      for (j = 0; j < w; ++j) {
-        int x = (2 * j + 1 - w);
-        int y = (2 * i + 1 - h);
-        double d = (a[0] * x + a[1] * y) / asqrt;
-        const int msk = (int)rint((1.0 + tanh(d / smoother_param[s])) * 32);
-        wedge_mask_obl[s][0][WEDGE_OBLIQUE63][i * stride + j] = msk;
-        const int mskx = (int)rint((1.0 + tanh(x / smoother_param[s])) * 32);
-        wedge_mask_obl[s][0][WEDGE_VERTICAL][i * stride + j] = mskx;
-      }
+  static const double smoother_param = 2.85;
+  const int a[2] = { 2, 1 };
+  const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; ++j) {
+      int x = (2 * j + 1 - w);
+      int y = (2 * i + 1 - h);
+      double d = (a[0] * x + a[1] * y) / asqrt;
+      const int msk = (int)rint((1.0 + tanh(d / smoother_param)) * 32);
+      wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk;
+      const int mskx = (int)rint((1.0 + tanh(x / smoother_param)) * 32);
+      wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j] = mskx;
     }
-#endif  // MASK_MASTER_SIZE == 64
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int msk = wedge_mask_obl[s][0][WEDGE_OBLIQUE63][i * stride + j];
-        wedge_mask_obl[s][0][WEDGE_OBLIQUE27][j * stride + i] = msk;
-        wedge_mask_obl[s][0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
-            wedge_mask_obl[s][0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
-                (1 << WEDGE_WEIGHT_BITS) - msk;
-        wedge_mask_obl[s][1][WEDGE_OBLIQUE63][i * stride + j] =
-            wedge_mask_obl[s][1][WEDGE_OBLIQUE27][j * stride + i] =
-                (1 << WEDGE_WEIGHT_BITS) - msk;
-        wedge_mask_obl[s][1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
-            wedge_mask_obl[s][1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
-                msk;
-        const int mskx = wedge_mask_obl[s][0][WEDGE_VERTICAL][i * stride + j];
-        wedge_mask_obl[s][0][WEDGE_HORIZONTAL][j * stride + i] = mskx;
-        wedge_mask_obl[s][1][WEDGE_VERTICAL][i * stride + j] =
-            wedge_mask_obl[s][1][WEDGE_HORIZONTAL][j * stride + i] =
-                (1 << WEDGE_WEIGHT_BITS) - mskx;
-      }
+  }
+#endif  // USE_PRECOMPUTED_WEDGE_MASK
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
+      wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk;
+      wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+          wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+              (1 << WEDGE_WEIGHT_BITS) - msk;
+      wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] =
+          wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] =
+              (1 << WEDGE_WEIGHT_BITS) - msk;
+      wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+          wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk;
+      const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j];
+      wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx;
+      wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] =
+          wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] =
+              (1 << WEDGE_WEIGHT_BITS) - mskx;
     }
   }
 }
 
+#if !USE_PRECOMPUTED_WEDGE_SIGN
 // If the signs for the wedges for various blocksizes are
 // inconsistent flip the sign flag. Do it only once for every
 // wedge codebook.
@@ -774,28 +491,29 @@ static void init_wedge_signs() {
     const int wbits = wedge_params.bits;
     const int wtypes = 1 << wbits;
     int i, w;
-    if (wbits == 0) continue;
-    for (w = 0; w < wtypes; ++w) {
-      // Get the mask master, i.e. index [0]
-      const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
-      int avg = 0;
-      for (i = 0; i < bw; ++i) avg += mask[i];
-      for (i = 1; i < bh; ++i) avg += mask[i * MASK_MASTER_STRIDE];
-      avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1);
-      // Default sign of this wedge is 1 if the average < 32, 0 otherwise.
-      // If default sign is 1:
-      //   If sign requested is 0, we need to flip the sign and return
-      //   the complement i.e. index [1] instead. If sign requested is 1
-      //   we need to flip the sign and return index [0] instead.
-      // If default sign is 0:
-      //   If sign requested is 0, we need to return index [0] the master
-      //   if sign requested is 1, we need to return the complement index [1]
-      //   instead.
-      wedge_params.signflip[w] = (avg < 32);
-      // printf("%d[%d] = %d\n", sb_type, w, wedge_params.signflip[w]);
+    if (wbits) {
+      for (w = 0; w < wtypes; ++w) {
+        // Get the mask master, i.e. index [0]
+        const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
+        int avg = 0;
+        for (i = 0; i < bw; ++i) avg += mask[i];
+        for (i = 1; i < bh; ++i) avg += mask[i * MASK_MASTER_STRIDE];
+        avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1);
+        // Default sign of this wedge is 1 if the average < 32, 0 otherwise.
+        // If default sign is 1:
+        //   If sign requested is 0, we need to flip the sign and return
+        //   the complement i.e. index [1] instead. If sign requested is 1
+        //   we need to flip the sign and return index [0] instead.
+        // If default sign is 0:
+        //   If sign requested is 0, we need to return index [0] the master
+        //   if sign requested is 1, we need to return the complement index [1]
+        //   instead.
+        wedge_params.signflip[w] = (avg < 32);
+      }
     }
   }
 }
+#endif  // !USE_PRECOMPUTED_WEDGE_SIGN
 
 static void init_wedge_masks() {
   uint8_t *dst = wedge_mask_buf;
@@ -830,83 +548,32 @@ static void init_wedge_masks() {
 // Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
 void av1_init_wedge_masks() {
   init_wedge_master_masks();
+#if !USE_PRECOMPUTED_WEDGE_SIGN
   init_wedge_signs();
+#endif  // !USE_PRECOMPUTED_WEDGE_SIGN
   init_wedge_masks();
 }
 
-#if CONFIG_SUPERTX
-static void build_masked_compound_wedge_extend(
-    uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
-    const uint8_t *src1, int src1_stride,
-    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type,
-    int wedge_offset_x, int wedge_offset_y, int h, int w) {
-  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
-  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
-  const uint8_t *mask;
-  size_t mask_stride;
-  switch (comp_data->interinter_compound_type) {
-    case COMPOUND_WEDGE:
-      mask = av1_get_soft_mask(comp_data->wedge_index, comp_data->wedge_sign,
-                               sb_type, wedge_offset_x, wedge_offset_y);
-      mask_stride = MASK_MASTER_STRIDE;
-      break;
-#if CONFIG_COMPOUND_SEGMENT
-    case COMPOUND_SEG:
-      mask = comp_data->seg_mask;
-      mask_stride = block_size_wide[sb_type];
-      break;
-#endif
-    default: assert(0); return;
-  }
-  aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                     mask, (int)mask_stride, h, w, subh, subw);
-}
-
-#if CONFIG_HIGHBITDEPTH
-static void build_masked_compound_wedge_extend_highbd(
-    uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
-    const uint8_t *src1_8, int src1_stride,
-    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type,
-    int wedge_offset_x, int wedge_offset_y, int h, int w, int bd) {
-  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
-  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
-  const uint8_t *mask;
-  size_t mask_stride;
-  switch (comp_data->interinter_compound_type) {
-    case COMPOUND_WEDGE:
-      mask = av1_get_soft_mask(comp_data->wedge_index, comp_data->wedge_sign,
-                               sb_type, wedge_offset_x, wedge_offset_y);
-      mask_stride = MASK_MASTER_STRIDE;
-      break;
-#if CONFIG_COMPOUND_SEGMENT
-    case COMPOUND_SEG:
-      mask = comp_data->seg_mask;
-      mask_stride = block_size_wide[sb_type];
-      break;
-#endif
-    default: assert(0); return;
-  }
-  aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                            src1_stride, mask, (int)mask_stride, h, w, subh,
-                            subw, bd);
-}
-#endif  // CONFIG_HIGHBITDEPTH
-#else
-#if CONFIG_CONVOLVE_ROUND
 static void build_masked_compound_no_round(
-    CONV_BUF_TYPE *dst, int dst_stride, const CONV_BUF_TYPE *src0,
-    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride,
+    uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride,
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
-    int w) {
+    int w, ConvolveParams *conv_params, MACROBLOCKD *xd) {
   // Derive subsampling from h and w passed in. May be refactored to
   // pass in subsampling factors directly.
-  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
-  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
   const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
-  aom_blend_a64_d32_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, block_size_wide[sb_type], h, w, subh, subw);
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, block_size_wide[sb_type],
+                                  w, h, subw, subh, conv_params, xd->bd);
+  else
+    aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, block_size_wide[sb_type], w,
+                                 h, subw, subh, conv_params);
 }
-#endif  // CONFIG_CONVOLVE_ROUND
+
 static void build_masked_compound(
     uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
     const uint8_t *src1, int src1_stride,
@@ -914,14 +581,13 @@ static void build_masked_compound(
     int w) {
   // Derive subsampling from h and w passed in. May be refactored to
   // pass in subsampling factors directly.
-  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
-  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
   const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
   aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                     mask, block_size_wide[sb_type], h, w, subh, subw);
+                     mask, block_size_wide[sb_type], w, h, subw, subh);
 }
 
-#if CONFIG_HIGHBITDEPTH
 static void build_masked_compound_highbd(
     uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
     const uint8_t *src1_8, int src1_stride,
@@ -929,320 +595,259 @@ static void build_masked_compound_highbd(
     int w, int bd) {
   // Derive subsampling from h and w passed in. May be refactored to
   // pass in subsampling factors directly.
-  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
-  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
   const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
   // const uint8_t *mask =
   //     av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
   aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                            src1_stride, mask, block_size_wide[sb_type], h, w,
-                            subh, subw, bd);
+                            src1_stride, mask, block_size_wide[sb_type], w, h,
+                            subw, subh, bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_SUPERTX
 
 void av1_make_masked_inter_predictor(
     const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
-    const int subpel_x, const int subpel_y, const struct scale_factors *sf,
-    int w, int h, ConvolveParams *conv_params, InterpFilters interp_filters,
-    int xs, int ys,
-#if CONFIG_SUPERTX
-    int wedge_offset_x, int wedge_offset_y,
-#endif  // CONFIG_SUPERTX
-    int plane,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
+    int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane,
     const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-    MACROBLOCKD *xd) {
-  const MODE_INFO *mi = xd->mi[0];
-
-  const INTERINTER_COMPOUND_DATA comp_data = {
-#if CONFIG_WEDGE
-    mi->mbmi.wedge_index,
-    mi->mbmi.wedge_sign,
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-    mi->mbmi.mask_type,
-    xd->seg_mask,
-#endif  // CONFIG_COMPOUND_SEGMENT
-    mi->mbmi.interinter_compound_type
-  };
+    MACROBLOCKD *xd, int can_use_previous) {
+  MB_MODE_INFO *mi = xd->mi[0];
+  (void)dst;
+  (void)dst_stride;
+  mi->interinter_comp.seg_mask = xd->seg_mask;
+  const INTERINTER_COMPOUND_DATA *comp_data = &mi->interinter_comp;
 
 // We're going to call av1_make_inter_predictor to generate a prediction into
 // a temporary buffer, then will blend that temporary buffer with that from
 // the other reference.
 //
-// With CONFIG_CONVOLVE_ROUND, if the rounding mode is CONVOLVE_OPT_NO_ROUND
-// then the predictions are at 32-bits, so we'll need 32 bits per
-// pixel. Otherwise, we'll need up to 16 bits per pixel if
-// CONFIG_HIGHBITDEPTH or just 8 otherwise.
-#if CONFIG_CONVOLVE_ROUND
-#define INTER_PRED_BYTES_PER_PIXEL 4
-#elif CONFIG_HIGHBITDEPTH
 #define INTER_PRED_BYTES_PER_PIXEL 2
-#else
-#define INTER_PRED_BYTES_PER_PIXEL 1
-#endif
-  DECLARE_ALIGNED(16, uint8_t,
+
+  DECLARE_ALIGNED(32, uint8_t,
                   tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
 #undef INTER_PRED_BYTES_PER_PIXEL
 
-#if CONFIG_HIGHBITDEPTH
   uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
                          ? CONVERT_TO_BYTEPTR(tmp_buf)
                          : tmp_buf;
-  const int bd = xd->bd;
-#else
-  uint8_t *tmp_dst = tmp_buf;
-  const int bd = 8;
-#endif
 
-#if CONFIG_CONVOLVE_ROUND
   const int tmp_buf_stride = MAX_SB_SIZE;
-  const int is_conv_no_round = conv_params->round == CONVOLVE_OPT_NO_ROUND;
   CONV_BUF_TYPE *org_dst = conv_params->dst;
   int org_dst_stride = conv_params->dst_stride;
-  CONV_BUF_TYPE *tmp_buf32 = (CONV_BUF_TYPE *)tmp_buf;
-  if (is_conv_no_round) {
-    conv_params->dst = tmp_buf32;
-    conv_params->dst_stride = tmp_buf_stride;
-    assert(conv_params->do_average == 0);
-  }
-#endif  // CONFIG_CONVOLVE_ROUND
+  CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
+  conv_params->dst = tmp_buf16;
+  conv_params->dst_stride = tmp_buf_stride;
+  assert(conv_params->do_average == 0);
 
   // This will generate a prediction in tmp_buf for the second reference
-  av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x,
-                           subpel_y, sf, w, h, conv_params, interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                           warp_types, p_col, p_row, plane, ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR
-                           mi, 0,
-#endif
-                           xs, ys, xd);
-
-#if CONFIG_COMPOUND_SEGMENT
-  if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) {
-#if CONFIG_CONVOLVE_ROUND
-    if (is_conv_no_round) {
-      build_compound_seg_mask_d32(
-          comp_data.seg_mask, comp_data.mask_type, org_dst, org_dst_stride,
-          tmp_buf32, tmp_buf_stride, mi->mbmi.sb_type, h, w, conv_params, bd);
-    } else {
-#endif  // CONFIG_CONVOLVE_ROUND
-#if CONFIG_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        build_compound_seg_mask_highbd(comp_data.seg_mask, comp_data.mask_type,
-                                       dst, dst_stride, tmp_dst, MAX_SB_SIZE,
-                                       mi->mbmi.sb_type, h, w, bd);
-      } else {
-#endif
-        build_compound_seg_mask(comp_data.seg_mask, comp_data.mask_type, dst,
-                                dst_stride, tmp_dst, MAX_SB_SIZE,
-                                mi->mbmi.sb_type, h, w);
-#if CONFIG_HIGHBITDEPTH
-      }
-#endif
-#if CONFIG_CONVOLVE_ROUND
-    }
-#endif
-  }
-#endif  // CONFIG_COMPOUND_SEGMENT
+  av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_params,
+                           sf, w, h, conv_params, interp_filters, warp_types,
+                           p_col, p_row, plane, ref, mi, 0, xd,
+                           can_use_previous);
 
-#if CONFIG_SUPERTX
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    build_masked_compound_wedge_extend_highbd(
-        dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE, &comp_data,
-        mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w, xd->bd);
-  else
-#endif  // CONFIG_HIGHBITDEPTH
-    build_masked_compound_wedge_extend(
-        dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE, &comp_data,
-        mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w);
-#else
-#if CONFIG_CONVOLVE_ROUND
-  if (is_conv_no_round) {
-    build_masked_compound_no_round(org_dst, org_dst_stride, org_dst,
-                                   org_dst_stride, tmp_buf32, tmp_buf_stride,
-                                   &comp_data, mi->mbmi.sb_type, h, w);
-
-    const int convolve_rounding_bits =
-        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      av1_highbd_convolve_rounding(org_dst, org_dst_stride, dst, dst_stride, w,
-                                   h, convolve_rounding_bits, xd->bd);
-    else
-#endif
-      av1_convolve_rounding(org_dst, org_dst_stride, dst, dst_stride, w, h,
-                            convolve_rounding_bits);
-
-    conv_params->do_post_rounding = 0;
-  } else {
-#endif  // CONFIG_CONVOLVE_ROUND
-
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      build_masked_compound_highbd(dst, dst_stride, dst, dst_stride, tmp_dst,
-                                   MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h,
-                                   w, xd->bd);
-    else
-#endif  // CONFIG_HIGHBITDEPTH
-      build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst,
-                            MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h, w);
-#if CONFIG_CONVOLVE_ROUND
+  if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
+    av1_build_compound_diffwtd_mask_d16(
+        comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
+        tmp_buf16, tmp_buf_stride, h, w, conv_params, xd->bd);
   }
-#endif  // CONFIG_CONVOLVE_ROUND
-#endif  // CONFIG_SUPERTX
-
-#if CONFIG_COMPOUND_SEGMENT
-  (void)plane;
-#endif  // CONFIG_COMPOUND_SEGMENT
+  build_masked_compound_no_round(dst, dst_stride, org_dst, org_dst_stride,
+                                 tmp_buf16, tmp_buf_stride, comp_data,
+                                 mi->sb_type, h, w, conv_params, xd);
 }
 
 // TODO(sarahparker) av1_highbd_build_inter_predictor and
 // av1_build_inter_predictor should be combined with
 // av1_make_inter_predictor
-#if CONFIG_HIGHBITDEPTH
 void av1_highbd_build_inter_predictor(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
     const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
-    InterpFilters interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-    const WarpTypesAllowed *warp_types, int p_col, int p_row,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-    int plane, enum mv_precision precision, int x, int y,
-    const MACROBLOCKD *xd) {
+    InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col,
+    int p_row, int plane, enum mv_precision precision, int x, int y,
+    const MACROBLOCKD *xd, int can_use_previous) {
   const int is_q4 = precision == MV_PRECISION_Q4;
   const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
                      is_q4 ? src_mv->col : src_mv->col * 2 };
   MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
   mv.col += SCALE_EXTRA_OFF;
   mv.row += SCALE_EXTRA_OFF;
-  const int subpel_x = mv.col & SCALE_SUBPEL_MASK;
-  const int subpel_y = mv.row & SCALE_SUBPEL_MASK;
-  ConvolveParams conv_params = get_conv_params(ref, ref, plane);
+  const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                       mv.col & SCALE_SUBPEL_MASK,
+                                       mv.row & SCALE_SUBPEL_MASK };
+  ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
 
   src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
          (mv.col >> SCALE_SUBPEL_BITS);
 
-  av1_make_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
-                           sf, w, h, &conv_params, interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                           warp_types, p_col, p_row, plane, ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR
-                           xd->mi[0], 0,
-#endif
-                           sf->x_step_q4, sf->y_step_q4, xd);
+  av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
+                           w, h, &conv_params, interp_filters, warp_types,
+                           p_col, p_row, plane, ref, xd->mi[0], 0, xd,
+                           can_use_previous);
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
                                int dst_stride, const MV *src_mv,
                                const struct scale_factors *sf, int w, int h,
                                ConvolveParams *conv_params,
                                InterpFilters interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
                                const WarpTypesAllowed *warp_types, int p_col,
                                int p_row, int plane, int ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
                                enum mv_precision precision, int x, int y,
-                               const MACROBLOCKD *xd) {
+                               const MACROBLOCKD *xd, int can_use_previous) {
   const int is_q4 = precision == MV_PRECISION_Q4;
   const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
                      is_q4 ? src_mv->col : src_mv->col * 2 };
   MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
   mv.col += SCALE_EXTRA_OFF;
   mv.row += SCALE_EXTRA_OFF;
-  const int subpel_x = mv.col & SCALE_SUBPEL_MASK;
-  const int subpel_y = mv.row & SCALE_SUBPEL_MASK;
 
+  const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                       mv.col & SCALE_SUBPEL_MASK,
+                                       mv.row & SCALE_SUBPEL_MASK };
   src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
          (mv.col >> SCALE_SUBPEL_BITS);
 
-  av1_make_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
-                           sf, w, h, conv_params, interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                           warp_types, p_col, p_row, plane, ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR
-                           xd->mi[0], 0,
-#endif
-                           sf->x_step_q4, sf->y_step_q4, xd);
+  av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
+                           w, h, conv_params, interp_filters, warp_types, p_col,
+                           p_row, plane, ref, xd->mi[0], 0, xd,
+                           can_use_previous);
+}
+
+void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
+                                int order_idx, int *fwd_offset, int *bck_offset,
+                                int *use_jnt_comp_avg, int is_compound) {
+  assert(fwd_offset != NULL && bck_offset != NULL);
+  if (!is_compound || mbmi->compound_idx) {
+    *use_jnt_comp_avg = 0;
+    return;
+  }
+
+  *use_jnt_comp_avg = 1;
+  const int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx;
+  const int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx;
+  const int cur_frame_index = cm->cur_frame->cur_frame_offset;
+  int bck_frame_index = 0, fwd_frame_index = 0;
+
+  if (bck_idx >= 0) {
+    bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset;
+  }
+
+  if (fwd_idx >= 0) {
+    fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset;
+  }
+
+  int d0 = clamp(abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index)),
+                 0, MAX_FRAME_DISTANCE);
+  int d1 = clamp(abs(get_relative_dist(cm, cur_frame_index, bck_frame_index)),
+                 0, MAX_FRAME_DISTANCE);
+
+  const int order = d0 <= d1;
+
+  if (d0 == 0 || d1 == 0) {
+    *fwd_offset = quant_dist_lookup_table[order_idx][3][order];
+    *bck_offset = quant_dist_lookup_table[order_idx][3][1 - order];
+    return;
+  }
+
+  int i;
+  for (i = 0; i < 3; ++i) {
+    int c0 = quant_dist_weight[i][order];
+    int c1 = quant_dist_weight[i][!order];
+    int d0_c0 = d0 * c0;
+    int d1_c1 = d1 * c1;
+    if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
+  }
+
+  *fwd_offset = quant_dist_lookup_table[order_idx][i][order];
+  *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order];
+}
+
+static INLINE void calc_subpel_params(
+    MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
+    int plane, const int pre_x, const int pre_y, int x, int y,
+    struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params,
+    int bw, int bh) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int is_scaled = av1_is_scaled(sf);
+  if (is_scaled) {
+    int ssx = pd->subsampling_x;
+    int ssy = pd->subsampling_y;
+    int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
+    orig_pos_y += mv.row * (1 << (1 - ssy));
+    int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
+    orig_pos_x += mv.col * (1 << (1 - ssx));
+    int pos_y = sf->scale_value_y(orig_pos_y, sf);
+    int pos_x = sf->scale_value_x(orig_pos_x, sf);
+    pos_x += SCALE_EXTRA_OFF;
+    pos_y += SCALE_EXTRA_OFF;
+
+    const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+    const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+    const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                       << SCALE_SUBPEL_BITS;
+    const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+    pos_y = clamp(pos_y, top, bottom);
+    pos_x = clamp(pos_x, left, right);
+
+    *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+           (pos_x >> SCALE_SUBPEL_BITS);
+    subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+    subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+    subpel_params->xs = sf->x_step_q4;
+    subpel_params->ys = sf->y_step_q4;
+  } else {
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(
+        xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+    subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
+    subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+    subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+    *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
+           (x + (mv_q4.col >> SUBPEL_BITS));
+  }
 }
 
-typedef struct SubpelParams {
-  int xs;
-  int ys;
-  int subpel_x;
-  int subpel_y;
-} SubpelParams;
-
-static INLINE void build_inter_predictors(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
-#if CONFIG_MOTION_VAR
-    const MODE_INFO *mi, int build_for_obmc,
-#endif  // CONFIG_MOTION_VAR
-    int block, int bw, int bh, int x, int y, int w, int h,
-#if CONFIG_SUPERTX
-    int wedge_offset_x, int wedge_offset_y,
-#endif  // CONFIG_SUPERTX
-    int mi_x, int mi_y) {
+static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                          int plane, const MB_MODE_INFO *mi,
+                                          int build_for_obmc, int bw, int bh,
+                                          int mi_x, int mi_y) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-#if !CONFIG_MOTION_VAR
-  const MODE_INFO *mi = xd->mi[0];
-#endif  // CONFIG_MOTION_VAR
-  int is_compound = has_second_ref(&mi->mbmi);
-#if CONFIG_COMPOUND_SINGLEREF
-  int is_comp_mode_pred =
-      is_compound || is_inter_singleref_comp_mode(mi->mbmi.mode);
-#endif  // CONFIG_COMPOUND_SINGLEREF
+  int is_compound = has_second_ref(mi);
   int ref;
-#if CONFIG_INTRABC
-  const int is_intrabc = is_intrabc_block(&mi->mbmi);
+  const int is_intrabc = is_intrabc_block(mi);
   assert(IMPLIES(is_intrabc, !is_compound));
-#endif  // CONFIG_INTRABC
-#if CONFIG_GLOBAL_MOTION
   int is_global[2] = { 0, 0 };
   for (ref = 0; ref < 1 + is_compound; ++ref) {
-    WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]];
-    is_global[ref] = is_global_mv_block(mi, block, wm->wmtype);
+    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
   }
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!is_compound && is_comp_mode_pred) is_global[1] = is_global[0];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#endif  // CONFIG_GLOBAL_MOTION
-
-#if CONFIG_CB4X4
-  (void)block;
-  (void)cm;
-#endif
 
-#if CONFIG_CHROMA_SUB8X8
-  const BLOCK_SIZE bsize = mi->mbmi.sb_type;
+  const BLOCK_SIZE bsize = mi->sb_type;
   const int ss_x = pd->subsampling_x;
   const int ss_y = pd->subsampling_y;
-  int sub8x8_inter = bsize < BLOCK_8X8 && (ss_x || ss_y);
-
-#if CONFIG_INTRABC
-  if (is_intrabc) {
-    sub8x8_inter = 0;
-  }
-#endif
+  int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
+                     (block_size_high[bsize] < 8 && ss_y);
+
+  if (is_intrabc) sub8x8_inter = 0;
+
+  // For sub8x8 chroma blocks, we may be covering more than one luma block's
+  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+  // the top-left corner of the prediction source - the correct top-left corner
+  // is at (pre_x, pre_y).
+  const int row_start =
+      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+  const int col_start =
+      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
 
-#if CONFIG_MOTION_VAR
   sub8x8_inter = sub8x8_inter && !build_for_obmc;
-#endif  // CONFIG_MOTION_VAR
-  const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
-  const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
-
   if (sub8x8_inter) {
-    for (int row = row_start; row <= 0 && sub8x8_inter; ++row)
-      for (int col = col_start; col <= 0; ++col)
-        if (!is_inter_block(&xd->mi[row * xd->mi_stride + col]->mbmi))
-          sub8x8_inter = 0;
+    for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
+      for (int col = col_start; col <= 0; ++col) {
+        const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+        if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
+        if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
+      }
+    }
   }
 
   if (sub8x8_inter) {
@@ -1252,178 +857,67 @@ static INLINE void build_inter_predictors(
     const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
     const int b8_w = block_size_wide[plane_bsize] >> ss_x;
     const int b8_h = block_size_high[plane_bsize] >> ss_y;
-    int idx, idy;
-
-    const int x_base = x;
-    const int y_base = y;
+    assert(!is_compound);
 
     const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
 
     int row = row_start;
-    for (idy = 0; idy < b8_h; idy += b4_h) {
+    for (int y = 0; y < b8_h; y += b4_h) {
       int col = col_start;
-      for (idx = 0; idx < b8_w; idx += b4_w) {
-        MB_MODE_INFO *this_mbmi = &xd->mi[row * xd->mi_stride + col]->mbmi;
+      for (int x = 0; x < b8_w; x += b4_w) {
+        MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
         is_compound = has_second_ref(this_mbmi);
-#if CONFIG_CONVOLVE_ROUND
-        DECLARE_ALIGNED(16, int32_t, tmp_dst[8 * 8]);
+        DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]);
         int tmp_dst_stride = 8;
-        assert(w <= 8 && h <= 8);
-#endif  // CONFIG_CONVOLVE_ROUND
-#if CONFIG_CONVOLVE_ROUND
-        ConvolveParams conv_params =
-            get_conv_params_no_round(0, 0, plane, tmp_dst, tmp_dst_stride);
-#else
-        ConvolveParams conv_params = get_conv_params(0, 0, plane);
-#endif
+        assert(bw < 8 || bh < 8);
+        ConvolveParams conv_params = get_conv_params_no_round(
+            0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd);
+        conv_params.use_jnt_comp_avg = 0;
         struct buf_2d *const dst_buf = &pd->dst;
-        x = x_base + idx;
-        y = y_base + idy;
         uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
 
-        // TODO(zoeliu): If single ref comp modes are considered here, a
-        //               mismatch was caused. Need a further investigation.
-        for (ref = 0; ref < 1 + is_compound; ++ref) {
-          const RefBuffer *ref_buf =
-              &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
-
-          const int c_offset = (mi_x + MI_SIZE * col_start) >> ss_x;
-          const int r_offset = (mi_y + MI_SIZE * row_start) >> ss_y;
-          pd->pre[ref].buf0 =
-              (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
-          pd->pre[ref].buf =
-              pd->pre[ref].buf0 + scaled_buffer_offset(c_offset, r_offset,
-                                                       ref_buf->buf->uv_stride,
-                                                       &ref_buf->sf);
-          pd->pre[ref].width = ref_buf->buf->uv_crop_width;
-          pd->pre[ref].height = ref_buf->buf->uv_crop_height;
-          pd->pre[ref].stride = ref_buf->buf->uv_stride;
-
-#if CONFIG_INTRABC
-          const struct scale_factors *const sf =
-              is_intrabc ? &xd->sf_identity : &ref_buf->sf;
-          struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-#else
-          const struct scale_factors *const sf = &ref_buf->sf;
-          struct buf_2d *const pre_buf = &pd->pre[ref];
-#endif  // CONFIG_INTRABC
-
-          const MV mv = this_mbmi->mv[ref].as_mv;
-
-          uint8_t *pre;
-          int xs, ys, subpel_x, subpel_y;
-          const int is_scaled = av1_is_scaled(sf);
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-          WarpTypesAllowed warp_types;
-#if CONFIG_GLOBAL_MOTION
-          warp_types.global_warp_allowed = is_global[ref];
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-          warp_types.local_warp_allowed =
-              this_mbmi->motion_mode == WARPED_CAUSAL;
-#endif  // CONFIG_WARPED_MOTION
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-
-          if (is_scaled) {
-            int ssx = pd->subsampling_x;
-            int ssy = pd->subsampling_y;
-            int orig_pos_y = (mi_y << (SUBPEL_BITS - ssy)) + (y << SUBPEL_BITS);
-            orig_pos_y += mv.row * (1 << (1 - ssy));
-            int orig_pos_x = (mi_x << (SUBPEL_BITS - ssx)) + (x << SUBPEL_BITS);
-            orig_pos_x += mv.col * (1 << (1 - ssx));
-            int pos_y = sf->scale_value_y(orig_pos_y, sf);
-            int pos_x = sf->scale_value_x(orig_pos_x, sf);
-            pos_x += SCALE_EXTRA_OFF;
-            pos_y += SCALE_EXTRA_OFF;
-
-            const int top = -((AOM_INTERP_EXTEND + bh) << SCALE_SUBPEL_BITS);
-            const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                               << SCALE_SUBPEL_BITS;
-            const int left = -((AOM_INTERP_EXTEND + bw) << SCALE_SUBPEL_BITS);
-            const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                              << SCALE_SUBPEL_BITS;
-            pos_y = clamp(pos_y, top, bottom);
-            pos_x = clamp(pos_x, left, right);
-
-            pre = pre_buf->buf0 +
-                  (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-                  (pos_x >> SCALE_SUBPEL_BITS);
-            subpel_x = pos_x & SCALE_SUBPEL_MASK;
-            subpel_y = pos_y & SCALE_SUBPEL_MASK;
-            xs = sf->x_step_q4;
-            ys = sf->y_step_q4;
-          } else {
-            const MV mv_q4 = clamp_mv_to_umv_border_sb(
-                xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
-            xs = ys = SCALE_SUBPEL_SHIFTS;
-            subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-            subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-            pre = pre_buf->buf +
-                  (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
-                  (x + (mv_q4.col >> SUBPEL_BITS));
-          }
-
-          conv_params.ref = ref;
-          conv_params.do_average = ref;
-          if (is_masked_compound_type(mi->mbmi.interinter_compound_type)) {
-            // masked compound type has its own average mechanism
-            conv_params.do_average = 0;
-#if CONFIG_CONVOLVE_ROUND && CONFIG_COMPOUND_SEGMENT && CONFIG_SUPERTX
-            // TODO(angiebird): convolve_round does not support compound_segment
-            // when supertx is on
-            conv_params = get_conv_params(ref, 0, plane);
-#endif
-          }
-          if (ref && is_masked_compound_type(mi->mbmi.interinter_compound_type))
-            av1_make_masked_inter_predictor(
-                pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
-                sf, b4_w, b4_h, &conv_params, mi->mbmi.interp_filters, xs, ys,
-#if CONFIG_SUPERTX
-                wedge_offset_x, wedge_offset_y,
-#endif  // CONFIG_SUPERTX
-                plane,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                &warp_types, (mi_x >> pd->subsampling_x) + x,
-                (mi_y >> pd->subsampling_y) + y, ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                xd);
-          else
-            av1_make_inter_predictor(
-                pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
-                sf, b4_w, b4_h, &conv_params, this_mbmi->interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                &warp_types, (mi_x >> pd->subsampling_x) + x,
-                (mi_y >> pd->subsampling_y) + y, plane, ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR
-                mi, build_for_obmc,
-#endif  // CONFIG_MOTION_VAR
-                xs, ys, xd);
-        }  // for (ref = 0; ref < 1 + is_compound; ++ref)
-#if CONFIG_CONVOLVE_ROUND
-        if (conv_params.do_post_rounding) {
-#if CONFIG_HIGHBITDEPTH
-          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-            av1_highbd_convolve_rounding(
-                tmp_dst, tmp_dst_stride, dst, dst_buf->stride, b4_w, b4_h,
-                FILTER_BITS * 2 + is_compound - conv_params.round_0 -
-                    conv_params.round_1,
-                xd->bd);
-          else
-#endif  // CONFIG_HIGHBITDEPTH
-#if CONFIG_COMPOUND_SINGLEREF
-            av1_convolve_rounding(
-                tmp_dst, tmp_dst_stride, dst, dst_buf->stride, b4_w, b4_h,
-                FILTER_BITS * 2 + is_comp_mode_pred - conv_params.round_0 -
-                    conv_params.round_1);
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-          av1_convolve_rounding(tmp_dst, tmp_dst_stride, dst, dst_buf->stride,
-                                b4_w, b4_h,
-                                FILTER_BITS * 2 + is_compound -
-                                    conv_params.round_0 - conv_params.round_1);
-#endif  // CONFIG_COMPOUND_SINGLEREF
+        ref = 0;
+        const RefBuffer *ref_buf =
+            &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+
+        pd->pre[ref].buf0 =
+            (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
+        pd->pre[ref].buf =
+            pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+                                                     ref_buf->buf->uv_stride,
+                                                     &ref_buf->sf);
+        pd->pre[ref].width = ref_buf->buf->uv_crop_width;
+        pd->pre[ref].height = ref_buf->buf->uv_crop_height;
+        pd->pre[ref].stride = ref_buf->buf->uv_stride;
+
+        const struct scale_factors *const sf =
+            is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+        struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+
+        const MV mv = this_mbmi->mv[ref].as_mv;
+
+        uint8_t *pre;
+        SubpelParams subpel_params;
+        WarpTypesAllowed warp_types;
+        warp_types.global_warp_allowed = is_global[ref];
+        warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
+
+        calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
+                           &subpel_params, bw, bh);
+
+        conv_params.ref = ref;
+        conv_params.do_average = ref;
+        if (is_masked_compound_type(mi->interinter_comp.type)) {
+          // masked compound type has its own average mechanism
+          conv_params.do_average = 0;
         }
-#endif  // CONFIG_CONVOLVE_ROUND
+
+        av1_make_inter_predictor(
+            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf,
+            b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
+            (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
+            plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
+
         ++col;
       }
       ++row;
@@ -1432,194 +926,50 @@ static INLINE void build_inter_predictors(
     for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
     return;
   }
-#else
-  (void)cm;
-#endif  // CONFIG_CHROMA_SUB8X8
 
   {
+    DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
+    ConvolveParams conv_params = get_conv_params_no_round(
+        0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd);
+    av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
+                               &conv_params.bck_offset,
+                               &conv_params.use_jnt_comp_avg, is_compound);
+
     struct buf_2d *const dst_buf = &pd->dst;
-    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-    uint8_t *pre[2];
-    SubpelParams subpel_params[2];
-#if CONFIG_CONVOLVE_ROUND
-    DECLARE_ALIGNED(16, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
-#endif  // CONFIG_CONVOLVE_ROUND
-
-#if CONFIG_COMPOUND_SINGLEREF
-    for (ref = 0; ref < 1 + is_comp_mode_pred; ++ref)
-#else
-    for (ref = 0; ref < 1 + is_compound; ++ref)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    {
-#if CONFIG_INTRABC
+    uint8_t *const dst = dst_buf->buf;
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
       const struct scale_factors *const sf =
-          is_intrabc ? &xd->sf_identity : &xd->block_refs[ref]->sf;
+          is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
       struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-#else
-      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-      struct buf_2d *const pre_buf = &pd->pre[ref];
-#endif  // CONFIG_INTRABC
-#if CONFIG_CB4X4
-      const MV mv = mi->mbmi.mv[ref].as_mv;
-#else
-      const MV mv =
-#if CONFIG_MOTION_VAR
-          (mi->mbmi.sb_type < BLOCK_8X8 && !build_for_obmc)
-              ?
-#else
-          mi->mbmi.sb_type < BLOCK_8X8 ?
-#endif
-              average_split_mvs(pd, mi, ref, block)
-              : mi->mbmi.mv[ref].as_mv;
-#endif
-
-      const int is_scaled = av1_is_scaled(sf);
-      if (is_scaled) {
-        // Note: The various inputs here have different units:
-        // * mi_x/mi_y are in units of luma pixels
-        // * mv is in units of 1/8 luma pixels
-        // * x/y are in units of pixels *in the current plane*
-        // Here we unify these into a q4-format position within the current
-        // plane, then project into the reference frame
-        int ssx = pd->subsampling_x;
-        int ssy = pd->subsampling_y;
-        int orig_pos_y = (mi_y << (SUBPEL_BITS - ssy)) + (y << SUBPEL_BITS);
-        orig_pos_y += mv.row * (1 << (1 - ssy));
-        int orig_pos_x = (mi_x << (SUBPEL_BITS - ssx)) + (x << SUBPEL_BITS);
-        orig_pos_x += mv.col * (1 << (1 - ssx));
-        int pos_y = sf->scale_value_y(orig_pos_y, sf);
-        int pos_x = sf->scale_value_x(orig_pos_x, sf);
-        pos_x += SCALE_EXTRA_OFF;
-        pos_y += SCALE_EXTRA_OFF;
-
-        // Clamp against the reference frame borders, with enough extension
-        // that we don't force the reference block to be partially onscreen.
-        const int top = -((AOM_INTERP_EXTEND + bh) << SCALE_SUBPEL_BITS);
-        const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                           << SCALE_SUBPEL_BITS;
-        const int left = -((AOM_INTERP_EXTEND + bw) << SCALE_SUBPEL_BITS);
-        const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                          << SCALE_SUBPEL_BITS;
-        pos_y = clamp(pos_y, top, bottom);
-        pos_x = clamp(pos_x, left, right);
-
-        pre[ref] = pre_buf->buf0 +
-                   (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-                   (pos_x >> SCALE_SUBPEL_BITS);
-        subpel_params[ref].subpel_x = pos_x & SCALE_SUBPEL_MASK;
-        subpel_params[ref].subpel_y = pos_y & SCALE_SUBPEL_MASK;
-        subpel_params[ref].xs = sf->x_step_q4;
-        subpel_params[ref].ys = sf->y_step_q4;
-      } else {
-        const MV mv_q4 = clamp_mv_to_umv_border_sb(
-            xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
-        subpel_params[ref].subpel_x = (mv_q4.col & SUBPEL_MASK)
-                                      << SCALE_EXTRA_BITS;
-        subpel_params[ref].subpel_y = (mv_q4.row & SUBPEL_MASK)
-                                      << SCALE_EXTRA_BITS;
-        subpel_params[ref].xs = SCALE_SUBPEL_SHIFTS;
-        subpel_params[ref].ys = SCALE_SUBPEL_SHIFTS;
-        pre[ref] = pre_buf->buf +
-                   (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
-                   (x + (mv_q4.col >> SUBPEL_BITS));
-      }
-    }
+      const MV mv = mi->mv[ref].as_mv;
 
-#if CONFIG_CONVOLVE_ROUND
-    ConvolveParams conv_params =
-        get_conv_params_no_round(ref, ref, plane, tmp_dst, MAX_SB_SIZE);
-#else
-    ConvolveParams conv_params = get_conv_params(ref, ref, plane);
-#endif  // CONFIG_CONVOLVE_ROUND
+      uint8_t *pre;
+      SubpelParams subpel_params;
+      calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre,
+                         &subpel_params, bw, bh);
 
-#if CONFIG_COMPOUND_SINGLEREF
-    for (ref = 0; ref < 1 + is_comp_mode_pred; ++ref)
-#else
-    for (ref = 0; ref < 1 + is_compound; ++ref)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    {
-#if CONFIG_INTRABC
-      const struct scale_factors *const sf =
-          is_intrabc ? &xd->sf_identity : &xd->block_refs[ref]->sf;
-      struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-#else
-      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-      struct buf_2d *const pre_buf = &pd->pre[ref];
-#endif  // CONFIG_INTRABC
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
       WarpTypesAllowed warp_types;
-#if CONFIG_GLOBAL_MOTION
       warp_types.global_warp_allowed = is_global[ref];
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-      warp_types.local_warp_allowed = mi->mbmi.motion_mode == WARPED_CAUSAL;
-#endif  // CONFIG_WARPED_MOTION
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
       conv_params.ref = ref;
-      conv_params.do_average = ref;
-      if (is_masked_compound_type(mi->mbmi.interinter_compound_type)) {
+
+      if (ref && is_masked_compound_type(mi->interinter_comp.type)) {
         // masked compound type has its own average mechanism
         conv_params.do_average = 0;
-#if CONFIG_CONVOLVE_ROUND && CONFIG_COMPOUND_SEGMENT && CONFIG_SUPERTX
-        // TODO(angiebird): convolve_round does not support compound_segment
-        // when supertx is on
-        conv_params = get_conv_params(ref, 0, plane);
-#endif
-      }
-
-      if (ref && is_masked_compound_type(mi->mbmi.interinter_compound_type))
         av1_make_masked_inter_predictor(
-            pre[ref], pre_buf->stride, dst, dst_buf->stride,
-            subpel_params[ref].subpel_x, subpel_params[ref].subpel_y, sf, w, h,
-            &conv_params, mi->mbmi.interp_filters, subpel_params[ref].xs,
-            subpel_params[ref].ys,
-#if CONFIG_SUPERTX
-            wedge_offset_x, wedge_offset_y,
-#endif  // CONFIG_SUPERTX
-            plane,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-            &warp_types, (mi_x >> pd->subsampling_x) + x,
-            (mi_y >> pd->subsampling_y) + y, ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-            xd);
-      else
+            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+            bh, &conv_params, mi->interp_filters, plane, &warp_types,
+            mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd,
+            cm->allow_warped_motion);
+      } else {
+        conv_params.do_average = ref;
         av1_make_inter_predictor(
-            pre[ref], pre_buf->stride, dst, dst_buf->stride,
-            subpel_params[ref].subpel_x, subpel_params[ref].subpel_y, sf, w, h,
-            &conv_params, mi->mbmi.interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-            &warp_types, (mi_x >> pd->subsampling_x) + x,
-            (mi_y >> pd->subsampling_y) + y, plane, ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR
-            mi, build_for_obmc,
-#endif  // CONFIG_MOTION_VAR
-            subpel_params[ref].xs, subpel_params[ref].ys, xd);
-    }
-
-#if CONFIG_CONVOLVE_ROUND
-    // TODO(angiebird): This part needs optimization
-    if (conv_params.do_post_rounding) {
-#if CONFIG_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        av1_highbd_convolve_rounding(
-            tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h,
-            FILTER_BITS * 2 + is_compound - conv_params.round_0 -
-                conv_params.round_1,
-            xd->bd);
-      else
-#endif  // CONFIG_HIGHBITDEPTH
-#if CONFIG_COMPOUND_SINGLEREF
-        av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h,
-                              FILTER_BITS * 2 + is_comp_mode_pred -
-                                  conv_params.round_0 - conv_params.round_1);
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-      av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h,
-                            FILTER_BITS * 2 + is_compound -
-                                conv_params.round_0 - conv_params.round_1);
-#endif  // CONFIG_COMPOUND_SINGLEREF
+            pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+            bh, &conv_params, mi->interp_filters, &warp_types,
+            mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref,
+            mi, build_for_obmc, xd, cm->allow_warped_motion);
+      }
     }
-#endif  // CONFIG_CONVOLVE_ROUND
   }
 }
 
@@ -1630,56 +980,16 @@ static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
   int plane;
   const int mi_x = mi_col * MI_SIZE;
   const int mi_y = mi_row * MI_SIZE;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
   for (plane = plane_from; plane <= plane_to; ++plane) {
     const struct macroblockd_plane *pd = &xd->plane[plane];
     const int bw = pd->width;
     const int bh = pd->height;
 
-#if CONFIG_CB4X4
     if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
                              pd->subsampling_y))
       continue;
-#endif
 
-    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !unify_bsize) {
-      const PARTITION_TYPE bp = bsize - xd->mi[0]->mbmi.sb_type;
-      const int have_vsplit = bp != PARTITION_HORZ;
-      const int have_hsplit = bp != PARTITION_VERT;
-      const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
-      const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
-      const int pw = 8 >> (have_vsplit | pd->subsampling_x);
-      const int ph = 8 >> (have_hsplit | pd->subsampling_y);
-      int x, y;
-      assert(bp != PARTITION_NONE && bp < PARTITION_TYPES);
-      assert(bsize == BLOCK_8X8);
-      assert(pw * num_4x4_w == bw && ph * num_4x4_h == bh);
-      for (y = 0; y < num_4x4_h; ++y)
-        for (x = 0; x < num_4x4_w; ++x)
-          build_inter_predictors(cm, xd, plane,
-#if CONFIG_MOTION_VAR
-                                 xd->mi[0], 0,
-#endif  // CONFIG_MOTION_VAR
-                                 y * 2 + x, bw, bh, 4 * x, 4 * y, pw, ph,
-#if CONFIG_SUPERTX
-                                 0, 0,
-#endif  // CONFIG_SUPERTX
-                                 mi_x, mi_y);
-    } else {
-      build_inter_predictors(cm, xd, plane,
-#if CONFIG_MOTION_VAR
-                             xd->mi[0], 0,
-#endif  // CONFIG_MOTION_VAR
-                             0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_SUPERTX
-                             0, 0,
-#endif  // CONFIG_SUPERTX
-                             mi_x, mi_y);
-    }
+    build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
   }
 }
 
@@ -1687,17 +997,14 @@ void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                     int mi_row, int mi_col, BUFFER_SET *ctx,
                                     BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);
-#if CONFIG_INTERINTRA
-  if (is_interintra_pred(&xd->mi[0]->mbmi)) {
+
+  if (is_interintra_pred(xd->mi[0])) {
     BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
                                { xd->plane[0].dst.stride, 0, 0 } };
     if (!ctx) ctx = &default_ctx;
     av1_build_interintra_predictors_sby(cm, xd, xd->plane[0].dst.buf,
                                         xd->plane[0].dst.stride, ctx, bsize);
   }
-#else
-  (void)ctx;
-#endif  // CONFIG_INTERINTRA
 }
 
 void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -1705,8 +1012,8 @@ void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                      BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
                                     MAX_MB_PLANE - 1);
-#if CONFIG_INTERINTRA
-  if (is_interintra_pred(&xd->mi[0]->mbmi)) {
+
+  if (is_interintra_pred(xd->mi[0])) {
     BUFFER_SET default_ctx = {
       { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
       { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride }
@@ -1716,247 +1023,49 @@ void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
         cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
         xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize);
   }
-#else
-  (void)ctx;
-#endif  // CONFIG_INTERINTRA
 }
 
 void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                    int mi_row, int mi_col, BUFFER_SET *ctx,
                                    BLOCK_SIZE bsize) {
+  const int num_planes = av1_num_planes(cm);
   av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
-  av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+  if (num_planes > 1)
+    av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
 }
 
 void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
-                          const YV12_BUFFER_CONFIG *src, int mi_row,
-                          int mi_col) {
-  const int widths[MAX_MB_PLANE] = { src->y_crop_width, src->uv_crop_width,
-                                     src->uv_crop_width };
-  const int heights[MAX_MB_PLANE] = { src->y_crop_height, src->uv_crop_height,
-                                      src->uv_crop_height };
-  const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
-                                      src->uv_stride };
-  int i;
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
+                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const int plane_start, const int plane_end) {
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) {
     struct macroblockd_plane *const pd = &planes[i];
-    setup_pred_plane(&pd->dst, bsize, src->buffers[i], widths[i], heights[i],
-                     strides[i], mi_row, mi_col, NULL, pd->subsampling_x,
-                     pd->subsampling_y);
+    const int is_uv = i > 0;
+    setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv],
+                     src->crop_heights[is_uv], src->strides[is_uv], mi_row,
+                     mi_col, NULL, pd->subsampling_x, pd->subsampling_y);
   }
 }
 
 void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
-                          const struct scale_factors *sf) {
+                          const struct scale_factors *sf,
+                          const int num_planes) {
   if (src != NULL) {
-    int i;
-    uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
-                                             src->v_buffer };
-    const int widths[MAX_MB_PLANE] = { src->y_crop_width, src->uv_crop_width,
-                                       src->uv_crop_width };
-    const int heights[MAX_MB_PLANE] = { src->y_crop_height, src->uv_crop_height,
-                                        src->uv_crop_height };
-    const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
-                                        src->uv_stride };
-    for (i = 0; i < MAX_MB_PLANE; ++i) {
+    // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+    // the static analysis warnings.
+    for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
       struct macroblockd_plane *const pd = &xd->plane[i];
-      setup_pred_plane(&pd->pre[idx], xd->mi[0]->mbmi.sb_type, buffers[i],
-                       widths[i], heights[i], strides[i], mi_row, mi_col, sf,
+      const int is_uv = i > 0;
+      setup_pred_plane(&pd->pre[idx], xd->mi[0]->sb_type, src->buffers[i],
+                       src->crop_widths[is_uv], src->crop_heights[is_uv],
+                       src->strides[is_uv], mi_row, mi_col, sf,
                        pd->subsampling_x, pd->subsampling_y);
     }
   }
 }
 
-#if CONFIG_SUPERTX
-#if CONFIG_CB4X4
-static const uint8_t mask_4[4] = { 64, 52, 12, 0 };
-static const uint8_t mask_4_uv[4] = { 64, 52, 12, 0 };
-#endif  // CONFIG_CB4X4
-static const uint8_t mask_8[8] = { 64, 64, 62, 52, 12, 2, 0, 0 };
-
-static const uint8_t mask_16[16] = { 63, 62, 60, 58, 55, 50, 43, 36,
-                                     28, 21, 14, 9,  6,  4,  2,  1 };
-
-static const uint8_t mask_32[32] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63,
-                                     61, 57, 52, 45, 36, 28, 19, 12, 7,  3,  1,
-                                     0,  0,  0,  0,  0,  0,  0,  0,  0,  0 };
-
-static const uint8_t mask_8_uv[8] = { 64, 64, 62, 52, 12, 2, 0, 0 };
-
-static const uint8_t mask_16_uv[16] = { 64, 64, 64, 64, 61, 53, 45, 36,
-                                        28, 19, 11, 3,  0,  0,  0,  0 };
-
-static const uint8_t mask_32_uv[32] = { 64, 64, 64, 64, 64, 64, 64, 64,
-                                        64, 64, 64, 64, 60, 54, 46, 36,
-                                        28, 18, 10, 4,  0,  0,  0,  0,
-                                        0,  0,  0,  0,  0,  0,  0,  0 };
-
-static const uint8_t *get_supertx_mask(int length, int plane) {
-  switch (length) {
-#if CONFIG_CB4X4
-    case 4: return plane ? mask_4_uv : mask_4;
-#endif  // CONFIG_CB4X4
-    case 8: return plane ? mask_8_uv : mask_8;
-    case 16: return plane ? mask_16_uv : mask_16;
-    case 32: return plane ? mask_32_uv : mask_32;
-    default: assert(0);
-  }
-  return NULL;
-}
-
-void av1_build_masked_inter_predictor_complex(
-    MACROBLOCKD *xd, uint8_t *dst, int dst_stride, const uint8_t *pre,
-    int pre_stride, int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
-    BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, PARTITION_TYPE partition,
-    int plane) {
-  const struct macroblockd_plane *pd = &xd->plane[plane];
-  const int ssx = pd->subsampling_x;
-  const int ssy = pd->subsampling_y;
-  const int top_w = block_size_wide[top_bsize] >> ssx;
-  const int top_h = block_size_high[top_bsize] >> ssy;
-  const int w = block_size_wide[bsize] >> ssx;
-  const int h = block_size_high[bsize] >> ssy;
-  const int w_offset = ((mi_col - mi_col_ori) * MI_SIZE) >> ssx;
-  const int h_offset = ((mi_row - mi_row_ori) * MI_SIZE) >> ssy;
-
-  int w_remain, h_remain;
-
-#if CONFIG_HIGHBITDEPTH
-  const int is_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-#endif  // CONFIG_HIGHBITDEPTH
-
-  assert(bsize <= BLOCK_32X32);
-  assert(IMPLIES(plane == 0, ssx == 0));
-  assert(IMPLIES(plane == 0, ssy == 0));
-
-  switch (partition) {
-    case PARTITION_HORZ: {
-      const uint8_t *const mask = get_supertx_mask(h, ssy);
-
-      w_remain = top_w;
-      h_remain = top_h - h_offset - h;
-      dst += h_offset * dst_stride;
-      pre += h_offset * pre_stride;
-
-#if CONFIG_HIGHBITDEPTH
-      if (is_hdb)
-        aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, pre,
-                                   pre_stride, mask, h, top_w, xd->bd);
-      else
-#endif  // CONFIG_HIGHBITDEPTH
-        aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, pre, pre_stride,
-                            mask, h, top_w);
-
-      dst += h * dst_stride;
-      pre += h * pre_stride;
-      break;
-    }
-    case PARTITION_VERT: {
-      const uint8_t *const mask = get_supertx_mask(w, ssx);
-
-      w_remain = top_w - w_offset - w;
-      h_remain = top_h;
-      dst += w_offset;
-      pre += w_offset;
-
-#if CONFIG_HIGHBITDEPTH
-      if (is_hdb)
-        aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, pre,
-                                   pre_stride, mask, top_h, w, xd->bd);
-      else
-#endif  // CONFIG_HIGHBITDEPTH
-        aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, pre, pre_stride,
-                            mask, top_h, w);
-
-      dst += w;
-      pre += w;
-      break;
-    }
-    default: {
-      assert(0);
-      return;
-    }
-  }
-
-  if (w_remain == 0 || h_remain == 0) {
-    return;
-  }
-
-#if CONFIG_HIGHBITDEPTH
-  if (is_hdb) {
-    dst = (uint8_t *)CONVERT_TO_SHORTPTR(dst);
-    pre = (const uint8_t *)CONVERT_TO_SHORTPTR(pre);
-    dst_stride *= 2;
-    pre_stride *= 2;
-    w_remain *= 2;
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-
-  do {
-    memcpy(dst, pre, w_remain * sizeof(uint8_t));
-    dst += dst_stride;
-    pre += pre_stride;
-  } while (--h_remain);
-}
-
-void av1_build_inter_predictor_sb_sub8x8_extend(const AV1_COMMON *cm,
-                                                MACROBLOCKD *xd, int mi_row_ori,
-                                                int mi_col_ori, int mi_row,
-                                                int mi_col, int plane,
-                                                BLOCK_SIZE bsize, int block) {
-  // Prediction function used in supertx:
-  // Use the mv at current block (which is less than 8x8)
-  // to get prediction of a block located at (mi_row, mi_col) at size of bsize
-  // bsize can be larger than 8x8.
-  // block (0-3): the sub8x8 location of current block
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
-  const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
-
-  // For sub8x8 uv:
-  // Skip uv prediction in supertx except the first block (block = 0)
-  int max_plane = block ? 1 : MAX_MB_PLANE;
-  if (plane >= max_plane) return;
-
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
-  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-  const int bw = 4 * num_4x4_w;
-  const int bh = 4 * num_4x4_h;
-
-  build_inter_predictors(cm, xd, plane,
-#if CONFIG_MOTION_VAR
-                         xd->mi[0], 0,
-#endif  // CONFIG_MOTION_VAR
-                         block, bw, bh, 0, 0, bw, bh, wedge_offset_x,
-                         wedge_offset_y, mi_x, mi_y);
-}
-
-void av1_build_inter_predictor_sb_extend(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         int mi_row_ori, int mi_col_ori,
-                                         int mi_row, int mi_col, int plane,
-                                         BLOCK_SIZE bsize) {
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
-  const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
-  const int bw = block_size_wide[plane_bsize];
-  const int bh = block_size_high[plane_bsize];
-
-  build_inter_predictors(cm, xd, plane,
-#if CONFIG_MOTION_VAR
-                         xd->mi[0], 0,
-#endif  // CONFIG_MOTION_VAR
-                         0, bw, bh, 0, 0, bw, bh, wedge_offset_x,
-                         wedge_offset_y, mi_x, mi_y);
-}
-#endif  // CONFIG_SUPERTX
-
-#if CONFIG_MOTION_VAR
 // obmc_mask_N[overlap_position]
 static const uint8_t obmc_mask_1[1] = { 64 };
 
@@ -1974,14 +1083,12 @@ static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44,
                                           56, 57, 58, 59, 60, 60, 61, 62,
                                           64, 64, 64, 64, 64, 64, 64, 64 };
 
-#if CONFIG_EXT_PARTITION
 static const uint8_t obmc_mask_64[64] = {
   33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
   45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
   56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
   62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
 };
-#endif  // CONFIG_EXT_PARTITION
 
 const uint8_t *av1_get_obmc_mask(int length) {
   switch (length) {
@@ -1991,69 +1098,25 @@ const uint8_t *av1_get_obmc_mask(int length) {
     case 8: return obmc_mask_8;
     case 16: return obmc_mask_16;
     case 32: return obmc_mask_32;
-#if CONFIG_EXT_PARTITION
     case 64: return obmc_mask_64;
-#endif  // CONFIG_EXT_PARTITION
     default: assert(0); return NULL;
   }
 }
 
-#if CONFIG_NCOBMC
-// obmc_mask_flipN[overlap_position]
-static const uint8_t obmc_mask_flip1[1] = { 55 };
-
-static const uint8_t obmc_mask_flip2[2] = { 62, 45 };
-
-static const uint8_t obmc_mask_flip4[4] = { 64, 59, 50, 39 };
-
-static const uint8_t obmc_mask_flip8[8] = { 64, 63, 61, 57, 53, 48, 42, 36 };
-
-static const uint8_t obmc_mask_flip16[16] = { 64, 64, 64, 63, 61, 60, 58, 56,
-                                              54, 52, 49, 46, 43, 40, 37, 34 };
-
-static const uint8_t obmc_mask_flip32[32] = { 64, 64, 64, 64, 64, 63, 63, 62,
-                                              62, 61, 60, 60, 59, 58, 57, 56,
-                                              55, 53, 52, 51, 50, 48, 47, 45,
-                                              44, 43, 41, 40, 38, 36, 35, 33 };
-
-#if CONFIG_EXT_PARTITION
-static const uint8_t obmc_mask_flip64[64] = {
-  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 63, 63, 63, 62, 62,
-  62, 62, 62, 61, 60, 60, 60, 60, 60, 59, 58, 58, 57, 57, 56, 56,
-  56, 55, 54, 53, 52, 52, 51, 51, 51, 50, 49, 48, 47, 47, 46, 45,
-  44, 44, 44, 43, 42, 41, 40, 40, 39, 38, 37, 36, 35, 35, 34, 33,
-};
-#endif  // CONFIG_EXT_PARTITION
-
-const uint8_t *av1_get_obmc_mask_flipped(int length) {
-  switch (length) {
-    case 1: return obmc_mask_flip1;
-    case 2: return obmc_mask_flip2;
-    case 4: return obmc_mask_flip4;
-    case 8: return obmc_mask_flip8;
-    case 16: return obmc_mask_flip16;
-    case 32: return obmc_mask_flip32;
-#if CONFIG_EXT_PARTITION
-    case 64: return obmc_mask_flip64;
-#endif  // CONFIG_EXT_PARTITION
-    default: assert(0); return NULL;
-  }
-}
-#endif  // CONFIG_NCOBMC
-
 static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_rc,
-                                     uint8_t mi_hw, MODE_INFO *mi,
-                                     void *fun_ctxt) {
+                                     uint8_t mi_hw, MB_MODE_INFO *mi,
+                                     void *fun_ctxt, const int num_planes) {
   (void)xd;
   (void)rel_mi_rc;
   (void)mi_hw;
   (void)mi;
   ++*(int *)fun_ctxt;
+  (void)num_planes;
 }
 
 void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                       int mi_row, int mi_col) {
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
 
   mbmi->overlappable_neighbors[0] = 0;
   mbmi->overlappable_neighbors[1] = 0;
@@ -2066,21 +1129,17 @@ void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                &mbmi->overlappable_neighbors[1]);
 }
 
-// HW does not support < 4x4 prediction. To limit the bandwidth requirement, for
-// small blocks, only blend with neighbors from one side. If block-size of
-// current plane is 4x4 or 8x4, the above neighbor (dir = 0) will be skipped. If
-// it is 4x8, the left neighbor (dir = 1) will be skipped.
+// HW does not support < 4x4 prediction. To limit the bandwidth requirement, if
+// block-size of current plane is smaller than 8x8, always only blend with the
+// left neighbor(s) (skip blending with the above side).
 #define DISABLE_CHROMA_U8X8_OBMC 0  // 0: one-sided obmc; 1: disable
 
-int skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, const struct macroblockd_plane *pd,
-                           int dir) {
+int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
+                               const struct macroblockd_plane *pd, int dir) {
   assert(is_motion_variation_allowed_bsize(bsize));
 
-  BLOCK_SIZE bsize_plane =
-      ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  if (bsize_plane < BLOCK_4X4) return 1;
-#endif
+  const BLOCK_SIZE bsize_plane =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
   switch (bsize_plane) {
 #if DISABLE_CHROMA_U8X8_OBMC
     case BLOCK_4X4:
@@ -2095,6 +1154,13 @@ int skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, const struct macroblockd_plane *pd,
   }
 }
 
+void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
+  mbmi->ref_frame[1] = NONE_FRAME;
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+
+  return;
+}
+
 struct obmc_inter_pred_ctxt {
   uint8_t **adjacent;
   int *adjacent_stride;
@@ -2102,24 +1168,23 @@ struct obmc_inter_pred_ctxt {
 
 static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col,
                                                uint8_t above_mi_width,
-                                               MODE_INFO *above_mi,
-                                               void *fun_ctxt) {
+                                               MB_MODE_INFO *above_mi,
+                                               void *fun_ctxt,
+                                               const int num_planes) {
   (void)above_mi;
   struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-#if CONFIG_HIGHBITDEPTH
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-#endif  // CONFIG_HIGHBITDEPTH
   const int overlap =
       AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
 
-  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+  for (int plane = 0; plane < num_planes; ++plane) {
     const struct macroblockd_plane *pd = &xd->plane[plane];
     const int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
     const int bh = overlap >> pd->subsampling_y;
     const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x;
 
-    if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
 
     const int dst_stride = pd->dst.stride;
     uint8_t *const dst = &pd->dst.buf[plane_col];
@@ -2127,37 +1192,34 @@ static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col,
     const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col];
     const uint8_t *const mask = av1_get_obmc_mask(bh);
 
-#if CONFIG_HIGHBITDEPTH
     if (is_hbd)
       aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
-                                 tmp_stride, mask, bh, bw, xd->bd);
+                                 tmp_stride, mask, bw, bh, xd->bd);
     else
-#endif  // CONFIG_HIGHBITDEPTH
       aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
-                          mask, bh, bw);
+                          mask, bw, bh);
   }
 }
 
 static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
                                               uint8_t left_mi_height,
-                                              MODE_INFO *left_mi,
-                                              void *fun_ctxt) {
+                                              MB_MODE_INFO *left_mi,
+                                              void *fun_ctxt,
+                                              const int num_planes) {
   (void)left_mi;
   struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   const int overlap =
       AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
-#if CONFIG_HIGHBITDEPTH
   const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-#endif  // CONFIG_HIGHBITDEPTH
 
-  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+  for (int plane = 0; plane < num_planes; ++plane) {
     const struct macroblockd_plane *pd = &xd->plane[plane];
     const int bw = overlap >> pd->subsampling_x;
     const int bh = (left_mi_height * MI_SIZE) >> pd->subsampling_y;
     const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y;
 
-    if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
 
     const int dst_stride = pd->dst.stride;
     uint8_t *const dst = &pd->dst.buf[plane_row * dst_stride];
@@ -2165,14 +1227,12 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
     const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride];
     const uint8_t *const mask = av1_get_obmc_mask(bw);
 
-#if CONFIG_HIGHBITDEPTH
     if (is_hbd)
       aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
-                                 tmp_stride, mask, bh, bw, xd->bd);
+                                 tmp_stride, mask, bw, bh, xd->bd);
     else
-#endif  // CONFIG_HIGHBITDEPTH
       aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
-                          mask, bh, bw);
+                          mask, bw, bh);
   }
 }
 
@@ -2186,86 +1246,41 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                      int above_stride[MAX_MB_PLANE],
                                      uint8_t *left[MAX_MB_PLANE],
                                      int left_stride[MAX_MB_PLANE]) {
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
 
   // handle above row
   struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride };
   foreach_overlappable_nb_above(cm, xd, mi_col,
-                                max_neighbor_obmc[b_width_log2_lookup[bsize]],
+                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                 build_obmc_inter_pred_above, &ctxt_above);
 
   // handle left column
   struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride };
   foreach_overlappable_nb_left(cm, xd, mi_row,
-                               max_neighbor_obmc[b_height_log2_lookup[bsize]],
+                               max_neighbor_obmc[mi_size_high_log2[bsize]],
                                build_obmc_inter_pred_left, &ctxt_left);
 }
 
-void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
-  if (is_interintra_pred(mbmi)) {
-    mbmi->ref_frame[1] = NONE_FRAME;
-  } else if (has_second_ref(mbmi) &&
-             is_masked_compound_type(mbmi->interinter_compound_type)) {
-    mbmi->interinter_compound_type = COMPOUND_AVERAGE;
-    mbmi->ref_frame[1] = NONE_FRAME;
-#if CONFIG_COMPOUND_SINGLEREF
-  } else if (!has_second_ref(mbmi) &&
-             is_inter_singleref_comp_mode(mbmi->mode)) {
-    // mbmi->mode = compound_ref0_mode(mbmi->mode);
-    mbmi->mode = compound_ref1_mode(mbmi->mode);
-    assert(is_inter_singleref_mode(mbmi->mode));
-    mbmi->mv[0].as_int = mbmi->mv[1].as_int;
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  }
-  if (has_second_ref(mbmi)) mbmi->ref_frame[1] = NONE_FRAME;
-  return;
-}
-
-struct build_prediction_ctxt {
-  const AV1_COMMON *cm;
-  int mi_row;
-  int mi_col;
-  uint8_t **tmp_buf;
-  int *tmp_width;
-  int *tmp_height;
-  int *tmp_stride;
-  int mb_to_far_edge;
-};
-
-static INLINE void build_prediction_by_above_pred(MACROBLOCKD *xd,
-                                                  int rel_mi_col,
-                                                  uint8_t above_mi_width,
-                                                  MODE_INFO *above_mi,
-                                                  void *fun_ctxt) {
-  MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
+void av1_setup_build_prediction_by_above_pred(
+    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+    MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
+    const int num_planes) {
   const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
-  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
   const int above_mi_col = ctxt->mi_col + rel_mi_col;
 
-  MB_MODE_INFO backup_mbmi = *above_mbmi;
-  modify_neighbor_predictor_for_obmc(above_mbmi);
+  av1_modify_neighbor_predictor_for_obmc(above_mbmi);
 
-  for (int j = 0; j < MAX_MB_PLANE; ++j) {
+  for (int j = 0; j < num_planes; ++j) {
     struct macroblockd_plane *const pd = &xd->plane[j];
     setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
                      ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col,
                      NULL, pd->subsampling_x, pd->subsampling_y);
   }
 
-#if CONFIG_COMPOUND_SINGLEREF
-  const int num_refs = 1 + is_inter_anyref_comp_mode(above_mbmi->mode);
-#else
   const int num_refs = 1 + has_second_ref(above_mbmi);
-#endif
 
   for (int ref = 0; ref < num_refs; ++ref) {
-#if CONFIG_COMPOUND_SINGLEREF
-    const MV_REFERENCE_FRAME frame = has_second_ref(above_mbmi)
-                                         ? above_mbmi->ref_frame[ref]
-                                         : above_mbmi->ref_frame[0];
-#else
     const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
-#endif  // CONFIG_COMPOUND_SINGLEREF
 
     const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
 
@@ -2274,31 +1289,37 @@ static INLINE void build_prediction_by_above_pred(MACROBLOCKD *xd,
       aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
     av1_setup_pre_planes(xd, ref, ref_buf->buf, ctxt->mi_row, above_mi_col,
-                         &ref_buf->sf);
+                         &ref_buf->sf, num_planes);
   }
 
   xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
   xd->mb_to_right_edge = ctxt->mb_to_far_edge +
                          (xd->n8_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
+}
 
-  int mi_x = above_mi_col << MI_SIZE_LOG2;
-  int mi_y = ctxt->mi_row << MI_SIZE_LOG2;
+static INLINE void build_prediction_by_above_pred(
+    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+    MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+  const int above_mi_col = ctxt->mi_col + rel_mi_col;
+  int mi_x, mi_y;
+  MB_MODE_INFO backup_mbmi = *above_mbmi;
 
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
+                                           above_mbmi, ctxt, num_planes);
+  mi_x = above_mi_col << MI_SIZE_LOG2;
+  mi_y = ctxt->mi_row << MI_SIZE_LOG2;
 
-  for (int j = 0; j < MAX_MB_PLANE; ++j) {
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+  for (int j = 0; j < num_planes; ++j) {
     const struct macroblockd_plane *pd = &xd->plane[j];
     int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
     int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
                    block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
 
-    if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
-    build_inter_predictors(ctxt->cm, xd, j, above_mi, 1, 0, bw, bh, 0, 0, bw,
-                           bh,
-#if CONFIG_SUPERTX
-                           0, 0,
-#endif  // CONFIG_SUPERTX
-                           mi_x, mi_y);
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+    build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y);
   }
   *above_mbmi = backup_mbmi;
 }
@@ -2322,9 +1343,9 @@ void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                         mi_col,     tmp_buf,
                                         tmp_width,  tmp_height,
                                         tmp_stride, xd->mb_to_right_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   foreach_overlappable_nb_above(cm, xd, mi_col,
-                                max_neighbor_obmc[b_width_log2_lookup[bsize]],
+                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                 build_prediction_by_above_pred, &ctxt);
 
   xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
@@ -2332,40 +1353,27 @@ void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
   xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
 }
 
-static INLINE void build_prediction_by_left_pred(MACROBLOCKD *xd,
-                                                 int rel_mi_row,
-                                                 uint8_t left_mi_height,
-                                                 MODE_INFO *left_mi,
-                                                 void *fun_ctxt) {
-  MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
+void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
+                                             uint8_t left_mi_height,
+                                             MB_MODE_INFO *left_mbmi,
+                                             struct build_prediction_ctxt *ctxt,
+                                             const int num_planes) {
   const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
-  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
   const int left_mi_row = ctxt->mi_row + rel_mi_row;
 
-  MB_MODE_INFO backup_mbmi = *left_mbmi;
-  modify_neighbor_predictor_for_obmc(left_mbmi);
+  av1_modify_neighbor_predictor_for_obmc(left_mbmi);
 
-  for (int j = 0; j < MAX_MB_PLANE; ++j) {
+  for (int j = 0; j < num_planes; ++j) {
     struct macroblockd_plane *const pd = &xd->plane[j];
     setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
                      ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0,
                      NULL, pd->subsampling_x, pd->subsampling_y);
   }
 
-#if CONFIG_COMPOUND_SINGLEREF
-  const int num_refs = 1 + is_inter_anyref_comp_mode(left_mbmi->mode);
-#else
   const int num_refs = 1 + has_second_ref(left_mbmi);
-#endif
 
   for (int ref = 0; ref < num_refs; ++ref) {
-#if CONFIG_COMPOUND_SINGLEREF
-    const MV_REFERENCE_FRAME frame = has_second_ref(left_mbmi)
-                                         ? left_mbmi->ref_frame[ref]
-                                         : left_mbmi->ref_frame[0];
-#else
     const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
-#endif  // CONFIG_COMPOUND_SINGLEREF
 
     const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
 
@@ -2374,31 +1382,37 @@ static INLINE void build_prediction_by_left_pred(MACROBLOCKD *xd,
       aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
     av1_setup_pre_planes(xd, ref, ref_buf->buf, left_mi_row, ctxt->mi_col,
-                         &ref_buf->sf);
+                         &ref_buf->sf, num_planes);
   }
 
   xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
   xd->mb_to_bottom_edge =
       ctxt->mb_to_far_edge +
       (xd->n8_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
+}
 
-  int mi_x = ctxt->mi_col << MI_SIZE_LOG2;
-  int mi_y = left_mi_row << MI_SIZE_LOG2;
+static INLINE void build_prediction_by_left_pred(
+    MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
+    MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
+  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+  const int left_mi_row = ctxt->mi_row + rel_mi_row;
+  int mi_x, mi_y;
+  MB_MODE_INFO backup_mbmi = *left_mbmi;
 
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
+                                          left_mbmi, ctxt, num_planes);
+  mi_x = ctxt->mi_col << MI_SIZE_LOG2;
+  mi_y = left_mi_row << MI_SIZE_LOG2;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
 
-  for (int j = 0; j < MAX_MB_PLANE; ++j) {
+  for (int j = 0; j < num_planes; ++j) {
     const struct macroblockd_plane *pd = &xd->plane[j];
     int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
                    block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
     int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
 
-    if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
-    build_inter_predictors(ctxt->cm, xd, j, left_mi, 1, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_SUPERTX
-                           0, 0,
-#endif  // CONFIG_SUPERTX
-                           mi_x, mi_y);
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+    build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y);
   }
   *left_mbmi = backup_mbmi;
 }
@@ -2422,9 +1436,9 @@ void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                         mi_col,     tmp_buf,
                                         tmp_width,  tmp_height,
                                         tmp_stride, xd->mb_to_bottom_edge };
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   foreach_overlappable_nb_left(cm, xd, mi_row,
-                               max_neighbor_obmc[b_height_log2_lookup[bsize]],
+                               max_neighbor_obmc[mi_size_high_log2[bsize]],
                                build_prediction_by_left_pred, &ctxt);
 
   xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
@@ -2434,13 +1448,9 @@ void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
 
 void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                         int mi_row, int mi_col) {
-#if CONFIG_HIGHBITDEPTH
+  const int num_planes = av1_num_planes(cm);
   DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
   DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-#else
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
-#endif  // CONFIG_HIGHBITDEPTH
   uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
   int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -2449,7 +1459,6 @@ void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
   int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     int len = sizeof(uint16_t);
     dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
@@ -2459,434 +1468,25 @@ void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
     dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
     dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
   } else {
-#endif  // CONFIG_HIGHBITDEPTH
     dst_buf1[0] = tmp_buf1;
     dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
     dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
     dst_buf2[0] = tmp_buf2;
     dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
     dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
-#if CONFIG_HIGHBITDEPTH
   }
-#endif  // CONFIG_HIGHBITDEPTH
   av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
                                       dst_width1, dst_height1, dst_stride1);
   av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
                                      dst_width2, dst_height2, dst_stride2);
-  av1_setup_dst_planes(xd->plane, xd->mi[0]->mbmi.sb_type,
-                       get_frame_new_buffer(cm), mi_row, mi_col);
+  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
+                       mi_row, mi_col, 0, num_planes);
   av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
                                   dst_buf2, dst_stride2);
 }
 
-#if CONFIG_NCOBMC
-void av1_build_prediction_by_bottom_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                          int mi_row, int mi_col,
-                                          uint8_t *tmp_buf[MAX_MB_PLANE],
-                                          int tmp_width[MAX_MB_PLANE],
-                                          int tmp_height[MAX_MB_PLANE],
-                                          int tmp_stride[MAX_MB_PLANE]) {
-  const TileInfo *const tile = &xd->tile;
-#if CONFIG_DEBUG
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-#endif
-  int i, j, mi_step, ref;
-  const int ilimit = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
-  int mb_to_right_edge_base = xd->mb_to_right_edge;
-
-  if (mi_row + xd->n8_h >= tile->mi_row_end ||
-      (mi_row + xd->n8_h) % MI_SIZE == 0 || (mi_row + xd->n8_h) >= cm->mi_rows)
-    return;
-  assert(bsize >= BLOCK_8X8);
-
-  xd->mb_to_top_edge -= xd->n8_h * 32;
-  for (i = 0; i < ilimit; i += mi_step) {
-    int mi_row_offset = xd->n8_h;
-    int mi_col_offset = i;
-    int mi_x, mi_y, bw, bh;
-    MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *mbmi = &mi->mbmi;
-    MB_MODE_INFO backup_mbmi;
-
-    mi_step = AOMMIN(xd->n8_w, mi_size_wide[mbmi->sb_type]);
-
-    if (!is_neighbor_overlappable(mbmi)) continue;
-
-    backup_mbmi = *mbmi;
-    modify_neighbor_predictor_for_obmc(mbmi);
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst, AOMMAX(mbmi->sb_type, BLOCK_8X8), tmp_buf[j],
-                       tmp_width[j], tmp_height[j], tmp_stride[j],
-                       (xd->n8_h >> 1), i, NULL, pd->subsampling_x,
-                       pd->subsampling_y);
-    }
-    for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
-      const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
-      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-      xd->block_refs[ref] = ref_buf;
-      if ((!av1_is_valid_scale(&ref_buf->sf)))
-        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + (xd->n8_h >> 1),
-                           mi_col + i, &ref_buf->sf);
-    }
-
-    xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8);
-    xd->mb_to_right_edge =
-        mb_to_right_edge_base + (xd->n8_w - i - mi_step) * 64;
-    mi_x = (mi_col + i) << MI_SIZE_LOG2;
-    mi_y = (mi_row << MI_SIZE_LOG2) + xd->n8_h * (MI_SIZE >> 1);
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      const struct macroblockd_plane *pd = &xd->plane[j];
-      bw = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_x;
-      bh = (xd->n8_h << (MI_SIZE_LOG2 - 1)) >> pd->subsampling_y;
-
-      if (mbmi->sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
-        const PARTITION_TYPE bp = BLOCK_8X8 - mbmi->sb_type;
-        const int have_vsplit = bp != PARTITION_HORZ;
-        const int have_hsplit = bp != PARTITION_VERT;
-        const int num_4x4_w = 2 >> (!have_vsplit);
-        const int num_4x4_h = 2 >> (!have_hsplit);
-        const int pw = 8 >> (have_vsplit + pd->subsampling_x);
-        int x, y;
-
-        for (y = 0; y < num_4x4_h; ++y)
-          for (x = 0; x < num_4x4_w; ++x) {
-            if ((bp == PARTITION_HORZ || bp == PARTITION_SPLIT) && y != 0)
-              continue;
-
-            build_inter_predictors(cm, xd, j, mi, 1, y * 2 + x, bw, bh,
-                                   (4 * x) >> pd->subsampling_x,
-                                   xd->n8_h == 1 ? (4 >> pd->subsampling_y) : 0,
-                                   pw, bh,
-#if CONFIG_SUPERTX
-                                   0, 0,
-#endif  // CONFIG_SUPERTX
-                                   mi_x, mi_y);
-          }
-      } else {
-        build_inter_predictors(cm, xd, j, mi, 1, 0, bw, bh, 0,
-                               xd->n8_h == 1 ? (4 >> pd->subsampling_y) : 0, bw,
-                               bh,
-#if CONFIG_SUPERTX
-                               0, 0,
-#endif  // CONFIG_SUPERTX
-                               mi_x, mi_y);
-      }
-    }
-    *mbmi = backup_mbmi;
-  }
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = mb_to_right_edge_base;
-  xd->mb_to_top_edge += xd->n8_h * 32;
-}
-
-void av1_build_prediction_by_right_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         int mi_row, int mi_col,
-                                         uint8_t *tmp_buf[MAX_MB_PLANE],
-                                         int tmp_width[MAX_MB_PLANE],
-                                         int tmp_height[MAX_MB_PLANE],
-                                         const int tmp_stride[MAX_MB_PLANE]) {
-  const TileInfo *const tile = &xd->tile;
-#if CONFIG_DEBUG
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-#endif
-  int i, j, mi_step, ref;
-  const int ilimit = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
-  int mb_to_bottom_edge_base = xd->mb_to_bottom_edge;
-
-  if (mi_col + xd->n8_w >= tile->mi_col_end ||
-      (mi_col + xd->n8_w) % MI_SIZE == 0 || (mi_col + xd->n8_w) >= cm->mi_cols)
-    return;
-
-  assert(bsize >= BLOCK_8X8);
-
-  xd->mb_to_left_edge -= xd->n8_w / 2 * MI_SIZE * 8;
-  for (i = 0; i < ilimit; i += mi_step) {
-    int mi_row_offset = i;
-    int mi_col_offset = xd->n8_w;
-    int mi_x, mi_y, bw, bh;
-    MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *mbmi = &mi->mbmi;
-    MB_MODE_INFO backup_mbmi;
-
-    mi_step = AOMMIN(xd->n8_h, mi_size_high[mbmi->sb_type]);
-
-    if (!is_neighbor_overlappable(mbmi)) continue;
-
-    backup_mbmi = *mbmi;
-    modify_neighbor_predictor_for_obmc(mbmi);
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst, AOMMAX(mbmi->sb_type, BLOCK_8X8), tmp_buf[j],
-                       tmp_width[j], tmp_height[j], tmp_stride[j], i,
-                       xd->n8_w >> 1, NULL, pd->subsampling_x,
-                       pd->subsampling_y);
-    }
-    for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
-      const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
-      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-      xd->block_refs[ref] = ref_buf;
-      if ((!av1_is_valid_scale(&ref_buf->sf)))
-        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i,
-                           mi_col + (xd->n8_w >> 1), &ref_buf->sf);
-    }
-
-    xd->mb_to_top_edge = -(((mi_row + i) * MI_SIZE) * 8);
-    xd->mb_to_bottom_edge =
-        mb_to_bottom_edge_base + (xd->n8_h - i - mi_step) * MI_SIZE * 8;
-    mi_x = (mi_col << MI_SIZE_LOG2) + xd->n8_w * (MI_SIZE >> 1);
-    mi_y = (mi_row + i) << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      const struct macroblockd_plane *pd = &xd->plane[j];
-      bw = (xd->n8_w << (MI_SIZE_LOG2 - 1)) >> pd->subsampling_x;
-      bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
-
-      if (mbmi->sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
-        const PARTITION_TYPE bp = BLOCK_8X8 - mbmi->sb_type;
-        const int have_vsplit = bp != PARTITION_HORZ;
-        const int have_hsplit = bp != PARTITION_VERT;
-        const int num_4x4_w = 2 >> (!have_vsplit);
-        const int num_4x4_h = 2 >> (!have_hsplit);
-        const int ph = 8 >> (have_hsplit + pd->subsampling_y);
-        int x, y;
-
-        for (y = 0; y < num_4x4_h; ++y)
-          for (x = 0; x < num_4x4_w; ++x) {
-            if ((bp == PARTITION_VERT || bp == PARTITION_SPLIT) && x != 0)
-              continue;
-
-            build_inter_predictors(cm, xd, j, mi, 1, y * 2 + x, bw, bh,
-                                   xd->n8_w == 1 ? 4 >> pd->subsampling_x : 0,
-                                   (4 * y) >> pd->subsampling_y, bw, ph,
-#if CONFIG_SUPERTX
-                                   0, 0,
-#endif  // CONFIG_SUPERTX
-                                   mi_x, mi_y);
-          }
-      } else {
-        build_inter_predictors(cm, xd, j, mi, 1, 0, bw, bh,
-                               xd->n8_w == 1 ? 4 >> pd->subsampling_x : 0, 0,
-                               bw, bh,
-#if CONFIG_SUPERTX
-                               0, 0,
-#endif  // CONFIG_SUPERTX
-                               mi_x, mi_y);
-      }
-    }
-    *mbmi = backup_mbmi;
-  }
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_bottom_edge = mb_to_bottom_edge_base;
-  xd->mb_to_left_edge += xd->n8_w / 2 * MI_SIZE * 8;
-}
-
-// This function combines motion compensated predictions that is generated by
-// bottom/right neighboring blocks' inter predictors with prediction in dst
-// buffer.
-void av1_merge_dst_bottom_right_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                      int mi_row, int mi_col,
-                                      uint8_t *bottom[MAX_MB_PLANE],
-                                      const int bottom_stride[MAX_MB_PLANE],
-                                      uint8_t *right[MAX_MB_PLANE],
-                                      const int right_stride[MAX_MB_PLANE]) {
-  const TileInfo *const tile = &xd->tile;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int plane, i, mi_step;
-  const int bottom_available = mi_row + xd->n8_h < tile->mi_row_end &&
-                               (mi_row + xd->n8_h) % MI_SIZE != 0 &&
-                               (mi_row + xd->n8_h) < cm->mi_rows;
-#if CONFIG_HIGHBITDEPTH
-  int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-#endif  // CONFIG_HIGHBITDEPTH
-
-  // handle bottom row
-  for (i = 0; bottom_available && i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
-       i += mi_step) {
-    int mi_row_offset = xd->n8_h;
-    int mi_col_offset = i;
-    MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *mbmi = &mi->mbmi;
-    int overlap;
-
-    mi_step = AOMMIN(xd->n8_w, mi_size_wide[mbmi->sb_type]);
-
-    if (!is_neighbor_overlappable(mbmi)) continue;
-
-    overlap = num_4x4_blocks_high_lookup[bsize] << 1;
-
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      const struct macroblockd_plane *pd = &xd->plane[plane];
-      const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
-      const int bh = overlap >> pd->subsampling_y;
-      const int dst_stride = pd->dst.stride;
-      uint8_t *dst =
-          &pd->dst.buf[((i * MI_SIZE) >> pd->subsampling_x) +
-                       (((xd->n8_h * MI_SIZE - overlap) * dst_stride) >>
-                        pd->subsampling_y)];
-      const int tmp_stride = bottom_stride[plane];
-      const uint8_t *const tmp =
-          &bottom[plane][((i * MI_SIZE) >> pd->subsampling_x) +
-                         (((xd->n8_h * MI_SIZE - overlap) * tmp_stride) >>
-                          pd->subsampling_y)];
-      const uint8_t *const mask = av1_get_obmc_mask_flipped(bh);
-
-#if CONFIG_HIGHBITDEPTH
-      if (is_hbd)
-        aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
-                                   tmp_stride, mask, bh, bw, xd->bd);
-      else
-#endif  // CONFIG_HIGHBITDEPTH
-        aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
-                            mask, bh, bw);
-    }
-  }  // each mi in the bottom row
-
-  // handle right column
-  if (mi_col + xd->n8_w >= tile->mi_col_end ||
-      (mi_col + xd->n8_w) % MI_SIZE == 0 || (mi_col + xd->n8_w) >= cm->mi_cols)
-    return;
-
-  for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
-    int mi_row_offset = i;
-    int mi_col_offset = xd->n8_w;
-    int overlap;
-    MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *mbmi = &mi->mbmi;
-
-    mi_step = AOMMIN(xd->n8_h, mi_size_high[mbmi->sb_type]);
-
-    if (!is_neighbor_overlappable(mbmi)) continue;
-
-    overlap = num_4x4_blocks_wide_lookup[bsize] << 1;
-
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      const struct macroblockd_plane *pd = &xd->plane[plane];
-      const int bw = overlap >> pd->subsampling_x;
-      const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
-      const int dst_stride = pd->dst.stride;
-      uint8_t *dst =
-          &pd->dst.buf[((i * MI_SIZE * dst_stride) >> pd->subsampling_y) +
-                       ((xd->n8_w * MI_SIZE - overlap) >> pd->subsampling_x)];
-      const int tmp_stride = right_stride[plane];
-      const uint8_t *const tmp =
-          &right[plane][((i * MI_SIZE * tmp_stride) >> pd->subsampling_y) +
-                        ((xd->n8_w * MI_SIZE - overlap) >> pd->subsampling_x)];
-      const uint8_t *const mask = av1_get_obmc_mask_flipped(bw);
-
-#if CONFIG_HIGHBITDEPTH
-      if (is_hbd)
-        aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
-                                   tmp_stride, mask, bh, bw, xd->bd);
-      else
-#endif  // CONFIG_HIGHBITDEPTH
-        aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
-                            mask, bh, bw);
-    }
-  }  // each mi in the right column
-}
-
-// This function generates 4 sided obmc. (1) Prediction blocks generated by
-// bottom and right motion vectors are calculated. (2) Combine them with the
-// original prediction block (which should be pre-stored in xd->plane[].dst.buf
-// before calling this function). The results is updated in xd->plane[].dst.buf
-// (3) Call causal obmc prediction function, which will generate left and above
-// preds, and then merge them and xd->plane[].dst.buf.
-void av1_build_ncobmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                          int mi_row, int mi_col) {
-#if CONFIG_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-#else
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
-#endif  // CONFIG_HIGHBITDEPTH
-  uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
-  int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    int len = sizeof(uint16_t);
-    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
-    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
-    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
-    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
-    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
-    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
-  } else {
-#endif  // CONFIG_HIGHBITDEPTH
-    dst_buf1[0] = tmp_buf1;
-    dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
-    dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
-    dst_buf2[0] = tmp_buf2;
-    dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
-    dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
-#if CONFIG_HIGHBITDEPTH
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  // TODO(zoeliu): COMPOUND_SINGLEREF has not worked with NCOBMC yet.
-  av1_build_prediction_by_bottom_preds(cm, xd, mi_row, mi_col, dst_buf1,
-                                       dst_width1, dst_height1, dst_stride1);
-  av1_build_prediction_by_right_preds(cm, xd, mi_row, mi_col, dst_buf2,
-                                      dst_width2, dst_height2, dst_stride2);
-  av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
-                       mi_col);
-  av1_merge_dst_bottom_right_preds(cm, xd, mi_row, mi_col, dst_buf1,
-                                   dst_stride1, dst_buf2, dst_stride2);
-  av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
-                       mi_col);
-  av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-  av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
-                       mi_col);
-}
-#endif  // CONFIG_NCOBMC
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-void reset_xd_boundary(MACROBLOCKD *xd, int mi_row, int bh, int mi_col, int bw,
-                       int mi_rows, int mi_cols) {
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
-}
-void set_sb_mi_boundaries(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                          const int mi_row, const int mi_col) {
-  const BLOCK_SIZE sb = cm->sb_size;
-  const int num_mi_w = mi_size_wide[sb];
-  const int num_mi_h = mi_size_high[sb];
-
-  xd->sb_mi_bd.mi_col_begin = mi_col;
-  xd->sb_mi_bd.mi_row_begin = mi_row;
-  // points to the last mi
-  xd->sb_mi_bd.mi_col_end =
-      mi_col + num_mi_w > cm->mi_cols ? cm->mi_cols - 1 : mi_col + num_mi_w - 1;
-  xd->sb_mi_bd.mi_row_end =
-      mi_row + num_mi_h > cm->mi_rows ? cm->mi_rows - 1 : mi_row + num_mi_h - 1;
-}
-#endif
-
-#endif  // CONFIG_MOTION_VAR
-
 /* clang-format off */
-#if CONFIG_INTERINTRA
-#if CONFIG_EXT_PARTITION
-static const int ii_weights1d[MAX_SB_SIZE] = {
+static const uint8_t ii_weights1d[MAX_SB_SIZE] = {
   60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
   31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
   16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10,  9,  9,  9,  8,
@@ -2895,103 +1495,82 @@ static const int ii_weights1d[MAX_SB_SIZE] = {
   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,
   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
 };
-static int ii_size_scales[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    32, 32, 32,
-#endif
+static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = {
     32, 16, 16, 16, 8, 8, 8, 4,
     4,  4,  2,  2,  2, 1, 1, 1,
-    16, 16, 8, 8, 4, 4, 2, 2
-};
-#else
-static const int ii_weights1d[MAX_SB_SIZE] = {
-  60, 56, 52, 48, 45, 42, 39, 37, 34, 32, 30, 28, 26, 24, 22, 21,
-  19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 10,  9,  8,  8,  7,  7,
-  6,  6,  6,  5,  5,  4,  4,  4,  4,  3,  3,  3,  3,  3,  2,  2,
-  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
-};
-static int ii_size_scales[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    16, 16, 16,
-#endif
-    16, 8, 8, 8, 4, 4, 4,
-    2,  2, 2, 1, 1, 1,
-    8, 8, 4, 4, 2, 2,
+    8,  8,  4,  4,  2, 2
 };
 /* clang-format on */
-#endif  // CONFIG_EXT_PARTITION
 
-static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
-                               int wedge_index, int wedge_sign,
-                               BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
-                               uint8_t *comppred, int compstride,
-                               const uint8_t *interpred, int interstride,
-                               const uint8_t *intrapred, int intrastride) {
+static void build_smooth_interintra_mask(uint8_t *mask, int stride,
+                                         BLOCK_SIZE plane_bsize,
+                                         INTERINTRA_MODE mode) {
+  int i, j;
   const int bw = block_size_wide[plane_bsize];
   const int bh = block_size_high[plane_bsize];
   const int size_scale = ii_size_scales[plane_bsize];
-  int i, j;
-
-  if (use_wedge_interintra) {
-    if (is_interintra_wedge_used(bsize)) {
-      const uint8_t *mask =
-          av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
-      const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
-      const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
-      aom_blend_a64_mask(comppred, compstride, intrapred, intrastride,
-                         interpred, interstride, mask, block_size_wide[bsize],
-                         bh, bw, subh, subw);
-    }
-    return;
-  }
 
   switch (mode) {
     case II_V_PRED:
       for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int scale = ii_weights1d[i * size_scale];
-          comppred[i * compstride + j] =
-              AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
-                            interpred[i * interstride + j]);
-        }
+        memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0]));
+        mask += stride;
       }
       break;
 
     case II_H_PRED:
       for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int scale = ii_weights1d[j * size_scale];
-          comppred[i * compstride + j] =
-              AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
-                            interpred[i * interstride + j]);
-        }
+        for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale];
+        mask += stride;
       }
       break;
 
     case II_SMOOTH_PRED:
       for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int scale = ii_weights1d[(i < j ? i : j) * size_scale];
-          comppred[i * compstride + j] =
-              AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
-                            interpred[i * interstride + j]);
-        }
+        for (j = 0; j < bw; ++j)
+          mask[j] = ii_weights1d[(i < j ? i : j) * size_scale];
+        mask += stride;
       }
       break;
 
     case II_DC_PRED:
     default:
       for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          comppred[i * compstride + j] = AOM_BLEND_AVG(
-              intrapred[i * intrastride + j], interpred[i * interstride + j]);
-        }
+        memset(mask, 32, bw * sizeof(mask[0]));
+        mask += stride;
       }
       break;
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
+static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
+                               int wedge_index, int wedge_sign,
+                               BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+                               uint8_t *comppred, int compstride,
+                               const uint8_t *interpred, int interstride,
+                               const uint8_t *intrapred, int intrastride) {
+  const int bw = block_size_wide[plane_bsize];
+  const int bh = block_size_high[plane_bsize];
+
+  if (use_wedge_interintra) {
+    if (is_interintra_wedge_used(bsize)) {
+      const uint8_t *mask =
+          av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+      const int subw = 2 * mi_size_wide[bsize] == bw;
+      const int subh = 2 * mi_size_high[bsize] == bh;
+      aom_blend_a64_mask(comppred, compstride, intrapred, intrastride,
+                         interpred, interstride, mask, block_size_wide[bsize],
+                         bw, bh, subw, subh);
+    }
+    return;
+  }
+
+  uint8_t mask[MAX_SB_SQUARE];
+  build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
+  aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred,
+                     interstride, mask, bw, bw, bh, 0, 0);
+}
+
 static void combine_interintra_highbd(
     INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index,
     int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
@@ -2999,72 +1578,26 @@ static void combine_interintra_highbd(
     int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
   const int bw = block_size_wide[plane_bsize];
   const int bh = block_size_high[plane_bsize];
-  const int size_scale = ii_size_scales[plane_bsize];
-  int i, j;
-
-  uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
-  const uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
-  const uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
 
   if (use_wedge_interintra) {
     if (is_interintra_wedge_used(bsize)) {
       const uint8_t *mask =
           av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
-      const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
-      const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
+      const int subh = 2 * mi_size_high[bsize] == bh;
+      const int subw = 2 * mi_size_wide[bsize] == bw;
       aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
                                 interpred8, interstride, mask,
-                                block_size_wide[bsize], bh, bw, subh, subw, bd);
+                                block_size_wide[bsize], bw, bh, subw, subh, bd);
     }
     return;
   }
 
-  switch (mode) {
-    case II_V_PRED:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int scale = ii_weights1d[i * size_scale];
-          comppred[i * compstride + j] =
-              AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
-                            interpred[i * interstride + j]);
-        }
-      }
-      break;
-
-    case II_H_PRED:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int scale = ii_weights1d[j * size_scale];
-          comppred[i * compstride + j] =
-              AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
-                            interpred[i * interstride + j]);
-        }
-      }
-      break;
-
-    case II_SMOOTH_PRED:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          int scale = ii_weights1d[(i < j ? i : j) * size_scale];
-          comppred[i * compstride + j] =
-              AOM_BLEND_A64(scale, intrapred[i * intrastride + j],
-                            interpred[i * interstride + j]);
-        }
-      }
-      break;
-
-    case II_DC_PRED:
-    default:
-      for (i = 0; i < bh; ++i) {
-        for (j = 0; j < bw; ++j) {
-          comppred[i * compstride + j] = AOM_BLEND_AVG(
-              interpred[i * interstride + j], intrapred[i * intrastride + j]);
-        }
-      }
-      break;
-  }
+  uint8_t mask[MAX_SB_SQUARE];
+  build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
+  aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
+                            interpred8, interstride, mask, bw, bw, bh, 0, 0,
+                            bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
                                                MACROBLOCKD *xd,
@@ -3072,42 +1605,46 @@ void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
                                                BUFFER_SET *ctx, uint8_t *dst,
                                                int dst_stride) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
-  PREDICTION_MODE mode =
-      interintra_to_intra_mode[xd->mi[0]->mbmi.interintra_mode];
+  const int ssx = xd->plane[plane].subsampling_x;
+  const int ssy = xd->plane[plane].subsampling_y;
+  BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+  PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode];
+  xd->mi[0]->angle_delta[PLANE_TYPE_Y] = 0;
+  xd->mi[0]->angle_delta[PLANE_TYPE_UV] = 0;
+  xd->mi[0]->filter_intra_mode_info.use_filter_intra = 0;
+  xd->mi[0]->use_intrabc = 0;
 
-  av1_predict_intra_block(cm, xd, pd->width, pd->height, plane_bsize, mode,
-                          ctx->plane[plane], ctx->stride[plane], dst,
-                          dst_stride, 0, 0, plane);
+  av1_predict_intra_block(cm, xd, pd->width, pd->height,
+                          max_txsize_rect_lookup[plane_bsize], mode, 0, 0,
+                          FILTER_INTRA_MODES, ctx->plane[plane],
+                          ctx->stride[plane], dst, dst_stride, 0, 0, plane);
 }
 
 void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                             const uint8_t *inter_pred, int inter_stride,
                             const uint8_t *intra_pred, int intra_stride) {
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
-#if CONFIG_HIGHBITDEPTH
+  const int ssx = xd->plane[plane].subsampling_x;
+  const int ssy = xd->plane[plane].subsampling_y;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     combine_interintra_highbd(
-        xd->mi[0]->mbmi.interintra_mode, xd->mi[0]->mbmi.use_wedge_interintra,
-        xd->mi[0]->mbmi.interintra_wedge_index,
-        xd->mi[0]->mbmi.interintra_wedge_sign, bsize, plane_bsize,
-        xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, inter_pred,
-        inter_stride, intra_pred, intra_stride, xd->bd);
+        xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
+        xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
+        bsize, plane_bsize, xd->plane[plane].dst.buf,
+        xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred,
+        intra_stride, xd->bd);
     return;
   }
-#endif  // CONFIG_HIGHBITDEPTH
-  combine_interintra(xd->mi[0]->mbmi.interintra_mode,
-                     xd->mi[0]->mbmi.use_wedge_interintra,
-                     xd->mi[0]->mbmi.interintra_wedge_index,
-                     xd->mi[0]->mbmi.interintra_wedge_sign, bsize, plane_bsize,
-                     xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
-                     inter_pred, inter_stride, intra_pred, intra_stride);
+  combine_interintra(
+      xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
+      xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
+      bsize, plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+      inter_pred, inter_stride, intra_pred, intra_stride);
 }
 
 void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          uint8_t *ypred, int ystride,
                                          BUFFER_SET *ctx, BLOCK_SIZE bsize) {
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
     av1_build_intra_predictors_for_interintra(
@@ -3116,7 +1653,6 @@ void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
                            CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
     return;
   }
-#endif  // CONFIG_HIGHBITDEPTH
   {
     DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
     av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, ctx,
@@ -3130,7 +1666,6 @@ void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          uint8_t *upred, int ustride,
                                          BUFFER_SET *ctx, int plane,
                                          BLOCK_SIZE bsize) {
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]);
     av1_build_intra_predictors_for_interintra(
@@ -3138,10 +1673,7 @@ void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd,
         MAX_SB_SIZE);
     av1_combine_interintra(xd, bsize, plane, upred, ustride,
                            CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
-    return;
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-  {
+  } else {
     DECLARE_ALIGNED(16, uint8_t, uintrapredictor[MAX_SB_SQUARE]);
     av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
                                               uintrapredictor, MAX_SB_SIZE);
@@ -3167,966 +1699,119 @@ void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
   av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride,
                                        ctx, bsize);
 }
-#endif  // CONFIG_INTERINTRA
 
 // Builds the inter-predictor for the single ref case
 // for use in the encoder to search the wedges efficiently.
 static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
-                                              int block, int bw, int bh, int x,
-                                              int y, int w, int h, int mi_x,
-                                              int mi_y, int ref,
-                                              uint8_t *const ext_dst,
-                                              int ext_dst_stride) {
+                                              int bw, int bh, int x, int y,
+                                              int w, int h, int mi_x, int mi_y,
+                                              int ref, uint8_t *const ext_dst,
+                                              int ext_dst_stride,
+                                              int can_use_previous) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const MODE_INFO *mi = xd->mi[0];
+  const MB_MODE_INFO *mi = xd->mi[0];
 
   const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
   struct buf_2d *const pre_buf = &pd->pre[ref];
-#if CONFIG_HIGHBITDEPTH
+  const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
   uint8_t *const dst =
-      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? CONVERT_TO_BYTEPTR(ext_dst)
-                                                   : ext_dst) +
-      ext_dst_stride * y + x;
-#else
-  uint8_t *const dst = ext_dst + ext_dst_stride * y + x;
-#endif
-  const MV mv = mi->mbmi.sb_type < BLOCK_8X8
-                    ? average_split_mvs(pd, mi, ref, block)
-                    : mi->mbmi.mv[ref].as_mv;
+      (hbd ? CONVERT_TO_BYTEPTR(ext_dst) : ext_dst) + ext_dst_stride * y + x;
+  const MV mv = mi->mv[ref].as_mv;
 
-  uint8_t *pre;
-  int xs, ys, subpel_x, subpel_y;
-  const int is_scaled = av1_is_scaled(sf);
-  ConvolveParams conv_params = get_conv_params(ref, 0, plane);
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+  ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
   WarpTypesAllowed warp_types;
-#if CONFIG_GLOBAL_MOTION
-#if CONFIG_COMPOUND_SINGLEREF
-  WarpedMotionParams *const wm =
-      mi->mbmi.ref_frame[ref] > 0 ? &xd->global_motion[mi->mbmi.ref_frame[ref]]
-                                  : &xd->global_motion[mi->mbmi.ref_frame[0]];
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-  WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  warp_types.global_warp_allowed = is_global_mv_block(mi, block, wm->wmtype);
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-  warp_types.local_warp_allowed = mi->mbmi.motion_mode == WARPED_CAUSAL;
-#endif  // CONFIG_WARPED_MOTION
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-
-  if (is_scaled) {
-    int ssx = pd->subsampling_x;
-    int ssy = pd->subsampling_y;
-    int orig_pos_y = (mi_y << (SUBPEL_BITS - ssy)) + (y << SUBPEL_BITS);
-    orig_pos_y += mv.row * (1 << (1 - ssy));
-    int orig_pos_x = (mi_x << (SUBPEL_BITS - ssx)) + (x << SUBPEL_BITS);
-    orig_pos_x += mv.col * (1 << (1 - ssx));
-    int pos_y = sf->scale_value_y(orig_pos_y, sf);
-    int pos_x = sf->scale_value_x(orig_pos_x, sf);
-    pos_x += SCALE_EXTRA_OFF;
-    pos_y += SCALE_EXTRA_OFF;
-
-    const int top = -((AOM_INTERP_EXTEND + bh) << SCALE_SUBPEL_BITS);
-    const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                       << SCALE_SUBPEL_BITS;
-    const int left = -((AOM_INTERP_EXTEND + bw) << SCALE_SUBPEL_BITS);
-    const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
-    pos_y = clamp(pos_y, top, bottom);
-    pos_x = clamp(pos_x, left, right);
-
-    pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-          (pos_x >> SCALE_SUBPEL_BITS);
-    subpel_x = pos_x & SCALE_SUBPEL_MASK;
-    subpel_y = pos_y & SCALE_SUBPEL_MASK;
-    xs = sf->x_step_q4;
-    ys = sf->y_step_q4;
-  } else {
-    const MV mv_q4 = clamp_mv_to_umv_border_sb(
-        xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
-    xs = ys = SCALE_SUBPEL_SHIFTS;
-    subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-    subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
-    pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
-          (x + (mv_q4.col >> SUBPEL_BITS));
-  }
+  const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+  warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
+  warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+  const int pre_x = (mi_x) >> pd->subsampling_x;
+  const int pre_y = (mi_y) >> pd->subsampling_y;
+  uint8_t *pre;
+  SubpelParams subpel_params;
+  calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
+                     &subpel_params, bw, bh);
 
-  av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride, subpel_x,
-                           subpel_y, sf, w, h, &conv_params,
-                           mi->mbmi.interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                           &warp_types, (mi_x >> pd->subsampling_x) + x,
-                           (mi_y >> pd->subsampling_y) + y, plane, ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR
-                           mi, 0,
-#endif
-                           xs, ys, xd);
+  av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
+                           &subpel_params, sf, w, h, &conv_params,
+                           mi->interp_filters, &warp_types, pre_x + x,
+                           pre_y + y, plane, ref, mi, 0, xd, can_use_previous);
 }
 
 void av1_build_inter_predictors_for_planes_single_buf(
     MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
-    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3]) {
+    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+    int can_use_previous) {
   int plane;
   const int mi_x = mi_col * MI_SIZE;
   const int mi_y = mi_row * MI_SIZE;
   for (plane = plane_from; plane <= plane_to; ++plane) {
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsize, &xd->plane[plane]);
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(
+        bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
     const int bw = block_size_wide[plane_bsize];
     const int bh = block_size_high[plane_bsize];
-
-    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
-      int x, y;
-      const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-      const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-      assert(bsize == BLOCK_8X8);
-#if CONFIG_COMPOUND_SINGLEREF
-      assert(has_second_ref(&xd->mi[0]->mbmi) ||
-             !is_inter_singleref_comp_mode(xd->mi[0]->mbmi.mode));
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      for (y = 0; y < num_4x4_h; ++y)
-        for (x = 0; x < num_4x4_w; ++x)
-          build_inter_predictors_single_buf(
-              xd, plane, y * 2 + x, bw, bh, 4 * x, 4 * y, 4, 4, mi_x, mi_y, ref,
-              ext_dst[plane], ext_dst_stride[plane]);
-    } else {
-      build_inter_predictors_single_buf(xd, plane, 0, bw, bh, 0, 0, bw, bh,
-                                        mi_x, mi_y, ref, ext_dst[plane],
-                                        ext_dst_stride[plane]);
-    }
+    build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x,
+                                      mi_y, ref, ext_dst[plane],
+                                      ext_dst_stride[plane], can_use_previous);
   }
 }
 
 static void build_wedge_inter_predictor_from_buf(
-    MACROBLOCKD *xd, int plane, int x, int y, int w, int h,
-#if CONFIG_SUPERTX
-    int wedge_offset_x, int wedge_offset_y,
-#endif  // CONFIG_SUPERTX
-    uint8_t *ext_dst0, int ext_dst_stride0, uint8_t *ext_dst1,
-    int ext_dst_stride1) {
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+    MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
+    int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const int is_compound = has_second_ref(mbmi);
   MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
   struct buf_2d *const dst_buf = &pd->dst;
   uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-  const INTERINTER_COMPOUND_DATA comp_data = {
-#if CONFIG_WEDGE
-    mbmi->wedge_index,
-    mbmi->wedge_sign,
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-    mbmi->mask_type,
-    xd->seg_mask,
-#endif  // CONFIG_COMPOUND_SEGMENT
-    mbmi->interinter_compound_type
-  };
-
-#if CONFIG_COMPOUND_SINGLEREF
-  if ((is_compound || is_inter_singleref_comp_mode(mbmi->mode)) &&
-      is_masked_compound_type(mbmi->interinter_compound_type))
-#else   // !CONFIG_COMPOUND_SINGLEREF
-  if (is_compound && is_masked_compound_type(mbmi->interinter_compound_type))
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  {
-#if CONFIG_COMPOUND_SEGMENT
-    if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) {
-#if CONFIG_HIGHBITDEPTH
+  mbmi->interinter_comp.seg_mask = xd->seg_mask;
+  const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
+
+  if (is_compound && is_masked_compound_type(comp_data->type)) {
+    if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        build_compound_seg_mask_highbd(
-            comp_data.seg_mask, comp_data.mask_type,
+        av1_build_compound_diffwtd_mask_highbd(
+            comp_data->seg_mask, comp_data->mask_type,
             CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-            CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, mbmi->sb_type, h, w,
-            xd->bd);
+            CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
       else
-#endif  // CONFIG_HIGHBITDEPTH
-        build_compound_seg_mask(comp_data.seg_mask, comp_data.mask_type,
-                                ext_dst0, ext_dst_stride0, ext_dst1,
-                                ext_dst_stride1, mbmi->sb_type, h, w);
+        av1_build_compound_diffwtd_mask(
+            comp_data->seg_mask, comp_data->mask_type, ext_dst0,
+            ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
     }
-#endif  // CONFIG_COMPOUND_SEGMENT
 
-#if CONFIG_SUPERTX
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      build_masked_compound_wedge_extend_highbd(
-          dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, &comp_data,
-          mbmi->sb_type, wedge_offset_x, wedge_offset_y, h, w, xd->bd);
-    else
-#endif  // CONFIG_HIGHBITDEPTH
-      build_masked_compound_wedge_extend(
-          dst, dst_buf->stride, ext_dst0, ext_dst_stride0, ext_dst1,
-          ext_dst_stride1, &comp_data, mbmi->sb_type, wedge_offset_x,
-          wedge_offset_y, h, w);
-#else  // !CONFIG_SUPERTX
-#if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
       build_masked_compound_highbd(
           dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, &comp_data,
+          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
           mbmi->sb_type, h, w, xd->bd);
     else
-#endif  // CONFIG_HIGHBITDEPTH
       build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
-                            ext_dst1, ext_dst_stride1, &comp_data,
-                            mbmi->sb_type, h, w);
-#endif  // CONFIG_SUPERTX
+                            ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
+                            h, w);
   } else {
-#if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
       aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
                                dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
                                xd->bd);
     else
-#endif  // CONFIG_HIGHBITDEPTH
       aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
                         0, NULL, 0, w, h);
   }
 }
 
-void av1_build_wedge_inter_predictor_from_buf(
-    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to,
-#if CONFIG_SUPERTX
-    int wedge_offset_x, int wedge_offset_y,
-#endif  // CONFIG_SUPERTX
-    uint8_t *ext_dst0[3], int ext_dst_stride0[3], uint8_t *ext_dst1[3],
-    int ext_dst_stride1[3]) {
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int plane_from, int plane_to,
+                                              uint8_t *ext_dst0[3],
+                                              int ext_dst_stride0[3],
+                                              uint8_t *ext_dst1[3],
+                                              int ext_dst_stride1[3]) {
   int plane;
   for (plane = plane_from; plane <= plane_to; ++plane) {
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(bsize, &xd->plane[plane]);
-
-    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8 && !CONFIG_CB4X4) {
-      int x, y;
-      const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-      const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-      assert(bsize == BLOCK_8X8);
-      for (y = 0; y < num_4x4_h; ++y)
-        for (x = 0; x < num_4x4_w; ++x)
-          build_wedge_inter_predictor_from_buf(
-              xd, plane, 4 * x, 4 * y, 4, 4,
-#if CONFIG_SUPERTX
-              wedge_offset_x, wedge_offset_y,
-#endif  // CONFIG_SUPERTX
-              ext_dst0[plane], ext_dst_stride0[plane], ext_dst1[plane],
-              ext_dst_stride1[plane]);
-    } else {
-      const int bw = block_size_wide[plane_bsize];
-      const int bh = block_size_high[plane_bsize];
-      build_wedge_inter_predictor_from_buf(
-          xd, plane, 0, 0, bw, bh,
-#if CONFIG_SUPERTX
-          wedge_offset_x, wedge_offset_y,
-#endif  // CONFIG_SUPERTX
-          ext_dst0[plane], ext_dst_stride0[plane], ext_dst1[plane],
-          ext_dst_stride1[plane]);
-    }
-  }
-}
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-
-void alloc_ncobmc_pred_buffer(MACROBLOCKD *const xd) {
-  int i;
-  // allocate interpolated prediction buffer
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    xd->ncobmc_pred_buf[i] = (uint8_t *)malloc(sizeof(uint8_t) * MAX_SB_SQUARE);
-    av1_zero_array(xd->ncobmc_pred_buf[i], MAX_SB_SQUARE);
-    xd->ncobmc_pred_buf_stride[i] = MAX_SB_SIZE;
-  }
-}
-
-void free_ncobmc_pred_buffer(MACROBLOCKD *const xd) {
-  for (int i = 0; i < MAX_MB_PLANE; ++i) free(xd->ncobmc_pred_buf[i]);
-}
-
-void get_pred_from_intrpl_buf(MACROBLOCKD *xd, int mi_row, int mi_col,
-                              BLOCK_SIZE bsize, int plane) {
-  uint8_t *dst = xd->plane[plane].dst.buf;
-  int ds = xd->plane[plane].dst.stride;
-  int ss_x = xd->plane[plane].subsampling_x;
-  int ss_y = xd->plane[plane].subsampling_y;
-
-  const int ip_wide = mi_size_wide[bsize] * MI_SIZE >> ss_x;
-  const int ip_high = mi_size_high[bsize] * MI_SIZE >> ss_y;
-  // relative coordinates of this MI in the superblock
-  int row_rlt = (mi_row - xd->sb_mi_bd.mi_row_begin) * MI_SIZE >> ss_y;
-  int col_rlt = (mi_col - xd->sb_mi_bd.mi_col_begin) * MI_SIZE >> ss_x;
-  int s = xd->ncobmc_pred_buf_stride[plane];
-  int r, c;
-
-  for (r = 0; r < ip_high; ++r) {
-    for (c = 0; c < ip_wide; ++c) {
-      dst[r * ds + c] =
-          xd->ncobmc_pred_buf[plane][(r + row_rlt) * s + c + col_rlt];
-    }
-  }
-}
-// scaling factors for ncobmc kernels
-#define KERNEL_SCALE_LOG 14
-
-void build_ncobmc_intrpl_pred(const AV1_COMMON *const cm, MACROBLOCKD *xd,
-                              int plane, int pxl_row, int pxl_col,
-                              BLOCK_SIZE bsize, uint8_t *preds[][MAX_MB_PLANE],
-                              int stride[MAX_MB_PLANE],  // pred buffer strides
-                              int mode) {
-  const ADAPT_OVERLAP_BLOCK ao_block = adapt_overlap_block_lookup[bsize];
-  const NCOBMC_KERNELS *const knls = &cm->ncobmc_kernels[ao_block][mode];
-  const int wide = mi_size_wide[bsize] * MI_SIZE;
-  const int high = mi_size_high[bsize] * MI_SIZE;
-  const int s = stride[plane];
-  const int ss_x = xd->plane[plane].subsampling_x;
-  const int ss_y = xd->plane[plane].subsampling_y;
-  int row_offset = (pxl_row - xd->sb_mi_bd.mi_row_begin * MI_SIZE) >> ss_y;
-  int col_offset = (pxl_col - xd->sb_mi_bd.mi_col_begin * MI_SIZE) >> ss_x;
-  int dst_stride = xd->ncobmc_pred_buf_stride[plane];
-  int dst_offset = row_offset * dst_stride + col_offset;
-
-#if CONFIG_HIGHBITDEPTH
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-#else
-  const int is_hbd = 0;
-#endif  // CONFIG_HIGHBITDEPTH
-
-  int r, c, k_r, k_c;
-  int64_t tmp;
-
-  for (r = 0; r < (high >> ss_x); ++r) {
-    for (c = 0; c < (wide >> ss_y); ++c) {
-      int pos = r * s + c;
-      int q_tmp;
-      uint8_t val;
-
-      // TODO(weitinglin): find out the optimal sub-sampling patterns for
-      //                   chroma
-      k_r = (r << ss_y) + ss_y;
-      k_c = (c << ss_x) + ss_x;
-      if (ss_y && k_r >= high) k_r -= 1;
-      if (ss_x && k_c >= wide) k_c -= 1;
-
-      if (!is_hbd) {
-        uint8_t *tmp_p[4];
-        int i;
-        for (i = 0; i < 4; ++i) tmp_p[i] = preds[i][plane];
-
-        tmp = 0;
-        for (i = 0; i < 4; ++i)
-          tmp += knls->KERNEL[i][k_r][k_c] * tmp_p[i][pos];
-
-      } else {
-        uint16_t *tmp_p[4];
-        int i;
-        for (i = 0; i < 4; ++i) tmp_p[i] = CONVERT_TO_SHORTPTR(preds[i][plane]);
-
-        tmp = 0;
-        for (i = 0; i < 4; ++i)
-          tmp += knls->KERNEL[i][k_r][k_c] * tmp_p[i][pos];
-      }
-
-      q_tmp = (tmp <= 0) ? 0 : ROUND_POWER_OF_TWO(tmp, KERNEL_SCALE_LOG);
-      val = clip_pixel(q_tmp);
-
-      xd->ncobmc_pred_buf[plane][r * dst_stride + c + dst_offset] = val;
-
-      assert(r * dst_stride + c + dst_offset < MAX_SB_SQUARE);
-    }
-  }
-}
-
-void get_pred_by_horz_neighbor(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize,
-                               int mi_row, int mi_col,
-                               uint8_t *dst_buf[MAX_MB_PLANE],
-                               int dst_stride[MAX_MB_PLANE]) {
-  const TileInfo *const tile = &xd->tile;
-  const int mb_to_bottom_edge_base = xd->mb_to_bottom_edge;
-  const int mb_to_top_edge_base = xd->mb_to_top_edge;
-  const int mb_to_left_edge_base = xd->mb_to_left_edge;
-  const int mb_to_right_edge_base = xd->mb_to_right_edge;
-  int overlappable_offset = -1;
-  const int mi_nums = AOMMIN(mi_size_high[bsize], cm->mi_rows - mi_row);
-
-  int i, j, mi_step, ref;
-
-  xd->mb_to_right_edge += mi_size_wide[bsize] * MI_SIZE * 4;
-
-  // build from left neighbors
-  for (i = 0; i < mi_nums; i += mi_step) {
-    int mi_row_offset = i;
-    int mi_col_offset = -1;
-    int mi_x, mi_y, bw, bh;
-    MODE_INFO *left_mi;
-    MB_MODE_INFO *left_mbmi, backup_mbmi;
-    BLOCK_SIZE l_bsize;
-
-    // create the original prediction if offset exceeds the boundary
-    if (mi_col == 0 || (mi_col - 1 < tile->mi_col_start)) mi_col_offset = 0;
-
-    left_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    left_mbmi = &left_mi->mbmi;
-    l_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8);
-
-    mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]);
-
-    // reset the mi if it is not overlappble
-    if (!is_neighbor_overlappable(left_mbmi)) {
-      // use left_mbmi->sb_type instead of l_bsize to handle
-      // sub8x8 cases
-      int search_mi_step = mi_size_high[left_mbmi->sb_type];
-      while (!is_neighbor_overlappable(left_mbmi)) {
-        mi_row_offset += search_mi_step;
-        if (mi_row_offset < mi_nums) {
-          left_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-          left_mbmi = &left_mi->mbmi;
-          search_mi_step = mi_size_high[left_mbmi->sb_type];
-        } else {
-          if (overlappable_offset >= 0) {
-            mi_row_offset = overlappable_offset;
-          } else {
-            mi_row_offset = 0;
-            mi_col_offset = 0;
-          }
-          left_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-          left_mbmi = &left_mi->mbmi;
-          break;
-        }
-      }
-    } else {
-      // update the available overlappable mi
-      overlappable_offset = mi_row_offset;
-    }
-
-    backup_mbmi = *left_mbmi;
-    modify_neighbor_predictor_for_obmc(left_mbmi);
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst, l_bsize, dst_buf[j], MAX_SB_SIZE, MAX_SB_SIZE,
-                       dst_stride[j], i, 0, NULL, pd->subsampling_x,
-                       pd->subsampling_y);
-    }
-#if CONFIG_COMPOUND_SINGLEREF
-    for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(left_mbmi->mode));
-         ++ref) {
-      const MV_REFERENCE_FRAME frame = has_second_ref(left_mbmi)
-                                           ? left_mbmi->ref_frame[ref]
-                                           : left_mbmi->ref_frame[0];
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-    for (ref = 0; ref < 1 + has_second_ref(left_mbmi); ++ref) {
-      const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-      xd->block_refs[ref] = ref_buf;
-      if ((!av1_is_valid_scale(&ref_buf->sf)))
-        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i, mi_col,
-                           &ref_buf->sf);
-    }
-    xd->mb_to_top_edge = -((mi_row + i) * MI_SIZE * 8);
-    xd->mb_to_bottom_edge =
-        mb_to_bottom_edge_base + (mi_nums - i - mi_step) * MI_SIZE * 8;
-    mi_x = mi_col << MI_SIZE_LOG2;
-    mi_y = (mi_row + i) << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      const struct macroblockd_plane *pd = &xd->plane[j];
-      bw = mi_size_wide[bsize] << (MI_SIZE_LOG2 - 1) >> pd->subsampling_x;
-      bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
-
-      build_inter_predictors(cm, xd, j, left_mi, 1, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_SUPERTX
-                             0, 0,
-#endif  // CONFIG_SUPERTX
-                             mi_x, mi_y);
-    }
-    *left_mbmi = backup_mbmi;
-  }
-
-  // build from right neighbors
-  xd->mb_to_right_edge = mb_to_right_edge_base;
-  xd->mb_to_left_edge -= mi_size_wide[bsize] * MI_SIZE * 4;
-
-  overlappable_offset = -1;
-
-  for (i = 0; i < mi_nums; i += mi_step) {
-    int mi_row_offset = i;
-    int mi_col_offset = mi_size_wide[bsize];
-    int mi_x, mi_y, bw, bh;
-    int mi_col_shift = mi_size_wide[bsize] >> 1;
-    MODE_INFO *right_mi;
-    MB_MODE_INFO *right_mbmi, backup_mbmi;
-    BLOCK_SIZE r_bsize;
-
-    // create the original prediction if offset exceeds the boundary
-    if (mi_col + mi_col_offset > xd->sb_mi_bd.mi_col_end) mi_col_offset = 0;
-
-    right_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    right_mbmi = &right_mi->mbmi;
-    r_bsize = AOMMAX(right_mbmi->sb_type, BLOCK_8X8);
-
-    mi_step = AOMMIN(mi_nums, mi_size_high[r_bsize]);
-
-    if (!is_neighbor_overlappable(right_mbmi)) {
-      int search_mi_step = mi_size_high[right_mbmi->sb_type];
-      while (!is_neighbor_overlappable(right_mbmi)) {
-        mi_row_offset += search_mi_step;
-        if (mi_row_offset < mi_nums) {
-          right_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-          right_mbmi = &right_mi->mbmi;
-          search_mi_step = mi_size_high[right_mbmi->sb_type];
-        } else {
-          if (overlappable_offset >= 0) {
-            mi_row_offset = overlappable_offset;
-          } else {
-            mi_row_offset = 0;
-            mi_col_offset = 0;
-          }
-          right_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-          right_mbmi = &right_mi->mbmi;
-          break;
-        }
-      }
-    } else {
-      overlappable_offset = mi_row_offset;
-    }
-
-    backup_mbmi = *right_mbmi;
-    modify_neighbor_predictor_for_obmc(right_mbmi);
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst, r_bsize, dst_buf[j], MAX_SB_SIZE, MAX_SB_SIZE,
-                       dst_stride[j], i, mi_col_shift, NULL, pd->subsampling_x,
-                       pd->subsampling_y);
-    }
-#if CONFIG_COMPOUND_SINGLEREF
-    for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(right_mbmi->mode));
-         ++ref) {
-      const MV_REFERENCE_FRAME frame = has_second_ref(right_mbmi)
-                                           ? right_mbmi->ref_frame[ref]
-                                           : right_mbmi->ref_frame[0];
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-    for (ref = 0; ref < 1 + has_second_ref(right_mbmi); ++ref) {
-      const MV_REFERENCE_FRAME frame = right_mbmi->ref_frame[ref];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-      xd->block_refs[ref] = ref_buf;
-      if ((!av1_is_valid_scale(&ref_buf->sf)))
-        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i,
-                           mi_col + mi_col_shift, &ref_buf->sf);
-    }
-    xd->mb_to_top_edge = -((mi_row + i) * MI_SIZE * 8);
-    xd->mb_to_bottom_edge =
-        mb_to_bottom_edge_base + (mi_nums - i - mi_step) * MI_SIZE * 8;
-    mi_x = (mi_col + mi_col_shift) << MI_SIZE_LOG2;
-    mi_y = (mi_row + i) << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      const struct macroblockd_plane *pd = &xd->plane[j];
-      bw = mi_size_wide[bsize] << (MI_SIZE_LOG2 - 1) >> pd->subsampling_x;
-      bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
-
-      build_inter_predictors(cm, xd, j, right_mi, 1, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_SUPERTX
-                             0, 0,
-#endif  // CONFIG_SUPERTX
-                             mi_x, mi_y);
-    }
-
-    *right_mbmi = backup_mbmi;
-  }
-
-  // restore the boundaries
-  xd->mb_to_top_edge = mb_to_top_edge_base;
-  xd->mb_to_bottom_edge = mb_to_bottom_edge_base;
-  xd->mb_to_left_edge = mb_to_left_edge_base;
-  xd->mb_to_right_edge = mb_to_right_edge_base;
-}
-
-void get_pred_by_vert_neighbor(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize,
-                               int mi_row, int mi_col,
-                               uint8_t *dst_buf[MAX_MB_PLANE],
-                               int dst_stride[MAX_MB_PLANE]) {
-  const TileInfo *const tile = &xd->tile;
-  const int mb_to_bottom_edge_base = xd->mb_to_bottom_edge;
-  const int mb_to_top_edge_base = xd->mb_to_top_edge;
-  const int mb_to_left_edge_base = xd->mb_to_left_edge;
-  const int mb_to_right_edge_base = xd->mb_to_right_edge;
-  int overlappable_offset = -1;
-  const int mi_nums = AOMMIN(mi_size_wide[bsize], cm->mi_cols - mi_col);
-
-  int i, j, mi_step, ref;
-
-  xd->mb_to_bottom_edge += mi_nums * MI_SIZE * 4;
-
-  // build from above neighbors
-  for (i = 0; i < mi_nums; i += mi_step) {
-    int mi_row_offset = -1;
-    int mi_col_offset = i;
-    int mi_x, mi_y, bw, bh;
-    MODE_INFO *above_mi;
-    MB_MODE_INFO *above_mbmi, backup_mbmi;
-    BLOCK_SIZE a_bsize;
-
-    // create the original prediction if offset exceeds the boundary
-    if (mi_row <= tile->mi_row_start) mi_row_offset = 0;
-
-    above_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    above_mbmi = &above_mi->mbmi;
-    a_bsize = AOMMAX(above_mbmi->sb_type, BLOCK_8X8);
-
-    mi_step = AOMMIN(mi_nums, mi_size_high[a_bsize]);
-
-    // reset the mi if it is not overlappble
-    if (!is_neighbor_overlappable(above_mbmi)) {
-      int search_mi_step = mi_size_high[above_mbmi->sb_type];
-      // backward search
-      while (!is_neighbor_overlappable(above_mbmi)) {
-        mi_col_offset += search_mi_step;
-        if (mi_col_offset < mi_nums) {
-          above_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-          above_mbmi = &above_mi->mbmi;
-          search_mi_step = mi_size_high[above_mbmi->sb_type];
-        } else {
-          if (overlappable_offset >= 0) {
-            mi_col_offset = overlappable_offset;
-          } else {
-            mi_row_offset = 0;
-            mi_col_offset = 0;
-          }
-          above_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-          above_mbmi = &above_mi->mbmi;
-          break;
-        }
-      }
-    } else {
-      // update the available overlappable mi
-      overlappable_offset = mi_col_offset;
-    }
-
-    backup_mbmi = *above_mbmi;
-    modify_neighbor_predictor_for_obmc(above_mbmi);
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst, a_bsize, dst_buf[j], MAX_SB_SIZE, MAX_SB_SIZE,
-                       dst_stride[j], 0, i, NULL, pd->subsampling_x,
-                       pd->subsampling_y);
-    }
-#if CONFIG_COMPOUND_SINGLEREF
-    for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(above_mbmi->mode));
-         ++ref) {
-      const MV_REFERENCE_FRAME frame = has_second_ref(above_mbmi)
-                                           ? above_mbmi->ref_frame[ref]
-                                           : above_mbmi->ref_frame[0];
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-    for (ref = 0; ref < 1 + has_second_ref(above_mbmi); ++ref) {
-      const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-      xd->block_refs[ref] = ref_buf;
-      if ((!av1_is_valid_scale(&ref_buf->sf)))
-        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col + i,
-                           &ref_buf->sf);
-    }
-
-    xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8);
-    xd->mb_to_right_edge =
-        mb_to_right_edge_base + (mi_nums - i - mi_step) * MI_SIZE * 8;
-    mi_x = (mi_col + i) << MI_SIZE_LOG2;
-    mi_y = mi_row << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      const struct macroblockd_plane *pd = &xd->plane[j];
-
-      bh = mi_size_high[bsize] << (MI_SIZE_LOG2 - 1) >> pd->subsampling_x;
-      bw = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
-
-      build_inter_predictors(cm, xd, j, above_mi, 1, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_SUPERTX
-                             0, 0,
-#endif  // CONFIG_SUPERTX
-                             mi_x, mi_y);
-    }
-
-    *above_mbmi = backup_mbmi;
-  }
-
-  // build from bottom neighbors
-  xd->mb_to_bottom_edge = mb_to_bottom_edge_base;
-  xd->mb_to_top_edge -= mi_size_high[bsize] * MI_SIZE * 4;
-
-  overlappable_offset = -1;
-
-  for (i = 0; i < mi_nums; i += mi_step) {
-    int mi_row_offset = mi_size_high[bsize];
-    int mi_col_offset = i;
-    int mi_x, mi_y, bw, bh;
-    int mi_row_shift = mi_size_high[bsize] >> 1;
-    MODE_INFO *bottom_mi;
-    MB_MODE_INFO *bottom_mbmi, backup_mbmi;
-    BLOCK_SIZE b_bsize;
-
-    // create the original prediction if offset exceeds the boundary
-    if (mi_row + mi_row_offset > xd->sb_mi_bd.mi_row_end) mi_row_offset = 0;
-
-    bottom_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    bottom_mbmi = &bottom_mi->mbmi;
-    b_bsize = AOMMAX(bottom_mbmi->sb_type, BLOCK_8X8);
-
-    mi_step = AOMMIN(mi_nums, mi_size_high[b_bsize]);
-
-    // reset the mi if it is not overlappble
-    if (!is_neighbor_overlappable(bottom_mbmi)) {
-      int search_mi_step = mi_size_high[bottom_mbmi->sb_type];
-      while (!is_neighbor_overlappable(bottom_mbmi)) {
-        mi_col_offset += search_mi_step;
-        if (mi_col_offset < mi_nums) {
-          bottom_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-          bottom_mbmi = &bottom_mi->mbmi;
-          search_mi_step = mi_size_high[bottom_mbmi->sb_type];
-        } else {
-          if (overlappable_offset >= 0) {
-            mi_col_offset = overlappable_offset;
-          } else {
-            mi_col_offset = 0;
-            mi_row_offset = 0;
-          }
-          bottom_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-          bottom_mbmi = &bottom_mi->mbmi;
-          break;
-        }
-      }
-    } else {
-      // update the available overlappable mi
-      overlappable_offset = mi_col_offset;
-    }
-
-    backup_mbmi = *bottom_mbmi;
-    modify_neighbor_predictor_for_obmc(bottom_mbmi);
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst, b_bsize, dst_buf[j], MAX_SB_SIZE, MAX_SB_SIZE,
-                       dst_stride[j], mi_row_shift, i, NULL, pd->subsampling_x,
-                       pd->subsampling_y);
-    }
-#if CONFIG_COMPOUND_SINGLEREF
-    for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(bottom_mbmi->mode));
-         ++ref) {
-      const MV_REFERENCE_FRAME frame = has_second_ref(bottom_mbmi)
-                                           ? bottom_mbmi->ref_frame[ref]
-                                           : bottom_mbmi->ref_frame[0];
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-    for (ref = 0; ref < 1 + has_second_ref(bottom_mbmi); ++ref) {
-      const MV_REFERENCE_FRAME frame = bottom_mbmi->ref_frame[ref];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-      xd->block_refs[ref] = ref_buf;
-      if ((!av1_is_valid_scale(&ref_buf->sf)))
-        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + mi_row_shift,
-                           mi_col + i, &ref_buf->sf);
-    }
-
-    xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8);
-    xd->mb_to_right_edge =
-        mb_to_right_edge_base + (mi_nums - i - mi_step) * MI_SIZE * 8;
-    mi_x = (mi_col + i) << MI_SIZE_LOG2;
-    mi_y = (mi_row + mi_row_shift) << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      const struct macroblockd_plane *pd = &xd->plane[j];
-
-      bh = mi_size_high[bsize] << (MI_SIZE_LOG2 - 1) >> pd->subsampling_x;
-      bw = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
-
-      build_inter_predictors(cm, xd, j, bottom_mi, 1, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_SUPERTX
-                             0, 0,
-#endif  // CONFIG_SUPERTX
-                             mi_x, mi_y);
-    }
-
-    *bottom_mbmi = backup_mbmi;
-  }
-  // restore the boundaries
-  xd->mb_to_top_edge = mb_to_top_edge_base;
-  xd->mb_to_bottom_edge = mb_to_bottom_edge_base;
-  xd->mb_to_left_edge = mb_to_left_edge_base;
-  xd->mb_to_right_edge = mb_to_right_edge_base;
-}
-
-void get_pred_by_corner_neighbor(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                 int bsize, int mi_row, int mi_col,
-                                 uint8_t *dst_buf[MAX_MB_PLANE],
-                                 int dst_stride[MAX_MB_PLANE]) {
-  const TileInfo *const tile = &xd->tile;
-  const int mb_to_bottom_edge_base = xd->mb_to_bottom_edge;
-  const int mb_to_top_edge_base = xd->mb_to_top_edge;
-  const int mb_to_left_edge_base = xd->mb_to_left_edge;
-  const int mb_to_right_edge_base = xd->mb_to_right_edge;
-  const int mi_wide = mi_size_wide[bsize];
-  const int mi_high = mi_size_high[bsize];
-
-  // location of four mi sources
-  const int mi_row_offsets[4] = { -1, -1, mi_high, mi_high };
-  const int mi_col_offsets[4] = { -1, mi_wide, -1, mi_wide };
-
-  MB_MODE_INFO backup_mbmi;
-  int mi_x, mi_y, bh, bw;
-  int i, j, ref;
-
-  assert(bsize >= BLOCK_8X8);
-
-  for (i = 0; i < 4; ++i) {
-    int mi_row_offset = mi_row_offsets[i];
-    int mi_col_offset = mi_col_offsets[i];
-    MODE_INFO *corner_mi;
-    MB_MODE_INFO *corner_mbmi;
-
-    if (mi_col + mi_col_offset < tile->mi_col_start ||
-        mi_col + mi_col_offset > xd->sb_mi_bd.mi_col_end)
-      mi_col_offset = 0;
-
-    if (mi_row + mi_row_offset < tile->mi_row_start ||
-        mi_row + mi_row_offset > xd->sb_mi_bd.mi_row_end)
-      mi_row_offset = 0;
-
-    corner_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    corner_mbmi = &corner_mi->mbmi;
-
-    // reset the mi if it is not overlappble
-    if (!is_neighbor_overlappable(corner_mbmi)) {
-      mi_row_offset = 0;
-      mi_col_offset = 0;
-      corner_mi = xd->mi[0];
-      corner_mbmi = &corner_mi->mbmi;
-    }
-
-    backup_mbmi = *corner_mbmi;
-    modify_neighbor_predictor_for_obmc(corner_mbmi);
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst, BLOCK_8X8, dst_buf[j], MAX_SB_SIZE,
-                       MAX_SB_SIZE, dst_stride[j], (i / 2) * (mi_high >> 1),
-                       (i % 2) * (mi_wide >> 1), NULL, pd->subsampling_x,
-                       pd->subsampling_y);
-    }
-
-#if CONFIG_COMPOUND_SINGLEREF
-    for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(corner_mbmi->mode));
-         ++ref) {
-      const MV_REFERENCE_FRAME frame = has_second_ref(corner_mbmi)
-                                           ? corner_mbmi->ref_frame[ref]
-                                           : corner_mbmi->ref_frame[0];
-#else
-    for (ref = 0; ref < 1 + has_second_ref(corner_mbmi); ++ref) {
-      const MV_REFERENCE_FRAME frame = corner_mbmi->ref_frame[ref];
-#endif
-      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-      xd->block_refs[ref] = ref_buf;
-
-      if ((!av1_is_valid_scale(&ref_buf->sf)))
-        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      av1_setup_pre_planes(xd, ref, ref_buf->buf,
-                           mi_row + (i / 2) * (mi_high >> 1),
-                           mi_col + (i % 2) * (mi_wide >> 1), &ref_buf->sf);
-    }
-    // adjust mi boundaries of this block
-    xd->mb_to_bottom_edge =
-        mb_to_bottom_edge_base + (1 - (i / 2)) * mi_high * MI_SIZE * 4;
-    xd->mb_to_top_edge = mb_to_top_edge_base - (i / 2) * mi_high * MI_SIZE * 4;
-    xd->mb_to_right_edge =
-        mb_to_right_edge_base + (1 - (i % 2)) * mi_wide * MI_SIZE * 4;
-    xd->mb_to_left_edge =
-        mb_to_left_edge_base - (i % 2) * mi_wide * MI_SIZE * 4;
-
-    mi_x = (mi_col + (i % 2) * mi_wide / 2) << MI_SIZE_LOG2;
-    mi_y = (mi_row + (i / 2) * mi_high / 2) << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      const struct macroblockd_plane *pd = &xd->plane[j];
-      bh = mi_high << MI_SIZE_LOG2 >> (pd->subsampling_x + 1);
-      bw = mi_wide << MI_SIZE_LOG2 >> (pd->subsampling_y + 1);
-      build_inter_predictors(cm, xd, j, corner_mi, 1, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_SUPERTX
-                             0, 0,
-#endif  // CONFIG_SUPERTX
-                             mi_x, mi_y);
-    }
-    *corner_mbmi = backup_mbmi;
-  }
-  // restore the boundaries
-  xd->mb_to_bottom_edge = mb_to_bottom_edge_base;
-  xd->mb_to_top_edge = mb_to_top_edge_base;
-  xd->mb_to_right_edge = mb_to_right_edge_base;
-  xd->mb_to_left_edge = mb_to_left_edge_base;
-}
-
-// get the stitched extra prediction for this block
-void av1_get_ext_blk_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize,
-                           int mi_row, int mi_col,
-                           uint8_t *dst_buf[][MAX_MB_PLANE],
-                           int dst_stride[MAX_MB_PLANE]) {
-  get_pred_by_corner_neighbor(cm, xd, bsize, mi_row, mi_col, dst_buf[0],
-                              dst_stride);
-  get_pred_by_vert_neighbor(cm, xd, bsize, mi_row, mi_col, dst_buf[1],
-                            dst_stride);
-  get_pred_by_horz_neighbor(cm, xd, bsize, mi_row, mi_col, dst_buf[2],
-                            dst_stride);
-}
-
-void av1_get_ori_blk_pred(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize,
-                          int mi_row, int mi_col,
-                          uint8_t *dst_buf[MAX_MB_PLANE],
-                          int dst_stride[MAX_MB_PLANE]) {
-  MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  int mi_x = mi_col << MI_SIZE_LOG2;
-  int mi_y = mi_row << MI_SIZE_LOG2;
-  int bw = block_size_wide[bsize];
-  int bh = block_size_high[bsize];
-  int i, ref;
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    struct macroblockd_plane *const pd = &xd->plane[i];
-    setup_pred_plane(&pd->dst, BLOCK_8X8, dst_buf[i], MAX_SB_SIZE, MAX_SB_SIZE,
-                     dst_stride[i], 0, 0, NULL, pd->subsampling_x,
-                     pd->subsampling_y);
-  }
-
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
-    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
-    const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-    xd->block_refs[ref] = ref_buf;
-
-    if (!av1_is_valid_scale(&ref_buf->sf))
-      aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                         "Reference frame has invalid dimensions");
-
-    av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf);
-  }
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    const struct macroblockd_plane *pd = &xd->plane[i];
-    build_inter_predictors(cm, xd, i, mi, 1, 0, bw >> pd->subsampling_x,
-                           bh >> pd->subsampling_y, 0, 0,
-                           bw >> pd->subsampling_x, bh >> pd->subsampling_y,
-#if CONFIG_SUPERTX
-                           0, 0,
-#endif  // CONFIG_SUPERTX
-                           mi_x, mi_y);
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(
+        bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
+    const int bw = block_size_wide[plane_bsize];
+    const int bh = block_size_high[plane_bsize];
+    build_wedge_inter_predictor_from_buf(
+        xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane],
+        ext_dst1[plane], ext_dst_stride1[plane]);
   }
 }
-
-#endif
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
index 0c3333339..aa3aefc88 100644
--- a/third_party/aom/av1/common/reconinter.h
+++ b/third_party/aom/av1/common/reconinter.h
@@ -15,164 +15,26 @@
 #include "av1/common/filter.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/common/convolve.h"
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
 #include "av1/common/warped_motion.h"
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
 #include "aom/aom_integer.h"
 
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#define WARP_WM_NEIGHBORS_WITH_OBMC 0
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-
-#if CONFIG_MOTION_VAR && CONFIG_GLOBAL_MOTION
-#define WARP_GM_NEIGHBORS_WITH_OBMC 0
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+// Work out how many pixels off the edge of a reference frame we're allowed
+// to go when forming an inter prediction.
+// The outermost row/col of each referernce frame is extended by
+// (AOM_BORDER_IN_PIXELS >> subsampling) pixels, but we need to keep
+// at least AOM_INTERP_EXTEND pixels within that to account for filtering.
+//
+// We have to break this up into two macros to keep both clang-format and
+// tools/lint-hunks.py happy.
+#define AOM_LEFT_TOP_MARGIN_PX(subsampling) \
+  ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
+#define AOM_LEFT_TOP_MARGIN_SCALED(subsampling) \
+  (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-static INLINE int has_scale(int xs, int ys) {
-  return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
-}
-
-static INLINE void inter_predictor(const uint8_t *src, int src_stride,
-                                   uint8_t *dst, int dst_stride, int subpel_x,
-                                   int subpel_y, const struct scale_factors *sf,
-                                   int w, int h, ConvolveParams *conv_params,
-                                   InterpFilters interp_filters, int xs,
-                                   int ys) {
-  assert(conv_params->do_average == 0 || conv_params->do_average == 1);
-  assert(sf);
-  if (has_scale(xs, ys)) {
-    // TODO(afergs, debargha): Use a different scale convolve function
-    // that uses higher precision for subpel_x, subpel_y, xs, ys
-    if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
-#if CONFIG_CONVOLVE_ROUND
-      av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
-                             interp_filters, subpel_x, xs, subpel_y, ys, 1,
-                             conv_params);
-      conv_params->do_post_rounding = 1;
-#else
-      assert(0);
-#endif  // CONFIG_CONVOLVE_ROUND
-    } else {
-      assert(conv_params->round == CONVOLVE_OPT_ROUND);
-      av1_convolve_scale(src, src_stride, dst, dst_stride, w, h, interp_filters,
-                         subpel_x, xs, subpel_y, ys, conv_params);
-    }
-  } else {
-    subpel_x >>= SCALE_EXTRA_BITS;
-    subpel_y >>= SCALE_EXTRA_BITS;
-    xs >>= SCALE_EXTRA_BITS;
-    ys >>= SCALE_EXTRA_BITS;
-    assert(subpel_x < SUBPEL_SHIFTS);
-    assert(subpel_y < SUBPEL_SHIFTS);
-    assert(xs <= SUBPEL_SHIFTS);
-    assert(ys <= SUBPEL_SHIFTS);
-    if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
-#if CONFIG_CONVOLVE_ROUND
-      av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
-                             interp_filters, subpel_x, xs, subpel_y, ys, 0,
-                             conv_params);
-      conv_params->do_post_rounding = 1;
-#else
-      assert(0);
-#endif  // CONFIG_CONVOLVE_ROUND
-    } else {
-      assert(conv_params->round == CONVOLVE_OPT_ROUND);
-
-      InterpFilterParams filter_params_x, filter_params_y;
-      av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
-                                     &filter_params_y);
-
-      if (w <= 2 || h <= 2) {
-        av1_convolve_c(src, src_stride, dst, dst_stride, w, h, interp_filters,
-                       subpel_x, xs, subpel_y, ys, conv_params);
-      } else if (filter_params_x.taps == SUBPEL_TAPS &&
-                 filter_params_y.taps == SUBPEL_TAPS) {
-        const int16_t *kernel_x =
-            av1_get_interp_filter_subpel_kernel(filter_params_x, subpel_x);
-        const int16_t *kernel_y =
-            av1_get_interp_filter_subpel_kernel(filter_params_y, subpel_y);
-        sf->predict[subpel_x != 0][subpel_y != 0][conv_params->do_average](
-            src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h);
-      } else {
-        av1_convolve(src, src_stride, dst, dst_stride, w, h, interp_filters,
-                     subpel_x, xs, subpel_y, ys, conv_params);
-      }
-    }
-  }
-}
-
-#if CONFIG_HIGHBITDEPTH
-static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
-                                          uint8_t *dst, int dst_stride,
-                                          int subpel_x, int subpel_y,
-                                          const struct scale_factors *sf, int w,
-                                          int h, ConvolveParams *conv_params,
-                                          InterpFilters interp_filters, int xs,
-                                          int ys, int bd) {
-  const int avg = conv_params->do_average;
-  assert(avg == 0 || avg == 1);
-
-  if (has_scale(xs, ys)) {
-    if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
-#if CONFIG_CONVOLVE_ROUND
-      av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
-                                    interp_filters, subpel_x, xs, subpel_y, ys,
-                                    1, conv_params, bd);
-      conv_params->do_post_rounding = 1;
-#else
-      assert(0);
-#endif  // CONFIG_CONVOLVE_ROUND
-    } else {
-      av1_highbd_convolve_scale(src, src_stride, dst, dst_stride, w, h,
-                                interp_filters, subpel_x, xs, subpel_y, ys, avg,
-                                bd);
-    }
-  } else {
-    subpel_x >>= SCALE_EXTRA_BITS;
-    subpel_y >>= SCALE_EXTRA_BITS;
-    xs >>= SCALE_EXTRA_BITS;
-    ys >>= SCALE_EXTRA_BITS;
-    assert(subpel_x < SUBPEL_SHIFTS);
-    assert(subpel_y < SUBPEL_SHIFTS);
-    assert(xs <= SUBPEL_SHIFTS);
-    assert(ys <= SUBPEL_SHIFTS);
-    if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
-#if CONFIG_CONVOLVE_ROUND
-      av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
-                                    interp_filters, subpel_x, xs, subpel_y, ys,
-                                    0, conv_params, bd);
-      conv_params->do_post_rounding = 1;
-#else
-      assert(0);
-#endif  // CONFIG_CONVOLVE_ROUND
-    } else {
-      InterpFilterParams filter_params_x, filter_params_y;
-      av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
-                                     &filter_params_y);
-
-      if (filter_params_x.taps == SUBPEL_TAPS &&
-          filter_params_y.taps == SUBPEL_TAPS && w > 2 && h > 2) {
-        const int16_t *kernel_x =
-            av1_get_interp_filter_subpel_kernel(filter_params_x, subpel_x);
-        const int16_t *kernel_y =
-            av1_get_interp_filter_subpel_kernel(filter_params_y, subpel_y);
-        sf->highbd_predict[subpel_x != 0][subpel_y != 0][avg](
-            src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h,
-            bd);
-      } else {
-        av1_highbd_convolve(src, src_stride, dst, dst_stride, w, h,
-                            interp_filters, subpel_x, xs, subpel_y, ys, avg,
-                            bd);
-      }
-    }
-  }
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
 // Set to (1 << 5) if the 32-ary codebooks are used for any bock size
 #define MAX_WEDGE_TYPES (1 << 4)
 
@@ -208,38 +70,108 @@ typedef struct {
   int bits;
   const wedge_code_type *codebook;
   uint8_t *signflip;
-  int smoother;
   wedge_masks_type *masks;
 } wedge_params_type;
 
 extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL];
 
+typedef struct SubpelParams {
+  int xs;
+  int ys;
+  int subpel_x;
+  int subpel_y;
+} SubpelParams;
+
+struct build_prediction_ctxt {
+  const AV1_COMMON *cm;
+  int mi_row;
+  int mi_col;
+  uint8_t **tmp_buf;
+  int *tmp_width;
+  int *tmp_height;
+  int *tmp_stride;
+  int mb_to_far_edge;
+};
+
+static INLINE int has_scale(int xs, int ys) {
+  return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
+}
+
+static INLINE void revert_scale_extra_bits(SubpelParams *sp) {
+  sp->subpel_x >>= SCALE_EXTRA_BITS;
+  sp->subpel_y >>= SCALE_EXTRA_BITS;
+  sp->xs >>= SCALE_EXTRA_BITS;
+  sp->ys >>= SCALE_EXTRA_BITS;
+  assert(sp->subpel_x < SUBPEL_SHIFTS);
+  assert(sp->subpel_y < SUBPEL_SHIFTS);
+  assert(sp->xs <= SUBPEL_SHIFTS);
+  assert(sp->ys <= SUBPEL_SHIFTS);
+}
+
+static INLINE void inter_predictor(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride,
+                                   const SubpelParams *subpel_params,
+                                   const struct scale_factors *sf, int w, int h,
+                                   ConvolveParams *conv_params,
+                                   InterpFilters interp_filters) {
+  assert(conv_params->do_average == 0 || conv_params->do_average == 1);
+  assert(sf);
+  if (has_scale(subpel_params->xs, subpel_params->ys)) {
+    av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+                           interp_filters, subpel_params->subpel_x,
+                           subpel_params->xs, subpel_params->subpel_y,
+                           subpel_params->ys, 1, conv_params, sf);
+  } else {
+    SubpelParams sp = *subpel_params;
+    revert_scale_extra_bits(&sp);
+    av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+                           interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
+                           sp.ys, 0, conv_params, sf);
+  }
+}
+
+static INLINE void highbd_inter_predictor(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
+    int h, ConvolveParams *conv_params, InterpFilters interp_filters, int bd) {
+  assert(conv_params->do_average == 0 || conv_params->do_average == 1);
+  assert(sf);
+  if (has_scale(subpel_params->xs, subpel_params->ys)) {
+    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+                                  interp_filters, subpel_params->subpel_x,
+                                  subpel_params->xs, subpel_params->subpel_y,
+                                  subpel_params->ys, 1, conv_params, sf, bd);
+  } else {
+    SubpelParams sp = *subpel_params;
+    revert_scale_extra_bits(&sp);
+    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+                                  interp_filters, sp.subpel_x, sp.xs,
+                                  sp.subpel_y, sp.ys, 0, conv_params, sf, bd);
+  }
+}
+
+void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi);
+int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
+                               const struct macroblockd_plane *pd, int dir);
+
 static INLINE int is_interinter_compound_used(COMPOUND_TYPE type,
                                               BLOCK_SIZE sb_type) {
-  (void)sb_type;
+  const int comp_allowed = is_comp_ref_allowed(sb_type);
   switch (type) {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    case COMPOUND_AVERAGE: return sb_type >= BLOCK_4X4;
-#else   // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-    case COMPOUND_AVERAGE: return 1;
-#endif  // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-#if CONFIG_WEDGE
-    case COMPOUND_WEDGE: return wedge_params_lookup[sb_type].bits > 0;
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-    case COMPOUND_SEG:
-      return AOMMIN(block_size_wide[sb_type], block_size_high[sb_type]) >= 8;
-#endif  // CONFIG_COMPOUND_SEGMENT
+    case COMPOUND_AVERAGE:
+    case COMPOUND_DIFFWTD: return comp_allowed;
+    case COMPOUND_WEDGE:
+      return comp_allowed && wedge_params_lookup[sb_type].bits > 0;
     default: assert(0); return 0;
   }
 }
 
 static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
   COMPOUND_TYPE comp_type;
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  if (sb_type < BLOCK_4X4) return 0;
-#endif  // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  for (comp_type = 0; comp_type < COMPOUND_TYPES; comp_type++) {
+  int i;
+  if (!is_comp_ref_allowed(sb_type)) return 0;
+  for (i = 0; i < COMPOUND_TYPES; i++) {
+    comp_type = (COMPOUND_TYPE)i;
     if (is_masked_compound_type(comp_type) &&
         is_interinter_compound_used(comp_type, sb_type))
       return 1;
@@ -257,7 +189,6 @@ static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) {
 }
 
 static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) {
-  (void)sb_type;
   return wedge_params_lookup[sb_type].bits > 0;
 }
 
@@ -265,60 +196,22 @@ static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) {
   return wedge_params_lookup[sb_type].bits;
 }
 
-#if CONFIG_COMPOUND_SEGMENT
-void build_compound_seg_mask(uint8_t *mask, SEG_MASK_TYPE mask_type,
-                             const uint8_t *src0, int src0_stride,
-                             const uint8_t *src1, int src1_stride,
-                             BLOCK_SIZE sb_type, int h, int w);
-#if CONFIG_HIGHBITDEPTH
-void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type,
-                                    const uint8_t *src0, int src0_stride,
-                                    const uint8_t *src1, int src1_stride,
-                                    BLOCK_SIZE sb_type, int h, int w, int bd);
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_COMPOUND_SEGMENT
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+                              int dst_stride, const SubpelParams *subpel_params,
+                              const struct scale_factors *sf, int w, int h,
+                              ConvolveParams *conv_params,
+                              InterpFilters interp_filters,
+                              const WarpTypesAllowed *warp_types, int p_col,
+                              int p_row, int plane, int ref,
+                              const MB_MODE_INFO *mi, int build_for_obmc,
+                              const MACROBLOCKD *xd, int can_use_previous);
 
 void av1_make_masked_inter_predictor(
     const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
-    const int subpel_x, const int subpel_y, const struct scale_factors *sf,
-    int w, int h, ConvolveParams *conv_params, InterpFilters interp_filters,
-    int xs, int ys,
-#if CONFIG_SUPERTX
-    int wedge_offset_x, int wedge_offset_y,
-#endif  // CONFIG_SUPERTX
-    int plane,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
+    int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane,
     const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-    MACROBLOCKD *xd);
-
-static INLINE int round_mv_comp_q4(int value) {
-  return (value < 0 ? value - 2 : value + 2) / 4;
-}
-
-static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
-  MV res = {
-    round_mv_comp_q4(
-        mi->bmi[0].as_mv[idx].as_mv.row + mi->bmi[1].as_mv[idx].as_mv.row +
-        mi->bmi[2].as_mv[idx].as_mv.row + mi->bmi[3].as_mv[idx].as_mv.row),
-    round_mv_comp_q4(
-        mi->bmi[0].as_mv[idx].as_mv.col + mi->bmi[1].as_mv[idx].as_mv.col +
-        mi->bmi[2].as_mv[idx].as_mv.col + mi->bmi[3].as_mv[idx].as_mv.col)
-  };
-  return res;
-}
-
-static INLINE int round_mv_comp_q2(int value) {
-  return (value < 0 ? value - 1 : value + 1) / 2;
-}
-
-static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) {
-  MV res = { round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.row +
-                              mi->bmi[block1].as_mv[idx].as_mv.row),
-             round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.col +
-                              mi->bmi[block1].as_mv[idx].as_mv.col) };
-  return res;
-}
+    MACROBLOCKD *xd, int can_use_previous);
 
 // TODO(jkoleszar): yet another mv clamping function :-(
 static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
@@ -331,8 +224,8 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
   const int spel_right = spel_left - SUBPEL_SHIFTS;
   const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
   const int spel_bottom = spel_top - SUBPEL_SHIFTS;
-  MV clamped_mv = { src_mv->row * (1 << (1 - ss_y)),
-                    src_mv->col * (1 << (1 - ss_x)) };
+  MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))),
+                    (int16_t)(src_mv->col * (1 << (1 - ss_x))) };
   assert(ss_x <= 1);
   assert(ss_y <= 1);
 
@@ -344,20 +237,6 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
   return clamped_mv;
 }
 
-static INLINE MV average_split_mvs(const struct macroblockd_plane *pd,
-                                   const MODE_INFO *mi, int ref, int block) {
-  const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0);
-  MV res = { 0, 0 };
-  switch (ss_idx) {
-    case 0: res = mi->bmi[block].as_mv[ref].as_mv; break;
-    case 1: res = mi_mv_pred_q2(mi, ref, block, block + 2); break;
-    case 2: res = mi_mv_pred_q2(mi, ref, block, block + 1); break;
-    case 3: res = mi_mv_pred_q4(mi, ref); break;
-    default: assert(ss_idx <= 3 && ss_idx >= 0);
-  }
-  return res;
-}
-
 void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                     int mi_row, int mi_col, BUFFER_SET *ctx,
                                     BLOCK_SIZE bsize);
@@ -370,48 +249,22 @@ void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                    int mi_row, int mi_col, BUFFER_SET *ctx,
                                    BLOCK_SIZE bsize);
 
-#if CONFIG_SUPERTX
-void av1_build_inter_predictor_sb_sub8x8_extend(const AV1_COMMON *cm,
-                                                MACROBLOCKD *xd, int mi_row_ori,
-                                                int mi_col_ori, int mi_row,
-                                                int mi_col, int plane,
-                                                BLOCK_SIZE bsize, int block);
-
-void av1_build_inter_predictor_sb_extend(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                         int mi_row_ori, int mi_col_ori,
-                                         int mi_row, int mi_col, int plane,
-                                         BLOCK_SIZE bsize);
-struct macroblockd_plane;
-void av1_build_masked_inter_predictor_complex(
-    MACROBLOCKD *xd, uint8_t *dst, int dst_stride, const uint8_t *pre,
-    int pre_stride, int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
-    BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, PARTITION_TYPE partition,
-    int plane);
-#endif  // CONFIG_SUPERTX
-
 void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
                                int dst_stride, const MV *src_mv,
                                const struct scale_factors *sf, int w, int h,
                                ConvolveParams *conv_params,
                                InterpFilters interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
                                const WarpTypesAllowed *warp_types, int p_col,
                                int p_row, int plane, int ref,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
                                enum mv_precision precision, int x, int y,
-                               const MACROBLOCKD *xd);
+                               const MACROBLOCKD *xd, int can_use_previous);
 
-#if CONFIG_HIGHBITDEPTH
 void av1_highbd_build_inter_predictor(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
     const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg,
-    InterpFilters interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-    const WarpTypesAllowed *warp_types, int p_col, int p_row,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-    int plane, enum mv_precision precision, int x, int y,
-    const MACROBLOCKD *xd);
-#endif
+    InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col,
+    int p_row, int plane, enum mv_precision precision, int x, int y,
+    const MACROBLOCKD *xd, int can_use_previous);
 
 static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
                                        const struct scale_factors *sf) {
@@ -427,15 +280,11 @@ static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize,
                                     int stride, int mi_row, int mi_col,
                                     const struct scale_factors *scale,
                                     int subsampling_x, int subsampling_y) {
-#if CONFIG_CHROMA_SUB8X8
   // Offset the buffer pointer
   if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
     mi_row -= 1;
   if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
     mi_col -= 1;
-#else
-  (void)bsize;
-#endif
 
   const int x = (MI_SIZE * mi_col) >> subsampling_x;
   const int y = (MI_SIZE * mi_row) >> subsampling_y;
@@ -447,62 +296,33 @@ static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize,
 }
 
 void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
-                          const YV12_BUFFER_CONFIG *src, int mi_row,
-                          int mi_col);
+                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const int plane_start, const int plane_end);
 
 void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
-                          const struct scale_factors *sf);
+                          const struct scale_factors *sf, const int num_planes);
 
 // Detect if the block have sub-pixel level motion vectors
 // per component.
 #define CHECK_SUBPEL 0
-static INLINE int has_subpel_mv_component(const MODE_INFO *const mi,
+static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi,
                                           const MACROBLOCKD *const xd,
                                           int dir) {
 #if CHECK_SUBPEL
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   int plane;
   int ref = (dir >> 1);
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
 
-  if (bsize >= BLOCK_8X8 || unify_bsize) {
-    if (dir & 0x01) {
-      if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
-    } else {
-      if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
-    }
+  if (dir & 0x01) {
+    if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
   } else {
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      const PARTITION_TYPE bp = BLOCK_8X8 - bsize;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int have_vsplit = bp != PARTITION_HORZ;
-      const int have_hsplit = bp != PARTITION_VERT;
-      const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
-      const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
-
-      int x, y;
-      for (y = 0; y < num_4x4_h; ++y) {
-        for (x = 0; x < num_4x4_w; ++x) {
-          const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
-          if (dir & 0x01) {
-            if (mv.col & SUBPEL_MASK) return 1;
-          } else {
-            if (mv.row & SUBPEL_MASK) return 1;
-          }
-        }
-      }
-    }
+    if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
   }
 
   return 0;
 #else
-  (void)mi;
+  (void)mbmi;
   (void)xd;
   (void)dir;
   return 1;
@@ -516,20 +336,16 @@ static INLINE void set_default_interp_filters(
 }
 
 static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
-  (void)xd;
-#if CONFIG_WARPED_MOTION
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  if (mbmi->skip_mode) return 0;
   if (mbmi->motion_mode == WARPED_CAUSAL) return 0;
-#endif  // CONFIG_WARPED_MOTION
-#if CONFIG_GLOBAL_MOTION
-  if (is_nontrans_global_motion(xd)) return 0;
-#endif  // CONFIG_GLOBAL_MOTION
+  if (is_nontrans_global_motion(xd, xd->mi[0])) return 0;
   return 1;
 }
 
 static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
-  MODE_INFO *const mi = xd->mi[0];
-  const int is_compound = has_second_ref(&mi->mbmi);
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const int is_compound = has_second_ref(mi);
   int ref;
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     int row_col;
@@ -542,17 +358,15 @@ static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
   }
   return 0;
 }
-
-#if CONFIG_MOTION_VAR
-const uint8_t *av1_get_obmc_mask(int length);
-void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                      int mi_row, int mi_col);
-void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col,
-                                     uint8_t *above[MAX_MB_PLANE],
-                                     int above_stride[MAX_MB_PLANE],
-                                     uint8_t *left[MAX_MB_PLANE],
-                                     int left_stride[MAX_MB_PLANE]);
+void av1_setup_build_prediction_by_above_pred(
+    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+    MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
+    const int num_planes);
+void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
+                                             uint8_t left_mi_height,
+                                             MB_MODE_INFO *left_mbmi,
+                                             struct build_prediction_ctxt *ctxt,
+                                             const int num_planes);
 void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          int mi_row, int mi_col,
                                          uint8_t *tmp_buf[MAX_MB_PLANE],
@@ -565,13 +379,18 @@ void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                         int tmp_width[MAX_MB_PLANE],
                                         int tmp_height[MAX_MB_PLANE],
                                         int tmp_stride[MAX_MB_PLANE]);
+void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                     int mi_row, int mi_col,
+                                     uint8_t *above[MAX_MB_PLANE],
+                                     int above_stride[MAX_MB_PLANE],
+                                     uint8_t *left[MAX_MB_PLANE],
+                                     int left_stride[MAX_MB_PLANE]);
+
+const uint8_t *av1_get_obmc_mask(int length);
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                      int mi_row, int mi_col);
 void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                         int mi_row, int mi_col);
-#if CONFIG_NCOBMC
-void av1_build_ncobmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                          int mi_row, int mi_col);
-#endif
-#endif  // CONFIG_MOTION_VAR
 
 #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
 #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
@@ -584,32 +403,24 @@ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index,
   return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
 }
 
-const uint8_t *av1_get_soft_mask(int wedge_index, int wedge_sign,
-                                 BLOCK_SIZE sb_type, int wedge_offset_x,
-                                 int wedge_offset_y);
-
-const uint8_t *av1_get_compound_type_mask_inverse(
-    const INTERINTER_COMPOUND_DATA *const comp_data,
-#if CONFIG_COMPOUND_SEGMENT
-    uint8_t *mask_buffer, int h, int w, int stride,
-#endif
-    BLOCK_SIZE sb_type);
-
 const uint8_t *av1_get_compound_type_mask(
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
-#if CONFIG_INTERINTRA
+
 void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                      uint8_t *ypred, uint8_t *upred,
                                      uint8_t *vpred, int ystride, int ustride,
                                      int vstride, BUFFER_SET *ctx,
                                      BLOCK_SIZE bsize);
+
 void av1_build_interintra_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          uint8_t *ypred, int ystride,
                                          BUFFER_SET *ctx, BLOCK_SIZE bsize);
+
 void av1_build_interintra_predictors_sbc(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          uint8_t *upred, int ustride,
                                          BUFFER_SET *ctx, int plane,
                                          BLOCK_SIZE bsize);
+
 void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                           uint8_t *upred, uint8_t *vpred,
                                           int ustride, int vstride,
@@ -621,57 +432,27 @@ void av1_build_intra_predictors_for_interintra(
 void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                             const uint8_t *inter_pred, int inter_stride,
                             const uint8_t *intra_pred, int intra_stride);
-#endif  // CONFIG_INTERINTRA
+
 // Encoder only
 void av1_build_inter_predictors_for_planes_single_buf(
     MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
-    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3]);
-void av1_build_wedge_inter_predictor_from_buf(
-    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to,
-#if CONFIG_SUPERTX
-    int wedge_offset_x, int wedge_offset_y,
-#endif  // CONFIG_SUPERTX
-    uint8_t *ext_dst0[3], int ext_dst_stride0[3], uint8_t *ext_dst1[3],
-    int ext_dst_stride1[3]);
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-#define ASSIGN_ALIGNED_PTRS(p, a, s) \
-  p[0] = a;                          \
-  p[1] = a + s;                      \
-  p[2] = a + 2 * s;
-
-#define ASSIGN_ALIGNED_PTRS_HBD(p, a, s, l) \
-  p[0] = CONVERT_TO_BYTEPTR(a);             \
-  p[1] = CONVERT_TO_BYTEPTR(a + s * l);     \
-  p[2] = CONVERT_TO_BYTEPTR(a + 2 * s * l);
-
-void alloc_ncobmc_pred_buffer(MACROBLOCKD *const xd);
-void free_ncobmc_pred_buffer(MACROBLOCKD *const xd);
-void set_sb_mi_boundaries(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                          const int mi_row, const int mi_col);
-
-void reset_xd_boundary(MACROBLOCKD *xd, int mi_row, int bh, int mi_col, int bw,
-                       int mi_rows, int mi_cols);
-
-void get_pred_from_intrpl_buf(MACROBLOCKD *xd, int mi_row, int mi_col,
-                              BLOCK_SIZE bsize, int plane);
-
-void build_ncobmc_intrpl_pred(const AV1_COMMON *const cm, MACROBLOCKD *xd,
-                              int plane, int pxl_row, int pxl_col,
-                              BLOCK_SIZE bsize, uint8_t *preds[][MAX_MB_PLANE],
-                              int ps[MAX_MB_PLANE],  // pred buffer strides
-                              int mode);
-
-void av1_get_ext_blk_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize,
-                           int mi_row, int mi_col,
-                           uint8_t *dst_buf[][MAX_MB_PLANE],
-                           int dst_stride[MAX_MB_PLANE]);
-
-void av1_get_ori_blk_pred(const AV1_COMMON *cm, MACROBLOCKD *xd, int bsize,
-                          int mi_row, int mi_col,
-                          uint8_t *dst_buf[MAX_MB_PLANE],
-                          int dst_stride[MAX_MB_PLANE]);
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
+    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+    int can_use_previous);
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int plane_from, int plane_to,
+                                              uint8_t *ext_dst0[3],
+                                              int ext_dst_stride0[3],
+                                              uint8_t *ext_dst1[3],
+                                              int ext_dst_stride1[3]);
+
+void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
+                                int order_idx, int *fwd_offset, int *bck_offset,
+                                int *use_jnt_comp_avg, int is_compound);
+int av1_allow_warp(const MB_MODE_INFO *const mbmi,
+                   const WarpTypesAllowed *const warp_types,
+                   const WarpedMotionParams *const gm_params,
+                   int build_for_obmc, int x_scale, int y_scale,
+                   WarpedMotionParams *final_warp_params);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c
index c6d57b742..21d1f60b2 100644
--- a/third_party/aom/av1/common/reconintra.c
+++ b/third_party/aom/av1/common/reconintra.c
@@ -11,22 +11,18 @@
 
 #include <math.h>
 
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_ports/system_state.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
-#if CONFIG_HIGHBITDEPTH
 #include "aom_dsp/aom_dsp_common.h"
-#endif  // CONFIG_HIGHBITDEPTH
 #include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
 #include "aom_ports/aom_once.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
 #include "av1/common/reconintra.h"
 #include "av1/common/onyxc_int.h"
-#if CONFIG_CFL
 #include "av1/common/cfl.h"
-#endif
 
 enum {
   NEED_LEFT = 1 << 1,
@@ -36,17 +32,9 @@ enum {
   NEED_BOTTOMLEFT = 1 << 5,
 };
 
-#if CONFIG_INTRA_EDGE
 #define INTRA_EDGE_FILT 3
 #define INTRA_EDGE_TAPS 5
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-#define MAX_UPSAMPLE_SZ 12
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-#endif  // CONFIG_INTRA_EDGE
-
-#define INTRA_USES_EXT_TRANSFORMS 1
-#define INTRA_USES_RECT_TRANSFORMS \
-  (CONFIG_RECT_TX && (CONFIG_VAR_TX || CONFIG_EXT_TX))
+#define MAX_UPSAMPLE_SZ 16
 
 static const uint8_t extend_modes[INTRA_MODES] = {
   NEED_ABOVE | NEED_LEFT,                   // DC
@@ -54,515 +42,187 @@ static const uint8_t extend_modes[INTRA_MODES] = {
   NEED_LEFT,                                // H
   NEED_ABOVE | NEED_ABOVERIGHT,             // D45
   NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // D135
-  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // D117
-  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // D153
-  NEED_LEFT | NEED_BOTTOMLEFT,              // D207
-  NEED_ABOVE | NEED_ABOVERIGHT,             // D63
+  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // D113
+  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // D157
+  NEED_LEFT | NEED_BOTTOMLEFT,              // D203
+  NEED_ABOVE | NEED_ABOVERIGHT,             // D67
   NEED_LEFT | NEED_ABOVE,                   // SMOOTH
-#if CONFIG_SMOOTH_HV
   NEED_LEFT | NEED_ABOVE,                   // SMOOTH_V
   NEED_LEFT | NEED_ABOVE,                   // SMOOTH_H
-#endif                                      // CONFIG_SMOOTH_HV
-  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // TM
+  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // PAETH
 };
 
-static const uint16_t orders_128x128[1] = { 0 };
-static const uint16_t orders_128x64[2] = { 0, 1 };
-static const uint16_t orders_64x128[2] = { 0, 1 };
-static const uint16_t orders_64x64[4] = {
-  0, 1, 2, 3,
-};
-static const uint16_t orders_64x32[8] = {
-  0, 2, 1, 3, 4, 6, 5, 7,
+// Tables to store if the top-right reference pixels are available. The flags
+// are represented with bits, packed into 8-bit integers. E.g., for the 32x32
+// blocks in a 128x128 superblock, the index of the "o" block is 10 (in raster
+// order), so its flag is stored at the 3rd bit of the 2nd entry in the table,
+// i.e. (table[10 / 8] >> (10 % 8)) & 1.
+//       . . . .
+//       . . . .
+//       . . o .
+//       . . . .
+static uint8_t has_tr_4x4[128] = {
+  255, 255, 255, 255, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+  127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+  255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+  127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+  255, 255, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+  127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+  255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+  127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
 };
-static const uint16_t orders_32x64[8] = {
-  0, 1, 2, 3, 4, 5, 6, 7,
+static uint8_t has_tr_4x8[64] = {
+  255, 255, 255, 255, 119, 119, 119, 119, 127, 127, 127, 127, 119,
+  119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127,
+  127, 127, 119, 119, 119, 119, 255, 255, 255, 127, 119, 119, 119,
+  119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127,
+  119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119,
 };
-static const uint16_t orders_32x32[16] = {
-  0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15,
+static uint8_t has_tr_8x4[64] = {
+  255, 255, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+  127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+  255, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+  127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
 };
-static const uint16_t orders_32x16[32] = {
-  0,  2,  8,  10, 1,  3,  9,  11, 4,  6,  12, 14, 5,  7,  13, 15,
-  16, 18, 24, 26, 17, 19, 25, 27, 20, 22, 28, 30, 21, 23, 29, 31,
+static uint8_t has_tr_8x8[32] = {
+  255, 255, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85,
+  255, 127, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85,
 };
-static const uint16_t orders_16x32[32] = {
-  0,  1,  2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15,
-  16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31,
+static uint8_t has_tr_8x16[16] = {
+  255, 255, 119, 119, 127, 127, 119, 119,
+  255, 127, 119, 119, 127, 127, 119, 119,
 };
-static const uint16_t orders_16x16[64] = {
-  0,  1,  4,  5,  16, 17, 20, 21, 2,  3,  6,  7,  18, 19, 22, 23,
-  8,  9,  12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31,
-  32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55,
-  40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63,
+static uint8_t has_tr_16x8[16] = {
+  255, 0, 85, 0, 119, 0, 85, 0, 127, 0, 85, 0, 119, 0, 85, 0,
 };
-
-static const uint16_t orders_64x16[16] = {
-  0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
+static uint8_t has_tr_16x16[8] = {
+  255, 85, 119, 85, 127, 85, 119, 85,
 };
-static const uint16_t orders_16x64[16] = {
-  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+static uint8_t has_tr_16x32[4] = { 255, 119, 127, 119 };
+static uint8_t has_tr_32x16[4] = { 15, 5, 7, 5 };
+static uint8_t has_tr_32x32[2] = { 95, 87 };
+static uint8_t has_tr_32x64[1] = { 127 };
+static uint8_t has_tr_64x32[1] = { 19 };
+static uint8_t has_tr_64x64[1] = { 7 };
+static uint8_t has_tr_64x128[1] = { 3 };
+static uint8_t has_tr_128x64[1] = { 1 };
+static uint8_t has_tr_128x128[1] = { 1 };
+static uint8_t has_tr_4x16[32] = {
+  255, 255, 255, 255, 127, 127, 127, 127, 255, 127, 255,
+  127, 127, 127, 127, 127, 255, 255, 255, 127, 127, 127,
+  127, 127, 255, 127, 255, 127, 127, 127, 127, 127,
 };
-static const uint16_t orders_32x8[64] = {
-  0,  4,  16, 20, 1,  5,  17, 21, 2,  6,  18, 22, 3,  7,  19, 23,
-  8,  12, 24, 28, 9,  13, 25, 29, 10, 14, 26, 30, 11, 15, 27, 31,
-  32, 36, 48, 52, 33, 37, 49, 53, 34, 38, 50, 54, 35, 39, 51, 55,
-  40, 44, 56, 60, 41, 45, 57, 61, 42, 46, 58, 62, 43, 47, 59, 63,
+static uint8_t has_tr_16x4[32] = {
+  255, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0,
+  127, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0,
 };
-static const uint16_t orders_8x32[64] = {
-  0,  1,  2,  3,  4,  5,  6,  7,  16, 17, 18, 19, 20, 21, 22, 23,
-  8,  9,  10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55,
-  40, 41, 42, 43, 44, 45, 46, 47, 56, 57, 58, 59, 60, 61, 62, 63,
+static uint8_t has_tr_8x32[8] = {
+  255, 255, 127, 127, 255, 127, 127, 127,
 };
-
-#if CONFIG_EXT_PARTITION
-static const uint16_t orders_16x4[256] = {
-  0,   4,   16,  20,  64,  68,  80,  84,  1,   5,   17,  21,  65,  69,  81,
-  85,  2,   6,   18,  22,  66,  70,  82,  86,  3,   7,   19,  23,  67,  71,
-  83,  87,  8,   12,  24,  28,  72,  76,  88,  92,  9,   13,  25,  29,  73,
-  77,  89,  93,  10,  14,  26,  30,  74,  78,  90,  94,  11,  15,  27,  31,
-  75,  79,  91,  95,  32,  36,  48,  52,  96,  100, 112, 116, 33,  37,  49,
-  53,  97,  101, 113, 117, 34,  38,  50,  54,  98,  102, 114, 118, 35,  39,
-  51,  55,  99,  103, 115, 119, 40,  44,  56,  60,  104, 108, 120, 124, 41,
-  45,  57,  61,  105, 109, 121, 125, 42,  46,  58,  62,  106, 110, 122, 126,
-  43,  47,  59,  63,  107, 111, 123, 127, 128, 132, 144, 148, 192, 196, 208,
-  212, 129, 133, 145, 149, 193, 197, 209, 213, 130, 134, 146, 150, 194, 198,
-  210, 214, 131, 135, 147, 151, 195, 199, 211, 215, 136, 140, 152, 156, 200,
-  204, 216, 220, 137, 141, 153, 157, 201, 205, 217, 221, 138, 142, 154, 158,
-  202, 206, 218, 222, 139, 143, 155, 159, 203, 207, 219, 223, 160, 164, 176,
-  180, 224, 228, 240, 244, 161, 165, 177, 181, 225, 229, 241, 245, 162, 166,
-  178, 182, 226, 230, 242, 246, 163, 167, 179, 183, 227, 231, 243, 247, 168,
-  172, 184, 188, 232, 236, 248, 252, 169, 173, 185, 189, 233, 237, 249, 253,
-  170, 174, 186, 190, 234, 238, 250, 254, 171, 175, 187, 191, 235, 239, 251,
-  255,
+static uint8_t has_tr_32x8[8] = {
+  15, 0, 5, 0, 7, 0, 5, 0,
 };
-static const uint16_t orders_4x16[256] = {
-  0,   1,   2,   3,   4,   5,   6,   7,   16,  17,  18,  19,  20,  21,  22,
-  23,  64,  65,  66,  67,  68,  69,  70,  71,  80,  81,  82,  83,  84,  85,
-  86,  87,  8,   9,   10,  11,  12,  13,  14,  15,  24,  25,  26,  27,  28,
-  29,  30,  31,  72,  73,  74,  75,  76,  77,  78,  79,  88,  89,  90,  91,
-  92,  93,  94,  95,  32,  33,  34,  35,  36,  37,  38,  39,  48,  49,  50,
-  51,  52,  53,  54,  55,  96,  97,  98,  99,  100, 101, 102, 103, 112, 113,
-  114, 115, 116, 117, 118, 119, 40,  41,  42,  43,  44,  45,  46,  47,  56,
-  57,  58,  59,  60,  61,  62,  63,  104, 105, 106, 107, 108, 109, 110, 111,
-  120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
-  135, 144, 145, 146, 147, 148, 149, 150, 151, 192, 193, 194, 195, 196, 197,
-  198, 199, 208, 209, 210, 211, 212, 213, 214, 215, 136, 137, 138, 139, 140,
-  141, 142, 143, 152, 153, 154, 155, 156, 157, 158, 159, 200, 201, 202, 203,
-  204, 205, 206, 207, 216, 217, 218, 219, 220, 221, 222, 223, 160, 161, 162,
-  163, 164, 165, 166, 167, 176, 177, 178, 179, 180, 181, 182, 183, 224, 225,
-  226, 227, 228, 229, 230, 231, 240, 241, 242, 243, 244, 245, 246, 247, 168,
-  169, 170, 171, 172, 173, 174, 175, 184, 185, 186, 187, 188, 189, 190, 191,
-  232, 233, 234, 235, 236, 237, 238, 239, 248, 249, 250, 251, 252, 253, 254,
-  255,
+static uint8_t has_tr_16x64[2] = { 255, 127 };
+static uint8_t has_tr_64x16[2] = { 3, 1 };
+
+static const uint8_t *const has_tr_tables[BLOCK_SIZES_ALL] = {
+  // 4X4
+  has_tr_4x4,
+  // 4X8,       8X4,            8X8
+  has_tr_4x8, has_tr_8x4, has_tr_8x8,
+  // 8X16,      16X8,           16X16
+  has_tr_8x16, has_tr_16x8, has_tr_16x16,
+  // 16X32,     32X16,          32X32
+  has_tr_16x32, has_tr_32x16, has_tr_32x32,
+  // 32X64,     64X32,          64X64
+  has_tr_32x64, has_tr_64x32, has_tr_64x64,
+  // 64x128,    128x64,         128x128
+  has_tr_64x128, has_tr_128x64, has_tr_128x128,
+  // 4x16,      16x4,            8x32
+  has_tr_4x16, has_tr_16x4, has_tr_8x32,
+  // 32x8,      16x64,           64x16
+  has_tr_32x8, has_tr_16x64, has_tr_64x16
 };
-#endif
 
-static const uint16_t orders_32x128[4] = {
-  0, 1, 2, 3,
-};
-static const uint16_t orders_128x32[4] = {
-  0, 1, 2, 3,
-};
-
-#if CONFIG_CB4X4 || CONFIG_EXT_PARTITION
-static const uint16_t orders_16x8[128] = {
-  0,  2,  8,  10, 32,  34,  40,  42,  1,  3,  9,  11, 33,  35,  41,  43,
-  4,  6,  12, 14, 36,  38,  44,  46,  5,  7,  13, 15, 37,  39,  45,  47,
-  16, 18, 24, 26, 48,  50,  56,  58,  17, 19, 25, 27, 49,  51,  57,  59,
-  20, 22, 28, 30, 52,  54,  60,  62,  21, 23, 29, 31, 53,  55,  61,  63,
-  64, 66, 72, 74, 96,  98,  104, 106, 65, 67, 73, 75, 97,  99,  105, 107,
-  68, 70, 76, 78, 100, 102, 108, 110, 69, 71, 77, 79, 101, 103, 109, 111,
-  80, 82, 88, 90, 112, 114, 120, 122, 81, 83, 89, 91, 113, 115, 121, 123,
-  84, 86, 92, 94, 116, 118, 124, 126, 85, 87, 93, 95, 117, 119, 125, 127,
-};
-static const uint16_t orders_8x16[128] = {
-  0,  1,  2,  3,  8,  9,  10, 11, 32,  33,  34,  35,  40,  41,  42,  43,
-  4,  5,  6,  7,  12, 13, 14, 15, 36,  37,  38,  39,  44,  45,  46,  47,
-  16, 17, 18, 19, 24, 25, 26, 27, 48,  49,  50,  51,  56,  57,  58,  59,
-  20, 21, 22, 23, 28, 29, 30, 31, 52,  53,  54,  55,  60,  61,  62,  63,
-  64, 65, 66, 67, 72, 73, 74, 75, 96,  97,  98,  99,  104, 105, 106, 107,
-  68, 69, 70, 71, 76, 77, 78, 79, 100, 101, 102, 103, 108, 109, 110, 111,
-  80, 81, 82, 83, 88, 89, 90, 91, 112, 113, 114, 115, 120, 121, 122, 123,
-  84, 85, 86, 87, 92, 93, 94, 95, 116, 117, 118, 119, 124, 125, 126, 127,
+static uint8_t has_tr_vert_8x8[32] = {
+  255, 255, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0,
+  255, 127, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0,
 };
-static const uint16_t orders_8x8[256] = {
-  0,   1,   4,   5,   16,  17,  20,  21,  64,  65,  68,  69,  80,  81,  84,
-  85,  2,   3,   6,   7,   18,  19,  22,  23,  66,  67,  70,  71,  82,  83,
-  86,  87,  8,   9,   12,  13,  24,  25,  28,  29,  72,  73,  76,  77,  88,
-  89,  92,  93,  10,  11,  14,  15,  26,  27,  30,  31,  74,  75,  78,  79,
-  90,  91,  94,  95,  32,  33,  36,  37,  48,  49,  52,  53,  96,  97,  100,
-  101, 112, 113, 116, 117, 34,  35,  38,  39,  50,  51,  54,  55,  98,  99,
-  102, 103, 114, 115, 118, 119, 40,  41,  44,  45,  56,  57,  60,  61,  104,
-  105, 108, 109, 120, 121, 124, 125, 42,  43,  46,  47,  58,  59,  62,  63,
-  106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148,
-  149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147,
-  150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152,
-  153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143,
-  154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164,
-  165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163,
-  166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168,
-  169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
-  170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254,
-  255,
+static uint8_t has_tr_vert_16x16[8] = {
+  255, 0, 119, 0, 127, 0, 119, 0,
 };
-
-#if CONFIG_CB4X4 && CONFIG_EXT_PARTITION
-static const uint16_t orders_4x8[512] = {
-  0,   1,   2,   3,   8,   9,   10,  11,  32,  33,  34,  35,  40,  41,  42,
-  43,  128, 129, 130, 131, 136, 137, 138, 139, 160, 161, 162, 163, 168, 169,
-  170, 171, 4,   5,   6,   7,   12,  13,  14,  15,  36,  37,  38,  39,  44,
-  45,  46,  47,  132, 133, 134, 135, 140, 141, 142, 143, 164, 165, 166, 167,
-  172, 173, 174, 175, 16,  17,  18,  19,  24,  25,  26,  27,  48,  49,  50,
-  51,  56,  57,  58,  59,  144, 145, 146, 147, 152, 153, 154, 155, 176, 177,
-  178, 179, 184, 185, 186, 187, 20,  21,  22,  23,  28,  29,  30,  31,  52,
-  53,  54,  55,  60,  61,  62,  63,  148, 149, 150, 151, 156, 157, 158, 159,
-  180, 181, 182, 183, 188, 189, 190, 191, 64,  65,  66,  67,  72,  73,  74,
-  75,  96,  97,  98,  99,  104, 105, 106, 107, 192, 193, 194, 195, 200, 201,
-  202, 203, 224, 225, 226, 227, 232, 233, 234, 235, 68,  69,  70,  71,  76,
-  77,  78,  79,  100, 101, 102, 103, 108, 109, 110, 111, 196, 197, 198, 199,
-  204, 205, 206, 207, 228, 229, 230, 231, 236, 237, 238, 239, 80,  81,  82,
-  83,  88,  89,  90,  91,  112, 113, 114, 115, 120, 121, 122, 123, 208, 209,
-  210, 211, 216, 217, 218, 219, 240, 241, 242, 243, 248, 249, 250, 251, 84,
-  85,  86,  87,  92,  93,  94,  95,  116, 117, 118, 119, 124, 125, 126, 127,
-  212, 213, 214, 215, 220, 221, 222, 223, 244, 245, 246, 247, 252, 253, 254,
-  255, 256, 257, 258, 259, 264, 265, 266, 267, 288, 289, 290, 291, 296, 297,
-  298, 299, 384, 385, 386, 387, 392, 393, 394, 395, 416, 417, 418, 419, 424,
-  425, 426, 427, 260, 261, 262, 263, 268, 269, 270, 271, 292, 293, 294, 295,
-  300, 301, 302, 303, 388, 389, 390, 391, 396, 397, 398, 399, 420, 421, 422,
-  423, 428, 429, 430, 431, 272, 273, 274, 275, 280, 281, 282, 283, 304, 305,
-  306, 307, 312, 313, 314, 315, 400, 401, 402, 403, 408, 409, 410, 411, 432,
-  433, 434, 435, 440, 441, 442, 443, 276, 277, 278, 279, 284, 285, 286, 287,
-  308, 309, 310, 311, 316, 317, 318, 319, 404, 405, 406, 407, 412, 413, 414,
-  415, 436, 437, 438, 439, 444, 445, 446, 447, 320, 321, 322, 323, 328, 329,
-  330, 331, 352, 353, 354, 355, 360, 361, 362, 363, 448, 449, 450, 451, 456,
-  457, 458, 459, 480, 481, 482, 483, 488, 489, 490, 491, 324, 325, 326, 327,
-  332, 333, 334, 335, 356, 357, 358, 359, 364, 365, 366, 367, 452, 453, 454,
-  455, 460, 461, 462, 463, 484, 485, 486, 487, 492, 493, 494, 495, 336, 337,
-  338, 339, 344, 345, 346, 347, 368, 369, 370, 371, 376, 377, 378, 379, 464,
-  465, 466, 467, 472, 473, 474, 475, 496, 497, 498, 499, 504, 505, 506, 507,
-  340, 341, 342, 343, 348, 349, 350, 351, 372, 373, 374, 375, 380, 381, 382,
-  383, 468, 469, 470, 471, 476, 477, 478, 479, 500, 501, 502, 503, 508, 509,
-  510, 511,
-};
-
-static const uint16_t orders_8x4[512] = {
-  0,   2,   8,   10,  32,  34,  40,  42,  128, 130, 136, 138, 160, 162, 168,
-  170, 1,   3,   9,   11,  33,  35,  41,  43,  129, 131, 137, 139, 161, 163,
-  169, 171, 4,   6,   12,  14,  36,  38,  44,  46,  132, 134, 140, 142, 164,
-  166, 172, 174, 5,   7,   13,  15,  37,  39,  45,  47,  133, 135, 141, 143,
-  165, 167, 173, 175, 16,  18,  24,  26,  48,  50,  56,  58,  144, 146, 152,
-  154, 176, 178, 184, 186, 17,  19,  25,  27,  49,  51,  57,  59,  145, 147,
-  153, 155, 177, 179, 185, 187, 20,  22,  28,  30,  52,  54,  60,  62,  148,
-  150, 156, 158, 180, 182, 188, 190, 21,  23,  29,  31,  53,  55,  61,  63,
-  149, 151, 157, 159, 181, 183, 189, 191, 64,  66,  72,  74,  96,  98,  104,
-  106, 192, 194, 200, 202, 224, 226, 232, 234, 65,  67,  73,  75,  97,  99,
-  105, 107, 193, 195, 201, 203, 225, 227, 233, 235, 68,  70,  76,  78,  100,
-  102, 108, 110, 196, 198, 204, 206, 228, 230, 236, 238, 69,  71,  77,  79,
-  101, 103, 109, 111, 197, 199, 205, 207, 229, 231, 237, 239, 80,  82,  88,
-  90,  112, 114, 120, 122, 208, 210, 216, 218, 240, 242, 248, 250, 81,  83,
-  89,  91,  113, 115, 121, 123, 209, 211, 217, 219, 241, 243, 249, 251, 84,
-  86,  92,  94,  116, 118, 124, 126, 212, 214, 220, 222, 244, 246, 252, 254,
-  85,  87,  93,  95,  117, 119, 125, 127, 213, 215, 221, 223, 245, 247, 253,
-  255, 256, 258, 264, 266, 288, 290, 296, 298, 384, 386, 392, 394, 416, 418,
-  424, 426, 257, 259, 265, 267, 289, 291, 297, 299, 385, 387, 393, 395, 417,
-  419, 425, 427, 260, 262, 268, 270, 292, 294, 300, 302, 388, 390, 396, 398,
-  420, 422, 428, 430, 261, 263, 269, 271, 293, 295, 301, 303, 389, 391, 397,
-  399, 421, 423, 429, 431, 272, 274, 280, 282, 304, 306, 312, 314, 400, 402,
-  408, 410, 432, 434, 440, 442, 273, 275, 281, 283, 305, 307, 313, 315, 401,
-  403, 409, 411, 433, 435, 441, 443, 276, 278, 284, 286, 308, 310, 316, 318,
-  404, 406, 412, 414, 436, 438, 444, 446, 277, 279, 285, 287, 309, 311, 317,
-  319, 405, 407, 413, 415, 437, 439, 445, 447, 320, 322, 328, 330, 352, 354,
-  360, 362, 448, 450, 456, 458, 480, 482, 488, 490, 321, 323, 329, 331, 353,
-  355, 361, 363, 449, 451, 457, 459, 481, 483, 489, 491, 324, 326, 332, 334,
-  356, 358, 364, 366, 452, 454, 460, 462, 484, 486, 492, 494, 325, 327, 333,
-  335, 357, 359, 365, 367, 453, 455, 461, 463, 485, 487, 493, 495, 336, 338,
-  344, 346, 368, 370, 376, 378, 464, 466, 472, 474, 496, 498, 504, 506, 337,
-  339, 345, 347, 369, 371, 377, 379, 465, 467, 473, 475, 497, 499, 505, 507,
-  340, 342, 348, 350, 372, 374, 380, 382, 468, 470, 476, 478, 500, 502, 508,
-  510, 341, 343, 349, 351, 373, 375, 381, 383, 469, 471, 477, 479, 501, 503,
-  509, 511,
-};
-
-static const uint16_t orders_4x4[1024] = {
-  0,    1,    4,    5,    16,   17,   20,   21,   64,   65,   68,   69,   80,
-  81,   84,   85,   256,  257,  260,  261,  272,  273,  276,  277,  320,  321,
-  324,  325,  336,  337,  340,  341,  2,    3,    6,    7,    18,   19,   22,
-  23,   66,   67,   70,   71,   82,   83,   86,   87,   258,  259,  262,  263,
-  274,  275,  278,  279,  322,  323,  326,  327,  338,  339,  342,  343,  8,
-  9,    12,   13,   24,   25,   28,   29,   72,   73,   76,   77,   88,   89,
-  92,   93,   264,  265,  268,  269,  280,  281,  284,  285,  328,  329,  332,
-  333,  344,  345,  348,  349,  10,   11,   14,   15,   26,   27,   30,   31,
-  74,   75,   78,   79,   90,   91,   94,   95,   266,  267,  270,  271,  282,
-  283,  286,  287,  330,  331,  334,  335,  346,  347,  350,  351,  32,   33,
-  36,   37,   48,   49,   52,   53,   96,   97,   100,  101,  112,  113,  116,
-  117,  288,  289,  292,  293,  304,  305,  308,  309,  352,  353,  356,  357,
-  368,  369,  372,  373,  34,   35,   38,   39,   50,   51,   54,   55,   98,
-  99,   102,  103,  114,  115,  118,  119,  290,  291,  294,  295,  306,  307,
-  310,  311,  354,  355,  358,  359,  370,  371,  374,  375,  40,   41,   44,
-  45,   56,   57,   60,   61,   104,  105,  108,  109,  120,  121,  124,  125,
-  296,  297,  300,  301,  312,  313,  316,  317,  360,  361,  364,  365,  376,
-  377,  380,  381,  42,   43,   46,   47,   58,   59,   62,   63,   106,  107,
-  110,  111,  122,  123,  126,  127,  298,  299,  302,  303,  314,  315,  318,
-  319,  362,  363,  366,  367,  378,  379,  382,  383,  128,  129,  132,  133,
-  144,  145,  148,  149,  192,  193,  196,  197,  208,  209,  212,  213,  384,
-  385,  388,  389,  400,  401,  404,  405,  448,  449,  452,  453,  464,  465,
-  468,  469,  130,  131,  134,  135,  146,  147,  150,  151,  194,  195,  198,
-  199,  210,  211,  214,  215,  386,  387,  390,  391,  402,  403,  406,  407,
-  450,  451,  454,  455,  466,  467,  470,  471,  136,  137,  140,  141,  152,
-  153,  156,  157,  200,  201,  204,  205,  216,  217,  220,  221,  392,  393,
-  396,  397,  408,  409,  412,  413,  456,  457,  460,  461,  472,  473,  476,
-  477,  138,  139,  142,  143,  154,  155,  158,  159,  202,  203,  206,  207,
-  218,  219,  222,  223,  394,  395,  398,  399,  410,  411,  414,  415,  458,
-  459,  462,  463,  474,  475,  478,  479,  160,  161,  164,  165,  176,  177,
-  180,  181,  224,  225,  228,  229,  240,  241,  244,  245,  416,  417,  420,
-  421,  432,  433,  436,  437,  480,  481,  484,  485,  496,  497,  500,  501,
-  162,  163,  166,  167,  178,  179,  182,  183,  226,  227,  230,  231,  242,
-  243,  246,  247,  418,  419,  422,  423,  434,  435,  438,  439,  482,  483,
-  486,  487,  498,  499,  502,  503,  168,  169,  172,  173,  184,  185,  188,
-  189,  232,  233,  236,  237,  248,  249,  252,  253,  424,  425,  428,  429,
-  440,  441,  444,  445,  488,  489,  492,  493,  504,  505,  508,  509,  170,
-  171,  174,  175,  186,  187,  190,  191,  234,  235,  238,  239,  250,  251,
-  254,  255,  426,  427,  430,  431,  442,  443,  446,  447,  490,  491,  494,
-  495,  506,  507,  510,  511,  512,  513,  516,  517,  528,  529,  532,  533,
-  576,  577,  580,  581,  592,  593,  596,  597,  768,  769,  772,  773,  784,
-  785,  788,  789,  832,  833,  836,  837,  848,  849,  852,  853,  514,  515,
-  518,  519,  530,  531,  534,  535,  578,  579,  582,  583,  594,  595,  598,
-  599,  770,  771,  774,  775,  786,  787,  790,  791,  834,  835,  838,  839,
-  850,  851,  854,  855,  520,  521,  524,  525,  536,  537,  540,  541,  584,
-  585,  588,  589,  600,  601,  604,  605,  776,  777,  780,  781,  792,  793,
-  796,  797,  840,  841,  844,  845,  856,  857,  860,  861,  522,  523,  526,
-  527,  538,  539,  542,  543,  586,  587,  590,  591,  602,  603,  606,  607,
-  778,  779,  782,  783,  794,  795,  798,  799,  842,  843,  846,  847,  858,
-  859,  862,  863,  544,  545,  548,  549,  560,  561,  564,  565,  608,  609,
-  612,  613,  624,  625,  628,  629,  800,  801,  804,  805,  816,  817,  820,
-  821,  864,  865,  868,  869,  880,  881,  884,  885,  546,  547,  550,  551,
-  562,  563,  566,  567,  610,  611,  614,  615,  626,  627,  630,  631,  802,
-  803,  806,  807,  818,  819,  822,  823,  866,  867,  870,  871,  882,  883,
-  886,  887,  552,  553,  556,  557,  568,  569,  572,  573,  616,  617,  620,
-  621,  632,  633,  636,  637,  808,  809,  812,  813,  824,  825,  828,  829,
-  872,  873,  876,  877,  888,  889,  892,  893,  554,  555,  558,  559,  570,
-  571,  574,  575,  618,  619,  622,  623,  634,  635,  638,  639,  810,  811,
-  814,  815,  826,  827,  830,  831,  874,  875,  878,  879,  890,  891,  894,
-  895,  640,  641,  644,  645,  656,  657,  660,  661,  704,  705,  708,  709,
-  720,  721,  724,  725,  896,  897,  900,  901,  912,  913,  916,  917,  960,
-  961,  964,  965,  976,  977,  980,  981,  642,  643,  646,  647,  658,  659,
-  662,  663,  706,  707,  710,  711,  722,  723,  726,  727,  898,  899,  902,
-  903,  914,  915,  918,  919,  962,  963,  966,  967,  978,  979,  982,  983,
-  648,  649,  652,  653,  664,  665,  668,  669,  712,  713,  716,  717,  728,
-  729,  732,  733,  904,  905,  908,  909,  920,  921,  924,  925,  968,  969,
-  972,  973,  984,  985,  988,  989,  650,  651,  654,  655,  666,  667,  670,
-  671,  714,  715,  718,  719,  730,  731,  734,  735,  906,  907,  910,  911,
-  922,  923,  926,  927,  970,  971,  974,  975,  986,  987,  990,  991,  672,
-  673,  676,  677,  688,  689,  692,  693,  736,  737,  740,  741,  752,  753,
-  756,  757,  928,  929,  932,  933,  944,  945,  948,  949,  992,  993,  996,
-  997,  1008, 1009, 1012, 1013, 674,  675,  678,  679,  690,  691,  694,  695,
-  738,  739,  742,  743,  754,  755,  758,  759,  930,  931,  934,  935,  946,
-  947,  950,  951,  994,  995,  998,  999,  1010, 1011, 1014, 1015, 680,  681,
-  684,  685,  696,  697,  700,  701,  744,  745,  748,  749,  760,  761,  764,
-  765,  936,  937,  940,  941,  952,  953,  956,  957,  1000, 1001, 1004, 1005,
-  1016, 1017, 1020, 1021, 682,  683,  686,  687,  698,  699,  702,  703,  746,
-  747,  750,  751,  762,  763,  766,  767,  938,  939,  942,  943,  954,  955,
-  958,  959,  1002, 1003, 1006, 1007, 1018, 1019, 1022, 1023,
-};
-#endif
-#endif  // CONFIG_CB4X4 || CONFIG_EXT_PARTITION
-
-#if CONFIG_EXT_PARTITION
-/* clang-format off */
-static const uint16_t *const orders[BLOCK_SIZES_ALL] = {
-#if CONFIG_CB4X4
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  // 2X2,         2X4,            4X2
-  orders_4x4,     orders_4x4,     orders_4x4,
-#endif
-  //                              4X4
-                                  orders_4x4,
-  // 4X8,         8X4,            8X8
-  orders_4x8,     orders_8x4,     orders_8x8,
-#else  // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  //                              4X4
-                                  orders_8x8,
-  // 4X8,         8X4,            8X8
-  orders_8x8,     orders_8x8,     orders_8x8,
-#endif
-  // 8X16,        16X8,           16X16
-  orders_8x16,    orders_16x8,    orders_16x16,
-  // 16X32,       32X16,          32X32
-  orders_16x32,   orders_32x16,   orders_32x32,
-  // 32X64,       64X32,          64X64
-  orders_32x64,   orders_64x32,   orders_64x64,
-  // 64x128,      128x64,         128x128
-  orders_64x128,  orders_128x64,  orders_128x128,
-  // 4x16,        16x4,           8x32
-  orders_4x16,    orders_16x4,    orders_8x32,
-  // 32x8,        16x64,          64x16
-  orders_32x8,    orders_16x64,   orders_64x16,
-  // 32x128,      128x32
-  orders_32x128,  orders_128x32
-};
-/* clang-format on */
-#else
-/* clang-format off */
-static const uint16_t *const orders[BLOCK_SIZES_ALL] = {
-#if CONFIG_CB4X4
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  // 2X2,         2X4,            4X2
-  orders_8x8,     orders_8x8,     orders_8x8,
-#endif
-  //                              4X4
-                                  orders_8x8,
-  // 4X8,         8X4,            8X8
-  orders_8x16,    orders_16x8,    orders_16x16,
-#else  // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  //                              4X4
-                                  orders_16x16,
-  // 4X8,         8X4,            8X8
-  orders_16x16,   orders_16x16,   orders_16x16,
-#endif
-  // 8X16,        16X8,           16X16
-  orders_16x32,   orders_32x16,   orders_32x32,
-  // 16X32,       32X16,          32X32
-  orders_32x64,   orders_64x32,   orders_64x64,
-  // 32X64,       64X32,          64X64
-  orders_64x128,  orders_128x64,  orders_128x128,
-  // 4x16,        16x4,           8x32
-  orders_8x32,    orders_32x8,    orders_16x64,
-  // 32x8,        16x64,          64x16
-  orders_64x16,   orders_32x128,  orders_128x32
+static uint8_t has_tr_vert_32x32[2] = { 15, 7 };
+static uint8_t has_tr_vert_64x64[1] = { 3 };
+
+// The _vert_* tables are like the ordinary tables above, but describe the
+// order we visit square blocks when doing a PARTITION_VERT_A or
+// PARTITION_VERT_B. This is the same order as normal except for on the last
+// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block
+// as a pair of squares, which means that these tables work correctly for both
+// mixed vertical partition types.
+//
+// There are tables for each of the square sizes. Vertical rectangles (like
+// BLOCK_16X32) use their respective "non-vert" table
+static const uint8_t *const has_tr_vert_tables[BLOCK_SIZES] = {
+  // 4X4
+  NULL,
+  // 4X8,      8X4,         8X8
+  has_tr_4x8, NULL, has_tr_vert_8x8,
+  // 8X16,     16X8,        16X16
+  has_tr_8x16, NULL, has_tr_vert_16x16,
+  // 16X32,    32X16,       32X32
+  has_tr_16x32, NULL, has_tr_vert_32x32,
+  // 32X64,    64X32,       64X64
+  has_tr_32x64, NULL, has_tr_vert_64x64,
+  // 64x128,   128x64,      128x128
+  has_tr_64x128, NULL, has_tr_128x128
 };
-/* clang-format on */
-#endif  // CONFIG_EXT_PARTITION
 
-#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB
-static const uint16_t orders_verta_64x64[4] = {
-  0, 2, 1, 2,
-};
-static const uint16_t orders_verta_32x32[16] = {
-  0, 2, 4, 6, 1, 2, 5, 6, 8, 10, 12, 14, 9, 10, 13, 14,
-};
-static const uint16_t orders_verta_16x16[64] = {
-  0,  2,  4,  6,  16, 18, 20, 22, 1,  2,  5,  6,  17, 18, 21, 22,
-  8,  10, 12, 14, 24, 26, 28, 30, 9,  10, 13, 14, 25, 26, 29, 30,
-  32, 34, 36, 38, 48, 50, 52, 54, 33, 34, 37, 38, 49, 50, 53, 54,
-  40, 42, 44, 46, 56, 58, 60, 62, 41, 42, 45, 46, 57, 58, 61, 62,
-};
-#if CONFIG_EXT_PARTITION || CONFIG_CB4X4
-static const uint16_t orders_verta_8x8[256] = {
-  0,   2,   4,   6,   16,  18,  20,  22,  64,  66,  68,  70,  80,  82,  84,
-  86,  1,   2,   5,   6,   17,  18,  21,  22,  65,  66,  69,  70,  81,  82,
-  85,  86,  8,   10,  12,  14,  24,  26,  28,  30,  72,  74,  76,  78,  88,
-  90,  92,  94,  9,   10,  13,  14,  25,  26,  29,  30,  73,  74,  77,  78,
-  89,  90,  93,  94,  32,  34,  36,  38,  48,  50,  52,  54,  96,  98,  100,
-  102, 112, 114, 116, 118, 33,  34,  37,  38,  49,  50,  53,  54,  97,  98,
-  101, 102, 113, 114, 117, 118, 40,  42,  44,  46,  56,  58,  60,  62,  104,
-  106, 108, 110, 120, 122, 124, 126, 41,  42,  45,  46,  57,  58,  61,  62,
-  105, 106, 109, 110, 121, 122, 125, 126, 128, 130, 132, 134, 144, 146, 148,
-  150, 192, 194, 196, 198, 208, 210, 212, 214, 129, 130, 133, 134, 145, 146,
-  149, 150, 193, 194, 197, 198, 209, 210, 213, 214, 136, 138, 140, 142, 152,
-  154, 156, 158, 200, 202, 204, 206, 216, 218, 220, 222, 137, 138, 141, 142,
-  153, 154, 157, 158, 201, 202, 205, 206, 217, 218, 221, 222, 160, 162, 164,
-  166, 176, 178, 180, 182, 224, 226, 228, 230, 240, 242, 244, 246, 161, 162,
-  165, 166, 177, 178, 181, 182, 225, 226, 229, 230, 241, 242, 245, 246, 168,
-  170, 172, 174, 184, 186, 188, 190, 232, 234, 236, 238, 248, 250, 252, 254,
-  169, 170, 173, 174, 185, 186, 189, 190, 233, 234, 237, 238, 249, 250, 253,
-  254,
-};
-#endif  // CONFIG_EXT_PARTITION || CONFIG_CB4X4
-
-#if CONFIG_EXT_PARTITION
-/* clang-format off */
-static const uint16_t *const orders_verta[BLOCK_SIZES] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  // 2X2,           2X4,              4X2
-  orders_4x4,       orders_4x4,       orders_4x4,
-#endif
-  //                                  4X4
-                                      orders_verta_8x8,
-  // 4X8,           8X4,              8X8
-  orders_verta_8x8, orders_verta_8x8, orders_verta_8x8,
-  // 8X16,          16X8,             16X16
-  orders_8x16,      orders_16x8,      orders_verta_16x16,
-  // 16X32,         32X16,            32X32
-  orders_16x32,     orders_32x16,     orders_verta_32x32,
-  // 32X64,         64X32,            64X64
-  orders_32x64,     orders_64x32,     orders_verta_64x64,
-  // 64x128,        128x64,           128x128
-  orders_64x128,    orders_128x64,    orders_128x128,
-  // Note: We can't get 4:1 shaped blocks from a VERT_A type partition
-};
-/* clang-format on */
-#else
-/* clang-format off */
-static const uint16_t *const orders_verta[BLOCK_SIZES] = {
-#if CONFIG_CB4X4
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  // 2X2,             2X4,                4X2
-  orders_verta_8x8,   orders_verta_8x8,   orders_verta_8x8,
-#endif
-  //                                      4X4
-                                          orders_verta_8x8,
-  // 4X8,             8X4,                8X8
-  orders_verta_8x8,   orders_verta_8x8,   orders_verta_16x16,
-#else  // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  //                                      4X4
-                                          orders_verta_16x16,
-  // 4X8,             8X4,                8X8
-  orders_verta_16x16, orders_verta_16x16, orders_verta_16x16,
-#endif
-  // 8X16,            16X8,               16X16
-  orders_16x32,       orders_32x16,       orders_verta_32x32,
-  // 16X32,           32X16,              32X32
-  orders_32x64,       orders_64x32,       orders_verta_64x64,
-  // 32X64,           64X32,              64X64
-  orders_64x128,      orders_128x64,      orders_128x128,
-  // Note: We can't get 4:1 shaped blocks from a VERT_A type partition
-};
-/* clang-format on */
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
+static const uint8_t *get_has_tr_table(PARTITION_TYPE partition,
+                                       BLOCK_SIZE bsize) {
+  const uint8_t *ret = NULL;
+  // If this is a mixed vertical partition, look up bsize in orders_vert.
+  if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
+    assert(bsize < BLOCK_SIZES);
+    ret = has_tr_vert_tables[bsize];
+  } else {
+    ret = has_tr_tables[bsize];
+  }
+  assert(ret);
+  return ret;
+}
 
 static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
                          int mi_col, int top_available, int right_available,
-#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB
-                         PARTITION_TYPE partition,
-#endif  // CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB
-                         TX_SIZE txsz, int row_off, int col_off, int ss_x) {
+                         PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
+                         int col_off, int ss_x, int ss_y) {
   if (!top_available || !right_available) return 0;
 
-#if !CONFIG_CB4X4
-  // TODO(bshacklett, huisu): Currently the RD loop traverses 4X8 blocks in
-  // inverted N order while in the bitstream the subblocks are stored in Z
-  // order. This discrepancy makes this function incorrect when considering 4X8
-  // blocks in the RD loop, so we disable the extended right edge for these
-  // blocks. The correct solution is to change the bitstream to store these
-  // blocks in inverted N order, and then update this function appropriately.
-  if (bsize == BLOCK_4X8 && row_off == 1) return 0;
-#endif
-
   const int bw_unit = block_size_wide[bsize] >> tx_size_wide_log2[0];
   const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1);
   const int top_right_count_unit = tx_size_wide_unit[txsz];
 
-#if !CONFIG_CB4X4
-  // Special handling for block sizes 4x8 and 4x4.
-  if (ss_x == 0 && bw_unit < 2 && col_off == 0) return 1;
-#endif
-
   if (row_off > 0) {  // Just need to check if enough pixels on the right.
-#if CONFIG_EXT_PARTITION
-    if (col_off + top_right_count_unit >=
-        (block_size_wide[BLOCK_64X64] >> (tx_size_wide_log2[0] + ss_x)))
-      return 0;
-#endif
+    if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) {
+      // Special case: For 128x128 blocks, the transform unit whose
+      // top-right corner is at the center of the block does in fact have
+      // pixels available at its top-right corner.
+      if (row_off == mi_size_high[BLOCK_64X64] >> ss_y &&
+          col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) {
+        return 1;
+      }
+      const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x;
+      const int col_off_64 = col_off % plane_bw_unit_64;
+      return col_off_64 + top_right_count_unit < plane_bw_unit_64;
+    }
     return col_off + top_right_count_unit < plane_bw_unit;
   } else {
     // All top-right pixels are in the block above, which is already available.
     if (col_off + top_right_count_unit < plane_bw_unit) return 1;
 
-    const int bw_in_mi_log2 = mi_width_log2_lookup[bsize];
-    const int bh_in_mi_log2 = mi_height_log2_lookup[bsize];
-    const int sb_mi_size = mi_size_high[cm->sb_size];
+    const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
+    const int bh_in_mi_log2 = mi_size_high_log2[bsize];
+    const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
     const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
     const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
 
@@ -572,32 +232,175 @@ static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
 
     // Rightmost column of superblock (and not the top row): so top-right pixels
     // fall in the right superblock, which is not available yet.
-    if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) return 0;
+    if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) {
+      return 0;
+    }
 
     // General case (neither top row nor rightmost column): check if the
     // top-right block is coded before the current block.
-    const uint16_t *const order =
-#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB
-        (partition == PARTITION_VERT_A) ? orders_verta[bsize] :
-#endif  // CONFIG_EXT_PARTITION_TYPES
-                                        orders[bsize];
     const int this_blk_index =
         ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
         blk_col_in_sb + 0;
-    const uint16_t this_blk_order = order[this_blk_index];
-    const int tr_blk_index =
-        ((blk_row_in_sb - 1) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
-        blk_col_in_sb + 1;
-    const uint16_t tr_blk_order = order[tr_blk_index];
-    return tr_blk_order < this_blk_order;
+    const int idx1 = this_blk_index / 8;
+    const int idx2 = this_blk_index % 8;
+    const uint8_t *has_tr_table = get_has_tr_table(partition, bsize);
+    return (has_tr_table[idx1] >> idx2) & 1;
+  }
+}
+
+// Similar to the has_tr_* tables, but store if the bottom-left reference
+// pixels are available.
+static uint8_t has_bl_4x4[128] = {
+  84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0,  1,  1,  1,  84, 85, 85,
+  85, 16, 17, 17, 17, 84, 85, 85, 85, 0,  0,  1,  0,  84, 85, 85, 85, 16, 17,
+  17, 17, 84, 85, 85, 85, 0,  1,  1,  1,  84, 85, 85, 85, 16, 17, 17, 17, 84,
+  85, 85, 85, 0,  0,  0,  0,  84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85,
+  0,  1,  1,  1,  84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0,  0,  1,
+  0,  84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0,  1,  1,  1,  84, 85,
+  85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0,  0,  0,  0,
+};
+static uint8_t has_bl_4x8[64] = {
+  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0,
+  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0,
+  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0,
+  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0,
+};
+static uint8_t has_bl_8x4[64] = {
+  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1,
+  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0,
+  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1,
+  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0,
+};
+static uint8_t has_bl_8x8[32] = {
+  84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0,
+  84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0,
+};
+static uint8_t has_bl_8x16[16] = {
+  16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0,
+};
+static uint8_t has_bl_16x8[16] = {
+  254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0,
+};
+static uint8_t has_bl_16x16[8] = {
+  84, 16, 84, 0, 84, 16, 84, 0,
+};
+static uint8_t has_bl_16x32[4] = { 16, 0, 16, 0 };
+static uint8_t has_bl_32x16[4] = { 78, 14, 78, 14 };
+static uint8_t has_bl_32x32[2] = { 4, 4 };
+static uint8_t has_bl_32x64[1] = { 0 };
+static uint8_t has_bl_64x32[1] = { 34 };
+static uint8_t has_bl_64x64[1] = { 0 };
+static uint8_t has_bl_64x128[1] = { 0 };
+static uint8_t has_bl_128x64[1] = { 0 };
+static uint8_t has_bl_128x128[1] = { 0 };
+static uint8_t has_bl_4x16[32] = {
+  0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
+  0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
+};
+static uint8_t has_bl_16x4[32] = {
+  254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0,
+  254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0,
+};
+static uint8_t has_bl_8x32[8] = {
+  0, 1, 0, 0, 0, 1, 0, 0,
+};
+static uint8_t has_bl_32x8[8] = {
+  238, 78, 238, 14, 238, 78, 238, 14,
+};
+static uint8_t has_bl_16x64[2] = { 0, 0 };
+static uint8_t has_bl_64x16[2] = { 42, 42 };
+
+static const uint8_t *const has_bl_tables[BLOCK_SIZES_ALL] = {
+  // 4X4
+  has_bl_4x4,
+  // 4X8,         8X4,         8X8
+  has_bl_4x8, has_bl_8x4, has_bl_8x8,
+  // 8X16,        16X8,        16X16
+  has_bl_8x16, has_bl_16x8, has_bl_16x16,
+  // 16X32,       32X16,       32X32
+  has_bl_16x32, has_bl_32x16, has_bl_32x32,
+  // 32X64,       64X32,       64X64
+  has_bl_32x64, has_bl_64x32, has_bl_64x64,
+  // 64x128,      128x64,      128x128
+  has_bl_64x128, has_bl_128x64, has_bl_128x128,
+  // 4x16,        16x4,        8x32
+  has_bl_4x16, has_bl_16x4, has_bl_8x32,
+  // 32x8,        16x64,       64x16
+  has_bl_32x8, has_bl_16x64, has_bl_64x16
+};
+
+static uint8_t has_bl_vert_8x8[32] = {
+  254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0,
+  254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0,
+};
+static uint8_t has_bl_vert_16x16[8] = {
+  254, 16, 254, 0, 254, 16, 254, 0,
+};
+static uint8_t has_bl_vert_32x32[2] = { 14, 14 };
+static uint8_t has_bl_vert_64x64[1] = { 2 };
+
+// The _vert_* tables are like the ordinary tables above, but describe the
+// order we visit square blocks when doing a PARTITION_VERT_A or
+// PARTITION_VERT_B. This is the same order as normal except for on the last
+// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block
+// as a pair of squares, which means that these tables work correctly for both
+// mixed vertical partition types.
+//
+// There are tables for each of the square sizes. Vertical rectangles (like
+// BLOCK_16X32) use their respective "non-vert" table
+static const uint8_t *const has_bl_vert_tables[BLOCK_SIZES] = {
+  // 4X4
+  NULL,
+  // 4X8,     8X4,         8X8
+  has_bl_4x8, NULL, has_bl_vert_8x8,
+  // 8X16,    16X8,        16X16
+  has_bl_8x16, NULL, has_bl_vert_16x16,
+  // 16X32,   32X16,       32X32
+  has_bl_16x32, NULL, has_bl_vert_32x32,
+  // 32X64,   64X32,       64X64
+  has_bl_32x64, NULL, has_bl_vert_64x64,
+  // 64x128,  128x64,      128x128
+  has_bl_64x128, NULL, has_bl_128x128
+};
+
+static const uint8_t *get_has_bl_table(PARTITION_TYPE partition,
+                                       BLOCK_SIZE bsize) {
+  const uint8_t *ret = NULL;
+  // If this is a mixed vertical partition, look up bsize in orders_vert.
+  if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
+    assert(bsize < BLOCK_SIZES);
+    ret = has_bl_vert_tables[bsize];
+  } else {
+    ret = has_bl_tables[bsize];
   }
+  assert(ret);
+  return ret;
 }
 
 static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
                            int mi_col, int bottom_available, int left_available,
-                           TX_SIZE txsz, int row_off, int col_off, int ss_y) {
+                           PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
+                           int col_off, int ss_x, int ss_y) {
   if (!bottom_available || !left_available) return 0;
 
+  // Special case for 128x* blocks, when col_off is half the block width.
+  // This is needed because 128x* superblocks are divided into 64x* blocks in
+  // raster order
+  if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64] && col_off > 0) {
+    const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x;
+    const int col_off_64 = col_off % plane_bw_unit_64;
+    if (col_off_64 == 0) {
+      // We are at the left edge of top-right or bottom-right 64x* block.
+      const int plane_bh_unit_64 = mi_size_high[BLOCK_64X64] >> ss_y;
+      const int row_off_64 = row_off % plane_bh_unit_64;
+      const int plane_bh_unit =
+          AOMMIN(mi_size_high[bsize] >> ss_y, plane_bh_unit_64);
+      // Check if all bottom-left pixels are in the left 64x* block (which is
+      // already coded).
+      return row_off_64 + tx_size_high_unit[txsz] < plane_bh_unit;
+    }
+  }
+
   if (col_off > 0) {
     // Bottom-left pixels are in the bottom-left block, which is not available.
     return 0;
@@ -606,17 +409,12 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
     const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1);
     const int bottom_left_count_unit = tx_size_high_unit[txsz];
 
-#if !CONFIG_CB4X4
-    // Special handling for block sizes 8x4 and 4x4.
-    if (ss_y == 0 && bh_unit < 2 && row_off == 0) return 1;
-#endif
-
     // All bottom-left pixels are in the left block, which is already available.
     if (row_off + bottom_left_count_unit < plane_bh_unit) return 1;
 
-    const int bw_in_mi_log2 = mi_width_log2_lookup[bsize];
-    const int bh_in_mi_log2 = mi_height_log2_lookup[bsize];
-    const int sb_mi_size = mi_size_high[cm->sb_size];
+    const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
+    const int bh_in_mi_log2 = mi_size_high_log2[bsize];
+    const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
     const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
     const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
 
@@ -629,8 +427,7 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
                                             tx_size_wide_log2[0]) >>
                                     ss_y;
       const int row_off_in_sb = blk_start_row_off + row_off;
-      const int sb_height_unit =
-          sb_mi_size << (MI_SIZE_LOG2 - tx_size_wide_log2[0]) >> ss_y;
+      const int sb_height_unit = sb_mi_size >> ss_y;
       return row_off_in_sb + bottom_left_count_unit < sb_height_unit;
     }
 
@@ -640,16 +437,13 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
 
     // General case (neither leftmost column nor bottom row): check if the
     // bottom-left block is coded before the current block.
-    const uint16_t *const order = orders[bsize];
     const int this_blk_index =
         ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
         blk_col_in_sb + 0;
-    const uint16_t this_blk_order = order[this_blk_index];
-    const int bl_blk_index =
-        ((blk_row_in_sb + 1) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
-        blk_col_in_sb - 1;
-    const uint16_t bl_blk_order = order[bl_blk_index];
-    return bl_blk_order < this_blk_order;
+    const int idx1 = this_blk_index / 8;
+    const int idx2 = this_blk_index % 8;
+    const uint8_t *has_bl_table = get_has_bl_table(partition, bsize);
+    return (has_bl_table[idx1] >> idx2) & 1;
   }
 }
 
@@ -659,20 +453,15 @@ typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
 static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL];
 static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL];
 
-#if CONFIG_HIGHBITDEPTH
 typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd);
 static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL];
 static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL];
-#endif  // CONFIG_HIGHBITDEPTH
 
-static void av1_init_intra_predictors_internal(void) {
-#if CONFIG_EXT_INTRA
+static void init_intra_predictors_internal(void) {
   assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES);
-#endif  // CONFIG_EXT_INTRA
 
-#if CONFIG_TX64X64
 #define INIT_RECTANGULAR(p, type)             \
   p[TX_4X8] = aom_##type##_predictor_4x8;     \
   p[TX_8X4] = aom_##type##_predictor_8x4;     \
@@ -681,132 +470,53 @@ static void av1_init_intra_predictors_internal(void) {
   p[TX_16X32] = aom_##type##_predictor_16x32; \
   p[TX_32X16] = aom_##type##_predictor_32x16; \
   p[TX_32X64] = aom_##type##_predictor_32x64; \
-  p[TX_64X32] = aom_##type##_predictor_64x32;
-#else
-#define INIT_RECTANGULAR(p, type)             \
-  p[TX_4X8] = aom_##type##_predictor_4x8;     \
-  p[TX_8X4] = aom_##type##_predictor_8x4;     \
-  p[TX_8X16] = aom_##type##_predictor_8x16;   \
-  p[TX_16X8] = aom_##type##_predictor_16x8;   \
-  p[TX_16X32] = aom_##type##_predictor_16x32; \
-  p[TX_32X16] = aom_##type##_predictor_32x16;
-#endif  // CONFIG_TX64X64
+  p[TX_64X32] = aom_##type##_predictor_64x32; \
+  p[TX_4X16] = aom_##type##_predictor_4x16;   \
+  p[TX_16X4] = aom_##type##_predictor_16x4;   \
+  p[TX_8X32] = aom_##type##_predictor_8x32;   \
+  p[TX_32X8] = aom_##type##_predictor_32x8;   \
+  p[TX_16X64] = aom_##type##_predictor_16x64; \
+  p[TX_64X16] = aom_##type##_predictor_64x16;
 
-#if CONFIG_TX64X64
 #define INIT_NO_4X4(p, type)                  \
   p[TX_8X8] = aom_##type##_predictor_8x8;     \
   p[TX_16X16] = aom_##type##_predictor_16x16; \
   p[TX_32X32] = aom_##type##_predictor_32x32; \
   p[TX_64X64] = aom_##type##_predictor_64x64; \
   INIT_RECTANGULAR(p, type)
-#else
-#define INIT_NO_4X4(p, type)                  \
-  p[TX_8X8] = aom_##type##_predictor_8x8;     \
-  p[TX_16X16] = aom_##type##_predictor_16x16; \
-  p[TX_32X32] = aom_##type##_predictor_32x32; \
-  INIT_RECTANGULAR(p, type)
-#endif  // CONFIG_TX64X64
 
-#if CONFIG_CHROMA_2X2
 #define INIT_ALL_SIZES(p, type)           \
-  p[TX_2X2] = aom_##type##_predictor_2x2; \
   p[TX_4X4] = aom_##type##_predictor_4x4; \
   INIT_NO_4X4(p, type)
-#else
-#define INIT_ALL_SIZES(p, type)           \
-  p[TX_4X4] = aom_##type##_predictor_4x4; \
-  INIT_NO_4X4(p, type)
-#endif
 
   INIT_ALL_SIZES(pred[V_PRED], v);
   INIT_ALL_SIZES(pred[H_PRED], h);
-  INIT_ALL_SIZES(pred[D207_PRED], d207e);
-  INIT_ALL_SIZES(pred[D45_PRED], d45e);
-  INIT_ALL_SIZES(pred[D63_PRED], d63e);
-  INIT_ALL_SIZES(pred[D117_PRED], d117);
-  INIT_ALL_SIZES(pred[D135_PRED], d135);
-  INIT_ALL_SIZES(pred[D153_PRED], d153);
-
-  INIT_ALL_SIZES(pred[TM_PRED], paeth);
+  INIT_ALL_SIZES(pred[PAETH_PRED], paeth);
   INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth);
-#if CONFIG_SMOOTH_HV
   INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v);
   INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h);
-#endif  // CONFIG_SMOOTH_HV
-
   INIT_ALL_SIZES(dc_pred[0][0], dc_128);
   INIT_ALL_SIZES(dc_pred[0][1], dc_top);
   INIT_ALL_SIZES(dc_pred[1][0], dc_left);
   INIT_ALL_SIZES(dc_pred[1][1], dc);
 
-#if CONFIG_HIGHBITDEPTH
   INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
   INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
-  INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207e);
-  INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45e);
-  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63e);
-  INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117);
-  INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135);
-  INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153);
-
-  INIT_ALL_SIZES(pred_high[TM_PRED], highbd_paeth);
+  INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth);
   INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth);
-#if CONFIG_SMOOTH_HV
   INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v);
   INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h);
-#endif  // CONFIG_SMOOTH_HV
-
   INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128);
   INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
   INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left);
   INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc);
-#endif  // CONFIG_HIGHBITDEPTH
-
 #undef intra_pred_allsizes
 }
 
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-static int intra_subpel_interp(int base, int shift, const uint8_t *ref,
-                               int ref_start_idx, int ref_end_idx,
-                               INTRA_FILTER filter_type) {
-  int val, k, idx, filter_idx = 0;
-  const int16_t *filter = NULL;
-
-  if (filter_type == INTRA_FILTER_LINEAR) {
-    val = ref[base] * (256 - shift) + ref[base + 1] * shift;
-    val = ROUND_POWER_OF_TWO(val, 8);
-  } else {
-    filter_idx = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
-    filter = av1_intra_filter_kernels[filter_type][filter_idx];
-
-    if (filter_idx < (1 << SUBPEL_BITS)) {
-      val = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) {
-        idx = base + 1 - (SUBPEL_TAPS / 2) + k;
-        idx = AOMMAX(AOMMIN(idx, ref_end_idx), ref_start_idx);
-        val += ref[idx] * filter[k];
-      }
-      val = ROUND_POWER_OF_TWO(val, FILTER_BITS);
-    } else {
-      val = ref[base + 1];
-    }
-  }
-
-  return val;
-}
-#endif  // CONFIG_INTRA_INTERP
-
 // Directional prediction, zone 1: 0 < angle < 90
-static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
-                             const uint8_t *above, const uint8_t *left,
-#if CONFIG_INTRA_INTERP
-                             INTRA_FILTER filter_type,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                             int upsample_above,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                             int dx, int dy) {
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                            const uint8_t *above, const uint8_t *left,
+                            int upsample_above, int dx, int dy) {
   int r, c, x, base, shift, val;
 
   (void)left;
@@ -814,16 +524,13 @@ static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   assert(dy == 1);
   assert(dx > 0);
 
-#if !CONFIG_INTRA_EDGE_UPSAMPLE
-  const int upsample_above = 0;
-#endif  // !CONFIG_INTRA_EDGE_UPSAMPLE
   const int max_base_x = ((bw + bh) - 1) << upsample_above;
-  const int frac_bits = 8 - upsample_above;
+  const int frac_bits = 6 - upsample_above;
   const int base_inc = 1 << upsample_above;
   x = dx;
   for (r = 0; r < bh; ++r, dst += stride, x += dx) {
     base = x >> frac_bits;
-    shift = (x << upsample_above) & 0xFF;
+    shift = ((x << upsample_above) & 0x3F) >> 1;
 
     if (base >= max_base_x) {
       for (int i = r; i < bh; ++i) {
@@ -835,14 +542,8 @@ static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
 
     for (c = 0; c < bw; ++c, base += base_inc) {
       if (base < max_base_x) {
-#if CONFIG_INTRA_INTERP
-        val = intra_subpel_interp(base, shift, above, 0, bw + bh - 1,
-                                  filter_type);
-#else   // CONFIG_INTRA_INTERP
-        val = above[base] * (256 - shift) + above[base + 1] * shift;
-        val = ROUND_POWER_OF_TWO(val, 8);
-#endif  // CONFIG_INTRA_INTERP
-        dst[c] = clip_pixel(val);
+        val = above[base] * (32 - shift) + above[base + 1] * shift;
+        dst[c] = ROUND_POWER_OF_TWO(val, 5);
       } else {
         dst[c] = above[max_base_x];
       }
@@ -851,68 +552,44 @@ static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
 }
 
 // Directional prediction, zone 2: 90 < angle < 180
-static void dr_prediction_z2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
-                             const uint8_t *above, const uint8_t *left,
-#if CONFIG_INTRA_INTERP
-                             INTRA_FILTER filter_type,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                             int upsample_above, int upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                             int dx, int dy) {
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                            const uint8_t *above, const uint8_t *left,
+                            int upsample_above, int upsample_left, int dx,
+                            int dy) {
   int r, c, x, y, shift1, shift2, val, base1, base2;
 
   assert(dx > 0);
   assert(dy > 0);
 
-#if !CONFIG_INTRA_EDGE_UPSAMPLE
-  const int upsample_above = 0;
-  const int upsample_left = 0;
-#endif  // !CONFIG_INTRA_EDGE_UPSAMPLE
   const int min_base_x = -(1 << upsample_above);
-  const int frac_bits_x = 8 - upsample_above;
-  const int frac_bits_y = 8 - upsample_left;
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
   const int base_inc_x = 1 << upsample_above;
   x = -dx;
   for (r = 0; r < bh; ++r, x -= dx, dst += stride) {
     base1 = x >> frac_bits_x;
-    y = (r << 8) - dy;
+    y = (r << 6) - dy;
     for (c = 0; c < bw; ++c, base1 += base_inc_x, y -= dy) {
       if (base1 >= min_base_x) {
-        shift1 = (x * (1 << upsample_above)) & 0xFF;
-#if CONFIG_INTRA_INTERP
-        val =
-            intra_subpel_interp(base1, shift1, above, -1, bw - 1, filter_type);
-#else
-        val = above[base1] * (256 - shift1) + above[base1 + 1] * shift1;
-        val = ROUND_POWER_OF_TWO(val, 8);
-#endif  // CONFIG_INTRA_INTERP
+        shift1 = ((x * (1 << upsample_above)) & 0x3F) >> 1;
+        val = above[base1] * (32 - shift1) + above[base1 + 1] * shift1;
+        val = ROUND_POWER_OF_TWO(val, 5);
       } else {
         base2 = y >> frac_bits_y;
         assert(base2 >= -(1 << upsample_left));
-        shift2 = (y * (1 << upsample_left)) & 0xFF;
-#if CONFIG_INTRA_INTERP
-        val = intra_subpel_interp(base2, shift2, left, -1, bh - 1, filter_type);
-#else
-        val = left[base2] * (256 - shift2) + left[base2 + 1] * shift2;
-        val = ROUND_POWER_OF_TWO(val, 8);
-#endif  // CONFIG_INTRA_INTERP
+        shift2 = ((y * (1 << upsample_left)) & 0x3F) >> 1;
+        val = left[base2] * (32 - shift2) + left[base2 + 1] * shift2;
+        val = ROUND_POWER_OF_TWO(val, 5);
       }
-      dst[c] = clip_pixel(val);
+      dst[c] = val;
     }
   }
 }
 
 // Directional prediction, zone 3: 180 < angle < 270
-static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
-                             const uint8_t *above, const uint8_t *left,
-#if CONFIG_INTRA_INTERP
-                             INTRA_FILTER filter_type,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                             int upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                             int dx, int dy) {
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                            const uint8_t *above, const uint8_t *left,
+                            int upsample_left, int dx, int dy) {
   int r, c, y, base, shift, val;
 
   (void)above;
@@ -921,27 +598,18 @@ static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   assert(dx == 1);
   assert(dy > 0);
 
-#if !CONFIG_INTRA_EDGE_UPSAMPLE
-  const int upsample_left = 0;
-#endif  // !CONFIG_INTRA_EDGE_UPSAMPLE
   const int max_base_y = (bw + bh - 1) << upsample_left;
-  const int frac_bits = 8 - upsample_left;
+  const int frac_bits = 6 - upsample_left;
   const int base_inc = 1 << upsample_left;
   y = dy;
   for (c = 0; c < bw; ++c, y += dy) {
     base = y >> frac_bits;
-    shift = (y << upsample_left) & 0xFF;
+    shift = ((y << upsample_left) & 0x3F) >> 1;
 
     for (r = 0; r < bh; ++r, base += base_inc) {
       if (base < max_base_y) {
-#if CONFIG_INTRA_INTERP
-        val =
-            intra_subpel_interp(base, shift, left, 0, bw + bh - 1, filter_type);
-#else   // CONFIG_INTRA_INTERP
-        val = left[base] * (256 - shift) + left[base + 1] * shift;
-        val = ROUND_POWER_OF_TWO(val, 8);
-#endif  // CONFIG_INTRA_INTERP
-        dst[r * stride + c] = clip_pixel(val);
+        val = left[base] * (32 - shift) + left[base + 1] * shift;
+        dst[r * stride + c] = val = ROUND_POWER_OF_TWO(val, 5);
       } else {
         for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y];
         break;
@@ -950,78 +618,24 @@ static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   }
 }
 
-// Get the shift (up-scaled by 256) in X w.r.t a unit change in Y.
-// If angle > 0 && angle < 90, dx = -((int)(256 / t));
-// If angle > 90 && angle < 180, dx = (int)(256 / t);
-// If angle > 180 && angle < 270, dx = 1;
-static INLINE int get_dx(int angle) {
-  if (angle > 0 && angle < 90) {
-    return dr_intra_derivative[angle];
-  } else if (angle > 90 && angle < 180) {
-    return dr_intra_derivative[180 - angle];
-  } else {
-    // In this case, we are not really going to use dx. We may return any value.
-    return 1;
-  }
-}
-
-// Get the shift (up-scaled by 256) in Y w.r.t a unit change in X.
-// If angle > 0 && angle < 90, dy = 1;
-// If angle > 90 && angle < 180, dy = (int)(256 * t);
-// If angle > 180 && angle < 270, dy = -((int)(256 * t));
-static INLINE int get_dy(int angle) {
-  if (angle > 90 && angle < 180) {
-    return dr_intra_derivative[angle - 90];
-  } else if (angle > 180 && angle < 270) {
-    return dr_intra_derivative[270 - angle];
-  } else {
-    // In this case, we are not really going to use dy. We may return any value.
-    return 1;
-  }
-}
-
 static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
                          const uint8_t *above, const uint8_t *left,
-#if CONFIG_INTRA_INTERP
-                         INTRA_FILTER filter_type,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                         int upsample_above, int upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                         int angle) {
-  const int dx = get_dx(angle);
-  const int dy = get_dy(angle);
+                         int upsample_above, int upsample_left, int angle) {
+  const int dx = av1_get_dx(angle);
+  const int dy = av1_get_dy(angle);
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
   assert(angle > 0 && angle < 270);
 
   if (angle > 0 && angle < 90) {
-    dr_prediction_z1(dst, stride, bw, bh, above, left,
-#if CONFIG_INTRA_INTERP
-                     filter_type,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                     upsample_above,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                     dx, dy);
+    av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx,
+                         dy);
   } else if (angle > 90 && angle < 180) {
-    dr_prediction_z2(dst, stride, bw, bh, above, left,
-#if CONFIG_INTRA_INTERP
-                     filter_type,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                     upsample_above, upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                     dx, dy);
+    av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above,
+                         upsample_left, dx, dy);
   } else if (angle > 180 && angle < 270) {
-    dr_prediction_z3(dst, stride, bw, bh, above, left,
-#if CONFIG_INTRA_INTERP
-                     filter_type,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                     upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                     dx, dy);
+    av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx,
+                         dy);
   } else if (angle == 90) {
     pred[V_PRED][tx_size](dst, stride, above, left);
   } else if (angle == 180) {
@@ -1029,66 +643,26 @@ static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
-#if CONFIG_INTRA_INTERP
-static int highbd_intra_subpel_interp(int base, int shift, const uint16_t *ref,
-                                      int ref_start_idx, int ref_end_idx,
-                                      INTRA_FILTER filter_type) {
-  int val, k, idx, filter_idx = 0;
-  const int16_t *filter = NULL;
-
-  if (filter_type == INTRA_FILTER_LINEAR) {
-    val = ref[base] * (256 - shift) + ref[base + 1] * shift;
-    val = ROUND_POWER_OF_TWO(val, 8);
-  } else {
-    filter_idx = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
-    filter = av1_intra_filter_kernels[filter_type][filter_idx];
-
-    if (filter_idx < (1 << SUBPEL_BITS)) {
-      val = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) {
-        idx = base + 1 - (SUBPEL_TAPS / 2) + k;
-        idx = AOMMAX(AOMMIN(idx, ref_end_idx), ref_start_idx);
-        val += ref[idx] * filter[k];
-      }
-      val = ROUND_POWER_OF_TWO(val, FILTER_BITS);
-    } else {
-      val = ref[base + 1];
-    }
-  }
-
-  return val;
-}
-#endif  // CONFIG_INTRA_INTERP
-
 // Directional prediction, zone 1: 0 < angle < 90
-static void highbd_dr_prediction_z1(uint16_t *dst, ptrdiff_t stride, int bw,
-                                    int bh, const uint16_t *above,
-                                    const uint16_t *left,
-#if CONFIG_INTRA_INTERP
-                                    INTRA_FILTER filter_type,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                                    int upsample_above,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                                    int dx, int dy, int bd) {
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw,
+                                   int bh, const uint16_t *above,
+                                   const uint16_t *left, int upsample_above,
+                                   int dx, int dy, int bd) {
   int r, c, x, base, shift, val;
 
   (void)left;
   (void)dy;
+  (void)bd;
   assert(dy == 1);
   assert(dx > 0);
 
-#if !CONFIG_INTRA_EDGE_UPSAMPLE
-  const int upsample_above = 0;
-#endif  // !CONFIG_INTRA_EDGE_UPSAMPLE
   const int max_base_x = ((bw + bh) - 1) << upsample_above;
-  const int frac_bits = 8 - upsample_above;
+  const int frac_bits = 6 - upsample_above;
   const int base_inc = 1 << upsample_above;
   x = dx;
   for (r = 0; r < bh; ++r, dst += stride, x += dx) {
     base = x >> frac_bits;
-    shift = (x << upsample_above) & 0xFF;
+    shift = ((x << upsample_above) & 0x3F) >> 1;
 
     if (base >= max_base_x) {
       for (int i = r; i < bh; ++i) {
@@ -1100,14 +674,8 @@ static void highbd_dr_prediction_z1(uint16_t *dst, ptrdiff_t stride, int bw,
 
     for (c = 0; c < bw; ++c, base += base_inc) {
       if (base < max_base_x) {
-#if CONFIG_INTRA_INTERP
-        val = highbd_intra_subpel_interp(base, shift, above, 0, bw + bh - 1,
-                                         filter_type);
-#else
-        val = above[base] * (256 - shift) + above[base + 1] * shift;
-        val = ROUND_POWER_OF_TWO(val, 8);
-#endif  // CONFIG_INTRA_INTERP
-        dst[c] = clip_pixel_highbd(val, bd);
+        val = above[base] * (32 - shift) + above[base + 1] * shift;
+        dst[c] = ROUND_POWER_OF_TWO(val, 5);
       } else {
         dst[c] = above[max_base_x];
       }
@@ -1116,100 +684,67 @@ static void highbd_dr_prediction_z1(uint16_t *dst, ptrdiff_t stride, int bw,
 }
 
 // Directional prediction, zone 2: 90 < angle < 180
-static void highbd_dr_prediction_z2(uint16_t *dst, ptrdiff_t stride, int bw,
-                                    int bh, const uint16_t *above,
-                                    const uint16_t *left,
-#if CONFIG_INTRA_INTERP
-                                    INTRA_FILTER filter_type,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                                    int upsample_above, int upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                                    int dx, int dy, int bd) {
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw,
+                                   int bh, const uint16_t *above,
+                                   const uint16_t *left, int upsample_above,
+                                   int upsample_left, int dx, int dy, int bd) {
   int r, c, x, y, shift, val, base;
 
+  (void)bd;
   assert(dx > 0);
   assert(dy > 0);
 
-#if !CONFIG_INTRA_EDGE_UPSAMPLE
-  const int upsample_above = 0;
-  const int upsample_left = 0;
-#endif  // !CONFIG_INTRA_EDGE_UPSAMPLE
   const int min_base_x = -(1 << upsample_above);
-  const int frac_bits_x = 8 - upsample_above;
-  const int frac_bits_y = 8 - upsample_left;
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
   for (r = 0; r < bh; ++r) {
     for (c = 0; c < bw; ++c) {
       y = r + 1;
-      x = (c << 8) - y * dx;
+      x = (c << 6) - y * dx;
       base = x >> frac_bits_x;
       if (base >= min_base_x) {
-        shift = (x * (1 << upsample_above)) & 0xFF;
-#if CONFIG_INTRA_INTERP
-        val = highbd_intra_subpel_interp(base, shift, above, -1, bw - 1,
-                                         filter_type);
-#else
-        val = above[base] * (256 - shift) + above[base + 1] * shift;
-        val = ROUND_POWER_OF_TWO(val, 8);
-#endif  // CONFIG_INTRA_INTERP
+        shift = ((x * (1 << upsample_above)) & 0x3F) >> 1;
+        val = above[base] * (32 - shift) + above[base + 1] * shift;
+        val = ROUND_POWER_OF_TWO(val, 5);
       } else {
         x = c + 1;
-        y = (r << 8) - x * dy;
+        y = (r << 6) - x * dy;
         base = y >> frac_bits_y;
-        shift = (y * (1 << upsample_left)) & 0xFF;
-#if CONFIG_INTRA_INTERP
-        val = highbd_intra_subpel_interp(base, shift, left, -1, bh - 1,
-                                         filter_type);
-#else
-        val = left[base] * (256 - shift) + left[base + 1] * shift;
-        val = ROUND_POWER_OF_TWO(val, 8);
-#endif  // CONFIG_INTRA_INTERP
+        shift = ((y * (1 << upsample_left)) & 0x3F) >> 1;
+        val = left[base] * (32 - shift) + left[base + 1] * shift;
+        val = ROUND_POWER_OF_TWO(val, 5);
       }
-      dst[c] = clip_pixel_highbd(val, bd);
+      dst[c] = val;
     }
     dst += stride;
   }
 }
 
 // Directional prediction, zone 3: 180 < angle < 270
-static void highbd_dr_prediction_z3(uint16_t *dst, ptrdiff_t stride, int bw,
-                                    int bh, const uint16_t *above,
-                                    const uint16_t *left,
-#if CONFIG_INTRA_INTERP
-                                    INTRA_FILTER filter_type,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                                    int upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                                    int dx, int dy, int bd) {
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw,
+                                   int bh, const uint16_t *above,
+                                   const uint16_t *left, int upsample_left,
+                                   int dx, int dy, int bd) {
   int r, c, y, base, shift, val;
 
   (void)above;
   (void)dx;
+  (void)bd;
   assert(dx == 1);
   assert(dy > 0);
 
-#if !CONFIG_INTRA_EDGE_UPSAMPLE
-  const int upsample_left = 0;
-#endif  // !CONFIG_INTRA_EDGE_UPSAMPLE
   const int max_base_y = (bw + bh - 1) << upsample_left;
-  const int frac_bits = 8 - upsample_left;
+  const int frac_bits = 6 - upsample_left;
   const int base_inc = 1 << upsample_left;
   y = dy;
   for (c = 0; c < bw; ++c, y += dy) {
     base = y >> frac_bits;
-    shift = (y << upsample_left) & 0xFF;
+    shift = ((y << upsample_left) & 0x3F) >> 1;
 
     for (r = 0; r < bh; ++r, base += base_inc) {
       if (base < max_base_y) {
-#if CONFIG_INTRA_INTERP
-        val = highbd_intra_subpel_interp(base, shift, left, 0, bw + bh - 1,
-                                         filter_type);
-#else
-        val = left[base] * (256 - shift) + left[base + 1] * shift;
-        val = ROUND_POWER_OF_TWO(val, 8);
-#endif  // CONFIG_INTRA_INTERP
-        dst[r * stride + c] = clip_pixel_highbd(val, bd);
+        val = left[base] * (32 - shift) + left[base + 1] * shift;
+        dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5);
       } else {
         for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y];
         break;
@@ -1220,1002 +755,253 @@ static void highbd_dr_prediction_z3(uint16_t *dst, ptrdiff_t stride, int bw,
 
 static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride,
                                 TX_SIZE tx_size, const uint16_t *above,
-                                const uint16_t *left,
-#if CONFIG_INTRA_INTERP
-                                INTRA_FILTER filter,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                                int upsample_above, int upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                                int angle, int bd) {
-  const int dx = get_dx(angle);
-  const int dy = get_dy(angle);
+                                const uint16_t *left, int upsample_above,
+                                int upsample_left, int angle, int bd) {
+  const int dx = av1_get_dx(angle);
+  const int dy = av1_get_dy(angle);
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
   assert(angle > 0 && angle < 270);
 
   if (angle > 0 && angle < 90) {
-    highbd_dr_prediction_z1(dst, stride, bw, bh, above, left,
-#if CONFIG_INTRA_INTERP
-                            filter,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                            upsample_above,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                            dx, dy, bd);
+    av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left,
+                                upsample_above, dx, dy, bd);
   } else if (angle > 90 && angle < 180) {
-    highbd_dr_prediction_z2(dst, stride, bw, bh, above, left,
-#if CONFIG_INTRA_INTERP
-                            filter,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                            upsample_above, upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                            dx, dy, bd);
+    av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left,
+                                upsample_above, upsample_left, dx, dy, bd);
   } else if (angle > 180 && angle < 270) {
-    highbd_dr_prediction_z3(dst, stride, bw, bh, above, left,
-#if CONFIG_INTRA_INTERP
-                            filter,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                            upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                            dx, dy, bd);
+    av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left,
+                                dx, dy, bd);
   } else if (angle == 90) {
     pred_high[V_PRED][tx_size](dst, stride, above, left, bd);
   } else if (angle == 180) {
     pred_high[H_PRED][tx_size](dst, stride, above, left, bd);
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_EXT_INTRA
 
-#if CONFIG_FILTER_INTRA
-#if USE_3TAP_INTRA_FILTER
-static int filter_intra_taps_3[TX_SIZES_ALL][FILTER_INTRA_MODES][3] = {
-#if CONFIG_CHROMA_2X2
-  {
-      { 697, 836, -509 },
-      { 993, 513, -482 },
-      { 381, 984, -341 },
-      { 642, 1169, -787 },
-      { 590, 553, -119 },
-      { 762, 385, -123 },
-      { 358, 687, -21 },
-      { 411, 1083, -470 },
-      { 912, 814, -702 },
-      { 883, 902, -761 },
-  },
-#endif
+DECLARE_ALIGNED(16, const int8_t,
+                av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = {
   {
-      { 697, 836, -509 },
-      { 993, 513, -482 },
-      { 381, 984, -341 },
-      { 642, 1169, -787 },
-      { 590, 553, -119 },
-      { 762, 385, -123 },
-      { 358, 687, -21 },
-      { 411, 1083, -470 },
-      { 912, 814, -702 },
-      { 883, 902, -761 },
+      { -6, 10, 0, 0, 0, 12, 0, 0 },
+      { -5, 2, 10, 0, 0, 9, 0, 0 },
+      { -3, 1, 1, 10, 0, 7, 0, 0 },
+      { -3, 1, 1, 2, 10, 5, 0, 0 },
+      { -4, 6, 0, 0, 0, 2, 12, 0 },
+      { -3, 2, 6, 0, 0, 2, 9, 0 },
+      { -3, 2, 2, 6, 0, 2, 7, 0 },
+      { -3, 1, 2, 2, 6, 3, 5, 0 },
   },
   {
-      { 659, 816, -451 },
-      { 980, 625, -581 },
-      { 558, 962, -496 },
-      { 681, 888, -545 },
-      { 591, 613, 180 },
-      { 778, 399, -153 },
-      { 495, 641, -112 },
-      { 671, 937, -584 },
-      { 745, 940, -661 },
-      { 839, 911, -726 },
+      { -10, 16, 0, 0, 0, 10, 0, 0 },
+      { -6, 0, 16, 0, 0, 6, 0, 0 },
+      { -4, 0, 0, 16, 0, 4, 0, 0 },
+      { -2, 0, 0, 0, 16, 2, 0, 0 },
+      { -10, 16, 0, 0, 0, 0, 10, 0 },
+      { -6, 0, 16, 0, 0, 0, 6, 0 },
+      { -4, 0, 0, 16, 0, 0, 4, 0 },
+      { -2, 0, 0, 0, 16, 0, 2, 0 },
   },
   {
-      { 539, 927, -442 },
-      { 1003, 714, -693 },
-      { 349, 1271, -596 },
-      { 820, 764, -560 },
-      { 524, 816, -316 },
-      { 780, 681, -437 },
-      { 586, 795, -357 },
-      { 551, 1135, -663 },
-      { 593, 1061, -630 },
-      { 974, 970, -920 },
+      { -8, 8, 0, 0, 0, 16, 0, 0 },
+      { -8, 0, 8, 0, 0, 16, 0, 0 },
+      { -8, 0, 0, 8, 0, 16, 0, 0 },
+      { -8, 0, 0, 0, 8, 16, 0, 0 },
+      { -4, 4, 0, 0, 0, 0, 16, 0 },
+      { -4, 0, 4, 0, 0, 0, 16, 0 },
+      { -4, 0, 0, 4, 0, 0, 16, 0 },
+      { -4, 0, 0, 0, 4, 0, 16, 0 },
   },
   {
-      { 595, 919, -490 },
-      { 945, 668, -579 },
-      { 495, 962, -433 },
-      { 385, 1551, -912 },
-      { 455, 554, 15 },
-      { 852, 478, -306 },
-      { 177, 760, -87 },
-      { -65, 1611, -522 },
-      { 815, 894, -685 },
-      { 846, 1010, -832 },
+      { -2, 8, 0, 0, 0, 10, 0, 0 },
+      { -1, 3, 8, 0, 0, 6, 0, 0 },
+      { -1, 2, 3, 8, 0, 4, 0, 0 },
+      { 0, 1, 2, 3, 8, 2, 0, 0 },
+      { -1, 4, 0, 0, 0, 3, 10, 0 },
+      { -1, 3, 4, 0, 0, 4, 6, 0 },
+      { -1, 2, 3, 4, 0, 4, 4, 0 },
+      { -1, 2, 2, 3, 4, 3, 3, 0 },
   },
-#if CONFIG_TX64X64
   {
-      { 595, 919, -490 },
-      { 945, 668, -579 },
-      { 495, 962, -433 },
-      { 385, 1551, -912 },
-      { 455, 554, 15 },
-      { 852, 478, -306 },
-      { 177, 760, -87 },
-      { -65, 1611, -522 },
-      { 815, 894, -685 },
-      { 846, 1010, -832 },
+      { -12, 14, 0, 0, 0, 14, 0, 0 },
+      { -10, 0, 14, 0, 0, 12, 0, 0 },
+      { -9, 0, 0, 14, 0, 11, 0, 0 },
+      { -8, 0, 0, 0, 14, 10, 0, 0 },
+      { -10, 12, 0, 0, 0, 0, 14, 0 },
+      { -9, 1, 12, 0, 0, 0, 12, 0 },
+      { -8, 0, 0, 12, 0, 1, 11, 0 },
+      { -7, 0, 0, 1, 12, 1, 9, 0 },
   },
-#endif  // CONFIG_TX64X64
-  {
-      { 697, 836, -509 },
-      { 993, 513, -482 },
-      { 381, 984, -341 },
-      { 642, 1169, -787 },
-      { 590, 553, -119 },
-      { 762, 385, -123 },
-      { 358, 687, -21 },
-      { 411, 1083, -470 },
-      { 912, 814, -702 },
-      { 883, 902, -761 },
-  },
-  {
-      { 697, 836, -509 },
-      { 993, 513, -482 },
-      { 381, 984, -341 },
-      { 642, 1169, -787 },
-      { 590, 553, -119 },
-      { 762, 385, -123 },
-      { 358, 687, -21 },
-      { 411, 1083, -470 },
-      { 912, 814, -702 },
-      { 883, 902, -761 },
-  },
-  {
-      { 659, 816, -451 },
-      { 980, 625, -581 },
-      { 558, 962, -496 },
-      { 681, 888, -545 },
-      { 591, 613, 180 },
-      { 778, 399, -153 },
-      { 495, 641, -112 },
-      { 671, 937, -584 },
-      { 745, 940, -661 },
-      { 839, 911, -726 },
-  },
-  {
-      { 659, 816, -451 },
-      { 980, 625, -581 },
-      { 558, 962, -496 },
-      { 681, 888, -545 },
-      { 591, 613, 180 },
-      { 778, 399, -153 },
-      { 495, 641, -112 },
-      { 671, 937, -584 },
-      { 745, 940, -661 },
-      { 839, 911, -726 },
-  },
-  {
-      { 539, 927, -442 },
-      { 1003, 714, -693 },
-      { 349, 1271, -596 },
-      { 820, 764, -560 },
-      { 524, 816, -316 },
-      { 780, 681, -437 },
-      { 586, 795, -357 },
-      { 551, 1135, -663 },
-      { 593, 1061, -630 },
-      { 974, 970, -920 },
-  },
-  {
-      { 539, 927, -442 },
-      { 1003, 714, -693 },
-      { 349, 1271, -596 },
-      { 820, 764, -560 },
-      { 524, 816, -316 },
-      { 780, 681, -437 },
-      { 586, 795, -357 },
-      { 551, 1135, -663 },
-      { 593, 1061, -630 },
-      { 974, 970, -920 },
-  },
-  {
-      { 697, 836, -509 },
-      { 993, 513, -482 },
-      { 381, 984, -341 },
-      { 642, 1169, -787 },
-      { 590, 553, -119 },
-      { 762, 385, -123 },
-      { 358, 687, -21 },
-      { 411, 1083, -470 },
-      { 912, 814, -702 },
-      { 883, 902, -761 },
-  },
-  {
-      { 697, 836, -509 },
-      { 993, 513, -482 },
-      { 381, 984, -341 },
-      { 642, 1169, -787 },
-      { 590, 553, -119 },
-      { 762, 385, -123 },
-      { 358, 687, -21 },
-      { 411, 1083, -470 },
-      { 912, 814, -702 },
-      { 883, 902, -761 },
-  },
-  {
-      { 659, 816, -451 },
-      { 980, 625, -581 },
-      { 558, 962, -496 },
-      { 681, 888, -545 },
-      { 591, 613, 180 },
-      { 778, 399, -153 },
-      { 495, 641, -112 },
-      { 671, 937, -584 },
-      { 745, 940, -661 },
-      { 839, 911, -726 },
-  },
-  {
-      { 659, 816, -451 },
-      { 980, 625, -581 },
-      { 558, 962, -496 },
-      { 681, 888, -545 },
-      { 591, 613, 180 },
-      { 778, 399, -153 },
-      { 495, 641, -112 },
-      { 671, 937, -584 },
-      { 745, 940, -661 },
-      { 839, 911, -726 },
-  }
 };
-#else
-static int filter_intra_taps_4[TX_SIZES_ALL][FILTER_INTRA_MODES][4] = {
-#if CONFIG_CHROMA_2X2
-  {
-      { 735, 881, -537, -54 },
-      { 1005, 519, -488, -11 },
-      { 383, 990, -343, -6 },
-      { 442, 805, -542, 319 },
-      { 658, 616, -133, -116 },
-      { 875, 442, -141, -151 },
-      { 386, 741, -23, -80 },
-      { 390, 1027, -446, 51 },
-      { 679, 606, -523, 262 },
-      { 903, 922, -778, -23 },
-  },
-#endif
-  {
-      { 735, 881, -537, -54 },
-      { 1005, 519, -488, -11 },
-      { 383, 990, -343, -6 },
-      { 442, 805, -542, 319 },
-      { 658, 616, -133, -116 },
-      { 875, 442, -141, -151 },
-      { 386, 741, -23, -80 },
-      { 390, 1027, -446, 51 },
-      { 679, 606, -523, 262 },
-      { 903, 922, -778, -23 },
-  },
-  {
-      { 648, 803, -444, 16 },
-      { 972, 620, -576, 7 },
-      { 561, 967, -499, -5 },
-      { 585, 762, -468, 144 },
-      { 596, 619, -182, -9 },
-      { 895, 459, -176, -153 },
-      { 557, 722, -126, -129 },
-      { 601, 839, -523, 105 },
-      { 562, 709, -499, 251 },
-      { 803, 872, -695, 43 },
-  },
-  {
-      { 423, 728, -347, 111 },
-      { 963, 685, -665, 23 },
-      { 281, 1024, -480, 216 },
-      { 640, 596, -437, 78 },
-      { 429, 669, -259, 99 },
-      { 740, 646, -415, 23 },
-      { 568, 771, -346, 40 },
-      { 404, 833, -486, 209 },
-      { 398, 712, -423, 307 },
-      { 939, 935, -887, 17 },
-  },
-  {
-      { 477, 737, -393, 150 },
-      { 881, 630, -546, 67 },
-      { 506, 984, -443, -20 },
-      { 114, 459, -270, 528 },
-      { 433, 528, 14, 3 },
-      { 837, 470, -301, -30 },
-      { 181, 777, 89, -107 },
-      { -29, 716, -232, 259 },
-      { 589, 646, -495, 255 },
-      { 740, 884, -728, 77 },
-  },
-#if CONFIG_TX64X64
-  {
-      { 477, 737, -393, 150 },
-      { 881, 630, -546, 67 },
-      { 506, 984, -443, -20 },
-      { 114, 459, -270, 528 },
-      { 433, 528, 14, 3 },
-      { 837, 470, -301, -30 },
-      { 181, 777, 89, -107 },
-      { -29, 716, -232, 259 },
-      { 589, 646, -495, 255 },
-      { 740, 884, -728, 77 },
-  },
-#endif  // CONFIG_TX64X64
-  {
-      { 735, 881, -537, -54 },
-      { 1005, 519, -488, -11 },
-      { 383, 990, -343, -6 },
-      { 442, 805, -542, 319 },
-      { 658, 616, -133, -116 },
-      { 875, 442, -141, -151 },
-      { 386, 741, -23, -80 },
-      { 390, 1027, -446, 51 },
-      { 679, 606, -523, 262 },
-      { 903, 922, -778, -23 },
-  },
-  {
-      { 735, 881, -537, -54 },
-      { 1005, 519, -488, -11 },
-      { 383, 990, -343, -6 },
-      { 442, 805, -542, 319 },
-      { 658, 616, -133, -116 },
-      { 875, 442, -141, -151 },
-      { 386, 741, -23, -80 },
-      { 390, 1027, -446, 51 },
-      { 679, 606, -523, 262 },
-      { 903, 922, -778, -23 },
-  },
-  {
-      { 648, 803, -444, 16 },
-      { 972, 620, -576, 7 },
-      { 561, 967, -499, -5 },
-      { 585, 762, -468, 144 },
-      { 596, 619, -182, -9 },
-      { 895, 459, -176, -153 },
-      { 557, 722, -126, -129 },
-      { 601, 839, -523, 105 },
-      { 562, 709, -499, 251 },
-      { 803, 872, -695, 43 },
-  },
-  {
-      { 648, 803, -444, 16 },
-      { 972, 620, -576, 7 },
-      { 561, 967, -499, -5 },
-      { 585, 762, -468, 144 },
-      { 596, 619, -182, -9 },
-      { 895, 459, -176, -153 },
-      { 557, 722, -126, -129 },
-      { 601, 839, -523, 105 },
-      { 562, 709, -499, 251 },
-      { 803, 872, -695, 43 },
-  },
-  {
-      { 423, 728, -347, 111 },
-      { 963, 685, -665, 23 },
-      { 281, 1024, -480, 216 },
-      { 640, 596, -437, 78 },
-      { 429, 669, -259, 99 },
-      { 740, 646, -415, 23 },
-      { 568, 771, -346, 40 },
-      { 404, 833, -486, 209 },
-      { 398, 712, -423, 307 },
-      { 939, 935, -887, 17 },
-  },
-  {
-      { 423, 728, -347, 111 },
-      { 963, 685, -665, 23 },
-      { 281, 1024, -480, 216 },
-      { 640, 596, -437, 78 },
-      { 429, 669, -259, 99 },
-      { 740, 646, -415, 23 },
-      { 568, 771, -346, 40 },
-      { 404, 833, -486, 209 },
-      { 398, 712, -423, 307 },
-      { 939, 935, -887, 17 },
-  },
-  {
-      { 735, 881, -537, -54 },
-      { 1005, 519, -488, -11 },
-      { 383, 990, -343, -6 },
-      { 442, 805, -542, 319 },
-      { 658, 616, -133, -116 },
-      { 875, 442, -141, -151 },
-      { 386, 741, -23, -80 },
-      { 390, 1027, -446, 51 },
-      { 679, 606, -523, 262 },
-      { 903, 922, -778, -23 },
-  },
-  {
-      { 735, 881, -537, -54 },
-      { 1005, 519, -488, -11 },
-      { 383, 990, -343, -6 },
-      { 442, 805, -542, 319 },
-      { 658, 616, -133, -116 },
-      { 875, 442, -141, -151 },
-      { 386, 741, -23, -80 },
-      { 390, 1027, -446, 51 },
-      { 679, 606, -523, 262 },
-      { 903, 922, -778, -23 },
-  },
-  {
-      { 648, 803, -444, 16 },
-      { 972, 620, -576, 7 },
-      { 561, 967, -499, -5 },
-      { 585, 762, -468, 144 },
-      { 596, 619, -182, -9 },
-      { 895, 459, -176, -153 },
-      { 557, 722, -126, -129 },
-      { 601, 839, -523, 105 },
-      { 562, 709, -499, 251 },
-      { 803, 872, -695, 43 },
-  },
-  {
-      { 648, 803, -444, 16 },
-      { 972, 620, -576, 7 },
-      { 561, 967, -499, -5 },
-      { 585, 762, -468, 144 },
-      { 596, 619, -182, -9 },
-      { 895, 459, -176, -153 },
-      { 557, 722, -126, -129 },
-      { 601, 839, -523, 105 },
-      { 562, 709, -499, 251 },
-      { 803, 872, -695, 43 },
-  }
-};
-#endif
-
-#if USE_3TAP_INTRA_FILTER
-static void filter_intra_predictors_3tap(uint8_t *dst, ptrdiff_t stride,
-                                         TX_SIZE tx_size, const uint8_t *above,
-                                         const uint8_t *left, int mode) {
-  int r, c;
-  int mean, ipred;
-#if CONFIG_TX64X64
-  int buffer[65][65];
-#else
-  int buffer[33][33];
-#endif  // CONFIG_TX64X64
-  const int c0 = filter_intra_taps_3[tx_size][mode][0];
-  const int c1 = filter_intra_taps_3[tx_size][mode][1];
-  const int c2 = filter_intra_taps_3[tx_size][mode][2];
-  const int bw = tx_size_wide[tx_size];
-  const int bh = tx_size_high[tx_size];
-
-  mean = 0;
-  for (r = 0; r < bh; ++r) {
-    mean += (int)left[r];
-  }
-  for (c = 0; c < bw; ++c) {
-    mean += (int)above[c];
-  }
-  mean = (mean + ((bw + bh) >> 1)) / (bw + bh);
-
-  for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r] - mean;
-
-  for (c = 0; c < bw + 1; ++c) buffer[0][c] = (int)above[c - 1] - mean;
 
-  for (r = 1; r < bh + 1; ++r)
-    for (c = 1; c < bw + 1; ++c) {
-      ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] +
-              c2 * buffer[r - 1][c - 1];
-      buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
-      buffer[r][c] = clip_pixel(buffer[r][c] + mean) - mean;
-    }
-
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = clip_pixel(buffer[r + 1][c + 1] + mean);
-    }
-    dst += stride;
-  }
-}
-#else
-static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride,
-                                         TX_SIZE tx_size, const uint8_t *above,
-                                         const uint8_t *left, int mode) {
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride,
+                                  TX_SIZE tx_size, const uint8_t *above,
+                                  const uint8_t *left, int mode) {
   int r, c;
-  int mean, ipred;
-#if CONFIG_TX64X64
-  int buffer[65][129];
-#else
-  int buffer[33][65];
-#endif  // CONFIG_TX64X64
-  const int c0 = filter_intra_taps_4[tx_size][mode][0];
-  const int c1 = filter_intra_taps_4[tx_size][mode][1];
-  const int c2 = filter_intra_taps_4[tx_size][mode][2];
-  const int c3 = filter_intra_taps_4[tx_size][mode][3];
+  uint8_t buffer[33][33];
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
 
-  mean = 0;
-  for (r = 0; r < bh; ++r) {
-    mean += (int)left[r];
-  }
-  for (c = 0; c < bw; ++c) {
-    mean += (int)above[c];
-  }
-  mean = (mean + ((bw + bh) >> 1)) / (bw + bh);
-
-  for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r] - mean;
-
-  for (c = 0; c < 2 * bw + 1; ++c) buffer[0][c] = (int)above[c - 1] - mean;
-
-  for (r = 1; r < bh + 1; ++r)
-    for (c = 1; c < 2 * bw + 1 - r; ++c) {
-      ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] +
-              c2 * buffer[r - 1][c - 1] + c3 * buffer[r - 1][c + 1];
-      buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
-      buffer[r][c] = clip_pixel(buffer[r][c] + mean) - mean;
+  assert(bw <= 32 && bh <= 32);
+
+  // The initialization is just for silencing Jenkins static analysis warnings
+  for (r = 0; r < bh + 1; ++r)
+    memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
+
+  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
+
+  for (r = 1; r < bh + 1; r += 2)
+    for (c = 1; c < bw + 1; c += 4) {
+      const uint8_t p0 = buffer[r - 1][c - 1];
+      const uint8_t p1 = buffer[r - 1][c];
+      const uint8_t p2 = buffer[r - 1][c + 1];
+      const uint8_t p3 = buffer[r - 1][c + 2];
+      const uint8_t p4 = buffer[r - 1][c + 3];
+      const uint8_t p5 = buffer[r][c - 1];
+      const uint8_t p6 = buffer[r + 1][c - 1];
+      for (int k = 0; k < 8; ++k) {
+        int r_offset = k >> 2;
+        int c_offset = k & 0x03;
+        buffer[r + r_offset][c + c_offset] =
+            clip_pixel(ROUND_POWER_OF_TWO_SIGNED(
+                av1_filter_intra_taps[mode][k][0] * p0 +
+                    av1_filter_intra_taps[mode][k][1] * p1 +
+                    av1_filter_intra_taps[mode][k][2] * p2 +
+                    av1_filter_intra_taps[mode][k][3] * p3 +
+                    av1_filter_intra_taps[mode][k][4] * p4 +
+                    av1_filter_intra_taps[mode][k][5] * p5 +
+                    av1_filter_intra_taps[mode][k][6] * p6,
+                FILTER_INTRA_SCALE_BITS));
+      }
     }
 
   for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = clip_pixel(buffer[r + 1][c + 1] + mean);
-    }
+    memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
     dst += stride;
   }
 }
-#endif
-
-void av1_dc_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
-                               const uint8_t *above, const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
-  filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                               FILTER_DC_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_DC_PRED);
-#endif
-}
 
-void av1_v_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
-                              const uint8_t *above, const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
-  filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                               FILTER_V_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_V_PRED);
-#endif
-}
-
-void av1_h_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
-                              const uint8_t *above, const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
-  filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                               FILTER_H_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_H_PRED);
-#endif
-}
-
-void av1_d45_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
-                                const uint8_t *above, const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
-  filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                               FILTER_D45_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_D45_PRED);
-#endif
-}
-
-void av1_d135_filter_predictor_c(uint8_t *dst, ptrdiff_t stride,
-                                 TX_SIZE tx_size, const uint8_t *above,
-                                 const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
-  filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                               FILTER_D135_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_D135_PRED);
-#endif
-}
-
-void av1_d117_filter_predictor_c(uint8_t *dst, ptrdiff_t stride,
-                                 TX_SIZE tx_size, const uint8_t *above,
-                                 const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
-  filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                               FILTER_D117_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_D117_PRED);
-#endif
-}
-
-void av1_d153_filter_predictor_c(uint8_t *dst, ptrdiff_t stride,
-                                 TX_SIZE tx_size, const uint8_t *above,
-                                 const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
-  filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                               FILTER_D153_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_D153_PRED);
-#endif
-}
-
-void av1_d207_filter_predictor_c(uint8_t *dst, ptrdiff_t stride,
-                                 TX_SIZE tx_size, const uint8_t *above,
-                                 const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
-  filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                               FILTER_D207_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_D207_PRED);
-#endif
-}
-
-void av1_d63_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
-                                const uint8_t *above, const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
-  filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                               FILTER_D63_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_D63_PRED);
-#endif
-}
-
-void av1_tm_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
-                               const uint8_t *above, const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
-  filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                               FILTER_TM_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_TM_PRED);
-#endif
-}
-
-static void filter_intra_predictors(FILTER_INTRA_MODE mode, uint8_t *dst,
-                                    ptrdiff_t stride, TX_SIZE tx_size,
-                                    const uint8_t *above, const uint8_t *left) {
-  switch (mode) {
-    case FILTER_DC_PRED:
-      av1_dc_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_V_PRED:
-      av1_v_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_H_PRED:
-      av1_h_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_D45_PRED:
-      av1_d45_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_D135_PRED:
-      av1_d135_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_D117_PRED:
-      av1_d117_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_D153_PRED:
-      av1_d153_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_D207_PRED:
-      av1_d207_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_D63_PRED:
-      av1_d63_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_TM_PRED:
-      av1_tm_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    default: assert(0);
-  }
-}
-#if CONFIG_HIGHBITDEPTH
-#if USE_3TAP_INTRA_FILTER
-static void highbd_filter_intra_predictors_3tap(uint16_t *dst, ptrdiff_t stride,
-                                                TX_SIZE tx_size,
-                                                const uint16_t *above,
-                                                const uint16_t *left, int mode,
-                                                int bd) {
+static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride,
+                                          TX_SIZE tx_size,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int mode,
+                                          int bd) {
   int r, c;
-  int mean, ipred;
-#if CONFIG_TX64X64
-  int preds[65][65];
-#else
-  int preds[33][33];
-#endif  // CONFIG_TX64X64
-  const int c0 = filter_intra_taps_3[tx_size][mode][0];
-  const int c1 = filter_intra_taps_3[tx_size][mode][1];
-  const int c2 = filter_intra_taps_3[tx_size][mode][2];
+  uint16_t buffer[33][33];
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
 
-  mean = 0;
-  for (r = 0; r < bh; ++r) {
-    mean += (int)left[r];
-  }
-  for (c = 0; c < bw; ++c) {
-    mean += (int)above[c];
-  }
-  mean = (mean + ((bw + bh) >> 1)) / (bw + bh);
-
-  for (r = 0; r < bh; ++r) preds[r + 1][0] = (int)left[r] - mean;
-
-  for (c = 0; c < bw + 1; ++c) preds[0][c] = (int)above[c - 1] - mean;
-
-  for (r = 1; r < bh + 1; ++r)
-    for (c = 1; c < bw + 1; ++c) {
-      ipred = c0 * preds[r - 1][c] + c1 * preds[r][c - 1] +
-              c2 * preds[r - 1][c - 1];
-      preds[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
-      preds[r][c] = clip_pixel_highbd(preds[r][c] + mean, bd) - mean;
+  assert(bw <= 32 && bh <= 32);
+
+  // The initialization is just for silencing Jenkins static analysis warnings
+  for (r = 0; r < bh + 1; ++r)
+    memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
+
+  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0]));
+
+  for (r = 1; r < bh + 1; r += 2)
+    for (c = 1; c < bw + 1; c += 4) {
+      const uint16_t p0 = buffer[r - 1][c - 1];
+      const uint16_t p1 = buffer[r - 1][c];
+      const uint16_t p2 = buffer[r - 1][c + 1];
+      const uint16_t p3 = buffer[r - 1][c + 2];
+      const uint16_t p4 = buffer[r - 1][c + 3];
+      const uint16_t p5 = buffer[r][c - 1];
+      const uint16_t p6 = buffer[r + 1][c - 1];
+      for (int k = 0; k < 8; ++k) {
+        int r_offset = k >> 2;
+        int c_offset = k & 0x03;
+        buffer[r + r_offset][c + c_offset] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO_SIGNED(
+                                  av1_filter_intra_taps[mode][k][0] * p0 +
+                                      av1_filter_intra_taps[mode][k][1] * p1 +
+                                      av1_filter_intra_taps[mode][k][2] * p2 +
+                                      av1_filter_intra_taps[mode][k][3] * p3 +
+                                      av1_filter_intra_taps[mode][k][4] * p4 +
+                                      av1_filter_intra_taps[mode][k][5] * p5 +
+                                      av1_filter_intra_taps[mode][k][6] * p6,
+                                  FILTER_INTRA_SCALE_BITS),
+                              bd);
+      }
     }
 
   for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = clip_pixel_highbd(preds[r + 1][c + 1] + mean, bd);
-    }
+    memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0]));
     dst += stride;
   }
 }
-#else
-static void highbd_filter_intra_predictors_4tap(uint16_t *dst, ptrdiff_t stride,
-                                                TX_SIZE tx_size,
-                                                const uint16_t *above,
-                                                const uint16_t *left, int mode,
-                                                int bd) {
-  int r, c;
-  int mean, ipred;
-#if CONFIG_TX64X64
-  int preds[65][129];
-#else
-  int preds[33][65];
-#endif  // CONFIG_TX64X64
-  const int c0 = filter_intra_taps_4[tx_size][mode][0];
-  const int c1 = filter_intra_taps_4[tx_size][mode][1];
-  const int c2 = filter_intra_taps_4[tx_size][mode][2];
-  const int c3 = filter_intra_taps_4[tx_size][mode][3];
-  const int bw = tx_size_wide[tx_size];
-  const int bh = tx_size_high[tx_size];
 
-  mean = 0;
-  for (r = 0; r < bh; ++r) {
-    mean += (int)left[r];
-  }
-  for (c = 0; c < bw; ++c) {
-    mean += (int)above[c];
-  }
-  mean = (mean + ((bw + bh) >> 1)) / (bw + bh);
-
-  for (r = 0; r < bh; ++r) preds[r + 1][0] = (int)left[r] - mean;
-
-  for (c = 0; c < 2 * bw + 1; ++c) preds[0][c] = (int)above[c - 1] - mean;
-
-  for (r = 1; r < bh + 1; ++r)
-    for (c = 1; c < 2 * bw + 1 - r; ++c) {
-      ipred = c0 * preds[r - 1][c] + c1 * preds[r][c - 1] +
-              c2 * preds[r - 1][c - 1] + c3 * preds[r - 1][c + 1];
-      preds[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
-      preds[r][c] = clip_pixel_highbd(preds[r][c] + mean, bd) - mean;
-    }
+static int is_smooth(const MB_MODE_INFO *mbmi, int plane) {
+  if (plane == 0) {
+    const PREDICTION_MODE mode = mbmi->mode;
+    return (mode == SMOOTH_PRED || mode == SMOOTH_V_PRED ||
+            mode == SMOOTH_H_PRED);
+  } else {
+    // uv_mode is not set for inter blocks, so need to explicitly
+    // detect that case.
+    if (is_inter_block(mbmi)) return 0;
 
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = clip_pixel_highbd(preds[r + 1][c + 1] + mean, bd);
-    }
-    dst += stride;
+    const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+    return (uv_mode == UV_SMOOTH_PRED || uv_mode == UV_SMOOTH_V_PRED ||
+            uv_mode == UV_SMOOTH_H_PRED);
   }
 }
-#endif
 
-void av1_highbd_dc_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                      TX_SIZE tx_size, const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
-  highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                                      FILTER_DC_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_DC_PRED, bd);
-#endif
-}
-
-void av1_highbd_v_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                     TX_SIZE tx_size, const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
-  highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                                      FILTER_V_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_V_PRED, bd);
-#endif
-}
-
-void av1_highbd_h_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                     TX_SIZE tx_size, const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
-  highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                                      FILTER_H_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_H_PRED, bd);
-#endif
-}
-
-void av1_highbd_d45_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                       TX_SIZE tx_size, const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
-  highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                                      FILTER_D45_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_D45_PRED, bd);
-#endif
-}
-
-void av1_highbd_d135_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                        TX_SIZE tx_size, const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
-  highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                                      FILTER_D135_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_D135_PRED, bd);
-#endif
-}
-
-void av1_highbd_d117_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                        TX_SIZE tx_size, const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
-  highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                                      FILTER_D117_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_D117_PRED, bd);
-#endif
-}
-
-void av1_highbd_d153_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                        TX_SIZE tx_size, const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
-  highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                                      FILTER_D153_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_D153_PRED, bd);
-#endif
-}
-
-void av1_highbd_d207_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                        TX_SIZE tx_size, const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
-  highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                                      FILTER_D207_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_D207_PRED, bd);
-#endif
-}
-
-void av1_highbd_d63_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                       TX_SIZE tx_size, const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
-  highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                                      FILTER_D63_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_D63_PRED, bd);
-#endif
-}
+static int get_filt_type(const MACROBLOCKD *xd, int plane) {
+  int ab_sm, le_sm;
 
-void av1_highbd_tm_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                      TX_SIZE tx_size, const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
-  highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
-                                      FILTER_TM_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_TM_PRED, bd);
-#endif
-}
-
-static void highbd_filter_intra_predictors(FILTER_INTRA_MODE mode,
-                                           uint16_t *dst, ptrdiff_t stride,
-                                           TX_SIZE tx_size,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  switch (mode) {
-    case FILTER_DC_PRED:
-      av1_highbd_dc_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_V_PRED:
-      av1_highbd_v_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_H_PRED:
-      av1_highbd_h_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_D45_PRED:
-      av1_highbd_d45_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_D135_PRED:
-      av1_highbd_d135_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_D117_PRED:
-      av1_highbd_d117_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_D153_PRED:
-      av1_highbd_d153_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_D207_PRED:
-      av1_highbd_d207_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_D63_PRED:
-      av1_highbd_d63_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_TM_PRED:
-      av1_highbd_tm_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    default: assert(0);
+  if (plane == 0) {
+    const MB_MODE_INFO *ab = xd->above_mbmi;
+    const MB_MODE_INFO *le = xd->left_mbmi;
+    ab_sm = ab ? is_smooth(ab, plane) : 0;
+    le_sm = le ? is_smooth(le, plane) : 0;
+  } else {
+    const MB_MODE_INFO *ab = xd->chroma_above_mbmi;
+    const MB_MODE_INFO *le = xd->chroma_left_mbmi;
+    ab_sm = ab ? is_smooth(ab, plane) : 0;
+    le_sm = le ? is_smooth(le, plane) : 0;
   }
+
+  return (ab_sm || le_sm) ? 1 : 0;
 }
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_FILTER_INTRA
 
-#if CONFIG_INTRA_EDGE
-static int intra_edge_filter_strength(int bsz, int delta) {
+static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) {
   const int d = abs(delta);
   int strength = 0;
 
-  switch (bsz) {
-    case 4:
-      if (d < 56) {
-        strength = 0;
-      } else if (d < 90) {
-        strength = 1;
-      }
-      break;
-    case 8:
-      if (d < 8) {
-        strength = 0;
-      } else if (d < 32) {
-        strength = 1;
-      } else if (d < 90) {
-        strength = 3;
-      }
-      break;
-    case 16:
-      if (d < 4) {
-        strength = 0;
-      } else if (d < 16) {
-        strength = 1;
-      } else if (d < 90) {
-        strength = 3;
-      }
-      break;
-    case 32:
-      if (d < 16) {
-        strength = 2;
-      } else if (d < 90) {
-        strength = 3;
-      }
-      break;
-    default: strength = 0; break;
+  const int blk_wh = bs0 + bs1;
+  if (type == 0) {
+    if (blk_wh <= 8) {
+      if (d >= 56) strength = 1;
+    } else if (blk_wh <= 12) {
+      if (d >= 40) strength = 1;
+    } else if (blk_wh <= 16) {
+      if (d >= 40) strength = 1;
+    } else if (blk_wh <= 24) {
+      if (d >= 8) strength = 1;
+      if (d >= 16) strength = 2;
+      if (d >= 32) strength = 3;
+    } else if (blk_wh <= 32) {
+      if (d >= 1) strength = 1;
+      if (d >= 4) strength = 2;
+      if (d >= 32) strength = 3;
+    } else {
+      if (d >= 1) strength = 3;
+    }
+  } else {
+    if (blk_wh <= 8) {
+      if (d >= 40) strength = 1;
+      if (d >= 64) strength = 2;
+    } else if (blk_wh <= 16) {
+      if (d >= 20) strength = 1;
+      if (d >= 48) strength = 2;
+    } else if (blk_wh <= 24) {
+      if (d >= 4) strength = 3;
+    } else {
+      if (d >= 1) strength = 3;
+    }
   }
-
   return strength;
 }
 
@@ -2229,7 +1015,7 @@ void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) {
   uint8_t edge[129];
 
   memcpy(edge, p, sz * sizeof(*p));
-  for (int i = 1; i < sz - 1; i++) {
+  for (int i = 1; i < sz; i++) {
     int s = 0;
     for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
       int k = i - 2 + j;
@@ -2242,7 +1028,16 @@ void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) {
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
+static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) {
+  const int kernel[3] = { 5, 6, 5 };
+
+  int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
+          (p_above[0] * kernel[2]);
+  s = (s + 8) >> 4;
+  p_above[-1] = s;
+  p_left[-1] = s;
+}
+
 void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
   if (!strength) return;
 
@@ -2253,7 +1048,7 @@ void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
   uint16_t edge[129];
 
   memcpy(edge, p, sz * sizeof(*p));
-  for (int i = 1; i < sz - 1; i++) {
+  for (int i = 1; i < sz; i++) {
     int s = 0;
     for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
       int k = i - 2 + j;
@@ -2265,12 +1060,22 @@ void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
     p[i] = s;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-static int use_intra_edge_upsample(int bsz, int delta) {
+static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) {
+  const int kernel[3] = { 5, 6, 5 };
+
+  int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
+          (p_above[0] * kernel[2]);
+  s = (s + 8) >> 4;
+  p_above[-1] = s;
+  p_left[-1] = s;
+}
+
+static int use_intra_edge_upsample(int bs0, int bs1, int delta, int type) {
   const int d = abs(delta);
-  return (bsz == 4 && d > 0 && d < 56);
+  const int blk_wh = bs0 + bs1;
+  if (d <= 0 || d >= 40) return 0;
+  return type ? (blk_wh <= 8) : (blk_wh <= 16);
 }
 
 void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
@@ -2296,7 +1101,6 @@ void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) {
   // interpolate half-sample positions
   assert(sz <= MAX_UPSAMPLE_SZ);
@@ -2320,16 +1124,13 @@ void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) {
     p[2 * i] = in[i + 2];
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-
-#endif  // CONFIG_INTRA_EDGE
 
-#if CONFIG_HIGHBITDEPTH
 static void build_intra_predictors_high(
     const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8,
-    int dst_stride, PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px,
-    int n_topright_px, int n_left_px, int n_bottomleft_px, int plane) {
+    int dst_stride, PREDICTION_MODE mode, int angle_delta,
+    FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size,
+    int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px,
+    int n_bottomleft_px, int plane) {
   int i;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
@@ -2339,36 +1140,25 @@ static void build_intra_predictors_high(
   uint16_t *const left_col = left_data + 16;
   const int txwpx = tx_size_wide[tx_size];
   const int txhpx = tx_size_high[tx_size];
-#if !INTRA_USES_RECT_TRANSFORMS
-  assert(txwpx == txhpx);
-#endif  // !INTRA_USES_RECT_TRANSFORMS
   int need_left = extend_modes[mode] & NEED_LEFT;
   int need_above = extend_modes[mode] & NEED_ABOVE;
   int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
   const uint16_t *above_ref = ref - ref_stride;
-#if CONFIG_EXT_INTRA
+  const uint16_t *left_ref = ref - 1;
   int p_angle = 0;
-  const int is_dr_mode = av1_is_directional_mode(mode, xd->mi[0]->mbmi.sb_type);
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  const FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
-      &xd->mi[0]->mbmi.filter_intra_mode_info;
-  const FILTER_INTRA_MODE filter_intra_mode =
-      filter_intra_mode_info->filter_intra_mode[plane != 0];
-#endif  // CONFIG_FILTER_INTRA
+  const int is_dr_mode = av1_is_directional_mode(mode);
+  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
   int base = 128 << (xd->bd - 8);
 
+  // The default values if ref pixels are not available:
   // base-1 base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
   // base+1   A      B  ..     Y      Z
   // base+1   C      D  ..     W      X
   // base+1   E      F  ..     U      V
   // base+1   G      H  ..     S      T      T      T      T      T
-  aom_memset16(left_data, base + 1, sizeof(left_data) / sizeof(*left_data));
 
-#if CONFIG_EXT_INTRA
   if (is_dr_mode) {
-    p_angle = mode_to_angle_map[mode] +
-              xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
+    p_angle = mode_to_angle_map[mode] + angle_delta;
     if (p_angle <= 90)
       need_above = 1, need_left = 0, need_above_left = 1;
     else if (p_angle < 180)
@@ -2376,29 +1166,20 @@ static void build_intra_predictors_high(
     else
       need_above = 0, need_left = 1, need_above_left = 1;
   }
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
-    need_left = need_above = need_above_left = 1;
-#endif  // CONFIG_FILTER_INTRA
+  if (use_filter_intra) need_left = need_above = need_above_left = 1;
 
-  (void)plane;
   assert(n_top_px >= 0);
   assert(n_topright_px >= 0);
   assert(n_left_px >= 0);
   assert(n_bottomleft_px >= 0);
 
   if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
-#if CONFIG_INTRA_EDGE
     int val;
     if (need_left) {
       val = (n_top_px > 0) ? above_ref[0] : base + 1;
     } else {
-      val = (n_left_px > 0) ? ref[-1] : base - 1;
+      val = (n_left_px > 0) ? left_ref[0] : base - 1;
     }
-#else
-    const int val = need_left ? base + 1 : base - 1;
-#endif  // CONFIG_INTRA_EDGE
     for (i = 0; i < txhpx; ++i) {
       aom_memset16(dst, val, txwpx);
       dst += dst_stride;
@@ -2408,56 +1189,34 @@ static void build_intra_predictors_high(
 
   // NEED_LEFT
   if (need_left) {
-#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
     int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
-#if CONFIG_FILTER_INTRA
-    if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
-      need_bottom = 0;
-#endif  // CONFIG_FILTER_INTRA
-#if CONFIG_EXT_INTRA
+    if (use_filter_intra) need_bottom = 0;
     if (is_dr_mode) need_bottom = p_angle > 180;
-#endif  // CONFIG_EXT_INTRA
-#else
-    const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
-#endif  // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
     const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
     i = 0;
     if (n_left_px > 0) {
-      for (; i < n_left_px; i++) left_col[i] = ref[i * ref_stride - 1];
+      for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
       if (need_bottom && n_bottomleft_px > 0) {
         assert(i == txhpx);
         for (; i < txhpx + n_bottomleft_px; i++)
-          left_col[i] = ref[i * ref_stride - 1];
+          left_col[i] = left_ref[i * ref_stride];
       }
       if (i < num_left_pixels_needed)
         aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
     } else {
-#if CONFIG_INTRA_EDGE
       if (n_top_px > 0) {
         aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
       } else {
-#endif  // CONFIG_INTRA_EDGE
         aom_memset16(left_col, base + 1, num_left_pixels_needed);
-#if CONFIG_INTRA_EDGE
       }
-#endif  // CONFIG_INTRA_EDGE
     }
   }
 
   // NEED_ABOVE
   if (need_above) {
-#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
     int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
-#if CONFIG_FILTER_INTRA
-    if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
-      need_right = 1;
-#endif  // CONFIG_FILTER_INTRA
-#if CONFIG_EXT_INTRA
+    if (use_filter_intra) need_right = 0;
     if (is_dr_mode) need_right = p_angle < 90;
-#endif  // CONFIG_EXT_INTRA
-#else
-    const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
-#endif  // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
     const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
     if (n_top_px > 0) {
       memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
@@ -2472,92 +1231,75 @@ static void build_intra_predictors_high(
         aom_memset16(&above_row[i], above_row[i - 1],
                      num_top_pixels_needed - i);
     } else {
-#if CONFIG_INTRA_EDGE
       if (n_left_px > 0) {
-        aom_memset16(above_row, ref[-1], num_top_pixels_needed);
+        aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
       } else {
-#endif  // CONFIG_INTRA_EDGE
         aom_memset16(above_row, base - 1, num_top_pixels_needed);
-#if CONFIG_INTRA_EDGE
       }
-#endif  // CONFIG_INTRA_EDGE
     }
   }
 
   if (need_above_left) {
-#if CONFIG_INTRA_EDGE
     if (n_top_px > 0 && n_left_px > 0) {
       above_row[-1] = above_ref[-1];
     } else if (n_top_px > 0) {
       above_row[-1] = above_ref[0];
     } else if (n_left_px > 0) {
-      above_row[-1] = ref[-1];
+      above_row[-1] = left_ref[0];
     } else {
       above_row[-1] = base;
     }
-#else
-    above_row[-1] =
-        n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
-#endif  // CONFIG_INTRA_EDGE
     left_col[-1] = above_row[-1];
   }
 
-#if CONFIG_FILTER_INTRA
-  if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) {
-    highbd_filter_intra_predictors(filter_intra_mode, dst, dst_stride, tx_size,
-                                   above_row, left_col, xd->bd);
+  if (use_filter_intra) {
+    highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+                                  filter_intra_mode, xd->bd);
     return;
   }
-#endif  // CONFIG_FILTER_INTRA
 
-#if CONFIG_EXT_INTRA
   if (is_dr_mode) {
-#if CONFIG_INTRA_INTERP
-    INTRA_FILTER filter = INTRA_FILTER_LINEAR;
-    if (plane == 0 && av1_is_intra_filter_switchable(p_angle))
-      filter = xd->mi[0]->mbmi.intra_filter;
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE
-    const int need_right = p_angle < 90;
-    const int need_bottom = p_angle > 180;
-    if (p_angle != 90 && p_angle != 180) {
-      const int ab_le = need_above_left ? 1 : 0;
-      if (need_above && n_top_px > 0) {
-        const int strength = intra_edge_filter_strength(txwpx, p_angle - 90);
-        const int n_px = n_top_px + ab_le + (need_right ? n_topright_px : 0);
-        av1_filter_intra_edge_high(above_row - ab_le, n_px, strength);
+    int upsample_above = 0;
+    int upsample_left = 0;
+    if (!disable_edge_filter) {
+      const int need_right = p_angle < 90;
+      const int need_bottom = p_angle > 180;
+      const int filt_type = get_filt_type(xd, plane);
+      if (p_angle != 90 && p_angle != 180) {
+        const int ab_le = need_above_left ? 1 : 0;
+        if (need_above && need_left && (txwpx + txhpx >= 24)) {
+          filter_intra_edge_corner_high(above_row, left_col);
+        }
+        if (need_above && n_top_px > 0) {
+          const int strength =
+              intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+          const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+          av1_filter_intra_edge_high(above_row - ab_le, n_px, strength);
+        }
+        if (need_left && n_left_px > 0) {
+          const int strength = intra_edge_filter_strength(
+              txhpx, txwpx, p_angle - 180, filt_type);
+          const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+          av1_filter_intra_edge_high(left_col - ab_le, n_px, strength);
+        }
       }
-      if (need_left && n_left_px > 0) {
-        const int strength = intra_edge_filter_strength(txhpx, p_angle - 180);
-        const int n_px =
-            n_left_px + ab_le + (need_bottom ? n_bottomleft_px : 0);
-        av1_filter_intra_edge_high(left_col - ab_le, n_px, strength);
+      upsample_above =
+          use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+      if (need_above && upsample_above) {
+        const int n_px = txwpx + (need_right ? txhpx : 0);
+        av1_upsample_intra_edge_high(above_row, n_px, xd->bd);
+      }
+      upsample_left =
+          use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+      if (need_left && upsample_left) {
+        const int n_px = txhpx + (need_bottom ? txwpx : 0);
+        av1_upsample_intra_edge_high(left_col, n_px, xd->bd);
       }
     }
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-    const int upsample_above = use_intra_edge_upsample(txwpx, p_angle - 90);
-    if (need_above && upsample_above) {
-      const int n_px = txwpx + (need_right ? txhpx : 0);
-      av1_upsample_intra_edge_high(above_row, n_px, xd->bd);
-    }
-    const int upsample_left = use_intra_edge_upsample(txhpx, p_angle - 180);
-    if (need_left && upsample_left) {
-      const int n_px = txhpx + (need_bottom ? txwpx : 0);
-      av1_upsample_intra_edge_high(left_col, n_px, xd->bd);
-    }
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-#endif  // CONFIG_INTRA_EDGE
     highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
-#if CONFIG_INTRA_INTERP
-                        filter,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                        upsample_above, upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                        p_angle, xd->bd);
+                        upsample_above, upsample_left, p_angle, xd->bd);
     return;
   }
-#endif  // CONFIG_EXT_INTRA
 
   // predict
   if (mode == DC_PRED) {
@@ -2567,52 +1309,41 @@ static void build_intra_predictors_high(
     pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd);
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
                                    int ref_stride, uint8_t *dst, int dst_stride,
-                                   PREDICTION_MODE mode, TX_SIZE tx_size,
+                                   PREDICTION_MODE mode, int angle_delta,
+                                   FILTER_INTRA_MODE filter_intra_mode,
+                                   TX_SIZE tx_size, int disable_edge_filter,
                                    int n_top_px, int n_topright_px,
                                    int n_left_px, int n_bottomleft_px,
                                    int plane) {
   int i;
   const uint8_t *above_ref = ref - ref_stride;
+  const uint8_t *left_ref = ref - 1;
   DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 32]);
   DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 32]);
   uint8_t *const above_row = above_data + 16;
   uint8_t *const left_col = left_data + 16;
   const int txwpx = tx_size_wide[tx_size];
   const int txhpx = tx_size_high[tx_size];
-#if !INTRA_USES_RECT_TRANSFORMS
-  assert(txwpx == txhpx);
-#endif  // !INTRA_USES_RECT_TRANSFORMS
   int need_left = extend_modes[mode] & NEED_LEFT;
   int need_above = extend_modes[mode] & NEED_ABOVE;
   int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
-#if CONFIG_EXT_INTRA
   int p_angle = 0;
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const int is_dr_mode = av1_is_directional_mode(mode, mbmi->sb_type);
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  const FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
-      &xd->mi[0]->mbmi.filter_intra_mode_info;
-  const FILTER_INTRA_MODE filter_intra_mode =
-      filter_intra_mode_info->filter_intra_mode[plane != 0];
-#endif  // CONFIG_FILTER_INTRA
+  const int is_dr_mode = av1_is_directional_mode(mode);
+  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
 
+  // The default values if ref pixels are not available:
   // 127 127 127 .. 127 127 127 127 127 127
   // 129  A   B  ..  Y   Z
   // 129  C   D  ..  W   X
   // 129  E   F  ..  U   V
   // 129  G   H  ..  S   T   T   T   T   T
   // ..
-  memset(left_data, 129, sizeof(left_data));
 
-#if CONFIG_EXT_INTRA
   if (is_dr_mode) {
-    p_angle = mode_to_angle_map[mode] +
-              xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
+    p_angle = mode_to_angle_map[mode] + angle_delta;
     if (p_angle <= 90)
       need_above = 1, need_left = 0, need_above_left = 1;
     else if (p_angle < 180)
@@ -2620,30 +1351,20 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
     else
       need_above = 0, need_left = 1, need_above_left = 1;
   }
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
-    need_left = need_above = need_above_left = 1;
-#endif  // CONFIG_FILTER_INTRA
-
-  (void)xd;
-  (void)plane;
+  if (use_filter_intra) need_left = need_above = need_above_left = 1;
+
   assert(n_top_px >= 0);
   assert(n_topright_px >= 0);
   assert(n_left_px >= 0);
   assert(n_bottomleft_px >= 0);
 
   if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
-#if CONFIG_INTRA_EDGE
     int val;
     if (need_left) {
       val = (n_top_px > 0) ? above_ref[0] : 129;
     } else {
-      val = (n_left_px > 0) ? ref[-1] : 127;
+      val = (n_left_px > 0) ? left_ref[0] : 127;
     }
-#else
-    const int val = need_left ? 129 : 127;
-#endif  // CONFIG_INTRA_EDGE
     for (i = 0; i < txhpx; ++i) {
       memset(dst, val, txwpx);
       dst += dst_stride;
@@ -2653,56 +1374,34 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
 
   // NEED_LEFT
   if (need_left) {
-#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
     int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
-#if CONFIG_FILTER_INTRA
-    if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
-      need_bottom = 0;
-#endif  // CONFIG_FILTER_INTRA
-#if CONFIG_EXT_INTRA
+    if (use_filter_intra) need_bottom = 0;
     if (is_dr_mode) need_bottom = p_angle > 180;
-#endif  // CONFIG_EXT_INTRA
-#else
-    const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
-#endif  // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
     const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
     i = 0;
     if (n_left_px > 0) {
-      for (; i < n_left_px; i++) left_col[i] = ref[i * ref_stride - 1];
+      for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
       if (need_bottom && n_bottomleft_px > 0) {
         assert(i == txhpx);
         for (; i < txhpx + n_bottomleft_px; i++)
-          left_col[i] = ref[i * ref_stride - 1];
+          left_col[i] = left_ref[i * ref_stride];
       }
       if (i < num_left_pixels_needed)
         memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
     } else {
-#if CONFIG_INTRA_EDGE
       if (n_top_px > 0) {
         memset(left_col, above_ref[0], num_left_pixels_needed);
       } else {
-#endif  // CONFIG_INTRA_EDGE
         memset(left_col, 129, num_left_pixels_needed);
-#if CONFIG_INTRA_EDGE
       }
-#endif  // CONFIG_INTRA_EDGE
     }
   }
 
   // NEED_ABOVE
   if (need_above) {
-#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
     int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
-#if CONFIG_FILTER_INTRA
-    if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
-      need_right = 1;
-#endif  // CONFIG_FILTER_INTRA
-#if CONFIG_EXT_INTRA
+    if (use_filter_intra) need_right = 0;
     if (is_dr_mode) need_right = p_angle < 90;
-#endif  // CONFIG_EXT_INTRA
-#else
-    const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
-#endif  // CONFIG_EXT_INTRA || CONFIG_FITLER_INTRA
     const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
     if (n_top_px > 0) {
       memcpy(above_row, above_ref, n_top_px);
@@ -2715,91 +1414,75 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
       if (i < num_top_pixels_needed)
         memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
     } else {
-#if CONFIG_INTRA_EDGE
       if (n_left_px > 0) {
-        memset(above_row, ref[-1], num_top_pixels_needed);
+        memset(above_row, left_ref[0], num_top_pixels_needed);
       } else {
-#endif  // CONFIG_INTRA_EDGE
         memset(above_row, 127, num_top_pixels_needed);
-#if CONFIG_INTRA_EDGE
       }
-#endif  // CONFIG_INTRA_EDGE
     }
   }
 
   if (need_above_left) {
-#if CONFIG_INTRA_EDGE
     if (n_top_px > 0 && n_left_px > 0) {
       above_row[-1] = above_ref[-1];
     } else if (n_top_px > 0) {
       above_row[-1] = above_ref[0];
     } else if (n_left_px > 0) {
-      above_row[-1] = ref[-1];
+      above_row[-1] = left_ref[0];
     } else {
       above_row[-1] = 128;
     }
-#else
-    above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
-#endif  // CONFIG_INTRA_EDGE
     left_col[-1] = above_row[-1];
   }
 
-#if CONFIG_FILTER_INTRA
-  if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) {
-    filter_intra_predictors(filter_intra_mode, dst, dst_stride, tx_size,
-                            above_row, left_col);
+  if (use_filter_intra) {
+    av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+                               filter_intra_mode);
     return;
   }
-#endif  // CONFIG_FILTER_INTRA
 
-#if CONFIG_EXT_INTRA
   if (is_dr_mode) {
-#if CONFIG_INTRA_INTERP
-    INTRA_FILTER filter = INTRA_FILTER_LINEAR;
-    if (plane == 0 && av1_is_intra_filter_switchable(p_angle))
-      filter = xd->mi[0]->mbmi.intra_filter;
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE
-    const int need_right = p_angle < 90;
-    const int need_bottom = p_angle > 180;
-    if (p_angle != 90 && p_angle != 180) {
-      const int ab_le = need_above_left ? 1 : 0;
-      if (need_above && n_top_px > 0) {
-        const int strength = intra_edge_filter_strength(txwpx, p_angle - 90);
-        const int n_px = n_top_px + ab_le + (need_right ? n_topright_px : 0);
-        av1_filter_intra_edge(above_row - ab_le, n_px, strength);
+    int upsample_above = 0;
+    int upsample_left = 0;
+    if (!disable_edge_filter) {
+      const int need_right = p_angle < 90;
+      const int need_bottom = p_angle > 180;
+      const int filt_type = get_filt_type(xd, plane);
+      if (p_angle != 90 && p_angle != 180) {
+        const int ab_le = need_above_left ? 1 : 0;
+        if (need_above && need_left && (txwpx + txhpx >= 24)) {
+          filter_intra_edge_corner(above_row, left_col);
+        }
+        if (need_above && n_top_px > 0) {
+          const int strength =
+              intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+          const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+          av1_filter_intra_edge(above_row - ab_le, n_px, strength);
+        }
+        if (need_left && n_left_px > 0) {
+          const int strength = intra_edge_filter_strength(
+              txhpx, txwpx, p_angle - 180, filt_type);
+          const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+          av1_filter_intra_edge(left_col - ab_le, n_px, strength);
+        }
       }
-      if (need_left && n_left_px > 0) {
-        const int strength = intra_edge_filter_strength(txhpx, p_angle - 180);
-        const int n_px =
-            n_left_px + ab_le + (need_bottom ? n_bottomleft_px : 0);
-        av1_filter_intra_edge(left_col - ab_le, n_px, strength);
+      upsample_above =
+          use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+      if (need_above && upsample_above) {
+        const int n_px = txwpx + (need_right ? txhpx : 0);
+        av1_upsample_intra_edge(above_row, n_px);
+      }
+      upsample_left =
+          use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+      if (need_left && upsample_left) {
+        const int n_px = txhpx + (need_bottom ? txwpx : 0);
+        av1_upsample_intra_edge(left_col, n_px);
       }
     }
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-    const int upsample_above = use_intra_edge_upsample(txwpx, p_angle - 90);
-    if (need_above && upsample_above) {
-      const int n_px = txwpx + (need_right ? txhpx : 0);
-      av1_upsample_intra_edge(above_row, n_px);
-    }
-    const int upsample_left = use_intra_edge_upsample(txhpx, p_angle - 180);
-    if (need_left && upsample_left) {
-      const int n_px = txhpx + (need_bottom ? txwpx : 0);
-      av1_upsample_intra_edge(left_col, n_px);
-    }
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-#endif  // CONFIG_INTRA_EDGE
-    dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
-#if CONFIG_INTRA_INTERP
-                 filter,
-#endif  // CONFIG_INTRA_INTERP
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-                 upsample_above, upsample_left,
-#endif  // CONFIG_INTRA_EDGE_UPSAMPLE
-                 p_angle);
+    dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above,
+                 upsample_left, p_angle);
     return;
   }
-#endif  // CONFIG_EXT_INTRA
 
   // predict
   if (mode == DC_PRED) {
@@ -2810,41 +1493,54 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
   }
 }
 
-static void predict_intra_block_helper(const AV1_COMMON *cm,
-                                       const MACROBLOCKD *xd, int wpx, int hpx,
-                                       TX_SIZE tx_size, PREDICTION_MODE mode,
-                                       const uint8_t *ref, int ref_stride,
-                                       uint8_t *dst, int dst_stride,
-                                       int col_off, int row_off, int plane) {
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+void av1_predict_intra_block(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
+    TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
+    FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride,
+    uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int txwpx = tx_size_wide[tx_size];
+  const int txhpx = tx_size_high[tx_size];
+  const int x = col_off << tx_size_wide_log2[0];
+  const int y = row_off << tx_size_high_log2[0];
+
+  if (use_palette) {
+    int r, c;
+    const uint8_t *const map = xd->plane[plane != 0].color_index_map +
+                               xd->color_index_map_offset[plane != 0];
+    const uint16_t *const palette =
+        mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE;
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+      for (r = 0; r < txhpx; ++r) {
+        for (c = 0; c < txwpx; ++c) {
+          dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]];
+        }
+      }
+    } else {
+      for (r = 0; r < txhpx; ++r) {
+        for (c = 0; c < txwpx; ++c) {
+          dst[r * dst_stride + c] =
+              (uint8_t)palette[map[(r + y) * wpx + c + x]];
+        }
+      }
+    }
+    return;
+  }
+
+  BLOCK_SIZE bsize = mbmi->sb_type;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int txw = tx_size_wide_unit[tx_size];
-#if CONFIG_CB4X4 && CONFIG_CHROMA_SUB8X8
+  const int txh = tx_size_high_unit[tx_size];
   const int have_top = row_off || (pd->subsampling_y ? xd->chroma_up_available
                                                      : xd->up_available);
   const int have_left =
       col_off ||
       (pd->subsampling_x ? xd->chroma_left_available : xd->left_available);
-#else
-  const int have_top = row_off || xd->up_available;
-  const int have_left = col_off || xd->left_available;
-#endif
-  const int x = col_off << tx_size_wide_log2[0];
-  const int y = row_off << tx_size_high_log2[0];
   const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
   const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
-  const int txwpx = tx_size_wide[tx_size];
-  const int txhpx = tx_size_high[tx_size];
-#if !INTRA_USES_RECT_TRANSFORMS
-  assert(txwpx == txhpx);
-#endif  // !INTRA_USES_RECT_TRANSFORMS
-#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 && !CONFIG_CHROMA_SUB8X8
-  const int xr_chr_offset = (pd->subsampling_x && bsize < BLOCK_8X8) ? 2 : 0;
-  const int yd_chr_offset = (pd->subsampling_y && bsize < BLOCK_8X8) ? 2 : 0;
-#else
   const int xr_chr_offset = 0;
   const int yd_chr_offset = 0;
-#endif
 
   // Distance between the right edge of this prediction block to
   // the frame right edge
@@ -2854,69 +1550,39 @@ static void predict_intra_block_helper(const AV1_COMMON *cm,
   // the frame bottom edge
   const int yd = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) +
                  (hpx - y - txhpx) - yd_chr_offset;
-  const int right_available = mi_col + ((col_off + txw) << pd->subsampling_x >>
-                                        (MI_SIZE_LOG2 - tx_size_wide_log2[0])) <
-                              xd->tile.mi_col_end;
-  const int bottom_available = (yd > 0);
-#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB
-  const PARTITION_TYPE partition = xd->mi[0]->mbmi.partition;
-#endif
+  const int right_available =
+      mi_col + ((col_off + txw) << pd->subsampling_x) < xd->tile.mi_col_end;
+  const int bottom_available =
+      (yd > 0) &&
+      (mi_row + ((row_off + txh) << pd->subsampling_y) < xd->tile.mi_row_end);
+
+  const PARTITION_TYPE partition = mbmi->partition;
 
-#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
   // force 4x4 chroma component block size.
   bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
-#endif
 
-  const int have_top_right =
-      has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available,
-#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB
-                    partition,
-#endif  // CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB
-                    tx_size, row_off, col_off, pd->subsampling_x);
-  const int have_bottom_left =
-      has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left,
-                      tx_size, row_off, col_off, pd->subsampling_y);
-  if (xd->mi[0]->mbmi.palette_mode_info.palette_size[plane != 0] > 0) {
-    const int stride = wpx;
-    int r, c;
-    const uint8_t *const map = xd->plane[plane != 0].color_index_map;
-    uint16_t *palette = xd->mi[0]->mbmi.palette_mode_info.palette_colors +
-                        plane * PALETTE_MAX_SIZE;
+  const int have_top_right = has_top_right(
+      cm, bsize, mi_row, mi_col, have_top, right_available, partition, tx_size,
+      row_off, col_off, pd->subsampling_x, pd->subsampling_y);
+  const int have_bottom_left = has_bottom_left(
+      cm, bsize, mi_row, mi_col, bottom_available, have_left, partition,
+      tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y);
 
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-      for (r = 0; r < txhpx; ++r) {
-        for (c = 0; c < txwpx; ++c) {
-          dst16[r * dst_stride + c] = palette[map[(r + y) * stride + c + x]];
-        }
-      }
-    } else {
-#endif  // CONFIG_HIGHBITDEPTH
-      for (r = 0; r < txhpx; ++r) {
-        for (c = 0; c < txwpx; ++c) {
-          dst[r * dst_stride + c] =
-              (uint8_t)palette[map[(r + y) * stride + c + x]];
-        }
-      }
-#if CONFIG_HIGHBITDEPTH
-    }
-#endif  // CONFIG_HIGHBITDEPTH
-    return;
-  }
-
-#if CONFIG_HIGHBITDEPTH
+  const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     build_intra_predictors_high(
-        xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
+        xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
+        filter_intra_mode, tx_size, disable_edge_filter,
         have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
         have_top_right ? AOMMIN(txwpx, xr) : 0,
         have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
         have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
     return;
   }
-#endif
-  build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
+
+  build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode,
+                         angle_delta, filter_intra_mode, tx_size,
+                         disable_edge_filter,
                          have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
                          have_top_right ? AOMMIN(txwpx, xr) : 0,
                          have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
@@ -2924,278 +1590,56 @@ static void predict_intra_block_helper(const AV1_COMMON *cm,
 }
 
 void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int plane, int block_idx, int blk_col,
-                                    int blk_row, TX_SIZE tx_size) {
-  const MODE_INFO *mi = xd->mi[0];
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+                                    int plane, int blk_col, int blk_row,
+                                    TX_SIZE tx_size) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int dst_stride = pd->dst.stride;
   uint8_t *dst =
       &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-  const int block_raster_idx =
-      av1_block_index_to_raster_order(tx_size, block_idx);
-  const PREDICTION_MODE mode = (plane == AOM_PLANE_Y)
-                                   ? get_y_mode(mi, block_raster_idx)
-                                   : get_uv_mode(mbmi->uv_mode);
-#if CONFIG_CFL
+  const PREDICTION_MODE mode =
+      (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
+  const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0;
+  const FILTER_INTRA_MODE filter_intra_mode =
+      (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra)
+          ? mbmi->filter_intra_mode_info.filter_intra_mode
+          : FILTER_INTRA_MODES;
+  const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP;
+
   if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {
-    if (plane == AOM_PLANE_U && blk_col == 0 && blk_row == 0) {
-      // Avoid computing the CfL parameters twice, if they have already been
-      // computed in cfl_rd_pick_alpha.
-      if (!xd->cfl->are_parameters_computed)
-        cfl_compute_parameters(xd, tx_size);
+#if CONFIG_DEBUG
+    assert(is_cfl_allowed(xd));
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(
+        mbmi->sb_type, pd->subsampling_x, pd->subsampling_y);
+    (void)plane_bsize;
+    assert(plane_bsize < BLOCK_SIZES_ALL);
+    if (!xd->lossless[mbmi->segment_id]) {
+      assert(blk_col == 0);
+      assert(blk_row == 0);
+      assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
+      assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
     }
-    cfl_predict_block(xd, dst, dst_stride, blk_row, blk_col, tx_size, plane);
-    return;
-  }
 #endif
-
-  av1_predict_intra_block(cm, xd, pd->width, pd->height,
-                          txsize_to_bsize[tx_size], mode, dst, dst_stride, dst,
-                          dst_stride, blk_col, blk_row, plane);
-}
-
-#if INTRA_USES_EXT_TRANSFORMS
-// Copy the given row of dst into the equivalent row of ref, saving
-// the overwritten data to tmp. Returns zero if no copy happened (so
-// no restore is needed)
-//
-// Note that ref_row and dst_row follow the usual hibd convention
-// where you convert to a uint16_t* with CONVERT_TO_SHORTPTR(). tmp
-// does not follow that convention: it's a genuine pointer which is
-// correctly aligned and sized for either 8 or 16 bit data.
-//
-// matching_strides is a boolean flag which should be nonzero if ref
-// and dst have the same stride.
-static int overwrite_ref_row(int matching_strides, int buf_flags,
-                             int block_width, const uint8_t *dst_row,
-                             uint8_t *ref_row, uint8_t *tmp_row) {
-  if (ref_row == dst_row && matching_strides) return 0;
-
-  int row_bytes = block_width;
-
-#if CONFIG_HIGHBITDEPTH
-  if (buf_flags & YV12_FLAG_HIGHBITDEPTH) {
-    row_bytes *= 2;
-    ref_row = (uint8_t *)CONVERT_TO_SHORTPTR(ref_row);
-    dst_row = (const uint8_t *)CONVERT_TO_SHORTPTR(dst_row);
-  }
-#else
-  (void)buf_flags;
-#endif  // CONFIG_HIGHBITDEPTH
-
-  memcpy(tmp_row, ref_row, row_bytes);
-  memcpy(ref_row, dst_row, row_bytes);
-  return 1;
-}
-
-static void restore_ref_row(int buf_flags, int block_width,
-                            const uint8_t *tmp_row, uint8_t *ref_row) {
-  int row_bytes = block_width;
-#if CONFIG_HIGHBITDEPTH
-  if (buf_flags & YV12_FLAG_HIGHBITDEPTH) {
-    row_bytes *= 2;
-    ref_row = (uint8_t *)CONVERT_TO_SHORTPTR(ref_row);
-  }
-#else
-  (void)buf_flags;
-#endif  // CONFIG_HIGHBITDEPTH
-
-  memcpy(ref_row, tmp_row, row_bytes);
-}
-
-// The column equivalent of overwrite_ref_row. ref_row and dst_row
-// point at the relevant column of the first row of the block.
-static int overwrite_ref_col(int buf_flags, int block_height,
-                             const uint8_t *dst_row, int dst_stride,
-                             uint8_t *ref_row, int ref_stride,
-                             uint8_t *tmp_row) {
-  if (ref_row == dst_row && ref_stride == dst_stride) return 0;
-
-#if CONFIG_HIGHBITDEPTH
-  if (buf_flags & YV12_FLAG_HIGHBITDEPTH) {
-    uint16_t *tmp_16 = (uint16_t *)tmp_row;
-    uint16_t *ref_16 = CONVERT_TO_SHORTPTR(ref_row);
-    const uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst_row);
-
-    for (int i = 0; i < block_height; ++i) {
-      tmp_16[i] = ref_16[i * ref_stride];
-      ref_16[i * ref_stride] = dst_16[i * dst_stride];
-    }
-  } else {
-#endif  // CONFIG_HIGHBITDEPTH
-    for (int i = 0; i < block_height; ++i) {
-      tmp_row[i] = ref_row[i * ref_stride];
-      ref_row[i * ref_stride] = dst_row[i * dst_stride];
-    }
-#if CONFIG_HIGHBITDEPTH
-  }
-#else
-  (void)buf_flags;
-#endif  // CONFIG_HIGHBITDEPTH
-  return 1;
-}
-
-static void restore_ref_col(int buf_flags, int block_height,
-                            const uint8_t *tmp_row, uint8_t *ref_row,
-                            int ref_stride) {
-#if CONFIG_HIGHBITDEPTH
-  if (buf_flags & YV12_FLAG_HIGHBITDEPTH) {
-    const uint16_t *tmp_16 = (const uint16_t *)tmp_row;
-    uint16_t *ref_16 = CONVERT_TO_SHORTPTR(ref_row);
-
-    for (int i = 0; i < block_height; ++i) {
-      ref_16[i * ref_stride] = tmp_16[i];
-    }
-  } else {
-#endif  // CONFIG_HIGHBITDEPTH
-    for (int i = 0; i < block_height; ++i) {
-      ref_row[i * ref_stride] = tmp_row[i];
+    CFL_CTX *const cfl = &xd->cfl;
+    CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane);
+    if (cfl->dc_pred_is_cached[pred_plane] == 0) {
+      av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode,
+                              angle_delta, use_palette, filter_intra_mode, dst,
+                              dst_stride, dst, dst_stride, blk_col, blk_row,
+                              plane);
+      if (cfl->use_dc_pred_cache) {
+        cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]);
+        cfl->dc_pred_is_cached[pred_plane] = 1;
+      }
+    } else {
+      cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane);
     }
-#if CONFIG_HIGHBITDEPTH
-  }
-#else
-  (void)buf_flags;
-#endif  // CONFIG_HIGHBITDEPTH
-}
-#endif  // #if INTRA_USES_EXT_TRANSFORMS
-
-void av1_predict_intra_block(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                             int wpx, int hpx, BLOCK_SIZE bsize,
-                             PREDICTION_MODE mode, const uint8_t *ref,
-                             int ref_stride, uint8_t *dst, int dst_stride,
-                             int col_off, int row_off, int plane) {
-  const int block_width = block_size_wide[bsize];
-  const int block_height = block_size_high[bsize];
-#if INTRA_USES_RECT_TRANSFORMS
-  const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
-  assert(tx_size < TX_SIZES_ALL);
-#else
-  const TX_SIZE tx_size = max_txsize_lookup[bsize];
-  assert(tx_size < TX_SIZES);
-#endif  // INTRA_USES_RECT_TRANSFORMS
-
-  // Start by running the helper to predict either the entire block
-  // (if the block is square or the same size as tx_size) or the top
-  // or left of the block if it's tall and thin or short and wide.
-  predict_intra_block_helper(cm, xd, wpx, hpx, tx_size, mode, ref, ref_stride,
-                             dst, dst_stride, col_off, row_off, plane);
-
-// If we're not using extended transforms, this function should
-// always be called with a square block.
-#if !INTRA_USES_EXT_TRANSFORMS
-  assert(block_width == block_height);
-#endif  // !INTRA_USES_EXT_TRANSFORMS
-
-  // If the block is square, we're done.
-  if (block_width == block_height) return;
-
-#if INTRA_USES_EXT_TRANSFORMS
-// If we're using rectangular transforms, we might be done even
-// though the block isn't square.
-#if INTRA_USES_RECT_TRANSFORMS
-  if (block_width == tx_size_wide[tx_size] &&
-      block_height == tx_size_high[tx_size])
+    cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
     return;
-
-  // A block should only fail to have a matching transform if it's
-  // large and rectangular (such large transform sizes aren't
-  // available).
-  assert(block_width >= 32 && block_height >= 32);
-#endif  // INTRA_USES_RECT_TRANSFORMS
-
-  assert((block_width == wpx && block_height == hpx) ||
-         (block_width == (wpx >> 1) && block_height == hpx) ||
-         (block_width == wpx && block_height == (hpx >> 1)));
-
-// The tmp buffer needs to be big enough to hold MAX_SB_SIZE samples
-// from the image. If CONFIG_HIGHBITDEPTH is enabled, it also needs
-// to be big enough and correctly aligned to hold 16-bit entries.
-#if CONFIG_HIGHBITDEPTH
-  uint16_t tmp_buf[MAX_SB_SIZE];
-#else
-  uint8_t tmp_buf[MAX_SB_SIZE];
-#endif  // CONFIG_HIGHBITDEPTH
-  uint8_t *tmp = (uint8_t *)tmp_buf;
-
-  if (block_width < block_height) {
-    // The block is tall and thin. We've already done the top part,
-    // and need to repeat the prediction down the rest of the block.
-
-    const int tx_height = tx_size_high[tx_size];
-    const int tx_height_off = tx_height >> tx_size_wide_log2[0];
-    assert(tx_height_off << tx_size_wide_log2[0] == tx_height);
-
-    int next_row_off = row_off + tx_height_off;
-    int next_row_idx = tx_height;
-
-    while (next_row_idx < block_height) {
-      const int last_row_idx = next_row_idx - 1;
-
-      // Cast away the const to make a mutable pointer to the last
-      // row of ref. This will be snapshotted and restored later.
-      uint8_t *last_ref_row = (uint8_t *)ref + last_row_idx * ref_stride;
-      uint8_t *last_dst_row = dst + last_row_idx * dst_stride;
-
-      const int needs_restore =
-          overwrite_ref_row(ref_stride == dst_stride, xd->cur_buf->flags,
-                            block_width, last_dst_row, last_ref_row, tmp);
-
-      const uint8_t *next_ref_row = ref + next_row_idx * ref_stride;
-      uint8_t *next_dst_row = dst + next_row_idx * dst_stride;
-
-      predict_intra_block_helper(cm, xd, wpx, hpx, tx_size, mode, next_ref_row,
-                                 ref_stride, next_dst_row, dst_stride, col_off,
-                                 next_row_off, plane);
-
-      if (needs_restore)
-        restore_ref_row(xd->cur_buf->flags, block_width, tmp, last_ref_row);
-
-      next_row_idx += tx_height;
-      next_row_off += tx_height_off;
-    }
-  } else {
-    // The block is short and wide. We've already done the left part,
-    // and need to repeat the prediction to the right.
-
-    const int tx_width = tx_size_wide[tx_size];
-    const int tx_width_off = tx_width >> tx_size_wide_log2[0];
-    assert(tx_width_off << tx_size_wide_log2[0] == tx_width);
-
-    int next_col_off = col_off + tx_width_off;
-    int next_col_idx = tx_width;
-
-    while (next_col_idx < block_width) {
-      const int last_col_idx = next_col_idx - 1;
-
-      // Cast away the const to make a mutable pointer to ref,
-      // starting at the last column written. This will be
-      // snapshotted and restored later.
-      uint8_t *last_ref_col = (uint8_t *)ref + last_col_idx;
-      uint8_t *last_dst_col = dst + last_col_idx;
-
-      const int needs_restore =
-          overwrite_ref_col(xd->cur_buf->flags, block_height, last_dst_col,
-                            dst_stride, last_ref_col, ref_stride, tmp);
-
-      const uint8_t *next_ref_col = ref + next_col_idx;
-      uint8_t *next_dst_col = dst + next_col_idx;
-
-      predict_intra_block_helper(cm, xd, wpx, hpx, tx_size, mode, next_ref_col,
-                                 ref_stride, next_dst_col, dst_stride,
-                                 next_col_off, row_off, plane);
-
-      if (needs_restore)
-        restore_ref_col(xd->cur_buf->flags, block_height, tmp, last_ref_col,
-                        ref_stride);
-
-      next_col_idx += tx_width;
-      next_col_off += tx_width_off;
-    }
   }
-#endif  // INTRA_USES_EXT_TRANSFORMS
+  av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode,
+                          angle_delta, use_palette, filter_intra_mode, dst,
+                          dst_stride, dst, dst_stride, blk_col, blk_row, plane);
 }
 
-void av1_init_intra_predictors(void) {
-  once(av1_init_intra_predictors_internal);
-}
+void av1_init_intra_predictors(void) { once(init_intra_predictors_internal); }
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
index 42797e310..a7d9e8b79 100644
--- a/third_party/aom/av1/common/reconintra.h
+++ b/third_party/aom/av1/common/reconintra.h
@@ -22,15 +22,16 @@ extern "C" {
 
 void av1_init_intra_predictors(void);
 void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int plane, int block_idx, int blk_col,
-                                    int blk_row, TX_SIZE tx_size);
+                                    int plane, int blk_col, int blk_row,
+                                    TX_SIZE tx_size);
 void av1_predict_intra_block(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                             int bw, int bh, BLOCK_SIZE bsize,
-                             PREDICTION_MODE mode, const uint8_t *ref,
-                             int ref_stride, uint8_t *dst, int dst_stride,
-                             int aoff, int loff, int plane);
+                             int bw, int bh, TX_SIZE tx_size,
+                             PREDICTION_MODE mode, int angle_delta,
+                             int use_palette,
+                             FILTER_INTRA_MODE filter_intra_mode,
+                             const uint8_t *ref, int ref_stride, uint8_t *dst,
+                             int dst_stride, int aoff, int loff, int plane);
 
-#if CONFIG_INTERINTRA
 // Mapping of interintra to intra mode for use in the intra component
 static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
   DC_PRED, V_PRED, H_PRED, SMOOTH_PRED
@@ -41,44 +42,67 @@ static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = {
   II_DC_PRED, II_V_PRED, II_H_PRED, II_V_PRED,      II_SMOOTH_PRED, II_V_PRED,
   II_H_PRED,  II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_SMOOTH_PRED
 };
-#endif  // CONFIG_INTERINTRA
-
-#if CONFIG_FILTER_INTRA
-#define FILTER_INTRA_PREC_BITS 10
-#endif  // CONFIG_FILTER_INTRA
-
-#define CONFIG_INTRA_EDGE_UPSAMPLE CONFIG_INTRA_EDGE
-#define CONFIG_USE_ANGLE_DELTA_SUB8X8 0
-
-#if CONFIG_EXT_INTRA
-static INLINE int av1_is_directional_mode(PREDICTION_MODE mode,
-                                          BLOCK_SIZE bsize) {
-#if CONFIG_INTRA_EDGE_UPSAMPLE
-  (void)bsize;
-  return mode >= V_PRED && mode <= D63_PRED;
-#else
-  return mode >= V_PRED && mode <= D63_PRED && bsize >= BLOCK_8X8;
-#endif
+
+#define FILTER_INTRA_SCALE_BITS 4
+
+static INLINE int av1_is_directional_mode(PREDICTION_MODE mode) {
+  return mode >= V_PRED && mode <= D67_PRED;
 }
 
 static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) {
-  (void)bsize;
-#if CONFIG_USE_ANGLE_DELTA_SUB8X8
-  return 1;
-#else
   return bsize >= BLOCK_8X8;
-#endif
 }
-#endif  // CONFIG_EXT_INTRA
 
-#if CONFIG_INTRABC
-static INLINE int av1_allow_intrabc(BLOCK_SIZE bsize,
-                                    const AV1_COMMON *const cm) {
-  return (bsize >= BLOCK_8X8 || bsize == BLOCK_4X4) &&
-         cm->allow_screen_content_tools;
+static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) {
+  return frame_is_intra_only(cm) && cm->allow_screen_content_tools &&
+         cm->allow_intrabc;
+}
+
+static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm,
+                                                 BLOCK_SIZE bs) {
+  if (!cm->seq_params.enable_filter_intra || bs == BLOCK_INVALID) return 0;
+
+  return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32;
 }
-#endif  // CONFIG_INTRABC
 
+static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm,
+                                           const MB_MODE_INFO *mbmi) {
+  return mbmi->mode == DC_PRED &&
+         mbmi->palette_mode_info.palette_size[0] == 0 &&
+         av1_filter_intra_allowed_bsize(cm, mbmi->sb_type);
+}
+
+extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
+
+// Get the shift (up-scaled by 256) in X w.r.t a unit change in Y.
+// If angle > 0 && angle < 90, dx = -((int)(256 / t));
+// If angle > 90 && angle < 180, dx = (int)(256 / t);
+// If angle > 180 && angle < 270, dx = 1;
+static INLINE int av1_get_dx(int angle) {
+  if (angle > 0 && angle < 90) {
+    return dr_intra_derivative[angle];
+  } else if (angle > 90 && angle < 180) {
+    return dr_intra_derivative[180 - angle];
+  } else {
+    // In this case, we are not really going to use dx. We may return any value.
+    return 1;
+  }
+}
+
+// Get the shift (up-scaled by 256) in Y w.r.t a unit change in X.
+// If angle > 0 && angle < 90, dy = 1;
+// If angle > 90 && angle < 180, dy = (int)(256 * t);
+// If angle > 180 && angle < 270, dy = -((int)(256 * t));
+static INLINE int av1_get_dy(int angle) {
+  if (angle > 90 && angle < 180) {
+    return dr_intra_derivative[angle - 90];
+  } else if (angle > 180 && angle < 270) {
+    return dr_intra_derivative[270 - angle];
+  } else {
+    // In this case, we are not really going to use dy. We may return any value.
+    return 1;
+  }
+}
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
index b0f303e35..17e6823b1 100644
--- a/third_party/aom/av1/common/resize.c
+++ b/third_party/aom/av1/common/resize.c
@@ -16,30 +16,18 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./aom_config.h"
-#if CONFIG_HIGHBITDEPTH
+#include "config/aom_config.h"
+
 #include "aom_dsp/aom_dsp_common.h"
-#endif  // CONFIG_HIGHBITDEPTH
 #include "aom_ports/mem.h"
 #include "aom_scale/aom_scale.h"
 #include "av1/common/common.h"
 #include "av1/common/resize.h"
 
-#include "./aom_scale_rtcd.h"
-
-#define FILTER_BITS 7
-
-#define INTERP_TAPS 8
-#define SUBPEL_BITS_RS 6
-#define SUBPEL_MASK_RS ((1 << SUBPEL_BITS_RS) - 1)
-#define INTERP_PRECISION_BITS 16
-#define SUBPEL_INTERP_EXTRA_BITS (INTERP_PRECISION_BITS - SUBPEL_BITS_RS)
-#define SUBPEL_INTERP_EXTRA_OFF (1 << (SUBPEL_INTERP_EXTRA_BITS - 1))
-
-typedef int16_t interp_kernel[INTERP_TAPS];
+#include "config/aom_scale_rtcd.h"
 
 // Filters for interpolation (0.5-band) - note this also filters integer pels.
-static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS_RS)] = {
+static const InterpKernel filteredinterp_filters500[(1 << RS_SUBPEL_BITS)] = {
   { -3, 0, 35, 64, 35, 0, -3, 0 },    { -3, 0, 34, 64, 36, 0, -3, 0 },
   { -3, -1, 34, 64, 36, 1, -3, 0 },   { -3, -1, 33, 64, 37, 1, -3, 0 },
   { -3, -1, 32, 64, 38, 1, -3, 0 },   { -3, -1, 31, 64, 39, 1, -3, 0 },
@@ -75,7 +63,7 @@ static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS_RS)] = {
 };
 
 // Filters for interpolation (0.625-band) - note this also filters integer pels.
-static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS_RS)] = {
+static const InterpKernel filteredinterp_filters625[(1 << RS_SUBPEL_BITS)] = {
   { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 31, 80, 34, -8, -1, 1 },
   { -1, -8, 30, 80, 35, -8, -1, 1 }, { -1, -8, 29, 80, 36, -7, -2, 1 },
   { -1, -8, 28, 80, 37, -7, -2, 1 }, { -1, -8, 27, 80, 38, -7, -2, 1 },
@@ -111,7 +99,7 @@ static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS_RS)] = {
 };
 
 // Filters for interpolation (0.75-band) - note this also filters integer pels.
-static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS_RS)] = {
+static const InterpKernel filteredinterp_filters750[(1 << RS_SUBPEL_BITS)] = {
   { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 24, 96, 26, -11, 2, 0 },
   { 2, -11, 22, 96, 28, -11, 2, 0 }, { 2, -10, 21, 96, 29, -12, 2, 0 },
   { 2, -10, 19, 96, 31, -12, 2, 0 }, { 2, -10, 18, 95, 32, -11, 2, 0 },
@@ -147,7 +135,7 @@ static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS_RS)] = {
 };
 
 // Filters for interpolation (0.875-band) - note this also filters integer pels.
-static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS_RS)] = {
+static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = {
   { 3, -8, 13, 112, 13, -8, 3, 0 },   { 2, -7, 12, 112, 15, -8, 3, -1 },
   { 3, -7, 10, 112, 17, -9, 3, -1 },  { 2, -6, 8, 112, 19, -9, 3, -1 },
   { 2, -6, 7, 112, 21, -10, 3, -1 },  { 2, -5, 6, 111, 22, -10, 3, -1 },
@@ -183,7 +171,7 @@ static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS_RS)] = {
 };
 
 // Filters for interpolation (full-band) - no filtering for integer pixels
-static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS_RS)] = {
+static const InterpKernel filteredinterp_filters1000[(1 << RS_SUBPEL_BITS)] = {
   { 0, 0, 0, 128, 0, 0, 0, 0 },        { 0, 0, -1, 128, 2, -1, 0, 0 },
   { 0, 1, -3, 127, 4, -2, 1, 0 },      { 0, 1, -4, 127, 6, -3, 1, 0 },
   { 0, 2, -6, 126, 8, -3, 1, 0 },      { 0, 2, -7, 125, 11, -4, 1, 0 },
@@ -218,153 +206,116 @@ static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS_RS)] = {
   { 0, 1, -2, 4, 127, -3, 1, 0 },      { 0, 0, -1, 2, 128, -1, 0, 0 },
 };
 
-#if CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION
-#define INTERP_SIMPLE_TAPS 4
-static const int16_t filter_simple[(1
-                                    << SUBPEL_BITS_RS)][INTERP_SIMPLE_TAPS] = {
-#if INTERP_SIMPLE_TAPS == 2
-  { 128, 0 },  { 126, 2 },  { 124, 4 },  { 122, 6 },  { 120, 8 },  { 118, 10 },
-  { 116, 12 }, { 114, 14 }, { 112, 16 }, { 110, 18 }, { 108, 20 }, { 106, 22 },
-  { 104, 24 }, { 102, 26 }, { 100, 28 }, { 98, 30 },  { 96, 32 },  { 94, 34 },
-  { 92, 36 },  { 90, 38 },  { 88, 40 },  { 86, 42 },  { 84, 44 },  { 82, 46 },
-  { 80, 48 },  { 78, 50 },  { 76, 52 },  { 74, 54 },  { 72, 56 },  { 70, 58 },
-  { 68, 60 },  { 66, 62 },  { 64, 64 },  { 62, 66 },  { 60, 68 },  { 58, 70 },
-  { 56, 72 },  { 54, 74 },  { 52, 76 },  { 50, 78 },  { 48, 80 },  { 46, 82 },
-  { 44, 84 },  { 42, 86 },  { 40, 88 },  { 38, 90 },  { 36, 92 },  { 34, 94 },
-  { 32, 96 },  { 30, 98 },  { 28, 100 }, { 26, 102 }, { 24, 104 }, { 22, 106 },
-  { 20, 108 }, { 18, 110 }, { 16, 112 }, { 14, 114 }, { 12, 116 }, { 10, 118 },
-  { 8, 120 },  { 6, 122 },  { 4, 124 },  { 2, 126 },
-#elif INTERP_SIMPLE_TAPS == 4
-  { 0, 128, 0, 0 },      { -1, 128, 2, -1 },    { -2, 127, 4, -1 },
-  { -3, 126, 7, -2 },    { -4, 125, 9, -2 },    { -5, 125, 11, -3 },
-  { -6, 124, 13, -3 },   { -7, 123, 16, -4 },   { -7, 122, 18, -5 },
-  { -8, 121, 20, -5 },   { -9, 120, 23, -6 },   { -9, 118, 25, -6 },
-  { -10, 117, 28, -7 },  { -11, 116, 30, -7 },  { -11, 114, 33, -8 },
-  { -12, 113, 35, -8 },  { -12, 111, 38, -9 },  { -13, 109, 41, -9 },
-  { -13, 108, 43, -10 }, { -13, 106, 45, -10 }, { -13, 104, 48, -11 },
-  { -14, 102, 51, -11 }, { -14, 100, 53, -11 }, { -14, 98, 56, -12 },
-  { -14, 96, 58, -12 },  { -14, 94, 61, -13 },  { -15, 92, 64, -13 },
-  { -15, 90, 66, -13 },  { -15, 87, 69, -13 },  { -14, 85, 71, -14 },
-  { -14, 83, 73, -14 },  { -14, 80, 76, -14 },  { -14, 78, 78, -14 },
-  { -14, 76, 80, -14 },  { -14, 73, 83, -14 },  { -14, 71, 85, -14 },
-  { -13, 69, 87, -15 },  { -13, 66, 90, -15 },  { -13, 64, 92, -15 },
-  { -13, 61, 94, -14 },  { -12, 58, 96, -14 },  { -12, 56, 98, -14 },
-  { -11, 53, 100, -14 }, { -11, 51, 102, -14 }, { -11, 48, 104, -13 },
-  { -10, 45, 106, -13 }, { -10, 43, 108, -13 }, { -9, 41, 109, -13 },
-  { -9, 38, 111, -12 },  { -8, 35, 113, -12 },  { -8, 33, 114, -11 },
-  { -7, 30, 116, -11 },  { -7, 28, 117, -10 },  { -6, 25, 118, -9 },
-  { -6, 23, 120, -9 },   { -5, 20, 121, -8 },   { -5, 18, 122, -7 },
-  { -4, 16, 123, -7 },   { -3, 13, 124, -6 },   { -3, 11, 125, -5 },
-  { -2, 9, 125, -4 },    { -2, 7, 126, -3 },    { -1, 4, 127, -2 },
-  { -1, 2, 128, -1 },
-#elif INTERP_SIMPLE_TAPS == 6
-  { 0, 0, 128, 0, 0, 0 },      { 0, -1, 128, 2, -1, 0 },
-  { 1, -3, 127, 4, -2, 1 },    { 1, -4, 127, 6, -3, 1 },
-  { 2, -6, 126, 8, -3, 1 },    { 2, -7, 125, 11, -4, 1 },
-  { 2, -9, 125, 13, -5, 2 },   { 3, -10, 124, 15, -6, 2 },
-  { 3, -11, 123, 18, -7, 2 },  { 3, -12, 122, 20, -8, 3 },
-  { 4, -13, 121, 22, -9, 3 },  { 4, -14, 119, 25, -9, 3 },
-  { 4, -15, 118, 27, -10, 4 }, { 4, -16, 117, 30, -11, 4 },
-  { 5, -17, 116, 32, -12, 4 }, { 5, -17, 114, 35, -13, 4 },
-  { 5, -18, 112, 37, -13, 5 }, { 5, -19, 111, 40, -14, 5 },
-  { 6, -19, 109, 42, -15, 5 }, { 6, -20, 107, 45, -15, 5 },
-  { 6, -20, 105, 48, -16, 5 }, { 6, -21, 103, 51, -17, 6 },
-  { 6, -21, 101, 53, -17, 6 }, { 6, -21, 99, 56, -18, 6 },
-  { 7, -22, 97, 58, -18, 6 },  { 7, -22, 95, 61, -19, 6 },
-  { 7, -22, 93, 63, -19, 6 },  { 7, -22, 91, 66, -20, 6 },
-  { 7, -22, 88, 69, -20, 6 },  { 7, -22, 86, 71, -21, 7 },
-  { 7, -22, 83, 74, -21, 7 },  { 7, -22, 81, 76, -21, 7 },
-  { 7, -22, 79, 79, -22, 7 },  { 7, -21, 76, 81, -22, 7 },
-  { 7, -21, 74, 83, -22, 7 },  { 7, -21, 71, 86, -22, 7 },
-  { 6, -20, 69, 88, -22, 7 },  { 6, -20, 66, 91, -22, 7 },
-  { 6, -19, 63, 93, -22, 7 },  { 6, -19, 61, 95, -22, 7 },
-  { 6, -18, 58, 97, -22, 7 },  { 6, -18, 56, 99, -21, 6 },
-  { 6, -17, 53, 101, -21, 6 }, { 6, -17, 51, 103, -21, 6 },
-  { 5, -16, 48, 105, -20, 6 }, { 5, -15, 45, 107, -20, 6 },
-  { 5, -15, 42, 109, -19, 6 }, { 5, -14, 40, 111, -19, 5 },
-  { 5, -13, 37, 112, -18, 5 }, { 4, -13, 35, 114, -17, 5 },
-  { 4, -12, 32, 116, -17, 5 }, { 4, -11, 30, 117, -16, 4 },
-  { 4, -10, 27, 118, -15, 4 }, { 3, -9, 25, 119, -14, 4 },
-  { 3, -9, 22, 121, -13, 4 },  { 3, -8, 20, 122, -12, 3 },
-  { 2, -7, 18, 123, -11, 3 },  { 2, -6, 15, 124, -10, 3 },
-  { 2, -5, 13, 125, -9, 2 },   { 1, -4, 11, 125, -7, 2 },
-  { 1, -3, 8, 126, -6, 2 },    { 1, -3, 6, 127, -4, 1 },
-  { 1, -2, 4, 127, -3, 1 },    { 0, -1, 2, 128, -1, 0 },
+const int16_t av1_resize_filter_normative[(
+    1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = {
+#if UPSCALE_NORMATIVE_TAPS == 8
+  { 0, 0, 0, 128, 0, 0, 0, 0 },        { 0, 0, -1, 128, 2, -1, 0, 0 },
+  { 0, 1, -3, 127, 4, -2, 1, 0 },      { 0, 1, -4, 127, 6, -3, 1, 0 },
+  { 0, 2, -6, 126, 8, -3, 1, 0 },      { 0, 2, -7, 125, 11, -4, 1, 0 },
+  { -1, 2, -8, 125, 13, -5, 2, 0 },    { -1, 3, -9, 124, 15, -6, 2, 0 },
+  { -1, 3, -10, 123, 18, -6, 2, -1 },  { -1, 3, -11, 122, 20, -7, 3, -1 },
+  { -1, 4, -12, 121, 22, -8, 3, -1 },  { -1, 4, -13, 120, 25, -9, 3, -1 },
+  { -1, 4, -14, 118, 28, -9, 3, -1 },  { -1, 4, -15, 117, 30, -10, 4, -1 },
+  { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 },
+  { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 },
+  { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 },
+  { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 },
+  { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 },
+  { -1, 6, -20, 97, 58, -17, 6, -1 },  { -1, 6, -20, 95, 61, -18, 6, -1 },
+  { -2, 7, -20, 93, 64, -18, 6, -2 },  { -2, 7, -20, 91, 66, -19, 6, -1 },
+  { -2, 7, -20, 88, 69, -19, 6, -1 },  { -2, 7, -20, 86, 71, -19, 6, -1 },
+  { -2, 7, -20, 84, 74, -20, 7, -2 },  { -2, 7, -20, 81, 76, -20, 7, -1 },
+  { -2, 7, -20, 79, 79, -20, 7, -2 },  { -1, 7, -20, 76, 81, -20, 7, -2 },
+  { -2, 7, -20, 74, 84, -20, 7, -2 },  { -1, 6, -19, 71, 86, -20, 7, -2 },
+  { -1, 6, -19, 69, 88, -20, 7, -2 },  { -1, 6, -19, 66, 91, -20, 7, -2 },
+  { -2, 6, -18, 64, 93, -20, 7, -2 },  { -1, 6, -18, 61, 95, -20, 6, -1 },
+  { -1, 6, -17, 58, 97, -20, 6, -1 },  { -1, 6, -17, 56, 99, -20, 6, -1 },
+  { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 },
+  { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 },
+  { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 },
+  { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 },
+  { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 },
+  { -1, 3, -9, 28, 118, -14, 4, -1 },  { -1, 3, -9, 25, 120, -13, 4, -1 },
+  { -1, 3, -8, 22, 121, -12, 4, -1 },  { -1, 3, -7, 20, 122, -11, 3, -1 },
+  { -1, 2, -6, 18, 123, -10, 3, -1 },  { 0, 2, -6, 15, 124, -9, 3, -1 },
+  { 0, 2, -5, 13, 125, -8, 2, -1 },    { 0, 1, -4, 11, 125, -7, 2, 0 },
+  { 0, 1, -3, 8, 126, -6, 2, 0 },      { 0, 1, -3, 6, 127, -4, 1, 0 },
+  { 0, 1, -2, 4, 127, -3, 1, 0 },      { 0, 0, -1, 2, 128, -1, 0, 0 },
 #else
-#error "Invalid value of INTERP_SIMPLE_TAPS"
-#endif  // INTERP_SIMPLE_TAPS == 2
+#error "Invalid value of UPSCALE_NORMATIVE_TAPS"
+#endif  // UPSCALE_NORMATIVE_TAPS == 8
 };
-#endif  // CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION
 
 // Filters for factor of 2 downsampling.
 static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
 static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
 
-static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
-  int outlength16 = outlength * 16;
-  if (outlength16 >= inlength * 16)
+static const InterpKernel *choose_interp_filter(int in_length, int out_length) {
+  int out_length16 = out_length * 16;
+  if (out_length16 >= in_length * 16)
     return filteredinterp_filters1000;
-  else if (outlength16 >= inlength * 13)
+  else if (out_length16 >= in_length * 13)
     return filteredinterp_filters875;
-  else if (outlength16 >= inlength * 11)
+  else if (out_length16 >= in_length * 11)
     return filteredinterp_filters750;
-  else if (outlength16 >= inlength * 9)
+  else if (out_length16 >= in_length * 9)
     return filteredinterp_filters625;
   else
     return filteredinterp_filters500;
 }
 
-static void interpolate_core(const uint8_t *const input, int inlength,
-                             uint8_t *output, int outlength,
+static void interpolate_core(const uint8_t *const input, int in_length,
+                             uint8_t *output, int out_length,
                              const int16_t *interp_filters, int interp_taps) {
   const int32_t delta =
-      (((uint32_t)inlength << INTERP_PRECISION_BITS) + outlength / 2) /
-      outlength;
+      (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) /
+      out_length;
   const int32_t offset =
-      inlength > outlength
-          ? (((int32_t)(inlength - outlength) << (INTERP_PRECISION_BITS - 1)) +
-             outlength / 2) /
-                outlength
-          : -(((int32_t)(outlength - inlength) << (INTERP_PRECISION_BITS - 1)) +
-              outlength / 2) /
-                outlength;
+      in_length > out_length
+          ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+             out_length / 2) /
+                out_length
+          : -(((int32_t)(out_length - in_length)
+               << (RS_SCALE_SUBPEL_BITS - 1)) +
+              out_length / 2) /
+                out_length;
   uint8_t *optr = output;
   int x, x1, x2, sum, k, int_pel, sub_pel;
   int32_t y;
 
   x = 0;
-  y = offset + SUBPEL_INTERP_EXTRA_OFF;
-  while ((y >> INTERP_PRECISION_BITS) < (interp_taps / 2 - 1)) {
+  y = offset + RS_SCALE_EXTRA_OFF;
+  while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) {
     x++;
     y += delta;
   }
   x1 = x;
-  x = outlength - 1;
-  y = delta * x + offset + SUBPEL_INTERP_EXTRA_OFF;
-  while ((y >> INTERP_PRECISION_BITS) + (int32_t)(interp_taps / 2) >=
-         inlength) {
+  x = out_length - 1;
+  y = delta * x + offset + RS_SCALE_EXTRA_OFF;
+  while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >=
+         in_length) {
     x--;
     y -= delta;
   }
   x2 = x;
   if (x1 > x2) {
-    for (x = 0, y = offset + SUBPEL_INTERP_EXTRA_OFF; x < outlength;
+    for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length;
          ++x, y += delta) {
-      int_pel = y >> INTERP_PRECISION_BITS;
-      sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS;
+      int_pel = y >> RS_SCALE_SUBPEL_BITS;
+      sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
       const int16_t *filter = &interp_filters[sub_pel * interp_taps];
       sum = 0;
       for (k = 0; k < interp_taps; ++k) {
         const int pk = int_pel - interp_taps / 2 + 1 + k;
-        sum += filter[k] * input[AOMMAX(AOMMIN(pk, inlength - 1), 0)];
+        sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)];
       }
       *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
     }
   } else {
     // Initial part.
-    for (x = 0, y = offset + SUBPEL_INTERP_EXTRA_OFF; x < x1; ++x, y += delta) {
-      int_pel = y >> INTERP_PRECISION_BITS;
-      sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS;
+    for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) {
+      int_pel = y >> RS_SCALE_SUBPEL_BITS;
+      sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
       const int16_t *filter = &interp_filters[sub_pel * interp_taps];
       sum = 0;
       for (k = 0; k < interp_taps; ++k)
@@ -373,8 +324,8 @@ static void interpolate_core(const uint8_t *const input, int inlength,
     }
     // Middle part.
     for (; x <= x2; ++x, y += delta) {
-      int_pel = y >> INTERP_PRECISION_BITS;
-      sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS;
+      int_pel = y >> RS_SCALE_SUBPEL_BITS;
+      sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
       const int16_t *filter = &interp_filters[sub_pel * interp_taps];
       sum = 0;
       for (k = 0; k < interp_taps; ++k)
@@ -382,35 +333,42 @@ static void interpolate_core(const uint8_t *const input, int inlength,
       *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
     }
     // End part.
-    for (; x < outlength; ++x, y += delta) {
-      int_pel = y >> INTERP_PRECISION_BITS;
-      sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS;
+    for (; x < out_length; ++x, y += delta) {
+      int_pel = y >> RS_SCALE_SUBPEL_BITS;
+      sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
       const int16_t *filter = &interp_filters[sub_pel * interp_taps];
       sum = 0;
       for (k = 0; k < interp_taps; ++k)
         sum += filter[k] *
-               input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, inlength - 1)];
+               input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)];
       *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
     }
   }
 }
 
-static void interpolate(const uint8_t *const input, int inlength,
-                        uint8_t *output, int outlength) {
-  const interp_kernel *interp_filters =
-      choose_interp_filter(inlength, outlength);
+static void interpolate(const uint8_t *const input, int in_length,
+                        uint8_t *output, int out_length) {
+  const InterpKernel *interp_filters =
+      choose_interp_filter(in_length, out_length);
+
+  interpolate_core(input, in_length, output, out_length, &interp_filters[0][0],
+                   SUBPEL_TAPS);
+}
 
-  interpolate_core(input, inlength, output, outlength, &interp_filters[0][0],
-                   INTERP_TAPS);
+int32_t av1_get_upscale_convolve_step(int in_length, int out_length) {
+  return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length;
 }
 
-#if CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION
-static void interpolate_simple(const uint8_t *const input, int inlength,
-                               uint8_t *output, int outlength) {
-  interpolate_core(input, inlength, output, outlength, &filter_simple[0][0],
-                   INTERP_SIMPLE_TAPS);
+static int32_t get_upscale_convolve_x0(int in_length, int out_length,
+                                       int32_t x_step_qn) {
+  const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS);
+  const int32_t x0 =
+      (-((out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+       out_length / 2) /
+          out_length +
+      RS_SCALE_EXTRA_OFF - err / 2;
+  return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
 }
-#endif  // CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION
 
 #ifndef __clang_analyzer__
 static void down2_symeven(const uint8_t *const input, int length,
@@ -525,8 +483,7 @@ static void down2_symodd(const uint8_t *const input, int length,
 }
 
 static int get_down2_length(int length, int steps) {
-  int s;
-  for (s = 0; s < steps; ++s) length = (length + 1) >> 1;
+  for (int s = 0; s < steps; ++s) length = (length + 1) >> 1;
   return length;
 }
 
@@ -536,6 +493,12 @@ static int get_down2_steps(int in_length, int out_length) {
   while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) {
     ++steps;
     in_length = proj_in_length;
+    if (in_length == 1) {
+      // Special case: we break because any further calls to get_down2_length()
+      // with be with length == 1, which return 1, resulting in an infinite
+      // loop.
+      break;
+    }
   }
   return steps;
 }
@@ -624,97 +587,118 @@ Error:
   aom_free(arrbuf2);
 }
 
-#if CONFIG_FRAME_SUPERRES
-static void upscale_normative(const uint8_t *const input, int length,
-                              uint8_t *output, int olength) {
-#if CONFIG_LOOP_RESTORATION
-  interpolate_simple(input, length, output, olength);
-#else
-  interpolate(input, length, output, olength);
-#endif  // CONFIG_LOOP_RESTORATION
-}
-
-static void upscale_normative_plane(const uint8_t *const input, int height,
-                                    int width, int in_stride, uint8_t *output,
-                                    int height2, int width2, int out_stride) {
-  int i;
-  uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height);
-  uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * height);
-  uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2);
-  if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error;
+static void upscale_normative_rect(const uint8_t *const input, int height,
+                                   int width, int in_stride, uint8_t *output,
+                                   int height2, int width2, int out_stride,
+                                   int x_step_qn, int x0_qn, int pad_left,
+                                   int pad_right) {
   assert(width > 0);
   assert(height > 0);
   assert(width2 > 0);
   assert(height2 > 0);
-  for (i = 0; i < height; ++i)
-    upscale_normative(input + in_stride * i, width, intbuf + width2 * i,
-                      width2);
-  for (i = 0; i < width2; ++i) {
-    fill_col_to_arr(intbuf + i, width2, height, arrbuf);
-    upscale_normative(arrbuf, height, arrbuf2, height2);
-    fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
+  assert(height2 == height);
+
+  // Extend the left/right pixels of the tile column if needed
+  // (either because we can't sample from other tiles, or because we're at
+  // a frame edge).
+  // Save the overwritten pixels into tmp_left and tmp_right.
+  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra
+  // column of border pixels compared to what we'd naively think.
+  const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
+  uint8_t *tmp_left =
+      NULL;  // Silence spurious "may be used uninitialized" warnings
+  uint8_t *tmp_right = NULL;
+  uint8_t *const in_tl = (uint8_t *)(input - border_cols);  // Cast off 'const'
+  uint8_t *const in_tr = (uint8_t *)(input + width);
+  if (pad_left) {
+    tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols);
+      memset(in_tl + i * in_stride, input[i * in_stride], border_cols);
+    }
+  }
+  if (pad_right) {
+    tmp_right =
+        (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols);
+      memset(in_tr + i * in_stride, input[i * in_stride + width - 1],
+             border_cols);
+    }
   }
 
-Error:
-  aom_free(intbuf);
-  aom_free(arrbuf);
-  aom_free(arrbuf2);
+  av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2,
+                        height2, &av1_resize_filter_normative[0][0], x0_qn,
+                        x_step_qn);
+
+  // Restore the left/right border pixels
+  if (pad_left) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols);
+    }
+    aom_free(tmp_left);
+  }
+  if (pad_right) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols);
+    }
+    aom_free(tmp_right);
+  }
 }
-#endif  // CONFIG_FRAME_SUPERRES
 
-#if CONFIG_HIGHBITDEPTH
-static void highbd_interpolate_core(const uint16_t *const input, int inlength,
-                                    uint16_t *output, int outlength, int bd,
+static void highbd_interpolate_core(const uint16_t *const input, int in_length,
+                                    uint16_t *output, int out_length, int bd,
                                     const int16_t *interp_filters,
                                     int interp_taps) {
   const int32_t delta =
-      (((uint32_t)inlength << INTERP_PRECISION_BITS) + outlength / 2) /
-      outlength;
+      (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) /
+      out_length;
   const int32_t offset =
-      inlength > outlength
-          ? (((int32_t)(inlength - outlength) << (INTERP_PRECISION_BITS - 1)) +
-             outlength / 2) /
-                outlength
-          : -(((int32_t)(outlength - inlength) << (INTERP_PRECISION_BITS - 1)) +
-              outlength / 2) /
-                outlength;
+      in_length > out_length
+          ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+             out_length / 2) /
+                out_length
+          : -(((int32_t)(out_length - in_length)
+               << (RS_SCALE_SUBPEL_BITS - 1)) +
+              out_length / 2) /
+                out_length;
   uint16_t *optr = output;
   int x, x1, x2, sum, k, int_pel, sub_pel;
   int32_t y;
 
   x = 0;
-  y = offset + SUBPEL_INTERP_EXTRA_OFF;
-  while ((y >> INTERP_PRECISION_BITS) < (interp_taps / 2 - 1)) {
+  y = offset + RS_SCALE_EXTRA_OFF;
+  while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) {
     x++;
     y += delta;
   }
   x1 = x;
-  x = outlength - 1;
-  y = delta * x + offset + SUBPEL_INTERP_EXTRA_OFF;
-  while ((y >> INTERP_PRECISION_BITS) + (int32_t)(interp_taps / 2) >=
-         inlength) {
+  x = out_length - 1;
+  y = delta * x + offset + RS_SCALE_EXTRA_OFF;
+  while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >=
+         in_length) {
     x--;
     y -= delta;
   }
   x2 = x;
   if (x1 > x2) {
-    for (x = 0, y = offset + SUBPEL_INTERP_EXTRA_OFF; x < outlength;
+    for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length;
          ++x, y += delta) {
-      int_pel = y >> INTERP_PRECISION_BITS;
-      sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS;
+      int_pel = y >> RS_SCALE_SUBPEL_BITS;
+      sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
       const int16_t *filter = &interp_filters[sub_pel * interp_taps];
       sum = 0;
       for (k = 0; k < interp_taps; ++k) {
         const int pk = int_pel - interp_taps / 2 + 1 + k;
-        sum += filter[k] * input[AOMMAX(AOMMIN(pk, inlength - 1), 0)];
+        sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)];
       }
       *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
     }
   } else {
     // Initial part.
-    for (x = 0, y = offset + SUBPEL_INTERP_EXTRA_OFF; x < x1; ++x, y += delta) {
-      int_pel = y >> INTERP_PRECISION_BITS;
-      sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS;
+    for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) {
+      int_pel = y >> RS_SCALE_SUBPEL_BITS;
+      sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
       const int16_t *filter = &interp_filters[sub_pel * interp_taps];
       sum = 0;
       for (k = 0; k < interp_taps; ++k)
@@ -723,8 +707,8 @@ static void highbd_interpolate_core(const uint16_t *const input, int inlength,
     }
     // Middle part.
     for (; x <= x2; ++x, y += delta) {
-      int_pel = y >> INTERP_PRECISION_BITS;
-      sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS;
+      int_pel = y >> RS_SCALE_SUBPEL_BITS;
+      sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
       const int16_t *filter = &interp_filters[sub_pel * interp_taps];
       sum = 0;
       for (k = 0; k < interp_taps; ++k)
@@ -732,35 +716,27 @@ static void highbd_interpolate_core(const uint16_t *const input, int inlength,
       *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
     }
     // End part.
-    for (; x < outlength; ++x, y += delta) {
-      int_pel = y >> INTERP_PRECISION_BITS;
-      sub_pel = (y >> SUBPEL_INTERP_EXTRA_BITS) & SUBPEL_MASK_RS;
+    for (; x < out_length; ++x, y += delta) {
+      int_pel = y >> RS_SCALE_SUBPEL_BITS;
+      sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
       const int16_t *filter = &interp_filters[sub_pel * interp_taps];
       sum = 0;
       for (k = 0; k < interp_taps; ++k)
         sum += filter[k] *
-               input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, inlength - 1)];
+               input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)];
       *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
     }
   }
 }
 
-static void highbd_interpolate(const uint16_t *const input, int inlength,
-                               uint16_t *output, int outlength, int bd) {
-  const interp_kernel *interp_filters =
-      choose_interp_filter(inlength, outlength);
-
-  highbd_interpolate_core(input, inlength, output, outlength, bd,
-                          &interp_filters[0][0], INTERP_TAPS);
-}
+static void highbd_interpolate(const uint16_t *const input, int in_length,
+                               uint16_t *output, int out_length, int bd) {
+  const InterpKernel *interp_filters =
+      choose_interp_filter(in_length, out_length);
 
-#if CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION
-static void highbd_interpolate_simple(const uint16_t *const input, int inlength,
-                                      uint16_t *output, int outlength, int bd) {
-  highbd_interpolate_core(input, inlength, output, outlength, bd,
-                          &filter_simple[0][0], INTERP_SIMPLE_TAPS);
+  highbd_interpolate_core(input, in_length, output, out_length, bd,
+                          &interp_filters[0][0], SUBPEL_TAPS);
 }
-#endif  // CONFIG_FRAME_SUPERRES && CONFIG_LOOP_RESTORATION
 
 #ifndef __clang_analyzer__
 static void highbd_down2_symeven(const uint16_t *const input, int length,
@@ -958,44 +934,68 @@ Error:
   aom_free(arrbuf2);
 }
 
-#if CONFIG_FRAME_SUPERRES
-static void highbd_upscale_normative(const uint16_t *const input, int length,
-                                     uint16_t *output, int olength, int bd) {
-#if CONFIG_LOOP_RESTORATION
-  highbd_interpolate_simple(input, length, output, olength, bd);
-#else
-  highbd_interpolate(input, length, output, olength, bd);
-#endif  // CONFIG_LOOP_RESTORATION
-}
-
-static void highbd_upscale_normative_plane(const uint8_t *const input,
-                                           int height, int width, int in_stride,
-                                           uint8_t *output, int height2,
-                                           int width2, int out_stride, int bd) {
-  int i;
-  uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height);
-  uint16_t *arrbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * height);
-  uint16_t *arrbuf2 = (uint16_t *)aom_malloc(sizeof(uint16_t) * height2);
-  if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error;
-  for (i = 0; i < height; ++i) {
-    highbd_upscale_normative(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
-                             intbuf + width2 * i, width2, bd);
+static void highbd_upscale_normative_rect(const uint8_t *const input,
+                                          int height, int width, int in_stride,
+                                          uint8_t *output, int height2,
+                                          int width2, int out_stride,
+                                          int x_step_qn, int x0_qn,
+                                          int pad_left, int pad_right, int bd) {
+  assert(width > 0);
+  assert(height > 0);
+  assert(width2 > 0);
+  assert(height2 > 0);
+  assert(height2 == height);
+
+  // Extend the left/right pixels of the tile column if needed
+  // (either because we can't sample from other tiles, or because we're at
+  // a frame edge).
+  // Save the overwritten pixels into tmp_left and tmp_right.
+  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra
+  // column of border pixels compared to what we'd naively think.
+  const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
+  const int border_size = border_cols * sizeof(uint16_t);
+  uint16_t *tmp_left =
+      NULL;  // Silence spurious "may be used uninitialized" warnings
+  uint16_t *tmp_right = NULL;
+  uint16_t *const input16 = CONVERT_TO_SHORTPTR(input);
+  uint16_t *const in_tl = input16 - border_cols;
+  uint16_t *const in_tr = input16 + width;
+  if (pad_left) {
+    tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size);
+      aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols);
+    }
   }
-  for (i = 0; i < width2; ++i) {
-    highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
-    highbd_upscale_normative(arrbuf, height, arrbuf2, height2, bd);
-    highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
-                           arrbuf2);
+  if (pad_right) {
+    tmp_right =
+        (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size);
+      aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1],
+                   border_cols);
+    }
   }
 
-Error:
-  aom_free(intbuf);
-  aom_free(arrbuf);
-  aom_free(arrbuf2);
-}
-#endif  // CONFIG_FRAME_SUPERRES
+  av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride,
+                               CONVERT_TO_SHORTPTR(output), out_stride, width2,
+                               height2, &av1_resize_filter_normative[0][0],
+                               x0_qn, x_step_qn, bd);
 
-#endif  // CONFIG_HIGHBITDEPTH
+  // Restore the left/right border pixels
+  if (pad_left) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_size);
+    }
+    aom_free(tmp_left);
+  }
+  if (pad_right) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_size);
+    }
+    aom_free(tmp_right);
+  }
+}
 
 void av1_resize_frame420(const uint8_t *const y, int y_stride,
                          const uint8_t *const u, const uint8_t *const v,
@@ -1031,7 +1031,6 @@ void av1_resize_frame444(const uint8_t *const y, int y_stride,
   resize_plane(v, height, width, uv_stride, ov, oheight, owidth, ouv_stride);
 }
 
-#if CONFIG_HIGHBITDEPTH
 void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
                                 const uint8_t *const u, const uint8_t *const v,
                                 int uv_stride, int height, int width,
@@ -1073,125 +1072,137 @@ void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
   highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
                       ouv_stride, bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
-#if CONFIG_HIGHBITDEPTH
-void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                 YV12_BUFFER_CONFIG *dst, int bd) {
-#else
 void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                 YV12_BUFFER_CONFIG *dst) {
-#endif  // CONFIG_HIGHBITDEPTH
+                                 YV12_BUFFER_CONFIG *dst, int bd,
+                                 const int num_planes) {
   // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
-  int i;
-  const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
-                                   src->v_buffer };
-  const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
-  const int src_widths[3] = { src->y_crop_width, src->uv_crop_width,
-                              src->uv_crop_width };
-  const int src_heights[3] = { src->y_crop_height, src->uv_crop_height,
-                               src->uv_crop_height };
-  uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
-  const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
-  const int dst_widths[3] = { dst->y_crop_width, dst->uv_crop_width,
-                              dst->uv_crop_width };
-  const int dst_heights[3] = { dst->y_crop_height, dst->uv_crop_height,
-                               dst->uv_crop_height };
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-#if CONFIG_HIGHBITDEPTH
+
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+    const int is_uv = i > 0;
     if (src->flags & YV12_FLAG_HIGHBITDEPTH)
-      highbd_resize_plane(srcs[i], src_heights[i], src_widths[i],
-                          src_strides[i], dsts[i], dst_heights[i],
-                          dst_widths[i], dst_strides[i], bd);
+      highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv],
+                          src->crop_widths[is_uv], src->strides[is_uv],
+                          dst->buffers[i], dst->crop_heights[is_uv],
+                          dst->crop_widths[is_uv], dst->strides[is_uv], bd);
     else
-#endif  // CONFIG_HIGHBITDEPTH
-      resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
-                   dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+      resize_plane(src->buffers[i], src->crop_heights[is_uv],
+                   src->crop_widths[is_uv], src->strides[is_uv],
+                   dst->buffers[i], dst->crop_heights[is_uv],
+                   dst->crop_widths[is_uv], dst->strides[is_uv]);
   }
-  aom_extend_frame_borders(dst);
+  aom_extend_frame_borders(dst, num_planes);
 }
 
-#if CONFIG_FRAME_SUPERRES
-#if CONFIG_HIGHBITDEPTH
-void av1_upscale_normative_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                            YV12_BUFFER_CONFIG *dst, int bd) {
-#else
-void av1_upscale_normative_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                            YV12_BUFFER_CONFIG *dst) {
-#endif  // CONFIG_HIGHBITDEPTH
-  // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
-  int i;
-  const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
-                                   src->v_buffer };
-  const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
-  const int src_widths[3] = { src->y_crop_width, src->uv_crop_width,
-                              src->uv_crop_width };
-  const int src_heights[3] = { src->y_crop_height, src->uv_crop_height,
-                               src->uv_crop_height };
-  uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
-  const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
-  const int dst_widths[3] = { dst->y_crop_width, dst->uv_crop_width,
-                              dst->uv_crop_width };
-  const int dst_heights[3] = { dst->y_crop_height, dst->uv_crop_height,
-                               dst->uv_crop_height };
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-#if CONFIG_HIGHBITDEPTH
-    if (src->flags & YV12_FLAG_HIGHBITDEPTH)
-      highbd_upscale_normative_plane(srcs[i], src_heights[i], src_widths[i],
-                                     src_strides[i], dsts[i], dst_heights[i],
-                                     dst_widths[i], dst_strides[i], bd);
+void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
+                                int src_stride, uint8_t *dst, int dst_stride,
+                                int plane, int rows) {
+  const int is_uv = (plane > 0);
+  const int ss_x = is_uv && cm->subsampling_x;
+  const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x);
+  const int upscaled_plane_width =
+      ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+  const int superres_denom = cm->superres_scale_denominator;
+
+  TileInfo tile_col;
+  const int32_t x_step_qn = av1_get_upscale_convolve_step(
+      downscaled_plane_width, upscaled_plane_width);
+  int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width,
+                                          upscaled_plane_width, x_step_qn);
+
+  for (int j = 0; j < cm->tile_cols; j++) {
+    av1_tile_set_col(&tile_col, cm, j);
+    // Determine the limits of this tile column in both the source
+    // and destination images.
+    // Note: The actual location which we start sampling from is
+    // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases
+    // by exactly dst_width * (x_step_qn/2^14) pixels each iteration.
+    const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x);
+    const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x);
+    const int src_width = downscaled_x1 - downscaled_x0;
+
+    const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR;
+    int upscaled_x1;
+    if (j == cm->tile_cols - 1) {
+      // Note that we can't just use AOMMIN here - due to rounding,
+      // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than
+      // upscaled_plane_width.
+      upscaled_x1 = upscaled_plane_width;
+    } else {
+      upscaled_x1 = (downscaled_x1 * superres_denom) / SCALE_NUMERATOR;
+    }
+
+    const uint8_t *const src_ptr = src + downscaled_x0;
+    uint8_t *const dst_ptr = dst + upscaled_x0;
+    const int dst_width = upscaled_x1 - upscaled_x0;
+
+    const int pad_left = (j == 0);
+    const int pad_right = (j == cm->tile_cols - 1);
+
+    if (cm->use_highbitdepth)
+      highbd_upscale_normative_rect(
+          src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width,
+          dst_stride, x_step_qn, x0_qn, pad_left, pad_right, cm->bit_depth);
     else
-#endif  // CONFIG_HIGHBITDEPTH
-      upscale_normative_plane(srcs[i], src_heights[i], src_widths[i],
-                              src_strides[i], dsts[i], dst_heights[i],
-                              dst_widths[i], dst_strides[i]);
+      upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
+                             rows, dst_width, dst_stride, x_step_qn, x0_qn,
+                             pad_left, pad_right);
+
+    // Update the fractional pixel offset to prepare for the next tile column.
+    x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS);
   }
-  aom_extend_frame_borders(dst);
 }
-#endif  // CONFIG_FRAME_SUPERRES
+
+void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
+                                            const YV12_BUFFER_CONFIG *src,
+                                            YV12_BUFFER_CONFIG *dst) {
+  const int num_planes = av1_num_planes(cm);
+  for (int i = 0; i < num_planes; ++i) {
+    const int is_uv = (i > 0);
+    av1_upscale_normative_rows(cm, src->buffers[i], src->strides[is_uv],
+                               dst->buffers[i], dst->strides[is_uv], i,
+                               src->crop_heights[is_uv]);
+  }
+
+  aom_extend_frame_borders(dst, num_planes);
+}
 
 YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled) {
+  const int num_planes = av1_num_planes(cm);
   if (cm->width != unscaled->y_crop_width ||
       cm->height != unscaled->y_crop_height) {
-#if CONFIG_HIGHBITDEPTH
-    av1_resize_and_extend_frame(unscaled, scaled, (int)cm->bit_depth);
-#else
-    av1_resize_and_extend_frame(unscaled, scaled);
-#endif  // CONFIG_HIGHBITDEPTH
+    av1_resize_and_extend_frame(unscaled, scaled, (int)cm->bit_depth,
+                                num_planes);
     return scaled;
   } else {
     return unscaled;
   }
 }
 
-// Calculates scaled dimensions given original dimensions and the scale
-// denominator. If 'scale_height' is 1, both width and height are scaled;
-// otherwise, only the width is scaled.
-static void calculate_scaled_size_helper(int *width, int *height, int denom,
-                                         int scale_height) {
+// Calculates the scaled dimension given the original dimension and the scale
+// denominator.
+static void calculate_scaled_size_helper(int *dim, int denom) {
   if (denom != SCALE_NUMERATOR) {
-    *width = *width * SCALE_NUMERATOR / denom;
-    *width += *width & 1;  // Make it even.
-    if (scale_height) {
-      *height = *height * SCALE_NUMERATOR / denom;
-      *height += *height & 1;  // Make it even.
-    }
+    // Use this version if we need *dim to be even
+    // *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom);
+    // *width <<= 1;
+    *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom);
   }
 }
 
 void av1_calculate_scaled_size(int *width, int *height, int resize_denom) {
-  calculate_scaled_size_helper(width, height, resize_denom, 1);
+  calculate_scaled_size_helper(width, resize_denom);
+  calculate_scaled_size_helper(height, resize_denom);
 }
 
-#if CONFIG_FRAME_SUPERRES
 void av1_calculate_scaled_superres_size(int *width, int *height,
                                         int superres_denom) {
-  calculate_scaled_size_helper(width, height, superres_denom,
-                               !CONFIG_HORZONLY_FRAME_SUPERRES);
+  (void)height;
+  calculate_scaled_size_helper(width, superres_denom);
 }
 
 void av1_calculate_unscaled_superres_size(int *width, int *height, int denom) {
@@ -1199,38 +1210,47 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom) {
     // Note: av1_calculate_scaled_superres_size() rounds *up* after division
     // when the resulting dimensions are odd. So here, we round *down*.
     *width = *width * denom / SCALE_NUMERATOR;
-#if CONFIG_HORZONLY_FRAME_SUPERRES
     (void)height;
-#else
-    *height = *height * denom / SCALE_NUMERATOR;
-#endif  // CONFIG_HORZONLY_FRAME_SUPERRES
   }
 }
 
+// Copy only the config data from 'src' to 'dst'.
+static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src,
+                               YV12_BUFFER_CONFIG *const dst) {
+  dst->bit_depth = src->bit_depth;
+  dst->color_primaries = src->color_primaries;
+  dst->transfer_characteristics = src->transfer_characteristics;
+  dst->matrix_coefficients = src->matrix_coefficients;
+  dst->monochrome = src->monochrome;
+  dst->chroma_sample_position = src->chroma_sample_position;
+  dst->color_range = src->color_range;
+}
+
 // TODO(afergs): Look for in-place upscaling
 // TODO(afergs): aom_ vs av1_ functions? Which can I use?
 // Upscale decoded image.
 void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
-  if (av1_superres_unscaled(cm)) return;
+  const int num_planes = av1_num_planes(cm);
+  if (!av1_superres_scaled(cm)) return;
 
   YV12_BUFFER_CONFIG copy_buffer;
   memset(&copy_buffer, 0, sizeof(copy_buffer));
 
   YV12_BUFFER_CONFIG *const frame_to_show = get_frame_new_buffer(cm);
 
-  if (aom_alloc_frame_buffer(&copy_buffer, cm->width, cm->height,
+  const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
+  if (aom_alloc_frame_buffer(&copy_buffer, aligned_width, cm->height,
                              cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                             cm->use_highbitdepth,
-#endif  // CONFIG_HIGHBITDEPTH
-                             AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+                             cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                             cm->byte_alignment))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate copy buffer for superres upscaling");
 
-  // Copy function assumes the frames are the same size, doesn't copy bit_depth.
-  aom_yv12_copy_frame(frame_to_show, &copy_buffer);
-  copy_buffer.bit_depth = frame_to_show->bit_depth;
-  assert(copy_buffer.y_crop_width == cm->width);
+  // Copy function assumes the frames are the same size.
+  // Note that it does not copy YV12_BUFFER_CONFIG config data.
+  aom_yv12_copy_frame(frame_to_show, &copy_buffer, num_planes);
+
+  assert(copy_buffer.y_crop_width == aligned_width);
   assert(copy_buffer.y_crop_height == cm->height);
 
   // Realloc the current frame buffer at a higher resolution in place.
@@ -1248,48 +1268,43 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to free current frame buffer before superres upscaling");
 
-    if (aom_realloc_frame_buffer(
-            frame_to_show, cm->superres_upscaled_width,
-            cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-            cm->use_highbitdepth,
-#endif  // CONFIG_HIGHBITDEPTH
-            AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv))
+    // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
+    if (aom_realloc_frame_buffer(frame_to_show, cm->superres_upscaled_width,
+                                 cm->superres_upscaled_height,
+                                 cm->subsampling_x, cm->subsampling_y,
+                                 cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                                 cm->byte_alignment, fb, cb, cb_priv))
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to allocate current frame buffer for superres upscaling");
   } else {
+    // Make a copy of the config data for frame_to_show in copy_buffer
+    copy_buffer_config(frame_to_show, &copy_buffer);
+
     // Don't use callbacks on the encoder.
+    // aom_alloc_frame_buffer() clears the config data for frame_to_show
     if (aom_alloc_frame_buffer(frame_to_show, cm->superres_upscaled_width,
                                cm->superres_upscaled_height, cm->subsampling_x,
-                               cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                               cm->use_highbitdepth,
-#endif  // CONFIG_HIGHBITDEPTH
+                               cm->subsampling_y, cm->use_highbitdepth,
                                AOM_BORDER_IN_PIXELS, cm->byte_alignment))
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to reallocate current frame buffer for superres upscaling");
+
+    // Restore config data back to frame_to_show
+    copy_buffer_config(&copy_buffer, frame_to_show);
   }
   // TODO(afergs): verify frame_to_show is correct after realloc
   //               encoder:
   //               decoder:
-  frame_to_show->bit_depth = copy_buffer.bit_depth;
+
   assert(frame_to_show->y_crop_width == cm->superres_upscaled_width);
   assert(frame_to_show->y_crop_height == cm->superres_upscaled_height);
 
   // Scale up and back into frame_to_show.
   assert(frame_to_show->y_crop_width != cm->width);
-  assert(IMPLIES(!CONFIG_HORZONLY_FRAME_SUPERRES,
-                 frame_to_show->y_crop_height != cm->height));
-#if CONFIG_HIGHBITDEPTH
-  av1_upscale_normative_and_extend_frame(&copy_buffer, frame_to_show,
-                                         (int)cm->bit_depth);
-#else
-  av1_upscale_normative_and_extend_frame(&copy_buffer, frame_to_show);
-#endif  // CONFIG_HIGHBITDEPTH
+  av1_upscale_normative_and_extend_frame(cm, &copy_buffer, frame_to_show);
 
   // Free the copy buffer
   aom_free_frame_buffer(&copy_buffer);
 }
-#endif  // CONFIG_FRAME_SUPERRES
diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h
index 66b32c72d..feec3a90e 100644
--- a/third_party/aom/av1/common/resize.h
+++ b/third_party/aom/av1/common/resize.h
@@ -39,7 +39,6 @@ void av1_resize_frame444(const uint8_t *const y, int y_stride,
                          int oy_stride, uint8_t *ou, uint8_t *ov,
                          int ouv_stride, int oheight, int owidth);
 
-#if CONFIG_HIGHBITDEPTH
 void av1_highbd_resize_plane(const uint8_t *const input, int height, int width,
                              int in_stride, uint8_t *output, int height2,
                              int width2, int out_stride, int bd);
@@ -61,25 +60,16 @@ void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
                                 uint8_t *oy, int oy_stride, uint8_t *ou,
                                 uint8_t *ov, int ouv_stride, int oheight,
                                 int owidth, int bd);
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_HIGHBITDEPTH
-void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                 YV12_BUFFER_CONFIG *dst, int bd);
-#else
 void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                 YV12_BUFFER_CONFIG *dst);
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_FRAME_SUPERRES
-#if CONFIG_HIGHBITDEPTH
-void av1_upscale_normative_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                            YV12_BUFFER_CONFIG *dst, int bd);
-#else
-void av1_upscale_normative_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                 YV12_BUFFER_CONFIG *dst, int bd,
+                                 const int num_planes);
+
+void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
+                                int src_stride, uint8_t *dst, int dst_stride,
+                                int plane, int rows);
+void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
+                                            const YV12_BUFFER_CONFIG *src,
                                             YV12_BUFFER_CONFIG *dst);
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_FRAME_SUPERRES
 
 YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
@@ -89,7 +79,6 @@ YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
 // resize scale denominator.
 void av1_calculate_scaled_size(int *width, int *height, int resize_denom);
 
-#if CONFIG_FRAME_SUPERRES
 // Similar to above, but calculates scaled dimensions after superres from the
 // given original dimensions and superres scale denominator.
 void av1_calculate_scaled_superres_size(int *width, int *height,
@@ -102,11 +91,19 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom);
 
 void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool);
 
-// Returns 1 if a superres upscaled frame is unscaled and 0 otherwise.
-static INLINE int av1_superres_unscaled(const AV1_COMMON *cm) {
-  return (cm->superres_scale_denominator == SCALE_NUMERATOR);
+// Returns 1 if a superres upscaled frame is scaled and 0 otherwise.
+static INLINE int av1_superres_scaled(const AV1_COMMON *cm) {
+  // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
+  // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
+  // So, the following check is more accurate.
+  return !(cm->width == cm->superres_upscaled_width);
 }
-#endif  // CONFIG_FRAME_SUPERRES
+
+#define UPSCALE_NORMATIVE_TAPS 8
+extern const int16_t av1_resize_filter_normative[1 << RS_SUBPEL_BITS]
+                                                [UPSCALE_NORMATIVE_TAPS];
+
+int32_t av1_get_upscale_convolve_step(int in_length, int out_length);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
index 00441f072..58a5275ca 100644
--- a/third_party/aom/av1/common/restoration.c
+++ b/third_party/aom/av1/common/restoration.c
@@ -12,100 +12,130 @@
 
 #include <math.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "./aom_scale_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
 #include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
 #include "av1/common/restoration.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 
 #include "aom_ports/mem.h"
 
+// The 's' values are calculated based on original 'r' and 'e' values in the
+// spec using GenSgrprojVtable().
+// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
 const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
-#if USE_HIGHPASS_IN_SGRPROJ
-  // corner, edge, r2, eps2
-  { -1, 2, 1, 1 }, { -1, 2, 1, 2 }, { -1, 2, 1, 3 }, { -1, 2, 1, 4 },
-  { -1, 2, 1, 5 }, { -2, 3, 1, 2 }, { -2, 3, 1, 3 }, { -2, 3, 1, 4 },
-  { -2, 3, 1, 5 }, { -2, 3, 1, 6 }, { -3, 4, 1, 3 }, { -3, 4, 1, 4 },
-  { -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 }
-#else
-// r1, eps1, r2, eps2
-#if MAX_RADIUS == 2
-  { 2, 12, 1, 4 },  { 2, 15, 1, 6 },  { 2, 18, 1, 8 },  { 2, 20, 1, 9 },
-  { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
-  { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 2, 30, 1, 2 },
-  { 2, 50, 1, 12 }, { 2, 60, 1, 13 }, { 2, 70, 1, 14 }, { 2, 80, 1, 15 },
-#else
-  { 2, 12, 1, 4 },  { 2, 15, 1, 6 },  { 2, 18, 1, 8 },  { 2, 20, 1, 9 },
-  { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
-  { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 },
-  { 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 },
-#endif  // MAX_RADIUS == 2
-#endif
+  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
+  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
+  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
+  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
+  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
+  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
+  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
+  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
 };
 
-typedef void (*restore_func_type)(uint8_t *data8, int width, int height,
-                                  int stride, RestorationInternal *rst,
-                                  uint8_t *dst8, int dst_stride);
-#if CONFIG_HIGHBITDEPTH
-typedef void (*restore_func_highbd_type)(uint8_t *data8, int width, int height,
-                                         int stride, RestorationInternal *rst,
-                                         int bit_depth, uint8_t *dst8,
-                                         int dst_stride);
-#endif  // CONFIG_HIGHBITDEPTH
-
-int av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rst_info,
-                                 int width, int height) {
-  const int ntiles = av1_get_rest_ntiles(
-      width, height, rst_info->restoration_tilesize, NULL, NULL, NULL, NULL);
-  aom_free(rst_info->restoration_type);
-  CHECK_MEM_ERROR(cm, rst_info->restoration_type,
-                  (RestorationType *)aom_malloc(
-                      sizeof(*rst_info->restoration_type) * ntiles));
-  aom_free(rst_info->wiener_info);
-  CHECK_MEM_ERROR(
-      cm, rst_info->wiener_info,
-      (WienerInfo *)aom_memalign(16, sizeof(*rst_info->wiener_info) * ntiles));
-  memset(rst_info->wiener_info, 0, sizeof(*rst_info->wiener_info) * ntiles);
-  aom_free(rst_info->sgrproj_info);
-  CHECK_MEM_ERROR(
-      cm, rst_info->sgrproj_info,
-      (SgrprojInfo *)aom_malloc(sizeof(*rst_info->sgrproj_info) * ntiles));
-  return ntiles;
+AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
+  AV1PixelRect rect;
+
+  int ss_x = is_uv && cm->subsampling_x;
+  int ss_y = is_uv && cm->subsampling_y;
+
+  rect.top = 0;
+  rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
+  rect.left = 0;
+  rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+  return rect;
+}
+
+// Count horizontal or vertical units per tile (use a width or height for
+// tile_size, respectively). We basically want to divide the tile size by the
+// size of a restoration unit. Rather than rounding up unconditionally as you
+// might expect, we round to nearest, which models the way a right or bottom
+// restoration unit can extend to up to 150% its normal width or height. The
+// max with 1 is to deal with tiles that are smaller than half of a restoration
+// unit.
+int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
+  return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1);
+}
+
+void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
+                                  int is_uv) {
+  // We need to allocate enough space for restoration units to cover the
+  // largest tile. Without CONFIG_MAX_TILE, this is always the tile at the
+  // top-left and we can use av1_get_tile_rect(). With CONFIG_MAX_TILE, we have
+  // to do the computation ourselves, iterating over the tiles and keeping
+  // track of the largest width and height, then upscaling.
+  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+  const int max_tile_w = tile_rect.right - tile_rect.left;
+  const int max_tile_h = tile_rect.bottom - tile_rect.top;
+
+  // To calculate hpertile and vpertile (horizontal and vertical units per
+  // tile), we basically want to divide the largest tile width or height by the
+  // size of a restoration unit. Rather than rounding up unconditionally as you
+  // might expect, we round to nearest, which models the way a right or bottom
+  // restoration unit can extend to up to 150% its normal width or height. The
+  // max with 1 is to deal with tiles that are smaller than half of a
+  // restoration unit.
+  const int unit_size = rsi->restoration_unit_size;
+  const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w);
+  const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h);
+
+  rsi->units_per_tile = hpertile * vpertile;
+  rsi->horz_units_per_tile = hpertile;
+  rsi->vert_units_per_tile = vpertile;
+
+  const int ntiles = 1;
+  const int nunits = ntiles * rsi->units_per_tile;
+
+  aom_free(rsi->unit_info);
+  CHECK_MEM_ERROR(cm, rsi->unit_info,
+                  (RestorationUnitInfo *)aom_memalign(
+                      16, sizeof(*rsi->unit_info) * nunits));
 }
 
 void av1_free_restoration_struct(RestorationInfo *rst_info) {
-  aom_free(rst_info->restoration_type);
-  rst_info->restoration_type = NULL;
-  aom_free(rst_info->wiener_info);
-  rst_info->wiener_info = NULL;
-  aom_free(rst_info->sgrproj_info);
-  rst_info->sgrproj_info = NULL;
+  aom_free(rst_info->unit_info);
+  rst_info->unit_info = NULL;
 }
 
-// TODO(debargha): This table can be substantially reduced since only a few
-// values are actually used.
-int sgrproj_mtable[MAX_EPS][MAX_NELEM];
+#if 0
+// Pair of values for each sgrproj parameter:
+// Index 0 corresponds to r[0], e[0]
+// Index 1 corresponds to r[1], e[1]
+int sgrproj_mtable[SGRPROJ_PARAMS][2];
 
 static void GenSgrprojVtable() {
-  int e, n;
-  for (e = 1; e <= MAX_EPS; ++e)
-    for (n = 1; n <= MAX_NELEM; ++n) {
-      const int n2e = n * n * e;
-      sgrproj_mtable[e - 1][n - 1] =
-          (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
+  for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
+    const sgr_params_type *const params = &sgr_params[i];
+    for (int j = 0; j < 2; ++j) {
+      const int e = params->e[j];
+      const int r = params->r[j];
+      if (r == 0) {                 // filter is disabled
+        sgrproj_mtable[i][j] = -1;  // mark invalid
+      } else {                      // filter is enabled
+        const int n = (2 * r + 1) * (2 * r + 1);
+        const int n2e = n * n * e;
+        assert(n2e != 0);
+        sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
+      }
     }
+  }
 }
+#endif
 
-void av1_loop_restoration_precal() { GenSgrprojVtable(); }
-
-static void loop_restoration_init(RestorationInternal *rst, int kf) {
-  rst->keyframe = kf;
+void av1_loop_restoration_precal() {
+#if 0
+  GenSgrprojVtable();
+#endif
 }
 
-void extend_frame(uint8_t *data, int width, int height, int stride,
-                  int border_horz, int border_vert) {
+static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
+                               int border_horz, int border_vert) {
   uint8_t *data_p;
   int i;
   for (i = 0; i < height; ++i) {
@@ -123,261 +153,297 @@ void extend_frame(uint8_t *data, int width, int height, int stride,
   }
 }
 
-#if CONFIG_STRIPED_LOOP_RESTORATION
-
-// This function setup a processing stripe by replacing the vertical
-// stripe boundary (2 lines above and 2 lines below) by data coming
-// from the above/below buffers. Before doing so the original
-// frame data is saved into a temporary buffer, such that it
-// can be restored by the restore_processing_stripe_boundary
-// function after the filtering of the processing stripe.
-// Returns the height of the processing stripe
-static int setup_processing_stripe_boundary(int y0, int v_end, int h_start,
-                                            int h_end, uint8_t *data,
-                                            int stride,
-                                            RestorationInternal *rst,
-                                            int use_highbd) {
-  int y, y_stripe_topmost, stripe_index, i;
-  int tile_offset = RESTORATION_TILE_OFFSET >> rst->subsampling_y;
-  int stripe_height = rst->rsi->procunit_height;
-  int comp = rst->component;
-  uint8_t *boundary_above_buf = rst->stripe_boundary_above[comp];
-  uint8_t *boundary_below_buf = rst->stripe_boundary_below[comp];
-  int boundary_stride = rst->stripe_boundary_stride[comp];
-  int x0 = h_start - RESTORATION_EXTRA_HORZ;
-  int x1 = h_end + RESTORATION_EXTRA_HORZ;
-
-  stripe_index = (y0 + tile_offset) / stripe_height;
-  y_stripe_topmost = stripe_index * stripe_height - tile_offset;
-  boundary_above_buf +=
-      ((stripe_index - 1) * 2 * boundary_stride + RESTORATION_EXTRA_HORZ)
-      << use_highbd;
-  boundary_below_buf +=
-      (stripe_index * 2 * boundary_stride + RESTORATION_EXTRA_HORZ)
-      << use_highbd;
-
-  // setup the 2 lines above the stripe
-  for (i = 0; i < 2; i++) {
-    y = y_stripe_topmost - 2 + i;
-    if (y >= 0 && y < y0 && y >= y0 - 2) {
-      uint8_t *p = data + ((y * stride + x0) << use_highbd);
-      uint8_t *new_data =
-          boundary_above_buf + ((i * boundary_stride + x0) << use_highbd);
-      // printf("above %3d %3d: %08x %08x : %08x %08x\n", y, x0,
-      // ((uint32_t*)p)[0], ((uint32_t*)p)[1], ((uint32_t*)new_data)[0],
-      // ((uint32_t*)new_data)[1]);
-      // Save old pixels
-      memcpy(rst->tmp_save_above[i], p, (x1 - x0) << use_highbd);
-      // Replace width pixels from boundary_above_buf
-      memcpy(p, new_data, (x1 - x0) << use_highbd);
-    }
+static void extend_frame_highbd(uint16_t *data, int width, int height,
+                                int stride, int border_horz, int border_vert) {
+  uint16_t *data_p;
+  int i, j;
+  for (i = 0; i < height; ++i) {
+    data_p = data + i * stride;
+    for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
+    for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
   }
-  // setup the 2 lines below the stripe
-  for (i = 0; i < 2; i++) {
-    y = y_stripe_topmost + stripe_height + i;
-    if (y < v_end + 2) {
-      uint8_t *p = data + ((y * stride + x0) << use_highbd);
-      uint8_t *new_data =
-          boundary_below_buf + ((i * boundary_stride + x0) << use_highbd);
-      // printf("below %3d %3d: %08x %08x : %08x %08x\n", y, x0,
-      // ((uint32_t*)p)[0], ((uint32_t*)p)[1], ((uint32_t*)new_data)[0],
-      // ((uint32_t*)new_data)[1]);
-      // Save old pixels
-      memcpy(rst->tmp_save_below[i], p, (x1 - x0) << use_highbd);
-      // Replace width pixels from boundary_below_buf
-      memcpy(p, new_data, (x1 - x0) << use_highbd);
-    }
+  data_p = data - border_horz;
+  for (i = -border_vert; i < 0; ++i) {
+    memcpy(data_p + i * stride, data_p,
+           (width + 2 * border_horz) * sizeof(uint16_t));
+  }
+  for (i = height; i < height + border_vert; ++i) {
+    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
+           (width + 2 * border_horz) * sizeof(uint16_t));
   }
-  // Return actual stripe height
-  return AOMMIN(v_end, y_stripe_topmost + stripe_height) - y0;
 }
 
-// This function restores the boundary lines modified by
-// setup_processing_stripe_boundary.
-static void restore_processing_stripe_boundary(int y0, int v_end, int h_start,
-                                               int h_end, uint8_t *data,
-                                               int stride,
-                                               RestorationInternal *rst,
-                                               int use_highbd) {
-  int y, y_stripe_topmost, i, stripe_index;
-  int tile_offset = 8 >> rst->subsampling_y;
-  int stripe_height = rst->rsi->procunit_height;
-  int x0 = h_start - RESTORATION_EXTRA_HORZ;
-  int x1 = h_end + RESTORATION_EXTRA_HORZ;
-
-  stripe_index = (y0 + tile_offset) / stripe_height;
-  y_stripe_topmost = stripe_index * stripe_height - tile_offset;
-
-  // restore the 2 lines above the stripe
-  for (i = 0; i < 2; i++) {
-    y = y_stripe_topmost - 2 + i;
-    if (y >= 0 && y < y0 && y >= y0 - 2) {
-      uint8_t *p = data + ((y * stride + x0) << use_highbd);
-      memcpy(p, rst->tmp_save_above[i], (x1 - x0) << use_highbd);
-    }
-  }
-  // restore the 2 lines below the stripe
-  for (i = 0; i < 2; i++) {
-    y = y_stripe_topmost + stripe_height + i;
-    if (y < v_end + 2) {
-      uint8_t *p = data + ((y * stride + x0) << use_highbd);
-      memcpy(p, rst->tmp_save_below[i], (x1 - x0) << use_highbd);
-    }
-  }
+void extend_frame(uint8_t *data, int width, int height, int stride,
+                  int border_horz, int border_vert, int highbd) {
+  if (highbd)
+    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
+                        border_horz, border_vert);
+  else
+    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
 }
 
-#endif
+static void copy_tile_lowbd(int width, int height, const uint8_t *src,
+                            int src_stride, uint8_t *dst, int dst_stride) {
+  for (int i = 0; i < height; ++i)
+    memcpy(dst + i * dst_stride, src + i * src_stride, width);
+}
 
-static void loop_copy_tile(uint8_t *data, int tile_idx, int width, int height,
-                           int stride, RestorationInternal *rst, uint8_t *dst,
-                           int dst_stride) {
-  const int tile_width = rst->tile_width;
-  const int tile_height = rst->tile_height;
-  RestorationTileLimits limits =
-      av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
-#if CONFIG_STRIPED_LOOP_RESTORATION
-                               tile_height, width, height, rst->subsampling_y);
-#else
-                               tile_height, width, height);
-#endif
-  for (int i = limits.v_start; i < limits.v_end; ++i)
-    memcpy(dst + i * dst_stride + limits.h_start,
-           data + i * stride + limits.h_start, limits.h_end - limits.h_start);
+static void copy_tile_highbd(int width, int height, const uint16_t *src,
+                             int src_stride, uint16_t *dst, int dst_stride) {
+  for (int i = 0; i < height; ++i)
+    memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
 }
 
-static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert,
-                                   int boundary_dist, int istop) {
-  memcpy(vert, orig, sizeof(InterpKernel));
-  switch (boundary_dist) {
-    case 0:
-      vert[WIENER_HALFWIN] += vert[2] + vert[1] + vert[0];
-      vert[2] = vert[1] = vert[0] = 0;
-      break;
-    case 1:
-      vert[2] += vert[1] + vert[0];
-      vert[1] = vert[0] = 0;
-      break;
-    case 2:
-      vert[1] += vert[0];
-      vert[0] = 0;
-      break;
-    default: break;
-  }
-  if (!istop) {
-    int tmp;
-    tmp = vert[0];
-    vert[0] = vert[WIENER_WIN - 1];
-    vert[WIENER_WIN - 1] = tmp;
-    tmp = vert[1];
-    vert[1] = vert[WIENER_WIN - 2];
-    vert[WIENER_WIN - 2] = tmp;
-    tmp = vert[2];
-    vert[2] = vert[WIENER_WIN - 3];
-    vert[WIENER_WIN - 3] = tmp;
-  }
+static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
+                      uint8_t *dst, int dst_stride, int highbd) {
+  if (highbd)
+    copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
+                     CONVERT_TO_SHORTPTR(dst), dst_stride);
+  else
+    copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
 }
 
-static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
-                                    int height, int stride,
-                                    RestorationInternal *rst, uint8_t *dst,
-                                    int dst_stride) {
-  const int procunit_width = rst->rsi->procunit_width;
-#if CONFIG_STRIPED_LOOP_RESTORATION
-  int procunit_height;
-#else
-  const int procunit_height = rst->rsi->procunit_height;
-#endif
-  const int tile_width = rst->tile_width;
-  const int tile_height = rst->tile_height;
-  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
-    loop_copy_tile(data, tile_idx, width, height, stride, rst, dst, dst_stride);
-    return;
+#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
+
+// With striped loop restoration, the filtering for each 64-pixel stripe gets
+// most of its input from the output of CDEF (stored in data8), but we need to
+// fill out a border of 3 pixels above/below the stripe according to the
+// following
+// rules:
+//
+// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
+//   This extension is done by a call to extend_frame() at the start of the loop
+//   restoration process, so the value of copy_above/copy_below doesn't strictly
+//   matter.
+//   However, by setting *copy_above = *copy_below = 1 whenever loop filtering
+//   across tiles is disabled, we can allow
+//   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
+//   data has always been copied, simplifying the behaviour at the left and
+//   right edges of tiles.
+//
+// * If we're at a tile boundary and loop filtering across tiles is enabled,
+//   then there is a logical stripe which is 64 pixels high, but which is split
+//   into an 8px high and a 56px high stripe so that the processing (and
+//   coefficient set usage) can be aligned to tiles.
+//   In this case, we use the 3 rows of CDEF output across the boundary for
+//   context; this corresponds to leaving the frame buffer as-is.
+//
+// * If we're at a tile boundary and loop filtering across tiles is disabled,
+//   then we take the outermost row of CDEF pixels *within the current tile*
+//   and copy it three times. Thus we behave exactly as if the tile were a full
+//   frame.
+//
+// * Otherwise, we're at a stripe boundary within a tile. In that case, we
+//   take 2 rows of deblocked pixels and extend them to 3 rows of context.
+//
+// The distinction between the latter two cases is handled by the
+// av1_loop_restoration_save_boundary_lines() function, so here we just need
+// to decide if we're overwriting the above/below boundary pixels or not.
+static void get_stripe_boundary_info(const RestorationTileLimits *limits,
+                                     const AV1PixelRect *tile_rect, int ss_y,
+                                     int *copy_above, int *copy_below) {
+  *copy_above = 1;
+  *copy_below = 1;
+
+  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
+
+  const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
+  const int this_stripe_height =
+      full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
+  const int last_stripe_in_tile =
+      (limits->v_start + this_stripe_height >= tile_rect->bottom);
+
+  if (first_stripe_in_tile) *copy_above = 0;
+  if (last_stripe_in_tile) *copy_below = 0;
+}
+
+// Overwrite the border pixels around a processing stripe so that the conditions
+// listed above get_stripe_boundary_info() are preserved.
+// We save the pixels which get overwritten into a temporary buffer, so that
+// they can be restored by restore_processing_stripe_boundary() after we've
+// processed the stripe.
+//
+// limits gives the rectangular limits of the remaining stripes for the current
+// restoration unit. rsb is the stored stripe boundaries (taken from either
+// deblock or CDEF output as necessary).
+//
+// tile_rect is the limits of the current tile and tile_stripe0 is the index of
+// the first stripe in this tile (needed to convert the tile-relative stripe
+// index we get from limits into something we can look up in rsb).
+static void setup_processing_stripe_boundary(
+    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
+    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
+    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
+  // Offsets within the line buffers. The buffer logically starts at column
+  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
+  // has column x0 in the buffer.
+  const int buf_stride = rsb->stripe_boundary_stride;
+  const int buf_x0_off = limits->h_start;
+  const int line_width =
+      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
+  const int line_size = line_width << use_highbd;
+
+  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
+
+  // Replace RESTORATION_BORDER pixels above the top of the stripe
+  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
+  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
+  // duplicating the topmost of the 2 lines (see the AOMMAX call when
+  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
+  //
+  // Special case: If we're at the top of a tile, which isn't on the topmost
+  // tile row, and we're allowed to loop filter across tiles, then we have a
+  // logical 64-pixel-high stripe which has been split into an 8-pixel high
+  // stripe and a 56-pixel high stripe (the current one). So, in this case,
+  // we want to leave the boundary alone!
+  if (!opt) {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
+        const int buf_off = buf_x0_off + buf_row * buf_stride;
+        const uint8_t *buf =
+            rsb->stripe_boundary_above + (buf_off << use_highbd);
+        uint8_t *dst8 = data8_tl + i * data_stride;
+        // Save old pixels, then replace with data from stripe_boundary_above
+        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
+               REAL_PTR(use_highbd, dst8), line_size);
+        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
+      }
+    }
+
+    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
+    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
+    // for i = 0, 1, 2.
+    if (copy_below) {
+      const int stripe_end = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
+
+      for (int i = 0; i < RESTORATION_BORDER; ++i) {
+        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
+        const int buf_off = buf_x0_off + buf_row * buf_stride;
+        const uint8_t *src =
+            rsb->stripe_boundary_below + (buf_off << use_highbd);
+
+        uint8_t *dst8 = data8_bl + i * data_stride;
+        // Save old pixels, then replace with data from stripe_boundary_below
+        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
+        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
+      }
+    }
+  } else {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+      // Only save and overwrite i=-RESTORATION_BORDER line.
+      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+      // Save old pixels, then replace with data from stripe_boundary_above
+      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
+      memcpy(REAL_PTR(use_highbd, dst8),
+             REAL_PTR(use_highbd,
+                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
+             line_size);
+    }
+
+    if (copy_below) {
+      const int stripe_end = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
+
+      // Only save and overwrite i=2 line.
+      uint8_t *dst8 = data8_bl + 2 * data_stride;
+      // Save old pixels, then replace with data from stripe_boundary_below
+      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
+      memcpy(REAL_PTR(use_highbd, dst8),
+             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
+    }
   }
-  InterpKernel vertical_topbot;
-  RestorationTileLimits limits =
-      av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
-#if CONFIG_STRIPED_LOOP_RESTORATION
-                               tile_height, width, height, rst->subsampling_y);
-#else
-                               tile_height, width, height);
-#endif
+}
 
-  // Convolve the whole tile (done in blocks here to match the requirements
-  // of the vectorized convolve functions, but the result is equivalent)
-  for (int i = limits.v_start; i < limits.v_end; i += procunit_height) {
-#if CONFIG_STRIPED_LOOP_RESTORATION
-    int h = setup_processing_stripe_boundary(
-        i, limits.v_end, limits.h_start, limits.h_end, data, stride, rst, 0);
-    h = ALIGN_POWER_OF_TWO(h, 1);
-    procunit_height = h;
-#else
-    int h = AOMMIN(procunit_height, (limits.v_end - i + 15) & ~15);
-#endif
-    for (int j = limits.h_start; j < limits.h_end; j += procunit_width) {
-      int w = AOMMIN(procunit_width, (limits.h_end - j + 15) & ~15);
-      const uint8_t *data_p = data + i * stride + j;
-      uint8_t *dst_p = dst + i * dst_stride + j;
-      // Note h is at least 16
-      for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
-        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
-                               vertical_topbot, WIENER_BORDER_VERT + b, 1);
-#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-        aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
-                                  rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                                  vertical_topbot, 16, w, 1);
-#else
-        aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
-                              rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                              vertical_topbot, 16, w, 1);
-#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-        data_p += stride;
-        dst_p += dst_stride;
+// This function restores the boundary lines modified by
+// setup_processing_stripe_boundary.
+//
+// Note: We need to be careful when handling the corners of the processing
+// unit, because (eg.) the top-left corner is considered to be part of
+// both the left and top borders. This means that, depending on the
+// loop_filter_across_tiles_enabled flag, the corner pixels might get
+// overwritten twice, once as part of the "top" border and once as part
+// of the "left" border (or similar for other corners).
+//
+// Everything works out fine as long as we make sure to reverse the order
+// when restoring, ie. we need to restore the left/right borders followed
+// by the top/bottom borders.
+static void restore_processing_stripe_boundary(
+    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
+    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
+    int copy_below, int opt) {
+  const int line_width =
+      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
+  const int line_size = line_width << use_highbd;
+
+  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
+
+  if (!opt) {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+        uint8_t *dst8 = data8_tl + i * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8),
+               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
       }
-#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
-                                rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                                rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
-                                h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
-#else
-      aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
-                            rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                            rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
-                            h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
-#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      data_p += stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
-      dst_p += dst_stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
-      for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) {
-        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
-                               vertical_topbot, WIENER_BORDER_VERT + b, 0);
-#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-        aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
-                                  rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                                  vertical_topbot, 16, w, 1);
-#else
-        aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
-                              rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                              vertical_topbot, 16, w, 1);
-#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-        data_p += stride;
-        dst_p += dst_stride;
+    }
+
+    if (copy_below) {
+      const int stripe_bottom = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+      for (int i = 0; i < RESTORATION_BORDER; ++i) {
+        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
+
+        uint8_t *dst8 = data8_bl + i * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
+      }
+    }
+  } else {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+      // Only restore i=-RESTORATION_BORDER line.
+      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
+    }
+
+    if (copy_below) {
+      const int stripe_bottom = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+      // Only restore i=2 line.
+      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
+        uint8_t *dst8 = data8_bl + 2 * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
       }
     }
-#if CONFIG_STRIPED_LOOP_RESTORATION
-    restore_processing_stripe_boundary(i, limits.v_end, limits.h_start,
-                                       limits.h_end, data, stride, rst, 0);
-#endif
   }
 }
 
-static void loop_wiener_filter(uint8_t *data, int width, int height, int stride,
-                               RestorationInternal *rst, uint8_t *dst,
-                               int dst_stride) {
-  int tile_idx;
-  extend_frame(data, width, height, stride, WIENER_BORDER_HORZ,
-               WIENER_BORDER_VERT);
-  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
-    loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst,
-                            dst_stride);
+static void wiener_filter_stripe(const RestorationUnitInfo *rui,
+                                 int stripe_width, int stripe_height,
+                                 int procunit_width, const uint8_t *src,
+                                 int src_stride, uint8_t *dst, int dst_stride,
+                                 int32_t *tmpbuf, int bit_depth) {
+  (void)tmpbuf;
+  (void)bit_depth;
+  assert(bit_depth == 8);
+  const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+  for (int j = 0; j < stripe_width; j += procunit_width) {
+    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
+    const uint8_t *src_p = src + j;
+    uint8_t *dst_p = dst + j;
+    av1_wiener_convolve_add_src(
+        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
+        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
   }
 }
 
@@ -391,6 +457,8 @@ static void loop_wiener_filter(uint8_t *data, int width, int height, int stride,
 static void boxsum1(int32_t *src, int width, int height, int src_stride,
                     int sqr, int32_t *dst, int dst_stride) {
   int i, j, a, b, c;
+  assert(width > 2 * SGRPROJ_BORDER_HORZ);
+  assert(height > 2 * SGRPROJ_BORDER_VERT);
 
   // Vertical sum over 3-pixel regions, from src into dst.
   if (!sqr) {
@@ -456,6 +524,8 @@ static void boxsum1(int32_t *src, int width, int height, int src_stride,
 static void boxsum2(int32_t *src, int width, int height, int src_stride,
                     int sqr, int32_t *dst, int dst_stride) {
   int i, j, a, b, c, d, e;
+  assert(width > 2 * SGRPROJ_BORDER_HORZ);
+  assert(height > 2 * SGRPROJ_BORDER_VERT);
 
   // Vertical sum over 5-pixel regions, from src into dst.
   if (!sqr) {
@@ -540,202 +610,33 @@ static void boxsum2(int32_t *src, int width, int height, int src_stride,
   }
 }
 
-static void boxsum3(int32_t *src, int width, int height, int src_stride,
-                    int sqr, int32_t *dst, int dst_stride) {
-  int i, j, a, b, c, d, e, f, g;
-
-  // Vertical sum over 7-pixel regions, from src into dst.
-  if (!sqr) {
-    for (j = 0; j < width; ++j) {
-      a = src[j];
-      b = src[1 * src_stride + j];
-      c = src[2 * src_stride + j];
-      d = src[3 * src_stride + j];
-      e = src[4 * src_stride + j];
-      f = src[5 * src_stride + j];
-      g = src[6 * src_stride + j];
-
-      dst[j] = a + b + c + d;
-      dst[dst_stride + j] = a + b + c + d + e;
-      dst[2 * dst_stride + j] = a + b + c + d + e + f;
-      for (i = 3; i < height - 4; ++i) {
-        dst[i * dst_stride + j] = a + b + c + d + e + f + g;
-        a = b;
-        b = c;
-        c = d;
-        d = e;
-        e = f;
-        f = g;
-        g = src[(i + 4) * src_stride + j];
-      }
-      dst[i * dst_stride + j] = a + b + c + d + e + f + g;
-      dst[(i + 1) * dst_stride + j] = b + c + d + e + f + g;
-      dst[(i + 2) * dst_stride + j] = c + d + e + f + g;
-      dst[(i + 3) * dst_stride + j] = d + e + f + g;
-    }
-  } else {
-    for (j = 0; j < width; ++j) {
-      a = src[j] * src[j];
-      b = src[1 * src_stride + j] * src[1 * src_stride + j];
-      c = src[2 * src_stride + j] * src[2 * src_stride + j];
-      d = src[3 * src_stride + j] * src[3 * src_stride + j];
-      e = src[4 * src_stride + j] * src[4 * src_stride + j];
-      f = src[5 * src_stride + j] * src[5 * src_stride + j];
-      g = src[6 * src_stride + j] * src[6 * src_stride + j];
-
-      dst[j] = a + b + c + d;
-      dst[dst_stride + j] = a + b + c + d + e;
-      dst[2 * dst_stride + j] = a + b + c + d + e + f;
-      for (i = 3; i < height - 4; ++i) {
-        dst[i * dst_stride + j] = a + b + c + d + e + f + g;
-        a = b;
-        b = c;
-        c = d;
-        d = e;
-        e = f;
-        f = g;
-        g = src[(i + 4) * src_stride + j] * src[(i + 4) * src_stride + j];
-      }
-      dst[i * dst_stride + j] = a + b + c + d + e + f + g;
-      dst[(i + 1) * dst_stride + j] = b + c + d + e + f + g;
-      dst[(i + 2) * dst_stride + j] = c + d + e + f + g;
-      dst[(i + 3) * dst_stride + j] = d + e + f + g;
-    }
-  }
-
-  // Horizontal sum over 7-pixel regions of dst
-  for (i = 0; i < height; ++i) {
-    a = dst[i * dst_stride];
-    b = dst[i * dst_stride + 1];
-    c = dst[i * dst_stride + 2];
-    d = dst[i * dst_stride + 3];
-    e = dst[i * dst_stride + 4];
-    f = dst[i * dst_stride + 5];
-    g = dst[i * dst_stride + 6];
-
-    dst[i * dst_stride] = a + b + c + d;
-    dst[i * dst_stride + 1] = a + b + c + d + e;
-    dst[i * dst_stride + 2] = a + b + c + d + e + f;
-    for (j = 3; j < width - 4; ++j) {
-      dst[i * dst_stride + j] = a + b + c + d + e + f + g;
-      a = b;
-      b = c;
-      c = d;
-      d = e;
-      e = f;
-      f = g;
-      g = dst[i * dst_stride + (j + 4)];
-    }
-    dst[i * dst_stride + j] = a + b + c + d + e + f + g;
-    dst[i * dst_stride + (j + 1)] = b + c + d + e + f + g;
-    dst[i * dst_stride + (j + 2)] = c + d + e + f + g;
-    dst[i * dst_stride + (j + 3)] = d + e + f + g;
-  }
-}
-
-// Generic version for any r. To be removed after experiments are done.
-static void boxsumr(int32_t *src, int width, int height, int src_stride, int r,
-                    int sqr, int32_t *dst, int dst_stride) {
-  int32_t *tmp = aom_malloc(width * height * sizeof(*tmp));
-  int tmp_stride = width;
-  int i, j;
-  if (sqr) {
-    for (j = 0; j < width; ++j) tmp[j] = src[j] * src[j];
-    for (j = 0; j < width; ++j)
-      for (i = 1; i < height; ++i)
-        tmp[i * tmp_stride + j] =
-            tmp[(i - 1) * tmp_stride + j] +
-            src[i * src_stride + j] * src[i * src_stride + j];
-  } else {
-    memcpy(tmp, src, sizeof(*tmp) * width);
-    for (j = 0; j < width; ++j)
-      for (i = 1; i < height; ++i)
-        tmp[i * tmp_stride + j] =
-            tmp[(i - 1) * tmp_stride + j] + src[i * src_stride + j];
-  }
-  for (i = 0; i <= r; ++i)
-    memcpy(&dst[i * dst_stride], &tmp[(i + r) * tmp_stride],
-           sizeof(*tmp) * width);
-  for (i = r + 1; i < height - r; ++i)
-    for (j = 0; j < width; ++j)
-      dst[i * dst_stride + j] =
-          tmp[(i + r) * tmp_stride + j] - tmp[(i - r - 1) * tmp_stride + j];
-  for (i = height - r; i < height; ++i)
-    for (j = 0; j < width; ++j)
-      dst[i * dst_stride + j] = tmp[(height - 1) * tmp_stride + j] -
-                                tmp[(i - r - 1) * tmp_stride + j];
-
-  for (i = 0; i < height; ++i) tmp[i * tmp_stride] = dst[i * dst_stride];
-  for (i = 0; i < height; ++i)
-    for (j = 1; j < width; ++j)
-      tmp[i * tmp_stride + j] =
-          tmp[i * tmp_stride + j - 1] + dst[i * src_stride + j];
-
-  for (j = 0; j <= r; ++j)
-    for (i = 0; i < height; ++i)
-      dst[i * dst_stride + j] = tmp[i * tmp_stride + j + r];
-  for (j = r + 1; j < width - r; ++j)
-    for (i = 0; i < height; ++i)
-      dst[i * dst_stride + j] =
-          tmp[i * tmp_stride + j + r] - tmp[i * tmp_stride + j - r - 1];
-  for (j = width - r; j < width; ++j)
-    for (i = 0; i < height; ++i)
-      dst[i * dst_stride + j] =
-          tmp[i * tmp_stride + width - 1] - tmp[i * tmp_stride + j - r - 1];
-  aom_free(tmp);
-}
-
 static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                    int sqr, int32_t *dst, int dst_stride) {
   if (r == 1)
     boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
   else if (r == 2)
     boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
-  else if (r == 3)
-    boxsum3(src, width, height, src_stride, sqr, dst, dst_stride);
   else
-    boxsumr(src, width, height, src_stride, r, sqr, dst, dst_stride);
+    assert(0 && "Invalid value of r in self-guided filter");
 }
 
-static void boxnum(int width, int height, int r, int8_t *num, int num_stride) {
-  int i, j;
-  for (i = 0; i <= r; ++i) {
-    for (j = 0; j <= r; ++j) {
-      num[i * num_stride + j] = (r + 1 + i) * (r + 1 + j);
-      num[i * num_stride + (width - 1 - j)] = num[i * num_stride + j];
-      num[(height - 1 - i) * num_stride + j] = num[i * num_stride + j];
-      num[(height - 1 - i) * num_stride + (width - 1 - j)] =
-          num[i * num_stride + j];
-    }
-  }
-  for (j = 0; j <= r; ++j) {
-    const int val = (2 * r + 1) * (r + 1 + j);
-    for (i = r + 1; i < height - r; ++i) {
-      num[i * num_stride + j] = val;
-      num[i * num_stride + (width - 1 - j)] = val;
-    }
-  }
-  for (i = 0; i <= r; ++i) {
-    const int val = (2 * r + 1) * (r + 1 + i);
-    for (j = r + 1; j < width - r; ++j) {
-      num[i * num_stride + j] = val;
-      num[(height - 1 - i) * num_stride + j] = val;
-    }
-  }
-  for (i = r + 1; i < height - r; ++i) {
-    for (j = r + 1; j < width - r; ++j) {
-      num[i * num_stride + j] = (2 * r + 1) * (2 * r + 1);
-    }
+void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
+  if (params->r[0] == 0) {
+    xq[0] = 0;
+    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
+  } else if (params->r[1] == 0) {
+    xq[0] = xqd[0];
+    xq[1] = 0;
+  } else {
+    xq[0] = xqd[0];
+    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
   }
 }
 
-void decode_xq(int *xqd, int *xq) {
-  xq[0] = xqd[0];
-  xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
-}
-
 const int32_t x_by_xplus1[256] = {
-  0,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
+  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
+  // instead of 0. See comments in selfguided_restoration_internal() for why
+  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
   240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
   248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
   250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
@@ -758,19 +659,15 @@ const int32_t x_by_xplus1[256] = {
 const int32_t one_by_x[MAX_NELEM] = {
   4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
   293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
-#if MAX_RADIUS > 2
-  158,  152,  146,  141,  137, 132, 128, 124, 120, 117, 114, 111, 108,
-  105,  102,  100,  98,   95,  93,  91,  89,  87,  85,  84
-#endif  // MAX_RADIUS > 2
 };
 
-static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
-                                                int height, int dgd_stride,
-                                                int32_t *dst, int dst_stride,
-                                                int bit_depth, int r, int eps) {
+static void selfguided_restoration_fast_internal(
+    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
+    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
-  const int num_stride = width_ext;
   // Adjusting the stride of A and B here appears to avoid bad cache effects,
   // leading to a significant speed improvement.
   // We also align the stride to a multiple of 16 bytes, for consistency
@@ -780,25 +677,24 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
   int32_t B_[RESTORATION_PROC_UNIT_PELS];
   int32_t *A = A_;
   int32_t *B = B_;
-  int8_t num_[RESTORATION_PROC_UNIT_PELS];
-  int8_t *num = num_ + SGRPROJ_BORDER_VERT * num_stride + SGRPROJ_BORDER_HORZ;
   int i, j;
 
-  // Don't filter tiles with dimensions < 5 on any axis
-  if ((width < 5) || (height < 5)) return;
+  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+         "Need SGRPROJ_BORDER_* >= r+1");
 
   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
          width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
          width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
-  boxnum(width_ext, height_ext, r, num_, num_stride);
-  assert(r <= 3);
   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
+  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
+  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
+  for (i = -1; i < height + 1; i += 2) {
+    for (j = -1; j < width + 1; ++j) {
       const int k = i * buf_stride + j;
-      const int n = num[i * num_stride + j];
+      const int n = (2 * r + 1) * (2 * r + 1);
 
       // a < 2^16 * n < 2^22 regardless of bit depth
       uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
@@ -807,139 +703,192 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
 
       // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
       // and p itself satisfies p < 2^14 * n^2 < 2^26.
+      // This bound on p is due to:
+      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+      //
       // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
       // This is an artefact of rounding, and can only happen if all pixels
       // are (almost) identical, so in this case we saturate to p=0.
       uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
-      uint32_t s = sgrproj_mtable[eps - 1][n - 1];
+
+      const uint32_t s = params->s[radius_idx];
 
       // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
       // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
       // (this holds even after accounting for the rounding in s)
       const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
 
-      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // < 2^8
-
-      // SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^(bit_depth) * n,
+      // Note: We have to be quite careful about the value of A[k].
+      // This is used as a blend factor between individual pixel values and the
+      // local mean. So it logically has a range of [0, 256], including both
+      // endpoints.
+      //
+      // This is a pain for hardware, as we'd like something which can be stored
+      // in exactly 8 bits.
+      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
+      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
+      // slightly above 2^(8 + bit depth), due to rounding in the value of
+      // one_by_x[25-1].
+      //
+      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
+      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
+      // overflow), without significantly affecting the final result: z == 0
+      // implies that the image is essentially "flat", so the local mean and
+      // individual pixel values are very similar.
+      //
+      // Note that saturating on the other side, ie. requring A[k] <= 255,
+      // would be a bad idea, as that corresponds to the case where the image
+      // is very variable, when we want to preserve the local pixel value as
+      // much as possible.
+      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
+
+      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
       // one_by_x[n - 1] = round(2^12 / n)
       // => the product here is < 2^(20 + bit_depth) <= 2^32,
       // and B[k] is set to a value < 2^(8 + bit depth)
+      // This holds even with the rounding in one_by_x and in the overall
+      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
       B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
                                              (uint32_t)B[k] *
                                              (uint32_t)one_by_x[n - 1],
                                          SGRPROJ_RECIP_BITS);
     }
   }
-  i = 0;
-  j = 0;
-  {
-    const int k = i * buf_stride + j;
-    const int l = i * dgd_stride + j;
-    const int m = i * dst_stride + j;
-    const int nb = 3;
-    const int32_t a =
-        3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + A[k + buf_stride + 1];
-    const int32_t b =
-        3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + B[k + buf_stride + 1];
-    const int32_t v = a * dgd[l] + b;
-    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-  }
-  i = 0;
-  j = width - 1;
-  {
-    const int k = i * buf_stride + j;
-    const int l = i * dgd_stride + j;
-    const int m = i * dst_stride + j;
-    const int nb = 3;
-    const int32_t a =
-        3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + A[k + buf_stride - 1];
-    const int32_t b =
-        3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + B[k + buf_stride - 1];
-    const int32_t v = a * dgd[l] + b;
-    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-  }
-  i = height - 1;
-  j = 0;
-  {
-    const int k = i * buf_stride + j;
-    const int l = i * dgd_stride + j;
-    const int m = i * dst_stride + j;
-    const int nb = 3;
-    const int32_t a =
-        3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + A[k - buf_stride + 1];
-    const int32_t b =
-        3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + B[k - buf_stride + 1];
-    const int32_t v = a * dgd[l] + b;
-    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-  }
-  i = height - 1;
-  j = width - 1;
-  {
-    const int k = i * buf_stride + j;
-    const int l = i * dgd_stride + j;
-    const int m = i * dst_stride + j;
-    const int nb = 3;
-    const int32_t a =
-        3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + A[k - buf_stride - 1];
-    const int32_t b =
-        3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + B[k - buf_stride - 1];
-    const int32_t v = a * dgd[l] + b;
-    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-  }
-  i = 0;
-  for (j = 1; j < width - 1; ++j) {
-    const int k = i * buf_stride + j;
-    const int l = i * dgd_stride + j;
-    const int m = i * dst_stride + j;
-    const int nb = 3;
-    const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
-                      A[k + buf_stride - 1] + A[k + buf_stride + 1];
-    const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
-                      B[k + buf_stride - 1] + B[k + buf_stride + 1];
-    const int32_t v = a * dgd[l] + b;
-    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-  }
-  i = height - 1;
-  for (j = 1; j < width - 1; ++j) {
-    const int k = i * buf_stride + j;
-    const int l = i * dgd_stride + j;
-    const int m = i * dst_stride + j;
-    const int nb = 3;
-    const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
-                      A[k - buf_stride - 1] + A[k - buf_stride + 1];
-    const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
-                      B[k - buf_stride - 1] + B[k - buf_stride + 1];
-    const int32_t v = a * dgd[l] + b;
-    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-  }
-  j = 0;
-  for (i = 1; i < height - 1; ++i) {
-    const int k = i * buf_stride + j;
-    const int l = i * dgd_stride + j;
-    const int m = i * dst_stride + j;
-    const int nb = 3;
-    const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
-                      A[k + 1] + A[k - buf_stride + 1] + A[k + buf_stride + 1];
-    const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
-                      B[k + 1] + B[k - buf_stride + 1] + B[k + buf_stride + 1];
-    const int32_t v = a * dgd[l] + b;
-    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+  // Use the A[] and B[] arrays to calculate the filtered image
+  assert(r == 2);
+  for (i = 0; i < height; ++i) {
+    if (!(i & 1)) {  // even row
+      for (j = 0; j < width; ++j) {
+        const int k = i * buf_stride + j;
+        const int l = i * dgd_stride + j;
+        const int m = i * dst_stride + j;
+        const int nb = 5;
+        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
+                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+                              5;
+        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
+                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+                              5;
+        const int32_t v = a * dgd[l] + b;
+        dst[m] =
+            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+      }
+    } else {  // odd row
+      for (j = 0; j < width; ++j) {
+        const int k = i * buf_stride + j;
+        const int l = i * dgd_stride + j;
+        const int m = i * dst_stride + j;
+        const int nb = 4;
+        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
+        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
+        const int32_t v = a * dgd[l] + b;
+        dst[m] =
+            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+      }
+    }
   }
-  j = width - 1;
-  for (i = 1; i < height - 1; ++i) {
-    const int k = i * buf_stride + j;
-    const int l = i * dgd_stride + j;
-    const int m = i * dst_stride + j;
-    const int nb = 3;
-    const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
-                      A[k - 1] + A[k - buf_stride - 1] + A[k + buf_stride - 1];
-    const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
-                      B[k - 1] + B[k - buf_stride - 1] + B[k + buf_stride - 1];
-    const int32_t v = a * dgd[l] + b;
-    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+}
+
+static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
+                                            int dgd_stride, int32_t *dst,
+                                            int dst_stride, int bit_depth,
+                                            int sgr_params_idx,
+                                            int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+  // Adjusting the stride of A and B here appears to avoid bad cache effects,
+  // leading to a significant speed improvement.
+  // We also align the stride to a multiple of 16 bytes, for consistency
+  // with the SIMD version of this function.
+  int buf_stride = ((width_ext + 3) & ~3) + 16;
+  int32_t A_[RESTORATION_PROC_UNIT_PELS];
+  int32_t B_[RESTORATION_PROC_UNIT_PELS];
+  int32_t *A = A_;
+  int32_t *B = B_;
+  int i, j;
+
+  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+         "Need SGRPROJ_BORDER_* >= r+1");
+
+  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
+  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
+  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
+  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
+  for (i = -1; i < height + 1; ++i) {
+    for (j = -1; j < width + 1; ++j) {
+      const int k = i * buf_stride + j;
+      const int n = (2 * r + 1) * (2 * r + 1);
+
+      // a < 2^16 * n < 2^22 regardless of bit depth
+      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
+      // b < 2^8 * n < 2^14 regardless of bit depth
+      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
+
+      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+      // and p itself satisfies p < 2^14 * n^2 < 2^26.
+      // This bound on p is due to:
+      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+      //
+      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
+      // This is an artefact of rounding, and can only happen if all pixels
+      // are (almost) identical, so in this case we saturate to p=0.
+      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
+
+      const uint32_t s = params->s[radius_idx];
+
+      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
+      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
+      // (this holds even after accounting for the rounding in s)
+      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
+
+      // Note: We have to be quite careful about the value of A[k].
+      // This is used as a blend factor between individual pixel values and the
+      // local mean. So it logically has a range of [0, 256], including both
+      // endpoints.
+      //
+      // This is a pain for hardware, as we'd like something which can be stored
+      // in exactly 8 bits.
+      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
+      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
+      // slightly above 2^(8 + bit depth), due to rounding in the value of
+      // one_by_x[25-1].
+      //
+      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
+      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
+      // overflow), without significantly affecting the final result: z == 0
+      // implies that the image is essentially "flat", so the local mean and
+      // individual pixel values are very similar.
+      //
+      // Note that saturating on the other side, ie. requring A[k] <= 255,
+      // would be a bad idea, as that corresponds to the case where the image
+      // is very variable, when we want to preserve the local pixel value as
+      // much as possible.
+      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
+
+      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
+      // one_by_x[n - 1] = round(2^12 / n)
+      // => the product here is < 2^(20 + bit_depth) <= 2^32,
+      // and B[k] is set to a value < 2^(8 + bit depth)
+      // This holds even with the rounding in one_by_x and in the overall
+      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
+      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
+                                             (uint32_t)B[k] *
+                                             (uint32_t)one_by_x[n - 1],
+                                         SGRPROJ_RECIP_BITS);
+    }
   }
-  for (i = 1; i < height - 1; ++i) {
-    for (j = 1; j < width - 1; ++j) {
+  // Use the A[] and B[] arrays to calculate the filtered image
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
       const int k = i * buf_stride + j;
       const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
@@ -962,968 +911,697 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
   }
 }
 
-void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
-                                  int stride, int32_t *dst, int dst_stride,
-                                  int r, int eps) {
+void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+                                  int dgd_stride, int32_t *flt0, int32_t *flt1,
+                                  int flt_stride, int sgr_params_idx,
+                                  int bit_depth, int highbd) {
   int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
   const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
   int32_t *dgd32 =
       dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
-  int i, j;
-  for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
-    for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
-      dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
-    }
-  }
-  av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
-                                      dst_stride, 8, r, eps);
-}
 
-void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride,
-                           int32_t *dst, int dst_stride, int corner, int edge) {
-  int i, j;
-  const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
-
-  i = 0;
-  j = 0;
-  {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] =
-        center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
-        corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
-  }
-  i = 0;
-  j = width - 1;
-  {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] =
-        center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
-        corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
-  }
-  i = height - 1;
-  j = 0;
-  {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] =
-        center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
-        corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
-  }
-  i = height - 1;
-  j = width - 1;
-  {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] =
-        center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
-        corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
-  }
-  i = 0;
-  for (j = 1; j < width - 1; ++j) {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] = center * dgd[k] +
-             edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
-             corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
-                       dgd[k + 1]);
-  }
-  i = height - 1;
-  for (j = 1; j < width - 1; ++j) {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] = center * dgd[k] +
-             edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
-             corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
-                       dgd[k + 1]);
-  }
-  j = 0;
-  for (i = 1; i < height - 1; ++i) {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] = center * dgd[k] +
-             edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
-             corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
-                       dgd[k - stride] + dgd[k + stride]);
-  }
-  j = width - 1;
-  for (i = 1; i < height - 1; ++i) {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] = center * dgd[k] +
-             edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
-             corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
-                       dgd[k - stride] + dgd[k + stride]);
-  }
-  for (i = 1; i < height - 1; ++i) {
-    for (j = 1; j < width - 1; ++j) {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] +
-          edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
-          corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
-                    dgd[k - stride + 1] + dgd[k + stride + 1]);
+  if (highbd) {
+    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
+    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
+      }
+    }
+  } else {
+    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
+      }
     }
   }
+
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  // If params->r == 0 we skip the corresponding filter. We only allow one of
+  // the radii to be 0, as having both equal to 0 would be equivalent to
+  // skipping SGR entirely.
+  assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+  if (params->r[0] > 0)
+    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
+                                         flt0, flt_stride, bit_depth,
+                                         sgr_params_idx, 0);
+  if (params->r[1] > 0)
+    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
+                                    flt_stride, bit_depth, sgr_params_idx, 1);
 }
 
-void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
-                                    int stride, int eps, int *xqd, uint8_t *dst,
-                                    int dst_stride, int32_t *tmpbuf) {
+void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
+                                    int stride, int eps, const int *xqd,
+                                    uint8_t *dst8, int dst_stride,
+                                    int32_t *tmpbuf, int bit_depth,
+                                    int highbd) {
+  int32_t *flt0 = tmpbuf;
+  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+  assert(width * height <= RESTORATION_UNITPELS_MAX);
+
+  av1_selfguided_restoration_c(dat8, width, height, stride, flt0, flt1, width,
+                               eps, bit_depth, highbd);
+  const sgr_params_type *const params = &sgr_params[eps];
   int xq[2];
-  int32_t *flt1 = tmpbuf;
-  int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
-  int i, j;
-  assert(width * height <= RESTORATION_TILEPELS_MAX);
-#if USE_HIGHPASS_IN_SGRPROJ
-  av1_highpass_filter_c(dat, width, height, stride, flt1, width,
-                        sgr_params[eps].corner, sgr_params[eps].edge);
-#else
-  av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
-                               sgr_params[eps].r1, sgr_params[eps].e1);
-#endif  // USE_HIGHPASS_IN_SGRPROJ
-  av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
-                               sgr_params[eps].r2, sgr_params[eps].e2);
-  decode_xq(xqd, xq);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
+  decode_xq(xqd, xq, params);
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
       const int k = i * width + j;
-      const int l = i * stride + j;
-      const int m = i * dst_stride + j;
-      const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
-      const int32_t f1 = (int32_t)flt1[k] - u;
-      const int32_t f2 = (int32_t)flt2[k] - u;
-      const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+      uint8_t *dst8ij = dst8 + i * dst_stride + j;
+      const uint8_t *dat8ij = dat8 + i * stride + j;
+
+      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
+      const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
+      int32_t v = u << SGRPROJ_PRJ_BITS;
+      // If params->r == 0 then we skipped the filtering in
+      // av1_selfguided_restoration_c, i.e. flt[k] == u
+      if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
+      if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
       const int16_t w =
           (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
-      dst[m] = clip_pixel(w);
-    }
-  }
-}
 
-static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
-                                     int height, int stride,
-                                     RestorationInternal *rst, uint8_t *dst,
-                                     int dst_stride) {
-  const int procunit_width = rst->rsi->procunit_width;
-#if CONFIG_STRIPED_LOOP_RESTORATION
-  int procunit_height;
-#else
-  const int procunit_height = rst->rsi->procunit_height;
-#endif
-  const int tile_width = rst->tile_width;
-  const int tile_height = rst->tile_height;
-  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
-    loop_copy_tile(data, tile_idx, width, height, stride, rst, dst, dst_stride);
-    return;
-  }
-  RestorationTileLimits limits =
-      av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
-#if CONFIG_STRIPED_LOOP_RESTORATION
-                               tile_height, width, height, rst->subsampling_y);
-#else
-                               tile_height, width, height);
-#endif
-  for (int i = limits.v_start; i < limits.v_end; i += procunit_height) {
-#if CONFIG_STRIPED_LOOP_RESTORATION
-    int h = setup_processing_stripe_boundary(
-        i, limits.v_end, limits.h_start, limits.h_end, data, stride, rst, 0);
-    procunit_height = h;
-#else
-    int h = AOMMIN(procunit_height, limits.v_end - i);
-#endif
-    for (int j = limits.h_start; j < limits.h_end; j += procunit_width) {
-      int w = AOMMIN(procunit_width, limits.h_end - j);
-      uint8_t *data_p = data + i * stride + j;
-      uint8_t *dst_p = dst + i * dst_stride + j;
-      apply_selfguided_restoration(
-          data_p, w, h, stride, rst->rsi->sgrproj_info[tile_idx].ep,
-          rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
+      const uint16_t out = clip_pixel_highbd(w, bit_depth);
+      if (highbd)
+        *CONVERT_TO_SHORTPTR(dst8ij) = out;
+      else
+        *dst8ij = (uint8_t)out;
     }
-#if CONFIG_STRIPED_LOOP_RESTORATION
-    restore_processing_stripe_boundary(i, limits.v_end, limits.h_start,
-                                       limits.h_end, data, stride, rst, 0);
-#endif
   }
 }
 
-static void loop_sgrproj_filter(uint8_t *data, int width, int height,
-                                int stride, RestorationInternal *rst,
-                                uint8_t *dst, int dst_stride) {
-  int tile_idx;
-  extend_frame(data, width, height, stride, SGRPROJ_BORDER_HORZ,
-               SGRPROJ_BORDER_VERT);
-  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
-    loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst,
-                             dst_stride);
+static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
+                                  int stripe_width, int stripe_height,
+                                  int procunit_width, const uint8_t *src,
+                                  int src_stride, uint8_t *dst, int dst_stride,
+                                  int32_t *tmpbuf, int bit_depth) {
+  (void)bit_depth;
+  assert(bit_depth == 8);
+
+  for (int j = 0; j < stripe_width; j += procunit_width) {
+    int w = AOMMIN(procunit_width, stripe_width - j);
+    apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
+                                 rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
+                                 dst + j, dst_stride, tmpbuf, bit_depth, 0);
   }
 }
 
-static void loop_switchable_filter(uint8_t *data, int width, int height,
-                                   int stride, RestorationInternal *rst,
-                                   uint8_t *dst, int dst_stride) {
-  int tile_idx;
-  extend_frame(data, width, height, stride, RESTORATION_BORDER_HORZ,
-               RESTORATION_BORDER_VERT);
-  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
-    if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
-      loop_copy_tile(data, tile_idx, width, height, stride, rst, dst,
-                     dst_stride);
-    } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_WIENER) {
-      loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst,
-                              dst_stride);
-    } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ) {
-      loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst,
-                               dst_stride);
-    }
+static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
+                                        int stripe_width, int stripe_height,
+                                        int procunit_width, const uint8_t *src8,
+                                        int src_stride, uint8_t *dst8,
+                                        int dst_stride, int32_t *tmpbuf,
+                                        int bit_depth) {
+  (void)tmpbuf;
+  const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
+
+  for (int j = 0; j < stripe_width; j += procunit_width) {
+    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
+    const uint8_t *src8_p = src8 + j;
+    uint8_t *dst8_p = dst8 + j;
+    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
+                                       rui->wiener_info.hfilter, 16,
+                                       rui->wiener_info.vfilter, 16, w,
+                                       stripe_height, &conv_params, bit_depth);
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
-void extend_frame_highbd(uint16_t *data, int width, int height, int stride,
-                         int border_horz, int border_vert) {
-  uint16_t *data_p;
-  int i, j;
-  for (i = 0; i < height; ++i) {
-    data_p = data + i * stride;
-    for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
-    for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
-  }
-  data_p = data - border_horz;
-  for (i = -border_vert; i < 0; ++i) {
-    memcpy(data_p + i * stride, data_p,
-           (width + 2 * border_horz) * sizeof(uint16_t));
-  }
-  for (i = height; i < height + border_vert; ++i) {
-    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
-           (width + 2 * border_horz) * sizeof(uint16_t));
+static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
+                                         int stripe_width, int stripe_height,
+                                         int procunit_width,
+                                         const uint8_t *src8, int src_stride,
+                                         uint8_t *dst8, int dst_stride,
+                                         int32_t *tmpbuf, int bit_depth) {
+  for (int j = 0; j < stripe_width; j += procunit_width) {
+    int w = AOMMIN(procunit_width, stripe_width - j);
+    apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
+                                 rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
+                                 dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
   }
 }
 
-static void loop_copy_tile_highbd(uint16_t *data, int tile_idx, int width,
-                                  int height, int stride,
-                                  RestorationInternal *rst, uint16_t *dst,
-                                  int dst_stride) {
-  const int tile_width = rst->tile_width;
-  const int tile_height = rst->tile_height;
-  RestorationTileLimits limits =
-      av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
-#if CONFIG_STRIPED_LOOP_RESTORATION
-                               tile_height, width, height, rst->subsampling_y);
-#else
-                               tile_height, width, height);
-#endif
-  for (int i = limits.v_start; i < limits.v_end; ++i)
-    memcpy(dst + i * dst_stride + limits.h_start,
-           data + i * stride + limits.h_start,
-           (limits.h_end - limits.h_start) * sizeof(*dst));
-}
+typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
+                                  int stripe_width, int stripe_height,
+                                  int procunit_width, const uint8_t *src,
+                                  int src_stride, uint8_t *dst, int dst_stride,
+                                  int32_t *tmpbuf, int bit_depth);
 
-static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
-                                           int width, int height, int stride,
-                                           RestorationInternal *rst,
-                                           int bit_depth, uint16_t *dst,
-                                           int dst_stride) {
-  const int procunit_width = rst->rsi->procunit_width;
-#if CONFIG_STRIPED_LOOP_RESTORATION
-  int procunit_height;
-#else
-  const int procunit_height = rst->rsi->procunit_height;
-#endif
-  const int tile_width = rst->tile_width;
-  const int tile_height = rst->tile_height;
+#define NUM_STRIPE_FILTERS 4
+
+static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
+  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
+  sgrproj_filter_stripe_highbd
+};
 
-  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
-    loop_copy_tile_highbd(data, tile_idx, width, height, stride, rst, dst,
-                          dst_stride);
+// Filter one restoration unit
+void av1_loop_restoration_filter_unit(
+    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+    const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
+    int dst_stride, int32_t *tmpbuf, int optimized_lr) {
+  RestorationType unit_rtype = rui->restoration_type;
+
+  int unit_h = limits->v_end - limits->v_start;
+  int unit_w = limits->h_end - limits->h_start;
+  uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
+  uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
+
+  if (unit_rtype == RESTORE_NONE) {
+    copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
     return;
   }
-  RestorationTileLimits limits =
-      av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
-#if CONFIG_STRIPED_LOOP_RESTORATION
-                               tile_height, width, height, rst->subsampling_y);
-#else
-                               tile_height, width, height);
-#endif
-  InterpKernel vertical_topbot;
-
-  // Convolve the whole tile (done in blocks here to match the requirements
-  // of the vectorized convolve functions, but the result is equivalent)
-  for (int i = limits.v_start; i < limits.v_end; i += procunit_height) {
-#if CONFIG_STRIPED_LOOP_RESTORATION
-    int h = setup_processing_stripe_boundary(i, limits.v_end, limits.h_start,
-                                             limits.h_end, (uint8_t *)data,
-                                             stride, rst, 1);
-    h = ALIGN_POWER_OF_TWO(h, 1);
-    procunit_height = h;
-#else
-    int h = AOMMIN(procunit_height, (limits.v_end - i + 15) & ~15);
-#endif
-    for (int j = limits.h_start; j < limits.h_end; j += procunit_width) {
-      int w = AOMMIN(procunit_width, (limits.h_end - j + 15) & ~15);
-      const uint16_t *data_p = data + i * stride + j;
-      uint16_t *dst_p = dst + i * dst_stride + j;
-      // Note h is at least 16
-      for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
-        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
-                               vertical_topbot, WIENER_BORDER_VERT + b, 1);
-#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-        aom_highbd_convolve8_add_src_hip(
-            CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
-            dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
-            vertical_topbot, 16, w, 1, bit_depth);
-#else
-        aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
-                                     CONVERT_TO_BYTEPTR(dst_p), dst_stride,
-                                     rst->rsi->wiener_info[tile_idx].hfilter,
-                                     16, vertical_topbot, 16, w, 1, bit_depth);
-#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-        data_p += stride;
-        dst_p += dst_stride;
-      }
-#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      aom_highbd_convolve8_add_src_hip(
-          CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
-          dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
-          rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
-          h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2, bit_depth);
-#else
-      aom_highbd_convolve8_add_src(
-          CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
-          dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
-          rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
-          h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2, bit_depth);
-#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      data_p += stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
-      dst_p += dst_stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
-      for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) {
-        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
-                               vertical_topbot, WIENER_BORDER_VERT + b, 0);
-#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-        aom_highbd_convolve8_add_src_hip(
-            CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
-            dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
-            vertical_topbot, 16, w, 1, bit_depth);
-#else
-        aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
-                                     CONVERT_TO_BYTEPTR(dst_p), dst_stride,
-                                     rst->rsi->wiener_info[tile_idx].hfilter,
-                                     16, vertical_topbot, 16, w, 1, bit_depth);
-#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-        data_p += stride;
-        dst_p += dst_stride;
-      }
-    }
-#if CONFIG_STRIPED_LOOP_RESTORATION
-    restore_processing_stripe_boundary(i, limits.v_end, limits.h_start,
-                                       limits.h_end, (uint8_t *)data, stride,
-                                       rst, 1);
-#endif
+
+  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
+  assert(filter_idx < NUM_STRIPE_FILTERS);
+  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
+
+  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
+
+  // Convolve the whole tile one stripe at a time
+  RestorationTileLimits remaining_stripes = *limits;
+  int i = 0;
+  while (i < unit_h) {
+    int copy_above, copy_below;
+    remaining_stripes.v_start = limits->v_start + i;
+
+    get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
+                             &copy_below);
+
+    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
+
+    // Work out where this stripe's boundaries are within
+    // rsb->stripe_boundary_{above,below}
+    const int tile_stripe =
+        (remaining_stripes.v_start - tile_rect->top + runit_offset) /
+        full_stripe_height;
+    const int frame_stripe = tile_stripe0 + tile_stripe;
+    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
+
+    // Calculate this stripe's height, based on two rules:
+    // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
+    // * We can't extend past the end of the current restoration unit
+    const int nominal_stripe_height =
+        full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
+    const int h = AOMMIN(nominal_stripe_height,
+                         remaining_stripes.v_end - remaining_stripes.v_start);
+
+    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
+                                     h, data8, stride, rlbs, copy_above,
+                                     copy_below, optimized_lr);
+
+    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
+                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
+
+    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
+                                       data8, stride, copy_above, copy_below,
+                                       optimized_lr);
+
+    i += h;
   }
 }
 
-static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
-                                      int stride, RestorationInternal *rst,
-                                      int bit_depth, uint8_t *dst8,
-                                      int dst_stride) {
-  uint16_t *data = CONVERT_TO_SHORTPTR(data8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  int tile_idx;
-  extend_frame_highbd(data, width, height, stride, WIENER_BORDER_HORZ,
-                      WIENER_BORDER_VERT);
-  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
-    loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
-                                   bit_depth, dst, dst_stride);
-  }
+static void filter_frame_on_tile(int tile_row, int tile_col, void *priv,
+                                 AV1_COMMON *cm) {
+  (void)tile_col;
+  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
+  ctxt->tile_stripe0 = (tile_row == 0) ? 0 : cm->rst_end_stripe[tile_row - 1];
 }
 
-void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height,
-                                         int stride, int32_t *dst,
-                                         int dst_stride, int bit_depth, int r,
-                                         int eps) {
-  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
-  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
-  int32_t *dgd32 =
-      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
-  int i, j;
-  for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
-    for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
-      dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
-    }
-  }
-  av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
-                                      dst_stride, bit_depth, r, eps);
+static void filter_frame_on_unit(const RestorationTileLimits *limits,
+                                 const AV1PixelRect *tile_rect,
+                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
+                                 RestorationLineBuffers *rlbs) {
+  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
+  const RestorationInfo *rsi = ctxt->rsi;
+
+  av1_loop_restoration_filter_unit(
+      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect,
+      ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth,
+      ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf,
+      rsi->optimized_lr);
 }
 
-void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height,
-                                  int stride, int32_t *dst, int dst_stride,
-                                  int corner, int edge) {
-  int i, j;
-  const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
-
-  i = 0;
-  j = 0;
-  {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] =
-        center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
-        corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
-  }
-  i = 0;
-  j = width - 1;
-  {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] =
-        center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
-        corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
-  }
-  i = height - 1;
-  j = 0;
-  {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] =
-        center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
-        corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
-  }
-  i = height - 1;
-  j = width - 1;
-  {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] =
-        center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
-        corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
-  }
-  i = 0;
-  for (j = 1; j < width - 1; ++j) {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] = center * dgd[k] +
-             edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
-             corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
-                       dgd[k + 1]);
-  }
-  i = height - 1;
-  for (j = 1; j < width - 1; ++j) {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] = center * dgd[k] +
-             edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
-             corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
-                       dgd[k + 1]);
-  }
-  j = 0;
-  for (i = 1; i < height - 1; ++i) {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] = center * dgd[k] +
-             edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
-             corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
-                       dgd[k - stride] + dgd[k + stride]);
-  }
-  j = width - 1;
-  for (i = 1; i < height - 1; ++i) {
-    const int k = i * stride + j;
-    const int l = i * dst_stride + j;
-    dst[l] = center * dgd[k] +
-             edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
-             corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
-                       dgd[k - stride] + dgd[k + stride]);
-  }
-  for (i = 1; i < height - 1; ++i) {
-    for (j = 1; j < width - 1; ++j) {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] +
-          edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
-          corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
-                    dgd[k - stride + 1] + dgd[k + stride + 1]);
+void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
+                                            YV12_BUFFER_CONFIG *frame,
+                                            AV1_COMMON *cm, int optimized_lr,
+                                            int num_planes) {
+  const int bit_depth = cm->bit_depth;
+  const int highbd = cm->use_highbitdepth;
+  lr_ctxt->dst = &cm->rst_frame;
+
+  const int frame_width = frame->crop_widths[0];
+  const int frame_height = frame->crop_heights[0];
+  if (aom_realloc_frame_buffer(lr_ctxt->dst, frame_width, frame_height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                               cm->byte_alignment, NULL, NULL, NULL) < 0)
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate restoration dst buffer");
+
+  lr_ctxt->on_rest_unit = filter_frame_on_unit;
+  lr_ctxt->frame = frame;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    RestorationInfo *rsi = &cm->rst_info[plane];
+    RestorationType rtype = rsi->frame_restoration_type;
+    rsi->optimized_lr = optimized_lr;
+
+    if (rtype == RESTORE_NONE) {
+      continue;
     }
+
+    const int is_uv = plane > 0;
+    const int plane_width = frame->crop_widths[is_uv];
+    const int plane_height = frame->crop_heights[is_uv];
+    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
+
+    extend_frame(frame->buffers[plane], plane_width, plane_height,
+                 frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER,
+                 highbd);
+
+    lr_plane_ctxt->rsi = rsi;
+    lr_plane_ctxt->ss_x = is_uv && cm->subsampling_x;
+    lr_plane_ctxt->ss_y = is_uv && cm->subsampling_y;
+    lr_plane_ctxt->highbd = highbd;
+    lr_plane_ctxt->bit_depth = bit_depth;
+    lr_plane_ctxt->data8 = frame->buffers[plane];
+    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
+    lr_plane_ctxt->data_stride = frame->strides[is_uv];
+    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
+    lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
+    filter_frame_on_tile(LR_TILE_ROW, LR_TILE_COL, lr_plane_ctxt, cm);
   }
 }
 
-void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height,
-                                           int stride, int bit_depth, int eps,
-                                           int *xqd, uint16_t *dst,
-                                           int dst_stride, int32_t *tmpbuf) {
-  int xq[2];
-  int32_t *flt1 = tmpbuf;
-  int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
-  int i, j;
-  assert(width * height <= RESTORATION_TILEPELS_MAX);
-#if USE_HIGHPASS_IN_SGRPROJ
-  av1_highpass_filter_highbd_c(dat, width, height, stride, flt1, width,
-                               sgr_params[eps].corner, sgr_params[eps].edge);
-#else
-  av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt1, width,
-                                      bit_depth, sgr_params[eps].r1,
-                                      sgr_params[eps].e1);
-#endif  // USE_HIGHPASS_IN_SGRPROJ
-  av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt2, width,
-                                      bit_depth, sgr_params[eps].r2,
-                                      sgr_params[eps].e2);
-  decode_xq(xqd, xq);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      const int k = i * width + j;
-      const int l = i * stride + j;
-      const int m = i * dst_stride + j;
-      const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
-      const int32_t f1 = (int32_t)flt1[k] - u;
-      const int32_t f2 = (int32_t)flt2[k] - u;
-      const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
-      const int16_t w =
-          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
-      dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
-    }
+void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
+                                      AV1_COMMON *cm, int num_planes) {
+  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
+                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
+                           int vstart, int vend);
+  static const copy_fun copy_funs[3] = {
+    aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v
+  };
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+    AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
+    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
+                     tile_rect.right, tile_rect.top, tile_rect.bottom);
   }
 }
 
-static void loop_sgrproj_filter_tile_highbd(uint16_t *data, int tile_idx,
-                                            int width, int height, int stride,
-                                            RestorationInternal *rst,
-                                            int bit_depth, uint16_t *dst,
-                                            int dst_stride) {
-  const int procunit_width = rst->rsi->procunit_width;
-#if CONFIG_STRIPED_LOOP_RESTORATION
-  int procunit_height;
-#else
-  const int procunit_height = rst->rsi->procunit_height;
-#endif
-  const int tile_width = rst->tile_width;
-  const int tile_height = rst->tile_height;
+static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
+                                        int num_planes) {
+  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
 
-  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
-    loop_copy_tile_highbd(data, tile_idx, width, height, stride, rst, dst,
-                          dst_stride);
-    return;
-  }
-  RestorationTileLimits limits =
-      av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
-#if CONFIG_STRIPED_LOOP_RESTORATION
-                               tile_height, width, height, rst->subsampling_y);
-#else
-                               tile_height, width, height);
-#endif
-  for (int i = limits.v_start; i < limits.v_end; i += procunit_height) {
-#if CONFIG_STRIPED_LOOP_RESTORATION
-    int h = setup_processing_stripe_boundary(i, limits.v_end, limits.h_start,
-                                             limits.h_end, (uint8_t *)data,
-                                             stride, rst, 1);
-    procunit_height = h;
-#else
-    int h = AOMMIN(procunit_height, limits.v_end - i);
-#endif
-    for (int j = limits.h_start; j < limits.h_end; j += procunit_width) {
-      int w = AOMMIN(procunit_width, limits.h_end - j);
-      uint16_t *data_p = data + i * stride + j;
-      uint16_t *dst_p = dst + i * dst_stride + j;
-      apply_selfguided_restoration_highbd(
-          data_p, w, h, stride, bit_depth, rst->rsi->sgrproj_info[tile_idx].ep,
-          rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
+      continue;
     }
-#if CONFIG_STRIPED_LOOP_RESTORATION
-    restore_processing_stripe_boundary(i, limits.v_end, limits.h_start,
-                                       limits.h_end, (uint8_t *)data, stride,
-                                       rst, 1);
-#endif
-  }
-}
 
-static void loop_sgrproj_filter_highbd(uint8_t *data8, int width, int height,
-                                       int stride, RestorationInternal *rst,
-                                       int bit_depth, uint8_t *dst8,
-                                       int dst_stride) {
-  int tile_idx;
-  uint16_t *data = CONVERT_TO_SHORTPTR(data8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  extend_frame_highbd(data, width, height, stride, SGRPROJ_BORDER_HORZ,
-                      SGRPROJ_BORDER_VERT);
-  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
-    loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
-                                    bit_depth, dst, dst_stride);
+    av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
+                                   &ctxt[plane], &ctxt[plane].tile_rect,
+                                   cm->rst_tmpbuf, cm->rlbs);
   }
 }
 
-static void loop_switchable_filter_highbd(uint8_t *data8, int width, int height,
-                                          int stride, RestorationInternal *rst,
-                                          int bit_depth, uint8_t *dst8,
-                                          int dst_stride) {
-  uint16_t *data = CONVERT_TO_SHORTPTR(data8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  int tile_idx;
-  extend_frame_highbd(data, width, height, stride, RESTORATION_BORDER_HORZ,
-                      RESTORATION_BORDER_VERT);
-  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
-    if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
-      loop_copy_tile_highbd(data, tile_idx, width, height, stride, rst, dst,
-                            dst_stride);
-    } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_WIENER) {
-      loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
-                                     bit_depth, dst, dst_stride);
-    } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ) {
-      loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride,
-                                      rst, bit_depth, dst, dst_stride);
-    }
-  }
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+                                       AV1_COMMON *cm, int optimized_lr,
+                                       void *lr_ctxt) {
+  assert(!cm->all_lossless);
+  const int num_planes = av1_num_planes(cm);
+
+  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
+
+  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
+                                         optimized_lr, num_planes);
+
+  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
+
+  av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
 }
-#endif  // CONFIG_HIGHBITDEPTH
-
-static void loop_restoration_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                                  int start_mi_row, int end_mi_row,
-                                  int components_pattern, RestorationInfo *rsi,
-                                  YV12_BUFFER_CONFIG *dst) {
-  const int ywidth = frame->y_crop_width;
-  const int yheight = frame->y_crop_height;
-  const int uvwidth = frame->uv_crop_width;
-  const int uvheight = frame->uv_crop_height;
-  const int ystride = frame->y_stride;
-  const int uvstride = frame->uv_stride;
-  const int ystart = start_mi_row << MI_SIZE_LOG2;
-  const int uvstart = ystart >> cm->subsampling_y;
-  int yend = end_mi_row << MI_SIZE_LOG2;
-  int uvend = yend >> cm->subsampling_y;
-  restore_func_type restore_funcs[RESTORE_TYPES] = {
-    NULL, loop_wiener_filter, loop_sgrproj_filter, loop_switchable_filter
-  };
-#if CONFIG_HIGHBITDEPTH
-  restore_func_highbd_type restore_funcs_highbd[RESTORE_TYPES] = {
-    NULL, loop_wiener_filter_highbd, loop_sgrproj_filter_highbd,
-    loop_switchable_filter_highbd
-  };
-#endif  // CONFIG_HIGHBITDEPTH
-  restore_func_type restore_func;
-#if CONFIG_HIGHBITDEPTH
-  restore_func_highbd_type restore_func_highbd;
-#endif  // CONFIG_HIGHBITDEPTH
-  YV12_BUFFER_CONFIG dst_;
-
-  yend = AOMMIN(yend, yheight);
-  uvend = AOMMIN(uvend, uvheight);
-  if (components_pattern == (1 << AOM_PLANE_Y)) {
-    // Only y
-    if (rsi[0].frame_restoration_type == RESTORE_NONE) {
-      if (dst) aom_yv12_copy_y(frame, dst);
-      return;
-    }
-  } else if (components_pattern == (1 << AOM_PLANE_U)) {
-    // Only U
-    if (rsi[1].frame_restoration_type == RESTORE_NONE) {
-      if (dst) aom_yv12_copy_u(frame, dst);
-      return;
-    }
-  } else if (components_pattern == (1 << AOM_PLANE_V)) {
-    // Only V
-    if (rsi[2].frame_restoration_type == RESTORE_NONE) {
-      if (dst) aom_yv12_copy_v(frame, dst);
-      return;
-    }
-  } else if (components_pattern ==
-             ((1 << AOM_PLANE_Y) | (1 << AOM_PLANE_U) | (1 << AOM_PLANE_V))) {
-    // All components
-    if (rsi[0].frame_restoration_type == RESTORE_NONE &&
-        rsi[1].frame_restoration_type == RESTORE_NONE &&
-        rsi[2].frame_restoration_type == RESTORE_NONE) {
-      if (dst) aom_yv12_copy_frame(frame, dst);
-      return;
-    }
-  }
 
-  if (!dst) {
-    dst = &dst_;
-    memset(dst, 0, sizeof(YV12_BUFFER_CONFIG));
-    if (aom_realloc_frame_buffer(
-            dst, ywidth, yheight, cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-            cm->use_highbitdepth,
-#endif
-            AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL) < 0)
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                         "Failed to allocate restoration dst buffer");
-  }
+void av1_foreach_rest_unit_in_row(
+    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
+    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
+    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+    struct AV1LrSyncData *const lr_sync) {
+  const int tile_w = tile_rect->right - tile_rect->left;
+  const int ext_size = unit_size * 3 / 2;
+  int x0 = 0, j = 0;
+  while (x0 < tile_w) {
+    int remaining_w = tile_w - x0;
+    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
 
-  if ((components_pattern >> AOM_PLANE_Y) & 1) {
-    if (rsi[0].frame_restoration_type != RESTORE_NONE) {
-      cm->rst_internal.ntiles = av1_get_rest_ntiles(
-          ywidth, yheight, cm->rst_info[AOM_PLANE_Y].restoration_tilesize,
-          &cm->rst_internal.tile_width, &cm->rst_internal.tile_height,
-          &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles);
-      cm->rst_internal.rsi = &rsi[0];
-#if CONFIG_STRIPED_LOOP_RESTORATION
-      cm->rst_internal.component = AOM_PLANE_Y;
-      cm->rst_internal.subsampling_y = 0;
-#endif
-      restore_func =
-          restore_funcs[cm->rst_internal.rsi->frame_restoration_type];
-#if CONFIG_HIGHBITDEPTH
-      restore_func_highbd =
-          restore_funcs_highbd[cm->rst_internal.rsi->frame_restoration_type];
-      if (cm->use_highbitdepth)
-        restore_func_highbd(
-            frame->y_buffer + ystart * ystride, ywidth, yend - ystart, ystride,
-            &cm->rst_internal, cm->bit_depth,
-            dst->y_buffer + ystart * dst->y_stride, dst->y_stride);
-      else
-#endif  // CONFIG_HIGHBITDEPTH
-        restore_func(frame->y_buffer + ystart * ystride, ywidth, yend - ystart,
-                     ystride, &cm->rst_internal,
-                     dst->y_buffer + ystart * dst->y_stride, dst->y_stride);
-    } else {
-      aom_yv12_copy_y(frame, dst);
-    }
-  }
+    limits->h_start = tile_rect->left + x0;
+    limits->h_end = tile_rect->left + x0 + w;
+    assert(limits->h_end <= tile_rect->right);
 
-  if ((components_pattern >> AOM_PLANE_U) & 1) {
-    if (rsi[AOM_PLANE_U].frame_restoration_type != RESTORE_NONE) {
-      cm->rst_internal.ntiles = av1_get_rest_ntiles(
-          uvwidth, uvheight, cm->rst_info[AOM_PLANE_U].restoration_tilesize,
-          &cm->rst_internal.tile_width, &cm->rst_internal.tile_height,
-          &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles);
-      cm->rst_internal.rsi = &rsi[AOM_PLANE_U];
-#if CONFIG_STRIPED_LOOP_RESTORATION
-      cm->rst_internal.component = AOM_PLANE_U;
-      cm->rst_internal.subsampling_y = cm->subsampling_y;
-#endif
-      restore_func =
-          restore_funcs[cm->rst_internal.rsi->frame_restoration_type];
-#if CONFIG_HIGHBITDEPTH
-      restore_func_highbd =
-          restore_funcs_highbd[cm->rst_internal.rsi->frame_restoration_type];
-      if (cm->use_highbitdepth)
-        restore_func_highbd(
-            frame->u_buffer + uvstart * uvstride, uvwidth, uvend - uvstart,
-            uvstride, &cm->rst_internal, cm->bit_depth,
-            dst->u_buffer + uvstart * dst->uv_stride, dst->uv_stride);
-      else
-#endif  // CONFIG_HIGHBITDEPTH
-        restore_func(frame->u_buffer + uvstart * uvstride, uvwidth,
-                     uvend - uvstart, uvstride, &cm->rst_internal,
-                     dst->u_buffer + uvstart * dst->uv_stride, dst->uv_stride);
-    } else {
-      aom_yv12_copy_u(frame, dst);
-    }
-  }
+    const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
 
-  if ((components_pattern >> AOM_PLANE_V) & 1) {
-    if (rsi[AOM_PLANE_V].frame_restoration_type != RESTORE_NONE) {
-      cm->rst_internal.ntiles = av1_get_rest_ntiles(
-          uvwidth, uvheight, cm->rst_info[AOM_PLANE_V].restoration_tilesize,
-          &cm->rst_internal.tile_width, &cm->rst_internal.tile_height,
-          &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles);
-      cm->rst_internal.rsi = &rsi[AOM_PLANE_V];
-#if CONFIG_STRIPED_LOOP_RESTORATION
-      cm->rst_internal.component = AOM_PLANE_V;
-      cm->rst_internal.subsampling_y = cm->subsampling_y;
-#endif
-      restore_func =
-          restore_funcs[cm->rst_internal.rsi->frame_restoration_type];
-#if CONFIG_HIGHBITDEPTH
-      restore_func_highbd =
-          restore_funcs_highbd[cm->rst_internal.rsi->frame_restoration_type];
-      if (cm->use_highbitdepth)
-        restore_func_highbd(
-            frame->v_buffer + uvstart * uvstride, uvwidth, uvend - uvstart,
-            uvstride, &cm->rst_internal, cm->bit_depth,
-            dst->v_buffer + uvstart * dst->uv_stride, dst->uv_stride);
-      else
-#endif  // CONFIG_HIGHBITDEPTH
-        restore_func(frame->v_buffer + uvstart * uvstride, uvwidth,
-                     uvend - uvstart, uvstride, &cm->rst_internal,
-                     dst->v_buffer + uvstart * dst->uv_stride, dst->uv_stride);
-    } else {
-      aom_yv12_copy_v(frame, dst);
-    }
-  }
+    // No sync for even numbered rows
+    // For odd numbered rows, Loop Restoration of current block requires the LR
+    // of top-right and bottom-right blocks to be completed
+
+    // top-right sync
+    on_sync_read(lr_sync, row_number, j, plane);
+    if ((row_number + 1) < vunits_per_tile)
+      // bottom-right sync
+      on_sync_read(lr_sync, row_number + 2, j, plane);
+
+    on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
+
+    on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);
 
-  if (dst == &dst_) {
-    if ((components_pattern >> AOM_PLANE_Y) & 1) aom_yv12_copy_y(dst, frame);
-    if ((components_pattern >> AOM_PLANE_U) & 1) aom_yv12_copy_u(dst, frame);
-    if ((components_pattern >> AOM_PLANE_V) & 1) aom_yv12_copy_v(dst, frame);
-    aom_free_frame_buffer(dst);
+    x0 += w;
+    ++j;
   }
 }
 
-void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                                RestorationInfo *rsi, int components_pattern,
-                                int partial_frame, YV12_BUFFER_CONFIG *dst) {
-  int start_mi_row, end_mi_row, mi_rows_to_filter;
-  start_mi_row = 0;
-#if CONFIG_FRAME_SUPERRES
-  mi_rows_to_filter =
-      ALIGN_POWER_OF_TWO(cm->superres_upscaled_height, 3) >> MI_SIZE_LOG2;
-#else
-  mi_rows_to_filter = cm->mi_rows;
-#endif  // CONFIG_FRAME_SUPERRES
-  if (partial_frame && mi_rows_to_filter > 8) {
-    start_mi_row = mi_rows_to_filter >> 1;
-    start_mi_row &= 0xfffffff8;
-    mi_rows_to_filter = AOMMAX(mi_rows_to_filter / 8, 8);
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
+  (void)lr_sync;
+  (void)r;
+  (void)c;
+  (void)plane;
+}
+
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+                             const int sb_cols, int plane) {
+  (void)lr_sync;
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+  (void)plane;
+}
+
+static void foreach_rest_unit_in_tile(
+    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
+    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
+    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
+  const int tile_h = tile_rect->bottom - tile_rect->top;
+  const int ext_size = unit_size * 3 / 2;
+
+  const int tile_idx = tile_col + tile_row * tile_cols;
+  const int unit_idx0 = tile_idx * units_per_tile;
+
+  int y0 = 0, i = 0;
+  while (y0 < tile_h) {
+    int remaining_h = tile_h - y0;
+    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
+
+    RestorationTileLimits limits;
+    limits.v_start = tile_rect->top + y0;
+    limits.v_end = tile_rect->top + y0 + h;
+    assert(limits.v_end <= tile_rect->bottom);
+    // Offset the tile upwards to align with the restoration processing stripe
+    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+    limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
+    if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
+
+    av1_foreach_rest_unit_in_row(
+        &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
+        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
+        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
+
+    y0 += h;
+    ++i;
   }
-  end_mi_row = start_mi_row + mi_rows_to_filter;
-  loop_restoration_init(&cm->rst_internal, cm->frame_type == KEY_FRAME);
-  loop_restoration_rows(frame, cm, start_mi_row, end_mi_row, components_pattern,
-                        rsi, dst);
+}
+
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+                                    rest_unit_visitor_t on_rest_unit,
+                                    void *priv, AV1PixelRect *tile_rect,
+                                    int32_t *tmpbuf,
+                                    RestorationLineBuffers *rlbs) {
+  const int is_uv = plane > 0;
+  const int ss_y = is_uv && cm->subsampling_y;
+
+  const RestorationInfo *rsi = &cm->rst_info[plane];
+
+  foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
+                            rsi->horz_units_per_tile, rsi->vert_units_per_tile,
+                            rsi->units_per_tile, rsi->restoration_unit_size,
+                            ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
 }
 
 int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
                                        int *rcol0, int *rcol1, int *rrow0,
-                                       int *rrow1, int *nhtiles) {
-  assert(rcol0 && rcol1 && rrow0 && rrow1 && nhtiles);
-
-  if (bsize != cm->sb_size) return 0;
-
-#if CONFIG_FRAME_SUPERRES
-  const int frame_w = cm->superres_upscaled_width;
-  const int frame_h = cm->superres_upscaled_height;
-  const int mi_to_px = MI_SIZE * SCALE_NUMERATOR;
-  const int denom = cm->superres_scale_denominator;
-#else
-  const int frame_w = cm->width;
-  const int frame_h = cm->height;
-  const int mi_to_px = MI_SIZE;
-  const int denom = 1;
-#endif  // CONFIG_FRAME_SUPERRES
-
-  const int ss_x = plane > 0 && cm->subsampling_x != 0;
-  const int ss_y = plane > 0 && cm->subsampling_y != 0;
-
-  const int ss_frame_w = (frame_w + ss_x) >> ss_x;
-  const int ss_frame_h = (frame_h + ss_y) >> ss_y;
-
-  int rtile_w, rtile_h, nvtiles;
-  av1_get_rest_ntiles(ss_frame_w, ss_frame_h,
-                      cm->rst_info[plane].restoration_tilesize, &rtile_w,
-                      &rtile_h, nhtiles, &nvtiles);
-
-  const int rnd_w = rtile_w * denom - 1;
-  const int rnd_h = rtile_h * denom - 1;
-
-  // rcol0/rrow0 should be the first column/row of rtiles that doesn't start
-  // left/below of mi_col/mi_row. For this calculation, we need to round up the
-  // division (if the sb starts at rtile column 10.1, the first matching rtile
-  // has column index 11)
-  *rcol0 = (mi_col * mi_to_px + rnd_w) / (rtile_w * denom);
-  *rrow0 = (mi_row * mi_to_px + rnd_h) / (rtile_h * denom);
-
-  // rcol1/rrow1 is the equivalent calculation, but for the superblock
-  // below-right. There are some slightly strange boundary effects. First, we
-  // need to clamp to nhtiles/nvtiles for the case where it appears there are,
-  // say, 2.4 restoration tiles horizontally. There we need a maximum mi_row1
-  // of 2 because tile 1 gets extended.
-  //
-  // Second, if mi_col1 >= cm->mi_cols then we must manually set *rcol1 to
-  // nhtiles. This is needed whenever the frame's width rounded up to the next
-  // toplevel superblock is smaller than nhtiles * rtile_w. The same logic is
-  // needed for rows.
-  const int mi_row1 = mi_row + mi_size_high[bsize];
-  const int mi_col1 = mi_col + mi_size_wide[bsize];
-
-  if (mi_col1 >= cm->mi_cols)
-    *rcol1 = *nhtiles;
-  else
-    *rcol1 = AOMMIN(*nhtiles, (mi_col1 * mi_to_px + rnd_w) / (rtile_w * denom));
+                                       int *rrow1, int *tile_tl_idx) {
+  assert(rcol0 && rcol1 && rrow0 && rrow1);
 
-  if (mi_row1 >= cm->mi_rows)
-    *rrow1 = nvtiles;
-  else
-    *rrow1 = AOMMIN(nvtiles, (mi_row1 * mi_to_px + rnd_h) / (rtile_h * denom));
+  if (bsize != cm->seq_params.sb_size) return 0;
+  if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
+
+  assert(!cm->all_lossless);
+
+  const int is_uv = plane > 0;
+
+  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+  const int tile_w = tile_rect.right - tile_rect.left;
+  const int tile_h = tile_rect.bottom - tile_rect.top;
+
+  const int mi_top = 0;
+  const int mi_left = 0;
+
+  // Compute the mi-unit corners of the superblock relative to the top-left of
+  // the tile
+  const int mi_rel_row0 = mi_row - mi_top;
+  const int mi_rel_col0 = mi_col - mi_left;
+  const int mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize];
+  const int mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize];
+
+  const RestorationInfo *rsi = &cm->rst_info[plane];
+  const int size = rsi->restoration_unit_size;
+
+  // Calculate the number of restoration units in this tile (which might be
+  // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
+  const int horz_units = av1_lr_count_units_in_tile(size, tile_w);
+  const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
+
+  // The size of an MI-unit on this plane of the image
+  const int ss_x = is_uv && cm->subsampling_x;
+  const int ss_y = is_uv && cm->subsampling_y;
+  const int mi_size_x = MI_SIZE >> ss_x;
+  const int mi_size_y = MI_SIZE >> ss_y;
+
+  // Write m for the relative mi column or row, D for the superres denominator
+  // and N for the superres numerator. If u is the upscaled pixel offset then
+  // we can write the downscaled pixel offset in two ways as:
+  //
+  //   MI_SIZE * m = N / D u
+  //
+  // from which we get u = D * MI_SIZE * m / N
+  const int mi_to_num_x = av1_superres_scaled(cm)
+                              ? mi_size_x * cm->superres_scale_denominator
+                              : mi_size_x;
+  const int mi_to_num_y = mi_size_y;
+  const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size;
+  const int denom_y = size;
+
+  const int rnd_x = denom_x - 1;
+  const int rnd_y = denom_y - 1;
+
+  // rcol0/rrow0 should be the first column/row of restoration units (relative
+  // to the top-left of the tile) that doesn't start left/below of
+  // mi_col/mi_row. For this calculation, we need to round up the division (if
+  // the sb starts at runit column 10.1, the first matching runit has column
+  // index 11)
+  *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x;
+  *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y;
+
+  // rel_col1/rel_row1 is the equivalent calculation, but for the superblock
+  // below-right. If we're at the bottom or right of the tile, this restoration
+  // unit might not exist, in which case we'll clamp accordingly.
+  *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
+  *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
+
+  const int tile_idx = 0;
+  *tile_tl_idx = tile_idx * rsi->units_per_tile;
 
   return *rcol0 < *rcol1 && *rrow0 < *rrow1;
 }
 
-#if CONFIG_STRIPED_LOOP_RESTORATION
-
 // Extend to left and right
-static void extend_line(uint8_t *buf, int width, int extend,
-                        int use_highbitdepth) {
-  int i;
-  if (use_highbitdepth) {
-    uint16_t val, *buf16 = (uint16_t *)buf;
-    val = buf16[0];
-    for (i = 0; i < extend; i++) buf16[-1 - i] = val;
-    val = buf16[width - 1];
-    for (i = 0; i < extend; i++) buf16[width + i] = val;
+static void extend_lines(uint8_t *buf, int width, int height, int stride,
+                         int extend, int use_highbitdepth) {
+  for (int i = 0; i < height; ++i) {
+    if (use_highbitdepth) {
+      uint16_t *buf16 = (uint16_t *)buf;
+      aom_memset16(buf16 - extend, buf16[0], extend);
+      aom_memset16(buf16 + width, buf16[width - 1], extend);
+    } else {
+      memset(buf - extend, buf[0], extend);
+      memset(buf + width, buf[width - 1], extend);
+    }
+    buf += stride;
+  }
+}
+
+static void save_deblock_boundary_lines(
+    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
+    int stripe, int use_highbd, int is_above,
+    RestorationStripeBoundaries *boundaries) {
+  const int is_uv = plane > 0;
+  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
+  const int src_stride = frame->strides[is_uv] << use_highbd;
+  const uint8_t *src_rows = src_buf + row * src_stride;
+
+  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
+                               : boundaries->stripe_boundary_below;
+  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
+  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
+
+  // There is a rare case in which a processing stripe can end 1px above the
+  // crop border. In this case, we do want to use deblocked pixels from below
+  // the stripe (hence why we ended up in this function), but instead of
+  // fetching 2 "below" rows we need to fetch one and duplicate it.
+  // This is equivalent to clamping the sample locations against the crop border
+  const int lines_to_save =
+      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
+  assert(lines_to_save == 1 || lines_to_save == 2);
+
+  int upscaled_width;
+  int line_bytes;
+  if (av1_superres_scaled(cm)) {
+    const int ss_x = is_uv && cm->subsampling_x;
+    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
+    line_bytes = upscaled_width << use_highbd;
+    if (use_highbd)
+      av1_upscale_normative_rows(
+          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
+          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
+          plane, lines_to_save);
+    else
+      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
+                                 boundaries->stripe_boundary_stride, plane,
+                                 lines_to_save);
   } else {
-    uint8_t val;
-    val = buf[0];
-    for (i = 0; i < extend; i++) buf[-1 - i] = val;
-    val = buf[width - 1];
-    for (i = 0; i < extend; i++) buf[width + i] = val;
+    upscaled_width = frame->crop_widths[is_uv];
+    line_bytes = upscaled_width << use_highbd;
+    for (int i = 0; i < lines_to_save; i++) {
+      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
+             line_bytes);
+    }
   }
+  // If we only saved one line, then copy it into the second line buffer
+  if (lines_to_save == 1)
+    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
+
+  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
+               RESTORATION_EXTRA_HORZ, use_highbd);
+}
+
+static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+                                     const AV1_COMMON *cm, int plane, int row,
+                                     int stripe, int use_highbd, int is_above,
+                                     RestorationStripeBoundaries *boundaries) {
+  const int is_uv = plane > 0;
+  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
+  const int src_stride = frame->strides[is_uv] << use_highbd;
+  const uint8_t *src_rows = src_buf + row * src_stride;
+
+  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
+                               : boundaries->stripe_boundary_below;
+  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
+  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
+  const int src_width = frame->crop_widths[is_uv];
+
+  // At the point where this function is called, we've already applied
+  // superres. So we don't need to extend the lines here, we can just
+  // pull directly from the topmost row of the upscaled frame.
+  const int ss_x = is_uv && cm->subsampling_x;
+  const int upscaled_width = av1_superres_scaled(cm)
+                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
+                                 : src_width;
+  const int line_bytes = upscaled_width << use_highbd;
+  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
+    // Copy the line at 'row' into both context lines. This is because
+    // we want to (effectively) extend the outermost row of CDEF data
+    // from this tile to produce a border, rather than using deblocked
+    // pixels from the tile above/below.
+    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
+  }
+  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
+               RESTORATION_EXTRA_HORZ, use_highbd);
 }
 
-// For each 64 pixel high stripe, save 4 scan lines to be used as boundary in
-// the loop restoration process. The lines are saved in
-// rst_internal.stripe_boundary_lines
-void av1_loop_restoration_save_boundary_lines(YV12_BUFFER_CONFIG *frame,
-                                              AV1_COMMON *cm) {
-  int p, boundary_stride;
-  int src_width, src_height, src_stride, stripe_height, stripe_offset, stripe_y,
-      yy;
-  uint8_t *src_buf, *boundary_below_buf, *boundary_above_buf;
-  int use_highbitdepth = 0;
-  for (p = 0; p < MAX_MB_PLANE; ++p) {
-    if (p == 0) {
-      src_buf = frame->y_buffer;
-      src_width = frame->y_crop_width;
-      src_height = frame->y_crop_height;
-      src_stride = frame->y_stride;
-      stripe_height = 64;
-      stripe_offset = 56 - 2;  // offset of first line to copy
+static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+                                         int use_highbd, int plane,
+                                         AV1_COMMON *cm, int after_cdef) {
+  const int is_uv = plane > 0;
+  const int ss_y = is_uv && cm->subsampling_y;
+  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
+
+  // Get the tile rectangle, with height rounded up to the next multiple of 8
+  // luma pixels (only relevant for the bottom tile of the frame)
+  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+  const int stripe0 = 0;
+
+  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
+
+  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
+
+  int tile_stripe;
+  for (tile_stripe = 0;; ++tile_stripe) {
+    const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
+    const int y0 = tile_rect.top + rel_y0;
+    if (y0 >= tile_rect.bottom) break;
+
+    const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
+    const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
+
+    const int frame_stripe = stripe0 + tile_stripe;
+
+    // In this case, we should only use CDEF pixels at the top
+    // and bottom of the frame as a whole; internal tile boundaries
+    // can use deblocked pixels from adjacent tiles for context.
+    const int use_deblock_above = (frame_stripe > 0);
+    const int use_deblock_below = (y1 < plane_height);
+
+    if (!after_cdef) {
+      // Save deblocked context where needed.
+      if (use_deblock_above) {
+        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
+                                    frame_stripe, use_highbd, 1, boundaries);
+      }
+      if (use_deblock_below) {
+        save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
+                                    use_highbd, 0, boundaries);
+      }
     } else {
-      src_buf = p == 1 ? frame->u_buffer : frame->v_buffer;
-      src_width = frame->uv_crop_width;
-      src_height = frame->uv_crop_height;
-      src_stride = frame->uv_stride;
-      stripe_height = 64 >> cm->subsampling_y;
-      stripe_offset = (56 >> cm->subsampling_y) - 2;
-    }
-    boundary_above_buf = cm->rst_internal.stripe_boundary_above[p];
-    boundary_below_buf = cm->rst_internal.stripe_boundary_below[p];
-    boundary_stride = cm->rst_internal.stripe_boundary_stride[p];
-#if CONFIG_HIGHBITDEPTH
-    use_highbitdepth = cm->use_highbitdepth;
-    if (use_highbitdepth) {
-      src_buf = (uint8_t *)CONVERT_TO_SHORTPTR(src_buf);
-    }
-#endif
-    src_buf += (stripe_offset * src_stride) << use_highbitdepth;
-    boundary_above_buf += RESTORATION_EXTRA_HORZ << use_highbitdepth;
-    boundary_below_buf += RESTORATION_EXTRA_HORZ << use_highbitdepth;
-    // Loop over stripes
-    for (stripe_y = stripe_offset; stripe_y < src_height;
-         stripe_y += stripe_height) {
-      // Save 2 lines above the LR stripe (offset -9, -10)
-      for (yy = 0; yy < 2; yy++) {
-        if (stripe_y + yy < src_height) {
-          memcpy(boundary_above_buf, src_buf, src_width << use_highbitdepth);
-          extend_line(boundary_above_buf, src_width, RESTORATION_EXTRA_HORZ,
-                      use_highbitdepth);
-          src_buf += src_stride << use_highbitdepth;
-          boundary_above_buf += boundary_stride << use_highbitdepth;
-        }
+      // Save CDEF context where needed. Note that we need to save the CDEF
+      // context for a particular boundary iff we *didn't* save deblocked
+      // context for that boundary.
+      //
+      // In addition, we need to save copies of the outermost line within
+      // the tile, rather than using data from outside the tile.
+      if (!use_deblock_above) {
+        save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd,
+                                 1, boundaries);
       }
-      // Save 2 lines below the LR stripe (offset 56,57)
-      for (yy = 2; yy < 4; yy++) {
-        if (stripe_y + yy < src_height) {
-          memcpy(boundary_below_buf, src_buf, src_width << use_highbitdepth);
-          extend_line(boundary_below_buf, src_width, RESTORATION_EXTRA_HORZ,
-                      use_highbitdepth);
-          src_buf += src_stride << use_highbitdepth;
-          boundary_below_buf += boundary_stride << use_highbitdepth;
-        }
+      if (!use_deblock_below) {
+        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
+                                 use_highbd, 0, boundaries);
       }
-      // jump to next stripe
-      src_buf += ((stripe_height - 4) * src_stride) << use_highbitdepth;
     }
   }
 }
 
-#endif  // CONFIG_STRIPED_LOOP_RESTORATION
+// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
+// lines to be used as boundary in the loop restoration process. The
+// lines are saved in rst_internal.stripe_boundary_lines
+void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+                                              AV1_COMMON *cm, int after_cdef) {
+  const int num_planes = av1_num_planes(cm);
+  const int use_highbd = cm->use_highbitdepth;
+  for (int p = 0; p < num_planes; ++p) {
+    save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
+  }
+}
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
index 23a53879e..0c4017534 100644
--- a/third_party/aom/av1/common/restoration.h
+++ b/third_party/aom/av1/common/restoration.h
@@ -13,9 +13,10 @@
 #define AV1_COMMON_RESTORATION_H_
 
 #include "aom_ports/mem.h"
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "av1/common/blockd.h"
+#include "av1/common/enums.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -26,23 +27,13 @@ extern "C" {
 
 #define RESTORATION_PROC_UNIT_SIZE 64
 
-#if CONFIG_STRIPED_LOOP_RESTORATION
 // Filter tile grid offset upwards compared to the superblock grid
-#define RESTORATION_TILE_OFFSET 8
-#endif
+#define RESTORATION_UNIT_OFFSET 8
 
-#if CONFIG_STRIPED_LOOP_RESTORATION
-#define SGRPROJ_BORDER_VERT 2  // Vertical border used for Sgr
-#else
-#define SGRPROJ_BORDER_VERT 1  // Vertical border used for Sgr
-#endif
-#define SGRPROJ_BORDER_HORZ 2  // Horizontal border used for Sgr
+#define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
+#define SGRPROJ_BORDER_HORZ 3  // Horizontal border used for Sgr
 
-#if CONFIG_STRIPED_LOOP_RESTORATION
 #define WIENER_BORDER_VERT 2  // Vertical border used for Wiener
-#else
-#define WIENER_BORDER_VERT 1  // Vertical border used for Wiener
-#endif
 #define WIENER_HALFWIN 3
 #define WIENER_BORDER_HORZ (WIENER_HALFWIN)  // Horizontal border for Wiener
 
@@ -61,11 +52,16 @@ extern "C" {
 #define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ)
 #endif  // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
 
-#if CONFIG_STRIPED_LOOP_RESTORATION
+// How many border pixels do we need for each processing unit?
+#define RESTORATION_BORDER 3
+
+// How many rows of deblocked pixels do we save above/below each processing
+// stripe?
+#define RESTORATION_CTX_VERT 2
+
 // Additional pixels to the left and right in above/below buffers
 // It is RESTORATION_BORDER_HORZ rounded up to get nicer buffer alignment
 #define RESTORATION_EXTRA_HORZ 4
-#endif
 
 // Pad up to 20 more (may be much less is needed)
 #define RESTORATION_PADDING 20
@@ -75,30 +71,23 @@ extern "C" {
    (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \
     RESTORATION_PADDING))
 
-#define RESTORATION_TILESIZE_MAX 256
-#if CONFIG_STRIPED_LOOP_RESTORATION
-#define RESTORATION_TILEPELS_HORZ_MAX \
-  (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
-#define RESTORATION_TILEPELS_VERT_MAX                                \
-  ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
-    RESTORATION_TILE_OFFSET))
-#define RESTORATION_TILEPELS_MAX \
-  (RESTORATION_TILEPELS_HORZ_MAX * RESTORATION_TILEPELS_VERT_MAX)
-#else
-#define RESTORATION_TILEPELS_MAX                                           \
-  ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) * \
-   (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT))
-#endif
+#define RESTORATION_UNITSIZE_MAX 256
+#define RESTORATION_UNITPELS_HORZ_MAX \
+  (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
+#define RESTORATION_UNITPELS_VERT_MAX                                \
+  ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
+    RESTORATION_UNIT_OFFSET))
+#define RESTORATION_UNITPELS_MAX \
+  (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX)
 
 // Two 32-bit buffers needed for the restored versions from two filters
 // TODO(debargha, rupert): Refactor to not need the large tilesize to be stored
 // on the decoder side.
-#define SGRPROJ_TMPBUF_SIZE (RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t))
+#define SGRPROJ_TMPBUF_SIZE (RESTORATION_UNITPELS_MAX * 2 * sizeof(int32_t))
 
 #define SGRPROJ_EXTBUF_SIZE (0)
 #define SGRPROJ_PARAMS_BITS 4
 #define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
-#define USE_HIGHPASS_IN_SGRPROJ 0
 
 // Precision bits for projection
 #define SGRPROJ_PRJ_BITS 7
@@ -108,24 +97,16 @@ extern "C" {
 #define SGRPROJ_SGR_BITS 8
 #define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
 
-#if USE_HIGHPASS_IN_SGRPROJ
-#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) / 8)
-#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
-#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 2)
-#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
-#else
 #define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
 #define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
 #define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
 #define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
-#endif  // USE_HIGHPASS_IN_SGRPROJ
 
 #define SGRPROJ_PRJ_SUBEXP_K 4
 
 #define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
 
 #define MAX_RADIUS 2  // Only 1, 2, 3 allowed
-#define MAX_EPS 80    // Max value of eps
 #define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
 #define SGRPROJ_MTABLE_BITS 20
 #define SGRPROJ_RECIP_BITS 12
@@ -143,17 +124,13 @@ extern "C" {
 #define WIENER_FILT_PREC_BITS 7
 #define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
 
-// Whether to use high intermediate precision filtering
-#define USE_WIENER_HIGH_INTERMEDIATE_PRECISION 1
-
 // Central values for the taps
 #define WIENER_FILT_TAP0_MIDV (3)
 #define WIENER_FILT_TAP1_MIDV (-7)
 #define WIENER_FILT_TAP2_MIDV (15)
-#define WIENER_FILT_TAP3_MIDV                           \
-  (WIENER_FILT_STEP -                                   \
-   2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \
-        WIENER_FILT_TAP2_MIDV))
+#define WIENER_FILT_TAP3_MIDV                                              \
+  (WIENER_FILT_STEP - 2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \
+                           WIENER_FILT_TAP2_MIDV))
 
 #define WIENER_FILT_TAP0_BITS 4
 #define WIENER_FILT_TAP1_BITS 5
@@ -194,51 +171,64 @@ extern "C" {
 #error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7"
 #endif
 
+#define LR_TILE_ROW 0
+#define LR_TILE_COL 0
+#define LR_TILE_COLS 1
+
 typedef struct {
-#if USE_HIGHPASS_IN_SGRPROJ
-  int corner;
-  int edge;
-#else
-  int r1;
-  int e1;
-#endif  // USE_HIGHPASS_IN_SGRPROJ
-  int r2;
-  int e2;
+  int r[2];  // radii
+  int s[2];  // sgr parameters for r[0] and r[1], based on GenSgrprojVtable()
 } sgr_params_type;
 
 typedef struct {
-  int restoration_tilesize;
-  int procunit_width, procunit_height;
-  RestorationType frame_restoration_type;
-  RestorationType *restoration_type;
-  // Wiener filter
-  WienerInfo *wiener_info;
-  // Selfguided proj filter
-  SgrprojInfo *sgrproj_info;
-} RestorationInfo;
+  RestorationType restoration_type;
+  WienerInfo wiener_info;
+  SgrprojInfo sgrproj_info;
+} RestorationUnitInfo;
+
+// A restoration line buffer needs space for two lines plus a horizontal filter
+// margin of RESTORATION_EXTRA_HORZ on each side.
+#define RESTORATION_LINEBUFFER_WIDTH \
+  (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ)
+
+// Similarly, the column buffers (used when we're at a vertical tile edge
+// that we can't filter across) need space for one processing unit's worth
+// of pixels, plus the top/bottom border width
+#define RESTORATION_COLBUFFER_HEIGHT \
+  (RESTORATION_PROC_UNIT_SIZE + 2 * RESTORATION_BORDER)
 
 typedef struct {
-  RestorationInfo *rsi;
-  int keyframe;
-  int ntiles;
-  int tile_width, tile_height;
-  int nhtiles, nvtiles;
-  int32_t *tmpbuf;
-#if CONFIG_STRIPED_LOOP_RESTORATION
-  int component;
-  int subsampling_y;
-  uint8_t *stripe_boundary_above[MAX_MB_PLANE];
-  uint8_t *stripe_boundary_below[MAX_MB_PLANE];
-  int stripe_boundary_stride[MAX_MB_PLANE];
-  // Temporary buffers to save/restore 2 lines above/below the restoration
-  // stripe
-  // Allow for filter margin to left and right
-  uint16_t
-      tmp_save_above[2][RESTORATION_TILESIZE_MAX + 2 * RESTORATION_EXTRA_HORZ];
-  uint16_t
-      tmp_save_below[2][RESTORATION_TILESIZE_MAX + 2 * RESTORATION_EXTRA_HORZ];
-#endif
-} RestorationInternal;
+  // Temporary buffers to save/restore 3 lines above/below the restoration
+  // stripe.
+  uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
+  uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
+} RestorationLineBuffers;
+
+typedef struct {
+  uint8_t *stripe_boundary_above;
+  uint8_t *stripe_boundary_below;
+  int stripe_boundary_stride;
+  int stripe_boundary_size;
+} RestorationStripeBoundaries;
+
+typedef struct {
+  RestorationType frame_restoration_type;
+  int restoration_unit_size;
+
+  // Fields below here are allocated and initialised by
+  // av1_alloc_restoration_struct. (horz_)units_per_tile give the number of
+  // restoration units in (one row of) the largest tile in the frame. The data
+  // in unit_info is laid out with units_per_tile entries for each tile, which
+  // have stride horz_units_per_tile.
+  //
+  // Even if there are tiles of different sizes, the data in unit_info is laid
+  // out as if all tiles are of full size.
+  int units_per_tile;
+  int vert_units_per_tile, horz_units_per_tile;
+  RestorationUnitInfo *unit_info;
+  RestorationStripeBoundaries boundaries;
+  int optimized_lr;
+} RestorationInfo;
 
 static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) {
   sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2;
@@ -257,91 +247,128 @@ static INLINE void set_default_wiener(WienerInfo *wiener_info) {
   wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV;
 }
 
-static INLINE int av1_get_rest_ntiles(int width, int height, int tilesize,
-                                      int *tile_width, int *tile_height,
-                                      int *nhtiles, int *nvtiles) {
-  int nhtiles_, nvtiles_;
-  int tile_width_, tile_height_;
-  tile_width_ = (tilesize < 0) ? width : AOMMIN(tilesize, width);
-  tile_height_ = (tilesize < 0) ? height : AOMMIN(tilesize, height);
-  assert(tile_width_ > 0 && tile_height_ > 0);
-
-  nhtiles_ = (width + (tile_width_ >> 1)) / tile_width_;
-  nvtiles_ = (height + (tile_height_ >> 1)) / tile_height_;
-  if (tile_width) *tile_width = tile_width_;
-  if (tile_height) *tile_height = tile_height_;
-  if (nhtiles) *nhtiles = nhtiles_;
-  if (nvtiles) *nvtiles = nvtiles_;
-  return (nhtiles_ * nvtiles_);
-}
-
-typedef struct { int h_start, h_end, v_start, v_end; } RestorationTileLimits;
-
-static INLINE RestorationTileLimits
-av1_get_rest_tile_limits(int tile_idx, int nhtiles, int nvtiles, int tile_width,
-                         int tile_height, int im_width,
-#if CONFIG_STRIPED_LOOP_RESTORATION
-                         int im_height, int subsampling_y) {
-#else
-                         int im_height) {
-#endif
-  const int htile_idx = tile_idx % nhtiles;
-  const int vtile_idx = tile_idx / nhtiles;
-  RestorationTileLimits limits;
-  limits.h_start = htile_idx * tile_width;
-  limits.v_start = vtile_idx * tile_height;
-  limits.h_end =
-      (htile_idx < nhtiles - 1) ? limits.h_start + tile_width : im_width;
-  limits.v_end =
-      (vtile_idx < nvtiles - 1) ? limits.v_start + tile_height : im_height;
-#if CONFIG_STRIPED_LOOP_RESTORATION
-  // Offset the tile upwards to align with the restoration processing stripe
-  limits.v_start -= RESTORATION_TILE_OFFSET >> subsampling_y;
-  if (limits.v_start < 0) limits.v_start = 0;
-  if (limits.v_end < im_height)
-    limits.v_end -= RESTORATION_TILE_OFFSET >> subsampling_y;
-#endif
-  return limits;
-}
+typedef struct {
+  int h_start, h_end, v_start, v_end;
+} RestorationTileLimits;
+
+typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits,
+                                    const AV1PixelRect *tile_rect,
+                                    int rest_unit_idx, void *priv,
+                                    int32_t *tmpbuf,
+                                    RestorationLineBuffers *rlbs);
+
+typedef struct FilterFrameCtxt {
+  const RestorationInfo *rsi;
+  int tile_stripe0;
+  int ss_x, ss_y;
+  int highbd, bit_depth;
+  uint8_t *data8, *dst8;
+  int data_stride, dst_stride;
+  AV1PixelRect tile_rect;
+} FilterFrameCtxt;
+
+typedef struct AV1LrStruct {
+  rest_unit_visitor_t on_rest_unit;
+  FilterFrameCtxt ctxt[MAX_MB_PLANE];
+  YV12_BUFFER_CONFIG *frame;
+  YV12_BUFFER_CONFIG *dst;
+} AV1LrStruct;
 
 extern const sgr_params_type sgr_params[SGRPROJ_PARAMS];
-extern int sgrproj_mtable[MAX_EPS][MAX_NELEM];
+extern int sgrproj_mtable[SGRPROJ_PARAMS][2];
 extern const int32_t x_by_xplus1[256];
 extern const int32_t one_by_x[MAX_NELEM];
 
-int av1_alloc_restoration_struct(struct AV1Common *cm,
-                                 RestorationInfo *rst_info, int width,
-                                 int height);
+void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi,
+                                  int is_uv);
 void av1_free_restoration_struct(RestorationInfo *rst_info);
 
 void extend_frame(uint8_t *data, int width, int height, int stride,
-                  int border_horz, int border_vert);
-#if CONFIG_HIGHBITDEPTH
-void extend_frame_highbd(uint16_t *data, int width, int height, int stride,
-                         int border_horz, int border_vert);
-#endif  // CONFIG_HIGHBITDEPTH
-void decode_xq(int *xqd, int *xq);
-void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
-                                RestorationInfo *rsi, int components_pattern,
-                                int partial_frame, YV12_BUFFER_CONFIG *dst);
+                  int border_horz, int border_vert, int highbd);
+void decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
+
+// Filter a single loop restoration unit.
+//
+// limits is the limits of the unit. rui gives the mode to use for this unit
+// and its coefficients. If striped loop restoration is enabled, rsb contains
+// deblocked pixels to use for stripe boundaries; rlbs is just some space to
+// use as a scratch buffer. tile_rect gives the limits of the tile containing
+// this unit. tile_stripe0 is the index of the first stripe in this tile.
+//
+// ss_x and ss_y are flags which should be 1 if this is a plane with
+// horizontal/vertical subsampling, respectively. highbd is a flag which should
+// be 1 in high bit depth mode, in which case bit_depth is the bit depth.
+//
+// data8 is the frame data (pointing at the top-left corner of the frame, not
+// the restoration unit) and stride is its stride. dst8 is the buffer where the
+// results will be written and has stride dst_stride. Like data8, dst8 should
+// point at the top-left corner of the frame.
+//
+// Finally tmpbuf is a scratch buffer used by the sgrproj filter which should
+// be at least SGRPROJ_TMPBUF_SIZE big.
+void av1_loop_restoration_filter_unit(
+    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+    const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
+    int dst_stride, int32_t *tmpbuf, int optimized_lr);
+
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+                                       struct AV1Common *cm, int optimized_lr,
+                                       void *lr_ctxt);
 void av1_loop_restoration_precal();
 
+typedef void (*rest_tile_start_visitor_t)(int tile_row, int tile_col,
+                                          void *priv);
+struct AV1LrSyncData;
+
+typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane);
+
+typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c,
+                                const int sb_cols, int plane);
+
+// Call on_rest_unit for each loop restoration unit in the plane.
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+                                    rest_unit_visitor_t on_rest_unit,
+                                    void *priv, AV1PixelRect *tile_rect,
+                                    int32_t *tmpbuf,
+                                    RestorationLineBuffers *rlbs);
+
 // Return 1 iff the block at mi_row, mi_col with size bsize is a
 // top-level superblock containing the top-left corner of at least one
-// loop restoration tile.
+// loop restoration unit.
 //
 // If the block is a top-level superblock, the function writes to
-// *rcol0, *rcol1, *rrow0, *rrow1. The rectangle of indices given by
-// [*rcol0, *rcol1) x [*rrow0, *rrow1) will point at the set of rtiles
-// whose top left corners lie in the superblock. Note that the set is
-// only nonempty if *rcol0 < *rcol1 and *rrow0 < *rrow1.
+// *rcol0, *rcol1, *rrow0, *rrow1. The rectangle of restoration unit
+// indices given by [*rcol0, *rcol1) x [*rrow0, *rrow1) are relative
+// to the current tile, whose starting index is returned as
+// *tile_tl_idx.
 int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
                                        int *rcol0, int *rcol1, int *rrow0,
-                                       int *rrow1, int *nhtiles);
-
-void av1_loop_restoration_save_boundary_lines(YV12_BUFFER_CONFIG *frame,
-                                              struct AV1Common *cm);
+                                       int *rrow1, int *tile_tl_idx);
+
+void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+                                              struct AV1Common *cm,
+                                              int after_cdef);
+void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
+                                            YV12_BUFFER_CONFIG *frame,
+                                            struct AV1Common *cm,
+                                            int optimized_lr, int num_planes);
+void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
+                                      struct AV1Common *cm, int num_planes);
+void av1_foreach_rest_unit_in_row(
+    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
+    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
+    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+    struct AV1LrSyncData *const lr_sync);
+AV1PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv);
+int av1_lr_count_units_in_tile(int unit_size, int tile_size);
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+                             const int sb_cols, int plane);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/scale.c b/third_party/aom/av1/common/scale.c
index d5ccdfec0..c525fe229 100644
--- a/third_party/aom/av1/common/scale.c
+++ b/third_party/aom/av1/common/scale.c
@@ -9,7 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "av1/common/filter.h"
 #include "av1/common/scale.h"
 #include "aom_dsp/aom_filter.h"
@@ -46,12 +48,9 @@ static int get_fixed_point_scale_factor(int other_size, int this_size) {
   return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size;
 }
 
-static int get_coarse_point_scale_factor(int other_size, int this_size) {
-  // Calculate scaling factor once for each reference frame
-  // and use fixed point scaling factors in decoding and encoding routines.
-  // Hardware implementations can calculate scale factor in device driver
-  // and use multiplication and shifting on hardware instead of division.
-  return ((other_size << SCALE_SUBPEL_BITS) + this_size / 2) / this_size;
+// Given the fixed point scale, calculate coarse point scale.
+static int fixed_point_scale_to_coarse_point_scale(int scale_fp) {
+  return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS);
 }
 
 // Note: x and y are integer precision, mvq4 is q4 precision.
@@ -64,14 +63,8 @@ MV32 av1_scale_mv(const MV *mvq4, int x, int y,
   return res;
 }
 
-#if CONFIG_HIGHBITDEPTH
-void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
-                                       int other_h, int this_w, int this_h,
-                                       int use_highbd) {
-#else
 void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
                                        int other_h, int this_w, int this_h) {
-#endif
   if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
     sf->x_scale_fp = REF_INVALID_SCALE;
     sf->y_scale_fp = REF_INVALID_SCALE;
@@ -81,8 +74,8 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
   sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
   sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
 
-  sf->x_step_q4 = get_coarse_point_scale_factor(other_w, this_w);
-  sf->y_step_q4 = get_coarse_point_scale_factor(other_h, this_h);
+  sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp);
+  sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp);
 
   if (av1_is_scaled(sf)) {
     sf->scale_value_x = scaled_x;
@@ -92,95 +85,42 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
     sf->scale_value_y = unscaled_value;
   }
 
-  // TODO(agrange): Investigate the best choice of functions to use here
-  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
-  // to do at full-pel offsets. The current selection, where the filter is
-  // applied in one direction only, and not at all for 0,0, seems to give the
-  // best quality, but it may be worth trying an additional mode that does
-  // do the filtering on full-pel.
-  if (sf->x_step_q4 == SCALE_SUBPEL_SHIFTS) {
-    if (sf->y_step_q4 == SCALE_SUBPEL_SHIFTS) {
-      // No scaling in either direction.
-      sf->predict[0][0][0] = aom_convolve_copy;
-      sf->predict[0][0][1] = aom_convolve_avg;
-      sf->predict[0][1][0] = aom_convolve8_vert;
-      sf->predict[0][1][1] = aom_convolve8_avg_vert;
-      sf->predict[1][0][0] = aom_convolve8_horiz;
-      sf->predict[1][0][1] = aom_convolve8_avg_horiz;
-    } else {
-      // No scaling in x direction. Must always scale in the y direction.
-      sf->predict[0][0][0] = aom_convolve8_vert;
-      sf->predict[0][0][1] = aom_convolve8_avg_vert;
-      sf->predict[0][1][0] = aom_convolve8_vert;
-      sf->predict[0][1][1] = aom_convolve8_avg_vert;
-      sf->predict[1][0][0] = aom_convolve8;
-      sf->predict[1][0][1] = aom_convolve8_avg;
-    }
-  } else {
-    if (sf->y_step_q4 == SCALE_SUBPEL_SHIFTS) {
-      // No scaling in the y direction. Must always scale in the x direction.
-      sf->predict[0][0][0] = aom_convolve8_horiz;
-      sf->predict[0][0][1] = aom_convolve8_avg_horiz;
-      sf->predict[0][1][0] = aom_convolve8;
-      sf->predict[0][1][1] = aom_convolve8_avg;
-      sf->predict[1][0][0] = aom_convolve8_horiz;
-      sf->predict[1][0][1] = aom_convolve8_avg_horiz;
-    } else {
-      // Must always scale in both directions.
-      sf->predict[0][0][0] = aom_convolve8;
-      sf->predict[0][0][1] = aom_convolve8_avg;
-      sf->predict[0][1][0] = aom_convolve8;
-      sf->predict[0][1][1] = aom_convolve8_avg;
-      sf->predict[1][0][0] = aom_convolve8;
-      sf->predict[1][0][1] = aom_convolve8_avg;
-    }
-  }
-  // 2D subpel motion always gets filtered in both directions
-  sf->predict[1][1][0] = aom_convolve8;
-  sf->predict[1][1][1] = aom_convolve8_avg;
-
-#if CONFIG_HIGHBITDEPTH
-  if (use_highbd) {
-    if (sf->x_step_q4 == SCALE_SUBPEL_SHIFTS) {
-      if (sf->y_step_q4 == SCALE_SUBPEL_SHIFTS) {
-        // No scaling in either direction.
-        sf->highbd_predict[0][0][0] = aom_highbd_convolve_copy;
-        sf->highbd_predict[0][0][1] = aom_highbd_convolve_avg;
-        sf->highbd_predict[0][1][0] = aom_highbd_convolve8_vert;
-        sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg_vert;
-        sf->highbd_predict[1][0][0] = aom_highbd_convolve8_horiz;
-        sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg_horiz;
-      } else {
-        // No scaling in x direction. Must always scale in the y direction.
-        sf->highbd_predict[0][0][0] = aom_highbd_convolve8_vert;
-        sf->highbd_predict[0][0][1] = aom_highbd_convolve8_avg_vert;
-        sf->highbd_predict[0][1][0] = aom_highbd_convolve8_vert;
-        sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg_vert;
-        sf->highbd_predict[1][0][0] = aom_highbd_convolve8;
-        sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg;
-      }
-    } else {
-      if (sf->y_step_q4 == SCALE_SUBPEL_SHIFTS) {
-        // No scaling in the y direction. Must always scale in the x direction.
-        sf->highbd_predict[0][0][0] = aom_highbd_convolve8_horiz;
-        sf->highbd_predict[0][0][1] = aom_highbd_convolve8_avg_horiz;
-        sf->highbd_predict[0][1][0] = aom_highbd_convolve8;
-        sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg;
-        sf->highbd_predict[1][0][0] = aom_highbd_convolve8_horiz;
-        sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg_horiz;
-      } else {
-        // Must always scale in both directions.
-        sf->highbd_predict[0][0][0] = aom_highbd_convolve8;
-        sf->highbd_predict[0][0][1] = aom_highbd_convolve8_avg;
-        sf->highbd_predict[0][1][0] = aom_highbd_convolve8;
-        sf->highbd_predict[0][1][1] = aom_highbd_convolve8_avg;
-        sf->highbd_predict[1][0][0] = aom_highbd_convolve8;
-        sf->highbd_predict[1][0][1] = aom_highbd_convolve8_avg;
-      }
-    }
-    // 2D subpel motion always gets filtered in both directions.
-    sf->highbd_predict[1][1][0] = aom_highbd_convolve8;
-    sf->highbd_predict[1][1][1] = aom_highbd_convolve8_avg;
-  }
-#endif  // CONFIG_HIGHBITDEPTH
+  // AV1 convolve functions
+  // Special case convolve functions should produce the same result as
+  // av1_convolve_2d.
+  // subpel_x_q4 == 0 && subpel_y_q4 == 0
+  sf->convolve[0][0][0] = av1_convolve_2d_copy_sr;
+  // subpel_x_q4 == 0
+  sf->convolve[0][1][0] = av1_convolve_y_sr;
+  // subpel_y_q4 == 0
+  sf->convolve[1][0][0] = av1_convolve_x_sr;
+  // subpel_x_q4 != 0 && subpel_y_q4 != 0
+  sf->convolve[1][1][0] = av1_convolve_2d_sr;
+  // subpel_x_q4 == 0 && subpel_y_q4 == 0
+  sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
+  // subpel_x_q4 == 0
+  sf->convolve[0][1][1] = av1_jnt_convolve_y;
+  // subpel_y_q4 == 0
+  sf->convolve[1][0][1] = av1_jnt_convolve_x;
+  // subpel_x_q4 != 0 && subpel_y_q4 != 0
+  sf->convolve[1][1][1] = av1_jnt_convolve_2d;
+  // AV1 High BD convolve functions
+  // Special case convolve functions should produce the same result as
+  // av1_highbd_convolve_2d.
+  // subpel_x_q4 == 0 && subpel_y_q4 == 0
+  sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr;
+  // subpel_x_q4 == 0
+  sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr;
+  // subpel_y_q4 == 0
+  sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr;
+  // subpel_x_q4 != 0 && subpel_y_q4 != 0
+  sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr;
+  // subpel_x_q4 == 0 && subpel_y_q4 == 0
+  sf->highbd_convolve[0][0][1] = av1_highbd_jnt_convolve_2d_copy;
+  // subpel_x_q4 == 0
+  sf->highbd_convolve[0][1][1] = av1_highbd_jnt_convolve_y;
+  // subpel_y_q4 == 0
+  sf->highbd_convolve[1][0][1] = av1_highbd_jnt_convolve_x;
+  // subpel_x_q4 != 0 && subpel_y_q4 != 0
+  sf->highbd_convolve[1][1][1] = av1_highbd_jnt_convolve_2d;
 }
diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h
index 900e6bf47..5f02fdb81 100644
--- a/third_party/aom/av1/common/scale.h
+++ b/third_party/aom/av1/common/scale.h
@@ -12,6 +12,7 @@
 #ifndef AV1_COMMON_SCALE_H_
 #define AV1_COMMON_SCALE_H_
 
+#include "av1/common/convolve.h"
 #include "av1/common/mv.h"
 #include "aom_dsp/aom_convolve.h"
 
@@ -34,22 +35,15 @@ struct scale_factors {
   int (*scale_value_x)(int val, const struct scale_factors *sf);
   int (*scale_value_y)(int val, const struct scale_factors *sf);
 
-  convolve_fn_t predict[2][2][2];  // horiz, vert, avg
-#if CONFIG_HIGHBITDEPTH
-  highbd_convolve_fn_t highbd_predict[2][2][2];  // horiz, vert, avg
-#endif                                           // CONFIG_HIGHBITDEPTH
+  // convolve_fn_ptr[subpel_x != 0][subpel_y != 0][is_compound]
+  aom_convolve_fn_t convolve[2][2][2];
+  aom_highbd_convolve_fn_t highbd_convolve[2][2][2];
 };
 
 MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
 
-#if CONFIG_HIGHBITDEPTH
-void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
-                                       int other_h, int this_w, int this_h,
-                                       int use_high);
-#else
 void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
                                        int other_h, int this_w, int this_h);
-#endif  // CONFIG_HIGHBITDEPTH
 
 static INLINE int av1_is_valid_scale(const struct scale_factors *sf) {
   return sf->x_scale_fp != REF_INVALID_SCALE &&
diff --git a/third_party/aom/av1/common/scan.c b/third_party/aom/av1/common/scan.c
index 3c8f3d7ac..31a787b53 100644
--- a/third_party/aom/av1/common/scan.c
+++ b/third_party/aom/av1/common/scan.c
@@ -14,17 +14,10 @@
 #include "av1/common/common_data.h"
 #include "av1/common/scan.h"
 
-#if CONFIG_CHROMA_2X2
-DECLARE_ALIGNED(16, static const int16_t, default_scan_2x2[4]) = {
-  0, 1, 2, 3,
-};
-#endif
-
 DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = {
-  0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15,
+  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
   0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
 };
@@ -32,19 +25,10 @@ DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
 DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = {
   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
 };
-#endif  // CONFIG_EXT_TX
-
-DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = {
-  0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = {
-  0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15,
-};
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {
-  0,  1,  4,  5,  2,  8,  6,  9,  10, 3,  12, 7,  13, 11, 14, 16,
-  17, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  0,  1,  4,  2,  5,  8,  3,  6,  9,  12, 7,  10, 13, 16, 11, 14,
+  17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {
@@ -58,8 +42,8 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = {
 };
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {
-  0,  1,  8,  9, 2,  16, 10, 17, 18, 3,  24, 11, 25, 19, 26, 4,
-  12, 27, 20, 5, 28, 13, 21, 29, 6,  14, 22, 30, 7,  15, 23, 31,
+  0,  8, 1,  16, 9,  2, 24, 17, 10, 3, 25, 18, 11, 4,  26, 19,
+  12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {
@@ -73,20 +57,19 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = {
 };
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_4x16[64]) = {
-  0,  1,  4,  5,  2,  8,  6,  9,  10, 3,  12, 7,  13, 11, 14, 16,
-  17, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  0,  1,  4,  2,  5,  8,  3,  6,  9,  12, 7,  10, 13, 16, 11, 14,
+  17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+  33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+  49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_16x4[64]) = {
-  0,  1,  16, 17, 2,  32, 18, 33, 34, 3,  48, 19, 49, 35, 50, 4,
-  20, 51, 36, 5,  52, 21, 37, 53, 6,  22, 38, 54, 7,  23, 39, 55,
-  8,  24, 40, 56, 9,  25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
-  12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63,
+  0,  16, 1,  32, 17, 2,  48, 33, 18, 3,  49, 34, 19, 4,  50, 35,
+  20, 5,  51, 36, 21, 6,  52, 37, 22, 7,  53, 38, 23, 8,  54, 39,
+  24, 9,  55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43,
+  28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16[64]) = {
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
@@ -114,7 +97,6 @@ DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x4[64]) = {
   8,  24, 40, 56, 9,  25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
   12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63,
 };
-#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = {
   0,   1,   8,   2,   9,   16,  3,   10,  17,  24,  4,   11,  18,  25,  32,
@@ -138,27 +120,26 @@ DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = {
 };
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_32x8[256]) = {
-  0,   1,   32,  2,   33,  64,  3,   34,  65,  96,  4,   35,  66,  97,  128,
-  5,   36,  67,  98,  129, 160, 6,   37,  68,  99,  130, 161, 192, 7,   38,
-  69,  100, 131, 162, 193, 224, 8,   39,  70,  101, 132, 163, 194, 225, 9,
-  40,  71,  102, 133, 164, 195, 226, 10,  41,  72,  103, 134, 165, 196, 227,
-  11,  42,  73,  104, 135, 166, 197, 228, 12,  43,  74,  105, 136, 167, 198,
-  229, 13,  44,  75,  106, 137, 168, 199, 230, 14,  45,  76,  107, 138, 169,
-  200, 231, 15,  46,  77,  108, 139, 170, 201, 232, 16,  47,  78,  109, 140,
-  171, 202, 233, 17,  48,  79,  110, 141, 172, 203, 234, 18,  49,  80,  111,
-  142, 173, 204, 235, 19,  50,  81,  112, 143, 174, 205, 236, 20,  51,  82,
-  113, 144, 175, 206, 237, 21,  52,  83,  114, 145, 176, 207, 238, 22,  53,
-  84,  115, 146, 177, 208, 239, 23,  54,  85,  116, 147, 178, 209, 240, 24,
-  55,  86,  117, 148, 179, 210, 241, 25,  56,  87,  118, 149, 180, 211, 242,
-  26,  57,  88,  119, 150, 181, 212, 243, 27,  58,  89,  120, 151, 182, 213,
-  244, 28,  59,  90,  121, 152, 183, 214, 245, 29,  60,  91,  122, 153, 184,
-  215, 246, 30,  61,  92,  123, 154, 185, 216, 247, 31,  62,  93,  124, 155,
-  186, 217, 248, 63,  94,  125, 156, 187, 218, 249, 95,  126, 157, 188, 219,
-  250, 127, 158, 189, 220, 251, 159, 190, 221, 252, 191, 222, 253, 223, 254,
+  0,   32,  1,   64,  33,  2,   96,  65,  34,  3,   128, 97,  66,  35,  4,
+  160, 129, 98,  67,  36,  5,   192, 161, 130, 99,  68,  37,  6,   224, 193,
+  162, 131, 100, 69,  38,  7,   225, 194, 163, 132, 101, 70,  39,  8,   226,
+  195, 164, 133, 102, 71,  40,  9,   227, 196, 165, 134, 103, 72,  41,  10,
+  228, 197, 166, 135, 104, 73,  42,  11,  229, 198, 167, 136, 105, 74,  43,
+  12,  230, 199, 168, 137, 106, 75,  44,  13,  231, 200, 169, 138, 107, 76,
+  45,  14,  232, 201, 170, 139, 108, 77,  46,  15,  233, 202, 171, 140, 109,
+  78,  47,  16,  234, 203, 172, 141, 110, 79,  48,  17,  235, 204, 173, 142,
+  111, 80,  49,  18,  236, 205, 174, 143, 112, 81,  50,  19,  237, 206, 175,
+  144, 113, 82,  51,  20,  238, 207, 176, 145, 114, 83,  52,  21,  239, 208,
+  177, 146, 115, 84,  53,  22,  240, 209, 178, 147, 116, 85,  54,  23,  241,
+  210, 179, 148, 117, 86,  55,  24,  242, 211, 180, 149, 118, 87,  56,  25,
+  243, 212, 181, 150, 119, 88,  57,  26,  244, 213, 182, 151, 120, 89,  58,
+  27,  245, 214, 183, 152, 121, 90,  59,  28,  246, 215, 184, 153, 122, 91,
+  60,  29,  247, 216, 185, 154, 123, 92,  61,  30,  248, 217, 186, 155, 124,
+  93,  62,  31,  249, 218, 187, 156, 125, 94,  63,  250, 219, 188, 157, 126,
+  95,  251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223,
   255,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32[256]) = {
   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,
   15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
@@ -240,16 +221,14 @@ DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x8[256]) = {
   28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253,
   30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255,
 };
-#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
-  0,  8,  1,  16, 9,  2,  17, 24, 10, 3,  18, 25, 32, 11, 4,  26,
-  33, 19, 40, 12, 34, 27, 5,  41, 20, 48, 13, 35, 42, 28, 21, 6,
-  49, 56, 36, 43, 29, 7,  14, 50, 57, 44, 22, 37, 15, 51, 58, 30,
-  45, 23, 52, 59, 38, 31, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63,
+  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, 18, 11, 4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6,  7,  14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = {
   0, 8,  16, 24, 32, 40, 48, 56, 1, 9,  17, 25, 33, 41, 49, 57,
   2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
@@ -263,21 +242,6 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = {
   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
 };
-#endif  // CONFIG_EXT_TX
-
-DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = {
-  0,  8,  16, 1,  24, 9,  32, 17, 2,  40, 25, 10, 33, 18, 48, 3,
-  26, 41, 11, 56, 19, 34, 4,  49, 27, 42, 12, 35, 20, 57, 50, 28,
-  5,  43, 13, 36, 58, 51, 21, 44, 6,  29, 59, 37, 14, 52, 22, 7,
-  45, 60, 30, 15, 38, 53, 23, 46, 31, 61, 39, 54, 47, 62, 55, 63,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, row_scan_8x8[64]) = {
-  0,  1,  2,  8,  9,  3,  16, 10, 4,  17, 11, 24, 5,  18, 25, 12,
-  19, 26, 32, 6,  13, 20, 33, 27, 7,  34, 40, 21, 28, 41, 14, 35,
-  48, 42, 29, 36, 49, 22, 43, 15, 56, 37, 50, 44, 30, 57, 23, 51,
-  58, 45, 38, 52, 31, 59, 53, 46, 60, 39, 61, 47, 54, 55, 62, 63,
-};
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = {
   0,   1,   8,   2,   9,   16,  3,   10,  17,  24,  4,   11,  18,  25,  32,
@@ -292,14 +256,14 @@ DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = {
 };
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = {
-  0,   1,  16,  2,   17,  32,  3,  18, 33,  48,  4,   19,  34,  49,  64,  5,
-  20,  35, 50,  65,  80,  6,   21, 36, 51,  66,  81,  96,  7,   22,  37,  52,
-  67,  82, 97,  112, 8,   23,  38, 53, 68,  83,  98,  113, 9,   24,  39,  54,
-  69,  84, 99,  114, 10,  25,  40, 55, 70,  85,  100, 115, 11,  26,  41,  56,
-  71,  86, 101, 116, 12,  27,  42, 57, 72,  87,  102, 117, 13,  28,  43,  58,
-  73,  88, 103, 118, 14,  29,  44, 59, 74,  89,  104, 119, 15,  30,  45,  60,
-  75,  90, 105, 120, 31,  46,  61, 76, 91,  106, 121, 47,  62,  77,  92,  107,
-  122, 63, 78,  93,  108, 123, 79, 94, 109, 124, 95,  110, 125, 111, 126, 127,
+  0,  16,  1,   32, 17,  2,   48,  33,  18, 3,  64,  49,  34,  19,  4,   80,
+  65, 50,  35,  20, 5,   96,  81,  66,  51, 36, 21,  6,   112, 97,  82,  67,
+  52, 37,  22,  7,  113, 98,  83,  68,  53, 38, 23,  8,   114, 99,  84,  69,
+  54, 39,  24,  9,  115, 100, 85,  70,  55, 40, 25,  10,  116, 101, 86,  71,
+  56, 41,  26,  11, 117, 102, 87,  72,  57, 42, 27,  12,  118, 103, 88,  73,
+  58, 43,  28,  13, 119, 104, 89,  74,  59, 44, 29,  14,  120, 105, 90,  75,
+  60, 45,  30,  15, 121, 106, 91,  76,  61, 46, 31,  122, 107, 92,  77,  62,
+  47, 123, 108, 93, 78,  63,  124, 109, 94, 79, 125, 110, 95,  126, 111, 127,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = {
@@ -387,41 +351,41 @@ DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = {
 };
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = {
-  0,   1,   32,  2,   33,  64,  3,   34,  65,  96,  4,   35,  66,  97,  128,
-  5,   36,  67,  98,  129, 160, 6,   37,  68,  99,  130, 161, 192, 7,   38,
-  69,  100, 131, 162, 193, 224, 8,   39,  70,  101, 132, 163, 194, 225, 256,
-  9,   40,  71,  102, 133, 164, 195, 226, 257, 288, 10,  41,  72,  103, 134,
-  165, 196, 227, 258, 289, 320, 11,  42,  73,  104, 135, 166, 197, 228, 259,
-  290, 321, 352, 12,  43,  74,  105, 136, 167, 198, 229, 260, 291, 322, 353,
-  384, 13,  44,  75,  106, 137, 168, 199, 230, 261, 292, 323, 354, 385, 416,
-  14,  45,  76,  107, 138, 169, 200, 231, 262, 293, 324, 355, 386, 417, 448,
-  15,  46,  77,  108, 139, 170, 201, 232, 263, 294, 325, 356, 387, 418, 449,
-  480, 16,  47,  78,  109, 140, 171, 202, 233, 264, 295, 326, 357, 388, 419,
-  450, 481, 17,  48,  79,  110, 141, 172, 203, 234, 265, 296, 327, 358, 389,
-  420, 451, 482, 18,  49,  80,  111, 142, 173, 204, 235, 266, 297, 328, 359,
-  390, 421, 452, 483, 19,  50,  81,  112, 143, 174, 205, 236, 267, 298, 329,
-  360, 391, 422, 453, 484, 20,  51,  82,  113, 144, 175, 206, 237, 268, 299,
-  330, 361, 392, 423, 454, 485, 21,  52,  83,  114, 145, 176, 207, 238, 269,
-  300, 331, 362, 393, 424, 455, 486, 22,  53,  84,  115, 146, 177, 208, 239,
-  270, 301, 332, 363, 394, 425, 456, 487, 23,  54,  85,  116, 147, 178, 209,
-  240, 271, 302, 333, 364, 395, 426, 457, 488, 24,  55,  86,  117, 148, 179,
-  210, 241, 272, 303, 334, 365, 396, 427, 458, 489, 25,  56,  87,  118, 149,
-  180, 211, 242, 273, 304, 335, 366, 397, 428, 459, 490, 26,  57,  88,  119,
-  150, 181, 212, 243, 274, 305, 336, 367, 398, 429, 460, 491, 27,  58,  89,
-  120, 151, 182, 213, 244, 275, 306, 337, 368, 399, 430, 461, 492, 28,  59,
-  90,  121, 152, 183, 214, 245, 276, 307, 338, 369, 400, 431, 462, 493, 29,
-  60,  91,  122, 153, 184, 215, 246, 277, 308, 339, 370, 401, 432, 463, 494,
-  30,  61,  92,  123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464,
-  495, 31,  62,  93,  124, 155, 186, 217, 248, 279, 310, 341, 372, 403, 434,
-  465, 496, 63,  94,  125, 156, 187, 218, 249, 280, 311, 342, 373, 404, 435,
-  466, 497, 95,  126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467,
-  498, 127, 158, 189, 220, 251, 282, 313, 344, 375, 406, 437, 468, 499, 159,
-  190, 221, 252, 283, 314, 345, 376, 407, 438, 469, 500, 191, 222, 253, 284,
-  315, 346, 377, 408, 439, 470, 501, 223, 254, 285, 316, 347, 378, 409, 440,
-  471, 502, 255, 286, 317, 348, 379, 410, 441, 472, 503, 287, 318, 349, 380,
-  411, 442, 473, 504, 319, 350, 381, 412, 443, 474, 505, 351, 382, 413, 444,
-  475, 506, 383, 414, 445, 476, 507, 415, 446, 477, 508, 447, 478, 509, 479,
-  510, 511,
+  0,   32,  1,   64,  33,  2,   96,  65,  34,  3,   128, 97,  66,  35,  4,
+  160, 129, 98,  67,  36,  5,   192, 161, 130, 99,  68,  37,  6,   224, 193,
+  162, 131, 100, 69,  38,  7,   256, 225, 194, 163, 132, 101, 70,  39,  8,
+  288, 257, 226, 195, 164, 133, 102, 71,  40,  9,   320, 289, 258, 227, 196,
+  165, 134, 103, 72,  41,  10,  352, 321, 290, 259, 228, 197, 166, 135, 104,
+  73,  42,  11,  384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74,  43,
+  12,  416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75,  44,  13,
+  448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76,  45,  14,
+  480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77,  46,
+  15,  481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78,
+  47,  16,  482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110,
+  79,  48,  17,  483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142,
+  111, 80,  49,  18,  484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174,
+  143, 112, 81,  50,  19,  485, 454, 423, 392, 361, 330, 299, 268, 237, 206,
+  175, 144, 113, 82,  51,  20,  486, 455, 424, 393, 362, 331, 300, 269, 238,
+  207, 176, 145, 114, 83,  52,  21,  487, 456, 425, 394, 363, 332, 301, 270,
+  239, 208, 177, 146, 115, 84,  53,  22,  488, 457, 426, 395, 364, 333, 302,
+  271, 240, 209, 178, 147, 116, 85,  54,  23,  489, 458, 427, 396, 365, 334,
+  303, 272, 241, 210, 179, 148, 117, 86,  55,  24,  490, 459, 428, 397, 366,
+  335, 304, 273, 242, 211, 180, 149, 118, 87,  56,  25,  491, 460, 429, 398,
+  367, 336, 305, 274, 243, 212, 181, 150, 119, 88,  57,  26,  492, 461, 430,
+  399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89,  58,  27,  493, 462,
+  431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90,  59,  28,  494,
+  463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91,  60,  29,
+  495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92,  61,
+  30,  496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93,
+  62,  31,  497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125,
+  94,  63,  498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126,
+  95,  499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500,
+  469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408,
+  377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285,
+  254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411,
+  380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413,
+  382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510,
+  479, 511,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = {
@@ -574,27 +538,26 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = {
 };
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = {
-  0,   16,  1,   32,  17,  2,   48,  33,  18,  3,   64,  34,  49,  19,  65,
-  80,  50,  4,   35,  66,  20,  81,  96,  51,  5,   36,  82,  97,  67,  112,
-  21,  52,  98,  37,  83,  113, 6,   68,  128, 53,  22,  99,  114, 84,  7,
-  129, 38,  69,  100, 115, 144, 130, 85,  54,  23,  8,   145, 39,  70,  116,
-  101, 131, 160, 146, 55,  86,  24,  71,  132, 117, 161, 40,  9,   102, 147,
-  176, 162, 87,  56,  25,  133, 118, 177, 148, 72,  103, 41,  163, 10,  192,
-  178, 88,  57,  134, 149, 119, 26,  164, 73,  104, 193, 42,  179, 208, 11,
-  135, 89,  165, 120, 150, 58,  194, 180, 27,  74,  209, 105, 151, 136, 43,
-  90,  224, 166, 195, 181, 121, 210, 59,  12,  152, 106, 167, 196, 75,  137,
-  225, 211, 240, 182, 122, 91,  28,  197, 13,  226, 168, 183, 153, 44,  212,
-  138, 107, 241, 60,  29,  123, 198, 184, 227, 169, 242, 76,  213, 154, 45,
-  92,  14,  199, 139, 61,  228, 214, 170, 185, 243, 108, 77,  155, 30,  15,
-  200, 229, 124, 215, 244, 93,  46,  186, 171, 201, 109, 140, 230, 62,  216,
-  245, 31,  125, 78,  156, 231, 47,  187, 202, 217, 94,  246, 141, 63,  232,
-  172, 110, 247, 157, 79,  218, 203, 126, 233, 188, 248, 95,  173, 142, 219,
-  111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251,
-  190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
-  255,
+  0,   1,   16,  32,  17,  2,   3,   18,  33,  48,  64,  49,  34,  19,  4,
+  5,   20,  35,  50,  65,  80,  96,  81,  66,  51,  36,  21,  6,   7,   22,
+  37,  52,  67,  82,  97,  112, 128, 113, 98,  83,  68,  53,  38,  23,  8,
+  9,   24,  39,  54,  69,  84,  99,  114, 129, 144, 160, 145, 130, 115, 100,
+  85,  70,  55,  40,  25,  10,  11,  26,  41,  56,  71,  86,  101, 116, 131,
+  146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87,  72,  57,  42,  27,
+  12,  13,  28,  43,  58,  73,  88,  103, 118, 133, 148, 163, 178, 193, 208,
+  224, 209, 194, 179, 164, 149, 134, 119, 104, 89,  74,  59,  44,  29,  14,
+  15,  30,  45,  60,  75,  90,  105, 120, 135, 150, 165, 180, 195, 210, 225,
+  240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91,  76,  61,  46,
+  31,  47,  62,  77,  92,  107, 122, 137, 152, 167, 182, 197, 212, 227, 242,
+  243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93,  78,  63,  79,  94,
+  109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185,
+  170, 155, 140, 125, 110, 95,  111, 126, 141, 156, 171, 186, 201, 216, 231,
+  246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203,
+  218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+  250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254,
+  255
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = {
   0,  16, 32, 48, 64, 80, 96,  112, 128, 144, 160, 176, 192, 208, 224, 240,
   1,  17, 33, 49, 65, 81, 97,  113, 129, 145, 161, 177, 193, 209, 225, 241,
@@ -634,51 +597,7 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = {
   240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
   255,
 };
-#endif  // CONFIG_EXT_TX
-
-DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = {
-  0,   16,  32,  48,  1,   64,  17,  80,  33,  96,  49,  2,   65,  112, 18,
-  81,  34,  128, 50,  97,  3,   66,  144, 19,  113, 35,  82,  160, 98,  51,
-  129, 4,   67,  176, 20,  114, 145, 83,  36,  99,  130, 52,  192, 5,   161,
-  68,  115, 21,  146, 84,  208, 177, 37,  131, 100, 53,  162, 224, 69,  6,
-  116, 193, 147, 85,  22,  240, 132, 38,  178, 101, 163, 54,  209, 117, 70,
-  7,   148, 194, 86,  179, 225, 23,  133, 39,  164, 8,   102, 210, 241, 55,
-  195, 118, 149, 71,  180, 24,  87,  226, 134, 165, 211, 40,  103, 56,  72,
-  150, 196, 242, 119, 9,   181, 227, 88,  166, 25,  135, 41,  104, 212, 57,
-  151, 197, 120, 73,  243, 182, 136, 167, 213, 89,  10,  228, 105, 152, 198,
-  26,  42,  121, 183, 244, 168, 58,  137, 229, 74,  214, 90,  153, 199, 184,
-  11,  106, 245, 27,  122, 230, 169, 43,  215, 59,  200, 138, 185, 246, 75,
-  12,  91,  154, 216, 231, 107, 28,  44,  201, 123, 170, 60,  247, 232, 76,
-  139, 13,  92,  217, 186, 248, 155, 108, 29,  124, 45,  202, 233, 171, 61,
-  14,  77,  140, 15,  249, 93,  30,  187, 156, 218, 46,  109, 125, 62,  172,
-  78,  203, 31,  141, 234, 94,  47,  188, 63,  157, 110, 250, 219, 79,  126,
-  204, 173, 142, 95,  189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236,
-  159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
-  255,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, row_scan_16x16[256]) = {
-  0,   1,   2,   16,  3,   17,  4,   18,  32,  5,   33,  19,  6,   34,  48,
-  20,  49,  7,   35,  21,  50,  64,  8,   36,  65,  22,  51,  37,  80,  9,
-  66,  52,  23,  38,  81,  67,  10,  53,  24,  82,  68,  96,  39,  11,  54,
-  83,  97,  69,  25,  98,  84,  40,  112, 55,  12,  70,  99,  113, 85,  26,
-  41,  56,  114, 100, 13,  71,  128, 86,  27,  115, 101, 129, 42,  57,  72,
-  116, 14,  87,  130, 102, 144, 73,  131, 117, 28,  58,  15,  88,  43,  145,
-  103, 132, 146, 118, 74,  160, 89,  133, 104, 29,  59,  147, 119, 44,  161,
-  148, 90,  105, 134, 162, 120, 176, 75,  135, 149, 30,  60,  163, 177, 45,
-  121, 91,  106, 164, 178, 150, 192, 136, 165, 179, 31,  151, 193, 76,  122,
-  61,  137, 194, 107, 152, 180, 208, 46,  166, 167, 195, 92,  181, 138, 209,
-  123, 153, 224, 196, 77,  168, 210, 182, 240, 108, 197, 62,  154, 225, 183,
-  169, 211, 47,  139, 93,  184, 226, 212, 241, 198, 170, 124, 155, 199, 78,
-  213, 185, 109, 227, 200, 63,  228, 242, 140, 214, 171, 186, 156, 229, 243,
-  125, 94,  201, 244, 215, 216, 230, 141, 187, 202, 79,  172, 110, 157, 245,
-  217, 231, 95,  246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158, 188,
-  248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175,
-  190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
-  255,
-};
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = {
   0,   32,   64,  96,   128, 160,  192, 224,  256, 288,  320, 352,  384, 416,
   448, 480,  512, 544,  576, 608,  640, 672,  704, 736,  768, 800,  832, 864,
@@ -837,998 +756,97 @@ DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = {
   1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
   1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
 };
-#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
-  0,    32,   1,    64,  33,   2,    96,   65,   34,   128,  3,    97,   66,
-  160,  129,  35,   98,  4,    67,   130,  161,  192,  36,   99,   224,  5,
-  162,  193,  68,   131, 37,   100,  225,  194,  256,  163,  69,   132,  6,
-  226,  257,  288,  195, 101,  164,  38,   258,  7,    227,  289,  133,  320,
-  70,   196,  165,  290, 259,  228,  39,   321,  102,  352,  8,    197,  71,
-  134,  322,  291,  260, 353,  384,  229,  166,  103,  40,   354,  323,  292,
-  135,  385,  198,  261, 72,   9,    416,  167,  386,  355,  230,  324,  104,
-  293,  41,   417,  199, 136,  262,  387,  448,  325,  356,  10,   73,   418,
-  231,  168,  449,  294, 388,  105,  419,  263,  42,   200,  357,  450,  137,
-  480,  74,   326,  232, 11,   389,  169,  295,  420,  106,  451,  481,  358,
-  264,  327,  201,  43,  138,  512,  482,  390,  296,  233,  170,  421,  75,
-  452,  359,  12,   513, 265,  483,  328,  107,  202,  514,  544,  422,  391,
-  453,  139,  44,   234, 484,  297,  360,  171,  76,   515,  545,  266,  329,
-  454,  13,   423,  203, 108,  546,  485,  576,  298,  235,  140,  361,  330,
-  172,  547,  45,   455, 267,  577,  486,  77,   204,  362,  608,  14,   299,
-  578,  109,  236,  487, 609,  331,  141,  579,  46,   15,   173,  610,  363,
-  78,   205,  16,   110, 237,  611,  142,  47,   174,  79,   206,  17,   111,
-  238,  48,   143,  80,  175,  112,  207,  49,   18,   239,  81,   113,  19,
-  50,   82,   114,  51,  83,   115,  640,  516,  392,  268,  144,  20,   672,
-  641,  548,  517,  424, 393,  300,  269,  176,  145,  52,   21,   704,  673,
-  642,  580,  549,  518, 456,  425,  394,  332,  301,  270,  208,  177,  146,
-  84,   53,   22,   736, 705,  674,  643,  612,  581,  550,  519,  488,  457,
-  426,  395,  364,  333, 302,  271,  240,  209,  178,  147,  116,  85,   54,
-  23,   737,  706,  675, 613,  582,  551,  489,  458,  427,  365,  334,  303,
-  241,  210,  179,  117, 86,   55,   738,  707,  614,  583,  490,  459,  366,
-  335,  242,  211,  118, 87,   739,  615,  491,  367,  243,  119,  768,  644,
-  520,  396,  272,  148, 24,   800,  769,  676,  645,  552,  521,  428,  397,
-  304,  273,  180,  149, 56,   25,   832,  801,  770,  708,  677,  646,  584,
-  553,  522,  460,  429, 398,  336,  305,  274,  212,  181,  150,  88,   57,
-  26,   864,  833,  802, 771,  740,  709,  678,  647,  616,  585,  554,  523,
-  492,  461,  430,  399, 368,  337,  306,  275,  244,  213,  182,  151,  120,
-  89,   58,   27,   865, 834,  803,  741,  710,  679,  617,  586,  555,  493,
-  462,  431,  369,  338, 307,  245,  214,  183,  121,  90,   59,   866,  835,
-  742,  711,  618,  587, 494,  463,  370,  339,  246,  215,  122,  91,   867,
-  743,  619,  495,  371, 247,  123,  896,  772,  648,  524,  400,  276,  152,
-  28,   928,  897,  804, 773,  680,  649,  556,  525,  432,  401,  308,  277,
-  184,  153,  60,   29,  960,  929,  898,  836,  805,  774,  712,  681,  650,
-  588,  557,  526,  464, 433,  402,  340,  309,  278,  216,  185,  154,  92,
-  61,   30,   992,  961, 930,  899,  868,  837,  806,  775,  744,  713,  682,
-  651,  620,  589,  558, 527,  496,  465,  434,  403,  372,  341,  310,  279,
-  248,  217,  186,  155, 124,  93,   62,   31,   993,  962,  931,  869,  838,
-  807,  745,  714,  683, 621,  590,  559,  497,  466,  435,  373,  342,  311,
-  249,  218,  187,  125, 94,   63,   994,  963,  870,  839,  746,  715,  622,
-  591,  498,  467,  374, 343,  250,  219,  126,  95,   995,  871,  747,  623,
-  499,  375,  251,  127, 900,  776,  652,  528,  404,  280,  156,  932,  901,
-  808,  777,  684,  653, 560,  529,  436,  405,  312,  281,  188,  157,  964,
-  933,  902,  840,  809, 778,  716,  685,  654,  592,  561,  530,  468,  437,
-  406,  344,  313,  282, 220,  189,  158,  996,  965,  934,  903,  872,  841,
-  810,  779,  748,  717, 686,  655,  624,  593,  562,  531,  500,  469,  438,
-  407,  376,  345,  314, 283,  252,  221,  190,  159,  997,  966,  935,  873,
-  842,  811,  749,  718, 687,  625,  594,  563,  501,  470,  439,  377,  346,
-  315,  253,  222,  191, 998,  967,  874,  843,  750,  719,  626,  595,  502,
-  471,  378,  347,  254, 223,  999,  875,  751,  627,  503,  379,  255,  904,
-  780,  656,  532,  408, 284,  936,  905,  812,  781,  688,  657,  564,  533,
-  440,  409,  316,  285, 968,  937,  906,  844,  813,  782,  720,  689,  658,
-  596,  565,  534,  472, 441,  410,  348,  317,  286,  1000, 969,  938,  907,
-  876,  845,  814,  783, 752,  721,  690,  659,  628,  597,  566,  535,  504,
-  473,  442,  411,  380, 349,  318,  287,  1001, 970,  939,  877,  846,  815,
-  753,  722,  691,  629, 598,  567,  505,  474,  443,  381,  350,  319,  1002,
-  971,  878,  847,  754, 723,  630,  599,  506,  475,  382,  351,  1003, 879,
-  755,  631,  507,  383, 908,  784,  660,  536,  412,  940,  909,  816,  785,
-  692,  661,  568,  537, 444,  413,  972,  941,  910,  848,  817,  786,  724,
-  693,  662,  600,  569, 538,  476,  445,  414,  1004, 973,  942,  911,  880,
-  849,  818,  787,  756, 725,  694,  663,  632,  601,  570,  539,  508,  477,
-  446,  415,  1005, 974, 943,  881,  850,  819,  757,  726,  695,  633,  602,
-  571,  509,  478,  447, 1006, 975,  882,  851,  758,  727,  634,  603,  510,
-  479,  1007, 883,  759, 635,  511,  912,  788,  664,  540,  944,  913,  820,
-  789,  696,  665,  572, 541,  976,  945,  914,  852,  821,  790,  728,  697,
-  666,  604,  573,  542, 1008, 977,  946,  915,  884,  853,  822,  791,  760,
-  729,  698,  667,  636, 605,  574,  543,  1009, 978,  947,  885,  854,  823,
-  761,  730,  699,  637, 606,  575,  1010, 979,  886,  855,  762,  731,  638,
-  607,  1011, 887,  763, 639,  916,  792,  668,  948,  917,  824,  793,  700,
-  669,  980,  949,  918, 856,  825,  794,  732,  701,  670,  1012, 981,  950,
-  919,  888,  857,  826, 795,  764,  733,  702,  671,  1013, 982,  951,  889,
-  858,  827,  765,  734, 703,  1014, 983,  890,  859,  766,  735,  1015, 891,
-  767,  920,  796,  952, 921,  828,  797,  984,  953,  922,  860,  829,  798,
-  1016, 985,  954,  923, 892,  861,  830,  799,  1017, 986,  955,  893,  862,
-  831,  1018, 987,  894, 863,  1019, 895,  924,  956,  925,  988,  957,  926,
-  1020, 989,  958,  927, 1021, 990,  959,  1022, 991,  1023,
-};
-
-// Scan over two rectangular vertical partitions one after the other
-DECLARE_ALIGNED(16, static const int16_t, v2_scan_32x32[1024]) = {
-  0,    1,    32,   33,   2,    64,   34,   65,   66,   3,    96,   35,   97,
-  67,   98,   4,    128,  36,   129,  99,   68,   130,  5,    100,  131,  160,
-  37,   161,  69,   162,  132,  101,  163,  6,    192,  38,   193,  70,   194,
-  133,  164,  102,  195,  7,    224,  39,   165,  225,  134,  196,  71,   226,
-  103,  227,  166,  197,  8,    256,  40,   135,  228,  257,  72,   258,  198,
-  104,  259,  167,  229,  136,  260,  9,    288,  41,   289,  73,   199,  230,
-  290,  168,  261,  105,  291,  137,  292,  231,  10,   200,  262,  320,  42,
-  321,  74,   322,  169,  293,  106,  323,  232,  263,  138,  324,  201,  294,
-  11,   352,  43,   353,  75,   170,  325,  354,  264,  107,  233,  295,  355,
-  202,  326,  139,  356,  12,   384,  44,   265,  296,  385,  171,  357,  76,
-  386,  234,  327,  108,  387,  203,  358,  140,  388,  297,  266,  328,  13,
-  172,  389,  416,  45,   235,  359,  417,  77,   418,  109,  419,  204,  390,
-  298,  329,  141,  267,  360,  420,  236,  391,  173,  421,  14,   448,  46,
-  449,  78,   330,  450,  299,  361,  110,  205,  422,  451,  268,  392,  142,
-  452,  237,  423,  174,  331,  362,  453,  15,   300,  393,  480,  47,   481,
-  79,   482,  206,  454,  269,  424,  111,  483,  143,  484,  363,  332,  394,
-  238,  455,  175,  301,  425,  485,  512,  513,  270,  456,  514,  207,  486,
-  364,  395,  515,  333,  426,  516,  239,  487,  302,  457,  517,  396,  271,
-  488,  544,  365,  427,  545,  518,  546,  334,  458,  547,  519,  548,  303,
-  489,  397,  428,  549,  366,  459,  520,  576,  335,  490,  550,  577,  578,
-  579,  521,  429,  551,  398,  460,  580,  367,  491,  581,  552,  522,  582,
-  608,  609,  430,  461,  610,  399,  492,  553,  611,  583,  523,  612,  613,
-  584,  554,  462,  431,  493,  614,  524,  640,  641,  642,  585,  643,  555,
-  615,  644,  463,  494,  586,  525,  616,  645,  556,  646,  672,  617,  673,
-  587,  674,  647,  495,  675,  526,  676,  557,  618,  648,  677,  588,  678,
-  527,  649,  619,  704,  558,  705,  706,  679,  589,  707,  650,  708,  620,
-  680,  709,  559,  590,  710,  651,  681,  736,  621,  737,  711,  738,  739,
-  682,  652,  740,  712,  591,  741,  622,  683,  713,  742,  653,  768,  769,
-  743,  770,  714,  684,  771,  623,  772,  744,  654,  773,  715,  685,  745,
-  774,  655,  775,  800,  801,  716,  746,  802,  803,  686,  776,  804,  747,
-  805,  717,  777,  806,  687,  748,  807,  778,  832,  833,  718,  834,  835,
-  808,  836,  779,  749,  837,  809,  719,  838,  780,  750,  810,  839,  864,
-  865,  866,  867,  840,  781,  868,  811,  751,  869,  841,  870,  812,  782,
-  842,  871,  896,  897,  898,  872,  899,  813,  843,  900,  783,  901,  873,
-  844,  902,  814,  874,  903,  928,  929,  845,  930,  904,  815,  875,  931,
-  932,  905,  933,  846,  876,  934,  906,  935,  877,  960,  847,  961,  962,
-  907,  936,  963,  964,  937,  878,  965,  908,  966,  938,  967,  909,  879,
-  992,  939,  993,  968,  994,  995,  996,  910,  969,  940,  997,  998,  970,
-  911,  941,  999,  971,  1000, 942,  1001, 972,  1002, 943,  973,  1003, 974,
-  1004, 975,  1005, 1006, 1007, 16,   48,   80,   112,  144,  176,  17,   49,
-  208,  81,   113,  145,  240,  177,  272,  18,   50,   209,  82,   114,  304,
-  241,  146,  178,  273,  336,  210,  19,   51,   83,   115,  305,  242,  147,
-  368,  179,  274,  337,  211,  20,   400,  52,   84,   306,  116,  243,  369,
-  148,  338,  180,  275,  432,  401,  212,  21,   53,   307,  85,   370,  244,
-  117,  464,  149,  433,  339,  276,  181,  402,  213,  308,  496,  371,  22,
-  54,   465,  86,   245,  118,  434,  150,  340,  277,  403,  182,  528,  497,
-  214,  466,  372,  309,  23,   55,   435,  87,   246,  119,  341,  404,  151,
-  529,  560,  278,  498,  183,  467,  373,  215,  310,  436,  24,   56,   247,
-  561,  88,   530,  592,  342,  120,  405,  499,  152,  279,  468,  184,  374,
-  311,  437,  216,  562,  593,  531,  624,  25,   248,  500,  57,   406,  89,
-  343,  121,  469,  280,  153,  594,  185,  375,  563,  625,  438,  532,  656,
-  312,  217,  501,  407,  249,  26,   344,  58,   90,   470,  122,  595,  626,
-  281,  564,  657,  154,  376,  533,  688,  439,  186,  313,  502,  218,  408,
-  627,  596,  658,  250,  345,  471,  27,   59,   565,  689,  91,   123,  282,
-  534,  720,  155,  440,  377,  187,  503,  314,  628,  659,  219,  597,  690,
-  409,  472,  566,  721,  346,  251,  28,   60,   535,  752,  92,   124,  283,
-  441,  378,  156,  660,  504,  629,  691,  598,  722,  188,  315,  567,  753,
-  220,  410,  473,  347,  536,  784,  252,  29,   661,  692,  61,   93,   442,
-  630,  723,  284,  125,  379,  505,  599,  754,  157,  316,  568,  785,  189,
-  474,  411,  221,  537,  816,  693,  348,  662,  724,  253,  631,  755,  443,
-  30,   600,  786,  62,   506,  94,   285,  380,  126,  569,  817,  158,  317,
-  190,  475,  694,  725,  412,  663,  756,  538,  848,  222,  632,  787,  349,
-  254,  601,  818,  444,  507,  31,   63,   381,  286,  95,   570,  849,  726,
-  127,  695,  757,  664,  788,  159,  476,  318,  413,  539,  880,  191,  633,
-  819,  223,  350,  602,  850,  508,  255,  445,  727,  758,  696,  789,  571,
-  881,  382,  287,  665,  820,  477,  634,  851,  540,  912,  319,  414,  603,
-  882,  759,  728,  790,  351,  509,  697,  821,  446,  572,  913,  666,  852,
-  383,  635,  883,  478,  541,  944,  415,  760,  791,  604,  914,  729,  822,
-  698,  853,  510,  667,  884,  447,  573,  945,  636,  915,  792,  761,  823,
-  542,  976,  479,  730,  854,  605,  946,  699,  885,  668,  916,  511,  574,
-  977,  793,  824,  637,  947,  762,  855,  731,  886,  543,  1008, 606,  978,
-  700,  917,  669,  948,  575,  825,  1009, 794,  856,  763,  887,  638,  979,
-  732,  918,  701,  949,  607,  1010, 670,  980,  826,  857,  795,  888,  764,
-  919,  639,  1011, 733,  950,  702,  981,  858,  827,  889,  796,  920,  671,
-  1012, 765,  951,  734,  982,  703,  1013, 859,  890,  828,  921,  797,  952,
-  766,  983,  735,  1014, 891,  860,  922,  829,  953,  798,  984,  767,  1015,
-  892,  923,  861,  954,  830,  985,  799,  1016, 924,  893,  955,  862,  986,
-  831,  1017, 925,  956,  894,  987,  863,  1018, 957,  926,  988,  895,  1019,
-  958,  989,  927,  1020, 990,  959,  1021, 991,  1022, 1023,
-};
-
-// Scan over two rectangular horizontal partitions one after the other
-DECLARE_ALIGNED(16, static const int16_t, h2_scan_32x32[1024]) = {
-  0,    1,    32,   33,   2,    64,   34,   65,   66,   3,    96,   35,   97,
-  67,   98,   4,    128,  36,   129,  99,   68,   130,  5,    100,  131,  160,
-  37,   161,  69,   162,  132,  101,  163,  6,    192,  38,   193,  70,   194,
-  133,  164,  102,  195,  7,    224,  39,   165,  225,  134,  196,  71,   226,
-  103,  227,  166,  197,  8,    256,  40,   135,  228,  257,  72,   258,  198,
-  104,  259,  167,  229,  136,  260,  9,    288,  41,   289,  73,   199,  230,
-  290,  168,  261,  105,  291,  137,  292,  231,  10,   200,  262,  320,  42,
-  321,  74,   322,  169,  293,  106,  323,  232,  263,  138,  324,  201,  294,
-  11,   352,  43,   353,  75,   170,  325,  354,  264,  107,  233,  295,  355,
-  202,  326,  139,  356,  12,   384,  44,   265,  296,  385,  171,  357,  76,
-  386,  234,  327,  108,  387,  203,  358,  140,  388,  297,  266,  328,  13,
-  172,  389,  416,  45,   235,  359,  417,  77,   418,  109,  419,  204,  390,
-  298,  329,  141,  267,  360,  420,  236,  391,  173,  421,  14,   448,  46,
-  449,  78,   330,  450,  299,  361,  110,  205,  422,  451,  268,  392,  142,
-  452,  237,  423,  174,  331,  362,  453,  15,   300,  393,  480,  47,   481,
-  79,   482,  206,  454,  269,  424,  111,  483,  143,  484,  363,  332,  394,
-  238,  455,  175,  301,  425,  485,  16,   48,   80,   270,  456,  207,  486,
-  112,  364,  395,  333,  426,  144,  239,  487,  302,  457,  176,  396,  17,
-  271,  488,  49,   365,  427,  208,  81,   334,  458,  113,  145,  240,  303,
-  489,  397,  428,  177,  366,  459,  272,  18,   50,   209,  335,  490,  82,
-  114,  304,  241,  429,  146,  398,  460,  367,  491,  178,  273,  336,  210,
-  19,   51,   83,   430,  461,  399,  492,  115,  305,  242,  147,  368,  179,
-  274,  337,  462,  431,  493,  211,  20,   400,  52,   84,   306,  116,  243,
-  369,  148,  463,  494,  338,  180,  275,  432,  401,  212,  21,   53,   307,
-  85,   370,  244,  117,  495,  464,  149,  433,  339,  276,  181,  402,  213,
-  308,  496,  371,  22,   54,   465,  86,   245,  118,  434,  150,  340,  277,
-  403,  182,  497,  214,  466,  372,  309,  23,   55,   435,  87,   246,  119,
-  341,  404,  151,  278,  498,  183,  467,  373,  215,  310,  436,  24,   56,
-  247,  88,   342,  120,  405,  499,  152,  279,  468,  184,  374,  311,  437,
-  216,  25,   248,  500,  57,   406,  89,   343,  121,  469,  280,  153,  185,
-  375,  438,  312,  217,  501,  407,  249,  26,   344,  58,   90,   470,  122,
-  281,  154,  376,  439,  186,  313,  502,  218,  408,  250,  345,  471,  27,
-  59,   91,   123,  282,  155,  440,  377,  187,  503,  314,  219,  409,  472,
-  346,  251,  28,   60,   92,   124,  283,  441,  378,  156,  504,  188,  315,
-  220,  410,  473,  347,  252,  29,   61,   93,   442,  284,  125,  379,  505,
-  157,  316,  189,  474,  411,  221,  348,  253,  443,  30,   62,   506,  94,
-  285,  380,  126,  158,  317,  190,  475,  412,  222,  349,  254,  444,  507,
-  31,   63,   381,  286,  95,   127,  159,  476,  318,  413,  191,  223,  350,
-  508,  255,  445,  382,  287,  477,  319,  414,  351,  509,  446,  383,  478,
-  415,  510,  447,  479,  511,  512,  513,  514,  515,  516,  517,  544,  545,
-  518,  546,  547,  519,  548,  549,  520,  576,  550,  577,  578,  579,  521,
-  551,  580,  581,  552,  522,  582,  608,  609,  610,  553,  611,  583,  523,
-  612,  613,  584,  554,  614,  524,  640,  641,  642,  585,  643,  555,  615,
-  644,  586,  525,  616,  645,  556,  646,  672,  617,  673,  587,  674,  647,
-  675,  526,  676,  557,  618,  648,  677,  588,  678,  527,  649,  619,  704,
-  558,  705,  706,  679,  589,  707,  650,  708,  620,  680,  709,  528,  559,
-  590,  710,  651,  681,  736,  621,  737,  711,  738,  739,  682,  652,  529,
-  560,  740,  712,  591,  741,  622,  683,  713,  742,  653,  768,  769,  561,
-  743,  530,  592,  770,  714,  684,  771,  623,  772,  744,  654,  773,  715,
-  685,  745,  774,  562,  593,  531,  624,  655,  775,  800,  801,  716,  746,
-  802,  803,  686,  776,  804,  594,  563,  625,  747,  805,  717,  532,  656,
-  777,  806,  687,  748,  807,  778,  832,  833,  718,  834,  595,  626,  835,
-  564,  657,  808,  836,  533,  688,  779,  749,  837,  809,  719,  838,  780,
-  627,  596,  658,  750,  810,  839,  864,  565,  689,  865,  866,  867,  534,
-  720,  840,  781,  868,  811,  751,  869,  841,  628,  659,  597,  690,  870,
-  812,  782,  566,  721,  842,  871,  896,  535,  752,  897,  898,  872,  899,
-  813,  843,  660,  900,  783,  629,  691,  598,  722,  901,  873,  567,  753,
-  844,  902,  814,  874,  536,  784,  903,  661,  692,  928,  929,  630,  723,
-  845,  930,  904,  815,  875,  931,  599,  754,  932,  568,  785,  905,  933,
-  846,  876,  934,  537,  816,  693,  662,  724,  906,  631,  755,  935,  877,
-  600,  786,  960,  847,  961,  962,  907,  936,  963,  569,  817,  964,  937,
-  694,  725,  878,  965,  908,  663,  756,  538,  848,  966,  632,  787,  938,
-  601,  818,  967,  909,  879,  992,  939,  993,  968,  570,  849,  994,  726,
-  695,  757,  995,  664,  788,  996,  910,  969,  539,  880,  940,  633,  819,
-  997,  998,  602,  850,  970,  911,  941,  999,  727,  758,  696,  789,  571,
-  881,  971,  665,  820,  1000, 634,  851,  942,  540,  912,  1001, 972,  603,
-  882,  759,  728,  790,  1002, 697,  821,  943,  973,  572,  913,  666,  852,
-  1003, 635,  883,  974,  541,  944,  760,  791,  1004, 604,  914,  729,  822,
-  698,  853,  975,  667,  884,  573,  945,  1005, 636,  915,  792,  761,  823,
-  542,  976,  1006, 730,  854,  605,  946,  699,  885,  668,  916,  1007, 574,
-  977,  793,  824,  637,  947,  762,  855,  731,  886,  543,  1008, 606,  978,
-  700,  917,  669,  948,  575,  825,  1009, 794,  856,  763,  887,  638,  979,
-  732,  918,  701,  949,  607,  1010, 670,  980,  826,  857,  795,  888,  764,
-  919,  639,  1011, 733,  950,  702,  981,  858,  827,  889,  796,  920,  671,
-  1012, 765,  951,  734,  982,  703,  1013, 859,  890,  828,  921,  797,  952,
-  766,  983,  735,  1014, 891,  860,  922,  829,  953,  798,  984,  767,  1015,
-  892,  923,  861,  954,  830,  985,  799,  1016, 924,  893,  955,  862,  986,
-  831,  1017, 925,  956,  894,  987,  863,  1018, 957,  926,  988,  895,  1019,
-  958,  989,  927,  1020, 990,  959,  1021, 991,  1022, 1023,
-};
-
-// Scan where the top left quarter is scanned first
-DECLARE_ALIGNED(16, static const int16_t, qtr_scan_32x32[1024]) = {
-  0,    1,    32,   33,   2,    64,   34,   65,   66,   3,    96,   35,   97,
-  67,   98,   4,    128,  36,   129,  99,   68,   130,  5,    100,  131,  160,
-  37,   161,  69,   162,  132,  101,  163,  6,    192,  38,   193,  70,   194,
-  133,  164,  102,  195,  7,    224,  39,   165,  225,  134,  196,  71,   226,
-  103,  227,  166,  197,  8,    256,  40,   135,  228,  257,  72,   258,  198,
-  104,  259,  167,  229,  136,  260,  9,    288,  41,   289,  73,   199,  230,
-  290,  168,  261,  105,  291,  137,  292,  231,  10,   200,  262,  320,  42,
-  321,  74,   322,  169,  293,  106,  323,  232,  263,  138,  324,  201,  294,
-  11,   352,  43,   353,  75,   170,  325,  354,  264,  107,  233,  295,  355,
-  202,  326,  139,  356,  12,   384,  44,   265,  296,  385,  171,  357,  76,
-  386,  234,  327,  108,  387,  203,  358,  140,  388,  297,  266,  328,  13,
-  172,  389,  416,  45,   235,  359,  417,  77,   418,  109,  419,  204,  390,
-  298,  329,  141,  267,  360,  420,  236,  391,  173,  421,  14,   448,  46,
-  449,  78,   330,  450,  299,  361,  110,  205,  422,  451,  268,  392,  142,
-  452,  237,  423,  174,  331,  362,  453,  15,   300,  393,  480,  47,   481,
-  79,   482,  206,  454,  269,  424,  111,  483,  143,  484,  363,  332,  394,
-  238,  455,  175,  301,  425,  485,  270,  456,  207,  486,  364,  395,  333,
-  426,  239,  487,  302,  457,  396,  271,  488,  365,  427,  334,  458,  303,
-  489,  397,  428,  366,  459,  335,  490,  429,  398,  460,  367,  491,  430,
-  461,  399,  492,  462,  431,  493,  463,  494,  495,  16,   512,  48,   513,
-  80,   514,  112,  515,  144,  516,  176,  517,  17,   544,  49,   545,  208,
-  518,  81,   546,  113,  547,  145,  240,  519,  548,  177,  549,  272,  520,
-  18,   576,  50,   209,  550,  577,  82,   578,  114,  579,  304,  521,  241,
-  551,  146,  580,  178,  581,  273,  552,  336,  522,  210,  582,  19,   608,
-  51,   609,  83,   610,  115,  305,  553,  611,  242,  583,  147,  368,  523,
-  612,  179,  613,  274,  584,  337,  554,  211,  614,  20,   400,  524,  640,
-  52,   641,  84,   642,  306,  585,  116,  643,  243,  369,  555,  615,  148,
-  644,  338,  586,  180,  275,  432,  525,  616,  645,  401,  556,  212,  646,
-  21,   672,  53,   307,  617,  673,  85,   370,  587,  674,  244,  647,  117,
-  675,  464,  526,  149,  676,  433,  557,  339,  618,  276,  648,  181,  677,
-  402,  588,  213,  678,  308,  496,  527,  649,  371,  619,  22,   704,  54,
-  465,  558,  705,  86,   706,  245,  679,  118,  434,  589,  707,  150,  340,
-  650,  708,  277,  403,  620,  680,  182,  709,  528,  497,  559,  214,  466,
-  590,  710,  372,  651,  309,  681,  23,   736,  55,   435,  621,  737,  87,
-  246,  711,  738,  119,  739,  341,  682,  404,  652,  151,  529,  560,  740,
-  278,  712,  498,  591,  183,  741,  467,  622,  373,  683,  215,  310,  713,
-  742,  436,  653,  24,   768,  56,   769,  247,  561,  743,  88,   530,  592,
-  770,  342,  714,  120,  405,  684,  771,  499,  623,  152,  772,  279,  744,
-  468,  654,  184,  773,  374,  715,  311,  437,  685,  745,  216,  774,  562,
-  593,  531,  624,  25,   248,  500,  655,  775,  800,  57,   801,  406,  716,
-  89,   343,  746,  802,  121,  803,  469,  686,  280,  776,  153,  804,  594,
-  185,  375,  563,  625,  747,  805,  438,  717,  532,  656,  312,  777,  217,
-  806,  501,  687,  407,  748,  249,  807,  26,   344,  778,  832,  58,   833,
-  90,   470,  718,  834,  122,  595,  626,  835,  281,  564,  657,  808,  154,
-  836,  376,  533,  688,  779,  439,  749,  186,  837,  313,  809,  502,  719,
-  218,  838,  408,  780,  627,  596,  658,  250,  345,  471,  750,  810,  839,
-  27,   864,  59,   565,  689,  865,  91,   866,  123,  867,  282,  534,  720,
-  840,  155,  440,  781,  868,  377,  811,  187,  503,  751,  869,  314,  841,
-  628,  659,  219,  597,  690,  870,  409,  812,  472,  782,  566,  721,  346,
-  842,  251,  871,  28,   896,  60,   535,  752,  897,  92,   898,  124,  283,
-  872,  899,  441,  813,  378,  843,  156,  660,  900,  504,  783,  629,  691,
-  598,  722,  188,  901,  315,  873,  567,  753,  220,  410,  844,  902,  473,
-  814,  347,  874,  536,  784,  252,  903,  29,   661,  692,  928,  61,   929,
-  93,   442,  630,  723,  845,  930,  284,  904,  125,  379,  505,  815,  875,
-  931,  599,  754,  157,  932,  316,  568,  785,  905,  189,  933,  474,  846,
-  411,  876,  221,  934,  537,  816,  693,  348,  662,  724,  906,  253,  631,
-  755,  935,  443,  877,  30,   600,  786,  960,  62,   506,  847,  961,  94,
-  962,  285,  380,  907,  936,  126,  963,  569,  817,  158,  964,  317,  937,
-  190,  475,  694,  725,  878,  965,  412,  908,  663,  756,  538,  848,  222,
-  966,  632,  787,  349,  938,  254,  601,  818,  967,  444,  909,  507,  879,
-  31,   992,  63,   381,  939,  993,  286,  968,  95,   570,  849,  994,  726,
-  127,  695,  757,  995,  664,  788,  159,  996,  476,  910,  318,  969,  413,
-  539,  880,  940,  191,  633,  819,  997,  223,  998,  350,  602,  850,  970,
-  508,  911,  255,  445,  941,  999,  727,  758,  696,  789,  571,  881,  382,
-  971,  287,  665,  820,  1000, 477,  634,  851,  942,  540,  912,  319,  1001,
-  414,  972,  603,  882,  759,  728,  790,  351,  1002, 509,  697,  821,  943,
-  446,  973,  572,  913,  666,  852,  383,  1003, 635,  883,  478,  974,  541,
-  944,  415,  760,  791,  1004, 604,  914,  729,  822,  698,  853,  510,  975,
-  667,  884,  447,  573,  945,  1005, 636,  915,  792,  761,  823,  542,  976,
-  479,  1006, 730,  854,  605,  946,  699,  885,  668,  916,  511,  1007, 574,
-  977,  793,  824,  637,  947,  762,  855,  731,  886,  543,  1008, 606,  978,
-  700,  917,  669,  948,  575,  825,  1009, 794,  856,  763,  887,  638,  979,
-  732,  918,  701,  949,  607,  1010, 670,  980,  826,  857,  795,  888,  764,
-  919,  639,  1011, 733,  950,  702,  981,  858,  827,  889,  796,  920,  671,
-  1012, 765,  951,  734,  982,  703,  1013, 859,  890,  828,  921,  797,  952,
-  766,  983,  735,  1014, 891,  860,  922,  829,  953,  798,  984,  767,  1015,
-  892,  923,  861,  954,  830,  985,  799,  1016, 924,  893,  955,  862,  986,
-  831,  1017, 925,  956,  894,  987,  863,  1018, 957,  926,  988,  895,  1019,
-  958,  989,  927,  1020, 990,  959,  1021, 991,  1022, 1023,
-};
-
-#if CONFIG_TX64X64
-DECLARE_ALIGNED(16, static const int16_t, default_scan_32x64[2048]) = {
-  0,    1,    32,   2,    33,   64,   3,    34,   65,   96,   4,    35,   66,
-  97,   128,  5,    36,   67,   98,   129,  160,  6,    37,   68,   99,   130,
-  161,  192,  7,    38,   69,   100,  131,  162,  193,  224,  8,    39,   70,
-  101,  132,  163,  194,  225,  256,  9,    40,   71,   102,  133,  164,  195,
-  226,  257,  288,  10,   41,   72,   103,  134,  165,  196,  227,  258,  289,
-  320,  11,   42,   73,   104,  135,  166,  197,  228,  259,  290,  321,  352,
-  12,   43,   74,   105,  136,  167,  198,  229,  260,  291,  322,  353,  384,
-  13,   44,   75,   106,  137,  168,  199,  230,  261,  292,  323,  354,  385,
-  416,  14,   45,   76,   107,  138,  169,  200,  231,  262,  293,  324,  355,
-  386,  417,  448,  15,   46,   77,   108,  139,  170,  201,  232,  263,  294,
-  325,  356,  387,  418,  449,  480,  16,   47,   78,   109,  140,  171,  202,
-  233,  264,  295,  326,  357,  388,  419,  450,  481,  512,  17,   48,   79,
-  110,  141,  172,  203,  234,  265,  296,  327,  358,  389,  420,  451,  482,
-  513,  544,  18,   49,   80,   111,  142,  173,  204,  235,  266,  297,  328,
-  359,  390,  421,  452,  483,  514,  545,  576,  19,   50,   81,   112,  143,
-  174,  205,  236,  267,  298,  329,  360,  391,  422,  453,  484,  515,  546,
-  577,  608,  20,   51,   82,   113,  144,  175,  206,  237,  268,  299,  330,
-  361,  392,  423,  454,  485,  516,  547,  578,  609,  640,  21,   52,   83,
-  114,  145,  176,  207,  238,  269,  300,  331,  362,  393,  424,  455,  486,
-  517,  548,  579,  610,  641,  672,  22,   53,   84,   115,  146,  177,  208,
-  239,  270,  301,  332,  363,  394,  425,  456,  487,  518,  549,  580,  611,
-  642,  673,  704,  23,   54,   85,   116,  147,  178,  209,  240,  271,  302,
-  333,  364,  395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,
-  736,  24,   55,   86,   117,  148,  179,  210,  241,  272,  303,  334,  365,
-  396,  427,  458,  489,  520,  551,  582,  613,  644,  675,  706,  737,  768,
-  25,   56,   87,   118,  149,  180,  211,  242,  273,  304,  335,  366,  397,
-  428,  459,  490,  521,  552,  583,  614,  645,  676,  707,  738,  769,  800,
-  26,   57,   88,   119,  150,  181,  212,  243,  274,  305,  336,  367,  398,
-  429,  460,  491,  522,  553,  584,  615,  646,  677,  708,  739,  770,  801,
-  832,  27,   58,   89,   120,  151,  182,  213,  244,  275,  306,  337,  368,
-  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,  709,  740,  771,
-  802,  833,  864,  28,   59,   90,   121,  152,  183,  214,  245,  276,  307,
-  338,  369,  400,  431,  462,  493,  524,  555,  586,  617,  648,  679,  710,
-  741,  772,  803,  834,  865,  896,  29,   60,   91,   122,  153,  184,  215,
-  246,  277,  308,  339,  370,  401,  432,  463,  494,  525,  556,  587,  618,
-  649,  680,  711,  742,  773,  804,  835,  866,  897,  928,  30,   61,   92,
-  123,  154,  185,  216,  247,  278,  309,  340,  371,  402,  433,  464,  495,
-  526,  557,  588,  619,  650,  681,  712,  743,  774,  805,  836,  867,  898,
-  929,  960,  31,   62,   93,   124,  155,  186,  217,  248,  279,  310,  341,
-  372,  403,  434,  465,  496,  527,  558,  589,  620,  651,  682,  713,  744,
-  775,  806,  837,  868,  899,  930,  961,  992,  63,   94,   125,  156,  187,
-  218,  249,  280,  311,  342,  373,  404,  435,  466,  497,  528,  559,  590,
-  621,  652,  683,  714,  745,  776,  807,  838,  869,  900,  931,  962,  993,
-  1024, 95,   126,  157,  188,  219,  250,  281,  312,  343,  374,  405,  436,
-  467,  498,  529,  560,  591,  622,  653,  684,  715,  746,  777,  808,  839,
-  870,  901,  932,  963,  994,  1025, 1056, 127,  158,  189,  220,  251,  282,
-  313,  344,  375,  406,  437,  468,  499,  530,  561,  592,  623,  654,  685,
-  716,  747,  778,  809,  840,  871,  902,  933,  964,  995,  1026, 1057, 1088,
-  159,  190,  221,  252,  283,  314,  345,  376,  407,  438,  469,  500,  531,
-  562,  593,  624,  655,  686,  717,  748,  779,  810,  841,  872,  903,  934,
-  965,  996,  1027, 1058, 1089, 1120, 191,  222,  253,  284,  315,  346,  377,
-  408,  439,  470,  501,  532,  563,  594,  625,  656,  687,  718,  749,  780,
-  811,  842,  873,  904,  935,  966,  997,  1028, 1059, 1090, 1121, 1152, 223,
-  254,  285,  316,  347,  378,  409,  440,  471,  502,  533,  564,  595,  626,
-  657,  688,  719,  750,  781,  812,  843,  874,  905,  936,  967,  998,  1029,
-  1060, 1091, 1122, 1153, 1184, 255,  286,  317,  348,  379,  410,  441,  472,
-  503,  534,  565,  596,  627,  658,  689,  720,  751,  782,  813,  844,  875,
-  906,  937,  968,  999,  1030, 1061, 1092, 1123, 1154, 1185, 1216, 287,  318,
-  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,  659,  690,  721,
-  752,  783,  814,  845,  876,  907,  938,  969,  1000, 1031, 1062, 1093, 1124,
-  1155, 1186, 1217, 1248, 319,  350,  381,  412,  443,  474,  505,  536,  567,
-  598,  629,  660,  691,  722,  753,  784,  815,  846,  877,  908,  939,  970,
-  1001, 1032, 1063, 1094, 1125, 1156, 1187, 1218, 1249, 1280, 351,  382,  413,
-  444,  475,  506,  537,  568,  599,  630,  661,  692,  723,  754,  785,  816,
-  847,  878,  909,  940,  971,  1002, 1033, 1064, 1095, 1126, 1157, 1188, 1219,
-  1250, 1281, 1312, 383,  414,  445,  476,  507,  538,  569,  600,  631,  662,
-  693,  724,  755,  786,  817,  848,  879,  910,  941,  972,  1003, 1034, 1065,
-  1096, 1127, 1158, 1189, 1220, 1251, 1282, 1313, 1344, 415,  446,  477,  508,
-  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,
-  942,  973,  1004, 1035, 1066, 1097, 1128, 1159, 1190, 1221, 1252, 1283, 1314,
-  1345, 1376, 447,  478,  509,  540,  571,  602,  633,  664,  695,  726,  757,
-  788,  819,  850,  881,  912,  943,  974,  1005, 1036, 1067, 1098, 1129, 1160,
-  1191, 1222, 1253, 1284, 1315, 1346, 1377, 1408, 479,  510,  541,  572,  603,
-  634,  665,  696,  727,  758,  789,  820,  851,  882,  913,  944,  975,  1006,
-  1037, 1068, 1099, 1130, 1161, 1192, 1223, 1254, 1285, 1316, 1347, 1378, 1409,
-  1440, 511,  542,  573,  604,  635,  666,  697,  728,  759,  790,  821,  852,
-  883,  914,  945,  976,  1007, 1038, 1069, 1100, 1131, 1162, 1193, 1224, 1255,
-  1286, 1317, 1348, 1379, 1410, 1441, 1472, 543,  574,  605,  636,  667,  698,
-  729,  760,  791,  822,  853,  884,  915,  946,  977,  1008, 1039, 1070, 1101,
-  1132, 1163, 1194, 1225, 1256, 1287, 1318, 1349, 1380, 1411, 1442, 1473, 1504,
-  575,  606,  637,  668,  699,  730,  761,  792,  823,  854,  885,  916,  947,
-  978,  1009, 1040, 1071, 1102, 1133, 1164, 1195, 1226, 1257, 1288, 1319, 1350,
-  1381, 1412, 1443, 1474, 1505, 1536, 607,  638,  669,  700,  731,  762,  793,
-  824,  855,  886,  917,  948,  979,  1010, 1041, 1072, 1103, 1134, 1165, 1196,
-  1227, 1258, 1289, 1320, 1351, 1382, 1413, 1444, 1475, 1506, 1537, 1568, 639,
-  670,  701,  732,  763,  794,  825,  856,  887,  918,  949,  980,  1011, 1042,
-  1073, 1104, 1135, 1166, 1197, 1228, 1259, 1290, 1321, 1352, 1383, 1414, 1445,
-  1476, 1507, 1538, 1569, 1600, 671,  702,  733,  764,  795,  826,  857,  888,
-  919,  950,  981,  1012, 1043, 1074, 1105, 1136, 1167, 1198, 1229, 1260, 1291,
-  1322, 1353, 1384, 1415, 1446, 1477, 1508, 1539, 1570, 1601, 1632, 703,  734,
-  765,  796,  827,  858,  889,  920,  951,  982,  1013, 1044, 1075, 1106, 1137,
-  1168, 1199, 1230, 1261, 1292, 1323, 1354, 1385, 1416, 1447, 1478, 1509, 1540,
-  1571, 1602, 1633, 1664, 735,  766,  797,  828,  859,  890,  921,  952,  983,
-  1014, 1045, 1076, 1107, 1138, 1169, 1200, 1231, 1262, 1293, 1324, 1355, 1386,
-  1417, 1448, 1479, 1510, 1541, 1572, 1603, 1634, 1665, 1696, 767,  798,  829,
-  860,  891,  922,  953,  984,  1015, 1046, 1077, 1108, 1139, 1170, 1201, 1232,
-  1263, 1294, 1325, 1356, 1387, 1418, 1449, 1480, 1511, 1542, 1573, 1604, 1635,
-  1666, 1697, 1728, 799,  830,  861,  892,  923,  954,  985,  1016, 1047, 1078,
-  1109, 1140, 1171, 1202, 1233, 1264, 1295, 1326, 1357, 1388, 1419, 1450, 1481,
-  1512, 1543, 1574, 1605, 1636, 1667, 1698, 1729, 1760, 831,  862,  893,  924,
-  955,  986,  1017, 1048, 1079, 1110, 1141, 1172, 1203, 1234, 1265, 1296, 1327,
-  1358, 1389, 1420, 1451, 1482, 1513, 1544, 1575, 1606, 1637, 1668, 1699, 1730,
-  1761, 1792, 863,  894,  925,  956,  987,  1018, 1049, 1080, 1111, 1142, 1173,
-  1204, 1235, 1266, 1297, 1328, 1359, 1390, 1421, 1452, 1483, 1514, 1545, 1576,
-  1607, 1638, 1669, 1700, 1731, 1762, 1793, 1824, 895,  926,  957,  988,  1019,
-  1050, 1081, 1112, 1143, 1174, 1205, 1236, 1267, 1298, 1329, 1360, 1391, 1422,
-  1453, 1484, 1515, 1546, 1577, 1608, 1639, 1670, 1701, 1732, 1763, 1794, 1825,
-  1856, 927,  958,  989,  1020, 1051, 1082, 1113, 1144, 1175, 1206, 1237, 1268,
-  1299, 1330, 1361, 1392, 1423, 1454, 1485, 1516, 1547, 1578, 1609, 1640, 1671,
-  1702, 1733, 1764, 1795, 1826, 1857, 1888, 959,  990,  1021, 1052, 1083, 1114,
-  1145, 1176, 1207, 1238, 1269, 1300, 1331, 1362, 1393, 1424, 1455, 1486, 1517,
-  1548, 1579, 1610, 1641, 1672, 1703, 1734, 1765, 1796, 1827, 1858, 1889, 1920,
-  991,  1022, 1053, 1084, 1115, 1146, 1177, 1208, 1239, 1270, 1301, 1332, 1363,
-  1394, 1425, 1456, 1487, 1518, 1549, 1580, 1611, 1642, 1673, 1704, 1735, 1766,
-  1797, 1828, 1859, 1890, 1921, 1952, 1023, 1054, 1085, 1116, 1147, 1178, 1209,
-  1240, 1271, 1302, 1333, 1364, 1395, 1426, 1457, 1488, 1519, 1550, 1581, 1612,
-  1643, 1674, 1705, 1736, 1767, 1798, 1829, 1860, 1891, 1922, 1953, 1984, 1055,
-  1086, 1117, 1148, 1179, 1210, 1241, 1272, 1303, 1334, 1365, 1396, 1427, 1458,
-  1489, 1520, 1551, 1582, 1613, 1644, 1675, 1706, 1737, 1768, 1799, 1830, 1861,
-  1892, 1923, 1954, 1985, 2016, 1087, 1118, 1149, 1180, 1211, 1242, 1273, 1304,
-  1335, 1366, 1397, 1428, 1459, 1490, 1521, 1552, 1583, 1614, 1645, 1676, 1707,
-  1738, 1769, 1800, 1831, 1862, 1893, 1924, 1955, 1986, 2017, 1119, 1150, 1181,
-  1212, 1243, 1274, 1305, 1336, 1367, 1398, 1429, 1460, 1491, 1522, 1553, 1584,
-  1615, 1646, 1677, 1708, 1739, 1770, 1801, 1832, 1863, 1894, 1925, 1956, 1987,
-  2018, 1151, 1182, 1213, 1244, 1275, 1306, 1337, 1368, 1399, 1430, 1461, 1492,
-  1523, 1554, 1585, 1616, 1647, 1678, 1709, 1740, 1771, 1802, 1833, 1864, 1895,
-  1926, 1957, 1988, 2019, 1183, 1214, 1245, 1276, 1307, 1338, 1369, 1400, 1431,
-  1462, 1493, 1524, 1555, 1586, 1617, 1648, 1679, 1710, 1741, 1772, 1803, 1834,
-  1865, 1896, 1927, 1958, 1989, 2020, 1215, 1246, 1277, 1308, 1339, 1370, 1401,
-  1432, 1463, 1494, 1525, 1556, 1587, 1618, 1649, 1680, 1711, 1742, 1773, 1804,
-  1835, 1866, 1897, 1928, 1959, 1990, 2021, 1247, 1278, 1309, 1340, 1371, 1402,
-  1433, 1464, 1495, 1526, 1557, 1588, 1619, 1650, 1681, 1712, 1743, 1774, 1805,
-  1836, 1867, 1898, 1929, 1960, 1991, 2022, 1279, 1310, 1341, 1372, 1403, 1434,
-  1465, 1496, 1527, 1558, 1589, 1620, 1651, 1682, 1713, 1744, 1775, 1806, 1837,
-  1868, 1899, 1930, 1961, 1992, 2023, 1311, 1342, 1373, 1404, 1435, 1466, 1497,
-  1528, 1559, 1590, 1621, 1652, 1683, 1714, 1745, 1776, 1807, 1838, 1869, 1900,
-  1931, 1962, 1993, 2024, 1343, 1374, 1405, 1436, 1467, 1498, 1529, 1560, 1591,
-  1622, 1653, 1684, 1715, 1746, 1777, 1808, 1839, 1870, 1901, 1932, 1963, 1994,
-  2025, 1375, 1406, 1437, 1468, 1499, 1530, 1561, 1592, 1623, 1654, 1685, 1716,
-  1747, 1778, 1809, 1840, 1871, 1902, 1933, 1964, 1995, 2026, 1407, 1438, 1469,
-  1500, 1531, 1562, 1593, 1624, 1655, 1686, 1717, 1748, 1779, 1810, 1841, 1872,
-  1903, 1934, 1965, 1996, 2027, 1439, 1470, 1501, 1532, 1563, 1594, 1625, 1656,
-  1687, 1718, 1749, 1780, 1811, 1842, 1873, 1904, 1935, 1966, 1997, 2028, 1471,
-  1502, 1533, 1564, 1595, 1626, 1657, 1688, 1719, 1750, 1781, 1812, 1843, 1874,
-  1905, 1936, 1967, 1998, 2029, 1503, 1534, 1565, 1596, 1627, 1658, 1689, 1720,
-  1751, 1782, 1813, 1844, 1875, 1906, 1937, 1968, 1999, 2030, 1535, 1566, 1597,
-  1628, 1659, 1690, 1721, 1752, 1783, 1814, 1845, 1876, 1907, 1938, 1969, 2000,
-  2031, 1567, 1598, 1629, 1660, 1691, 1722, 1753, 1784, 1815, 1846, 1877, 1908,
-  1939, 1970, 2001, 2032, 1599, 1630, 1661, 1692, 1723, 1754, 1785, 1816, 1847,
-  1878, 1909, 1940, 1971, 2002, 2033, 1631, 1662, 1693, 1724, 1755, 1786, 1817,
-  1848, 1879, 1910, 1941, 1972, 2003, 2034, 1663, 1694, 1725, 1756, 1787, 1818,
-  1849, 1880, 1911, 1942, 1973, 2004, 2035, 1695, 1726, 1757, 1788, 1819, 1850,
-  1881, 1912, 1943, 1974, 2005, 2036, 1727, 1758, 1789, 1820, 1851, 1882, 1913,
-  1944, 1975, 2006, 2037, 1759, 1790, 1821, 1852, 1883, 1914, 1945, 1976, 2007,
-  2038, 1791, 1822, 1853, 1884, 1915, 1946, 1977, 2008, 2039, 1823, 1854, 1885,
-  1916, 1947, 1978, 2009, 2040, 1855, 1886, 1917, 1948, 1979, 2010, 2041, 1887,
-  1918, 1949, 1980, 2011, 2042, 1919, 1950, 1981, 2012, 2043, 1951, 1982, 2013,
-  2044, 1983, 2014, 2045, 2015, 2046, 2047,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, default_scan_64x32[2048]) = {
-  0,    1,    64,   2,    65,   128,  3,    66,   129,  192,  4,    67,   130,
-  193,  256,  5,    68,   131,  194,  257,  320,  6,    69,   132,  195,  258,
-  321,  384,  7,    70,   133,  196,  259,  322,  385,  448,  8,    71,   134,
-  197,  260,  323,  386,  449,  512,  9,    72,   135,  198,  261,  324,  387,
-  450,  513,  576,  10,   73,   136,  199,  262,  325,  388,  451,  514,  577,
-  640,  11,   74,   137,  200,  263,  326,  389,  452,  515,  578,  641,  704,
-  12,   75,   138,  201,  264,  327,  390,  453,  516,  579,  642,  705,  768,
-  13,   76,   139,  202,  265,  328,  391,  454,  517,  580,  643,  706,  769,
-  832,  14,   77,   140,  203,  266,  329,  392,  455,  518,  581,  644,  707,
-  770,  833,  896,  15,   78,   141,  204,  267,  330,  393,  456,  519,  582,
-  645,  708,  771,  834,  897,  960,  16,   79,   142,  205,  268,  331,  394,
-  457,  520,  583,  646,  709,  772,  835,  898,  961,  1024, 17,   80,   143,
-  206,  269,  332,  395,  458,  521,  584,  647,  710,  773,  836,  899,  962,
-  1025, 1088, 18,   81,   144,  207,  270,  333,  396,  459,  522,  585,  648,
-  711,  774,  837,  900,  963,  1026, 1089, 1152, 19,   82,   145,  208,  271,
-  334,  397,  460,  523,  586,  649,  712,  775,  838,  901,  964,  1027, 1090,
-  1153, 1216, 20,   83,   146,  209,  272,  335,  398,  461,  524,  587,  650,
-  713,  776,  839,  902,  965,  1028, 1091, 1154, 1217, 1280, 21,   84,   147,
-  210,  273,  336,  399,  462,  525,  588,  651,  714,  777,  840,  903,  966,
-  1029, 1092, 1155, 1218, 1281, 1344, 22,   85,   148,  211,  274,  337,  400,
-  463,  526,  589,  652,  715,  778,  841,  904,  967,  1030, 1093, 1156, 1219,
-  1282, 1345, 1408, 23,   86,   149,  212,  275,  338,  401,  464,  527,  590,
-  653,  716,  779,  842,  905,  968,  1031, 1094, 1157, 1220, 1283, 1346, 1409,
-  1472, 24,   87,   150,  213,  276,  339,  402,  465,  528,  591,  654,  717,
-  780,  843,  906,  969,  1032, 1095, 1158, 1221, 1284, 1347, 1410, 1473, 1536,
-  25,   88,   151,  214,  277,  340,  403,  466,  529,  592,  655,  718,  781,
-  844,  907,  970,  1033, 1096, 1159, 1222, 1285, 1348, 1411, 1474, 1537, 1600,
-  26,   89,   152,  215,  278,  341,  404,  467,  530,  593,  656,  719,  782,
-  845,  908,  971,  1034, 1097, 1160, 1223, 1286, 1349, 1412, 1475, 1538, 1601,
-  1664, 27,   90,   153,  216,  279,  342,  405,  468,  531,  594,  657,  720,
-  783,  846,  909,  972,  1035, 1098, 1161, 1224, 1287, 1350, 1413, 1476, 1539,
-  1602, 1665, 1728, 28,   91,   154,  217,  280,  343,  406,  469,  532,  595,
-  658,  721,  784,  847,  910,  973,  1036, 1099, 1162, 1225, 1288, 1351, 1414,
-  1477, 1540, 1603, 1666, 1729, 1792, 29,   92,   155,  218,  281,  344,  407,
-  470,  533,  596,  659,  722,  785,  848,  911,  974,  1037, 1100, 1163, 1226,
-  1289, 1352, 1415, 1478, 1541, 1604, 1667, 1730, 1793, 1856, 30,   93,   156,
-  219,  282,  345,  408,  471,  534,  597,  660,  723,  786,  849,  912,  975,
-  1038, 1101, 1164, 1227, 1290, 1353, 1416, 1479, 1542, 1605, 1668, 1731, 1794,
-  1857, 1920, 31,   94,   157,  220,  283,  346,  409,  472,  535,  598,  661,
-  724,  787,  850,  913,  976,  1039, 1102, 1165, 1228, 1291, 1354, 1417, 1480,
-  1543, 1606, 1669, 1732, 1795, 1858, 1921, 1984, 32,   95,   158,  221,  284,
-  347,  410,  473,  536,  599,  662,  725,  788,  851,  914,  977,  1040, 1103,
-  1166, 1229, 1292, 1355, 1418, 1481, 1544, 1607, 1670, 1733, 1796, 1859, 1922,
-  1985, 33,   96,   159,  222,  285,  348,  411,  474,  537,  600,  663,  726,
-  789,  852,  915,  978,  1041, 1104, 1167, 1230, 1293, 1356, 1419, 1482, 1545,
-  1608, 1671, 1734, 1797, 1860, 1923, 1986, 34,   97,   160,  223,  286,  349,
-  412,  475,  538,  601,  664,  727,  790,  853,  916,  979,  1042, 1105, 1168,
-  1231, 1294, 1357, 1420, 1483, 1546, 1609, 1672, 1735, 1798, 1861, 1924, 1987,
-  35,   98,   161,  224,  287,  350,  413,  476,  539,  602,  665,  728,  791,
-  854,  917,  980,  1043, 1106, 1169, 1232, 1295, 1358, 1421, 1484, 1547, 1610,
-  1673, 1736, 1799, 1862, 1925, 1988, 36,   99,   162,  225,  288,  351,  414,
-  477,  540,  603,  666,  729,  792,  855,  918,  981,  1044, 1107, 1170, 1233,
-  1296, 1359, 1422, 1485, 1548, 1611, 1674, 1737, 1800, 1863, 1926, 1989, 37,
-  100,  163,  226,  289,  352,  415,  478,  541,  604,  667,  730,  793,  856,
-  919,  982,  1045, 1108, 1171, 1234, 1297, 1360, 1423, 1486, 1549, 1612, 1675,
-  1738, 1801, 1864, 1927, 1990, 38,   101,  164,  227,  290,  353,  416,  479,
-  542,  605,  668,  731,  794,  857,  920,  983,  1046, 1109, 1172, 1235, 1298,
-  1361, 1424, 1487, 1550, 1613, 1676, 1739, 1802, 1865, 1928, 1991, 39,   102,
-  165,  228,  291,  354,  417,  480,  543,  606,  669,  732,  795,  858,  921,
-  984,  1047, 1110, 1173, 1236, 1299, 1362, 1425, 1488, 1551, 1614, 1677, 1740,
-  1803, 1866, 1929, 1992, 40,   103,  166,  229,  292,  355,  418,  481,  544,
-  607,  670,  733,  796,  859,  922,  985,  1048, 1111, 1174, 1237, 1300, 1363,
-  1426, 1489, 1552, 1615, 1678, 1741, 1804, 1867, 1930, 1993, 41,   104,  167,
-  230,  293,  356,  419,  482,  545,  608,  671,  734,  797,  860,  923,  986,
-  1049, 1112, 1175, 1238, 1301, 1364, 1427, 1490, 1553, 1616, 1679, 1742, 1805,
-  1868, 1931, 1994, 42,   105,  168,  231,  294,  357,  420,  483,  546,  609,
-  672,  735,  798,  861,  924,  987,  1050, 1113, 1176, 1239, 1302, 1365, 1428,
-  1491, 1554, 1617, 1680, 1743, 1806, 1869, 1932, 1995, 43,   106,  169,  232,
-  295,  358,  421,  484,  547,  610,  673,  736,  799,  862,  925,  988,  1051,
-  1114, 1177, 1240, 1303, 1366, 1429, 1492, 1555, 1618, 1681, 1744, 1807, 1870,
-  1933, 1996, 44,   107,  170,  233,  296,  359,  422,  485,  548,  611,  674,
-  737,  800,  863,  926,  989,  1052, 1115, 1178, 1241, 1304, 1367, 1430, 1493,
-  1556, 1619, 1682, 1745, 1808, 1871, 1934, 1997, 45,   108,  171,  234,  297,
-  360,  423,  486,  549,  612,  675,  738,  801,  864,  927,  990,  1053, 1116,
-  1179, 1242, 1305, 1368, 1431, 1494, 1557, 1620, 1683, 1746, 1809, 1872, 1935,
-  1998, 46,   109,  172,  235,  298,  361,  424,  487,  550,  613,  676,  739,
-  802,  865,  928,  991,  1054, 1117, 1180, 1243, 1306, 1369, 1432, 1495, 1558,
-  1621, 1684, 1747, 1810, 1873, 1936, 1999, 47,   110,  173,  236,  299,  362,
-  425,  488,  551,  614,  677,  740,  803,  866,  929,  992,  1055, 1118, 1181,
-  1244, 1307, 1370, 1433, 1496, 1559, 1622, 1685, 1748, 1811, 1874, 1937, 2000,
-  48,   111,  174,  237,  300,  363,  426,  489,  552,  615,  678,  741,  804,
-  867,  930,  993,  1056, 1119, 1182, 1245, 1308, 1371, 1434, 1497, 1560, 1623,
-  1686, 1749, 1812, 1875, 1938, 2001, 49,   112,  175,  238,  301,  364,  427,
-  490,  553,  616,  679,  742,  805,  868,  931,  994,  1057, 1120, 1183, 1246,
-  1309, 1372, 1435, 1498, 1561, 1624, 1687, 1750, 1813, 1876, 1939, 2002, 50,
-  113,  176,  239,  302,  365,  428,  491,  554,  617,  680,  743,  806,  869,
-  932,  995,  1058, 1121, 1184, 1247, 1310, 1373, 1436, 1499, 1562, 1625, 1688,
-  1751, 1814, 1877, 1940, 2003, 51,   114,  177,  240,  303,  366,  429,  492,
-  555,  618,  681,  744,  807,  870,  933,  996,  1059, 1122, 1185, 1248, 1311,
-  1374, 1437, 1500, 1563, 1626, 1689, 1752, 1815, 1878, 1941, 2004, 52,   115,
-  178,  241,  304,  367,  430,  493,  556,  619,  682,  745,  808,  871,  934,
-  997,  1060, 1123, 1186, 1249, 1312, 1375, 1438, 1501, 1564, 1627, 1690, 1753,
-  1816, 1879, 1942, 2005, 53,   116,  179,  242,  305,  368,  431,  494,  557,
-  620,  683,  746,  809,  872,  935,  998,  1061, 1124, 1187, 1250, 1313, 1376,
-  1439, 1502, 1565, 1628, 1691, 1754, 1817, 1880, 1943, 2006, 54,   117,  180,
-  243,  306,  369,  432,  495,  558,  621,  684,  747,  810,  873,  936,  999,
-  1062, 1125, 1188, 1251, 1314, 1377, 1440, 1503, 1566, 1629, 1692, 1755, 1818,
-  1881, 1944, 2007, 55,   118,  181,  244,  307,  370,  433,  496,  559,  622,
-  685,  748,  811,  874,  937,  1000, 1063, 1126, 1189, 1252, 1315, 1378, 1441,
-  1504, 1567, 1630, 1693, 1756, 1819, 1882, 1945, 2008, 56,   119,  182,  245,
-  308,  371,  434,  497,  560,  623,  686,  749,  812,  875,  938,  1001, 1064,
-  1127, 1190, 1253, 1316, 1379, 1442, 1505, 1568, 1631, 1694, 1757, 1820, 1883,
-  1946, 2009, 57,   120,  183,  246,  309,  372,  435,  498,  561,  624,  687,
-  750,  813,  876,  939,  1002, 1065, 1128, 1191, 1254, 1317, 1380, 1443, 1506,
-  1569, 1632, 1695, 1758, 1821, 1884, 1947, 2010, 58,   121,  184,  247,  310,
-  373,  436,  499,  562,  625,  688,  751,  814,  877,  940,  1003, 1066, 1129,
-  1192, 1255, 1318, 1381, 1444, 1507, 1570, 1633, 1696, 1759, 1822, 1885, 1948,
-  2011, 59,   122,  185,  248,  311,  374,  437,  500,  563,  626,  689,  752,
-  815,  878,  941,  1004, 1067, 1130, 1193, 1256, 1319, 1382, 1445, 1508, 1571,
-  1634, 1697, 1760, 1823, 1886, 1949, 2012, 60,   123,  186,  249,  312,  375,
-  438,  501,  564,  627,  690,  753,  816,  879,  942,  1005, 1068, 1131, 1194,
-  1257, 1320, 1383, 1446, 1509, 1572, 1635, 1698, 1761, 1824, 1887, 1950, 2013,
-  61,   124,  187,  250,  313,  376,  439,  502,  565,  628,  691,  754,  817,
-  880,  943,  1006, 1069, 1132, 1195, 1258, 1321, 1384, 1447, 1510, 1573, 1636,
-  1699, 1762, 1825, 1888, 1951, 2014, 62,   125,  188,  251,  314,  377,  440,
-  503,  566,  629,  692,  755,  818,  881,  944,  1007, 1070, 1133, 1196, 1259,
-  1322, 1385, 1448, 1511, 1574, 1637, 1700, 1763, 1826, 1889, 1952, 2015, 63,
-  126,  189,  252,  315,  378,  441,  504,  567,  630,  693,  756,  819,  882,
-  945,  1008, 1071, 1134, 1197, 1260, 1323, 1386, 1449, 1512, 1575, 1638, 1701,
-  1764, 1827, 1890, 1953, 2016, 127,  190,  253,  316,  379,  442,  505,  568,
-  631,  694,  757,  820,  883,  946,  1009, 1072, 1135, 1198, 1261, 1324, 1387,
-  1450, 1513, 1576, 1639, 1702, 1765, 1828, 1891, 1954, 2017, 191,  254,  317,
-  380,  443,  506,  569,  632,  695,  758,  821,  884,  947,  1010, 1073, 1136,
-  1199, 1262, 1325, 1388, 1451, 1514, 1577, 1640, 1703, 1766, 1829, 1892, 1955,
-  2018, 255,  318,  381,  444,  507,  570,  633,  696,  759,  822,  885,  948,
-  1011, 1074, 1137, 1200, 1263, 1326, 1389, 1452, 1515, 1578, 1641, 1704, 1767,
-  1830, 1893, 1956, 2019, 319,  382,  445,  508,  571,  634,  697,  760,  823,
-  886,  949,  1012, 1075, 1138, 1201, 1264, 1327, 1390, 1453, 1516, 1579, 1642,
-  1705, 1768, 1831, 1894, 1957, 2020, 383,  446,  509,  572,  635,  698,  761,
-  824,  887,  950,  1013, 1076, 1139, 1202, 1265, 1328, 1391, 1454, 1517, 1580,
-  1643, 1706, 1769, 1832, 1895, 1958, 2021, 447,  510,  573,  636,  699,  762,
-  825,  888,  951,  1014, 1077, 1140, 1203, 1266, 1329, 1392, 1455, 1518, 1581,
-  1644, 1707, 1770, 1833, 1896, 1959, 2022, 511,  574,  637,  700,  763,  826,
-  889,  952,  1015, 1078, 1141, 1204, 1267, 1330, 1393, 1456, 1519, 1582, 1645,
-  1708, 1771, 1834, 1897, 1960, 2023, 575,  638,  701,  764,  827,  890,  953,
-  1016, 1079, 1142, 1205, 1268, 1331, 1394, 1457, 1520, 1583, 1646, 1709, 1772,
-  1835, 1898, 1961, 2024, 639,  702,  765,  828,  891,  954,  1017, 1080, 1143,
-  1206, 1269, 1332, 1395, 1458, 1521, 1584, 1647, 1710, 1773, 1836, 1899, 1962,
-  2025, 703,  766,  829,  892,  955,  1018, 1081, 1144, 1207, 1270, 1333, 1396,
-  1459, 1522, 1585, 1648, 1711, 1774, 1837, 1900, 1963, 2026, 767,  830,  893,
-  956,  1019, 1082, 1145, 1208, 1271, 1334, 1397, 1460, 1523, 1586, 1649, 1712,
-  1775, 1838, 1901, 1964, 2027, 831,  894,  957,  1020, 1083, 1146, 1209, 1272,
-  1335, 1398, 1461, 1524, 1587, 1650, 1713, 1776, 1839, 1902, 1965, 2028, 895,
-  958,  1021, 1084, 1147, 1210, 1273, 1336, 1399, 1462, 1525, 1588, 1651, 1714,
-  1777, 1840, 1903, 1966, 2029, 959,  1022, 1085, 1148, 1211, 1274, 1337, 1400,
-  1463, 1526, 1589, 1652, 1715, 1778, 1841, 1904, 1967, 2030, 1023, 1086, 1149,
-  1212, 1275, 1338, 1401, 1464, 1527, 1590, 1653, 1716, 1779, 1842, 1905, 1968,
-  2031, 1087, 1150, 1213, 1276, 1339, 1402, 1465, 1528, 1591, 1654, 1717, 1780,
-  1843, 1906, 1969, 2032, 1151, 1214, 1277, 1340, 1403, 1466, 1529, 1592, 1655,
-  1718, 1781, 1844, 1907, 1970, 2033, 1215, 1278, 1341, 1404, 1467, 1530, 1593,
-  1656, 1719, 1782, 1845, 1908, 1971, 2034, 1279, 1342, 1405, 1468, 1531, 1594,
-  1657, 1720, 1783, 1846, 1909, 1972, 2035, 1343, 1406, 1469, 1532, 1595, 1658,
-  1721, 1784, 1847, 1910, 1973, 2036, 1407, 1470, 1533, 1596, 1659, 1722, 1785,
-  1848, 1911, 1974, 2037, 1471, 1534, 1597, 1660, 1723, 1786, 1849, 1912, 1975,
-  2038, 1535, 1598, 1661, 1724, 1787, 1850, 1913, 1976, 2039, 1599, 1662, 1725,
-  1788, 1851, 1914, 1977, 2040, 1663, 1726, 1789, 1852, 1915, 1978, 2041, 1727,
-  1790, 1853, 1916, 1979, 2042, 1791, 1854, 1917, 1980, 2043, 1855, 1918, 1981,
-  2044, 1919, 1982, 2045, 1983, 2046, 2047,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, default_scan_64x64[4096]) = {
-  0,    1,    64,   65,   2,    128,  66,   129,  130,  3,    192,  67,   193,
-  131,  194,  4,    256,  68,   257,  195,  132,  258,  5,    196,  259,  320,
-  69,   321,  133,  322,  260,  197,  323,  6,    384,  70,   385,  134,  386,
-  261,  324,  198,  387,  7,    448,  71,   325,  449,  262,  388,  135,  450,
-  199,  451,  326,  389,  8,    512,  72,   263,  452,  513,  136,  514,  390,
-  200,  515,  327,  453,  264,  516,  9,    576,  73,   577,  137,  391,  454,
-  578,  328,  517,  201,  579,  265,  580,  455,  10,   392,  518,  640,  74,
-  641,  138,  642,  329,  581,  202,  643,  456,  519,  266,  644,  393,  582,
-  11,   704,  75,   705,  139,  330,  645,  706,  520,  203,  457,  583,  707,
-  394,  646,  267,  708,  12,   768,  76,   521,  584,  769,  331,  709,  140,
-  770,  458,  647,  204,  771,  395,  710,  268,  772,  585,  522,  648,  13,
-  332,  773,  832,  77,   459,  711,  833,  141,  834,  205,  835,  396,  774,
-  586,  649,  269,  523,  712,  836,  460,  775,  333,  837,  14,   896,  78,
-  897,  142,  650,  898,  587,  713,  206,  397,  838,  899,  524,  776,  270,
-  900,  461,  839,  334,  651,  714,  901,  15,   588,  777,  960,  79,   961,
-  143,  962,  398,  902,  525,  840,  207,  963,  271,  964,  715,  652,  778,
-  462,  903,  335,  589,  841,  965,  16,   1024, 80,   1025, 144,  526,  904,
-  1026, 399,  966,  208,  716,  779,  1027, 653,  842,  272,  1028, 463,  967,
-  590,  905,  336,  1029, 780,  17,   527,  968,  1088, 81,   717,  843,  1089,
-  400,  1030, 145,  1090, 654,  906,  209,  1091, 273,  464,  1031, 1092, 591,
-  969,  781,  844,  337,  1093, 718,  907,  528,  1032, 18,   1152, 82,   401,
-  655,  970,  1094, 1153, 146,  1154, 210,  1155, 592,  1033, 465,  845,  1095,
-  274,  782,  908,  1156, 719,  971,  338,  1157, 529,  1096, 656,  1034, 402,
-  1158, 19,   1216, 83,   1217, 147,  846,  909,  1218, 783,  972,  211,  593,
-  1097, 1219, 466,  1159, 275,  720,  1035, 1220, 339,  1221, 530,  1160, 657,
-  1098, 910,  847,  973,  403,  1222, 20,   784,  1036, 1280, 84,   1281, 148,
-  1282, 594,  1161, 212,  1283, 467,  721,  1099, 1223, 276,  1284, 911,  974,
-  658,  1162, 340,  531,  848,  1037, 1224, 1285, 785,  1100, 404,  1286, 21,
-  1344, 85,   595,  1225, 1345, 149,  722,  1163, 1346, 468,  1287, 213,  975,
-  1347, 912,  1038, 277,  1348, 849,  1101, 659,  1226, 532,  1288, 341,  1349,
-  786,  1164, 405,  1350, 596,  976,  1039, 1289, 723,  1227, 22,   1408, 86,
-  913,  1102, 1409, 150,  1410, 469,  1351, 214,  850,  1165, 1411, 278,  660,
-  1290, 1412, 533,  787,  1228, 1352, 342,  1413, 1040, 977,  1103, 406,  914,
-  1166, 1414, 724,  1291, 597,  1353, 23,   1472, 87,   851,  1229, 1473, 151,
-  470,  1415, 1474, 215,  1475, 661,  1354, 788,  1292, 279,  1041, 1104, 1476,
-  534,  1416, 978,  1167, 343,  1477, 915,  1230, 725,  1355, 407,  598,  1417,
-  1478, 852,  1293, 24,   1536, 88,   1537, 471,  1105, 1479, 152,  1042, 1168,
-  1538, 662,  1418, 216,  789,  1356, 1539, 979,  1231, 280,  1540, 535,  1480,
-  916,  1294, 344,  1541, 726,  1419, 599,  853,  1357, 1481, 408,  1542, 1106,
-  1169, 1043, 1232, 25,   472,  980,  1295, 1543, 1600, 89,   1601, 790,  1420,
-  153,  663,  1482, 1602, 217,  1603, 917,  1358, 536,  1544, 281,  1604, 1170,
-  345,  727,  1107, 1233, 1483, 1605, 854,  1421, 1044, 1296, 600,  1545, 409,
-  1606, 981,  1359, 791,  1484, 473,  1607, 26,   664,  1546, 1664, 90,   1665,
-  154,  918,  1422, 1666, 218,  1171, 1234, 1667, 537,  1108, 1297, 1608, 282,
-  1668, 728,  1045, 1360, 1547, 855,  1485, 346,  1669, 601,  1609, 982,  1423,
-  410,  1670, 792,  1548, 1235, 1172, 1298, 474,  665,  919,  1486, 1610, 1671,
-  27,   1728, 91,   1109, 1361, 1729, 155,  1730, 219,  1731, 538,  1046, 1424,
-  1672, 283,  856,  1549, 1732, 729,  1611, 347,  983,  1487, 1733, 602,  1673,
-  1236, 1299, 411,  1173, 1362, 1734, 793,  1612, 920,  1550, 1110, 1425, 666,
-  1674, 475,  1735, 28,   1792, 92,   1047, 1488, 1793, 156,  1794, 220,  539,
-  1736, 1795, 857,  1613, 730,  1675, 284,  1300, 1796, 984,  1551, 1237, 1363,
-  1174, 1426, 348,  1797, 603,  1737, 1111, 1489, 412,  794,  1676, 1798, 921,
-  1614, 667,  1738, 1048, 1552, 476,  1799, 29,   1301, 1364, 1856, 93,   1857,
-  157,  858,  1238, 1427, 1677, 1858, 540,  1800, 221,  731,  985,  1615, 1739,
-  1859, 1175, 1490, 285,  1860, 604,  1112, 1553, 1801, 349,  1861, 922,  1678,
-  795,  1740, 413,  1862, 1049, 1616, 1365, 668,  1302, 1428, 1802, 477,  1239,
-  1491, 1863, 859,  1741, 30,   1176, 1554, 1920, 94,   986,  1679, 1921, 158,
-  1922, 541,  732,  1803, 1864, 222,  1923, 1113, 1617, 286,  1924, 605,  1865,
-  350,  923,  1366, 1429, 1742, 1925, 796,  1804, 1303, 1492, 1050, 1680, 414,
-  1926, 1240, 1555, 669,  1866, 478,  1177, 1618, 1927, 860,  1805, 987,  1743,
-  31,   1984, 95,   733,  1867, 1985, 542,  1928, 159,  1114, 1681, 1986, 1430,
-  223,  1367, 1493, 1987, 1304, 1556, 287,  1988, 924,  1806, 606,  1929, 797,
-  1051, 1744, 1868, 351,  1241, 1619, 1989, 415,  1990, 670,  1178, 1682, 1930,
-  988,  1807, 479,  861,  1869, 1991, 1431, 1494, 1368, 1557, 1115, 1745, 734,
-  1931, 32,   2048, 96,   543,  1305, 1620, 1992, 2049, 160,  2050, 224,  2051,
-  925,  1242, 1683, 1870, 288,  1052, 1808, 2052, 607,  1993, 798,  1932, 352,
-  2053, 1179, 1746, 1495, 416,  1432, 1558, 2054, 671,  1994, 989,  1369, 1621,
-  1871, 862,  1933, 480,  1116, 1809, 2055, 1306, 1684, 735,  1995, 544,  2056,
-  33,   2112, 97,   1243, 1747, 2113, 161,  2114, 926,  1934, 1053, 1872, 225,
-  2115, 289,  608,  799,  1496, 1559, 1996, 2057, 2116, 1180, 1810, 1433, 1622,
-  353,  2117, 1370, 1685, 672,  2058, 417,  990,  1935, 2118, 1307, 1748, 863,
-  1117, 1873, 1997, 481,  2119, 736,  1244, 1811, 2059, 1560, 545,  2120, 1497,
-  1623, 34,   1054, 1936, 2176, 98,   927,  1998, 2177, 162,  1434, 1686, 2178,
-  226,  1181, 1874, 2179, 800,  2060, 609,  1371, 1749, 2121, 290,  2180, 354,
-  2181, 1308, 1812, 991,  1999, 673,  1118, 1937, 2122, 418,  2182, 864,  2061,
-  1561, 1624, 1245, 1875, 482,  1498, 1687, 2183, 737,  2123, 1435, 1750, 1055,
-  2000, 546,  928,  2062, 2184, 1182, 1938, 35,   1372, 1813, 2240, 99,   2241,
-  163,  2242, 801,  2124, 227,  2243, 610,  2185, 291,  1309, 1876, 2244, 992,
-  2063, 355,  1119, 1625, 2001, 2245, 1562, 1688, 674,  2186, 865,  1499, 1751,
-  2125, 419,  1246, 1939, 2246, 1436, 1814, 483,  2247, 738,  2187, 1056, 2064,
-  1373, 1877, 929,  1183, 2002, 2126, 547,  2248, 36,   2304, 100,  2305, 164,
-  802,  1310, 1940, 2188, 2306, 1626, 1689, 228,  1563, 1752, 2307, 611,  2249,
-  292,  2308, 1120, 1500, 1815, 2065, 993,  2127, 356,  2309, 1247, 2003, 675,
-  866,  1437, 1878, 2189, 2250, 420,  2310, 1374, 1941, 484,  1057, 2128, 2311,
-  739,  2251, 1184, 2066, 930,  1690, 2190, 1627, 1753, 548,  1564, 1816, 2312,
-  1311, 2004, 37,   803,  2252, 2368, 101,  1501, 1879, 2369, 165,  2370, 612,
-  2313, 229,  1121, 2129, 2371, 994,  2191, 1438, 1942, 293,  1248, 2067, 2372,
-  357,  867,  2253, 2373, 676,  2314, 1375, 2005, 421,  1691, 1754, 2374, 1628,
-  1817, 1058, 2192, 1185, 2130, 740,  1565, 1880, 2315, 485,  2375, 931,  2254,
-  1312, 2068, 1502, 1943, 549,  2376, 804,  2316, 38,   2432, 102,  1122, 1439,
-  2006, 2193, 2433, 166,  2434, 613,  995,  1249, 2131, 2255, 2377, 230,  2435,
-  1755, 294,  1692, 1818, 2436, 868,  1376, 2069, 2317, 1629, 1881, 358,  677,
-  2378, 2437, 1566, 1944, 422,  1186, 2194, 2438, 1059, 2256, 1313, 2132, 741,
-  1503, 2007, 2379, 932,  2318, 486,  2439, 550,  1440, 2070, 2440, 805,  1756,
-  1819, 2380, 1123, 2257, 1250, 1693, 1882, 2195, 39,   996,  2319, 2496, 103,
-  2497, 167,  614,  1630, 1945, 2441, 2498, 231,  1377, 2133, 2499, 295,  1567,
-  2008, 2500, 869,  2381, 678,  2442, 359,  2501, 1187, 2258, 1060, 2320, 1504,
-  2071, 1314, 2196, 423,  2502, 742,  933,  2382, 2443, 1820, 487,  1757, 1883,
-  2503, 1441, 2134, 1694, 1946, 551,  1124, 2321, 2504, 1251, 1631, 2009, 2259,
-  806,  2444, 997,  2383, 1378, 2197, 40,   1568, 2072, 2560, 104,  2561, 615,
-  2505, 168,  2562, 232,  2563, 870,  2445, 296,  2564, 1505, 2135, 1188, 2322,
-  679,  2506, 360,  1061, 1315, 1821, 1884, 2260, 2384, 2565, 1758, 1947, 424,
-  2566, 1695, 2010, 934,  1442, 2198, 2446, 743,  2507, 488,  1632, 2073, 2567,
-  1252, 2323, 1125, 2385, 552,  2568, 807,  1569, 2136, 2508, 1379, 2261, 998,
-  2447, 41,   616,  2569, 2624, 105,  1885, 2625, 1822, 1948, 169,  1506, 2199,
-  2626, 233,  871,  1759, 2011, 2509, 2627, 1189, 2386, 1316, 2324, 297,  2628,
-  680,  1062, 1696, 2074, 2448, 2570, 361,  2629, 1443, 2262, 1633, 2137, 425,
-  935,  2510, 2630, 744,  2571, 489,  1253, 2387, 2631, 1570, 2200, 1126, 2449,
-  1380, 2325, 1886, 1949, 808,  2572, 553,  1823, 2012, 2632, 999,  2511, 1760,
-  2075, 1507, 2263, 617,  2633, 42,   2688, 106,  1697, 2138, 2689, 170,  1190,
-  2450, 2690, 872,  1317, 2388, 2573, 234,  2691, 1063, 2512, 298,  1444, 2326,
-  2692, 681,  1634, 2201, 2634, 362,  2693, 936,  2574, 426,  1950, 2694, 1571,
-  2264, 745,  1887, 2013, 2635, 1254, 2451, 1824, 2076, 1127, 1381, 2389, 2513,
-  490,  2695, 1761, 2139, 809,  1000, 1508, 2327, 2575, 2636, 554,  2696, 1698,
-  2202, 1318, 2452, 618,  1191, 2514, 2697, 43,   2752, 107,  873,  1635, 2265,
-  2637, 2753, 171,  1445, 2390, 2754, 1064, 2576, 235,  2755, 1951, 2014, 682,
-  2698, 299,  1888, 2077, 2756, 1572, 2328, 1825, 2140, 363,  2757, 937,  2638,
-  1255, 2515, 427,  746,  1382, 1762, 2203, 2453, 2699, 2758, 1128, 2577, 491,
-  1509, 2391, 2759, 1699, 2266, 1001, 2639, 810,  2700, 555,  2760, 1319, 1636,
-  2329, 2516, 2015, 1192, 1952, 2078, 2578, 1446, 2454, 619,  1889, 2141, 2761,
-  874,  2701, 44,   2816, 108,  1065, 2640, 2817, 172,  1826, 2204, 2818, 236,
-  1573, 2392, 2819, 683,  2762, 300,  2820, 1763, 2267, 938,  2702, 364,  1256,
-  2579, 2821, 1383, 2517, 747,  1129, 2641, 2763, 428,  1700, 2330, 2822, 1510,
-  2455, 492,  2016, 2079, 2823, 1002, 1953, 2142, 2703, 811,  2764, 1637, 2393,
-  1890, 2205, 556,  1320, 2580, 2824, 1193, 1447, 2518, 2642, 1827, 2268, 620,
-  2825, 875,  2765, 1066, 1574, 2456, 2704, 45,   1764, 2331, 2880, 109,  2881,
-  173,  2882, 237,  2883, 684,  2826, 301,  1384, 2581, 2884, 1257, 2643, 939,
-  1701, 2394, 2766, 2080, 365,  1511, 2017, 2143, 2519, 2885, 1130, 2705, 1954,
-  2206, 748,  2827, 429,  2886, 1891, 2269, 1638, 2457, 493,  1003, 2767, 2887,
-  812,  1828, 2332, 2828, 1321, 2644, 1448, 2582, 1194, 2706, 557,  2888, 1575,
-  2520, 1765, 2395, 876,  1067, 2768, 2829, 621,  2889, 2081, 2144, 46,   2944,
-  110,  2018, 2207, 2945, 174,  1702, 2458, 2946, 1385, 2645, 238,  685,  1258,
-  1955, 2270, 2707, 2890, 2947, 1512, 2583, 302,  940,  2830, 2948, 1892, 2333,
-  1131, 2769, 366,  2949, 749,  1639, 2521, 2891, 430,  2950, 1829, 2396, 1004,
-  2831, 1322, 2708, 494,  1449, 2646, 2951, 813,  2892, 1195, 1766, 2459, 2770,
-  1576, 2584, 2145, 558,  2082, 2208, 2952, 2019, 2271, 1068, 2832, 877,  2893,
-  1956, 2334, 622,  1703, 2522, 2953, 1386, 2709, 47,   3008, 111,  1259, 1513,
-  1893, 2397, 2647, 2771, 3009, 175,  3010, 686,  2954, 239,  3011, 941,  2894,
-  303,  1132, 1640, 2585, 2833, 3012, 1830, 2460, 367,  3013, 750,  2955, 431,
-  2146, 2209, 3014, 1450, 2710, 1323, 2083, 2272, 2772, 1005, 1767, 2523, 2895,
-  1577, 2020, 2335, 2648, 495,  3015, 814,  1196, 2834, 2956, 1957, 2398, 559,
-  3016, 1704, 2586, 1069, 2896, 878,  1894, 2461, 2957, 623,  1387, 2773, 3017,
-  1514, 2711, 1260, 2835, 48,   3072, 112,  1831, 2524, 3073, 1641, 2649, 176,
-  3074, 687,  3018, 942,  2210, 2958, 240,  3075, 1133, 2147, 2273, 2897, 304,
-  2084, 2336, 3076, 368,  1768, 2587, 3077, 751,  2021, 2399, 3019, 1451, 2774,
-  1324, 2836, 432,  1578, 2712, 3078, 1006, 2959, 1958, 2462, 1197, 2898, 496,
-  815,  3020, 3079, 1705, 2650, 1895, 2525, 560,  3080, 1070, 2960, 1388, 2837,
-  879,  1515, 2775, 3021, 2211, 2274, 1832, 2588, 624,  2148, 2337, 3081, 1261,
-  2899, 1642, 2713, 2085, 2400, 49,   3136, 113,  3137, 688,  3082, 177,  943,
-  1134, 2022, 2463, 2961, 3022, 3138, 241,  1769, 2651, 3139, 305,  3140, 1452,
-  2838, 1959, 2526, 752,  1325, 1579, 2776, 2900, 3083, 369,  3141, 1007, 3023,
-  433,  3142, 1198, 1706, 2714, 2962, 1896, 2589, 816,  3084, 497,  2275, 3143,
-  2212, 2338, 2149, 2401, 561,  1071, 1516, 1833, 2652, 2839, 3024, 3144, 1389,
-  2901, 2086, 2464, 880,  3085, 1643, 2777, 1262, 2963, 625,  2023, 2527, 3145,
-  1770, 2715, 1135, 3025, 50,   944,  1960, 2590, 3086, 3200, 114,  689,  3146,
-  3201, 178,  3202, 242,  1453, 2902, 3203, 1580, 2840, 306,  1326, 2964, 3204,
-  2276, 2339, 753,  1897, 2653, 3147, 370,  1707, 2213, 2402, 2778, 3205, 1008,
-  3087, 1199, 2150, 2465, 3026, 434,  3206, 817,  2087, 2528, 3148, 1834, 2716,
-  498,  3207, 1517, 2903, 1390, 2965, 1072, 3088, 1644, 2024, 2591, 2841, 562,
-  3208, 881,  1263, 3027, 3149, 1771, 2779, 626,  1961, 2654, 3209, 2340, 1136,
-  3089, 2277, 2403, 945,  3150, 690,  1454, 2214, 2466, 2966, 3210, 51,   1581,
-  2904, 3264, 115,  3265, 179,  1898, 2717, 3266, 1327, 3028, 243,  2151, 2529,
-  3267, 1708, 2842, 307,  3268, 754,  3211, 2088, 2592, 371,  1009, 3151, 3269,
-  1200, 3090, 1835, 2780, 435,  3270, 2025, 2655, 818,  3212, 1518, 2967, 499,
-  1391, 1645, 2905, 3029, 3271, 1073, 3152, 1962, 2718, 563,  1264, 1772, 2341,
-  2404, 2843, 3091, 3272, 882,  2278, 2467, 3213, 2215, 2530, 627,  3273, 2152,
-  2593, 1137, 1899, 2781, 3153, 1582, 2968, 1455, 3030, 946,  3214, 691,  1709,
-  2906, 3274, 52,   1328, 3092, 3328, 116,  2089, 2656, 3329, 180,  3330, 244,
-  3331, 308,  1836, 2844, 3332, 755,  3275, 1010, 1201, 2026, 2719, 3154, 3215,
-  372,  3333, 1519, 2405, 3031, 436,  2342, 2468, 3334, 1646, 2969, 819,  1392,
-  3093, 3276, 2279, 2531, 1963, 2782, 500,  3335, 1773, 2907, 1074, 2216, 2594,
-  3216, 1265, 3155, 564,  3336, 883,  2153, 2657, 3277, 1900, 2845, 628,  1583,
-  3032, 3337, 1456, 2090, 2720, 3094, 1138, 3217, 1710, 2970, 947,  3278, 1329,
-  3156, 692,  3338, 53,   1837, 2908, 3392, 117,  2027, 2783, 3393, 181,  2406,
-  2469, 3394, 2343, 2532, 245,  3395, 1202, 3218, 309,  756,  2280, 2595, 3339,
-  3396, 1011, 3279, 1520, 3095, 373,  1647, 3033, 3397, 1964, 2846, 2217, 2658,
-  1393, 3157, 437,  1774, 2971, 3398, 820,  3340, 2154, 2721, 1075, 3280, 501,
-  3399, 1266, 3219, 1901, 2909, 565,  884,  2091, 2784, 3341, 3400, 1584, 3096,
-  1457, 1711, 3034, 3158, 2470, 629,  1139, 2407, 2533, 3281, 3401, 2344, 2596,
-  2028, 2847, 948,  1330, 1838, 2972, 3220, 3342, 2281, 2659, 693,  3402, 54,
-  3456, 118,  3457, 182,  2218, 2722, 3458, 246,  1203, 1965, 2910, 3282, 3459,
-  1012, 1648, 3097, 3343, 757,  1521, 3159, 3403, 310,  3460, 1775, 2155, 2785,
-  3035, 374,  1394, 3221, 3461, 438,  3462, 821,  3404, 1902, 2973, 1076, 2092,
-  2848, 3344, 1267, 3283, 502,  2471, 2534, 3463, 2408, 2597, 1585, 2345, 2660,
-  3160, 885,  3405, 566,  1712, 3098, 3464, 1458, 3222, 2029, 2911, 2282, 2723,
-  1140, 1839, 3036, 3345, 630,  3465, 1331, 3284, 949,  2219, 2786, 3406, 694,
-  1966, 2974, 3466, 55,   2156, 2849, 3520, 119,  1649, 3161, 3521, 1204, 3346,
-  183,  1522, 3223, 3522, 1776, 3099, 247,  1013, 3407, 3523, 758,  3467, 311,
-  3524, 1395, 2535, 3285, 2472, 2598, 2093, 2912, 375,  1903, 2409, 2661, 3037,
-  3525, 822,  2346, 2724, 3468, 439,  3526, 1077, 1268, 3347, 3408, 503,  2283,
-  2787, 3527, 1586, 3224, 1713, 2030, 2975, 3162, 886,  1459, 3286, 3469, 1840,
-  3100, 567,  3528, 2220, 2850, 1141, 3409, 1332, 3348, 631,  3529, 1967, 3038,
-  950,  3470, 2157, 2913, 2536, 2599, 695,  1650, 2473, 2662, 3225, 3530, 1523,
-  1777, 3163, 3287, 1205, 2410, 2725, 3410, 56,   3584, 120,  3585, 184,  2094,
-  2976, 3586, 1014, 3471, 248,  1396, 1904, 2347, 2788, 3101, 3349, 3587, 759,
-  3531, 312,  3588, 376,  2284, 2851, 3589, 823,  3532, 1269, 2031, 3039, 3411,
-  440,  1078, 3472, 3590, 1714, 3226, 1587, 3288, 2221, 2914, 504,  1841, 3164,
-  3591, 1460, 3350, 887,  3533, 568,  2600, 3592, 2537, 2663, 1968, 3102, 1142,
-  2158, 2977, 3473, 2474, 2726, 1333, 3412, 632,  3593, 2411, 2789, 951,  3534,
-  1651, 3289, 1778, 3227, 2348, 2852, 1524, 2095, 3040, 3351, 696,  3594, 1206,
-  3474, 1905, 3165, 57,   3648, 121,  1015, 1397, 2285, 2915, 3413, 3535, 3649,
-  185,  3650, 760,  3595, 249,  3651, 313,  2032, 3103, 3652, 2222, 2978, 377,
-  3653, 1270, 1715, 3290, 3475, 824,  1588, 3352, 3596, 1079, 2601, 2664, 3536,
-  1842, 3228, 441,  2538, 2727, 3654, 1461, 2475, 2790, 3414, 505,  2159, 3041,
-  3655, 1969, 3166, 888,  2412, 2853, 3597, 569,  3656, 1143, 3537, 1334, 3476,
-  2349, 2916, 2096, 3104, 1652, 3353, 633,  1779, 3291, 3657, 952,  3598, 1525,
-  3415, 1906, 2286, 2979, 3229, 697,  1207, 3538, 3658, 1398, 3477, 1016, 3599,
-  2033, 2665, 3167, 58,   2602, 2728, 3712, 122,  2223, 3042, 3713, 186,  3714,
-  761,  2539, 2791, 3659, 250,  3715, 314,  1716, 2476, 2854, 3354, 3716, 1589,
-  1843, 3292, 3416, 1271, 3539, 378,  3717, 1080, 3600, 825,  2160, 3105, 3660,
-  2413, 2917, 442,  1462, 1970, 3230, 3478, 3718, 2350, 2980, 506,  3719, 889,
-  3661, 1144, 1335, 2097, 3168, 3540, 3601, 570,  3720, 1780, 3355, 1653, 2287,
-  3043, 3417, 1907, 3293, 634,  953,  1526, 2666, 2729, 3479, 3662, 3721, 2603,
-  2792, 2540, 2855, 1208, 2224, 3106, 3602, 2034, 3231, 698,  3722, 1399, 3541,
-  2477, 2918, 1017, 3663, 59,   3776, 123,  3777, 187,  762,  1717, 2414, 2981,
-  3418, 3723, 3778, 1844, 3356, 251,  2161, 3169, 3779, 1590, 3480, 315,  1272,
-  3603, 3780, 1971, 3294, 1081, 2351, 3044, 3664, 379,  3781, 826,  3724, 1463,
-  3542, 443,  3782, 2098, 3232, 2730, 2288, 3107, 507,  2667, 2793, 3783, 890,
-  3725, 1336, 2604, 2856, 3604, 1145, 1781, 3419, 3665, 1654, 3481, 571,  1908,
-  3357, 3784, 2541, 2919, 1527, 3543, 2225, 3170, 954,  2478, 2982, 3726, 635,
-  2035, 3295, 3785, 1209, 3666, 1400, 3605, 2415, 3045, 699,  3786, 1018, 2162,
-  3233, 3727, 1718, 3482, 1845, 3420, 60,   2352, 3108, 3840, 124,  1591, 3544,
-  3841, 763,  3787, 188,  1972, 3358, 3842, 252,  3843, 1273, 3667, 2731, 2794,
-  316,  3844, 2668, 2857, 1082, 1464, 3606, 3728, 380,  827,  2099, 2605, 2920,
-  3296, 3788, 3845, 2289, 3171, 444,  3846, 2542, 2983, 1782, 3483, 508,  1337,
-  3668, 3847, 891,  1655, 1909, 3421, 3545, 3789, 1146, 2479, 3046, 3729, 2226,
-  3234, 572,  3848, 1528, 2036, 3359, 3607, 2416, 3109, 955,  3790, 636,  3849,
-  1210, 3730, 1401, 2163, 3297, 3669, 2353, 3172, 2795, 700,  1846, 2732, 2858,
-  3484, 3850, 1719, 3546, 1019, 2669, 2921, 3791, 1973, 3422, 1592, 3608, 2606,
-  2984, 61,   764,  3851, 3904, 125,  3905, 189,  1274, 2290, 3235, 3731, 3906,
-  2100, 3360, 253,  2543, 3047, 3907, 1465, 3670, 317,  1083, 3792, 3908, 828,
-  3852, 381,  3909, 2480, 3110, 1783, 3547, 445,  1910, 2227, 3298, 3485, 3910,
-  1656, 3609, 1338, 3732, 892,  3853, 509,  1147, 2037, 2417, 3173, 3423, 3793,
-  3911, 1529, 3671, 573,  2796, 2859, 3912, 2733, 2922, 2164, 3361, 956,  2354,
-  3236, 3854, 2670, 2985, 637,  3913, 1211, 1402, 3733, 3794, 1847, 2607, 3048,
-  3548, 1720, 3610, 1974, 3486, 701,  3914, 1020, 1593, 2544, 3111, 3672, 3855,
-  2291, 3299, 2101, 3424, 765,  1275, 3795, 3915, 62,   3968, 126,  2481, 3174,
-  3969, 190,  1466, 3734, 3970, 254,  3971, 1084, 3856, 318,  2228, 3362, 3972,
-  829,  1784, 3611, 3916, 1911, 3549, 382,  2418, 3237, 3973, 2860, 1657, 2797,
-  2923, 3673, 2038, 3487, 446,  2734, 2986, 3974, 1339, 3796, 1148, 3857, 893,
-  2671, 3049, 3917, 510,  1530, 3735, 3975, 2355, 3300, 2165, 3425, 2608, 3112,
-  574,  3976, 957,  3918, 1848, 3612, 1403, 2545, 3175, 3797, 1212, 3858, 638,
-  1721, 1975, 3550, 3674, 3977, 2292, 3363, 1594, 2102, 3488, 3736, 702,  2482,
-  3238, 3978, 1021, 3919, 1276, 2861, 2924, 3859, 766,  1467, 2229, 2798, 2987,
-  3426, 3798, 3979, 63,   4032, 127,  2419, 3301, 4033, 191,  2735, 3050, 4034,
-  1085, 1912, 3613, 3920, 255,  1785, 3675, 4035, 319,  2672, 3113, 4036, 2039,
-  3551, 830,  3980, 1658, 3737, 383,  4037, 1340, 2356, 3364, 3860, 2609, 3176,
-  447,  2166, 3489, 4038, 1149, 1531, 3799, 3921, 894,  3981, 511,  4039, 2546,
-  3239, 575,  1849, 3676, 4040, 2293, 3427, 1976, 3614, 958,  1722, 3738, 3982,
-  1404, 3861, 1213, 2483, 3302, 3922, 2103, 3552, 639,  2925, 4041, 2862, 2988,
-  1595, 3800, 2799, 3051, 2736, 3114, 703,  1022, 3983, 4042, 2230, 3490, 2420,
-  3365, 1277, 2673, 3177, 3923, 1468, 3862, 767,  1913, 3677, 4043, 1786, 3739,
-  2040, 3615, 1086, 2610, 3240, 3984, 2357, 3428, 1659, 3801, 831,  4044, 2167,
-  3553, 1341, 3924, 2547, 3303, 1532, 3863, 1150, 3985, 895,  4045, 2294, 2926,
-  2989, 3491, 2863, 3052, 1850, 2484, 3366, 3740, 1977, 3678, 2800, 3115, 1723,
-  3802, 2104, 3616, 1405, 3925, 959,  2737, 3178, 4046, 1214, 3986, 1596, 3864,
-  2421, 3429, 2231, 2674, 3241, 3554, 1023, 4047, 2611, 3304, 1278, 1469, 1914,
-  3741, 3926, 3987, 1787, 2041, 3679, 3803, 2358, 3492, 1087, 1660, 2168, 2548,
-  3367, 3617, 3865, 4048, 2990, 2927, 3053, 2864, 3116, 1342, 3988, 1533, 2295,
-  2801, 3179, 3555, 3927, 2485, 3430, 1151, 4049, 1978, 2738, 3242, 3742, 1851,
-  3804, 2105, 3680, 1724, 3866, 2675, 3305, 1406, 2422, 3493, 3989, 2232, 3618,
-  1215, 4050, 1597, 3928, 2612, 3368, 2359, 3556, 1915, 3805, 2042, 2991, 3054,
-  3743, 1470, 3990, 1788, 2928, 3117, 3867, 1279, 2549, 3431, 4051, 2865, 3180,
-  2169, 3681, 1661, 3929, 2802, 3243, 2486, 3494, 2296, 3619, 2739, 3306, 1343,
-  4052, 1534, 3991, 1979, 3806, 1852, 3868, 2676, 3369, 2106, 3744, 2423, 3557,
-  1725, 3930, 2233, 3682, 2613, 3432, 1407, 4053, 3055, 1598, 2992, 3118, 3992,
-  2929, 3181, 2360, 3620, 2866, 3244, 2550, 3495, 1916, 3869, 2043, 3807, 1789,
-  2803, 3307, 3931, 1471, 2170, 3745, 4054, 2740, 3370, 1662, 2487, 3558, 3993,
-  2297, 3683, 2677, 3433, 1535, 4055, 1980, 3870, 1853, 2107, 2424, 3621, 3808,
-  3932, 3056, 3119, 2614, 3496, 2993, 3182, 1726, 2234, 3746, 3994, 2930, 3245,
-  2867, 3308, 1599, 2361, 3684, 4056, 2551, 3559, 2804, 3371, 2044, 3871, 1917,
-  3933, 2171, 3809, 1790, 2741, 3434, 3995, 2488, 3622, 2298, 3747, 1663, 4057,
-  2678, 3497, 3120, 3057, 3183, 2994, 3246, 2425, 3685, 1981, 3934, 2108, 3872,
-  2615, 3560, 2931, 3309, 1854, 3996, 2235, 3810, 2868, 3372, 1727, 4058, 2552,
-  3623, 2805, 3435, 2362, 3748, 2742, 3498, 2045, 3935, 1918, 3997, 2172, 3873,
-  2489, 3686, 1791, 4059, 3121, 3184, 2299, 2679, 3561, 3811, 3058, 3247, 2995,
-  3310, 2932, 3373, 2426, 3749, 2616, 3624, 1982, 3998, 2109, 2869, 3436, 3936,
-  1855, 4060, 2236, 3874, 2806, 3499, 2553, 3687, 2363, 3812, 2743, 3562, 3185,
-  3122, 3248, 2046, 3999, 2490, 3750, 1919, 2173, 3059, 3311, 3937, 4061, 2680,
-  3625, 2996, 3374, 2300, 3875, 2933, 3437, 2617, 3688, 2427, 3813, 2870, 3500,
-  2110, 4000, 1983, 4062, 2807, 3563, 2237, 3938, 2554, 3751, 2364, 3876, 2744,
-  3626, 3186, 3249, 3123, 3312, 3060, 3375, 2491, 2997, 3438, 3814, 2047, 2681,
-  3689, 4063, 2174, 4001, 2934, 3501, 2301, 3939, 2871, 3564, 2618, 3752, 2428,
-  3877, 2808, 3627, 2111, 4064, 2238, 3250, 4002, 2555, 3187, 3313, 3815, 3124,
-  3376, 2745, 3690, 2365, 3940, 3061, 3439, 2998, 3502, 2492, 3878, 2682, 3753,
-  2935, 3565, 2175, 4065, 2302, 4003, 2872, 3628, 2619, 3816, 2429, 3941, 2809,
-  3691, 3251, 3314, 3188, 3377, 3125, 3440, 2556, 3879, 2239, 3062, 3503, 4066,
-  2746, 3754, 2366, 4004, 2999, 3566, 2936, 3629, 2683, 3817, 2493, 3942, 2873,
-  3692, 2303, 4067, 2620, 3880, 3315, 3252, 3378, 3189, 3441, 2430, 2810, 3755,
-  4005, 3126, 3504, 3063, 3567, 2557, 3943, 2747, 3818, 3000, 3630, 2367, 4068,
-  2937, 3693, 2684, 3881, 2494, 4006, 2874, 3756, 3316, 3379, 3253, 3442, 3190,
-  3505, 2621, 3944, 3127, 3568, 2811, 3819, 2431, 4069, 3064, 3631, 2748, 3882,
-  2558, 3001, 3694, 4007, 2938, 3757, 2685, 3945, 3380, 3317, 3443, 2495, 4070,
-  3254, 3506, 2875, 3820, 3191, 3569, 3128, 3632, 2622, 4008, 2812, 3883, 3065,
-  3695, 3002, 3758, 2749, 3946, 2559, 4071, 2939, 3821, 3381, 3444, 3318, 3507,
-  2686, 3255, 3570, 4009, 2876, 3884, 3192, 3633, 3129, 3696, 2623, 4072, 2813,
-  3947, 3066, 3759, 3003, 3822, 2750, 4010, 3445, 3382, 3508, 2940, 3885, 3319,
-  3571, 3256, 3634, 2687, 3193, 3697, 4073, 2877, 3948, 3130, 3760, 3067, 3823,
-  2814, 4011, 3004, 3886, 3446, 3509, 3383, 3572, 2751, 4074, 3320, 3635, 2941,
-  3949, 3257, 3698, 3194, 3761, 2878, 4012, 3131, 3824, 3068, 3887, 2815, 4075,
-  3510, 3447, 3573, 3005, 3950, 3384, 3636, 3321, 3699, 3258, 3762, 2942, 4013,
-  3195, 3825, 3132, 3888, 2879, 4076, 3069, 3951, 3511, 3574, 3448, 3637, 3385,
-  3700, 3006, 4014, 3322, 3763, 3259, 3826, 2943, 4077, 3196, 3889, 3133, 3952,
-  3575, 3512, 3638, 3070, 4015, 3449, 3701, 3386, 3764, 3323, 3827, 3007, 4078,
-  3260, 3890, 3197, 3953, 3134, 4016, 3576, 3639, 3513, 3702, 3450, 3765, 3071,
-  4079, 3387, 3828, 3324, 3891, 3261, 3954, 3198, 4017, 3640, 3135, 4080, 3577,
-  3703, 3514, 3766, 3451, 3829, 3388, 3892, 3325, 3955, 3262, 4018, 3199, 4081,
-  3641, 3704, 3578, 3767, 3515, 3830, 3452, 3893, 3389, 3956, 3326, 4019, 3263,
-  4082, 3705, 3642, 3768, 3579, 3831, 3516, 3894, 3453, 3957, 3390, 4020, 3327,
-  4083, 3706, 3769, 3643, 3832, 3580, 3895, 3517, 3958, 3454, 4021, 3391, 4084,
-  3770, 3707, 3833, 3644, 3896, 3581, 3959, 3518, 4022, 3455, 4085, 3771, 3834,
-  3708, 3897, 3645, 3960, 3582, 4023, 3519, 4086, 3835, 3772, 3898, 3709, 3961,
-  3646, 4024, 3583, 4087, 3836, 3899, 3773, 3962, 3710, 4025, 3647, 4088, 3900,
-  3837, 3963, 3774, 4026, 3711, 4089, 3901, 3964, 3838, 4027, 3775, 4090, 3965,
-  3902, 4028, 3839, 4091, 3966, 4029, 3903, 4092, 4030, 3967, 4093, 4031, 4094,
-  4095,
-};
-#endif  // CONFIG_TX64X64
-
-#if CONFIG_CHROMA_2X2
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_2x2_neighbors[5 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 1, 1, 2, 0, 0,
+  0,    1,    32,   64,   33,   2,   3,    34,   65,   96,   128,  97,  66,
+  35,   4,    5,    36,   67,   98,  129,  160,  192,  161,  130,  99,  68,
+  37,   6,    7,    38,   69,   100, 131,  162,  193,  224,  256,  225, 194,
+  163,  132,  101,  70,   39,   8,   9,    40,   71,   102,  133,  164, 195,
+  226,  257,  288,  320,  289,  258, 227,  196,  165,  134,  103,  72,  41,
+  10,   11,   42,   73,   104,  135, 166,  197,  228,  259,  290,  321, 352,
+  384,  353,  322,  291,  260,  229, 198,  167,  136,  105,  74,   43,  12,
+  13,   44,   75,   106,  137,  168, 199,  230,  261,  292,  323,  354, 385,
+  416,  448,  417,  386,  355,  324, 293,  262,  231,  200,  169,  138, 107,
+  76,   45,   14,   15,   46,   77,  108,  139,  170,  201,  232,  263, 294,
+  325,  356,  387,  418,  449,  480, 512,  481,  450,  419,  388,  357, 326,
+  295,  264,  233,  202,  171,  140, 109,  78,   47,   16,   17,   48,  79,
+  110,  141,  172,  203,  234,  265, 296,  327,  358,  389,  420,  451, 482,
+  513,  544,  576,  545,  514,  483, 452,  421,  390,  359,  328,  297, 266,
+  235,  204,  173,  142,  111,  80,  49,   18,   19,   50,   81,   112, 143,
+  174,  205,  236,  267,  298,  329, 360,  391,  422,  453,  484,  515, 546,
+  577,  608,  640,  609,  578,  547, 516,  485,  454,  423,  392,  361, 330,
+  299,  268,  237,  206,  175,  144, 113,  82,   51,   20,   21,   52,  83,
+  114,  145,  176,  207,  238,  269, 300,  331,  362,  393,  424,  455, 486,
+  517,  548,  579,  610,  641,  672, 704,  673,  642,  611,  580,  549, 518,
+  487,  456,  425,  394,  363,  332, 301,  270,  239,  208,  177,  146, 115,
+  84,   53,   22,   23,   54,   85,  116,  147,  178,  209,  240,  271, 302,
+  333,  364,  395,  426,  457,  488, 519,  550,  581,  612,  643,  674, 705,
+  736,  768,  737,  706,  675,  644, 613,  582,  551,  520,  489,  458, 427,
+  396,  365,  334,  303,  272,  241, 210,  179,  148,  117,  86,   55,  24,
+  25,   56,   87,   118,  149,  180, 211,  242,  273,  304,  335,  366, 397,
+  428,  459,  490,  521,  552,  583, 614,  645,  676,  707,  738,  769, 800,
+  832,  801,  770,  739,  708,  677, 646,  615,  584,  553,  522,  491, 460,
+  429,  398,  367,  336,  305,  274, 243,  212,  181,  150,  119,  88,  57,
+  26,   27,   58,   89,   120,  151, 182,  213,  244,  275,  306,  337, 368,
+  399,  430,  461,  492,  523,  554, 585,  616,  647,  678,  709,  740, 771,
+  802,  833,  864,  896,  865,  834, 803,  772,  741,  710,  679,  648, 617,
+  586,  555,  524,  493,  462,  431, 400,  369,  338,  307,  276,  245, 214,
+  183,  152,  121,  90,   59,   28,  29,   60,   91,   122,  153,  184, 215,
+  246,  277,  308,  339,  370,  401, 432,  463,  494,  525,  556,  587, 618,
+  649,  680,  711,  742,  773,  804, 835,  866,  897,  928,  960,  929, 898,
+  867,  836,  805,  774,  743,  712, 681,  650,  619,  588,  557,  526, 495,
+  464,  433,  402,  371,  340,  309, 278,  247,  216,  185,  154,  123, 92,
+  61,   30,   31,   62,   93,   124, 155,  186,  217,  248,  279,  310, 341,
+  372,  403,  434,  465,  496,  527, 558,  589,  620,  651,  682,  713, 744,
+  775,  806,  837,  868,  899,  930, 961,  992,  993,  962,  931,  900, 869,
+  838,  807,  776,  745,  714,  683, 652,  621,  590,  559,  528,  497, 466,
+  435,  404,  373,  342,  311,  280, 249,  218,  187,  156,  125,  94,  63,
+  95,   126,  157,  188,  219,  250, 281,  312,  343,  374,  405,  436, 467,
+  498,  529,  560,  591,  622,  653, 684,  715,  746,  777,  808,  839, 870,
+  901,  932,  963,  994,  995,  964, 933,  902,  871,  840,  809,  778, 747,
+  716,  685,  654,  623,  592,  561, 530,  499,  468,  437,  406,  375, 344,
+  313,  282,  251,  220,  189,  158, 127,  159,  190,  221,  252,  283, 314,
+  345,  376,  407,  438,  469,  500, 531,  562,  593,  624,  655,  686, 717,
+  748,  779,  810,  841,  872,  903, 934,  965,  996,  997,  966,  935, 904,
+  873,  842,  811,  780,  749,  718, 687,  656,  625,  594,  563,  532, 501,
+  470,  439,  408,  377,  346,  315, 284,  253,  222,  191,  223,  254, 285,
+  316,  347,  378,  409,  440,  471, 502,  533,  564,  595,  626,  657, 688,
+  719,  750,  781,  812,  843,  874, 905,  936,  967,  998,  999,  968, 937,
+  906,  875,  844,  813,  782,  751, 720,  689,  658,  627,  596,  565, 534,
+  503,  472,  441,  410,  379,  348, 317,  286,  255,  287,  318,  349, 380,
+  411,  442,  473,  504,  535,  566, 597,  628,  659,  690,  721,  752, 783,
+  814,  845,  876,  907,  938,  969, 1000, 1001, 970,  939,  908,  877, 846,
+  815,  784,  753,  722,  691,  660, 629,  598,  567,  536,  505,  474, 443,
+  412,  381,  350,  319,  351,  382, 413,  444,  475,  506,  537,  568, 599,
+  630,  661,  692,  723,  754,  785, 816,  847,  878,  909,  940,  971, 1002,
+  1003, 972,  941,  910,  879,  848, 817,  786,  755,  724,  693,  662, 631,
+  600,  569,  538,  507,  476,  445, 414,  383,  415,  446,  477,  508, 539,
+  570,  601,  632,  663,  694,  725, 756,  787,  818,  849,  880,  911, 942,
+  973,  1004, 1005, 974,  943,  912, 881,  850,  819,  788,  757,  726, 695,
+  664,  633,  602,  571,  540,  509, 478,  447,  479,  510,  541,  572, 603,
+  634,  665,  696,  727,  758,  789, 820,  851,  882,  913,  944,  975, 1006,
+  1007, 976,  945,  914,  883,  852, 821,  790,  759,  728,  697,  666, 635,
+  604,  573,  542,  511,  543,  574, 605,  636,  667,  698,  729,  760, 791,
+  822,  853,  884,  915,  946,  977, 1008, 1009, 978,  947,  916,  885, 854,
+  823,  792,  761,  730,  699,  668, 637,  606,  575,  607,  638,  669, 700,
+  731,  762,  793,  824,  855,  886, 917,  948,  979,  1010, 1011, 980, 949,
+  918,  887,  856,  825,  794,  763, 732,  701,  670,  639,  671,  702, 733,
+  764,  795,  826,  857,  888,  919, 950,  981,  1012, 1013, 982,  951, 920,
+  889,  858,  827,  796,  765,  734, 703,  735,  766,  797,  828,  859, 890,
+  921,  952,  983,  1014, 1015, 984, 953,  922,  891,  860,  829,  798, 767,
+  799,  830,  861,  892,  923,  954, 985,  1016, 1017, 986,  955,  924, 893,
+  862,  831,  863,  894,  925,  956, 987,  1018, 1019, 988,  957,  926, 895,
+  927,  958,  989,  1020, 1021, 990, 959,  991,  1022, 1023
 };
-#endif
 
 // Neighborhood 2-tuples for various scans and blocksizes,
 // in {top, left} order for each position in corresponding scan order.
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 4,  0, 1, 4, 4, 5,  5,  1, 8,  8,  5,  8, 2,
-  2, 2, 5, 9, 12, 6, 9, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
+  0, 0, 0, 0, 0,  0, 4, 4, 1, 4, 1,  1,  2,  2,  2,  5, 5,
+  8, 8, 8, 9, 12, 6, 9, 3, 6, 7, 10, 10, 13, 11, 14, 0, 0
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
   0, 0, 0, 0, 4, 4,  8,  8, 0, 0, 1, 4, 5,  8,  9,  12, 1,
@@ -1840,19 +858,6 @@ DECLARE_ALIGNED(16, static const int16_t,
   0, 0, 0, 0, 1, 1, 2,  2, 0, 0, 1,  4,  2,  5,  3,  6, 4,
   4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 0, 0,
 };
-#endif  // CONFIG_EXT_TX
-
-DECLARE_ALIGNED(16, static const int16_t,
-                col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0,  0, 0, 0, 4, 4, 4, 0, 8, 8,  1,  4, 5,  8,  5,  1, 9,
-  12, 2, 5, 6, 9, 6, 2, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 1, 1, 1, 1,  4, 2,  2,  2,  5,  4,  5, 5,
-  8, 3, 6, 8, 9, 6, 9, 9, 12, 7, 10, 10, 13, 11, 14, 0, 0,
-};
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
@@ -1924,7 +929,6 @@ DECLARE_ALIGNED(16, static const int16_t,
   14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0,  0
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mrow_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
   0,  0,  0,  0,  1,  1,  2,  2,  0,  0,  1,  4,  2,  5,  3,  6,  4,  4,  5,
@@ -1968,7 +972,6 @@ DECLARE_ALIGNED(16, static const int16_t,
   58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13,
   14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0,  0
 };
-#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
@@ -2048,7 +1051,6 @@ DECLARE_ALIGNED(16, static const int16_t,
   223, 254, 0,   0
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mrow_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
   0,   0,   0,   0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,
@@ -2202,20 +1204,7 @@ DECLARE_ALIGNED(16, static const int16_t,
   30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254,
   0,  0
 };
-#endif  // CONFIG_EXT_TX
-
-DECLARE_ALIGNED(16, static const int16_t,
-                col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  8,  8,  8,  0,  16, 16, 1,  8,  24, 24, 9,  16, 9,  1,  32,
-  32, 17, 24, 2,  9,  25, 32, 10, 17, 40, 40, 10, 2,  18, 25, 33, 40, 3,  10,
-  48, 48, 11, 18, 26, 33, 11, 3,  41, 48, 19, 26, 34, 41, 4,  11, 27, 34, 12,
-  19, 49, 56, 42, 49, 20, 27, 12, 4,  35, 42, 5,  12, 28, 35, 50, 57, 43, 50,
-  13, 20, 36, 43, 13, 5,  21, 28, 51, 58, 29, 36, 6,  13, 44, 51, 14, 21, 14,
-  6,  37, 44, 52, 59, 22, 29, 7,  14, 30, 37, 45, 52, 15, 22, 38, 45, 23, 30,
-  53, 60, 31, 38, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0,  0,
-};
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
   0,  0,  0,  0,  8,  8,  16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0,  0,  1,
@@ -2237,28 +1226,16 @@ DECLARE_ALIGNED(16, static const int16_t,
   46, 40, 40, 41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48,
   49, 56, 50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0,  0,
 };
-#endif  // CONFIG_EXT_TX
-
-DECLARE_ALIGNED(16, static const int16_t,
-                row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  1,  1,  0,  1,  1,  8,  2,  2,  8,  9,  2,  9,  3,  3,  9,
-  16, 3,  10, 16, 17, 4,  4,  10, 17, 17, 24, 4,  11, 11, 18, 18, 25, 24, 25,
-  5,  5,  5,  12, 12, 19, 25, 32, 19, 26, 6,  6,  26, 33, 32, 33, 13, 20, 20,
-  27, 33, 40, 6,  13, 27, 34, 40, 41, 34, 41, 21, 28, 28, 35, 41, 48, 14, 21,
-  35, 42, 7,  14, 48, 49, 29, 36, 42, 49, 36, 43, 22, 29, 49, 56, 15, 22, 43,
-  50, 50, 57, 37, 44, 30, 37, 44, 51, 23, 30, 51, 58, 45, 52, 38, 45, 52, 59,
-  31, 38, 53, 60, 39, 46, 46, 53, 47, 54, 54, 61, 55, 62, 0,  0,
-};
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0,  0,  0,  0,  8,  0,  8,  8,  1,  8,  9,  1,  9,  16, 16, 17, 2,  9,  10,
-  2,  10, 17, 17, 24, 24, 25, 3,  10, 11, 3,  18, 25, 25, 32, 11, 18, 32, 33,
-  4,  11, 26, 33, 19, 26, 12, 4,  33, 40, 12, 19, 40, 41, 5,  12, 27, 34, 34,
-  41, 20, 27, 13, 20, 13, 5,  41, 48, 48, 49, 28, 35, 35, 42, 21, 28, 6,  6,
-  6,  13, 42, 49, 49, 56, 36, 43, 14, 21, 29, 36, 7,  14, 43, 50, 50, 57, 22,
-  29, 37, 44, 15, 22, 44, 51, 51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45,
-  31, 38, 53, 60, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0,  0,
+  0,  0,  0,  0,  0,  0,  8,  8,  1,  8,  1,  1,  2,  2,  2,  9,  9,  16, 16,
+  16, 24, 24, 17, 24, 10, 17, 3,  10, 3,  3,  4,  4,  4,  11, 11, 18, 18, 25,
+  25, 32, 32, 32, 40, 40, 33, 40, 26, 33, 19, 26, 12, 19, 5,  12, 5,  5,  6,
+  6,  6,  13, 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 49, 56, 42, 49,
+  35, 42, 28, 35, 21, 28, 14, 21, 7,  14, 15, 22, 22, 29, 29, 36, 36, 43, 43,
+  50, 50, 57, 51, 58, 44, 51, 37, 44, 30, 37, 23, 30, 31, 38, 38, 45, 45, 52,
+  52, 59, 53, 60, 46, 53, 39, 46, 47, 54, 54, 61, 55, 62, 0,  0
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
@@ -2829,7 +1806,6 @@ DECLARE_ALIGNED(16, static const int16_t,
   478, 509, 479, 510, 0,   0
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
   0,   0,   0,   0,   16,  16,  32,  32,  48,  48,  64,  64,  80,  80,  96,
@@ -2907,126 +1883,46 @@ DECLARE_ALIGNED(16, static const int16_t,
   246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
   239, 254, 0,   0,
 };
-#endif  // CONFIG_EXT_TX
-
-DECLARE_ALIGNED(16, static const int16_t,
-                col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   16,  16,  32,  32,  16,  0,   48,  48,  1,   16,  64,
-  64,  17,  32,  80,  80,  33,  48,  17,  1,   49,  64,  96,  96,  2,   17,
-  65,  80,  18,  33,  112, 112, 34,  49,  81,  96,  18,  2,   50,  65,  128,
-  128, 3,   18,  97,  112, 19,  34,  66,  81,  144, 144, 82,  97,  35,  50,
-  113, 128, 19,  3,   51,  66,  160, 160, 4,   19,  98,  113, 129, 144, 67,
-  82,  20,  35,  83,  98,  114, 129, 36,  51,  176, 176, 20,  4,   145, 160,
-  52,  67,  99,  114, 5,   20,  130, 145, 68,  83,  192, 192, 161, 176, 21,
-  36,  115, 130, 84,  99,  37,  52,  146, 161, 208, 208, 53,  68,  21,  5,
-  100, 115, 177, 192, 131, 146, 69,  84,  6,   21,  224, 224, 116, 131, 22,
-  37,  162, 177, 85,  100, 147, 162, 38,  53,  193, 208, 101, 116, 54,  69,
-  22,  6,   132, 147, 178, 193, 70,  85,  163, 178, 209, 224, 7,   22,  117,
-  132, 23,  38,  148, 163, 23,  7,   86,  101, 194, 209, 225, 240, 39,  54,
-  179, 194, 102, 117, 133, 148, 55,  70,  164, 179, 8,   23,  71,  86,  210,
-  225, 118, 133, 149, 164, 195, 210, 24,  39,  87,  102, 40,  55,  56,  71,
-  134, 149, 180, 195, 226, 241, 103, 118, 24,  8,   165, 180, 211, 226, 72,
-  87,  150, 165, 9,   24,  119, 134, 25,  40,  88,  103, 196, 211, 41,  56,
-  135, 150, 181, 196, 104, 119, 57,  72,  227, 242, 166, 181, 120, 135, 151,
-  166, 197, 212, 73,  88,  25,  9,   212, 227, 89,  104, 136, 151, 182, 197,
-  10,  25,  26,  41,  105, 120, 167, 182, 228, 243, 152, 167, 42,  57,  121,
-  136, 213, 228, 58,  73,  198, 213, 74,  89,  137, 152, 183, 198, 168, 183,
-  26,  10,  90,  105, 229, 244, 11,  26,  106, 121, 214, 229, 153, 168, 27,
-  42,  199, 214, 43,  58,  184, 199, 122, 137, 169, 184, 230, 245, 59,  74,
-  27,  11,  75,  90,  138, 153, 200, 215, 215, 230, 91,  106, 12,  27,  28,
-  43,  185, 200, 107, 122, 154, 169, 44,  59,  231, 246, 216, 231, 60,  75,
-  123, 138, 28,  12,  76,  91,  201, 216, 170, 185, 232, 247, 139, 154, 92,
-  107, 13,  28,  108, 123, 29,  44,  186, 201, 217, 232, 155, 170, 45,  60,
-  29,  13,  61,  76,  124, 139, 14,  14,  233, 248, 77,  92,  14,  29,  171,
-  186, 140, 155, 202, 217, 30,  45,  93,  108, 109, 124, 46,  61,  156, 171,
-  62,  77,  187, 202, 15,  30,  125, 140, 218, 233, 78,  93,  31,  46,  172,
-  187, 47,  62,  141, 156, 94,  109, 234, 249, 203, 218, 63,  78,  110, 125,
-  188, 203, 157, 172, 126, 141, 79,  94,  173, 188, 95,  110, 219, 234, 142,
-  157, 204, 219, 235, 250, 111, 126, 158, 173, 127, 142, 189, 204, 220, 235,
-  143, 158, 174, 189, 205, 220, 236, 251, 159, 174, 190, 205, 221, 236, 175,
-  190, 237, 252, 206, 221, 222, 237, 191, 206, 238, 253, 207, 222, 223, 238,
-  239, 254, 0,   0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   1,   1,   0,   1,   2,   2,   1,   16,  3,   3,   2,
-  17,  16,  17,  4,   4,   17,  32,  3,   18,  5,   5,   18,  33,  32,  33,
-  4,   19,  33,  48,  6,   6,   19,  34,  5,   20,  34,  49,  48,  49,  7,
-  7,   20,  35,  49,  64,  6,   21,  35,  50,  21,  36,  64,  65,  8,   8,
-  50,  65,  36,  51,  7,   22,  22,  37,  65,  80,  51,  66,  9,   9,   37,
-  52,  8,   23,  66,  81,  52,  67,  80,  81,  23,  38,  10,  10,  38,  53,
-  67,  82,  81,  96,  53,  68,  9,   24,  82,  97,  68,  83,  24,  39,  96,
-  97,  39,  54,  11,  11,  54,  69,  83,  98,  97,  112, 69,  84,  10,  25,
-  25,  40,  40,  55,  98,  113, 84,  99,  12,  12,  55,  70,  112, 113, 70,
-  85,  11,  26,  99,  114, 85,  100, 113, 128, 26,  41,  41,  56,  56,  71,
-  100, 115, 13,  13,  71,  86,  114, 129, 86,  101, 128, 129, 57,  72,  115,
-  130, 101, 116, 12,  27,  42,  57,  14,  14,  72,  87,  27,  42,  129, 144,
-  87,  102, 116, 131, 130, 145, 102, 117, 58,  73,  144, 145, 73,  88,  117,
-  132, 88,  103, 13,  28,  43,  58,  131, 146, 103, 118, 28,  43,  145, 160,
-  132, 147, 74,  89,  89,  104, 118, 133, 146, 161, 104, 119, 160, 161, 59,
-  74,  119, 134, 133, 148, 14,  29,  44,  59,  147, 162, 161, 176, 29,  44,
-  105, 120, 75,  90,  90,  105, 148, 163, 162, 177, 134, 149, 176, 177, 120,
-  135, 149, 164, 163, 178, 15,  30,  135, 150, 177, 192, 60,  75,  106, 121,
-  45,  60,  121, 136, 178, 193, 91,  106, 136, 151, 164, 179, 192, 193, 30,
-  45,  150, 165, 151, 166, 179, 194, 76,  91,  165, 180, 122, 137, 193, 208,
-  107, 122, 137, 152, 208, 209, 180, 195, 61,  76,  152, 167, 194, 209, 166,
-  181, 224, 224, 92,  107, 181, 196, 46,  61,  138, 153, 209, 224, 167, 182,
-  153, 168, 195, 210, 31,  46,  123, 138, 77,  92,  168, 183, 210, 225, 196,
-  211, 225, 240, 182, 197, 154, 169, 108, 123, 139, 154, 183, 198, 62,  77,
-  197, 212, 169, 184, 93,  108, 211, 226, 184, 199, 47,  62,  212, 227, 226,
-  241, 124, 139, 198, 213, 155, 170, 170, 185, 140, 155, 213, 228, 227, 242,
-  109, 124, 78,  93,  185, 200, 228, 243, 199, 214, 200, 215, 214, 229, 125,
-  140, 171, 186, 186, 201, 63,  78,  156, 171, 94,  109, 141, 156, 229, 244,
-  201, 216, 215, 230, 79,  94,  230, 245, 216, 231, 110, 125, 187, 202, 231,
-  246, 217, 232, 157, 172, 202, 217, 126, 141, 95,  110, 142, 157, 172, 187,
-  232, 247, 111, 126, 218, 233, 203, 218, 233, 248, 173, 188, 188, 203, 127,
-  142, 158, 173, 143, 158, 234, 249, 219, 234, 189, 204, 204, 219, 159, 174,
-  174, 189, 235, 250, 205, 220, 175, 190, 190, 205, 220, 235, 191, 206, 221,
-  236, 236, 251, 206, 221, 237, 252, 207, 222, 222, 237, 223, 238, 238, 253,
-  239, 254, 0,   0,
-};
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0,   0,   0,   0,   16,  0,   16,  16,  1,   16,  17,  1,   32,  32,  17,
-  32,  2,   17,  18,  2,   48,  48,  18,  33,  33,  48,  3,   18,  49,  64,
-  64,  65,  34,  49,  19,  3,   19,  34,  50,  65,  4,   19,  65,  80,  80,
-  81,  35,  50,  20,  4,   20,  35,  66,  81,  81,  96,  51,  66,  96,  97,
-  5,   20,  36,  51,  82,  97,  21,  36,  67,  82,  97,  112, 21,  5,   52,
-  67,  112, 113, 37,  52,  6,   21,  83,  98,  98,  113, 68,  83,  22,  6,
-  113, 128, 22,  37,  53,  68,  84,  99,  99,  114, 128, 129, 114, 129, 69,
-  84,  38,  53,  7,   22,  23,  7,   129, 144, 23,  38,  54,  69,  100, 115,
-  85,  100, 115, 130, 144, 145, 130, 145, 39,  54,  70,  85,  8,   23,  55,
-  70,  116, 131, 101, 116, 145, 160, 24,  39,  24,  8,   86,  101, 131, 146,
-  160, 161, 146, 161, 71,  86,  40,  55,  9,   24,  117, 132, 102, 117, 161,
-  176, 132, 147, 56,  71,  87,  102, 25,  40,  147, 162, 25,  9,   176, 177,
-  162, 177, 72,  87,  41,  56,  118, 133, 133, 148, 103, 118, 10,  25,  148,
-  163, 57,  72,  88,  103, 177, 192, 26,  41,  163, 178, 192, 193, 26,  10,
-  119, 134, 73,  88,  149, 164, 104, 119, 134, 149, 42,  57,  178, 193, 164,
-  179, 11,  26,  58,  73,  193, 208, 89,  104, 135, 150, 120, 135, 27,  42,
-  74,  89,  208, 209, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43,
-  58,  27,  11,  136, 151, 90,  105, 151, 166, 180, 195, 59,  74,  121, 136,
-  209, 224, 195, 210, 224, 225, 166, 181, 106, 121, 75,  90,  12,  27,  181,
-  196, 28,  12,  210, 225, 152, 167, 167, 182, 137, 152, 28,  43,  196, 211,
-  122, 137, 91,  106, 225, 240, 44,  59,  13,  28,  107, 122, 182, 197, 168,
-  183, 211, 226, 153, 168, 226, 241, 60,  75,  197, 212, 138, 153, 29,  44,
-  76,  91,  29,  13,  183, 198, 123, 138, 45,  60,  212, 227, 198, 213, 154,
-  169, 169, 184, 227, 242, 92,  107, 61,  76,  139, 154, 14,  29,  30,  14,
-  184, 199, 213, 228, 108, 123, 199, 214, 228, 243, 77,  92,  30,  45,  170,
-  185, 155, 170, 185, 200, 93,  108, 124, 139, 214, 229, 46,  61,  200, 215,
-  229, 244, 15,  30,  109, 124, 62,  77,  140, 155, 215, 230, 31,  46,  171,
-  186, 186, 201, 201, 216, 78,  93,  230, 245, 125, 140, 47,  62,  216, 231,
-  156, 171, 94,  109, 231, 246, 141, 156, 63,  78,  202, 217, 187, 202, 110,
-  125, 217, 232, 172, 187, 232, 247, 79,  94,  157, 172, 126, 141, 203, 218,
-  95,  110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203, 234,
-  249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235, 250,
-  174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205, 236,
-  251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223, 238,
-  239, 254, 0,   0,
+  0,   0,   0,   0,   0,   0,   16,  16,  1,   16,  1,   1,   2,   2,   2,
+  17,  17,  32,  32,  32,  48,  48,  33,  48,  18,  33,  3,   18,  3,   3,
+  4,   4,   4,   19,  19,  34,  34,  49,  49,  64,  64,  64,  80,  80,  65,
+  80,  50,  65,  35,  50,  20,  35,  5,   20,  5,   5,   6,   6,   6,   21,
+  21,  36,  36,  51,  51,  66,  66,  81,  81,  96,  96,  96,  112, 112, 97,
+  112, 82,  97,  67,  82,  52,  67,  37,  52,  22,  37,  7,   22,  7,   7,
+  8,   8,   8,   23,  23,  38,  38,  53,  53,  68,  68,  83,  83,  98,  98,
+  113, 113, 128, 128, 128, 144, 144, 129, 144, 114, 129, 99,  114, 84,  99,
+  69,  84,  54,  69,  39,  54,  24,  39,  9,   24,  9,   9,   10,  10,  10,
+  25,  25,  40,  40,  55,  55,  70,  70,  85,  85,  100, 100, 115, 115, 130,
+  130, 145, 145, 160, 160, 160, 176, 176, 161, 176, 146, 161, 131, 146, 116,
+  131, 101, 116, 86,  101, 71,  86,  56,  71,  41,  56,  26,  41,  11,  26,
+  11,  11,  12,  12,  12,  27,  27,  42,  42,  57,  57,  72,  72,  87,  87,
+  102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192,
+  208, 208, 193, 208, 178, 193, 163, 178, 148, 163, 133, 148, 118, 133, 103,
+  118, 88,  103, 73,  88,  58,  73,  43,  58,  28,  43,  13,  28,  13,  13,
+  14,  14,  14,  29,  29,  44,  44,  59,  59,  74,  74,  89,  89,  104, 104,
+  119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224,
+  224, 224, 225, 240, 210, 225, 195, 210, 180, 195, 165, 180, 150, 165, 135,
+  150, 120, 135, 105, 120, 90,  105, 75,  90,  60,  75,  45,  60,  30,  45,
+  15,  30,  31,  46,  46,  61,  61,  76,  76,  91,  91,  106, 106, 121, 121,
+  136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226, 226, 241,
+  227, 242, 212, 227, 197, 212, 182, 197, 167, 182, 152, 167, 137, 152, 122,
+  137, 107, 122, 92,  107, 77,  92,  62,  77,  47,  62,  63,  78,  78,  93,
+  93,  108, 108, 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198,
+  213, 213, 228, 228, 243, 229, 244, 214, 229, 199, 214, 184, 199, 169, 184,
+  154, 169, 139, 154, 124, 139, 109, 124, 94,  109, 79,  94,  95,  110, 110,
+  125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230,
+  230, 245, 231, 246, 216, 231, 201, 216, 186, 201, 171, 186, 156, 171, 141,
+  156, 126, 141, 111, 126, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202,
+  202, 217, 217, 232, 232, 247, 233, 248, 218, 233, 203, 218, 188, 203, 173,
+  188, 158, 173, 143, 158, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234,
+  234, 249, 235, 250, 220, 235, 205, 220, 190, 205, 175, 190, 191, 206, 206,
+  221, 221, 236, 236, 251, 237, 252, 222, 237, 207, 222, 223, 238, 238, 253,
+  239, 254, 0,   0
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
   0,   0,    0,   0,    32,  32,   64,  64,   96,  96,   128, 128,  160, 160,
@@ -3328,1899 +2224,162 @@ DECLARE_ALIGNED(16, static const int16_t,
   983, 1014, 984, 1015, 985, 1016, 986, 1017, 987, 1018, 988, 1019, 989, 1020,
   990, 1021, 991, 1022, 0,   0,
 };
-#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
-  0,   0,    0,   0,    32,  0,    32,  32,   1,   32,  33,  1,    64,  64,
-  33,  64,   2,   33,   96,  96,   34,  2,    65,  96,  34,  65,   128, 128,
-  97,  128,  3,   34,   66,  97,   35,  3,    35,  66,  98,  129,  129, 160,
-  160, 161,  4,   35,   67,  98,   192, 192,  36,  4,   130, 161,  161, 192,
-  36,  67,   99,  130,  5,   36,   68,  99,   193, 224, 162, 193,  224, 225,
-  131, 162,  37,  68,   100, 131,  37,  5,    194, 225, 225, 256,  256, 257,
-  163, 194,  69,  100,  132, 163,  6,   37,   226, 257, 38,  6,    195, 226,
-  257, 288,  101, 132,  288, 289,  38,  69,   164, 195, 133, 164,  258, 289,
-  227, 258,  196, 227,  7,   38,   289, 320,  70,  101, 320, 321,  39,  7,
-  165, 196,  39,  70,   102, 133,  290, 321,  259, 290, 228, 259,  321, 352,
-  352, 353,  197, 228,  134, 165,  71,  102,  8,   39,  322, 353,  291, 322,
-  260, 291,  103, 134,  353, 384,  166, 197,  229, 260, 40,  71,   40,  8,
-  384, 385,  135, 166,  354, 385,  323, 354,  198, 229, 292, 323,  72,  103,
-  261, 292,  9,   40,   385, 416,  167, 198,  104, 135, 230, 261,  355, 386,
-  416, 417,  293, 324,  324, 355,  41,  9,    41,  72,  386, 417,  199, 230,
-  136, 167,  417, 448,  262, 293,  356, 387,  73,  104, 387, 418,  231, 262,
-  10,  41,   168, 199,  325, 356,  418, 449,  105, 136, 448, 449,  42,  73,
-  294, 325,  200, 231,  42,  10,   357, 388,  137, 168, 263, 294,  388, 419,
-  74,  105,  419, 450,  449, 480,  326, 357,  232, 263, 295, 326,  169, 200,
-  11,  42,   106, 137,  480, 481,  450, 481,  358, 389, 264, 295,  201, 232,
-  138, 169,  389, 420,  43,  74,   420, 451,  327, 358, 43,  11,   481, 512,
-  233, 264,  451, 482,  296, 327,  75,  106,  170, 201, 482, 513,  512, 513,
-  390, 421,  359, 390,  421, 452,  107, 138,  12,  43,  202, 233,  452, 483,
-  265, 296,  328, 359,  139, 170,  44,  75,   483, 514, 513, 544,  234, 265,
-  297, 328,  422, 453,  44,  12,   391, 422,  171, 202, 76,  107,  514, 545,
-  453, 484,  544, 545,  266, 297,  203, 234,  108, 139, 329, 360,  298, 329,
-  140, 171,  515, 546,  13,  44,   423, 454,  235, 266, 545, 576,  454, 485,
-  45,  76,   172, 203,  330, 361,  576, 577,  45,  13,  267, 298,  546, 577,
-  77,  108,  204, 235,  455, 486,  577, 608,  299, 330, 109, 140,  547, 578,
-  14,  45,   46,  14,   141, 172,  578, 609,  331, 362, 46,  77,   173, 204,
-  15,  15,   78,  109,  205, 236,  579, 610,  110, 141, 15,  46,   142, 173,
-  47,  78,   174, 205,  16,  16,   79,  110,  206, 237, 16,  47,   111, 142,
-  48,  79,   143, 174,  80,  111,  175, 206,  17,  48,  49,  17,   207, 238,
-  49,  80,   81,  112,  18,  18,   18,  49,   50,  81,  82,  113,  19,  50,
-  51,  82,   83,  114,  608, 609,  484, 515,  360, 391, 236, 267,  112, 143,
-  51,  19,   640, 640,  609, 640,  516, 547,  485, 516, 392, 423,  361, 392,
-  268, 299,  237, 268,  144, 175,  113, 144,  20,  51,  52,  20,   672, 672,
-  641, 672,  610, 641,  548, 579,  517, 548,  486, 517, 424, 455,  393, 424,
-  362, 393,  300, 331,  269, 300,  238, 269,  176, 207, 145, 176,  114, 145,
-  52,  83,   21,  52,   53,  21,   704, 704,  673, 704, 642, 673,  611, 642,
-  580, 611,  549, 580,  518, 549,  487, 518,  456, 487, 425, 456,  394, 425,
-  363, 394,  332, 363,  301, 332,  270, 301,  239, 270, 208, 239,  177, 208,
-  146, 177,  115, 146,  84,  115,  53,  84,   22,  53,  54,  22,   705, 736,
-  674, 705,  643, 674,  581, 612,  550, 581,  519, 550, 457, 488,  426, 457,
-  395, 426,  333, 364,  302, 333,  271, 302,  209, 240, 178, 209,  147, 178,
-  85,  116,  54,  85,   23,  54,   706, 737,  675, 706, 582, 613,  551, 582,
-  458, 489,  427, 458,  334, 365,  303, 334,  210, 241, 179, 210,  86,  117,
-  55,  86,   707, 738,  583, 614,  459, 490,  335, 366, 211, 242,  87,  118,
-  736, 737,  612, 643,  488, 519,  364, 395,  240, 271, 116, 147,  55,  23,
-  768, 768,  737, 768,  644, 675,  613, 644,  520, 551, 489, 520,  396, 427,
-  365, 396,  272, 303,  241, 272,  148, 179,  117, 148, 24,  55,   56,  24,
-  800, 800,  769, 800,  738, 769,  676, 707,  645, 676, 614, 645,  552, 583,
-  521, 552,  490, 521,  428, 459,  397, 428,  366, 397, 304, 335,  273, 304,
-  242, 273,  180, 211,  149, 180,  118, 149,  56,  87,  25,  56,   57,  25,
-  832, 832,  801, 832,  770, 801,  739, 770,  708, 739, 677, 708,  646, 677,
-  615, 646,  584, 615,  553, 584,  522, 553,  491, 522, 460, 491,  429, 460,
-  398, 429,  367, 398,  336, 367,  305, 336,  274, 305, 243, 274,  212, 243,
-  181, 212,  150, 181,  119, 150,  88,  119,  57,  88,  26,  57,   58,  26,
-  833, 864,  802, 833,  771, 802,  709, 740,  678, 709, 647, 678,  585, 616,
-  554, 585,  523, 554,  461, 492,  430, 461,  399, 430, 337, 368,  306, 337,
-  275, 306,  213, 244,  182, 213,  151, 182,  89,  120, 58,  89,   27,  58,
-  834, 865,  803, 834,  710, 741,  679, 710,  586, 617, 555, 586,  462, 493,
-  431, 462,  338, 369,  307, 338,  214, 245,  183, 214, 90,  121,  59,  90,
-  835, 866,  711, 742,  587, 618,  463, 494,  339, 370, 215, 246,  91,  122,
-  864, 865,  740, 771,  616, 647,  492, 523,  368, 399, 244, 275,  120, 151,
-  59,  27,   896, 896,  865, 896,  772, 803,  741, 772, 648, 679,  617, 648,
-  524, 555,  493, 524,  400, 431,  369, 400,  276, 307, 245, 276,  152, 183,
-  121, 152,  28,  59,   60,  28,   928, 928,  897, 928, 866, 897,  804, 835,
-  773, 804,  742, 773,  680, 711,  649, 680,  618, 649, 556, 587,  525, 556,
-  494, 525,  432, 463,  401, 432,  370, 401,  308, 339, 277, 308,  246, 277,
-  184, 215,  153, 184,  122, 153,  60,  91,   29,  60,  61,  29,   960, 960,
-  929, 960,  898, 929,  867, 898,  836, 867,  805, 836, 774, 805,  743, 774,
-  712, 743,  681, 712,  650, 681,  619, 650,  588, 619, 557, 588,  526, 557,
-  495, 526,  464, 495,  433, 464,  402, 433,  371, 402, 340, 371,  309, 340,
-  278, 309,  247, 278,  216, 247,  185, 216,  154, 185, 123, 154,  92,  123,
-  61,  92,   30,  61,   62,  30,   961, 992,  930, 961, 899, 930,  837, 868,
-  806, 837,  775, 806,  713, 744,  682, 713,  651, 682, 589, 620,  558, 589,
-  527, 558,  465, 496,  434, 465,  403, 434,  341, 372, 310, 341,  279, 310,
-  217, 248,  186, 217,  155, 186,  93,  124,  62,  93,  31,  62,   962, 993,
-  931, 962,  838, 869,  807, 838,  714, 745,  683, 714, 590, 621,  559, 590,
-  466, 497,  435, 466,  342, 373,  311, 342,  218, 249, 187, 218,  94,  125,
-  63,  94,   963, 994,  839, 870,  715, 746,  591, 622, 467, 498,  343, 374,
-  219, 250,  95,  126,  868, 899,  744, 775,  620, 651, 496, 527,  372, 403,
-  248, 279,  124, 155,  900, 931,  869, 900,  776, 807, 745, 776,  652, 683,
-  621, 652,  528, 559,  497, 528,  404, 435,  373, 404, 280, 311,  249, 280,
-  156, 187,  125, 156,  932, 963,  901, 932,  870, 901, 808, 839,  777, 808,
-  746, 777,  684, 715,  653, 684,  622, 653,  560, 591, 529, 560,  498, 529,
-  436, 467,  405, 436,  374, 405,  312, 343,  281, 312, 250, 281,  188, 219,
-  157, 188,  126, 157,  964, 995,  933, 964,  902, 933, 871, 902,  840, 871,
-  809, 840,  778, 809,  747, 778,  716, 747,  685, 716, 654, 685,  623, 654,
-  592, 623,  561, 592,  530, 561,  499, 530,  468, 499, 437, 468,  406, 437,
-  375, 406,  344, 375,  313, 344,  282, 313,  251, 282, 220, 251,  189, 220,
-  158, 189,  127, 158,  965, 996,  934, 965,  903, 934, 841, 872,  810, 841,
-  779, 810,  717, 748,  686, 717,  655, 686,  593, 624, 562, 593,  531, 562,
-  469, 500,  438, 469,  407, 438,  345, 376,  314, 345, 283, 314,  221, 252,
-  190, 221,  159, 190,  966, 997,  935, 966,  842, 873, 811, 842,  718, 749,
-  687, 718,  594, 625,  563, 594,  470, 501,  439, 470, 346, 377,  315, 346,
-  222, 253,  191, 222,  967, 998,  843, 874,  719, 750, 595, 626,  471, 502,
-  347, 378,  223, 254,  872, 903,  748, 779,  624, 655, 500, 531,  376, 407,
-  252, 283,  904, 935,  873, 904,  780, 811,  749, 780, 656, 687,  625, 656,
-  532, 563,  501, 532,  408, 439,  377, 408,  284, 315, 253, 284,  936, 967,
-  905, 936,  874, 905,  812, 843,  781, 812,  750, 781, 688, 719,  657, 688,
-  626, 657,  564, 595,  533, 564,  502, 533,  440, 471, 409, 440,  378, 409,
-  316, 347,  285, 316,  254, 285,  968, 999,  937, 968, 906, 937,  875, 906,
-  844, 875,  813, 844,  782, 813,  751, 782,  720, 751, 689, 720,  658, 689,
-  627, 658,  596, 627,  565, 596,  534, 565,  503, 534, 472, 503,  441, 472,
-  410, 441,  379, 410,  348, 379,  317, 348,  286, 317, 255, 286,  969, 1000,
-  938, 969,  907, 938,  845, 876,  814, 845,  783, 814, 721, 752,  690, 721,
-  659, 690,  597, 628,  566, 597,  535, 566,  473, 504, 442, 473,  411, 442,
-  349, 380,  318, 349,  287, 318,  970, 1001, 939, 970, 846, 877,  815, 846,
-  722, 753,  691, 722,  598, 629,  567, 598,  474, 505, 443, 474,  350, 381,
-  319, 350,  971, 1002, 847, 878,  723, 754,  599, 630, 475, 506,  351, 382,
-  876, 907,  752, 783,  628, 659,  504, 535,  380, 411, 908, 939,  877, 908,
-  784, 815,  753, 784,  660, 691,  629, 660,  536, 567, 505, 536,  412, 443,
-  381, 412,  940, 971,  909, 940,  878, 909,  816, 847, 785, 816,  754, 785,
-  692, 723,  661, 692,  630, 661,  568, 599,  537, 568, 506, 537,  444, 475,
-  413, 444,  382, 413,  972, 1003, 941, 972,  910, 941, 879, 910,  848, 879,
-  817, 848,  786, 817,  755, 786,  724, 755,  693, 724, 662, 693,  631, 662,
-  600, 631,  569, 600,  538, 569,  507, 538,  476, 507, 445, 476,  414, 445,
-  383, 414,  973, 1004, 942, 973,  911, 942,  849, 880, 818, 849,  787, 818,
-  725, 756,  694, 725,  663, 694,  601, 632,  570, 601, 539, 570,  477, 508,
-  446, 477,  415, 446,  974, 1005, 943, 974,  850, 881, 819, 850,  726, 757,
-  695, 726,  602, 633,  571, 602,  478, 509,  447, 478, 975, 1006, 851, 882,
-  727, 758,  603, 634,  479, 510,  880, 911,  756, 787, 632, 663,  508, 539,
-  912, 943,  881, 912,  788, 819,  757, 788,  664, 695, 633, 664,  540, 571,
-  509, 540,  944, 975,  913, 944,  882, 913,  820, 851, 789, 820,  758, 789,
-  696, 727,  665, 696,  634, 665,  572, 603,  541, 572, 510, 541,  976, 1007,
-  945, 976,  914, 945,  883, 914,  852, 883,  821, 852, 790, 821,  759, 790,
-  728, 759,  697, 728,  666, 697,  635, 666,  604, 635, 573, 604,  542, 573,
-  511, 542,  977, 1008, 946, 977,  915, 946,  853, 884, 822, 853,  791, 822,
-  729, 760,  698, 729,  667, 698,  605, 636,  574, 605, 543, 574,  978, 1009,
-  947, 978,  854, 885,  823, 854,  730, 761,  699, 730, 606, 637,  575, 606,
-  979, 1010, 855, 886,  731, 762,  607, 638,  884, 915, 760, 791,  636, 667,
-  916, 947,  885, 916,  792, 823,  761, 792,  668, 699, 637, 668,  948, 979,
-  917, 948,  886, 917,  824, 855,  793, 824,  762, 793, 700, 731,  669, 700,
-  638, 669,  980, 1011, 949, 980,  918, 949,  887, 918, 856, 887,  825, 856,
-  794, 825,  763, 794,  732, 763,  701, 732,  670, 701, 639, 670,  981, 1012,
-  950, 981,  919, 950,  857, 888,  826, 857,  795, 826, 733, 764,  702, 733,
-  671, 702,  982, 1013, 951, 982,  858, 889,  827, 858, 734, 765,  703, 734,
-  983, 1014, 859, 890,  735, 766,  888, 919,  764, 795, 920, 951,  889, 920,
-  796, 827,  765, 796,  952, 983,  921, 952,  890, 921, 828, 859,  797, 828,
-  766, 797,  984, 1015, 953, 984,  922, 953,  891, 922, 860, 891,  829, 860,
-  798, 829,  767, 798,  985, 1016, 954, 985,  923, 954, 861, 892,  830, 861,
-  799, 830,  986, 1017, 955, 986,  862, 893,  831, 862, 987, 1018, 863, 894,
-  892, 923,  924, 955,  893, 924,  956, 987,  925, 956, 894, 925,  988, 1019,
-  957, 988,  926, 957,  895, 926,  989, 1020, 958, 989, 927, 958,  990, 1021,
-  959, 990,  991, 1022, 0,   0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                v2_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
-  0,   0,    0,   0,    0,   0,    1,   32,   1,   1,   32,  32,   2,   33,
-  33,  64,   34,  65,   2,   2,    64,  64,   3,   34,  65,  96,   35,  66,
-  66,  97,   3,   3,    96,  96,   4,   35,   97,  128, 67,  98,   36,  67,
-  98,  129,  4,   4,    68,  99,   99,  130,  128, 128, 5,   36,   129, 160,
-  37,  68,   130, 161,  100, 131,  69,  100,  131, 162, 5,   5,    160, 160,
-  6,   37,   161, 192,  38,  69,   162, 193,  101, 132, 132, 163,  70,  101,
-  163, 194,  6,   6,    192, 192,  7,   38,   133, 164, 193, 224,  102, 133,
-  164, 195,  39,  70,   194, 225,  71,  102,  195, 226, 134, 165,  165, 196,
-  7,   7,    224, 224,  8,   39,   103, 134,  196, 227, 225, 256,  40,  71,
-  226, 257,  166, 197,  72,  103,  227, 258,  135, 166, 197, 228,  104, 135,
-  228, 259,  8,   8,    256, 256,  9,   40,   257, 288, 41,  72,   167, 198,
-  198, 229,  258, 289,  136, 167,  229, 260,  73,  104, 259, 290,  105, 136,
-  260, 291,  199, 230,  9,   9,    168, 199,  230, 261, 288, 288,  10,  41,
-  289, 320,  42,  73,   290, 321,  137, 168,  261, 292, 74,  105,  291, 322,
-  200, 231,  231, 262,  106, 137,  292, 323,  169, 200, 262, 293,  10,  10,
-  320, 320,  11,  42,   321, 352,  43,  74,   138, 169, 293, 324,  322, 353,
-  232, 263,  75,  106,  201, 232,  263, 294,  323, 354, 170, 201,  294, 325,
-  107, 138,  324, 355,  11,  11,   352, 352,  12,  43,  233, 264,  264, 295,
-  353, 384,  139, 170,  325, 356,  44,  75,   354, 385, 202, 233,  295, 326,
-  76,  107,  355, 386,  171, 202,  326, 357,  108, 139, 356, 387,  265, 296,
-  234, 265,  296, 327,  12,  12,   140, 171,  357, 388, 384, 384,  13,  44,
-  203, 234,  327, 358,  385, 416,  45,  76,   386, 417, 77,  108,  387, 418,
-  172, 203,  358, 389,  266, 297,  297, 328,  109, 140, 235, 266,  328, 359,
-  388, 419,  204, 235,  359, 390,  141, 172,  389, 420, 13,  13,   416, 416,
-  14,  45,   417, 448,  46,  77,   298, 329,  418, 449, 267, 298,  329, 360,
-  78,  109,  173, 204,  390, 421,  419, 450,  236, 267, 360, 391,  110, 141,
-  420, 451,  205, 236,  391, 422,  142, 173,  299, 330, 330, 361,  421, 452,
-  14,  14,   268, 299,  361, 392,  448, 448,  15,  46,  449, 480,  47,  78,
-  450, 481,  174, 205,  422, 453,  237, 268,  392, 423, 79,  110,  451, 482,
-  111, 142,  452, 483,  331, 362,  300, 331,  362, 393, 206, 237,  423, 454,
-  143, 174,  269, 300,  393, 424,  453, 484,  480, 480, 481, 512,  238, 269,
-  424, 455,  482, 513,  175, 206,  454, 485,  332, 363, 363, 394,  483, 514,
-  301, 332,  394, 425,  484, 515,  207, 238,  455, 486, 270, 301,  425, 456,
-  485, 516,  364, 395,  239, 270,  456, 487,  512, 512, 333, 364,  395, 426,
-  513, 544,  486, 517,  514, 545,  302, 333,  426, 457, 515, 546,  487, 518,
-  516, 547,  271, 302,  457, 488,  365, 396,  396, 427, 517, 548,  334, 365,
-  427, 458,  488, 519,  544, 544,  303, 334,  458, 489, 518, 549,  545, 576,
-  546, 577,  547, 578,  489, 520,  397, 428,  519, 550, 366, 397,  428, 459,
-  548, 579,  335, 366,  459, 490,  549, 580,  520, 551, 490, 521,  550, 581,
-  576, 576,  577, 608,  398, 429,  429, 460,  578, 609, 367, 398,  460, 491,
-  521, 552,  579, 610,  551, 582,  491, 522,  580, 611, 581, 612,  552, 583,
-  522, 553,  430, 461,  399, 430,  461, 492,  582, 613, 492, 523,  608, 608,
-  609, 640,  610, 641,  553, 584,  611, 642,  523, 554, 583, 614,  612, 643,
-  431, 462,  462, 493,  554, 585,  493, 524,  584, 615, 613, 644,  524, 555,
-  614, 645,  640, 640,  585, 616,  641, 672,  555, 586, 642, 673,  615, 646,
-  463, 494,  643, 674,  494, 525,  644, 675,  525, 556, 586, 617,  616, 647,
-  645, 676,  556, 587,  646, 677,  495, 526,  617, 648, 587, 618,  672, 672,
-  526, 557,  673, 704,  674, 705,  647, 678,  557, 588, 675, 706,  618, 649,
-  676, 707,  588, 619,  648, 679,  677, 708,  527, 558, 558, 589,  678, 709,
-  619, 650,  649, 680,  704, 704,  589, 620,  705, 736, 679, 710,  706, 737,
-  707, 738,  650, 681,  620, 651,  708, 739,  680, 711, 559, 590,  709, 740,
-  590, 621,  651, 682,  681, 712,  710, 741,  621, 652, 736, 736,  737, 768,
-  711, 742,  738, 769,  682, 713,  652, 683,  739, 770, 591, 622,  740, 771,
-  712, 743,  622, 653,  741, 772,  683, 714,  653, 684, 713, 744,  742, 773,
-  623, 654,  743, 774,  768, 768,  769, 800,  684, 715, 714, 745,  770, 801,
-  771, 802,  654, 685,  744, 775,  772, 803,  715, 746, 773, 804,  685, 716,
-  745, 776,  774, 805,  655, 686,  716, 747,  775, 806, 746, 777,  800, 800,
-  801, 832,  686, 717,  802, 833,  803, 834,  776, 807, 804, 835,  747, 778,
-  717, 748,  805, 836,  777, 808,  687, 718,  806, 837, 748, 779,  718, 749,
-  778, 809,  807, 838,  832, 832,  833, 864,  834, 865, 835, 866,  808, 839,
-  749, 780,  836, 867,  779, 810,  719, 750,  837, 868, 809, 840,  838, 869,
-  780, 811,  750, 781,  810, 841,  839, 870,  864, 864, 865, 896,  866, 897,
-  840, 871,  867, 898,  781, 812,  811, 842,  868, 899, 751, 782,  869, 900,
-  841, 872,  812, 843,  870, 901,  782, 813,  842, 873, 871, 902,  896, 896,
-  897, 928,  813, 844,  898, 929,  872, 903,  783, 814, 843, 874,  899, 930,
-  900, 931,  873, 904,  901, 932,  814, 845,  844, 875, 902, 933,  874, 905,
-  903, 934,  845, 876,  928, 928,  815, 846,  929, 960, 930, 961,  875, 906,
-  904, 935,  931, 962,  932, 963,  905, 936,  846, 877, 933, 964,  876, 907,
-  934, 965,  906, 937,  935, 966,  877, 908,  847, 878, 960, 960,  907, 938,
-  961, 992,  936, 967,  962, 993,  963, 994,  964, 995, 878, 909,  937, 968,
-  908, 939,  965, 996,  966, 997,  938, 969,  879, 910, 909, 940,  967, 998,
-  939, 970,  968, 999,  910, 941,  969, 1000, 940, 971, 970, 1001, 911, 942,
-  941, 972,  971, 1002, 942, 973,  972, 1003, 943, 974, 973, 1004, 974, 1005,
-  975, 1006, 15,  15,   16,  47,   48,  79,   80,  111, 112, 143,  144, 175,
-  16,  16,   17,  48,   176, 207,  49,  80,   81,  112, 113, 144,  208, 239,
-  145, 176,  240, 271,  17,  17,   18,  49,   177, 208, 50,  81,   82,  113,
-  272, 303,  209, 240,  114, 145,  146, 177,  241, 272, 304, 335,  178, 209,
-  18,  18,   19,  50,   51,  82,   83,  114,  273, 304, 210, 241,  115, 146,
-  336, 367,  147, 178,  242, 273,  305, 336,  179, 210, 19,  19,   368, 399,
-  20,  51,   52,  83,   274, 305,  84,  115,  211, 242, 337, 368,  116, 147,
-  306, 337,  148, 179,  243, 274,  400, 431,  369, 400, 180, 211,  20,  20,
-  21,  52,   275, 306,  53,  84,   338, 369,  212, 243, 85,  116,  432, 463,
-  117, 148,  401, 432,  307, 338,  244, 275,  149, 180, 370, 401,  181, 212,
-  276, 307,  464, 495,  339, 370,  21,  21,   22,  53,  433, 464,  54,  85,
-  213, 244,  86,  117,  402, 433,  118, 149,  308, 339, 245, 276,  371, 402,
-  150, 181,  496, 527,  465, 496,  182, 213,  434, 465, 340, 371,  277, 308,
-  22,  22,   23,  54,   403, 434,  55,  86,   214, 245, 87,  118,  309, 340,
-  372, 403,  119, 150,  497, 528,  528, 559,  246, 277, 466, 497,  151, 182,
-  435, 466,  341, 372,  183, 214,  278, 309,  404, 435, 23,  23,   24,  55,
-  215, 246,  529, 560,  56,  87,   498, 529,  560, 591, 310, 341,  88,  119,
-  373, 404,  467, 498,  120, 151,  247, 278,  436, 467, 152, 183,  342, 373,
-  279, 310,  405, 436,  184, 215,  530, 561,  561, 592, 499, 530,  592, 623,
-  24,  24,   216, 247,  468, 499,  25,  56,   374, 405, 57,  88,   311, 342,
-  89,  120,  437, 468,  248, 279,  121, 152,  562, 593, 153, 184,  343, 374,
-  531, 562,  593, 624,  406, 437,  500, 531,  624, 655, 280, 311,  185, 216,
-  469, 500,  375, 406,  217, 248,  25,  25,   312, 343, 26,  57,   58,  89,
-  438, 469,  90,  121,  563, 594,  594, 625,  249, 280, 532, 563,  625, 656,
-  122, 153,  344, 375,  501, 532,  656, 687,  407, 438, 154, 185,  281, 312,
-  470, 501,  186, 217,  376, 407,  595, 626,  564, 595, 626, 657,  218, 249,
-  313, 344,  439, 470,  26,  26,   27,  58,   533, 564, 657, 688,  59,  90,
-  91,  122,  250, 281,  502, 533,  688, 719,  123, 154, 408, 439,  345, 376,
-  155, 186,  471, 502,  282, 313,  596, 627,  627, 658, 187, 218,  565, 596,
-  658, 689,  377, 408,  440, 471,  534, 565,  689, 720, 314, 345,  219, 250,
-  27,  27,   28,  59,   503, 534,  720, 751,  60,  91,  92,  123,  251, 282,
-  409, 440,  346, 377,  124, 155,  628, 659,  472, 503, 597, 628,  659, 690,
-  566, 597,  690, 721,  156, 187,  283, 314,  535, 566, 721, 752,  188, 219,
-  378, 409,  441, 472,  315, 346,  504, 535,  752, 783, 220, 251,  28,  28,
-  629, 660,  660, 691,  29,  60,   61,  92,   410, 441, 598, 629,  691, 722,
-  252, 283,  93,  124,  347, 378,  473, 504,  567, 598, 722, 753,  125, 156,
-  284, 315,  536, 567,  753, 784,  157, 188,  442, 473, 379, 410,  189, 220,
-  505, 536,  784, 815,  661, 692,  316, 347,  630, 661, 692, 723,  221, 252,
-  599, 630,  723, 754,  411, 442,  29,  29,   568, 599, 754, 785,  30,  61,
-  474, 505,  62,  93,   253, 284,  348, 379,  94,  125, 537, 568,  785, 816,
-  126, 157,  285, 316,  158, 189,  443, 474,  662, 693, 693, 724,  380, 411,
-  631, 662,  724, 755,  506, 537,  816, 847,  190, 221, 600, 631,  755, 786,
-  317, 348,  222, 253,  569, 600,  786, 817,  412, 443, 475, 506,  30,  30,
-  31,  62,   349, 380,  254, 285,  63,  94,   538, 569, 817, 848,  694, 725,
-  95,  126,  663, 694,  725, 756,  632, 663,  756, 787, 127, 158,  444, 475,
-  286, 317,  381, 412,  507, 538,  848, 879,  159, 190, 601, 632,  787, 818,
-  191, 222,  318, 349,  570, 601,  818, 849,  476, 507, 223, 254,  413, 444,
-  695, 726,  726, 757,  664, 695,  757, 788,  539, 570, 849, 880,  350, 381,
-  255, 286,  633, 664,  788, 819,  445, 476,  602, 633, 819, 850,  508, 539,
-  880, 911,  287, 318,  382, 413,  571, 602,  850, 881, 727, 758,  696, 727,
-  758, 789,  319, 350,  477, 508,  665, 696,  789, 820, 414, 445,  540, 571,
-  881, 912,  634, 665,  820, 851,  351, 382,  603, 634, 851, 882,  446, 477,
-  509, 540,  912, 943,  383, 414,  728, 759,  759, 790, 572, 603,  882, 913,
-  697, 728,  790, 821,  666, 697,  821, 852,  478, 509, 635, 666,  852, 883,
-  415, 446,  541, 572,  913, 944,  604, 635,  883, 914, 760, 791,  729, 760,
-  791, 822,  510, 541,  944, 975,  447, 478,  698, 729, 822, 853,  573, 604,
-  914, 945,  667, 698,  853, 884,  636, 667,  884, 915, 479, 510,  542, 573,
-  945, 976,  761, 792,  792, 823,  605, 636,  915, 946, 730, 761,  823, 854,
-  699, 730,  854, 885,  511, 542,  976, 1007, 574, 605, 946, 977,  668, 699,
-  885, 916,  637, 668,  916, 947,  543, 574,  793, 824, 977, 1008, 762, 793,
-  824, 855,  731, 762,  855, 886,  606, 637,  947, 978, 700, 731,  886, 917,
-  669, 700,  917, 948,  575, 606,  978, 1009, 638, 669, 948, 979,  794, 825,
-  825, 856,  763, 794,  856, 887,  732, 763,  887, 918, 607, 638,  979, 1010,
-  701, 732,  918, 949,  670, 701,  949, 980,  826, 857, 795, 826,  857, 888,
-  764, 795,  888, 919,  639, 670,  980, 1011, 733, 764, 919, 950,  702, 733,
-  950, 981,  671, 702,  981, 1012, 827, 858,  858, 889, 796, 827,  889, 920,
-  765, 796,  920, 951,  734, 765,  951, 982,  703, 734, 982, 1013, 859, 890,
-  828, 859,  890, 921,  797, 828,  921, 952,  766, 797, 952, 983,  735, 766,
-  983, 1014, 860, 891,  891, 922,  829, 860,  922, 953, 798, 829,  953, 984,
-  767, 798,  984, 1015, 892, 923,  861, 892,  923, 954, 830, 861,  954, 985,
-  799, 830,  985, 1016, 893, 924,  924, 955,  862, 893, 955, 986,  831, 862,
-  986, 1017, 925, 956,  894, 925,  956, 987,  863, 894, 987, 1018, 926, 957,
-  957, 988,  895, 926,  988, 1019, 958, 989,  927, 958, 989, 1020, 959, 990,
-  990, 1021, 991, 1022, 0,   0,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                h2_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
-  0,   0,    0,   0,    0,   0,    1,   32,   1,   1,    32,  32,   2,   33,
-  33,  64,   34,  65,   2,   2,    64,  64,   3,   34,   65,  96,   35,  66,
-  66,  97,   3,   3,    96,  96,   4,   35,   97,  128,  67,  98,   36,  67,
-  98,  129,  4,   4,    68,  99,   99,  130,  128, 128,  5,   36,   129, 160,
-  37,  68,   130, 161,  100, 131,  69,  100,  131, 162,  5,   5,    160, 160,
-  6,   37,   161, 192,  38,  69,   162, 193,  101, 132,  132, 163,  70,  101,
-  163, 194,  6,   6,    192, 192,  7,   38,   133, 164,  193, 224,  102, 133,
-  164, 195,  39,  70,   194, 225,  71,  102,  195, 226,  134, 165,  165, 196,
-  7,   7,    224, 224,  8,   39,   103, 134,  196, 227,  225, 256,  40,  71,
-  226, 257,  166, 197,  72,  103,  227, 258,  135, 166,  197, 228,  104, 135,
-  228, 259,  8,   8,    256, 256,  9,   40,   257, 288,  41,  72,   167, 198,
-  198, 229,  258, 289,  136, 167,  229, 260,  73,  104,  259, 290,  105, 136,
-  260, 291,  199, 230,  9,   9,    168, 199,  230, 261,  288, 288,  10,  41,
-  289, 320,  42,  73,   290, 321,  137, 168,  261, 292,  74,  105,  291, 322,
-  200, 231,  231, 262,  106, 137,  292, 323,  169, 200,  262, 293,  10,  10,
-  320, 320,  11,  42,   321, 352,  43,  74,   138, 169,  293, 324,  322, 353,
-  232, 263,  75,  106,  201, 232,  263, 294,  323, 354,  170, 201,  294, 325,
-  107, 138,  324, 355,  11,  11,   352, 352,  12,  43,   233, 264,  264, 295,
-  353, 384,  139, 170,  325, 356,  44,  75,   354, 385,  202, 233,  295, 326,
-  76,  107,  355, 386,  171, 202,  326, 357,  108, 139,  356, 387,  265, 296,
-  234, 265,  296, 327,  12,  12,   140, 171,  357, 388,  384, 384,  13,  44,
-  203, 234,  327, 358,  385, 416,  45,  76,   386, 417,  77,  108,  387, 418,
-  172, 203,  358, 389,  266, 297,  297, 328,  109, 140,  235, 266,  328, 359,
-  388, 419,  204, 235,  359, 390,  141, 172,  389, 420,  13,  13,   416, 416,
-  14,  45,   417, 448,  46,  77,   298, 329,  418, 449,  267, 298,  329, 360,
-  78,  109,  173, 204,  390, 421,  419, 450,  236, 267,  360, 391,  110, 141,
-  420, 451,  205, 236,  391, 422,  142, 173,  299, 330,  330, 361,  421, 452,
-  14,  14,   268, 299,  361, 392,  448, 448,  15,  46,   449, 480,  47,  78,
-  450, 481,  174, 205,  422, 453,  237, 268,  392, 423,  79,  110,  451, 482,
-  111, 142,  452, 483,  331, 362,  300, 331,  362, 393,  206, 237,  423, 454,
-  143, 174,  269, 300,  393, 424,  453, 484,  15,  15,   16,  47,   48,  79,
-  238, 269,  424, 455,  175, 206,  454, 485,  80,  111,  332, 363,  363, 394,
-  301, 332,  394, 425,  112, 143,  207, 238,  455, 486,  270, 301,  425, 456,
-  144, 175,  364, 395,  16,  16,   239, 270,  456, 487,  17,  48,   333, 364,
-  395, 426,  176, 207,  49,  80,   302, 333,  426, 457,  81,  112,  113, 144,
-  208, 239,  271, 302,  457, 488,  365, 396,  396, 427,  145, 176,  334, 365,
-  427, 458,  240, 271,  17,  17,   18,  49,   177, 208,  303, 334,  458, 489,
-  50,  81,   82,  113,  272, 303,  209, 240,  397, 428,  114, 145,  366, 397,
-  428, 459,  335, 366,  459, 490,  146, 177,  241, 272,  304, 335,  178, 209,
-  18,  18,   19,  50,   51,  82,   398, 429,  429, 460,  367, 398,  460, 491,
-  83,  114,  273, 304,  210, 241,  115, 146,  336, 367,  147, 178,  242, 273,
-  305, 336,  430, 461,  399, 430,  461, 492,  179, 210,  19,  19,   368, 399,
-  20,  51,   52,  83,   274, 305,  84,  115,  211, 242,  337, 368,  116, 147,
-  431, 462,  462, 493,  306, 337,  148, 179,  243, 274,  400, 431,  369, 400,
-  180, 211,  20,  20,   21,  52,   275, 306,  53,  84,   338, 369,  212, 243,
-  85,  116,  463, 494,  432, 463,  117, 148,  401, 432,  307, 338,  244, 275,
-  149, 180,  370, 401,  181, 212,  276, 307,  464, 495,  339, 370,  21,  21,
-  22,  53,   433, 464,  54,  85,   213, 244,  86,  117,  402, 433,  118, 149,
-  308, 339,  245, 276,  371, 402,  150, 181,  465, 496,  182, 213,  434, 465,
-  340, 371,  277, 308,  22,  22,   23,  54,   403, 434,  55,  86,   214, 245,
-  87,  118,  309, 340,  372, 403,  119, 150,  246, 277,  466, 497,  151, 182,
-  435, 466,  341, 372,  183, 214,  278, 309,  404, 435,  23,  23,   24,  55,
-  215, 246,  56,  87,   310, 341,  88,  119,  373, 404,  467, 498,  120, 151,
-  247, 278,  436, 467,  152, 183,  342, 373,  279, 310,  405, 436,  184, 215,
-  24,  24,   216, 247,  468, 499,  25,  56,   374, 405,  57,  88,   311, 342,
-  89,  120,  437, 468,  248, 279,  121, 152,  153, 184,  343, 374,  406, 437,
-  280, 311,  185, 216,  469, 500,  375, 406,  217, 248,  25,  25,   312, 343,
-  26,  57,   58,  89,   438, 469,  90,  121,  249, 280,  122, 153,  344, 375,
-  407, 438,  154, 185,  281, 312,  470, 501,  186, 217,  376, 407,  218, 249,
-  313, 344,  439, 470,  26,  26,   27,  58,   59,  90,   91,  122,  250, 281,
-  123, 154,  408, 439,  345, 376,  155, 186,  471, 502,  282, 313,  187, 218,
-  377, 408,  440, 471,  314, 345,  219, 250,  27,  27,   28,  59,   60,  91,
-  92,  123,  251, 282,  409, 440,  346, 377,  124, 155,  472, 503,  156, 187,
-  283, 314,  188, 219,  378, 409,  441, 472,  315, 346,  220, 251,  28,  28,
-  29,  60,   61,  92,   410, 441,  252, 283,  93,  124,  347, 378,  473, 504,
-  125, 156,  284, 315,  157, 188,  442, 473,  379, 410,  189, 220,  316, 347,
-  221, 252,  411, 442,  29,  29,   30,  61,   474, 505,  62,  93,   253, 284,
-  348, 379,  94,  125,  126, 157,  285, 316,  158, 189,  443, 474,  380, 411,
-  190, 221,  317, 348,  222, 253,  412, 443,  475, 506,  30,  30,   31,  62,
-  349, 380,  254, 285,  63,  94,   95,  126,  127, 158,  444, 475,  286, 317,
-  381, 412,  159, 190,  191, 222,  318, 349,  476, 507,  223, 254,  413, 444,
-  350, 381,  255, 286,  445, 476,  287, 318,  382, 413,  319, 350,  477, 508,
-  414, 445,  351, 382,  446, 477,  383, 414,  478, 509,  415, 446,  447, 478,
-  479, 510,  480, 480,  481, 512,  482, 513,  483, 514,  484, 515,  485, 516,
-  512, 512,  513, 544,  486, 517,  514, 545,  515, 546,  487, 518,  516, 547,
-  517, 548,  488, 519,  544, 544,  518, 549,  545, 576,  546, 577,  547, 578,
-  489, 520,  519, 550,  548, 579,  549, 580,  520, 551,  490, 521,  550, 581,
-  576, 576,  577, 608,  578, 609,  521, 552,  579, 610,  551, 582,  491, 522,
-  580, 611,  581, 612,  552, 583,  522, 553,  582, 613,  492, 523,  608, 608,
-  609, 640,  610, 641,  553, 584,  611, 642,  523, 554,  583, 614,  612, 643,
-  554, 585,  493, 524,  584, 615,  613, 644,  524, 555,  614, 645,  640, 640,
-  585, 616,  641, 672,  555, 586,  642, 673,  615, 646,  643, 674,  494, 525,
-  644, 675,  525, 556,  586, 617,  616, 647,  645, 676,  556, 587,  646, 677,
-  495, 526,  617, 648,  587, 618,  672, 672,  526, 557,  673, 704,  674, 705,
-  647, 678,  557, 588,  675, 706,  618, 649,  676, 707,  588, 619,  648, 679,
-  677, 708,  496, 527,  527, 558,  558, 589,  678, 709,  619, 650,  649, 680,
-  704, 704,  589, 620,  705, 736,  679, 710,  706, 737,  707, 738,  650, 681,
-  620, 651,  497, 528,  528, 559,  708, 739,  680, 711,  559, 590,  709, 740,
-  590, 621,  651, 682,  681, 712,  710, 741,  621, 652,  736, 736,  737, 768,
-  529, 560,  711, 742,  498, 529,  560, 591,  738, 769,  682, 713,  652, 683,
-  739, 770,  591, 622,  740, 771,  712, 743,  622, 653,  741, 772,  683, 714,
-  653, 684,  713, 744,  742, 773,  530, 561,  561, 592,  499, 530,  592, 623,
-  623, 654,  743, 774,  768, 768,  769, 800,  684, 715,  714, 745,  770, 801,
-  771, 802,  654, 685,  744, 775,  772, 803,  562, 593,  531, 562,  593, 624,
-  715, 746,  773, 804,  685, 716,  500, 531,  624, 655,  745, 776,  774, 805,
-  655, 686,  716, 747,  775, 806,  746, 777,  800, 800,  801, 832,  686, 717,
-  802, 833,  563, 594,  594, 625,  803, 834,  532, 563,  625, 656,  776, 807,
-  804, 835,  501, 532,  656, 687,  747, 778,  717, 748,  805, 836,  777, 808,
-  687, 718,  806, 837,  748, 779,  595, 626,  564, 595,  626, 657,  718, 749,
-  778, 809,  807, 838,  832, 832,  533, 564,  657, 688,  833, 864,  834, 865,
-  835, 866,  502, 533,  688, 719,  808, 839,  749, 780,  836, 867,  779, 810,
-  719, 750,  837, 868,  809, 840,  596, 627,  627, 658,  565, 596,  658, 689,
-  838, 869,  780, 811,  750, 781,  534, 565,  689, 720,  810, 841,  839, 870,
-  864, 864,  503, 534,  720, 751,  865, 896,  866, 897,  840, 871,  867, 898,
-  781, 812,  811, 842,  628, 659,  868, 899,  751, 782,  597, 628,  659, 690,
-  566, 597,  690, 721,  869, 900,  841, 872,  535, 566,  721, 752,  812, 843,
-  870, 901,  782, 813,  842, 873,  504, 535,  752, 783,  871, 902,  629, 660,
-  660, 691,  896, 896,  897, 928,  598, 629,  691, 722,  813, 844,  898, 929,
-  872, 903,  783, 814,  843, 874,  899, 930,  567, 598,  722, 753,  900, 931,
-  536, 567,  753, 784,  873, 904,  901, 932,  814, 845,  844, 875,  902, 933,
-  505, 536,  784, 815,  661, 692,  630, 661,  692, 723,  874, 905,  599, 630,
-  723, 754,  903, 934,  845, 876,  568, 599,  754, 785,  928, 928,  815, 846,
-  929, 960,  930, 961,  875, 906,  904, 935,  931, 962,  537, 568,  785, 816,
-  932, 963,  905, 936,  662, 693,  693, 724,  846, 877,  933, 964,  876, 907,
-  631, 662,  724, 755,  506, 537,  816, 847,  934, 965,  600, 631,  755, 786,
-  906, 937,  569, 600,  786, 817,  935, 966,  877, 908,  847, 878,  960, 960,
-  907, 938,  961, 992,  936, 967,  538, 569,  817, 848,  962, 993,  694, 725,
-  663, 694,  725, 756,  963, 994,  632, 663,  756, 787,  964, 995,  878, 909,
-  937, 968,  507, 538,  848, 879,  908, 939,  601, 632,  787, 818,  965, 996,
-  966, 997,  570, 601,  818, 849,  938, 969,  879, 910,  909, 940,  967, 998,
-  695, 726,  726, 757,  664, 695,  757, 788,  539, 570,  849, 880,  939, 970,
-  633, 664,  788, 819,  968, 999,  602, 633,  819, 850,  910, 941,  508, 539,
-  880, 911,  969, 1000, 940, 971,  571, 602,  850, 881,  727, 758,  696, 727,
-  758, 789,  970, 1001, 665, 696,  789, 820,  911, 942,  941, 972,  540, 571,
-  881, 912,  634, 665,  820, 851,  971, 1002, 603, 634,  851, 882,  942, 973,
-  509, 540,  912, 943,  728, 759,  759, 790,  972, 1003, 572, 603,  882, 913,
-  697, 728,  790, 821,  666, 697,  821, 852,  943, 974,  635, 666,  852, 883,
-  541, 572,  913, 944,  973, 1004, 604, 635,  883, 914,  760, 791,  729, 760,
-  791, 822,  510, 541,  944, 975,  974, 1005, 698, 729,  822, 853,  573, 604,
-  914, 945,  667, 698,  853, 884,  636, 667,  884, 915,  975, 1006, 542, 573,
-  945, 976,  761, 792,  792, 823,  605, 636,  915, 946,  730, 761,  823, 854,
-  699, 730,  854, 885,  511, 542,  976, 1007, 574, 605,  946, 977,  668, 699,
-  885, 916,  637, 668,  916, 947,  543, 574,  793, 824,  977, 1008, 762, 793,
-  824, 855,  731, 762,  855, 886,  606, 637,  947, 978,  700, 731,  886, 917,
-  669, 700,  917, 948,  575, 606,  978, 1009, 638, 669,  948, 979,  794, 825,
-  825, 856,  763, 794,  856, 887,  732, 763,  887, 918,  607, 638,  979, 1010,
-  701, 732,  918, 949,  670, 701,  949, 980,  826, 857,  795, 826,  857, 888,
-  764, 795,  888, 919,  639, 670,  980, 1011, 733, 764,  919, 950,  702, 733,
-  950, 981,  671, 702,  981, 1012, 827, 858,  858, 889,  796, 827,  889, 920,
-  765, 796,  920, 951,  734, 765,  951, 982,  703, 734,  982, 1013, 859, 890,
-  828, 859,  890, 921,  797, 828,  921, 952,  766, 797,  952, 983,  735, 766,
-  983, 1014, 860, 891,  891, 922,  829, 860,  922, 953,  798, 829,  953, 984,
-  767, 798,  984, 1015, 892, 923,  861, 892,  923, 954,  830, 861,  954, 985,
-  799, 830,  985, 1016, 893, 924,  924, 955,  862, 893,  955, 986,  831, 862,
-  986, 1017, 925, 956,  894, 925,  956, 987,  863, 894,  987, 1018, 926, 957,
-  957, 988,  895, 926,  988, 1019, 958, 989,  927, 958,  989, 1020, 959, 990,
-  990, 1021, 991, 1022, 0,   0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                qtr_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
-  0,   0,    0,   0,    0,   0,    1,   32,   1,   1,    32,  32,   2,   33,
-  33,  64,   34,  65,   2,   2,    64,  64,   3,   34,   65,  96,   35,  66,
-  66,  97,   3,   3,    96,  96,   4,   35,   97,  128,  67,  98,   36,  67,
-  98,  129,  4,   4,    68,  99,   99,  130,  128, 128,  5,   36,   129, 160,
-  37,  68,   130, 161,  100, 131,  69,  100,  131, 162,  5,   5,    160, 160,
-  6,   37,   161, 192,  38,  69,   162, 193,  101, 132,  132, 163,  70,  101,
-  163, 194,  6,   6,    192, 192,  7,   38,   133, 164,  193, 224,  102, 133,
-  164, 195,  39,  70,   194, 225,  71,  102,  195, 226,  134, 165,  165, 196,
-  7,   7,    224, 224,  8,   39,   103, 134,  196, 227,  225, 256,  40,  71,
-  226, 257,  166, 197,  72,  103,  227, 258,  135, 166,  197, 228,  104, 135,
-  228, 259,  8,   8,    256, 256,  9,   40,   257, 288,  41,  72,   167, 198,
-  198, 229,  258, 289,  136, 167,  229, 260,  73,  104,  259, 290,  105, 136,
-  260, 291,  199, 230,  9,   9,    168, 199,  230, 261,  288, 288,  10,  41,
-  289, 320,  42,  73,   290, 321,  137, 168,  261, 292,  74,  105,  291, 322,
-  200, 231,  231, 262,  106, 137,  292, 323,  169, 200,  262, 293,  10,  10,
-  320, 320,  11,  42,   321, 352,  43,  74,   138, 169,  293, 324,  322, 353,
-  232, 263,  75,  106,  201, 232,  263, 294,  323, 354,  170, 201,  294, 325,
-  107, 138,  324, 355,  11,  11,   352, 352,  12,  43,   233, 264,  264, 295,
-  353, 384,  139, 170,  325, 356,  44,  75,   354, 385,  202, 233,  295, 326,
-  76,  107,  355, 386,  171, 202,  326, 357,  108, 139,  356, 387,  265, 296,
-  234, 265,  296, 327,  12,  12,   140, 171,  357, 388,  384, 384,  13,  44,
-  203, 234,  327, 358,  385, 416,  45,  76,   386, 417,  77,  108,  387, 418,
-  172, 203,  358, 389,  266, 297,  297, 328,  109, 140,  235, 266,  328, 359,
-  388, 419,  204, 235,  359, 390,  141, 172,  389, 420,  13,  13,   416, 416,
-  14,  45,   417, 448,  46,  77,   298, 329,  418, 449,  267, 298,  329, 360,
-  78,  109,  173, 204,  390, 421,  419, 450,  236, 267,  360, 391,  110, 141,
-  420, 451,  205, 236,  391, 422,  142, 173,  299, 330,  330, 361,  421, 452,
-  14,  14,   268, 299,  361, 392,  448, 448,  15,  46,   449, 480,  47,  78,
-  450, 481,  174, 205,  422, 453,  237, 268,  392, 423,  79,  110,  451, 482,
-  111, 142,  452, 483,  331, 362,  300, 331,  362, 393,  206, 237,  423, 454,
-  143, 174,  269, 300,  393, 424,  453, 484,  238, 269,  424, 455,  175, 206,
-  454, 485,  332, 363,  363, 394,  301, 332,  394, 425,  207, 238,  455, 486,
-  270, 301,  425, 456,  364, 395,  239, 270,  456, 487,  333, 364,  395, 426,
-  302, 333,  426, 457,  271, 302,  457, 488,  365, 396,  396, 427,  334, 365,
-  427, 458,  303, 334,  458, 489,  397, 428,  366, 397,  428, 459,  335, 366,
-  459, 490,  398, 429,  429, 460,  367, 398,  460, 491,  430, 461,  399, 430,
-  461, 492,  431, 462,  462, 493,  463, 494,  15,  15,   480, 480,  16,  47,
-  481, 512,  48,  79,   482, 513,  80,  111,  483, 514,  112, 143,  484, 515,
-  144, 175,  485, 516,  16,  16,   512, 512,  17,  48,   513, 544,  176, 207,
-  486, 517,  49,  80,   514, 545,  81,  112,  515, 546,  113, 144,  208, 239,
-  487, 518,  516, 547,  145, 176,  517, 548,  240, 271,  488, 519,  17,  17,
-  544, 544,  18,  49,   177, 208,  518, 549,  545, 576,  50,  81,   546, 577,
-  82,  113,  547, 578,  272, 303,  489, 520,  209, 240,  519, 550,  114, 145,
-  548, 579,  146, 177,  549, 580,  241, 272,  520, 551,  304, 335,  490, 521,
-  178, 209,  550, 581,  18,  18,   576, 576,  19,  50,   577, 608,  51,  82,
-  578, 609,  83,  114,  273, 304,  521, 552,  579, 610,  210, 241,  551, 582,
-  115, 146,  336, 367,  491, 522,  580, 611,  147, 178,  581, 612,  242, 273,
-  552, 583,  305, 336,  522, 553,  179, 210,  582, 613,  19,  19,   368, 399,
-  492, 523,  608, 608,  20,  51,   609, 640,  52,  83,   610, 641,  274, 305,
-  553, 584,  84,  115,  611, 642,  211, 242,  337, 368,  523, 554,  583, 614,
-  116, 147,  612, 643,  306, 337,  554, 585,  148, 179,  243, 274,  400, 431,
-  493, 524,  584, 615,  613, 644,  369, 400,  524, 555,  180, 211,  614, 645,
-  20,  20,   640, 640,  21,  52,   275, 306,  585, 616,  641, 672,  53,  84,
-  338, 369,  555, 586,  642, 673,  212, 243,  615, 646,  85,  116,  643, 674,
-  432, 463,  494, 525,  117, 148,  644, 675,  401, 432,  525, 556,  307, 338,
-  586, 617,  244, 275,  616, 647,  149, 180,  645, 676,  370, 401,  556, 587,
-  181, 212,  646, 677,  276, 307,  464, 495,  495, 526,  617, 648,  339, 370,
-  587, 618,  21,  21,   672, 672,  22,  53,   433, 464,  526, 557,  673, 704,
-  54,  85,   674, 705,  213, 244,  647, 678,  86,  117,  402, 433,  557, 588,
-  675, 706,  118, 149,  308, 339,  618, 649,  676, 707,  245, 276,  371, 402,
-  588, 619,  648, 679,  150, 181,  677, 708,  496, 527,  465, 496,  527, 558,
-  182, 213,  434, 465,  558, 589,  678, 709,  340, 371,  619, 650,  277, 308,
-  649, 680,  22,  22,   704, 704,  23,  54,   403, 434,  589, 620,  705, 736,
-  55,  86,   214, 245,  679, 710,  706, 737,  87,  118,  707, 738,  309, 340,
-  650, 681,  372, 403,  620, 651,  119, 150,  497, 528,  528, 559,  708, 739,
-  246, 277,  680, 711,  466, 497,  559, 590,  151, 182,  709, 740,  435, 466,
-  590, 621,  341, 372,  651, 682,  183, 214,  278, 309,  681, 712,  710, 741,
-  404, 435,  621, 652,  23,  23,   736, 736,  24,  55,   737, 768,  215, 246,
-  529, 560,  711, 742,  56,  87,   498, 529,  560, 591,  738, 769,  310, 341,
-  682, 713,  88,  119,  373, 404,  652, 683,  739, 770,  467, 498,  591, 622,
-  120, 151,  740, 771,  247, 278,  712, 743,  436, 467,  622, 653,  152, 183,
-  741, 772,  342, 373,  683, 714,  279, 310,  405, 436,  653, 684,  713, 744,
-  184, 215,  742, 773,  530, 561,  561, 592,  499, 530,  592, 623,  24,  24,
-  216, 247,  468, 499,  623, 654,  743, 774,  768, 768,  25,  56,   769, 800,
-  374, 405,  684, 715,  57,  88,   311, 342,  714, 745,  770, 801,  89,  120,
-  771, 802,  437, 468,  654, 685,  248, 279,  744, 775,  121, 152,  772, 803,
-  562, 593,  153, 184,  343, 374,  531, 562,  593, 624,  715, 746,  773, 804,
-  406, 437,  685, 716,  500, 531,  624, 655,  280, 311,  745, 776,  185, 216,
-  774, 805,  469, 500,  655, 686,  375, 406,  716, 747,  217, 248,  775, 806,
-  25,  25,   312, 343,  746, 777,  800, 800,  26,  57,   801, 832,  58,  89,
-  438, 469,  686, 717,  802, 833,  90,  121,  563, 594,  594, 625,  803, 834,
-  249, 280,  532, 563,  625, 656,  776, 807,  122, 153,  804, 835,  344, 375,
-  501, 532,  656, 687,  747, 778,  407, 438,  717, 748,  154, 185,  805, 836,
-  281, 312,  777, 808,  470, 501,  687, 718,  186, 217,  806, 837,  376, 407,
-  748, 779,  595, 626,  564, 595,  626, 657,  218, 249,  313, 344,  439, 470,
-  718, 749,  778, 809,  807, 838,  26,  26,   832, 832,  27,  58,   533, 564,
-  657, 688,  833, 864,  59,  90,   834, 865,  91,  122,  835, 866,  250, 281,
-  502, 533,  688, 719,  808, 839,  123, 154,  408, 439,  749, 780,  836, 867,
-  345, 376,  779, 810,  155, 186,  471, 502,  719, 750,  837, 868,  282, 313,
-  809, 840,  596, 627,  627, 658,  187, 218,  565, 596,  658, 689,  838, 869,
-  377, 408,  780, 811,  440, 471,  750, 781,  534, 565,  689, 720,  314, 345,
-  810, 841,  219, 250,  839, 870,  27,  27,   864, 864,  28,  59,   503, 534,
-  720, 751,  865, 896,  60,  91,   866, 897,  92,  123,  251, 282,  840, 871,
-  867, 898,  409, 440,  781, 812,  346, 377,  811, 842,  124, 155,  628, 659,
-  868, 899,  472, 503,  751, 782,  597, 628,  659, 690,  566, 597,  690, 721,
-  156, 187,  869, 900,  283, 314,  841, 872,  535, 566,  721, 752,  188, 219,
-  378, 409,  812, 843,  870, 901,  441, 472,  782, 813,  315, 346,  842, 873,
-  504, 535,  752, 783,  220, 251,  871, 902,  28,  28,   629, 660,  660, 691,
-  896, 896,  29,  60,   897, 928,  61,  92,   410, 441,  598, 629,  691, 722,
-  813, 844,  898, 929,  252, 283,  872, 903,  93,  124,  347, 378,  473, 504,
-  783, 814,  843, 874,  899, 930,  567, 598,  722, 753,  125, 156,  900, 931,
-  284, 315,  536, 567,  753, 784,  873, 904,  157, 188,  901, 932,  442, 473,
-  814, 845,  379, 410,  844, 875,  189, 220,  902, 933,  505, 536,  784, 815,
-  661, 692,  316, 347,  630, 661,  692, 723,  874, 905,  221, 252,  599, 630,
-  723, 754,  903, 934,  411, 442,  845, 876,  29,  29,   568, 599,  754, 785,
-  928, 928,  30,  61,   474, 505,  815, 846,  929, 960,  62,  93,   930, 961,
-  253, 284,  348, 379,  875, 906,  904, 935,  94,  125,  931, 962,  537, 568,
-  785, 816,  126, 157,  932, 963,  285, 316,  905, 936,  158, 189,  443, 474,
-  662, 693,  693, 724,  846, 877,  933, 964,  380, 411,  876, 907,  631, 662,
-  724, 755,  506, 537,  816, 847,  190, 221,  934, 965,  600, 631,  755, 786,
-  317, 348,  906, 937,  222, 253,  569, 600,  786, 817,  935, 966,  412, 443,
-  877, 908,  475, 506,  847, 878,  30,  30,   960, 960,  31,  62,   349, 380,
-  907, 938,  961, 992,  254, 285,  936, 967,  63,  94,   538, 569,  817, 848,
-  962, 993,  694, 725,  95,  126,  663, 694,  725, 756,  963, 994,  632, 663,
-  756, 787,  127, 158,  964, 995,  444, 475,  878, 909,  286, 317,  937, 968,
-  381, 412,  507, 538,  848, 879,  908, 939,  159, 190,  601, 632,  787, 818,
-  965, 996,  191, 222,  966, 997,  318, 349,  570, 601,  818, 849,  938, 969,
-  476, 507,  879, 910,  223, 254,  413, 444,  909, 940,  967, 998,  695, 726,
-  726, 757,  664, 695,  757, 788,  539, 570,  849, 880,  350, 381,  939, 970,
-  255, 286,  633, 664,  788, 819,  968, 999,  445, 476,  602, 633,  819, 850,
-  910, 941,  508, 539,  880, 911,  287, 318,  969, 1000, 382, 413,  940, 971,
-  571, 602,  850, 881,  727, 758,  696, 727,  758, 789,  319, 350,  970, 1001,
-  477, 508,  665, 696,  789, 820,  911, 942,  414, 445,  941, 972,  540, 571,
-  881, 912,  634, 665,  820, 851,  351, 382,  971, 1002, 603, 634,  851, 882,
-  446, 477,  942, 973,  509, 540,  912, 943,  383, 414,  728, 759,  759, 790,
-  972, 1003, 572, 603,  882, 913,  697, 728,  790, 821,  666, 697,  821, 852,
-  478, 509,  943, 974,  635, 666,  852, 883,  415, 446,  541, 572,  913, 944,
-  973, 1004, 604, 635,  883, 914,  760, 791,  729, 760,  791, 822,  510, 541,
-  944, 975,  447, 478,  974, 1005, 698, 729,  822, 853,  573, 604,  914, 945,
-  667, 698,  853, 884,  636, 667,  884, 915,  479, 510,  975, 1006, 542, 573,
-  945, 976,  761, 792,  792, 823,  605, 636,  915, 946,  730, 761,  823, 854,
-  699, 730,  854, 885,  511, 542,  976, 1007, 574, 605,  946, 977,  668, 699,
-  885, 916,  637, 668,  916, 947,  543, 574,  793, 824,  977, 1008, 762, 793,
-  824, 855,  731, 762,  855, 886,  606, 637,  947, 978,  700, 731,  886, 917,
-  669, 700,  917, 948,  575, 606,  978, 1009, 638, 669,  948, 979,  794, 825,
-  825, 856,  763, 794,  856, 887,  732, 763,  887, 918,  607, 638,  979, 1010,
-  701, 732,  918, 949,  670, 701,  949, 980,  826, 857,  795, 826,  857, 888,
-  764, 795,  888, 919,  639, 670,  980, 1011, 733, 764,  919, 950,  702, 733,
-  950, 981,  671, 702,  981, 1012, 827, 858,  858, 889,  796, 827,  889, 920,
-  765, 796,  920, 951,  734, 765,  951, 982,  703, 734,  982, 1013, 859, 890,
-  828, 859,  890, 921,  797, 828,  921, 952,  766, 797,  952, 983,  735, 766,
-  983, 1014, 860, 891,  891, 922,  829, 860,  922, 953,  798, 829,  953, 984,
-  767, 798,  984, 1015, 892, 923,  861, 892,  923, 954,  830, 861,  954, 985,
-  799, 830,  985, 1016, 893, 924,  924, 955,  862, 893,  955, 986,  831, 862,
-  986, 1017, 925, 956,  894, 925,  956, 987,  863, 894,  987, 1018, 926, 957,
-  957, 988,  895, 926,  988, 1019, 958, 989,  927, 958,  989, 1020, 959, 990,
+  0,   0,    0,   0,    0,   0,    32,  32,   1,   32,  1,   1,    2,   2,
+  2,   33,   33,  64,   64,  64,   96,  96,   65,  96,  34,  65,   3,   34,
+  3,   3,    4,   4,    4,   35,   35,  66,   66,  97,  97,  128,  128, 128,
+  160, 160,  129, 160,  98,  129,  67,  98,   36,  67,  5,   36,   5,   5,
+  6,   6,    6,   37,   37,  68,   68,  99,   99,  130, 130, 161,  161, 192,
+  192, 192,  224, 224,  193, 224,  162, 193,  131, 162, 100, 131,  69,  100,
+  38,  69,   7,   38,   7,   7,    8,   8,    8,   39,  39,  70,   70,  101,
+  101, 132,  132, 163,  163, 194,  194, 225,  225, 256, 256, 256,  288, 288,
+  257, 288,  226, 257,  195, 226,  164, 195,  133, 164, 102, 133,  71,  102,
+  40,  71,   9,   40,   9,   9,    10,  10,   10,  41,  41,  72,   72,  103,
+  103, 134,  134, 165,  165, 196,  196, 227,  227, 258, 258, 289,  289, 320,
+  320, 320,  352, 352,  321, 352,  290, 321,  259, 290, 228, 259,  197, 228,
+  166, 197,  135, 166,  104, 135,  73,  104,  42,  73,  11,  42,   11,  11,
+  12,  12,   12,  43,   43,  74,   74,  105,  105, 136, 136, 167,  167, 198,
+  198, 229,  229, 260,  260, 291,  291, 322,  322, 353, 353, 384,  384, 384,
+  416, 416,  385, 416,  354, 385,  323, 354,  292, 323, 261, 292,  230, 261,
+  199, 230,  168, 199,  137, 168,  106, 137,  75,  106, 44,  75,   13,  44,
+  13,  13,   14,  14,   14,  45,   45,  76,   76,  107, 107, 138,  138, 169,
+  169, 200,  200, 231,  231, 262,  262, 293,  293, 324, 324, 355,  355, 386,
+  386, 417,  417, 448,  448, 448,  480, 480,  449, 480, 418, 449,  387, 418,
+  356, 387,  325, 356,  294, 325,  263, 294,  232, 263, 201, 232,  170, 201,
+  139, 170,  108, 139,  77,  108,  46,  77,   15,  46,  15,  15,   16,  16,
+  16,  47,   47,  78,   78,  109,  109, 140,  140, 171, 171, 202,  202, 233,
+  233, 264,  264, 295,  295, 326,  326, 357,  357, 388, 388, 419,  419, 450,
+  450, 481,  481, 512,  512, 512,  544, 544,  513, 544, 482, 513,  451, 482,
+  420, 451,  389, 420,  358, 389,  327, 358,  296, 327, 265, 296,  234, 265,
+  203, 234,  172, 203,  141, 172,  110, 141,  79,  110, 48,  79,   17,  48,
+  17,  17,   18,  18,   18,  49,   49,  80,   80,  111, 111, 142,  142, 173,
+  173, 204,  204, 235,  235, 266,  266, 297,  297, 328, 328, 359,  359, 390,
+  390, 421,  421, 452,  452, 483,  483, 514,  514, 545, 545, 576,  576, 576,
+  608, 608,  577, 608,  546, 577,  515, 546,  484, 515, 453, 484,  422, 453,
+  391, 422,  360, 391,  329, 360,  298, 329,  267, 298, 236, 267,  205, 236,
+  174, 205,  143, 174,  112, 143,  81,  112,  50,  81,  19,  50,   19,  19,
+  20,  20,   20,  51,   51,  82,   82,  113,  113, 144, 144, 175,  175, 206,
+  206, 237,  237, 268,  268, 299,  299, 330,  330, 361, 361, 392,  392, 423,
+  423, 454,  454, 485,  485, 516,  516, 547,  547, 578, 578, 609,  609, 640,
+  640, 640,  672, 672,  641, 672,  610, 641,  579, 610, 548, 579,  517, 548,
+  486, 517,  455, 486,  424, 455,  393, 424,  362, 393, 331, 362,  300, 331,
+  269, 300,  238, 269,  207, 238,  176, 207,  145, 176, 114, 145,  83,  114,
+  52,  83,   21,  52,   21,  21,   22,  22,   22,  53,  53,  84,   84,  115,
+  115, 146,  146, 177,  177, 208,  208, 239,  239, 270, 270, 301,  301, 332,
+  332, 363,  363, 394,  394, 425,  425, 456,  456, 487, 487, 518,  518, 549,
+  549, 580,  580, 611,  611, 642,  642, 673,  673, 704, 704, 704,  736, 736,
+  705, 736,  674, 705,  643, 674,  612, 643,  581, 612, 550, 581,  519, 550,
+  488, 519,  457, 488,  426, 457,  395, 426,  364, 395, 333, 364,  302, 333,
+  271, 302,  240, 271,  209, 240,  178, 209,  147, 178, 116, 147,  85,  116,
+  54,  85,   23,  54,   23,  23,   24,  24,   24,  55,  55,  86,   86,  117,
+  117, 148,  148, 179,  179, 210,  210, 241,  241, 272, 272, 303,  303, 334,
+  334, 365,  365, 396,  396, 427,  427, 458,  458, 489, 489, 520,  520, 551,
+  551, 582,  582, 613,  613, 644,  644, 675,  675, 706, 706, 737,  737, 768,
+  768, 768,  800, 800,  769, 800,  738, 769,  707, 738, 676, 707,  645, 676,
+  614, 645,  583, 614,  552, 583,  521, 552,  490, 521, 459, 490,  428, 459,
+  397, 428,  366, 397,  335, 366,  304, 335,  273, 304, 242, 273,  211, 242,
+  180, 211,  149, 180,  118, 149,  87,  118,  56,  87,  25,  56,   25,  25,
+  26,  26,   26,  57,   57,  88,   88,  119,  119, 150, 150, 181,  181, 212,
+  212, 243,  243, 274,  274, 305,  305, 336,  336, 367, 367, 398,  398, 429,
+  429, 460,  460, 491,  491, 522,  522, 553,  553, 584, 584, 615,  615, 646,
+  646, 677,  677, 708,  708, 739,  739, 770,  770, 801, 801, 832,  832, 832,
+  864, 864,  833, 864,  802, 833,  771, 802,  740, 771, 709, 740,  678, 709,
+  647, 678,  616, 647,  585, 616,  554, 585,  523, 554, 492, 523,  461, 492,
+  430, 461,  399, 430,  368, 399,  337, 368,  306, 337, 275, 306,  244, 275,
+  213, 244,  182, 213,  151, 182,  120, 151,  89,  120, 58,  89,   27,  58,
+  27,  27,   28,  28,   28,  59,   59,  90,   90,  121, 121, 152,  152, 183,
+  183, 214,  214, 245,  245, 276,  276, 307,  307, 338, 338, 369,  369, 400,
+  400, 431,  431, 462,  462, 493,  493, 524,  524, 555, 555, 586,  586, 617,
+  617, 648,  648, 679,  679, 710,  710, 741,  741, 772, 772, 803,  803, 834,
+  834, 865,  865, 896,  896, 896,  928, 928,  897, 928, 866, 897,  835, 866,
+  804, 835,  773, 804,  742, 773,  711, 742,  680, 711, 649, 680,  618, 649,
+  587, 618,  556, 587,  525, 556,  494, 525,  463, 494, 432, 463,  401, 432,
+  370, 401,  339, 370,  308, 339,  277, 308,  246, 277, 215, 246,  184, 215,
+  153, 184,  122, 153,  91,  122,  60,  91,   29,  60,  29,  29,   30,  30,
+  30,  61,   61,  92,   92,  123,  123, 154,  154, 185, 185, 216,  216, 247,
+  247, 278,  278, 309,  309, 340,  340, 371,  371, 402, 402, 433,  433, 464,
+  464, 495,  495, 526,  526, 557,  557, 588,  588, 619, 619, 650,  650, 681,
+  681, 712,  712, 743,  743, 774,  774, 805,  805, 836, 836, 867,  867, 898,
+  898, 929,  929, 960,  960, 960,  961, 992,  930, 961, 899, 930,  868, 899,
+  837, 868,  806, 837,  775, 806,  744, 775,  713, 744, 682, 713,  651, 682,
+  620, 651,  589, 620,  558, 589,  527, 558,  496, 527, 465, 496,  434, 465,
+  403, 434,  372, 403,  341, 372,  310, 341,  279, 310, 248, 279,  217, 248,
+  186, 217,  155, 186,  124, 155,  93,  124,  62,  93,  31,  62,   63,  94,
+  94,  125,  125, 156,  156, 187,  187, 218,  218, 249, 249, 280,  280, 311,
+  311, 342,  342, 373,  373, 404,  404, 435,  435, 466, 466, 497,  497, 528,
+  528, 559,  559, 590,  590, 621,  621, 652,  652, 683, 683, 714,  714, 745,
+  745, 776,  776, 807,  807, 838,  838, 869,  869, 900, 900, 931,  931, 962,
+  962, 993,  963, 994,  932, 963,  901, 932,  870, 901, 839, 870,  808, 839,
+  777, 808,  746, 777,  715, 746,  684, 715,  653, 684, 622, 653,  591, 622,
+  560, 591,  529, 560,  498, 529,  467, 498,  436, 467, 405, 436,  374, 405,
+  343, 374,  312, 343,  281, 312,  250, 281,  219, 250, 188, 219,  157, 188,
+  126, 157,  95,  126,  127, 158,  158, 189,  189, 220, 220, 251,  251, 282,
+  282, 313,  313, 344,  344, 375,  375, 406,  406, 437, 437, 468,  468, 499,
+  499, 530,  530, 561,  561, 592,  592, 623,  623, 654, 654, 685,  685, 716,
+  716, 747,  747, 778,  778, 809,  809, 840,  840, 871, 871, 902,  902, 933,
+  933, 964,  964, 995,  965, 996,  934, 965,  903, 934, 872, 903,  841, 872,
+  810, 841,  779, 810,  748, 779,  717, 748,  686, 717, 655, 686,  624, 655,
+  593, 624,  562, 593,  531, 562,  500, 531,  469, 500, 438, 469,  407, 438,
+  376, 407,  345, 376,  314, 345,  283, 314,  252, 283, 221, 252,  190, 221,
+  159, 190,  191, 222,  222, 253,  253, 284,  284, 315, 315, 346,  346, 377,
+  377, 408,  408, 439,  439, 470,  470, 501,  501, 532, 532, 563,  563, 594,
+  594, 625,  625, 656,  656, 687,  687, 718,  718, 749, 749, 780,  780, 811,
+  811, 842,  842, 873,  873, 904,  904, 935,  935, 966, 966, 997,  967, 998,
+  936, 967,  905, 936,  874, 905,  843, 874,  812, 843, 781, 812,  750, 781,
+  719, 750,  688, 719,  657, 688,  626, 657,  595, 626, 564, 595,  533, 564,
+  502, 533,  471, 502,  440, 471,  409, 440,  378, 409, 347, 378,  316, 347,
+  285, 316,  254, 285,  223, 254,  255, 286,  286, 317, 317, 348,  348, 379,
+  379, 410,  410, 441,  441, 472,  472, 503,  503, 534, 534, 565,  565, 596,
+  596, 627,  627, 658,  658, 689,  689, 720,  720, 751, 751, 782,  782, 813,
+  813, 844,  844, 875,  875, 906,  906, 937,  937, 968, 968, 999,  969, 1000,
+  938, 969,  907, 938,  876, 907,  845, 876,  814, 845, 783, 814,  752, 783,
+  721, 752,  690, 721,  659, 690,  628, 659,  597, 628, 566, 597,  535, 566,
+  504, 535,  473, 504,  442, 473,  411, 442,  380, 411, 349, 380,  318, 349,
+  287, 318,  319, 350,  350, 381,  381, 412,  412, 443, 443, 474,  474, 505,
+  505, 536,  536, 567,  567, 598,  598, 629,  629, 660, 660, 691,  691, 722,
+  722, 753,  753, 784,  784, 815,  815, 846,  846, 877, 877, 908,  908, 939,
+  939, 970,  970, 1001, 971, 1002, 940, 971,  909, 940, 878, 909,  847, 878,
+  816, 847,  785, 816,  754, 785,  723, 754,  692, 723, 661, 692,  630, 661,
+  599, 630,  568, 599,  537, 568,  506, 537,  475, 506, 444, 475,  413, 444,
+  382, 413,  351, 382,  383, 414,  414, 445,  445, 476, 476, 507,  507, 538,
+  538, 569,  569, 600,  600, 631,  631, 662,  662, 693, 693, 724,  724, 755,
+  755, 786,  786, 817,  817, 848,  848, 879,  879, 910, 910, 941,  941, 972,
+  972, 1003, 973, 1004, 942, 973,  911, 942,  880, 911, 849, 880,  818, 849,
+  787, 818,  756, 787,  725, 756,  694, 725,  663, 694, 632, 663,  601, 632,
+  570, 601,  539, 570,  508, 539,  477, 508,  446, 477, 415, 446,  447, 478,
+  478, 509,  509, 540,  540, 571,  571, 602,  602, 633, 633, 664,  664, 695,
+  695, 726,  726, 757,  757, 788,  788, 819,  819, 850, 850, 881,  881, 912,
+  912, 943,  943, 974,  974, 1005, 975, 1006, 944, 975, 913, 944,  882, 913,
+  851, 882,  820, 851,  789, 820,  758, 789,  727, 758, 696, 727,  665, 696,
+  634, 665,  603, 634,  572, 603,  541, 572,  510, 541, 479, 510,  511, 542,
+  542, 573,  573, 604,  604, 635,  635, 666,  666, 697, 697, 728,  728, 759,
+  759, 790,  790, 821,  821, 852,  852, 883,  883, 914, 914, 945,  945, 976,
+  976, 1007, 977, 1008, 946, 977,  915, 946,  884, 915, 853, 884,  822, 853,
+  791, 822,  760, 791,  729, 760,  698, 729,  667, 698, 636, 667,  605, 636,
+  574, 605,  543, 574,  575, 606,  606, 637,  637, 668, 668, 699,  699, 730,
+  730, 761,  761, 792,  792, 823,  823, 854,  854, 885, 885, 916,  916, 947,
+  947, 978,  978, 1009, 979, 1010, 948, 979,  917, 948, 886, 917,  855, 886,
+  824, 855,  793, 824,  762, 793,  731, 762,  700, 731, 669, 700,  638, 669,
+  607, 638,  639, 670,  670, 701,  701, 732,  732, 763, 763, 794,  794, 825,
+  825, 856,  856, 887,  887, 918,  918, 949,  949, 980, 980, 1011, 981, 1012,
+  950, 981,  919, 950,  888, 919,  857, 888,  826, 857, 795, 826,  764, 795,
+  733, 764,  702, 733,  671, 702,  703, 734,  734, 765, 765, 796,  796, 827,
+  827, 858,  858, 889,  889, 920,  920, 951,  951, 982, 982, 1013, 983, 1014,
+  952, 983,  921, 952,  890, 921,  859, 890,  828, 859, 797, 828,  766, 797,
+  735, 766,  767, 798,  798, 829,  829, 860,  860, 891, 891, 922,  922, 953,
+  953, 984,  984, 1015, 985, 1016, 954, 985,  923, 954, 892, 923,  861, 892,
+  830, 861,  799, 830,  831, 862,  862, 893,  893, 924, 924, 955,  955, 986,
+  986, 1017, 987, 1018, 956, 987,  925, 956,  894, 925, 863, 894,  895, 926,
+  926, 957,  957, 988,  988, 1019, 989, 1020, 958, 989, 927, 958,  959, 990,
   990, 1021, 991, 1022, 0,   0
 };
 
-#if CONFIG_TX64X64
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_32x64_neighbors[2049 * MAX_NEIGHBORS]) = {
-  0,    0,    0,    0,    0,    0,    1,    1,    1,    32,   32,   32,   2,
-  2,    2,    33,   33,   64,   64,   64,   3,    3,    3,    34,   34,   65,
-  65,   96,   96,   96,   4,    4,    4,    35,   35,   66,   66,   97,   97,
-  128,  128,  128,  5,    5,    5,    36,   36,   67,   67,   98,   98,   129,
-  129,  160,  160,  160,  6,    6,    6,    37,   37,   68,   68,   99,   99,
-  130,  130,  161,  161,  192,  192,  192,  7,    7,    7,    38,   38,   69,
-  69,   100,  100,  131,  131,  162,  162,  193,  193,  224,  224,  224,  8,
-  8,    8,    39,   39,   70,   70,   101,  101,  132,  132,  163,  163,  194,
-  194,  225,  225,  256,  256,  256,  9,    9,    9,    40,   40,   71,   71,
-  102,  102,  133,  133,  164,  164,  195,  195,  226,  226,  257,  257,  288,
-  288,  288,  10,   10,   10,   41,   41,   72,   72,   103,  103,  134,  134,
-  165,  165,  196,  196,  227,  227,  258,  258,  289,  289,  320,  320,  320,
-  11,   11,   11,   42,   42,   73,   73,   104,  104,  135,  135,  166,  166,
-  197,  197,  228,  228,  259,  259,  290,  290,  321,  321,  352,  352,  352,
-  12,   12,   12,   43,   43,   74,   74,   105,  105,  136,  136,  167,  167,
-  198,  198,  229,  229,  260,  260,  291,  291,  322,  322,  353,  353,  384,
-  384,  384,  13,   13,   13,   44,   44,   75,   75,   106,  106,  137,  137,
-  168,  168,  199,  199,  230,  230,  261,  261,  292,  292,  323,  323,  354,
-  354,  385,  385,  416,  416,  416,  14,   14,   14,   45,   45,   76,   76,
-  107,  107,  138,  138,  169,  169,  200,  200,  231,  231,  262,  262,  293,
-  293,  324,  324,  355,  355,  386,  386,  417,  417,  448,  448,  448,  15,
-  15,   15,   46,   46,   77,   77,   108,  108,  139,  139,  170,  170,  201,
-  201,  232,  232,  263,  263,  294,  294,  325,  325,  356,  356,  387,  387,
-  418,  418,  449,  449,  480,  480,  480,  16,   16,   16,   47,   47,   78,
-  78,   109,  109,  140,  140,  171,  171,  202,  202,  233,  233,  264,  264,
-  295,  295,  326,  326,  357,  357,  388,  388,  419,  419,  450,  450,  481,
-  481,  512,  512,  512,  17,   17,   17,   48,   48,   79,   79,   110,  110,
-  141,  141,  172,  172,  203,  203,  234,  234,  265,  265,  296,  296,  327,
-  327,  358,  358,  389,  389,  420,  420,  451,  451,  482,  482,  513,  513,
-  544,  544,  544,  18,   18,   18,   49,   49,   80,   80,   111,  111,  142,
-  142,  173,  173,  204,  204,  235,  235,  266,  266,  297,  297,  328,  328,
-  359,  359,  390,  390,  421,  421,  452,  452,  483,  483,  514,  514,  545,
-  545,  576,  576,  576,  19,   19,   19,   50,   50,   81,   81,   112,  112,
-  143,  143,  174,  174,  205,  205,  236,  236,  267,  267,  298,  298,  329,
-  329,  360,  360,  391,  391,  422,  422,  453,  453,  484,  484,  515,  515,
-  546,  546,  577,  577,  608,  608,  608,  20,   20,   20,   51,   51,   82,
-  82,   113,  113,  144,  144,  175,  175,  206,  206,  237,  237,  268,  268,
-  299,  299,  330,  330,  361,  361,  392,  392,  423,  423,  454,  454,  485,
-  485,  516,  516,  547,  547,  578,  578,  609,  609,  640,  640,  640,  21,
-  21,   21,   52,   52,   83,   83,   114,  114,  145,  145,  176,  176,  207,
-  207,  238,  238,  269,  269,  300,  300,  331,  331,  362,  362,  393,  393,
-  424,  424,  455,  455,  486,  486,  517,  517,  548,  548,  579,  579,  610,
-  610,  641,  641,  672,  672,  672,  22,   22,   22,   53,   53,   84,   84,
-  115,  115,  146,  146,  177,  177,  208,  208,  239,  239,  270,  270,  301,
-  301,  332,  332,  363,  363,  394,  394,  425,  425,  456,  456,  487,  487,
-  518,  518,  549,  549,  580,  580,  611,  611,  642,  642,  673,  673,  704,
-  704,  704,  23,   23,   23,   54,   54,   85,   85,   116,  116,  147,  147,
-  178,  178,  209,  209,  240,  240,  271,  271,  302,  302,  333,  333,  364,
-  364,  395,  395,  426,  426,  457,  457,  488,  488,  519,  519,  550,  550,
-  581,  581,  612,  612,  643,  643,  674,  674,  705,  705,  736,  736,  736,
-  24,   24,   24,   55,   55,   86,   86,   117,  117,  148,  148,  179,  179,
-  210,  210,  241,  241,  272,  272,  303,  303,  334,  334,  365,  365,  396,
-  396,  427,  427,  458,  458,  489,  489,  520,  520,  551,  551,  582,  582,
-  613,  613,  644,  644,  675,  675,  706,  706,  737,  737,  768,  768,  768,
-  25,   25,   25,   56,   56,   87,   87,   118,  118,  149,  149,  180,  180,
-  211,  211,  242,  242,  273,  273,  304,  304,  335,  335,  366,  366,  397,
-  397,  428,  428,  459,  459,  490,  490,  521,  521,  552,  552,  583,  583,
-  614,  614,  645,  645,  676,  676,  707,  707,  738,  738,  769,  769,  800,
-  800,  800,  26,   26,   26,   57,   57,   88,   88,   119,  119,  150,  150,
-  181,  181,  212,  212,  243,  243,  274,  274,  305,  305,  336,  336,  367,
-  367,  398,  398,  429,  429,  460,  460,  491,  491,  522,  522,  553,  553,
-  584,  584,  615,  615,  646,  646,  677,  677,  708,  708,  739,  739,  770,
-  770,  801,  801,  832,  832,  832,  27,   27,   27,   58,   58,   89,   89,
-  120,  120,  151,  151,  182,  182,  213,  213,  244,  244,  275,  275,  306,
-  306,  337,  337,  368,  368,  399,  399,  430,  430,  461,  461,  492,  492,
-  523,  523,  554,  554,  585,  585,  616,  616,  647,  647,  678,  678,  709,
-  709,  740,  740,  771,  771,  802,  802,  833,  833,  864,  864,  864,  28,
-  28,   28,   59,   59,   90,   90,   121,  121,  152,  152,  183,  183,  214,
-  214,  245,  245,  276,  276,  307,  307,  338,  338,  369,  369,  400,  400,
-  431,  431,  462,  462,  493,  493,  524,  524,  555,  555,  586,  586,  617,
-  617,  648,  648,  679,  679,  710,  710,  741,  741,  772,  772,  803,  803,
-  834,  834,  865,  865,  896,  896,  896,  29,   29,   29,   60,   60,   91,
-  91,   122,  122,  153,  153,  184,  184,  215,  215,  246,  246,  277,  277,
-  308,  308,  339,  339,  370,  370,  401,  401,  432,  432,  463,  463,  494,
-  494,  525,  525,  556,  556,  587,  587,  618,  618,  649,  649,  680,  680,
-  711,  711,  742,  742,  773,  773,  804,  804,  835,  835,  866,  866,  897,
-  897,  928,  928,  928,  30,   30,   30,   61,   61,   92,   92,   123,  123,
-  154,  154,  185,  185,  216,  216,  247,  247,  278,  278,  309,  309,  340,
-  340,  371,  371,  402,  402,  433,  433,  464,  464,  495,  495,  526,  526,
-  557,  557,  588,  588,  619,  619,  650,  650,  681,  681,  712,  712,  743,
-  743,  774,  774,  805,  805,  836,  836,  867,  867,  898,  898,  929,  929,
-  960,  960,  960,  31,   62,   62,   93,   93,   124,  124,  155,  155,  186,
-  186,  217,  217,  248,  248,  279,  279,  310,  310,  341,  341,  372,  372,
-  403,  403,  434,  434,  465,  465,  496,  496,  527,  527,  558,  558,  589,
-  589,  620,  620,  651,  651,  682,  682,  713,  713,  744,  744,  775,  775,
-  806,  806,  837,  837,  868,  868,  899,  899,  930,  930,  961,  961,  992,
-  992,  992,  63,   94,   94,   125,  125,  156,  156,  187,  187,  218,  218,
-  249,  249,  280,  280,  311,  311,  342,  342,  373,  373,  404,  404,  435,
-  435,  466,  466,  497,  497,  528,  528,  559,  559,  590,  590,  621,  621,
-  652,  652,  683,  683,  714,  714,  745,  745,  776,  776,  807,  807,  838,
-  838,  869,  869,  900,  900,  931,  931,  962,  962,  993,  993,  1024, 1024,
-  1024, 95,   126,  126,  157,  157,  188,  188,  219,  219,  250,  250,  281,
-  281,  312,  312,  343,  343,  374,  374,  405,  405,  436,  436,  467,  467,
-  498,  498,  529,  529,  560,  560,  591,  591,  622,  622,  653,  653,  684,
-  684,  715,  715,  746,  746,  777,  777,  808,  808,  839,  839,  870,  870,
-  901,  901,  932,  932,  963,  963,  994,  994,  1025, 1025, 1056, 1056, 1056,
-  127,  158,  158,  189,  189,  220,  220,  251,  251,  282,  282,  313,  313,
-  344,  344,  375,  375,  406,  406,  437,  437,  468,  468,  499,  499,  530,
-  530,  561,  561,  592,  592,  623,  623,  654,  654,  685,  685,  716,  716,
-  747,  747,  778,  778,  809,  809,  840,  840,  871,  871,  902,  902,  933,
-  933,  964,  964,  995,  995,  1026, 1026, 1057, 1057, 1088, 1088, 1088, 159,
-  190,  190,  221,  221,  252,  252,  283,  283,  314,  314,  345,  345,  376,
-  376,  407,  407,  438,  438,  469,  469,  500,  500,  531,  531,  562,  562,
-  593,  593,  624,  624,  655,  655,  686,  686,  717,  717,  748,  748,  779,
-  779,  810,  810,  841,  841,  872,  872,  903,  903,  934,  934,  965,  965,
-  996,  996,  1027, 1027, 1058, 1058, 1089, 1089, 1120, 1120, 1120, 191,  222,
-  222,  253,  253,  284,  284,  315,  315,  346,  346,  377,  377,  408,  408,
-  439,  439,  470,  470,  501,  501,  532,  532,  563,  563,  594,  594,  625,
-  625,  656,  656,  687,  687,  718,  718,  749,  749,  780,  780,  811,  811,
-  842,  842,  873,  873,  904,  904,  935,  935,  966,  966,  997,  997,  1028,
-  1028, 1059, 1059, 1090, 1090, 1121, 1121, 1152, 1152, 1152, 223,  254,  254,
-  285,  285,  316,  316,  347,  347,  378,  378,  409,  409,  440,  440,  471,
-  471,  502,  502,  533,  533,  564,  564,  595,  595,  626,  626,  657,  657,
-  688,  688,  719,  719,  750,  750,  781,  781,  812,  812,  843,  843,  874,
-  874,  905,  905,  936,  936,  967,  967,  998,  998,  1029, 1029, 1060, 1060,
-  1091, 1091, 1122, 1122, 1153, 1153, 1184, 1184, 1184, 255,  286,  286,  317,
-  317,  348,  348,  379,  379,  410,  410,  441,  441,  472,  472,  503,  503,
-  534,  534,  565,  565,  596,  596,  627,  627,  658,  658,  689,  689,  720,
-  720,  751,  751,  782,  782,  813,  813,  844,  844,  875,  875,  906,  906,
-  937,  937,  968,  968,  999,  999,  1030, 1030, 1061, 1061, 1092, 1092, 1123,
-  1123, 1154, 1154, 1185, 1185, 1216, 1216, 1216, 287,  318,  318,  349,  349,
-  380,  380,  411,  411,  442,  442,  473,  473,  504,  504,  535,  535,  566,
-  566,  597,  597,  628,  628,  659,  659,  690,  690,  721,  721,  752,  752,
-  783,  783,  814,  814,  845,  845,  876,  876,  907,  907,  938,  938,  969,
-  969,  1000, 1000, 1031, 1031, 1062, 1062, 1093, 1093, 1124, 1124, 1155, 1155,
-  1186, 1186, 1217, 1217, 1248, 1248, 1248, 319,  350,  350,  381,  381,  412,
-  412,  443,  443,  474,  474,  505,  505,  536,  536,  567,  567,  598,  598,
-  629,  629,  660,  660,  691,  691,  722,  722,  753,  753,  784,  784,  815,
-  815,  846,  846,  877,  877,  908,  908,  939,  939,  970,  970,  1001, 1001,
-  1032, 1032, 1063, 1063, 1094, 1094, 1125, 1125, 1156, 1156, 1187, 1187, 1218,
-  1218, 1249, 1249, 1280, 1280, 1280, 351,  382,  382,  413,  413,  444,  444,
-  475,  475,  506,  506,  537,  537,  568,  568,  599,  599,  630,  630,  661,
-  661,  692,  692,  723,  723,  754,  754,  785,  785,  816,  816,  847,  847,
-  878,  878,  909,  909,  940,  940,  971,  971,  1002, 1002, 1033, 1033, 1064,
-  1064, 1095, 1095, 1126, 1126, 1157, 1157, 1188, 1188, 1219, 1219, 1250, 1250,
-  1281, 1281, 1312, 1312, 1312, 383,  414,  414,  445,  445,  476,  476,  507,
-  507,  538,  538,  569,  569,  600,  600,  631,  631,  662,  662,  693,  693,
-  724,  724,  755,  755,  786,  786,  817,  817,  848,  848,  879,  879,  910,
-  910,  941,  941,  972,  972,  1003, 1003, 1034, 1034, 1065, 1065, 1096, 1096,
-  1127, 1127, 1158, 1158, 1189, 1189, 1220, 1220, 1251, 1251, 1282, 1282, 1313,
-  1313, 1344, 1344, 1344, 415,  446,  446,  477,  477,  508,  508,  539,  539,
-  570,  570,  601,  601,  632,  632,  663,  663,  694,  694,  725,  725,  756,
-  756,  787,  787,  818,  818,  849,  849,  880,  880,  911,  911,  942,  942,
-  973,  973,  1004, 1004, 1035, 1035, 1066, 1066, 1097, 1097, 1128, 1128, 1159,
-  1159, 1190, 1190, 1221, 1221, 1252, 1252, 1283, 1283, 1314, 1314, 1345, 1345,
-  1376, 1376, 1376, 447,  478,  478,  509,  509,  540,  540,  571,  571,  602,
-  602,  633,  633,  664,  664,  695,  695,  726,  726,  757,  757,  788,  788,
-  819,  819,  850,  850,  881,  881,  912,  912,  943,  943,  974,  974,  1005,
-  1005, 1036, 1036, 1067, 1067, 1098, 1098, 1129, 1129, 1160, 1160, 1191, 1191,
-  1222, 1222, 1253, 1253, 1284, 1284, 1315, 1315, 1346, 1346, 1377, 1377, 1408,
-  1408, 1408, 479,  510,  510,  541,  541,  572,  572,  603,  603,  634,  634,
-  665,  665,  696,  696,  727,  727,  758,  758,  789,  789,  820,  820,  851,
-  851,  882,  882,  913,  913,  944,  944,  975,  975,  1006, 1006, 1037, 1037,
-  1068, 1068, 1099, 1099, 1130, 1130, 1161, 1161, 1192, 1192, 1223, 1223, 1254,
-  1254, 1285, 1285, 1316, 1316, 1347, 1347, 1378, 1378, 1409, 1409, 1440, 1440,
-  1440, 511,  542,  542,  573,  573,  604,  604,  635,  635,  666,  666,  697,
-  697,  728,  728,  759,  759,  790,  790,  821,  821,  852,  852,  883,  883,
-  914,  914,  945,  945,  976,  976,  1007, 1007, 1038, 1038, 1069, 1069, 1100,
-  1100, 1131, 1131, 1162, 1162, 1193, 1193, 1224, 1224, 1255, 1255, 1286, 1286,
-  1317, 1317, 1348, 1348, 1379, 1379, 1410, 1410, 1441, 1441, 1472, 1472, 1472,
-  543,  574,  574,  605,  605,  636,  636,  667,  667,  698,  698,  729,  729,
-  760,  760,  791,  791,  822,  822,  853,  853,  884,  884,  915,  915,  946,
-  946,  977,  977,  1008, 1008, 1039, 1039, 1070, 1070, 1101, 1101, 1132, 1132,
-  1163, 1163, 1194, 1194, 1225, 1225, 1256, 1256, 1287, 1287, 1318, 1318, 1349,
-  1349, 1380, 1380, 1411, 1411, 1442, 1442, 1473, 1473, 1504, 1504, 1504, 575,
-  606,  606,  637,  637,  668,  668,  699,  699,  730,  730,  761,  761,  792,
-  792,  823,  823,  854,  854,  885,  885,  916,  916,  947,  947,  978,  978,
-  1009, 1009, 1040, 1040, 1071, 1071, 1102, 1102, 1133, 1133, 1164, 1164, 1195,
-  1195, 1226, 1226, 1257, 1257, 1288, 1288, 1319, 1319, 1350, 1350, 1381, 1381,
-  1412, 1412, 1443, 1443, 1474, 1474, 1505, 1505, 1536, 1536, 1536, 607,  638,
-  638,  669,  669,  700,  700,  731,  731,  762,  762,  793,  793,  824,  824,
-  855,  855,  886,  886,  917,  917,  948,  948,  979,  979,  1010, 1010, 1041,
-  1041, 1072, 1072, 1103, 1103, 1134, 1134, 1165, 1165, 1196, 1196, 1227, 1227,
-  1258, 1258, 1289, 1289, 1320, 1320, 1351, 1351, 1382, 1382, 1413, 1413, 1444,
-  1444, 1475, 1475, 1506, 1506, 1537, 1537, 1568, 1568, 1568, 639,  670,  670,
-  701,  701,  732,  732,  763,  763,  794,  794,  825,  825,  856,  856,  887,
-  887,  918,  918,  949,  949,  980,  980,  1011, 1011, 1042, 1042, 1073, 1073,
-  1104, 1104, 1135, 1135, 1166, 1166, 1197, 1197, 1228, 1228, 1259, 1259, 1290,
-  1290, 1321, 1321, 1352, 1352, 1383, 1383, 1414, 1414, 1445, 1445, 1476, 1476,
-  1507, 1507, 1538, 1538, 1569, 1569, 1600, 1600, 1600, 671,  702,  702,  733,
-  733,  764,  764,  795,  795,  826,  826,  857,  857,  888,  888,  919,  919,
-  950,  950,  981,  981,  1012, 1012, 1043, 1043, 1074, 1074, 1105, 1105, 1136,
-  1136, 1167, 1167, 1198, 1198, 1229, 1229, 1260, 1260, 1291, 1291, 1322, 1322,
-  1353, 1353, 1384, 1384, 1415, 1415, 1446, 1446, 1477, 1477, 1508, 1508, 1539,
-  1539, 1570, 1570, 1601, 1601, 1632, 1632, 1632, 703,  734,  734,  765,  765,
-  796,  796,  827,  827,  858,  858,  889,  889,  920,  920,  951,  951,  982,
-  982,  1013, 1013, 1044, 1044, 1075, 1075, 1106, 1106, 1137, 1137, 1168, 1168,
-  1199, 1199, 1230, 1230, 1261, 1261, 1292, 1292, 1323, 1323, 1354, 1354, 1385,
-  1385, 1416, 1416, 1447, 1447, 1478, 1478, 1509, 1509, 1540, 1540, 1571, 1571,
-  1602, 1602, 1633, 1633, 1664, 1664, 1664, 735,  766,  766,  797,  797,  828,
-  828,  859,  859,  890,  890,  921,  921,  952,  952,  983,  983,  1014, 1014,
-  1045, 1045, 1076, 1076, 1107, 1107, 1138, 1138, 1169, 1169, 1200, 1200, 1231,
-  1231, 1262, 1262, 1293, 1293, 1324, 1324, 1355, 1355, 1386, 1386, 1417, 1417,
-  1448, 1448, 1479, 1479, 1510, 1510, 1541, 1541, 1572, 1572, 1603, 1603, 1634,
-  1634, 1665, 1665, 1696, 1696, 1696, 767,  798,  798,  829,  829,  860,  860,
-  891,  891,  922,  922,  953,  953,  984,  984,  1015, 1015, 1046, 1046, 1077,
-  1077, 1108, 1108, 1139, 1139, 1170, 1170, 1201, 1201, 1232, 1232, 1263, 1263,
-  1294, 1294, 1325, 1325, 1356, 1356, 1387, 1387, 1418, 1418, 1449, 1449, 1480,
-  1480, 1511, 1511, 1542, 1542, 1573, 1573, 1604, 1604, 1635, 1635, 1666, 1666,
-  1697, 1697, 1728, 1728, 1728, 799,  830,  830,  861,  861,  892,  892,  923,
-  923,  954,  954,  985,  985,  1016, 1016, 1047, 1047, 1078, 1078, 1109, 1109,
-  1140, 1140, 1171, 1171, 1202, 1202, 1233, 1233, 1264, 1264, 1295, 1295, 1326,
-  1326, 1357, 1357, 1388, 1388, 1419, 1419, 1450, 1450, 1481, 1481, 1512, 1512,
-  1543, 1543, 1574, 1574, 1605, 1605, 1636, 1636, 1667, 1667, 1698, 1698, 1729,
-  1729, 1760, 1760, 1760, 831,  862,  862,  893,  893,  924,  924,  955,  955,
-  986,  986,  1017, 1017, 1048, 1048, 1079, 1079, 1110, 1110, 1141, 1141, 1172,
-  1172, 1203, 1203, 1234, 1234, 1265, 1265, 1296, 1296, 1327, 1327, 1358, 1358,
-  1389, 1389, 1420, 1420, 1451, 1451, 1482, 1482, 1513, 1513, 1544, 1544, 1575,
-  1575, 1606, 1606, 1637, 1637, 1668, 1668, 1699, 1699, 1730, 1730, 1761, 1761,
-  1792, 1792, 1792, 863,  894,  894,  925,  925,  956,  956,  987,  987,  1018,
-  1018, 1049, 1049, 1080, 1080, 1111, 1111, 1142, 1142, 1173, 1173, 1204, 1204,
-  1235, 1235, 1266, 1266, 1297, 1297, 1328, 1328, 1359, 1359, 1390, 1390, 1421,
-  1421, 1452, 1452, 1483, 1483, 1514, 1514, 1545, 1545, 1576, 1576, 1607, 1607,
-  1638, 1638, 1669, 1669, 1700, 1700, 1731, 1731, 1762, 1762, 1793, 1793, 1824,
-  1824, 1824, 895,  926,  926,  957,  957,  988,  988,  1019, 1019, 1050, 1050,
-  1081, 1081, 1112, 1112, 1143, 1143, 1174, 1174, 1205, 1205, 1236, 1236, 1267,
-  1267, 1298, 1298, 1329, 1329, 1360, 1360, 1391, 1391, 1422, 1422, 1453, 1453,
-  1484, 1484, 1515, 1515, 1546, 1546, 1577, 1577, 1608, 1608, 1639, 1639, 1670,
-  1670, 1701, 1701, 1732, 1732, 1763, 1763, 1794, 1794, 1825, 1825, 1856, 1856,
-  1856, 927,  958,  958,  989,  989,  1020, 1020, 1051, 1051, 1082, 1082, 1113,
-  1113, 1144, 1144, 1175, 1175, 1206, 1206, 1237, 1237, 1268, 1268, 1299, 1299,
-  1330, 1330, 1361, 1361, 1392, 1392, 1423, 1423, 1454, 1454, 1485, 1485, 1516,
-  1516, 1547, 1547, 1578, 1578, 1609, 1609, 1640, 1640, 1671, 1671, 1702, 1702,
-  1733, 1733, 1764, 1764, 1795, 1795, 1826, 1826, 1857, 1857, 1888, 1888, 1888,
-  959,  990,  990,  1021, 1021, 1052, 1052, 1083, 1083, 1114, 1114, 1145, 1145,
-  1176, 1176, 1207, 1207, 1238, 1238, 1269, 1269, 1300, 1300, 1331, 1331, 1362,
-  1362, 1393, 1393, 1424, 1424, 1455, 1455, 1486, 1486, 1517, 1517, 1548, 1548,
-  1579, 1579, 1610, 1610, 1641, 1641, 1672, 1672, 1703, 1703, 1734, 1734, 1765,
-  1765, 1796, 1796, 1827, 1827, 1858, 1858, 1889, 1889, 1920, 1920, 1920, 991,
-  1022, 1022, 1053, 1053, 1084, 1084, 1115, 1115, 1146, 1146, 1177, 1177, 1208,
-  1208, 1239, 1239, 1270, 1270, 1301, 1301, 1332, 1332, 1363, 1363, 1394, 1394,
-  1425, 1425, 1456, 1456, 1487, 1487, 1518, 1518, 1549, 1549, 1580, 1580, 1611,
-  1611, 1642, 1642, 1673, 1673, 1704, 1704, 1735, 1735, 1766, 1766, 1797, 1797,
-  1828, 1828, 1859, 1859, 1890, 1890, 1921, 1921, 1952, 1952, 1952, 1023, 1054,
-  1054, 1085, 1085, 1116, 1116, 1147, 1147, 1178, 1178, 1209, 1209, 1240, 1240,
-  1271, 1271, 1302, 1302, 1333, 1333, 1364, 1364, 1395, 1395, 1426, 1426, 1457,
-  1457, 1488, 1488, 1519, 1519, 1550, 1550, 1581, 1581, 1612, 1612, 1643, 1643,
-  1674, 1674, 1705, 1705, 1736, 1736, 1767, 1767, 1798, 1798, 1829, 1829, 1860,
-  1860, 1891, 1891, 1922, 1922, 1953, 1953, 1984, 1984, 1984, 1055, 1086, 1086,
-  1117, 1117, 1148, 1148, 1179, 1179, 1210, 1210, 1241, 1241, 1272, 1272, 1303,
-  1303, 1334, 1334, 1365, 1365, 1396, 1396, 1427, 1427, 1458, 1458, 1489, 1489,
-  1520, 1520, 1551, 1551, 1582, 1582, 1613, 1613, 1644, 1644, 1675, 1675, 1706,
-  1706, 1737, 1737, 1768, 1768, 1799, 1799, 1830, 1830, 1861, 1861, 1892, 1892,
-  1923, 1923, 1954, 1954, 1985, 1985, 2016, 1087, 1118, 1118, 1149, 1149, 1180,
-  1180, 1211, 1211, 1242, 1242, 1273, 1273, 1304, 1304, 1335, 1335, 1366, 1366,
-  1397, 1397, 1428, 1428, 1459, 1459, 1490, 1490, 1521, 1521, 1552, 1552, 1583,
-  1583, 1614, 1614, 1645, 1645, 1676, 1676, 1707, 1707, 1738, 1738, 1769, 1769,
-  1800, 1800, 1831, 1831, 1862, 1862, 1893, 1893, 1924, 1924, 1955, 1955, 1986,
-  1986, 2017, 1119, 1150, 1150, 1181, 1181, 1212, 1212, 1243, 1243, 1274, 1274,
-  1305, 1305, 1336, 1336, 1367, 1367, 1398, 1398, 1429, 1429, 1460, 1460, 1491,
-  1491, 1522, 1522, 1553, 1553, 1584, 1584, 1615, 1615, 1646, 1646, 1677, 1677,
-  1708, 1708, 1739, 1739, 1770, 1770, 1801, 1801, 1832, 1832, 1863, 1863, 1894,
-  1894, 1925, 1925, 1956, 1956, 1987, 1987, 2018, 1151, 1182, 1182, 1213, 1213,
-  1244, 1244, 1275, 1275, 1306, 1306, 1337, 1337, 1368, 1368, 1399, 1399, 1430,
-  1430, 1461, 1461, 1492, 1492, 1523, 1523, 1554, 1554, 1585, 1585, 1616, 1616,
-  1647, 1647, 1678, 1678, 1709, 1709, 1740, 1740, 1771, 1771, 1802, 1802, 1833,
-  1833, 1864, 1864, 1895, 1895, 1926, 1926, 1957, 1957, 1988, 1988, 2019, 1183,
-  1214, 1214, 1245, 1245, 1276, 1276, 1307, 1307, 1338, 1338, 1369, 1369, 1400,
-  1400, 1431, 1431, 1462, 1462, 1493, 1493, 1524, 1524, 1555, 1555, 1586, 1586,
-  1617, 1617, 1648, 1648, 1679, 1679, 1710, 1710, 1741, 1741, 1772, 1772, 1803,
-  1803, 1834, 1834, 1865, 1865, 1896, 1896, 1927, 1927, 1958, 1958, 1989, 1989,
-  2020, 1215, 1246, 1246, 1277, 1277, 1308, 1308, 1339, 1339, 1370, 1370, 1401,
-  1401, 1432, 1432, 1463, 1463, 1494, 1494, 1525, 1525, 1556, 1556, 1587, 1587,
-  1618, 1618, 1649, 1649, 1680, 1680, 1711, 1711, 1742, 1742, 1773, 1773, 1804,
-  1804, 1835, 1835, 1866, 1866, 1897, 1897, 1928, 1928, 1959, 1959, 1990, 1990,
-  2021, 1247, 1278, 1278, 1309, 1309, 1340, 1340, 1371, 1371, 1402, 1402, 1433,
-  1433, 1464, 1464, 1495, 1495, 1526, 1526, 1557, 1557, 1588, 1588, 1619, 1619,
-  1650, 1650, 1681, 1681, 1712, 1712, 1743, 1743, 1774, 1774, 1805, 1805, 1836,
-  1836, 1867, 1867, 1898, 1898, 1929, 1929, 1960, 1960, 1991, 1991, 2022, 1279,
-  1310, 1310, 1341, 1341, 1372, 1372, 1403, 1403, 1434, 1434, 1465, 1465, 1496,
-  1496, 1527, 1527, 1558, 1558, 1589, 1589, 1620, 1620, 1651, 1651, 1682, 1682,
-  1713, 1713, 1744, 1744, 1775, 1775, 1806, 1806, 1837, 1837, 1868, 1868, 1899,
-  1899, 1930, 1930, 1961, 1961, 1992, 1992, 2023, 1311, 1342, 1342, 1373, 1373,
-  1404, 1404, 1435, 1435, 1466, 1466, 1497, 1497, 1528, 1528, 1559, 1559, 1590,
-  1590, 1621, 1621, 1652, 1652, 1683, 1683, 1714, 1714, 1745, 1745, 1776, 1776,
-  1807, 1807, 1838, 1838, 1869, 1869, 1900, 1900, 1931, 1931, 1962, 1962, 1993,
-  1993, 2024, 1343, 1374, 1374, 1405, 1405, 1436, 1436, 1467, 1467, 1498, 1498,
-  1529, 1529, 1560, 1560, 1591, 1591, 1622, 1622, 1653, 1653, 1684, 1684, 1715,
-  1715, 1746, 1746, 1777, 1777, 1808, 1808, 1839, 1839, 1870, 1870, 1901, 1901,
-  1932, 1932, 1963, 1963, 1994, 1994, 2025, 1375, 1406, 1406, 1437, 1437, 1468,
-  1468, 1499, 1499, 1530, 1530, 1561, 1561, 1592, 1592, 1623, 1623, 1654, 1654,
-  1685, 1685, 1716, 1716, 1747, 1747, 1778, 1778, 1809, 1809, 1840, 1840, 1871,
-  1871, 1902, 1902, 1933, 1933, 1964, 1964, 1995, 1995, 2026, 1407, 1438, 1438,
-  1469, 1469, 1500, 1500, 1531, 1531, 1562, 1562, 1593, 1593, 1624, 1624, 1655,
-  1655, 1686, 1686, 1717, 1717, 1748, 1748, 1779, 1779, 1810, 1810, 1841, 1841,
-  1872, 1872, 1903, 1903, 1934, 1934, 1965, 1965, 1996, 1996, 2027, 1439, 1470,
-  1470, 1501, 1501, 1532, 1532, 1563, 1563, 1594, 1594, 1625, 1625, 1656, 1656,
-  1687, 1687, 1718, 1718, 1749, 1749, 1780, 1780, 1811, 1811, 1842, 1842, 1873,
-  1873, 1904, 1904, 1935, 1935, 1966, 1966, 1997, 1997, 2028, 1471, 1502, 1502,
-  1533, 1533, 1564, 1564, 1595, 1595, 1626, 1626, 1657, 1657, 1688, 1688, 1719,
-  1719, 1750, 1750, 1781, 1781, 1812, 1812, 1843, 1843, 1874, 1874, 1905, 1905,
-  1936, 1936, 1967, 1967, 1998, 1998, 2029, 1503, 1534, 1534, 1565, 1565, 1596,
-  1596, 1627, 1627, 1658, 1658, 1689, 1689, 1720, 1720, 1751, 1751, 1782, 1782,
-  1813, 1813, 1844, 1844, 1875, 1875, 1906, 1906, 1937, 1937, 1968, 1968, 1999,
-  1999, 2030, 1535, 1566, 1566, 1597, 1597, 1628, 1628, 1659, 1659, 1690, 1690,
-  1721, 1721, 1752, 1752, 1783, 1783, 1814, 1814, 1845, 1845, 1876, 1876, 1907,
-  1907, 1938, 1938, 1969, 1969, 2000, 2000, 2031, 1567, 1598, 1598, 1629, 1629,
-  1660, 1660, 1691, 1691, 1722, 1722, 1753, 1753, 1784, 1784, 1815, 1815, 1846,
-  1846, 1877, 1877, 1908, 1908, 1939, 1939, 1970, 1970, 2001, 2001, 2032, 1599,
-  1630, 1630, 1661, 1661, 1692, 1692, 1723, 1723, 1754, 1754, 1785, 1785, 1816,
-  1816, 1847, 1847, 1878, 1878, 1909, 1909, 1940, 1940, 1971, 1971, 2002, 2002,
-  2033, 1631, 1662, 1662, 1693, 1693, 1724, 1724, 1755, 1755, 1786, 1786, 1817,
-  1817, 1848, 1848, 1879, 1879, 1910, 1910, 1941, 1941, 1972, 1972, 2003, 2003,
-  2034, 1663, 1694, 1694, 1725, 1725, 1756, 1756, 1787, 1787, 1818, 1818, 1849,
-  1849, 1880, 1880, 1911, 1911, 1942, 1942, 1973, 1973, 2004, 2004, 2035, 1695,
-  1726, 1726, 1757, 1757, 1788, 1788, 1819, 1819, 1850, 1850, 1881, 1881, 1912,
-  1912, 1943, 1943, 1974, 1974, 2005, 2005, 2036, 1727, 1758, 1758, 1789, 1789,
-  1820, 1820, 1851, 1851, 1882, 1882, 1913, 1913, 1944, 1944, 1975, 1975, 2006,
-  2006, 2037, 1759, 1790, 1790, 1821, 1821, 1852, 1852, 1883, 1883, 1914, 1914,
-  1945, 1945, 1976, 1976, 2007, 2007, 2038, 1791, 1822, 1822, 1853, 1853, 1884,
-  1884, 1915, 1915, 1946, 1946, 1977, 1977, 2008, 2008, 2039, 1823, 1854, 1854,
-  1885, 1885, 1916, 1916, 1947, 1947, 1978, 1978, 2009, 2009, 2040, 1855, 1886,
-  1886, 1917, 1917, 1948, 1948, 1979, 1979, 2010, 2010, 2041, 1887, 1918, 1918,
-  1949, 1949, 1980, 1980, 2011, 2011, 2042, 1919, 1950, 1950, 1981, 1981, 2012,
-  2012, 2043, 1951, 1982, 1982, 2013, 2013, 2044, 1983, 2014, 2014, 2045, 2015,
-  2046, 0,    0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_64x32_neighbors[2049 * MAX_NEIGHBORS]) = {
-  0,    0,    0,    0,    0,    0,    1,    1,    1,    64,   64,   64,   2,
-  2,    2,    65,   65,   128,  128,  128,  3,    3,    3,    66,   66,   129,
-  129,  192,  192,  192,  4,    4,    4,    67,   67,   130,  130,  193,  193,
-  256,  256,  256,  5,    5,    5,    68,   68,   131,  131,  194,  194,  257,
-  257,  320,  320,  320,  6,    6,    6,    69,   69,   132,  132,  195,  195,
-  258,  258,  321,  321,  384,  384,  384,  7,    7,    7,    70,   70,   133,
-  133,  196,  196,  259,  259,  322,  322,  385,  385,  448,  448,  448,  8,
-  8,    8,    71,   71,   134,  134,  197,  197,  260,  260,  323,  323,  386,
-  386,  449,  449,  512,  512,  512,  9,    9,    9,    72,   72,   135,  135,
-  198,  198,  261,  261,  324,  324,  387,  387,  450,  450,  513,  513,  576,
-  576,  576,  10,   10,   10,   73,   73,   136,  136,  199,  199,  262,  262,
-  325,  325,  388,  388,  451,  451,  514,  514,  577,  577,  640,  640,  640,
-  11,   11,   11,   74,   74,   137,  137,  200,  200,  263,  263,  326,  326,
-  389,  389,  452,  452,  515,  515,  578,  578,  641,  641,  704,  704,  704,
-  12,   12,   12,   75,   75,   138,  138,  201,  201,  264,  264,  327,  327,
-  390,  390,  453,  453,  516,  516,  579,  579,  642,  642,  705,  705,  768,
-  768,  768,  13,   13,   13,   76,   76,   139,  139,  202,  202,  265,  265,
-  328,  328,  391,  391,  454,  454,  517,  517,  580,  580,  643,  643,  706,
-  706,  769,  769,  832,  832,  832,  14,   14,   14,   77,   77,   140,  140,
-  203,  203,  266,  266,  329,  329,  392,  392,  455,  455,  518,  518,  581,
-  581,  644,  644,  707,  707,  770,  770,  833,  833,  896,  896,  896,  15,
-  15,   15,   78,   78,   141,  141,  204,  204,  267,  267,  330,  330,  393,
-  393,  456,  456,  519,  519,  582,  582,  645,  645,  708,  708,  771,  771,
-  834,  834,  897,  897,  960,  960,  960,  16,   16,   16,   79,   79,   142,
-  142,  205,  205,  268,  268,  331,  331,  394,  394,  457,  457,  520,  520,
-  583,  583,  646,  646,  709,  709,  772,  772,  835,  835,  898,  898,  961,
-  961,  1024, 1024, 1024, 17,   17,   17,   80,   80,   143,  143,  206,  206,
-  269,  269,  332,  332,  395,  395,  458,  458,  521,  521,  584,  584,  647,
-  647,  710,  710,  773,  773,  836,  836,  899,  899,  962,  962,  1025, 1025,
-  1088, 1088, 1088, 18,   18,   18,   81,   81,   144,  144,  207,  207,  270,
-  270,  333,  333,  396,  396,  459,  459,  522,  522,  585,  585,  648,  648,
-  711,  711,  774,  774,  837,  837,  900,  900,  963,  963,  1026, 1026, 1089,
-  1089, 1152, 1152, 1152, 19,   19,   19,   82,   82,   145,  145,  208,  208,
-  271,  271,  334,  334,  397,  397,  460,  460,  523,  523,  586,  586,  649,
-  649,  712,  712,  775,  775,  838,  838,  901,  901,  964,  964,  1027, 1027,
-  1090, 1090, 1153, 1153, 1216, 1216, 1216, 20,   20,   20,   83,   83,   146,
-  146,  209,  209,  272,  272,  335,  335,  398,  398,  461,  461,  524,  524,
-  587,  587,  650,  650,  713,  713,  776,  776,  839,  839,  902,  902,  965,
-  965,  1028, 1028, 1091, 1091, 1154, 1154, 1217, 1217, 1280, 1280, 1280, 21,
-  21,   21,   84,   84,   147,  147,  210,  210,  273,  273,  336,  336,  399,
-  399,  462,  462,  525,  525,  588,  588,  651,  651,  714,  714,  777,  777,
-  840,  840,  903,  903,  966,  966,  1029, 1029, 1092, 1092, 1155, 1155, 1218,
-  1218, 1281, 1281, 1344, 1344, 1344, 22,   22,   22,   85,   85,   148,  148,
-  211,  211,  274,  274,  337,  337,  400,  400,  463,  463,  526,  526,  589,
-  589,  652,  652,  715,  715,  778,  778,  841,  841,  904,  904,  967,  967,
-  1030, 1030, 1093, 1093, 1156, 1156, 1219, 1219, 1282, 1282, 1345, 1345, 1408,
-  1408, 1408, 23,   23,   23,   86,   86,   149,  149,  212,  212,  275,  275,
-  338,  338,  401,  401,  464,  464,  527,  527,  590,  590,  653,  653,  716,
-  716,  779,  779,  842,  842,  905,  905,  968,  968,  1031, 1031, 1094, 1094,
-  1157, 1157, 1220, 1220, 1283, 1283, 1346, 1346, 1409, 1409, 1472, 1472, 1472,
-  24,   24,   24,   87,   87,   150,  150,  213,  213,  276,  276,  339,  339,
-  402,  402,  465,  465,  528,  528,  591,  591,  654,  654,  717,  717,  780,
-  780,  843,  843,  906,  906,  969,  969,  1032, 1032, 1095, 1095, 1158, 1158,
-  1221, 1221, 1284, 1284, 1347, 1347, 1410, 1410, 1473, 1473, 1536, 1536, 1536,
-  25,   25,   25,   88,   88,   151,  151,  214,  214,  277,  277,  340,  340,
-  403,  403,  466,  466,  529,  529,  592,  592,  655,  655,  718,  718,  781,
-  781,  844,  844,  907,  907,  970,  970,  1033, 1033, 1096, 1096, 1159, 1159,
-  1222, 1222, 1285, 1285, 1348, 1348, 1411, 1411, 1474, 1474, 1537, 1537, 1600,
-  1600, 1600, 26,   26,   26,   89,   89,   152,  152,  215,  215,  278,  278,
-  341,  341,  404,  404,  467,  467,  530,  530,  593,  593,  656,  656,  719,
-  719,  782,  782,  845,  845,  908,  908,  971,  971,  1034, 1034, 1097, 1097,
-  1160, 1160, 1223, 1223, 1286, 1286, 1349, 1349, 1412, 1412, 1475, 1475, 1538,
-  1538, 1601, 1601, 1664, 1664, 1664, 27,   27,   27,   90,   90,   153,  153,
-  216,  216,  279,  279,  342,  342,  405,  405,  468,  468,  531,  531,  594,
-  594,  657,  657,  720,  720,  783,  783,  846,  846,  909,  909,  972,  972,
-  1035, 1035, 1098, 1098, 1161, 1161, 1224, 1224, 1287, 1287, 1350, 1350, 1413,
-  1413, 1476, 1476, 1539, 1539, 1602, 1602, 1665, 1665, 1728, 1728, 1728, 28,
-  28,   28,   91,   91,   154,  154,  217,  217,  280,  280,  343,  343,  406,
-  406,  469,  469,  532,  532,  595,  595,  658,  658,  721,  721,  784,  784,
-  847,  847,  910,  910,  973,  973,  1036, 1036, 1099, 1099, 1162, 1162, 1225,
-  1225, 1288, 1288, 1351, 1351, 1414, 1414, 1477, 1477, 1540, 1540, 1603, 1603,
-  1666, 1666, 1729, 1729, 1792, 1792, 1792, 29,   29,   29,   92,   92,   155,
-  155,  218,  218,  281,  281,  344,  344,  407,  407,  470,  470,  533,  533,
-  596,  596,  659,  659,  722,  722,  785,  785,  848,  848,  911,  911,  974,
-  974,  1037, 1037, 1100, 1100, 1163, 1163, 1226, 1226, 1289, 1289, 1352, 1352,
-  1415, 1415, 1478, 1478, 1541, 1541, 1604, 1604, 1667, 1667, 1730, 1730, 1793,
-  1793, 1856, 1856, 1856, 30,   30,   30,   93,   93,   156,  156,  219,  219,
-  282,  282,  345,  345,  408,  408,  471,  471,  534,  534,  597,  597,  660,
-  660,  723,  723,  786,  786,  849,  849,  912,  912,  975,  975,  1038, 1038,
-  1101, 1101, 1164, 1164, 1227, 1227, 1290, 1290, 1353, 1353, 1416, 1416, 1479,
-  1479, 1542, 1542, 1605, 1605, 1668, 1668, 1731, 1731, 1794, 1794, 1857, 1857,
-  1920, 1920, 1920, 31,   31,   31,   94,   94,   157,  157,  220,  220,  283,
-  283,  346,  346,  409,  409,  472,  472,  535,  535,  598,  598,  661,  661,
-  724,  724,  787,  787,  850,  850,  913,  913,  976,  976,  1039, 1039, 1102,
-  1102, 1165, 1165, 1228, 1228, 1291, 1291, 1354, 1354, 1417, 1417, 1480, 1480,
-  1543, 1543, 1606, 1606, 1669, 1669, 1732, 1732, 1795, 1795, 1858, 1858, 1921,
-  1921, 1984, 32,   32,   32,   95,   95,   158,  158,  221,  221,  284,  284,
-  347,  347,  410,  410,  473,  473,  536,  536,  599,  599,  662,  662,  725,
-  725,  788,  788,  851,  851,  914,  914,  977,  977,  1040, 1040, 1103, 1103,
-  1166, 1166, 1229, 1229, 1292, 1292, 1355, 1355, 1418, 1418, 1481, 1481, 1544,
-  1544, 1607, 1607, 1670, 1670, 1733, 1733, 1796, 1796, 1859, 1859, 1922, 1922,
-  1985, 33,   33,   33,   96,   96,   159,  159,  222,  222,  285,  285,  348,
-  348,  411,  411,  474,  474,  537,  537,  600,  600,  663,  663,  726,  726,
-  789,  789,  852,  852,  915,  915,  978,  978,  1041, 1041, 1104, 1104, 1167,
-  1167, 1230, 1230, 1293, 1293, 1356, 1356, 1419, 1419, 1482, 1482, 1545, 1545,
-  1608, 1608, 1671, 1671, 1734, 1734, 1797, 1797, 1860, 1860, 1923, 1923, 1986,
-  34,   34,   34,   97,   97,   160,  160,  223,  223,  286,  286,  349,  349,
-  412,  412,  475,  475,  538,  538,  601,  601,  664,  664,  727,  727,  790,
-  790,  853,  853,  916,  916,  979,  979,  1042, 1042, 1105, 1105, 1168, 1168,
-  1231, 1231, 1294, 1294, 1357, 1357, 1420, 1420, 1483, 1483, 1546, 1546, 1609,
-  1609, 1672, 1672, 1735, 1735, 1798, 1798, 1861, 1861, 1924, 1924, 1987, 35,
-  35,   35,   98,   98,   161,  161,  224,  224,  287,  287,  350,  350,  413,
-  413,  476,  476,  539,  539,  602,  602,  665,  665,  728,  728,  791,  791,
-  854,  854,  917,  917,  980,  980,  1043, 1043, 1106, 1106, 1169, 1169, 1232,
-  1232, 1295, 1295, 1358, 1358, 1421, 1421, 1484, 1484, 1547, 1547, 1610, 1610,
-  1673, 1673, 1736, 1736, 1799, 1799, 1862, 1862, 1925, 1925, 1988, 36,   36,
-  36,   99,   99,   162,  162,  225,  225,  288,  288,  351,  351,  414,  414,
-  477,  477,  540,  540,  603,  603,  666,  666,  729,  729,  792,  792,  855,
-  855,  918,  918,  981,  981,  1044, 1044, 1107, 1107, 1170, 1170, 1233, 1233,
-  1296, 1296, 1359, 1359, 1422, 1422, 1485, 1485, 1548, 1548, 1611, 1611, 1674,
-  1674, 1737, 1737, 1800, 1800, 1863, 1863, 1926, 1926, 1989, 37,   37,   37,
-  100,  100,  163,  163,  226,  226,  289,  289,  352,  352,  415,  415,  478,
-  478,  541,  541,  604,  604,  667,  667,  730,  730,  793,  793,  856,  856,
-  919,  919,  982,  982,  1045, 1045, 1108, 1108, 1171, 1171, 1234, 1234, 1297,
-  1297, 1360, 1360, 1423, 1423, 1486, 1486, 1549, 1549, 1612, 1612, 1675, 1675,
-  1738, 1738, 1801, 1801, 1864, 1864, 1927, 1927, 1990, 38,   38,   38,   101,
-  101,  164,  164,  227,  227,  290,  290,  353,  353,  416,  416,  479,  479,
-  542,  542,  605,  605,  668,  668,  731,  731,  794,  794,  857,  857,  920,
-  920,  983,  983,  1046, 1046, 1109, 1109, 1172, 1172, 1235, 1235, 1298, 1298,
-  1361, 1361, 1424, 1424, 1487, 1487, 1550, 1550, 1613, 1613, 1676, 1676, 1739,
-  1739, 1802, 1802, 1865, 1865, 1928, 1928, 1991, 39,   39,   39,   102,  102,
-  165,  165,  228,  228,  291,  291,  354,  354,  417,  417,  480,  480,  543,
-  543,  606,  606,  669,  669,  732,  732,  795,  795,  858,  858,  921,  921,
-  984,  984,  1047, 1047, 1110, 1110, 1173, 1173, 1236, 1236, 1299, 1299, 1362,
-  1362, 1425, 1425, 1488, 1488, 1551, 1551, 1614, 1614, 1677, 1677, 1740, 1740,
-  1803, 1803, 1866, 1866, 1929, 1929, 1992, 40,   40,   40,   103,  103,  166,
-  166,  229,  229,  292,  292,  355,  355,  418,  418,  481,  481,  544,  544,
-  607,  607,  670,  670,  733,  733,  796,  796,  859,  859,  922,  922,  985,
-  985,  1048, 1048, 1111, 1111, 1174, 1174, 1237, 1237, 1300, 1300, 1363, 1363,
-  1426, 1426, 1489, 1489, 1552, 1552, 1615, 1615, 1678, 1678, 1741, 1741, 1804,
-  1804, 1867, 1867, 1930, 1930, 1993, 41,   41,   41,   104,  104,  167,  167,
-  230,  230,  293,  293,  356,  356,  419,  419,  482,  482,  545,  545,  608,
-  608,  671,  671,  734,  734,  797,  797,  860,  860,  923,  923,  986,  986,
-  1049, 1049, 1112, 1112, 1175, 1175, 1238, 1238, 1301, 1301, 1364, 1364, 1427,
-  1427, 1490, 1490, 1553, 1553, 1616, 1616, 1679, 1679, 1742, 1742, 1805, 1805,
-  1868, 1868, 1931, 1931, 1994, 42,   42,   42,   105,  105,  168,  168,  231,
-  231,  294,  294,  357,  357,  420,  420,  483,  483,  546,  546,  609,  609,
-  672,  672,  735,  735,  798,  798,  861,  861,  924,  924,  987,  987,  1050,
-  1050, 1113, 1113, 1176, 1176, 1239, 1239, 1302, 1302, 1365, 1365, 1428, 1428,
-  1491, 1491, 1554, 1554, 1617, 1617, 1680, 1680, 1743, 1743, 1806, 1806, 1869,
-  1869, 1932, 1932, 1995, 43,   43,   43,   106,  106,  169,  169,  232,  232,
-  295,  295,  358,  358,  421,  421,  484,  484,  547,  547,  610,  610,  673,
-  673,  736,  736,  799,  799,  862,  862,  925,  925,  988,  988,  1051, 1051,
-  1114, 1114, 1177, 1177, 1240, 1240, 1303, 1303, 1366, 1366, 1429, 1429, 1492,
-  1492, 1555, 1555, 1618, 1618, 1681, 1681, 1744, 1744, 1807, 1807, 1870, 1870,
-  1933, 1933, 1996, 44,   44,   44,   107,  107,  170,  170,  233,  233,  296,
-  296,  359,  359,  422,  422,  485,  485,  548,  548,  611,  611,  674,  674,
-  737,  737,  800,  800,  863,  863,  926,  926,  989,  989,  1052, 1052, 1115,
-  1115, 1178, 1178, 1241, 1241, 1304, 1304, 1367, 1367, 1430, 1430, 1493, 1493,
-  1556, 1556, 1619, 1619, 1682, 1682, 1745, 1745, 1808, 1808, 1871, 1871, 1934,
-  1934, 1997, 45,   45,   45,   108,  108,  171,  171,  234,  234,  297,  297,
-  360,  360,  423,  423,  486,  486,  549,  549,  612,  612,  675,  675,  738,
-  738,  801,  801,  864,  864,  927,  927,  990,  990,  1053, 1053, 1116, 1116,
-  1179, 1179, 1242, 1242, 1305, 1305, 1368, 1368, 1431, 1431, 1494, 1494, 1557,
-  1557, 1620, 1620, 1683, 1683, 1746, 1746, 1809, 1809, 1872, 1872, 1935, 1935,
-  1998, 46,   46,   46,   109,  109,  172,  172,  235,  235,  298,  298,  361,
-  361,  424,  424,  487,  487,  550,  550,  613,  613,  676,  676,  739,  739,
-  802,  802,  865,  865,  928,  928,  991,  991,  1054, 1054, 1117, 1117, 1180,
-  1180, 1243, 1243, 1306, 1306, 1369, 1369, 1432, 1432, 1495, 1495, 1558, 1558,
-  1621, 1621, 1684, 1684, 1747, 1747, 1810, 1810, 1873, 1873, 1936, 1936, 1999,
-  47,   47,   47,   110,  110,  173,  173,  236,  236,  299,  299,  362,  362,
-  425,  425,  488,  488,  551,  551,  614,  614,  677,  677,  740,  740,  803,
-  803,  866,  866,  929,  929,  992,  992,  1055, 1055, 1118, 1118, 1181, 1181,
-  1244, 1244, 1307, 1307, 1370, 1370, 1433, 1433, 1496, 1496, 1559, 1559, 1622,
-  1622, 1685, 1685, 1748, 1748, 1811, 1811, 1874, 1874, 1937, 1937, 2000, 48,
-  48,   48,   111,  111,  174,  174,  237,  237,  300,  300,  363,  363,  426,
-  426,  489,  489,  552,  552,  615,  615,  678,  678,  741,  741,  804,  804,
-  867,  867,  930,  930,  993,  993,  1056, 1056, 1119, 1119, 1182, 1182, 1245,
-  1245, 1308, 1308, 1371, 1371, 1434, 1434, 1497, 1497, 1560, 1560, 1623, 1623,
-  1686, 1686, 1749, 1749, 1812, 1812, 1875, 1875, 1938, 1938, 2001, 49,   49,
-  49,   112,  112,  175,  175,  238,  238,  301,  301,  364,  364,  427,  427,
-  490,  490,  553,  553,  616,  616,  679,  679,  742,  742,  805,  805,  868,
-  868,  931,  931,  994,  994,  1057, 1057, 1120, 1120, 1183, 1183, 1246, 1246,
-  1309, 1309, 1372, 1372, 1435, 1435, 1498, 1498, 1561, 1561, 1624, 1624, 1687,
-  1687, 1750, 1750, 1813, 1813, 1876, 1876, 1939, 1939, 2002, 50,   50,   50,
-  113,  113,  176,  176,  239,  239,  302,  302,  365,  365,  428,  428,  491,
-  491,  554,  554,  617,  617,  680,  680,  743,  743,  806,  806,  869,  869,
-  932,  932,  995,  995,  1058, 1058, 1121, 1121, 1184, 1184, 1247, 1247, 1310,
-  1310, 1373, 1373, 1436, 1436, 1499, 1499, 1562, 1562, 1625, 1625, 1688, 1688,
-  1751, 1751, 1814, 1814, 1877, 1877, 1940, 1940, 2003, 51,   51,   51,   114,
-  114,  177,  177,  240,  240,  303,  303,  366,  366,  429,  429,  492,  492,
-  555,  555,  618,  618,  681,  681,  744,  744,  807,  807,  870,  870,  933,
-  933,  996,  996,  1059, 1059, 1122, 1122, 1185, 1185, 1248, 1248, 1311, 1311,
-  1374, 1374, 1437, 1437, 1500, 1500, 1563, 1563, 1626, 1626, 1689, 1689, 1752,
-  1752, 1815, 1815, 1878, 1878, 1941, 1941, 2004, 52,   52,   52,   115,  115,
-  178,  178,  241,  241,  304,  304,  367,  367,  430,  430,  493,  493,  556,
-  556,  619,  619,  682,  682,  745,  745,  808,  808,  871,  871,  934,  934,
-  997,  997,  1060, 1060, 1123, 1123, 1186, 1186, 1249, 1249, 1312, 1312, 1375,
-  1375, 1438, 1438, 1501, 1501, 1564, 1564, 1627, 1627, 1690, 1690, 1753, 1753,
-  1816, 1816, 1879, 1879, 1942, 1942, 2005, 53,   53,   53,   116,  116,  179,
-  179,  242,  242,  305,  305,  368,  368,  431,  431,  494,  494,  557,  557,
-  620,  620,  683,  683,  746,  746,  809,  809,  872,  872,  935,  935,  998,
-  998,  1061, 1061, 1124, 1124, 1187, 1187, 1250, 1250, 1313, 1313, 1376, 1376,
-  1439, 1439, 1502, 1502, 1565, 1565, 1628, 1628, 1691, 1691, 1754, 1754, 1817,
-  1817, 1880, 1880, 1943, 1943, 2006, 54,   54,   54,   117,  117,  180,  180,
-  243,  243,  306,  306,  369,  369,  432,  432,  495,  495,  558,  558,  621,
-  621,  684,  684,  747,  747,  810,  810,  873,  873,  936,  936,  999,  999,
-  1062, 1062, 1125, 1125, 1188, 1188, 1251, 1251, 1314, 1314, 1377, 1377, 1440,
-  1440, 1503, 1503, 1566, 1566, 1629, 1629, 1692, 1692, 1755, 1755, 1818, 1818,
-  1881, 1881, 1944, 1944, 2007, 55,   55,   55,   118,  118,  181,  181,  244,
-  244,  307,  307,  370,  370,  433,  433,  496,  496,  559,  559,  622,  622,
-  685,  685,  748,  748,  811,  811,  874,  874,  937,  937,  1000, 1000, 1063,
-  1063, 1126, 1126, 1189, 1189, 1252, 1252, 1315, 1315, 1378, 1378, 1441, 1441,
-  1504, 1504, 1567, 1567, 1630, 1630, 1693, 1693, 1756, 1756, 1819, 1819, 1882,
-  1882, 1945, 1945, 2008, 56,   56,   56,   119,  119,  182,  182,  245,  245,
-  308,  308,  371,  371,  434,  434,  497,  497,  560,  560,  623,  623,  686,
-  686,  749,  749,  812,  812,  875,  875,  938,  938,  1001, 1001, 1064, 1064,
-  1127, 1127, 1190, 1190, 1253, 1253, 1316, 1316, 1379, 1379, 1442, 1442, 1505,
-  1505, 1568, 1568, 1631, 1631, 1694, 1694, 1757, 1757, 1820, 1820, 1883, 1883,
-  1946, 1946, 2009, 57,   57,   57,   120,  120,  183,  183,  246,  246,  309,
-  309,  372,  372,  435,  435,  498,  498,  561,  561,  624,  624,  687,  687,
-  750,  750,  813,  813,  876,  876,  939,  939,  1002, 1002, 1065, 1065, 1128,
-  1128, 1191, 1191, 1254, 1254, 1317, 1317, 1380, 1380, 1443, 1443, 1506, 1506,
-  1569, 1569, 1632, 1632, 1695, 1695, 1758, 1758, 1821, 1821, 1884, 1884, 1947,
-  1947, 2010, 58,   58,   58,   121,  121,  184,  184,  247,  247,  310,  310,
-  373,  373,  436,  436,  499,  499,  562,  562,  625,  625,  688,  688,  751,
-  751,  814,  814,  877,  877,  940,  940,  1003, 1003, 1066, 1066, 1129, 1129,
-  1192, 1192, 1255, 1255, 1318, 1318, 1381, 1381, 1444, 1444, 1507, 1507, 1570,
-  1570, 1633, 1633, 1696, 1696, 1759, 1759, 1822, 1822, 1885, 1885, 1948, 1948,
-  2011, 59,   59,   59,   122,  122,  185,  185,  248,  248,  311,  311,  374,
-  374,  437,  437,  500,  500,  563,  563,  626,  626,  689,  689,  752,  752,
-  815,  815,  878,  878,  941,  941,  1004, 1004, 1067, 1067, 1130, 1130, 1193,
-  1193, 1256, 1256, 1319, 1319, 1382, 1382, 1445, 1445, 1508, 1508, 1571, 1571,
-  1634, 1634, 1697, 1697, 1760, 1760, 1823, 1823, 1886, 1886, 1949, 1949, 2012,
-  60,   60,   60,   123,  123,  186,  186,  249,  249,  312,  312,  375,  375,
-  438,  438,  501,  501,  564,  564,  627,  627,  690,  690,  753,  753,  816,
-  816,  879,  879,  942,  942,  1005, 1005, 1068, 1068, 1131, 1131, 1194, 1194,
-  1257, 1257, 1320, 1320, 1383, 1383, 1446, 1446, 1509, 1509, 1572, 1572, 1635,
-  1635, 1698, 1698, 1761, 1761, 1824, 1824, 1887, 1887, 1950, 1950, 2013, 61,
-  61,   61,   124,  124,  187,  187,  250,  250,  313,  313,  376,  376,  439,
-  439,  502,  502,  565,  565,  628,  628,  691,  691,  754,  754,  817,  817,
-  880,  880,  943,  943,  1006, 1006, 1069, 1069, 1132, 1132, 1195, 1195, 1258,
-  1258, 1321, 1321, 1384, 1384, 1447, 1447, 1510, 1510, 1573, 1573, 1636, 1636,
-  1699, 1699, 1762, 1762, 1825, 1825, 1888, 1888, 1951, 1951, 2014, 62,   62,
-  62,   125,  125,  188,  188,  251,  251,  314,  314,  377,  377,  440,  440,
-  503,  503,  566,  566,  629,  629,  692,  692,  755,  755,  818,  818,  881,
-  881,  944,  944,  1007, 1007, 1070, 1070, 1133, 1133, 1196, 1196, 1259, 1259,
-  1322, 1322, 1385, 1385, 1448, 1448, 1511, 1511, 1574, 1574, 1637, 1637, 1700,
-  1700, 1763, 1763, 1826, 1826, 1889, 1889, 1952, 1952, 2015, 63,   126,  126,
-  189,  189,  252,  252,  315,  315,  378,  378,  441,  441,  504,  504,  567,
-  567,  630,  630,  693,  693,  756,  756,  819,  819,  882,  882,  945,  945,
-  1008, 1008, 1071, 1071, 1134, 1134, 1197, 1197, 1260, 1260, 1323, 1323, 1386,
-  1386, 1449, 1449, 1512, 1512, 1575, 1575, 1638, 1638, 1701, 1701, 1764, 1764,
-  1827, 1827, 1890, 1890, 1953, 1953, 2016, 127,  190,  190,  253,  253,  316,
-  316,  379,  379,  442,  442,  505,  505,  568,  568,  631,  631,  694,  694,
-  757,  757,  820,  820,  883,  883,  946,  946,  1009, 1009, 1072, 1072, 1135,
-  1135, 1198, 1198, 1261, 1261, 1324, 1324, 1387, 1387, 1450, 1450, 1513, 1513,
-  1576, 1576, 1639, 1639, 1702, 1702, 1765, 1765, 1828, 1828, 1891, 1891, 1954,
-  1954, 2017, 191,  254,  254,  317,  317,  380,  380,  443,  443,  506,  506,
-  569,  569,  632,  632,  695,  695,  758,  758,  821,  821,  884,  884,  947,
-  947,  1010, 1010, 1073, 1073, 1136, 1136, 1199, 1199, 1262, 1262, 1325, 1325,
-  1388, 1388, 1451, 1451, 1514, 1514, 1577, 1577, 1640, 1640, 1703, 1703, 1766,
-  1766, 1829, 1829, 1892, 1892, 1955, 1955, 2018, 255,  318,  318,  381,  381,
-  444,  444,  507,  507,  570,  570,  633,  633,  696,  696,  759,  759,  822,
-  822,  885,  885,  948,  948,  1011, 1011, 1074, 1074, 1137, 1137, 1200, 1200,
-  1263, 1263, 1326, 1326, 1389, 1389, 1452, 1452, 1515, 1515, 1578, 1578, 1641,
-  1641, 1704, 1704, 1767, 1767, 1830, 1830, 1893, 1893, 1956, 1956, 2019, 319,
-  382,  382,  445,  445,  508,  508,  571,  571,  634,  634,  697,  697,  760,
-  760,  823,  823,  886,  886,  949,  949,  1012, 1012, 1075, 1075, 1138, 1138,
-  1201, 1201, 1264, 1264, 1327, 1327, 1390, 1390, 1453, 1453, 1516, 1516, 1579,
-  1579, 1642, 1642, 1705, 1705, 1768, 1768, 1831, 1831, 1894, 1894, 1957, 1957,
-  2020, 383,  446,  446,  509,  509,  572,  572,  635,  635,  698,  698,  761,
-  761,  824,  824,  887,  887,  950,  950,  1013, 1013, 1076, 1076, 1139, 1139,
-  1202, 1202, 1265, 1265, 1328, 1328, 1391, 1391, 1454, 1454, 1517, 1517, 1580,
-  1580, 1643, 1643, 1706, 1706, 1769, 1769, 1832, 1832, 1895, 1895, 1958, 1958,
-  2021, 447,  510,  510,  573,  573,  636,  636,  699,  699,  762,  762,  825,
-  825,  888,  888,  951,  951,  1014, 1014, 1077, 1077, 1140, 1140, 1203, 1203,
-  1266, 1266, 1329, 1329, 1392, 1392, 1455, 1455, 1518, 1518, 1581, 1581, 1644,
-  1644, 1707, 1707, 1770, 1770, 1833, 1833, 1896, 1896, 1959, 1959, 2022, 511,
-  574,  574,  637,  637,  700,  700,  763,  763,  826,  826,  889,  889,  952,
-  952,  1015, 1015, 1078, 1078, 1141, 1141, 1204, 1204, 1267, 1267, 1330, 1330,
-  1393, 1393, 1456, 1456, 1519, 1519, 1582, 1582, 1645, 1645, 1708, 1708, 1771,
-  1771, 1834, 1834, 1897, 1897, 1960, 1960, 2023, 575,  638,  638,  701,  701,
-  764,  764,  827,  827,  890,  890,  953,  953,  1016, 1016, 1079, 1079, 1142,
-  1142, 1205, 1205, 1268, 1268, 1331, 1331, 1394, 1394, 1457, 1457, 1520, 1520,
-  1583, 1583, 1646, 1646, 1709, 1709, 1772, 1772, 1835, 1835, 1898, 1898, 1961,
-  1961, 2024, 639,  702,  702,  765,  765,  828,  828,  891,  891,  954,  954,
-  1017, 1017, 1080, 1080, 1143, 1143, 1206, 1206, 1269, 1269, 1332, 1332, 1395,
-  1395, 1458, 1458, 1521, 1521, 1584, 1584, 1647, 1647, 1710, 1710, 1773, 1773,
-  1836, 1836, 1899, 1899, 1962, 1962, 2025, 703,  766,  766,  829,  829,  892,
-  892,  955,  955,  1018, 1018, 1081, 1081, 1144, 1144, 1207, 1207, 1270, 1270,
-  1333, 1333, 1396, 1396, 1459, 1459, 1522, 1522, 1585, 1585, 1648, 1648, 1711,
-  1711, 1774, 1774, 1837, 1837, 1900, 1900, 1963, 1963, 2026, 767,  830,  830,
-  893,  893,  956,  956,  1019, 1019, 1082, 1082, 1145, 1145, 1208, 1208, 1271,
-  1271, 1334, 1334, 1397, 1397, 1460, 1460, 1523, 1523, 1586, 1586, 1649, 1649,
-  1712, 1712, 1775, 1775, 1838, 1838, 1901, 1901, 1964, 1964, 2027, 831,  894,
-  894,  957,  957,  1020, 1020, 1083, 1083, 1146, 1146, 1209, 1209, 1272, 1272,
-  1335, 1335, 1398, 1398, 1461, 1461, 1524, 1524, 1587, 1587, 1650, 1650, 1713,
-  1713, 1776, 1776, 1839, 1839, 1902, 1902, 1965, 1965, 2028, 895,  958,  958,
-  1021, 1021, 1084, 1084, 1147, 1147, 1210, 1210, 1273, 1273, 1336, 1336, 1399,
-  1399, 1462, 1462, 1525, 1525, 1588, 1588, 1651, 1651, 1714, 1714, 1777, 1777,
-  1840, 1840, 1903, 1903, 1966, 1966, 2029, 959,  1022, 1022, 1085, 1085, 1148,
-  1148, 1211, 1211, 1274, 1274, 1337, 1337, 1400, 1400, 1463, 1463, 1526, 1526,
-  1589, 1589, 1652, 1652, 1715, 1715, 1778, 1778, 1841, 1841, 1904, 1904, 1967,
-  1967, 2030, 1023, 1086, 1086, 1149, 1149, 1212, 1212, 1275, 1275, 1338, 1338,
-  1401, 1401, 1464, 1464, 1527, 1527, 1590, 1590, 1653, 1653, 1716, 1716, 1779,
-  1779, 1842, 1842, 1905, 1905, 1968, 1968, 2031, 1087, 1150, 1150, 1213, 1213,
-  1276, 1276, 1339, 1339, 1402, 1402, 1465, 1465, 1528, 1528, 1591, 1591, 1654,
-  1654, 1717, 1717, 1780, 1780, 1843, 1843, 1906, 1906, 1969, 1969, 2032, 1151,
-  1214, 1214, 1277, 1277, 1340, 1340, 1403, 1403, 1466, 1466, 1529, 1529, 1592,
-  1592, 1655, 1655, 1718, 1718, 1781, 1781, 1844, 1844, 1907, 1907, 1970, 1970,
-  2033, 1215, 1278, 1278, 1341, 1341, 1404, 1404, 1467, 1467, 1530, 1530, 1593,
-  1593, 1656, 1656, 1719, 1719, 1782, 1782, 1845, 1845, 1908, 1908, 1971, 1971,
-  2034, 1279, 1342, 1342, 1405, 1405, 1468, 1468, 1531, 1531, 1594, 1594, 1657,
-  1657, 1720, 1720, 1783, 1783, 1846, 1846, 1909, 1909, 1972, 1972, 2035, 1343,
-  1406, 1406, 1469, 1469, 1532, 1532, 1595, 1595, 1658, 1658, 1721, 1721, 1784,
-  1784, 1847, 1847, 1910, 1910, 1973, 1973, 2036, 1407, 1470, 1470, 1533, 1533,
-  1596, 1596, 1659, 1659, 1722, 1722, 1785, 1785, 1848, 1848, 1911, 1911, 1974,
-  1974, 2037, 1471, 1534, 1534, 1597, 1597, 1660, 1660, 1723, 1723, 1786, 1786,
-  1849, 1849, 1912, 1912, 1975, 1975, 2038, 1535, 1598, 1598, 1661, 1661, 1724,
-  1724, 1787, 1787, 1850, 1850, 1913, 1913, 1976, 1976, 2039, 1599, 1662, 1662,
-  1725, 1725, 1788, 1788, 1851, 1851, 1914, 1914, 1977, 1977, 2040, 1663, 1726,
-  1726, 1789, 1789, 1852, 1852, 1915, 1915, 1978, 1978, 2041, 1727, 1790, 1790,
-  1853, 1853, 1916, 1916, 1979, 1979, 2042, 1791, 1854, 1854, 1917, 1917, 1980,
-  1980, 2043, 1855, 1918, 1918, 1981, 1981, 2044, 1919, 1982, 1982, 2045, 1983,
-  2046, 0,    0
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_64x64_neighbors[4097 * MAX_NEIGHBORS]) = {
-  0,    0,    0,    0,    0,    0,    1,    64,   1,    1,    64,   64,   2,
-  65,   65,   128,  66,   129,  2,    2,    128,  128,  3,    66,   129,  192,
-  67,   130,  130,  193,  3,    3,    192,  192,  4,    67,   193,  256,  131,
-  194,  68,   131,  194,  257,  4,    4,    132,  195,  195,  258,  256,  256,
-  5,    68,   257,  320,  69,   132,  258,  321,  196,  259,  133,  196,  259,
-  322,  5,    5,    320,  320,  6,    69,   321,  384,  70,   133,  322,  385,
-  197,  260,  260,  323,  134,  197,  323,  386,  6,    6,    384,  384,  7,
-  70,   261,  324,  385,  448,  198,  261,  324,  387,  71,   134,  386,  449,
-  135,  198,  387,  450,  262,  325,  325,  388,  7,    7,    448,  448,  8,
-  71,   199,  262,  388,  451,  449,  512,  72,   135,  450,  513,  326,  389,
-  136,  199,  451,  514,  263,  326,  389,  452,  200,  263,  452,  515,  8,
-  8,    512,  512,  9,    72,   513,  576,  73,   136,  327,  390,  390,  453,
-  514,  577,  264,  327,  453,  516,  137,  200,  515,  578,  201,  264,  516,
-  579,  391,  454,  9,    9,    328,  391,  454,  517,  576,  576,  10,   73,
-  577,  640,  74,   137,  578,  641,  265,  328,  517,  580,  138,  201,  579,
-  642,  392,  455,  455,  518,  202,  265,  580,  643,  329,  392,  518,  581,
-  10,   10,   640,  640,  11,   74,   641,  704,  75,   138,  266,  329,  581,
-  644,  642,  705,  456,  519,  139,  202,  393,  456,  519,  582,  643,  706,
-  330,  393,  582,  645,  203,  266,  644,  707,  11,   11,   704,  704,  12,
-  75,   457,  520,  520,  583,  705,  768,  267,  330,  645,  708,  76,   139,
-  706,  769,  394,  457,  583,  646,  140,  203,  707,  770,  331,  394,  646,
-  709,  204,  267,  708,  771,  521,  584,  458,  521,  584,  647,  12,   12,
-  268,  331,  709,  772,  768,  768,  13,   76,   395,  458,  647,  710,  769,
-  832,  77,   140,  770,  833,  141,  204,  771,  834,  332,  395,  710,  773,
-  522,  585,  585,  648,  205,  268,  459,  522,  648,  711,  772,  835,  396,
-  459,  711,  774,  269,  332,  773,  836,  13,   13,   832,  832,  14,   77,
-  833,  896,  78,   141,  586,  649,  834,  897,  523,  586,  649,  712,  142,
-  205,  333,  396,  774,  837,  835,  898,  460,  523,  712,  775,  206,  269,
-  836,  899,  397,  460,  775,  838,  270,  333,  587,  650,  650,  713,  837,
-  900,  14,   14,   524,  587,  713,  776,  896,  896,  15,   78,   897,  960,
-  79,   142,  898,  961,  334,  397,  838,  901,  461,  524,  776,  839,  143,
-  206,  899,  962,  207,  270,  900,  963,  651,  714,  588,  651,  714,  777,
-  398,  461,  839,  902,  271,  334,  525,  588,  777,  840,  901,  964,  15,
-  15,   960,  960,  16,   79,   961,  1024, 80,   143,  462,  525,  840,  903,
-  962,  1025, 335,  398,  902,  965,  144,  207,  652,  715,  715,  778,  963,
-  1026, 589,  652,  778,  841,  208,  271,  964,  1027, 399,  462,  903,  966,
-  526,  589,  841,  904,  272,  335,  965,  1028, 716,  779,  16,   16,   463,
-  526,  904,  967,  1024, 1024, 17,   80,   653,  716,  779,  842,  1025, 1088,
-  336,  399,  966,  1029, 81,   144,  1026, 1089, 590,  653,  842,  905,  145,
-  208,  1027, 1090, 209,  272,  400,  463,  967,  1030, 1028, 1091, 527,  590,
-  905,  968,  717,  780,  780,  843,  273,  336,  1029, 1092, 654,  717,  843,
-  906,  464,  527,  968,  1031, 17,   17,   1088, 1088, 18,   81,   337,  400,
-  591,  654,  906,  969,  1030, 1093, 1089, 1152, 82,   145,  1090, 1153, 146,
-  209,  1091, 1154, 528,  591,  969,  1032, 401,  464,  781,  844,  1031, 1094,
-  210,  273,  718,  781,  844,  907,  1092, 1155, 655,  718,  907,  970,  274,
-  337,  1093, 1156, 465,  528,  1032, 1095, 592,  655,  970,  1033, 338,  401,
-  1094, 1157, 18,   18,   1152, 1152, 19,   82,   1153, 1216, 83,   146,  782,
-  845,  845,  908,  1154, 1217, 719,  782,  908,  971,  147,  210,  529,  592,
-  1033, 1096, 1155, 1218, 402,  465,  1095, 1158, 211,  274,  656,  719,  971,
-  1034, 1156, 1219, 275,  338,  1157, 1220, 466,  529,  1096, 1159, 593,  656,
-  1034, 1097, 846,  909,  783,  846,  909,  972,  339,  402,  1158, 1221, 19,
-  19,   720,  783,  972,  1035, 1216, 1216, 20,   83,   1217, 1280, 84,   147,
-  1218, 1281, 530,  593,  1097, 1160, 148,  211,  1219, 1282, 403,  466,  657,
-  720,  1035, 1098, 1159, 1222, 212,  275,  1220, 1283, 847,  910,  910,  973,
-  594,  657,  1098, 1161, 276,  339,  467,  530,  784,  847,  973,  1036, 1160,
-  1223, 1221, 1284, 721,  784,  1036, 1099, 340,  403,  1222, 1285, 20,   20,
-  1280, 1280, 21,   84,   531,  594,  1161, 1224, 1281, 1344, 85,   148,  658,
-  721,  1099, 1162, 1282, 1345, 404,  467,  1223, 1286, 149,  212,  911,  974,
-  1283, 1346, 848,  911,  974,  1037, 213,  276,  1284, 1347, 785,  848,  1037,
-  1100, 595,  658,  1162, 1225, 468,  531,  1224, 1287, 277,  340,  1285, 1348,
-  722,  785,  1100, 1163, 341,  404,  1286, 1349, 532,  595,  912,  975,  975,
-  1038, 1225, 1288, 659,  722,  1163, 1226, 21,   21,   1344, 1344, 22,   85,
-  849,  912,  1038, 1101, 1345, 1408, 86,   149,  1346, 1409, 405,  468,  1287,
-  1350, 150,  213,  786,  849,  1101, 1164, 1347, 1410, 214,  277,  596,  659,
-  1226, 1289, 1348, 1411, 469,  532,  723,  786,  1164, 1227, 1288, 1351, 278,
-  341,  1349, 1412, 976,  1039, 913,  976,  1039, 1102, 342,  405,  850,  913,
-  1102, 1165, 1350, 1413, 660,  723,  1227, 1290, 533,  596,  1289, 1352, 22,
-  22,   1408, 1408, 23,   86,   787,  850,  1165, 1228, 1409, 1472, 87,   150,
-  406,  469,  1351, 1414, 1410, 1473, 151,  214,  1411, 1474, 597,  660,  1290,
-  1353, 724,  787,  1228, 1291, 215,  278,  977,  1040, 1040, 1103, 1412, 1475,
-  470,  533,  1352, 1415, 914,  977,  1103, 1166, 279,  342,  1413, 1476, 851,
-  914,  1166, 1229, 661,  724,  1291, 1354, 343,  406,  534,  597,  1353, 1416,
-  1414, 1477, 788,  851,  1229, 1292, 23,   23,   1472, 1472, 24,   87,   1473,
-  1536, 407,  470,  1041, 1104, 1415, 1478, 88,   151,  978,  1041, 1104, 1167,
-  1474, 1537, 598,  661,  1354, 1417, 152,  215,  725,  788,  1292, 1355, 1475,
-  1538, 915,  978,  1167, 1230, 216,  279,  1476, 1539, 471,  534,  1416, 1479,
-  852,  915,  1230, 1293, 280,  343,  1477, 1540, 662,  725,  1355, 1418, 535,
-  598,  789,  852,  1293, 1356, 1417, 1480, 344,  407,  1478, 1541, 1042, 1105,
-  1105, 1168, 979,  1042, 1168, 1231, 24,   24,   408,  471,  916,  979,  1231,
-  1294, 1479, 1542, 1536, 1536, 25,   88,   1537, 1600, 726,  789,  1356, 1419,
-  89,   152,  599,  662,  1418, 1481, 1538, 1601, 153,  216,  1539, 1602, 853,
-  916,  1294, 1357, 472,  535,  1480, 1543, 217,  280,  1540, 1603, 1106, 1169,
-  281,  344,  663,  726,  1043, 1106, 1169, 1232, 1419, 1482, 1541, 1604, 790,
-  853,  1357, 1420, 980,  1043, 1232, 1295, 536,  599,  1481, 1544, 345,  408,
-  1542, 1605, 917,  980,  1295, 1358, 727,  790,  1420, 1483, 409,  472,  1543,
-  1606, 25,   25,   600,  663,  1482, 1545, 1600, 1600, 26,   89,   1601, 1664,
-  90,   153,  854,  917,  1358, 1421, 1602, 1665, 154,  217,  1107, 1170, 1170,
-  1233, 1603, 1666, 473,  536,  1044, 1107, 1233, 1296, 1544, 1607, 218,  281,
-  1604, 1667, 664,  727,  981,  1044, 1296, 1359, 1483, 1546, 791,  854,  1421,
-  1484, 282,  345,  1605, 1668, 537,  600,  1545, 1608, 918,  981,  1359, 1422,
-  346,  409,  1606, 1669, 728,  791,  1484, 1547, 1171, 1234, 1108, 1171, 1234,
-  1297, 410,  473,  601,  664,  855,  918,  1422, 1485, 1546, 1609, 1607, 1670,
-  26,   26,   1664, 1664, 27,   90,   1045, 1108, 1297, 1360, 1665, 1728, 91,
-  154,  1666, 1729, 155,  218,  1667, 1730, 474,  537,  982,  1045, 1360, 1423,
-  1608, 1671, 219,  282,  792,  855,  1485, 1548, 1668, 1731, 665,  728,  1547,
-  1610, 283,  346,  919,  982,  1423, 1486, 1669, 1732, 538,  601,  1609, 1672,
-  1172, 1235, 1235, 1298, 347,  410,  1109, 1172, 1298, 1361, 1670, 1733, 729,
-  792,  1548, 1611, 856,  919,  1486, 1549, 1046, 1109, 1361, 1424, 602,  665,
-  1610, 1673, 411,  474,  1671, 1734, 27,   27,   1728, 1728, 28,   91,   983,
-  1046, 1424, 1487, 1729, 1792, 92,   155,  1730, 1793, 156,  219,  475,  538,
-  1672, 1735, 1731, 1794, 793,  856,  1549, 1612, 666,  729,  1611, 1674, 220,
-  283,  1236, 1299, 1732, 1795, 920,  983,  1487, 1550, 1173, 1236, 1299, 1362,
-  1110, 1173, 1362, 1425, 284,  347,  1733, 1796, 539,  602,  1673, 1736, 1047,
-  1110, 1425, 1488, 348,  411,  730,  793,  1612, 1675, 1734, 1797, 857,  920,
-  1550, 1613, 603,  666,  1674, 1737, 984,  1047, 1488, 1551, 412,  475,  1735,
-  1798, 28,   28,   1237, 1300, 1300, 1363, 1792, 1792, 29,   92,   1793, 1856,
-  93,   156,  794,  857,  1174, 1237, 1363, 1426, 1613, 1676, 1794, 1857, 476,
-  539,  1736, 1799, 157,  220,  667,  730,  921,  984,  1551, 1614, 1675, 1738,
-  1795, 1858, 1111, 1174, 1426, 1489, 221,  284,  1796, 1859, 540,  603,  1048,
-  1111, 1489, 1552, 1737, 1800, 285,  348,  1797, 1860, 858,  921,  1614, 1677,
-  731,  794,  1676, 1739, 349,  412,  1798, 1861, 985,  1048, 1552, 1615, 1301,
-  1364, 604,  667,  1238, 1301, 1364, 1427, 1738, 1801, 413,  476,  1175, 1238,
-  1427, 1490, 1799, 1862, 795,  858,  1677, 1740, 29,   29,   1112, 1175, 1490,
-  1553, 1856, 1856, 30,   93,   922,  985,  1615, 1678, 1857, 1920, 94,   157,
-  1858, 1921, 477,  540,  668,  731,  1739, 1802, 1800, 1863, 158,  221,  1859,
-  1922, 1049, 1112, 1553, 1616, 222,  285,  1860, 1923, 541,  604,  1801, 1864,
-  286,  349,  859,  922,  1302, 1365, 1365, 1428, 1678, 1741, 1861, 1924, 732,
-  795,  1740, 1803, 1239, 1302, 1428, 1491, 986,  1049, 1616, 1679, 350,  413,
-  1862, 1925, 1176, 1239, 1491, 1554, 605,  668,  1802, 1865, 414,  477,  1113,
-  1176, 1554, 1617, 1863, 1926, 796,  859,  1741, 1804, 923,  986,  1679, 1742,
-  30,   30,   1920, 1920, 31,   94,   669,  732,  1803, 1866, 1921, 1984, 478,
-  541,  1864, 1927, 95,   158,  1050, 1113, 1617, 1680, 1922, 1985, 1366, 1429,
-  159,  222,  1303, 1366, 1429, 1492, 1923, 1986, 1240, 1303, 1492, 1555, 223,
-  286,  1924, 1987, 860,  923,  1742, 1805, 542,  605,  1865, 1928, 733,  796,
-  987,  1050, 1680, 1743, 1804, 1867, 287,  350,  1177, 1240, 1555, 1618, 1925,
-  1988, 351,  414,  1926, 1989, 606,  669,  1114, 1177, 1618, 1681, 1866, 1929,
-  924,  987,  1743, 1806, 415,  478,  797,  860,  1805, 1868, 1927, 1990, 1367,
-  1430, 1430, 1493, 1304, 1367, 1493, 1556, 1051, 1114, 1681, 1744, 670,  733,
-  1867, 1930, 31,   31,   1984, 1984, 32,   95,   479,  542,  1241, 1304, 1556,
-  1619, 1928, 1991, 1985, 2048, 96,   159,  1986, 2049, 160,  223,  1987, 2050,
-  861,  924,  1178, 1241, 1619, 1682, 1806, 1869, 224,  287,  988,  1051, 1744,
-  1807, 1988, 2051, 543,  606,  1929, 1992, 734,  797,  1868, 1931, 288,  351,
-  1989, 2052, 1115, 1178, 1682, 1745, 1431, 1494, 352,  415,  1368, 1431, 1494,
-  1557, 1990, 2053, 607,  670,  1930, 1993, 925,  988,  1305, 1368, 1557, 1620,
-  1807, 1870, 798,  861,  1869, 1932, 416,  479,  1052, 1115, 1745, 1808, 1991,
-  2054, 1242, 1305, 1620, 1683, 671,  734,  1931, 1994, 480,  543,  1992, 2055,
-  32,   32,   2048, 2048, 33,   96,   1179, 1242, 1683, 1746, 2049, 2112, 97,
-  160,  2050, 2113, 862,  925,  1870, 1933, 989,  1052, 1808, 1871, 161,  224,
-  2051, 2114, 225,  288,  544,  607,  735,  798,  1432, 1495, 1495, 1558, 1932,
-  1995, 1993, 2056, 2052, 2115, 1116, 1179, 1746, 1809, 1369, 1432, 1558, 1621,
-  289,  352,  2053, 2116, 1306, 1369, 1621, 1684, 608,  671,  1994, 2057, 353,
-  416,  926,  989,  1871, 1934, 2054, 2117, 1243, 1306, 1684, 1747, 799,  862,
-  1053, 1116, 1809, 1872, 1933, 1996, 417,  480,  2055, 2118, 672,  735,  1180,
-  1243, 1747, 1810, 1995, 2058, 1496, 1559, 481,  544,  2056, 2119, 1433, 1496,
-  1559, 1622, 33,   33,   990,  1053, 1872, 1935, 2112, 2112, 34,   97,   863,
-  926,  1934, 1997, 2113, 2176, 98,   161,  1370, 1433, 1622, 1685, 2114, 2177,
-  162,  225,  1117, 1180, 1810, 1873, 2115, 2178, 736,  799,  1996, 2059, 545,
-  608,  1307, 1370, 1685, 1748, 2057, 2120, 226,  289,  2116, 2179, 290,  353,
-  2117, 2180, 1244, 1307, 1748, 1811, 927,  990,  1935, 1998, 609,  672,  1054,
-  1117, 1873, 1936, 2058, 2121, 354,  417,  2118, 2181, 800,  863,  1997, 2060,
-  1497, 1560, 1560, 1623, 1181, 1244, 1811, 1874, 418,  481,  1434, 1497, 1623,
-  1686, 2119, 2182, 673,  736,  2059, 2122, 1371, 1434, 1686, 1749, 991,  1054,
-  1936, 1999, 482,  545,  864,  927,  1998, 2061, 2120, 2183, 1118, 1181, 1874,
-  1937, 34,   34,   1308, 1371, 1749, 1812, 2176, 2176, 35,   98,   2177, 2240,
-  99,   162,  2178, 2241, 737,  800,  2060, 2123, 163,  226,  2179, 2242, 546,
-  609,  2121, 2184, 227,  290,  1245, 1308, 1812, 1875, 2180, 2243, 928,  991,
-  1999, 2062, 291,  354,  1055, 1118, 1561, 1624, 1937, 2000, 2181, 2244, 1498,
-  1561, 1624, 1687, 610,  673,  2122, 2185, 801,  864,  1435, 1498, 1687, 1750,
-  2061, 2124, 355,  418,  1182, 1245, 1875, 1938, 2182, 2245, 1372, 1435, 1750,
-  1813, 419,  482,  2183, 2246, 674,  737,  2123, 2186, 992,  1055, 2000, 2063,
-  1309, 1372, 1813, 1876, 865,  928,  1119, 1182, 1938, 2001, 2062, 2125, 483,
-  546,  2184, 2247, 35,   35,   2240, 2240, 36,   99,   2241, 2304, 100,  163,
-  738,  801,  1246, 1309, 1876, 1939, 2124, 2187, 2242, 2305, 1562, 1625, 1625,
-  1688, 164,  227,  1499, 1562, 1688, 1751, 2243, 2306, 547,  610,  2185, 2248,
-  228,  291,  2244, 2307, 1056, 1119, 1436, 1499, 1751, 1814, 2001, 2064, 929,
-  992,  2063, 2126, 292,  355,  2245, 2308, 1183, 1246, 1939, 2002, 611,  674,
-  802,  865,  1373, 1436, 1814, 1877, 2125, 2188, 2186, 2249, 356,  419,  2246,
-  2309, 1310, 1373, 1877, 1940, 420,  483,  993,  1056, 2064, 2127, 2247, 2310,
-  675,  738,  2187, 2250, 1120, 1183, 2002, 2065, 866,  929,  1626, 1689, 2126,
-  2189, 1563, 1626, 1689, 1752, 484,  547,  1500, 1563, 1752, 1815, 2248, 2311,
-  1247, 1310, 1940, 2003, 36,   36,   739,  802,  2188, 2251, 2304, 2304, 37,
-  100,  1437, 1500, 1815, 1878, 2305, 2368, 101,  164,  2306, 2369, 548,  611,
-  2249, 2312, 165,  228,  1057, 1120, 2065, 2128, 2307, 2370, 930,  993,  2127,
-  2190, 1374, 1437, 1878, 1941, 229,  292,  1184, 1247, 2003, 2066, 2308, 2371,
-  293,  356,  803,  866,  2189, 2252, 2309, 2372, 612,  675,  2250, 2313, 1311,
-  1374, 1941, 2004, 357,  420,  1627, 1690, 1690, 1753, 2310, 2373, 1564, 1627,
-  1753, 1816, 994,  1057, 2128, 2191, 1121, 1184, 2066, 2129, 676,  739,  1501,
-  1564, 1816, 1879, 2251, 2314, 421,  484,  2311, 2374, 867,  930,  2190, 2253,
-  1248, 1311, 2004, 2067, 1438, 1501, 1879, 1942, 485,  548,  2312, 2375, 740,
-  803,  2252, 2315, 37,   37,   2368, 2368, 38,   101,  1058, 1121, 1375, 1438,
-  1942, 2005, 2129, 2192, 2369, 2432, 102,  165,  2370, 2433, 549,  612,  931,
-  994,  1185, 1248, 2067, 2130, 2191, 2254, 2313, 2376, 166,  229,  2371, 2434,
-  1691, 1754, 230,  293,  1628, 1691, 1754, 1817, 2372, 2435, 804,  867,  1312,
-  1375, 2005, 2068, 2253, 2316, 1565, 1628, 1817, 1880, 294,  357,  613,  676,
-  2314, 2377, 2373, 2436, 1502, 1565, 1880, 1943, 358,  421,  1122, 1185, 2130,
-  2193, 2374, 2437, 995,  1058, 2192, 2255, 1249, 1312, 2068, 2131, 677,  740,
-  1439, 1502, 1943, 2006, 2315, 2378, 868,  931,  2254, 2317, 422,  485,  2375,
-  2438, 486,  549,  1376, 1439, 2006, 2069, 2376, 2439, 741,  804,  1692, 1755,
-  1755, 1818, 2316, 2379, 1059, 1122, 2193, 2256, 1186, 1249, 1629, 1692, 1818,
-  1881, 2131, 2194, 38,   38,   932,  995,  2255, 2318, 2432, 2432, 39,   102,
-  2433, 2496, 103,  166,  550,  613,  1566, 1629, 1881, 1944, 2377, 2440, 2434,
-  2497, 167,  230,  1313, 1376, 2069, 2132, 2435, 2498, 231,  294,  1503, 1566,
-  1944, 2007, 2436, 2499, 805,  868,  2317, 2380, 614,  677,  2378, 2441, 295,
-  358,  2437, 2500, 1123, 1186, 2194, 2257, 996,  1059, 2256, 2319, 1440, 1503,
-  2007, 2070, 1250, 1313, 2132, 2195, 359,  422,  2438, 2501, 678,  741,  869,
-  932,  2318, 2381, 2379, 2442, 1756, 1819, 423,  486,  1693, 1756, 1819, 1882,
-  2439, 2502, 1377, 1440, 2070, 2133, 1630, 1693, 1882, 1945, 487,  550,  1060,
-  1123, 2257, 2320, 2440, 2503, 1187, 1250, 1567, 1630, 1945, 2008, 2195, 2258,
-  742,  805,  2380, 2443, 933,  996,  2319, 2382, 1314, 1377, 2133, 2196, 39,
-  39,   1504, 1567, 2008, 2071, 2496, 2496, 40,   103,  2497, 2560, 551,  614,
-  2441, 2504, 104,  167,  2498, 2561, 168,  231,  2499, 2562, 806,  869,  2381,
-  2444, 232,  295,  2500, 2563, 1441, 1504, 2071, 2134, 1124, 1187, 2258, 2321,
-  615,  678,  2442, 2505, 296,  359,  997,  1060, 1251, 1314, 1757, 1820, 1820,
-  1883, 2196, 2259, 2320, 2383, 2501, 2564, 1694, 1757, 1883, 1946, 360,  423,
-  2502, 2565, 1631, 1694, 1946, 2009, 870,  933,  1378, 1441, 2134, 2197, 2382,
-  2445, 679,  742,  2443, 2506, 424,  487,  1568, 1631, 2009, 2072, 2503, 2566,
-  1188, 1251, 2259, 2322, 1061, 1124, 2321, 2384, 488,  551,  2504, 2567, 743,
-  806,  1505, 1568, 2072, 2135, 2444, 2507, 1315, 1378, 2197, 2260, 934,  997,
-  2383, 2446, 40,   40,   552,  615,  2505, 2568, 2560, 2560, 41,   104,  1821,
-  1884, 2561, 2624, 1758, 1821, 1884, 1947, 105,  168,  1442, 1505, 2135, 2198,
-  2562, 2625, 169,  232,  807,  870,  1695, 1758, 1947, 2010, 2445, 2508, 2563,
-  2626, 1125, 1188, 2322, 2385, 1252, 1315, 2260, 2323, 233,  296,  2564, 2627,
-  616,  679,  998,  1061, 1632, 1695, 2010, 2073, 2384, 2447, 2506, 2569, 297,
-  360,  2565, 2628, 1379, 1442, 2198, 2261, 1569, 1632, 2073, 2136, 361,  424,
-  871,  934,  2446, 2509, 2566, 2629, 680,  743,  2507, 2570, 425,  488,  1189,
-  1252, 2323, 2386, 2567, 2630, 1506, 1569, 2136, 2199, 1062, 1125, 2385, 2448,
-  1316, 1379, 2261, 2324, 1822, 1885, 1885, 1948, 744,  807,  2508, 2571, 489,
-  552,  1759, 1822, 1948, 2011, 2568, 2631, 935,  998,  2447, 2510, 1696, 1759,
-  2011, 2074, 1443, 1506, 2199, 2262, 553,  616,  2569, 2632, 41,   41,   2624,
-  2624, 42,   105,  1633, 1696, 2074, 2137, 2625, 2688, 106,  169,  1126, 1189,
-  2386, 2449, 2626, 2689, 808,  871,  1253, 1316, 2324, 2387, 2509, 2572, 170,
-  233,  2627, 2690, 999,  1062, 2448, 2511, 234,  297,  1380, 1443, 2262, 2325,
-  2628, 2691, 617,  680,  1570, 1633, 2137, 2200, 2570, 2633, 298,  361,  2629,
-  2692, 872,  935,  2510, 2573, 362,  425,  1886, 1949, 2630, 2693, 1507, 1570,
-  2200, 2263, 681,  744,  1823, 1886, 1949, 2012, 2571, 2634, 1190, 1253, 2387,
-  2450, 1760, 1823, 2012, 2075, 1063, 1126, 1317, 1380, 2325, 2388, 2449, 2512,
-  426,  489,  2631, 2694, 1697, 1760, 2075, 2138, 745,  808,  936,  999,  1444,
-  1507, 2263, 2326, 2511, 2574, 2572, 2635, 490,  553,  2632, 2695, 1634, 1697,
-  2138, 2201, 1254, 1317, 2388, 2451, 554,  617,  1127, 1190, 2450, 2513, 2633,
-  2696, 42,   42,   2688, 2688, 43,   106,  809,  872,  1571, 1634, 2201, 2264,
-  2573, 2636, 2689, 2752, 107,  170,  1381, 1444, 2326, 2389, 2690, 2753, 1000,
-  1063, 2512, 2575, 171,  234,  2691, 2754, 1887, 1950, 1950, 2013, 618,  681,
-  2634, 2697, 235,  298,  1824, 1887, 2013, 2076, 2692, 2755, 1508, 1571, 2264,
-  2327, 1761, 1824, 2076, 2139, 299,  362,  2693, 2756, 873,  936,  2574, 2637,
-  1191, 1254, 2451, 2514, 363,  426,  682,  745,  1318, 1381, 1698, 1761, 2139,
-  2202, 2389, 2452, 2635, 2698, 2694, 2757, 1064, 1127, 2513, 2576, 427,  490,
-  1445, 1508, 2327, 2390, 2695, 2758, 1635, 1698, 2202, 2265, 937,  1000, 2575,
-  2638, 746,  809,  2636, 2699, 491,  554,  2696, 2759, 1255, 1318, 1572, 1635,
-  2265, 2328, 2452, 2515, 1951, 2014, 1128, 1191, 1888, 1951, 2014, 2077, 2514,
-  2577, 1382, 1445, 2390, 2453, 555,  618,  1825, 1888, 2077, 2140, 2697, 2760,
-  810,  873,  2637, 2700, 43,   43,   2752, 2752, 44,   107,  1001, 1064, 2576,
-  2639, 2753, 2816, 108,  171,  1762, 1825, 2140, 2203, 2754, 2817, 172,  235,
-  1509, 1572, 2328, 2391, 2755, 2818, 619,  682,  2698, 2761, 236,  299,  2756,
-  2819, 1699, 1762, 2203, 2266, 874,  937,  2638, 2701, 300,  363,  1192, 1255,
-  2515, 2578, 2757, 2820, 1319, 1382, 2453, 2516, 683,  746,  1065, 1128, 2577,
-  2640, 2699, 2762, 364,  427,  1636, 1699, 2266, 2329, 2758, 2821, 1446, 1509,
-  2391, 2454, 428,  491,  1952, 2015, 2015, 2078, 2759, 2822, 938,  1001, 1889,
-  1952, 2078, 2141, 2639, 2702, 747,  810,  2700, 2763, 1573, 1636, 2329, 2392,
-  1826, 1889, 2141, 2204, 492,  555,  1256, 1319, 2516, 2579, 2760, 2823, 1129,
-  1192, 1383, 1446, 2454, 2517, 2578, 2641, 1763, 1826, 2204, 2267, 556,  619,
-  2761, 2824, 811,  874,  2701, 2764, 1002, 1065, 1510, 1573, 2392, 2455, 2640,
-  2703, 44,   44,   1700, 1763, 2267, 2330, 2816, 2816, 45,   108,  2817, 2880,
-  109,  172,  2818, 2881, 173,  236,  2819, 2882, 620,  683,  2762, 2825, 237,
-  300,  1320, 1383, 2517, 2580, 2820, 2883, 1193, 1256, 2579, 2642, 875,  938,
-  1637, 1700, 2330, 2393, 2702, 2765, 2016, 2079, 301,  364,  1447, 1510, 1953,
-  2016, 2079, 2142, 2455, 2518, 2821, 2884, 1066, 1129, 2641, 2704, 1890, 1953,
-  2142, 2205, 684,  747,  2763, 2826, 365,  428,  2822, 2885, 1827, 1890, 2205,
-  2268, 1574, 1637, 2393, 2456, 429,  492,  939,  1002, 2703, 2766, 2823, 2886,
-  748,  811,  1764, 1827, 2268, 2331, 2764, 2827, 1257, 1320, 2580, 2643, 1384,
-  1447, 2518, 2581, 1130, 1193, 2642, 2705, 493,  556,  2824, 2887, 1511, 1574,
-  2456, 2519, 1701, 1764, 2331, 2394, 812,  875,  1003, 1066, 2704, 2767, 2765,
-  2828, 557,  620,  2825, 2888, 2017, 2080, 2080, 2143, 45,   45,   2880, 2880,
-  46,   109,  1954, 2017, 2143, 2206, 2881, 2944, 110,  173,  1638, 1701, 2394,
-  2457, 2882, 2945, 1321, 1384, 2581, 2644, 174,  237,  621,  684,  1194, 1257,
-  1891, 1954, 2206, 2269, 2643, 2706, 2826, 2889, 2883, 2946, 1448, 1511, 2519,
-  2582, 238,  301,  876,  939,  2766, 2829, 2884, 2947, 1828, 1891, 2269, 2332,
-  1067, 1130, 2705, 2768, 302,  365,  2885, 2948, 685,  748,  1575, 1638, 2457,
-  2520, 2827, 2890, 366,  429,  2886, 2949, 1765, 1828, 2332, 2395, 940,  1003,
-  2767, 2830, 1258, 1321, 2644, 2707, 430,  493,  1385, 1448, 2582, 2645, 2887,
-  2950, 749,  812,  2828, 2891, 1131, 1194, 1702, 1765, 2395, 2458, 2706, 2769,
-  1512, 1575, 2520, 2583, 2081, 2144, 494,  557,  2018, 2081, 2144, 2207, 2888,
-  2951, 1955, 2018, 2207, 2270, 1004, 1067, 2768, 2831, 813,  876,  2829, 2892,
-  1892, 1955, 2270, 2333, 558,  621,  1639, 1702, 2458, 2521, 2889, 2952, 1322,
-  1385, 2645, 2708, 46,   46,   2944, 2944, 47,   110,  1195, 1258, 1449, 1512,
-  1829, 1892, 2333, 2396, 2583, 2646, 2707, 2770, 2945, 3008, 111,  174,  2946,
-  3009, 622,  685,  2890, 2953, 175,  238,  2947, 3010, 877,  940,  2830, 2893,
-  239,  302,  1068, 1131, 1576, 1639, 2521, 2584, 2769, 2832, 2948, 3011, 1766,
-  1829, 2396, 2459, 303,  366,  2949, 3012, 686,  749,  2891, 2954, 367,  430,
-  2082, 2145, 2145, 2208, 2950, 3013, 1386, 1449, 2646, 2709, 1259, 1322, 2019,
-  2082, 2208, 2271, 2708, 2771, 941,  1004, 1703, 1766, 2459, 2522, 2831, 2894,
-  1513, 1576, 1956, 2019, 2271, 2334, 2584, 2647, 431,  494,  2951, 3014, 750,
-  813,  1132, 1195, 2770, 2833, 2892, 2955, 1893, 1956, 2334, 2397, 495,  558,
-  2952, 3015, 1640, 1703, 2522, 2585, 1005, 1068, 2832, 2895, 814,  877,  1830,
-  1893, 2397, 2460, 2893, 2956, 559,  622,  1323, 1386, 2709, 2772, 2953, 3016,
-  1450, 1513, 2647, 2710, 1196, 1259, 2771, 2834, 47,   47,   3008, 3008, 48,
-  111,  1767, 1830, 2460, 2523, 3009, 3072, 1577, 1640, 2585, 2648, 112,  175,
-  3010, 3073, 623,  686,  2954, 3017, 878,  941,  2146, 2209, 2894, 2957, 176,
-  239,  3011, 3074, 1069, 1132, 2083, 2146, 2209, 2272, 2833, 2896, 240,  303,
-  2020, 2083, 2272, 2335, 3012, 3075, 304,  367,  1704, 1767, 2523, 2586, 3013,
-  3076, 687,  750,  1957, 2020, 2335, 2398, 2955, 3018, 1387, 1450, 2710, 2773,
-  1260, 1323, 2772, 2835, 368,  431,  1514, 1577, 2648, 2711, 3014, 3077, 942,
-  1005, 2895, 2958, 1894, 1957, 2398, 2461, 1133, 1196, 2834, 2897, 432,  495,
-  751,  814,  2956, 3019, 3015, 3078, 1641, 1704, 2586, 2649, 1831, 1894, 2461,
-  2524, 496,  559,  3016, 3079, 1006, 1069, 2896, 2959, 1324, 1387, 2773, 2836,
-  815,  878,  1451, 1514, 2711, 2774, 2957, 3020, 2147, 2210, 2210, 2273, 1768,
-  1831, 2524, 2587, 560,  623,  2084, 2147, 2273, 2336, 3017, 3080, 1197, 1260,
-  2835, 2898, 1578, 1641, 2649, 2712, 2021, 2084, 2336, 2399, 48,   48,   3072,
-  3072, 49,   112,  3073, 3136, 624,  687,  3018, 3081, 113,  176,  879,  942,
-  1070, 1133, 1958, 2021, 2399, 2462, 2897, 2960, 2958, 3021, 3074, 3137, 177,
-  240,  1705, 1768, 2587, 2650, 3075, 3138, 241,  304,  3076, 3139, 1388, 1451,
-  2774, 2837, 1895, 1958, 2462, 2525, 688,  751,  1261, 1324, 1515, 1578, 2712,
-  2775, 2836, 2899, 3019, 3082, 305,  368,  3077, 3140, 943,  1006, 2959, 3022,
-  369,  432,  3078, 3141, 1134, 1197, 1642, 1705, 2650, 2713, 2898, 2961, 1832,
-  1895, 2525, 2588, 752,  815,  3020, 3083, 433,  496,  2211, 2274, 3079, 3142,
-  2148, 2211, 2274, 2337, 2085, 2148, 2337, 2400, 497,  560,  1007, 1070, 1452,
-  1515, 1769, 1832, 2588, 2651, 2775, 2838, 2960, 3023, 3080, 3143, 1325, 1388,
-  2837, 2900, 2022, 2085, 2400, 2463, 816,  879,  3021, 3084, 1579, 1642, 2713,
-  2776, 1198, 1261, 2899, 2962, 561,  624,  1959, 2022, 2463, 2526, 3081, 3144,
-  1706, 1769, 2651, 2714, 1071, 1134, 2961, 3024, 49,   49,   880,  943,  1896,
-  1959, 2526, 2589, 3022, 3085, 3136, 3136, 50,   113,  625,  688,  3082, 3145,
-  3137, 3200, 114,  177,  3138, 3201, 178,  241,  1389, 1452, 2838, 2901, 3139,
-  3202, 1516, 1579, 2776, 2839, 242,  305,  1262, 1325, 2900, 2963, 3140, 3203,
-  2212, 2275, 2275, 2338, 689,  752,  1833, 1896, 2589, 2652, 3083, 3146, 306,
-  369,  1643, 1706, 2149, 2212, 2338, 2401, 2714, 2777, 3141, 3204, 944,  1007,
-  3023, 3086, 1135, 1198, 2086, 2149, 2401, 2464, 2962, 3025, 370,  433,  3142,
-  3205, 753,  816,  2023, 2086, 2464, 2527, 3084, 3147, 1770, 1833, 2652, 2715,
-  434,  497,  3143, 3206, 1453, 1516, 2839, 2902, 1326, 1389, 2901, 2964, 1008,
-  1071, 3024, 3087, 1580, 1643, 1960, 2023, 2527, 2590, 2777, 2840, 498,  561,
-  3144, 3207, 817,  880,  1199, 1262, 2963, 3026, 3085, 3148, 1707, 1770, 2715,
-  2778, 562,  625,  1897, 1960, 2590, 2653, 3145, 3208, 2276, 2339, 1072, 1135,
-  3025, 3088, 2213, 2276, 2339, 2402, 881,  944,  3086, 3149, 626,  689,  1390,
-  1453, 2150, 2213, 2402, 2465, 2902, 2965, 3146, 3209, 50,   50,   1517, 1580,
-  2840, 2903, 3200, 3200, 51,   114,  3201, 3264, 115,  178,  1834, 1897, 2653,
-  2716, 3202, 3265, 1263, 1326, 2964, 3027, 179,  242,  2087, 2150, 2465, 2528,
-  3203, 3266, 1644, 1707, 2778, 2841, 243,  306,  3204, 3267, 690,  753,  3147,
-  3210, 2024, 2087, 2528, 2591, 307,  370,  945,  1008, 3087, 3150, 3205, 3268,
-  1136, 1199, 3026, 3089, 1771, 1834, 2716, 2779, 371,  434,  3206, 3269, 1961,
-  2024, 2591, 2654, 754,  817,  3148, 3211, 1454, 1517, 2903, 2966, 435,  498,
-  1327, 1390, 1581, 1644, 2841, 2904, 2965, 3028, 3207, 3270, 1009, 1072, 3088,
-  3151, 1898, 1961, 2654, 2717, 499,  562,  1200, 1263, 1708, 1771, 2277, 2340,
-  2340, 2403, 2779, 2842, 3027, 3090, 3208, 3271, 818,  881,  2214, 2277, 2403,
-  2466, 3149, 3212, 2151, 2214, 2466, 2529, 563,  626,  3209, 3272, 2088, 2151,
-  2529, 2592, 1073, 1136, 1835, 1898, 2717, 2780, 3089, 3152, 1518, 1581, 2904,
-  2967, 1391, 1454, 2966, 3029, 882,  945,  3150, 3213, 627,  690,  1645, 1708,
-  2842, 2905, 3210, 3273, 51,   51,   1264, 1327, 3028, 3091, 3264, 3264, 52,
-  115,  2025, 2088, 2592, 2655, 3265, 3328, 116,  179,  3266, 3329, 180,  243,
-  3267, 3330, 244,  307,  1772, 1835, 2780, 2843, 3268, 3331, 691,  754,  3211,
-  3274, 946,  1009, 1137, 1200, 1962, 2025, 2655, 2718, 3090, 3153, 3151, 3214,
-  308,  371,  3269, 3332, 1455, 1518, 2341, 2404, 2967, 3030, 372,  435,  2278,
-  2341, 2404, 2467, 3270, 3333, 1582, 1645, 2905, 2968, 755,  818,  1328, 1391,
-  3029, 3092, 3212, 3275, 2215, 2278, 2467, 2530, 1899, 1962, 2718, 2781, 436,
-  499,  3271, 3334, 1709, 1772, 2843, 2906, 1010, 1073, 2152, 2215, 2530, 2593,
-  3152, 3215, 1201, 1264, 3091, 3154, 500,  563,  3272, 3335, 819,  882,  2089,
-  2152, 2593, 2656, 3213, 3276, 1836, 1899, 2781, 2844, 564,  627,  1519, 1582,
-  2968, 3031, 3273, 3336, 1392, 1455, 2026, 2089, 2656, 2719, 3030, 3093, 1074,
-  1137, 3153, 3216, 1646, 1709, 2906, 2969, 883,  946,  3214, 3277, 1265, 1328,
-  3092, 3155, 628,  691,  3274, 3337, 52,   52,   1773, 1836, 2844, 2907, 3328,
-  3328, 53,   116,  1963, 2026, 2719, 2782, 3329, 3392, 117,  180,  2342, 2405,
-  2405, 2468, 3330, 3393, 2279, 2342, 2468, 2531, 181,  244,  3331, 3394, 1138,
-  1201, 3154, 3217, 245,  308,  692,  755,  2216, 2279, 2531, 2594, 3275, 3338,
-  3332, 3395, 947,  1010, 3215, 3278, 1456, 1519, 3031, 3094, 309,  372,  1583,
-  1646, 2969, 3032, 3333, 3396, 1900, 1963, 2782, 2845, 2153, 2216, 2594, 2657,
-  1329, 1392, 3093, 3156, 373,  436,  1710, 1773, 2907, 2970, 3334, 3397, 756,
-  819,  3276, 3339, 2090, 2153, 2657, 2720, 1011, 1074, 3216, 3279, 437,  500,
-  3335, 3398, 1202, 1265, 3155, 3218, 1837, 1900, 2845, 2908, 501,  564,  820,
-  883,  2027, 2090, 2720, 2783, 3277, 3340, 3336, 3399, 1520, 1583, 3032, 3095,
-  1393, 1456, 1647, 1710, 2970, 3033, 3094, 3157, 2406, 2469, 565,  628,  1075,
-  1138, 2343, 2406, 2469, 2532, 3217, 3280, 3337, 3400, 2280, 2343, 2532, 2595,
-  1964, 2027, 2783, 2846, 884,  947,  1266, 1329, 1774, 1837, 2908, 2971, 3156,
-  3219, 3278, 3341, 2217, 2280, 2595, 2658, 629,  692,  3338, 3401, 53,   53,
-  3392, 3392, 54,   117,  3393, 3456, 118,  181,  2154, 2217, 2658, 2721, 3394,
-  3457, 182,  245,  1139, 1202, 1901, 1964, 2846, 2909, 3218, 3281, 3395, 3458,
-  948,  1011, 1584, 1647, 3033, 3096, 3279, 3342, 693,  756,  1457, 1520, 3095,
-  3158, 3339, 3402, 246,  309,  3396, 3459, 1711, 1774, 2091, 2154, 2721, 2784,
-  2971, 3034, 310,  373,  1330, 1393, 3157, 3220, 3397, 3460, 374,  437,  3398,
-  3461, 757,  820,  3340, 3403, 1838, 1901, 2909, 2972, 1012, 1075, 2028, 2091,
-  2784, 2847, 3280, 3343, 1203, 1266, 3219, 3282, 438,  501,  2407, 2470, 2470,
-  2533, 3399, 3462, 2344, 2407, 2533, 2596, 1521, 1584, 2281, 2344, 2596, 2659,
-  3096, 3159, 821,  884,  3341, 3404, 502,  565,  1648, 1711, 3034, 3097, 3400,
-  3463, 1394, 1457, 3158, 3221, 1965, 2028, 2847, 2910, 2218, 2281, 2659, 2722,
-  1076, 1139, 1775, 1838, 2972, 3035, 3281, 3344, 566,  629,  3401, 3464, 1267,
-  1330, 3220, 3283, 885,  948,  2155, 2218, 2722, 2785, 3342, 3405, 630,  693,
-  1902, 1965, 2910, 2973, 3402, 3465, 54,   54,   2092, 2155, 2785, 2848, 3456,
-  3456, 55,   118,  1585, 1648, 3097, 3160, 3457, 3520, 1140, 1203, 3282, 3345,
-  119,  182,  1458, 1521, 3159, 3222, 3458, 3521, 1712, 1775, 3035, 3098, 183,
-  246,  949,  1012, 3343, 3406, 3459, 3522, 694,  757,  3403, 3466, 247,  310,
-  3460, 3523, 1331, 1394, 2471, 2534, 3221, 3284, 2408, 2471, 2534, 2597, 2029,
-  2092, 2848, 2911, 311,  374,  1839, 1902, 2345, 2408, 2597, 2660, 2973, 3036,
-  3461, 3524, 758,  821,  2282, 2345, 2660, 2723, 3404, 3467, 375,  438,  3462,
-  3525, 1013, 1076, 1204, 1267, 3283, 3346, 3344, 3407, 439,  502,  2219, 2282,
-  2723, 2786, 3463, 3526, 1522, 1585, 3160, 3223, 1649, 1712, 1966, 2029, 2911,
-  2974, 3098, 3161, 822,  885,  1395, 1458, 3222, 3285, 3405, 3468, 1776, 1839,
-  3036, 3099, 503,  566,  3464, 3527, 2156, 2219, 2786, 2849, 1077, 1140, 3345,
-  3408, 1268, 1331, 3284, 3347, 567,  630,  3465, 3528, 1903, 1966, 2974, 3037,
-  886,  949,  3406, 3469, 2093, 2156, 2849, 2912, 2472, 2535, 2535, 2598, 631,
-  694,  1586, 1649, 2409, 2472, 2598, 2661, 3161, 3224, 3466, 3529, 1459, 1522,
-  1713, 1776, 3099, 3162, 3223, 3286, 1141, 1204, 2346, 2409, 2661, 2724, 3346,
-  3409, 55,   55,   3520, 3520, 56,   119,  3521, 3584, 120,  183,  2030, 2093,
-  2912, 2975, 3522, 3585, 950,  1013, 3407, 3470, 184,  247,  1332, 1395, 1840,
-  1903, 2283, 2346, 2724, 2787, 3037, 3100, 3285, 3348, 3523, 3586, 695,  758,
-  3467, 3530, 248,  311,  3524, 3587, 312,  375,  2220, 2283, 2787, 2850, 3525,
-  3588, 759,  822,  3468, 3531, 1205, 1268, 1967, 2030, 2975, 3038, 3347, 3410,
-  376,  439,  1014, 1077, 3408, 3471, 3526, 3589, 1650, 1713, 3162, 3225, 1523,
-  1586, 3224, 3287, 2157, 2220, 2850, 2913, 440,  503,  1777, 1840, 3100, 3163,
-  3527, 3590, 1396, 1459, 3286, 3349, 823,  886,  3469, 3532, 504,  567,  2536,
-  2599, 3528, 3591, 2473, 2536, 2599, 2662, 1904, 1967, 3038, 3101, 1078, 1141,
-  2094, 2157, 2913, 2976, 3409, 3472, 2410, 2473, 2662, 2725, 1269, 1332, 3348,
-  3411, 568,  631,  3529, 3592, 2347, 2410, 2725, 2788, 887,  950,  3470, 3533,
-  1587, 1650, 3225, 3288, 1714, 1777, 3163, 3226, 2284, 2347, 2788, 2851, 1460,
-  1523, 2031, 2094, 2976, 3039, 3287, 3350, 632,  695,  3530, 3593, 1142, 1205,
-  3410, 3473, 1841, 1904, 3101, 3164, 56,   56,   3584, 3584, 57,   120,  951,
-  1014, 1333, 1396, 2221, 2284, 2851, 2914, 3349, 3412, 3471, 3534, 3585, 3648,
-  121,  184,  3586, 3649, 696,  759,  3531, 3594, 185,  248,  3587, 3650, 249,
-  312,  1968, 2031, 3039, 3102, 3588, 3651, 2158, 2221, 2914, 2977, 313,  376,
-  3589, 3652, 1206, 1269, 1651, 1714, 3226, 3289, 3411, 3474, 760,  823,  1524,
-  1587, 3288, 3351, 3532, 3595, 1015, 1078, 2537, 2600, 2600, 2663, 3472, 3535,
-  1778, 1841, 3164, 3227, 377,  440,  2474, 2537, 2663, 2726, 3590, 3653, 1397,
-  1460, 2411, 2474, 2726, 2789, 3350, 3413, 441,  504,  2095, 2158, 2977, 3040,
-  3591, 3654, 1905, 1968, 3102, 3165, 824,  887,  2348, 2411, 2789, 2852, 3533,
-  3596, 505,  568,  3592, 3655, 1079, 1142, 3473, 3536, 1270, 1333, 3412, 3475,
-  2285, 2348, 2852, 2915, 2032, 2095, 3040, 3103, 1588, 1651, 3289, 3352, 569,
-  632,  1715, 1778, 3227, 3290, 3593, 3656, 888,  951,  3534, 3597, 1461, 1524,
-  3351, 3414, 1842, 1905, 2222, 2285, 2915, 2978, 3165, 3228, 633,  696,  1143,
-  1206, 3474, 3537, 3594, 3657, 1334, 1397, 3413, 3476, 952,  1015, 3535, 3598,
-  1969, 2032, 2601, 2664, 3103, 3166, 57,   57,   2538, 2601, 2664, 2727, 3648,
-  3648, 58,   121,  2159, 2222, 2978, 3041, 3649, 3712, 122,  185,  3650, 3713,
-  697,  760,  2475, 2538, 2727, 2790, 3595, 3658, 186,  249,  3651, 3714, 250,
-  313,  1652, 1715, 2412, 2475, 2790, 2853, 3290, 3353, 3652, 3715, 1525, 1588,
-  1779, 1842, 3228, 3291, 3352, 3415, 1207, 1270, 3475, 3538, 314,  377,  3653,
-  3716, 1016, 1079, 3536, 3599, 761,  824,  2096, 2159, 3041, 3104, 3596, 3659,
-  2349, 2412, 2853, 2916, 378,  441,  1398, 1461, 1906, 1969, 3166, 3229, 3414,
-  3477, 3654, 3717, 2286, 2349, 2916, 2979, 442,  505,  3655, 3718, 825,  888,
-  3597, 3660, 1080, 1143, 1271, 1334, 2033, 2096, 3104, 3167, 3476, 3539, 3537,
-  3600, 506,  569,  3656, 3719, 1716, 1779, 3291, 3354, 1589, 1652, 2223, 2286,
-  2979, 3042, 3353, 3416, 1843, 1906, 3229, 3292, 570,  633,  889,  952,  1462,
-  1525, 2602, 2665, 2665, 2728, 3415, 3478, 3598, 3661, 3657, 3720, 2539, 2602,
-  2728, 2791, 2476, 2539, 2791, 2854, 1144, 1207, 2160, 2223, 3042, 3105, 3538,
-  3601, 1970, 2033, 3167, 3230, 634,  697,  3658, 3721, 1335, 1398, 3477, 3540,
-  2413, 2476, 2854, 2917, 953,  1016, 3599, 3662, 58,   58,   3712, 3712, 59,
-  122,  3713, 3776, 123,  186,  698,  761,  1653, 1716, 2350, 2413, 2917, 2980,
-  3354, 3417, 3659, 3722, 3714, 3777, 1780, 1843, 3292, 3355, 187,  250,  2097,
-  2160, 3105, 3168, 3715, 3778, 1526, 1589, 3416, 3479, 251,  314,  1208, 1271,
-  3539, 3602, 3716, 3779, 1907, 1970, 3230, 3293, 1017, 1080, 2287, 2350, 2980,
-  3043, 3600, 3663, 315,  378,  3717, 3780, 762,  825,  3660, 3723, 1399, 1462,
-  3478, 3541, 379,  442,  3718, 3781, 2034, 2097, 3168, 3231, 2666, 2729, 2224,
-  2287, 3043, 3106, 443,  506,  2603, 2666, 2729, 2792, 3719, 3782, 826,  889,
-  3661, 3724, 1272, 1335, 2540, 2603, 2792, 2855, 3540, 3603, 1081, 1144, 1717,
-  1780, 3355, 3418, 3601, 3664, 1590, 1653, 3417, 3480, 507,  570,  1844, 1907,
-  3293, 3356, 3720, 3783, 2477, 2540, 2855, 2918, 1463, 1526, 3479, 3542, 2161,
-  2224, 3106, 3169, 890,  953,  2414, 2477, 2918, 2981, 3662, 3725, 571,  634,
-  1971, 2034, 3231, 3294, 3721, 3784, 1145, 1208, 3602, 3665, 1336, 1399, 3541,
-  3604, 2351, 2414, 2981, 3044, 635,  698,  3722, 3785, 954,  1017, 2098, 2161,
-  3169, 3232, 3663, 3726, 1654, 1717, 3418, 3481, 1781, 1844, 3356, 3419, 59,
-  59,   2288, 2351, 3044, 3107, 3776, 3776, 60,   123,  1527, 1590, 3480, 3543,
-  3777, 3840, 699,  762,  3723, 3786, 124,  187,  1908, 1971, 3294, 3357, 3778,
-  3841, 188,  251,  3779, 3842, 1209, 1272, 3603, 3666, 2667, 2730, 2730, 2793,
-  252,  315,  3780, 3843, 2604, 2667, 2793, 2856, 1018, 1081, 1400, 1463, 3542,
-  3605, 3664, 3727, 316,  379,  763,  826,  2035, 2098, 2541, 2604, 2856, 2919,
-  3232, 3295, 3724, 3787, 3781, 3844, 2225, 2288, 3107, 3170, 380,  443,  3782,
-  3845, 2478, 2541, 2919, 2982, 1718, 1781, 3419, 3482, 444,  507,  1273, 1336,
-  3604, 3667, 3783, 3846, 827,  890,  1591, 1654, 1845, 1908, 3357, 3420, 3481,
-  3544, 3725, 3788, 1082, 1145, 2415, 2478, 2982, 3045, 3665, 3728, 2162, 2225,
-  3170, 3233, 508,  571,  3784, 3847, 1464, 1527, 1972, 2035, 3295, 3358, 3543,
-  3606, 2352, 2415, 3045, 3108, 891,  954,  3726, 3789, 572,  635,  3785, 3848,
-  1146, 1209, 3666, 3729, 1337, 1400, 2099, 2162, 3233, 3296, 3605, 3668, 2289,
-  2352, 3108, 3171, 2731, 2794, 636,  699,  1782, 1845, 2668, 2731, 2794, 2857,
-  3420, 3483, 3786, 3849, 1655, 1718, 3482, 3545, 955,  1018, 2605, 2668, 2857,
-  2920, 3727, 3790, 1909, 1972, 3358, 3421, 1528, 1591, 3544, 3607, 2542, 2605,
-  2920, 2983, 60,   60,   700,  763,  3787, 3850, 3840, 3840, 61,   124,  3841,
-  3904, 125,  188,  1210, 1273, 2226, 2289, 3171, 3234, 3667, 3730, 3842, 3905,
-  2036, 2099, 3296, 3359, 189,  252,  2479, 2542, 2983, 3046, 3843, 3906, 1401,
-  1464, 3606, 3669, 253,  316,  1019, 1082, 3728, 3791, 3844, 3907, 764,  827,
-  3788, 3851, 317,  380,  3845, 3908, 2416, 2479, 3046, 3109, 1719, 1782, 3483,
-  3546, 381,  444,  1846, 1909, 2163, 2226, 3234, 3297, 3421, 3484, 3846, 3909,
-  1592, 1655, 3545, 3608, 1274, 1337, 3668, 3731, 828,  891,  3789, 3852, 445,
-  508,  1083, 1146, 1973, 2036, 2353, 2416, 3109, 3172, 3359, 3422, 3729, 3792,
-  3847, 3910, 1465, 1528, 3607, 3670, 509,  572,  2732, 2795, 2795, 2858, 3848,
-  3911, 2669, 2732, 2858, 2921, 2100, 2163, 3297, 3360, 892,  955,  2290, 2353,
-  3172, 3235, 3790, 3853, 2606, 2669, 2921, 2984, 573,  636,  3849, 3912, 1147,
-  1210, 1338, 1401, 3669, 3732, 3730, 3793, 1783, 1846, 2543, 2606, 2984, 3047,
-  3484, 3547, 1656, 1719, 3546, 3609, 1910, 1973, 3422, 3485, 637,  700,  3850,
-  3913, 956,  1019, 1529, 1592, 2480, 2543, 3047, 3110, 3608, 3671, 3791, 3854,
-  2227, 2290, 3235, 3298, 2037, 2100, 3360, 3423, 701,  764,  1211, 1274, 3731,
-  3794, 3851, 3914, 61,   61,   3904, 3904, 62,   125,  2417, 2480, 3110, 3173,
-  3905, 3968, 126,  189,  1402, 1465, 3670, 3733, 3906, 3969, 190,  253,  3907,
-  3970, 1020, 1083, 3792, 3855, 254,  317,  2164, 2227, 3298, 3361, 3908, 3971,
-  765,  828,  1720, 1783, 3547, 3610, 3852, 3915, 1847, 1910, 3485, 3548, 318,
-  381,  2354, 2417, 3173, 3236, 3909, 3972, 2796, 2859, 1593, 1656, 2733, 2796,
-  2859, 2922, 3609, 3672, 1974, 2037, 3423, 3486, 382,  445,  2670, 2733, 2922,
-  2985, 3910, 3973, 1275, 1338, 3732, 3795, 1084, 1147, 3793, 3856, 829,  892,
-  2607, 2670, 2985, 3048, 3853, 3916, 446,  509,  1466, 1529, 3671, 3734, 3911,
-  3974, 2291, 2354, 3236, 3299, 2101, 2164, 3361, 3424, 2544, 2607, 3048, 3111,
-  510,  573,  3912, 3975, 893,  956,  3854, 3917, 1784, 1847, 3548, 3611, 1339,
-  1402, 2481, 2544, 3111, 3174, 3733, 3796, 1148, 1211, 3794, 3857, 574,  637,
-  1657, 1720, 1911, 1974, 3486, 3549, 3610, 3673, 3913, 3976, 2228, 2291, 3299,
-  3362, 1530, 1593, 2038, 2101, 3424, 3487, 3672, 3735, 638,  701,  2418, 2481,
-  3174, 3237, 3914, 3977, 957,  1020, 3855, 3918, 1212, 1275, 2797, 2860, 2860,
-  2923, 3795, 3858, 702,  765,  1403, 1466, 2165, 2228, 2734, 2797, 2923, 2986,
-  3362, 3425, 3734, 3797, 3915, 3978, 62,   62,   3968, 3968, 63,   126,  2355,
-  2418, 3237, 3300, 3969, 4032, 127,  190,  2671, 2734, 2986, 3049, 3970, 4033,
-  1021, 1084, 1848, 1911, 3549, 3612, 3856, 3919, 191,  254,  1721, 1784, 3611,
-  3674, 3971, 4034, 255,  318,  2608, 2671, 3049, 3112, 3972, 4035, 1975, 2038,
-  3487, 3550, 766,  829,  3916, 3979, 1594, 1657, 3673, 3736, 319,  382,  3973,
-  4036, 1276, 1339, 2292, 2355, 3300, 3363, 3796, 3859, 2545, 2608, 3112, 3175,
-  383,  446,  2102, 2165, 3425, 3488, 3974, 4037, 1085, 1148, 1467, 1530, 3735,
-  3798, 3857, 3920, 830,  893,  3917, 3980, 447,  510,  3975, 4038, 2482, 2545,
-  3175, 3238, 511,  574,  1785, 1848, 3612, 3675, 3976, 4039, 2229, 2292, 3363,
-  3426, 1912, 1975, 3550, 3613, 894,  957,  1658, 1721, 3674, 3737, 3918, 3981,
-  1340, 1403, 3797, 3860, 1149, 1212, 2419, 2482, 3238, 3301, 3858, 3921, 2039,
-  2102, 3488, 3551, 575,  638,  2861, 2924, 3977, 4040, 2798, 2861, 2924, 2987,
-  1531, 1594, 3736, 3799, 2735, 2798, 2987, 3050, 2672, 2735, 3050, 3113, 639,
-  702,  958,  1021, 3919, 3982, 3978, 4041, 2166, 2229, 3426, 3489, 2356, 2419,
-  3301, 3364, 1213, 1276, 2609, 2672, 3113, 3176, 3859, 3922, 1404, 1467, 3798,
-  3861, 703,  766,  1849, 1912, 3613, 3676, 3979, 4042, 1722, 1785, 3675, 3738,
-  1976, 2039, 3551, 3614, 1022, 1085, 2546, 2609, 3176, 3239, 3920, 3983, 2293,
-  2356, 3364, 3427, 1595, 1658, 3737, 3800, 767,  830,  3980, 4043, 2103, 2166,
-  3489, 3552, 1277, 1340, 3860, 3923, 2483, 2546, 3239, 3302, 1468, 1531, 3799,
-  3862, 1086, 1149, 3921, 3984, 831,  894,  3981, 4044, 2230, 2293, 2862, 2925,
-  2925, 2988, 3427, 3490, 2799, 2862, 2988, 3051, 1786, 1849, 2420, 2483, 3302,
-  3365, 3676, 3739, 1913, 1976, 3614, 3677, 2736, 2799, 3051, 3114, 1659, 1722,
-  3738, 3801, 2040, 2103, 3552, 3615, 1341, 1404, 3861, 3924, 895,  958,  2673,
-  2736, 3114, 3177, 3982, 4045, 1150, 1213, 3922, 3985, 1532, 1595, 3800, 3863,
-  2357, 2420, 3365, 3428, 2167, 2230, 2610, 2673, 3177, 3240, 3490, 3553, 959,
-  1022, 3983, 4046, 2547, 2610, 3240, 3303, 1214, 1277, 1405, 1468, 1850, 1913,
-  3677, 3740, 3862, 3925, 3923, 3986, 1723, 1786, 1977, 2040, 3615, 3678, 3739,
-  3802, 2294, 2357, 3428, 3491, 1023, 1086, 1596, 1659, 2104, 2167, 2484, 2547,
-  3303, 3366, 3553, 3616, 3801, 3864, 3984, 4047, 2926, 2989, 2863, 2926, 2989,
-  3052, 2800, 2863, 3052, 3115, 1278, 1341, 3924, 3987, 1469, 1532, 2231, 2294,
-  2737, 2800, 3115, 3178, 3491, 3554, 3863, 3926, 2421, 2484, 3366, 3429, 1087,
-  1150, 3985, 4048, 1914, 1977, 2674, 2737, 3178, 3241, 3678, 3741, 1787, 1850,
-  3740, 3803, 2041, 2104, 3616, 3679, 1660, 1723, 3802, 3865, 2611, 2674, 3241,
-  3304, 1342, 1405, 2358, 2421, 3429, 3492, 3925, 3988, 2168, 2231, 3554, 3617,
-  1151, 1214, 3986, 4049, 1533, 1596, 3864, 3927, 2548, 2611, 3304, 3367, 2295,
-  2358, 3492, 3555, 1851, 1914, 3741, 3804, 1978, 2041, 2927, 2990, 2990, 3053,
-  3679, 3742, 1406, 1469, 3926, 3989, 1724, 1787, 2864, 2927, 3053, 3116, 3803,
-  3866, 1215, 1278, 2485, 2548, 3367, 3430, 3987, 4050, 2801, 2864, 3116, 3179,
-  2105, 2168, 3617, 3680, 1597, 1660, 3865, 3928, 2738, 2801, 3179, 3242, 2422,
-  2485, 3430, 3493, 2232, 2295, 3555, 3618, 2675, 2738, 3242, 3305, 1279, 1342,
-  3988, 4051, 1470, 1533, 3927, 3990, 1915, 1978, 3742, 3805, 1788, 1851, 3804,
-  3867, 2612, 2675, 3305, 3368, 2042, 2105, 3680, 3743, 2359, 2422, 3493, 3556,
-  1661, 1724, 3866, 3929, 2169, 2232, 3618, 3681, 2549, 2612, 3368, 3431, 1343,
-  1406, 3989, 4052, 2991, 3054, 1534, 1597, 2928, 2991, 3054, 3117, 3928, 3991,
-  2865, 2928, 3117, 3180, 2296, 2359, 3556, 3619, 2802, 2865, 3180, 3243, 2486,
-  2549, 3431, 3494, 1852, 1915, 3805, 3868, 1979, 2042, 3743, 3806, 1725, 1788,
-  2739, 2802, 3243, 3306, 3867, 3930, 1407, 1470, 2106, 2169, 3681, 3744, 3990,
-  4053, 2676, 2739, 3306, 3369, 1598, 1661, 2423, 2486, 3494, 3557, 3929, 3992,
-  2233, 2296, 3619, 3682, 2613, 2676, 3369, 3432, 1471, 1534, 3991, 4054, 1916,
-  1979, 3806, 3869, 1789, 1852, 2043, 2106, 2360, 2423, 3557, 3620, 3744, 3807,
-  3868, 3931, 2992, 3055, 3055, 3118, 2550, 2613, 3432, 3495, 2929, 2992, 3118,
-  3181, 1662, 1725, 2170, 2233, 3682, 3745, 3930, 3993, 2866, 2929, 3181, 3244,
-  2803, 2866, 3244, 3307, 1535, 1598, 2297, 2360, 3620, 3683, 3992, 4055, 2487,
-  2550, 3495, 3558, 2740, 2803, 3307, 3370, 1980, 2043, 3807, 3870, 1853, 1916,
-  3869, 3932, 2107, 2170, 3745, 3808, 1726, 1789, 2677, 2740, 3370, 3433, 3931,
-  3994, 2424, 2487, 3558, 3621, 2234, 2297, 3683, 3746, 1599, 1662, 3993, 4056,
-  2614, 2677, 3433, 3496, 3056, 3119, 2993, 3056, 3119, 3182, 2930, 2993, 3182,
-  3245, 2361, 2424, 3621, 3684, 1917, 1980, 3870, 3933, 2044, 2107, 3808, 3871,
-  2551, 2614, 3496, 3559, 2867, 2930, 3245, 3308, 1790, 1853, 3932, 3995, 2171,
-  2234, 3746, 3809, 2804, 2867, 3308, 3371, 1663, 1726, 3994, 4057, 2488, 2551,
-  3559, 3622, 2741, 2804, 3371, 3434, 2298, 2361, 3684, 3747, 2678, 2741, 3434,
-  3497, 1981, 2044, 3871, 3934, 1854, 1917, 3933, 3996, 2108, 2171, 3809, 3872,
-  2425, 2488, 3622, 3685, 1727, 1790, 3995, 4058, 3057, 3120, 3120, 3183, 2235,
-  2298, 2615, 2678, 3497, 3560, 3747, 3810, 2994, 3057, 3183, 3246, 2931, 2994,
-  3246, 3309, 2868, 2931, 3309, 3372, 2362, 2425, 3685, 3748, 2552, 2615, 3560,
-  3623, 1918, 1981, 3934, 3997, 2045, 2108, 2805, 2868, 3372, 3435, 3872, 3935,
-  1791, 1854, 3996, 4059, 2172, 2235, 3810, 3873, 2742, 2805, 3435, 3498, 2489,
-  2552, 3623, 3686, 2299, 2362, 3748, 3811, 2679, 2742, 3498, 3561, 3121, 3184,
-  3058, 3121, 3184, 3247, 1982, 2045, 3935, 3998, 2426, 2489, 3686, 3749, 1855,
-  1918, 2109, 2172, 2995, 3058, 3247, 3310, 3873, 3936, 3997, 4060, 2616, 2679,
-  3561, 3624, 2932, 2995, 3310, 3373, 2236, 2299, 3811, 3874, 2869, 2932, 3373,
-  3436, 2553, 2616, 3624, 3687, 2363, 2426, 3749, 3812, 2806, 2869, 3436, 3499,
-  2046, 2109, 3936, 3999, 1919, 1982, 3998, 4061, 2743, 2806, 3499, 3562, 2173,
-  2236, 3874, 3937, 2490, 2553, 3687, 3750, 2300, 2363, 3812, 3875, 2680, 2743,
-  3562, 3625, 3122, 3185, 3185, 3248, 3059, 3122, 3248, 3311, 2996, 3059, 3311,
-  3374, 2427, 2490, 2933, 2996, 3374, 3437, 3750, 3813, 1983, 2046, 2617, 2680,
-  3625, 3688, 3999, 4062, 2110, 2173, 3937, 4000, 2870, 2933, 3437, 3500, 2237,
-  2300, 3875, 3938, 2807, 2870, 3500, 3563, 2554, 2617, 3688, 3751, 2364, 2427,
-  3813, 3876, 2744, 2807, 3563, 3626, 2047, 2110, 4000, 4063, 2174, 2237, 3186,
-  3249, 3938, 4001, 2491, 2554, 3123, 3186, 3249, 3312, 3751, 3814, 3060, 3123,
-  3312, 3375, 2681, 2744, 3626, 3689, 2301, 2364, 3876, 3939, 2997, 3060, 3375,
-  3438, 2934, 2997, 3438, 3501, 2428, 2491, 3814, 3877, 2618, 2681, 3689, 3752,
-  2871, 2934, 3501, 3564, 2111, 2174, 4001, 4064, 2238, 2301, 3939, 4002, 2808,
-  2871, 3564, 3627, 2555, 2618, 3752, 3815, 2365, 2428, 3877, 3940, 2745, 2808,
-  3627, 3690, 3187, 3250, 3250, 3313, 3124, 3187, 3313, 3376, 3061, 3124, 3376,
-  3439, 2492, 2555, 3815, 3878, 2175, 2238, 2998, 3061, 3439, 3502, 4002, 4065,
-  2682, 2745, 3690, 3753, 2302, 2365, 3940, 4003, 2935, 2998, 3502, 3565, 2872,
-  2935, 3565, 3628, 2619, 2682, 3753, 3816, 2429, 2492, 3878, 3941, 2809, 2872,
-  3628, 3691, 2239, 2302, 4003, 4066, 2556, 2619, 3816, 3879, 3251, 3314, 3188,
-  3251, 3314, 3377, 3125, 3188, 3377, 3440, 2366, 2429, 2746, 2809, 3691, 3754,
-  3941, 4004, 3062, 3125, 3440, 3503, 2999, 3062, 3503, 3566, 2493, 2556, 3879,
-  3942, 2683, 2746, 3754, 3817, 2936, 2999, 3566, 3629, 2303, 2366, 4004, 4067,
-  2873, 2936, 3629, 3692, 2620, 2683, 3817, 3880, 2430, 2493, 3942, 4005, 2810,
-  2873, 3692, 3755, 3252, 3315, 3315, 3378, 3189, 3252, 3378, 3441, 3126, 3189,
-  3441, 3504, 2557, 2620, 3880, 3943, 3063, 3126, 3504, 3567, 2747, 2810, 3755,
-  3818, 2367, 2430, 4005, 4068, 3000, 3063, 3567, 3630, 2684, 2747, 3818, 3881,
-  2494, 2557, 2937, 3000, 3630, 3693, 3943, 4006, 2874, 2937, 3693, 3756, 2621,
-  2684, 3881, 3944, 3316, 3379, 3253, 3316, 3379, 3442, 2431, 2494, 4006, 4069,
-  3190, 3253, 3442, 3505, 2811, 2874, 3756, 3819, 3127, 3190, 3505, 3568, 3064,
-  3127, 3568, 3631, 2558, 2621, 3944, 4007, 2748, 2811, 3819, 3882, 3001, 3064,
-  3631, 3694, 2938, 3001, 3694, 3757, 2685, 2748, 3882, 3945, 2495, 2558, 4007,
-  4070, 2875, 2938, 3757, 3820, 3317, 3380, 3380, 3443, 3254, 3317, 3443, 3506,
-  2622, 2685, 3191, 3254, 3506, 3569, 3945, 4008, 2812, 2875, 3820, 3883, 3128,
-  3191, 3569, 3632, 3065, 3128, 3632, 3695, 2559, 2622, 4008, 4071, 2749, 2812,
-  3883, 3946, 3002, 3065, 3695, 3758, 2939, 3002, 3758, 3821, 2686, 2749, 3946,
-  4009, 3381, 3444, 3318, 3381, 3444, 3507, 2876, 2939, 3821, 3884, 3255, 3318,
-  3507, 3570, 3192, 3255, 3570, 3633, 2623, 2686, 3129, 3192, 3633, 3696, 4009,
-  4072, 2813, 2876, 3884, 3947, 3066, 3129, 3696, 3759, 3003, 3066, 3759, 3822,
-  2750, 2813, 3947, 4010, 2940, 3003, 3822, 3885, 3382, 3445, 3445, 3508, 3319,
-  3382, 3508, 3571, 2687, 2750, 4010, 4073, 3256, 3319, 3571, 3634, 2877, 2940,
-  3885, 3948, 3193, 3256, 3634, 3697, 3130, 3193, 3697, 3760, 2814, 2877, 3948,
-  4011, 3067, 3130, 3760, 3823, 3004, 3067, 3823, 3886, 2751, 2814, 4011, 4074,
-  3446, 3509, 3383, 3446, 3509, 3572, 2941, 3004, 3886, 3949, 3320, 3383, 3572,
-  3635, 3257, 3320, 3635, 3698, 3194, 3257, 3698, 3761, 2878, 2941, 3949, 4012,
-  3131, 3194, 3761, 3824, 3068, 3131, 3824, 3887, 2815, 2878, 4012, 4075, 3005,
-  3068, 3887, 3950, 3447, 3510, 3510, 3573, 3384, 3447, 3573, 3636, 3321, 3384,
-  3636, 3699, 2942, 3005, 3950, 4013, 3258, 3321, 3699, 3762, 3195, 3258, 3762,
-  3825, 2879, 2942, 4013, 4076, 3132, 3195, 3825, 3888, 3069, 3132, 3888, 3951,
-  3511, 3574, 3448, 3511, 3574, 3637, 3006, 3069, 3951, 4014, 3385, 3448, 3637,
-  3700, 3322, 3385, 3700, 3763, 3259, 3322, 3763, 3826, 2943, 3006, 4014, 4077,
-  3196, 3259, 3826, 3889, 3133, 3196, 3889, 3952, 3070, 3133, 3952, 4015, 3512,
-  3575, 3575, 3638, 3449, 3512, 3638, 3701, 3386, 3449, 3701, 3764, 3007, 3070,
-  4015, 4078, 3323, 3386, 3764, 3827, 3260, 3323, 3827, 3890, 3197, 3260, 3890,
-  3953, 3134, 3197, 3953, 4016, 3576, 3639, 3071, 3134, 4016, 4079, 3513, 3576,
-  3639, 3702, 3450, 3513, 3702, 3765, 3387, 3450, 3765, 3828, 3324, 3387, 3828,
-  3891, 3261, 3324, 3891, 3954, 3198, 3261, 3954, 4017, 3135, 3198, 4017, 4080,
-  3577, 3640, 3640, 3703, 3514, 3577, 3703, 3766, 3451, 3514, 3766, 3829, 3388,
-  3451, 3829, 3892, 3325, 3388, 3892, 3955, 3262, 3325, 3955, 4018, 3199, 3262,
-  4018, 4081, 3641, 3704, 3578, 3641, 3704, 3767, 3515, 3578, 3767, 3830, 3452,
-  3515, 3830, 3893, 3389, 3452, 3893, 3956, 3326, 3389, 3956, 4019, 3263, 3326,
-  4019, 4082, 3642, 3705, 3705, 3768, 3579, 3642, 3768, 3831, 3516, 3579, 3831,
-  3894, 3453, 3516, 3894, 3957, 3390, 3453, 3957, 4020, 3327, 3390, 4020, 4083,
-  3706, 3769, 3643, 3706, 3769, 3832, 3580, 3643, 3832, 3895, 3517, 3580, 3895,
-  3958, 3454, 3517, 3958, 4021, 3391, 3454, 4021, 4084, 3707, 3770, 3770, 3833,
-  3644, 3707, 3833, 3896, 3581, 3644, 3896, 3959, 3518, 3581, 3959, 4022, 3455,
-  3518, 4022, 4085, 3771, 3834, 3708, 3771, 3834, 3897, 3645, 3708, 3897, 3960,
-  3582, 3645, 3960, 4023, 3519, 3582, 4023, 4086, 3772, 3835, 3835, 3898, 3709,
-  3772, 3898, 3961, 3646, 3709, 3961, 4024, 3583, 3646, 4024, 4087, 3836, 3899,
-  3773, 3836, 3899, 3962, 3710, 3773, 3962, 4025, 3647, 3710, 4025, 4088, 3837,
-  3900, 3900, 3963, 3774, 3837, 3963, 4026, 3711, 3774, 4026, 4089, 3901, 3964,
-  3838, 3901, 3964, 4027, 3775, 3838, 4027, 4090, 3902, 3965, 3965, 4028, 3839,
-  3902, 4028, 4091, 3966, 4029, 3903, 3966, 4029, 4092, 3967, 4030, 4030, 4093,
-  4031, 4094, 0,    0,
-};
-#endif  // CONFIG_TX64X64
-
-#if CONFIG_CHROMA_2X2
-DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_2x2[4]) = { 0, 1, 2,
-                                                                        3 };
-#endif
-
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = {
-  0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
+  0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = {
   0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
 };
@@ -5228,19 +2387,10 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = {
 DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x4[16]) = {
   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
 };
-#endif  // CONFIG_EXT_TX
-
-DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_4x4[16]) = {
-  0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_row_iscan_4x4[16]) = {
-  0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
-};
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = {
-  0,  1,  4,  9,  2,  3,  6,  11, 5,  7,  8,  13, 10, 12, 14, 17,
-  15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  0,  1,  3,  6,  2,  4,  7,  10, 5,  8,  11, 14, 9,  12, 15, 18,
+  13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 29, 25, 28, 30, 31,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = {
@@ -5254,8 +2404,8 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x8[32]) = {
 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = {
-  0, 1, 4, 9,  15, 19, 24, 28, 2,  3,  6,  11, 16, 21, 25, 29,
-  5, 7, 8, 13, 18, 22, 26, 30, 10, 12, 14, 17, 20, 23, 27, 31,
+  0, 2, 5,  9,  13, 17, 21, 25, 1, 4,  8,  12, 16, 20, 24, 28,
+  3, 7, 11, 15, 19, 23, 27, 30, 6, 10, 14, 18, 22, 26, 29, 31,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = {
@@ -5269,20 +2419,19 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = {
 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x16[64]) = {
-  0,  1,  4,  9,  2,  3,  6,  11, 5,  7,  8,  13, 10, 12, 14, 17,
-  15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  0,  1,  3,  6,  2,  4,  7,  10, 5,  8,  11, 14, 9,  12, 15, 18,
+  13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 30, 25, 28, 31, 34,
+  29, 32, 35, 38, 33, 36, 39, 42, 37, 40, 43, 46, 41, 44, 47, 50,
+  45, 48, 51, 54, 49, 52, 55, 58, 53, 56, 59, 61, 57, 60, 62, 63,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x4[64]) = {
-  0,  1,  4,  9,  15, 19, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
-  2,  3,  6,  11, 16, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
-  5,  7,  8,  13, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
-  10, 12, 14, 17, 20, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+  0, 2,  5,  9,  13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57,
+  1, 4,  8,  12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+  3, 7,  11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 62,
+  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 61, 63,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x16[64]) = {
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
@@ -5310,7 +2459,6 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x4[64]) = {
   2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
   3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
 };
-#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = {
   0,   1,   3,   6,   10,  15,  21,  28,  2,   4,   7,   11,  16,  22,  29,
@@ -5330,30 +2478,30 @@ DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = {
   201, 208, 215, 222, 229, 235, 195, 202, 209, 216, 223, 230, 236, 241, 203,
   210, 217, 224, 231, 237, 242, 246, 211, 218, 225, 232, 238, 243, 247, 250,
   219, 226, 233, 239, 244, 248, 251, 253, 227, 234, 240, 245, 249, 252, 254,
+  255,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x8[256]) = {
-  0,   1,   3,   6,   10,  15,  21,  28,  36,  44,  52,  60,  68,  76,  84,
-  92,  100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204,
-  212, 220, 2,   4,   7,   11,  16,  22,  29,  37,  45,  53,  61,  69,  77,
-  85,  93,  101, 109, 117, 125, 133, 141, 149, 157, 165, 173, 181, 189, 197,
-  205, 213, 221, 228, 5,   8,   12,  17,  23,  30,  38,  46,  54,  62,  70,
-  78,  86,  94,  102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190,
-  198, 206, 214, 222, 229, 235, 9,   13,  18,  24,  31,  39,  47,  55,  63,
-  71,  79,  87,  95,  103, 111, 119, 127, 135, 143, 151, 159, 167, 175, 183,
-  191, 199, 207, 215, 223, 230, 236, 241, 14,  19,  25,  32,  40,  48,  56,
-  64,  72,  80,  88,  96,  104, 112, 120, 128, 136, 144, 152, 160, 168, 176,
-  184, 192, 200, 208, 216, 224, 231, 237, 242, 246, 20,  26,  33,  41,  49,
-  57,  65,  73,  81,  89,  97,  105, 113, 121, 129, 137, 145, 153, 161, 169,
-  177, 185, 193, 201, 209, 217, 225, 232, 238, 243, 247, 250, 27,  34,  42,
-  50,  58,  66,  74,  82,  90,  98,  106, 114, 122, 130, 138, 146, 154, 162,
-  170, 178, 186, 194, 202, 210, 218, 226, 233, 239, 244, 248, 251, 253, 35,
-  43,  51,  59,  67,  75,  83,  91,  99,  107, 115, 123, 131, 139, 147, 155,
-  163, 171, 179, 187, 195, 203, 211, 219, 227, 234, 240, 245, 249, 252, 254,
+  0,   2,   5,   9,   14,  20,  27,  35,  43,  51,  59,  67,  75,  83,  91,
+  99,  107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211,
+  219, 227, 1,   4,   8,   13,  19,  26,  34,  42,  50,  58,  66,  74,  82,
+  90,  98,  106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+  210, 218, 226, 234, 3,   7,   12,  18,  25,  33,  41,  49,  57,  65,  73,
+  81,  89,  97,  105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193,
+  201, 209, 217, 225, 233, 240, 6,   11,  17,  24,  32,  40,  48,  56,  64,
+  72,  80,  88,  96,  104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184,
+  192, 200, 208, 216, 224, 232, 239, 245, 10,  16,  23,  31,  39,  47,  55,
+  63,  71,  79,  87,  95,  103, 111, 119, 127, 135, 143, 151, 159, 167, 175,
+  183, 191, 199, 207, 215, 223, 231, 238, 244, 249, 15,  22,  30,  38,  46,
+  54,  62,  70,  78,  86,  94,  102, 110, 118, 126, 134, 142, 150, 158, 166,
+  174, 182, 190, 198, 206, 214, 222, 230, 237, 243, 248, 252, 21,  29,  37,
+  45,  53,  61,  69,  77,  85,  93,  101, 109, 117, 125, 133, 141, 149, 157,
+  165, 173, 181, 189, 197, 205, 213, 221, 229, 236, 242, 247, 251, 254, 28,
+  36,  44,  52,  60,  68,  76,  84,  92,  100, 108, 116, 124, 132, 140, 148,
+  156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 235, 241, 246, 250, 253,
   255,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x32[256]) = {
   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,
   15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
@@ -5435,9 +2583,7 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x8[256]) = {
   135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247,
   255,
 };
-#endif  // CONFIG_EXT_TX
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = {
   0, 8,  16, 24, 32, 40, 48, 56, 1, 9,  17, 25, 33, 41, 49, 57,
   2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
@@ -5451,27 +2597,12 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = {
   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
 };
-#endif  // CONFIG_EXT_TX
-
-DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_8x8[64]) = {
-  0,  3,  8,  15, 22, 32, 40, 47, 1,  5,  11, 18, 26, 34, 44, 51,
-  2,  7,  13, 20, 28, 38, 46, 54, 4,  10, 16, 24, 31, 41, 50, 56,
-  6,  12, 21, 27, 35, 43, 52, 58, 9,  17, 25, 33, 39, 48, 55, 60,
-  14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_row_iscan_8x8[64]) = {
-  0,  1,  2,  5,  8,  12, 19, 24, 3,  4,  7,  10, 15, 20, 30, 39,
-  6,  9,  13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52,
-  18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59,
-  32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63,
-};
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8[64]) = {
-  0,  2,  5,  9,  14, 22, 31, 37, 1,  4,  8,  13, 19, 26, 38, 44,
-  3,  6,  10, 17, 24, 30, 42, 49, 7,  11, 15, 21, 29, 36, 47, 53,
-  12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60,
-  25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
+  0,  1,  5,  6,  14, 15, 27, 28, 2,  4,  7,  13, 16, 26, 29, 42,
+  3,  8,  12, 17, 25, 30, 41, 43, 9,  11, 18, 24, 31, 40, 44, 53,
+  10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60,
+  21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63
 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = {
@@ -5486,14 +2617,14 @@ DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = {
 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = {
-  0,  1,  3,  6,  10, 15, 21, 28, 36, 44,  52,  60,  68,  76,  84,  92,
-  2,  4,  7,  11, 16, 22, 29, 37, 45, 53,  61,  69,  77,  85,  93,  100,
-  5,  8,  12, 17, 23, 30, 38, 46, 54, 62,  70,  78,  86,  94,  101, 107,
-  9,  13, 18, 24, 31, 39, 47, 55, 63, 71,  79,  87,  95,  102, 108, 113,
-  14, 19, 25, 32, 40, 48, 56, 64, 72, 80,  88,  96,  103, 109, 114, 118,
-  20, 26, 33, 41, 49, 57, 65, 73, 81, 89,  97,  104, 110, 115, 119, 122,
-  27, 34, 42, 50, 58, 66, 74, 82, 90, 98,  105, 111, 116, 120, 123, 125,
-  35, 43, 51, 59, 67, 75, 83, 91, 99, 106, 112, 117, 121, 124, 126, 127,
+  0,  2,  5,  9,  14, 20, 27, 35, 43, 51,  59,  67,  75,  83,  91,  99,
+  1,  4,  8,  13, 19, 26, 34, 42, 50, 58,  66,  74,  82,  90,  98,  106,
+  3,  7,  12, 18, 25, 33, 41, 49, 57, 65,  73,  81,  89,  97,  105, 112,
+  6,  11, 17, 24, 32, 40, 48, 56, 64, 72,  80,  88,  96,  104, 111, 117,
+  10, 16, 23, 31, 39, 47, 55, 63, 71, 79,  87,  95,  103, 110, 116, 121,
+  15, 22, 30, 38, 46, 54, 62, 70, 78, 86,  94,  102, 109, 115, 120, 124,
+  21, 29, 37, 45, 53, 61, 69, 77, 85, 93,  101, 108, 114, 119, 123, 126,
+  28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 107, 113, 118, 122, 125, 127,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = {
@@ -5581,41 +2712,41 @@ DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = {
 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = {
-  0,   1,   3,   6,   10,  15,  21,  28,  36,  45,  55,  66,  78,  91,  105,
-  120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344,
-  360, 376, 2,   4,   7,   11,  16,  22,  29,  37,  46,  56,  67,  79,  92,
-  106, 121, 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329,
-  345, 361, 377, 392, 5,   8,   12,  17,  23,  30,  38,  47,  57,  68,  80,
-  93,  107, 122, 138, 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314,
-  330, 346, 362, 378, 393, 407, 9,   13,  18,  24,  31,  39,  48,  58,  69,
-  81,  94,  108, 123, 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299,
-  315, 331, 347, 363, 379, 394, 408, 421, 14,  19,  25,  32,  40,  49,  59,
-  70,  82,  95,  109, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284,
-  300, 316, 332, 348, 364, 380, 395, 409, 422, 434, 20,  26,  33,  41,  50,
-  60,  71,  83,  96,  110, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269,
-  285, 301, 317, 333, 349, 365, 381, 396, 410, 423, 435, 446, 27,  34,  42,
-  51,  61,  72,  84,  97,  111, 126, 142, 158, 174, 190, 206, 222, 238, 254,
-  270, 286, 302, 318, 334, 350, 366, 382, 397, 411, 424, 436, 447, 457, 35,
-  43,  52,  62,  73,  85,  98,  112, 127, 143, 159, 175, 191, 207, 223, 239,
-  255, 271, 287, 303, 319, 335, 351, 367, 383, 398, 412, 425, 437, 448, 458,
-  467, 44,  53,  63,  74,  86,  99,  113, 128, 144, 160, 176, 192, 208, 224,
-  240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 399, 413, 426, 438, 449,
-  459, 468, 476, 54,  64,  75,  87,  100, 114, 129, 145, 161, 177, 193, 209,
-  225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 400, 414, 427, 439,
-  450, 460, 469, 477, 484, 65,  76,  88,  101, 115, 130, 146, 162, 178, 194,
-  210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 401, 415, 428,
-  440, 451, 461, 470, 478, 485, 491, 77,  89,  102, 116, 131, 147, 163, 179,
-  195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, 387, 402, 416,
-  429, 441, 452, 462, 471, 479, 486, 492, 497, 90,  103, 117, 132, 148, 164,
-  180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, 356, 372, 388, 403,
-  417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 104, 118, 133, 149,
-  165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, 341, 357, 373, 389,
-  404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506, 119, 134,
-  150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, 358, 374,
-  390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507, 509,
+  0,   2,   5,   9,   14,  20,  27,  35,  44,  54,  65,  77,  90,  104, 119,
   135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359,
-  375, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508,
-  510, 511,
+  375, 391, 1,   4,   8,   13,  19,  26,  34,  43,  53,  64,  76,  89,  103,
+  118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342,
+  358, 374, 390, 406, 3,   7,   12,  18,  25,  33,  42,  52,  63,  75,  88,
+  102, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325,
+  341, 357, 373, 389, 405, 420, 6,   11,  17,  24,  32,  41,  51,  62,  74,
+  87,  101, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308,
+  324, 340, 356, 372, 388, 404, 419, 433, 10,  16,  23,  31,  40,  50,  61,
+  73,  86,  100, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291,
+  307, 323, 339, 355, 371, 387, 403, 418, 432, 445, 15,  22,  30,  39,  49,
+  60,  72,  85,  99,  114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274,
+  290, 306, 322, 338, 354, 370, 386, 402, 417, 431, 444, 456, 21,  29,  38,
+  48,  59,  71,  84,  98,  113, 129, 145, 161, 177, 193, 209, 225, 241, 257,
+  273, 289, 305, 321, 337, 353, 369, 385, 401, 416, 430, 443, 455, 466, 28,
+  37,  47,  58,  70,  83,  97,  112, 128, 144, 160, 176, 192, 208, 224, 240,
+  256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 415, 429, 442, 454, 465,
+  475, 36,  46,  57,  69,  82,  96,  111, 127, 143, 159, 175, 191, 207, 223,
+  239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 414, 428, 441, 453,
+  464, 474, 483, 45,  56,  68,  81,  95,  110, 126, 142, 158, 174, 190, 206,
+  222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 413, 427, 440,
+  452, 463, 473, 482, 490, 55,  67,  80,  94,  109, 125, 141, 157, 173, 189,
+  205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 412, 426,
+  439, 451, 462, 472, 481, 489, 496, 66,  79,  93,  108, 124, 140, 156, 172,
+  188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 411,
+  425, 438, 450, 461, 471, 480, 488, 495, 501, 78,  92,  107, 123, 139, 155,
+  171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395,
+  410, 424, 437, 449, 460, 470, 479, 487, 494, 500, 505, 91,  106, 122, 138,
+  154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378,
+  394, 409, 423, 436, 448, 459, 469, 478, 486, 493, 499, 504, 508, 105, 121,
+  137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361,
+  377, 393, 408, 422, 435, 447, 458, 468, 477, 485, 492, 498, 503, 507, 510,
+  120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344,
+  360, 376, 392, 407, 421, 434, 446, 457, 467, 476, 484, 491, 497, 502, 506,
+  509, 511,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = {
@@ -5767,7 +2898,6 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = {
   510, 511,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = {
   0,  16, 32, 48, 64, 80, 96,  112, 128, 144, 160, 176, 192, 208, 224, 240,
   1,  17, 33, 49, 65, 81, 97,  113, 129, 145, 161, 177, 193, 209, 225, 241,
@@ -5807,70 +2937,28 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = {
   240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
   255,
 };
-#endif  // CONFIG_EXT_TX
-
-DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_16x16[256]) = {
-  0,  4,  11,  20,  31,  43,  59,  75,  85,  109, 130, 150, 165, 181, 195, 198,
-  1,  6,  14,  23,  34,  47,  64,  81,  95,  114, 135, 153, 171, 188, 201, 212,
-  2,  8,  16,  25,  38,  52,  67,  83,  101, 116, 136, 157, 172, 190, 205, 216,
-  3,  10, 18,  29,  41,  55,  71,  89,  103, 119, 141, 159, 176, 194, 208, 218,
-  5,  12, 21,  32,  45,  58,  74,  93,  104, 123, 144, 164, 179, 196, 210, 223,
-  7,  15, 26,  37,  49,  63,  78,  96,  112, 129, 146, 166, 182, 200, 215, 228,
-  9,  19, 28,  39,  54,  69,  86,  102, 117, 132, 151, 170, 187, 206, 220, 230,
-  13, 24, 35,  46,  60,  73,  91,  108, 122, 137, 154, 174, 189, 207, 224, 235,
-  17, 30, 40,  53,  66,  82,  98,  115, 126, 142, 161, 180, 197, 213, 227, 237,
-  22, 36, 48,  62,  76,  92,  105, 120, 133, 147, 167, 186, 203, 219, 232, 240,
-  27, 44, 56,  70,  84,  99,  113, 127, 140, 156, 175, 193, 209, 226, 236, 244,
-  33, 51, 68,  79,  94,  110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247,
-  42, 61, 77,  90,  106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251,
-  50, 72, 87,  100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253,
-  57, 80, 97,  111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254,
-  65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_row_iscan_16x16[256]) = {
-  0,   1,   2,   4,   6,   9,   12,  17,  22,  29,  36,  43,  54,  64,  76,
-  86,  3,   5,   7,   11,  15,  19,  25,  32,  38,  48,  59,  68,  84,  99,
-  115, 130, 8,   10,  13,  18,  23,  27,  33,  42,  51,  60,  72,  88,  103,
-  119, 142, 167, 14,  16,  20,  26,  31,  37,  44,  53,  61,  73,  85,  100,
-  116, 135, 161, 185, 21,  24,  30,  35,  40,  47,  55,  65,  74,  81,  94,
-  112, 133, 154, 179, 205, 28,  34,  39,  45,  50,  58,  67,  77,  87,  96,
-  106, 121, 146, 169, 196, 212, 41,  46,  49,  56,  63,  70,  79,  90,  98,
-  107, 122, 138, 159, 182, 207, 222, 52,  57,  62,  69,  75,  83,  93,  102,
-  110, 120, 134, 150, 176, 195, 215, 226, 66,  71,  78,  82,  91,  97,  108,
-  113, 127, 136, 148, 168, 188, 202, 221, 232, 80,  89,  92,  101, 105, 114,
-  125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95,  104, 109, 117, 123,
-  128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129,
-  140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137,
-  145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149,
-  156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152,
-  163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253,
-  158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254,
-  255,
-};
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x16[256]) = {
-  0,   2,   5,   9,   17,  24,  36,  44,  55,  72,  88,  104, 128, 143, 166,
-  179, 1,   4,   8,   13,  20,  30,  40,  54,  66,  79,  96,  113, 141, 154,
-  178, 196, 3,   7,   11,  18,  25,  33,  46,  57,  71,  86,  101, 119, 148,
-  164, 186, 201, 6,   12,  16,  23,  31,  39,  53,  64,  78,  92,  110, 127,
-  153, 169, 193, 208, 10,  14,  19,  28,  37,  47,  58,  67,  84,  98,  114,
-  133, 161, 176, 198, 214, 15,  21,  26,  34,  43,  52,  65,  77,  91,  106,
-  120, 140, 165, 185, 205, 221, 22,  27,  32,  41,  48,  60,  73,  85,  99,
-  116, 130, 151, 175, 190, 211, 225, 29,  35,  42,  49,  59,  69,  81,  95,
-  108, 125, 139, 155, 182, 197, 217, 229, 38,  45,  51,  61,  68,  80,  93,
-  105, 118, 134, 150, 168, 191, 207, 223, 234, 50,  56,  63,  74,  83,  94,
-  109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62,  70,  76,  87,  97,
-  107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75,  82,  90,  102,
-  112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89,  100, 111,
-  123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115,
-  126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121,
-  135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254,
-  137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253,
-  255,
+  0,   1,   5,   6,   14,  15,  27,  28,  44,  45,  65,  66,  90,  91,  119,
+  120, 2,   4,   7,   13,  16,  26,  29,  43,  46,  64,  67,  89,  92,  118,
+  121, 150, 3,   8,   12,  17,  25,  30,  42,  47,  63,  68,  88,  93,  117,
+  122, 149, 151, 9,   11,  18,  24,  31,  41,  48,  62,  69,  87,  94,  116,
+  123, 148, 152, 177, 10,  19,  23,  32,  40,  49,  61,  70,  86,  95,  115,
+  124, 147, 153, 176, 178, 20,  22,  33,  39,  50,  60,  71,  85,  96,  114,
+  125, 146, 154, 175, 179, 200, 21,  34,  38,  51,  59,  72,  84,  97,  113,
+  126, 145, 155, 174, 180, 199, 201, 35,  37,  52,  58,  73,  83,  98,  112,
+  127, 144, 156, 173, 181, 198, 202, 219, 36,  53,  57,  74,  82,  99,  111,
+  128, 143, 157, 172, 182, 197, 203, 218, 220, 54,  56,  75,  81,  100, 110,
+  129, 142, 158, 171, 183, 196, 204, 217, 221, 234, 55,  76,  80,  101, 109,
+  130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77,  79,  102, 108,
+  131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78,  103, 107,
+  132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106,
+  133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105,
+  134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253,
+  135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254,
+  255
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = {
   0,   32,   64,  96,   128, 160,  192, 224,  256, 288,  320, 352,  384, 416,
   448, 480,  512, 544,  576, 608,  640, 672,  704, 736,  768, 800,  832, 864,
@@ -6029,1423 +3117,118 @@ DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = {
   1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
   1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
 };
-#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = {
-  0,    2,    5,    10,   17,   25,   38,   47,   62,   83,   101,  121,  145,
-  170,  193,  204,  210,  219,  229,  233,  245,  257,  275,  299,  342,  356,
-  377,  405,  455,  471,  495,  527,  1,    4,    8,    15,   22,   30,   45,
-  58,   74,   92,   112,  133,  158,  184,  203,  215,  222,  228,  234,  237,
-  256,  274,  298,  317,  355,  376,  404,  426,  470,  494,  526,  551,  3,
-  7,    12,   18,   28,   36,   52,   64,   82,   102,  118,  142,  164,  189,
-  208,  217,  224,  231,  235,  238,  273,  297,  316,  329,  375,  403,  425,
-  440,  493,  525,  550,  567,  6,    11,   16,   23,   31,   43,   60,   73,
-  90,   109,  126,  150,  173,  196,  211,  220,  226,  232,  236,  239,  296,
-  315,  328,  335,  402,  424,  439,  447,  524,  549,  566,  575,  9,    14,
-  19,   29,   37,   50,   65,   78,   95,   116,  134,  157,  179,  201,  214,
-  223,  244,  255,  272,  295,  341,  354,  374,  401,  454,  469,  492,  523,
-  582,  596,  617,  645,  13,   20,   26,   35,   44,   54,   72,   85,   105,
-  123,  140,  163,  182,  205,  216,  225,  254,  271,  294,  314,  353,  373,
-  400,  423,  468,  491,  522,  548,  595,  616,  644,  666,  21,   27,   33,
-  42,   53,   63,   80,   94,   113,  132,  151,  172,  190,  209,  218,  227,
-  270,  293,  313,  327,  372,  399,  422,  438,  490,  521,  547,  565,  615,
-  643,  665,  680,  24,   32,   39,   48,   57,   71,   88,   104,  120,  139,
-  159,  178,  197,  212,  221,  230,  292,  312,  326,  334,  398,  421,  437,
-  446,  520,  546,  564,  574,  642,  664,  679,  687,  34,   40,   46,   56,
-  68,   81,   96,   111,  130,  147,  167,  186,  243,  253,  269,  291,  340,
-  352,  371,  397,  453,  467,  489,  519,  581,  594,  614,  641,  693,  705,
-  723,  747,  41,   49,   55,   67,   77,   91,   107,  124,  138,  161,  177,
-  194,  252,  268,  290,  311,  351,  370,  396,  420,  466,  488,  518,  545,
-  593,  613,  640,  663,  704,  722,  746,  765,  51,   59,   66,   76,   89,
-  99,   119,  131,  149,  168,  181,  200,  267,  289,  310,  325,  369,  395,
-  419,  436,  487,  517,  544,  563,  612,  639,  662,  678,  721,  745,  764,
-  777,  61,   69,   75,   87,   100,  114,  129,  144,  162,  180,  191,  207,
-  288,  309,  324,  333,  394,  418,  435,  445,  516,  543,  562,  573,  638,
-  661,  677,  686,  744,  763,  776,  783,  70,   79,   86,   97,   108,  122,
-  137,  155,  242,  251,  266,  287,  339,  350,  368,  393,  452,  465,  486,
-  515,  580,  592,  611,  637,  692,  703,  720,  743,  788,  798,  813,  833,
-  84,   93,   103,  110,  125,  141,  154,  171,  250,  265,  286,  308,  349,
-  367,  392,  417,  464,  485,  514,  542,  591,  610,  636,  660,  702,  719,
-  742,  762,  797,  812,  832,  848,  98,   106,  115,  127,  143,  156,  169,
-  185,  264,  285,  307,  323,  366,  391,  416,  434,  484,  513,  541,  561,
-  609,  635,  659,  676,  718,  741,  761,  775,  811,  831,  847,  858,  117,
-  128,  136,  148,  160,  175,  188,  198,  284,  306,  322,  332,  390,  415,
-  433,  444,  512,  540,  560,  572,  634,  658,  675,  685,  740,  760,  774,
-  782,  830,  846,  857,  863,  135,  146,  152,  165,  241,  249,  263,  283,
-  338,  348,  365,  389,  451,  463,  483,  511,  579,  590,  608,  633,  691,
-  701,  717,  739,  787,  796,  810,  829,  867,  875,  887,  903,  153,  166,
-  174,  183,  248,  262,  282,  305,  347,  364,  388,  414,  462,  482,  510,
-  539,  589,  607,  632,  657,  700,  716,  738,  759,  795,  809,  828,  845,
-  874,  886,  902,  915,  176,  187,  195,  202,  261,  281,  304,  321,  363,
-  387,  413,  432,  481,  509,  538,  559,  606,  631,  656,  674,  715,  737,
-  758,  773,  808,  827,  844,  856,  885,  901,  914,  923,  192,  199,  206,
-  213,  280,  303,  320,  331,  386,  412,  431,  443,  508,  537,  558,  571,
-  630,  655,  673,  684,  736,  757,  772,  781,  826,  843,  855,  862,  900,
-  913,  922,  927,  240,  247,  260,  279,  337,  346,  362,  385,  450,  461,
-  480,  507,  578,  588,  605,  629,  690,  699,  714,  735,  786,  794,  807,
-  825,  866,  873,  884,  899,  930,  936,  945,  957,  246,  259,  278,  302,
-  345,  361,  384,  411,  460,  479,  506,  536,  587,  604,  628,  654,  698,
-  713,  734,  756,  793,  806,  824,  842,  872,  883,  898,  912,  935,  944,
-  956,  966,  258,  277,  301,  319,  360,  383,  410,  430,  478,  505,  535,
-  557,  603,  627,  653,  672,  712,  733,  755,  771,  805,  823,  841,  854,
-  882,  897,  911,  921,  943,  955,  965,  972,  276,  300,  318,  330,  382,
-  409,  429,  442,  504,  534,  556,  570,  626,  652,  671,  683,  732,  754,
-  770,  780,  822,  840,  853,  861,  896,  910,  920,  926,  954,  964,  971,
-  975,  336,  344,  359,  381,  449,  459,  477,  503,  577,  586,  602,  625,
-  689,  697,  711,  731,  785,  792,  804,  821,  865,  871,  881,  895,  929,
-  934,  942,  953,  977,  981,  987,  995,  343,  358,  380,  408,  458,  476,
-  502,  533,  585,  601,  624,  651,  696,  710,  730,  753,  791,  803,  820,
-  839,  870,  880,  894,  909,  933,  941,  952,  963,  980,  986,  994,  1001,
-  357,  379,  407,  428,  475,  501,  532,  555,  600,  623,  650,  670,  709,
-  729,  752,  769,  802,  819,  838,  852,  879,  893,  908,  919,  940,  951,
-  962,  970,  985,  993,  1000, 1005, 378,  406,  427,  441,  500,  531,  554,
-  569,  622,  649,  669,  682,  728,  751,  768,  779,  818,  837,  851,  860,
-  892,  907,  918,  925,  950,  961,  969,  974,  992,  999,  1004, 1007, 448,
-  457,  474,  499,  576,  584,  599,  621,  688,  695,  708,  727,  784,  790,
-  801,  817,  864,  869,  878,  891,  928,  932,  939,  949,  976,  979,  984,
-  991,  1008, 1010, 1013, 1017, 456,  473,  498,  530,  583,  598,  620,  648,
-  694,  707,  726,  750,  789,  800,  816,  836,  868,  877,  890,  906,  931,
-  938,  948,  960,  978,  983,  990,  998,  1009, 1012, 1016, 1020, 472,  497,
-  529,  553,  597,  619,  647,  668,  706,  725,  749,  767,  799,  815,  835,
-  850,  876,  889,  905,  917,  937,  947,  959,  968,  982,  989,  997,  1003,
-  1011, 1015, 1019, 1022, 496,  528,  552,  568,  618,  646,  667,  681,  724,
-  748,  766,  778,  814,  834,  849,  859,  888,  904,  916,  924,  946,  958,
-  967,  973,  988,  996,  1002, 1006, 1014, 1018, 1021, 1023,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_v2_iscan_32x32[1024]) = {
-  0,    1,    4,    9,    15,   22,   33,   43,   56,   71,   86,   104,  121,
-  142,  166,  189,  512,  518,  527,  539,  551,  566,  584,  602,  621,  644,
-  668,  695,  721,  748,  780,  811,  2,    3,    6,    11,   17,   26,   35,
-  45,   58,   73,   90,   106,  123,  146,  168,  193,  513,  519,  528,  540,
-  553,  567,  585,  603,  622,  647,  670,  696,  722,  751,  783,  812,  5,
-  7,    8,    13,   20,   28,   37,   50,   62,   75,   92,   108,  129,  150,
-  170,  195,  514,  521,  530,  541,  554,  569,  587,  605,  625,  649,  671,
-  699,  725,  752,  785,  815,  10,   12,   14,   19,   23,   31,   41,   52,
-  65,   81,   96,   113,  133,  152,  175,  201,  515,  522,  531,  542,  556,
-  572,  589,  607,  629,  651,  673,  700,  726,  757,  788,  819,  16,   18,
-  21,   24,   30,   39,   48,   59,   69,   83,   100,  119,  137,  158,  181,
-  203,  516,  523,  534,  545,  559,  574,  591,  610,  632,  654,  679,  704,
-  730,  762,  791,  824,  25,   27,   29,   32,   40,   46,   54,   67,   79,
-  94,   109,  127,  143,  164,  185,  210,  517,  525,  535,  547,  561,  578,
-  595,  615,  635,  656,  684,  707,  737,  766,  793,  830,  34,   36,   38,
-  42,   49,   55,   64,   76,   87,   102,  117,  135,  154,  176,  197,  219,
-  520,  529,  538,  550,  565,  580,  598,  618,  639,  664,  687,  712,  741,
-  769,  802,  833,  44,   47,   51,   53,   60,   68,   77,   85,   98,   114,
-  131,  147,  162,  183,  208,  227,  524,  533,  544,  557,  571,  588,  606,
-  623,  645,  667,  692,  720,  747,  776,  806,  838,  57,   61,   63,   66,
-  70,   80,   88,   99,   112,  124,  140,  159,  179,  199,  216,  233,  526,
-  536,  548,  562,  577,  593,  613,  633,  653,  676,  701,  727,  756,  786,
-  814,  847,  72,   74,   78,   82,   84,   95,   103,  115,  125,  139,  156,
-  173,  190,  211,  229,  246,  532,  543,  555,  568,  581,  601,  619,  637,
-  663,  685,  709,  738,  763,  792,  826,  855,  89,   91,   93,   97,   101,
-  110,  118,  132,  141,  157,  171,  186,  206,  224,  241,  255,  537,  549,
-  560,  576,  592,  608,  628,  650,  669,  693,  719,  744,  773,  805,  834,
-  862,  105,  107,  111,  116,  120,  128,  136,  148,  160,  174,  187,  205,
-  221,  236,  251,  267,  546,  558,  570,  583,  600,  617,  636,  657,  680,
-  706,  729,  758,  787,  813,  846,  871,  122,  126,  130,  134,  138,  144,
-  155,  163,  180,  191,  207,  222,  232,  248,  264,  278,  552,  564,  579,
-  594,  609,  630,  648,  666,  688,  715,  742,  768,  797,  827,  856,  877,
-  145,  149,  151,  153,  161,  165,  177,  184,  200,  212,  225,  237,  249,
-  262,  275,  289,  563,  575,  590,  604,  620,  638,  660,  683,  705,  728,
-  753,  779,  809,  839,  866,  889,  167,  169,  172,  178,  182,  188,  198,
-  209,  217,  230,  242,  252,  265,  276,  288,  301,  573,  586,  599,  616,
-  634,  652,  672,  694,  716,  743,  767,  794,  825,  850,  874,  899,  192,
-  194,  196,  202,  204,  213,  220,  228,  234,  247,  256,  268,  279,  290,
-  302,  315,  582,  597,  614,  631,  646,  665,  686,  708,  732,  759,  784,
-  810,  837,  863,  886,  908,  214,  215,  218,  223,  226,  231,  239,  244,
-  253,  261,  271,  283,  292,  304,  317,  325,  596,  611,  626,  642,  661,
-  681,  702,  723,  745,  770,  800,  828,  853,  875,  897,  919,  235,  238,
-  240,  243,  245,  250,  257,  263,  270,  280,  287,  298,  307,  319,  329,
-  340,  612,  624,  640,  658,  677,  697,  717,  739,  764,  789,  816,  844,
-  867,  890,  909,  927,  254,  258,  259,  260,  266,  269,  272,  282,  286,
-  296,  303,  312,  323,  333,  341,  355,  627,  641,  655,  674,  690,  713,
-  735,  760,  781,  807,  835,  857,  880,  902,  921,  940,  273,  274,  277,
-  281,  284,  285,  291,  299,  305,  310,  320,  327,  337,  346,  357,  369,
-  643,  659,  675,  689,  710,  733,  754,  777,  803,  831,  851,  872,  892,
-  913,  934,  950,  293,  294,  295,  297,  300,  306,  308,  314,  321,  326,
-  335,  343,  352,  361,  372,  378,  662,  678,  691,  711,  731,  749,  774,
-  798,  822,  848,  869,  887,  906,  925,  942,  961,  309,  311,  313,  316,
-  318,  322,  324,  332,  338,  344,  351,  358,  367,  375,  386,  394,  682,
-  698,  714,  734,  750,  772,  795,  820,  842,  864,  884,  904,  923,  938,
-  954,  967,  328,  330,  331,  334,  336,  339,  342,  348,  354,  359,  366,
-  374,  382,  391,  400,  409,  703,  718,  736,  755,  775,  796,  818,  840,
-  860,  882,  900,  917,  936,  952,  965,  977,  345,  347,  349,  350,  353,
-  356,  360,  364,  371,  376,  383,  389,  395,  406,  412,  423,  724,  740,
-  761,  778,  799,  821,  841,  859,  878,  895,  915,  932,  948,  963,  975,
-  986,  362,  363,  365,  368,  370,  373,  377,  379,  387,  392,  397,  405,
-  411,  420,  428,  439,  746,  765,  782,  804,  823,  843,  861,  879,  894,
-  911,  930,  946,  959,  973,  984,  994,  380,  381,  384,  385,  388,  390,
-  393,  396,  403,  408,  413,  422,  427,  436,  444,  452,  771,  790,  808,
-  832,  849,  865,  883,  896,  912,  928,  944,  957,  971,  982,  992,  1001,
-  398,  399,  401,  402,  404,  407,  410,  414,  419,  425,  429,  437,  442,
-  449,  458,  465,  801,  817,  836,  852,  870,  885,  901,  916,  931,  945,
-  956,  969,  980,  990,  999,  1007, 415,  416,  417,  418,  421,  424,  426,
-  430,  434,  441,  445,  453,  459,  463,  473,  480,  829,  845,  858,  873,
-  888,  905,  918,  933,  947,  958,  970,  979,  988,  997,  1005, 1012, 431,
-  432,  433,  435,  438,  440,  443,  446,  451,  456,  461,  468,  475,  479,
-  488,  494,  854,  868,  881,  893,  907,  924,  937,  949,  960,  972,  981,
-  989,  996,  1003, 1010, 1016, 447,  448,  450,  454,  455,  457,  460,  462,
-  469,  472,  477,  482,  490,  495,  499,  503,  876,  891,  903,  914,  926,
-  939,  953,  964,  974,  983,  991,  998,  1004, 1009, 1014, 1019, 464,  466,
-  467,  470,  471,  474,  476,  478,  484,  489,  493,  497,  501,  504,  506,
-  508,  898,  910,  922,  935,  943,  955,  966,  976,  985,  993,  1000, 1006,
-  1011, 1015, 1018, 1021, 481,  483,  485,  486,  487,  491,  492,  496,  498,
-  500,  502,  505,  507,  509,  510,  511,  920,  929,  941,  951,  962,  968,
-  978,  987,  995,  1002, 1008, 1013, 1017, 1020, 1022, 1023,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_h2_iscan_32x32[1024]) = {
-  0,    1,    4,    9,    15,   22,   33,   43,   56,   71,   86,   104,  121,
-  142,  166,  189,  214,  233,  254,  273,  292,  309,  328,  345,  362,  378,
-  397,  415,  431,  447,  464,  481,  2,    3,    6,    11,   17,   26,   35,
-  45,   58,   73,   90,   106,  123,  146,  168,  193,  215,  236,  255,  274,
-  294,  310,  329,  346,  363,  381,  399,  416,  432,  448,  465,  482,  5,
-  7,    8,    13,   20,   28,   37,   50,   62,   75,   92,   108,  129,  150,
-  170,  195,  216,  240,  259,  275,  295,  312,  331,  348,  365,  383,  400,
-  417,  433,  449,  467,  485,  10,   12,   14,   19,   23,   31,   41,   52,
-  65,   81,   96,   113,  133,  152,  175,  201,  221,  243,  260,  280,  297,
-  315,  333,  350,  367,  385,  402,  418,  434,  452,  470,  486,  16,   18,
-  21,   24,   30,   39,   48,   59,   69,   83,   100,  119,  137,  158,  181,
-  203,  226,  244,  264,  283,  300,  318,  335,  353,  370,  388,  404,  420,
-  438,  455,  471,  487,  25,   27,   29,   32,   40,   46,   54,   67,   79,
-  94,   109,  127,  143,  164,  185,  210,  231,  250,  269,  285,  304,  322,
-  339,  356,  373,  389,  407,  423,  440,  457,  473,  491,  34,   36,   38,
-  42,   49,   55,   64,   76,   87,   102,  117,  135,  154,  176,  197,  219,
-  239,  256,  272,  291,  308,  324,  341,  359,  377,  393,  410,  426,  442,
-  460,  476,  492,  44,   47,   51,   53,   60,   68,   77,   85,   98,   114,
-  131,  147,  162,  183,  208,  227,  245,  262,  282,  298,  314,  332,  349,
-  364,  379,  396,  412,  430,  446,  462,  478,  495,  57,   61,   63,   66,
-  70,   80,   88,   99,   112,  124,  140,  159,  179,  199,  217,  234,  253,
-  270,  286,  305,  321,  337,  354,  371,  387,  403,  419,  435,  451,  468,
-  484,  498,  72,   74,   78,   82,   84,   95,   103,  115,  125,  139,  156,
-  173,  190,  211,  229,  246,  261,  281,  296,  311,  325,  344,  360,  375,
-  392,  408,  425,  441,  456,  472,  489,  500,  89,   91,   93,   97,   101,
-  110,  118,  132,  141,  157,  171,  186,  206,  224,  241,  257,  271,  287,
-  303,  320,  336,  351,  366,  384,  398,  413,  429,  445,  461,  477,  493,
-  502,  105,  107,  111,  116,  120,  128,  136,  148,  160,  174,  187,  205,
-  222,  237,  251,  267,  284,  299,  313,  327,  343,  358,  374,  390,  405,
-  422,  437,  453,  469,  483,  497,  505,  122,  126,  130,  134,  138,  144,
-  155,  163,  180,  191,  207,  223,  232,  248,  265,  278,  293,  307,  323,
-  338,  352,  368,  382,  395,  411,  427,  443,  459,  475,  490,  501,  507,
-  145,  149,  151,  153,  161,  165,  177,  184,  200,  212,  225,  238,  249,
-  263,  276,  289,  306,  319,  334,  347,  361,  376,  391,  406,  421,  436,
-  450,  463,  479,  496,  504,  509,  167,  169,  172,  178,  182,  188,  198,
-  209,  218,  230,  242,  252,  266,  277,  288,  301,  317,  330,  342,  357,
-  372,  386,  401,  414,  428,  444,  458,  474,  488,  499,  506,  510,  192,
-  194,  196,  202,  204,  213,  220,  228,  235,  247,  258,  268,  279,  290,
-  302,  316,  326,  340,  355,  369,  380,  394,  409,  424,  439,  454,  466,
-  480,  494,  503,  508,  511,  512,  513,  514,  515,  516,  517,  520,  523,
-  526,  532,  537,  545,  551,  561,  573,  581,  596,  610,  625,  642,  661,
-  680,  701,  722,  745,  770,  800,  827,  853,  875,  897,  919,  518,  519,
-  521,  522,  524,  525,  528,  533,  536,  542,  549,  557,  564,  575,  585,
-  597,  611,  623,  640,  656,  676,  696,  717,  739,  763,  789,  815,  844,
-  867,  889,  909,  927,  527,  529,  530,  531,  534,  535,  538,  544,  548,
-  555,  560,  569,  579,  589,  598,  614,  626,  641,  655,  673,  690,  712,
-  735,  760,  780,  806,  834,  857,  880,  902,  921,  940,  539,  540,  541,
-  543,  546,  547,  550,  558,  562,  567,  576,  583,  593,  603,  616,  631,
-  643,  657,  674,  689,  710,  733,  752,  776,  803,  830,  850,  872,  892,
-  913,  934,  950,  552,  553,  554,  556,  559,  563,  565,  571,  577,  582,
-  591,  600,  609,  620,  634,  644,  662,  677,  691,  711,  730,  748,  773,
-  798,  822,  847,  869,  887,  906,  925,  942,  961,  566,  568,  570,  572,
-  574,  578,  580,  588,  594,  601,  608,  617,  629,  637,  652,  665,  681,
-  697,  713,  734,  749,  772,  793,  819,  842,  863,  884,  904,  923,  938,
-  954,  967,  584,  586,  587,  590,  592,  595,  599,  605,  613,  618,  628,
-  636,  648,  660,  671,  686,  702,  718,  736,  753,  774,  794,  818,  840,
-  860,  882,  900,  917,  936,  952,  965,  977,  602,  604,  606,  607,  612,
-  615,  619,  624,  633,  638,  649,  658,  666,  683,  692,  707,  723,  740,
-  761,  777,  799,  820,  841,  859,  877,  895,  915,  932,  948,  963,  975,
-  986,  621,  622,  627,  630,  632,  635,  639,  645,  653,  663,  668,  682,
-  688,  704,  716,  732,  746,  764,  781,  804,  823,  843,  861,  878,  894,
-  911,  930,  946,  959,  973,  984,  994,  646,  647,  650,  651,  654,  659,
-  664,  667,  678,  685,  693,  706,  715,  728,  743,  757,  771,  790,  807,
-  831,  848,  864,  883,  896,  912,  928,  944,  957,  971,  982,  992,  1001,
-  669,  670,  672,  675,  679,  684,  687,  694,  703,  709,  719,  729,  741,
-  754,  767,  783,  801,  816,  835,  851,  870,  885,  901,  916,  931,  945,
-  956,  969,  980,  990,  999,  1007, 695,  698,  699,  700,  705,  708,  714,
-  720,  726,  738,  744,  758,  768,  779,  795,  810,  828,  845,  858,  873,
-  888,  905,  918,  933,  947,  958,  970,  979,  988,  997,  1005, 1012, 721,
-  724,  725,  727,  731,  737,  742,  747,  756,  765,  775,  786,  797,  809,
-  825,  837,  854,  868,  881,  893,  907,  924,  937,  949,  960,  972,  981,
-  989,  996,  1003, 1010, 1016, 750,  751,  755,  759,  762,  766,  769,  778,
-  787,  792,  805,  812,  829,  838,  852,  865,  876,  890,  903,  914,  926,
-  939,  953,  964,  974,  983,  991,  998,  1004, 1009, 1014, 1019, 782,  784,
-  785,  788,  791,  796,  802,  808,  814,  826,  836,  846,  856,  866,  874,
-  886,  898,  910,  922,  935,  943,  955,  966,  976,  985,  993,  1000, 1006,
-  1011, 1015, 1018, 1021, 811,  813,  817,  821,  824,  832,  833,  839,  849,
-  855,  862,  871,  879,  891,  899,  908,  920,  929,  941,  951,  962,  968,
-  978,  987,  995,  1002, 1008, 1013, 1017, 1020, 1022, 1023,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_qtr_iscan_32x32[1024]) = {
-  0,    1,    4,    9,    15,   22,   33,   43,   56,   71,   86,   104,  121,
-  142,  166,  189,  256,  268,  286,  310,  334,  364,  400,  435,  471,  510,
-  553,  598,  640,  683,  732,  780,  2,    3,    6,    11,   17,   26,   35,
-  45,   58,   73,   90,   106,  123,  146,  168,  193,  258,  270,  288,  312,
-  338,  366,  402,  437,  473,  516,  557,  600,  642,  687,  736,  782,  5,
-  7,    8,    13,   20,   28,   37,   50,   62,   75,   92,   108,  129,  150,
-  170,  195,  260,  274,  292,  314,  340,  370,  406,  441,  478,  520,  559,
-  604,  646,  689,  740,  788,  10,   12,   14,   19,   23,   31,   41,   52,
-  65,   81,   96,   113,  133,  152,  175,  201,  262,  276,  294,  316,  344,
-  376,  410,  445,  484,  524,  563,  606,  648,  697,  746,  793,  16,   18,
-  21,   24,   30,   39,   48,   59,   69,   83,   100,  119,  137,  158,  181,
-  203,  264,  278,  300,  322,  350,  380,  414,  451,  490,  530,  571,  612,
-  656,  705,  750,  799,  25,   27,   29,   32,   40,   46,   54,   67,   79,
-  94,   109,  127,  143,  164,  185,  210,  266,  282,  302,  326,  354,  388,
-  422,  459,  496,  533,  579,  618,  665,  711,  754,  809,  34,   36,   38,
-  42,   49,   55,   64,   76,   87,   102,  117,  135,  154,  176,  197,  216,
-  272,  289,  308,  332,  362,  392,  427,  465,  504,  545,  585,  626,  671,
-  717,  766,  813,  44,   47,   51,   53,   60,   68,   77,   85,   98,   114,
-  131,  147,  162,  183,  208,  222,  279,  298,  320,  346,  374,  408,  442,
-  475,  511,  551,  592,  638,  681,  726,  772,  821,  57,   61,   63,   66,
-  70,   80,   88,   99,   112,  124,  140,  159,  179,  199,  214,  227,  284,
-  304,  328,  355,  386,  418,  455,  492,  528,  567,  608,  649,  695,  742,
-  786,  833,  72,   74,   78,   82,   84,   95,   103,  115,  125,  139,  156,
-  173,  190,  211,  224,  233,  296,  317,  342,  367,  394,  433,  466,  500,
-  543,  581,  622,  667,  707,  752,  803,  843,  89,   91,   93,   97,   101,
-  110,  118,  132,  141,  157,  171,  186,  206,  220,  231,  239,  306,  330,
-  352,  384,  415,  447,  482,  521,  554,  593,  636,  677,  722,  770,  815,
-  852,  105,  107,  111,  116,  120,  128,  136,  148,  160,  174,  187,  205,
-  218,  229,  237,  244,  323,  347,  371,  398,  431,  463,  498,  534,  573,
-  616,  654,  698,  743,  783,  831,  864,  122,  126,  130,  134,  138,  144,
-  155,  163,  180,  191,  207,  219,  226,  235,  242,  248,  335,  360,  390,
-  419,  449,  485,  518,  549,  587,  630,  672,  715,  760,  805,  845,  872,
-  145,  149,  151,  153,  161,  165,  177,  184,  200,  212,  221,  230,  236,
-  241,  246,  251,  356,  382,  411,  438,  469,  501,  539,  577,  613,  652,
-  690,  730,  776,  822,  858,  886,  167,  169,  172,  178,  182,  188,  198,
-  209,  215,  225,  232,  238,  243,  247,  250,  253,  378,  403,  428,  461,
-  494,  526,  560,  594,  632,  675,  713,  755,  801,  837,  868,  897,  192,
-  194,  196,  202,  204,  213,  217,  223,  228,  234,  240,  245,  249,  252,
-  254,  255,  395,  425,  457,  488,  512,  547,  583,  619,  659,  699,  737,
-  778,  819,  854,  882,  907,  257,  259,  261,  263,  265,  267,  273,  280,
-  285,  297,  307,  324,  336,  357,  379,  396,  424,  452,  479,  508,  541,
-  574,  609,  643,  679,  719,  764,  806,  841,  870,  895,  919,  269,  271,
-  275,  277,  281,  283,  290,  299,  305,  318,  331,  348,  361,  383,  404,
-  426,  453,  476,  506,  535,  568,  601,  634,  669,  708,  748,  789,  829,
-  860,  887,  909,  927,  287,  291,  293,  295,  301,  303,  309,  321,  329,
-  343,  353,  372,  391,  412,  429,  458,  480,  507,  532,  564,  590,  627,
-  663,  703,  733,  773,  816,  847,  876,  901,  921,  940,  311,  313,  315,
-  319,  325,  327,  333,  349,  358,  368,  385,  399,  420,  439,  462,  489,
-  509,  536,  565,  589,  624,  661,  691,  727,  768,  810,  838,  866,  890,
-  913,  934,  950,  337,  339,  341,  345,  351,  359,  363,  375,  387,  397,
-  416,  432,  450,  470,  495,  513,  542,  569,  591,  625,  657,  684,  723,
-  762,  797,  834,  862,  884,  905,  925,  942,  961,  365,  369,  373,  377,
-  381,  389,  393,  409,  421,  434,  448,  464,  486,  502,  527,  548,  575,
-  602,  628,  662,  685,  721,  756,  794,  827,  855,  880,  903,  923,  938,
-  954,  967,  401,  405,  407,  413,  417,  423,  430,  443,  456,  467,  483,
-  499,  519,  540,  561,  584,  610,  635,  664,  692,  724,  757,  792,  825,
-  850,  878,  899,  917,  936,  952,  965,  977,  436,  440,  444,  446,  454,
-  460,  468,  477,  493,  503,  522,  537,  550,  578,  595,  620,  644,  670,
-  704,  728,  763,  795,  826,  849,  873,  893,  915,  932,  948,  963,  975,
-  986,  472,  474,  481,  487,  491,  497,  505,  514,  529,  544,  555,  576,
-  588,  614,  633,  660,  680,  709,  734,  769,  798,  828,  851,  874,  892,
-  911,  930,  946,  959,  973,  984,  994,  515,  517,  523,  525,  531,  538,
-  546,  552,  570,  582,  596,  617,  631,  653,  676,  700,  720,  749,  774,
-  811,  835,  856,  879,  894,  912,  928,  944,  957,  971,  982,  992,  1001,
-  556,  558,  562,  566,  572,  580,  586,  597,  611,  623,  637,  655,  673,
-  693,  714,  738,  765,  790,  817,  839,  863,  881,  900,  916,  931,  945,
-  956,  969,  980,  990,  999,  1007, 599,  603,  605,  607,  615,  621,  629,
-  639,  650,  668,  678,  701,  716,  731,  758,  779,  807,  830,  848,  867,
-  885,  904,  918,  933,  947,  958,  970,  979,  988,  997,  1005, 1012, 641,
-  645,  647,  651,  658,  666,  674,  682,  696,  710,  725,  744,  761,  777,
-  802,  820,  842,  861,  877,  891,  906,  924,  937,  949,  960,  972,  981,
-  989,  996,  1003, 1010, 1016, 686,  688,  694,  702,  706,  712,  718,  729,
-  745,  753,  771,  784,  808,  823,  840,  857,  871,  888,  902,  914,  926,
-  939,  953,  964,  974,  983,  991,  998,  1004, 1009, 1014, 1019, 735,  739,
-  741,  747,  751,  759,  767,  775,  787,  804,  818,  832,  846,  859,  869,
-  883,  896,  910,  922,  935,  943,  955,  966,  976,  985,  993,  1000, 1006,
-  1011, 1015, 1018, 1021, 781,  785,  791,  796,  800,  812,  814,  824,  836,
-  844,  853,  865,  875,  889,  898,  908,  920,  929,  941,  951,  962,  968,
-  978,  987,  995,  1002, 1008, 1013, 1017, 1020, 1022, 1023,
-};
-
-#if CONFIG_TX64X64
-DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x64[2048]) = {
-  0,    1,    3,    6,    10,   15,   21,   28,   36,   45,   55,   66,   78,
-  91,   105,  120,  136,  153,  171,  190,  210,  231,  253,  276,  300,  325,
-  351,  378,  406,  435,  465,  496,  2,    4,    7,    11,   16,   22,   29,
-  37,   46,   56,   67,   79,   92,   106,  121,  137,  154,  172,  191,  211,
-  232,  254,  277,  301,  326,  352,  379,  407,  436,  466,  497,  528,  5,
-  8,    12,   17,   23,   30,   38,   47,   57,   68,   80,   93,   107,  122,
-  138,  155,  173,  192,  212,  233,  255,  278,  302,  327,  353,  380,  408,
-  437,  467,  498,  529,  560,  9,    13,   18,   24,   31,   39,   48,   58,
-  69,   81,   94,   108,  123,  139,  156,  174,  193,  213,  234,  256,  279,
-  303,  328,  354,  381,  409,  438,  468,  499,  530,  561,  592,  14,   19,
-  25,   32,   40,   49,   59,   70,   82,   95,   109,  124,  140,  157,  175,
-  194,  214,  235,  257,  280,  304,  329,  355,  382,  410,  439,  469,  500,
-  531,  562,  593,  624,  20,   26,   33,   41,   50,   60,   71,   83,   96,
-  110,  125,  141,  158,  176,  195,  215,  236,  258,  281,  305,  330,  356,
-  383,  411,  440,  470,  501,  532,  563,  594,  625,  656,  27,   34,   42,
-  51,   61,   72,   84,   97,   111,  126,  142,  159,  177,  196,  216,  237,
-  259,  282,  306,  331,  357,  384,  412,  441,  471,  502,  533,  564,  595,
-  626,  657,  688,  35,   43,   52,   62,   73,   85,   98,   112,  127,  143,
-  160,  178,  197,  217,  238,  260,  283,  307,  332,  358,  385,  413,  442,
-  472,  503,  534,  565,  596,  627,  658,  689,  720,  44,   53,   63,   74,
-  86,   99,   113,  128,  144,  161,  179,  198,  218,  239,  261,  284,  308,
-  333,  359,  386,  414,  443,  473,  504,  535,  566,  597,  628,  659,  690,
-  721,  752,  54,   64,   75,   87,   100,  114,  129,  145,  162,  180,  199,
-  219,  240,  262,  285,  309,  334,  360,  387,  415,  444,  474,  505,  536,
-  567,  598,  629,  660,  691,  722,  753,  784,  65,   76,   88,   101,  115,
-  130,  146,  163,  181,  200,  220,  241,  263,  286,  310,  335,  361,  388,
-  416,  445,  475,  506,  537,  568,  599,  630,  661,  692,  723,  754,  785,
-  816,  77,   89,   102,  116,  131,  147,  164,  182,  201,  221,  242,  264,
-  287,  311,  336,  362,  389,  417,  446,  476,  507,  538,  569,  600,  631,
-  662,  693,  724,  755,  786,  817,  848,  90,   103,  117,  132,  148,  165,
-  183,  202,  222,  243,  265,  288,  312,  337,  363,  390,  418,  447,  477,
-  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,
-  104,  118,  133,  149,  166,  184,  203,  223,  244,  266,  289,  313,  338,
-  364,  391,  419,  448,  478,  509,  540,  571,  602,  633,  664,  695,  726,
-  757,  788,  819,  850,  881,  912,  119,  134,  150,  167,  185,  204,  224,
-  245,  267,  290,  314,  339,  365,  392,  420,  449,  479,  510,  541,  572,
-  603,  634,  665,  696,  727,  758,  789,  820,  851,  882,  913,  944,  135,
-  151,  168,  186,  205,  225,  246,  268,  291,  315,  340,  366,  393,  421,
-  450,  480,  511,  542,  573,  604,  635,  666,  697,  728,  759,  790,  821,
-  852,  883,  914,  945,  976,  152,  169,  187,  206,  226,  247,  269,  292,
-  316,  341,  367,  394,  422,  451,  481,  512,  543,  574,  605,  636,  667,
-  698,  729,  760,  791,  822,  853,  884,  915,  946,  977,  1008, 170,  188,
-  207,  227,  248,  270,  293,  317,  342,  368,  395,  423,  452,  482,  513,
-  544,  575,  606,  637,  668,  699,  730,  761,  792,  823,  854,  885,  916,
-  947,  978,  1009, 1040, 189,  208,  228,  249,  271,  294,  318,  343,  369,
-  396,  424,  453,  483,  514,  545,  576,  607,  638,  669,  700,  731,  762,
-  793,  824,  855,  886,  917,  948,  979,  1010, 1041, 1072, 209,  229,  250,
-  272,  295,  319,  344,  370,  397,  425,  454,  484,  515,  546,  577,  608,
-  639,  670,  701,  732,  763,  794,  825,  856,  887,  918,  949,  980,  1011,
-  1042, 1073, 1104, 230,  251,  273,  296,  320,  345,  371,  398,  426,  455,
-  485,  516,  547,  578,  609,  640,  671,  702,  733,  764,  795,  826,  857,
-  888,  919,  950,  981,  1012, 1043, 1074, 1105, 1136, 252,  274,  297,  321,
-  346,  372,  399,  427,  456,  486,  517,  548,  579,  610,  641,  672,  703,
-  734,  765,  796,  827,  858,  889,  920,  951,  982,  1013, 1044, 1075, 1106,
-  1137, 1168, 275,  298,  322,  347,  373,  400,  428,  457,  487,  518,  549,
-  580,  611,  642,  673,  704,  735,  766,  797,  828,  859,  890,  921,  952,
-  983,  1014, 1045, 1076, 1107, 1138, 1169, 1200, 299,  323,  348,  374,  401,
-  429,  458,  488,  519,  550,  581,  612,  643,  674,  705,  736,  767,  798,
-  829,  860,  891,  922,  953,  984,  1015, 1046, 1077, 1108, 1139, 1170, 1201,
-  1232, 324,  349,  375,  402,  430,  459,  489,  520,  551,  582,  613,  644,
-  675,  706,  737,  768,  799,  830,  861,  892,  923,  954,  985,  1016, 1047,
-  1078, 1109, 1140, 1171, 1202, 1233, 1264, 350,  376,  403,  431,  460,  490,
-  521,  552,  583,  614,  645,  676,  707,  738,  769,  800,  831,  862,  893,
-  924,  955,  986,  1017, 1048, 1079, 1110, 1141, 1172, 1203, 1234, 1265, 1296,
-  377,  404,  432,  461,  491,  522,  553,  584,  615,  646,  677,  708,  739,
-  770,  801,  832,  863,  894,  925,  956,  987,  1018, 1049, 1080, 1111, 1142,
-  1173, 1204, 1235, 1266, 1297, 1328, 405,  433,  462,  492,  523,  554,  585,
-  616,  647,  678,  709,  740,  771,  802,  833,  864,  895,  926,  957,  988,
-  1019, 1050, 1081, 1112, 1143, 1174, 1205, 1236, 1267, 1298, 1329, 1360, 434,
-  463,  493,  524,  555,  586,  617,  648,  679,  710,  741,  772,  803,  834,
-  865,  896,  927,  958,  989,  1020, 1051, 1082, 1113, 1144, 1175, 1206, 1237,
-  1268, 1299, 1330, 1361, 1392, 464,  494,  525,  556,  587,  618,  649,  680,
-  711,  742,  773,  804,  835,  866,  897,  928,  959,  990,  1021, 1052, 1083,
-  1114, 1145, 1176, 1207, 1238, 1269, 1300, 1331, 1362, 1393, 1424, 495,  526,
-  557,  588,  619,  650,  681,  712,  743,  774,  805,  836,  867,  898,  929,
-  960,  991,  1022, 1053, 1084, 1115, 1146, 1177, 1208, 1239, 1270, 1301, 1332,
-  1363, 1394, 1425, 1456, 527,  558,  589,  620,  651,  682,  713,  744,  775,
-  806,  837,  868,  899,  930,  961,  992,  1023, 1054, 1085, 1116, 1147, 1178,
-  1209, 1240, 1271, 1302, 1333, 1364, 1395, 1426, 1457, 1488, 559,  590,  621,
-  652,  683,  714,  745,  776,  807,  838,  869,  900,  931,  962,  993,  1024,
-  1055, 1086, 1117, 1148, 1179, 1210, 1241, 1272, 1303, 1334, 1365, 1396, 1427,
-  1458, 1489, 1520, 591,  622,  653,  684,  715,  746,  777,  808,  839,  870,
-  901,  932,  963,  994,  1025, 1056, 1087, 1118, 1149, 1180, 1211, 1242, 1273,
-  1304, 1335, 1366, 1397, 1428, 1459, 1490, 1521, 1552, 623,  654,  685,  716,
-  747,  778,  809,  840,  871,  902,  933,  964,  995,  1026, 1057, 1088, 1119,
-  1150, 1181, 1212, 1243, 1274, 1305, 1336, 1367, 1398, 1429, 1460, 1491, 1522,
-  1553, 1583, 655,  686,  717,  748,  779,  810,  841,  872,  903,  934,  965,
-  996,  1027, 1058, 1089, 1120, 1151, 1182, 1213, 1244, 1275, 1306, 1337, 1368,
-  1399, 1430, 1461, 1492, 1523, 1554, 1584, 1613, 687,  718,  749,  780,  811,
-  842,  873,  904,  935,  966,  997,  1028, 1059, 1090, 1121, 1152, 1183, 1214,
-  1245, 1276, 1307, 1338, 1369, 1400, 1431, 1462, 1493, 1524, 1555, 1585, 1614,
-  1642, 719,  750,  781,  812,  843,  874,  905,  936,  967,  998,  1029, 1060,
-  1091, 1122, 1153, 1184, 1215, 1246, 1277, 1308, 1339, 1370, 1401, 1432, 1463,
-  1494, 1525, 1556, 1586, 1615, 1643, 1670, 751,  782,  813,  844,  875,  906,
-  937,  968,  999,  1030, 1061, 1092, 1123, 1154, 1185, 1216, 1247, 1278, 1309,
-  1340, 1371, 1402, 1433, 1464, 1495, 1526, 1557, 1587, 1616, 1644, 1671, 1697,
-  783,  814,  845,  876,  907,  938,  969,  1000, 1031, 1062, 1093, 1124, 1155,
-  1186, 1217, 1248, 1279, 1310, 1341, 1372, 1403, 1434, 1465, 1496, 1527, 1558,
-  1588, 1617, 1645, 1672, 1698, 1723, 815,  846,  877,  908,  939,  970,  1001,
-  1032, 1063, 1094, 1125, 1156, 1187, 1218, 1249, 1280, 1311, 1342, 1373, 1404,
-  1435, 1466, 1497, 1528, 1559, 1589, 1618, 1646, 1673, 1699, 1724, 1748, 847,
-  878,  909,  940,  971,  1002, 1033, 1064, 1095, 1126, 1157, 1188, 1219, 1250,
-  1281, 1312, 1343, 1374, 1405, 1436, 1467, 1498, 1529, 1560, 1590, 1619, 1647,
-  1674, 1700, 1725, 1749, 1772, 879,  910,  941,  972,  1003, 1034, 1065, 1096,
-  1127, 1158, 1189, 1220, 1251, 1282, 1313, 1344, 1375, 1406, 1437, 1468, 1499,
-  1530, 1561, 1591, 1620, 1648, 1675, 1701, 1726, 1750, 1773, 1795, 911,  942,
-  973,  1004, 1035, 1066, 1097, 1128, 1159, 1190, 1221, 1252, 1283, 1314, 1345,
-  1376, 1407, 1438, 1469, 1500, 1531, 1562, 1592, 1621, 1649, 1676, 1702, 1727,
-  1751, 1774, 1796, 1817, 943,  974,  1005, 1036, 1067, 1098, 1129, 1160, 1191,
-  1222, 1253, 1284, 1315, 1346, 1377, 1408, 1439, 1470, 1501, 1532, 1563, 1593,
-  1622, 1650, 1677, 1703, 1728, 1752, 1775, 1797, 1818, 1838, 975,  1006, 1037,
-  1068, 1099, 1130, 1161, 1192, 1223, 1254, 1285, 1316, 1347, 1378, 1409, 1440,
-  1471, 1502, 1533, 1564, 1594, 1623, 1651, 1678, 1704, 1729, 1753, 1776, 1798,
-  1819, 1839, 1858, 1007, 1038, 1069, 1100, 1131, 1162, 1193, 1224, 1255, 1286,
-  1317, 1348, 1379, 1410, 1441, 1472, 1503, 1534, 1565, 1595, 1624, 1652, 1679,
-  1705, 1730, 1754, 1777, 1799, 1820, 1840, 1859, 1877, 1039, 1070, 1101, 1132,
-  1163, 1194, 1225, 1256, 1287, 1318, 1349, 1380, 1411, 1442, 1473, 1504, 1535,
-  1566, 1596, 1625, 1653, 1680, 1706, 1731, 1755, 1778, 1800, 1821, 1841, 1860,
-  1878, 1895, 1071, 1102, 1133, 1164, 1195, 1226, 1257, 1288, 1319, 1350, 1381,
-  1412, 1443, 1474, 1505, 1536, 1567, 1597, 1626, 1654, 1681, 1707, 1732, 1756,
-  1779, 1801, 1822, 1842, 1861, 1879, 1896, 1912, 1103, 1134, 1165, 1196, 1227,
-  1258, 1289, 1320, 1351, 1382, 1413, 1444, 1475, 1506, 1537, 1568, 1598, 1627,
-  1655, 1682, 1708, 1733, 1757, 1780, 1802, 1823, 1843, 1862, 1880, 1897, 1913,
-  1928, 1135, 1166, 1197, 1228, 1259, 1290, 1321, 1352, 1383, 1414, 1445, 1476,
-  1507, 1538, 1569, 1599, 1628, 1656, 1683, 1709, 1734, 1758, 1781, 1803, 1824,
-  1844, 1863, 1881, 1898, 1914, 1929, 1943, 1167, 1198, 1229, 1260, 1291, 1322,
-  1353, 1384, 1415, 1446, 1477, 1508, 1539, 1570, 1600, 1629, 1657, 1684, 1710,
-  1735, 1759, 1782, 1804, 1825, 1845, 1864, 1882, 1899, 1915, 1930, 1944, 1957,
-  1199, 1230, 1261, 1292, 1323, 1354, 1385, 1416, 1447, 1478, 1509, 1540, 1571,
-  1601, 1630, 1658, 1685, 1711, 1736, 1760, 1783, 1805, 1826, 1846, 1865, 1883,
-  1900, 1916, 1931, 1945, 1958, 1970, 1231, 1262, 1293, 1324, 1355, 1386, 1417,
-  1448, 1479, 1510, 1541, 1572, 1602, 1631, 1659, 1686, 1712, 1737, 1761, 1784,
-  1806, 1827, 1847, 1866, 1884, 1901, 1917, 1932, 1946, 1959, 1971, 1982, 1263,
-  1294, 1325, 1356, 1387, 1418, 1449, 1480, 1511, 1542, 1573, 1603, 1632, 1660,
-  1687, 1713, 1738, 1762, 1785, 1807, 1828, 1848, 1867, 1885, 1902, 1918, 1933,
-  1947, 1960, 1972, 1983, 1993, 1295, 1326, 1357, 1388, 1419, 1450, 1481, 1512,
-  1543, 1574, 1604, 1633, 1661, 1688, 1714, 1739, 1763, 1786, 1808, 1829, 1849,
-  1868, 1886, 1903, 1919, 1934, 1948, 1961, 1973, 1984, 1994, 2003, 1327, 1358,
-  1389, 1420, 1451, 1482, 1513, 1544, 1575, 1605, 1634, 1662, 1689, 1715, 1740,
-  1764, 1787, 1809, 1830, 1850, 1869, 1887, 1904, 1920, 1935, 1949, 1962, 1974,
-  1985, 1995, 2004, 2012, 1359, 1390, 1421, 1452, 1483, 1514, 1545, 1576, 1606,
-  1635, 1663, 1690, 1716, 1741, 1765, 1788, 1810, 1831, 1851, 1870, 1888, 1905,
-  1921, 1936, 1950, 1963, 1975, 1986, 1996, 2005, 2013, 2020, 1391, 1422, 1453,
-  1484, 1515, 1546, 1577, 1607, 1636, 1664, 1691, 1717, 1742, 1766, 1789, 1811,
-  1832, 1852, 1871, 1889, 1906, 1922, 1937, 1951, 1964, 1976, 1987, 1997, 2006,
-  2014, 2021, 2027, 1423, 1454, 1485, 1516, 1547, 1578, 1608, 1637, 1665, 1692,
-  1718, 1743, 1767, 1790, 1812, 1833, 1853, 1872, 1890, 1907, 1923, 1938, 1952,
-  1965, 1977, 1988, 1998, 2007, 2015, 2022, 2028, 2033, 1455, 1486, 1517, 1548,
-  1579, 1609, 1638, 1666, 1693, 1719, 1744, 1768, 1791, 1813, 1834, 1854, 1873,
-  1891, 1908, 1924, 1939, 1953, 1966, 1978, 1989, 1999, 2008, 2016, 2023, 2029,
-  2034, 2038, 1487, 1518, 1549, 1580, 1610, 1639, 1667, 1694, 1720, 1745, 1769,
-  1792, 1814, 1835, 1855, 1874, 1892, 1909, 1925, 1940, 1954, 1967, 1979, 1990,
-  2000, 2009, 2017, 2024, 2030, 2035, 2039, 2042, 1519, 1550, 1581, 1611, 1640,
-  1668, 1695, 1721, 1746, 1770, 1793, 1815, 1836, 1856, 1875, 1893, 1910, 1926,
-  1941, 1955, 1968, 1980, 1991, 2001, 2010, 2018, 2025, 2031, 2036, 2040, 2043,
-  2045, 1551, 1582, 1612, 1641, 1669, 1696, 1722, 1747, 1771, 1794, 1816, 1837,
-  1857, 1876, 1894, 1911, 1927, 1942, 1956, 1969, 1981, 1992, 2002, 2011, 2019,
-  2026, 2032, 2037, 2041, 2044, 2046, 2047,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_64x32[2048]) = {
-  0,    1,    3,    6,    10,   15,   21,   28,   36,   45,   55,   66,   78,
-  91,   105,  120,  136,  153,  171,  190,  210,  231,  253,  276,  300,  325,
-  351,  378,  406,  435,  465,  496,  528,  560,  592,  624,  656,  688,  720,
-  752,  784,  816,  848,  880,  912,  944,  976,  1008, 1040, 1072, 1104, 1136,
-  1168, 1200, 1232, 1264, 1296, 1328, 1360, 1392, 1424, 1456, 1488, 1520, 2,
-  4,    7,    11,   16,   22,   29,   37,   46,   56,   67,   79,   92,   106,
-  121,  137,  154,  172,  191,  211,  232,  254,  277,  301,  326,  352,  379,
-  407,  436,  466,  497,  529,  561,  593,  625,  657,  689,  721,  753,  785,
-  817,  849,  881,  913,  945,  977,  1009, 1041, 1073, 1105, 1137, 1169, 1201,
-  1233, 1265, 1297, 1329, 1361, 1393, 1425, 1457, 1489, 1521, 1552, 5,    8,
-  12,   17,   23,   30,   38,   47,   57,   68,   80,   93,   107,  122,  138,
-  155,  173,  192,  212,  233,  255,  278,  302,  327,  353,  380,  408,  437,
-  467,  498,  530,  562,  594,  626,  658,  690,  722,  754,  786,  818,  850,
-  882,  914,  946,  978,  1010, 1042, 1074, 1106, 1138, 1170, 1202, 1234, 1266,
-  1298, 1330, 1362, 1394, 1426, 1458, 1490, 1522, 1553, 1583, 9,    13,   18,
-  24,   31,   39,   48,   58,   69,   81,   94,   108,  123,  139,  156,  174,
-  193,  213,  234,  256,  279,  303,  328,  354,  381,  409,  438,  468,  499,
-  531,  563,  595,  627,  659,  691,  723,  755,  787,  819,  851,  883,  915,
-  947,  979,  1011, 1043, 1075, 1107, 1139, 1171, 1203, 1235, 1267, 1299, 1331,
-  1363, 1395, 1427, 1459, 1491, 1523, 1554, 1584, 1613, 14,   19,   25,   32,
-  40,   49,   59,   70,   82,   95,   109,  124,  140,  157,  175,  194,  214,
-  235,  257,  280,  304,  329,  355,  382,  410,  439,  469,  500,  532,  564,
-  596,  628,  660,  692,  724,  756,  788,  820,  852,  884,  916,  948,  980,
-  1012, 1044, 1076, 1108, 1140, 1172, 1204, 1236, 1268, 1300, 1332, 1364, 1396,
-  1428, 1460, 1492, 1524, 1555, 1585, 1614, 1642, 20,   26,   33,   41,   50,
-  60,   71,   83,   96,   110,  125,  141,  158,  176,  195,  215,  236,  258,
-  281,  305,  330,  356,  383,  411,  440,  470,  501,  533,  565,  597,  629,
-  661,  693,  725,  757,  789,  821,  853,  885,  917,  949,  981,  1013, 1045,
-  1077, 1109, 1141, 1173, 1205, 1237, 1269, 1301, 1333, 1365, 1397, 1429, 1461,
-  1493, 1525, 1556, 1586, 1615, 1643, 1670, 27,   34,   42,   51,   61,   72,
-  84,   97,   111,  126,  142,  159,  177,  196,  216,  237,  259,  282,  306,
-  331,  357,  384,  412,  441,  471,  502,  534,  566,  598,  630,  662,  694,
-  726,  758,  790,  822,  854,  886,  918,  950,  982,  1014, 1046, 1078, 1110,
-  1142, 1174, 1206, 1238, 1270, 1302, 1334, 1366, 1398, 1430, 1462, 1494, 1526,
-  1557, 1587, 1616, 1644, 1671, 1697, 35,   43,   52,   62,   73,   85,   98,
-  112,  127,  143,  160,  178,  197,  217,  238,  260,  283,  307,  332,  358,
-  385,  413,  442,  472,  503,  535,  567,  599,  631,  663,  695,  727,  759,
-  791,  823,  855,  887,  919,  951,  983,  1015, 1047, 1079, 1111, 1143, 1175,
-  1207, 1239, 1271, 1303, 1335, 1367, 1399, 1431, 1463, 1495, 1527, 1558, 1588,
-  1617, 1645, 1672, 1698, 1723, 44,   53,   63,   74,   86,   99,   113,  128,
-  144,  161,  179,  198,  218,  239,  261,  284,  308,  333,  359,  386,  414,
-  443,  473,  504,  536,  568,  600,  632,  664,  696,  728,  760,  792,  824,
-  856,  888,  920,  952,  984,  1016, 1048, 1080, 1112, 1144, 1176, 1208, 1240,
-  1272, 1304, 1336, 1368, 1400, 1432, 1464, 1496, 1528, 1559, 1589, 1618, 1646,
-  1673, 1699, 1724, 1748, 54,   64,   75,   87,   100,  114,  129,  145,  162,
-  180,  199,  219,  240,  262,  285,  309,  334,  360,  387,  415,  444,  474,
-  505,  537,  569,  601,  633,  665,  697,  729,  761,  793,  825,  857,  889,
-  921,  953,  985,  1017, 1049, 1081, 1113, 1145, 1177, 1209, 1241, 1273, 1305,
-  1337, 1369, 1401, 1433, 1465, 1497, 1529, 1560, 1590, 1619, 1647, 1674, 1700,
-  1725, 1749, 1772, 65,   76,   88,   101,  115,  130,  146,  163,  181,  200,
-  220,  241,  263,  286,  310,  335,  361,  388,  416,  445,  475,  506,  538,
-  570,  602,  634,  666,  698,  730,  762,  794,  826,  858,  890,  922,  954,
-  986,  1018, 1050, 1082, 1114, 1146, 1178, 1210, 1242, 1274, 1306, 1338, 1370,
-  1402, 1434, 1466, 1498, 1530, 1561, 1591, 1620, 1648, 1675, 1701, 1726, 1750,
-  1773, 1795, 77,   89,   102,  116,  131,  147,  164,  182,  201,  221,  242,
-  264,  287,  311,  336,  362,  389,  417,  446,  476,  507,  539,  571,  603,
-  635,  667,  699,  731,  763,  795,  827,  859,  891,  923,  955,  987,  1019,
-  1051, 1083, 1115, 1147, 1179, 1211, 1243, 1275, 1307, 1339, 1371, 1403, 1435,
-  1467, 1499, 1531, 1562, 1592, 1621, 1649, 1676, 1702, 1727, 1751, 1774, 1796,
-  1817, 90,   103,  117,  132,  148,  165,  183,  202,  222,  243,  265,  288,
-  312,  337,  363,  390,  418,  447,  477,  508,  540,  572,  604,  636,  668,
-  700,  732,  764,  796,  828,  860,  892,  924,  956,  988,  1020, 1052, 1084,
-  1116, 1148, 1180, 1212, 1244, 1276, 1308, 1340, 1372, 1404, 1436, 1468, 1500,
-  1532, 1563, 1593, 1622, 1650, 1677, 1703, 1728, 1752, 1775, 1797, 1818, 1838,
-  104,  118,  133,  149,  166,  184,  203,  223,  244,  266,  289,  313,  338,
-  364,  391,  419,  448,  478,  509,  541,  573,  605,  637,  669,  701,  733,
-  765,  797,  829,  861,  893,  925,  957,  989,  1021, 1053, 1085, 1117, 1149,
-  1181, 1213, 1245, 1277, 1309, 1341, 1373, 1405, 1437, 1469, 1501, 1533, 1564,
-  1594, 1623, 1651, 1678, 1704, 1729, 1753, 1776, 1798, 1819, 1839, 1858, 119,
-  134,  150,  167,  185,  204,  224,  245,  267,  290,  314,  339,  365,  392,
-  420,  449,  479,  510,  542,  574,  606,  638,  670,  702,  734,  766,  798,
-  830,  862,  894,  926,  958,  990,  1022, 1054, 1086, 1118, 1150, 1182, 1214,
-  1246, 1278, 1310, 1342, 1374, 1406, 1438, 1470, 1502, 1534, 1565, 1595, 1624,
-  1652, 1679, 1705, 1730, 1754, 1777, 1799, 1820, 1840, 1859, 1877, 135,  151,
-  168,  186,  205,  225,  246,  268,  291,  315,  340,  366,  393,  421,  450,
-  480,  511,  543,  575,  607,  639,  671,  703,  735,  767,  799,  831,  863,
-  895,  927,  959,  991,  1023, 1055, 1087, 1119, 1151, 1183, 1215, 1247, 1279,
-  1311, 1343, 1375, 1407, 1439, 1471, 1503, 1535, 1566, 1596, 1625, 1653, 1680,
-  1706, 1731, 1755, 1778, 1800, 1821, 1841, 1860, 1878, 1895, 152,  169,  187,
-  206,  226,  247,  269,  292,  316,  341,  367,  394,  422,  451,  481,  512,
-  544,  576,  608,  640,  672,  704,  736,  768,  800,  832,  864,  896,  928,
-  960,  992,  1024, 1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344,
-  1376, 1408, 1440, 1472, 1504, 1536, 1567, 1597, 1626, 1654, 1681, 1707, 1732,
-  1756, 1779, 1801, 1822, 1842, 1861, 1879, 1896, 1912, 170,  188,  207,  227,
-  248,  270,  293,  317,  342,  368,  395,  423,  452,  482,  513,  545,  577,
-  609,  641,  673,  705,  737,  769,  801,  833,  865,  897,  929,  961,  993,
-  1025, 1057, 1089, 1121, 1153, 1185, 1217, 1249, 1281, 1313, 1345, 1377, 1409,
-  1441, 1473, 1505, 1537, 1568, 1598, 1627, 1655, 1682, 1708, 1733, 1757, 1780,
-  1802, 1823, 1843, 1862, 1880, 1897, 1913, 1928, 189,  208,  228,  249,  271,
-  294,  318,  343,  369,  396,  424,  453,  483,  514,  546,  578,  610,  642,
-  674,  706,  738,  770,  802,  834,  866,  898,  930,  962,  994,  1026, 1058,
-  1090, 1122, 1154, 1186, 1218, 1250, 1282, 1314, 1346, 1378, 1410, 1442, 1474,
-  1506, 1538, 1569, 1599, 1628, 1656, 1683, 1709, 1734, 1758, 1781, 1803, 1824,
-  1844, 1863, 1881, 1898, 1914, 1929, 1943, 209,  229,  250,  272,  295,  319,
-  344,  370,  397,  425,  454,  484,  515,  547,  579,  611,  643,  675,  707,
-  739,  771,  803,  835,  867,  899,  931,  963,  995,  1027, 1059, 1091, 1123,
-  1155, 1187, 1219, 1251, 1283, 1315, 1347, 1379, 1411, 1443, 1475, 1507, 1539,
-  1570, 1600, 1629, 1657, 1684, 1710, 1735, 1759, 1782, 1804, 1825, 1845, 1864,
-  1882, 1899, 1915, 1930, 1944, 1957, 230,  251,  273,  296,  320,  345,  371,
-  398,  426,  455,  485,  516,  548,  580,  612,  644,  676,  708,  740,  772,
-  804,  836,  868,  900,  932,  964,  996,  1028, 1060, 1092, 1124, 1156, 1188,
-  1220, 1252, 1284, 1316, 1348, 1380, 1412, 1444, 1476, 1508, 1540, 1571, 1601,
-  1630, 1658, 1685, 1711, 1736, 1760, 1783, 1805, 1826, 1846, 1865, 1883, 1900,
-  1916, 1931, 1945, 1958, 1970, 252,  274,  297,  321,  346,  372,  399,  427,
-  456,  486,  517,  549,  581,  613,  645,  677,  709,  741,  773,  805,  837,
-  869,  901,  933,  965,  997,  1029, 1061, 1093, 1125, 1157, 1189, 1221, 1253,
-  1285, 1317, 1349, 1381, 1413, 1445, 1477, 1509, 1541, 1572, 1602, 1631, 1659,
-  1686, 1712, 1737, 1761, 1784, 1806, 1827, 1847, 1866, 1884, 1901, 1917, 1932,
-  1946, 1959, 1971, 1982, 275,  298,  322,  347,  373,  400,  428,  457,  487,
-  518,  550,  582,  614,  646,  678,  710,  742,  774,  806,  838,  870,  902,
-  934,  966,  998,  1030, 1062, 1094, 1126, 1158, 1190, 1222, 1254, 1286, 1318,
-  1350, 1382, 1414, 1446, 1478, 1510, 1542, 1573, 1603, 1632, 1660, 1687, 1713,
-  1738, 1762, 1785, 1807, 1828, 1848, 1867, 1885, 1902, 1918, 1933, 1947, 1960,
-  1972, 1983, 1993, 299,  323,  348,  374,  401,  429,  458,  488,  519,  551,
-  583,  615,  647,  679,  711,  743,  775,  807,  839,  871,  903,  935,  967,
-  999,  1031, 1063, 1095, 1127, 1159, 1191, 1223, 1255, 1287, 1319, 1351, 1383,
-  1415, 1447, 1479, 1511, 1543, 1574, 1604, 1633, 1661, 1688, 1714, 1739, 1763,
-  1786, 1808, 1829, 1849, 1868, 1886, 1903, 1919, 1934, 1948, 1961, 1973, 1984,
-  1994, 2003, 324,  349,  375,  402,  430,  459,  489,  520,  552,  584,  616,
-  648,  680,  712,  744,  776,  808,  840,  872,  904,  936,  968,  1000, 1032,
-  1064, 1096, 1128, 1160, 1192, 1224, 1256, 1288, 1320, 1352, 1384, 1416, 1448,
-  1480, 1512, 1544, 1575, 1605, 1634, 1662, 1689, 1715, 1740, 1764, 1787, 1809,
-  1830, 1850, 1869, 1887, 1904, 1920, 1935, 1949, 1962, 1974, 1985, 1995, 2004,
-  2012, 350,  376,  403,  431,  460,  490,  521,  553,  585,  617,  649,  681,
-  713,  745,  777,  809,  841,  873,  905,  937,  969,  1001, 1033, 1065, 1097,
-  1129, 1161, 1193, 1225, 1257, 1289, 1321, 1353, 1385, 1417, 1449, 1481, 1513,
-  1545, 1576, 1606, 1635, 1663, 1690, 1716, 1741, 1765, 1788, 1810, 1831, 1851,
-  1870, 1888, 1905, 1921, 1936, 1950, 1963, 1975, 1986, 1996, 2005, 2013, 2020,
-  377,  404,  432,  461,  491,  522,  554,  586,  618,  650,  682,  714,  746,
-  778,  810,  842,  874,  906,  938,  970,  1002, 1034, 1066, 1098, 1130, 1162,
-  1194, 1226, 1258, 1290, 1322, 1354, 1386, 1418, 1450, 1482, 1514, 1546, 1577,
-  1607, 1636, 1664, 1691, 1717, 1742, 1766, 1789, 1811, 1832, 1852, 1871, 1889,
-  1906, 1922, 1937, 1951, 1964, 1976, 1987, 1997, 2006, 2014, 2021, 2027, 405,
-  433,  462,  492,  523,  555,  587,  619,  651,  683,  715,  747,  779,  811,
-  843,  875,  907,  939,  971,  1003, 1035, 1067, 1099, 1131, 1163, 1195, 1227,
-  1259, 1291, 1323, 1355, 1387, 1419, 1451, 1483, 1515, 1547, 1578, 1608, 1637,
-  1665, 1692, 1718, 1743, 1767, 1790, 1812, 1833, 1853, 1872, 1890, 1907, 1923,
-  1938, 1952, 1965, 1977, 1988, 1998, 2007, 2015, 2022, 2028, 2033, 434,  463,
-  493,  524,  556,  588,  620,  652,  684,  716,  748,  780,  812,  844,  876,
-  908,  940,  972,  1004, 1036, 1068, 1100, 1132, 1164, 1196, 1228, 1260, 1292,
-  1324, 1356, 1388, 1420, 1452, 1484, 1516, 1548, 1579, 1609, 1638, 1666, 1693,
-  1719, 1744, 1768, 1791, 1813, 1834, 1854, 1873, 1891, 1908, 1924, 1939, 1953,
-  1966, 1978, 1989, 1999, 2008, 2016, 2023, 2029, 2034, 2038, 464,  494,  525,
-  557,  589,  621,  653,  685,  717,  749,  781,  813,  845,  877,  909,  941,
-  973,  1005, 1037, 1069, 1101, 1133, 1165, 1197, 1229, 1261, 1293, 1325, 1357,
-  1389, 1421, 1453, 1485, 1517, 1549, 1580, 1610, 1639, 1667, 1694, 1720, 1745,
-  1769, 1792, 1814, 1835, 1855, 1874, 1892, 1909, 1925, 1940, 1954, 1967, 1979,
-  1990, 2000, 2009, 2017, 2024, 2030, 2035, 2039, 2042, 495,  526,  558,  590,
-  622,  654,  686,  718,  750,  782,  814,  846,  878,  910,  942,  974,  1006,
-  1038, 1070, 1102, 1134, 1166, 1198, 1230, 1262, 1294, 1326, 1358, 1390, 1422,
-  1454, 1486, 1518, 1550, 1581, 1611, 1640, 1668, 1695, 1721, 1746, 1770, 1793,
-  1815, 1836, 1856, 1875, 1893, 1910, 1926, 1941, 1955, 1968, 1980, 1991, 2001,
-  2010, 2018, 2025, 2031, 2036, 2040, 2043, 2045, 527,  559,  591,  623,  655,
-  687,  719,  751,  783,  815,  847,  879,  911,  943,  975,  1007, 1039, 1071,
-  1103, 1135, 1167, 1199, 1231, 1263, 1295, 1327, 1359, 1391, 1423, 1455, 1487,
-  1519, 1551, 1582, 1612, 1641, 1669, 1696, 1722, 1747, 1771, 1794, 1816, 1837,
-  1857, 1876, 1894, 1911, 1927, 1942, 1956, 1969, 1981, 1992, 2002, 2011, 2019,
-  2026, 2032, 2037, 2041, 2044, 2046, 2047,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_64x64[4096]) = {
-  0,    1,    4,    9,    15,   22,   33,   43,   56,   71,   86,   104,  121,
-  142,  166,  189,  214,  239,  269,  300,  331,  363,  400,  435,  471,  510,
-  553,  598,  640,  683,  732,  780,  833,  884,  937,  995,  1048, 1107, 1165,
-  1230, 1293, 1353, 1422, 1489, 1562, 1632, 1701, 1776, 1850, 1929, 2006, 2091,
-  2173, 2252, 2339, 2421, 2516, 2603, 2694, 2786, 2879, 2978, 3076, 3175, 2,
-  3,    6,    11,   17,   26,   35,   45,   58,   73,   90,   106,  123,  146,
-  168,  193,  216,  243,  271,  302,  335,  365,  402,  437,  473,  516,  557,
-  600,  642,  687,  736,  782,  835,  886,  941,  999,  1050, 1111, 1167, 1234,
-  1297, 1357, 1424, 1491, 1564, 1636, 1703, 1778, 1852, 1931, 2012, 2095, 2177,
-  2256, 2341, 2425, 2518, 2605, 2698, 2788, 2883, 2982, 3078, 3177, 5,    7,
-  8,    13,   20,   28,   37,   50,   62,   75,   92,   108,  129,  150,  170,
-  195,  218,  249,  277,  304,  337,  369,  406,  441,  478,  520,  559,  604,
-  646,  689,  740,  788,  841,  890,  945,  1001, 1052, 1115, 1173, 1236, 1301,
-  1362, 1428, 1497, 1568, 1638, 1707, 1786, 1858, 1935, 2016, 2097, 2181, 2260,
-  2343, 2431, 2520, 2613, 2702, 2790, 2889, 2984, 3082, 3181, 10,   12,   14,
-  19,   23,   31,   41,   52,   65,   81,   96,   113,  133,  152,  175,  201,
-  224,  253,  279,  310,  341,  375,  410,  445,  484,  524,  563,  606,  648,
-  697,  746,  793,  843,  896,  949,  1005, 1060, 1119, 1181, 1242, 1303, 1366,
-  1436, 1503, 1572, 1640, 1713, 1790, 1865, 1943, 2018, 2103, 2183, 2266, 2347,
-  2437, 2526, 2617, 2708, 2800, 2893, 2992, 3086, 3189, 16,   18,   21,   24,
-  30,   39,   48,   59,   69,   83,   100,  119,  137,  158,  181,  203,  230,
-  255,  286,  316,  347,  380,  414,  451,  490,  530,  571,  612,  656,  705,
-  750,  799,  849,  898,  959,  1009, 1066, 1127, 1184, 1246, 1307, 1376, 1440,
-  1509, 1578, 1644, 1723, 1794, 1871, 1947, 2024, 2109, 2185, 2270, 2361, 2443,
-  2536, 2619, 2710, 2806, 2899, 2998, 3090, 3193, 25,   27,   29,   32,   40,
-  46,   54,   67,   79,   94,   109,  127,  143,  164,  185,  210,  236,  263,
-  292,  320,  353,  388,  422,  459,  496,  533,  579,  618,  665,  711,  754,
-  809,  857,  910,  961,  1015, 1074, 1131, 1194, 1254, 1315, 1384, 1448, 1517,
-  1584, 1655, 1731, 1802, 1875, 1959, 2034, 2115, 2197, 2280, 2367, 2452, 2538,
-  2625, 2722, 2816, 2907, 3004, 3100, 3203, 34,   36,   38,   42,   49,   55,
-  64,   76,   87,   102,  117,  135,  154,  176,  197,  222,  247,  272,  298,
-  329,  361,  392,  427,  465,  504,  545,  585,  626,  671,  717,  766,  813,
-  862,  916,  971,  1028, 1084, 1139, 1200, 1264, 1325, 1390, 1452, 1523, 1594,
-  1667, 1737, 1806, 1887, 1963, 2046, 2123, 2202, 2290, 2371, 2462, 2548, 2641,
-  2732, 2822, 2917, 3010, 3111, 3211, 44,   47,   51,   53,   60,   68,   77,
-  85,   98,   114,  131,  147,  162,  183,  208,  232,  256,  283,  314,  343,
-  373,  408,  442,  475,  511,  551,  592,  638,  681,  726,  772,  821,  874,
-  926,  979,  1034, 1088, 1153, 1214, 1271, 1335, 1396, 1469, 1533, 1600, 1673,
-  1745, 1824, 1897, 1973, 2054, 2131, 2216, 2300, 2383, 2468, 2558, 2649, 2740,
-  2829, 2923, 3022, 3123, 3221, 57,   61,   63,   66,   70,   80,   88,   99,
-  112,  124,  140,  159,  179,  199,  219,  240,  267,  294,  322,  354,  386,
-  418,  455,  492,  528,  567,  608,  649,  695,  742,  786,  836,  882,  933,
-  989,  1046, 1101, 1161, 1216, 1279, 1343, 1410, 1479, 1543, 1614, 1687, 1758,
-  1832, 1905, 1980, 2066, 2141, 2226, 2306, 2395, 2484, 2566, 2659, 2750, 2845,
-  2939, 3032, 3133, 3225, 72,   74,   78,   82,   84,   95,   103,  115,  125,
-  139,  156,  173,  190,  211,  234,  259,  281,  311,  339,  366,  394,  433,
-  466,  500,  543,  581,  622,  667,  707,  752,  803,  853,  899,  955,  1007,
-  1064, 1117, 1175, 1237, 1299, 1354, 1420, 1485, 1556, 1624, 1697, 1770, 1842,
-  1919, 1998, 2074, 2155, 2234, 2319, 2409, 2492, 2581, 2671, 2760, 2859, 2949,
-  3046, 3145, 3245, 89,   91,   93,   97,   101,  110,  118,  132,  141,  157,
-  171,  186,  206,  228,  251,  273,  296,  324,  351,  384,  415,  447,  482,
-  521,  554,  593,  636,  677,  722,  770,  815,  866,  914,  967,  1022, 1078,
-  1135, 1195, 1252, 1313, 1378, 1444, 1507, 1576, 1642, 1714, 1788, 1860, 1933,
-  2013, 2085, 2169, 2250, 2337, 2417, 2502, 2597, 2683, 2778, 2869, 2960, 3060,
-  3157, 3256, 105,  107,  111,  116,  120,  128,  136,  148,  160,  174,  187,
-  205,  225,  244,  265,  290,  317,  344,  370,  398,  431,  463,  498,  534,
-  573,  616,  654,  698,  743,  783,  831,  880,  928,  983,  1036, 1092, 1149,
-  1208, 1266, 1333, 1394, 1457, 1524, 1590, 1665, 1733, 1804, 1879, 1953, 2030,
-  2111, 2189, 2271, 2357, 2441, 2534, 2615, 2704, 2791, 2887, 2979, 3072, 3167,
-  3270, 122,  126,  130,  134,  138,  144,  155,  163,  180,  191,  207,  226,
-  238,  261,  287,  308,  332,  359,  390,  419,  449,  485,  518,  549,  587,
-  630,  672,  715,  760,  805,  855,  900,  953,  1003, 1053, 1108, 1163, 1220,
-  1287, 1345, 1408, 1473, 1541, 1608, 1677, 1749, 1826, 1898, 1971, 2048, 2127,
-  2208, 2294, 2373, 2458, 2542, 2631, 2726, 2818, 2908, 3002, 3094, 3199, 3286,
-  145,  149,  151,  153,  161,  165,  177,  184,  200,  212,  229,  245,  262,
-  284,  305,  327,  355,  382,  411,  438,  469,  501,  539,  577,  613,  652,
-  690,  730,  776,  822,  872,  922,  973,  1024, 1079, 1132, 1188, 1250, 1305,
-  1367, 1432, 1492, 1560, 1626, 1693, 1766, 1838, 1911, 1992, 2068, 2149, 2228,
-  2307, 2393, 2478, 2564, 2655, 2742, 2833, 2927, 3020, 3119, 3219, 3298, 167,
-  169,  172,  178,  182,  188,  198,  209,  220,  235,  252,  266,  288,  306,
-  326,  349,  378,  403,  428,  461,  494,  526,  560,  594,  632,  675,  713,
-  755,  801,  845,  892,  942,  990,  1042, 1096, 1155, 1212, 1267, 1329, 1391,
-  1450, 1519, 1582, 1650, 1724, 1792, 1862, 1936, 2007, 2083, 2167, 2246, 2329,
-  2413, 2496, 2585, 2675, 2761, 2855, 2947, 3040, 3135, 3233, 3320, 192,  194,
-  196,  202,  204,  213,  223,  233,  241,  260,  274,  291,  309,  328,  350,
-  376,  395,  425,  457,  488,  512,  547,  583,  619,  659,  699,  737,  778,
-  819,  868,  917,  965,  1013, 1072, 1123, 1176, 1231, 1289, 1351, 1414, 1474,
-  1539, 1604, 1674, 1741, 1816, 1891, 1961, 2040, 2116, 2191, 2276, 2353, 2438,
-  2524, 2606, 2689, 2784, 2871, 2968, 3062, 3161, 3257, 3334, 215,  217,  221,
-  227,  231,  237,  248,  257,  268,  282,  297,  318,  333,  356,  379,  396,
-  424,  452,  479,  508,  541,  574,  609,  643,  679,  719,  764,  806,  850,
-  894,  938,  987,  1038, 1089, 1145, 1204, 1258, 1316, 1379, 1438, 1501, 1565,
-  1628, 1694, 1764, 1836, 1907, 1981, 2060, 2137, 2220, 2298, 2377, 2464, 2549,
-  2635, 2724, 2812, 2903, 2999, 3088, 3185, 3278, 3350, 242,  246,  250,  254,
-  258,  264,  275,  285,  295,  312,  325,  345,  360,  383,  404,  426,  453,
-  476,  506,  535,  568,  601,  634,  669,  708,  748,  789,  829,  875,  923,
-  968,  1016, 1068, 1120, 1168, 1224, 1280, 1341, 1402, 1465, 1531, 1591, 1661,
-  1729, 1795, 1867, 1937, 2004, 2079, 2159, 2242, 2320, 2405, 2488, 2573, 2661,
-  2744, 2839, 2933, 3023, 3117, 3215, 3296, 3373, 270,  276,  278,  280,  289,
-  293,  299,  315,  323,  340,  352,  371,  391,  412,  429,  458,  480,  507,
-  532,  564,  590,  627,  663,  703,  733,  773,  816,  859,  906,  950,  993,
-  1043, 1094, 1147, 1201, 1256, 1311, 1372, 1429, 1486, 1550, 1618, 1685, 1751,
-  1827, 1895, 1965, 2042, 2119, 2192, 2268, 2348, 2429, 2512, 2599, 2684, 2772,
-  2863, 2951, 3048, 3143, 3239, 3324, 3393, 301,  303,  307,  313,  319,  321,
-  330,  346,  357,  367,  385,  399,  420,  439,  462,  489,  509,  536,  565,
-  589,  624,  661,  691,  727,  768,  810,  846,  887,  929,  977,  1029, 1076,
-  1128, 1177, 1226, 1283, 1339, 1397, 1461, 1521, 1585, 1648, 1715, 1779, 1848,
-  1923, 1996, 2069, 2142, 2224, 2302, 2381, 2465, 2544, 2627, 2720, 2807, 2895,
-  2985, 3073, 3163, 3264, 3338, 3413, 334,  336,  338,  342,  348,  358,  362,
-  374,  387,  397,  416,  432,  450,  470,  495,  513,  542,  569,  591,  625,
-  657,  684,  723,  762,  797,  837,  878,  920,  963,  1010, 1054, 1105, 1157,
-  1206, 1262, 1317, 1374, 1433, 1483, 1545, 1615, 1681, 1743, 1812, 1885, 1954,
-  2025, 2101, 2174, 2248, 2330, 2411, 2490, 2579, 2663, 2745, 2835, 2924, 3018,
-  3115, 3205, 3290, 3363, 3431, 364,  368,  372,  377,  381,  389,  393,  409,
-  421,  434,  448,  464,  486,  502,  527,  548,  575,  602,  628,  662,  685,
-  721,  756,  794,  827,  869,  912,  956,  996,  1040, 1086, 1137, 1189, 1243,
-  1291, 1349, 1404, 1466, 1525, 1588, 1645, 1711, 1774, 1843, 1909, 1988, 2058,
-  2132, 2209, 2288, 2368, 2445, 2527, 2607, 2687, 2780, 2865, 2953, 3049, 3139,
-  3237, 3318, 3387, 3451, 401,  405,  407,  413,  417,  423,  430,  443,  456,
-  467,  483,  499,  519,  540,  561,  584,  610,  635,  664,  692,  724,  757,
-  792,  825,  863,  908,  946,  985,  1032, 1080, 1125, 1169, 1217, 1275, 1330,
-  1386, 1441, 1498, 1554, 1619, 1683, 1746, 1810, 1883, 1949, 2019, 2086, 2165,
-  2238, 2314, 2399, 2479, 2562, 2645, 2733, 2820, 2904, 2996, 3083, 3168, 3268,
-  3339, 3407, 3474, 436,  440,  444,  446,  454,  460,  468,  477,  493,  503,
-  522,  537,  550,  578,  595,  620,  644,  670,  704,  728,  763,  795,  826,
-  861,  901,  935,  980,  1025, 1069, 1112, 1159, 1209, 1260, 1309, 1363, 1418,
-  1475, 1534, 1598, 1656, 1721, 1780, 1846, 1912, 1982, 2056, 2129, 2199, 2278,
-  2358, 2432, 2508, 2593, 2677, 2762, 2851, 2941, 3030, 3124, 3216, 3294, 3365,
-  3433, 3488, 472,  474,  481,  487,  491,  497,  505,  514,  529,  544,  555,
-  576,  588,  614,  633,  660,  680,  709,  734,  769,  798,  828,  864,  902,
-  932,  975,  1020, 1061, 1102, 1150, 1198, 1247, 1294, 1346, 1400, 1455, 1513,
-  1573, 1629, 1689, 1755, 1820, 1888, 1955, 2022, 2092, 2163, 2235, 2312, 2389,
-  2472, 2554, 2632, 2716, 2804, 2884, 2974, 3063, 3153, 3250, 3326, 3395, 3454,
-  3512, 515,  517,  523,  525,  531,  538,  546,  552,  570,  582,  596,  617,
-  631,  653,  676,  700,  720,  749,  774,  811,  838,  870,  909,  936,  976,
-  1017, 1058, 1099, 1143, 1192, 1238, 1284, 1336, 1388, 1445, 1493, 1546, 1610,
-  1671, 1734, 1796, 1856, 1925, 1994, 2062, 2133, 2206, 2281, 2354, 2426, 2503,
-  2587, 2669, 2754, 2843, 2928, 3016, 3105, 3201, 3284, 3351, 3421, 3480, 3534,
-  556,  558,  562,  566,  572,  580,  586,  597,  611,  623,  637,  655,  673,
-  693,  714,  738,  765,  790,  817,  847,  879,  913,  947,  981,  1021, 1059,
-  1097, 1140, 1185, 1227, 1277, 1327, 1380, 1425, 1481, 1537, 1595, 1651, 1708,
-  1771, 1834, 1901, 1966, 2035, 2107, 2170, 2244, 2315, 2396, 2474, 2552, 2628,
-  2711, 2792, 2875, 2966, 3056, 3146, 3234, 3314, 3383, 3445, 3504, 3559, 599,
-  603,  605,  607,  615,  621,  629,  639,  650,  668,  678,  701,  716,  731,
-  758,  779,  807,  830,  860,  888,  921,  957,  986,  1026, 1062, 1100, 1141,
-  1183, 1221, 1272, 1323, 1368, 1416, 1471, 1526, 1580, 1633, 1691, 1752, 1817,
-  1876, 1944, 2002, 2072, 2143, 2218, 2291, 2363, 2435, 2509, 2589, 2672, 2752,
-  2840, 2921, 3008, 3095, 3190, 3274, 3344, 3409, 3470, 3526, 3577, 641,  645,
-  647,  651,  658,  666,  674,  682,  696,  710,  725,  744,  761,  777,  802,
-  820,  851,  876,  907,  930,  964,  997,  1033, 1070, 1103, 1144, 1186, 1222,
-  1270, 1318, 1360, 1411, 1463, 1515, 1569, 1622, 1678, 1739, 1800, 1853, 1917,
-  1983, 2052, 2121, 2186, 2253, 2331, 2406, 2482, 2559, 2639, 2717, 2798, 2877,
-  2961, 3052, 3137, 3226, 3306, 3379, 3437, 3492, 3553, 3601, 686,  688,  694,
-  702,  706,  712,  718,  729,  745,  753,  771,  784,  808,  823,  848,  871,
-  895,  924,  951,  978,  1011, 1041, 1081, 1113, 1151, 1193, 1228, 1273, 1319,
-  1358, 1406, 1458, 1510, 1557, 1612, 1669, 1727, 1781, 1839, 1903, 1969, 2031,
-  2098, 2160, 2232, 2304, 2375, 2453, 2528, 2601, 2679, 2758, 2846, 2929, 3011,
-  3098, 3186, 3271, 3340, 3401, 3466, 3522, 3571, 3620, 735,  739,  741,  747,
-  751,  759,  767,  775,  787,  804,  818,  832,  856,  873,  893,  918,  939,
-  969,  994,  1030, 1055, 1087, 1126, 1160, 1199, 1239, 1278, 1324, 1361, 1407,
-  1453, 1505, 1551, 1605, 1663, 1716, 1768, 1830, 1893, 1951, 2008, 2075, 2139,
-  2214, 2284, 2349, 2418, 2494, 2571, 2653, 2734, 2810, 2890, 2972, 3058, 3147,
-  3231, 3310, 3375, 3435, 3490, 3545, 3595, 3642, 781,  785,  791,  796,  800,
-  812,  814,  824,  839,  854,  867,  881,  903,  925,  943,  966,  988,  1018,
-  1044, 1077, 1106, 1138, 1170, 1210, 1248, 1285, 1328, 1369, 1412, 1459, 1506,
-  1549, 1601, 1657, 1704, 1762, 1821, 1880, 1938, 1999, 2063, 2125, 2193, 2257,
-  2327, 2401, 2475, 2545, 2620, 2691, 2776, 2860, 2942, 3024, 3109, 3197, 3276,
-  3345, 3403, 3468, 3520, 3569, 3616, 3664, 834,  840,  842,  844,  852,  858,
-  865,  877,  883,  904,  915,  931,  954,  974,  991,  1014, 1039, 1071, 1095,
-  1129, 1158, 1190, 1218, 1261, 1295, 1337, 1381, 1417, 1464, 1511, 1552, 1602,
-  1654, 1699, 1759, 1813, 1872, 1927, 1990, 2049, 2113, 2178, 2239, 2308, 2378,
-  2450, 2521, 2594, 2667, 2746, 2824, 2909, 2990, 3070, 3154, 3243, 3316, 3381,
-  3441, 3493, 3547, 3597, 3640, 3682, 885,  889,  891,  897,  905,  911,  919,
-  927,  934,  958,  970,  984,  1004, 1027, 1045, 1073, 1090, 1121, 1148, 1178,
-  1207, 1244, 1276, 1310, 1347, 1389, 1426, 1472, 1516, 1558, 1606, 1658, 1700,
-  1757, 1807, 1868, 1920, 1978, 2043, 2104, 2157, 2229, 2296, 2364, 2422, 2498,
-  2574, 2650, 2727, 2801, 2872, 2954, 3038, 3129, 3212, 3288, 3352, 3419, 3475,
-  3524, 3573, 3621, 3668, 3707, 940,  944,  948,  952,  960,  962,  972,  982,
-  992,  1008, 1023, 1037, 1056, 1082, 1098, 1124, 1146, 1171, 1202, 1229, 1263,
-  1292, 1331, 1364, 1401, 1446, 1482, 1527, 1570, 1613, 1664, 1705, 1760, 1808,
-  1863, 1915, 1976, 2036, 2087, 2153, 2221, 2286, 2344, 2414, 2486, 2556, 2623,
-  2699, 2773, 2853, 2937, 3012, 3091, 3169, 3260, 3330, 3391, 3447, 3505, 3555,
-  3603, 3646, 3684, 3727, 998,  1000, 1002, 1006, 1012, 1019, 1031, 1035, 1047,
-  1065, 1083, 1093, 1109, 1133, 1156, 1179, 1205, 1225, 1257, 1286, 1320, 1350,
-  1387, 1419, 1456, 1494, 1538, 1581, 1623, 1670, 1717, 1763, 1814, 1869, 1916,
-  1974, 2028, 2081, 2150, 2212, 2272, 2335, 2403, 2469, 2539, 2608, 2680, 2755,
-  2827, 2915, 2986, 3068, 3151, 3229, 3300, 3366, 3427, 3484, 3532, 3581, 3630,
-  3672, 3709, 3745, 1049, 1051, 1057, 1063, 1067, 1075, 1085, 1091, 1104, 1118,
-  1136, 1152, 1164, 1191, 1213, 1232, 1259, 1281, 1312, 1340, 1375, 1405, 1442,
-  1476, 1514, 1547, 1596, 1634, 1679, 1728, 1769, 1822, 1873, 1921, 1977, 2029,
-  2078, 2144, 2203, 2264, 2325, 2390, 2459, 2529, 2591, 2665, 2738, 2813, 2880,
-  2957, 3041, 3127, 3206, 3282, 3348, 3399, 3460, 3513, 3565, 3609, 3650, 3695,
-  3733, 3768, 1110, 1114, 1116, 1122, 1130, 1134, 1142, 1154, 1162, 1180, 1196,
-  1211, 1223, 1251, 1268, 1290, 1321, 1342, 1373, 1398, 1434, 1467, 1499, 1535,
-  1574, 1611, 1652, 1692, 1740, 1782, 1831, 1881, 1928, 1979, 2037, 2082, 2145,
-  2200, 2261, 2321, 2387, 2454, 2513, 2583, 2656, 2730, 2793, 2867, 2945, 3025,
-  3101, 3178, 3262, 3328, 3388, 3443, 3494, 3543, 3591, 3636, 3678, 3715, 3754,
-  3790, 1166, 1172, 1174, 1182, 1187, 1197, 1203, 1215, 1219, 1240, 1253, 1269,
-  1288, 1306, 1332, 1352, 1382, 1403, 1430, 1462, 1484, 1528, 1555, 1599, 1630,
-  1672, 1709, 1753, 1801, 1840, 1894, 1939, 1991, 2044, 2088, 2151, 2204, 2262,
-  2318, 2384, 2448, 2504, 2577, 2646, 2712, 2782, 2856, 2934, 3006, 3079, 3158,
-  3240, 3307, 3371, 3425, 3481, 3530, 3575, 3618, 3660, 3701, 3741, 3774, 3807,
-  1233, 1235, 1241, 1245, 1249, 1255, 1265, 1274, 1282, 1300, 1314, 1334, 1348,
-  1370, 1392, 1415, 1439, 1468, 1487, 1522, 1548, 1589, 1620, 1659, 1690, 1735,
-  1772, 1818, 1854, 1904, 1952, 2000, 2050, 2105, 2154, 2213, 2265, 2322, 2385,
-  2446, 2500, 2569, 2642, 2705, 2770, 2849, 2919, 2993, 3064, 3140, 3223, 3292,
-  3353, 3414, 3464, 3516, 3561, 3607, 3648, 3687, 3725, 3762, 3796, 3827, 1296,
-  1298, 1302, 1304, 1308, 1322, 1326, 1338, 1344, 1355, 1383, 1395, 1409, 1435,
-  1451, 1477, 1502, 1532, 1553, 1586, 1616, 1646, 1684, 1722, 1756, 1797, 1835,
-  1877, 1918, 1970, 2009, 2064, 2114, 2158, 2222, 2273, 2326, 2388, 2449, 2501,
-  2567, 2636, 2695, 2768, 2836, 2910, 2976, 3053, 3131, 3209, 3279, 3336, 3397,
-  3449, 3500, 3549, 3593, 3634, 3676, 3713, 3747, 3784, 3817, 3845, 1356, 1359,
-  1365, 1371, 1377, 1385, 1393, 1399, 1413, 1421, 1447, 1460, 1478, 1495, 1520,
-  1540, 1566, 1592, 1621, 1649, 1682, 1712, 1747, 1783, 1823, 1857, 1902, 1945,
-  1984, 2032, 2076, 2126, 2179, 2230, 2287, 2336, 2391, 2455, 2505, 2570, 2637,
-  2692, 2763, 2830, 2901, 2969, 3044, 3120, 3194, 3265, 3331, 3385, 3439, 3486,
-  3536, 3582, 3626, 3665, 3703, 3739, 3772, 3802, 3835, 3864, 1423, 1427, 1431,
-  1437, 1443, 1449, 1454, 1470, 1480, 1488, 1508, 1529, 1542, 1561, 1583, 1607,
-  1631, 1662, 1686, 1718, 1744, 1775, 1811, 1847, 1889, 1926, 1967, 2003, 2053,
-  2099, 2140, 2194, 2240, 2297, 2345, 2404, 2460, 2514, 2578, 2643, 2696, 2764,
-  2826, 2897, 2962, 3036, 3112, 3182, 3254, 3321, 3376, 3429, 3478, 3527, 3567,
-  3611, 3652, 3693, 3731, 3764, 3794, 3825, 3853, 3882, 1490, 1496, 1500, 1504,
-  1512, 1518, 1530, 1536, 1544, 1559, 1577, 1593, 1609, 1627, 1653, 1675, 1695,
-  1730, 1754, 1784, 1815, 1844, 1884, 1913, 1956, 1995, 2038, 2073, 2122, 2161,
-  2215, 2258, 2309, 2365, 2415, 2470, 2530, 2584, 2647, 2706, 2769, 2831, 2898,
-  2959, 3033, 3106, 3170, 3252, 3312, 3367, 3423, 3471, 3518, 3563, 3605, 3644,
-  3680, 3717, 3755, 3788, 3819, 3847, 3874, 3898, 1563, 1567, 1571, 1575, 1579,
-  1587, 1597, 1603, 1617, 1625, 1643, 1666, 1680, 1696, 1725, 1742, 1765, 1798,
-  1828, 1849, 1886, 1910, 1950, 1985, 2023, 2065, 2108, 2146, 2187, 2233, 2285,
-  2328, 2379, 2423, 2487, 2540, 2592, 2657, 2713, 2771, 2837, 2902, 2963, 3034,
-  3104, 3164, 3248, 3304, 3361, 3417, 3462, 3510, 3557, 3598, 3638, 3674, 3711,
-  3743, 3776, 3811, 3839, 3868, 3892, 3917, 1635, 1637, 1639, 1641, 1647, 1660,
-  1668, 1676, 1688, 1698, 1719, 1736, 1750, 1767, 1793, 1819, 1837, 1870, 1896,
-  1924, 1957, 1989, 2020, 2057, 2093, 2134, 2171, 2219, 2254, 2305, 2350, 2402,
-  2451, 2499, 2557, 2609, 2666, 2731, 2783, 2850, 2911, 2970, 3037, 3107, 3165,
-  3246, 3301, 3359, 3410, 3458, 3508, 3551, 3589, 3632, 3670, 3705, 3737, 3770,
-  3800, 3829, 3858, 3886, 3911, 3933, 1702, 1706, 1710, 1720, 1726, 1732, 1738,
-  1748, 1761, 1773, 1789, 1805, 1829, 1841, 1864, 1892, 1908, 1940, 1968, 1997,
-  2026, 2059, 2089, 2130, 2164, 2207, 2245, 2292, 2332, 2376, 2419, 2476, 2522,
-  2575, 2624, 2681, 2739, 2794, 2857, 2920, 2977, 3045, 3113, 3171, 3249, 3302,
-  3358, 3404, 3455, 3502, 3541, 3587, 3628, 3661, 3699, 3735, 3766, 3797, 3823,
-  3851, 3876, 3903, 3927, 3950, 1777, 1785, 1787, 1791, 1799, 1803, 1809, 1825,
-  1833, 1845, 1861, 1882, 1899, 1914, 1941, 1962, 1986, 2005, 2045, 2070, 2102,
-  2135, 2166, 2201, 2236, 2282, 2316, 2366, 2407, 2456, 2495, 2546, 2595, 2651,
-  2700, 2756, 2814, 2868, 2935, 2994, 3054, 3121, 3183, 3253, 3305, 3360, 3405,
-  3453, 3498, 3539, 3585, 3622, 3658, 3697, 3728, 3760, 3792, 3821, 3849, 3872,
-  3896, 3919, 3942, 3964, 1851, 1855, 1859, 1866, 1874, 1878, 1890, 1900, 1906,
-  1922, 1934, 1958, 1972, 1993, 2010, 2041, 2061, 2080, 2120, 2147, 2175, 2210,
-  2241, 2279, 2313, 2355, 2397, 2436, 2483, 2531, 2572, 2621, 2668, 2728, 2774,
-  2828, 2881, 2946, 3007, 3065, 3132, 3195, 3255, 3313, 3362, 3411, 3456, 3499,
-  3538, 3579, 3614, 3656, 3691, 3723, 3758, 3786, 3815, 3843, 3870, 3894, 3915,
-  3937, 3956, 3975, 1930, 1932, 1942, 1946, 1948, 1960, 1964, 1975, 1987, 2001,
-  2014, 2033, 2051, 2071, 2084, 2117, 2138, 2162, 2195, 2225, 2249, 2289, 2317,
-  2359, 2392, 2427, 2477, 2510, 2560, 2602, 2654, 2693, 2747, 2802, 2854, 2916,
-  2958, 3026, 3080, 3141, 3210, 3266, 3322, 3368, 3418, 3459, 3503, 3540, 3580,
-  3613, 3654, 3688, 3721, 3752, 3782, 3813, 3841, 3865, 3890, 3913, 3935, 3954,
-  3972, 3989, 2011, 2015, 2017, 2021, 2027, 2039, 2047, 2055, 2067, 2077, 2090,
-  2112, 2128, 2152, 2168, 2196, 2223, 2243, 2269, 2303, 2333, 2369, 2400, 2433,
-  2473, 2506, 2553, 2590, 2640, 2682, 2735, 2777, 2825, 2873, 2938, 2987, 3042,
-  3102, 3159, 3224, 3280, 3332, 3377, 3424, 3463, 3509, 3542, 3586, 3615, 3655,
-  3685, 3719, 3750, 3780, 3809, 3836, 3862, 3888, 3909, 3931, 3952, 3970, 3987,
-  4003, 2094, 2096, 2100, 2106, 2110, 2118, 2124, 2136, 2148, 2156, 2172, 2190,
-  2211, 2231, 2247, 2277, 2299, 2323, 2351, 2382, 2412, 2447, 2480, 2511, 2555,
-  2588, 2629, 2673, 2718, 2759, 2811, 2861, 2912, 2955, 3013, 3069, 3128, 3179,
-  3241, 3293, 3337, 3386, 3430, 3472, 3511, 3552, 3588, 3623, 3657, 3689, 3720,
-  3749, 3778, 3805, 3833, 3860, 3884, 3907, 3929, 3948, 3968, 3985, 4001, 4016,
-  2176, 2180, 2182, 2184, 2188, 2198, 2205, 2217, 2227, 2237, 2251, 2274, 2295,
-  2310, 2334, 2356, 2380, 2408, 2430, 2466, 2491, 2532, 2563, 2596, 2633, 2670,
-  2714, 2753, 2799, 2847, 2891, 2943, 2991, 3039, 3092, 3152, 3207, 3263, 3308,
-  3354, 3398, 3440, 3479, 3519, 3558, 3590, 3629, 3659, 3692, 3722, 3751, 3779,
-  3804, 3831, 3856, 3880, 3905, 3925, 3946, 3966, 3983, 3999, 4014, 4028, 2255,
-  2259, 2263, 2267, 2275, 2283, 2293, 2301, 2311, 2324, 2338, 2360, 2374, 2394,
-  2416, 2439, 2467, 2489, 2515, 2547, 2580, 2610, 2648, 2678, 2719, 2757, 2795,
-  2841, 2878, 2930, 2973, 3027, 3071, 3130, 3172, 3230, 3283, 3329, 3372, 3415,
-  3450, 3487, 3528, 3564, 3599, 3633, 3662, 3698, 3724, 3753, 3781, 3806, 3832,
-  3855, 3878, 3901, 3923, 3944, 3962, 3981, 3997, 4012, 4026, 4039, 2340, 2342,
-  2346, 2352, 2362, 2370, 2372, 2386, 2398, 2410, 2420, 2442, 2461, 2481, 2497,
-  2525, 2550, 2576, 2600, 2630, 2664, 2688, 2736, 2765, 2805, 2844, 2876, 2922,
-  2964, 3014, 3059, 3110, 3155, 3213, 3261, 3303, 3349, 3389, 3426, 3465, 3501,
-  3537, 3568, 3606, 3639, 3671, 3700, 3729, 3759, 3783, 3810, 3834, 3857, 3879,
-  3900, 3921, 3940, 3960, 3979, 3995, 4010, 4024, 4037, 4049, 2424, 2428, 2434,
-  2440, 2444, 2457, 2463, 2471, 2485, 2493, 2507, 2535, 2543, 2565, 2586, 2611,
-  2638, 2662, 2685, 2721, 2748, 2781, 2821, 2852, 2885, 2931, 2967, 3009, 3055,
-  3099, 3148, 3198, 3244, 3289, 3333, 3369, 3400, 3444, 3482, 3517, 3550, 3583,
-  3612, 3645, 3675, 3706, 3736, 3761, 3787, 3814, 3837, 3861, 3881, 3902, 3922,
-  3939, 3958, 3977, 3993, 4008, 4022, 4035, 4047, 4058, 2517, 2519, 2523, 2533,
-  2537, 2541, 2551, 2561, 2568, 2582, 2598, 2616, 2634, 2658, 2676, 2690, 2725,
-  2749, 2775, 2808, 2838, 2866, 2905, 2944, 2975, 3017, 3057, 3096, 3138, 3187,
-  3232, 3277, 3317, 3355, 3392, 3428, 3461, 3495, 3531, 3562, 3594, 3627, 3653,
-  3681, 3712, 3738, 3767, 3793, 3816, 3842, 3863, 3885, 3906, 3924, 3941, 3959,
-  3974, 3991, 4006, 4020, 4033, 4045, 4056, 4066, 2604, 2612, 2614, 2618, 2622,
-  2626, 2644, 2652, 2660, 2674, 2686, 2707, 2729, 2743, 2766, 2785, 2815, 2842,
-  2864, 2896, 2925, 2956, 2997, 3031, 3066, 3108, 3149, 3191, 3227, 3272, 3311,
-  3346, 3382, 3420, 3448, 3485, 3514, 3544, 3576, 3608, 3635, 3666, 3694, 3718,
-  3744, 3771, 3798, 3822, 3844, 3866, 3889, 3908, 3926, 3945, 3961, 3978, 3992,
-  4005, 4018, 4031, 4043, 4054, 4064, 4073, 2697, 2701, 2703, 2709, 2715, 2723,
-  2737, 2741, 2751, 2767, 2779, 2796, 2819, 2834, 2858, 2874, 2906, 2936, 2952,
-  2988, 3019, 3050, 3084, 3125, 3156, 3202, 3235, 3275, 3309, 3341, 3378, 3406,
-  3442, 3476, 3506, 3533, 3566, 3592, 3619, 3649, 3677, 3704, 3732, 3756, 3777,
-  3801, 3824, 3850, 3871, 3891, 3910, 3930, 3947, 3963, 3980, 3994, 4007, 4019,
-  4030, 4041, 4052, 4062, 4071, 4079, 2787, 2789, 2797, 2803, 2809, 2817, 2823,
-  2832, 2848, 2862, 2870, 2888, 2913, 2932, 2948, 2971, 3000, 3028, 3051, 3074,
-  3116, 3142, 3173, 3217, 3251, 3285, 3315, 3347, 3380, 3402, 3436, 3469, 3496,
-  3525, 3556, 3584, 3610, 3637, 3663, 3690, 3714, 3740, 3765, 3789, 3812, 3830,
-  3852, 3873, 3895, 3914, 3932, 3949, 3967, 3982, 3996, 4009, 4021, 4032, 4042,
-  4051, 4060, 4069, 4077, 4084, 2882, 2886, 2892, 2894, 2900, 2914, 2918, 2926,
-  2940, 2950, 2965, 2980, 3003, 3021, 3043, 3067, 3089, 3118, 3144, 3166, 3208,
-  3238, 3269, 3295, 3327, 3356, 3384, 3412, 3438, 3467, 3491, 3521, 3548, 3574,
-  3604, 3631, 3651, 3679, 3702, 3726, 3748, 3773, 3795, 3820, 3840, 3859, 3877,
-  3897, 3916, 3936, 3953, 3969, 3984, 3998, 4011, 4023, 4034, 4044, 4053, 4061,
-  4068, 4075, 4082, 4088, 2981, 2983, 2989, 2995, 3001, 3005, 3015, 3029, 3035,
-  3047, 3061, 3075, 3097, 3122, 3136, 3162, 3188, 3218, 3242, 3267, 3291, 3319,
-  3342, 3370, 3396, 3422, 3446, 3473, 3497, 3523, 3546, 3570, 3600, 3624, 3647,
-  3673, 3696, 3716, 3742, 3763, 3785, 3803, 3826, 3848, 3869, 3887, 3904, 3920,
-  3938, 3955, 3971, 3986, 4000, 4013, 4025, 4036, 4046, 4055, 4063, 4070, 4076,
-  4081, 4086, 4091, 3077, 3081, 3085, 3087, 3093, 3103, 3114, 3126, 3134, 3150,
-  3160, 3174, 3200, 3220, 3236, 3258, 3281, 3297, 3325, 3343, 3364, 3390, 3408,
-  3434, 3457, 3483, 3507, 3529, 3554, 3572, 3596, 3617, 3641, 3669, 3686, 3710,
-  3734, 3757, 3775, 3799, 3818, 3838, 3854, 3875, 3893, 3912, 3928, 3943, 3957,
-  3973, 3988, 4002, 4015, 4027, 4038, 4048, 4057, 4065, 4072, 4078, 4083, 4087,
-  4090, 4093, 3176, 3180, 3184, 3192, 3196, 3204, 3214, 3222, 3228, 3247, 3259,
-  3273, 3287, 3299, 3323, 3335, 3357, 3374, 3394, 3416, 3432, 3452, 3477, 3489,
-  3515, 3535, 3560, 3578, 3602, 3625, 3643, 3667, 3683, 3708, 3730, 3746, 3769,
-  3791, 3808, 3828, 3846, 3867, 3883, 3899, 3918, 3934, 3951, 3965, 3976, 3990,
-  4004, 4017, 4029, 4040, 4050, 4059, 4067, 4074, 4080, 4085, 4089, 4092, 4094,
-  4095,
-};
-#endif  // CONFIG_TX64X64
+  0,    1,    5,    6,    14,   15,   27,   28,   44,   45,   65,   66,   90,
+  91,   119,  120,  152,  153,  189,  190,  230,  231,  275,  276,  324,  325,
+  377,  378,  434,  435,  495,  496,  2,    4,    7,    13,   16,   26,   29,
+  43,   46,   64,   67,   89,   92,   118,  121,  151,  154,  188,  191,  229,
+  232,  274,  277,  323,  326,  376,  379,  433,  436,  494,  497,  558,  3,
+  8,    12,   17,   25,   30,   42,   47,   63,   68,   88,   93,   117,  122,
+  150,  155,  187,  192,  228,  233,  273,  278,  322,  327,  375,  380,  432,
+  437,  493,  498,  557,  559,  9,    11,   18,   24,   31,   41,   48,   62,
+  69,   87,   94,   116,  123,  149,  156,  186,  193,  227,  234,  272,  279,
+  321,  328,  374,  381,  431,  438,  492,  499,  556,  560,  617,  10,   19,
+  23,   32,   40,   49,   61,   70,   86,   95,   115,  124,  148,  157,  185,
+  194,  226,  235,  271,  280,  320,  329,  373,  382,  430,  439,  491,  500,
+  555,  561,  616,  618,  20,   22,   33,   39,   50,   60,   71,   85,   96,
+  114,  125,  147,  158,  184,  195,  225,  236,  270,  281,  319,  330,  372,
+  383,  429,  440,  490,  501,  554,  562,  615,  619,  672,  21,   34,   38,
+  51,   59,   72,   84,   97,   113,  126,  146,  159,  183,  196,  224,  237,
+  269,  282,  318,  331,  371,  384,  428,  441,  489,  502,  553,  563,  614,
+  620,  671,  673,  35,   37,   52,   58,   73,   83,   98,   112,  127,  145,
+  160,  182,  197,  223,  238,  268,  283,  317,  332,  370,  385,  427,  442,
+  488,  503,  552,  564,  613,  621,  670,  674,  723,  36,   53,   57,   74,
+  82,   99,   111,  128,  144,  161,  181,  198,  222,  239,  267,  284,  316,
+  333,  369,  386,  426,  443,  487,  504,  551,  565,  612,  622,  669,  675,
+  722,  724,  54,   56,   75,   81,   100,  110,  129,  143,  162,  180,  199,
+  221,  240,  266,  285,  315,  334,  368,  387,  425,  444,  486,  505,  550,
+  566,  611,  623,  668,  676,  721,  725,  770,  55,   76,   80,   101,  109,
+  130,  142,  163,  179,  200,  220,  241,  265,  286,  314,  335,  367,  388,
+  424,  445,  485,  506,  549,  567,  610,  624,  667,  677,  720,  726,  769,
+  771,  77,   79,   102,  108,  131,  141,  164,  178,  201,  219,  242,  264,
+  287,  313,  336,  366,  389,  423,  446,  484,  507,  548,  568,  609,  625,
+  666,  678,  719,  727,  768,  772,  813,  78,   103,  107,  132,  140,  165,
+  177,  202,  218,  243,  263,  288,  312,  337,  365,  390,  422,  447,  483,
+  508,  547,  569,  608,  626,  665,  679,  718,  728,  767,  773,  812,  814,
+  104,  106,  133,  139,  166,  176,  203,  217,  244,  262,  289,  311,  338,
+  364,  391,  421,  448,  482,  509,  546,  570,  607,  627,  664,  680,  717,
+  729,  766,  774,  811,  815,  852,  105,  134,  138,  167,  175,  204,  216,
+  245,  261,  290,  310,  339,  363,  392,  420,  449,  481,  510,  545,  571,
+  606,  628,  663,  681,  716,  730,  765,  775,  810,  816,  851,  853,  135,
+  137,  168,  174,  205,  215,  246,  260,  291,  309,  340,  362,  393,  419,
+  450,  480,  511,  544,  572,  605,  629,  662,  682,  715,  731,  764,  776,
+  809,  817,  850,  854,  887,  136,  169,  173,  206,  214,  247,  259,  292,
+  308,  341,  361,  394,  418,  451,  479,  512,  543,  573,  604,  630,  661,
+  683,  714,  732,  763,  777,  808,  818,  849,  855,  886,  888,  170,  172,
+  207,  213,  248,  258,  293,  307,  342,  360,  395,  417,  452,  478,  513,
+  542,  574,  603,  631,  660,  684,  713,  733,  762,  778,  807,  819,  848,
+  856,  885,  889,  918,  171,  208,  212,  249,  257,  294,  306,  343,  359,
+  396,  416,  453,  477,  514,  541,  575,  602,  632,  659,  685,  712,  734,
+  761,  779,  806,  820,  847,  857,  884,  890,  917,  919,  209,  211,  250,
+  256,  295,  305,  344,  358,  397,  415,  454,  476,  515,  540,  576,  601,
+  633,  658,  686,  711,  735,  760,  780,  805,  821,  846,  858,  883,  891,
+  916,  920,  945,  210,  251,  255,  296,  304,  345,  357,  398,  414,  455,
+  475,  516,  539,  577,  600,  634,  657,  687,  710,  736,  759,  781,  804,
+  822,  845,  859,  882,  892,  915,  921,  944,  946,  252,  254,  297,  303,
+  346,  356,  399,  413,  456,  474,  517,  538,  578,  599,  635,  656,  688,
+  709,  737,  758,  782,  803,  823,  844,  860,  881,  893,  914,  922,  943,
+  947,  968,  253,  298,  302,  347,  355,  400,  412,  457,  473,  518,  537,
+  579,  598,  636,  655,  689,  708,  738,  757,  783,  802,  824,  843,  861,
+  880,  894,  913,  923,  942,  948,  967,  969,  299,  301,  348,  354,  401,
+  411,  458,  472,  519,  536,  580,  597,  637,  654,  690,  707,  739,  756,
+  784,  801,  825,  842,  862,  879,  895,  912,  924,  941,  949,  966,  970,
+  987,  300,  349,  353,  402,  410,  459,  471,  520,  535,  581,  596,  638,
+  653,  691,  706,  740,  755,  785,  800,  826,  841,  863,  878,  896,  911,
+  925,  940,  950,  965,  971,  986,  988,  350,  352,  403,  409,  460,  470,
+  521,  534,  582,  595,  639,  652,  692,  705,  741,  754,  786,  799,  827,
+  840,  864,  877,  897,  910,  926,  939,  951,  964,  972,  985,  989,  1002,
+  351,  404,  408,  461,  469,  522,  533,  583,  594,  640,  651,  693,  704,
+  742,  753,  787,  798,  828,  839,  865,  876,  898,  909,  927,  938,  952,
+  963,  973,  984,  990,  1001, 1003, 405,  407,  462,  468,  523,  532,  584,
+  593,  641,  650,  694,  703,  743,  752,  788,  797,  829,  838,  866,  875,
+  899,  908,  928,  937,  953,  962,  974,  983,  991,  1000, 1004, 1013, 406,
+  463,  467,  524,  531,  585,  592,  642,  649,  695,  702,  744,  751,  789,
+  796,  830,  837,  867,  874,  900,  907,  929,  936,  954,  961,  975,  982,
+  992,  999,  1005, 1012, 1014, 464,  466,  525,  530,  586,  591,  643,  648,
+  696,  701,  745,  750,  790,  795,  831,  836,  868,  873,  901,  906,  930,
+  935,  955,  960,  976,  981,  993,  998,  1006, 1011, 1015, 1020, 465,  526,
+  529,  587,  590,  644,  647,  697,  700,  746,  749,  791,  794,  832,  835,
+  869,  872,  902,  905,  931,  934,  956,  959,  977,  980,  994,  997,  1007,
+  1010, 1016, 1019, 1021, 527,  528,  588,  589,  645,  646,  698,  699,  747,
+  748,  792,  793,  833,  834,  870,  871,  903,  904,  932,  933,  957,  958,
+  978,  979,  995,  996,  1008, 1009, 1017, 1018, 1022, 1023
+};
 
 const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = {
-#if CONFIG_CHROMA_2X2
-  { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
-#endif
   { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
   { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
   { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors },
   { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
-#if CONFIG_TX64X64
-  { default_scan_64x64, av1_default_iscan_64x64, default_scan_64x64_neighbors },
-#endif  // CONFIG_TX64X64
+  // Half of the coefficients of tx64 at higher frequencies are set to
+  // zeros. So tx32's scan order is used.
+  { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
 };
 
-const SCAN_ORDER av1_intra_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
-#if CONFIG_CHROMA_2X2
-  {
-      // TX_2X2
-      { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
-      { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
-      { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
-      { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
-      { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
-      { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
-      { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
-      { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
-      { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
-      { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-#endif
+const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
   {
       // TX_4X4
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
-      { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
-      { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
-      { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
-      { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
-      { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
-      { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
-      { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-  {
-      // TX_8X8
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
-      { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
-      { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
-      { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
-      { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
-      { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
-      { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
-      { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-  {
-      // TX_16X16
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
-      { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { default_scan_16x16, av1_default_iscan_16x16,
-        default_scan_16x16_neighbors },
-      { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
-      { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
-      { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
-      { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
-      { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
-      { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
-      { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-  {
-      // TX_32X32
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
-      { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
-      { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
-#if CONFIG_EXT_TX
-      { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
-      { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
-      { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
-      { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
-      { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
-      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-#if CONFIG_TX64X64
-  {
-      // TX_64X64
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-#endif  // CONFIG_TX64X64
-  {
-      // TX_4X8
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
-      { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
-      { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
-      { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
-      { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
-      { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
-      { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
-      { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-  {
-      // TX_8X4
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
-      { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
-      { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
-      { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
-      { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
-      { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
-      { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
-      { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-  {
-      // TX_8X16
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
-      { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { default_scan_8x16, av1_default_iscan_8x16,
-        default_scan_8x16_neighbors },
-      { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
-      { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
-      { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
-      { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
-      { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
-      { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
-      { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-  {
-      // TX_16X8
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
-      { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { default_scan_16x8, av1_default_iscan_16x8,
-        default_scan_16x8_neighbors },
-      { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
-      { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
-      { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
-      { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
-      { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
-      { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
-      { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-  {
-      // TX_16X32
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
-      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { default_scan_16x32, av1_default_iscan_16x32,
-        default_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
-      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
-      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
-      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-  {
-      // TX_32X16
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
-      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { default_scan_32x16, av1_default_iscan_32x16,
-        default_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
-      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
-      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
-      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-#if CONFIG_TX64X64
-  {
-      // TX_32X64
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-  {
-      // TX_64X32
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-#endif  // CONFIG_EXT_TX
-  }
-#endif  // CONFIG_TX64X64
-};
-
-const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
-#if CONFIG_CHROMA_2X2
-  {
-      // TX_2X2
-      { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
-      { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
-      { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
-      { default_scan_2x2, av1_default_iscan_2x2, default_scan_2x2_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
-      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
-      { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
-      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
-      { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
-      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
-      { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
-#endif  // CONFIG_EXT_TX
-  },
-#endif
-  {
-      // TX_4X4
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
-      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
       { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
       { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
       { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
       { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
       { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
-#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X8
@@ -7453,20 +3236,18 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-      { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+      { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
       { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
       { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
       { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
       { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
       { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
-#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X16
@@ -7478,7 +3259,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_16x16_neighbors },
       { default_scan_16x16, av1_default_iscan_16x16,
         default_scan_16x16_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_16x16, av1_default_iscan_16x16,
         default_scan_16x16_neighbors },
       { default_scan_16x16, av1_default_iscan_16x16,
@@ -7489,96 +3269,93 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_16x16_neighbors },
       { default_scan_16x16, av1_default_iscan_16x16,
         default_scan_16x16_neighbors },
-      { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+      { default_scan_16x16, av1_default_iscan_16x16,
+        default_scan_16x16_neighbors },
       { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
       { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
       { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
       { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
       { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
       { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
-#endif  // CONFIG_EXT_TX
   },
   {
       // TX_32X32
       { default_scan_32x32, av1_default_iscan_32x32,
         default_scan_32x32_neighbors },
-      { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
-      { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
-      { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
-#if CONFIG_EXT_TX
-      { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
-      { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
-      { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
-      { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
-      { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
-      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
       { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
       { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
       { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
       { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
       { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
       { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
-#endif  // CONFIG_EXT_TX
   },
-#if CONFIG_TX64X64
   {
       // TX_64X64
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-      { default_scan_64x64, av1_default_iscan_64x64,
-        default_scan_64x64_neighbors },
-#endif  // CONFIG_EXT_TX
+      // Half of the coefficients of tx64 at higher frequencies are set to
+      // zeros. So tx32's scan order is used.
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
   },
-#endif  // CONFIG_TX64X64
   {
       // TX_4X8
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
-      { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+      { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
       { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
       { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
       { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
       { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
       { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
-#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X4
@@ -7586,20 +3363,18 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
-      { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+      { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
       { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
       { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
       { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
       { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
       { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
-#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X16
@@ -7611,7 +3386,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_8x16_neighbors },
       { default_scan_8x16, av1_default_iscan_8x16,
         default_scan_8x16_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_8x16, av1_default_iscan_8x16,
         default_scan_8x16_neighbors },
       { default_scan_8x16, av1_default_iscan_8x16,
@@ -7622,14 +3396,14 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_8x16_neighbors },
       { default_scan_8x16, av1_default_iscan_8x16,
         default_scan_8x16_neighbors },
-      { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+      { default_scan_8x16, av1_default_iscan_8x16,
+        default_scan_8x16_neighbors },
       { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
       { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
       { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
       { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
       { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
       { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
-#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X8
@@ -7641,7 +3415,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_16x8_neighbors },
       { default_scan_16x8, av1_default_iscan_16x8,
         default_scan_16x8_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_16x8, av1_default_iscan_16x8,
         default_scan_16x8_neighbors },
       { default_scan_16x8, av1_default_iscan_16x8,
@@ -7652,14 +3425,14 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_16x8_neighbors },
       { default_scan_16x8, av1_default_iscan_16x8,
         default_scan_16x8_neighbors },
-      { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+      { default_scan_16x8, av1_default_iscan_16x8,
+        default_scan_16x8_neighbors },
       { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
       { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
       { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
       { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
       { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
       { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
-#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X32
@@ -7671,7 +3444,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_16x32_neighbors },
       { default_scan_16x32, av1_default_iscan_16x32,
         default_scan_16x32_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_16x32, av1_default_iscan_16x32,
         default_scan_16x32_neighbors },
       { default_scan_16x32, av1_default_iscan_16x32,
@@ -7682,14 +3454,14 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_16x32_neighbors },
       { default_scan_16x32, av1_default_iscan_16x32,
         default_scan_16x32_neighbors },
-      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32,
+        default_scan_16x32_neighbors },
       { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
       { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
       { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
       { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
       { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
       { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
-#endif  // CONFIG_EXT_TX
   },
   {
       // TX_32X16
@@ -7701,7 +3473,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_32x16_neighbors },
       { default_scan_32x16, av1_default_iscan_32x16,
         default_scan_32x16_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_32x16, av1_default_iscan_32x16,
         default_scan_32x16_neighbors },
       { default_scan_32x16, av1_default_iscan_32x16,
@@ -7712,91 +3483,77 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_32x16_neighbors },
       { default_scan_32x16, av1_default_iscan_32x16,
         default_scan_32x16_neighbors },
-      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16,
+        default_scan_32x16_neighbors },
       { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
       { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
       { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
       { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
       { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
       { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
-#endif  // CONFIG_EXT_TX
   },
-#if CONFIG_TX64X64
   {
       // TX_32X64
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-      { default_scan_32x64, av1_default_iscan_32x64,
-        default_scan_32x64_neighbors },
-#endif  // CONFIG_EXT_TX
+      // Half of the coefficients of tx64 at higher frequencies are set to
+      // zeros. So tx32's scan order is used.
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
   },
   {
       // TX_64X32
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-#if CONFIG_EXT_TX
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-      { default_scan_64x32, av1_default_iscan_64x32,
-        default_scan_64x32_neighbors },
-#endif  // CONFIG_EXT_TX
+      // Half of the coefficients of tx64 at higher frequencies are set to
+      // zeros. So tx32's scan order is used.
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { default_scan_32x32, av1_default_iscan_32x32,
+        default_scan_32x32_neighbors },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+      { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+      { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
   },
-#endif  // CONFIG_TX64X64
   {
       // TX_4X16
       { default_scan_4x16, av1_default_iscan_4x16,
@@ -7807,7 +3564,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_4x16_neighbors },
       { default_scan_4x16, av1_default_iscan_4x16,
         default_scan_4x16_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_4x16, av1_default_iscan_4x16,
         default_scan_4x16_neighbors },
       { default_scan_4x16, av1_default_iscan_4x16,
@@ -7818,14 +3574,14 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_4x16_neighbors },
       { default_scan_4x16, av1_default_iscan_4x16,
         default_scan_4x16_neighbors },
-      { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
+      { default_scan_4x16, av1_default_iscan_4x16,
+        default_scan_4x16_neighbors },
       { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
       { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
       { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
       { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
       { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
       { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
-#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X4
@@ -7837,7 +3593,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_16x4_neighbors },
       { default_scan_16x4, av1_default_iscan_16x4,
         default_scan_16x4_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_16x4, av1_default_iscan_16x4,
         default_scan_16x4_neighbors },
       { default_scan_16x4, av1_default_iscan_16x4,
@@ -7848,14 +3603,14 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_16x4_neighbors },
       { default_scan_16x4, av1_default_iscan_16x4,
         default_scan_16x4_neighbors },
-      { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
+      { default_scan_16x4, av1_default_iscan_16x4,
+        default_scan_16x4_neighbors },
       { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
       { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
       { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
       { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
       { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
       { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
-#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X32
@@ -7867,7 +3622,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_8x32_neighbors },
       { default_scan_8x32, av1_default_iscan_8x32,
         default_scan_8x32_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_8x32, av1_default_iscan_8x32,
         default_scan_8x32_neighbors },
       { default_scan_8x32, av1_default_iscan_8x32,
@@ -7878,14 +3632,14 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_8x32_neighbors },
       { default_scan_8x32, av1_default_iscan_8x32,
         default_scan_8x32_neighbors },
-      { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
+      { default_scan_8x32, av1_default_iscan_8x32,
+        default_scan_8x32_neighbors },
       { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
       { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
       { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
       { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
       { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
       { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
-#endif  // CONFIG_EXT_TX
   },
   {
       // TX_32X8
@@ -7897,7 +3651,6 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_32x8_neighbors },
       { default_scan_32x8, av1_default_iscan_32x8,
         default_scan_32x8_neighbors },
-#if CONFIG_EXT_TX
       { default_scan_32x8, av1_default_iscan_32x8,
         default_scan_32x8_neighbors },
       { default_scan_32x8, av1_default_iscan_32x8,
@@ -7908,679 +3661,75 @@ const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
         default_scan_32x8_neighbors },
       { default_scan_32x8, av1_default_iscan_32x8,
         default_scan_32x8_neighbors },
-      { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
+      { default_scan_32x8, av1_default_iscan_32x8,
+        default_scan_32x8_neighbors },
       { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
       { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
       { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
       { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
       { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
       { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
-#endif  // CONFIG_EXT_TX
+  },
+  {
+      // TX_16X64
+      // Half of the coefficients of tx64 at higher frequencies are set to
+      // zeros. So tx32's scan order is used.
+      { default_scan_16x32, av1_default_iscan_16x32,
+        default_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32,
+        default_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32,
+        default_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32,
+        default_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32,
+        default_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32,
+        default_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32,
+        default_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32,
+        default_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32,
+        default_scan_16x32_neighbors },
+      { default_scan_16x32, av1_default_iscan_16x32,
+        default_scan_16x32_neighbors },
+      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+      { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+      { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+  },
+  {
+      // TX_64X16
+      // Half of the coefficients of tx64 at higher frequencies are set to
+      // zeros. So tx32's scan order is used.
+      { default_scan_32x16, av1_default_iscan_32x16,
+        default_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16,
+        default_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16,
+        default_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16,
+        default_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16,
+        default_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16,
+        default_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16,
+        default_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16,
+        default_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16,
+        default_scan_32x16_neighbors },
+      { default_scan_32x16, av1_default_iscan_32x16,
+        default_scan_32x16_neighbors },
+      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+      { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+      { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
   },
 };
-
-#if CONFIG_ADAPT_SCAN
-// TX_32X32 will has 1024 coefficients whose indexes can be represented in 10
-// bits
-#define COEFF_IDX_BITS (10 + CONFIG_TX64X64)
-#define COEFF_IDX_SIZE (1 << COEFF_IDX_BITS)
-#define COEFF_IDX_MASK (COEFF_IDX_SIZE - 1)
-
-static uint32_t *get_non_zero_prob(FRAME_CONTEXT *fc, TX_SIZE tx_size,
-                                   TX_TYPE tx_type) {
-  switch (tx_size) {
-#if CONFIG_CHROMA_2X2
-    case TX_2X2: return fc->non_zero_prob_2x2[tx_type];
-#endif
-    case TX_4X4: return fc->non_zero_prob_4X4[tx_type];
-    case TX_8X8: return fc->non_zero_prob_8X8[tx_type];
-    case TX_16X16: return fc->non_zero_prob_16X16[tx_type];
-    case TX_32X32: return fc->non_zero_prob_32X32[tx_type];
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    case TX_4X8: return fc->non_zero_prob_4X8[tx_type];
-    case TX_8X4: return fc->non_zero_prob_8X4[tx_type];
-    case TX_8X16: return fc->non_zero_prob_8X16[tx_type];
-    case TX_16X8: return fc->non_zero_prob_16X8[tx_type];
-    case TX_16X32: return fc->non_zero_prob_16X32[tx_type];
-    case TX_32X16: return fc->non_zero_prob_32X16[tx_type];
-#endif  // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    default: assert(0); return NULL;
-  }
-}
-
-static int16_t *get_adapt_scan(FRAME_CONTEXT *fc, TX_SIZE tx_size,
-                               TX_TYPE tx_type) {
-  switch (tx_size) {
-#if CONFIG_CHROMA_2X2
-    case TX_2X2: return fc->scan_2x2[tx_type];
-#endif
-    case TX_4X4: return fc->scan_4X4[tx_type];
-    case TX_8X8: return fc->scan_8X8[tx_type];
-    case TX_16X16: return fc->scan_16X16[tx_type];
-    case TX_32X32: return fc->scan_32X32[tx_type];
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    case TX_4X8: return fc->scan_4X8[tx_type];
-    case TX_8X4: return fc->scan_8X4[tx_type];
-    case TX_8X16: return fc->scan_8X16[tx_type];
-    case TX_16X8: return fc->scan_16X8[tx_type];
-    case TX_16X32: return fc->scan_16X32[tx_type];
-    case TX_32X16: return fc->scan_32X16[tx_type];
-#endif  // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    default: assert(0); return NULL;
-  }
-}
-
-static int16_t *get_adapt_iscan(FRAME_CONTEXT *fc, TX_SIZE tx_size,
-                                TX_TYPE tx_type) {
-  switch (tx_size) {
-#if CONFIG_CHROMA_2X2
-    case TX_2X2: return fc->iscan_2x2[tx_type];
-#endif
-    case TX_4X4: return fc->iscan_4X4[tx_type];
-    case TX_8X8: return fc->iscan_8X8[tx_type];
-    case TX_16X16: return fc->iscan_16X16[tx_type];
-    case TX_32X32: return fc->iscan_32X32[tx_type];
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    case TX_4X8: return fc->iscan_4X8[tx_type];
-    case TX_8X4: return fc->iscan_8X4[tx_type];
-    case TX_8X16: return fc->iscan_8X16[tx_type];
-    case TX_16X8: return fc->iscan_16X8[tx_type];
-    case TX_16X32: return fc->iscan_16X32[tx_type];
-    case TX_32X16: return fc->iscan_32X16[tx_type];
-#endif  // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    default: assert(0); return NULL;
-  }
-}
-
-static int16_t *get_adapt_nb(FRAME_CONTEXT *fc, TX_SIZE tx_size,
-                             TX_TYPE tx_type) {
-  switch (tx_size) {
-#if CONFIG_CHROMA_2X2
-    case TX_2X2: return fc->nb_2x2[tx_type];
-#endif
-    case TX_4X4: return fc->nb_4X4[tx_type];
-    case TX_8X8: return fc->nb_8X8[tx_type];
-    case TX_16X16: return fc->nb_16X16[tx_type];
-    case TX_32X32: return fc->nb_32X32[tx_type];
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    case TX_4X8: return fc->nb_4X8[tx_type];
-    case TX_8X4: return fc->nb_8X4[tx_type];
-    case TX_8X16: return fc->nb_8X16[tx_type];
-    case TX_16X8: return fc->nb_16X8[tx_type];
-    case TX_16X32: return fc->nb_16X32[tx_type];
-    case TX_32X16: return fc->nb_32X16[tx_type];
-#endif  // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    default: assert(0); return NULL;
-  }
-}
-
-static uint32_t *get_non_zero_counts(FRAME_COUNTS *counts, TX_SIZE tx_size,
-                                     TX_TYPE tx_type) {
-  switch (tx_size) {
-#if CONFIG_CHROMA_2X2
-    case TX_2X2: return counts->non_zero_count_2x2[tx_type];
-#endif
-    case TX_4X4: return counts->non_zero_count_4X4[tx_type];
-    case TX_8X8: return counts->non_zero_count_8X8[tx_type];
-    case TX_16X16: return counts->non_zero_count_16X16[tx_type];
-    case TX_32X32: return counts->non_zero_count_32X32[tx_type];
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    case TX_4X8: return counts->non_zero_count_4x8[tx_type];
-    case TX_8X4: return counts->non_zero_count_8x4[tx_type];
-    case TX_8X16: return counts->non_zero_count_8x16[tx_type];
-    case TX_16X8: return counts->non_zero_count_16x8[tx_type];
-    case TX_16X32: return counts->non_zero_count_16x32[tx_type];
-    case TX_32X16: return counts->non_zero_count_32x16[tx_type];
-#endif  // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    default: assert(0); return NULL;
-  }
-}
-
-static INLINE int clamp_64(int64_t value, int low, int high) {
-  return value < low ? low : (value > high ? high : (int)value);
-}
-
-#if USE_2X2_PROB
-static int do_down_sample(TX_SIZE tx_size) {
-  const int tx_w = tx_size_wide[tx_size];
-  const int tx_h = tx_size_high[tx_size];
-  if (tx_w > 8 || tx_h > 8) {
-    return 1;
-  } else {
-    return 0;
-  }
-}
-
-void av1_down_sample_scan_count(uint32_t *non_zero_count_ds,
-                                const uint32_t *non_zero_count,
-                                TX_SIZE tx_size) {
-  const int tx_w = tx_size_wide[tx_size];
-  const int tx_h = tx_size_high[tx_size];
-  if (tx_w > 8 && tx_h > 8) {
-    const int tx_w_ds = tx_w >> 1;
-    const int tx_h_ds = tx_h >> 1;
-    for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) {
-      for (int c_ds = 0; c_ds < tx_w_ds; ++c_ds) {
-        const int ci_ds = r_ds * tx_w_ds + c_ds;
-        const int r = r_ds << 1;
-        const int c = c_ds << 1;
-        const int ci = r * tx_w + c;
-        non_zero_count_ds[ci_ds] = non_zero_count[ci];
-      }
-    }
-  } else if (tx_w > 8 && tx_h <= 8) {
-    const int tx_w_ds = tx_w >> 1;
-    const int tx_h_ds = tx_h;
-    for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) {
-      for (int c_ds = 0; c_ds < tx_w_ds; ++c_ds) {
-        const int ci_ds = r_ds * tx_w_ds + c_ds;
-        const int r = r_ds;
-        const int c = c_ds << 1;
-        const int ci = r * tx_w + c;
-        non_zero_count_ds[ci_ds] = non_zero_count[ci];
-      }
-    }
-  } else if (tx_w <= 8 && tx_h > 8) {
-    const int tx_w_ds = tx_w;
-    const int tx_h_ds = tx_h >> 1;
-    for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) {
-      for (int c_ds = 0; c_ds < tx_w_ds; ++c_ds) {
-        const int ci_ds = r_ds * tx_w_ds + c_ds;
-        const int r = r_ds << 1;
-        const int c = c_ds;
-        const int ci = r * tx_w + c;
-        non_zero_count_ds[ci_ds] = non_zero_count[ci];
-      }
-    }
-  } else {
-    assert(0);
-  }
-}
-
-void av1_up_sample_scan_count(uint32_t *non_zero_count,
-                              const uint32_t *non_zero_count_ds,
-                              TX_SIZE tx_size, unsigned int block_num) {
-  const int tx_w = tx_size_wide[tx_size];
-  const int tx_h = tx_size_high[tx_size];
-  if (tx_w > 8 && tx_h > 8) {
-    const int tx_w_ds = tx_w >> 1;
-    const int tx_h_ds = tx_h >> 1;
-    for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) {
-      for (int c_ds = 0; c_ds < tx_w_ds; ++c_ds) {
-        const int ci_ds = r_ds * tx_w_ds + c_ds;
-        const int r = r_ds << 1;
-        const int c = c_ds << 1;
-        const int ci = r * tx_w + c;
-        non_zero_count[ci] = non_zero_count_ds[ci_ds];
-        if (c_ds + 1 < tx_w_ds) {
-          uint32_t count =
-              non_zero_count_ds[ci_ds] + non_zero_count_ds[ci_ds + 1];
-          count = ROUND_POWER_OF_TWO(count, 1);
-          count = clamp32u(count, 0, block_num);
-          non_zero_count[ci + 1] = count;
-        } else {
-          non_zero_count[ci + 1] = non_zero_count_ds[ci_ds];
-        }
-      }
-    }
-    for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) {
-      for (int c = 0; c < tx_w; ++c) {
-        const int r = r_ds << 1;
-        const int ci = r * tx_w + c;
-        if (r + 2 < tx_h) {
-          uint32_t count = non_zero_count[ci] + non_zero_count[ci + 2 * tx_w];
-          count = ROUND_POWER_OF_TWO(count, 1);
-          count = clamp32u(count, 0, block_num);
-          non_zero_count[ci + tx_w] = count;
-        } else {
-          non_zero_count[ci + tx_w] = non_zero_count[ci];
-        }
-      }
-    }
-  } else if (tx_w > 8 && tx_h <= 8) {
-    const int tx_w_ds = tx_w >> 1;
-    const int tx_h_ds = tx_h;
-    for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) {
-      for (int c_ds = 0; c_ds < tx_w_ds; ++c_ds) {
-        const int ci_ds = r_ds * tx_w_ds + c_ds;
-        const int r = r_ds;
-        const int c = c_ds << 1;
-        const int ci = r * tx_w + c;
-        non_zero_count[ci] = non_zero_count_ds[ci_ds];
-        if (c_ds + 1 < tx_w_ds) {
-          uint32_t count =
-              non_zero_count_ds[ci_ds] + non_zero_count_ds[ci_ds + 1];
-          count = ROUND_POWER_OF_TWO(count, 1);
-          count = clamp32u(count, 0, block_num);
-          non_zero_count[ci + 1] = count;
-        } else {
-          non_zero_count[ci + 1] = non_zero_count_ds[ci_ds];
-        }
-      }
-    }
-  } else if (tx_w <= 8 && tx_h > 8) {
-    const int tx_w_ds = tx_w;
-    const int tx_h_ds = tx_h >> 1;
-    for (int r_ds = 0; r_ds < tx_h_ds; ++r_ds) {
-      for (int c_ds = 0; c_ds < tx_w_ds; ++c_ds) {
-        const int ci_ds = r_ds * tx_w_ds + c_ds;
-        const int r = r_ds << 1;
-        const int c = c_ds;
-        const int ci = r * tx_w + c;
-        non_zero_count[ci] = non_zero_count_ds[ci_ds];
-        if (r_ds + 1 < tx_h_ds) {
-          uint32_t count =
-              non_zero_count_ds[ci_ds] + non_zero_count_ds[ci_ds + tx_w_ds];
-          count = ROUND_POWER_OF_TWO(count, 1);
-          count = clamp32u(count, 0, block_num);
-          non_zero_count[ci + tx_w] = count;
-        } else {
-          non_zero_count[ci + tx_w] = non_zero_count_ds[ci_ds];
-        }
-      }
-    }
-  } else {
-    assert(0);
-  }
-}
-#endif
-
-static void update_scan_prob(AV1_COMMON *cm, TX_SIZE tx_size, TX_TYPE tx_type,
-                             int rate) {
-  FRAME_CONTEXT *pre_fc = cm->pre_fc;
-  uint32_t *prev_non_zero_prob = get_non_zero_prob(pre_fc, tx_size, tx_type);
-  uint32_t *non_zero_prob = get_non_zero_prob(cm->fc, tx_size, tx_type);
-  uint32_t *non_zero_count = get_non_zero_counts(&cm->counts, tx_size, tx_type);
-  const int tx2d_size = tx_size_2d[tx_size];
-  unsigned int block_num = cm->counts.txb_count[tx_size][tx_type];
-#if USE_2X2_PROB
-#if CONFIG_TX64X64
-  DECLARE_ALIGNED(16, uint32_t, non_zero_count_ds[1024]);
-  assert((tx2d_size >> 2) <= 1024);
-#else   // CONFIG_TX64X64
-  DECLARE_ALIGNED(16, uint32_t, non_zero_count_ds[256]);
-  assert((tx2d_size >> 2) <= 256);
-#endif  // CONFIG_TX64X64
-  if (do_down_sample(tx_size)) {
-    av1_down_sample_scan_count(non_zero_count_ds, non_zero_count, tx_size);
-    av1_up_sample_scan_count(non_zero_count, non_zero_count_ds, tx_size,
-                             block_num);
-  }
-#endif
-  int i;
-  const int inv_precision = 30;
-  int32_t inv_block_num = block_num == 0 ? 0 : (1 << inv_precision) / block_num;
-  for (i = 0; i < tx2d_size; i++) {
-    int64_t curr_prob =
-        block_num == 0 ? 0 : ((non_zero_count[i] * inv_block_num) >>
-                              (inv_precision - ADAPT_SCAN_PROB_PRECISION));
-    int64_t prev_prob = prev_non_zero_prob[i];
-    int64_t pred_prob =
-        (curr_prob * rate +
-         prev_prob * ((1 << ADAPT_SCAN_PROB_PRECISION) - rate)) >>
-        ADAPT_SCAN_PROB_PRECISION;
-    // TODO(angiebird): reduce the bit usage of probabilities and remove
-    // clamp_64()
-    non_zero_prob[i] =
-        clamp_64(pred_prob, 0, (1 << ADAPT_SCAN_PROB_PRECISION) - 1);
-  }
-}
-
-static void update_scan_count(int16_t *scan, int max_scan,
-                              const tran_low_t *dqcoeffs,
-                              uint32_t *non_zero_count) {
-  int i;
-  for (i = 0; i < max_scan; ++i) {
-    int coeff_idx = scan[i];
-    non_zero_count[coeff_idx] += (dqcoeffs[coeff_idx] != 0);
-  }
-}
-
-void av1_update_scan_count_facade(AV1_COMMON *cm, FRAME_COUNTS *counts,
-                                  TX_SIZE tx_size, TX_TYPE tx_type,
-                                  const tran_low_t *dqcoeffs, int max_scan) {
-  if (cm->use_adapt_scan && do_adapt_scan(tx_size, tx_type)) {
-    int16_t *scan = get_adapt_scan(cm->fc, tx_size, tx_type);
-    uint32_t *non_zero_count = get_non_zero_counts(counts, tx_size, tx_type);
-    update_scan_count(scan, max_scan, dqcoeffs, non_zero_count);
-    ++counts->txb_count[tx_size][tx_type];
-  }
-}
-
-static int cmp_prob(const void *a, const void *b) {
-  return *(const uint32_t *)b > *(const uint32_t *)a ? 1 : -1;
-}
-
-void av1_augment_prob(TX_SIZE tx_size, TX_TYPE tx_type, uint32_t *prob) {
-  // TODO(angiebird): check if we need is_inter here
-  const SCAN_ORDER *sc = get_default_scan(tx_size, tx_type, 0);
-  const int tx1d_wide = tx_size_wide[tx_size];
-  const int tx1d_high = tx_size_high[tx_size];
-  int r, c;
-  for (r = 0; r < tx1d_high; r++) {
-    for (c = 0; c < tx1d_wide; c++) {
-      const int idx = r * tx1d_wide + c;
-      const uint32_t mask_16 = ((1 << 16) - 1);
-      const uint32_t tie_breaker = ~((uint32_t)sc->iscan[idx]);
-      // prob[idx]: 16 bits  dummy: 6 bits  scan_idx: 10 bits
-      prob[idx] = (prob[idx] << 16) | (mask_16 & tie_breaker);
-    }
-  }
-}
-
-void av1_update_neighbors(TX_SIZE tx_size, const int16_t *scan,
-                          const int16_t *iscan, int16_t *neighbors) {
-  const int tx1d_wide = tx_size_wide[tx_size];
-  const int tx1d_high = tx_size_high[tx_size];
-  const int tx2d_size = tx_size_2d[tx_size];
-  int scan_idx;
-  for (scan_idx = 0; scan_idx < tx2d_size; ++scan_idx) {
-    const int coeff_idx = scan[scan_idx];
-    const int r = coeff_idx / tx1d_wide;
-    const int c = coeff_idx % tx1d_wide;
-    const int nb_offset_r[5] = { -1, 0, -1, -1, 1 };
-    const int nb_offset_c[5] = { 0, -1, -1, 1, -1 };
-    const int nb_num = 5;
-    int nb_count = 0;
-    int nb_idx;
-
-    for (nb_idx = 0; nb_idx < nb_num; ++nb_idx) {
-      if (nb_count < 2) {
-        int nb_r = r + nb_offset_r[nb_idx];
-        int nb_c = c + nb_offset_c[nb_idx];
-        int nb_coeff_idx = nb_r * tx1d_wide + nb_c;
-        int valid_pos =
-            nb_r >= 0 && nb_r < tx1d_high && nb_c >= 0 && nb_c < tx1d_wide;
-        if (valid_pos && iscan[nb_coeff_idx] < scan_idx) {
-          neighbors[scan_idx * MAX_NEIGHBORS + nb_count] = nb_coeff_idx;
-          ++nb_count;
-        }
-      } else {
-        break;
-      }
-    }
-
-    if (nb_count == 1) {
-      neighbors[scan_idx * MAX_NEIGHBORS + 1] =
-          neighbors[scan_idx * MAX_NEIGHBORS + 0];
-    } else if (nb_count == 0) {
-      neighbors[scan_idx * MAX_NEIGHBORS + 0] = scan[0];
-      neighbors[scan_idx * MAX_NEIGHBORS + 1] = scan[0];
-    }
-  }
-  neighbors[tx2d_size * MAX_NEIGHBORS + 0] = scan[0];
-  neighbors[tx2d_size * MAX_NEIGHBORS + 1] = scan[0];
-}
-
-#if USE_LIMIT_SCAN_DISTANCE
-typedef struct SCAN_NB_QUEUE {
-  int nb_ci_queue[COEFF_IDX_SIZE + 1];
-  int pr_si_queue[COEFF_IDX_SIZE + 1];
-  int size;
-  int start;
-  int end;
-} SCAN_NB_QUEUE;
-
-static void assign_scan_idx(int16_t coeff_idx, int16_t *scan_idx, int tx_width,
-                            int tx_height, int16_t *scan, int16_t *iscan,
-                            int16_t *visit, SCAN_NB_QUEUE *queue) {
-  if (visit[coeff_idx] != 2) {
-    assert(*scan_idx < tx_width * tx_height);
-    scan[*scan_idx] = coeff_idx;
-    iscan[coeff_idx] = *scan_idx;
-    visit[coeff_idx] = 2;
-    int row = coeff_idx / tx_width;
-    int col = coeff_idx % tx_width;
-    int right_ci = coeff_idx + 1;
-    if (col + 1 < tx_width && visit[right_ci] == 0) {
-      visit[right_ci] = 1;
-      queue->pr_si_queue[queue->end] = *scan_idx;
-      queue->nb_ci_queue[queue->end] = right_ci;
-      queue->end = (queue->end + 1) % queue->size;
-    }
-    int down_ci = coeff_idx + tx_width;
-    if (row + 1 < tx_height && visit[down_ci] == 0) {
-      visit[down_ci] = 1;
-      queue->pr_si_queue[queue->end] = *scan_idx;
-      queue->nb_ci_queue[queue->end] = down_ci;
-      queue->end = (queue->end + 1) % queue->size;
-    }
-    ++(*scan_idx);
-  }
-}
-static void limit_nb_scan_distance(TX_SIZE tx_size, int16_t *scan,
-                                   int16_t *iscan) {
-  const int tx2d_size = tx_size_2d[tx_size];
-  int16_t visit[COEFF_IDX_SIZE] = { 0 };
-  int16_t org_scan[COEFF_IDX_SIZE];
-  memcpy(org_scan, scan, tx2d_size * sizeof(*scan));
-  const int tx_width = tx_size_wide[tx_size];
-  const int tx_height = tx_size_high[tx_size];
-  const int limit = 2 * AOMMAX(tx_width, tx_height);
-  SCAN_NB_QUEUE queue;
-  queue.size = tx2d_size;
-  queue.start = 0;
-  queue.end = 0;
-  int16_t new_si = 0;
-  for (int16_t si = 0; si < tx2d_size; ++si) {
-    while (queue.start != queue.end &&
-           queue.pr_si_queue[queue.start] + limit <= new_si) {
-      int nb_ci = queue.nb_ci_queue[queue.start];
-      assign_scan_idx(nb_ci, &new_si, tx_width, tx_height, scan, iscan, visit,
-                      &queue);
-      queue.start = (queue.start + 1) % queue.size;
-    }
-
-    int16_t ci = org_scan[si];
-    assign_scan_idx(ci, &new_si, tx_width, tx_height, scan, iscan, visit,
-                    &queue);
-  }
-  assert(new_si == tx2d_size);
-}
-#endif  // USE_LIMIT_SCAN_DISTANCE
-
-#if USE_TOPOLOGICAL_SORT
-void av1_update_sort_order(TX_SIZE tx_size, TX_TYPE tx_type,
-                           const uint32_t *non_zero_prob, int16_t *sort_order) {
-  const SCAN_ORDER *sc = get_default_scan(tx_size, tx_type, 0);
-  uint32_t temp[COEFF_IDX_SIZE];
-  const int tx2d_size = tx_size_2d[tx_size];
-  int sort_idx;
-  assert(tx2d_size <= COEFF_IDX_SIZE);
-  memcpy(temp, non_zero_prob, tx2d_size * sizeof(*non_zero_prob));
-  av1_augment_prob(tx_size, tx_type, temp);
-  qsort(temp, tx2d_size, sizeof(*temp), cmp_prob);
-  for (sort_idx = 0; sort_idx < tx2d_size; ++sort_idx) {
-    const int default_scan_idx =
-        (temp[sort_idx] & COEFF_IDX_MASK) ^ COEFF_IDX_MASK;
-    const int coeff_idx = sc->scan[default_scan_idx];
-    sort_order[sort_idx] = coeff_idx;
-  }
-}
-
-// topological sort
-static void dfs_scan(int tx1d_size, int *scan_idx, int coeff_idx, int16_t *scan,
-                     int16_t *iscan) {
-  const int r = coeff_idx / tx1d_size;
-  const int c = coeff_idx % tx1d_size;
-
-  if (iscan[coeff_idx] != -1) return;
-
-  if (r > 0) dfs_scan(tx1d_size, scan_idx, coeff_idx - tx1d_size, scan, iscan);
-
-  if (c > 0) dfs_scan(tx1d_size, scan_idx, coeff_idx - 1, scan, iscan);
-
-  scan[*scan_idx] = coeff_idx;
-  iscan[coeff_idx] = *scan_idx;
-  ++(*scan_idx);
-}
-
-void av1_update_scan_order(TX_SIZE tx_size, int16_t *sort_order, int16_t *scan,
-                           int16_t *iscan) {
-  int coeff_idx;
-  int scan_idx;
-  int sort_idx;
-  const int tx1d_size = tx_size_wide[tx_size];
-  const int tx2d_size = tx_size_2d[tx_size];
-
-  for (coeff_idx = 0; coeff_idx < tx2d_size; ++coeff_idx) {
-    iscan[coeff_idx] = -1;
-  }
-
-  scan_idx = 0;
-  for (sort_idx = 0; sort_idx < tx2d_size; ++sort_idx) {
-    coeff_idx = sort_order[sort_idx];
-    dfs_scan(tx1d_size, &scan_idx, coeff_idx, scan, iscan);
-  }
-}
-#else
-
-static void filter_prob(TX_SIZE tx_size, uint32_t *prob) {
-  const int tx1d_wide = tx_size_wide[tx_size];
-  const int tx1d_high = tx_size_high[tx_size];
-  for (int r = tx1d_high - 1; r >= 0; --r) {
-    for (int c = tx1d_wide - 1; c >= 0; --c) {
-      int idx = r * tx1d_wide + c;
-      uint32_t v = prob[idx];
-      if (r > 0 && prob[idx - tx1d_wide] < v) prob[idx - tx1d_wide] = v;
-      if (c > 0 && prob[idx - 1] < v) prob[idx - 1] = v;
-    }
-  }
-}
-
-void av1_update_scan_order(TX_SIZE tx_size, TX_TYPE tx_type,
-                           uint32_t *non_zero_prob, int16_t *scan,
-                           int16_t *iscan) {
-  const SCAN_ORDER *sc = get_default_scan(tx_size, tx_type, 0);
-  uint32_t temp[COEFF_IDX_SIZE];
-  const int tx2d_size = tx_size_2d[tx_size];
-  int scan_idx;
-  assert(tx2d_size <= COEFF_IDX_SIZE);
-  memcpy(temp, non_zero_prob, tx2d_size * sizeof(*non_zero_prob));
-  filter_prob(tx_size, temp);
-  av1_augment_prob(tx_size, tx_type, temp);
-  qsort(temp, tx2d_size, sizeof(*temp), cmp_prob);
-  for (scan_idx = 0; scan_idx < tx2d_size; ++scan_idx) {
-    const int default_scan_idx =
-        (temp[scan_idx] & COEFF_IDX_MASK) ^ COEFF_IDX_MASK;
-    const int coeff_idx = sc->scan[default_scan_idx];
-    scan[scan_idx] = coeff_idx;
-    iscan[coeff_idx] = scan_idx;
-  }
-}
-#endif
-
-static void update_scan_order_facade(AV1_COMMON *cm, TX_SIZE tx_size,
-                                     TX_TYPE tx_type, int use_curr_frame) {
-#if USE_TOPOLOGICAL_SORT
-  int16_t sort_order[COEFF_IDX_SIZE];
-#endif
-  uint32_t *non_zero_prob;
-  if (use_curr_frame)
-    non_zero_prob = get_non_zero_prob(cm->fc, tx_size, tx_type);
-  else
-    non_zero_prob = get_non_zero_prob(cm->pre_fc, tx_size, tx_type);
-  int16_t *scan = get_adapt_scan(cm->fc, tx_size, tx_type);
-  int16_t *iscan = get_adapt_iscan(cm->fc, tx_size, tx_type);
-  int16_t *nb = get_adapt_nb(cm->fc, tx_size, tx_type);
-  assert(tx_size_2d[tx_size] <= COEFF_IDX_SIZE);
-#if USE_TOPOLOGICAL_SORT
-  av1_update_sort_order(tx_size, tx_type, non_zero_prob, sort_order);
-  av1_update_scan_order(tx_size, sort_order, scan, iscan);
-#else
-  av1_update_scan_order(tx_size, tx_type, non_zero_prob, scan, iscan);
-#endif
-#if USE_LIMIT_SCAN_DISTANCE
-  limit_nb_scan_distance(tx_size, scan, iscan);
-#endif  // USE_LIMIT_SCAN_DISTANCE
-  av1_update_neighbors(tx_size, scan, iscan, nb);
-}
-
-static void update_eob_threshold(AV1_COMMON *cm, TX_SIZE tx_size,
-                                 TX_TYPE tx_type) {
-  int i, row, col, row_limit, col_limit, cal_idx = 0;
-  const int tx_width = tx_size_wide[tx_size];
-  const int tx_height = tx_size_high[tx_size];
-
-  row_limit = tx_width >> 1;
-  col_limit = tx_height >> 1;
-
-  if (tx_width >= 8 && tx_height >= 8) {
-    SCAN_ORDER *sc = &cm->fc->sc[tx_size][tx_type];
-    int16_t *threshold = &cm->fc->eob_threshold[tx_size][tx_type][0];
-    const int tx2d_size = tx_size_2d[tx_size];
-
-    while (cal_idx < EOB_THRESHOLD_NUM) {
-      for (i = 0; i < tx2d_size; ++i) {
-        row = sc->scan[i] / tx_height;
-        col = sc->scan[i] % tx_width;
-        if (row >= row_limit || col >= col_limit) break;
-      }
-      row_limit >>= 1;
-      col_limit >>= 1;
-      threshold[cal_idx] = i;
-      cal_idx++;
-    }
-  }
-}
-
-void av1_init_scan_order(AV1_COMMON *cm) {
-  TX_SIZE tx_size;
-  TX_TYPE tx_type;
-  for (tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    if (tx_size > TX_32X16) continue;
-#else
-    if (tx_size >= TX_SIZES) continue;
-#endif  // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
-      if (do_adapt_scan(tx_size, tx_type)) {
-        uint32_t *non_zero_prob = get_non_zero_prob(cm->fc, tx_size, tx_type);
-        const int tx2d_size = tx_size_2d[tx_size];
-        int i;
-        SCAN_ORDER *sc = &cm->fc->sc[tx_size][tx_type];
-        for (i = 0; i < tx2d_size; ++i) {
-          non_zero_prob[i] = (1 << ADAPT_SCAN_PROB_PRECISION) /
-                             2;  // init non_zero_prob to 0.5
-        }
-        update_scan_order_facade(cm, tx_size, tx_type, 1);
-        sc->scan = get_adapt_scan(cm->fc, tx_size, tx_type);
-        sc->iscan = get_adapt_iscan(cm->fc, tx_size, tx_type);
-        sc->neighbors = get_adapt_nb(cm->fc, tx_size, tx_type);
-        update_eob_threshold(cm, tx_size, tx_type);
-      }
-    }
-  }
-}
-
-void av1_adapt_scan_order(AV1_COMMON *cm) {
-  if (cm->use_adapt_scan) {
-    TX_SIZE tx_size;
-#if CACHE_SCAN_PROB
-    int use_curr_frame = 0;
-#else   // CACHE_SCAN_PROB
-    int use_curr_frame = 1;
-#endif  // CACHE_SCAN_PROB
-
-    for (tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-      if (tx_size > TX_32X16) continue;
-#else
-      if (tx_size >= TX_SIZES) continue;
-#endif  // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-      TX_TYPE tx_type;
-      for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
-        if (do_adapt_scan(tx_size, tx_type)) {
-          update_scan_prob(cm, tx_size, tx_type, ADAPT_SCAN_UPDATE_RATE);
-          update_scan_order_facade(cm, tx_size, tx_type, use_curr_frame);
-          update_eob_threshold(cm, tx_size, tx_type);
-        }
-      }
-    }
-  }
-}
-
-void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd) {
-  xd->eob_threshold_md = (const EobThresholdMD *)cm->fc->eob_threshold;
-}
-#endif  // CONFIG_ADAPT_SCAN
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
index 82d2e917f..c5cebc135 100644
--- a/third_party/aom/av1/common/scan.h
+++ b/third_party/aom/av1/common/scan.h
@@ -25,51 +25,18 @@ extern "C" {
 
 #define MAX_NEIGHBORS 2
 
-extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
-extern const SCAN_ORDER av1_intra_scan_orders[TX_SIZES_ALL][TX_TYPES];
-extern const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES];
-
-#if CONFIG_ADAPT_SCAN
-#define USE_2X2_PROB 1
-#define USE_TOPOLOGICAL_SORT 0
-#define USE_LIMIT_SCAN_DISTANCE 0
-void av1_update_scan_count_facade(AV1_COMMON *cm, FRAME_COUNTS *counts,
-                                  TX_SIZE tx_size, TX_TYPE tx_type,
-                                  const tran_low_t *dqcoeffs, int max_scan);
-
-// embed r + c and coeff_idx info with nonzero probabilities. When sorting the
-// nonzero probabilities, if there is a tie, the coefficient with smaller r + c
-// will be scanned first
-void av1_augment_prob(TX_SIZE tx_size, TX_TYPE tx_type, uint32_t *prob);
+typedef enum SCAN_MODE {
+  SCAN_MODE_ZIG_ZAG,
+  SCAN_MODE_COL_DIAG,
+  SCAN_MODE_ROW_DIAG,
+  SCAN_MODE_COL_1D,
+  SCAN_MODE_ROW_1D,
+  SCAN_MODES
+} SCAN_MODE;
 
-#if USE_TOPOLOGICAL_SORT
-// apply quick sort on nonzero probabilities to obtain a sort order
-void av1_update_sort_order(TX_SIZE tx_size, TX_TYPE tx_type,
-                           const uint32_t *non_zero_prob, int16_t *sort_order);
-
-// apply topological sort on the nonzero probabilities sorting order to
-// guarantee each to-be-scanned coefficient's upper and left coefficient will be
-// scanned before the to-be-scanned coefficient.
-void av1_update_scan_order(TX_SIZE tx_size, int16_t *sort_order, int16_t *scan,
-                           int16_t *iscan);
-#else   // USE_TOPOLOGICAL_SORT
-void av1_update_scan_order(TX_SIZE tx_size, TX_TYPE tx_type,
-                           uint32_t *non_zero_prob, int16_t *scan,
-                           int16_t *iscan);
-#endif  // USE_TOPOLOGICAL_SORT
+extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
+extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
 
-// For each coeff_idx in scan[], update its above and left neighbors in
-// neighbors[] accordingly.
-void av1_update_neighbors(TX_SIZE tx_size, const int16_t *scan,
-                          const int16_t *iscan, int16_t *neighbors);
-void av1_init_scan_order(AV1_COMMON *cm);
-void av1_adapt_scan_order(AV1_COMMON *cm);
-#if USE_2X2_PROB
-void av1_down_sample_scan_count(uint32_t *non_zero_count_ds,
-                                const uint32_t *non_zero_count,
-                                TX_SIZE tx_size);
-#endif  // USE_2X2_PROB
-#endif  // CONFIG_ADAPT_SCAN
 void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
 
 static INLINE int get_coef_context(const int16_t *neighbors,
@@ -80,52 +47,12 @@ static INLINE int get_coef_context(const int16_t *neighbors,
 }
 
 static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size,
-                                                 TX_TYPE tx_type,
-                                                 int is_inter) {
-#if CONFIG_EXT_TX || CONFIG_VAR_TX
-  return is_inter ? &av1_inter_scan_orders[tx_size][tx_type]
-                  : &av1_intra_scan_orders[tx_size][tx_type];
-#else
-  (void)is_inter;
-  return &av1_intra_scan_orders[tx_size][tx_type];
-#endif  // CONFIG_EXT_TX
+                                                 TX_TYPE tx_type) {
+  return &av1_scan_orders[tx_size][tx_type];
 }
 
-static INLINE int do_adapt_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
-  (void)tx_size;
-#if CONFIG_EXT_TX
-  if (tx_size_2d[tx_size] >= 1024 && tx_type != DCT_DCT) return 0;
-  return tx_type < IDTX;
-#else
-  (void)tx_type;
-  return 1;
-#endif
-}
-
-static INLINE const SCAN_ORDER *get_scan(const AV1_COMMON *cm, TX_SIZE tx_size,
-                                         TX_TYPE tx_type,
-                                         const MB_MODE_INFO *mbmi) {
-#if CONFIG_MRC_TX
-  // use the DCT_DCT scan order for MRC_DCT for now
-  if (tx_type == MRC_DCT) tx_type = DCT_DCT;
-#endif  // CONFIG_MRC_TX
-#if CONFIG_LGT_FROM_PRED
-  if (mbmi->use_lgt) tx_type = DCT_DCT;
-#endif
-  const int is_inter = is_inter_block(mbmi);
-#if CONFIG_ADAPT_SCAN
-  (void)mbmi;
-  (void)is_inter;
-#if CONFIG_EXT_TX
-  if (!do_adapt_scan(tx_size, tx_type))
-    return get_default_scan(tx_size, tx_type, is_inter);
-  else
-#endif  // CONFIG_EXT_TX
-    return &cm->fc->sc[tx_size][tx_type];
-#else   // CONFIG_ADAPT_SCAN
-  (void)cm;
-  return get_default_scan(tx_size, tx_type, is_inter);
-#endif  // CONFIG_ADAPT_SCAN
+static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
+  return get_default_scan(tx_size, tx_type);
 }
 
 #ifdef __cplusplus
diff --git a/third_party/aom/av1/common/seg_common.c b/third_party/aom/av1/common/seg_common.c
index 4603026bd..cd189ad76 100644
--- a/third_party/aom/av1/common/seg_common.c
+++ b/third_party/aom/av1/common/seg_common.c
@@ -16,18 +16,11 @@
 #include "av1/common/seg_common.h"
 #include "av1/common/quant_common.h"
 
-#if CONFIG_LOOPFILTER_LEVEL
-static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 1, 1, 0, 0 };
+static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 1, 1, 1, 0, 0 };
 
 static const int seg_feature_data_max[SEG_LVL_MAX] = {
-  MAXQ, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, 0
+  MAXQ, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, 7, 0
 };
-#else
-static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 };
-
-static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, MAX_LOOP_FILTER, 3,
-                                                       0 };
-#endif  // CONFIG_LOOPFILTER_LEVEL
 
 // These functions provide access to new segment level features.
 // Eventually these function may be "optimized out" but for the moment,
@@ -39,6 +32,19 @@ void av1_clearall_segfeatures(struct segmentation *seg) {
   av1_zero(seg->feature_mask);
 }
 
+void calculate_segdata(struct segmentation *seg) {
+  seg->segid_preskip = 0;
+  seg->last_active_segid = 0;
+  for (int i = 0; i < MAX_SEGMENTS; i++) {
+    for (int j = 0; j < SEG_LVL_MAX; j++) {
+      if (seg->feature_mask[i] & (1 << j)) {
+        seg->segid_preskip |= (j >= SEG_LVL_REF_FRAME);
+        seg->last_active_segid = i;
+      }
+    }
+  }
+}
+
 void av1_enable_segfeature(struct segmentation *seg, int segment_id,
                            SEG_LVL_FEATURES feature_id) {
   seg->feature_mask[segment_id] |= 1 << feature_id;
@@ -52,6 +58,17 @@ int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
   return seg_feature_data_signed[feature_id];
 }
 
+// The 'seg_data' given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF)
+// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF)
+//
+// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
+// the absolute values given).
+
 void av1_set_segdata(struct segmentation *seg, int segment_id,
                      SEG_LVL_FEATURES feature_id, int seg_data) {
   if (seg_data < 0) {
@@ -64,8 +81,4 @@ void av1_set_segdata(struct segmentation *seg, int segment_id,
   seg->feature_data[segment_id][feature_id] = seg_data;
 }
 
-const aom_tree_index av1_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
-  2, 4, 6, 8, 10, 12, 0, -1, -2, -3, -4, -5, -6, -7
-};
-
 // TBD? Functions to read and write segment data with range / validity checking
diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h
index 6d16aedb6..c851d65fd 100644
--- a/third_party/aom/av1/common/seg_common.h
+++ b/third_party/aom/av1/common/seg_common.h
@@ -18,15 +18,12 @@
 extern "C" {
 #endif
 
-#define SEGMENT_DELTADATA 0
-#define SEGMENT_ABSDATA 1
-
 #define MAX_SEGMENTS 8
 #define SEG_TREE_PROBS (MAX_SEGMENTS - 1)
 
-#define PREDICTION_PROBS 3
+#define SEG_TEMPORAL_PRED_CTXS 3
+#define SPATIAL_PREDICTION_PROBS 3
 
-#if CONFIG_LOOPFILTER_LEVEL
 typedef enum {
   SEG_LVL_ALT_Q,       // Use alternate Quantizer ....
   SEG_LVL_ALT_LF_Y_V,  // Use alternate loop filter value on y plane vertical
@@ -35,47 +32,31 @@ typedef enum {
   SEG_LVL_ALT_LF_V,    // Use alternate loop filter value on v plane
   SEG_LVL_REF_FRAME,   // Optional Segment reference frame
   SEG_LVL_SKIP,        // Optional Segment (0,0) + skip mode
-#if CONFIG_SEGMENT_ZEROMV
-  SEG_LVL_ZEROMV,
-  SEG_LVL_MAX
-#else
+  SEG_LVL_GLOBALMV,
   SEG_LVL_MAX
-#endif
-} SEG_LVL_FEATURES;
-#else  // CONFIG_LOOPFILTER_LEVEL
-// Segment level features.
-typedef enum {
-  SEG_LVL_ALT_Q = 0,      // Use alternate Quantizer ....
-  SEG_LVL_ALT_LF = 1,     // Use alternate loop filter value...
-  SEG_LVL_REF_FRAME = 2,  // Optional Segment reference frame
-  SEG_LVL_SKIP = 3,  // Optional Segment (0,0) + skip mode
-#if CONFIG_SEGMENT_ZEROMV
-  SEG_LVL_ZEROMV = 4,
-  SEG_LVL_MAX = 5
-#else
-  SEG_LVL_MAX = 4
-#endif
 } SEG_LVL_FEATURES;
-#endif  // CONFIG_LOOPFILTER_LEVEL
 
 struct segmentation {
   uint8_t enabled;
   uint8_t update_map;
   uint8_t update_data;
-  uint8_t abs_delta;
   uint8_t temporal_update;
 
   int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
   unsigned int feature_mask[MAX_SEGMENTS];
+  int last_active_segid;  // The highest numbered segment id that has some
+                          // enabled feature.
+  uint8_t segid_preskip;  // Whether the segment id will be read before the
+                          // skip syntax element.
+                          // 1: the segment id will be read first.
+                          // 0: the skip syntax element will be read first.
 };
 
 struct segmentation_probs {
-  aom_prob tree_probs[SEG_TREE_PROBS];
   aom_cdf_prob tree_cdf[CDF_SIZE(MAX_SEGMENTS)];
-  aom_prob pred_probs[PREDICTION_PROBS];
-#if CONFIG_NEW_MULTISYMBOL
-  aom_cdf_prob pred_cdf[PREDICTION_PROBS][CDF_SIZE(2)];
-#endif
+  aom_cdf_prob pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)];
+  aom_cdf_prob spatial_pred_seg_cdf[SPATIAL_PREDICTION_PROBS]
+                                   [CDF_SIZE(MAX_SEGMENTS)];
 };
 
 static INLINE int segfeature_active(const struct segmentation *seg,
@@ -84,11 +65,26 @@ static INLINE int segfeature_active(const struct segmentation *seg,
   return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
 }
 
+static INLINE void segfeatures_copy(struct segmentation *dst,
+                                    const struct segmentation *src) {
+  int i, j;
+  for (i = 0; i < MAX_SEGMENTS; i++) {
+    dst->feature_mask[i] = src->feature_mask[i];
+    for (j = 0; j < SEG_LVL_MAX; j++) {
+      dst->feature_data[i][j] = src->feature_data[i][j];
+    }
+  }
+  dst->segid_preskip = src->segid_preskip;
+  dst->last_active_segid = src->last_active_segid;
+}
+
 void av1_clearall_segfeatures(struct segmentation *seg);
 
 void av1_enable_segfeature(struct segmentation *seg, int segment_id,
                            SEG_LVL_FEATURES feature_id);
 
+void calculate_segdata(struct segmentation *seg);
+
 int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
 
 int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
@@ -101,8 +97,6 @@ static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
   return seg->feature_data[segment_id][feature_id];
 }
 
-extern const aom_tree_index av1_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
index 4c9fa6962..3fa998a91 100644
--- a/third_party/aom/av1/common/thread_common.c
+++ b/third_party/aom/av1/common/thread_common.c
@@ -9,40 +9,158 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
+#include "av1/common/av1_loopfilter.h"
 #include "av1/common/entropymode.h"
 #include "av1/common/thread_common.h"
 #include "av1/common/reconinter.h"
 
+// Set up nsync by width.
+static INLINE int get_sync_range(int width) {
+  // nsync numbers are picked by testing. For example, for 4k
+  // video, using 4 gives best performance.
+  if (width < 640)
+    return 1;
+  else if (width <= 1280)
+    return 2;
+  else if (width <= 4096)
+    return 4;
+  else
+    return 8;
+}
+
+static INLINE int get_lr_sync_range(int width) {
+#if 0
+  // nsync numbers are picked by testing. For example, for 4k
+  // video, using 4 gives best performance.
+  if (width < 640)
+    return 1;
+  else if (width <= 1280)
+    return 2;
+  else if (width <= 4096)
+    return 4;
+  else
+    return 8;
+#else
+  (void)width;
+  return 1;
+#endif
+}
+
+// Allocate memory for lf row synchronization
+static void loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
+                              int width, int num_workers) {
+  lf_sync->rows = rows;
 #if CONFIG_MULTITHREAD
-static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
-  const int kMaxTryLocks = 4000;
-  int locked = 0;
-  int i;
+  {
+    int i, j;
+
+    for (j = 0; j < MAX_MB_PLANE; j++) {
+      CHECK_MEM_ERROR(cm, lf_sync->mutex_[j],
+                      aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows));
+      if (lf_sync->mutex_[j]) {
+        for (i = 0; i < rows; ++i) {
+          pthread_mutex_init(&lf_sync->mutex_[j][i], NULL);
+        }
+      }
 
-  for (i = 0; i < kMaxTryLocks; ++i) {
-    if (!pthread_mutex_trylock(mutex)) {
-      locked = 1;
-      break;
+      CHECK_MEM_ERROR(cm, lf_sync->cond_[j],
+                      aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows));
+      if (lf_sync->cond_[j]) {
+        for (i = 0; i < rows; ++i) {
+          pthread_cond_init(&lf_sync->cond_[j][i], NULL);
+        }
+      }
+    }
+
+    CHECK_MEM_ERROR(cm, lf_sync->job_mutex,
+                    aom_malloc(sizeof(*(lf_sync->job_mutex))));
+    if (lf_sync->job_mutex) {
+      pthread_mutex_init(lf_sync->job_mutex, NULL);
     }
   }
+#endif  // CONFIG_MULTITHREAD
+  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+                  aom_malloc(num_workers * sizeof(*(lf_sync->lfdata))));
+  lf_sync->num_workers = num_workers;
 
-  if (!locked) pthread_mutex_lock(mutex);
+  for (int j = 0; j < MAX_MB_PLANE; j++) {
+    CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j],
+                    aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows));
+  }
+  CHECK_MEM_ERROR(
+      cm, lf_sync->job_queue,
+      aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2));
+  // Set up nsync.
+  lf_sync->sync_range = get_sync_range(width);
 }
+
+// Deallocate lf synchronization related mutex and data
+void av1_loop_filter_dealloc(AV1LfSync *lf_sync) {
+  if (lf_sync != NULL) {
+    int j;
+#if CONFIG_MULTITHREAD
+    int i;
+    for (j = 0; j < MAX_MB_PLANE; j++) {
+      if (lf_sync->mutex_[j] != NULL) {
+        for (i = 0; i < lf_sync->rows; ++i) {
+          pthread_mutex_destroy(&lf_sync->mutex_[j][i]);
+        }
+        aom_free(lf_sync->mutex_[j]);
+      }
+      if (lf_sync->cond_[j] != NULL) {
+        for (i = 0; i < lf_sync->rows; ++i) {
+          pthread_cond_destroy(&lf_sync->cond_[j][i]);
+        }
+        aom_free(lf_sync->cond_[j]);
+      }
+    }
+    if (lf_sync->job_mutex != NULL) {
+      pthread_mutex_destroy(lf_sync->job_mutex);
+      aom_free(lf_sync->job_mutex);
+    }
 #endif  // CONFIG_MULTITHREAD
+    aom_free(lf_sync->lfdata);
+    for (j = 0; j < MAX_MB_PLANE; j++) {
+      aom_free(lf_sync->cur_sb_col[j]);
+    }
+
+    aom_free(lf_sync->job_queue);
+    // clear the structure as the source of this call may be a resize in which
+    // case this call will be followed by an _alloc() which may fail.
+    av1_zero(*lf_sync);
+  }
+}
+
+static void loop_filter_data_reset(LFWorkerData *lf_data,
+                                   YV12_BUFFER_CONFIG *frame_buffer,
+                                   struct AV1Common *cm, MACROBLOCKD *xd) {
+  struct macroblockd_plane *pd = xd->plane;
+  lf_data->frame_buffer = frame_buffer;
+  lf_data->cm = cm;
+  lf_data->xd = xd;
+  for (int i = 0; i < MAX_MB_PLANE; i++) {
+    memcpy(&lf_data->planes[i].dst, &pd[i].dst, sizeof(lf_data->planes[i].dst));
+    lf_data->planes[i].subsampling_x = pd[i].subsampling_x;
+    lf_data->planes[i].subsampling_y = pd[i].subsampling_y;
+  }
+}
 
-static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c) {
+static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c,
+                             int plane) {
 #if CONFIG_MULTITHREAD
   const int nsync = lf_sync->sync_range;
 
   if (r && !(c & (nsync - 1))) {
-    pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];
-    mutex_lock(mutex);
+    pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1];
+    pthread_mutex_lock(mutex);
 
-    while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
-      pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);
+    while (c > lf_sync->cur_sb_col[plane][r - 1] - nsync) {
+      pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex);
     }
     pthread_mutex_unlock(mutex);
   }
@@ -50,11 +168,12 @@ static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c) {
   (void)lf_sync;
   (void)r;
   (void)c;
+  (void)plane;
 #endif  // CONFIG_MULTITHREAD
 }
 
 static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c,
-                              const int sb_cols) {
+                              const int sb_cols, int plane) {
 #if CONFIG_MULTITHREAD
   const int nsync = lf_sync->sync_range;
   int cur;
@@ -69,321 +188,156 @@ static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c,
   }
 
   if (sig) {
-    mutex_lock(&lf_sync->mutex_[r]);
+    pthread_mutex_lock(&lf_sync->mutex_[plane][r]);
 
-    lf_sync->cur_sb_col[r] = cur;
+    lf_sync->cur_sb_col[plane][r] = cur;
 
-    pthread_cond_signal(&lf_sync->cond_[r]);
-    pthread_mutex_unlock(&lf_sync->mutex_[r]);
+    pthread_cond_broadcast(&lf_sync->cond_[plane][r]);
+    pthread_mutex_unlock(&lf_sync->mutex_[plane][r]);
   }
 #else
   (void)lf_sync;
   (void)r;
   (void)c;
   (void)sb_cols;
+  (void)plane;
 #endif  // CONFIG_MULTITHREAD
 }
 
-#if !CONFIG_EXT_PARTITION_TYPES
-static INLINE enum lf_path get_loop_filter_path(
-    int y_only, struct macroblockd_plane *planes) {
-  if (y_only)
-    return LF_PATH_444;
-  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
-    return LF_PATH_420;
-  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
-    return LF_PATH_444;
-  else
-    return LF_PATH_SLOW;
-}
+static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start,
+                            int stop, int plane_start, int plane_end) {
+  int mi_row, plane, dir;
+  AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
+  lf_sync->jobs_enqueued = 0;
+  lf_sync->jobs_dequeued = 0;
 
-static INLINE void loop_filter_block_plane_ver(
-    AV1_COMMON *cm, struct macroblockd_plane *planes, int plane,
-    MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path,
-    LOOP_FILTER_MASK *lfm) {
-  if (plane == 0) {
-    av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, lfm);
-  } else {
-    switch (path) {
-      case LF_PATH_420:
-        av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, lfm);
-        break;
-      case LF_PATH_444:
-        av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, lfm);
-        break;
-      case LF_PATH_SLOW:
-        av1_filter_block_plane_non420_ver(cm, &planes[plane], mi, mi_row,
-                                          mi_col, plane);
+  for (dir = 0; dir < 2; dir++) {
+    for (plane = plane_start; plane < plane_end; plane++) {
+      if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
         break;
+      else if (plane == 1 && !(cm->lf.filter_level_u))
+        continue;
+      else if (plane == 2 && !(cm->lf.filter_level_v))
+        continue;
+      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+        lf_job_queue->mi_row = mi_row;
+        lf_job_queue->plane = plane;
+        lf_job_queue->dir = dir;
+        lf_job_queue++;
+        lf_sync->jobs_enqueued++;
+      }
     }
   }
 }
 
-static INLINE void loop_filter_block_plane_hor(
-    AV1_COMMON *cm, struct macroblockd_plane *planes, int plane,
-    MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path,
-    LOOP_FILTER_MASK *lfm) {
-  if (plane == 0) {
-    av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, lfm);
-  } else {
-    switch (path) {
-      case LF_PATH_420:
-        av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, lfm);
-        break;
-      case LF_PATH_444:
-        av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, lfm);
-        break;
-      case LF_PATH_SLOW:
-        av1_filter_block_plane_non420_hor(cm, &planes[plane], mi, mi_row,
-                                          mi_col, plane);
-        break;
-    }
+AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
+  AV1LfMTInfo *cur_job_info = NULL;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(lf_sync->job_mutex);
+
+  if (lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
+    cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued;
+    lf_sync->jobs_dequeued++;
   }
-}
-#endif
-// Row-based multi-threaded loopfilter hook
-#if CONFIG_PARALLEL_DEBLOCKING
-static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync,
-                                      LFWorkerData *const lf_data) {
-  const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
-  int mi_row, mi_col;
-#if !CONFIG_EXT_PARTITION_TYPES
-  enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
-#endif
-  for (mi_row = lf_data->start; mi_row < lf_data->stop;
-       mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
-    MODE_INFO **const mi =
-        lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
-
-    for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
-         mi_col += lf_data->cm->mib_size) {
-      LOOP_FILTER_MASK lfm;
-      int plane;
-
-      av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size,
-                           lf_data->frame_buffer, mi_row, mi_col);
-      av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
-                     lf_data->cm->mi_stride, &lfm);
-
-#if CONFIG_EXT_PARTITION_TYPES
-      for (plane = 0; plane < num_planes; ++plane)
-        av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
-                                          mi + mi_col, mi_row, mi_col, plane);
-#else
 
-      for (plane = 0; plane < num_planes; ++plane)
-        loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane,
-                                    mi + mi_col, mi_row, mi_col, path, &lfm);
+  pthread_mutex_unlock(lf_sync->job_mutex);
+#else
+  (void)lf_sync;
 #endif
-    }
-  }
-  return 1;
+
+  return cur_job_info;
 }
 
-static int loop_filter_hor_row_worker(AV1LfSync *const lf_sync,
-                                      LFWorkerData *const lf_data) {
-  const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+// Implement row loopfiltering for each thread.
+static INLINE void thread_loop_filter_rows(
+    const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
+    struct macroblockd_plane *planes, MACROBLOCKD *xd,
+    AV1LfSync *const lf_sync) {
   const int sb_cols =
-      mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
-  int mi_row, mi_col;
-#if !CONFIG_EXT_PARTITION_TYPES
-  enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
-#endif
-
-  for (mi_row = lf_data->start; mi_row < lf_data->stop;
-       mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
-    MODE_INFO **const mi =
-        lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
-
-    for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
-         mi_col += lf_data->cm->mib_size) {
-      const int r = mi_row >> lf_data->cm->mib_size_log2;
-      const int c = mi_col >> lf_data->cm->mib_size_log2;
-      LOOP_FILTER_MASK lfm;
-      int plane;
-
-      // TODO(wenhao.zhang@intel.com): For better parallelization, reorder
-      // the outer loop to column-based and remove the synchronizations here.
-      sync_read(lf_sync, r, c);
-
-      av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size,
-                           lf_data->frame_buffer, mi_row, mi_col);
-      av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
-                     lf_data->cm->mi_stride, &lfm);
-#if CONFIG_EXT_PARTITION_TYPES
-      for (plane = 0; plane < num_planes; ++plane)
-        av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
-                                          mi + mi_col, mi_row, mi_col, plane);
-#else
-      for (plane = 0; plane < num_planes; ++plane)
-        loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane,
-                                    mi + mi_col, mi_row, mi_col, path, &lfm);
-#endif
-      sync_write(lf_sync, r, c, sb_cols);
+      ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+  int mi_row, mi_col, plane, dir;
+  int r, c;
+
+  while (1) {
+    AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync);
+
+    if (cur_job_info != NULL) {
+      mi_row = cur_job_info->mi_row;
+      plane = cur_job_info->plane;
+      dir = cur_job_info->dir;
+      r = mi_row >> MAX_MIB_SIZE_LOG2;
+
+      if (dir == 0) {
+        for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+          c = mi_col >> MAX_MIB_SIZE_LOG2;
+
+          av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+                               mi_row, mi_col, plane, plane + 1);
+
+          av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
+                                      mi_col);
+          sync_write(lf_sync, r, c, sb_cols, plane);
+        }
+      } else if (dir == 1) {
+        for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+          c = mi_col >> MAX_MIB_SIZE_LOG2;
+
+          // Wait for vertical edge filtering of the top-right block to be
+          // completed
+          sync_read(lf_sync, r, c, plane);
+
+          // Wait for vertical edge filtering of the right block to be
+          // completed
+          sync_read(lf_sync, r + 1, c, plane);
+
+          av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+                               mi_row, mi_col, plane, plane + 1);
+          av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+                                      mi_col);
+        }
+      }
+    } else {
+      break;
     }
   }
-  return 1;
 }
-#else  //  CONFIG_PARALLEL_DEBLOCKING
+
+// Row-based multi-threaded loopfilter hook
 static int loop_filter_row_worker(AV1LfSync *const lf_sync,
                                   LFWorkerData *const lf_data) {
-  const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
-  const int sb_cols =
-      mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
-  int mi_row, mi_col;
-#if !CONFIG_EXT_PARTITION_TYPES
-  enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
-#endif  // !CONFIG_EXT_PARTITION_TYPES
-
-#if CONFIG_EXT_PARTITION
-  printf(
-      "STOPPING: This code has not been modified to work with the "
-      "extended coding unit size experiment");
-  exit(EXIT_FAILURE);
-#endif  // CONFIG_EXT_PARTITION
-
-  for (mi_row = lf_data->start; mi_row < lf_data->stop;
-       mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
-    MODE_INFO **const mi =
-        lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
-
-    for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
-         mi_col += lf_data->cm->mib_size) {
-      const int r = mi_row >> lf_data->cm->mib_size_log2;
-      const int c = mi_col >> lf_data->cm->mib_size_log2;
-#if !CONFIG_EXT_PARTITION_TYPES
-      LOOP_FILTER_MASK lfm;
-#endif
-      int plane;
-
-      sync_read(lf_sync, r, c);
-
-      av1_setup_dst_planes(lf_data->planes, lf_data->cm->sb_size,
-                           lf_data->frame_buffer, mi_row, mi_col);
-#if CONFIG_EXT_PARTITION_TYPES
-      for (plane = 0; plane < num_planes; ++plane) {
-        av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
-                                          mi + mi_col, mi_row, mi_col, plane);
-        av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
-                                          mi + mi_col, mi_row, mi_col, plane);
-      }
-#else
-      av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
-                     lf_data->cm->mi_stride, &lfm);
-
-      for (plane = 0; plane < num_planes; ++plane) {
-        loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane,
-                                    mi + mi_col, mi_row, mi_col, path, &lfm);
-        loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane,
-                                    mi + mi_col, mi_row, mi_col, path, &lfm);
-      }
-#endif  // CONFIG_EXT_PARTITION_TYPES
-      sync_write(lf_sync, r, c, sb_cols);
-    }
-  }
+  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                          lf_data->xd, lf_sync);
   return 1;
 }
-#endif  //  CONFIG_PARALLEL_DEBLOCKING
 
 static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                                struct macroblockd_plane *planes, int start,
-                                int stop, int y_only, AVxWorker *workers,
-                                int nworkers, AV1LfSync *lf_sync) {
-#if CONFIG_EXT_PARTITION
-  printf(
-      "STOPPING: This code has not been modified to work with the "
-      "extended coding unit size experiment");
-  exit(EXIT_FAILURE);
-#endif  // CONFIG_EXT_PARTITION
-
+                                MACROBLOCKD *xd, int start, int stop,
+                                int plane_start, int plane_end,
+                                AVxWorker *workers, int nworkers,
+                                AV1LfSync *lf_sync) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   // Number of superblock rows and cols
-  const int sb_rows = mi_rows_aligned_to_sb(cm) >> cm->mib_size_log2;
-  // Decoder may allocate more threads than number of tiles based on user's
-  // input.
-  const int tile_cols = cm->tile_cols;
-  const int num_workers = AOMMIN(nworkers, tile_cols);
+  const int sb_rows =
+      ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+  const int num_workers = nworkers;
   int i;
 
   if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
       num_workers > lf_sync->num_workers) {
     av1_loop_filter_dealloc(lf_sync);
-    av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+    loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
   }
 
-// Set up loopfilter thread data.
-// The decoder is capping num_workers because it has been observed that using
-// more threads on the loopfilter than there are cores will hurt performance
-// on Android. This is because the system will only schedule the tile decode
-// workers on cores equal to the number of tile columns. Then if the decoder
-// tries to use more threads for the loopfilter, it will hurt performance
-// because of contention. If the multithreading code changes in the future
-// then the number of workers used by the loopfilter should be revisited.
-
-#if CONFIG_PARALLEL_DEBLOCKING
   // Initialize cur_sb_col to -1 for all SB rows.
-  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
-
-  // Filter all the vertical edges in the whole frame
-  for (i = 0; i < num_workers; ++i) {
-    AVxWorker *const worker = &workers[i];
-    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
-
-    worker->hook = (AVxWorkerHook)loop_filter_ver_row_worker;
-    worker->data1 = lf_sync;
-    worker->data2 = lf_data;
-
-    // Loopfilter data
-    av1_loop_filter_data_reset(lf_data, frame, cm, planes);
-    lf_data->start = start + i * cm->mib_size;
-    lf_data->stop = stop;
-    lf_data->y_only = y_only;
-
-    // Start loopfiltering
-    if (i == num_workers - 1) {
-      winterface->execute(worker);
-    } else {
-      winterface->launch(worker);
-    }
-  }
-
-  // Wait till all rows are finished
-  for (i = 0; i < num_workers; ++i) {
-    winterface->sync(&workers[i]);
-  }
-
-  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
-  // Filter all the horizontal edges in the whole frame
-  for (i = 0; i < num_workers; ++i) {
-    AVxWorker *const worker = &workers[i];
-    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
-
-    worker->hook = (AVxWorkerHook)loop_filter_hor_row_worker;
-    worker->data1 = lf_sync;
-    worker->data2 = lf_data;
-
-    // Loopfilter data
-    av1_loop_filter_data_reset(lf_data, frame, cm, planes);
-    lf_data->start = start + i * cm->mib_size;
-    lf_data->stop = stop;
-    lf_data->y_only = y_only;
-
-    // Start loopfiltering
-    if (i == num_workers - 1) {
-      winterface->execute(worker);
-    } else {
-      winterface->launch(worker);
-    }
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    memset(lf_sync->cur_sb_col[i], -1,
+           sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
   }
 
-  // Wait till all rows are finished
-  for (i = 0; i < num_workers; ++i) {
-    winterface->sync(&workers[i]);
-  }
-#else   // CONFIG_PARALLEL_DEBLOCKING
-  // Initialize cur_sb_col to -1 for all SB rows.
-  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+  enqueue_lf_jobs(lf_sync, cm, start, stop, plane_start, plane_end);
 
+  // Set up loopfilter thread data.
   for (i = 0; i < num_workers; ++i) {
     AVxWorker *const worker = &workers[i];
     LFWorkerData *const lf_data = &lf_sync->lfdata[i];
@@ -393,10 +347,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
     worker->data2 = lf_data;
 
     // Loopfilter data
-    av1_loop_filter_data_reset(lf_data, frame, cm, planes);
-    lf_data->start = start + i * cm->mib_size;
-    lf_data->stop = stop;
-    lf_data->y_only = y_only;
+    loop_filter_data_reset(lf_data, frame, cm, xd);
 
     // Start loopfiltering
     if (i == num_workers - 1) {
@@ -410,21 +361,14 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
   for (i = 0; i < num_workers; ++i) {
     winterface->sync(&workers[i]);
   }
-#endif  // CONFIG_PARALLEL_DEBLOCKING
 }
 
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                              struct macroblockd_plane *planes,
-                              int frame_filter_level,
-#if CONFIG_LOOPFILTER_LEVEL
-                              int frame_filter_level_r,
-#endif
-                              int y_only, int partial_frame, AVxWorker *workers,
+                              MACROBLOCKD *xd, int plane_start, int plane_end,
+                              int partial_frame, AVxWorker *workers,
                               int num_workers, AV1LfSync *lf_sync) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
 
-  if (!frame_filter_level) return;
-
   start_mi_row = 0;
   mi_rows_to_filter = cm->mi_rows;
   if (partial_frame && cm->mi_rows > 8) {
@@ -433,103 +377,406 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
     mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
   }
   end_mi_row = start_mi_row + mi_rows_to_filter;
-#if CONFIG_LOOPFILTER_LEVEL
-  av1_loop_filter_frame_init(cm, frame_filter_level, frame_filter_level_r,
-                             y_only);
+  av1_loop_filter_frame_init(cm, plane_start, plane_end);
+
+  loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
+                      plane_end, workers, num_workers, lf_sync);
+}
+
+static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) {
+#if CONFIG_MULTITHREAD
+  AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
+  const int nsync = loop_res_sync->sync_range;
+
+  if (r && !(c & (nsync - 1))) {
+    pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1];
+    pthread_mutex_lock(mutex);
+
+    while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) {
+      pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex);
+    }
+    pthread_mutex_unlock(mutex);
+  }
 #else
-  av1_loop_filter_frame_init(cm, frame_filter_level, frame_filter_level);
-#endif  // CONFIG_LOOPFILTER_LEVEL
-  loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row, y_only,
-                      workers, num_workers, lf_sync);
+  (void)lr_sync;
+  (void)r;
+  (void)c;
+  (void)plane;
+#endif  // CONFIG_MULTITHREAD
 }
 
-// Set up nsync by width.
-static INLINE int get_sync_range(int width) {
-  // nsync numbers are picked by testing. For example, for 4k
-  // video, using 4 gives best performance.
-  if (width < 640)
-    return 1;
-  else if (width <= 1280)
-    return 2;
-  else if (width <= 4096)
-    return 4;
-  else
-    return 8;
+static INLINE void lr_sync_write(void *const lr_sync, int r, int c,
+                                 const int sb_cols, int plane) {
+#if CONFIG_MULTITHREAD
+  AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
+  const int nsync = loop_res_sync->sync_range;
+  int cur;
+  // Only signal when there are enough filtered SB for next row to run.
+  int sig = 1;
+
+  if (c < sb_cols - 1) {
+    cur = c;
+    if (c % nsync) sig = 0;
+  } else {
+    cur = sb_cols + nsync;
+  }
+
+  if (sig) {
+    pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]);
+
+    loop_res_sync->cur_sb_col[plane][r] = cur;
+
+    pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]);
+    pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]);
+  }
+#else
+  (void)lr_sync;
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+  (void)plane;
+#endif  // CONFIG_MULTITHREAD
 }
 
-// Allocate memory for lf row synchronization
-void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
-                           int width, int num_workers) {
-  lf_sync->rows = rows;
+// Allocate memory for loop restoration row synchronization
+static void loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
+                                   int num_workers, int num_rows_lr,
+                                   int num_planes, int width) {
+  lr_sync->rows = num_rows_lr;
+  lr_sync->num_planes = num_planes;
 #if CONFIG_MULTITHREAD
   {
-    int i;
+    int i, j;
+
+    for (j = 0; j < num_planes; j++) {
+      CHECK_MEM_ERROR(cm, lr_sync->mutex_[j],
+                      aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr));
+      if (lr_sync->mutex_[j]) {
+        for (i = 0; i < num_rows_lr; ++i) {
+          pthread_mutex_init(&lr_sync->mutex_[j][i], NULL);
+        }
+      }
 
-    CHECK_MEM_ERROR(cm, lf_sync->mutex_,
-                    aom_malloc(sizeof(*lf_sync->mutex_) * rows));
-    if (lf_sync->mutex_) {
-      for (i = 0; i < rows; ++i) {
-        pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+      CHECK_MEM_ERROR(cm, lr_sync->cond_[j],
+                      aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr));
+      if (lr_sync->cond_[j]) {
+        for (i = 0; i < num_rows_lr; ++i) {
+          pthread_cond_init(&lr_sync->cond_[j][i], NULL);
+        }
       }
     }
 
-    CHECK_MEM_ERROR(cm, lf_sync->cond_,
-                    aom_malloc(sizeof(*lf_sync->cond_) * rows));
-    if (lf_sync->cond_) {
-      for (i = 0; i < rows; ++i) {
-        pthread_cond_init(&lf_sync->cond_[i], NULL);
-      }
+    CHECK_MEM_ERROR(cm, lr_sync->job_mutex,
+                    aom_malloc(sizeof(*(lr_sync->job_mutex))));
+    if (lr_sync->job_mutex) {
+      pthread_mutex_init(lr_sync->job_mutex, NULL);
     }
   }
 #endif  // CONFIG_MULTITHREAD
+  CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata,
+                  aom_malloc(num_workers * sizeof(*(lr_sync->lrworkerdata))));
 
-  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
-                  aom_malloc(num_workers * sizeof(*lf_sync->lfdata)));
-  lf_sync->num_workers = num_workers;
+  for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
+    if (worker_idx < num_workers - 1) {
+      CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf,
+                      (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+      CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs,
+                      aom_malloc(sizeof(RestorationLineBuffers)));
+
+    } else {
+      lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf;
+      lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs;
+    }
+  }
 
-  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
-                  aom_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
+  lr_sync->num_workers = num_workers;
 
+  for (int j = 0; j < num_planes; j++) {
+    CHECK_MEM_ERROR(
+        cm, lr_sync->cur_sb_col[j],
+        aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr));
+  }
+  CHECK_MEM_ERROR(
+      cm, lr_sync->job_queue,
+      aom_malloc(sizeof(*(lr_sync->job_queue)) * num_rows_lr * num_planes));
   // Set up nsync.
-  lf_sync->sync_range = get_sync_range(width);
+  lr_sync->sync_range = get_lr_sync_range(width);
 }
 
-// Deallocate lf synchronization related mutex and data
-void av1_loop_filter_dealloc(AV1LfSync *lf_sync) {
-  if (lf_sync != NULL) {
+// Deallocate loop restoration synchronization related mutex and data
+void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers) {
+  if (lr_sync != NULL) {
+    int j;
 #if CONFIG_MULTITHREAD
     int i;
-
-    if (lf_sync->mutex_ != NULL) {
-      for (i = 0; i < lf_sync->rows; ++i) {
-        pthread_mutex_destroy(&lf_sync->mutex_[i]);
+    for (j = 0; j < MAX_MB_PLANE; j++) {
+      if (lr_sync->mutex_[j] != NULL) {
+        for (i = 0; i < lr_sync->rows; ++i) {
+          pthread_mutex_destroy(&lr_sync->mutex_[j][i]);
+        }
+        aom_free(lr_sync->mutex_[j]);
       }
-      aom_free(lf_sync->mutex_);
-    }
-    if (lf_sync->cond_ != NULL) {
-      for (i = 0; i < lf_sync->rows; ++i) {
-        pthread_cond_destroy(&lf_sync->cond_[i]);
+      if (lr_sync->cond_[j] != NULL) {
+        for (i = 0; i < lr_sync->rows; ++i) {
+          pthread_cond_destroy(&lr_sync->cond_[j][i]);
+        }
+        aom_free(lr_sync->cond_[j]);
       }
-      aom_free(lf_sync->cond_);
+    }
+    if (lr_sync->job_mutex != NULL) {
+      pthread_mutex_destroy(lr_sync->job_mutex);
+      aom_free(lr_sync->job_mutex);
     }
 #endif  // CONFIG_MULTITHREAD
-    aom_free(lf_sync->lfdata);
-    aom_free(lf_sync->cur_sb_col);
+    for (j = 0; j < MAX_MB_PLANE; j++) {
+      aom_free(lr_sync->cur_sb_col[j]);
+    }
+
+    aom_free(lr_sync->job_queue);
+
+    if (lr_sync->lrworkerdata) {
+      for (int worker_idx = 0; worker_idx < num_workers - 1; worker_idx++) {
+        LRWorkerData *const workerdata_data =
+            lr_sync->lrworkerdata + worker_idx;
+
+        aom_free(workerdata_data->rst_tmpbuf);
+        aom_free(workerdata_data->rlbs);
+      }
+      aom_free(lr_sync->lrworkerdata);
+    }
+
     // clear the structure as the source of this call may be a resize in which
     // case this call will be followed by an _alloc() which may fail.
-    av1_zero(*lf_sync);
+    av1_zero(*lr_sync);
+  }
+}
+
+static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
+                            AV1_COMMON *cm) {
+  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+
+  const int num_planes = av1_num_planes(cm);
+  AV1LrMTInfo *lr_job_queue = lr_sync->job_queue;
+  int32_t lr_job_counter[2], num_even_lr_jobs = 0;
+  lr_sync->jobs_enqueued = 0;
+  lr_sync->jobs_dequeued = 0;
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+    num_even_lr_jobs =
+        num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1);
+  }
+  lr_job_counter[0] = 0;
+  lr_job_counter[1] = num_even_lr_jobs;
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+    const int is_uv = plane > 0;
+    const int ss_y = is_uv && cm->subsampling_y;
+
+    AV1PixelRect tile_rect = ctxt[plane].tile_rect;
+    const int unit_size = ctxt[plane].rsi->restoration_unit_size;
+
+    const int tile_h = tile_rect.bottom - tile_rect.top;
+    const int ext_size = unit_size * 3 / 2;
+
+    int y0 = 0, i = 0;
+    while (y0 < tile_h) {
+      int remaining_h = tile_h - y0;
+      int h = (remaining_h < ext_size) ? remaining_h : unit_size;
+
+      RestorationTileLimits limits;
+      limits.v_start = tile_rect.top + y0;
+      limits.v_end = tile_rect.top + y0 + h;
+      assert(limits.v_end <= tile_rect.bottom);
+      // Offset the tile upwards to align with the restoration processing stripe
+      const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+      limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset);
+      if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset;
+
+      assert(lr_job_counter[0] <= num_even_lr_jobs);
+
+      lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i;
+      lr_job_queue[lr_job_counter[i & 1]].plane = plane;
+      lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start;
+      lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end;
+      lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1;
+      if ((i & 1) == 0) {
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+            limits.v_start + RESTORATION_BORDER;
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+            limits.v_end - RESTORATION_BORDER;
+        if (i == 0) {
+          assert(limits.v_start == tile_rect.top);
+          lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top;
+        }
+        if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) {
+          assert(limits.v_end == tile_rect.bottom);
+          lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom;
+        }
+      } else {
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+            AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top);
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+            AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom);
+      }
+      lr_job_counter[i & 1]++;
+      lr_sync->jobs_enqueued++;
+
+      y0 += h;
+      ++i;
+    }
+  }
+}
+
+AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
+  AV1LrMTInfo *cur_job_info = NULL;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(lr_sync->job_mutex);
+
+  if (lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) {
+    cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued;
+    lr_sync->jobs_dequeued++;
+  }
+
+  pthread_mutex_unlock(lr_sync->job_mutex);
+#else
+  (void)lr_sync;
+#endif
+
+  return cur_job_info;
+}
+
+// Implement row loop restoration for each thread.
+static int loop_restoration_row_worker(AV1LrSync *const lr_sync,
+                                       LRWorkerData *lrworkerdata) {
+  AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt;
+  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+  int lr_unit_row;
+  int plane;
+  const int tile_row = LR_TILE_ROW;
+  const int tile_col = LR_TILE_COL;
+  const int tile_cols = LR_TILE_COLS;
+  const int tile_idx = tile_col + tile_row * tile_cols;
+  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
+                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
+                           int vstart, int vend);
+  static const copy_fun copy_funs[3] = {
+    aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v
+  };
+
+  while (1) {
+    AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
+    if (cur_job_info != NULL) {
+      RestorationTileLimits limits;
+      sync_read_fn_t on_sync_read;
+      sync_write_fn_t on_sync_write;
+      limits.v_start = cur_job_info->v_start;
+      limits.v_end = cur_job_info->v_end;
+      lr_unit_row = cur_job_info->lr_unit_row;
+      plane = cur_job_info->plane;
+      const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile;
+
+      // sync_mode == 1 implies only sync read is required in LR Multi-threading
+      // sync_mode == 0 implies only sync write is required.
+      on_sync_read =
+          cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy;
+      on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write
+                                                   : av1_lr_sync_write_dummy;
+
+      av1_foreach_rest_unit_in_row(
+          &limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row,
+          ctxt[plane].rsi->restoration_unit_size, unit_idx0,
+          ctxt[plane].rsi->horz_units_per_tile,
+          ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane],
+          lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read,
+          on_sync_write, lr_sync);
+
+      copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left,
+                       ctxt[plane].tile_rect.right, cur_job_info->v_copy_start,
+                       cur_job_info->v_copy_end);
+    } else {
+      break;
+    }
+  }
+  return 1;
+}
+
+static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
+                                           AVxWorker *workers, int nworkers,
+                                           AV1LrSync *lr_sync, AV1_COMMON *cm) {
+  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+
+  const int num_planes = av1_num_planes(cm);
+
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int num_rows_lr = 0;
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    const AV1PixelRect tile_rect = ctxt[plane].tile_rect;
+    const int max_tile_h = tile_rect.bottom - tile_rect.top;
+
+    const int unit_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+
+    num_rows_lr =
+        AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h));
+  }
+
+  const int num_workers = nworkers;
+  int i;
+  assert(MAX_MB_PLANE == 3);
+
+  if (!lr_sync->sync_range || num_rows_lr != lr_sync->rows ||
+      num_workers > lr_sync->num_workers || num_planes != lr_sync->num_planes) {
+    av1_loop_restoration_dealloc(lr_sync, num_workers);
+    loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, num_planes,
+                           cm->width);
+  }
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  for (i = 0; i < num_planes; i++) {
+    memset(lr_sync->cur_sb_col[i], -1,
+           sizeof(*(lr_sync->cur_sb_col[i])) * num_rows_lr);
+  }
+
+  enqueue_lr_jobs(lr_sync, lr_ctxt, cm);
+
+  // Set up looprestoration thread data.
+  for (i = 0; i < num_workers; ++i) {
+    AVxWorker *const worker = &workers[i];
+    lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt;
+    worker->hook = (AVxWorkerHook)loop_restoration_row_worker;
+    worker->data1 = lr_sync;
+    worker->data2 = &lr_sync->lrworkerdata[i];
+
+    // Start loopfiltering
+    if (i == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  // Wait till all rows are finished
+  for (i = 0; i < num_workers; ++i) {
+    winterface->sync(&workers[i]);
   }
 }
 
-// Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int'
-// members, so we treat it as an array, and sum over the whole length.
-void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
-                                 FRAME_COUNTS *counts) {
-  unsigned int *const acc = (unsigned int *)acc_counts;
-  const unsigned int *const cnt = (unsigned int *)counts;
+void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                                          AV1_COMMON *cm, int optimized_lr,
+                                          AVxWorker *workers, int num_workers,
+                                          AV1LrSync *lr_sync, void *lr_ctxt) {
+  assert(!cm->all_lossless);
+
+  const int num_planes = av1_num_planes(cm);
+
+  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
 
-  const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int);
-  unsigned int i;
+  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
+                                         optimized_lr, num_planes);
 
-  for (i = 0; i < n_counts; i++) acc[i] += cnt[i];
+  foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync,
+                                 cm);
 }
diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h
index 7eddc662c..4b0d5d2b8 100644
--- a/third_party/aom/av1/common/thread_common.h
+++ b/third_party/aom/av1/common/thread_common.h
@@ -11,7 +11,9 @@
 
 #ifndef AV1_COMMON_LOOPFILTER_THREAD_H_
 #define AV1_COMMON_LOOPFILTER_THREAD_H_
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
 #include "av1/common/av1_loopfilter.h"
 #include "aom_util/aom_thread.h"
 
@@ -20,16 +22,21 @@ extern "C" {
 #endif
 
 struct AV1Common;
-struct FRAME_COUNTS;
+
+typedef struct AV1LfMTInfo {
+  int mi_row;
+  int plane;
+  int dir;
+} AV1LfMTInfo;
 
 // Loopfilter row synchronization
 typedef struct AV1LfSyncData {
 #if CONFIG_MULTITHREAD
-  pthread_mutex_t *mutex_;
-  pthread_cond_t *cond_;
+  pthread_mutex_t *mutex_[MAX_MB_PLANE];
+  pthread_cond_t *cond_[MAX_MB_PLANE];
 #endif
   // Allocate memory to store the loop-filtered superblock index in each row.
-  int *cur_sb_col;
+  int *cur_sb_col[MAX_MB_PLANE];
   // The optimal sync_range for different resolution and platform should be
   // determined by testing. Currently, it is chosen to be a power-of-2 number.
   int sync_range;
@@ -38,27 +45,72 @@ typedef struct AV1LfSyncData {
   // Row-based parallel loopfilter data
   LFWorkerData *lfdata;
   int num_workers;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *job_mutex;
+#endif
+  AV1LfMTInfo *job_queue;
+  int jobs_enqueued;
+  int jobs_dequeued;
 } AV1LfSync;
 
-// Allocate memory for loopfilter row synchronization.
-void av1_loop_filter_alloc(AV1LfSync *lf_sync, struct AV1Common *cm, int rows,
-                           int width, int num_workers);
+typedef struct AV1LrMTInfo {
+  int v_start;
+  int v_end;
+  int lr_unit_row;
+  int plane;
+  int sync_mode;
+  int v_copy_start;
+  int v_copy_end;
+} AV1LrMTInfo;
+
+typedef struct LoopRestorationWorkerData {
+  int32_t *rst_tmpbuf;
+  void *rlbs;
+  void *lr_ctxt;
+} LRWorkerData;
+
+// Looprestoration row synchronization
+typedef struct AV1LrSyncData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_[MAX_MB_PLANE];
+  pthread_cond_t *cond_[MAX_MB_PLANE];
+#endif
+  // Allocate memory to store the loop-restoration block index in each row.
+  int *cur_sb_col[MAX_MB_PLANE];
+  // The optimal sync_range for different resolution and platform should be
+  // determined by testing. Currently, it is chosen to be a power-of-2 number.
+  int sync_range;
+  int rows;
+  int num_planes;
+
+  int num_workers;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *job_mutex;
+#endif
+  // Row-based parallel loopfilter data
+  LRWorkerData *lrworkerdata;
+
+  AV1LrMTInfo *job_queue;
+  int jobs_enqueued;
+  int jobs_dequeued;
+} AV1LrSync;
 
 // Deallocate loopfilter synchronization related mutex and data.
 void av1_loop_filter_dealloc(AV1LfSync *lf_sync);
 
-// Multi-threaded loopfilter that uses the tile threads.
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
-                              struct macroblockd_plane *planes,
-                              int frame_filter_level,
-#if CONFIG_LOOPFILTER_LEVEL
-                              int frame_filter_level_r,
-#endif
-                              int y_only, int partial_frame, AVxWorker *workers,
-                              int num_workers, AV1LfSync *lf_sync);
-
-void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
-                                 struct FRAME_COUNTS *counts);
+                              struct macroblockd *mbd, int plane_start,
+                              int plane_end, int partial_frame,
+                              AVxWorker *workers, int num_workers,
+                              AV1LfSync *lf_sync);
+void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                                          struct AV1Common *cm,
+                                          int optimized_lr, AVxWorker *workers,
+                                          int num_workers, AV1LrSync *lr_sync,
+                                          void *lr_ctxt);
+void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
index 507a01265..9a43ab29a 100644
--- a/third_party/aom/av1/common/tile_common.c
+++ b/third_party/aom/av1/common/tile_common.c
@@ -11,32 +11,14 @@
 
 #include "av1/common/tile_common.h"
 #include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
 #include "aom_dsp/aom_dsp_common.h"
 
-#if CONFIG_DEPENDENT_HORZTILES
-void av1_tile_set_tg_boundary(TileInfo *tile, const AV1_COMMON *const cm,
-                              int row, int col) {
-  const int tg_start_row = cm->tile_group_start_row[row][col];
-  const int tg_start_col = cm->tile_group_start_col[row][col];
-  tile->tg_horz_boundary = ((row == tg_start_row && col >= tg_start_col) ||
-                            (row == tg_start_row + 1 && col < tg_start_col));
-#if CONFIG_MAX_TILE
-  if (cm->tile_row_independent[row]) {
-    tile->tg_horz_boundary = 1;  // this tile row is independent
-  }
-#endif
-}
-#endif
 void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) {
   av1_tile_set_row(tile, cm, row);
   av1_tile_set_col(tile, cm, col);
-#if CONFIG_DEPENDENT_HORZTILES
-  av1_tile_set_tg_boundary(tile, cm, row, col);
-#endif
 }
 
-#if CONFIG_MAX_TILE
-
 // Find smallest k>=0 such that (blk_size << k) >= target
 static int tile_log2(int blk_size, int target) {
   int k;
@@ -46,25 +28,27 @@ static int tile_log2(int blk_size, int target) {
 }
 
 void av1_get_tile_limits(AV1_COMMON *const cm) {
-  int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
-  int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
-  int sb_cols = mi_cols >> MAX_MIB_SIZE_LOG2;
-  int sb_rows = mi_rows >> MAX_MIB_SIZE_LOG2;
+  int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
+  int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+  int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
+  int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
 
-  cm->min_log2_tile_cols = tile_log2(MAX_TILE_WIDTH_SB, sb_cols);
+  int sb_size_log2 = cm->seq_params.mib_size_log2 + MI_SIZE_LOG2;
+  cm->max_tile_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
+  int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2);
+
+  cm->min_log2_tile_cols = tile_log2(cm->max_tile_width_sb, sb_cols);
   cm->max_log2_tile_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
   cm->max_log2_tile_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS));
-  cm->min_log2_tiles = tile_log2(MAX_TILE_AREA_SB, sb_cols * sb_rows);
+  cm->min_log2_tiles = tile_log2(max_tile_area_sb, sb_cols * sb_rows);
   cm->min_log2_tiles = AOMMAX(cm->min_log2_tiles, cm->min_log2_tile_cols);
-  // TODO(dominic.symes@arm.com):
-  // Add in levelMinLog2Tiles as a lower limit when levels are defined
 }
 
 void av1_calculate_tile_cols(AV1_COMMON *const cm) {
-  int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
-  int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
-  int sb_cols = mi_cols >> MAX_MIB_SIZE_LOG2;
-  int sb_rows = mi_rows >> MAX_MIB_SIZE_LOG2;
+  int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
+  int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+  int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
+  int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
   int i;
 
   if (cm->uniform_tile_spacing_flag) {
@@ -80,24 +64,27 @@ void av1_calculate_tile_cols(AV1_COMMON *const cm) {
     cm->tile_col_start_sb[i] = sb_cols;
     cm->min_log2_tile_rows = AOMMAX(cm->min_log2_tiles - cm->log2_tile_cols, 0);
     cm->max_tile_height_sb = sb_rows >> cm->min_log2_tile_rows;
+
+    cm->tile_width = size_sb << cm->seq_params.mib_size_log2;
+    cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
   } else {
     int max_tile_area_sb = (sb_rows * sb_cols);
-    int max_tile_width_sb = 0;
+    int widest_tile_sb = 1;
     cm->log2_tile_cols = tile_log2(1, cm->tile_cols);
     for (i = 0; i < cm->tile_cols; i++) {
       int size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
-      max_tile_width_sb = AOMMAX(max_tile_width_sb, size_sb);
+      widest_tile_sb = AOMMAX(widest_tile_sb, size_sb);
     }
     if (cm->min_log2_tiles) {
       max_tile_area_sb >>= (cm->min_log2_tiles + 1);
     }
-    cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / max_tile_width_sb, 1);
+    cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
   }
 }
 
 void av1_calculate_tile_rows(AV1_COMMON *const cm) {
-  int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
-  int sb_rows = mi_rows >> MAX_MIB_SIZE_LOG2;
+  int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+  int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
   int start_sb, size_sb, i;
 
   if (cm->uniform_tile_spacing_flag) {
@@ -110,106 +97,34 @@ void av1_calculate_tile_rows(AV1_COMMON *const cm) {
     }
     cm->tile_rows = i;
     cm->tile_row_start_sb[i] = sb_rows;
+
+    cm->tile_height = size_sb << cm->seq_params.mib_size_log2;
+    cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
   } else {
     cm->log2_tile_rows = tile_log2(1, cm->tile_rows);
   }
-
-#if CONFIG_DEPENDENT_HORZTILES
-  // Record which tile rows must be indpendent for parallelism
-  for (i = 0, start_sb = 0; i < cm->tile_rows; i++) {
-    cm->tile_row_independent[i] = 0;
-    if (cm->tile_row_start_sb[i + 1] - start_sb > cm->max_tile_height_sb) {
-      cm->tile_row_independent[i] = 1;
-      start_sb = cm->tile_row_start_sb[i];
-    }
-  }
-#endif
 }
 
 void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
   assert(row < cm->tile_rows);
-  int mi_row_start = cm->tile_row_start_sb[row] << MAX_MIB_SIZE_LOG2;
-  int mi_row_end = cm->tile_row_start_sb[row + 1] << MAX_MIB_SIZE_LOG2;
+  int mi_row_start = cm->tile_row_start_sb[row] << cm->seq_params.mib_size_log2;
+  int mi_row_end = cm->tile_row_start_sb[row + 1]
+                   << cm->seq_params.mib_size_log2;
+  tile->tile_row = row;
   tile->mi_row_start = mi_row_start;
   tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_rows);
+  assert(tile->mi_row_end > tile->mi_row_start);
 }
 
 void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
   assert(col < cm->tile_cols);
-  int mi_col_start = cm->tile_col_start_sb[col] << MAX_MIB_SIZE_LOG2;
-  int mi_col_end = cm->tile_col_start_sb[col + 1] << MAX_MIB_SIZE_LOG2;
+  int mi_col_start = cm->tile_col_start_sb[col] << cm->seq_params.mib_size_log2;
+  int mi_col_end = cm->tile_col_start_sb[col + 1]
+                   << cm->seq_params.mib_size_log2;
+  tile->tile_col = col;
   tile->mi_col_start = mi_col_start;
   tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_cols);
-}
-
-#else
-
-void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
-  tile->mi_row_start = row * cm->tile_height;
-  tile->mi_row_end = AOMMIN(tile->mi_row_start + cm->tile_height, cm->mi_rows);
-}
-
-void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
-  tile->mi_col_start = col * cm->tile_width;
-  tile->mi_col_end = AOMMIN(tile->mi_col_start + cm->tile_width, cm->mi_cols);
-}
-
-#if CONFIG_EXT_PARTITION
-#define MIN_TILE_WIDTH_MAX_SB 2
-#define MAX_TILE_WIDTH_MAX_SB 32
-#else
-#define MIN_TILE_WIDTH_MAX_SB 4
-#define MAX_TILE_WIDTH_MAX_SB 64
-#endif  // CONFIG_EXT_PARTITION
-
-static int get_min_log2_tile_cols(int max_sb_cols) {
-  int min_log2 = 0;
-  while ((MAX_TILE_WIDTH_MAX_SB << min_log2) < max_sb_cols) ++min_log2;
-  return min_log2;
-}
-
-static int get_max_log2_tile_cols(int max_sb_cols) {
-  int max_log2 = 1;
-  while ((max_sb_cols >> max_log2) >= MIN_TILE_WIDTH_MAX_SB) ++max_log2;
-  return max_log2 - 1;
-}
-
-void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
-                         int *max_log2_tile_cols) {
-  const int max_sb_cols =
-      ALIGN_POWER_OF_TWO(mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
-  *min_log2_tile_cols = get_min_log2_tile_cols(max_sb_cols);
-  *max_log2_tile_cols = get_max_log2_tile_cols(max_sb_cols);
-  assert(*min_log2_tile_cols <= *max_log2_tile_cols);
-}
-#endif  // CONFIG_MAX_TILE
-
-void av1_setup_frame_boundary_info(const AV1_COMMON *const cm) {
-  MODE_INFO *mi = cm->mi;
-  int col;
-  for (col = 0; col < cm->mi_cols; ++col) {
-    mi->mbmi.boundary_info |= FRAME_ABOVE_BOUNDARY | TILE_ABOVE_BOUNDARY;
-    mi += 1;
-  }
-
-  mi = cm->mi;
-  int row;
-  for (row = 0; row < cm->mi_rows; ++row) {
-    mi->mbmi.boundary_info |= FRAME_LEFT_BOUNDARY | TILE_LEFT_BOUNDARY;
-    mi += cm->mi_stride;
-  }
-
-  mi = cm->mi + (cm->mi_rows - 1) * cm->mi_stride;
-  for (col = 0; col < cm->mi_cols; ++col) {
-    mi->mbmi.boundary_info |= FRAME_BOTTOM_BOUNDARY | TILE_BOTTOM_BOUNDARY;
-    mi += 1;
-  }
-
-  mi = cm->mi + cm->mi_cols - 1;
-  for (row = 0; row < cm->mi_rows; ++row) {
-    mi->mbmi.boundary_info |= FRAME_RIGHT_BOUNDARY | TILE_RIGHT_BOUNDARY;
-    mi += cm->mi_stride;
-  }
+  assert(tile->mi_col_end > tile->mi_col_start);
 }
 
 int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) {
@@ -236,56 +151,41 @@ int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) {
   return mi_tile_size;
 }
 
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-void av1_setup_across_tile_boundary_info(const AV1_COMMON *const cm,
-                                         const TileInfo *const tile_info) {
-  if (cm->tile_cols * cm->tile_rows > 1) {
-    const int mi_row = tile_info->mi_row_start;
-    const int mi_col = tile_info->mi_col_start;
-    MODE_INFO *const mi_start = cm->mi + mi_row * cm->mi_stride + mi_col;
-    assert(mi_start < cm->mip + cm->mi_alloc_size);
-    MODE_INFO *mi = 0;
-    const int row_diff = tile_info->mi_row_end - tile_info->mi_row_start;
-    const int col_diff = tile_info->mi_col_end - tile_info->mi_col_start;
-    int row, col;
-
-#if CONFIG_DEPENDENT_HORZTILES
-    if (!cm->dependent_horz_tiles || tile_info->tg_horz_boundary)
-#endif  // CONFIG_DEPENDENT_HORZTILES
-    {
-      mi = mi_start;
-      for (col = 0; col < col_diff; ++col) {
-        mi->mbmi.boundary_info |= TILE_ABOVE_BOUNDARY;
-        mi += 1;
-      }
-    }
-
-    mi = mi_start;
-    for (row = 0; row < row_diff; ++row) {
-      mi->mbmi.boundary_info |= TILE_LEFT_BOUNDARY;
-      mi += cm->mi_stride;
-    }
+AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
+                               int is_uv) {
+  AV1PixelRect r;
+
+  // Calculate position in the Y plane
+  r.left = tile_info->mi_col_start * MI_SIZE;
+  r.right = tile_info->mi_col_end * MI_SIZE;
+  r.top = tile_info->mi_row_start * MI_SIZE;
+  r.bottom = tile_info->mi_row_end * MI_SIZE;
+
+  // If upscaling is enabled, the tile limits need scaling to match the
+  // upscaled frame where the restoration units live. To do this, scale up the
+  // top-left and bottom-right of the tile.
+  if (av1_superres_scaled(cm)) {
+    av1_calculate_unscaled_superres_size(&r.left, &r.top,
+                                         cm->superres_scale_denominator);
+    av1_calculate_unscaled_superres_size(&r.right, &r.bottom,
+                                         cm->superres_scale_denominator);
+  }
 
-    mi = mi_start + (row_diff - 1) * cm->mi_stride;
+  const int frame_w = cm->superres_upscaled_width;
+  const int frame_h = cm->superres_upscaled_height;
 
-    // explicit bounds checking
-    assert(mi + col_diff <= cm->mip + cm->mi_alloc_size);
+  // Make sure we don't fall off the bottom-right of the frame.
+  r.right = AOMMIN(r.right, frame_w);
+  r.bottom = AOMMIN(r.bottom, frame_h);
 
-    for (col = 0; col < col_diff; ++col) {
-      mi->mbmi.boundary_info |= TILE_BOTTOM_BOUNDARY;
-      mi += 1;
-    }
+  // Convert to coordinates in the appropriate plane
+  const int ss_x = is_uv && cm->subsampling_x;
+  const int ss_y = is_uv && cm->subsampling_y;
 
-    mi = mi_start + col_diff - 1;
-    for (row = 0; row < row_diff; ++row) {
-      mi->mbmi.boundary_info |= TILE_RIGHT_BOUNDARY;
-      mi += cm->mi_stride;
-    }
-  }
-}
+  r.left = ROUND_POWER_OF_TWO(r.left, ss_x);
+  r.right = ROUND_POWER_OF_TWO(r.right, ss_x);
+  r.top = ROUND_POWER_OF_TWO(r.top, ss_y);
+  r.bottom = ROUND_POWER_OF_TWO(r.bottom, ss_y);
 
-int av1_disable_loopfilter_on_tile_boundary(const struct AV1Common *cm) {
-  return (!cm->loop_filter_across_tiles_enabled &&
-          (cm->tile_cols * cm->tile_rows > 1));
+  return r;
 }
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h
index be21e1482..be037fb17 100644
--- a/third_party/aom/av1/common/tile_common.h
+++ b/third_party/aom/av1/common/tile_common.h
@@ -16,7 +16,7 @@
 extern "C" {
 #endif
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 struct AV1Common;
 
@@ -26,6 +26,8 @@ typedef struct TileInfo {
   int mi_row_start, mi_row_end;
   int mi_col_start, mi_col_end;
   int tg_horz_boundary;
+  int tile_row;
+  int tile_col;
 } TileInfo;
 
 // initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on
@@ -35,39 +37,30 @@ void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
 
 void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
 void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
-#if CONFIG_DEPENDENT_HORZTILES
-void av1_tile_set_tg_boundary(TileInfo *tile, const struct AV1Common *const cm,
-                              int row, int col);
-#endif
 void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
                          int *max_log2_tile_cols);
 
-void av1_setup_frame_boundary_info(const struct AV1Common *const cm);
-
 // Calculate the correct tile size (width or height) for (1 << log2_tile_num)
 // tiles horizontally or vertically in the frame.
 int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles);
 
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-void av1_setup_across_tile_boundary_info(const struct AV1Common *const cm,
-                                         const TileInfo *const tile_info);
-int av1_disable_loopfilter_on_tile_boundary(const struct AV1Common *cm);
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
+typedef struct {
+  int left, top, right, bottom;
+} AV1PixelRect;
 
-#if CONFIG_MAX_TILE
+// Return the pixel extents of the given tile
+AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info,
+                               const struct AV1Common *cm, int is_uv);
 
 // Define tile maximum width and area
 // There is no maximum height since height is limited by area and width limits
 // The minimum tile width or height is fixed at one superblock
-#define MAX_TILE_WIDTH (4096)  // Max Tile width in pixels
-#define MAX_TILE_WIDTH_SB (MAX_TILE_WIDTH >> MAX_SB_SIZE_LOG2)
+#define MAX_TILE_WIDTH (4096)        // Max Tile width in pixels
 #define MAX_TILE_AREA (4096 * 2304)  // Maximum tile area in pixels
-#define MAX_TILE_AREA_SB (MAX_TILE_AREA >> (2 * MAX_SB_SIZE_LOG2))
 
 void av1_get_tile_limits(struct AV1Common *const cm);
 void av1_calculate_tile_cols(struct AV1Common *const cm);
 void av1_calculate_tile_rows(struct AV1Common *const cm);
-#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/common/timing.c b/third_party/aom/av1/common/timing.c
new file mode 100644
index 000000000..5ff538ae1
--- /dev/null
+++ b/third_party/aom/av1/common/timing.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/timing.h"
+
+/* Tables for AV1 max bitrates for different levels of main and high tier.
+ * The tables are in Kbps instead of Mbps in the specification.
+ * Note that depending on the profile, a multiplier is needed.
+ */
+
+/* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps [31] */
+/* is a dummy value. The decoder model is not applicable for level 31. */
+static int32_t main_kbps[1 << LEVEL_BITS] = {
+  1500, 3000,  0,     0,     6000,  10000, 0,      0,      12000,  20000,    0,
+  0,    30000, 40000, 60000, 60000, 60000, 100000, 160000, 160000, 0,        0,
+  0,    0,     0,     0,     0,     0,     0,      0,      0,      (1 << 26)
+};
+
+/* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */
+/* is a dummy value. The decoder model is not applicable for level 31. */
+static int32_t high_kbps[1 << LEVEL_BITS] = {
+  0,      0,      0,      0,      0,      0,      0,      0,
+  30000,  50000,  0,      0,      100000, 160000, 240000, 240000,
+  240000, 480000, 800000, 800000, 0,      0,      0,      0,
+  0,      0,      0,      0,      0,      0,      0,      (1 << 26)
+};
+
+/* BitrateProfileFactor */
+static int bitrate_profile_factor[1 << PROFILE_BITS] = {
+  1, 2, 3, 0, 0, 0, 0, 0
+};
+
+int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+                          int seq_tier) {
+  int64_t bitrate;
+
+  if (seq_tier) {
+    bitrate = high_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile];
+  } else {
+    bitrate = main_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile];
+  }
+
+  return bitrate * 1000;
+}
+
+void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) {
+  decoder_model->encoder_decoder_buffer_delay_length = 16;
+  decoder_model->buffer_removal_delay_length = 10;
+  decoder_model->frame_presentation_delay_length = 10;
+}
+
+void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
+  op_params->decoder_model_param_present_flag = 1;
+  op_params->decoder_buffer_delay = 90000 >> 1;  //  0.5 s
+  op_params->encoder_buffer_delay = 90000 >> 1;  //  0.5 s
+  op_params->low_delay_mode_flag = 0;
+  op_params->display_model_param_present_flag = 1;
+  op_params->initial_display_delay = 8;  // 8 frames delay
+}
+
+void set_resource_availability_parameters(
+    aom_dec_model_op_parameters_t *op_params) {
+  op_params->decoder_model_param_present_flag = 0;
+  op_params->decoder_buffer_delay =
+      70000;  // Resource availability mode default
+  op_params->encoder_buffer_delay =
+      20000;                           // Resource availability mode default
+  op_params->low_delay_mode_flag = 0;  // Resource availability mode default
+  op_params->display_model_param_present_flag = 1;
+  op_params->initial_display_delay = 8;  // 8 frames delay
+}
diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h
new file mode 100644
index 000000000..d31f4b7fc
--- /dev/null
+++ b/third_party/aom/av1/common/timing.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TIMING_H_
+#define AOM_TIMING_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+
+#define MAX_NUM_OP_POINTS 32
+
+typedef struct aom_timing {
+  uint32_t num_units_in_display_tick;
+  uint32_t time_scale;
+  int equal_picture_interval;
+  uint32_t num_ticks_per_picture;
+} aom_timing_info_t;
+
+typedef struct aom_dec_model_info {
+  uint32_t num_units_in_decoding_tick;
+  int encoder_decoder_buffer_delay_length;
+  int buffer_removal_delay_length;
+  int frame_presentation_delay_length;
+} aom_dec_model_info_t;
+
+typedef struct aom_dec_model_op_parameters {
+  int decoder_model_param_present_flag;
+  int64_t bitrate;
+  int64_t buffer_size;
+  int decoder_buffer_delay;
+  int encoder_buffer_delay;
+  int low_delay_mode_flag;
+  int display_model_param_present_flag;
+  int initial_display_delay;
+} aom_dec_model_op_parameters_t;
+
+typedef struct aom_op_timing_info_t {
+  int64_t buffer_removal_delay;
+} aom_op_timing_info_t;
+
+void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);
+
+void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params);
+
+void set_resource_availability_parameters(
+    aom_dec_model_op_parameters_t *op_params);
+
+int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+                          int seq_tier);
+
+#endif  // AOM_TIMING_H_
diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h
index c4f0f94c0..9a6b454ac 100644
--- a/third_party/aom/av1/common/token_cdfs.h
+++ b/third_party/aom/av1/common/token_cdfs.h
@@ -9,5245 +9,3542 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "av1/common/entropy.h"
 
-/* clang-format off */
-static const coeff_cdf_model
-av1_default_coef_head_cdfs_q0[TX_SIZES][PLANE_TYPES] = {
-    {  // TX 4X4
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(21029), AOM_ICDF(21848), AOM_ICDF(26326), AOM_ICDF(29423),
-    AOM_ICDF(30610), AOM_ICDF(32768), },
-    {AOM_ICDF(10066), AOM_ICDF(12716), AOM_ICDF(18523), AOM_ICDF(23277),
-    AOM_ICDF(24780), AOM_ICDF(32768), },
-    {AOM_ICDF(1655), AOM_ICDF(4793), AOM_ICDF(6429), AOM_ICDF(11430),
-    AOM_ICDF(12206), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(10364), AOM_ICDF(14773), AOM_ICDF(25084), AOM_ICDF(25599),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10060), AOM_ICDF(14834), AOM_ICDF(24695), AOM_ICDF(25188),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8279), AOM_ICDF(11106), AOM_ICDF(21159), AOM_ICDF(21671),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5914), AOM_ICDF(6961), AOM_ICDF(15824), AOM_ICDF(16314),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3542), AOM_ICDF(3935), AOM_ICDF(10073), AOM_ICDF(10456),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1492), AOM_ICDF(1808), AOM_ICDF(4428), AOM_ICDF(4747),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(15783), AOM_ICDF(19657), AOM_ICDF(28753), AOM_ICDF(29248),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12047), AOM_ICDF(15766), AOM_ICDF(26989), AOM_ICDF(27464),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8412), AOM_ICDF(9971), AOM_ICDF(21538), AOM_ICDF(22026),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5438), AOM_ICDF(6039), AOM_ICDF(15108), AOM_ICDF(15570),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3247), AOM_ICDF(3593), AOM_ICDF(9495), AOM_ICDF(9859),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1428), AOM_ICDF(1742), AOM_ICDF(4322), AOM_ICDF(4638),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(18469), AOM_ICDF(21675), AOM_ICDF(30172), AOM_ICDF(30563),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12582), AOM_ICDF(16559), AOM_ICDF(27995), AOM_ICDF(28423),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8183), AOM_ICDF(9915), AOM_ICDF(21836), AOM_ICDF(22336),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5255), AOM_ICDF(5845), AOM_ICDF(15137), AOM_ICDF(15593),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3140), AOM_ICDF(3478), AOM_ICDF(9376), AOM_ICDF(9739),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1549), AOM_ICDF(1864), AOM_ICDF(4660), AOM_ICDF(4984),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(18319), AOM_ICDF(23757), AOM_ICDF(30989), AOM_ICDF(31399),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12864), AOM_ICDF(18051), AOM_ICDF(28729), AOM_ICDF(29218),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8090), AOM_ICDF(10047), AOM_ICDF(22011), AOM_ICDF(22680),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5061), AOM_ICDF(5688), AOM_ICDF(14783), AOM_ICDF(15379),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3425), AOM_ICDF(3784), AOM_ICDF(9565), AOM_ICDF(9998),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1564), AOM_ICDF(1884), AOM_ICDF(4703), AOM_ICDF(5054),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(8329), AOM_ICDF(23625), AOM_ICDF(30376), AOM_ICDF(31182),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7265), AOM_ICDF(19981), AOM_ICDF(27965), AOM_ICDF(29333),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5797), AOM_ICDF(12014), AOM_ICDF(21143), AOM_ICDF(23728),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4525), AOM_ICDF(7029), AOM_ICDF(14661), AOM_ICDF(17493),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3200), AOM_ICDF(4082), AOM_ICDF(9679), AOM_ICDF(11816),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1930), AOM_ICDF(2344), AOM_ICDF(5504), AOM_ICDF(6684),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(12366), AOM_ICDF(20513), AOM_ICDF(22133), AOM_ICDF(29810),
-    AOM_ICDF(30422), AOM_ICDF(32768), },
-    {AOM_ICDF(7182), AOM_ICDF(16662), AOM_ICDF(18633), AOM_ICDF(27443),
-    AOM_ICDF(28120), AOM_ICDF(32768), },
-    {AOM_ICDF(1791), AOM_ICDF(10613), AOM_ICDF(11616), AOM_ICDF(21520),
-    AOM_ICDF(22191), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(18943), AOM_ICDF(19755), AOM_ICDF(30340), AOM_ICDF(30674),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15702), AOM_ICDF(17160), AOM_ICDF(28778), AOM_ICDF(29115),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9337), AOM_ICDF(10054), AOM_ICDF(22492), AOM_ICDF(22845),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6550), AOM_ICDF(7019), AOM_ICDF(17068), AOM_ICDF(17420),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4169), AOM_ICDF(4566), AOM_ICDF(11849), AOM_ICDF(12185),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2495), AOM_ICDF(2839), AOM_ICDF(6895), AOM_ICDF(7221),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(20241), AOM_ICDF(21593), AOM_ICDF(31083), AOM_ICDF(31425),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15276), AOM_ICDF(16626), AOM_ICDF(28787), AOM_ICDF(29136),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7656), AOM_ICDF(8102), AOM_ICDF(20347), AOM_ICDF(20700),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4527), AOM_ICDF(4880), AOM_ICDF(13482), AOM_ICDF(13819),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2538), AOM_ICDF(2860), AOM_ICDF(7975), AOM_ICDF(8298),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1394), AOM_ICDF(1707), AOM_ICDF(3770), AOM_ICDF(4086),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(19968), AOM_ICDF(21872), AOM_ICDF(30812), AOM_ICDF(31172),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15081), AOM_ICDF(16805), AOM_ICDF(28957), AOM_ICDF(29326),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8196), AOM_ICDF(8748), AOM_ICDF(21434), AOM_ICDF(21825),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5297), AOM_ICDF(5675), AOM_ICDF(15007), AOM_ICDF(15385),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3102), AOM_ICDF(3429), AOM_ICDF(9255), AOM_ICDF(9607),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1502), AOM_ICDF(1815), AOM_ICDF(4662), AOM_ICDF(4983),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(19362), AOM_ICDF(22537), AOM_ICDF(31260), AOM_ICDF(31624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14450), AOM_ICDF(17789), AOM_ICDF(29362), AOM_ICDF(29788),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7957), AOM_ICDF(8982), AOM_ICDF(21542), AOM_ICDF(22120),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4819), AOM_ICDF(5280), AOM_ICDF(14199), AOM_ICDF(14724),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2831), AOM_ICDF(3180), AOM_ICDF(8511), AOM_ICDF(8950),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1385), AOM_ICDF(1700), AOM_ICDF(4300), AOM_ICDF(4633),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(14134), AOM_ICDF(22252), AOM_ICDF(31119), AOM_ICDF(31577),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11383), AOM_ICDF(19847), AOM_ICDF(29451), AOM_ICDF(30205),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7338), AOM_ICDF(11314), AOM_ICDF(22338), AOM_ICDF(24028),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5071), AOM_ICDF(6634), AOM_ICDF(15379), AOM_ICDF(17178),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2969), AOM_ICDF(3703), AOM_ICDF(9896), AOM_ICDF(11246),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1809), AOM_ICDF(2173), AOM_ICDF(5573), AOM_ICDF(6229),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(25213), AOM_ICDF(26007), AOM_ICDF(29751), AOM_ICDF(31199),
-    AOM_ICDF(31688), AOM_ICDF(32768), },
-    {AOM_ICDF(13781), AOM_ICDF(16489), AOM_ICDF(23298), AOM_ICDF(27505),
-    AOM_ICDF(28405), AOM_ICDF(32768), },
-    {AOM_ICDF(4621), AOM_ICDF(9194), AOM_ICDF(12764), AOM_ICDF(19842),
-    AOM_ICDF(20708), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(12686), AOM_ICDF(19031), AOM_ICDF(28910), AOM_ICDF(29358),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12732), AOM_ICDF(18729), AOM_ICDF(28346), AOM_ICDF(28824),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9753), AOM_ICDF(12954), AOM_ICDF(24344), AOM_ICDF(24920),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6853), AOM_ICDF(7851), AOM_ICDF(18601), AOM_ICDF(19110),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3697), AOM_ICDF(4071), AOM_ICDF(11373), AOM_ICDF(11743),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1738), AOM_ICDF(2057), AOM_ICDF(5307), AOM_ICDF(5627),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(18299), AOM_ICDF(24455), AOM_ICDF(30930), AOM_ICDF(31398),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14316), AOM_ICDF(19083), AOM_ICDF(29266), AOM_ICDF(29766),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9584), AOM_ICDF(11344), AOM_ICDF(23898), AOM_ICDF(24407),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6076), AOM_ICDF(6645), AOM_ICDF(16805), AOM_ICDF(17237),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3535), AOM_ICDF(3885), AOM_ICDF(10393), AOM_ICDF(10746),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1909), AOM_ICDF(2222), AOM_ICDF(5010), AOM_ICDF(5328),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(21106), AOM_ICDF(25258), AOM_ICDF(31172), AOM_ICDF(31576),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14944), AOM_ICDF(20229), AOM_ICDF(29903), AOM_ICDF(30361),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10454), AOM_ICDF(13063), AOM_ICDF(25548), AOM_ICDF(26138),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7667), AOM_ICDF(8529), AOM_ICDF(20025), AOM_ICDF(20588),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4813), AOM_ICDF(5176), AOM_ICDF(13672), AOM_ICDF(14085),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2450), AOM_ICDF(2763), AOM_ICDF(7515), AOM_ICDF(7873),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(18297), AOM_ICDF(25980), AOM_ICDF(31547), AOM_ICDF(31946),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13370), AOM_ICDF(21048), AOM_ICDF(30193), AOM_ICDF(30703),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9326), AOM_ICDF(13020), AOM_ICDF(25206), AOM_ICDF(26074),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6117), AOM_ICDF(7480), AOM_ICDF(18243), AOM_ICDF(19130),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6408), AOM_ICDF(6819), AOM_ICDF(13596), AOM_ICDF(14098),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2179), AOM_ICDF(2485), AOM_ICDF(7393), AOM_ICDF(7768),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(11255), AOM_ICDF(26931), AOM_ICDF(31505), AOM_ICDF(32033),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9120), AOM_ICDF(23148), AOM_ICDF(30070), AOM_ICDF(31091),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7927), AOM_ICDF(15909), AOM_ICDF(25162), AOM_ICDF(27329),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6416), AOM_ICDF(10706), AOM_ICDF(19959), AOM_ICDF(22732),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4232), AOM_ICDF(5545), AOM_ICDF(13107), AOM_ICDF(15118),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2626), AOM_ICDF(2941), AOM_ICDF(8665), AOM_ICDF(9872),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(27618), AOM_ICDF(28976), AOM_ICDF(30940), AOM_ICDF(31993),
-    AOM_ICDF(32336), AOM_ICDF(32768), },
-    {AOM_ICDF(16119), AOM_ICDF(21691), AOM_ICDF(26334), AOM_ICDF(30705),
-    AOM_ICDF(31244), AOM_ICDF(32768), },
-    {AOM_ICDF(5114), AOM_ICDF(14755), AOM_ICDF(17865), AOM_ICDF(27048),
-    AOM_ICDF(27895), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(19468), AOM_ICDF(23767), AOM_ICDF(31339), AOM_ICDF(31674),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16878), AOM_ICDF(20966), AOM_ICDF(30654), AOM_ICDF(31007),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12213), AOM_ICDF(14415), AOM_ICDF(26909), AOM_ICDF(27338),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9404), AOM_ICDF(10670), AOM_ICDF(22239), AOM_ICDF(22719),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6856), AOM_ICDF(7784), AOM_ICDF(17127), AOM_ICDF(17609),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5034), AOM_ICDF(5529), AOM_ICDF(13229), AOM_ICDF(13634),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(21214), AOM_ICDF(25570), AOM_ICDF(31656), AOM_ICDF(31994),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17012), AOM_ICDF(20535), AOM_ICDF(30403), AOM_ICDF(30787),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10855), AOM_ICDF(12147), AOM_ICDF(25451), AOM_ICDF(25874),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7055), AOM_ICDF(7837), AOM_ICDF(19116), AOM_ICDF(19553),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4141), AOM_ICDF(4531), AOM_ICDF(11911), AOM_ICDF(12296),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1706), AOM_ICDF(2041), AOM_ICDF(5622), AOM_ICDF(5957),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(22092), AOM_ICDF(26330), AOM_ICDF(31642), AOM_ICDF(32015),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16433), AOM_ICDF(20889), AOM_ICDF(30263), AOM_ICDF(30704),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11015), AOM_ICDF(13045), AOM_ICDF(26253), AOM_ICDF(26743),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9188), AOM_ICDF(9924), AOM_ICDF(21991), AOM_ICDF(22551),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5259), AOM_ICDF(5634), AOM_ICDF(14131), AOM_ICDF(14627),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1916), AOM_ICDF(2218), AOM_ICDF(6453), AOM_ICDF(6780),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(20331), AOM_ICDF(26854), AOM_ICDF(31896), AOM_ICDF(32255),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15738), AOM_ICDF(22741), AOM_ICDF(31108), AOM_ICDF(31557),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11693), AOM_ICDF(15508), AOM_ICDF(27435), AOM_ICDF(28103),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8066), AOM_ICDF(9281), AOM_ICDF(20855), AOM_ICDF(21631),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4427), AOM_ICDF(4860), AOM_ICDF(12951), AOM_ICDF(13543),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1928), AOM_ICDF(2372), AOM_ICDF(5634), AOM_ICDF(6672),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(15605), AOM_ICDF(27749), AOM_ICDF(31907), AOM_ICDF(32303),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11920), AOM_ICDF(24653), AOM_ICDF(31013), AOM_ICDF(31675),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8007), AOM_ICDF(14898), AOM_ICDF(25377), AOM_ICDF(27353),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6010), AOM_ICDF(8920), AOM_ICDF(18956), AOM_ICDF(21554),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4573), AOM_ICDF(5611), AOM_ICDF(13522), AOM_ICDF(15795),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4274), AOM_ICDF(6411), AOM_ICDF(11398), AOM_ICDF(14247),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 8X8
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(22195), AOM_ICDF(22830), AOM_ICDF(25684), AOM_ICDF(28569),
-    AOM_ICDF(30557), AOM_ICDF(32768), },
-    {AOM_ICDF(9973), AOM_ICDF(12001), AOM_ICDF(15354), AOM_ICDF(20353),
-    AOM_ICDF(23020), AOM_ICDF(32768), },
-    {AOM_ICDF(1514), AOM_ICDF(3998), AOM_ICDF(4873), AOM_ICDF(9182),
-    AOM_ICDF(9967), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(11609), AOM_ICDF(14013), AOM_ICDF(24609), AOM_ICDF(25092),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10302), AOM_ICDF(15208), AOM_ICDF(24145), AOM_ICDF(24658),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7991), AOM_ICDF(10895), AOM_ICDF(20438), AOM_ICDF(21146),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5831), AOM_ICDF(7006), AOM_ICDF(15716), AOM_ICDF(16394),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3536), AOM_ICDF(3969), AOM_ICDF(10117), AOM_ICDF(10569),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1369), AOM_ICDF(1686), AOM_ICDF(4062), AOM_ICDF(4385),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(17334), AOM_ICDF(19416), AOM_ICDF(28420), AOM_ICDF(28798),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13512), AOM_ICDF(15917), AOM_ICDF(26736), AOM_ICDF(27122),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9322), AOM_ICDF(10491), AOM_ICDF(21892), AOM_ICDF(22281),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6187), AOM_ICDF(6682), AOM_ICDF(15992), AOM_ICDF(16351),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3733), AOM_ICDF(4073), AOM_ICDF(10406), AOM_ICDF(10735),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1606), AOM_ICDF(1920), AOM_ICDF(4715), AOM_ICDF(5028),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(20589), AOM_ICDF(22106), AOM_ICDF(30065), AOM_ICDF(30422),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14731), AOM_ICDF(16342), AOM_ICDF(27701), AOM_ICDF(28059),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8554), AOM_ICDF(9080), AOM_ICDF(20831), AOM_ICDF(21182),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5011), AOM_ICDF(5354), AOM_ICDF(13968), AOM_ICDF(14296),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2867), AOM_ICDF(3184), AOM_ICDF(8524), AOM_ICDF(8840),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1174), AOM_ICDF(1486), AOM_ICDF(3643), AOM_ICDF(3955),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(23439), AOM_ICDF(24729), AOM_ICDF(31199), AOM_ICDF(31537),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15716), AOM_ICDF(17015), AOM_ICDF(28650), AOM_ICDF(28989),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8381), AOM_ICDF(8812), AOM_ICDF(21032), AOM_ICDF(21369),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4868), AOM_ICDF(5197), AOM_ICDF(13740), AOM_ICDF(14065),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2744), AOM_ICDF(3058), AOM_ICDF(8333), AOM_ICDF(8648),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1185), AOM_ICDF(1497), AOM_ICDF(3656), AOM_ICDF(3968),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(23980), AOM_ICDF(26041), AOM_ICDF(31566), AOM_ICDF(31904),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16060), AOM_ICDF(18243), AOM_ICDF(29508), AOM_ICDF(29868),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8844), AOM_ICDF(9414), AOM_ICDF(22119), AOM_ICDF(22496),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5265), AOM_ICDF(5612), AOM_ICDF(14599), AOM_ICDF(14944),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3058), AOM_ICDF(3375), AOM_ICDF(9028), AOM_ICDF(9351),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1414), AOM_ICDF(1726), AOM_ICDF(4249), AOM_ICDF(4563),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(9994), AOM_ICDF(19506), AOM_ICDF(21744), AOM_ICDF(29408),
-    AOM_ICDF(30809), AOM_ICDF(32768), },
-    {AOM_ICDF(3771), AOM_ICDF(14862), AOM_ICDF(16756), AOM_ICDF(26385),
-    AOM_ICDF(27927), AOM_ICDF(32768), },
-    {AOM_ICDF(964), AOM_ICDF(10643), AOM_ICDF(11416), AOM_ICDF(21060),
-    AOM_ICDF(22316), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(23263), AOM_ICDF(23761), AOM_ICDF(31250), AOM_ICDF(31580),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19631), AOM_ICDF(21067), AOM_ICDF(30262), AOM_ICDF(30596),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12419), AOM_ICDF(13646), AOM_ICDF(25959), AOM_ICDF(26329),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9274), AOM_ICDF(10229), AOM_ICDF(21588), AOM_ICDF(21981),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6778), AOM_ICDF(7496), AOM_ICDF(17069), AOM_ICDF(17469),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4655), AOM_ICDF(5089), AOM_ICDF(12206), AOM_ICDF(12574),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(24055), AOM_ICDF(24771), AOM_ICDF(31529), AOM_ICDF(31851),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18300), AOM_ICDF(19177), AOM_ICDF(29983), AOM_ICDF(30310),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9684), AOM_ICDF(10239), AOM_ICDF(23130), AOM_ICDF(23465),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6112), AOM_ICDF(6511), AOM_ICDF(16539), AOM_ICDF(16874),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3508), AOM_ICDF(3841), AOM_ICDF(10475), AOM_ICDF(10798),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1647), AOM_ICDF(1963), AOM_ICDF(5379), AOM_ICDF(5693),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(24875), AOM_ICDF(25551), AOM_ICDF(31757), AOM_ICDF(32078),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18585), AOM_ICDF(19328), AOM_ICDF(30217), AOM_ICDF(30542),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8948), AOM_ICDF(9350), AOM_ICDF(22251), AOM_ICDF(22577),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5148), AOM_ICDF(5481), AOM_ICDF(14806), AOM_ICDF(15127),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2852), AOM_ICDF(3169), AOM_ICDF(8930), AOM_ICDF(9249),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1298), AOM_ICDF(1609), AOM_ICDF(4289), AOM_ICDF(4600),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(25149), AOM_ICDF(25840), AOM_ICDF(31833), AOM_ICDF(32153),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19051), AOM_ICDF(19689), AOM_ICDF(30461), AOM_ICDF(30785),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8956), AOM_ICDF(9308), AOM_ICDF(22406), AOM_ICDF(22729),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5001), AOM_ICDF(5325), AOM_ICDF(14586), AOM_ICDF(14906),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2875), AOM_ICDF(3189), AOM_ICDF(8639), AOM_ICDF(8955),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1311), AOM_ICDF(1623), AOM_ICDF(4261), AOM_ICDF(4572),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(25212), AOM_ICDF(26544), AOM_ICDF(31879), AOM_ICDF(32209),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18967), AOM_ICDF(20523), AOM_ICDF(30778), AOM_ICDF(31126),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9672), AOM_ICDF(10140), AOM_ICDF(23740), AOM_ICDF(24117),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5732), AOM_ICDF(6079), AOM_ICDF(16067), AOM_ICDF(16423),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3370), AOM_ICDF(3687), AOM_ICDF(10101), AOM_ICDF(10429),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1696), AOM_ICDF(2007), AOM_ICDF(5320), AOM_ICDF(5648),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(26296), AOM_ICDF(26903), AOM_ICDF(30027), AOM_ICDF(31098),
-    AOM_ICDF(31851), AOM_ICDF(32768), },
-    {AOM_ICDF(13982), AOM_ICDF(16223), AOM_ICDF(22840), AOM_ICDF(26540),
-    AOM_ICDF(28301), AOM_ICDF(32768), },
-    {AOM_ICDF(5643), AOM_ICDF(9834), AOM_ICDF(13670), AOM_ICDF(20220),
-    AOM_ICDF(21734), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(14291), AOM_ICDF(20303), AOM_ICDF(29319), AOM_ICDF(29879),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13407), AOM_ICDF(20905), AOM_ICDF(29052), AOM_ICDF(29644),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10860), AOM_ICDF(15525), AOM_ICDF(25872), AOM_ICDF(26766),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7801), AOM_ICDF(9554), AOM_ICDF(20530), AOM_ICDF(21309),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4523), AOM_ICDF(4994), AOM_ICDF(12583), AOM_ICDF(13069),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1784), AOM_ICDF(2110), AOM_ICDF(5198), AOM_ICDF(5511),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(20153), AOM_ICDF(24114), AOM_ICDF(30802), AOM_ICDF(31195),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16079), AOM_ICDF(19936), AOM_ICDF(29580), AOM_ICDF(29992),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10977), AOM_ICDF(12993), AOM_ICDF(25245), AOM_ICDF(25687),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7386), AOM_ICDF(8212), AOM_ICDF(19223), AOM_ICDF(19683),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4797), AOM_ICDF(5164), AOM_ICDF(12928), AOM_ICDF(13288),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2188), AOM_ICDF(2498), AOM_ICDF(6396), AOM_ICDF(6706),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(24221), AOM_ICDF(26746), AOM_ICDF(31634), AOM_ICDF(31980),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17492), AOM_ICDF(20348), AOM_ICDF(30067), AOM_ICDF(30432),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10522), AOM_ICDF(11531), AOM_ICDF(24642), AOM_ICDF(25031),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6567), AOM_ICDF(7006), AOM_ICDF(17688), AOM_ICDF(18036),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4123), AOM_ICDF(4447), AOM_ICDF(11775), AOM_ICDF(12095),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1770), AOM_ICDF(2065), AOM_ICDF(6491), AOM_ICDF(6786),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(25862), AOM_ICDF(27744), AOM_ICDF(31611), AOM_ICDF(31969),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17752), AOM_ICDF(20079), AOM_ICDF(30169), AOM_ICDF(30530),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10588), AOM_ICDF(11308), AOM_ICDF(24834), AOM_ICDF(25180),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7459), AOM_ICDF(7820), AOM_ICDF(17949), AOM_ICDF(18281),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3984), AOM_ICDF(4294), AOM_ICDF(11863), AOM_ICDF(12173),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2689), AOM_ICDF(2969), AOM_ICDF(11371), AOM_ICDF(11651),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(27703), AOM_ICDF(29662), AOM_ICDF(31910), AOM_ICDF(32262),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17904), AOM_ICDF(21878), AOM_ICDF(30510), AOM_ICDF(30969),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10660), AOM_ICDF(12299), AOM_ICDF(24907), AOM_ICDF(25524),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6972), AOM_ICDF(7545), AOM_ICDF(18660), AOM_ICDF(19251),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5359), AOM_ICDF(5768), AOM_ICDF(14022), AOM_ICDF(14397),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5030), AOM_ICDF(5487), AOM_ICDF(10364), AOM_ICDF(10973),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(27980), AOM_ICDF(28880), AOM_ICDF(31045), AOM_ICDF(31931),
-    AOM_ICDF(32370), AOM_ICDF(32768), },
-    {AOM_ICDF(15958), AOM_ICDF(19891), AOM_ICDF(25963), AOM_ICDF(29601),
-    AOM_ICDF(30931), AOM_ICDF(32768), },
-    {AOM_ICDF(3897), AOM_ICDF(12331), AOM_ICDF(15935), AOM_ICDF(24489),
-    AOM_ICDF(26773), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(21443), AOM_ICDF(24237), AOM_ICDF(31473), AOM_ICDF(31808),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18617), AOM_ICDF(22378), AOM_ICDF(30958), AOM_ICDF(31301),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14626), AOM_ICDF(17725), AOM_ICDF(28852), AOM_ICDF(29246),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12155), AOM_ICDF(14598), AOM_ICDF(26000), AOM_ICDF(26506),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10111), AOM_ICDF(12149), AOM_ICDF(23415), AOM_ICDF(24002),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11352), AOM_ICDF(12864), AOM_ICDF(22589), AOM_ICDF(23010),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(22855), AOM_ICDF(25401), AOM_ICDF(31675), AOM_ICDF(31999),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19086), AOM_ICDF(21008), AOM_ICDF(30886), AOM_ICDF(31214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13477), AOM_ICDF(14473), AOM_ICDF(28104), AOM_ICDF(28450),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9553), AOM_ICDF(10401), AOM_ICDF(23815), AOM_ICDF(24225),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5795), AOM_ICDF(6172), AOM_ICDF(18068), AOM_ICDF(18445),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4297), AOM_ICDF(5909), AOM_ICDF(10206), AOM_ICDF(11818),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(24424), AOM_ICDF(26344), AOM_ICDF(31912), AOM_ICDF(32231),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20229), AOM_ICDF(21775), AOM_ICDF(31283), AOM_ICDF(31610),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14224), AOM_ICDF(14882), AOM_ICDF(28673), AOM_ICDF(29012),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10881), AOM_ICDF(11494), AOM_ICDF(23829), AOM_ICDF(24238),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6367), AOM_ICDF(6988), AOM_ICDF(15685), AOM_ICDF(16306),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7447), AOM_ICDF(11916), AOM_ICDF(17873), AOM_ICDF(22342),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(25536), AOM_ICDF(27216), AOM_ICDF(31570), AOM_ICDF(31916),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19600), AOM_ICDF(21062), AOM_ICDF(30095), AOM_ICDF(30444),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11491), AOM_ICDF(12044), AOM_ICDF(26170), AOM_ICDF(26497),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9629), AOM_ICDF(9963), AOM_ICDF(23790), AOM_ICDF(24112),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8073), AOM_ICDF(8359), AOM_ICDF(22212), AOM_ICDF(22498),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(27425), AOM_ICDF(29611), AOM_ICDF(32005), AOM_ICDF(32347),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20590), AOM_ICDF(24265), AOM_ICDF(31252), AOM_ICDF(31658),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14072), AOM_ICDF(15705), AOM_ICDF(28945), AOM_ICDF(29389),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11295), AOM_ICDF(11926), AOM_ICDF(26485), AOM_ICDF(26872),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10627), AOM_ICDF(11292), AOM_ICDF(22141), AOM_ICDF(22805),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 16X16
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(9850), AOM_ICDF(11321), AOM_ICDF(13211), AOM_ICDF(18246),
-    AOM_ICDF(21613), AOM_ICDF(32768), },
-    {AOM_ICDF(4128), AOM_ICDF(6155), AOM_ICDF(7367), AOM_ICDF(11928),
-    AOM_ICDF(14060), AOM_ICDF(32768), },
-    {AOM_ICDF(932), AOM_ICDF(2794), AOM_ICDF(3234), AOM_ICDF(6647),
-    AOM_ICDF(7340), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(9101), AOM_ICDF(10823), AOM_ICDF(21291), AOM_ICDF(22109),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8086), AOM_ICDF(13032), AOM_ICDF(21855), AOM_ICDF(22748),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6563), AOM_ICDF(10137), AOM_ICDF(18484), AOM_ICDF(20069),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4987), AOM_ICDF(6567), AOM_ICDF(14425), AOM_ICDF(15700),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3399), AOM_ICDF(3947), AOM_ICDF(9950), AOM_ICDF(10738),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1474), AOM_ICDF(1793), AOM_ICDF(4347), AOM_ICDF(4690),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(17035), AOM_ICDF(18650), AOM_ICDF(27401), AOM_ICDF(27793),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13213), AOM_ICDF(16039), AOM_ICDF(26044), AOM_ICDF(26448),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9916), AOM_ICDF(11812), AOM_ICDF(22497), AOM_ICDF(22945),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7227), AOM_ICDF(8059), AOM_ICDF(17399), AOM_ICDF(17817),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5144), AOM_ICDF(5572), AOM_ICDF(12546), AOM_ICDF(12892),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2364), AOM_ICDF(2678), AOM_ICDF(6057), AOM_ICDF(6372),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(19805), AOM_ICDF(21667), AOM_ICDF(29302), AOM_ICDF(29680),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14232), AOM_ICDF(16314), AOM_ICDF(27120), AOM_ICDF(27515),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8796), AOM_ICDF(9578), AOM_ICDF(21112), AOM_ICDF(21479),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5203), AOM_ICDF(5552), AOM_ICDF(14231), AOM_ICDF(14563),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2943), AOM_ICDF(3257), AOM_ICDF(8676), AOM_ICDF(8994),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1363), AOM_ICDF(1675), AOM_ICDF(4064), AOM_ICDF(4376),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(24214), AOM_ICDF(25083), AOM_ICDF(30916), AOM_ICDF(31249),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15904), AOM_ICDF(17001), AOM_ICDF(28199), AOM_ICDF(28532),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8324), AOM_ICDF(8717), AOM_ICDF(20480), AOM_ICDF(20808),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4752), AOM_ICDF(5070), AOM_ICDF(13245), AOM_ICDF(13565),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2729), AOM_ICDF(3042), AOM_ICDF(8218), AOM_ICDF(8530),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1385), AOM_ICDF(1697), AOM_ICDF(4196), AOM_ICDF(4508),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(26956), AOM_ICDF(27719), AOM_ICDF(31679), AOM_ICDF(32005),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16913), AOM_ICDF(17759), AOM_ICDF(29092), AOM_ICDF(29422),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8166), AOM_ICDF(8510), AOM_ICDF(20577), AOM_ICDF(20901),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4804), AOM_ICDF(5119), AOM_ICDF(13537), AOM_ICDF(13853),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2951), AOM_ICDF(3263), AOM_ICDF(8766), AOM_ICDF(9079),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1498), AOM_ICDF(1810), AOM_ICDF(4515), AOM_ICDF(4827),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(7335), AOM_ICDF(13463), AOM_ICDF(14286), AOM_ICDF(24588),
-    AOM_ICDF(29117), AOM_ICDF(32768), },
-    {AOM_ICDF(3212), AOM_ICDF(9869), AOM_ICDF(10336), AOM_ICDF(20172),
-    AOM_ICDF(25029), AOM_ICDF(32768), },
-    {AOM_ICDF(917), AOM_ICDF(6904), AOM_ICDF(7251), AOM_ICDF(15225),
-    AOM_ICDF(18595), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(23988), AOM_ICDF(24467), AOM_ICDF(31033), AOM_ICDF(31407),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20390), AOM_ICDF(23805), AOM_ICDF(30556), AOM_ICDF(30920),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13566), AOM_ICDF(16666), AOM_ICDF(27478), AOM_ICDF(27995),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10353), AOM_ICDF(12637), AOM_ICDF(23789), AOM_ICDF(24437),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7956), AOM_ICDF(9364), AOM_ICDF(19994), AOM_ICDF(20621),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6036), AOM_ICDF(6495), AOM_ICDF(15543), AOM_ICDF(16033),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(25643), AOM_ICDF(26692), AOM_ICDF(31634), AOM_ICDF(31957),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18721), AOM_ICDF(20381), AOM_ICDF(30130), AOM_ICDF(30466),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10914), AOM_ICDF(12337), AOM_ICDF(24817), AOM_ICDF(25177),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7843), AOM_ICDF(8667), AOM_ICDF(19826), AOM_ICDF(20212),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5080), AOM_ICDF(5484), AOM_ICDF(14225), AOM_ICDF(14587),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2880), AOM_ICDF(3192), AOM_ICDF(7916), AOM_ICDF(8236),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26447), AOM_ICDF(27233), AOM_ICDF(31779), AOM_ICDF(32097),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19042), AOM_ICDF(20153), AOM_ICDF(30217), AOM_ICDF(30540),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9858), AOM_ICDF(10440), AOM_ICDF(23424), AOM_ICDF(23753),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6276), AOM_ICDF(6657), AOM_ICDF(17158), AOM_ICDF(17489),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3725), AOM_ICDF(4039), AOM_ICDF(10981), AOM_ICDF(11303),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2041), AOM_ICDF(2345), AOM_ICDF(6069), AOM_ICDF(6373),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(27189), AOM_ICDF(27737), AOM_ICDF(31897), AOM_ICDF(32213),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19763), AOM_ICDF(20443), AOM_ICDF(30288), AOM_ICDF(30607),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9033), AOM_ICDF(9393), AOM_ICDF(22097), AOM_ICDF(22415),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5417), AOM_ICDF(5747), AOM_ICDF(15230), AOM_ICDF(15545),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3397), AOM_ICDF(3709), AOM_ICDF(10342), AOM_ICDF(10655),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2805), AOM_ICDF(3108), AOM_ICDF(6119), AOM_ICDF(6422),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(27874), AOM_ICDF(28490), AOM_ICDF(31981), AOM_ICDF(32301),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20112), AOM_ICDF(20724), AOM_ICDF(30607), AOM_ICDF(30935),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9188), AOM_ICDF(9549), AOM_ICDF(22544), AOM_ICDF(22875),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5590), AOM_ICDF(5918), AOM_ICDF(15550), AOM_ICDF(15878),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3567), AOM_ICDF(4015), AOM_ICDF(10658), AOM_ICDF(10988),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1950), AOM_ICDF(2388), AOM_ICDF(6246), AOM_ICDF(6681),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(25724), AOM_ICDF(26337), AOM_ICDF(28579), AOM_ICDF(29957),
-    AOM_ICDF(30927), AOM_ICDF(32768), },
-    {AOM_ICDF(9657), AOM_ICDF(12074), AOM_ICDF(16790), AOM_ICDF(21738),
-    AOM_ICDF(23899), AOM_ICDF(32768), },
-    {AOM_ICDF(4158), AOM_ICDF(7646), AOM_ICDF(10690), AOM_ICDF(16969),
-    AOM_ICDF(18800), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(14330), AOM_ICDF(19826), AOM_ICDF(28364), AOM_ICDF(29154),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13503), AOM_ICDF(21352), AOM_ICDF(28714), AOM_ICDF(29534),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11754), AOM_ICDF(16853), AOM_ICDF(25931), AOM_ICDF(27325),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8311), AOM_ICDF(10581), AOM_ICDF(21258), AOM_ICDF(22633),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5297), AOM_ICDF(5819), AOM_ICDF(14162), AOM_ICDF(14892),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2887), AOM_ICDF(3208), AOM_ICDF(7455), AOM_ICDF(7768),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(22005), AOM_ICDF(24480), AOM_ICDF(30925), AOM_ICDF(31309),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17332), AOM_ICDF(20557), AOM_ICDF(29696), AOM_ICDF(30096),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11930), AOM_ICDF(14337), AOM_ICDF(25931), AOM_ICDF(26358),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8888), AOM_ICDF(10020), AOM_ICDF(20964), AOM_ICDF(21352),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5694), AOM_ICDF(6135), AOM_ICDF(14997), AOM_ICDF(15376),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2521), AOM_ICDF(2842), AOM_ICDF(7765), AOM_ICDF(8069),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(23993), AOM_ICDF(25546), AOM_ICDF(31427), AOM_ICDF(31762),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18134), AOM_ICDF(20327), AOM_ICDF(29992), AOM_ICDF(30386),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10997), AOM_ICDF(12057), AOM_ICDF(24719), AOM_ICDF(25141),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5719), AOM_ICDF(6153), AOM_ICDF(16654), AOM_ICDF(17032),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3637), AOM_ICDF(3953), AOM_ICDF(11392), AOM_ICDF(11696),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1837), AOM_ICDF(2127), AOM_ICDF(5703), AOM_ICDF(5993),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(26095), AOM_ICDF(26989), AOM_ICDF(31766), AOM_ICDF(32091),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19524), AOM_ICDF(20820), AOM_ICDF(30413), AOM_ICDF(30738),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9962), AOM_ICDF(10551), AOM_ICDF(22667), AOM_ICDF(23010),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5773), AOM_ICDF(6093), AOM_ICDF(15402), AOM_ICDF(15748),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3546), AOM_ICDF(3850), AOM_ICDF(9983), AOM_ICDF(10287),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2387), AOM_ICDF(2668), AOM_ICDF(5711), AOM_ICDF(5992),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29071), AOM_ICDF(29675), AOM_ICDF(31761), AOM_ICDF(32087),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18709), AOM_ICDF(19761), AOM_ICDF(29374), AOM_ICDF(29730),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9336), AOM_ICDF(10048), AOM_ICDF(22625), AOM_ICDF(22988),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6446), AOM_ICDF(6793), AOM_ICDF(16834), AOM_ICDF(17172),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4227), AOM_ICDF(4539), AOM_ICDF(11587), AOM_ICDF(11909),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2624), AOM_ICDF(2929), AOM_ICDF(7139), AOM_ICDF(7444),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(25114), AOM_ICDF(25872), AOM_ICDF(29577), AOM_ICDF(31173),
-    AOM_ICDF(32008), AOM_ICDF(32768), },
-    {AOM_ICDF(11286), AOM_ICDF(14376), AOM_ICDF(22156), AOM_ICDF(26266),
-    AOM_ICDF(29278), AOM_ICDF(32768), },
-    {AOM_ICDF(2680), AOM_ICDF(11055), AOM_ICDF(14683), AOM_ICDF(23068),
-    AOM_ICDF(26651), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(22838), AOM_ICDF(24926), AOM_ICDF(31689), AOM_ICDF(32019),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19245), AOM_ICDF(24299), AOM_ICDF(31481), AOM_ICDF(31852),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15429), AOM_ICDF(21159), AOM_ICDF(30176), AOM_ICDF(30732),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12373), AOM_ICDF(17092), AOM_ICDF(26912), AOM_ICDF(27758),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10899), AOM_ICDF(13395), AOM_ICDF(23604), AOM_ICDF(24329),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12767), AOM_ICDF(13096), AOM_ICDF(21644), AOM_ICDF(22083),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(24527), AOM_ICDF(26101), AOM_ICDF(31912), AOM_ICDF(32226),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20576), AOM_ICDF(22265), AOM_ICDF(31439), AOM_ICDF(31762),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13792), AOM_ICDF(15369), AOM_ICDF(28531), AOM_ICDF(28942),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9392), AOM_ICDF(11153), AOM_ICDF(23790), AOM_ICDF(24274),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5587), AOM_ICDF(6191), AOM_ICDF(19027), AOM_ICDF(19480),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(17246), AOM_ICDF(22420),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(24737), AOM_ICDF(25605), AOM_ICDF(31953), AOM_ICDF(32268),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20933), AOM_ICDF(21817), AOM_ICDF(31546), AOM_ICDF(31861),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13887), AOM_ICDF(14656), AOM_ICDF(28490), AOM_ICDF(28817),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10018), AOM_ICDF(11047), AOM_ICDF(23593), AOM_ICDF(23967),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3855), AOM_ICDF(6746), AOM_ICDF(15420), AOM_ICDF(18312),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(25861), AOM_ICDF(26475), AOM_ICDF(32028), AOM_ICDF(32343),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22221), AOM_ICDF(22755), AOM_ICDF(31735), AOM_ICDF(32050),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15517), AOM_ICDF(15928), AOM_ICDF(29558), AOM_ICDF(29870),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7719), AOM_ICDF(8507), AOM_ICDF(20165), AOM_ICDF(20638),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(19275), AOM_ICDF(25058),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(28675), AOM_ICDF(29326), AOM_ICDF(31767), AOM_ICDF(32092),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21491), AOM_ICDF(22422), AOM_ICDF(29827), AOM_ICDF(30197),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10080), AOM_ICDF(11350), AOM_ICDF(23883), AOM_ICDF(24321),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8383), AOM_ICDF(8793), AOM_ICDF(21382), AOM_ICDF(21739),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6835), AOM_ICDF(7137), AOM_ICDF(20646), AOM_ICDF(20947),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 32X32
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(15501), AOM_ICDF(16574), AOM_ICDF(17941), AOM_ICDF(20080),
-    AOM_ICDF(21984), AOM_ICDF(32768), },
-    {AOM_ICDF(1676), AOM_ICDF(3221), AOM_ICDF(3952), AOM_ICDF(6916),
-    AOM_ICDF(7628), AOM_ICDF(32768), },
-    {AOM_ICDF(468), AOM_ICDF(1825), AOM_ICDF(2211), AOM_ICDF(4504),
-    AOM_ICDF(4877), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(5597), AOM_ICDF(9461), AOM_ICDF(16777), AOM_ICDF(17896),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5231), AOM_ICDF(9185), AOM_ICDF(16569), AOM_ICDF(17688),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4128), AOM_ICDF(6983), AOM_ICDF(13860), AOM_ICDF(15674),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2908), AOM_ICDF(4209), AOM_ICDF(9762), AOM_ICDF(11321),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2269), AOM_ICDF(2797), AOM_ICDF(7063), AOM_ICDF(7999),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1270), AOM_ICDF(1588), AOM_ICDF(3710), AOM_ICDF(4051),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(14862), AOM_ICDF(16903), AOM_ICDF(25712), AOM_ICDF(26189),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12778), AOM_ICDF(15420), AOM_ICDF(25395), AOM_ICDF(25836),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10402), AOM_ICDF(12279), AOM_ICDF(22858), AOM_ICDF(23302),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8026), AOM_ICDF(8897), AOM_ICDF(18866), AOM_ICDF(19290),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6610), AOM_ICDF(7121), AOM_ICDF(15967), AOM_ICDF(16322),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3980), AOM_ICDF(4296), AOM_ICDF(10443), AOM_ICDF(10757),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(19177), AOM_ICDF(21516), AOM_ICDF(28474), AOM_ICDF(28892),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14901), AOM_ICDF(17006), AOM_ICDF(27100), AOM_ICDF(27500),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10655), AOM_ICDF(11487), AOM_ICDF(23288), AOM_ICDF(23664),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6980), AOM_ICDF(7408), AOM_ICDF(17955), AOM_ICDF(18288),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3891), AOM_ICDF(4206), AOM_ICDF(11255), AOM_ICDF(11570),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1532), AOM_ICDF(1844), AOM_ICDF(4593), AOM_ICDF(4905),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(24338), AOM_ICDF(25864), AOM_ICDF(30962), AOM_ICDF(31346),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16430), AOM_ICDF(18166), AOM_ICDF(28700), AOM_ICDF(29068),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9726), AOM_ICDF(10244), AOM_ICDF(22575), AOM_ICDF(22934),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5539), AOM_ICDF(5868), AOM_ICDF(15030), AOM_ICDF(15363),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3305), AOM_ICDF(3620), AOM_ICDF(9405), AOM_ICDF(9720),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1482), AOM_ICDF(1794), AOM_ICDF(4429), AOM_ICDF(4741),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29843), AOM_ICDF(30312), AOM_ICDF(31922), AOM_ICDF(32242),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17390), AOM_ICDF(18061), AOM_ICDF(28932), AOM_ICDF(29258),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7968), AOM_ICDF(8308), AOM_ICDF(20128), AOM_ICDF(20447),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4523), AOM_ICDF(4838), AOM_ICDF(12959), AOM_ICDF(13274),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2765), AOM_ICDF(3077), AOM_ICDF(8284), AOM_ICDF(8596),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1422), AOM_ICDF(1733), AOM_ICDF(4244), AOM_ICDF(4556),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(19066), AOM_ICDF(20217), AOM_ICDF(21504), AOM_ICDF(24559),
-    AOM_ICDF(26831), AOM_ICDF(32768), },
-    {AOM_ICDF(5708), AOM_ICDF(7393), AOM_ICDF(8108), AOM_ICDF(11986),
-    AOM_ICDF(17424), AOM_ICDF(32768), },
-    {AOM_ICDF(1144), AOM_ICDF(2709), AOM_ICDF(3111), AOM_ICDF(6009),
-    AOM_ICDF(10882), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(17586), AOM_ICDF(17895), AOM_ICDF(27561), AOM_ICDF(28179),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16442), AOM_ICDF(19113), AOM_ICDF(27944), AOM_ICDF(28764),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12438), AOM_ICDF(17724), AOM_ICDF(26435), AOM_ICDF(27714),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9439), AOM_ICDF(12708), AOM_ICDF(22594), AOM_ICDF(24060),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7762), AOM_ICDF(9639), AOM_ICDF(19669), AOM_ICDF(20614),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5324), AOM_ICDF(5894), AOM_ICDF(14504), AOM_ICDF(15100),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(23414), AOM_ICDF(25239), AOM_ICDF(31300), AOM_ICDF(31670),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18089), AOM_ICDF(22136), AOM_ICDF(30318), AOM_ICDF(30720),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12081), AOM_ICDF(15216), AOM_ICDF(27074), AOM_ICDF(27531),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9327), AOM_ICDF(10783), AOM_ICDF(22927), AOM_ICDF(23384),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6381), AOM_ICDF(6914), AOM_ICDF(17070), AOM_ICDF(17506),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3854), AOM_ICDF(4164), AOM_ICDF(10355), AOM_ICDF(10665),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(24366), AOM_ICDF(25993), AOM_ICDF(31678), AOM_ICDF(32001),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18041), AOM_ICDF(21047), AOM_ICDF(30693), AOM_ICDF(31031),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11271), AOM_ICDF(12970), AOM_ICDF(26794), AOM_ICDF(27180),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8173), AOM_ICDF(8758), AOM_ICDF(21941), AOM_ICDF(22340),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5248), AOM_ICDF(5568), AOM_ICDF(15646), AOM_ICDF(15994),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2689), AOM_ICDF(3193), AOM_ICDF(6722), AOM_ICDF(7226),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(27565), AOM_ICDF(28694), AOM_ICDF(31993), AOM_ICDF(32314),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20591), AOM_ICDF(22532), AOM_ICDF(31143), AOM_ICDF(31473),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11268), AOM_ICDF(12113), AOM_ICDF(25966), AOM_ICDF(26331),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7268), AOM_ICDF(7674), AOM_ICDF(19409), AOM_ICDF(19747),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4404), AOM_ICDF(4686), AOM_ICDF(13213), AOM_ICDF(13495),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2637), AOM_ICDF(3766), AOM_ICDF(7533), AOM_ICDF(8663),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29847), AOM_ICDF(30306), AOM_ICDF(32081), AOM_ICDF(32397),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22752), AOM_ICDF(23329), AOM_ICDF(31334), AOM_ICDF(31658),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10305), AOM_ICDF(10672), AOM_ICDF(24328), AOM_ICDF(24657),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5712), AOM_ICDF(6031), AOM_ICDF(16694), AOM_ICDF(17018),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3979), AOM_ICDF(4278), AOM_ICDF(10985), AOM_ICDF(11284),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2465), AOM_ICDF(2900), AOM_ICDF(6815), AOM_ICDF(7250),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(21609), AOM_ICDF(22111), AOM_ICDF(24624), AOM_ICDF(26045),
-    AOM_ICDF(27916), AOM_ICDF(32768), },
-    {AOM_ICDF(5498), AOM_ICDF(7300), AOM_ICDF(12100), AOM_ICDF(15851),
-    AOM_ICDF(18370), AOM_ICDF(32768), },
-    {AOM_ICDF(1268), AOM_ICDF(3284), AOM_ICDF(6295), AOM_ICDF(10711),
-    AOM_ICDF(12999), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(9621), AOM_ICDF(16733), AOM_ICDF(26354), AOM_ICDF(27609),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9619), AOM_ICDF(18339), AOM_ICDF(27578), AOM_ICDF(28547),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9575), AOM_ICDF(18177), AOM_ICDF(24044), AOM_ICDF(25625),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5999), AOM_ICDF(11578), AOM_ICDF(20125), AOM_ICDF(22544),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4842), AOM_ICDF(6220), AOM_ICDF(12898), AOM_ICDF(14944),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(948), AOM_ICDF(1247), AOM_ICDF(3292), AOM_ICDF(3791),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(21002), AOM_ICDF(25135), AOM_ICDF(31208), AOM_ICDF(31629),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18106), AOM_ICDF(22116), AOM_ICDF(29422), AOM_ICDF(30013),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14580), AOM_ICDF(15855), AOM_ICDF(26171), AOM_ICDF(26535),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9965), AOM_ICDF(10971), AOM_ICDF(23043), AOM_ICDF(23378),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7123), AOM_ICDF(7395), AOM_ICDF(16893), AOM_ICDF(17232),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3187), AOM_ICDF(3432), AOM_ICDF(7600), AOM_ICDF(7845),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26393), AOM_ICDF(27823), AOM_ICDF(31691), AOM_ICDF(32028),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18449), AOM_ICDF(20915), AOM_ICDF(30092), AOM_ICDF(30531),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11710), AOM_ICDF(12263), AOM_ICDF(26838), AOM_ICDF(27139),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7737), AOM_ICDF(8192), AOM_ICDF(21299), AOM_ICDF(21572),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3572), AOM_ICDF(4038), AOM_ICDF(13822), AOM_ICDF(14287),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1689), AOM_ICDF(2703), AOM_ICDF(3716), AOM_ICDF(4729),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28371), AOM_ICDF(29507), AOM_ICDF(31986), AOM_ICDF(32314),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19411), AOM_ICDF(21758), AOM_ICDF(30225), AOM_ICDF(30579),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11995), AOM_ICDF(12434), AOM_ICDF(26661), AOM_ICDF(27026),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9175), AOM_ICDF(9721), AOM_ICDF(22173), AOM_ICDF(22501),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9078), AOM_ICDF(9742), AOM_ICDF(13063), AOM_ICDF(13727),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3192), AOM_ICDF(3830), AOM_ICDF(6809), AOM_ICDF(7447),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(31351), AOM_ICDF(31682), AOM_ICDF(32124), AOM_ICDF(32438),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20883), AOM_ICDF(22618), AOM_ICDF(30828), AOM_ICDF(31173),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11388), AOM_ICDF(12381), AOM_ICDF(24266), AOM_ICDF(24700),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6987), AOM_ICDF(7380), AOM_ICDF(18401), AOM_ICDF(18795),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2016), AOM_ICDF(2773), AOM_ICDF(7814), AOM_ICDF(8570),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2849), AOM_ICDF(4986), AOM_ICDF(8548), AOM_ICDF(10685),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(19461), AOM_ICDF(21728), AOM_ICDF(26601), AOM_ICDF(29082),
-    AOM_ICDF(30105), AOM_ICDF(32768), },
-    {AOM_ICDF(2845), AOM_ICDF(10798), AOM_ICDF(14316), AOM_ICDF(23118),
-    AOM_ICDF(24609), AOM_ICDF(32768), },
-    {AOM_ICDF(705), AOM_ICDF(10138), AOM_ICDF(12123), AOM_ICDF(21473),
-    AOM_ICDF(23327), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(24780), AOM_ICDF(25836), AOM_ICDF(31623), AOM_ICDF(31938),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22703), AOM_ICDF(24390), AOM_ICDF(31353), AOM_ICDF(31797),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18218), AOM_ICDF(20834), AOM_ICDF(29429), AOM_ICDF(30327),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12517), AOM_ICDF(15626), AOM_ICDF(26000), AOM_ICDF(27281),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9988), AOM_ICDF(12791), AOM_ICDF(24073), AOM_ICDF(25295),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8529), AOM_ICDF(9202), AOM_ICDF(18853), AOM_ICDF(19751),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(26497), AOM_ICDF(27282), AOM_ICDF(32016), AOM_ICDF(32333),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22767), AOM_ICDF(24548), AOM_ICDF(31680), AOM_ICDF(32007),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10455), AOM_ICDF(13458), AOM_ICDF(26448), AOM_ICDF(26995),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3684), AOM_ICDF(4847), AOM_ICDF(20940), AOM_ICDF(21522),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9063), AOM_ICDF(11155), AOM_ICDF(17430), AOM_ICDF(19521),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6554), AOM_ICDF(11469), AOM_ICDF(16384), AOM_ICDF(21299),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26212), AOM_ICDF(26755), AOM_ICDF(32090), AOM_ICDF(32400),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22239), AOM_ICDF(23123), AOM_ICDF(31406), AOM_ICDF(31725),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7220), AOM_ICDF(7609), AOM_ICDF(22715), AOM_ICDF(22993),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5554), AOM_ICDF(6387), AOM_ICDF(11941), AOM_ICDF(12774),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4915), AOM_ICDF(9830), AOM_ICDF(19661), AOM_ICDF(24576),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28796), AOM_ICDF(29237), AOM_ICDF(32134), AOM_ICDF(32446),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(25912), AOM_ICDF(26456), AOM_ICDF(32010), AOM_ICDF(32321),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14399), AOM_ICDF(14668), AOM_ICDF(26039), AOM_ICDF(26309),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2341), AOM_ICDF(4096), AOM_ICDF(11703), AOM_ICDF(13458),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(30253), AOM_ICDF(30635), AOM_ICDF(32016), AOM_ICDF(32330),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(23066), AOM_ICDF(23485), AOM_ICDF(30571), AOM_ICDF(30897),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11664), AOM_ICDF(12092), AOM_ICDF(22146), AOM_ICDF(22496),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5932), AOM_ICDF(6387), AOM_ICDF(17131), AOM_ICDF(17470),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5501), AOM_ICDF(5846), AOM_ICDF(15763), AOM_ICDF(16097),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4946), AOM_ICDF(6801), AOM_ICDF(14838), AOM_ICDF(16693),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-};
+static const aom_cdf_prob
+    av1_default_dc_sign_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][DC_SIGN_CONTEXTS]
+                            [CDF_SIZE(2)] = {
+                              { {
+                                    { AOM_CDF2(128 * 125) },
+                                    { AOM_CDF2(128 * 102) },
+                                    { AOM_CDF2(128 * 147) },
+                                },
+                                {
+                                    { AOM_CDF2(128 * 119) },
+                                    { AOM_CDF2(128 * 101) },
+                                    { AOM_CDF2(128 * 135) },
+                                } },
+                              { {
+                                    { AOM_CDF2(128 * 125) },
+                                    { AOM_CDF2(128 * 102) },
+                                    { AOM_CDF2(128 * 147) },
+                                },
+                                {
+                                    { AOM_CDF2(128 * 119) },
+                                    { AOM_CDF2(128 * 101) },
+                                    { AOM_CDF2(128 * 135) },
+                                } },
+                              { {
+                                    { AOM_CDF2(128 * 125) },
+                                    { AOM_CDF2(128 * 102) },
+                                    { AOM_CDF2(128 * 147) },
+                                },
+                                {
+                                    { AOM_CDF2(128 * 119) },
+                                    { AOM_CDF2(128 * 101) },
+                                    { AOM_CDF2(128 * 135) },
+                                } },
+                              { {
+                                    { AOM_CDF2(128 * 125) },
+                                    { AOM_CDF2(128 * 102) },
+                                    { AOM_CDF2(128 * 147) },
+                                },
+                                {
+                                    { AOM_CDF2(128 * 119) },
+                                    { AOM_CDF2(128 * 101) },
+                                    { AOM_CDF2(128 * 135) },
+                                } },
+                            };
+
+static const aom_cdf_prob
+    av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS]
+                             [CDF_SIZE(2)] = { { { { AOM_CDF2(31849) },
+                                                   { AOM_CDF2(5892) },
+                                                   { AOM_CDF2(12112) },
+                                                   { AOM_CDF2(21935) },
+                                                   { AOM_CDF2(20289) },
+                                                   { AOM_CDF2(27473) },
+                                                   { AOM_CDF2(32487) },
+                                                   { AOM_CDF2(7654) },
+                                                   { AOM_CDF2(19473) },
+                                                   { AOM_CDF2(29984) },
+                                                   { AOM_CDF2(9961) },
+                                                   { AOM_CDF2(30242) },
+                                                   { AOM_CDF2(32117) } },
+                                                 { { AOM_CDF2(31548) },
+                                                   { AOM_CDF2(1549) },
+                                                   { AOM_CDF2(10130) },
+                                                   { AOM_CDF2(16656) },
+                                                   { AOM_CDF2(18591) },
+                                                   { AOM_CDF2(26308) },
+                                                   { AOM_CDF2(32537) },
+                                                   { AOM_CDF2(5403) },
+                                                   { AOM_CDF2(18096) },
+                                                   { AOM_CDF2(30003) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } },
+                                                 { { AOM_CDF2(29957) },
+                                                   { AOM_CDF2(5391) },
+                                                   { AOM_CDF2(18039) },
+                                                   { AOM_CDF2(23566) },
+                                                   { AOM_CDF2(22431) },
+                                                   { AOM_CDF2(25822) },
+                                                   { AOM_CDF2(32197) },
+                                                   { AOM_CDF2(3778) },
+                                                   { AOM_CDF2(15336) },
+                                                   { AOM_CDF2(28981) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } },
+                                                 { { AOM_CDF2(17920) },
+                                                   { AOM_CDF2(1818) },
+                                                   { AOM_CDF2(7282) },
+                                                   { AOM_CDF2(25273) },
+                                                   { AOM_CDF2(10923) },
+                                                   { AOM_CDF2(31554) },
+                                                   { AOM_CDF2(32624) },
+                                                   { AOM_CDF2(1366) },
+                                                   { AOM_CDF2(15628) },
+                                                   { AOM_CDF2(30462) },
+                                                   { AOM_CDF2(146) },
+                                                   { AOM_CDF2(5132) },
+                                                   { AOM_CDF2(31657) } },
+                                                 { { AOM_CDF2(6308) },
+                                                   { AOM_CDF2(117) },
+                                                   { AOM_CDF2(1638) },
+                                                   { AOM_CDF2(2161) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(10923) },
+                                                   { AOM_CDF2(30247) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } } },
+                                               { { { AOM_CDF2(30371) },
+                                                   { AOM_CDF2(7570) },
+                                                   { AOM_CDF2(13155) },
+                                                   { AOM_CDF2(20751) },
+                                                   { AOM_CDF2(20969) },
+                                                   { AOM_CDF2(27067) },
+                                                   { AOM_CDF2(32013) },
+                                                   { AOM_CDF2(5495) },
+                                                   { AOM_CDF2(17942) },
+                                                   { AOM_CDF2(28280) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } },
+                                                 { { AOM_CDF2(31782) },
+                                                   { AOM_CDF2(1836) },
+                                                   { AOM_CDF2(10689) },
+                                                   { AOM_CDF2(17604) },
+                                                   { AOM_CDF2(21622) },
+                                                   { AOM_CDF2(27518) },
+                                                   { AOM_CDF2(32399) },
+                                                   { AOM_CDF2(4419) },
+                                                   { AOM_CDF2(16294) },
+                                                   { AOM_CDF2(28345) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } },
+                                                 { { AOM_CDF2(31901) },
+                                                   { AOM_CDF2(10311) },
+                                                   { AOM_CDF2(18047) },
+                                                   { AOM_CDF2(24806) },
+                                                   { AOM_CDF2(23288) },
+                                                   { AOM_CDF2(27914) },
+                                                   { AOM_CDF2(32296) },
+                                                   { AOM_CDF2(4215) },
+                                                   { AOM_CDF2(15756) },
+                                                   { AOM_CDF2(28341) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } },
+                                                 { { AOM_CDF2(26726) },
+                                                   { AOM_CDF2(1045) },
+                                                   { AOM_CDF2(11703) },
+                                                   { AOM_CDF2(20590) },
+                                                   { AOM_CDF2(18554) },
+                                                   { AOM_CDF2(25970) },
+                                                   { AOM_CDF2(31938) },
+                                                   { AOM_CDF2(5583) },
+                                                   { AOM_CDF2(21313) },
+                                                   { AOM_CDF2(29390) },
+                                                   { AOM_CDF2(641) },
+                                                   { AOM_CDF2(22265) },
+                                                   { AOM_CDF2(31452) } },
+                                                 { { AOM_CDF2(26584) },
+                                                   { AOM_CDF2(188) },
+                                                   { AOM_CDF2(8847) },
+                                                   { AOM_CDF2(24519) },
+                                                   { AOM_CDF2(22938) },
+                                                   { AOM_CDF2(30583) },
+                                                   { AOM_CDF2(32608) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } } },
+                                               { { { AOM_CDF2(29614) },
+                                                   { AOM_CDF2(9068) },
+                                                   { AOM_CDF2(12924) },
+                                                   { AOM_CDF2(19538) },
+                                                   { AOM_CDF2(17737) },
+                                                   { AOM_CDF2(24619) },
+                                                   { AOM_CDF2(30642) },
+                                                   { AOM_CDF2(4119) },
+                                                   { AOM_CDF2(16026) },
+                                                   { AOM_CDF2(25657) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } },
+                                                 { { AOM_CDF2(31957) },
+                                                   { AOM_CDF2(3230) },
+                                                   { AOM_CDF2(11153) },
+                                                   { AOM_CDF2(18123) },
+                                                   { AOM_CDF2(20143) },
+                                                   { AOM_CDF2(26536) },
+                                                   { AOM_CDF2(31986) },
+                                                   { AOM_CDF2(3050) },
+                                                   { AOM_CDF2(14603) },
+                                                   { AOM_CDF2(25155) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } },
+                                                 { { AOM_CDF2(32363) },
+                                                   { AOM_CDF2(10692) },
+                                                   { AOM_CDF2(19090) },
+                                                   { AOM_CDF2(24357) },
+                                                   { AOM_CDF2(24442) },
+                                                   { AOM_CDF2(28312) },
+                                                   { AOM_CDF2(32169) },
+                                                   { AOM_CDF2(3648) },
+                                                   { AOM_CDF2(15690) },
+                                                   { AOM_CDF2(26815) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } },
+                                                 { { AOM_CDF2(30669) },
+                                                   { AOM_CDF2(3832) },
+                                                   { AOM_CDF2(11663) },
+                                                   { AOM_CDF2(18889) },
+                                                   { AOM_CDF2(19782) },
+                                                   { AOM_CDF2(23313) },
+                                                   { AOM_CDF2(31330) },
+                                                   { AOM_CDF2(5124) },
+                                                   { AOM_CDF2(18719) },
+                                                   { AOM_CDF2(28468) },
+                                                   { AOM_CDF2(3082) },
+                                                   { AOM_CDF2(20982) },
+                                                   { AOM_CDF2(29443) } },
+                                                 { { AOM_CDF2(28573) },
+                                                   { AOM_CDF2(3183) },
+                                                   { AOM_CDF2(17802) },
+                                                   { AOM_CDF2(25977) },
+                                                   { AOM_CDF2(26677) },
+                                                   { AOM_CDF2(27832) },
+                                                   { AOM_CDF2(32387) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } } },
+                                               { { { AOM_CDF2(26887) },
+                                                   { AOM_CDF2(6729) },
+                                                   { AOM_CDF2(10361) },
+                                                   { AOM_CDF2(17442) },
+                                                   { AOM_CDF2(15045) },
+                                                   { AOM_CDF2(22478) },
+                                                   { AOM_CDF2(29072) },
+                                                   { AOM_CDF2(2713) },
+                                                   { AOM_CDF2(11861) },
+                                                   { AOM_CDF2(20773) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } },
+                                                 { { AOM_CDF2(31903) },
+                                                   { AOM_CDF2(2044) },
+                                                   { AOM_CDF2(7528) },
+                                                   { AOM_CDF2(14618) },
+                                                   { AOM_CDF2(16182) },
+                                                   { AOM_CDF2(24168) },
+                                                   { AOM_CDF2(31037) },
+                                                   { AOM_CDF2(2786) },
+                                                   { AOM_CDF2(11194) },
+                                                   { AOM_CDF2(20155) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } },
+                                                 { { AOM_CDF2(32510) },
+                                                   { AOM_CDF2(8430) },
+                                                   { AOM_CDF2(17318) },
+                                                   { AOM_CDF2(24154) },
+                                                   { AOM_CDF2(23674) },
+                                                   { AOM_CDF2(28789) },
+                                                   { AOM_CDF2(32139) },
+                                                   { AOM_CDF2(3440) },
+                                                   { AOM_CDF2(13117) },
+                                                   { AOM_CDF2(22702) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } },
+                                                 { { AOM_CDF2(31671) },
+                                                   { AOM_CDF2(2056) },
+                                                   { AOM_CDF2(11746) },
+                                                   { AOM_CDF2(16852) },
+                                                   { AOM_CDF2(18635) },
+                                                   { AOM_CDF2(24715) },
+                                                   { AOM_CDF2(31484) },
+                                                   { AOM_CDF2(4656) },
+                                                   { AOM_CDF2(16074) },
+                                                   { AOM_CDF2(24704) },
+                                                   { AOM_CDF2(1806) },
+                                                   { AOM_CDF2(14645) },
+                                                   { AOM_CDF2(25336) } },
+                                                 { { AOM_CDF2(31539) },
+                                                   { AOM_CDF2(8433) },
+                                                   { AOM_CDF2(20576) },
+                                                   { AOM_CDF2(27904) },
+                                                   { AOM_CDF2(27852) },
+                                                   { AOM_CDF2(30026) },
+                                                   { AOM_CDF2(32441) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) },
+                                                   { AOM_CDF2(16384) } } } };
+
+static const aom_cdf_prob
+    av1_default_eob_extra_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+                              [EOB_COEF_CONTEXTS][CDF_SIZE(2)] = {
+                                { { {
+                                        { AOM_CDF2(16961) },
+                                        { AOM_CDF2(17223) },
+                                        { AOM_CDF2(7621) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(19069) },
+                                        { AOM_CDF2(22525) },
+                                        { AOM_CDF2(13377) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(20401) },
+                                        { AOM_CDF2(17025) },
+                                        { AOM_CDF2(12845) },
+                                        { AOM_CDF2(12873) },
+                                        { AOM_CDF2(14094) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(20681) },
+                                        { AOM_CDF2(20701) },
+                                        { AOM_CDF2(15250) },
+                                        { AOM_CDF2(15017) },
+                                        { AOM_CDF2(14928) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(23905) },
+                                        { AOM_CDF2(17194) },
+                                        { AOM_CDF2(16170) },
+                                        { AOM_CDF2(17695) },
+                                        { AOM_CDF2(13826) },
+                                        { AOM_CDF2(15810) },
+                                        { AOM_CDF2(12036) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(23959) },
+                                        { AOM_CDF2(20799) },
+                                        { AOM_CDF2(19021) },
+                                        { AOM_CDF2(16203) },
+                                        { AOM_CDF2(17886) },
+                                        { AOM_CDF2(14144) },
+                                        { AOM_CDF2(12010) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(27399) },
+                                        { AOM_CDF2(16327) },
+                                        { AOM_CDF2(18071) },
+                                        { AOM_CDF2(19584) },
+                                        { AOM_CDF2(20721) },
+                                        { AOM_CDF2(18432) },
+                                        { AOM_CDF2(19560) },
+                                        { AOM_CDF2(10150) },
+                                        { AOM_CDF2(8805) },
+                                    },
+                                    {
+                                        { AOM_CDF2(24932) },
+                                        { AOM_CDF2(20833) },
+                                        { AOM_CDF2(12027) },
+                                        { AOM_CDF2(16670) },
+                                        { AOM_CDF2(19914) },
+                                        { AOM_CDF2(15106) },
+                                        { AOM_CDF2(17662) },
+                                        { AOM_CDF2(13783) },
+                                        { AOM_CDF2(28756) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(23406) },
+                                        { AOM_CDF2(21845) },
+                                        { AOM_CDF2(18432) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(17096) },
+                                        { AOM_CDF2(12561) },
+                                        { AOM_CDF2(17320) },
+                                        { AOM_CDF2(22395) },
+                                        { AOM_CDF2(21370) },
+                                    },
+                                    {
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } } },
+                                { { {
+                                        { AOM_CDF2(17471) },
+                                        { AOM_CDF2(20223) },
+                                        { AOM_CDF2(11357) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(20335) },
+                                        { AOM_CDF2(21667) },
+                                        { AOM_CDF2(14818) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(20430) },
+                                        { AOM_CDF2(20662) },
+                                        { AOM_CDF2(15367) },
+                                        { AOM_CDF2(16970) },
+                                        { AOM_CDF2(14657) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(22117) },
+                                        { AOM_CDF2(22028) },
+                                        { AOM_CDF2(18650) },
+                                        { AOM_CDF2(16042) },
+                                        { AOM_CDF2(15885) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(22409) },
+                                        { AOM_CDF2(21012) },
+                                        { AOM_CDF2(15650) },
+                                        { AOM_CDF2(17395) },
+                                        { AOM_CDF2(15469) },
+                                        { AOM_CDF2(20205) },
+                                        { AOM_CDF2(19511) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(24220) },
+                                        { AOM_CDF2(22480) },
+                                        { AOM_CDF2(17737) },
+                                        { AOM_CDF2(18916) },
+                                        { AOM_CDF2(19268) },
+                                        { AOM_CDF2(18412) },
+                                        { AOM_CDF2(18844) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(25991) },
+                                        { AOM_CDF2(20314) },
+                                        { AOM_CDF2(17731) },
+                                        { AOM_CDF2(19678) },
+                                        { AOM_CDF2(18649) },
+                                        { AOM_CDF2(17307) },
+                                        { AOM_CDF2(21798) },
+                                        { AOM_CDF2(17549) },
+                                        { AOM_CDF2(15630) },
+                                    },
+                                    {
+                                        { AOM_CDF2(26585) },
+                                        { AOM_CDF2(21469) },
+                                        { AOM_CDF2(20432) },
+                                        { AOM_CDF2(17735) },
+                                        { AOM_CDF2(19280) },
+                                        { AOM_CDF2(15235) },
+                                        { AOM_CDF2(20297) },
+                                        { AOM_CDF2(22471) },
+                                        { AOM_CDF2(28997) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(26605) },
+                                        { AOM_CDF2(11304) },
+                                        { AOM_CDF2(16726) },
+                                        { AOM_CDF2(16560) },
+                                        { AOM_CDF2(20866) },
+                                        { AOM_CDF2(23524) },
+                                        { AOM_CDF2(19878) },
+                                        { AOM_CDF2(13469) },
+                                        { AOM_CDF2(23084) },
+                                    },
+                                    {
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } } },
+                                { { {
+                                        { AOM_CDF2(18983) },
+                                        { AOM_CDF2(20512) },
+                                        { AOM_CDF2(14885) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(20090) },
+                                        { AOM_CDF2(19444) },
+                                        { AOM_CDF2(17286) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(19139) },
+                                        { AOM_CDF2(21487) },
+                                        { AOM_CDF2(18959) },
+                                        { AOM_CDF2(20910) },
+                                        { AOM_CDF2(19089) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(20536) },
+                                        { AOM_CDF2(20664) },
+                                        { AOM_CDF2(20625) },
+                                        { AOM_CDF2(19123) },
+                                        { AOM_CDF2(14862) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(19833) },
+                                        { AOM_CDF2(21502) },
+                                        { AOM_CDF2(17485) },
+                                        { AOM_CDF2(20267) },
+                                        { AOM_CDF2(18353) },
+                                        { AOM_CDF2(23329) },
+                                        { AOM_CDF2(21478) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(22041) },
+                                        { AOM_CDF2(23434) },
+                                        { AOM_CDF2(20001) },
+                                        { AOM_CDF2(20554) },
+                                        { AOM_CDF2(20951) },
+                                        { AOM_CDF2(20145) },
+                                        { AOM_CDF2(15562) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(23312) },
+                                        { AOM_CDF2(21607) },
+                                        { AOM_CDF2(16526) },
+                                        { AOM_CDF2(18957) },
+                                        { AOM_CDF2(18034) },
+                                        { AOM_CDF2(18934) },
+                                        { AOM_CDF2(24247) },
+                                        { AOM_CDF2(16921) },
+                                        { AOM_CDF2(17080) },
+                                    },
+                                    {
+                                        { AOM_CDF2(26579) },
+                                        { AOM_CDF2(24910) },
+                                        { AOM_CDF2(18637) },
+                                        { AOM_CDF2(19800) },
+                                        { AOM_CDF2(20388) },
+                                        { AOM_CDF2(9887) },
+                                        { AOM_CDF2(15642) },
+                                        { AOM_CDF2(30198) },
+                                        { AOM_CDF2(24721) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(26998) },
+                                        { AOM_CDF2(16737) },
+                                        { AOM_CDF2(17838) },
+                                        { AOM_CDF2(18922) },
+                                        { AOM_CDF2(19515) },
+                                        { AOM_CDF2(18636) },
+                                        { AOM_CDF2(17333) },
+                                        { AOM_CDF2(15776) },
+                                        { AOM_CDF2(22658) },
+                                    },
+                                    {
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } } },
+                                { { {
+                                        { AOM_CDF2(20177) },
+                                        { AOM_CDF2(20789) },
+                                        { AOM_CDF2(20262) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(21416) },
+                                        { AOM_CDF2(20855) },
+                                        { AOM_CDF2(23410) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(20238) },
+                                        { AOM_CDF2(21057) },
+                                        { AOM_CDF2(19159) },
+                                        { AOM_CDF2(22337) },
+                                        { AOM_CDF2(20159) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(20125) },
+                                        { AOM_CDF2(20559) },
+                                        { AOM_CDF2(21707) },
+                                        { AOM_CDF2(22296) },
+                                        { AOM_CDF2(17333) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(19941) },
+                                        { AOM_CDF2(20527) },
+                                        { AOM_CDF2(21470) },
+                                        { AOM_CDF2(22487) },
+                                        { AOM_CDF2(19558) },
+                                        { AOM_CDF2(22354) },
+                                        { AOM_CDF2(20331) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    },
+                                    {
+                                        { AOM_CDF2(22752) },
+                                        { AOM_CDF2(25006) },
+                                        { AOM_CDF2(22075) },
+                                        { AOM_CDF2(21576) },
+                                        { AOM_CDF2(17740) },
+                                        { AOM_CDF2(21690) },
+                                        { AOM_CDF2(19211) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(21442) },
+                                        { AOM_CDF2(22358) },
+                                        { AOM_CDF2(18503) },
+                                        { AOM_CDF2(20291) },
+                                        { AOM_CDF2(19945) },
+                                        { AOM_CDF2(21294) },
+                                        { AOM_CDF2(21178) },
+                                        { AOM_CDF2(19400) },
+                                        { AOM_CDF2(10556) },
+                                    },
+                                    {
+                                        { AOM_CDF2(24648) },
+                                        { AOM_CDF2(24949) },
+                                        { AOM_CDF2(20708) },
+                                        { AOM_CDF2(23905) },
+                                        { AOM_CDF2(20501) },
+                                        { AOM_CDF2(9558) },
+                                        { AOM_CDF2(9423) },
+                                        { AOM_CDF2(30365) },
+                                        { AOM_CDF2(19253) },
+                                    } },
+                                  { {
+                                        { AOM_CDF2(26064) },
+                                        { AOM_CDF2(22098) },
+                                        { AOM_CDF2(19613) },
+                                        { AOM_CDF2(20525) },
+                                        { AOM_CDF2(17595) },
+                                        { AOM_CDF2(16618) },
+                                        { AOM_CDF2(20497) },
+                                        { AOM_CDF2(18989) },
+                                        { AOM_CDF2(15513) },
+                                    },
+                                    {
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                        { AOM_CDF2(16384) },
+                                    } } }
+                              };
+
+static const aom_cdf_prob
+    av1_default_eob_multi16_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+        5)] = { { { { AOM_CDF5(840, 1039, 1980, 4895) },
+                    { AOM_CDF5(370, 671, 1883, 4471) } },
+                  { { AOM_CDF5(3247, 4950, 9688, 14563) },
+                    { AOM_CDF5(1904, 3354, 7763, 14647) } } },
+                { { { AOM_CDF5(2125, 2551, 5165, 8946) },
+                    { AOM_CDF5(513, 765, 1859, 6339) } },
+                  { { AOM_CDF5(7637, 9498, 14259, 19108) },
+                    { AOM_CDF5(2497, 4096, 8866, 16993) } } },
+                { { { AOM_CDF5(4016, 4897, 8881, 14968) },
+                    { AOM_CDF5(716, 1105, 2646, 10056) } },
+                  { { AOM_CDF5(11139, 13270, 18241, 23566) },
+                    { AOM_CDF5(3192, 5032, 10297, 19755) } } },
+                { { { AOM_CDF5(6708, 8958, 14746, 22133) },
+                    { AOM_CDF5(1222, 2074, 4783, 15410) } },
+                  { { AOM_CDF5(19575, 21766, 26044, 29709) },
+                    { AOM_CDF5(7297, 10767, 19273, 28194) } } } };
+
+static const aom_cdf_prob
+    av1_default_eob_multi32_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+        6)] = { { { { AOM_CDF6(400, 520, 977, 2102, 6542) },
+                    { AOM_CDF6(210, 405, 1315, 3326, 7537) } },
+                  { { AOM_CDF6(2636, 4273, 7588, 11794, 20401) },
+                    { AOM_CDF6(1786, 3179, 6902, 11357, 19054) } } },
+                { { { AOM_CDF6(989, 1249, 2019, 4151, 10785) },
+                    { AOM_CDF6(313, 441, 1099, 2917, 8562) } },
+                  { { AOM_CDF6(8394, 10352, 13932, 18855, 26014) },
+                    { AOM_CDF6(2578, 4124, 8181, 13670, 24234) } } },
+                { { { AOM_CDF6(2515, 3003, 4452, 8162, 16041) },
+                    { AOM_CDF6(574, 821, 1836, 5089, 13128) } },
+                  { { AOM_CDF6(13468, 16303, 20361, 25105, 29281) },
+                    { AOM_CDF6(3542, 5502, 10415, 16760, 25644) } } },
+                { { { AOM_CDF6(4617, 5709, 8446, 13584, 23135) },
+                    { AOM_CDF6(1156, 1702, 3675, 9274, 20539) } },
+                  { { AOM_CDF6(22086, 24282, 27010, 29770, 31743) },
+                    { AOM_CDF6(7699, 10897, 20891, 26926, 31628) } } } };
+
+static const aom_cdf_prob
+    av1_default_eob_multi64_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+        7)] = { { { { AOM_CDF7(329, 498, 1101, 1784, 3265, 7758) },
+                    { AOM_CDF7(335, 730, 1459, 5494, 8755, 12997) } },
+                  { { AOM_CDF7(3505, 5304, 10086, 13814, 17684, 23370) },
+                    { AOM_CDF7(1563, 2700, 4876, 10911, 14706, 22480) } } },
+                { { { AOM_CDF7(1260, 1446, 2253, 3712, 6652, 13369) },
+                    { AOM_CDF7(401, 605, 1029, 2563, 5845, 12626) } },
+                  { { AOM_CDF7(8609, 10612, 14624, 18714, 22614, 29024) },
+                    { AOM_CDF7(1923, 3127, 5867, 9703, 14277, 27100) } } },
+                { { { AOM_CDF7(2374, 2772, 4583, 7276, 12288, 19706) },
+                    { AOM_CDF7(497, 810, 1315, 3000, 7004, 15641) } },
+                  { { AOM_CDF7(15050, 17126, 21410, 24886, 28156, 30726) },
+                    { AOM_CDF7(4034, 6290, 10235, 14982, 21214, 28491) } } },
+                { { { AOM_CDF7(6307, 7541, 12060, 16358, 22553, 27865) },
+                    { AOM_CDF7(1289, 2320, 3971, 7926, 14153, 24291) } },
+                  { { AOM_CDF7(24212, 25708, 28268, 30035, 31307, 32049) },
+                    { AOM_CDF7(8726, 12378, 19409, 26450, 30038, 32462) } } } };
+
+static const aom_cdf_prob
+    av1_default_eob_multi128_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+        8)] = {
+      { { { AOM_CDF8(219, 482, 1140, 2091, 3680, 6028, 12586) },
+          { AOM_CDF8(371, 699, 1254, 4830, 9479, 12562, 17497) } },
+        { { AOM_CDF8(5245, 7456, 12880, 15852, 20033, 23932, 27608) },
+          { AOM_CDF8(2054, 3472, 5869, 14232, 18242, 20590, 26752) } } },
+      { { { AOM_CDF8(685, 933, 1488, 2714, 4766, 8562, 19254) },
+          { AOM_CDF8(217, 352, 618, 2303, 5261, 9969, 17472) } },
+        { { AOM_CDF8(8045, 11200, 15497, 19595, 23948, 27408, 30938) },
+          { AOM_CDF8(2310, 4160, 7471, 14997, 17931, 20768, 30240) } } },
+      { { { AOM_CDF8(1366, 1738, 2527, 5016, 9355, 15797, 24643) },
+          { AOM_CDF8(354, 558, 944, 2760, 7287, 14037, 21779) } },
+        { { AOM_CDF8(13627, 16246, 20173, 24429, 27948, 30415, 31863) },
+          { AOM_CDF8(6275, 9889, 14769, 23164, 27988, 30493, 32272) } } },
+      { { { AOM_CDF8(3472, 4885, 7489, 12481, 18517, 24536, 29635) },
+          { AOM_CDF8(886, 1731, 3271, 8469, 15569, 22126, 28383) } },
+        { { AOM_CDF8(24313, 26062, 28385, 30107, 31217, 31898, 32345) },
+          { AOM_CDF8(9165, 13282, 21150, 30286, 31894, 32571, 32712) } } }
+    };
+
+static const aom_cdf_prob
+    av1_default_eob_multi256_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+        9)] = {
+      { { { AOM_CDF9(310, 584, 1887, 3589, 6168, 8611, 11352, 15652) },
+          { AOM_CDF9(998, 1850, 2998, 5604, 17341, 19888, 22899, 25583) } },
+        { { AOM_CDF9(2520, 3240, 5952, 8870, 12577, 17558, 19954, 24168) },
+          { AOM_CDF9(2203, 4130, 7435, 10739, 20652, 23681, 25609, 27261) } } },
+      { { { AOM_CDF9(1448, 2109, 4151, 6263, 9329, 13260, 17944, 23300) },
+          { AOM_CDF9(399, 1019, 1749, 3038, 10444, 15546, 22739, 27294) } },
+        { { AOM_CDF9(6402, 8148, 12623, 15072, 18728, 22847, 26447, 29377) },
+          { AOM_CDF9(1674, 3252, 5734, 10159, 22397, 23802, 24821, 30940) } } },
+      { { { AOM_CDF9(3089, 3920, 6038, 9460, 14266, 19881, 25766, 29176) },
+          { AOM_CDF9(1084, 2358, 3488, 5122, 11483, 18103, 26023, 29799) } },
+        { { AOM_CDF9(11514, 13794, 17480, 20754, 24361, 27378, 29492, 31277) },
+          { AOM_CDF9(6571, 9610, 15516, 21826, 29092, 30829, 31842,
+                     32708) } } },
+      { { { AOM_CDF9(5348, 7113, 11820, 15924, 22106, 26777, 30334, 31757) },
+          { AOM_CDF9(2453, 4474, 6307, 8777, 16474, 22975, 29000, 31547) } },
+        { { AOM_CDF9(23110, 24597, 27140, 28894, 30167, 30927, 31392, 32094) },
+          { AOM_CDF9(9998, 17661, 25178, 28097, 31308, 32038, 32403,
+                     32695) } } }
+    };
+
+static const aom_cdf_prob
+    av1_default_eob_multi512_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+        10)] = { { { { AOM_CDF10(641, 983, 3707, 5430, 10234, 14958, 18788,
+                                 23412, 26061) },
+                     { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+                                 26214, 29491) } },
+                   { { AOM_CDF10(5095, 6446, 9996, 13354, 16017, 17986, 20919,
+                                 26129, 29140) },
+                     { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+                                 26214, 29491) } } },
+                 { { { AOM_CDF10(1230, 2278, 5035, 7776, 11871, 15346, 19590,
+                                 24584, 28749) },
+                     { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+                                 26214, 29491) } },
+                   { { AOM_CDF10(7265, 9979, 15819, 19250, 21780, 23846, 26478,
+                                 28396, 31811) },
+                     { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+                                 26214, 29491) } } },
+                 { { { AOM_CDF10(2624, 3936, 6480, 9686, 13979, 17726, 23267,
+                                 28410, 31078) },
+                     { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+                                 26214, 29491) } },
+                   { { AOM_CDF10(12015, 14769, 19588, 22052, 24222, 25812,
+                                 27300, 29219, 32114) },
+                     { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+                                 26214, 29491) } } },
+                 { { { AOM_CDF10(5927, 7809, 10923, 14597, 19439, 24135, 28456,
+                                 31142, 32060) },
+                     { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+                                 26214, 29491) } },
+                   { { AOM_CDF10(21093, 23043, 25742, 27658, 29097, 29716,
+                                 30073, 30820, 31956) },
+                     { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+                                 26214, 29491) } } } };
 
-static const coeff_cdf_model
-av1_default_coef_head_cdfs_q1[TX_SIZES][PLANE_TYPES] = {
-    {  // TX 4X4
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(21480), AOM_ICDF(22344), AOM_ICDF(27339), AOM_ICDF(29181),
-    AOM_ICDF(29765), AOM_ICDF(32768), },
-    {AOM_ICDF(9705), AOM_ICDF(12374), AOM_ICDF(20269), AOM_ICDF(24109),
-    AOM_ICDF(25071), AOM_ICDF(32768), },
-    {AOM_ICDF(2883), AOM_ICDF(6716), AOM_ICDF(10461), AOM_ICDF(16169),
-    AOM_ICDF(17355), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(8632), AOM_ICDF(15472), AOM_ICDF(26027), AOM_ICDF(26596),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8543), AOM_ICDF(14383), AOM_ICDF(25665), AOM_ICDF(26207),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8561), AOM_ICDF(12583), AOM_ICDF(22962), AOM_ICDF(23529),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6538), AOM_ICDF(8023), AOM_ICDF(18106), AOM_ICDF(18672),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4363), AOM_ICDF(4797), AOM_ICDF(12512), AOM_ICDF(12937),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2471), AOM_ICDF(2791), AOM_ICDF(7274), AOM_ICDF(7605),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(14783), AOM_ICDF(18891), AOM_ICDF(29122), AOM_ICDF(29700),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11829), AOM_ICDF(16696), AOM_ICDF(28114), AOM_ICDF(28591),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8965), AOM_ICDF(11076), AOM_ICDF(23514), AOM_ICDF(24031),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6257), AOM_ICDF(7011), AOM_ICDF(17779), AOM_ICDF(18315),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4329), AOM_ICDF(4704), AOM_ICDF(12448), AOM_ICDF(12839),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2542), AOM_ICDF(2860), AOM_ICDF(7886), AOM_ICDF(8207),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(19181), AOM_ICDF(22038), AOM_ICDF(30697), AOM_ICDF(31106),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12174), AOM_ICDF(17208), AOM_ICDF(28897), AOM_ICDF(29328),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8420), AOM_ICDF(10706), AOM_ICDF(23788), AOM_ICDF(24321),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6153), AOM_ICDF(6850), AOM_ICDF(17983), AOM_ICDF(18530),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4168), AOM_ICDF(4524), AOM_ICDF(12547), AOM_ICDF(12983),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3136), AOM_ICDF(3480), AOM_ICDF(9221), AOM_ICDF(9659),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(18701), AOM_ICDF(23907), AOM_ICDF(31282), AOM_ICDF(31695),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12655), AOM_ICDF(19258), AOM_ICDF(29824), AOM_ICDF(30279),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8699), AOM_ICDF(11467), AOM_ICDF(24763), AOM_ICDF(25450),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6268), AOM_ICDF(7027), AOM_ICDF(18397), AOM_ICDF(19102),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5613), AOM_ICDF(6020), AOM_ICDF(14084), AOM_ICDF(14637),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2443), AOM_ICDF(2919), AOM_ICDF(8222), AOM_ICDF(8639),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(6156), AOM_ICDF(23586), AOM_ICDF(30739), AOM_ICDF(31476),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6056), AOM_ICDF(21852), AOM_ICDF(29323), AOM_ICDF(30442),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6113), AOM_ICDF(14408), AOM_ICDF(24331), AOM_ICDF(26899),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5825), AOM_ICDF(9328), AOM_ICDF(18946), AOM_ICDF(22143),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5023), AOM_ICDF(6340), AOM_ICDF(14812), AOM_ICDF(17429),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5140), AOM_ICDF(6104), AOM_ICDF(11565), AOM_ICDF(14135),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(12606), AOM_ICDF(20577), AOM_ICDF(21354), AOM_ICDF(29249),
-    AOM_ICDF(29714), AOM_ICDF(32768), },
-    {AOM_ICDF(8630), AOM_ICDF(17728), AOM_ICDF(19353), AOM_ICDF(27722),
-    AOM_ICDF(28219), AOM_ICDF(32768), },
-    {AOM_ICDF(3040), AOM_ICDF(12616), AOM_ICDF(14286), AOM_ICDF(23918),
-    AOM_ICDF(24539), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(20824), AOM_ICDF(21610), AOM_ICDF(31110), AOM_ICDF(31445),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15597), AOM_ICDF(17692), AOM_ICDF(29670), AOM_ICDF(30015),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8954), AOM_ICDF(10007), AOM_ICDF(23515), AOM_ICDF(23902),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6693), AOM_ICDF(7282), AOM_ICDF(18144), AOM_ICDF(18537),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4048), AOM_ICDF(4451), AOM_ICDF(12255), AOM_ICDF(12626),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2619), AOM_ICDF(2960), AOM_ICDF(7084), AOM_ICDF(7429),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(21628), AOM_ICDF(22786), AOM_ICDF(31520), AOM_ICDF(31865),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15854), AOM_ICDF(17925), AOM_ICDF(29872), AOM_ICDF(30228),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8120), AOM_ICDF(8815), AOM_ICDF(22575), AOM_ICDF(22964),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5006), AOM_ICDF(5427), AOM_ICDF(15724), AOM_ICDF(16101),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2967), AOM_ICDF(3311), AOM_ICDF(9553), AOM_ICDF(9913),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2878), AOM_ICDF(3188), AOM_ICDF(5418), AOM_ICDF(5825),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(21594), AOM_ICDF(23721), AOM_ICDF(31496), AOM_ICDF(31872),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15704), AOM_ICDF(18452), AOM_ICDF(30207), AOM_ICDF(30585),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8637), AOM_ICDF(9546), AOM_ICDF(23803), AOM_ICDF(24254),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5991), AOM_ICDF(6479), AOM_ICDF(17619), AOM_ICDF(18099),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3856), AOM_ICDF(4220), AOM_ICDF(11623), AOM_ICDF(12111),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3501), AOM_ICDF(3825), AOM_ICDF(6760), AOM_ICDF(7246),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(19929), AOM_ICDF(23849), AOM_ICDF(31581), AOM_ICDF(31956),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14239), AOM_ICDF(19461), AOM_ICDF(30323), AOM_ICDF(30761),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8094), AOM_ICDF(9844), AOM_ICDF(23595), AOM_ICDF(24338),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5204), AOM_ICDF(5848), AOM_ICDF(16396), AOM_ICDF(17121),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3568), AOM_ICDF(3961), AOM_ICDF(10658), AOM_ICDF(11301),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1594), AOM_ICDF(1913), AOM_ICDF(5552), AOM_ICDF(6040),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(13512), AOM_ICDF(24112), AOM_ICDF(31648), AOM_ICDF(32057),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10595), AOM_ICDF(22378), AOM_ICDF(30592), AOM_ICDF(31236),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7571), AOM_ICDF(13305), AOM_ICDF(24936), AOM_ICDF(26656),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6163), AOM_ICDF(8207), AOM_ICDF(18688), AOM_ICDF(20500),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3185), AOM_ICDF(4449), AOM_ICDF(13298), AOM_ICDF(14707),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1890), AOM_ICDF(2731), AOM_ICDF(7562), AOM_ICDF(8192),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(26689), AOM_ICDF(27259), AOM_ICDF(30590), AOM_ICDF(31538),
-    AOM_ICDF(31930), AOM_ICDF(32768), },
-    {AOM_ICDF(17843), AOM_ICDF(19709), AOM_ICDF(27299), AOM_ICDF(29813),
-    AOM_ICDF(30435), AOM_ICDF(32768), },
-    {AOM_ICDF(9138), AOM_ICDF(13232), AOM_ICDF(20487), AOM_ICDF(25798),
-    AOM_ICDF(26794), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(13264), AOM_ICDF(22970), AOM_ICDF(30914), AOM_ICDF(31354),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11647), AOM_ICDF(20651), AOM_ICDF(30191), AOM_ICDF(30692),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10449), AOM_ICDF(15871), AOM_ICDF(27240), AOM_ICDF(27909),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7759), AOM_ICDF(9400), AOM_ICDF(22161), AOM_ICDF(22812),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4095), AOM_ICDF(4544), AOM_ICDF(13856), AOM_ICDF(14309),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3199), AOM_ICDF(3509), AOM_ICDF(8639), AOM_ICDF(8964),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(18180), AOM_ICDF(25717), AOM_ICDF(31446), AOM_ICDF(31899),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14593), AOM_ICDF(22211), AOM_ICDF(30845), AOM_ICDF(31282),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10443), AOM_ICDF(13816), AOM_ICDF(27239), AOM_ICDF(27789),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6760), AOM_ICDF(7698), AOM_ICDF(19648), AOM_ICDF(20234),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3896), AOM_ICDF(4253), AOM_ICDF(12678), AOM_ICDF(13056),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5461), AOM_ICDF(6722), AOM_ICDF(13443), AOM_ICDF(14704),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(22145), AOM_ICDF(27566), AOM_ICDF(31813), AOM_ICDF(32212),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15241), AOM_ICDF(23215), AOM_ICDF(31215), AOM_ICDF(31658),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11148), AOM_ICDF(15527), AOM_ICDF(28336), AOM_ICDF(28891),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8864), AOM_ICDF(10402), AOM_ICDF(24069), AOM_ICDF(24811),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6919), AOM_ICDF(7527), AOM_ICDF(19607), AOM_ICDF(20260),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(18971), AOM_ICDF(25869),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(18795), AOM_ICDF(27901), AOM_ICDF(31907), AOM_ICDF(32272),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13177), AOM_ICDF(24166), AOM_ICDF(31395), AOM_ICDF(31820),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9217), AOM_ICDF(15410), AOM_ICDF(28101), AOM_ICDF(28868),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6328), AOM_ICDF(8749), AOM_ICDF(21695), AOM_ICDF(22954),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15672), AOM_ICDF(17809), AOM_ICDF(22795), AOM_ICDF(24932),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(9431), AOM_ICDF(28094), AOM_ICDF(31965), AOM_ICDF(32338),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8107), AOM_ICDF(26038), AOM_ICDF(31393), AOM_ICDF(32024),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9347), AOM_ICDF(19880), AOM_ICDF(28342), AOM_ICDF(29759),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7092), AOM_ICDF(13694), AOM_ICDF(25432), AOM_ICDF(28366),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7802), AOM_ICDF(12483), AOM_ICDF(21845), AOM_ICDF(26526),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(29212), AOM_ICDF(29998), AOM_ICDF(31256), AOM_ICDF(32035),
-    AOM_ICDF(32360), AOM_ICDF(32768), },
-    {AOM_ICDF(19150), AOM_ICDF(23189), AOM_ICDF(28117), AOM_ICDF(31168),
-    AOM_ICDF(31611), AOM_ICDF(32768), },
-    {AOM_ICDF(9324), AOM_ICDF(18178), AOM_ICDF(23556), AOM_ICDF(29422),
-    AOM_ICDF(30204), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(20406), AOM_ICDF(26462), AOM_ICDF(31971), AOM_ICDF(32298),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15834), AOM_ICDF(22647), AOM_ICDF(31547), AOM_ICDF(31902),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11047), AOM_ICDF(15431), AOM_ICDF(27825), AOM_ICDF(28393),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8665), AOM_ICDF(11083), AOM_ICDF(22493), AOM_ICDF(23423),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6191), AOM_ICDF(7733), AOM_ICDF(16624), AOM_ICDF(17708),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3210), AOM_ICDF(3875), AOM_ICDF(10937), AOM_ICDF(11867),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(21520), AOM_ICDF(27152), AOM_ICDF(31994), AOM_ICDF(32324),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17519), AOM_ICDF(23609), AOM_ICDF(31670), AOM_ICDF(32022),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10647), AOM_ICDF(14610), AOM_ICDF(28389), AOM_ICDF(28873),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7660), AOM_ICDF(10704), AOM_ICDF(22849), AOM_ICDF(23680),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5535), AOM_ICDF(6454), AOM_ICDF(17275), AOM_ICDF(17753),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4096), AOM_ICDF(6144), AOM_ICDF(13653), AOM_ICDF(15701),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(22487), AOM_ICDF(27996), AOM_ICDF(32020), AOM_ICDF(32381),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17371), AOM_ICDF(24453), AOM_ICDF(31777), AOM_ICDF(32152),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11366), AOM_ICDF(16072), AOM_ICDF(29193), AOM_ICDF(29761),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12545), AOM_ICDF(13869), AOM_ICDF(24642), AOM_ICDF(25603),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4119), AOM_ICDF(5056), AOM_ICDF(16103), AOM_ICDF(17601),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(19350), AOM_ICDF(28517), AOM_ICDF(32050), AOM_ICDF(32401),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14752), AOM_ICDF(25831), AOM_ICDF(31897), AOM_ICDF(32261),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11157), AOM_ICDF(20816), AOM_ICDF(29821), AOM_ICDF(30635),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8157), AOM_ICDF(9691), AOM_ICDF(22868), AOM_ICDF(23705),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(10650), AOM_ICDF(17203), AOM_ICDF(19661),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(15557), AOM_ICDF(29043), AOM_ICDF(32047), AOM_ICDF(32424),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10253), AOM_ICDF(27948), AOM_ICDF(31922), AOM_ICDF(32329),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7797), AOM_ICDF(18860), AOM_ICDF(28870), AOM_ICDF(30661),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5617), AOM_ICDF(11235), AOM_ICDF(27151), AOM_ICDF(29959),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 8X8
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(20585), AOM_ICDF(21554), AOM_ICDF(27179), AOM_ICDF(28995),
-    AOM_ICDF(30170), AOM_ICDF(32768), },
-    {AOM_ICDF(6316), AOM_ICDF(8987), AOM_ICDF(15571), AOM_ICDF(19766),
-    AOM_ICDF(21417), AOM_ICDF(32768), },
-    {AOM_ICDF(1426), AOM_ICDF(4693), AOM_ICDF(6721), AOM_ICDF(11940),
-    AOM_ICDF(12874), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(10177), AOM_ICDF(14297), AOM_ICDF(24926), AOM_ICDF(25396),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8812), AOM_ICDF(13381), AOM_ICDF(24128), AOM_ICDF(24649),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8090), AOM_ICDF(11314), AOM_ICDF(21329), AOM_ICDF(21906),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6324), AOM_ICDF(7511), AOM_ICDF(17212), AOM_ICDF(17717),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4272), AOM_ICDF(4718), AOM_ICDF(12016), AOM_ICDF(12415),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2129), AOM_ICDF(2445), AOM_ICDF(6433), AOM_ICDF(6755),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(15709), AOM_ICDF(18339), AOM_ICDF(28174), AOM_ICDF(28566),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12592), AOM_ICDF(15866), AOM_ICDF(27071), AOM_ICDF(27475),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9361), AOM_ICDF(10768), AOM_ICDF(22752), AOM_ICDF(23166),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6525), AOM_ICDF(7048), AOM_ICDF(17478), AOM_ICDF(17863),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4314), AOM_ICDF(4656), AOM_ICDF(12242), AOM_ICDF(12579),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2419), AOM_ICDF(2735), AOM_ICDF(7387), AOM_ICDF(7707),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(20453), AOM_ICDF(22253), AOM_ICDF(29963), AOM_ICDF(30329),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14090), AOM_ICDF(16483), AOM_ICDF(27992), AOM_ICDF(28355),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8737), AOM_ICDF(9396), AOM_ICDF(22134), AOM_ICDF(22499),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5543), AOM_ICDF(5904), AOM_ICDF(15783), AOM_ICDF(16122),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3358), AOM_ICDF(3677), AOM_ICDF(10362), AOM_ICDF(10680),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1875), AOM_ICDF(2187), AOM_ICDF(5982), AOM_ICDF(6294),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(23693), AOM_ICDF(25306), AOM_ICDF(31174), AOM_ICDF(31516),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14804), AOM_ICDF(16843), AOM_ICDF(28713), AOM_ICDF(29058),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8442), AOM_ICDF(8976), AOM_ICDF(22003), AOM_ICDF(22353),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5397), AOM_ICDF(5741), AOM_ICDF(15529), AOM_ICDF(15867),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3322), AOM_ICDF(3639), AOM_ICDF(10248), AOM_ICDF(10570),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1852), AOM_ICDF(2161), AOM_ICDF(5980), AOM_ICDF(6290),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(24219), AOM_ICDF(26214), AOM_ICDF(31501), AOM_ICDF(31844),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15202), AOM_ICDF(17709), AOM_ICDF(29450), AOM_ICDF(29807),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9044), AOM_ICDF(9603), AOM_ICDF(23134), AOM_ICDF(23506),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5849), AOM_ICDF(6187), AOM_ICDF(16695), AOM_ICDF(17032),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3734), AOM_ICDF(4050), AOM_ICDF(11408), AOM_ICDF(11727),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1898), AOM_ICDF(2201), AOM_ICDF(6126), AOM_ICDF(6430),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(10195), AOM_ICDF(21186), AOM_ICDF(23530), AOM_ICDF(29551),
-    AOM_ICDF(30281), AOM_ICDF(32768), },
-    {AOM_ICDF(3950), AOM_ICDF(15607), AOM_ICDF(18726), AOM_ICDF(26764),
-    AOM_ICDF(27758), AOM_ICDF(32768), },
-    {AOM_ICDF(942), AOM_ICDF(11209), AOM_ICDF(12954), AOM_ICDF(22126),
-    AOM_ICDF(23296), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(24110), AOM_ICDF(24717), AOM_ICDF(31199), AOM_ICDF(31532),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16869), AOM_ICDF(18762), AOM_ICDF(29600), AOM_ICDF(29951),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10702), AOM_ICDF(12122), AOM_ICDF(25122), AOM_ICDF(25503),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8221), AOM_ICDF(9053), AOM_ICDF(20816), AOM_ICDF(21206),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5635), AOM_ICDF(6244), AOM_ICDF(15801), AOM_ICDF(16186),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3776), AOM_ICDF(4210), AOM_ICDF(10380), AOM_ICDF(10766),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(24719), AOM_ICDF(25439), AOM_ICDF(31522), AOM_ICDF(31849),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16693), AOM_ICDF(18162), AOM_ICDF(29698), AOM_ICDF(30036),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9340), AOM_ICDF(10024), AOM_ICDF(23513), AOM_ICDF(23867),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6269), AOM_ICDF(6709), AOM_ICDF(17711), AOM_ICDF(18060),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3841), AOM_ICDF(4185), AOM_ICDF(11892), AOM_ICDF(12230),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1944), AOM_ICDF(2259), AOM_ICDF(6437), AOM_ICDF(6776),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(25795), AOM_ICDF(26524), AOM_ICDF(31784), AOM_ICDF(32108),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17514), AOM_ICDF(18812), AOM_ICDF(30221), AOM_ICDF(30557),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9099), AOM_ICDF(9576), AOM_ICDF(23502), AOM_ICDF(23843),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5738), AOM_ICDF(6097), AOM_ICDF(16847), AOM_ICDF(17182),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3411), AOM_ICDF(3730), AOM_ICDF(10729), AOM_ICDF(11057),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1282), AOM_ICDF(1591), AOM_ICDF(4705), AOM_ICDF(5013),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(26360), AOM_ICDF(27205), AOM_ICDF(31918), AOM_ICDF(32240),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18465), AOM_ICDF(19729), AOM_ICDF(30758), AOM_ICDF(31089),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9488), AOM_ICDF(9915), AOM_ICDF(24339), AOM_ICDF(24678),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5812), AOM_ICDF(6156), AOM_ICDF(17325), AOM_ICDF(17661),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3739), AOM_ICDF(4065), AOM_ICDF(10932), AOM_ICDF(11265),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1391), AOM_ICDF(1700), AOM_ICDF(4764), AOM_ICDF(5073),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(27036), AOM_ICDF(28212), AOM_ICDF(31970), AOM_ICDF(32305),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18634), AOM_ICDF(21073), AOM_ICDF(31116), AOM_ICDF(31477),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9822), AOM_ICDF(10441), AOM_ICDF(24990), AOM_ICDF(25437),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6130), AOM_ICDF(6530), AOM_ICDF(17790), AOM_ICDF(18269),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3725), AOM_ICDF(4044), AOM_ICDF(11127), AOM_ICDF(11602),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1298), AOM_ICDF(1573), AOM_ICDF(4642), AOM_ICDF(5075),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(23042), AOM_ICDF(23702), AOM_ICDF(30487), AOM_ICDF(31370),
-    AOM_ICDF(31898), AOM_ICDF(32768), },
-    {AOM_ICDF(15512), AOM_ICDF(17357), AOM_ICDF(27018), AOM_ICDF(29404),
-    AOM_ICDF(30377), AOM_ICDF(32768), },
-    {AOM_ICDF(8935), AOM_ICDF(12713), AOM_ICDF(20545), AOM_ICDF(25580),
-    AOM_ICDF(26931), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(15021), AOM_ICDF(24086), AOM_ICDF(30796), AOM_ICDF(31272),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13040), AOM_ICDF(21866), AOM_ICDF(30054), AOM_ICDF(30686),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10915), AOM_ICDF(16852), AOM_ICDF(27467), AOM_ICDF(28235),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8096), AOM_ICDF(10403), AOM_ICDF(22531), AOM_ICDF(23355),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4485), AOM_ICDF(5020), AOM_ICDF(13360), AOM_ICDF(13816),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1728), AOM_ICDF(2067), AOM_ICDF(5998), AOM_ICDF(6337),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(20845), AOM_ICDF(25929), AOM_ICDF(31278), AOM_ICDF(31670),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15553), AOM_ICDF(21602), AOM_ICDF(30338), AOM_ICDF(30745),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10953), AOM_ICDF(13829), AOM_ICDF(26398), AOM_ICDF(26854),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7900), AOM_ICDF(8858), AOM_ICDF(20869), AOM_ICDF(21378),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5225), AOM_ICDF(5579), AOM_ICDF(13764), AOM_ICDF(14087),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1881), AOM_ICDF(2352), AOM_ICDF(6742), AOM_ICDF(7212),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(25402), AOM_ICDF(28169), AOM_ICDF(31825), AOM_ICDF(32169),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17086), AOM_ICDF(21375), AOM_ICDF(30582), AOM_ICDF(30951),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11057), AOM_ICDF(12358), AOM_ICDF(25930), AOM_ICDF(26346),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6989), AOM_ICDF(7448), AOM_ICDF(18814), AOM_ICDF(19143),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4476), AOM_ICDF(4752), AOM_ICDF(16025), AOM_ICDF(16301),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2185), AOM_ICDF(4369), AOM_ICDF(12379), AOM_ICDF(14564),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(26444), AOM_ICDF(28656), AOM_ICDF(31864), AOM_ICDF(32231),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17642), AOM_ICDF(20848), AOM_ICDF(30615), AOM_ICDF(30967),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10973), AOM_ICDF(11732), AOM_ICDF(25256), AOM_ICDF(25612),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8325), AOM_ICDF(8726), AOM_ICDF(19826), AOM_ICDF(20146),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5294), AOM_ICDF(5568), AOM_ICDF(14056), AOM_ICDF(14330),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5461), AOM_ICDF(10923), AOM_ICDF(18204), AOM_ICDF(23666),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(27760), AOM_ICDF(29748), AOM_ICDF(31934), AOM_ICDF(32299),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17133), AOM_ICDF(21599), AOM_ICDF(30800), AOM_ICDF(31243),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12224), AOM_ICDF(13907), AOM_ICDF(26992), AOM_ICDF(27546),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9221), AOM_ICDF(9617), AOM_ICDF(21845), AOM_ICDF(22162),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5401), AOM_ICDF(6482), AOM_ICDF(18004), AOM_ICDF(19085),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(29286), AOM_ICDF(29932), AOM_ICDF(31576), AOM_ICDF(32075),
-    AOM_ICDF(32408), AOM_ICDF(32768), },
-    {AOM_ICDF(17969), AOM_ICDF(21693), AOM_ICDF(28937), AOM_ICDF(30945),
-    AOM_ICDF(31682), AOM_ICDF(32768), },
-    {AOM_ICDF(6607), AOM_ICDF(16160), AOM_ICDF(23280), AOM_ICDF(27595),
-    AOM_ICDF(30027), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(24724), AOM_ICDF(28333), AOM_ICDF(32022), AOM_ICDF(32346),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18803), AOM_ICDF(24728), AOM_ICDF(31661), AOM_ICDF(32022),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14179), AOM_ICDF(20757), AOM_ICDF(30098), AOM_ICDF(30633),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12564), AOM_ICDF(17179), AOM_ICDF(27133), AOM_ICDF(28080),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10543), AOM_ICDF(13479), AOM_ICDF(23725), AOM_ICDF(25031),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11377), AOM_ICDF(12741), AOM_ICDF(21923), AOM_ICDF(22888),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(26071), AOM_ICDF(28609), AOM_ICDF(32053), AOM_ICDF(32374),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20389), AOM_ICDF(24820), AOM_ICDF(31690), AOM_ICDF(32027),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12977), AOM_ICDF(16892), AOM_ICDF(29053), AOM_ICDF(29445),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8745), AOM_ICDF(12303), AOM_ICDF(24164), AOM_ICDF(25209),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4042), AOM_ICDF(5052), AOM_ICDF(18333), AOM_ICDF(18910),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5461), AOM_ICDF(9557), AOM_ICDF(13653), AOM_ICDF(17749),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(27936), AOM_ICDF(29582), AOM_ICDF(32107), AOM_ICDF(32422),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22472), AOM_ICDF(25761), AOM_ICDF(31858), AOM_ICDF(32177),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14107), AOM_ICDF(16587), AOM_ICDF(29250), AOM_ICDF(29692),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10726), AOM_ICDF(11739), AOM_ICDF(23985), AOM_ICDF(24576),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5825), AOM_ICDF(8010), AOM_ICDF(18204), AOM_ICDF(20389),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(27066), AOM_ICDF(29025), AOM_ICDF(31972), AOM_ICDF(32338),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20639), AOM_ICDF(23330), AOM_ICDF(31616), AOM_ICDF(31985),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13468), AOM_ICDF(15091), AOM_ICDF(29902), AOM_ICDF(30243),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14473), AOM_ICDF(15019), AOM_ICDF(24030), AOM_ICDF(24439),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7864), AOM_ICDF(11796), AOM_ICDF(19661), AOM_ICDF(23593),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(28741), AOM_ICDF(30503), AOM_ICDF(32039), AOM_ICDF(32388),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19712), AOM_ICDF(25328), AOM_ICDF(31621), AOM_ICDF(32049),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13461), AOM_ICDF(17167), AOM_ICDF(29712), AOM_ICDF(30308),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10285), AOM_ICDF(11242), AOM_ICDF(27267), AOM_ICDF(28224),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(17246), AOM_ICDF(22420),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 16X16
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(4353), AOM_ICDF(7056), AOM_ICDF(15884), AOM_ICDF(20594),
-    AOM_ICDF(24026), AOM_ICDF(32768), },
-    {AOM_ICDF(2397), AOM_ICDF(5417), AOM_ICDF(9610), AOM_ICDF(14451),
-    AOM_ICDF(16689), AOM_ICDF(32768), },
-    {AOM_ICDF(841), AOM_ICDF(3543), AOM_ICDF(4598), AOM_ICDF(9149),
-    AOM_ICDF(9950), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(8763), AOM_ICDF(11845), AOM_ICDF(22684), AOM_ICDF(23211),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8074), AOM_ICDF(12129), AOM_ICDF(22232), AOM_ICDF(22924),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7453), AOM_ICDF(10017), AOM_ICDF(19822), AOM_ICDF(20662),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5825), AOM_ICDF(6998), AOM_ICDF(16346), AOM_ICDF(16952),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4059), AOM_ICDF(4481), AOM_ICDF(11444), AOM_ICDF(11852),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1973), AOM_ICDF(2289), AOM_ICDF(5827), AOM_ICDF(6149),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(15272), AOM_ICDF(17017), AOM_ICDF(26959), AOM_ICDF(27346),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12476), AOM_ICDF(14916), AOM_ICDF(26163), AOM_ICDF(26575),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9485), AOM_ICDF(10720), AOM_ICDF(22557), AOM_ICDF(22973),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6821), AOM_ICDF(7342), AOM_ICDF(17484), AOM_ICDF(17858),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4370), AOM_ICDF(4714), AOM_ICDF(12030), AOM_ICDF(12366),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2375), AOM_ICDF(2688), AOM_ICDF(6850), AOM_ICDF(7162),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(19929), AOM_ICDF(21244), AOM_ICDF(29489), AOM_ICDF(29829),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14005), AOM_ICDF(16066), AOM_ICDF(27595), AOM_ICDF(27947),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8918), AOM_ICDF(9550), AOM_ICDF(22126), AOM_ICDF(22488),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5741), AOM_ICDF(6095), AOM_ICDF(16004), AOM_ICDF(16340),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3558), AOM_ICDF(3873), AOM_ICDF(10340), AOM_ICDF(10657),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1822), AOM_ICDF(2134), AOM_ICDF(5530), AOM_ICDF(5843),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(23568), AOM_ICDF(24663), AOM_ICDF(30915), AOM_ICDF(31245),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15139), AOM_ICDF(16577), AOM_ICDF(28661), AOM_ICDF(28997),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8850), AOM_ICDF(9259), AOM_ICDF(22366), AOM_ICDF(22700),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5454), AOM_ICDF(5781), AOM_ICDF(15617), AOM_ICDF(15937),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3315), AOM_ICDF(3629), AOM_ICDF(10044), AOM_ICDF(10359),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1736), AOM_ICDF(2047), AOM_ICDF(5698), AOM_ICDF(6009),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(27011), AOM_ICDF(27875), AOM_ICDF(31721), AOM_ICDF(32046),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16855), AOM_ICDF(18018), AOM_ICDF(29676), AOM_ICDF(30005),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8916), AOM_ICDF(9282), AOM_ICDF(22431), AOM_ICDF(22760),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5391), AOM_ICDF(5710), AOM_ICDF(15343), AOM_ICDF(15662),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3316), AOM_ICDF(3629), AOM_ICDF(10223), AOM_ICDF(10537),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1891), AOM_ICDF(2202), AOM_ICDF(6076), AOM_ICDF(6387),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(5744), AOM_ICDF(15508), AOM_ICDF(23294), AOM_ICDF(28653),
-    AOM_ICDF(30781), AOM_ICDF(32768), },
-    {AOM_ICDF(2130), AOM_ICDF(11786), AOM_ICDF(17337), AOM_ICDF(24444),
-    AOM_ICDF(27499), AOM_ICDF(32768), },
-    {AOM_ICDF(615), AOM_ICDF(8230), AOM_ICDF(10191), AOM_ICDF(18291),
-    AOM_ICDF(21029), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(25149), AOM_ICDF(25880), AOM_ICDF(31110), AOM_ICDF(31453),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17454), AOM_ICDF(20460), AOM_ICDF(29560), AOM_ICDF(29929),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11724), AOM_ICDF(14294), AOM_ICDF(25947), AOM_ICDF(26377),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9198), AOM_ICDF(10981), AOM_ICDF(22357), AOM_ICDF(22857),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7164), AOM_ICDF(8069), AOM_ICDF(18345), AOM_ICDF(18857),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5833), AOM_ICDF(6316), AOM_ICDF(14661), AOM_ICDF(15073),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(26117), AOM_ICDF(26928), AOM_ICDF(31526), AOM_ICDF(31850),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16540), AOM_ICDF(18394), AOM_ICDF(29402), AOM_ICDF(29740),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9908), AOM_ICDF(10886), AOM_ICDF(23865), AOM_ICDF(24223),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6805), AOM_ICDF(7383), AOM_ICDF(18402), AOM_ICDF(18777),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4259), AOM_ICDF(4638), AOM_ICDF(12791), AOM_ICDF(13136),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2274), AOM_ICDF(2584), AOM_ICDF(7391), AOM_ICDF(7713),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(27129), AOM_ICDF(27797), AOM_ICDF(31745), AOM_ICDF(32063),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17255), AOM_ICDF(18663), AOM_ICDF(29815), AOM_ICDF(30145),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9538), AOM_ICDF(10091), AOM_ICDF(23590), AOM_ICDF(23931),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6366), AOM_ICDF(6732), AOM_ICDF(17467), AOM_ICDF(17800),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3701), AOM_ICDF(4018), AOM_ICDF(11326), AOM_ICDF(11652),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1976), AOM_ICDF(2284), AOM_ICDF(6325), AOM_ICDF(6633),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(27944), AOM_ICDF(28479), AOM_ICDF(31894), AOM_ICDF(32211),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18032), AOM_ICDF(18997), AOM_ICDF(30130), AOM_ICDF(30452),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9467), AOM_ICDF(9842), AOM_ICDF(23729), AOM_ICDF(24051),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5900), AOM_ICDF(6226), AOM_ICDF(16797), AOM_ICDF(17116),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3282), AOM_ICDF(3595), AOM_ICDF(10418), AOM_ICDF(10730),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2289), AOM_ICDF(2601), AOM_ICDF(6048), AOM_ICDF(6360),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29278), AOM_ICDF(29837), AOM_ICDF(32038), AOM_ICDF(32360),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19805), AOM_ICDF(20846), AOM_ICDF(31007), AOM_ICDF(31343),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9976), AOM_ICDF(10433), AOM_ICDF(24483), AOM_ICDF(24848),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5971), AOM_ICDF(6354), AOM_ICDF(17184), AOM_ICDF(17539),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3497), AOM_ICDF(4693), AOM_ICDF(11940), AOM_ICDF(12291),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1776), AOM_ICDF(2357), AOM_ICDF(6260), AOM_ICDF(6918),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(23166), AOM_ICDF(23821), AOM_ICDF(30269), AOM_ICDF(31075),
-    AOM_ICDF(31847), AOM_ICDF(32768), },
-    {AOM_ICDF(14510), AOM_ICDF(16494), AOM_ICDF(25635), AOM_ICDF(28335),
-    AOM_ICDF(29759), AOM_ICDF(32768), },
-    {AOM_ICDF(7730), AOM_ICDF(12354), AOM_ICDF(18089), AOM_ICDF(24005),
-    AOM_ICDF(25442), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(17908), AOM_ICDF(24824), AOM_ICDF(30533), AOM_ICDF(31042),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13950), AOM_ICDF(22899), AOM_ICDF(29969), AOM_ICDF(30646),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11728), AOM_ICDF(17834), AOM_ICDF(27214), AOM_ICDF(28218),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9581), AOM_ICDF(12074), AOM_ICDF(23689), AOM_ICDF(24616),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6193), AOM_ICDF(6855), AOM_ICDF(16430), AOM_ICDF(16955),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3393), AOM_ICDF(3712), AOM_ICDF(8802), AOM_ICDF(9226),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(23368), AOM_ICDF(26826), AOM_ICDF(31183), AOM_ICDF(31579),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16523), AOM_ICDF(21603), AOM_ICDF(30044), AOM_ICDF(30503),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11171), AOM_ICDF(14152), AOM_ICDF(27009), AOM_ICDF(27644),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8523), AOM_ICDF(9348), AOM_ICDF(21021), AOM_ICDF(21595),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4780), AOM_ICDF(5196), AOM_ICDF(13440), AOM_ICDF(13786),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4328), AOM_ICDF(5255), AOM_ICDF(10820), AOM_ICDF(11747),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(27020), AOM_ICDF(28644), AOM_ICDF(31643), AOM_ICDF(31990),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18016), AOM_ICDF(21678), AOM_ICDF(30346), AOM_ICDF(30712),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10497), AOM_ICDF(11555), AOM_ICDF(24827), AOM_ICDF(25156),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6370), AOM_ICDF(6703), AOM_ICDF(18612), AOM_ICDF(18903),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5355), AOM_ICDF(5738), AOM_ICDF(14790), AOM_ICDF(15173),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3486), AOM_ICDF(5578), AOM_ICDF(11155), AOM_ICDF(13247),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28933), AOM_ICDF(29746), AOM_ICDF(31882), AOM_ICDF(32203),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18171), AOM_ICDF(20286), AOM_ICDF(29713), AOM_ICDF(30052),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9732), AOM_ICDF(10163), AOM_ICDF(23952), AOM_ICDF(24275),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6084), AOM_ICDF(6480), AOM_ICDF(17459), AOM_ICDF(17771),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3250), AOM_ICDF(3656), AOM_ICDF(10291), AOM_ICDF(10697),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4681), AOM_ICDF(8192), AOM_ICDF(15214), AOM_ICDF(18725),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29940), AOM_ICDF(30510), AOM_ICDF(31933), AOM_ICDF(32260),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17688), AOM_ICDF(19258), AOM_ICDF(29757), AOM_ICDF(30125),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9668), AOM_ICDF(10798), AOM_ICDF(24231), AOM_ICDF(24605),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7580), AOM_ICDF(7942), AOM_ICDF(19364), AOM_ICDF(19692),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6043), AOM_ICDF(6446), AOM_ICDF(15578), AOM_ICDF(15981),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(21203), AOM_ICDF(26985),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(28553), AOM_ICDF(29151), AOM_ICDF(31521), AOM_ICDF(32038),
-    AOM_ICDF(32413), AOM_ICDF(32768), },
-    {AOM_ICDF(15138), AOM_ICDF(19554), AOM_ICDF(27559), AOM_ICDF(29750),
-    AOM_ICDF(31321), AOM_ICDF(32768), },
-    {AOM_ICDF(3406), AOM_ICDF(18680), AOM_ICDF(23310), AOM_ICDF(27259),
-    AOM_ICDF(30430), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(29000), AOM_ICDF(30219), AOM_ICDF(32098), AOM_ICDF(32414),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21324), AOM_ICDF(25278), AOM_ICDF(31789), AOM_ICDF(32126),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14011), AOM_ICDF(21190), AOM_ICDF(30288), AOM_ICDF(30900),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12762), AOM_ICDF(18476), AOM_ICDF(27140), AOM_ICDF(28461),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11498), AOM_ICDF(14867), AOM_ICDF(24806), AOM_ICDF(25613),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15872), AOM_ICDF(16512), AOM_ICDF(24192), AOM_ICDF(25088),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(29308), AOM_ICDF(30286), AOM_ICDF(32095), AOM_ICDF(32410),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21819), AOM_ICDF(24215), AOM_ICDF(31771), AOM_ICDF(32103),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14853), AOM_ICDF(18028), AOM_ICDF(29729), AOM_ICDF(30160),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10598), AOM_ICDF(13400), AOM_ICDF(26555), AOM_ICDF(27043),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10426), AOM_ICDF(12660), AOM_ICDF(21597), AOM_ICDF(23831),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(29866), AOM_ICDF(30588), AOM_ICDF(32131), AOM_ICDF(32445),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(23473), AOM_ICDF(25323), AOM_ICDF(31960), AOM_ICDF(32280),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17529), AOM_ICDF(19173), AOM_ICDF(30278), AOM_ICDF(30577),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9830), AOM_ICDF(11469), AOM_ICDF(23484), AOM_ICDF(25122),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(30405), AOM_ICDF(31032), AOM_ICDF(32139), AOM_ICDF(32451),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(25453), AOM_ICDF(27199), AOM_ICDF(32040), AOM_ICDF(32361),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15663), AOM_ICDF(16432), AOM_ICDF(30654), AOM_ICDF(31038),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6780), AOM_ICDF(10169), AOM_ICDF(18079), AOM_ICDF(21469),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29785), AOM_ICDF(30368), AOM_ICDF(31904), AOM_ICDF(32245),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18173), AOM_ICDF(21111), AOM_ICDF(30105), AOM_ICDF(30575),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8476), AOM_ICDF(13666), AOM_ICDF(28420), AOM_ICDF(28896),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11427), AOM_ICDF(12066), AOM_ICDF(26197), AOM_ICDF(26691),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6827), AOM_ICDF(10923), AOM_ICDF(21845), AOM_ICDF(25941),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 32X32
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(7848), AOM_ICDF(9841), AOM_ICDF(13623), AOM_ICDF(19351),
-    AOM_ICDF(23196), AOM_ICDF(32768), },
-    {AOM_ICDF(3229), AOM_ICDF(5641), AOM_ICDF(7103), AOM_ICDF(13195),
-    AOM_ICDF(15046), AOM_ICDF(32768), },
-    {AOM_ICDF(810), AOM_ICDF(3129), AOM_ICDF(3687), AOM_ICDF(8373),
-    AOM_ICDF(8971), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(8165), AOM_ICDF(12626), AOM_ICDF(22213), AOM_ICDF(23403),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7602), AOM_ICDF(15378), AOM_ICDF(23248), AOM_ICDF(24331),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5607), AOM_ICDF(10197), AOM_ICDF(18657), AOM_ICDF(20616),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4498), AOM_ICDF(6539), AOM_ICDF(14461), AOM_ICDF(16104),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3387), AOM_ICDF(4098), AOM_ICDF(10245), AOM_ICDF(11322),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1793), AOM_ICDF(2111), AOM_ICDF(5262), AOM_ICDF(5646),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(16815), AOM_ICDF(19141), AOM_ICDF(27640), AOM_ICDF(28110),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13156), AOM_ICDF(15592), AOM_ICDF(26089), AOM_ICDF(26592),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9841), AOM_ICDF(11588), AOM_ICDF(22858), AOM_ICDF(23403),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7765), AOM_ICDF(8871), AOM_ICDF(19127), AOM_ICDF(19526),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5550), AOM_ICDF(6013), AOM_ICDF(14338), AOM_ICDF(14677),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2658), AOM_ICDF(2969), AOM_ICDF(7230), AOM_ICDF(7541),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(22765), AOM_ICDF(24278), AOM_ICDF(30194), AOM_ICDF(30535),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15310), AOM_ICDF(17292), AOM_ICDF(27870), AOM_ICDF(28248),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10047), AOM_ICDF(10839), AOM_ICDF(23345), AOM_ICDF(23710),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6594), AOM_ICDF(6959), AOM_ICDF(17456), AOM_ICDF(17796),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3784), AOM_ICDF(4109), AOM_ICDF(10984), AOM_ICDF(11297),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1569), AOM_ICDF(1875), AOM_ICDF(4586), AOM_ICDF(4892),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(25747), AOM_ICDF(26817), AOM_ICDF(31236), AOM_ICDF(31577),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16018), AOM_ICDF(17720), AOM_ICDF(28833), AOM_ICDF(29219),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9348), AOM_ICDF(10015), AOM_ICDF(22943), AOM_ICDF(23323),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5841), AOM_ICDF(6167), AOM_ICDF(15774), AOM_ICDF(16107),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3385), AOM_ICDF(3703), AOM_ICDF(9664), AOM_ICDF(9975),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1460), AOM_ICDF(1768), AOM_ICDF(4704), AOM_ICDF(5011),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29634), AOM_ICDF(30134), AOM_ICDF(31898), AOM_ICDF(32218),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16976), AOM_ICDF(17856), AOM_ICDF(29258), AOM_ICDF(29584),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8521), AOM_ICDF(8858), AOM_ICDF(21252), AOM_ICDF(21574),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4894), AOM_ICDF(5208), AOM_ICDF(13957), AOM_ICDF(14271),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3140), AOM_ICDF(3452), AOM_ICDF(9099), AOM_ICDF(9411),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1770), AOM_ICDF(2080), AOM_ICDF(5241), AOM_ICDF(5551),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(22253), AOM_ICDF(23279), AOM_ICDF(24319), AOM_ICDF(27691),
-    AOM_ICDF(30884), AOM_ICDF(32768), },
-    {AOM_ICDF(6281), AOM_ICDF(8348), AOM_ICDF(9473), AOM_ICDF(15740),
-    AOM_ICDF(24879), AOM_ICDF(32768), },
-    {AOM_ICDF(1265), AOM_ICDF(3893), AOM_ICDF(4482), AOM_ICDF(9694),
-    AOM_ICDF(18376), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(17243), AOM_ICDF(18993), AOM_ICDF(28515), AOM_ICDF(29242),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15645), AOM_ICDF(23632), AOM_ICDF(29905), AOM_ICDF(30416),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11203), AOM_ICDF(18441), AOM_ICDF(27037), AOM_ICDF(27930),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9306), AOM_ICDF(13788), AOM_ICDF(23647), AOM_ICDF(24669),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8076), AOM_ICDF(10237), AOM_ICDF(20500), AOM_ICDF(21437),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7214), AOM_ICDF(8133), AOM_ICDF(17608), AOM_ICDF(18202),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(23555), AOM_ICDF(26147), AOM_ICDF(31229), AOM_ICDF(31581),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16046), AOM_ICDF(20455), AOM_ICDF(29711), AOM_ICDF(30107),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10810), AOM_ICDF(14014), AOM_ICDF(25967), AOM_ICDF(26499),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8267), AOM_ICDF(9930), AOM_ICDF(21704), AOM_ICDF(22244),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5637), AOM_ICDF(6282), AOM_ICDF(15954), AOM_ICDF(16508),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4090), AOM_ICDF(4363), AOM_ICDF(11771), AOM_ICDF(12044),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26146), AOM_ICDF(27425), AOM_ICDF(31658), AOM_ICDF(31983),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17486), AOM_ICDF(20295), AOM_ICDF(30279), AOM_ICDF(30621),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10812), AOM_ICDF(12230), AOM_ICDF(26095), AOM_ICDF(26460),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7510), AOM_ICDF(8042), AOM_ICDF(21058), AOM_ICDF(21425),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4566), AOM_ICDF(4916), AOM_ICDF(13594), AOM_ICDF(13891),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1956), AOM_ICDF(2445), AOM_ICDF(5380), AOM_ICDF(5869),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28423), AOM_ICDF(29253), AOM_ICDF(31959), AOM_ICDF(32277),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18711), AOM_ICDF(20638), AOM_ICDF(30445), AOM_ICDF(30777),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10301), AOM_ICDF(10903), AOM_ICDF(24702), AOM_ICDF(25060),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6531), AOM_ICDF(6885), AOM_ICDF(18215), AOM_ICDF(18535),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3965), AOM_ICDF(4265), AOM_ICDF(11701), AOM_ICDF(12023),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3255), AOM_ICDF(3906), AOM_ICDF(8897), AOM_ICDF(9548),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29905), AOM_ICDF(30382), AOM_ICDF(32053), AOM_ICDF(32369),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19724), AOM_ICDF(20376), AOM_ICDF(30778), AOM_ICDF(31101),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10430), AOM_ICDF(10786), AOM_ICDF(24620), AOM_ICDF(24943),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6151), AOM_ICDF(6475), AOM_ICDF(17188), AOM_ICDF(17504),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3728), AOM_ICDF(4034), AOM_ICDF(11352), AOM_ICDF(11658),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1456), AOM_ICDF(1748), AOM_ICDF(5024), AOM_ICDF(5316),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(24883), AOM_ICDF(25616), AOM_ICDF(27995), AOM_ICDF(29251),
-    AOM_ICDF(31055), AOM_ICDF(32768), },
-    {AOM_ICDF(9802), AOM_ICDF(11841), AOM_ICDF(18691), AOM_ICDF(22179),
-    AOM_ICDF(26383), AOM_ICDF(32768), },
-    {AOM_ICDF(4096), AOM_ICDF(7928), AOM_ICDF(14072), AOM_ICDF(21042),
-    AOM_ICDF(23453), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(10363), AOM_ICDF(20924), AOM_ICDF(29116), AOM_ICDF(29906),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10682), AOM_ICDF(22326), AOM_ICDF(29093), AOM_ICDF(29642),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10304), AOM_ICDF(21073), AOM_ICDF(26843), AOM_ICDF(28904),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6138), AOM_ICDF(13221), AOM_ICDF(22475), AOM_ICDF(25119),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3788), AOM_ICDF(4356), AOM_ICDF(10607), AOM_ICDF(12690),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1950), AOM_ICDF(4291), AOM_ICDF(10923), AOM_ICDF(12873),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(21958), AOM_ICDF(27093), AOM_ICDF(30741), AOM_ICDF(31349),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18725), AOM_ICDF(23406), AOM_ICDF(30541), AOM_ICDF(31268),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15634), AOM_ICDF(17134), AOM_ICDF(26450), AOM_ICDF(27092),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10012), AOM_ICDF(11287), AOM_ICDF(24758), AOM_ICDF(25304),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6242), AOM_ICDF(7802), AOM_ICDF(19895), AOM_ICDF(21065),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4096), AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(20480),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26587), AOM_ICDF(27934), AOM_ICDF(31817), AOM_ICDF(32094),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20234), AOM_ICDF(22651), AOM_ICDF(30576), AOM_ICDF(30857),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13405), AOM_ICDF(14708), AOM_ICDF(26624), AOM_ICDF(27183),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9132), AOM_ICDF(11281), AOM_ICDF(19876), AOM_ICDF(21487),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(15522), AOM_ICDF(20696),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(19275), AOM_ICDF(25058),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28277), AOM_ICDF(29312), AOM_ICDF(32101), AOM_ICDF(32400),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18946), AOM_ICDF(23037), AOM_ICDF(31186), AOM_ICDF(31565),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14043), AOM_ICDF(14980), AOM_ICDF(29491), AOM_ICDF(30193),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9638), AOM_ICDF(12529), AOM_ICDF(21203), AOM_ICDF(24094),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6554), AOM_ICDF(11469), AOM_ICDF(18022), AOM_ICDF(22938),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(31039), AOM_ICDF(31404), AOM_ICDF(32048), AOM_ICDF(32372),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20567), AOM_ICDF(21869), AOM_ICDF(28724), AOM_ICDF(29256),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10000), AOM_ICDF(11250), AOM_ICDF(22768), AOM_ICDF(23393),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6291), AOM_ICDF(7078), AOM_ICDF(20447), AOM_ICDF(21234),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3072), AOM_ICDF(6144), AOM_ICDF(18432), AOM_ICDF(21504),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(23448), AOM_ICDF(25882), AOM_ICDF(29692), AOM_ICDF(31272),
-    AOM_ICDF(32065), AOM_ICDF(32768), },
-    {AOM_ICDF(4276), AOM_ICDF(17832), AOM_ICDF(22156), AOM_ICDF(28463),
-    AOM_ICDF(30374), AOM_ICDF(32768), },
-    {AOM_ICDF(842), AOM_ICDF(20937), AOM_ICDF(22447), AOM_ICDF(28559),
-    AOM_ICDF(30333), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(30469), AOM_ICDF(30991), AOM_ICDF(32114), AOM_ICDF(32435),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(27295), AOM_ICDF(29153), AOM_ICDF(31917), AOM_ICDF(32269),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16309), AOM_ICDF(22060), AOM_ICDF(29937), AOM_ICDF(30686),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11440), AOM_ICDF(16853), AOM_ICDF(26633), AOM_ICDF(27427),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13069), AOM_ICDF(15405), AOM_ICDF(27401), AOM_ICDF(28033),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9084), AOM_ICDF(10058), AOM_ICDF(23197), AOM_ICDF(23684),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(30728), AOM_ICDF(31202), AOM_ICDF(32138), AOM_ICDF(32450),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(23421), AOM_ICDF(26186), AOM_ICDF(31939), AOM_ICDF(32278),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12249), AOM_ICDF(15027), AOM_ICDF(28348), AOM_ICDF(28854),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5667), AOM_ICDF(6899), AOM_ICDF(22174), AOM_ICDF(23652),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(10650), AOM_ICDF(17203), AOM_ICDF(20480),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(30721), AOM_ICDF(31093), AOM_ICDF(32141), AOM_ICDF(32453),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(24052), AOM_ICDF(25175), AOM_ICDF(31923), AOM_ICDF(32231),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8145), AOM_ICDF(9281), AOM_ICDF(27654), AOM_ICDF(28412),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7373), AOM_ICDF(9830), AOM_ICDF(21299), AOM_ICDF(23757),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(31284), AOM_ICDF(31621), AOM_ICDF(32143), AOM_ICDF(32455),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(27783), AOM_ICDF(28563), AOM_ICDF(32045), AOM_ICDF(32361),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10149), AOM_ICDF(12179), AOM_ICDF(28128), AOM_ICDF(28998),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5650), AOM_ICDF(9039), AOM_ICDF(19209), AOM_ICDF(22599),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(31038), AOM_ICDF(31383), AOM_ICDF(32035), AOM_ICDF(32357),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20689), AOM_ICDF(22001), AOM_ICDF(28880), AOM_ICDF(29479),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7827), AOM_ICDF(10613), AOM_ICDF(24141), AOM_ICDF(24735),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8021), AOM_ICDF(8585), AOM_ICDF(22014), AOM_ICDF(22383),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6047), AOM_ICDF(6350), AOM_ICDF(19918), AOM_ICDF(20220),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-};
+static const aom_cdf_prob
+    av1_default_eob_multi1024_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+        11)] = { { { { AOM_CDF11(393, 421, 751, 1623, 3160, 6352, 13345, 18047,
+                                 22571, 25830) },
+                     { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+                                 23831, 26810, 29789) } },
+                   { { AOM_CDF11(1865, 1988, 2930, 4242, 10533, 16538, 21354,
+                                 27255, 28546, 31784) },
+                     { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+                                 23831, 26810, 29789) } } },
+                 { { { AOM_CDF11(696, 948, 3145, 5702, 9706, 13217, 17851,
+                                 21856, 25692, 28034) },
+                     { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+                                 23831, 26810, 29789) } },
+                   { { AOM_CDF11(2672, 3591, 9330, 17084, 22725, 24284, 26527,
+                                 28027, 28377, 30876) },
+                     { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+                                 23831, 26810, 29789) } } },
+                 { { { AOM_CDF11(2784, 3831, 7041, 10521, 14847, 18844, 23155,
+                                 26682, 29229, 31045) },
+                     { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+                                 23831, 26810, 29789) } },
+                   { { AOM_CDF11(9577, 12466, 17739, 20750, 22061, 23215, 24601,
+                                 25483, 25843, 32056) },
+                     { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+                                 23831, 26810, 29789) } } },
+                 { { { AOM_CDF11(6698, 8334, 11961, 15762, 20186, 23862, 27434,
+                                 29326, 31082, 32050) },
+                     { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+                                 23831, 26810, 29789) } },
+                   { { AOM_CDF11(20569, 22426, 25569, 26859, 28053, 28913,
+                                 29486, 29724, 29807, 32570) },
+                     { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+                                 23831, 26810, 29789) } } } };
 
-static const coeff_cdf_model
-av1_default_coef_head_cdfs_q2[TX_SIZES][PLANE_TYPES] = {
-    {  // TX 4X4
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(23035), AOM_ICDF(23799), AOM_ICDF(27745), AOM_ICDF(29607),
-    AOM_ICDF(30130), AOM_ICDF(32768), },
-    {AOM_ICDF(12409), AOM_ICDF(14763), AOM_ICDF(22883), AOM_ICDF(26775),
-    AOM_ICDF(27649), AOM_ICDF(32768), },
-    {AOM_ICDF(5237), AOM_ICDF(9433), AOM_ICDF(15597), AOM_ICDF(21779),
-    AOM_ICDF(23224), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(10424), AOM_ICDF(17678), AOM_ICDF(28850), AOM_ICDF(29349),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10376), AOM_ICDF(16902), AOM_ICDF(28779), AOM_ICDF(29265),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10166), AOM_ICDF(14387), AOM_ICDF(26253), AOM_ICDF(26807),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8474), AOM_ICDF(9927), AOM_ICDF(22092), AOM_ICDF(22697),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6415), AOM_ICDF(6911), AOM_ICDF(17155), AOM_ICDF(17579),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4611), AOM_ICDF(4928), AOM_ICDF(12174), AOM_ICDF(12497),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(16984), AOM_ICDF(21802), AOM_ICDF(30901), AOM_ICDF(31373),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14003), AOM_ICDF(19369), AOM_ICDF(30193), AOM_ICDF(30615),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10729), AOM_ICDF(13233), AOM_ICDF(26938), AOM_ICDF(27455),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8604), AOM_ICDF(9526), AOM_ICDF(22436), AOM_ICDF(22989),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6828), AOM_ICDF(7236), AOM_ICDF(18056), AOM_ICDF(18456),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4302), AOM_ICDF(4555), AOM_ICDF(12209), AOM_ICDF(12462),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(20261), AOM_ICDF(24381), AOM_ICDF(31612), AOM_ICDF(31989),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13775), AOM_ICDF(20449), AOM_ICDF(30685), AOM_ICDF(31111),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10459), AOM_ICDF(13768), AOM_ICDF(27504), AOM_ICDF(28114),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7994), AOM_ICDF(8989), AOM_ICDF(22906), AOM_ICDF(23636),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5928), AOM_ICDF(6460), AOM_ICDF(16884), AOM_ICDF(17720),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4520), AOM_ICDF(7910), AOM_ICDF(12429), AOM_ICDF(16949),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(17822), AOM_ICDF(26021), AOM_ICDF(31751), AOM_ICDF(32150),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13484), AOM_ICDF(23372), AOM_ICDF(31305), AOM_ICDF(31747),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11009), AOM_ICDF(15469), AOM_ICDF(28452), AOM_ICDF(29132),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8358), AOM_ICDF(9357), AOM_ICDF(22412), AOM_ICDF(23385),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9392), AOM_ICDF(10018), AOM_ICDF(18158), AOM_ICDF(19202),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(5236), AOM_ICDF(26529), AOM_ICDF(31709), AOM_ICDF(32201),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5710), AOM_ICDF(25925), AOM_ICDF(31254), AOM_ICDF(31967),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7645), AOM_ICDF(19427), AOM_ICDF(28170), AOM_ICDF(29920),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7427), AOM_ICDF(13350), AOM_ICDF(23253), AOM_ICDF(25438),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4681), AOM_ICDF(6687), AOM_ICDF(15381), AOM_ICDF(18725),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(11176), AOM_ICDF(18297), AOM_ICDF(19062), AOM_ICDF(28984),
-    AOM_ICDF(29496), AOM_ICDF(32768), },
-    {AOM_ICDF(9778), AOM_ICDF(17798), AOM_ICDF(19934), AOM_ICDF(28434),
-    AOM_ICDF(28921), AOM_ICDF(32768), },
-    {AOM_ICDF(4806), AOM_ICDF(14260), AOM_ICDF(17259), AOM_ICDF(26368),
-    AOM_ICDF(26942), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(21802), AOM_ICDF(22916), AOM_ICDF(31657), AOM_ICDF(31989),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16874), AOM_ICDF(20345), AOM_ICDF(31048), AOM_ICDF(31389),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10717), AOM_ICDF(12576), AOM_ICDF(26899), AOM_ICDF(27294),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8468), AOM_ICDF(9404), AOM_ICDF(21928), AOM_ICDF(22358),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5992), AOM_ICDF(6521), AOM_ICDF(16309), AOM_ICDF(16729),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5134), AOM_ICDF(5452), AOM_ICDF(11491), AOM_ICDF(11865),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(22003), AOM_ICDF(24147), AOM_ICDF(31841), AOM_ICDF(32177),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17179), AOM_ICDF(20593), AOM_ICDF(31041), AOM_ICDF(31394),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9282), AOM_ICDF(10544), AOM_ICDF(25698), AOM_ICDF(26133),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6301), AOM_ICDF(7013), AOM_ICDF(19066), AOM_ICDF(19557),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3845), AOM_ICDF(4316), AOM_ICDF(12209), AOM_ICDF(12812),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4819), AOM_ICDF(6746), AOM_ICDF(11565), AOM_ICDF(13011),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(22820), AOM_ICDF(26023), AOM_ICDF(31888), AOM_ICDF(32236),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17130), AOM_ICDF(21510), AOM_ICDF(31268), AOM_ICDF(31632),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10062), AOM_ICDF(11898), AOM_ICDF(26787), AOM_ICDF(27281),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7681), AOM_ICDF(8590), AOM_ICDF(21264), AOM_ICDF(22034),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4413), AOM_ICDF(5143), AOM_ICDF(13605), AOM_ICDF(14712),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5461), AOM_ICDF(10923), AOM_ICDF(16384), AOM_ICDF(21845),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(20237), AOM_ICDF(25695), AOM_ICDF(31868), AOM_ICDF(32222),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15551), AOM_ICDF(22658), AOM_ICDF(31236), AOM_ICDF(31659),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9584), AOM_ICDF(12389), AOM_ICDF(26347), AOM_ICDF(27242),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6067), AOM_ICDF(7231), AOM_ICDF(19625), AOM_ICDF(20707),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3724), AOM_ICDF(4312), AOM_ICDF(11269), AOM_ICDF(12425),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4096), AOM_ICDF(6554), AOM_ICDF(9830), AOM_ICDF(12288),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(11726), AOM_ICDF(26639), AOM_ICDF(31977), AOM_ICDF(32340),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10754), AOM_ICDF(25823), AOM_ICDF(31568), AOM_ICDF(32060),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8761), AOM_ICDF(16650), AOM_ICDF(27884), AOM_ICDF(29394),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7387), AOM_ICDF(9941), AOM_ICDF(21377), AOM_ICDF(23333),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2374), AOM_ICDF(3799), AOM_ICDF(16147), AOM_ICDF(19471),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(29271), AOM_ICDF(29645), AOM_ICDF(31447), AOM_ICDF(31951),
-    AOM_ICDF(32313), AOM_ICDF(32768), },
-    {AOM_ICDF(22174), AOM_ICDF(23288), AOM_ICDF(29633), AOM_ICDF(31096),
-    AOM_ICDF(31701), AOM_ICDF(32768), },
-    {AOM_ICDF(13601), AOM_ICDF(16603), AOM_ICDF(25296), AOM_ICDF(28966),
-    AOM_ICDF(30043), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(13850), AOM_ICDF(26266), AOM_ICDF(31653), AOM_ICDF(32083),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11979), AOM_ICDF(24610), AOM_ICDF(31369), AOM_ICDF(31810),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11325), AOM_ICDF(18989), AOM_ICDF(29109), AOM_ICDF(29770),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9338), AOM_ICDF(11892), AOM_ICDF(25324), AOM_ICDF(26115),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5725), AOM_ICDF(6243), AOM_ICDF(18483), AOM_ICDF(18919),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6554), AOM_ICDF(9830), AOM_ICDF(16384), AOM_ICDF(19661),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(18097), AOM_ICDF(27765), AOM_ICDF(31891), AOM_ICDF(32286),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14735), AOM_ICDF(24632), AOM_ICDF(31577), AOM_ICDF(31970),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11031), AOM_ICDF(15675), AOM_ICDF(29109), AOM_ICDF(29716),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8859), AOM_ICDF(9891), AOM_ICDF(23909), AOM_ICDF(24940),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7864), AOM_ICDF(11796), AOM_ICDF(20972), AOM_ICDF(24904),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(21057), AOM_ICDF(29116), AOM_ICDF(32033), AOM_ICDF(32367),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15287), AOM_ICDF(25704), AOM_ICDF(31791), AOM_ICDF(32151),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12927), AOM_ICDF(18993), AOM_ICDF(30815), AOM_ICDF(31329),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13227), AOM_ICDF(16234), AOM_ICDF(27657), AOM_ICDF(28860),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6899), AOM_ICDF(12072), AOM_ICDF(18971), AOM_ICDF(25869),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(17688), AOM_ICDF(28768), AOM_ICDF(32140), AOM_ICDF(32435),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13473), AOM_ICDF(26360), AOM_ICDF(31944), AOM_ICDF(32307),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12653), AOM_ICDF(18817), AOM_ICDF(28875), AOM_ICDF(30497),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5461), AOM_ICDF(10923), AOM_ICDF(20025), AOM_ICDF(25486),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(6820), AOM_ICDF(28765), AOM_ICDF(31878), AOM_ICDF(32323),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7737), AOM_ICDF(28672), AOM_ICDF(31972), AOM_ICDF(32313),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11796), AOM_ICDF(18350), AOM_ICDF(24904), AOM_ICDF(28836),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(30079), AOM_ICDF(30525), AOM_ICDF(31559), AOM_ICDF(32085),
-    AOM_ICDF(32407), AOM_ICDF(32768), },
-    {AOM_ICDF(22148), AOM_ICDF(24035), AOM_ICDF(29557), AOM_ICDF(31423),
-    AOM_ICDF(31881), AOM_ICDF(32768), },
-    {AOM_ICDF(13266), AOM_ICDF(17717), AOM_ICDF(26069), AOM_ICDF(29825),
-    AOM_ICDF(30780), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(18219), AOM_ICDF(27530), AOM_ICDF(32048), AOM_ICDF(32373),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14664), AOM_ICDF(25532), AOM_ICDF(31886), AOM_ICDF(32244),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11683), AOM_ICDF(19554), AOM_ICDF(30330), AOM_ICDF(30870),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9410), AOM_ICDF(14238), AOM_ICDF(25794), AOM_ICDF(27268),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6629), AOM_ICDF(9580), AOM_ICDF(20186), AOM_ICDF(22187),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2891), AOM_ICDF(4337), AOM_ICDF(11083), AOM_ICDF(13493),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(20016), AOM_ICDF(28471), AOM_ICDF(32074), AOM_ICDF(32401),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16915), AOM_ICDF(26047), AOM_ICDF(31965), AOM_ICDF(32300),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10725), AOM_ICDF(18206), AOM_ICDF(30056), AOM_ICDF(30606),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6883), AOM_ICDF(13990), AOM_ICDF(26334), AOM_ICDF(27531),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11529), AOM_ICDF(15170), AOM_ICDF(22452), AOM_ICDF(24879),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(23488), AOM_ICDF(29744), AOM_ICDF(32117), AOM_ICDF(32442),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17520), AOM_ICDF(27259), AOM_ICDF(32056), AOM_ICDF(32389),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13107), AOM_ICDF(20597), AOM_ICDF(31416), AOM_ICDF(32092),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20165), AOM_ICDF(22686), AOM_ICDF(26887), AOM_ICDF(29407),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(17711), AOM_ICDF(29963), AOM_ICDF(32137), AOM_ICDF(32452),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14078), AOM_ICDF(28336), AOM_ICDF(32026), AOM_ICDF(32391),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11129), AOM_ICDF(28749), AOM_ICDF(30295), AOM_ICDF(31222),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7447), AOM_ICDF(13405), AOM_ICDF(22342), AOM_ICDF(26810),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(14413), AOM_ICDF(30309), AOM_ICDF(32090), AOM_ICDF(32471),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11814), AOM_ICDF(30354), AOM_ICDF(32251), AOM_ICDF(32509),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7282), AOM_ICDF(12743), AOM_ICDF(21845), AOM_ICDF(27307),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 8X8
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(16945), AOM_ICDF(18241), AOM_ICDF(25718), AOM_ICDF(28152),
-    AOM_ICDF(29383), AOM_ICDF(32768), },
-    {AOM_ICDF(7095), AOM_ICDF(10051), AOM_ICDF(18830), AOM_ICDF(23174),
-    AOM_ICDF(24906), AOM_ICDF(32768), },
-    {AOM_ICDF(2585), AOM_ICDF(6677), AOM_ICDF(10951), AOM_ICDF(17411),
-    AOM_ICDF(18916), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(12894), AOM_ICDF(17897), AOM_ICDF(28218), AOM_ICDF(28651),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11333), AOM_ICDF(16802), AOM_ICDF(27676), AOM_ICDF(28153),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10166), AOM_ICDF(13829), AOM_ICDF(25072), AOM_ICDF(25646),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8356), AOM_ICDF(9772), AOM_ICDF(21358), AOM_ICDF(21912),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5988), AOM_ICDF(6506), AOM_ICDF(16203), AOM_ICDF(16647),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3684), AOM_ICDF(4012), AOM_ICDF(10039), AOM_ICDF(10367),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(18192), AOM_ICDF(21044), AOM_ICDF(30229), AOM_ICDF(30597),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14976), AOM_ICDF(18218), AOM_ICDF(29191), AOM_ICDF(29564),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10914), AOM_ICDF(12508), AOM_ICDF(25451), AOM_ICDF(25857),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7970), AOM_ICDF(8605), AOM_ICDF(20619), AOM_ICDF(21011),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5555), AOM_ICDF(5926), AOM_ICDF(15730), AOM_ICDF(16091),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3522), AOM_ICDF(3847), AOM_ICDF(10567), AOM_ICDF(10892),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(21896), AOM_ICDF(23866), AOM_ICDF(31136), AOM_ICDF(31486),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15913), AOM_ICDF(18331), AOM_ICDF(29670), AOM_ICDF(30019),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10158), AOM_ICDF(10878), AOM_ICDF(24664), AOM_ICDF(25024),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6692), AOM_ICDF(7070), AOM_ICDF(18934), AOM_ICDF(19267),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4603), AOM_ICDF(4914), AOM_ICDF(13724), AOM_ICDF(14041),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2378), AOM_ICDF(3171), AOM_ICDF(7663), AOM_ICDF(8456),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(24113), AOM_ICDF(25740), AOM_ICDF(31668), AOM_ICDF(32000),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16618), AOM_ICDF(18583), AOM_ICDF(30173), AOM_ICDF(30511),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10122), AOM_ICDF(10666), AOM_ICDF(24877), AOM_ICDF(25222),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6721), AOM_ICDF(7062), AOM_ICDF(19250), AOM_ICDF(19588),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4641), AOM_ICDF(4957), AOM_ICDF(13698), AOM_ICDF(14021),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3324), AOM_ICDF(4749), AOM_ICDF(9498), AOM_ICDF(10923),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(24933), AOM_ICDF(27294), AOM_ICDF(31876), AOM_ICDF(32207),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17505), AOM_ICDF(20214), AOM_ICDF(30842), AOM_ICDF(31189),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10756), AOM_ICDF(11345), AOM_ICDF(25989), AOM_ICDF(26362),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7374), AOM_ICDF(7763), AOM_ICDF(19820), AOM_ICDF(20160),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5003), AOM_ICDF(5328), AOM_ICDF(15420), AOM_ICDF(15723),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4915), AOM_ICDF(9830), AOM_ICDF(18022), AOM_ICDF(22938),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(7874), AOM_ICDF(17174), AOM_ICDF(19119), AOM_ICDF(28514),
-    AOM_ICDF(29361), AOM_ICDF(32768), },
-    {AOM_ICDF(3407), AOM_ICDF(13628), AOM_ICDF(16836), AOM_ICDF(26723),
-    AOM_ICDF(27681), AOM_ICDF(32768), },
-    {AOM_ICDF(1062), AOM_ICDF(11514), AOM_ICDF(14002), AOM_ICDF(24081),
-    AOM_ICDF(25232), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(23614), AOM_ICDF(24717), AOM_ICDF(31593), AOM_ICDF(31927),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18177), AOM_ICDF(21581), AOM_ICDF(30890), AOM_ICDF(31234),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12535), AOM_ICDF(14549), AOM_ICDF(27749), AOM_ICDF(28134),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9687), AOM_ICDF(10712), AOM_ICDF(23848), AOM_ICDF(24271),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6461), AOM_ICDF(7119), AOM_ICDF(17940), AOM_ICDF(18368),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3863), AOM_ICDF(4245), AOM_ICDF(10904), AOM_ICDF(11278),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(24334), AOM_ICDF(25912), AOM_ICDF(31795), AOM_ICDF(32120),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17964), AOM_ICDF(20229), AOM_ICDF(30726), AOM_ICDF(31064),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10463), AOM_ICDF(11527), AOM_ICDF(25898), AOM_ICDF(26256),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7431), AOM_ICDF(8071), AOM_ICDF(20542), AOM_ICDF(20928),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4561), AOM_ICDF(4995), AOM_ICDF(13977), AOM_ICDF(14347),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2427), AOM_ICDF(2687), AOM_ICDF(8149), AOM_ICDF(8409),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(25888), AOM_ICDF(27308), AOM_ICDF(31957), AOM_ICDF(32279),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18868), AOM_ICDF(20992), AOM_ICDF(31092), AOM_ICDF(31424),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10480), AOM_ICDF(11191), AOM_ICDF(25801), AOM_ICDF(26149),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6878), AOM_ICDF(7326), AOM_ICDF(19397), AOM_ICDF(19762),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4235), AOM_ICDF(4601), AOM_ICDF(13182), AOM_ICDF(13587),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3584), AOM_ICDF(5120), AOM_ICDF(11264), AOM_ICDF(13312),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(26802), AOM_ICDF(28181), AOM_ICDF(32031), AOM_ICDF(32349),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19661), AOM_ICDF(21746), AOM_ICDF(31360), AOM_ICDF(31688),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10680), AOM_ICDF(11361), AOM_ICDF(26261), AOM_ICDF(26610),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6811), AOM_ICDF(7274), AOM_ICDF(19689), AOM_ICDF(20075),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4881), AOM_ICDF(5230), AOM_ICDF(11882), AOM_ICDF(12324),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4096), AOM_ICDF(6144), AOM_ICDF(9557), AOM_ICDF(11605),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(27511), AOM_ICDF(29045), AOM_ICDF(32051), AOM_ICDF(32376),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19712), AOM_ICDF(22596), AOM_ICDF(31464), AOM_ICDF(31813),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11035), AOM_ICDF(11852), AOM_ICDF(26626), AOM_ICDF(27082),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7190), AOM_ICDF(7674), AOM_ICDF(20245), AOM_ICDF(20794),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5114), AOM_ICDF(5407), AOM_ICDF(12895), AOM_ICDF(13443),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(15522), AOM_ICDF(20696),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(26201), AOM_ICDF(26641), AOM_ICDF(31158), AOM_ICDF(31755),
-    AOM_ICDF(32200), AOM_ICDF(32768), },
-    {AOM_ICDF(19651), AOM_ICDF(20883), AOM_ICDF(28935), AOM_ICDF(30581),
-    AOM_ICDF(31426), AOM_ICDF(32768), },
-    {AOM_ICDF(12456), AOM_ICDF(15868), AOM_ICDF(23727), AOM_ICDF(27839),
-    AOM_ICDF(29216), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(16708), AOM_ICDF(25600), AOM_ICDF(31550), AOM_ICDF(31927),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14533), AOM_ICDF(24134), AOM_ICDF(31151), AOM_ICDF(31670),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12771), AOM_ICDF(19041), AOM_ICDF(29256), AOM_ICDF(29926),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9497), AOM_ICDF(12011), AOM_ICDF(24856), AOM_ICDF(25648),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6059), AOM_ICDF(6512), AOM_ICDF(17765), AOM_ICDF(18218),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4498), AOM_ICDF(6425), AOM_ICDF(13493), AOM_ICDF(15420),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(21314), AOM_ICDF(26763), AOM_ICDF(31645), AOM_ICDF(32043),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16898), AOM_ICDF(23241), AOM_ICDF(31276), AOM_ICDF(31667),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12339), AOM_ICDF(16091), AOM_ICDF(28493), AOM_ICDF(28851),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8583), AOM_ICDF(10033), AOM_ICDF(23721), AOM_ICDF(24359),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6801), AOM_ICDF(7728), AOM_ICDF(18857), AOM_ICDF(19784),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(25155), AOM_ICDF(28551), AOM_ICDF(31936), AOM_ICDF(32273),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18054), AOM_ICDF(22818), AOM_ICDF(31343), AOM_ICDF(31736),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12381), AOM_ICDF(14088), AOM_ICDF(27865), AOM_ICDF(28300),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7853), AOM_ICDF(8666), AOM_ICDF(21665), AOM_ICDF(22477),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6242), AOM_ICDF(10923), AOM_ICDF(15604), AOM_ICDF(20285),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(26649), AOM_ICDF(29334), AOM_ICDF(32001), AOM_ICDF(32345),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18410), AOM_ICDF(22788), AOM_ICDF(31465), AOM_ICDF(31842),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12504), AOM_ICDF(13480), AOM_ICDF(28600), AOM_ICDF(28955),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9175), AOM_ICDF(10486), AOM_ICDF(21845), AOM_ICDF(23156),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7710), AOM_ICDF(13493), AOM_ICDF(21203), AOM_ICDF(26985),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(27622), AOM_ICDF(30399), AOM_ICDF(32070), AOM_ICDF(32399),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18214), AOM_ICDF(24797), AOM_ICDF(31688), AOM_ICDF(32070),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14564), AOM_ICDF(16894), AOM_ICDF(28981), AOM_ICDF(29564),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7802), AOM_ICDF(12483), AOM_ICDF(17164), AOM_ICDF(21845),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(30040), AOM_ICDF(30464), AOM_ICDF(31682), AOM_ICDF(32091),
-    AOM_ICDF(32421), AOM_ICDF(32768), },
-    {AOM_ICDF(20770), AOM_ICDF(22635), AOM_ICDF(29889), AOM_ICDF(31156),
-    AOM_ICDF(31909), AOM_ICDF(32768), },
-    {AOM_ICDF(9112), AOM_ICDF(13841), AOM_ICDF(23864), AOM_ICDF(27288),
-    AOM_ICDF(30322), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(23477), AOM_ICDF(28240), AOM_ICDF(32035), AOM_ICDF(32360),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18183), AOM_ICDF(26268), AOM_ICDF(31861), AOM_ICDF(32205),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14392), AOM_ICDF(23052), AOM_ICDF(30811), AOM_ICDF(31315),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12579), AOM_ICDF(20081), AOM_ICDF(28411), AOM_ICDF(29467),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9882), AOM_ICDF(14796), AOM_ICDF(25492), AOM_ICDF(27040),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11141), AOM_ICDF(13107), AOM_ICDF(21627), AOM_ICDF(23593),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(24700), AOM_ICDF(28735), AOM_ICDF(32055), AOM_ICDF(32379),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19703), AOM_ICDF(25203), AOM_ICDF(31809), AOM_ICDF(32142),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12756), AOM_ICDF(18882), AOM_ICDF(30716), AOM_ICDF(31103),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9508), AOM_ICDF(13922), AOM_ICDF(25977), AOM_ICDF(26826),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5243), AOM_ICDF(9175), AOM_ICDF(19661), AOM_ICDF(23593),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26792), AOM_ICDF(29367), AOM_ICDF(32090), AOM_ICDF(32407),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21899), AOM_ICDF(25640), AOM_ICDF(31870), AOM_ICDF(32192),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14205), AOM_ICDF(16907), AOM_ICDF(30415), AOM_ICDF(30764),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10570), AOM_ICDF(13741), AOM_ICDF(23255), AOM_ICDF(26426),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(27743), AOM_ICDF(29950), AOM_ICDF(32116), AOM_ICDF(32430),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21595), AOM_ICDF(24944), AOM_ICDF(31927), AOM_ICDF(32259),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15227), AOM_ICDF(16673), AOM_ICDF(30744), AOM_ICDF(31130),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13797), AOM_ICDF(16384), AOM_ICDF(25007), AOM_ICDF(27594),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(28888), AOM_ICDF(30883), AOM_ICDF(32127), AOM_ICDF(32447),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20978), AOM_ICDF(26121), AOM_ICDF(32090), AOM_ICDF(32406),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16644), AOM_ICDF(18725), AOM_ICDF(30427), AOM_ICDF(31468),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6554), AOM_ICDF(11469), AOM_ICDF(22938), AOM_ICDF(27853),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 16X16
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(2791), AOM_ICDF(5929), AOM_ICDF(15783), AOM_ICDF(21305),
-    AOM_ICDF(24756), AOM_ICDF(32768), },
-    {AOM_ICDF(2492), AOM_ICDF(5974), AOM_ICDF(11999), AOM_ICDF(17892),
-    AOM_ICDF(20328), AOM_ICDF(32768), },
-    {AOM_ICDF(1232), AOM_ICDF(4784), AOM_ICDF(7266), AOM_ICDF(13409),
-    AOM_ICDF(14638), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(10984), AOM_ICDF(15590), AOM_ICDF(26386), AOM_ICDF(26860),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10300), AOM_ICDF(15555), AOM_ICDF(26075), AOM_ICDF(26661),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9016), AOM_ICDF(12368), AOM_ICDF(23292), AOM_ICDF(24037),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7432), AOM_ICDF(9010), AOM_ICDF(19640), AOM_ICDF(20245),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5340), AOM_ICDF(5830), AOM_ICDF(14605), AOM_ICDF(15017),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3041), AOM_ICDF(3357), AOM_ICDF(8664), AOM_ICDF(8983),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(17487), AOM_ICDF(19944), AOM_ICDF(29422), AOM_ICDF(29785),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14365), AOM_ICDF(17572), AOM_ICDF(28369), AOM_ICDF(28763),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10944), AOM_ICDF(12562), AOM_ICDF(24945), AOM_ICDF(25372),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8061), AOM_ICDF(8670), AOM_ICDF(20179), AOM_ICDF(20570),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5386), AOM_ICDF(5759), AOM_ICDF(14881), AOM_ICDF(15238),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3124), AOM_ICDF(3450), AOM_ICDF(9578), AOM_ICDF(9895),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(21610), AOM_ICDF(23212), AOM_ICDF(30674), AOM_ICDF(31007),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15516), AOM_ICDF(17922), AOM_ICDF(29225), AOM_ICDF(29573),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10431), AOM_ICDF(11308), AOM_ICDF(24594), AOM_ICDF(24955),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6949), AOM_ICDF(7331), AOM_ICDF(18758), AOM_ICDF(19089),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4564), AOM_ICDF(4898), AOM_ICDF(12730), AOM_ICDF(13048),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2435), AOM_ICDF(2739), AOM_ICDF(7406), AOM_ICDF(7710),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(24469), AOM_ICDF(25838), AOM_ICDF(31499), AOM_ICDF(31824),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17238), AOM_ICDF(18899), AOM_ICDF(30066), AOM_ICDF(30395),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10423), AOM_ICDF(10890), AOM_ICDF(24655), AOM_ICDF(24992),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6612), AOM_ICDF(6939), AOM_ICDF(18149), AOM_ICDF(18467),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4122), AOM_ICDF(4431), AOM_ICDF(12556), AOM_ICDF(12874),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1910), AOM_ICDF(2211), AOM_ICDF(7840), AOM_ICDF(8142),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(27205), AOM_ICDF(28145), AOM_ICDF(31900), AOM_ICDF(32218),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18503), AOM_ICDF(19729), AOM_ICDF(30590), AOM_ICDF(30916),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10343), AOM_ICDF(10734), AOM_ICDF(24636), AOM_ICDF(24963),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6629), AOM_ICDF(6955), AOM_ICDF(18492), AOM_ICDF(18810),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4131), AOM_ICDF(4437), AOM_ICDF(13086), AOM_ICDF(13392),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4005), AOM_ICDF(5097), AOM_ICDF(9102), AOM_ICDF(10194),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(1286), AOM_ICDF(10273), AOM_ICDF(21021), AOM_ICDF(28617),
-    AOM_ICDF(29729), AOM_ICDF(32768), },
-    {AOM_ICDF(941), AOM_ICDF(10009), AOM_ICDF(17718), AOM_ICDF(25847),
-    AOM_ICDF(27712), AOM_ICDF(32768), },
-    {AOM_ICDF(508), AOM_ICDF(9488), AOM_ICDF(12907), AOM_ICDF(21634),
-    AOM_ICDF(23969), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(23900), AOM_ICDF(25135), AOM_ICDF(31528), AOM_ICDF(31861),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18613), AOM_ICDF(22015), AOM_ICDF(30774), AOM_ICDF(31124),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13064), AOM_ICDF(16135), AOM_ICDF(28060), AOM_ICDF(28484),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10563), AOM_ICDF(12428), AOM_ICDF(24847), AOM_ICDF(25281),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7960), AOM_ICDF(9069), AOM_ICDF(20548), AOM_ICDF(21017),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6944), AOM_ICDF(7491), AOM_ICDF(16595), AOM_ICDF(17007),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(24972), AOM_ICDF(26434), AOM_ICDF(31771), AOM_ICDF(32097),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18362), AOM_ICDF(20757), AOM_ICDF(30733), AOM_ICDF(31070),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11226), AOM_ICDF(12487), AOM_ICDF(26292), AOM_ICDF(26651),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7823), AOM_ICDF(8448), AOM_ICDF(20940), AOM_ICDF(21314),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4964), AOM_ICDF(5365), AOM_ICDF(14104), AOM_ICDF(14457),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2435), AOM_ICDF(2712), AOM_ICDF(8247), AOM_ICDF(8524),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26551), AOM_ICDF(27694), AOM_ICDF(31943), AOM_ICDF(32261),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19519), AOM_ICDF(21452), AOM_ICDF(31120), AOM_ICDF(31446),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11272), AOM_ICDF(11965), AOM_ICDF(26389), AOM_ICDF(26736),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7109), AOM_ICDF(7485), AOM_ICDF(19585), AOM_ICDF(19920),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4033), AOM_ICDF(4370), AOM_ICDF(12546), AOM_ICDF(12865),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1570), AOM_ICDF(2158), AOM_ICDF(7456), AOM_ICDF(8045),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(27654), AOM_ICDF(28637), AOM_ICDF(32030), AOM_ICDF(32345),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20795), AOM_ICDF(22232), AOM_ICDF(31351), AOM_ICDF(31672),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10841), AOM_ICDF(11329), AOM_ICDF(25676), AOM_ICDF(26002),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6589), AOM_ICDF(6943), AOM_ICDF(18084), AOM_ICDF(18412),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3970), AOM_ICDF(4279), AOM_ICDF(12009), AOM_ICDF(12318),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3449), AOM_ICDF(3967), AOM_ICDF(7761), AOM_ICDF(8278),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29545), AOM_ICDF(30314), AOM_ICDF(32084), AOM_ICDF(32404),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21229), AOM_ICDF(22783), AOM_ICDF(31470), AOM_ICDF(31800),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10409), AOM_ICDF(11031), AOM_ICDF(25267), AOM_ICDF(25669),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6456), AOM_ICDF(6909), AOM_ICDF(18270), AOM_ICDF(18674),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4253), AOM_ICDF(5017), AOM_ICDF(13288), AOM_ICDF(13706),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(1627), AOM_ICDF(2324), AOM_ICDF(8831), AOM_ICDF(9528),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(24627), AOM_ICDF(25102), AOM_ICDF(30943), AOM_ICDF(31607),
-    AOM_ICDF(32215), AOM_ICDF(32768), },
-    {AOM_ICDF(17408), AOM_ICDF(18757), AOM_ICDF(28256), AOM_ICDF(30111),
-    AOM_ICDF(31225), AOM_ICDF(32768), },
-    {AOM_ICDF(10984), AOM_ICDF(14293), AOM_ICDF(22894), AOM_ICDF(27503),
-    AOM_ICDF(28853), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(16390), AOM_ICDF(25826), AOM_ICDF(31293), AOM_ICDF(31726),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14074), AOM_ICDF(25147), AOM_ICDF(31045), AOM_ICDF(31638),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13598), AOM_ICDF(20524), AOM_ICDF(28818), AOM_ICDF(29894),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10035), AOM_ICDF(13322), AOM_ICDF(25086), AOM_ICDF(26332),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7156), AOM_ICDF(8035), AOM_ICDF(18456), AOM_ICDF(19334),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(10923), AOM_ICDF(19115), AOM_ICDF(21845),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(22787), AOM_ICDF(27489), AOM_ICDF(31676), AOM_ICDF(32026),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17518), AOM_ICDF(23800), AOM_ICDF(31204), AOM_ICDF(31578),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10686), AOM_ICDF(15226), AOM_ICDF(28087), AOM_ICDF(28560),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9612), AOM_ICDF(11942), AOM_ICDF(22574), AOM_ICDF(23010),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6437), AOM_ICDF(8192), AOM_ICDF(18139), AOM_ICDF(19895),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26773), AOM_ICDF(28429), AOM_ICDF(31782), AOM_ICDF(32120),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18449), AOM_ICDF(22329), AOM_ICDF(30991), AOM_ICDF(31329),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12861), AOM_ICDF(14182), AOM_ICDF(27130), AOM_ICDF(27395),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4681), AOM_ICDF(6554), AOM_ICDF(22469), AOM_ICDF(23874),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8623), AOM_ICDF(13797), AOM_ICDF(22420), AOM_ICDF(27594),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28378), AOM_ICDF(29466), AOM_ICDF(31934), AOM_ICDF(32245),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19880), AOM_ICDF(21733), AOM_ICDF(31206), AOM_ICDF(31550),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12173), AOM_ICDF(13245), AOM_ICDF(27638), AOM_ICDF(27945),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6215), AOM_ICDF(7910), AOM_ICDF(19774), AOM_ICDF(21469),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(21203), AOM_ICDF(26985),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(30333), AOM_ICDF(31015), AOM_ICDF(32078), AOM_ICDF(32401),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19277), AOM_ICDF(21376), AOM_ICDF(31072), AOM_ICDF(31407),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12978), AOM_ICDF(13724), AOM_ICDF(28144), AOM_ICDF(28442),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10031), AOM_ICDF(12037), AOM_ICDF(25412), AOM_ICDF(27418),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(29777), AOM_ICDF(30229), AOM_ICDF(31726), AOM_ICDF(32104),
-    AOM_ICDF(32440), AOM_ICDF(32768), },
-    {AOM_ICDF(18551), AOM_ICDF(20755), AOM_ICDF(29778), AOM_ICDF(30685),
-    AOM_ICDF(31935), AOM_ICDF(32768), },
-    {AOM_ICDF(6236), AOM_ICDF(13170), AOM_ICDF(24037), AOM_ICDF(25823),
-    AOM_ICDF(30798), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(28890), AOM_ICDF(30863), AOM_ICDF(32128), AOM_ICDF(32440),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17311), AOM_ICDF(27082), AOM_ICDF(31871), AOM_ICDF(32209),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13447), AOM_ICDF(25217), AOM_ICDF(31158), AOM_ICDF(31793),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11906), AOM_ICDF(20177), AOM_ICDF(29976), AOM_ICDF(30713),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14883), AOM_ICDF(17134), AOM_ICDF(27140), AOM_ICDF(28266),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14959), AOM_ICDF(17096), AOM_ICDF(22795), AOM_ICDF(25645),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(29494), AOM_ICDF(30807), AOM_ICDF(32086), AOM_ICDF(32404),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19860), AOM_ICDF(25179), AOM_ICDF(31857), AOM_ICDF(32190),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13936), AOM_ICDF(19209), AOM_ICDF(30508), AOM_ICDF(31073),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7168), AOM_ICDF(10240), AOM_ICDF(24576), AOM_ICDF(27648),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(19275), AOM_ICDF(25058),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(30496), AOM_ICDF(31243), AOM_ICDF(32121), AOM_ICDF(32433),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21369), AOM_ICDF(24262), AOM_ICDF(31827), AOM_ICDF(32158),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18971), AOM_ICDF(21127), AOM_ICDF(29319), AOM_ICDF(30612),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7710), AOM_ICDF(13493), AOM_ICDF(21203), AOM_ICDF(26985),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(30922), AOM_ICDF(31459), AOM_ICDF(32136), AOM_ICDF(32449),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22640), AOM_ICDF(24782), AOM_ICDF(31768), AOM_ICDF(32076),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12955), AOM_ICDF(14860), AOM_ICDF(28958), AOM_ICDF(30101),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7282), AOM_ICDF(12743), AOM_ICDF(21845), AOM_ICDF(27307),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(30469), AOM_ICDF(31279), AOM_ICDF(32115), AOM_ICDF(32446),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19748), AOM_ICDF(24367), AOM_ICDF(31900), AOM_ICDF(32257),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12684), AOM_ICDF(16120), AOM_ICDF(30125), AOM_ICDF(30918),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 32X32
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(8402), AOM_ICDF(9860), AOM_ICDF(23425), AOM_ICDF(26798),
-    AOM_ICDF(28753), AOM_ICDF(32768), },
-    {AOM_ICDF(4503), AOM_ICDF(7478), AOM_ICDF(14541), AOM_ICDF(19455),
-    AOM_ICDF(21058), AOM_ICDF(32768), },
-    {AOM_ICDF(1404), AOM_ICDF(4914), AOM_ICDF(7456), AOM_ICDF(13239),
-    AOM_ICDF(14005), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(11786), AOM_ICDF(17804), AOM_ICDF(26686), AOM_ICDF(27285),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10456), AOM_ICDF(16685), AOM_ICDF(26272), AOM_ICDF(27135),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8297), AOM_ICDF(12591), AOM_ICDF(23088), AOM_ICDF(24288),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6320), AOM_ICDF(8297), AOM_ICDF(18902), AOM_ICDF(20112),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4385), AOM_ICDF(4892), AOM_ICDF(12779), AOM_ICDF(13476),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2151), AOM_ICDF(2470), AOM_ICDF(6432), AOM_ICDF(6758),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(17988), AOM_ICDF(21025), AOM_ICDF(29658), AOM_ICDF(30075),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14641), AOM_ICDF(18188), AOM_ICDF(28759), AOM_ICDF(29202),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10951), AOM_ICDF(12924), AOM_ICDF(25087), AOM_ICDF(25515),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(9165), AOM_ICDF(20302), AOM_ICDF(20696),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5213), AOM_ICDF(5567), AOM_ICDF(14740), AOM_ICDF(15114),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2785), AOM_ICDF(3096), AOM_ICDF(8153), AOM_ICDF(8465),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(22839), AOM_ICDF(24625), AOM_ICDF(31013), AOM_ICDF(31343),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16111), AOM_ICDF(18689), AOM_ICDF(29552), AOM_ICDF(29896),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10736), AOM_ICDF(11502), AOM_ICDF(24493), AOM_ICDF(24827),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7153), AOM_ICDF(7570), AOM_ICDF(18744), AOM_ICDF(19067),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4285), AOM_ICDF(4591), AOM_ICDF(11651), AOM_ICDF(11957),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2064), AOM_ICDF(2322), AOM_ICDF(6321), AOM_ICDF(6579),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(24955), AOM_ICDF(26499), AOM_ICDF(31625), AOM_ICDF(31948),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17242), AOM_ICDF(19354), AOM_ICDF(30096), AOM_ICDF(30432),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10470), AOM_ICDF(11049), AOM_ICDF(24405), AOM_ICDF(24742),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6717), AOM_ICDF(7038), AOM_ICDF(17553), AOM_ICDF(17870),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4030), AOM_ICDF(4342), AOM_ICDF(11280), AOM_ICDF(11592),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2060), AOM_ICDF(2355), AOM_ICDF(6966), AOM_ICDF(7260),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29697), AOM_ICDF(30286), AOM_ICDF(32009), AOM_ICDF(32325),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18629), AOM_ICDF(19720), AOM_ICDF(30251), AOM_ICDF(30574),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9459), AOM_ICDF(9826), AOM_ICDF(22948), AOM_ICDF(23264),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5742), AOM_ICDF(6057), AOM_ICDF(16269), AOM_ICDF(16580),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3696), AOM_ICDF(4006), AOM_ICDF(11276), AOM_ICDF(11586),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2359), AOM_ICDF(2614), AOM_ICDF(5801), AOM_ICDF(6056),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(14224), AOM_ICDF(15827), AOM_ICDF(27984), AOM_ICDF(30263),
-    AOM_ICDF(31458), AOM_ICDF(32768), },
-    {AOM_ICDF(4253), AOM_ICDF(7150), AOM_ICDF(20729), AOM_ICDF(24629),
-    AOM_ICDF(28621), AOM_ICDF(32768), },
-    {AOM_ICDF(1405), AOM_ICDF(5159), AOM_ICDF(12422), AOM_ICDF(17006),
-    AOM_ICDF(24088), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(20029), AOM_ICDF(23525), AOM_ICDF(30941), AOM_ICDF(31369),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15691), AOM_ICDF(22792), AOM_ICDF(30520), AOM_ICDF(30960),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12036), AOM_ICDF(18829), AOM_ICDF(28256), AOM_ICDF(29025),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10881), AOM_ICDF(14586), AOM_ICDF(25416), AOM_ICDF(26318),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11249), AOM_ICDF(13311), AOM_ICDF(23713), AOM_ICDF(24498),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9444), AOM_ICDF(10609), AOM_ICDF(20170), AOM_ICDF(21025),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(23805), AOM_ICDF(26370), AOM_ICDF(31579), AOM_ICDF(31927),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16685), AOM_ICDF(21243), AOM_ICDF(30526), AOM_ICDF(30890),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11661), AOM_ICDF(14143), AOM_ICDF(26804), AOM_ICDF(27193),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8321), AOM_ICDF(9593), AOM_ICDF(21814), AOM_ICDF(22228),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6243), AOM_ICDF(6820), AOM_ICDF(16151), AOM_ICDF(16506),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3612), AOM_ICDF(4386), AOM_ICDF(9547), AOM_ICDF(10321),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26022), AOM_ICDF(27534), AOM_ICDF(31845), AOM_ICDF(32167),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18692), AOM_ICDF(21351), AOM_ICDF(30871), AOM_ICDF(31203),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11493), AOM_ICDF(12410), AOM_ICDF(26280), AOM_ICDF(26619),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7099), AOM_ICDF(7581), AOM_ICDF(19315), AOM_ICDF(19619),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3329), AOM_ICDF(3623), AOM_ICDF(10868), AOM_ICDF(11162),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3104), AOM_ICDF(4139), AOM_ICDF(10003), AOM_ICDF(11038),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28126), AOM_ICDF(29216), AOM_ICDF(32027), AOM_ICDF(32345),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19828), AOM_ICDF(22063), AOM_ICDF(31140), AOM_ICDF(31465),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11206), AOM_ICDF(11832), AOM_ICDF(25718), AOM_ICDF(26041),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6496), AOM_ICDF(6825), AOM_ICDF(18069), AOM_ICDF(18408),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4600), AOM_ICDF(4904), AOM_ICDF(12431), AOM_ICDF(12735),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2016), AOM_ICDF(3529), AOM_ICDF(8066), AOM_ICDF(9578),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(30246), AOM_ICDF(30814), AOM_ICDF(32096), AOM_ICDF(32411),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21165), AOM_ICDF(22238), AOM_ICDF(31122), AOM_ICDF(31445),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10123), AOM_ICDF(10519), AOM_ICDF(24102), AOM_ICDF(24419),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5968), AOM_ICDF(6277), AOM_ICDF(17606), AOM_ICDF(17924),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4312), AOM_ICDF(4620), AOM_ICDF(12131), AOM_ICDF(12439),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4608), AOM_ICDF(6144), AOM_ICDF(9216), AOM_ICDF(10752),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(22808), AOM_ICDF(23508), AOM_ICDF(29956), AOM_ICDF(30649),
-    AOM_ICDF(31698), AOM_ICDF(32768), },
-    {AOM_ICDF(11001), AOM_ICDF(12792), AOM_ICDF(25018), AOM_ICDF(27680),
-    AOM_ICDF(29623), AOM_ICDF(32768), },
-    {AOM_ICDF(6919), AOM_ICDF(10026), AOM_ICDF(19635), AOM_ICDF(24728),
-    AOM_ICDF(26490), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(12861), AOM_ICDF(25068), AOM_ICDF(30802), AOM_ICDF(31375),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11298), AOM_ICDF(21545), AOM_ICDF(29953), AOM_ICDF(30816),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13053), AOM_ICDF(24270), AOM_ICDF(28485), AOM_ICDF(29845),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7710), AOM_ICDF(15059), AOM_ICDF(26383), AOM_ICDF(28431),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8856), AOM_ICDF(10332), AOM_ICDF(18008), AOM_ICDF(19779),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3855), AOM_ICDF(7710), AOM_ICDF(19275), AOM_ICDF(22167),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(19458), AOM_ICDF(25796), AOM_ICDF(31754), AOM_ICDF(32007),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16458), AOM_ICDF(23827), AOM_ICDF(31294), AOM_ICDF(31638),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16274), AOM_ICDF(18913), AOM_ICDF(28150), AOM_ICDF(29029),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12429), AOM_ICDF(15254), AOM_ICDF(24858), AOM_ICDF(26553),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7399), AOM_ICDF(11627), AOM_ICDF(21141), AOM_ICDF(24312),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(17348), AOM_ICDF(23130),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(25493), AOM_ICDF(28975), AOM_ICDF(31960), AOM_ICDF(32271),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16904), AOM_ICDF(21759), AOM_ICDF(31381), AOM_ICDF(31728),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9709), AOM_ICDF(11529), AOM_ICDF(24879), AOM_ICDF(26700),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6554), AOM_ICDF(13107), AOM_ICDF(22938), AOM_ICDF(27853),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5461), AOM_ICDF(10923), AOM_ICDF(20025), AOM_ICDF(25486),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(26127), AOM_ICDF(28926), AOM_ICDF(31725), AOM_ICDF(32274),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17673), AOM_ICDF(25036), AOM_ICDF(31940), AOM_ICDF(32216),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14824), AOM_ICDF(17164), AOM_ICDF(26526), AOM_ICDF(28867),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7282), AOM_ICDF(16384), AOM_ICDF(21845), AOM_ICDF(27307),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(30683), AOM_ICDF(31149), AOM_ICDF(32155), AOM_ICDF(32449),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17896), AOM_ICDF(22055), AOM_ICDF(31508), AOM_ICDF(31886),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8548), AOM_ICDF(12822), AOM_ICDF(24220), AOM_ICDF(28494),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(27393), AOM_ICDF(28900), AOM_ICDF(31555), AOM_ICDF(31971),
-    AOM_ICDF(32368), AOM_ICDF(32768), },
-    {AOM_ICDF(8379), AOM_ICDF(19364), AOM_ICDF(27675), AOM_ICDF(28688),
-    AOM_ICDF(31114), AOM_ICDF(32768), },
-    {AOM_ICDF(1955), AOM_ICDF(19256), AOM_ICDF(24580), AOM_ICDF(25370),
-    AOM_ICDF(30257), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(31085), AOM_ICDF(31718), AOM_ICDF(32129), AOM_ICDF(32443),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14336), AOM_ICDF(26852), AOM_ICDF(31370), AOM_ICDF(31760),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11751), AOM_ICDF(23544), AOM_ICDF(28851), AOM_ICDF(29567),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14670), AOM_ICDF(21251), AOM_ICDF(28381), AOM_ICDF(29752),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14832), AOM_ICDF(19316), AOM_ICDF(27134), AOM_ICDF(28974),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13312), AOM_ICDF(15360), AOM_ICDF(25600), AOM_ICDF(27648),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(31302), AOM_ICDF(31746), AOM_ICDF(32144), AOM_ICDF(32455),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18343), AOM_ICDF(26723), AOM_ICDF(32018), AOM_ICDF(32434),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10570), AOM_ICDF(16913), AOM_ICDF(29068), AOM_ICDF(30125),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5174), AOM_ICDF(13797), AOM_ICDF(24145), AOM_ICDF(26732),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(21203), AOM_ICDF(26985),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(31420), AOM_ICDF(31795), AOM_ICDF(32144), AOM_ICDF(32455),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21510), AOM_ICDF(28245), AOM_ICDF(32064), AOM_ICDF(32366),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6342), AOM_ICDF(11627), AOM_ICDF(25369), AOM_ICDF(28540),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(31470), AOM_ICDF(31806), AOM_ICDF(32143), AOM_ICDF(32455),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19571), AOM_ICDF(25722), AOM_ICDF(31538), AOM_ICDF(31985),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5461), AOM_ICDF(8738), AOM_ICDF(25122), AOM_ICDF(28399),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(31292), AOM_ICDF(31637), AOM_ICDF(32104), AOM_ICDF(32431),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12774), AOM_ICDF(16652), AOM_ICDF(30002), AOM_ICDF(30986),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4652), AOM_ICDF(11442), AOM_ICDF(30231), AOM_ICDF(30593),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7022), AOM_ICDF(10031), AOM_ICDF(28087), AOM_ICDF(29090),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-};
+static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs
+    [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]
+    [CDF_SIZE(BR_CDF_SIZE)] = {
+      { { { { AOM_CDF4(14298, 20718, 24174) },
+            { AOM_CDF4(12536, 19601, 23789) },
+            { AOM_CDF4(8712, 15051, 19503) },
+            { AOM_CDF4(6170, 11327, 15434) },
+            { AOM_CDF4(4742, 8926, 12538) },
+            { AOM_CDF4(3803, 7317, 10546) },
+            { AOM_CDF4(1696, 3317, 4871) },
+            { AOM_CDF4(14392, 19951, 22756) },
+            { AOM_CDF4(15978, 23218, 26818) },
+            { AOM_CDF4(12187, 19474, 23889) },
+            { AOM_CDF4(9176, 15640, 20259) },
+            { AOM_CDF4(7068, 12655, 17028) },
+            { AOM_CDF4(5656, 10442, 14472) },
+            { AOM_CDF4(2580, 4992, 7244) },
+            { AOM_CDF4(12136, 18049, 21426) },
+            { AOM_CDF4(13784, 20721, 24481) },
+            { AOM_CDF4(10836, 17621, 21900) },
+            { AOM_CDF4(8372, 14444, 18847) },
+            { AOM_CDF4(6523, 11779, 16000) },
+            { AOM_CDF4(5337, 9898, 13760) },
+            { AOM_CDF4(3034, 5860, 8462) } },
+          { { AOM_CDF4(15967, 22905, 26286) },
+            { AOM_CDF4(13534, 20654, 24579) },
+            { AOM_CDF4(9504, 16092, 20535) },
+            { AOM_CDF4(6975, 12568, 16903) },
+            { AOM_CDF4(5364, 10091, 14020) },
+            { AOM_CDF4(4357, 8370, 11857) },
+            { AOM_CDF4(2506, 4934, 7218) },
+            { AOM_CDF4(23032, 28815, 30936) },
+            { AOM_CDF4(19540, 26704, 29719) },
+            { AOM_CDF4(15158, 22969, 27097) },
+            { AOM_CDF4(11408, 18865, 23650) },
+            { AOM_CDF4(8885, 15448, 20250) },
+            { AOM_CDF4(7108, 12853, 17416) },
+            { AOM_CDF4(4231, 8041, 11480) },
+            { AOM_CDF4(19823, 26490, 29156) },
+            { AOM_CDF4(18890, 25929, 28932) },
+            { AOM_CDF4(15660, 23491, 27433) },
+            { AOM_CDF4(12147, 19776, 24488) },
+            { AOM_CDF4(9728, 16774, 21649) },
+            { AOM_CDF4(7919, 14277, 19066) },
+            { AOM_CDF4(5440, 10170, 14185) } } },
+        { { { AOM_CDF4(14406, 20862, 24414) },
+            { AOM_CDF4(11824, 18907, 23109) },
+            { AOM_CDF4(8257, 14393, 18803) },
+            { AOM_CDF4(5860, 10747, 14778) },
+            { AOM_CDF4(4475, 8486, 11984) },
+            { AOM_CDF4(3606, 6954, 10043) },
+            { AOM_CDF4(1736, 3410, 5048) },
+            { AOM_CDF4(14430, 20046, 22882) },
+            { AOM_CDF4(15593, 22899, 26709) },
+            { AOM_CDF4(12102, 19368, 23811) },
+            { AOM_CDF4(9059, 15584, 20262) },
+            { AOM_CDF4(6999, 12603, 17048) },
+            { AOM_CDF4(5684, 10497, 14553) },
+            { AOM_CDF4(2822, 5438, 7862) },
+            { AOM_CDF4(15785, 21585, 24359) },
+            { AOM_CDF4(18347, 25229, 28266) },
+            { AOM_CDF4(14974, 22487, 26389) },
+            { AOM_CDF4(11423, 18681, 23271) },
+            { AOM_CDF4(8863, 15350, 20008) },
+            { AOM_CDF4(7153, 12852, 17278) },
+            { AOM_CDF4(3707, 7036, 9982) } },
+          { { AOM_CDF4(15460, 21696, 25469) },
+            { AOM_CDF4(12170, 19249, 23191) },
+            { AOM_CDF4(8723, 15027, 19332) },
+            { AOM_CDF4(6428, 11704, 15874) },
+            { AOM_CDF4(4922, 9292, 13052) },
+            { AOM_CDF4(4139, 7695, 11010) },
+            { AOM_CDF4(2291, 4508, 6598) },
+            { AOM_CDF4(19856, 26920, 29828) },
+            { AOM_CDF4(17923, 25289, 28792) },
+            { AOM_CDF4(14278, 21968, 26297) },
+            { AOM_CDF4(10910, 18136, 22950) },
+            { AOM_CDF4(8423, 14815, 19627) },
+            { AOM_CDF4(6771, 12283, 16774) },
+            { AOM_CDF4(4074, 7750, 11081) },
+            { AOM_CDF4(19852, 26074, 28672) },
+            { AOM_CDF4(19371, 26110, 28989) },
+            { AOM_CDF4(16265, 23873, 27663) },
+            { AOM_CDF4(12758, 20378, 24952) },
+            { AOM_CDF4(10095, 17098, 21961) },
+            { AOM_CDF4(8250, 14628, 19451) },
+            { AOM_CDF4(5205, 9745, 13622) } } },
+        { { { AOM_CDF4(10563, 16233, 19763) },
+            { AOM_CDF4(9794, 16022, 19804) },
+            { AOM_CDF4(6750, 11945, 15759) },
+            { AOM_CDF4(4963, 9186, 12752) },
+            { AOM_CDF4(3845, 7435, 10627) },
+            { AOM_CDF4(3051, 6085, 8834) },
+            { AOM_CDF4(1311, 2596, 3830) },
+            { AOM_CDF4(11246, 16404, 19689) },
+            { AOM_CDF4(12315, 18911, 22731) },
+            { AOM_CDF4(10557, 17095, 21289) },
+            { AOM_CDF4(8136, 14006, 18249) },
+            { AOM_CDF4(6348, 11474, 15565) },
+            { AOM_CDF4(5196, 9655, 13400) },
+            { AOM_CDF4(2349, 4526, 6587) },
+            { AOM_CDF4(13337, 18730, 21569) },
+            { AOM_CDF4(19306, 26071, 28882) },
+            { AOM_CDF4(15952, 23540, 27254) },
+            { AOM_CDF4(12409, 19934, 24430) },
+            { AOM_CDF4(9760, 16706, 21389) },
+            { AOM_CDF4(8004, 14220, 18818) },
+            { AOM_CDF4(4138, 7794, 10961) } },
+          { { AOM_CDF4(10870, 16684, 20949) },
+            { AOM_CDF4(9664, 15230, 18680) },
+            { AOM_CDF4(6886, 12109, 15408) },
+            { AOM_CDF4(4825, 8900, 12305) },
+            { AOM_CDF4(3630, 7162, 10314) },
+            { AOM_CDF4(3036, 6429, 9387) },
+            { AOM_CDF4(1671, 3296, 4940) },
+            { AOM_CDF4(13819, 19159, 23026) },
+            { AOM_CDF4(11984, 19108, 23120) },
+            { AOM_CDF4(10690, 17210, 21663) },
+            { AOM_CDF4(7984, 14154, 18333) },
+            { AOM_CDF4(6868, 12294, 16124) },
+            { AOM_CDF4(5274, 8994, 12868) },
+            { AOM_CDF4(2988, 5771, 8424) },
+            { AOM_CDF4(19736, 26647, 29141) },
+            { AOM_CDF4(18933, 26070, 28984) },
+            { AOM_CDF4(15779, 23048, 27200) },
+            { AOM_CDF4(12638, 20061, 24532) },
+            { AOM_CDF4(10692, 17545, 22220) },
+            { AOM_CDF4(9217, 15251, 20054) },
+            { AOM_CDF4(5078, 9284, 12594) } } },
+        { { { AOM_CDF4(2331, 3662, 5244) },
+            { AOM_CDF4(2891, 4771, 6145) },
+            { AOM_CDF4(4598, 7623, 9729) },
+            { AOM_CDF4(3520, 6845, 9199) },
+            { AOM_CDF4(3417, 6119, 9324) },
+            { AOM_CDF4(2601, 5412, 7385) },
+            { AOM_CDF4(600, 1173, 1744) },
+            { AOM_CDF4(7672, 13286, 17469) },
+            { AOM_CDF4(4232, 7792, 10793) },
+            { AOM_CDF4(2915, 5317, 7397) },
+            { AOM_CDF4(2318, 4356, 6152) },
+            { AOM_CDF4(2127, 4000, 5554) },
+            { AOM_CDF4(1850, 3478, 5275) },
+            { AOM_CDF4(977, 1933, 2843) },
+            { AOM_CDF4(18280, 24387, 27989) },
+            { AOM_CDF4(15852, 22671, 26185) },
+            { AOM_CDF4(13845, 20951, 24789) },
+            { AOM_CDF4(11055, 17966, 22129) },
+            { AOM_CDF4(9138, 15422, 19801) },
+            { AOM_CDF4(7454, 13145, 17456) },
+            { AOM_CDF4(3370, 6393, 9013) } },
+          { { AOM_CDF4(5842, 9229, 10838) },
+            { AOM_CDF4(2313, 3491, 4276) },
+            { AOM_CDF4(2998, 6104, 7496) },
+            { AOM_CDF4(2420, 7447, 9868) },
+            { AOM_CDF4(3034, 8495, 10923) },
+            { AOM_CDF4(4076, 8937, 10975) },
+            { AOM_CDF4(1086, 2370, 3299) },
+            { AOM_CDF4(9714, 17254, 20444) },
+            { AOM_CDF4(8543, 13698, 17123) },
+            { AOM_CDF4(4918, 9007, 11910) },
+            { AOM_CDF4(4129, 7532, 10553) },
+            { AOM_CDF4(2364, 5533, 8058) },
+            { AOM_CDF4(1834, 3546, 5563) },
+            { AOM_CDF4(1473, 2908, 4133) },
+            { AOM_CDF4(15405, 21193, 25619) },
+            { AOM_CDF4(15691, 21952, 26561) },
+            { AOM_CDF4(12962, 19194, 24165) },
+            { AOM_CDF4(10272, 17855, 22129) },
+            { AOM_CDF4(8588, 15270, 20718) },
+            { AOM_CDF4(8682, 14669, 19500) },
+            { AOM_CDF4(4870, 9636, 13205) } } },
+        { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) } },
+          { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) } } } },
+      { { { { AOM_CDF4(14995, 21341, 24749) },
+            { AOM_CDF4(13158, 20289, 24601) },
+            { AOM_CDF4(8941, 15326, 19876) },
+            { AOM_CDF4(6297, 11541, 15807) },
+            { AOM_CDF4(4817, 9029, 12776) },
+            { AOM_CDF4(3731, 7273, 10627) },
+            { AOM_CDF4(1847, 3617, 5354) },
+            { AOM_CDF4(14472, 19659, 22343) },
+            { AOM_CDF4(16806, 24162, 27533) },
+            { AOM_CDF4(12900, 20404, 24713) },
+            { AOM_CDF4(9411, 16112, 20797) },
+            { AOM_CDF4(7056, 12697, 17148) },
+            { AOM_CDF4(5544, 10339, 14460) },
+            { AOM_CDF4(2954, 5704, 8319) },
+            { AOM_CDF4(12464, 18071, 21354) },
+            { AOM_CDF4(15482, 22528, 26034) },
+            { AOM_CDF4(12070, 19269, 23624) },
+            { AOM_CDF4(8953, 15406, 20106) },
+            { AOM_CDF4(7027, 12730, 17220) },
+            { AOM_CDF4(5887, 10913, 15140) },
+            { AOM_CDF4(3793, 7278, 10447) } },
+          { { AOM_CDF4(15571, 22232, 25749) },
+            { AOM_CDF4(14506, 21575, 25374) },
+            { AOM_CDF4(10189, 17089, 21569) },
+            { AOM_CDF4(7316, 13301, 17915) },
+            { AOM_CDF4(5783, 10912, 15190) },
+            { AOM_CDF4(4760, 9155, 13088) },
+            { AOM_CDF4(2993, 5966, 8774) },
+            { AOM_CDF4(23424, 28903, 30778) },
+            { AOM_CDF4(20775, 27666, 30290) },
+            { AOM_CDF4(16474, 24410, 28299) },
+            { AOM_CDF4(12471, 20180, 24987) },
+            { AOM_CDF4(9410, 16487, 21439) },
+            { AOM_CDF4(7536, 13614, 18529) },
+            { AOM_CDF4(5048, 9586, 13549) },
+            { AOM_CDF4(21090, 27290, 29756) },
+            { AOM_CDF4(20796, 27402, 30026) },
+            { AOM_CDF4(17819, 25485, 28969) },
+            { AOM_CDF4(13860, 21909, 26462) },
+            { AOM_CDF4(11002, 18494, 23529) },
+            { AOM_CDF4(8953, 15929, 20897) },
+            { AOM_CDF4(6448, 11918, 16454) } } },
+        { { { AOM_CDF4(15999, 22208, 25449) },
+            { AOM_CDF4(13050, 19988, 24122) },
+            { AOM_CDF4(8594, 14864, 19378) },
+            { AOM_CDF4(6033, 11079, 15238) },
+            { AOM_CDF4(4554, 8683, 12347) },
+            { AOM_CDF4(3672, 7139, 10337) },
+            { AOM_CDF4(1900, 3771, 5576) },
+            { AOM_CDF4(15788, 21340, 23949) },
+            { AOM_CDF4(16825, 24235, 27758) },
+            { AOM_CDF4(12873, 20402, 24810) },
+            { AOM_CDF4(9590, 16363, 21094) },
+            { AOM_CDF4(7352, 13209, 17733) },
+            { AOM_CDF4(5960, 10989, 15184) },
+            { AOM_CDF4(3232, 6234, 9007) },
+            { AOM_CDF4(15761, 20716, 23224) },
+            { AOM_CDF4(19318, 25989, 28759) },
+            { AOM_CDF4(15529, 23094, 26929) },
+            { AOM_CDF4(11662, 18989, 23641) },
+            { AOM_CDF4(8955, 15568, 20366) },
+            { AOM_CDF4(7281, 13106, 17708) },
+            { AOM_CDF4(4248, 8059, 11440) } },
+          { { AOM_CDF4(14899, 21217, 24503) },
+            { AOM_CDF4(13519, 20283, 24047) },
+            { AOM_CDF4(9429, 15966, 20365) },
+            { AOM_CDF4(6700, 12355, 16652) },
+            { AOM_CDF4(5088, 9704, 13716) },
+            { AOM_CDF4(4243, 8154, 11731) },
+            { AOM_CDF4(2702, 5364, 7861) },
+            { AOM_CDF4(22745, 28388, 30454) },
+            { AOM_CDF4(20235, 27146, 29922) },
+            { AOM_CDF4(15896, 23715, 27637) },
+            { AOM_CDF4(11840, 19350, 24131) },
+            { AOM_CDF4(9122, 15932, 20880) },
+            { AOM_CDF4(7488, 13581, 18362) },
+            { AOM_CDF4(5114, 9568, 13370) },
+            { AOM_CDF4(20845, 26553, 28932) },
+            { AOM_CDF4(20981, 27372, 29884) },
+            { AOM_CDF4(17781, 25335, 28785) },
+            { AOM_CDF4(13760, 21708, 26297) },
+            { AOM_CDF4(10975, 18415, 23365) },
+            { AOM_CDF4(9045, 15789, 20686) },
+            { AOM_CDF4(6130, 11199, 15423) } } },
+        { { { AOM_CDF4(13549, 19724, 23158) },
+            { AOM_CDF4(11844, 18382, 22246) },
+            { AOM_CDF4(7919, 13619, 17773) },
+            { AOM_CDF4(5486, 10143, 13946) },
+            { AOM_CDF4(4166, 7983, 11324) },
+            { AOM_CDF4(3364, 6506, 9427) },
+            { AOM_CDF4(1598, 3160, 4674) },
+            { AOM_CDF4(15281, 20979, 23781) },
+            { AOM_CDF4(14939, 22119, 25952) },
+            { AOM_CDF4(11363, 18407, 22812) },
+            { AOM_CDF4(8609, 14857, 19370) },
+            { AOM_CDF4(6737, 12184, 16480) },
+            { AOM_CDF4(5506, 10263, 14262) },
+            { AOM_CDF4(2990, 5786, 8380) },
+            { AOM_CDF4(20249, 25253, 27417) },
+            { AOM_CDF4(21070, 27518, 30001) },
+            { AOM_CDF4(16854, 24469, 28074) },
+            { AOM_CDF4(12864, 20486, 25000) },
+            { AOM_CDF4(9962, 16978, 21778) },
+            { AOM_CDF4(8074, 14338, 19048) },
+            { AOM_CDF4(4494, 8479, 11906) } },
+          { { AOM_CDF4(13960, 19617, 22829) },
+            { AOM_CDF4(11150, 17341, 21228) },
+            { AOM_CDF4(7150, 12964, 17190) },
+            { AOM_CDF4(5331, 10002, 13867) },
+            { AOM_CDF4(4167, 7744, 11057) },
+            { AOM_CDF4(3480, 6629, 9646) },
+            { AOM_CDF4(1883, 3784, 5686) },
+            { AOM_CDF4(18752, 25660, 28912) },
+            { AOM_CDF4(16968, 24586, 28030) },
+            { AOM_CDF4(13520, 21055, 25313) },
+            { AOM_CDF4(10453, 17626, 22280) },
+            { AOM_CDF4(8386, 14505, 19116) },
+            { AOM_CDF4(6742, 12595, 17008) },
+            { AOM_CDF4(4273, 8140, 11499) },
+            { AOM_CDF4(22120, 27827, 30233) },
+            { AOM_CDF4(20563, 27358, 29895) },
+            { AOM_CDF4(17076, 24644, 28153) },
+            { AOM_CDF4(13362, 20942, 25309) },
+            { AOM_CDF4(10794, 17965, 22695) },
+            { AOM_CDF4(9014, 15652, 20319) },
+            { AOM_CDF4(5708, 10512, 14497) } } },
+        { { { AOM_CDF4(5705, 10930, 15725) },
+            { AOM_CDF4(7946, 12765, 16115) },
+            { AOM_CDF4(6801, 12123, 16226) },
+            { AOM_CDF4(5462, 10135, 14200) },
+            { AOM_CDF4(4189, 8011, 11507) },
+            { AOM_CDF4(3191, 6229, 9408) },
+            { AOM_CDF4(1057, 2137, 3212) },
+            { AOM_CDF4(10018, 17067, 21491) },
+            { AOM_CDF4(7380, 12582, 16453) },
+            { AOM_CDF4(6068, 10845, 14339) },
+            { AOM_CDF4(5098, 9198, 12555) },
+            { AOM_CDF4(4312, 8010, 11119) },
+            { AOM_CDF4(3700, 6966, 9781) },
+            { AOM_CDF4(1693, 3326, 4887) },
+            { AOM_CDF4(18757, 24930, 27774) },
+            { AOM_CDF4(17648, 24596, 27817) },
+            { AOM_CDF4(14707, 22052, 26026) },
+            { AOM_CDF4(11720, 18852, 23292) },
+            { AOM_CDF4(9357, 15952, 20525) },
+            { AOM_CDF4(7810, 13753, 18210) },
+            { AOM_CDF4(3879, 7333, 10328) } },
+          { { AOM_CDF4(8278, 13242, 15922) },
+            { AOM_CDF4(10547, 15867, 18919) },
+            { AOM_CDF4(9106, 15842, 20609) },
+            { AOM_CDF4(6833, 13007, 17218) },
+            { AOM_CDF4(4811, 9712, 13923) },
+            { AOM_CDF4(3985, 7352, 11128) },
+            { AOM_CDF4(1688, 3458, 5262) },
+            { AOM_CDF4(12951, 21861, 26510) },
+            { AOM_CDF4(9788, 16044, 20276) },
+            { AOM_CDF4(6309, 11244, 14870) },
+            { AOM_CDF4(5183, 9349, 12566) },
+            { AOM_CDF4(4389, 8229, 11492) },
+            { AOM_CDF4(3633, 6945, 10620) },
+            { AOM_CDF4(3600, 6847, 9907) },
+            { AOM_CDF4(21748, 28137, 30255) },
+            { AOM_CDF4(19436, 26581, 29560) },
+            { AOM_CDF4(16359, 24201, 27953) },
+            { AOM_CDF4(13961, 21693, 25871) },
+            { AOM_CDF4(11544, 18686, 23322) },
+            { AOM_CDF4(9372, 16462, 20952) },
+            { AOM_CDF4(6138, 11210, 15390) } } },
+        { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) } },
+          { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) } } } },
+      { { { { AOM_CDF4(16138, 22223, 25509) },
+            { AOM_CDF4(15347, 22430, 26332) },
+            { AOM_CDF4(9614, 16736, 21332) },
+            { AOM_CDF4(6600, 12275, 16907) },
+            { AOM_CDF4(4811, 9424, 13547) },
+            { AOM_CDF4(3748, 7809, 11420) },
+            { AOM_CDF4(2254, 4587, 6890) },
+            { AOM_CDF4(15196, 20284, 23177) },
+            { AOM_CDF4(18317, 25469, 28451) },
+            { AOM_CDF4(13918, 21651, 25842) },
+            { AOM_CDF4(10052, 17150, 21995) },
+            { AOM_CDF4(7499, 13630, 18587) },
+            { AOM_CDF4(6158, 11417, 16003) },
+            { AOM_CDF4(4014, 7785, 11252) },
+            { AOM_CDF4(15048, 21067, 24384) },
+            { AOM_CDF4(18202, 25346, 28553) },
+            { AOM_CDF4(14302, 22019, 26356) },
+            { AOM_CDF4(10839, 18139, 23166) },
+            { AOM_CDF4(8715, 15744, 20806) },
+            { AOM_CDF4(7536, 13576, 18544) },
+            { AOM_CDF4(5413, 10335, 14498) } },
+          { { AOM_CDF4(17394, 24501, 27895) },
+            { AOM_CDF4(15889, 23420, 27185) },
+            { AOM_CDF4(11561, 19133, 23870) },
+            { AOM_CDF4(8285, 14812, 19844) },
+            { AOM_CDF4(6496, 12043, 16550) },
+            { AOM_CDF4(4771, 9574, 13677) },
+            { AOM_CDF4(3603, 6830, 10144) },
+            { AOM_CDF4(21656, 27704, 30200) },
+            { AOM_CDF4(21324, 27915, 30511) },
+            { AOM_CDF4(17327, 25336, 28997) },
+            { AOM_CDF4(13417, 21381, 26033) },
+            { AOM_CDF4(10132, 17425, 22338) },
+            { AOM_CDF4(8580, 15016, 19633) },
+            { AOM_CDF4(5694, 11477, 16411) },
+            { AOM_CDF4(24116, 29780, 31450) },
+            { AOM_CDF4(23853, 29695, 31591) },
+            { AOM_CDF4(20085, 27614, 30428) },
+            { AOM_CDF4(15326, 24335, 28575) },
+            { AOM_CDF4(11814, 19472, 24810) },
+            { AOM_CDF4(10221, 18611, 24767) },
+            { AOM_CDF4(7689, 14558, 20321) } } },
+        { { { AOM_CDF4(16214, 22380, 25770) },
+            { AOM_CDF4(14213, 21304, 25295) },
+            { AOM_CDF4(9213, 15823, 20455) },
+            { AOM_CDF4(6395, 11758, 16139) },
+            { AOM_CDF4(4779, 9187, 13066) },
+            { AOM_CDF4(3821, 7501, 10953) },
+            { AOM_CDF4(2293, 4567, 6795) },
+            { AOM_CDF4(15859, 21283, 23820) },
+            { AOM_CDF4(18404, 25602, 28726) },
+            { AOM_CDF4(14325, 21980, 26206) },
+            { AOM_CDF4(10669, 17937, 22720) },
+            { AOM_CDF4(8297, 14642, 19447) },
+            { AOM_CDF4(6746, 12389, 16893) },
+            { AOM_CDF4(4324, 8251, 11770) },
+            { AOM_CDF4(16532, 21631, 24475) },
+            { AOM_CDF4(20667, 27150, 29668) },
+            { AOM_CDF4(16728, 24510, 28175) },
+            { AOM_CDF4(12861, 20645, 25332) },
+            { AOM_CDF4(10076, 17361, 22417) },
+            { AOM_CDF4(8395, 14940, 19963) },
+            { AOM_CDF4(5731, 10683, 14912) } },
+          { { AOM_CDF4(14433, 21155, 24938) },
+            { AOM_CDF4(14658, 21716, 25545) },
+            { AOM_CDF4(9923, 16824, 21557) },
+            { AOM_CDF4(6982, 13052, 17721) },
+            { AOM_CDF4(5419, 10503, 15050) },
+            { AOM_CDF4(4852, 9162, 13014) },
+            { AOM_CDF4(3271, 6395, 9630) },
+            { AOM_CDF4(22210, 27833, 30109) },
+            { AOM_CDF4(20750, 27368, 29821) },
+            { AOM_CDF4(16894, 24828, 28573) },
+            { AOM_CDF4(13247, 21276, 25757) },
+            { AOM_CDF4(10038, 17265, 22563) },
+            { AOM_CDF4(8587, 14947, 20327) },
+            { AOM_CDF4(5645, 11371, 15252) },
+            { AOM_CDF4(22027, 27526, 29714) },
+            { AOM_CDF4(23098, 29146, 31221) },
+            { AOM_CDF4(19886, 27341, 30272) },
+            { AOM_CDF4(15609, 23747, 28046) },
+            { AOM_CDF4(11993, 20065, 24939) },
+            { AOM_CDF4(9637, 18267, 23671) },
+            { AOM_CDF4(7625, 13801, 19144) } } },
+        { { { AOM_CDF4(14438, 20798, 24089) },
+            { AOM_CDF4(12621, 19203, 23097) },
+            { AOM_CDF4(8177, 14125, 18402) },
+            { AOM_CDF4(5674, 10501, 14456) },
+            { AOM_CDF4(4236, 8239, 11733) },
+            { AOM_CDF4(3447, 6750, 9806) },
+            { AOM_CDF4(1986, 3950, 5864) },
+            { AOM_CDF4(16208, 22099, 24930) },
+            { AOM_CDF4(16537, 24025, 27585) },
+            { AOM_CDF4(12780, 20381, 24867) },
+            { AOM_CDF4(9767, 16612, 21416) },
+            { AOM_CDF4(7686, 13738, 18398) },
+            { AOM_CDF4(6333, 11614, 15964) },
+            { AOM_CDF4(3941, 7571, 10836) },
+            { AOM_CDF4(22819, 27422, 29202) },
+            { AOM_CDF4(22224, 28514, 30721) },
+            { AOM_CDF4(17660, 25433, 28913) },
+            { AOM_CDF4(13574, 21482, 26002) },
+            { AOM_CDF4(10629, 17977, 22938) },
+            { AOM_CDF4(8612, 15298, 20265) },
+            { AOM_CDF4(5607, 10491, 14596) } },
+          { { AOM_CDF4(13569, 19800, 23206) },
+            { AOM_CDF4(13128, 19924, 23869) },
+            { AOM_CDF4(8329, 14841, 19403) },
+            { AOM_CDF4(6130, 10976, 15057) },
+            { AOM_CDF4(4682, 8839, 12518) },
+            { AOM_CDF4(3656, 7409, 10588) },
+            { AOM_CDF4(2577, 5099, 7412) },
+            { AOM_CDF4(22427, 28684, 30585) },
+            { AOM_CDF4(20913, 27750, 30139) },
+            { AOM_CDF4(15840, 24109, 27834) },
+            { AOM_CDF4(12308, 20029, 24569) },
+            { AOM_CDF4(10216, 16785, 21458) },
+            { AOM_CDF4(8309, 14203, 19113) },
+            { AOM_CDF4(6043, 11168, 15307) },
+            { AOM_CDF4(23166, 28901, 30998) },
+            { AOM_CDF4(21899, 28405, 30751) },
+            { AOM_CDF4(18413, 26091, 29443) },
+            { AOM_CDF4(15233, 23114, 27352) },
+            { AOM_CDF4(12683, 20472, 25288) },
+            { AOM_CDF4(10702, 18259, 23409) },
+            { AOM_CDF4(8125, 14464, 19226) } } },
+        { { { AOM_CDF4(9040, 14786, 18360) },
+            { AOM_CDF4(9979, 15718, 19415) },
+            { AOM_CDF4(7913, 13918, 18311) },
+            { AOM_CDF4(5859, 10889, 15184) },
+            { AOM_CDF4(4593, 8677, 12510) },
+            { AOM_CDF4(3820, 7396, 10791) },
+            { AOM_CDF4(1730, 3471, 5192) },
+            { AOM_CDF4(11803, 18365, 22709) },
+            { AOM_CDF4(11419, 18058, 22225) },
+            { AOM_CDF4(9418, 15774, 20243) },
+            { AOM_CDF4(7539, 13325, 17657) },
+            { AOM_CDF4(6233, 11317, 15384) },
+            { AOM_CDF4(5137, 9656, 13545) },
+            { AOM_CDF4(2977, 5774, 8349) },
+            { AOM_CDF4(21207, 27246, 29640) },
+            { AOM_CDF4(19547, 26578, 29497) },
+            { AOM_CDF4(16169, 23871, 27690) },
+            { AOM_CDF4(12820, 20458, 25018) },
+            { AOM_CDF4(10224, 17332, 22214) },
+            { AOM_CDF4(8526, 15048, 19884) },
+            { AOM_CDF4(5037, 9410, 13118) } },
+          { { AOM_CDF4(12339, 17329, 20140) },
+            { AOM_CDF4(13505, 19895, 23225) },
+            { AOM_CDF4(9847, 16944, 21564) },
+            { AOM_CDF4(7280, 13256, 18348) },
+            { AOM_CDF4(4712, 10009, 14454) },
+            { AOM_CDF4(4361, 7914, 12477) },
+            { AOM_CDF4(2870, 5628, 7995) },
+            { AOM_CDF4(20061, 25504, 28526) },
+            { AOM_CDF4(15235, 22878, 26145) },
+            { AOM_CDF4(12985, 19958, 24155) },
+            { AOM_CDF4(9782, 16641, 21403) },
+            { AOM_CDF4(9456, 16360, 20760) },
+            { AOM_CDF4(6855, 12940, 18557) },
+            { AOM_CDF4(5661, 10564, 15002) },
+            { AOM_CDF4(25656, 30602, 31894) },
+            { AOM_CDF4(22570, 29107, 31092) },
+            { AOM_CDF4(18917, 26423, 29541) },
+            { AOM_CDF4(15940, 23649, 27754) },
+            { AOM_CDF4(12803, 20581, 25219) },
+            { AOM_CDF4(11082, 18695, 23376) },
+            { AOM_CDF4(7939, 14373, 19005) } } },
+        { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) } },
+          { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) } } } },
+      { { { { AOM_CDF4(18315, 24289, 27551) },
+            { AOM_CDF4(16854, 24068, 27835) },
+            { AOM_CDF4(10140, 17927, 23173) },
+            { AOM_CDF4(6722, 12982, 18267) },
+            { AOM_CDF4(4661, 9826, 14706) },
+            { AOM_CDF4(3832, 8165, 12294) },
+            { AOM_CDF4(2795, 6098, 9245) },
+            { AOM_CDF4(17145, 23326, 26672) },
+            { AOM_CDF4(20733, 27680, 30308) },
+            { AOM_CDF4(16032, 24461, 28546) },
+            { AOM_CDF4(11653, 20093, 25081) },
+            { AOM_CDF4(9290, 16429, 22086) },
+            { AOM_CDF4(7796, 14598, 19982) },
+            { AOM_CDF4(6502, 12378, 17441) },
+            { AOM_CDF4(21681, 27732, 30320) },
+            { AOM_CDF4(22389, 29044, 31261) },
+            { AOM_CDF4(19027, 26731, 30087) },
+            { AOM_CDF4(14739, 23755, 28624) },
+            { AOM_CDF4(11358, 20778, 25511) },
+            { AOM_CDF4(10995, 18073, 24190) },
+            { AOM_CDF4(9162, 14990, 20617) } },
+          { { AOM_CDF4(21425, 27952, 30388) },
+            { AOM_CDF4(18062, 25838, 29034) },
+            { AOM_CDF4(11956, 19881, 24808) },
+            { AOM_CDF4(7718, 15000, 20980) },
+            { AOM_CDF4(5702, 11254, 16143) },
+            { AOM_CDF4(4898, 9088, 16864) },
+            { AOM_CDF4(3679, 6776, 11907) },
+            { AOM_CDF4(23294, 30160, 31663) },
+            { AOM_CDF4(24397, 29896, 31836) },
+            { AOM_CDF4(19245, 27128, 30593) },
+            { AOM_CDF4(13202, 19825, 26404) },
+            { AOM_CDF4(11578, 19297, 23957) },
+            { AOM_CDF4(8073, 13297, 21370) },
+            { AOM_CDF4(5461, 10923, 19745) },
+            { AOM_CDF4(27367, 30521, 31934) },
+            { AOM_CDF4(24904, 30671, 31940) },
+            { AOM_CDF4(23075, 28460, 31299) },
+            { AOM_CDF4(14400, 23658, 30417) },
+            { AOM_CDF4(13885, 23882, 28325) },
+            { AOM_CDF4(14746, 22938, 27853) },
+            { AOM_CDF4(5461, 16384, 27307) } } },
+        { { { AOM_CDF4(18274, 24813, 27890) },
+            { AOM_CDF4(15537, 23149, 27003) },
+            { AOM_CDF4(9449, 16740, 21827) },
+            { AOM_CDF4(6700, 12498, 17261) },
+            { AOM_CDF4(4988, 9866, 14198) },
+            { AOM_CDF4(4236, 8147, 11902) },
+            { AOM_CDF4(2867, 5860, 8654) },
+            { AOM_CDF4(17124, 23171, 26101) },
+            { AOM_CDF4(20396, 27477, 30148) },
+            { AOM_CDF4(16573, 24629, 28492) },
+            { AOM_CDF4(12749, 20846, 25674) },
+            { AOM_CDF4(10233, 17878, 22818) },
+            { AOM_CDF4(8525, 15332, 20363) },
+            { AOM_CDF4(6283, 11632, 16255) },
+            { AOM_CDF4(20466, 26511, 29286) },
+            { AOM_CDF4(23059, 29174, 31191) },
+            { AOM_CDF4(19481, 27263, 30241) },
+            { AOM_CDF4(15458, 23631, 28137) },
+            { AOM_CDF4(12416, 20608, 25693) },
+            { AOM_CDF4(10261, 18011, 23261) },
+            { AOM_CDF4(8016, 14655, 19666) } },
+          { { AOM_CDF4(17616, 24586, 28112) },
+            { AOM_CDF4(15809, 23299, 27155) },
+            { AOM_CDF4(10767, 18890, 23793) },
+            { AOM_CDF4(7727, 14255, 18865) },
+            { AOM_CDF4(6129, 11926, 16882) },
+            { AOM_CDF4(4482, 9704, 14861) },
+            { AOM_CDF4(3277, 7452, 11522) },
+            { AOM_CDF4(22956, 28551, 30730) },
+            { AOM_CDF4(22724, 28937, 30961) },
+            { AOM_CDF4(18467, 26324, 29580) },
+            { AOM_CDF4(13234, 20713, 25649) },
+            { AOM_CDF4(11181, 17592, 22481) },
+            { AOM_CDF4(8291, 18358, 24576) },
+            { AOM_CDF4(7568, 11881, 14984) },
+            { AOM_CDF4(24948, 29001, 31147) },
+            { AOM_CDF4(25674, 30619, 32151) },
+            { AOM_CDF4(20841, 26793, 29603) },
+            { AOM_CDF4(14669, 24356, 28666) },
+            { AOM_CDF4(11334, 23593, 28219) },
+            { AOM_CDF4(8922, 14762, 22873) },
+            { AOM_CDF4(8301, 13544, 20535) } } },
+        { { { AOM_CDF4(17113, 23733, 27081) },
+            { AOM_CDF4(14139, 21406, 25452) },
+            { AOM_CDF4(8552, 15002, 19776) },
+            { AOM_CDF4(5871, 11120, 15378) },
+            { AOM_CDF4(4455, 8616, 12253) },
+            { AOM_CDF4(3469, 6910, 10386) },
+            { AOM_CDF4(2255, 4553, 6782) },
+            { AOM_CDF4(18224, 24376, 27053) },
+            { AOM_CDF4(19290, 26710, 29614) },
+            { AOM_CDF4(14936, 22991, 27184) },
+            { AOM_CDF4(11238, 18951, 23762) },
+            { AOM_CDF4(8786, 15617, 20588) },
+            { AOM_CDF4(7317, 13228, 18003) },
+            { AOM_CDF4(5101, 9512, 13493) },
+            { AOM_CDF4(22639, 28222, 30210) },
+            { AOM_CDF4(23216, 29331, 31307) },
+            { AOM_CDF4(19075, 26762, 29895) },
+            { AOM_CDF4(15014, 23113, 27457) },
+            { AOM_CDF4(11938, 19857, 24752) },
+            { AOM_CDF4(9942, 17280, 22282) },
+            { AOM_CDF4(7167, 13144, 17752) } },
+          { { AOM_CDF4(15820, 22738, 26488) },
+            { AOM_CDF4(13530, 20885, 25216) },
+            { AOM_CDF4(8395, 15530, 20452) },
+            { AOM_CDF4(6574, 12321, 16380) },
+            { AOM_CDF4(5353, 10419, 14568) },
+            { AOM_CDF4(4613, 8446, 12381) },
+            { AOM_CDF4(3440, 7158, 9903) },
+            { AOM_CDF4(24247, 29051, 31224) },
+            { AOM_CDF4(22118, 28058, 30369) },
+            { AOM_CDF4(16498, 24768, 28389) },
+            { AOM_CDF4(12920, 21175, 26137) },
+            { AOM_CDF4(10730, 18619, 25352) },
+            { AOM_CDF4(10187, 16279, 22791) },
+            { AOM_CDF4(9310, 14631, 22127) },
+            { AOM_CDF4(24970, 30558, 32057) },
+            { AOM_CDF4(24801, 29942, 31698) },
+            { AOM_CDF4(22432, 28453, 30855) },
+            { AOM_CDF4(19054, 25680, 29580) },
+            { AOM_CDF4(14392, 23036, 28109) },
+            { AOM_CDF4(12495, 20947, 26650) },
+            { AOM_CDF4(12442, 20326, 26214) } } },
+        { { { AOM_CDF4(12162, 18785, 22648) },
+            { AOM_CDF4(12749, 19697, 23806) },
+            { AOM_CDF4(8580, 15297, 20346) },
+            { AOM_CDF4(6169, 11749, 16543) },
+            { AOM_CDF4(4836, 9391, 13448) },
+            { AOM_CDF4(3821, 7711, 11613) },
+            { AOM_CDF4(2228, 4601, 7070) },
+            { AOM_CDF4(16319, 24725, 28280) },
+            { AOM_CDF4(15698, 23277, 27168) },
+            { AOM_CDF4(12726, 20368, 25047) },
+            { AOM_CDF4(9912, 17015, 21976) },
+            { AOM_CDF4(7888, 14220, 19179) },
+            { AOM_CDF4(6777, 12284, 17018) },
+            { AOM_CDF4(4492, 8590, 12252) },
+            { AOM_CDF4(23249, 28904, 30947) },
+            { AOM_CDF4(21050, 27908, 30512) },
+            { AOM_CDF4(17440, 25340, 28949) },
+            { AOM_CDF4(14059, 22018, 26541) },
+            { AOM_CDF4(11288, 18903, 23898) },
+            { AOM_CDF4(9411, 16342, 21428) },
+            { AOM_CDF4(6278, 11588, 15944) } },
+          { { AOM_CDF4(13981, 20067, 23226) },
+            { AOM_CDF4(16922, 23580, 26783) },
+            { AOM_CDF4(11005, 19039, 24487) },
+            { AOM_CDF4(7389, 14218, 19798) },
+            { AOM_CDF4(5598, 11505, 17206) },
+            { AOM_CDF4(6090, 11213, 15659) },
+            { AOM_CDF4(3820, 7371, 10119) },
+            { AOM_CDF4(21082, 26925, 29675) },
+            { AOM_CDF4(21262, 28627, 31128) },
+            { AOM_CDF4(18392, 26454, 30437) },
+            { AOM_CDF4(14870, 22910, 27096) },
+            { AOM_CDF4(12620, 19484, 24908) },
+            { AOM_CDF4(9290, 16553, 22802) },
+            { AOM_CDF4(6668, 14288, 20004) },
+            { AOM_CDF4(27704, 31055, 31949) },
+            { AOM_CDF4(24709, 29978, 31788) },
+            { AOM_CDF4(21668, 29264, 31657) },
+            { AOM_CDF4(18295, 26968, 30074) },
+            { AOM_CDF4(16399, 24422, 29313) },
+            { AOM_CDF4(14347, 23026, 28104) },
+            { AOM_CDF4(12370, 19806, 24477) } } },
+        { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) } },
+          { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+            { AOM_CDF4(8192, 16384, 24576) } } } }
+    };
 
-static const coeff_cdf_model
-av1_default_coef_head_cdfs_q3[TX_SIZES][PLANE_TYPES] = {
-    {  // TX 4X4
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(25117), AOM_ICDF(25655), AOM_ICDF(28371), AOM_ICDF(30246),
-    AOM_ICDF(30939), AOM_ICDF(32768), },
-    {AOM_ICDF(15083), AOM_ICDF(16850), AOM_ICDF(26029), AOM_ICDF(29031),
-    AOM_ICDF(30115), AOM_ICDF(32768), },
-    {AOM_ICDF(8774), AOM_ICDF(12118), AOM_ICDF(22041), AOM_ICDF(26730),
-    AOM_ICDF(28574), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(13690), AOM_ICDF(23135), AOM_ICDF(31469), AOM_ICDF(31868),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13306), AOM_ICDF(22730), AOM_ICDF(31466), AOM_ICDF(31860),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13503), AOM_ICDF(19892), AOM_ICDF(30528), AOM_ICDF(31005),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13150), AOM_ICDF(16108), AOM_ICDF(28345), AOM_ICDF(28869),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12014), AOM_ICDF(12842), AOM_ICDF(25693), AOM_ICDF(26145),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8937), AOM_ICDF(13405), AOM_ICDF(23831), AOM_ICDF(28300),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(18707), AOM_ICDF(26260), AOM_ICDF(31853), AOM_ICDF(32238),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15985), AOM_ICDF(24804), AOM_ICDF(31717), AOM_ICDF(32115),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14012), AOM_ICDF(18913), AOM_ICDF(30497), AOM_ICDF(31005),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12300), AOM_ICDF(14741), AOM_ICDF(28386), AOM_ICDF(28958),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12483), AOM_ICDF(15084), AOM_ICDF(24966), AOM_ICDF(26526),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(19934), AOM_ICDF(28117), AOM_ICDF(32022), AOM_ICDF(32378),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14925), AOM_ICDF(26201), AOM_ICDF(31828), AOM_ICDF(32262),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13132), AOM_ICDF(18927), AOM_ICDF(30269), AOM_ICDF(31173),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13926), AOM_ICDF(19251), AOM_ICDF(28262), AOM_ICDF(29901),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(16626), AOM_ICDF(28981), AOM_ICDF(32074), AOM_ICDF(32413),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12895), AOM_ICDF(27583), AOM_ICDF(31974), AOM_ICDF(32332),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14150), AOM_ICDF(22094), AOM_ICDF(31030), AOM_ICDF(31775),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(5279), AOM_ICDF(29309), AOM_ICDF(32149), AOM_ICDF(32477),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5880), AOM_ICDF(29657), AOM_ICDF(32086), AOM_ICDF(32385),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11469), AOM_ICDF(18022), AOM_ICDF(22938), AOM_ICDF(27853),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(8302), AOM_ICDF(14024), AOM_ICDF(16072), AOM_ICDF(27926),
-    AOM_ICDF(28871), AOM_ICDF(32768), },
-    {AOM_ICDF(9359), AOM_ICDF(15522), AOM_ICDF(20581), AOM_ICDF(28595),
-    AOM_ICDF(29250), AOM_ICDF(32768), },
-    {AOM_ICDF(5318), AOM_ICDF(12803), AOM_ICDF(19679), AOM_ICDF(27719),
-    AOM_ICDF(28609), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(22745), AOM_ICDF(25806), AOM_ICDF(31997), AOM_ICDF(32327),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18803), AOM_ICDF(25473), AOM_ICDF(31960), AOM_ICDF(32293),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15553), AOM_ICDF(19553), AOM_ICDF(31039), AOM_ICDF(31407),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13037), AOM_ICDF(15169), AOM_ICDF(28589), AOM_ICDF(29060),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10871), AOM_ICDF(11694), AOM_ICDF(24941), AOM_ICDF(25360),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6242), AOM_ICDF(10923), AOM_ICDF(18725), AOM_ICDF(23406),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(22276), AOM_ICDF(27316), AOM_ICDF(32078), AOM_ICDF(32402),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19227), AOM_ICDF(25420), AOM_ICDF(31954), AOM_ICDF(32293),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12383), AOM_ICDF(16969), AOM_ICDF(30280), AOM_ICDF(30766),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11282), AOM_ICDF(13725), AOM_ICDF(26516), AOM_ICDF(27379),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5120), AOM_ICDF(9216), AOM_ICDF(15360), AOM_ICDF(20480),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(22814), AOM_ICDF(28656), AOM_ICDF(32097), AOM_ICDF(32425),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19349), AOM_ICDF(26355), AOM_ICDF(32000), AOM_ICDF(32341),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13824), AOM_ICDF(17830), AOM_ICDF(30780), AOM_ICDF(31142),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6746), AOM_ICDF(13493), AOM_ICDF(25058), AOM_ICDF(27949),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(19746), AOM_ICDF(28536), AOM_ICDF(32088), AOM_ICDF(32411),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17457), AOM_ICDF(27155), AOM_ICDF(32024), AOM_ICDF(32376),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10949), AOM_ICDF(16662), AOM_ICDF(29118), AOM_ICDF(30229),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6096), AOM_ICDF(12955), AOM_ICDF(21337), AOM_ICDF(27434),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(10114), AOM_ICDF(29713), AOM_ICDF(32140), AOM_ICDF(32448),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11455), AOM_ICDF(29324), AOM_ICDF(32094), AOM_ICDF(32419),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6554), AOM_ICDF(14418), AOM_ICDF(23593), AOM_ICDF(27525),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(30309), AOM_ICDF(30623), AOM_ICDF(31738), AOM_ICDF(32084),
-    AOM_ICDF(32428), AOM_ICDF(32768), },
-    {AOM_ICDF(25732), AOM_ICDF(26211), AOM_ICDF(31079), AOM_ICDF(31737),
-    AOM_ICDF(32269), AOM_ICDF(32768), },
-    {AOM_ICDF(19676), AOM_ICDF(21061), AOM_ICDF(29564), AOM_ICDF(31011),
-    AOM_ICDF(31879), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(12328), AOM_ICDF(28270), AOM_ICDF(32125), AOM_ICDF(32447),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11177), AOM_ICDF(28585), AOM_ICDF(32076), AOM_ICDF(32401),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13232), AOM_ICDF(25364), AOM_ICDF(31558), AOM_ICDF(32072),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11997), AOM_ICDF(18443), AOM_ICDF(30261), AOM_ICDF(31873),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7399), AOM_ICDF(11627), AOM_ICDF(24312), AOM_ICDF(27483),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(16893), AOM_ICDF(29817), AOM_ICDF(32005), AOM_ICDF(32463),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14911), AOM_ICDF(27935), AOM_ICDF(32179), AOM_ICDF(32473),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9973), AOM_ICDF(19946), AOM_ICDF(24220), AOM_ICDF(28494),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(18859), AOM_ICDF(29232), AOM_ICDF(31354), AOM_ICDF(32061),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11281), AOM_ICDF(26322), AOM_ICDF(29545), AOM_ICDF(31156),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(8937), AOM_ICDF(19363), AOM_ICDF(23831), AOM_ICDF(28300),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(30586), AOM_ICDF(30911), AOM_ICDF(31771), AOM_ICDF(32121),
-    AOM_ICDF(32443), AOM_ICDF(32768), },
-    {AOM_ICDF(23875), AOM_ICDF(24492), AOM_ICDF(30970), AOM_ICDF(31684),
-    AOM_ICDF(32217), AOM_ICDF(32768), },
-    {AOM_ICDF(15874), AOM_ICDF(17477), AOM_ICDF(29172), AOM_ICDF(30703),
-    AOM_ICDF(32023), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(17059), AOM_ICDF(30027), AOM_ICDF(32152), AOM_ICDF(32450),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13931), AOM_ICDF(29387), AOM_ICDF(32103), AOM_ICDF(32414),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12903), AOM_ICDF(25742), AOM_ICDF(31906), AOM_ICDF(32289),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13493), AOM_ICDF(23130), AOM_ICDF(29614), AOM_ICDF(30840),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6554), AOM_ICDF(14746), AOM_ICDF(26214), AOM_ICDF(28672),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(18660), AOM_ICDF(30626), AOM_ICDF(32150), AOM_ICDF(32459),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17338), AOM_ICDF(29279), AOM_ICDF(32168), AOM_ICDF(32495),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11916), AOM_ICDF(17873), AOM_ICDF(26810), AOM_ICDF(29789),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7282), AOM_ICDF(14564), AOM_ICDF(21845), AOM_ICDF(27307),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(23269), AOM_ICDF(31374), AOM_ICDF(32245), AOM_ICDF(32507),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15741), AOM_ICDF(27628), AOM_ICDF(30840), AOM_ICDF(31804),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(15464), AOM_ICDF(29454), AOM_ICDF(30559), AOM_ICDF(31663),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6827), AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 8X8
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(18128), AOM_ICDF(19079), AOM_ICDF(27400), AOM_ICDF(29265),
-    AOM_ICDF(30385), AOM_ICDF(32768), },
-    {AOM_ICDF(10290), AOM_ICDF(12446), AOM_ICDF(23496), AOM_ICDF(26905),
-    AOM_ICDF(28729), AOM_ICDF(32768), },
-    {AOM_ICDF(5877), AOM_ICDF(9423), AOM_ICDF(18374), AOM_ICDF(23871),
-    AOM_ICDF(26028), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(16010), AOM_ICDF(22388), AOM_ICDF(30990), AOM_ICDF(31378),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14579), AOM_ICDF(21619), AOM_ICDF(30755), AOM_ICDF(31177),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13859), AOM_ICDF(18660), AOM_ICDF(29381), AOM_ICDF(29904),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12288), AOM_ICDF(14656), AOM_ICDF(27505), AOM_ICDF(28077),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10009), AOM_ICDF(10812), AOM_ICDF(23591), AOM_ICDF(24068),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8663), AOM_ICDF(9981), AOM_ICDF(19962), AOM_ICDF(20904),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(20773), AOM_ICDF(24941), AOM_ICDF(31701), AOM_ICDF(32046),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17537), AOM_ICDF(22279), AOM_ICDF(31257), AOM_ICDF(31629),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13337), AOM_ICDF(15972), AOM_ICDF(29181), AOM_ICDF(29575),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11120), AOM_ICDF(12128), AOM_ICDF(26440), AOM_ICDF(26874),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10061), AOM_ICDF(10800), AOM_ICDF(23999), AOM_ICDF(24276),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(24073), AOM_ICDF(27227), AOM_ICDF(31920), AOM_ICDF(32246),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18916), AOM_ICDF(22611), AOM_ICDF(31508), AOM_ICDF(31853),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13371), AOM_ICDF(14495), AOM_ICDF(28662), AOM_ICDF(29093),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9283), AOM_ICDF(9840), AOM_ICDF(24228), AOM_ICDF(24506),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4681), AOM_ICDF(9362), AOM_ICDF(20285), AOM_ICDF(24966),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(25180), AOM_ICDF(28079), AOM_ICDF(32048), AOM_ICDF(32365),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19790), AOM_ICDF(23090), AOM_ICDF(31675), AOM_ICDF(32001),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12634), AOM_ICDF(13382), AOM_ICDF(28384), AOM_ICDF(28718),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11264), AOM_ICDF(12083), AOM_ICDF(28672), AOM_ICDF(29286),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7710), AOM_ICDF(13493), AOM_ICDF(21203), AOM_ICDF(26985),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(26180), AOM_ICDF(29109), AOM_ICDF(32085), AOM_ICDF(32408),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19990), AOM_ICDF(23991), AOM_ICDF(31806), AOM_ICDF(32152),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13735), AOM_ICDF(14612), AOM_ICDF(29022), AOM_ICDF(29326),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(10240), AOM_ICDF(25259), AOM_ICDF(27307),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(5084), AOM_ICDF(13063), AOM_ICDF(15732), AOM_ICDF(27628),
-    AOM_ICDF(28823), AOM_ICDF(32768), },
-    {AOM_ICDF(3233), AOM_ICDF(11850), AOM_ICDF(16878), AOM_ICDF(26809),
-    AOM_ICDF(27973), AOM_ICDF(32768), },
-    {AOM_ICDF(1405), AOM_ICDF(10468), AOM_ICDF(15220), AOM_ICDF(25209),
-    AOM_ICDF(26482), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(23854), AOM_ICDF(26692), AOM_ICDF(31964), AOM_ICDF(32291),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20514), AOM_ICDF(25677), AOM_ICDF(31833), AOM_ICDF(32170),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16504), AOM_ICDF(20235), AOM_ICDF(30877), AOM_ICDF(31237),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13241), AOM_ICDF(15173), AOM_ICDF(28673), AOM_ICDF(29116),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9526), AOM_ICDF(10553), AOM_ICDF(23852), AOM_ICDF(24361),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(6428), AOM_ICDF(17806), AOM_ICDF(18148),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(24345), AOM_ICDF(27736), AOM_ICDF(32033), AOM_ICDF(32355),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20277), AOM_ICDF(23726), AOM_ICDF(31700), AOM_ICDF(32031),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13361), AOM_ICDF(15650), AOM_ICDF(29411), AOM_ICDF(29794),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9421), AOM_ICDF(10887), AOM_ICDF(25426), AOM_ICDF(26039),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6242), AOM_ICDF(7607), AOM_ICDF(17749), AOM_ICDF(18530),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26118), AOM_ICDF(28888), AOM_ICDF(32095), AOM_ICDF(32413),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21286), AOM_ICDF(24631), AOM_ICDF(31871), AOM_ICDF(32198),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13285), AOM_ICDF(15402), AOM_ICDF(29317), AOM_ICDF(29737),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9902), AOM_ICDF(10814), AOM_ICDF(24755), AOM_ICDF(25276),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11431), AOM_ICDF(13717), AOM_ICDF(20575), AOM_ICDF(23623),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(27178), AOM_ICDF(29612), AOM_ICDF(32119), AOM_ICDF(32433),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22095), AOM_ICDF(25550), AOM_ICDF(31976), AOM_ICDF(32298),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13847), AOM_ICDF(16273), AOM_ICDF(29602), AOM_ICDF(30024),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8771), AOM_ICDF(10923), AOM_ICDF(19694), AOM_ICDF(20521),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11398), AOM_ICDF(15672), AOM_ICDF(21370), AOM_ICDF(25645),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(28257), AOM_ICDF(30327), AOM_ICDF(32126), AOM_ICDF(32441),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22325), AOM_ICDF(26453), AOM_ICDF(32054), AOM_ICDF(32380),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14860), AOM_ICDF(17652), AOM_ICDF(30682), AOM_ICDF(31035),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5097), AOM_ICDF(10194), AOM_ICDF(18933), AOM_ICDF(21117),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(28902), AOM_ICDF(29234), AOM_ICDF(31608), AOM_ICDF(31973),
-    AOM_ICDF(32378), AOM_ICDF(32768), },
-    {AOM_ICDF(22721), AOM_ICDF(23397), AOM_ICDF(30476), AOM_ICDF(31293),
-    AOM_ICDF(32179), AOM_ICDF(32768), },
-    {AOM_ICDF(16404), AOM_ICDF(18013), AOM_ICDF(27505), AOM_ICDF(29454),
-    AOM_ICDF(31300), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(14290), AOM_ICDF(27662), AOM_ICDF(31923), AOM_ICDF(32327),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13282), AOM_ICDF(26727), AOM_ICDF(31749), AOM_ICDF(32113),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12514), AOM_ICDF(22487), AOM_ICDF(30689), AOM_ICDF(31459),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11657), AOM_ICDF(16967), AOM_ICDF(29660), AOM_ICDF(30437),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8937), AOM_ICDF(12660), AOM_ICDF(24576), AOM_ICDF(26810),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(20145), AOM_ICDF(28026), AOM_ICDF(31820), AOM_ICDF(32212),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16906), AOM_ICDF(25677), AOM_ICDF(31760), AOM_ICDF(32059),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12332), AOM_ICDF(18322), AOM_ICDF(29597), AOM_ICDF(31006),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(13107), AOM_ICDF(21299), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(23492), AOM_ICDF(29214), AOM_ICDF(32166), AOM_ICDF(32467),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18757), AOM_ICDF(25536), AOM_ICDF(31789), AOM_ICDF(32165),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12603), AOM_ICDF(16384), AOM_ICDF(25206), AOM_ICDF(28987),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(24518), AOM_ICDF(29453), AOM_ICDF(32074), AOM_ICDF(32382),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19369), AOM_ICDF(26533), AOM_ICDF(31972), AOM_ICDF(32370),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(24576), AOM_ICDF(28789), AOM_ICDF(31364), AOM_ICDF(32066),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20052), AOM_ICDF(24454), AOM_ICDF(29834), AOM_ICDF(31301),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(30358), AOM_ICDF(30700), AOM_ICDF(31747), AOM_ICDF(32103),
-    AOM_ICDF(32430), AOM_ICDF(32768), },
-    {AOM_ICDF(22346), AOM_ICDF(23277), AOM_ICDF(30508), AOM_ICDF(31386),
-    AOM_ICDF(32138), AOM_ICDF(32768), },
-    {AOM_ICDF(11974), AOM_ICDF(14562), AOM_ICDF(27349), AOM_ICDF(28970),
-    AOM_ICDF(31969), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(22910), AOM_ICDF(29539), AOM_ICDF(32102), AOM_ICDF(32412),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18429), AOM_ICDF(28710), AOM_ICDF(32106), AOM_ICDF(32432),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13601), AOM_ICDF(25238), AOM_ICDF(31845), AOM_ICDF(32262),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12472), AOM_ICDF(20976), AOM_ICDF(29026), AOM_ICDF(30500),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8738), AOM_ICDF(11469), AOM_ICDF(24030), AOM_ICDF(26761),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(23359), AOM_ICDF(30038), AOM_ICDF(32127), AOM_ICDF(32444),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19590), AOM_ICDF(28108), AOM_ICDF(32056), AOM_ICDF(32382),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15578), AOM_ICDF(22024), AOM_ICDF(29008), AOM_ICDF(30619),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26372), AOM_ICDF(31019), AOM_ICDF(32146), AOM_ICDF(32463),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22190), AOM_ICDF(28573), AOM_ICDF(32160), AOM_ICDF(32464),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(22938), AOM_ICDF(27853),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(26672), AOM_ICDF(31311), AOM_ICDF(32156), AOM_ICDF(32462),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20946), AOM_ICDF(27885), AOM_ICDF(31997), AOM_ICDF(32382),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(27342), AOM_ICDF(31385), AOM_ICDF(32130), AOM_ICDF(32449),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8674), AOM_ICDF(22167), AOM_ICDF(26985), AOM_ICDF(29877),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 16X16
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(2479), AOM_ICDF(4993), AOM_ICDF(17332), AOM_ICDF(21885),
-    AOM_ICDF(25826), AOM_ICDF(32768), },
-    {AOM_ICDF(2848), AOM_ICDF(5996), AOM_ICDF(15242), AOM_ICDF(20755),
-    AOM_ICDF(23763), AOM_ICDF(32768), },
-    {AOM_ICDF(2125), AOM_ICDF(6226), AOM_ICDF(11733), AOM_ICDF(18389),
-    AOM_ICDF(20442), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(14539), AOM_ICDF(19828), AOM_ICDF(29467), AOM_ICDF(29934),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12513), AOM_ICDF(19139), AOM_ICDF(29177), AOM_ICDF(29702),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11826), AOM_ICDF(16348), AOM_ICDF(27245), AOM_ICDF(27977),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10123), AOM_ICDF(12262), AOM_ICDF(24690), AOM_ICDF(25359),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7979), AOM_ICDF(8826), AOM_ICDF(20804), AOM_ICDF(21295),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5262), AOM_ICDF(5604), AOM_ICDF(14716), AOM_ICDF(15015),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(20625), AOM_ICDF(24118), AOM_ICDF(31086), AOM_ICDF(31446),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16710), AOM_ICDF(20899), AOM_ICDF(30505), AOM_ICDF(30864),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13161), AOM_ICDF(15579), AOM_ICDF(27988), AOM_ICDF(28449),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10596), AOM_ICDF(11651), AOM_ICDF(24124), AOM_ICDF(24589),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7724), AOM_ICDF(8452), AOM_ICDF(21060), AOM_ICDF(21476),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7282), AOM_ICDF(9466), AOM_ICDF(18933), AOM_ICDF(21117),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(24265), AOM_ICDF(26472), AOM_ICDF(31667), AOM_ICDF(31998),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18213), AOM_ICDF(21117), AOM_ICDF(30932), AOM_ICDF(31280),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12944), AOM_ICDF(14000), AOM_ICDF(27696), AOM_ICDF(28050),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9709), AOM_ICDF(10056), AOM_ICDF(23282), AOM_ICDF(23579),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8590), AOM_ICDF(9862), AOM_ICDF(18770), AOM_ICDF(19724),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(26658), AOM_ICDF(28275), AOM_ICDF(31975), AOM_ICDF(32294),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20049), AOM_ICDF(22203), AOM_ICDF(31374), AOM_ICDF(31708),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12795), AOM_ICDF(13387), AOM_ICDF(28328), AOM_ICDF(28653),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8607), AOM_ICDF(9073), AOM_ICDF(23383), AOM_ICDF(23695),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(9947), AOM_ICDF(18725), AOM_ICDF(20480),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(28651), AOM_ICDF(29902), AOM_ICDF(32085), AOM_ICDF(32402),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21133), AOM_ICDF(23229), AOM_ICDF(31684), AOM_ICDF(32013),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(13231), AOM_ICDF(14045), AOM_ICDF(28203), AOM_ICDF(28576),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7903), AOM_ICDF(8481), AOM_ICDF(21781), AOM_ICDF(22359),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(824), AOM_ICDF(8672), AOM_ICDF(16514), AOM_ICDF(27587),
-    AOM_ICDF(29231), AOM_ICDF(32768), },
-    {AOM_ICDF(1118), AOM_ICDF(9561), AOM_ICDF(17021), AOM_ICDF(25911),
-    AOM_ICDF(27753), AOM_ICDF(32768), },
-    {AOM_ICDF(806), AOM_ICDF(9313), AOM_ICDF(13998), AOM_ICDF(22910),
-    AOM_ICDF(25224), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(23650), AOM_ICDF(26487), AOM_ICDF(31840), AOM_ICDF(32166),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19593), AOM_ICDF(25206), AOM_ICDF(31604), AOM_ICDF(31944),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15813), AOM_ICDF(19643), AOM_ICDF(30328), AOM_ICDF(30726),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12978), AOM_ICDF(15108), AOM_ICDF(27886), AOM_ICDF(28310),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9793), AOM_ICDF(11020), AOM_ICDF(23305), AOM_ICDF(23818),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4855), AOM_ICDF(5565), AOM_ICDF(14268), AOM_ICDF(14741),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(24547), AOM_ICDF(27751), AOM_ICDF(31964), AOM_ICDF(32285),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19674), AOM_ICDF(23377), AOM_ICDF(31426), AOM_ICDF(31759),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12643), AOM_ICDF(14489), AOM_ICDF(28159), AOM_ICDF(28541),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9110), AOM_ICDF(10279), AOM_ICDF(23565), AOM_ICDF(23992),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5082), AOM_ICDF(5617), AOM_ICDF(16317), AOM_ICDF(16651),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(18971), AOM_ICDF(24145),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26773), AOM_ICDF(29038), AOM_ICDF(32050), AOM_ICDF(32367),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20956), AOM_ICDF(23898), AOM_ICDF(31563), AOM_ICDF(31888),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12527), AOM_ICDF(13472), AOM_ICDF(27840), AOM_ICDF(28211),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8773), AOM_ICDF(9353), AOM_ICDF(22555), AOM_ICDF(22856),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4291), AOM_ICDF(4876), AOM_ICDF(16969), AOM_ICDF(17554),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(17348), AOM_ICDF(23130),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28065), AOM_ICDF(29768), AOM_ICDF(32086), AOM_ICDF(32400),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21847), AOM_ICDF(24001), AOM_ICDF(31608), AOM_ICDF(31929),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12482), AOM_ICDF(13091), AOM_ICDF(27413), AOM_ICDF(27739),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7582), AOM_ICDF(8002), AOM_ICDF(22090), AOM_ICDF(22405),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6324), AOM_ICDF(7186), AOM_ICDF(15809), AOM_ICDF(16671),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29731), AOM_ICDF(30798), AOM_ICDF(32113), AOM_ICDF(32431),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22224), AOM_ICDF(24448), AOM_ICDF(31791), AOM_ICDF(32118),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12622), AOM_ICDF(13513), AOM_ICDF(28103), AOM_ICDF(28530),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8886), AOM_ICDF(9600), AOM_ICDF(22890), AOM_ICDF(23604),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8058), AOM_ICDF(9669), AOM_ICDF(18264), AOM_ICDF(19876),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(27375), AOM_ICDF(27731), AOM_ICDF(31591), AOM_ICDF(31993),
-    AOM_ICDF(32404), AOM_ICDF(32768), },
-    {AOM_ICDF(20943), AOM_ICDF(21758), AOM_ICDF(30037), AOM_ICDF(31074),
-    AOM_ICDF(32003), AOM_ICDF(32768), },
-    {AOM_ICDF(16218), AOM_ICDF(17771), AOM_ICDF(26832), AOM_ICDF(29181),
-    AOM_ICDF(30586), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(17239), AOM_ICDF(27853), AOM_ICDF(31557), AOM_ICDF(32198),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14494), AOM_ICDF(25906), AOM_ICDF(31543), AOM_ICDF(32033),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12980), AOM_ICDF(19788), AOM_ICDF(29137), AOM_ICDF(29410),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11796), AOM_ICDF(14680), AOM_ICDF(26477), AOM_ICDF(27787),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12603), AOM_ICDF(15124), AOM_ICDF(21005), AOM_ICDF(23526),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(22821), AOM_ICDF(27655), AOM_ICDF(32024), AOM_ICDF(32303),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16534), AOM_ICDF(23629), AOM_ICDF(31145), AOM_ICDF(31686),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12407), AOM_ICDF(14952), AOM_ICDF(28950), AOM_ICDF(30859),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6554), AOM_ICDF(10486), AOM_ICDF(19661), AOM_ICDF(23593),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(26369), AOM_ICDF(29624), AOM_ICDF(31996), AOM_ICDF(32272),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19346), AOM_ICDF(24807), AOM_ICDF(31750), AOM_ICDF(32027),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15056), AOM_ICDF(19484), AOM_ICDF(27454), AOM_ICDF(30111),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5783), AOM_ICDF(11565), AOM_ICDF(21203), AOM_ICDF(26985),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28213), AOM_ICDF(30301), AOM_ICDF(32199), AOM_ICDF(32483),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22988), AOM_ICDF(27307), AOM_ICDF(31879), AOM_ICDF(32260),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11796), AOM_ICDF(15729), AOM_ICDF(24904), AOM_ICDF(28836),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29813), AOM_ICDF(31323), AOM_ICDF(32142), AOM_ICDF(32444),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21497), AOM_ICDF(25254), AOM_ICDF(31307), AOM_ICDF(32142),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(30560), AOM_ICDF(30889), AOM_ICDF(31795), AOM_ICDF(32128),
-    AOM_ICDF(32455), AOM_ICDF(32768), },
-    {AOM_ICDF(20347), AOM_ICDF(20993), AOM_ICDF(30496), AOM_ICDF(31112),
-    AOM_ICDF(32263), AOM_ICDF(32768), },
-    {AOM_ICDF(9723), AOM_ICDF(10992), AOM_ICDF(27830), AOM_ICDF(28681),
-    AOM_ICDF(32168), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(25900), AOM_ICDF(30610), AOM_ICDF(32179), AOM_ICDF(32474),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18535), AOM_ICDF(29316), AOM_ICDF(32153), AOM_ICDF(32437),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15230), AOM_ICDF(25845), AOM_ICDF(30922), AOM_ICDF(31845),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(16384), AOM_ICDF(27097), AOM_ICDF(28987),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8548), AOM_ICDF(12822), AOM_ICDF(21370), AOM_ICDF(25645),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(26104), AOM_ICDF(30659), AOM_ICDF(32157), AOM_ICDF(32462),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20457), AOM_ICDF(28242), AOM_ICDF(31682), AOM_ICDF(32225),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10923), AOM_ICDF(16384), AOM_ICDF(24576), AOM_ICDF(28672),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(28740), AOM_ICDF(30618), AOM_ICDF(32154), AOM_ICDF(32461),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19333), AOM_ICDF(26214), AOM_ICDF(30802), AOM_ICDF(31785),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28161), AOM_ICDF(30834), AOM_ICDF(32160), AOM_ICDF(32464),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(26536), AOM_ICDF(29149), AOM_ICDF(31562), AOM_ICDF(32165),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(29913), AOM_ICDF(31560), AOM_ICDF(32172), AOM_ICDF(32470),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22209), AOM_ICDF(28035), AOM_ICDF(30583), AOM_ICDF(31676),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-    {  // TX 32X32
-    {  // Y plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(3982), AOM_ICDF(6433), AOM_ICDF(20418), AOM_ICDF(25151),
-    AOM_ICDF(27471), AOM_ICDF(32768), },
-    {AOM_ICDF(3342), AOM_ICDF(6943), AOM_ICDF(15018), AOM_ICDF(20274),
-    AOM_ICDF(22412), AOM_ICDF(32768), },
-    {AOM_ICDF(1805), AOM_ICDF(5863), AOM_ICDF(9932), AOM_ICDF(16426),
-    AOM_ICDF(17655), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(11799), AOM_ICDF(19138), AOM_ICDF(28295), AOM_ICDF(28881),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11008), AOM_ICDF(18597), AOM_ICDF(28369), AOM_ICDF(29021),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10104), AOM_ICDF(15628), AOM_ICDF(26339), AOM_ICDF(27195),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8537), AOM_ICDF(11246), AOM_ICDF(22663), AOM_ICDF(23623),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5895), AOM_ICDF(6476), AOM_ICDF(16647), AOM_ICDF(17329),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(4046), AOM_ICDF(4357), AOM_ICDF(10849), AOM_ICDF(11160),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(18503), AOM_ICDF(22222), AOM_ICDF(30403), AOM_ICDF(30814),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15264), AOM_ICDF(19282), AOM_ICDF(29949), AOM_ICDF(30339),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12101), AOM_ICDF(14721), AOM_ICDF(27350), AOM_ICDF(27783),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9243), AOM_ICDF(10177), AOM_ICDF(22679), AOM_ICDF(23097),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5571), AOM_ICDF(5967), AOM_ICDF(16714), AOM_ICDF(17043),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(2731), AOM_ICDF(3755), AOM_ICDF(14677), AOM_ICDF(15701),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(23077), AOM_ICDF(25272), AOM_ICDF(31444), AOM_ICDF(31771),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16598), AOM_ICDF(19790), AOM_ICDF(30479), AOM_ICDF(30822),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11961), AOM_ICDF(12871), AOM_ICDF(27162), AOM_ICDF(27529),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8156), AOM_ICDF(8563), AOM_ICDF(22220), AOM_ICDF(22579),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5851), AOM_ICDF(6242), AOM_ICDF(15994), AOM_ICDF(16384),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(26084), AOM_ICDF(27933), AOM_ICDF(31906), AOM_ICDF(32223),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19335), AOM_ICDF(21760), AOM_ICDF(31149), AOM_ICDF(31477),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12724), AOM_ICDF(13278), AOM_ICDF(27015), AOM_ICDF(27365),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8687), AOM_ICDF(9010), AOM_ICDF(21051), AOM_ICDF(21334),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5814), AOM_ICDF(6606), AOM_ICDF(14534), AOM_ICDF(15327),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(18432), AOM_ICDF(24576),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(30147), AOM_ICDF(30787), AOM_ICDF(32081), AOM_ICDF(32395),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(20402), AOM_ICDF(21697), AOM_ICDF(30943), AOM_ICDF(31266),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11661), AOM_ICDF(12125), AOM_ICDF(25710), AOM_ICDF(26034),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7224), AOM_ICDF(7504), AOM_ICDF(19876), AOM_ICDF(20156),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6183), AOM_ICDF(7110), AOM_ICDF(17002), AOM_ICDF(17930),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5174), AOM_ICDF(10348), AOM_ICDF(17246), AOM_ICDF(22420),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(4079), AOM_ICDF(8378), AOM_ICDF(25109), AOM_ICDF(29897),
-    AOM_ICDF(30898), AOM_ICDF(32768), },
-    {AOM_ICDF(3870), AOM_ICDF(8207), AOM_ICDF(22495), AOM_ICDF(27162),
-    AOM_ICDF(29559), AOM_ICDF(32768), },
-    {AOM_ICDF(2127), AOM_ICDF(6197), AOM_ICDF(15932), AOM_ICDF(20604),
-    AOM_ICDF(27312), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(21253), AOM_ICDF(26168), AOM_ICDF(31780), AOM_ICDF(32120),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16610), AOM_ICDF(23985), AOM_ICDF(31495), AOM_ICDF(31866),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14861), AOM_ICDF(21030), AOM_ICDF(30219), AOM_ICDF(30784),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14573), AOM_ICDF(18162), AOM_ICDF(28524), AOM_ICDF(29116),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14036), AOM_ICDF(15983), AOM_ICDF(26283), AOM_ICDF(27085),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9119), AOM_ICDF(10742), AOM_ICDF(19630), AOM_ICDF(20016),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(23192), AOM_ICDF(27248), AOM_ICDF(31887), AOM_ICDF(32215),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18219), AOM_ICDF(23213), AOM_ICDF(31417), AOM_ICDF(31769),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12657), AOM_ICDF(14754), AOM_ICDF(27845), AOM_ICDF(28233),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8127), AOM_ICDF(8829), AOM_ICDF(20909), AOM_ICDF(21279),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7547), AOM_ICDF(8142), AOM_ICDF(17476), AOM_ICDF(18072),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5461), AOM_ICDF(10923), AOM_ICDF(16384), AOM_ICDF(21845),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(25516), AOM_ICDF(28301), AOM_ICDF(31970), AOM_ICDF(32289),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19094), AOM_ICDF(23041), AOM_ICDF(31404), AOM_ICDF(31732),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12328), AOM_ICDF(13099), AOM_ICDF(27275), AOM_ICDF(27613),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8134), AOM_ICDF(8458), AOM_ICDF(21075), AOM_ICDF(21352),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5041), AOM_ICDF(5881), AOM_ICDF(17644), AOM_ICDF(18485),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(7282), AOM_ICDF(12743), AOM_ICDF(18204), AOM_ICDF(23666),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28082), AOM_ICDF(29782), AOM_ICDF(32087), AOM_ICDF(32400),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(21281), AOM_ICDF(24161), AOM_ICDF(31679), AOM_ICDF(31997),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12144), AOM_ICDF(12913), AOM_ICDF(27139), AOM_ICDF(27460),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8232), AOM_ICDF(8472), AOM_ICDF(21659), AOM_ICDF(21979),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(3034), AOM_ICDF(4855), AOM_ICDF(17598), AOM_ICDF(19418),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(30193), AOM_ICDF(31021), AOM_ICDF(32122), AOM_ICDF(32435),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(22124), AOM_ICDF(23763), AOM_ICDF(31498), AOM_ICDF(31816),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12066), AOM_ICDF(12418), AOM_ICDF(26849), AOM_ICDF(27157),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8701), AOM_ICDF(8979), AOM_ICDF(20920), AOM_ICDF(21197),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5266), AOM_ICDF(7022), AOM_ICDF(15799), AOM_ICDF(17554),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    {  // UV plane
-    {  // Intra
-    {  // Band 0
-    {AOM_ICDF(23468), AOM_ICDF(24062), AOM_ICDF(30645), AOM_ICDF(31200),
-    AOM_ICDF(32193), AOM_ICDF(32768), },
-    {AOM_ICDF(12642), AOM_ICDF(14371), AOM_ICDF(26924), AOM_ICDF(28832),
-    AOM_ICDF(31098), AOM_ICDF(32768), },
-    {AOM_ICDF(7785), AOM_ICDF(8831), AOM_ICDF(23705), AOM_ICDF(26028),
-    AOM_ICDF(29979), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(13575), AOM_ICDF(28087), AOM_ICDF(31130), AOM_ICDF(31832),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11108), AOM_ICDF(27955), AOM_ICDF(31657), AOM_ICDF(32213),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(9797), AOM_ICDF(23985), AOM_ICDF(28039), AOM_ICDF(30741),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5578), AOM_ICDF(18824), AOM_ICDF(26493), AOM_ICDF(28585),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(5041), AOM_ICDF(12603), AOM_ICDF(18905), AOM_ICDF(22686),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(17613), AOM_ICDF(26624), AOM_ICDF(30310), AOM_ICDF(31539),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(11398), AOM_ICDF(22795), AOM_ICDF(29444), AOM_ICDF(30868),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8548), AOM_ICDF(15672), AOM_ICDF(22795), AOM_ICDF(28494),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6144), AOM_ICDF(12288), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(24145), AOM_ICDF(26301), AOM_ICDF(30181), AOM_ICDF(31475),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15565), AOM_ICDF(20480), AOM_ICDF(27853), AOM_ICDF(30310),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(8192), AOM_ICDF(14336), AOM_ICDF(20480), AOM_ICDF(26624),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(27434), AOM_ICDF(28450), AOM_ICDF(30990), AOM_ICDF(31752),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14947), AOM_ICDF(21845), AOM_ICDF(29319), AOM_ICDF(31043),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(31130), AOM_ICDF(31676), AOM_ICDF(32180), AOM_ICDF(32474),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(18289), AOM_ICDF(22099), AOM_ICDF(28196), AOM_ICDF(30482),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    {  // Inter
-    {  // Band 0
-    {AOM_ICDF(29436), AOM_ICDF(29775), AOM_ICDF(31685), AOM_ICDF(32029),
-    AOM_ICDF(32425), AOM_ICDF(32768), },
-    {AOM_ICDF(10536), AOM_ICDF(11074), AOM_ICDF(27753), AOM_ICDF(28385),
-    AOM_ICDF(31293), AOM_ICDF(32768), },
-    {AOM_ICDF(3010), AOM_ICDF(3521), AOM_ICDF(22603), AOM_ICDF(23227),
-    AOM_ICDF(30440), AOM_ICDF(32768), },
-    },
-    {  // Band 1
-    {AOM_ICDF(17576), AOM_ICDF(29491), AOM_ICDF(30981), AOM_ICDF(31874),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(10426), AOM_ICDF(29044), AOM_ICDF(31725), AOM_ICDF(32321),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15766), AOM_ICDF(28286), AOM_ICDF(31377), AOM_ICDF(32304),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(19661), AOM_ICDF(26985), AOM_ICDF(30069), AOM_ICDF(31611),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(16035), AOM_ICDF(23007), AOM_ICDF(28585), AOM_ICDF(30676),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 2
-    {AOM_ICDF(23073), AOM_ICDF(30053), AOM_ICDF(31605), AOM_ICDF(32186),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(12858), AOM_ICDF(24887), AOM_ICDF(30279), AOM_ICDF(31524),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 3
-    {AOM_ICDF(24030), AOM_ICDF(26839), AOM_ICDF(30896), AOM_ICDF(31832),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(17644), AOM_ICDF(23526), AOM_ICDF(27727), AOM_ICDF(30247),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 4
-    {AOM_ICDF(28019), AOM_ICDF(30156), AOM_ICDF(31343), AOM_ICDF(32056),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(14980), AOM_ICDF(22469), AOM_ICDF(27151), AOM_ICDF(29959),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    {  // Band 5
-    {AOM_ICDF(30549), AOM_ICDF(31511), AOM_ICDF(32176), AOM_ICDF(32472),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(15019), AOM_ICDF(20480), AOM_ICDF(24576), AOM_ICDF(28672),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    {AOM_ICDF(6553), AOM_ICDF(13107), AOM_ICDF(19660), AOM_ICDF(26214),
-    AOM_ICDF(32768), },
-    },
-    },
-    },
-    },
-};
-/* clang-format on */
+static const aom_cdf_prob av1_default_coeff_base_multi_cdfs
+    [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]
+    [CDF_SIZE(NUM_BASE_LEVELS + 2)] =
+        { { { { { AOM_CDF4(4034, 8930, 12727) },
+                { AOM_CDF4(18082, 29741, 31877) },
+                { AOM_CDF4(12596, 26124, 30493) },
+                { AOM_CDF4(9446, 21118, 27005) },
+                { AOM_CDF4(6308, 15141, 21279) },
+                { AOM_CDF4(2463, 6357, 9783) },
+                { AOM_CDF4(20667, 30546, 31929) },
+                { AOM_CDF4(13043, 26123, 30134) },
+                { AOM_CDF4(8151, 18757, 24778) },
+                { AOM_CDF4(5255, 12839, 18632) },
+                { AOM_CDF4(2820, 7206, 11161) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(15736, 27553, 30604) },
+                { AOM_CDF4(11210, 23794, 28787) },
+                { AOM_CDF4(5947, 13874, 19701) },
+                { AOM_CDF4(4215, 9323, 13891) },
+                { AOM_CDF4(2833, 6462, 10059) },
+                { AOM_CDF4(19605, 30393, 31582) },
+                { AOM_CDF4(13523, 26252, 30248) },
+                { AOM_CDF4(8446, 18622, 24512) },
+                { AOM_CDF4(3818, 10343, 15974) },
+                { AOM_CDF4(1481, 4117, 6796) },
+                { AOM_CDF4(22649, 31302, 32190) },
+                { AOM_CDF4(14829, 27127, 30449) },
+                { AOM_CDF4(8313, 17702, 23304) },
+                { AOM_CDF4(3022, 8301, 12786) },
+                { AOM_CDF4(1536, 4412, 7184) },
+                { AOM_CDF4(22354, 29774, 31372) },
+                { AOM_CDF4(14723, 25472, 29214) },
+                { AOM_CDF4(6673, 13745, 18662) },
+                { AOM_CDF4(2068, 5766, 9322) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(6302, 16444, 21761) },
+                { AOM_CDF4(23040, 31538, 32475) },
+                { AOM_CDF4(15196, 28452, 31496) },
+                { AOM_CDF4(10020, 22946, 28514) },
+                { AOM_CDF4(6533, 16862, 23501) },
+                { AOM_CDF4(3538, 9816, 15076) },
+                { AOM_CDF4(24444, 31875, 32525) },
+                { AOM_CDF4(15881, 28924, 31635) },
+                { AOM_CDF4(9922, 22873, 28466) },
+                { AOM_CDF4(6527, 16966, 23691) },
+                { AOM_CDF4(4114, 11303, 17220) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(20201, 30770, 32209) },
+                { AOM_CDF4(14754, 28071, 31258) },
+                { AOM_CDF4(8378, 20186, 26517) },
+                { AOM_CDF4(5916, 15299, 21978) },
+                { AOM_CDF4(4268, 11583, 17901) },
+                { AOM_CDF4(24361, 32025, 32581) },
+                { AOM_CDF4(18673, 30105, 31943) },
+                { AOM_CDF4(10196, 22244, 27576) },
+                { AOM_CDF4(5495, 14349, 20417) },
+                { AOM_CDF4(2676, 7415, 11498) },
+                { AOM_CDF4(24678, 31958, 32585) },
+                { AOM_CDF4(18629, 29906, 31831) },
+                { AOM_CDF4(9364, 20724, 26315) },
+                { AOM_CDF4(4641, 12318, 18094) },
+                { AOM_CDF4(2758, 7387, 11579) },
+                { AOM_CDF4(25433, 31842, 32469) },
+                { AOM_CDF4(18795, 29289, 31411) },
+                { AOM_CDF4(7644, 17584, 23592) },
+                { AOM_CDF4(3408, 9014, 15047) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(4536, 10072, 14001) },
+                { AOM_CDF4(25459, 31416, 32206) },
+                { AOM_CDF4(16605, 28048, 30818) },
+                { AOM_CDF4(11008, 22857, 27719) },
+                { AOM_CDF4(6915, 16268, 22315) },
+                { AOM_CDF4(2625, 6812, 10537) },
+                { AOM_CDF4(24257, 31788, 32499) },
+                { AOM_CDF4(16880, 29454, 31879) },
+                { AOM_CDF4(11958, 25054, 29778) },
+                { AOM_CDF4(7916, 18718, 25084) },
+                { AOM_CDF4(3383, 8777, 13446) },
+                { AOM_CDF4(22720, 31603, 32393) },
+                { AOM_CDF4(14960, 28125, 31335) },
+                { AOM_CDF4(9731, 22210, 27928) },
+                { AOM_CDF4(6304, 15832, 22277) },
+                { AOM_CDF4(2910, 7818, 12166) },
+                { AOM_CDF4(20375, 30627, 32131) },
+                { AOM_CDF4(13904, 27284, 30887) },
+                { AOM_CDF4(9368, 21558, 27144) },
+                { AOM_CDF4(5937, 14966, 21119) },
+                { AOM_CDF4(2667, 7225, 11319) },
+                { AOM_CDF4(23970, 31470, 32378) },
+                { AOM_CDF4(17173, 29734, 32018) },
+                { AOM_CDF4(12795, 25441, 29965) },
+                { AOM_CDF4(8981, 19680, 25893) },
+                { AOM_CDF4(4728, 11372, 16902) },
+                { AOM_CDF4(24287, 31797, 32439) },
+                { AOM_CDF4(16703, 29145, 31696) },
+                { AOM_CDF4(10833, 23554, 28725) },
+                { AOM_CDF4(6468, 16566, 23057) },
+                { AOM_CDF4(2415, 6562, 10278) },
+                { AOM_CDF4(26610, 32395, 32659) },
+                { AOM_CDF4(18590, 30498, 32117) },
+                { AOM_CDF4(12420, 25756, 29950) },
+                { AOM_CDF4(7639, 18746, 24710) },
+                { AOM_CDF4(3001, 8086, 12347) },
+                { AOM_CDF4(25076, 32064, 32580) },
+                { AOM_CDF4(17946, 30128, 32028) },
+                { AOM_CDF4(12024, 24985, 29378) },
+                { AOM_CDF4(7517, 18390, 24304) },
+                { AOM_CDF4(3243, 8781, 13331) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(6037, 16771, 21957) },
+                { AOM_CDF4(24774, 31704, 32426) },
+                { AOM_CDF4(16830, 28589, 31056) },
+                { AOM_CDF4(10602, 22828, 27760) },
+                { AOM_CDF4(6733, 16829, 23071) },
+                { AOM_CDF4(3250, 8914, 13556) },
+                { AOM_CDF4(25582, 32220, 32668) },
+                { AOM_CDF4(18659, 30342, 32223) },
+                { AOM_CDF4(12546, 26149, 30515) },
+                { AOM_CDF4(8420, 20451, 26801) },
+                { AOM_CDF4(4636, 12420, 18344) },
+                { AOM_CDF4(27581, 32362, 32639) },
+                { AOM_CDF4(18987, 30083, 31978) },
+                { AOM_CDF4(11327, 24248, 29084) },
+                { AOM_CDF4(7264, 17719, 24120) },
+                { AOM_CDF4(3995, 10768, 16169) },
+                { AOM_CDF4(25893, 31831, 32487) },
+                { AOM_CDF4(16577, 28587, 31379) },
+                { AOM_CDF4(10189, 22748, 28182) },
+                { AOM_CDF4(6832, 17094, 23556) },
+                { AOM_CDF4(3708, 10110, 15334) },
+                { AOM_CDF4(25904, 32282, 32656) },
+                { AOM_CDF4(19721, 30792, 32276) },
+                { AOM_CDF4(12819, 26243, 30411) },
+                { AOM_CDF4(8572, 20614, 26891) },
+                { AOM_CDF4(5364, 14059, 20467) },
+                { AOM_CDF4(26580, 32438, 32677) },
+                { AOM_CDF4(20852, 31225, 32340) },
+                { AOM_CDF4(12435, 25700, 29967) },
+                { AOM_CDF4(8691, 20825, 26976) },
+                { AOM_CDF4(4446, 12209, 17269) },
+                { AOM_CDF4(27350, 32429, 32696) },
+                { AOM_CDF4(21372, 30977, 32272) },
+                { AOM_CDF4(12673, 25270, 29853) },
+                { AOM_CDF4(9208, 20925, 26640) },
+                { AOM_CDF4(5018, 13351, 18732) },
+                { AOM_CDF4(27351, 32479, 32713) },
+                { AOM_CDF4(21398, 31209, 32387) },
+                { AOM_CDF4(12162, 25047, 29842) },
+                { AOM_CDF4(7896, 18691, 25319) },
+                { AOM_CDF4(4670, 12882, 18881) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(5487, 10460, 13708) },
+                { AOM_CDF4(21597, 28303, 30674) },
+                { AOM_CDF4(11037, 21953, 26476) },
+                { AOM_CDF4(8147, 17962, 22952) },
+                { AOM_CDF4(5242, 13061, 18532) },
+                { AOM_CDF4(1889, 5208, 8182) },
+                { AOM_CDF4(26774, 32133, 32590) },
+                { AOM_CDF4(17844, 29564, 31767) },
+                { AOM_CDF4(11690, 24438, 29171) },
+                { AOM_CDF4(7542, 18215, 24459) },
+                { AOM_CDF4(2993, 8050, 12319) },
+                { AOM_CDF4(28023, 32328, 32591) },
+                { AOM_CDF4(18651, 30126, 31954) },
+                { AOM_CDF4(12164, 25146, 29589) },
+                { AOM_CDF4(7762, 18530, 24771) },
+                { AOM_CDF4(3492, 9183, 13920) },
+                { AOM_CDF4(27591, 32008, 32491) },
+                { AOM_CDF4(17149, 28853, 31510) },
+                { AOM_CDF4(11485, 24003, 28860) },
+                { AOM_CDF4(7697, 18086, 24210) },
+                { AOM_CDF4(3075, 7999, 12218) },
+                { AOM_CDF4(28268, 32482, 32654) },
+                { AOM_CDF4(19631, 31051, 32404) },
+                { AOM_CDF4(13860, 27260, 31020) },
+                { AOM_CDF4(9605, 21613, 27594) },
+                { AOM_CDF4(4876, 12162, 17908) },
+                { AOM_CDF4(27248, 32316, 32576) },
+                { AOM_CDF4(18955, 30457, 32075) },
+                { AOM_CDF4(11824, 23997, 28795) },
+                { AOM_CDF4(7346, 18196, 24647) },
+                { AOM_CDF4(3403, 9247, 14111) },
+                { AOM_CDF4(29711, 32655, 32735) },
+                { AOM_CDF4(21169, 31394, 32417) },
+                { AOM_CDF4(13487, 27198, 30957) },
+                { AOM_CDF4(8828, 21683, 27614) },
+                { AOM_CDF4(4270, 11451, 17038) },
+                { AOM_CDF4(28708, 32578, 32731) },
+                { AOM_CDF4(20120, 31241, 32482) },
+                { AOM_CDF4(13692, 27550, 31321) },
+                { AOM_CDF4(9418, 22514, 28439) },
+                { AOM_CDF4(4999, 13283, 19462) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(5673, 14302, 19711) },
+                { AOM_CDF4(26251, 30701, 31834) },
+                { AOM_CDF4(12782, 23783, 27803) },
+                { AOM_CDF4(9127, 20657, 25808) },
+                { AOM_CDF4(6368, 16208, 21462) },
+                { AOM_CDF4(2465, 7177, 10822) },
+                { AOM_CDF4(29961, 32563, 32719) },
+                { AOM_CDF4(18318, 29891, 31949) },
+                { AOM_CDF4(11361, 24514, 29357) },
+                { AOM_CDF4(7900, 19603, 25607) },
+                { AOM_CDF4(4002, 10590, 15546) },
+                { AOM_CDF4(29637, 32310, 32595) },
+                { AOM_CDF4(18296, 29913, 31809) },
+                { AOM_CDF4(10144, 21515, 26871) },
+                { AOM_CDF4(5358, 14322, 20394) },
+                { AOM_CDF4(3067, 8362, 13346) },
+                { AOM_CDF4(28652, 32470, 32676) },
+                { AOM_CDF4(17538, 30771, 32209) },
+                { AOM_CDF4(13924, 26882, 30494) },
+                { AOM_CDF4(10496, 22837, 27869) },
+                { AOM_CDF4(7236, 16396, 21621) },
+                { AOM_CDF4(30743, 32687, 32746) },
+                { AOM_CDF4(23006, 31676, 32489) },
+                { AOM_CDF4(14494, 27828, 31120) },
+                { AOM_CDF4(10174, 22801, 28352) },
+                { AOM_CDF4(6242, 15281, 21043) },
+                { AOM_CDF4(25817, 32243, 32720) },
+                { AOM_CDF4(18618, 31367, 32325) },
+                { AOM_CDF4(13997, 28318, 31878) },
+                { AOM_CDF4(12255, 26534, 31383) },
+                { AOM_CDF4(9561, 21588, 28450) },
+                { AOM_CDF4(28188, 32635, 32724) },
+                { AOM_CDF4(22060, 32365, 32728) },
+                { AOM_CDF4(18102, 30690, 32528) },
+                { AOM_CDF4(14196, 28864, 31999) },
+                { AOM_CDF4(12262, 25792, 30865) },
+                { AOM_CDF4(24176, 32109, 32628) },
+                { AOM_CDF4(18280, 29681, 31963) },
+                { AOM_CDF4(10205, 23703, 29664) },
+                { AOM_CDF4(7889, 20025, 27676) },
+                { AOM_CDF4(6060, 16743, 23970) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(5141, 7096, 8260) },
+                { AOM_CDF4(27186, 29022, 29789) },
+                { AOM_CDF4(6668, 12568, 15682) },
+                { AOM_CDF4(2172, 6181, 8638) },
+                { AOM_CDF4(1126, 3379, 4531) },
+                { AOM_CDF4(443, 1361, 2254) },
+                { AOM_CDF4(26083, 31153, 32436) },
+                { AOM_CDF4(13486, 24603, 28483) },
+                { AOM_CDF4(6508, 14840, 19910) },
+                { AOM_CDF4(3386, 8800, 13286) },
+                { AOM_CDF4(1530, 4322, 7054) },
+                { AOM_CDF4(29639, 32080, 32548) },
+                { AOM_CDF4(15897, 27552, 30290) },
+                { AOM_CDF4(8588, 20047, 25383) },
+                { AOM_CDF4(4889, 13339, 19269) },
+                { AOM_CDF4(2240, 6871, 10498) },
+                { AOM_CDF4(28165, 32197, 32517) },
+                { AOM_CDF4(20735, 30427, 31568) },
+                { AOM_CDF4(14325, 24671, 27692) },
+                { AOM_CDF4(5119, 12554, 17805) },
+                { AOM_CDF4(1810, 5441, 8261) },
+                { AOM_CDF4(31212, 32724, 32748) },
+                { AOM_CDF4(23352, 31766, 32545) },
+                { AOM_CDF4(14669, 27570, 31059) },
+                { AOM_CDF4(8492, 20894, 27272) },
+                { AOM_CDF4(3644, 10194, 15204) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(2461, 7013, 9371) },
+                { AOM_CDF4(24749, 29600, 30986) },
+                { AOM_CDF4(9466, 19037, 22417) },
+                { AOM_CDF4(3584, 9280, 14400) },
+                { AOM_CDF4(1505, 3929, 5433) },
+                { AOM_CDF4(677, 1500, 2736) },
+                { AOM_CDF4(23987, 30702, 32117) },
+                { AOM_CDF4(13554, 24571, 29263) },
+                { AOM_CDF4(6211, 14556, 21155) },
+                { AOM_CDF4(3135, 10972, 15625) },
+                { AOM_CDF4(2435, 7127, 11427) },
+                { AOM_CDF4(31300, 32532, 32550) },
+                { AOM_CDF4(14757, 30365, 31954) },
+                { AOM_CDF4(4405, 11612, 18553) },
+                { AOM_CDF4(580, 4132, 7322) },
+                { AOM_CDF4(1695, 10169, 14124) },
+                { AOM_CDF4(30008, 32282, 32591) },
+                { AOM_CDF4(19244, 30108, 31748) },
+                { AOM_CDF4(11180, 24158, 29555) },
+                { AOM_CDF4(5650, 14972, 19209) },
+                { AOM_CDF4(2114, 5109, 8456) },
+                { AOM_CDF4(31856, 32716, 32748) },
+                { AOM_CDF4(23012, 31664, 32572) },
+                { AOM_CDF4(13694, 26656, 30636) },
+                { AOM_CDF4(8142, 19508, 26093) },
+                { AOM_CDF4(4253, 10955, 16724) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(601, 983, 1311) },
+                { AOM_CDF4(18725, 23406, 28087) },
+                { AOM_CDF4(5461, 8192, 10923) },
+                { AOM_CDF4(3781, 15124, 21425) },
+                { AOM_CDF4(2587, 7761, 12072) },
+                { AOM_CDF4(106, 458, 810) },
+                { AOM_CDF4(22282, 29710, 31894) },
+                { AOM_CDF4(8508, 20926, 25984) },
+                { AOM_CDF4(3726, 12713, 18083) },
+                { AOM_CDF4(1620, 7112, 10893) },
+                { AOM_CDF4(729, 2236, 3495) },
+                { AOM_CDF4(30163, 32474, 32684) },
+                { AOM_CDF4(18304, 30464, 32000) },
+                { AOM_CDF4(11443, 26526, 29647) },
+                { AOM_CDF4(6007, 15292, 21299) },
+                { AOM_CDF4(2234, 6703, 8937) },
+                { AOM_CDF4(30954, 32177, 32571) },
+                { AOM_CDF4(17363, 29562, 31076) },
+                { AOM_CDF4(9686, 22464, 27410) },
+                { AOM_CDF4(8192, 16384, 21390) },
+                { AOM_CDF4(1755, 8046, 11264) },
+                { AOM_CDF4(31168, 32734, 32748) },
+                { AOM_CDF4(22486, 31441, 32471) },
+                { AOM_CDF4(12833, 25627, 29738) },
+                { AOM_CDF4(6980, 17379, 23122) },
+                { AOM_CDF4(3111, 8887, 13479) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } } },
+          { { { { AOM_CDF4(6041, 11854, 15927) },
+                { AOM_CDF4(20326, 30905, 32251) },
+                { AOM_CDF4(14164, 26831, 30725) },
+                { AOM_CDF4(9760, 20647, 26585) },
+                { AOM_CDF4(6416, 14953, 21219) },
+                { AOM_CDF4(2966, 7151, 10891) },
+                { AOM_CDF4(23567, 31374, 32254) },
+                { AOM_CDF4(14978, 27416, 30946) },
+                { AOM_CDF4(9434, 20225, 26254) },
+                { AOM_CDF4(6658, 14558, 20535) },
+                { AOM_CDF4(3916, 8677, 12989) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(18088, 29545, 31587) },
+                { AOM_CDF4(13062, 25843, 30073) },
+                { AOM_CDF4(8940, 16827, 22251) },
+                { AOM_CDF4(7654, 13220, 17973) },
+                { AOM_CDF4(5733, 10316, 14456) },
+                { AOM_CDF4(22879, 31388, 32114) },
+                { AOM_CDF4(15215, 27993, 30955) },
+                { AOM_CDF4(9397, 19445, 24978) },
+                { AOM_CDF4(3442, 9813, 15344) },
+                { AOM_CDF4(1368, 3936, 6532) },
+                { AOM_CDF4(25494, 32033, 32406) },
+                { AOM_CDF4(16772, 27963, 30718) },
+                { AOM_CDF4(9419, 18165, 23260) },
+                { AOM_CDF4(2677, 7501, 11797) },
+                { AOM_CDF4(1516, 4344, 7170) },
+                { AOM_CDF4(26556, 31454, 32101) },
+                { AOM_CDF4(17128, 27035, 30108) },
+                { AOM_CDF4(8324, 15344, 20249) },
+                { AOM_CDF4(1903, 5696, 9469) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(8455, 19003, 24368) },
+                { AOM_CDF4(23563, 32021, 32604) },
+                { AOM_CDF4(16237, 29446, 31935) },
+                { AOM_CDF4(10724, 23999, 29358) },
+                { AOM_CDF4(6725, 17528, 24416) },
+                { AOM_CDF4(3927, 10927, 16825) },
+                { AOM_CDF4(26313, 32288, 32634) },
+                { AOM_CDF4(17430, 30095, 32095) },
+                { AOM_CDF4(11116, 24606, 29679) },
+                { AOM_CDF4(7195, 18384, 25269) },
+                { AOM_CDF4(4726, 12852, 19315) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(22822, 31648, 32483) },
+                { AOM_CDF4(16724, 29633, 31929) },
+                { AOM_CDF4(10261, 23033, 28725) },
+                { AOM_CDF4(7029, 17840, 24528) },
+                { AOM_CDF4(4867, 13886, 21502) },
+                { AOM_CDF4(25298, 31892, 32491) },
+                { AOM_CDF4(17809, 29330, 31512) },
+                { AOM_CDF4(9668, 21329, 26579) },
+                { AOM_CDF4(4774, 12956, 18976) },
+                { AOM_CDF4(2322, 7030, 11540) },
+                { AOM_CDF4(25472, 31920, 32543) },
+                { AOM_CDF4(17957, 29387, 31632) },
+                { AOM_CDF4(9196, 20593, 26400) },
+                { AOM_CDF4(4680, 12705, 19202) },
+                { AOM_CDF4(2917, 8456, 13436) },
+                { AOM_CDF4(26471, 32059, 32574) },
+                { AOM_CDF4(18458, 29783, 31909) },
+                { AOM_CDF4(8400, 19464, 25956) },
+                { AOM_CDF4(3812, 10973, 17206) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(6779, 13743, 17678) },
+                { AOM_CDF4(24806, 31797, 32457) },
+                { AOM_CDF4(17616, 29047, 31372) },
+                { AOM_CDF4(11063, 23175, 28003) },
+                { AOM_CDF4(6521, 16110, 22324) },
+                { AOM_CDF4(2764, 7504, 11654) },
+                { AOM_CDF4(25266, 32367, 32637) },
+                { AOM_CDF4(19054, 30553, 32175) },
+                { AOM_CDF4(12139, 25212, 29807) },
+                { AOM_CDF4(7311, 18162, 24704) },
+                { AOM_CDF4(3397, 9164, 14074) },
+                { AOM_CDF4(25988, 32208, 32522) },
+                { AOM_CDF4(16253, 28912, 31526) },
+                { AOM_CDF4(9151, 21387, 27372) },
+                { AOM_CDF4(5688, 14915, 21496) },
+                { AOM_CDF4(2717, 7627, 12004) },
+                { AOM_CDF4(23144, 31855, 32443) },
+                { AOM_CDF4(16070, 28491, 31325) },
+                { AOM_CDF4(8702, 20467, 26517) },
+                { AOM_CDF4(5243, 13956, 20367) },
+                { AOM_CDF4(2621, 7335, 11567) },
+                { AOM_CDF4(26636, 32340, 32630) },
+                { AOM_CDF4(19990, 31050, 32341) },
+                { AOM_CDF4(13243, 26105, 30315) },
+                { AOM_CDF4(8588, 19521, 25918) },
+                { AOM_CDF4(4717, 11585, 17304) },
+                { AOM_CDF4(25844, 32292, 32582) },
+                { AOM_CDF4(19090, 30635, 32097) },
+                { AOM_CDF4(11963, 24546, 28939) },
+                { AOM_CDF4(6218, 16087, 22354) },
+                { AOM_CDF4(2340, 6608, 10426) },
+                { AOM_CDF4(28046, 32576, 32694) },
+                { AOM_CDF4(21178, 31313, 32296) },
+                { AOM_CDF4(13486, 26184, 29870) },
+                { AOM_CDF4(7149, 17871, 23723) },
+                { AOM_CDF4(2833, 7958, 12259) },
+                { AOM_CDF4(27710, 32528, 32686) },
+                { AOM_CDF4(20674, 31076, 32268) },
+                { AOM_CDF4(12413, 24955, 29243) },
+                { AOM_CDF4(6676, 16927, 23097) },
+                { AOM_CDF4(2966, 8333, 12919) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(8639, 19339, 24429) },
+                { AOM_CDF4(24404, 31837, 32525) },
+                { AOM_CDF4(16997, 29425, 31784) },
+                { AOM_CDF4(11253, 24234, 29149) },
+                { AOM_CDF4(6751, 17394, 24028) },
+                { AOM_CDF4(3490, 9830, 15191) },
+                { AOM_CDF4(26283, 32471, 32714) },
+                { AOM_CDF4(19599, 31168, 32442) },
+                { AOM_CDF4(13146, 26954, 30893) },
+                { AOM_CDF4(8214, 20588, 26890) },
+                { AOM_CDF4(4699, 13081, 19300) },
+                { AOM_CDF4(28212, 32458, 32669) },
+                { AOM_CDF4(18594, 30316, 32100) },
+                { AOM_CDF4(11219, 24408, 29234) },
+                { AOM_CDF4(6865, 17656, 24149) },
+                { AOM_CDF4(3678, 10362, 16006) },
+                { AOM_CDF4(25825, 32136, 32616) },
+                { AOM_CDF4(17313, 29853, 32021) },
+                { AOM_CDF4(11197, 24471, 29472) },
+                { AOM_CDF4(6947, 17781, 24405) },
+                { AOM_CDF4(3768, 10660, 16261) },
+                { AOM_CDF4(27352, 32500, 32706) },
+                { AOM_CDF4(20850, 31468, 32469) },
+                { AOM_CDF4(14021, 27707, 31133) },
+                { AOM_CDF4(8964, 21748, 27838) },
+                { AOM_CDF4(5437, 14665, 21187) },
+                { AOM_CDF4(26304, 32492, 32698) },
+                { AOM_CDF4(20409, 31380, 32385) },
+                { AOM_CDF4(13682, 27222, 30632) },
+                { AOM_CDF4(8974, 21236, 26685) },
+                { AOM_CDF4(4234, 11665, 16934) },
+                { AOM_CDF4(26273, 32357, 32711) },
+                { AOM_CDF4(20672, 31242, 32441) },
+                { AOM_CDF4(14172, 27254, 30902) },
+                { AOM_CDF4(9870, 21898, 27275) },
+                { AOM_CDF4(5164, 13506, 19270) },
+                { AOM_CDF4(26725, 32459, 32728) },
+                { AOM_CDF4(20991, 31442, 32527) },
+                { AOM_CDF4(13071, 26434, 30811) },
+                { AOM_CDF4(8184, 20090, 26742) },
+                { AOM_CDF4(4803, 13255, 19895) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(7555, 14942, 18501) },
+                { AOM_CDF4(24410, 31178, 32287) },
+                { AOM_CDF4(14394, 26738, 30253) },
+                { AOM_CDF4(8413, 19554, 25195) },
+                { AOM_CDF4(4766, 12924, 18785) },
+                { AOM_CDF4(2029, 5806, 9207) },
+                { AOM_CDF4(26776, 32364, 32663) },
+                { AOM_CDF4(18732, 29967, 31931) },
+                { AOM_CDF4(11005, 23786, 28852) },
+                { AOM_CDF4(6466, 16909, 23510) },
+                { AOM_CDF4(3044, 8638, 13419) },
+                { AOM_CDF4(29208, 32582, 32704) },
+                { AOM_CDF4(20068, 30857, 32208) },
+                { AOM_CDF4(12003, 25085, 29595) },
+                { AOM_CDF4(6947, 17750, 24189) },
+                { AOM_CDF4(3245, 9103, 14007) },
+                { AOM_CDF4(27359, 32465, 32669) },
+                { AOM_CDF4(19421, 30614, 32174) },
+                { AOM_CDF4(11915, 25010, 29579) },
+                { AOM_CDF4(6950, 17676, 24074) },
+                { AOM_CDF4(3007, 8473, 13096) },
+                { AOM_CDF4(29002, 32676, 32735) },
+                { AOM_CDF4(22102, 31849, 32576) },
+                { AOM_CDF4(14408, 28009, 31405) },
+                { AOM_CDF4(9027, 21679, 27931) },
+                { AOM_CDF4(4694, 12678, 18748) },
+                { AOM_CDF4(28216, 32528, 32682) },
+                { AOM_CDF4(20849, 31264, 32318) },
+                { AOM_CDF4(12756, 25815, 29751) },
+                { AOM_CDF4(7565, 18801, 24923) },
+                { AOM_CDF4(3509, 9533, 14477) },
+                { AOM_CDF4(30133, 32687, 32739) },
+                { AOM_CDF4(23063, 31910, 32515) },
+                { AOM_CDF4(14588, 28051, 31132) },
+                { AOM_CDF4(9085, 21649, 27457) },
+                { AOM_CDF4(4261, 11654, 17264) },
+                { AOM_CDF4(29518, 32691, 32748) },
+                { AOM_CDF4(22451, 31959, 32613) },
+                { AOM_CDF4(14864, 28722, 31700) },
+                { AOM_CDF4(9695, 22964, 28716) },
+                { AOM_CDF4(4932, 13358, 19502) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(6465, 16958, 21688) },
+                { AOM_CDF4(25199, 31514, 32360) },
+                { AOM_CDF4(14774, 27149, 30607) },
+                { AOM_CDF4(9257, 21438, 26972) },
+                { AOM_CDF4(5723, 15183, 21882) },
+                { AOM_CDF4(3150, 8879, 13731) },
+                { AOM_CDF4(26989, 32262, 32682) },
+                { AOM_CDF4(17396, 29937, 32085) },
+                { AOM_CDF4(11387, 24901, 29784) },
+                { AOM_CDF4(7289, 18821, 25548) },
+                { AOM_CDF4(3734, 10577, 16086) },
+                { AOM_CDF4(29728, 32501, 32695) },
+                { AOM_CDF4(17431, 29701, 31903) },
+                { AOM_CDF4(9921, 22826, 28300) },
+                { AOM_CDF4(5896, 15434, 22068) },
+                { AOM_CDF4(3430, 9646, 14757) },
+                { AOM_CDF4(28614, 32511, 32705) },
+                { AOM_CDF4(19364, 30638, 32263) },
+                { AOM_CDF4(13129, 26254, 30402) },
+                { AOM_CDF4(8754, 20484, 26440) },
+                { AOM_CDF4(4378, 11607, 17110) },
+                { AOM_CDF4(30292, 32671, 32744) },
+                { AOM_CDF4(21780, 31603, 32501) },
+                { AOM_CDF4(14314, 27829, 31291) },
+                { AOM_CDF4(9611, 22327, 28263) },
+                { AOM_CDF4(4890, 13087, 19065) },
+                { AOM_CDF4(25862, 32567, 32733) },
+                { AOM_CDF4(20794, 32050, 32567) },
+                { AOM_CDF4(17243, 30625, 32254) },
+                { AOM_CDF4(13283, 27628, 31474) },
+                { AOM_CDF4(9669, 22532, 28918) },
+                { AOM_CDF4(27435, 32697, 32748) },
+                { AOM_CDF4(24922, 32390, 32714) },
+                { AOM_CDF4(21449, 31504, 32536) },
+                { AOM_CDF4(16392, 29729, 31832) },
+                { AOM_CDF4(11692, 24884, 29076) },
+                { AOM_CDF4(24193, 32290, 32735) },
+                { AOM_CDF4(18909, 31104, 32563) },
+                { AOM_CDF4(12236, 26841, 31403) },
+                { AOM_CDF4(8171, 21840, 29082) },
+                { AOM_CDF4(7224, 17280, 25275) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(3078, 6839, 9890) },
+                { AOM_CDF4(13837, 20450, 24479) },
+                { AOM_CDF4(5914, 14222, 19328) },
+                { AOM_CDF4(3866, 10267, 14762) },
+                { AOM_CDF4(2612, 7208, 11042) },
+                { AOM_CDF4(1067, 2991, 4776) },
+                { AOM_CDF4(25817, 31646, 32529) },
+                { AOM_CDF4(13708, 26338, 30385) },
+                { AOM_CDF4(7328, 18585, 24870) },
+                { AOM_CDF4(4691, 13080, 19276) },
+                { AOM_CDF4(1825, 5253, 8352) },
+                { AOM_CDF4(29386, 32315, 32624) },
+                { AOM_CDF4(17160, 29001, 31360) },
+                { AOM_CDF4(9602, 21862, 27396) },
+                { AOM_CDF4(5915, 15772, 22148) },
+                { AOM_CDF4(2786, 7779, 12047) },
+                { AOM_CDF4(29246, 32450, 32663) },
+                { AOM_CDF4(18696, 29929, 31818) },
+                { AOM_CDF4(10510, 23369, 28560) },
+                { AOM_CDF4(6229, 16499, 23125) },
+                { AOM_CDF4(2608, 7448, 11705) },
+                { AOM_CDF4(30753, 32710, 32748) },
+                { AOM_CDF4(21638, 31487, 32503) },
+                { AOM_CDF4(12937, 26854, 30870) },
+                { AOM_CDF4(8182, 20596, 26970) },
+                { AOM_CDF4(3637, 10269, 15497) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(5244, 12150, 16906) },
+                { AOM_CDF4(20486, 26858, 29701) },
+                { AOM_CDF4(7756, 18317, 23735) },
+                { AOM_CDF4(3452, 9256, 13146) },
+                { AOM_CDF4(2020, 5206, 8229) },
+                { AOM_CDF4(1801, 4993, 7903) },
+                { AOM_CDF4(27051, 31858, 32531) },
+                { AOM_CDF4(15988, 27531, 30619) },
+                { AOM_CDF4(9188, 21484, 26719) },
+                { AOM_CDF4(6273, 17186, 23800) },
+                { AOM_CDF4(3108, 9355, 14764) },
+                { AOM_CDF4(31076, 32520, 32680) },
+                { AOM_CDF4(18119, 30037, 31850) },
+                { AOM_CDF4(10244, 22969, 27472) },
+                { AOM_CDF4(4692, 14077, 19273) },
+                { AOM_CDF4(3694, 11677, 17556) },
+                { AOM_CDF4(30060, 32581, 32720) },
+                { AOM_CDF4(21011, 30775, 32120) },
+                { AOM_CDF4(11931, 24820, 29289) },
+                { AOM_CDF4(7119, 17662, 24356) },
+                { AOM_CDF4(3833, 10706, 16304) },
+                { AOM_CDF4(31954, 32731, 32748) },
+                { AOM_CDF4(23913, 31724, 32489) },
+                { AOM_CDF4(15520, 28060, 31286) },
+                { AOM_CDF4(11517, 23008, 28571) },
+                { AOM_CDF4(6193, 14508, 20629) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(1035, 2807, 4156) },
+                { AOM_CDF4(13162, 18138, 20939) },
+                { AOM_CDF4(2696, 6633, 8755) },
+                { AOM_CDF4(1373, 4161, 6853) },
+                { AOM_CDF4(1099, 2746, 4716) },
+                { AOM_CDF4(340, 1021, 1599) },
+                { AOM_CDF4(22826, 30419, 32135) },
+                { AOM_CDF4(10395, 21762, 26942) },
+                { AOM_CDF4(4726, 12407, 17361) },
+                { AOM_CDF4(2447, 7080, 10593) },
+                { AOM_CDF4(1227, 3717, 6011) },
+                { AOM_CDF4(28156, 31424, 31934) },
+                { AOM_CDF4(16915, 27754, 30373) },
+                { AOM_CDF4(9148, 20990, 26431) },
+                { AOM_CDF4(5950, 15515, 21148) },
+                { AOM_CDF4(2492, 7327, 11526) },
+                { AOM_CDF4(30602, 32477, 32670) },
+                { AOM_CDF4(20026, 29955, 31568) },
+                { AOM_CDF4(11220, 23628, 28105) },
+                { AOM_CDF4(6652, 17019, 22973) },
+                { AOM_CDF4(3064, 8536, 13043) },
+                { AOM_CDF4(31769, 32724, 32748) },
+                { AOM_CDF4(22230, 30887, 32373) },
+                { AOM_CDF4(12234, 25079, 29731) },
+                { AOM_CDF4(7326, 18816, 25353) },
+                { AOM_CDF4(3933, 10907, 16616) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } } },
+          { { { { AOM_CDF4(8896, 16227, 20630) },
+                { AOM_CDF4(23629, 31782, 32527) },
+                { AOM_CDF4(15173, 27755, 31321) },
+                { AOM_CDF4(10158, 21233, 27382) },
+                { AOM_CDF4(6420, 14857, 21558) },
+                { AOM_CDF4(3269, 8155, 12646) },
+                { AOM_CDF4(24835, 32009, 32496) },
+                { AOM_CDF4(16509, 28421, 31579) },
+                { AOM_CDF4(10957, 21514, 27418) },
+                { AOM_CDF4(7881, 15930, 22096) },
+                { AOM_CDF4(5388, 10960, 15918) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(20745, 30773, 32093) },
+                { AOM_CDF4(15200, 27221, 30861) },
+                { AOM_CDF4(13032, 20873, 25667) },
+                { AOM_CDF4(12285, 18663, 23494) },
+                { AOM_CDF4(11563, 17481, 21489) },
+                { AOM_CDF4(26260, 31982, 32320) },
+                { AOM_CDF4(15397, 28083, 31100) },
+                { AOM_CDF4(9742, 19217, 24824) },
+                { AOM_CDF4(3261, 9629, 15362) },
+                { AOM_CDF4(1480, 4322, 7499) },
+                { AOM_CDF4(27599, 32256, 32460) },
+                { AOM_CDF4(16857, 27659, 30774) },
+                { AOM_CDF4(9551, 18290, 23748) },
+                { AOM_CDF4(3052, 8933, 14103) },
+                { AOM_CDF4(2021, 5910, 9787) },
+                { AOM_CDF4(29005, 32015, 32392) },
+                { AOM_CDF4(17677, 27694, 30863) },
+                { AOM_CDF4(9204, 17356, 23219) },
+                { AOM_CDF4(2403, 7516, 12814) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(10808, 22056, 26896) },
+                { AOM_CDF4(25739, 32313, 32676) },
+                { AOM_CDF4(17288, 30203, 32221) },
+                { AOM_CDF4(11359, 24878, 29896) },
+                { AOM_CDF4(6949, 17767, 24893) },
+                { AOM_CDF4(4287, 11796, 18071) },
+                { AOM_CDF4(27880, 32521, 32705) },
+                { AOM_CDF4(19038, 31004, 32414) },
+                { AOM_CDF4(12564, 26345, 30768) },
+                { AOM_CDF4(8269, 19947, 26779) },
+                { AOM_CDF4(5674, 14657, 21674) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(25742, 32319, 32671) },
+                { AOM_CDF4(19557, 31164, 32454) },
+                { AOM_CDF4(13381, 26381, 30755) },
+                { AOM_CDF4(10101, 21466, 26722) },
+                { AOM_CDF4(9209, 19650, 26825) },
+                { AOM_CDF4(27107, 31917, 32432) },
+                { AOM_CDF4(18056, 28893, 31203) },
+                { AOM_CDF4(10200, 21434, 26764) },
+                { AOM_CDF4(4660, 12913, 19502) },
+                { AOM_CDF4(2368, 6930, 12504) },
+                { AOM_CDF4(26960, 32158, 32613) },
+                { AOM_CDF4(18628, 30005, 32031) },
+                { AOM_CDF4(10233, 22442, 28232) },
+                { AOM_CDF4(5471, 14630, 21516) },
+                { AOM_CDF4(3235, 10767, 17109) },
+                { AOM_CDF4(27696, 32440, 32692) },
+                { AOM_CDF4(20032, 31167, 32438) },
+                { AOM_CDF4(8700, 21341, 28442) },
+                { AOM_CDF4(5662, 14831, 21795) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(9704, 17294, 21132) },
+                { AOM_CDF4(26762, 32278, 32633) },
+                { AOM_CDF4(18382, 29620, 31819) },
+                { AOM_CDF4(10891, 23475, 28723) },
+                { AOM_CDF4(6358, 16583, 23309) },
+                { AOM_CDF4(3248, 9118, 14141) },
+                { AOM_CDF4(27204, 32573, 32699) },
+                { AOM_CDF4(19818, 30824, 32329) },
+                { AOM_CDF4(11772, 25120, 30041) },
+                { AOM_CDF4(6995, 18033, 25039) },
+                { AOM_CDF4(3752, 10442, 16098) },
+                { AOM_CDF4(27222, 32256, 32559) },
+                { AOM_CDF4(15356, 28399, 31475) },
+                { AOM_CDF4(8821, 20635, 27057) },
+                { AOM_CDF4(5511, 14404, 21239) },
+                { AOM_CDF4(2935, 8222, 13051) },
+                { AOM_CDF4(24875, 32120, 32529) },
+                { AOM_CDF4(15233, 28265, 31445) },
+                { AOM_CDF4(8605, 20570, 26932) },
+                { AOM_CDF4(5431, 14413, 21196) },
+                { AOM_CDF4(2994, 8341, 13223) },
+                { AOM_CDF4(28201, 32604, 32700) },
+                { AOM_CDF4(21041, 31446, 32456) },
+                { AOM_CDF4(13221, 26213, 30475) },
+                { AOM_CDF4(8255, 19385, 26037) },
+                { AOM_CDF4(4930, 12585, 18830) },
+                { AOM_CDF4(28768, 32448, 32627) },
+                { AOM_CDF4(19705, 30561, 32021) },
+                { AOM_CDF4(11572, 23589, 28220) },
+                { AOM_CDF4(5532, 15034, 21446) },
+                { AOM_CDF4(2460, 7150, 11456) },
+                { AOM_CDF4(29874, 32619, 32699) },
+                { AOM_CDF4(21621, 31071, 32201) },
+                { AOM_CDF4(12511, 24747, 28992) },
+                { AOM_CDF4(6281, 16395, 22748) },
+                { AOM_CDF4(3246, 9278, 14497) },
+                { AOM_CDF4(29715, 32625, 32712) },
+                { AOM_CDF4(20958, 31011, 32283) },
+                { AOM_CDF4(11233, 23671, 28806) },
+                { AOM_CDF4(6012, 16128, 22868) },
+                { AOM_CDF4(3427, 9851, 15414) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(11016, 22111, 26794) },
+                { AOM_CDF4(25946, 32357, 32677) },
+                { AOM_CDF4(17890, 30452, 32252) },
+                { AOM_CDF4(11678, 25142, 29816) },
+                { AOM_CDF4(6720, 17534, 24584) },
+                { AOM_CDF4(4230, 11665, 17820) },
+                { AOM_CDF4(28400, 32623, 32747) },
+                { AOM_CDF4(21164, 31668, 32575) },
+                { AOM_CDF4(13572, 27388, 31182) },
+                { AOM_CDF4(8234, 20750, 27358) },
+                { AOM_CDF4(5065, 14055, 20897) },
+                { AOM_CDF4(28981, 32547, 32705) },
+                { AOM_CDF4(18681, 30543, 32239) },
+                { AOM_CDF4(10919, 24075, 29286) },
+                { AOM_CDF4(6431, 17199, 24077) },
+                { AOM_CDF4(3819, 10464, 16618) },
+                { AOM_CDF4(26870, 32467, 32693) },
+                { AOM_CDF4(19041, 30831, 32347) },
+                { AOM_CDF4(11794, 25211, 30016) },
+                { AOM_CDF4(6888, 18019, 24970) },
+                { AOM_CDF4(4370, 12363, 18992) },
+                { AOM_CDF4(29578, 32670, 32744) },
+                { AOM_CDF4(23159, 32007, 32613) },
+                { AOM_CDF4(15315, 28669, 31676) },
+                { AOM_CDF4(9298, 22607, 28782) },
+                { AOM_CDF4(6144, 15913, 22968) },
+                { AOM_CDF4(28110, 32499, 32669) },
+                { AOM_CDF4(21574, 30937, 32015) },
+                { AOM_CDF4(12759, 24818, 28727) },
+                { AOM_CDF4(6545, 16761, 23042) },
+                { AOM_CDF4(3649, 10597, 16833) },
+                { AOM_CDF4(28163, 32552, 32728) },
+                { AOM_CDF4(22101, 31469, 32464) },
+                { AOM_CDF4(13160, 25472, 30143) },
+                { AOM_CDF4(7303, 18684, 25468) },
+                { AOM_CDF4(5241, 13975, 20955) },
+                { AOM_CDF4(28400, 32631, 32744) },
+                { AOM_CDF4(22104, 31793, 32603) },
+                { AOM_CDF4(13557, 26571, 30846) },
+                { AOM_CDF4(7749, 19861, 26675) },
+                { AOM_CDF4(4873, 14030, 21234) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(9800, 17635, 21073) },
+                { AOM_CDF4(26153, 31885, 32527) },
+                { AOM_CDF4(15038, 27852, 31006) },
+                { AOM_CDF4(8718, 20564, 26486) },
+                { AOM_CDF4(5128, 14076, 20514) },
+                { AOM_CDF4(2636, 7566, 11925) },
+                { AOM_CDF4(27551, 32504, 32701) },
+                { AOM_CDF4(18310, 30054, 32100) },
+                { AOM_CDF4(10211, 23420, 29082) },
+                { AOM_CDF4(6222, 16876, 23916) },
+                { AOM_CDF4(3462, 9954, 15498) },
+                { AOM_CDF4(29991, 32633, 32721) },
+                { AOM_CDF4(19883, 30751, 32201) },
+                { AOM_CDF4(11141, 24184, 29285) },
+                { AOM_CDF4(6420, 16940, 23774) },
+                { AOM_CDF4(3392, 9753, 15118) },
+                { AOM_CDF4(28465, 32616, 32712) },
+                { AOM_CDF4(19850, 30702, 32244) },
+                { AOM_CDF4(10983, 24024, 29223) },
+                { AOM_CDF4(6294, 16770, 23582) },
+                { AOM_CDF4(3244, 9283, 14509) },
+                { AOM_CDF4(30023, 32717, 32748) },
+                { AOM_CDF4(22940, 32032, 32626) },
+                { AOM_CDF4(14282, 27928, 31473) },
+                { AOM_CDF4(8562, 21327, 27914) },
+                { AOM_CDF4(4846, 13393, 19919) },
+                { AOM_CDF4(29981, 32590, 32695) },
+                { AOM_CDF4(20465, 30963, 32166) },
+                { AOM_CDF4(11479, 23579, 28195) },
+                { AOM_CDF4(5916, 15648, 22073) },
+                { AOM_CDF4(3031, 8605, 13398) },
+                { AOM_CDF4(31146, 32691, 32739) },
+                { AOM_CDF4(23106, 31724, 32444) },
+                { AOM_CDF4(13783, 26738, 30439) },
+                { AOM_CDF4(7852, 19468, 25807) },
+                { AOM_CDF4(3860, 11124, 16853) },
+                { AOM_CDF4(31014, 32724, 32748) },
+                { AOM_CDF4(23629, 32109, 32628) },
+                { AOM_CDF4(14747, 28115, 31403) },
+                { AOM_CDF4(8545, 21242, 27478) },
+                { AOM_CDF4(4574, 12781, 19067) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(9185, 19694, 24688) },
+                { AOM_CDF4(26081, 31985, 32621) },
+                { AOM_CDF4(16015, 29000, 31787) },
+                { AOM_CDF4(10542, 23690, 29206) },
+                { AOM_CDF4(6732, 17945, 24677) },
+                { AOM_CDF4(3916, 11039, 16722) },
+                { AOM_CDF4(28224, 32566, 32744) },
+                { AOM_CDF4(19100, 31138, 32485) },
+                { AOM_CDF4(12528, 26620, 30879) },
+                { AOM_CDF4(7741, 20277, 26885) },
+                { AOM_CDF4(4566, 12845, 18990) },
+                { AOM_CDF4(29933, 32593, 32718) },
+                { AOM_CDF4(17670, 30333, 32155) },
+                { AOM_CDF4(10385, 23600, 28909) },
+                { AOM_CDF4(6243, 16236, 22407) },
+                { AOM_CDF4(3976, 10389, 16017) },
+                { AOM_CDF4(28377, 32561, 32738) },
+                { AOM_CDF4(19366, 31175, 32482) },
+                { AOM_CDF4(13327, 27175, 31094) },
+                { AOM_CDF4(8258, 20769, 27143) },
+                { AOM_CDF4(4703, 13198, 19527) },
+                { AOM_CDF4(31086, 32706, 32748) },
+                { AOM_CDF4(22853, 31902, 32583) },
+                { AOM_CDF4(14759, 28186, 31419) },
+                { AOM_CDF4(9284, 22382, 28348) },
+                { AOM_CDF4(5585, 15192, 21868) },
+                { AOM_CDF4(28291, 32652, 32746) },
+                { AOM_CDF4(19849, 32107, 32571) },
+                { AOM_CDF4(14834, 26818, 29214) },
+                { AOM_CDF4(10306, 22594, 28672) },
+                { AOM_CDF4(6615, 17384, 23384) },
+                { AOM_CDF4(28947, 32604, 32745) },
+                { AOM_CDF4(25625, 32289, 32646) },
+                { AOM_CDF4(18758, 28672, 31403) },
+                { AOM_CDF4(10017, 23430, 28523) },
+                { AOM_CDF4(6862, 15269, 22131) },
+                { AOM_CDF4(23933, 32509, 32739) },
+                { AOM_CDF4(19927, 31495, 32631) },
+                { AOM_CDF4(11903, 26023, 30621) },
+                { AOM_CDF4(7026, 20094, 27252) },
+                { AOM_CDF4(5998, 18106, 24437) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(4456, 11274, 15533) },
+                { AOM_CDF4(21219, 29079, 31616) },
+                { AOM_CDF4(11173, 23774, 28567) },
+                { AOM_CDF4(7282, 18293, 24263) },
+                { AOM_CDF4(4890, 13286, 19115) },
+                { AOM_CDF4(1890, 5508, 8659) },
+                { AOM_CDF4(26651, 32136, 32647) },
+                { AOM_CDF4(14630, 28254, 31455) },
+                { AOM_CDF4(8716, 21287, 27395) },
+                { AOM_CDF4(5615, 15331, 22008) },
+                { AOM_CDF4(2675, 7700, 12150) },
+                { AOM_CDF4(29954, 32526, 32690) },
+                { AOM_CDF4(16126, 28982, 31633) },
+                { AOM_CDF4(9030, 21361, 27352) },
+                { AOM_CDF4(5411, 14793, 21271) },
+                { AOM_CDF4(2943, 8422, 13163) },
+                { AOM_CDF4(29539, 32601, 32730) },
+                { AOM_CDF4(18125, 30385, 32201) },
+                { AOM_CDF4(10422, 24090, 29468) },
+                { AOM_CDF4(6468, 17487, 24438) },
+                { AOM_CDF4(2970, 8653, 13531) },
+                { AOM_CDF4(30912, 32715, 32748) },
+                { AOM_CDF4(20666, 31373, 32497) },
+                { AOM_CDF4(12509, 26640, 30917) },
+                { AOM_CDF4(8058, 20629, 27290) },
+                { AOM_CDF4(4231, 12006, 18052) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(10202, 20633, 25484) },
+                { AOM_CDF4(27336, 31445, 32352) },
+                { AOM_CDF4(12420, 24384, 28552) },
+                { AOM_CDF4(7648, 18115, 23856) },
+                { AOM_CDF4(5662, 14341, 19902) },
+                { AOM_CDF4(3611, 10328, 15390) },
+                { AOM_CDF4(30945, 32616, 32736) },
+                { AOM_CDF4(18682, 30505, 32253) },
+                { AOM_CDF4(11513, 25336, 30203) },
+                { AOM_CDF4(7449, 19452, 26148) },
+                { AOM_CDF4(4482, 13051, 18886) },
+                { AOM_CDF4(32022, 32690, 32747) },
+                { AOM_CDF4(18578, 30501, 32146) },
+                { AOM_CDF4(11249, 23368, 28631) },
+                { AOM_CDF4(5645, 16958, 22158) },
+                { AOM_CDF4(5009, 11444, 16637) },
+                { AOM_CDF4(31357, 32710, 32748) },
+                { AOM_CDF4(21552, 31494, 32504) },
+                { AOM_CDF4(13891, 27677, 31340) },
+                { AOM_CDF4(9051, 22098, 28172) },
+                { AOM_CDF4(5190, 13377, 19486) },
+                { AOM_CDF4(32364, 32740, 32748) },
+                { AOM_CDF4(24839, 31907, 32551) },
+                { AOM_CDF4(17160, 28779, 31696) },
+                { AOM_CDF4(12452, 24137, 29602) },
+                { AOM_CDF4(6165, 15389, 22477) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(2575, 7281, 11077) },
+                { AOM_CDF4(14002, 20866, 25402) },
+                { AOM_CDF4(6343, 15056, 19658) },
+                { AOM_CDF4(4474, 11858, 17041) },
+                { AOM_CDF4(2865, 8299, 12534) },
+                { AOM_CDF4(1344, 3949, 6391) },
+                { AOM_CDF4(24720, 31239, 32459) },
+                { AOM_CDF4(12585, 25356, 29968) },
+                { AOM_CDF4(7181, 18246, 24444) },
+                { AOM_CDF4(5025, 13667, 19885) },
+                { AOM_CDF4(2521, 7304, 11605) },
+                { AOM_CDF4(29908, 32252, 32584) },
+                { AOM_CDF4(17421, 29156, 31575) },
+                { AOM_CDF4(9889, 22188, 27782) },
+                { AOM_CDF4(5878, 15647, 22123) },
+                { AOM_CDF4(2814, 8665, 13323) },
+                { AOM_CDF4(30183, 32568, 32713) },
+                { AOM_CDF4(18528, 30195, 32049) },
+                { AOM_CDF4(10982, 24606, 29657) },
+                { AOM_CDF4(6957, 18165, 25231) },
+                { AOM_CDF4(3508, 10118, 15468) },
+                { AOM_CDF4(31761, 32736, 32748) },
+                { AOM_CDF4(21041, 31328, 32546) },
+                { AOM_CDF4(12568, 26732, 31166) },
+                { AOM_CDF4(8052, 20720, 27733) },
+                { AOM_CDF4(4336, 12192, 18396) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } } },
+          { { { { AOM_CDF4(7062, 16472, 22319) },
+                { AOM_CDF4(24538, 32261, 32674) },
+                { AOM_CDF4(13675, 28041, 31779) },
+                { AOM_CDF4(8590, 20674, 27631) },
+                { AOM_CDF4(5685, 14675, 22013) },
+                { AOM_CDF4(3655, 9898, 15731) },
+                { AOM_CDF4(26493, 32418, 32658) },
+                { AOM_CDF4(16376, 29342, 32090) },
+                { AOM_CDF4(10594, 22649, 28970) },
+                { AOM_CDF4(8176, 17170, 24303) },
+                { AOM_CDF4(5605, 12694, 19139) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(23888, 31902, 32542) },
+                { AOM_CDF4(18612, 29687, 31987) },
+                { AOM_CDF4(16245, 24852, 29249) },
+                { AOM_CDF4(15765, 22608, 27559) },
+                { AOM_CDF4(19895, 24699, 27510) },
+                { AOM_CDF4(28401, 32212, 32457) },
+                { AOM_CDF4(15274, 27825, 30980) },
+                { AOM_CDF4(9364, 18128, 24332) },
+                { AOM_CDF4(2283, 8193, 15082) },
+                { AOM_CDF4(1228, 3972, 7881) },
+                { AOM_CDF4(29455, 32469, 32620) },
+                { AOM_CDF4(17981, 28245, 31388) },
+                { AOM_CDF4(10921, 20098, 26240) },
+                { AOM_CDF4(3743, 11829, 18657) },
+                { AOM_CDF4(2374, 9593, 15715) },
+                { AOM_CDF4(31068, 32466, 32635) },
+                { AOM_CDF4(20321, 29572, 31971) },
+                { AOM_CDF4(10771, 20255, 27119) },
+                { AOM_CDF4(2795, 10410, 17361) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(9320, 22102, 27840) },
+                { AOM_CDF4(27057, 32464, 32724) },
+                { AOM_CDF4(16331, 30268, 32309) },
+                { AOM_CDF4(10319, 23935, 29720) },
+                { AOM_CDF4(6189, 16448, 24106) },
+                { AOM_CDF4(3589, 10884, 18808) },
+                { AOM_CDF4(29026, 32624, 32748) },
+                { AOM_CDF4(19226, 31507, 32587) },
+                { AOM_CDF4(12692, 26921, 31203) },
+                { AOM_CDF4(7049, 19532, 27635) },
+                { AOM_CDF4(7727, 15669, 23252) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(28056, 32625, 32748) },
+                { AOM_CDF4(22383, 32075, 32669) },
+                { AOM_CDF4(15417, 27098, 31749) },
+                { AOM_CDF4(18127, 26493, 27190) },
+                { AOM_CDF4(5461, 16384, 21845) },
+                { AOM_CDF4(27982, 32091, 32584) },
+                { AOM_CDF4(19045, 29868, 31972) },
+                { AOM_CDF4(10397, 22266, 27932) },
+                { AOM_CDF4(5990, 13697, 21500) },
+                { AOM_CDF4(1792, 6912, 15104) },
+                { AOM_CDF4(28198, 32501, 32718) },
+                { AOM_CDF4(21534, 31521, 32569) },
+                { AOM_CDF4(11109, 25217, 30017) },
+                { AOM_CDF4(5671, 15124, 26151) },
+                { AOM_CDF4(4681, 14043, 18725) },
+                { AOM_CDF4(28688, 32580, 32741) },
+                { AOM_CDF4(22576, 32079, 32661) },
+                { AOM_CDF4(10627, 22141, 28340) },
+                { AOM_CDF4(9362, 14043, 28087) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(7754, 16948, 22142) },
+                { AOM_CDF4(25670, 32330, 32691) },
+                { AOM_CDF4(15663, 29225, 31994) },
+                { AOM_CDF4(9878, 23288, 29158) },
+                { AOM_CDF4(6419, 17088, 24336) },
+                { AOM_CDF4(3859, 11003, 17039) },
+                { AOM_CDF4(27562, 32595, 32725) },
+                { AOM_CDF4(17575, 30588, 32399) },
+                { AOM_CDF4(10819, 24838, 30309) },
+                { AOM_CDF4(7124, 18686, 25916) },
+                { AOM_CDF4(4479, 12688, 19340) },
+                { AOM_CDF4(28385, 32476, 32673) },
+                { AOM_CDF4(15306, 29005, 31938) },
+                { AOM_CDF4(8937, 21615, 28322) },
+                { AOM_CDF4(5982, 15603, 22786) },
+                { AOM_CDF4(3620, 10267, 16136) },
+                { AOM_CDF4(27280, 32464, 32667) },
+                { AOM_CDF4(15607, 29160, 32004) },
+                { AOM_CDF4(9091, 22135, 28740) },
+                { AOM_CDF4(6232, 16632, 24020) },
+                { AOM_CDF4(4047, 11377, 17672) },
+                { AOM_CDF4(29220, 32630, 32718) },
+                { AOM_CDF4(19650, 31220, 32462) },
+                { AOM_CDF4(13050, 26312, 30827) },
+                { AOM_CDF4(9228, 20870, 27468) },
+                { AOM_CDF4(6146, 15149, 21971) },
+                { AOM_CDF4(30169, 32481, 32623) },
+                { AOM_CDF4(17212, 29311, 31554) },
+                { AOM_CDF4(9911, 21311, 26882) },
+                { AOM_CDF4(4487, 13314, 20372) },
+                { AOM_CDF4(2570, 7772, 12889) },
+                { AOM_CDF4(30924, 32613, 32708) },
+                { AOM_CDF4(19490, 30206, 32107) },
+                { AOM_CDF4(11232, 23998, 29276) },
+                { AOM_CDF4(6769, 17955, 25035) },
+                { AOM_CDF4(4398, 12623, 19214) },
+                { AOM_CDF4(30609, 32627, 32722) },
+                { AOM_CDF4(19370, 30582, 32287) },
+                { AOM_CDF4(10457, 23619, 29409) },
+                { AOM_CDF4(6443, 17637, 24834) },
+                { AOM_CDF4(4645, 13236, 20106) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(8626, 20271, 26216) },
+                { AOM_CDF4(26707, 32406, 32711) },
+                { AOM_CDF4(16999, 30329, 32286) },
+                { AOM_CDF4(11445, 25123, 30286) },
+                { AOM_CDF4(6411, 18828, 25601) },
+                { AOM_CDF4(6801, 12458, 20248) },
+                { AOM_CDF4(29918, 32682, 32748) },
+                { AOM_CDF4(20649, 31739, 32618) },
+                { AOM_CDF4(12879, 27773, 31581) },
+                { AOM_CDF4(7896, 21751, 28244) },
+                { AOM_CDF4(5260, 14870, 23698) },
+                { AOM_CDF4(29252, 32593, 32731) },
+                { AOM_CDF4(17072, 30460, 32294) },
+                { AOM_CDF4(10653, 24143, 29365) },
+                { AOM_CDF4(6536, 17490, 23983) },
+                { AOM_CDF4(4929, 13170, 20085) },
+                { AOM_CDF4(28137, 32518, 32715) },
+                { AOM_CDF4(18171, 30784, 32407) },
+                { AOM_CDF4(11437, 25436, 30459) },
+                { AOM_CDF4(7252, 18534, 26176) },
+                { AOM_CDF4(4126, 13353, 20978) },
+                { AOM_CDF4(31162, 32726, 32748) },
+                { AOM_CDF4(23017, 32222, 32701) },
+                { AOM_CDF4(15629, 29233, 32046) },
+                { AOM_CDF4(9387, 22621, 29480) },
+                { AOM_CDF4(6922, 17616, 25010) },
+                { AOM_CDF4(28838, 32265, 32614) },
+                { AOM_CDF4(19701, 30206, 31920) },
+                { AOM_CDF4(11214, 22410, 27933) },
+                { AOM_CDF4(5320, 14177, 23034) },
+                { AOM_CDF4(5049, 12881, 17827) },
+                { AOM_CDF4(27484, 32471, 32734) },
+                { AOM_CDF4(21076, 31526, 32561) },
+                { AOM_CDF4(12707, 26303, 31211) },
+                { AOM_CDF4(8169, 21722, 28219) },
+                { AOM_CDF4(6045, 19406, 27042) },
+                { AOM_CDF4(27753, 32572, 32745) },
+                { AOM_CDF4(20832, 31878, 32653) },
+                { AOM_CDF4(13250, 27356, 31674) },
+                { AOM_CDF4(7718, 21508, 29858) },
+                { AOM_CDF4(7209, 18350, 25559) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(7876, 16901, 21741) },
+                { AOM_CDF4(24001, 31898, 32625) },
+                { AOM_CDF4(14529, 27959, 31451) },
+                { AOM_CDF4(8273, 20818, 27258) },
+                { AOM_CDF4(5278, 14673, 21510) },
+                { AOM_CDF4(2983, 8843, 14039) },
+                { AOM_CDF4(28016, 32574, 32732) },
+                { AOM_CDF4(17471, 30306, 32301) },
+                { AOM_CDF4(10224, 24063, 29728) },
+                { AOM_CDF4(6602, 17954, 25052) },
+                { AOM_CDF4(4002, 11585, 17759) },
+                { AOM_CDF4(30190, 32634, 32739) },
+                { AOM_CDF4(17497, 30282, 32270) },
+                { AOM_CDF4(10229, 23729, 29538) },
+                { AOM_CDF4(6344, 17211, 24440) },
+                { AOM_CDF4(3849, 11189, 17108) },
+                { AOM_CDF4(28570, 32583, 32726) },
+                { AOM_CDF4(17521, 30161, 32238) },
+                { AOM_CDF4(10153, 23565, 29378) },
+                { AOM_CDF4(6455, 17341, 24443) },
+                { AOM_CDF4(3907, 11042, 17024) },
+                { AOM_CDF4(30689, 32715, 32748) },
+                { AOM_CDF4(21546, 31840, 32610) },
+                { AOM_CDF4(13547, 27581, 31459) },
+                { AOM_CDF4(8912, 21757, 28309) },
+                { AOM_CDF4(5548, 15080, 22046) },
+                { AOM_CDF4(30783, 32540, 32685) },
+                { AOM_CDF4(17540, 29528, 31668) },
+                { AOM_CDF4(10160, 21468, 26783) },
+                { AOM_CDF4(4724, 13393, 20054) },
+                { AOM_CDF4(2702, 8174, 13102) },
+                { AOM_CDF4(31648, 32686, 32742) },
+                { AOM_CDF4(20954, 31094, 32337) },
+                { AOM_CDF4(12420, 25698, 30179) },
+                { AOM_CDF4(7304, 19320, 26248) },
+                { AOM_CDF4(4366, 12261, 18864) },
+                { AOM_CDF4(31581, 32723, 32748) },
+                { AOM_CDF4(21373, 31586, 32525) },
+                { AOM_CDF4(12744, 26625, 30885) },
+                { AOM_CDF4(7431, 20322, 26950) },
+                { AOM_CDF4(4692, 13323, 20111) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(7833, 18369, 24095) },
+                { AOM_CDF4(26650, 32273, 32702) },
+                { AOM_CDF4(16371, 29961, 32191) },
+                { AOM_CDF4(11055, 24082, 29629) },
+                { AOM_CDF4(6892, 18644, 25400) },
+                { AOM_CDF4(5006, 13057, 19240) },
+                { AOM_CDF4(29834, 32666, 32748) },
+                { AOM_CDF4(19577, 31335, 32570) },
+                { AOM_CDF4(12253, 26509, 31122) },
+                { AOM_CDF4(7991, 20772, 27711) },
+                { AOM_CDF4(5677, 15910, 23059) },
+                { AOM_CDF4(30109, 32532, 32720) },
+                { AOM_CDF4(16747, 30166, 32252) },
+                { AOM_CDF4(10134, 23542, 29184) },
+                { AOM_CDF4(5791, 16176, 23556) },
+                { AOM_CDF4(4362, 10414, 17284) },
+                { AOM_CDF4(29492, 32626, 32748) },
+                { AOM_CDF4(19894, 31402, 32525) },
+                { AOM_CDF4(12942, 27071, 30869) },
+                { AOM_CDF4(8346, 21216, 27405) },
+                { AOM_CDF4(6572, 17087, 23859) },
+                { AOM_CDF4(32035, 32735, 32748) },
+                { AOM_CDF4(22957, 31838, 32618) },
+                { AOM_CDF4(14724, 28572, 31772) },
+                { AOM_CDF4(10364, 23999, 29553) },
+                { AOM_CDF4(7004, 18433, 25655) },
+                { AOM_CDF4(27528, 32277, 32681) },
+                { AOM_CDF4(16959, 31171, 32096) },
+                { AOM_CDF4(10486, 23593, 27962) },
+                { AOM_CDF4(8192, 16384, 23211) },
+                { AOM_CDF4(8937, 17873, 20852) },
+                { AOM_CDF4(27715, 32002, 32615) },
+                { AOM_CDF4(15073, 29491, 31676) },
+                { AOM_CDF4(11264, 24576, 28672) },
+                { AOM_CDF4(2341, 18725, 23406) },
+                { AOM_CDF4(7282, 18204, 25486) },
+                { AOM_CDF4(28547, 32213, 32657) },
+                { AOM_CDF4(20788, 29773, 32239) },
+                { AOM_CDF4(6780, 21469, 30508) },
+                { AOM_CDF4(5958, 14895, 23831) },
+                { AOM_CDF4(16384, 21845, 27307) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(5992, 14304, 19765) },
+                { AOM_CDF4(22612, 31238, 32456) },
+                { AOM_CDF4(13456, 27162, 31087) },
+                { AOM_CDF4(8001, 20062, 26504) },
+                { AOM_CDF4(5168, 14105, 20764) },
+                { AOM_CDF4(2632, 7771, 12385) },
+                { AOM_CDF4(27034, 32344, 32709) },
+                { AOM_CDF4(15850, 29415, 31997) },
+                { AOM_CDF4(9494, 22776, 28841) },
+                { AOM_CDF4(6151, 16830, 23969) },
+                { AOM_CDF4(3461, 10039, 15722) },
+                { AOM_CDF4(30134, 32569, 32731) },
+                { AOM_CDF4(15638, 29422, 31945) },
+                { AOM_CDF4(9150, 21865, 28218) },
+                { AOM_CDF4(5647, 15719, 22676) },
+                { AOM_CDF4(3402, 9772, 15477) },
+                { AOM_CDF4(28530, 32586, 32735) },
+                { AOM_CDF4(17139, 30298, 32292) },
+                { AOM_CDF4(10200, 24039, 29685) },
+                { AOM_CDF4(6419, 17674, 24786) },
+                { AOM_CDF4(3544, 10225, 15824) },
+                { AOM_CDF4(31333, 32726, 32748) },
+                { AOM_CDF4(20618, 31487, 32544) },
+                { AOM_CDF4(12901, 27217, 31232) },
+                { AOM_CDF4(8624, 21734, 28171) },
+                { AOM_CDF4(5104, 14191, 20748) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(11206, 21090, 26561) },
+                { AOM_CDF4(28759, 32279, 32671) },
+                { AOM_CDF4(14171, 27952, 31569) },
+                { AOM_CDF4(9743, 22907, 29141) },
+                { AOM_CDF4(6871, 17886, 24868) },
+                { AOM_CDF4(4960, 13152, 19315) },
+                { AOM_CDF4(31077, 32661, 32748) },
+                { AOM_CDF4(19400, 31195, 32515) },
+                { AOM_CDF4(12752, 26858, 31040) },
+                { AOM_CDF4(8370, 22098, 28591) },
+                { AOM_CDF4(5457, 15373, 22298) },
+                { AOM_CDF4(31697, 32706, 32748) },
+                { AOM_CDF4(17860, 30657, 32333) },
+                { AOM_CDF4(12510, 24812, 29261) },
+                { AOM_CDF4(6180, 19124, 24722) },
+                { AOM_CDF4(5041, 13548, 17959) },
+                { AOM_CDF4(31552, 32716, 32748) },
+                { AOM_CDF4(21908, 31769, 32623) },
+                { AOM_CDF4(14470, 28201, 31565) },
+                { AOM_CDF4(9493, 22982, 28608) },
+                { AOM_CDF4(6858, 17240, 24137) },
+                { AOM_CDF4(32543, 32752, 32756) },
+                { AOM_CDF4(24286, 32097, 32666) },
+                { AOM_CDF4(15958, 29217, 32024) },
+                { AOM_CDF4(10207, 24234, 29958) },
+                { AOM_CDF4(6929, 18305, 25652) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } },
+            { { { AOM_CDF4(4137, 10847, 15682) },
+                { AOM_CDF4(17824, 27001, 30058) },
+                { AOM_CDF4(10204, 22796, 28291) },
+                { AOM_CDF4(6076, 15935, 22125) },
+                { AOM_CDF4(3852, 10937, 16816) },
+                { AOM_CDF4(2252, 6324, 10131) },
+                { AOM_CDF4(25840, 32016, 32662) },
+                { AOM_CDF4(15109, 28268, 31531) },
+                { AOM_CDF4(9385, 22231, 28340) },
+                { AOM_CDF4(6082, 16672, 23479) },
+                { AOM_CDF4(3318, 9427, 14681) },
+                { AOM_CDF4(30594, 32574, 32718) },
+                { AOM_CDF4(16836, 29552, 31859) },
+                { AOM_CDF4(9556, 22542, 28356) },
+                { AOM_CDF4(6305, 16725, 23540) },
+                { AOM_CDF4(3376, 9895, 15184) },
+                { AOM_CDF4(29383, 32617, 32745) },
+                { AOM_CDF4(18891, 30809, 32401) },
+                { AOM_CDF4(11688, 25942, 30687) },
+                { AOM_CDF4(7468, 19469, 26651) },
+                { AOM_CDF4(3909, 11358, 17012) },
+                { AOM_CDF4(31564, 32736, 32748) },
+                { AOM_CDF4(20906, 31611, 32600) },
+                { AOM_CDF4(13191, 27621, 31537) },
+                { AOM_CDF4(8768, 22029, 28676) },
+                { AOM_CDF4(5079, 14109, 20906) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } },
+              { { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) },
+                { AOM_CDF4(8192, 16384, 24576) } } } } };
 
-typedef coeff_cdf_model coeff_cdf_table[TX_SIZES][PLANE_TYPES];
-static const coeff_cdf_table *av1_default_qctx_coef_cdfs[TOKEN_CDF_Q_CTXS] = {
-  &av1_default_coef_head_cdfs_q0, &av1_default_coef_head_cdfs_q1,
-  &av1_default_coef_head_cdfs_q2, &av1_default_coef_head_cdfs_q3,
-};
+static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs
+    [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE(
+        NUM_BASE_LEVELS + 1)] = { { { { { AOM_CDF3(17837, 29055) },
+                                        { AOM_CDF3(29600, 31446) },
+                                        { AOM_CDF3(30844, 31878) },
+                                        { AOM_CDF3(24926, 28948) } },
+                                      { { AOM_CDF3(21365, 30026) },
+                                        { AOM_CDF3(30512, 32423) },
+                                        { AOM_CDF3(31658, 32621) },
+                                        { AOM_CDF3(29630, 31881) } } },
+                                    { { { AOM_CDF3(5717, 26477) },
+                                        { AOM_CDF3(30491, 31703) },
+                                        { AOM_CDF3(31550, 32158) },
+                                        { AOM_CDF3(29648, 31491) } },
+                                      { { AOM_CDF3(12608, 27820) },
+                                        { AOM_CDF3(30680, 32225) },
+                                        { AOM_CDF3(30809, 32335) },
+                                        { AOM_CDF3(31299, 32423) } } },
+                                    { { { AOM_CDF3(1786, 12612) },
+                                        { AOM_CDF3(30663, 31625) },
+                                        { AOM_CDF3(32339, 32468) },
+                                        { AOM_CDF3(31148, 31833) } },
+                                      { { AOM_CDF3(18857, 23865) },
+                                        { AOM_CDF3(31428, 32428) },
+                                        { AOM_CDF3(31744, 32373) },
+                                        { AOM_CDF3(31775, 32526) } } },
+                                    { { { AOM_CDF3(1787, 2532) },
+                                        { AOM_CDF3(30832, 31662) },
+                                        { AOM_CDF3(31824, 32682) },
+                                        { AOM_CDF3(32133, 32569) } },
+                                      { { AOM_CDF3(13751, 22235) },
+                                        { AOM_CDF3(32089, 32409) },
+                                        { AOM_CDF3(27084, 27920) },
+                                        { AOM_CDF3(29291, 32594) } } },
+                                    { { { AOM_CDF3(1725, 3449) },
+                                        { AOM_CDF3(31102, 31935) },
+                                        { AOM_CDF3(32457, 32613) },
+                                        { AOM_CDF3(32412, 32649) } },
+                                      { { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) } } } },
+                                  { { { { AOM_CDF3(17560, 29888) },
+                                        { AOM_CDF3(29671, 31549) },
+                                        { AOM_CDF3(31007, 32056) },
+                                        { AOM_CDF3(27286, 30006) } },
+                                      { { AOM_CDF3(26594, 31212) },
+                                        { AOM_CDF3(31208, 32582) },
+                                        { AOM_CDF3(31835, 32637) },
+                                        { AOM_CDF3(30595, 32206) } } },
+                                    { { { AOM_CDF3(15239, 29932) },
+                                        { AOM_CDF3(31315, 32095) },
+                                        { AOM_CDF3(32130, 32434) },
+                                        { AOM_CDF3(30864, 31996) } },
+                                      { { AOM_CDF3(26279, 30968) },
+                                        { AOM_CDF3(31142, 32495) },
+                                        { AOM_CDF3(31713, 32540) },
+                                        { AOM_CDF3(31929, 32594) } } },
+                                    { { { AOM_CDF3(2644, 25198) },
+                                        { AOM_CDF3(32038, 32451) },
+                                        { AOM_CDF3(32639, 32695) },
+                                        { AOM_CDF3(32166, 32518) } },
+                                      { { AOM_CDF3(17187, 27668) },
+                                        { AOM_CDF3(31714, 32550) },
+                                        { AOM_CDF3(32283, 32678) },
+                                        { AOM_CDF3(31930, 32563) } } },
+                                    { { { AOM_CDF3(1044, 2257) },
+                                        { AOM_CDF3(30755, 31923) },
+                                        { AOM_CDF3(32208, 32693) },
+                                        { AOM_CDF3(32244, 32615) } },
+                                      { { AOM_CDF3(21317, 26207) },
+                                        { AOM_CDF3(29133, 30868) },
+                                        { AOM_CDF3(29311, 31231) },
+                                        { AOM_CDF3(29657, 31087) } } },
+                                    { { { AOM_CDF3(478, 1834) },
+                                        { AOM_CDF3(31005, 31987) },
+                                        { AOM_CDF3(32317, 32724) },
+                                        { AOM_CDF3(30865, 32648) } },
+                                      { { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) } } } },
+                                  { { { { AOM_CDF3(20092, 30774) },
+                                        { AOM_CDF3(30695, 32020) },
+                                        { AOM_CDF3(31131, 32103) },
+                                        { AOM_CDF3(28666, 30870) } },
+                                      { { AOM_CDF3(27258, 31095) },
+                                        { AOM_CDF3(31804, 32623) },
+                                        { AOM_CDF3(31763, 32528) },
+                                        { AOM_CDF3(31438, 32506) } } },
+                                    { { { AOM_CDF3(18049, 30489) },
+                                        { AOM_CDF3(31706, 32286) },
+                                        { AOM_CDF3(32163, 32473) },
+                                        { AOM_CDF3(31550, 32184) } },
+                                      { { AOM_CDF3(27116, 30842) },
+                                        { AOM_CDF3(31971, 32598) },
+                                        { AOM_CDF3(32088, 32576) },
+                                        { AOM_CDF3(32067, 32664) } } },
+                                    { { { AOM_CDF3(12854, 29093) },
+                                        { AOM_CDF3(32272, 32558) },
+                                        { AOM_CDF3(32667, 32729) },
+                                        { AOM_CDF3(32306, 32585) } },
+                                      { { AOM_CDF3(25476, 30366) },
+                                        { AOM_CDF3(32169, 32687) },
+                                        { AOM_CDF3(32479, 32689) },
+                                        { AOM_CDF3(31673, 32634) } } },
+                                    { { { AOM_CDF3(2809, 19301) },
+                                        { AOM_CDF3(32205, 32622) },
+                                        { AOM_CDF3(32338, 32730) },
+                                        { AOM_CDF3(31786, 32616) } },
+                                      { { AOM_CDF3(22737, 29105) },
+                                        { AOM_CDF3(30810, 32362) },
+                                        { AOM_CDF3(30014, 32627) },
+                                        { AOM_CDF3(30528, 32574) } } },
+                                    { { { AOM_CDF3(935, 3382) },
+                                        { AOM_CDF3(30789, 31909) },
+                                        { AOM_CDF3(32466, 32756) },
+                                        { AOM_CDF3(30860, 32513) } },
+                                      { { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) } } } },
+                                  { { { { AOM_CDF3(22497, 31198) },
+                                        { AOM_CDF3(31715, 32495) },
+                                        { AOM_CDF3(31606, 32337) },
+                                        { AOM_CDF3(30388, 31990) } },
+                                      { { AOM_CDF3(27877, 31584) },
+                                        { AOM_CDF3(32170, 32728) },
+                                        { AOM_CDF3(32155, 32688) },
+                                        { AOM_CDF3(32219, 32702) } } },
+                                    { { { AOM_CDF3(21457, 31043) },
+                                        { AOM_CDF3(31951, 32483) },
+                                        { AOM_CDF3(32153, 32562) },
+                                        { AOM_CDF3(31473, 32215) } },
+                                      { { AOM_CDF3(27558, 31151) },
+                                        { AOM_CDF3(32020, 32640) },
+                                        { AOM_CDF3(32097, 32575) },
+                                        { AOM_CDF3(32242, 32719) } } },
+                                    { { { AOM_CDF3(19980, 30591) },
+                                        { AOM_CDF3(32219, 32597) },
+                                        { AOM_CDF3(32581, 32706) },
+                                        { AOM_CDF3(31803, 32287) } },
+                                      { { AOM_CDF3(26473, 30507) },
+                                        { AOM_CDF3(32431, 32723) },
+                                        { AOM_CDF3(32196, 32611) },
+                                        { AOM_CDF3(31588, 32528) } } },
+                                    { { { AOM_CDF3(24647, 30463) },
+                                        { AOM_CDF3(32412, 32695) },
+                                        { AOM_CDF3(32468, 32720) },
+                                        { AOM_CDF3(31269, 32523) } },
+                                      { { AOM_CDF3(28482, 31505) },
+                                        { AOM_CDF3(32152, 32701) },
+                                        { AOM_CDF3(31732, 32598) },
+                                        { AOM_CDF3(31767, 32712) } } },
+                                    { { { AOM_CDF3(12358, 24977) },
+                                        { AOM_CDF3(31331, 32385) },
+                                        { AOM_CDF3(32634, 32756) },
+                                        { AOM_CDF3(30411, 32548) } },
+                                      { { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) },
+                                        { AOM_CDF3(10923, 21845) } } } } };
diff --git a/third_party/aom/av1/common/txb_common.c b/third_party/aom/av1/common/txb_common.c
index c5b91e991..c96d37cca 100644
--- a/third_party/aom/av1/common/txb_common.c
+++ b/third_party/aom/av1/common/txb_common.c
@@ -12,17 +12,17 @@
 #include "av1/common/onyxc_int.h"
 #include "av1/common/txb_common.h"
 
-const int16_t av1_coeff_band_4x4[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
-                                         8, 9, 10, 11, 12, 13, 14, 15 };
+const int8_t av1_coeff_band_4x4[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
+                                        8, 9, 10, 11, 12, 13, 14, 15 };
 
-const int16_t av1_coeff_band_8x8[64] = {
+const int8_t av1_coeff_band_8x8[64] = {
   0,  1,  2,  2,  3,  3,  4,  4,  5,  6,  2,  2,  3,  3,  4,  4,
   7,  7,  8,  8,  9,  9,  10, 10, 7,  7,  8,  8,  9,  9,  10, 10,
   11, 11, 12, 12, 13, 13, 14, 14, 11, 11, 12, 12, 13, 13, 14, 14,
   15, 15, 16, 16, 17, 17, 18, 18, 15, 15, 16, 16, 17, 17, 18, 18,
 };
 
-const int16_t av1_coeff_band_16x16[256] = {
+const int8_t av1_coeff_band_16x16[256] = {
   0,  1,  4,  4,  7,  7,  7,  7,  8,  8,  8,  8,  9,  9,  9,  9,  2,  3,  4,
   4,  7,  7,  7,  7,  8,  8,  8,  8,  9,  9,  9,  9,  5,  5,  6,  6,  7,  7,
   7,  7,  8,  8,  8,  8,  9,  9,  9,  9,  5,  5,  6,  6,  7,  7,  7,  7,  8,
@@ -39,7 +39,7 @@ const int16_t av1_coeff_band_16x16[256] = {
   19, 20, 20, 20, 20, 21, 21, 21, 21,
 };
 
-const int16_t av1_coeff_band_32x32[1024] = {
+const int8_t av1_coeff_band_32x32[1024] = {
   0,  1,  4,  4,  7,  7,  7,  7,  10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
   11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 2,  3,  4,  4,  7,  7,
   7,  7,  10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12,
@@ -96,223 +96,372 @@ const int16_t av1_coeff_band_32x32[1024] = {
   22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
 };
 
-#if LV_MAP_PROB
-void av1_init_txb_probs(FRAME_CONTEXT *fc) {
-  TX_SIZE tx_size;
-  int plane, ctx, level;
+// The ctx offset table when TX is TX_CLASS_2D.
+// TX col and row indices are clamped to 4
 
-  // Update probability models for transform block skip flag
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) {
-      fc->txb_skip_cdf[tx_size][ctx][0] =
-          AOM_ICDF(128 * (aom_cdf_prob)fc->txb_skip[tx_size][ctx]);
-      fc->txb_skip_cdf[tx_size][ctx][1] = AOM_ICDF(32768);
-      fc->txb_skip_cdf[tx_size][ctx][2] = 0;
-    }
-  }
+const int8_t av1_nz_map_ctx_offset_4x4[16] = {
+  0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21,
+};
 
-  for (plane = 0; plane < PLANE_TYPES; ++plane) {
-    for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) {
-      fc->dc_sign_cdf[plane][ctx][0] =
-          AOM_ICDF(128 * (aom_cdf_prob)fc->dc_sign[plane][ctx]);
-      fc->dc_sign_cdf[plane][ctx][1] = AOM_ICDF(32768);
-      fc->dc_sign_cdf[plane][ctx][2] = 0;
-    }
-  }
+const int8_t av1_nz_map_ctx_offset_8x8[64] = {
+  0,  1,  6,  6,  21, 21, 21, 21, 1,  6,  6,  21, 21, 21, 21, 21,
+  6,  6,  21, 21, 21, 21, 21, 21, 6,  21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
 
-  // Update probability models for non-zero coefficient map and eob flag.
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane) {
-      for (level = 0; level < NUM_BASE_LEVELS; ++level) {
-        for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) {
-          fc->coeff_base_cdf[tx_size][plane][level][ctx][0] = AOM_ICDF(
-              128 * (aom_cdf_prob)fc->coeff_base[tx_size][plane][level][ctx]);
-          fc->coeff_base_cdf[tx_size][plane][level][ctx][1] = AOM_ICDF(32768);
-          fc->coeff_base_cdf[tx_size][plane][level][ctx][2] = 0;
-        }
-      }
-    }
-  }
+const int8_t av1_nz_map_ctx_offset_16x16[256] = {
+  0,  1,  6,  6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1,  6,  6,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6,  6,  21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6,  21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
 
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane) {
-      for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
-        fc->nz_map_cdf[tx_size][plane][ctx][0] =
-            AOM_ICDF(128 * (aom_cdf_prob)fc->nz_map[tx_size][plane][ctx]);
-        fc->nz_map_cdf[tx_size][plane][ctx][1] = AOM_ICDF(32768);
-        fc->nz_map_cdf[tx_size][plane][ctx][2] = 0;
-      }
+const int8_t av1_nz_map_ctx_offset_32x32[1024] = {
+  0,  1,  6,  6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1,  6,  6,  21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 6,  6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
 
-      for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) {
-        fc->eob_flag_cdf[tx_size][plane][ctx][0] =
-            AOM_ICDF(128 * (aom_cdf_prob)fc->eob_flag[tx_size][plane][ctx]);
-        fc->eob_flag_cdf[tx_size][plane][ctx][1] = AOM_ICDF(32768);
-        fc->eob_flag_cdf[tx_size][plane][ctx][2] = 0;
-      }
-    }
-  }
+const int8_t av1_nz_map_ctx_offset_8x4[32] = {
+  0,  16, 6,  6,  21, 21, 21, 21, 16, 16, 6,  21, 21, 21, 21, 21,
+  16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21,
+};
 
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane) {
-      for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
-        fc->coeff_lps_cdf[tx_size][plane][ctx][0] =
-            AOM_ICDF(128 * (aom_cdf_prob)fc->coeff_lps[tx_size][plane][ctx]);
-        fc->coeff_lps_cdf[tx_size][plane][ctx][1] = AOM_ICDF(32768);
-        fc->coeff_lps_cdf[tx_size][plane][ctx][2] = 0;
-      }
-#if BR_NODE
-      for (int br = 0; br < BASE_RANGE_SETS; ++br) {
-        for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
-          fc->coeff_br_cdf[tx_size][plane][br][ctx][0] = AOM_ICDF(
-              128 * (aom_cdf_prob)fc->coeff_br[tx_size][plane][br][ctx]);
-          fc->coeff_br_cdf[tx_size][plane][br][ctx][1] = AOM_ICDF(32768);
-          fc->coeff_br_cdf[tx_size][plane][br][ctx][2] = 0;
-        }
-      }
-#endif  // BR_NODE
-    }
-  }
-#if CONFIG_CTX1D
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane) {
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) {
-        fc->eob_mode_cdf[tx_size][plane][tx_class][0] = AOM_ICDF(
-            128 * (aom_cdf_prob)fc->eob_mode[tx_size][plane][tx_class]);
-        fc->eob_mode_cdf[tx_size][plane][tx_class][1] = AOM_ICDF(32768);
-        fc->eob_mode_cdf[tx_size][plane][tx_class][2] = 0;
-      }
-    }
-  }
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane) {
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) {
-        for (ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx) {
-          fc->empty_line_cdf[tx_size][plane][tx_class][ctx][0] = AOM_ICDF(
-              128 *
-              (aom_cdf_prob)fc->empty_line[tx_size][plane][tx_class][ctx]);
-          fc->empty_line_cdf[tx_size][plane][tx_class][ctx][1] =
-              AOM_ICDF(32768);
-          fc->empty_line_cdf[tx_size][plane][tx_class][ctx][2] = 0;
-        }
-      }
-    }
-  }
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane) {
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class) {
-        for (ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx) {
-          fc->hv_eob_cdf[tx_size][plane][tx_class][ctx][0] = AOM_ICDF(
-              128 * (aom_cdf_prob)fc->hv_eob[tx_size][plane][tx_class][ctx]);
-          fc->hv_eob_cdf[tx_size][plane][tx_class][ctx][1] = AOM_ICDF(32768);
-          fc->hv_eob_cdf[tx_size][plane][tx_class][ctx][2] = 0;
-        }
-      }
-    }
-  }
-#endif  // CONFIG_CTX1D
-}
-#endif  // LV_MAP_PROB
+const int8_t av1_nz_map_ctx_offset_8x16[128] = {
+  0,  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6,  6,  21,
+  21, 21, 21, 21, 21, 6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
 
-void av1_adapt_txb_probs(AV1_COMMON *cm, unsigned int count_sat,
-                         unsigned int update_factor) {
-  FRAME_CONTEXT *fc = cm->fc;
-  const FRAME_CONTEXT *pre_fc = cm->pre_fc;
-  const FRAME_COUNTS *counts = &cm->counts;
-  TX_SIZE tx_size;
-  int plane, ctx, level;
+const int8_t av1_nz_map_ctx_offset_16x8[128] = {
+  0,  16, 6,  6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
 
-  // Update probability models for transform block skip flag
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size)
-    for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
-      fc->txb_skip[tx_size][ctx] = mode_mv_merge_probs(
-          pre_fc->txb_skip[tx_size][ctx], counts->txb_skip[tx_size][ctx]);
+const int8_t av1_nz_map_ctx_offset_16x32[512] = {
+  0,  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6,  6,  21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6,  21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
 
-  for (plane = 0; plane < PLANE_TYPES; ++plane)
-    for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
-      fc->dc_sign[plane][ctx] = mode_mv_merge_probs(
-          pre_fc->dc_sign[plane][ctx], counts->dc_sign[plane][ctx]);
+const int8_t av1_nz_map_ctx_offset_32x16[512] = {
+  0,  16, 6,  6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6,  21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
 
-  // Update probability models for non-zero coefficient map and eob flag.
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size)
-    for (plane = 0; plane < PLANE_TYPES; ++plane)
-      for (level = 0; level < NUM_BASE_LEVELS; ++level)
-        for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx)
-          fc->coeff_base[tx_size][plane][level][ctx] =
-              merge_probs(pre_fc->coeff_base[tx_size][plane][level][ctx],
-                          counts->coeff_base[tx_size][plane][level][ctx],
-                          count_sat, update_factor);
+const int8_t av1_nz_map_ctx_offset_32x64[1024] = {
+  0,  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 6,  6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
 
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane) {
-      for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
-        fc->nz_map[tx_size][plane][ctx] = merge_probs(
-            pre_fc->nz_map[tx_size][plane][ctx],
-            counts->nz_map[tx_size][plane][ctx], count_sat, update_factor);
-      }
+const int8_t av1_nz_map_ctx_offset_64x32[1024] = {
+  0,  16, 6,  6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6,  21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16,
+  16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
 
-      for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) {
-        fc->eob_flag[tx_size][plane][ctx] = merge_probs(
-            pre_fc->eob_flag[tx_size][plane][ctx],
-            counts->eob_flag[tx_size][plane][ctx], count_sat, update_factor);
-      }
-    }
-  }
+const int8_t av1_nz_map_ctx_offset_4x16[64] = {
+  0,  11, 11, 11, 11, 11, 11, 11, 6,  6,  21, 21, 6,  21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
 
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane) {
-      for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
-        fc->coeff_lps[tx_size][plane][ctx] = merge_probs(
-            pre_fc->coeff_lps[tx_size][plane][ctx],
-            counts->coeff_lps[tx_size][plane][ctx], count_sat, update_factor);
-      }
-#if BR_NODE
-      for (int br = 0; br < BASE_RANGE_SETS; ++br) {
-        for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
-          fc->coeff_br[tx_size][plane][br][ctx] =
-              merge_probs(pre_fc->coeff_br[tx_size][plane][br][ctx],
-                          counts->coeff_br[tx_size][plane][br][ctx], count_sat,
-                          update_factor);
-        }
-      }
-#endif  // BR_NODE
-    }
-  }
-#if CONFIG_CTX1D
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane)
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
-        fc->eob_mode[tx_size][plane][tx_class] =
-            merge_probs(pre_fc->eob_mode[tx_size][plane][tx_class],
-                        counts->eob_mode[tx_size][plane][tx_class], count_sat,
-                        update_factor);
-  }
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane)
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
-        for (ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx)
-          fc->empty_line[tx_size][plane][tx_class][ctx] =
-              merge_probs(pre_fc->empty_line[tx_size][plane][tx_class][ctx],
-                          counts->empty_line[tx_size][plane][tx_class][ctx],
-                          count_sat, update_factor);
-  }
-  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane)
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
-        for (ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx)
-          fc->hv_eob[tx_size][plane][tx_class][ctx] =
-              merge_probs(pre_fc->hv_eob[tx_size][plane][tx_class][ctx],
-                          counts->hv_eob[tx_size][plane][tx_class][ctx],
-                          count_sat, update_factor);
-  }
-#endif
-}
+const int8_t av1_nz_map_ctx_offset_16x4[64] = {
+  0,  16, 6,  6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  16, 16, 6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x32[256] = {
+  0,  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6,  6,  21,
+  21, 21, 21, 21, 21, 6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x8[256] = {
+  0,  16, 6,  6,  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6,  21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+  21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t *av1_nz_map_ctx_offset[19] = {
+  av1_nz_map_ctx_offset_4x4,    // TX_4x4
+  av1_nz_map_ctx_offset_8x8,    // TX_8x8
+  av1_nz_map_ctx_offset_16x16,  // TX_16x16
+  av1_nz_map_ctx_offset_32x32,  // TX_32x32
+  av1_nz_map_ctx_offset_32x32,  // TX_32x32
+  av1_nz_map_ctx_offset_4x16,   // TX_4x8
+  av1_nz_map_ctx_offset_8x4,    // TX_8x4
+  av1_nz_map_ctx_offset_8x32,   // TX_8x16
+  av1_nz_map_ctx_offset_16x8,   // TX_16x8
+  av1_nz_map_ctx_offset_16x32,  // TX_16x32
+  av1_nz_map_ctx_offset_32x16,  // TX_32x16
+  av1_nz_map_ctx_offset_32x64,  // TX_32x64
+  av1_nz_map_ctx_offset_64x32,  // TX_64x32
+  av1_nz_map_ctx_offset_4x16,   // TX_4x16
+  av1_nz_map_ctx_offset_16x4,   // TX_16x4
+  av1_nz_map_ctx_offset_8x32,   // TX_8x32
+  av1_nz_map_ctx_offset_32x8,   // TX_32x8
+  av1_nz_map_ctx_offset_16x32,  // TX_16x64
+  av1_nz_map_ctx_offset_64x32,  // TX_64x16
+};
 
 void av1_init_lv_map(AV1_COMMON *cm) {
   LV_MAP_CTX_TABLE *coeff_ctx_table = &cm->coeff_ctx_table;
   for (int row = 0; row < 2; ++row) {
     for (int col = 0; col < 2; ++col) {
-      for (int sig_mag = 0; sig_mag < 2; ++sig_mag) {
+      for (int sig_mag = 0; sig_mag < 3; ++sig_mag) {
         for (int count = 0; count < BASE_CONTEXT_POSITION_NUM + 1; ++count) {
+          if (row == 0 && col == 0 && count > 5) continue;
+          if ((row == 0 || col == 0) && count > 8) continue;
+
           coeff_ctx_table->base_ctx_table[row][col][sig_mag][count] =
               get_base_ctx_from_count_mag(row, col, count, sig_mag);
         }
@@ -320,3 +469,7 @@ void av1_init_lv_map(AV1_COMMON *cm) {
     }
   }
 }
+
+const int16_t k_eob_group_start[12] = { 0,  1,  2,  3,   5,   9,
+                                        17, 33, 65, 129, 257, 513 };
+const int16_t k_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
index 3bf8f8c61..cdac90d9e 100644
--- a/third_party/aom/av1/common/txb_common.h
+++ b/third_party/aom/av1/common/txb_common.h
@@ -12,72 +12,133 @@
 #ifndef AV1_COMMON_TXB_COMMON_H_
 #define AV1_COMMON_TXB_COMMON_H_
 
-#define REDUCE_CONTEXT_DEPENDENCY 0
-#define MIN_SCAN_IDX_REDUCE_CONTEXT_DEPENDENCY 0
+extern const int16_t k_eob_group_start[12];
+extern const int16_t k_eob_offset_bits[12];
 
-extern const int16_t av1_coeff_band_4x4[16];
+extern const int8_t av1_coeff_band_4x4[16];
 
-extern const int16_t av1_coeff_band_8x8[64];
+extern const int8_t av1_coeff_band_8x8[64];
 
-extern const int16_t av1_coeff_band_16x16[256];
+extern const int8_t av1_coeff_band_16x16[256];
 
-extern const int16_t av1_coeff_band_32x32[1024];
+extern const int8_t av1_coeff_band_32x32[1024];
+
+extern const int8_t *av1_nz_map_ctx_offset[TX_SIZES_ALL];
 
 typedef struct txb_ctx {
   int txb_skip_ctx;
   int dc_sign_ctx;
 } TXB_CTX;
 
-static INLINE TX_SIZE get_txsize_context(TX_SIZE tx_size) {
-  return txsize_sqr_up_map[tx_size];
-}
+static const int base_level_count_to_index[13] = {
+  0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
+};
 
-static int base_ref_offset[BASE_CONTEXT_POSITION_NUM][2] = {
+// Note: TX_PAD_2D is dependent to this offset table.
+static const int base_ref_offset[BASE_CONTEXT_POSITION_NUM][2] = {
   /* clang-format off*/
   { -2, 0 }, { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 0, 1 },
   { 0, 2 },  { 1, -1 },  { 1, 0 },  { 1, 1 },  { 2, 0 }
   /* clang-format on*/
 };
 
-static INLINE int get_level_count(const tran_low_t *tcoeffs, int bwl,
-                                  int height, int row, int col, int level,
-                                  int (*nb_offset)[2], int nb_num) {
-  int count = 0;
-  for (int idx = 0; idx < nb_num; ++idx) {
-    const int ref_row = row + nb_offset[idx][0];
-    const int ref_col = col + nb_offset[idx][1];
-    if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
-        ref_col >= (1 << bwl))
-      continue;
-    const int pos = (ref_row << bwl) + ref_col;
-    tran_low_t abs_coeff = abs(tcoeffs[pos]);
-    count += abs_coeff > level;
+#define CONTEXT_MAG_POSITION_NUM 3
+static const int mag_ref_offset_with_txclass[3][CONTEXT_MAG_POSITION_NUM][2] = {
+  { { 0, 1 }, { 1, 0 }, { 1, 1 } },
+  { { 0, 1 }, { 1, 0 }, { 0, 2 } },
+  { { 0, 1 }, { 1, 0 }, { 2, 0 } }
+};
+static const int mag_ref_offset[CONTEXT_MAG_POSITION_NUM][2] = {
+  { 0, 1 }, { 1, 0 }, { 1, 1 }
+};
+
+static const TX_CLASS tx_type_to_class[TX_TYPES] = {
+  TX_CLASS_2D,     // DCT_DCT
+  TX_CLASS_2D,     // ADST_DCT
+  TX_CLASS_2D,     // DCT_ADST
+  TX_CLASS_2D,     // ADST_ADST
+  TX_CLASS_2D,     // FLIPADST_DCT
+  TX_CLASS_2D,     // DCT_FLIPADST
+  TX_CLASS_2D,     // FLIPADST_FLIPADST
+  TX_CLASS_2D,     // ADST_FLIPADST
+  TX_CLASS_2D,     // FLIPADST_ADST
+  TX_CLASS_2D,     // IDTX
+  TX_CLASS_VERT,   // V_DCT
+  TX_CLASS_HORIZ,  // H_DCT
+  TX_CLASS_VERT,   // V_ADST
+  TX_CLASS_HORIZ,  // H_ADST
+  TX_CLASS_VERT,   // V_FLIPADST
+  TX_CLASS_HORIZ,  // H_FLIPADST
+};
+
+static const int8_t eob_to_pos_small[33] = {
+  0, 1, 2,                                        // 0-2
+  3, 3,                                           // 3-4
+  4, 4, 4, 4,                                     // 5-8
+  5, 5, 5, 5, 5, 5, 5, 5,                         // 9-16
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6  // 17-32
+};
+
+static const int8_t eob_to_pos_large[17] = {
+  6,                               // place holder
+  7,                               // 33-64
+  8,  8,                           // 65-128
+  9,  9,  9,  9,                   // 129-256
+  10, 10, 10, 10, 10, 10, 10, 10,  // 257-512
+  11                               // 513-
+};
+
+static INLINE int get_eob_pos_token(const int eob, int *const extra) {
+  int t;
+
+  if (eob < 33) {
+    t = eob_to_pos_small[eob];
+  } else {
+    const int e = AOMMIN((eob - 1) >> 5, 16);
+    t = eob_to_pos_large[e];
   }
-  return count;
+
+  *extra = eob - k_eob_group_start[t];
+
+  return t;
 }
 
-static INLINE void get_mag(int *mag, const tran_low_t *tcoeffs, int bwl,
-                           int height, int row, int col, int (*nb_offset)[2],
-                           int nb_num) {
-  mag[0] = 0;
-  mag[1] = 0;
-  for (int idx = 0; idx < nb_num; ++idx) {
-    const int ref_row = row + nb_offset[idx][0];
-    const int ref_col = col + nb_offset[idx][1];
-    if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
-        ref_col >= (1 << bwl))
-      continue;
-    const int pos = (ref_row << bwl) + ref_col;
-    tran_low_t abs_coeff = abs(tcoeffs[pos]);
-    if (nb_offset[idx][0] >= 0 && nb_offset[idx][1] >= 0) {
-      if (abs_coeff > mag[0]) {
-        mag[0] = abs_coeff;
-        mag[1] = 1;
-      } else if (abs_coeff == mag[0]) {
-        ++mag[1];
-      }
-    }
-  }
+static INLINE int av1_get_eob_pos_ctx(const TX_TYPE tx_type,
+                                      const int eob_token) {
+  static const int8_t tx_type_to_offset[TX_TYPES] = {
+    -1,  // DCT_DCT
+    -1,  // ADST_DCT
+    -1,  // DCT_ADST
+    -1,  // ADST_ADST
+    -1,  // FLIPADST_DCT
+    -1,  // DCT_FLIPADST
+    -1,  // FLIPADST_FLIPADST
+    -1,  // ADST_FLIPADST
+    -1,  // FLIPADST_ADST
+    -1,  // IDTX
+    10,  // V_DCT
+    10,  // H_DCT
+    10,  // V_ADST
+    10,  // H_ADST
+    10,  // V_FLIPADST
+    10,  // H_FLIPADST
+  };
+  return eob_token + tx_type_to_offset[tx_type];
+}
+
+static INLINE int get_txb_bwl(TX_SIZE tx_size) {
+  tx_size = av1_get_adjusted_tx_size(tx_size);
+  return tx_size_wide_log2[tx_size];
+}
+
+static INLINE int get_txb_wide(TX_SIZE tx_size) {
+  tx_size = av1_get_adjusted_tx_size(tx_size);
+  return tx_size_wide[tx_size];
+}
+
+static INLINE int get_txb_high(TX_SIZE tx_size) {
+  tx_size = av1_get_adjusted_tx_size(tx_size);
+  return tx_size_high[tx_size];
 }
 
 static INLINE void get_base_count_mag(int *mag, int *count,
@@ -110,67 +171,124 @@ static INLINE void get_base_count_mag(int *mag, int *count,
   }
 }
 
-static INLINE int get_level_count_mag(int *mag, const tran_low_t *tcoeffs,
-                                      int bwl, int height, int row, int col,
-                                      int level, int (*nb_offset)[2],
-                                      int nb_num) {
-  const int stride = 1 << bwl;
+static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) {
+  return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR);
+}
+
+static INLINE int get_padded_idx(const int idx, const int bwl) {
+  return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2);
+}
+
+static INLINE int get_level_count(const uint8_t *const levels, const int stride,
+                                  const int row, const int col, const int level,
+                                  const int (*nb_offset)[2], const int nb_num) {
   int count = 0;
-  *mag = 0;
+
   for (int idx = 0; idx < nb_num; ++idx) {
     const int ref_row = row + nb_offset[idx][0];
     const int ref_col = col + nb_offset[idx][1];
-    if (ref_row < 0 || ref_col < 0 || ref_row >= height || ref_col >= stride)
-      continue;
-    const int pos = (ref_row << bwl) + ref_col;
-    tran_low_t abs_coeff = abs(tcoeffs[pos]);
-    count += abs_coeff > level;
-    if (nb_offset[idx][0] >= 0 && nb_offset[idx][1] >= 0)
-      *mag = AOMMAX(*mag, abs_coeff);
+    const int pos = ref_row * stride + ref_col;
+    count += levels[pos] > level;
   }
   return count;
 }
 
+static INLINE void get_level_mag(const uint8_t *const levels, const int stride,
+                                 const int row, const int col, int *const mag) {
+  for (int idx = 0; idx < CONTEXT_MAG_POSITION_NUM; ++idx) {
+    const int ref_row = row + mag_ref_offset[idx][0];
+    const int ref_col = col + mag_ref_offset[idx][1];
+    const int pos = ref_row * stride + ref_col;
+    mag[idx] = levels[pos];
+  }
+}
+
 static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
                                               int sig_mag) {
-  const int ctx = (count + 1) >> 1;
+  const int ctx = base_level_count_to_index[count];
   int ctx_idx = -1;
+
   if (row == 0 && col == 0) {
-    ctx_idx = (ctx << 1) + sig_mag;
-    // TODO(angiebird): turn this on once the optimization is finalized
-    // assert(ctx_idx < 8);
+    if (sig_mag >= 2) return ctx_idx = 0;
+    if (sig_mag == 1) {
+      if (count >= 2)
+        ctx_idx = 1;
+      else
+        ctx_idx = 2;
+
+      return ctx_idx;
+    }
+
+    ctx_idx = 3 + ctx;
+    assert(ctx_idx <= 6);
+    return ctx_idx;
   } else if (row == 0) {
-    ctx_idx = 8 + (ctx << 1) + sig_mag;
-    // TODO(angiebird): turn this on once the optimization is finalized
-    // assert(ctx_idx < 18);
+    if (sig_mag >= 2) return ctx_idx = 6;
+    if (sig_mag == 1) {
+      if (count >= 2)
+        ctx_idx = 7;
+      else
+        ctx_idx = 8;
+      return ctx_idx;
+    }
+
+    ctx_idx = 9 + ctx;
+    assert(ctx_idx <= 11);
+    return ctx_idx;
   } else if (col == 0) {
-    ctx_idx = 8 + 10 + (ctx << 1) + sig_mag;
+    if (sig_mag >= 2) return ctx_idx = 12;
+    if (sig_mag == 1) {
+      if (count >= 2)
+        ctx_idx = 13;
+      else
+        ctx_idx = 14;
+
+      return ctx_idx;
+    }
+
+    ctx_idx = 15 + ctx;
+    assert(ctx_idx <= 17);
     // TODO(angiebird): turn this on once the optimization is finalized
     // assert(ctx_idx < 28);
   } else {
-    ctx_idx = 8 + 10 + 10 + (ctx << 1) + sig_mag;
-    assert(ctx_idx < COEFF_BASE_CONTEXTS);
+    if (sig_mag >= 2) return ctx_idx = 18;
+    if (sig_mag == 1) {
+      if (count >= 2)
+        ctx_idx = 19;
+      else
+        ctx_idx = 20;
+      return ctx_idx;
+    }
+
+    ctx_idx = 21 + ctx;
+
+    assert(ctx_idx <= 24);
   }
   return ctx_idx;
 }
 
-static INLINE int get_base_ctx(const tran_low_t *tcoeffs,
-                               int c,  // raster order
-                               const int bwl, const int height,
-                               const int level) {
+static INLINE int get_base_ctx(const uint8_t *const levels,
+                               const int c,  // raster order
+                               const int bwl, const int level_minus_1,
+                               const int count) {
   const int row = c >> bwl;
   const int col = c - (row << bwl);
-  const int level_minus_1 = level - 1;
-  int mag;
-  int count =
-      get_level_count_mag(&mag, tcoeffs, bwl, height, row, col, level_minus_1,
-                          base_ref_offset, BASE_CONTEXT_POSITION_NUM);
-  int ctx_idx = get_base_ctx_from_count_mag(row, col, count, mag > level);
+  const int stride = (1 << bwl) + TX_PAD_HOR;
+  int mag_count = 0;
+  int nb_mag[3] = { 0 };
+
+  get_level_mag(levels, stride, row, col, nb_mag);
+
+  for (int idx = 0; idx < 3; ++idx)
+    mag_count += nb_mag[idx] > (level_minus_1 + 1);
+  const int ctx_idx =
+      get_base_ctx_from_count_mag(row, col, count, AOMMIN(2, mag_count));
   return ctx_idx;
 }
 
 #define BR_CONTEXT_POSITION_NUM 8  // Base range coefficient context
-static int br_ref_offset[BR_CONTEXT_POSITION_NUM][2] = {
+// Note: TX_PAD_2D is dependent to this offset table.
+static const int br_ref_offset[BR_CONTEXT_POSITION_NUM][2] = {
   /* clang-format off*/
   { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -1 },
   { 0, 1 },   { 1, -1 }, { 1, 0 },  { 1, 1 },
@@ -181,18 +299,8 @@ static const int br_level_map[9] = {
   0, 0, 1, 1, 2, 2, 3, 3, 3,
 };
 
-static const int coeff_to_br_index[COEFF_BASE_RANGE] = {
-  0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
-};
-
-static const int br_index_to_coeff[BASE_RANGE_SETS] = {
-  0, 2, 6,
-};
-
-static const int br_extra_bits[BASE_RANGE_SETS] = {
-  1, 2, 3,
-};
-
+// Note: If BR_MAG_OFFSET changes, the calculation of offset in
+// get_br_ctx_from_count_mag() must be updated.
 #define BR_MAG_OFFSET 1
 // TODO(angiebird): optimize this function by using a table to map from
 // count/mag to ctx
@@ -223,369 +331,356 @@ static INLINE int get_br_count_mag(int *mag, const tran_low_t *tcoeffs, int bwl,
   return count;
 }
 
-static INLINE int get_br_ctx_from_count_mag(int row, int col, int count,
-                                            int mag) {
-  int offset = 0;
-  if (mag <= BR_MAG_OFFSET)
-    offset = 0;
-  else if (mag <= 3)
-    offset = 1;
-  else if (mag <= 5)
-    offset = 2;
-  else
-    offset = 3;
-
-  int ctx = br_level_map[count];
-  ctx += offset * BR_TMP_OFFSET;
-
+static INLINE int get_br_ctx_from_count_mag(const int row, const int col,
+                                            const int count, const int mag) {
   // DC: 0 - 1
-  if (row == 0 && col == 0) return ctx;
-
   // Top row: 2 - 4
-  if (row == 0) return 2 + ctx;
-
   // Left column: 5 - 7
-  if (col == 0) return 5 + ctx;
-
   // others: 8 - 11
-  return 8 + ctx;
+  static const int offset_pos[2][2] = { { 8, 5 }, { 2, 0 } };
+  const int mag_clamp = AOMMIN(mag, 6);
+  const int offset = mag_clamp >> 1;
+  const int ctx =
+      br_level_map[count] + offset * BR_TMP_OFFSET + offset_pos[!row][!col];
+  return ctx;
 }
 
-static INLINE int get_br_ctx(const tran_low_t *tcoeffs,
-                             const int c,  // raster order
-                             const int bwl, const int height) {
+static INLINE int get_br_ctx_2d(const uint8_t *const levels,
+                                const int c,  // raster order
+                                const int bwl) {
+  assert(c > 0);
   const int row = c >> bwl;
   const int col = c - (row << bwl);
-  const int level_minus_1 = NUM_BASE_LEVELS;
-  int mag;
-  const int count =
-      get_level_count_mag(&mag, tcoeffs, bwl, height, row, col, level_minus_1,
-                          br_ref_offset, BR_CONTEXT_POSITION_NUM);
-  const int ctx = get_br_ctx_from_count_mag(row, col, count, mag);
-  return ctx;
+  const int stride = (1 << bwl) + TX_PAD_HOR;
+  const int pos = row * stride + col;
+  int mag = AOMMIN(levels[pos + 1], MAX_BASE_BR_RANGE) +
+            AOMMIN(levels[pos + stride], MAX_BASE_BR_RANGE) +
+            AOMMIN(levels[pos + 1 + stride], MAX_BASE_BR_RANGE);
+  mag = AOMMIN((mag + 1) >> 1, 6);
+  //((row | col) < 2) is equivalent to ((row < 2) && (col < 2))
+  if ((row | col) < 2) return mag + 7;
+  return mag + 14;
 }
 
-#define SIG_REF_OFFSET_NUM 7
-static int sig_ref_offset[SIG_REF_OFFSET_NUM][2] = {
-  { -2, -1 }, { -2, 0 }, { -1, -2 }, { -1, -1 },
-  { -1, 0 },  { 0, -2 }, { 0, -1 },
-};
-
-#if REDUCE_CONTEXT_DEPENDENCY
-static INLINE int get_nz_count(const tran_low_t *tcoeffs, int bwl, int height,
-                               int row, int col, int prev_row, int prev_col) {
-  int count = 0;
-  for (int idx = 0; idx < SIG_REF_OFFSET_NUM; ++idx) {
-    const int ref_row = row + sig_ref_offset[idx][0];
-    const int ref_col = col + sig_ref_offset[idx][1];
-    if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
-        ref_col >= (1 << bwl) || (prev_row == ref_row && prev_col == ref_col))
-      continue;
-    const int nb_pos = (ref_row << bwl) + ref_col;
-    count += (tcoeffs[nb_pos] != 0);
-  }
-  return count;
-}
-#else
-static INLINE int get_nz_count(const tran_low_t *tcoeffs, int bwl, int height,
-                               int row, int col) {
-  int count = 0;
-  for (int idx = 0; idx < SIG_REF_OFFSET_NUM; ++idx) {
-    const int ref_row = row + sig_ref_offset[idx][0];
-    const int ref_col = col + sig_ref_offset[idx][1];
-    if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
-        ref_col >= (1 << bwl))
-      continue;
-    const int nb_pos = (ref_row << bwl) + ref_col;
-    count += (tcoeffs[nb_pos] != 0);
-  }
-  return count;
-}
-#endif
-
-static INLINE TX_CLASS get_tx_class(TX_TYPE tx_type) {
-  switch (tx_type) {
-#if CONFIG_EXT_TX
-    case V_DCT:
-    case V_ADST:
-    case V_FLIPADST: return TX_CLASS_VERT;
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST: return TX_CLASS_HORIZ;
-#endif
-    default: return TX_CLASS_2D;
+static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels,
+                                       const int c,  // raster order
+                                       const int bwl, const TX_CLASS tx_class) {
+  const int row = c >> bwl;
+  const int col = c - (row << bwl);
+  const int stride = (1 << bwl) + TX_PAD_HOR;
+  const int pos = row * stride + col;
+  int mag = levels[pos + 1];
+  mag += levels[pos + stride];
+  switch (tx_class) {
+    case TX_CLASS_2D:
+      mag += levels[pos + stride + 1];
+      mag = AOMMIN((mag + 1) >> 1, 6);
+      if (c == 0) return mag;
+      if ((row < 2) && (col < 2)) return mag + 7;
+      break;
+    case TX_CLASS_HORIZ:
+      mag += levels[pos + 2];
+      mag = AOMMIN((mag + 1) >> 1, 6);
+      if (c == 0) return mag;
+      if (col == 0) return mag + 7;
+      break;
+    case TX_CLASS_VERT:
+      mag += levels[pos + (stride << 1)];
+      mag = AOMMIN((mag + 1) >> 1, 6);
+      if (c == 0) return mag;
+      if (row == 0) return mag + 7;
+      break;
+    default: break;
   }
-}
 
-// TODO(angiebird): optimize this function by generate a table that maps from
-// count to ctx
-static INLINE int get_nz_map_ctx_from_count(int count,
-                                            int coeff_idx,  // raster order
-                                            int bwl, TX_TYPE tx_type) {
-  (void)tx_type;
-  const int row = coeff_idx >> bwl;
-  const int col = coeff_idx - (row << bwl);
-  int ctx = 0;
-#if CONFIG_EXT_TX
-  int tx_class = get_tx_class(tx_type);
-  int offset;
-  if (tx_class == TX_CLASS_2D)
-    offset = 0;
-  else if (tx_class == TX_CLASS_VERT)
-    offset = SIG_COEF_CONTEXTS_2D;
-  else
-    offset = SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D;
-#else
-  int offset = 0;
-#endif
-
-  if (row == 0 && col == 0) return offset + 0;
-
-  if (row == 0 && col == 1) return offset + 1 + count;
-
-  if (row == 1 && col == 0) return offset + 3 + count;
-
-  if (row == 1 && col == 1) {
-    ctx = (count + 1) >> 1;
-
-    assert(5 + ctx <= 7);
-
-    return offset + 5 + ctx;
-  }
+  return mag + 14;
+}
 
-  if (row == 0) {
-    ctx = (count + 1) >> 1;
+#define SIG_REF_OFFSET_NUM 5
 
-    assert(ctx < 2);
-    return offset + 8 + ctx;
-  }
+// Note: TX_PAD_2D is dependent to these offset tables.
+static const int sig_ref_offset[SIG_REF_OFFSET_NUM][2] = {
+  { 0, 1 }, { 1, 0 }, { 1, 1 }, { 0, 2 }, { 2, 0 }
+  // , { 1, 2 }, { 2, 1 },
+};
 
-  if (col == 0) {
-    ctx = (count + 1) >> 1;
+static const int sig_ref_offset_vert[SIG_REF_OFFSET_NUM][2] = {
+  { 1, 0 }, { 2, 0 }, { 0, 1 }, { 3, 0 }, { 4, 0 }
+  // , { 1, 1 }, { 2, 1 },
+};
 
-    assert(ctx < 2);
-    return offset + 10 + ctx;
-  }
+static const int sig_ref_offset_horiz[SIG_REF_OFFSET_NUM][2] = {
+  { 0, 1 }, { 0, 2 }, { 1, 0 }, { 0, 3 }, { 0, 4 }
+  // , { 1, 1 }, { 1, 2 },
+};
 
-  ctx = count >> 1;
+#define SIG_REF_DIFF_OFFSET_NUM 3
 
-  assert(12 + ctx < 16);
+static const int sig_ref_diff_offset[SIG_REF_DIFF_OFFSET_NUM][2] = {
+  { 1, 1 }, { 0, 2 }, { 2, 0 }
+};
 
-  return offset + 12 + ctx;
-}
+static const int sig_ref_diff_offset_vert[SIG_REF_DIFF_OFFSET_NUM][2] = {
+  { 2, 0 }, { 3, 0 }, { 4, 0 }
+};
 
-static INLINE int get_nz_map_ctx(const tran_low_t *tcoeffs, const int scan_idx,
-                                 const int16_t *scan, const int bwl,
-                                 const int height, TX_TYPE tx_type) {
-  const int coeff_idx = scan[scan_idx];
-  const int row = coeff_idx >> bwl;
-  const int col = coeff_idx - (row << bwl);
-#if REDUCE_CONTEXT_DEPENDENCY
-  int prev_coeff_idx;
-  int prev_row;
-  int prev_col;
-  if (scan_idx > MIN_SCAN_IDX_REDUCE_CONTEXT_DEPENDENCY) {
-    prev_coeff_idx = scan[scan_idx - 1];  // raster order
-    prev_row = prev_coeff_idx >> bwl;
-    prev_col = prev_coeff_idx - (prev_row << bwl);
-  } else {
-    prev_coeff_idx = -1;
-    prev_row = -1;
-    prev_col = -1;
-  }
-  int count = get_nz_count(tcoeffs, bwl, height, row, col, prev_row, prev_col);
-#else
-  int count = get_nz_count(tcoeffs, bwl, height, row, col);
-#endif
-  return get_nz_map_ctx_from_count(count, coeff_idx, bwl, tx_type);
-}
+static const int sig_ref_diff_offset_horiz[SIG_REF_DIFF_OFFSET_NUM][2] = {
+  { 0, 2 }, { 0, 3 }, { 0, 4 }
+};
 
-static INLINE int get_eob_ctx(const tran_low_t *tcoeffs,
-                              const int coeff_idx,  // raster order
-                              const TX_SIZE txs_ctx, TX_TYPE tx_type) {
-  (void)tcoeffs;
-  int offset = 0;
-#if CONFIG_CTX1D
-  TX_CLASS tx_class = get_tx_class(tx_type);
-  if (tx_class == TX_CLASS_VERT)
-    offset = EOB_COEF_CONTEXTS_2D;
-  else if (tx_class == TX_CLASS_HORIZ)
-    offset = EOB_COEF_CONTEXTS_2D + EOB_COEF_CONTEXTS_1D;
-#else
-  (void)tx_type;
-#endif
-
-  if (txs_ctx == TX_4X4) return offset + av1_coeff_band_4x4[coeff_idx];
-  if (txs_ctx == TX_8X8) return offset + av1_coeff_band_8x8[coeff_idx];
-  if (txs_ctx == TX_16X16) return offset + av1_coeff_band_16x16[coeff_idx];
-  if (txs_ctx == TX_32X32) return offset + av1_coeff_band_32x32[coeff_idx];
-
-  assert(0);
-  return 0;
-}
+static const uint8_t clip_max3[256] = {
+  0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+};
 
-static INLINE void set_dc_sign(int *cul_level, tran_low_t v) {
-  if (v < 0)
-    *cul_level |= 1 << COEFF_CONTEXT_BITS;
-  else if (v > 0)
-    *cul_level += 2 << COEFF_CONTEXT_BITS;
-}
+static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels,
+                                       const int bwl, const TX_CLASS tx_class) {
+  int mag;
 
-static INLINE int get_dc_sign_ctx(int dc_sign) {
-  int dc_sign_ctx = 0;
-  if (dc_sign < 0)
-    dc_sign_ctx = 1;
-  else if (dc_sign > 0)
-    dc_sign_ctx = 2;
+  // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
+  mag = clip_max3[levels[1]];                         // { 0, 1 }
+  mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR]];  // { 1, 0 }
+
+  if (tx_class == TX_CLASS_2D) {
+    mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR + 1]];          // { 1, 1 }
+    mag += clip_max3[levels[2]];                                    // { 0, 2 }
+    mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]];  // { 2, 0 }
+  } else if (tx_class == TX_CLASS_VERT) {
+    mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]];  // { 2, 0 }
+    mag += clip_max3[levels[(3 << bwl) + (3 << TX_PAD_HOR_LOG2)]];  // { 3, 0 }
+    mag += clip_max3[levels[(4 << bwl) + (4 << TX_PAD_HOR_LOG2)]];  // { 4, 0 }
+  } else {
+    mag += clip_max3[levels[2]];  // { 0, 2 }
+    mag += clip_max3[levels[3]];  // { 0, 3 }
+    mag += clip_max3[levels[4]];  // { 0, 4 }
+  }
 
-  return dc_sign_ctx;
+  return mag;
 }
 
-static INLINE void get_txb_ctx(BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                               int plane, const ENTROPY_CONTEXT *a,
-                               const ENTROPY_CONTEXT *l, TXB_CTX *txb_ctx) {
-  const int txb_w_unit = tx_size_wide_unit[tx_size];
-  const int txb_h_unit = tx_size_high_unit[tx_size];
-  int ctx_offset = (plane == 0) ? 0 : 7;
-
-  if (plane_bsize > txsize_to_bsize[tx_size]) ctx_offset += 3;
-
-  int dc_sign = 0;
-  for (int k = 0; k < txb_w_unit; ++k) {
-    int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS;
-    if (sign == 1)
-      --dc_sign;
-    else if (sign == 2)
-      ++dc_sign;
-    else if (sign != 0)
-      assert(0);
-  }
-
-  for (int k = 0; k < txb_h_unit; ++k) {
-    int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS;
-    if (sign == 1)
-      --dc_sign;
-    else if (sign == 2)
-      ++dc_sign;
-    else if (sign != 0)
-      assert(0);
+static INLINE int get_nz_count(const uint8_t *const levels, const int bwl,
+                               const TX_CLASS tx_class) {
+  int count;
+
+  count = (levels[1] != 0);                         // { 0, 1 }
+  count += (levels[(1 << bwl) + TX_PAD_HOR] != 0);  // { 1, 0 }
+
+  for (int idx = 0; idx < SIG_REF_DIFF_OFFSET_NUM; ++idx) {
+    const int row_offset =
+        ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][0]
+                                   : ((tx_class == TX_CLASS_VERT)
+                                          ? sig_ref_diff_offset_vert[idx][0]
+                                          : sig_ref_diff_offset_horiz[idx][0]));
+    const int col_offset =
+        ((tx_class == TX_CLASS_2D) ? sig_ref_diff_offset[idx][1]
+                                   : ((tx_class == TX_CLASS_VERT)
+                                          ? sig_ref_diff_offset_vert[idx][1]
+                                          : sig_ref_diff_offset_horiz[idx][1]));
+    const int nb_pos =
+        (row_offset << bwl) + (row_offset << TX_PAD_HOR_LOG2) + col_offset;
+    count += (levels[nb_pos] != 0);
   }
+  return count;
+}
 
-  txb_ctx->dc_sign_ctx = get_dc_sign_ctx(dc_sign);
-
-  if (plane == 0) {
-    int top = 0;
-    int left = 0;
+#define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D
+#define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5)
+#define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10)
+
+static const int nz_map_ctx_offset_1d[32] = {
+  NZ_MAP_CTX_0,  NZ_MAP_CTX_5,  NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+  NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+  NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+  NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+  NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+  NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+  NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+};
 
-    for (int k = 0; k < txb_w_unit; ++k) {
-      top = AOMMAX(top, ((uint8_t)a[k] & COEFF_CONTEXT_MASK));
+static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats(
+    const int stats,
+    const int coeff_idx,  // raster order
+    const int bwl, const TX_SIZE tx_size, const TX_CLASS tx_class) {
+  // tx_class == 0(TX_CLASS_2D)
+  if ((tx_class | coeff_idx) == 0) return 0;
+  int ctx = (stats + 1) >> 1;
+  ctx = AOMMIN(ctx, 4);
+  switch (tx_class) {
+    case TX_CLASS_2D: {
+      // This is the algorithm to generate av1_nz_map_ctx_offset[][]
+      //   const int width = tx_size_wide[tx_size];
+      //   const int height = tx_size_high[tx_size];
+      //   if (width < height) {
+      //     if (row < 2) return 11 + ctx;
+      //   } else if (width > height) {
+      //     if (col < 2) return 16 + ctx;
+      //   }
+      //   if (row + col < 2) return ctx + 1;
+      //   if (row + col < 4) return 5 + ctx + 1;
+      //   return 21 + ctx;
+      return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx];
     }
-
-    for (int k = 0; k < txb_h_unit; ++k) {
-      left = AOMMAX(left, ((uint8_t)l[k] & COEFF_CONTEXT_MASK));
+    case TX_CLASS_HORIZ: {
+      const int row = coeff_idx >> bwl;
+      const int col = coeff_idx - (row << bwl);
+      return ctx + nz_map_ctx_offset_1d[col];
+      break;
     }
-
-    top = AOMMIN(top, 255);
-    left = AOMMIN(left, 255);
-
-    if (plane_bsize == txsize_to_bsize[tx_size])
-      txb_ctx->txb_skip_ctx = 0;
-    else if (top == 0 && left == 0)
-      txb_ctx->txb_skip_ctx = 1;
-    else if (top == 0 || left == 0)
-      txb_ctx->txb_skip_ctx = 2 + (AOMMAX(top, left) > 3);
-    else if (AOMMAX(top, left) <= 3)
-      txb_ctx->txb_skip_ctx = 4;
-    else if (AOMMIN(top, left) <= 3)
-      txb_ctx->txb_skip_ctx = 5;
-    else
-      txb_ctx->txb_skip_ctx = 6;
-  } else {
-    int ctx_base = get_entropy_context(tx_size, a, l);
-    txb_ctx->txb_skip_ctx = ctx_offset + ctx_base;
+    case TX_CLASS_VERT: {
+      const int row = coeff_idx >> bwl;
+      return ctx + nz_map_ctx_offset_1d[row];
+      break;
+    }
+    default: break;
   }
+  return 0;
 }
 
-#if LV_MAP_PROB
-void av1_init_txb_probs(FRAME_CONTEXT *fc);
-#endif  // LV_MAP_PROB
+typedef aom_cdf_prob (*base_cdf_arr)[CDF_SIZE(4)];
+typedef aom_cdf_prob (*br_cdf_arr)[CDF_SIZE(BR_CDF_SIZE)];
 
-void av1_adapt_txb_probs(AV1_COMMON *cm, unsigned int count_sat,
-                         unsigned int update_factor);
+static INLINE int get_lower_levels_ctx_eob(int bwl, int height, int scan_idx) {
+  if (scan_idx == 0) return 0;
+  if (scan_idx <= (height << bwl) / 8) return 1;
+  if (scan_idx <= (height << bwl) / 4) return 2;
+  return 3;
+}
 
-void av1_init_lv_map(AV1_COMMON *cm);
+static INLINE int get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx,
+                                          int bwl, TX_SIZE tx_size) {
+  assert(coeff_idx > 0);
+  int mag;
+  // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
+  levels = levels + get_padded_idx(coeff_idx, bwl);
+  mag = AOMMIN(levels[1], 3);                                     // { 0, 1 }
+  mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR], 3);              // { 1, 0 }
+  mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR + 1], 3);          // { 1, 1 }
+  mag += AOMMIN(levels[2], 3);                                    // { 0, 2 }
+  mag += AOMMIN(levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)], 3);  // { 2, 0 }
+
+  const int ctx = AOMMIN((mag + 1) >> 1, 4);
+  return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx];
+}
+static AOM_FORCE_INLINE int get_lower_levels_ctx(const uint8_t *levels,
+                                                 int coeff_idx, int bwl,
+                                                 TX_SIZE tx_size,
+                                                 TX_CLASS tx_class) {
+  const int stats =
+      get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class);
+  return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class);
+}
 
-#if CONFIG_CTX1D
-static INLINE void get_eob_vert(int16_t *eob_ls, const tran_low_t *tcoeff,
-                                int w, int h) {
-  for (int c = 0; c < w; ++c) {
-    eob_ls[c] = 0;
-    for (int r = h - 1; r >= 0; --r) {
-      int coeff_idx = r * w + c;
-      if (tcoeff[coeff_idx] != 0) {
-        eob_ls[c] = r + 1;
-        break;
-      }
-    }
+static INLINE int get_lower_levels_ctx_general(int is_last, int scan_idx,
+                                               int bwl, int height,
+                                               const uint8_t *levels,
+                                               int coeff_idx, TX_SIZE tx_size,
+                                               TX_CLASS tx_class) {
+  if (is_last) {
+    if (scan_idx == 0) return 0;
+    if (scan_idx <= (height << bwl) >> 3) return 1;
+    if (scan_idx <= (height << bwl) >> 2) return 2;
+    return 3;
   }
+  return get_lower_levels_ctx(levels, coeff_idx, bwl, tx_size, tx_class);
 }
 
-static INLINE void get_eob_horiz(int16_t *eob_ls, const tran_low_t *tcoeff,
-                                 int w, int h) {
-  for (int r = 0; r < h; ++r) {
-    eob_ls[r] = 0;
-    for (int c = w - 1; c >= 0; --c) {
-      int coeff_idx = r * w + c;
-      if (tcoeff[coeff_idx] != 0) {
-        eob_ls[r] = c + 1;
-        break;
-      }
-    }
-  }
+static INLINE void set_dc_sign(int *cul_level, int dc_val) {
+  if (dc_val < 0)
+    *cul_level |= 1 << COEFF_CONTEXT_BITS;
+  else if (dc_val > 0)
+    *cul_level += 2 << COEFF_CONTEXT_BITS;
 }
 
-static INLINE int get_empty_line_ctx(int line_idx, int16_t *eob_ls) {
-  if (line_idx > 0) {
-    int prev_eob = eob_ls[line_idx - 1];
-    if (prev_eob == 0) {
-      return 1;
-    } else if (prev_eob < 3) {
-      return 2;
-    } else if (prev_eob < 6) {
-      return 3;
+static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
+                               const TX_SIZE tx_size, const int plane,
+                               const ENTROPY_CONTEXT *const a,
+                               const ENTROPY_CONTEXT *const l,
+                               TXB_CTX *const txb_ctx) {
+#define MAX_TX_SIZE_UNIT 16
+  static const int8_t signs[3] = { 0, -1, 1 };
+  static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+  };
+  const int txb_w_unit = tx_size_wide_unit[tx_size];
+  const int txb_h_unit = tx_size_high_unit[tx_size];
+  int dc_sign = 0;
+  int k = 0;
+
+  do {
+    const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS;
+    assert(sign <= 2);
+    dc_sign += signs[sign];
+  } while (++k < txb_w_unit);
+
+  k = 0;
+  do {
+    const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS;
+    assert(sign <= 2);
+    dc_sign += signs[sign];
+  } while (++k < txb_h_unit);
+
+  txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT];
+
+  if (plane == 0) {
+    if (plane_bsize == txsize_to_bsize[tx_size]) {
+      txb_ctx->txb_skip_ctx = 0;
     } else {
-      return 4;
+      // This is the algorithm to generate table skip_contexts[min][max].
+      //    if (!max)
+      //      txb_skip_ctx = 1;
+      //    else if (!min)
+      //      txb_skip_ctx = 2 + (max > 3);
+      //    else if (max <= 3)
+      //      txb_skip_ctx = 4;
+      //    else if (min <= 3)
+      //      txb_skip_ctx = 5;
+      //    else
+      //      txb_skip_ctx = 6;
+      static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },
+                                                   { 1, 4, 4, 4, 5 },
+                                                   { 1, 4, 4, 4, 5 },
+                                                   { 1, 4, 4, 4, 5 },
+                                                   { 1, 4, 4, 4, 6 } };
+      int top = 0;
+      int left = 0;
+
+      k = 0;
+      do {
+        top |= a[k];
+      } while (++k < txb_w_unit);
+      top &= COEFF_CONTEXT_MASK;
+
+      k = 0;
+      do {
+        left |= l[k];
+      } while (++k < txb_h_unit);
+      left &= COEFF_CONTEXT_MASK;
+      const int max = AOMMIN(top | left, 4);
+      const int min = AOMMIN(AOMMIN(top, left), 4);
+
+      txb_ctx->txb_skip_ctx = skip_contexts[min][max];
     }
   } else {
-    return 0;
+    const int ctx_base = get_entropy_context(tx_size, a, l);
+    const int ctx_offset = (num_pels_log2_lookup[plane_bsize] >
+                            num_pels_log2_lookup[txsize_to_bsize[tx_size]])
+                               ? 10
+                               : 7;
+    txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;
   }
+#undef MAX_TX_SIZE_UNIT
 }
 
-#define MAX_POS_CTX 8
-static int pos_ctx[MAX_HVTX_SIZE] = {
-  0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
-  6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-static INLINE int get_hv_eob_ctx(int line_idx, int pos, int16_t *eob_ls) {
-  if (line_idx > 0) {
-    int prev_eob = eob_ls[line_idx - 1];
-    int diff = pos + 1 - prev_eob;
-    int abs_diff = abs(diff);
-    int ctx_idx = pos_ctx[abs_diff];
-    assert(ctx_idx < MAX_POS_CTX);
-    if (diff < 0) {
-      ctx_idx += MAX_POS_CTX;
-      assert(ctx_idx >= MAX_POS_CTX);
-      assert(ctx_idx < 2 * MAX_POS_CTX);
-    }
-    return ctx_idx;
-  } else {
-    int ctx_idx = MAX_POS_CTX + MAX_POS_CTX + pos_ctx[pos];
-    assert(ctx_idx < HV_EOB_CONTEXTS);
-    assert(HV_EOB_CONTEXTS == MAX_POS_CTX * 3);
-    return ctx_idx;
-  }
-}
-#endif  // CONFIG_CTX1D
+void av1_init_lv_map(AV1_COMMON *cm);
 
 #endif  // AV1_COMMON_TXB_COMMON_H_
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
index 34374af69..ae6f07657 100644
--- a/third_party/aom/av1/common/warped_motion.c
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -15,7 +15,8 @@
 #include <math.h>
 #include <assert.h>
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "av1/common/warped_motion.h"
 #include "av1/common/scale.h"
 
@@ -91,78 +92,11 @@ static const int error_measure_lut[512] = {
 };
 /* clang-format on */
 
-static ProjectPointsFunc get_project_points_type(TransformationType type) {
-  switch (type) {
-    case VERTRAPEZOID: return project_points_vertrapezoid;
-    case HORTRAPEZOID: return project_points_hortrapezoid;
-    case HOMOGRAPHY: return project_points_homography;
-    case AFFINE: return project_points_affine;
-    case ROTZOOM: return project_points_rotzoom;
-    case TRANSLATION: return project_points_translation;
-    default: assert(0); return NULL;
-  }
-}
-
-void project_points_translation(const int32_t *mat, int *points, int *proj,
-                                const int n, const int stride_points,
-                                const int stride_proj, const int subsampling_x,
-                                const int subsampling_y) {
-  int i;
-  for (i = 0; i < n; ++i) {
-    const int x = *(points++), y = *(points++);
-    if (subsampling_x)
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((x * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[0]),
-          WARPEDDIFF_PREC_BITS + 1);
-    else
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((x * (1 << WARPEDMODEL_PREC_BITS)) + mat[0]), WARPEDDIFF_PREC_BITS);
-    if (subsampling_y)
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((y * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[1]),
-          WARPEDDIFF_PREC_BITS + 1);
-    else
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          ((y * (1 << WARPEDMODEL_PREC_BITS))) + mat[1], WARPEDDIFF_PREC_BITS);
-    points += stride_points - 2;
-    proj += stride_proj - 2;
-  }
-}
-
-void project_points_rotzoom(const int32_t *mat, int *points, int *proj,
-                            const int n, const int stride_points,
-                            const int stride_proj, const int subsampling_x,
-                            const int subsampling_y) {
-  int i;
-  for (i = 0; i < n; ++i) {
-    const int x = *(points++), y = *(points++);
-    if (subsampling_x)
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
-              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-          WARPEDDIFF_PREC_BITS + 1);
-    else
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
-                                            WARPEDDIFF_PREC_BITS);
-    if (subsampling_y)
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-          -mat[3] * 2 * x + mat[2] * 2 * y + mat[1] +
-              (-mat[3] + mat[2] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-          WARPEDDIFF_PREC_BITS + 1);
-    else
-      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(-mat[3] * x + mat[2] * y + mat[1],
-                                            WARPEDDIFF_PREC_BITS);
-    points += stride_points - 2;
-    proj += stride_proj - 2;
-  }
-}
-
 void project_points_affine(const int32_t *mat, int *points, int *proj,
                            const int n, const int stride_points,
                            const int stride_proj, const int subsampling_x,
                            const int subsampling_y) {
-  int i;
-  for (i = 0; i < n; ++i) {
+  for (int i = 0; i < n; ++i) {
     const int x = *(points++), y = *(points++);
     if (subsampling_x)
       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
@@ -185,301 +119,6 @@ void project_points_affine(const int32_t *mat, int *points, int *proj,
   }
 }
 
-void project_points_hortrapezoid(const int32_t *mat, int *points, int *proj,
-                                 const int n, const int stride_points,
-                                 const int stride_proj, const int subsampling_x,
-                                 const int subsampling_y) {
-  int i;
-  int64_t x, y, Z;
-  int64_t xp, yp;
-  for (i = 0; i < n; ++i) {
-    x = *(points++), y = *(points++);
-    x = (subsampling_x ? 4 * x + 1 : 2 * x);
-    y = (subsampling_y ? 4 * y + 1 : 2 * y);
-
-    Z = (mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
-    xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) *
-         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
-                WARPEDMODEL_PREC_BITS));
-    yp = (mat[5] * y + 2 * mat[1]) *
-         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
-                WARPEDMODEL_PREC_BITS));
-
-    xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
-    yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
-
-    if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
-    if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
-    *(proj++) = (int)xp;
-    *(proj++) = (int)yp;
-
-    points += stride_points - 2;
-    proj += stride_proj - 2;
-  }
-}
-
-void project_points_vertrapezoid(const int32_t *mat, int *points, int *proj,
-                                 const int n, const int stride_points,
-                                 const int stride_proj, const int subsampling_x,
-                                 const int subsampling_y) {
-  int i;
-  int64_t x, y, Z;
-  int64_t xp, yp;
-  for (i = 0; i < n; ++i) {
-    x = *(points++), y = *(points++);
-    x = (subsampling_x ? 4 * x + 1 : 2 * x);
-    y = (subsampling_y ? 4 * y + 1 : 2 * y);
-
-    Z = (mat[6] * x + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
-    xp = (mat[2] * x + 2 * mat[0]) *
-         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
-                WARPEDMODEL_PREC_BITS));
-    yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) *
-         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
-                WARPEDMODEL_PREC_BITS));
-
-    xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
-    yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
-
-    if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
-    if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
-    *(proj++) = (int)xp;
-    *(proj++) = (int)yp;
-
-    points += stride_points - 2;
-    proj += stride_proj - 2;
-  }
-}
-
-void project_points_homography(const int32_t *mat, int *points, int *proj,
-                               const int n, const int stride_points,
-                               const int stride_proj, const int subsampling_x,
-                               const int subsampling_y) {
-  int i;
-  int64_t x, y, Z;
-  int64_t xp, yp;
-  for (i = 0; i < n; ++i) {
-    x = *(points++), y = *(points++);
-    x = (subsampling_x ? 4 * x + 1 : 2 * x);
-    y = (subsampling_y ? 4 * y + 1 : 2 * y);
-
-    Z = (mat[6] * x + mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
-    xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) *
-         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
-                WARPEDMODEL_PREC_BITS));
-    yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) *
-         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
-                WARPEDMODEL_PREC_BITS));
-
-    xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
-    yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
-
-    if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
-    if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
-    *(proj++) = (int)xp;
-    *(proj++) = (int)yp;
-
-    points += stride_points - 2;
-    proj += stride_proj - 2;
-  }
-}
-
-static const int16_t
-    filter_ntap[WARPEDPIXEL_PREC_SHIFTS][WARPEDPIXEL_FILTER_TAPS] = {
-#if WARPEDPIXEL_PREC_BITS == 6
-      { 0, 0, 128, 0, 0, 0 },      { 0, -1, 128, 2, -1, 0 },
-      { 1, -3, 127, 4, -1, 0 },    { 1, -4, 126, 6, -2, 1 },
-      { 1, -5, 126, 8, -3, 1 },    { 1, -6, 125, 11, -4, 1 },
-      { 1, -7, 124, 13, -4, 1 },   { 2, -8, 123, 15, -5, 1 },
-      { 2, -9, 122, 18, -6, 1 },   { 2, -10, 121, 20, -6, 1 },
-      { 2, -11, 120, 22, -7, 2 },  { 2, -12, 119, 25, -8, 2 },
-      { 3, -13, 117, 27, -8, 2 },  { 3, -13, 116, 29, -9, 2 },
-      { 3, -14, 114, 32, -10, 3 }, { 3, -15, 113, 35, -10, 2 },
-      { 3, -15, 111, 37, -11, 3 }, { 3, -16, 109, 40, -11, 3 },
-      { 3, -16, 108, 42, -12, 3 }, { 4, -17, 106, 45, -13, 3 },
-      { 4, -17, 104, 47, -13, 3 }, { 4, -17, 102, 50, -14, 3 },
-      { 4, -17, 100, 52, -14, 3 }, { 4, -18, 98, 55, -15, 4 },
-      { 4, -18, 96, 58, -15, 3 },  { 4, -18, 94, 60, -16, 4 },
-      { 4, -18, 91, 63, -16, 4 },  { 4, -18, 89, 65, -16, 4 },
-      { 4, -18, 87, 68, -17, 4 },  { 4, -18, 85, 70, -17, 4 },
-      { 4, -18, 82, 73, -17, 4 },  { 4, -18, 80, 75, -17, 4 },
-      { 4, -18, 78, 78, -18, 4 },  { 4, -17, 75, 80, -18, 4 },
-      { 4, -17, 73, 82, -18, 4 },  { 4, -17, 70, 85, -18, 4 },
-      { 4, -17, 68, 87, -18, 4 },  { 4, -16, 65, 89, -18, 4 },
-      { 4, -16, 63, 91, -18, 4 },  { 4, -16, 60, 94, -18, 4 },
-      { 3, -15, 58, 96, -18, 4 },  { 4, -15, 55, 98, -18, 4 },
-      { 3, -14, 52, 100, -17, 4 }, { 3, -14, 50, 102, -17, 4 },
-      { 3, -13, 47, 104, -17, 4 }, { 3, -13, 45, 106, -17, 4 },
-      { 3, -12, 42, 108, -16, 3 }, { 3, -11, 40, 109, -16, 3 },
-      { 3, -11, 37, 111, -15, 3 }, { 2, -10, 35, 113, -15, 3 },
-      { 3, -10, 32, 114, -14, 3 }, { 2, -9, 29, 116, -13, 3 },
-      { 2, -8, 27, 117, -13, 3 },  { 2, -8, 25, 119, -12, 2 },
-      { 2, -7, 22, 120, -11, 2 },  { 1, -6, 20, 121, -10, 2 },
-      { 1, -6, 18, 122, -9, 2 },   { 1, -5, 15, 123, -8, 2 },
-      { 1, -4, 13, 124, -7, 1 },   { 1, -4, 11, 125, -6, 1 },
-      { 1, -3, 8, 126, -5, 1 },    { 1, -2, 6, 126, -4, 1 },
-      { 0, -1, 4, 127, -3, 1 },    { 0, -1, 2, 128, -1, 0 },
-#elif WARPEDPIXEL_PREC_BITS == 5
-      { 0, 0, 128, 0, 0, 0 },      { 1, -3, 127, 4, -1, 0 },
-      { 1, -5, 126, 8, -3, 1 },    { 1, -7, 124, 13, -4, 1 },
-      { 2, -9, 122, 18, -6, 1 },   { 2, -11, 120, 22, -7, 2 },
-      { 3, -13, 117, 27, -8, 2 },  { 3, -14, 114, 32, -10, 3 },
-      { 3, -15, 111, 37, -11, 3 }, { 3, -16, 108, 42, -12, 3 },
-      { 4, -17, 104, 47, -13, 3 }, { 4, -17, 100, 52, -14, 3 },
-      { 4, -18, 96, 58, -15, 3 },  { 4, -18, 91, 63, -16, 4 },
-      { 4, -18, 87, 68, -17, 4 },  { 4, -18, 82, 73, -17, 4 },
-      { 4, -18, 78, 78, -18, 4 },  { 4, -17, 73, 82, -18, 4 },
-      { 4, -17, 68, 87, -18, 4 },  { 4, -16, 63, 91, -18, 4 },
-      { 3, -15, 58, 96, -18, 4 },  { 3, -14, 52, 100, -17, 4 },
-      { 3, -13, 47, 104, -17, 4 }, { 3, -12, 42, 108, -16, 3 },
-      { 3, -11, 37, 111, -15, 3 }, { 3, -10, 32, 114, -14, 3 },
-      { 2, -8, 27, 117, -13, 3 },  { 2, -7, 22, 120, -11, 2 },
-      { 1, -6, 18, 122, -9, 2 },   { 1, -4, 13, 124, -7, 1 },
-      { 1, -3, 8, 126, -5, 1 },    { 0, -1, 4, 127, -3, 1 },
-#endif  // WARPEDPIXEL_PREC_BITS == 6
-    };
-
-static int32_t do_ntap_filter(const int32_t *const p, int x) {
-  int i;
-  int32_t sum = 0;
-  for (i = 0; i < WARPEDPIXEL_FILTER_TAPS; ++i) {
-    sum += p[i - WARPEDPIXEL_FILTER_TAPS / 2 + 1] * filter_ntap[x][i];
-  }
-  return sum;
-}
-
-static int32_t do_cubic_filter(const int32_t *const p, int x) {
-  if (x == 0) {
-    return p[0] * (1 << WARPEDPIXEL_FILTER_BITS);
-  } else if (x == (1 << WARPEDPIXEL_PREC_BITS)) {
-    return p[1] * (1 << WARPEDPIXEL_FILTER_BITS);
-  } else {
-    const int64_t v1 = (int64_t)x * x * x * (3 * (p[0] - p[1]) + p[2] - p[-1]);
-    const int64_t v2 =
-        (int64_t)x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]);
-    const int64_t v3 = x * (p[1] - p[-1]);
-    const int64_t v4 = 2 * p[0];
-    return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
-        (v4 * (1 << (3 * WARPEDPIXEL_PREC_BITS))) +
-            (v3 * (1 << (2 * WARPEDPIXEL_PREC_BITS))) +
-            (v2 * (1 << WARPEDPIXEL_PREC_BITS)) + v1,
-        3 * WARPEDPIXEL_PREC_BITS + 1 - WARPEDPIXEL_FILTER_BITS);
-  }
-}
-
-static INLINE void get_subcolumn(int taps, const uint8_t *const ref,
-                                 int32_t *col, int stride, int x, int y_start) {
-  int i;
-  for (i = 0; i < taps; ++i) {
-    col[i] = ref[(i + y_start) * stride + x];
-  }
-}
-
-static uint8_t bi_ntap_filter(const uint8_t *const ref, int x, int y,
-                              int stride) {
-  int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
-  int k;
-  const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
-  const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
-  for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
-    int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
-    get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
-                  i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
-                  j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
-    arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                            y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
-  }
-  val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                       x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
-  val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
-  return (uint8_t)clip_pixel(val);
-}
-
-static uint8_t bi_cubic_filter(const uint8_t *const ref, int x, int y,
-                               int stride) {
-  int32_t val, arr[4];
-  int k;
-  const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
-  const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
-  for (k = 0; k < 4; ++k) {
-    int32_t arr_temp[4];
-    get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
-    arr[k] =
-        do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
-  }
-  val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
-  val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
-  return (uint8_t)clip_pixel(val);
-}
-
-static uint8_t bi_linear_filter(const uint8_t *const ref, int x, int y,
-                                int stride) {
-  const int ix = x >> WARPEDPIXEL_PREC_BITS;
-  const int iy = y >> WARPEDPIXEL_PREC_BITS;
-  const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
-  const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
-  int32_t val;
-  val = ROUND_POWER_OF_TWO_SIGNED(
-      ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
-              (WARPEDPIXEL_PREC_SHIFTS - sx) +
-          ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
-          ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
-          ref[(iy + 1) * stride + ix + 1] * sy * sx,
-      WARPEDPIXEL_PREC_BITS * 2);
-  return (uint8_t)clip_pixel(val);
-}
-
-static uint8_t warp_interpolate(const uint8_t *const ref, int x, int y,
-                                int width, int height, int stride) {
-  const int ix = x >> WARPEDPIXEL_PREC_BITS;
-  const int iy = y >> WARPEDPIXEL_PREC_BITS;
-  const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
-  const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
-  int32_t v;
-
-  if (ix < 0 && iy < 0)
-    return ref[0];
-  else if (ix < 0 && iy >= height - 1)
-    return ref[(height - 1) * stride];
-  else if (ix >= width - 1 && iy < 0)
-    return ref[width - 1];
-  else if (ix >= width - 1 && iy >= height - 1)
-    return ref[(height - 1) * stride + (width - 1)];
-  else if (ix < 0) {
-    v = ROUND_POWER_OF_TWO_SIGNED(
-        ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
-            ref[(iy + 1) * stride] * sy,
-        WARPEDPIXEL_PREC_BITS);
-    return clip_pixel(v);
-  } else if (iy < 0) {
-    v = ROUND_POWER_OF_TWO_SIGNED(
-        ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
-        WARPEDPIXEL_PREC_BITS);
-    return clip_pixel(v);
-  } else if (ix >= width - 1) {
-    v = ROUND_POWER_OF_TWO_SIGNED(
-        ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
-            ref[(iy + 1) * stride + width - 1] * sy,
-        WARPEDPIXEL_PREC_BITS);
-    return clip_pixel(v);
-  } else if (iy >= height - 1) {
-    v = ROUND_POWER_OF_TWO_SIGNED(
-        ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
-            ref[(height - 1) * stride + ix + 1] * sx,
-        WARPEDPIXEL_PREC_BITS);
-    return clip_pixel(v);
-  } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
-             iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
-             ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
-             iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
-    return bi_ntap_filter(ref, x, y, stride);
-  } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
-    return bi_cubic_filter(ref, x, y, stride);
-  } else {
-    return bi_linear_filter(ref, x, y, stride);
-  }
-}
-
 // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
 // at a time. The zoom/rotation/shear in the model are applied to the
 // "fractional" position of each pixel, which therefore varies within
@@ -683,15 +322,14 @@ static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
   8240,  8224,  8208,  8192,
 };
 
-#if CONFIG_WARPED_MOTION
 // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
 // at precision of DIV_LUT_PREC_BITS along with the shift.
 static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
-  int64_t e, f;
+  int64_t f;
   *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                                : get_msb((unsigned int)D));
   // e is obtained from D after resetting the most significant 1 bit.
-  e = D - ((uint64_t)1 << *shift);
+  const int64_t e = D - ((uint64_t)1 << *shift);
   // Get the most significant DIV_LUT_BITS (8) bits of e into f
   if (*shift > DIV_LUT_BITS)
     f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
@@ -702,13 +340,12 @@ static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
   // Use f as lookup into the precomputed table of multipliers
   return div_lut[f];
 }
-#endif  // CONFIG_WARPED_MOTION
 
 static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
-  int32_t e, f;
+  int32_t f;
   *shift = get_msb(D);
   // e is obtained from D after resetting the most significant 1 bit.
-  e = D - ((uint32_t)1 << *shift);
+  const int32_t e = D - ((uint32_t)1 << *shift);
   // Get the most significant DIV_LUT_BITS (8) bits of e into f
   if (*shift > DIV_LUT_BITS)
     f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
@@ -743,16 +380,13 @@ int get_shear_params(WarpedMotionParams *wm) {
   wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
   int16_t shift;
   int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
-  int64_t v;
-  v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
+  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
   wm->gamma =
       clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
   v = ((int64_t)mat[3] * mat[4]) * y;
   wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
                         (1 << WARPEDMODEL_PREC_BITS),
                     INT16_MIN, INT16_MAX);
-  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
-    return 0;
 
   wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
               (1 << WARP_PARAM_REDUCE_BITS);
@@ -762,171 +396,24 @@ int get_shear_params(WarpedMotionParams *wm) {
               (1 << WARP_PARAM_REDUCE_BITS);
   wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
               (1 << WARP_PARAM_REDUCE_BITS);
-  return 1;
-}
-
-#if CONFIG_HIGHBITDEPTH
-static INLINE void highbd_get_subcolumn(int taps, const uint16_t *const ref,
-                                        int32_t *col, int stride, int x,
-                                        int y_start) {
-  int i;
-  for (i = 0; i < taps; ++i) {
-    col[i] = ref[(i + y_start) * stride + x];
-  }
-}
-
-static uint16_t highbd_bi_ntap_filter(const uint16_t *const ref, int x, int y,
-                                      int stride, int bd) {
-  int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
-  int k;
-  const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
-  const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
-  for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
-    int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
-    highbd_get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
-                         i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
-                         j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
-    arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                            y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
-  }
-  val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
-                       x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
-  val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
-  return (uint16_t)clip_pixel_highbd(val, bd);
-}
-
-static uint16_t highbd_bi_cubic_filter(const uint16_t *const ref, int x, int y,
-                                       int stride, int bd) {
-  int32_t val, arr[4];
-  int k;
-  const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
-  const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
-  for (k = 0; k < 4; ++k) {
-    int32_t arr_temp[4];
-    highbd_get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
-    arr[k] =
-        do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
-  }
-  val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
-  val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
-  return (uint16_t)clip_pixel_highbd(val, bd);
-}
 
-static uint16_t highbd_bi_linear_filter(const uint16_t *const ref, int x, int y,
-                                        int stride, int bd) {
-  const int ix = x >> WARPEDPIXEL_PREC_BITS;
-  const int iy = y >> WARPEDPIXEL_PREC_BITS;
-  const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
-  const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
-  int32_t val;
-  val = ROUND_POWER_OF_TWO_SIGNED(
-      ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
-              (WARPEDPIXEL_PREC_SHIFTS - sx) +
-          ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
-          ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
-          ref[(iy + 1) * stride + ix + 1] * sy * sx,
-      WARPEDPIXEL_PREC_BITS * 2);
-  return (uint16_t)clip_pixel_highbd(val, bd);
-}
+  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
+    return 0;
 
-static uint16_t highbd_warp_interpolate(const uint16_t *const ref, int x, int y,
-                                        int width, int height, int stride,
-                                        int bd) {
-  const int ix = x >> WARPEDPIXEL_PREC_BITS;
-  const int iy = y >> WARPEDPIXEL_PREC_BITS;
-  const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
-  const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
-  int32_t v;
-
-  if (ix < 0 && iy < 0)
-    return ref[0];
-  else if (ix < 0 && iy > height - 1)
-    return ref[(height - 1) * stride];
-  else if (ix > width - 1 && iy < 0)
-    return ref[width - 1];
-  else if (ix > width - 1 && iy > height - 1)
-    return ref[(height - 1) * stride + (width - 1)];
-  else if (ix < 0) {
-    v = ROUND_POWER_OF_TWO_SIGNED(
-        ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
-            ref[(iy + 1) * stride] * sy,
-        WARPEDPIXEL_PREC_BITS);
-    return clip_pixel_highbd(v, bd);
-  } else if (iy < 0) {
-    v = ROUND_POWER_OF_TWO_SIGNED(
-        ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
-        WARPEDPIXEL_PREC_BITS);
-    return clip_pixel_highbd(v, bd);
-  } else if (ix > width - 1) {
-    v = ROUND_POWER_OF_TWO_SIGNED(
-        ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
-            ref[(iy + 1) * stride + width - 1] * sy,
-        WARPEDPIXEL_PREC_BITS);
-    return clip_pixel_highbd(v, bd);
-  } else if (iy > height - 1) {
-    v = ROUND_POWER_OF_TWO_SIGNED(
-        ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
-            ref[(height - 1) * stride + ix + 1] * sx,
-        WARPEDPIXEL_PREC_BITS);
-    return clip_pixel_highbd(v, bd);
-  } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
-             iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
-             ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
-             iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
-    return highbd_bi_ntap_filter(ref, x, y, stride, bd);
-  } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
-    return highbd_bi_cubic_filter(ref, x, y, stride, bd);
-  } else {
-    return highbd_bi_linear_filter(ref, x, y, stride, bd);
-  }
+  return 1;
 }
 
 static INLINE int highbd_error_measure(int err, int bd) {
   const int b = bd - 8;
   const int bmask = (1 << b) - 1;
   const int v = (1 << b);
-  int e1, e2;
   err = abs(err);
-  e1 = err >> b;
-  e2 = err & bmask;
+  const int e1 = err >> b;
+  const int e2 = err & bmask;
   return error_measure_lut[255 + e1] * (v - e2) +
          error_measure_lut[256 + e1] * e2;
 }
 
-static void highbd_warp_plane_old(const WarpedMotionParams *const wm,
-                                  const uint8_t *const ref8, int width,
-                                  int height, int stride,
-                                  const uint8_t *const pred8, int p_col,
-                                  int p_row, int p_width, int p_height,
-                                  int p_stride, int subsampling_x,
-                                  int subsampling_y, int x_scale, int y_scale,
-                                  int bd, ConvolveParams *conv_params) {
-  int i, j;
-  ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
-  if (projectpoints == NULL) return;
-  for (i = p_row; i < p_row + p_height; ++i) {
-    for (j = p_col; j < p_col + p_width; ++j) {
-      int in[2], out[2];
-      in[0] = j;
-      in[1] = i;
-      projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
-      out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, SCALE_SUBPEL_BITS);
-      out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, SCALE_SUBPEL_BITS);
-      if (conv_params->do_average)
-        pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
-            pred[(j - p_col) + (i - p_row) * p_stride] +
-                highbd_warp_interpolate(ref, out[0], out[1], width, height,
-                                        stride, bd),
-            1);
-      else
-        pred[(j - p_col) + (i - p_row) * p_stride] = highbd_warp_interpolate(
-            ref, out[0], out[1], width, height, stride, bd);
-    }
-  }
-}
-
 /* Note: For an explanation of the warp algorithm, and some notes on bit widths
     for hardware implementations, see the comments above av1_warp_affine_c
 */
@@ -938,37 +425,23 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                               ConvolveParams *conv_params, int16_t alpha,
                               int16_t beta, int16_t gamma, int16_t delta) {
   int32_t tmp[15 * 8];
-  int i, j, k, l, m;
-#if CONFIG_CONVOLVE_ROUND
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
   const int reduce_bits_horiz =
-      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
-  const int max_bits_horiz =
-      use_conv_params
-          ? bd + FILTER_BITS + 1 - conv_params->round_0
-          : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
-  const int offset_bits_horiz =
-      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
-  const int offset_bits_vert =
-      use_conv_params
-          ? bd + 2 * FILTER_BITS - conv_params->round_0
-          : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
-  if (use_conv_params) {
-    conv_params->do_post_rounding = 1;
-  }
-  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
-  const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
-  const int max_bits_horiz =
-      bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
-  const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-  const int offset_bits_vert =
-      bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
-#endif
+      conv_params->round_0 +
+      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+  const int reduce_bits_vert = conv_params->is_compound
+                                   ? conv_params->round_1
+                                   : 2 * FILTER_BITS - reduce_bits_horiz;
+  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   (void)max_bits_horiz;
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
 
-  for (i = p_row; i < p_row + p_height; i += 8) {
-    for (j = p_col; j < p_col + p_width; j += 8) {
+  for (int i = p_row; i < p_row + p_height; i += 8) {
+    for (int j = p_col; j < p_col + p_width; j += 8) {
       // Calculate the center of this 8x8 block,
       // project to luma coordinates (if in a subsampled chroma plane),
       // apply the affine transformation,
@@ -980,9 +453,9 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
       const int32_t x4 = dst_x >> subsampling_x;
       const int32_t y4 = dst_y >> subsampling_y;
 
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
       sx4 += alpha * (-4) + beta * (-4);
@@ -992,15 +465,11 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
       sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
 
       // Horizontal filter
-      for (k = -7; k < 8; ++k) {
-        int iy = iy4 + k;
-        if (iy < 0)
-          iy = 0;
-        else if (iy > height - 1)
-          iy = height - 1;
+      for (int k = -7; k < 8; ++k) {
+        const int iy = clamp(iy4 + k, 0, height - 1);
 
         int sx = sx4 + beta * (k + 4);
-        for (l = -4; l < 4; ++l) {
+        for (int l = -4; l < 4; ++l) {
           int ix = ix4 + l - 3;
           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
@@ -1008,12 +477,8 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
           const int16_t *coeffs = warped_filter[offs];
 
           int32_t sum = 1 << offset_bits_horiz;
-          for (m = 0; m < 8; ++m) {
-            int sample_x = ix + m;
-            if (sample_x < 0)
-              sample_x = 0;
-            else if (sample_x > width - 1)
-              sample_x = width - 1;
+          for (int m = 0; m < 8; ++m) {
+            const int sample_x = clamp(ix + m, 0, width - 1);
             sum += ref[iy * stride + sample_x] * coeffs[m];
           }
           sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
@@ -1024,46 +489,50 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
       }
 
       // Vertical filter
-      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
         int sy = sy4 + delta * (k + 4);
-        for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           const int16_t *coeffs = warped_filter[offs];
 
           int32_t sum = 1 << offset_bits_vert;
-          for (m = 0; m < 8; ++m) {
+          for (int m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
           }
-#if CONFIG_CONVOLVE_ROUND
-          if (use_conv_params) {
+
+          if (conv_params->is_compound) {
             CONV_BUF_TYPE *p =
                 &conv_params
                      ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                            (j - p_col + l + 4)];
-            sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
-                  (1 << (offset_bits_horiz + FILTER_BITS -
-                         conv_params->round_0 - conv_params->round_1)) -
-                  (1 << (offset_bits_vert - conv_params->round_1));
-            if (conv_params->do_average)
-              *p += sum;
-            else
+            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+            if (conv_params->do_average) {
+              uint16_t *dst16 =
+                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+              int32_t tmp32 = *p;
+              if (conv_params->use_jnt_comp_avg) {
+                tmp32 = tmp32 * conv_params->fwd_offset +
+                        sum * conv_params->bck_offset;
+                tmp32 = tmp32 >> DIST_PRECISION_BITS;
+              } else {
+                tmp32 += sum;
+                tmp32 = tmp32 >> 1;
+              }
+              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
+                      (1 << (offset_bits - conv_params->round_1 - 1));
+              *dst16 =
+                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
+            } else {
               *p = sum;
+            }
           } else {
-#else
-          {
-#endif
             uint16_t *p =
                 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
-            sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
+            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
             assert(0 <= sum && sum < (1 << (bd + 2)));
-            uint16_t px =
-                clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
-            if (conv_params->do_average)
-              *p = ROUND_POWER_OF_TWO(*p + px, 1);
-            else
-              *p = px;
+            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
           }
           sy += gamma;
         }
@@ -1076,32 +545,25 @@ static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
                               int width, int height, int stride,
                               const uint8_t *const pred8, int p_col, int p_row,
                               int p_width, int p_height, int p_stride,
-                              int subsampling_x, int subsampling_y, int x_scale,
-                              int y_scale, int bd,
+                              int subsampling_x, int subsampling_y, int bd,
                               ConvolveParams *conv_params) {
+  assert(wm->wmtype <= AFFINE);
   if (wm->wmtype == ROTZOOM) {
     wm->wmmat[5] = wm->wmmat[2];
     wm->wmmat[4] = -wm->wmmat[3];
   }
-  if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) &&
-      x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) {
-    const int32_t *const mat = wm->wmmat;
-    const int16_t alpha = wm->alpha;
-    const int16_t beta = wm->beta;
-    const int16_t gamma = wm->gamma;
-    const int16_t delta = wm->delta;
-
-    const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
-    uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-    av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
-                           p_width, p_height, p_stride, subsampling_x,
-                           subsampling_y, bd, conv_params, alpha, beta, gamma,
-                           delta);
-  } else {
-    highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, p_row,
-                          p_width, p_height, p_stride, subsampling_x,
-                          subsampling_y, x_scale, y_scale, bd, conv_params);
-  }
+  const int32_t *const mat = wm->wmmat;
+  const int16_t alpha = wm->alpha;
+  const int16_t beta = wm->beta;
+  const int16_t gamma = wm->gamma;
+  const int16_t delta = wm->delta;
+
+  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
+                         p_width, p_height, p_stride, subsampling_x,
+                         subsampling_y, bd, conv_params, alpha, beta, gamma,
+                         delta);
 }
 
 static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
@@ -1120,25 +582,25 @@ static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
 static int64_t highbd_warp_error(
     WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
     int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
-    int p_height, int p_stride, int subsampling_x, int subsampling_y,
-    int x_scale, int y_scale, int bd, int64_t best_error) {
+    int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
+    int64_t best_error) {
   int64_t gm_sumerr = 0;
-  int warp_w, warp_h;
-  int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
-  int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
   uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
 
-  ConvolveParams conv_params = get_conv_params(0, 0, 0);
+  ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+  conv_params.use_jnt_comp_avg = 0;
   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
       // avoid warping extra 8x8 blocks in the padded region of the frame
       // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
-      warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
-      warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
+      const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
+      const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
       highbd_warp_plane(wm, ref8, width, height, stride,
                         CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
-                        WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale,
-                        y_scale, bd, &conv_params);
+                        WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd,
+                        &conv_params);
 
       gm_sumerr += highbd_frame_error(
           tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
@@ -1148,41 +610,11 @@ static int64_t highbd_warp_error(
   }
   return gm_sumerr;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 static INLINE int error_measure(int err) {
   return error_measure_lut[255 + err];
 }
 
-static void warp_plane_old(const WarpedMotionParams *const wm,
-                           const uint8_t *const ref, int width, int height,
-                           int stride, uint8_t *pred, int p_col, int p_row,
-                           int p_width, int p_height, int p_stride,
-                           int subsampling_x, int subsampling_y, int x_scale,
-                           int y_scale, ConvolveParams *conv_params) {
-  int i, j;
-  ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
-  if (projectpoints == NULL) return;
-  for (i = p_row; i < p_row + p_height; ++i) {
-    for (j = p_col; j < p_col + p_width; ++j) {
-      int in[2], out[2];
-      in[0] = j;
-      in[1] = i;
-      projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
-      out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, SCALE_SUBPEL_BITS);
-      out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, SCALE_SUBPEL_BITS);
-      if (conv_params->do_average)
-        pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
-            pred[(j - p_col) + (i - p_row) * p_stride] +
-                warp_interpolate(ref, out[0], out[1], width, height, stride),
-            1);
-      else
-        pred[(j - p_col) + (i - p_row) * p_stride] =
-            warp_interpolate(ref, out[0], out[1], width, height, stride);
-    }
-  }
-}
-
 /* The warp filter for ROTZOOM and AFFINE models works as follows:
    * Split the input into 8x8 blocks
    * For each block, project the point (4, 4) within the block, to get the
@@ -1237,10 +669,10 @@ static void warp_plane_old(const WarpedMotionParams *const wm,
     This allows the derivation of the appropriate bit widths and offsets for
     the various intermediate values: If
 
-    F := WARPEDPIXEL_FILTER_BITS = 7 (or else the above ranges need adjusting)
+    F := FILTER_BITS = 7 (or else the above ranges need adjusting)
          So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
          intermediate value.
-    H := HORSHEAR_REDUCE_PREC_BITS
+    H := ROUND0_BITS
     V := VERSHEAR_REDUCE_PREC_BITS
     (and note that we must have H + V = 2*F for the output to have the same
      scale as the input)
@@ -1275,38 +707,23 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                        ConvolveParams *conv_params, int16_t alpha, int16_t beta,
                        int16_t gamma, int16_t delta) {
   int32_t tmp[15 * 8];
-  int i, j, k, l, m;
   const int bd = 8;
-#if CONFIG_CONVOLVE_ROUND
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
-  const int reduce_bits_horiz =
-      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
-  const int max_bits_horiz =
-      use_conv_params
-          ? bd + FILTER_BITS + 1 - conv_params->round_0
-          : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
-  const int offset_bits_horiz =
-      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
-  const int offset_bits_vert =
-      use_conv_params
-          ? bd + 2 * FILTER_BITS - conv_params->round_0
-          : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
-  if (use_conv_params) {
-    conv_params->do_post_rounding = 1;
-  }
-  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
-  const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
-  const int max_bits_horiz =
-      bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
-  const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-  const int offset_bits_vert =
-      bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
-#endif
+  const int reduce_bits_horiz = conv_params->round_0;
+  const int reduce_bits_vert = conv_params->is_compound
+                                   ? conv_params->round_1
+                                   : 2 * FILTER_BITS - reduce_bits_horiz;
+  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   (void)max_bits_horiz;
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
 
-  for (i = p_row; i < p_row + p_height; i += 8) {
-    for (j = p_col; j < p_col + p_width; j += 8) {
+  for (int i = p_row; i < p_row + p_height; i += 8) {
+    for (int j = p_col; j < p_col + p_width; j += 8) {
       // Calculate the center of this 8x8 block,
       // project to luma coordinates (if in a subsampled chroma plane),
       // apply the affine transformation,
@@ -1330,17 +747,13 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
       sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
 
       // Horizontal filter
-      for (k = -7; k < 8; ++k) {
+      for (int k = -7; k < 8; ++k) {
         // Clamp to top/bottom edge of the frame
-        int iy = iy4 + k;
-        if (iy < 0)
-          iy = 0;
-        else if (iy > height - 1)
-          iy = height - 1;
+        const int iy = clamp(iy4 + k, 0, height - 1);
 
         int sx = sx4 + beta * (k + 4);
 
-        for (l = -4; l < 4; ++l) {
+        for (int l = -4; l < 4; ++l) {
           int ix = ix4 + l - 3;
           // At this point, sx = sx4 + alpha * l + beta * k
           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
@@ -1349,13 +762,9 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
           const int16_t *coeffs = warped_filter[offs];
 
           int32_t sum = 1 << offset_bits_horiz;
-          for (m = 0; m < 8; ++m) {
+          for (int m = 0; m < 8; ++m) {
             // Clamp to left/right edge of the frame
-            int sample_x = ix + m;
-            if (sample_x < 0)
-              sample_x = 0;
-            else if (sample_x > width - 1)
-              sample_x = width - 1;
+            const int sample_x = clamp(ix + m, 0, width - 1);
 
             sum += ref[iy * stride + sample_x] * coeffs[m];
           }
@@ -1367,9 +776,9 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
       }
 
       // Vertical filter
-      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
         int sy = sy4 + delta * (k + 4);
-        for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
           // At this point, sy = sy4 + gamma * l + delta * k
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
@@ -1377,36 +786,40 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
           const int16_t *coeffs = warped_filter[offs];
 
           int32_t sum = 1 << offset_bits_vert;
-          for (m = 0; m < 8; ++m) {
+          for (int m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
           }
-#if CONFIG_CONVOLVE_ROUND
-          if (use_conv_params) {
+
+          if (conv_params->is_compound) {
             CONV_BUF_TYPE *p =
                 &conv_params
                      ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                            (j - p_col + l + 4)];
-            sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
-                  (1 << (offset_bits_horiz + FILTER_BITS -
-                         conv_params->round_0 - conv_params->round_1)) -
-                  (1 << (offset_bits_vert - conv_params->round_1));
-            if (conv_params->do_average)
-              *p += sum;
-            else
+            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+            if (conv_params->do_average) {
+              uint8_t *dst8 =
+                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+              int32_t tmp32 = *p;
+              if (conv_params->use_jnt_comp_avg) {
+                tmp32 = tmp32 * conv_params->fwd_offset +
+                        sum * conv_params->bck_offset;
+                tmp32 = tmp32 >> DIST_PRECISION_BITS;
+              } else {
+                tmp32 += sum;
+                tmp32 = tmp32 >> 1;
+              }
+              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
+                      (1 << (offset_bits - conv_params->round_1 - 1));
+              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
+            } else {
               *p = sum;
+            }
           } else {
-#else
-          {
-#endif
             uint8_t *p =
                 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
-            sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
+            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
             assert(0 <= sum && sum < (1 << (bd + 2)));
-            uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
-            if (conv_params->do_average)
-              *p = ROUND_POWER_OF_TWO(*p + px, 1);
-            else
-              *p = px;
+            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
           }
           sy += gamma;
         }
@@ -1419,27 +832,20 @@ static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
                        int width, int height, int stride, uint8_t *pred,
                        int p_col, int p_row, int p_width, int p_height,
                        int p_stride, int subsampling_x, int subsampling_y,
-                       int x_scale, int y_scale, ConvolveParams *conv_params) {
+                       ConvolveParams *conv_params) {
+  assert(wm->wmtype <= AFFINE);
   if (wm->wmtype == ROTZOOM) {
     wm->wmmat[5] = wm->wmmat[2];
     wm->wmmat[4] = -wm->wmmat[3];
   }
-  if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) &&
-      x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) {
-    const int32_t *const mat = wm->wmmat;
-    const int16_t alpha = wm->alpha;
-    const int16_t beta = wm->beta;
-    const int16_t gamma = wm->gamma;
-    const int16_t delta = wm->delta;
-
-    av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
-                    p_width, p_height, p_stride, subsampling_x, subsampling_y,
-                    conv_params, alpha, beta, gamma, delta);
-  } else {
-    warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
-                   p_height, p_stride, subsampling_x, subsampling_y, x_scale,
-                   y_scale, conv_params);
-  }
+  const int32_t *const mat = wm->wmmat;
+  const int16_t alpha = wm->alpha;
+  const int16_t beta = wm->beta;
+  const int16_t gamma = wm->gamma;
+  const int16_t delta = wm->delta;
+  av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
+                  p_height, p_stride, subsampling_x, subsampling_y, conv_params,
+                  alpha, beta, gamma, delta);
 }
 
 static int64_t frame_error(const uint8_t *const ref, int stride,
@@ -1459,14 +865,15 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
                           int width, int height, int stride,
                           const uint8_t *const dst, int p_col, int p_row,
                           int p_width, int p_height, int p_stride,
-                          int subsampling_x, int subsampling_y, int x_scale,
-                          int y_scale, int64_t best_error) {
+                          int subsampling_x, int subsampling_y,
+                          int64_t best_error) {
   int64_t gm_sumerr = 0;
   int warp_w, warp_h;
   int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
   int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
   uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
-  ConvolveParams conv_params = get_conv_params(0, 0, 0);
+  ConvolveParams conv_params = get_conv_params(0, 0, 0, 8);
+  conv_params.use_jnt_comp_avg = 0;
 
   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
@@ -1475,8 +882,7 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
       warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
       warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
       warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
-                 WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale,
-                 y_scale, &conv_params);
+                 WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
 
       gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
                                warp_w, warp_h, p_stride);
@@ -1486,70 +892,49 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
   return gm_sumerr;
 }
 
-int64_t av1_frame_error(
-#if CONFIG_HIGHBITDEPTH
-    int use_hbd, int bd,
-#endif  // CONFIG_HIGHBITDEPTH
-    const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height,
-    int p_stride) {
-#if CONFIG_HIGHBITDEPTH
+int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
+                        uint8_t *dst, int p_width, int p_height, int p_stride) {
   if (use_hbd) {
     return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
                               CONVERT_TO_SHORTPTR(dst), p_width, p_height,
                               p_stride, bd);
   }
-#endif  // CONFIG_HIGHBITDEPTH
   return frame_error(ref, stride, dst, p_width, p_height, p_stride);
 }
 
-int64_t av1_warp_error(WarpedMotionParams *wm,
-#if CONFIG_HIGHBITDEPTH
-                       int use_hbd, int bd,
-#endif  // CONFIG_HIGHBITDEPTH
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
                        const uint8_t *ref, int width, int height, int stride,
                        uint8_t *dst, int p_col, int p_row, int p_width,
                        int p_height, int p_stride, int subsampling_x,
-                       int subsampling_y, int x_scale, int y_scale,
-                       int64_t best_error) {
+                       int subsampling_y, int64_t best_error) {
   if (wm->wmtype <= AFFINE)
     if (!get_shear_params(wm)) return 1;
-#if CONFIG_HIGHBITDEPTH
   if (use_hbd)
     return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
                              p_width, p_height, p_stride, subsampling_x,
-                             subsampling_y, x_scale, y_scale, bd, best_error);
-#endif  // CONFIG_HIGHBITDEPTH
+                             subsampling_y, bd, best_error);
   return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
-                    p_height, p_stride, subsampling_x, subsampling_y, x_scale,
-                    y_scale, best_error);
+                    p_height, p_stride, subsampling_x, subsampling_y,
+                    best_error);
 }
 
-void av1_warp_plane(WarpedMotionParams *wm,
-#if CONFIG_HIGHBITDEPTH
-                    int use_hbd, int bd,
-#endif  // CONFIG_HIGHBITDEPTH
+void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                     const uint8_t *ref, int width, int height, int stride,
                     uint8_t *pred, int p_col, int p_row, int p_width,
                     int p_height, int p_stride, int subsampling_x,
-                    int subsampling_y, int x_scale, int y_scale,
-                    ConvolveParams *conv_params) {
-#if CONFIG_HIGHBITDEPTH
+                    int subsampling_y, ConvolveParams *conv_params) {
   if (use_hbd)
     highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
                       p_width, p_height, p_stride, subsampling_x, subsampling_y,
-                      x_scale, y_scale, bd, conv_params);
+                      bd, conv_params);
   else
-#endif  // CONFIG_HIGHBITDEPTH
     warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
-               p_height, p_stride, subsampling_x, subsampling_y, x_scale,
-               y_scale, conv_params);
+               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
 }
 
-#if CONFIG_WARPED_MOTION
-#define LEAST_SQUARES_ORDER 2
-
 #define LS_MV_MAX 256  // max mv in 1/8-pel
-#define LS_STEP 2
+// Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By.
+#define LS_STEP 8
 
 // Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
 // the precision needed is:
@@ -1570,13 +955,17 @@ void av1_warp_plane(WarpedMotionParams *wm,
 #define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
 #define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
 
-#define LS_SUM(a) ((a)*4 + LS_STEP * 2)
-#define LS_SQUARE(a) \
-  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
-#define LS_PRODUCT1(a, b) \
-  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> 2)
-#define LS_PRODUCT2(a, b) \
-  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
+// By setting LS_STEP = 8, the least 2 bits of every elements in A, Bx, By are
+// 0. So, we can reduce LS_MAT_RANGE_BITS(2) bits here.
+#define LS_SQUARE(a)                                          \
+  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+   (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT1(a, b)                                           \
+  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
+   (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT2(a, b)                                               \
+  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+   (2 + LS_MAT_DOWN_BITS))
 
 #define USE_LIMITED_PREC_MULT 0
 
@@ -1655,22 +1044,24 @@ static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
 }
 #endif  // USE_LIMITED_PREC_MULT
 
-static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
-                           int mvy, int mvx, WarpedMotionParams *wm, int mi_row,
-                           int mi_col) {
+static int find_affine_int(int np, const int *pts1, const int *pts2,
+                           BLOCK_SIZE bsize, int mvy, int mvx,
+                           WarpedMotionParams *wm, int mi_row, int mi_col) {
   int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
   int32_t Bx[2] = { 0, 0 };
   int32_t By[2] = { 0, 0 };
-  int i, n = 0;
+  int i;
 
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
-  const int isuy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1);
-  const int isux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1);
-  const int suy = isuy * 8;
-  const int sux = isux * 8;
+  const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1);
+  const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1);
+  const int suy = rsuy * 8;
+  const int sux = rsux * 8;
   const int duy = suy + mvy;
   const int dux = sux + mvx;
+  const int isuy = (mi_row * MI_SIZE + rsuy);
+  const int isux = (mi_col * MI_SIZE + rsux);
 
   // Assume the center pixel of the block has exactly the same motion vector
   // as transmitted for the block. First shift the origin of the source
@@ -1694,13 +1085,15 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
   //
   // The loop below computes: A = P'P, Bx = P'q, By = P'r
   // We need to just compute inv(A).Bx and inv(A).By for the solutions.
-  int sx, sy, dx, dy;
   // Contribution from neighbor block
-  for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) {
-    dx = pts2[i * 2] - dux;
-    dy = pts2[i * 2 + 1] - duy;
-    sx = pts1[i * 2] - sux;
-    sy = pts1[i * 2 + 1] - suy;
+  for (i = 0; i < np; i++) {
+    const int dx = pts2[i * 2] - dux;
+    const int dy = pts2[i * 2 + 1] - duy;
+    const int sx = pts1[i * 2] - sux;
+    const int sy = pts1[i * 2 + 1] - suy;
+    // (TODO)yunqing: This comparison wouldn't be necessary if the sample
+    // selection is done in find_samples(). Also, global offset can be removed
+    // while collecting samples.
     if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
       A[0][0] += LS_SQUARE(sx);
       A[0][1] += LS_PRODUCT1(sx, sy);
@@ -1709,41 +1102,20 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
       Bx[1] += LS_PRODUCT1(sy, dx);
       By[0] += LS_PRODUCT1(sx, dy);
       By[1] += LS_PRODUCT2(sy, dy);
-      n++;
     }
   }
-  int downshift;
-  if (n >= 4)
-    downshift = LS_MAT_DOWN_BITS;
-  else if (n >= 2)
-    downshift = LS_MAT_DOWN_BITS - 1;
-  else
-    downshift = LS_MAT_DOWN_BITS - 2;
-
-  // Reduce precision by downshift bits
-  A[0][0] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][0], downshift), LS_MAT_MIN,
-                  LS_MAT_MAX);
-  A[0][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][1], downshift), LS_MAT_MIN,
-                  LS_MAT_MAX);
-  A[1][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[1][1], downshift), LS_MAT_MIN,
-                  LS_MAT_MAX);
-  Bx[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[0], downshift), LS_MAT_MIN,
-                LS_MAT_MAX);
-  Bx[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[1], downshift), LS_MAT_MIN,
-                LS_MAT_MAX);
-  By[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[0], downshift), LS_MAT_MIN,
-                LS_MAT_MAX);
-  By[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[1], downshift), LS_MAT_MIN,
-                LS_MAT_MAX);
-
-  int64_t Px[2], Py[2], Det;
-  int16_t iDet, shift;
 
-  // These divided by the Det, are the least squares solutions
-  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
-  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
-  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
-  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
+  // Just for debugging, and can be removed later.
+  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
+  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
+  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
+  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
+  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
+  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
+  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
+
+  int64_t Det;
+  int16_t iDet, shift;
 
   // Compute Determinant of A
   Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
@@ -1755,6 +1127,14 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
     shift = 0;
   }
 
+  int64_t Px[2], Py[2];
+
+  // These divided by the Det, are the least squares solutions
+  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
+  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
+  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
+  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
+
   wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
   wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
   wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
@@ -1783,13 +1163,13 @@ int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
                     int mvx, WarpedMotionParams *wm_params, int mi_row,
                     int mi_col) {
   assert(wm_params->wmtype == AFFINE);
-  const int result = find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params,
-                                     mi_row, mi_col);
-  if (result == 0) {
-    // check compatibility with the fast warp filter
-    if (!get_shear_params(wm_params)) return 1;
-  }
 
-  return result;
+  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
+                      mi_col))
+    return 1;
+
+  // check compatibility with the fast warp filter
+  if (!get_shear_params(wm_params)) return 1;
+
+  return 0;
 }
-#endif  // CONFIG_WARPED_MOTION
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
index e05f6a85f..f5da36bbb 100644
--- a/third_party/aom/av1/common/warped_motion.h
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -18,94 +18,79 @@
 #include <math.h>
 #include <assert.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_ports/mem.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "av1/common/mv.h"
 #include "av1/common/convolve.h"
 
 #define MAX_PARAMDIM 9
-#if CONFIG_WARPED_MOTION
 #define LEAST_SQUARES_SAMPLES_MAX_BITS 3
 #define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
-
-#if WARPED_MOTION_SORT_SAMPLES
-// Search 1 row on the top and 1 column on the left, 1 upper-left block,
-// 1 upper-right block.
-#define SAMPLES_ARRAY_SIZE ((MAX_MIB_SIZE * 2 + 2) * 2)
-#else
 #define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
-#endif  // WARPED_MOTION_SORT_SAMPLES
-
+#define WARPED_MOTION_DEBUG 0
 #define DEFAULT_WMTYPE AFFINE
-#endif  // CONFIG_WARPED_MOTION
 
 extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
 
-typedef void (*ProjectPointsFunc)(const int32_t *mat, int *points, int *proj,
-                                  const int n, const int stride_points,
-                                  const int stride_proj,
-                                  const int subsampling_x,
-                                  const int subsampling_y);
-
-void project_points_translation(const int32_t *mat, int *points, int *proj,
-                                const int n, const int stride_points,
-                                const int stride_proj, const int subsampling_x,
-                                const int subsampling_y);
-
-void project_points_rotzoom(const int32_t *mat, int *points, int *proj,
-                            const int n, const int stride_points,
-                            const int stride_proj, const int subsampling_x,
-                            const int subsampling_y);
+static const uint8_t warp_pad_left[14][16] = {
+  { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 },
+  { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 },
+  { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 },
+  { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 },
+  { 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 },
+  { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 },
+  { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 },
+};
+
+static const uint8_t warp_pad_right[14][16] = {
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 },
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 },
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 },
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 },
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 },
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 },
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 },
+  { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 },
+  { 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 },
+  { 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 },
+  { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 },
+  { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+  { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+  { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
+};
 
 void project_points_affine(const int32_t *mat, int *points, int *proj,
                            const int n, const int stride_points,
                            const int stride_proj, const int subsampling_x,
                            const int subsampling_y);
 
-void project_points_hortrapezoid(const int32_t *mat, int *points, int *proj,
-                                 const int n, const int stride_points,
-                                 const int stride_proj, const int subsampling_x,
-                                 const int subsampling_y);
-void project_points_vertrapezoid(const int32_t *mat, int *points, int *proj,
-                                 const int n, const int stride_points,
-                                 const int stride_proj, const int subsampling_x,
-                                 const int subsampling_y);
-void project_points_homography(const int32_t *mat, int *points, int *proj,
-                               const int n, const int stride_points,
-                               const int stride_proj, const int subsampling_x,
-                               const int subsampling_y);
-
 // Returns the error between the result of applying motion 'wm' to the frame
 // described by 'ref' and the frame described by 'dst'.
-int64_t av1_warp_error(WarpedMotionParams *wm,
-#if CONFIG_HIGHBITDEPTH
-                       int use_hbd, int bd,
-#endif  // CONFIG_HIGHBITDEPTH
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
                        const uint8_t *ref, int width, int height, int stride,
                        uint8_t *dst, int p_col, int p_row, int p_width,
                        int p_height, int p_stride, int subsampling_x,
-                       int subsampling_y, int x_scale, int y_scale,
-                       int64_t best_error);
+                       int subsampling_y, int64_t best_error);
 
 // Returns the error between the frame described by 'ref' and the frame
 // described by 'dst'.
-int64_t av1_frame_error(
-#if CONFIG_HIGHBITDEPTH
-    int use_hbd, int bd,
-#endif  // CONFIG_HIGHBITDEPTH
-    const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height,
-    int p_stride);
+int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
+                        uint8_t *dst, int p_width, int p_height, int p_stride);
 
-void av1_warp_plane(WarpedMotionParams *wm,
-#if CONFIG_HIGHBITDEPTH
-                    int use_hbd, int bd,
-#endif  // CONFIG_HIGHBITDEPTH
+void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                     const uint8_t *ref, int width, int height, int stride,
                     uint8_t *pred, int p_col, int p_row, int p_width,
                     int p_height, int p_stride, int subsampling_x,
-                    int subsampling_y, int x_scale, int y_scale,
-                    ConvolveParams *conv_params);
+                    int subsampling_y, ConvolveParams *conv_params);
 
 int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
                     int mvx, WarpedMotionParams *wm_params, int mi_row,
diff --git a/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
new file mode 100644
index 000000000..8aa14696f
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const int16_t *x_filters, int x0_qn,
+                                  int x_step_qn) {
+  assert(UPSCALE_NORMATIVE_TAPS == 8);
+
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+  const __m128i zero = _mm_setzero_si128();
+
+  const uint8_t *src_y;
+  uint8_t *dst_y;
+  int x_qn = x0_qn;
+  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+    const int x_filter_idx0 =
+        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx1 =
+        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx2 =
+        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx3 =
+        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+    const int16_t *const x_filter0 =
+        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter1 =
+        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter2 =
+        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter3 =
+        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+    const __m128i fil0_16 = xx_loadu_128(x_filter0);
+    const __m128i fil1_16 = xx_loadu_128(x_filter1);
+    const __m128i fil2_16 = xx_loadu_128(x_filter2);
+    const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+    src_y = src;
+    dst_y = dst;
+    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+      const uint8_t *const src_x0 =
+          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint8_t *const src_x1 =
+          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint8_t *const src_x2 =
+          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint8_t *const src_x3 =
+          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+      // Load up the source data. This is 8-bit input data, so each load
+      // gets 8 pixels.
+      const __m128i src0_8 = xx_loadl_64(src_x0);
+      const __m128i src1_8 = xx_loadl_64(src_x1);
+      const __m128i src2_8 = xx_loadl_64(src_x2);
+      const __m128i src3_8 = xx_loadl_64(src_x3);
+
+      // Now zero-extend up to 16-bit precision, i.e.
+      // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
+      const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
+      const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
+      const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
+      const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
+
+      // Multiply by filter coefficients (results in a 32-bit value),
+      // and add adjacent pairs, i.e.
+      // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+      // Reduce horizontally and add, i.e.
+      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+      // Divide down by (1 << FILTER_BITS), rounding to nearest.
+      const __m128i shifted_32 =
+          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+      // Pack 32-bit values into 16-bit values, i.e.
+      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+      // Pack 16-bit values into 8-bit values, i.e.
+      // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
+      // -> [ 0 0 0 0 0 0 DC BA ]
+      const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);
+
+      // Write to the output
+      xx_storel_32(&dst_y[x], shifted_8);
+    }
+  }
+}
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
+                                         uint16_t *dst, int dst_stride, int w,
+                                         int h, const int16_t *x_filters,
+                                         int x0_qn, int x_step_qn, int bd) {
+  assert(UPSCALE_NORMATIVE_TAPS == 8);
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1);
+
+  const uint16_t *src_y;
+  uint16_t *dst_y;
+  int x_qn = x0_qn;
+  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+    const int x_filter_idx0 =
+        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx1 =
+        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx2 =
+        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx3 =
+        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+    const int16_t *const x_filter0 =
+        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter1 =
+        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter2 =
+        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter3 =
+        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+    const __m128i fil0_16 = xx_loadu_128(x_filter0);
+    const __m128i fil1_16 = xx_loadu_128(x_filter1);
+    const __m128i fil2_16 = xx_loadu_128(x_filter2);
+    const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+    src_y = src;
+    dst_y = dst;
+    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+      const uint16_t *const src_x0 =
+          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint16_t *const src_x1 =
+          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint16_t *const src_x2 =
+          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint16_t *const src_x3 =
+          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+      // Load up the source data. This is 16-bit input data, so each load
+      // gets 8 pixels.
+      const __m128i src0_16 = xx_loadu_128(src_x0);
+      const __m128i src1_16 = xx_loadu_128(src_x1);
+      const __m128i src2_16 = xx_loadu_128(src_x2);
+      const __m128i src3_16 = xx_loadu_128(src_x3);
+
+      // Multiply by filter coefficients (results in a 32-bit value),
+      // and add adjacent pairs, i.e.
+      // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+      // Reduce horizontally and add, i.e.
+      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+      // Divide down by (1 << FILTER_BITS), rounding to nearest.
+      const __m128i shifted_32 =
+          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+      // Pack 32-bit values into 16-bit values, i.e.
+      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+      // Clip the values at (1 << bd) - 1
+      const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);
+
+      // Write to the output
+      xx_storel_64(&dst_y[x], clipped_16);
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
index 1f0fedb2a..6747cae01 100644
--- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -12,135 +12,16 @@
 #include <assert.h>
 #include <smmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "av1/common/convolve.h"
 
-// Make a mask for coefficients of 10/12 tap filters. The coefficients are
-// packed "89ab89ab". If it's a 12-tap filter, we want all 1's; if it's a
-// 10-tap filter, we want "11001100" to just match the 8,9 terms.
-static __m128i make_1012_mask(int ntaps) {
-  uint32_t low = 0xffffffff;
-  uint32_t high = (ntaps == 12) ? low : 0;
-  return _mm_set_epi32(high, low, high, low);
-}
-
-// Zero-extend the given input operand to an entire __m128i register.
-//
-// Note that there's almost an intrinsic to do this but 32-bit Visual Studio
-// doesn't have _mm_set_epi64x so we have to do it by hand.
-static __m128i extend_32_to_128(uint32_t x) {
-  return _mm_set_epi32(0, 0, 0, x);
-}
-
-// Load an SSE register from p and bitwise AND with a.
-static __m128i load_and_128i(const void *p, __m128i a) {
-  const __m128d ad = _mm_castsi128_pd(a);
-  const __m128d bd = _mm_load1_pd((const double *)p);
-  return _mm_castpd_si128(_mm_and_pd(ad, bd));
-}
-
-// The horizontal filter for av1_convolve_2d_scale_sse4_1. This is the more
-// general version, supporting 10 and 12 tap filters. For 8-tap filters, use
-// hfilter8.
-static void hfilter(const uint8_t *src, int src_stride, int32_t *dst, int w,
-                    int h, int subpel_x_qn, int x_step_qn,
-                    const InterpFilterParams *filter_params, unsigned round) {
-  const int bd = 8;
-  const int ntaps = filter_params->taps;
-  assert(ntaps == 10 || ntaps == 12);
-
-  src -= ntaps / 2 - 1;
-
-  // Construct a mask with which we'll AND filter coefficients 89ab89ab to zero
-  // out the unneeded entries.
-  const __m128i hicoeff_mask = make_1012_mask(ntaps);
-
-  int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
-  const __m128i round_add = _mm_set1_epi32(round_add32);
-  const __m128i round_shift = extend_32_to_128(round);
-
-  int x_qn = subpel_x_qn;
-  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
-    const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
-    const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-    assert(filter_idx < SUBPEL_SHIFTS);
-    const int16_t *filter =
-        av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
-
-    // The "lo" coefficients are coefficients 0..7. For a 12-tap filter, the
-    // "hi" coefficients are arranged as 89ab89ab. For a 10-tap filter, they
-    // are masked out with hicoeff_mask.
-    const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
-    const __m128i coeffhi = load_and_128i(filter + 8, hicoeff_mask);
-    const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
-
-    int y;
-    for (y = 0; y <= h - 4; y += 4) {
-      const uint8_t *const src0 = src_col + y * src_stride;
-      const uint8_t *const src1 = src0 + 1 * src_stride;
-      const uint8_t *const src2 = src0 + 2 * src_stride;
-      const uint8_t *const src3 = src0 + 3 * src_stride;
-
-      // Load up source data. This is 8-bit input data, so each load gets 16
-      // pixels (we need at most 12)
-      const __m128i data08 = _mm_loadu_si128((__m128i *)src0);
-      const __m128i data18 = _mm_loadu_si128((__m128i *)src1);
-      const __m128i data28 = _mm_loadu_si128((__m128i *)src2);
-      const __m128i data38 = _mm_loadu_si128((__m128i *)src3);
-
-      // Now zero-extend up to 16-bit precision by interleaving with zeros. For
-      // the "high" pixels (8 to 11), interleave first (so that the expansion
-      // to 16-bits operates on an entire register).
-      const __m128i data0lo = _mm_unpacklo_epi8(data08, zero);
-      const __m128i data1lo = _mm_unpacklo_epi8(data18, zero);
-      const __m128i data2lo = _mm_unpacklo_epi8(data28, zero);
-      const __m128i data3lo = _mm_unpacklo_epi8(data38, zero);
-      const __m128i data01hi8 = _mm_unpackhi_epi32(data08, data18);
-      const __m128i data23hi8 = _mm_unpackhi_epi32(data28, data38);
-      const __m128i data01hi = _mm_unpacklo_epi8(data01hi8, zero);
-      const __m128i data23hi = _mm_unpacklo_epi8(data23hi8, zero);
-
-      // Multiply by coefficients
-      const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
-      const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
-      const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
-      const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
-      const __m128i conv01hi = _mm_madd_epi16(data01hi, coeffhi);
-      const __m128i conv23hi = _mm_madd_epi16(data23hi, coeffhi);
-
-      // Reduce horizontally and add
-      const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
-      const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
-      const __m128i convlo = _mm_hadd_epi32(conv01lo, conv23lo);
-      const __m128i convhi = _mm_hadd_epi32(conv01hi, conv23hi);
-      const __m128i conv = _mm_add_epi32(convlo, convhi);
-
-      // Divide down by (1 << round), rounding to nearest.
-      const __m128i shifted =
-          _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
-
-      // Write transposed to the output
-      _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
-    }
-    for (; y < h; ++y) {
-      const uint8_t *const src_row = src_col + y * src_stride;
-
-      int32_t sum = (1 << (bd + FILTER_BITS - 1));
-      for (int k = 0; k < ntaps; ++k) {
-        sum += filter[k] * src_row[k];
-      }
-
-      dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
-    }
-  }
-}
-
 // A specialised version of hfilter, the horizontal filter for
 // av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
-static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w,
+static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
                      int h, int subpel_x_qn, int x_step_qn,
                      const InterpFilterParams *filter_params, unsigned round) {
   const int bd = 8;
@@ -150,7 +31,7 @@ static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w,
 
   int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
   const __m128i round_add = _mm_set1_epi32(round_add32);
-  const __m128i round_shift = extend_32_to_128(round);
+  const __m128i round_shift = _mm_cvtsi32_si128(round);
 
   int x_qn = subpel_x_qn;
   for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
@@ -197,11 +78,12 @@ static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w,
       const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
 
       // Divide down by (1 << round), rounding to nearest.
-      const __m128i shifted =
+      __m128i shifted =
           _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
 
+      shifted = _mm_packus_epi32(shifted, shifted);
       // Write transposed to the output
-      _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
+      _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
     }
     for (; y < h; ++y) {
       const uint8_t *const src_row = src_col + y * src_stride;
@@ -216,256 +98,179 @@ static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w,
   }
 }
 
-// Do a 12-tap convolution with the given coefficients, loading data from src.
-static __m128i convolve_32(const int32_t *src, __m128i coeff03, __m128i coeff47,
-                           __m128i coeff8d) {
-  const __m128i data03 = _mm_loadu_si128((__m128i *)src);
-  const __m128i data47 = _mm_loadu_si128((__m128i *)(src + 4));
-  const __m128i data8d = _mm_loadu_si128((__m128i *)(src + 8));
-  const __m128i conv03 = _mm_mullo_epi32(data03, coeff03);
-  const __m128i conv47 = _mm_mullo_epi32(data47, coeff47);
-  const __m128i conv8d = _mm_mullo_epi32(data8d, coeff8d);
-  return _mm_add_epi32(_mm_add_epi32(conv03, conv47), conv8d);
-}
-
-// Do an 8-tap convolution with the given coefficients, loading data from src.
-static __m128i convolve_32_8(const int32_t *src, __m128i coeff03,
-                             __m128i coeff47) {
-  const __m128i data03 = _mm_loadu_si128((__m128i *)src);
-  const __m128i data47 = _mm_loadu_si128((__m128i *)(src + 4));
-  const __m128i conv03 = _mm_mullo_epi32(data03, coeff03);
-  const __m128i conv47 = _mm_mullo_epi32(data47, coeff47);
-  return _mm_add_epi32(conv03, conv47);
-}
-
-// The vertical filter for av1_convolve_2d_scale_sse4_1. This is the more
-// general version, supporting 10 and 12 tap filters. For 8-tap filters, use
-// vfilter8.
-static void vfilter(const int32_t *src, int src_stride, int32_t *dst,
-                    int dst_stride, int w, int h, int subpel_y_qn,
-                    int y_step_qn, const InterpFilterParams *filter_params,
-                    const ConvolveParams *conv_params, int bd) {
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int ntaps = filter_params->taps;
-
-  // Construct a mask with which we'll AND filter coefficients 89ab to zero out
-  // the unneeded entries. The upper bits of this mask are unused.
-  const __m128i hicoeff_mask = make_1012_mask(ntaps);
-
-  int32_t round_add32 = (1 << conv_params->round_1) / 2 + (1 << offset_bits);
-  const __m128i round_add = _mm_set1_epi32(round_add32);
-  const __m128i round_shift = extend_32_to_128(conv_params->round_1);
-
-  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
-                         (1 << (offset_bits - conv_params->round_1 - 1)));
-  const __m128i sub = _mm_set1_epi32(sub32);
-
-  int y_qn = subpel_y_qn;
-  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
-    const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
-    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-    assert(filter_idx < SUBPEL_SHIFTS);
-    const int16_t *filter =
-        av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
-
-    // Load up coefficients for the filter and sign-extend to 32-bit precision
-    // (to do so, calculate sign bits and then interleave)
-    const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
-    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
-    const __m128i coeffhi16 = load_and_128i(filter + 8, hicoeff_mask);
-    const __m128i csign0716 = _mm_cmplt_epi16(coeff0716, zero);
-    const __m128i csignhi16 = _mm_cmplt_epi16(coeffhi16, zero);
-    const __m128i coeff03 = _mm_unpacklo_epi16(coeff0716, csign0716);
-    const __m128i coeff47 = _mm_unpackhi_epi16(coeff0716, csign0716);
-    const __m128i coeff8d = _mm_unpacklo_epi16(coeffhi16, csignhi16);
-
-    int x;
-    for (x = 0; x <= w - 4; x += 4) {
-      const int32_t *const src0 = src_y + x * src_stride;
-      const int32_t *const src1 = src0 + 1 * src_stride;
-      const int32_t *const src2 = src0 + 2 * src_stride;
-      const int32_t *const src3 = src0 + 3 * src_stride;
-
-      // Load the source data for the three rows, adding the three registers of
-      // convolved products to one as we go (conv0..conv3) to avoid the
-      // register pressure getting too high.
-      const __m128i conv0 = convolve_32(src0, coeff03, coeff47, coeff8d);
-      const __m128i conv1 = convolve_32(src1, coeff03, coeff47, coeff8d);
-      const __m128i conv2 = convolve_32(src2, coeff03, coeff47, coeff8d);
-      const __m128i conv3 = convolve_32(src3, coeff03, coeff47, coeff8d);
-
-      // Now reduce horizontally to get one lane for each result
-      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
-      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
-      const __m128i conv = _mm_hadd_epi32(conv01, conv23);
-
-      // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
-      const __m128i shifted =
-          _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
-      const __m128i subbed = _mm_sub_epi32(shifted, sub);
-
-      int32_t *dst_x = dst + y * dst_stride + x;
-      const __m128i result =
-          (conv_params->do_average)
-              ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
-              : subbed;
-
-      _mm_storeu_si128((__m128i *)dst_x, result);
-    }
-    for (; x < w; ++x) {
-      const int32_t *src_x = src_y + x * src_stride;
-      CONV_BUF_TYPE sum = 1 << offset_bits;
-      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
-        dst[y * dst_stride + x] = res;
-    }
-  }
+static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
+  __m128i data = _mm_loadu_si128((__m128i *)src);
+  return _mm_madd_epi16(data, coeff);
 }
 
 // A specialised version of vfilter, the vertical filter for
 // av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
-static void vfilter8(const int32_t *src, int src_stride, int32_t *dst,
+static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
                      int dst_stride, int w, int h, int subpel_y_qn,
                      int y_step_qn, const InterpFilterParams *filter_params,
                      const ConvolveParams *conv_params, int bd) {
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   const int ntaps = 8;
 
-  int32_t round_add32 = (1 << conv_params->round_1) / 2 + (1 << offset_bits);
-  const __m128i round_add = _mm_set1_epi32(round_add32);
-  const __m128i round_shift = extend_32_to_128(conv_params->round_1);
+  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
 
   const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
                          (1 << (offset_bits - conv_params->round_1 - 1)));
-  const __m128i sub = _mm_set1_epi32(sub32);
+  const __m128i sub = _mm_set1_epi16(sub32);
+
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int dst16_stride = conv_params->dst_stride;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+  const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1));
+  const __m128i round_shift_add =
+      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi16(w0);
+  const __m128i wt1 = _mm_set1_epi16(w1);
+  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
 
   int y_qn = subpel_y_qn;
   for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
-    const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
     const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
     assert(filter_idx < SUBPEL_SHIFTS);
     const int16_t *filter =
         av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
 
-    // Load up coefficients for the filter and sign-extend to 32-bit precision
-    // (to do so, calculate sign bits and then interleave)
-    const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
     const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
-    const __m128i csign0716 = _mm_cmplt_epi16(coeff0716, zero);
-    const __m128i coeff03 = _mm_unpacklo_epi16(coeff0716, csign0716);
-    const __m128i coeff47 = _mm_unpackhi_epi16(coeff0716, csign0716);
-
     int x;
     for (x = 0; x <= w - 4; x += 4) {
-      const int32_t *const src0 = src_y + x * src_stride;
-      const int32_t *const src1 = src0 + 1 * src_stride;
-      const int32_t *const src2 = src0 + 2 * src_stride;
-      const int32_t *const src3 = src0 + 3 * src_stride;
+      const int16_t *const src0 = src_y + x * src_stride;
+      const int16_t *const src1 = src0 + 1 * src_stride;
+      const int16_t *const src2 = src0 + 2 * src_stride;
+      const int16_t *const src3 = src0 + 3 * src_stride;
 
       // Load the source data for the three rows, adding the three registers of
       // convolved products to one as we go (conv0..conv3) to avoid the
       // register pressure getting too high.
-      const __m128i conv0 = convolve_32_8(src0, coeff03, coeff47);
-      const __m128i conv1 = convolve_32_8(src1, coeff03, coeff47);
-      const __m128i conv2 = convolve_32_8(src2, coeff03, coeff47);
-      const __m128i conv3 = convolve_32_8(src3, coeff03, coeff47);
+      const __m128i conv0 = convolve_16_8(src0, coeff0716);
+      const __m128i conv1 = convolve_16_8(src1, coeff0716);
+      const __m128i conv2 = convolve_16_8(src2, coeff0716);
+      const __m128i conv3 = convolve_16_8(src3, coeff0716);
 
       // Now reduce horizontally to get one lane for each result
       const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
       const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
-      const __m128i conv = _mm_hadd_epi32(conv01, conv23);
+      __m128i conv = _mm_hadd_epi32(conv01, conv23);
 
+      conv = _mm_add_epi32(conv, res_add_const);
       // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
-      const __m128i shifted =
-          _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
-      const __m128i subbed = _mm_sub_epi32(shifted, sub);
-
-      int32_t *dst_x = dst + y * dst_stride + x;
-      const __m128i result =
-          (conv_params->do_average)
-              ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
-              : subbed;
-
-      _mm_storeu_si128((__m128i *)dst_x, result);
+      __m128i shifted =
+          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+      uint8_t *dst_x = dst + y * dst_stride + x;
+      CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+      __m128i result;
+      __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
+          if (conv_params->use_jnt_comp_avg) {
+            const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
+            const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
+            const __m128i shifted_32 =
+                _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+            shifted_16 = _mm_packus_epi32(shifted_32, shifted_32);
+          } else {
+            shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1);
+          }
+          const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+          result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+          const __m128i result_8 = _mm_packus_epi16(result, result);
+          *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
+        } else {
+          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+        }
+      } else {
+        const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+        result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+        const __m128i result_8 = _mm_packus_epi16(result, result);
+        *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
+      }
     }
     for (; x < w; ++x) {
-      const int32_t *src_x = src_y + x * src_stride;
-      CONV_BUF_TYPE sum = 1 << offset_bits;
+      const int16_t *src_x = src_y + x * src_stride;
+      int32_t sum = 1 << offset_bits;
       for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
-        dst[y * dst_stride + x] = res;
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          int32_t tmp = dst16[y * dst16_stride + x];
+          if (conv_params->use_jnt_comp_avg) {
+            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+            tmp = tmp >> DIST_PRECISION_BITS;
+          } else {
+            tmp += res;
+            tmp = tmp >> 1;
+          }
+          /* Subtract round offset and convolve round */
+          tmp = tmp - sub32;
+          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+        } else {
+          dst16[y * dst16_stride + x] = res;
+        }
+      } else {
+        /* Subtract round offset and convolve round */
+        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+                             (1 << (offset_bits - conv_params->round_1 - 1)));
+        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+      }
     }
   }
 }
-
 void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
-                                  CONV_BUF_TYPE *dst, int dst_stride, int w,
-                                  int h, InterpFilterParams *filter_params_x,
+                                  uint8_t *dst8, int dst8_stride, int w, int h,
+                                  InterpFilterParams *filter_params_x,
                                   InterpFilterParams *filter_params_y,
                                   const int subpel_x_qn, const int x_step_qn,
                                   const int subpel_y_qn, const int y_step_qn,
                                   ConvolveParams *conv_params) {
-  int32_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+  // TODO(yaowu): remove unnecessary initializations
+  int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 };
   int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
              filter_params_y->taps;
 
   const int xtaps = filter_params_x->taps;
   const int ytaps = filter_params_y->taps;
-
   const int fo_vert = ytaps / 2 - 1;
+  assert((xtaps == 8) && (ytaps == 8));
+  (void)xtaps;
 
   // horizontal filter
-  if (xtaps == 8)
-    hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
-             x_step_qn, filter_params_x, conv_params->round_0);
-  else
-    hfilter(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
-            x_step_qn, filter_params_x, conv_params->round_0);
+  hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
+           x_step_qn, filter_params_x, conv_params->round_0);
 
   // vertical filter (input is transposed)
-  if (ytaps == 8)
-    vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
-             filter_params_y, conv_params, 8);
-  else
-    vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
-            filter_params_y, conv_params, 8);
+  vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
+           filter_params_y, conv_params, 8);
 }
 
-#if CONFIG_HIGHBITDEPTH
-// An wrapper to generate the SHUFPD instruction with __m128i types (just
-// writing _mm_shuffle_pd at the callsites gets a bit ugly because of the
-// casts)
-static __m128i mm_shuffle0_si128(__m128i a, __m128i b) {
-  __m128d ad = _mm_castsi128_pd(a);
-  __m128d bd = _mm_castsi128_pd(b);
-  return _mm_castpd_si128(_mm_shuffle_pd(ad, bd, 0));
-}
-
-// The horizontal filter for av1_highbd_convolve_2d_scale_sse4_1. This
-// is the more general version, supporting 10 and 12 tap filters. For
-// 8-tap filters, use hfilter8.
-static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst,
-                           int w, int h, int subpel_x_qn, int x_step_qn,
-                           const InterpFilterParams *filter_params,
-                           unsigned round, int bd) {
-  const int ntaps = filter_params->taps;
-  assert(ntaps == 10 || ntaps == 12);
+// A specialised version of hfilter, the horizontal filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
+                            int w, int h, int subpel_x_qn, int x_step_qn,
+                            const InterpFilterParams *filter_params,
+                            unsigned round, int bd) {
+  const int ntaps = 8;
 
   src -= ntaps / 2 - 1;
 
-  // Construct a mask with which we'll AND filter coefficients 89ab89ab to zero
-  // out the unneeded entries.
-  const __m128i hicoeff_mask = make_1012_mask(ntaps);
-
   int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
   const __m128i round_add = _mm_set1_epi32(round_add32);
-  const __m128i round_shift = extend_32_to_128(round);
+  const __m128i round_shift = _mm_cvtsi32_si128(round);
 
   int x_qn = subpel_x_qn;
   for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
@@ -475,11 +280,8 @@ static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst,
     const int16_t *filter =
         av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
 
-    // The "lo" coefficients are coefficients 0..7. For a 12-tap filter, the
-    // "hi" coefficients are arranged as 89ab89ab. For a 10-tap filter, they
-    // are masked out with hicoeff_mask.
+    // Load the filter coefficients
     const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
-    const __m128i coeffhi = load_and_128i(filter + 8, hicoeff_mask);
 
     int y;
     for (y = 0; y <= h - 4; y += 4) {
@@ -488,43 +290,31 @@ static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst,
       const uint16_t *const src2 = src0 + 2 * src_stride;
       const uint16_t *const src3 = src0 + 3 * src_stride;
 
-      // Load up source data. This is 16-bit input data, so each load gets 8
-      // pixels (we need at most 12)
+      // Load up source data. This is 16-bit input data, so each load gets the 8
+      // pixels we need.
       const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
       const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
       const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
       const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
-      const __m128i data0hi = _mm_loadu_si128((__m128i *)(src0 + 8));
-      const __m128i data1hi = _mm_loadu_si128((__m128i *)(src1 + 8));
-      const __m128i data2hi = _mm_loadu_si128((__m128i *)(src2 + 8));
-      const __m128i data3hi = _mm_loadu_si128((__m128i *)(src3 + 8));
-
-      // The "hi" data has rubbish in the top half so interleave pairs together
-      // to minimise the calculation we need to do.
-      const __m128i data01hi = mm_shuffle0_si128(data0hi, data1hi);
-      const __m128i data23hi = mm_shuffle0_si128(data2hi, data3hi);
 
       // Multiply by coefficients
       const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
       const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
       const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
       const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
-      const __m128i conv01hi = _mm_madd_epi16(data01hi, coeffhi);
-      const __m128i conv23hi = _mm_madd_epi16(data23hi, coeffhi);
 
       // Reduce horizontally and add
       const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
       const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
-      const __m128i convlo = _mm_hadd_epi32(conv01lo, conv23lo);
-      const __m128i convhi = _mm_hadd_epi32(conv01hi, conv23hi);
-      const __m128i conv = _mm_add_epi32(convlo, convhi);
+      const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
 
       // Divide down by (1 << round), rounding to nearest.
-      const __m128i shifted =
+      __m128i shifted =
           _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
 
+      shifted = _mm_packus_epi32(shifted, shifted);
       // Write transposed to the output
-      _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
+      _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
     }
     for (; y < h; ++y) {
       const uint16_t *const src_row = src_col + y * src_stride;
@@ -538,108 +328,173 @@ static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst,
     }
   }
 }
-
-// A specialised version of hfilter, the horizontal filter for
+// A specialised version of vfilter, the vertical filter for
 // av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
 // filters.
-static void highbd_hfilter8(const uint16_t *src, int src_stride, int32_t *dst,
-                            int w, int h, int subpel_x_qn, int x_step_qn,
+static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
+                            int dst_stride, int w, int h, int subpel_y_qn,
+                            int y_step_qn,
                             const InterpFilterParams *filter_params,
-                            unsigned round, int bd) {
+                            const ConvolveParams *conv_params, int bd) {
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   const int ntaps = 8;
 
-  src -= ntaps / 2 - 1;
+  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
 
-  int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
-  const __m128i round_add = _mm_set1_epi32(round_add32);
-  const __m128i round_shift = extend_32_to_128(round);
+  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+                         (1 << (offset_bits - conv_params->round_1 - 1)));
+  const __m128i sub = _mm_set1_epi32(sub32);
 
-  int x_qn = subpel_x_qn;
-  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
-    const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
-    const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int dst16_stride = conv_params->dst_stride;
+  const __m128i clip_pixel_ =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+  const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1));
+  const __m128i round_shift_add =
+      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+
+  int y_qn = subpel_y_qn;
+  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
     assert(filter_idx < SUBPEL_SHIFTS);
     const int16_t *filter =
         av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
 
-    // Load the filter coefficients
-    const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
-
-    int y;
-    for (y = 0; y <= h - 4; y += 4) {
-      const uint16_t *const src0 = src_col + y * src_stride;
-      const uint16_t *const src1 = src0 + 1 * src_stride;
-      const uint16_t *const src2 = src0 + 2 * src_stride;
-      const uint16_t *const src3 = src0 + 3 * src_stride;
-
-      // Load up source data. This is 16-bit input data, so each load gets the 8
-      // pixels we need.
-      const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
-      const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
-      const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
-      const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
-
-      // Multiply by coefficients
-      const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
-      const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
-      const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
-      const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
+    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+    int x;
+    for (x = 0; x <= w - 4; x += 4) {
+      const int16_t *const src0 = src_y + x * src_stride;
+      const int16_t *const src1 = src0 + 1 * src_stride;
+      const int16_t *const src2 = src0 + 2 * src_stride;
+      const int16_t *const src3 = src0 + 3 * src_stride;
 
-      // Reduce horizontally and add
-      const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
-      const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
-      const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
+      // Load the source data for the three rows, adding the three registers of
+      // convolved products to one as we go (conv0..conv3) to avoid the
+      // register pressure getting too high.
+      const __m128i conv0 = convolve_16_8(src0, coeff0716);
+      const __m128i conv1 = convolve_16_8(src1, coeff0716);
+      const __m128i conv2 = convolve_16_8(src2, coeff0716);
+      const __m128i conv3 = convolve_16_8(src3, coeff0716);
 
-      // Divide down by (1 << round), rounding to nearest.
-      const __m128i shifted =
-          _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+      // Now reduce horizontally to get one lane for each result
+      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+      __m128i conv = _mm_hadd_epi32(conv01, conv23);
+      conv = _mm_add_epi32(conv, res_add_const);
 
-      // Write transposed to the output
-      _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
+      // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
+      __m128i shifted =
+          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+      uint16_t *dst_x = dst + y * dst_stride + x;
+      CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+
+      __m128i result;
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          __m128i p_32 =
+              _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
+
+          if (conv_params->use_jnt_comp_avg) {
+            shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+                                    _mm_mullo_epi32(shifted, wt1));
+            shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
+          } else {
+            shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1);
+          }
+          __m128i res32 = _mm_sub_epi32(shifted, sub);
+          res32 = _mm_sra_epi32(_mm_add_epi32(res32, round_bits_const),
+                                round_bits_shift);
+
+          __m128i res16 = _mm_packus_epi32(res32, res32);
+          res16 = _mm_min_epi16(res16, clip_pixel_);
+          _mm_storel_epi64((__m128i *)dst_x, res16);
+        } else {
+          __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+        }
+      } else {
+        const __m128i subbed = _mm_sub_epi32(shifted, sub);
+        result = _mm_sra_epi16(_mm_add_epi32(subbed, bits_const), bits_shift);
+        result = _mm_packus_epi32(result, result);
+        result = _mm_min_epi16(result, clip_pixel_);
+        _mm_storel_epi64((__m128i *)dst_x, result);
+      }
     }
-    for (; y < h; ++y) {
-      const uint16_t *const src_row = src_col + y * src_stride;
 
-      int32_t sum = (1 << (bd + FILTER_BITS - 1));
-      for (int k = 0; k < ntaps; ++k) {
-        sum += filter[k] * src_row[k];
+    for (; x < w; ++x) {
+      const int16_t *src_x = src_y + x * src_stride;
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          int32_t tmp = dst16[y * dst16_stride + x];
+          if (conv_params->use_jnt_comp_avg) {
+            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+            tmp = tmp >> DIST_PRECISION_BITS;
+          } else {
+            tmp += res;
+            tmp = tmp >> 1;
+          }
+          /* Subtract round offset and convolve round */
+          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+                       (1 << (offset_bits - conv_params->round_1 - 1)));
+          dst[y * dst_stride + x] =
+              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+        } else {
+          dst16[y * dst16_stride + x] = res;
+        }
+      } else {
+        /* Subtract round offset and convolve round */
+        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+                             (1 << (offset_bits - conv_params->round_1 - 1)));
+        dst[y * dst_stride + x] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
       }
-
-      dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
     }
   }
 }
 
 void av1_highbd_convolve_2d_scale_sse4_1(
-    const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride,
-    int w, int h, InterpFilterParams *filter_params_x,
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, InterpFilterParams *filter_params_x,
     InterpFilterParams *filter_params_y, const int subpel_x_qn,
     const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
     ConvolveParams *conv_params, int bd) {
-  int32_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+  // TODO(yaowu): Move this out of stack
+  DECLARE_ALIGNED(16, int16_t,
+                  tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
   int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
              filter_params_y->taps;
-
   const int xtaps = filter_params_x->taps;
   const int ytaps = filter_params_y->taps;
   const int fo_vert = ytaps / 2 - 1;
 
+  memset(tmp, 0, sizeof(tmp));
+  assert((xtaps == 8) && (ytaps == 8));
+  (void)xtaps;
+
   // horizontal filter
-  if (xtaps == 8)
-    highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
-                    subpel_x_qn, x_step_qn, filter_params_x,
-                    conv_params->round_0, bd);
-  else
-    highbd_hfilter(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
-                   subpel_x_qn, x_step_qn, filter_params_x,
-                   conv_params->round_0, bd);
+  highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
+                  subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0,
+                  bd);
 
   // vertical filter (input is transposed)
-  if (ytaps == 8)
-    vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
-             filter_params_y, conv_params, bd);
-  else
-    vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
-            filter_params_y, conv_params, bd);
+  highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
+                  filter_params_y, conv_params, bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
deleted file mode 100644
index e85c15eaf..000000000
--- a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
+++ /dev/null
@@ -1,1034 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <tmmintrin.h>
-
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
-#include "av1/common/filter.h"
-
-#define WIDTH_BOUND (16)
-#define HEIGHT_BOUND (16)
-
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-DECLARE_ALIGNED(16, static int8_t,
-                sub_pel_filters_12sharp_signal_dir[15][2][16]);
-
-DECLARE_ALIGNED(16, static int8_t,
-                sub_pel_filters_12sharp_ver_signal_dir[15][6][16]);
-#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-
-#if USE_TEMPORALFILTER_12TAP
-DECLARE_ALIGNED(16, static int8_t,
-                sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]);
-
-DECLARE_ALIGNED(16, static int8_t,
-                sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]);
-#endif
-
-typedef int8_t (*SubpelFilterCoeffs)[16];
-
-static INLINE SubpelFilterCoeffs
-get_subpel_filter_signal_dir(const InterpFilterParams p, int index) {
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-  if (p.interp_filter == MULTITAP_SHARP) {
-    return &sub_pel_filters_12sharp_signal_dir[index][0];
-  }
-#endif
-#if USE_TEMPORALFILTER_12TAP
-  if (p.interp_filter == TEMPORALFILTER_12TAP) {
-    return &sub_pel_filters_temporalfilter_12_signal_dir[index][0];
-  }
-#endif
-  (void)p;
-  (void)index;
-  return NULL;
-}
-
-static INLINE SubpelFilterCoeffs
-get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-  if (p.interp_filter == MULTITAP_SHARP) {
-    return &sub_pel_filters_12sharp_ver_signal_dir[index][0];
-  }
-#endif
-#if USE_TEMPORALFILTER_12TAP
-  if (p.interp_filter == TEMPORALFILTER_12TAP) {
-    return &sub_pel_filters_temporalfilter_12_ver_signal_dir[index][0];
-  }
-#endif
-  (void)p;
-  (void)index;
-  return NULL;
-}
-
-static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
-  __m128i t0, t1;
-
-  t0 = _mm_unpacklo_epi16(in[0], in[1]);
-  t1 = _mm_unpacklo_epi16(in[2], in[3]);
-
-  out[0] = _mm_unpacklo_epi32(t0, t1);
-  out[1] = _mm_srli_si128(out[0], 8);
-  out[2] = _mm_unpackhi_epi32(t0, t1);
-  out[3] = _mm_srli_si128(out[2], 8);
-
-  t0 = _mm_unpackhi_epi16(in[0], in[1]);
-  t1 = _mm_unpackhi_epi16(in[2], in[3]);
-
-  out[4] = _mm_unpacklo_epi32(t0, t1);
-  out[5] = _mm_srli_si128(out[4], 8);
-  // Note: We ignore out[6] and out[7] because
-  // they're zero vectors.
-}
-
-typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst);
-
-static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i y = _mm_loadl_epi64((__m128i const *)src);
-  y = _mm_unpacklo_epi8(y, zero);
-  y = _mm_add_epi16(*x, y);
-  y = _mm_add_epi16(y, one);
-  y = _mm_srai_epi16(y, 1);
-  y = _mm_packus_epi16(y, y);
-  return y;
-}
-
-static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) {
-  uint32_t temp;
-  __m128i u = _mm_packus_epi16(*x, *x);
-  temp = _mm_cvtsi128_si32(u);
-  *(uint16_t *)dst = (uint16_t)temp;
-}
-
-static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) {
-  uint32_t temp;
-  __m128i y = accumulate_store(x, dst);
-  temp = _mm_cvtsi128_si32(y);
-  *(uint16_t *)dst = (uint16_t)temp;
-}
-
-static store_pixel_t store2pixelTab[2] = { store_2_pixel_only,
-                                           accumulate_store_2_pixel };
-
-static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) {
-  __m128i u = _mm_packus_epi16(*x, *x);
-  *(int *)dst = _mm_cvtsi128_si32(u);
-}
-
-static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) {
-  __m128i y = accumulate_store(x, dst);
-  *(int *)dst = _mm_cvtsi128_si32(y);
-}
-
-static store_pixel_t store4pixelTab[2] = { store_4_pixel_only,
-                                           accumulate_store_4_pixel };
-
-static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                           store_pixel_t store_func, uint8_t *dst) {
-  __m128i sumPairRow[4];
-  __m128i sumPairCol[8];
-  __m128i pixel;
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i zero = _mm_setzero_si128();
-
-  assert(tapsNum == 10 || tapsNum == 12);
-  if (10 == tapsNum) {
-    src -= 1;
-  }
-
-  pixel = _mm_loadu_si128((__m128i const *)src);
-  sumPairRow[0] = _mm_maddubs_epi16(pixel, f[0]);
-  sumPairRow[2] = _mm_maddubs_epi16(pixel, f[1]);
-  sumPairRow[2] = _mm_srli_si128(sumPairRow[2], 2);
-
-  pixel = _mm_loadu_si128((__m128i const *)(src + 1));
-  sumPairRow[1] = _mm_maddubs_epi16(pixel, f[0]);
-  sumPairRow[3] = _mm_maddubs_epi16(pixel, f[1]);
-  sumPairRow[3] = _mm_srli_si128(sumPairRow[3], 2);
-
-  transpose_4x8(sumPairRow, sumPairCol);
-
-  sumPairRow[0] = _mm_adds_epi16(sumPairCol[0], sumPairCol[1]);
-  sumPairRow[1] = _mm_adds_epi16(sumPairCol[4], sumPairCol[5]);
-
-  sumPairRow[2] = _mm_min_epi16(sumPairCol[2], sumPairCol[3]);
-  sumPairRow[3] = _mm_max_epi16(sumPairCol[2], sumPairCol[3]);
-
-  sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[1]);
-  sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[2]);
-  sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]);
-
-  sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
-  sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]);
-  sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero);
-
-  store_func(&sumPairRow[1], dst);
-}
-
-static void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                           store_pixel_t store, uint8_t *buf) {
-  horiz_w4_ssse3(src, f, tapsNum, store, buf);
-  src += 4;
-  buf += 4;
-  horiz_w4_ssse3(src, f, tapsNum, store, buf);
-}
-
-static void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                            store_pixel_t store, uint8_t *buf) {
-  horiz_w8_ssse3(src, f, tapsNum, store, buf);
-  src += 8;
-  buf += 8;
-  horiz_w8_ssse3(src, f, tapsNum, store, buf);
-}
-
-static void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                            store_pixel_t store, uint8_t *buf) {
-  horiz_w16_ssse3(src, f, tapsNum, store, buf);
-  src += 16;
-  buf += 16;
-  horiz_w16_ssse3(src, f, tapsNum, store, buf);
-}
-
-static void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                            store_pixel_t store, uint8_t *buf) {
-  horiz_w32_ssse3(src, f, tapsNum, store, buf);
-  src += 32;
-  buf += 32;
-  horiz_w32_ssse3(src, f, tapsNum, store, buf);
-}
-
-static void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                             store_pixel_t store, uint8_t *buf) {
-  horiz_w64_ssse3(src, f, tapsNum, store, buf);
-  src += 64;
-  buf += 64;
-  horiz_w64_ssse3(src, f, tapsNum, store, buf);
-}
-
-static void (*horizTab[6])(const uint8_t *, const __m128i *, int, store_pixel_t,
-                           uint8_t *) = {
-  horiz_w4_ssse3,  horiz_w8_ssse3,  horiz_w16_ssse3,
-  horiz_w32_ssse3, horiz_w64_ssse3, horiz_w128_ssse3,
-};
-
-static void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum,
-                               int width, store_pixel_t store, uint8_t *dst) {
-  switch (width) {
-    // Note:
-    // For width=2 and 4, store function must be different
-    case 2:
-    case 4: horizTab[0](src, f, tapsNum, store, dst); break;
-    case 8: horizTab[1](src, f, tapsNum, store, dst); break;
-    case 16: horizTab[2](src, f, tapsNum, store, dst); break;
-    case 32: horizTab[3](src, f, tapsNum, store, dst); break;
-    case 64: horizTab[4](src, f, tapsNum, store, dst); break;
-    case 128: horizTab[5](src, f, tapsNum, store, dst); break;
-    default: assert(0);
-  }
-}
-
-// Vertical 8-pixel parallel
-typedef void (*transpose_to_dst_t)(const uint16_t *src, int src_stride,
-                                   uint8_t *dst, int dst_stride);
-
-static INLINE void transpose8x8_direct_to_dst(const uint16_t *src,
-                                              int src_stride, uint8_t *dst,
-                                              int dst_stride) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  __m128i v0, v1, v2, v3;
-
-  __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-  __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
-  __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
-  __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
-  __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
-
-  u0 = _mm_mulhrs_epi16(u0, k_256);
-  u1 = _mm_mulhrs_epi16(u1, k_256);
-  u2 = _mm_mulhrs_epi16(u2, k_256);
-  u3 = _mm_mulhrs_epi16(u3, k_256);
-  u4 = _mm_mulhrs_epi16(u4, k_256);
-  u5 = _mm_mulhrs_epi16(u5, k_256);
-  u6 = _mm_mulhrs_epi16(u6, k_256);
-  u7 = _mm_mulhrs_epi16(u7, k_256);
-
-  v0 = _mm_packus_epi16(u0, u1);
-  v1 = _mm_packus_epi16(u2, u3);
-  v2 = _mm_packus_epi16(u4, u5);
-  v3 = _mm_packus_epi16(u6, u7);
-
-  u0 = _mm_unpacklo_epi8(v0, v1);
-  u1 = _mm_unpackhi_epi8(v0, v1);
-  u2 = _mm_unpacklo_epi8(v2, v3);
-  u3 = _mm_unpackhi_epi8(v2, v3);
-
-  u4 = _mm_unpacklo_epi8(u0, u1);
-  u5 = _mm_unpacklo_epi8(u2, u3);
-  u6 = _mm_unpackhi_epi8(u0, u1);
-  u7 = _mm_unpackhi_epi8(u2, u3);
-
-  u0 = _mm_unpacklo_epi32(u4, u5);
-  u1 = _mm_unpackhi_epi32(u4, u5);
-  u2 = _mm_unpacklo_epi32(u6, u7);
-  u3 = _mm_unpackhi_epi32(u6, u7);
-
-  u4 = _mm_srli_si128(u0, 8);
-  u5 = _mm_srli_si128(u1, 8);
-  u6 = _mm_srli_si128(u2, 8);
-  u7 = _mm_srli_si128(u3, 8);
-
-  _mm_storel_epi64((__m128i *)dst, u0);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
-}
-
-static INLINE void transpose8x8_accumu_to_dst(const uint16_t *src,
-                                              int src_stride, uint8_t *dst,
-                                              int dst_stride) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-
-  __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-  __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
-  __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
-  __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
-  __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
-
-  u0 = _mm_mulhrs_epi16(u0, k_256);
-  u1 = _mm_mulhrs_epi16(u1, k_256);
-  u2 = _mm_mulhrs_epi16(u2, k_256);
-  u3 = _mm_mulhrs_epi16(u3, k_256);
-  u4 = _mm_mulhrs_epi16(u4, k_256);
-  u5 = _mm_mulhrs_epi16(u5, k_256);
-  u6 = _mm_mulhrs_epi16(u6, k_256);
-  u7 = _mm_mulhrs_epi16(u7, k_256);
-
-  v0 = _mm_packus_epi16(u0, u1);
-  v1 = _mm_packus_epi16(u2, u3);
-  v2 = _mm_packus_epi16(u4, u5);
-  v3 = _mm_packus_epi16(u6, u7);
-
-  u0 = _mm_unpacklo_epi8(v0, v1);
-  u1 = _mm_unpackhi_epi8(v0, v1);
-  u2 = _mm_unpacklo_epi8(v2, v3);
-  u3 = _mm_unpackhi_epi8(v2, v3);
-
-  u4 = _mm_unpacklo_epi8(u0, u1);
-  u5 = _mm_unpacklo_epi8(u2, u3);
-  u6 = _mm_unpackhi_epi8(u0, u1);
-  u7 = _mm_unpackhi_epi8(u2, u3);
-
-  u0 = _mm_unpacklo_epi32(u4, u5);
-  u1 = _mm_unpackhi_epi32(u4, u5);
-  u2 = _mm_unpacklo_epi32(u6, u7);
-  u3 = _mm_unpackhi_epi32(u6, u7);
-
-  u4 = _mm_srli_si128(u0, 8);
-  u5 = _mm_srli_si128(u1, 8);
-  u6 = _mm_srli_si128(u2, 8);
-  u7 = _mm_srli_si128(u3, 8);
-
-  v0 = _mm_loadl_epi64((__m128i const *)(dst + 0 * dst_stride));
-  v1 = _mm_loadl_epi64((__m128i const *)(dst + 1 * dst_stride));
-  v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
-  v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
-  v4 = _mm_loadl_epi64((__m128i const *)(dst + 4 * dst_stride));
-  v5 = _mm_loadl_epi64((__m128i const *)(dst + 5 * dst_stride));
-  v6 = _mm_loadl_epi64((__m128i const *)(dst + 6 * dst_stride));
-  v7 = _mm_loadl_epi64((__m128i const *)(dst + 7 * dst_stride));
-
-  u0 = _mm_unpacklo_epi8(u0, zero);
-  u1 = _mm_unpacklo_epi8(u1, zero);
-  u2 = _mm_unpacklo_epi8(u2, zero);
-  u3 = _mm_unpacklo_epi8(u3, zero);
-  u4 = _mm_unpacklo_epi8(u4, zero);
-  u5 = _mm_unpacklo_epi8(u5, zero);
-  u6 = _mm_unpacklo_epi8(u6, zero);
-  u7 = _mm_unpacklo_epi8(u7, zero);
-
-  v0 = _mm_unpacklo_epi8(v0, zero);
-  v1 = _mm_unpacklo_epi8(v1, zero);
-  v2 = _mm_unpacklo_epi8(v2, zero);
-  v3 = _mm_unpacklo_epi8(v3, zero);
-  v4 = _mm_unpacklo_epi8(v4, zero);
-  v5 = _mm_unpacklo_epi8(v5, zero);
-  v6 = _mm_unpacklo_epi8(v6, zero);
-  v7 = _mm_unpacklo_epi8(v7, zero);
-
-  v0 = _mm_adds_epi16(u0, v0);
-  v1 = _mm_adds_epi16(u4, v1);
-  v2 = _mm_adds_epi16(u1, v2);
-  v3 = _mm_adds_epi16(u5, v3);
-  v4 = _mm_adds_epi16(u2, v4);
-  v5 = _mm_adds_epi16(u6, v5);
-  v6 = _mm_adds_epi16(u3, v6);
-  v7 = _mm_adds_epi16(u7, v7);
-
-  v0 = _mm_adds_epi16(v0, one);
-  v1 = _mm_adds_epi16(v1, one);
-  v2 = _mm_adds_epi16(v2, one);
-  v3 = _mm_adds_epi16(v3, one);
-  v4 = _mm_adds_epi16(v4, one);
-  v5 = _mm_adds_epi16(v5, one);
-  v6 = _mm_adds_epi16(v6, one);
-  v7 = _mm_adds_epi16(v7, one);
-
-  v0 = _mm_srai_epi16(v0, 1);
-  v1 = _mm_srai_epi16(v1, 1);
-  v2 = _mm_srai_epi16(v2, 1);
-  v3 = _mm_srai_epi16(v3, 1);
-  v4 = _mm_srai_epi16(v4, 1);
-  v5 = _mm_srai_epi16(v5, 1);
-  v6 = _mm_srai_epi16(v6, 1);
-  v7 = _mm_srai_epi16(v7, 1);
-
-  u0 = _mm_packus_epi16(v0, v1);
-  u1 = _mm_packus_epi16(v2, v3);
-  u2 = _mm_packus_epi16(v4, v5);
-  u3 = _mm_packus_epi16(v6, v7);
-
-  u4 = _mm_srli_si128(u0, 8);
-  u5 = _mm_srli_si128(u1, 8);
-  u6 = _mm_srli_si128(u2, 8);
-  u7 = _mm_srli_si128(u3, 8);
-
-  _mm_storel_epi64((__m128i *)dst, u0);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
-}
-
-static transpose_to_dst_t trans8x8Tab[2] = { transpose8x8_direct_to_dst,
-                                             transpose8x8_accumu_to_dst };
-
-static INLINE void transpose_8x16(const __m128i *in, __m128i *out) {
-  __m128i t0, t1, t2, t3, u0, u1;
-
-  t0 = _mm_unpacklo_epi16(in[0], in[1]);
-  t1 = _mm_unpacklo_epi16(in[2], in[3]);
-  t2 = _mm_unpacklo_epi16(in[4], in[5]);
-  t3 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  u0 = _mm_unpacklo_epi32(t0, t1);
-  u1 = _mm_unpacklo_epi32(t2, t3);
-
-  out[0] = _mm_unpacklo_epi64(u0, u1);
-  out[1] = _mm_unpackhi_epi64(u0, u1);
-
-  u0 = _mm_unpackhi_epi32(t0, t1);
-  u1 = _mm_unpackhi_epi32(t2, t3);
-
-  out[2] = _mm_unpacklo_epi64(u0, u1);
-  out[3] = _mm_unpackhi_epi64(u0, u1);
-
-  t0 = _mm_unpackhi_epi16(in[0], in[1]);
-  t1 = _mm_unpackhi_epi16(in[2], in[3]);
-  t2 = _mm_unpackhi_epi16(in[4], in[5]);
-  t3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  u0 = _mm_unpacklo_epi32(t0, t1);
-  u1 = _mm_unpacklo_epi32(t2, t3);
-
-  out[4] = _mm_unpacklo_epi64(u0, u1);
-  out[5] = _mm_unpackhi_epi64(u0, u1);
-
-  // Ignore out[6] and out[7]
-  // they're zero vectors.
-}
-
-static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                   __m128i *f, int tapsNum, uint16_t *buf) {
-  __m128i s[8], t[6];
-  __m128i min_x2x3, max_x2x3;
-  __m128i temp;
-
-  assert(tapsNum == 10 || tapsNum == 12);
-  if (tapsNum == 10) {
-    src_ptr -= 1;
-  }
-  s[0] = _mm_loadu_si128((const __m128i *)src_ptr);
-  s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
-  s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-  s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-  s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-  s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-  s[6] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-  s[7] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
-  // TRANSPOSE...
-  // Vecotor represents column pixel pairs instead of a row
-  transpose_8x16(s, t);
-
-  // multiply 2 adjacent elements with the filter and add the result
-  s[0] = _mm_maddubs_epi16(t[0], f[0]);
-  s[1] = _mm_maddubs_epi16(t[1], f[1]);
-  s[2] = _mm_maddubs_epi16(t[2], f[2]);
-  s[3] = _mm_maddubs_epi16(t[3], f[3]);
-  s[4] = _mm_maddubs_epi16(t[4], f[4]);
-  s[5] = _mm_maddubs_epi16(t[5], f[5]);
-
-  // add and saturate the results together
-  min_x2x3 = _mm_min_epi16(s[2], s[3]);
-  max_x2x3 = _mm_max_epi16(s[2], s[3]);
-  temp = _mm_adds_epi16(s[0], s[1]);
-  temp = _mm_adds_epi16(temp, s[5]);
-  temp = _mm_adds_epi16(temp, s[4]);
-
-  temp = _mm_adds_epi16(temp, min_x2x3);
-  temp = _mm_adds_epi16(temp, max_x2x3);
-
-  _mm_storeu_si128((__m128i *)buf, temp);
-}
-
-// Vertical 4-pixel parallel
-static INLINE void transpose4x4_direct_to_dst(const uint16_t *src,
-                                              int src_stride, uint8_t *dst,
-                                              int dst_stride) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  __m128i v0, v1, v2, v3;
-
-  // TODO(luoyi): two loads, 8 elements per load (two bytes per element)
-  __m128i u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
-  __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
-  __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
-  __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
-
-  v0 = _mm_unpacklo_epi16(u0, u1);
-  v1 = _mm_unpacklo_epi16(u2, u3);
-
-  v2 = _mm_unpacklo_epi32(v0, v1);
-  v3 = _mm_unpackhi_epi32(v0, v1);
-
-  u0 = _mm_mulhrs_epi16(v2, k_256);
-  u1 = _mm_mulhrs_epi16(v3, k_256);
-
-  u0 = _mm_packus_epi16(u0, u1);
-  u1 = _mm_srli_si128(u0, 4);
-  u2 = _mm_srli_si128(u0, 8);
-  u3 = _mm_srli_si128(u0, 12);
-
-  *(int *)(dst) = _mm_cvtsi128_si32(u0);
-  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
-  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
-  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
-}
-
-static INLINE void transpose4x4_accumu_to_dst(const uint16_t *src,
-                                              int src_stride, uint8_t *dst,
-                                              int dst_stride) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-
-  __m128i v0, v1, v2, v3;
-
-  __m128i u0 = _mm_loadl_epi64((__m128i const *)(src));
-  __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + src_stride));
-  __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
-  __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
-
-  v0 = _mm_unpacklo_epi16(u0, u1);
-  v1 = _mm_unpacklo_epi16(u2, u3);
-
-  v2 = _mm_unpacklo_epi32(v0, v1);
-  v3 = _mm_unpackhi_epi32(v0, v1);
-
-  u0 = _mm_mulhrs_epi16(v2, k_256);
-  u1 = _mm_mulhrs_epi16(v3, k_256);
-
-  u2 = _mm_packus_epi16(u0, u1);
-  u0 = _mm_unpacklo_epi8(u2, zero);
-  u1 = _mm_unpackhi_epi8(u2, zero);
-
-  // load pixel values
-  v0 = _mm_loadl_epi64((__m128i const *)(dst));
-  v1 = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
-  v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
-  v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
-
-  v0 = _mm_unpacklo_epi8(v0, zero);
-  v1 = _mm_unpacklo_epi8(v1, zero);
-  v2 = _mm_unpacklo_epi8(v2, zero);
-  v3 = _mm_unpacklo_epi8(v3, zero);
-
-  v0 = _mm_unpacklo_epi64(v0, v1);
-  v1 = _mm_unpacklo_epi64(v2, v3);
-
-  u0 = _mm_adds_epi16(u0, v0);
-  u1 = _mm_adds_epi16(u1, v1);
-
-  u0 = _mm_adds_epi16(u0, one);
-  u1 = _mm_adds_epi16(u1, one);
-
-  u0 = _mm_srai_epi16(u0, 1);
-  u1 = _mm_srai_epi16(u1, 1);
-
-  // saturation and pack to pixels
-  u0 = _mm_packus_epi16(u0, u1);
-  u1 = _mm_srli_si128(u0, 4);
-  u2 = _mm_srli_si128(u0, 8);
-  u3 = _mm_srli_si128(u0, 12);
-
-  *(int *)(dst) = _mm_cvtsi128_si32(u0);
-  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
-  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
-  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
-}
-
-static transpose_to_dst_t trans4x4Tab[2] = { transpose4x4_direct_to_dst,
-                                             transpose4x4_accumu_to_dst };
-
-static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                   __m128i *f, int tapsNum, uint16_t *buf) {
-  __m128i A, B, C, D;
-  __m128i tr0_0, tr0_1, s1s0, s3s2, s5s4, s7s6, s9s8, sbsa;
-  __m128i x0, x1, x2, x3, x4, x5;
-  __m128i min_x2x3, max_x2x3, temp;
-
-  assert(tapsNum == 10 || tapsNum == 12);
-  if (tapsNum == 10) {
-    src_ptr -= 1;
-  }
-  A = _mm_loadu_si128((const __m128i *)src_ptr);
-  B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
-  C = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-  D = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-
-  // TRANSPOSE...
-  // Vecotor represents column pixel pairs instead of a row
-  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
-  tr0_0 = _mm_unpacklo_epi16(A, B);
-  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
-  tr0_1 = _mm_unpacklo_epi16(C, D);
-  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
-  s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
-  s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  // 02 03 12 13 22 23 32 33
-  s3s2 = _mm_srli_si128(s1s0, 8);
-  // 06 07 16 17 26 27 36 37
-  s7s6 = _mm_srli_si128(s5s4, 8);
-
-  tr0_0 = _mm_unpackhi_epi16(A, B);
-  tr0_1 = _mm_unpackhi_epi16(C, D);
-  s9s8 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  sbsa = _mm_srli_si128(s9s8, 8);
-
-  // multiply 2 adjacent elements with the filter and add the result
-  x0 = _mm_maddubs_epi16(s1s0, f[0]);
-  x1 = _mm_maddubs_epi16(s3s2, f[1]);
-  x2 = _mm_maddubs_epi16(s5s4, f[2]);
-  x3 = _mm_maddubs_epi16(s7s6, f[3]);
-  x4 = _mm_maddubs_epi16(s9s8, f[4]);
-  x5 = _mm_maddubs_epi16(sbsa, f[5]);
-  // add and saturate the results together
-  min_x2x3 = _mm_min_epi16(x2, x3);
-  max_x2x3 = _mm_max_epi16(x2, x3);
-  temp = _mm_adds_epi16(x0, x1);
-  temp = _mm_adds_epi16(temp, x5);
-  temp = _mm_adds_epi16(temp, x4);
-
-  temp = _mm_adds_epi16(temp, min_x2x3);
-  temp = _mm_adds_epi16(temp, max_x2x3);
-  _mm_storel_epi64((__m128i *)buf, temp);
-}
-
-// Note:
-//  This function assumes:
-// (1) 10/12-taps filters
-// (2) x_step_q4 = 16 then filter is fixed at the call
-
-void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
-                              int dst_stride, int w, int h,
-                              const InterpFilterParams filter_params,
-                              const int subpel_x_q4, int x_step_q4,
-                              ConvolveParams *conv_params) {
-  DECLARE_ALIGNED(16, uint16_t, temp[8 * 8]);
-  __m128i verf[6];
-  __m128i horf[2];
-  SubpelFilterCoeffs hCoeffs, vCoeffs;
-  assert(conv_params->do_average == 0 || conv_params->do_average == 1);
-  const uint8_t *src_ptr;
-  store_pixel_t store2p = store2pixelTab[conv_params->do_average];
-  store_pixel_t store4p = store4pixelTab[conv_params->do_average];
-  transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->do_average];
-  transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->do_average];
-
-  const int tapsNum = filter_params.taps;
-  int block_height, block_residu;
-  int i, col, count;
-  (void)x_step_q4;
-
-  if (0 == subpel_x_q4 || 16 != x_step_q4) {
-    av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
-                         subpel_x_q4, x_step_q4, conv_params);
-    return;
-  }
-
-  hCoeffs = get_subpel_filter_signal_dir(filter_params, subpel_x_q4 - 1);
-  vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
-
-  if (!hCoeffs || !vCoeffs) {
-    av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
-                         subpel_x_q4, x_step_q4, conv_params);
-    return;
-  }
-
-  verf[0] = *((const __m128i *)(vCoeffs));
-  verf[1] = *((const __m128i *)(vCoeffs + 1));
-  verf[2] = *((const __m128i *)(vCoeffs + 2));
-  verf[3] = *((const __m128i *)(vCoeffs + 3));
-  verf[4] = *((const __m128i *)(vCoeffs + 4));
-  verf[5] = *((const __m128i *)(vCoeffs + 5));
-
-  horf[0] = *((const __m128i *)(hCoeffs));
-  horf[1] = *((const __m128i *)(hCoeffs + 1));
-
-  count = 0;
-
-  // here tapsNum is filter size
-  src -= (tapsNum >> 1) - 1;
-  src_ptr = src;
-  if (w > WIDTH_BOUND && h > HEIGHT_BOUND) {
-    // 8-pixels parallel
-    block_height = h >> 3;
-    block_residu = h & 7;
-
-    do {
-      for (col = 0; col < w; col += 8) {
-        for (i = 0; i < 8; ++i) {
-          filter_horiz_v8p_ssse3(src_ptr, src_stride, verf, tapsNum,
-                                 temp + (i * 8));
-          src_ptr += 1;
-        }
-        transpose_8x8(temp, 8, dst + col, dst_stride);
-      }
-      count++;
-      src_ptr = src + count * src_stride * 8;
-      dst += dst_stride * 8;
-    } while (count < block_height);
-
-    for (i = 0; i < block_residu; ++i) {
-      filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
-      src_ptr += src_stride;
-      dst += dst_stride;
-    }
-  } else {
-    if (w > 2) {
-      // 4-pixels parallel
-      block_height = h >> 2;
-      block_residu = h & 3;
-
-      do {
-        for (col = 0; col < w; col += 4) {
-          for (i = 0; i < 4; ++i) {
-            filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
-                                   temp + (i * 4));
-            src_ptr += 1;
-          }
-          transpose_4x4(temp, 4, dst + col, dst_stride);
-        }
-        count++;
-        src_ptr = src + count * src_stride * 4;
-        dst += dst_stride * 4;
-      } while (count < block_height);
-
-      for (i = 0; i < block_residu; ++i) {
-        filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
-        src_ptr += src_stride;
-        dst += dst_stride;
-      }
-    } else {
-      for (i = 0; i < h; i++) {
-        filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst);
-        src_ptr += src_stride;
-        dst += dst_stride;
-      }
-    }
-  }
-}
-
-// Vertical convolution filtering
-static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) {
-  __m128i u = _mm_packus_epi16(*x, *x);
-  _mm_storel_epi64((__m128i *)dst, u);
-}
-
-static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) {
-  __m128i y = accumulate_store(x, dst);
-  _mm_storel_epi64((__m128i *)dst, y);
-}
-
-static store_pixel_t store8pixelTab[2] = { store_8_pixel_only,
-                                           accumulate_store_8_pixel };
-
-static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
-                                 int tapsNum, __m128i *f) {
-  __m128i s[12];
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i min_x2x3, max_x2x3, sum;
-  int i = 0;
-  int r = 0;
-
-  if (10 == tapsNum) {
-    i += 1;
-    s[0] = zero;
-  }
-  while (i < 12) {
-    s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
-    i += 1;
-    r += 1;
-  }
-
-  s[0] = _mm_unpacklo_epi8(s[0], s[1]);
-  s[2] = _mm_unpacklo_epi8(s[2], s[3]);
-  s[4] = _mm_unpacklo_epi8(s[4], s[5]);
-  s[6] = _mm_unpacklo_epi8(s[6], s[7]);
-  s[8] = _mm_unpacklo_epi8(s[8], s[9]);
-  s[10] = _mm_unpacklo_epi8(s[10], s[11]);
-
-  s[0] = _mm_maddubs_epi16(s[0], f[0]);
-  s[2] = _mm_maddubs_epi16(s[2], f[1]);
-  s[4] = _mm_maddubs_epi16(s[4], f[2]);
-  s[6] = _mm_maddubs_epi16(s[6], f[3]);
-  s[8] = _mm_maddubs_epi16(s[8], f[4]);
-  s[10] = _mm_maddubs_epi16(s[10], f[5]);
-
-  min_x2x3 = _mm_min_epi16(s[4], s[6]);
-  max_x2x3 = _mm_max_epi16(s[4], s[6]);
-  sum = _mm_adds_epi16(s[0], s[2]);
-  sum = _mm_adds_epi16(sum, s[10]);
-  sum = _mm_adds_epi16(sum, s[8]);
-
-  sum = _mm_adds_epi16(sum, min_x2x3);
-  sum = _mm_adds_epi16(sum, max_x2x3);
-
-  sum = _mm_mulhrs_epi16(sum, k_256);
-  sum = _mm_packus_epi16(sum, sum);
-  sum = _mm_unpacklo_epi8(sum, zero);
-  return sum;
-}
-
-static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
-                                             __m128i *f, int tapsNum,
-                                             store_pixel_t store_func,
-                                             uint8_t *dst) {
-  __m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f);
-  store_func(&sum, dst);
-}
-
-static void filter_vert_compute_small(const uint8_t *src, int src_stride,
-                                      __m128i *f, int tapsNum,
-                                      store_pixel_t store_func, int h,
-                                      uint8_t *dst, int dst_stride) {
-  int rowIndex = 0;
-  do {
-    filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func,
-                                     dst);
-    rowIndex++;
-    src += src_stride;
-    dst += dst_stride;
-  } while (rowIndex < h);
-}
-
-static void filter_vert_compute_large(const uint8_t *src, int src_stride,
-                                      __m128i *f, int tapsNum,
-                                      store_pixel_t store_func, int w, int h,
-                                      uint8_t *dst, int dst_stride) {
-  int col;
-  int rowIndex = 0;
-  const uint8_t *src_ptr = src;
-  uint8_t *dst_ptr = dst;
-
-  do {
-    for (col = 0; col < w; col += 8) {
-      filter_vert_horiz_parallel_ssse3(src_ptr, src_stride, f, tapsNum,
-                                       store_func, dst_ptr);
-      src_ptr += 8;
-      dst_ptr += 8;
-    }
-    rowIndex++;
-    src_ptr = src + rowIndex * src_stride;
-    dst_ptr = dst + rowIndex * dst_stride;
-  } while (rowIndex < h);
-}
-
-void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h,
-                             const InterpFilterParams filter_params,
-                             const int subpel_y_q4, int y_step_q4,
-                             ConvolveParams *conv_params) {
-  __m128i verf[6];
-  SubpelFilterCoeffs vCoeffs;
-  const uint8_t *src_ptr;
-  assert(conv_params->do_average == 0 || conv_params->do_average == 1);
-  uint8_t *dst_ptr = dst;
-  store_pixel_t store2p = store2pixelTab[conv_params->do_average];
-  store_pixel_t store4p = store4pixelTab[conv_params->do_average];
-  store_pixel_t store8p = store8pixelTab[conv_params->do_average];
-  const int tapsNum = filter_params.taps;
-
-  if (0 == subpel_y_q4 || 16 != y_step_q4) {
-    av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
-                        subpel_y_q4, y_step_q4, conv_params);
-    return;
-  }
-
-  vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
-
-  if (!vCoeffs) {
-    av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
-                        subpel_y_q4, y_step_q4, conv_params);
-    return;
-  }
-
-  verf[0] = *((const __m128i *)(vCoeffs));
-  verf[1] = *((const __m128i *)(vCoeffs + 1));
-  verf[2] = *((const __m128i *)(vCoeffs + 2));
-  verf[3] = *((const __m128i *)(vCoeffs + 3));
-  verf[4] = *((const __m128i *)(vCoeffs + 4));
-  verf[5] = *((const __m128i *)(vCoeffs + 5));
-
-  src -= src_stride * ((tapsNum >> 1) - 1);
-  src_ptr = src;
-
-  if (w > 4) {
-    filter_vert_compute_large(src_ptr, src_stride, verf, tapsNum, store8p, w, h,
-                              dst_ptr, dst_stride);
-  } else if (4 == w) {
-    filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store4p, h,
-                              dst_ptr, dst_stride);
-  } else if (2 == w) {
-    filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store2p, h,
-                              dst_ptr, dst_stride);
-  } else {
-    assert(0);
-  }
-}
-
-static void init_simd_horiz_filter(const int16_t *filter_ptr, int taps,
-                                   int8_t (*simd_horiz_filter)[2][16]) {
-  int shift;
-  int offset = (12 - taps) / 2;
-  const int16_t *filter_row;
-  for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
-    int i;
-    filter_row = filter_ptr + shift * taps;
-    for (i = 0; i < offset; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
-
-    for (i = 0; i < offset + 2; ++i) simd_horiz_filter[shift - 1][1][i] = 0;
-
-    for (i = 0; i < taps; ++i) {
-      simd_horiz_filter[shift - 1][0][i + offset] = (int8_t)filter_row[i];
-      simd_horiz_filter[shift - 1][1][i + offset + 2] = (int8_t)filter_row[i];
-    }
-
-    for (i = offset + taps; i < 16; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
-
-    for (i = offset + 2 + taps; i < 16; ++i)
-      simd_horiz_filter[shift - 1][1][i] = 0;
-  }
-}
-
-static void init_simd_vert_filter(const int16_t *filter_ptr, int taps,
-                                  int8_t (*simd_vert_filter)[6][16]) {
-  int shift;
-  int offset = (12 - taps) / 2;
-  const int16_t *filter_row;
-  for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
-    int i;
-    filter_row = filter_ptr + shift * taps;
-    for (i = 0; i < 6; ++i) {
-      int j;
-      for (j = 0; j < 16; ++j) {
-        int c = i * 2 + (j % 2) - offset;
-        if (c >= 0 && c < taps)
-          simd_vert_filter[shift - 1][i][j] = (int8_t)filter_row[c];
-        else
-          simd_vert_filter[shift - 1][i][j] = 0;
-      }
-    }
-  }
-}
-
-typedef struct SimdFilter {
-  InterpFilter interp_filter;
-  int8_t (*simd_horiz_filter)[2][16];
-  int8_t (*simd_vert_filter)[6][16];
-} SimdFilter;
-
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-#define MULTITAP_FILTER_NUM 1
-SimdFilter simd_filters[MULTITAP_FILTER_NUM] = {
-  { MULTITAP_SHARP, &sub_pel_filters_12sharp_signal_dir[0],
-    &sub_pel_filters_12sharp_ver_signal_dir[0] },
-};
-#endif
-
-#if USE_TEMPORALFILTER_12TAP
-SimdFilter temporal_simd_filter = {
-  TEMPORALFILTER_12TAP, &sub_pel_filters_temporalfilter_12_signal_dir[0],
-  &sub_pel_filters_temporalfilter_12_ver_signal_dir[0]
-};
-#endif
-
-void av1_lowbd_convolve_init_ssse3(void) {
-#if USE_TEMPORALFILTER_12TAP
-  {
-    InterpFilterParams filter_params =
-        av1_get_interp_filter_params(temporal_simd_filter.interp_filter);
-    int taps = filter_params.taps;
-    const int16_t *filter_ptr = filter_params.filter_ptr;
-    init_simd_horiz_filter(filter_ptr, taps,
-                           temporal_simd_filter.simd_horiz_filter);
-    init_simd_vert_filter(filter_ptr, taps,
-                          temporal_simd_filter.simd_vert_filter);
-  }
-#endif
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-  {
-    int i;
-    for (i = 0; i < MULTITAP_FILTER_NUM; ++i) {
-      InterpFilter interp_filter = simd_filters[i].interp_filter;
-      InterpFilterParams filter_params =
-          av1_get_interp_filter_params(interp_filter);
-      int taps = filter_params.taps;
-      const int16_t *filter_ptr = filter_params.filter_ptr;
-      init_simd_horiz_filter(filter_ptr, taps,
-                             simd_filters[i].simd_horiz_filter);
-      init_simd_vert_filter(filter_ptr, taps, simd_filters[i].simd_vert_filter);
-    }
-  }
-#endif
-  return;
-}
diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
deleted file mode 100644
index 97d2e74b1..000000000
--- a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
+++ /dev/null
@@ -1,839 +0,0 @@
-#include "av1/common/x86/av1_txfm1d_sse4.h"
-
-void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range) {
-  const int txfm_size = 32;
-  const int num_per_128 = 4;
-  const int32_t *cospi;
-  __m128i buf0[32];
-  __m128i buf1[32];
-  int col_num = txfm_size / num_per_128;
-  int bit;
-  int col;
-  (void)stage_range;
-  for (col = 0; col < col_num; col++) {
-    // stage 0;
-    int32_t stage_idx = 0;
-    int j;
-    for (j = 0; j < 32; ++j) {
-      buf0[j] = input[j * col_num + col];
-    }
-
-    // stage 1
-    stage_idx++;
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[31]);
-    buf1[31] = _mm_sub_epi32(buf0[0], buf0[31]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[30]);
-    buf1[30] = _mm_sub_epi32(buf0[1], buf0[30]);
-    buf1[2] = _mm_add_epi32(buf0[2], buf0[29]);
-    buf1[29] = _mm_sub_epi32(buf0[2], buf0[29]);
-    buf1[3] = _mm_add_epi32(buf0[3], buf0[28]);
-    buf1[28] = _mm_sub_epi32(buf0[3], buf0[28]);
-    buf1[4] = _mm_add_epi32(buf0[4], buf0[27]);
-    buf1[27] = _mm_sub_epi32(buf0[4], buf0[27]);
-    buf1[5] = _mm_add_epi32(buf0[5], buf0[26]);
-    buf1[26] = _mm_sub_epi32(buf0[5], buf0[26]);
-    buf1[6] = _mm_add_epi32(buf0[6], buf0[25]);
-    buf1[25] = _mm_sub_epi32(buf0[6], buf0[25]);
-    buf1[7] = _mm_add_epi32(buf0[7], buf0[24]);
-    buf1[24] = _mm_sub_epi32(buf0[7], buf0[24]);
-    buf1[8] = _mm_add_epi32(buf0[8], buf0[23]);
-    buf1[23] = _mm_sub_epi32(buf0[8], buf0[23]);
-    buf1[9] = _mm_add_epi32(buf0[9], buf0[22]);
-    buf1[22] = _mm_sub_epi32(buf0[9], buf0[22]);
-    buf1[10] = _mm_add_epi32(buf0[10], buf0[21]);
-    buf1[21] = _mm_sub_epi32(buf0[10], buf0[21]);
-    buf1[11] = _mm_add_epi32(buf0[11], buf0[20]);
-    buf1[20] = _mm_sub_epi32(buf0[11], buf0[20]);
-    buf1[12] = _mm_add_epi32(buf0[12], buf0[19]);
-    buf1[19] = _mm_sub_epi32(buf0[12], buf0[19]);
-    buf1[13] = _mm_add_epi32(buf0[13], buf0[18]);
-    buf1[18] = _mm_sub_epi32(buf0[13], buf0[18]);
-    buf1[14] = _mm_add_epi32(buf0[14], buf0[17]);
-    buf1[17] = _mm_sub_epi32(buf0[14], buf0[17]);
-    buf1[15] = _mm_add_epi32(buf0[15], buf0[16]);
-    buf1[16] = _mm_sub_epi32(buf0[15], buf0[16]);
-
-    // stage 2
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
-    buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
-    buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
-    buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
-    buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
-    buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
-    buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
-    buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
-    buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
-    buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
-    buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
-    buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
-    buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
-    buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
-    buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
-    buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
-    buf0[16] = buf1[16];
-    buf0[17] = buf1[17];
-    buf0[18] = buf1[18];
-    buf0[19] = buf1[19];
-    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
-                        buf0[27], bit);
-    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
-                        buf0[26], bit);
-    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
-                        buf0[25], bit);
-    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
-                        buf0[24], bit);
-    buf0[28] = buf1[28];
-    buf0[29] = buf1[29];
-    buf0[30] = buf1[30];
-    buf0[31] = buf1[31];
-
-    // stage 3
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
-    buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
-    buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
-    buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
-    buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
-    buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
-    buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
-    buf1[8] = buf0[8];
-    buf1[9] = buf0[9];
-    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
-                        buf1[13], bit);
-    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
-                        buf1[12], bit);
-    buf1[14] = buf0[14];
-    buf1[15] = buf0[15];
-    buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
-    buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
-    buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
-    buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
-    buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
-    buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
-    buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
-    buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
-    buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
-    buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
-    buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
-    buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
-    buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
-    buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
-    buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
-    buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
-
-    // stage 4
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
-    buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
-    buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
-    buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
-    buf0[4] = buf1[4];
-    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
-                        buf0[6], bit);
-    buf0[7] = buf1[7];
-    buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
-    buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
-    buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
-    buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
-    buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
-    buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
-    buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
-    buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
-    buf0[16] = buf1[16];
-    buf0[17] = buf1[17];
-    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
-                        buf0[29], bit);
-    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
-                        buf0[28], bit);
-    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
-                        buf0[27], bit);
-    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
-                        buf0[26], bit);
-    buf0[22] = buf1[22];
-    buf0[23] = buf1[23];
-    buf0[24] = buf1[24];
-    buf0[25] = buf1[25];
-    buf0[30] = buf1[30];
-    buf0[31] = buf1[31];
-
-    // stage 5
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
-                        buf1[1], bit);
-    btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
-                        buf1[3], bit);
-    buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
-    buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
-    buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
-    buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
-    buf1[8] = buf0[8];
-    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
-                        buf1[14], bit);
-    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
-                        buf1[13], bit);
-    buf1[11] = buf0[11];
-    buf1[12] = buf0[12];
-    buf1[15] = buf0[15];
-    buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
-    buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
-    buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
-    buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
-    buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
-    buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
-    buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
-    buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
-    buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
-    buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
-    buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
-    buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
-    buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
-    buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
-    buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
-    buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
-
-    // stage 6
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    buf0[2] = buf1[2];
-    buf0[3] = buf1[3];
-    btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
-                        bit);
-    btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
-                        buf0[6], bit);
-    buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
-    buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
-    buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
-    buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
-    buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
-    buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
-    buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
-    buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
-    buf0[16] = buf1[16];
-    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
-                        buf0[30], bit);
-    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
-                        buf0[29], bit);
-    buf0[19] = buf1[19];
-    buf0[20] = buf1[20];
-    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
-                        buf0[26], bit);
-    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
-                        buf0[25], bit);
-    buf0[23] = buf1[23];
-    buf0[24] = buf1[24];
-    buf0[27] = buf1[27];
-    buf0[28] = buf1[28];
-    buf0[31] = buf1[31];
-
-    // stage 7
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    buf1[0] = buf0[0];
-    buf1[1] = buf0[1];
-    buf1[2] = buf0[2];
-    buf1[3] = buf0[3];
-    buf1[4] = buf0[4];
-    buf1[5] = buf0[5];
-    buf1[6] = buf0[6];
-    buf1[7] = buf0[7];
-    btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8],
-                        buf1[15], bit);
-    btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
-                        buf1[14], bit);
-    btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
-                        buf1[13], bit);
-    btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
-                        buf1[12], bit);
-    buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
-    buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
-    buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
-    buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
-    buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
-    buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
-    buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
-    buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
-    buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
-    buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
-    buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
-    buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
-    buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
-    buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
-    buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
-    buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
-
-    // stage 8
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    buf0[2] = buf1[2];
-    buf0[3] = buf1[3];
-    buf0[4] = buf1[4];
-    buf0[5] = buf1[5];
-    buf0[6] = buf1[6];
-    buf0[7] = buf1[7];
-    buf0[8] = buf1[8];
-    buf0[9] = buf1[9];
-    buf0[10] = buf1[10];
-    buf0[11] = buf1[11];
-    buf0[12] = buf1[12];
-    buf0[13] = buf1[13];
-    buf0[14] = buf1[14];
-    buf0[15] = buf1[15];
-    btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
-                        buf0[31], bit);
-    btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
-                        buf0[30], bit);
-    btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
-                        buf0[29], bit);
-    btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
-                        buf0[28], bit);
-    btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
-                        buf0[27], bit);
-    btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
-                        buf0[26], bit);
-    btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
-                        buf0[25], bit);
-    btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
-                        buf0[24], bit);
-
-    // stage 9
-    stage_idx++;
-    buf1[0] = buf0[0];
-    buf1[1] = buf0[16];
-    buf1[2] = buf0[8];
-    buf1[3] = buf0[24];
-    buf1[4] = buf0[4];
-    buf1[5] = buf0[20];
-    buf1[6] = buf0[12];
-    buf1[7] = buf0[28];
-    buf1[8] = buf0[2];
-    buf1[9] = buf0[18];
-    buf1[10] = buf0[10];
-    buf1[11] = buf0[26];
-    buf1[12] = buf0[6];
-    buf1[13] = buf0[22];
-    buf1[14] = buf0[14];
-    buf1[15] = buf0[30];
-    buf1[16] = buf0[1];
-    buf1[17] = buf0[17];
-    buf1[18] = buf0[9];
-    buf1[19] = buf0[25];
-    buf1[20] = buf0[5];
-    buf1[21] = buf0[21];
-    buf1[22] = buf0[13];
-    buf1[23] = buf0[29];
-    buf1[24] = buf0[3];
-    buf1[25] = buf0[19];
-    buf1[26] = buf0[11];
-    buf1[27] = buf0[27];
-    buf1[28] = buf0[7];
-    buf1[29] = buf0[23];
-    buf1[30] = buf0[15];
-    buf1[31] = buf0[31];
-
-    for (j = 0; j < 32; ++j) {
-      output[j * col_num + col] = buf1[j];
-    }
-  }
-}
-
-void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range) {
-  const int txfm_size = 4;
-  const int num_per_128 = 4;
-  const int32_t *cospi;
-  __m128i buf0[4];
-  __m128i buf1[4];
-  int col_num = txfm_size / num_per_128;
-  int bit;
-  int col;
-  (void)stage_range;
-  for (col = 0; col < col_num; col++) {
-    // stage 0;
-    int32_t stage_idx = 0;
-    int j;
-    for (j = 0; j < 4; ++j) {
-      buf0[j] = input[j * col_num + col];
-    }
-
-    // stage 1
-    stage_idx++;
-    buf1[0] = buf0[3];
-    buf1[1] = buf0[0];
-    buf1[2] = buf0[1];
-    buf1[3] = buf0[2];
-
-    // stage 2
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
-                        bit);
-    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
-                        buf0[3], bit);
-
-    // stage 3
-    stage_idx++;
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
-    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
-    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
-
-    // stage 4
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
-                        buf0[3], bit);
-
-    // stage 5
-    stage_idx++;
-    buf1[0] = buf0[0];
-    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
-    buf1[2] = buf0[3];
-    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
-
-    for (j = 0; j < 4; ++j) {
-      output[j * col_num + col] = buf1[j];
-    }
-  }
-}
-
-void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
-                            const int8_t *cos_bit, const int8_t *stage_range) {
-  const int txfm_size = 32;
-  const int num_per_128 = 4;
-  const int32_t *cospi;
-  __m128i buf0[32];
-  __m128i buf1[32];
-  int col_num = txfm_size / num_per_128;
-  int bit;
-  int col;
-  (void)stage_range;
-  for (col = 0; col < col_num; col++) {
-    // stage 0;
-    int32_t stage_idx = 0;
-    int j;
-    for (j = 0; j < 32; ++j) {
-      buf0[j] = input[j * col_num + col];
-    }
-
-    // stage 1
-    stage_idx++;
-    buf1[0] = buf0[31];
-    buf1[1] = buf0[0];
-    buf1[2] = buf0[29];
-    buf1[3] = buf0[2];
-    buf1[4] = buf0[27];
-    buf1[5] = buf0[4];
-    buf1[6] = buf0[25];
-    buf1[7] = buf0[6];
-    buf1[8] = buf0[23];
-    buf1[9] = buf0[8];
-    buf1[10] = buf0[21];
-    buf1[11] = buf0[10];
-    buf1[12] = buf0[19];
-    buf1[13] = buf0[12];
-    buf1[14] = buf0[17];
-    buf1[15] = buf0[14];
-    buf1[16] = buf0[15];
-    buf1[17] = buf0[16];
-    buf1[18] = buf0[13];
-    buf1[19] = buf0[18];
-    buf1[20] = buf0[11];
-    buf1[21] = buf0[20];
-    buf1[22] = buf0[9];
-    buf1[23] = buf0[22];
-    buf1[24] = buf0[7];
-    buf1[25] = buf0[24];
-    buf1[26] = buf0[5];
-    buf1[27] = buf0[26];
-    buf1[28] = buf0[3];
-    buf1[29] = buf0[28];
-    buf1[30] = buf0[1];
-    buf1[31] = buf0[30];
-
-    // stage 2
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
-                        bit);
-    btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
-                        bit);
-    btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
-                        bit);
-    btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6],
-                        buf0[7], bit);
-    btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8],
-                        buf0[9], bit);
-    btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
-                        buf0[11], bit);
-    btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
-                        buf0[13], bit);
-    btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
-                        buf0[15], bit);
-    btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
-                        buf0[17], bit);
-    btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
-                        buf0[19], bit);
-    btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
-                        buf0[21], bit);
-    btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
-                        buf0[23], bit);
-    btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
-                        buf0[25], bit);
-    btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
-                        buf0[27], bit);
-    btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
-                        buf0[29], bit);
-    btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
-                        buf0[31], bit);
-
-    // stage 3
-    stage_idx++;
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[16]);
-    buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[17]);
-    buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]);
-    buf1[2] = _mm_add_epi32(buf0[2], buf0[18]);
-    buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]);
-    buf1[3] = _mm_add_epi32(buf0[3], buf0[19]);
-    buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]);
-    buf1[4] = _mm_add_epi32(buf0[4], buf0[20]);
-    buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]);
-    buf1[5] = _mm_add_epi32(buf0[5], buf0[21]);
-    buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]);
-    buf1[6] = _mm_add_epi32(buf0[6], buf0[22]);
-    buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]);
-    buf1[7] = _mm_add_epi32(buf0[7], buf0[23]);
-    buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]);
-    buf1[8] = _mm_add_epi32(buf0[8], buf0[24]);
-    buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]);
-    buf1[9] = _mm_add_epi32(buf0[9], buf0[25]);
-    buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]);
-    buf1[10] = _mm_add_epi32(buf0[10], buf0[26]);
-    buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]);
-    buf1[11] = _mm_add_epi32(buf0[11], buf0[27]);
-    buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]);
-    buf1[12] = _mm_add_epi32(buf0[12], buf0[28]);
-    buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]);
-    buf1[13] = _mm_add_epi32(buf0[13], buf0[29]);
-    buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]);
-    buf1[14] = _mm_add_epi32(buf0[14], buf0[30]);
-    buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]);
-    buf1[15] = _mm_add_epi32(buf0[15], buf0[31]);
-    buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]);
-
-    // stage 4
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    buf0[2] = buf1[2];
-    buf0[3] = buf1[3];
-    buf0[4] = buf1[4];
-    buf0[5] = buf1[5];
-    buf0[6] = buf1[6];
-    buf0[7] = buf1[7];
-    buf0[8] = buf1[8];
-    buf0[9] = buf1[9];
-    buf0[10] = buf1[10];
-    buf0[11] = buf1[11];
-    buf0[12] = buf1[12];
-    buf0[13] = buf1[13];
-    buf0[14] = buf1[14];
-    buf0[15] = buf1[15];
-    btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
-                        buf0[17], bit);
-    btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
-                        buf0[19], bit);
-    btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
-                        buf0[21], bit);
-    btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
-                        buf0[23], bit);
-    btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
-                        buf0[25], bit);
-    btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
-                        buf0[27], bit);
-    btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
-                        buf0[29], bit);
-    btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
-                        buf0[31], bit);
-
-    // stage 5
-    stage_idx++;
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
-    buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
-    buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
-    buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
-    buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
-    buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
-    buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
-    buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
-    buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
-    buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
-    buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
-    buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
-    buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
-    buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
-    buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
-    buf1[16] = _mm_add_epi32(buf0[16], buf0[24]);
-    buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]);
-    buf1[17] = _mm_add_epi32(buf0[17], buf0[25]);
-    buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]);
-    buf1[18] = _mm_add_epi32(buf0[18], buf0[26]);
-    buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]);
-    buf1[19] = _mm_add_epi32(buf0[19], buf0[27]);
-    buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]);
-    buf1[20] = _mm_add_epi32(buf0[20], buf0[28]);
-    buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]);
-    buf1[21] = _mm_add_epi32(buf0[21], buf0[29]);
-    buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]);
-    buf1[22] = _mm_add_epi32(buf0[22], buf0[30]);
-    buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]);
-    buf1[23] = _mm_add_epi32(buf0[23], buf0[31]);
-    buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]);
-
-    // stage 6
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    buf0[2] = buf1[2];
-    buf0[3] = buf1[3];
-    buf0[4] = buf1[4];
-    buf0[5] = buf1[5];
-    buf0[6] = buf1[6];
-    buf0[7] = buf1[7];
-    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
-                        bit);
-    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
-                        buf0[11], bit);
-    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
-                        buf0[13], bit);
-    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
-                        buf0[15], bit);
-    buf0[16] = buf1[16];
-    buf0[17] = buf1[17];
-    buf0[18] = buf1[18];
-    buf0[19] = buf1[19];
-    buf0[20] = buf1[20];
-    buf0[21] = buf1[21];
-    buf0[22] = buf1[22];
-    buf0[23] = buf1[23];
-    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
-                        buf0[25], bit);
-    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
-                        buf0[27], bit);
-    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
-                        buf0[29], bit);
-    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
-                        buf0[31], bit);
-
-    // stage 7
-    stage_idx++;
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
-    buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
-    buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
-    buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
-    buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
-    buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
-    buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
-    buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
-    buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
-    buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
-    buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
-    buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
-    buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
-    buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
-    buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
-    buf1[16] = _mm_add_epi32(buf0[16], buf0[20]);
-    buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]);
-    buf1[17] = _mm_add_epi32(buf0[17], buf0[21]);
-    buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]);
-    buf1[18] = _mm_add_epi32(buf0[18], buf0[22]);
-    buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]);
-    buf1[19] = _mm_add_epi32(buf0[19], buf0[23]);
-    buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]);
-    buf1[24] = _mm_add_epi32(buf0[24], buf0[28]);
-    buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]);
-    buf1[25] = _mm_add_epi32(buf0[25], buf0[29]);
-    buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]);
-    buf1[26] = _mm_add_epi32(buf0[26], buf0[30]);
-    buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]);
-    buf1[27] = _mm_add_epi32(buf0[27], buf0[31]);
-    buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]);
-
-    // stage 8
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    buf0[2] = buf1[2];
-    buf0[3] = buf1[3];
-    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
-                        buf0[5], bit);
-    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
-                        buf0[7], bit);
-    buf0[8] = buf1[8];
-    buf0[9] = buf1[9];
-    buf0[10] = buf1[10];
-    buf0[11] = buf1[11];
-    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
-                        buf0[13], bit);
-    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
-                        buf0[15], bit);
-    buf0[16] = buf1[16];
-    buf0[17] = buf1[17];
-    buf0[18] = buf1[18];
-    buf0[19] = buf1[19];
-    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
-                        buf0[21], bit);
-    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
-                        buf0[23], bit);
-    buf0[24] = buf1[24];
-    buf0[25] = buf1[25];
-    buf0[26] = buf1[26];
-    buf0[27] = buf1[27];
-    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
-                        buf0[29], bit);
-    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
-                        buf0[31], bit);
-
-    // stage 9
-    stage_idx++;
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
-    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
-    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
-    buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
-    buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
-    buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
-    buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
-    buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
-    buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
-    buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
-    buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
-    buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
-    buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
-    buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
-    buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
-    buf1[16] = _mm_add_epi32(buf0[16], buf0[18]);
-    buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]);
-    buf1[17] = _mm_add_epi32(buf0[17], buf0[19]);
-    buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]);
-    buf1[20] = _mm_add_epi32(buf0[20], buf0[22]);
-    buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]);
-    buf1[21] = _mm_add_epi32(buf0[21], buf0[23]);
-    buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]);
-    buf1[24] = _mm_add_epi32(buf0[24], buf0[26]);
-    buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]);
-    buf1[25] = _mm_add_epi32(buf0[25], buf0[27]);
-    buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]);
-    buf1[28] = _mm_add_epi32(buf0[28], buf0[30]);
-    buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]);
-    buf1[29] = _mm_add_epi32(buf0[29], buf0[31]);
-    buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]);
-
-    // stage 10
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr(bit);
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
-                        buf0[3], bit);
-    buf0[4] = buf1[4];
-    buf0[5] = buf1[5];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
-                        buf0[7], bit);
-    buf0[8] = buf1[8];
-    buf0[9] = buf1[9];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
-                        buf0[11], bit);
-    buf0[12] = buf1[12];
-    buf0[13] = buf1[13];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
-                        buf0[15], bit);
-    buf0[16] = buf1[16];
-    buf0[17] = buf1[17];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
-                        buf0[19], bit);
-    buf0[20] = buf1[20];
-    buf0[21] = buf1[21];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
-                        buf0[23], bit);
-    buf0[24] = buf1[24];
-    buf0[25] = buf1[25];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
-                        buf0[27], bit);
-    buf0[28] = buf1[28];
-    buf0[29] = buf1[29];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
-                        buf0[31], bit);
-
-    // stage 11
-    stage_idx++;
-    buf1[0] = buf0[0];
-    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]);
-    buf1[2] = buf0[24];
-    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
-    buf1[4] = buf0[12];
-    buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]);
-    buf1[6] = buf0[20];
-    buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
-    buf1[8] = buf0[6];
-    buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]);
-    buf1[10] = buf0[30];
-    buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
-    buf1[12] = buf0[10];
-    buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]);
-    buf1[14] = buf0[18];
-    buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
-    buf1[16] = buf0[3];
-    buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]);
-    buf1[18] = buf0[27];
-    buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
-    buf1[20] = buf0[15];
-    buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]);
-    buf1[22] = buf0[23];
-    buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
-    buf1[24] = buf0[5];
-    buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]);
-    buf1[26] = buf0[29];
-    buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
-    buf1[28] = buf0[9];
-    buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]);
-    buf1[30] = buf0[17];
-    buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
-
-    for (j = 0; j < 32; ++j) {
-      output[j * col_num + col] = buf1[j];
-    }
-  }
-}
diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
deleted file mode 100644
index 58ede028a..000000000
--- a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "av1/common/enums.h"
-#include "av1/common/av1_txfm.h"
-#include "av1/common/x86/av1_txfm1d_sse4.h"
-
-static INLINE void int16_array_with_stride_to_int32_array_without_stride(
-    const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
-  int r, c;
-  for (r = 0; r < txfm1d_size; r++) {
-    for (c = 0; c < txfm1d_size; c++) {
-      output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
-    }
-  }
-}
-
-typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
-                             const int8_t *cos_bit, const int8_t *stage_range);
-
-static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
-  switch (txfm_type) {
-    case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1; break;
-    case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1; break;
-    default: assert(0);
-  }
-  return NULL;
-}
-
-static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
-                                     const int stride,
-                                     const TXFM_2D_FLIP_CFG *cfg,
-                                     int32_t *txfm_buf) {
-  // TODO(sarahparker) This does not currently support rectangular transforms
-  // and will break without splitting txfm_size out into row and col size.
-  // Rectangular transforms use c code only, so it should be ok for now.
-  // It will be corrected when there are sse implementations for rectangular
-  // transforms.
-  assert(cfg->row_cfg->txfm_size == cfg->col_cfg->txfm_size);
-  const int txfm_size = cfg->row_cfg->txfm_size;
-  const int8_t *shift = cfg->row_cfg->shift;
-  const int8_t *stage_range_col = cfg->col_cfg->stage_range;
-  const int8_t *stage_range_row = cfg->row_cfg->stage_range;
-  const int8_t *cos_bit_col = cfg->col_cfg->cos_bit;
-  const int8_t *cos_bit_row = cfg->row_cfg->cos_bit;
-  const TxfmFuncSSE2 txfm_func_col =
-      fwd_txfm_type_to_func(cfg->col_cfg->txfm_type);
-  const TxfmFuncSSE2 txfm_func_row =
-      fwd_txfm_type_to_func(cfg->row_cfg->txfm_type);
-
-  __m128i *buf_128 = (__m128i *)txfm_buf;
-  __m128i *out_128 = (__m128i *)output;
-  int num_per_128 = 4;
-  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
-
-  int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
-                                                        txfm_size);
-  round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
-  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
-  round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
-  transpose_32(txfm_size, out_128, buf_128);
-  txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
-  round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
-  transpose_32(txfm_size, buf_128, out_128);
-}
-
-void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
-                                 int stride, TX_TYPE tx_type, int bd) {
-  DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
-  TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32);
-  (void)bd;
-  fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
-}
diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
index 68461bc36..212d3bd72 100644
--- a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
@@ -12,81 +12,14 @@
 #include <assert.h>
 #include <smmintrin.h>
 
-#include "./av1_rtcd.h"
-#include "av1/common/filter.h"
-
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]);
-#endif
-
-#if USE_TEMPORALFILTER_12TAP
-DECLARE_ALIGNED(16, static int16_t, subpel_temporalfilter[15][6][8]);
-#endif
+#include "config/av1_rtcd.h"
 
-typedef int16_t (*HbdSubpelFilterCoeffs)[8];
+#include "av1/common/filter.h"
 
 typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
                               int src_stride, uint16_t *dst, int dst_stride,
                               int bd);
 
-static INLINE HbdSubpelFilterCoeffs
-hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-  if (p.interp_filter == MULTITAP_SHARP) {
-    return &subpel_filters_sharp[index][0];
-  }
-#endif
-#if USE_TEMPORALFILTER_12TAP
-  if (p.interp_filter == TEMPORALFILTER_12TAP) {
-    return &subpel_temporalfilter[index][0];
-  }
-#endif
-  (void)p;
-  (void)index;
-  return NULL;
-}
-
-static void init_simd_filter(const int16_t *filter_ptr, int taps,
-                             int16_t (*simd_filter)[6][8]) {
-  int shift;
-  int offset = (12 - taps) / 2;
-  for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
-    const int16_t *filter_row = filter_ptr + shift * taps;
-    int i, j;
-    for (i = 0; i < 12; ++i) {
-      for (j = 0; j < 4; ++j) {
-        int r = i / 2;
-        int c = j * 2 + (i % 2);
-        if (i - offset >= 0 && i - offset < taps)
-          simd_filter[shift - 1][r][c] = filter_row[i - offset];
-        else
-          simd_filter[shift - 1][r][c] = 0;
-      }
-    }
-  }
-}
-
-void av1_highbd_convolve_init_sse4_1(void) {
-#if USE_TEMPORALFILTER_12TAP
-  {
-    InterpFilterParams filter_params =
-        av1_get_interp_filter_params(TEMPORALFILTER_12TAP);
-    int taps = filter_params.taps;
-    const int16_t *filter_ptr = filter_params.filter_ptr;
-    init_simd_filter(filter_ptr, taps, subpel_temporalfilter);
-  }
-#endif
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-  {
-    InterpFilterParams filter_params =
-        av1_get_interp_filter_params(MULTITAP_SHARP);
-    int taps = filter_params.taps;
-    const int16_t *filter_ptr = filter_params.filter_ptr;
-    init_simd_filter(filter_ptr, taps, subpel_filters_sharp);
-  }
-#endif
-}
-
 // pixelsNum 0: write all 4 pixels
 //           1/2/3: residual pixels 1/2/3
 static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
@@ -218,138 +151,6 @@ void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src,
   writePixel(u, width, pixelsNum, dst, dst_stride);
 }
 
-static TransposeSave transSaveTab[2] = { trans_save_4x4, trans_accum_save_4x4 };
-
-static INLINE void transpose_pair(__m128i *in, __m128i *out) {
-  __m128i x0, x1;
-
-  x0 = _mm_unpacklo_epi32(in[0], in[1]);
-  x1 = _mm_unpacklo_epi32(in[2], in[3]);
-
-  out[0] = _mm_unpacklo_epi64(x0, x1);
-  out[1] = _mm_unpackhi_epi64(x0, x1);
-
-  x0 = _mm_unpackhi_epi32(in[0], in[1]);
-  x1 = _mm_unpackhi_epi32(in[2], in[3]);
-
-  out[2] = _mm_unpacklo_epi64(x0, x1);
-  out[3] = _mm_unpackhi_epi64(x0, x1);
-
-  x0 = _mm_unpacklo_epi32(in[4], in[5]);
-  x1 = _mm_unpacklo_epi32(in[6], in[7]);
-
-  out[4] = _mm_unpacklo_epi64(x0, x1);
-  out[5] = _mm_unpackhi_epi64(x0, x1);
-}
-
-static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f,
-                                int tapsNum, uint32_t *buf) {
-  __m128i u[8], v[6];
-
-  assert(tapsNum == 10 || tapsNum == 12);
-  if (tapsNum == 10) {
-    src -= 1;
-  }
-
-  u[0] = _mm_loadu_si128((__m128i const *)src);
-  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
-  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-
-  u[4] = _mm_loadu_si128((__m128i const *)(src + 8));
-  u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8));
-  u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8));
-  u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8));
-
-  transpose_pair(u, v);
-
-  u[0] = _mm_madd_epi16(v[0], f[0]);
-  u[1] = _mm_madd_epi16(v[1], f[1]);
-  u[2] = _mm_madd_epi16(v[2], f[2]);
-  u[3] = _mm_madd_epi16(v[3], f[3]);
-  u[4] = _mm_madd_epi16(v[4], f[4]);
-  u[5] = _mm_madd_epi16(v[5], f[5]);
-
-  u[6] = _mm_min_epi32(u[2], u[3]);
-  u[7] = _mm_max_epi32(u[2], u[3]);
-
-  u[0] = _mm_add_epi32(u[0], u[1]);
-  u[0] = _mm_add_epi32(u[0], u[5]);
-  u[0] = _mm_add_epi32(u[0], u[4]);
-  u[0] = _mm_add_epi32(u[0], u[6]);
-  u[0] = _mm_add_epi32(u[0], u[7]);
-
-  _mm_storeu_si128((__m128i *)buf, u[0]);
-}
-
-void av1_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride,
-                                      uint16_t *dst, int dst_stride, int w,
-                                      int h,
-                                      const InterpFilterParams filter_params,
-                                      const int subpel_x_q4, int x_step_q4,
-                                      int avg, int bd) {
-  DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]);
-  __m128i verf[6];
-  HbdSubpelFilterCoeffs vCoeffs;
-  const uint16_t *srcPtr;
-  const int tapsNum = filter_params.taps;
-  int i, col, count, blkResidu, blkHeight;
-  TransposeSave transSave = transSaveTab[avg];
-  (void)x_step_q4;
-
-  if (0 == subpel_x_q4 || 16 != x_step_q4) {
-    av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
-                                filter_params, subpel_x_q4, x_step_q4, avg, bd);
-    return;
-  }
-
-  vCoeffs =
-      hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
-  if (!vCoeffs) {
-    av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
-                                filter_params, subpel_x_q4, x_step_q4, avg, bd);
-    return;
-  }
-
-  verf[0] = *((const __m128i *)(vCoeffs));
-  verf[1] = *((const __m128i *)(vCoeffs + 1));
-  verf[2] = *((const __m128i *)(vCoeffs + 2));
-  verf[3] = *((const __m128i *)(vCoeffs + 3));
-  verf[4] = *((const __m128i *)(vCoeffs + 4));
-  verf[5] = *((const __m128i *)(vCoeffs + 5));
-
-  src -= (tapsNum >> 1) - 1;
-  srcPtr = src;
-
-  count = 0;
-  blkHeight = h >> 2;
-  blkResidu = h & 3;
-
-  while (blkHeight != 0) {
-    for (col = 0; col < w; col += 4) {
-      for (i = 0; i < 4; ++i) {
-        highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
-        srcPtr += 1;
-      }
-      transSave(w, 0, temp, 4, dst + col, dst_stride, bd);
-    }
-    count++;
-    srcPtr = src + count * src_stride * 4;
-    dst += dst_stride * 4;
-    blkHeight--;
-  }
-
-  if (blkResidu == 0) return;
-
-  for (col = 0; col < w; col += 4) {
-    for (i = 0; i < 4; ++i) {
-      highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
-      srcPtr += 1;
-    }
-    transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd);
-  }
-}
-
 // Vertical convolutional filter
 
 typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
@@ -402,134 +203,3 @@ static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
 }
 
 WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum };
-
-static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
-                                       const __m128i *f, int taps,
-                                       uint16_t *dst, WritePixels saveFunc,
-                                       int bd) {
-  __m128i s[12];
-  __m128i zero = _mm_setzero_si128();
-  int i = 0;
-  int r = 0;
-
-  // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
-  assert(taps == 10 || taps == 12);
-  if (10 == taps) {
-    i += 1;
-    s[0] = zero;
-  }
-  while (i < 12) {
-    s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
-    i += 1;
-    r += 1;
-  }
-
-  s[0] = _mm_unpacklo_epi16(s[0], s[1]);
-  s[2] = _mm_unpacklo_epi16(s[2], s[3]);
-  s[4] = _mm_unpacklo_epi16(s[4], s[5]);
-  s[6] = _mm_unpacklo_epi16(s[6], s[7]);
-  s[8] = _mm_unpacklo_epi16(s[8], s[9]);
-  s[10] = _mm_unpacklo_epi16(s[10], s[11]);
-
-  s[0] = _mm_madd_epi16(s[0], f[0]);
-  s[2] = _mm_madd_epi16(s[2], f[1]);
-  s[4] = _mm_madd_epi16(s[4], f[2]);
-  s[6] = _mm_madd_epi16(s[6], f[3]);
-  s[8] = _mm_madd_epi16(s[8], f[4]);
-  s[10] = _mm_madd_epi16(s[10], f[5]);
-
-  s[1] = _mm_min_epi32(s[4], s[6]);
-  s[3] = _mm_max_epi32(s[4], s[6]);
-
-  s[0] = _mm_add_epi32(s[0], s[2]);
-  s[0] = _mm_add_epi32(s[0], s[10]);
-  s[0] = _mm_add_epi32(s[0], s[8]);
-  s[0] = _mm_add_epi32(s[0], s[1]);
-  s[0] = _mm_add_epi32(s[0], s[3]);
-
-  saveFunc(s, bd, dst);
-}
-
-static void highbd_filter_vert_compute_large(const uint16_t *src,
-                                             int src_stride, const __m128i *f,
-                                             int taps, int w, int h,
-                                             uint16_t *dst, int dst_stride,
-                                             int avg, int bd) {
-  int col;
-  int rowIndex = 0;
-  const uint16_t *src_ptr = src;
-  uint16_t *dst_ptr = dst;
-  const int step = 4;
-  WritePixels write4pixels = write4pixelsTab[avg];
-
-  do {
-    for (col = 0; col < w; col += step) {
-      filter_vert_horiz_parallel(src_ptr, src_stride, f, taps, dst_ptr,
-                                 write4pixels, bd);
-      src_ptr += step;
-      dst_ptr += step;
-    }
-    rowIndex++;
-    src_ptr = src + rowIndex * src_stride;
-    dst_ptr = dst + rowIndex * dst_stride;
-  } while (rowIndex < h);
-}
-
-static void highbd_filter_vert_compute_small(const uint16_t *src,
-                                             int src_stride, const __m128i *f,
-                                             int taps, int w, int h,
-                                             uint16_t *dst, int dst_stride,
-                                             int avg, int bd) {
-  int rowIndex = 0;
-  WritePixels write2pixels = write2pixelsTab[avg];
-  (void)w;
-
-  do {
-    filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels, bd);
-    rowIndex++;
-    src += src_stride;
-    dst += dst_stride;
-  } while (rowIndex < h);
-}
-
-void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
-                                     uint16_t *dst, int dst_stride, int w,
-                                     int h,
-                                     const InterpFilterParams filter_params,
-                                     const int subpel_y_q4, int y_step_q4,
-                                     int avg, int bd) {
-  __m128i verf[6];
-  HbdSubpelFilterCoeffs vCoeffs;
-  const int tapsNum = filter_params.taps;
-
-  if (0 == subpel_y_q4 || 16 != y_step_q4) {
-    av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
-                               filter_params, subpel_y_q4, y_step_q4, avg, bd);
-    return;
-  }
-
-  vCoeffs =
-      hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
-  if (!vCoeffs) {
-    av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
-                               filter_params, subpel_y_q4, y_step_q4, avg, bd);
-    return;
-  }
-
-  verf[0] = *((const __m128i *)(vCoeffs));
-  verf[1] = *((const __m128i *)(vCoeffs + 1));
-  verf[2] = *((const __m128i *)(vCoeffs + 2));
-  verf[3] = *((const __m128i *)(vCoeffs + 3));
-  verf[4] = *((const __m128i *)(vCoeffs + 4));
-  verf[5] = *((const __m128i *)(vCoeffs + 5));
-
-  src -= src_stride * ((tapsNum >> 1) - 1);
-
-  if (w > 2) {
-    highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h, dst,
-                                     dst_stride, avg, bd);
-  } else {
-    highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h, dst,
-                                     dst_stride, avg, bd);
-  }
-}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
new file mode 100644
index 000000000..7415c58df
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -0,0 +1,1957 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/av1_inv_txfm_avx2.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+
+static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
+                                      const __m256i __rounding,
+                                      int8_t cos_bit) {
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_avx2(x1[0], x1[3]);
+  btf_16_adds_subs_avx2(x1[1], x1[2]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x1[5], x1[6]);
+
+  btf_16_adds_subs_avx2(x1[8], x1[11]);
+  btf_16_adds_subs_avx2(x1[9], x1[10]);
+  btf_16_subs_adds_avx2(x1[15], x1[12]);
+  btf_16_subs_adds_avx2(x1[14], x1[13]);
+}
+
+static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i __rounding,
+                                      int8_t cos_bit) {
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_avx2(x[0], x[7]);
+  btf_16_adds_subs_avx2(x[1], x[6]);
+  btf_16_adds_subs_avx2(x[2], x[5]);
+  btf_16_adds_subs_avx2(x[3], x[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+}
+
+static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
+  btf_16_adds_subs_out_avx2(output[0], output[15], x1[0], x1[15]);
+  btf_16_adds_subs_out_avx2(output[1], output[14], x1[1], x1[14]);
+  btf_16_adds_subs_out_avx2(output[2], output[13], x1[2], x1[13]);
+  btf_16_adds_subs_out_avx2(output[3], output[12], x1[3], x1[12]);
+  btf_16_adds_subs_out_avx2(output[4], output[11], x1[4], x1[11]);
+  btf_16_adds_subs_out_avx2(output[5], output[10], x1[5], x1[10]);
+  btf_16_adds_subs_out_avx2(output[6], output[9], x1[6], x1[9]);
+  btf_16_adds_subs_out_avx2(output[7], output[8], x1[7], x1[8]);
+}
+
+static void idct16_new_avx2(const __m256i *input, __m256i *output,
+                            int8_t cos_bit) {
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+  // stage 1
+  __m256i x1[16];
+  x1[0] = input[0];
+  x1[1] = input[8];
+  x1[2] = input[4];
+  x1[3] = input[12];
+  x1[4] = input[2];
+  x1[5] = input[10];
+  x1[6] = input[6];
+  x1[7] = input[14];
+  x1[8] = input[1];
+  x1[9] = input[9];
+  x1[10] = input[5];
+  x1[11] = input[13];
+  x1[12] = input[3];
+  x1[13] = input[11];
+  x1[14] = input[7];
+  x1[15] = input[15];
+
+  // stage 2
+  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]);
+  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]);
+  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]);
+  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]);
+
+  // stage 3
+  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]);
+  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]);
+  btf_16_adds_subs_avx2(x1[8], x1[9]);
+  btf_16_subs_adds_avx2(x1[11], x1[10]);
+  btf_16_adds_subs_avx2(x1[12], x1[13]);
+  btf_16_subs_adds_avx2(x1[15], x1[14]);
+
+  // stage 4
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]);
+  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]);
+  btf_16_adds_subs_avx2(x1[4], x1[5]);
+  btf_16_subs_adds_avx2(x1[7], x1[6]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]);
+
+  idct16_stage5_avx2(x1, cospi, __rounding, cos_bit);
+  idct16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+  idct16_stage7_avx2(output, x1);
+}
+
+static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+  // stage 1
+  __m256i x1[16];
+  x1[0] = input[0];
+  x1[2] = input[4];
+  x1[4] = input[2];
+  x1[6] = input[6];
+  x1[8] = input[1];
+  x1[10] = input[5];
+  x1[12] = input[3];
+  x1[14] = input[7];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
+  btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
+  btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
+  btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);
+
+  // stage 3
+  btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
+  btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
+  btf_16_adds_subs_avx2(x1[8], x1[9]);
+  btf_16_subs_adds_avx2(x1[11], x1[10]);
+  btf_16_adds_subs_avx2(x1[12], x1[13]);
+  btf_16_subs_adds_avx2(x1[15], x1[14]);
+
+  // stage 4
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
+  btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
+  btf_16_adds_subs_avx2(x1[4], x1[5]);
+  btf_16_subs_adds_avx2(x1[7], x1[6]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]);
+
+  idct16_stage5_avx2(x1, cospi, __rounding, cos_bit);
+  idct16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+  idct16_stage7_avx2(output, x1);
+}
+
+static void idct16_low1_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // stage 1
+  __m256i x1[2];
+  x1[0] = input[0];
+
+  // stage 2
+  // stage 3
+  // stage 4
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
+
+  // stage 5
+  // stage 6
+  output[0] = x1[0];
+  output[1] = x1[1];
+  output[2] = x1[1];
+  output[3] = x1[0];
+  output[4] = x1[0];
+  output[5] = x1[1];
+  output[6] = x1[1];
+  output[7] = x1[0];
+  output[8] = x1[0];
+  output[9] = x1[1];
+  output[10] = x1[1];
+  output[11] = x1[0];
+  output[12] = x1[0];
+  output[13] = x1[1];
+  output[14] = x1[1];
+  output[15] = x1[0];
+}
+
+static INLINE void iadst16_stage3_avx2(__m256i *x) {
+  btf_16_adds_subs_avx2(x[0], x[8]);
+  btf_16_adds_subs_avx2(x[1], x[9]);
+  btf_16_adds_subs_avx2(x[2], x[10]);
+  btf_16_adds_subs_avx2(x[3], x[11]);
+  btf_16_adds_subs_avx2(x[4], x[12]);
+  btf_16_adds_subs_avx2(x[5], x[13]);
+  btf_16_adds_subs_avx2(x[6], x[14]);
+  btf_16_adds_subs_avx2(x[7], x[15]);
+}
+
+static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
+                                       const __m256i __rounding,
+                                       int8_t cos_bit) {
+  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+  const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+  const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+  const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage5_avx2(__m256i *x) {
+  btf_16_adds_subs_avx2(x[0], x[4]);
+  btf_16_adds_subs_avx2(x[1], x[5]);
+  btf_16_adds_subs_avx2(x[2], x[6]);
+  btf_16_adds_subs_avx2(x[3], x[7]);
+  btf_16_adds_subs_avx2(x[8], x[12]);
+  btf_16_adds_subs_avx2(x[9], x[13]);
+  btf_16_adds_subs_avx2(x[10], x[14]);
+  btf_16_adds_subs_avx2(x[11], x[15]);
+}
+
+static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                       const __m256i __rounding,
+                                       int8_t cos_bit) {
+  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage7_avx2(__m256i *x) {
+  btf_16_adds_subs_avx2(x[0], x[2]);
+  btf_16_adds_subs_avx2(x[1], x[3]);
+  btf_16_adds_subs_avx2(x[4], x[6]);
+  btf_16_adds_subs_avx2(x[5], x[7]);
+  btf_16_adds_subs_avx2(x[8], x[10]);
+  btf_16_adds_subs_avx2(x[9], x[11]);
+  btf_16_adds_subs_avx2(x[12], x[14]);
+  btf_16_adds_subs_avx2(x[13], x[15]);
+}
+
+static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
+                                       const __m256i __rounding,
+                                       int8_t cos_bit) {
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x1[2], x1[3]);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x1[6], x1[7]);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x1[10], x1[11]);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x1[14], x1[15]);
+}
+
+static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
+  const __m256i __zero = _mm256_setzero_si256();
+  output[0] = x1[0];
+  output[1] = _mm256_subs_epi16(__zero, x1[8]);
+  output[2] = x1[12];
+  output[3] = _mm256_subs_epi16(__zero, x1[4]);
+  output[4] = x1[6];
+  output[5] = _mm256_subs_epi16(__zero, x1[14]);
+  output[6] = x1[10];
+  output[7] = _mm256_subs_epi16(__zero, x1[2]);
+  output[8] = x1[3];
+  output[9] = _mm256_subs_epi16(__zero, x1[11]);
+  output[10] = x1[15];
+  output[11] = _mm256_subs_epi16(__zero, x1[7]);
+  output[12] = x1[5];
+  output[13] = _mm256_subs_epi16(__zero, x1[13]);
+  output[14] = x1[9];
+  output[15] = _mm256_subs_epi16(__zero, x1[1]);
+}
+
+static void iadst16_new_avx2(const __m256i *input, __m256i *output,
+                             int8_t cos_bit) {
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+  // stage 1
+  __m256i x1[16];
+  x1[0] = input[15];
+  x1[1] = input[0];
+  x1[2] = input[13];
+  x1[3] = input[2];
+  x1[4] = input[11];
+  x1[5] = input[4];
+  x1[6] = input[9];
+  x1[7] = input[6];
+  x1[8] = input[7];
+  x1[9] = input[8];
+  x1[10] = input[5];
+  x1[11] = input[10];
+  x1[12] = input[3];
+  x1[13] = input[12];
+  x1[14] = input[1];
+  x1[15] = input[14];
+
+  // stage 2
+  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, x1[0], x1[1], x1[0], x1[1]);
+  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, x1[2], x1[3], x1[2], x1[3]);
+  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, x1[4], x1[5], x1[4], x1[5]);
+  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, x1[6], x1[7], x1[6], x1[7]);
+  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, x1[8], x1[9], x1[8], x1[9]);
+  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, x1[10], x1[11], x1[10], x1[11]);
+  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, x1[12], x1[13], x1[12], x1[13]);
+  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, x1[14], x1[15], x1[14], x1[15]);
+
+  iadst16_stage3_avx2(x1);
+  iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage5_avx2(x1);
+  iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage7_avx2(x1);
+  iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage9_avx2(output, x1);
+}
+
+static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit) {
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 1
+  __m256i x1[16];
+  x1[1] = input[0];
+  x1[3] = input[2];
+  x1[5] = input[4];
+  x1[7] = input[6];
+  x1[8] = input[7];
+  x1[10] = input[5];
+  x1[12] = input[3];
+  x1[14] = input[1];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+  btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
+  btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
+  btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
+  btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
+  btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
+  btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
+  btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
+
+  iadst16_stage3_avx2(x1);
+  iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage5_avx2(x1);
+  iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage7_avx2(x1);
+  iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage9_avx2(output, x1);
+}
+
+static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit) {
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+
+  // stage 1
+  __m256i x1[16];
+  x1[1] = input[0];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+
+  // stage 3
+  x1[8] = x1[0];
+  x1[9] = x1[1];
+
+  // stage 4
+  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x1[8], x1[9], x1[8], x1[9]);
+
+  // stage 5
+  x1[4] = x1[0];
+  x1[5] = x1[1];
+
+  x1[12] = x1[8];
+  x1[13] = x1[9];
+
+  // stage 6
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[4], x1[5], x1[4], x1[5]);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[12], x1[13], x1[12], x1[13]);
+
+  // stage 7
+  x1[2] = x1[0];
+  x1[3] = x1[1];
+  x1[6] = x1[4];
+  x1[7] = x1[5];
+  x1[10] = x1[8];
+  x1[11] = x1[9];
+  x1[14] = x1[12];
+  x1[15] = x1[13];
+
+  iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+  iadst16_stage9_avx2(output, x1);
+}
+
+static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
+  btf_16_adds_subs_avx2(x[16], x[17]);
+  btf_16_subs_adds_avx2(x[19], x[18]);
+  btf_16_adds_subs_avx2(x[20], x[21]);
+  btf_16_subs_adds_avx2(x[23], x[22]);
+  btf_16_adds_subs_avx2(x[24], x[25]);
+  btf_16_subs_adds_avx2(x[27], x[26]);
+  btf_16_adds_subs_avx2(x[28], x[29]);
+  btf_16_subs_adds_avx2(x[31], x[30]);
+}
+
+static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i __rounding,
+                                             int8_t cos_bit) {
+  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+}
+
+static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i __rounding,
+                                             int8_t cos_bit) {
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+  btf_16_adds_subs_avx2(x[16], x[19]);
+  btf_16_adds_subs_avx2(x[17], x[18]);
+  btf_16_subs_adds_avx2(x[23], x[20]);
+  btf_16_subs_adds_avx2(x[22], x[21]);
+  btf_16_adds_subs_avx2(x[24], x[27]);
+  btf_16_adds_subs_avx2(x[25], x[26]);
+  btf_16_subs_adds_avx2(x[31], x[28]);
+  btf_16_subs_adds_avx2(x[30], x[29]);
+}
+
+static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i __rounding,
+                                             int8_t cos_bit) {
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+  btf_16_adds_subs_avx2(x[8], x[11]);
+  btf_16_adds_subs_avx2(x[9], x[10]);
+  btf_16_subs_adds_avx2(x[15], x[12]);
+  btf_16_subs_adds_avx2(x[14], x[13]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+}
+
+static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i __rounding,
+                                      int8_t cos_bit) {
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_avx2(x[0], x[7]);
+  btf_16_adds_subs_avx2(x[1], x[6]);
+  btf_16_adds_subs_avx2(x[2], x[5]);
+  btf_16_adds_subs_avx2(x[3], x[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+  btf_16_adds_subs_avx2(x[16], x[23]);
+  btf_16_adds_subs_avx2(x[17], x[22]);
+  btf_16_adds_subs_avx2(x[18], x[21]);
+  btf_16_adds_subs_avx2(x[19], x[20]);
+  btf_16_subs_adds_avx2(x[31], x[24]);
+  btf_16_subs_adds_avx2(x[30], x[25]);
+  btf_16_subs_adds_avx2(x[29], x[26]);
+  btf_16_subs_adds_avx2(x[28], x[27]);
+}
+
+static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i __rounding,
+                                      int8_t cos_bit) {
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_avx2(x[0], x[15]);
+  btf_16_adds_subs_avx2(x[1], x[14]);
+  btf_16_adds_subs_avx2(x[2], x[13]);
+  btf_16_adds_subs_avx2(x[3], x[12]);
+  btf_16_adds_subs_avx2(x[4], x[11]);
+  btf_16_adds_subs_avx2(x[5], x[10]);
+  btf_16_adds_subs_avx2(x[6], x[9]);
+  btf_16_adds_subs_avx2(x[7], x[8]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+}
+
+static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
+  btf_16_adds_subs_out_avx2(output[0], output[31], x[0], x[31]);
+  btf_16_adds_subs_out_avx2(output[1], output[30], x[1], x[30]);
+  btf_16_adds_subs_out_avx2(output[2], output[29], x[2], x[29]);
+  btf_16_adds_subs_out_avx2(output[3], output[28], x[3], x[28]);
+  btf_16_adds_subs_out_avx2(output[4], output[27], x[4], x[27]);
+  btf_16_adds_subs_out_avx2(output[5], output[26], x[5], x[26]);
+  btf_16_adds_subs_out_avx2(output[6], output[25], x[6], x[25]);
+  btf_16_adds_subs_out_avx2(output[7], output[24], x[7], x[24]);
+  btf_16_adds_subs_out_avx2(output[8], output[23], x[8], x[23]);
+  btf_16_adds_subs_out_avx2(output[9], output[22], x[9], x[22]);
+  btf_16_adds_subs_out_avx2(output[10], output[21], x[10], x[21]);
+  btf_16_adds_subs_out_avx2(output[11], output[20], x[11], x[20]);
+  btf_16_adds_subs_out_avx2(output[12], output[19], x[12], x[19]);
+  btf_16_adds_subs_out_avx2(output[13], output[18], x[13], x[18]);
+  btf_16_adds_subs_out_avx2(output[14], output[17], x[14], x[17]);
+  btf_16_adds_subs_out_avx2(output[15], output[16], x[15], x[16]);
+}
+
+static void idct32_low1_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // stage 1
+  __m256i x[2];
+  x[0] = input[0];
+
+  // stage 2
+  // stage 3
+  // stage 4
+  // stage 5
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+  // stage 6
+  // stage 7
+  // stage 8
+  // stage 9
+  output[0] = x[0];
+  output[31] = x[0];
+  output[1] = x[1];
+  output[30] = x[1];
+  output[2] = x[1];
+  output[29] = x[1];
+  output[3] = x[0];
+  output[28] = x[0];
+  output[4] = x[0];
+  output[27] = x[0];
+  output[5] = x[1];
+  output[26] = x[1];
+  output[6] = x[1];
+  output[25] = x[1];
+  output[7] = x[0];
+  output[24] = x[0];
+  output[8] = x[0];
+  output[23] = x[0];
+  output[9] = x[1];
+  output[22] = x[1];
+  output[10] = x[1];
+  output[21] = x[1];
+  output[11] = x[0];
+  output[20] = x[0];
+  output[12] = x[0];
+  output[19] = x[0];
+  output[13] = x[1];
+  output[18] = x[1];
+  output[14] = x[1];
+  output[17] = x[1];
+  output[15] = x[0];
+  output[16] = x[0];
+}
+
+static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 1
+  __m256i x[32];
+  x[0] = input[0];
+  x[4] = input[4];
+  x[8] = input[2];
+  x[12] = input[6];
+  x[16] = input[1];
+  x[20] = input[5];
+  x[24] = input[3];
+  x[28] = input[7];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+  // stage 3
+  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+  x[17] = x[16];
+  x[18] = x[19];
+  x[21] = x[20];
+  x[22] = x[23];
+  x[25] = x[24];
+  x[26] = x[27];
+  x[29] = x[28];
+  x[30] = x[31];
+
+  // stage 4
+  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+  x[9] = x[8];
+  x[10] = x[11];
+  x[13] = x[12];
+  x[14] = x[15];
+  idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 5
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+  x[5] = x[4];
+  x[6] = x[7];
+  idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);
+  // stage 6
+  x[3] = x[0];
+  x[2] = x[1];
+  idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);
+
+  idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
+  idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
+  idct32_stage9_avx2(output, x);
+}
+
+static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 1
+  __m256i x[32];
+  x[0] = input[0];
+  x[2] = input[8];
+  x[4] = input[4];
+  x[6] = input[12];
+  x[8] = input[2];
+  x[10] = input[10];
+  x[12] = input[6];
+  x[14] = input[14];
+  x[16] = input[1];
+  x[18] = input[9];
+  x[20] = input[5];
+  x[22] = input[13];
+  x[24] = input[3];
+  x[26] = input[11];
+  x[28] = input[7];
+  x[30] = input[15];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
+  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
+  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
+  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
+  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+  // stage 3
+  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
+  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
+  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+  idct32_high16_stage3_avx2(x);
+
+  // stage 4
+  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
+  btf_16_adds_subs_avx2(x[8], x[9]);
+  btf_16_subs_adds_avx2(x[11], x[10]);
+  btf_16_adds_subs_avx2(x[12], x[13]);
+  btf_16_subs_adds_avx2(x[15], x[14]);
+  idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 5
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
+  btf_16_adds_subs_avx2(x[4], x[5]);
+  btf_16_subs_adds_avx2(x[7], x[6]);
+  idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);
+
+  btf_16_adds_subs_avx2(x[0], x[3]);
+  btf_16_adds_subs_avx2(x[1], x[2]);
+  idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);
+
+  idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
+  idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
+  idct32_stage9_avx2(output, x);
+}
+
+static void idct32_new_avx2(const __m256i *input, __m256i *output,
+                            int8_t cos_bit) {
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+
+  // stage 1
+  __m256i x1[32];
+  x1[0] = input[0];
+  x1[1] = input[16];
+  x1[2] = input[8];
+  x1[3] = input[24];
+  x1[4] = input[4];
+  x1[5] = input[20];
+  x1[6] = input[12];
+  x1[7] = input[28];
+  x1[8] = input[2];
+  x1[9] = input[18];
+  x1[10] = input[10];
+  x1[11] = input[26];
+  x1[12] = input[6];
+  x1[13] = input[22];
+  x1[14] = input[14];
+  x1[15] = input[30];
+  x1[16] = input[1];
+  x1[17] = input[17];
+  x1[18] = input[9];
+  x1[19] = input[25];
+  x1[20] = input[5];
+  x1[21] = input[21];
+  x1[22] = input[13];
+  x1[23] = input[29];
+  x1[24] = input[3];
+  x1[25] = input[19];
+  x1[26] = input[11];
+  x1[27] = input[27];
+  x1[28] = input[7];
+  x1[29] = input[23];
+  x1[30] = input[15];
+  x1[31] = input[31];
+
+  // stage 2
+  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, x1[16], x1[31], x1[16], x1[31]);
+  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, x1[17], x1[30], x1[17], x1[30]);
+  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, x1[18], x1[29], x1[18], x1[29]);
+  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, x1[19], x1[28], x1[19], x1[28]);
+  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, x1[20], x1[27], x1[20], x1[27]);
+  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, x1[21], x1[26], x1[21], x1[26]);
+  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, x1[22], x1[25], x1[22], x1[25]);
+  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, x1[23], x1[24], x1[23], x1[24]);
+
+  // stage 3
+  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]);
+  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]);
+  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]);
+  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]);
+  idct32_high16_stage3_avx2(x1);
+
+  // stage 4
+  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]);
+  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]);
+  btf_16_adds_subs_avx2(x1[8], x1[9]);
+  btf_16_subs_adds_avx2(x1[11], x1[10]);
+  btf_16_adds_subs_avx2(x1[12], x1[13]);
+  btf_16_subs_adds_avx2(x1[15], x1[14]);
+  idct32_high16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+
+  // stage 5
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]);
+  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]);
+  btf_16_adds_subs_avx2(x1[4], x1[5]);
+  btf_16_subs_adds_avx2(x1[7], x1[6]);
+  idct32_high24_stage5_avx2(x1, cospi, __rounding, cos_bit);
+
+  // stage 6
+  btf_16_adds_subs_avx2(x1[0], x1[3]);
+  btf_16_adds_subs_avx2(x1[1], x1[2]);
+  idct32_high28_stage6_avx2(x1, cospi, __rounding, cos_bit);
+
+  idct32_stage7_avx2(x1, cospi, __rounding, cos_bit);
+  idct32_stage8_avx2(x1, cospi, __rounding, cos_bit);
+  idct32_stage9_avx2(output, x1);
+}
+
+static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i __rounding,
+                                             int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
+  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
+  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
+  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
+  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
+  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
+  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+}
+
+static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i __rounding,
+                                             int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+  btf_16_adds_subs_avx2(x[32], x[35]);
+  btf_16_adds_subs_avx2(x[33], x[34]);
+  btf_16_subs_adds_avx2(x[39], x[36]);
+  btf_16_subs_adds_avx2(x[38], x[37]);
+  btf_16_adds_subs_avx2(x[40], x[43]);
+  btf_16_adds_subs_avx2(x[41], x[42]);
+  btf_16_subs_adds_avx2(x[47], x[44]);
+  btf_16_subs_adds_avx2(x[46], x[45]);
+  btf_16_adds_subs_avx2(x[48], x[51]);
+  btf_16_adds_subs_avx2(x[49], x[50]);
+  btf_16_subs_adds_avx2(x[55], x[52]);
+  btf_16_subs_adds_avx2(x[54], x[53]);
+  btf_16_adds_subs_avx2(x[56], x[59]);
+  btf_16_adds_subs_avx2(x[57], x[58]);
+  btf_16_subs_adds_avx2(x[63], x[60]);
+  btf_16_subs_adds_avx2(x[62], x[61]);
+}
+
+static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i __rounding,
+                                             int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
+}
+
+static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i __rounding,
+                                             int8_t cos_bit) {
+  btf_16_adds_subs_avx2(x[16], x[19]);
+  btf_16_adds_subs_avx2(x[17], x[18]);
+  btf_16_subs_adds_avx2(x[23], x[20]);
+  btf_16_subs_adds_avx2(x[22], x[21]);
+  btf_16_adds_subs_avx2(x[24], x[27]);
+  btf_16_adds_subs_avx2(x[25], x[26]);
+  btf_16_subs_adds_avx2(x[31], x[28]);
+  btf_16_subs_adds_avx2(x[30], x[29]);
+  idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);
+}
+
+static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i __rounding,
+                                             int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+  btf_16_adds_subs_avx2(x[32], x[39]);
+  btf_16_adds_subs_avx2(x[33], x[38]);
+  btf_16_adds_subs_avx2(x[34], x[37]);
+  btf_16_adds_subs_avx2(x[35], x[36]);
+  btf_16_subs_adds_avx2(x[47], x[40]);
+  btf_16_subs_adds_avx2(x[46], x[41]);
+  btf_16_subs_adds_avx2(x[45], x[42]);
+  btf_16_subs_adds_avx2(x[44], x[43]);
+  btf_16_adds_subs_avx2(x[48], x[55]);
+  btf_16_adds_subs_avx2(x[49], x[54]);
+  btf_16_adds_subs_avx2(x[50], x[53]);
+  btf_16_adds_subs_avx2(x[51], x[52]);
+  btf_16_subs_adds_avx2(x[63], x[56]);
+  btf_16_subs_adds_avx2(x[62], x[57]);
+  btf_16_subs_adds_avx2(x[61], x[58]);
+  btf_16_subs_adds_avx2(x[60], x[59]);
+}
+
+static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i __rounding,
+                                             int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  btf_16_adds_subs_avx2(x[16], x[23]);
+  btf_16_adds_subs_avx2(x[17], x[22]);
+  btf_16_adds_subs_avx2(x[18], x[21]);
+  btf_16_adds_subs_avx2(x[19], x[20]);
+  btf_16_subs_adds_avx2(x[31], x[24]);
+  btf_16_subs_adds_avx2(x[30], x[25]);
+  btf_16_subs_adds_avx2(x[29], x[26]);
+  btf_16_subs_adds_avx2(x[28], x[27]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
+}
+
+static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i __rounding,
+                                      int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_avx2(x[0], x[15]);
+  btf_16_adds_subs_avx2(x[1], x[14]);
+  btf_16_adds_subs_avx2(x[2], x[13]);
+  btf_16_adds_subs_avx2(x[3], x[12]);
+  btf_16_adds_subs_avx2(x[4], x[11]);
+  btf_16_adds_subs_avx2(x[5], x[10]);
+  btf_16_adds_subs_avx2(x[6], x[9]);
+  btf_16_adds_subs_avx2(x[7], x[8]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+  btf_16_adds_subs_avx2(x[32], x[47]);
+  btf_16_adds_subs_avx2(x[33], x[46]);
+  btf_16_adds_subs_avx2(x[34], x[45]);
+  btf_16_adds_subs_avx2(x[35], x[44]);
+  btf_16_adds_subs_avx2(x[36], x[43]);
+  btf_16_adds_subs_avx2(x[37], x[42]);
+  btf_16_adds_subs_avx2(x[38], x[41]);
+  btf_16_adds_subs_avx2(x[39], x[40]);
+  btf_16_subs_adds_avx2(x[63], x[48]);
+  btf_16_subs_adds_avx2(x[62], x[49]);
+  btf_16_subs_adds_avx2(x[61], x[50]);
+  btf_16_subs_adds_avx2(x[60], x[51]);
+  btf_16_subs_adds_avx2(x[59], x[52]);
+  btf_16_subs_adds_avx2(x[58], x[53]);
+  btf_16_subs_adds_avx2(x[57], x[54]);
+  btf_16_subs_adds_avx2(x[56], x[55]);
+}
+
+static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
+                                       const __m256i __rounding,
+                                       int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_avx2(x[0], x[31]);
+  btf_16_adds_subs_avx2(x[1], x[30]);
+  btf_16_adds_subs_avx2(x[2], x[29]);
+  btf_16_adds_subs_avx2(x[3], x[28]);
+  btf_16_adds_subs_avx2(x[4], x[27]);
+  btf_16_adds_subs_avx2(x[5], x[26]);
+  btf_16_adds_subs_avx2(x[6], x[25]);
+  btf_16_adds_subs_avx2(x[7], x[24]);
+  btf_16_adds_subs_avx2(x[8], x[23]);
+  btf_16_adds_subs_avx2(x[9], x[22]);
+  btf_16_adds_subs_avx2(x[10], x[21]);
+  btf_16_adds_subs_avx2(x[11], x[20]);
+  btf_16_adds_subs_avx2(x[12], x[19]);
+  btf_16_adds_subs_avx2(x[13], x[18]);
+  btf_16_adds_subs_avx2(x[14], x[17]);
+  btf_16_adds_subs_avx2(x[15], x[16]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
+}
+
+static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
+  btf_16_adds_subs_out_avx2(output[0], output[63], x[0], x[63]);
+  btf_16_adds_subs_out_avx2(output[1], output[62], x[1], x[62]);
+  btf_16_adds_subs_out_avx2(output[2], output[61], x[2], x[61]);
+  btf_16_adds_subs_out_avx2(output[3], output[60], x[3], x[60]);
+  btf_16_adds_subs_out_avx2(output[4], output[59], x[4], x[59]);
+  btf_16_adds_subs_out_avx2(output[5], output[58], x[5], x[58]);
+  btf_16_adds_subs_out_avx2(output[6], output[57], x[6], x[57]);
+  btf_16_adds_subs_out_avx2(output[7], output[56], x[7], x[56]);
+  btf_16_adds_subs_out_avx2(output[8], output[55], x[8], x[55]);
+  btf_16_adds_subs_out_avx2(output[9], output[54], x[9], x[54]);
+  btf_16_adds_subs_out_avx2(output[10], output[53], x[10], x[53]);
+  btf_16_adds_subs_out_avx2(output[11], output[52], x[11], x[52]);
+  btf_16_adds_subs_out_avx2(output[12], output[51], x[12], x[51]);
+  btf_16_adds_subs_out_avx2(output[13], output[50], x[13], x[50]);
+  btf_16_adds_subs_out_avx2(output[14], output[49], x[14], x[49]);
+  btf_16_adds_subs_out_avx2(output[15], output[48], x[15], x[48]);
+  btf_16_adds_subs_out_avx2(output[16], output[47], x[16], x[47]);
+  btf_16_adds_subs_out_avx2(output[17], output[46], x[17], x[46]);
+  btf_16_adds_subs_out_avx2(output[18], output[45], x[18], x[45]);
+  btf_16_adds_subs_out_avx2(output[19], output[44], x[19], x[44]);
+  btf_16_adds_subs_out_avx2(output[20], output[43], x[20], x[43]);
+  btf_16_adds_subs_out_avx2(output[21], output[42], x[21], x[42]);
+  btf_16_adds_subs_out_avx2(output[22], output[41], x[22], x[41]);
+  btf_16_adds_subs_out_avx2(output[23], output[40], x[23], x[40]);
+  btf_16_adds_subs_out_avx2(output[24], output[39], x[24], x[39]);
+  btf_16_adds_subs_out_avx2(output[25], output[38], x[25], x[38]);
+  btf_16_adds_subs_out_avx2(output[26], output[37], x[26], x[37]);
+  btf_16_adds_subs_out_avx2(output[27], output[36], x[27], x[36]);
+  btf_16_adds_subs_out_avx2(output[28], output[35], x[28], x[35]);
+  btf_16_adds_subs_out_avx2(output[29], output[34], x[29], x[34]);
+  btf_16_adds_subs_out_avx2(output[30], output[33], x[30], x[33]);
+  btf_16_adds_subs_out_avx2(output[31], output[32], x[31], x[32]);
+}
+
+static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // stage 1
+  __m256i x[32];
+  x[0] = input[0];
+
+  // stage 2
+  // stage 3
+  // stage 4
+  // stage 5
+  // stage 6
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+  // stage 7
+  // stage 8
+  // stage 9
+  // stage 10
+  // stage 11
+  output[0] = x[0];
+  output[63] = x[0];
+  output[1] = x[1];
+  output[62] = x[1];
+  output[2] = x[1];
+  output[61] = x[1];
+  output[3] = x[0];
+  output[60] = x[0];
+  output[4] = x[0];
+  output[59] = x[0];
+  output[5] = x[1];
+  output[58] = x[1];
+  output[6] = x[1];
+  output[57] = x[1];
+  output[7] = x[0];
+  output[56] = x[0];
+  output[8] = x[0];
+  output[55] = x[0];
+  output[9] = x[1];
+  output[54] = x[1];
+  output[10] = x[1];
+  output[53] = x[1];
+  output[11] = x[0];
+  output[52] = x[0];
+  output[12] = x[0];
+  output[51] = x[0];
+  output[13] = x[1];
+  output[50] = x[1];
+  output[14] = x[1];
+  output[49] = x[1];
+  output[15] = x[0];
+  output[48] = x[0];
+  output[16] = x[0];
+  output[47] = x[0];
+  output[17] = x[1];
+  output[46] = x[1];
+  output[18] = x[1];
+  output[45] = x[1];
+  output[19] = x[0];
+  output[44] = x[0];
+  output[20] = x[0];
+  output[43] = x[0];
+  output[21] = x[1];
+  output[42] = x[1];
+  output[22] = x[1];
+  output[41] = x[1];
+  output[23] = x[0];
+  output[40] = x[0];
+  output[24] = x[0];
+  output[39] = x[0];
+  output[25] = x[1];
+  output[38] = x[1];
+  output[26] = x[1];
+  output[37] = x[1];
+  output[27] = x[0];
+  output[36] = x[0];
+  output[28] = x[0];
+  output[35] = x[0];
+  output[29] = x[1];
+  output[34] = x[1];
+  output[30] = x[1];
+  output[33] = x[1];
+  output[31] = x[0];
+  output[32] = x[0];
+}
+
+static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+  // stage 1
+  __m256i x[64];
+  x[0] = input[0];
+  x[8] = input[4];
+  x[16] = input[2];
+  x[24] = input[6];
+  x[32] = input[1];
+  x[40] = input[5];
+  x[48] = input[3];
+  x[56] = input[7];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+  // stage 3
+  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+  x[33] = x[32];
+  x[38] = x[39];
+  x[41] = x[40];
+  x[46] = x[47];
+  x[49] = x[48];
+  x[54] = x[55];
+  x[57] = x[56];
+  x[62] = x[63];
+
+  // stage 4
+  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+  x[17] = x[16];
+  x[22] = x[23];
+  x[25] = x[24];
+  x[30] = x[31];
+  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+
+  // stage 5
+  x[9] = x[8];
+  x[14] = x[15];
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+  x[35] = x[32];
+  x[34] = x[33];
+  x[36] = x[39];
+  x[37] = x[38];
+  x[43] = x[40];
+  x[42] = x[41];
+  x[44] = x[47];
+  x[45] = x[46];
+  x[51] = x[48];
+  x[50] = x[49];
+  x[52] = x[55];
+  x[53] = x[54];
+  x[59] = x[56];
+  x[58] = x[57];
+  x[60] = x[63];
+  x[61] = x[62];
+
+  // stage 6
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  x[19] = x[16];
+  x[18] = x[17];
+  x[20] = x[23];
+  x[21] = x[22];
+  x[27] = x[24];
+  x[26] = x[25];
+  x[28] = x[31];
+  x[29] = x[30];
+  idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 7
+  x[3] = x[0];
+  x[2] = x[1];
+  x[11] = x[8];
+  x[10] = x[9];
+  x[12] = x[15];
+  x[13] = x[14];
+  idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 8
+  x[7] = x[0];
+  x[6] = x[1];
+  x[5] = x[2];
+  x[4] = x[3];
+  x[9] = x[9];
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+  idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+
+  idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage11_avx2(output, x);
+}
+
+static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+  // stage 1
+  __m256i x[64];
+  x[0] = input[0];
+  x[4] = input[8];
+  x[8] = input[4];
+  x[12] = input[12];
+  x[16] = input[2];
+  x[20] = input[10];
+  x[24] = input[6];
+  x[28] = input[14];
+  x[32] = input[1];
+  x[36] = input[9];
+  x[40] = input[5];
+  x[44] = input[13];
+  x[48] = input[3];
+  x[52] = input[11];
+  x[56] = input[7];
+  x[60] = input[15];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+  // stage 3
+  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+  x[33] = x[32];
+  x[34] = x[35];
+  x[37] = x[36];
+  x[38] = x[39];
+  x[41] = x[40];
+  x[42] = x[43];
+  x[45] = x[44];
+  x[46] = x[47];
+  x[49] = x[48];
+  x[50] = x[51];
+  x[53] = x[52];
+  x[54] = x[55];
+  x[57] = x[56];
+  x[58] = x[59];
+  x[61] = x[60];
+  x[62] = x[63];
+
+  // stage 4
+  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+  x[17] = x[16];
+  x[18] = x[19];
+  x[21] = x[20];
+  x[22] = x[23];
+  x[25] = x[24];
+  x[26] = x[27];
+  x[29] = x[28];
+  x[30] = x[31];
+  idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 5
+  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+  x[9] = x[8];
+  x[10] = x[11];
+  x[13] = x[12];
+  x[14] = x[15];
+  idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 6
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+  x[5] = x[4];
+  x[6] = x[7];
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+  idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 7
+  x[3] = x[0];
+  x[2] = x[1];
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+  btf_16_adds_subs_avx2(x[8], x[11]);
+  btf_16_adds_subs_avx2(x[9], x[10]);
+  btf_16_subs_adds_avx2(x[15], x[12]);
+  btf_16_subs_adds_avx2(x[14], x[13]);
+  idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 8
+  btf_16_adds_subs_avx2(x[0], x[7]);
+  btf_16_adds_subs_avx2(x[1], x[6]);
+  btf_16_adds_subs_avx2(x[2], x[5]);
+  btf_16_adds_subs_avx2(x[3], x[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+  idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+
+  idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage11_avx2(output, x);
+}
+
+static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+  // stage 1
+  __m256i x[64];
+  x[0] = input[0];
+  x[2] = input[16];
+  x[4] = input[8];
+  x[6] = input[24];
+  x[8] = input[4];
+  x[10] = input[20];
+  x[12] = input[12];
+  x[14] = input[28];
+  x[16] = input[2];
+  x[18] = input[18];
+  x[20] = input[10];
+  x[22] = input[26];
+  x[24] = input[6];
+  x[26] = input[22];
+  x[28] = input[14];
+  x[30] = input[30];
+  x[32] = input[1];
+  x[34] = input[17];
+  x[36] = input[9];
+  x[38] = input[25];
+  x[40] = input[5];
+  x[42] = input[21];
+  x[44] = input[13];
+  x[46] = input[29];
+  x[48] = input[3];
+  x[50] = input[19];
+  x[52] = input[11];
+  x[54] = input[27];
+  x[56] = input[7];
+  x[58] = input[23];
+  x[60] = input[15];
+  x[62] = input[31];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
+  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
+  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
+  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
+  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
+  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
+  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
+  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
+  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+  // stage 3
+  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
+  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
+  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
+  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
+  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+  btf_16_adds_subs_avx2(x[32], x[33]);
+  btf_16_subs_adds_avx2(x[35], x[34]);
+  btf_16_adds_subs_avx2(x[36], x[37]);
+  btf_16_subs_adds_avx2(x[39], x[38]);
+  btf_16_adds_subs_avx2(x[40], x[41]);
+  btf_16_subs_adds_avx2(x[43], x[42]);
+  btf_16_adds_subs_avx2(x[44], x[45]);
+  btf_16_subs_adds_avx2(x[47], x[46]);
+  btf_16_adds_subs_avx2(x[48], x[49]);
+  btf_16_subs_adds_avx2(x[51], x[50]);
+  btf_16_adds_subs_avx2(x[52], x[53]);
+  btf_16_subs_adds_avx2(x[55], x[54]);
+  btf_16_adds_subs_avx2(x[56], x[57]);
+  btf_16_subs_adds_avx2(x[59], x[58]);
+  btf_16_adds_subs_avx2(x[60], x[61]);
+  btf_16_subs_adds_avx2(x[63], x[62]);
+
+  // stage 4
+  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
+  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
+  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+  btf_16_adds_subs_avx2(x[16], x[17]);
+  btf_16_subs_adds_avx2(x[19], x[18]);
+  btf_16_adds_subs_avx2(x[20], x[21]);
+  btf_16_subs_adds_avx2(x[23], x[22]);
+  btf_16_adds_subs_avx2(x[24], x[25]);
+  btf_16_subs_adds_avx2(x[27], x[26]);
+  btf_16_adds_subs_avx2(x[28], x[29]);
+  btf_16_subs_adds_avx2(x[31], x[30]);
+  idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 5
+  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
+  btf_16_adds_subs_avx2(x[8], x[9]);
+  btf_16_subs_adds_avx2(x[11], x[10]);
+  btf_16_adds_subs_avx2(x[12], x[13]);
+  btf_16_subs_adds_avx2(x[15], x[14]);
+  idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 6
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
+  btf_16_adds_subs_avx2(x[4], x[5]);
+  btf_16_subs_adds_avx2(x[7], x[6]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+  idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 7
+  btf_16_adds_subs_avx2(x[0], x[3]);
+  btf_16_adds_subs_avx2(x[1], x[2]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+  btf_16_adds_subs_avx2(x[8], x[11]);
+  btf_16_adds_subs_avx2(x[9], x[10]);
+  btf_16_subs_adds_avx2(x[15], x[12]);
+  btf_16_subs_adds_avx2(x[14], x[13]);
+  idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 8
+  btf_16_adds_subs_avx2(x[0], x[7]);
+  btf_16_adds_subs_avx2(x[1], x[6]);
+  btf_16_adds_subs_avx2(x[2], x[5]);
+  btf_16_adds_subs_avx2(x[3], x[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+  idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+
+  // stage 9~11
+  idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+  idct64_stage11_avx2(output, x);
+}
+
+// 1D functions process 16 pixels at one time.
+static const transform_1d_avx2
+    lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      {
+          { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2, NULL },
+          { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2,
+            NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2,
+          idct32_new_avx2 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2,
+          idct64_low32_new_avx2 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+// only process w >= 16 h >= 16
+static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  __m256i buf1[64 * 16];
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div16 = txfm_size_col >> 4;
+  const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4;
+  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_avx2 row_txfm =
+      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_avx2 col_txfm =
+      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
+    __m256i buf0[64];
+    const int32_t *input_row = input + (i << 4) * input_stride;
+    for (int j = 0; j < buf_size_nonzero_w_div16; ++j) {
+      __m256i *buf0_cur = buf0 + j * 16;
+      const int32_t *input_cur = input_row + j * 16;
+      load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur,
+                                          16);
+      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
+    }
+    row_txfm(buf0, buf0, cos_bit_row);
+    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
+
+    __m256i *buf1_cur = buf1 + (i << 4);
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div16; ++j) {
+        __m256i temp[16];
+        flip_buf_av2(buf0 + 16 * j, temp, 16);
+        int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
+        transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div16; ++j) {
+        transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
+      }
+    }
+  }
+  for (int i = 0; i < buf_size_w_div16; i++) {
+    __m256i *buf1_cur = buf1 + i * txfm_size_row;
+    col_txfm(buf1_cur, buf1_cur, cos_bit_col);
+    round_shift_16bit_w16_avx2(buf1_cur, txfm_size_row, shift[1]);
+  }
+  for (int i = 0; i < buf_size_w_div16; i++) {
+    lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
+                                 stride, ud_flip, txfm_size_row);
+  }
+}
+
+static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
+                                           int stride, int shift, int height,
+                                           int txw_idx, int rect_type) {
+  const int32_t *input_row = input;
+  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
+  const __m256i rounding = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+                                             (1 << (NewSqrt2Bits - shift - 1)));
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i scale_rounding = _mm256_unpacklo_epi16(scale, rounding);
+  if (rect_type != 1 && rect_type != -1) {
+    for (int i = 0; i < height; ++i) {
+      const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
+      input_row += stride;
+      __m256i lo = _mm256_unpacklo_epi16(src, one);
+      __m256i hi = _mm256_unpackhi_epi16(src, one);
+      lo = _mm256_madd_epi16(lo, scale_rounding);
+      hi = _mm256_madd_epi16(hi, scale_rounding);
+      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
+      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
+      out[i] = _mm256_packs_epi32(lo, hi);
+    }
+  } else {
+    const __m256i rect_scale =
+        _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+    for (int i = 0; i < height; ++i) {
+      __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
+      src = _mm256_mulhrs_epi16(src, rect_scale);
+      input_row += stride;
+      __m256i lo = _mm256_unpacklo_epi16(src, one);
+      __m256i hi = _mm256_unpackhi_epi16(src, one);
+      lo = _mm256_madd_epi16(lo, scale_rounding);
+      hi = _mm256_madd_epi16(hi, scale_rounding);
+      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
+      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
+      out[i] = _mm256_packs_epi32(lo, hi);
+    }
+  }
+}
+
+static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
+                                           __m256i *buf, int shift, int height,
+                                           int txh_idx) {
+  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
+  const __m256i scale_rounding = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
+  const __m256i shift_rounding = _mm256_set1_epi32(1 << (-shift - 1));
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale_rounding);
+  for (int h = 0; h < height; ++h) {
+    __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
+    __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
+    lo = _mm256_madd_epi16(lo, scale_coeff);
+    hi = _mm256_madd_epi16(hi, scale_coeff);
+    lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
+    hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
+    lo = _mm256_add_epi32(lo, shift_rounding);
+    hi = _mm256_add_epi32(hi, shift_rounding);
+    lo = _mm256_srai_epi32(lo, -shift);
+    hi = _mm256_srai_epi32(hi, -shift);
+    const __m256i x = _mm256_packs_epi32(lo, hi);
+    write_recon_w16_avx2(x, output);
+    output += stride;
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
+                                                  uint8_t *output, int stride,
+                                                  TX_SIZE tx_size,
+                                                  int32_t eob) {
+  (void)eob;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int row_max = AOMMIN(32, txfm_size_row);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  __m256i buf[32];
+  for (int i = 0; i < input_stride; i += 16) {
+    iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max,
+                            txw_idx, rect_type);
+    iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max,
+                            txh_idx);
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
+  const int input_stride = txfm_size_col_notzero;
+  const int buf_size_w_div16 = (eobx + 16) >> 4;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_avx2 col_txfm =
+      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  for (int i = 0; i < buf_size_w_div16; i++) {
+    __m256i buf0[64];
+    iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0],
+                            eoby + 1, txw_idx, rect_type);
+    col_txfm(buf0, buf0, cos_bit_col);
+    __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
+    int k = ud_flip ? (txfm_size_row - 1) : 0;
+    const int step = ud_flip ? -1 : 1;
+    for (int j = 0; j < txfm_size_row; ++j, k += step) {
+      __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
+      write_recon_w16_avx2(res, output + (i << 4) + j * stride);
+    }
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  __m256i buf1[64];
+  int eobx, eoby;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div16 = txfm_size_col >> 4;
+  const int buf_size_h_div16 = (eoby + 16) >> 4;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const transform_1d_avx2 row_txfm =
+      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+
+  assert(row_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  for (int i = 0; i < buf_size_h_div16; i++) {
+    __m256i buf0[64];
+    const int32_t *input_row = input + i * input_stride * 16;
+    for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) {
+      __m256i *buf0_cur = buf0 + j * 16;
+      load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride,
+                                          buf0_cur, 16);
+      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
+    }
+    row_txfm(buf0, buf0, cos_bit_row);
+    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
+    __m256i *_buf1 = buf1;
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div16; ++j) {
+        __m256i temp[16];
+        flip_buf_av2(buf0 + 16 * j, temp, 16);
+        transpose_16bit_16x16_avx2(temp,
+                                   _buf1 + 16 * (buf_size_w_div16 - 1 - j));
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div16; ++j) {
+        transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
+      }
+    }
+    for (int j = 0; j < buf_size_w_div16; ++j) {
+      iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
+                              buf1 + j * 16, shift[1], 16, txh_idx);
+    }
+  }
+}
+
+// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64
+static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  (void)eob;
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:   // ADST in vertical, DCT in horizontal
+    case DCT_ADST:   // DCT  in vertical, ADST in horizontal
+    case ADST_ADST:  // ADST in both directions
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
+                                            tx_size, eob);
+      break;
+    case IDTX:
+      lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
+      break;
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
+                                           tx_size, eob);
+      break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
+                                           tx_size, eob);
+      break;
+    default:
+      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+  }
+}
+
+void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  switch (tx_size) {
+    case TX_4X4:
+    case TX_8X8:
+    case TX_4X8:
+    case TX_8X4:
+    case TX_8X16:
+    case TX_16X8:
+    case TX_4X16:
+    case TX_16X4:
+    case TX_8X32:
+    case TX_32X8:
+      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+    case TX_16X16:
+    case TX_32X32:
+    case TX_64X64:
+    case TX_16X32:
+    case TX_32X16:
+    case TX_32X64:
+    case TX_64X32:
+    case TX_16X64:
+    case TX_64X16:
+    default:
+      lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
+                                         tx_size, eob);
+      break;
+  }
+}
+
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                           const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  if (!txfm_param->lossless) {
+    av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
+                                  txfm_param->tx_size, txfm_param->eob);
+  } else {
+    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+  }
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
new file mode 100644
index 000000000..c17f655c5
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#define AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define pair_set_w16_epi16(a, b) \
+  _mm256_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
+
+#define btf_16_w16_avx2(w0, w1, in0, in1, out0, out1) \
+  {                                                   \
+    __m256i t0 = _mm256_unpacklo_epi16(in0, in1);     \
+    __m256i t1 = _mm256_unpackhi_epi16(in0, in1);     \
+    __m256i u0 = _mm256_madd_epi16(t0, w0);           \
+    __m256i u1 = _mm256_madd_epi16(t1, w0);           \
+    __m256i v0 = _mm256_madd_epi16(t0, w1);           \
+    __m256i v1 = _mm256_madd_epi16(t1, w1);           \
+                                                      \
+    __m256i a0 = _mm256_add_epi32(u0, __rounding);    \
+    __m256i a1 = _mm256_add_epi32(u1, __rounding);    \
+    __m256i b0 = _mm256_add_epi32(v0, __rounding);    \
+    __m256i b1 = _mm256_add_epi32(v1, __rounding);    \
+                                                      \
+    __m256i c0 = _mm256_srai_epi32(a0, cos_bit);      \
+    __m256i c1 = _mm256_srai_epi32(a1, cos_bit);      \
+    __m256i d0 = _mm256_srai_epi32(b0, cos_bit);      \
+    __m256i d1 = _mm256_srai_epi32(b1, cos_bit);      \
+                                                      \
+    out0 = _mm256_packs_epi32(c0, c1);                \
+    out1 = _mm256_packs_epi32(d0, d1);                \
+  }
+
+// half input is zero
+#define btf_16_w16_0_avx2(w0, w1, in, out0, out1)  \
+  {                                                \
+    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
+    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
+    const __m256i _in = in;                        \
+    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
+    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
+  }
+
+#define btf_16_adds_subs_avx2(in0, in1)  \
+  {                                      \
+    const __m256i _in0 = in0;            \
+    const __m256i _in1 = in1;            \
+    in0 = _mm256_adds_epi16(_in0, _in1); \
+    in1 = _mm256_subs_epi16(_in0, _in1); \
+  }
+
+#define btf_16_subs_adds_avx2(in0, in1)  \
+  {                                      \
+    const __m256i _in0 = in0;            \
+    const __m256i _in1 = in1;            \
+    in1 = _mm256_subs_epi16(_in0, _in1); \
+    in0 = _mm256_adds_epi16(_in0, _in1); \
+  }
+
+#define btf_16_adds_subs_out_avx2(out0, out1, in0, in1) \
+  {                                                     \
+    const __m256i _in0 = in0;                           \
+    const __m256i _in1 = in1;                           \
+    out0 = _mm256_adds_epi16(_in0, _in1);               \
+    out1 = _mm256_subs_epi16(_in0, _in1);               \
+  }
+
+static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
+  const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
+  const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
+  return _mm256_permute4x64_epi64(b, 0xD8);
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
+                                                       int stride, __m256i *out,
+                                                       int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
+  }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+                                              __m256i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  08 09 0a 0b  04 05 06 07  0c 0d 0e 0f
+  // in[1]: 10 11 12 13  18 19 1a 1b  14 15 16 17  1c 1d 1e 1f
+  // in[2]: 20 21 22 23  28 29 2a 2b  24 25 26 27  2c 2d 2e 2f
+  // in[3]: 30 31 32 33  38 39 3a 3b  34 35 36 37  3c 3d 3e 3f
+  // in[4]: 40 41 42 43  48 49 4a 4b  44 45 46 47  4c 4d 4e 4f
+  // in[5]: 50 51 52 53  58 59 5a 5b  54 55 56 57  5c 5d 5e 5f
+  // in[6]: 60 61 62 63  68 69 6a 6b  64 65 66 67  6c 6d 6e 6f
+  // in[7]: 70 71 72 73  78 79 7a 7b  74 75 76 77  7c 7d 7e 7f
+  // in[8]: 80 81 82 83  88 89 8a 8b  84 85 86 87  8c 8d 8e 8f
+  // to:
+  // a0:    00 10 01 11  02 12 03 13  04 14 05 15  06 16 07 17
+  // a1:    20 30 21 31  22 32 23 33  24 34 25 35  26 36 27 37
+  // a2:    40 50 41 51  42 52 43 53  44 54 45 55  46 56 47 57
+  // a3:    60 70 61 71  62 72 63 73  64 74 65 75  66 76 67 77
+  // ...
+  __m256i a[16];
+  for (int i = 0; i < 16; i += 2) {
+    a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
+    a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
+  }
+  __m256i b[16];
+  for (int i = 0; i < 16; i += 2) {
+    b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
+    b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
+  }
+  __m256i c[16];
+  for (int i = 0; i < 16; i += 2) {
+    c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
+    c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
+  }
+  out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
+  out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
+  out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
+  out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
+
+  out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
+  out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
+  out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
+  out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
+
+  out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
+  out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
+  out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
+  out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
+
+  out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
+  out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
+  out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
+  out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
+}
+
+static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
+  if (bit < 0) {
+    __m256i scale = _mm256_set1_epi16(1 << (bit + 15));
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm256_mulhrs_epi16(in[i], scale);
+    }
+  } else if (bit > 0) {
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm256_slli_epi16(in[i], bit);
+    }
+  }
+}
+
+static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
+                                    int size) {
+  const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
+  for (int i = 0; i < size; ++i) {
+    output[i] = _mm256_mulhrs_epi16(input[i], scale);
+  }
+}
+
+static INLINE void flip_buf_av2(__m256i *in, __m256i *out, int size) {
+  for (int i = 0; i < size; ++i) {
+    out[size - i - 1] = in[i];
+  }
+}
+
+static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) {
+  __m128i pred = _mm_loadu_si128((__m128i const *)(output));
+  __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res);
+  __m128i y = _mm256_castsi256_si128(
+      _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168));
+  _mm_storeu_si128((__m128i *)(output), y);
+}
+
+static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output,
+                                                int stride, int flipud,
+                                                int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  for (int i = 0; i < height; ++i, j += step) {
+    write_recon_w16_avx2(in[j], output + i * stride);
+  }
+}
+
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit);
+
+void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob);
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
new file mode 100644
index 000000000..dd7cee24c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -0,0 +1,2917 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+
+// TODO(binpengsmail@gmail.com): replace some for loop with do {} while
+
+static void idct4_new_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+  // stage 1
+  __m128i x[4];
+  x[0] = input[0];
+  x[1] = input[2];
+  x[2] = input[1];
+  x[3] = input[3];
+
+  // stage 2
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+  // stage 3
+  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+}
+
+void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+  // stage 1
+  __m128i x[4];
+  x[0] = input[0];
+  x[1] = input[2];
+  x[2] = input[1];
+  x[3] = input[3];
+
+  // stage 2
+  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+  // stage 3
+  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+}
+
+void idct8_low1_new_ssse3(const __m128i *input, __m128i *output,
+                          int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // stage 1
+  __m128i x[2];
+  x[0] = input[0];
+
+  // stage 2
+  // stage 3
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+  // stage 4
+  // stage 5
+  output[0] = x[0];
+  output[7] = x[0];
+  output[1] = x[1];
+  output[6] = x[1];
+  output[2] = x[1];
+  output[5] = x[1];
+  output[3] = x[0];
+  output[4] = x[0];
+}
+
+void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+  // stage 1
+  __m128i x[8];
+  x[0] = input[0];
+  x[1] = input[4];
+  x[2] = input[2];
+  x[3] = input[6];
+  x[4] = input[1];
+  x[5] = input[5];
+  x[6] = input[3];
+  x[7] = input[7];
+
+  // stage 2
+  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+  // stage 3
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+  btf_16_adds_subs_sse2(x[4], x[5]);
+  btf_16_subs_adds_sse2(x[7], x[6]);
+
+  // stage 4
+  btf_16_adds_subs_sse2(x[0], x[3]);
+  btf_16_adds_subs_sse2(x[1], x[2]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+  // stage 5
+  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+}
+
+void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+  // stage 1
+  __m128i x[8];
+  x[0] = input[0];
+  x[1] = input[4];
+  x[2] = input[2];
+  x[3] = input[6];
+  x[4] = input[1];
+  x[5] = input[5];
+  x[6] = input[3];
+  x[7] = input[7];
+
+  // stage 2
+  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+  // stage 3
+  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+  btf_16_adds_subs_sse2(x[4], x[5]);
+  btf_16_subs_adds_sse2(x[7], x[6]);
+
+  // stage 4
+  btf_16_adds_subs_sse2(x[0], x[3]);
+  btf_16_adds_subs_sse2(x[1], x[2]);
+  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+  // stage 5
+  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+}
+
+static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
+                                      const __m128i __rounding,
+                                      int8_t cos_bit) {
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_sse2(x[0], x[3]);
+  btf_16_adds_subs_sse2(x[1], x[2]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+  btf_16_adds_subs_sse2(x[8], x[11]);
+  btf_16_adds_subs_sse2(x[9], x[10]);
+  btf_16_subs_adds_sse2(x[15], x[12]);
+  btf_16_subs_adds_sse2(x[14], x[13]);
+}
+
+static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
+                                      const __m128i __rounding,
+                                      int8_t cos_bit) {
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_sse2(x[0], x[7]);
+  btf_16_adds_subs_sse2(x[1], x[6]);
+  btf_16_adds_subs_sse2(x[2], x[5]);
+  btf_16_adds_subs_sse2(x[3], x[4]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+}
+
+static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
+  btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
+  btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
+  btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
+  btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
+  btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
+  btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
+  btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
+  btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
+}
+
+static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
+                                  int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // stage 1
+  __m128i x[2];
+  x[0] = input[0];
+
+  // stage 2
+  // stage 3
+  // stage 4
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+  // stage 5
+  // stage 6
+  // stage 7
+  output[0] = x[0];
+  output[15] = x[0];
+  output[1] = x[1];
+  output[14] = x[1];
+  output[2] = x[1];
+  output[13] = x[1];
+  output[3] = x[0];
+  output[12] = x[0];
+  output[4] = x[0];
+  output[11] = x[0];
+  output[5] = x[1];
+  output[10] = x[1];
+  output[6] = x[1];
+  output[9] = x[1];
+  output[7] = x[0];
+  output[8] = x[0];
+}
+
+static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
+                                  int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+  // stage 1
+  __m128i x[16];
+  x[0] = input[0];
+  x[2] = input[4];
+  x[4] = input[2];
+  x[6] = input[6];
+  x[8] = input[1];
+  x[10] = input[5];
+  x[12] = input[3];
+  x[14] = input[7];
+
+  // stage 2
+  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+
+  // stage 3
+  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+  btf_16_adds_subs_sse2(x[8], x[9]);
+  btf_16_subs_adds_sse2(x[11], x[10]);
+  btf_16_adds_subs_sse2(x[12], x[13]);
+  btf_16_subs_adds_sse2(x[15], x[14]);
+
+  // stage 4
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+  btf_16_adds_subs_sse2(x[4], x[5]);
+  btf_16_subs_adds_sse2(x[7], x[6]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+  idct16_stage7_sse2(output, x);
+}
+
+void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+  // stage 1
+  __m128i x[16];
+  x[0] = input[0];
+  x[1] = input[8];
+  x[2] = input[4];
+  x[3] = input[12];
+  x[4] = input[2];
+  x[5] = input[10];
+  x[6] = input[6];
+  x[7] = input[14];
+  x[8] = input[1];
+  x[9] = input[9];
+  x[10] = input[5];
+  x[11] = input[13];
+  x[12] = input[3];
+  x[13] = input[11];
+  x[14] = input[7];
+  x[15] = input[15];
+
+  // stage 2
+  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+  // stage 3
+  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+  btf_16_adds_subs_sse2(x[8], x[9]);
+  btf_16_subs_adds_sse2(x[11], x[10]);
+  btf_16_adds_subs_sse2(x[12], x[13]);
+  btf_16_subs_adds_sse2(x[15], x[14]);
+
+  // stage 4
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+  btf_16_adds_subs_sse2(x[4], x[5]);
+  btf_16_subs_adds_sse2(x[7], x[6]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+  // stage 5~7
+  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+  idct16_stage7_sse2(output, x);
+}
+
+void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+  // stage 1
+  __m128i x[16];
+  x[0] = input[0];
+  x[1] = input[8];
+  x[2] = input[4];
+  x[3] = input[12];
+  x[4] = input[2];
+  x[5] = input[10];
+  x[6] = input[6];
+  x[7] = input[14];
+  x[8] = input[1];
+  x[9] = input[9];
+  x[10] = input[5];
+  x[11] = input[13];
+  x[12] = input[3];
+  x[13] = input[11];
+  x[14] = input[7];
+  x[15] = input[15];
+
+  // stage 2
+  btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+  btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+  btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+  btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+  // stage 3
+  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+  btf_16_adds_subs_sse2(x[8], x[9]);
+  btf_16_subs_adds_sse2(x[11], x[10]);
+  btf_16_adds_subs_sse2(x[12], x[13]);
+  btf_16_subs_adds_sse2(x[15], x[14]);
+
+  // stage 4
+  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+  btf_16_adds_subs_sse2(x[4], x[5]);
+  btf_16_subs_adds_sse2(x[7], x[6]);
+  btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+  // stage 5
+  btf_16_adds_subs_sse2(x[0], x[3]);
+  btf_16_adds_subs_sse2(x[1], x[2]);
+  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+  btf_16_adds_subs_sse2(x[8], x[11]);
+  btf_16_adds_subs_sse2(x[9], x[10]);
+  btf_16_subs_adds_sse2(x[15], x[12]);
+  btf_16_subs_adds_sse2(x[14], x[13]);
+
+  // stage 6
+  btf_16_adds_subs_sse2(x[0], x[7]);
+  btf_16_adds_subs_sse2(x[1], x[6]);
+  btf_16_adds_subs_sse2(x[2], x[5]);
+  btf_16_adds_subs_sse2(x[3], x[4]);
+  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+
+  // stage 7
+  idct16_stage7_sse2(output, x);
+}
+
+static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
+  btf_16_adds_subs_sse2(x[16], x[17]);
+  btf_16_subs_adds_sse2(x[19], x[18]);
+  btf_16_adds_subs_sse2(x[20], x[21]);
+  btf_16_subs_adds_sse2(x[23], x[22]);
+  btf_16_adds_subs_sse2(x[24], x[25]);
+  btf_16_subs_adds_sse2(x[27], x[26]);
+  btf_16_adds_subs_sse2(x[28], x[29]);
+  btf_16_subs_adds_sse2(x[31], x[30]);
+}
+
+static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+}
+
+static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+  btf_16_adds_subs_sse2(x[16], x[19]);
+  btf_16_adds_subs_sse2(x[17], x[18]);
+  btf_16_subs_adds_sse2(x[23], x[20]);
+  btf_16_subs_adds_sse2(x[22], x[21]);
+  btf_16_adds_subs_sse2(x[24], x[27]);
+  btf_16_adds_subs_sse2(x[25], x[26]);
+  btf_16_subs_adds_sse2(x[31], x[28]);
+  btf_16_subs_adds_sse2(x[30], x[29]);
+}
+
+static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+  btf_16_adds_subs_sse2(x[8], x[11]);
+  btf_16_adds_subs_sse2(x[9], x[10]);
+  btf_16_subs_adds_sse2(x[15], x[12]);
+  btf_16_subs_adds_sse2(x[14], x[13]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+}
+
+static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
+                                      const __m128i __rounding,
+                                      int8_t cos_bit) {
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_sse2(x[0], x[7]);
+  btf_16_adds_subs_sse2(x[1], x[6]);
+  btf_16_adds_subs_sse2(x[2], x[5]);
+  btf_16_adds_subs_sse2(x[3], x[4]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+  btf_16_adds_subs_sse2(x[16], x[23]);
+  btf_16_adds_subs_sse2(x[17], x[22]);
+  btf_16_adds_subs_sse2(x[18], x[21]);
+  btf_16_adds_subs_sse2(x[19], x[20]);
+  btf_16_subs_adds_sse2(x[31], x[24]);
+  btf_16_subs_adds_sse2(x[30], x[25]);
+  btf_16_subs_adds_sse2(x[29], x[26]);
+  btf_16_subs_adds_sse2(x[28], x[27]);
+}
+
+static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
+                                      const __m128i __rounding,
+                                      int8_t cos_bit) {
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_sse2(x[0], x[15]);
+  btf_16_adds_subs_sse2(x[1], x[14]);
+  btf_16_adds_subs_sse2(x[2], x[13]);
+  btf_16_adds_subs_sse2(x[3], x[12]);
+  btf_16_adds_subs_sse2(x[4], x[11]);
+  btf_16_adds_subs_sse2(x[5], x[10]);
+  btf_16_adds_subs_sse2(x[6], x[9]);
+  btf_16_adds_subs_sse2(x[7], x[8]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+}
+
+static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
+  btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
+  btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
+  btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
+  btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
+  btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
+  btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
+  btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
+  btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
+  btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
+  btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
+  btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
+  btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
+  btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
+  btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
+  btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
+  btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
+}
+
+static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
+                                  int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // stage 1
+  __m128i x[2];
+  x[0] = input[0];
+
+  // stage 2
+  // stage 3
+  // stage 4
+  // stage 5
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+  // stage 6
+  // stage 7
+  // stage 8
+  // stage 9
+  output[0] = x[0];
+  output[31] = x[0];
+  output[1] = x[1];
+  output[30] = x[1];
+  output[2] = x[1];
+  output[29] = x[1];
+  output[3] = x[0];
+  output[28] = x[0];
+  output[4] = x[0];
+  output[27] = x[0];
+  output[5] = x[1];
+  output[26] = x[1];
+  output[6] = x[1];
+  output[25] = x[1];
+  output[7] = x[0];
+  output[24] = x[0];
+  output[8] = x[0];
+  output[23] = x[0];
+  output[9] = x[1];
+  output[22] = x[1];
+  output[10] = x[1];
+  output[21] = x[1];
+  output[11] = x[0];
+  output[20] = x[0];
+  output[12] = x[0];
+  output[19] = x[0];
+  output[13] = x[1];
+  output[18] = x[1];
+  output[14] = x[1];
+  output[17] = x[1];
+  output[15] = x[0];
+  output[16] = x[0];
+}
+
+static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
+                                  int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 1
+  __m128i x[32];
+  x[0] = input[0];
+  x[4] = input[4];
+  x[8] = input[2];
+  x[12] = input[6];
+  x[16] = input[1];
+  x[20] = input[5];
+  x[24] = input[3];
+  x[28] = input[7];
+
+  // stage 2
+  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+  // stage 3
+  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+  x[17] = x[16];
+  x[18] = x[19];
+  x[21] = x[20];
+  x[22] = x[23];
+  x[25] = x[24];
+  x[26] = x[27];
+  x[29] = x[28];
+  x[30] = x[31];
+
+  // stage 4
+  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+  x[9] = x[8];
+  x[10] = x[11];
+  x[13] = x[12];
+  x[14] = x[15];
+  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 5
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+  x[5] = x[4];
+  x[6] = x[7];
+  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+  // stage 6
+  x[3] = x[0];
+  x[2] = x[1];
+  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+  idct32_stage9_sse2(output, x);
+}
+
+static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
+                                   int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 1
+  __m128i x[32];
+  x[0] = input[0];
+  x[2] = input[8];
+  x[4] = input[4];
+  x[6] = input[12];
+  x[8] = input[2];
+  x[10] = input[10];
+  x[12] = input[6];
+  x[14] = input[14];
+  x[16] = input[1];
+  x[18] = input[9];
+  x[20] = input[5];
+  x[22] = input[13];
+  x[24] = input[3];
+  x[26] = input[11];
+  x[28] = input[7];
+  x[30] = input[15];
+
+  // stage 2
+  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+  // stage 3
+  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+  idct32_high16_stage3_sse2(x);
+
+  // stage 4
+  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+  btf_16_adds_subs_sse2(x[8], x[9]);
+  btf_16_subs_adds_sse2(x[11], x[10]);
+  btf_16_adds_subs_sse2(x[12], x[13]);
+  btf_16_subs_adds_sse2(x[15], x[14]);
+  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 5
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+  btf_16_adds_subs_sse2(x[4], x[5]);
+  btf_16_subs_adds_sse2(x[7], x[6]);
+  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+  btf_16_adds_subs_sse2(x[0], x[3]);
+  btf_16_adds_subs_sse2(x[1], x[2]);
+  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+  idct32_stage9_sse2(output, x);
+}
+
+static void idct32_new_sse2(const __m128i *input, __m128i *output,
+                            int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+  // stage 1
+  __m128i x[32];
+  x[0] = input[0];
+  x[1] = input[16];
+  x[2] = input[8];
+  x[3] = input[24];
+  x[4] = input[4];
+  x[5] = input[20];
+  x[6] = input[12];
+  x[7] = input[28];
+  x[8] = input[2];
+  x[9] = input[18];
+  x[10] = input[10];
+  x[11] = input[26];
+  x[12] = input[6];
+  x[13] = input[22];
+  x[14] = input[14];
+  x[15] = input[30];
+  x[16] = input[1];
+  x[17] = input[17];
+  x[18] = input[9];
+  x[19] = input[25];
+  x[20] = input[5];
+  x[21] = input[21];
+  x[22] = input[13];
+  x[23] = input[29];
+  x[24] = input[3];
+  x[25] = input[19];
+  x[26] = input[11];
+  x[27] = input[27];
+  x[28] = input[7];
+  x[29] = input[23];
+  x[30] = input[15];
+  x[31] = input[31];
+
+  // stage 2
+  btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
+  btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
+  btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
+  btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
+  btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
+  btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
+  btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
+  btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
+
+  // stage 3
+  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+  idct32_high16_stage3_sse2(x);
+
+  // stage 4
+  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+  btf_16_adds_subs_sse2(x[8], x[9]);
+  btf_16_subs_adds_sse2(x[11], x[10]);
+  btf_16_adds_subs_sse2(x[12], x[13]);
+  btf_16_subs_adds_sse2(x[15], x[14]);
+  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 5
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+  btf_16_adds_subs_sse2(x[4], x[5]);
+  btf_16_adds_subs_sse2(x[7], x[6]);
+  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 6
+  btf_16_adds_subs_sse2(x[0], x[3]);
+  btf_16_adds_subs_sse2(x[1], x[2]);
+  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 7~8
+  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+  idct32_stage9_sse2(output, x);
+}
+
+static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+  const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+  const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+  const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+  const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+  btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
+  btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
+  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+  btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
+  btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
+  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+}
+
+static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+  btf_16_adds_subs_sse2(x[32], x[35]);
+  btf_16_adds_subs_sse2(x[33], x[34]);
+  btf_16_subs_adds_sse2(x[39], x[36]);
+  btf_16_subs_adds_sse2(x[38], x[37]);
+  btf_16_adds_subs_sse2(x[40], x[43]);
+  btf_16_adds_subs_sse2(x[41], x[42]);
+  btf_16_subs_adds_sse2(x[47], x[44]);
+  btf_16_subs_adds_sse2(x[46], x[45]);
+  btf_16_adds_subs_sse2(x[48], x[51]);
+  btf_16_adds_subs_sse2(x[49], x[50]);
+  btf_16_subs_adds_sse2(x[55], x[52]);
+  btf_16_subs_adds_sse2(x[54], x[53]);
+  btf_16_adds_subs_sse2(x[56], x[59]);
+  btf_16_adds_subs_sse2(x[57], x[58]);
+  btf_16_subs_adds_sse2(x[63], x[60]);
+  btf_16_subs_adds_sse2(x[62], x[61]);
+}
+
+static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
+  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
+  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
+  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
+  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
+  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
+  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
+  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
+}
+
+static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  btf_16_adds_subs_sse2(x[16], x[19]);
+  btf_16_adds_subs_sse2(x[17], x[18]);
+  btf_16_subs_adds_sse2(x[23], x[20]);
+  btf_16_subs_adds_sse2(x[22], x[21]);
+  btf_16_adds_subs_sse2(x[24], x[27]);
+  btf_16_adds_subs_sse2(x[25], x[26]);
+  btf_16_subs_adds_sse2(x[31], x[28]);
+  btf_16_subs_adds_sse2(x[30], x[29]);
+  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+}
+
+static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+  btf_16_adds_subs_sse2(x[32], x[39]);
+  btf_16_adds_subs_sse2(x[33], x[38]);
+  btf_16_adds_subs_sse2(x[34], x[37]);
+  btf_16_adds_subs_sse2(x[35], x[36]);
+  btf_16_subs_adds_sse2(x[47], x[40]);
+  btf_16_subs_adds_sse2(x[46], x[41]);
+  btf_16_subs_adds_sse2(x[45], x[42]);
+  btf_16_subs_adds_sse2(x[44], x[43]);
+  btf_16_adds_subs_sse2(x[48], x[55]);
+  btf_16_adds_subs_sse2(x[49], x[54]);
+  btf_16_adds_subs_sse2(x[50], x[53]);
+  btf_16_adds_subs_sse2(x[51], x[52]);
+  btf_16_subs_adds_sse2(x[63], x[56]);
+  btf_16_subs_adds_sse2(x[62], x[57]);
+  btf_16_subs_adds_sse2(x[61], x[58]);
+  btf_16_subs_adds_sse2(x[60], x[59]);
+}
+
+static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  btf_16_adds_subs_sse2(x[16], x[23]);
+  btf_16_adds_subs_sse2(x[17], x[22]);
+  btf_16_adds_subs_sse2(x[18], x[21]);
+  btf_16_adds_subs_sse2(x[19], x[20]);
+  btf_16_subs_adds_sse2(x[31], x[24]);
+  btf_16_subs_adds_sse2(x[30], x[25]);
+  btf_16_subs_adds_sse2(x[29], x[26]);
+  btf_16_subs_adds_sse2(x[28], x[27]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
+}
+
+static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
+                                      const __m128i __rounding,
+                                      int8_t cos_bit) {
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_sse2(x[0], x[15]);
+  btf_16_adds_subs_sse2(x[1], x[14]);
+  btf_16_adds_subs_sse2(x[2], x[13]);
+  btf_16_adds_subs_sse2(x[3], x[12]);
+  btf_16_adds_subs_sse2(x[4], x[11]);
+  btf_16_adds_subs_sse2(x[5], x[10]);
+  btf_16_adds_subs_sse2(x[6], x[9]);
+  btf_16_adds_subs_sse2(x[7], x[8]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+  btf_16_adds_subs_sse2(x[32], x[47]);
+  btf_16_adds_subs_sse2(x[33], x[46]);
+  btf_16_adds_subs_sse2(x[34], x[45]);
+  btf_16_adds_subs_sse2(x[35], x[44]);
+  btf_16_adds_subs_sse2(x[36], x[43]);
+  btf_16_adds_subs_sse2(x[37], x[42]);
+  btf_16_adds_subs_sse2(x[38], x[41]);
+  btf_16_adds_subs_sse2(x[39], x[40]);
+  btf_16_subs_adds_sse2(x[63], x[48]);
+  btf_16_subs_adds_sse2(x[62], x[49]);
+  btf_16_subs_adds_sse2(x[61], x[50]);
+  btf_16_subs_adds_sse2(x[60], x[51]);
+  btf_16_subs_adds_sse2(x[59], x[52]);
+  btf_16_subs_adds_sse2(x[58], x[53]);
+  btf_16_subs_adds_sse2(x[57], x[54]);
+  btf_16_subs_adds_sse2(x[56], x[55]);
+}
+
+static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
+                                       const __m128i __rounding,
+                                       int8_t cos_bit) {
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_sse2(x[0], x[31]);
+  btf_16_adds_subs_sse2(x[1], x[30]);
+  btf_16_adds_subs_sse2(x[2], x[29]);
+  btf_16_adds_subs_sse2(x[3], x[28]);
+  btf_16_adds_subs_sse2(x[4], x[27]);
+  btf_16_adds_subs_sse2(x[5], x[26]);
+  btf_16_adds_subs_sse2(x[6], x[25]);
+  btf_16_adds_subs_sse2(x[7], x[24]);
+  btf_16_adds_subs_sse2(x[8], x[23]);
+  btf_16_adds_subs_sse2(x[9], x[22]);
+  btf_16_adds_subs_sse2(x[10], x[21]);
+  btf_16_adds_subs_sse2(x[11], x[20]);
+  btf_16_adds_subs_sse2(x[12], x[19]);
+  btf_16_adds_subs_sse2(x[13], x[18]);
+  btf_16_adds_subs_sse2(x[14], x[17]);
+  btf_16_adds_subs_sse2(x[15], x[16]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
+}
+
+static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
+  btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
+  btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
+  btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
+  btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
+  btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
+  btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
+  btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
+  btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
+  btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
+  btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
+  btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
+  btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
+  btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
+  btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
+  btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
+  btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
+  btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
+  btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
+  btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
+  btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
+  btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
+  btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
+  btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
+  btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
+  btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
+  btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
+  btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
+  btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
+  btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
+  btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
+  btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
+  btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
+}
+
+static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
+                                  int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // stage 1
+  __m128i x[32];
+  x[0] = input[0];
+
+  // stage 2
+  // stage 3
+  // stage 4
+  // stage 5
+  // stage 6
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+  // stage 7
+  // stage 8
+  // stage 9
+  // stage 10
+  // stage 11
+  output[0] = x[0];
+  output[63] = x[0];
+  output[1] = x[1];
+  output[62] = x[1];
+  output[2] = x[1];
+  output[61] = x[1];
+  output[3] = x[0];
+  output[60] = x[0];
+  output[4] = x[0];
+  output[59] = x[0];
+  output[5] = x[1];
+  output[58] = x[1];
+  output[6] = x[1];
+  output[57] = x[1];
+  output[7] = x[0];
+  output[56] = x[0];
+  output[8] = x[0];
+  output[55] = x[0];
+  output[9] = x[1];
+  output[54] = x[1];
+  output[10] = x[1];
+  output[53] = x[1];
+  output[11] = x[0];
+  output[52] = x[0];
+  output[12] = x[0];
+  output[51] = x[0];
+  output[13] = x[1];
+  output[50] = x[1];
+  output[14] = x[1];
+  output[49] = x[1];
+  output[15] = x[0];
+  output[48] = x[0];
+  output[16] = x[0];
+  output[47] = x[0];
+  output[17] = x[1];
+  output[46] = x[1];
+  output[18] = x[1];
+  output[45] = x[1];
+  output[19] = x[0];
+  output[44] = x[0];
+  output[20] = x[0];
+  output[43] = x[0];
+  output[21] = x[1];
+  output[42] = x[1];
+  output[22] = x[1];
+  output[41] = x[1];
+  output[23] = x[0];
+  output[40] = x[0];
+  output[24] = x[0];
+  output[39] = x[0];
+  output[25] = x[1];
+  output[38] = x[1];
+  output[26] = x[1];
+  output[37] = x[1];
+  output[27] = x[0];
+  output[36] = x[0];
+  output[28] = x[0];
+  output[35] = x[0];
+  output[29] = x[1];
+  output[34] = x[1];
+  output[30] = x[1];
+  output[33] = x[1];
+  output[31] = x[0];
+  output[32] = x[0];
+}
+
+static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
+                                  int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+  // stage 1
+  __m128i x[64];
+  x[0] = input[0];
+  x[8] = input[4];
+  x[16] = input[2];
+  x[24] = input[6];
+  x[32] = input[1];
+  x[40] = input[5];
+  x[48] = input[3];
+  x[56] = input[7];
+
+  // stage 2
+  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+  // stage 3
+  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+  x[33] = x[32];
+  x[38] = x[39];
+  x[41] = x[40];
+  x[46] = x[47];
+  x[49] = x[48];
+  x[54] = x[55];
+  x[57] = x[56];
+  x[62] = x[63];
+
+  // stage 4
+  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+  x[17] = x[16];
+  x[22] = x[23];
+  x[25] = x[24];
+  x[30] = x[31];
+  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+
+  // stage 5
+  x[9] = x[8];
+  x[14] = x[15];
+  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+  x[35] = x[32];
+  x[34] = x[33];
+  x[36] = x[39];
+  x[37] = x[38];
+  x[43] = x[40];
+  x[42] = x[41];
+  x[44] = x[47];
+  x[45] = x[46];
+  x[51] = x[48];
+  x[50] = x[49];
+  x[52] = x[55];
+  x[53] = x[54];
+  x[59] = x[56];
+  x[58] = x[57];
+  x[60] = x[63];
+  x[61] = x[62];
+
+  // stage 6
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  x[19] = x[16];
+  x[18] = x[17];
+  x[20] = x[23];
+  x[21] = x[22];
+  x[27] = x[24];
+  x[26] = x[25];
+  x[28] = x[31];
+  x[29] = x[30];
+  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 7
+  x[3] = x[0];
+  x[2] = x[1];
+  x[11] = x[8];
+  x[10] = x[9];
+  x[12] = x[15];
+  x[13] = x[14];
+  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 8
+  x[7] = x[0];
+  x[6] = x[1];
+  x[5] = x[2];
+  x[4] = x[3];
+  x[9] = x[9];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+  idct64_stage11_sse2(output, x);
+}
+
+static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output,
+                                   int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+  // stage 1
+  __m128i x[64];
+  x[0] = input[0];
+  x[4] = input[8];
+  x[8] = input[4];
+  x[12] = input[12];
+  x[16] = input[2];
+  x[20] = input[10];
+  x[24] = input[6];
+  x[28] = input[14];
+  x[32] = input[1];
+  x[36] = input[9];
+  x[40] = input[5];
+  x[44] = input[13];
+  x[48] = input[3];
+  x[52] = input[11];
+  x[56] = input[7];
+  x[60] = input[15];
+
+  // stage 2
+  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+  btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+  // stage 3
+  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+  x[33] = x[32];
+  x[34] = x[35];
+  x[37] = x[36];
+  x[38] = x[39];
+  x[41] = x[40];
+  x[42] = x[43];
+  x[45] = x[44];
+  x[46] = x[47];
+  x[49] = x[48];
+  x[50] = x[51];
+  x[53] = x[52];
+  x[54] = x[55];
+  x[57] = x[56];
+  x[58] = x[59];
+  x[61] = x[60];
+  x[62] = x[63];
+
+  // stage 4
+  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+  x[17] = x[16];
+  x[18] = x[19];
+  x[21] = x[20];
+  x[22] = x[23];
+  x[25] = x[24];
+  x[26] = x[27];
+  x[29] = x[28];
+  x[30] = x[31];
+  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 5
+  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+  x[9] = x[8];
+  x[10] = x[11];
+  x[13] = x[12];
+  x[14] = x[15];
+  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 6
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+  x[5] = x[4];
+  x[6] = x[7];
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 7
+  x[3] = x[0];
+  x[2] = x[1];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+  btf_16_adds_subs_sse2(x[8], x[11]);
+  btf_16_adds_subs_sse2(x[9], x[10]);
+  btf_16_subs_adds_sse2(x[15], x[12]);
+  btf_16_subs_adds_sse2(x[14], x[13]);
+  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 8
+  btf_16_adds_subs_sse2(x[0], x[7]);
+  btf_16_adds_subs_sse2(x[1], x[6]);
+  btf_16_adds_subs_sse2(x[2], x[5]);
+  btf_16_adds_subs_sse2(x[3], x[4]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+  idct64_stage11_sse2(output, x);
+}
+
+static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output,
+                                   int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+  // stage 1
+  __m128i x[64];
+  x[0] = input[0];
+  x[2] = input[16];
+  x[4] = input[8];
+  x[6] = input[24];
+  x[8] = input[4];
+  x[10] = input[20];
+  x[12] = input[12];
+  x[14] = input[28];
+  x[16] = input[2];
+  x[18] = input[18];
+  x[20] = input[10];
+  x[22] = input[26];
+  x[24] = input[6];
+  x[26] = input[22];
+  x[28] = input[14];
+  x[30] = input[30];
+  x[32] = input[1];
+  x[34] = input[17];
+  x[36] = input[9];
+  x[38] = input[25];
+  x[40] = input[5];
+  x[42] = input[21];
+  x[44] = input[13];
+  x[46] = input[29];
+  x[48] = input[3];
+  x[50] = input[19];
+  x[52] = input[11];
+  x[54] = input[27];
+  x[56] = input[7];
+  x[58] = input[23];
+  x[60] = input[15];
+  x[62] = input[31];
+
+  // stage 2
+  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+  btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
+  btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
+  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+  btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
+  btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
+  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+  btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
+  btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
+  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+  btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+  btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
+  btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
+  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+  // stage 3
+  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+  btf_16_adds_subs_sse2(x[32], x[33]);
+  btf_16_subs_adds_sse2(x[35], x[34]);
+  btf_16_adds_subs_sse2(x[36], x[37]);
+  btf_16_subs_adds_sse2(x[39], x[38]);
+  btf_16_adds_subs_sse2(x[40], x[41]);
+  btf_16_subs_adds_sse2(x[43], x[42]);
+  btf_16_adds_subs_sse2(x[44], x[45]);
+  btf_16_subs_adds_sse2(x[47], x[46]);
+  btf_16_adds_subs_sse2(x[48], x[49]);
+  btf_16_subs_adds_sse2(x[51], x[50]);
+  btf_16_adds_subs_sse2(x[52], x[53]);
+  btf_16_subs_adds_sse2(x[55], x[54]);
+  btf_16_adds_subs_sse2(x[56], x[57]);
+  btf_16_subs_adds_sse2(x[59], x[58]);
+  btf_16_adds_subs_sse2(x[60], x[61]);
+  btf_16_subs_adds_sse2(x[63], x[62]);
+
+  // stage 4
+  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+  btf_16_adds_subs_sse2(x[16], x[17]);
+  btf_16_subs_adds_sse2(x[19], x[18]);
+  btf_16_adds_subs_sse2(x[20], x[21]);
+  btf_16_subs_adds_sse2(x[23], x[22]);
+  btf_16_adds_subs_sse2(x[24], x[25]);
+  btf_16_subs_adds_sse2(x[27], x[26]);
+  btf_16_adds_subs_sse2(x[28], x[29]);
+  btf_16_subs_adds_sse2(x[31], x[30]);
+  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 5
+  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+  btf_16_adds_subs_sse2(x[8], x[9]);
+  btf_16_subs_adds_sse2(x[11], x[10]);
+  btf_16_adds_subs_sse2(x[12], x[13]);
+  btf_16_subs_adds_sse2(x[15], x[14]);
+  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 6
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+  btf_16_adds_subs_sse2(x[4], x[5]);
+  btf_16_subs_adds_sse2(x[7], x[6]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 7
+  btf_16_adds_subs_sse2(x[0], x[3]);
+  btf_16_adds_subs_sse2(x[1], x[2]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+  btf_16_adds_subs_sse2(x[8], x[11]);
+  btf_16_adds_subs_sse2(x[9], x[10]);
+  btf_16_subs_adds_sse2(x[15], x[12]);
+  btf_16_subs_adds_sse2(x[14], x[13]);
+  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 8
+  btf_16_adds_subs_sse2(x[0], x[7]);
+  btf_16_adds_subs_sse2(x[1], x[6]);
+  btf_16_adds_subs_sse2(x[2], x[5]);
+  btf_16_adds_subs_sse2(x[3], x[4]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+  // stage 9~11
+  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+  idct64_stage11_sse2(output, x);
+}
+
+void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
+  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
+  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
+  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
+  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
+  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
+  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
+  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
+  __m128i x0[4];
+  x0[0] = input[0];
+  x0[1] = input[1];
+  x0[2] = input[2];
+  x0[3] = input[3];
+
+  __m128i u[4];
+  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
+  u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
+  u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
+  u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
+
+  __m128i x1[16];
+  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
+  x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
+  x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
+  x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
+  x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
+  x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
+  x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
+  x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
+  x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
+  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
+  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x2*sin3
+  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
+  x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
+  x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
+  x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
+  x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);
+
+  __m128i x2[8];
+  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
+  x2[1] = _mm_add_epi32(x1[1], x1[5]);
+  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
+  x2[3] = _mm_add_epi32(x1[3], x1[7]);
+  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 -x2*sin3 +x3*sin3
+  x2[5] = _mm_add_epi32(x1[9], x1[11]);
+  x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1
+  x2[7] = _mm_add_epi32(x1[13], x1[15]);
+
+  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+  for (int i = 0; i < 4; ++i) {
+    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
+    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
+    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
+    out1 = _mm_srai_epi32(out1, INV_COS_BIT);
+    output[i] = _mm_packs_epi32(out0, out1);
+  }
+}
+
+// TODO(binpengsmail@gmail.com):
+// To explore the reuse of VP9 versions of corresponding SSE2 functions and
+// evaluate whether there is a possibility for further speedup.
+void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
+  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
+  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
+  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
+  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
+  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
+  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
+  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
+  __m128i x0[4];
+  x0[0] = input[0];
+  x0[1] = input[1];
+  x0[2] = input[2];
+  x0[3] = input[3];
+
+  __m128i u[2];
+  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
+  u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);
+
+  __m128i x1[8];
+  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
+  x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
+  x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02);  // x1*sin3 + x3*sin2
+  x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04);  // x1*sin3 - x3*sin4
+  x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
+  x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03);    // x2*sin3
+  x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
+  x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
+
+  __m128i x2[4];
+  x2[0] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
+  x2[1] = _mm_add_epi32(x1[1], x1[3]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
+  x2[2] = _mm_add_epi32(x1[4], x1[5]);  // x0*sin3 - x2*sin3 + x3*sin3
+  x2[3] = _mm_add_epi32(x1[6], x1[7]);  // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
+
+  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+  for (int i = 0; i < 4; ++i) {
+    __m128i out0 = _mm_add_epi32(x2[i], rounding);
+    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
+    output[i] = _mm_packs_epi32(out0, out0);
+  }
+}
+
+static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output,
+                                  int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __zero = _mm_setzero_si128();
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+  // stage 1
+  __m128i x[8];
+  x[1] = input[0];
+
+  // stage 2
+  btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);
+
+  // stage 3
+  x[4] = x[0];
+  x[5] = x[1];
+
+  // stage 4
+  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+
+  // stage 5
+  x[2] = x[0];
+  x[3] = x[1];
+  x[6] = x[4];
+  x[7] = x[5];
+
+  // stage 6
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+  // stage 7
+  output[0] = x[0];
+  output[1] = _mm_subs_epi16(__zero, x[4]);
+  output[2] = x[6];
+  output[3] = _mm_subs_epi16(__zero, x[2]);
+  output[4] = x[3];
+  output[5] = _mm_subs_epi16(__zero, x[7]);
+  output[6] = x[5];
+  output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __zero = _mm_setzero_si128();
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+  // stage 1
+  __m128i x[8];
+  x[0] = input[7];
+  x[1] = input[0];
+  x[2] = input[5];
+  x[3] = input[2];
+  x[4] = input[3];
+  x[5] = input[4];
+  x[6] = input[1];
+  x[7] = input[6];
+
+  // stage 2
+  btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+  btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+  btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+  btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+  // stage 3
+  btf_16_adds_subs_sse2(x[0], x[4]);
+  btf_16_adds_subs_sse2(x[1], x[5]);
+  btf_16_adds_subs_sse2(x[2], x[6]);
+  btf_16_adds_subs_sse2(x[3], x[7]);
+
+  // stage 4
+  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+  // stage 5
+  btf_16_adds_subs_sse2(x[0], x[2]);
+  btf_16_adds_subs_sse2(x[1], x[3]);
+  btf_16_adds_subs_sse2(x[4], x[6]);
+  btf_16_adds_subs_sse2(x[5], x[7]);
+
+  // stage 6
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+  // stage 7
+  output[0] = x[0];
+  output[1] = _mm_subs_epi16(__zero, x[4]);
+  output[2] = x[6];
+  output[3] = _mm_subs_epi16(__zero, x[2]);
+  output[4] = x[3];
+  output[5] = _mm_subs_epi16(__zero, x[7]);
+  output[6] = x[5];
+  output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __zero = _mm_setzero_si128();
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+  // stage 1
+  __m128i x[8];
+  x[0] = input[7];
+  x[1] = input[0];
+  x[2] = input[5];
+  x[3] = input[2];
+  x[4] = input[3];
+  x[5] = input[4];
+  x[6] = input[1];
+  x[7] = input[6];
+
+  // stage 2
+  btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+  btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+  btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+  btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+  // stage 3
+  btf_16_adds_subs_sse2(x[0], x[4]);
+  btf_16_adds_subs_sse2(x[1], x[5]);
+  btf_16_adds_subs_sse2(x[2], x[6]);
+  btf_16_adds_subs_sse2(x[3], x[7]);
+
+  // stage 4
+  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+  // stage 5
+  btf_16_adds_subs_sse2(x[0], x[2]);
+  btf_16_adds_subs_sse2(x[1], x[3]);
+  btf_16_adds_subs_sse2(x[4], x[6]);
+  btf_16_adds_subs_sse2(x[5], x[7]);
+
+  // stage 6
+  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+  // stage 7
+  output[0] = x[0];
+  output[1] = _mm_subs_epi16(__zero, x[4]);
+  output[2] = x[6];
+  output[3] = _mm_subs_epi16(__zero, x[2]);
+  output[4] = x[3];
+  output[5] = _mm_subs_epi16(__zero, x[7]);
+  output[6] = x[5];
+  output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+static INLINE void iadst16_stage3_ssse3(__m128i *x) {
+  btf_16_adds_subs_sse2(x[0], x[8]);
+  btf_16_adds_subs_sse2(x[1], x[9]);
+  btf_16_adds_subs_sse2(x[2], x[10]);
+  btf_16_adds_subs_sse2(x[3], x[11]);
+  btf_16_adds_subs_sse2(x[4], x[12]);
+  btf_16_adds_subs_sse2(x[5], x[13]);
+  btf_16_adds_subs_sse2(x[6], x[14]);
+  btf_16_adds_subs_sse2(x[7], x[15]);
+}
+
+static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
+                                        const __m128i __rounding,
+                                        int8_t cos_bit) {
+  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+  btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+  btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+  btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage5_ssse3(__m128i *x) {
+  btf_16_adds_subs_sse2(x[0], x[4]);
+  btf_16_adds_subs_sse2(x[1], x[5]);
+  btf_16_adds_subs_sse2(x[2], x[6]);
+  btf_16_adds_subs_sse2(x[3], x[7]);
+  btf_16_adds_subs_sse2(x[8], x[12]);
+  btf_16_adds_subs_sse2(x[9], x[13]);
+  btf_16_adds_subs_sse2(x[10], x[14]);
+  btf_16_adds_subs_sse2(x[11], x[15]);
+}
+
+static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
+                                        const __m128i __rounding,
+                                        int8_t cos_bit) {
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage7_ssse3(__m128i *x) {
+  btf_16_adds_subs_sse2(x[0], x[2]);
+  btf_16_adds_subs_sse2(x[1], x[3]);
+  btf_16_adds_subs_sse2(x[4], x[6]);
+  btf_16_adds_subs_sse2(x[5], x[7]);
+  btf_16_adds_subs_sse2(x[8], x[10]);
+  btf_16_adds_subs_sse2(x[9], x[11]);
+  btf_16_adds_subs_sse2(x[12], x[14]);
+  btf_16_adds_subs_sse2(x[13], x[15]);
+}
+
+static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
+                                        const __m128i __rounding,
+                                        int8_t cos_bit) {
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
+  const __m128i __zero = _mm_setzero_si128();
+  output[0] = x[0];
+  output[1] = _mm_subs_epi16(__zero, x[8]);
+  output[2] = x[12];
+  output[3] = _mm_subs_epi16(__zero, x[4]);
+  output[4] = x[6];
+  output[5] = _mm_subs_epi16(__zero, x[14]);
+  output[6] = x[10];
+  output[7] = _mm_subs_epi16(__zero, x[2]);
+  output[8] = x[3];
+  output[9] = _mm_subs_epi16(__zero, x[11]);
+  output[10] = x[15];
+  output[11] = _mm_subs_epi16(__zero, x[7]);
+  output[12] = x[5];
+  output[13] = _mm_subs_epi16(__zero, x[13]);
+  output[14] = x[9];
+  output[15] = _mm_subs_epi16(__zero, x[1]);
+}
+
+static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output,
+                                   int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+
+  // stage 1
+  __m128i x[16];
+  x[1] = input[0];
+
+  // stage 2
+  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+
+  // stage 3
+  x[8] = x[0];
+  x[9] = x[1];
+
+  // stage 4
+  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+
+  // stage 5
+  x[4] = x[0];
+  x[5] = x[1];
+  x[12] = x[8];
+  x[13] = x[9];
+
+  // stage 6
+  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+
+  // stage 7
+  x[2] = x[0];
+  x[3] = x[1];
+  x[6] = x[4];
+  x[7] = x[5];
+  x[10] = x[8];
+  x[11] = x[9];
+  x[14] = x[12];
+  x[15] = x[13];
+
+  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage9_ssse3(output, x);
+}
+
+static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output,
+                                   int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 1
+  __m128i x[16];
+  x[1] = input[0];
+  x[3] = input[2];
+  x[5] = input[4];
+  x[7] = input[6];
+  x[8] = input[7];
+  x[10] = input[5];
+  x[12] = input[3];
+  x[14] = input[1];
+
+  // stage 2
+  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+  btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
+  btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
+  btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
+  btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
+  btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
+  btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
+  btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);
+
+  // stage 3
+  iadst16_stage3_ssse3(x);
+  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage5_ssse3(x);
+  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage7_ssse3(x);
+  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage9_ssse3(output, x);
+}
+void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+  // stage 1
+  __m128i x[16];
+  x[0] = input[15];
+  x[1] = input[0];
+  x[2] = input[13];
+  x[3] = input[2];
+  x[4] = input[11];
+  x[5] = input[4];
+  x[6] = input[9];
+  x[7] = input[6];
+  x[8] = input[7];
+  x[9] = input[8];
+  x[10] = input[5];
+  x[11] = input[10];
+  x[12] = input[3];
+  x[13] = input[12];
+  x[14] = input[1];
+  x[15] = input[14];
+
+  // stage 2
+  btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+  btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+  btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+  btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+  btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+  btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+  btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+  btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+  // stage 3~9
+  iadst16_stage3_ssse3(x);
+  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage5_ssse3(x);
+  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage7_ssse3(x);
+  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage9_ssse3(output, x);
+}
+
+void iadst16_w4_new_sse2(const __m128i *input, __m128i *output,
+                         int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+  // stage 1
+  __m128i x[16];
+  x[0] = input[15];
+  x[1] = input[0];
+  x[2] = input[13];
+  x[3] = input[2];
+  x[4] = input[11];
+  x[5] = input[4];
+  x[6] = input[9];
+  x[7] = input[6];
+  x[8] = input[7];
+  x[9] = input[8];
+  x[10] = input[5];
+  x[11] = input[10];
+  x[12] = input[3];
+  x[13] = input[12];
+  x[14] = input[1];
+  x[15] = input[14];
+
+  // stage 2
+  btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+  btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+  btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+  btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+  btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+  btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+  btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+  btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+  // stage 3
+  iadst16_stage3_ssse3(x);
+
+  // stage 4
+  btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+  btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+  btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+  btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+
+  // stage 5
+  iadst16_stage5_ssse3(x);
+
+  // stage 6
+  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+
+  // stage 7
+  iadst16_stage7_ssse3(x);
+
+  // stage 8
+  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
+  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
+
+  // stage 9
+  iadst16_stage9_ssse3(output, x);
+}
+
+static void iidentity4_new_ssse3(const __m128i *input, __m128i *output,
+                                 int8_t cos_bit) {
+  (void)cos_bit;
+  const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
+  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
+  for (int i = 0; i < 4; ++i) {
+    __m128i x = _mm_mulhrs_epi16(input[i], scale);
+    output[i] = _mm_adds_epi16(x, input[i]);
+  }
+}
+
+static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
+                                int8_t cos_bit) {
+  (void)cos_bit;
+  for (int i = 0; i < 8; ++i) {
+    output[i] = _mm_adds_epi16(input[i], input[i]);
+  }
+}
+
+static void iidentity16_new_ssse3(const __m128i *input, __m128i *output,
+                                  int8_t cos_bit) {
+  (void)cos_bit;
+  const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
+  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
+  for (int i = 0; i < 16; ++i) {
+    __m128i x = _mm_mulhrs_epi16(input[i], scale);
+    __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
+    output[i] = _mm_adds_epi16(x, srcx2);
+  }
+}
+
+static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
+                                               __m128i res) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
+  return _mm_packus_epi16(x0, x0);
+}
+
+static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  const __m128i zero = _mm_setzero_si128();
+  for (int i = 0; i < height; ++i, j += step) {
+    const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
+    __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
+    u = _mm_packus_epi16(u, zero);
+    *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u);
+  }
+}
+
+static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  for (int i = 0; i < height; ++i, j += step) {
+    const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
+    const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
+    _mm_storel_epi64((__m128i *)(output + i * stride), u);
+  }
+}
+
+// 1D functions process process 8 pixels at one time.
+static const transform_1d_ssse3
+    lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
+      { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 },
+      { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
+      { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 },
+      { idct32_new_sse2, NULL, NULL },
+      { idct64_low32_new_ssse3, NULL, NULL },
+    };
+
+// functions for blocks with eob at DC and within
+// topleft 8x8, 16x16, 32x32 corner
+static const transform_1d_ssse3
+    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { idct4_new_sse2, idct4_new_sse2, NULL, NULL },
+          { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL },
+          { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL },
+      },
+      { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL },
+        { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL },
+        { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } },
+      {
+          { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2,
+            NULL },
+          { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2,
+            NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3,
+          idct32_new_sse2 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3,
+          idct64_low32_new_ssse3 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+// 1D functions process process 4 pixels at one time.
+// used in 4x4, 4x8, 4x16, 8x4, 16x4
+static const transform_1d_ssse3
+    lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
+      { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 },
+      { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 },
+      { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 },
+      { NULL, NULL, NULL },
+      { NULL, NULL, NULL },
+    };
+
+static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
+                                           int stride, int shift, int height,
+                                           int txw_idx, int rect_type) {
+  const int32_t *input_row = input;
+  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
+  const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+                                          (1 << (NewSqrt2Bits - shift - 1)));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
+  if (rect_type != 1 && rect_type != -1) {
+    for (int i = 0; i < height; ++i) {
+      const __m128i src = load_32bit_to_16bit(input_row);
+      input_row += stride;
+      __m128i lo = _mm_unpacklo_epi16(src, one);
+      __m128i hi = _mm_unpackhi_epi16(src, one);
+      lo = _mm_madd_epi16(lo, scale_rounding);
+      hi = _mm_madd_epi16(hi, scale_rounding);
+      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
+      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
+      out[i] = _mm_packs_epi32(lo, hi);
+    }
+  } else {
+    const __m128i rect_scale =
+        _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+    for (int i = 0; i < height; ++i) {
+      __m128i src = load_32bit_to_16bit(input_row);
+      src = _mm_mulhrs_epi16(src, rect_scale);
+      input_row += stride;
+      __m128i lo = _mm_unpacklo_epi16(src, one);
+      __m128i hi = _mm_unpackhi_epi16(src, one);
+      lo = _mm_madd_epi16(lo, scale_rounding);
+      hi = _mm_madd_epi16(hi, scale_rounding);
+      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
+      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
+      out[i] = _mm_packs_epi32(lo, hi);
+    }
+  }
+}
+
+static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
+                                           __m128i *buf, int shift, int height,
+                                           int txh_idx) {
+  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
+  const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
+  const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
+  const __m128i zero = _mm_setzero_si128();
+  for (int h = 0; h < height; ++h) {
+    __m128i lo = _mm_unpacklo_epi16(buf[h], one);
+    __m128i hi = _mm_unpackhi_epi16(buf[h], one);
+    lo = _mm_madd_epi16(lo, scale_coeff);
+    hi = _mm_madd_epi16(hi, scale_coeff);
+    lo = _mm_srai_epi32(lo, NewSqrt2Bits);
+    hi = _mm_srai_epi32(hi, NewSqrt2Bits);
+    lo = _mm_add_epi32(lo, shift_rounding);
+    hi = _mm_add_epi32(hi, shift_rounding);
+    lo = _mm_srai_epi32(lo, -shift);
+    hi = _mm_srai_epi32(hi, -shift);
+    __m128i x = _mm_packs_epi32(lo, hi);
+
+    const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
+    x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
+    const __m128i u = _mm_packus_epi16(x, x);
+    _mm_storel_epi64((__m128i *)(output), u);
+    output += stride;
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
+                                                   uint8_t *output, int stride,
+                                                   TX_SIZE tx_size) {
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int row_max = AOMMIN(32, txfm_size_row);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  __m128i buf[32];
+
+  for (int i = 0; i < (input_stride >> 3); ++i) {
+    iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max,
+                            txw_idx, rect_type);
+    iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max,
+                            txh_idx);
+  }
+}
+
+void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size_, int eob) {
+  (void)tx_size_;
+  (void)eob;
+  __m128i buf[4];
+  const TX_SIZE tx_size = TX_4X4;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+
+  const transform_1d_ssse3 row_txfm =
+      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_ssse3 col_txfm =
+      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+  transpose_16bit_4x4(buf, buf);
+  row_txfm(buf, buf, cos_bit_row);
+  if (lr_flip) {
+    __m128i temp[4];
+    flip_buf_sse2(buf, temp, txfm_size_col);
+    transpose_16bit_4x4(temp, buf);
+  } else {
+    transpose_16bit_4x4(buf, buf);
+  }
+  col_txfm(buf, buf, cos_bit_col);
+  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
+                                                 __m128i res0, __m128i res1) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x0 = _mm_unpacklo_epi8(pred, zero);
+  __m128i x1 = _mm_unpackhi_epi8(pred, zero);
+  x0 = _mm_adds_epi16(res0, x0);
+  x1 = _mm_adds_epi16(res1, x1);
+  return _mm_packus_epi16(x0, x1);
+}
+
+static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
+                                                int stride, int flipud,
+                                                int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  for (int i = 0; i < height; ++i, j += step) {
+    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
+    __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
+    _mm_storeu_si128((__m128i *)(output + i * stride), u);
+  }
+}
+
+static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
+                                     int size) {
+  const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
+  for (int i = 0; i < size; ++i) {
+    output[i] = _mm_mulhrs_epi16(input[i], scale);
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  __m128i buf1[64 * 8];
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 3;
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_ssse3 row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+  const transform_1d_ssse3 col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+    __m128i buf0[64];
+    const int32_t *input_row = input + i * input_stride * 8;
+    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+      __m128i *buf0_cur = buf0 + j * 8;
+      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
+      transpose_16bit_8x8(buf0_cur, buf0_cur);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
+    }
+    row_txfm(buf0, buf0, cos_bit_row);
+    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+    __m128i *_buf1 = buf1 + i * 8;
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        __m128i temp[8];
+        flip_buf_sse2(buf0 + 8 * j, temp, 8);
+        transpose_16bit_8x8(temp,
+                            _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
+      }
+    }
+  }
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
+    round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
+  }
+
+  if (txfm_size_col >= 16) {
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {
+      lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
+                                   output + 16 * i, stride, ud_flip,
+                                   txfm_size_row);
+    }
+  } else if (txfm_size_col == 8) {
+    lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = (eobx + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+  assert(fun_idx < 5);
+  const transform_1d_ssse3 col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+
+  assert(col_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    __m128i buf0[64];
+    iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
+                            eoby + 1, txw_idx, rect_type);
+    col_txfm(buf0, buf0, cos_bit_col);
+    __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
+    int k = ud_flip ? (txfm_size_row - 1) : 0;
+    const int step = ud_flip ? -1 : 1;
+    uint8_t *out = output + 8 * i;
+    for (int j = 0; j < txfm_size_row; ++j, k += step) {
+      const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
+      __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
+      const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
+      _mm_storel_epi64((__m128i *)(out), u);
+      out += stride;
+    }
+  }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  __m128i buf1[64];
+  int eobx, eoby;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 3;
+  const int buf_size_h_div8 = (eoby + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const transform_1d_ssse3 row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  for (int i = 0; i < buf_size_h_div8; i++) {
+    __m128i buf0[64];
+    const int32_t *input_row = input + i * input_stride * 8;
+    for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
+      __m128i *buf0_cur = buf0 + j * 8;
+      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
+      transpose_16bit_8x8(buf0_cur, buf0_cur);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
+    }
+    row_txfm(buf0, buf0, cos_bit_row);
+    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+    __m128i *_buf1 = buf1;
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        __m128i temp[8];
+        flip_buf_sse2(buf0 + 8 * j, temp, 8);
+        transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
+      }
+    }
+
+    for (int j = 0; j < buf_size_w_div8; ++j) {
+      iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
+                              buf1 + j * 8, shift[1], 8, txh_idx);
+    }
+  }
+}
+
+// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
+static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  switch (tx_type) {
+    case DCT_DCT:
+      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
+                                             tx_size, eob);
+      break;
+    case IDTX:
+      lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
+      break;
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
+                                            tx_size, eob);
+      break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
+                                            tx_size, eob);
+      break;
+    default:
+      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
+                                             tx_size, eob);
+      break;
+  }
+}
+
+void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size_, int eob) {
+  (void)tx_size_;
+  (void)eob;
+  __m128i buf[8];
+  const TX_SIZE tx_size = TX_4X8;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+
+  const transform_1d_ssse3 row_txfm =
+      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_ssse3 col_txfm =
+      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+  transpose_16bit_4x8(buf, buf);
+  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
+  row_txfm(buf, buf, cos_bit_row);
+  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
+  if (lr_flip) {
+    __m128i temp[4];
+    flip_buf_sse2(buf, temp, txfm_size_col);
+    transpose_16bit_8x4(temp, buf);
+  } else {
+    transpose_16bit_8x4(buf, buf);
+  }
+  col_txfm(buf, buf, cos_bit_col);
+  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size_, int eob) {
+  (void)tx_size_;
+  (void)eob;
+  __m128i buf[8];
+  const TX_SIZE tx_size = TX_8X4;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+
+  const transform_1d_ssse3 row_txfm =
+      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_ssse3 col_txfm =
+      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
+  transpose_16bit_8x4(buf, buf);
+  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
+  row_txfm(buf, buf, cos_bit_row);
+  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
+  if (lr_flip) {
+    __m128i temp[8];
+    flip_buf_sse2(buf, temp, txfm_size_col);
+    transpose_16bit_4x8(temp, buf);
+  } else {
+    transpose_16bit_4x8(buf, buf);
+  }
+  col_txfm(buf, buf, cos_bit_col);
+  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output,
+                                     int stride, TX_TYPE tx_type,
+                                     TX_SIZE tx_size_, int eob) {
+  (void)tx_size_;
+  (void)eob;
+  __m128i buf[16];
+  const TX_SIZE tx_size = TX_4X16;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+
+  const transform_1d_ssse3 row_txfm =
+      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_ssse3 col_txfm =
+      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  const int row_one_loop = 8;
+  for (int i = 0; i < 2; ++i) {
+    const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
+    __m128i *buf_cur = buf + i * row_one_loop;
+    load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
+                                  row_one_loop);
+    transpose_16bit_4x8(buf_cur, buf_cur);
+    row_txfm(buf_cur, buf_cur, cos_bit_row);
+    round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
+    if (lr_flip) {
+      __m128i temp[8];
+      flip_buf_sse2(buf_cur, temp, txfm_size_col);
+      transpose_16bit_8x4(temp, buf_cur);
+    } else {
+      transpose_16bit_8x4(buf_cur, buf_cur);
+    }
+  }
+  col_txfm(buf, buf, cos_bit_col);
+  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output,
+                                     int stride, TX_TYPE tx_type,
+                                     TX_SIZE tx_size_, int eob) {
+  (void)tx_size_;
+  (void)eob;
+  __m128i buf[16];
+  const TX_SIZE tx_size = TX_16X4;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 3;
+
+  const transform_1d_ssse3 row_txfm =
+      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+  const transform_1d_ssse3 col_txfm =
+      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int row_one_loop = 8;
+  for (int i = 0; i < buf_size_w_div8; ++i) {
+    const int32_t *input_cur = input + i * row_one_loop;
+    __m128i *buf_cur = buf + i * row_one_loop;
+    load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
+                               txfm_size_row);
+    transpose_16bit_8x4(buf_cur, buf_cur);
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
+  if (lr_flip) {
+    __m128i temp[16];
+    flip_buf_sse2(buf, temp, 16);
+    transpose_16bit_4x8(temp, buf);
+    transpose_16bit_4x8(temp + 8, buf + 8);
+  } else {
+    transpose_16bit_4x8(buf, buf);
+    transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
+  }
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
+    round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
+  }
+  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
+  lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
+}
+
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size, int eob) {
+  switch (tx_size) {
+    case TX_4X4:
+      lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+    case TX_4X8:
+      lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+    case TX_8X4:
+      lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+    case TX_4X16:
+      lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
+                                      eob);
+      break;
+    case TX_16X4:
+      lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
+                                      eob);
+      break;
+    default:
+      lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
+                                          tx_size, eob);
+      break;
+  }
+}
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                            const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  if (!txfm_param->lossless) {
+    av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
+                                   txfm_param->tx_size, txfm_param->eob);
+  } else {
+    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+  }
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
new file mode 100644
index 000000000..dc9be25d2
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>  // SSSE3
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define btf_16_ssse3(w0, w1, in, out0, out1)    \
+  do {                                          \
+    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
+    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
+    const __m128i _in = in;                     \
+    out0 = _mm_mulhrs_epi16(_in, _w0);          \
+    out1 = _mm_mulhrs_epi16(_in, _w1);          \
+  } while (0)
+
+#define btf_16_adds_subs_sse2(in0, in1) \
+  do {                                  \
+    const __m128i _in0 = in0;           \
+    const __m128i _in1 = in1;           \
+    in0 = _mm_adds_epi16(_in0, _in1);   \
+    in1 = _mm_subs_epi16(_in0, _in1);   \
+  } while (0)
+
+#define btf_16_subs_adds_sse2(in0, in1) \
+  do {                                  \
+    const __m128i _in0 = in0;           \
+    const __m128i _in1 = in1;           \
+    in1 = _mm_subs_epi16(_in0, _in1);   \
+    in0 = _mm_adds_epi16(_in0, _in1);   \
+  } while (0)
+
+#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
+  do {                                                  \
+    const __m128i _in0 = in0;                           \
+    const __m128i _in1 = in1;                           \
+    out0 = _mm_adds_epi16(_in0, _in1);                  \
+    out1 = _mm_subs_epi16(_in0, _in1);                  \
+  } while (0)
+
+static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
+  if (bit < 0) {
+    const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm_mulhrs_epi16(in[i], scale);
+    }
+  } else if (bit > 0) {
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm_slli_epi16(in[i], bit);
+    }
+  }
+}
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+  IDCT_1D,
+  IADST_1D,
+  IFLIPADST_1D = IADST_1D,
+  IIDENTITY_1D,
+  ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
+  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
+  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
+  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
+  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+                                          4 * 5793 };
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x16_default[16]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x32_default[32]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+  0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x32_default[32]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x16_default[16]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+  0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+  0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t *,
+                av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+  NULL,
+  av1_eob_to_eobxy_8x8_default,
+  av1_eob_to_eobxy_16x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x16_default,
+  av1_eob_to_eobxy_16x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+  av1_eob_to_eobxy_32x32_default,
+  av1_eob_to_eobxy_32x32_default,
+  NULL,
+  NULL,
+  av1_eob_to_eobxy_8x32_default,
+  av1_eob_to_eobxy_32x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+};
+
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+  0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Transform block width in log2 for eob (size of 64 map to 32)
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+  2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size, int eob) {
+  if (eob == 1) {
+    *eobx = 0;
+    *eoby = 0;
+    return;
+  }
+
+  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+  const int eob_row = (eob - 1) >> tx_w_log2;
+  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
+  *eobx = eobxy & 0xFF;
+  *eoby = eobxy >> 8;
+}
+
+static int eob_fill[32] = {
+  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
+  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  eob -= 1;
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
+  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
+  const int temp_eoby = eob / (eobx_max + 1);
+  assert(temp_eoby < 32);
+  *eoby = eob_fill[temp_eoby];
+}
+
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  eob -= 1;
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
+  *eobx = eob / (eoby_max + 1);
+  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
+}
+
+typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output,
+                                   int8_t cos_bit);
+
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size, int eob);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
deleted file mode 100644
index fd0a6ed2c..000000000
--- a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
+++ /dev/null
@@ -1,144 +0,0 @@
-#ifndef AV1_TXMF1D_SSE2_H_
-#define AV1_TXMF1D_SSE2_H_
-
-#include <smmintrin.h>
-#include "av1/common/av1_txfm.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
-                          const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
-                          const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range);
-
-void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
-                            const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
-                            const int8_t *cos_bit, const int8_t *stage_range);
-
-void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
-                          const int8_t *cos_bit, const int8_t *stage_range);
-void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
-                          const int8_t *cos_bit, const int8_t *stage_range);
-void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range);
-void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range);
-void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range);
-
-void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
-                            const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iadst32_new_sse4_1(const __m128i *input, __m128i *output,
-                            const int8_t *cos_bit, const int8_t *stage_range);
-
-static INLINE void transpose_32_4x4(int stride, const __m128i *input,
-                                    __m128i *output) {
-  __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
-  __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
-  __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
-  __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
-
-  output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
-  output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
-  output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
-  output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
-}
-
-// the entire input block can be represent by a grid of 4x4 blocks
-// each 4x4 blocks can be represent by 4 vertical __m128i
-// we first transpose each 4x4 block internally
-// then transpose the grid
-static INLINE void transpose_32(int txfm_size, const __m128i *input,
-                                __m128i *output) {
-  const int num_per_128 = 4;
-  const int row_size = txfm_size;
-  const int col_size = txfm_size / num_per_128;
-  int r, c;
-
-  // transpose each 4x4 block internally
-  for (r = 0; r < row_size; r += 4) {
-    for (c = 0; c < col_size; c++) {
-      transpose_32_4x4(col_size, &input[r * col_size + c],
-                       &output[c * 4 * col_size + r / 4]);
-    }
-  }
-}
-
-static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
-  __m128i tmp, round;
-  round = _mm_set1_epi32(1 << (bit - 1));
-  tmp = _mm_add_epi32(vec, round);
-  return _mm_srai_epi32(tmp, bit);
-}
-
-static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
-                                               const int size, const int bit) {
-  if (bit > 0) {
-    int i;
-    for (i = 0; i < size; i++) {
-      output[i] = round_shift_32_sse4_1(input[i], bit);
-    }
-  } else {
-    int i;
-    for (i = 0; i < size; i++) {
-      output[i] = _mm_slli_epi32(input[i], -bit);
-    }
-  }
-}
-
-// out0 = in0*w0 + in1*w1
-// out1 = -in1*w0 + in0*w1
-#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
-  do {                                                         \
-    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
-    ww0 = _mm_set1_epi32(w0);                                  \
-    ww1 = _mm_set1_epi32(w1);                                  \
-    in0_w0 = _mm_mullo_epi32(in0, ww0);                        \
-    in1_w1 = _mm_mullo_epi32(in1, ww1);                        \
-    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
-    out0 = round_shift_32_sse4_1(out0, bit);                   \
-    in0_w1 = _mm_mullo_epi32(in0, ww1);                        \
-    in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
-    out1 = _mm_sub_epi32(in0_w1, in1_w0);                      \
-    out1 = round_shift_32_sse4_1(out1, bit);                   \
-  } while (0)
-
-// out0 = in0*w0 + in1*w1
-// out1 = in1*w0 - in0*w1
-#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
-  do {                                                         \
-    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
-    ww0 = _mm_set1_epi32(w0);                                  \
-    ww1 = _mm_set1_epi32(w1);                                  \
-    in0_w0 = _mm_mullo_epi32(in0, ww0);                        \
-    in1_w1 = _mm_mullo_epi32(in1, ww1);                        \
-    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
-    out0 = round_shift_32_sse4_1(out0, bit);                   \
-    in0_w1 = _mm_mullo_epi32(in0, ww1);                        \
-    in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
-    out1 = _mm_sub_epi32(in1_w0, in0_w1);                      \
-    out1 = round_shift_32_sse4_1(out1, bit);                   \
-  } while (0)
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // AV1_TXMF1D_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
new file mode 100644
index 000000000..721cfe059
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void btf_16_w4_sse2(
+    const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
+    const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
+    __m128i *const out0, __m128i *const out1) {
+  const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
+  const __m128i u0 = _mm_madd_epi16(t0, *w0);
+  const __m128i v0 = _mm_madd_epi16(t0, *w1);
+  const __m128i a0 = _mm_add_epi32(u0, __rounding);
+  const __m128i b0 = _mm_add_epi32(v0, __rounding);
+  const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
+  const __m128i d0 = _mm_srai_epi32(b0, cos_bit);
+
+  *out0 = _mm_packs_epi32(c0, c0);
+  *out1 = _mm_packs_epi32(d0, c0);
+}
+
+#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
+  {                                                  \
+    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
+    __m128i u0 = _mm_madd_epi16(t0, w0);             \
+    __m128i v0 = _mm_madd_epi16(t0, w1);             \
+                                                     \
+    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
+    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
+                                                     \
+    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
+    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
+                                                     \
+    out0 = _mm_packs_epi32(c0, c0);                  \
+    out1 = _mm_packs_epi32(d0, d0);                  \
+  }
+
+#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
+  {                                               \
+    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
+    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
+    __m128i u0 = _mm_madd_epi16(t0, w0);          \
+    __m128i u1 = _mm_madd_epi16(t1, w0);          \
+    __m128i v0 = _mm_madd_epi16(t0, w1);          \
+    __m128i v1 = _mm_madd_epi16(t1, w1);          \
+                                                  \
+    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
+    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
+    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
+    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
+                                                  \
+    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
+    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
+    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
+    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
+                                                  \
+    out0 = _mm_packs_epi32(c0, c1);               \
+    out1 = _mm_packs_epi32(d0, d1);               \
+  }
+
+static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
+  return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
+  const __m128i a_low = _mm_load_si128((const __m128i *)a);
+  return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
+}
+
+static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
+  const __m128i a_low = _mm_load_si128((const __m128i *)a);
+  return _mm_packs_epi32(a_low, a_low);
+}
+
+// Store 4 16 bit values. Sign extend the values.
+static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
+  const __m128i a_lo = _mm_unpacklo_epi16(a, a);
+  const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
+  _mm_store_si128((__m128i *)b, a_1);
+}
+
+// Store 8 16 bit values. Sign extend the values.
+static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
+  const __m128i a_lo = _mm_unpacklo_epi16(a, a);
+  const __m128i a_hi = _mm_unpackhi_epi16(a, a);
+  const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
+  const __m128i a_2 = _mm_srai_epi32(a_hi, 16);
+  _mm_store_si128((__m128i *)b, a_1);
+  _mm_store_si128((__m128i *)(b + 4), a_2);
+}
+
+static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) {
+  const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
+  const __m128i b = _mm_madd_epi16(a, scale_rounding);
+  return _mm_srai_epi32(b, NewSqrt2Bits);
+}
+
+static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
+                                                int32_t *const b) {
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
+  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+  _mm_store_si128((__m128i *)b, b_lo);
+}
+
+static INLINE void store_rect_16bit_to_32bit(const __m128i a,
+                                             int32_t *const b) {
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
+  const __m128i a_hi = _mm_unpackhi_epi16(a, one);
+  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+  const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+  _mm_store_si128((__m128i *)b, b_lo);
+  _mm_store_si128((__m128i *)(b + 4), b_hi);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
+                                                 const int stride,
+                                                 __m128i *const out,
+                                                 const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
+  }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
+                                                      const int stride,
+                                                      __m128i *const out,
+                                                      const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
+  }
+}
+
+static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
+                                              __m128i *out, int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = load_16bit_to_16bit(in + i * stride);
+  }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
+                                                   int stride, __m128i *out,
+                                                   int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
+  }
+}
+
+static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
+                                              __m128i *out, int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = load_32bit_to_16bit(in + i * stride);
+  }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
+                                                 __m128i *out, int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = load_32bit_to_16bit_w4(in + i * stride);
+  }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
+                                                   int stride, __m128i *out,
+                                                   int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride);
+  }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
+                                                  int32_t *const out,
+                                                  const int stride,
+                                                  const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    store_16bit_to_32bit_w4(in[i], out + i * stride);
+  }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
+                                                  int32_t *const out,
+                                                  const int stride,
+                                                  const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    store_16bit_to_32bit(in[i], out + i * stride);
+  }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
+                                                       int32_t *const out,
+                                                       const int stride,
+                                                       const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    store_rect_16bit_to_32bit_w4(in[i], out + i * stride);
+  }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
+                                                       int32_t *const out,
+                                                       const int stride,
+                                                       const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    store_rect_16bit_to_32bit(in[i], out + i * stride);
+  }
+}
+
+static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
+                                                   uint16_t *out,
+                                                   const int stride) {
+  for (int i = 0; i < 8; ++i) {
+    _mm_store_si128((__m128i *)(out + i * stride), in[i]);
+  }
+}
+
+static INLINE void round_shift_16bit(__m128i *in, int size, int bit) {
+  if (bit < 0) {
+    bit = -bit;
+    __m128i rounding = _mm_set1_epi16(1 << (bit - 1));
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm_adds_epi16(in[i], rounding);
+      in[i] = _mm_srai_epi16(in[i], bit);
+    }
+  } else if (bit > 0) {
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm_slli_epi16(in[i], bit);
+    }
+  }
+}
+
+static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
+  for (int i = 0; i < size; ++i) {
+    out[size - i - 1] = in[i];
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd);
+
+typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
+                                  int8_t cos_bit);
+
+typedef struct {
+  transform_1d_sse2 col, row;  // vertical and horizontal
+} transform_2d_sse2;
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+#endif  // AV1_COMMON_X86_AV1_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
new file mode 100644
index 000000000..cccc62f03
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
@@ -0,0 +1,10 @@
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) {
+  __m128i *const vec = (__m128i *)arr;
+  const int vec_size = size >> 2;
+  av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit);
+}
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
new file mode 100644
index 000000000..faf7251fa
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -0,0 +1,60 @@
+#ifndef AV1_TXFM_SSE4_H_
+#define AV1_TXFM_SSE4_H_
+
+#include <smmintrin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
+  __m128i tmp, round;
+  round = _mm_set1_epi32(1 << (bit - 1));
+  tmp = _mm_add_epi32(vec, round);
+  return _mm_srai_epi32(tmp, bit);
+}
+
+static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
+                                                   __m128i *output,
+                                                   const int size,
+                                                   const int bit) {
+  if (bit > 0) {
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = av1_round_shift_32_sse4_1(input[i], bit);
+    }
+  } else {
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = _mm_slli_epi32(input[i], -bit);
+    }
+  }
+}
+
+static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
+                                                        __m128i *output,
+                                                        const int size,
+                                                        const int bit) {
+  const __m128i sqrt2 = _mm_set1_epi32(NewSqrt2);
+  if (bit > 0) {
+    int i;
+    for (i = 0; i < size; i++) {
+      const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit);
+      const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
+      output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
+    }
+  } else {
+    int i;
+    for (i = 0; i < size; i++) {
+      const __m128i r0 = _mm_slli_epi32(input[i], -bit);
+      const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
+      output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
+    }
+  }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AV1_TXFM_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/cfl_avx2.c b/third_party/aom/av1/common/x86/cfl_avx2.c
new file mode 100644
index 000000000..a8bfdcce6
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_avx2.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd)                           \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 32)                                     \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 16)                                     \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 8)                                      \
+  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2(    \
+      TX_SIZE tx_size) {                                                   \
+    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {     \
+      subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                      \
+      subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                      \
+      subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                    \
+      subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                    \
+      cfl_subsample_##bd##_null,            /* 64x64 (invalid CFL size) */ \
+      subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                      \
+      subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                      \
+      subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                     \
+      subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                     \
+      subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                    \
+      subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                    \
+      cfl_subsample_##bd##_null,            /* 32x64 (invalid CFL size) */ \
+      cfl_subsample_##bd##_null,            /* 64x32 (invalid CFL size) */ \
+      subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16  */                    \
+      subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4  */                    \
+      subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32  */                    \
+      subsample_##bd##_##sub##_32x8_avx2,   /* 32x8  */                    \
+      cfl_subsample_##bd##_null,            /* 16x64 (invalid CFL size) */ \
+      cfl_subsample_##bd##_null,            /* 64x16 (invalid CFL size) */ \
+    };                                                                     \
+    return subfn_##sub[tx_size];                                           \
+  }
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;                               // Forever 32
+  const __m256i twos = _mm256_set1_epi8(2);  // Thirty two twos
+  const int luma_stride = input_stride << 1;
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+
+    __m256i top_16x16 = _mm256_maddubs_epi16(top, twos);
+    __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
+    __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16);
+
+    _mm256_storeu_si256(row, sum_16x16);
+
+    input += luma_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd)
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;                                // Forever 32
+  const __m256i fours = _mm256_set1_epi8(4);  // Thirty two fours
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
+    _mm256_storeu_si256(row, top_16x16);
+    input += input_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only
+ * performed on block of width 32.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+  const __m256i zeros = _mm256_setzero_si256();
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0));
+
+    __m256i row_lo = _mm256_unpacklo_epi8(top, zeros);
+    row_lo = _mm256_slli_epi16(row_lo, 3);
+    __m256i row_hi = _mm256_unpackhi_epi8(top, zeros);
+    row_hi = _mm256_slli_epi16(row_hi, 3);
+
+    _mm256_storeu_si256(row, row_lo);
+    _mm256_storeu_si256(row + 1, row_hi);
+
+    input += input_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd)
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  const int luma_stride = input_stride << 1;
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+    __m256i sum = _mm256_add_epi16(top, bot);
+
+    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+    __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride));
+    __m256i sum_1 = _mm256_add_epi16(top_1, bot_1);
+
+    __m256i hsum = _mm256_hadd_epi16(sum, sum_1);
+    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
+    hsum = _mm256_add_epi16(hsum, hsum);
+
+    _mm256_storeu_si256(row, hsum);
+
+    input += luma_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd)
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ */
+static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+    __m256i hsum = _mm256_hadd_epi16(top, top_1);
+    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
+    hsum = _mm256_slli_epi16(hsum, 2);
+
+    _mm256_storeu_si256(row, hsum);
+
+    input += input_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd)
+
+static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+  do {
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
+    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+    _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3));
+    _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3));
+    input += input_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd)
+
+static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
+                                        __m256i alpha_sign, __m256i dc_q0) {
+  __m256i ac_q3 = _mm256_loadu_si256(input);
+  __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
+  __m256i scaled_luma_q0 =
+      _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
+  scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
+  return _mm256_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
+                                        uint8_t *dst, int dst_stride,
+                                        int alpha_q3, int width, int height) {
+  (void)width;
+  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+  const __m256i dc_q0 = _mm256_set1_epi16(*dst);
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+
+  do {
+    __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+    __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+    res = _mm256_packus_epi16(res, next);
+    res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
+    _mm256_storeu_si256((__m256i *)dst, res);
+    dst += dst_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 32, 8, lbd);
+CFL_PREDICT_X(avx2, 32, 16, lbd);
+CFL_PREDICT_X(avx2, 32, 32, lbd);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
+  static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
+    predict_lbd_4x4_ssse3,   /* 4x4 */
+    predict_lbd_8x8_ssse3,   /* 8x8 */
+    predict_lbd_16x16_ssse3, /* 16x16 */
+    predict_lbd_32x32_avx2,  /* 32x32 */
+    cfl_predict_lbd_null,    /* 64x64 (invalid CFL size) */
+    predict_lbd_4x8_ssse3,   /* 4x8 */
+    predict_lbd_8x4_ssse3,   /* 8x4 */
+    predict_lbd_8x16_ssse3,  /* 8x16 */
+    predict_lbd_16x8_ssse3,  /* 16x8 */
+    predict_lbd_16x32_ssse3, /* 16x32 */
+    predict_lbd_32x16_avx2,  /* 32x16 */
+    cfl_predict_lbd_null,    /* 32x64 (invalid CFL size) */
+    cfl_predict_lbd_null,    /* 64x32 (invalid CFL size) */
+    predict_lbd_4x16_ssse3,  /* 4x16  */
+    predict_lbd_16x4_ssse3,  /* 16x4  */
+    predict_lbd_8x32_ssse3,  /* 8x32  */
+    predict_lbd_32x8_avx2,   /* 32x8  */
+    cfl_predict_lbd_null,    /* 16x64 (invalid CFL size) */
+    cfl_predict_lbd_null,    /* 64x16 (invalid CFL size) */
+  };
+  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+  // function pointer array out of bounds.
+  return pred[tx_size % TX_SIZES_ALL];
+}
+
+static __m256i highbd_max_epi16(int bd) {
+  const __m256i neg_one = _mm256_set1_epi16(-1);
+  // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
+  return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one);
+}
+
+static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
+  return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
+}
+
+static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3,
+                                        uint16_t *dst, int dst_stride,
+                                        int alpha_q3, int bd, int width,
+                                        int height) {
+  // Use SSSE3 version for smaller widths
+  assert(width == 16 || width == 32);
+  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+  const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst);
+  const __m256i max = highbd_max_epi16(bd);
+
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+  do {
+    const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+    _mm256_storeu_si256((__m256i *)dst,
+                        highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
+    if (width == 32) {
+      const __m256i res_1 =
+          predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+      _mm256_storeu_si256(
+          (__m256i *)(dst + 16),
+          highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
+    }
+    dst += dst_stride;
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 16, 4, hbd)
+CFL_PREDICT_X(avx2, 16, 8, hbd)
+CFL_PREDICT_X(avx2, 16, 16, hbd)
+CFL_PREDICT_X(avx2, 16, 32, hbd)
+CFL_PREDICT_X(avx2, 32, 8, hbd)
+CFL_PREDICT_X(avx2, 32, 16, hbd)
+CFL_PREDICT_X(avx2, 32, 32, hbd)
+
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
+  static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
+    predict_hbd_4x4_ssse3,  /* 4x4 */
+    predict_hbd_8x8_ssse3,  /* 8x8 */
+    predict_hbd_16x16_avx2, /* 16x16 */
+    predict_hbd_32x32_avx2, /* 32x32 */
+    cfl_predict_hbd_null,   /* 64x64 (invalid CFL size) */
+    predict_hbd_4x8_ssse3,  /* 4x8 */
+    predict_hbd_8x4_ssse3,  /* 8x4 */
+    predict_hbd_8x16_ssse3, /* 8x16 */
+    predict_hbd_16x8_avx2,  /* 16x8 */
+    predict_hbd_16x32_avx2, /* 16x32 */
+    predict_hbd_32x16_avx2, /* 32x16 */
+    cfl_predict_hbd_null,   /* 32x64 (invalid CFL size) */
+    cfl_predict_hbd_null,   /* 64x32 (invalid CFL size) */
+    predict_hbd_4x16_ssse3, /* 4x16  */
+    predict_hbd_16x4_avx2,  /* 16x4  */
+    predict_hbd_8x32_ssse3, /* 8x32  */
+    predict_hbd_32x8_avx2,  /* 32x8  */
+    cfl_predict_hbd_null,   /* 16x64 (invalid CFL size) */
+    cfl_predict_hbd_null,   /* 64x16 (invalid CFL size) */
+  };
+  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+  // function pointer array out of bounds.
+  return pred[tx_size % TX_SIZES_ALL];
+}
+
+// Returns a vector where all the (32-bits) elements are the sum of all the
+// lanes in a.
+static INLINE __m256i fill_sum_epi32(__m256i a) {
+  // Given that a == [A, B, C, D, E, F, G, H]
+  a = _mm256_hadd_epi32(a, a);
+  // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
+  // a == [A', C', A', C', E', G', E', G']
+  a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  // a == [A', C', E', G', A', C', E', G']
+  a = _mm256_hadd_epi32(a, a);
+  // Given that A'' == A' + C' and E'' == E' + G'
+  // a == [A'', E'', A'', E'', A'', E'', A'', E'']
+  return _mm256_hadd_epi32(a, a);
+  // Given that A''' == A'' + E''
+  // a == [A''', A''', A''', A''', A''', A''', A''', A''']
+}
+
+static INLINE __m256i _mm256_addl_epi16(__m256i a) {
+  return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()),
+                          _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
+}
+
+static INLINE void subtract_average_avx2(const uint16_t *src_ptr,
+                                         int16_t *dst_ptr, int width,
+                                         int height, int round_offset,
+                                         int num_pel_log2) {
+  // Use SSE2 version for smaller widths
+  assert(width == 16 || width == 32);
+
+  const __m256i *src = (__m256i *)src_ptr;
+  const __m256i *const end = src + height * CFL_BUF_LINE_I256;
+  // To maximize usage of the AVX2 registers, we sum two rows per loop
+  // iteration
+  const int step = 2 * CFL_BUF_LINE_I256;
+
+  __m256i sum = _mm256_setzero_si256();
+  // For width 32, we use a second sum accumulator to reduce accumulator
+  // dependencies in the loop.
+  __m256i sum2;
+  if (width == 32) sum2 = _mm256_setzero_si256();
+
+  do {
+    // Add top row to the bottom row
+    __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
+                                  _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
+    sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
+    if (width == 32) { /* Don't worry, this if it gets optimized out. */
+      // Add the second part of the top row to the second part of the bottom row
+      __m256i l1 =
+          _mm256_add_epi16(_mm256_loadu_si256(src + 1),
+                           _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
+      sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
+    }
+    src += step;
+  } while (src < end);
+  // Combine both sum accumulators
+  if (width == 32) sum = _mm256_add_epi32(sum, sum2);
+
+  __m256i fill = fill_sum_epi32(sum);
+
+  __m256i avg_epi16 = _mm256_srli_epi32(
+      _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
+  avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);
+
+  // Store and subtract loop
+  src = (__m256i *)src_ptr;
+  __m256i *dst = (__m256i *)dst_ptr;
+  do {
+    _mm256_storeu_si256(dst,
+                        _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
+    if (width == 32) {
+      _mm256_storeu_si256(
+          dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
+    }
+    src += CFL_BUF_LINE_I256;
+    dst += CFL_BUF_LINE_I256;
+  } while (src < end);
+}
+
+// Declare wrappers for AVX2 sizes
+CFL_SUB_AVG_X(avx2, 16, 4, 32, 6)
+CFL_SUB_AVG_X(avx2, 16, 8, 64, 7)
+CFL_SUB_AVG_X(avx2, 16, 16, 128, 8)
+CFL_SUB_AVG_X(avx2, 16, 32, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 8, 128, 8)
+CFL_SUB_AVG_X(avx2, 32, 16, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 32, 512, 10)
+
+// Based on the observation that for small blocks AVX2 does not outperform
+// SSE2, we call the SSE2 code for block widths 4 and 8.
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size) {
+  static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
+    subtract_average_4x4_sse2,   /* 4x4 */
+    subtract_average_8x8_sse2,   /* 8x8 */
+    subtract_average_16x16_avx2, /* 16x16 */
+    subtract_average_32x32_avx2, /* 32x32 */
+    cfl_subtract_average_null,   /* 64x64 (invalid CFL size) */
+    subtract_average_4x8_sse2,   /* 4x8 */
+    subtract_average_8x4_sse2,   /* 8x4 */
+    subtract_average_8x16_sse2,  /* 8x16 */
+    subtract_average_16x8_avx2,  /* 16x8 */
+    subtract_average_16x32_avx2, /* 16x32 */
+    subtract_average_32x16_avx2, /* 32x16 */
+    cfl_subtract_average_null,   /* 32x64 (invalid CFL size) */
+    cfl_subtract_average_null,   /* 64x32 (invalid CFL size) */
+    subtract_average_4x16_sse2,  /* 4x16 */
+    subtract_average_16x4_avx2,  /* 16x4 */
+    subtract_average_8x32_sse2,  /* 8x32 */
+    subtract_average_32x8_avx2,  /* 32x8 */
+    cfl_subtract_average_null,   /* 16x64 (invalid CFL size) */
+    cfl_subtract_average_null,   /* 64x16 (invalid CFL size) */
+  };
+  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
+  // index the function pointer array out of bounds.
+  return sub_avg[tx_size % TX_SIZES_ALL];
+}
diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h
new file mode 100644
index 000000000..7479ac3e1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_simd.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/blockd.h"
+
+// SSSE3 version is optimal for with == 4, we reuse them in AVX2
+void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 16, we reuse it in AVX2
+void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 4, we reuse them in AVX2
+void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 16, we reuse it in AVX2
+void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 4, we reuse them in AVX2
+void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 16, we reuse it in AVX2
+void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is faster for with == 16, we reuse it in AVX2
+void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is faster for with == 16, we reuse it in AVX2
+void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride,
+                                 uint16_t *output_q3);
+void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+
+// SSSE3 version is faster for with == 16, we reuse it in AVX2
+void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride,
+                                  uint16_t *output_q3);
+void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride,
+                                   uint16_t *output_q3);
+
+// SSE2 version is optimal for with == 4, we reuse them in AVX2
+void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
+
+// SSE2 version is optimal for with == 8, we reuse them in AVX2
+void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
+
+void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                           int dst_stride, int alpha_q3);
+void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                           int dst_stride, int alpha_q3);
+void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                            int dst_stride, int alpha_q3);
+
+void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                           int dst_stride, int alpha_q3);
+void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                           int dst_stride, int alpha_q3);
+void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                            int dst_stride, int alpha_q3);
+void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                            int dst_stride, int alpha_q3);
+
+void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                            int dst_stride, int alpha_q3);
+void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                            int dst_stride, int alpha_q3);
+void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                             int dst_stride, int alpha_q3);
+void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+                             int dst_stride, int alpha_q3);
+
+void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                           int dst_stride, int alpha_q3, int bd);
+void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                           int dst_stride, int alpha_q3, int bd);
+void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                            int dst_stride, int alpha_q3, int bd);
+
+void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                           int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                           int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                            int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                            int dst_stride, int alpha_q3, int bd);
+
+void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                            int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                            int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                             int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+                             int dst_stride, int alpha_q3, int bd);
diff --git a/third_party/aom/av1/common/x86/cfl_sse2.c b/third_party/aom/av1/common/x86/cfl_sse2.c
new file mode 100644
index 000000000..4783fe098
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_sse2.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "av1/common/cfl.h"
+#include "config/av1_rtcd.h"
+
+static INLINE __m128i fill_sum_epi32(__m128i l0) {
+  l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
+  return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
+}
+
+static INLINE void subtract_average_sse2(const uint16_t *src_ptr,
+                                         int16_t *dst_ptr, int width,
+                                         int height, int round_offset,
+                                         int num_pel_log2) {
+  const __m128i zeros = _mm_setzero_si128();
+  const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
+  const __m128i *src = (__m128i *)src_ptr;
+  const __m128i *const end = src + height * CFL_BUF_LINE_I128;
+  const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
+
+  __m128i sum = zeros;
+  do {
+    __m128i l0;
+    if (width == 4) {
+      l0 = _mm_add_epi16(_mm_loadl_epi64(src),
+                         _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
+      __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
+                                 _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
+      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+                                             _mm_unpacklo_epi16(l1, zeros)));
+    } else {
+      if (width == 8) {
+        l0 = _mm_add_epi16(_mm_loadu_si128(src),
+                           _mm_loadu_si128(src + CFL_BUF_LINE_I128));
+      } else {
+        l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));
+      }
+      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+                                             _mm_unpackhi_epi16(l0, zeros)));
+      if (width == 32) {
+        l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));
+        sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+                                               _mm_unpackhi_epi16(l0, zeros)));
+      }
+    }
+    src += step;
+  } while (src < end);
+
+  sum = fill_sum_epi32(sum);
+
+  __m128i avg_epi16 =
+      _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
+  avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
+
+  src = (__m128i *)src_ptr;
+  __m128i *dst = (__m128i *)dst_ptr;
+  do {
+    if (width == 4) {
+      _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
+    } else {
+      _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
+      if (width > 8) {
+        _mm_storeu_si128(dst + 1,
+                         _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
+        if (width == 32) {
+          _mm_storeu_si128(dst + 2,
+                           _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
+          _mm_storeu_si128(dst + 3,
+                           _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
+        }
+      }
+    }
+    src += CFL_BUF_LINE_I128;
+    dst += CFL_BUF_LINE_I128;
+  } while (src < end);
+}
+
+CFL_SUB_AVG_FN(sse2)
diff --git a/third_party/aom/av1/common/x86/cfl_ssse3.c b/third_party/aom/av1/common/x86/cfl_ssse3.c
new file mode 100644
index 000000000..bbf007295
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_ssse3.c
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+// Load 32-bit integer from memory into the first element of dst.
+static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
+  return _mm_cvtsi32_si128(*((int *)mem_addr));
+}
+
+// Store 32-bit integer from the first element of a into memory.
+static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
+  *((int *)mem_addr) = _mm_cvtsi128_si32(a);
+}
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const __m128i twos = _mm_set1_epi8(2);
+  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+  const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
+  const int luma_stride = input_stride << 1;
+  do {
+    if (width == 4) {
+      __m128i top = _mm_loadh_epi32((__m128i *)input);
+      top = _mm_maddubs_epi16(top, twos);
+      __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
+      bot = _mm_maddubs_epi16(bot, twos);
+      const __m128i sum = _mm_add_epi16(top, bot);
+      _mm_storeh_epi32(pred_buf_m128i, sum);
+    } else if (width == 8) {
+      __m128i top = _mm_loadl_epi64((__m128i *)input);
+      top = _mm_maddubs_epi16(top, twos);
+      __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+      bot = _mm_maddubs_epi16(bot, twos);
+      const __m128i sum = _mm_add_epi16(top, bot);
+      _mm_storel_epi64(pred_buf_m128i, sum);
+    } else {
+      __m128i top = _mm_loadu_si128((__m128i *)input);
+      top = _mm_maddubs_epi16(top, twos);
+      __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+      bot = _mm_maddubs_epi16(bot, twos);
+      const __m128i sum = _mm_add_epi16(top, bot);
+      _mm_storeu_si128(pred_buf_m128i, sum);
+      if (width == 32) {
+        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+        __m128i bot_1 =
+            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+        top_1 = _mm_maddubs_epi16(top_1, twos);
+        bot_1 = _mm_maddubs_epi16(bot_1, twos);
+        __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
+        _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
+      }
+    }
+    input += luma_stride;
+    pred_buf_m128i += CFL_BUF_LINE_I128;
+  } while (pred_buf_m128i < end);
+}
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const __m128i fours = _mm_set1_epi8(4);
+  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+  do {
+    if (width == 4) {
+      __m128i top = _mm_loadh_epi32((__m128i *)input);
+      top = _mm_maddubs_epi16(top, fours);
+      _mm_storeh_epi32(pred_buf_m128i, top);
+    } else if (width == 8) {
+      __m128i top = _mm_loadl_epi64((__m128i *)input);
+      top = _mm_maddubs_epi16(top, fours);
+      _mm_storel_epi64(pred_buf_m128i, top);
+    } else {
+      __m128i top = _mm_loadu_si128((__m128i *)input);
+      top = _mm_maddubs_epi16(top, fours);
+      _mm_storeu_si128(pred_buf_m128i, top);
+      if (width == 32) {
+        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+        top_1 = _mm_maddubs_epi16(top_1, fours);
+        _mm_storeu_si128(pred_buf_m128i + 1, top_1);
+      }
+    }
+    input += input_stride;
+    pred_buf_m128i += CFL_BUF_LINE_I128;
+  } while (pred_buf_m128i < end);
+}
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3).
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const __m128i zeros = _mm_setzero_si128();
+  const int luma_stride = input_stride;
+  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+  do {
+    if (width == 4) {
+      __m128i row = _mm_loadh_epi32((__m128i *)input);
+      row = _mm_unpacklo_epi8(row, zeros);
+      _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3));
+    } else if (width == 8) {
+      __m128i row = _mm_loadl_epi64((__m128i *)input);
+      row = _mm_unpacklo_epi8(row, zeros);
+      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3));
+    } else {
+      __m128i row = _mm_loadu_si128((__m128i *)input);
+      const __m128i row_lo = _mm_unpacklo_epi8(row, zeros);
+      const __m128i row_hi = _mm_unpackhi_epi8(row, zeros);
+      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3));
+      _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3));
+      if (width == 32) {
+        __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+        const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros);
+        const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros);
+        _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3));
+        _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3));
+      }
+    }
+    input += luma_stride;
+    pred_buf_m128i += CFL_BUF_LINE_I128;
+  } while (pred_buf_m128i < end);
+}
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  const int luma_stride = input_stride << 1;
+  do {
+    if (width == 4) {
+      const __m128i top = _mm_loadl_epi64((__m128i *)input);
+      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+      __m128i sum = _mm_add_epi16(top, bot);
+      sum = _mm_hadd_epi16(sum, sum);
+      *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
+    } else {
+      const __m128i top = _mm_loadu_si128((__m128i *)input);
+      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+      __m128i sum = _mm_add_epi16(top, bot);
+      if (width == 8) {
+        sum = _mm_hadd_epi16(sum, sum);
+        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+      } else {
+        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+        const __m128i bot_1 =
+            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
+        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+        if (width == 32) {
+          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+          const __m128i bot_2 =
+              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
+          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+          const __m128i bot_3 =
+              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
+          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
+          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
+          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
+          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
+                           _mm_add_epi16(next_sum, next_sum));
+        }
+      }
+    }
+    input += luma_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+  do {
+    if (width == 4) {
+      const __m128i top = _mm_loadl_epi64((__m128i *)input);
+      const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
+      _mm_storeh_epi32(pred_buf_m128i, sum);
+    } else {
+      const __m128i top = _mm_loadu_si128((__m128i *)input);
+      if (width == 8) {
+        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
+        _mm_storel_epi64(pred_buf_m128i, sum);
+      } else {
+        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2);
+        _mm_storeu_si128(pred_buf_m128i, sum);
+        if (width == 32) {
+          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+          const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2);
+          _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
+        }
+      }
+    }
+    pred_buf_m128i += CFL_BUF_LINE_I128;
+    input += input_stride;
+  } while (pred_buf_m128i < end);
+}
+
+static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    if (width == 4) {
+      const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3);
+      _mm_storel_epi64((__m128i *)pred_buf_q3, row);
+    } else {
+      const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3);
+      _mm_storeu_si128((__m128i *)pred_buf_q3, row);
+      if (width >= 16) {
+        __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+        row_1 = _mm_slli_epi16(row_1, 3);
+        _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1);
+        if (width == 32) {
+          __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+          row_2 = _mm_slli_epi16(row_2, 3);
+          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2);
+          __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+          row_3 = _mm_slli_epi16(row_3, 3);
+          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3);
+        }
+      }
+    }
+    input += input_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
+
+static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
+                                        __m128i alpha_sign, __m128i dc_q0) {
+  __m128i ac_q3 = _mm_loadu_si128(input);
+  __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+  __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+  scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+  return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3,
+                                         uint8_t *dst, int dst_stride,
+                                         int alpha_q3, int width, int height) {
+  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
+  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+  const __m128i dc_q0 = _mm_set1_epi16(*dst);
+  __m128i *row = (__m128i *)pred_buf_q3;
+  const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
+  do {
+    __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+    if (width < 16) {
+      res = _mm_packus_epi16(res, res);
+      if (width == 4)
+        _mm_storeh_epi32((__m128i *)dst, res);
+      else
+        _mm_storel_epi64((__m128i *)dst, res);
+    } else {
+      __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+      res = _mm_packus_epi16(res, next);
+      _mm_storeu_si128((__m128i *)dst, res);
+      if (width == 32) {
+        res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
+        next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
+        res = _mm_packus_epi16(res, next);
+        _mm_storeu_si128((__m128i *)(dst + 16), res);
+      }
+    }
+    dst += dst_stride;
+  } while ((row += CFL_BUF_LINE_I128) < row_end);
+}
+
+CFL_PREDICT_FN(ssse3, lbd)
+
+static INLINE __m128i highbd_max_epi16(int bd) {
+  const __m128i neg_one = _mm_set1_epi16(-1);
+  // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
+  return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one);
+}
+
+static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) {
+  return _mm_max_epi16(_mm_min_epi16(u, max), zero);
+}
+
+static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3,
+                                         uint16_t *dst, int dst_stride,
+                                         int alpha_q3, int bd, int width,
+                                         int height) {
+  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
+  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+  const __m128i dc_q0 = _mm_set1_epi16(*dst);
+  const __m128i max = highbd_max_epi16(bd);
+  const __m128i zeros = _mm_setzero_si128();
+  __m128i *row = (__m128i *)pred_buf_q3;
+  const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
+  do {
+    __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+    res = highbd_clamp_epi16(res, zeros, max);
+    if (width == 4) {
+      _mm_storel_epi64((__m128i *)dst, res);
+    } else {
+      _mm_storeu_si128((__m128i *)dst, res);
+    }
+    if (width >= 16) {
+      const __m128i res_1 =
+          predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+      _mm_storeu_si128(((__m128i *)dst) + 1,
+                       highbd_clamp_epi16(res_1, zeros, max));
+    }
+    if (width == 32) {
+      const __m128i res_2 =
+          predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
+      _mm_storeu_si128((__m128i *)(dst + 16),
+                       highbd_clamp_epi16(res_2, zeros, max));
+      const __m128i res_3 =
+          predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
+      _mm_storeu_si128((__m128i *)(dst + 24),
+                       highbd_clamp_epi16(res_3, zeros, max));
+    }
+    dst += dst_stride;
+  } while ((row += CFL_BUF_LINE_I128) < row_end);
+}
+
+CFL_PREDICT_FN(ssse3, hbd)
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
new file mode 100644
index 000000000..fd5e90a2e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/convolve.h"
+
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             InterpFilterParams *filter_params_x,
+                             InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
+  const int bd = 8;
+
+  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = 8;
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+  __m256i filt[4], coeffs_h[4], coeffs_v[4];
+
+  assert(conv_params->round_0 > 0);
+
+  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
+
+  const __m256i round_const_h = _mm256_set1_epi16(
+      ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
+  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+  const __m256i sum_round_v = _mm256_set1_epi32(
+      (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+  const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+  const __m256i round_const_v = _mm256_set1_epi32(
+      ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+      ((1 << (offset_bits - conv_params->round_1)) >> 1));
+  const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
+
+  for (j = 0; j < w; j += 8) {
+    for (i = 0; i < im_h; i += 2) {
+      __m256i data = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+      // Load the next line
+      if (i + 1 < im_h)
+        data = _mm256_inserti128_si256(
+            data,
+            _mm_loadu_si128(
+                (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
+            1);
+
+      __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
+
+      res =
+          _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
+
+      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+    }
+
+    /* Vertical filter */
+    {
+      __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+      __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+      __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+      __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+      __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+      __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+      __m256i s[8];
+      s[0] = _mm256_unpacklo_epi16(src_0, src_1);
+      s[1] = _mm256_unpacklo_epi16(src_2, src_3);
+      s[2] = _mm256_unpacklo_epi16(src_4, src_5);
+
+      s[4] = _mm256_unpackhi_epi16(src_0, src_1);
+      s[5] = _mm256_unpackhi_epi16(src_2, src_3);
+      s[6] = _mm256_unpackhi_epi16(src_4, src_5);
+
+      for (i = 0; i < h; i += 2) {
+        const int16_t *data = &im_block[i * im_stride];
+
+        const __m256i s6 =
+            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+        const __m256i s7 =
+            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+        s[3] = _mm256_unpacklo_epi16(s6, s7);
+        s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+        __m256i res_a = convolve(s, coeffs_v);
+        __m256i res_b = convolve(s + 4, coeffs_v);
+
+        // Combine V round and 2F-H-V round into a single rounding
+        res_a =
+            _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);
+        res_b =
+            _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);
+
+        const __m256i res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+        const __m256i res_b_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+
+        /* rounding code */
+        // 16 bit conversion
+        const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+        // 8 bit conversion and saturation to uint8
+        const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
+
+        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+        // Store values into the destination buffer
+        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
+        if (w - j > 4) {
+          _mm_storel_epi64(p_0, res_0);
+          _mm_storel_epi64(p_1, res_1);
+        } else if (w == 4) {
+          xx_storel_32(p_0, res_0);
+          xx_storel_32(p_1, res_1);
+        } else {
+          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+        }
+
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+    }
+  }
+}
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+  __m256i s[4];
+  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
+  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
+  _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+  _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+  _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
+  _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
+}
+
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  InterpFilterParams *filter_params_x,
+                                  InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    do {
+      memcpy(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memcpy(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      memcpy(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memcpy(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m256i s[2];
+      s[0] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      s[1] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[0]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      __m256i s[4];
+      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+      src += src_stride;
+      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    do {
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
index e4d352c0e..fc0e65453 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -11,197 +11,20 @@
 
 #include <emmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
 #include "av1/common/convolve.h"
 
-#if CONFIG_COMPOUND_ROUND
-void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
-                          CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
-                          InterpFilterParams *filter_params_x,
-                          InterpFilterParams *filter_params_y,
-                          const int subpel_x_q4, const int subpel_y_q4,
-                          ConvolveParams *conv_params) {
-  DECLARE_ALIGNED(16, uint8_t,
-                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
-  int im_h = h + filter_params_y->taps - 1;
-  int im_stride = MAX_SB_SIZE;
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const int do_average = conv_params->do_average;
-  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
-  const __m128i zero = _mm_setzero_si128();
-
-  /* Horizontal filter */
-  {
-    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
-    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << conv_params->round_0) >> 1);
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
-    for (i = 0; i < im_h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
-        // Filter even-index pixels
-        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even =
-            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd =
-            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-        res = _mm_packus_epi16(res, res);
-        _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res);
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
-    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << conv_params->round_1) >> 1);
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const uint8_t *data = &im_block[i * im_stride + j];
-        const __m128i src_01 = _mm_unpacklo_epi8(
-            _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)),
-            _mm_loadl_epi64((__m128i *)(data + 1 * im_stride)));
-        const __m128i src_23 = _mm_unpacklo_epi8(
-            _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)),
-            _mm_loadl_epi64((__m128i *)(data + 3 * im_stride)));
-        const __m128i src_45 = _mm_unpacklo_epi8(
-            _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)),
-            _mm_loadl_epi64((__m128i *)(data + 5 * im_stride)));
-        const __m128i src_67 = _mm_unpacklo_epi8(
-            _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)),
-            _mm_loadl_epi64((__m128i *)(data + 7 * im_stride)));
-
-        const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero);
-        const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero);
-        const __m128i src_4 = _mm_unpacklo_epi8(src_45, zero);
-        const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero);
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero);
-        const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero);
-        const __m128i src_5 = _mm_unpackhi_epi8(src_45, zero);
-        const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero);
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        const __m128i res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
-        // Accumulate values into the destination buffer
-        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-        if (do_average) {
-          _mm_storeu_si128(p + 0,
-                           _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
-          _mm_storeu_si128(p + 1,
-                           _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
-        } else {
-          _mm_storeu_si128(p + 0, res_lo_round);
-          _mm_storeu_si128(p + 1, res_hi_round);
-        }
-      }
-    }
-  }
-}
-#else
-void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
-                          CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
-                          InterpFilterParams *filter_params_x,
-                          InterpFilterParams *filter_params_y,
-                          const int subpel_x_q4, const int subpel_y_q4,
-                          ConvolveParams *conv_params) {
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             InterpFilterParams *filter_params_x,
+                             InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
   const int bd = 8;
 
   DECLARE_ALIGNED(16, int16_t,
@@ -211,10 +34,14 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
   int i, j;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const int do_average = conv_params->do_average;
   const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
 
   const __m128i zero = _mm_setzero_si128();
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+  assert(conv_params->round_0 > 0);
 
   /* Horizontal filter */
   {
@@ -237,7 +64,7 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
     const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
 
     const __m128i round_const = _mm_set1_epi32(
-        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+        (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
     const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
 
     for (i = 0; i < im_h; ++i) {
@@ -302,10 +129,14 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
     // coeffs 6 7 6 7 6 7 6 7
     const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
 
+    const __m128i sum_round =
+        _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+    const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
     const __m128i round_const = _mm_set1_epi32(
-        ((1 << conv_params->round_1) >> 1) -
-        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+        ((1 << (offset_bits - conv_params->round_1)) >> 1));
+    const __m128i round_shift = _mm_cvtsi32_si128(bits);
 
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; j += 8) {
@@ -358,24 +189,285 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
         const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
         const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
 
-        const __m128i res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        const __m128i res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+        __m128i res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+        __m128i res_hi_round =
+            _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+                                     round_shift);
+        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+                                     round_shift);
+
+        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+        const __m128i res = _mm_packus_epi16(res16, res16);
 
         // Accumulate values into the destination buffer
         __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+        if (w == 2) {
+          *(uint16_t *)p = _mm_cvtsi128_si32(res);
+        } else if (w == 4) {
+          *(uint32_t *)p = _mm_cvtsi128_si32(res);
+        } else {
+          _mm_storel_epi64(p, res);
+        }
+      }
+    }
+  }
+}
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+  __m128i s[8];
+  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
+  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
+  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
+  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
+  _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+  _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+  _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+  _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+  _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
+  _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
+  _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
+  _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
+}
+
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  InterpFilterParams *filter_params_x,
+                                  InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    do {
+      memcpy(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memcpy(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      memcpy(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memcpy(dst, src, 4 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m128i s[4];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      src += src_stride;
+      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      __m128i s[8];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+      src += src_stride;
+      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+      _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+      _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
+      _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
+      _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
+      _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    do {
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
+
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
+                                   uint8_t *dst0, int dst_stride0, int w, int h,
+                                   InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  int i, j;
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi16(w0);
+  const __m128i wt1 = _mm_set1_epi16(w1);
+  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+  assert((w % 4) == 0);
+
+  if (!(w % 16)) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 16) {
+        const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
+
+        const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero);
+        const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero);
+
+        const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift);
+        const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const);
+
+        const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift);
+        const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const);
+
         if (do_average) {
-          _mm_storeu_si128(p + 0,
-                           _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
-          _mm_storeu_si128(p + 1,
-                           _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+          const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j]));
+          const __m128i data_ref_0_hi =
+              _mm_loadu_si128((__m128i *)(&dst[j + 8]));
+
+          const __m128i comp_avg_res_lo =
+              comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg);
+
+          const __m128i round_result_lo = convolve_rounding(
+              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i comp_avg_res_hi =
+              comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg);
+
+          const __m128i round_result_hi = convolve_rounding(
+              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_8 =
+              _mm_packus_epi16(round_result_lo, round_result_hi);
+
+          _mm_store_si128((__m128i *)(&dst0[j]), res_8);
+        } else {
+          _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo);
+          _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi);
+        }
+      }
+      src += src_stride;
+      dst += dst_stride;
+      dst0 += dst_stride0;
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
+        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
+
+        const __m128i res = _mm_sll_epi16(d16_0, left_shift);
+        const __m128i res_unsigned = _mm_add_epi16(res, offset_const);
+
+        if (do_average) {
+          const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
+
+          const __m128i comp_avg_res =
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+          const __m128i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+          if (w > 4)
+            _mm_storel_epi64((__m128i *)(&dst0[j]), res_8);
+          else
+            *(uint32_t *)(&dst0[j]) = _mm_cvtsi128_si32(res_8);
         } else {
-          _mm_storeu_si128(p + 0, res_lo_round);
-          _mm_storeu_si128(p + 1, res_hi_round);
+          _mm_store_si128((__m128i *)(&dst[j]), res_unsigned);
         }
       }
+      src += src_stride;
+      dst += dst_stride;
+      dst0 += dst_stride0;
     }
   }
 }
-#endif
diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c
index a0e58716d..6fdfb0954 100644
--- a/third_party/aom/av1/common/x86/convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_avx2.c
@@ -11,332 +11,267 @@
 
 #include <immintrin.h>
 
-#include "aom_dsp/aom_dsp_common.h"
-#include "./av1_rtcd.h"
-
-#if CONFIG_CONVOLVE_ROUND
-static const uint32_t sindex[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
-
-// 16 epi16 pixels
-static INLINE void pixel_clamp_avx2(__m256i *u, int bd) {
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
-  __m256i clamped, mask;
-
-  mask = _mm256_cmpgt_epi16(*u, max);
-  clamped = _mm256_andnot_si256(mask, *u);
-  mask = _mm256_and_si256(mask, max);
-  clamped = _mm256_or_si256(mask, clamped);
-
-  const __m256i zero = _mm256_setzero_si256();
-  mask = _mm256_cmpgt_epi16(clamped, zero);
-  *u = _mm256_and_si256(clamped, mask);
-}
-
-// 8 epi16 pixels
-static INLINE void pixel_clamp_sse2(__m128i *u, int bd) {
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
-  __m128i clamped, mask;
-
-  mask = _mm_cmpgt_epi16(*u, max);
-  clamped = _mm_andnot_si128(mask, *u);
-  mask = _mm_and_si128(mask, max);
-  clamped = _mm_or_si128(mask, clamped);
-
-  const __m128i zero = _mm_setzero_si128();
-  mask = _mm_cmpgt_epi16(clamped, zero);
-  *u = _mm_and_si128(clamped, mask);
-}
-
-// Work on multiple of 32 pixels
-static INLINE void cal_rounding_32xn_avx2(const int32_t *src, uint8_t *dst,
-                                          const __m256i *rnd, int shift,
-                                          int num) {
-  do {
-    __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
-    __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
-    __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2);
-    __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3);
-
-    x0 = _mm256_add_epi32(x0, *rnd);
-    x1 = _mm256_add_epi32(x1, *rnd);
-    x2 = _mm256_add_epi32(x2, *rnd);
-    x3 = _mm256_add_epi32(x3, *rnd);
-
-    x0 = _mm256_srai_epi32(x0, shift);
-    x1 = _mm256_srai_epi32(x1, shift);
-    x2 = _mm256_srai_epi32(x2, shift);
-    x3 = _mm256_srai_epi32(x3, shift);
-
-    x0 = _mm256_packs_epi32(x0, x1);
-    x2 = _mm256_packs_epi32(x2, x3);
-
-    pixel_clamp_avx2(&x0, 8);
-    pixel_clamp_avx2(&x2, 8);
-
-    x0 = _mm256_packus_epi16(x0, x2);
-    x1 = _mm256_loadu_si256((const __m256i *)sindex);
-    x2 = _mm256_permutevar8x32_epi32(x0, x1);
-
-    _mm256_storeu_si256((__m256i *)dst, x2);
-    src += 32;
-    dst += 32;
-    num--;
-  } while (num > 0);
-}
-
-static INLINE void cal_rounding_16_avx2(const int32_t *src, uint8_t *dst,
-                                        const __m256i *rnd, int shift) {
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
-
-  x0 = _mm256_add_epi32(x0, *rnd);
-  x1 = _mm256_add_epi32(x1, *rnd);
-
-  x0 = _mm256_srai_epi32(x0, shift);
-  x1 = _mm256_srai_epi32(x1, shift);
-
-  x0 = _mm256_packs_epi32(x0, x1);
-  pixel_clamp_avx2(&x0, 8);
-
-  const __m256i x2 = _mm256_packus_epi16(x0, x0);
-  x1 = _mm256_loadu_si256((const __m256i *)sindex);
-  x0 = _mm256_permutevar8x32_epi32(x2, x1);
-
-  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x0));
-}
-
-static INLINE void cal_rounding_8_avx2(const int32_t *src, uint8_t *dst,
-                                       const __m256i *rnd, int shift) {
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
-  x0 = _mm256_add_epi32(x0, *rnd);
-  x0 = _mm256_srai_epi32(x0, shift);
-
-  x0 = _mm256_packs_epi32(x0, x0);
-  pixel_clamp_avx2(&x0, 8);
-
-  x0 = _mm256_packus_epi16(x0, x0);
-  const __m256i x1 = _mm256_loadu_si256((const __m256i *)sindex);
-  x0 = _mm256_permutevar8x32_epi32(x0, x1);
+#include "config/av1_rtcd.h"
 
-  _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(x0));
-}
-
-static INLINE void cal_rounding_4_sse2(const int32_t *src, uint8_t *dst,
-                                       const __m128i *rnd, int shift) {
-  __m128i x = _mm_loadu_si128((const __m128i *)src);
-  x = _mm_add_epi32(x, *rnd);
-  x = _mm_srai_epi32(x, shift);
-
-  x = _mm_packs_epi32(x, x);
-  pixel_clamp_sse2(&x, 8);
-
-  x = _mm_packus_epi16(x, x);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(x);
-}
-
-void av1_convolve_rounding_avx2(const int32_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride, int w, int h,
-                                int bits) {
-  const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
-  const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
-
-  if (w > 64) {  // width = 128
-    do {
-      cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (w > 32) {  // width = 64
-    do {
-      cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (w > 16) {  // width = 32
-    do {
-      cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (w > 8) {  // width = 16
-    do {
-      cal_rounding_16_avx2(src, dst, &rnd_num, bits);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (w > 4) {  // width = 8
-    do {
-      cal_rounding_8_avx2(src, dst, &rnd_num, bits);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (w > 2) {  // width = 4
-    do {
-      cal_rounding_4_sse2(src, dst, &rnd_num_sse2, bits);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else {  // width = 2
-    do {
-      dst[0] = clip_pixel(ROUND_POWER_OF_TWO(src[0], bits));
-      dst[1] = clip_pixel(ROUND_POWER_OF_TWO(src[1], bits));
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            InterpFilterParams *filter_params_x,
+                            InterpFilterParams *filter_params_y,
+                            const int subpel_x_q4, const int subpel_y_q4,
+                            ConvolveParams *conv_params) {
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+  // right shift is F-1 because we are already dividing
+  // filter co-efficients by 2
+  const int right_shift_bits = (FILTER_BITS - 1);
+  const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
+  const __m256i right_shift_const =
+      _mm256_set1_epi16((1 << right_shift_bits) >> 1);
+  __m256i coeffs[4], s[8];
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+  prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
+
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  (void)conv_params;
+
+  for (j = 0; j < w; j += 16) {
+    const uint8_t *data = &src_ptr[j];
+    __m256i src6;
+
+    // Load lines a and b. Line a to lower 128, line b to upper 128
+    const __m256i src_01a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+        0x20);
+
+    const __m256i src_12a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+        0x20);
+
+    const __m256i src_23a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+        0x20);
+
+    const __m256i src_34a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+        0x20);
+
+    const __m256i src_45a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+        0x20);
+
+    src6 = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+    const __m256i src_56a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+        src6, 0x20);
+
+    s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+    s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+    s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+    s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+    s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+    s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+    for (i = 0; i < h; i += 2) {
+      data = &src_ptr[i * src_stride + j];
+      const __m256i src_67a = _mm256_permute2x128_si256(
+          src6,
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+          0x20);
+
+      src6 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+      const __m256i src_78a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+          src6, 0x20);
+
+      s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+      s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+      const __m256i res_lo = convolve_lowbd(s, coeffs);
+
+      /* rounding code */
+      // shift by F - 1
+      const __m256i res_16b_lo = _mm256_sra_epi16(
+          _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+      // 8 bit conversion and saturation to uint8
+      __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+      if (w - j > 8) {
+        const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+        /* rounding code */
+        // shift by F - 1
+        const __m256i res_16b_hi = _mm256_sra_epi16(
+            _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+        __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+        const __m128i res_0 = _mm256_castsi256_si128(res_a);
+        const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                         res_1);
+      } else {
+        const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+        if (w - j > 4) {
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_1);
+        } else if (w - j > 2) {
+          xx_storel_32(&dst[i * dst_stride + j], res_0);
+          xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+        } else {
+          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
+          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+        }
+      }
+
+      s[0] = s[1];
+      s[1] = s[2];
+      s[2] = s[3];
+
+      s[4] = s[5];
+      s[5] = s[6];
+      s[6] = s[7];
+    }
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
-static INLINE void cal_highbd_rounding_32xn_avx2(const int32_t *src,
-                                                 uint16_t *dst,
-                                                 const __m256i *rnd, int shift,
-                                                 int num, int bd) {
-  do {
-    __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
-    __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
-    __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2);
-    __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3);
-
-    x0 = _mm256_add_epi32(x0, *rnd);
-    x1 = _mm256_add_epi32(x1, *rnd);
-    x2 = _mm256_add_epi32(x2, *rnd);
-    x3 = _mm256_add_epi32(x3, *rnd);
-
-    x0 = _mm256_srai_epi32(x0, shift);
-    x1 = _mm256_srai_epi32(x1, shift);
-    x2 = _mm256_srai_epi32(x2, shift);
-    x3 = _mm256_srai_epi32(x3, shift);
-
-    x0 = _mm256_packs_epi32(x0, x1);
-    x2 = _mm256_packs_epi32(x2, x3);
-
-    pixel_clamp_avx2(&x0, bd);
-    pixel_clamp_avx2(&x2, bd);
-
-    x0 = _mm256_permute4x64_epi64(x0, 0xD8);
-    x2 = _mm256_permute4x64_epi64(x2, 0xD8);
-
-    _mm256_storeu_si256((__m256i *)dst, x0);
-    _mm256_storeu_si256((__m256i *)(dst + 16), x2);
-    src += 32;
-    dst += 32;
-    num--;
-  } while (num > 0);
-}
-
-static INLINE void cal_highbd_rounding_16_avx2(const int32_t *src,
-                                               uint16_t *dst,
-                                               const __m256i *rnd, int shift,
-                                               int bd) {
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
-
-  x0 = _mm256_add_epi32(x0, *rnd);
-  x1 = _mm256_add_epi32(x1, *rnd);
-
-  x0 = _mm256_srai_epi32(x0, shift);
-  x1 = _mm256_srai_epi32(x1, shift);
-
-  x0 = _mm256_packs_epi32(x0, x1);
-  pixel_clamp_avx2(&x0, bd);
-
-  x0 = _mm256_permute4x64_epi64(x0, 0xD8);
-  _mm256_storeu_si256((__m256i *)dst, x0);
-}
-
-static INLINE void cal_highbd_rounding_8_avx2(const int32_t *src, uint16_t *dst,
-                                              const __m256i *rnd, int shift,
-                                              int bd) {
-  __m256i x = _mm256_loadu_si256((const __m256i *)src);
-  x = _mm256_add_epi32(x, *rnd);
-  x = _mm256_srai_epi32(x, shift);
-
-  x = _mm256_packs_epi32(x, x);
-  pixel_clamp_avx2(&x, bd);
-
-  x = _mm256_permute4x64_epi64(x, 0xD8);
-  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x));
-}
-
-static INLINE void cal_highbd_rounding_4_sse2(const int32_t *src, uint16_t *dst,
-                                              const __m128i *rnd, int shift,
-                                              int bd) {
-  __m128i x = _mm_loadu_si128((const __m128i *)src);
-  x = _mm_add_epi32(x, *rnd);
-  x = _mm_srai_epi32(x, shift);
-
-  x = _mm_packs_epi32(x, x);
-  pixel_clamp_sse2(&x, bd);
-  _mm_storel_epi64((__m128i *)dst, x);
-}
-
-void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride,
-                                       uint8_t *dst8, int dst_stride, int w,
-                                       int h, int bits, int bd) {
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
-  const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
-
-  if (w > 64) {  // width = 128
-    do {
-      cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4, bd);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (w > 32) {  // width = 64
-    do {
-      cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2, bd);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (w > 16) {  // width = 32
-    do {
-      cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1, bd);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (w > 8) {  // width = 16
-    do {
-      cal_highbd_rounding_16_avx2(src, dst, &rnd_num, bits, bd);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (w > 4) {  // width = 8
-    do {
-      cal_highbd_rounding_8_avx2(src, dst, &rnd_num, bits, bd);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (w > 2) {  // width = 4
-    do {
-      cal_highbd_rounding_4_sse2(src, dst, &rnd_num_sse2, bits, bd);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else {  // width = 2
-    do {
-      dst[0] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[0], bits), bd);
-      dst[1] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[1], bits), bd);
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            InterpFilterParams *filter_params_x,
+                            InterpFilterParams *filter_params_y,
+                            const int subpel_x_q4, const int subpel_y_q4,
+                            ConvolveParams *conv_params) {
+  int i, j;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint8_t *const src_ptr = src - fo_horiz;
+  const int bits = FILTER_BITS - conv_params->round_0;
+
+  __m256i filt[4], coeffs[4];
+
+  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+
+  const __m256i round_0_const =
+      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+  const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
+  const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+  assert(conv_params->round_0 > 0);
+
+  if (w <= 8) {
+    for (i = 0; i < h; i += 2) {
+      const __m256i data = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+          _mm256_castsi128_si256(_mm_loadu_si128(
+              (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+          0x20);
+
+      __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+      res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                 round_0_shift);
+
+      res_16b =
+          _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift);
+
+      /* rounding code */
+      // 8 bit conversion and saturation to uint8
+      __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+      const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+      const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+      if (w > 4) {
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+      } else if (w > 2) {
+        xx_storel_32(&dst[i * dst_stride], res_0);
+        xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+      } else {
+        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+        *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+        *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+      }
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 16) {
+        // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18
+        // 19 20 21 22 23
+        const __m256i data = _mm256_inserti128_si256(
+            _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+            1);
+
+        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                   round_0_shift);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+                                   round_shift);
+
+        /* rounding code */
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+        // Store values into the destination buffer
+        // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+        res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+        __m128i res = _mm256_castsi256_si128(res_8b);
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+      }
+    }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_CONVOLVE_ROUND
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
new file mode 100644
index 000000000..18fe9ae5a
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "av1/common/convolve.h"
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m128i *const coeffs /* [4] */) {
+  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params, subpel_q4 & SUBPEL_MASK);
+  const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+  // coeffs 0 1 0 1 2 3 2 3
+  const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+  // coeffs 4 5 4 5 6 7 6 7
+  const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+  coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0);  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1);  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1);  // coeffs 6 7 6 7 6 7 6 7
+}
+
+static INLINE __m128i convolve(const __m128i *const s,
+                               const __m128i *const coeffs) {
+  const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
+  const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
+  const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
+  const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
+  const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
+  return d;
+}
+
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
+  ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+  ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+  ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
+  ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
+  ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
+  ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
+                            const uint8_t *dst, int dst_stride, int w, int h,
+                            InterpFilterParams *filter_params_x,
+                            InterpFilterParams *filter_params_y,
+                            const int subpel_x_q4, const int subpel_y_q4,
+                            ConvolveParams *conv_params) {
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint8_t *src_ptr = src - fo_vert * src_stride;
+  const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+  const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
+  __m128i coeffs[4];
+
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  (void)conv_params;
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+
+  if (w <= 4) {
+    __m128i s[8], src6, res, res_round, res16;
+    uint32_t res_int;
+    src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+    s[0] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+    s[1] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+    s[2] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+    s[3] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+    s[4] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+    s[5] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+
+    do {
+      s[6] = _mm_unpacklo_epi8(
+          src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
+      src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+      s[7] = _mm_unpacklo_epi8(
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+
+      res = convolve_lo_y(s + 0, coeffs);
+      res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+      res16 = _mm_packs_epi32(res_round, res_round);
+      res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+
+      if (w == 2)
+        *(uint16_t *)dst = res_int;
+      else
+        *(uint32_t *)dst = res_int;
+
+      src_ptr += src_stride;
+      dst += dst_stride;
+
+      res = convolve_lo_y(s + 1, coeffs);
+      res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+      res16 = _mm_packs_epi32(res_round, res_round);
+      res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+
+      if (w == 2)
+        *(uint16_t *)dst = res_int;
+      else
+        *(uint32_t *)dst = res_int;
+
+      src_ptr += src_stride;
+      dst += dst_stride;
+
+      s[0] = s[2];
+      s[1] = s[3];
+      s[2] = s[4];
+      s[3] = s[5];
+      s[4] = s[6];
+      s[5] = s[7];
+      h -= 2;
+    } while (h);
+  } else {
+    assert(!(w % 8));
+    int j = 0;
+    do {
+      __m128i s[8], src6, res_lo, res_hi;
+      __m128i res_lo_round, res_hi_round, res16, res;
+      const uint8_t *data = &src_ptr[j];
+
+      src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+      s[0] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+      s[1] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+      s[2] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+      s[3] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+      s[4] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+      s[5] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+      int i = 0;
+      do {
+        data = &src_ptr[i * src_stride + j];
+        s[6] = _mm_unpacklo_epi8(
+            src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+        src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+        s[7] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+        res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
+        res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels
+
+        res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+        res_hi_round =
+            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+        res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+        res = _mm_packus_epi16(res16, res16);
+
+        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+        i++;
+
+        res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
+        res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels
+
+        res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+        res_hi_round =
+            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+        res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+        res = _mm_packus_epi16(res16, res16);
+
+        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+        i++;
+
+        s[0] = s[2];
+        s[1] = s[3];
+        s[2] = s[4];
+        s[3] = s[5];
+        s[4] = s[6];
+        s[5] = s[7];
+      } while (i < h);
+      j += 8;
+    } while (j < w);
+  }
+}
+
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
+                            const uint8_t *dst, int dst_stride, int w, int h,
+                            InterpFilterParams *filter_params_x,
+                            InterpFilterParams *filter_params_y,
+                            const int subpel_x_q4, const int subpel_y_q4,
+                            ConvolveParams *conv_params) {
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - fo_horiz;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const __m128i round_0_const =
+      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+  const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
+  const __m128i round_shift = _mm_cvtsi32_si128(bits);
+  __m128i coeffs[4];
+
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
+
+  if (w <= 4) {
+    do {
+      const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+      __m128i s[4];
+
+      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+      s[1] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+      s[2] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+      s[3] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+      const __m128i res_lo = convolve_lo_x(s, coeffs);
+      __m128i res_lo_round =
+          _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
+      res_lo_round =
+          _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift);
+
+      const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
+      const __m128i res = _mm_packus_epi16(res16, res16);
+
+      uint32_t r = _mm_cvtsi128_si32(res);
+      if (w == 2)
+        *(uint16_t *)dst = r;
+      else
+        *(uint32_t *)dst = r;
+
+      src_ptr += src_stride;
+      dst += dst_stride;
+    } while (--h);
+  } else {
+    assert(!(w % 8));
+    int i = 0;
+    do {
+      int j = 0;
+      do {
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        __m128i s[4];
+
+        // Filter even-index pixels
+        s[0] = data;
+        s[1] = _mm_srli_si128(data, 2);
+        s[2] = _mm_srli_si128(data, 4);
+        s[3] = _mm_srli_si128(data, 6);
+        const __m128i res_even = convolve_lo_x(s, coeffs);
+
+        // Filter odd-index pixels
+        s[0] = _mm_srli_si128(data, 1);
+        s[1] = _mm_srli_si128(data, 3);
+        s[2] = _mm_srli_si128(data, 5);
+        s[3] = _mm_srli_si128(data, 7);
+        const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+        __m128i res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
+        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+                                     round_shift);
+        __m128i res_hi_round =
+            _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift);
+        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+                                     round_shift);
+
+        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+        const __m128i res = _mm_packus_epi16(res16, res16);
+
+        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+        j += 8;
+      } while (j < w);
+    } while (++i < h);
+  }
+}
diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c
index 4f77da446..c11edc1d4 100644
--- a/third_party/aom/av1/common/x86/filterintra_sse4.c
+++ b/third_party/aom/av1/common/x86/filterintra_sse4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -11,888 +11,65 @@
 
 #include <smmintrin.h>
 
-#include "./av1_rtcd.h"
-#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
 #include "av1/common/enums.h"
 #include "av1/common/reconintra.h"
 
-#if USE_3TAP_INTRA_FILTER
-void filterintra_sse4_3tap_dummy_func(void);
-void filterintra_sse4_3tap_dummy_func(void) {}
-#else
-
-static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left,
-                                  __m128i *sum) {
-  const __m128i a = _mm_loadu_si128((const __m128i *)above);
-  const __m128i l = _mm_loadu_si128((const __m128i *)left);
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i u0 = _mm_unpacklo_epi8(a, zero);
-  __m128i u1 = _mm_unpacklo_epi8(l, zero);
-
-  sum[0] = _mm_add_epi16(u0, u1);
-}
-
-static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left,
-                                  __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint16_t sum_value;
-
-  AddPixelsSmall(above, left, &sum_vector);
-
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-  u = _mm_srli_si128(sum_vector, 2);
-  sum_vector = _mm_add_epi16(sum_vector, u);
-
-  sum_value = _mm_extract_epi16(sum_vector, 0);
-  sum_value += 4;
-  sum_value >>= 3;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left,
-                                  __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint16_t sum_value;
-
-  AddPixelsSmall(above, left, &sum_vector);
-
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 4 values
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-
-  u = _mm_srli_si128(sum_vector, 2);
-  sum_vector = _mm_add_epi16(sum_vector, u);
-
-  sum_value = _mm_extract_epi16(sum_vector, 0);
-  sum_value += 8;
-  sum_value >>= 4;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left,
-                                  __m128i *sum) {
-  const __m128i a = _mm_loadu_si128((const __m128i *)above);
-  const __m128i l = _mm_loadu_si128((const __m128i *)left);
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i u0 = _mm_unpacklo_epi8(a, zero);
-  __m128i u1 = _mm_unpacklo_epi8(l, zero);
-
-  sum[0] = _mm_add_epi16(u0, u1);
-
-  u0 = _mm_unpackhi_epi8(a, zero);
-  u1 = _mm_unpackhi_epi8(l, zero);
-
-  sum[0] = _mm_add_epi16(sum[0], u0);
-  sum[0] = _mm_add_epi16(sum[0], u1);
-}
-
-static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left,
-                                    __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint16_t sum_value;
-
-  AddPixelsLarge(above, left, &sum_vector);
-
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 4 values
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-
-  u = _mm_srli_si128(sum_vector, 2);
-  sum_vector = _mm_add_epi16(sum_vector, u);
-
-  sum_value = _mm_extract_epi16(sum_vector, 0);
-  sum_value += 16;
-  sum_value >>= 5;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-static INLINE int GetMeanValue32x32(const uint8_t *above, const uint8_t *left,
-                                    __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector[2], u;
-  uint16_t sum_value;
-
-  AddPixelsLarge(above, left, &sum_vector[0]);
-  AddPixelsLarge(above + 16, left + 16, &sum_vector[1]);
-
-  sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
-  sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // still has 4 values
-  sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // still has 2 values
-
-  u = _mm_srli_si128(sum_vector[0], 2);
-  sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
-
-  sum_value = _mm_extract_epi16(sum_vector[0], 0);
-  sum_value += 32;
-  sum_value >>= 6;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-// Note:
-//  params[4] : mean value, 4 int32_t repetition
-//
-static INLINE int CalcRefPixelsMeanValue(const uint8_t *above,
-                                         const uint8_t *left, int bs,
-                                         __m128i *params) {
-  int meanValue = 0;
-  switch (bs) {
-    case 4: meanValue = GetMeanValue4x4(above, left, params); break;
-    case 8: meanValue = GetMeanValue8x8(above, left, params); break;
-    case 16: meanValue = GetMeanValue16x16(above, left, params); break;
-    case 32: meanValue = GetMeanValue32x32(above, left, params); break;
-    default: assert(0);
-  }
-  return meanValue;
-}
-
-// Note:
-//  params[0-3] : 4-tap filter coefficients (int32_t per coefficient)
-//
-static INLINE void GetIntraFilterParams(int bs, int mode, __m128i *params) {
-  const TX_SIZE tx_size =
-      (bs == 32) ? TX_32X32
-                 : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
-  // c0
-  params[0] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][0],
-                            av1_filter_intra_taps_4[tx_size][mode][0],
-                            av1_filter_intra_taps_4[tx_size][mode][0],
-                            av1_filter_intra_taps_4[tx_size][mode][0]);
-  // c1
-  params[1] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][1],
-                            av1_filter_intra_taps_4[tx_size][mode][1],
-                            av1_filter_intra_taps_4[tx_size][mode][1],
-                            av1_filter_intra_taps_4[tx_size][mode][1]);
-  // c2
-  params[2] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][2],
-                            av1_filter_intra_taps_4[tx_size][mode][2],
-                            av1_filter_intra_taps_4[tx_size][mode][2],
-                            av1_filter_intra_taps_4[tx_size][mode][2]);
-  // c3
-  params[3] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][3],
-                            av1_filter_intra_taps_4[tx_size][mode][3],
-                            av1_filter_intra_taps_4[tx_size][mode][3],
-                            av1_filter_intra_taps_4[tx_size][mode][3]);
-}
-
-static const int maxBlkSize = 32;
-
-static INLINE void SavePred4x4(int *pred, const __m128i *mean, uint8_t *dst,
-                               ptrdiff_t stride) {
-  const int predStride = (maxBlkSize << 1) + 1;
-  __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-  __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + predStride));
-  __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * predStride));
-  __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * predStride));
-
-  p0 = _mm_add_epi32(p0, mean[0]);
-  p1 = _mm_add_epi32(p1, mean[0]);
-  p2 = _mm_add_epi32(p2, mean[0]);
-  p3 = _mm_add_epi32(p3, mean[0]);
-
-  p0 = _mm_packus_epi32(p0, p1);
-  p1 = _mm_packus_epi32(p2, p3);
-  p0 = _mm_packus_epi16(p0, p1);
-
-  *((int *)dst) = _mm_cvtsi128_si32(p0);
-  p0 = _mm_srli_si128(p0, 4);
-  *((int *)(dst + stride)) = _mm_cvtsi128_si32(p0);
-  p0 = _mm_srli_si128(p0, 4);
-  *((int *)(dst + 2 * stride)) = _mm_cvtsi128_si32(p0);
-  p0 = _mm_srli_si128(p0, 4);
-  *((int *)(dst + 3 * stride)) = _mm_cvtsi128_si32(p0);
-}
-
-static void SavePred8x8(int *pred, const __m128i *mean, uint8_t *dst,
-                        ptrdiff_t stride) {
-  const int predStride = (maxBlkSize << 1) + 1;
-  __m128i p0, p1, p2, p3;
-  int r = 0;
-
-  while (r < 8) {
-    p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
-    p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
-    r += 1;
-    p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
-    p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
-
-    p0 = _mm_add_epi32(p0, mean[0]);
-    p1 = _mm_add_epi32(p1, mean[0]);
-    p2 = _mm_add_epi32(p2, mean[0]);
-    p3 = _mm_add_epi32(p3, mean[0]);
-
-    p0 = _mm_packus_epi32(p0, p1);
-    p1 = _mm_packus_epi32(p2, p3);
-    p0 = _mm_packus_epi16(p0, p1);
-
-    _mm_storel_epi64((__m128i *)dst, p0);
-    dst += stride;
-    p0 = _mm_srli_si128(p0, 8);
-    _mm_storel_epi64((__m128i *)dst, p0);
-    dst += stride;
-    r += 1;
-  }
-}
-
-static void SavePred16x16(int *pred, const __m128i *mean, uint8_t *dst,
-                          ptrdiff_t stride) {
-  const int predStride = (maxBlkSize << 1) + 1;
-  __m128i p0, p1, p2, p3;
-  int r = 0;
-
-  while (r < 16) {
-    p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
-    p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
-    p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
-    p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
-
-    p0 = _mm_add_epi32(p0, mean[0]);
-    p1 = _mm_add_epi32(p1, mean[0]);
-    p2 = _mm_add_epi32(p2, mean[0]);
-    p3 = _mm_add_epi32(p3, mean[0]);
-
-    p0 = _mm_packus_epi32(p0, p1);
-    p1 = _mm_packus_epi32(p2, p3);
-    p0 = _mm_packus_epi16(p0, p1);
-
-    _mm_storel_epi64((__m128i *)dst, p0);
-    p0 = _mm_srli_si128(p0, 8);
-    _mm_storel_epi64((__m128i *)(dst + 8), p0);
-    dst += stride;
-    r += 1;
-  }
-}
-
-static void SavePred32x32(int *pred, const __m128i *mean, uint8_t *dst,
-                          ptrdiff_t stride) {
-  const int predStride = (maxBlkSize << 1) + 1;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-  int r = 0;
-
-  while (r < 32) {
-    p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
-    p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
-    p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
-    p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
-
-    p4 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 16));
-    p5 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 20));
-    p6 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 24));
-    p7 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 28));
-
-    p0 = _mm_add_epi32(p0, mean[0]);
-    p1 = _mm_add_epi32(p1, mean[0]);
-    p2 = _mm_add_epi32(p2, mean[0]);
-    p3 = _mm_add_epi32(p3, mean[0]);
-
-    p4 = _mm_add_epi32(p4, mean[0]);
-    p5 = _mm_add_epi32(p5, mean[0]);
-    p6 = _mm_add_epi32(p6, mean[0]);
-    p7 = _mm_add_epi32(p7, mean[0]);
-
-    p0 = _mm_packus_epi32(p0, p1);
-    p1 = _mm_packus_epi32(p2, p3);
-    p0 = _mm_packus_epi16(p0, p1);
-
-    p4 = _mm_packus_epi32(p4, p5);
-    p5 = _mm_packus_epi32(p6, p7);
-    p4 = _mm_packus_epi16(p4, p5);
-
-    _mm_storel_epi64((__m128i *)dst, p0);
-    p0 = _mm_srli_si128(p0, 8);
-    _mm_storel_epi64((__m128i *)(dst + 8), p0);
-
-    _mm_storel_epi64((__m128i *)(dst + 16), p4);
-    p4 = _mm_srli_si128(p4, 8);
-    _mm_storel_epi64((__m128i *)(dst + 24), p4);
-
-    dst += stride;
-    r += 1;
-  }
-}
-
-static void SavePrediction(int *pred, const __m128i *mean, int bs, uint8_t *dst,
-                           ptrdiff_t stride) {
-  switch (bs) {
-    case 4: SavePred4x4(pred, mean, dst, stride); break;
-    case 8: SavePred8x8(pred, mean, dst, stride); break;
-    case 16: SavePred16x16(pred, mean, dst, stride); break;
-    case 32: SavePred32x32(pred, mean, dst, stride); break;
-    default: assert(0);
-  }
-}
-
-typedef void (*ProducePixelsFunc)(__m128i *p, const __m128i *prm, int *pred,
-                                  const int predStride);
-
-static void ProduceFourPixels(__m128i *p, const __m128i *prm, int *pred,
-                              const int predStride) {
-  __m128i u0, u1, u2;
-  int c0 = _mm_extract_epi32(prm[1], 0);
-  int x = *(pred + predStride);
-  int sum;
-
-  u0 = _mm_mullo_epi32(p[0], prm[2]);
-  u1 = _mm_mullo_epi32(p[1], prm[0]);
-  u2 = _mm_mullo_epi32(p[2], prm[3]);
-
-  u0 = _mm_add_epi32(u0, u1);
-  u0 = _mm_add_epi32(u0, u2);
-
-  sum = _mm_extract_epi32(u0, 0);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 1) = x;
-
-  sum = _mm_extract_epi32(u0, 1);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 2) = x;
-
-  sum = _mm_extract_epi32(u0, 2);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 3) = x;
-
-  sum = _mm_extract_epi32(u0, 3);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 4) = x;
-}
-
-static void ProduceThreePixels(__m128i *p, const __m128i *prm, int *pred,
-                               const int predStride) {
-  __m128i u0, u1, u2;
-  int c0 = _mm_extract_epi32(prm[1], 0);
-  int x = *(pred + predStride);
-  int sum;
-
-  u0 = _mm_mullo_epi32(p[0], prm[2]);
-  u1 = _mm_mullo_epi32(p[1], prm[0]);
-  u2 = _mm_mullo_epi32(p[2], prm[3]);
-
-  u0 = _mm_add_epi32(u0, u1);
-  u0 = _mm_add_epi32(u0, u2);
-
-  sum = _mm_extract_epi32(u0, 0);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 1) = x;
-
-  sum = _mm_extract_epi32(u0, 1);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 2) = x;
-
-  sum = _mm_extract_epi32(u0, 2);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 3) = x;
-}
-
-static void ProduceTwoPixels(__m128i *p, const __m128i *prm, int *pred,
-                             const int predStride) {
-  __m128i u0, u1, u2;
-  int c0 = _mm_extract_epi32(prm[1], 0);
-  int x = *(pred + predStride);
-  int sum;
-
-  u0 = _mm_mullo_epi32(p[0], prm[2]);
-  u1 = _mm_mullo_epi32(p[1], prm[0]);
-  u2 = _mm_mullo_epi32(p[2], prm[3]);
-
-  u0 = _mm_add_epi32(u0, u1);
-  u0 = _mm_add_epi32(u0, u2);
-
-  sum = _mm_extract_epi32(u0, 0);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 1) = x;
-
-  sum = _mm_extract_epi32(u0, 1);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 2) = x;
-}
-
-static void ProduceOnePixels(__m128i *p, const __m128i *prm, int *pred,
-                             const int predStride) {
-  __m128i u0, u1, u2;
-  int c0 = _mm_extract_epi32(prm[1], 0);
-  int x = *(pred + predStride);
-  int sum;
-
-  u0 = _mm_mullo_epi32(p[0], prm[2]);
-  u1 = _mm_mullo_epi32(p[1], prm[0]);
-  u2 = _mm_mullo_epi32(p[2], prm[3]);
-
-  u0 = _mm_add_epi32(u0, u1);
-  u0 = _mm_add_epi32(u0, u2);
-
-  sum = _mm_extract_epi32(u0, 0);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 1) = x;
-}
-
-static ProducePixelsFunc prodPixelsFuncTab[4] = {
-  ProduceOnePixels, ProduceTwoPixels, ProduceThreePixels, ProduceFourPixels
-};
-
-static void ProducePixels(int *pred, const __m128i *prm, int remain) {
-  __m128i p[3];
-  const int predStride = (maxBlkSize << 1) + 1;
-  int index;
-
-  p[0] = _mm_loadu_si128((const __m128i *)pred);
-  p[1] = _mm_loadu_si128((const __m128i *)(pred + 1));
-  p[2] = _mm_loadu_si128((const __m128i *)(pred + 2));
-
-  if (remain <= 2) {
-    return;
-  }
-  if (remain > 5) {
-    index = 3;
-  } else {
-    index = remain - 3;
-  }
-  prodPixelsFuncTab[index](p, prm, pred, predStride);
-}
-
-// Note:
-//  At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
-//  the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
-static void GeneratePrediction(const uint8_t *above, const uint8_t *left,
-                               const int bs, const __m128i *prm, int meanValue,
-                               uint8_t *dst, ptrdiff_t stride) {
-  int pred[33][65];
-  int r, c, colBound;
-  int remainings;
-
-  for (r = 0; r < bs; ++r) {
-    pred[r + 1][0] = (int)left[r] - meanValue;
-  }
-
-  above -= 1;
-  for (c = 0; c < 2 * bs + 1; ++c) {
-    pred[0][c] = (int)above[c] - meanValue;
-  }
-
-  r = 0;
-  c = 0;
-  while (r < bs) {
-    colBound = (bs << 1) - r;
-    for (c = 0; c < colBound; c += 4) {
-      remainings = colBound - c + 1;
-      ProducePixels(&pred[r][c], prm, remainings);
-    }
-    r += 1;
-  }
-
-  SavePrediction(&pred[1][1], &prm[4], bs, dst, stride);
-}
-
-static void FilterPrediction(const uint8_t *above, const uint8_t *left, int bs,
-                             __m128i *prm, uint8_t *dst, ptrdiff_t stride) {
-  int meanValue = 0;
-  meanValue = CalcRefPixelsMeanValue(above, left, bs, &prm[4]);
-  GeneratePrediction(above, left, bs, prm, meanValue, dst, stride);
-}
-
-void av1_dc_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, DC_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_v_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, V_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_h_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, H_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d45_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D45_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d135_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D135_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d117_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D117_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d153_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D153_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d207_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D207_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d63_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D63_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, TM_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-// ============== High Bit Depth ==============
-#if CONFIG_HIGHBITDEPTH
-static INLINE int HighbdGetMeanValue4x4(const uint16_t *above,
-                                        const uint16_t *left, const int bd,
-                                        __m128i *params) {
-  const __m128i a = _mm_loadu_si128((const __m128i *)above);
-  const __m128i l = _mm_loadu_si128((const __m128i *)left);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint16_t sum_value;
-  (void)bd;
-
-  sum_vector = _mm_add_epi16(a, l);
-
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-  u = _mm_srli_si128(sum_vector, 2);
-  sum_vector = _mm_add_epi16(sum_vector, u);
-
-  sum_value = _mm_extract_epi16(sum_vector, 0);
-  sum_value += 4;
-  sum_value >>= 3;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-static INLINE int HighbdGetMeanValue8x8(const uint16_t *above,
-                                        const uint16_t *left, const int bd,
-                                        __m128i *params) {
-  const __m128i a = _mm_loadu_si128((const __m128i *)above);
-  const __m128i l = _mm_loadu_si128((const __m128i *)left);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint16_t sum_value;
-  (void)bd;
-
-  sum_vector = _mm_add_epi16(a, l);
-
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 4 values
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-
-  u = _mm_srli_si128(sum_vector, 2);
-  sum_vector = _mm_add_epi16(sum_vector, u);
-
-  sum_value = _mm_extract_epi16(sum_vector, 0);
-  sum_value += 8;
-  sum_value >>= 4;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-// Note:
-//  Process 16 pixels above and left, 10-bit depth
-//  Add to the last 8 pixels sum
-static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left,
-                                  __m128i *sum) {
-  __m128i a = _mm_loadu_si128((const __m128i *)above);
-  __m128i l = _mm_loadu_si128((const __m128i *)left);
-  sum[0] = _mm_add_epi16(a, l);
-  a = _mm_loadu_si128((const __m128i *)(above + 8));
-  l = _mm_loadu_si128((const __m128i *)(left + 8));
-  sum[0] = _mm_add_epi16(sum[0], a);
-  sum[0] = _mm_add_epi16(sum[0], l);
-}
-
-// Note:
-//  Process 16 pixels above and left, 12-bit depth
-//  Add to the last 8 pixels sum
-static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left,
-                                  __m128i *sum) {
-  __m128i a = _mm_loadu_si128((const __m128i *)above);
-  __m128i l = _mm_loadu_si128((const __m128i *)left);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i v0, v1;
-
-  v0 = _mm_unpacklo_epi16(a, zero);
-  v1 = _mm_unpacklo_epi16(l, zero);
-  sum[0] = _mm_add_epi32(v0, v1);
-
-  v0 = _mm_unpackhi_epi16(a, zero);
-  v1 = _mm_unpackhi_epi16(l, zero);
-  sum[0] = _mm_add_epi32(sum[0], v0);
-  sum[0] = _mm_add_epi32(sum[0], v1);
-
-  a = _mm_loadu_si128((const __m128i *)(above + 8));
-  l = _mm_loadu_si128((const __m128i *)(left + 8));
-
-  v0 = _mm_unpacklo_epi16(a, zero);
-  v1 = _mm_unpacklo_epi16(l, zero);
-  sum[0] = _mm_add_epi32(sum[0], v0);
-  sum[0] = _mm_add_epi32(sum[0], v1);
-
-  v0 = _mm_unpackhi_epi16(a, zero);
-  v1 = _mm_unpackhi_epi16(l, zero);
-  sum[0] = _mm_add_epi32(sum[0], v0);
-  sum[0] = _mm_add_epi32(sum[0], v1);
-}
-
-static INLINE int HighbdGetMeanValue16x16(const uint16_t *above,
-                                          const uint16_t *left, const int bd,
-                                          __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint32_t sum_value = 0;
-
-  if (10 == bd) {
-    AddPixels10bit(above, left, &sum_vector);
-    sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 4 values
-    sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-
-    u = _mm_srli_si128(sum_vector, 2);
-    sum_vector = _mm_add_epi16(sum_vector, u);
-    sum_value = _mm_extract_epi16(sum_vector, 0);
-  } else if (12 == bd) {
-    AddPixels12bit(above, left, &sum_vector);
-
-    sum_vector = _mm_hadd_epi32(sum_vector, zero);
-    u = _mm_srli_si128(sum_vector, 4);
-    sum_vector = _mm_add_epi32(u, sum_vector);
-    sum_value = _mm_extract_epi32(sum_vector, 0);
-  }
-
-  sum_value += 16;
-  sum_value >>= 5;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-static INLINE int HighbdGetMeanValue32x32(const uint16_t *above,
-                                          const uint16_t *left, const int bd,
-                                          __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector[2], u;
-  uint32_t sum_value = 0;
-
-  if (10 == bd) {
-    AddPixels10bit(above, left, &sum_vector[0]);
-    AddPixels10bit(above + 16, left + 16, &sum_vector[1]);
-
-    sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
-    sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // still has 4 values
-    sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // still has 2 values
-
-    u = _mm_srli_si128(sum_vector[0], 2);
-    sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
-    sum_value = _mm_extract_epi16(sum_vector[0], 0);
-  } else if (12 == bd) {
-    AddPixels12bit(above, left, &sum_vector[0]);
-    AddPixels12bit(above + 16, left + 16, &sum_vector[1]);
-
-    sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]);
-    sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero);
-    u = _mm_srli_si128(sum_vector[0], 4);
-    sum_vector[0] = _mm_add_epi32(u, sum_vector[0]);
-    sum_value = _mm_extract_epi32(sum_vector[0], 0);
-  }
-
-  sum_value += 32;
-  sum_value >>= 6;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-// Note:
-//  params[4] : mean value, 4 int32_t repetition
-//
-static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above,
-                                               const uint16_t *left, int bs,
-                                               const int bd, __m128i *params) {
-  int meanValue = 0;
-  switch (bs) {
-    case 4: meanValue = HighbdGetMeanValue4x4(above, left, bd, params); break;
-    case 8: meanValue = HighbdGetMeanValue8x8(above, left, bd, params); break;
-    case 16:
-      meanValue = HighbdGetMeanValue16x16(above, left, bd, params);
-      break;
-    case 32:
-      meanValue = HighbdGetMeanValue32x32(above, left, bd, params);
-      break;
-    default: assert(0);
-  }
-  return meanValue;
-}
-
-// Note:
-//  At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
-//  the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
-static void HighbdGeneratePrediction(const uint16_t *above,
-                                     const uint16_t *left, const int bs,
-                                     const int bd, const __m128i *prm,
-                                     int meanValue, uint16_t *dst,
-                                     ptrdiff_t stride) {
-  int pred[33][65];
-  int r, c, colBound;
-  int remainings;
-  int ipred;
-
-  for (r = 0; r < bs; ++r) {
-    pred[r + 1][0] = (int)left[r] - meanValue;
-  }
-
-  above -= 1;
-  for (c = 0; c < 2 * bs + 1; ++c) {
-    pred[0][c] = (int)above[c] - meanValue;
-  }
-
-  r = 0;
-  c = 0;
-  while (r < bs) {
-    colBound = (bs << 1) - r;
-    for (c = 0; c < colBound; c += 4) {
-      remainings = colBound - c + 1;
-      ProducePixels(&pred[r][c], prm, remainings);
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                       TX_SIZE tx_size, const uint8_t *above,
+                                       const uint8_t *left, int mode) {
+  int r, c;
+  uint8_t buffer[33][33];
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+
+  assert(bw <= 32 && bh <= 32);
+
+  // The initialization is just for silencing Jenkins static analysis warnings
+  for (r = 0; r < bh + 1; ++r)
+    memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
+
+  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
+
+  const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]);
+  const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]);
+  const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]);
+  const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]);
+  const __m128i filter_intra_scale_bits =
+      _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS));
+
+  for (r = 1; r < bh + 1; r += 2) {
+    for (c = 1; c < bw + 1; c += 4) {
+      DECLARE_ALIGNED(16, uint8_t, p[8]);
+      memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t));
+      p[5] = buffer[r][c - 1];
+      p[6] = buffer[r + 1][c - 1];
+      p[7] = 0;
+      const __m128i p_b = xx_loadl_64(p);
+      const __m128i in = _mm_unpacklo_epi64(p_b, p_b);
+      const __m128i out_01 = _mm_maddubs_epi16(in, f1f0);
+      const __m128i out_23 = _mm_maddubs_epi16(in, f3f2);
+      const __m128i out_45 = _mm_maddubs_epi16(in, f5f4);
+      const __m128i out_67 = _mm_maddubs_epi16(in, f7f6);
+      const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23);
+      const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67);
+      const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567);
+      // Rounding
+      const __m128i round_w =
+          _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits);
+      const __m128i out_r = _mm_packus_epi16(round_w, round_w);
+      const __m128i out_r1 = _mm_srli_si128(out_r, 4);
+      // Storing
+      xx_storel_32(&buffer[r][c], out_r);
+      xx_storel_32(&buffer[r + 1][c], out_r1);
     }
-    r += 1;
   }
 
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      ipred = pred[r + 1][c + 1] + meanValue;
-      dst[c] = clip_pixel_highbd(ipred, bd);
-    }
+  for (r = 0; r < bh; ++r) {
+    memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
     dst += stride;
   }
 }
-
-static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left,
-                                   int bs, const int bd, __m128i *prm,
-                                   uint16_t *dst, ptrdiff_t stride) {
-  int meanValue = 0;
-  meanValue = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, &prm[4]);
-  HighbdGeneratePrediction(above, left, bs, bd, prm, meanValue, dst, stride);
-}
-
-void av1_highbd_dc_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                           int bs, const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, DC_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_v_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                          int bs, const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, V_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_h_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                          int bs, const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, H_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d45_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                            int bs, const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D45_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d135_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                             int bs, const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D135_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d117_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                             int bs, const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D117_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d153_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                             int bs, const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D153_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d207_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                             int bs, const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D207_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d63_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                            int bs, const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D63_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_tm_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                           int bs, const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, TM_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-#endif  // USE_3TAP_INTRA_FILTER
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
new file mode 100644
index 000000000..a34c618d0
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = 8;
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  __m256i s[8], coeffs_y[4], coeffs_x[4];
+
+  const __m256i round_const_x = _mm256_set1_epi32(
+      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+  const __m256i round_const_y = _mm256_set1_epi32(
+      ((1 << conv_params->round_1) >> 1) -
+      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m256i zero = _mm256_setzero_si256();
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    {
+      for (i = 0; i < im_h; i += 2) {
+        const __m256i row0 =
+            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+        __m256i row1 = _mm256_set1_epi16(0);
+        if (i + 1 < im_h)
+          row1 =
+              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+        // even pixels
+        s[0] = _mm256_alignr_epi8(r1, r0, 0);
+        s[1] = _mm256_alignr_epi8(r1, r0, 4);
+        s[2] = _mm256_alignr_epi8(r1, r0, 8);
+        s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+        __m256i res_even = convolve(s, coeffs_x);
+        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+                                    round_shift_x);
+
+        // odd pixels
+        s[0] = _mm256_alignr_epi8(r1, r0, 2);
+        s[1] = _mm256_alignr_epi8(r1, r0, 6);
+        s[2] = _mm256_alignr_epi8(r1, r0, 10);
+        s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+        __m256i res_odd = convolve(s, coeffs_x);
+        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+                                   round_shift_x);
+
+        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+
+        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+      }
+    }
+
+    /* Vertical filter */
+    {
+      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+      s[0] = _mm256_unpacklo_epi16(s0, s1);
+      s[1] = _mm256_unpacklo_epi16(s2, s3);
+      s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm256_unpackhi_epi16(s0, s1);
+      s[5] = _mm256_unpackhi_epi16(s2, s3);
+      s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+      for (i = 0; i < h; i += 2) {
+        const int16_t *data = &im_block[i * im_stride];
+
+        const __m256i s6 =
+            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+        const __m256i s7 =
+            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+        s[3] = _mm256_unpacklo_epi16(s6, s7);
+        s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+        const __m256i res_a = convolve(s, coeffs_y);
+        __m256i res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+        res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);
+
+        if (w - j > 4) {
+          const __m256i res_b = convolve(s + 4, coeffs_y);
+          __m256i res_b_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+          res_b_round =
+              _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
+                               round_shift_bits);
+
+          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+          res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                           _mm256_castsi256_si128(res_16bit));
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           _mm256_extracti128_si256(res_16bit, 1));
+        } else if (w == 4) {
+          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+          res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+                           _mm256_castsi256_si128(res_a_round));
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           _mm256_extracti128_si256(res_a_round, 1));
+        } else {
+          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+          res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+          xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+                       _mm256_castsi256_si128(res_a_round));
+          xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                       _mm256_extracti128_si256(res_a_round, 1));
+        }
+
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+    }
+  }
+}
+
+static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
+  __m256i s[4];
+  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+}
+
+static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
+  __m256i s[8];
+  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+  s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
+  s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
+  s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
+  s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
+
+  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+  _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
+  _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
+  _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
+  _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
+}
+
+void av1_highbd_convolve_2d_copy_sr_avx2(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, InterpFilterParams *filter_params_x,
+    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+  (void)bd;
+
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    do {
+      memcpy(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memcpy(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m256i s[2];
+      s[0] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      s[1] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[0]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m256i s[4];
+      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+      src += src_stride;
+      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    do {
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
new file mode 100644
index 000000000..bdf813fa0
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <emmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+
+static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
+  __m128i s[8];
+  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+}
+
+static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
+  __m128i s[16];
+  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+  s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8));
+  s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8));
+  s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8));
+  s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8));
+  s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8));
+  s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8));
+  s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8));
+  s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8));
+  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+  _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]);
+  _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]);
+  _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]);
+  _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]);
+  _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]);
+  _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]);
+  _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]);
+  _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
+}
+
+void av1_highbd_convolve_2d_copy_sr_sse2(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, InterpFilterParams *filter_params_x,
+    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+  (void)bd;
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    do {
+      __m128i s = _mm_loadl_epi64((__m128i *)src);
+      *(uint32_t *)dst = _mm_cvtsi128_si32(s);
+      src += src_stride;
+      dst += dst_stride;
+      s = _mm_loadl_epi64((__m128i *)src);
+      *(uint32_t *)dst = _mm_cvtsi128_si32(s);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m128i s[4];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      src += src_stride;
+      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m128i s[8];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+      src += src_stride;
+      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+      _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+      _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
+      _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
+      _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    do {
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
new file mode 100644
index 000000000..5d2fc465e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, InterpFilterParams *filter_params_x,
+    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+  const __m128i zero = _mm_setzero_si128();
+  int i, j;
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi32(offset);
+  const __m128i offset_const_16b = _mm_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+  const __m128i clip_pixel_to_bd =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+  assert(bits <= 4);
+
+  if (!(w % 8)) {
+    for (i = 0; i < h; i += 1) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i src_16bit =
+            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+        const __m128i res = _mm_sll_epi16(src_16bit, left_shift);
+        if (do_average) {
+          const __m128i data_0 =
+              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+          const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+          const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+          const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero);
+          const __m128i res_unsigned_lo =
+              _mm_add_epi32(res_32b_lo, offset_const);
+
+          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+          const __m128i res_unsigned_hi =
+              _mm_add_epi32(res_32b_hi, offset_const);
+
+          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_16b =
+              _mm_packus_epi32(round_result_lo, round_result_hi);
+          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+        } else {
+          const __m128i res_unsigned_16b =
+              _mm_adds_epu16(res, offset_const_16b);
+
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]),
+                          res_unsigned_16b);
+        }
+      }
+    }
+  } else if (!(w % 4)) {
+    for (i = 0; i < h; i += 2) {
+      for (j = 0; j < w; j += 4) {
+        const __m128i src_row_0 =
+            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
+        const __m128i src_row_1 =
+            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
+        const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1);
+
+        const __m128i res = _mm_sll_epi16(src_10, left_shift);
+
+        if (do_average) {
+          const __m128i data_0 =
+              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+          const __m128i data_1 = _mm_loadl_epi64(
+              (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+          const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+          const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+          const __m128i res_32b = _mm_unpacklo_epi16(res, zero);
+          const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const);
+
+          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+          const __m128i res_unsigned_hi =
+              _mm_add_epi32(res_32b_hi, offset_const);
+
+          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+              &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_16b =
+              _mm_packus_epi32(round_result_lo, round_result_hi);
+          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+          const __m128i res_1 = _mm_srli_si128(res_clip, 8);
+
+          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+          _mm_storel_epi64(
+              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+        } else {
+          const __m128i res_unsigned_16b =
+              _mm_adds_epu16(res, offset_const_16b);
+
+          const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8);
+
+          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]),
+                           res_unsigned_16b);
+          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                           res_1);
+        }
+      }
+    }
+  }
+}
+
+void av1_highbd_jnt_convolve_2d_sse4_1(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, InterpFilterParams *filter_params_x,
+    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = MAX_SB_SIZE;
+  int i, j;
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi32(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+  const __m128i clip_pixel_to_bd =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  /* Horizontal filter */
+  {
+    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+        *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    const __m128i round_const = _mm_set1_epi32(
+        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+    for (i = 0; i < im_h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        const __m128i data2 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+        // Filter even-index pixels
+        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+        const __m128i res_2 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+        const __m128i res_4 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+        const __m128i res_6 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                         _mm_add_epi32(res_2, res_6));
+        res_even =
+            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+        // Filter odd-index pixels
+        const __m128i res_1 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+        const __m128i res_3 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+        const __m128i res_5 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+        const __m128i res_7 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                        _mm_add_epi32(res_3, res_7));
+        res_odd =
+            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+        __m128i res = _mm_packs_epi32(res_even, res_odd);
+        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+      }
+    }
+  }
+
+  /* Vertical filter */
+  {
+    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    const __m128i round_const = _mm_set1_epi32(
+        ((1 << conv_params->round_1) >> 1) -
+        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        // Filter even-index pixels
+        const int16_t *data = &im_block[i * im_stride + j];
+        const __m128i src_0 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+                               *(__m128i *)(data + 1 * im_stride));
+        const __m128i src_2 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+                               *(__m128i *)(data + 3 * im_stride));
+        const __m128i src_4 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+                               *(__m128i *)(data + 5 * im_stride));
+        const __m128i src_6 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+                               *(__m128i *)(data + 7 * im_stride));
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                               _mm_add_epi32(res_4, res_6));
+
+        // Filter odd-index pixels
+        const __m128i src_1 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+                               *(__m128i *)(data + 1 * im_stride));
+        const __m128i src_3 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+                               *(__m128i *)(data + 3 * im_stride));
+        const __m128i src_5 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+                               *(__m128i *)(data + 5 * im_stride));
+        const __m128i src_7 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+                               *(__m128i *)(data + 7 * im_stride));
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                              _mm_add_epi32(res_5, res_7));
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        const __m128i res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+
+        const __m128i res_unsigned_lo =
+            _mm_add_epi32(res_lo_round, offset_const);
+
+        if (w < 8) {
+          if (do_average) {
+            const __m128i data_0 =
+                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+
+            const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);
+
+            const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
+                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+            const __m128i round_result = highbd_convolve_rounding_sse2(
+                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+            const __m128i res_16b =
+                _mm_packus_epi32(round_result, round_result);
+            const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+          } else {
+            const __m128i res_16b =
+                _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+          }
+        } else {
+          const __m128i res_hi_round =
+              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+          const __m128i res_unsigned_hi =
+              _mm_add_epi32(res_hi_round, offset_const);
+
+          if (do_average) {
+            const __m128i data_lo =
+                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+            const __m128i data_hi =
+                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4]));
+
+            const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
+            const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);
+
+            const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+            const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+            const __m128i round_result_lo =
+                highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
+                                              &rounding_const, rounding_shift);
+            const __m128i round_result_hi =
+                highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const,
+                                              &rounding_const, rounding_shift);
+
+            const __m128i res_16b =
+                _mm_packus_epi32(round_result_lo, round_result_hi);
+            const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+          } else {
+            const __m128i res_16b =
+                _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
index 195f0f570..a9cf6a4d6 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -12,375 +12,209 @@
 #include <tmmintrin.h>
 #include <assert.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
 #include "av1/common/convolve.h"
 
-#if CONFIG_COMPOUND_ROUND
-void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
-                                  CONV_BUF_TYPE *dst, int dst_stride, int w,
-                                  int h, InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
-                                  ConvolveParams *conv_params, int bd) {
-  DECLARE_ALIGNED(16, int16_t,
-                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride,
+                                     uint16_t *dst, int dst_stride, int w,
+                                     int h, InterpFilterParams *filter_params_x,
+                                     InterpFilterParams *filter_params_y,
+                                     const int subpel_x_q4,
+                                     const int subpel_y_q4,
+                                     ConvolveParams *conv_params, int bd) {
+  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
   int im_h = h + filter_params_y->taps - 1;
-  int im_stride = MAX_SB_SIZE;
+  int im_stride = 8;
   int i, j;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const int do_average = conv_params->do_average;
   const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
 
-  /* Horizontal filter */
-  {
-    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
-    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << conv_params->round_0) >> 1);
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
-    for (i = 0; i < im_h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+  __m128i coeffs_x[4], coeffs_y[4], s[16];
+
+  const __m128i round_const_x = _mm_set1_epi32(
+      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+  const __m128i round_const_y =
+      _mm_set1_epi32(((1 << conv_params->round_1) >> 1) -
+                     (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i clip_pixel =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    {
+      for (i = 0; i < im_h; i += 1) {
+        const __m128i row00 =
             _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-        const __m128i data2 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
-
-        // Filter even-index pixels
-        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
-        const __m128i res_2 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
-        const __m128i res_4 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
-        const __m128i res_6 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even =
-            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
-        // Filter odd-index pixels
-        const __m128i res_1 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
-        const __m128i res_3 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
-        const __m128i res_5 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
-        const __m128i res_7 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
+        const __m128i row01 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+        // even pixels
+        s[0] = _mm_alignr_epi8(row01, row00, 0);
+        s[1] = _mm_alignr_epi8(row01, row00, 4);
+        s[2] = _mm_alignr_epi8(row01, row00, 8);
+        s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+        __m128i res_even = convolve(s, coeffs_x);
+        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+                                 round_shift_x);
+
+        // odd pixels
+        s[0] = _mm_alignr_epi8(row01, row00, 2);
+        s[1] = _mm_alignr_epi8(row01, row00, 6);
+        s[2] = _mm_alignr_epi8(row01, row00, 10);
+        s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+        __m128i res_odd = convolve(s, coeffs_x);
         res_odd =
-            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
 
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-        res = _mm_max_epi16(_mm_min_epi16(res, maxval), _mm_setzero_si128());
-        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
-      }
-    }
-  }
+        __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+        __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+        __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
 
-  /* Vertical filter */
-  {
-    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
-    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << conv_params->round_1) >> 1);
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const int16_t *data = &im_block[i * im_stride + j];
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
-                               *(__m128i *)(data + 1 * im_stride));
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
-                               *(__m128i *)(data + 3 * im_stride));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
-                               *(__m128i *)(data + 5 * im_stride));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
-                               *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
-                               *(__m128i *)(data + 1 * im_stride));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
-                               *(__m128i *)(data + 3 * im_stride));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
-                               *(__m128i *)(data + 5 * im_stride));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
-                               *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        const __m128i res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
-        // Accumulate values into the destination buffer
-        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-        if (do_average) {
-          _mm_storeu_si128(p + 0,
-                           _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
-          _mm_storeu_si128(p + 1,
-                           _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
-        } else {
-          _mm_storeu_si128(p + 0, res_lo_round);
-          _mm_storeu_si128(p + 1, res_hi_round);
-        }
+        _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
       }
     }
-  }
-}
-#else
-void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
-                                  CONV_BUF_TYPE *dst, int dst_stride, int w,
-                                  int h, InterpFilterParams *filter_params_x,
-                                  InterpFilterParams *filter_params_y,
-                                  const int subpel_x_q4, const int subpel_y_q4,
-                                  ConvolveParams *conv_params, int bd) {
-  DECLARE_ALIGNED(16, int16_t,
-                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
-  int im_h = h + filter_params_y->taps - 1;
-  int im_stride = MAX_SB_SIZE;
-  int i, j;
-  const int do_average = conv_params->do_average;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+    /* Vertical filter */
+    {
+      __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
+      __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
+      __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
+      __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
+      __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
+      __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
+      __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
+
+      s[0] = _mm_unpacklo_epi16(s0, s1);
+      s[1] = _mm_unpacklo_epi16(s2, s3);
+      s[2] = _mm_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm_unpackhi_epi16(s0, s1);
+      s[5] = _mm_unpackhi_epi16(s2, s3);
+      s[6] = _mm_unpackhi_epi16(s4, s5);
+
+      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+      for (i = 0; i < h; i += 2) {
+        const int16_t *data = &im_block[i * im_stride];
+
+        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
+        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride));
+
+        s[3] = _mm_unpacklo_epi16(s6, s7);
+        s[7] = _mm_unpackhi_epi16(s6, s7);
+
+        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+        const __m128i res_a0 = convolve(s, coeffs_y);
+        __m128i res_a_round0 =
+            _mm_sra_epi32(_mm_add_epi32(res_a0, round_const_y), round_shift_y);
+        res_a_round0 = _mm_sra_epi32(
+            _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
+
+        const __m128i res_a1 = convolve(s + 8, coeffs_y);
+        __m128i res_a_round1 =
+            _mm_sra_epi32(_mm_add_epi32(res_a1, round_const_y), round_shift_y);
+        res_a_round1 = _mm_sra_epi32(
+            _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
+
+        if (w - j > 4) {
+          const __m128i res_b0 = convolve(s + 4, coeffs_y);
+          __m128i res_b_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_b0, round_const_y), round_shift_y);
+          res_b_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_b_round0, round_const_bits), round_shift_bits);
+
+          const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+          __m128i res_b_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_b1, round_const_y), round_shift_y);
+          res_b_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_b_round1, round_const_bits), round_shift_bits);
+
+          __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+          res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+          res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+          __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+          res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+          res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_16bit1);
+        } else if (w == 4) {
+          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_a_round1);
+        } else {
+          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
 
-  // Check that, even with 12-bit input, the intermediate values will fit
-  // into an unsigned 15-bit intermediate array.
-  assert(conv_params->round_0 >= 5);
-
-  /* Horizontal filter */
-  {
-    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
-    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const = _mm_set1_epi32(
-        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
-    for (i = 0; i < im_h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-        const __m128i data2 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
-
-        // Filter even-index pixels
-        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
-        const __m128i res_2 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
-        const __m128i res_4 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
-        const __m128i res_6 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even =
-            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
-        // Filter odd-index pixels
-        const __m128i res_1 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
-        const __m128i res_3 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
-        const __m128i res_5 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
-        const __m128i res_7 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd =
-            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
 
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
-      }
-    }
-  }
+          *((uint32_t *)(&dst[i * dst_stride + j])) =
+              _mm_cvtsi128_si32(res_a_round0);
 
-  /* Vertical filter */
-  {
-    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
-    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const = _mm_set1_epi32(
-        ((1 << conv_params->round_1) >> 1) -
-        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const int16_t *data = &im_block[i * im_stride + j];
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
-                               *(__m128i *)(data + 1 * im_stride));
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
-                               *(__m128i *)(data + 3 * im_stride));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
-                               *(__m128i *)(data + 5 * im_stride));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
-                               *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
-                               *(__m128i *)(data + 1 * im_stride));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
-                               *(__m128i *)(data + 3 * im_stride));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
-                               *(__m128i *)(data + 5 * im_stride));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
-                               *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        const __m128i res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
-        // Accumulate values into the destination buffer
-        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-        if (do_average) {
-          _mm_storeu_si128(p + 0,
-                           _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
-          _mm_storeu_si128(p + 1,
-                           _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
-        } else {
-          _mm_storeu_si128(p + 0, res_lo_round);
-          _mm_storeu_si128(p + 1, res_hi_round);
+          *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+              _mm_cvtsi128_si32(res_a_round1);
         }
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+
+        s[0 + 8] = s[1 + 8];
+        s[1 + 8] = s[2 + 8];
+        s[2 + 8] = s[3 + 8];
+
+        s[4 + 8] = s[5 + 8];
+        s[5 + 8] = s[6 + 8];
+        s[6 + 8] = s[7 + 8];
+
+        s6 = s8;
       }
     }
   }
 }
-#endif
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
index 0e833e6d9..debb05a6d 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -11,8 +11,9 @@
 #include <assert.h>
 #include <immintrin.h>
 
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
 #include "av1/common/av1_inv_txfm1d_cfg.h"
 
 // Note:
@@ -85,17 +86,6 @@ static void load_buffer_32x32(const int32_t *coeff, __m256i *in) {
   }
 }
 
-static void round_shift_32x32(__m256i *in, int shift) {
-  __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
-  int i = 0;
-
-  while (i < 128) {
-    in[i] = _mm256_add_epi32(in[i], rnding);
-    in[i] = _mm256_srai_epi32(in[i], shift);
-    i++;
-  }
-}
-
 static __m256i highbd_clamp_epi32(__m256i x, int bd) {
   const __m256i zero = _mm256_setzero_si256();
   const __m256i one = _mm256_set1_epi16(1);
@@ -120,7 +110,7 @@ static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
   (void)fliplr;
   (void)flipud;
 
-  round_shift_32x32(in, shift);
+  __m256i round = _mm256_set1_epi32((1 << shift) >> 1);
 
   while (i < 128) {
     u0 = _mm256_loadu_si256((const __m256i *)output);
@@ -136,6 +126,16 @@ static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
     v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20);
     v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31);
 
+    v0 = _mm256_add_epi32(v0, round);
+    v1 = _mm256_add_epi32(v1, round);
+    v2 = _mm256_add_epi32(v2, round);
+    v3 = _mm256_add_epi32(v3, round);
+
+    v0 = _mm256_sra_epi32(v0, _mm_cvtsi32_si128(shift));
+    v1 = _mm256_sra_epi32(v1, _mm_cvtsi32_si128(shift));
+    v2 = _mm256_sra_epi32(v2, _mm_cvtsi32_si128(shift));
+    v3 = _mm256_sra_epi32(v3, _mm_cvtsi32_si128(shift));
+
     v0 = _mm256_add_epi32(v0, x0);
     v1 = _mm256_add_epi32(v1, x1);
     v2 = _mm256_add_epi32(v2, x2);
@@ -167,7 +167,53 @@ static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
   return x;
 }
 
-static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
+static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
+                        __m256i *out1, const __m256i *clamp_lo,
+                        const __m256i *clamp_hi) {
+  __m256i a0 = _mm256_add_epi32(in0, in1);
+  __m256i a1 = _mm256_sub_epi32(in0, in1);
+
+  a0 = _mm256_max_epi32(a0, *clamp_lo);
+  a0 = _mm256_min_epi32(a0, *clamp_hi);
+  a1 = _mm256_max_epi32(a1, *clamp_lo);
+  a1 = _mm256_min_epi32(a1, *clamp_hi);
+
+  *out0 = a0;
+  *out1 = a1;
+}
+
+static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1,
+                                 __m256i *out0, __m256i *out1) {
+  __m256i a0 = _mm256_add_epi32(in0, in1);
+  __m256i a1 = _mm256_sub_epi32(in0, in1);
+
+  *out0 = a0;
+  *out1 = a1;
+}
+
+static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
+                              __m256i *out0, __m256i *out1,
+                              const __m256i *clamp_lo, const __m256i *clamp_hi,
+                              int shift) {
+  __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
+  __m256i in0_w_offset = _mm256_add_epi32(in0, offset);
+  __m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
+  __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
+
+  a0 = _mm256_max_epi32(a0, *clamp_lo);
+  a0 = _mm256_min_epi32(a0, *clamp_hi);
+  a1 = _mm256_max_epi32(a1, *clamp_lo);
+  a1 = _mm256_min_epi32(a1, *clamp_hi);
+
+  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+  *out0 = a0;
+  *out1 = a1;
+}
+
+static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+                        int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
   const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
   const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
@@ -220,6 +266,9 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
   const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
   const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
   __m256i bf1[32], bf0[32];
   int col;
 
@@ -334,22 +383,15 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
         half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
     bf1[15] =
         half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
-    bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]);
-    bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]);
-    bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]);
-    bf1[19] = _mm256_add_epi32(bf0[18], bf0[19]);
-    bf1[20] = _mm256_add_epi32(bf0[20], bf0[21]);
-    bf1[21] = _mm256_sub_epi32(bf0[20], bf0[21]);
-    bf1[22] = _mm256_sub_epi32(bf0[23], bf0[22]);
-    bf1[23] = _mm256_add_epi32(bf0[22], bf0[23]);
-    bf1[24] = _mm256_add_epi32(bf0[24], bf0[25]);
-    bf1[25] = _mm256_sub_epi32(bf0[24], bf0[25]);
-    bf1[26] = _mm256_sub_epi32(bf0[27], bf0[26]);
-    bf1[27] = _mm256_add_epi32(bf0[26], bf0[27]);
-    bf1[28] = _mm256_add_epi32(bf0[28], bf0[29]);
-    bf1[29] = _mm256_sub_epi32(bf0[28], bf0[29]);
-    bf1[30] = _mm256_sub_epi32(bf0[31], bf0[30]);
-    bf1[31] = _mm256_add_epi32(bf0[30], bf0[31]);
+
+    addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
 
     // stage 4
     bf0[0] = bf1[0];
@@ -363,14 +405,12 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
     bf0[6] =
         half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
     bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
-    bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]);
-    bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]);
-    bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]);
-    bf0[11] = _mm256_add_epi32(bf1[10], bf1[11]);
-    bf0[12] = _mm256_add_epi32(bf1[12], bf1[13]);
-    bf0[13] = _mm256_sub_epi32(bf1[12], bf1[13]);
-    bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]);
-    bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]);
+
+    addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
     bf0[16] = bf1[16];
     bf0[17] =
         half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
@@ -405,10 +445,8 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
         half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
     bf1[3] =
         half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
-    bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]);
-    bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]);
-    bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]);
-    bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]);
+    addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
     bf1[8] = bf0[8];
     bf1[9] =
         half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
@@ -421,42 +459,28 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
     bf1[14] =
         half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
     bf1[15] = bf0[15];
-    bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]);
-    bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]);
-    bf1[18] = _mm256_sub_epi32(bf0[17], bf0[18]);
-    bf1[19] = _mm256_sub_epi32(bf0[16], bf0[19]);
-    bf1[20] = _mm256_sub_epi32(bf0[23], bf0[20]);
-    bf1[21] = _mm256_sub_epi32(bf0[22], bf0[21]);
-    bf1[22] = _mm256_add_epi32(bf0[21], bf0[22]);
-    bf1[23] = _mm256_add_epi32(bf0[20], bf0[23]);
-    bf1[24] = _mm256_add_epi32(bf0[24], bf0[27]);
-    bf1[25] = _mm256_add_epi32(bf0[25], bf0[26]);
-    bf1[26] = _mm256_sub_epi32(bf0[25], bf0[26]);
-    bf1[27] = _mm256_sub_epi32(bf0[24], bf0[27]);
-    bf1[28] = _mm256_sub_epi32(bf0[31], bf0[28]);
-    bf1[29] = _mm256_sub_epi32(bf0[30], bf0[29]);
-    bf1[30] = _mm256_add_epi32(bf0[29], bf0[30]);
-    bf1[31] = _mm256_add_epi32(bf0[28], bf0[31]);
+    addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
 
     // stage 6
-    bf0[0] = _mm256_add_epi32(bf1[0], bf1[3]);
-    bf0[1] = _mm256_add_epi32(bf1[1], bf1[2]);
-    bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]);
-    bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]);
+    addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
     bf0[4] = bf1[4];
     bf0[5] =
         half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
     bf0[6] =
         half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
     bf0[7] = bf1[7];
-    bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]);
-    bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]);
-    bf0[10] = _mm256_sub_epi32(bf1[9], bf1[10]);
-    bf0[11] = _mm256_sub_epi32(bf1[8], bf1[11]);
-    bf0[12] = _mm256_sub_epi32(bf1[15], bf1[12]);
-    bf0[13] = _mm256_sub_epi32(bf1[14], bf1[13]);
-    bf0[14] = _mm256_add_epi32(bf1[13], bf1[14]);
-    bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]);
+    addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
     bf0[16] = bf1[16];
     bf0[17] = bf1[17];
     bf0[18] =
@@ -483,14 +507,10 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
     bf0[31] = bf1[31];
 
     // stage 7
-    bf1[0] = _mm256_add_epi32(bf0[0], bf0[7]);
-    bf1[1] = _mm256_add_epi32(bf0[1], bf0[6]);
-    bf1[2] = _mm256_add_epi32(bf0[2], bf0[5]);
-    bf1[3] = _mm256_add_epi32(bf0[3], bf0[4]);
-    bf1[4] = _mm256_sub_epi32(bf0[3], bf0[4]);
-    bf1[5] = _mm256_sub_epi32(bf0[2], bf0[5]);
-    bf1[6] = _mm256_sub_epi32(bf0[1], bf0[6]);
-    bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]);
+    addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
     bf1[8] = bf0[8];
     bf1[9] = bf0[9];
     bf1[10] =
@@ -503,40 +523,24 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
         half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
     bf1[14] = bf0[14];
     bf1[15] = bf0[15];
-    bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]);
-    bf1[17] = _mm256_add_epi32(bf0[17], bf0[22]);
-    bf1[18] = _mm256_add_epi32(bf0[18], bf0[21]);
-    bf1[19] = _mm256_add_epi32(bf0[19], bf0[20]);
-    bf1[20] = _mm256_sub_epi32(bf0[19], bf0[20]);
-    bf1[21] = _mm256_sub_epi32(bf0[18], bf0[21]);
-    bf1[22] = _mm256_sub_epi32(bf0[17], bf0[22]);
-    bf1[23] = _mm256_sub_epi32(bf0[16], bf0[23]);
-    bf1[24] = _mm256_sub_epi32(bf0[31], bf0[24]);
-    bf1[25] = _mm256_sub_epi32(bf0[30], bf0[25]);
-    bf1[26] = _mm256_sub_epi32(bf0[29], bf0[26]);
-    bf1[27] = _mm256_sub_epi32(bf0[28], bf0[27]);
-    bf1[28] = _mm256_add_epi32(bf0[27], bf0[28]);
-    bf1[29] = _mm256_add_epi32(bf0[26], bf0[29]);
-    bf1[30] = _mm256_add_epi32(bf0[25], bf0[30]);
-    bf1[31] = _mm256_add_epi32(bf0[24], bf0[31]);
+    addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
 
     // stage 8
-    bf0[0] = _mm256_add_epi32(bf1[0], bf1[15]);
-    bf0[1] = _mm256_add_epi32(bf1[1], bf1[14]);
-    bf0[2] = _mm256_add_epi32(bf1[2], bf1[13]);
-    bf0[3] = _mm256_add_epi32(bf1[3], bf1[12]);
-    bf0[4] = _mm256_add_epi32(bf1[4], bf1[11]);
-    bf0[5] = _mm256_add_epi32(bf1[5], bf1[10]);
-    bf0[6] = _mm256_add_epi32(bf1[6], bf1[9]);
-    bf0[7] = _mm256_add_epi32(bf1[7], bf1[8]);
-    bf0[8] = _mm256_sub_epi32(bf1[7], bf1[8]);
-    bf0[9] = _mm256_sub_epi32(bf1[6], bf1[9]);
-    bf0[10] = _mm256_sub_epi32(bf1[5], bf1[10]);
-    bf0[11] = _mm256_sub_epi32(bf1[4], bf1[11]);
-    bf0[12] = _mm256_sub_epi32(bf1[3], bf1[12]);
-    bf0[13] = _mm256_sub_epi32(bf1[2], bf1[13]);
-    bf0[14] = _mm256_sub_epi32(bf1[1], bf1[14]);
-    bf0[15] = _mm256_sub_epi32(bf1[0], bf1[15]);
+    addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
     bf0[16] = bf1[16];
     bf0[17] = bf1[17];
     bf0[18] = bf1[18];
@@ -563,58 +567,91 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
     bf0[31] = bf1[31];
 
     // stage 9
-    out[0 * 4 + col] = _mm256_add_epi32(bf0[0], bf0[31]);
-    out[1 * 4 + col] = _mm256_add_epi32(bf0[1], bf0[30]);
-    out[2 * 4 + col] = _mm256_add_epi32(bf0[2], bf0[29]);
-    out[3 * 4 + col] = _mm256_add_epi32(bf0[3], bf0[28]);
-    out[4 * 4 + col] = _mm256_add_epi32(bf0[4], bf0[27]);
-    out[5 * 4 + col] = _mm256_add_epi32(bf0[5], bf0[26]);
-    out[6 * 4 + col] = _mm256_add_epi32(bf0[6], bf0[25]);
-    out[7 * 4 + col] = _mm256_add_epi32(bf0[7], bf0[24]);
-    out[8 * 4 + col] = _mm256_add_epi32(bf0[8], bf0[23]);
-    out[9 * 4 + col] = _mm256_add_epi32(bf0[9], bf0[22]);
-    out[10 * 4 + col] = _mm256_add_epi32(bf0[10], bf0[21]);
-    out[11 * 4 + col] = _mm256_add_epi32(bf0[11], bf0[20]);
-    out[12 * 4 + col] = _mm256_add_epi32(bf0[12], bf0[19]);
-    out[13 * 4 + col] = _mm256_add_epi32(bf0[13], bf0[18]);
-    out[14 * 4 + col] = _mm256_add_epi32(bf0[14], bf0[17]);
-    out[15 * 4 + col] = _mm256_add_epi32(bf0[15], bf0[16]);
-    out[16 * 4 + col] = _mm256_sub_epi32(bf0[15], bf0[16]);
-    out[17 * 4 + col] = _mm256_sub_epi32(bf0[14], bf0[17]);
-    out[18 * 4 + col] = _mm256_sub_epi32(bf0[13], bf0[18]);
-    out[19 * 4 + col] = _mm256_sub_epi32(bf0[12], bf0[19]);
-    out[20 * 4 + col] = _mm256_sub_epi32(bf0[11], bf0[20]);
-    out[21 * 4 + col] = _mm256_sub_epi32(bf0[10], bf0[21]);
-    out[22 * 4 + col] = _mm256_sub_epi32(bf0[9], bf0[22]);
-    out[23 * 4 + col] = _mm256_sub_epi32(bf0[8], bf0[23]);
-    out[24 * 4 + col] = _mm256_sub_epi32(bf0[7], bf0[24]);
-    out[25 * 4 + col] = _mm256_sub_epi32(bf0[6], bf0[25]);
-    out[26 * 4 + col] = _mm256_sub_epi32(bf0[5], bf0[26]);
-    out[27 * 4 + col] = _mm256_sub_epi32(bf0[4], bf0[27]);
-    out[28 * 4 + col] = _mm256_sub_epi32(bf0[3], bf0[28]);
-    out[29 * 4 + col] = _mm256_sub_epi32(bf0[2], bf0[29]);
-    out[30 * 4 + col] = _mm256_sub_epi32(bf0[1], bf0[30]);
-    out[31 * 4 + col] = _mm256_sub_epi32(bf0[0], bf0[31]);
+    if (do_cols) {
+      addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0 * 4 + col,
+                           out + 31 * 4 + col);
+      addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1 * 4 + col,
+                           out + 30 * 4 + col);
+      addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2 * 4 + col,
+                           out + 29 * 4 + col);
+      addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3 * 4 + col,
+                           out + 28 * 4 + col);
+      addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4 * 4 + col,
+                           out + 27 * 4 + col);
+      addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5 * 4 + col,
+                           out + 26 * 4 + col);
+      addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6 * 4 + col,
+                           out + 25 * 4 + col);
+      addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7 * 4 + col,
+                           out + 24 * 4 + col);
+      addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8 * 4 + col,
+                           out + 23 * 4 + col);
+      addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9 * 4 + col,
+                           out + 22 * 4 + col);
+      addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
+                           out + 21 * 4 + col);
+      addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
+                           out + 20 * 4 + col);
+      addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
+                           out + 19 * 4 + col);
+      addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
+                           out + 18 * 4 + col);
+      addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
+                           out + 17 * 4 + col);
+      addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
+                           out + 16 * 4 + col);
+    } else {
+      addsub_shift_avx2(bf0[0], bf0[31], out + 0 * 4 + col, out + 31 * 4 + col,
+                        &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[1], bf0[30], out + 1 * 4 + col, out + 30 * 4 + col,
+                        &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[2], bf0[29], out + 2 * 4 + col, out + 29 * 4 + col,
+                        &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[3], bf0[28], out + 3 * 4 + col, out + 28 * 4 + col,
+                        &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[4], bf0[27], out + 4 * 4 + col, out + 27 * 4 + col,
+                        &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[5], bf0[26], out + 5 * 4 + col, out + 26 * 4 + col,
+                        &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[6], bf0[25], out + 6 * 4 + col, out + 25 * 4 + col,
+                        &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[7], bf0[24], out + 7 * 4 + col, out + 24 * 4 + col,
+                        &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[8], bf0[23], out + 8 * 4 + col, out + 23 * 4 + col,
+                        &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[9], bf0[22], out + 9 * 4 + col, out + 22 * 4 + col,
+                        &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
+                        out + 21 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
+                        out + 20 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
+                        out + 19 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
+                        out + 18 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
+                        out + 17 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
+                        out + 16 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+    }
   }
 }
 
 void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
   __m256i in[128], out[128];
-  const TXFM_1D_CFG *row_cfg = NULL;
-  const TXFM_1D_CFG *col_cfg = NULL;
+  const int8_t *shift = inv_txfm_shift_ls[TX_32X32];
+  const int txw_idx = get_txw_idx(TX_32X32);
+  const int txh_idx = get_txh_idx(TX_32X32);
 
   switch (tx_type) {
     case DCT_DCT:
-      row_cfg = &inv_txfm_1d_row_cfg_dct_32;
-      col_cfg = &inv_txfm_1d_col_cfg_dct_32;
       load_buffer_32x32(coeff, in);
       transpose_32x32(in, out);
-      idct32_avx2(out, in, row_cfg->cos_bit[2]);
-      round_shift_32x32(in, -row_cfg->shift[0]);
+      idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
       transpose_32x32(in, out);
-      idct32_avx2(out, in, col_cfg->cos_bit[2]);
-      write_buffer_32x32(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_32x32(in, output, stride, 0, 0, -shift[1], bd);
       break;
     default: assert(0);
   }
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
index 8613bed86..801a4133b 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -11,8 +11,9 @@
 #include <assert.h>
 #include <smmintrin.h> /* SSE4.1 */
 
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
 #include "av1/common/av1_inv_txfm1d_cfg.h"
 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
 
@@ -23,13 +24,82 @@ static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
   in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
 }
 
-static void idct4x4_sse4_1(__m128i *in, int bit) {
+static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
+                          __m128i *out1, const __m128i *clamp_lo,
+                          const __m128i *clamp_hi) {
+  __m128i a0 = _mm_add_epi32(in0, in1);
+  __m128i a1 = _mm_sub_epi32(in0, in1);
+
+  a0 = _mm_max_epi32(a0, *clamp_lo);
+  a0 = _mm_min_epi32(a0, *clamp_hi);
+  a1 = _mm_max_epi32(a1, *clamp_lo);
+  a1 = _mm_min_epi32(a1, *clamp_hi);
+
+  *out0 = a0;
+  *out1 = a1;
+}
+
+static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1,
+                                   __m128i *out0, __m128i *out1) {
+  __m128i a0 = _mm_add_epi32(in0, in1);
+  __m128i a1 = _mm_sub_epi32(in0, in1);
+
+  *out0 = a0;
+  *out1 = a1;
+}
+
+static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
+                                __m128i *out0, __m128i *out1,
+                                const __m128i *clamp_lo,
+                                const __m128i *clamp_hi, int shift) {
+  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
+  __m128i in0_w_offset = _mm_add_epi32(in0, offset);
+  __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
+  __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);
+
+  a0 = _mm_max_epi32(a0, *clamp_lo);
+  a0 = _mm_min_epi32(a0, *clamp_hi);
+  a1 = _mm_max_epi32(a1, *clamp_lo);
+  a1 = _mm_min_epi32(a1, *clamp_hi);
+
+  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+  *out0 = a0;
+  *out1 = a1;
+}
+
+static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
+                             __m128i *out0, __m128i *out1,
+                             const __m128i *clamp_lo, const __m128i *clamp_hi,
+                             int shift) {
+  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
+  __m128i a0 = _mm_add_epi32(offset, in0);
+  __m128i a1 = _mm_sub_epi32(offset, in1);
+
+  a0 = _mm_max_epi32(a0, *clamp_lo);
+  a0 = _mm_min_epi32(a0, *clamp_hi);
+  a1 = _mm_max_epi32(a1, *clamp_lo);
+  a1 = _mm_min_epi32(a1, *clamp_hi);
+
+  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+  *out0 = a0;
+  *out1 = a1;
+}
+
+static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
   const int32_t *cospi = cospi_arr(bit);
   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
   __m128i u0, u1, u2, u3;
   __m128i v0, v1, v2, v3, x, y;
 
@@ -65,84 +135,72 @@ static void idct4x4_sse4_1(__m128i *in, int bit) {
   v3 = _mm_add_epi32(v3, rnding);
   v3 = _mm_srai_epi32(v3, bit);
 
-  in[0] = _mm_add_epi32(v0, v3);
-  in[1] = _mm_add_epi32(v1, v2);
-  in[2] = _mm_sub_epi32(v1, v2);
-  in[3] = _mm_sub_epi32(v0, v3);
+  addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
+  addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
 }
 
 static void iadst4x4_sse4_1(__m128i *in, int bit) {
-  const int32_t *cospi = cospi_arr(bit);
-  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
-  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
-  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
-  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
-  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
-  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
-  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const int32_t *sinpi = sinpi_arr(bit);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const __m128i zero = _mm_setzero_si128();
+  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+  __m128i t;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i x0, x1, x2, x3;
   __m128i u0, u1, u2, u3;
-  __m128i v0, v1, v2, v3, x, y;
+  __m128i v0, v1, v2, v3;
 
   v0 = _mm_unpacklo_epi32(in[0], in[1]);
   v1 = _mm_unpackhi_epi32(in[0], in[1]);
   v2 = _mm_unpacklo_epi32(in[2], in[3]);
   v3 = _mm_unpackhi_epi32(in[2], in[3]);
 
-  u0 = _mm_unpacklo_epi64(v0, v2);
-  u1 = _mm_unpackhi_epi64(v0, v2);
-  u2 = _mm_unpacklo_epi64(v1, v3);
-  u3 = _mm_unpackhi_epi64(v1, v3);
-
-  // stage 0
-  // stage 1
-  u1 = _mm_sub_epi32(zero, u1);
-  u3 = _mm_sub_epi32(zero, u3);
-
-  // stage 2
-  v0 = u0;
-  v1 = u3;
-  x = _mm_mullo_epi32(u1, cospi32);
-  y = _mm_mullo_epi32(u2, cospi32);
-  v2 = _mm_add_epi32(x, y);
-  v2 = _mm_add_epi32(v2, rnding);
-  v2 = _mm_srai_epi32(v2, bit);
-
-  v3 = _mm_sub_epi32(x, y);
-  v3 = _mm_add_epi32(v3, rnding);
-  v3 = _mm_srai_epi32(v3, bit);
-
-  // stage 3
-  u0 = _mm_add_epi32(v0, v2);
-  u1 = _mm_add_epi32(v1, v3);
-  u2 = _mm_sub_epi32(v0, v2);
-  u3 = _mm_sub_epi32(v1, v3);
-
-  // stage 4
-  x = _mm_mullo_epi32(u0, cospi8);
-  y = _mm_mullo_epi32(u1, cospi56);
-  in[3] = _mm_add_epi32(x, y);
-  in[3] = _mm_add_epi32(in[3], rnding);
-  in[3] = _mm_srai_epi32(in[3], bit);
-
-  x = _mm_mullo_epi32(u0, cospi56);
-  y = _mm_mullo_epi32(u1, cospim8);
-  in[0] = _mm_add_epi32(x, y);
-  in[0] = _mm_add_epi32(in[0], rnding);
-  in[0] = _mm_srai_epi32(in[0], bit);
-
-  x = _mm_mullo_epi32(u2, cospi40);
-  y = _mm_mullo_epi32(u3, cospi24);
-  in[1] = _mm_add_epi32(x, y);
-  in[1] = _mm_add_epi32(in[1], rnding);
-  in[1] = _mm_srai_epi32(in[1], bit);
-
-  x = _mm_mullo_epi32(u2, cospi24);
-  y = _mm_mullo_epi32(u3, cospim40);
-  in[2] = _mm_add_epi32(x, y);
-  in[2] = _mm_add_epi32(in[2], rnding);
-  in[2] = _mm_srai_epi32(in[2], bit);
+  x0 = _mm_unpacklo_epi64(v0, v2);
+  x1 = _mm_unpackhi_epi64(v0, v2);
+  x2 = _mm_unpacklo_epi64(v1, v3);
+  x3 = _mm_unpackhi_epi64(v1, v3);
+
+  s0 = _mm_mullo_epi32(x0, sinpi1);
+  s1 = _mm_mullo_epi32(x0, sinpi2);
+  s2 = _mm_mullo_epi32(x1, sinpi3);
+  s3 = _mm_mullo_epi32(x2, sinpi4);
+  s4 = _mm_mullo_epi32(x2, sinpi1);
+  s5 = _mm_mullo_epi32(x3, sinpi2);
+  s6 = _mm_mullo_epi32(x3, sinpi4);
+  t = _mm_sub_epi32(x0, x2);
+  s7 = _mm_add_epi32(t, x3);
+
+  t = _mm_add_epi32(s0, s3);
+  s0 = _mm_add_epi32(t, s5);
+  t = _mm_sub_epi32(s1, s4);
+  s1 = _mm_sub_epi32(t, s6);
+  s3 = s2;
+  s2 = _mm_mullo_epi32(s7, sinpi3);
+
+  u0 = _mm_add_epi32(s0, s3);
+  u1 = _mm_add_epi32(s1, s3);
+  u2 = s2;
+  t = _mm_add_epi32(s0, s1);
+  u3 = _mm_sub_epi32(t, s3);
+
+  u0 = _mm_add_epi32(u0, rnding);
+  u0 = _mm_srai_epi32(u0, bit);
+
+  u1 = _mm_add_epi32(u1, rnding);
+  u1 = _mm_srai_epi32(u1, bit);
+
+  u2 = _mm_add_epi32(u2, rnding);
+  u2 = _mm_srai_epi32(u2, bit);
+
+  u3 = _mm_add_epi32(u3, rnding);
+  u3 = _mm_srai_epi32(u3, bit);
+
+  in[0] = u0;
+  in[1] = u1;
+  in[2] = u2;
+  in[3] = u3;
 }
 
 static INLINE void round_shift_4x4(__m128i *in, int shift) {
@@ -232,84 +290,65 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
 void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
   __m128i in[4];
-  const TXFM_1D_CFG *row_cfg = NULL;
-  const TXFM_1D_CFG *col_cfg = NULL;
+  const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
 
   switch (tx_type) {
     case DCT_DCT:
-      row_cfg = &inv_txfm_1d_row_cfg_dct_4;
-      col_cfg = &inv_txfm_1d_col_cfg_dct_4;
       load_buffer_4x4(coeff, in);
-      idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
-      idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+      idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_DCT:
-      row_cfg = &inv_txfm_1d_row_cfg_dct_4;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_4;
       load_buffer_4x4(coeff, in);
-      idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
-      iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case DCT_ADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_4;
-      col_cfg = &inv_txfm_1d_col_cfg_dct_4;
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
-      idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+      idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_ADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_4;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_4;
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
-      iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
       break;
-#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      row_cfg = &inv_txfm_1d_row_cfg_dct_4;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_4;
       load_buffer_4x4(coeff, in);
-      idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
-      iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+      idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case DCT_FLIPADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_4;
-      col_cfg = &inv_txfm_1d_col_cfg_dct_4;
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
-      idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+      idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_FLIPADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_4;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_4;
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
-      iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      write_buffer_4x4(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
       break;
     case ADST_FLIPADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_4;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_4;
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
-      iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_ADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_4;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_4;
       load_buffer_4x4(coeff, in);
-      iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
-      iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
       break;
-#endif  // CONFIG_EXT_TX
     default: assert(0);
   }
 }
@@ -334,7 +373,8 @@ static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
   in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
 }
 
-static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                           int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
@@ -347,6 +387,9 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
   __m128i u0, u1, u2, u3, u4, u5, u6, u7;
   __m128i v0, v1, v2, v3, v4, v5, v6, v7;
   __m128i x, y;
@@ -413,16 +456,12 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
     v3 = _mm_add_epi32(v3, rnding);
     v3 = _mm_srai_epi32(v3, bit);
 
-    v4 = _mm_add_epi32(u4, u5);
-    v5 = _mm_sub_epi32(u4, u5);
-    v6 = _mm_sub_epi32(u7, u6);
-    v7 = _mm_add_epi32(u6, u7);
+    addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
 
     // stage 4
-    u0 = _mm_add_epi32(v0, v3);
-    u1 = _mm_add_epi32(v1, v2);
-    u2 = _mm_sub_epi32(v1, v2);
-    u3 = _mm_sub_epi32(v0, v3);
+    addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
     u4 = v4;
     u7 = v7;
 
@@ -437,195 +476,334 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
     u5 = _mm_srai_epi32(u5, bit);
 
     // stage 5
-    out[0 * 2 + col] = _mm_add_epi32(u0, u7);
-    out[1 * 2 + col] = _mm_add_epi32(u1, u6);
-    out[2 * 2 + col] = _mm_add_epi32(u2, u5);
-    out[3 * 2 + col] = _mm_add_epi32(u3, u4);
-    out[4 * 2 + col] = _mm_sub_epi32(u3, u4);
-    out[5 * 2 + col] = _mm_sub_epi32(u2, u5);
-    out[6 * 2 + col] = _mm_sub_epi32(u1, u6);
-    out[7 * 2 + col] = _mm_sub_epi32(u0, u7);
+    if (do_cols) {
+      addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col);
+      addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col);
+      addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
+      addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
+    } else {
+      addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+    }
   }
 }
 
-static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                            int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
-  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
-  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
-  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
-  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
-  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
-  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
-  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
-  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
-  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
-  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const __m128i zero = _mm_setzero_si128();
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i x, y;
-  int col;
+  const __m128i kZero = _mm_setzero_si128();
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i u[8], v[8], x;
 
-  // Note:
-  //  Even column: 0, 2, ..., 14
-  //  Odd column: 1, 3, ..., 15
-  //  one even column plus one odd column constructs one row (8 coeffs)
-  //  total we have 8 rows (8x8).
-  for (col = 0; col < 2; ++col) {
-    // stage 0
-    // stage 1
-    u0 = in[2 * 0 + col];
-    u1 = _mm_sub_epi32(zero, in[2 * 7 + col]);
-    u2 = _mm_sub_epi32(zero, in[2 * 3 + col]);
-    u3 = in[2 * 4 + col];
-    u4 = _mm_sub_epi32(zero, in[2 * 1 + col]);
-    u5 = in[2 * 6 + col];
-    u6 = in[2 * 2 + col];
-    u7 = _mm_sub_epi32(zero, in[2 * 5 + col]);
-
-    // stage 2
-    v0 = u0;
-    v1 = u1;
-
-    x = _mm_mullo_epi32(u2, cospi32);
-    y = _mm_mullo_epi32(u3, cospi32);
-    v2 = _mm_add_epi32(x, y);
-    v2 = _mm_add_epi32(v2, rnding);
-    v2 = _mm_srai_epi32(v2, bit);
-
-    v3 = _mm_sub_epi32(x, y);
-    v3 = _mm_add_epi32(v3, rnding);
-    v3 = _mm_srai_epi32(v3, bit);
-
-    v4 = u4;
-    v5 = u5;
-
-    x = _mm_mullo_epi32(u6, cospi32);
-    y = _mm_mullo_epi32(u7, cospi32);
-    v6 = _mm_add_epi32(x, y);
-    v6 = _mm_add_epi32(v6, rnding);
-    v6 = _mm_srai_epi32(v6, bit);
-
-    v7 = _mm_sub_epi32(x, y);
-    v7 = _mm_add_epi32(v7, rnding);
-    v7 = _mm_srai_epi32(v7, bit);
-
-    // stage 3
-    u0 = _mm_add_epi32(v0, v2);
-    u1 = _mm_add_epi32(v1, v3);
-    u2 = _mm_sub_epi32(v0, v2);
-    u3 = _mm_sub_epi32(v1, v3);
-    u4 = _mm_add_epi32(v4, v6);
-    u5 = _mm_add_epi32(v5, v7);
-    u6 = _mm_sub_epi32(v4, v6);
-    u7 = _mm_sub_epi32(v5, v7);
-
-    // stage 4
-    v0 = u0;
-    v1 = u1;
-    v2 = u2;
-    v3 = u3;
-
-    x = _mm_mullo_epi32(u4, cospi16);
-    y = _mm_mullo_epi32(u5, cospi48);
-    v4 = _mm_add_epi32(x, y);
-    v4 = _mm_add_epi32(v4, rnding);
-    v4 = _mm_srai_epi32(v4, bit);
-
-    x = _mm_mullo_epi32(u4, cospi48);
-    y = _mm_mullo_epi32(u5, cospim16);
-    v5 = _mm_add_epi32(x, y);
-    v5 = _mm_add_epi32(v5, rnding);
-    v5 = _mm_srai_epi32(v5, bit);
-
-    x = _mm_mullo_epi32(u6, cospim48);
-    y = _mm_mullo_epi32(u7, cospi16);
-    v6 = _mm_add_epi32(x, y);
-    v6 = _mm_add_epi32(v6, rnding);
-    v6 = _mm_srai_epi32(v6, bit);
-
-    x = _mm_mullo_epi32(u6, cospi16);
-    y = _mm_mullo_epi32(u7, cospi48);
-    v7 = _mm_add_epi32(x, y);
-    v7 = _mm_add_epi32(v7, rnding);
-    v7 = _mm_srai_epi32(v7, bit);
-
-    // stage 5
-    u0 = _mm_add_epi32(v0, v4);
-    u1 = _mm_add_epi32(v1, v5);
-    u2 = _mm_add_epi32(v2, v6);
-    u3 = _mm_add_epi32(v3, v7);
-    u4 = _mm_sub_epi32(v0, v4);
-    u5 = _mm_sub_epi32(v1, v5);
-    u6 = _mm_sub_epi32(v2, v6);
-    u7 = _mm_sub_epi32(v3, v7);
-
-    // stage 6
-    x = _mm_mullo_epi32(u0, cospi4);
-    y = _mm_mullo_epi32(u1, cospi60);
-    v0 = _mm_add_epi32(x, y);
-    v0 = _mm_add_epi32(v0, rnding);
-    v0 = _mm_srai_epi32(v0, bit);
+  // Even 8 points: 0, 2, ..., 14
+  // stage 0
+  // stage 1
+  // stage 2
+  // (1)
+  u[0] = _mm_mullo_epi32(in[14], cospi4);
+  x = _mm_mullo_epi32(in[0], cospi60);
+  u[0] = _mm_add_epi32(u[0], x);
+  u[0] = _mm_add_epi32(u[0], rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
+
+  u[1] = _mm_mullo_epi32(in[14], cospi60);
+  x = _mm_mullo_epi32(in[0], cospi4);
+  u[1] = _mm_sub_epi32(u[1], x);
+  u[1] = _mm_add_epi32(u[1], rnding);
+  u[1] = _mm_srai_epi32(u[1], bit);
+
+  // (2)
+  u[2] = _mm_mullo_epi32(in[10], cospi20);
+  x = _mm_mullo_epi32(in[4], cospi44);
+  u[2] = _mm_add_epi32(u[2], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
+
+  u[3] = _mm_mullo_epi32(in[10], cospi44);
+  x = _mm_mullo_epi32(in[4], cospi20);
+  u[3] = _mm_sub_epi32(u[3], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  // (3)
+  u[4] = _mm_mullo_epi32(in[6], cospi36);
+  x = _mm_mullo_epi32(in[8], cospi28);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
+
+  u[5] = _mm_mullo_epi32(in[6], cospi28);
+  x = _mm_mullo_epi32(in[8], cospi36);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
+
+  // (4)
+  u[6] = _mm_mullo_epi32(in[2], cospi52);
+  x = _mm_mullo_epi32(in[12], cospi12);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_mullo_epi32(in[2], cospi12);
+  x = _mm_mullo_epi32(in[12], cospi52);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
 
-    x = _mm_mullo_epi32(u0, cospi60);
-    y = _mm_mullo_epi32(u1, cospim4);
-    v1 = _mm_add_epi32(x, y);
-    v1 = _mm_add_epi32(v1, rnding);
-    v1 = _mm_srai_epi32(v1, bit);
+  // stage 3
+  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
 
-    x = _mm_mullo_epi32(u2, cospi20);
-    y = _mm_mullo_epi32(u3, cospi44);
-    v2 = _mm_add_epi32(x, y);
-    v2 = _mm_add_epi32(v2, rnding);
-    v2 = _mm_srai_epi32(v2, bit);
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+
+  u[4] = _mm_mullo_epi32(v[4], cospi16);
+  x = _mm_mullo_epi32(v[5], cospi48);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
+
+  u[5] = _mm_mullo_epi32(v[4], cospi48);
+  x = _mm_mullo_epi32(v[5], cospi16);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
+
+  u[6] = _mm_mullo_epi32(v[6], cospim48);
+  x = _mm_mullo_epi32(v[7], cospi16);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_mullo_epi32(v[6], cospi16);
+  x = _mm_mullo_epi32(v[7], cospim48);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 5
+  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
+
+  v[0] = _mm_mullo_epi32(v[2], cospi32);
+  x = _mm_mullo_epi32(v[3], cospi32);
+  u[2] = _mm_add_epi32(v[0], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
+
+  u[3] = _mm_sub_epi32(v[0], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  v[0] = _mm_mullo_epi32(v[6], cospi32);
+  x = _mm_mullo_epi32(v[7], cospi32);
+  u[6] = _mm_add_epi32(v[0], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_sub_epi32(v[0], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 7
+  if (do_cols) {
+    out[0] = u[0];
+    out[2] = _mm_sub_epi32(kZero, u[4]);
+    out[4] = u[6];
+    out[6] = _mm_sub_epi32(kZero, u[2]);
+    out[8] = u[3];
+    out[10] = _mm_sub_epi32(kZero, u[7]);
+    out[12] = u[5];
+    out[14] = _mm_sub_epi32(kZero, u[1]);
+  } else {
+    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo, &clamp_hi,
+                     out_shift);
+    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo, &clamp_hi,
+                     out_shift);
+    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo, &clamp_hi,
+                     out_shift);
+    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo, &clamp_hi,
+                     out_shift);
+  }
 
-    x = _mm_mullo_epi32(u2, cospi44);
-    y = _mm_mullo_epi32(u3, cospim20);
-    v3 = _mm_add_epi32(x, y);
-    v3 = _mm_add_epi32(v3, rnding);
-    v3 = _mm_srai_epi32(v3, bit);
+  // Odd 8 points: 1, 3, ..., 15
+  // stage 0
+  // stage 1
+  // stage 2
+  // (1)
+  u[0] = _mm_mullo_epi32(in[15], cospi4);
+  x = _mm_mullo_epi32(in[1], cospi60);
+  u[0] = _mm_add_epi32(u[0], x);
+  u[0] = _mm_add_epi32(u[0], rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
+
+  u[1] = _mm_mullo_epi32(in[15], cospi60);
+  x = _mm_mullo_epi32(in[1], cospi4);
+  u[1] = _mm_sub_epi32(u[1], x);
+  u[1] = _mm_add_epi32(u[1], rnding);
+  u[1] = _mm_srai_epi32(u[1], bit);
+
+  // (2)
+  u[2] = _mm_mullo_epi32(in[11], cospi20);
+  x = _mm_mullo_epi32(in[5], cospi44);
+  u[2] = _mm_add_epi32(u[2], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
+
+  u[3] = _mm_mullo_epi32(in[11], cospi44);
+  x = _mm_mullo_epi32(in[5], cospi20);
+  u[3] = _mm_sub_epi32(u[3], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  // (3)
+  u[4] = _mm_mullo_epi32(in[7], cospi36);
+  x = _mm_mullo_epi32(in[9], cospi28);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
+
+  u[5] = _mm_mullo_epi32(in[7], cospi28);
+  x = _mm_mullo_epi32(in[9], cospi36);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
+
+  // (4)
+  u[6] = _mm_mullo_epi32(in[3], cospi52);
+  x = _mm_mullo_epi32(in[13], cospi12);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_mullo_epi32(in[3], cospi12);
+  x = _mm_mullo_epi32(in[13], cospi52);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
 
-    x = _mm_mullo_epi32(u4, cospi36);
-    y = _mm_mullo_epi32(u5, cospi28);
-    v4 = _mm_add_epi32(x, y);
-    v4 = _mm_add_epi32(v4, rnding);
-    v4 = _mm_srai_epi32(v4, bit);
-
-    x = _mm_mullo_epi32(u4, cospi28);
-    y = _mm_mullo_epi32(u5, cospim36);
-    v5 = _mm_add_epi32(x, y);
-    v5 = _mm_add_epi32(v5, rnding);
-    v5 = _mm_srai_epi32(v5, bit);
-
-    x = _mm_mullo_epi32(u6, cospi52);
-    y = _mm_mullo_epi32(u7, cospi12);
-    v6 = _mm_add_epi32(x, y);
-    v6 = _mm_add_epi32(v6, rnding);
-    v6 = _mm_srai_epi32(v6, bit);
-
-    x = _mm_mullo_epi32(u6, cospi12);
-    y = _mm_mullo_epi32(u7, cospim52);
-    v7 = _mm_add_epi32(x, y);
-    v7 = _mm_add_epi32(v7, rnding);
-    v7 = _mm_srai_epi32(v7, bit);
+  // stage 3
+  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
 
-    // stage 7
-    out[2 * 0 + col] = v1;
-    out[2 * 1 + col] = v6;
-    out[2 * 2 + col] = v3;
-    out[2 * 3 + col] = v4;
-    out[2 * 4 + col] = v5;
-    out[2 * 5 + col] = v2;
-    out[2 * 6 + col] = v7;
-    out[2 * 7 + col] = v0;
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+
+  u[4] = _mm_mullo_epi32(v[4], cospi16);
+  x = _mm_mullo_epi32(v[5], cospi48);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
+
+  u[5] = _mm_mullo_epi32(v[4], cospi48);
+  x = _mm_mullo_epi32(v[5], cospi16);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
+
+  u[6] = _mm_mullo_epi32(v[6], cospim48);
+  x = _mm_mullo_epi32(v[7], cospi16);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_mullo_epi32(v[6], cospi16);
+  x = _mm_mullo_epi32(v[7], cospim48);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 5
+  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
+
+  v[0] = _mm_mullo_epi32(v[2], cospi32);
+  x = _mm_mullo_epi32(v[3], cospi32);
+  u[2] = _mm_add_epi32(v[0], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
+
+  u[3] = _mm_sub_epi32(v[0], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  v[0] = _mm_mullo_epi32(v[6], cospi32);
+  x = _mm_mullo_epi32(v[7], cospi32);
+  u[6] = _mm_add_epi32(v[0], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_sub_epi32(v[0], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 7
+  if (do_cols) {
+    out[1] = u[0];
+    out[3] = _mm_sub_epi32(kZero, u[4]);
+    out[5] = u[6];
+    out[7] = _mm_sub_epi32(kZero, u[2]);
+    out[9] = u[3];
+    out[11] = _mm_sub_epi32(kZero, u[7]);
+    out[13] = u[5];
+    out[15] = _mm_sub_epi32(kZero, u[1]);
+  } else {
+    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo, &clamp_hi,
+                     out_shift);
+    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo, &clamp_hi,
+                     out_shift);
+    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo, &clamp_hi,
+                     out_shift);
+    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo, &clamp_hi,
+                     out_shift);
   }
 }
 
@@ -708,102 +886,92 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
 void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
   __m128i in[16], out[16];
-  const TXFM_1D_CFG *row_cfg = NULL;
-  const TXFM_1D_CFG *col_cfg = NULL;
+  const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
 
   switch (tx_type) {
     case DCT_DCT:
-      row_cfg = &inv_txfm_1d_row_cfg_dct_8;
-      col_cfg = &inv_txfm_1d_col_cfg_dct_8;
       load_buffer_8x8(coeff, in);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                     -shift[0]);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case DCT_ADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_8;
-      col_cfg = &inv_txfm_1d_col_cfg_dct_8;
       load_buffer_8x8(coeff, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                      -shift[0]);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_DCT:
-      row_cfg = &inv_txfm_1d_row_cfg_dct_8;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_8;
       load_buffer_8x8(coeff, in);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                     -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_ADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_8;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_8;
       load_buffer_8x8(coeff, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                      -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
       break;
-#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      row_cfg = &inv_txfm_1d_row_cfg_dct_8;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_8;
       load_buffer_8x8(coeff, in);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                     -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case DCT_FLIPADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_8;
-      col_cfg = &inv_txfm_1d_col_cfg_dct_8;
       load_buffer_8x8(coeff, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                      -shift[0]);
       transpose_8x8(in, out);
-      idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case ADST_FLIPADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_8;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_8;
       load_buffer_8x8(coeff, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                      -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_FLIPADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_8;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_8;
       load_buffer_8x8(coeff, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                      -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_8x8(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
       break;
     case FLIPADST_ADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_8;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_8;
       load_buffer_8x8(coeff, in);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                      -shift[0]);
       transpose_8x8(in, out);
-      iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
       break;
-#endif  // CONFIG_EXT_TX
     default: assert(0);
   }
 }
@@ -868,7 +1036,8 @@ static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
   write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd);
 }
 
-static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
@@ -894,6 +1063,9 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
   __m128i u[16], v[16], x, y;
   int col;
 
@@ -945,14 +1117,10 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
     u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
     u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
     u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
-    u[8] = _mm_add_epi32(v[8], v[9]);
-    u[9] = _mm_sub_epi32(v[8], v[9]);
-    u[10] = _mm_sub_epi32(v[11], v[10]);
-    u[11] = _mm_add_epi32(v[10], v[11]);
-    u[12] = _mm_add_epi32(v[12], v[13]);
-    u[13] = _mm_sub_epi32(v[12], v[13]);
-    u[14] = _mm_sub_epi32(v[15], v[14]);
-    u[15] = _mm_add_epi32(v[14], v[15]);
+    addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
 
     // stage 4
     x = _mm_mullo_epi32(u[0], cospi32);
@@ -967,10 +1135,8 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
 
     v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
     v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
-    v[4] = _mm_add_epi32(u[4], u[5]);
-    v[5] = _mm_sub_epi32(u[4], u[5]);
-    v[6] = _mm_sub_epi32(u[7], u[6]);
-    v[7] = _mm_add_epi32(u[6], u[7]);
+    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
     v[8] = u[8];
     v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
     v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
@@ -981,10 +1147,8 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
     v[15] = u[15];
 
     // stage 5
-    u[0] = _mm_add_epi32(v[0], v[3]);
-    u[1] = _mm_add_epi32(v[1], v[2]);
-    u[2] = _mm_sub_epi32(v[1], v[2]);
-    u[3] = _mm_sub_epi32(v[0], v[3]);
+    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
     u[4] = v[4];
 
     x = _mm_mullo_epi32(v[5], cospi32);
@@ -998,24 +1162,16 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
     u[6] = _mm_srai_epi32(u[6], bit);
 
     u[7] = v[7];
-    u[8] = _mm_add_epi32(v[8], v[11]);
-    u[9] = _mm_add_epi32(v[9], v[10]);
-    u[10] = _mm_sub_epi32(v[9], v[10]);
-    u[11] = _mm_sub_epi32(v[8], v[11]);
-    u[12] = _mm_sub_epi32(v[15], v[12]);
-    u[13] = _mm_sub_epi32(v[14], v[13]);
-    u[14] = _mm_add_epi32(v[13], v[14]);
-    u[15] = _mm_add_epi32(v[12], v[15]);
+    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 
     // stage 6
-    v[0] = _mm_add_epi32(u[0], u[7]);
-    v[1] = _mm_add_epi32(u[1], u[6]);
-    v[2] = _mm_add_epi32(u[2], u[5]);
-    v[3] = _mm_add_epi32(u[3], u[4]);
-    v[4] = _mm_sub_epi32(u[3], u[4]);
-    v[5] = _mm_sub_epi32(u[2], u[5]);
-    v[6] = _mm_sub_epi32(u[1], u[6]);
-    v[7] = _mm_sub_epi32(u[0], u[7]);
+    addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
     v[8] = u[8];
     v[9] = u[9];
 
@@ -1043,386 +1199,1141 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
     v[15] = u[15];
 
     // stage 7
-    out[0 * 4 + col] = _mm_add_epi32(v[0], v[15]);
-    out[1 * 4 + col] = _mm_add_epi32(v[1], v[14]);
-    out[2 * 4 + col] = _mm_add_epi32(v[2], v[13]);
-    out[3 * 4 + col] = _mm_add_epi32(v[3], v[12]);
-    out[4 * 4 + col] = _mm_add_epi32(v[4], v[11]);
-    out[5 * 4 + col] = _mm_add_epi32(v[5], v[10]);
-    out[6 * 4 + col] = _mm_add_epi32(v[6], v[9]);
-    out[7 * 4 + col] = _mm_add_epi32(v[7], v[8]);
-    out[8 * 4 + col] = _mm_sub_epi32(v[7], v[8]);
-    out[9 * 4 + col] = _mm_sub_epi32(v[6], v[9]);
-    out[10 * 4 + col] = _mm_sub_epi32(v[5], v[10]);
-    out[11 * 4 + col] = _mm_sub_epi32(v[4], v[11]);
-    out[12 * 4 + col] = _mm_sub_epi32(v[3], v[12]);
-    out[13 * 4 + col] = _mm_sub_epi32(v[2], v[13]);
-    out[14 * 4 + col] = _mm_sub_epi32(v[1], v[14]);
-    out[15 * 4 + col] = _mm_sub_epi32(v[0], v[15]);
+    if (do_cols) {
+      addsub_no_clamp_sse4_1(v[0], v[15], out + 0 * 4 + col,
+                             out + 15 * 4 + col);
+      addsub_no_clamp_sse4_1(v[1], v[14], out + 1 * 4 + col,
+                             out + 14 * 4 + col);
+      addsub_no_clamp_sse4_1(v[2], v[13], out + 2 * 4 + col,
+                             out + 13 * 4 + col);
+      addsub_no_clamp_sse4_1(v[3], v[12], out + 3 * 4 + col,
+                             out + 12 * 4 + col);
+      addsub_no_clamp_sse4_1(v[4], v[11], out + 4 * 4 + col,
+                             out + 11 * 4 + col);
+      addsub_no_clamp_sse4_1(v[5], v[10], out + 5 * 4 + col,
+                             out + 10 * 4 + col);
+      addsub_no_clamp_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col);
+      addsub_no_clamp_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col);
+    } else {
+      addsub_shift_sse4_1(v[0], v[15], out + 0 * 4 + col, out + 15 * 4 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_sse4_1(v[1], v[14], out + 1 * 4 + col, out + 14 * 4 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_sse4_1(v[2], v[13], out + 2 * 4 + col, out + 13 * 4 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_sse4_1(v[3], v[12], out + 3 * 4 + col, out + 12 * 4 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_sse4_1(v[4], v[11], out + 4 * 4 + col, out + 11 * 4 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_sse4_1(v[5], v[10], out + 5 * 4 + col, out + 10 * 4 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+      addsub_shift_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col,
+                          &clamp_lo, &clamp_hi, out_shift);
+    }
   }
 }
 
-static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
   const int32_t *cospi = cospi_arr(bit);
-  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
-  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
-  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
-  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
-  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
-  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
-  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
-  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
-  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
-  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
-  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
-  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
-  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
-  const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
-  const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
-  const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
-  const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
   const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
-  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
   const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
-  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
   const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
-  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
   const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
-  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const __m128i zero = _mm_setzero_si128();
-
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
   __m128i u[16], v[16], x, y;
+  const int col_num = 4;
   int col;
 
-  for (col = 0; col < 4; ++col) {
+  // Calculate the column 0, 1, 2, 3
+  for (col = 0; col < col_num; ++col) {
     // stage 0
     // stage 1
-    u[0] = in[0 * 4 + col];
-    u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
-    u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
-    u[3] = in[8 * 4 + col];
-    u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
-    u[5] = in[12 * 4 + col];
-    u[6] = in[4 * 4 + col];
-    u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
-    u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
-    u[9] = in[14 * 4 + col];
-    u[10] = in[6 * 4 + col];
-    u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
-    u[12] = in[2 * 4 + col];
-    u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
-    u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
-    u[15] = in[10 * 4 + col];
-
     // stage 2
-    v[0] = u[0];
-    v[1] = u[1];
+    v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
+    x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
+    v[0] = _mm_add_epi32(v[0], x);
+    v[0] = _mm_add_epi32(v[0], rnding);
+    v[0] = _mm_srai_epi32(v[0], bit);
 
-    x = _mm_mullo_epi32(u[2], cospi32);
-    y = _mm_mullo_epi32(u[3], cospi32);
-    v[2] = _mm_add_epi32(x, y);
+    v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
+    x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
+    v[1] = _mm_sub_epi32(v[1], x);
+    v[1] = _mm_add_epi32(v[1], rnding);
+    v[1] = _mm_srai_epi32(v[1], bit);
+
+    v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
+    x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
+    v[2] = _mm_add_epi32(v[2], x);
     v[2] = _mm_add_epi32(v[2], rnding);
     v[2] = _mm_srai_epi32(v[2], bit);
 
-    v[3] = _mm_sub_epi32(x, y);
+    v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
+    x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
+    v[3] = _mm_sub_epi32(v[3], x);
     v[3] = _mm_add_epi32(v[3], rnding);
     v[3] = _mm_srai_epi32(v[3], bit);
 
-    v[4] = u[4];
-    v[5] = u[5];
-
-    x = _mm_mullo_epi32(u[6], cospi32);
-    y = _mm_mullo_epi32(u[7], cospi32);
-    v[6] = _mm_add_epi32(x, y);
+    v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
+    x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
+    v[4] = _mm_add_epi32(v[4], x);
+    v[4] = _mm_add_epi32(v[4], rnding);
+    v[4] = _mm_srai_epi32(v[4], bit);
+
+    v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
+    x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
+    v[5] = _mm_sub_epi32(v[5], x);
+    v[5] = _mm_add_epi32(v[5], rnding);
+    v[5] = _mm_srai_epi32(v[5], bit);
+
+    v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
+    x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
+    v[6] = _mm_add_epi32(v[6], x);
     v[6] = _mm_add_epi32(v[6], rnding);
     v[6] = _mm_srai_epi32(v[6], bit);
 
-    v[7] = _mm_sub_epi32(x, y);
+    v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
+    x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
+    v[7] = _mm_sub_epi32(v[7], x);
     v[7] = _mm_add_epi32(v[7], rnding);
     v[7] = _mm_srai_epi32(v[7], bit);
 
-    v[8] = u[8];
-    v[9] = u[9];
-
-    x = _mm_mullo_epi32(u[10], cospi32);
-    y = _mm_mullo_epi32(u[11], cospi32);
-    v[10] = _mm_add_epi32(x, y);
+    v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
+    x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
+    v[8] = _mm_add_epi32(v[8], x);
+    v[8] = _mm_add_epi32(v[8], rnding);
+    v[8] = _mm_srai_epi32(v[8], bit);
+
+    v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
+    x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
+    v[9] = _mm_sub_epi32(v[9], x);
+    v[9] = _mm_add_epi32(v[9], rnding);
+    v[9] = _mm_srai_epi32(v[9], bit);
+
+    v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
+    x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
+    v[10] = _mm_add_epi32(v[10], x);
     v[10] = _mm_add_epi32(v[10], rnding);
     v[10] = _mm_srai_epi32(v[10], bit);
 
-    v[11] = _mm_sub_epi32(x, y);
+    v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
+    x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
+    v[11] = _mm_sub_epi32(v[11], x);
     v[11] = _mm_add_epi32(v[11], rnding);
     v[11] = _mm_srai_epi32(v[11], bit);
 
-    v[12] = u[12];
-    v[13] = u[13];
+    v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
+    x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
 
-    x = _mm_mullo_epi32(u[14], cospi32);
-    y = _mm_mullo_epi32(u[15], cospi32);
-    v[14] = _mm_add_epi32(x, y);
+    v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
+    x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
+    x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
+    v[14] = _mm_add_epi32(v[14], x);
     v[14] = _mm_add_epi32(v[14], rnding);
     v[14] = _mm_srai_epi32(v[14], bit);
 
-    v[15] = _mm_sub_epi32(x, y);
+    v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
+    x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
+    v[15] = _mm_sub_epi32(v[15], x);
     v[15] = _mm_add_epi32(v[15], rnding);
     v[15] = _mm_srai_epi32(v[15], bit);
 
     // stage 3
-    u[0] = _mm_add_epi32(v[0], v[2]);
-    u[1] = _mm_add_epi32(v[1], v[3]);
-    u[2] = _mm_sub_epi32(v[0], v[2]);
-    u[3] = _mm_sub_epi32(v[1], v[3]);
-    u[4] = _mm_add_epi32(v[4], v[6]);
-    u[5] = _mm_add_epi32(v[5], v[7]);
-    u[6] = _mm_sub_epi32(v[4], v[6]);
-    u[7] = _mm_sub_epi32(v[5], v[7]);
-    u[8] = _mm_add_epi32(v[8], v[10]);
-    u[9] = _mm_add_epi32(v[9], v[11]);
-    u[10] = _mm_sub_epi32(v[8], v[10]);
-    u[11] = _mm_sub_epi32(v[9], v[11]);
-    u[12] = _mm_add_epi32(v[12], v[14]);
-    u[13] = _mm_add_epi32(v[13], v[15]);
-    u[14] = _mm_sub_epi32(v[12], v[14]);
-    u[15] = _mm_sub_epi32(v[13], v[15]);
+    addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 
     // stage 4
     v[0] = u[0];
     v[1] = u[1];
     v[2] = u[2];
     v[3] = u[3];
-    v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
-    v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
-    v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
-    v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
-    v[8] = u[8];
-    v[9] = u[9];
-    v[10] = u[10];
-    v[11] = u[11];
-    v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
-    v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
-    v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
-    v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = _mm_mullo_epi32(u[8], cospi8);
+    x = _mm_mullo_epi32(u[9], cospi56);
+    v[8] = _mm_add_epi32(v[8], x);
+    v[8] = _mm_add_epi32(v[8], rnding);
+    v[8] = _mm_srai_epi32(v[8], bit);
+
+    v[9] = _mm_mullo_epi32(u[8], cospi56);
+    x = _mm_mullo_epi32(u[9], cospi8);
+    v[9] = _mm_sub_epi32(v[9], x);
+    v[9] = _mm_add_epi32(v[9], rnding);
+    v[9] = _mm_srai_epi32(v[9], bit);
+
+    v[10] = _mm_mullo_epi32(u[10], cospi40);
+    x = _mm_mullo_epi32(u[11], cospi24);
+    v[10] = _mm_add_epi32(v[10], x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_mullo_epi32(u[10], cospi24);
+    x = _mm_mullo_epi32(u[11], cospi40);
+    v[11] = _mm_sub_epi32(v[11], x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_mullo_epi32(u[12], cospim56);
+    x = _mm_mullo_epi32(u[13], cospi8);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[13] = _mm_mullo_epi32(u[12], cospi8);
+    x = _mm_mullo_epi32(u[13], cospim56);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(u[14], cospim24);
+    x = _mm_mullo_epi32(u[15], cospi40);
+    v[14] = _mm_add_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_mullo_epi32(u[14], cospi40);
+    x = _mm_mullo_epi32(u[15], cospim24);
+    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
 
     // stage 5
-    u[0] = _mm_add_epi32(v[0], v[4]);
-    u[1] = _mm_add_epi32(v[1], v[5]);
-    u[2] = _mm_add_epi32(v[2], v[6]);
-    u[3] = _mm_add_epi32(v[3], v[7]);
-    u[4] = _mm_sub_epi32(v[0], v[4]);
-    u[5] = _mm_sub_epi32(v[1], v[5]);
-    u[6] = _mm_sub_epi32(v[2], v[6]);
-    u[7] = _mm_sub_epi32(v[3], v[7]);
-    u[8] = _mm_add_epi32(v[8], v[12]);
-    u[9] = _mm_add_epi32(v[9], v[13]);
-    u[10] = _mm_add_epi32(v[10], v[14]);
-    u[11] = _mm_add_epi32(v[11], v[15]);
-    u[12] = _mm_sub_epi32(v[8], v[12]);
-    u[13] = _mm_sub_epi32(v[9], v[13]);
-    u[14] = _mm_sub_epi32(v[10], v[14]);
-    u[15] = _mm_sub_epi32(v[11], v[15]);
+    addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
 
     // stage 6
     v[0] = u[0];
     v[1] = u[1];
     v[2] = u[2];
     v[3] = u[3];
-    v[4] = u[4];
-    v[5] = u[5];
-    v[6] = u[6];
-    v[7] = u[7];
-    v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
-    v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
-    v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
-    v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
-    v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
-    v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
-    v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
-    v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+    v[4] = _mm_mullo_epi32(u[4], cospi16);
+    x = _mm_mullo_epi32(u[5], cospi48);
+    v[4] = _mm_add_epi32(v[4], x);
+    v[4] = _mm_add_epi32(v[4], rnding);
+    v[4] = _mm_srai_epi32(v[4], bit);
+
+    v[5] = _mm_mullo_epi32(u[4], cospi48);
+    x = _mm_mullo_epi32(u[5], cospi16);
+    v[5] = _mm_sub_epi32(v[5], x);
+    v[5] = _mm_add_epi32(v[5], rnding);
+    v[5] = _mm_srai_epi32(v[5], bit);
+
+    v[6] = _mm_mullo_epi32(u[6], cospim48);
+    x = _mm_mullo_epi32(u[7], cospi16);
+    v[6] = _mm_add_epi32(v[6], x);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_mullo_epi32(u[6], cospi16);
+    x = _mm_mullo_epi32(u[7], cospim48);
+    v[7] = _mm_sub_epi32(v[7], x);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+    v[10] = u[10];
+    v[11] = u[11];
+
+    v[12] = _mm_mullo_epi32(u[12], cospi16);
+    x = _mm_mullo_epi32(u[13], cospi48);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[13] = _mm_mullo_epi32(u[12], cospi48);
+    x = _mm_mullo_epi32(u[13], cospi16);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(u[14], cospim48);
+    x = _mm_mullo_epi32(u[15], cospi16);
+    v[14] = _mm_add_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_mullo_epi32(u[14], cospi16);
+    x = _mm_mullo_epi32(u[15], cospim48);
+    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
 
     // stage 7
-    u[0] = _mm_add_epi32(v[0], v[8]);
-    u[1] = _mm_add_epi32(v[1], v[9]);
-    u[2] = _mm_add_epi32(v[2], v[10]);
-    u[3] = _mm_add_epi32(v[3], v[11]);
-    u[4] = _mm_add_epi32(v[4], v[12]);
-    u[5] = _mm_add_epi32(v[5], v[13]);
-    u[6] = _mm_add_epi32(v[6], v[14]);
-    u[7] = _mm_add_epi32(v[7], v[15]);
-    u[8] = _mm_sub_epi32(v[0], v[8]);
-    u[9] = _mm_sub_epi32(v[1], v[9]);
-    u[10] = _mm_sub_epi32(v[2], v[10]);
-    u[11] = _mm_sub_epi32(v[3], v[11]);
-    u[12] = _mm_sub_epi32(v[4], v[12]);
-    u[13] = _mm_sub_epi32(v[5], v[13]);
-    u[14] = _mm_sub_epi32(v[6], v[14]);
-    u[15] = _mm_sub_epi32(v[7], v[15]);
+    addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
 
     // stage 8
-    v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
-    v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
-    v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
-    v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
-    v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
-    v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
-    v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
-    v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
-    v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
-    v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
-    v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
-    v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
-    v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
-    v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
-    v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
-    v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+    v[0] = u[0];
+    v[1] = u[1];
+
+    y = _mm_mullo_epi32(u[2], cospi32);
+    x = _mm_mullo_epi32(u[3], cospi32);
+    v[2] = _mm_add_epi32(y, x);
+    v[2] = _mm_add_epi32(v[2], rnding);
+    v[2] = _mm_srai_epi32(v[2], bit);
+
+    v[3] = _mm_sub_epi32(y, x);
+    v[3] = _mm_add_epi32(v[3], rnding);
+    v[3] = _mm_srai_epi32(v[3], bit);
+
+    v[4] = u[4];
+    v[5] = u[5];
+
+    y = _mm_mullo_epi32(u[6], cospi32);
+    x = _mm_mullo_epi32(u[7], cospi32);
+    v[6] = _mm_add_epi32(y, x);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_sub_epi32(y, x);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+
+    y = _mm_mullo_epi32(u[10], cospi32);
+    x = _mm_mullo_epi32(u[11], cospi32);
+    v[10] = _mm_add_epi32(y, x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_sub_epi32(y, x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = u[12];
+    v[13] = u[13];
+
+    y = _mm_mullo_epi32(u[14], cospi32);
+    x = _mm_mullo_epi32(u[15], cospi32);
+    v[14] = _mm_add_epi32(y, x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_sub_epi32(y, x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
 
     // stage 9
-    out[0 * 4 + col] = v[1];
-    out[1 * 4 + col] = v[14];
-    out[2 * 4 + col] = v[3];
-    out[3 * 4 + col] = v[12];
-    out[4 * 4 + col] = v[5];
-    out[5 * 4 + col] = v[10];
-    out[6 * 4 + col] = v[7];
-    out[7 * 4 + col] = v[8];
-    out[8 * 4 + col] = v[9];
-    out[9 * 4 + col] = v[6];
-    out[10 * 4 + col] = v[11];
-    out[11 * 4 + col] = v[4];
-    out[12 * 4 + col] = v[13];
-    out[13 * 4 + col] = v[2];
-    out[14 * 4 + col] = v[15];
-    out[15 * 4 + col] = v[0];
+    if (do_cols) {
+      out[0 * col_num + col] = v[0];
+      out[1 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+      out[2 * col_num + col] = v[12];
+      out[3 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+      out[4 * col_num + col] = v[6];
+      out[5 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+      out[6 * col_num + col] = v[10];
+      out[7 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+      out[8 * col_num + col] = v[3];
+      out[9 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+      out[10 * col_num + col] = v[15];
+      out[11 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+      out[12 * col_num + col] = v[5];
+      out[13 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+      out[14 * col_num + col] = v[9];
+      out[15 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+    } else {
+      neg_shift_sse4_1(v[0], v[8], out + 0 * col_num + col,
+                       out + 1 * col_num + col, &clamp_lo, &clamp_hi,
+                       out_shift);
+      neg_shift_sse4_1(v[12], v[4], out + 2 * col_num + col,
+                       out + 3 * col_num + col, &clamp_lo, &clamp_hi,
+                       out_shift);
+      neg_shift_sse4_1(v[6], v[14], out + 4 * col_num + col,
+                       out + 5 * col_num + col, &clamp_lo, &clamp_hi,
+                       out_shift);
+      neg_shift_sse4_1(v[10], v[2], out + 6 * col_num + col,
+                       out + 7 * col_num + col, &clamp_lo, &clamp_hi,
+                       out_shift);
+      neg_shift_sse4_1(v[3], v[11], out + 8 * col_num + col,
+                       out + 9 * col_num + col, &clamp_lo, &clamp_hi,
+                       out_shift);
+      neg_shift_sse4_1(v[15], v[7], out + 10 * col_num + col,
+                       out + 11 * col_num + col, &clamp_lo, &clamp_hi,
+                       out_shift);
+      neg_shift_sse4_1(v[5], v[13], out + 12 * col_num + col,
+                       out + 13 * col_num + col, &clamp_lo, &clamp_hi,
+                       out_shift);
+      neg_shift_sse4_1(v[9], v[1], out + 14 * col_num + col,
+                       out + 15 * col_num + col, &clamp_lo, &clamp_hi,
+                       out_shift);
+    }
   }
 }
 
-static void round_shift_16x16(__m128i *in, int shift) {
-  round_shift_8x8(&in[0], shift);
-  round_shift_8x8(&in[16], shift);
-  round_shift_8x8(&in[32], shift);
-  round_shift_8x8(&in[48], shift);
-}
-
 void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
                                      int stride, TX_TYPE tx_type, int bd) {
   __m128i in[64], out[64];
-  const TXFM_1D_CFG *row_cfg = NULL;
-  const TXFM_1D_CFG *col_cfg = NULL;
+  const int8_t *shift = inv_txfm_shift_ls[TX_16X16];
+  const int txw_idx = get_txw_idx(TX_16X16);
+  const int txh_idx = get_txh_idx(TX_16X16);
 
   switch (tx_type) {
     case DCT_DCT:
-      row_cfg = &inv_txfm_1d_row_cfg_dct_16;
-      col_cfg = &inv_txfm_1d_col_cfg_dct_16;
       load_buffer_16x16(coeff, in);
       transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
-      round_shift_16x16(in, -row_cfg->shift[0]);
+      idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                       -shift[0]);
       transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case DCT_ADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_16;
-      col_cfg = &inv_txfm_1d_col_cfg_dct_16;
       load_buffer_16x16(coeff, in);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
-      round_shift_16x16(in, -row_cfg->shift[0]);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                        -shift[0]);
       transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_DCT:
-      row_cfg = &inv_txfm_1d_row_cfg_dct_16;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_16;
       load_buffer_16x16(coeff, in);
       transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
-      round_shift_16x16(in, -row_cfg->shift[0]);
+      idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                       -shift[0]);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
       break;
     case ADST_ADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_16;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_16;
       load_buffer_16x16(coeff, in);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
-      round_shift_16x16(in, -row_cfg->shift[0]);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                        -shift[0]);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
       break;
-#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      row_cfg = &inv_txfm_1d_row_cfg_dct_16;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_16;
       load_buffer_16x16(coeff, in);
       transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
-      round_shift_16x16(in, -row_cfg->shift[0]);
+      idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                       -shift[0]);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
       break;
     case DCT_FLIPADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_16;
-      col_cfg = &inv_txfm_1d_col_cfg_dct_16;
       load_buffer_16x16(coeff, in);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
-      round_shift_16x16(in, -row_cfg->shift[0]);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                        -shift[0]);
       transpose_16x16(in, out);
-      idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+      idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case ADST_FLIPADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_16;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_16;
       load_buffer_16x16(coeff, in);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
-      round_shift_16x16(in, -row_cfg->shift[0]);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                        -shift[0]);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
       break;
     case FLIPADST_FLIPADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_16;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_16;
       load_buffer_16x16(coeff, in);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
-      round_shift_16x16(in, -row_cfg->shift[0]);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                        -shift[0]);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_16x16(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_16x16(in, output, stride, 1, 1, -shift[1], bd);
       break;
     case FLIPADST_ADST:
-      row_cfg = &inv_txfm_1d_row_cfg_adst_16;
-      col_cfg = &inv_txfm_1d_col_cfg_adst_16;
       load_buffer_16x16(coeff, in);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
-      round_shift_16x16(in, -row_cfg->shift[0]);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                        -shift[0]);
       transpose_16x16(in, out);
-      iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
-      write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+      iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
       break;
-#endif
     default: assert(0);
   }
 }
+
+static void load_buffer_64x64_lower_32x32(const int32_t *coeff, __m128i *in) {
+  int i, j;
+
+  __m128i zero = _mm_setzero_si128();
+
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 8; ++j) {
+      in[16 * i + j] =
+          _mm_loadu_si128((const __m128i *)(coeff + 32 * i + 4 * j));
+      in[16 * i + j + 8] = zero;
+    }
+  }
+
+  for (i = 0; i < 512; ++i) in[512 + i] = zero;
+}
+
+static void transpose_64x64(__m128i *in, __m128i *out, int do_cols) {
+  int i, j;
+  for (i = 0; i < (do_cols ? 16 : 8); ++i) {
+    for (j = 0; j < 8; ++j) {
+      TRANSPOSE_4X4(in[(4 * i + 0) * 16 + j], in[(4 * i + 1) * 16 + j],
+                    in[(4 * i + 2) * 16 + j], in[(4 * i + 3) * 16 + j],
+                    out[(4 * j + 0) * 16 + i], out[(4 * j + 1) * 16 + i],
+                    out[(4 * j + 2) * 16 + i], out[(4 * j + 3) * 16 + i]);
+    }
+  }
+}
+
+static void assign_16x16_input_from_32x32(const __m128i *in, __m128i *in16x16,
+                                          int col) {
+  int i;
+  for (i = 0; i < 16 * 16 / 4; i += 4) {
+    in16x16[i] = in[col];
+    in16x16[i + 1] = in[col + 1];
+    in16x16[i + 2] = in[col + 2];
+    in16x16[i + 3] = in[col + 3];
+    col += 8;
+  }
+}
+
+static void write_buffer_32x32(__m128i *in, uint16_t *output, int stride,
+                               int fliplr, int flipud, int shift, int bd) {
+  __m128i in16x16[16 * 16 / 4];
+  uint16_t *leftUp = &output[0];
+  uint16_t *rightUp = &output[16];
+  uint16_t *leftDown = &output[16 * stride];
+  uint16_t *rightDown = &output[16 * stride + 16];
+
+  if (fliplr) {
+    swap_addr(&leftUp, &rightUp);
+    swap_addr(&leftDown, &rightDown);
+  }
+
+  if (flipud) {
+    swap_addr(&leftUp, &leftDown);
+    swap_addr(&rightUp, &rightDown);
+  }
+
+  // Left-up quarter
+  assign_16x16_input_from_32x32(in, in16x16, 0);
+  write_buffer_16x16(in16x16, leftUp, stride, fliplr, flipud, shift, bd);
+
+  // Right-up quarter
+  assign_16x16_input_from_32x32(in, in16x16, 32 / 2 / 4);
+  write_buffer_16x16(in16x16, rightUp, stride, fliplr, flipud, shift, bd);
+
+  // Left-down quarter
+  assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4);
+  write_buffer_16x16(in16x16, leftDown, stride, fliplr, flipud, shift, bd);
+
+  // Right-down quarter
+  assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4 + 32 / 2 / 4);
+  write_buffer_16x16(in16x16, rightDown, stride, fliplr, flipud, shift, bd);
+}
+
+static void assign_32x32_input_from_64x64(const __m128i *in, __m128i *in32x32,
+                                          int col) {
+  int i;
+  for (i = 0; i < 32 * 32 / 4; i += 8) {
+    in32x32[i] = in[col];
+    in32x32[i + 1] = in[col + 1];
+    in32x32[i + 2] = in[col + 2];
+    in32x32[i + 3] = in[col + 3];
+    in32x32[i + 4] = in[col + 4];
+    in32x32[i + 5] = in[col + 5];
+    in32x32[i + 6] = in[col + 6];
+    in32x32[i + 7] = in[col + 7];
+    col += 16;
+  }
+}
+
+static void write_buffer_64x64(__m128i *in, uint16_t *output, int stride,
+                               int fliplr, int flipud, int shift, int bd) {
+  __m128i in32x32[32 * 32 / 4];
+  uint16_t *leftUp = &output[0];
+  uint16_t *rightUp = &output[32];
+  uint16_t *leftDown = &output[32 * stride];
+  uint16_t *rightDown = &output[32 * stride + 32];
+
+  if (fliplr) {
+    swap_addr(&leftUp, &rightUp);
+    swap_addr(&leftDown, &rightDown);
+  }
+
+  if (flipud) {
+    swap_addr(&leftUp, &leftDown);
+    swap_addr(&rightUp, &rightDown);
+  }
+
+  // Left-up quarter
+  assign_32x32_input_from_64x64(in, in32x32, 0);
+  write_buffer_32x32(in32x32, leftUp, stride, fliplr, flipud, shift, bd);
+
+  // Right-up quarter
+  assign_32x32_input_from_64x64(in, in32x32, 64 / 2 / 4);
+  write_buffer_32x32(in32x32, rightUp, stride, fliplr, flipud, shift, bd);
+
+  // Left-down quarter
+  assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4);
+  write_buffer_32x32(in32x32, leftDown, stride, fliplr, flipud, shift, bd);
+
+  // Right-down quarter
+  assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4 + 64 / 2 / 4);
+  write_buffer_32x32(in32x32, rightDown, stride, fliplr, flipud, shift, bd);
+}
+
+static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  int col;
+
+  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
+  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
+  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+  const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+  for (col = 0; col < (do_cols ? 64 / 4 : 32 / 4); ++col) {
+    __m128i u[64], v[64];
+
+    // stage 1
+    u[32] = in[1 * 16 + col];
+    u[34] = in[17 * 16 + col];
+    u[36] = in[9 * 16 + col];
+    u[38] = in[25 * 16 + col];
+    u[40] = in[5 * 16 + col];
+    u[42] = in[21 * 16 + col];
+    u[44] = in[13 * 16 + col];
+    u[46] = in[29 * 16 + col];
+    u[48] = in[3 * 16 + col];
+    u[50] = in[19 * 16 + col];
+    u[52] = in[11 * 16 + col];
+    u[54] = in[27 * 16 + col];
+    u[56] = in[7 * 16 + col];
+    u[58] = in[23 * 16 + col];
+    u[60] = in[15 * 16 + col];
+    u[62] = in[31 * 16 + col];
+
+    v[16] = in[2 * 16 + col];
+    v[18] = in[18 * 16 + col];
+    v[20] = in[10 * 16 + col];
+    v[22] = in[26 * 16 + col];
+    v[24] = in[6 * 16 + col];
+    v[26] = in[22 * 16 + col];
+    v[28] = in[14 * 16 + col];
+    v[30] = in[30 * 16 + col];
+
+    u[8] = in[4 * 16 + col];
+    u[10] = in[20 * 16 + col];
+    u[12] = in[12 * 16 + col];
+    u[14] = in[28 * 16 + col];
+
+    v[4] = in[8 * 16 + col];
+    v[6] = in[24 * 16 + col];
+
+    u[0] = in[0 * 16 + col];
+    u[2] = in[16 * 16 + col];
+
+    // stage 2
+    v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+    v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
+    v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
+    v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+    v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+    v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
+    v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
+    v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+    v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+    v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
+    v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
+    v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+    v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+    v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
+    v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
+    v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+    v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+    v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
+    v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
+    v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+    v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+    v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
+    v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
+    v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+    v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+    v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
+    v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
+    v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+    v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+    v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
+    v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
+    v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+
+    // stage 3
+    u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
+    u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
+    u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
+    u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
+    u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
+    u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
+    u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
+    u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
+    u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
+    u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
+    u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
+    u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
+    u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
+    u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
+    u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
+    u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);
+
+    for (i = 32; i < 64; i += 4) {
+      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    // stage 4
+    v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+    v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+    v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+    v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+    v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+    v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+    v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+    v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+
+    for (i = 16; i < 32; i += 4) {
+      addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 32; i < 64; i += 4) {
+      v[i + 0] = u[i + 0];
+      v[i + 3] = u[i + 3];
+    }
+
+    v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+    v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+    v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+    v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+    v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+    v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+    v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+    v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+    v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+    v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+    v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+    v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+    v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+    v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+
+    // stage 5
+    u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
+    u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
+    u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
+    u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
+
+    for (i = 8; i < 16; i += 4) {
+      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 16; i < 32; i += 4) {
+      u[i + 0] = v[i + 0];
+      u[i + 3] = v[i + 3];
+    }
+
+    u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
+    u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
+    u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
+    u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
+    u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
+    u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+
+    for (i = 32; i < 64; i += 8) {
+      addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    // stage 6
+    v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+    v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+
+    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+    for (i = 8; i < 16; i += 4) {
+      v[i + 0] = u[i + 0];
+      v[i + 3] = u[i + 3];
+    }
+
+    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+
+    for (i = 16; i < 32; i += 8) {
+      addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 32; i < 64; i += 8) {
+      v[i + 0] = u[i + 0];
+      v[i + 1] = u[i + 1];
+      v[i + 6] = u[i + 6];
+      v[i + 7] = u[i + 7];
+    }
+
+    v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+    v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+    v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+    v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+    v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+    v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+    v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+    v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+    v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+    v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+    v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+    v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+    v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+    v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+
+    // stage 7
+    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+    u[4] = v[4];
+    u[7] = v[7];
+    u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
+    u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+
+    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    for (i = 16; i < 32; i += 8) {
+      u[i + 0] = v[i + 0];
+      u[i + 1] = v[i + 1];
+      u[i + 6] = v[i + 6];
+      u[i + 7] = v[i + 7];
+    }
+
+    u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
+    u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
+    u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
+    u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
+    u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
+    u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                      &clamp_hi);
+      }
+    }
+
+    // stage 8
+    for (i = 0; i < 4; ++i) {
+      addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+    }
+
+    v[8] = u[8];
+    v[9] = u[9];
+    v[14] = u[14];
+    v[15] = u[15];
+
+    v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
+    v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
+    v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
+    v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+
+    for (i = 16; i < 20; ++i) {
+      addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+      addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    for (i = 32; i < 36; ++i) {
+      v[i] = u[i];
+      v[i + 12] = u[i + 12];
+      v[i + 16] = u[i + 16];
+      v[i + 28] = u[i + 28];
+    }
+
+    v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
+    v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
+    v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
+    v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
+    v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
+    v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
+    v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
+    v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
+    v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
+    v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
+    v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
+    v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
+    v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
+    v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+
+    // stage 9
+    for (i = 0; i < 8; ++i) {
+      addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+    }
+
+    for (i = 16; i < 20; ++i) {
+      u[i] = v[i];
+      u[i + 12] = v[i + 12];
+    }
+
+    u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
+    u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
+    u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
+    u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
+    u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
+    u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
+    u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+
+    for (i = 32; i < 40; i++) {
+      addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+    }
+
+    for (i = 48; i < 56; i++) {
+      addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+    }
+
+    // stage 10
+    for (i = 0; i < 16; i++) {
+      addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+    }
+
+    for (i = 32; i < 40; i++) v[i] = u[i];
+
+    v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
+    v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
+    v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+    v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+    v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+    v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+    v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+    v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+    v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+    v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+    v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+    v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+    v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+    v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+    v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+
+    for (i = 56; i < 64; i++) v[i] = u[i];
+
+    // stage 11
+    if (do_cols) {
+      for (i = 0; i < 32; i++) {
+        addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
+                               &out[16 * (63 - i) + col]);
+      }
+    } else {
+      for (i = 0; i < 32; i++) {
+        addsub_shift_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
+                            &out[16 * (63 - i) + col], &clamp_lo, &clamp_hi,
+                            out_shift);
+      }
+    }
+  }
+}
+
+void av1_inv_txfm2d_add_64x64_sse4_1(const int32_t *coeff, uint16_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  __m128i in[64 * 64 / 4], out[64 * 64 / 4];
+  const int8_t *shift = inv_txfm_shift_ls[TX_64X64];
+  const int txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
+  const int txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_64x64_lower_32x32(coeff, in);
+      transpose_64x64(in, out, 0);
+      idct64x64_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                       -shift[0]);
+      transpose_64x64(in, out, 1);
+      idct64x64_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+      write_buffer_64x64(in, output, stride, 0, 0, -shift[1], bd);
+      break;
+
+    default:
+      av1_inv_txfm2d_add_64x64_c(coeff, output, stride, tx_type, bd);
+      break;
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
new file mode 100644
index 000000000..89d0ecb1e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -0,0 +1,853 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_jnt_convolve_2d_copy_avx2(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, InterpFilterParams *filter_params_x,
+    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi32(w0);
+  const __m256i wt1 = _mm256_set1_epi32(w1);
+  const __m256i zero = _mm256_setzero_si256();
+  int i, j;
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi32(offset);
+  const __m256i offset_const_16b = _mm256_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+  const __m256i clip_pixel_to_bd =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+  assert(bits <= 4);
+
+  if (!(w % 16)) {
+    for (i = 0; i < h; i += 1) {
+      for (j = 0; j < w; j += 16) {
+        const __m256i src_16bit =
+            _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));
+
+        const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+
+        if (do_average) {
+          const __m256i data_0 =
+              _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
+
+          const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);
+          const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);
+
+          const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+          const __m256i res_unsigned_lo =
+              _mm256_add_epi32(res_32b_lo, offset_const);
+
+          const __m256i comp_avg_res_lo = highbd_comp_avg(
+              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+          const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+          const __m256i res_unsigned_hi =
+              _mm256_add_epi32(res_32b_hi, offset_const);
+
+          const __m256i comp_avg_res_hi = highbd_comp_avg(
+              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+          const __m256i round_result_lo = highbd_convolve_rounding(
+              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+          const __m256i round_result_hi = highbd_convolve_rounding(
+              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+          const __m256i res_16b =
+              _mm256_packus_epi32(round_result_lo, round_result_hi);
+          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+          _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
+        } else {
+          const __m256i res_unsigned_16b =
+              _mm256_adds_epu16(res, offset_const_16b);
+
+          _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
+                             res_unsigned_16b);
+        }
+      }
+    }
+  } else if (!(w % 4)) {
+    for (i = 0; i < h; i += 2) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i src_row_0 =
+            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+        const __m128i src_row_1 =
+            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
+        // since not all compilers yet support _mm256_set_m128i()
+        const __m256i src_10 = _mm256_insertf128_si256(
+            _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+
+        const __m256i res = _mm256_sll_epi16(src_10, left_shift);
+
+        if (w - j < 8) {
+          if (do_average) {
+            const __m256i data_0 = _mm256_castsi128_si256(
+                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+            const __m256i data_01 =
+                _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+            const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
+            const __m256i res_unsigned_lo =
+                _mm256_add_epi32(res_32b, offset_const);
+
+            const __m256i comp_avg_res = highbd_comp_avg(
+                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+            const __m256i round_result = highbd_convolve_rounding(
+                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+            const __m256i res_16b =
+                _mm256_packus_epi32(round_result, round_result);
+            const __m256i res_clip =
+                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storel_epi64(
+                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+          } else {
+            const __m256i res_unsigned_16b =
+                _mm256_adds_epu16(res, offset_const_16b);
+
+            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                             res_1);
+          }
+        } else {
+          if (do_average) {
+            const __m256i data_0 = _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+            const __m256i data_01 =
+                _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+            const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+            const __m256i res_unsigned_lo =
+                _mm256_add_epi32(res_32b_lo, offset_const);
+
+            const __m256i comp_avg_res_lo = highbd_comp_avg(
+                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+            const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+            const __m256i res_unsigned_hi =
+                _mm256_add_epi32(res_32b_hi, offset_const);
+
+            const __m256i comp_avg_res_hi = highbd_comp_avg(
+                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+            const __m256i round_result_lo =
+                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+                                         &rounding_const, rounding_shift);
+            const __m256i round_result_hi =
+                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+                                         &rounding_const, rounding_shift);
+
+            const __m256i res_16b =
+                _mm256_packus_epi32(round_result_lo, round_result_hi);
+            const __m256i res_clip =
+                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_store_si128(
+                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+          } else {
+            const __m256i res_unsigned_16b =
+                _mm256_adds_epu16(res, offset_const_16b);
+            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_1);
+          }
+        }
+      }
+    }
+  }
+}
+
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride,
+                                     uint16_t *dst0, int dst_stride0, int w,
+                                     int h, InterpFilterParams *filter_params_x,
+                                     InterpFilterParams *filter_params_y,
+                                     const int subpel_x_q4,
+                                     const int subpel_y_q4,
+                                     ConvolveParams *conv_params, int bd) {
+  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = 8;
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  __m256i s[8], coeffs_y[4], coeffs_x[4];
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi32(w0);
+  const __m256i wt1 = _mm256_set1_epi32(w1);
+  const __m256i zero = _mm256_setzero_si256();
+
+  const __m256i round_const_x = _mm256_set1_epi32(
+      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+  const __m256i round_const_y = _mm256_set1_epi32(
+      ((1 << conv_params->round_1) >> 1) -
+      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi32(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+
+  const __m256i clip_pixel_to_bd =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    {
+      for (i = 0; i < im_h; i += 2) {
+        const __m256i row0 =
+            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+        __m256i row1 = _mm256_set1_epi16(0);
+        if (i + 1 < im_h)
+          row1 =
+              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+        // even pixels
+        s[0] = _mm256_alignr_epi8(r1, r0, 0);
+        s[1] = _mm256_alignr_epi8(r1, r0, 4);
+        s[2] = _mm256_alignr_epi8(r1, r0, 8);
+        s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+        __m256i res_even = convolve(s, coeffs_x);
+        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+                                    round_shift_x);
+
+        // odd pixels
+        s[0] = _mm256_alignr_epi8(r1, r0, 2);
+        s[1] = _mm256_alignr_epi8(r1, r0, 6);
+        s[2] = _mm256_alignr_epi8(r1, r0, 10);
+        s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+        __m256i res_odd = convolve(s, coeffs_x);
+        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+                                   round_shift_x);
+
+        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+
+        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+      }
+    }
+
+    /* Vertical filter */
+    {
+      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+      s[0] = _mm256_unpacklo_epi16(s0, s1);
+      s[1] = _mm256_unpacklo_epi16(s2, s3);
+      s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm256_unpackhi_epi16(s0, s1);
+      s[5] = _mm256_unpackhi_epi16(s2, s3);
+      s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+      for (i = 0; i < h; i += 2) {
+        const int16_t *data = &im_block[i * im_stride];
+
+        const __m256i s6 =
+            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+        const __m256i s7 =
+            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+        s[3] = _mm256_unpacklo_epi16(s6, s7);
+        s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+        const __m256i res_a = convolve(s, coeffs_y);
+
+        const __m256i res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+        const __m256i res_unsigned_lo =
+            _mm256_add_epi32(res_a_round, offset_const);
+
+        if (w - j < 8) {
+          if (do_average) {
+            const __m256i data_0 = _mm256_castsi128_si256(
+                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+            const __m256i data_01 =
+                _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+            const __m256i comp_avg_res = highbd_comp_avg(
+                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+            const __m256i round_result = highbd_convolve_rounding(
+                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+            const __m256i res_16b =
+                _mm256_packus_epi32(round_result, round_result);
+            const __m256i res_clip =
+                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storel_epi64(
+                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+          } else {
+            __m256i res_16b =
+                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                             res_1);
+          }
+        } else {
+          const __m256i res_b = convolve(s + 4, coeffs_y);
+          const __m256i res_b_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+
+          __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+          if (do_average) {
+            const __m256i data_0 = _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+            const __m256i data_01 =
+                _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+            const __m256i comp_avg_res_lo = highbd_comp_avg(
+                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+            const __m256i comp_avg_res_hi = highbd_comp_avg(
+                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+            const __m256i round_result_lo =
+                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+                                         &rounding_const, rounding_shift);
+            const __m256i round_result_hi =
+                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+                                         &rounding_const, rounding_shift);
+
+            const __m256i res_16b =
+                _mm256_packus_epi32(round_result_lo, round_result_hi);
+            const __m256i res_clip =
+                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_store_si128(
+                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+          } else {
+            __m256i res_16b =
+                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_1);
+          }
+        }
+
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+    }
+  }
+}
+
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride,
+                                    uint16_t *dst0, int dst_stride0, int w,
+                                    int h, InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_horiz;
+  const int bits = FILTER_BITS - conv_params->round_1;
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+
+  int i, j;
+  __m256i s[4], coeffs_x[4];
+
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi32(w0);
+  const __m256i wt1 = _mm256_set1_epi32(w1);
+  const __m256i zero = _mm256_setzero_si256();
+
+  const __m256i round_const_x =
+      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi32(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+  const __m256i clip_pixel_to_bd =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+  assert(bits >= 0);
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    for (i = 0; i < h; i += 2) {
+      const __m256i row0 =
+          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+      __m256i row1 =
+          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+      // even pixels
+      s[0] = _mm256_alignr_epi8(r1, r0, 0);
+      s[1] = _mm256_alignr_epi8(r1, r0, 4);
+      s[2] = _mm256_alignr_epi8(r1, r0, 8);
+      s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+      __m256i res_even = convolve(s, coeffs_x);
+      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+                                  round_shift_x);
+
+      // odd pixels
+      s[0] = _mm256_alignr_epi8(r1, r0, 2);
+      s[1] = _mm256_alignr_epi8(r1, r0, 6);
+      s[2] = _mm256_alignr_epi8(r1, r0, 10);
+      s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+      __m256i res_odd = convolve(s, coeffs_x);
+      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+                                 round_shift_x);
+
+      res_even = _mm256_sll_epi32(res_even, round_shift_bits);
+      res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);
+
+      __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);
+
+      __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);
+
+      if (w - j < 8) {
+        if (do_average) {
+          const __m256i data_0 = _mm256_castsi128_si256(
+              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+          const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+              (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+          const __m256i data_01 =
+              _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+          const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+          const __m256i comp_avg_res = highbd_comp_avg(
+              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+          const __m256i round_result = highbd_convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m256i res_16b =
+              _mm256_packus_epi32(round_result, round_result);
+          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+          const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+          const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+          _mm_storel_epi64(
+              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+        } else {
+          __m256i res_16b =
+              _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+          const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+          const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                           res_1);
+        }
+      } else {
+        __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
+        __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);
+
+        if (do_average) {
+          const __m256i data_0 = _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+          const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+              (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+          const __m256i data_01 =
+              _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+          const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+          const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+          const __m256i comp_avg_res_lo = highbd_comp_avg(
+              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+          const __m256i comp_avg_res_hi = highbd_comp_avg(
+              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+          const __m256i round_result_lo = highbd_convolve_rounding(
+              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+          const __m256i round_result_hi = highbd_convolve_rounding(
+              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+          const __m256i res_16b =
+              _mm256_packus_epi32(round_result_lo, round_result_hi);
+          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+          const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+          const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+                          res_1);
+        } else {
+          __m256i res_16b =
+              _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+          const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+          const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                          res_1);
+        }
+      }
+    }
+  }
+}
+
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride,
+                                    uint16_t *dst0, int dst_stride0, int w,
+                                    int h, InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+
+  assert(bits >= 0);
+  int i, j;
+  __m256i s[8], coeffs_y[4];
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi32(w0);
+  const __m256i wt1 = _mm256_set1_epi32(w1);
+  const __m256i round_const_y =
+      _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
+  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi32(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+  const __m256i clip_pixel_to_bd =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m256i zero = _mm256_setzero_si256();
+
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    const uint16_t *data = &src_ptr[j];
+    /* Vertical filter */
+    {
+      __m256i src6;
+      __m256i s01 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+          0x20);
+      __m256i s12 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+          0x20);
+      __m256i s23 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+          0x20);
+      __m256i s34 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+          0x20);
+      __m256i s45 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+          0x20);
+      src6 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+      __m256i s56 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+          src6, 0x20);
+
+      s[0] = _mm256_unpacklo_epi16(s01, s12);
+      s[1] = _mm256_unpacklo_epi16(s23, s34);
+      s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+      s[4] = _mm256_unpackhi_epi16(s01, s12);
+      s[5] = _mm256_unpackhi_epi16(s23, s34);
+      s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+
+        const __m256i s67 = _mm256_permute2x128_si256(
+            src6,
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            0x20);
+
+        src6 = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+        const __m256i s78 = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            src6, 0x20);
+
+        s[3] = _mm256_unpacklo_epi16(s67, s78);
+        s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+        const __m256i res_a = convolve(s, coeffs_y);
+
+        __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
+        res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);
+
+        __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);
+
+        if (w - j < 8) {
+          if (do_average) {
+            const __m256i data_0 = _mm256_castsi128_si256(
+                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+            const __m256i data_01 =
+                _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+            const __m256i comp_avg_res = highbd_comp_avg(
+                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+            const __m256i round_result = highbd_convolve_rounding(
+                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+            const __m256i res_16b =
+                _mm256_packus_epi32(round_result, round_result);
+            const __m256i res_clip =
+                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storel_epi64(
+                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+          } else {
+            __m256i res_16b =
+                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                             res_1);
+          }
+        } else {
+          const __m256i res_b = convolve(s + 4, coeffs_y);
+          __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
+          res_b_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);
+
+          __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+          if (do_average) {
+            const __m256i data_0 = _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+            const __m256i data_01 =
+                _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+            const __m256i comp_avg_res_lo = highbd_comp_avg(
+                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+            const __m256i comp_avg_res_hi = highbd_comp_avg(
+                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+            const __m256i round_result_lo =
+                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+                                         &rounding_const, rounding_shift);
+            const __m256i round_result_hi =
+                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+                                         &rounding_const, rounding_shift);
+
+            const __m256i res_16b =
+                _mm256_packus_epi32(round_result_lo, round_result_hi);
+            const __m256i res_clip =
+                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_store_si128(
+                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+          } else {
+            __m256i res_16b =
+                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_1);
+          }
+        }
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
new file mode 100644
index 000000000..ccca6b07a
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+
+void av1_highbd_jnt_convolve_y_sse4_1(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, InterpFilterParams *filter_params_x,
+    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+
+  assert(bits >= 0);
+  int i, j;
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+  const __m128i round_const_y =
+      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi32(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+  const __m128i clip_pixel_to_bd =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+  __m128i s[16], coeffs_y[4];
+
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    const uint16_t *data = &src_ptr[j];
+    /* Vertical filter */
+    {
+      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+      s[0] = _mm_unpacklo_epi16(s0, s1);
+      s[1] = _mm_unpacklo_epi16(s2, s3);
+      s[2] = _mm_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm_unpackhi_epi16(s0, s1);
+      s[5] = _mm_unpackhi_epi16(s2, s3);
+      s[6] = _mm_unpackhi_epi16(s4, s5);
+
+      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+
+        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+        s[3] = _mm_unpacklo_epi16(s6, s7);
+        s[7] = _mm_unpackhi_epi16(s6, s7);
+
+        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+        const __m128i res_a0 = convolve(s, coeffs_y);
+        __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits);
+        res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y),
+                                     round_shift_y);
+
+        const __m128i res_a1 = convolve(s + 8, coeffs_y);
+        __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits);
+        res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y),
+                                     round_shift_y);
+
+        __m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const);
+        __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const);
+
+        if (w - j < 8) {
+          if (do_average) {
+            const __m128i data_0 =
+                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+            const __m128i data_1 = _mm_loadl_epi64(
+                (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+            const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+            const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+            const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1(
+                &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg);
+            const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1(
+                &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg);
+
+            const __m128i round_result_0 =
+                highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const,
+                                              &rounding_const, rounding_shift);
+            const __m128i round_result_1 =
+                highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const,
+                                              &rounding_const, rounding_shift);
+
+            const __m128i res_16b_0 =
+                _mm_packus_epi32(round_result_0, round_result_0);
+            const __m128i res_clip_0 =
+                _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+            const __m128i res_16b_1 =
+                _mm_packus_epi32(round_result_1, round_result_1);
+            const __m128i res_clip_1 =
+                _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]),
+                             res_clip_0);
+            _mm_storel_epi64(
+                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+                res_clip_1);
+
+          } else {
+            __m128i res_16b_0 =
+                _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0);
+
+            __m128i res_16b_1 =
+                _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1);
+
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0);
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_16b_1);
+          }
+        } else {
+          const __m128i res_b0 = convolve(s + 4, coeffs_y);
+          __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits);
+          res_b_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_b_round0, round_const_y), round_shift_y);
+
+          const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+          __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits);
+          res_b_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_b_round1, round_const_y), round_shift_y);
+
+          __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const);
+          __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const);
+
+          if (do_average) {
+            const __m128i data_0 =
+                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+            const __m128i data_1 = _mm_loadu_si128(
+                (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+            const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero);
+            const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero);
+
+            const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero);
+            const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero);
+
+            const __m128i comp_avg_res_lo_0 =
+                highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0,
+                                       &wt0, &wt1, use_jnt_comp_avg);
+            const __m128i comp_avg_res_lo_1 =
+                highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1,
+                                       &wt0, &wt1, use_jnt_comp_avg);
+            const __m128i comp_avg_res_hi_0 =
+                highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0,
+                                       &wt0, &wt1, use_jnt_comp_avg);
+            const __m128i comp_avg_res_hi_1 =
+                highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1,
+                                       &wt0, &wt1, use_jnt_comp_avg);
+
+            const __m128i round_result_lo_0 =
+                highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const,
+                                              &rounding_const, rounding_shift);
+            const __m128i round_result_lo_1 =
+                highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const,
+                                              &rounding_const, rounding_shift);
+            const __m128i round_result_hi_0 =
+                highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const,
+                                              &rounding_const, rounding_shift);
+            const __m128i round_result_hi_1 =
+                highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const,
+                                              &rounding_const, rounding_shift);
+
+            const __m128i res_16b_0 =
+                _mm_packus_epi32(round_result_lo_0, round_result_hi_0);
+            const __m128i res_clip_0 =
+                _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+
+            const __m128i res_16b_1 =
+                _mm_packus_epi32(round_result_lo_1, round_result_hi_1);
+            const __m128i res_clip_1 =
+                _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
+                            res_clip_0);
+            _mm_store_si128(
+                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+                res_clip_1);
+          } else {
+            __m128i res_16bit0 =
+                _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0);
+            __m128i res_16bit1 =
+                _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_16bit1);
+          }
+        }
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+
+        s[0 + 8] = s[1 + 8];
+        s[1 + 8] = s[2 + 8];
+        s[2 + 8] = s[3 + 8];
+
+        s[4 + 8] = s[5 + 8];
+        s[5 + 8] = s[6 + 8];
+        s[6 + 8] = s[7 + 8];
+
+        s6 = s8;
+      }
+    }
+  }
+}
+
+void av1_highbd_jnt_convolve_x_sse4_1(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, InterpFilterParams *filter_params_x,
+    InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_horiz;
+  const int bits = FILTER_BITS - conv_params->round_1;
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+
+  int i, j;
+  __m128i s[4], coeffs_x[4];
+
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i round_const_x =
+      _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi32(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+  const __m128i clip_pixel_to_bd =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+  assert(bits >= 0);
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    for (i = 0; i < h; i += 1) {
+      const __m128i row00 =
+          _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+      const __m128i row01 =
+          _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+      // even pixels
+      s[0] = _mm_alignr_epi8(row01, row00, 0);
+      s[1] = _mm_alignr_epi8(row01, row00, 4);
+      s[2] = _mm_alignr_epi8(row01, row00, 8);
+      s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+      __m128i res_even = convolve(s, coeffs_x);
+      res_even =
+          _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x);
+
+      // odd pixels
+      s[0] = _mm_alignr_epi8(row01, row00, 2);
+      s[1] = _mm_alignr_epi8(row01, row00, 6);
+      s[2] = _mm_alignr_epi8(row01, row00, 10);
+      s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+      __m128i res_odd = convolve(s, coeffs_x);
+      res_odd =
+          _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+      res_even = _mm_sll_epi32(res_even, round_shift_bits);
+      res_odd = _mm_sll_epi32(res_odd, round_shift_bits);
+
+      __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd);
+      __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const);
+      if (w - j < 8) {
+        if (do_average) {
+          const __m128i data_0 =
+              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+          const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+
+          const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
+              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+          const __m128i round_result = highbd_convolve_rounding_sse2(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_16b = _mm_packus_epi32(round_result, round_result);
+          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+        } else {
+          __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b);
+        }
+      } else {
+        __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd);
+        __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const);
+        if (do_average) {
+          const __m128i data_0 =
+              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+          const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+          const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_16b =
+              _mm_packus_epi32(round_result_lo, round_result_hi);
+          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+        } else {
+          __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
index fb246674a..b29bd1d79 100644
--- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -90,4 +90,14 @@ static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
   return x;
 }
 
+static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
+                                        const __m128i *rounding, int bit) {
+  __m128i x;
+
+  x = _mm_mullo_epi32(*w0, *n0);
+  x = _mm_add_epi32(x, *rounding);
+  x = _mm_srai_epi32(x, bit);
+  return x;
+}
+
 #endif  // _HIGHBD_TXFM_UTILITY_SSE4_H
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
new file mode 100644
index 000000000..a08beaafd
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+static const uint8_t warp_highbd_arrange_bytes[16] = {
+  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+};
+
+static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
+                                     int sx, int alpha, int k,
+                                     const int offset_bits_horiz,
+                                     const int reduce_bits_horiz) {
+  // Filter even-index pixels
+  const __m128i tmp_0 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_2 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_4 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_6 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+  // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
+  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+  // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
+  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+  // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
+  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+  // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
+  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+  // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
+  const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+  // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
+  const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+  // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
+  const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+  // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
+  const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
+                                             ((1 << reduce_bits_horiz) >> 1));
+
+  // Calculate filtered results
+  const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
+  const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
+  const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
+  const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
+
+  __m128i res_even =
+      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
+  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
+                           _mm_cvtsi32_si128(reduce_bits_horiz));
+
+  // Filter odd-index pixels
+  const __m128i tmp_1 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_3 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_5 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_7 = _mm_loadu_si128(
+      (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+  const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+  const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+  const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+  const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+  const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
+  const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
+  const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
+  const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
+
+  __m128i res_odd =
+      _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
+  res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
+                          _mm_cvtsi32_si128(reduce_bits_horiz));
+
+  // Combine results into one register.
+  // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
+  // as this order helps with the vertical filter.
+  tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
+}
+
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
+                                   int width, int height, int stride,
+                                   uint16_t *pred, int p_col, int p_row,
+                                   int p_width, int p_height, int p_stride,
+                                   int subsampling_x, int subsampling_y, int bd,
+                                   ConvolveParams *conv_params, int16_t alpha,
+                                   int16_t beta, int16_t gamma, int16_t delta) {
+  __m128i tmp[15];
+  int i, j, k;
+  const int reduce_bits_horiz =
+      conv_params->round_0 +
+      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+  const int reduce_bits_vert = conv_params->is_compound
+                                   ? conv_params->round_1
+                                   : 2 * FILTER_BITS - reduce_bits_horiz;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+  assert(!(bd == 12 && reduce_bits_horiz < 5));
+  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  const __m128i clip_pixel =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+  const __m128i reduce_bits_vert_const =
+      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
+  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const __m128i res_sub_const =
+      _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
+                     (1 << (offset_bits - conv_params->round_1 - 1)));
+  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+
+  /* Note: For this code to work, the left/right frame borders need to be
+  extended by at least 13 pixels each. By the time we get here, other
+  code will have set up this border, but we allow an explicit check
+  for debugging purposes.
+  */
+  /*for (i = 0; i < height; ++i) {
+  for (j = 0; j < 13; ++j) {
+  assert(ref[i * stride - 13 + j] == ref[i * stride]);
+  assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+  }
+  }*/
+
+  for (i = 0; i < p_height; i += 8) {
+    for (j = 0; j < p_width; j += 8) {
+      const int32_t src_x = (p_col + j + 4) << subsampling_x;
+      const int32_t src_y = (p_row + i + 4) << subsampling_y;
+      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+      const int32_t x4 = dst_x >> subsampling_x;
+      const int32_t y4 = dst_y >> subsampling_y;
+
+      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+      // Add in all the constant terms, including rounding and offset
+      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+      // Horizontal filter
+      // If the block is aligned such that, after clamping, every sample
+      // would be taken from the leftmost/rightmost column, then we can
+      // skip the expensive horizontal filter.
+      if (ix4 <= -7) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          tmp[k + 7] = _mm_set1_epi16(
+              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
+        }
+      } else if (ix4 >= width + 6) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          tmp[k + 7] =
+              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+                             ref[iy * stride + (width - 1)] *
+                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
+        }
+      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+        const int out_of_boundary_left = -(ix4 - 6);
+        const int out_of_boundary_right = (ix4 + 8) - width;
+
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
+
+          // Load source pixels
+          const __m128i src =
+              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+          const __m128i src2 =
+              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+          const __m128i src_01 = _mm_shuffle_epi8(
+              src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
+          const __m128i src2_01 = _mm_shuffle_epi8(
+              src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
+
+          __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
+          __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);
+
+          if (out_of_boundary_left >= 0) {
+            const __m128i shuffle_reg_left =
+                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
+            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
+          }
+
+          if (out_of_boundary_right >= 0) {
+            const __m128i shuffle_reg_right = _mm_loadu_si128(
+                (__m128i *)warp_pad_right[out_of_boundary_right]);
+            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
+            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
+          }
+
+          const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
+          const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);
+
+          horizontal_filter(src_padded, src2_padded, tmp, sx, alpha, k,
+                            offset_bits_horiz, reduce_bits_horiz);
+        }
+      } else {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
+
+          // Load source pixels
+          const __m128i src =
+              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+          const __m128i src2 =
+              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+          horizontal_filter(src, src2, tmp, sx, alpha, k, offset_bits_horiz,
+                            reduce_bits_horiz);
+        }
+      }
+
+      // Vertical filter
+      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+        int sy = sy4 + delta * (k + 4);
+
+        // Load from tmp and rearrange pairs of consecutive rows into the
+        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+        const __m128i *src = tmp + (k + 4);
+        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+        // Filter even-index pixels
+        const __m128i tmp_0 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_2 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_4 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_6 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                               _mm_add_epi32(res_4, res_6));
+
+        // Filter odd-index pixels
+        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+        const __m128i tmp_1 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_3 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_5 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_7 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                              _mm_add_epi32(res_5, res_7));
+
+        // Rearrange pixels back into the order 0 ... 7
+        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        if (conv_params->is_compound) {
+          __m128i *const p =
+              (__m128i *)&conv_params
+                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
+          res_lo = _mm_add_epi32(res_lo, res_add_const);
+          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
+                                 reduce_bits_vert_shift);
+
+          if (conv_params->do_average) {
+            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+            __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));
+
+            if (conv_params->use_jnt_comp_avg) {
+              res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+                                     _mm_mullo_epi32(res_lo, wt1));
+              res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
+            } else {
+              res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
+            }
+
+            __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
+            res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
+                                     round_bits_shift);
+
+            __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
+            res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
+            _mm_storel_epi64(dst16, res16_lo);
+          } else {
+            res_lo = _mm_packus_epi32(res_lo, res_lo);
+            _mm_storel_epi64(p, res_lo);
+          }
+          if (p_width > 4) {
+            __m128i *const p4 =
+                (__m128i *)&conv_params
+                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+            res_hi = _mm_add_epi32(res_hi, res_add_const);
+            res_hi =
+                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
+                              reduce_bits_vert_shift);
+            if (conv_params->do_average) {
+              __m128i *const dst16_4 =
+                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+              __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));
+
+              if (conv_params->use_jnt_comp_avg) {
+                res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
+                                       _mm_mullo_epi32(res_hi, wt1));
+                res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
+              } else {
+                res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
+              }
+
+              __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
+              res32_hi = _mm_sra_epi32(
+                  _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
+              __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
+              res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
+              _mm_storel_epi64(dst16_4, res16_hi);
+            } else {
+              res_hi = _mm_packus_epi32(res_hi, res_hi);
+              _mm_storel_epi64(p4, res_hi);
+            }
+          }
+        } else {
+          // Round and pack into 8 bits
+          const __m128i round_const =
+              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+                             ((1 << reduce_bits_vert) >> 1));
+
+          const __m128i res_lo_round = _mm_srai_epi32(
+              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
+          const __m128i res_hi_round = _mm_srai_epi32(
+              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
+
+          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+          // Clamp res_16bit to the range [0, 2^bd - 1]
+          const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
+          const __m128i zero = _mm_setzero_si128();
+          res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
+
+          // Store, blending with 'pred' if needed
+          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+          // Note: If we're outputting a 4x4 block, we need to be very careful
+          // to only output 4 pixels at this point, to avoid encode/decode
+          // mismatches when encoding with multiple threads.
+          if (p_width == 4) {
+            _mm_storel_epi64(p, res_16bit);
+          } else {
+            _mm_storeu_si128(p, res_16bit);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
deleted file mode 100644
index 71b0ec7a3..000000000
--- a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
+++ /dev/null
@@ -1,365 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "./av1_rtcd.h"
-#include "av1/common/warped_motion.h"
-
-void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
-                                  int width, int height, int stride,
-                                  uint16_t *pred, int p_col, int p_row,
-                                  int p_width, int p_height, int p_stride,
-                                  int subsampling_x, int subsampling_y, int bd,
-                                  ConvolveParams *conv_params, int16_t alpha,
-                                  int16_t beta, int16_t gamma, int16_t delta) {
-  int comp_avg = conv_params->do_average;
-#if HORSHEAR_REDUCE_PREC_BITS >= 5
-  __m128i tmp[15];
-#else
-#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
-#endif
-  int i, j, k;
-#if CONFIG_CONVOLVE_ROUND
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
-  const int reduce_bits_horiz =
-      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
-  const int offset_bits_horiz =
-      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
-  if (use_conv_params) {
-    conv_params->do_post_rounding = 1;
-  }
-  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
-  const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
-  const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-#endif
-
-  /* Note: For this code to work, the left/right frame borders need to be
-     extended by at least 13 pixels each. By the time we get here, other
-     code will have set up this border, but we allow an explicit check
-     for debugging purposes.
-  */
-  /*for (i = 0; i < height; ++i) {
-    for (j = 0; j < 13; ++j) {
-      assert(ref[i * stride - 13 + j] == ref[i * stride]);
-      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
-    }
-  }*/
-
-  for (i = 0; i < p_height; i += 8) {
-    for (j = 0; j < p_width; j += 8) {
-      const int32_t src_x = (p_col + j + 4) << subsampling_x;
-      const int32_t src_y = (p_row + i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
-      // Add in all the constant terms, including rounding and offset
-      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-
-      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-
-      // Horizontal filter
-      // If the block is aligned such that, after clamping, every sample
-      // would be taken from the leftmost/rightmost column, then we can
-      // skip the expensive horizontal filter.
-      if (ix4 <= -7) {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
-              ref[iy * stride] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
-        }
-      } else if (ix4 >= width + 6) {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
-              ref[iy * stride + (width - 1)] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
-        }
-      } else {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int sx = sx4 + beta * (k + 4);
-
-          // Load source pixels
-          const __m128i src =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-          const __m128i src2 =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
-
-          // Filter even-index pixels
-          const __m128i tmp_0 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_2 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_4 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_6 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
-
-          // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
-          const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
-          // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
-          const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
-          // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
-          const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
-          // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
-          const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
-          // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
-          const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
-          // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
-          const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
-          // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
-          const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
-          // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
-          const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
-          const __m128i round_const = _mm_set1_epi32(
-              (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
-
-          // Calculate filtered results
-          const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
-          const __m128i res_2 =
-              _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
-          const __m128i res_4 =
-              _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
-          const __m128i res_6 =
-              _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
-
-          __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                           _mm_add_epi32(res_2, res_6));
-          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
-                                   _mm_cvtsi32_si128(reduce_bits_horiz));
-
-          // Filter odd-index pixels
-          const __m128i tmp_1 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_3 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_5 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_7 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
-
-          const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
-          const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
-          const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
-          const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
-          const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
-          const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
-          const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
-          const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
-          const __m128i res_1 =
-              _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
-          const __m128i res_3 =
-              _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
-          const __m128i res_5 =
-              _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
-          const __m128i res_7 =
-              _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
-
-          __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                          _mm_add_epi32(res_3, res_7));
-          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
-                                  _mm_cvtsi32_si128(reduce_bits_horiz));
-
-          // Combine results into one register.
-          // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
-          // as this order helps with the vertical filter.
-          tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
-        }
-      }
-
-      // Vertical filter
-      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4);
-
-        // Load from tmp and rearrange pairs of consecutive rows into the
-        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
-        const __m128i *src = tmp + (k + 4);
-        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
-        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
-        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
-        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
-
-        // Filter even-index pixels
-        const __m128i tmp_0 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_2 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_4 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_6 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
-        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
-        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
-        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
-        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
-        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
-        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
-        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
-        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
-        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
-        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
-        const __m128i tmp_1 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_3 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_5 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_7 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
-        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
-        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
-        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
-        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
-        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
-        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
-        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-#if CONFIG_CONVOLVE_ROUND
-        if (use_conv_params) {
-          __m128i *const p =
-              (__m128i *)&conv_params
-                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
-          const __m128i round_const = _mm_set1_epi32(
-              -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) +
-              ((1 << (conv_params->round_1)) >> 1));
-          res_lo = _mm_add_epi32(res_lo, round_const);
-          res_lo =
-              _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1));
-          if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo);
-          _mm_storeu_si128(p, res_lo);
-          if (p_width > 4) {
-            res_hi = _mm_add_epi32(res_hi, round_const);
-            res_hi =
-                _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1));
-            if (comp_avg)
-              res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi);
-            _mm_storeu_si128(p + 1, res_hi);
-          }
-        } else {
-#else
-        {
-#endif
-          // Round and pack into 8 bits
-          const __m128i round_const =
-              _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
-                             ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
-
-          const __m128i res_lo_round = _mm_srai_epi32(
-              _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
-          const __m128i res_hi_round = _mm_srai_epi32(
-              _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
-
-          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-          // Clamp res_16bit to the range [0, 2^bd - 1]
-          const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
-          const __m128i zero = _mm_setzero_si128();
-          res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
-
-          // Store, blending with 'pred' if needed
-          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-
-          // Note: If we're outputting a 4x4 block, we need to be very careful
-          // to only output 4 pixels at this point, to avoid encode/decode
-          // mismatches when encoding with multiple threads.
-          if (p_width == 4) {
-            if (comp_avg)
-              res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p));
-            _mm_storel_epi64(p, res_16bit);
-          } else {
-            if (comp_avg)
-              res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p));
-            _mm_storeu_si128(p, res_16bit);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
new file mode 100644
index 000000000..0c8a8505b
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
+// on the left.
+// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
+// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
+void av1_highbd_wiener_convolve_add_src_avx2(
+    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h,
+    const ConvolveParams *conv_params, int bd) {
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w & 7));
+  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+  DECLARE_ALIGNED(32, uint16_t,
+                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+  int intermediate_height = h + SUBPEL_TAPS - 1;
+  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+  const __m128i zero_128 = _mm_setzero_si128();
+  const __m256i zero_256 = _mm256_setzero_si256();
+
+  // Add an offset to account for the "add_src" part of the convolve function.
+  const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+
+  const __m256i clamp_low = zero_256;
+
+  /* Horizontal filter */
+  {
+    const __m256i clamp_high_ep =
+        _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
+    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+    const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+    const __m256i round_const = _mm256_set1_epi32(
+        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+    for (int i = 0; i < intermediate_height; ++i) {
+      for (int j = 0; j < w; j += 16) {
+        const uint16_t *src_ij = src_ptr + i * src_stride + j;
+
+        // Load 16-bit src data
+        const __m256i src_0 = yy_loadu_256(src_ij + 0);
+        const __m256i src_1 = yy_loadu_256(src_ij + 1);
+        const __m256i src_2 = yy_loadu_256(src_ij + 2);
+        const __m256i src_3 = yy_loadu_256(src_ij + 3);
+        const __m256i src_4 = yy_loadu_256(src_ij + 4);
+        const __m256i src_5 = yy_loadu_256(src_ij + 5);
+        const __m256i src_6 = yy_loadu_256(src_ij + 6);
+        const __m256i src_7 = yy_loadu_256(src_ij + 7);
+
+        // Multiply src data by filter coeffs and sum pairs
+        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+        // Calculate scalar product for even- and odd-indices separately,
+        // increasing to 32-bit precision
+        const __m256i res_even_sum = _mm256_add_epi32(
+            _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+        const __m256i res_even = _mm256_srai_epi32(
+            _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+
+        const __m256i res_odd_sum = _mm256_add_epi32(
+            _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+        const __m256i res_odd = _mm256_srai_epi32(
+            _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+        // Reduce to 16-bit precision and pack even- and odd-index results
+        // back into one register. The _mm256_packs_epi32 intrinsic returns
+        // a register with the pixels ordered as follows:
+        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+        const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+        const __m256i res_clamped =
+            _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);
+
+        // Store in a temporary array
+        yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
+      }
+    }
+  }
+
+  /* Vertical filter */
+  {
+    const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);
+
+    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+    const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+    const __m256i round_const =
+        _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                          (1 << (bd + conv_params->round_1 - 1)));
+
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 16) {
+        const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
+
+        // Load 16-bit data from the output of the horizontal filter in
+        // which the pixels are ordered as follows:
+        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+        const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
+        const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
+        const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
+        const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
+        const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
+        const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
+        const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
+        const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
+
+        // Filter the even-indices, increasing to 32-bit precision
+        const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
+        const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+        const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+        const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+        const __m256i res_even = _mm256_add_epi32(
+            _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+        // Filter the odd-indices, increasing to 32-bit precision
+        const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+        const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+        const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+        const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+        const __m256i res_odd = _mm256_add_epi32(
+            _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+        // Pixels are currently in the following order:
+        // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+        // res_odd order:  [ 15 13 11 9 ] [ 7 5 3 1 ]
+        //
+        // Rearrange the pixels into the following order:
+        // res_lo order: [ 11 10  9  8 ] [ 3 2 1 0 ]
+        // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+        const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+        const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+        const __m256i res_lo_round = _mm256_srai_epi32(
+            _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+        const __m256i res_hi_round = _mm256_srai_epi32(
+            _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+        // Reduce to 16-bit precision and pack into the correct order:
+        // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+        const __m256i res_16bit =
+            _mm256_packs_epi32(res_lo_round, res_hi_round);
+        const __m256i res_16bit_clamped = _mm256_min_epi16(
+            _mm256_max_epi16(res_16bit, clamp_low), clamp_high);
+
+        // Store in the dst array
+        yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
index 74ce80e50..818b1099c 100644
--- a/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
@@ -12,29 +12,28 @@
 #include <tmmintrin.h>
 #include <assert.h>
 
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 
-#if EXTRAPREC_BITS > 2
-#error "Highbd high-prec convolve filter only supports EXTRAPREC_BITS <= 2"
-#error "(need to use 32-bit intermediates for EXTRAPREC_BITS > 2)"
-#endif
-
-void aom_highbd_convolve8_add_src_hip_ssse3(
+void av1_highbd_wiener_convolve_add_src_ssse3(
     const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
+    const int16_t *filter_y, int y_step_q4, int w, int h,
+    const ConvolveParams *conv_params, int bd) {
   assert(x_step_q4 == 16 && y_step_q4 == 16);
   assert(!(w & 7));
+  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
   (void)x_step_q4;
   (void)y_step_q4;
 
   const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
 
-  uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE];
+  DECLARE_ALIGNED(16, uint16_t,
+                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
   int intermediate_height = h + SUBPEL_TAPS - 1;
   int i, j;
   const int center_tap = ((SUBPEL_TAPS - 1) / 2);
@@ -63,9 +62,8 @@ void aom_highbd_convolve8_add_src_hip_ssse3(
     // coeffs 6 7 6 7 6 7 6 7
     const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
 
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
-                       (1 << (bd + FILTER_BITS - 1)));
+    const __m128i round_const = _mm_set1_epi32(
+        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
 
     for (i = 0; i < intermediate_height; ++i) {
       for (j = 0; j < w; j += 8) {
@@ -86,7 +84,7 @@ void aom_highbd_convolve8_add_src_hip_ssse3(
         __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                          _mm_add_epi32(res_2, res_6));
         res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
-                                  FILTER_BITS - EXTRAPREC_BITS);
+                                  conv_params->round_0);
 
         // Filter odd-index pixels
         const __m128i res_1 =
@@ -101,10 +99,11 @@ void aom_highbd_convolve8_add_src_hip_ssse3(
         __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                         _mm_add_epi32(res_3, res_7));
         res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
-                                 FILTER_BITS - EXTRAPREC_BITS);
+                                 conv_params->round_0);
 
         // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        const __m128i maxval = _mm_set1_epi16((EXTRAPREC_CLAMP_LIMIT(bd)) - 1);
+        const __m128i maxval =
+            _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);
         __m128i res = _mm_packs_epi32(res_even, res_odd);
         res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
         _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
@@ -132,8 +131,8 @@ void aom_highbd_convolve8_add_src_hip_ssse3(
     const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
 
     const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
-                       (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
+        _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                       (1 << (bd + conv_params->round_1 - 1)));
 
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; j += 8) {
@@ -187,9 +186,9 @@ void aom_highbd_convolve8_add_src_hip_ssse3(
         const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
 
         const __m128i res_lo_round = _mm_srai_epi32(
-            _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);
+            _mm_add_epi32(res_lo, round_const), conv_params->round_1);
         const __m128i res_hi_round = _mm_srai_epi32(
-            _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);
+            _mm_add_epi32(res_hi, round_const), conv_params->round_1);
 
         const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
         __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
deleted file mode 100644
index c440d0f88..000000000
--- a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
+++ /dev/null
@@ -1,450 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>  // avx2
-
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
-
-#include "aom_dsp/x86/inv_txfm_common_avx2.h"
-
-void av1_idct16_avx2(__m256i *in) {
-  const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
-  const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64);
-  const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64);
-  const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64);
-  const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
-  const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
-  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m256i t0, t1, t2, t3, t4, t5, t6, t7;
-
-  // stage 1, (0-7)
-  u0 = in[0];
-  u1 = in[8];
-  u2 = in[4];
-  u3 = in[12];
-  u4 = in[2];
-  u5 = in[10];
-  u6 = in[6];
-  u7 = in[14];
-
-  // stage 2, (0-7)
-  // stage 3, (0-7)
-  t0 = u0;
-  t1 = u1;
-  t2 = u2;
-  t3 = u3;
-  unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7);
-  unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6);
-
-  // stage 4, (0-7)
-  unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1);
-  unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3);
-  u4 = _mm256_add_epi16(t4, t5);
-  u5 = _mm256_sub_epi16(t4, t5);
-  u6 = _mm256_sub_epi16(t7, t6);
-  u7 = _mm256_add_epi16(t7, t6);
-
-  // stage 5, (0-7)
-  t0 = _mm256_add_epi16(u0, u3);
-  t1 = _mm256_add_epi16(u1, u2);
-  t2 = _mm256_sub_epi16(u1, u2);
-  t3 = _mm256_sub_epi16(u0, u3);
-  t4 = u4;
-  t7 = u7;
-  unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
-
-  // stage 6, (0-7)
-  u0 = _mm256_add_epi16(t0, t7);
-  u1 = _mm256_add_epi16(t1, t6);
-  u2 = _mm256_add_epi16(t2, t5);
-  u3 = _mm256_add_epi16(t3, t4);
-  u4 = _mm256_sub_epi16(t3, t4);
-  u5 = _mm256_sub_epi16(t2, t5);
-  u6 = _mm256_sub_epi16(t1, t6);
-  u7 = _mm256_sub_epi16(t0, t7);
-
-  // stage 1, (8-15)
-  v0 = in[1];
-  v1 = in[9];
-  v2 = in[5];
-  v3 = in[13];
-  v4 = in[3];
-  v5 = in[11];
-  v6 = in[7];
-  v7 = in[15];
-
-  // stage 2, (8-15)
-  unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7);
-  unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6);
-  unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5);
-  unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4);
-
-  // stage 3, (8-15)
-  v0 = _mm256_add_epi16(t0, t1);
-  v1 = _mm256_sub_epi16(t0, t1);
-  v2 = _mm256_sub_epi16(t3, t2);
-  v3 = _mm256_add_epi16(t2, t3);
-  v4 = _mm256_add_epi16(t4, t5);
-  v5 = _mm256_sub_epi16(t4, t5);
-  v6 = _mm256_sub_epi16(t7, t6);
-  v7 = _mm256_add_epi16(t6, t7);
-
-  // stage 4, (8-15)
-  t0 = v0;
-  t7 = v7;
-  t3 = v3;
-  t4 = v4;
-  unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
-  unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
-
-  // stage 5, (8-15)
-  v0 = _mm256_add_epi16(t0, t3);
-  v1 = _mm256_add_epi16(t1, t2);
-  v2 = _mm256_sub_epi16(t1, t2);
-  v3 = _mm256_sub_epi16(t0, t3);
-  v4 = _mm256_sub_epi16(t7, t4);
-  v5 = _mm256_sub_epi16(t6, t5);
-  v6 = _mm256_add_epi16(t6, t5);
-  v7 = _mm256_add_epi16(t7, t4);
-
-  // stage 6, (8-15)
-  t0 = v0;
-  t1 = v1;
-  t6 = v6;
-  t7 = v7;
-  unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5);
-  unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4);
-
-  // stage 7
-  in[0] = _mm256_add_epi16(u0, t7);
-  in[1] = _mm256_add_epi16(u1, t6);
-  in[2] = _mm256_add_epi16(u2, t5);
-  in[3] = _mm256_add_epi16(u3, t4);
-  in[4] = _mm256_add_epi16(u4, t3);
-  in[5] = _mm256_add_epi16(u5, t2);
-  in[6] = _mm256_add_epi16(u6, t1);
-  in[7] = _mm256_add_epi16(u7, t0);
-  in[8] = _mm256_sub_epi16(u7, t0);
-  in[9] = _mm256_sub_epi16(u6, t1);
-  in[10] = _mm256_sub_epi16(u5, t2);
-  in[11] = _mm256_sub_epi16(u4, t3);
-  in[12] = _mm256_sub_epi16(u3, t4);
-  in[13] = _mm256_sub_epi16(u2, t5);
-  in[14] = _mm256_sub_epi16(u1, t6);
-  in[15] = _mm256_sub_epi16(u0, t7);
-}
-
-static void idct16(__m256i *in) {
-  mm256_transpose_16x16(in, in);
-  av1_idct16_avx2(in);
-}
-
-static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1,
-                                 const __m256i *c0, const __m256i *c1,
-                                 __m256i *b) {
-  __m256i x0, x1;
-  x0 = _mm256_unpacklo_epi16(*a0, *a1);
-  x1 = _mm256_unpackhi_epi16(*a0, *a1);
-  b[0] = _mm256_madd_epi16(x0, *c0);
-  b[1] = _mm256_madd_epi16(x1, *c0);
-  b[2] = _mm256_madd_epi16(x0, *c1);
-  b[3] = _mm256_madd_epi16(x1, *c1);
-}
-
-static INLINE void group_rounding(__m256i *a, int num) {
-  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  int i;
-  for (i = 0; i < num; ++i) {
-    a[i] = _mm256_add_epi32(a[i], dct_rounding);
-    a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS);
-  }
-}
-
-static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
-  __m256i x[4];
-  x[0] = _mm256_add_epi32(a[0], b[0]);
-  x[1] = _mm256_add_epi32(a[1], b[1]);
-  x[2] = _mm256_add_epi32(a[2], b[2]);
-  x[3] = _mm256_add_epi32(a[3], b[3]);
-
-  group_rounding(x, 4);
-
-  out[0] = _mm256_packs_epi32(x[0], x[1]);
-  out[1] = _mm256_packs_epi32(x[2], x[3]);
-}
-
-static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
-  __m256i x[4];
-  x[0] = _mm256_sub_epi32(a[0], b[0]);
-  x[1] = _mm256_sub_epi32(a[1], b[1]);
-  x[2] = _mm256_sub_epi32(a[2], b[2]);
-  x[3] = _mm256_sub_epi32(a[3], b[3]);
-
-  group_rounding(x, 4);
-
-  out[0] = _mm256_packs_epi32(x[0], x[1]);
-  out[1] = _mm256_packs_epi32(x[2], x[3]);
-}
-
-static INLINE void butterfly_rnd(__m256i *a, __m256i *out) {
-  group_rounding(a, 4);
-  out[0] = _mm256_packs_epi32(a[0], a[1]);
-  out[1] = _mm256_packs_epi32(a[2], a[3]);
-}
-
-static void iadst16_avx2(__m256i *in) {
-  const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
-  const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
-  const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
-  const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
-  const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
-  const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
-  const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
-  const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
-  const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
-  const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
-  const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
-  const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
-  const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
-  const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i x[16], s[16];
-  __m256i u[4], v[4];
-
-  // stage 1
-  butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u);
-  butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v);
-  add_rnd(u, v, &x[0]);
-  sub_rnd(u, v, &x[8]);
-
-  butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u);
-  butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v);
-  add_rnd(u, v, &x[2]);
-  sub_rnd(u, v, &x[10]);
-
-  butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u);
-  butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v);
-  add_rnd(u, v, &x[4]);
-  sub_rnd(u, v, &x[12]);
-
-  butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u);
-  butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v);
-  add_rnd(u, v, &x[6]);
-  sub_rnd(u, v, &x[14]);
-
-  // stage 2
-  s[0] = _mm256_add_epi16(x[0], x[4]);
-  s[1] = _mm256_add_epi16(x[1], x[5]);
-  s[2] = _mm256_add_epi16(x[2], x[6]);
-  s[3] = _mm256_add_epi16(x[3], x[7]);
-  s[4] = _mm256_sub_epi16(x[0], x[4]);
-  s[5] = _mm256_sub_epi16(x[1], x[5]);
-  s[6] = _mm256_sub_epi16(x[2], x[6]);
-  s[7] = _mm256_sub_epi16(x[3], x[7]);
-  butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u);
-  butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v);
-  add_rnd(u, v, &s[8]);
-  sub_rnd(u, v, &s[12]);
-
-  butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u);
-  butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v);
-  add_rnd(u, v, &s[10]);
-  sub_rnd(u, v, &s[14]);
-
-  // stage 3
-  x[0] = _mm256_add_epi16(s[0], s[2]);
-  x[1] = _mm256_add_epi16(s[1], s[3]);
-  x[2] = _mm256_sub_epi16(s[0], s[2]);
-  x[3] = _mm256_sub_epi16(s[1], s[3]);
-
-  x[8] = _mm256_add_epi16(s[8], s[10]);
-  x[9] = _mm256_add_epi16(s[9], s[11]);
-  x[10] = _mm256_sub_epi16(s[8], s[10]);
-  x[11] = _mm256_sub_epi16(s[9], s[11]);
-
-  butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u);
-  butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v);
-  add_rnd(u, v, &x[4]);
-  sub_rnd(u, v, &x[6]);
-
-  butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u);
-  butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v);
-  add_rnd(u, v, &x[12]);
-  sub_rnd(u, v, &x[14]);
-
-  // stage 4
-  butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u);
-  butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v);
-  butterfly_rnd(u, &x[2]);
-  butterfly_rnd(v, &x[6]);
-
-  butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u);
-  butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v);
-  butterfly_rnd(u, &x[10]);
-  butterfly_rnd(v, &x[14]);
-
-  in[0] = x[0];
-  in[1] = _mm256_sub_epi16(zero, x[8]);
-  in[2] = x[12];
-  in[3] = _mm256_sub_epi16(zero, x[4]);
-  in[4] = x[6];
-  in[5] = x[14];
-  in[6] = x[10];
-  in[7] = x[2];
-  in[8] = x[3];
-  in[9] = x[11];
-  in[10] = x[15];
-  in[11] = x[7];
-  in[12] = x[5];
-  in[13] = _mm256_sub_epi16(zero, x[13]);
-  in[14] = x[9];
-  in[15] = _mm256_sub_epi16(zero, x[1]);
-}
-
-static void iadst16(__m256i *in) {
-  mm256_transpose_16x16(in, in);
-  iadst16_avx2(in);
-}
-
-#if CONFIG_EXT_TX
-static void flip_row(__m256i *in, int rows) {
-  int i;
-  for (i = 0; i < rows; ++i) {
-    mm256_reverse_epi16(&in[i]);
-  }
-}
-
-static void flip_col(uint8_t **dest, int *stride, int rows) {
-  *dest = *dest + (rows - 1) * (*stride);
-  *stride = -*stride;
-}
-
-static void iidtx16(__m256i *in) {
-  mm256_transpose_16x16(in, in);
-  txfm_scaling16_avx2((int16_t)Sqrt2, in);
-}
-#endif
-
-void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
-                               int stride, const TxfmParam *txfm_param) {
-  __m256i in[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  load_buffer_16x16(input, in);
-  switch (tx_type) {
-    case DCT_DCT:
-      idct16(in);
-      idct16(in);
-      break;
-    case ADST_DCT:
-      idct16(in);
-      iadst16(in);
-      break;
-    case DCT_ADST:
-      iadst16(in);
-      idct16(in);
-      break;
-    case ADST_ADST:
-      iadst16(in);
-      iadst16(in);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      idct16(in);
-      iadst16(in);
-      flip_col(&dest, &stride, 16);
-      break;
-    case DCT_FLIPADST:
-      iadst16(in);
-      idct16(in);
-      flip_row(in, 16);
-      break;
-    case FLIPADST_FLIPADST:
-      iadst16(in);
-      iadst16(in);
-      flip_row(in, 16);
-      flip_col(&dest, &stride, 16);
-      break;
-    case ADST_FLIPADST:
-      iadst16(in);
-      iadst16(in);
-      flip_row(in, 16);
-      break;
-    case FLIPADST_ADST:
-      iadst16(in);
-      iadst16(in);
-      flip_col(&dest, &stride, 16);
-      break;
-    case IDTX:
-      iidtx16(in);
-      iidtx16(in);
-      break;
-    case V_DCT:
-      iidtx16(in);
-      idct16(in);
-      break;
-    case H_DCT:
-      idct16(in);
-      iidtx16(in);
-      break;
-    case V_ADST:
-      iidtx16(in);
-      iadst16(in);
-      break;
-    case H_ADST:
-      iadst16(in);
-      iidtx16(in);
-      break;
-    case V_FLIPADST:
-      iidtx16(in);
-      iadst16(in);
-      flip_col(&dest, &stride, 16);
-      break;
-    case H_FLIPADST:
-      iadst16(in);
-      iidtx16(in);
-      flip_row(in, 16);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-  store_buffer_16xN(in, stride, dest, 16);
-}
diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c
deleted file mode 100644
index 541165c8d..000000000
--- a/third_party/aom/av1/common/x86/idct_intrin_sse2.c
+++ /dev/null
@@ -1,1411 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "aom_dsp/x86/inv_txfm_sse2.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
-#include "av1/common/enums.h"
-
-#if CONFIG_EXT_TX
-static INLINE void fliplr_4x4(__m128i *in /*in[2]*/) {
-  in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
-  in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
-  in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
-  in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
-}
-
-static INLINE void fliplr_8x8(__m128i *in /*in[8]*/) {
-  in[0] = mm_reverse_epi16(in[0]);
-  in[1] = mm_reverse_epi16(in[1]);
-  in[2] = mm_reverse_epi16(in[2]);
-  in[3] = mm_reverse_epi16(in[3]);
-
-  in[4] = mm_reverse_epi16(in[4]);
-  in[5] = mm_reverse_epi16(in[5]);
-  in[6] = mm_reverse_epi16(in[6]);
-  in[7] = mm_reverse_epi16(in[7]);
-}
-
-static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) {
-  fliplr_8x8(&in[0]);
-  fliplr_8x8(&in[8]);
-}
-
-#define FLIPLR_16x16(in0, in1) \
-  do {                         \
-    __m128i *tmp;              \
-    fliplr_16x8(in0);          \
-    fliplr_16x8(in1);          \
-    tmp = (in0);               \
-    (in0) = (in1);             \
-    (in1) = tmp;               \
-  } while (0)
-
-#define FLIPUD_PTR(dest, stride, size)       \
-  do {                                       \
-    (dest) = (dest) + ((size)-1) * (stride); \
-    (stride) = -(stride);                    \
-  } while (0)
-#endif
-
-void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
-                            const TxfmParam *txfm_param) {
-  __m128i in[2];
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i eight = _mm_set1_epi16(8);
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8);
-
-  switch (tx_type) {
-    case DCT_DCT:
-      aom_idct4_sse2(in);
-      aom_idct4_sse2(in);
-      break;
-    case ADST_DCT:
-      aom_idct4_sse2(in);
-      aom_iadst4_sse2(in);
-      break;
-    case DCT_ADST:
-      aom_iadst4_sse2(in);
-      aom_idct4_sse2(in);
-      break;
-    case ADST_ADST:
-      aom_iadst4_sse2(in);
-      aom_iadst4_sse2(in);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      aom_idct4_sse2(in);
-      aom_iadst4_sse2(in);
-      FLIPUD_PTR(dest, stride, 4);
-      break;
-    case DCT_FLIPADST:
-      aom_iadst4_sse2(in);
-      aom_idct4_sse2(in);
-      fliplr_4x4(in);
-      break;
-    case FLIPADST_FLIPADST:
-      aom_iadst4_sse2(in);
-      aom_iadst4_sse2(in);
-      FLIPUD_PTR(dest, stride, 4);
-      fliplr_4x4(in);
-      break;
-    case ADST_FLIPADST:
-      aom_iadst4_sse2(in);
-      aom_iadst4_sse2(in);
-      fliplr_4x4(in);
-      break;
-    case FLIPADST_ADST:
-      aom_iadst4_sse2(in);
-      aom_iadst4_sse2(in);
-      FLIPUD_PTR(dest, stride, 4);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-
-  // Final round and shift
-  in[0] = _mm_add_epi16(in[0], eight);
-  in[1] = _mm_add_epi16(in[1], eight);
-
-  in[0] = _mm_srai_epi16(in[0], 4);
-  in[1] = _mm_srai_epi16(in[1], 4);
-
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
-    __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
-    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
-    d0 = _mm_unpacklo_epi32(d0, d1);
-    d2 = _mm_unpacklo_epi32(d2, d3);
-    d0 = _mm_unpacklo_epi8(d0, zero);
-    d2 = _mm_unpacklo_epi8(d2, zero);
-    d0 = _mm_add_epi16(d0, in[0]);
-    d2 = _mm_add_epi16(d2, in[1]);
-    d0 = _mm_packus_epi16(d0, d2);
-    // store result[0]
-    *(int *)dest = _mm_cvtsi128_si32(d0);
-    // store result[1]
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-    // store result[2]
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-    // store result[3]
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-  }
-}
-
-void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
-                            const TxfmParam *txfm_param) {
-  __m128i in[8];
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  // load input data
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 1);
-  in[2] = load_input_data(input + 8 * 2);
-  in[3] = load_input_data(input + 8 * 3);
-  in[4] = load_input_data(input + 8 * 4);
-  in[5] = load_input_data(input + 8 * 5);
-  in[6] = load_input_data(input + 8 * 6);
-  in[7] = load_input_data(input + 8 * 7);
-
-  switch (tx_type) {
-    case DCT_DCT:
-      aom_idct8_sse2(in);
-      aom_idct8_sse2(in);
-      break;
-    case ADST_DCT:
-      aom_idct8_sse2(in);
-      aom_iadst8_sse2(in);
-      break;
-    case DCT_ADST:
-      aom_iadst8_sse2(in);
-      aom_idct8_sse2(in);
-      break;
-    case ADST_ADST:
-      aom_iadst8_sse2(in);
-      aom_iadst8_sse2(in);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      aom_idct8_sse2(in);
-      aom_iadst8_sse2(in);
-      FLIPUD_PTR(dest, stride, 8);
-      break;
-    case DCT_FLIPADST:
-      aom_iadst8_sse2(in);
-      aom_idct8_sse2(in);
-      fliplr_8x8(in);
-      break;
-    case FLIPADST_FLIPADST:
-      aom_iadst8_sse2(in);
-      aom_iadst8_sse2(in);
-      FLIPUD_PTR(dest, stride, 8);
-      fliplr_8x8(in);
-      break;
-    case ADST_FLIPADST:
-      aom_iadst8_sse2(in);
-      aom_iadst8_sse2(in);
-      fliplr_8x8(in);
-      break;
-    case FLIPADST_ADST:
-      aom_iadst8_sse2(in);
-      aom_iadst8_sse2(in);
-      FLIPUD_PTR(dest, stride, 8);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-
-  // Final rounding and shift
-  in[0] = _mm_adds_epi16(in[0], final_rounding);
-  in[1] = _mm_adds_epi16(in[1], final_rounding);
-  in[2] = _mm_adds_epi16(in[2], final_rounding);
-  in[3] = _mm_adds_epi16(in[3], final_rounding);
-  in[4] = _mm_adds_epi16(in[4], final_rounding);
-  in[5] = _mm_adds_epi16(in[5], final_rounding);
-  in[6] = _mm_adds_epi16(in[6], final_rounding);
-  in[7] = _mm_adds_epi16(in[7], final_rounding);
-
-  in[0] = _mm_srai_epi16(in[0], 5);
-  in[1] = _mm_srai_epi16(in[1], 5);
-  in[2] = _mm_srai_epi16(in[2], 5);
-  in[3] = _mm_srai_epi16(in[3], 5);
-  in[4] = _mm_srai_epi16(in[4], 5);
-  in[5] = _mm_srai_epi16(in[5], 5);
-  in[6] = _mm_srai_epi16(in[6], 5);
-  in[7] = _mm_srai_epi16(in[7], 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in[0]);
-  RECON_AND_STORE(dest + 1 * stride, in[1]);
-  RECON_AND_STORE(dest + 2 * stride, in[2]);
-  RECON_AND_STORE(dest + 3 * stride, in[3]);
-  RECON_AND_STORE(dest + 4 * stride, in[4]);
-  RECON_AND_STORE(dest + 5 * stride, in[5]);
-  RECON_AND_STORE(dest + 6 * stride, in[6]);
-  RECON_AND_STORE(dest + 7 * stride, in[7]);
-}
-
-#if CONFIG_EXT_TX
-static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  idtx16_8col(in0);
-  idtx16_8col(in1);
-}
-#endif  // CONFIG_EXT_TX
-
-void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride, const TxfmParam *txfm_param) {
-  __m128i in[32];
-  __m128i *in0 = &in[0];
-  __m128i *in1 = &in[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  load_buffer_8x16(input, in0);
-  input += 8;
-  load_buffer_8x16(input, in1);
-
-  switch (tx_type) {
-    case DCT_DCT:
-      aom_idct16_sse2(in0, in1);
-      aom_idct16_sse2(in0, in1);
-      break;
-    case ADST_DCT:
-      aom_idct16_sse2(in0, in1);
-      aom_iadst16_sse2(in0, in1);
-      break;
-    case DCT_ADST:
-      aom_iadst16_sse2(in0, in1);
-      aom_idct16_sse2(in0, in1);
-      break;
-    case ADST_ADST:
-      aom_iadst16_sse2(in0, in1);
-      aom_iadst16_sse2(in0, in1);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      aom_idct16_sse2(in0, in1);
-      aom_iadst16_sse2(in0, in1);
-      FLIPUD_PTR(dest, stride, 16);
-      break;
-    case DCT_FLIPADST:
-      aom_iadst16_sse2(in0, in1);
-      aom_idct16_sse2(in0, in1);
-      FLIPLR_16x16(in0, in1);
-      break;
-    case FLIPADST_FLIPADST:
-      aom_iadst16_sse2(in0, in1);
-      aom_iadst16_sse2(in0, in1);
-      FLIPUD_PTR(dest, stride, 16);
-      FLIPLR_16x16(in0, in1);
-      break;
-    case ADST_FLIPADST:
-      aom_iadst16_sse2(in0, in1);
-      aom_iadst16_sse2(in0, in1);
-      FLIPLR_16x16(in0, in1);
-      break;
-    case FLIPADST_ADST:
-      aom_iadst16_sse2(in0, in1);
-      aom_iadst16_sse2(in0, in1);
-      FLIPUD_PTR(dest, stride, 16);
-      break;
-    case IDTX:
-      iidtx16_sse2(in0, in1);
-      iidtx16_sse2(in0, in1);
-      break;
-    case V_DCT:
-      iidtx16_sse2(in0, in1);
-      aom_idct16_sse2(in0, in1);
-      break;
-    case H_DCT:
-      aom_idct16_sse2(in0, in1);
-      iidtx16_sse2(in0, in1);
-      break;
-    case V_ADST:
-      iidtx16_sse2(in0, in1);
-      aom_iadst16_sse2(in0, in1);
-      break;
-    case H_ADST:
-      aom_iadst16_sse2(in0, in1);
-      iidtx16_sse2(in0, in1);
-      break;
-    case V_FLIPADST:
-      iidtx16_sse2(in0, in1);
-      aom_iadst16_sse2(in0, in1);
-      FLIPUD_PTR(dest, stride, 16);
-      break;
-    case H_FLIPADST:
-      aom_iadst16_sse2(in0, in1);
-      iidtx16_sse2(in0, in1);
-      FLIPLR_16x16(in0, in1);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-
-  write_buffer_8x16(dest, in0, stride);
-  dest += 8;
-  write_buffer_8x16(dest, in1, stride);
-}
-
-#if CONFIG_EXT_TX
-static void iidtx8_sse2(__m128i *in) {
-  in[0] = _mm_slli_epi16(in[0], 1);
-  in[1] = _mm_slli_epi16(in[1], 1);
-  in[2] = _mm_slli_epi16(in[2], 1);
-  in[3] = _mm_slli_epi16(in[3], 1);
-  in[4] = _mm_slli_epi16(in[4], 1);
-  in[5] = _mm_slli_epi16(in[5], 1);
-  in[6] = _mm_slli_epi16(in[6], 1);
-  in[7] = _mm_slli_epi16(in[7], 1);
-}
-
-static INLINE void iidtx4_sse2(__m128i *in) {
-  const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
-
-  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
-  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
-  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
-  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
-
-  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
-
-  in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
-  in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
-}
-
-// load 8x8 array
-static INLINE void flip_buffer_lr_8x8(__m128i *in) {
-  in[0] = mm_reverse_epi16(in[0]);
-  in[1] = mm_reverse_epi16(in[1]);
-  in[2] = mm_reverse_epi16(in[2]);
-  in[3] = mm_reverse_epi16(in[3]);
-  in[4] = mm_reverse_epi16(in[4]);
-  in[5] = mm_reverse_epi16(in[5]);
-  in[6] = mm_reverse_epi16(in[6]);
-  in[7] = mm_reverse_epi16(in[7]);
-}
-#endif  // CONFIG_EXT_TX
-
-void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
-                              int stride, const TxfmParam *txfm_param) {
-  __m128i in[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  in[0] = load_input_data(input + 0 * 8);
-  in[1] = load_input_data(input + 1 * 8);
-  in[2] = load_input_data(input + 2 * 8);
-  in[3] = load_input_data(input + 3 * 8);
-  in[4] = load_input_data(input + 4 * 8);
-  in[5] = load_input_data(input + 5 * 8);
-  in[6] = load_input_data(input + 6 * 8);
-  in[7] = load_input_data(input + 7 * 8);
-
-  in[8] = load_input_data(input + 8 * 8);
-  in[9] = load_input_data(input + 9 * 8);
-  in[10] = load_input_data(input + 10 * 8);
-  in[11] = load_input_data(input + 11 * 8);
-  in[12] = load_input_data(input + 12 * 8);
-  in[13] = load_input_data(input + 13 * 8);
-  in[14] = load_input_data(input + 14 * 8);
-  in[15] = load_input_data(input + 15 * 8);
-
-  // Row transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case H_DCT:
-#endif
-      aom_idct8_sse2(in);
-      array_transpose_8x8(in, in);
-      aom_idct8_sse2(in + 8);
-      array_transpose_8x8(in + 8, in + 8);
-      break;
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case H_ADST:
-    case H_FLIPADST:
-#endif
-      aom_iadst8_sse2(in);
-      array_transpose_8x8(in, in);
-      aom_iadst8_sse2(in + 8);
-      array_transpose_8x8(in + 8, in + 8);
-      break;
-#if CONFIG_EXT_TX
-    case V_FLIPADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX:
-      iidtx8_sse2(in);
-      iidtx8_sse2(in + 8);
-      break;
-#endif
-    default: assert(0); break;
-  }
-  scale_sqrt2_8x8(in);
-  scale_sqrt2_8x8(in + 8);
-
-  // Column transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case DCT_ADST:
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case V_DCT:
-#endif
-      idct16_8col(in);
-      break;
-    case ADST_DCT:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case FLIPADST_ADST:
-    case ADST_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case FLIPADST_DCT:
-    case V_ADST:
-    case V_FLIPADST:
-#endif
-      iadst16_8col(in);
-      break;
-#if CONFIG_EXT_TX
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
-    case IDTX: idtx16_8col(in); break;
-#endif
-    default: assert(0); break;
-  }
-
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-#if CONFIG_EXT_TX
-    case H_DCT:
-#endif
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case H_ADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX:
-#endif
-      write_buffer_8x16(dest, in, stride);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case FLIPADST_ADST:
-    case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break;
-    case DCT_FLIPADST:
-    case ADST_FLIPADST:
-    case H_FLIPADST:
-      flip_buffer_lr_8x8(in);
-      flip_buffer_lr_8x8(in + 8);
-      write_buffer_8x16(dest, in, stride);
-      break;
-    case FLIPADST_FLIPADST:
-      flip_buffer_lr_8x8(in);
-      flip_buffer_lr_8x8(in + 8);
-      write_buffer_8x16(dest + stride * 15, in, -stride);
-      break;
-#endif
-    default: assert(0); break;
-  }
-}
-
-static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in,
-                                           int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-  // Final rounding and shift
-  in[0] = _mm_adds_epi16(in[0], final_rounding);
-  in[1] = _mm_adds_epi16(in[1], final_rounding);
-  in[2] = _mm_adds_epi16(in[2], final_rounding);
-  in[3] = _mm_adds_epi16(in[3], final_rounding);
-  in[4] = _mm_adds_epi16(in[4], final_rounding);
-  in[5] = _mm_adds_epi16(in[5], final_rounding);
-  in[6] = _mm_adds_epi16(in[6], final_rounding);
-  in[7] = _mm_adds_epi16(in[7], final_rounding);
-
-  in[0] = _mm_srai_epi16(in[0], 6);
-  in[1] = _mm_srai_epi16(in[1], 6);
-  in[2] = _mm_srai_epi16(in[2], 6);
-  in[3] = _mm_srai_epi16(in[3], 6);
-  in[4] = _mm_srai_epi16(in[4], 6);
-  in[5] = _mm_srai_epi16(in[5], 6);
-  in[6] = _mm_srai_epi16(in[6], 6);
-  in[7] = _mm_srai_epi16(in[7], 6);
-
-  RECON_AND_STORE(dest + 0 * stride, in[0]);
-  RECON_AND_STORE(dest + 1 * stride, in[1]);
-  RECON_AND_STORE(dest + 2 * stride, in[2]);
-  RECON_AND_STORE(dest + 3 * stride, in[3]);
-  RECON_AND_STORE(dest + 4 * stride, in[4]);
-  RECON_AND_STORE(dest + 5 * stride, in[5]);
-  RECON_AND_STORE(dest + 6 * stride, in[6]);
-  RECON_AND_STORE(dest + 7 * stride, in[7]);
-}
-
-void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest,
-                              int stride, const TxfmParam *txfm_param) {
-  __m128i in[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  // Transpose 16x8 input into in[]
-  in[0] = load_input_data(input + 0 * 16);
-  in[1] = load_input_data(input + 1 * 16);
-  in[2] = load_input_data(input + 2 * 16);
-  in[3] = load_input_data(input + 3 * 16);
-  in[4] = load_input_data(input + 4 * 16);
-  in[5] = load_input_data(input + 5 * 16);
-  in[6] = load_input_data(input + 6 * 16);
-  in[7] = load_input_data(input + 7 * 16);
-  array_transpose_8x8(in, in);
-
-  in[8] = load_input_data(input + 8 + 0 * 16);
-  in[9] = load_input_data(input + 8 + 1 * 16);
-  in[10] = load_input_data(input + 8 + 2 * 16);
-  in[11] = load_input_data(input + 8 + 3 * 16);
-  in[12] = load_input_data(input + 8 + 4 * 16);
-  in[13] = load_input_data(input + 8 + 5 * 16);
-  in[14] = load_input_data(input + 8 + 6 * 16);
-  in[15] = load_input_data(input + 8 + 7 * 16);
-  array_transpose_8x8(in + 8, in + 8);
-
-  // Row transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case H_DCT:
-#endif
-      idct16_8col(in);
-      break;
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case H_ADST:
-    case H_FLIPADST:
-#endif
-      iadst16_8col(in);
-      break;
-#if CONFIG_EXT_TX
-    case V_FLIPADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX: idtx16_8col(in); break;
-#endif
-    default: assert(0); break;
-  }
-
-  // Scale
-  scale_sqrt2_8x8(in);
-  scale_sqrt2_8x8(in + 8);
-
-  // Column transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case DCT_ADST:
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case V_DCT:
-#endif
-      aom_idct8_sse2(in);
-      aom_idct8_sse2(in + 8);
-      break;
-    case ADST_DCT:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case FLIPADST_ADST:
-    case ADST_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case FLIPADST_DCT:
-    case V_ADST:
-    case V_FLIPADST:
-#endif
-      aom_iadst8_sse2(in);
-      aom_iadst8_sse2(in + 8);
-      break;
-#if CONFIG_EXT_TX
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
-    case IDTX:
-      array_transpose_8x8(in, in);
-      array_transpose_8x8(in + 8, in + 8);
-      iidtx8_sse2(in);
-      iidtx8_sse2(in + 8);
-      break;
-#endif
-    default: assert(0); break;
-  }
-
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case H_DCT:
-    case H_ADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX:
-#endif
-      write_buffer_8x8_round6(dest, in, stride);
-      write_buffer_8x8_round6(dest + 8, in + 8, stride);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case FLIPADST_ADST:
-    case V_FLIPADST:
-      write_buffer_8x8_round6(dest + stride * 7, in, -stride);
-      write_buffer_8x8_round6(dest + stride * 7 + 8, in + 8, -stride);
-      break;
-    case DCT_FLIPADST:
-    case ADST_FLIPADST:
-    case H_FLIPADST:
-      flip_buffer_lr_8x8(in);
-      flip_buffer_lr_8x8(in + 8);
-      write_buffer_8x8_round6(dest, in + 8, stride);
-      write_buffer_8x8_round6(dest + 8, in, stride);
-      break;
-    case FLIPADST_FLIPADST:
-      flip_buffer_lr_8x8(in);
-      flip_buffer_lr_8x8(in + 8);
-      write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride);
-      write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride);
-      break;
-#endif
-    default: assert(0); break;
-  }
-}
-
-static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in,
-                                           int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i zero = _mm_setzero_si128();
-  // Final rounding and shift
-  in[0] = _mm_adds_epi16(in[0], final_rounding);
-  in[1] = _mm_adds_epi16(in[1], final_rounding);
-  in[2] = _mm_adds_epi16(in[2], final_rounding);
-  in[3] = _mm_adds_epi16(in[3], final_rounding);
-
-  in[0] = _mm_srai_epi16(in[0], 5);
-  in[1] = _mm_srai_epi16(in[1], 5);
-  in[2] = _mm_srai_epi16(in[2], 5);
-  in[3] = _mm_srai_epi16(in[3], 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in[0]);
-  RECON_AND_STORE(dest + 1 * stride, in[1]);
-  RECON_AND_STORE(dest + 2 * stride, in[2]);
-  RECON_AND_STORE(dest + 3 * stride, in[3]);
-}
-
-void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
-                            const TxfmParam *txfm_param) {
-  __m128i in[8];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  in[0] = load_input_data(input + 0 * 8);
-  in[1] = load_input_data(input + 1 * 8);
-  in[2] = load_input_data(input + 2 * 8);
-  in[3] = load_input_data(input + 3 * 8);
-
-  // Row transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case H_DCT:
-#endif
-      aom_idct8_sse2(in);
-      break;
-    case DCT_ADST:
-    case ADST_ADST: aom_iadst8_sse2(in); break;
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case H_ADST:
-    case H_FLIPADST: aom_iadst8_sse2(in); break;
-    case V_FLIPADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX: iidtx8_sse2(in); array_transpose_8x8(in, in);
-#endif
-      break;
-    default: assert(0); break;
-  }
-
-  scale_sqrt2_8x8(in);
-
-  // Repack data. We pack into the bottom half of 'in'
-  // so that the next repacking stage can pack into the
-  // top half without overwriting anything
-  in[7] = _mm_unpacklo_epi64(in[6], in[7]);
-  in[6] = _mm_unpacklo_epi64(in[4], in[5]);
-  in[5] = _mm_unpacklo_epi64(in[2], in[3]);
-  in[4] = _mm_unpacklo_epi64(in[0], in[1]);
-
-  // Column transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case DCT_ADST:
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case V_DCT:
-#endif
-      aom_idct4_sse2(in + 4);
-      aom_idct4_sse2(in + 6);
-      break;
-    case ADST_DCT:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case FLIPADST_ADST:
-    case ADST_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case FLIPADST_DCT:
-    case V_ADST:
-    case V_FLIPADST:
-#endif
-      aom_iadst4_sse2(in + 4);
-      aom_iadst4_sse2(in + 6);
-      break;
-#if CONFIG_EXT_TX
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
-    case IDTX:
-      iidtx4_sse2(in + 4);
-      array_transpose_4x4(in + 4);
-      iidtx4_sse2(in + 6);
-      array_transpose_4x4(in + 6);
-      break;
-#endif
-    default: assert(0); break;
-  }
-
-  // Repack data
-  in[0] = _mm_unpacklo_epi64(in[4], in[6]);
-  in[1] = _mm_unpackhi_epi64(in[4], in[6]);
-  in[2] = _mm_unpacklo_epi64(in[5], in[7]);
-  in[3] = _mm_unpackhi_epi64(in[5], in[7]);
-
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case H_DCT:
-    case H_ADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX: break;
-    case FLIPADST_DCT:
-    case FLIPADST_ADST:
-    case V_FLIPADST: FLIPUD_PTR(dest, stride, 4); break;
-    case DCT_FLIPADST:
-    case ADST_FLIPADST:
-    case H_FLIPADST:
-      in[0] = mm_reverse_epi16(in[0]);
-      in[1] = mm_reverse_epi16(in[1]);
-      in[2] = mm_reverse_epi16(in[2]);
-      in[3] = mm_reverse_epi16(in[3]);
-      break;
-    case FLIPADST_FLIPADST:
-      in[0] = mm_reverse_epi16(in[0]);
-      in[1] = mm_reverse_epi16(in[1]);
-      in[2] = mm_reverse_epi16(in[2]);
-      in[3] = mm_reverse_epi16(in[3]);
-      FLIPUD_PTR(dest, stride, 4);
-#endif
-      break;
-    default: assert(0); break;
-  }
-  write_buffer_8x4_round5(dest, in, stride);
-}
-
-static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in,
-                                           int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i zero = _mm_setzero_si128();
-  // Final rounding and shift
-  in[0] = _mm_adds_epi16(in[0], final_rounding);
-  in[1] = _mm_adds_epi16(in[1], final_rounding);
-  in[2] = _mm_adds_epi16(in[2], final_rounding);
-  in[3] = _mm_adds_epi16(in[3], final_rounding);
-
-  in[0] = _mm_srai_epi16(in[0], 5);
-  in[1] = _mm_srai_epi16(in[1], 5);
-  in[2] = _mm_srai_epi16(in[2], 5);
-  in[3] = _mm_srai_epi16(in[3], 5);
-
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
-    __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
-    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
-    __m128i d4 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 4));
-    __m128i d5 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 5));
-    __m128i d6 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 6));
-    __m128i d7 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 7));
-
-    d0 = _mm_unpacklo_epi32(d0, d1);
-    d2 = _mm_unpacklo_epi32(d2, d3);
-    d4 = _mm_unpacklo_epi32(d4, d5);
-    d6 = _mm_unpacklo_epi32(d6, d7);
-    d0 = _mm_unpacklo_epi8(d0, zero);
-    d2 = _mm_unpacklo_epi8(d2, zero);
-    d4 = _mm_unpacklo_epi8(d4, zero);
-    d6 = _mm_unpacklo_epi8(d6, zero);
-    d0 = _mm_add_epi16(d0, in[0]);
-    d2 = _mm_add_epi16(d2, in[1]);
-    d4 = _mm_add_epi16(d4, in[2]);
-    d6 = _mm_add_epi16(d6, in[3]);
-
-    d0 = _mm_packus_epi16(d0, d2);
-    *(int *)dest = _mm_cvtsi128_si32(d0);
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-    d0 = _mm_packus_epi16(d4, d6);
-    *(int *)(dest + stride * 4) = _mm_cvtsi128_si32(d0);
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 5) = _mm_cvtsi128_si32(d0);
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 6) = _mm_cvtsi128_si32(d0);
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 7) = _mm_cvtsi128_si32(d0);
-  }
-}
-
-void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
-                            const TxfmParam *txfm_param) {
-  __m128i in[8];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  // Load rows, packed two per element of 'in'.
-  // We pack into the bottom half of 'in' so that the
-  // later repacking stage can pack into the
-  // top half without overwriting anything
-  in[4] = load_input_data(input + 0 * 8);
-  in[5] = load_input_data(input + 1 * 8);
-  in[6] = load_input_data(input + 2 * 8);
-  in[7] = load_input_data(input + 3 * 8);
-
-  // Row transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case H_DCT:
-#endif
-      aom_idct4_sse2(in + 4);
-      aom_idct4_sse2(in + 6);
-      break;
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case H_ADST:
-    case H_FLIPADST:
-#endif
-      aom_iadst4_sse2(in + 4);
-      aom_iadst4_sse2(in + 6);
-      break;
-#if CONFIG_EXT_TX
-    case V_FLIPADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX:
-      iidtx4_sse2(in + 4);
-      array_transpose_4x4(in + 4);
-      iidtx4_sse2(in + 6);
-      array_transpose_4x4(in + 6);
-      break;
-#endif
-    default: assert(0); break;
-  }
-
-  scale_sqrt2_8x4(in + 4);
-
-  // Repack data
-  in[0] = _mm_unpacklo_epi64(in[4], in[6]);
-  in[1] = _mm_unpackhi_epi64(in[4], in[6]);
-  in[2] = _mm_unpacklo_epi64(in[5], in[7]);
-  in[3] = _mm_unpackhi_epi64(in[5], in[7]);
-
-  // Column transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case DCT_ADST:
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case V_DCT:
-#endif
-      aom_idct8_sse2(in);
-      break;
-    case ADST_DCT:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case FLIPADST_ADST:
-    case ADST_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case FLIPADST_DCT:
-    case V_ADST:
-    case V_FLIPADST:
-#endif
-      aom_iadst8_sse2(in);
-      break;
-#if CONFIG_EXT_TX
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
-    case IDTX:
-      iidtx8_sse2(in);
-      array_transpose_8x8(in, in);
-      break;
-#endif
-    default: assert(0); break;
-  }
-
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case H_DCT:
-    case H_ADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX:
-#endif
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case FLIPADST_ADST:
-    case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break;
-    case DCT_FLIPADST:
-    case ADST_FLIPADST:
-    case H_FLIPADST:
-      in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
-      in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
-      in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
-      in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
-      in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
-      in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
-      in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
-      in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
-      break;
-    case FLIPADST_FLIPADST:
-      in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
-      in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
-      in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
-      in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
-      in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
-      in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
-      in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
-      in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
-      FLIPUD_PTR(dest, stride, 8);
-      break;
-#endif
-    default: assert(0); break;
-  }
-  in[0] = _mm_unpacklo_epi64(in[0], in[1]);
-  in[1] = _mm_unpacklo_epi64(in[2], in[3]);
-  in[2] = _mm_unpacklo_epi64(in[4], in[5]);
-  in[3] = _mm_unpacklo_epi64(in[6], in[7]);
-  write_buffer_4x8_round5(dest, in, stride);
-}
-
-// Note: The 16-column 32-element transforms take input in the form of four
-// 8x16 blocks (each stored as a __m128i[16]), which are the four quadrants
-// of the overall 16x32 input buffer.
-static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
-                                __m128i *br) {
-  array_transpose_16x16(tl, tr);
-  array_transpose_16x16(bl, br);
-  idct32_8col(tl, bl);
-  idct32_8col(tr, br);
-}
-
-static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
-                                      __m128i *br) {
-  __m128i tmpl[16], tmpr[16];
-  int i;
-
-  // Copy the top half of the input to temporary storage
-  for (i = 0; i < 16; ++i) {
-    tmpl[i] = tl[i];
-    tmpr[i] = tr[i];
-  }
-
-  // Generate the top half of the output
-  for (i = 0; i < 16; ++i) {
-    tl[i] = _mm_slli_epi16(bl[i], 2);
-    tr[i] = _mm_slli_epi16(br[i], 2);
-  }
-  array_transpose_16x16(tl, tr);
-
-  // Copy the temporary storage back to the bottom half of the input
-  for (i = 0; i < 16; ++i) {
-    bl[i] = tmpl[i];
-    br[i] = tmpr[i];
-  }
-
-  // Generate the bottom half of the output
-  scale_sqrt2_8x16(bl);
-  scale_sqrt2_8x16(br);
-  aom_idct16_sse2(bl, br);  // Includes a transposition
-}
-
-#if CONFIG_EXT_TX
-static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
-                                 __m128i *br) {
-  int i;
-  array_transpose_16x16(tl, tr);
-  array_transpose_16x16(bl, br);
-  for (i = 0; i < 16; ++i) {
-    tl[i] = _mm_slli_epi16(tl[i], 2);
-    tr[i] = _mm_slli_epi16(tr[i], 2);
-    bl[i] = _mm_slli_epi16(bl[i], 2);
-    br[i] = _mm_slli_epi16(br[i], 2);
-  }
-}
-#endif  // CONFIG_EXT_TX
-
-static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
-                                             __m128i *intr, __m128i *inbl,
-                                             __m128i *inbr, int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  int i;
-
-  for (i = 0; i < 16; ++i) {
-    intl[i] = _mm_adds_epi16(intl[i], final_rounding);
-    intr[i] = _mm_adds_epi16(intr[i], final_rounding);
-    inbl[i] = _mm_adds_epi16(inbl[i], final_rounding);
-    inbr[i] = _mm_adds_epi16(inbr[i], final_rounding);
-    intl[i] = _mm_srai_epi16(intl[i], 6);
-    intr[i] = _mm_srai_epi16(intr[i], 6);
-    inbl[i] = _mm_srai_epi16(inbl[i], 6);
-    inbr[i] = _mm_srai_epi16(inbr[i], 6);
-    RECON_AND_STORE(dest + i * stride + 0, intl[i]);
-    RECON_AND_STORE(dest + i * stride + 8, intr[i]);
-    RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]);
-    RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]);
-  }
-}
-
-void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride, const TxfmParam *txfm_param) {
-  __m128i intl[16], intr[16], inbl[16], inbr[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    intl[i] = load_input_data(input + i * 16 + 0);
-    intr[i] = load_input_data(input + i * 16 + 8);
-    inbl[i] = load_input_data(input + (i + 16) * 16 + 0);
-    inbr[i] = load_input_data(input + (i + 16) * 16 + 8);
-  }
-
-  // Row transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case H_DCT:
-#endif
-      aom_idct16_sse2(intl, intr);
-      aom_idct16_sse2(inbl, inbr);
-      break;
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case H_ADST:
-    case H_FLIPADST:
-#endif
-      aom_iadst16_sse2(intl, intr);
-      aom_iadst16_sse2(inbl, inbr);
-      break;
-#if CONFIG_EXT_TX
-    case V_FLIPADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX:
-      iidtx16_sse2(intl, intr);
-      iidtx16_sse2(inbl, inbr);
-      break;
-#endif
-    default: assert(0); break;
-  }
-
-  scale_sqrt2_8x16(intl);
-  scale_sqrt2_8x16(intr);
-  scale_sqrt2_8x16(inbl);
-  scale_sqrt2_8x16(inbr);
-
-  // Column transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case DCT_ADST:
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case V_DCT:
-#endif
-      idct32_16col(intl, intr, inbl, inbr);
-      break;
-    case ADST_DCT:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case FLIPADST_ADST:
-    case ADST_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case FLIPADST_DCT:
-    case V_ADST:
-    case V_FLIPADST:
-#endif
-      ihalfright32_16col(intl, intr, inbl, inbr);
-      break;
-#if CONFIG_EXT_TX
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
-    case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
-#endif
-    default: assert(0); break;
-  }
-
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case H_DCT:
-    case H_ADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX:
-#endif
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case FLIPADST_ADST:
-    case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
-    case DCT_FLIPADST:
-    case ADST_FLIPADST:
-    case H_FLIPADST:
-      for (i = 0; i < 16; ++i) {
-        __m128i tmp = intl[i];
-        intl[i] = mm_reverse_epi16(intr[i]);
-        intr[i] = mm_reverse_epi16(tmp);
-        tmp = inbl[i];
-        inbl[i] = mm_reverse_epi16(inbr[i]);
-        inbr[i] = mm_reverse_epi16(tmp);
-      }
-      break;
-    case FLIPADST_FLIPADST:
-      for (i = 0; i < 16; ++i) {
-        __m128i tmp = intl[i];
-        intl[i] = mm_reverse_epi16(intr[i]);
-        intr[i] = mm_reverse_epi16(tmp);
-        tmp = inbl[i];
-        inbl[i] = mm_reverse_epi16(inbr[i]);
-        inbr[i] = mm_reverse_epi16(tmp);
-      }
-      FLIPUD_PTR(dest, stride, 32);
-      break;
-#endif
-    default: assert(0); break;
-  }
-  write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
-}
-
-static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
-                                             __m128i *in1, __m128i *in2,
-                                             __m128i *in3, int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  int i;
-
-  for (i = 0; i < 16; ++i) {
-    in0[i] = _mm_adds_epi16(in0[i], final_rounding);
-    in1[i] = _mm_adds_epi16(in1[i], final_rounding);
-    in2[i] = _mm_adds_epi16(in2[i], final_rounding);
-    in3[i] = _mm_adds_epi16(in3[i], final_rounding);
-    in0[i] = _mm_srai_epi16(in0[i], 6);
-    in1[i] = _mm_srai_epi16(in1[i], 6);
-    in2[i] = _mm_srai_epi16(in2[i], 6);
-    in3[i] = _mm_srai_epi16(in3[i], 6);
-    RECON_AND_STORE(dest + i * stride + 0, in0[i]);
-    RECON_AND_STORE(dest + i * stride + 8, in1[i]);
-    RECON_AND_STORE(dest + i * stride + 16, in2[i]);
-    RECON_AND_STORE(dest + i * stride + 24, in3[i]);
-  }
-}
-
-void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride, const TxfmParam *txfm_param) {
-  __m128i in0[16], in1[16], in2[16], in3[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  int i;
-
-  for (i = 0; i < 16; ++i) {
-    in0[i] = load_input_data(input + i * 32 + 0);
-    in1[i] = load_input_data(input + i * 32 + 8);
-    in2[i] = load_input_data(input + i * 32 + 16);
-    in3[i] = load_input_data(input + i * 32 + 24);
-  }
-
-  // Row transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case H_DCT:
-#endif
-      idct32_16col(in0, in1, in2, in3);
-      break;
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case H_ADST:
-    case H_FLIPADST:
-#endif
-      ihalfright32_16col(in0, in1, in2, in3);
-      break;
-#if CONFIG_EXT_TX
-    case V_FLIPADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
-#endif
-    default: assert(0); break;
-  }
-
-  scale_sqrt2_8x16(in0);
-  scale_sqrt2_8x16(in1);
-  scale_sqrt2_8x16(in2);
-  scale_sqrt2_8x16(in3);
-
-  // Column transform
-  switch (tx_type) {
-    case DCT_DCT:
-    case DCT_ADST:
-#if CONFIG_EXT_TX
-    case DCT_FLIPADST:
-    case V_DCT:
-#endif
-      aom_idct16_sse2(in0, in1);
-      aom_idct16_sse2(in2, in3);
-      break;
-    case ADST_DCT:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case FLIPADST_ADST:
-    case ADST_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case FLIPADST_DCT:
-    case V_ADST:
-    case V_FLIPADST:
-#endif
-      aom_iadst16_sse2(in0, in1);
-      aom_iadst16_sse2(in2, in3);
-      break;
-#if CONFIG_EXT_TX
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
-    case IDTX:
-      iidtx16_sse2(in0, in1);
-      iidtx16_sse2(in2, in3);
-      break;
-#endif
-    default: assert(0); break;
-  }
-
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-#if CONFIG_EXT_TX
-    case H_DCT:
-    case H_ADST:
-    case V_ADST:
-    case V_DCT:
-    case IDTX:
-#endif
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case FLIPADST_ADST:
-    case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break;
-    case DCT_FLIPADST:
-    case ADST_FLIPADST:
-    case H_FLIPADST:
-      for (i = 0; i < 16; ++i) {
-        __m128i tmp1 = in0[i];
-        __m128i tmp2 = in1[i];
-        in0[i] = mm_reverse_epi16(in3[i]);
-        in1[i] = mm_reverse_epi16(in2[i]);
-        in2[i] = mm_reverse_epi16(tmp2);
-        in3[i] = mm_reverse_epi16(tmp1);
-      }
-      break;
-    case FLIPADST_FLIPADST:
-      for (i = 0; i < 16; ++i) {
-        __m128i tmp1 = in0[i];
-        __m128i tmp2 = in1[i];
-        in0[i] = mm_reverse_epi16(in3[i]);
-        in1[i] = mm_reverse_epi16(in2[i]);
-        in2[i] = mm_reverse_epi16(tmp2);
-        in3[i] = mm_reverse_epi16(tmp1);
-      }
-      FLIPUD_PTR(dest, stride, 16);
-      break;
-#endif
-    default: assert(0); break;
-  }
-  write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride);
-}
diff --git a/third_party/aom/av1/common/x86/intra_edge_sse4.c b/third_party/aom/av1/common/x86/intra_edge_sse4.c
index ea4acff33..0c857b583 100644
--- a/third_party/aom/av1/common/x86/intra_edge_sse4.c
+++ b/third_party/aom/av1/common/x86/intra_edge_sse4.c
@@ -12,8 +12,8 @@
 #include <assert.h>
 #include <smmintrin.h>
 
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
 
 void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
   if (!strength) return;
@@ -39,9 +39,9 @@ void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
   // Adjust input pointer for filter support area
   uint8_t *in = (strength == 3) ? p - 1 : p;
 
-  // Avoid modifying first/last samples
+  // Avoid modifying first sample
   uint8_t *out = p + 1;
-  int len = sz - 2;
+  int len = sz - 1;
 
   const int use_3tap_filter = (strength < 3);
 
@@ -133,9 +133,9 @@ void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {
   // Adjust input pointer for filter support area
   uint16_t *in = (strength == 3) ? p - 1 : p;
 
-  // Avoid modifying first/last samples
+  // Avoid modifying first sample
   uint16_t *out = p + 1;
-  int len = sz - 2;
+  int len = sz - 1;
 
   const int use_3tap_filter = (strength < 3);
 
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
new file mode 100644
index 000000000..ac1d2c9ca
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
+                             int dst_stride0, int w, int h,
+                             InterpFilterParams *filter_params_x,
+                             InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int bd = 8;
+  int i, j;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint8_t *const src_ptr = src - fo_horiz;
+  const int bits = FILTER_BITS - conv_params->round_1;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi16(w0);
+  const __m256i wt1 = _mm256_set1_epi16(w1);
+  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+  __m256i filt[4], coeffs[4];
+
+  assert(bits >= 0);
+  assert(conv_params->round_0 > 0);
+
+  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+
+  const __m256i round_const =
+      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+
+  for (i = 0; i < h; i += 2) {
+    for (j = 0; j < w; j += 8) {
+      const __m256i data = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
+          _mm256_castsi128_si256(_mm_loadu_si128(
+              (__m128i *)(&src_ptr[i * src_stride + j + src_stride]))),
+          0x20);
+
+      __m256i res = convolve_lowbd_x(data, coeffs, filt);
+
+      res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+
+      res = _mm256_slli_epi16(res, bits);
+
+      const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+      // Accumulate values into the destination buffer
+      if (do_average) {
+        const __m256i data_ref_0 = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+            _mm256_castsi128_si256(_mm_loadu_si128(
+                (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+            0x20);
+
+        const __m256i comp_avg_res =
+            comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+        const __m256i round_result = convolve_rounding(
+            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+        const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+        const __m128i res_0 = _mm256_castsi256_si128(res_8);
+        const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+        if (w > 4) {
+          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+          _mm_storel_epi64(
+              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+        } else {
+          *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+          *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+              _mm_cvtsi128_si32(res_1);
+        }
+      } else {
+        const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+        const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                        res_1);
+      }
+    }
+  }
+}
+
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
+                             int dst_stride0, int w, int h,
+                             InterpFilterParams *filter_params_x,
+                             InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int bd = 8;
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint8_t *const src_ptr = src - fo_vert * src_stride;
+  // +1 to compensate for dividing the filter coeffs by 2
+  const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
+  const __m256i round_const =
+      _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
+  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi16(w0);
+  const __m256i wt1 = _mm256_set1_epi16(w1);
+  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi16(offset);
+  const int offset_1 = (1 << (bd + FILTER_BITS - 2));
+  const __m256i offset_const_1 = _mm256_set1_epi16(offset_1);
+  const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0));
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i coeffs[4], s[8];
+
+  assert((FILTER_BITS - conv_params->round_0) >= 0);
+
+  prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
+
+  (void)conv_params;
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+
+  for (j = 0; j < w; j += 16) {
+    const uint8_t *data = &src_ptr[j];
+    __m256i src6;
+
+    // Load lines a and b. Line a to lower 128, line b to upper 128
+    const __m256i src_01a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+        0x20);
+
+    const __m256i src_12a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+        0x20);
+
+    const __m256i src_23a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+        0x20);
+
+    const __m256i src_34a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+        0x20);
+
+    const __m256i src_45a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+        0x20);
+
+    src6 = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+    const __m256i src_56a = _mm256_permute2x128_si256(
+        _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+        src6, 0x20);
+
+    s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+    s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+    s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+    s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+    s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+    s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+    for (i = 0; i < h; i += 2) {
+      data = &src_ptr[i * src_stride + j];
+      const __m256i src_67a = _mm256_permute2x128_si256(
+          src6,
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+          0x20);
+
+      src6 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+      const __m256i src_78a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+          src6, 0x20);
+
+      s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+      s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+      __m256i res_lo = convolve_lowbd(s, coeffs);
+
+      res_lo = _mm256_add_epi16(res_lo, offset_const_1);
+
+      const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+      const __m256i res_lo_0_shift =
+          _mm256_slli_epi32(res_lo_0_32b, left_shift);
+      const __m256i res_lo_0_round = _mm256_sra_epi32(
+          _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+
+      const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+      const __m256i res_lo_1_shift =
+          _mm256_slli_epi32(res_lo_1_32b, left_shift);
+      const __m256i res_lo_1_round = _mm256_sra_epi32(
+          _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+
+      const __m256i res_lo_round =
+          _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
+
+      const __m256i res_lo_unsigned =
+          _mm256_add_epi16(res_lo_round, offset_const_2);
+
+      if (w - j < 16) {
+        if (do_average) {
+          const __m256i data_ref_0 = _mm256_permute2x128_si256(
+              _mm256_castsi128_si256(
+                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+              _mm256_castsi128_si256(_mm_loadu_si128(
+                  (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+              0x20);
+
+          const __m256i comp_avg_res =
+              comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+
+          const __m256i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+          const __m128i res_0 = _mm256_castsi256_si128(res_8);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+          if (w - j > 4) {
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storel_epi64(
+                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+          } else {
+            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+                _mm_cvtsi128_si32(res_0);
+            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+                _mm_cvtsi128_si32(res_1);
+          }
+        } else {
+          const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+          const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                          res_1);
+        }
+      } else {
+        __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+        res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+        const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+        const __m256i res_hi_0_shift =
+            _mm256_slli_epi32(res_hi_0_32b, left_shift);
+        const __m256i res_hi_0_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+        const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+        const __m256i res_hi_1_shift =
+            _mm256_slli_epi32(res_hi_1_32b, left_shift);
+        const __m256i res_hi_1_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+        const __m256i res_hi_round =
+            _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+        const __m256i res_hi_unsigned =
+            _mm256_add_epi16(res_hi_round, offset_const_2);
+
+        if (do_average) {
+          const __m256i data_ref_0_lo = _mm256_permute2x128_si256(
+              _mm256_castsi128_si256(
+                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+              _mm256_castsi128_si256(_mm_loadu_si128(
+                  (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+              0x20);
+
+          const __m256i data_ref_0_hi = _mm256_permute2x128_si256(
+              _mm256_castsi128_si256(
+                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j + 8]))),
+              _mm256_castsi128_si256(_mm_loadu_si128(
+                  (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]))),
+              0x20);
+
+          const __m256i comp_avg_res_lo =
+              comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+
+          const __m256i comp_avg_res_hi =
+              comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg);
+
+          const __m256i round_result_lo = convolve_rounding(
+              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+
+          const __m256i round_result_hi = convolve_rounding(
+              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+          const __m256i res_8 =
+              _mm256_packus_epi16(round_result_lo, round_result_hi);
+          const __m128i res_0 = _mm256_castsi256_si128(res_8);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+          _mm_store_si128(
+              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+        } else {
+          const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+          const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                          res_lo_1);
+
+          const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0);
+
+          const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1);
+          _mm_store_si128(
+              (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1);
+        }
+      }
+      s[0] = s[1];
+      s[1] = s[2];
+      s[2] = s[3];
+
+      s[4] = s[5];
+      s[5] = s[6];
+      s[6] = s[7];
+    }
+  }
+}
+
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
+                              int dst_stride0, int w, int h,
+                              InterpFilterParams *filter_params_x,
+                              InterpFilterParams *filter_params_y,
+                              const int subpel_x_q4, const int subpel_y_q4,
+                              ConvolveParams *conv_params) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int bd = 8;
+
+  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = 8;
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi16(w0);
+  const __m256i wt1 = _mm256_set1_epi16(w1);
+  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+  __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];
+
+  assert(conv_params->round_0 > 0);
+
+  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  const __m256i round_const_h = _mm256_set1_epi16(
+      ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
+  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+  const __m256i round_const_v = _mm256_set1_epi32(
+      ((1 << conv_params->round_1) >> 1) -
+      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    {
+      for (i = 0; i < im_h; i += 2) {
+        __m256i data = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+        if (i + 1 < im_h)
+          data = _mm256_inserti128_si256(
+              data,
+              _mm_loadu_si128(
+                  (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
+              1);
+        __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
+
+        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
+                               round_shift_h);
+
+        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+      }
+    }
+
+    /* Vertical filter */
+    {
+      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+      s[0] = _mm256_unpacklo_epi16(s0, s1);
+      s[1] = _mm256_unpacklo_epi16(s2, s3);
+      s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm256_unpackhi_epi16(s0, s1);
+      s[5] = _mm256_unpackhi_epi16(s2, s3);
+      s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+      for (i = 0; i < h; i += 2) {
+        const int16_t *data = &im_block[i * im_stride];
+
+        const __m256i s6 =
+            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+        const __m256i s7 =
+            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+        s[3] = _mm256_unpacklo_epi16(s6, s7);
+        s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+        const __m256i res_a = convolve(s, coeffs_y);
+        const __m256i res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+
+        if (w - j > 4) {
+          const __m256i res_b = convolve(s + 4, coeffs_y);
+          const __m256i res_b_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
+          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+          if (do_average) {
+            const __m256i data_ref_0 = _mm256_permute2x128_si256(
+                _mm256_castsi128_si256(
+                    _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+                _mm256_castsi128_si256(_mm_loadu_si128(
+                    (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+                0x20);
+
+            const __m256i comp_avg_res =
+                comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+            const __m256i round_result = convolve_rounding(
+                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+            const __m256i res_8 =
+                _mm256_packus_epi16(round_result, round_result);
+            const __m128i res_0 = _mm256_castsi256_si128(res_8);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storel_epi64(
+                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+          } else {
+            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_1);
+          }
+        } else {
+          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
+          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+          if (do_average) {
+            const __m256i data_ref_0 = _mm256_permute2x128_si256(
+                _mm256_castsi128_si256(
+                    _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+                _mm256_castsi128_si256(_mm_loadu_si128(
+                    (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+                0x20);
+
+            const __m256i comp_avg_res =
+                comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+            const __m256i round_result = convolve_rounding(
+                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+            const __m256i res_8 =
+                _mm256_packus_epi16(round_result, round_result);
+            const __m128i res_0 = _mm256_castsi256_si128(res_8);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+                _mm_cvtsi128_si32(res_0);
+            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+                _mm_cvtsi128_si32(res_1);
+
+          } else {
+            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_1);
+          }
+        }
+
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+    }
+  }
+}
+
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
+                                   uint8_t *dst0, int dst_stride0, int w, int h,
+                                   InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi16(w0);
+  const __m256i wt1 = _mm256_set1_epi16(w1);
+  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+  const __m256i zero = _mm256_setzero_si256();
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+  int i, j;
+
+  if (!(w % 16)) {
+    for (i = 0; i < h; i += 1) {
+      for (j = 0; j < w; j += 16) {
+        const __m256i src_16bit = _mm256_cvtepu8_epi16(
+            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])));
+
+        const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+        if (do_average) {
+          const __m256i data_ref_0 =
+              _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
+
+          const __m256i comp_avg_res =
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+          const __m256i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+          const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8);
+
+          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
+                          _mm256_castsi256_si128(res_0));
+        } else {
+          _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
+                             res_unsigned);
+        }
+      }
+    }
+  } else if (!(w % 4)) {
+    for (i = 0; i < h; i += 2) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i src_row_0 =
+            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
+        const __m128i src_row_1 =
+            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
+        // since not all compilers yet support _mm256_set_m128i()
+        const __m256i src_10 = _mm256_insertf128_si256(
+            _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+
+        const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero);
+
+        const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+
+        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+        // Accumulate values into the destination buffer
+        if (do_average) {
+          const __m256i data_ref_0 = _mm256_permute2x128_si256(
+              _mm256_castsi128_si256(
+                  _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+              _mm256_castsi128_si256(_mm_loadu_si128(
+                  (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+              0x20);
+
+          const __m256i comp_avg_res =
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+          const __m256i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+          const __m128i res_0 = _mm256_castsi256_si128(res_8);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+          if (w > 4) {
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storel_epi64(
+                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+          } else {
+            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+                _mm_cvtsi128_si32(res_0);
+            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+                _mm_cvtsi128_si32(res_1);
+          }
+        } else {
+          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                          res_1);
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
new file mode 100644
index 000000000..4df7bd42e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
+                             int dst_stride0, int w, int h,
+                             InterpFilterParams *filter_params_x,
+                             InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - fo_horiz;
+  const int bits = FILTER_BITS - conv_params->round_1;
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi16(w0);
+  const __m128i wt1 = _mm_set1_epi16(w1);
+  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+  __m128i coeffs[4];
+
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
+
+  if (w == 4) {
+    do {
+      const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+      __m128i s[4];
+
+      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+      s[1] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+      s[2] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+      s[3] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+      const __m128i res_lo = convolve_lo_x(s, coeffs);
+      const __m128i res_lo_round =
+          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+      const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+
+      const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift);
+      const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+      // Accumulate values into the destination buffer
+      if (do_average) {
+        const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+        const __m128i comp_avg_res =
+            comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+        const __m128i round_result = convolve_rounding(
+            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+        *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+      } else {
+        _mm_store_si128((__m128i *)(&dst[0]), res_unsigned);
+      }
+      src_ptr += src_stride;
+      dst += dst_stride;
+      dst0 += dst_stride0;
+    } while (--h);
+  } else {
+    assert(!(w % 8));
+    int i = 0;
+    do {
+      int j = 0;
+      do {
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        __m128i s[4];
+
+        // Filter even-index pixels
+        s[0] = data;
+        s[1] = _mm_srli_si128(data, 2);
+        s[2] = _mm_srli_si128(data, 4);
+        s[3] = _mm_srli_si128(data, 6);
+        const __m128i res_even = convolve_lo_x(s, coeffs);
+
+        // Filter odd-index pixels
+        s[0] = _mm_srli_si128(data, 1);
+        s[1] = _mm_srli_si128(data, 3);
+        s[2] = _mm_srli_si128(data, 5);
+        s[3] = _mm_srli_si128(data, 7);
+        const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+        const __m128i res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+        const __m128i res_hi_round =
+            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+        const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+        const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift);
+
+        const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+        const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+        // Accumulate values into the destination buffer
+        if (do_average) {
+          const __m128i data_ref_0 =
+              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+          const __m128i comp_avg_res =
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+          const __m128i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+        } else {
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+        }
+        j += 8;
+      } while (j < w);
+    } while (++i < h);
+  }
+}
+
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
+                             int dst_stride0, int w, int h,
+                             InterpFilterParams *filter_params_x,
+                             InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint8_t *src_ptr = src - fo_vert * src_stride;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset);
+  const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
+  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+  const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1);
+  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+  __m128i coeffs[4];
+
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+
+  if (w == 4) {
+    __m128i s[8], src6, res, res_shift;
+    src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+    s[0] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+    s[1] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+    s[2] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+    s[3] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+    s[4] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+    s[5] = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+
+    do {
+      s[6] = _mm_unpacklo_epi8(
+          src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
+      src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+      s[7] = _mm_unpacklo_epi8(
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+
+      res = convolve_lo_y(s + 0, coeffs);
+      res_shift = _mm_sll_epi32(res, left_shift);
+      res_shift =
+          _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+      __m128i res_16b = _mm_packs_epi32(res_shift, res_shift);
+      __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+      // Accumulate values into the destination buffer
+      if (do_average) {
+        const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+        const __m128i comp_avg_res =
+            comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+        const __m128i round_result = convolve_rounding(
+            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+        *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+      } else {
+        _mm_store_si128((__m128i *)dst, res_unsigned);
+      }
+
+      src_ptr += src_stride;
+      dst += dst_stride;
+      dst0 += dst_stride0;
+
+      res = convolve_lo_y(s + 1, coeffs);
+      res_shift = _mm_sll_epi32(res, left_shift);
+      res_shift =
+          _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+      res_16b = _mm_packs_epi32(res_shift, res_shift);
+      res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+      // Accumulate values into the destination buffer
+      if (do_average) {
+        const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+        const __m128i comp_avg_res =
+            comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+        const __m128i round_result = convolve_rounding(
+            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+        *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+      } else {
+        _mm_store_si128((__m128i *)dst, res_unsigned);
+      }
+
+      src_ptr += src_stride;
+      dst += dst_stride;
+      dst0 += dst_stride0;
+
+      s[0] = s[2];
+      s[1] = s[3];
+      s[2] = s[4];
+      s[3] = s[5];
+      s[4] = s[6];
+      s[5] = s[7];
+      h -= 2;
+    } while (h);
+  } else {
+    assert(!(w % 8));
+    int j = 0;
+    do {
+      __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift;
+      const uint8_t *data = &src_ptr[j];
+
+      src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+      s[0] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+      s[1] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+      s[2] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+      s[3] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+      s[4] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+      s[5] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+      int i = 0;
+      do {
+        data = &src_ptr[i * src_stride + j];
+        s[6] = _mm_unpacklo_epi8(
+            src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+        src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+        s[7] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+        res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
+        res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels
+        res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+        res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+        res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+                                     round_shift);
+        res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+                                     round_shift);
+
+        __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+        __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+        // Accumulate values into the destination buffer
+        if (do_average) {
+          const __m128i data_ref_0 =
+              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+          const __m128i comp_avg_res =
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+          const __m128i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+        } else {
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+        }
+        i++;
+
+        res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
+        res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels
+        res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+        res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+        res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+                                     round_shift);
+        res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+                                     round_shift);
+        res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+        res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+        // Accumulate values into the destination buffer
+        if (do_average) {
+          __m128i data_ref_0 =
+              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+          const __m128i comp_avg_res =
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+          const __m128i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+        } else {
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+        }
+        i++;
+
+        s[0] = s[2];
+        s[1] = s[3];
+        s[2] = s[4];
+        s[3] = s[5];
+        s[4] = s[6];
+        s[5] = s[7];
+      } while (i < h);
+      j += 8;
+    } while (j < w);
+  }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
new file mode 100644
index 000000000..e4d51ac8d
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
+                               uint8_t *dst0, int dst_stride0, int w, int h,
+                               InterpFilterParams *filter_params_x,
+                               InterpFilterParams *filter_params_y,
+                               const int subpel_x_q4, const int subpel_y_q4,
+                               ConvolveParams *conv_params) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int bd = 8;
+
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = MAX_SB_SIZE;
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int do_average = conv_params->do_average;
+  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+  const __m128i zero = _mm_setzero_si128();
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi16(w0);
+  const __m128i wt1 = _mm_set1_epi16(w1);
+  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+  /* Horizontal filter */
+  {
+    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+        *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    const __m128i round_const = _mm_set1_epi32(
+        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+    for (i = 0; i < im_h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+        const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
+        const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
+
+        // Filter even-index pixels
+        const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
+        const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                         _mm_add_epi32(res_2, res_6));
+        res_even =
+            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+        // Filter odd-index pixels
+        const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2);
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                        _mm_add_epi32(res_3, res_7));
+        res_odd =
+            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+        __m128i res = _mm_packs_epi32(res_even, res_odd);
+        _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);
+      }
+    }
+  }
+
+  /* Vertical filter */
+  {
+    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    const __m128i round_const = _mm_set1_epi32(
+        ((1 << conv_params->round_1) >> 1) -
+        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        // Filter even-index pixels
+        const int16_t *data = &im_block[i * im_stride + j];
+        const __m128i src_0 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+                               *(__m128i *)(data + 1 * im_stride));
+        const __m128i src_2 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+                               *(__m128i *)(data + 3 * im_stride));
+        const __m128i src_4 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+                               *(__m128i *)(data + 5 * im_stride));
+        const __m128i src_6 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+                               *(__m128i *)(data + 7 * im_stride));
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                               _mm_add_epi32(res_4, res_6));
+
+        // Filter odd-index pixels
+        const __m128i src_1 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+                               *(__m128i *)(data + 1 * im_stride));
+        const __m128i src_3 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+                               *(__m128i *)(data + 3 * im_stride));
+        const __m128i src_5 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+                               *(__m128i *)(data + 5 * im_stride));
+        const __m128i src_7 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+                               *(__m128i *)(data + 7 * im_stride));
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                              _mm_add_epi32(res_5, res_7));
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        const __m128i res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+        const __m128i res_hi_round =
+            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+        const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
+        const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+        // Accumulate values into the destination buffer
+        if (do_average) {
+          const __m128i data_ref_0 =
+              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+          const __m128i comp_avg_res =
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+          const __m128i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+          if (w > 4)
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+          else
+            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+                _mm_cvtsi128_si32(res_8);
+        } else {
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/pvq_sse4.c b/third_party/aom/av1/common/x86/pvq_sse4.c
deleted file mode 100644
index b3ed9efdf..000000000
--- a/third_party/aom/av1/common/x86/pvq_sse4.c
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h>
-#include <emmintrin.h>
-#include <tmmintrin.h>
-#include <float.h>
-
-#include "./av1_rtcd.h"
-#include "av1/common/x86/pvq_sse4.h"
-#include "../odintrin.h"
-#include "av1/common/pvq.h"
-
-#define EPSILON 1e-15f
-
-static __m128 horizontal_sum_ps(__m128 x) {
-  x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2)));
-  x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)));
-  return x;
-}
-
-static __m128i horizontal_sum_epi32(__m128i x) {
-  x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
-  x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)));
-  return x;
-}
-
-static INLINE float rsqrtf(float x) {
-  float y;
-  _mm_store_ss(&y, _mm_rsqrt_ss(_mm_load_ss(&x)));
-  return y;
-}
-
-/** Find the codepoint on the given PSphere closest to the desired
- * vector. This is a float-precision PVQ search just to make sure
- * our tests aren't limited by numerical accuracy. It's close to the
- * pvq_search_rdo_double_c implementation, but is not bit accurate and
- * it performs slightly worse on PSNR. One reason is that this code runs
- * more RDO iterations than the C code. It also uses single precision
- * floating point math, whereas the C version uses double precision.
- *
- * @param [in]      xcoeff  input vector to quantize (x in the math doc)
- * @param [in]      n       number of dimensions
- * @param [in]      k       number of pulses
- * @param [out]     ypulse  optimal codevector found (y in the math doc)
- * @param [in]      g2      multiplier for the distortion (typically squared
- *                          gain units)
- * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
- * @param [in]      prev_k  number of pulses already in ypulse that we should
- *                          reuse for the search (or 0 for a new search)
- * @return                  cosine distance between x and y (between 0 and 1)
- */
-double pvq_search_rdo_double_sse4_1(const od_val16 *xcoeff, int n, int k,
-                                    int *ypulse, double g2,
-                                    double pvq_norm_lambda, int prev_k) {
-  int i, j;
-  int reuse_pulses = prev_k > 0 && prev_k <= k;
-  /* TODO - This blows our 8kB stack space budget and should be fixed when
-   converting PVQ to fixed point. */
-  float xx = 0, xy = 0, yy = 0;
-  float x[MAXN + 3];
-  float y[MAXN + 3];
-  float sign_y[MAXN + 3];
-  for (i = 0; i < n; i++) {
-    float tmp = (float)xcoeff[i];
-    xx += tmp * tmp;
-    x[i] = xcoeff[i];
-  }
-
-  x[n] = x[n + 1] = x[n + 2] = 0;
-  ypulse[n] = ypulse[n + 1] = ypulse[n + 2] = 0;
-
-  __m128 sums = _mm_setzero_ps();
-  for (i = 0; i < n; i += 4) {
-    __m128 x4 = _mm_loadu_ps(&x[i]);
-    __m128 s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
-    /* Save the sign, we'll put it back later. */
-    _mm_storeu_ps(&sign_y[i], s4);
-    /* Get rid of the sign. */
-    x4 = _mm_andnot_ps(_mm_set_ps1(-0.f), x4);
-    sums = _mm_add_ps(sums, x4);
-    if (!reuse_pulses) {
-      /* Clear y and ypulse in case we don't do the projection. */
-      _mm_storeu_ps(&y[i], _mm_setzero_ps());
-      _mm_storeu_si128((__m128i *)&ypulse[i], _mm_setzero_si128());
-    }
-    _mm_storeu_ps(&x[i], x4);
-  }
-  sums = horizontal_sum_ps(sums);
-  int pulses_left = k;
-  {
-    __m128i pulses_sum;
-    __m128 yy4, xy4;
-    xy4 = yy4 = _mm_setzero_ps();
-    pulses_sum = _mm_setzero_si128();
-    if (reuse_pulses) {
-      /* We reuse pulses from a previous search so we don't have to search them
-          again. */
-      for (j = 0; j < n; j += 4) {
-        __m128 x4, y4;
-        __m128i iy4;
-        iy4 = _mm_abs_epi32(_mm_loadu_si128((__m128i *)&ypulse[j]));
-        pulses_sum = _mm_add_epi32(pulses_sum, iy4);
-        _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
-        y4 = _mm_cvtepi32_ps(iy4);
-        x4 = _mm_loadu_ps(&x[j]);
-        xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
-        yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
-        /* Double the y[] vector so we don't have to do it in the search loop.
-         */
-        _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
-      }
-      pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
-      xy4 = horizontal_sum_ps(xy4);
-      xy = _mm_cvtss_f32(xy4);
-      yy4 = horizontal_sum_ps(yy4);
-      yy = _mm_cvtss_f32(yy4);
-    } else if (k > (n >> 1)) {
-      /* Do a pre-search by projecting on the pyramid. */
-      __m128 rcp4;
-      float sum = _mm_cvtss_f32(sums);
-      /* If x is too small, just replace it with a pulse at 0. This prevents
-         infinities and NaNs from causing too many pulses to be allocated. Here,
-         64 is an
-         approximation of infinity. */
-      if (sum <= EPSILON) {
-        x[0] = 1.f;
-        for (i = 1; i < n; i++) {
-          x[i] = 0;
-        }
-        sums = _mm_set_ps1(1.f);
-      }
-      /* Using k + e with e < 1 guarantees we cannot get more than k pulses. */
-      rcp4 = _mm_mul_ps(_mm_set_ps1((float)k + .8f), _mm_rcp_ps(sums));
-      xy4 = yy4 = _mm_setzero_ps();
-      pulses_sum = _mm_setzero_si128();
-      for (j = 0; j < n; j += 4) {
-        __m128 rx4, x4, y4;
-        __m128i iy4;
-        x4 = _mm_loadu_ps(&x[j]);
-        rx4 = _mm_mul_ps(x4, rcp4);
-        iy4 = _mm_cvttps_epi32(rx4);
-        pulses_sum = _mm_add_epi32(pulses_sum, iy4);
-        _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
-        y4 = _mm_cvtepi32_ps(iy4);
-        xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
-        yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
-        /* Double the y[] vector so we don't have to do it in the search loop.
-         */
-        _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
-      }
-      pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
-      xy = _mm_cvtss_f32(horizontal_sum_ps(xy4));
-      yy = _mm_cvtss_f32(horizontal_sum_ps(yy4));
-    }
-    x[n] = x[n + 1] = x[n + 2] = -100;
-    y[n] = y[n + 1] = y[n + 2] = 100;
-  }
-
-  /* This should never happen. */
-  OD_ASSERT(pulses_left <= n + 3);
-
-  float lambda_delta_rate[MAXN + 3];
-  if (pulses_left) {
-    /* Hoist lambda to avoid the multiply in the loop. */
-    float lambda =
-        0.5f * sqrtf(xx) * (float)pvq_norm_lambda / (FLT_MIN + (float)g2);
-    float delta_rate = 3.f / n;
-    __m128 count = _mm_set_ps(3, 2, 1, 0);
-    for (i = 0; i < n; i += 4) {
-      _mm_storeu_ps(&lambda_delta_rate[i],
-                    _mm_mul_ps(count, _mm_set_ps1(lambda * delta_rate)));
-      count = _mm_add_ps(count, _mm_set_ps(4, 4, 4, 4));
-    }
-  }
-  lambda_delta_rate[n] = lambda_delta_rate[n + 1] = lambda_delta_rate[n + 2] =
-      1e30f;
-
-  for (i = 0; i < pulses_left; i++) {
-    int best_id = 0;
-    __m128 xy4, yy4;
-    __m128 max, max2;
-    __m128i count;
-    __m128i pos;
-
-    /* The squared magnitude term gets added anyway, so we might as well
-        add it outside the loop. */
-    yy = yy + 1;
-    xy4 = _mm_load1_ps(&xy);
-    yy4 = _mm_load1_ps(&yy);
-    max = _mm_setzero_ps();
-    pos = _mm_setzero_si128();
-    count = _mm_set_epi32(3, 2, 1, 0);
-    for (j = 0; j < n; j += 4) {
-      __m128 x4, y4, r4;
-      x4 = _mm_loadu_ps(&x[j]);
-      y4 = _mm_loadu_ps(&y[j]);
-      x4 = _mm_add_ps(x4, xy4);
-      y4 = _mm_add_ps(y4, yy4);
-      y4 = _mm_rsqrt_ps(y4);
-      r4 = _mm_mul_ps(x4, y4);
-      /* Subtract lambda. */
-      r4 = _mm_sub_ps(r4, _mm_loadu_ps(&lambda_delta_rate[j]));
-      /* Update the index of the max. */
-      pos = _mm_max_epi16(
-          pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
-      /* Update the max. */
-      max = _mm_max_ps(max, r4);
-      /* Update the indices (+4) */
-      count = _mm_add_epi32(count, _mm_set_epi32(4, 4, 4, 4));
-    }
-    /* Horizontal max. */
-    max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
-    max2 =
-        _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
-    /* Now that max2 contains the max at all positions, look at which value(s)
-       of the
-        partial max is equal to the global max. */
-    pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
-    pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
-    pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
-    best_id = _mm_cvtsi128_si32(pos);
-    OD_ASSERT(best_id < n);
-    /* Updating the sums of the new pulse(s) */
-    xy = xy + x[best_id];
-    /* We're multiplying y[j] by two so we don't have to do it here. */
-    yy = yy + y[best_id];
-    /* Only now that we've made the final choice, update y/ypulse. */
-    /* Multiplying y[j] by 2 so we don't have to do it everywhere else. */
-    y[best_id] += 2;
-    ypulse[best_id]++;
-  }
-
-  /* Put the original sign back. */
-  for (i = 0; i < n; i += 4) {
-    __m128i y4;
-    __m128i s4;
-    y4 = _mm_loadu_si128((__m128i *)&ypulse[i]);
-    s4 = _mm_castps_si128(_mm_loadu_ps(&sign_y[i]));
-    y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
-    _mm_storeu_si128((__m128i *)&ypulse[i], y4);
-  }
-  return xy * rsqrtf(xx * yy + FLT_MIN);
-}
diff --git a/third_party/aom/av1/common/x86/pvq_sse4.h b/third_party/aom/av1/common/x86/pvq_sse4.h
deleted file mode 100644
index 3c4ce8543..000000000
--- a/third_party/aom/av1/common/x86/pvq_sse4.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#ifndef AOM_COMMON_PVQ_X86_SSE4_H_
-#define AOM_COMMON_PVQ_X86_SSE4_H_
-#endif  // AOM_COMMON_PVQ_X86_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c
new file mode 100644
index 000000000..ffbb31849
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_avx2.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/blockd.h"
+
+void av1_build_compound_diffwtd_mask_highbd_avx2(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+    int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+    int bd) {
+  if (w < 16) {
+    av1_build_compound_diffwtd_mask_highbd_ssse3(
+        mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd);
+  } else {
+    assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+    assert(bd >= 8);
+    assert((w % 16) == 0);
+    const __m256i y0 = _mm256_setzero_si256();
+    const __m256i yAOM_BLEND_A64_MAX_ALPHA =
+        _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+    const int mask_base = 38;
+    const __m256i ymask_base = _mm256_set1_epi16(mask_base);
+    const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);
+    const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
+    if (bd == 8) {
+      if (mask_type == DIFFWTD_38_INV) {
+        for (int i = 0; i < h; ++i) {
+          for (int j = 0; j < w; j += 16) {
+            __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+            __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+            __m256i diff = _mm256_srai_epi16(
+                _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+            __m256i m = _mm256_min_epi16(
+                _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+                yAOM_BLEND_A64_MAX_ALPHA);
+            m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);
+            m = _mm256_packus_epi16(m, m);
+            m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+            __m128i m0 = _mm256_castsi256_si128(m);
+            _mm_storeu_si128((__m128i *)&mask[j], m0);
+          }
+          ssrc0 += src0_stride;
+          ssrc1 += src1_stride;
+          mask += w;
+        }
+      } else {
+        for (int i = 0; i < h; ++i) {
+          for (int j = 0; j < w; j += 16) {
+            __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+            __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+            __m256i diff = _mm256_srai_epi16(
+                _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+            __m256i m = _mm256_min_epi16(
+                _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+                yAOM_BLEND_A64_MAX_ALPHA);
+            m = _mm256_packus_epi16(m, m);
+            m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+            __m128i m0 = _mm256_castsi256_si128(m);
+            _mm_storeu_si128((__m128i *)&mask[j], m0);
+          }
+          ssrc0 += src0_stride;
+          ssrc1 += src1_stride;
+          mask += w;
+        }
+      }
+    } else {
+      const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
+      if (mask_type == DIFFWTD_38_INV) {
+        for (int i = 0; i < h; ++i) {
+          for (int j = 0; j < w; j += 16) {
+            __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+            __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+            __m256i diff = _mm256_sra_epi16(
+                _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+            __m256i m = _mm256_min_epi16(
+                _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+                yAOM_BLEND_A64_MAX_ALPHA);
+            m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);
+            m = _mm256_packus_epi16(m, m);
+            m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+            __m128i m0 = _mm256_castsi256_si128(m);
+            _mm_storeu_si128((__m128i *)&mask[j], m0);
+          }
+          ssrc0 += src0_stride;
+          ssrc1 += src1_stride;
+          mask += w;
+        }
+      } else {
+        for (int i = 0; i < h; ++i) {
+          for (int j = 0; j < w; j += 16) {
+            __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+            __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+            __m256i diff = _mm256_sra_epi16(
+                _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+            __m256i m = _mm256_min_epi16(
+                _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+                yAOM_BLEND_A64_MAX_ALPHA);
+            m = _mm256_packus_epi16(m, m);
+            m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+            __m128i m0 = _mm256_castsi256_si128(m);
+            _mm_storeu_si128((__m128i *)&mask[j], m0);
+          }
+          ssrc0 += src0_stride;
+          ssrc1 += src1_stride;
+          mask += w;
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_sse4.c b/third_party/aom/av1/common/x86/reconinter_sse4.c
new file mode 100644
index 000000000..5171ca493
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_sse4.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "av1/common/blockd.h"
+
+static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0,
+                                const __m128i s1) {
+  const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1));
+  return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4)));
+  // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54)
+}
+
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask,
+                                            DIFFWTD_MASK_TYPE mask_type,
+                                            const uint8_t *src0, int stride0,
+                                            const uint8_t *src1, int stride1,
+                                            int h, int w) {
+  const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+  const __m128i mask_base = _mm_set1_epi16(38 - mb);
+  int i = 0;
+  if (4 == w) {
+    do {
+      const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0);
+      const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0));
+      const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+      const __m128i s0 = _mm_cvtepu8_epi16(s0AB);
+
+      const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1);
+      const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1));
+      const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+      const __m128i s1 = _mm_cvtepu8_epi16(s1AB);
+
+      const __m128i m16 = calc_mask(mask_base, s0, s1);
+      const __m128i m8 = _mm_packus_epi16(m16, m16);
+
+      *(uint32_t *)mask = _mm_cvtsi128_si32(m8);
+      *(uint32_t *)(mask + w) = _mm_extract_epi32(m8, 1);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += 8;
+      i += 2;
+    } while (i < h);
+  } else if (8 == w) {
+    do {
+      __m128i s0 = _mm_loadl_epi64((__m128i const *)src0);
+      __m128i s1 = _mm_loadl_epi64((__m128i const *)src1);
+      s0 = _mm_cvtepu8_epi16(s0);
+      s1 = _mm_cvtepu8_epi16(s1);
+      const __m128i m16 = calc_mask(mask_base, s0, s1);
+      const __m128i m8 = _mm_packus_epi16(m16, m16);
+      _mm_storel_epi64((__m128i *)mask, m8);
+      src0 += stride0;
+      src1 += stride1;
+      mask += 8;
+      i += 1;
+    } while (i < h);
+  } else {
+    const __m128i zero = _mm_setzero_si128();
+    do {
+      int j = 0;
+      do {
+        const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j));
+        const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j));
+        const __m128i s0L = _mm_cvtepu8_epi16(s0);
+        const __m128i s1L = _mm_cvtepu8_epi16(s1);
+        const __m128i s0H = _mm_unpackhi_epi8(s0, zero);
+        const __m128i s1H = _mm_unpackhi_epi8(s1, zero);
+
+        const __m128i m16L = calc_mask(mask_base, s0L, s1L);
+        const __m128i m16H = calc_mask(mask_base, s0H, s1H);
+
+        const __m128i m8 = _mm_packus_epi16(m16L, m16H);
+        _mm_store_si128((__m128i *)(mask + j), m8);
+        j += 16;
+      } while (j < w);
+      src0 += stride0;
+      src1 += stride1;
+      mask += w;
+      i += 1;
+    } while (i < h);
+  }
+}
+
+void av1_build_compound_diffwtd_mask_d16_sse4_1(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1;
+  const int mask_base = 38;
+  int round =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+  const __m128i round_const = _mm_set1_epi16((1 << round) >> 1);
+  const __m128i mask_base_16 = _mm_set1_epi16(mask_base);
+  const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i add_const =
+      _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0));
+  const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1));
+
+  int i, j;
+  // When rounding constant is added, there is a possibility of overflow.
+  // However that much precision is not required. Code should very well work for
+  // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But
+  // there is a possibility of corner case bugs.
+  assert(DIFF_FACTOR_LOG2 == 4);
+  assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; j += 8) {
+      const __m128i data_src0 =
+          _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]);
+      const __m128i data_src1 =
+          _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]);
+
+      const __m128i diffa = _mm_subs_epu16(data_src0, data_src1);
+      const __m128i diffb = _mm_subs_epu16(data_src1, data_src0);
+      const __m128i diff = _mm_max_epu16(diffa, diffb);
+      const __m128i diff_round =
+          _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round);
+      const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+      const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16);
+      __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff);
+      // clamp to 0 can be skipped since we are using add and saturate
+      // instruction
+
+      const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign);
+      const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const);
+
+      // 8 bit conversion and saturation to uint8
+      const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16);
+
+      // Store values into the destination buffer
+      __m128i *const dst = (__m128i *)&mask[i * w + j];
+
+      if ((w - j) > 4) {
+        _mm_storel_epi64(dst, res_8);
+      } else {  // w==4
+        *(uint32_t *)dst = _mm_cvtsi128_si32(res_8);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_ssse3.c b/third_party/aom/av1/common/x86/reconinter_ssse3.c
new file mode 100644
index 000000000..cf684447c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_ssse3.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/blockd.h"
+
+void av1_build_compound_diffwtd_mask_highbd_ssse3(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+    int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+    int bd) {
+  if (w < 8) {
+    av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride,
+                                             src1, src1_stride, h, w, bd);
+  } else {
+    assert(bd >= 8);
+    assert((w % 8) == 0);
+    assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+    const __m128i x0 = _mm_setzero_si128();
+    const __m128i xAOM_BLEND_A64_MAX_ALPHA =
+        _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+    const int mask_base = 38;
+    const __m128i xmask_base = _mm_set1_epi16(mask_base);
+    const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);
+    const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
+    if (bd == 8) {
+      if (mask_type == DIFFWTD_38_INV) {
+        for (int i = 0; i < h; ++i) {
+          for (int j = 0; j < w; j += 8) {
+            __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+            __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+            __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),
+                                          DIFF_FACTOR_LOG2);
+            __m128i m = _mm_min_epi16(
+                _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+                xAOM_BLEND_A64_MAX_ALPHA);
+            m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);
+            m = _mm_packus_epi16(m, m);
+            _mm_storel_epi64((__m128i *)&mask[j], m);
+          }
+          ssrc0 += src0_stride;
+          ssrc1 += src1_stride;
+          mask += w;
+        }
+      } else {
+        for (int i = 0; i < h; ++i) {
+          for (int j = 0; j < w; j += 8) {
+            __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+            __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+            __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),
+                                          DIFF_FACTOR_LOG2);
+            __m128i m = _mm_min_epi16(
+                _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+                xAOM_BLEND_A64_MAX_ALPHA);
+            m = _mm_packus_epi16(m, m);
+            _mm_storel_epi64((__m128i *)&mask[j], m);
+          }
+          ssrc0 += src0_stride;
+          ssrc1 += src1_stride;
+          mask += w;
+        }
+      }
+    } else {
+      const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
+      if (mask_type == DIFFWTD_38_INV) {
+        for (int i = 0; i < h; ++i) {
+          for (int j = 0; j < w; j += 8) {
+            __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+            __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+            __m128i diff =
+                _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+            __m128i m = _mm_min_epi16(
+                _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+                xAOM_BLEND_A64_MAX_ALPHA);
+            m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);
+            m = _mm_packus_epi16(m, m);
+            _mm_storel_epi64((__m128i *)&mask[j], m);
+          }
+          ssrc0 += src0_stride;
+          ssrc1 += src1_stride;
+          mask += w;
+        }
+      } else {
+        for (int i = 0; i < h; ++i) {
+          for (int j = 0; j < w; j += 8) {
+            __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+            __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+            __m128i diff =
+                _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+            __m128i m = _mm_min_epi16(
+                _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+                xAOM_BLEND_A64_MAX_ALPHA);
+            m = _mm_packus_epi16(m, m);
+            _mm_storel_epi64((__m128i *)&mask[j], m);
+          }
+          ssrc0 += src0_stride;
+          ssrc1 += src1_stride;
+          mask += w;
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c
new file mode 100644
index 000000000..375def62e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_avx2.c
@@ -0,0 +1,719 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/restoration.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to
+// 32-bit precision and return them in an AVX2 register.
+static __m256i yy256_load_extend_8_32(const void *p) {
+  return _mm256_cvtepu8_epi32(xx_loadl_64(p));
+}
+
+// Load 8 halfwords from the possibly-misaligned pointer p, extend each
+// halfword to 32-bit precision and return them in an AVX2 register.
+static __m256i yy256_load_extend_16_32(const void *p) {
+  return _mm256_cvtepu16_epi32(xx_loadu_128(p));
+}
+
+// Compute the scan of an AVX2 register holding 8 32-bit integers. If the
+// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ...,
+// x0+x1+...+x7
+//
+// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers
+// (assumed small enough to be able to add them without overflow).
+//
+// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a.
+//
+// x   = [h g f e][d c b a]
+// x01 = [g f e 0][c b a 0]
+// x02 = [g+h f+g e+f e][c+d b+c a+b a]
+// x03 = [e+f e 0 0][a+b a 0 0]
+// x04 = [e->h e->g e->f e][a->d a->c a->b a]
+// s   = a->d
+// s01 = [a->d a->d a->d a->d]
+// s02 = [a->d a->d a->d a->d][0 0 0 0]
+// ret = [a->h a->g a->f a->e][a->d a->c a->b a]
+static __m256i scan_32(__m256i x) {
+  const __m256i x01 = _mm256_slli_si256(x, 4);
+  const __m256i x02 = _mm256_add_epi32(x, x01);
+  const __m256i x03 = _mm256_slli_si256(x02, 8);
+  const __m256i x04 = _mm256_add_epi32(x02, x03);
+  const int32_t s = _mm256_extract_epi32(x04, 3);
+  const __m128i s01 = _mm_set1_epi32(s);
+  const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1);
+  return _mm256_add_epi32(x04, s02);
+}
+
+// Compute two integral images from src. B sums elements; A sums their
+// squares. The images are offset by one pixel, so will have width and height
+// equal to width + 1, height + 1 and the first row and column will be zero.
+//
+// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple
+// of 8.
+
+static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) {
+  unsigned int i = 0;
+  for (i = 0; i < (count & 0xffffffe0); i += 32) {
+    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+    _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero);
+    _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero);
+    _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero);
+  }
+  for (; i < (count & 0xfffffff8); i += 8) {
+    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+  }
+  for (; i < count; i++) {
+    dest[i] = 0;
+  }
+  return dest;
+}
+
+static void integral_images(const uint8_t *src, int src_stride, int width,
+                            int height, int32_t *A, int32_t *B,
+                            int buf_stride) {
+  const __m256i zero = _mm256_setzero_si256();
+  // Write out the zero top row
+  memset_zero_avx(A, &zero, (width + 8));
+  memset_zero_avx(B, &zero, (width + 8));
+  for (int i = 0; i < height; ++i) {
+    // Zero the left column.
+    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+    // ldiff is the difference H - D where H is the output sample immediately
+    // to the left and D is the output sample above it. These are scalars,
+    // replicated across the eight lanes.
+    __m256i ldiff1 = zero, ldiff2 = zero;
+    for (int j = 0; j < width; j += 8) {
+      const int ABj = 1 + j;
+
+      const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
+      const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+      const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride);
+      const __m256i x2 = _mm256_madd_epi16(x1, x1);
+
+      const __m256i sc1 = scan_32(x1);
+      const __m256i sc2 = scan_32(x2);
+
+      const __m256i row1 =
+          _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+      const __m256i row2 =
+          _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+      yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+      yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+      // Calculate the new H - D.
+      ldiff1 = _mm256_set1_epi32(
+          _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+      ldiff2 = _mm256_set1_epi32(
+          _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+    }
+  }
+}
+
+// Compute two integral images from src. B sums elements; A sums their squares
+//
+// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8.
+static void integral_images_highbd(const uint16_t *src, int src_stride,
+                                   int width, int height, int32_t *A,
+                                   int32_t *B, int buf_stride) {
+  const __m256i zero = _mm256_setzero_si256();
+  // Write out the zero top row
+  memset_zero_avx(A, &zero, (width + 8));
+  memset_zero_avx(B, &zero, (width + 8));
+
+  for (int i = 0; i < height; ++i) {
+    // Zero the left column.
+    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+    // ldiff is the difference H - D where H is the output sample immediately
+    // to the left and D is the output sample above it. These are scalars,
+    // replicated across the eight lanes.
+    __m256i ldiff1 = zero, ldiff2 = zero;
+    for (int j = 0; j < width; j += 8) {
+      const int ABj = 1 + j;
+
+      const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
+      const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+      const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride);
+      const __m256i x2 = _mm256_madd_epi16(x1, x1);
+
+      const __m256i sc1 = scan_32(x1);
+      const __m256i sc2 = scan_32(x2);
+
+      const __m256i row1 =
+          _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+      const __m256i row2 =
+          _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+      yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+      yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+      // Calculate the new H - D.
+      ldiff1 = _mm256_set1_epi32(
+          _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+      ldiff2 = _mm256_set1_epi32(
+          _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+    }
+  }
+}
+
+// Compute 8 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
+static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+  const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);
+  const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
+  const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride);
+  const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride);
+  const __m256i u = _mm256_sub_epi32(tr, tl);
+  const __m256i v = _mm256_sub_epi32(br, bl);
+  return _mm256_sub_epi32(v, u);
+}
+
+static __m256i round_for_shift(unsigned shift) {
+  return _mm256_set1_epi32((1 << shift) >> 1);
+}
+
+static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) {
+  __m256i an, bb;
+  if (bit_depth > 8) {
+    const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8));
+    const __m256i rounding_b = round_for_shift(bit_depth - 8);
+    const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
+    const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
+    const __m256i a =
+        _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a);
+    const __m256i b =
+        _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b);
+    // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
+    // mullo to square it
+    bb = _mm256_madd_epi16(b, b);
+    an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb);
+  } else {
+    bb = _mm256_madd_epi16(sum1, sum1);
+    an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n));
+  }
+  return _mm256_sub_epi32(an, bb);
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+                    int width, int height, int buf_stride, int bit_depth,
+                    int sgr_params_idx, int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int n = (2 * r + 1) * (2 * r + 1);
+  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+  // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+  const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+
+  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+  // Set up masks
+  const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+  __m256i mask[8];
+  for (int idx = 0; idx < 8; idx++) {
+    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+  }
+
+  for (int i = -1; i < height + 1; ++i) {
+    for (int j = -1; j < width + 1; j += 8) {
+      const int32_t *Cij = C + i * buf_stride + j;
+      const int32_t *Dij = D + i * buf_stride + j;
+
+      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+      // some uninitialised data in their upper words. We use a mask to
+      // ensure that these bits are set to 0.
+      int idx = AOMMIN(8, width + 1 - j);
+      assert(idx >= 1);
+
+      if (idx < 8) {
+        sum1 = _mm256_and_si256(mask[idx], sum1);
+        sum2 = _mm256_and_si256(mask[idx], sum2);
+      }
+
+      const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+
+      const __m256i z = _mm256_min_epi32(
+          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+                            SGRPROJ_MTABLE_BITS),
+          _mm256_set1_epi32(255));
+
+      const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);
+
+      yy_storeu_256(A + i * buf_stride + j, a_res);
+
+      const __m256i a_complement =
+          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+      // sum1 might have lanes greater than 2^15, so we can't use madd to do
+      // multiplication involving sum1. However, a_complement and one_over_n
+      // are both less than 256, so we can multiply them first.
+      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
+                                              SGRPROJ_RECIP_BITS);
+
+      yy_storeu_256(B + i * buf_stride + j, b_res);
+    }
+  }
+}
+
+// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
+// where the outer four corners have weight 3 and all other pixels have weight
+// 4.
+//
+// Pixels are indexed as follows:
+// xtl  xt   xtr
+// xl    x   xr
+// xbl  xb   xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+//           = 4 * (fours + threes) - threes
+//           = (fours + threes) << 2 - threes
+static INLINE __m256i cross_sum(const int32_t *buf, int stride) {
+  const __m256i xtl = yy_loadu_256(buf - 1 - stride);
+  const __m256i xt = yy_loadu_256(buf - stride);
+  const __m256i xtr = yy_loadu_256(buf + 1 - stride);
+  const __m256i xl = yy_loadu_256(buf - 1);
+  const __m256i x = yy_loadu_256(buf);
+  const __m256i xr = yy_loadu_256(buf + 1);
+  const __m256i xbl = yy_loadu_256(buf - 1 + stride);
+  const __m256i xb = yy_loadu_256(buf + stride);
+  const __m256i xbr = yy_loadu_256(buf + 1 + stride);
+
+  const __m256i fours = _mm256_add_epi32(
+      xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
+  const __m256i threes =
+      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
+
+  return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),
+                          threes);
+}
+
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+                         const int32_t *B, int buf_stride, const void *dgd8,
+                         int dgd_stride, int width, int height, int highbd) {
+  const int nb = 5;
+  const __m256i rounding =
+      round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
+      const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);
+
+      const __m128i raw =
+          xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+      const __m256i src =
+          highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+      __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+      __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
+                                    SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+
+      yy_storeu_256(dst + i * dst_stride + j, w);
+    }
+  }
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+                         const int32_t *D, int width, int height,
+                         int buf_stride, int bit_depth, int sgr_params_idx,
+                         int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int n = (2 * r + 1) * (2 * r + 1);
+  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+  // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+  const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+
+  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+  // Set up masks
+  const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+  __m256i mask[8];
+  for (int idx = 0; idx < 8; idx++) {
+    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+  }
+
+  for (int i = -1; i < height + 1; i += 2) {
+    for (int j = -1; j < width + 1; j += 8) {
+      const int32_t *Cij = C + i * buf_stride + j;
+      const int32_t *Dij = D + i * buf_stride + j;
+
+      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+      // some uninitialised data in their upper words. We use a mask to
+      // ensure that these bits are set to 0.
+      int idx = AOMMIN(8, width + 1 - j);
+      assert(idx >= 1);
+
+      if (idx < 8) {
+        sum1 = _mm256_and_si256(mask[idx], sum1);
+        sum2 = _mm256_and_si256(mask[idx], sum2);
+      }
+
+      const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+
+      const __m256i z = _mm256_min_epi32(
+          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+                            SGRPROJ_MTABLE_BITS),
+          _mm256_set1_epi32(255));
+
+      const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);
+
+      yy_storeu_256(A + i * buf_stride + j, a_res);
+
+      const __m256i a_complement =
+          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+      // sum1 might have lanes greater than 2^15, so we can't use madd to do
+      // multiplication involving sum1. However, a_complement and one_over_n
+      // are both less than 256, so we can multiply them first.
+      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
+                                              SGRPROJ_RECIP_BITS);
+
+      yy_storeu_256(B + i * buf_stride + j, b_res);
+    }
+  }
+}
+
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xtl  xt   xtr
+//  -   buf   -
+// xbl  xb   xbr
+//
+// Pixels are weighted like this:
+//  5    6    5
+//  0    0    0
+//  5    6    5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+//           = 5 * (fives + sixes) - sixes
+//           = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+  const __m256i xtl = yy_loadu_256(buf - 1 - stride);
+  const __m256i xt = yy_loadu_256(buf - stride);
+  const __m256i xtr = yy_loadu_256(buf + 1 - stride);
+  const __m256i xbl = yy_loadu_256(buf - 1 + stride);
+  const __m256i xb = yy_loadu_256(buf + stride);
+  const __m256i xbr = yy_loadu_256(buf + 1 + stride);
+
+  const __m256i fives =
+      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
+  const __m256i sixes = _mm256_add_epi32(xt, xb);
+  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
+
+  return _mm256_add_epi32(
+      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
+                       fives_plus_sixes),
+      sixes);
+}
+
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl    x   xr
+//
+// Pixels are weighted like this:
+//  5    6    5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+//           = 4 * (fives + sixes) + (fives + sixes) + sixes
+//           = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) {
+  const __m256i xl = yy_loadu_256(buf - 1);
+  const __m256i x = yy_loadu_256(buf);
+  const __m256i xr = yy_loadu_256(buf + 1);
+
+  const __m256i fives = _mm256_add_epi32(xl, xr);
+  const __m256i sixes = x;
+
+  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
+
+  return _mm256_add_epi32(
+      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
+                       fives_plus_sixes),
+      sixes);
+}
+
+// The final filter for the self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+                              const int32_t *B, int buf_stride,
+                              const void *dgd8, int dgd_stride, int width,
+                              int height, int highbd) {
+  const int nb0 = 5;
+  const int nb1 = 4;
+
+  const __m256i rounding0 =
+      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+  const __m256i rounding1 =
+      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int i = 0; i < height; ++i) {
+    if (!(i & 1)) {  // even row
+      for (int j = 0; j < width; j += 8) {
+        const __m256i a =
+            cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
+        const __m256i b =
+            cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
+
+        const __m128i raw =
+            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+        const __m256i src =
+            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+        __m256i w =
+            _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
+                              SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+        yy_storeu_256(dst + i * dst_stride + j, w);
+      }
+    } else {  // odd row
+      for (int j = 0; j < width; j += 8) {
+        const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+        const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
+
+        const __m128i raw =
+            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+        const __m256i src =
+            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+        __m256i w =
+            _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
+                              SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+        yy_storeu_256(dst + i * dst_stride + j, w);
+      }
+    }
+  }
+}
+
+void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+                                     int dgd_stride, int32_t *flt0,
+                                     int32_t *flt1, int flt_stride,
+                                     int sgr_params_idx, int bit_depth,
+                                     int highbd) {
+  // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
+  // Ctl and Dtl is 32-byte aligned.
+  const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
+
+  DECLARE_ALIGNED(32, int32_t,
+                  buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]);
+
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+  // Adjusting the stride of A and B here appears to avoid bad cache effects,
+  // leading to a significant speed improvement.
+  // We also align the stride to a multiple of 32 bytes for efficiency.
+  int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);
+
+  // The "tl" pointers point at the top-left of the initialised data for the
+  // array.
+  int32_t *Atl = buf + 0 * buf_elts + 7;
+  int32_t *Btl = buf + 1 * buf_elts + 7;
+  int32_t *Ctl = buf + 2 * buf_elts + 7;
+  int32_t *Dtl = buf + 3 * buf_elts + 7;
+
+  // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
+  // there's a zero row and column in A, B (integral images), so we move down
+  // and right one for them.
+  const int buf_diag_border =
+      SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+
+  int32_t *A0 = Atl + 1 + buf_stride;
+  int32_t *B0 = Btl + 1 + buf_stride;
+  int32_t *C0 = Ctl + 1 + buf_stride;
+  int32_t *D0 = Dtl + 1 + buf_stride;
+
+  // Finally, A, B, C, D point at position (0, 0).
+  int32_t *A = A0 + buf_diag_border;
+  int32_t *B = B0 + buf_diag_border;
+  int32_t *C = C0 + buf_diag_border;
+  int32_t *D = D0 + buf_diag_border;
+
+  const int dgd_diag_border =
+      SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+  const uint8_t *dgd0 = dgd8 - dgd_diag_border;
+
+  // Generate integral images from the input. C will contain sums of squares; D
+  // will contain just sums
+  if (highbd)
+    integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+                           height_ext, Ctl, Dtl, buf_stride);
+  else
+    integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+                    buf_stride);
+
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  // Write to flt0 and flt1
+  // If params->r == 0 we skip the corresponding filter. We only allow one of
+  // the radii to be 0, as having both equal to 0 would be equivalent to
+  // skipping SGR entirely.
+  assert(!(params->r[0] == 0 && params->r[1] == 0));
+  assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+  assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+  if (params->r[0] > 0) {
+    calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+                 sgr_params_idx, 0);
+    final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+                      width, height, highbd);
+  }
+
+  if (params->r[1] > 0) {
+    calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+            1);
+    final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+                 height, highbd);
+  }
+}
+
+void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
+                                       int height, int stride, int eps,
+                                       const int *xqd, uint8_t *dst8,
+                                       int dst_stride, int32_t *tmpbuf,
+                                       int bit_depth, int highbd) {
+  int32_t *flt0 = tmpbuf;
+  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+  assert(width * height <= RESTORATION_UNITPELS_MAX);
+  av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1,
+                                  width, eps, bit_depth, highbd);
+  const sgr_params_type *const params = &sgr_params[eps];
+  int xq[2];
+  decode_xq(xqd, xq, params);
+
+  __m256i xq0 = _mm256_set1_epi32(xq[0]);
+  __m256i xq1 = _mm256_set1_epi32(xq[1]);
+
+  for (int i = 0; i < height; ++i) {
+    // Calculate output in batches of 16 pixels
+    for (int j = 0; j < width; j += 16) {
+      const int k = i * width + j;
+      const int m = i * dst_stride + j;
+
+      const uint8_t *dat8ij = dat8 + i * stride + j;
+      __m256i ep_0, ep_1;
+      __m128i src_0, src_1;
+      if (highbd) {
+        src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+        src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
+        ep_0 = _mm256_cvtepu16_epi32(src_0);
+        ep_1 = _mm256_cvtepu16_epi32(src_1);
+      } else {
+        src_0 = xx_loadu_128(dat8ij);
+        ep_0 = _mm256_cvtepu8_epi32(src_0);
+        ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
+      }
+
+      const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
+      const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
+
+      __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+      __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+
+      if (params->r[0] > 0) {
+        const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0);
+        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
+
+        const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1);
+        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
+      }
+
+      if (params->r[1] > 0) {
+        const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0);
+        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
+
+        const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1);
+        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
+      }
+
+      const __m256i rounding =
+          round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+      const __m256i w_0 = _mm256_srai_epi32(
+          _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+      const __m256i w_1 = _mm256_srai_epi32(
+          _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+      if (highbd) {
+        // Pack into 16 bits and clamp to [0, 2^bit_depth)
+        // Note that packing into 16 bits messes up the order of the bits,
+        // so we use a permute function to correct this
+        const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
+        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+        const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
+        const __m256i res = _mm256_min_epi16(tmp2, max);
+        yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
+      } else {
+        // Pack into 8 bits and clamp to [0, 256)
+        // Note that each pack messes up the order of the bits,
+        // so we use a permute function to correct this
+        const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
+        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+        const __m256i res =
+            _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
+        const __m128i res2 =
+            _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
+        xx_storeu_128(dst8 + m, res2);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
index 9de9177c1..a42c94028 100644
--- a/third_party/aom/av1/common/x86/selfguided_sse4.c
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -1,1821 +1,643 @@
 #include <smmintrin.h>
 
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
 #include "av1/common/restoration.h"
 #include "aom_dsp/x86/synonyms.h"
 
-/* Calculate four consecutive entries of the intermediate A and B arrays
-   (corresponding to the first loop in the C version of
-   av1_selfguided_restoration)
-*/
-static void calc_block(__m128i sum, __m128i sum_sq, __m128i n,
-                       __m128i *one_over_n_, __m128i *s_, int bit_depth,
-                       int idx, int32_t *A, int32_t *B) {
-  __m128i a, b, p;
-  __m128i one_over_n = *one_over_n_;
-  __m128i s = *s_;
-#if CONFIG_HIGHBITDEPTH
-  if (bit_depth > 8) {
-    __m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1);
-    __m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1);
-    __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
-    __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
-    a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a), shift_a);
-    b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b), shift_b);
-    a = _mm_mullo_epi32(a, n);
-    b = _mm_mullo_epi32(b, b);
-    p = _mm_sub_epi32(_mm_max_epi32(a, b), b);
-  } else {
-#endif
-    (void)bit_depth;
-    a = _mm_mullo_epi32(sum_sq, n);
-    b = _mm_mullo_epi32(sum, sum);
-    p = _mm_sub_epi32(a, b);
-#if CONFIG_HIGHBITDEPTH
-  }
-#endif
-
-  __m128i rounding_z = _mm_set1_epi32((1 << SGRPROJ_MTABLE_BITS) >> 1);
-  __m128i z = _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rounding_z),
-                             SGRPROJ_MTABLE_BITS);
-  z = _mm_min_epi32(z, _mm_set1_epi32(255));
-
-  // 'Gather' type instructions are not available pre-AVX2, so synthesize a
-  // gather using scalar loads.
-  __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
-                                x_by_xplus1[_mm_extract_epi32(z, 2)],
-                                x_by_xplus1[_mm_extract_epi32(z, 1)],
-                                x_by_xplus1[_mm_extract_epi32(z, 0)]);
-
-  _mm_storeu_si128((__m128i *)&A[idx], a_res);
-
-  __m128i rounding_res = _mm_set1_epi32((1 << SGRPROJ_RECIP_BITS) >> 1);
-  __m128i a_complement = _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
-  __m128i b_int =
-      _mm_mullo_epi32(a_complement, _mm_mullo_epi32(sum, one_over_n));
-  __m128i b_res =
-      _mm_srli_epi32(_mm_add_epi32(b_int, rounding_res), SGRPROJ_RECIP_BITS);
-
-  _mm_storeu_si128((__m128i *)&B[idx], b_res);
+// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to
+// 32-bit precision and return them in an SSE register.
+static __m128i xx_load_extend_8_32(const void *p) {
+  return _mm_cvtepu8_epi32(xx_loadl_32(p));
 }
 
-static void selfguided_restoration_1_v(uint8_t *src, int width, int height,
-                                       int src_stride, int32_t *A, int32_t *B,
-                                       int buf_stride) {
-  int i, j;
-
-  // Vertical sum
-  // When the width is not a multiple of 4, we know that 'stride' is rounded up
-  // to a multiple of 4. So it is safe for this loop to calculate extra columns
-  // at the right-hand edge of the frame.
-  int width_extend = (width + 3) & ~3;
-  for (j = 0; j < width_extend; j += 4) {
-    __m128i a, b, x, y, x2, y2;
-    __m128i sum, sum_sq, tmp;
-
-    a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j]));
-    b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j]));
-
-    sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b));
-    tmp = _mm_unpacklo_epi16(a, b);
-    sum_sq = _mm_madd_epi16(tmp, tmp);
-
-    _mm_store_si128((__m128i *)&B[j], sum);
-    _mm_store_si128((__m128i *)&A[j], sum_sq);
-
-    x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[2 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
-
-    for (i = 1; i < height - 2; ++i) {
-      _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-      _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
-      x = _mm_cvtepu8_epi32(
-          xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
-      y = _mm_cvtepu8_epi32(
-          xx_loadl_32((__m128i *)&src[(i + 2) * src_stride + j]));
-
-      sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
-
-      x2 = _mm_mullo_epi32(x, x);
-      y2 = _mm_mullo_epi32(y, y);
-
-      sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
-    }
-    _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu8_epi32(
-        xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
+// Load 4 halfwords from the possibly-misaligned pointer p, extend each
+// halfword to 32-bit precision and return them in an SSE register.
+static __m128i xx_load_extend_16_32(const void *p) {
+  return _mm_cvtepu16_epi32(xx_loadl_64(p));
+}
 
-    _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
-  }
+// Compute the scan of an SSE register holding 4 32-bit integers. If the
+// register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2,
+// x0+x1+x2+x3
+static __m128i scan_32(__m128i x) {
+  const __m128i x01 = _mm_add_epi32(x, _mm_slli_si128(x, 4));
+  return _mm_add_epi32(x01, _mm_slli_si128(x01, 8));
 }
 
-static void selfguided_restoration_1_h(int32_t *A, int32_t *B, int width,
-                                       int height, int buf_stride, int eps,
-                                       int bit_depth) {
-  int i, j;
-
-  // Horizontal sum
-  int width_extend = (width + 3) & ~3;
-  for (i = 0; i < height; ++i) {
-    int h = AOMMIN(2, height - i) + AOMMIN(1, i);
-
-    __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
-    __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
-    __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
-    __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
-
-    // Note: The _mm_slli_si128 call sets up a register containing
-    // {0, A[i * buf_stride], ..., A[i * buf_stride + 2]},
-    // so that the first element of 'sum' (which should only add two values
-    // together) ends up calculated correctly.
-    __m128i sum_ = _mm_add_epi32(_mm_slli_si128(b1, 4),
-                                 _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)));
-    __m128i sum_sq_ = _mm_add_epi32(
-        _mm_slli_si128(a1, 4), _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)));
-    __m128i n = _mm_set_epi32(3 * h, 3 * h, 3 * h, 2 * h);
-    __m128i one_over_n =
-        _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[3 * h - 1],
-                      one_by_x[3 * h - 1], one_by_x[2 * h - 1]);
-    __m128i s = _mm_set_epi32(
-        sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1],
-        sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][2 * h - 1]);
-    calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A,
-               B);
-
-    n = _mm_set1_epi32(3 * h);
-    one_over_n = _mm_set1_epi32(one_by_x[3 * h - 1]);
-    s = _mm_set1_epi32(sgrproj_mtable[eps - 1][3 * h - 1]);
-
-    // Re-align a1 and b1 so that they start at index i * buf_stride + 3
-    a2 = _mm_alignr_epi8(a2, a1, 12);
-    b2 = _mm_alignr_epi8(b2, b1, 12);
-
-    // Note: When the width is not a multiple of 4, this loop may end up
-    // writing to the last 4 columns of the frame, potentially with incorrect
-    // values (especially for r=2 and r=3).
-    // This is fine, since we fix up those values in the block after this
-    // loop, and in exchange we never have more than four values to
-    // write / fix up after this loop finishes.
-    for (j = 4; j < width_extend - 4; j += 4) {
-      a1 = a2;
-      b1 = b2;
-      a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]);
-      b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]);
-      /* Loop invariant: At this point,
-         a1 = original A[i * buf_stride + j - 1 : i * buf_stride + j + 3]
-         a2 = original A[i * buf_stride + j + 3 : i * buf_stride + j + 7]
-         and similar for b1,b2 and B
-      */
-      sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
-                                             _mm_alignr_epi8(b2, b1, 8)));
-      sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
-                                                _mm_alignr_epi8(a2, a1, 8)));
-      calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth,
-                 i * buf_stride + j, A, B);
-    }
-    __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]);
-    __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]);
-
-    j = width - 4;
-    switch (width % 4) {
-      case 0:
-        a1 = a2;
-        b1 = b2;
-        a2 = a3;
-        b2 = b3;
-        break;
-      case 1:
-        a1 = _mm_alignr_epi8(a2, a1, 4);
-        b1 = _mm_alignr_epi8(b2, b1, 4);
-        a2 = _mm_alignr_epi8(a3, a2, 4);
-        b2 = _mm_alignr_epi8(b3, b2, 4);
-        break;
-      case 2:
-        a1 = _mm_alignr_epi8(a2, a1, 8);
-        b1 = _mm_alignr_epi8(b2, b1, 8);
-        a2 = _mm_alignr_epi8(a3, a2, 8);
-        b2 = _mm_alignr_epi8(b3, b2, 8);
-        break;
-      case 3:
-        a1 = _mm_alignr_epi8(a2, a1, 12);
-        b1 = _mm_alignr_epi8(b2, b1, 12);
-        a2 = _mm_alignr_epi8(a3, a2, 12);
-        b2 = _mm_alignr_epi8(b3, b2, 12);
-        break;
+// Compute two integral images from src. B sums elements; A sums their
+// squares. The images are offset by one pixel, so will have width and height
+// equal to width + 1, height + 1 and the first row and column will be zero.
+//
+// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple
+// of 4.
+static void integral_images(const uint8_t *src, int src_stride, int width,
+                            int height, int32_t *A, int32_t *B,
+                            int buf_stride) {
+  // Write out the zero top row
+  memset(A, 0, sizeof(*A) * (width + 1));
+  memset(B, 0, sizeof(*B) * (width + 1));
+
+  const __m128i zero = _mm_setzero_si128();
+  for (int i = 0; i < height; ++i) {
+    // Zero the left column.
+    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+    // ldiff is the difference H - D where H is the output sample immediately
+    // to the left and D is the output sample above it. These are scalars,
+    // replicated across the four lanes.
+    __m128i ldiff1 = zero, ldiff2 = zero;
+    for (int j = 0; j < width; j += 4) {
+      const int ABj = 1 + j;
+
+      const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);
+      const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);
+
+      const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride);
+      const __m128i x2 = _mm_madd_epi16(x1, x1);
+
+      const __m128i sc1 = scan_32(x1);
+      const __m128i sc2 = scan_32(x2);
+
+      const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+      const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
+
+      xx_store_128(B + ABj + (i + 1) * buf_stride, row1);
+      xx_store_128(A + ABj + (i + 1) * buf_stride, row2);
+
+      // Calculate the new H - D.
+      ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);
+      ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
     }
-
-    // Zero out the data loaded from "off the edge" of the array
-    __m128i zero = _mm_setzero_si128();
-    a2 = _mm_blend_epi16(a2, zero, 0xfc);
-    b2 = _mm_blend_epi16(b2, zero, 0xfc);
-
-    sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
-                                           _mm_alignr_epi8(b2, b1, 8)));
-    sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
-                                              _mm_alignr_epi8(a2, a1, 8)));
-    n = _mm_set_epi32(2 * h, 3 * h, 3 * h, 3 * h);
-    one_over_n = _mm_set_epi32(one_by_x[2 * h - 1], one_by_x[3 * h - 1],
-                               one_by_x[3 * h - 1], one_by_x[3 * h - 1]);
-    s = _mm_set_epi32(
-        sgrproj_mtable[eps - 1][2 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1],
-        sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]);
-    calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j,
-               A, B);
   }
 }
 
-static void selfguided_restoration_2_v(uint8_t *src, int width, int height,
-                                       int src_stride, int32_t *A, int32_t *B,
-                                       int buf_stride) {
-  int i, j;
-
-  // Vertical sum
-  int width_extend = (width + 3) & ~3;
-  for (j = 0; j < width_extend; j += 4) {
-    __m128i a, b, c, c2, x, y, x2, y2;
-    __m128i sum, sum_sq, tmp;
-
-    a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j]));
-    b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j]));
-    c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j]));
+// Compute two integral images from src. B sums elements; A sums their squares
+//
+// A and B should be aligned to 16 bytes. buf_stride should be a multiple of 4.
+static void integral_images_highbd(const uint16_t *src, int src_stride,
+                                   int width, int height, int32_t *A,
+                                   int32_t *B, int buf_stride) {
+  // Write out the zero top row
+  memset(A, 0, sizeof(*A) * (width + 1));
+  memset(B, 0, sizeof(*B) * (width + 1));
 
-    sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c));
-    // Important: Since c may be up to 2^8, the result on squaring may
-    // be up to 2^16. So we need to zero-extend, not sign-extend.
-    c2 = _mm_cvtepu16_epi32(_mm_mullo_epi16(c, c));
-    tmp = _mm_unpacklo_epi16(a, b);
-    sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2);
+  const __m128i zero = _mm_setzero_si128();
+  for (int i = 0; i < height; ++i) {
+    // Zero the left column.
+    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
 
-    _mm_store_si128((__m128i *)&B[j], sum);
-    _mm_store_si128((__m128i *)&A[j], sum_sq);
+    // ldiff is the difference H - D where H is the output sample immediately
+    // to the left and D is the output sample above it. These are scalars,
+    // replicated across the four lanes.
+    __m128i ldiff1 = zero, ldiff2 = zero;
+    for (int j = 0; j < width; j += 4) {
+      const int ABj = 1 + j;
 
-    x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[3 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
+      const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);
+      const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);
 
-    _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+      const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride);
+      const __m128i x2 = _mm_madd_epi16(x1, x1);
 
-    x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
+      const __m128i sc1 = scan_32(x1);
+      const __m128i sc2 = scan_32(x2);
 
-    for (i = 2; i < height - 3; ++i) {
-      _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-      _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+      const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+      const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
 
-      x = _mm_cvtepu8_epi32(
-          _mm_cvtsi32_si128(*((int *)&src[(i - 2) * src_stride + j])));
-      y = _mm_cvtepu8_epi32(
-          _mm_cvtsi32_si128(*((int *)&src[(i + 3) * src_stride + j])));
+      xx_store_128(B + ABj + (i + 1) * buf_stride, row1);
+      xx_store_128(A + ABj + (i + 1) * buf_stride, row2);
 
-      sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
-
-      x2 = _mm_mullo_epi32(x, x);
-      y2 = _mm_mullo_epi32(y, y);
-
-      sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+      // Calculate the new H - D.
+      ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);
+      ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
     }
-    _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu8_epi32(
-        xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu8_epi32(
-        xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
   }
 }
 
-static void selfguided_restoration_2_h(int32_t *A, int32_t *B, int width,
-                                       int height, int buf_stride, int eps,
-                                       int bit_depth) {
-  int i, j;
-
-  // Horizontal sum
-  int width_extend = (width + 3) & ~3;
-  for (i = 0; i < height; ++i) {
-    int h = AOMMIN(3, height - i) + AOMMIN(2, i);
-
-    __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
-    __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
-    __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
-    __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
-
-    __m128i sum_ = _mm_add_epi32(
-        _mm_add_epi32(
-            _mm_add_epi32(_mm_slli_si128(b1, 8), _mm_slli_si128(b1, 4)),
-            _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4))),
-        _mm_alignr_epi8(b2, b1, 8));
-    __m128i sum_sq_ = _mm_add_epi32(
-        _mm_add_epi32(
-            _mm_add_epi32(_mm_slli_si128(a1, 8), _mm_slli_si128(a1, 4)),
-            _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4))),
-        _mm_alignr_epi8(a2, a1, 8));
-
-    __m128i n = _mm_set_epi32(5 * h, 5 * h, 4 * h, 3 * h);
-    __m128i one_over_n =
-        _mm_set_epi32(one_by_x[5 * h - 1], one_by_x[5 * h - 1],
-                      one_by_x[4 * h - 1], one_by_x[3 * h - 1]);
-    __m128i s = _mm_set_epi32(
-        sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1],
-        sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]);
-    calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A,
-               B);
-
-    // Re-align a1 and b1 so that they start at index i * buf_stride + 2
-    a2 = _mm_alignr_epi8(a2, a1, 8);
-    b2 = _mm_alignr_epi8(b2, b1, 8);
-
-    n = _mm_set1_epi32(5 * h);
-    one_over_n = _mm_set1_epi32(one_by_x[5 * h - 1]);
-    s = _mm_set1_epi32(sgrproj_mtable[eps - 1][5 * h - 1]);
-
-    for (j = 4; j < width_extend - 4; j += 4) {
-      a1 = a2;
-      a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]);
-      b1 = b2;
-      b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]);
-      /* Loop invariant: At this point,
-         a1 = original A[i * buf_stride + j - 2 : i * buf_stride + j + 2]
-         a2 = original A[i * buf_stride + j + 2 : i * buf_stride + j + 6]
-         and similar for b1,b2 and B
-      */
-      sum_ = _mm_add_epi32(
-          _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
-                                          _mm_alignr_epi8(b2, b1, 8))),
-          _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2));
-      sum_sq_ = _mm_add_epi32(
-          _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
-                                          _mm_alignr_epi8(a2, a1, 8))),
-          _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2));
-
-      calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth,
-                 i * buf_stride + j, A, B);
-    }
-    // If the width is not a multiple of 4, we need to reset j to width - 4
-    // and adjust a1, a2, b1, b2 so that the loop invariant above is maintained
-    __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]);
-    __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]);
-
-    j = width - 4;
-    switch (width % 4) {
-      case 0:
-        a1 = a2;
-        b1 = b2;
-        a2 = a3;
-        b2 = b3;
-        break;
-      case 1:
-        a1 = _mm_alignr_epi8(a2, a1, 4);
-        b1 = _mm_alignr_epi8(b2, b1, 4);
-        a2 = _mm_alignr_epi8(a3, a2, 4);
-        b2 = _mm_alignr_epi8(b3, b2, 4);
-        break;
-      case 2:
-        a1 = _mm_alignr_epi8(a2, a1, 8);
-        b1 = _mm_alignr_epi8(b2, b1, 8);
-        a2 = _mm_alignr_epi8(a3, a2, 8);
-        b2 = _mm_alignr_epi8(b3, b2, 8);
-        break;
-      case 3:
-        a1 = _mm_alignr_epi8(a2, a1, 12);
-        b1 = _mm_alignr_epi8(b2, b1, 12);
-        a2 = _mm_alignr_epi8(a3, a2, 12);
-        b2 = _mm_alignr_epi8(b3, b2, 12);
-        break;
-    }
-
-    // Zero out the data loaded from "off the edge" of the array
-    __m128i zero = _mm_setzero_si128();
-    a2 = _mm_blend_epi16(a2, zero, 0xf0);
-    b2 = _mm_blend_epi16(b2, zero, 0xf0);
-
-    sum_ = _mm_add_epi32(
-        _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
-                                        _mm_alignr_epi8(b2, b1, 8))),
-        _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2));
-    sum_sq_ = _mm_add_epi32(
-        _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
-                                        _mm_alignr_epi8(a2, a1, 8))),
-        _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2));
-
-    n = _mm_set_epi32(3 * h, 4 * h, 5 * h, 5 * h);
-    one_over_n = _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[4 * h - 1],
-                               one_by_x[5 * h - 1], one_by_x[5 * h - 1]);
-    s = _mm_set_epi32(
-        sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1],
-        sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1]);
-    calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j,
-               A, B);
-  }
+// Compute 4 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
+static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+  const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride);
+  const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride);
+  const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride);
+  const __m128i br = xx_loadu_128(ii + (r + 0) + r * stride);
+  const __m128i u = _mm_sub_epi32(tr, tl);
+  const __m128i v = _mm_sub_epi32(br, bl);
+  return _mm_sub_epi32(v, u);
 }
 
-static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
-                                       int src_stride, int32_t *A, int32_t *B,
-                                       int buf_stride) {
-  int i, j;
-
-  // Vertical sum over 7-pixel regions, 4 columns at a time
-  int width_extend = (width + 3) & ~3;
-  for (j = 0; j < width_extend; j += 4) {
-    __m128i a, b, c, d, x, y, x2, y2;
-    __m128i sum, sum_sq, tmp, tmp2;
+static __m128i round_for_shift(unsigned shift) {
+  return _mm_set1_epi32((1 << shift) >> 1);
+}
 
-    a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j]));
-    b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j]));
-    c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j]));
-    d = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[3 * src_stride + j]));
+static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) {
+  __m128i an, bb;
+  if (bit_depth > 8) {
+    const __m128i rounding_a = round_for_shift(2 * (bit_depth - 8));
+    const __m128i rounding_b = round_for_shift(bit_depth - 8);
+    const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
+    const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
+    const __m128i a = _mm_srl_epi32(_mm_add_epi32(sum2, rounding_a), shift_a);
+    const __m128i b = _mm_srl_epi32(_mm_add_epi32(sum1, rounding_b), shift_b);
+    // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
+    // mullo to square it
+    bb = _mm_madd_epi16(b, b);
+    an = _mm_max_epi32(_mm_mullo_epi32(a, _mm_set1_epi32(n)), bb);
+  } else {
+    bb = _mm_madd_epi16(sum1, sum1);
+    an = _mm_mullo_epi32(sum2, _mm_set1_epi32(n));
+  }
+  return _mm_sub_epi32(an, bb);
+}
 
-    sum = _mm_cvtepi16_epi32(
-        _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d)));
-    tmp = _mm_unpacklo_epi16(a, b);
-    tmp2 = _mm_unpacklo_epi16(c, d);
-    sum_sq =
-        _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2));
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+                    int width, int height, int buf_stride, int bit_depth,
+                    int sgr_params_idx, int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int n = (2 * r + 1) * (2 * r + 1);
+  const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+  // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+  const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+
+  const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+  const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+  // Set up masks
+  const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+  __m128i mask[4];
+  for (int idx = 0; idx < 4; idx++) {
+    const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
+    mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+  }
 
-    _mm_store_si128((__m128i *)&B[j], sum);
-    _mm_store_si128((__m128i *)&A[j], sum_sq);
+  for (int i = -1; i < height + 1; ++i) {
+    for (int j = -1; j < width + 1; j += 4) {
+      const int32_t *Cij = C + i * buf_stride + j;
+      const int32_t *Dij = D + i * buf_stride + j;
 
-    x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
+      __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+      __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
 
-    _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+      // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+      // some uninitialised data in their upper words. We use a mask to
+      // ensure that these bits are set to 0.
+      int idx = AOMMIN(4, width + 1 - j);
+      assert(idx >= 1);
 
-    x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[5 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
+      if (idx < 4) {
+        sum1 = _mm_and_si128(mask[idx], sum1);
+        sum2 = _mm_and_si128(mask[idx], sum2);
+      }
 
-    _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq);
+      const __m128i p = compute_p(sum1, sum2, bit_depth, n);
 
-    x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[6 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
+      const __m128i z = _mm_min_epi32(
+          _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+                         SGRPROJ_MTABLE_BITS),
+          _mm_set1_epi32(255));
 
-    for (i = 3; i < height - 4; ++i) {
-      _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-      _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+      // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+      // gather using scalar loads.
+      const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
+                                          x_by_xplus1[_mm_extract_epi32(z, 2)],
+                                          x_by_xplus1[_mm_extract_epi32(z, 1)],
+                                          x_by_xplus1[_mm_extract_epi32(z, 0)]);
 
-      x = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i - 3) * src_stride + j]));
-      y = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i + 4) * src_stride + j]));
+      xx_storeu_128(A + i * buf_stride + j, a_res);
 
-      sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+      const __m128i a_complement =
+          _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
 
-      x2 = _mm_mullo_epi32(x, x);
-      y2 = _mm_mullo_epi32(y, y);
+      // sum1 might have lanes greater than 2^15, so we can't use madd to do
+      // multiplication involving sum1. However, a_complement and one_over_n
+      // are both less than 256, so we can multiply them first.
+      const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+      const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+      const __m128i b_res =
+          _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
 
-      sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+      xx_storeu_128(B + i * buf_stride + j, b_res);
     }
-    _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu8_epi32(
-        xx_loadl_32((__m128i *)&src[(i - 3) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu8_epi32(
-        xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu8_epi32(
-        xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq);
   }
 }
 
-static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width,
-                                       int height, int buf_stride, int eps,
-                                       int bit_depth) {
-  int i, j;
-  // Horizontal sum over 7-pixel regions of dst
-  int width_extend = (width + 3) & ~3;
-  for (i = 0; i < height; ++i) {
-    int h = AOMMIN(4, height - i) + AOMMIN(3, i);
-
-    __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
-    __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
-    __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
-    __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
-
-    __m128i sum_ = _mm_add_epi32(
-        _mm_add_epi32(
-            _mm_add_epi32(_mm_slli_si128(b1, 12), _mm_slli_si128(b1, 8)),
-            _mm_add_epi32(_mm_slli_si128(b1, 4), b1)),
-        _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
-                                    _mm_alignr_epi8(b2, b1, 8)),
-                      _mm_alignr_epi8(b2, b1, 12)));
-    __m128i sum_sq_ = _mm_add_epi32(
-        _mm_add_epi32(
-            _mm_add_epi32(_mm_slli_si128(a1, 12), _mm_slli_si128(a1, 8)),
-            _mm_add_epi32(_mm_slli_si128(a1, 4), a1)),
-        _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
-                                    _mm_alignr_epi8(a2, a1, 8)),
-                      _mm_alignr_epi8(a2, a1, 12)));
-
-    __m128i n = _mm_set_epi32(7 * h, 6 * h, 5 * h, 4 * h);
-    __m128i one_over_n =
-        _mm_set_epi32(one_by_x[7 * h - 1], one_by_x[6 * h - 1],
-                      one_by_x[5 * h - 1], one_by_x[4 * h - 1]);
-    __m128i s = _mm_set_epi32(
-        sgrproj_mtable[eps - 1][7 * h - 1], sgrproj_mtable[eps - 1][6 * h - 1],
-        sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1]);
-    calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A,
-               B);
-
-    // Re-align a1 and b1 so that they start at index i * buf_stride + 1
-    a2 = _mm_alignr_epi8(a2, a1, 4);
-    b2 = _mm_alignr_epi8(b2, b1, 4);
-
-    n = _mm_set1_epi32(7 * h);
-    one_over_n = _mm_set1_epi32(one_by_x[7 * h - 1]);
-    s = _mm_set1_epi32(sgrproj_mtable[eps - 1][7 * h - 1]);
-
-    for (j = 4; j < width_extend - 4; j += 4) {
-      a1 = a2;
-      a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]);
-      b1 = b2;
-      b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]);
-      __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 5]);
-      __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 5]);
-      /* Loop invariant: At this point,
-         a1 = original A[i * buf_stride + j - 3 : i * buf_stride + j + 1]
-         a2 = original A[i * buf_stride + j + 1 : i * buf_stride + j + 5]
-         a3 = original A[i * buf_stride + j + 5 : i * buf_stride + j + 9]
-         and similar for b1,b2,b3 and B
-      */
-      sum_ = _mm_add_epi32(
-          _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)),
-                        _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8),
-                                      _mm_alignr_epi8(b2, b1, 12))),
-          _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(b3, b2, 4)),
-                        _mm_alignr_epi8(b3, b2, 8)));
-      sum_sq_ = _mm_add_epi32(
-          _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)),
-                        _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8),
-                                      _mm_alignr_epi8(a2, a1, 12))),
-          _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(a3, a2, 4)),
-                        _mm_alignr_epi8(a3, a2, 8)));
-
-      calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth,
-                 i * buf_stride + j, A, B);
-    }
-    __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]);
-    __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]);
-
-    j = width - 4;
-    switch (width % 4) {
-      case 0:
-        a1 = a2;
-        b1 = b2;
-        a2 = a3;
-        b2 = b3;
-        break;
-      case 1:
-        a1 = _mm_alignr_epi8(a2, a1, 4);
-        b1 = _mm_alignr_epi8(b2, b1, 4);
-        a2 = _mm_alignr_epi8(a3, a2, 4);
-        b2 = _mm_alignr_epi8(b3, b2, 4);
-        break;
-      case 2:
-        a1 = _mm_alignr_epi8(a2, a1, 8);
-        b1 = _mm_alignr_epi8(b2, b1, 8);
-        a2 = _mm_alignr_epi8(a3, a2, 8);
-        b2 = _mm_alignr_epi8(b3, b2, 8);
-        break;
-      case 3:
-        a1 = _mm_alignr_epi8(a2, a1, 12);
-        b1 = _mm_alignr_epi8(b2, b1, 12);
-        a2 = _mm_alignr_epi8(a3, a2, 12);
-        b2 = _mm_alignr_epi8(b3, b2, 12);
-        break;
-    }
-
-    // Zero out the data loaded from "off the edge" of the array
-    __m128i zero = _mm_setzero_si128();
-    a2 = _mm_blend_epi16(a2, zero, 0xc0);
-    b2 = _mm_blend_epi16(b2, zero, 0xc0);
-
-    sum_ = _mm_add_epi32(
-        _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)),
-                      _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8),
-                                    _mm_alignr_epi8(b2, b1, 12))),
-        _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(zero, b2, 4)),
-                      _mm_alignr_epi8(zero, b2, 8)));
-    sum_sq_ = _mm_add_epi32(
-        _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)),
-                      _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8),
-                                    _mm_alignr_epi8(a2, a1, 12))),
-        _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(zero, a2, 4)),
-                      _mm_alignr_epi8(zero, a2, 8)));
-
-    n = _mm_set_epi32(4 * h, 5 * h, 6 * h, 7 * h);
-    one_over_n = _mm_set_epi32(one_by_x[4 * h - 1], one_by_x[5 * h - 1],
-                               one_by_x[6 * h - 1], one_by_x[7 * h - 1]);
-    s = _mm_set_epi32(
-        sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1],
-        sgrproj_mtable[eps - 1][6 * h - 1], sgrproj_mtable[eps - 1][7 * h - 1]);
-    calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j,
-               A, B);
-  }
+// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter
+// where the outer four corners have weight 3 and all other pixels have weight
+// 4.
+//
+// Pixels are indexed like this:
+// xtl  xt   xtr
+// xl    x   xr
+// xbl  xb   xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+//           = 4 * (fours + threes) - threes
+//           = (fours + threes) << 2 - threes
+static INLINE __m128i cross_sum(const int32_t *buf, int stride) {
+  const __m128i xtl = xx_loadu_128(buf - 1 - stride);
+  const __m128i xt = xx_loadu_128(buf - stride);
+  const __m128i xtr = xx_loadu_128(buf + 1 - stride);
+  const __m128i xl = xx_loadu_128(buf - 1);
+  const __m128i x = xx_loadu_128(buf);
+  const __m128i xr = xx_loadu_128(buf + 1);
+  const __m128i xbl = xx_loadu_128(buf - 1 + stride);
+  const __m128i xb = xx_loadu_128(buf + stride);
+  const __m128i xbr = xx_loadu_128(buf + 1 + stride);
+
+  const __m128i fours = _mm_add_epi32(
+      xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x))));
+  const __m128i threes =
+      _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
+
+  return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes);
 }
 
-void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
-                                       int dgd_stride, int32_t *dst,
-                                       int dst_stride, int r, int eps) {
-  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
-  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
-  int32_t A_[RESTORATION_PROC_UNIT_PELS];
-  int32_t B_[RESTORATION_PROC_UNIT_PELS];
-  int32_t *A = A_;
-  int32_t *B = B_;
-  int i, j;
-  // Adjusting the stride of A and B here appears to avoid bad cache effects,
-  // leading to a significant speed improvement.
-  // We also align the stride to a multiple of 16 bytes for efficiency.
-  int buf_stride = ((width_ext + 3) & ~3) + 16;
-
-  // Don't filter tiles with dimensions < 5 on any axis
-  if ((width < 5) || (height < 5)) return;
-
-  uint8_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
-  if (r == 1) {
-    selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
-                               buf_stride);
-    selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
-  } else if (r == 2) {
-    selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
-                               buf_stride);
-    selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
-  } else if (r == 3) {
-    selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
-                               buf_stride);
-    selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
-  } else {
-    assert(0);
-  }
-  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
-  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
-
-  {
-    i = 0;
-    j = 0;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
-                        A[k + buf_stride + 1];
-      const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] +
-                        B[k + buf_stride + 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-    for (j = 1; j < width - 1; ++j) {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
-                        A[k + buf_stride - 1] + A[k + buf_stride + 1];
-      const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
-                        B[k + buf_stride - 1] + B[k + buf_stride + 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-    j = width - 1;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
-                        A[k + buf_stride - 1];
-      const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] +
-                        B[k + buf_stride - 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-  }
-  for (i = 1; i < height - 1; ++i) {
-    j = 0;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
-                        A[k + 1] + A[k - buf_stride + 1] +
-                        A[k + buf_stride + 1];
-      const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
-                        B[k + 1] + B[k - buf_stride + 1] +
-                        B[k + buf_stride + 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-
-    // Vectorize the innermost loop
-    for (j = 1; j < width - 1; j += 4) {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 5;
-
-      __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]);
-      __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]);
-      __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]);
-      __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]);
-      __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]);
-      __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]);
-
-      __m128i a0 = _mm_add_epi32(
-          _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2),
-                        _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8),
-                                      _mm_alignr_epi8(tmp5, tmp4, 4))),
-          _mm_alignr_epi8(tmp1, tmp0, 4));
-      __m128i a1 = _mm_add_epi32(_mm_add_epi32(tmp0, tmp4),
-                                 _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8),
-                                               _mm_alignr_epi8(tmp5, tmp4, 8)));
-      __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1);
-
-      __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]);
-      __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]);
-      __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]);
-      __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]);
-      __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]);
-      __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]);
-
-      __m128i b0 = _mm_add_epi32(
-          _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8),
-                        _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8),
-                                      _mm_alignr_epi8(tmp11, tmp10, 4))),
-          _mm_alignr_epi8(tmp7, tmp6, 4));
-      __m128i b1 =
-          _mm_add_epi32(_mm_add_epi32(tmp6, tmp10),
-                        _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8),
-                                      _mm_alignr_epi8(tmp11, tmp10, 8)));
-      __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1);
-
-      __m128i src = _mm_cvtepu8_epi32(_mm_loadu_si128((__m128i *)&dgd[l]));
-
-      __m128i rounding = _mm_set1_epi32(
-          (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1);
-      __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b);
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+                         const int32_t *B, int buf_stride, const void *dgd8,
+                         int dgd_stride, int width, int height, int highbd) {
+  const int nb = 5;
+  const __m128i rounding =
+      round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride);
+      const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride);
+      const __m128i raw =
+          xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+      const __m128i src =
+          highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+      __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
       __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
                                  SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-      _mm_storeu_si128((__m128i *)&dst[m], w);
-    }
 
-    // Deal with any extra pixels at the right-hand edge of the frame
-    // (typically have 2 such pixels, but may have anywhere between 0 and 3)
-    for (; j < width - 1; ++j) {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 5;
-      const int32_t a =
-          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
-              4 +
-          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
-           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
-              3;
-      const int32_t b =
-          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
-              4 +
-          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
-           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
-              3;
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-
-    j = width - 1;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
-                        A[k - 1] + A[k - buf_stride - 1] +
-                        A[k + buf_stride - 1];
-      const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
-                        B[k - 1] + B[k - buf_stride - 1] +
-                        B[k + buf_stride - 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-  }
-
-  {
-    i = height - 1;
-    j = 0;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
-                        A[k - buf_stride + 1];
-      const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] +
-                        B[k - buf_stride + 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-    for (j = 1; j < width - 1; ++j) {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
-                        A[k - buf_stride - 1] + A[k - buf_stride + 1];
-      const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
-                        B[k - buf_stride - 1] + B[k - buf_stride + 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-    j = width - 1;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
-                        A[k - buf_stride - 1];
-      const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] +
-                        B[k - buf_stride - 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-  }
-}
-
-void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride,
-                                int32_t *dst, int dst_stride, int corner,
-                                int edge) {
-  int i, j;
-  const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
-
-  {
-    i = 0;
-    j = 0;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
-          corner *
-              (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
-    }
-    for (j = 1; j < width - 1; ++j) {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] = center * dgd[k] +
-               edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
-               corner * (dgd[k + stride - 1] + dgd[k + stride + 1] +
-                         dgd[k - 1] + dgd[k + 1]);
-    }
-    j = width - 1;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
-          corner *
-              (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
-    }
-  }
-  {
-    i = height - 1;
-    j = 0;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
-          corner *
-              (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
-    }
-    for (j = 1; j < width - 1; ++j) {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] = center * dgd[k] +
-               edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
-               corner * (dgd[k - stride - 1] + dgd[k - stride + 1] +
-                         dgd[k - 1] + dgd[k + 1]);
-    }
-    j = width - 1;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
-          corner *
-              (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
-    }
-  }
-  __m128i center_ = _mm_set1_epi16(center);
-  __m128i edge_ = _mm_set1_epi16(edge);
-  __m128i corner_ = _mm_set1_epi16(corner);
-  for (i = 1; i < height - 1; ++i) {
-    j = 0;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] +
-          edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
-          corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
-                    dgd[k - stride] + dgd[k + stride]);
-    }
-    // Process in units of 8 pixels at a time.
-    for (j = 1; j < width - 8; j += 8) {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-
-      __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]);
-      __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]);
-      __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]);
-
-      __m128i tl = _mm_cvtepu8_epi16(a);
-      __m128i tr = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
-      __m128i cl = _mm_cvtepu8_epi16(b);
-      __m128i cr = _mm_cvtepu8_epi16(_mm_srli_si128(b, 8));
-      __m128i bl = _mm_cvtepu8_epi16(c);
-      __m128i br = _mm_cvtepu8_epi16(_mm_srli_si128(c, 8));
-
-      __m128i x = _mm_alignr_epi8(cr, cl, 2);
-      __m128i y = _mm_add_epi16(_mm_add_epi16(_mm_alignr_epi8(tr, tl, 2), cl),
-                                _mm_add_epi16(_mm_alignr_epi8(br, bl, 2),
-                                              _mm_alignr_epi8(cr, cl, 4)));
-      __m128i z = _mm_add_epi16(_mm_add_epi16(tl, bl),
-                                _mm_add_epi16(_mm_alignr_epi8(tr, tl, 4),
-                                              _mm_alignr_epi8(br, bl, 4)));
-
-      __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_),
-                                  _mm_add_epi16(_mm_mullo_epi16(y, edge_),
-                                                _mm_mullo_epi16(z, corner_)));
-
-      _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res));
-      _mm_storeu_si128((__m128i *)&dst[l + 4],
-                       _mm_cvtepi16_epi32(_mm_srli_si128(res, 8)));
-    }
-    // If there are enough pixels left in this row, do another batch of 4
-    // pixels.
-    for (; j < width - 4; j += 4) {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-
-      __m128i a = _mm_loadl_epi64((__m128i *)&dgd[k - stride - 1]);
-      __m128i b = _mm_loadl_epi64((__m128i *)&dgd[k - 1]);
-      __m128i c = _mm_loadl_epi64((__m128i *)&dgd[k + stride - 1]);
-
-      __m128i tl = _mm_cvtepu8_epi16(a);
-      __m128i cl = _mm_cvtepu8_epi16(b);
-      __m128i bl = _mm_cvtepu8_epi16(c);
-
-      __m128i x = _mm_srli_si128(cl, 2);
-      __m128i y = _mm_add_epi16(
-          _mm_add_epi16(_mm_srli_si128(tl, 2), cl),
-          _mm_add_epi16(_mm_srli_si128(bl, 2), _mm_srli_si128(cl, 4)));
-      __m128i z = _mm_add_epi16(
-          _mm_add_epi16(tl, bl),
-          _mm_add_epi16(_mm_srli_si128(tl, 4), _mm_srli_si128(bl, 4)));
-
-      __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_),
-                                  _mm_add_epi16(_mm_mullo_epi16(y, edge_),
-                                                _mm_mullo_epi16(z, corner_)));
-
-      _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res));
-    }
-    // Handle any leftover pixels
-    for (; j < width - 1; ++j) {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] +
-          edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
-          corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
-                    dgd[k - stride + 1] + dgd[k + stride + 1]);
-    }
-    j = width - 1;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] +
-          edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
-          corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
-                    dgd[k - stride] + dgd[k + stride]);
+      xx_storeu_128(dst + i * dst_stride + j, w);
     }
   }
 }
 
-void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
-                                         int stride, int eps, int *xqd,
-                                         uint8_t *dst, int dst_stride,
-                                         int32_t *tmpbuf) {
-  int xq[2];
-  int32_t *flt1 = tmpbuf;
-  int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
-  int i, j;
-  assert(width * height <= RESTORATION_TILEPELS_MAX);
-#if USE_HIGHPASS_IN_SGRPROJ
-  av1_highpass_filter_sse4_1(dat, width, height, stride, flt1, width,
-                             sgr_params[eps].corner, sgr_params[eps].edge);
-#else
-    av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
-                                      sgr_params[eps].r1, sgr_params[eps].e1);
-#endif  // USE_HIGHPASS_IN_SGRPROJ
-  av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
-                                    sgr_params[eps].r2, sgr_params[eps].e2);
-  decode_xq(xqd, xq);
-
-  __m128i xq0 = _mm_set1_epi32(xq[0]);
-  __m128i xq1 = _mm_set1_epi32(xq[1]);
-  for (i = 0; i < height; ++i) {
-    // Calculate output in batches of 8 pixels
-    for (j = 0; j < width; j += 8) {
-      const int k = i * width + j;
-      const int l = i * stride + j;
-      const int m = i * dst_stride + j;
-      __m128i src =
-          _mm_slli_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&dat[l])),
-                         SGRPROJ_RST_BITS);
-
-      const __m128i u_0 = _mm_cvtepu16_epi32(src);
-      const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8));
-
-      const __m128i f1_0 =
-          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0);
-      const __m128i f2_0 =
-          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0);
-      const __m128i f1_1 =
-          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1);
-      const __m128i f2_1 =
-          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1);
-
-      const __m128i v_0 = _mm_add_epi32(
-          _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)),
-          _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS));
-      const __m128i v_1 = _mm_add_epi32(
-          _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)),
-          _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS));
-
-      const __m128i rounding =
-          _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1);
-      const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
-                                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
-      const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
-                                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
-
-      const __m128i tmp = _mm_packs_epi32(w_0, w_1);
-      const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
-      _mm_storel_epi64((__m128i *)&dst[m], res);
-    }
-    // Process leftover pixels
-    for (; j < width; ++j) {
-      const int k = i * width + j;
-      const int l = i * stride + j;
-      const int m = i * dst_stride + j;
-      const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
-      const int32_t f1 = (int32_t)flt1[k] - u;
-      const int32_t f2 = (int32_t)flt2[k] - u;
-      const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
-      const int16_t w =
-          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
-      dst[m] = (uint16_t)clip_pixel(w);
-    }
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+                         const int32_t *D, int width, int height,
+                         int buf_stride, int bit_depth, int sgr_params_idx,
+                         int radius_idx) {
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int n = (2 * r + 1) * (2 * r + 1);
+  const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+  // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+  const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+
+  const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+  const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+  // Set up masks
+  const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+  __m128i mask[4];
+  for (int idx = 0; idx < 4; idx++) {
+    const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
+    mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
   }
-}
-
-#if CONFIG_HIGHBITDEPTH
-// Only the vertical sums need to be adjusted for highbitdepth
 
-static void highbd_selfguided_restoration_1_v(uint16_t *src, int width,
-                                              int height, int src_stride,
-                                              int32_t *A, int32_t *B,
-                                              int buf_stride) {
-  int i, j;
+  for (int i = -1; i < height + 1; i += 2) {
+    for (int j = -1; j < width + 1; j += 4) {
+      const int32_t *Cij = C + i * buf_stride + j;
+      const int32_t *Dij = D + i * buf_stride + j;
 
-  int width_extend = (width + 3) & ~3;
-  for (j = 0; j < width_extend; j += 4) {
-    __m128i a, b, x, y, x2, y2;
-    __m128i sum, sum_sq, tmp;
+      __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+      __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
 
-    a = _mm_loadl_epi64((__m128i *)&src[j]);
-    b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
+      // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+      // some uninitialised data in their upper words. We use a mask to
+      // ensure that these bits are set to 0.
+      int idx = AOMMIN(4, width + 1 - j);
+      assert(idx >= 1);
 
-    sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b));
-    tmp = _mm_unpacklo_epi16(a, b);
-    sum_sq = _mm_madd_epi16(tmp, tmp);
+      if (idx < 4) {
+        sum1 = _mm_and_si128(mask[idx], sum1);
+        sum2 = _mm_and_si128(mask[idx], sum2);
+      }
 
-    _mm_store_si128((__m128i *)&B[j], sum);
-    _mm_store_si128((__m128i *)&A[j], sum_sq);
+      const __m128i p = compute_p(sum1, sum2, bit_depth, n);
 
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
+      const __m128i z = _mm_min_epi32(
+          _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+                         SGRPROJ_MTABLE_BITS),
+          _mm_set1_epi32(255));
 
-    for (i = 1; i < height - 2; ++i) {
-      _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-      _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+      // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+      // gather using scalar loads.
+      const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
+                                          x_by_xplus1[_mm_extract_epi32(z, 2)],
+                                          x_by_xplus1[_mm_extract_epi32(z, 1)],
+                                          x_by_xplus1[_mm_extract_epi32(z, 0)]);
 
-      x = _mm_cvtepu16_epi32(
-          _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
-      y = _mm_cvtepu16_epi32(
-          _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j]));
+      xx_storeu_128(A + i * buf_stride + j, a_res);
 
-      sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+      const __m128i a_complement =
+          _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
 
-      x2 = _mm_mullo_epi32(x, x);
-      y2 = _mm_mullo_epi32(y, y);
+      // sum1 might have lanes greater than 2^15, so we can't use madd to do
+      // multiplication involving sum1. However, a_complement and one_over_n
+      // are both less than 256, so we can multiply them first.
+      const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+      const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+      const __m128i b_res =
+          _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
 
-      sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+      xx_storeu_128(B + i * buf_stride + j, b_res);
     }
-    _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
   }
 }
 
-static void highbd_selfguided_restoration_2_v(uint16_t *src, int width,
-                                              int height, int src_stride,
-                                              int32_t *A, int32_t *B,
-                                              int buf_stride) {
-  int i, j;
-
-  int width_extend = (width + 3) & ~3;
-  for (j = 0; j < width_extend; j += 4) {
-    __m128i a, b, c, c2, x, y, x2, y2;
-    __m128i sum, sum_sq, tmp;
-
-    a = _mm_loadl_epi64((__m128i *)&src[j]);
-    b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
-    c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]);
-
-    sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c));
-    // Important: We need to widen *before* squaring here, since
-    // c^2 may be up to 2^24.
-    c = _mm_cvtepu16_epi32(c);
-    c2 = _mm_mullo_epi32(c, c);
-    tmp = _mm_unpacklo_epi16(a, b);
-    sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2);
-
-    _mm_store_si128((__m128i *)&B[j], sum);
-    _mm_store_si128((__m128i *)&A[j], sum_sq);
-
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
-
-    for (i = 2; i < height - 3; ++i) {
-      _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-      _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
-      x = _mm_cvtepu16_epi32(
-          _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
-      y = _mm_cvtepu16_epi32(
-          _mm_loadl_epi64((__m128i *)&src[(i + 3) * src_stride + j]));
-
-      sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
-
-      x2 = _mm_mullo_epi32(x, x);
-      y2 = _mm_mullo_epi32(y, y);
-
-      sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
-    }
-    _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
-  }
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xtl  xt   xtr
+//  -   buf   -
+// xbl  xb   xbr
+//
+// Pixels are weighted like this:
+//  5    6    5
+//  0    0    0
+//  5    6    5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+//           = 5 * (fives + sixes) - sixes
+//           = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+  const __m128i xtl = xx_loadu_128(buf - 1 - stride);
+  const __m128i xt = xx_loadu_128(buf - stride);
+  const __m128i xtr = xx_loadu_128(buf + 1 - stride);
+  const __m128i xbl = xx_loadu_128(buf - 1 + stride);
+  const __m128i xb = xx_loadu_128(buf + stride);
+  const __m128i xbr = xx_loadu_128(buf + 1 + stride);
+
+  const __m128i fives =
+      _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
+  const __m128i sixes = _mm_add_epi32(xt, xb);
+  const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes);
+
+  return _mm_add_epi32(
+      _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes),
+      sixes);
+}
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl    x   xr
+//
+// Pixels are weighted like this:
+//  5    6    5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+//           = 4 * (fives + sixes) + (fives + sixes) + sixes
+//           = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) {
+  const __m128i xl = xx_loadu_128(buf - 1);
+  const __m128i x = xx_loadu_128(buf);
+  const __m128i xr = xx_loadu_128(buf + 1);
+
+  const __m128i fives = _mm_add_epi32(xl, xr);
+  const __m128i sixes = x;
+
+  const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes);
+
+  return _mm_add_epi32(
+      _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes),
+      sixes);
 }
 
-static void highbd_selfguided_restoration_3_v(uint16_t *src, int width,
-                                              int height, int src_stride,
-                                              int32_t *A, int32_t *B,
-                                              int buf_stride) {
-  int i, j;
-
-  int width_extend = (width + 3) & ~3;
-  for (j = 0; j < width_extend; j += 4) {
-    __m128i a, b, c, d, x, y, x2, y2;
-    __m128i sum, sum_sq, tmp, tmp2;
-
-    a = _mm_loadl_epi64((__m128i *)&src[j]);
-    b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
-    c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]);
-    d = _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]);
-
-    sum = _mm_cvtepi16_epi32(
-        _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d)));
-    tmp = _mm_unpacklo_epi16(a, b);
-    tmp2 = _mm_unpacklo_epi16(c, d);
-    sum_sq =
-        _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2));
-
-    _mm_store_si128((__m128i *)&B[j], sum);
-    _mm_store_si128((__m128i *)&A[j], sum_sq);
-
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[5 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[6 * src_stride + j]));
-    sum = _mm_add_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_add_epi32(sum_sq, x2);
-
-    for (i = 3; i < height - 4; ++i) {
-      _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-      _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
-      x = _mm_cvtepu16_epi32(
-          _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
-      y = _mm_cvtepu16_epi32(
-          _mm_loadl_epi64((__m128i *)&src[(i + 4) * src_stride + j]));
-
-      sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
-
-      x2 = _mm_mullo_epi32(x, x);
-      y2 = _mm_mullo_epi32(y, y);
-
-      sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+// The final filter for the self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+                              const int32_t *B, int buf_stride,
+                              const void *dgd8, int dgd_stride, int width,
+                              int height, int highbd) {
+  const int nb0 = 5;
+  const int nb1 = 4;
+
+  const __m128i rounding0 =
+      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+  const __m128i rounding1 =
+      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int i = 0; i < height; ++i) {
+    if (!(i & 1)) {  // even row
+      for (int j = 0; j < width; j += 4) {
+        const __m128i a =
+            cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
+        const __m128i b =
+            cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
+        const __m128i raw =
+            xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+        const __m128i src =
+            highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+        __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+        __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0),
+                                   SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+        xx_storeu_128(dst + i * dst_stride + j, w);
+      }
+    } else {  // odd row
+      for (int j = 0; j < width; j += 4) {
+        const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+        const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
+        const __m128i raw =
+            xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+        const __m128i src =
+            highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+        __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+        __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
+                                   SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+        xx_storeu_128(dst + i * dst_stride + j, w);
+      }
     }
-    _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
-
-    x = _mm_cvtepu16_epi32(
-        _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
-    sum = _mm_sub_epi32(sum, x);
-    x2 = _mm_mullo_epi32(x, x);
-    sum_sq = _mm_sub_epi32(sum_sq, x2);
-
-    _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum);
-    _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq);
   }
 }
 
-void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
-                                              int height, int dgd_stride,
-                                              int32_t *dst, int dst_stride,
-                                              int bit_depth, int r, int eps) {
+void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
+                                       int height, int dgd_stride,
+                                       int32_t *flt0, int32_t *flt1,
+                                       int flt_stride, int sgr_params_idx,
+                                       int bit_depth, int highbd) {
+  DECLARE_ALIGNED(16, int32_t, buf[4 * RESTORATION_PROC_UNIT_PELS]);
+  memset(buf, 0, sizeof(buf));
+
   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
-  int32_t A_[RESTORATION_PROC_UNIT_PELS];
-  int32_t B_[RESTORATION_PROC_UNIT_PELS];
-  int32_t *A = A_;
-  int32_t *B = B_;
-  int i, j;
+
   // Adjusting the stride of A and B here appears to avoid bad cache effects,
   // leading to a significant speed improvement.
   // We also align the stride to a multiple of 16 bytes for efficiency.
   int buf_stride = ((width_ext + 3) & ~3) + 16;
 
-  // Don't filter tiles with dimensions < 5 on any axis
-  if ((width < 5) || (height < 5)) return;
-
-  uint16_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
-  if (r == 1) {
-    highbd_selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride,
-                                      A, B, buf_stride);
-    selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps,
-                               bit_depth);
-  } else if (r == 2) {
-    highbd_selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride,
-                                      A, B, buf_stride);
-    selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps,
-                               bit_depth);
-  } else if (r == 3) {
-    highbd_selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride,
-                                      A, B, buf_stride);
-    selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps,
-                               bit_depth);
-  } else {
-    assert(0);
-  }
-  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
-  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
-
-  {
-    i = 0;
-    j = 0;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
-                        A[k + buf_stride + 1];
-      const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] +
-                        B[k + buf_stride + 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-    for (j = 1; j < width - 1; ++j) {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
-                        A[k + buf_stride - 1] + A[k + buf_stride + 1];
-      const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
-                        B[k + buf_stride - 1] + B[k + buf_stride + 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-    j = width - 1;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
-                        A[k + buf_stride - 1];
-      const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] +
-                        B[k + buf_stride - 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-  }
-  for (i = 1; i < height - 1; ++i) {
-    j = 0;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
-                        A[k + 1] + A[k - buf_stride + 1] +
-                        A[k + buf_stride + 1];
-      const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
-                        B[k + 1] + B[k - buf_stride + 1] +
-                        B[k + buf_stride + 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-
-    // Vectorize the innermost loop
-    for (j = 1; j < width - 1; j += 4) {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 5;
-
-      __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]);
-      __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]);
-      __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]);
-      __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]);
-      __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]);
-      __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]);
-
-      __m128i a0 = _mm_add_epi32(
-          _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2),
-                        _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8),
-                                      _mm_alignr_epi8(tmp5, tmp4, 4))),
-          _mm_alignr_epi8(tmp1, tmp0, 4));
-      __m128i a1 = _mm_add_epi32(_mm_add_epi32(tmp0, tmp4),
-                                 _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8),
-                                               _mm_alignr_epi8(tmp5, tmp4, 8)));
-      __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1);
-
-      __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]);
-      __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]);
-      __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]);
-      __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]);
-      __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]);
-      __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]);
-
-      __m128i b0 = _mm_add_epi32(
-          _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8),
-                        _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8),
-                                      _mm_alignr_epi8(tmp11, tmp10, 4))),
-          _mm_alignr_epi8(tmp7, tmp6, 4));
-      __m128i b1 =
-          _mm_add_epi32(_mm_add_epi32(tmp6, tmp10),
-                        _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8),
-                                      _mm_alignr_epi8(tmp11, tmp10, 8)));
-      __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1);
-
-      __m128i src = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)&dgd[l]));
-
-      __m128i rounding = _mm_set1_epi32(
-          (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1);
-      __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b);
-      __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
-                                 SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-      _mm_storeu_si128((__m128i *)&dst[m], w);
-    }
-
-    // Deal with any extra pixels at the right-hand edge of the frame
-    // (typically have 2 such pixels, but may have anywhere between 0 and 3)
-    for (; j < width - 1; ++j) {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 5;
-      const int32_t a =
-          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
-              4 +
-          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
-           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
-              3;
-      const int32_t b =
-          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
-              4 +
-          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
-           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
-              3;
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-
-    j = width - 1;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
-                        A[k - 1] + A[k - buf_stride - 1] +
-                        A[k + buf_stride - 1];
-      const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
-                        B[k - 1] + B[k - buf_stride - 1] +
-                        B[k + buf_stride - 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
+  // The "tl" pointers point at the top-left of the initialised data for the
+  // array. Adding 3 here ensures that column 1 is 16-byte aligned.
+  int32_t *Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3;
+
+  // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
+  // there's a zero row and column in A, B (integral images), so we move down
+  // and right one for them.
+  const int buf_diag_border =
+      SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+
+  int32_t *A0 = Atl + 1 + buf_stride;
+  int32_t *B0 = Btl + 1 + buf_stride;
+  int32_t *C0 = Ctl + 1 + buf_stride;
+  int32_t *D0 = Dtl + 1 + buf_stride;
+
+  // Finally, A, B, C, D point at position (0, 0).
+  int32_t *A = A0 + buf_diag_border;
+  int32_t *B = B0 + buf_diag_border;
+  int32_t *C = C0 + buf_diag_border;
+  int32_t *D = D0 + buf_diag_border;
+
+  const int dgd_diag_border =
+      SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+  const uint8_t *dgd0 = dgd8 - dgd_diag_border;
+
+  // Generate integral images from the input. C will contain sums of squares; D
+  // will contain just sums
+  if (highbd)
+    integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+                           height_ext, Ctl, Dtl, buf_stride);
+  else
+    integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+                    buf_stride);
+
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  // Write to flt0 and flt1
+  // If params->r == 0 we skip the corresponding filter. We only allow one of
+  // the radii to be 0, as having both equal to 0 would be equivalent to
+  // skipping SGR entirely.
+  assert(!(params->r[0] == 0 && params->r[1] == 0));
+  assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+  assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+  if (params->r[0] > 0) {
+    calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+                 sgr_params_idx, 0);
+    final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+                      width, height, highbd);
   }
 
-  {
-    i = height - 1;
-    j = 0;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
-                        A[k - buf_stride + 1];
-      const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] +
-                        B[k - buf_stride + 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-    for (j = 1; j < width - 1; ++j) {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
-                        A[k - buf_stride - 1] + A[k - buf_stride + 1];
-      const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
-                        B[k - buf_stride - 1] + B[k - buf_stride + 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
-    j = width - 1;
-    {
-      const int k = i * buf_stride + j;
-      const int l = i * dgd_stride + j;
-      const int m = i * dst_stride + j;
-      const int nb = 3;
-      const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
-                        A[k - buf_stride - 1];
-      const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] +
-                        B[k - buf_stride - 1];
-      const int32_t v = a * dgd[l] + b;
-      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-    }
+  if (params->r[1] > 0) {
+    calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+            1);
+    final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+                 height, highbd);
   }
 }
 
-void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height,
-                                       int stride, int32_t *dst, int dst_stride,
-                                       int corner, int edge) {
-  int i, j;
-  const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
-
-  {
-    i = 0;
-    j = 0;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
-          corner *
-              (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
-    }
-    for (j = 1; j < width - 1; ++j) {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] = center * dgd[k] +
-               edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
-               corner * (dgd[k + stride - 1] + dgd[k + stride + 1] +
-                         dgd[k - 1] + dgd[k + 1]);
-    }
-    j = width - 1;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
-          corner *
-              (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
-    }
-  }
-  __m128i center_ = _mm_set1_epi32(center);
-  __m128i edge_ = _mm_set1_epi32(edge);
-  __m128i corner_ = _mm_set1_epi32(corner);
-  for (i = 1; i < height - 1; ++i) {
-    j = 0;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] +
-          edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
-          corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
-                    dgd[k - stride] + dgd[k + stride]);
-    }
-    // Process 4 pixels at a time
-    for (j = 1; j < width - 4; j += 4) {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-
-      __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]);
-      __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]);
-      __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]);
-
-      __m128i tl = _mm_cvtepu16_epi32(a);
-      __m128i tr = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
-      __m128i cl = _mm_cvtepu16_epi32(b);
-      __m128i cr = _mm_cvtepu16_epi32(_mm_srli_si128(b, 8));
-      __m128i bl = _mm_cvtepu16_epi32(c);
-      __m128i br = _mm_cvtepu16_epi32(_mm_srli_si128(c, 8));
-
-      __m128i x = _mm_alignr_epi8(cr, cl, 4);
-      __m128i y = _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tr, tl, 4), cl),
-                                _mm_add_epi32(_mm_alignr_epi8(br, bl, 4),
-                                              _mm_alignr_epi8(cr, cl, 8)));
-      __m128i z = _mm_add_epi32(_mm_add_epi32(tl, bl),
-                                _mm_add_epi32(_mm_alignr_epi8(tr, tl, 8),
-                                              _mm_alignr_epi8(br, bl, 8)));
-
-      __m128i res = _mm_add_epi32(_mm_mullo_epi32(x, center_),
-                                  _mm_add_epi32(_mm_mullo_epi32(y, edge_),
-                                                _mm_mullo_epi32(z, corner_)));
-
-      _mm_storeu_si128((__m128i *)&dst[l], res);
-    }
-    // Handle any leftover pixels
-    for (; j < width - 1; ++j) {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] +
-          edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
-          corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
-                    dgd[k - stride + 1] + dgd[k + stride + 1]);
-    }
-    j = width - 1;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] +
-          edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
-          corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
-                    dgd[k - stride] + dgd[k + stride]);
-    }
-  }
-  {
-    i = height - 1;
-    j = 0;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
-          corner *
-              (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
-    }
-    for (j = 1; j < width - 1; ++j) {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] = center * dgd[k] +
-               edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
-               corner * (dgd[k - stride - 1] + dgd[k - stride + 1] +
-                         dgd[k - 1] + dgd[k + 1]);
-    }
-    j = width - 1;
-    {
-      const int k = i * stride + j;
-      const int l = i * dst_stride + j;
-      dst[l] =
-          center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
-          corner *
-              (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
-    }
-  }
-}
-
-void apply_selfguided_restoration_highbd_sse4_1(
-    uint16_t *dat, int width, int height, int stride, int bit_depth, int eps,
-    int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf) {
+void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
+                                         int height, int stride, int eps,
+                                         const int *xqd, uint8_t *dst8,
+                                         int dst_stride, int32_t *tmpbuf,
+                                         int bit_depth, int highbd) {
+  int32_t *flt0 = tmpbuf;
+  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+  assert(width * height <= RESTORATION_UNITPELS_MAX);
+  av1_selfguided_restoration_sse4_1(dat8, width, height, stride, flt0, flt1,
+                                    width, eps, bit_depth, highbd);
+  const sgr_params_type *const params = &sgr_params[eps];
   int xq[2];
-  int32_t *flt1 = tmpbuf;
-  int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
-  int i, j;
-  assert(width * height <= RESTORATION_TILEPELS_MAX);
-#if USE_HIGHPASS_IN_SGRPROJ
-  av1_highpass_filter_highbd_sse4_1(dat, width, height, stride, flt1, width,
-                                    sgr_params[eps].corner,
-                                    sgr_params[eps].edge);
-#else
-  av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1,
-                                           width, bit_depth, sgr_params[eps].r1,
-                                           sgr_params[eps].e1);
-#endif  // USE_HIGHPASS_IN_SGRPROJ
-  av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2,
-                                           width, bit_depth, sgr_params[eps].r2,
-                                           sgr_params[eps].e2);
-  decode_xq(xqd, xq);
+  decode_xq(xqd, xq, params);
 
   __m128i xq0 = _mm_set1_epi32(xq[0]);
   __m128i xq1 = _mm_set1_epi32(xq[1]);
-  for (i = 0; i < height; ++i) {
+
+  for (int i = 0; i < height; ++i) {
     // Calculate output in batches of 8 pixels
-    for (j = 0; j < width; j += 8) {
+    for (int j = 0; j < width; j += 8) {
       const int k = i * width + j;
-      const int l = i * stride + j;
       const int m = i * dst_stride + j;
-      __m128i src =
-          _mm_slli_epi16(_mm_load_si128((__m128i *)&dat[l]), SGRPROJ_RST_BITS);
-
-      const __m128i u_0 = _mm_cvtepu16_epi32(src);
-      const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8));
-
-      const __m128i f1_0 =
-          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0);
-      const __m128i f2_0 =
-          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0);
-      const __m128i f1_1 =
-          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1);
-      const __m128i f2_1 =
-          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1);
-
-      const __m128i v_0 = _mm_add_epi32(
-          _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)),
-          _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS));
-      const __m128i v_1 = _mm_add_epi32(
-          _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)),
-          _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS));
+
+      const uint8_t *dat8ij = dat8 + i * stride + j;
+      __m128i src;
+      if (highbd) {
+        src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+      } else {
+        src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij));
+      }
+
+      const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS);
+      const __m128i u_0 = _mm_cvtepu16_epi32(u);
+      const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8));
+
+      __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+      __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+
+      if (params->r[0] > 0) {
+        const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0);
+        v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0));
+
+        const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1);
+        v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1));
+      }
+
+      if (params->r[1] > 0) {
+        const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0);
+        v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0));
+
+        const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1);
+        v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1));
+      }
 
       const __m128i rounding =
-          _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1);
+          round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
       const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
                                          SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
       const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
                                          SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
 
-      // Pack into 16 bits and clamp to [0, 2^bit_depth)
-      const __m128i tmp = _mm_packus_epi32(w_0, w_1);
-      const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
-      const __m128i res = _mm_min_epi16(tmp, max);
-
-      _mm_store_si128((__m128i *)&dst[m], res);
-    }
-    // Process leftover pixels
-    for (; j < width; ++j) {
-      const int k = i * width + j;
-      const int l = i * stride + j;
-      const int m = i * dst_stride + j;
-      const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
-      const int32_t f1 = (int32_t)flt1[k] - u;
-      const int32_t f2 = (int32_t)flt2[k] - u;
-      const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
-      const int16_t w =
-          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
-      dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
+      if (highbd) {
+        // Pack into 16 bits and clamp to [0, 2^bit_depth)
+        const __m128i tmp = _mm_packus_epi32(w_0, w_1);
+        const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+        const __m128i res = _mm_min_epi16(tmp, max);
+        xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res);
+      } else {
+        // Pack into 8 bits and clamp to [0, 256)
+        const __m128i tmp = _mm_packs_epi32(w_0, w_1);
+        const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
+        xx_storel_64(dst8 + m, res);
+      }
     }
   }
 }
-
-#endif
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c
deleted file mode 100644
index d30466ae6..000000000
--- a/third_party/aom/av1/common/x86/warp_plane_sse2.c
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "./av1_rtcd.h"
-#include "av1/common/warped_motion.h"
-
-void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
-                          int height, int stride, uint8_t *pred, int p_col,
-                          int p_row, int p_width, int p_height, int p_stride,
-                          int subsampling_x, int subsampling_y,
-                          ConvolveParams *conv_params, int16_t alpha,
-                          int16_t beta, int16_t gamma, int16_t delta) {
-  int comp_avg = conv_params->do_average;
-  __m128i tmp[15];
-  int i, j, k;
-  const int bd = 8;
-#if CONFIG_CONVOLVE_ROUND
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
-  const int reduce_bits_horiz =
-      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
-  const int offset_bits_horiz =
-      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
-  if (use_conv_params) {
-    conv_params->do_post_rounding = 1;
-  }
-  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
-  const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
-  const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-#endif
-
-  /* Note: For this code to work, the left/right frame borders need to be
-     extended by at least 13 pixels each. By the time we get here, other
-     code will have set up this border, but we allow an explicit check
-     for debugging purposes.
-  */
-  /*for (i = 0; i < height; ++i) {
-    for (j = 0; j < 13; ++j) {
-      assert(ref[i * stride - 13 + j] == ref[i * stride]);
-      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
-    }
-  }*/
-
-  for (i = 0; i < p_height; i += 8) {
-    for (j = 0; j < p_width; j += 8) {
-      const int32_t src_x = (p_col + j + 4) << subsampling_x;
-      const int32_t src_y = (p_row + i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
-      // Add in all the constant terms, including rounding and offset
-      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-
-      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-
-      // Horizontal filter
-      // If the block is aligned such that, after clamping, every sample
-      // would be taken from the leftmost/rightmost column, then we can
-      // skip the expensive horizontal filter.
-      if (ix4 <= -7) {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
-              ref[iy * stride] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
-        }
-      } else if (ix4 >= width + 6) {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
-              ref[iy * stride + (width - 1)] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
-        }
-      } else {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int sx = sx4 + beta * (k + 4);
-
-          // Load source pixels
-          const __m128i zero = _mm_setzero_si128();
-          const __m128i src =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-
-          // Filter even-index pixels
-          const __m128i tmp_0 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_2 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_4 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_6 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
-
-          // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
-          const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
-          // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
-          const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
-          // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
-          const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
-          // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
-          const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
-          // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
-          const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
-          // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
-          const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
-          // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
-          const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
-          // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
-          const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
-          const __m128i round_const = _mm_set1_epi32(
-              (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
-
-          // Calculate filtered results
-          const __m128i src_0 = _mm_unpacklo_epi8(src, zero);
-          const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
-          const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
-          const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
-          const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
-          const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
-          const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
-          const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
-          __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                           _mm_add_epi32(res_2, res_6));
-          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
-                                   _mm_cvtsi32_si128(reduce_bits_horiz));
-
-          // Filter odd-index pixels
-          const __m128i tmp_1 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_3 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_5 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
-          const __m128i tmp_7 = _mm_loadu_si128(
-              (__m128i *)(warped_filter +
-                          ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
-
-          const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
-          const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
-          const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
-          const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
-          const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
-          const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
-          const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
-          const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
-          const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
-          const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
-          const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
-          const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
-          const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
-          const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
-          const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
-          const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
-          __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                          _mm_add_epi32(res_3, res_7));
-          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
-                                  _mm_cvtsi32_si128(reduce_bits_horiz));
-
-          // Combine results into one register.
-          // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
-          // as this order helps with the vertical filter.
-          tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
-        }
-      }
-
-      // Vertical filter
-      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4);
-
-        // Load from tmp and rearrange pairs of consecutive rows into the
-        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
-        const __m128i *src = tmp + (k + 4);
-        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
-        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
-        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
-        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
-
-        // Filter even-index pixels
-        const __m128i tmp_0 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_2 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_4 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_6 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
-        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
-        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
-        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
-        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
-        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
-        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
-        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
-        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
-        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
-        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
-        const __m128i tmp_1 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_3 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_5 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-        const __m128i tmp_7 = _mm_loadu_si128(
-            (__m128i *)(warped_filter +
-                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
-        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
-        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
-        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
-        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
-        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
-        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
-        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-#if CONFIG_CONVOLVE_ROUND
-        if (use_conv_params) {
-          __m128i *const p =
-              (__m128i *)&conv_params
-                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
-          const __m128i round_const = _mm_set1_epi32(
-              -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) +
-              ((1 << (conv_params->round_1)) >> 1));
-          res_lo = _mm_add_epi32(res_lo, round_const);
-          res_lo =
-              _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1));
-          if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo);
-          _mm_storeu_si128(p, res_lo);
-          if (p_width > 4) {
-            res_hi = _mm_add_epi32(res_hi, round_const);
-            res_hi =
-                _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1));
-            if (comp_avg)
-              res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi);
-            _mm_storeu_si128(p + 1, res_hi);
-          }
-        } else {
-#else
-        {
-#endif
-          // Round and pack into 8 bits
-          const __m128i round_const =
-              _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
-                             ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
-
-          const __m128i res_lo_round = _mm_srai_epi32(
-              _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
-          const __m128i res_hi_round = _mm_srai_epi32(
-              _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
-
-          const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-          __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
-          // Store, blending with 'pred' if needed
-          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-
-          // Note: If we're outputting a 4x4 block, we need to be very careful
-          // to only output 4 pixels at this point, to avoid encode/decode
-          // mismatches when encoding with multiple threads.
-          if (p_width == 4) {
-            if (comp_avg) {
-              const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
-              res_8bit = _mm_avg_epu8(res_8bit, orig);
-            }
-            *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
-          } else {
-            if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
-            _mm_storel_epi64(p, res_8bit);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/third_party/aom/av1/common/x86/warp_plane_ssse3.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c
index 3986ad389..efc542cbf 100644
--- a/third_party/aom/av1/common/x86/warp_plane_ssse3.c
+++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c
@@ -9,9 +9,11 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include <tmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
 
-#include "./av1_rtcd.h"
 #include "av1/common/warped_motion.h"
 
 /* This is a modified version of 'warped_filter' from warped_motion.c:
@@ -201,41 +203,142 @@ static const uint8_t even_mask[16] = { 0, 2,  2,  4,  4,  6,  6,  8,
 static const uint8_t odd_mask[16] = { 1, 3,  3,  5,  5,  7,  7,  9,
                                       9, 11, 11, 13, 13, 15, 15, 0 };
 
-void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
-                           int height, int stride, uint8_t *pred, int p_col,
-                           int p_row, int p_width, int p_height, int p_stride,
-                           int subsampling_x, int subsampling_y,
-                           ConvolveParams *conv_params, int16_t alpha,
-                           int16_t beta, int16_t gamma, int16_t delta) {
-  int comp_avg = conv_params->do_average;
+static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
+                                     int alpha, int k,
+                                     const int offset_bits_horiz,
+                                     const int reduce_bits_horiz) {
+  const __m128i src_even =
+      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
+  const __m128i src_odd =
+      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+
+  // Filter even-index pixels
+  const __m128i tmp_0 = _mm_loadl_epi64(
+      (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_1 = _mm_loadl_epi64(
+      (__m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_2 = _mm_loadl_epi64(
+      (__m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_3 = _mm_loadl_epi64(
+      (__m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_4 = _mm_loadl_epi64(
+      (__m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_5 = _mm_loadl_epi64(
+      (__m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_6 = _mm_loadl_epi64(
+      (__m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_7 = _mm_loadl_epi64(
+      (__m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
+  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
+  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
+  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
+  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
+  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
+  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
+  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
+  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
+  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
+  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
+  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
+
+  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+  const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+  const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+  const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+  const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+  // The pixel order we need for 'src' is:
+  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
+  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
+  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
+                                            _mm_srli_si128(src_odd, 4));
+  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
+  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+  const __m128i src_13 =
+      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
+  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
+  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
+                                            _mm_srli_si128(src_even, 6));
+  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
+
+  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
+                                             ((1 << reduce_bits_horiz) >> 1));
+
+  // Note: The values res_02 + res_46 and res_13 + res_57 both
+  // fit into int16s at this point, but their sum may be too wide to fit
+  // into an int16. However, once we also add round_const, the sum of
+  // all of these fits into a uint16.
+  //
+  // The wrapping behaviour of _mm_add_* is used here to make sure we
+  // get the correct result despite converting between different
+  // (implicit) types.
+  const __m128i res_even = _mm_add_epi16(res_02, res_46);
+  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
+  const __m128i res =
+      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
+  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+}
+
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
+                            int height, int stride, uint8_t *pred, int p_col,
+                            int p_row, int p_width, int p_height, int p_stride,
+                            int subsampling_x, int subsampling_y,
+                            ConvolveParams *conv_params, int16_t alpha,
+                            int16_t beta, int16_t gamma, int16_t delta) {
   __m128i tmp[15];
   int i, j, k;
   const int bd = 8;
-#if CONFIG_CONVOLVE_ROUND
-  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
-  const int reduce_bits_horiz =
-      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
-  const int offset_bits_horiz =
-      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
-  if (use_conv_params) {
-    conv_params->do_post_rounding = 1;
-  }
-  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
-  const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
-  const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-#endif
+  const int reduce_bits_horiz = conv_params->round_0;
+  const int reduce_bits_vert = conv_params->is_compound
+                                   ? conv_params->round_1
+                                   : 2 * FILTER_BITS - reduce_bits_horiz;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+  const __m128i reduce_bits_vert_const =
+      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
+  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const __m128i res_sub_const =
+      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+                     (1 << (offset_bits - conv_params->round_1 - 1)));
+  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+  __m128i round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi16(w0);
+  const __m128i wt1 = _mm_set1_epi16(w1);
+  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
 
   /* Note: For this code to work, the left/right frame borders need to be
-     extended by at least 13 pixels each. By the time we get here, other
-     code will have set up this border, but we allow an explicit check
-     for debugging purposes.
+  extended by at least 13 pixels each. By the time we get here, other
+  code will have set up this border, but we allow an explicit check
+  for debugging purposes.
   */
   /*for (i = 0; i < height; ++i) {
-    for (j = 0; j < 13; ++j) {
-      assert(ref[i * stride - 13 + j] == ref[i * stride]);
-      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
-    }
+  for (j = 0; j < 13; ++j) {
+  assert(ref[i * stride - 13 + j] == ref[i * stride]);
+  assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+  }
   }*/
 
   for (i = 0; i < p_height; i += 8) {
@@ -273,10 +376,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
           else if (iy > height - 1)
             iy = height - 1;
           tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
-              ref[iy * stride] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
         }
       } else if (ix4 >= width + 6) {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -285,11 +386,37 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
             iy = 0;
           else if (iy > height - 1)
             iy = height - 1;
-          tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
-                     1)) +
-              ref[iy * stride + (width - 1)] *
-                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+          tmp[k + 7] =
+              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+                             ref[iy * stride + (width - 1)] *
+                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
+        }
+      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+        const int out_of_boundary_left = -(ix4 - 6);
+        const int out_of_boundary_right = (ix4 + 8) - width;
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
+
+          // Load source pixels
+          __m128i src =
+              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+          if (out_of_boundary_left >= 0) {
+            const __m128i shuffle_reg_left =
+                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+            src = _mm_shuffle_epi8(src, shuffle_reg_left);
+          }
+          if (out_of_boundary_right >= 0) {
+            const __m128i shuffle_reg_right = _mm_loadu_si128(
+                (__m128i *)warp_pad_right[out_of_boundary_right]);
+            src = _mm_shuffle_epi8(src, shuffle_reg_right);
+          }
+          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+                            reduce_bits_horiz);
         }
       } else {
         for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -303,89 +430,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
           // Load source pixels
           const __m128i src =
               _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-          const __m128i src_even =
-              _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
-          const __m128i src_odd =
-              _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
-
-          // Filter even-index pixels
-          const __m128i tmp_0 = _mm_loadl_epi64((
-              __m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
-          const __m128i tmp_1 = _mm_loadl_epi64((
-              __m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
-          const __m128i tmp_2 = _mm_loadl_epi64((
-              __m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
-          const __m128i tmp_3 = _mm_loadl_epi64((
-              __m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
-          const __m128i tmp_4 = _mm_loadl_epi64((
-              __m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
-          const __m128i tmp_5 = _mm_loadl_epi64((
-              __m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
-          const __m128i tmp_6 = _mm_loadl_epi64((
-              __m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
-          const __m128i tmp_7 = _mm_loadl_epi64((
-              __m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
-
-          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
-          const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
-          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
-          const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
-          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
-          const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
-          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
-          const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
-
-          // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
-          const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
-          // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
-          const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
-          // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
-          const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
-          // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
-          const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
-
-          // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
-          const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
-          // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
-          const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-          // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
-          const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
-          // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
-          const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
-          // The pixel order we need for 'src' is:
-          // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
-          const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
-          const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
-          // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
-          const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
-                                                    _mm_srli_si128(src_odd, 4));
-          const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
-          // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
-          const __m128i src_13 =
-              _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
-          const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
-          // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
-          const __m128i src_57 = _mm_unpacklo_epi64(
-              _mm_srli_si128(src_odd, 4), _mm_srli_si128(src_even, 6));
-          const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
-
-          const __m128i round_const = _mm_set1_epi16(
-              (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
-
-          // Note: The values res_02 + res_46 and res_13 + res_57 both
-          // fit into int16s at this point, but their sum may be too wide to fit
-          // into an int16. However, once we also add round_const, the sum of
-          // all of these fits into a uint16.
-          //
-          // The wrapping behaviour of _mm_add_* is used here to make sure we
-          // get the correct result despite converting between different
-          // (implicit) types.
-          const __m128i res_even = _mm_add_epi16(res_02, res_46);
-          const __m128i res_odd = _mm_add_epi16(res_13, res_57);
-          const __m128i res =
-              _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
-          tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+                            reduce_bits_horiz);
         }
       }
 
@@ -474,40 +520,85 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
         __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
         __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
 
-#if CONFIG_CONVOLVE_ROUND
-        if (use_conv_params) {
+        if (conv_params->is_compound) {
           __m128i *const p =
               (__m128i *)&conv_params
                   ->dst[(i + k + 4) * conv_params->dst_stride + j];
-          const __m128i round_const = _mm_set1_epi32(
-              -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) +
-              ((1 << (conv_params->round_1)) >> 1));
-          res_lo = _mm_add_epi32(res_lo, round_const);
-          res_lo =
-              _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1));
-          if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo);
-          _mm_storeu_si128(p, res_lo);
+          res_lo = _mm_add_epi32(res_lo, res_add_const);
+          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
+                                 reduce_bits_vert_shift);
+          const __m128i temp_lo_16 = _mm_packus_epi32(res_lo, res_lo);
+          __m128i res_lo_16;
+          if (conv_params->do_average) {
+            __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+            const __m128i p_16 = _mm_loadl_epi64(p);
+
+            if (conv_params->use_jnt_comp_avg) {
+              const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
+              const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
+              const __m128i shifted_32 =
+                  _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+              res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
+            } else {
+              res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
+            }
+
+            res_lo_16 = _mm_add_epi16(res_lo_16, res_sub_const);
+
+            res_lo_16 = _mm_sra_epi16(
+                _mm_add_epi16(res_lo_16, round_bits_const), round_bits_shift);
+            __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
+            *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
+          } else {
+            _mm_storel_epi64(p, temp_lo_16);
+          }
           if (p_width > 4) {
-            res_hi = _mm_add_epi32(res_hi, round_const);
+            __m128i *const p4 =
+                (__m128i *)&conv_params
+                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+            res_hi = _mm_add_epi32(res_hi, res_add_const);
             res_hi =
-                _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1));
-            if (comp_avg)
-              res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi);
-            _mm_storeu_si128(p + 1, res_hi);
+                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
+                              reduce_bits_vert_shift);
+            const __m128i temp_hi_16 = _mm_packus_epi32(res_hi, res_hi);
+            __m128i res_hi_16;
+
+            if (conv_params->do_average) {
+              __m128i *const dst8_4 =
+                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+              const __m128i p4_16 = _mm_loadl_epi64(p4);
+
+              if (conv_params->use_jnt_comp_avg) {
+                const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
+                const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, wt);
+                const __m128i shifted_32 =
+                    _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+                res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
+              } else {
+                res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
+              }
+              res_hi_16 = _mm_add_epi16(res_hi_16, res_sub_const);
+
+              res_hi_16 = _mm_sra_epi16(
+                  _mm_add_epi16(res_hi_16, round_bits_const), round_bits_shift);
+              __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
+              *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
+
+            } else {
+              _mm_storel_epi64(p4, temp_hi_16);
+            }
           }
         } else {
-#else
-        {
-#endif
           // Round and pack into 8 bits
           const __m128i round_const =
-              _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
-                             ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
+              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+                             ((1 << reduce_bits_vert) >> 1));
 
           const __m128i res_lo_round = _mm_srai_epi32(
-              _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
           const __m128i res_hi_round = _mm_srai_epi32(
-              _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
 
           const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
           __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
@@ -519,13 +610,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
           // to only output 4 pixels at this point, to avoid encode/decode
           // mismatches when encoding with multiple threads.
           if (p_width == 4) {
-            if (comp_avg) {
-              const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
-              res_8bit = _mm_avg_epu8(res_8bit, orig);
-            }
             *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
           } else {
-            if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
             _mm_storel_epi64(p, res_8bit);
           }
         }
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
new file mode 100644
index 000000000..e1449fd21
--- /dev/null
+++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
+// on the left.
+// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
+// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h,
+                                      const ConvolveParams *conv_params) {
+  const int bd = 8;
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w & 7));
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  DECLARE_ALIGNED(32, uint16_t,
+                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+  int intermediate_height = h + SUBPEL_TAPS - 1;
+  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+  const __m128i zero_128 = _mm_setzero_si128();
+  const __m256i zero_256 = _mm256_setzero_si256();
+
+  // Add an offset to account for the "add_src" part of the convolve function.
+  const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+
+  const __m256i clamp_low = zero_256;
+  const __m256i clamp_high =
+      _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
+  /* Horizontal filter */
+  {
+    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+    const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+    const __m256i round_const = _mm256_set1_epi32(
+        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+    for (int i = 0; i < intermediate_height; ++i) {
+      for (int j = 0; j < w; j += 16) {
+        const uint8_t *data_ij = src_ptr + i * src_stride + j;
+
+        // Load 8-bit src data
+        const __m128i data_0 = xx_loadu_128(data_ij + 0);
+        const __m128i data_1 = xx_loadu_128(data_ij + 1);
+        const __m128i data_2 = xx_loadu_128(data_ij + 2);
+        const __m128i data_3 = xx_loadu_128(data_ij + 3);
+        const __m128i data_4 = xx_loadu_128(data_ij + 4);
+        const __m128i data_5 = xx_loadu_128(data_ij + 5);
+        const __m128i data_6 = xx_loadu_128(data_ij + 6);
+        const __m128i data_7 = xx_loadu_128(data_ij + 7);
+
+        // (Zero-)Extend 8-bit data to 16-bit data
+        const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
+        const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
+        const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
+        const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
+        const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
+        const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
+        const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
+        const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
+
+        // Multiply src data by filter coeffs and sum pairs
+        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+        // Calculate scalar product for even- and odd-indices separately,
+        // increasing to 32-bit precision
+        const __m256i res_even_sum = _mm256_add_epi32(
+            _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+        const __m256i res_odd_sum = _mm256_add_epi32(
+            _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+
+        const __m256i res_even = _mm256_srai_epi32(
+            _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+        const __m256i res_odd = _mm256_srai_epi32(
+            _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+        // Reduce to 16-bit precision and pack even- and odd-index results
+        // back into one register. The _mm256_packs_epi32 intrinsic returns
+        // a register with the pixels ordered as follows:
+        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+        const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+        const __m256i res_clamped =
+            _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
+
+        // Store in a temporary array
+        yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
+      }
+    }
+  }
+
+  /* Vertical filter */
+  {
+    // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
+    const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+    // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
+    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
+    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
+    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
+    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
+    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+    // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
+    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+    // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
+    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+    // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
+    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+    // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
+    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+    // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
+    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+    const __m256i round_const =
+        _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                          (1 << (bd + conv_params->round_1 - 1)));
+
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 16) {
+        const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
+
+        // Load 16-bit data from the output of the horizontal filter in
+        // which the pixels are ordered as follows:
+        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+        const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);
+        const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
+        const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
+        const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
+        const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
+        const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
+        const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
+        const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
+
+        // Filter the even-indices, increasing to 32-bit precision
+        const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
+        const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+        const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+        const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+        const __m256i res_even = _mm256_add_epi32(
+            _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+        // Filter the odd-indices, increasing to 32-bit precision
+        const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+        const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+        const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+        const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+        const __m256i res_odd = _mm256_add_epi32(
+            _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+        // Pixels are currently in the following order:
+        // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+        // res_odd order:  [ 15 13 11 9 ] [ 7 5 3 1 ]
+        //
+        // Rearrange the pixels into the following order:
+        // res_lo order: [ 11 10  9  8 ] [ 3 2 1 0 ]
+        // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+        const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+        const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+        const __m256i res_lo_round = _mm256_srai_epi32(
+            _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+        const __m256i res_hi_round = _mm256_srai_epi32(
+            _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+        // Reduce to 16-bit precision and pack into the correct order:
+        // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+        const __m256i res_16bit =
+            _mm256_packs_epi32(res_lo_round, res_hi_round);
+
+        // Reduce to 8-bit precision. This messes up the order:
+        // [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
+        // [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
+        const __m256i res_8bit =
+            _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);
+
+        // Swap the two central 32-bit values to get the order:
+        // [ - - - - - - - - - - - - - - - - ]
+        // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
+        const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);
+
+        // Store the lower 128-bit lane in the dst array
+        xx_storeu_128(dst + i * dst_stride + j,
+                      _mm256_castsi256_si128(res_8bit2));
+      }
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
index 14352895d..3083d224b 100644
--- a/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c
+++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
@@ -12,23 +12,26 @@
 #include <emmintrin.h>
 #include <assert.h>
 
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 
-void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int x_step_q4,
-                                    const int16_t *filter_y, int y_step_q4,
-                                    int w, int h) {
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h,
+                                      const ConvolveParams *conv_params) {
   const int bd = 8;
   assert(x_step_q4 == 16 && y_step_q4 == 16);
   assert(!(w & 7));
   (void)x_step_q4;
   (void)y_step_q4;
 
-  uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE];
+  DECLARE_ALIGNED(16, uint16_t,
+                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
   int intermediate_height = h + SUBPEL_TAPS - 1;
   int i, j;
   const int center_tap = ((SUBPEL_TAPS - 1) / 2);
@@ -57,9 +60,8 @@ void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
     // coeffs 6 7 6 7 6 7 6 7
     const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
 
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
-                       (1 << (bd + FILTER_BITS - 1)));
+    const __m128i round_const = _mm_set1_epi32(
+        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
 
     for (i = 0; i < intermediate_height; ++i) {
       for (j = 0; j < w; j += 8) {
@@ -79,7 +81,7 @@ void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
         __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                          _mm_add_epi32(res_2, res_6));
         res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
-                                  FILTER_BITS - EXTRAPREC_BITS);
+                                  conv_params->round_0);
 
         // Filter odd-index pixels
         const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
@@ -94,12 +96,13 @@ void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
         __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                         _mm_add_epi32(res_3, res_7));
         res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
-                                 FILTER_BITS - EXTRAPREC_BITS);
+                                 conv_params->round_0);
 
         // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
         __m128i res = _mm_packs_epi32(res_even, res_odd);
-        res = _mm_min_epi16(_mm_max_epi16(res, zero),
-                            _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1));
+        res = _mm_min_epi16(
+            _mm_max_epi16(res, zero),
+            _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1));
         _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
       }
     }
@@ -125,8 +128,8 @@ void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
     const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
 
     const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
-                       (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
+        _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                       (1 << (bd + conv_params->round_1 - 1)));
 
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; j += 8) {
@@ -180,9 +183,9 @@ void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
         const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
 
         const __m128i res_lo_round = _mm_srai_epi32(
-            _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);
+            _mm_add_epi32(res_lo, round_const), conv_params->round_1);
         const __m128i res_hi_round = _mm_srai_epi32(
-            _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);
+            _mm_add_epi32(res_hi, round_const), conv_params->round_1);
 
         const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
         __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
diff --git a/third_party/aom/av1/common/zigzag.h b/third_party/aom/av1/common/zigzag.h
deleted file mode 100644
index c58b18b57..000000000
--- a/third_party/aom/av1/common/zigzag.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#if !defined(_zigzag_H)
-# define _zigzag_H (1)
-
-extern const unsigned char OD_ZIGZAG4_DCT_DCT[15][2];
-extern const unsigned char OD_ZIGZAG4_ADST_DCT[15][2];
-extern const unsigned char OD_ZIGZAG4_DCT_ADST[15][2];
-#define OD_ZIGZAG4_ADST_ADST OD_ZIGZAG4_DCT_DCT
-
-extern const unsigned char OD_ZIGZAG8_DCT_DCT[48][2];
-extern const unsigned char OD_ZIGZAG8_ADST_DCT[48][2];
-extern const unsigned char OD_ZIGZAG8_DCT_ADST[48][2];
-#define OD_ZIGZAG8_ADST_ADST OD_ZIGZAG8_DCT_DCT
-
-extern const unsigned char OD_ZIGZAG16_DCT_DCT[192][2];
-extern const unsigned char OD_ZIGZAG16_ADST_DCT[192][2];
-extern const unsigned char OD_ZIGZAG16_DCT_ADST[192][2];
-#define OD_ZIGZAG16_ADST_ADST OD_ZIGZAG16_DCT_DCT
-
-extern const unsigned char OD_ZIGZAG32_DCT_DCT[768][2];
-#endif
diff --git a/third_party/aom/av1/common/zigzag16.c b/third_party/aom/av1/common/zigzag16.c
deleted file mode 100644
index 6df6e3855..000000000
--- a/third_party/aom/av1/common/zigzag16.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/* This file is generated by gen_zigzag16.m */
-
-/* clang-format off */
-
-#include "odintrin.h"
-OD_EXTERN const unsigned char OD_ZIGZAG16_DCT_DCT[192][2] = {
-  {8, 0}, {8, 1}, {8, 2}, {9, 0},
-  {8, 3}, {9, 1}, {9, 2}, {10, 0},
-  {9, 3}, {10, 1}, {10, 2}, {11, 0},
-  {10, 3}, {11, 1}, {11, 2}, {11, 3},
-  {12, 0}, {12, 1}, {13, 0}, {12, 2},
-  {12, 3}, {13, 1}, {13, 2}, {14, 0},
-  {13, 3}, {14, 1}, {15, 0}, {14, 2},
-  {14, 3}, {15, 1}, {15, 2}, {15, 3},
-  {0, 8}, {1, 8}, {0, 9}, {2, 8},
-  {1, 9}, {3, 8}, {0, 10}, {2, 9},
-  {1, 10}, {3, 9}, {0, 11}, {2, 10},
-  {1, 11}, {3, 10}, {0, 12}, {2, 11},
-  {1, 12}, {3, 11}, {0, 13}, {2, 12},
-  {1, 13}, {0, 14}, {3, 12}, {2, 13},
-  {1, 14}, {3, 13}, {0, 15}, {2, 14},
-  {1, 15}, {3, 14}, {2, 15}, {3, 15},
-  {4, 8}, {5, 8}, {4, 9}, {8, 4},
-  {8, 5}, {6, 8}, {5, 9}, {4, 10},
-  {9, 4}, {8, 6}, {7, 8}, {9, 5},
-  {5, 10}, {8, 7}, {6, 9}, {4, 11},
-  {10, 4}, {9, 6}, {7, 9}, {8, 8},
-  {10, 5}, {6, 10}, {5, 11}, {9, 7},
-  {8, 9}, {10, 6}, {7, 10}, {4, 12},
-  {11, 4}, {9, 8}, {6, 11}, {10, 7},
-  {11, 5}, {5, 12}, {8, 10}, {7, 11},
-  {9, 9}, {4, 13}, {10, 8}, {11, 6},
-  {11, 7}, {6, 12}, {8, 11}, {9, 10},
-  {12, 4}, {5, 13}, {10, 9}, {12, 5},
-  {7, 12}, {11, 8}, {4, 14}, {6, 13},
-  {10, 10}, {9, 11}, {12, 6}, {13, 4},
-  {11, 9}, {8, 12}, {5, 14}, {12, 7},
-  {7, 13}, {4, 15}, {13, 5}, {10, 11},
-  {11, 10}, {9, 12}, {13, 6}, {12, 8},
-  {6, 14}, {8, 13}, {5, 15}, {13, 7},
-  {14, 4}, {12, 9}, {7, 14}, {11, 11},
-  {10, 12}, {9, 13}, {14, 5}, {6, 15},
-  {13, 8}, {8, 14}, {12, 10}, {14, 6},
-  {7, 15}, {13, 9}, {15, 4}, {10, 13},
-  {11, 12}, {14, 7}, {9, 14}, {12, 11},
-  {8, 15}, {15, 5}, {13, 10}, {14, 8},
-  {11, 13}, {15, 6}, {9, 15}, {10, 14},
-  {14, 9}, {15, 7}, {13, 11}, {12, 12},
-  {10, 15}, {11, 14}, {15, 8}, {14, 10},
-  {12, 13}, {13, 12}, {15, 9}, {11, 15},
-  {14, 11}, {13, 13}, {15, 10}, {12, 14},
-  {13, 14}, {15, 11}, {14, 12}, {12, 15},
-  {14, 13}, {13, 15}, {15, 12}, {14, 14},
-  {15, 13}, {14, 15}, {15, 14}, {15, 15}
-  };
-
-OD_EXTERN const unsigned char OD_ZIGZAG16_ADST_DCT[192][2] = {
-  {8, 0}, {9, 0}, {10, 0}, {8, 1},
-  {11, 0}, {9, 1}, {8, 2}, {12, 0},
-  {10, 1}, {9, 2}, {8, 3}, {13, 0},
-  {11, 1}, {10, 2}, {9, 3}, {14, 0},
-  {12, 1}, {10, 3}, {15, 0}, {11, 2},
-  {13, 1}, {11, 3}, {12, 2}, {14, 1},
-  {12, 3}, {13, 2}, {15, 1}, {13, 3},
-  {14, 2}, {14, 3}, {15, 2}, {15, 3},
-  {0, 8}, {1, 8}, {2, 8}, {0, 9},
-  {3, 8}, {1, 9}, {2, 9}, {0, 10},
-  {3, 9}, {1, 10}, {2, 10}, {0, 11},
-  {3, 10}, {1, 11}, {2, 11}, {0, 12},
-  {3, 11}, {1, 12}, {2, 12}, {0, 13},
-  {3, 12}, {1, 13}, {0, 14}, {2, 13},
-  {0, 15}, {1, 14}, {3, 13}, {2, 14},
-  {1, 15}, {3, 14}, {2, 15}, {3, 15},
-  {8, 4}, {9, 4}, {8, 5}, {4, 8},
-  {10, 4}, {9, 5}, {5, 8}, {8, 6},
-  {4, 9}, {10, 5}, {9, 6}, {6, 8},
-  {8, 7}, {11, 4}, {7, 8}, {5, 9},
-  {9, 7}, {11, 5}, {10, 6}, {4, 10},
-  {6, 9}, {8, 8}, {5, 10}, {7, 9},
-  {12, 4}, {10, 7}, {9, 8}, {11, 6},
-  {8, 9}, {4, 11}, {6, 10}, {7, 10},
-  {12, 5}, {5, 11}, {10, 8}, {11, 7},
-  {9, 9}, {4, 12}, {13, 4}, {8, 10},
-  {6, 11}, {12, 6}, {5, 12}, {10, 9},
-  {7, 11}, {9, 10}, {11, 8}, {13, 5},
-  {8, 11}, {4, 13}, {6, 12}, {10, 10},
-  {12, 7}, {11, 9}, {7, 12}, {14, 4},
-  {5, 13}, {9, 11}, {13, 6}, {8, 12},
-  {4, 14}, {12, 8}, {6, 13}, {11, 10},
-  {10, 11}, {12, 9}, {5, 14}, {13, 7},
-  {14, 5}, {9, 12}, {4, 15}, {7, 13},
-  {8, 13}, {6, 14}, {13, 8}, {11, 11},
-  {10, 12}, {15, 4}, {12, 10}, {14, 6},
-  {13, 9}, {5, 15}, {9, 13}, {7, 14},
-  {15, 5}, {6, 15}, {8, 14}, {14, 7},
-  {11, 12}, {7, 15}, {9, 14}, {13, 10},
-  {10, 13}, {14, 8}, {15, 6}, {14, 9},
-  {12, 11}, {8, 15}, {15, 7}, {10, 14},
-  {11, 13}, {9, 15}, {13, 11}, {12, 12},
-  {15, 8}, {14, 10}, {15, 9}, {10, 15},
-  {11, 14}, {13, 12}, {12, 13}, {15, 10},
-  {14, 11}, {11, 15}, {13, 13}, {15, 11},
-  {14, 12}, {12, 14}, {15, 12}, {13, 14},
-  {12, 15}, {14, 13}, {13, 15}, {15, 13},
-  {14, 14}, {15, 14}, {14, 15}, {15, 15}
-  };
-
-OD_EXTERN const unsigned char OD_ZIGZAG16_DCT_ADST[192][2] = {
-  {8, 0}, {8, 1}, {8, 2}, {8, 3},
-  {9, 0}, {9, 1}, {9, 2}, {9, 3},
-  {10, 0}, {10, 1}, {10, 2}, {10, 3},
-  {11, 0}, {11, 1}, {11, 2}, {11, 3},
-  {12, 0}, {12, 1}, {12, 2}, {12, 3},
-  {13, 0}, {13, 1}, {13, 2}, {13, 3},
-  {14, 0}, {15, 0}, {14, 1}, {14, 2},
-  {14, 3}, {15, 1}, {15, 2}, {15, 3},
-  {0, 8}, {0, 9}, {0, 10}, {1, 8},
-  {0, 11}, {1, 9}, {2, 8}, {0, 12},
-  {1, 10}, {2, 9}, {0, 13}, {1, 11},
-  {3, 8}, {2, 10}, {0, 14}, {1, 12},
-  {3, 9}, {0, 15}, {2, 11}, {3, 10},
-  {1, 13}, {2, 12}, {3, 11}, {1, 14},
-  {2, 13}, {1, 15}, {3, 12}, {2, 14},
-  {3, 13}, {2, 15}, {3, 14}, {3, 15},
-  {4, 8}, {4, 9}, {5, 8}, {4, 10},
-  {5, 9}, {4, 11}, {6, 8}, {5, 10},
-  {8, 4}, {6, 9}, {4, 12}, {5, 11},
-  {8, 5}, {6, 10}, {7, 8}, {8, 6},
-  {4, 13}, {7, 9}, {5, 12}, {8, 7},
-  {9, 4}, {6, 11}, {8, 8}, {7, 10},
-  {5, 13}, {9, 5}, {4, 14}, {9, 6},
-  {8, 9}, {6, 12}, {9, 7}, {7, 11},
-  {4, 15}, {8, 10}, {9, 8}, {5, 14},
-  {10, 4}, {6, 13}, {10, 5}, {9, 9},
-  {7, 12}, {8, 11}, {10, 6}, {5, 15},
-  {10, 7}, {6, 14}, {9, 10}, {7, 13},
-  {8, 12}, {10, 8}, {9, 11}, {6, 15},
-  {11, 4}, {11, 5}, {10, 9}, {8, 13},
-  {7, 14}, {11, 6}, {9, 12}, {11, 7},
-  {10, 10}, {7, 15}, {8, 14}, {12, 4},
-  {11, 8}, {12, 5}, {9, 13}, {10, 11},
-  {8, 15}, {11, 9}, {12, 6}, {12, 7},
-  {10, 12}, {9, 14}, {11, 10}, {13, 4},
-  {12, 8}, {9, 15}, {13, 5}, {11, 11},
-  {12, 9}, {10, 13}, {13, 6}, {13, 7},
-  {12, 10}, {14, 4}, {11, 12}, {13, 8},
-  {10, 14}, {14, 5}, {12, 11}, {13, 9},
-  {14, 6}, {10, 15}, {11, 13}, {15, 4},
-  {14, 7}, {12, 12}, {13, 10}, {14, 8},
-  {15, 5}, {13, 11}, {15, 6}, {11, 14},
-  {14, 9}, {12, 13}, {11, 15}, {15, 7},
-  {14, 10}, {15, 8}, {13, 12}, {12, 14},
-  {15, 9}, {14, 11}, {13, 13}, {12, 15},
-  {15, 10}, {14, 12}, {13, 14}, {15, 11},
-  {13, 15}, {14, 13}, {14, 14}, {15, 12},
-  {14, 15}, {15, 13}, {15, 14}, {15, 15}
-  };
diff --git a/third_party/aom/av1/common/zigzag32.c b/third_party/aom/av1/common/zigzag32.c
deleted file mode 100644
index cb3b9bc63..000000000
--- a/third_party/aom/av1/common/zigzag32.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/* This file is generated by gen_zigzag32.m */
-
-/* clang-format off */
-
-#include "odintrin.h"
-OD_EXTERN const unsigned char OD_ZIGZAG32_DCT_DCT[768][2] = {
-  { 16, 0 }, { 17, 0 }, { 18, 0 }, { 19, 0 },
-  { 16, 1 }, { 17, 1 }, { 20, 0 }, { 16, 2 },
-  { 18, 1 }, { 21, 0 }, { 17, 2 }, { 16, 3 },
-  { 19, 1 }, { 22, 0 }, { 18, 2 }, { 17, 3 },
-  { 20, 1 }, { 16, 4 }, { 23, 0 }, { 19, 2 },
-  { 24, 0 }, { 16, 5 }, { 21, 1 }, { 17, 4 },
-  { 18, 3 }, { 20, 2 }, { 17, 5 }, { 16, 6 },
-  { 19, 3 }, { 18, 4 }, { 25, 0 }, { 22, 1 },
-  { 16, 7 }, { 21, 2 }, { 17, 6 }, { 20, 3 },
-  { 26, 0 }, { 18, 5 }, { 19, 4 }, { 17, 7 },
-  { 23, 1 }, { 22, 2 }, { 18, 6 }, { 27, 0 },
-  { 19, 5 }, { 24, 1 }, { 21, 3 }, { 28, 0 },
-  { 20, 4 }, { 18, 7 }, { 19, 6 }, { 23, 2 },
-  { 29, 0 }, { 25, 1 }, { 21, 4 }, { 30, 0 },
-  { 20, 5 }, { 22, 3 }, { 31, 0 }, { 19, 7 },
-  { 24, 2 }, { 26, 1 }, { 20, 6 }, { 21, 5 },
-  { 22, 4 }, { 23, 3 }, { 27, 1 }, { 25, 2 },
-  { 20, 7 }, { 28, 1 }, { 24, 3 }, { 21, 6 },
-  { 22, 5 }, { 23, 4 }, { 26, 2 }, { 21, 7 },
-  { 29, 1 }, { 25, 3 }, { 30, 1 }, { 27, 2 },
-  { 22, 6 }, { 23, 5 }, { 31, 1 }, { 24, 4 },
-  { 26, 3 }, { 28, 2 }, { 22, 7 }, { 23, 6 },
-  { 25, 4 }, { 24, 5 }, { 29, 2 }, { 30, 2 },
-  { 27, 3 }, { 23, 7 }, { 31, 2 }, { 24, 6 },
-  { 26, 4 }, { 25, 5 }, { 28, 3 }, { 24, 7 },
-  { 27, 4 }, { 29, 3 }, { 25, 6 }, { 26, 5 },
-  { 30, 3 }, { 31, 3 }, { 28, 4 }, { 27, 5 },
-  { 25, 7 }, { 29, 4 }, { 26, 6 }, { 28, 5 },
-  { 30, 4 }, { 26, 7 }, { 27, 6 }, { 31, 4 },
-  { 29, 5 }, { 27, 7 }, { 30, 5 }, { 28, 6 },
-  { 31, 5 }, { 29, 6 }, { 28, 7 }, { 30, 6 },
-  { 31, 6 }, { 29, 7 }, { 30, 7 }, { 31, 7 },
-  { 0, 16 }, { 0, 17 }, { 1, 16 }, { 0, 18 },
-  { 1, 17 }, { 0, 19 }, { 2, 16 }, { 1, 18 },
-  { 0, 20 }, { 2, 17 }, { 3, 16 }, { 1, 19 },
-  { 2, 18 }, { 0, 21 }, { 3, 17 }, { 4, 16 },
-  { 1, 20 }, { 2, 19 }, { 0, 22 }, { 3, 18 },
-  { 4, 17 }, { 5, 16 }, { 0, 23 }, { 3, 19 },
-  { 2, 20 }, { 1, 21 }, { 4, 18 }, { 6, 16 },
-  { 5, 17 }, { 3, 20 }, { 2, 21 }, { 1, 22 },
-  { 0, 24 }, { 0, 25 }, { 4, 19 }, { 7, 16 },
-  { 6, 17 }, { 5, 18 }, { 0, 26 }, { 3, 21 },
-  { 2, 22 }, { 1, 23 }, { 4, 20 }, { 5, 19 },
-  { 6, 18 }, { 1, 24 }, { 7, 17 }, { 0, 27 },
-  { 2, 23 }, { 3, 22 }, { 4, 21 }, { 1, 25 },
-  { 5, 20 }, { 7, 18 }, { 0, 28 }, { 6, 19 },
-  { 2, 24 }, { 1, 26 }, { 0, 29 }, { 4, 22 },
-  { 3, 23 }, { 2, 25 }, { 5, 21 }, { 0, 31 },
-  { 7, 19 }, { 6, 20 }, { 0, 30 }, { 1, 27 },
-  { 3, 24 }, { 2, 26 }, { 4, 23 }, { 5, 22 },
-  { 7, 20 }, { 1, 28 }, { 6, 21 }, { 3, 25 },
-  { 2, 27 }, { 1, 29 }, { 4, 24 }, { 2, 28 },
-  { 1, 30 }, { 7, 21 }, { 5, 23 }, { 3, 26 },
-  { 6, 22 }, { 1, 31 }, { 4, 25 }, { 7, 22 },
-  { 3, 27 }, { 2, 29 }, { 2, 30 }, { 5, 24 },
-  { 2, 31 }, { 6, 23 }, { 4, 26 }, { 3, 28 },
-  { 5, 25 }, { 3, 29 }, { 6, 24 }, { 7, 23 },
-  { 3, 30 }, { 4, 27 }, { 3, 31 }, { 5, 26 },
-  { 6, 25 }, { 4, 28 }, { 7, 24 }, { 4, 29 },
-  { 5, 27 }, { 4, 30 }, { 4, 31 }, { 6, 26 },
-  { 5, 28 }, { 7, 25 }, { 6, 27 }, { 5, 29 },
-  { 7, 26 }, { 5, 30 }, { 5, 31 }, { 6, 28 },
-  { 7, 27 }, { 6, 29 }, { 6, 30 }, { 7, 28 },
-  { 6, 31 }, { 7, 29 }, { 7, 30 }, { 7, 31 },
-  { 8, 16 }, { 9, 16 }, { 8, 17 }, { 10, 16 },
-  { 9, 17 }, { 16, 8 }, { 8, 18 }, { 16, 9 },
-  { 10, 17 }, { 11, 16 }, { 17, 8 }, { 9, 18 },
-  { 8, 19 }, { 16, 10 }, { 11, 17 }, { 12, 16 },
-  { 10, 18 }, { 17, 9 }, { 9, 19 }, { 16, 11 },
-  { 8, 20 }, { 18, 8 }, { 17, 10 }, { 10, 19 },
-  { 12, 17 }, { 11, 18 }, { 9, 20 }, { 16, 12 },
-  { 18, 9 }, { 8, 21 }, { 13, 16 }, { 17, 11 },
-  { 19, 8 }, { 18, 10 }, { 13, 17 }, { 16, 13 },
-  { 11, 19 }, { 12, 18 }, { 10, 20 }, { 17, 12 },
-  { 9, 21 }, { 19, 9 }, { 8, 22 }, { 14, 16 },
-  { 18, 11 }, { 11, 20 }, { 10, 21 }, { 20, 8 },
-  { 13, 18 }, { 16, 14 }, { 12, 19 }, { 17, 13 },
-  { 19, 10 }, { 14, 17 }, { 9, 22 }, { 18, 12 },
-  { 8, 23 }, { 17, 14 }, { 20, 9 }, { 15, 16 },
-  { 16, 15 }, { 13, 19 }, { 10, 22 }, { 19, 11 },
-  { 11, 21 }, { 14, 18 }, { 12, 20 }, { 18, 13 },
-  { 20, 10 }, { 21, 8 }, { 15, 17 }, { 9, 23 },
-  { 19, 12 }, { 11, 22 }, { 8, 24 }, { 21, 9 },
-  { 17, 15 }, { 16, 16 }, { 14, 19 }, { 18, 14 },
-  { 12, 21 }, { 13, 20 }, { 20, 11 }, { 10, 23 },
-  { 19, 13 }, { 15, 18 }, { 16, 17 }, { 21, 10 },
-  { 22, 8 }, { 9, 24 }, { 8, 25 }, { 20, 12 },
-  { 15, 19 }, { 11, 23 }, { 17, 16 }, { 18, 15 },
-  { 14, 20 }, { 12, 22 }, { 10, 24 }, { 22, 9 },
-  { 21, 11 }, { 19, 14 }, { 13, 21 }, { 16, 18 },
-  { 9, 25 }, { 17, 17 }, { 8, 26 }, { 20, 13 },
-  { 23, 8 }, { 12, 23 }, { 13, 22 }, { 22, 10 },
-  { 19, 15 }, { 15, 20 }, { 16, 19 }, { 21, 12 },
-  { 11, 24 }, { 14, 21 }, { 8, 27 }, { 18, 16 },
-  { 10, 25 }, { 9, 26 }, { 22, 11 }, { 20, 14 },
-  { 23, 9 }, { 18, 17 }, { 17, 18 }, { 17, 19 },
-  { 19, 16 }, { 21, 13 }, { 10, 26 }, { 12, 24 },
-  { 23, 10 }, { 24, 8 }, { 8, 28 }, { 16, 20 },
-  { 9, 27 }, { 15, 21 }, { 22, 12 }, { 14, 22 },
-  { 13, 23 }, { 20, 15 }, { 11, 25 }, { 24, 9 },
-  { 18, 18 }, { 19, 17 }, { 23, 11 }, { 10, 27 },
-  { 8, 29 }, { 12, 25 }, { 9, 28 }, { 8, 30 },
-  { 21, 14 }, { 13, 24 }, { 11, 26 }, { 25, 8 },
-  { 24, 10 }, { 20, 16 }, { 19, 18 }, { 14, 23 },
-  { 22, 13 }, { 8, 31 }, { 17, 20 }, { 9, 29 },
-  { 23, 12 }, { 15, 22 }, { 25, 9 }, { 11, 27 },
-  { 10, 28 }, { 20, 17 }, { 21, 15 }, { 18, 19 },
-  { 16, 21 }, { 24, 11 }, { 9, 30 }, { 12, 26 },
-  { 10, 29 }, { 22, 14 }, { 14, 24 }, { 9, 31 },
-  { 26, 8 }, { 13, 25 }, { 25, 10 }, { 18, 20 },
-  { 19, 19 }, { 11, 28 }, { 15, 23 }, { 20, 18 },
-  { 10, 30 }, { 12, 27 }, { 17, 21 }, { 23, 13 },
-  { 24, 12 }, { 21, 16 }, { 16, 22 }, { 26, 9 },
-  { 27, 8 }, { 13, 26 }, { 22, 15 }, { 10, 31 },
-  { 14, 25 }, { 12, 28 }, { 25, 11 }, { 21, 17 },
-  { 26, 10 }, { 20, 19 }, { 11, 29 }, { 15, 24 },
-  { 23, 14 }, { 27, 9 }, { 11, 30 }, { 13, 27 },
-  { 19, 20 }, { 24, 13 }, { 28, 8 }, { 11, 31 },
-  { 22, 16 }, { 17, 22 }, { 16, 23 }, { 25, 12 },
-  { 18, 21 }, { 12, 29 }, { 21, 18 }, { 28, 9 },
-  { 27, 10 }, { 26, 11 }, { 29, 8 }, { 14, 26 },
-  { 15, 25 }, { 13, 28 }, { 12, 30 }, { 23, 15 },
-  { 30, 8 }, { 16, 24 }, { 13, 29 }, { 25, 13 },
-  { 24, 14 }, { 20, 20 }, { 31, 8 }, { 12, 31 },
-  { 14, 27 }, { 28, 10 }, { 26, 12 }, { 22, 17 },
-  { 21, 19 }, { 17, 23 }, { 18, 22 }, { 29, 9 },
-  { 27, 11 }, { 19, 21 }, { 27, 12 }, { 30, 9 },
-  { 31, 9 }, { 13, 30 }, { 24, 15 }, { 23, 16 },
-  { 15, 26 }, { 14, 28 }, { 29, 10 }, { 28, 11 },
-  { 26, 13 }, { 17, 24 }, { 13, 31 }, { 25, 14 },
-  { 22, 18 }, { 16, 25 }, { 30, 10 }, { 14, 29 },
-  { 15, 27 }, { 19, 22 }, { 21, 20 }, { 20, 21 },
-  { 27, 13 }, { 29, 11 }, { 18, 23 }, { 23, 17 },
-  { 16, 26 }, { 31, 10 }, { 24, 16 }, { 14, 30 },
-  { 22, 19 }, { 14, 31 }, { 28, 12 }, { 26, 14 },
-  { 30, 11 }, { 15, 28 }, { 25, 15 }, { 17, 25 },
-  { 23, 18 }, { 18, 24 }, { 15, 30 }, { 29, 12 },
-  { 31, 11 }, { 16, 27 }, { 24, 17 }, { 28, 13 },
-  { 19, 23 }, { 15, 29 }, { 25, 16 }, { 17, 26 },
-  { 27, 14 }, { 22, 20 }, { 15, 31 }, { 20, 22 },
-  { 21, 21 }, { 16, 28 }, { 17, 27 }, { 30, 12 },
-  { 26, 15 }, { 19, 24 }, { 18, 25 }, { 23, 19 },
-  { 29, 13 }, { 31, 12 }, { 24, 18 }, { 26, 16 },
-  { 25, 17 }, { 16, 29 }, { 28, 14 }, { 20, 23 },
-  { 18, 26 }, { 21, 22 }, { 19, 25 }, { 22, 21 },
-  { 27, 15 }, { 17, 28 }, { 16, 30 }, { 26, 17 },
-  { 23, 20 }, { 16, 31 }, { 25, 18 }, { 27, 16 },
-  { 20, 24 }, { 24, 19 }, { 31, 13 }, { 30, 13 },
-  { 29, 14 }, { 18, 27 }, { 28, 15 }, { 17, 29 },
-  { 19, 26 }, { 17, 30 }, { 21, 23 }, { 22, 22 },
-  { 30, 14 }, { 20, 25 }, { 23, 21 }, { 17, 31 },
-  { 18, 28 }, { 25, 19 }, { 24, 20 }, { 28, 16 },
-  { 31, 14 }, { 26, 18 }, { 19, 27 }, { 29, 15 },
-  { 27, 17 }, { 30, 15 }, { 21, 24 }, { 22, 23 },
-  { 26, 19 }, { 23, 22 }, { 28, 17 }, { 29, 16 },
-  { 18, 30 }, { 24, 21 }, { 25, 20 }, { 18, 31 },
-  { 18, 29 }, { 20, 26 }, { 19, 28 }, { 27, 18 },
-  { 31, 15 }, { 20, 27 }, { 30, 16 }, { 19, 29 },
-  { 29, 17 }, { 31, 16 }, { 27, 19 }, { 21, 25 },
-  { 28, 18 }, { 26, 20 }, { 22, 24 }, { 25, 21 },
-  { 19, 30 }, { 24, 22 }, { 30, 17 }, { 21, 26 },
-  { 23, 23 }, { 19, 31 }, { 20, 28 }, { 31, 17 },
-  { 28, 19 }, { 27, 20 }, { 21, 27 }, { 29, 18 },
-  { 30, 18 }, { 25, 22 }, { 26, 21 }, { 20, 29 },
-  { 22, 25 }, { 24, 23 }, { 29, 19 }, { 23, 24 },
-  { 20, 31 }, { 20, 30 }, { 28, 20 }, { 21, 28 },
-  { 22, 26 }, { 31, 18 }, { 27, 21 }, { 30, 19 },
-  { 22, 27 }, { 29, 20 }, { 23, 25 }, { 24, 24 },
-  { 26, 22 }, { 21, 29 }, { 25, 23 }, { 31, 19 },
-  { 21, 30 }, { 23, 26 }, { 28, 21 }, { 21, 31 },
-  { 22, 28 }, { 30, 20 }, { 25, 24 }, { 27, 22 },
-  { 29, 21 }, { 26, 23 }, { 24, 25 }, { 31, 20 },
-  { 23, 27 }, { 22, 29 }, { 30, 21 }, { 28, 22 },
-  { 24, 26 }, { 25, 25 }, { 27, 23 }, { 22, 30 },
-  { 23, 28 }, { 22, 31 }, { 26, 24 }, { 31, 21 },
-  { 24, 27 }, { 29, 22 }, { 27, 24 }, { 30, 22 },
-  { 25, 26 }, { 28, 23 }, { 23, 30 }, { 23, 29 },
-  { 24, 28 }, { 25, 27 }, { 31, 22 }, { 23, 31 },
-  { 26, 25 }, { 28, 24 }, { 29, 23 }, { 24, 29 },
-  { 24, 30 }, { 27, 25 }, { 25, 28 }, { 26, 26 },
-  { 30, 23 }, { 26, 27 }, { 31, 23 }, { 28, 25 },
-  { 27, 26 }, { 25, 29 }, { 24, 31 }, { 29, 24 },
-  { 30, 24 }, { 27, 27 }, { 29, 25 }, { 26, 28 },
-  { 31, 24 }, { 25, 30 }, { 25, 31 }, { 28, 26 },
-  { 27, 28 }, { 26, 29 }, { 30, 25 }, { 29, 26 },
-  { 28, 27 }, { 26, 30 }, { 31, 25 }, { 27, 29 },
-  { 26, 31 }, { 30, 26 }, { 28, 28 }, { 31, 26 },
-  { 29, 27 }, { 27, 30 }, { 28, 29 }, { 27, 31 },
-  { 30, 27 }, { 31, 27 }, { 28, 30 }, { 29, 28 },
-  { 30, 28 }, { 29, 29 }, { 30, 29 }, { 31, 28 },
-  { 28, 31 }, { 29, 30 }, { 29, 31 }, { 31, 29 },
-  { 30, 30 }, { 30, 31 }, { 31, 30 }, { 31, 31 }
-};
diff --git a/third_party/aom/av1/common/zigzag4.c b/third_party/aom/av1/common/zigzag4.c
deleted file mode 100644
index 1fb5a320b..000000000
--- a/third_party/aom/av1/common/zigzag4.c
+++ /dev/null
@@ -1,22 +0,0 @@
-/* This file is generated by gen_zigzag4.m */
-
-/* clang-format off */
-
-#include "odintrin.h"
-OD_EXTERN const unsigned char OD_ZIGZAG4_DCT_DCT[15][2] = {
-  {0, 1}, {1, 0}, {1, 1}, {0, 2},
-  {2, 0}, {0, 3}, {1, 2}, {3, 0},
-  {2, 1}, {1, 3}, {2, 2}, {3, 1},
-  {2, 3}, {3, 2}, {3, 3} };
-
-OD_EXTERN const unsigned char OD_ZIGZAG4_ADST_DCT[15][2] = {
-  {1, 0}, {0, 1}, {2, 0}, {1, 1},
-  {3, 0}, {2, 1}, {0, 2}, {1, 2},
-  {3, 1}, {0, 3}, {2, 2}, {1, 3},
-  {3, 2}, {2, 3}, {3, 3} };
-
-OD_EXTERN const unsigned char OD_ZIGZAG4_DCT_ADST[15][2] = {
-  {0, 1}, {0, 2}, {1, 0}, {0, 3},
-  {1, 1}, {1, 2}, {2, 0}, {1, 3},
-  {2, 1}, {2, 2}, {3, 0}, {3, 1},
-  {2, 3}, {3, 2}, {3, 3} };
diff --git a/third_party/aom/av1/common/zigzag8.c b/third_party/aom/av1/common/zigzag8.c
deleted file mode 100644
index 3f11e0c03..000000000
--- a/third_party/aom/av1/common/zigzag8.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/* This file is generated by gen_zigzag8.m */
-
-/* clang-format off */
-
-#include "odintrin.h"
-
-OD_EXTERN const unsigned char OD_ZIGZAG8_DCT_DCT[48][2] = {
-  {4, 0}, {4, 1}, {5, 0}, {5, 1},
-  {6, 0}, {7, 0}, {6, 1}, {7, 1},
-  {0, 4}, {1, 4}, {0, 5}, {1, 5},
-  {0, 6}, {1, 6}, {0, 7}, {1, 7},
-  {2, 4}, {4, 2}, {3, 4}, {2, 5},
-  {4, 3}, {5, 2}, {4, 4}, {3, 5},
-  {5, 3}, {2, 6}, {4, 5}, {6, 2},
-  {5, 4}, {3, 6}, {2, 7}, {6, 3},
-  {5, 5}, {7, 2}, {4, 6}, {3, 7},
-  {6, 4}, {7, 3}, {4, 7}, {5, 6},
-  {6, 5}, {7, 4}, {5, 7}, {6, 6},
-  {7, 5}, {6, 7}, {7, 6}, {7, 7}
-  };
-
-OD_EXTERN const unsigned char OD_ZIGZAG8_ADST_DCT[48][2] = {
-  {4, 0}, {5, 0}, {4, 1}, {6, 0},
-  {5, 1}, {7, 0}, {6, 1}, {7, 1},
-  {0, 4}, {1, 4}, {0, 5}, {1, 5},
-  {0, 6}, {1, 6}, {0, 7}, {1, 7},
-  {4, 2}, {2, 4}, {5, 2}, {4, 3},
-  {3, 4}, {2, 5}, {5, 3}, {4, 4},
-  {6, 2}, {3, 5}, {5, 4}, {2, 6},
-  {4, 5}, {6, 3}, {7, 2}, {3, 6},
-  {2, 7}, {5, 5}, {6, 4}, {4, 6},
-  {7, 3}, {3, 7}, {5, 6}, {6, 5},
-  {4, 7}, {7, 4}, {5, 7}, {7, 5},
-  {6, 6}, {7, 6}, {6, 7}, {7, 7}
-  };
-
-OD_EXTERN const unsigned char OD_ZIGZAG8_DCT_ADST[48][2] = {
-  {4, 0}, {4, 1}, {5, 0}, {5, 1},
-  {6, 0}, {6, 1}, {7, 0}, {7, 1},
-  {0, 4}, {0, 5}, {1, 4}, {0, 6},
-  {1, 5}, {0, 7}, {1, 6}, {1, 7},
-  {2, 4}, {2, 5}, {3, 4}, {4, 2},
-  {2, 6}, {4, 3}, {3, 5}, {4, 4},
-  {2, 7}, {3, 6}, {5, 2}, {4, 5},
-  {5, 3}, {3, 7}, {5, 4}, {4, 6},
-  {6, 2}, {5, 5}, {4, 7}, {6, 3},
-  {6, 4}, {5, 6}, {7, 2}, {6, 5},
-  {7, 3}, {5, 7}, {7, 4}, {6, 6},
-  {7, 5}, {6, 7}, {7, 6}, {7, 7}
-  };
diff --git a/third_party/aom/av1/decoder/accounting.c b/third_party/aom/av1/decoder/accounting.c
index ba243c9e1..8d8f3dfdb 100644
--- a/third_party/aom/av1/decoder/accounting.c
+++ b/third_party/aom/av1/decoder/accounting.c
@@ -15,7 +15,7 @@
 #include <string.h>
 
 #include "aom/aom_integer.h"
-#include "./accounting.h"
+#include "av1/decoder/accounting.h"
 
 static int aom_accounting_hash(const char *str) {
   uint32_t val;
@@ -31,7 +31,7 @@ static int aom_accounting_hash(const char *str) {
 /* Dictionary lookup based on an open-addressing hash table. */
 int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) {
   int hash;
-  int len;
+  size_t len;
   AccountingDictionary *dictionary;
   dictionary = &accounting->syms.dictionary;
   hash = aom_accounting_hash(str);
diff --git a/third_party/aom/av1/decoder/accounting.h b/third_party/aom/av1/decoder/accounting.h
index 889865b2e..9099d081b 100644
--- a/third_party/aom/av1/decoder/accounting.h
+++ b/third_party/aom/av1/decoder/accounting.h
@@ -11,6 +11,7 @@
 #ifndef AOM_ACCOUNTING_H_
 #define AOM_ACCOUNTING_H_
 #include <stdlib.h>
+#include "aom/aomdx.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -58,8 +59,6 @@ typedef struct {
   AccountingDictionary dictionary;
 } AccountingSymbols;
 
-typedef struct Accounting Accounting;
-
 struct Accounting {
   AccountingSymbols syms;
   /** Size allocated for symbols (not all may be used). */
diff --git a/third_party/aom/av1/decoder/decint.h b/third_party/aom/av1/decoder/decint.h
deleted file mode 100644
index e887ad5e0..000000000
--- a/third_party/aom/av1/decoder/decint.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#if !defined(_decint_H)
-# define _decint_H (1)
-# include "av1/common/pvq_state.h"
-# include "aom_dsp/bitreader.h"
-# include "aom_dsp/entdec.h"
-
-typedef struct daala_dec_ctx daala_dec_ctx;
-
-typedef struct daala_dec_ctx od_dec_ctx;
-
-
-struct daala_dec_ctx {
-  /* Stores context-adaptive CDFs for PVQ. */
-  od_state state;
-  /* AOM entropy decoder. */
-  aom_reader *r;
-  int use_activity_masking;
-  /* Mode of quantization matrice : FLAT (0) or HVS (1) */
-  int qm;
-};
-
-#endif
diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c
index 9ec3b60eb..e92c6b28c 100644
--- a/third_party/aom/av1/decoder/decodeframe.c
+++ b/third_party/aom/av1/decoder/decodeframe.c
@@ -10,12 +10,12 @@
  */
 
 #include <assert.h>
-#include <stdlib.h>  // qsort()
+#include <stddef.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "./aom_scale_rtcd.h"
-#include "./av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom/aom_codec.h"
 #include "aom_dsp/aom_dsp_common.h"
@@ -23,19 +23,19 @@
 #include "aom_dsp/bitreader.h"
 #include "aom_dsp/bitreader_buffer.h"
 #include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/mem_ops.h"
 #include "aom_scale/aom_scale.h"
 #include "aom_util/aom_thread.h"
 
-#if CONFIG_BITSTREAM_DEBUG
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 #include "aom_util/debug_util.h"
-#endif  // CONFIG_BITSTREAM_DEBUG
+#endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 
 #include "av1/common/alloccommon.h"
-#if CONFIG_CDEF
 #include "av1/common/cdef.h"
-#endif
+#include "av1/common/cfl.h"
 #if CONFIG_INSPECTION
 #include "av1/decoder/inspection.h"
 #endif
@@ -49,78 +49,69 @@
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/reconintra.h"
-#if CONFIG_FRAME_SUPERRES
 #include "av1/common/resize.h"
-#endif  // CONFIG_FRAME_SUPERRES
 #include "av1/common/seg_common.h"
 #include "av1/common/thread_common.h"
 #include "av1/common/tile_common.h"
-
+#include "av1/common/warped_motion.h"
+#include "av1/common/obmc.h"
 #include "av1/decoder/decodeframe.h"
 #include "av1/decoder/decodemv.h"
 #include "av1/decoder/decoder.h"
-#if CONFIG_LV_MAP
 #include "av1/decoder/decodetxb.h"
-#endif
 #include "av1/decoder/detokenize.h"
-#include "av1/decoder/dsubexp.h"
-#include "av1/decoder/symbolrate.h"
 
-#if CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
-#include "av1/common/warped_motion.h"
-#endif  // CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
-
-#define MAX_AV1_HEADER_SIZE 80
 #define ACCT_STR __func__
 
-#if CONFIG_PVQ
-#include "av1/common/partition.h"
-#include "av1/common/pvq.h"
-#include "av1/common/scan.h"
-#include "av1/decoder/decint.h"
-#include "av1/decoder/pvq_decoder.h"
-#include "av1/encoder/encodemb.h"
-#include "av1/encoder/hybrid_fwd_txfm.h"
-#endif
+// This is needed by ext_tile related unit tests.
+#define EXT_TILE_DEBUG 1
+#define MC_TEMP_BUF_PELS                       \
+  (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \
+   ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2))
 
-#if CONFIG_CFL
-#include "av1/common/cfl.h"
-#endif
+// Checks that the remaining bits start with a 1 and ends with 0s.
+// It consumes an additional byte, if already byte aligned before the check.
+int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
+  AV1_COMMON *const cm = &pbi->common;
+  // bit_offset is set to 0 (mod 8) when the reader is already byte aligned
+  int bits_before_alignment = 8 - rb->bit_offset % 8;
+  int trailing = aom_rb_read_literal(rb, bits_before_alignment);
+  if (trailing != (1 << (bits_before_alignment - 1))) {
+    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    return -1;
+  }
+  return 0;
+}
 
-#if CONFIG_STRIPED_LOOP_RESTORATION && !CONFIG_LOOP_RESTORATION
-#error "striped_loop_restoration requires loop_restoration"
-#endif
+// Use only_chroma = 1 to only set the chroma planes
+static void set_planes_to_neutral_grey(AV1_COMMON *const cm,
+                                       const YV12_BUFFER_CONFIG *const buf,
+                                       int only_chroma) {
+  const int val = 1 << (cm->bit_depth - 1);
+
+  for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+    const int is_uv = plane > 0;
+    for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
+      if (cm->use_highbitdepth) {
+        // TODO(yaowu): replace this with aom_memset16() for speed
+        for (int col_idx = 0; col_idx < buf->crop_widths[is_uv]; col_idx++) {
+          uint16_t *base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
+          base[row_idx * buf->strides[is_uv] + col_idx] = val;
+        }
+      } else {
+        memset(&buf->buffers[plane][row_idx * buf->uv_stride], 1 << 7,
+               buf->crop_widths[is_uv]);
+      }
+    }
+  }
+}
 
-#if CONFIG_LOOP_RESTORATION
 static void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm,
                                             MACROBLOCKD *xd,
                                             aom_reader *const r, int plane,
-                                            int rtile_idx);
-#endif
-
-static struct aom_read_bit_buffer *init_read_bit_buffer(
-    AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
-    const uint8_t *data_end, uint8_t clear_data[MAX_AV1_HEADER_SIZE]);
-static int read_compressed_header(AV1Decoder *pbi, const uint8_t *data,
-                                  size_t partition_size);
-static size_t read_uncompressed_header(AV1Decoder *pbi,
-                                       struct aom_read_bit_buffer *rb);
-
-static int is_compound_reference_allowed(const AV1_COMMON *cm) {
-#if CONFIG_ONE_SIDED_COMPOUND  // Normative in decoder
-  return !frame_is_intra_only(cm);
-#else
-  int i;
-  if (frame_is_intra_only(cm)) return 0;
-  for (i = 1; i < INTER_REFS_PER_FRAME; ++i)
-    if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1;
-
-  return 0;
-#endif  // CONFIG_ONE_SIDED_COMPOUND
-}
+                                            int runit_idx);
 
 static void setup_compound_reference_mode(AV1_COMMON *cm) {
-#if CONFIG_EXT_REFS
   cm->comp_fwd_ref[0] = LAST_FRAME;
   cm->comp_fwd_ref[1] = LAST2_FRAME;
   cm->comp_fwd_ref[2] = LAST3_FRAME;
@@ -129,1952 +120,1099 @@ static void setup_compound_reference_mode(AV1_COMMON *cm) {
   cm->comp_bwd_ref[0] = BWDREF_FRAME;
   cm->comp_bwd_ref[1] = ALTREF2_FRAME;
   cm->comp_bwd_ref[2] = ALTREF_FRAME;
-#else   // !CONFIG_EXT_REFS
-  if (cm->ref_frame_sign_bias[LAST_FRAME] ==
-      cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
-    cm->comp_fixed_ref = ALTREF_FRAME;
-    cm->comp_var_ref[0] = LAST_FRAME;
-    cm->comp_var_ref[1] = GOLDEN_FRAME;
-  } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
-             cm->ref_frame_sign_bias[ALTREF_FRAME]) {
-    cm->comp_fixed_ref = GOLDEN_FRAME;
-    cm->comp_var_ref[0] = LAST_FRAME;
-    cm->comp_var_ref[1] = ALTREF_FRAME;
-  } else {
-    cm->comp_fixed_ref = LAST_FRAME;
-    cm->comp_var_ref[0] = GOLDEN_FRAME;
-    cm->comp_var_ref[1] = ALTREF_FRAME;
-  }
-#endif  // CONFIG_EXT_REFS
 }
 
 static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
   return len != 0 && len <= (size_t)(end - start);
 }
 
-static int decode_unsigned_max(struct aom_read_bit_buffer *rb, int max) {
-  const int data = aom_rb_read_literal(rb, get_unsigned_bits(max));
-  return data > max ? max : data;
-}
-
 static TX_MODE read_tx_mode(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
-#if CONFIG_TX64X64
-  TX_MODE tx_mode;
-#endif
-  if (cm->all_lossless) return ONLY_4X4;
-#if CONFIG_VAR_TX_NO_TX_MODE
-  (void)rb;
-  return TX_MODE_SELECT;
-#else
-#if CONFIG_TX64X64
-  tx_mode = aom_rb_read_bit(rb) ? TX_MODE_SELECT : aom_rb_read_literal(rb, 2);
-  if (tx_mode == ALLOW_32X32) tx_mode += aom_rb_read_bit(rb);
-  return tx_mode;
-#else
-  return aom_rb_read_bit(rb) ? TX_MODE_SELECT : aom_rb_read_literal(rb, 2);
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_VAR_TX_NO_TX_MODE
+  if (cm->coded_lossless) return ONLY_4X4;
+  return aom_rb_read_bit(rb) ? TX_MODE_SELECT : TX_MODE_LARGEST;
 }
 
-#if !CONFIG_RESTRICT_COMPRESSED_HDR
-static void read_inter_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) {
-  int i;
-  for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
-    av1_diff_update_prob(r, &fc->newmv_prob[i], ACCT_STR);
-  for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
-    av1_diff_update_prob(r, &fc->zeromv_prob[i], ACCT_STR);
-  for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
-    av1_diff_update_prob(r, &fc->refmv_prob[i], ACCT_STR);
-  for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    av1_diff_update_prob(r, &fc->drl_prob[i], ACCT_STR);
-}
-#endif
-
 static REFERENCE_MODE read_frame_reference_mode(
     const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
-  if (is_compound_reference_allowed(cm)) {
-#if CONFIG_REF_ADAPT
-    return aom_rb_read_bit(rb) ? REFERENCE_MODE_SELECT : SINGLE_REFERENCE;
-#else
-    return aom_rb_read_bit(rb)
-               ? REFERENCE_MODE_SELECT
-               : (aom_rb_read_bit(rb) ? COMPOUND_REFERENCE : SINGLE_REFERENCE);
-#endif  // CONFIG_REF_ADAPT
-  } else {
+  if (frame_is_intra_only(cm)) {
     return SINGLE_REFERENCE;
+  } else {
+    return aom_rb_read_bit(rb) ? REFERENCE_MODE_SELECT : SINGLE_REFERENCE;
   }
 }
 
-#if !CONFIG_RESTRICT_COMPRESSED_HDR
-static void read_frame_reference_mode_probs(AV1_COMMON *cm, aom_reader *r) {
-  FRAME_CONTEXT *const fc = cm->fc;
-  int i;
-
-  if (cm->reference_mode == REFERENCE_MODE_SELECT)
-    for (i = 0; i < COMP_INTER_CONTEXTS; ++i)
-      av1_diff_update_prob(r, &fc->comp_inter_prob[i], ACCT_STR);
-
-  if (cm->reference_mode != COMPOUND_REFERENCE) {
-    for (i = 0; i < REF_CONTEXTS; ++i) {
-      int j;
-      for (j = 0; j < (SINGLE_REFS - 1); ++j) {
-        av1_diff_update_prob(r, &fc->single_ref_prob[i][j], ACCT_STR);
-      }
-    }
-  }
-
-  if (cm->reference_mode != SINGLE_REFERENCE) {
-#if CONFIG_EXT_COMP_REFS
-    for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i)
-      av1_diff_update_prob(r, &fc->comp_ref_type_prob[i], ACCT_STR);
-
-    for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) {
-      int j;
-      for (j = 0; j < (UNIDIR_COMP_REFS - 1); ++j)
-        av1_diff_update_prob(r, &fc->uni_comp_ref_prob[i][j], ACCT_STR);
-    }
-#endif  // CONFIG_EXT_COMP_REFS
-
-    for (i = 0; i < REF_CONTEXTS; ++i) {
-      int j;
-#if CONFIG_EXT_REFS
-      for (j = 0; j < (FWD_REFS - 1); ++j)
-        av1_diff_update_prob(r, &fc->comp_ref_prob[i][j], ACCT_STR);
-      for (j = 0; j < (BWD_REFS - 1); ++j)
-        av1_diff_update_prob(r, &fc->comp_bwdref_prob[i][j], ACCT_STR);
-#else
-      for (j = 0; j < (COMP_REFS - 1); ++j)
-        av1_diff_update_prob(r, &fc->comp_ref_prob[i][j], ACCT_STR);
-#endif  // CONFIG_EXT_REFS
-    }
-  }
-}
-
-static void update_mv_probs(aom_prob *p, int n, aom_reader *r) {
-  int i;
-  for (i = 0; i < n; ++i) av1_diff_update_prob(r, &p[i], ACCT_STR);
-}
-
-static void read_mv_probs(nmv_context *ctx, int allow_hp, aom_reader *r) {
-  int i;
-  if (allow_hp) {
-    for (i = 0; i < 2; ++i) {
-      nmv_component *const comp_ctx = &ctx->comps[i];
-      update_mv_probs(&comp_ctx->class0_hp, 1, r);
-      update_mv_probs(&comp_ctx->hp, 1, r);
-    }
-  }
-}
-#endif
-
 static void inverse_transform_block(MACROBLOCKD *xd, int plane,
-#if CONFIG_LGT_FROM_PRED
-                                    PREDICTION_MODE mode,
-#endif
                                     const TX_TYPE tx_type,
                                     const TX_SIZE tx_size, uint8_t *dst,
-                                    int stride, int16_t scan_line, int eob) {
+                                    int stride, int reduced_tx_set) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = pd->dqcoeff;
-  av1_inverse_transform_block(xd, dqcoeff,
-#if CONFIG_LGT_FROM_PRED
-                              mode,
-#endif
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                              xd->mrc_mask,
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                              tx_type, tx_size, dst, stride, eob);
+  eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+  uint16_t scan_line = eob_data->max_scan_line;
+  uint16_t eob = eob_data->eob;
+
+  memcpy(dqcoeff, pd->dqcoeff_block + xd->cb_offset[plane],
+         (scan_line + 1) * sizeof(dqcoeff[0]));
+  av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, stride,
+                              eob, reduced_tx_set);
   memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
 }
 
-static int get_block_idx(const MACROBLOCKD *xd, int plane, int row, int col) {
-  const int bsize = xd->mi[0]->mbmi.sb_type;
-  const struct macroblockd_plane *pd = &xd->plane[plane];
-#if CONFIG_CHROMA_SUB8X8
-  const BLOCK_SIZE plane_bsize =
-      AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#elif CONFIG_CB4X4
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#else
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
-#endif
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-  const uint8_t txh_unit = tx_size_high_unit[tx_size];
-  return row * max_blocks_wide + col * txh_unit;
-}
-
-#if CONFIG_PVQ
-static int av1_pvq_decode_helper(MACROBLOCKD *xd, tran_low_t *ref_coeff,
-                                 tran_low_t *dqcoeff, int16_t *quant, int pli,
-                                 int bs, TX_TYPE tx_type, int xdec,
-                                 PVQ_SKIP_TYPE ac_dc_coded) {
-  unsigned int flags;  // used for daala's stream analyzer.
-  int off;
-  const int is_keyframe = 0;
-  const int has_dc_skip = 1;
-  int coeff_shift = 3 - av1_get_tx_scale(bs);
-  int hbd_downshift = 0;
-  int rounding_mask;
-  // DC quantizer for PVQ
-  int pvq_dc_quant;
-  int lossless = (quant[0] == 0);
-  const int blk_size = tx_size_wide[bs];
-  int eob = 0;
-  int i;
-  od_dec_ctx *dec = &xd->daala_dec;
-  int use_activity_masking = dec->use_activity_masking;
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
-
-  od_coeff ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX];
-  od_coeff out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX];
-
-  hbd_downshift = xd->bd - 8;
-
-  od_raster_to_coding_order(ref_coeff_pvq, blk_size, tx_type, ref_coeff,
-                            blk_size);
-
-  assert(OD_COEFF_SHIFT >= 4);
-  if (lossless)
-    pvq_dc_quant = 1;
-  else {
-    if (use_activity_masking)
-      pvq_dc_quant =
-          OD_MAXI(1,
-                  (quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift) *
-                          dec->state.pvq_qm_q4[pli][od_qm_get_index(bs, 0)] >>
-                      4);
-    else
-      pvq_dc_quant =
-          OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift);
-  }
-
-  off = od_qm_offset(bs, xdec);
-
-  // copy int16 inputs to int32
-  for (i = 0; i < blk_size * blk_size; i++) {
-    ref_int32[i] =
-        AOM_SIGNED_SHL(ref_coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >>
-        hbd_downshift;
-  }
-
-  od_pvq_decode(dec, ref_int32, out_int32,
-                OD_MAXI(1, quant[1] << (OD_COEFF_SHIFT - 3) >> hbd_downshift),
-                pli, bs, OD_PVQ_BETA[use_activity_masking][pli][bs],
-                is_keyframe, &flags, ac_dc_coded, dec->state.qm + off,
-                dec->state.qm_inv + off);
-
-  if (!has_dc_skip || out_int32[0]) {
-    out_int32[0] =
-        has_dc_skip + generic_decode(dec->r, &dec->state.adapt->model_dc[pli],
-                                     &dec->state.adapt->ex_dc[pli][bs][0], 2,
-                                     "dc:mag");
-    if (out_int32[0]) out_int32[0] *= aom_read_bit(dec->r, "dc:sign") ? -1 : 1;
-  }
-  out_int32[0] = out_int32[0] * pvq_dc_quant + ref_int32[0];
-
-  // copy int32 result back to int16
-  assert(OD_COEFF_SHIFT > coeff_shift);
-  rounding_mask = (1 << (OD_COEFF_SHIFT - coeff_shift - 1)) - 1;
-  for (i = 0; i < blk_size * blk_size; i++) {
-    out_int32[i] = AOM_SIGNED_SHL(out_int32[i], hbd_downshift);
-    dqcoeff_pvq[i] = (out_int32[i] + (out_int32[i] < 0) + rounding_mask) >>
-                     (OD_COEFF_SHIFT - coeff_shift);
-  }
-
-  od_coding_order_to_raster(dqcoeff, blk_size, tx_type, dqcoeff_pvq, blk_size);
-
-  eob = blk_size * blk_size;
-
-  return eob;
-}
-
-static PVQ_SKIP_TYPE read_pvq_skip(AV1_COMMON *cm, MACROBLOCKD *const xd,
-                                   int plane, TX_SIZE tx_size) {
-  // decode ac/dc coded flag. bit0: DC coded, bit1 : AC coded
-  // NOTE : we don't use 5 symbols for luma here in aom codebase,
-  // since block partition is taken care of by aom.
-  // So, only AC/DC skip info is coded
-  const int ac_dc_coded = aom_read_symbol(
-      xd->daala_dec.r,
-      xd->daala_dec.state.adapt->skip_cdf[2 * tx_size + (plane != 0)], 4,
-      "skip");
-  if (ac_dc_coded < 0 || ac_dc_coded > 3) {
-    aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
-                       "Invalid PVQ Skip Type");
-  }
-  return ac_dc_coded;
-}
-
-static int av1_pvq_decode_helper2(AV1_COMMON *cm, MACROBLOCKD *const xd,
-                                  MB_MODE_INFO *const mbmi, int plane, int row,
-                                  int col, TX_SIZE tx_size, TX_TYPE tx_type) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  // transform block size in pixels
-  int tx_blk_size = tx_size_wide[tx_size];
-  int i, j;
-  tran_low_t *pvq_ref_coeff = pd->pvq_ref_coeff;
-  const int diff_stride = tx_blk_size;
-  int16_t *pred = pd->pred;
-  tran_low_t *const dqcoeff = pd->dqcoeff;
-  uint8_t *dst;
-  int eob;
-  const PVQ_SKIP_TYPE ac_dc_coded = read_pvq_skip(cm, xd, plane, tx_size);
-
-  eob = 0;
-  dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
-
-  if (ac_dc_coded) {
-    int xdec = pd->subsampling_x;
-    int seg_id = mbmi->segment_id;
-    int16_t *quant;
-    TxfmParam txfm_param;
-    // ToDo(yaowu): correct this with optimal number from decoding process.
-    const int max_scan_line = tx_size_2d[tx_size];
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      for (j = 0; j < tx_blk_size; j++)
-        for (i = 0; i < tx_blk_size; i++)
-          pred[diff_stride * j + i] =
-              CONVERT_TO_SHORTPTR(dst)[pd->dst.stride * j + i];
-    } else {
+static void read_coeffs_tx_intra_block(AV1_COMMON *cm, MACROBLOCKD *const xd,
+                                       aom_reader *const r, int plane, int row,
+                                       int col, TX_SIZE tx_size) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  if (!mbmi->skip) {
+#if TXCOEFF_TIMER
+    struct aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
 #endif
-      for (j = 0; j < tx_blk_size; j++)
-        for (i = 0; i < tx_blk_size; i++)
-          pred[diff_stride * j + i] = dst[pd->dst.stride * j + i];
-#if CONFIG_HIGHBITDEPTH
-    }
+    av1_read_coeffs_txb_facade(cm, xd, r, row, col, plane, tx_size);
+#if TXCOEFF_TIMER
+    aom_usec_timer_mark(&timer);
+    const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+    cm->txcoeff_timer += elapsed_time;
+    ++cm->txb_count;
 #endif
-
-    txfm_param.tx_type = tx_type;
-    txfm_param.tx_size = tx_size;
-    txfm_param.lossless = xd->lossless[seg_id];
-
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      txfm_param.bd = xd->bd;
-      av1_highbd_fwd_txfm(pred, pvq_ref_coeff, diff_stride, &txfm_param);
-    } else {
-#endif  // CONFIG_HIGHBITDEPTH
-      av1_fwd_txfm(pred, pvq_ref_coeff, diff_stride, &txfm_param);
-#if CONFIG_HIGHBITDEPTH
-    }
-#endif  // CONFIG_HIGHBITDEPTH
-
-    quant = &pd->seg_dequant[seg_id][0];  // aom's quantizer
-
-    eob = av1_pvq_decode_helper(xd, pvq_ref_coeff, dqcoeff, quant, plane,
-                                tx_size, tx_type, xdec, ac_dc_coded);
-
-    inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
-                            max_scan_line, eob);
   }
-
-  return eob;
 }
-#endif
 
-static void predict_and_reconstruct_intra_block(
-    AV1_COMMON *cm, MACROBLOCKD *const xd, aom_reader *const r,
-    MB_MODE_INFO *const mbmi, int plane, int row, int col, TX_SIZE tx_size) {
-  PLANE_TYPE plane_type = get_plane_type(plane);
-  const int block_idx = get_block_idx(xd, plane, row, col);
-#if CONFIG_PVQ
+static void predict_and_reconstruct_intra_block(AV1_COMMON *cm,
+                                                MACROBLOCKD *const xd,
+                                                aom_reader *const r, int plane,
+                                                int row, int col,
+                                                TX_SIZE tx_size) {
   (void)r;
-#endif
-  av1_predict_intra_block_facade(cm, xd, plane, block_idx, col, row, tx_size);
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  PLANE_TYPE plane_type = get_plane_type(plane);
+
+  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
 
   if (!mbmi->skip) {
-#if !CONFIG_PVQ
     struct macroblockd_plane *const pd = &xd->plane[plane];
-#if CONFIG_LV_MAP
-    int16_t max_scan_line = 0;
-    int eob;
-    av1_read_coeffs_txb_facade(cm, xd, r, row, col, block_idx, plane,
-                               pd->dqcoeff, tx_size, &max_scan_line, &eob);
+
     // tx_type will be read out in av1_read_coeffs_txb_facade
-    const TX_TYPE tx_type =
-        av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size);
-#else   // CONFIG_LV_MAP
-    const TX_TYPE tx_type =
-        av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size);
-    const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-    int16_t max_scan_line = 0;
-    const int eob =
-        av1_decode_block_tokens(cm, xd, plane, scan_order, col, row, tx_size,
-                                tx_type, &max_scan_line, r, mbmi->segment_id);
-#endif  // CONFIG_LV_MAP
-    if (eob) {
+    const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, row, col, tx_size,
+                                            cm->reduced_tx_set_used);
+    eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+    if (eob_data->eob) {
       uint8_t *dst =
           &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
-      inverse_transform_block(xd, plane,
-#if CONFIG_LGT_FROM_PRED
-                              mbmi->mode,
-#endif
-                              tx_type, tx_size, dst, pd->dst.stride,
-                              max_scan_line, eob);
+      inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+                              cm->reduced_tx_set_used);
     }
-#else   // !CONFIG_PVQ
-    const TX_TYPE tx_type =
-        av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size);
-    av1_pvq_decode_helper2(cm, xd, mbmi, plane, row, col, tx_size, tx_type);
-#endif  // !CONFIG_PVQ
-  }
-#if CONFIG_CFL
-  if (plane == AOM_PLANE_Y && xd->cfl->store_y) {
+  }
+  if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) {
     cfl_store_tx(xd, row, col, tx_size, mbmi->sb_type);
   }
-#endif  // CONFIG_CFL
 }
 
-#if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE
+static void inverse_transform_inter_block(const AV1_COMMON *const cm,
+                                          MACROBLOCKD *const xd,
+                                          aom_reader *const r,
+                                          const int blk_row, const int blk_col,
+                                          const int plane,
+                                          const TX_SIZE tx_size) {
+  (void)r;
+  PLANE_TYPE plane_type = get_plane_type(plane);
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  MB_MODE_INFO *mbmi = xd->mi[0];
+
+  // tx_type will be read out in av1_read_coeffs_txb_facade
+  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+                                          tx_size, cm->reduced_tx_set_used);
+
+  if (plane == 0)
+    update_txk_array(mbmi->txk_type, mbmi->sb_type, blk_row, blk_col, tx_size,
+                     tx_type);
+
+  uint8_t *dst =
+      &pd->dst
+           .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+  inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+                          cm->reduced_tx_set_used);
+}
+
+static void set_cb_buffer_offsets(MACROBLOCKD *const xd, TX_SIZE tx_size,
+                                  int plane) {
+  xd->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size];
+  xd->txb_offset[plane] =
+      xd->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+}
+
 static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd,
                                   aom_reader *r, MB_MODE_INFO *const mbmi,
                                   int plane, BLOCK_SIZE plane_bsize,
                                   int blk_row, int blk_col, int block,
                                   TX_SIZE tx_size, int *eob_total) {
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  const int tx_row = blk_row >> (1 - pd->subsampling_y);
-  const int tx_col = blk_col >> (1 - pd->subsampling_x);
   const TX_SIZE plane_tx_size =
-      plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
-            : mbmi->inter_tx_size[tx_row][tx_col];
+      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
   // Scale to match transform block unit.
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
-  if (tx_size == plane_tx_size) {
-    PLANE_TYPE plane_type = get_plane_type(plane);
-#if CONFIG_LV_MAP
-    int16_t max_scan_line = 0;
-    int eob;
-    av1_read_coeffs_txb_facade(cm, xd, r, blk_row, blk_col, block, plane,
-                               pd->dqcoeff, tx_size, &max_scan_line, &eob);
-    // tx_type will be read out in av1_read_coeffs_txb_facade
-    const TX_TYPE tx_type =
-        av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, plane_tx_size);
-#else   // CONFIG_LV_MAP
-    const TX_TYPE tx_type =
-        av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, plane_tx_size);
-    const SCAN_ORDER *sc = get_scan(cm, plane_tx_size, tx_type, mbmi);
-    int16_t max_scan_line = 0;
-    const int eob = av1_decode_block_tokens(
-        cm, xd, plane, sc, blk_col, blk_row, plane_tx_size, tx_type,
-        &max_scan_line, r, mbmi->segment_id);
-#endif  // CONFIG_LV_MAP
-    inverse_transform_block(xd, plane,
-#if CONFIG_LGT_FROM_PRED
-                            mbmi->mode,
-#endif
-                            tx_type, plane_tx_size,
-                            &pd->dst.buf[(blk_row * pd->dst.stride + blk_col)
-                                         << tx_size_wide_log2[0]],
-                            pd->dst.stride, max_scan_line, eob);
-    *eob_total += eob;
+  if (tx_size == plane_tx_size || plane) {
+#if TXCOEFF_TIMER
+    struct aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+#endif
+    av1_read_coeffs_txb_facade(cm, xd, r, blk_row, blk_col, plane, tx_size);
+#if TXCOEFF_TIMER
+    aom_usec_timer_mark(&timer);
+    const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+    cm->txcoeff_timer += elapsed_time;
+    ++cm->txb_count;
+#endif
+    inverse_transform_inter_block(cm, xd, r, blk_row, blk_col, plane, tx_size);
+
+#if CONFIG_MISMATCH_DEBUG
+    int pixel_c, pixel_r;
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int blk_w = block_size_wide[bsize];
+    int blk_h = block_size_high[bsize];
+    mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
+                    pd->subsampling_x, pd->subsampling_y);
+    mismatch_check_block_tx(dst, pd->dst.stride, cm->frame_offset, plane,
+                            pixel_c, pixel_r, blk_w, blk_h,
+                            xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+#endif
+    eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+    *eob_total += eob_data->eob;
+    set_cb_buffer_offsets(xd, tx_size, plane);
   } else {
-#if CONFIG_RECT_TX_EXT
-    int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize];
-    const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size];
-    if (is_qttx) assert(blk_row == 0 && blk_col == 0 && block == 0);
-#else
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
     assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
     assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
-#endif
-    const int bsl = tx_size_wide_unit[sub_txs];
-    int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
-    int i;
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int sub_step = bsw * bsh;
 
-    assert(bsl > 0);
-
-    for (i = 0; i < 4; ++i) {
-#if CONFIG_RECT_TX_EXT
-      int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs];
-      const int offsetr =
-          is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0)
-                  : blk_row + ((i >> 1) * bsl);
-      const int offsetc =
-          is_qttx ? (is_wide_tx ? 0 : i * tx_size_wide_unit[sub_txs])
-                  : blk_col + (i & 0x01) * bsl;
-#else
-      const int offsetr = blk_row + (i >> 1) * bsl;
-      const int offsetc = blk_col + (i & 0x01) * bsl;
-#endif
+    assert(bsw > 0 && bsh > 0);
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
 
-      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
-      decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, offsetr,
-                            offsetc, block, sub_txs, eob_total);
-      block += sub_step;
+        decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, offsetr,
+                              offsetc, block, sub_txs, eob_total);
+        block += sub_step;
+      }
     }
   }
 }
-#endif  // CONFIG_VAR_TX
-
-#if !CONFIG_VAR_TX || CONFIG_SUPERTX || CONFIG_COEF_INTERLEAVE || \
-    (!CONFIG_VAR_TX && CONFIG_EXT_TX && CONFIG_RECT_TX)
-static int reconstruct_inter_block(AV1_COMMON *cm, MACROBLOCKD *const xd,
-                                   aom_reader *const r, int segment_id,
-                                   int plane, int row, int col,
-                                   TX_SIZE tx_size) {
-  PLANE_TYPE plane_type = get_plane_type(plane);
-  int block_idx = get_block_idx(xd, plane, row, col);
-#if CONFIG_PVQ
-  int eob;
-  (void)r;
-  (void)segment_id;
-#else
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-#endif
-
-#if !CONFIG_PVQ
-#if CONFIG_LV_MAP
-  (void)segment_id;
-  int16_t max_scan_line = 0;
-  int eob;
-  av1_read_coeffs_txb_facade(cm, xd, r, row, col, block_idx, plane, pd->dqcoeff,
-                             tx_size, &max_scan_line, &eob);
-  // tx_type will be read out in av1_read_coeffs_txb_facade
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size);
-#else   // CONFIG_LV_MAP
-  int16_t max_scan_line = 0;
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size);
-  const SCAN_ORDER *scan_order =
-      get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
-  const int eob =
-      av1_decode_block_tokens(cm, xd, plane, scan_order, col, row, tx_size,
-                              tx_type, &max_scan_line, r, segment_id);
-#endif  // CONFIG_LV_MAP
-  uint8_t *dst =
-      &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
-  if (eob)
-    inverse_transform_block(xd, plane,
-#if CONFIG_LGT_FROM_PRED
-                            xd->mi[0]->mbmi.mode,
-#endif
-                            tx_type, tx_size, dst, pd->dst.stride,
-                            max_scan_line, eob);
-#else
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, row, col, block_idx, tx_size);
-  eob = av1_pvq_decode_helper2(cm, xd, &xd->mi[0]->mbmi, plane, row, col,
-                               tx_size, tx_type);
-#endif
-  return eob;
-}
-#endif  // !CONFIG_VAR_TX || CONFIG_SUPER_TX
 
 static void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
                         BLOCK_SIZE bsize, int mi_row, int mi_col, int bw,
                         int bh, int x_mis, int y_mis) {
+  const int num_planes = av1_num_planes(cm);
+
   const int offset = mi_row * cm->mi_stride + mi_col;
-  int x, y;
   const TileInfo *const tile = &xd->tile;
 
   xd->mi = cm->mi_grid_visible + offset;
   xd->mi[0] = &cm->mi[offset];
   // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
   // passing bsize from decode_partition().
-  xd->mi[0]->mbmi.sb_type = bsize;
+  xd->mi[0]->sb_type = bsize;
 #if CONFIG_RD_DEBUG
-  xd->mi[0]->mbmi.mi_row = mi_row;
-  xd->mi[0]->mbmi.mi_col = mi_col;
+  xd->mi[0]->mi_row = mi_row;
+  xd->mi[0]->mi_col = mi_col;
 #endif
-#if CONFIG_CFL
-  xd->cfl->mi_row = mi_row;
-  xd->cfl->mi_col = mi_col;
-#endif
-  for (y = 0; y < y_mis; ++y)
-    for (x = !y; x < x_mis; ++x) xd->mi[y * cm->mi_stride + x] = xd->mi[0];
+  xd->cfl.mi_row = mi_row;
+  xd->cfl.mi_col = mi_col;
 
-  set_plane_n4(xd, bw, bh);
-  set_skip_context(xd, mi_row, mi_col);
+  assert(x_mis && y_mis);
+  for (int x = 1; x < x_mis; ++x) xd->mi[x] = xd->mi[0];
+  int idx = cm->mi_stride;
+  for (int y = 1; y < y_mis; ++y) {
+    memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0]));
+    idx += cm->mi_stride;
+  }
 
-#if CONFIG_VAR_TX
-  xd->max_tx_size = max_txsize_lookup[bsize];
-#endif
+  set_plane_n4(xd, bw, bh, num_planes);
+  set_skip_context(xd, mi_row, mi_col, num_planes);
 
   // Distance of Mb to the various image edges. These are specified to 8th pel
   // as they are always compared to values that are in 1/8th pel units
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
-#if CONFIG_DEPENDENT_HORZTILES
-                 cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                 cm->mi_rows, cm->mi_cols);
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
 
   av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
-                       mi_col);
+                       mi_col, 0, num_planes);
 }
 
-#if CONFIG_SUPERTX
-static MB_MODE_INFO *set_offsets_extend(AV1_COMMON *const cm,
-                                        MACROBLOCKD *const xd,
-                                        const TileInfo *const tile,
-                                        BLOCK_SIZE bsize_pred, int mi_row_pred,
-                                        int mi_col_pred, int mi_row_ori,
-                                        int mi_col_ori) {
-  // Used in supertx
-  // (mi_row_ori, mi_col_ori): location for mv
-  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
-  const int bw = mi_size_wide[bsize_pred];
-  const int bh = mi_size_high[bsize_pred];
-  const int offset = mi_row_ori * cm->mi_stride + mi_col_ori;
-  xd->mi = cm->mi_grid_visible + offset;
-  xd->mi[0] = cm->mi + offset;
-  set_mi_row_col(xd, tile, mi_row_pred, bh, mi_col_pred, bw,
-#if CONFIG_DEPENDENT_HORZTILES
-                 cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                 cm->mi_rows, cm->mi_cols);
-
-  xd->up_available = (mi_row_ori > tile->mi_row_start);
-  xd->left_available = (mi_col_ori > tile->mi_col_start);
+static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+                              int mi_row, int mi_col, aom_reader *r,
+                              PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
 
-  set_plane_n4(xd, bw, bh);
+#if CONFIG_ACCOUNTING
+  aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
+#endif
+  set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
+  xd->mi[0]->partition = partition;
+  av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+  if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
+    const BLOCK_SIZE uv_subsize =
+        ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y];
+    if (uv_subsize == BLOCK_INVALID)
+      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid block size.");
+  }
 
-  return &xd->mi[0]->mbmi;
+  int reader_corrupted_flag = aom_reader_has_error(r);
+  aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
 }
 
-#if CONFIG_SUPERTX
-static MB_MODE_INFO *set_mb_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                    int bw, int bh, int x_mis, int y_mis) {
-  const int offset = mi_row * cm->mi_stride + mi_col;
-  const TileInfo *const tile = &xd->tile;
-  int x, y;
+typedef struct PadBlock {
+  int x0;
+  int x1;
+  int y0;
+  int y1;
+} PadBlock;
 
-  xd->mi = cm->mi_grid_visible + offset;
-  xd->mi[0] = cm->mi + offset;
-  xd->mi[0]->mbmi.sb_type = bsize;
-  for (y = 0; y < y_mis; ++y)
-    for (x = !y; x < x_mis; ++x) xd->mi[y * cm->mi_stride + x] = xd->mi[0];
-
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
-#if CONFIG_DEPENDENT_HORZTILES
-                 cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                 cm->mi_rows, cm->mi_cols);
-  return &xd->mi[0]->mbmi;
-}
-#endif
+static void highbd_build_mc_border(const uint8_t *src8, int src_stride,
+                                   uint8_t *dst8, int dst_stride, int x, int y,
+                                   int b_w, int b_h, int w, int h) {
+  // Get a pointer to the start of the real data for this row.
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  const uint16_t *ref_row = src - x - y * src_stride;
 
-static void set_offsets_topblock(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                 const TileInfo *const tile, BLOCK_SIZE bsize,
-                                 int mi_row, int mi_col) {
-  const int bw = mi_size_wide[bsize];
-  const int bh = mi_size_high[bsize];
-  const int offset = mi_row * cm->mi_stride + mi_col;
+  if (y >= h)
+    ref_row += (h - 1) * src_stride;
+  else if (y > 0)
+    ref_row += y * src_stride;
 
-  xd->mi = cm->mi_grid_visible + offset;
-  xd->mi[0] = cm->mi + offset;
+  do {
+    int right = 0, copy;
+    int left = x < 0 ? -x : 0;
 
-  set_plane_n4(xd, bw, bh);
+    if (left > b_w) left = b_w;
 
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
-#if CONFIG_DEPENDENT_HORZTILES
-                 cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                 cm->mi_rows, cm->mi_cols);
+    if (x + b_w > w) right = x + b_w - w;
 
-  av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
-                       mi_col);
-}
+    if (right > b_w) right = b_w;
 
-static void set_param_topblock(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                               BLOCK_SIZE bsize, int mi_row, int mi_col,
-                               int txfm, int skip) {
-  const int bw = mi_size_wide[bsize];
-  const int bh = mi_size_high[bsize];
-  const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
-  const int offset = mi_row * cm->mi_stride + mi_col;
-  int x, y;
+    copy = b_w - left - right;
 
-  xd->mi = cm->mi_grid_visible + offset;
-  xd->mi[0] = cm->mi + offset;
+    if (left) aom_memset16(dst, ref_row[0], left);
 
-  for (y = 0; y < y_mis; ++y)
-    for (x = 0; x < x_mis; ++x) {
-      xd->mi[y * cm->mi_stride + x]->mbmi.skip = skip;
-      xd->mi[y * cm->mi_stride + x]->mbmi.tx_type = txfm;
-    }
-#if CONFIG_VAR_TX
-  xd->above_txfm_context = cm->above_txfm_context + mi_col;
-  xd->left_txfm_context =
-      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-  set_txfm_ctxs(xd->mi[0]->mbmi.tx_size, bw, bh, skip, xd);
-#endif
-}
+    if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
 
-static void set_ref(AV1_COMMON *const cm, MACROBLOCKD *const xd, int idx,
-                    int mi_row, int mi_col) {
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_COMPOUND_SINGLEREF
-  RefBuffer *ref_buffer =
-      has_second_ref(mbmi) ? &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME]
-                           : &cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME];
-#else
-  RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  xd->block_refs[idx] = ref_buffer;
-  if (!av1_is_valid_scale(&ref_buffer->sf))
-    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                       "Invalid scale factors");
-  av1_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
-                       &ref_buffer->sf);
-  aom_merge_corrupted_flag(&xd->corrupted, ref_buffer->buf->corrupted);
+    if (right) aom_memset16(dst + left + copy, ref_row[w - 1], right);
+
+    dst += dst_stride;
+    ++y;
+
+    if (y > 0 && y < h) ref_row += src_stride;
+  } while (--b_h);
 }
 
-static void dec_predict_b_extend(
-    AV1Decoder *const pbi, MACROBLOCKD *const xd, const TileInfo *const tile,
-    int block, int mi_row_ori, int mi_col_ori, int mi_row_pred, int mi_col_pred,
-    int mi_row_top, int mi_col_top, int plane, uint8_t *dst_buf, int dst_stride,
-    BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred, int b_sub8x8, int bextend) {
-  // Used in supertx
-  // (mi_row_ori, mi_col_ori): location for mv
-  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
-  // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
-  // block: sub location of sub8x8 blocks
-  // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
-  // bextend: 1: region to predict is an extension of ori; 0: not
-  int r = (mi_row_pred - mi_row_top) * MI_SIZE;
-  int c = (mi_col_pred - mi_col_top) * MI_SIZE;
-  const int mi_width_top = mi_size_wide[bsize_top];
-  const int mi_height_top = mi_size_high[bsize_top];
-  MB_MODE_INFO *mbmi;
-  AV1_COMMON *const cm = &pbi->common;
+static void build_mc_border(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int x, int y, int b_w, int b_h,
+                            int w, int h) {
+  // Get a pointer to the start of the real data for this row.
+  const uint8_t *ref_row = src - x - y * src_stride;
 
-  if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
-      mi_row_pred >= mi_row_top + mi_height_top ||
-      mi_col_pred >= mi_col_top + mi_width_top || mi_row_pred >= cm->mi_rows ||
-      mi_col_pred >= cm->mi_cols)
-    return;
+  if (y >= h)
+    ref_row += (h - 1) * src_stride;
+  else if (y > 0)
+    ref_row += y * src_stride;
 
-  mbmi = set_offsets_extend(cm, xd, tile, bsize_pred, mi_row_pred, mi_col_pred,
-                            mi_row_ori, mi_col_ori);
-  set_ref(cm, xd, 0, mi_row_pred, mi_col_pred);
-  if (has_second_ref(&xd->mi[0]->mbmi)
-#if CONFIG_COMPOUND_SINGLEREF
-      || is_inter_singleref_comp_mode(xd->mi[0]->mbmi.mode)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-          )
-    set_ref(cm, xd, 1, mi_row_pred, mi_col_pred);
-  if (!bextend) mbmi->tx_size = max_txsize_lookup[bsize_top];
-
-  xd->plane[plane].dst.stride = dst_stride;
-  xd->plane[plane].dst.buf =
-      dst_buf + (r >> xd->plane[plane].subsampling_y) * dst_stride +
-      (c >> xd->plane[plane].subsampling_x);
-
-  if (!b_sub8x8)
-    av1_build_inter_predictor_sb_extend(&pbi->common, xd, mi_row_ori,
-                                        mi_col_ori, mi_row_pred, mi_col_pred,
-                                        plane, bsize_pred);
-  else
-    av1_build_inter_predictor_sb_sub8x8_extend(
-        &pbi->common, xd, mi_row_ori, mi_col_ori, mi_row_pred, mi_col_pred,
-        plane, bsize_pred, block);
-}
+  do {
+    int right = 0, copy;
+    int left = x < 0 ? -x : 0;
 
-static void dec_extend_dir(AV1Decoder *const pbi, MACROBLOCKD *const xd,
-                           const TileInfo *const tile, int block,
-                           BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
-                           int mi_row_ori, int mi_col_ori, int mi_row,
-                           int mi_col, int mi_row_top, int mi_col_top,
-                           int plane, uint8_t *dst_buf, int dst_stride,
-                           int dir) {
-  // dir: 0-lower, 1-upper, 2-left, 3-right
-  //      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-  int xss = xd->plane[1].subsampling_x;
-  int yss = xd->plane[1].subsampling_y;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-  int b_sub8x8 = (bsize < BLOCK_8X8) && !unify_bsize ? 1 : 0;
-  BLOCK_SIZE extend_bsize;
-  int mi_row_pred, mi_col_pred;
-
-  int wide_unit, high_unit;
-  int i, j;
-  int ext_offset = 0;
-
-  if (dir == 0 || dir == 1) {
-    extend_bsize =
-        (mi_width == mi_size_wide[BLOCK_8X8] || bsize < BLOCK_8X8 || xss < yss)
-            ? BLOCK_8X8
-            : BLOCK_16X8;
-#if CONFIG_CB4X4
-    if (bsize < BLOCK_8X8) {
-      extend_bsize = BLOCK_4X4;
-      ext_offset = mi_size_wide[BLOCK_8X8];
-    }
-#endif
+    if (left > b_w) left = b_w;
 
-    wide_unit = mi_size_wide[extend_bsize];
-    high_unit = mi_size_high[extend_bsize];
-
-    mi_row_pred = mi_row + ((dir == 0) ? mi_height : -(mi_height + ext_offset));
-    mi_col_pred = mi_col;
-
-    for (j = 0; j < mi_height + ext_offset; j += high_unit)
-      for (i = 0; i < mi_width + ext_offset; i += wide_unit)
-        dec_predict_b_extend(pbi, xd, tile, block, mi_row_ori, mi_col_ori,
-                             mi_row_pred + j, mi_col_pred + i, mi_row_top,
-                             mi_col_top, plane, dst_buf, dst_stride, top_bsize,
-                             extend_bsize, b_sub8x8, 1);
-  } else if (dir == 2 || dir == 3) {
-    extend_bsize =
-        (mi_height == mi_size_high[BLOCK_8X8] || bsize < BLOCK_8X8 || yss < xss)
-            ? BLOCK_8X8
-            : BLOCK_8X16;
-#if CONFIG_CB4X4
-    if (bsize < BLOCK_8X8) {
-      extend_bsize = BLOCK_4X4;
-      ext_offset = mi_size_wide[BLOCK_8X8];
-    }
-#endif
+    if (x + b_w > w) right = x + b_w - w;
 
-    wide_unit = mi_size_wide[extend_bsize];
-    high_unit = mi_size_high[extend_bsize];
+    if (right > b_w) right = b_w;
 
-    mi_row_pred = mi_row;
-    mi_col_pred = mi_col + ((dir == 3) ? mi_width : -(mi_width + ext_offset));
+    copy = b_w - left - right;
 
-    for (j = 0; j < mi_height + ext_offset; j += high_unit)
-      for (i = 0; i < mi_width + ext_offset; i += wide_unit)
-        dec_predict_b_extend(pbi, xd, tile, block, mi_row_ori, mi_col_ori,
-                             mi_row_pred + j, mi_col_pred + i, mi_row_top,
-                             mi_col_top, plane, dst_buf, dst_stride, top_bsize,
-                             extend_bsize, b_sub8x8, 1);
-  } else {
-    extend_bsize = BLOCK_8X8;
-#if CONFIG_CB4X4
-    if (bsize < BLOCK_8X8) {
-      extend_bsize = BLOCK_4X4;
-      ext_offset = mi_size_wide[BLOCK_8X8];
+    if (left) memset(dst, ref_row[0], left);
+
+    if (copy) memcpy(dst + left, ref_row + x + left, copy);
+
+    if (right) memset(dst + left + copy, ref_row[w - 1], right);
+
+    dst += dst_stride;
+    ++y;
+
+    if (y > 0 && y < h) ref_row += src_stride;
+  } while (--b_h);
+}
+
+static INLINE int update_extend_mc_border_params(
+    const struct scale_factors *const sf, struct buf_2d *const pre_buf,
+    MV32 scaled_mv, PadBlock *block, int subpel_x_mv, int subpel_y_mv,
+    int do_warp, int is_intrabc, int *x_pad, int *y_pad) {
+  const int is_scaled = av1_is_scaled(sf);
+  // Get reference width and height.
+  int frame_width = pre_buf->width;
+  int frame_height = pre_buf->height;
+
+  // Do border extension if there is motion or
+  // width/height is not a multiple of 8 pixels.
+  if ((!is_intrabc) && (!do_warp) &&
+      (is_scaled || scaled_mv.col || scaled_mv.row || (frame_width & 0x7) ||
+       (frame_height & 0x7))) {
+    if (subpel_x_mv || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+      block->x0 -= AOM_INTERP_EXTEND - 1;
+      block->x1 += AOM_INTERP_EXTEND;
+      *x_pad = 1;
     }
-#endif
-    wide_unit = mi_size_wide[extend_bsize];
-    high_unit = mi_size_high[extend_bsize];
 
-    mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height
-                                                   : -(mi_height + ext_offset));
-    mi_col_pred =
-        mi_col + ((dir == 6 || dir == 7) ? mi_width : -(mi_width + ext_offset));
+    if (subpel_y_mv || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+      block->y0 -= AOM_INTERP_EXTEND - 1;
+      block->y1 += AOM_INTERP_EXTEND;
+      *y_pad = 1;
+    }
 
-    for (j = 0; j < mi_height + ext_offset; j += high_unit)
-      for (i = 0; i < mi_width + ext_offset; i += wide_unit)
-        dec_predict_b_extend(pbi, xd, tile, block, mi_row_ori, mi_col_ori,
-                             mi_row_pred + j, mi_col_pred + i, mi_row_top,
-                             mi_col_top, plane, dst_buf, dst_stride, top_bsize,
-                             extend_bsize, b_sub8x8, 1);
+    // Skip border extension if block is inside the frame.
+    if (block->x0 < 0 || block->x1 > frame_width - 1 || block->y0 < 0 ||
+        block->y1 > frame_height - 1) {
+      return 1;
+    }
   }
+  return 0;
 }
 
-static void dec_extend_all(AV1Decoder *const pbi, MACROBLOCKD *const xd,
-                           const TileInfo *const tile, int block,
-                           BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
-                           int mi_row_ori, int mi_col_ori, int mi_row,
-                           int mi_col, int mi_row_top, int mi_col_top,
-                           int plane, uint8_t *dst_buf, int dst_stride) {
-  for (int i = 0; i < 8; ++i) {
-    dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row_ori,
-                   mi_col_ori, mi_row, mi_col, mi_row_top, mi_col_top, plane,
-                   dst_buf, dst_stride, i);
+static INLINE void extend_mc_border(const struct scale_factors *const sf,
+                                    struct buf_2d *const pre_buf,
+                                    MV32 scaled_mv, PadBlock block,
+                                    int subpel_x_mv, int subpel_y_mv,
+                                    int do_warp, int is_intrabc, int highbd,
+                                    uint8_t *mc_buf, uint8_t **pre,
+                                    int *src_stride) {
+  int x_pad = 0, y_pad = 0;
+  if (update_extend_mc_border_params(sf, pre_buf, scaled_mv, &block,
+                                     subpel_x_mv, subpel_y_mv, do_warp,
+                                     is_intrabc, &x_pad, &y_pad)) {
+    // Get reference block pointer.
+    const uint8_t *const buf_ptr =
+        pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+    int buf_stride = pre_buf->stride;
+    const int b_w = block.x1 - block.x0;
+    const int b_h = block.y1 - block.y0;
+
+    // Extend the border.
+    if (highbd) {
+      highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0,
+                             block.y0, b_w, b_h, pre_buf->width,
+                             pre_buf->height);
+    } else {
+      build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w,
+                      b_h, pre_buf->width, pre_buf->height);
+    }
+    *src_stride = b_w;
+    *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w +
+           x_pad * (AOM_INTERP_EXTEND - 1);
   }
 }
 
-static void dec_predict_sb_complex(AV1Decoder *const pbi, MACROBLOCKD *const xd,
-                                   const TileInfo *const tile, int mi_row,
-                                   int mi_col, int mi_row_top, int mi_col_top,
-                                   BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
-                                   uint8_t *dst_buf[3], int dst_stride[3]) {
-  const AV1_COMMON *const cm = &pbi->common;
-  const int hbs = mi_size_wide[bsize] / 2;
-  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
-  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
-#if CONFIG_EXT_PARTITION_TYPES
-  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
-#endif
-  int i;
-  const int mi_offset = mi_row * cm->mi_stride + mi_col;
-  uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
-  int dst_stride1[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
-  int dst_stride2[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
-  int dst_stride3[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
-
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    int len = sizeof(uint16_t);
-    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
-    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len);
-    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len);
-    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
-    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len);
-    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len);
-    dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
-    dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len);
-    dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len);
+static INLINE void dec_calc_subpel_params(
+    MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
+    int plane, const int pre_x, const int pre_y, int x, int y,
+    struct buf_2d *const pre_buf, SubpelParams *subpel_params, int bw, int bh,
+    PadBlock *block, int mi_x, int mi_y, MV32 *scaled_mv, int *subpel_x_mv,
+    int *subpel_y_mv) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int is_scaled = av1_is_scaled(sf);
+  if (is_scaled) {
+    int ssx = pd->subsampling_x;
+    int ssy = pd->subsampling_y;
+    int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
+    orig_pos_y += mv.row * (1 << (1 - ssy));
+    int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
+    orig_pos_x += mv.col * (1 << (1 - ssx));
+    int pos_y = sf->scale_value_y(orig_pos_y, sf);
+    int pos_x = sf->scale_value_x(orig_pos_x, sf);
+    pos_x += SCALE_EXTRA_OFF;
+    pos_y += SCALE_EXTRA_OFF;
+
+    const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+    const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+    const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                       << SCALE_SUBPEL_BITS;
+    const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+    pos_y = clamp(pos_y, top, bottom);
+    pos_x = clamp(pos_x, left, right);
+
+    subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+    subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+    subpel_params->xs = sf->x_step_q4;
+    subpel_params->ys = sf->y_step_q4;
+
+    // Get reference block top left coordinate.
+    block->x0 = pos_x >> SCALE_SUBPEL_BITS;
+    block->y0 = pos_y >> SCALE_SUBPEL_BITS;
+
+    // Get reference block bottom right coordinate.
+    block->x1 =
+        ((pos_x + (bw - 1) * subpel_params->xs) >> SCALE_SUBPEL_BITS) + 1;
+    block->y1 =
+        ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1;
+
+    MV temp_mv;
+    temp_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, pd->subsampling_x,
+                                        pd->subsampling_y);
+    *scaled_mv = av1_scale_mv(&temp_mv, (mi_x + x), (mi_y + y), sf);
+    scaled_mv->row += SCALE_EXTRA_OFF;
+    scaled_mv->col += SCALE_EXTRA_OFF;
+
+    *subpel_x_mv = scaled_mv->col & SCALE_SUBPEL_MASK;
+    *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK;
   } else {
-#endif
-    dst_buf1[0] = tmp_buf1;
-    dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE;
-    dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE;
-    dst_buf2[0] = tmp_buf2;
-    dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE;
-    dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE;
-    dst_buf3[0] = tmp_buf3;
-    dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE;
-    dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE;
-#if CONFIG_HIGHBITDEPTH
+    // Get block position in current frame.
+    int pos_x = (pre_x + x) << SUBPEL_BITS;
+    int pos_y = (pre_y + y) << SUBPEL_BITS;
+
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(
+        xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+    subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
+    subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+    subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+
+    // Get reference block top left coordinate.
+    pos_x += mv_q4.col;
+    pos_y += mv_q4.row;
+    block->x0 = pos_x >> SUBPEL_BITS;
+    block->y0 = pos_y >> SUBPEL_BITS;
+
+    // Get reference block bottom right coordinate.
+    block->x1 = (pos_x >> SUBPEL_BITS) + (bw - 1) + 1;
+    block->y1 = (pos_y >> SUBPEL_BITS) + (bh - 1) + 1;
+
+    scaled_mv->row = mv_q4.row;
+    scaled_mv->col = mv_q4.col;
+    *subpel_x_mv = scaled_mv->col & SUBPEL_MASK;
+    *subpel_y_mv = scaled_mv->row & SUBPEL_MASK;
   }
-#endif
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
-
-  xd->mi = cm->mi_grid_visible + mi_offset;
-  xd->mi[0] = cm->mi + mi_offset;
+}
 
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    xd->plane[i].dst.buf = dst_buf[i];
-    xd->plane[i].dst.stride = dst_stride[i];
+static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
+                                              MACROBLOCKD *xd, int plane,
+                                              const MB_MODE_INFO *mi,
+                                              int build_for_obmc, int bw,
+                                              int bh, int mi_x, int mi_y) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int is_compound = has_second_ref(mi);
+  int ref;
+  const int is_intrabc = is_intrabc_block(mi);
+  assert(IMPLIES(is_intrabc, !is_compound));
+  int is_global[2] = { 0, 0 };
+  for (ref = 0; ref < 1 + is_compound; ++ref) {
+    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+  }
+
+  const BLOCK_SIZE bsize = mi->sb_type;
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
+                     (block_size_high[bsize] < 8 && ss_y);
+
+  if (is_intrabc) sub8x8_inter = 0;
+
+  // For sub8x8 chroma blocks, we may be covering more than one luma block's
+  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+  // the top-left corner of the prediction source - the correct top-left corner
+  // is at (pre_x, pre_y).
+  const int row_start =
+      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+  const int col_start =
+      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+  sub8x8_inter = sub8x8_inter && !build_for_obmc;
+  if (sub8x8_inter) {
+    for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
+      for (int col = col_start; col <= 0; ++col) {
+        const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+        if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
+        if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
+      }
+    }
   }
 
-  switch (partition) {
-    case PARTITION_NONE:
-      assert(bsize < top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                             mi_row_top, mi_col_top, i, dst_buf[i],
-                             dst_stride[i], top_bsize, bsize, 0, 0);
-        dec_extend_all(pbi, xd, tile, 0, bsize, top_bsize, mi_row, mi_col,
-                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                       dst_stride[i]);
-      }
-      break;
-    case PARTITION_HORZ:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          // For sub8x8, predict in 8x8 unit
-          // First half
-          dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                               mi_row_top, mi_col_top, i, dst_buf[i],
-                               dst_stride[i], top_bsize, BLOCK_8X8, 1, 0);
-          if (bsize < top_bsize)
-            dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                           mi_row, mi_col, mi_row_top, mi_col_top, i,
-                           dst_buf[i], dst_stride[i]);
-
-          // Second half
-          dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
-                               mi_row_top, mi_col_top, i, dst_buf1[i],
-                               dst_stride1[i], top_bsize, BLOCK_8X8, 1, 1);
-          if (bsize < top_bsize)
-            dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
-                           mi_row, mi_col, mi_row_top, mi_col_top, i,
-                           dst_buf1[i], dst_stride1[i]);
+  if (sub8x8_inter) {
+    // block size
+    const int b4_w = block_size_wide[bsize] >> ss_x;
+    const int b4_h = block_size_high[bsize] >> ss_y;
+    const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+    const int b8_w = block_size_wide[plane_bsize] >> ss_x;
+    const int b8_h = block_size_high[plane_bsize] >> ss_y;
+    assert(!is_compound);
+
+    const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
+
+    int row = row_start;
+    int src_stride;
+    for (int y = 0; y < b8_h; y += b4_h) {
+      int col = col_start;
+      for (int x = 0; x < b8_w; x += b4_w) {
+        MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+        is_compound = has_second_ref(this_mbmi);
+        DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]);
+        int tmp_dst_stride = 8;
+        assert(bw < 8 || bh < 8);
+        ConvolveParams conv_params = get_conv_params_no_round(
+            0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd);
+        conv_params.use_jnt_comp_avg = 0;
+        struct buf_2d *const dst_buf = &pd->dst;
+        uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+
+        ref = 0;
+        const RefBuffer *ref_buf =
+            &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+
+        pd->pre[ref].buf0 =
+            (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
+        pd->pre[ref].buf =
+            pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+                                                     ref_buf->buf->uv_stride,
+                                                     &ref_buf->sf);
+        pd->pre[ref].width = ref_buf->buf->uv_crop_width;
+        pd->pre[ref].height = ref_buf->buf->uv_crop_height;
+        pd->pre[ref].stride = ref_buf->buf->uv_stride;
+
+        const struct scale_factors *const sf =
+            is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+        struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+
+        const MV mv = this_mbmi->mv[ref].as_mv;
+
+        uint8_t *pre;
+        SubpelParams subpel_params;
+        PadBlock block;
+        MV32 scaled_mv;
+        int subpel_x_mv, subpel_y_mv;
+        int highbd;
+        WarpTypesAllowed warp_types;
+        warp_types.global_warp_allowed = is_global[ref];
+        warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
+
+        dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf,
+                               &subpel_params, bw, bh, &block, mi_x, mi_y,
+                               &scaled_mv, &subpel_x_mv, &subpel_y_mv);
+        pre = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+        src_stride = pre_buf->stride;
+        highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+        extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv,
+                         subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref],
+                         &pre, &src_stride);
+        conv_params.ref = ref;
+        conv_params.do_average = ref;
+        if (is_masked_compound_type(mi->interinter_comp.type)) {
+          // masked compound type has its own average mechanism
+          conv_params.do_average = 0;
         }
 
-        // weighted average to smooth the boundary
-        xd->plane[0].dst.buf = dst_buf[0];
-        xd->plane[0].dst.stride = dst_stride[0];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
-            0);
-      } else {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-#if CONFIG_CB4X4
-          const struct macroblockd_plane *pd = &xd->plane[i];
-          int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
-              subsize, pd->subsampling_x, pd->subsampling_y);
-
-          if (handle_chroma_sub8x8) {
-            int mode_offset_row = CONFIG_CHROMA_SUB8X8 ? hbs : 0;
-
-            dec_predict_b_extend(pbi, xd, tile, 0, mi_row + mode_offset_row,
-                                 mi_col, mi_row, mi_col, mi_row_top, mi_col_top,
-                                 i, dst_buf[i], dst_stride[i], top_bsize, bsize,
-                                 0, 0);
-            if (bsize < top_bsize)
-              dec_extend_all(pbi, xd, tile, 0, bsize, top_bsize,
-                             mi_row + mode_offset_row, mi_col, mi_row, mi_col,
-                             mi_row_top, mi_col_top, i, dst_buf[i],
-                             dst_stride[i]);
-          } else {
-#endif
-            // First half
-            dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row,
-                                 mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                                 dst_stride[i], top_bsize, subsize, 0, 0);
-            if (bsize < top_bsize)
-              dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
-                             mi_col, mi_row, mi_col, mi_row_top, mi_col_top, i,
-                             dst_buf[i], dst_stride[i]);
-            else
-              dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
-                             mi_col, mi_row, mi_col, mi_row_top, mi_col_top, i,
-                             dst_buf[i], dst_stride[i], 0);
-
-            if (mi_row + hbs < cm->mi_rows) {
-              // Second half
-              dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col,
-                                   mi_row + hbs, mi_col, mi_row_top, mi_col_top,
-                                   i, dst_buf1[i], dst_stride1[i], top_bsize,
-                                   subsize, 0, 0);
-              if (bsize < top_bsize)
-                dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize,
-                               mi_row + hbs, mi_col, mi_row + hbs, mi_col,
-                               mi_row_top, mi_col_top, i, dst_buf1[i],
-                               dst_stride1[i]);
-              else
-                dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize,
-                               mi_row + hbs, mi_col, mi_row + hbs, mi_col,
-                               mi_row_top, mi_col_top, i, dst_buf1[i],
-                               dst_stride1[i], 1);
-
-              // weighted average to smooth the boundary
-              xd->plane[i].dst.buf = dst_buf[i];
-              xd->plane[i].dst.stride = dst_stride[i];
-              av1_build_masked_inter_predictor_complex(
-                  xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
-                  mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-                  PARTITION_HORZ, i);
-            }
-#if CONFIG_CB4X4
-          }
-#endif
-        }
-      }
-      break;
-    case PARTITION_VERT:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          // First half
-          dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                               mi_row_top, mi_col_top, i, dst_buf[i],
-                               dst_stride[i], top_bsize, BLOCK_8X8, 1, 0);
-          if (bsize < top_bsize)
-            dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                           mi_row, mi_col, mi_row_top, mi_col_top, i,
-                           dst_buf[i], dst_stride[i]);
-
-          // Second half
-          dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
-                               mi_row_top, mi_col_top, i, dst_buf1[i],
-                               dst_stride1[i], top_bsize, BLOCK_8X8, 1, 1);
-          if (bsize < top_bsize)
-            dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
-                           mi_row, mi_col, mi_row_top, mi_col_top, i,
-                           dst_buf1[i], dst_stride1[i]);
-        }
+        av1_make_inter_predictor(
+            pre, src_stride, dst, dst_buf->stride, &subpel_params, sf, b4_w,
+            b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
+            (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
+            plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
 
-        // Smooth
-        xd->plane[0].dst.buf = dst_buf[0];
-        xd->plane[0].dst.stride = dst_stride[0];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
-            0);
-      } else {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-#if CONFIG_CB4X4
-          const struct macroblockd_plane *pd = &xd->plane[i];
-          int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
-              subsize, pd->subsampling_x, pd->subsampling_y);
-
-          if (handle_chroma_sub8x8) {
-            int mode_offset_col = CONFIG_CHROMA_SUB8X8 ? hbs : 0;
-            assert(i > 0 && bsize == BLOCK_8X8);
-
-            dec_predict_b_extend(pbi, xd, tile, 0, mi_row,
-                                 mi_col + mode_offset_col, mi_row, mi_col,
-                                 mi_row_top, mi_col_top, i, dst_buf[i],
-                                 dst_stride[i], top_bsize, bsize, 0, 0);
-            if (bsize < top_bsize)
-              dec_extend_all(pbi, xd, tile, 0, bsize, top_bsize, mi_row,
-                             mi_col + mode_offset_col, mi_row, mi_col,
-                             mi_row_top, mi_col_top, i, dst_buf[i],
-                             dst_stride[i]);
-          } else {
-#endif
-            // First half
-            dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row,
-                                 mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                                 dst_stride[i], top_bsize, subsize, 0, 0);
-            if (bsize < top_bsize)
-              dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
-                             mi_col, mi_row, mi_col, mi_row_top, mi_col_top, i,
-                             dst_buf[i], dst_stride[i]);
-            else
-              dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
-                             mi_col, mi_row, mi_col, mi_row_top, mi_col_top, i,
-                             dst_buf[i], dst_stride[i], 3);
-
-            // Second half
-            if (mi_col + hbs < cm->mi_cols) {
-              dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs,
-                                   mi_row, mi_col + hbs, mi_row_top, mi_col_top,
-                                   i, dst_buf1[i], dst_stride1[i], top_bsize,
-                                   subsize, 0, 0);
-              if (bsize < top_bsize)
-                dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
-                               mi_col + hbs, mi_row, mi_col + hbs, mi_row_top,
-                               mi_col_top, i, dst_buf1[i], dst_stride1[i]);
-              else
-                dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
-                               mi_col + hbs, mi_row, mi_col + hbs, mi_row_top,
-                               mi_col_top, i, dst_buf1[i], dst_stride1[i], 2);
-
-              // Smooth
-              xd->plane[i].dst.buf = dst_buf[i];
-              xd->plane[i].dst.stride = dst_stride[i];
-              av1_build_masked_inter_predictor_complex(
-                  xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
-                  mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-                  PARTITION_VERT, i);
-            }
-#if CONFIG_CB4X4
-          }
-#endif
-        }
+        ++col;
       }
-      break;
-    case PARTITION_SPLIT:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                               mi_row_top, mi_col_top, i, dst_buf[i],
-                               dst_stride[i], top_bsize, BLOCK_8X8, 1, 0);
-          dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
-                               mi_row_top, mi_col_top, i, dst_buf1[i],
-                               dst_stride1[i], top_bsize, BLOCK_8X8, 1, 1);
-          dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
-                               mi_row_top, mi_col_top, i, dst_buf2[i],
-                               dst_stride2[i], top_bsize, BLOCK_8X8, 1, 1);
-          dec_predict_b_extend(pbi, xd, tile, 3, mi_row, mi_col, mi_row, mi_col,
-                               mi_row_top, mi_col_top, i, dst_buf3[i],
-                               dst_stride3[i], top_bsize, BLOCK_8X8, 1, 1);
-          if (bsize < top_bsize) {
-            dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                           mi_row, mi_col, mi_row_top, mi_col_top, i,
-                           dst_buf[i], dst_stride[i]);
-            dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
-                           mi_row, mi_col, mi_row_top, mi_col_top, i,
-                           dst_buf1[i], dst_stride1[i]);
-            dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
-                           mi_row, mi_col, mi_row_top, mi_col_top, i,
-                           dst_buf2[i], dst_stride2[i]);
-            dec_extend_all(pbi, xd, tile, 3, subsize, top_bsize, mi_row, mi_col,
-                           mi_row, mi_col, mi_row_top, mi_col_top, i,
-                           dst_buf3[i], dst_stride3[i]);
-          }
-        }
-#if CONFIG_CB4X4
-      } else if (bsize == BLOCK_8X8) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          const struct macroblockd_plane *pd = &xd->plane[i];
-          int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
-              subsize, pd->subsampling_x, pd->subsampling_y);
-
-          if (handle_chroma_sub8x8) {
-            int mode_offset_row =
-                CONFIG_CHROMA_SUB8X8 && mi_row + hbs < cm->mi_rows ? hbs : 0;
-            int mode_offset_col =
-                CONFIG_CHROMA_SUB8X8 && mi_col + hbs < cm->mi_cols ? hbs : 0;
-
-            dec_predict_b_extend(pbi, xd, tile, 0, mi_row + mode_offset_row,
-                                 mi_col + mode_offset_col, mi_row, mi_col,
-                                 mi_row_top, mi_col_top, i, dst_buf[i],
-                                 dst_stride[i], top_bsize, BLOCK_8X8, 0, 0);
-            if (bsize < top_bsize)
-              dec_extend_all(pbi, xd, tile, 0, BLOCK_8X8, top_bsize,
-                             mi_row + mode_offset_row, mi_col + mode_offset_col,
-                             mi_row, mi_col, mi_row_top, mi_col_top, i,
-                             dst_buf[i], dst_stride[i]);
-          } else {
-            dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row,
-                                 mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                                 dst_stride[i], top_bsize, subsize, 0, 0);
-            if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-              dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs,
-                                   mi_row, mi_col + hbs, mi_row_top, mi_col_top,
-                                   i, dst_buf1[i], dst_stride1[i], top_bsize,
-                                   subsize, 0, 0);
-            if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
-              dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col,
-                                   mi_row + hbs, mi_col, mi_row_top, mi_col_top,
-                                   i, dst_buf2[i], dst_stride2[i], top_bsize,
-                                   subsize, 0, 0);
-            if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-              dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs,
-                                   mi_row + hbs, mi_col + hbs, mi_row_top,
-                                   mi_col_top, i, dst_buf3[i], dst_stride3[i],
-                                   top_bsize, subsize, 0, 0);
-
-            if (bsize < top_bsize) {
-              dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
-                             mi_col, mi_row, mi_col, mi_row_top, mi_col_top, i,
-                             dst_buf[i], dst_stride[i]);
-              if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-                dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
-                               mi_col + hbs, mi_row, mi_col + hbs, mi_row_top,
-                               mi_col_top, i, dst_buf1[i], dst_stride1[i]);
-              if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
-                dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize,
-                               mi_row + hbs, mi_col, mi_row + hbs, mi_col,
-                               mi_row_top, mi_col_top, i, dst_buf2[i],
-                               dst_stride2[i]);
-              if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-                dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize,
-                               mi_row + hbs, mi_col + hbs, mi_row + hbs,
-                               mi_col + hbs, mi_row_top, mi_col_top, i,
-                               dst_buf3[i], dst_stride3[i]);
-            }
-          }
-        }
-#endif
-      } else {
-        dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col, mi_row_top,
-                               mi_col_top, subsize, top_bsize, dst_buf,
-                               dst_stride);
-        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-          dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col + hbs,
-                                 mi_row_top, mi_col_top, subsize, top_bsize,
-                                 dst_buf1, dst_stride1);
-        if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
-          dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col,
-                                 mi_row_top, mi_col_top, subsize, top_bsize,
-                                 dst_buf2, dst_stride2);
-        if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-          dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col + hbs,
-                                 mi_row_top, mi_col_top, subsize, top_bsize,
-                                 dst_buf3, dst_stride3);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-#if CONFIG_CB4X4
-        const struct macroblockd_plane *pd = &xd->plane[i];
-        int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
-            subsize, pd->subsampling_x, pd->subsampling_y);
-        if (handle_chroma_sub8x8) continue;  // Skip <4x4 chroma smoothing
-#else
-        if (bsize == BLOCK_8X8 && i != 0)
-          continue;  // Skip <4x4 chroma smoothing
-#endif
-        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
-          av1_build_masked_inter_predictor_complex(
-              xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
-              mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-              PARTITION_VERT, i);
-          if (mi_row + hbs < cm->mi_rows) {
-            av1_build_masked_inter_predictor_complex(
-                xd, dst_buf2[i], dst_stride2[i], dst_buf3[i], dst_stride3[i],
-                mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-                PARTITION_VERT, i);
-            av1_build_masked_inter_predictor_complex(
-                xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
-                mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-                PARTITION_HORZ, i);
-          }
-        } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
-          av1_build_masked_inter_predictor_complex(
-              xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
-              mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-              PARTITION_HORZ, i);
-        }
-      }
-      break;
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-#error HORZ/VERT_A/B partitions not yet updated in superres code
-#endif
-    case PARTITION_HORZ_A:
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, dst_buf, dst_stride,
-                           top_bsize, bsize2, 0, 0);
-      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dst_buf, dst_stride);
-
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
-                           mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
-                           dst_stride1, top_bsize, bsize2, 0, 0);
-      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
-                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
-
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
-                           mi_col, mi_row_top, mi_col_top, dst_buf2,
-                           dst_stride2, top_bsize, subsize, 0, 0);
-      if (bsize < top_bsize)
-        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs,
-                       mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2);
-      else
-        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs,
-                       mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
-                       1);
-
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
-            i);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
-            i);
-      }
-      break;
-    case PARTITION_VERT_A:
+      ++row;
+    }
 
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, dst_buf, dst_stride,
-                           top_bsize, bsize2, 0, 0);
-      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dst_buf, dst_stride);
-
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
-                           mi_col, mi_row_top, mi_col_top, dst_buf1,
-                           dst_stride1, top_bsize, bsize2, 0, 0);
-      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
-                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
-
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
-                           mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
-                           dst_stride2, top_bsize, subsize, 0, 0);
-      if (bsize < top_bsize)
-        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
-                       mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
-                       dst_stride2);
-      else
-        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
-                       mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
-                       dst_stride2, 2);
-
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
-            i);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
-            i);
-      }
-      break;
-    case PARTITION_HORZ_B:
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, dst_buf, dst_stride,
-                           top_bsize, subsize, 0, 0);
-      if (bsize < top_bsize)
-        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                       mi_row_top, mi_col_top, dst_buf, dst_stride);
-      else
-        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                       mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
-
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
-                           mi_col, mi_row_top, mi_col_top, dst_buf1,
-                           dst_stride1, top_bsize, bsize2, 0, 0);
-      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
-                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
-
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs,
-                           mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
-                           dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
-      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs,
-                     mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
-                     dst_stride2);
-
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
-            mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-            PARTITION_VERT, i);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
-            i);
+    for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
+    return;
+  }
+
+  {
+    struct buf_2d *const dst_buf = &pd->dst;
+    uint8_t *const dst = dst_buf->buf;
+    uint8_t *pre[2];
+    SubpelParams subpel_params[2];
+    DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
+    int src_stride[2];
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      const struct scale_factors *const sf =
+          is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+      struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+      const MV mv = mi->mv[ref].as_mv;
+      PadBlock block;
+      MV32 scaled_mv;
+      int subpel_x_mv, subpel_y_mv;
+      int highbd;
+
+      dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf,
+                             &subpel_params[ref], bw, bh, &block, mi_x, mi_y,
+                             &scaled_mv, &subpel_x_mv, &subpel_y_mv);
+      pre[ref] = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+      src_stride[ref] = pre_buf->stride;
+      highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global[ref];
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+      int do_warp = (bw >= 8 && bh >= 8 &&
+                     av1_allow_warp(mi, &warp_types,
+                                    &xd->global_motion[mi->ref_frame[ref]],
+                                    build_for_obmc, subpel_params[ref].xs,
+                                    subpel_params[ref].ys, NULL));
+      do_warp = (do_warp && xd->cur_frame_force_integer_mv == 0);
+
+      extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv, subpel_y_mv,
+                       do_warp, is_intrabc, highbd, xd->mc_buf[ref], &pre[ref],
+                       &src_stride[ref]);
+    }
+
+    ConvolveParams conv_params = get_conv_params_no_round(
+        0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd);
+    av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
+                               &conv_params.bck_offset,
+                               &conv_params.use_jnt_comp_avg, is_compound);
+
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      const struct scale_factors *const sf =
+          is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global[ref];
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+      conv_params.ref = ref;
+      conv_params.do_average = ref;
+      if (is_masked_compound_type(mi->interinter_comp.type)) {
+        // masked compound type has its own average mechanism
+        conv_params.do_average = 0;
       }
-      break;
-    case PARTITION_VERT_B:
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, dst_buf, dst_stride,
-                           top_bsize, subsize, 0, 0);
-      if (bsize < top_bsize)
-        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                       mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+      if (ref && is_masked_compound_type(mi->interinter_comp.type))
+        av1_make_masked_inter_predictor(
+            pre[ref], src_stride[ref], dst, dst_buf->stride,
+            &subpel_params[ref], sf, bw, bh, &conv_params, mi->interp_filters,
+            plane, &warp_types, mi_x >> pd->subsampling_x,
+            mi_y >> pd->subsampling_y, ref, xd, cm->allow_warped_motion);
       else
-        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                       mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
-
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
-                           mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
-                           dst_stride1, top_bsize, bsize2, 0, 0);
-      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
-                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
-
-      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs,
-                           mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
-                           dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
-      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs,
-                     mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
-                     dst_stride2);
-
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
-            mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-            PARTITION_HORZ, i);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
-            i);
-      }
-      break;
-#endif  // CONFIG_EXT_PARTITION_TYPES
-    default: assert(0);
+        av1_make_inter_predictor(
+            pre[ref], src_stride[ref], dst, dst_buf->stride,
+            &subpel_params[ref], sf, bw, bh, &conv_params, mi->interp_filters,
+            &warp_types, mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y,
+            plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
+    }
   }
 }
 
-static void set_segment_id_supertx(const AV1_COMMON *const cm, int mi_row,
-                                   int mi_col, BLOCK_SIZE bsize) {
-  const struct segmentation *seg = &cm->seg;
-  const int miw = AOMMIN(mi_size_wide[bsize], cm->mi_cols - mi_col);
-  const int mih = AOMMIN(mi_size_high[bsize], cm->mi_rows - mi_row);
-  const int mi_offset = mi_row * cm->mi_stride + mi_col;
-  MODE_INFO **const mip = cm->mi_grid_visible + mi_offset;
-  int r, c;
-  int seg_id_supertx = MAX_SEGMENTS;
+static void dec_build_inter_predictors_for_planes(const AV1_COMMON *cm,
+                                                  MACROBLOCKD *xd,
+                                                  BLOCK_SIZE bsize, int mi_row,
+                                                  int mi_col, int plane_from,
+                                                  int plane_to) {
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  for (plane = plane_from; plane <= plane_to; ++plane) {
+    const struct macroblockd_plane *pd = &xd->plane[plane];
+    const int bw = pd->width;
+    const int bh = pd->height;
+
+    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                             pd->subsampling_y))
+      continue;
 
-  if (!seg->enabled) {
-    seg_id_supertx = 0;
-  } else {
-    // Find the minimum segment_id
-    for (r = 0; r < mih; r++)
-      for (c = 0; c < miw; c++)
-        seg_id_supertx =
-            AOMMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id, seg_id_supertx);
-    assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS);
-  }
-
-  // Assign the the segment_id back to segment_id_supertx
-  for (r = 0; r < mih; r++)
-    for (c = 0; c < miw; c++)
-      mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx;
+    dec_build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
+  }
 }
-#endif  // CONFIG_SUPERTX
 
-static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
-#if CONFIG_SUPERTX
-                              int supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                              int mi_row, int mi_col, aom_reader *r,
-#if CONFIG_EXT_PARTITION_TYPES
-                              PARTITION_TYPE partition,
-#endif  // CONFIG_EXT_PARTITION_TYPES
-                              BLOCK_SIZE bsize) {
-  AV1_COMMON *const cm = &pbi->common;
-  const int bw = mi_size_wide[bsize];
-  const int bh = mi_size_high[bsize];
-  const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,
+                                           MACROBLOCKD *xd, int mi_row,
+                                           int mi_col, BUFFER_SET *ctx,
+                                           BLOCK_SIZE bsize) {
+  dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);
 
-#if CONFIG_ACCOUNTING
-  aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
-#endif
-#if CONFIG_SUPERTX
-  if (supertx_enabled) {
-    set_mb_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
-  } else {
-    set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
+  if (is_interintra_pred(xd->mi[0])) {
+    BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
+                               { xd->plane[0].dst.stride, 0, 0 } };
+    if (!ctx) ctx = &default_ctx;
+    av1_build_interintra_predictors_sby(cm, xd, xd->plane[0].dst.buf,
+                                        xd->plane[0].dst.stride, ctx, bsize);
   }
-#if CONFIG_EXT_PARTITION_TYPES
-  xd->mi[0]->mbmi.partition = partition;
-#endif
-  av1_read_mode_info(pbi, xd, supertx_enabled, mi_row, mi_col, r, x_mis, y_mis);
-#else
-  set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
-#if CONFIG_EXT_PARTITION_TYPES
-  xd->mi[0]->mbmi.partition = partition;
-#endif
-  av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
-#endif  // CONFIG_SUPERTX
-  if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
-    const BLOCK_SIZE uv_subsize =
-        ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y];
-    if (uv_subsize == BLOCK_INVALID)
-      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
-                         "Invalid block size.");
+}
+
+static void dec_build_inter_predictors_sbuv(const AV1_COMMON *cm,
+                                            MACROBLOCKD *xd, int mi_row,
+                                            int mi_col, BUFFER_SET *ctx,
+                                            BLOCK_SIZE bsize) {
+  dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
+                                        MAX_MB_PLANE - 1);
+
+  if (is_interintra_pred(xd->mi[0])) {
+    BUFFER_SET default_ctx = {
+      { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
+      { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride }
+    };
+    if (!ctx) ctx = &default_ctx;
+    av1_build_interintra_predictors_sbuv(
+        cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
+        xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize);
   }
+}
 
-#if CONFIG_SUPERTX
-  xd->mi[0]->mbmi.segment_id_supertx = MAX_SEGMENTS;
-#endif  // CONFIG_SUPERTX
+static void dec_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+  const int num_planes = av1_num_planes(cm);
+  dec_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+  if (num_planes > 1)
+    dec_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+}
 
-  int reader_corrupted_flag = aom_reader_has_error(r);
-  aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
+static INLINE void dec_build_prediction_by_above_pred(
+    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+    MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+  const int above_mi_col = ctxt->mi_col + rel_mi_col;
+  int mi_x, mi_y;
+  MB_MODE_INFO backup_mbmi = *above_mbmi;
+
+  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
+                                           &backup_mbmi, ctxt, num_planes);
+  mi_x = above_mi_col << MI_SIZE_LOG2;
+  mi_y = ctxt->mi_row << MI_SIZE_LOG2;
+
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+  for (int j = 0; j < num_planes; ++j) {
+    const struct macroblockd_plane *pd = &xd->plane[j];
+    int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+    int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+                   block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+    dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x,
+                               mi_y);
+  }
 }
 
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-static void set_mode_info_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                  int mi_row, int mi_col) {
-  const int offset = mi_row * cm->mi_stride + mi_col;
-  xd->mi = cm->mi_grid_visible + offset;
-  xd->mi[0] = &cm->mi[offset];
+static void dec_build_prediction_by_above_preds(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+  if (!xd->up_available) return;
+
+  // Adjust mb_to_bottom_edge to have the correct value for the OBMC
+  // prediction block. This is half the height of the original block,
+  // except for 128-wide blocks, where we only use a height of 32.
+  int this_height = xd->n8_h * MI_SIZE;
+  int pred_height = AOMMIN(this_height / 2, 32);
+  xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
+
+  struct build_prediction_ctxt ctxt = { cm,         mi_row,
+                                        mi_col,     tmp_buf,
+                                        tmp_width,  tmp_height,
+                                        tmp_stride, xd->mb_to_right_edge };
+  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  foreach_overlappable_nb_above(cm, xd, mi_col,
+                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
+                                dec_build_prediction_by_above_pred, &ctxt);
+
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = ctxt.mb_to_far_edge;
+  xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
 }
 
-static void get_ncobmc_recon(AV1_COMMON *const cm, MACROBLOCKD *xd, int mi_row,
-                             int mi_col, int bsize, int mode) {
-  uint8_t *pred_buf[4][MAX_MB_PLANE];
-  int pred_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  // target block in pxl
-  int pxl_row = mi_row << MI_SIZE_LOG2;
-  int pxl_col = mi_col << MI_SIZE_LOG2;
+static INLINE void dec_build_prediction_by_left_pred(
+    MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
+    MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
+  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+  const int left_mi_row = ctxt->mi_row + rel_mi_row;
+  int mi_x, mi_y;
+  MB_MODE_INFO backup_mbmi = *left_mbmi;
+
+  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
+                                          &backup_mbmi, ctxt, num_planes);
+  mi_x = ctxt->mi_col << MI_SIZE_LOG2;
+  mi_y = left_mi_row << MI_SIZE_LOG2;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+  for (int j = 0; j < num_planes; ++j) {
+    const struct macroblockd_plane *pd = &xd->plane[j];
+    int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+                   block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+    int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+    dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x,
+                               mi_y);
+  }
+}
+
+static void dec_build_prediction_by_left_preds(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+  if (!xd->left_available) return;
+
+  // Adjust mb_to_right_edge to have the correct value for the OBMC
+  // prediction block. This is half the width of the original block,
+  // except for 128-wide blocks, where we only use a width of 32.
+  int this_width = xd->n8_w * MI_SIZE;
+  int pred_width = AOMMIN(this_width / 2, 32);
+  xd->mb_to_right_edge += (this_width - pred_width) * 8;
+
+  struct build_prediction_ctxt ctxt = { cm,         mi_row,
+                                        mi_col,     tmp_buf,
+                                        tmp_width,  tmp_height,
+                                        tmp_stride, xd->mb_to_bottom_edge };
+  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  foreach_overlappable_nb_left(cm, xd, mi_row,
+                               max_neighbor_obmc[mi_size_high_log2[bsize]],
+                               dec_build_prediction_by_left_pred, &ctxt);
+
+  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_right_edge -= (this_width - pred_width) * 8;
+  xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
+}
+
+static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
+                                               MACROBLOCKD *xd, int mi_row,
+                                               int mi_col) {
+  const int num_planes = av1_num_planes(cm);
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+  uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+  int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-  int plane;
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     int len = sizeof(uint16_t);
-    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE,
-                            len);
-    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE,
-                            len);
-    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE,
-                            len);
-    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE,
-                            len);
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
+    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
+    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
   } else {
-#endif  // CONFIG_HIGHBITDEPTH
-    ASSIGN_ALIGNED_PTRS(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE);
-    ASSIGN_ALIGNED_PTRS(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE);
-    ASSIGN_ALIGNED_PTRS(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE);
-    ASSIGN_ALIGNED_PTRS(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE);
-#if CONFIG_HIGHBITDEPTH
-  }
-#endif
-  av1_get_ext_blk_preds(cm, xd, bsize, mi_row, mi_col, pred_buf, pred_stride);
-  av1_get_ori_blk_pred(cm, xd, bsize, mi_row, mi_col, pred_buf[3], pred_stride);
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf,
-                             pred_stride, mode);
+    dst_buf1[0] = tmp_buf1;
+    dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
+    dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
+    dst_buf2[0] = tmp_buf2;
+    dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
+    dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
+  }
+  dec_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
+                                      dst_width1, dst_height1, dst_stride1);
+  dec_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
+                                     dst_width2, dst_height2, dst_stride2);
+  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
+                       mi_row, mi_col, 0, num_planes);
+  av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
+                                  dst_buf2, dst_stride2);
+}
+
+static void cfl_store_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  if (store_cfl_required(cm, xd)) {
+    cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
   }
 }
 
-static void av1_get_ncobmc_recon(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int bsize, const int mi_row, const int mi_col,
-                                 const NCOBMC_MODE modes) {
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
+static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                                int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int num_planes = av1_num_planes(cm);
+  for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+    if (frame < LAST_FRAME) {
+      assert(is_intrabc_block(mbmi));
+      assert(frame == INTRA_FRAME);
+      assert(ref == 0);
+    } else {
+      RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
 
-  assert(bsize >= BLOCK_8X8);
+      xd->block_refs[ref] = ref_buf;
+      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf,
+                           num_planes);
+    }
+  }
 
-  reset_xd_boundary(xd, mi_row, mi_height, mi_col, mi_width, cm->mi_rows,
-                    cm->mi_cols);
-  get_ncobmc_recon(cm, xd, mi_row, mi_col, bsize, modes);
+  dec_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+  if (mbmi->motion_mode == OBMC_CAUSAL)
+    dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
 }
 
-static void recon_ncobmc_intrpl_pred(AV1_COMMON *const cm,
-                                     MACROBLOCKD *const xd, int mi_row,
-                                     int mi_col, BLOCK_SIZE bsize) {
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-  const int hbs = AOMMAX(mi_size_wide[bsize] / 2, mi_size_high[bsize] / 2);
-  const BLOCK_SIZE sqr_blk = bsize_2_sqr_bsize[bsize];
-  if (mi_width > mi_height) {
-    // horizontal partition
-    av1_get_ncobmc_recon(cm, xd, sqr_blk, mi_row, mi_col, mbmi->ncobmc_mode[0]);
-    xd->mi += hbs;
-    av1_get_ncobmc_recon(cm, xd, sqr_blk, mi_row, mi_col + hbs,
-                         mbmi->ncobmc_mode[1]);
-  } else if (mi_height > mi_width) {
-    // vertical partition
-    av1_get_ncobmc_recon(cm, xd, sqr_blk, mi_row, mi_col, mbmi->ncobmc_mode[0]);
-    xd->mi += hbs * xd->mi_stride;
-    av1_get_ncobmc_recon(cm, xd, sqr_blk, mi_row + hbs, mi_col,
-                         mbmi->ncobmc_mode[1]);
-  } else {
-    av1_get_ncobmc_recon(cm, xd, sqr_blk, mi_row, mi_col, mbmi->ncobmc_mode[0]);
-  }
-  set_mode_info_offsets(cm, xd, mi_row, mi_col);
-  // restore dst buffer and mode info
-  av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
-                       mi_col);
+static void set_color_index_map_offset(MACROBLOCKD *const xd, int plane,
+                                       aom_reader *r) {
+  (void)r;
+  Av1ColorMapParam params;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  av1_get_block_dimensions(mbmi->sb_type, plane, xd, &params.plane_width,
+                           &params.plane_height, NULL, NULL);
+  xd->color_index_map_offset[plane] += params.plane_width * params.plane_height;
 }
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
 
 static void decode_token_and_recon_block(AV1Decoder *const pbi,
                                          MACROBLOCKD *const xd, int mi_row,
                                          int mi_col, aom_reader *r,
                                          BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
+  const int num_planes = av1_num_planes(cm);
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
   const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
   const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
 
   set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8
-  CFL_CTX *const cfl = xd->cfl;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  CFL_CTX *const cfl = &xd->cfl;
   cfl->is_chroma_reference = is_chroma_reference(
       mi_row, mi_col, bsize, cfl->subsampling_x, cfl->subsampling_y);
-#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8
 
   if (cm->delta_q_present_flag) {
-    int i;
-    for (i = 0; i < MAX_SEGMENTS; i++) {
-#if CONFIG_EXT_DELTA_Q
+    for (int i = 0; i < MAX_SEGMENTS; i++) {
       const int current_qindex =
           av1_get_qindex(&cm->seg, i, xd->current_qindex);
-#else
-      const int current_qindex = xd->current_qindex;
-#endif  // CONFIG_EXT_DELTA_Q
-      int j;
-      for (j = 0; j < MAX_MB_PLANE; ++j) {
-        const int dc_delta_q = j == 0 ? cm->y_dc_delta_q : cm->uv_dc_delta_q;
-        const int ac_delta_q = j == 0 ? 0 : cm->uv_ac_delta_q;
-
-        xd->plane[j].seg_dequant[i][0] =
-            av1_dc_quant(current_qindex, dc_delta_q, cm->bit_depth);
-        xd->plane[j].seg_dequant[i][1] =
-            av1_ac_quant(current_qindex, ac_delta_q, cm->bit_depth);
+      for (int j = 0; j < num_planes; ++j) {
+        const int dc_delta_q =
+            j == 0 ? cm->y_dc_delta_q
+                   : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q);
+        const int ac_delta_q =
+            j == 0 ? 0 : (j == 1 ? cm->u_ac_delta_q : cm->v_ac_delta_q);
+        xd->plane[j].seg_dequant_QTX[i][0] =
+            av1_dc_quant_QTX(current_qindex, dc_delta_q, cm->bit_depth);
+        xd->plane[j].seg_dequant_QTX[i][1] =
+            av1_ac_quant_QTX(current_qindex, ac_delta_q, cm->bit_depth);
       }
     }
   }
+  if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
 
-#if CONFIG_CB4X4
-  if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize);
-#else
-  if (mbmi->skip) {
-    av1_reset_skip_context(xd, mi_row, mi_col, AOMMAX(BLOCK_8X8, bsize));
-  }
-#endif
-
-#if CONFIG_COEF_INTERLEAVE
-  {
-    const struct macroblockd_plane *const pd_y = &xd->plane[0];
-    const struct macroblockd_plane *const pd_c = &xd->plane[1];
-    const TX_SIZE tx_log2_y = mbmi->tx_size;
-    const TX_SIZE tx_log2_c = av1_get_uv_tx_size(mbmi, pd_c);
-    const int tx_sz_y = (1 << tx_log2_y);
-    const int tx_sz_c = (1 << tx_log2_c);
-    const int num_4x4_w_y = pd_y->n4_w;
-    const int num_4x4_h_y = pd_y->n4_h;
-    const int num_4x4_w_c = pd_c->n4_w;
-    const int num_4x4_h_c = pd_c->n4_h;
-    const int max_4x4_w_y = get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge,
-                                             pd_y->subsampling_x);
-    const int max_4x4_h_y = get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge,
-                                             pd_y->subsampling_y);
-    const int max_4x4_w_c = get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge,
-                                             pd_c->subsampling_x);
-    const int max_4x4_h_c = get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge,
-                                             pd_c->subsampling_y);
-
-    // The max_4x4_w/h may be smaller than tx_sz under some corner cases,
-    // i.e. when the SB is splitted by tile boundaries.
-    const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y;
-    const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y;
-    const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c;
-    const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c;
-    const int tu_num_c = tu_num_w_c * tu_num_h_c;
-
-    if (!is_inter_block(mbmi)) {
-      int tu_idx_c = 0;
-      int row_y, col_y, row_c, col_c;
-      int plane;
-
-// TODO(anybody) : remove this flag when PVQ supports pallete coding tool
-#if !CONFIG_PVQ
-      for (plane = 0; plane <= 1; ++plane) {
-        if (mbmi->palette_mode_info.palette_size[plane])
-          av1_decode_palette_tokens(xd, plane, r);
-      }
-#endif  // !CONFIG_PVQ
-
-      for (row_y = 0; row_y < tu_num_h_y; row_y++) {
-        for (col_y = 0; col_y < tu_num_w_y; col_y++) {
-          // luma
-          predict_and_reconstruct_intra_block(
-              cm, xd, r, mbmi, 0, row_y * tx_sz_y, col_y * tx_sz_y, tx_log2_y);
-          // chroma
-          if (tu_idx_c < tu_num_c) {
-            row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
-            col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
-            predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 1, row_c,
-                                                col_c, tx_log2_c);
-            predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 2, row_c,
-                                                col_c, tx_log2_c);
-            tu_idx_c++;
-          }
-        }
-      }
-
-      // In 422 case, it's possilbe that Chroma has more TUs than Luma
-      while (tu_idx_c < tu_num_c) {
-        row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
-        col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
-        predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 1, row_c, col_c,
-                                            tx_log2_c);
-        predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 2, row_c, col_c,
-                                            tx_log2_c);
-        tu_idx_c++;
-      }
-    } else {
-      // Prediction
-      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL,
-                                    AOMMAX(bsize, BLOCK_8X8));
-
-      // Reconstruction
-      if (!mbmi->skip) {
-        int eobtotal = 0;
-        int tu_idx_c = 0;
-        int row_y, col_y, row_c, col_c;
-
-        for (row_y = 0; row_y < tu_num_h_y; row_y++) {
-          for (col_y = 0; col_y < tu_num_w_y; col_y++) {
-            // luma
-            eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 0,
-                                                row_y * tx_sz_y,
-                                                col_y * tx_sz_y, tx_log2_y);
-            // chroma
-            if (tu_idx_c < tu_num_c) {
-              row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
-              col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
-              eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
-                                                  1, row_c, col_c, tx_log2_c);
-              eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
-                                                  2, row_c, col_c, tx_log2_c);
-              tu_idx_c++;
+  if (!is_inter_block(mbmi)) {
+    int row, col;
+    assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+                                         xd->plane[0].subsampling_y));
+    const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+    const int max_blocks_high = max_block_high(xd, bsize, 0);
+    const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+    int mu_blocks_wide =
+        block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+    int mu_blocks_high =
+        block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+    mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+    mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+    for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+      for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+        for (int plane = 0; plane < num_planes; ++plane) {
+          const struct macroblockd_plane *const pd = &xd->plane[plane];
+          if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                                   pd->subsampling_y))
+            continue;
+
+          const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+          const int stepr = tx_size_high_unit[tx_size];
+          const int stepc = tx_size_wide_unit[tx_size];
+
+          const int unit_height = ROUND_POWER_OF_TWO(
+              AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
+          const int unit_width = ROUND_POWER_OF_TWO(
+              AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
+
+          for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+               blk_row += stepr) {
+            for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+                 blk_col += stepc) {
+              read_coeffs_tx_intra_block(cm, xd, r, plane, blk_row, blk_col,
+                                         tx_size);
+              predict_and_reconstruct_intra_block(cm, xd, r, plane, blk_row,
+                                                  blk_col, tx_size);
+              set_cb_buffer_offsets(xd, tx_size, plane);
             }
           }
         }
-
-        // In 422 case, it's possilbe that Chroma has more TUs than Luma
-        while (tu_idx_c < tu_num_c) {
-          row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
-          col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
-          eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 1,
-                                              row_c, col_c, tx_log2_c);
-          eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 2,
-                                              row_c, col_c, tx_log2_c);
-          tu_idx_c++;
-        }
-
-        // TODO(CONFIG_COEF_INTERLEAVE owners): bring eob == 0 corner case
-        // into line with the defaut configuration
-        if (bsize >= BLOCK_8X8 && eobtotal == 0) mbmi->skip = 1;
       }
     }
-  }
-#else  // CONFIG_COEF_INTERLEAVE
-  if (!is_inter_block(mbmi)) {
-    int plane;
-
-// TODO(anybody) : remove this flag when PVQ supports pallete coding tool
-#if !CONFIG_PVQ
-    for (plane = 0; plane <= 1; ++plane) {
-      if (mbmi->palette_mode_info.palette_size[plane])
-        av1_decode_palette_tokens(xd, plane, r);
-    }
-#endif  // #if !CONFIG_PVQ
-
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-      const int stepr = tx_size_high_unit[tx_size];
-      const int stepc = tx_size_wide_unit[tx_size];
-#if CONFIG_CHROMA_SUB8X8
-      const BLOCK_SIZE plane_bsize =
-          AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#elif CONFIG_CB4X4
-      const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#else
-      const BLOCK_SIZE plane_bsize =
-          get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
-#endif
-      int row, col;
-      const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-      const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-#if CONFIG_CB4X4
+  } else {
+    predict_inter_block(cm, xd, mi_row, mi_col, bsize);
+#if CONFIG_MISMATCH_DEBUG
+    for (int plane = 0; plane < num_planes; ++plane) {
+      const struct macroblockd_plane *pd = &xd->plane[plane];
+      int pixel_c, pixel_r;
+      mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+                      pd->subsampling_x, pd->subsampling_y);
       if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
                                pd->subsampling_y))
         continue;
+      mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset,
+                               plane, pixel_c, pixel_r, pd->width, pd->height,
+                               xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+    }
 #endif
-      int blk_row, blk_col;
-      const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd);
+
+    // Reconstruction
+    if (!mbmi->skip) {
+      int eobtotal = 0;
+
+      const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+      const int max_blocks_high = max_block_high(xd, bsize, 0);
+      int row, col;
+
+      const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+      assert(max_unit_bsize ==
+             get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
+                                  xd->plane[0].subsampling_y));
       int mu_blocks_wide =
           block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
       int mu_blocks_high =
           block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
       mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
       mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
 
       for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
-        const int unit_height = AOMMIN(mu_blocks_high + row, max_blocks_high);
         for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
-          const int unit_width = AOMMIN(mu_blocks_wide + col, max_blocks_wide);
-
-          for (blk_row = row; blk_row < unit_height; blk_row += stepr)
-            for (blk_col = col; blk_col < unit_width; blk_col += stepc)
-              predict_and_reconstruct_intra_block(cm, xd, r, mbmi, plane,
-                                                  blk_row, blk_col, tx_size);
-        }
-      }
-    }
-  } else {
-    int ref;
-
-#if CONFIG_COMPOUND_SINGLEREF
-    for (ref = 0; ref < 1 + is_inter_anyref_comp_mode(mbmi->mode); ++ref)
-#else
-    for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    {
-      const MV_REFERENCE_FRAME frame =
-#if CONFIG_COMPOUND_SINGLEREF
-          has_second_ref(mbmi) ? mbmi->ref_frame[ref] : mbmi->ref_frame[0];
-#else
-          mbmi->ref_frame[ref];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      if (frame < LAST_FRAME) {
-#if CONFIG_INTRABC
-        assert(is_intrabc_block(mbmi));
-        assert(frame == INTRA_FRAME);
-        assert(ref == 0);
-#else
-        assert(0);
-#endif  // CONFIG_INTRABC
-      } else {
-        RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-        xd->block_refs[ref] = ref_buf;
-        if ((!av1_is_valid_scale(&ref_buf->sf)))
-          aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                             "Reference frame has invalid dimensions");
-        av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
-                             &ref_buf->sf);
-      }
-    }
-
-#if CONFIG_CB4X4
-    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
-#else
-    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL,
-                                  AOMMAX(bsize, BLOCK_8X8));
-#endif
-
-#if CONFIG_MOTION_VAR
-    if (mbmi->motion_mode == OBMC_CAUSAL) {
-#if CONFIG_NCOBMC
-      av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-#else
-      av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-#endif
-    }
-#endif  // CONFIG_MOTION_VAR
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-    if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT) {
-      int plane;
-      recon_ncobmc_intrpl_pred(cm, xd, mi_row, mi_col, bsize);
-      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-        get_pred_from_intrpl_buf(xd, mi_row, mi_col, bsize, plane);
-      }
-    }
-#endif
-    // Reconstruction
-    if (!mbmi->skip) {
-      int eobtotal = 0;
-      int plane;
-
-      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-        const struct macroblockd_plane *const pd = &xd->plane[plane];
-#if CONFIG_CHROMA_SUB8X8
-        const BLOCK_SIZE plane_bsize =
-            AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#elif CONFIG_CB4X4
-        const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#else
-        const BLOCK_SIZE plane_bsize =
-            get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
-#endif
-        const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-        const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-        int row, col;
-
-#if CONFIG_CB4X4
-        if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                                 pd->subsampling_y))
-          continue;
-#endif
-
-#if CONFIG_VAR_TX
-        const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd);
-        int mu_blocks_wide =
-            block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
-        int mu_blocks_high =
-            block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
-
-        mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
-        mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
-
-        const TX_SIZE max_tx_size = get_vartx_max_txsize(
-            mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y);
-        const int bh_var_tx = tx_size_high_unit[max_tx_size];
-        const int bw_var_tx = tx_size_wide_unit[max_tx_size];
-        int block = 0;
-        int step =
-            tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
-
-        for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
-          for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+          for (int plane = 0; plane < num_planes; ++plane) {
+            const struct macroblockd_plane *const pd = &xd->plane[plane];
+            if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                                     pd->subsampling_y))
+              continue;
+            const BLOCK_SIZE bsizec =
+                scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+            const BLOCK_SIZE plane_bsize = get_plane_block_size(
+                bsizec, pd->subsampling_x, pd->subsampling_y);
+
+            const TX_SIZE max_tx_size =
+                get_vartx_max_txsize(xd, plane_bsize, plane);
+            const int bh_var_tx = tx_size_high_unit[max_tx_size];
+            const int bw_var_tx = tx_size_wide_unit[max_tx_size];
+            int block = 0;
+            int step =
+                tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
             int blk_row, blk_col;
-            const int unit_height =
-                AOMMIN(mu_blocks_high + row, max_blocks_high);
-            const int unit_width =
-                AOMMIN(mu_blocks_wide + col, max_blocks_wide);
-            for (blk_row = row; blk_row < unit_height; blk_row += bh_var_tx) {
-              for (blk_col = col; blk_col < unit_width; blk_col += bw_var_tx) {
+            const int unit_height = ROUND_POWER_OF_TWO(
+                AOMMIN(mu_blocks_high + row, max_blocks_high),
+                pd->subsampling_y);
+            const int unit_width = ROUND_POWER_OF_TWO(
+                AOMMIN(mu_blocks_wide + col, max_blocks_wide),
+                pd->subsampling_x);
+
+            for (blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+                 blk_row += bh_var_tx) {
+              for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+                   blk_col += bw_var_tx) {
                 decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize,
                                       blk_row, blk_col, block, max_tx_size,
                                       &eobtotal);
@@ -2083,388 +1221,291 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi,
             }
           }
         }
-#else
-        const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-        const int stepr = tx_size_high_unit[tx_size];
-        const int stepc = tx_size_wide_unit[tx_size];
-        for (row = 0; row < max_blocks_high; row += stepr)
-          for (col = 0; col < max_blocks_wide; col += stepc)
-            eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
-                                                plane, row, col, tx_size);
-#endif
       }
     }
+    cfl_store_inter_block(cm, xd);
   }
-#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8
-  if (mbmi->uv_mode != UV_CFL_PRED) {
-#if CONFIG_DEBUG
-    if (cfl->is_chroma_reference) {
-      cfl_clear_sub8x8_val(cfl);
-    }
-#endif
-    if (!cfl->is_chroma_reference && is_inter_block(mbmi)) {
-      cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
-    }
-  }
-#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8
-#endif  // CONFIG_COEF_INTERLEAVE
+
+  av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
+                    set_color_index_map_offset);
 
   int reader_corrupted_flag = aom_reader_has_error(r);
   aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
 }
 
-#if NC_MODE_INFO && CONFIG_MOTION_VAR
-static void detoken_and_recon_sb(AV1Decoder *const pbi, MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col, aom_reader *r,
-                                 BLOCK_SIZE bsize) {
-  AV1_COMMON *const cm = &pbi->common;
-  const int hbs = mi_size_wide[bsize] >> 1;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
-#endif
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-  const int has_rows = (mi_row + hbs) < cm->mi_rows;
-  const int has_cols = (mi_col + hbs) < cm->mi_cols;
+static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+                               TX_SIZE tx_size, int depth, int blk_row,
+                               int blk_col, aom_reader *r) {
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  int is_split = 0;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+  assert(tx_size > TX_4X4);
+
+  if (depth == MAX_VARTX_DEPTH) {
+    for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+      for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+        const int index =
+            av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx);
+        mbmi->inter_tx_size[index] = tx_size;
+      }
+    }
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    return;
+  }
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+  const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+                                         xd->left_txfm_context + blk_row,
+                                         mbmi->sb_type, tx_size);
+  is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR);
 
-  partition = get_partition(cm, mi_row, mi_col, bsize);
-  subsize = subsize_lookup[partition][bsize];
+  if (is_split) {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+
+    if (sub_txs == TX_4X4) {
+      for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+        for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+          const int index =
+              av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx);
+          mbmi->inter_tx_size[index] = sub_txs;
+        }
+      }
+      mbmi->tx_size = sub_txs;
+      txfm_partition_update(xd->above_txfm_context + blk_col,
+                            xd->left_txfm_context + blk_row, sub_txs, tx_size);
+      return;
+    }
 
-  if (!hbs && !unify_bsize) {
-    xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
-    xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
-    decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize);
+    assert(bsw > 0 && bsh > 0);
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        int offsetr = blk_row + row;
+        int offsetc = blk_col + col;
+        read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r);
+      }
+    }
   } else {
-    switch (partition) {
-      case PARTITION_NONE:
-        decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize);
-        break;
-      case PARTITION_HORZ:
-        decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize);
-        if (has_rows)
-          decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r,
-                                       subsize);
-        break;
-      case PARTITION_VERT:
-        decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize);
-        if (has_cols)
-          decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r,
-                                       subsize);
-        break;
-      case PARTITION_SPLIT:
-        detoken_and_recon_sb(pbi, xd, mi_row, mi_col, r, subsize);
-        detoken_and_recon_sb(pbi, xd, mi_row, mi_col + hbs, r, subsize);
-        detoken_and_recon_sb(pbi, xd, mi_row + hbs, mi_col, r, subsize);
-        detoken_and_recon_sb(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize);
-        break;
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-#error NC_MODE_INFO+MOTION_VAR not yet supported for new HORZ/VERT_AB partitions
-#endif
-      case PARTITION_HORZ_A:
-        decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize2);
-        decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r, bsize2);
-        decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r, subsize);
-        break;
-      case PARTITION_HORZ_B:
-        decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize);
-        decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r, bsize2);
-        decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col + hbs, r,
-                                     bsize2);
-        break;
-      case PARTITION_VERT_A:
-        decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize2);
-        decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col, r, bsize2);
-        decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r, subsize);
-        break;
-      case PARTITION_VERT_B:
-        decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, subsize);
-        decode_token_and_recon_block(pbi, xd, mi_row, mi_col + hbs, r, bsize2);
-        decode_token_and_recon_block(pbi, xd, mi_row + hbs, mi_col + hbs, r,
-                                     bsize2);
-        break;
-#endif
-      default: assert(0 && "Invalid partition type");
+    for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+      for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+        const int index =
+            av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx);
+        mbmi->inter_tx_size[index] = tx_size;
+      }
     }
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+  }
+}
+
+static TX_SIZE read_selected_tx_size(MACROBLOCKD *xd, aom_reader *r) {
+  // TODO(debargha): Clean up the logic here. This function should only
+  // be called for intra.
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+  const int max_depths = bsize_to_max_depth(bsize);
+  const int ctx = get_tx_size_context(xd);
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  const int depth = aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx],
+                                    max_depths + 1, ACCT_STR);
+  assert(depth >= 0 && depth <= max_depths);
+  const TX_SIZE tx_size = depth_to_tx_size(depth, bsize);
+  return tx_size;
+}
+
+static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter,
+                            int allow_select_inter, aom_reader *r) {
+  const TX_MODE tx_mode = cm->tx_mode;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
+
+  if (block_signals_txsize(bsize)) {
+    if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) {
+      const TX_SIZE coded_tx_size = read_selected_tx_size(xd, r);
+      return coded_tx_size;
+    } else {
+      return tx_size_from_tx_mode(bsize, tx_mode);
+    }
+  } else {
+    assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4));
+    return max_txsize_rect_lookup[bsize];
   }
 }
-#endif
 
 static void decode_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
-#if CONFIG_SUPERTX
-                         int supertx_enabled,
-#endif  // CONFIG_SUPERTX
                          int mi_row, int mi_col, aom_reader *r,
-#if CONFIG_EXT_PARTITION_TYPES
-                         PARTITION_TYPE partition,
-#endif  // CONFIG_EXT_PARTITION_TYPES
-                         BLOCK_SIZE bsize) {
-  decode_mbmi_block(pbi, xd,
-#if CONFIG_SUPERTX
-                    supertx_enabled,
-#endif
-                    mi_row, mi_col, r,
-#if CONFIG_EXT_PARTITION_TYPES
-                    partition,
-#endif
-                    bsize);
+                         PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+  decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize);
+
+  av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
+                    av1_decode_palette_tokens);
+
+  AV1_COMMON *cm = &pbi->common;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
+  if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
+      !mbmi->skip && inter_block_tx && !xd->lossless[mbmi->segment_id]) {
+    const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+    const int bh = tx_size_high_unit[max_tx_size];
+    const int bw = tx_size_wide_unit[max_tx_size];
+    const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+    const int height = block_size_high[bsize] >> tx_size_high_log2[0];
+
+    for (int idy = 0; idy < height; idy += bh)
+      for (int idx = 0; idx < width; idx += bw)
+        read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r);
+  } else {
+    mbmi->tx_size = read_tx_size(cm, xd, inter_block_tx, !mbmi->skip, r);
+    if (inter_block_tx)
+      memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+    set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h,
+                  mbmi->skip && is_inter_block(mbmi), xd);
+  }
 
-#if !(CONFIG_MOTION_VAR && NC_MODE_INFO)
-#if CONFIG_SUPERTX
-  if (!supertx_enabled)
-#endif  // CONFIG_SUPERTX
-    decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize);
-#endif
+  decode_token_and_recon_block(pbi, xd, mi_row, mi_col, r, bsize);
 }
 
-static PARTITION_TYPE read_partition(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col, aom_reader *r,
-                                     int has_rows, int has_cols,
+static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     aom_reader *r, int has_rows, int has_cols,
                                      BLOCK_SIZE bsize) {
-#if CONFIG_UNPOISON_PARTITION_CTX
-  const int ctx =
-      partition_plane_context(xd, mi_row, mi_col, has_rows, has_cols, bsize);
-#else
   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-#endif
-  PARTITION_TYPE p;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  (void)cm;
 
-  aom_cdf_prob *partition_cdf = (ctx >= 0) ? ec_ctx->partition_cdf[ctx] : NULL;
+  if (!has_rows && !has_cols) return PARTITION_SPLIT;
 
+  assert(ctx >= 0);
+  aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx];
   if (has_rows && has_cols) {
-#if CONFIG_EXT_PARTITION_TYPES
-    const int num_partition_types =
-        (mi_width_log2_lookup[bsize] > mi_width_log2_lookup[BLOCK_8X8])
-            ? EXT_PARTITION_TYPES
-            : PARTITION_TYPES;
-#else
-    const int num_partition_types = PARTITION_TYPES;
-#endif  // CONFIG_EXT_PARTITION_TYPES
-    p = (PARTITION_TYPE)aom_read_symbol(r, partition_cdf, num_partition_types,
-                                        ACCT_STR);
+    return (PARTITION_TYPE)aom_read_symbol(
+        r, partition_cdf, partition_cdf_length(bsize), ACCT_STR);
   } else if (!has_rows && has_cols) {
     assert(bsize > BLOCK_8X8);
     aom_cdf_prob cdf[2];
-    partition_gather_vert_alike(cdf, partition_cdf);
+    partition_gather_vert_alike(cdf, partition_cdf, bsize);
     assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP));
-    p = aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ;
-    // gather cols
-  } else if (has_rows && !has_cols) {
+    return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ;
+  } else {
+    assert(has_rows && !has_cols);
     assert(bsize > BLOCK_8X8);
     aom_cdf_prob cdf[2];
-    partition_gather_horz_alike(cdf, partition_cdf);
+    partition_gather_horz_alike(cdf, partition_cdf, bsize);
     assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP));
-    p = aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_VERT;
-  } else {
-    p = PARTITION_SPLIT;
+    return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_VERT;
   }
-
-  return p;
 }
 
-#if CONFIG_SUPERTX
-static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
-                     aom_reader *r) {
-  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
-    return 1;
-  } else {
-    const int ctx = av1_get_skip_context(xd);
-#if CONFIG_NEW_MULTISYMBOL
-    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-    const int skip = aom_read_symbol(r, ec_ctx->skip_cdfs[ctx], 2, ACCT_STR);
-#else
-    const int skip = aom_read(r, cm->fc->skip_probs[ctx], ACCT_STR);
-#endif
-    FRAME_COUNTS *counts = xd->counts;
-    if (counts) ++counts->skip[ctx][skip];
-    return skip;
-  }
-}
-#endif  // CONFIG_SUPERTX
-
 // TODO(slavarnway): eliminate bsize and subsize in future commits
 static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
-#if CONFIG_SUPERTX
-                             int supertx_enabled,
-#endif
                              int mi_row, int mi_col, aom_reader *r,
                              BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
-  const int num_8x8_wh = mi_size_wide[bsize];
-  const int hbs = num_8x8_wh >> 1;
-#if CONFIG_EXT_PARTITION_TYPES && CONFIG_EXT_PARTITION_TYPES_AB
-  const int qbs = num_8x8_wh >> 2;
-#endif
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
+  const int bw = mi_size_wide[bsize];
+  const int hbs = bw >> 1;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
-#if CONFIG_EXT_PARTITION_TYPES
-  const int quarter_step = num_8x8_wh / 4;
-  int i;
-#if !CONFIG_EXT_PARTITION_TYPES_AB
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
-#endif
-#endif
+  const int quarter_step = bw / 4;
+  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
-#if CONFIG_SUPERTX
-  const int read_token = !supertx_enabled;
-  int skip = 0;
-  TX_SIZE supertx_size = max_txsize_lookup[bsize];
-  const TileInfo *const tile = &xd->tile;
-  int txfm = DCT_DCT;
-#endif  // CONFIG_SUPERTX
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
+  const int num_planes = av1_num_planes(cm);
+  for (int plane = 0; plane < num_planes; ++plane) {
+    int rcol0, rcol1, rrow0, rrow1, tile_tl_idx;
+    if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+                                           &rcol0, &rcol1, &rrow0, &rrow1,
+                                           &tile_tl_idx)) {
+      const int rstride = cm->rst_info[plane].horz_units_per_tile;
+      for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+        for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+          const int runit_idx = tile_tl_idx + rcol + rrow * rstride;
+          loop_restoration_read_sb_coeffs(cm, xd, r, plane, runit_idx);
+        }
+      }
+    }
+  }
+
   partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
-                                  : read_partition(cm, xd, mi_row, mi_col, r,
+                                  : read_partition(xd, mi_row, mi_col, r,
                                                    has_rows, has_cols, bsize);
-  subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
+  subsize = get_partition_subsize(bsize, partition);
 
   // Check the bitstream is conformant: if there is subsampling on the
   // chroma planes, subsize must subsample to a valid block size.
   const struct macroblockd_plane *const pd_u = &xd->plane[1];
-  if (get_plane_block_size(subsize, pd_u) == BLOCK_INVALID) {
+  if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) ==
+      BLOCK_INVALID) {
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Block size %dx%d invalid with this subsampling mode",
                        block_size_wide[subsize], block_size_high[subsize]);
   }
 
-#if CONFIG_PVQ
-  assert(partition < PARTITION_TYPES);
-  assert(subsize < BLOCK_SIZES_ALL);
-#endif
-#if CONFIG_SUPERTX
-  if (!frame_is_intra_only(cm) && partition != PARTITION_NONE &&
-      bsize <= MAX_SUPERTX_BLOCK_SIZE && !supertx_enabled && !xd->lossless[0]) {
-    const int supertx_context = partition_supertx_context_lookup[partition];
-    supertx_enabled = aom_read(
-        r, cm->fc->supertx_prob[supertx_context][supertx_size], ACCT_STR);
-    if (xd->counts)
-      xd->counts->supertx[supertx_context][supertx_size][supertx_enabled]++;
-#if CONFIG_VAR_TX
-    if (supertx_enabled) xd->supertx_size = supertx_size;
-#endif
-  }
-#endif  // CONFIG_SUPERTX
-
-#if CONFIG_SUPERTX
-#define DEC_BLOCK_STX_ARG supertx_enabled,
-#else
 #define DEC_BLOCK_STX_ARG
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
 #define DEC_BLOCK_EPT_ARG partition,
-#else
-#define DEC_BLOCK_EPT_ARG
-#endif
 #define DEC_BLOCK(db_r, db_c, db_subsize)                   \
   decode_block(pbi, xd, DEC_BLOCK_STX_ARG(db_r), (db_c), r, \
                DEC_BLOCK_EPT_ARG(db_subsize))
 #define DEC_PARTITION(db_r, db_c, db_subsize) \
   decode_partition(pbi, xd, DEC_BLOCK_STX_ARG(db_r), (db_c), r, (db_subsize))
 
-  if (!hbs && !unify_bsize) {
-    // calculate bmode block dimensions (log 2)
-    xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
-    xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
-    DEC_BLOCK(mi_row, mi_col, subsize);
-  } else {
-    switch (partition) {
-      case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break;
-      case PARTITION_HORZ:
-        DEC_BLOCK(mi_row, mi_col, subsize);
-        if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize);
-        break;
-      case PARTITION_VERT:
-        DEC_BLOCK(mi_row, mi_col, subsize);
-        if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize);
-        break;
-      case PARTITION_SPLIT:
-        DEC_PARTITION(mi_row, mi_col, subsize);
-        DEC_PARTITION(mi_row, mi_col + hbs, subsize);
-        DEC_PARTITION(mi_row + hbs, mi_col, subsize);
-        DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize);
-        break;
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-      case PARTITION_HORZ_A:
-        DEC_BLOCK(mi_row, mi_col, get_subsize(bsize, PARTITION_HORZ_4));
-        DEC_BLOCK(mi_row + qbs, mi_col, get_subsize(bsize, PARTITION_HORZ_4));
-        DEC_BLOCK(mi_row + hbs, mi_col, subsize);
-        break;
-      case PARTITION_HORZ_B:
-        DEC_BLOCK(mi_row, mi_col, subsize);
-        DEC_BLOCK(mi_row + hbs, mi_col, get_subsize(bsize, PARTITION_HORZ_4));
-        if (mi_row + 3 * qbs < cm->mi_rows)
-          DEC_BLOCK(mi_row + 3 * qbs, mi_col,
-                    get_subsize(bsize, PARTITION_HORZ_4));
-        break;
-      case PARTITION_VERT_A:
-        DEC_BLOCK(mi_row, mi_col, get_subsize(bsize, PARTITION_VERT_4));
-        DEC_BLOCK(mi_row, mi_col + qbs, get_subsize(bsize, PARTITION_VERT_4));
-        DEC_BLOCK(mi_row, mi_col + hbs, subsize);
-        break;
-      case PARTITION_VERT_B:
-        DEC_BLOCK(mi_row, mi_col, subsize);
-        DEC_BLOCK(mi_row, mi_col + hbs, get_subsize(bsize, PARTITION_VERT_4));
-        if (mi_col + 3 * qbs < cm->mi_cols)
-          DEC_BLOCK(mi_row, mi_col + 3 * qbs,
-                    get_subsize(bsize, PARTITION_VERT_4));
-        break;
-#else
-      case PARTITION_HORZ_A:
-        DEC_BLOCK(mi_row, mi_col, bsize2);
-        DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
-        DEC_BLOCK(mi_row + hbs, mi_col, subsize);
-        break;
-      case PARTITION_HORZ_B:
-        DEC_BLOCK(mi_row, mi_col, subsize);
-        DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
-        DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
-        break;
-      case PARTITION_VERT_A:
-        DEC_BLOCK(mi_row, mi_col, bsize2);
-        DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
-        DEC_BLOCK(mi_row, mi_col + hbs, subsize);
-        break;
-      case PARTITION_VERT_B:
-        DEC_BLOCK(mi_row, mi_col, subsize);
-        DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
-        DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
-        break;
-#endif
-      case PARTITION_HORZ_4:
-        for (i = 0; i < 4; ++i) {
-          int this_mi_row = mi_row + i * quarter_step;
-          if (i > 0 && this_mi_row >= cm->mi_rows) break;
-          DEC_BLOCK(this_mi_row, mi_col, subsize);
-        }
-        break;
-      case PARTITION_VERT_4:
-        for (i = 0; i < 4; ++i) {
-          int this_mi_col = mi_col + i * quarter_step;
-          if (i > 0 && this_mi_col >= cm->mi_cols) break;
-          DEC_BLOCK(mi_row, this_mi_col, subsize);
-        }
-        break;
-#endif  // CONFIG_EXT_PARTITION_TYPES
-      default: assert(0 && "Invalid partition type");
-    }
+  switch (partition) {
+    case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break;
+    case PARTITION_HORZ:
+      DEC_BLOCK(mi_row, mi_col, subsize);
+      if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize);
+      break;
+    case PARTITION_VERT:
+      DEC_BLOCK(mi_row, mi_col, subsize);
+      if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize);
+      break;
+    case PARTITION_SPLIT:
+      DEC_PARTITION(mi_row, mi_col, subsize);
+      DEC_PARTITION(mi_row, mi_col + hbs, subsize);
+      DEC_PARTITION(mi_row + hbs, mi_col, subsize);
+      DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize);
+      break;
+    case PARTITION_HORZ_A:
+      DEC_BLOCK(mi_row, mi_col, bsize2);
+      DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
+      DEC_BLOCK(mi_row + hbs, mi_col, subsize);
+      break;
+    case PARTITION_HORZ_B:
+      DEC_BLOCK(mi_row, mi_col, subsize);
+      DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
+      DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
+      break;
+    case PARTITION_VERT_A:
+      DEC_BLOCK(mi_row, mi_col, bsize2);
+      DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
+      DEC_BLOCK(mi_row, mi_col + hbs, subsize);
+      break;
+    case PARTITION_VERT_B:
+      DEC_BLOCK(mi_row, mi_col, subsize);
+      DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
+      DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
+      break;
+    case PARTITION_HORZ_4:
+      for (int i = 0; i < 4; ++i) {
+        int this_mi_row = mi_row + i * quarter_step;
+        if (i > 0 && this_mi_row >= cm->mi_rows) break;
+        DEC_BLOCK(this_mi_row, mi_col, subsize);
+      }
+      break;
+    case PARTITION_VERT_4:
+      for (int i = 0; i < 4; ++i) {
+        int this_mi_col = mi_col + i * quarter_step;
+        if (i > 0 && this_mi_col >= cm->mi_cols) break;
+        DEC_BLOCK(mi_row, this_mi_col, subsize);
+      }
+      break;
+    default: assert(0 && "Invalid partition type");
   }
 
 #undef DEC_PARTITION
@@ -2472,219 +1513,13 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
 #undef DEC_BLOCK_EPT_ARG
 #undef DEC_BLOCK_STX_ARG
 
-#if CONFIG_SUPERTX
-  if (supertx_enabled && read_token) {
-    uint8_t *dst_buf[3];
-    int dst_stride[3], i;
-    int offset = mi_row * cm->mi_stride + mi_col;
-
-    set_segment_id_supertx(cm, mi_row, mi_col, bsize);
-
-    if (cm->delta_q_present_flag) {
-      for (i = 0; i < MAX_SEGMENTS; i++) {
-        int j;
-        for (j = 0; j < MAX_MB_PLANE; ++j) {
-          const int dc_delta_q = j == 0 ? cm->y_dc_delta_q : cm->uv_dc_delta_q;
-          const int ac_delta_q = j == 0 ? 0 : cm->uv_ac_delta_q;
-
-          xd->plane[j].seg_dequant[i][0] =
-              av1_dc_quant(xd->current_qindex, dc_delta_q, cm->bit_depth);
-          xd->plane[j].seg_dequant[i][1] =
-              av1_ac_quant(xd->current_qindex, ac_delta_q, cm->bit_depth);
-        }
-      }
-    }
-
-    xd->mi = cm->mi_grid_visible + offset;
-    xd->mi[0] = cm->mi + offset;
-    set_mi_row_col(xd, tile, mi_row, mi_size_high[bsize], mi_col,
-                   mi_size_wide[bsize],
-#if CONFIG_DEPENDENT_HORZTILES
-                   cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                   cm->mi_rows, cm->mi_cols);
-    set_skip_context(xd, mi_row, mi_col);
-    skip = read_skip(cm, xd, xd->mi[0]->mbmi.segment_id_supertx, r);
-    if (skip) {
-      av1_reset_skip_context(xd, mi_row, mi_col, bsize);
-    } else {
-      FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-#if CONFIG_EXT_TX
-      if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) >
-          1) {
-        const int eset =
-            get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used);
-        if (eset > 0) {
-          const TxSetType tx_set_type = get_ext_tx_set_type(
-              supertx_size, bsize, 1, cm->reduced_tx_set_used);
-          const int packed_sym =
-              aom_read_symbol(r, ec_ctx->inter_ext_tx_cdf[eset][supertx_size],
-                              av1_num_ext_tx_set[tx_set_type], ACCT_STR);
-          txfm = av1_ext_tx_inv[tx_set_type][packed_sym];
-#if CONFIG_ENTROPY_STATS
-          if (xd->counts) ++xd->counts->inter_ext_tx[eset][supertx_size][txfm];
-#endif  // CONFIG_ENTROPY_STATS
-        }
-      }
-#else
-      if (supertx_size < TX_32X32) {
-        txfm = aom_read_symbol(r, ec_ctx->inter_ext_tx_cdf[supertx_size],
-                               TX_TYPES, ACCT_STR);
-#if CONFIG_ENTROPY_STATS
-        if (xd->counts) ++xd->counts->inter_ext_tx[supertx_size][txfm];
-#endif  // CONFIG_ENTROPY_STATS
-      }
-#endif  // CONFIG_EXT_TX
-    }
-
-    av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
-                         mi_col);
-    for (i = 0; i < MAX_MB_PLANE; i++) {
-      dst_buf[i] = xd->plane[i].dst.buf;
-      dst_stride[i] = xd->plane[i].dst.stride;
-    }
-    dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col, mi_row, mi_col, bsize,
-                           bsize, dst_buf, dst_stride);
-
-    if (!skip) {
-      int eobtotal = 0;
-      MB_MODE_INFO *mbmi;
-      set_offsets_topblock(cm, xd, tile, bsize, mi_row, mi_col);
-      mbmi = &xd->mi[0]->mbmi;
-      mbmi->tx_type = txfm;
-      assert(mbmi->segment_id_supertx != MAX_SEGMENTS);
-      for (i = 0; i < MAX_MB_PLANE; ++i) {
-        const struct macroblockd_plane *const pd = &xd->plane[i];
-        int row, col;
-        const TX_SIZE tx_size = av1_get_tx_size(i, xd);
-        const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-        const int stepr = tx_size_high_unit[tx_size];
-        const int stepc = tx_size_wide_unit[tx_size];
-        const int max_blocks_wide = max_block_wide(xd, plane_bsize, i);
-        const int max_blocks_high = max_block_high(xd, plane_bsize, i);
-
-        for (row = 0; row < max_blocks_high; row += stepr)
-          for (col = 0; col < max_blocks_wide; col += stepc)
-            eobtotal += reconstruct_inter_block(
-                cm, xd, r, mbmi->segment_id_supertx, i, row, col, tx_size);
-      }
-      if ((unify_bsize || !(subsize < BLOCK_8X8)) && eobtotal == 0) skip = 1;
-    }
-    set_param_topblock(cm, xd, bsize, mi_row, mi_col, txfm, skip);
-  }
-#endif  // CONFIG_SUPERTX
-
-#if CONFIG_EXT_PARTITION_TYPES
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
-#else
-  // update partition context
-  if (bsize >= BLOCK_8X8 &&
-      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
-    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
-#endif  // CONFIG_EXT_PARTITION_TYPES
-
-#if CONFIG_LPF_SB
-  if (bsize == cm->sb_size) {
-    int filt_lvl;
-    if (mi_row == 0 && mi_col == 0) {
-      filt_lvl = aom_read_literal(r, 6, ACCT_STR);
-      cm->mi_grid_visible[0]->mbmi.reuse_sb_lvl = 0;
-      cm->mi_grid_visible[0]->mbmi.delta = 0;
-      cm->mi_grid_visible[0]->mbmi.sign = 0;
-    } else {
-      int prev_mi_row, prev_mi_col;
-      if (mi_col - MAX_MIB_SIZE < 0) {
-        prev_mi_row = mi_row - MAX_MIB_SIZE;
-        prev_mi_col = mi_col;
-      } else {
-        prev_mi_row = mi_row;
-        prev_mi_col = mi_col - MAX_MIB_SIZE;
-      }
-
-      MB_MODE_INFO *curr_mbmi =
-          &cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi;
-      MB_MODE_INFO *prev_mbmi =
-          &cm->mi_grid_visible[prev_mi_row * cm->mi_stride + prev_mi_col]->mbmi;
-      const uint8_t prev_lvl = prev_mbmi->filt_lvl;
-
-      const int reuse_ctx = prev_mbmi->reuse_sb_lvl;
-      const int reuse_prev_lvl = aom_read_symbol(
-          r, xd->tile_ctx->lpf_reuse_cdf[reuse_ctx], 2, ACCT_STR);
-      curr_mbmi->reuse_sb_lvl = reuse_prev_lvl;
-
-      if (reuse_prev_lvl) {
-        filt_lvl = prev_lvl;
-        curr_mbmi->delta = 0;
-        curr_mbmi->sign = 0;
-      } else {
-        const int delta_ctx = prev_mbmi->delta;
-        unsigned int delta = aom_read_symbol(
-            r, xd->tile_ctx->lpf_delta_cdf[delta_ctx], DELTA_RANGE, ACCT_STR);
-        curr_mbmi->delta = delta;
-        delta *= LPF_STEP;
-
-        if (delta) {
-          const int sign_ctx = prev_mbmi->sign;
-          const int sign = aom_read_symbol(
-              r, xd->tile_ctx->lpf_sign_cdf[reuse_ctx][sign_ctx], 2, ACCT_STR);
-          curr_mbmi->sign = sign;
-          filt_lvl = sign ? prev_lvl + delta : prev_lvl - delta;
-        } else {
-          filt_lvl = prev_lvl;
-          curr_mbmi->sign = 0;
-        }
-      }
-    }
-
-    av1_loop_filter_sb_level_init(cm, mi_row, mi_col, filt_lvl);
-  }
-#endif
-
-#if CONFIG_CDEF
-  if (bsize == cm->sb_size) {
-    int width_step = mi_size_wide[BLOCK_64X64];
-    int height_step = mi_size_wide[BLOCK_64X64];
-    int w, h;
-    for (h = 0; (h < mi_size_high[cm->sb_size]) && (mi_row + h < cm->mi_rows);
-         h += height_step) {
-      for (w = 0; (w < mi_size_wide[cm->sb_size]) && (mi_col + w < cm->mi_cols);
-           w += width_step) {
-        if (!cm->all_lossless && !sb_all_skip(cm, mi_row + h, mi_col + w))
-          cm->mi_grid_visible[(mi_row + h) * cm->mi_stride + (mi_col + w)]
-              ->mbmi.cdef_strength =
-              aom_read_literal(r, cm->cdef_bits, ACCT_STR);
-        else
-          cm->mi_grid_visible[(mi_row + h) * cm->mi_stride + (mi_col + w)]
-              ->mbmi.cdef_strength = -1;
-      }
-    }
-  }
-#endif  // CONFIG_CDEF
-#if CONFIG_LOOP_RESTORATION
-  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    int rcol0, rcol1, rrow0, rrow1, nhtiles;
-    if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
-                                           &rcol0, &rcol1, &rrow0, &rrow1,
-                                           &nhtiles)) {
-      for (int rrow = rrow0; rrow < rrow1; ++rrow) {
-        for (int rcol = rcol0; rcol < rcol1; ++rcol) {
-          int rtile_idx = rcol + rrow * nhtiles;
-          loop_restoration_read_sb_coeffs(cm, xd, r, plane, rtile_idx);
-        }
-      }
-    }
-  }
-#endif
 }
 
 static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end,
                                const size_t read_size,
                                struct aom_internal_error_info *error_info,
-                               aom_reader *r,
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-                               int window_size,
-#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-                               aom_decrypt_cb decrypt_cb, void *decrypt_state) {
+                               aom_reader *r, uint8_t allow_update_cdf) {
   // Validate the calculated partition length. If the buffer
   // described by the partition can't be fully read, then restrict
   // it to the portion that can be (for EC mode) or throw an error.
@@ -2692,117 +1527,147 @@ static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end,
     aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt tile length");
 
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-  r->window_size = window_size;
-#endif
-  if (aom_reader_init(r, data, read_size, decrypt_cb, decrypt_state))
+  if (aom_reader_init(r, data, read_size))
     aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder %d", 1);
+
+  r->allow_update_cdf = allow_update_cdf;
 }
 
 static void setup_segmentation(AV1_COMMON *const cm,
                                struct aom_read_bit_buffer *rb) {
   struct segmentation *const seg = &cm->seg;
-  int i, j;
 
   seg->update_map = 0;
   seg->update_data = 0;
   seg->temporal_update = 0;
 
   seg->enabled = aom_rb_read_bit(rb);
-  if (!seg->enabled) return;
+  if (!seg->enabled) {
+    if (cm->cur_frame->seg_map)
+      memset(cm->cur_frame->seg_map, 0, (cm->mi_rows * cm->mi_cols));
 
-  // Segmentation map update
-  if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+    memset(seg, 0, sizeof(*seg));
+    segfeatures_copy(&cm->cur_frame->seg, seg);
+    return;
+  }
+  if (cm->seg.enabled && cm->prev_frame &&
+      (cm->mi_rows == cm->prev_frame->mi_rows) &&
+      (cm->mi_cols == cm->prev_frame->mi_cols)) {
+    cm->last_frame_seg_map = cm->prev_frame->seg_map;
+  } else {
+    cm->last_frame_seg_map = NULL;
+  }
+  // Read update flags
+  if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+    // These frames can't use previous frames, so must signal map + features
     seg->update_map = 1;
+    seg->temporal_update = 0;
+    seg->update_data = 1;
   } else {
     seg->update_map = aom_rb_read_bit(rb);
-  }
-  if (seg->update_map) {
-    if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
-      seg->temporal_update = 0;
-    } else {
+    if (seg->update_map) {
       seg->temporal_update = aom_rb_read_bit(rb);
+    } else {
+      seg->temporal_update = 0;
     }
+    seg->update_data = aom_rb_read_bit(rb);
   }
 
   // Segmentation data update
-  seg->update_data = aom_rb_read_bit(rb);
   if (seg->update_data) {
-    seg->abs_delta = aom_rb_read_bit(rb);
-
     av1_clearall_segfeatures(seg);
 
-    for (i = 0; i < MAX_SEGMENTS; i++) {
-      for (j = 0; j < SEG_LVL_MAX; j++) {
+    for (int i = 0; i < MAX_SEGMENTS; i++) {
+      for (int j = 0; j < SEG_LVL_MAX; j++) {
         int data = 0;
         const int feature_enabled = aom_rb_read_bit(rb);
         if (feature_enabled) {
           av1_enable_segfeature(seg, i, j);
-          data = decode_unsigned_max(rb, av1_seg_feature_data_max(j));
-          if (av1_is_segfeature_signed(j))
-            data = aom_rb_read_bit(rb) ? -data : data;
+
+          const int data_max = av1_seg_feature_data_max(j);
+          const int data_min = -data_max;
+          const int ubits = get_unsigned_bits(data_max);
+
+          if (av1_is_segfeature_signed(j)) {
+            data = aom_rb_read_inv_signed_literal(rb, ubits);
+          } else {
+            data = aom_rb_read_literal(rb, ubits);
+          }
+
+          data = clamp(data, data_min, data_max);
         }
         av1_set_segdata(seg, i, j, data);
       }
     }
+    calculate_segdata(seg);
+  } else if (cm->prev_frame) {
+    segfeatures_copy(seg, &cm->prev_frame->seg);
   }
+  segfeatures_copy(&cm->cur_frame->seg, seg);
 }
 
-#if CONFIG_LOOP_RESTORATION
 static void decode_restoration_mode(AV1_COMMON *cm,
                                     struct aom_read_bit_buffer *rb) {
-  int p;
-  RestorationInfo *rsi = &cm->rst_info[0];
-  if (aom_rb_read_bit(rb)) {
-    rsi->frame_restoration_type =
-        aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER;
-  } else {
-    rsi->frame_restoration_type =
-        aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE;
-  }
-  for (p = 1; p < MAX_MB_PLANE; ++p) {
-    rsi = &cm->rst_info[p];
+  assert(!cm->all_lossless);
+  const int num_planes = av1_num_planes(cm);
+  if (cm->allow_intrabc) return;
+  int all_none = 1, chroma_none = 1;
+  for (int p = 0; p < num_planes; ++p) {
+    RestorationInfo *rsi = &cm->rst_info[p];
     if (aom_rb_read_bit(rb)) {
       rsi->frame_restoration_type =
           aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER;
     } else {
-      rsi->frame_restoration_type = RESTORE_NONE;
+      rsi->frame_restoration_type =
+          aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE;
+    }
+    if (rsi->frame_restoration_type != RESTORE_NONE) {
+      all_none = 0;
+      chroma_none &= p == 0;
     }
   }
+  if (!all_none) {
+    assert(cm->seq_params.sb_size == BLOCK_64X64 ||
+           cm->seq_params.sb_size == BLOCK_128X128);
+    const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
 
-  cm->rst_info[0].restoration_tilesize = RESTORATION_TILESIZE_MAX;
-  cm->rst_info[1].restoration_tilesize = RESTORATION_TILESIZE_MAX;
-  cm->rst_info[2].restoration_tilesize = RESTORATION_TILESIZE_MAX;
-  if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
-      cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
-      cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
-    rsi = &cm->rst_info[0];
-    rsi->restoration_tilesize >>= aom_rb_read_bit(rb);
-    if (rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX) {
-      rsi->restoration_tilesize >>= aom_rb_read_bit(rb);
+    for (int p = 0; p < num_planes; ++p)
+      cm->rst_info[p].restoration_unit_size = sb_size;
+
+    RestorationInfo *rsi = &cm->rst_info[0];
+
+    if (sb_size == 64) {
+      rsi->restoration_unit_size <<= aom_rb_read_bit(rb);
+    }
+    if (rsi->restoration_unit_size > 64) {
+      rsi->restoration_unit_size <<= aom_rb_read_bit(rb);
     }
-  }
-  int s = AOMMIN(cm->subsampling_x, cm->subsampling_y);
-  if (s && (cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
-            cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) {
-    cm->rst_info[1].restoration_tilesize =
-        cm->rst_info[0].restoration_tilesize >> (aom_rb_read_bit(rb) * s);
   } else {
-    cm->rst_info[1].restoration_tilesize = cm->rst_info[0].restoration_tilesize;
+    const int size = RESTORATION_UNITSIZE_MAX;
+    for (int p = 0; p < num_planes; ++p)
+      cm->rst_info[p].restoration_unit_size = size;
   }
-  cm->rst_info[2].restoration_tilesize = cm->rst_info[1].restoration_tilesize;
 
-  cm->rst_info[0].procunit_width = cm->rst_info[0].procunit_height =
-      RESTORATION_PROC_UNIT_SIZE;
-  cm->rst_info[1].procunit_width = cm->rst_info[2].procunit_width =
-      RESTORATION_PROC_UNIT_SIZE >> cm->subsampling_x;
-  cm->rst_info[1].procunit_height = cm->rst_info[2].procunit_height =
-      RESTORATION_PROC_UNIT_SIZE >> cm->subsampling_y;
+  if (num_planes > 1) {
+    int s = AOMMIN(cm->subsampling_x, cm->subsampling_y);
+    if (s && !chroma_none) {
+      cm->rst_info[1].restoration_unit_size =
+          cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s);
+    } else {
+      cm->rst_info[1].restoration_unit_size =
+          cm->rst_info[0].restoration_unit_size;
+    }
+    cm->rst_info[2].restoration_unit_size =
+        cm->rst_info[1].restoration_unit_size;
+  }
 }
 
 static void read_wiener_filter(int wiener_win, WienerInfo *wiener_info,
                                WienerInfo *ref_wiener_info, aom_reader *rb) {
+  memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter));
+  memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter));
+
   if (wiener_win == WIENER_WIN)
     wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] =
         aom_read_primitive_refsubexpfin(
@@ -2860,75 +1725,109 @@ static void read_wiener_filter(int wiener_win, WienerInfo *wiener_info,
 static void read_sgrproj_filter(SgrprojInfo *sgrproj_info,
                                 SgrprojInfo *ref_sgrproj_info, aom_reader *rb) {
   sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR);
-  sgrproj_info->xqd[0] =
-      aom_read_primitive_refsubexpfin(
-          rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
-          ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) +
-      SGRPROJ_PRJ_MIN0;
-  sgrproj_info->xqd[1] =
-      aom_read_primitive_refsubexpfin(
-          rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
-          ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) +
-      SGRPROJ_PRJ_MIN1;
+  const sgr_params_type *params = &sgr_params[sgrproj_info->ep];
+
+  if (params->r[0] == 0) {
+    sgrproj_info->xqd[0] = 0;
+    sgrproj_info->xqd[1] =
+        aom_read_primitive_refsubexpfin(
+            rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+            ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) +
+        SGRPROJ_PRJ_MIN1;
+  } else if (params->r[1] == 0) {
+    sgrproj_info->xqd[0] =
+        aom_read_primitive_refsubexpfin(
+            rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+            ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) +
+        SGRPROJ_PRJ_MIN0;
+    sgrproj_info->xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - sgrproj_info->xqd[0],
+                                 SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
+  } else {
+    sgrproj_info->xqd[0] =
+        aom_read_primitive_refsubexpfin(
+            rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+            ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) +
+        SGRPROJ_PRJ_MIN0;
+    sgrproj_info->xqd[1] =
+        aom_read_primitive_refsubexpfin(
+            rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+            ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) +
+        SGRPROJ_PRJ_MIN1;
+  }
+
   memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
 }
 
 static void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm,
                                             MACROBLOCKD *xd,
                                             aom_reader *const r, int plane,
-                                            int rtile_idx) {
-  const RestorationInfo *rsi = cm->rst_info + plane;
+                                            int runit_idx) {
+  const RestorationInfo *rsi = &cm->rst_info[plane];
+  RestorationUnitInfo *rui = &rsi->unit_info[runit_idx];
   if (rsi->frame_restoration_type == RESTORE_NONE) return;
 
+  assert(!cm->all_lossless);
+
   const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
   WienerInfo *wiener_info = xd->wiener_info + plane;
   SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane;
 
   if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
-    assert(plane == 0);
-    rsi->restoration_type[rtile_idx] =
-        aom_read_tree(r, av1_switchable_restore_tree,
-                      cm->fc->switchable_restore_prob, ACCT_STR);
-
-    if (rsi->restoration_type[rtile_idx] == RESTORE_WIENER) {
-      read_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info,
-                         r);
-    } else if (rsi->restoration_type[rtile_idx] == RESTORE_SGRPROJ) {
-      read_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, r);
+    rui->restoration_type =
+        aom_read_symbol(r, xd->tile_ctx->switchable_restore_cdf,
+                        RESTORE_SWITCHABLE_TYPES, ACCT_STR);
+    switch (rui->restoration_type) {
+      case RESTORE_WIENER:
+        read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r);
+        break;
+      case RESTORE_SGRPROJ:
+        read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r);
+        break;
+      default: assert(rui->restoration_type == RESTORE_NONE); break;
     }
   } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
-    if (aom_read(r, RESTORE_NONE_WIENER_PROB, ACCT_STR)) {
-      rsi->restoration_type[rtile_idx] = RESTORE_WIENER;
-      read_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info,
-                         r);
+    if (aom_read_symbol(r, xd->tile_ctx->wiener_restore_cdf, 2, ACCT_STR)) {
+      rui->restoration_type = RESTORE_WIENER;
+      read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r);
     } else {
-      rsi->restoration_type[rtile_idx] = RESTORE_NONE;
+      rui->restoration_type = RESTORE_NONE;
     }
   } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
-    if (aom_read(r, RESTORE_NONE_SGRPROJ_PROB, ACCT_STR)) {
-      rsi->restoration_type[rtile_idx] = RESTORE_SGRPROJ;
-      read_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, r);
+    if (aom_read_symbol(r, xd->tile_ctx->sgrproj_restore_cdf, 2, ACCT_STR)) {
+      rui->restoration_type = RESTORE_SGRPROJ;
+      read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r);
     } else {
-      rsi->restoration_type[rtile_idx] = RESTORE_NONE;
+      rui->restoration_type = RESTORE_NONE;
     }
   }
 }
-#endif  // CONFIG_LOOP_RESTORATION
 
 static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+  const int num_planes = av1_num_planes(cm);
   struct loopfilter *lf = &cm->lf;
-#if !CONFIG_LPF_SB
-#if CONFIG_LOOPFILTER_LEVEL
+  if (cm->allow_intrabc || cm->coded_lossless) {
+    // write default deltas to frame buffer
+    av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
+    av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
+    return;
+  }
+  assert(!cm->coded_lossless);
+  if (cm->prev_frame) {
+    // write deltas to frame buffer
+    memcpy(lf->ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+    memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
+  } else {
+    av1_set_default_ref_deltas(lf->ref_deltas);
+    av1_set_default_mode_deltas(lf->mode_deltas);
+  }
   lf->filter_level[0] = aom_rb_read_literal(rb, 6);
   lf->filter_level[1] = aom_rb_read_literal(rb, 6);
-  if (lf->filter_level[0] || lf->filter_level[1]) {
-    lf->filter_level_u = aom_rb_read_literal(rb, 6);
-    lf->filter_level_v = aom_rb_read_literal(rb, 6);
+  if (num_planes > 1) {
+    if (lf->filter_level[0] || lf->filter_level[1]) {
+      lf->filter_level_u = aom_rb_read_literal(rb, 6);
+      lf->filter_level_v = aom_rb_read_literal(rb, 6);
+    }
   }
-#else
-  lf->filter_level = aom_rb_read_literal(rb, 6);
-#endif
-#endif  // CONFIG_LPF_SB
   lf->sharpness_level = aom_rb_read_literal(rb, 3);
 
   // Read in loop filter deltas applied at the MB level based on mode or ref
@@ -2939,38 +1838,33 @@ static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
   if (lf->mode_ref_delta_enabled) {
     lf->mode_ref_delta_update = aom_rb_read_bit(rb);
     if (lf->mode_ref_delta_update) {
-      int i;
-
-      for (i = 0; i < TOTAL_REFS_PER_FRAME; i++)
+      for (int i = 0; i < REF_FRAMES; i++)
         if (aom_rb_read_bit(rb))
           lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
 
-      for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+      for (int i = 0; i < MAX_MODE_LF_DELTAS; i++)
         if (aom_rb_read_bit(rb))
           lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
     }
   }
+
+  // write deltas to frame buffer
+  memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, REF_FRAMES);
+  memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS);
 }
 
-#if CONFIG_CDEF
 static void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
-  int i;
-#if CONFIG_CDEF_SINGLEPASS
+  const int num_planes = av1_num_planes(cm);
+  if (cm->allow_intrabc) return;
   cm->cdef_pri_damping = cm->cdef_sec_damping = aom_rb_read_literal(rb, 2) + 3;
-#else
-  cm->cdef_pri_damping = aom_rb_read_literal(rb, 1) + 5;
-  cm->cdef_sec_damping = aom_rb_read_literal(rb, 2) + 3;
-#endif
   cm->cdef_bits = aom_rb_read_literal(rb, 2);
   cm->nb_cdef_strengths = 1 << cm->cdef_bits;
-  for (i = 0; i < cm->nb_cdef_strengths; i++) {
+  for (int i = 0; i < cm->nb_cdef_strengths; i++) {
     cm->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
-    cm->cdef_uv_strengths[i] = cm->subsampling_x == cm->subsampling_y
-                                   ? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS)
-                                   : 0;
+    cm->cdef_uv_strengths[i] =
+        num_planes > 1 ? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS) : 0;
   }
 }
-#endif  // CONFIG_CDEF
 
 static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) {
   return aom_rb_read_bit(rb) ? aom_rb_read_inv_signed_literal(rb, 6) : 0;
@@ -2978,66 +1872,74 @@ static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) {
 
 static void setup_quantization(AV1_COMMON *const cm,
                                struct aom_read_bit_buffer *rb) {
+  const int num_planes = av1_num_planes(cm);
   cm->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
   cm->y_dc_delta_q = read_delta_q(rb);
-  cm->uv_dc_delta_q = read_delta_q(rb);
-  cm->uv_ac_delta_q = read_delta_q(rb);
+  if (num_planes > 1) {
+    int diff_uv_delta = 0;
+    if (cm->separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb);
+    cm->u_dc_delta_q = read_delta_q(rb);
+    cm->u_ac_delta_q = read_delta_q(rb);
+    if (diff_uv_delta) {
+      cm->v_dc_delta_q = read_delta_q(rb);
+      cm->v_ac_delta_q = read_delta_q(rb);
+    } else {
+      cm->v_dc_delta_q = cm->u_dc_delta_q;
+      cm->v_ac_delta_q = cm->u_ac_delta_q;
+    }
+  }
   cm->dequant_bit_depth = cm->bit_depth;
-#if CONFIG_AOM_QM
   cm->using_qmatrix = aom_rb_read_bit(rb);
   if (cm->using_qmatrix) {
-    cm->min_qmlevel = aom_rb_read_literal(rb, QM_LEVEL_BITS);
-    cm->max_qmlevel = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+    cm->qm_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+    cm->qm_u = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+    if (!cm->separate_uv_delta_q)
+      cm->qm_v = cm->qm_u;
+    else
+      cm->qm_v = aom_rb_read_literal(rb, QM_LEVEL_BITS);
   } else {
-    cm->min_qmlevel = 0;
-    cm->max_qmlevel = 0;
+    cm->qm_y = 0;
+    cm->qm_u = 0;
+    cm->qm_v = 0;
   }
-#endif
 }
 
 // Build y/uv dequant values based on segmentation.
 static void setup_segmentation_dequant(AV1_COMMON *const cm) {
-#if CONFIG_AOM_QM
   const int using_qm = cm->using_qmatrix;
-  const int minqm = cm->min_qmlevel;
-  const int maxqm = cm->max_qmlevel;
-#endif
   // When segmentation is disabled, only the first value is used.  The
   // remaining are don't cares.
   const int max_segments = cm->seg.enabled ? MAX_SEGMENTS : 1;
   for (int i = 0; i < max_segments; ++i) {
     const int qindex = av1_get_qindex(&cm->seg, i, cm->base_qindex);
-    cm->y_dequant[i][0] = av1_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth);
-    cm->y_dequant[i][1] = av1_ac_quant(qindex, 0, cm->bit_depth);
-    cm->uv_dequant[i][0] =
-        av1_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth);
-    cm->uv_dequant[i][1] =
-        av1_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth);
-#if CONFIG_AOM_QM
+    cm->y_dequant_QTX[i][0] =
+        av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, cm->bit_depth);
+    cm->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, cm->bit_depth);
+    cm->u_dequant_QTX[i][0] =
+        av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, cm->bit_depth);
+    cm->u_dequant_QTX[i][1] =
+        av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, cm->bit_depth);
+    cm->v_dequant_QTX[i][0] =
+        av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, cm->bit_depth);
+    cm->v_dequant_QTX[i][1] =
+        av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, cm->bit_depth);
     const int lossless = qindex == 0 && cm->y_dc_delta_q == 0 &&
-                         cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+                         cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
+                         cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
     // NB: depends on base index so there is only 1 set per frame
     // No quant weighting when lossless or signalled not using QM
-    const int qmlevel = (lossless || using_qm == 0)
-                            ? NUM_QM_LEVELS - 1
-                            : aom_get_qmlevel(cm->base_qindex, minqm, maxqm);
+    int qmlevel = (lossless || using_qm == 0) ? NUM_QM_LEVELS - 1 : cm->qm_y;
     for (int j = 0; j < TX_SIZES_ALL; ++j) {
-      cm->y_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 0, j, 1);
-      cm->y_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 0, j, 0);
-      cm->uv_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 1, j, 1);
-      cm->uv_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 1, j, 0);
+      cm->y_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_Y, j);
     }
-#endif  // CONFIG_AOM_QM
-#if CONFIG_NEW_QUANT
-    for (int dq = 0; dq < QUANT_PROFILES; dq++) {
-      for (int b = 0; b < COEF_BANDS; ++b) {
-        av1_get_dequant_val_nuq(cm->y_dequant[i][b != 0], b,
-                                cm->y_dequant_nuq[i][dq][b], NULL, dq);
-        av1_get_dequant_val_nuq(cm->uv_dequant[i][b != 0], b,
-                                cm->uv_dequant_nuq[i][dq][b], NULL, dq);
-      }
+    qmlevel = (lossless || using_qm == 0) ? NUM_QM_LEVELS - 1 : cm->qm_u;
+    for (int j = 0; j < TX_SIZES_ALL; ++j) {
+      cm->u_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_U, j);
+    }
+    qmlevel = (lossless || using_qm == 0) ? NUM_QM_LEVELS - 1 : cm->qm_v;
+    for (int j = 0; j < TX_SIZES_ALL; ++j) {
+      cm->v_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_V, j);
     }
-#endif  //  CONFIG_NEW_QUANT
   }
 }
 
@@ -3047,23 +1949,21 @@ static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) {
 }
 
 static void setup_render_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
-#if CONFIG_FRAME_SUPERRES
   cm->render_width = cm->superres_upscaled_width;
   cm->render_height = cm->superres_upscaled_height;
-#else
-  cm->render_width = cm->width;
-  cm->render_height = cm->height;
-#endif  // CONFIG_FRAME_SUPERRES
   if (aom_rb_read_bit(rb))
-    av1_read_frame_size(rb, &cm->render_width, &cm->render_height);
+    av1_read_frame_size(rb, 16, 16, &cm->render_width, &cm->render_height);
 }
 
-#if CONFIG_FRAME_SUPERRES
 // TODO(afergs): make "struct aom_read_bit_buffer *const rb"?
 static void setup_superres(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb,
                            int *width, int *height) {
   cm->superres_upscaled_width = *width;
   cm->superres_upscaled_height = *height;
+
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  if (!seq_params->enable_superres) return;
+
   if (aom_rb_read_bit(rb)) {
     cm->superres_scale_denominator =
         (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS);
@@ -3077,7 +1977,6 @@ static void setup_superres(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb,
     cm->superres_scale_denominator = SCALE_NUMERATOR;
   }
 }
-#endif  // CONFIG_FRAME_SUPERRES
 
 static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
 #if CONFIG_SIZE_LIMIT
@@ -3111,24 +2010,34 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
   cm->cur_frame->height = cm->height;
 }
 
-static void setup_frame_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
+                             struct aom_read_bit_buffer *rb) {
   int width, height;
   BufferPool *const pool = cm->buffer_pool;
-  av1_read_frame_size(rb, &width, &height);
-#if CONFIG_FRAME_SUPERRES
+
+  if (frame_size_override_flag) {
+    int num_bits_width = cm->seq_params.num_bits_width;
+    int num_bits_height = cm->seq_params.num_bits_height;
+    av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
+    if (width > cm->seq_params.max_frame_width ||
+        height > cm->seq_params.max_frame_height) {
+      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                         "Frame dimensions are larger than the maximum values");
+    }
+  } else {
+    width = cm->seq_params.max_frame_width;
+    height = cm->seq_params.max_frame_height;
+  }
+
   setup_superres(cm, rb, &width, &height);
-#endif  // CONFIG_FRAME_SUPERRES
-  setup_render_size(cm, rb);
   resize_context_buffers(cm, width, height);
+  setup_render_size(cm, rb);
 
   lock_buffer_pool(pool);
   if (aom_realloc_frame_buffer(
           get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
-          cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-          cm->use_highbitdepth,
-#endif
-          AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+          cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          cm->byte_alignment,
           &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
           pool->cb_priv)) {
     unlock_buffer_pool(pool);
@@ -3140,25 +2049,22 @@ static void setup_frame_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
   pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
-  pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
-#if CONFIG_COLORSPACE_HEADERS
-  pool->frame_bufs[cm->new_fb_idx].buf.transfer_function =
-      cm->transfer_function;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = cm->color_primaries;
+  pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics =
+      cm->transfer_characteristics;
+  pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients =
+      cm->matrix_coefficients;
+  pool->frame_bufs[cm->new_fb_idx].buf.monochrome = cm->seq_params.monochrome;
   pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position =
       cm->chroma_sample_position;
-#endif
   pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
   pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
   pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
 }
 
-static void setup_sb_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
-  (void)rb;
-#if CONFIG_EXT_PARTITION
-  set_sb_size(cm, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
-#else
-  set_sb_size(cm, BLOCK_64X64);
-#endif  // CONFIG_EXT_PARTITION
+static void setup_sb_size(SequenceHeader *seq_params,
+                          struct aom_read_bit_buffer *rb) {
+  set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
 }
 
 static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth,
@@ -3172,29 +2078,30 @@ static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth,
 static void setup_frame_size_with_refs(AV1_COMMON *cm,
                                        struct aom_read_bit_buffer *rb) {
   int width, height;
-  int found = 0, i;
+  int found = 0;
   int has_valid_ref_frame = 0;
   BufferPool *const pool = cm->buffer_pool;
-  for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
     if (aom_rb_read_bit(rb)) {
       YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
       width = buf->y_crop_width;
       height = buf->y_crop_height;
       cm->render_width = buf->render_width;
       cm->render_height = buf->render_height;
-#if CONFIG_FRAME_SUPERRES
       setup_superres(cm, rb, &width, &height);
-#endif  // CONFIG_FRAME_SUPERRES
+      resize_context_buffers(cm, width, height);
       found = 1;
       break;
     }
   }
 
   if (!found) {
-    av1_read_frame_size(rb, &width, &height);
-#if CONFIG_FRAME_SUPERRES
+    int num_bits_width = cm->seq_params.num_bits_width;
+    int num_bits_height = cm->seq_params.num_bits_height;
+
+    av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
     setup_superres(cm, rb, &width, &height);
-#endif  // CONFIG_FRAME_SUPERRES
+    resize_context_buffers(cm, width, height);
     setup_render_size(cm, rb);
   }
 
@@ -3204,7 +2111,7 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
 
   // Check to make sure at least one of frames that this frame references
   // has valid dimensions.
-  for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
     RefBuffer *const ref_frame = &cm->frame_refs[i];
     has_valid_ref_frame |=
         valid_ref_frame_size(ref_frame->buf->y_crop_width,
@@ -3213,7 +2120,7 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
   if (!has_valid_ref_frame)
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Referenced frame has invalid size");
-  for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
     RefBuffer *const ref_frame = &cm->frame_refs[i];
     if (!valid_ref_frame_img_fmt(ref_frame->buf->bit_depth,
                                  ref_frame->buf->subsampling_x,
@@ -3223,16 +2130,11 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
                          "Referenced frame has incompatible color format");
   }
 
-  resize_context_buffers(cm, width, height);
-
   lock_buffer_pool(pool);
   if (aom_realloc_frame_buffer(
           get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
-          cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-          cm->use_highbitdepth,
-#endif
-          AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+          cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          cm->byte_alignment,
           &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
           pool->cb_priv)) {
     unlock_buffer_pool(pool);
@@ -3244,33 +2146,19 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
   pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
-  pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
-#if CONFIG_COLORSPACE_HEADERS
-  pool->frame_bufs[cm->new_fb_idx].buf.transfer_function =
-      cm->transfer_function;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = cm->color_primaries;
+  pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics =
+      cm->transfer_characteristics;
+  pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients =
+      cm->matrix_coefficients;
+  pool->frame_bufs[cm->new_fb_idx].buf.monochrome = cm->seq_params.monochrome;
   pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position =
       cm->chroma_sample_position;
-#endif
   pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
   pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
   pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
 }
 
-static void read_tile_group_range(AV1Decoder *pbi,
-                                  struct aom_read_bit_buffer *const rb) {
-  AV1_COMMON *const cm = &pbi->common;
-  const int num_bits = cm->log2_tile_rows + cm->log2_tile_cols;
-  const int num_tiles =
-      cm->tile_rows * cm->tile_cols;  // Note: May be < (1<<num_bits)
-  pbi->tg_start = aom_rb_read_literal(rb, num_bits);
-  pbi->tg_size = 1 + aom_rb_read_literal(rb, num_bits);
-  if (pbi->tg_start + pbi->tg_size > num_tiles)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "Tile group extends past last tile in frame");
-}
-
-#if CONFIG_MAX_TILE
-
 // Same function as av1_read_uniform but reading from uncompresses header wb
 static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
   const int l = get_unsigned_bits(n);
@@ -3285,11 +2173,10 @@ static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
 
 static void read_tile_info_max_tile(AV1_COMMON *const cm,
                                     struct aom_read_bit_buffer *const rb) {
-  int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
-  int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
-  int width_sb = width_mi >> MAX_MIB_SIZE_LOG2;
-  int height_sb = height_mi >> MAX_MIB_SIZE_LOG2;
-  int start_sb, size_sb, i;
+  int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
+  int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+  int width_sb = width_mi >> cm->seq_params.mib_size_log2;
+  int height_sb = height_mi >> cm->seq_params.mib_size_log2;
 
   av1_get_tile_limits(cm);
   cm->uniform_tile_spacing_flag = aom_rb_read_bit(rb);
@@ -3304,8 +2191,11 @@ static void read_tile_info_max_tile(AV1_COMMON *const cm,
       cm->log2_tile_cols++;
     }
   } else {
+    int i;
+    int start_sb;
     for (i = 0, start_sb = 0; width_sb > 0 && i < MAX_TILE_COLS; i++) {
-      size_sb = 1 + rb_read_uniform(rb, AOMMIN(width_sb, MAX_TILE_WIDTH_SB));
+      const int size_sb =
+          1 + rb_read_uniform(rb, AOMMIN(width_sb, cm->max_tile_width_sb));
       cm->tile_col_start_sb[i] = start_sb;
       start_sb += size_sb;
       width_sb -= size_sb;
@@ -3325,8 +2215,10 @@ static void read_tile_info_max_tile(AV1_COMMON *const cm,
       cm->log2_tile_rows++;
     }
   } else {
+    int i;
+    int start_sb;
     for (i = 0, start_sb = 0; height_sb > 0 && i < MAX_TILE_ROWS; i++) {
-      size_sb =
+      const int size_sb =
           1 + rb_read_uniform(rb, AOMMIN(height_sb, cm->max_tile_height_sb));
       cm->tile_row_start_sb[i] = start_sb;
       start_sb += size_sb;
@@ -3337,110 +2229,61 @@ static void read_tile_info_max_tile(AV1_COMMON *const cm,
   }
   av1_calculate_tile_rows(cm);
 }
-#endif
 
-static void read_tile_info(AV1Decoder *const pbi,
-                           struct aom_read_bit_buffer *const rb) {
-  AV1_COMMON *const cm = &pbi->common;
-#if CONFIG_EXT_TILE
+void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) {
   cm->single_tile_decoding = 0;
   if (cm->large_scale_tile) {
     struct loopfilter *lf = &cm->lf;
 
     // Figure out single_tile_decoding by loopfilter_level.
-    cm->single_tile_decoding = (!lf->filter_level) ? 1 : 0;
-// Read the tile width/height
-#if CONFIG_EXT_PARTITION
-    if (cm->sb_size == BLOCK_128X128) {
-      cm->tile_width = aom_rb_read_literal(rb, 5) + 1;
-      cm->tile_height = aom_rb_read_literal(rb, 5) + 1;
-    } else {
-#endif  // CONFIG_EXT_PARTITION
-      cm->tile_width = aom_rb_read_literal(rb, 6) + 1;
-      cm->tile_height = aom_rb_read_literal(rb, 6) + 1;
-#if CONFIG_EXT_PARTITION
-    }
-#endif  // CONFIG_EXT_PARTITION
-
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-    cm->loop_filter_across_tiles_enabled = aom_rb_read_bit(rb);
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
-
-    cm->tile_width <<= cm->mib_size_log2;
-    cm->tile_height <<= cm->mib_size_log2;
-
-    cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
-    cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
-
-    // Get the number of tiles
-    cm->tile_cols = 1;
-    while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols;
-
-    cm->tile_rows = 1;
-    while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows;
-
-    if (cm->tile_cols * cm->tile_rows > 1) {
-      // Read the number of bytes used to store tile size
-      pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1;
-      pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
-    }
-
-#if CONFIG_DEPENDENT_HORZTILES
-    cm->dependent_horz_tiles = 0;
-#endif
-  } else {
-#endif  // CONFIG_EXT_TILE
-
-#if CONFIG_MAX_TILE
-    read_tile_info_max_tile(cm, rb);
-#else
-  int min_log2_tile_cols, max_log2_tile_cols, max_ones;
-  av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+    const int no_loopfilter = !(lf->filter_level[0] || lf->filter_level[1]);
+    const int no_cdef = cm->cdef_bits == 0 && cm->cdef_strengths[0] == 0 &&
+                        cm->cdef_uv_strengths[0] == 0;
+    const int no_restoration =
+        cm->rst_info[0].frame_restoration_type == RESTORE_NONE &&
+        cm->rst_info[1].frame_restoration_type == RESTORE_NONE &&
+        cm->rst_info[2].frame_restoration_type == RESTORE_NONE;
+    assert(IMPLIES(cm->coded_lossless, no_loopfilter && no_cdef));
+    assert(IMPLIES(cm->all_lossless, no_restoration));
+    cm->single_tile_decoding = no_loopfilter && no_cdef && no_restoration;
+  }
+}
 
-  // columns
-  max_ones = max_log2_tile_cols - min_log2_tile_cols;
-  cm->log2_tile_cols = min_log2_tile_cols;
-  while (max_ones-- && aom_rb_read_bit(rb)) cm->log2_tile_cols++;
+static void read_tile_info(AV1Decoder *const pbi,
+                           struct aom_read_bit_buffer *const rb) {
+  AV1_COMMON *const cm = &pbi->common;
 
-  if (cm->log2_tile_cols > 6)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "Invalid number of tile columns");
+  read_tile_info_max_tile(cm, rb);
 
-  // rows
-  cm->log2_tile_rows = aom_rb_read_bit(rb);
-  if (cm->log2_tile_rows) cm->log2_tile_rows += aom_rb_read_bit(rb);
+  cm->context_update_tile_id = 0;
+  if (cm->tile_rows * cm->tile_cols > 1) {
+    // tile to use for cdf update
+    cm->context_update_tile_id =
+        aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+    // tile size magnitude
+    pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+  }
+}
 
-  cm->tile_width =
-      get_tile_size(cm->mi_cols, cm->log2_tile_cols, &cm->tile_cols);
-  cm->tile_height =
-      get_tile_size(cm->mi_rows, cm->log2_tile_rows, &cm->tile_rows);
+#if EXT_TILE_DEBUG
+static void read_ext_tile_info(AV1Decoder *const pbi,
+                               struct aom_read_bit_buffer *const rb) {
+  AV1_COMMON *const cm = &pbi->common;
 
-#endif  // CONFIG_MAX_TILE
-#if CONFIG_DEPENDENT_HORZTILES
-    if (cm->tile_rows > 1)
-      cm->dependent_horz_tiles = aom_rb_read_bit(rb);
-    else
-      cm->dependent_horz_tiles = 0;
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-    cm->loop_filter_across_tiles_enabled = aom_rb_read_bit(rb);
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
+  // This information is stored as a separate byte.
+  int mod = rb->bit_offset % CHAR_BIT;
+  if (mod > 0) aom_rb_read_literal(rb, CHAR_BIT - mod);
+  assert(rb->bit_offset % CHAR_BIT == 0);
 
-    // tile size magnitude
+  if (cm->tile_cols * cm->tile_rows > 1) {
+    // Read the number of bytes used to store tile size
+    pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1;
     pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
-#if CONFIG_EXT_TILE
   }
-#endif  // CONFIG_EXT_TILE
-
-// each tile group header is in its own tile group OBU
-#if !CONFIG_OBU
-  // Store an index to the location of the tile group information
-  pbi->tg_size_bit_offset = rb->bit_offset;
-  read_tile_group_range(pbi, rb);
-#endif
 }
+#endif  // EXT_TILE_DEBUG
 
-static int mem_get_varsize(const uint8_t *src, int sz) {
+static size_t mem_get_varsize(const uint8_t *src, int sz) {
   switch (sz) {
     case 1: return src[0];
     case 2: return mem_get_le16(src);
@@ -3450,14 +2293,14 @@ static int mem_get_varsize(const uint8_t *src, int sz) {
   }
 }
 
-#if CONFIG_EXT_TILE
+#if EXT_TILE_DEBUG
 // Reads the next tile returning its size and adjusting '*data' accordingly
-// based on 'is_last'.
+// based on 'is_last'. On return, '*data' is updated to point to the end of the
+// raw tile buffer in the bit stream.
 static void get_ls_tile_buffer(
     const uint8_t *const data_end, struct aom_internal_error_info *error_info,
-    const uint8_t **data, aom_decrypt_cb decrypt_cb, void *decrypt_state,
-    TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], int tile_size_bytes,
-    int col, int row, int tile_copy_mode) {
+    const uint8_t **data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
+    int tile_size_bytes, int col, int row, int tile_copy_mode) {
   size_t size;
 
   size_t copy_size = 0;
@@ -3466,15 +2309,7 @@ static void get_ls_tile_buffer(
   if (!read_is_valid(*data, tile_size_bytes, data_end))
     aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt tile length");
-  if (decrypt_cb) {
-    uint8_t be_data[4];
-    decrypt_cb(decrypt_state, *data, be_data, tile_size_bytes);
-
-    // Only read number of bytes in cm->tile_size_bytes.
-    size = mem_get_varsize(be_data, tile_size_bytes);
-  } else {
-    size = mem_get_varsize(*data, tile_size_bytes);
-  }
+  size = mem_get_varsize(*data, tile_size_bytes);
 
   // If tile_copy_mode = 1, then the top bit of the tile header indicates copy
   // mode.
@@ -3486,6 +2321,8 @@ static void get_ls_tile_buffer(
     copy_data = tile_buffers[row - offset][col].data;
     copy_size = tile_buffers[row - offset][col].size;
     size = 0;
+  } else {
+    size += AV1_MIN_TILE_SIZE_BYTES;
   }
 
   *data += tile_size_bytes;
@@ -3503,30 +2340,31 @@ static void get_ls_tile_buffer(
   }
 
   *data += size;
-
-  tile_buffers[row][col].raw_data_end = *data;
 }
 
-static void get_ls_tile_buffers(
+// Returns the end of the last tile buffer
+// (tile_buffers[cm->tile_rows - 1][cm->tile_cols - 1]).
+static const uint8_t *get_ls_tile_buffers(
     AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
     TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
   AV1_COMMON *const cm = &pbi->common;
   const int tile_cols = cm->tile_cols;
   const int tile_rows = cm->tile_rows;
   const int have_tiles = tile_cols * tile_rows > 1;
+  const uint8_t *raw_data_end;  // The end of the last tile buffer
 
   if (!have_tiles) {
     const size_t tile_size = data_end - data;
     tile_buffers[0][0].data = data;
     tile_buffers[0][0].size = tile_size;
-    tile_buffers[0][0].raw_data_end = NULL;
+    raw_data_end = NULL;
   } else {
     // We locate only the tile buffers that are required, which are the ones
     // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always
     // need the last (bottom right) tile buffer, as we need to know where the
     // end of the compressed frame buffer is for proper superframe decoding.
 
-    const uint8_t *tile_col_data_end[MAX_TILE_COLS];
+    const uint8_t *tile_col_data_end[MAX_TILE_COLS] = { NULL };
     const uint8_t *const data_start = data;
 
     const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
@@ -3543,12 +2381,11 @@ static void get_ls_tile_buffers(
     const int tile_copy_mode =
         ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) ? 1
                                                                            : 0;
-    size_t tile_col_size;
-    int r, c;
-
     // Read tile column sizes for all columns (we need the last tile buffer)
-    for (c = 0; c < tile_cols; ++c) {
+    for (int c = 0; c < tile_cols; ++c) {
       const int is_last = c == tile_cols - 1;
+      size_t tile_col_size;
+
       if (!is_last) {
         tile_col_size = mem_get_varsize(data, tile_col_size_bytes);
         data += tile_col_size_bytes;
@@ -3563,7 +2400,7 @@ static void get_ls_tile_buffers(
     data = data_start;
 
     // Read the required tile sizes.
-    for (c = tile_cols_start; c < tile_cols_end; ++c) {
+    for (int c = tile_cols_start; c < tile_cols_end; ++c) {
       const int is_last = c == tile_cols - 1;
 
       if (c > 0) data = tile_col_data_end[c - 1];
@@ -3571,40 +2408,45 @@ static void get_ls_tile_buffers(
       if (!is_last) data += tile_col_size_bytes;
 
       // Get the whole of the last column, otherwise stop at the required tile.
-      for (r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) {
-        tile_buffers[r][c].col = c;
-
+      for (int r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) {
         get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
-                           pbi->decrypt_cb, pbi->decrypt_state, tile_buffers,
-                           tile_size_bytes, c, r, tile_copy_mode);
+                           tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
       }
     }
 
     // If we have not read the last column, then read it to get the last tile.
     if (tile_cols_end != tile_cols) {
-      c = tile_cols - 1;
+      const int c = tile_cols - 1;
 
       data = tile_col_data_end[c - 1];
 
-      for (r = 0; r < tile_rows; ++r) {
-        tile_buffers[r][c].col = c;
-
+      for (int r = 0; r < tile_rows; ++r) {
         get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
-                           pbi->decrypt_cb, pbi->decrypt_state, tile_buffers,
-                           tile_size_bytes, c, r, tile_copy_mode);
+                           tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
       }
     }
+    raw_data_end = data;
   }
+  return raw_data_end;
+}
+#endif  // EXT_TILE_DEBUG
+
+static const uint8_t *get_ls_single_tile_buffer(
+    AV1Decoder *pbi, const uint8_t *data,
+    TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+  assert(pbi->dec_tile_row >= 0 && pbi->dec_tile_col >= 0);
+  tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data = data;
+  tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size =
+      (size_t)pbi->coded_tile_data_size;
+  return data + pbi->coded_tile_data_size;
 }
-#endif  // CONFIG_EXT_TILE
 
 // Reads the next tile returning its size and adjusting '*data' accordingly
 // based on 'is_last'.
 static void get_tile_buffer(const uint8_t *const data_end,
                             const int tile_size_bytes, int is_last,
                             struct aom_internal_error_info *error_info,
-                            const uint8_t **data, aom_decrypt_cb decrypt_cb,
-                            void *decrypt_state, TileBufferDec *const buf) {
+                            const uint8_t **data, TileBufferDec *const buf) {
   size_t size;
 
   if (!is_last) {
@@ -3612,13 +2454,7 @@ static void get_tile_buffer(const uint8_t *const data_end,
       aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
                          "Truncated packet or corrupt tile length");
 
-    if (decrypt_cb) {
-      uint8_t be_data[4];
-      decrypt_cb(decrypt_state, *data, be_data, tile_size_bytes);
-      size = mem_get_varsize(be_data, tile_size_bytes);
-    } else {
-      size = mem_get_varsize(*data, tile_size_bytes);
-    }
+    size = mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES;
     *data += tile_size_bytes;
 
     if (size > (size_t)(data_end - *data))
@@ -3637,140 +2473,123 @@ static void get_tile_buffer(const uint8_t *const data_end,
 static void get_tile_buffers(AV1Decoder *pbi, const uint8_t *data,
                              const uint8_t *data_end,
                              TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
-                             int startTile, int endTile) {
+                             int start_tile, int end_tile) {
   AV1_COMMON *const cm = &pbi->common;
-  int r, c;
   const int tile_cols = cm->tile_cols;
   const int tile_rows = cm->tile_rows;
   int tc = 0;
   int first_tile_in_tg = 0;
-  struct aom_read_bit_buffer rb_tg_hdr;
-  uint8_t clear_data[MAX_AV1_HEADER_SIZE];
-#if !CONFIG_OBU
-  const size_t hdr_size = pbi->uncomp_hdr_size + pbi->first_partition_size;
-  const int tg_size_bit_offset = pbi->tg_size_bit_offset;
-#else
-  const int tg_size_bit_offset = 0;
-#endif
-
-#if CONFIG_DEPENDENT_HORZTILES
-  int tile_group_start_col = 0;
-  int tile_group_start_row = 0;
-#endif
 
-  for (r = 0; r < tile_rows; ++r) {
-    for (c = 0; c < tile_cols; ++c, ++tc) {
+  for (int r = 0; r < tile_rows; ++r) {
+    for (int c = 0; c < tile_cols; ++c, ++tc) {
       TileBufferDec *const buf = &tile_buffers[r][c];
-#if CONFIG_OBU
-      const int is_last = (tc == endTile);
+
+      const int is_last = (tc == end_tile);
       const size_t hdr_offset = 0;
-#else
-      const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1);
-      const size_t hdr_offset = (tc && tc == first_tile_in_tg) ? hdr_size : 0;
-#endif
 
-      if (tc < startTile || tc > endTile) continue;
+      if (tc < start_tile || tc > end_tile) continue;
 
       if (data + hdr_offset >= data_end)
         aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                            "Data ended before all tiles were read.");
-      buf->col = c;
-      if (hdr_offset) {
-        init_read_bit_buffer(pbi, &rb_tg_hdr, data, data_end, clear_data);
-        rb_tg_hdr.bit_offset = tg_size_bit_offset;
-        read_tile_group_range(pbi, &rb_tg_hdr);
-#if CONFIG_DEPENDENT_HORZTILES
-        tile_group_start_row = r;
-        tile_group_start_col = c;
-#endif
-      }
       first_tile_in_tg += tc == first_tile_in_tg ? pbi->tg_size : 0;
       data += hdr_offset;
       get_tile_buffer(data_end, pbi->tile_size_bytes, is_last,
-                      &pbi->common.error, &data, pbi->decrypt_cb,
-                      pbi->decrypt_state, buf);
-#if CONFIG_DEPENDENT_HORZTILES
-      cm->tile_group_start_row[r][c] = tile_group_start_row;
-      cm->tile_group_start_col[r][c] = tile_group_start_col;
-#endif
+                      &pbi->common.error, &data, buf);
     }
   }
 }
 
-#if CONFIG_PVQ
-static void daala_dec_init(AV1_COMMON *const cm, daala_dec_ctx *daala_dec,
-                           aom_reader *r) {
-  daala_dec->r = r;
+static void set_cb_buffer(MACROBLOCKD *const xd, CB_BUFFER *cb_buffer,
+                          const int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
+    xd->plane[plane].dqcoeff_block = cb_buffer->dqcoeff[plane];
+    xd->plane[plane].eob_data = cb_buffer->eob_data[plane];
+    xd->cb_offset[plane] = 0;
+    xd->txb_offset[plane] = 0;
+  }
+  xd->plane[0].color_index_map = cb_buffer->color_index_map[0];
+  xd->plane[1].color_index_map = cb_buffer->color_index_map[1];
+  xd->color_index_map_offset[0] = 0;
+  xd->color_index_map_offset[1] = 0;
+}
 
-  // TODO(yushin) : activity masking info needs be signaled by a bitstream
-  daala_dec->use_activity_masking = AV1_PVQ_ENABLE_ACTIVITY_MASKING;
+static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
+                               TileInfo tile_info, const int mi_row) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int num_planes = av1_num_planes(cm);
+  av1_zero_left_context(&td->xd);
 
-  if (daala_dec->use_activity_masking)
-    daala_dec->qm = OD_HVS_QM;
-  else
-    daala_dec->qm = OD_FLAT_QM;
+  for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+       mi_col += cm->seq_params.mib_size) {
+    set_cb_buffer(&td->xd, &td->cb_buffer_base, num_planes);
 
-  od_init_qm(daala_dec->state.qm, daala_dec->state.qm_inv,
-             daala_dec->qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
+    decode_partition(pbi, &td->xd, mi_row, mi_col, td->bit_reader,
+                     cm->seq_params.sb_size);
+  }
+}
 
-  if (daala_dec->use_activity_masking) {
-    int pli;
-    int use_masking = daala_dec->use_activity_masking;
-    int segment_id = 0;
-    int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
+  uint32_t nb_bits = aom_reader_tell(r);
+  uint32_t nb_bytes = (nb_bits + 7) >> 3;
 
-    for (pli = 0; pli < MAX_MB_PLANE; pli++) {
-      int i;
-      int q;
+  const uint8_t *p_begin = aom_reader_find_begin(r);
+  const uint8_t *p_end = aom_reader_find_end(r);
 
-      q = qindex;
-      if (q <= OD_DEFAULT_QMS[use_masking][0][pli].interp_q << OD_COEFF_SHIFT) {
-        od_interp_qm(&daala_dec->state.pvq_qm_q4[pli][0], q,
-                     &OD_DEFAULT_QMS[use_masking][0][pli], NULL);
-      } else {
-        i = 0;
-        while (OD_DEFAULT_QMS[use_masking][i + 1][pli].qm_q4 != NULL &&
-               q > OD_DEFAULT_QMS[use_masking][i + 1][pli].interp_q
-                       << OD_COEFF_SHIFT) {
-          i++;
-        }
-        od_interp_qm(&daala_dec->state.pvq_qm_q4[pli][0], q,
-                     &OD_DEFAULT_QMS[use_masking][i][pli],
-                     &OD_DEFAULT_QMS[use_masking][i + 1][pli]);
-      }
-    }
+  // It is legal to have no padding bytes (nb_bytes == p_end - p_begin).
+  if ((ptrdiff_t)nb_bytes > p_end - p_begin) return -1;
+  const uint8_t *p = p_begin + nb_bytes;
+
+  // aom_reader_tell() returns 1 for a newly initialized decoder, and the
+  // return value only increases as values are decoded. So nb_bits > 0, and
+  // thus p > p_begin. Therefore accessing p[-1] is safe.
+  uint8_t last_byte = p[-1];
+  uint8_t pattern = 128 >> ((nb_bits - 1) & 7);
+  if ((last_byte & (2 * pattern - 1)) != pattern) return -1;
+
+  // Make sure that all padding bytes are zero as required by the spec.
+  while (p < p_end) {
+    if (*p != 0) return -1;
+    p++;
   }
+  return 0;
 }
-#endif  // #if CONFIG_PVQ
 
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-static void dec_setup_across_tile_boundary_info(
-    const AV1_COMMON *const cm, const TileInfo *const tile_info) {
-  if (tile_info->mi_row_start >= tile_info->mi_row_end ||
-      tile_info->mi_col_start >= tile_info->mi_col_end)
-    return;
+static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row,
+                        int tile_col) {
+  TileInfo tile_info;
+
+  AV1_COMMON *const cm = &pbi->common;
+  const int num_planes = av1_num_planes(cm);
 
-  if (!cm->loop_filter_across_tiles_enabled) {
-    av1_setup_across_tile_boundary_info(cm, tile_info);
+  av1_tile_set_row(&tile_info, cm, tile_row);
+  av1_tile_set_col(&tile_info, cm, tile_col);
+  av1_zero_above_context(cm, tile_info.mi_col_start, tile_info.mi_col_end,
+                         tile_row);
+  av1_reset_loop_restoration(&td->xd, num_planes);
+
+  for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+       mi_row += cm->seq_params.mib_size) {
+    decode_tile_sb_row(pbi, td, tile_info, mi_row);
   }
+
+  int corrupted =
+      (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
+  aom_merge_corrupted_flag(&td->xd.corrupted, corrupted);
 }
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
 
 static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
-                                   const uint8_t *data_end, int startTile,
-                                   int endTile) {
+                                   const uint8_t *data_end, int start_tile,
+                                   int end_tile) {
   AV1_COMMON *const cm = &pbi->common;
-  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   const int tile_cols = cm->tile_cols;
   const int tile_rows = cm->tile_rows;
   const int n_tiles = tile_cols * tile_rows;
   TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
-#if CONFIG_EXT_TILE
   const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
   const int single_row = pbi->dec_tile_row >= 0;
   const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
   const int single_col = pbi->dec_tile_col >= 0;
-#endif  // CONFIG_EXT_TILE
   int tile_rows_start;
   int tile_rows_end;
   int tile_cols_start;
@@ -3778,8 +2597,9 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
   int inv_col_order;
   int inv_row_order;
   int tile_row, tile_col;
+  uint8_t allow_update_cdf;
+  const uint8_t *raw_data_end = NULL;
 
-#if CONFIG_EXT_TILE
   if (cm->large_scale_tile) {
     tile_rows_start = single_row ? dec_tile_row : 0;
     tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
@@ -3787,46 +2607,38 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
     tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
     inv_col_order = pbi->inv_tile_order && !single_col;
     inv_row_order = pbi->inv_tile_order && !single_row;
+    allow_update_cdf = 0;
   } else {
-#endif  // CONFIG_EXT_TILE
     tile_rows_start = 0;
     tile_rows_end = tile_rows;
     tile_cols_start = 0;
     tile_cols_end = tile_cols;
     inv_col_order = pbi->inv_tile_order;
     inv_row_order = pbi->inv_tile_order;
-#if CONFIG_EXT_TILE
-  }
-#endif  // CONFIG_EXT_TILE
-
-  if (cm->lf.filter_level && !cm->skip_loop_filter &&
-      pbi->lf_worker.data1 == NULL) {
-    CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
-                    aom_memalign(32, sizeof(LFWorkerData)));
-    pbi->lf_worker.hook = (AVxWorkerHook)av1_loop_filter_worker;
-    if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
-      aom_internal_error(&cm->error, AOM_CODEC_ERROR,
-                         "Loop filter thread creation failed");
-    }
+    allow_update_cdf = 1;
   }
 
-  if (cm->lf.filter_level && !cm->skip_loop_filter) {
-    LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
-    // Be sure to sync as we might be resuming after a failed frame decode.
-    winterface->sync(&pbi->lf_worker);
-    av1_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
-                               pbi->mb.plane);
-  }
+  // No tiles to decode.
+  if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+      // First tile is larger than end_tile.
+      tile_rows_start * cm->tile_cols + tile_cols_start > end_tile ||
+      // Last tile is smaller than start_tile.
+      (tile_rows_end - 1) * cm->tile_cols + tile_cols_end - 1 < start_tile)
+    return data;
+
+  allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
 
   assert(tile_rows <= MAX_TILE_ROWS);
   assert(tile_cols <= MAX_TILE_COLS);
 
-#if CONFIG_EXT_TILE
-  if (cm->large_scale_tile)
-    get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+#if EXT_TILE_DEBUG
+  if (cm->large_scale_tile && !pbi->ext_tile_debug)
+    raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers);
+  else if (cm->large_scale_tile && pbi->ext_tile_debug)
+    raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
   else
-#endif  // CONFIG_EXT_TILE
-    get_tile_buffers(pbi, data, data_end, tile_buffers, startTile, endTile);
+#endif  // EXT_TILE_DEBUG
+    get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
 
   if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
     aom_free(pbi->tile_data);
@@ -3839,536 +2651,411 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
     aom_accounting_reset(&pbi->accounting);
   }
 #endif
-  // Load all tile information into tile_data.
+  // Load all tile information into thread_data.
   for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+    const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
+
     for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
-      const TileBufferDec *const buf = &tile_buffers[tile_row][tile_col];
-      TileData *const td = pbi->tile_data + tile_cols * tile_row + tile_col;
+      const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
+      ThreadData *const td = &pbi->td;
+      TileDataDec *const tile_data = pbi->tile_data + row * cm->tile_cols + col;
+      const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col];
 
-      if (tile_row * cm->tile_cols + tile_col < startTile ||
-          tile_row * cm->tile_cols + tile_col > endTile)
+      if (row * cm->tile_cols + col < start_tile ||
+          row * cm->tile_cols + col > end_tile)
         continue;
 
-      td->cm = cm;
       td->xd = pbi->mb;
       td->xd.corrupted = 0;
-      td->xd.counts =
-          cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD
-              ? &cm->counts
-              : NULL;
+      td->xd.mc_buf[0] = pbi->td.mc_buf[0];
+      td->xd.mc_buf[1] = pbi->td.mc_buf[1];
+      td->bit_reader = &tile_data->bit_reader;
       av1_zero(td->dqcoeff);
-#if CONFIG_PVQ
-      av1_zero(td->pvq_ref_coeff);
-#endif
-      av1_tile_init(&td->xd.tile, td->cm, tile_row, tile_col);
-      setup_bool_decoder(buf->data, data_end, buf->size, &cm->error,
-                         &td->bit_reader,
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-                         1 << cm->ans_window_size_log2,
-#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-                         pbi->decrypt_cb, pbi->decrypt_state);
+      av1_tile_init(&td->xd.tile, cm, row, col);
+      setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size,
+                         &cm->error, td->bit_reader, allow_update_cdf);
 #if CONFIG_ACCOUNTING
       if (pbi->acct_enabled) {
-        td->bit_reader.accounting = &pbi->accounting;
+        td->bit_reader->accounting = &pbi->accounting;
+        td->bit_reader->accounting->last_tell_frac =
+            aom_reader_tell_frac(td->bit_reader);
       } else {
-        td->bit_reader.accounting = NULL;
+        td->bit_reader->accounting = NULL;
       }
 #endif
-      av1_init_macroblockd(cm, &td->xd,
-#if CONFIG_PVQ
-                           td->pvq_ref_coeff,
-#endif
-#if CONFIG_CFL
-                           &td->cfl,
-#endif
-                           td->dqcoeff);
+      av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+      av1_init_above_context(cm, &td->xd, row);
 
       // Initialise the tile context from the frame context
-      td->tctx = *cm->fc;
-      td->xd.tile_ctx = &td->tctx;
-
-#if CONFIG_PVQ
-      daala_dec_init(cm, &td->xd.daala_dec, &td->bit_reader);
-      td->xd.daala_dec.state.adapt = &td->tctx.pvq_context;
-#endif
+      tile_data->tctx = *cm->fc;
+      td->xd.tile_ctx = &tile_data->tctx;
 
-      td->xd.plane[0].color_index_map = td->color_index_map[0];
-      td->xd.plane[1].color_index_map = td->color_index_map[1];
-#if CONFIG_MRC_TX
-      td->xd.mrc_mask = td->mrc_mask;
-#endif  // CONFIG_MRC_TX
+      // decode tile
+      decode_tile(pbi, &pbi->td, row, col);
+      aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted);
+      if (pbi->mb.corrupted)
+        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                           "Failed to decode tile data");
     }
   }
 
-  for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
-    const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
-    int mi_row = 0;
-    TileInfo tile_info;
-
-    av1_tile_set_row(&tile_info, cm, row);
-
-    for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
-      const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
-      TileData *const td = pbi->tile_data + tile_cols * row + col;
+  if (cm->large_scale_tile) {
+    if (n_tiles == 1) {
+      // Find the end of the single tile buffer
+      return aom_reader_find_end(&pbi->tile_data->bit_reader);
+    }
+    // Return the end of the last tile buffer
+    return raw_data_end;
+  }
+  TileDataDec *const tile_data = pbi->tile_data + end_tile;
 
-      if (tile_row * cm->tile_cols + tile_col < startTile ||
-          tile_row * cm->tile_cols + tile_col > endTile)
-        continue;
+  return aom_reader_find_end(&tile_data->bit_reader);
+}
 
-#if CONFIG_ACCOUNTING
-      if (pbi->acct_enabled) {
-        td->bit_reader.accounting->last_tell_frac =
-            aom_reader_tell_frac(&td->bit_reader);
-      }
-#endif
+static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) {
+  TileJobsDec *cur_job_info = NULL;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(tile_mt_info->job_mutex);
 
-      av1_tile_set_col(&tile_info, cm, col);
+  if (tile_mt_info->jobs_dequeued < tile_mt_info->jobs_enqueued) {
+    cur_job_info = tile_mt_info->job_queue + tile_mt_info->jobs_dequeued;
+    tile_mt_info->jobs_dequeued++;
+  }
 
-#if CONFIG_DEPENDENT_HORZTILES
-      av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col);
-      if (!cm->dependent_horz_tiles || tile_row == 0 ||
-          tile_info.tg_horz_boundary) {
-        av1_zero_above_context(cm, tile_info.mi_col_start,
-                               tile_info.mi_col_end);
-      }
+  pthread_mutex_unlock(tile_mt_info->job_mutex);
 #else
-      av1_zero_above_context(cm, tile_info.mi_col_start, tile_info.mi_col_end);
+  (void)tile_mt_info;
 #endif
-#if CONFIG_LOOP_RESTORATION
-      for (int p = 0; p < MAX_MB_PLANE; ++p) {
-        set_default_wiener(td->xd.wiener_info + p);
-        set_default_sgrproj(td->xd.sgrproj_info + p);
-      }
-#endif  // CONFIG_LOOP_RESTORATION
+  return cur_job_info;
+}
 
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-      dec_setup_across_tile_boundary_info(cm, &tile_info);
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
+static int tile_worker_hook(void *arg1, void *arg2) {
+  DecWorkerData *const thread_data = (DecWorkerData *)arg1;
+  AV1Decoder *const pbi = (AV1Decoder *)arg2;
+  AV1_COMMON *cm = &pbi->common;
+  ThreadData *const td = thread_data->td;
+  uint8_t allow_update_cdf;
 
-      for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
-           mi_row += cm->mib_size) {
-        int mi_col;
+  if (setjmp(thread_data->error_info.jmp)) {
+    thread_data->error_info.setjmp = 0;
+    thread_data->td->xd.corrupted = 1;
+    return 0;
+  }
+  allow_update_cdf = cm->large_scale_tile ? 0 : 1;
+  allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
 
-        av1_zero_left_context(&td->xd);
+  assert(cm->tile_cols > 0);
+  while (1) {
+    TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
 
-        for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-             mi_col += cm->mib_size) {
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-          alloc_ncobmc_pred_buffer(&td->xd);
-          set_sb_mi_boundaries(cm, &td->xd, mi_row, mi_col);
-#endif
-          decode_partition(pbi, &td->xd,
-#if CONFIG_SUPERTX
-                           0,
-#endif  // CONFIG_SUPERTX
-                           mi_row, mi_col, &td->bit_reader, cm->sb_size);
-#if NC_MODE_INFO && CONFIG_MOTION_VAR
-          detoken_and_recon_sb(pbi, &td->xd, mi_row, mi_col, &td->bit_reader,
-                               cm->sb_size);
-#endif
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-          free_ncobmc_pred_buffer(&td->xd);
-#endif
-        }
-        aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted);
-        if (pbi->mb.corrupted)
-          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                             "Failed to decode tile data");
-      }
-    }
-
-#if !CONFIG_OBU
-    assert(mi_row > 0);
-#endif
+    if (cur_job_info != NULL && !td->xd.corrupted) {
+      const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
+      TileDataDec *const tile_data = cur_job_info->tile_data;
+      volatile int tile_row = tile_data->tile_info.tile_row;
+      volatile int tile_col = tile_data->tile_info.tile_col;
 
-// when Parallel deblocking is enabled, deblocking should not
-// be interleaved with decoding. Instead, deblocking should be done
-// after the entire frame is decoded.
-#if !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING && !CONFIG_CB4X4
-    // Loopfilter one tile row.
-    // Note: If out-of-order tile decoding is used(for example, inv_row_order
-    // = 1), the loopfiltering has be done after all tile rows are decoded.
-    if (!inv_row_order && cm->lf.filter_level && !cm->skip_loop_filter) {
-      LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
-      const int lf_start = AOMMAX(0, tile_info.mi_row_start - cm->mib_size);
-      const int lf_end = tile_info.mi_row_end - cm->mib_size;
-
-      // Delay the loopfilter if the first tile row is only
-      // a single superblock high.
-      if (lf_end <= 0) continue;
-
-      // Decoding has completed. Finish up the loop filter in this thread.
-      if (tile_info.mi_row_end >= cm->mi_rows) continue;
-
-      winterface->sync(&pbi->lf_worker);
-      lf_data->start = lf_start;
-      lf_data->stop = lf_end;
-      if (pbi->max_threads > 1) {
-        winterface->launch(&pbi->lf_worker);
+      td->xd = pbi->mb;
+      td->xd.corrupted = 0;
+      td->xd.mc_buf[0] = td->mc_buf[0];
+      td->xd.mc_buf[1] = td->mc_buf[1];
+      td->bit_reader = &tile_data->bit_reader;
+      av1_zero(td->dqcoeff);
+      av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
+      setup_bool_decoder(tile_buffer->data, thread_data->data_end,
+                         tile_buffer->size, &cm->error, td->bit_reader,
+                         allow_update_cdf);
+#if CONFIG_ACCOUNTING
+      if (pbi->acct_enabled) {
+        td->bit_reader->accounting = &pbi->accounting;
+        td->bit_reader->accounting->last_tell_frac =
+            aom_reader_tell_frac(td->bit_reader);
       } else {
-        winterface->execute(&pbi->lf_worker);
+        td->bit_reader->accounting = NULL;
       }
-    }
-#endif  // !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING
-
-    // After loopfiltering, the last 7 row pixels in each superblock row may
-    // still be changed by the longest loopfilter of the next superblock row.
-    if (cm->frame_parallel_decode)
-      av1_frameworker_broadcast(pbi->cur_buf, mi_row << cm->mib_size_log2);
-  }
+#endif
+      av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+      av1_init_above_context(cm, &td->xd, tile_row);
 
-#if CONFIG_VAR_TX || CONFIG_CB4X4
-// Loopfilter the whole frame.
-#if CONFIG_LPF_SB
-  av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
-                        cm->lf.filter_level, 0, 0, 0, 0);
-#else
-#if CONFIG_LOOPFILTER_LEVEL
-  if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
-    av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
-                          cm->lf.filter_level[0], cm->lf.filter_level[1], 0, 0);
-    av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
-                          cm->lf.filter_level_u, cm->lf.filter_level_u, 1, 0);
-    av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
-                          cm->lf.filter_level_v, cm->lf.filter_level_v, 2, 0);
-  }
-#else
-#if CONFIG_OBU
-  if (endTile == cm->tile_rows * cm->tile_cols - 1)
+      // Initialise the tile context from the frame context
+      tile_data->tctx = *cm->fc;
+      td->xd.tile_ctx = &tile_data->tctx;
+#if CONFIG_ACCOUNTING
+      if (pbi->acct_enabled) {
+        tile_data->bit_reader.accounting->last_tell_frac =
+            aom_reader_tell_frac(&tile_data->bit_reader);
+      }
 #endif
-    av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
-                          cm->lf.filter_level, 0, 0);
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif  // CONFIG_LPF_SB
-#else
-#if CONFIG_PARALLEL_DEBLOCKING
-  // Loopfilter all rows in the frame in the frame.
-  if (cm->lf.filter_level && !cm->skip_loop_filter) {
-    LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
-    winterface->sync(&pbi->lf_worker);
-    lf_data->start = 0;
-    lf_data->stop = cm->mi_rows;
-    winterface->execute(&pbi->lf_worker);
-  }
-#else
-  // Loopfilter remaining rows in the frame.
-  if (cm->lf.filter_level && !cm->skip_loop_filter) {
-    LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
-    winterface->sync(&pbi->lf_worker);
-    lf_data->start = lf_data->stop;
-    lf_data->stop = cm->mi_rows;
-    winterface->execute(&pbi->lf_worker);
-  }
-#endif  // CONFIG_PARALLEL_DEBLOCKING
-#endif  // CONFIG_VAR_TX
-  if (cm->frame_parallel_decode)
-    av1_frameworker_broadcast(pbi->cur_buf, INT_MAX);
-
-#if CONFIG_EXT_TILE
-  if (cm->large_scale_tile) {
-    if (n_tiles == 1) {
-#if CONFIG_ANS
-      return data_end;
-#else
-      // Find the end of the single tile buffer
-      return aom_reader_find_end(&pbi->tile_data->bit_reader);
-#endif  // CONFIG_ANS
+      // decode tile
+      decode_tile(pbi, td, tile_row, tile_col);
     } else {
-      // Return the end of the last tile buffer
-      return tile_buffers[tile_rows - 1][tile_cols - 1].raw_data_end;
+      break;
     }
-  } else {
-#endif  // CONFIG_EXT_TILE
-#if CONFIG_ANS
-    return data_end;
-#else
-#if !CONFIG_OBU
-  {
-    // Get last tile data.
-    TileData *const td = pbi->tile_data + tile_cols * tile_rows - 1;
-    return aom_reader_find_end(&td->bit_reader);
-  }
-#else
-  TileData *const td = pbi->tile_data + endTile;
-  return aom_reader_find_end(&td->bit_reader);
-#endif
-#endif  // CONFIG_ANS
-#if CONFIG_EXT_TILE
   }
-#endif  // CONFIG_EXT_TILE
+  return !td->xd.corrupted;
 }
 
-static int tile_worker_hook(TileWorkerData *const tile_data,
-                            const TileInfo *const tile) {
-  AV1Decoder *const pbi = tile_data->pbi;
-  const AV1_COMMON *const cm = &pbi->common;
-  int mi_row, mi_col;
+// sorts in descending order
+static int compare_tile_buffers(const void *a, const void *b) {
+  const TileJobsDec *const buf1 = (const TileJobsDec *)a;
+  const TileJobsDec *const buf2 = (const TileJobsDec *)b;
+  return (((int)buf2->tile_buffer->size) - ((int)buf1->tile_buffer->size));
+}
 
-  if (setjmp(tile_data->error_info.jmp)) {
-    tile_data->error_info.setjmp = 0;
-    aom_merge_corrupted_flag(&tile_data->xd.corrupted, 1);
-    return 0;
+static void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm,
+                              int tile_rows_start, int tile_rows_end,
+                              int tile_cols_start, int tile_cols_end,
+                              int startTile, int endTile) {
+  AV1DecTileMT *tile_mt_info = &pbi->tile_mt_info;
+  TileJobsDec *tile_job_queue = tile_mt_info->job_queue;
+  tile_mt_info->jobs_enqueued = 0;
+  tile_mt_info->jobs_dequeued = 0;
+
+  for (int row = tile_rows_start; row < tile_rows_end; row++) {
+    for (int col = tile_cols_start; col < tile_cols_end; col++) {
+      if (row * cm->tile_cols + col < startTile ||
+          row * cm->tile_cols + col > endTile)
+        continue;
+      tile_job_queue->tile_buffer = &pbi->tile_buffers[row][col];
+      tile_job_queue->tile_data = pbi->tile_data + row * cm->tile_cols + col;
+      tile_job_queue++;
+      tile_mt_info->jobs_enqueued++;
+    }
   }
+}
+
+static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm,
+                           int tile_rows, int tile_cols) {
+  tile_mt_info->alloc_tile_rows = tile_rows;
+  tile_mt_info->alloc_tile_cols = tile_cols;
+  int num_tiles = tile_rows * tile_cols;
+#if CONFIG_MULTITHREAD
+  {
+    CHECK_MEM_ERROR(cm, tile_mt_info->job_mutex,
+                    aom_malloc(sizeof(*tile_mt_info->job_mutex) * num_tiles));
 
-  tile_data->error_info.setjmp = 1;
-  tile_data->xd.error_info = &tile_data->error_info;
-#if CONFIG_DEPENDENT_HORZTILES
-  if (!cm->dependent_horz_tiles || tile->tg_horz_boundary) {
-    av1_zero_above_context(&pbi->common, tile->mi_col_start, tile->mi_col_end);
+    for (int i = 0; i < num_tiles; i++) {
+      pthread_mutex_init(&tile_mt_info->job_mutex[i], NULL);
+    }
   }
-#else
-  av1_zero_above_context(&pbi->common, tile->mi_col_start, tile->mi_col_end);
 #endif
+  CHECK_MEM_ERROR(cm, tile_mt_info->job_queue,
+                  aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles));
+}
 
-  for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
-       mi_row += cm->mib_size) {
-    av1_zero_left_context(&tile_data->xd);
-
-    for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-         mi_col += cm->mib_size) {
-      decode_partition(pbi, &tile_data->xd,
-#if CONFIG_SUPERTX
-                       0,
-#endif
-                       mi_row, mi_col, &tile_data->bit_reader, cm->sb_size);
-#if NC_MODE_INFO && CONFIG_MOTION_VAR
-      detoken_and_recon_sb(pbi, &tile_data->xd, mi_row, mi_col,
-                           &tile_data->bit_reader, cm->sb_size);
-#endif
-    }
+void av1_free_mc_tmp_buf(void *td, int use_highbd) {
+  ThreadData *thread_data = (ThreadData *)td;
+  int ref;
+  for (ref = 0; ref < 2; ref++) {
+    if (use_highbd)
+      aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref]));
+    else
+      aom_free(thread_data->mc_buf[ref]);
+    thread_data->mc_buf[ref] = NULL;
   }
-  return !tile_data->xd.corrupted;
+  thread_data->mc_buf_size = 0;
 }
 
-// sorts in descending order
-static int compare_tile_buffers(const void *a, const void *b) {
-  const TileBufferDec *const buf1 = (const TileBufferDec *)a;
-  const TileBufferDec *const buf2 = (const TileBufferDec *)b;
-  return (int)(buf2->size - buf1->size);
+static void allocate_mc_tmp_buf(AV1_COMMON *const cm, void *td, int buf_size,
+                                int use_highbd) {
+  ThreadData *thread_data = (ThreadData *)td;
+
+  for (int ref = 0; ref < 2; ref++) {
+    if (use_highbd) {
+      uint16_t *hbd_mc_buf;
+      CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size));
+      thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf);
+    } else {
+      CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref],
+                      (uint8_t *)aom_memalign(16, buf_size));
+    }
+  }
+  thread_data->mc_buf_size = buf_size;
 }
 
 static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
-                                      const uint8_t *data_end) {
+                                      const uint8_t *data_end, int start_tile,
+                                      int end_tile) {
   AV1_COMMON *const cm = &pbi->common;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   const int tile_cols = cm->tile_cols;
   const int tile_rows = cm->tile_rows;
-  const int num_workers = AOMMIN(pbi->max_threads & ~1, tile_cols);
+  const int n_tiles = tile_cols * tile_rows;
   TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
-#if CONFIG_EXT_TILE
   const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
   const int single_row = pbi->dec_tile_row >= 0;
   const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
   const int single_col = pbi->dec_tile_col >= 0;
-#endif  // CONFIG_EXT_TILE
   int tile_rows_start;
   int tile_rows_end;
   int tile_cols_start;
   int tile_cols_end;
-  int tile_row, tile_col;
-  int i;
+  int tile_count_tg;
+  int num_workers;
+  int worker_idx;
+  const uint8_t *raw_data_end = NULL;
 
-#if CONFIG_EXT_TILE
   if (cm->large_scale_tile) {
     tile_rows_start = single_row ? dec_tile_row : 0;
     tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
     tile_cols_start = single_col ? dec_tile_col : 0;
     tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
   } else {
-#endif  // CONFIG_EXT_TILE
     tile_rows_start = 0;
     tile_rows_end = tile_rows;
     tile_cols_start = 0;
     tile_cols_end = tile_cols;
-#if CONFIG_EXT_TILE
   }
-#endif  // CONFIG_EXT_TILE
+  tile_count_tg = end_tile - start_tile + 1;
+  num_workers = AOMMIN(pbi->max_threads, tile_count_tg);
 
-#if !CONFIG_ANS
-  int final_worker = -1;
-#endif  // !CONFIG_ANS
+  // No tiles to decode.
+  if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+      // First tile is larger than end_tile.
+      tile_rows_start * tile_cols + tile_cols_start > end_tile ||
+      // Last tile is smaller than start_tile.
+      (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile)
+    return data;
 
   assert(tile_rows <= MAX_TILE_ROWS);
   assert(tile_cols <= MAX_TILE_COLS);
-
-  assert(tile_cols * tile_rows > 1);
-
-  // TODO(jzern): See if we can remove the restriction of passing in max
-  // threads to the decoder.
-  if (pbi->num_tile_workers == 0) {
-    const int num_threads = pbi->max_threads & ~1;
+  assert(tile_count_tg > 0);
+  assert(num_workers > 0);
+  assert(start_tile <= end_tile);
+  assert(start_tile >= 0 && end_tile < n_tiles);
+
+  // Create workers and thread_data
+  if (pbi->num_workers == 0) {
+    const int num_threads = pbi->max_threads;
     CHECK_MEM_ERROR(cm, pbi->tile_workers,
                     aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
-    // Ensure tile data offsets will be properly aligned. This may fail on
-    // platforms without DECLARE_ALIGNED().
-    assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
-    CHECK_MEM_ERROR(
-        cm, pbi->tile_worker_data,
-        aom_memalign(32, num_threads * sizeof(*pbi->tile_worker_data)));
-    CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
-                    aom_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
-    for (i = 0; i < num_threads; ++i) {
-      AVxWorker *const worker = &pbi->tile_workers[i];
-      ++pbi->num_tile_workers;
+    CHECK_MEM_ERROR(cm, pbi->thread_data,
+                    aom_malloc(num_threads * sizeof(*pbi->thread_data)));
+
+    for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) {
+      AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+      DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+      ++pbi->num_workers;
 
       winterface->init(worker);
-      if (i < num_threads - 1 && !winterface->reset(worker)) {
+      if (worker_idx < num_threads - 1 && !winterface->reset(worker)) {
         aom_internal_error(&cm->error, AOM_CODEC_ERROR,
                            "Tile decoder thread creation failed");
       }
-    }
-  }
 
-  // Reset tile decoding hook
-  for (i = 0; i < num_workers; ++i) {
-    AVxWorker *const worker = &pbi->tile_workers[i];
-    winterface->sync(worker);
-    worker->hook = (AVxWorkerHook)tile_worker_hook;
-    worker->data1 = &pbi->tile_worker_data[i];
-    worker->data2 = &pbi->tile_worker_info[i];
+      if (worker_idx < num_threads - 1) {
+        // Allocate thread data.
+        CHECK_MEM_ERROR(cm, thread_data->td,
+                        aom_memalign(32, sizeof(*thread_data->td)));
+        av1_zero(*thread_data->td);
+      } else {
+        // Main thread acts as a worker and uses the thread data in pbi
+        thread_data->td = &pbi->td;
+      }
+    }
   }
-
-  // Initialize thread frame counts.
-  if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
-    for (i = 0; i < num_workers; ++i) {
-      TileWorkerData *const twd = (TileWorkerData *)pbi->tile_workers[i].data1;
-      av1_zero(twd->counts);
+  const int use_highbd = cm->use_highbitdepth ? 1 : 0;
+  const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
+  for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
+    DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+    if (thread_data->td->mc_buf_size != buf_size) {
+      av1_free_mc_tmp_buf(thread_data->td, use_highbd);
+      allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
     }
   }
 
-// Load tile data into tile_buffers
-#if CONFIG_EXT_TILE
+    // get tile size in tile group
+#if EXT_TILE_DEBUG
   if (cm->large_scale_tile)
-    get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+    raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
   else
-#endif  // CONFIG_EXT_TILE
-    get_tile_buffers(pbi, data, data_end, tile_buffers, 0,
-                     cm->tile_rows * cm->tile_cols - 1);
+#endif  // EXT_TILE_DEBUG
+    get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
 
-  for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
-    // Sort the buffers in this tile row based on size in descending order.
-    qsort(&tile_buffers[tile_row][tile_cols_start],
-          tile_cols_end - tile_cols_start, sizeof(tile_buffers[0][0]),
-          compare_tile_buffers);
-
-    // Rearrange the tile buffers in this tile row such that per-tile group
-    // the largest, and presumably the most difficult tile will be decoded in
-    // the main thread. This should help minimize the number of instances
-    // where the main thread is waiting for a worker to complete.
-    {
-      int group_start;
-      for (group_start = tile_cols_start; group_start < tile_cols_end;
-           group_start += num_workers) {
-        const int group_end = AOMMIN(group_start + num_workers, tile_cols);
-        const TileBufferDec largest = tile_buffers[tile_row][group_start];
-        memmove(&tile_buffers[tile_row][group_start],
-                &tile_buffers[tile_row][group_start + 1],
-                (group_end - group_start - 1) * sizeof(tile_buffers[0][0]));
-        tile_buffers[tile_row][group_end - 1] = largest;
-      }
-    }
+  if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+    aom_free(pbi->tile_data);
+    CHECK_MEM_ERROR(cm, pbi->tile_data,
+                    aom_memalign(32, n_tiles * sizeof(*pbi->tile_data)));
+    pbi->allocated_tiles = n_tiles;
+  }
 
-    for (tile_col = tile_cols_start; tile_col < tile_cols_end;) {
-      // Launch workers for individual columns
-      for (i = 0; i < num_workers && tile_col < tile_cols_end;
-           ++i, ++tile_col) {
-        TileBufferDec *const buf = &tile_buffers[tile_row][tile_col];
-        AVxWorker *const worker = &pbi->tile_workers[i];
-        TileWorkerData *const twd = (TileWorkerData *)worker->data1;
-        TileInfo *const tile_info = (TileInfo *)worker->data2;
-
-        twd->pbi = pbi;
-        twd->xd = pbi->mb;
-        twd->xd.corrupted = 0;
-        twd->xd.counts =
-            cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD
-                ? &twd->counts
-                : NULL;
-        av1_zero(twd->dqcoeff);
-        av1_tile_init(tile_info, cm, tile_row, buf->col);
-        av1_tile_init(&twd->xd.tile, cm, tile_row, buf->col);
-
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-        dec_setup_across_tile_boundary_info(cm, tile_info);
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
-
-        setup_bool_decoder(buf->data, data_end, buf->size, &cm->error,
-                           &twd->bit_reader,
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-                           1 << cm->ans_window_size_log2,
-#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-                           pbi->decrypt_cb, pbi->decrypt_state);
-        av1_init_macroblockd(cm, &twd->xd,
-#if CONFIG_PVQ
-                             twd->pvq_ref_coeff,
-#endif
-#if CONFIG_CFL
-                             &twd->cfl,
-#endif
-                             twd->dqcoeff);
-#if CONFIG_PVQ
-        daala_dec_init(cm, &twd->xd.daala_dec, &twd->bit_reader);
-        twd->xd.daala_dec.state.adapt = &twd->tctx.pvq_context;
+  // Reset tile decoding hook
+  for (worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
+    AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+    DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+    winterface->sync(worker);
+
+    worker->hook = tile_worker_hook;
+    worker->data1 = thread_data;
+    worker->data2 = pbi;
+  }
+#if CONFIG_ACCOUNTING
+  if (pbi->acct_enabled) {
+    aom_accounting_reset(&pbi->accounting);
+  }
 #endif
-        // Initialise the tile context from the frame context
-        twd->tctx = *cm->fc;
-        twd->xd.tile_ctx = &twd->tctx;
-        twd->xd.plane[0].color_index_map = twd->color_index_map[0];
-        twd->xd.plane[1].color_index_map = twd->color_index_map[1];
-
-        worker->had_error = 0;
-        if (i == num_workers - 1 || tile_col == tile_cols_end - 1) {
-          winterface->execute(worker);
-        } else {
-          winterface->launch(worker);
-        }
+  for (int row = 0; row < tile_rows; row++) {
+    for (int col = 0; col < tile_cols; col++) {
+      TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
+      av1_tile_init(&tile_data->tile_info, cm, row, col);
+    }
+  }
 
-#if !CONFIG_ANS
-        if (tile_row == tile_rows - 1 && buf->col == tile_cols - 1) {
-          final_worker = i;
-        }
-#endif  // !CONFIG_ANS
-      }
+  if (pbi->tile_mt_info.alloc_tile_cols != tile_cols ||
+      pbi->tile_mt_info.alloc_tile_rows != tile_rows) {
+    av1_dealloc_dec_jobs(&pbi->tile_mt_info);
+    alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols);
+  }
+  enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start,
+                    tile_cols_end, start_tile, end_tile);
+  qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued,
+        sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers);
 
-      // Sync all workers
-      for (; i > 0; --i) {
-        AVxWorker *const worker = &pbi->tile_workers[i - 1];
-        // TODO(jzern): The tile may have specific error data associated with
-        // its aom_internal_error_info which could be propagated to the main
-        // info in cm. Additionally once the threads have been synced and an
-        // error is detected, there's no point in continuing to decode tiles.
-        pbi->mb.corrupted |= !winterface->sync(worker);
+  {
+    const int base = tile_count_tg / num_workers;
+    const int remain = tile_count_tg % num_workers;
+    int tile_start = start_tile;
+    int corrupted = 0;
+
+    for (worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
+      // compute number of tiles assign to each worker
+      const int count = base + (remain + worker_idx) / num_workers;
+      AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+      DecWorkerData *const thread_data = (DecWorkerData *)worker->data1;
+
+      thread_data->data_end = data_end;
+      tile_start += count;
+
+      worker->had_error = 0;
+      if (worker_idx == num_workers - 1) {
+        winterface->execute(worker);
+      } else {
+        winterface->launch(worker);
       }
     }
-  }
 
-  // Accumulate thread frame counts.
-  if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
-    for (i = 0; i < num_workers; ++i) {
-      TileWorkerData *const twd = (TileWorkerData *)pbi->tile_workers[i].data1;
-      av1_accumulate_frame_counts(&cm->counts, &twd->counts);
+    for (; worker_idx > 0; --worker_idx) {
+      AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1];
+      aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker));
     }
+
+    pbi->mb.corrupted = corrupted;
   }
 
-#if CONFIG_EXT_TILE
+  if (pbi->mb.corrupted)
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Failed to decode tile data");
+
   if (cm->large_scale_tile) {
+    if (n_tiles == 1) {
+      // Find the end of the single tile buffer
+      return aom_reader_find_end(&pbi->tile_data->bit_reader);
+    }
     // Return the end of the last tile buffer
-    return tile_buffers[tile_rows - 1][tile_cols - 1].raw_data_end;
-  } else {
-#endif  // CONFIG_EXT_TILE
-#if CONFIG_ANS
-    return data_end;
-#else
-  assert(final_worker != -1);
-  {
-    TileWorkerData *const twd =
-        (TileWorkerData *)pbi->tile_workers[final_worker].data1;
-    return aom_reader_find_end(&twd->bit_reader);
-  }
-#endif  // CONFIG_ANS
-#if CONFIG_EXT_TILE
+    return raw_data_end;
   }
-#endif  // CONFIG_EXT_TILE
+  TileDataDec *const tile_data = pbi->tile_data + end_tile;
+
+  return aom_reader_find_end(&tile_data->bit_reader);
 }
 
 static void error_handler(void *data) {
@@ -4376,217 +3063,462 @@ static void error_handler(void *data) {
   aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
 }
 
-static void read_bitdepth_colorspace_sampling(AV1_COMMON *cm,
-                                              struct aom_read_bit_buffer *rb,
-                                              int allow_lowbitdepth) {
-  if (cm->profile >= PROFILE_2) {
-    cm->bit_depth = aom_rb_read_bit(rb) ? AOM_BITS_12 : AOM_BITS_10;
+// Reads the high_bitdepth and twelve_bit fields in color_config() and sets
+// cm->bit_depth based on the values of those fields and cm->profile. Reports
+// errors by calling rb->error_handler() or aom_internal_error().
+static void av1_read_bitdepth(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+  const int high_bitdepth = aom_rb_read_bit(rb);
+  if (cm->profile == PROFILE_2 && high_bitdepth) {
+    const int twelve_bit = aom_rb_read_bit(rb);
+    cm->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10;
+  } else if (cm->profile <= PROFILE_2) {
+    cm->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8;
   } else {
-    cm->bit_depth = AOM_BITS_8;
+    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                       "Unsupported profile/bit-depth combination");
   }
+}
 
-#if CONFIG_HIGHBITDEPTH
-  cm->use_highbitdepth = cm->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
-#else
-  (void)allow_lowbitdepth;
-#endif
-#if CONFIG_COLORSPACE_HEADERS
-  cm->color_space = aom_rb_read_literal(rb, 5);
-  cm->transfer_function = aom_rb_read_literal(rb, 5);
-#else
-  cm->color_space = aom_rb_read_literal(rb, 3);
-#endif
-  if (cm->color_space != AOM_CS_SRGB) {
-    // [16,235] (including xvycc) vs [0,255] range
-    cm->color_range = aom_rb_read_bit(rb);
-    if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
-      cm->subsampling_x = aom_rb_read_bit(rb);
-      cm->subsampling_y = aom_rb_read_bit(rb);
-      if (cm->subsampling_x == 1 && cm->subsampling_y == 1)
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                           "4:2:0 color not supported in profile 1 or 3");
-      if (aom_rb_read_bit(rb))
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Reserved bit set");
-    } else {
-      cm->subsampling_y = cm->subsampling_x = 1;
+void av1_read_film_grain_params(AV1_COMMON *cm,
+                                struct aom_read_bit_buffer *rb) {
+  aom_film_grain_t *pars = &cm->film_grain_params;
+
+  pars->apply_grain = aom_rb_read_bit(rb);
+  if (!pars->apply_grain) {
+    memset(pars, 0, sizeof(*pars));
+    return;
+  }
+
+  pars->random_seed = aom_rb_read_literal(rb, 16);
+  if (cm->frame_type == INTER_FRAME)
+    pars->update_parameters = aom_rb_read_bit(rb);
+  else
+    pars->update_parameters = 1;
+
+  if (!pars->update_parameters) {
+    // inherit parameters from a previous reference frame
+    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+    int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3);
+    int buf_idx = cm->ref_frame_map[film_grain_params_ref_idx];
+    if (buf_idx == INVALID_IDX) {
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Invalid Film grain reference idx");
     }
-#if CONFIG_COLORSPACE_HEADERS
-    if (cm->subsampling_x == 1 && cm->subsampling_y == 1) {
-      cm->chroma_sample_position = aom_rb_read_literal(rb, 2);
+    if (!frame_bufs[buf_idx].film_grain_params_present) {
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Film grain reference parameters not available");
     }
-#endif
+    uint16_t random_seed = pars->random_seed;
+    *pars = frame_bufs[buf_idx].film_grain_params;  // inherit paramaters
+    pars->random_seed = random_seed;                // with new random seed
+    return;
+  }
+
+  // Scaling functions parameters
+  pars->num_y_points = aom_rb_read_literal(rb, 4);  // max 14
+  if (pars->num_y_points > 14)
+    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                       "Number of points for film grain luma scaling function "
+                       "exceeds the maximum value.");
+  for (int i = 0; i < pars->num_y_points; i++) {
+    pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8);
+    if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0])
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "First coordinate of the scaling function points "
+                         "shall be increasing.");
+    pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8);
+  }
+
+  if (!cm->seq_params.monochrome)
+    pars->chroma_scaling_from_luma = aom_rb_read_bit(rb);
+
+  if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
+      ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) &&
+       (pars->num_y_points == 0))) {
+    pars->num_cb_points = 0;
+    pars->num_cr_points = 0;
   } else {
-    if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
-      // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed.
-      // 4:2:2 or 4:4:0 chroma sampling is not allowed.
-      cm->subsampling_y = cm->subsampling_x = 0;
-      if (aom_rb_read_bit(rb))
+    pars->num_cb_points = aom_rb_read_literal(rb, 4);  // max 10
+    if (pars->num_cb_points > 10)
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Number of points for film grain cb scaling function "
+                         "exceeds the maximum value.");
+    for (int i = 0; i < pars->num_cb_points; i++) {
+      pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8);
+      if (i &&
+          pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0])
         aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Reserved bit set");
-    } else {
+                           "First coordinate of the scaling function points "
+                           "shall be increasing.");
+      pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8);
+    }
+
+    pars->num_cr_points = aom_rb_read_literal(rb, 4);  // max 10
+    if (pars->num_cr_points > 10)
       aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                         "4:4:4 color not supported in profile 0 or 2");
+                         "Number of points for film grain cr scaling function "
+                         "exceeds the maximum value.");
+    for (int i = 0; i < pars->num_cr_points; i++) {
+      pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8);
+      if (i &&
+          pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0])
+        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                           "First coordinate of the scaling function points "
+                           "shall be increasing.");
+      pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8);
     }
+
+    if ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) &&
+        (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) ||
+         ((pars->num_cb_points != 0) && (pars->num_cr_points == 0))))
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "In YCbCr 4:2:0, film grain shall be applied "
+                         "to both chroma components or neither.");
   }
+
+  pars->scaling_shift = aom_rb_read_literal(rb, 2) + 8;  // 8 + value
+
+  // AR coefficients
+  // Only sent if the corresponsing scaling function has
+  // more than 0 points
+
+  pars->ar_coeff_lag = aom_rb_read_literal(rb, 2);
+
+  int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+  int num_pos_chroma = num_pos_luma;
+  if (pars->num_y_points > 0) ++num_pos_chroma;
+
+  if (pars->num_y_points)
+    for (int i = 0; i < num_pos_luma; i++)
+      pars->ar_coeffs_y[i] = aom_rb_read_literal(rb, 8) - 128;
+
+  if (pars->num_cb_points || pars->chroma_scaling_from_luma)
+    for (int i = 0; i < num_pos_chroma; i++)
+      pars->ar_coeffs_cb[i] = aom_rb_read_literal(rb, 8) - 128;
+
+  if (pars->num_cr_points || pars->chroma_scaling_from_luma)
+    for (int i = 0; i < num_pos_chroma; i++)
+      pars->ar_coeffs_cr[i] = aom_rb_read_literal(rb, 8) - 128;
+
+  pars->ar_coeff_shift = aom_rb_read_literal(rb, 2) + 6;  // 6 + value
+
+  pars->grain_scale_shift = aom_rb_read_literal(rb, 2);
+
+  if (pars->num_cb_points) {
+    pars->cb_mult = aom_rb_read_literal(rb, 8);
+    pars->cb_luma_mult = aom_rb_read_literal(rb, 8);
+    pars->cb_offset = aom_rb_read_literal(rb, 9);
+  }
+
+  if (pars->num_cr_points) {
+    pars->cr_mult = aom_rb_read_literal(rb, 8);
+    pars->cr_luma_mult = aom_rb_read_literal(rb, 8);
+    pars->cr_offset = aom_rb_read_literal(rb, 9);
+  }
+
+  pars->overlap_flag = aom_rb_read_bit(rb);
+
+  pars->clip_to_restricted_range = aom_rb_read_bit(rb);
 }
 
-#if CONFIG_REFERENCE_BUFFER
-void read_sequence_header(SequenceHeader *seq_params,
-                          struct aom_read_bit_buffer *rb) {
-  /* Placeholder for actually reading from the bitstream */
-  seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb);
-  if (seq_params->frame_id_numbers_present_flag) {
-    seq_params->frame_id_length_minus7 = aom_rb_read_literal(rb, 4);
-    seq_params->delta_frame_id_length_minus2 = aom_rb_read_literal(rb, 4);
+static void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+  if (cm->film_grain_params_present && (cm->show_frame || cm->showable_frame)) {
+    av1_read_film_grain_params(cm, rb);
+  } else {
+    memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
   }
+  cm->film_grain_params.bit_depth = cm->bit_depth;
+  memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params,
+         sizeof(aom_film_grain_t));
 }
-#endif  // CONFIG_REFERENCE_BUFFER
 
-static void read_compound_tools(AV1_COMMON *cm,
-                                struct aom_read_bit_buffer *rb) {
-  (void)cm;
-  (void)rb;
-#if CONFIG_INTERINTRA
-  if (!frame_is_intra_only(cm) && cm->reference_mode != COMPOUND_REFERENCE) {
-    cm->allow_interintra_compound = aom_rb_read_bit(rb);
+void av1_read_color_config(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+                           int allow_lowbitdepth) {
+  av1_read_bitdepth(cm, rb);
+
+  cm->use_highbitdepth = cm->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
+  // monochrome bit (not needed for PROFILE_1)
+  const int is_monochrome = cm->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0;
+  cm->seq_params.monochrome = is_monochrome;
+  int color_description_present_flag = aom_rb_read_bit(rb);
+  if (color_description_present_flag) {
+    cm->color_primaries = aom_rb_read_literal(rb, 8);
+    cm->transfer_characteristics = aom_rb_read_literal(rb, 8);
+    cm->matrix_coefficients = aom_rb_read_literal(rb, 8);
   } else {
-    cm->allow_interintra_compound = 0;
-  }
-#endif  // CONFIG_INTERINTRA
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!frame_is_intra_only(cm)) {
-#else   // !CONFIG_COMPOUND_SINGLEREF
-  if (!frame_is_intra_only(cm) && cm->reference_mode != SINGLE_REFERENCE) {
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    cm->allow_masked_compound = aom_rb_read_bit(rb);
+    cm->color_primaries = AOM_CICP_CP_UNSPECIFIED;
+    cm->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
+    cm->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
+  }
+  if (is_monochrome) {
+    // [16,235] (including xvycc) vs [0,255] range
+    cm->color_range = aom_rb_read_bit(rb);
+    cm->subsampling_y = cm->subsampling_x = 1;
+    cm->chroma_sample_position = AOM_CSP_UNKNOWN;
+    cm->separate_uv_delta_q = 0;
+    return;
+  }
+  if (cm->color_primaries == AOM_CICP_CP_BT_709 &&
+      cm->transfer_characteristics == AOM_CICP_TC_SRGB &&
+      cm->matrix_coefficients == AOM_CICP_MC_IDENTITY) {  // it would be better
+                                                          // to remove this
+                                                          // dependency too
+    cm->subsampling_y = cm->subsampling_x = 0;
+    cm->color_range = 1;  // assume full color-range
+    if (!(cm->profile == PROFILE_1 ||
+          (cm->profile == PROFILE_2 && cm->bit_depth == AOM_BITS_12))) {
+      aom_internal_error(
+          &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          "sRGB colorspace not compatible with specified profile");
+    }
   } else {
-    cm->allow_masked_compound = 0;
+    // [16,235] (including xvycc) vs [0,255] range
+    cm->color_range = aom_rb_read_bit(rb);
+    if (cm->profile == PROFILE_0) {
+      // 420 only
+      cm->subsampling_x = cm->subsampling_y = 1;
+    } else if (cm->profile == PROFILE_1) {
+      // 444 only
+      cm->subsampling_x = cm->subsampling_y = 0;
+    } else {
+      assert(cm->profile == PROFILE_2);
+      if (cm->bit_depth == AOM_BITS_12) {
+        cm->subsampling_x = aom_rb_read_bit(rb);
+        if (cm->subsampling_x)
+          cm->subsampling_y = aom_rb_read_bit(rb);  // 422 or 420
+        else
+          cm->subsampling_y = 0;  // 444
+      } else {
+        // 422
+        cm->subsampling_x = 1;
+        cm->subsampling_y = 0;
+      }
+    }
+    if (cm->matrix_coefficients == AOM_CICP_MC_IDENTITY &&
+        (cm->subsampling_x || cm->subsampling_y)) {
+      aom_internal_error(
+          &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          "Identity CICP Matrix incompatible with non 4:4:4 color sampling");
+    }
+    if (cm->subsampling_x && cm->subsampling_y) {
+      cm->chroma_sample_position = aom_rb_read_literal(rb, 2);
+    }
   }
-#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
+  cm->separate_uv_delta_q = aom_rb_read_bit(rb);
 }
 
-#if CONFIG_VAR_REFS
-static void check_valid_ref_frames(AV1_COMMON *cm) {
-  MV_REFERENCE_FRAME ref_frame;
-  // TODO(zoeliu): To handle ALTREF_FRAME the same way as do with other
-  //               reference frames: Current encoder invalid ALTREF when ALTREF
-  //               is the same as LAST, but invalid all the other references
-  //               when they are the same as ALTREF.
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME];
-
-    if (ref_buf->idx != INVALID_IDX) {
-      ref_buf->is_valid = 1;
-
-      MV_REFERENCE_FRAME ref;
-      for (ref = LAST_FRAME; ref < ref_frame; ++ref) {
-        RefBuffer *const buf = &cm->frame_refs[ref - LAST_FRAME];
-        if (buf->is_valid && buf->idx == ref_buf->idx) {
-          if (ref_frame != ALTREF_FRAME || ref == LAST_FRAME) {
-            ref_buf->is_valid = 0;
-            break;
-          } else {
-            buf->is_valid = 0;
-          }
-        }
+void av1_read_timing_info_header(AV1_COMMON *cm,
+                                 struct aom_read_bit_buffer *rb) {
+  cm->timing_info.num_units_in_display_tick = aom_rb_read_unsigned_literal(
+      rb, 32);  // Number of units in a display tick
+  cm->timing_info.time_scale =
+      aom_rb_read_unsigned_literal(rb, 32);  // Time scale
+  if (cm->timing_info.num_units_in_display_tick == 0 ||
+      cm->timing_info.time_scale == 0) {
+    aom_internal_error(
+        &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        "num_units_in_display_tick and time_scale must be greater than 0.");
+  }
+  cm->timing_info.equal_picture_interval =
+      aom_rb_read_bit(rb);  // Equal picture interval bit
+  if (cm->timing_info.equal_picture_interval) {
+    cm->timing_info.num_ticks_per_picture =
+        aom_rb_read_uvlc(rb) + 1;  // ticks per picture
+    if (cm->timing_info.num_ticks_per_picture == 0) {
+      aom_internal_error(
+          &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          "num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1.");
+    }
+  }
+}
+
+void av1_read_decoder_model_info(AV1_COMMON *cm,
+                                 struct aom_read_bit_buffer *rb) {
+  cm->buffer_model.encoder_decoder_buffer_delay_length =
+      aom_rb_read_literal(rb, 5) + 1;
+  cm->buffer_model.num_units_in_decoding_tick = aom_rb_read_unsigned_literal(
+      rb, 32);  // Number of units in a decoding tick
+  cm->buffer_model.buffer_removal_delay_length = aom_rb_read_literal(rb, 5) + 1;
+  cm->buffer_model.frame_presentation_delay_length =
+      aom_rb_read_literal(rb, 5) + 1;
+}
+
+void av1_read_op_parameters_info(AV1_COMMON *const cm,
+                                 struct aom_read_bit_buffer *rb, int op_num) {
+  // The cm->op_params array has MAX_NUM_OPERATING_POINTS + 1 elements.
+  if (op_num > MAX_NUM_OPERATING_POINTS) {
+    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                       "AV1 does not support %d decoder model operating points",
+                       op_num + 1);
+  }
+
+  cm->op_params[op_num].decoder_buffer_delay = aom_rb_read_literal(
+      rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
+
+  cm->op_params[op_num].encoder_buffer_delay = aom_rb_read_literal(
+      rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
+
+  cm->op_params[op_num].low_delay_mode_flag = aom_rb_read_bit(rb);
+}
+
+static void av1_read_tu_pts_info(AV1_COMMON *const cm,
+                                 struct aom_read_bit_buffer *rb) {
+  cm->tu_presentation_delay =
+      aom_rb_read_literal(rb, cm->buffer_model.frame_presentation_delay_length);
+}
+
+void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+  // rb->error_handler may be triggered during aom_rb_read_bit(), raising
+  // internal errors and immediate decoding termination. We use a local variable
+  // to store the info. as we decode. At the end, if no errors have occurred,
+  // cm->seq_params is updated.
+  SequenceHeader sh = cm->seq_params;
+  SequenceHeader *const seq_params = &sh;
+  int num_bits_width = aom_rb_read_literal(rb, 4) + 1;
+  int num_bits_height = aom_rb_read_literal(rb, 4) + 1;
+  int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1;
+  int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1;
+
+  seq_params->num_bits_width = num_bits_width;
+  seq_params->num_bits_height = num_bits_height;
+  seq_params->max_frame_width = max_frame_width;
+  seq_params->max_frame_height = max_frame_height;
+
+  if (seq_params->reduced_still_picture_hdr) {
+    seq_params->frame_id_numbers_present_flag = 0;
+  } else {
+    seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb);
+  }
+  if (seq_params->frame_id_numbers_present_flag) {
+    // We must always have delta_frame_id_length < frame_id_length,
+    // in order for a frame to be referenced with a unique delta.
+    // Avoid wasting bits by using a coding that enforces this restriction.
+    seq_params->delta_frame_id_length = aom_rb_read_literal(rb, 4) + 2;
+    seq_params->frame_id_length =
+        aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1;
+    if (seq_params->frame_id_length > 16)
+      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid frame_id_length");
+  }
+
+  setup_sb_size(seq_params, rb);
+
+  seq_params->enable_filter_intra = aom_rb_read_bit(rb);
+  seq_params->enable_intra_edge_filter = aom_rb_read_bit(rb);
+
+  if (seq_params->reduced_still_picture_hdr) {
+    seq_params->enable_interintra_compound = 0;
+    seq_params->enable_masked_compound = 0;
+    seq_params->enable_warped_motion = 0;
+    seq_params->enable_dual_filter = 0;
+    seq_params->enable_order_hint = 0;
+    seq_params->enable_jnt_comp = 0;
+    seq_params->enable_ref_frame_mvs = 0;
+    seq_params->force_screen_content_tools = 2;  // SELECT_SCREEN_CONTENT_TOOLS
+    seq_params->force_integer_mv = 2;            // SELECT_INTEGER_MV
+    seq_params->order_hint_bits_minus_1 = -1;
+  } else {
+    seq_params->enable_interintra_compound = aom_rb_read_bit(rb);
+    seq_params->enable_masked_compound = aom_rb_read_bit(rb);
+    seq_params->enable_warped_motion = aom_rb_read_bit(rb);
+    seq_params->enable_dual_filter = aom_rb_read_bit(rb);
+
+    seq_params->enable_order_hint = aom_rb_read_bit(rb);
+    seq_params->enable_jnt_comp =
+        seq_params->enable_order_hint ? aom_rb_read_bit(rb) : 0;
+    seq_params->enable_ref_frame_mvs =
+        seq_params->enable_order_hint ? aom_rb_read_bit(rb) : 0;
+
+    if (aom_rb_read_bit(rb)) {
+      seq_params->force_screen_content_tools =
+          2;  // SELECT_SCREEN_CONTENT_TOOLS
+    } else {
+      seq_params->force_screen_content_tools = aom_rb_read_bit(rb);
+    }
+
+    if (seq_params->force_screen_content_tools > 0) {
+      if (aom_rb_read_bit(rb)) {
+        seq_params->force_integer_mv = 2;  // SELECT_INTEGER_MV
+      } else {
+        seq_params->force_integer_mv = aom_rb_read_bit(rb);
       }
     } else {
-      ref_buf->is_valid = 0;
+      seq_params->force_integer_mv = 2;  // SELECT_INTEGER_MV
     }
+    seq_params->order_hint_bits_minus_1 =
+        seq_params->enable_order_hint ? aom_rb_read_literal(rb, 3) : -1;
   }
+
+  seq_params->enable_superres = aom_rb_read_bit(rb);
+  seq_params->enable_cdef = aom_rb_read_bit(rb);
+  seq_params->enable_restoration = aom_rb_read_bit(rb);
+  cm->seq_params = *seq_params;
 }
-#endif  // CONFIG_VAR_REFS
 
-#if CONFIG_GLOBAL_MOTION
 static int read_global_motion_params(WarpedMotionParams *params,
                                      const WarpedMotionParams *ref_params,
                                      struct aom_read_bit_buffer *rb,
                                      int allow_hp) {
   TransformationType type = aom_rb_read_bit(rb);
   if (type != IDENTITY) {
-#if GLOBAL_TRANS_TYPES > 4
-    type += aom_rb_read_literal(rb, GLOBAL_TYPE_BITS);
-#else
     if (aom_rb_read_bit(rb))
       type = ROTZOOM;
     else
       type = aom_rb_read_bit(rb) ? TRANSLATION : AFFINE;
-#endif  // GLOBAL_TRANS_TYPES > 4
   }
 
-  int trans_bits;
-  int trans_dec_factor;
-  int trans_prec_diff;
   *params = default_warp_params;
   params->wmtype = type;
-  switch (type) {
-    case HOMOGRAPHY:
-    case HORTRAPEZOID:
-    case VERTRAPEZOID:
-      if (type != HORTRAPEZOID)
-        params->wmmat[6] =
-            aom_rb_read_signed_primitive_refsubexpfin(
-                rb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
-                (ref_params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF)) *
-            GM_ROW3HOMO_DECODE_FACTOR;
-      if (type != VERTRAPEZOID)
-        params->wmmat[7] =
-            aom_rb_read_signed_primitive_refsubexpfin(
-                rb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
-                (ref_params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF)) *
-            GM_ROW3HOMO_DECODE_FACTOR;
-    case AFFINE:
-    case ROTZOOM:
-      params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin(
-                             rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-                             (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
-                                 (1 << GM_ALPHA_PREC_BITS)) *
-                             GM_ALPHA_DECODE_FACTOR +
-                         (1 << WARPEDMODEL_PREC_BITS);
-      if (type != VERTRAPEZOID)
-        params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin(
-                               rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-                               (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) *
-                           GM_ALPHA_DECODE_FACTOR;
-      if (type >= AFFINE) {
-        if (type != HORTRAPEZOID)
-          params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin(
-                                 rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-                                 (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) *
-                             GM_ALPHA_DECODE_FACTOR;
-        params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin(
-                               rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-                               (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
-                                   (1 << GM_ALPHA_PREC_BITS)) *
-                               GM_ALPHA_DECODE_FACTOR +
-                           (1 << WARPEDMODEL_PREC_BITS);
-      } else {
-        params->wmmat[4] = -params->wmmat[3];
-        params->wmmat[5] = params->wmmat[2];
-      }
-    // fallthrough intended
-    case TRANSLATION:
-      trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
-                                         : GM_ABS_TRANS_BITS;
-      trans_dec_factor = (type == TRANSLATION)
-                             ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp)
-                             : GM_TRANS_DECODE_FACTOR;
-      trans_prec_diff = (type == TRANSLATION)
-                            ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
-                            : GM_TRANS_PREC_DIFF;
-      params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin(
-                             rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
-                             (ref_params->wmmat[0] >> trans_prec_diff)) *
-                         trans_dec_factor;
-      params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin(
-                             rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
-                             (ref_params->wmmat[1] >> trans_prec_diff)) *
-                         trans_dec_factor;
-    case IDENTITY: break;
-    default: assert(0);
+
+  if (type >= ROTZOOM) {
+    params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+                               (1 << GM_ALPHA_PREC_BITS)) *
+                           GM_ALPHA_DECODE_FACTOR +
+                       (1 << WARPEDMODEL_PREC_BITS);
+    params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) *
+                       GM_ALPHA_DECODE_FACTOR;
+  }
+
+  if (type >= AFFINE) {
+    params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) *
+                       GM_ALPHA_DECODE_FACTOR;
+    params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+                               (1 << GM_ALPHA_PREC_BITS)) *
+                           GM_ALPHA_DECODE_FACTOR +
+                       (1 << WARPEDMODEL_PREC_BITS);
+  } else {
+    params->wmmat[4] = -params->wmmat[3];
+    params->wmmat[5] = params->wmmat[2];
+  }
+
+  if (type >= TRANSLATION) {
+    const int trans_bits = (type == TRANSLATION)
+                               ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+                               : GM_ABS_TRANS_BITS;
+    const int trans_dec_factor =
+        (type == TRANSLATION) ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp)
+                              : GM_TRANS_DECODE_FACTOR;
+    const int trans_prec_diff = (type == TRANSLATION)
+                                    ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+                                    : GM_TRANS_PREC_DIFF;
+    params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[0] >> trans_prec_diff)) *
+                       trans_dec_factor;
+    params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[1] >> trans_prec_diff)) *
+                       trans_dec_factor;
   }
+
   if (params->wmtype <= AFFINE) {
     int good_shear_params = get_shear_params(params);
     if (!good_shear_params) return 0;
@@ -4596,16 +3528,18 @@ static int read_global_motion_params(WarpedMotionParams *params,
 }
 
 static void read_global_motion(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
-  int frame;
-  for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+  for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
     const WarpedMotionParams *ref_params =
-        cm->error_resilient_mode ? &default_warp_params
-                                 : &cm->prev_frame->global_motion[frame];
+        cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+                       : &default_warp_params;
     int good_params = read_global_motion_params(
         &cm->global_motion[frame], ref_params, rb, cm->allow_high_precision_mv);
-    if (!good_params)
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                         "Invalid shear parameters for global motion.");
+    if (!good_params) {
+#if WARPED_MOTION_DEBUG
+      printf("Warning: unexpected global motion shear params from aomenc\n");
+#endif
+      cm->global_motion[frame].invalid = 1;
+    }
 
     // TODO(sarahparker, debargha): The logic in the commented out code below
     // does not work currently and causes mismatches when resize is on. Fix it
@@ -4631,252 +3565,397 @@ static void read_global_motion(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
            */
   }
   memcpy(cm->cur_frame->global_motion, cm->global_motion,
-         TOTAL_REFS_PER_FRAME * sizeof(WarpedMotionParams));
+         REF_FRAMES * sizeof(WarpedMotionParams));
 }
-#endif  // CONFIG_GLOBAL_MOTION
 
-static size_t read_uncompressed_header(AV1Decoder *pbi,
-                                       struct aom_read_bit_buffer *rb) {
+static void show_existing_frame_reset(AV1Decoder *const pbi,
+                                      int existing_frame_idx) {
+  AV1_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = pool->frame_bufs;
+
+  assert(cm->show_existing_frame);
+
+  cm->frame_type = KEY_FRAME;
+
+  pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    cm->frame_refs[i].idx = INVALID_IDX;
+    cm->frame_refs[i].buf = NULL;
+  }
+
+  if (pbi->need_resync) {
+    memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+    pbi->need_resync = 0;
+  }
+
+  cm->cur_frame->intra_only = 1;
+
+  if (cm->seq_params.frame_id_numbers_present_flag) {
+    /* If bitmask is set, update reference frame id values and
+       mark frames as valid for reference.
+       Note that the displayed frame be valid for referencing
+       in order to have been selected.
+    */
+    int refresh_frame_flags = pbi->refresh_frame_flags;
+    int display_frame_id = cm->ref_frame_id[existing_frame_idx];
+    for (int i = 0; i < REF_FRAMES; i++) {
+      if ((refresh_frame_flags >> i) & 1) {
+        cm->ref_frame_id[i] = display_frame_id;
+        cm->valid_for_referencing[i] = 1;
+      }
+    }
+  }
+
+  cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+  // Generate next_ref_frame_map.
+  lock_buffer_pool(pool);
+  int ref_index = 0;
+  for (int mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+    if (mask & 1) {
+      cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+      ++frame_bufs[cm->new_fb_idx].ref_count;
+    } else {
+      cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    }
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+    ++ref_index;
+  }
+
+  for (; ref_index < REF_FRAMES; ++ref_index) {
+    cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+  }
+  unlock_buffer_pool(pool);
+  pbi->hold_ref_buf = 1;
+
+  // Reload the adapted CDFs from when we originally coded this keyframe
+  *cm->fc = cm->frame_contexts[existing_frame_idx];
+}
+
+static int read_uncompressed_header(AV1Decoder *pbi,
+                                    struct aom_read_bit_buffer *rb) {
   AV1_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   BufferPool *const pool = cm->buffer_pool;
   RefCntBuffer *const frame_bufs = pool->frame_bufs;
-  int i, mask, ref_index = 0;
-  size_t sz;
+
+  if (!pbi->sequence_header_ready) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "No sequence header");
+  }
 
   cm->last_frame_type = cm->frame_type;
   cm->last_intra_only = cm->intra_only;
 
-#if CONFIG_EXT_REFS
   // NOTE: By default all coded frames to be used as a reference
   cm->is_reference_frame = 1;
-#endif  // CONFIG_EXT_REFS
 
-#if !CONFIG_OBU
-  if (aom_rb_read_literal(rb, 2) != AOM_FRAME_MARKER)
-    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                       "Invalid frame marker");
+  if (cm->seq_params.reduced_still_picture_hdr) {
+    cm->show_existing_frame = 0;
+    cm->show_frame = 1;
+    cm->frame_type = KEY_FRAME;
+    cm->error_resilient_mode = 1;
+  } else {
+    cm->show_existing_frame = aom_rb_read_bit(rb);
+    cm->reset_decoder_state = 0;
+
+    if (cm->show_existing_frame) {
+      // Show an existing frame directly.
+      const int existing_frame_idx = aom_rb_read_literal(rb, 3);
+      const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
+      if (cm->seq_params.decoder_model_info_present_flag &&
+          cm->timing_info.equal_picture_interval == 0) {
+        av1_read_tu_pts_info(cm, rb);
+      }
+      if (cm->seq_params.frame_id_numbers_present_flag) {
+        int frame_id_length = cm->seq_params.frame_id_length;
+        int display_frame_id = aom_rb_read_literal(rb, frame_id_length);
+        /* Compare display_frame_id with ref_frame_id and check valid for
+         * referencing */
+        if (display_frame_id != cm->ref_frame_id[existing_frame_idx] ||
+            cm->valid_for_referencing[existing_frame_idx] == 0)
+          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                             "Reference buffer frame ID mismatch");
+      }
+      lock_buffer_pool(pool);
+      if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+        unlock_buffer_pool(pool);
+        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                           "Buffer %d does not contain a decoded frame",
+                           frame_to_show);
+      }
+      ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+      cm->reset_decoder_state =
+          frame_bufs[frame_to_show].frame_type == KEY_FRAME;
+      unlock_buffer_pool(pool);
 
-  cm->profile = av1_read_profile(rb);
+      cm->lf.filter_level[0] = 0;
+      cm->lf.filter_level[1] = 0;
+      cm->show_frame = 1;
 
-  const BITSTREAM_PROFILE MAX_SUPPORTED_PROFILE =
-      CONFIG_HIGHBITDEPTH ? MAX_PROFILES : PROFILE_2;
+      if (!frame_bufs[frame_to_show].showable_frame) {
+        aom_merge_corrupted_flag(&xd->corrupted, 1);
+      }
+      if (cm->reset_decoder_state) frame_bufs[frame_to_show].showable_frame = 0;
 
-  if (cm->profile >= MAX_SUPPORTED_PROFILE)
-    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                       "Unsupported bitstream profile");
-#endif
+      cm->film_grain_params = frame_bufs[frame_to_show].film_grain_params;
 
-#if CONFIG_EXT_TILE
-  cm->large_scale_tile = aom_rb_read_literal(rb, 1);
-#if CONFIG_REFERENCE_BUFFER
-  if (cm->large_scale_tile) cm->seq_params.frame_id_numbers_present_flag = 0;
-#endif  // CONFIG_REFERENCE_BUFFER
-#endif  // CONFIG_EXT_TILE
+      if (cm->reset_decoder_state) {
+        show_existing_frame_reset(pbi, existing_frame_idx);
+      } else {
+        pbi->refresh_frame_flags = 0;
+      }
 
-  cm->show_existing_frame = aom_rb_read_bit(rb);
+      return 0;
+    }
 
-  if (cm->show_existing_frame) {
-    // Show an existing frame directly.
-    const int existing_frame_idx = aom_rb_read_literal(rb, 3);
-    const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
-#if CONFIG_REFERENCE_BUFFER
-    if (cm->seq_params.frame_id_numbers_present_flag) {
-      int frame_id_length = cm->seq_params.frame_id_length_minus7 + 7;
-      int display_frame_id = aom_rb_read_literal(rb, frame_id_length);
-      /* Compare display_frame_id with ref_frame_id and check valid for
-       * referencing */
-      if (display_frame_id != cm->ref_frame_id[existing_frame_idx] ||
-          cm->valid_for_referencing[existing_frame_idx] == 0)
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                           "Reference buffer frame ID mismatch");
+    cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2);  // 2 bits
+    cm->show_frame = aom_rb_read_bit(rb);
+    if (cm->seq_params.still_picture &&
+        (cm->frame_type != KEY_FRAME || !cm->show_frame)) {
+      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                         "Still pictures must be coded as shown keyframes");
     }
-#endif
-    lock_buffer_pool(pool);
-    if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
-      unlock_buffer_pool(pool);
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                         "Buffer %d does not contain a decoded frame",
-                         frame_to_show);
+    cm->showable_frame = cm->frame_type != KEY_FRAME;
+    if (cm->show_frame) {
+      if (cm->seq_params.decoder_model_info_present_flag &&
+          cm->timing_info.equal_picture_interval == 0)
+        av1_read_tu_pts_info(cm, rb);
+    } else {
+      // See if this frame can be used as show_existing_frame in future
+      cm->showable_frame = aom_rb_read_bit(rb);
     }
-    ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
-    unlock_buffer_pool(pool);
+    cm->cur_frame->showable_frame = cm->showable_frame;
+    cm->intra_only = cm->frame_type == INTRA_ONLY_FRAME;
+    cm->error_resilient_mode =
+        frame_is_sframe(cm) || (cm->frame_type == KEY_FRAME && cm->show_frame)
+            ? 1
+            : aom_rb_read_bit(rb);
+  }
 
-#if CONFIG_LOOPFILTER_LEVEL
-    cm->lf.filter_level[0] = 0;
-    cm->lf.filter_level[1] = 0;
-#else
-    cm->lf.filter_level = 0;
-#endif
-    cm->show_frame = 1;
-    pbi->refresh_frame_flags = 0;
+  cm->disable_cdf_update = aom_rb_read_bit(rb);
+  if (cm->seq_params.force_screen_content_tools == 2) {
+    cm->allow_screen_content_tools = aom_rb_read_bit(rb);
+  } else {
+    cm->allow_screen_content_tools = cm->seq_params.force_screen_content_tools;
+  }
 
-    if (cm->frame_parallel_decode) {
-      for (i = 0; i < REF_FRAMES; ++i)
-        cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
+  if (cm->allow_screen_content_tools) {
+    if (cm->seq_params.force_integer_mv == 2) {
+      cm->cur_frame_force_integer_mv = aom_rb_read_bit(rb);
+    } else {
+      cm->cur_frame_force_integer_mv = cm->seq_params.force_integer_mv;
     }
-
-    return 0;
+  } else {
+    cm->cur_frame_force_integer_mv = 0;
   }
 
-#if !CONFIG_OBU
-  cm->frame_type = (FRAME_TYPE)aom_rb_read_bit(rb);
-  cm->show_frame = aom_rb_read_bit(rb);
-  if (cm->frame_type != KEY_FRAME)
-    cm->intra_only = cm->show_frame ? 0 : aom_rb_read_bit(rb);
-#else
-  cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2);  // 2 bits
-  cm->show_frame = aom_rb_read_bit(rb);
-  cm->intra_only = cm->frame_type == INTRA_ONLY_FRAME;
-#endif
-  cm->error_resilient_mode = aom_rb_read_bit(rb);
-#if CONFIG_REFERENCE_BUFFER
-#if !CONFIG_OBU
-  if (frame_is_intra_only(cm)) read_sequence_header(&cm->seq_params, rb);
-#endif  // !CONFIG_OBU
-  if (cm->seq_params.frame_id_numbers_present_flag) {
-    int frame_id_length = cm->seq_params.frame_id_length_minus7 + 7;
-    int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2;
-    int prev_frame_id = 0;
-    if (cm->frame_type != KEY_FRAME) {
-      prev_frame_id = cm->current_frame_id;
-    }
-    cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length);
+  cm->frame_refs_short_signaling = 0;
+  int frame_size_override_flag = 0;
+  cm->allow_intrabc = 0;
+  cm->primary_ref_frame = PRIMARY_REF_NONE;
 
-    if (cm->frame_type != KEY_FRAME) {
-      int diff_frame_id;
-      if (cm->current_frame_id > prev_frame_id) {
-        diff_frame_id = cm->current_frame_id - prev_frame_id;
-      } else {
-        diff_frame_id =
-            (1 << frame_id_length) + cm->current_frame_id - prev_frame_id;
+  if (!cm->seq_params.reduced_still_picture_hdr) {
+    if (cm->seq_params.frame_id_numbers_present_flag) {
+      int frame_id_length = cm->seq_params.frame_id_length;
+      int diff_len = cm->seq_params.delta_frame_id_length;
+      int prev_frame_id = 0;
+      int have_prev_frame_id = !pbi->decoding_first_frame &&
+                               !(cm->frame_type == KEY_FRAME && cm->show_frame);
+      if (have_prev_frame_id) {
+        prev_frame_id = cm->current_frame_id;
       }
-      /* Check current_frame_id for conformance */
-      if (prev_frame_id == cm->current_frame_id ||
-          diff_frame_id >= (1 << (frame_id_length - 1))) {
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                           "Invalid value of current_frame_id");
+      cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length);
+
+      if (have_prev_frame_id) {
+        int diff_frame_id;
+        if (cm->current_frame_id > prev_frame_id) {
+          diff_frame_id = cm->current_frame_id - prev_frame_id;
+        } else {
+          diff_frame_id =
+              (1 << frame_id_length) + cm->current_frame_id - prev_frame_id;
+        }
+        /* Check current_frame_id for conformance */
+        if (prev_frame_id == cm->current_frame_id ||
+            diff_frame_id >= (1 << (frame_id_length - 1))) {
+          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                             "Invalid value of current_frame_id");
+        }
       }
-    }
-    /* Check if some frames need to be marked as not valid for referencing */
-    for (i = 0; i < REF_FRAMES; i++) {
-      if (cm->frame_type == KEY_FRAME) {
-        cm->valid_for_referencing[i] = 0;
-      } else if (cm->current_frame_id - (1 << diff_len) > 0) {
-        if (cm->ref_frame_id[i] > cm->current_frame_id ||
-            cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len))
-          cm->valid_for_referencing[i] = 0;
-      } else {
-        if (cm->ref_frame_id[i] > cm->current_frame_id &&
-            cm->ref_frame_id[i] <
-                (1 << frame_id_length) + cm->current_frame_id - (1 << diff_len))
+      /* Check if some frames need to be marked as not valid for referencing */
+      for (int i = 0; i < REF_FRAMES; i++) {
+        if (cm->frame_type == KEY_FRAME && cm->show_frame) {
           cm->valid_for_referencing[i] = 0;
+        } else if (cm->current_frame_id - (1 << diff_len) > 0) {
+          if (cm->ref_frame_id[i] > cm->current_frame_id ||
+              cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len))
+            cm->valid_for_referencing[i] = 0;
+        } else {
+          if (cm->ref_frame_id[i] > cm->current_frame_id &&
+              cm->ref_frame_id[i] < (1 << frame_id_length) +
+                                        cm->current_frame_id - (1 << diff_len))
+            cm->valid_for_referencing[i] = 0;
+        }
+      }
+    }
+
+    frame_size_override_flag =
+        frame_is_sframe(cm) ? 1 : aom_rb_read_literal(rb, 1);
+
+    cm->frame_offset =
+        aom_rb_read_literal(rb, cm->seq_params.order_hint_bits_minus_1 + 1);
+    cm->current_video_frame = cm->frame_offset;
+
+    if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
+      cm->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS);
+    }
+  }
+
+  if (cm->seq_params.decoder_model_info_present_flag) {
+    cm->buffer_removal_delay_present = aom_rb_read_bit(rb);
+    if (cm->buffer_removal_delay_present) {
+      for (int op_num = 0;
+           op_num < cm->seq_params.operating_points_cnt_minus_1 + 1; op_num++) {
+        if (cm->op_params[op_num].decoder_model_param_present_flag) {
+          if ((((cm->seq_params.operating_point_idc[op_num] >>
+                 cm->temporal_layer_id) &
+                0x1) &&
+               ((cm->seq_params.operating_point_idc[op_num] >>
+                 (cm->spatial_layer_id + 8)) &
+                0x1)) ||
+              cm->seq_params.operating_point_idc[op_num] == 0) {
+            cm->op_frame_timing[op_num].buffer_removal_delay =
+                aom_rb_read_literal(
+                    rb, cm->buffer_model.buffer_removal_delay_length);
+          } else {
+            cm->op_frame_timing[op_num].buffer_removal_delay = 0;
+          }
+        } else {
+          cm->op_frame_timing[op_num].buffer_removal_delay = 0;
+        }
       }
     }
   }
-#endif  // CONFIG_REFERENCE_BUFFER
   if (cm->frame_type == KEY_FRAME) {
-#if !CONFIG_OBU
-    read_bitdepth_colorspace_sampling(cm, rb, pbi->allow_lowbitdepth);
-#endif
-    pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+    if (!cm->show_frame)  // unshown keyframe (forward keyframe)
+      pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+    else  // shown keyframe
+      pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
 
-    for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
       cm->frame_refs[i].idx = INVALID_IDX;
       cm->frame_refs[i].buf = NULL;
-#if CONFIG_VAR_REFS
-      cm->frame_refs[i].is_valid = 0;
-#endif  // CONFIG_VAR_REFS
     }
-
-    setup_frame_size(cm, rb);
-    setup_sb_size(cm, rb);
-
     if (pbi->need_resync) {
       memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
       pbi->need_resync = 0;
     }
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-    cm->ans_window_size_log2 = aom_rb_read_literal(rb, 4) + 8;
-#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-    cm->allow_screen_content_tools = aom_rb_read_bit(rb);
-#if CONFIG_AMVR
-    if (cm->allow_screen_content_tools) {
-      if (aom_rb_read_bit(rb)) {
-        cm->seq_mv_precision_level = 2;
-      } else {
-        cm->seq_mv_precision_level = aom_rb_read_bit(rb) ? 0 : 1;
-      }
-    } else {
-      cm->seq_mv_precision_level = 0;
-    }
-#endif
-#if CONFIG_TEMPMV_SIGNALING
-    cm->use_prev_frame_mvs = 0;
-#endif
   } else {
-    if (cm->intra_only) cm->allow_screen_content_tools = aom_rb_read_bit(rb);
-#if CONFIG_TEMPMV_SIGNALING
-    if (cm->intra_only || cm->error_resilient_mode) cm->use_prev_frame_mvs = 0;
-#endif
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-// The only way to reset all frame contexts to their default values is with a
-// keyframe.
-#else
-    if (cm->error_resilient_mode) {
-      cm->reset_frame_context = RESET_FRAME_CONTEXT_ALL;
-    } else {
-      if (cm->intra_only) {
-        cm->reset_frame_context = aom_rb_read_bit(rb)
-                                      ? RESET_FRAME_CONTEXT_ALL
-                                      : RESET_FRAME_CONTEXT_CURRENT;
-      } else {
-        cm->reset_frame_context = aom_rb_read_bit(rb)
-                                      ? RESET_FRAME_CONTEXT_CURRENT
-                                      : RESET_FRAME_CONTEXT_NONE;
-        if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT)
-          cm->reset_frame_context = aom_rb_read_bit(rb)
-                                        ? RESET_FRAME_CONTEXT_ALL
-                                        : RESET_FRAME_CONTEXT_CURRENT;
-      }
-    }
-#endif
-
     if (cm->intra_only) {
-#if !CONFIG_OBU
-      read_bitdepth_colorspace_sampling(cm, rb, pbi->allow_lowbitdepth);
-#endif
-
       pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
-      setup_frame_size(cm, rb);
-      setup_sb_size(cm, rb);
+      if (pbi->refresh_frame_flags == 0xFF) {
+        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                           "Intra only frames cannot have refresh flags 0xFF");
+      }
       if (pbi->need_resync) {
         memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
         pbi->need_resync = 0;
       }
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-      cm->ans_window_size_log2 = aom_rb_read_literal(rb, 4) + 8;
-#endif
     } else if (pbi->need_resync != 1) { /* Skip if need resync */
-#if CONFIG_OBU
-      pbi->refresh_frame_flags = (cm->frame_type == S_FRAME)
-                                     ? ~(1 << REF_FRAMES)
-                                     : aom_rb_read_literal(rb, REF_FRAMES);
-#else
-      pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
-#endif
-
-#if CONFIG_EXT_REFS
+      pbi->refresh_frame_flags =
+          frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES);
       if (!pbi->refresh_frame_flags) {
         // NOTE: "pbi->refresh_frame_flags == 0" indicates that the coded frame
         //       will not be used as a reference
         cm->is_reference_frame = 0;
       }
-#endif  // CONFIG_EXT_REFS
+    }
+  }
 
-      for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
-        const int ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
-        const int idx = cm->ref_frame_map[ref];
+  if (!frame_is_intra_only(cm) || pbi->refresh_frame_flags != 0xFF) {
+    // Read all ref frame order hints if error_resilient_mode == 1
+    if (cm->error_resilient_mode && cm->seq_params.enable_order_hint) {
+      for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+        // Read order hint from bit stream
+        unsigned int frame_offset =
+            aom_rb_read_literal(rb, cm->seq_params.order_hint_bits_minus_1 + 1);
+        // Get buffer index
+        int buf_idx = cm->ref_frame_map[ref_idx];
+        assert(buf_idx < FRAME_BUFFERS);
+        if (buf_idx == -1 ||
+            frame_offset != frame_bufs[buf_idx].cur_frame_offset) {
+          if (buf_idx >= 0) {
+            lock_buffer_pool(pool);
+            decrease_ref_count(buf_idx, frame_bufs, pool);
+            unlock_buffer_pool(pool);
+          }
+          // If no corresponding buffer exists, allocate a new buffer with all
+          // pixels set to neutral grey.
+          buf_idx = get_free_fb(cm);
+          if (buf_idx == INVALID_IDX) {
+            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                               "Unable to find free frame buffer");
+          }
+          lock_buffer_pool(pool);
+          if (aom_realloc_frame_buffer(
+                  &frame_bufs[buf_idx].buf, cm->seq_params.max_frame_width,
+                  cm->seq_params.max_frame_height, cm->subsampling_x,
+                  cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                  cm->byte_alignment,
+                  &pool->frame_bufs[buf_idx].raw_frame_buffer, pool->get_fb_cb,
+                  pool->cb_priv)) {
+            unlock_buffer_pool(pool);
+            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate frame buffer");
+          }
+          unlock_buffer_pool(pool);
+          set_planes_to_neutral_grey(cm, &frame_bufs[buf_idx].buf, 0);
+
+          cm->ref_frame_map[ref_idx] = buf_idx;
+          frame_bufs[buf_idx].cur_frame_offset = frame_offset;
+        }
+      }
+    }
+  }
+
+  if (cm->frame_type == KEY_FRAME) {
+    setup_frame_size(cm, frame_size_override_flag, rb);
+
+    if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
+      cm->allow_intrabc = aom_rb_read_bit(rb);
+    cm->allow_ref_frame_mvs = 0;
+    cm->prev_frame = NULL;
+  } else {
+    cm->allow_ref_frame_mvs = 0;
+
+    if (cm->intra_only) {
+      cm->cur_frame->film_grain_params_present = cm->film_grain_params_present;
+      setup_frame_size(cm, frame_size_override_flag, rb);
+      if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
+        cm->allow_intrabc = aom_rb_read_bit(rb);
+
+    } else if (pbi->need_resync != 1) { /* Skip if need resync */
+
+      // Frame refs short signaling is off when error resilient mode is on.
+      if (cm->seq_params.enable_order_hint)
+        cm->frame_refs_short_signaling = aom_rb_read_bit(rb);
+
+      if (cm->frame_refs_short_signaling) {
+        // == LAST_FRAME ==
+        const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+        const int lst_idx = cm->ref_frame_map[lst_ref];
+
+        // == GOLDEN_FRAME ==
+        const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+        const int gld_idx = cm->ref_frame_map[gld_ref];
 
         // Most of the time, streams start with a keyframe. In that case,
         // ref_frame_map will have been filled in at that point and will not
@@ -4884,146 +3963,136 @@ static size_t read_uncompressed_header(AV1Decoder *pbi,
         // with an intra-only frame, so long as they don't then signal a
         // reference to a slot that hasn't been set yet. That's what we are
         // checking here.
-        if (idx == -1)
+        if (lst_idx == -1)
           aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                              "Inter frame requests nonexistent reference");
+        if (gld_idx == -1)
+          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                             "Inter frame requests nonexistent reference");
+
+        av1_set_frame_refs(cm, lst_ref, gld_ref);
+      }
+
+      for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+        int ref = 0;
+        if (!cm->frame_refs_short_signaling) {
+          ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+          const int idx = cm->ref_frame_map[ref];
+
+          // Most of the time, streams start with a keyframe. In that case,
+          // ref_frame_map will have been filled in at that point and will not
+          // contain any -1's. However, streams are explicitly allowed to start
+          // with an intra-only frame, so long as they don't then signal a
+          // reference to a slot that hasn't been set yet. That's what we are
+          // checking here.
+          if (idx == -1)
+            aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                               "Inter frame requests nonexistent reference");
+
+          RefBuffer *const ref_frame = &cm->frame_refs[i];
+          ref_frame->idx = idx;
+          ref_frame->buf = &frame_bufs[idx].buf;
+          ref_frame->map_idx = ref;
+        } else {
+          ref = cm->frame_refs[i].map_idx;
+        }
 
-        RefBuffer *const ref_frame = &cm->frame_refs[i];
-        ref_frame->idx = idx;
-        ref_frame->buf = &frame_bufs[idx].buf;
-#if CONFIG_FRAME_SIGN_BIAS
-#if CONFIG_OBU
-        // NOTE: For the scenario of (cm->frame_type != S_FRAME),
-        // ref_frame_sign_bias will be reset based on frame offsets.
         cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
-#endif  // CONFIG_OBU
-#else   // !CONFIG_FRAME_SIGN_BIAS
-#if CONFIG_OBU
-        cm->ref_frame_sign_bias[LAST_FRAME + i] =
-            (cm->frame_type == S_FRAME) ? 0 : aom_rb_read_bit(rb);
-#else   // !CONFIG_OBU
-        cm->ref_frame_sign_bias[LAST_FRAME + i] = aom_rb_read_bit(rb);
-#endif  // CONFIG_OBU
-#endif  // CONFIG_FRAME_SIGN_BIAS
-#if CONFIG_REFERENCE_BUFFER
+
         if (cm->seq_params.frame_id_numbers_present_flag) {
-          int frame_id_length = cm->seq_params.frame_id_length_minus7 + 7;
-          int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2;
-          int delta_frame_id_minus1 = aom_rb_read_literal(rb, diff_len);
+          int frame_id_length = cm->seq_params.frame_id_length;
+          int diff_len = cm->seq_params.delta_frame_id_length;
+          int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len);
           int ref_frame_id =
-              ((cm->current_frame_id - (delta_frame_id_minus1 + 1) +
+              ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) +
                 (1 << frame_id_length)) %
                (1 << frame_id_length));
-          /* Compare values derived from delta_frame_id_minus1 and
-           * refresh_frame_flags. Also, check valid for referencing */
+          // Compare values derived from delta_frame_id_minus_1 and
+          // refresh_frame_flags. Also, check valid for referencing
           if (ref_frame_id != cm->ref_frame_id[ref] ||
               cm->valid_for_referencing[ref] == 0)
             aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                                "Reference buffer frame ID mismatch");
         }
-#endif  // CONFIG_REFERENCE_BUFFER
       }
 
-#if CONFIG_VAR_REFS
-      check_valid_ref_frames(cm);
-#endif  // CONFIG_VAR_REFS
-
-#if CONFIG_FRAME_SIZE
-      if (cm->error_resilient_mode == 0) {
+      if (!cm->error_resilient_mode && frame_size_override_flag) {
         setup_frame_size_with_refs(cm, rb);
       } else {
-        setup_frame_size(cm, rb);
+        setup_frame_size(cm, frame_size_override_flag, rb);
       }
-#else
-      setup_frame_size_with_refs(cm, rb);
-#endif
 
-#if CONFIG_AMVR
-      if (cm->seq_mv_precision_level == 2) {
-        cm->cur_frame_mv_precision_level = aom_rb_read_bit(rb) ? 0 : 1;
+      if (cm->cur_frame_force_integer_mv) {
+        cm->allow_high_precision_mv = 0;
       } else {
-        cm->cur_frame_mv_precision_level = cm->seq_mv_precision_level;
+        cm->allow_high_precision_mv = aom_rb_read_bit(rb);
       }
-#endif
-      cm->allow_high_precision_mv = aom_rb_read_bit(rb);
       cm->interp_filter = read_frame_interp_filter(rb);
-#if CONFIG_TEMPMV_SIGNALING
-      if (frame_might_use_prev_frame_mvs(cm))
-        cm->use_prev_frame_mvs = aom_rb_read_bit(rb);
+      cm->switchable_motion_mode = aom_rb_read_bit(rb);
+    }
+
+    cm->prev_frame = get_prev_frame(cm);
+    if (cm->primary_ref_frame != PRIMARY_REF_NONE &&
+        cm->frame_refs[cm->primary_ref_frame].idx < 0) {
+      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                         "Reference frame containing this frame's initial "
+                         "frame context is unavailable.");
+    }
+
+    if (!cm->intra_only && pbi->need_resync != 1) {
+      if (frame_might_allow_ref_frame_mvs(cm))
+        cm->allow_ref_frame_mvs = aom_rb_read_bit(rb);
       else
-        cm->use_prev_frame_mvs = 0;
-#endif
-      for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+        cm->allow_ref_frame_mvs = 0;
+
+      for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
         RefBuffer *const ref_buf = &cm->frame_refs[i];
-#if CONFIG_HIGHBITDEPTH
-        av1_setup_scale_factors_for_frame(
-            &ref_buf->sf, ref_buf->buf->y_crop_width,
-            ref_buf->buf->y_crop_height, cm->width, cm->height,
-            cm->use_highbitdepth);
-#else
         av1_setup_scale_factors_for_frame(
             &ref_buf->sf, ref_buf->buf->y_crop_width,
             ref_buf->buf->y_crop_height, cm->width, cm->height);
-#endif
+        if ((!av1_is_valid_scale(&ref_buf->sf)))
+          aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                             "Reference frame has invalid dimensions");
       }
     }
   }
 
-#if CONFIG_FRAME_MARKER
-  if (cm->show_frame == 0) {
-    cm->frame_offset = cm->current_video_frame + aom_rb_read_literal(rb, 4);
-  } else {
-    cm->frame_offset = cm->current_video_frame;
-  }
   av1_setup_frame_buf_refs(cm);
 
-#if CONFIG_FRAME_SIGN_BIAS
-#if CONFIG_OBU
-  if (cm->frame_type != S_FRAME)
-#endif  // CONFIG_OBU
-    av1_setup_frame_sign_bias(cm);
-#define FRAME_SIGN_BIAS_DEBUG 0
-#if FRAME_SIGN_BIAS_DEBUG
-  {
-    printf("\n\nDECODER: Frame=%d, show_frame=%d:", cm->current_video_frame,
-           cm->show_frame);
-    MV_REFERENCE_FRAME ref_frame;
-    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-      printf(" sign_bias[%d]=%d", ref_frame,
-             cm->ref_frame_sign_bias[ref_frame]);
-    }
-    printf("\n");
-  }
-#endif  // FRAME_SIGN_BIAS_DEBUG
-#undef FRAME_SIGN_BIAS_DEBUG
-#endif  // CONFIG_FRAME_SIGN_BIAS
-#endif  // CONFIG_FRAME_MARKER
+  av1_setup_frame_sign_bias(cm);
 
-#if CONFIG_TEMPMV_SIGNALING
   cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only;
-#endif
+  cm->cur_frame->frame_type = cm->frame_type;
 
-#if CONFIG_REFERENCE_BUFFER
   if (cm->seq_params.frame_id_numbers_present_flag) {
     /* If bitmask is set, update reference frame id values and
        mark frames as valid for reference */
-    int refresh_frame_flags =
-        cm->frame_type == KEY_FRAME ? 0xFF : pbi->refresh_frame_flags;
-    for (i = 0; i < REF_FRAMES; i++) {
+    int refresh_frame_flags = pbi->refresh_frame_flags;
+    for (int i = 0; i < REF_FRAMES; i++) {
       if ((refresh_frame_flags >> i) & 1) {
         cm->ref_frame_id[i] = cm->current_frame_id;
         cm->valid_for_referencing[i] = 1;
       }
     }
   }
-#endif  // CONFIG_REFERENCE_BUFFER
+
+  const int might_bwd_adapt =
+      !(cm->seq_params.reduced_still_picture_hdr) && !(cm->disable_cdf_update);
+  if (might_bwd_adapt) {
+    cm->refresh_frame_context = aom_rb_read_bit(rb)
+                                    ? REFRESH_FRAME_CONTEXT_DISABLED
+                                    : REFRESH_FRAME_CONTEXT_BACKWARD;
+  } else {
+    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+  }
 
   get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
-  get_frame_new_buffer(cm)->color_space = cm->color_space;
-#if CONFIG_COLORSPACE_HEADERS
-  get_frame_new_buffer(cm)->transfer_function = cm->transfer_function;
+  get_frame_new_buffer(cm)->color_primaries = cm->color_primaries;
+  get_frame_new_buffer(cm)->transfer_characteristics =
+      cm->transfer_characteristics;
+  get_frame_new_buffer(cm)->matrix_coefficients = cm->matrix_coefficients;
+  get_frame_new_buffer(cm)->monochrome = cm->seq_params.monochrome;
   get_frame_new_buffer(cm)->chroma_sample_position = cm->chroma_sample_position;
-#endif
   get_frame_new_buffer(cm)->color_range = cm->color_range;
   get_frame_new_buffer(cm)->render_width = cm->render_width;
   get_frame_new_buffer(cm)->render_height = cm->render_height;
@@ -5034,22 +4103,10 @@ static size_t read_uncompressed_header(AV1Decoder *pbi,
                        " state");
   }
 
-  if (!cm->error_resilient_mode) {
-    cm->refresh_frame_context = aom_rb_read_bit(rb)
-                                    ? REFRESH_FRAME_CONTEXT_FORWARD
-                                    : REFRESH_FRAME_CONTEXT_BACKWARD;
-  } else {
-    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD;
-  }
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  // This flag will be overridden by the call to av1_setup_past_independence
-  // below, forcing the use of context 0 for those frame types.
-  cm->frame_context_idx = aom_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
-#endif
-
   // Generate next_ref_frame_map.
   lock_buffer_pool(pool);
-  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+  int ref_index = 0;
+  for (int mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
     if (mask & 1) {
       cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
       ++frame_bufs[cm->new_fb_idx].ref_count;
@@ -5072,461 +4129,185 @@ static size_t read_uncompressed_header(AV1Decoder *pbi,
   unlock_buffer_pool(pool);
   pbi->hold_ref_buf = 1;
 
-  if (frame_is_intra_only(cm) || cm->error_resilient_mode)
-    av1_setup_past_independence(cm);
+  if (cm->allow_intrabc) {
+    // Set parameters corresponding to no filtering.
+    struct loopfilter *lf = &cm->lf;
+    lf->filter_level[0] = 0;
+    lf->filter_level[1] = 0;
+    cm->cdef_bits = 0;
+    cm->cdef_strengths[0] = 0;
+    cm->nb_cdef_strengths = 1;
+    cm->cdef_uv_strengths[0] = 0;
+    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+  }
 
-  setup_loopfilter(cm, rb);
+  read_tile_info(pbi, rb);
   setup_quantization(cm, rb);
   xd->bd = (int)cm->bit_depth;
 
-#if CONFIG_Q_ADAPT_PROBS
-  av1_default_coef_probs(cm);
-  if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
-      cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
-    for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
-  } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-    if (cm->frame_refs[0].idx <= 0) {
-      cm->frame_contexts[cm->frame_refs[0].idx] = *cm->fc;
-    }
-#else
-    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
-#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
+  if (cm->num_allocated_above_context_planes < av1_num_planes(cm) ||
+      cm->num_allocated_above_context_mi_col < cm->mi_cols ||
+      cm->num_allocated_above_contexts < cm->tile_rows) {
+    av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
+    if (av1_alloc_above_context_buffers(cm, cm->tile_rows))
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate context buffers");
   }
-#endif  // CONFIG_Q_ADAPT_PROBS
 
-  setup_segmentation(cm, rb);
+  if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+    av1_setup_past_independence(cm);
+  }
 
-  {
-    struct segmentation *const seg = &cm->seg;
-    int segment_quantizer_active = 0;
-    for (i = 0; i < MAX_SEGMENTS; i++) {
-      if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) {
-        segment_quantizer_active = 1;
-      }
-    }
+  setup_segmentation(cm, rb);
 
-    cm->delta_q_res = 1;
-#if CONFIG_EXT_DELTA_Q
-    cm->delta_lf_res = 1;
-    cm->delta_lf_present_flag = 0;
-#if CONFIG_LOOPFILTER_LEVEL
-    cm->delta_lf_multi = 0;
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif
-    if (segment_quantizer_active == 0 && cm->base_qindex > 0) {
-      cm->delta_q_present_flag = aom_rb_read_bit(rb);
-    } else {
-      cm->delta_q_present_flag = 0;
-    }
-    if (cm->delta_q_present_flag) {
-      xd->prev_qindex = cm->base_qindex;
-      cm->delta_q_res = 1 << aom_rb_read_literal(rb, 2);
-#if CONFIG_EXT_DELTA_Q
-      assert(!segment_quantizer_active);
-      cm->delta_lf_present_flag = aom_rb_read_bit(rb);
-      if (cm->delta_lf_present_flag) {
-        xd->prev_delta_lf_from_base = 0;
-        cm->delta_lf_res = 1 << aom_rb_read_literal(rb, 2);
-#if CONFIG_LOOPFILTER_LEVEL
-        cm->delta_lf_multi = aom_rb_read_bit(rb);
-        for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id)
-          xd->prev_delta_lf[lf_id] = 0;
-#endif  // CONFIG_LOOPFILTER_LEVEL
-      }
-#endif  // CONFIG_EXT_DELTA_Q
+  cm->delta_q_res = 1;
+  cm->delta_lf_res = 1;
+  cm->delta_lf_present_flag = 0;
+  cm->delta_lf_multi = 0;
+  cm->delta_q_present_flag = cm->base_qindex > 0 ? aom_rb_read_bit(rb) : 0;
+  if (cm->delta_q_present_flag) {
+    xd->current_qindex = cm->base_qindex;
+    cm->delta_q_res = 1 << aom_rb_read_literal(rb, 2);
+    if (!cm->allow_intrabc) cm->delta_lf_present_flag = aom_rb_read_bit(rb);
+    if (cm->delta_lf_present_flag) {
+      cm->delta_lf_res = 1 << aom_rb_read_literal(rb, 2);
+      cm->delta_lf_multi = aom_rb_read_bit(rb);
+      av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
     }
   }
-#if CONFIG_AMVR
-  xd->cur_frame_mv_precision_level = cm->cur_frame_mv_precision_level;
-#endif
 
-  for (i = 0; i < MAX_SEGMENTS; ++i) {
+  xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv;
+
+  for (int i = 0; i < MAX_SEGMENTS; ++i) {
     const int qindex = cm->seg.enabled
                            ? av1_get_qindex(&cm->seg, i, cm->base_qindex)
                            : cm->base_qindex;
     xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
-                      cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+                      cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
+                      cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
     xd->qindex[i] = qindex;
   }
-  cm->all_lossless = all_lossless(cm, xd);
+  cm->coded_lossless = is_coded_lossless(cm, xd);
+  cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
   setup_segmentation_dequant(cm);
-#if CONFIG_CDEF
-  if (!cm->all_lossless) {
-    setup_cdef(cm, rb);
+  if (cm->coded_lossless) {
+    cm->lf.filter_level[0] = 0;
+    cm->lf.filter_level[1] = 0;
   }
-#endif
-#if CONFIG_LOOP_RESTORATION
-  decode_restoration_mode(cm, rb);
-#endif  // CONFIG_LOOP_RESTORATION
-  cm->tx_mode = read_tx_mode(cm, rb);
-  cm->reference_mode = read_frame_reference_mode(cm, rb);
-  if (cm->reference_mode != SINGLE_REFERENCE) setup_compound_reference_mode(cm);
-  read_compound_tools(cm, rb);
-
-#if CONFIG_EXT_TX
-  cm->reduced_tx_set_used = aom_rb_read_bit(rb);
-#endif  // CONFIG_EXT_TX
-
-#if CONFIG_ADAPT_SCAN
-  cm->use_adapt_scan = aom_rb_read_bit(rb);
-  // TODO(angiebird): call av1_init_scan_order only when use_adapt_scan
-  // switches from 1 to 0
-  if (cm->use_adapt_scan == 0) av1_init_scan_order(cm);
-#endif  // CONFIG_ADAPT_SCAN
-
-#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
-  // NOTE(zoeliu): As cm->prev_frame can take neither a frame of
-  //               show_exisiting_frame=1, nor can it take a frame not used as
-  //               a reference, it is probable that by the time it is being
-  //               referred to, the frame buffer it originally points to may
-  //               already get expired and have been reassigned to the current
-  //               newly coded frame. Hence, we need to check whether this is
-  //               the case, and if yes, we have 2 choices:
-  //               (1) Simply disable the use of previous frame mvs; or
-  //               (2) Have cm->prev_frame point to one reference frame buffer,
-  //                   e.g. LAST_FRAME.
-  if (!dec_is_ref_frame_buf(pbi, cm->prev_frame)) {
-    // Reassign the LAST_FRAME buffer to cm->prev_frame.
-    cm->prev_frame =
-        cm->frame_refs[LAST_FRAME - LAST_FRAME].idx != INVALID_IDX
-            ? &cm->buffer_pool
-                   ->frame_bufs[cm->frame_refs[LAST_FRAME - LAST_FRAME].idx]
-            : NULL;
-  }
-#endif  // CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
-
-#if CONFIG_TEMPMV_SIGNALING
-  if (cm->use_prev_frame_mvs && !frame_can_use_prev_frame_mvs(cm)) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "Frame wrongly requests previous frame MVs");
+  if (cm->coded_lossless || !cm->seq_params.enable_cdef) {
+    cm->cdef_bits = 0;
+    cm->cdef_strengths[0] = 0;
+    cm->cdef_uv_strengths[0] = 0;
   }
-#else
-  cm->use_prev_frame_mvs = !cm->error_resilient_mode && cm->prev_frame &&
-#if CONFIG_FRAME_SUPERRES
-                           cm->width == cm->last_width &&
-                           cm->height == cm->last_height &&
-#else
-                           cm->width == cm->prev_frame->buf.y_crop_width &&
-                           cm->height == cm->prev_frame->buf.y_crop_height &&
-#endif  // CONFIG_FRAME_SUPERRES
-                           !cm->last_intra_only && cm->last_show_frame &&
-                           (cm->last_frame_type != KEY_FRAME);
-#endif  // CONFIG_TEMPMV_SIGNALING
-
-#if CONFIG_GLOBAL_MOTION
-  if (!frame_is_intra_only(cm)) read_global_motion(cm, rb);
-#endif
-
-  read_tile_info(pbi, rb);
-  if (use_compressed_header(cm)) {
-    sz = aom_rb_read_literal(rb, 16);
-    if (sz == 0)
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                         "Invalid header size");
-  } else {
-    sz = 0;
+  if (cm->all_lossless || !cm->seq_params.enable_restoration) {
+    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
   }
-  return sz;
-}
+  setup_loopfilter(cm, rb);
 
-#if CONFIG_SUPERTX
-static void read_supertx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
-  int i, j;
-  if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
-    for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
-      for (j = TX_8X8; j < TX_SIZES; ++j) {
-        av1_diff_update_prob(r, &fc->supertx_prob[i][j], ACCT_STR);
-      }
-    }
+  if (!cm->coded_lossless && cm->seq_params.enable_cdef) {
+    setup_cdef(cm, rb);
+  }
+  if (!cm->all_lossless && cm->seq_params.enable_restoration) {
+    decode_restoration_mode(cm, rb);
   }
-}
-#endif  // CONFIG_SUPERTX
-
-static int read_compressed_header(AV1Decoder *pbi, const uint8_t *data,
-                                  size_t partition_size) {
-#if CONFIG_RESTRICT_COMPRESSED_HDR
-  (void)pbi;
-  (void)data;
-  (void)partition_size;
-  return 0;
-#else
-  AV1_COMMON *const cm = &pbi->common;
-#if CONFIG_SUPERTX
-  MACROBLOCKD *const xd = &pbi->mb;
-#endif
-  aom_reader r;
-#if !CONFIG_NEW_MULTISYMBOL
-  FRAME_CONTEXT *const fc = cm->fc;
-  int i;
-#endif
-
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-  r.window_size = 1 << cm->ans_window_size_log2;
-#endif
-  if (aom_reader_init(&r, data, partition_size, pbi->decrypt_cb,
-                      pbi->decrypt_state))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate bool decoder 0");
 
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-  if (cm->tx_mode == TX_MODE_SELECT)
-    av1_diff_update_prob(&r, &fc->quarter_tx_size_prob, ACCT_STR);
-#endif
+  cm->tx_mode = read_tx_mode(cm, rb);
+  cm->reference_mode = read_frame_reference_mode(cm, rb);
+  if (cm->reference_mode != SINGLE_REFERENCE) setup_compound_reference_mode(cm);
 
-#if CONFIG_LV_MAP && !LV_MAP_PROB
-  av1_read_txb_probs(fc, cm->tx_mode, &r, &cm->counts);
-#endif  // CONFIG_LV_MAP && !LV_MAP_PROB
-
-#if !CONFIG_NEW_MULTISYMBOL
-#if CONFIG_VAR_TX
-  if (cm->tx_mode == TX_MODE_SELECT)
-    for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i)
-      av1_diff_update_prob(&r, &fc->txfm_partition_prob[i], ACCT_STR);
-#endif  // CONFIG_VAR_TX
-  for (i = 0; i < SKIP_CONTEXTS; ++i)
-    av1_diff_update_prob(&r, &fc->skip_probs[i], ACCT_STR);
-#endif
+  av1_setup_skip_mode_allowed(cm);
+  cm->skip_mode_flag = cm->is_skip_mode_allowed ? aom_rb_read_bit(rb) : 0;
 
-  if (!frame_is_intra_only(cm)) {
-#if !CONFIG_NEW_MULTISYMBOL
-    read_inter_mode_probs(fc, &r);
-#endif
+  if (frame_might_allow_warped_motion(cm))
+    cm->allow_warped_motion = aom_rb_read_bit(rb);
+  else
+    cm->allow_warped_motion = 0;
 
-#if CONFIG_INTERINTRA
-    if (cm->reference_mode != COMPOUND_REFERENCE &&
-        cm->allow_interintra_compound) {
-#if !CONFIG_NEW_MULTISYMBOL
-      for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
-        if (is_interintra_allowed_bsize_group(i)) {
-          av1_diff_update_prob(&r, &fc->interintra_prob[i], ACCT_STR);
-        }
-      }
-#endif
-#if CONFIG_WEDGE && !CONFIG_NEW_MULTISYMBOL
-#if CONFIG_EXT_PARTITION_TYPES
-      int block_sizes_to_update = BLOCK_SIZES_ALL;
-#else
-      int block_sizes_to_update = BLOCK_SIZES;
-#endif
-      for (i = 0; i < block_sizes_to_update; i++) {
-        if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) {
-          av1_diff_update_prob(&r, &fc->wedge_interintra_prob[i], ACCT_STR);
-        }
-      }
-#endif  // CONFIG_WEDGE
-    }
-#endif  // CONFIG_INTERINTRA
+  cm->reduced_tx_set_used = aom_rb_read_bit(rb);
 
-#if !CONFIG_NEW_MULTISYMBOL
-    for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-      av1_diff_update_prob(&r, &fc->intra_inter_prob[i], ACCT_STR);
-#endif
+  if (cm->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Frame wrongly requests reference frame MVs");
+  }
 
-#if !CONFIG_NEW_MULTISYMBOL
-    read_frame_reference_mode_probs(cm, &r);
-#endif
+  if (!frame_is_intra_only(cm)) read_global_motion(cm, rb);
 
-#if CONFIG_COMPOUND_SINGLEREF
-    for (i = 0; i < COMP_INTER_MODE_CONTEXTS; i++)
-      av1_diff_update_prob(&r, &fc->comp_inter_mode_prob[i], ACCT_STR);
-#endif  // CONFIG_COMPOUND_SINGLEREF
+  cm->cur_frame->film_grain_params_present = cm->film_grain_params_present;
+  read_film_grain(cm, rb);
 
-#if !CONFIG_NEW_MULTISYMBOL
-#if CONFIG_AMVR
-    if (cm->cur_frame_mv_precision_level == 0) {
-#endif
-      for (i = 0; i < NMV_CONTEXTS; ++i)
-        read_mv_probs(&fc->nmvc[i], cm->allow_high_precision_mv, &r);
-#if CONFIG_AMVR
-    }
-#endif
-#endif
-#if CONFIG_SUPERTX
-    if (!xd->lossless[0]) read_supertx_probs(fc, &r);
-#endif
+#if EXT_TILE_DEBUG
+  if (pbi->ext_tile_debug && cm->large_scale_tile) {
+    read_ext_tile_info(pbi, rb);
+    av1_set_single_tile_decoding_mode(cm);
   }
-
-  return aom_reader_has_error(&r);
-#endif  // CONFIG_RESTRICT_COMPRESSED_HDR
-}
-
-#ifdef NDEBUG
-#define debug_check_frame_counts(cm) (void)0
-#else  // !NDEBUG
-// Counts should only be incremented when frame_parallel_decoding_mode and
-// error_resilient_mode are disabled.
-static void debug_check_frame_counts(const AV1_COMMON *const cm) {
-  FRAME_COUNTS zero_counts;
-  av1_zero(zero_counts);
-  assert(cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD ||
-         cm->error_resilient_mode);
-  assert(!memcmp(cm->counts.partition, zero_counts.partition,
-                 sizeof(cm->counts.partition)));
-  assert(!memcmp(cm->counts.switchable_interp, zero_counts.switchable_interp,
-                 sizeof(cm->counts.switchable_interp)));
-  assert(!memcmp(cm->counts.inter_compound_mode,
-                 zero_counts.inter_compound_mode,
-                 sizeof(cm->counts.inter_compound_mode)));
-#if CONFIG_INTERINTRA
-  assert(!memcmp(cm->counts.interintra, zero_counts.interintra,
-                 sizeof(cm->counts.interintra)));
-#if CONFIG_WEDGE
-  assert(!memcmp(cm->counts.wedge_interintra, zero_counts.wedge_interintra,
-                 sizeof(cm->counts.wedge_interintra)));
-#endif  // CONFIG_WEDGE
-#endif  // CONFIG_INTERINTRA
-  assert(!memcmp(cm->counts.compound_interinter,
-                 zero_counts.compound_interinter,
-                 sizeof(cm->counts.compound_interinter)));
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  assert(!memcmp(cm->counts.motion_mode, zero_counts.motion_mode,
-                 sizeof(cm->counts.motion_mode)));
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR
-  assert(!memcmp(cm->counts.ncobmc_mode, zero_counts.ncobmc_mode,
-                 sizeof(cm->counts.ncobmc_mode)));
-#endif
-  assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter,
-                 sizeof(cm->counts.intra_inter)));
-#if CONFIG_COMPOUND_SINGLEREF
-  assert(!memcmp(cm->counts.comp_inter_mode, zero_counts.comp_inter_mode,
-                 sizeof(cm->counts.comp_inter_mode)));
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter,
-                 sizeof(cm->counts.comp_inter)));
-#if CONFIG_EXT_COMP_REFS
-  assert(!memcmp(cm->counts.comp_ref_type, zero_counts.comp_ref_type,
-                 sizeof(cm->counts.comp_ref_type)));
-  assert(!memcmp(cm->counts.uni_comp_ref, zero_counts.uni_comp_ref,
-                 sizeof(cm->counts.uni_comp_ref)));
-#endif  // CONFIG_EXT_COMP_REFS
-  assert(!memcmp(cm->counts.single_ref, zero_counts.single_ref,
-                 sizeof(cm->counts.single_ref)));
-  assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref,
-                 sizeof(cm->counts.comp_ref)));
-#if CONFIG_EXT_REFS
-  assert(!memcmp(cm->counts.comp_bwdref, zero_counts.comp_bwdref,
-                 sizeof(cm->counts.comp_bwdref)));
-#endif  // CONFIG_EXT_REFS
-  assert(!memcmp(&cm->counts.tx_size, &zero_counts.tx_size,
-                 sizeof(cm->counts.tx_size)));
-  assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
-  assert(
-      !memcmp(&cm->counts.mv[0], &zero_counts.mv[0], sizeof(cm->counts.mv[0])));
-  assert(
-      !memcmp(&cm->counts.mv[1], &zero_counts.mv[1], sizeof(cm->counts.mv[0])));
+#endif  // EXT_TILE_DEBUG
+  return 0;
 }
-#endif  // NDEBUG
 
-static struct aom_read_bit_buffer *init_read_bit_buffer(
+struct aom_read_bit_buffer *av1_init_read_bit_buffer(
     AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
-    const uint8_t *data_end, uint8_t clear_data[MAX_AV1_HEADER_SIZE]) {
+    const uint8_t *data_end) {
   rb->bit_offset = 0;
   rb->error_handler = error_handler;
   rb->error_handler_data = &pbi->common;
-  if (pbi->decrypt_cb) {
-    const int n = (int)AOMMIN(MAX_AV1_HEADER_SIZE, data_end - data);
-    pbi->decrypt_cb(pbi->decrypt_state, data, clear_data, n);
-    rb->bit_buffer = clear_data;
-    rb->bit_buffer_end = clear_data + n;
-  } else {
-    rb->bit_buffer = data;
-    rb->bit_buffer_end = data_end;
-  }
+  rb->bit_buffer = data;
+  rb->bit_buffer_end = data_end;
   return rb;
 }
 
-//------------------------------------------------------------------------------
-
-void av1_read_frame_size(struct aom_read_bit_buffer *rb, int *width,
-                         int *height) {
-  *width = aom_rb_read_literal(rb, 16) + 1;
-  *height = aom_rb_read_literal(rb, 16) + 1;
+void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width,
+                         int num_bits_height, int *width, int *height) {
+  *width = aom_rb_read_literal(rb, num_bits_width) + 1;
+  *height = aom_rb_read_literal(rb, num_bits_height) + 1;
 }
 
 BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) {
-  int profile = aom_rb_read_bit(rb);
-  profile |= aom_rb_read_bit(rb) << 1;
-  if (profile > 2) profile += aom_rb_read_bit(rb);
+  int profile = aom_rb_read_literal(rb, PROFILE_BITS);
   return (BITSTREAM_PROFILE)profile;
 }
 
-static void make_update_tile_list_dec(AV1Decoder *pbi, int tile_rows,
-                                      int tile_cols, FRAME_CONTEXT *ec_ctxs[]) {
-  int i;
-  for (i = 0; i < tile_rows * tile_cols; ++i)
-    ec_ctxs[i] = &pbi->tile_data[i].tctx;
-}
-
-#if CONFIG_FRAME_SUPERRES
 void superres_post_decode(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
 
-  if (av1_superres_unscaled(cm)) return;
+  if (!av1_superres_scaled(cm)) return;
+  assert(!cm->all_lossless);
 
   lock_buffer_pool(pool);
   av1_superres_upscale(cm, pool);
   unlock_buffer_pool(pool);
 }
-#endif  // CONFIG_FRAME_SUPERRES
-
-static void dec_setup_frame_boundary_info(AV1_COMMON *const cm) {
-// Note: When LOOPFILTERING_ACROSS_TILES is enabled, we need to clear the
-// boundary information every frame, since the tile boundaries may
-// change every frame (particularly when dependent-horztiles is also
-// enabled); when it is disabled, the only information stored is the frame
-// boundaries, which only depend on the frame size.
-#if !CONFIG_LOOPFILTERING_ACROSS_TILES
-  if (cm->width != cm->last_width || cm->height != cm->last_height)
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
-  {
-    int row, col;
-    for (row = 0; row < cm->mi_rows; ++row) {
-      MODE_INFO *mi = cm->mi + row * cm->mi_stride;
-      for (col = 0; col < cm->mi_cols; ++col) {
-        mi->mbmi.boundary_info = 0;
-        mi++;
-      }
-    }
-    av1_setup_frame_boundary_info(cm);
-  }
-}
 
-size_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, const uint8_t *data,
-                                          const uint8_t *data_end,
-                                          const uint8_t **p_data_end) {
+int av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
+                                       struct aom_read_bit_buffer *rb,
+                                       const uint8_t *data,
+                                       const uint8_t **p_data_end,
+                                       int trailing_bits_present) {
   AV1_COMMON *const cm = &pbi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &pbi->mb;
-  struct aom_read_bit_buffer rb;
-  uint8_t clear_data[MAX_AV1_HEADER_SIZE];
-  size_t first_partition_size;
-  YV12_BUFFER_CONFIG *new_fb;
-#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
-  RefBuffer *last_fb_ref_buf = &cm->frame_refs[LAST_FRAME - LAST_FRAME];
-#endif  // CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
-
-#if CONFIG_ADAPT_SCAN
-  av1_deliver_eob_threshold(cm, xd);
-#endif
+
 #if CONFIG_BITSTREAM_DEBUG
   bitstream_queue_set_frame_read(cm->current_video_frame * 2 + cm->show_frame);
 #endif
+#if CONFIG_MISMATCH_DEBUG
+  mismatch_move_frame_idx_r();
+#endif
 
-#if CONFIG_GLOBAL_MOTION
-  int i;
-  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
     cm->global_motion[i] = default_warp_params;
     cm->cur_frame->global_motion[i] = default_warp_params;
   }
   xd->global_motion = cm->global_motion;
-#endif  // CONFIG_GLOBAL_MOTION
 
-  first_partition_size = read_uncompressed_header(
-      pbi, init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
+  read_uncompressed_header(pbi, rb);
+
+  if (trailing_bits_present) av1_check_trailing_bits(pbi, rb);
 
-#if CONFIG_EXT_TILE
   // If cm->single_tile_decoding = 0, the independent decoding of a single tile
   // or a section of a frame is not allowed.
   if (!cm->single_tile_decoding &&
@@ -5534,268 +4315,160 @@ size_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, const uint8_t *data,
     pbi->dec_tile_row = -1;
     pbi->dec_tile_col = -1;
   }
-#endif  // CONFIG_EXT_TILE
 
-  pbi->first_partition_size = first_partition_size;
-  pbi->uncomp_hdr_size = aom_rb_bytes_read(&rb);
-  new_fb = get_frame_new_buffer(cm);
+  pbi->uncomp_hdr_size = aom_rb_bytes_read(rb);
+  YV12_BUFFER_CONFIG *new_fb = get_frame_new_buffer(cm);
   xd->cur_buf = new_fb;
-#if CONFIG_INTRABC
-#if CONFIG_HIGHBITDEPTH
-  av1_setup_scale_factors_for_frame(
-      &xd->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height,
-      xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height,
-      cm->use_highbitdepth);
-#else
-  av1_setup_scale_factors_for_frame(
-      &xd->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height,
-      xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height);
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_INTRABC
+  if (av1_allow_intrabc(cm)) {
+    av1_setup_scale_factors_for_frame(
+        &cm->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height,
+        xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height);
+  }
 
   if (cm->show_existing_frame) {
     // showing a frame directly
-    *p_data_end = data + aom_rb_bytes_read(&rb);
+    *p_data_end = data + aom_rb_bytes_read(rb);
+    if (cm->reset_decoder_state) {
+      // Use the default frame context values.
+      *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
+      if (!cm->fc->initialized)
+        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                           "Uninitialized entropy context.");
+    }
     return 0;
   }
 
-  data += aom_rb_bytes_read(&rb);
-  if (first_partition_size)
-    if (!read_is_valid(data, first_partition_size, data_end))
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                         "Truncated packet or corrupt header length");
-
   cm->setup_mi(cm);
 
-#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
-  // NOTE(zoeliu): As cm->prev_frame can take neither a frame of
-  //               show_exisiting_frame=1, nor can it take a frame not used as
-  //               a reference, it is probable that by the time it is being
-  //               referred to, the frame buffer it originally points to may
-  //               already get expired and have been reassigned to the current
-  //               newly coded frame. Hence, we need to check whether this is
-  //               the case, and if yes, we have 2 choices:
-  //               (1) Simply disable the use of previous frame mvs; or
-  //               (2) Have cm->prev_frame point to one reference frame buffer,
-  //                   e.g. LAST_FRAME.
-  if (!dec_is_ref_frame_buf(pbi, cm->prev_frame)) {
-    // Reassign the LAST_FRAME buffer to cm->prev_frame.
-    cm->prev_frame = last_fb_ref_buf->idx != INVALID_IDX
-                         ? &cm->buffer_pool->frame_bufs[last_fb_ref_buf->idx]
-                         : NULL;
-  }
-#endif  // CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
-
-#if CONFIG_TEMPMV_SIGNALING
-  if (cm->use_prev_frame_mvs && !frame_can_use_prev_frame_mvs(cm)) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                       "Frame wrongly requests previous frame MVs");
-  }
-#else
-  cm->use_prev_frame_mvs = !cm->error_resilient_mode && cm->prev_frame &&
-#if CONFIG_FRAME_SUPERRES
-                           cm->width == cm->last_width &&
-                           cm->height == cm->last_height &&
-#else
-                           cm->width == cm->prev_frame->buf.y_crop_width &&
-                           cm->height == cm->prev_frame->buf.y_crop_height &&
-#endif  // CONFIG_FRAME_SUPERRES
-                           !cm->last_intra_only && cm->last_show_frame &&
-                           (cm->last_frame_type != KEY_FRAME);
-#endif  // CONFIG_TEMPMV_SIGNALING
-
-#if CONFIG_MFMV
+  cm->current_frame_seg_map = cm->cur_frame->seg_map;
+
   av1_setup_motion_field(cm);
-#endif  // CONFIG_MFMV
 
-  av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  if (cm->error_resilient_mode || frame_is_intra_only(cm)) {
+  av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y, num_planes);
+  if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
     // use the default frame context values
     *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
-    cm->pre_fc = &cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
   } else {
-    *cm->fc = cm->frame_contexts[cm->frame_refs[0].idx];
-    cm->pre_fc = &cm->frame_contexts[cm->frame_refs[0].idx];
+    *cm->fc = cm->frame_contexts[cm->frame_refs[cm->primary_ref_frame].idx];
   }
-#else
-  *cm->fc = cm->frame_contexts[cm->frame_context_idx];
-  cm->pre_fc = &cm->frame_contexts[cm->frame_context_idx];
-#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
   if (!cm->fc->initialized)
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Uninitialized entropy context.");
 
-  av1_zero(cm->counts);
-
   xd->corrupted = 0;
-  if (first_partition_size) {
-    new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
-    if (new_fb->corrupted)
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                         "Decode failed. Frame data header is corrupted.");
-  }
-  return first_partition_size;
+  return 0;
 }
 
-void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
-                                    const uint8_t *data_end,
-                                    const uint8_t **p_data_end, int startTile,
-                                    int endTile, int initialize_flag) {
+// Once-per-frame initialization
+static void setup_frame_info(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
-  MACROBLOCKD *const xd = &pbi->mb;
-  int context_updated = 0;
 
-#if CONFIG_LOOP_RESTORATION
   if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
       cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
       cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
     av1_alloc_restoration_buffers(cm);
   }
-#endif
-
-#if !CONFIG_LOOPFILTER_LEVEL
-  if (cm->lf.filter_level && !cm->skip_loop_filter) {
-    av1_loop_filter_frame_init(cm, cm->lf.filter_level, cm->lf.filter_level);
+  const int use_highbd = cm->use_highbitdepth ? 1 : 0;
+  const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
+  if (pbi->td.mc_buf_size != buf_size) {
+    av1_free_mc_tmp_buf(&pbi->td, use_highbd);
+    allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd);
   }
-#endif
+}
 
-  // If encoded in frame parallel mode, frame context is ready after decoding
-  // the frame header.
-  if (cm->frame_parallel_decode && initialize_flag &&
-      cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD) {
-    AVxWorker *const worker = pbi->frame_worker_owner;
-    FrameWorkerData *const frame_worker_data = worker->data1;
-    if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD) {
-      context_updated = 1;
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-      cm->frame_contexts[cm->new_fb_idx] = *cm->fc;
-#else
-      cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
-#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
-    }
-    av1_frameworker_lock_stats(worker);
-    pbi->cur_buf->row = -1;
-    pbi->cur_buf->col = -1;
-    frame_worker_data->frame_context_ready = 1;
-    // Signal the main thread that context is ready.
-    av1_frameworker_signal_stats(worker);
-    av1_frameworker_unlock_stats(worker);
-  }
-
-  dec_setup_frame_boundary_info(cm);
-
-  if (pbi->max_threads > 1 && !CONFIG_CB4X4 &&
-#if CONFIG_EXT_TILE
-      pbi->dec_tile_col < 0 &&  // Decoding all columns
-#endif                          // CONFIG_EXT_TILE
-      cm->tile_cols > 1) {
-    // Multi-threaded tile decoder
-    *p_data_end =
-        decode_tiles_mt(pbi, data + pbi->first_partition_size, data_end);
-    if (!xd->corrupted) {
-      if (!cm->skip_loop_filter) {
-// If multiple threads are used to decode tiles, then we use those
-// threads to do parallel loopfiltering.
-#if CONFIG_LOOPFILTER_LEVEL
-        av1_loop_filter_frame_mt(
-            (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, pbi->mb.plane,
-            cm->lf.filter_level[0], cm->lf.filter_level[1], 0, 0,
-            pbi->tile_workers, pbi->num_tile_workers, &pbi->lf_row_sync);
-#else
-        av1_loop_filter_frame_mt((YV12_BUFFER_CONFIG *)xd->cur_buf, cm,
-                                 pbi->mb.plane, cm->lf.filter_level, 0, 0,
-                                 pbi->tile_workers, pbi->num_tile_workers,
-                                 &pbi->lf_row_sync);
-#endif  // CONFIG_LOOPFILTER_LEVEL
-      }
-    } else {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
-                         "Decode failed. Frame data is corrupted.");
-    }
-  } else {
-#if CONFIG_OBU
-    *p_data_end = decode_tiles(pbi, data, data_end, startTile, endTile);
-#else
-    *p_data_end = decode_tiles(
-        pbi, data + pbi->uncomp_hdr_size + pbi->first_partition_size, data_end,
-        startTile, endTile);
-#endif
-  }
+void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
+                                    const uint8_t *data_end,
+                                    const uint8_t **p_data_end, int start_tile,
+                                    int end_tile, int initialize_flag) {
+  AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  const int tile_count_tg = end_tile - start_tile + 1;
 
-  if (endTile != cm->tile_rows * cm->tile_cols - 1) {
-    return;
-  }
+  if (initialize_flag) setup_frame_info(pbi);
 
-#if CONFIG_STRIPED_LOOP_RESTORATION
-  if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
-      cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
-      cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
-    av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm);
-  }
-#endif
+  if (pbi->max_threads > 1 && tile_count_tg > 1 && !cm->large_scale_tile)
+    *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile);
+  else
+    *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
 
-#if CONFIG_CDEF
-  if (!cm->skip_loop_filter && !cm->all_lossless) {
-    av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
+  const int num_planes = av1_num_planes(cm);
+  // If the bit stream is monochrome, set the U and V buffers to a constant.
+  if (num_planes < 3) set_planes_to_neutral_grey(cm, xd->cur_buf, 1);
+
+  if (end_tile != cm->tile_rows * cm->tile_cols - 1) {
+    return;
   }
-#endif  // CONFIG_CDEF
 
-#if CONFIG_FRAME_SUPERRES
-  superres_post_decode(pbi);
-#endif  // CONFIG_FRAME_SUPERRES
+  if (!cm->allow_intrabc && !cm->single_tile_decoding) {
+    if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
+#if LOOP_FILTER_BITMASK
+      av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 0,
+                            num_planes, 0);
+#else
+      if (pbi->num_workers > 1) {
+        av1_loop_filter_frame_mt(get_frame_new_buffer(cm), cm, &pbi->mb, 0,
+                                 num_planes, 0, pbi->tile_workers,
+                                 pbi->num_workers, &pbi->lf_row_sync);
+      } else {
+        av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 0,
+                              num_planes, 0);
+      }
+#endif
+    }
 
-#if CONFIG_LOOP_RESTORATION
-  if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
-      cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
-      cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
-    aom_extend_frame_borders((YV12_BUFFER_CONFIG *)xd->cur_buf);
-    av1_loop_restoration_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, cm,
-                               cm->rst_info, 7, 0, NULL);
+    const int do_loop_restoration =
+        cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
+    const int do_cdef =
+        !cm->skip_loop_filter && !cm->coded_lossless &&
+        (cm->cdef_bits || cm->cdef_strengths[0] || cm->cdef_uv_strengths[0]);
+    const int do_superres = av1_superres_scaled(cm);
+    const int optimized_loop_restoration = !do_cdef && !do_superres;
+
+    if (!optimized_loop_restoration) {
+      if (do_loop_restoration)
+        av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm, 0);
+
+      if (do_cdef) av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
+
+      superres_post_decode(pbi);
+
+      if (do_loop_restoration) {
+        av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm, 1);
+        if (pbi->num_workers > 1) {
+          av1_loop_restoration_filter_frame_mt(
+              (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration,
+              pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync,
+              &pbi->lr_ctxt);
+        } else {
+          av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
+                                            cm, optimized_loop_restoration,
+                                            &pbi->lr_ctxt);
+        }
+      }
+    } else {
+      // In no cdef and no superres case. Provide an optimized version of
+      // loop_restoration_filter.
+      if (do_loop_restoration) {
+        if (pbi->num_workers > 1) {
+          av1_loop_restoration_filter_frame_mt(
+              (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration,
+              pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync,
+              &pbi->lr_ctxt);
+        } else {
+          av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
+                                            cm, optimized_loop_restoration,
+                                            &pbi->lr_ctxt);
+        }
+      }
+    }
   }
-#endif  // CONFIG_LOOP_RESTORATION
 
   if (!xd->corrupted) {
     if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
-      FRAME_CONTEXT **tile_ctxs = aom_malloc(cm->tile_rows * cm->tile_cols *
-                                             sizeof(&pbi->tile_data[0].tctx));
-      aom_cdf_prob **cdf_ptrs =
-          aom_malloc(cm->tile_rows * cm->tile_cols *
-                     sizeof(&pbi->tile_data[0].tctx.partition_cdf[0][0]));
-      make_update_tile_list_dec(pbi, cm->tile_rows, cm->tile_cols, tile_ctxs);
-#if CONFIG_LV_MAP
-      av1_adapt_coef_probs(cm);
-#endif  // CONFIG_LV_MAP
-#if CONFIG_SYMBOLRATE
-      av1_dump_symbol_rate(cm);
-#endif
-      av1_adapt_intra_frame_probs(cm);
-      av1_average_tile_coef_cdfs(pbi->common.fc, tile_ctxs, cdf_ptrs,
-                                 cm->tile_rows * cm->tile_cols);
-      av1_average_tile_intra_cdfs(pbi->common.fc, tile_ctxs, cdf_ptrs,
-                                  cm->tile_rows * cm->tile_cols);
-#if CONFIG_PVQ
-      av1_average_tile_pvq_cdfs(pbi->common.fc, tile_ctxs,
-                                cm->tile_rows * cm->tile_cols);
-#endif  // CONFIG_PVQ
-#if CONFIG_ADAPT_SCAN
-      av1_adapt_scan_order(cm);
-#endif  // CONFIG_ADAPT_SCAN
-
-      if (!frame_is_intra_only(cm)) {
-        av1_adapt_inter_frame_probs(cm);
-#if !CONFIG_NEW_MULTISYMBOL
-        av1_adapt_mv_probs(cm, cm->allow_high_precision_mv);
-#endif
-        av1_average_tile_inter_cdfs(&pbi->common, pbi->common.fc, tile_ctxs,
-                                    cdf_ptrs, cm->tile_rows * cm->tile_cols);
-        av1_average_tile_mv_cdfs(pbi->common.fc, tile_ctxs, cdf_ptrs,
-                                 cm->tile_rows * cm->tile_cols);
-      }
-      aom_free(tile_ctxs);
-      aom_free(cdf_ptrs);
-    } else {
-      debug_check_frame_counts(cm);
+      *cm->fc = pbi->tile_data[cm->context_update_tile_id].tctx;
+      av1_reset_cdf_symbol_counters(cm->fc);
     }
   } else {
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
@@ -5808,153 +4481,8 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
   }
 #endif
 
-// Non frame parallel update frame context here.
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  if (!context_updated) cm->frame_contexts[cm->new_fb_idx] = *cm->fc;
-#else
-  if (!cm->error_resilient_mode && !context_updated)
-    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
-#endif
-}
-
-#if CONFIG_OBU
-
-static OBU_TYPE read_obu_header(struct aom_read_bit_buffer *rb,
-                                uint32_t *header_size) {
-  OBU_TYPE obu_type;
-  int obu_extension_flag;
-
-  *header_size = 1;
-
-  obu_type = (OBU_TYPE)aom_rb_read_literal(rb, 5);
-  aom_rb_read_literal(rb, 2);  // reserved
-  obu_extension_flag = aom_rb_read_bit(rb);
-  if (obu_extension_flag) {
-    *header_size += 1;
-    aom_rb_read_literal(rb, 3);  // temporal_id
-    aom_rb_read_literal(rb, 2);
-    aom_rb_read_literal(rb, 2);
-    aom_rb_read_literal(rb, 1);  // reserved
-  }
-
-  return obu_type;
-}
-
-static uint32_t read_temporal_delimiter_obu() { return 0; }
-
-static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
-                                         struct aom_read_bit_buffer *rb) {
-  AV1_COMMON *const cm = &pbi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
-  uint32_t saved_bit_offset = rb->bit_offset;
-
-  cm->profile = av1_read_profile(rb);
-  aom_rb_read_literal(rb, 4);  // level
-
-  seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb);
-  if (seq_params->frame_id_numbers_present_flag) {
-    seq_params->frame_id_length_minus7 = aom_rb_read_literal(rb, 4);
-    seq_params->delta_frame_id_length_minus2 = aom_rb_read_literal(rb, 4);
-  }
-
-  read_bitdepth_colorspace_sampling(cm, rb, pbi->allow_lowbitdepth);
-
-  return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
-}
-
-static uint32_t read_frame_header_obu(AV1Decoder *pbi, const uint8_t *data,
-                                      const uint8_t *data_end,
-                                      const uint8_t **p_data_end) {
-  size_t header_size;
-
-  header_size =
-      av1_decode_frame_headers_and_setup(pbi, data, data_end, p_data_end);
-  return (uint32_t)(pbi->uncomp_hdr_size + header_size);
-}
-
-static uint32_t read_tile_group_header(AV1Decoder *pbi,
-                                       struct aom_read_bit_buffer *rb,
-                                       int *startTile, int *endTile) {
-  AV1_COMMON *const cm = &pbi->common;
-  uint32_t saved_bit_offset = rb->bit_offset;
-
-  *startTile = aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
-  *endTile = aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
-
-  return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
-}
-
-static uint32_t read_one_tile_group_obu(AV1Decoder *pbi,
-                                        struct aom_read_bit_buffer *rb,
-                                        int is_first_tg, const uint8_t *data,
-                                        const uint8_t *data_end,
-                                        const uint8_t **p_data_end,
-                                        int *is_last_tg) {
-  AV1_COMMON *const cm = &pbi->common;
-  int startTile, endTile;
-  uint32_t header_size, tg_payload_size;
-
-  header_size = read_tile_group_header(pbi, rb, &startTile, &endTile);
-  data += header_size;
-  av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, startTile,
-                                 endTile, is_first_tg);
-  tg_payload_size = (uint32_t)(*p_data_end - data);
-
-  // TODO(shan):  For now, assume all tile groups received in order
-  *is_last_tg = endTile == cm->tile_rows * cm->tile_cols - 1;
-
-  return header_size + tg_payload_size;
-}
-
-void av1_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
-                                const uint8_t *data_end,
-                                const uint8_t **p_data_end) {
-  AV1_COMMON *const cm = &pbi->common;
-  int frame_decoding_finished = 0;
-  int is_first_tg_obu_received = 1;
-  int frame_header_received = 0;
-  int frame_header_size = 0;
-
-  // decode frame as a series of OBUs
-  while (!frame_decoding_finished && !cm->error.error_code) {
-    struct aom_read_bit_buffer rb;
-    uint8_t clear_data[80];
-    uint32_t obu_size, obu_header_size, obu_payload_size = 0;
-    OBU_TYPE obu_type;
-
-    init_read_bit_buffer(pbi, &rb, data + 4, data_end, clear_data);
-
-    // every obu is preceded by 4-byte size of obu (obu header + payload size)
-    // The obu size is only needed for tile group OBUs
-    obu_size = mem_get_le32(data);
-    obu_type = read_obu_header(&rb, &obu_header_size);
-    data += (4 + obu_header_size);
-
-    switch (obu_type) {
-      case OBU_TD: obu_payload_size = read_temporal_delimiter_obu(); break;
-      case OBU_SEQUENCE_HEADER:
-        obu_payload_size = read_sequence_header_obu(pbi, &rb);
-        break;
-      case OBU_FRAME_HEADER:
-        // Only decode first frame header received
-        if (!frame_header_received) {
-          frame_header_size = obu_payload_size =
-              read_frame_header_obu(pbi, data, data_end, p_data_end);
-          frame_header_received = 1;
-        } else {
-          obu_payload_size = frame_header_size;
-        }
-        if (cm->show_existing_frame) frame_decoding_finished = 1;
-        break;
-      case OBU_TILE_GROUP:
-        obu_payload_size = read_one_tile_group_obu(
-            pbi, &rb, is_first_tg_obu_received, data, data + obu_size - 1,
-            p_data_end, &frame_decoding_finished);
-        is_first_tg_obu_received = 0;
-        break;
-      default: break;
-    }
-    data += obu_payload_size;
+  // Non frame parallel update frame context here.
+  if (!cm->large_scale_tile) {
+    cm->frame_contexts[cm->new_fb_idx] = *cm->fc;
   }
 }
-#endif
diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h
index 0e7eb6a1d..330cedcdc 100644
--- a/third_party/aom/av1/decoder/decodeframe.h
+++ b/third_party/aom/av1/decoder/decodeframe.h
@@ -19,35 +19,59 @@ extern "C" {
 struct AV1Decoder;
 struct aom_read_bit_buffer;
 
-#if CONFIG_REFERENCE_BUFFER
-/* Placeholder for now */
-void read_sequence_header(SequenceHeader *seq_params,
-                          struct aom_read_bit_buffer *rb);
-#endif
+// Reads the middle part of the sequence header OBU (from
+// frame_width_bits_minus_1 to enable_restoration) into cm->seq_params (a
+// SequenceHeader). Reports errors by calling rb->error_handler() or
+// aom_internal_error().
+void read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb);
 
-void av1_read_frame_size(struct aom_read_bit_buffer *rb, int *width,
-                         int *height);
+void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width,
+                         int num_bits_height, int *width, int *height);
 BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb);
 
-// This function is now obsolete
-void av1_decode_frame(struct AV1Decoder *pbi, const uint8_t *data,
-                      const uint8_t *data_end, const uint8_t **p_data_end);
-size_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi,
-                                          const uint8_t *data,
-                                          const uint8_t *data_end,
-                                          const uint8_t **p_data_end);
+// Returns 0 on success. Sets pbi->common.error.error_code and returns -1 on
+// failure.
+int av1_check_trailing_bits(struct AV1Decoder *pbi,
+                            struct aom_read_bit_buffer *rb);
+
+int av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi,
+                                       struct aom_read_bit_buffer *rb,
+                                       const uint8_t *data,
+                                       const uint8_t **p_data_end,
+                                       int trailing_bits_present);
 
 void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data,
                                     const uint8_t *data_end,
                                     const uint8_t **p_data_end, int startTile,
                                     int endTile, int initialize_flag);
 
-#if CONFIG_OBU
-// replaces av1_decode_frame
-void av1_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
-                                const uint8_t *data_end,
-                                const uint8_t **p_data_end);
-#endif
+// Implements the color_config() function in the spec. Reports errors by
+// calling rb->error_handler() or aom_internal_error().
+void av1_read_color_config(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+                           int allow_lowbitdepth);
+
+// Implements the timing_info() function in the spec. Reports errors by calling
+// rb->error_handler().
+void av1_read_timing_info_header(AV1_COMMON *cm,
+                                 struct aom_read_bit_buffer *rb);
+
+// Implements the decoder_model_info() function in the spec. Reports errors by
+// calling rb->error_handler().
+void av1_read_decoder_model_info(AV1_COMMON *cm,
+                                 struct aom_read_bit_buffer *rb);
+
+// Implements the operating_parameters_info() function in the spec. Reports
+// errors by calling rb->error_handler() or aom_internal_error().
+void av1_read_op_parameters_info(AV1_COMMON *const cm,
+                                 struct aom_read_bit_buffer *rb, int op_num);
+
+struct aom_read_bit_buffer *av1_init_read_bit_buffer(
+    struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+    const uint8_t *data_end);
+
+void av1_free_mc_tmp_buf(void *td, int use_highbd);
+
+void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c
index cac27e9a6..cc8f4d29e 100644
--- a/third_party/aom/av1/decoder/decodemv.c
+++ b/third_party/aom/av1/decoder/decodemv.c
@@ -11,6 +11,7 @@
 
 #include <assert.h>
 
+#include "av1/common/cfl.h"
 #include "av1/common/common.h"
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
@@ -18,13 +19,9 @@
 #include "av1/common/mvref_common.h"
 #include "av1/common/pred_common.h"
 #include "av1/common/reconinter.h"
-#if CONFIG_EXT_INTRA
 #include "av1/common/reconintra.h"
-#endif  // CONFIG_EXT_INTRA
 #include "av1/common/seg_common.h"
-#if CONFIG_WARPED_MOTION
 #include "av1/common/warped_motion.h"
-#endif  // CONFIG_WARPED_MOTION
 
 #include "av1/decoder/decodeframe.h"
 #include "av1/decoder/decodemv.h"
@@ -39,30 +36,51 @@ static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
   return (PREDICTION_MODE)aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR);
 }
 
-static int read_delta_qindex(AV1_COMMON *cm, MACROBLOCKD *xd, aom_reader *r,
-                             MB_MODE_INFO *const mbmi, int mi_col, int mi_row) {
-  FRAME_COUNTS *counts = xd->counts;
+static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd,
+                      int mi_col, int mi_row) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  if (cm->coded_lossless) return;
+  if (cm->allow_intrabc) {
+    assert(cm->cdef_bits == 0);
+    return;
+  }
+
+  if (!(mi_col & (cm->seq_params.mib_size - 1)) &&
+      !(mi_row & (cm->seq_params.mib_size - 1))) {  // Top left?
+    xd->cdef_preset[0] = xd->cdef_preset[1] = xd->cdef_preset[2] =
+        xd->cdef_preset[3] = -1;
+  }
+  // Read CDEF param at the first non-skip coding block
+  const int mask = (1 << (6 - MI_SIZE_LOG2));
+  const int m = ~(mask - 1);
+  const int index = cm->seq_params.sb_size == BLOCK_128X128
+                        ? !!(mi_col & mask) + 2 * !!(mi_row & mask)
+                        : 0;
+  cm->mi_grid_visible[(mi_row & m) * cm->mi_stride + (mi_col & m)]
+      ->cdef_strength = xd->cdef_preset[index] =
+      xd->cdef_preset[index] == -1 && !mbmi->skip
+          ? aom_read_literal(r, cm->cdef_bits, ACCT_STR)
+          : xd->cdef_preset[index];
+}
+
+static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd,
+                             aom_reader *r, MB_MODE_INFO *const mbmi,
+                             int mi_col, int mi_row) {
   int sign, abs, reduced_delta_qindex = 0;
   BLOCK_SIZE bsize = mbmi->sb_type;
-  const int b_col = mi_col & MAX_MIB_MASK;
-  const int b_row = mi_row & MAX_MIB_MASK;
+  const int b_col = mi_col & (cm->seq_params.mib_size - 1);
+  const int b_row = mi_row & (cm->seq_params.mib_size - 1);
   const int read_delta_q_flag = (b_col == 0 && b_row == 0);
-  int rem_bits, thr;
-  int i, smallval;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  (void)cm;
 
-  if ((bsize != BLOCK_LARGEST || mbmi->skip == 0) && read_delta_q_flag) {
+  if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) &&
+      read_delta_q_flag) {
     abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR);
-    smallval = (abs < DELTA_Q_SMALL);
-    if (counts) {
-      for (i = 0; i < abs; ++i) counts->delta_q[i][1]++;
-      if (smallval) counts->delta_q[abs][0]++;
-    }
+    const int smallval = (abs < DELTA_Q_SMALL);
 
     if (!smallval) {
-      rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
-      thr = (1 << rem_bits) + 1;
+      const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
+      const int thr = (1 << rem_bits) + 1;
       abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
     }
 
@@ -76,56 +94,33 @@ static int read_delta_qindex(AV1_COMMON *cm, MACROBLOCKD *xd, aom_reader *r,
   }
   return reduced_delta_qindex;
 }
-#if CONFIG_EXT_DELTA_Q
-static int read_delta_lflevel(AV1_COMMON *cm, MACROBLOCKD *xd, aom_reader *r,
-#if CONFIG_LOOPFILTER_LEVEL
-                              int lf_id,
-#endif
+static int read_delta_lflevel(AV1_COMMON *cm, const MACROBLOCKD *xd,
+                              aom_reader *r, int lf_id,
                               MB_MODE_INFO *const mbmi, int mi_col,
                               int mi_row) {
-  FRAME_COUNTS *counts = xd->counts;
   int sign, abs, reduced_delta_lflevel = 0;
   BLOCK_SIZE bsize = mbmi->sb_type;
-  const int b_col = mi_col & MAX_MIB_MASK;
-  const int b_row = mi_row & MAX_MIB_MASK;
+  const int b_col = mi_col & (cm->seq_params.mib_size - 1);
+  const int b_row = mi_row & (cm->seq_params.mib_size - 1);
   const int read_delta_lf_flag = (b_col == 0 && b_row == 0);
-  int rem_bits, thr;
-  int i, smallval;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  (void)cm;
 
-  if ((bsize != cm->sb_size || mbmi->skip == 0) && read_delta_lf_flag) {
-#if CONFIG_LOOPFILTER_LEVEL
+  if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) &&
+      read_delta_lf_flag) {
     if (cm->delta_lf_multi) {
-      assert(lf_id >= 0 && lf_id < FRAME_LF_COUNT);
+      assert(lf_id >= 0 &&
+             lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
+                                             : FRAME_LF_COUNT - 2));
       abs = aom_read_symbol(r, ec_ctx->delta_lf_multi_cdf[lf_id],
                             DELTA_LF_PROBS + 1, ACCT_STR);
     } else {
       abs = aom_read_symbol(r, ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1,
                             ACCT_STR);
     }
-#else
-    abs =
-        aom_read_symbol(r, ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1, ACCT_STR);
-#endif  // CONFIG_LOOPFILTER_LEVEL
-    smallval = (abs < DELTA_LF_SMALL);
-    if (counts) {
-#if CONFIG_LOOPFILTER_LEVEL
-      if (cm->delta_lf_multi) {
-        for (i = 0; i < abs; ++i) counts->delta_lf_multi[lf_id][i][1]++;
-        if (smallval) counts->delta_lf_multi[lf_id][abs][0]++;
-      } else {
-        for (i = 0; i < abs; ++i) counts->delta_lf[i][1]++;
-        if (smallval) counts->delta_lf[abs][0]++;
-      }
-#else
-      for (i = 0; i < abs; ++i) counts->delta_lf[i][1]++;
-      if (smallval) counts->delta_lf[abs][0]++;
-#endif  // CONFIG_LOOPFILTER_LEVEL
-    }
+    const int smallval = (abs < DELTA_LF_SMALL);
     if (!smallval) {
-      rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
-      thr = (1 << rem_bits) + 1;
+      const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
+      const int thr = (1 << rem_bits) + 1;
       abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
     }
 
@@ -139,21 +134,17 @@ static int read_delta_lflevel(AV1_COMMON *cm, MACROBLOCKD *xd, aom_reader *r,
   }
   return reduced_delta_lflevel;
 }
-#endif
 
 static UV_PREDICTION_MODE read_intra_mode_uv(FRAME_CONTEXT *ec_ctx,
                                              aom_reader *r,
+                                             CFL_ALLOWED_TYPE cfl_allowed,
                                              PREDICTION_MODE y_mode) {
   const UV_PREDICTION_MODE uv_mode =
-#if CONFIG_CFL
-      aom_read_symbol(r, ec_ctx->uv_mode_cdf[y_mode], UV_INTRA_MODES, ACCT_STR);
-#else
-      read_intra_mode(r, ec_ctx->uv_mode_cdf[y_mode]);
-#endif  // CONFIG_CFL
+      aom_read_symbol(r, ec_ctx->uv_mode_cdf[cfl_allowed][y_mode],
+                      UV_INTRA_MODES - !cfl_allowed, ACCT_STR);
   return uv_mode;
 }
 
-#if CONFIG_CFL
 static int read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r,
                            int *signs_out) {
   const int joint_sign =
@@ -172,400 +163,145 @@ static int read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r,
   *signs_out = joint_sign;
   return idx;
 }
-#endif
 
-#if CONFIG_INTERINTRA
-static INTERINTRA_MODE read_interintra_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                            aom_reader *r, int size_group) {
-  (void)cm;
+static INTERINTRA_MODE read_interintra_mode(MACROBLOCKD *xd, aom_reader *r,
+                                            int size_group) {
   const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_symbol(
       r, xd->tile_ctx->interintra_mode_cdf[size_group], INTERINTRA_MODES,
       ACCT_STR);
-  FRAME_COUNTS *counts = xd->counts;
-  if (counts) ++counts->interintra_mode[size_group][ii_mode];
   return ii_mode;
 }
-#endif  // CONFIG_INTERINTRA
 
-static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd,
-                                       aom_reader *r, int16_t ctx) {
-  FRAME_COUNTS *counts = xd->counts;
+static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, aom_reader *r,
+                                       int16_t ctx) {
   int16_t mode_ctx = ctx & NEWMV_CTX_MASK;
   int is_newmv, is_zeromv, is_refmv;
-#if CONFIG_NEW_MULTISYMBOL
   is_newmv = aom_read_symbol(r, ec_ctx->newmv_cdf[mode_ctx], 2, ACCT_STR) == 0;
-#else
-  is_newmv = aom_read(r, ec_ctx->newmv_prob[mode_ctx], ACCT_STR) == 0;
-#endif
-
-  if (is_newmv) {
-    if (counts) ++counts->newmv_mode[mode_ctx][0];
-    return NEWMV;
-  }
-  if (counts) ++counts->newmv_mode[mode_ctx][1];
-
-  if (ctx & (1 << ALL_ZERO_FLAG_OFFSET)) return ZEROMV;
+  if (is_newmv) return NEWMV;
 
-  mode_ctx = (ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
-
-#if CONFIG_NEW_MULTISYMBOL
+  mode_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
   is_zeromv =
       aom_read_symbol(r, ec_ctx->zeromv_cdf[mode_ctx], 2, ACCT_STR) == 0;
-#else
-  is_zeromv = aom_read(r, ec_ctx->zeromv_prob[mode_ctx], ACCT_STR) == 0;
-#endif
-  if (is_zeromv) {
-    if (counts) ++counts->zeromv_mode[mode_ctx][0];
-    return ZEROMV;
-  }
-  if (counts) ++counts->zeromv_mode[mode_ctx][1];
+  if (is_zeromv) return GLOBALMV;
 
   mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
-
-  if (ctx & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
-  if (ctx & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
-  if (ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
-
-#if CONFIG_NEW_MULTISYMBOL
   is_refmv = aom_read_symbol(r, ec_ctx->refmv_cdf[mode_ctx], 2, ACCT_STR) == 0;
-#else
-  is_refmv = aom_read(r, ec_ctx->refmv_prob[mode_ctx], ACCT_STR) == 0;
-#endif
-
-  if (is_refmv) {
-    if (counts) ++counts->refmv_mode[mode_ctx][0];
-
+  if (is_refmv)
     return NEARESTMV;
-  } else {
-    if (counts) ++counts->refmv_mode[mode_ctx][1];
+  else
     return NEARMV;
-  }
-
-  // Invalid prediction mode.
-  assert(0);
 }
 
 static void read_drl_idx(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd,
                          MB_MODE_INFO *mbmi, aom_reader *r) {
   uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
   mbmi->ref_mv_idx = 0;
-
-  if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV
-#if CONFIG_COMPOUND_SINGLEREF
-      || mbmi->mode == SR_NEW_NEWMV
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      ) {
-    int idx;
-    for (idx = 0; idx < 2; ++idx) {
+  if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+    for (int idx = 0; idx < 2; ++idx) {
       if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
-#if CONFIG_NEW_MULTISYMBOL
         int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
-#else
-        int drl_idx = aom_read(r, ec_ctx->drl_prob[drl_ctx], ACCT_STR);
-#endif
         mbmi->ref_mv_idx = idx + drl_idx;
-        if (xd->counts) ++xd->counts->drl_mode[drl_ctx][drl_idx];
         if (!drl_idx) return;
       }
     }
   }
-
   if (have_nearmv_in_inter_mode(mbmi->mode)) {
-    int idx;
     // Offset the NEARESTMV mode.
     // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV
     // mode is factored in.
-    for (idx = 1; idx < 3; ++idx) {
+    for (int idx = 1; idx < 3; ++idx) {
       if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
-#if CONFIG_NEW_MULTISYMBOL
         int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
-#else
-        int drl_idx = aom_read(r, ec_ctx->drl_prob[drl_ctx], ACCT_STR);
-#endif
         mbmi->ref_mv_idx = idx + drl_idx - 1;
-        if (xd->counts) ++xd->counts->drl_mode[drl_ctx][drl_idx];
         if (!drl_idx) return;
       }
     }
   }
 }
 
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    MODE_INFO *mi, aom_reader *r) {
-  MB_MODE_INFO *mbmi = &mi->mbmi;
-#if !CONFIG_MOTION_VAR || !CONFIG_WARPED_MOTION || CONFIG_NEW_MULTISYMBOL || \
-    CONFIG_NCOBMC_ADAPT_WEIGHT
-  (void)cm;
-#endif
+                                    MB_MODE_INFO *mbmi, aom_reader *r) {
+  if (cm->switchable_motion_mode == 0) return SIMPLE_TRANSLATION;
+  if (mbmi->skip_mode) return SIMPLE_TRANSLATION;
 
-  const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
-#if CONFIG_GLOBAL_MOTION
-      0, xd->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-      xd,
-#endif
-      mi);
+  const MOTION_MODE last_motion_mode_allowed =
+      motion_mode_allowed(xd->global_motion, xd, mbmi, cm->allow_warped_motion);
   int motion_mode;
-  FRAME_COUNTS *counts = xd->counts;
 
   if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION;
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  if (last_motion_mode_allowed == NCOBMC_ADAPT_WEIGHT) {
-    motion_mode = aom_read_symbol(r, xd->tile_ctx->ncobmc_cdf[mbmi->sb_type],
-                                  OBMC_FAMILY_MODES, ACCT_STR);
-    if (counts) ++counts->ncobmc[mbmi->sb_type][motion_mode];
-    return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
-  } else if (last_motion_mode_allowed == OBMC_CAUSAL) {
-    motion_mode =
-        aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2, ACCT_STR);
-    if (counts) ++counts->obmc[mbmi->sb_type][motion_mode];
-    return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
-  } else {
-#else
+
   if (last_motion_mode_allowed == OBMC_CAUSAL) {
-#if CONFIG_NEW_MULTISYMBOL
     motion_mode =
         aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2, ACCT_STR);
-#else
-    motion_mode = aom_read(r, cm->fc->obmc_prob[mbmi->sb_type], ACCT_STR);
-#endif
-    if (counts) ++counts->obmc[mbmi->sb_type][motion_mode];
     return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
   } else {
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
     motion_mode =
         aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
                         MOTION_MODES, ACCT_STR);
-    if (counts) ++counts->motion_mode[mbmi->sb_type][motion_mode];
     return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
   }
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
 }
 
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-static void read_ncobmc_mode(MACROBLOCKD *xd, MODE_INFO *mi,
-                             NCOBMC_MODE ncobmc_mode[2], aom_reader *r) {
-  MB_MODE_INFO *mbmi = &mi->mbmi;
-  FRAME_COUNTS *counts = xd->counts;
-  ADAPT_OVERLAP_BLOCK ao_block = adapt_overlap_block_lookup[mbmi->sb_type];
-  if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT) return;
-
-  ncobmc_mode[0] = aom_read_symbol(r, xd->tile_ctx->ncobmc_mode_cdf[ao_block],
-                                   MAX_NCOBMC_MODES, ACCT_STR);
-  if (counts) ++counts->ncobmc_mode[ao_block][ncobmc_mode[0]];
-
-  if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) {
-    ncobmc_mode[1] = aom_read_symbol(r, xd->tile_ctx->ncobmc_mode_cdf[ao_block],
-                                     MAX_NCOBMC_MODES, ACCT_STR);
-    if (counts) ++counts->ncobmc_mode[ao_block][ncobmc_mode[1]];
-  }
-}
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
-static PREDICTION_MODE read_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                                aom_reader *r, int16_t ctx) {
-  (void)cm;
+static PREDICTION_MODE read_inter_compound_mode(MACROBLOCKD *xd, aom_reader *r,
+                                                int16_t ctx) {
   const int mode =
       aom_read_symbol(r, xd->tile_ctx->inter_compound_mode_cdf[ctx],
                       INTER_COMPOUND_MODES, ACCT_STR);
-  FRAME_COUNTS *counts = xd->counts;
-
-  if (counts) ++counts->inter_compound_mode[ctx][mode];
-
   assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode));
   return NEAREST_NEARESTMV + mode;
 }
 
-#if CONFIG_COMPOUND_SINGLEREF
-static PREDICTION_MODE read_inter_singleref_comp_mode(MACROBLOCKD *xd,
-                                                      aom_reader *r,
-                                                      int16_t ctx) {
-  const int mode =
-      aom_read_symbol(r, xd->tile_ctx->inter_singleref_comp_mode_cdf[ctx],
-                      INTER_SINGLEREF_COMP_MODES, ACCT_STR);
-  FRAME_COUNTS *counts = xd->counts;
-
-  if (counts) ++counts->inter_singleref_comp_mode[ctx][mode];
-
-  assert(is_inter_singleref_comp_mode(SR_NEAREST_NEARMV + mode));
-  return SR_NEAREST_NEARMV + mode;
-}
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-static int read_segment_id(aom_reader *r, struct segmentation_probs *segp) {
-  return aom_read_symbol(r, segp->tree_cdf, MAX_SEGMENTS, ACCT_STR);
-}
-
-#if CONFIG_VAR_TX
-static void read_tx_size_vartx(AV1_COMMON *cm, MACROBLOCKD *xd,
-                               MB_MODE_INFO *mbmi, FRAME_COUNTS *counts,
-                               TX_SIZE tx_size, int depth, int blk_row,
-                               int blk_col, aom_reader *r) {
-#if CONFIG_NEW_MULTISYMBOL
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  (void)cm;
-#endif
-  int is_split = 0;
-  const int tx_row = blk_row >> 1;
-  const int tx_col = blk_col >> 1;
-  const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
-  const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
-  int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
-                                   xd->left_txfm_context + blk_row,
-                                   mbmi->sb_type, tx_size);
-  TX_SIZE(*const inter_tx_size)
-  [MAX_MIB_SIZE] =
-      (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
-  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-  assert(tx_size > TX_4X4);
-
-  if (depth == MAX_VARTX_DEPTH) {
-    int idx, idy;
-    inter_tx_size[0][0] = tx_size;
-    for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
-      for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
-        inter_tx_size[idy][idx] = tx_size;
-    mbmi->tx_size = tx_size;
-    mbmi->min_tx_size = AOMMIN(mbmi->min_tx_size, get_min_tx_size(tx_size));
-    txfm_partition_update(xd->above_txfm_context + blk_col,
-                          xd->left_txfm_context + blk_row, tx_size, tx_size);
-    return;
-  }
-
-#if CONFIG_NEW_MULTISYMBOL
-  is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR);
-#else
-  is_split = aom_read(r, cm->fc->txfm_partition_prob[ctx], ACCT_STR);
-#endif
-
-  if (is_split) {
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsl = tx_size_wide_unit[sub_txs];
-    int i;
-
-    if (counts) ++counts->txfm_partition[ctx][1];
-
-    if (sub_txs == TX_4X4) {
-      int idx, idy;
-      inter_tx_size[0][0] = sub_txs;
-      for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
-        for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
-          inter_tx_size[idy][idx] = inter_tx_size[0][0];
-      mbmi->tx_size = sub_txs;
-      mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
-      txfm_partition_update(xd->above_txfm_context + blk_col,
-                            xd->left_txfm_context + blk_row, sub_txs, tx_size);
-      return;
-    }
-
-    assert(bsl > 0);
-    for (i = 0; i < 4; ++i) {
-      int offsetr = blk_row + (i >> 1) * bsl;
-      int offsetc = blk_col + (i & 0x01) * bsl;
-      read_tx_size_vartx(cm, xd, mbmi, counts, sub_txs, depth + 1, offsetr,
-                         offsetc, r);
+int av1_neg_deinterleave(int diff, int ref, int max) {
+  if (!ref) return diff;
+  if (ref >= (max - 1)) return max - diff - 1;
+  if (2 * ref < max) {
+    if (diff <= 2 * ref) {
+      if (diff & 1)
+        return ref + ((diff + 1) >> 1);
+      else
+        return ref - (diff >> 1);
     }
+    return diff;
   } else {
-    int idx, idy;
-    inter_tx_size[0][0] = tx_size;
-    for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
-      for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
-        inter_tx_size[idy][idx] = tx_size;
-    mbmi->tx_size = tx_size;
-    mbmi->min_tx_size = AOMMIN(mbmi->min_tx_size, get_min_tx_size(tx_size));
-    if (counts) ++counts->txfm_partition[ctx][0];
-    txfm_partition_update(xd->above_txfm_context + blk_col,
-                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    if (diff <= 2 * (max - ref - 1)) {
+      if (diff & 1)
+        return ref + ((diff + 1) >> 1);
+      else
+        return ref - (diff >> 1);
+    }
+    return max - (diff + 1);
   }
 }
-#endif
 
-static TX_SIZE read_selected_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int32_t tx_size_cat, aom_reader *r) {
-  FRAME_COUNTS *counts = xd->counts;
-  const int ctx = get_tx_size_context(xd);
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  (void)cm;
-
-  const int depth = aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx],
-                                    tx_size_cat + 2, ACCT_STR);
-  const TX_SIZE tx_size = depth_to_tx_size(depth);
-#if CONFIG_RECT_TX
-  assert(!is_rect_tx(tx_size));
-#endif  // CONFIG_RECT_TX
-  if (counts) ++counts->tx_size[tx_size_cat][ctx][depth];
-  return tx_size;
-}
+static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+                           int mi_row, int mi_col, aom_reader *r, int skip) {
+  int cdf_num;
+  const int pred = av1_get_spatial_seg_pred(cm, xd, mi_row, mi_col, &cdf_num);
+  if (skip) return pred;
 
-static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter,
-                            int allow_select_inter, aom_reader *r) {
-  const TX_MODE tx_mode = cm->tx_mode;
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return TX_4X4;
-
-  if (block_signals_txsize(bsize)) {
-    if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) {
-      const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
-                                           : intra_tx_size_cat_lookup[bsize];
-      const TX_SIZE coded_tx_size =
-          read_selected_tx_size(cm, xd, tx_size_cat, r);
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-      if (coded_tx_size > max_txsize_lookup[bsize]) {
-        assert(coded_tx_size == max_txsize_lookup[bsize] + 1);
-#if CONFIG_RECT_TX_EXT
-        if (is_quarter_tx_allowed(xd, &xd->mi[0]->mbmi, is_inter)) {
-          int quarter_tx;
-
-          if (quarter_txsize_lookup[bsize] != max_txsize_lookup[bsize]) {
-#if CONFIG_NEW_MULTISYMBOL
-            quarter_tx =
-                aom_read_symbol(r, cm->fc->quarter_tx_size_cdf, 2, ACCT_STR);
-#else
-            quarter_tx = aom_read(r, cm->fc->quarter_tx_size_prob, ACCT_STR);
-            FRAME_COUNTS *counts = xd->counts;
-            if (counts) ++counts->quarter_tx_size[quarter_tx];
-#endif
-          } else {
-            quarter_tx = 1;
-          }
-          return quarter_tx ? quarter_txsize_lookup[bsize]
-                            : max_txsize_rect_lookup[bsize];
-        }
-#endif  // CONFIG_RECT_TX_EXT
-
-        return max_txsize_rect_lookup[bsize];
-      }
-#else
-      assert(coded_tx_size <= max_txsize_lookup[bsize]);
-#endif  // CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-      return coded_tx_size;
-    } else {
-      return tx_size_from_tx_mode(bsize, tx_mode, is_inter);
-    }
-  } else {
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-    assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4));
-    return max_txsize_rect_lookup[bsize];
-#else
-    return TX_4X4;
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  struct segmentation *const seg = &cm->seg;
+  struct segmentation_probs *const segp = &ec_ctx->seg;
+  aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
+  const int coded_id = aom_read_symbol(r, pred_cdf, MAX_SEGMENTS, ACCT_STR);
+  const int segment_id =
+      av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1);
+
+  if (segment_id < 0 || segment_id > seg->last_active_segid) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Corrupted segment_ids");
   }
+  return segment_id;
 }
 
 static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids,
                               int mi_offset, int x_mis, int y_mis) {
-  int x, y, segment_id = INT_MAX;
+  int segment_id = INT_MAX;
 
-  for (y = 0; y < y_mis; y++)
-    for (x = 0; x < x_mis; x++)
+  for (int y = 0; y < y_mis; y++)
+    for (int x = 0; x < x_mis; x++)
       segment_id =
           AOMMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
 
@@ -575,30 +311,28 @@ static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids,
 
 static void set_segment_id(AV1_COMMON *cm, int mi_offset, int x_mis, int y_mis,
                            int segment_id) {
-  int x, y;
-
   assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
 
-  for (y = 0; y < y_mis; y++)
-    for (x = 0; x < x_mis; x++)
+  for (int y = 0; y < y_mis; y++)
+    for (int x = 0; x < x_mis; x++)
       cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
 }
 
-static int read_intra_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int mi_offset, int x_mis, int y_mis,
-                                 aom_reader *r) {
+static int read_intra_segment_id(AV1_COMMON *const cm,
+                                 const MACROBLOCKD *const xd, int mi_row,
+                                 int mi_col, int bsize, aom_reader *r,
+                                 int skip) {
   struct segmentation *const seg = &cm->seg;
-  FRAME_COUNTS *counts = xd->counts;
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  struct segmentation_probs *const segp = &ec_ctx->seg;
-  int segment_id;
-
   if (!seg->enabled) return 0;  // Default for disabled segmentation
 
   assert(seg->update_map && !seg->temporal_update);
 
-  segment_id = read_segment_id(r, segp);
-  if (counts) ++counts->seg.tree_total[segment_id];
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = AOMMIN(cm->mi_rows - mi_row, bh);
+  const int segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, skip);
   set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
@@ -607,24 +341,25 @@ static void copy_segment_id(const AV1_COMMON *cm,
                             const uint8_t *last_segment_ids,
                             uint8_t *current_segment_ids, int mi_offset,
                             int x_mis, int y_mis) {
-  int x, y;
-
-  for (y = 0; y < y_mis; y++)
-    for (x = 0; x < x_mis; x++)
+  for (int y = 0; y < y_mis; y++)
+    for (int x = 0; x < x_mis; x++)
       current_segment_ids[mi_offset + y * cm->mi_cols + x] =
           last_segment_ids ? last_segment_ids[mi_offset + y * cm->mi_cols + x]
                            : 0;
 }
 
+static int get_predicted_segment_id(AV1_COMMON *const cm, int mi_offset,
+                                    int x_mis, int y_mis) {
+  return cm->last_frame_seg_map ? dec_get_segment_id(cm, cm->last_frame_seg_map,
+                                                     mi_offset, x_mis, y_mis)
+                                : 0;
+}
+
 static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col, aom_reader *r) {
+                                 int mi_row, int mi_col, int preskip,
+                                 aom_reader *r) {
   struct segmentation *const seg = &cm->seg;
-  FRAME_COUNTS *counts = xd->counts;
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  struct segmentation_probs *const segp = &ec_ctx->seg;
-
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  int predicted_segment_id, segment_id;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
   const int bw = mi_size_wide[mbmi->sb_type];
   const int bh = mi_size_high[mbmi->sb_type];
@@ -635,60 +370,82 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
 
   if (!seg->enabled) return 0;  // Default for disabled segmentation
 
-  predicted_segment_id = cm->last_frame_seg_map
-                             ? dec_get_segment_id(cm, cm->last_frame_seg_map,
-                                                  mi_offset, x_mis, y_mis)
-                             : 0;
-
   if (!seg->update_map) {
     copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
                     mi_offset, x_mis, y_mis);
-    return predicted_segment_id;
+    return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
+  }
+
+  int segment_id;
+  if (preskip) {
+    if (!seg->segid_preskip) return 0;
+  } else {
+    if (seg->segid_preskip) return mbmi->segment_id;
+    if (mbmi->skip) {
+      if (seg->temporal_update) {
+        mbmi->seg_id_predicted = 0;
+      }
+      segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 1);
+      set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+      return segment_id;
+    }
   }
 
   if (seg->temporal_update) {
     const int ctx = av1_get_pred_context_seg_id(xd);
-#if CONFIG_NEW_MULTISYMBOL
+    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+    struct segmentation_probs *const segp = &ec_ctx->seg;
     aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx];
     mbmi->seg_id_predicted = aom_read_symbol(r, pred_cdf, 2, ACCT_STR);
-#else
-    const aom_prob pred_prob = segp->pred_probs[ctx];
-    mbmi->seg_id_predicted = aom_read(r, pred_prob, ACCT_STR);
-#endif
-    if (counts) ++counts->seg.pred[ctx][mbmi->seg_id_predicted];
     if (mbmi->seg_id_predicted) {
-      segment_id = predicted_segment_id;
+      segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
     } else {
-      segment_id = read_segment_id(r, segp);
-      if (counts) ++counts->seg.tree_mispred[segment_id];
+      segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 0);
     }
   } else {
-    segment_id = read_segment_id(r, segp);
-    if (counts) ++counts->seg.tree_total[segment_id];
+    segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 0);
   }
   set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
 
+static int read_skip_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+                          aom_reader *r) {
+  if (!cm->skip_mode_flag) return 0;
+
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+    return 0;
+  }
+
+  if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return 0;
+
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
+      segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+    // These features imply single-reference mode, while skip mode implies
+    // compound reference. Hence, the two are mutually exclusive.
+    // In other words, skip_mode is implicitly 0 here.
+    return 0;
+  }
+
+  const int ctx = av1_get_skip_mode_context(xd);
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  const int skip_mode =
+      aom_read_symbol(r, ec_ctx->skip_mode_cdfs[ctx], 2, ACCT_STR);
+  return skip_mode;
+}
+
 static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
                      aom_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int ctx = av1_get_skip_context(xd);
-#if CONFIG_NEW_MULTISYMBOL
     FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
     const int skip = aom_read_symbol(r, ec_ctx->skip_cdfs[ctx], 2, ACCT_STR);
-#else
-    const int skip = aom_read(r, cm->fc->skip_probs[ctx], ACCT_STR);
-#endif
-    FRAME_COUNTS *counts = xd->counts;
-    if (counts) ++counts->skip[ctx][skip];
     return skip;
   }
 }
 
-#if CONFIG_PALETTE_DELTA_ENCODING
 // Merge the sorted list of cached colors(cached_colors[0...n_cached_colors-1])
 // and the sorted list of transmitted colors(colors[n_cached_colors...n-1]) into
 // one single sorted list(colors[...]).
@@ -796,346 +553,114 @@ static void read_palette_colors_uv(MACROBLOCKD *const xd, int bit_depth,
     }
   }
 }
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 
 static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                   aom_reader *r) {
-  MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const MODE_INFO *const above_mi = xd->above_mi;
-  const MODE_INFO *const left_mi = xd->left_mi;
+                                   int mi_row, int mi_col, aom_reader *r) {
+  const int num_planes = av1_num_planes(cm);
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const BLOCK_SIZE bsize = mbmi->sb_type;
+  assert(av1_allow_palette(cm->allow_screen_content_tools, bsize));
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-
-  assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_LARGEST);
-  const int block_palette_idx = bsize - BLOCK_8X8;
-  int modev;
+  const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
 
   if (mbmi->mode == DC_PRED) {
-    int palette_y_mode_ctx = 0;
-    if (above_mi) {
-      palette_y_mode_ctx +=
-          (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-    }
-    if (left_mi) {
-      palette_y_mode_ctx +=
-          (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-    }
-#if CONFIG_NEW_MULTISYMBOL
-    modev = aom_read_symbol(
-        r,
-        xd->tile_ctx->palette_y_mode_cdf[block_palette_idx][palette_y_mode_ctx],
-        2, ACCT_STR);
-#else
-    modev = aom_read(
-        r,
-        av1_default_palette_y_mode_prob[block_palette_idx][palette_y_mode_ctx],
+    const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
+    const int modev = aom_read_symbol(
+        r, xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_mode_ctx], 2,
         ACCT_STR);
-#endif
     if (modev) {
       pmi->palette_size[0] =
-          aom_read_symbol(r,
-                          xd->tile_ctx->palette_y_size_cdf[block_palette_idx],
+          aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
                           PALETTE_SIZES, ACCT_STR) +
           2;
-#if CONFIG_PALETTE_DELTA_ENCODING
       read_palette_colors_y(xd, cm->bit_depth, pmi, r);
-#else
-      for (int i = 0; i < pmi->palette_size[0]; ++i)
-        pmi->palette_colors[i] = aom_read_literal(r, cm->bit_depth, ACCT_STR);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
     }
   }
-  if (mbmi->uv_mode == UV_DC_PRED) {
+  if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED &&
+      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+                          xd->plane[1].subsampling_y)) {
     const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
-#if CONFIG_NEW_MULTISYMBOL
-    modev = aom_read_symbol(
+    const int modev = aom_read_symbol(
         r, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2, ACCT_STR);
-#else
-    modev = aom_read(r, av1_default_palette_uv_mode_prob[palette_uv_mode_ctx],
-                     ACCT_STR);
-#endif
     if (modev) {
       pmi->palette_size[1] =
-          aom_read_symbol(r,
-                          xd->tile_ctx->palette_uv_size_cdf[block_palette_idx],
+          aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
                           PALETTE_SIZES, ACCT_STR) +
           2;
-#if CONFIG_PALETTE_DELTA_ENCODING
       read_palette_colors_uv(xd, cm->bit_depth, pmi, r);
-#else
-      for (int i = 0; i < pmi->palette_size[1]; ++i) {
-        pmi->palette_colors[PALETTE_MAX_SIZE + i] =
-            aom_read_literal(r, cm->bit_depth, ACCT_STR);
-        pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
-            aom_read_literal(r, cm->bit_depth, ACCT_STR);
-      }
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
     }
   }
 }
 
-#if CONFIG_FILTER_INTRA
-static void read_filter_intra_mode_info(AV1_COMMON *const cm,
-                                        MACROBLOCKD *const xd, int mi_row,
-                                        int mi_col, aom_reader *r) {
-  MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  FRAME_COUNTS *counts = xd->counts;
+static int read_angle_delta(aom_reader *r, aom_cdf_prob *cdf) {
+  const int sym = aom_read_symbol(r, cdf, 2 * MAX_ANGLE_DELTA + 1, ACCT_STR);
+  return sym - MAX_ANGLE_DELTA;
+}
+
+static void read_filter_intra_mode_info(const AV1_COMMON *const cm,
+                                        MACROBLOCKD *const xd, aom_reader *r) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
       &mbmi->filter_intra_mode_info;
 
-  if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0) {
-    filter_intra_mode_info->use_filter_intra_mode[0] =
-        aom_read(r, cm->fc->filter_intra_probs[0], ACCT_STR);
-    if (filter_intra_mode_info->use_filter_intra_mode[0]) {
-      filter_intra_mode_info->filter_intra_mode[0] =
-          av1_read_uniform(r, FILTER_INTRA_MODES);
+  if (av1_filter_intra_allowed(cm, mbmi)) {
+    filter_intra_mode_info->use_filter_intra = aom_read_symbol(
+        r, xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2, ACCT_STR);
+    if (filter_intra_mode_info->use_filter_intra) {
+      filter_intra_mode_info->filter_intra_mode = aom_read_symbol(
+          r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR);
     }
-    if (counts) {
-      ++counts
-            ->filter_intra[0][filter_intra_mode_info->use_filter_intra_mode[0]];
-    }
-  }
-
-#if CONFIG_CB4X4
-  if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type,
-                           xd->plane[1].subsampling_x,
-                           xd->plane[1].subsampling_y))
-    return;
-#else
-  (void)mi_row;
-  (void)mi_col;
-#endif  // CONFIG_CB4X4
-
-  if (mbmi->uv_mode == UV_DC_PRED &&
-      mbmi->palette_mode_info.palette_size[1] == 0) {
-    filter_intra_mode_info->use_filter_intra_mode[1] =
-        aom_read(r, cm->fc->filter_intra_probs[1], ACCT_STR);
-    if (filter_intra_mode_info->use_filter_intra_mode[1]) {
-      filter_intra_mode_info->filter_intra_mode[1] =
-          av1_read_uniform(r, FILTER_INTRA_MODES);
-    }
-    if (counts) {
-      ++counts
-            ->filter_intra[1][filter_intra_mode_info->use_filter_intra_mode[1]];
-    }
-  }
-}
-#endif  // CONFIG_FILTER_INTRA
-
-#if CONFIG_EXT_INTRA
-static void read_intra_angle_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
-                                  aom_reader *r) {
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-#if CONFIG_INTRA_INTERP
-  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
-  const int ctx = av1_get_pred_context_intra_interp(xd);
-  int p_angle;
-#endif  // CONFIG_INTRA_INTERP
-
-  (void)cm;
-
-  mbmi->angle_delta[0] = 0;
-  mbmi->angle_delta[1] = 0;
-#if CONFIG_INTRA_INTERP
-  mbmi->intra_filter = INTRA_FILTER_LINEAR;
-#endif  // CONFIG_INTRA_INTERP
-
-  if (!av1_use_angle_delta(bsize)) return;
-
-  if (av1_is_directional_mode(mbmi->mode, bsize)) {
-    mbmi->angle_delta[0] =
-        av1_read_uniform(r, 2 * MAX_ANGLE_DELTA + 1) - MAX_ANGLE_DELTA;
-#if CONFIG_INTRA_INTERP
-    p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
-    if (av1_is_intra_filter_switchable(p_angle)) {
-      FRAME_COUNTS *counts = xd->counts;
-      mbmi->intra_filter = aom_read_symbol(r, ec_ctx->intra_filter_cdf[ctx],
-                                           INTRA_FILTERS, ACCT_STR);
-      if (counts) ++counts->intra_filter[ctx][mbmi->intra_filter];
-    }
-#endif  // CONFIG_INTRA_INTERP
-  }
-
-  if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize)) {
-    mbmi->angle_delta[1] =
-        av1_read_uniform(r, 2 * MAX_ANGLE_DELTA + 1) - MAX_ANGLE_DELTA;
+  } else {
+    filter_intra_mode_info->use_filter_intra = 0;
   }
 }
-#endif  // CONFIG_EXT_INTRA
 
-void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd,
-#if CONFIG_SUPERTX
-                      int supertx_enabled,
-#endif
-#if CONFIG_TXK_SEL
-                      int blk_row, int blk_col, int block, int plane,
-                      TX_SIZE tx_size,
-#endif
-                      aom_reader *r) {
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
+                      int blk_col, TX_SIZE tx_size, aom_reader *r) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
   const int inter_block = is_inter_block(mbmi);
-#if !CONFIG_TXK_SEL
-#if CONFIG_VAR_TX
-  const TX_SIZE tx_size = inter_block ? mbmi->min_tx_size : mbmi->tx_size;
-#else
-  const TX_SIZE tx_size = mbmi->tx_size;
-#endif
-#endif  // !CONFIG_TXK_SEL
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
-#if !CONFIG_TXK_SEL
-  TX_TYPE *tx_type = &mbmi->tx_type;
-#else
-  // only y plane's tx_type is transmitted
-  if (plane > 0) return;
-  (void)block;
-  TX_TYPE *tx_type = &mbmi->txk_type[(blk_row << 4) + blk_col];
-#endif
-#if CONFIG_LGT_FROM_PRED
-  mbmi->use_lgt = 0;
-#endif
-
-  if (!FIXED_TX_TYPE) {
-#if CONFIG_EXT_TX
-    const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
-    if (get_ext_tx_types(tx_size, mbmi->sb_type, inter_block,
-                         cm->reduced_tx_set_used) > 1 &&
-        ((!cm->seg.enabled && cm->base_qindex > 0) ||
-         (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
-        !mbmi->skip &&
-#if CONFIG_SUPERTX
-        !supertx_enabled &&
-#endif  // CONFIG_SUPERTX
-        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-      const TxSetType tx_set_type = get_ext_tx_set_type(
-          tx_size, mbmi->sb_type, inter_block, cm->reduced_tx_set_used);
-      const int eset = get_ext_tx_set(tx_size, mbmi->sb_type, inter_block,
-                                      cm->reduced_tx_set_used);
-      // eset == 0 should correspond to a set with only DCT_DCT and
-      // there is no need to read the tx_type
-      assert(eset != 0);
-
-#if !CONFIG_LGT_FROM_PRED
-      if (inter_block) {
-        *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
-            r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
-            av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
-      } else if (ALLOW_INTRA_EXT_TX) {
-        *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
-            r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
-            av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
-      }
-#else
-      // only signal tx_type when lgt is not allowed or not selected
-      if (inter_block) {
-        if (LGT_FROM_PRED_INTER) {
-          if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) {
-            mbmi->use_lgt =
-                aom_read(r, ec_ctx->inter_lgt_prob[square_tx_size], ACCT_STR);
-#if CONFIG_ENTROPY_STATS
-            if (counts) ++counts->inter_lgt[square_tx_size][mbmi->use_lgt];
-#endif  // CONFIG_ENTROPY_STATS
-          }
-          if (!mbmi->use_lgt) {
-            *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
-                r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
-                av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
-#if CONFIG_ENTROPY_STATS
-            if (counts) ++counts->inter_ext_tx[eset][square_tx_size][*tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-          } else {
-            *tx_type = DCT_DCT;  // assign a dummy tx_type
-          }
-        } else {
-          *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
-              r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
-              av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
-#if CONFIG_ENTROPY_STATS
-          if (counts) ++counts->inter_ext_tx[eset][square_tx_size][*tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-        }
-      } else if (ALLOW_INTRA_EXT_TX) {
-        if (LGT_FROM_PRED_INTRA) {
-          if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) {
-            mbmi->use_lgt =
-                aom_read(r, ec_ctx->intra_lgt_prob[square_tx_size][mbmi->mode],
-                         ACCT_STR);
-#if CONFIG_ENTROPY_STATS
-            if (counts)
-              ++counts->intra_lgt[square_tx_size][mbmi->mode][mbmi->use_lgt];
-#endif  // CONFIG_ENTROPY_STATS
-          }
-          if (!mbmi->use_lgt) {
-            *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
-                r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
-                av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
-#if CONFIG_ENTROPY_STATS
-            if (counts)
-              ++counts
-                    ->intra_ext_tx[eset][square_tx_size][mbmi->mode][*tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-          } else {
-            *tx_type = DCT_DCT;  // assign a dummy tx_type
-          }
-        } else {
-          *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
-              r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
-              av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
-#if CONFIG_ENTROPY_STATS
-          if (counts)
-            ++counts->intra_ext_tx[eset][square_tx_size][mbmi->mode][*tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-        }
-      }
-#endif  // CONFIG_LGT_FROM_PRED
+  const int txk_type_idx =
+      av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+  TX_TYPE *tx_type = &mbmi->txk_type[txk_type_idx];
+
+  const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+  if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1 &&
+      ((!cm->seg.enabled && cm->base_qindex > 0) ||
+       (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+      !mbmi->skip &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    const TxSetType tx_set_type =
+        av1_get_ext_tx_set_type(tx_size, inter_block, cm->reduced_tx_set_used);
+    const int eset =
+        get_ext_tx_set(tx_size, inter_block, cm->reduced_tx_set_used);
+    // eset == 0 should correspond to a set with only DCT_DCT and
+    // there is no need to read the tx_type
+    assert(eset != 0);
+
+    if (inter_block) {
+      *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
+          r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+          av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
     } else {
-      *tx_type = DCT_DCT;
-    }
-#else  // CONFIG_EXT_TX
-
-    if (tx_size < TX_32X32 &&
-        ((!cm->seg.enabled && cm->base_qindex > 0) ||
-         (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
-        !mbmi->skip &&
-#if CONFIG_SUPERTX
-        !supertx_enabled &&
-#endif  // CONFIG_SUPERTX
-        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-#if CONFIG_ENTROPY_STATS
-      FRAME_COUNTS *counts = xd->counts;
-#endif  // CONFIG_ENTROPY_STATS
-      if (inter_block) {
-        *tx_type = av1_ext_tx_inv[aom_read_symbol(
-            r, ec_ctx->inter_ext_tx_cdf[tx_size], TX_TYPES, ACCT_STR)];
-#if CONFIG_ENTROPY_STATS
-        if (counts) ++counts->inter_ext_tx[tx_size][*tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-      } else {
-        const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
-        *tx_type = av1_ext_tx_inv[aom_read_symbol(
-            r, ec_ctx->intra_ext_tx_cdf[tx_size][tx_type_nom], TX_TYPES,
-            ACCT_STR)];
-#if CONFIG_ENTROPY_STATS
-        if (counts) ++counts->intra_ext_tx[tx_size][tx_type_nom][*tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-      }
-    } else {
-      *tx_type = DCT_DCT;
+      PREDICTION_MODE intra_dir;
+      if (mbmi->filter_intra_mode_info.use_filter_intra)
+        intra_dir =
+            fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode];
+      else
+        intra_dir = mbmi->mode;
+      *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
+          r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir],
+          av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
     }
-#endif  // CONFIG_EXT_TX
+  } else {
+    *tx_type = DCT_DCT;
   }
-#if FIXED_TX_TYPE
-  assert(mbmi->tx_type == DCT_DCT);
-#endif
 }
 
-#if CONFIG_INTRABC
 static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
-                           nmv_context *ctx, nmv_context_counts *counts,
-                           MvSubpelPrecision precision);
+                           nmv_context *ctx, MvSubpelPrecision precision);
 
 static INLINE int is_mv_valid(const MV *mv);
 
@@ -1143,267 +668,195 @@ static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv,
                             const int_mv *ref_mv, int mi_row, int mi_col,
                             BLOCK_SIZE bsize, aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  (void)cm;
-  FRAME_COUNTS *counts = xd->counts;
-  nmv_context_counts *const dv_counts = counts ? &counts->dv : NULL;
-  read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, dv_counts,
-          MV_SUBPEL_NONE);
+  read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, MV_SUBPEL_NONE);
+  // DV should not have sub-pel.
+  assert((mv->as_mv.col & 7) == 0);
+  assert((mv->as_mv.row & 7) == 0);
+  mv->as_mv.col = (mv->as_mv.col >> 3) * 8;
+  mv->as_mv.row = (mv->as_mv.row >> 3) * 8;
   int valid = is_mv_valid(&mv->as_mv) &&
-              is_dv_valid(mv->as_mv, &xd->tile, mi_row, mi_col, bsize);
+              av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize,
+                              cm->seq_params.mib_size_log2);
   return valid;
 }
-#endif  // CONFIG_INTRABC
+
+static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                              int mi_row, int mi_col, aom_reader *r) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR);
+  if (mbmi->use_intrabc) {
+    BLOCK_SIZE bsize = mbmi->sb_type;
+    mbmi->mode = DC_PRED;
+    mbmi->uv_mode = UV_DC_PRED;
+    mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+    mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+    int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
+    int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES];
+    int_mv global_mvs[REF_FRAMES];
+
+    av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count,
+                     xd->ref_mv_stack, ref_mvs, global_mvs, mi_row, mi_col,
+                     inter_mode_ctx);
+
+    int_mv nearestmv, nearmv;
+
+    av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0);
+    int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
+    if (dv_ref.as_int == 0)
+      av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, mi_row,
+                      mi_col);
+    // Ref DV should not have sub-pel.
+    int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0;
+    dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8;
+    dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8;
+    valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, mi_row,
+                                     mi_col, bsize, r);
+    if (!valid_dv) {
+      // Intra bc motion vectors are not valid - signal corrupt frame
+      aom_merge_corrupted_flag(&xd->corrupted, 1);
+    }
+  }
+}
 
 static void read_intra_frame_mode_info(AV1_COMMON *const cm,
                                        MACROBLOCKD *const xd, int mi_row,
                                        int mi_col, aom_reader *r) {
-  MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const MODE_INFO *above_mi = xd->above_mi;
-  const MODE_INFO *left_mi = xd->left_mi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const MB_MODE_INFO *above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *left_mi = xd->left_mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
-  int i;
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = mi_size_wide[bsize];
-  const int bh = mi_size_high[bsize];
+  struct segmentation *const seg = &cm->seg;
 
-  // TODO(slavarnway): move x_mis, y_mis into xd ?????
-  const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw);
-  const int y_mis = AOMMIN(cm->mi_rows - mi_row, bh);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
-  mbmi->segment_id = read_intra_segment_id(cm, xd, mi_offset, x_mis, y_mis, r);
+  if (seg->segid_preskip)
+    mbmi->segment_id =
+        read_intra_segment_id(cm, xd, mi_row, mi_col, bsize, r, 0);
+
   mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
 
+  if (!seg->segid_preskip)
+    mbmi->segment_id =
+        read_intra_segment_id(cm, xd, mi_row, mi_col, bsize, r, mbmi->skip);
+
+  read_cdef(cm, r, xd, mi_col, mi_row);
+
   if (cm->delta_q_present_flag) {
-    xd->current_qindex =
-        xd->prev_qindex +
+    xd->current_qindex +=
         read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
     /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
     xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
-    xd->prev_qindex = xd->current_qindex;
-#if CONFIG_EXT_DELTA_Q
     if (cm->delta_lf_present_flag) {
-#if CONFIG_LOOPFILTER_LEVEL
       if (cm->delta_lf_multi) {
-        for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) {
-          mbmi->curr_delta_lf[lf_id] = xd->curr_delta_lf[lf_id] =
-              xd->prev_delta_lf[lf_id] +
+        const int frame_lf_count =
+            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+          const int tmp_lvl =
+              xd->delta_lf[lf_id] +
               read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) *
                   cm->delta_lf_res;
-          xd->prev_delta_lf[lf_id] = xd->curr_delta_lf[lf_id];
+          mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
+              clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
         }
       } else {
-        mbmi->current_delta_lf_from_base = xd->current_delta_lf_from_base =
-            xd->prev_delta_lf_from_base +
+        const int tmp_lvl =
+            xd->delta_lf_from_base +
             read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) *
                 cm->delta_lf_res;
-        xd->prev_delta_lf_from_base = xd->current_delta_lf_from_base;
+        mbmi->delta_lf_from_base = xd->delta_lf_from_base =
+            clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
       }
-#else
-      const int current_delta_lf_from_base =
-          xd->prev_delta_lf_from_base +
-          read_delta_lflevel(cm, xd, r, mbmi, mi_col, mi_row) *
-              cm->delta_lf_res;
-      mbmi->current_delta_lf_from_base = xd->current_delta_lf_from_base =
-          clamp(current_delta_lf_from_base, 0, MAX_LOOP_FILTER);
-      xd->prev_delta_lf_from_base = xd->current_delta_lf_from_base;
-#endif  // CONFIG_LOOPFILTER_LEVEL
     }
-#endif
   }
 
+  mbmi->current_qindex = xd->current_qindex;
+
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
 
-#if CONFIG_INTRABC
-  if (av1_allow_intrabc(bsize, cm)) {
-    mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR);
-    if (mbmi->use_intrabc) {
-      mbmi->tx_size = read_tx_size(cm, xd, 1, !mbmi->skip, r);
-      mbmi->mode = mbmi->uv_mode = UV_DC_PRED;
-      mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
-
-      int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
-      int_mv ref_mvs[MAX_MV_REF_CANDIDATES];
-
-      av1_find_mv_refs(cm, xd, mi, INTRA_FRAME, &xd->ref_mv_count[INTRA_FRAME],
-                       xd->ref_mv_stack[INTRA_FRAME], NULL, ref_mvs, mi_row,
-                       mi_col, NULL, NULL, inter_mode_ctx);
-
-      int_mv nearestmv, nearmv;
-      av1_find_best_ref_mvs(0, ref_mvs, &nearestmv, &nearmv);
-
-      int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
-      if (dv_ref.as_int == 0) av1_find_ref_dv(&dv_ref, mi_row, mi_col);
-
-      xd->corrupted |=
-          !assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, mi_row, mi_col, bsize, r);
-#if CONFIG_VAR_TX
-      // TODO(aconverse@google.com): Evaluate allowing VAR TX on intrabc blocks
-      const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
-      const int height = block_size_high[bsize] >> tx_size_high_log2[0];
-      int idx, idy;
-      for (idy = 0; idy < height; ++idy)
-        for (idx = 0; idx < width; ++idx)
-          mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size;
-      mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
-#endif  // CONFIG_VAR_TX
-#if CONFIG_EXT_TX && !CONFIG_TXK_SEL
-      av1_read_tx_type(cm, xd,
-#if CONFIG_SUPERTX
-                       0,
-#endif
-                       r);
-#endif  // CONFIG_EXT_TX && !CONFIG_TXK_SEL
-      return;
-    }
-  }
-#endif  // CONFIG_INTRABC
-
-  mbmi->tx_size = read_tx_size(cm, xd, 0, 1, r);
-
-#if CONFIG_CB4X4
-  (void)i;
-  mbmi->mode =
-      read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 0));
-#else
-  switch (bsize) {
-    case BLOCK_4X4:
-      for (i = 0; i < 4; ++i)
-        mi->bmi[i].as_mode = read_intra_mode(
-            r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, i));
-      mbmi->mode = mi->bmi[3].as_mode;
-      break;
-    case BLOCK_4X8:
-      mi->bmi[0].as_mode = mi->bmi[2].as_mode =
-          read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 0));
-      mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
-          read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 1));
-      break;
-    case BLOCK_8X4:
-      mi->bmi[0].as_mode = mi->bmi[1].as_mode =
-          read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 0));
-      mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
-          read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 2));
-      break;
-    default:
-      mbmi->mode =
-          read_intra_mode(r, get_y_mode_cdf(ec_ctx, mi, above_mi, left_mi, 0));
+  xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  if (av1_allow_intrabc(cm)) {
+    read_intrabc_info(cm, xd, mi_row, mi_col, r);
+    if (is_intrabc_block(mbmi)) return;
   }
-#endif
 
-#if CONFIG_CB4X4
-  if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                          xd->plane[1].subsampling_y)) {
-#if CONFIG_CFL
-    xd->cfl->is_chroma_reference = 1;
-#endif  // CONFIG_CFL
-#endif  // CONFIG_CB4X4
-    mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, mbmi->mode);
+  mbmi->mode = read_intra_mode(r, get_y_mode_cdf(ec_ctx, above_mi, left_mi));
 
-#if CONFIG_CFL
+  const int use_angle_delta = av1_use_angle_delta(bsize);
+  mbmi->angle_delta[PLANE_TYPE_Y] =
+      (use_angle_delta && av1_is_directional_mode(mbmi->mode))
+          ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
+          : 0;
+
+  if (!cm->seq_params.monochrome &&
+      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+                          xd->plane[1].subsampling_y)) {
+    xd->cfl.is_chroma_reference = 1;
+    mbmi->uv_mode =
+        read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
     if (mbmi->uv_mode == UV_CFL_PRED) {
       mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs);
-      xd->cfl->store_y = 1;
-    } else {
-      xd->cfl->store_y = 0;
     }
-#endif  // CONFIG_CFL
-
-#if CONFIG_CB4X4
+    mbmi->angle_delta[PLANE_TYPE_UV] =
+        (use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode)))
+            ? read_angle_delta(r,
+                               ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED])
+            : 0;
   } else {
     // Avoid decoding angle_info if there is is no chroma prediction
     mbmi->uv_mode = UV_DC_PRED;
-#if CONFIG_CFL
-    xd->cfl->is_chroma_reference = 0;
-    xd->cfl->store_y = 1;
-#endif
+    xd->cfl.is_chroma_reference = 0;
   }
-#endif
+  xd->cfl.store_y = store_cfl_required(cm, xd);
 
-#if CONFIG_EXT_INTRA
-  read_intra_angle_info(cm, xd, r);
-#endif  // CONFIG_EXT_INTRA
-  mbmi->palette_mode_info.palette_size[0] = 0;
-  mbmi->palette_mode_info.palette_size[1] = 0;
   if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
-    read_palette_mode_info(cm, xd, r);
-#if CONFIG_FILTER_INTRA
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
-  if (bsize >= BLOCK_8X8 || CONFIG_CB4X4)
-    read_filter_intra_mode_info(cm, xd, mi_row, mi_col, r);
-#endif  // CONFIG_FILTER_INTRA
-
-#if !CONFIG_TXK_SEL
-  av1_read_tx_type(cm, xd,
-#if CONFIG_SUPERTX
-                   0,
-#endif
-                   r);
-#endif  // !CONFIG_TXK_SEL
+    read_palette_mode_info(cm, xd, mi_row, mi_col, r);
+
+  read_filter_intra_mode_info(cm, xd, r);
 }
 
 static int read_mv_component(aom_reader *r, nmv_component *mvcomp,
-#if CONFIG_INTRABC || CONFIG_AMVR
-                             int use_subpel,
-#endif  // CONFIG_INTRABC || CONFIG_AMVR
-                             int usehp) {
+                             int use_subpel, int usehp) {
   int mag, d, fr, hp;
-#if CONFIG_NEW_MULTISYMBOL
-  const int sign = aom_read_bit(r, ACCT_STR);
-#else
-  const int sign = aom_read(r, mvcomp->sign, ACCT_STR);
-#endif
+  const int sign = aom_read_symbol(r, mvcomp->sign_cdf, 2, ACCT_STR);
   const int mv_class =
-      aom_read_symbol(r, mvcomp->class_cdf, MV_CLASSES, ACCT_STR);
+      aom_read_symbol(r, mvcomp->classes_cdf, MV_CLASSES, ACCT_STR);
   const int class0 = mv_class == MV_CLASS_0;
 
   // Integer part
   if (class0) {
-#if CONFIG_NEW_MULTISYMBOL
     d = aom_read_symbol(r, mvcomp->class0_cdf, CLASS0_SIZE, ACCT_STR);
-#else
-    d = aom_read(r, mvcomp->class0[0], ACCT_STR);
-#endif
     mag = 0;
   } else {
-    int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
     d = 0;
-#if CONFIG_NEW_MULTISYMBOL
-    for (i = 0; i < n; ++i)
-      d |= aom_read_symbol(r, mvcomp->bits_cdf[(i + 1) / 2], 2, ACCT_STR) << i;
-#else
-    for (i = 0; i < n; ++i) d |= aom_read(r, mvcomp->bits[i], ACCT_STR) << i;
-#endif
+    for (int i = 0; i < n; ++i)
+      d |= aom_read_symbol(r, mvcomp->bits_cdf[i], 2, ACCT_STR) << i;
     mag = CLASS0_SIZE << (mv_class + 2);
   }
 
-#if CONFIG_INTRABC || CONFIG_AMVR
   if (use_subpel) {
-#endif  // CONFIG_INTRABC || CONFIG_AMVR
-        // Fractional part
+    // Fractional part
     fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
                          MV_FP_SIZE, ACCT_STR);
 
-// High precision part (if hp is not used, the default value of the hp is 1)
-#if CONFIG_NEW_MULTISYMBOL
+    // High precision part (if hp is not used, the default value of the hp is 1)
     hp = usehp ? aom_read_symbol(
                      r, class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2,
                      ACCT_STR)
                : 1;
-#else
-  hp = usehp ? aom_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp, ACCT_STR)
-             : 1;
-#endif
-#if CONFIG_INTRABC || CONFIG_AMVR
   } else {
     fr = 3;
     hp = 1;
   }
-#endif  // CONFIG_INTRABC || CONFIG_AMVR
 
   // Result
   mag += ((d << 3) | (fr << 1) | hp) + 1;
@@ -1411,29 +864,19 @@ static int read_mv_component(aom_reader *r, nmv_component *mvcomp,
 }
 
 static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
-                           nmv_context *ctx, nmv_context_counts *counts,
-                           MvSubpelPrecision precision) {
-  MV_JOINT_TYPE joint_type;
-  MV diff = { 0, 0 };
-  joint_type =
-      (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joint_cdf, MV_JOINTS, ACCT_STR);
+                           nmv_context *ctx, MvSubpelPrecision precision) {
+  MV diff = kZeroMv;
+  const MV_JOINT_TYPE joint_type =
+      (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joints_cdf, MV_JOINTS, ACCT_STR);
 
   if (mv_joint_vertical(joint_type))
-    diff.row = read_mv_component(r, &ctx->comps[0],
-#if CONFIG_INTRABC || CONFIG_AMVR
-                                 precision > MV_SUBPEL_NONE,
-#endif  // CONFIG_INTRABC || CONFIG_AMVR
+    diff.row = read_mv_component(r, &ctx->comps[0], precision > MV_SUBPEL_NONE,
                                  precision > MV_SUBPEL_LOW_PRECISION);
 
   if (mv_joint_horizontal(joint_type))
-    diff.col = read_mv_component(r, &ctx->comps[1],
-#if CONFIG_INTRABC || CONFIG_AMVR
-                                 precision > MV_SUBPEL_NONE,
-#endif  // CONFIG_INTRABC || CONFIG_AMVR
+    diff.col = read_mv_component(r, &ctx->comps[1], precision > MV_SUBPEL_NONE,
                                  precision > MV_SUBPEL_LOW_PRECISION);
 
-  av1_inc_mv(&diff, counts, precision);
-
   mv->row = ref->row + diff.row;
   mv->col = ref->col + diff.col;
 }
@@ -1441,138 +884,68 @@ static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
 static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm,
                                                 const MACROBLOCKD *xd,
                                                 aom_reader *r) {
-  if (!is_comp_ref_allowed(xd->mi[0]->mbmi.sb_type)) return SINGLE_REFERENCE;
+  if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return SINGLE_REFERENCE;
   if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-    const int ctx = av1_get_reference_mode_context(cm, xd);
-#if CONFIG_NEW_MULTISYMBOL
+    const int ctx = av1_get_reference_mode_context(xd);
     const REFERENCE_MODE mode = (REFERENCE_MODE)aom_read_symbol(
         r, xd->tile_ctx->comp_inter_cdf[ctx], 2, ACCT_STR);
-#else
-    const REFERENCE_MODE mode =
-        (REFERENCE_MODE)aom_read(r, cm->fc->comp_inter_prob[ctx], ACCT_STR);
-#endif
-    FRAME_COUNTS *counts = xd->counts;
-    if (counts) ++counts->comp_inter[ctx][mode];
     return mode;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
   } else {
+    assert(cm->reference_mode == SINGLE_REFERENCE);
     return cm->reference_mode;
   }
 }
 
-#if CONFIG_NEW_MULTISYMBOL
 #define READ_REF_BIT(pname) \
-  aom_read_symbol(r, av1_get_pred_cdf_##pname(cm, xd), 2, ACCT_STR)
-#define READ_REF_BIT2(pname) \
   aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
-#else
-#define READ_REF_BIT(pname) \
-  aom_read(r, av1_get_pred_prob_##pname(cm, xd), ACCT_STR)
-#define READ_REF_BIT2(pname) \
-  aom_read(r, av1_get_pred_prob_##pname(cm, xd), ACCT_STR)
-#endif
 
-#if CONFIG_EXT_COMP_REFS
-static COMP_REFERENCE_TYPE read_comp_reference_type(AV1_COMMON *cm,
-                                                    const MACROBLOCKD *xd,
+static COMP_REFERENCE_TYPE read_comp_reference_type(const MACROBLOCKD *xd,
                                                     aom_reader *r) {
   const int ctx = av1_get_comp_reference_type_context(xd);
-#if USE_UNI_COMP_REFS
-  COMP_REFERENCE_TYPE comp_ref_type;
-#if CONFIG_VAR_REFS
-  if ((L_OR_L2(cm) || L3_OR_G(cm)) && BWD_OR_ALT(cm)) {
-    if (L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm) || BWD_AND_ALT(cm)) {
-#endif  // CONFIG_VAR_REFS
-#if CONFIG_NEW_MULTISYMBOL
-      (void)cm;
-      comp_ref_type = (COMP_REFERENCE_TYPE)aom_read_symbol(
+  const COMP_REFERENCE_TYPE comp_ref_type =
+      (COMP_REFERENCE_TYPE)aom_read_symbol(
           r, xd->tile_ctx->comp_ref_type_cdf[ctx], 2, ACCT_STR);
-#else
-  comp_ref_type = (COMP_REFERENCE_TYPE)aom_read(
-      r, cm->fc->comp_ref_type_prob[ctx], ACCT_STR);
-#endif
-#if CONFIG_VAR_REFS
-    } else {
-      comp_ref_type = BIDIR_COMP_REFERENCE;
-    }
-  } else {
-    comp_ref_type = UNIDIR_COMP_REFERENCE;
-  }
-#endif  // CONFIG_VAR_REFS
-#else   // !USE_UNI_COMP_REFS
-  // TODO(zoeliu): Temporarily turn off uni-directional comp refs
-  const COMP_REFERENCE_TYPE comp_ref_type = BIDIR_COMP_REFERENCE;
-#endif  // USE_UNI_COMP_REFS
-  FRAME_COUNTS *counts = xd->counts;
-  if (counts) ++counts->comp_ref_type[ctx][comp_ref_type];
   return comp_ref_type;  // UNIDIR_COMP_REFERENCE or BIDIR_COMP_REFERENCE
 }
-#endif  // CONFIG_EXT_COMP_REFS
+
+static void set_ref_frames_for_skip_mode(AV1_COMMON *const cm,
+                                         MV_REFERENCE_FRAME ref_frame[2]) {
+  ref_frame[0] = LAST_FRAME + cm->ref_frame_idx_0;
+  ref_frame[1] = LAST_FRAME + cm->ref_frame_idx_1;
+}
 
 // Read the referncence frame
 static void read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd,
                             aom_reader *r, int segment_id,
                             MV_REFERENCE_FRAME ref_frame[2]) {
-  FRAME_COUNTS *counts = xd->counts;
+  if (xd->mi[0]->skip_mode) {
+    set_ref_frames_for_skip_mode(cm, ref_frame);
+    return;
+  }
 
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
                                                    SEG_LVL_REF_FRAME);
     ref_frame[1] = NONE_FRAME;
+  } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
+             segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+    ref_frame[0] = LAST_FRAME;
+    ref_frame[1] = NONE_FRAME;
   } else {
     const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
-    // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
-    if (mode == COMPOUND_REFERENCE) {
-#if CONFIG_EXT_COMP_REFS
-      const COMP_REFERENCE_TYPE comp_ref_type =
-          read_comp_reference_type(cm, xd, r);
 
-#if !USE_UNI_COMP_REFS
-      // TODO(zoeliu): Temporarily turn off uni-directional comp refs
-      assert(comp_ref_type == BIDIR_COMP_REFERENCE);
-#endif  // !USE_UNI_COMP_REFS
+    if (mode == COMPOUND_REFERENCE) {
+      const COMP_REFERENCE_TYPE comp_ref_type = read_comp_reference_type(xd, r);
 
       if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
-        const int ctx = av1_get_pred_context_uni_comp_ref_p(xd);
-        int bit;
-#if CONFIG_VAR_REFS
-        if ((L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm)) && BWD_AND_ALT(cm))
-#endif  // CONFIG_VAR_REFS
-          bit = READ_REF_BIT2(uni_comp_ref_p);
-#if CONFIG_VAR_REFS
-        else
-          bit = BWD_AND_ALT(cm);
-#endif  // CONFIG_VAR_REFS
-        if (counts) ++counts->uni_comp_ref[ctx][0][bit];
-
+        const int bit = READ_REF_BIT(uni_comp_ref_p);
         if (bit) {
           ref_frame[0] = BWDREF_FRAME;
           ref_frame[1] = ALTREF_FRAME;
         } else {
-          const int ctx1 = av1_get_pred_context_uni_comp_ref_p1(xd);
-          int bit1;
-#if CONFIG_VAR_REFS
-          if (L_AND_L2(cm) && (L_AND_L3(cm) || L_AND_G(cm)))
-#endif  // CONFIG_VAR_REFS
-            bit1 = READ_REF_BIT2(uni_comp_ref_p1);
-#if CONFIG_VAR_REFS
-          else
-            bit1 = L_AND_L3(cm) || L_AND_G(cm);
-#endif  // CONFIG_VAR_REFS
-          if (counts) ++counts->uni_comp_ref[ctx1][1][bit1];
-
+          const int bit1 = READ_REF_BIT(uni_comp_ref_p1);
           if (bit1) {
-            const int ctx2 = av1_get_pred_context_uni_comp_ref_p2(xd);
-            int bit2;
-#if CONFIG_VAR_REFS
-            if (L_AND_L3(cm) && L_AND_G(cm))
-#endif  // CONFIG_VAR_REFS
-              bit2 = READ_REF_BIT2(uni_comp_ref_p2);
-#if CONFIG_VAR_REFS
-            else
-              bit2 = L_AND_G(cm);
-#endif  // CONFIG_VAR_REFS
-            if (counts) ++counts->uni_comp_ref[ctx2][2][bit2];
-
+            const int bit2 = READ_REF_BIT(uni_comp_ref_p2);
             if (bit2) {
               ref_frame[0] = LAST_FRAME;
               ref_frame[1] = GOLDEN_FRAME;
@@ -1590,202 +963,46 @@ static void read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd,
       }
 
       assert(comp_ref_type == BIDIR_COMP_REFERENCE);
-#endif  // CONFIG_EXT_COMP_REFS
 
-// Normative in decoder (for low delay)
-#if CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS
       const int idx = 1;
-#else  // !(CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS)
-#if CONFIG_EXT_REFS
-      const int idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
-#else   // !CONFIG_EXT_REFS
-      const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
-#endif  // CONFIG_EXT_REFS
-#endif  // CONFIG_ONE_SIDED_COMPOUND || CONFIG_FRAME_SIGN_BIAS)
-
-      const int ctx = av1_get_pred_context_comp_ref_p(cm, xd);
-#if CONFIG_VAR_REFS
-      int bit;
-      // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree
-      if (L_OR_L2(cm) && L3_OR_G(cm))
-        bit = READ_REF_BIT(comp_ref_p);
-      else
-        bit = L3_OR_G(cm);
-#else   // !CONFIG_VAR_REFS
       const int bit = READ_REF_BIT(comp_ref_p);
-#endif  // CONFIG_VAR_REFS
-      if (counts) ++counts->comp_ref[ctx][0][bit];
-
-#if CONFIG_EXT_REFS
       // Decode forward references.
       if (!bit) {
-        const int ctx1 = av1_get_pred_context_comp_ref_p1(cm, xd);
-#if CONFIG_VAR_REFS
-        int bit1;
-        // Test need to explicitly code (L) vs (L2) branch node in tree
-        if (L_AND_L2(cm))
-          bit1 = READ_REF_BIT(comp_ref_p1);
-        else
-          bit1 = LAST_IS_VALID(cm);
-#else   // !CONFIG_VAR_REFS
         const int bit1 = READ_REF_BIT(comp_ref_p1);
-#endif  // CONFIG_VAR_REFS
-        if (counts) ++counts->comp_ref[ctx1][1][bit1];
-        ref_frame[!idx] = cm->comp_fwd_ref[bit1 ? 0 : 1];
+        ref_frame[!idx] = cm->comp_fwd_ref[bit1 ? 1 : 0];
       } else {
-        const int ctx2 = av1_get_pred_context_comp_ref_p2(cm, xd);
-#if CONFIG_VAR_REFS
-        int bit2;
-        // Test need to explicitly code (L3) vs (G) branch node in tree
-        if (L3_AND_G(cm))
-          bit2 = READ_REF_BIT(comp_ref_p2);
-        else
-          bit2 = GOLDEN_IS_VALID(cm);
-#else   // !CONFIG_VAR_REFS
         const int bit2 = READ_REF_BIT(comp_ref_p2);
-#endif  // CONFIG_VAR_REFS
-        if (counts) ++counts->comp_ref[ctx2][2][bit2];
         ref_frame[!idx] = cm->comp_fwd_ref[bit2 ? 3 : 2];
       }
 
       // Decode backward references.
-      const int ctx_bwd = av1_get_pred_context_comp_bwdref_p(cm, xd);
-#if CONFIG_VAR_REFS
-      int bit_bwd;
-      // Test need to explicitly code (BWD/ALT2) vs (ALT) branch node in tree
-      const int bit_bwd_uncertain = BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm);
-      if (bit_bwd_uncertain)
-        bit_bwd = READ_REF_BIT(comp_bwdref_p);
-      else
-        bit_bwd = ALTREF_IS_VALID(cm);
-#else  // !CONFIG_VAR_REFS
       const int bit_bwd = READ_REF_BIT(comp_bwdref_p);
-#endif  // CONFIG_VAR_REFS
-      if (counts) ++counts->comp_bwdref[ctx_bwd][0][bit_bwd];
       if (!bit_bwd) {
-        const int ctx1_bwd = av1_get_pred_context_comp_bwdref_p1(cm, xd);
-#if CONFIG_VAR_REFS
-        int bit1_bwd;
-        if (BWD_AND_ALT2(cm))
-          bit1_bwd = READ_REF_BIT(comp_bwdref_p1);
-        else
-          bit1_bwd = ALTREF2_IS_VALID(cm);
-#else  // !CONFIG_VAR_REFS
         const int bit1_bwd = READ_REF_BIT(comp_bwdref_p1);
-#endif  // CONFIG_VAR_REFS
-        if (counts) ++counts->comp_bwdref[ctx1_bwd][1][bit1_bwd];
         ref_frame[idx] = cm->comp_bwd_ref[bit1_bwd];
       } else {
         ref_frame[idx] = cm->comp_bwd_ref[2];
       }
-#else   // !CONFIG_EXT_REFS
-      ref_frame[!idx] = cm->comp_var_ref[bit];
-      ref_frame[idx] = cm->comp_fixed_ref;
-#endif  // CONFIG_EXT_REFS
     } else if (mode == SINGLE_REFERENCE) {
-#if CONFIG_EXT_REFS
-      const int ctx0 = av1_get_pred_context_single_ref_p1(xd);
-#if CONFIG_VAR_REFS
-      int bit0;
-      // Test need to explicitly code (L,L2,L3,G) vs (BWD,ALT2,ALT) branch node
-      // in tree
-      if ((L_OR_L2(cm) || L3_OR_G(cm)) &&
-          (BWD_OR_ALT2(cm) || ALTREF_IS_VALID(cm)))
-        bit0 = READ_REF_BIT(single_ref_p1);
-      else
-        bit0 = (BWD_OR_ALT2(cm) || ALTREF_IS_VALID(cm));
-#else   // !CONFIG_VAR_REFS
       const int bit0 = READ_REF_BIT(single_ref_p1);
-#endif  // CONFIG_VAR_REFS
-      if (counts) ++counts->single_ref[ctx0][0][bit0];
-
       if (bit0) {
-        const int ctx1 = av1_get_pred_context_single_ref_p2(xd);
-#if CONFIG_VAR_REFS
-        int bit1;
-        // Test need to explicitly code (BWD/ALT2) vs (ALT) branch node in tree
-        const int bit1_uncertain = BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm);
-        if (bit1_uncertain)
-          bit1 = READ_REF_BIT(single_ref_p2);
-        else
-          bit1 = ALTREF_IS_VALID(cm);
-#else  // !CONFIG_VAR_REFS
         const int bit1 = READ_REF_BIT(single_ref_p2);
-#endif  // CONFIG_VAR_REFS
-        if (counts) ++counts->single_ref[ctx1][1][bit1];
         if (!bit1) {
-          const int ctx5 = av1_get_pred_context_single_ref_p6(xd);
-#if CONFIG_VAR_REFS
-          int bit5;
-          if (BWD_AND_ALT2(cm))
-            bit5 = READ_REF_BIT(single_ref_p6);
-          else
-            bit5 = ALTREF2_IS_VALID(cm);
-#else  // !CONFIG_VAR_REFS
           const int bit5 = READ_REF_BIT(single_ref_p6);
-#endif  // CONFIG_VAR_REFS
-          if (counts) ++counts->single_ref[ctx5][5][bit5];
           ref_frame[0] = bit5 ? ALTREF2_FRAME : BWDREF_FRAME;
         } else {
           ref_frame[0] = ALTREF_FRAME;
         }
       } else {
-        const int ctx2 = av1_get_pred_context_single_ref_p3(xd);
-#if CONFIG_VAR_REFS
-        int bit2;
-        // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree
-        if (L_OR_L2(cm) && L3_OR_G(cm))
-          bit2 = READ_REF_BIT(single_ref_p3);
-        else
-          bit2 = L3_OR_G(cm);
-#else  // !CONFIG_VAR_REFS
         const int bit2 = READ_REF_BIT(single_ref_p3);
-#endif  // CONFIG_VAR_REFS
-        if (counts) ++counts->single_ref[ctx2][2][bit2];
         if (bit2) {
-          const int ctx4 = av1_get_pred_context_single_ref_p5(xd);
-#if CONFIG_VAR_REFS
-          int bit4;
-          // Test need to explicitly code (L3) vs (G) branch node in tree
-          if (L3_AND_G(cm))
-            bit4 = READ_REF_BIT(single_ref_p5);
-          else
-            bit4 = GOLDEN_IS_VALID(cm);
-#else  // !CONFIG_VAR_REFS
           const int bit4 = READ_REF_BIT(single_ref_p5);
-#endif  // CONFIG_VAR_REFS
-          if (counts) ++counts->single_ref[ctx4][4][bit4];
           ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME;
         } else {
-          const int ctx3 = av1_get_pred_context_single_ref_p4(xd);
-#if CONFIG_VAR_REFS
-          int bit3;
-          // Test need to explicitly code (L) vs (L2) branch node in tree
-          if (L_AND_L2(cm))
-            bit3 = READ_REF_BIT(single_ref_p4);
-          else
-            bit3 = LAST2_IS_VALID(cm);
-#else  // !CONFIG_VAR_REFS
           const int bit3 = READ_REF_BIT(single_ref_p4);
-#endif  // CONFIG_VAR_REFS
-          if (counts) ++counts->single_ref[ctx3][3][bit3];
           ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
         }
       }
-#else   // !CONFIG_EXT_REFS
-      const int ctx0 = av1_get_pred_context_single_ref_p1(xd);
-      const int bit0 = READ_REF_BIT(single_ref_p1);
-      if (counts) ++counts->single_ref[ctx0][0][bit0];
-
-      if (bit0) {
-        const int ctx1 = av1_get_pred_context_single_ref_p2(xd);
-        const int bit1 = READ_REF_BIT(single_ref_p2);
-        if (counts) ++counts->single_ref[ctx1][1][bit1];
-        ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
-      } else {
-        ref_frame[0] = LAST_FRAME;
-      }
-#endif  // CONFIG_EXT_REFS
 
       ref_frame[1] = NONE_FRAME;
     } else {
@@ -1798,7 +1015,6 @@ static INLINE void read_mb_interp_filter(AV1_COMMON *const cm,
                                          MACROBLOCKD *const xd,
                                          MB_MODE_INFO *const mbmi,
                                          aom_reader *r) {
-  FRAME_COUNTS *counts = xd->counts;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
   if (!av1_is_interp_needed(xd)) {
@@ -1809,120 +1025,68 @@ static INLINE void read_mb_interp_filter(AV1_COMMON *const cm,
   if (cm->interp_filter != SWITCHABLE) {
     mbmi->interp_filters = av1_broadcast_interp_filter(cm->interp_filter);
   } else {
-#if CONFIG_DUAL_FILTER
     InterpFilter ref0_filter[2] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
     for (int dir = 0; dir < 2; ++dir) {
-      if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
-          (mbmi->ref_frame[1] > INTRA_FRAME &&
-           has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
-        const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-        ref0_filter[dir] =
-            (InterpFilter)aom_read_symbol(r, ec_ctx->switchable_interp_cdf[ctx],
-                                          SWITCHABLE_FILTERS, ACCT_STR);
-        if (counts) ++counts->switchable_interp[ctx][ref0_filter[dir]];
+      const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+      ref0_filter[dir] = (InterpFilter)aom_read_symbol(
+          r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR);
+      if (cm->seq_params.enable_dual_filter == 0) {
+        ref0_filter[1] = ref0_filter[0];
+        break;
       }
     }
     // The index system works as: (0, 1) -> (vertical, horizontal) filter types
     mbmi->interp_filters =
         av1_make_interp_filters(ref0_filter[0], ref0_filter[1]);
-#else   // CONFIG_DUAL_FILTER
-    const int ctx = av1_get_pred_context_switchable_interp(xd);
-    InterpFilter filter = (InterpFilter)aom_read_symbol(
-        r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR);
-    mbmi->interp_filters = av1_broadcast_interp_filter(filter);
-    if (counts) ++counts->switchable_interp[ctx][filter];
-#endif  // CONFIG_DUAL_FILTER
   }
 }
 
 static void read_intra_block_mode_info(AV1_COMMON *const cm, const int mi_row,
                                        const int mi_col, MACROBLOCKD *const xd,
-                                       MODE_INFO *mi, aom_reader *r) {
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const BLOCK_SIZE bsize = mi->mbmi.sb_type;
-  int i;
+                                       MB_MODE_INFO *const mbmi,
+                                       aom_reader *r) {
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int use_angle_delta = av1_use_angle_delta(bsize);
 
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
 
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
-#if CONFIG_CB4X4
-  (void)i;
   mbmi->mode = read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]);
-#else
-  switch (bsize) {
-    case BLOCK_4X4:
-      for (i = 0; i < 4; ++i)
-        mi->bmi[i].as_mode = read_intra_mode(r, ec_ctx->y_mode_cdf[0]);
-      mbmi->mode = mi->bmi[3].as_mode;
-      break;
-    case BLOCK_4X8:
-      mi->bmi[0].as_mode = mi->bmi[2].as_mode =
-          read_intra_mode(r, ec_ctx->y_mode_cdf[0]);
-      mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
-          read_intra_mode(r, ec_ctx->y_mode_cdf[0]);
-      break;
-    case BLOCK_8X4:
-      mi->bmi[0].as_mode = mi->bmi[1].as_mode =
-          read_intra_mode(r, ec_ctx->y_mode_cdf[0]);
-      mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
-          read_intra_mode(r, ec_ctx->y_mode_cdf[0]);
-      break;
-    default:
-      mbmi->mode =
-          read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]);
-  }
-#endif
 
-#if CONFIG_CB4X4
-  if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                          xd->plane[1].subsampling_y)) {
-    mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, mbmi->mode);
-#else
-  mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, mbmi->mode);
-  (void)mi_row;
-  (void)mi_col;
-#endif
-
-#if CONFIG_CFL
+  mbmi->angle_delta[PLANE_TYPE_Y] =
+      use_angle_delta && av1_is_directional_mode(mbmi->mode)
+          ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
+          : 0;
+  const int has_chroma =
+      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+                          xd->plane[1].subsampling_y);
+  xd->cfl.is_chroma_reference = has_chroma;
+  if (!cm->seq_params.monochrome && has_chroma) {
+    mbmi->uv_mode =
+        read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
     if (mbmi->uv_mode == UV_CFL_PRED) {
       mbmi->cfl_alpha_idx =
           read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs);
-      xd->cfl->store_y = 1;
-    } else {
-      xd->cfl->store_y = 0;
     }
-#endif  // CONFIG_CFL
-
-#if CONFIG_CB4X4
+    mbmi->angle_delta[PLANE_TYPE_UV] =
+        use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode))
+            ? read_angle_delta(r,
+                               ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED])
+            : 0;
   } else {
     // Avoid decoding angle_info if there is is no chroma prediction
     mbmi->uv_mode = UV_DC_PRED;
-#if CONFIG_CFL
-    xd->cfl->is_chroma_reference = 0;
-    xd->cfl->store_y = 1;
-#endif
   }
-#endif
-
-  // Explicitly ignore cm here to avoid a compile warning if none of
-  // ext-intra, palette and filter-intra are enabled.
-  (void)cm;
+  xd->cfl.store_y = store_cfl_required(cm, xd);
 
-#if CONFIG_EXT_INTRA
-  read_intra_angle_info(cm, xd, r);
-#endif  // CONFIG_EXT_INTRA
   mbmi->palette_mode_info.palette_size[0] = 0;
   mbmi->palette_mode_info.palette_size[1] = 0;
   if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
-    read_palette_mode_info(cm, xd, r);
-#if CONFIG_FILTER_INTRA
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
-  if (bsize >= BLOCK_8X8 || CONFIG_CB4X4)
-    read_filter_intra_mode_info(cm, xd, mi_row, mi_col, r);
-#endif  // CONFIG_FILTER_INTRA
+    read_palette_mode_info(cm, xd, mi_row, mi_col, r);
+
+  read_filter_intra_mode_info(cm, xd, r);
 }
 
 static INLINE int is_mv_valid(const MV *mv) {
@@ -1932,188 +1096,43 @@ static INLINE int is_mv_valid(const MV *mv) {
 
 static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd,
                             PREDICTION_MODE mode,
-                            MV_REFERENCE_FRAME ref_frame[2], int block,
-                            int_mv mv[2], int_mv ref_mv[2],
-                            int_mv nearest_mv[2], int_mv near_mv[2], int mi_row,
-                            int mi_col, int is_compound, int allow_hp,
-                            aom_reader *r) {
-  int i;
-  int ret = 1;
+                            MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2],
+                            int_mv ref_mv[2], int_mv nearest_mv[2],
+                            int_mv near_mv[2], int mi_row, int mi_col,
+                            int is_compound, int allow_hp, aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_CB4X4
-  int_mv *pred_mv = mbmi->pred_mv;
-  (void)block;
-#else
-  int_mv *pred_mv =
-      (bsize >= BLOCK_8X8) ? mbmi->pred_mv : xd->mi[0]->bmi[block].pred_mv;
-#endif  // CONFIG_CB4X4
-  (void)ref_frame;
-  (void)cm;
-  (void)mi_row;
-  (void)mi_col;
-  (void)bsize;
-#if CONFIG_AMVR
-  if (cm->cur_frame_mv_precision_level) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  BLOCK_SIZE bsize = mbmi->sb_type;
+  if (cm->cur_frame_force_integer_mv) {
     allow_hp = MV_SUBPEL_NONE;
   }
-#endif
   switch (mode) {
     case NEWMV: {
-      FRAME_COUNTS *counts = xd->counts;
-      for (i = 0; i < 1 + is_compound; ++i) {
-        int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-        int nmv_ctx =
-            av1_nmv_ctx(xd->ref_mv_count[rf_type], xd->ref_mv_stack[rf_type], i,
-                        mbmi->ref_mv_idx);
-        nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
-        nmv_context_counts *const mv_counts =
-            counts ? &counts->mv[nmv_ctx] : NULL;
-        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, mv_counts, allow_hp);
-        ret = ret && is_mv_valid(&mv[i].as_mv);
-
-        pred_mv[i].as_int = ref_mv[i].as_int;
-      }
+      nmv_context *const nmvc = &ec_ctx->nmvc;
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
       break;
     }
     case NEARESTMV: {
       mv[0].as_int = nearest_mv[0].as_int;
-      if (is_compound) mv[1].as_int = nearest_mv[1].as_int;
-
-      pred_mv[0].as_int = nearest_mv[0].as_int;
-      if (is_compound) pred_mv[1].as_int = nearest_mv[1].as_int;
       break;
     }
     case NEARMV: {
       mv[0].as_int = near_mv[0].as_int;
-      if (is_compound) mv[1].as_int = near_mv[1].as_int;
-
-      pred_mv[0].as_int = near_mv[0].as_int;
-      if (is_compound) pred_mv[1].as_int = near_mv[1].as_int;
-      break;
-    }
-    case ZEROMV: {
-#if CONFIG_GLOBAL_MOTION
-      mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
-                                          cm->allow_high_precision_mv, bsize,
-                                          mi_col, mi_row, block
-#if CONFIG_AMVR
-                                          ,
-                                          cm->cur_frame_mv_precision_level
-#endif
-                                          )
-                         .as_int;
-      if (is_compound)
-        mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]],
-                                            cm->allow_high_precision_mv, bsize,
-                                            mi_col, mi_row, block
-#if CONFIG_AMVR
-                                            ,
-                                            cm->cur_frame_mv_precision_level
-#endif
-                                            )
-                           .as_int;
-#else
-      mv[0].as_int = 0;
-      if (is_compound) mv[1].as_int = 0;
-#endif  // CONFIG_GLOBAL_MOTION
-
-      pred_mv[0].as_int = mv[0].as_int;
-      if (is_compound) pred_mv[1].as_int = mv[1].as_int;
-      break;
-    }
-#if CONFIG_COMPOUND_SINGLEREF
-    case SR_NEAREST_NEARMV: {
-      assert(!is_compound);
-      mv[0].as_int = nearest_mv[0].as_int;
-      mv[1].as_int = near_mv[0].as_int;
-      break;
-    }
-    /*
-    case SR_NEAREST_NEWMV: {
-      assert(!is_compound);
-      mv[0].as_int = nearest_mv[0].as_int;
-
-      FRAME_COUNTS *counts = xd->counts;
-      int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-      int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
-                                xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
-      nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
-      nmv_context_counts *const mv_counts =
-          counts ? &counts->mv[nmv_ctx] : NULL;
-      read_mv(r, &mv[1].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp);
-      ret = ret && is_mv_valid(&mv[1].as_mv);
-      break;
-    }*/
-    case SR_NEAR_NEWMV: {
-      assert(!is_compound);
-      mv[0].as_int = near_mv[0].as_int;
-
-      FRAME_COUNTS *counts = xd->counts;
-      int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-      int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
-                                xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
-      nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
-      nmv_context_counts *const mv_counts =
-          counts ? &counts->mv[nmv_ctx] : NULL;
-      read_mv(r, &mv[1].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp);
-      ret = ret && is_mv_valid(&mv[1].as_mv);
-      break;
-    }
-    case SR_ZERO_NEWMV: {
-      assert(!is_compound);
-#if CONFIG_GLOBAL_MOTION
-      mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
-                                          cm->allow_high_precision_mv, bsize,
-                                          mi_col, mi_row, block)
-                         .as_int;
-#else
-      mv[0].as_int = 0;
-#endif  // CONFIG_GLOBAL_MOTION
-
-      FRAME_COUNTS *counts = xd->counts;
-      int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-      int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
-                                xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
-      nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
-      nmv_context_counts *const mv_counts =
-          counts ? &counts->mv[nmv_ctx] : NULL;
-      read_mv(r, &mv[1].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp);
-      ret = ret && is_mv_valid(&mv[1].as_mv);
       break;
     }
-    case SR_NEW_NEWMV: {
-      assert(!is_compound);
-
-      FRAME_COUNTS *counts = xd->counts;
-      for (i = 0; i < 2; ++i) {
-        int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-        int nmv_ctx =
-            av1_nmv_ctx(xd->ref_mv_count[rf_type], xd->ref_mv_stack[rf_type], 0,
-                        mbmi->ref_mv_idx);
-        nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
-        nmv_context_counts *const mv_counts =
-            counts ? &counts->mv[nmv_ctx] : NULL;
-        read_mv(r, &mv[i].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp);
-        ret = ret && is_mv_valid(&mv[i].as_mv);
-      }
+    case GLOBALMV: {
+      mv[0].as_int =
+          gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
+                               cm->allow_high_precision_mv, bsize, mi_col,
+                               mi_row, cm->cur_frame_force_integer_mv)
+              .as_int;
       break;
     }
-#endif  // CONFIG_COMPOUND_SINGLEREF
     case NEW_NEWMV: {
-      FRAME_COUNTS *counts = xd->counts;
       assert(is_compound);
-      for (i = 0; i < 2; ++i) {
-        int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-        int nmv_ctx =
-            av1_nmv_ctx(xd->ref_mv_count[rf_type], xd->ref_mv_stack[rf_type], i,
-                        mbmi->ref_mv_idx);
-        nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
-        nmv_context_counts *const mv_counts =
-            counts ? &counts->mv[nmv_ctx] : NULL;
-        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, mv_counts, allow_hp);
-        ret = ret && is_mv_valid(&mv[i].as_mv);
+      for (int i = 0; i < 2; ++i) {
+        nmv_context *const nmvc = &ec_ctx->nmvc;
+        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, allow_hp);
       }
       break;
     }
@@ -2130,984 +1149,440 @@ static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd,
       break;
     }
     case NEW_NEARESTMV: {
-      FRAME_COUNTS *counts = xd->counts;
-      int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-      int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
-                                xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
-      nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
-      nmv_context_counts *const mv_counts =
-          counts ? &counts->mv[nmv_ctx] : NULL;
-      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp);
+      nmv_context *const nmvc = &ec_ctx->nmvc;
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
       assert(is_compound);
-      ret = ret && is_mv_valid(&mv[0].as_mv);
       mv[1].as_int = nearest_mv[1].as_int;
       break;
     }
     case NEAREST_NEWMV: {
-      FRAME_COUNTS *counts = xd->counts;
-      int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-      int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
-                                xd->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
-      nmv_context_counts *const mv_counts =
-          counts ? &counts->mv[nmv_ctx] : NULL;
-      nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
+      nmv_context *const nmvc = &ec_ctx->nmvc;
       mv[0].as_int = nearest_mv[0].as_int;
-      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, mv_counts, allow_hp);
+      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp);
       assert(is_compound);
-      ret = ret && is_mv_valid(&mv[1].as_mv);
       break;
     }
     case NEAR_NEWMV: {
-      FRAME_COUNTS *counts = xd->counts;
-      int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-      int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
-                                xd->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
-      nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
-      nmv_context_counts *const mv_counts =
-          counts ? &counts->mv[nmv_ctx] : NULL;
+      nmv_context *const nmvc = &ec_ctx->nmvc;
       mv[0].as_int = near_mv[0].as_int;
-      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, mv_counts, allow_hp);
+      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp);
       assert(is_compound);
-
-      ret = ret && is_mv_valid(&mv[1].as_mv);
       break;
     }
     case NEW_NEARMV: {
-      FRAME_COUNTS *counts = xd->counts;
-      int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-      int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
-                                xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
-      nmv_context *const nmvc = &ec_ctx->nmvc[nmv_ctx];
-      nmv_context_counts *const mv_counts =
-          counts ? &counts->mv[nmv_ctx] : NULL;
-      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, mv_counts, allow_hp);
+      nmv_context *const nmvc = &ec_ctx->nmvc;
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
       assert(is_compound);
-      ret = ret && is_mv_valid(&mv[0].as_mv);
       mv[1].as_int = near_mv[1].as_int;
       break;
     }
-    case ZERO_ZEROMV: {
+    case GLOBAL_GLOBALMV: {
       assert(is_compound);
-#if CONFIG_GLOBAL_MOTION
-      mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
-                                          cm->allow_high_precision_mv, bsize,
-                                          mi_col, mi_row, block
-#if CONFIG_AMVR
-                                          ,
-                                          cm->cur_frame_mv_precision_level
-#endif
-                                          )
-                         .as_int;
-      mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]],
-                                          cm->allow_high_precision_mv, bsize,
-                                          mi_col, mi_row, block
-#if CONFIG_AMVR
-                                          ,
-                                          cm->cur_frame_mv_precision_level
-#endif
-                                          )
-                         .as_int;
-#else
-      mv[0].as_int = 0;
-      mv[1].as_int = 0;
-#endif  // CONFIG_GLOBAL_MOTION
+      mv[0].as_int =
+          gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
+                               cm->allow_high_precision_mv, bsize, mi_col,
+                               mi_row, cm->cur_frame_force_integer_mv)
+              .as_int;
+      mv[1].as_int =
+          gm_get_motion_vector(&cm->global_motion[ref_frame[1]],
+                               cm->allow_high_precision_mv, bsize, mi_col,
+                               mi_row, cm->cur_frame_force_integer_mv)
+              .as_int;
       break;
     }
     default: { return 0; }
   }
+
+  int ret = is_mv_valid(&mv[0].as_mv);
+  if (is_compound) {
+    ret = ret && is_mv_valid(&mv[1].as_mv);
+  }
   return ret;
 }
 
 static int read_is_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
                                int segment_id, aom_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
-    return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
-  } else {
-    const int ctx = av1_get_intra_inter_context(xd);
-#if CONFIG_NEW_MULTISYMBOL
-    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-    const int is_inter =
-        aom_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_STR);
-#else
-    const int is_inter = aom_read(r, cm->fc->intra_inter_prob[ctx], ACCT_STR);
-#endif
-    FRAME_COUNTS *counts = xd->counts;
-    if (counts) ++counts->intra_inter[ctx][is_inter];
-    return is_inter;
+    const int frame = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+    if (frame < LAST_FRAME) return 0;
+    return frame != INTRA_FRAME;
   }
-}
-
-#if CONFIG_COMPOUND_SINGLEREF
-static int read_is_inter_singleref_comp_mode(AV1_COMMON *const cm,
-                                             MACROBLOCKD *const xd,
-                                             int segment_id, aom_reader *r) {
-  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) return 0;
-
-  const int ctx = av1_get_inter_mode_context(xd);
-  const int is_singleref_comp_mode =
-      aom_read(r, cm->fc->comp_inter_mode_prob[ctx], ACCT_STR);
-  FRAME_COUNTS *counts = xd->counts;
-
-  if (counts) ++counts->comp_inter_mode[ctx][is_singleref_comp_mode];
-  return is_singleref_comp_mode;
-}
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-static void fpm_sync(void *const data, int mi_row) {
-  AV1Decoder *const pbi = (AV1Decoder *)data;
-  av1_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
-                       mi_row << pbi->common.mib_size_log2);
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+    return 1;
+  }
+  const int ctx = av1_get_intra_inter_context(xd);
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  const int is_inter =
+      aom_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_STR);
+  return is_inter;
 }
 
 #if DEC_MISMATCH_DEBUG
-static void dec_dump_logs(AV1_COMMON *cm, MODE_INFO *const mi, int mi_row,
-                          int mi_col,
-                          int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES],
-                          int16_t mode_ctx) {
+static void dec_dump_logs(AV1_COMMON *cm, MB_MODE_INFO *const mbmi, int mi_row,
+                          int mi_col, int16_t mode_ctx) {
   int_mv mv[2] = { { 0 } };
-  int ref;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
+  for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
     mv[ref].as_mv = mbmi->mv[ref].as_mv;
 
   const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
   int16_t zeromv_ctx = -1;
   int16_t refmv_ctx = -1;
   if (mbmi->mode != NEWMV) {
-    if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) assert(mbmi->mode == ZEROMV);
-    zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
-    if (mbmi->mode != ZEROMV) {
+    zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+    if (mbmi->mode != GLOBALMV)
       refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
-      if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6;
-      if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7;
-      if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8;
-    }
   }
 
-  int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-#define FRAME_TO_CHECK 1
+#define FRAME_TO_CHECK 11
   if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
     printf(
         "=== DECODER ===: "
-        "Frame=%d, (mi_row,mi_col)=(%d,%d), mode=%d, bsize=%d, "
+        "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, "
         "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, "
-        "ref[1]=%d, motion_mode=%d, inter_mode_ctx=%d, mode_ctx=%d, "
-        "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d\n",
-        cm->current_video_frame, mi_row, mi_col, mbmi->mode, mbmi->sb_type,
-        cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, mv[1].as_mv.row,
-        mv[1].as_mv.col, mbmi->ref_frame[0], mbmi->ref_frame[1],
-        mbmi->motion_mode, inter_mode_ctx[ref_frame_type], mode_ctx, newmv_ctx,
-        zeromv_ctx, refmv_ctx);
+        "ref[1]=%d, motion_mode=%d, mode_ctx=%d, "
+        "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n",
+        cm->current_video_frame, mi_row, mi_col, mbmi->skip_mode, mbmi->mode,
+        mbmi->sb_type, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col,
+        mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0],
+        mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, zeromv_ctx,
+        refmv_ctx, mbmi->tx_size);
   }
 }
 #endif  // DEC_MISMATCH_DEBUG
 
 static void read_inter_block_mode_info(AV1Decoder *const pbi,
                                        MACROBLOCKD *const xd,
-                                       MODE_INFO *const mi,
-#if CONFIG_SUPERTX
-                                       int mi_row, int mi_col, aom_reader *r,
-                                       int supertx_enabled) {
-#else
-                                       int mi_row, int mi_col, aom_reader *r) {
-#endif  // CONFIG_MOTION_VAR && CONFIG_SUPERTX
+                                       MB_MODE_INFO *const mbmi, int mi_row,
+                                       int mi_col, aom_reader *r) {
   AV1_COMMON *const cm = &pbi->common;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
-  const int unify_bsize = CONFIG_CB4X4;
   int_mv nearestmv[2], nearmv[2];
-  int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
-  int ref, is_compound;
-#if CONFIG_COMPOUND_SINGLEREF
-  int is_singleref_comp_mode = 0;
-#endif  // CONFIG_COMPOUND_SINGLEREF
+  int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } };
   int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
-  int16_t compound_inter_mode_ctx[MODE_CTX_REF_FRAMES];
-  int16_t mode_ctx = 0;
-#if CONFIG_WARPED_MOTION
   int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
-#if WARPED_MOTION_SORT_SAMPLES
-  int pts_mv[SAMPLES_ARRAY_SIZE];
-#endif  // WARPED_MOTION_SORT_SAMPLES
-#endif  // CONFIG_WARPED_MOTION
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
-  assert(NELEMENTS(mode_2_counter) == MB_MODE_COUNT);
-
   mbmi->uv_mode = UV_DC_PRED;
   mbmi->palette_mode_info.palette_size[0] = 0;
   mbmi->palette_mode_info.palette_size[1] = 0;
 
-  memset(ref_mvs, 0, sizeof(ref_mvs));
+  av1_collect_neighbors_ref_counts(xd);
 
   read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
-  is_compound = has_second_ref(mbmi);
-
-#if CONFIG_EXT_COMP_REFS
-#if !USE_UNI_COMP_REFS
-  // NOTE: uni-directional comp refs disabled
-  if (is_compound)
-    assert(mbmi->ref_frame[0] < BWDREF_FRAME &&
-           mbmi->ref_frame[1] >= BWDREF_FRAME);
-#endif  // !USE_UNI_COMP_REFS
-#endif  // CONFIG_EXT_COMP_REFS
-
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!is_compound)
-    is_singleref_comp_mode =
-        read_is_inter_singleref_comp_mode(cm, xd, mbmi->segment_id, r);
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
-
-    av1_find_mv_refs(cm, xd, mi, frame, &xd->ref_mv_count[frame],
-                     xd->ref_mv_stack[frame], compound_inter_mode_ctx,
-                     ref_mvs[frame], mi_row, mi_col, fpm_sync, (void *)pbi,
-                     inter_mode_ctx);
-  }
+  const int is_compound = has_second_ref(mbmi);
 
-  if (is_compound) {
-    MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
-    av1_find_mv_refs(cm, xd, mi, ref_frame, &xd->ref_mv_count[ref_frame],
-                     xd->ref_mv_stack[ref_frame], compound_inter_mode_ctx,
-                     ref_mvs[ref_frame], mi_row, mi_col, fpm_sync, (void *)pbi,
-                     inter_mode_ctx);
+  MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
+  int_mv global_mvs[REF_FRAMES];
+  av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack,
+                   ref_mvs, global_mvs, mi_row, mi_col, inter_mode_ctx);
 
-    if (xd->ref_mv_count[ref_frame] < 2) {
-      MV_REFERENCE_FRAME rf[2];
-      int_mv zeromv[2];
-      av1_set_ref_frame(rf, ref_frame);
-#if CONFIG_GLOBAL_MOTION
-      zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[rf[0]],
-                                              cm->allow_high_precision_mv,
-                                              bsize, mi_col, mi_row, 0
-#if CONFIG_AMVR
-                                              ,
-                                              cm->cur_frame_mv_precision_level
-#endif
-                                              )
-                             .as_int;
-      zeromv[1].as_int =
-          (rf[1] != NONE_FRAME)
-              ? gm_get_motion_vector(&cm->global_motion[rf[1]],
-                                     cm->allow_high_precision_mv, bsize, mi_col,
-                                     mi_row, 0
-#if CONFIG_AMVR
-                                     ,
-                                     cm->cur_frame_mv_precision_level
-#endif
-                                     )
-                    .as_int
-              : 0;
-#else
-      zeromv[0].as_int = zeromv[1].as_int = 0;
-#endif
-      for (ref = 0; ref < 2; ++ref) {
-        if (rf[ref] == NONE_FRAME) continue;
-#if CONFIG_AMVR
-        lower_mv_precision(&ref_mvs[rf[ref]][0].as_mv, allow_hp,
-                           cm->cur_frame_mv_precision_level);
-        lower_mv_precision(&ref_mvs[rf[ref]][1].as_mv, allow_hp,
-                           cm->cur_frame_mv_precision_level);
-#else
-        lower_mv_precision(&ref_mvs[rf[ref]][0].as_mv, allow_hp);
-        lower_mv_precision(&ref_mvs[rf[ref]][1].as_mv, allow_hp);
-#endif
-        if (ref_mvs[rf[ref]][0].as_int != zeromv[ref].as_int ||
-            ref_mvs[rf[ref]][1].as_int != zeromv[ref].as_int)
-          inter_mode_ctx[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET);
-      }
-    }
-  }
-
-#if CONFIG_COMPOUND_SINGLEREF
-  if (is_compound || is_singleref_comp_mode)
-#else   // !CONFIG_COMPOUND_SINGLEREF
-  if (is_compound)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    mode_ctx = compound_inter_mode_ctx[mbmi->ref_frame[0]];
-  else
-    mode_ctx =
-        av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame, bsize, -1);
+  int mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame);
   mbmi->ref_mv_idx = 0;
 
-#if CONFIG_SEGMENT_ZEROMV
-  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
-      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_ZEROMV)) {
-#else
-  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-#endif
-    mbmi->mode = ZEROMV;
-    if (bsize < BLOCK_8X8 && !unify_bsize) {
-      aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                         "Invalid usage of segment feature on small blocks");
-      return;
-    }
+  if (mbmi->skip_mode) {
+    assert(is_compound);
+    mbmi->mode = NEAREST_NEARESTMV;
   } else {
-    if (bsize >= BLOCK_8X8 || unify_bsize) {
+    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
+        segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) {
+      mbmi->mode = GLOBALMV;
+    } else {
       if (is_compound)
-        mbmi->mode = read_inter_compound_mode(cm, xd, r, mode_ctx);
-#if CONFIG_COMPOUND_SINGLEREF
-      else if (is_singleref_comp_mode)
-        mbmi->mode = read_inter_singleref_comp_mode(xd, r, mode_ctx);
-#endif  // CONFIG_COMPOUND_SINGLEREF
+        mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx);
       else
-        mbmi->mode = read_inter_mode(ec_ctx, xd, r, mode_ctx);
+        mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx);
       if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
-#if CONFIG_COMPOUND_SINGLEREF
-          mbmi->mode == SR_NEW_NEWMV ||
-#endif  // CONFIG_COMPOUND_SINGLEREF
           have_nearmv_in_inter_mode(mbmi->mode))
         read_drl_idx(ec_ctx, xd, mbmi, r);
     }
   }
 
-  if ((bsize < BLOCK_8X8 && !unify_bsize) ||
-      (mbmi->mode != ZEROMV && mbmi->mode != ZERO_ZEROMV)) {
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-#if CONFIG_AMVR
-      av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[ref]],
-                            &nearestmv[ref], &nearmv[ref],
-                            cm->cur_frame_mv_precision_level);
-#else
-      av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[ref]],
-                            &nearestmv[ref], &nearmv[ref]);
-#endif
-    }
+  if (is_compound != is_inter_compound_mode(mbmi->mode)) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Prediction mode %d invalid with ref frame %d %d",
+                       mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   }
 
-#if CONFIG_COMPOUND_SINGLEREF
-  if ((is_compound || is_singleref_comp_mode) &&
-      (bsize >= BLOCK_8X8 || unify_bsize) && mbmi->mode != ZERO_ZEROMV)
-#else   // !CONFIG_COMPOUND_SINGLEREF
-  if (is_compound && (bsize >= BLOCK_8X8 || unify_bsize) &&
-      mbmi->mode != ZERO_ZEROMV)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  {
-    uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-
-    if (xd->ref_mv_count[ref_frame_type] > 0) {
-      if (mbmi->mode == NEAREST_NEARESTMV) {
-        nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
-        nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
-#if CONFIG_AMVR
-        lower_mv_precision(&nearestmv[0].as_mv, allow_hp,
-                           cm->cur_frame_mv_precision_level);
-        lower_mv_precision(&nearestmv[1].as_mv, allow_hp,
-                           cm->cur_frame_mv_precision_level);
-#else
-        lower_mv_precision(&nearestmv[0].as_mv, allow_hp);
-        lower_mv_precision(&nearestmv[1].as_mv, allow_hp);
-#endif
-      } else if (mbmi->mode == NEAREST_NEWMV
-#if CONFIG_COMPOUND_SINGLEREF
-                 || mbmi->mode == SR_NEAREST_NEARMV
-// || mbmi->mode == SR_NEAREST_NEWMV
-#endif  // CONFIG_COMPOUND_SINGLEREF
-                 ) {
-        nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
-
-#if CONFIG_AMVR
-        lower_mv_precision(&nearestmv[0].as_mv, allow_hp,
-                           cm->cur_frame_mv_precision_level);
-#else
-        lower_mv_precision(&nearestmv[0].as_mv, allow_hp);
-#endif
-      } else if (mbmi->mode == NEW_NEARESTMV) {
-        nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
-#if CONFIG_AMVR
-        lower_mv_precision(&nearestmv[1].as_mv, allow_hp,
-                           cm->cur_frame_mv_precision_level);
-#else
-        lower_mv_precision(&nearestmv[1].as_mv, allow_hp);
-#endif
-      }
-    }
-
-    if (xd->ref_mv_count[ref_frame_type] > 1) {
-      int ref_mv_idx = 1 + mbmi->ref_mv_idx;
-#if CONFIG_COMPOUND_SINGLEREF
-      if (is_compound) {
-#endif  // CONFIG_COMPOUND_SINGLEREF
-        if (compound_ref0_mode(mbmi->mode) == NEARMV) {
-          nearmv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
-#if CONFIG_AMVR
-          lower_mv_precision(&nearmv[0].as_mv, allow_hp,
-                             cm->cur_frame_mv_precision_level);
-#else
-        lower_mv_precision(&nearmv[0].as_mv, allow_hp);
-#endif
-        }
+  if (!is_compound && mbmi->mode != GLOBALMV) {
+    av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0],
+                          &nearmv[0], cm->cur_frame_force_integer_mv);
+  }
 
-        if (compound_ref1_mode(mbmi->mode) == NEARMV) {
-          nearmv[1] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
-#if CONFIG_AMVR
-          lower_mv_precision(&nearmv[1].as_mv, allow_hp,
-                             cm->cur_frame_mv_precision_level);
-#else
-        lower_mv_precision(&nearmv[1].as_mv, allow_hp);
-#endif
-        }
-#if CONFIG_COMPOUND_SINGLEREF
-      } else {
-        assert(is_singleref_comp_mode);
-        if (compound_ref0_mode(mbmi->mode) == NEARMV ||
-            compound_ref1_mode(mbmi->mode) == NEARMV) {
-          nearmv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
-          lower_mv_precision(&nearmv[0].as_mv, allow_hp);
-        }
-      }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    }
+  if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) {
+    int ref_mv_idx = mbmi->ref_mv_idx + 1;
+    nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv;
+    nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv;
+    nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
+    nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
+    lower_mv_precision(&nearestmv[0].as_mv, allow_hp,
+                       cm->cur_frame_force_integer_mv);
+    lower_mv_precision(&nearestmv[1].as_mv, allow_hp,
+                       cm->cur_frame_force_integer_mv);
+    lower_mv_precision(&nearmv[0].as_mv, allow_hp,
+                       cm->cur_frame_force_integer_mv);
+    lower_mv_precision(&nearmv[1].as_mv, allow_hp,
+                       cm->cur_frame_force_integer_mv);
   } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) {
     int_mv cur_mv =
         xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv;
     nearmv[0] = cur_mv;
   }
 
-#if !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION
-  read_mb_interp_filter(cm, xd, mbmi, r);
-#endif  // !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION
-
-  if (bsize < BLOCK_8X8 && !unify_bsize) {
-    const int num_4x4_w = 1 << xd->bmode_blocks_wl;
-    const int num_4x4_h = 1 << xd->bmode_blocks_hl;
-    int idx, idy;
-    PREDICTION_MODE b_mode;
-    int_mv nearest_sub8x8[2], near_sub8x8[2];
-    int_mv ref_mv[2][2];
-    for (idy = 0; idy < 2; idy += num_4x4_h) {
-      for (idx = 0; idx < 2; idx += num_4x4_w) {
-        int_mv block[2];
-        const int j = idy * 2 + idx;
-        int_mv ref_mv_s8[2];
-        if (!is_compound)
-          mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame,
-                                               bsize, j);
-        if (is_compound)
-          b_mode = read_inter_compound_mode(cm, xd, r, mode_ctx);
-        else
-          b_mode = read_inter_mode(ec_ctx, xd, r, mode_ctx);
-
-        if (b_mode != ZEROMV && b_mode != ZERO_ZEROMV) {
-          CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE];
-          uint8_t ref_mv_count[2];
-          for (ref = 0; ref < 1 + is_compound; ++ref) {
-            int_mv mv_ref_list[MAX_MV_REF_CANDIDATES];
-            av1_update_mv_context(cm, xd, mi, mbmi->ref_frame[ref], mv_ref_list,
-                                  j, mi_row, mi_col, NULL);
-            av1_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col,
-                                          ref_mv_stack[ref], &ref_mv_count[ref],
-                                          mv_ref_list, &nearest_sub8x8[ref],
-                                          &near_sub8x8[ref]);
-            if (have_newmv_in_inter_mode(b_mode)) {
-              mv_ref_list[0].as_int = nearest_sub8x8[ref].as_int;
-              mv_ref_list[1].as_int = near_sub8x8[ref].as_int;
-#if CONFIG_AMVR
-              av1_find_best_ref_mvs(allow_hp, mv_ref_list, &ref_mv[0][ref],
-                                    &ref_mv[1][ref],
-                                    cm->cur_frame_mv_precision_level);
-#else
-              av1_find_best_ref_mvs(allow_hp, mv_ref_list, &ref_mv[0][ref],
-                                    &ref_mv[1][ref]);
-#endif
-            }
-          }
-        }
-
-        for (ref = 0; ref < 1 + is_compound && b_mode != ZEROMV; ++ref) {
-          ref_mv_s8[ref] = nearest_sub8x8[ref];
-#if CONFIG_AMVR
-          lower_mv_precision(&ref_mv_s8[ref].as_mv, allow_hp,
-                             cm->cur_frame_mv_precision_level);
-#else
-          lower_mv_precision(&ref_mv_s8[ref].as_mv, allow_hp);
-#endif
-        }
-        (void)ref_mv_s8;
-
-        if (!assign_mv(cm, xd, b_mode, mbmi->ref_frame, j, block, ref_mv[0],
-                       nearest_sub8x8, near_sub8x8, mi_row, mi_col, is_compound,
-                       allow_hp, r)) {
-          aom_merge_corrupted_flag(&xd->corrupted, 1);
-          break;
-        };
-
-        mi->bmi[j].as_mv[0].as_int = block[0].as_int;
-        mi->bmi[j].as_mode = b_mode;
-        if (is_compound) mi->bmi[j].as_mv[1].as_int = block[1].as_int;
-
-        if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j];
-        if (num_4x4_w == 2) mi->bmi[j + 1] = mi->bmi[j];
-      }
-    }
+  int_mv ref_mv[2];
+  ref_mv[0] = nearestmv[0];
+  ref_mv[1] = nearestmv[1];
 
-    mbmi->pred_mv[0].as_int = mi->bmi[3].pred_mv[0].as_int;
-    mbmi->pred_mv[1].as_int = mi->bmi[3].pred_mv[1].as_int;
-    mi->mbmi.mode = b_mode;
-
-    mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
-    mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+  if (is_compound) {
+    int ref_mv_idx = mbmi->ref_mv_idx;
+    // Special case: NEAR_NEWMV and NEW_NEARMV modes use
+    // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
+    // mbmi->ref_mv_idx (like NEWMV)
+    if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
+      ref_mv_idx = 1 + mbmi->ref_mv_idx;
+
+    // TODO(jingning, yunqing): Do we need a lower_mv_precision() call here?
+    if (compound_ref0_mode(mbmi->mode) == NEWMV)
+      ref_mv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
+
+    if (compound_ref1_mode(mbmi->mode) == NEWMV)
+      ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
   } else {
-    int_mv ref_mv[2];
-    ref_mv[0] = nearestmv[0];
-    ref_mv[1] = nearestmv[1];
-
-    if (is_compound) {
-      int ref_mv_idx = mbmi->ref_mv_idx;
-      // Special case: NEAR_NEWMV and NEW_NEARMV modes use
-      // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
-      // mbmi->ref_mv_idx (like NEWMV)
-      if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
-        ref_mv_idx = 1 + mbmi->ref_mv_idx;
-
-      if (compound_ref0_mode(mbmi->mode) == NEWMV) {
-        uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-        if (xd->ref_mv_count[ref_frame_type] > 1) {
-          ref_mv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
-          clamp_mv_ref(&ref_mv[0].as_mv, xd->n8_w << MI_SIZE_LOG2,
-                       xd->n8_h << MI_SIZE_LOG2, xd);
-        }
-        nearestmv[0] = ref_mv[0];
-      }
-      if (compound_ref1_mode(mbmi->mode) == NEWMV) {
-        uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-        if (xd->ref_mv_count[ref_frame_type] > 1) {
-          ref_mv[1] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
-          clamp_mv_ref(&ref_mv[1].as_mv, xd->n8_w << MI_SIZE_LOG2,
-                       xd->n8_h << MI_SIZE_LOG2, xd);
-        }
-        nearestmv[1] = ref_mv[1];
-      }
-#if CONFIG_COMPOUND_SINGLEREF
-    } else if (is_singleref_comp_mode) {
-      int ref_mv_idx = mbmi->ref_mv_idx;
-      // Special case: SR_NEAR_NEWMV use 1 + mbmi->ref_mv_idx (like NEARMV)
-      //               instead of mbmi->ref_mv_idx (like NEWMV)
-      if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1 + mbmi->ref_mv_idx;
-
-      if (compound_ref0_mode(mbmi->mode) == NEWMV ||
-          compound_ref1_mode(mbmi->mode) == NEWMV) {
-        uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-        if (xd->ref_mv_count[ref_frame_type] > 1) {
-          ref_mv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
-          clamp_mv_ref(&ref_mv[0].as_mv, xd->n8_w << MI_SIZE_LOG2,
-                       xd->n8_h << MI_SIZE_LOG2, xd);
-        }
-        // TODO(zoeliu): To further investigate why this would not cause a
-        //               mismatch for the mode of SR_NEAREST_NEWMV.
-        nearestmv[0] = ref_mv[0];
-      }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    } else {
-      if (mbmi->mode == NEWMV) {
-        for (ref = 0; ref < 1 + is_compound; ++ref) {
-          uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-          if (xd->ref_mv_count[ref_frame_type] > 1) {
-            ref_mv[ref] =
-                (ref == 0)
-                    ? xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx].this_mv
-                    : xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
-                          .comp_mv;
-            clamp_mv_ref(&ref_mv[ref].as_mv, xd->n8_w << MI_SIZE_LOG2,
-                         xd->n8_h << MI_SIZE_LOG2, xd);
-          }
-          nearestmv[ref] = ref_mv[ref];
-        }
-      }
+    if (mbmi->mode == NEWMV) {
+      if (xd->ref_mv_count[ref_frame] > 1)
+        ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv;
     }
+  }
 
+  if (mbmi->skip_mode) {
+    assert(mbmi->mode == NEAREST_NEARESTMV);
+    mbmi->mv[0].as_int = nearestmv[0].as_int;
+    mbmi->mv[1].as_int = nearestmv[1].as_int;
+  } else {
     int mv_corrupted_flag =
-        !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, 0, mbmi->mv, ref_mv,
+        !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv,
                    nearestmv, nearmv, mi_row, mi_col, is_compound, allow_hp, r);
     aom_merge_corrupted_flag(&xd->corrupted, mv_corrupted_flag);
   }
 
-#if CONFIG_INTERINTRA
   mbmi->use_wedge_interintra = 0;
-  if (cm->reference_mode != COMPOUND_REFERENCE &&
-#if CONFIG_SUPERTX
-      !supertx_enabled &&
-#endif
-      cm->allow_interintra_compound && is_interintra_allowed(mbmi)) {
+  if (cm->seq_params.enable_interintra_compound && !mbmi->skip_mode &&
+      is_interintra_allowed(mbmi)) {
     const int bsize_group = size_group_lookup[bsize];
-#if CONFIG_NEW_MULTISYMBOL
     const int interintra =
         aom_read_symbol(r, ec_ctx->interintra_cdf[bsize_group], 2, ACCT_STR);
-#else
-    const int interintra =
-        aom_read(r, cm->fc->interintra_prob[bsize_group], ACCT_STR);
-#endif
-    if (xd->counts) xd->counts->interintra[bsize_group][interintra]++;
     assert(mbmi->ref_frame[1] == NONE_FRAME);
     if (interintra) {
       const INTERINTRA_MODE interintra_mode =
-          read_interintra_mode(cm, xd, r, bsize_group);
+          read_interintra_mode(xd, r, bsize_group);
       mbmi->ref_frame[1] = INTRA_FRAME;
       mbmi->interintra_mode = interintra_mode;
-#if CONFIG_EXT_INTRA
-      mbmi->angle_delta[0] = 0;
-      mbmi->angle_delta[1] = 0;
-#if CONFIG_INTRA_INTERP
-      mbmi->intra_filter = INTRA_FILTER_LINEAR;
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-      mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
-      mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
-#endif  // CONFIG_FILTER_INTRA
+      mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+      mbmi->filter_intra_mode_info.use_filter_intra = 0;
       if (is_interintra_wedge_used(bsize)) {
-#if CONFIG_NEW_MULTISYMBOL
         mbmi->use_wedge_interintra = aom_read_symbol(
             r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR);
-#else
-        mbmi->use_wedge_interintra =
-            aom_read(r, cm->fc->wedge_interintra_prob[bsize], ACCT_STR);
-#endif
-        if (xd->counts)
-          xd->counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
         if (mbmi->use_wedge_interintra) {
           mbmi->interintra_wedge_index =
-              aom_read_literal(r, get_wedge_bits_lookup(bsize), ACCT_STR);
+              aom_read_symbol(r, ec_ctx->wedge_idx_cdf[bsize], 16, ACCT_STR);
           mbmi->interintra_wedge_sign = 0;
         }
       }
     }
   }
-#endif  // CONFIG_INTERINTRA
 
-#if CONFIG_WARPED_MOTION
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+  for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
     const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
     RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
 
     xd->block_refs[ref] = ref_buf;
   }
-#endif
 
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
   mbmi->motion_mode = SIMPLE_TRANSLATION;
-#if CONFIG_WARPED_MOTION
-  if (mbmi->sb_type >= BLOCK_8X8 && !has_second_ref(mbmi))
-#if WARPED_MOTION_SORT_SAMPLES
-    mbmi->num_proj_ref[0] =
-        findSamples(cm, xd, mi_row, mi_col, pts, pts_inref, pts_mv);
-#else
+  if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode &&
+      !has_second_ref(mbmi))
     mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
-#endif  // WARPED_MOTION_SORT_SAMPLES
-#endif  // CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR
   av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
-#endif
 
-#if CONFIG_SUPERTX
-  if (!supertx_enabled) {
-#endif  // CONFIG_SUPERTX
-    if (mbmi->ref_frame[1] != INTRA_FRAME)
-      mbmi->motion_mode = read_motion_mode(cm, xd, mi, r);
+  if (mbmi->ref_frame[1] != INTRA_FRAME)
+    mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r);
 
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-    read_ncobmc_mode(xd, mi, mbmi->ncobmc_mode, r);
-#endif
+  // init
+  mbmi->comp_group_idx = 0;
+  mbmi->compound_idx = 1;
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
 
-#if CONFIG_COMPOUND_SINGLEREF
-    if (is_singleref_comp_mode) assert(mbmi->motion_mode == SIMPLE_TRANSLATION);
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_WARPED_MOTION
-    if (mbmi->motion_mode == WARPED_CAUSAL) {
-      mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
-
-#if WARPED_MOTION_SORT_SAMPLES
-      if (mbmi->num_proj_ref[0] > 1)
-        mbmi->num_proj_ref[0] = sortSamples(pts_mv, &mbmi->mv[0].as_mv, pts,
-                                            pts_inref, mbmi->num_proj_ref[0]);
-#endif  // WARPED_MOTION_SORT_SAMPLES
-
-      if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
-                          mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
-                          &mbmi->wm_params[0], mi_row, mi_col)) {
-        aom_internal_error(&cm->error, AOM_CODEC_ERROR, "Invalid Warped Model");
-      }
+  if (has_second_ref(mbmi) && !mbmi->skip_mode) {
+    // Read idx to indicate current compound inter prediction mode group
+    const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+                                     cm->seq_params.enable_masked_compound;
+
+    if (masked_compound_used) {
+      const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
+      mbmi->comp_group_idx = aom_read_symbol(
+          r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR);
     }
-#endif  // CONFIG_WARPED_MOTION
-#if CONFIG_SUPERTX
-  }
-#endif  // CONFIG_SUPERTX
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
-  mbmi->interinter_compound_type = COMPOUND_AVERAGE;
-  if (
-#if CONFIG_COMPOUND_SINGLEREF
-      is_inter_anyref_comp_mode(mbmi->mode)
-#else   // !CONFIG_COMPOUND_SINGLEREF
-      cm->reference_mode != SINGLE_REFERENCE &&
-      is_inter_compound_mode(mbmi->mode)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-      && mbmi->motion_mode == SIMPLE_TRANSLATION
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-      ) {
-    if (is_any_masked_compound_used(bsize)) {
-#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-      if (cm->allow_masked_compound) {
-#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
-        if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize))
-          mbmi->interinter_compound_type =
-              aom_read_bit(r, ACCT_STR) ? COMPOUND_AVERAGE : COMPOUND_SEG;
-        else
-#endif  // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
-          mbmi->interinter_compound_type = aom_read_symbol(
-              r, ec_ctx->compound_type_cdf[bsize], COMPOUND_TYPES, ACCT_STR);
-#if CONFIG_WEDGE
-        if (mbmi->interinter_compound_type == COMPOUND_WEDGE) {
-          assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
-          mbmi->wedge_index =
-              aom_read_literal(r, get_wedge_bits_lookup(bsize), ACCT_STR);
-          mbmi->wedge_sign = aom_read_bit(r, ACCT_STR);
-        }
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-        if (mbmi->interinter_compound_type == COMPOUND_SEG) {
-          mbmi->mask_type = aom_read_literal(r, MAX_SEG_MASK_BITS, ACCT_STR);
-        }
-#endif  // CONFIG_COMPOUND_SEGMENT
+
+    if (mbmi->comp_group_idx == 0) {
+      if (cm->seq_params.enable_jnt_comp) {
+        const int comp_index_ctx = get_comp_index_context(cm, xd);
+        mbmi->compound_idx = aom_read_symbol(
+            r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR);
+      } else {
+        // Distance-weighted compound is disabled, so always use average
+        mbmi->compound_idx = 1;
       }
-#endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
     } else {
-      mbmi->interinter_compound_type = COMPOUND_AVERAGE;
+      assert(cm->reference_mode != SINGLE_REFERENCE &&
+             is_inter_compound_mode(mbmi->mode) &&
+             mbmi->motion_mode == SIMPLE_TRANSLATION);
+      assert(masked_compound_used);
+
+      // compound_diffwtd, wedge
+      if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
+        mbmi->interinter_comp.type =
+            1 + aom_read_symbol(r, ec_ctx->compound_type_cdf[bsize],
+                                COMPOUND_TYPES - 1, ACCT_STR);
+      else
+        mbmi->interinter_comp.type = COMPOUND_DIFFWTD;
+
+      if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+        assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+        mbmi->interinter_comp.wedge_index =
+            aom_read_symbol(r, ec_ctx->wedge_idx_cdf[bsize], 16, ACCT_STR);
+        mbmi->interinter_comp.wedge_sign = aom_read_bit(r, ACCT_STR);
+      } else {
+        assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+        mbmi->interinter_comp.mask_type =
+            aom_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_STR);
+      }
     }
-    if (xd->counts)
-      xd->counts->compound_interinter[bsize][mbmi->interinter_compound_type]++;
   }
 
-#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
   read_mb_interp_filter(cm, xd, mbmi, r);
-#endif  // CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION
+
+  if (mbmi->motion_mode == WARPED_CAUSAL) {
+    mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
+    mbmi->wm_params[0].invalid = 0;
+
+    if (mbmi->num_proj_ref[0] > 1)
+      mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+                                            mbmi->num_proj_ref[0], bsize);
+
+    if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
+                        mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+                        &mbmi->wm_params[0], mi_row, mi_col)) {
+#if WARPED_MOTION_DEBUG
+      printf("Warning: unexpected warped model from aomenc\n");
+#endif
+      mbmi->wm_params[0].invalid = 1;
+    }
+  }
+
+  xd->cfl.is_chroma_reference = is_chroma_reference(
+      mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
+  xd->cfl.store_y = store_cfl_required(cm, xd);
 
 #if DEC_MISMATCH_DEBUG
-  dec_dump_logs(cm, mi, mi_row, mi_col, inter_mode_ctx, mode_ctx);
+  dec_dump_logs(cm, mi, mi_row, mi_col, mode_ctx);
 #endif  // DEC_MISMATCH_DEBUG
 }
 
 static void read_inter_frame_mode_info(AV1Decoder *const pbi,
-                                       MACROBLOCKD *const xd,
-#if CONFIG_SUPERTX
-                                       int supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                                       int mi_row, int mi_col, aom_reader *r) {
+                                       MACROBLOCKD *const xd, int mi_row,
+                                       int mi_col, aom_reader *r) {
   AV1_COMMON *const cm = &pbi->common;
-  MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   int inter_block = 1;
-#if CONFIG_VAR_TX
-  BLOCK_SIZE bsize = mbmi->sb_type;
-#endif  // CONFIG_VAR_TX
 
   mbmi->mv[0].as_int = 0;
   mbmi->mv[1].as_int = 0;
-  mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
-#if CONFIG_SUPERTX
-  if (!supertx_enabled)
-#endif  // CONFIG_SUPERTX
+  mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 1, r);
+
+  mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r);
+
+  if (mbmi->skip_mode)
+    mbmi->skip = 1;
+  else
     mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
 
+  mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 0, r);
+
+  read_cdef(cm, r, xd, mi_col, mi_row);
+
   if (cm->delta_q_present_flag) {
-    xd->current_qindex =
-        xd->prev_qindex +
+    xd->current_qindex +=
         read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
     /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
     xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
-    xd->prev_qindex = xd->current_qindex;
-#if CONFIG_EXT_DELTA_Q
     if (cm->delta_lf_present_flag) {
-#if CONFIG_LOOPFILTER_LEVEL
       if (cm->delta_lf_multi) {
-        for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) {
-          mbmi->curr_delta_lf[lf_id] = xd->curr_delta_lf[lf_id] =
-              xd->prev_delta_lf[lf_id] +
+        const int frame_lf_count =
+            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+          const int tmp_lvl =
+              xd->delta_lf[lf_id] +
               read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) *
                   cm->delta_lf_res;
-          xd->prev_delta_lf[lf_id] = xd->curr_delta_lf[lf_id];
+          mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
+              clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
         }
       } else {
-        mbmi->current_delta_lf_from_base = xd->current_delta_lf_from_base =
-            xd->prev_delta_lf_from_base +
+        const int tmp_lvl =
+            xd->delta_lf_from_base +
             read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) *
                 cm->delta_lf_res;
-        xd->prev_delta_lf_from_base = xd->current_delta_lf_from_base;
+        mbmi->delta_lf_from_base = xd->delta_lf_from_base =
+            clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
       }
-#else
-      const int current_delta_lf_from_base =
-          xd->prev_delta_lf_from_base +
-          read_delta_lflevel(cm, xd, r, mbmi, mi_col, mi_row) *
-              cm->delta_lf_res;
-      mbmi->current_delta_lf_from_base = xd->current_delta_lf_from_base =
-          clamp(current_delta_lf_from_base, 0, MAX_LOOP_FILTER);
-      xd->prev_delta_lf_from_base = xd->current_delta_lf_from_base;
-#endif  // CONFIG_LOOPFILTER_LEVEL
     }
-#endif
   }
 
-#if CONFIG_SUPERTX
-  if (!supertx_enabled) {
-#endif  // CONFIG_SUPERTX
+  if (!mbmi->skip_mode)
     inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
 
-#if CONFIG_VAR_TX
-    xd->above_txfm_context =
-        cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2);
-    xd->left_txfm_context = xd->left_txfm_context_buffer +
-                            ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2);
-
-    if (cm->tx_mode == TX_MODE_SELECT &&
-#if CONFIG_CB4X4
-        bsize > BLOCK_4X4 &&
-#else
-        bsize >= BLOCK_8X8 &&
-#endif
-        !mbmi->skip && inter_block && !xd->lossless[mbmi->segment_id]) {
-      const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
-      const int bh = tx_size_high_unit[max_tx_size];
-      const int bw = tx_size_wide_unit[max_tx_size];
-      const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
-      const int height = block_size_high[bsize] >> tx_size_wide_log2[0];
-      int idx, idy;
-      int init_depth =
-          (height != width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT;
-
-      mbmi->min_tx_size = TX_SIZES_ALL;
-      for (idy = 0; idy < height; idy += bh)
-        for (idx = 0; idx < width; idx += bw)
-          read_tx_size_vartx(cm, xd, mbmi, xd->counts, max_tx_size, init_depth,
-                             idy, idx, r);
-#if CONFIG_RECT_TX_EXT
-      if (is_quarter_tx_allowed(xd, mbmi, inter_block) &&
-          mbmi->tx_size == max_tx_size) {
-        int quarter_tx;
-
-        if (quarter_txsize_lookup[bsize] != max_tx_size) {
-#if CONFIG_NEW_MULTISYMBOL
-          quarter_tx =
-              aom_read_symbol(r, cm->fc->quarter_tx_size_cdf, 2, ACCT_STR);
-#else
-          quarter_tx = aom_read(r, cm->fc->quarter_tx_size_prob, ACCT_STR);
-          if (xd->counts) ++xd->counts->quarter_tx_size[quarter_tx];
-#endif
-        } else {
-          quarter_tx = 1;
-        }
-        if (quarter_tx) {
-          mbmi->tx_size = quarter_txsize_lookup[bsize];
-          for (idy = 0; idy < tx_size_high_unit[max_tx_size] / 2; ++idy)
-            for (idx = 0; idx < tx_size_wide_unit[max_tx_size] / 2; ++idx)
-              mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
-          mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
-        }
-      }
-#endif
-    } else {
-      mbmi->tx_size = read_tx_size(cm, xd, inter_block, !mbmi->skip, r);
-
-      if (inter_block) {
-        const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
-        const int height = block_size_high[bsize] >> tx_size_high_log2[0];
-        int idx, idy;
-        for (idy = 0; idy < height; ++idy)
-          for (idx = 0; idx < width; ++idx)
-            mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size;
-      }
-      mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
-      set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, mbmi->skip, xd);
-    }
-#else
-  mbmi->tx_size = read_tx_size(cm, xd, inter_block, !mbmi->skip, r);
-#endif  // CONFIG_VAR_TX
-#if CONFIG_SUPERTX
-  }
-#if CONFIG_VAR_TX
-  else if (inter_block) {
-    const int width = num_4x4_blocks_wide_lookup[bsize];
-    const int height = num_4x4_blocks_high_lookup[bsize];
-    int idx, idy;
-    xd->mi[0]->mbmi.tx_size = xd->supertx_size;
-    for (idy = 0; idy < height; ++idy)
-      for (idx = 0; idx < width; ++idx)
-        xd->mi[0]->mbmi.inter_tx_size[idy >> 1][idx >> 1] = xd->supertx_size;
-  }
-#endif  // CONFIG_VAR_TX
-#endif  // CONFIG_SUPERTX
+  mbmi->current_qindex = xd->current_qindex;
+
+  xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
   if (inter_block)
-    read_inter_block_mode_info(pbi, xd,
-#if CONFIG_SUPERTX
-                               mi, mi_row, mi_col, r, supertx_enabled);
-#else
-                               mi, mi_row, mi_col, r);
-#endif  // CONFIG_MOTION_VAR && CONFIG_SUPERTX
+    read_inter_block_mode_info(pbi, xd, mbmi, mi_row, mi_col, r);
   else
-    read_intra_block_mode_info(cm, mi_row, mi_col, xd, mi, r);
-
-#if !CONFIG_TXK_SEL
-  av1_read_tx_type(cm, xd,
-#if CONFIG_SUPERTX
-                   supertx_enabled,
-#endif
-                   r);
-#endif  // !CONFIG_TXK_SEL
+    read_intra_block_mode_info(cm, mi_row, mi_col, xd, mbmi, r);
 }
 
-static void av1_intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row,
-                                     int mi_col, int x_mis, int y_mis) {
-#if CONFIG_TMV
+static void intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, int mi_col,
+                                 int x_mis, int y_mis) {
   const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
-  MV_REF *frame_mvs = cm->cur_frame->mvs +
-                      ((mi_row & 0xfffe) >> 1) * frame_mvs_stride +
-                      ((mi_col & 0xfffe) >> 1);
+  MV_REF *frame_mvs =
+      cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
   x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
   y_mis = ROUND_POWER_OF_TWO(y_mis, 1);
-#else
-  const int frame_mvs_stride = cm->mi_cols;
-  MV_REF *frame_mvs = cm->cur_frame->mvs +
-                      (mi_row & 0xfffe) * frame_mvs_stride + (mi_col & 0xfffe);
-  x_mis = AOMMAX(x_mis, 2);
-  y_mis = AOMMAX(y_mis, 2);
-#endif  // CONFIG_TMV
-  int w, h;
-
-  for (h = 0; h < y_mis; h++) {
-    MV_REF *const frame_mv = frame_mvs + h * frame_mvs_stride;
-    for (w = 0; w < x_mis; w++) {
-      MV_REF *const mv = frame_mv + w;
-      mv->ref_frame[0] = NONE_FRAME;
-      mv->ref_frame[1] = NONE_FRAME;
+
+  for (int h = 0; h < y_mis; h++) {
+    MV_REF *mv = frame_mvs;
+    for (int w = 0; w < x_mis; w++) {
+      mv->ref_frame = NONE_FRAME;
+      mv++;
     }
+    frame_mvs += frame_mvs_stride;
   }
 }
 
-void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
-#if CONFIG_SUPERTX
-                        int supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                        int mi_row, int mi_col, aom_reader *r, int x_mis,
-                        int y_mis) {
+void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, int mi_row,
+                        int mi_col, aom_reader *r, int x_mis, int y_mis) {
   AV1_COMMON *const cm = &pbi->common;
-  MODE_INFO *const mi = xd->mi[0];
-#if CONFIG_INTRABC
-  mi->mbmi.use_intrabc = 0;
-#endif  // CONFIG_INTRABC
+  MB_MODE_INFO *const mi = xd->mi[0];
+  mi->use_intrabc = 0;
 
   if (frame_is_intra_only(cm)) {
     read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
-    av1_intra_copy_frame_mvs(cm, mi_row, mi_col, x_mis, y_mis);
+    intra_copy_frame_mvs(cm, mi_row, mi_col, x_mis, y_mis);
   } else {
-    read_inter_frame_mode_info(pbi, xd,
-#if CONFIG_SUPERTX
-                               supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                               mi_row, mi_col, r);
+    read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
     av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
   }
 }
diff --git a/third_party/aom/av1/decoder/decodemv.h b/third_party/aom/av1/decoder/decodemv.h
index 162cf3254..6243bb168 100644
--- a/third_party/aom/av1/decoder/decodemv.h
+++ b/third_party/aom/av1/decoder/decodemv.h
@@ -21,9 +21,6 @@ extern "C" {
 #endif
 
 void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
-#if CONFIG_SUPERTX
-                        int supertx_enabled,
-#endif
 
                         int mi_row, int mi_col, aom_reader *r, int x_mis,
                         int y_mis);
@@ -32,14 +29,7 @@ void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
 }  // extern "C"
 #endif
 
-void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd,
-#if CONFIG_SUPERTX
-                      int supertx_enabled,
-#endif
-#if CONFIG_TXK_SEL
-                      int blk_row, int blk_col, int block, int plane,
-                      TX_SIZE tx_size,
-#endif
-                      aom_reader *r);
+void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
+                      int blk_col, TX_SIZE tx_size, aom_reader *r);
 
 #endif  // AV1_DECODER_DECODEMV_H_
diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c
index cd82d5b53..2e91d27d3 100644
--- a/third_party/aom/av1/decoder/decoder.c
+++ b/third_party/aom/av1/decoder/decoder.c
@@ -13,9 +13,9 @@
 #include <limits.h>
 #include <stdio.h>
 
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-#include "./aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
 
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/system_state.h"
@@ -33,12 +33,8 @@
 
 #include "av1/decoder/decodeframe.h"
 #include "av1/decoder/decoder.h"
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-#include "av1/common/ncobmc_kernels.h"
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#if !CONFIG_PVQ
 #include "av1/decoder/detokenize.h"
-#endif
+#include "av1/decoder/obu.h"
 
 static void initialize_dec(void) {
   static volatile int init_done = 0;
@@ -53,23 +49,24 @@ static void initialize_dec(void) {
   }
 }
 
-static void av1_dec_setup_mi(AV1_COMMON *cm) {
-  cm->mi = cm->mip + cm->mi_stride + 1;
-  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+static void dec_setup_mi(AV1_COMMON *cm) {
+  cm->mi = cm->mip;
+  cm->mi_grid_visible = cm->mi_grid_base;
   memset(cm->mi_grid_base, 0,
-         cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+         cm->mi_stride * cm->mi_rows * sizeof(*cm->mi_grid_base));
 }
 
 static int av1_dec_alloc_mi(AV1_COMMON *cm, int mi_size) {
   cm->mip = aom_calloc(mi_size, sizeof(*cm->mip));
   if (!cm->mip) return 1;
   cm->mi_alloc_size = mi_size;
-  cm->mi_grid_base = (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *));
+  cm->mi_grid_base =
+      (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *));
   if (!cm->mi_grid_base) return 1;
   return 0;
 }
 
-static void av1_dec_free_mi(AV1_COMMON *cm) {
+static void dec_free_mi(AV1_COMMON *cm) {
   aom_free(cm->mip);
   cm->mip = NULL;
   aom_free(cm->mi_grid_base);
@@ -108,28 +105,20 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
   memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
 
   cm->current_video_frame = 0;
-  pbi->ready_for_new_data = 1;
+  pbi->decoding_first_frame = 1;
   pbi->common.buffer_pool = pool;
 
   cm->bit_depth = AOM_BITS_8;
   cm->dequant_bit_depth = AOM_BITS_8;
 
   cm->alloc_mi = av1_dec_alloc_mi;
-  cm->free_mi = av1_dec_free_mi;
-  cm->setup_mi = av1_dec_setup_mi;
+  cm->free_mi = dec_free_mi;
+  cm->setup_mi = dec_setup_mi;
 
   av1_loop_filter_init(cm);
 
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  get_default_ncobmc_kernels(cm);
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-
-#if CONFIG_AOM_QM
-  aom_qm_init(cm);
-#endif
-#if CONFIG_LOOP_RESTORATION
+  av1_qm_init(cm);
   av1_loop_restoration_precal();
-#endif  // CONFIG_LOOP_RESTORATION
 #if CONFIG_ACCOUNTING
   pbi->acct_enabled = 1;
   aom_accounting_init(&pbi->accounting);
@@ -142,33 +131,83 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
   return pbi;
 }
 
+void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) {
+  if (tile_mt_info != NULL) {
+#if CONFIG_MULTITHREAD
+    if (tile_mt_info->job_mutex != NULL) {
+      pthread_mutex_destroy(tile_mt_info->job_mutex);
+      aom_free(tile_mt_info->job_mutex);
+    }
+#endif
+    aom_free(tile_mt_info->job_queue);
+    // clear the structure as the source of this call may be a resize in which
+    // case this call will be followed by an _alloc() which may fail.
+    av1_zero(*tile_mt_info);
+  }
+}
+
 void av1_decoder_remove(AV1Decoder *pbi) {
   int i;
 
   if (!pbi) return;
 
+  // Free the tile list output buffer.
+  if (pbi->tile_list_output != NULL) aom_free(pbi->tile_list_output);
+  pbi->tile_list_output = NULL;
+
   aom_get_worker_interface()->end(&pbi->lf_worker);
   aom_free(pbi->lf_worker.data1);
-  aom_free(pbi->tile_data);
-  for (i = 0; i < pbi->num_tile_workers; ++i) {
+
+  if (pbi->thread_data) {
+    for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) {
+      DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+      const int use_highbd = pbi->common.use_highbitdepth ? 1 : 0;
+      av1_free_mc_tmp_buf(thread_data->td, use_highbd);
+      aom_free(thread_data->td);
+    }
+    aom_free(pbi->thread_data);
+  }
+
+  for (i = 0; i < pbi->num_workers; ++i) {
     AVxWorker *const worker = &pbi->tile_workers[i];
     aom_get_worker_interface()->end(worker);
   }
-  aom_free(pbi->tile_worker_data);
-  aom_free(pbi->tile_worker_info);
+  aom_free(pbi->tile_data);
   aom_free(pbi->tile_workers);
 
-  if (pbi->num_tile_workers > 0) {
+  if (pbi->num_workers > 0) {
     av1_loop_filter_dealloc(&pbi->lf_row_sync);
+    av1_loop_restoration_dealloc(&pbi->lr_row_sync, pbi->num_workers);
+    av1_dealloc_dec_jobs(&pbi->tile_mt_info);
   }
 
 #if CONFIG_ACCOUNTING
   aom_accounting_clear(&pbi->accounting);
 #endif
+  const int use_highbd = pbi->common.use_highbitdepth ? 1 : 0;
+  av1_free_mc_tmp_buf(&pbi->td, use_highbd);
 
   aom_free(pbi);
 }
 
+void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row,
+                       int mi_col, aom_reader *r, BLOCK_SIZE bsize,
+                       palette_visitor_fn_t visit) {
+  if (!is_inter_block(xd->mi[0])) {
+    for (int plane = 0; plane < AOMMIN(2, av1_num_planes(&pbi->common));
+         ++plane) {
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                              pd->subsampling_y)) {
+        if (xd->mi[0]->palette_mode_info.palette_size[plane])
+          visit(xd, plane, r);
+      } else {
+        assert(xd->mi[0]->palette_mode_info.palette_size[plane] == 0);
+      }
+    }
+  }
+}
+
 static int equal_dimensions(const YV12_BUFFER_CONFIG *a,
                             const YV12_BUFFER_CONFIG *b) {
   return a->y_height == b->y_height && a->y_width == b->y_width &&
@@ -178,6 +217,7 @@ static int equal_dimensions(const YV12_BUFFER_CONFIG *a,
 aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx,
                                        YV12_BUFFER_CONFIG *sd) {
   AV1_COMMON *cm = &pbi->common;
+  const int num_planes = av1_num_planes(cm);
 
   const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, idx);
   if (cfg == NULL) {
@@ -188,13 +228,25 @@ aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx,
     aom_internal_error(&cm->error, AOM_CODEC_ERROR,
                        "Incorrect buffer dimensions");
   else
-    aom_yv12_copy_frame(cfg, sd);
+    aom_yv12_copy_frame(cfg, sd, num_planes);
 
   return cm->error.error_code;
 }
 
+static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+                                       const YV12_BUFFER_CONFIG *b) {
+  return a->y_height == b->y_height && a->y_width == b->y_width &&
+         a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
+         a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
+         a->border == b->border &&
+         (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+             (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
 aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
+                                      int use_external_ref,
                                       YV12_BUFFER_CONFIG *sd) {
+  const int num_planes = av1_num_planes(cm);
   YV12_BUFFER_CONFIG *ref_buf = NULL;
 
   // Get the destination reference buffer.
@@ -205,60 +257,132 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
     return AOM_CODEC_ERROR;
   }
 
-  if (!equal_dimensions(ref_buf, sd)) {
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
-                       "Incorrect buffer dimensions");
+  if (!use_external_ref) {
+    if (!equal_dimensions(ref_buf, sd)) {
+      aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                         "Incorrect buffer dimensions");
+    } else {
+      // Overwrite the reference frame buffer.
+      aom_yv12_copy_frame(sd, ref_buf, num_planes);
+    }
   } else {
-    // Overwrite the reference frame buffer.
-    aom_yv12_copy_frame(sd, ref_buf);
+    if (!equal_dimensions_and_border(ref_buf, sd)) {
+      aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                         "Incorrect buffer dimensions");
+    } else {
+      // Overwrite the reference frame buffer pointers.
+      // Once we no longer need the external reference buffer, these pointers
+      // are restored.
+      ref_buf->store_buf_adr[0] = ref_buf->y_buffer;
+      ref_buf->store_buf_adr[1] = ref_buf->u_buffer;
+      ref_buf->store_buf_adr[2] = ref_buf->v_buffer;
+      ref_buf->y_buffer = sd->y_buffer;
+      ref_buf->u_buffer = sd->u_buffer;
+      ref_buf->v_buffer = sd->v_buffer;
+      ref_buf->use_external_refernce_buffers = 1;
+    }
   }
 
   return cm->error.error_code;
 }
 
-/* If any buffer updating is signaled it should be done here. */
-static void swap_frame_buffers(AV1Decoder *pbi) {
+aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd) {
+  const int num_planes = av1_num_planes(cm);
+
+  if (!equal_dimensions_and_border(new_frame, sd))
+    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  else
+    aom_yv12_copy_frame(new_frame, sd, num_planes);
+
+  return cm->error.error_code;
+}
+
+/* If any buffer updating is signaled it should be done here.
+   Consumes a reference to cm->new_fb_idx.
+*/
+static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
   int ref_index = 0, mask;
   AV1_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
   RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 
-  lock_buffer_pool(pool);
-  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
-    const int old_idx = cm->ref_frame_map[ref_index];
-    // Current thread releases the holding of reference frame.
-    decrease_ref_count(old_idx, frame_bufs, pool);
-
-    // Release the reference frame holding in the reference map for the decoding
-    // of the next frame.
-    if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool);
-    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
-    ++ref_index;
-  }
+  if (frame_decoded) {
+    lock_buffer_pool(pool);
 
-  // Current thread releases the holding of reference frame.
-  for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
-    const int old_idx = cm->ref_frame_map[ref_index];
-    decrease_ref_count(old_idx, frame_bufs, pool);
-    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
-  }
+    // In ext-tile decoding, the camera frame header is only decoded once. So,
+    // we don't release the references here.
+    if (!pbi->camera_frame_header_ready) {
+      for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        // Current thread releases the holding of reference frame.
+        decrease_ref_count(old_idx, frame_bufs, pool);
 
-  unlock_buffer_pool(pool);
-  pbi->hold_ref_buf = 0;
-  cm->frame_to_show = get_frame_new_buffer(cm);
+        // Release the reference frame holding in the reference map for the
+        // decoding of the next frame.
+        if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool);
+        cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+        ++ref_index;
+      }
 
-  // TODO(zoeliu): To fix the ref frame buffer update for the scenario of
-  //               cm->frame_parellel_decode == 1
-  if (!cm->frame_parallel_decode || !cm->show_frame) {
+      // Current thread releases the holding of reference frame.
+      const int check_on_show_existing_frame =
+          !cm->show_existing_frame || cm->reset_decoder_state;
+      for (; ref_index < REF_FRAMES && check_on_show_existing_frame;
+           ++ref_index) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        decrease_ref_count(old_idx, frame_bufs, pool);
+        cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+      }
+    }
+
+    YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm);
+
+    if (cm->show_existing_frame || cm->show_frame) {
+      if (pbi->output_all_layers) {
+        // Append this frame to the output queue
+        if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) {
+          // We can't store the new frame anywhere, so drop it and return an
+          // error
+          decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+          cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+        } else {
+          pbi->output_frames[pbi->num_output_frames] = cur_frame;
+          pbi->output_frame_index[pbi->num_output_frames] = cm->new_fb_idx;
+          pbi->num_output_frames++;
+        }
+      } else {
+        // Replace any existing output frame
+        assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1);
+        if (pbi->num_output_frames > 0) {
+          decrease_ref_count((int)pbi->output_frame_index[0], frame_bufs, pool);
+        }
+        pbi->output_frames[0] = cur_frame;
+        pbi->output_frame_index[0] = cm->new_fb_idx;
+        pbi->num_output_frames = 1;
+      }
+    } else {
+      decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+    }
+
+    unlock_buffer_pool(pool);
+  } else {
+    // Nothing was decoded, so just drop this frame buffer
     lock_buffer_pool(pool);
-    --frame_bufs[cm->new_fb_idx].ref_count;
+    decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
     unlock_buffer_pool(pool);
   }
 
-  // Invalidate these references until the next frame starts.
-  for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
-    cm->frame_refs[ref_index].idx = INVALID_IDX;
-    cm->frame_refs[ref_index].buf = NULL;
+  if (!pbi->camera_frame_header_ready) {
+    pbi->hold_ref_buf = 0;
+
+    // Invalidate these references until the next frame starts.
+    for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
+      cm->frame_refs[ref_index].idx = INVALID_IDX;
+      cm->frame_refs[ref_index].buf = NULL;
+    }
   }
 }
 
@@ -268,7 +392,6 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
   BufferPool *volatile const pool = cm->buffer_pool;
   RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs;
   const uint8_t *source = *psource;
-  int retcode = 0;
   cm->error.error_code = AOM_CODEC_OK;
 
   if (size == 0) {
@@ -286,18 +409,9 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
     }
   }
 
-  pbi->ready_for_new_data = 0;
-
   // Find a free buffer for the new frame, releasing the reference previously
   // held.
 
-  // Check if the previous frame was a frame without any references to it.
-  // Release frame buffer if not decoding in frame parallel mode.
-  if (!cm->frame_parallel_decode && cm->new_fb_idx >= 0 &&
-      frame_bufs[cm->new_fb_idx].ref_count == 0)
-    pool->release_fb_cb(pool->cb_priv,
-                        &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
-
   // Find a free frame buffer. Return error if can not find any.
   cm->new_fb_idx = get_free_fb(cm);
   if (cm->new_fb_idx == INVALID_IDX) return AOM_CODEC_MEM_ERROR;
@@ -305,31 +419,20 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
   // Assign a MV array to the frame buffer.
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
 
-  pbi->hold_ref_buf = 0;
-  if (cm->frame_parallel_decode) {
-    AVxWorker *const worker = pbi->frame_worker_owner;
-    av1_frameworker_lock_stats(worker);
-    frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
-    // Reset decoding progress.
-    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
-    pbi->cur_buf->row = -1;
-    pbi->cur_buf->col = -1;
-    av1_frameworker_unlock_stats(worker);
-  } else {
-    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
-  }
+  if (!pbi->camera_frame_header_ready) pbi->hold_ref_buf = 0;
+
+  pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
 
   if (setjmp(cm->error.jmp)) {
     const AVxWorkerInterface *const winterface = aom_get_worker_interface();
     int i;
 
     cm->error.setjmp = 0;
-    pbi->ready_for_new_data = 1;
 
     // Synchronize all threads immediately as a subsequent decode call may
     // cause a resize invalidating some allocations.
     winterface->sync(&pbi->lf_worker);
-    for (i = 0; i < pbi->num_tile_workers; ++i) {
+    for (i = 0; i < pbi->num_workers; ++i) {
       winterface->sync(&pbi->tile_workers[i]);
     }
 
@@ -349,7 +452,10 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
       }
 
       // Current thread releases the holding of reference frame.
-      for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+      const int check_on_show_existing_frame =
+          !cm->show_existing_frame || cm->reset_decoder_state;
+      for (; ref_index < REF_FRAMES && check_on_show_existing_frame;
+           ++ref_index) {
         const int old_idx = cm->ref_frame_map[ref_index];
         decrease_ref_count(old_idx, frame_bufs, pool);
       }
@@ -365,160 +471,72 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
 
   cm->error.setjmp = 1;
 
-#if !CONFIG_OBU
-  av1_decode_frame_headers_and_setup(pbi, source, source + size, psource);
-  if (!cm->show_existing_frame) {
-    av1_decode_tg_tiles_and_wrapup(pbi, source, source + size, psource, 0,
-                                   cm->tile_rows * cm->tile_cols - 1, 1);
-  }
-#else
-  av1_decode_frame_from_obus(pbi, source, source + size, psource);
+  int frame_decoded =
+      aom_decode_frame_from_obus(pbi, source, source + size, psource);
+
+  if (cm->error.error_code != AOM_CODEC_OK) return 1;
+
+#if TXCOEFF_TIMER
+  cm->cum_txcoeff_timer += cm->txcoeff_timer;
+  fprintf(stderr,
+          "txb coeff block number: %d, frame time: %ld, cum time %ld in us\n",
+          cm->txb_count, cm->txcoeff_timer, cm->cum_txcoeff_timer);
+  cm->txcoeff_timer = 0;
+  cm->txb_count = 0;
 #endif
 
-  swap_frame_buffers(pbi);
+  // Note: At this point, this function holds a reference to cm->new_fb_idx
+  // in the buffer pool. This reference is consumed by swap_frame_buffers().
+  swap_frame_buffers(pbi, frame_decoded);
+
+  if (frame_decoded) {
+    pbi->decoding_first_frame = 0;
+  }
 
-#if CONFIG_EXT_TILE
-  // For now, we only extend the frame borders when the whole frame is decoded.
-  // Later, if needed, extend the border for the decoded tile on the frame
-  // border.
-  if (pbi->dec_tile_row == -1 && pbi->dec_tile_col == -1)
-#endif  // CONFIG_EXT_TILE
-    // TODO(debargha): Fix encoder side mv range, so that we can use the
-    // inner border extension. As of now use the larger extension.
-    // aom_extend_frame_inner_borders(cm->frame_to_show);
-    aom_extend_frame_borders(cm->frame_to_show);
+  if (cm->error.error_code != AOM_CODEC_OK) return 1;
 
   aom_clear_system_state();
 
   if (!cm->show_existing_frame) {
     cm->last_show_frame = cm->show_frame;
 
-#if CONFIG_EXT_REFS
-    // NOTE: It is not supposed to ref to any frame not used as reference
-    if (cm->is_reference_frame)
-#endif  // CONFIG_EXT_REFS
-      cm->prev_frame = cm->cur_frame;
-
-    if (cm->seg.enabled && !cm->frame_parallel_decode)
-      av1_swap_current_and_last_seg_map(cm);
-  }
-
-  // Update progress in frame parallel decode.
-  if (cm->frame_parallel_decode) {
-    // Need to lock the mutex here as another thread may
-    // be accessing this buffer.
-    AVxWorker *const worker = pbi->frame_worker_owner;
-    FrameWorkerData *const frame_worker_data = worker->data1;
-    av1_frameworker_lock_stats(worker);
-
-    if (cm->show_frame) {
-      cm->current_video_frame++;
-    }
-    frame_worker_data->frame_decoded = 1;
-    frame_worker_data->frame_context_ready = 1;
-    av1_frameworker_signal_stats(worker);
-    av1_frameworker_unlock_stats(worker);
-  } else {
-    cm->last_width = cm->width;
-    cm->last_height = cm->height;
-    cm->last_tile_cols = cm->tile_cols;
-    cm->last_tile_rows = cm->tile_rows;
-    if (cm->show_frame) {
-      cm->current_video_frame++;
+    if (cm->seg.enabled) {
+      if (cm->prev_frame && (cm->mi_rows == cm->prev_frame->mi_rows) &&
+          (cm->mi_cols == cm->prev_frame->mi_cols)) {
+        cm->last_frame_seg_map = cm->prev_frame->seg_map;
+      } else {
+        cm->last_frame_seg_map = NULL;
+      }
     }
   }
 
+  // Update progress in frame parallel decode.
+  cm->last_width = cm->width;
+  cm->last_height = cm->height;
+  cm->last_tile_cols = cm->tile_cols;
+  cm->last_tile_rows = cm->tile_rows;
   cm->error.setjmp = 0;
-  return retcode;
-}
-
-int av1_get_raw_frame(AV1Decoder *pbi, YV12_BUFFER_CONFIG *sd) {
-  AV1_COMMON *const cm = &pbi->common;
-  int ret = -1;
-  if (pbi->ready_for_new_data == 1) return ret;
 
-  pbi->ready_for_new_data = 1;
+  return 0;
+}
 
-  /* no raw frame to show!!! */
-  if (!cm->show_frame) return ret;
+// Get the frame at a particular index in the output queue
+int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
+                      aom_film_grain_t **grain_params) {
+  RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs;
 
-  *sd = *cm->frame_to_show;
-  ret = 0;
+  if (index >= pbi->num_output_frames) return -1;
+  *sd = pbi->output_frames[index];
+  *grain_params = &frame_bufs[pbi->output_frame_index[index]].film_grain_params;
   aom_clear_system_state();
-  return ret;
+  return 0;
 }
 
+// Get the highest-spatial-layer output
+// TODO(david.barker): What should this do?
 int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) {
-  AV1_COMMON *const cm = &pbi->common;
-
-  if (!cm->show_frame || !cm->frame_to_show) return -1;
+  if (pbi->num_output_frames == 0) return -1;
 
-  *frame = *cm->frame_to_show;
+  *frame = *pbi->output_frames[pbi->num_output_frames - 1];
   return 0;
 }
-
-aom_codec_err_t av1_parse_superframe_index(const uint8_t *data, size_t data_sz,
-                                           uint32_t sizes[8], int *count,
-                                           int *index_size,
-                                           aom_decrypt_cb decrypt_cb,
-                                           void *decrypt_state) {
-  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
-  // it is a super frame index. If the last byte of real video compression
-  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
-  // not the associated matching marker byte at the front of the index we have
-  // an invalid bitstream and need to return an error.
-
-  uint8_t marker;
-  size_t frame_sz_sum = 0;
-
-  assert(data_sz);
-  marker = read_marker(decrypt_cb, decrypt_state, data);
-  *count = 0;
-
-  if ((marker & 0xe0) == 0xc0) {
-    const uint32_t frames = (marker & 0x7) + 1;
-    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-    const size_t index_sz = 2 + mag * (frames - 1);
-    *index_size = (int)index_sz;
-
-    // This chunk is marked as having a superframe index but doesn't have
-    // enough data for it, thus it's an invalid superframe index.
-    if (data_sz < index_sz) return AOM_CODEC_CORRUPT_FRAME;
-
-    {
-      const uint8_t marker2 =
-          read_marker(decrypt_cb, decrypt_state, data + index_sz - 1);
-
-      // This chunk is marked as having a superframe index but doesn't have
-      // the matching marker byte at the front of the index therefore it's an
-      // invalid chunk.
-      if (marker != marker2) return AOM_CODEC_CORRUPT_FRAME;
-    }
-
-    {
-      // Found a valid superframe index.
-      uint32_t i, j;
-      const uint8_t *x = &data[1];
-
-      // Frames has a maximum of 8 and mag has a maximum of 4.
-      uint8_t clear_buffer[28];
-      assert(sizeof(clear_buffer) >= (frames - 1) * mag);
-      if (decrypt_cb) {
-        decrypt_cb(decrypt_state, x, clear_buffer, (frames - 1) * mag);
-        x = clear_buffer;
-      }
-
-      for (i = 0; i < frames - 1; ++i) {
-        uint32_t this_sz = 0;
-
-        for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8);
-        this_sz += 1;
-        sizes[i] = this_sz;
-        frame_sz_sum += this_sz;
-      }
-      sizes[i] = (uint32_t)(data_sz - index_sz - frame_sz_sum);
-      *count = frames;
-    }
-  }
-  return AOM_CODEC_OK;
-}
diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h
index 20129b669..42fcc1256 100644
--- a/third_party/aom/av1/decoder/decoder.h
+++ b/third_party/aom/av1/decoder/decoder.h
@@ -12,7 +12,7 @@
 #ifndef AV1_DECODER_DECODER_H_
 #define AV1_DECODER_DECODER_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "aom/aom_codec.h"
 #include "aom_dsp/bitreader.h"
@@ -29,73 +29,61 @@
 #include "av1/decoder/inspection.h"
 #endif
 
-#if CONFIG_PVQ
-#include "aom_dsp/entdec.h"
-#include "av1/decoder/decint.h"
-#include "av1/encoder/encodemb.h"
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-// TODO(hkuang): combine this with TileWorkerData.
-typedef struct TileData {
-  AV1_COMMON *cm;
-  aom_reader bit_reader;
-  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+typedef struct ThreadData {
+  aom_reader *bit_reader;
+  DECLARE_ALIGNED(32, MACROBLOCKD, xd);
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
-#if CONFIG_PVQ
-  /* forward transformed predicted image, a reference for PVQ */
-  DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
-#endif
-#if CONFIG_CFL
-  CFL_CTX cfl;
-#endif
-  DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
-  DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
-#if CONFIG_MRC_TX
-  DECLARE_ALIGNED(16, uint8_t, mrc_mask[MAX_SB_SQUARE]);
-#endif  // CONFIG_MRC_TX
-} TileData;
-
-typedef struct TileWorkerData {
-  struct AV1Decoder *pbi;
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
+  CB_BUFFER cb_buffer_base;
+  uint8_t *mc_buf[2];
+  int32_t mc_buf_size;
+} ThreadData;
+
+typedef struct TileDataDec {
+  TileInfo tile_info;
   aom_reader bit_reader;
-  FRAME_COUNTS counts;
-  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
-  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
-#if CONFIG_PVQ
-  /* forward transformed predicted image, a reference for PVQ */
-  DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
-#endif
-#if CONFIG_CFL
-  CFL_CTX cfl;
-#endif
-  FRAME_CONTEXT tctx;
-  DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
-#if CONFIG_MRC_TX
-  DECLARE_ALIGNED(16, uint8_t, mrc_mask[MAX_SB_SQUARE]);
-#endif  // CONFIG_MRC_TX
-  struct aom_internal_error_info error_info;
-} TileWorkerData;
+  DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+} TileDataDec;
 
 typedef struct TileBufferDec {
   const uint8_t *data;
   size_t size;
-  const uint8_t *raw_data_end;  // The end of the raw tile buffer in the
-                                // bit stream.
-  int col;                      // only used with multi-threaded decoding
 } TileBufferDec;
 
-typedef struct AV1Decoder {
-  DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+typedef struct DataBuffer {
+  const uint8_t *data;
+  size_t size;
+} DataBuffer;
+
+typedef struct EXTERNAL_REFERENCES {
+  YV12_BUFFER_CONFIG refs[MAX_EXTERNAL_REFERENCES];
+  int num;
+} EXTERNAL_REFERENCES;
+
+typedef struct TileJobsDec {
+  TileBufferDec *tile_buffer;
+  TileDataDec *tile_data;
+} TileJobsDec;
 
-  DECLARE_ALIGNED(16, AV1_COMMON, common);
+typedef struct AV1DecTileMTData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *job_mutex;
+#endif
+  TileJobsDec *job_queue;
+  int jobs_enqueued;
+  int jobs_dequeued;
+  int alloc_tile_rows;
+  int alloc_tile_cols;
+} AV1DecTileMT;
+
+typedef struct AV1Decoder {
+  DECLARE_ALIGNED(32, MACROBLOCKD, mb);
 
-  int ready_for_new_data;
+  DECLARE_ALIGNED(32, AV1_COMMON, common);
 
   int refresh_frame_flags;
 
@@ -105,20 +93,38 @@ typedef struct AV1Decoder {
 
   AVxWorker *frame_worker_owner;  // frame_worker that owns this pbi.
   AVxWorker lf_worker;
+  AV1LfSync lf_row_sync;
+  AV1LrSync lr_row_sync;
+  AV1LrStruct lr_ctxt;
   AVxWorker *tile_workers;
-  TileWorkerData *tile_worker_data;
-  TileInfo *tile_worker_info;
-  int num_tile_workers;
-
-  TileData *tile_data;
+  int num_workers;
+  DecWorkerData *thread_data;
+  ThreadData td;
+  TileDataDec *tile_data;
   int allocated_tiles;
 
   TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
-
-  AV1LfSync lf_row_sync;
-
-  aom_decrypt_cb decrypt_cb;
-  void *decrypt_state;
+  AV1DecTileMT tile_mt_info;
+
+  // Each time the decoder is called, we expect to receive a full temporal unit.
+  // This can contain up to one shown frame per spatial layer in the current
+  // operating point (note that some layers may be entirely omitted).
+  // If the 'output_all_layers' option is true, we save all of these shown
+  // frames so that they can be returned to the application. If the
+  // 'output_all_layers' option is false, then we only output one image per
+  // temporal unit.
+  //
+  // Note: The saved buffers are released at the start of the next time the
+  // application calls aom_codec_decode().
+  int output_all_layers;
+  YV12_BUFFER_CONFIG *output_frames[MAX_NUM_SPATIAL_LAYERS];
+  size_t output_frame_index[MAX_NUM_SPATIAL_LAYERS];  // Buffer pool indices
+  size_t num_output_frames;  // How many frames are queued up so far?
+
+  // In order to properly support random-access decoding, we need
+  // to behave slightly differently for the very first frame we decode.
+  // So we track whether this is the first frame or not.
+  int decoding_first_frame;
 
   int allow_lowbitdepth;
   int max_threads;
@@ -127,29 +133,47 @@ typedef struct AV1Decoder {
   int hold_ref_buf;  // hold the reference buffer.
 
   int tile_size_bytes;
-#if CONFIG_EXT_TILE
   int tile_col_size_bytes;
   int dec_tile_row, dec_tile_col;  // always -1 for non-VR tile encoding
-#endif                             // CONFIG_EXT_TILE
 #if CONFIG_ACCOUNTING
   int acct_enabled;
   Accounting accounting;
 #endif
-  size_t uncomp_hdr_size;       // Size of the uncompressed header
-  size_t first_partition_size;  // Size of the compressed header
-  int tg_size;                  // Number of tiles in the current tilegroup
-  int tg_start;                 // First tile in the current tilegroup
+  size_t uncomp_hdr_size;  // Size of the uncompressed header
+  int tg_size;             // Number of tiles in the current tilegroup
+  int tg_start;            // First tile in the current tilegroup
   int tg_size_bit_offset;
+  int sequence_header_ready;
 #if CONFIG_INSPECTION
   aom_inspect_cb inspect_cb;
   void *inspect_ctx;
 #endif
+  int operating_point;
+  int current_operating_point;
+  int seen_frame_header;
+
+  // State if the camera frame header is already decoded while
+  // large_scale_tile = 1.
+  int camera_frame_header_ready;
+  size_t frame_header_size;
+  DataBuffer obu_size_hdr;
+  int output_frame_width_in_tiles_minus_1;
+  int output_frame_height_in_tiles_minus_1;
+  int tile_count_minus_1;
+  uint32_t coded_tile_data_size;
+  unsigned int ext_tile_debug;  // for ext-tile software debug & testing
+  EXTERNAL_REFERENCES ext_refs;
+  size_t tile_list_size;
+  uint8_t *tile_list_output;
+  size_t buffer_sz;
 } AV1Decoder;
 
 int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size,
                                 const uint8_t **dest);
 
-int av1_get_raw_frame(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *sd);
+// Get the frame at a particular index in the output queue
+int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
+                      aom_film_grain_t **grain_params);
 
 int av1_get_frame_to_show(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame);
 
@@ -157,29 +181,16 @@ aom_codec_err_t av1_copy_reference_dec(struct AV1Decoder *pbi, int idx,
                                        YV12_BUFFER_CONFIG *sd);
 
 aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
+                                      int use_external_ref,
                                       YV12_BUFFER_CONFIG *sd);
-
-static INLINE uint8_t read_marker(aom_decrypt_cb decrypt_cb,
-                                  void *decrypt_state, const uint8_t *data) {
-  if (decrypt_cb) {
-    uint8_t marker;
-    decrypt_cb(decrypt_state, data, &marker, 1);
-    return marker;
-  }
-  return *data;
-}
-
-// This function is exposed for use in tests, as well as the inlined function
-// "read_marker".
-aom_codec_err_t av1_parse_superframe_index(const uint8_t *data, size_t data_sz,
-                                           uint32_t sizes[8], int *count,
-                                           int *index_size,
-                                           aom_decrypt_cb decrypt_cb,
-                                           void *decrypt_state);
+aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd);
 
 struct AV1Decoder *av1_decoder_create(BufferPool *const pool);
 
 void av1_decoder_remove(struct AV1Decoder *pbi);
+void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_jobs_sync);
 
 static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
                                       BufferPool *const pool) {
@@ -196,7 +207,6 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
   }
 }
 
-#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
 static INLINE int dec_is_ref_frame_buf(AV1Decoder *const pbi,
                                        RefCntBuffer *frame_buf) {
   AV1_COMMON *const cm = &pbi->common;
@@ -208,7 +218,6 @@ static INLINE int dec_is_ref_frame_buf(AV1Decoder *const pbi,
   }
   return (i < INTER_REFS_PER_FRAME);
 }
-#endif  // CONFIG_EXT_REFS
 
 #define ACCT_STR __func__
 static INLINE int av1_read_uniform(aom_reader *r, int n) {
@@ -222,6 +231,13 @@ static INLINE int av1_read_uniform(aom_reader *r, int n) {
     return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR);
 }
 
+typedef void (*palette_visitor_fn_t)(MACROBLOCKD *const xd, int plane,
+                                     aom_reader *r);
+
+void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row,
+                       int mi_col, aom_reader *r, BLOCK_SIZE bsize,
+                       palette_visitor_fn_t visit);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/decoder/decodetxb.c b/third_party/aom/av1/decoder/decodetxb.c
index 13f944b35..f9a3e8578 100644
--- a/third_party/aom/av1/decoder/decodetxb.c
+++ b/third_party/aom/av1/decoder/decodetxb.c
@@ -9,28 +9,25 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "av1/common/scan.h"
+#include "av1/decoder/decodetxb.h"
+
+#include "aom_ports/mem.h"
 #include "av1/common/idct.h"
+#include "av1/common/scan.h"
 #include "av1/common/txb_common.h"
 #include "av1/decoder/decodemv.h"
-#include "av1/decoder/decodetxb.h"
-#include "av1/decoder/dsubexp.h"
-#include "av1/decoder/symbolrate.h"
 
 #define ACCT_STR __func__
 
-static int read_golomb(MACROBLOCKD *xd, aom_reader *r, FRAME_COUNTS *counts) {
-#if !CONFIG_SYMBOLRATE
-  (void)counts;
-#endif
+static int read_golomb(MACROBLOCKD *xd, aom_reader *r) {
   int x = 1;
   int length = 0;
   int i = 0;
 
   while (!i) {
-    i = av1_read_record_bit(counts, r, ACCT_STR);
+    i = aom_read_bit(r, ACCT_STR);
     ++length;
-    if (length >= 32) {
+    if (length > 20) {
       aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
                          "Invalid length in read_golomb");
       break;
@@ -39,570 +36,306 @@ static int read_golomb(MACROBLOCKD *xd, aom_reader *r, FRAME_COUNTS *counts) {
 
   for (i = 0; i < length - 1; ++i) {
     x <<= 1;
-    x += av1_read_record_bit(counts, r, ACCT_STR);
+    x += aom_read_bit(r, ACCT_STR);
   }
 
   return x - 1;
 }
 
-static INLINE int read_nz_map(aom_reader *r, tran_low_t *tcoeffs, int plane,
-                              const int16_t *scan, TX_SIZE tx_size,
-                              TX_TYPE tx_type, FRAME_CONTEXT *fc,
-                              FRAME_COUNTS *counts) {
-  TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int height = tx_size_high[tx_size];
-#if CONFIG_CTX1D
-  const int width = tx_size_wide[tx_size];
-  const int eob_offset = width + height;
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int seg_eob =
-      (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset;
-#else
-  const int seg_eob = tx_size_2d[tx_size];
-#endif
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] =
-      (counts) ? &counts->nz_map[txs_ctx][plane_type] : NULL;
-#if !LV_MAP_PROB
-  aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type];
-  aom_prob *eob_flag = fc->eob_flag[txs_ctx][plane_type];
-#endif
-  int c;
-  for (c = 0; c < seg_eob; ++c) {
-    int is_nz;
-    int coeff_ctx = get_nz_map_ctx(tcoeffs, c, scan, bwl, height, tx_type);
-    int eob_ctx = get_eob_ctx(tcoeffs, scan[c], txs_ctx, tx_type);
-
-    if (c < seg_eob - 1) {
-#if LV_MAP_PROB
-      is_nz = av1_read_record_bin(
-          counts, r, fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2,
-          ACCT_STR);
-#else
-      is_nz = aom_read(r, nz_map[coeff_ctx], ACCT_STR);
-#endif
-    } else {
-      is_nz = 1;
-    }
-
-    // set non-zero coefficient map.
-    tcoeffs[scan[c]] = is_nz;
-
-    if (c == seg_eob - 1) {
-      ++c;
-      break;
-    }
-
-    if (counts) ++(*nz_map_count)[coeff_ctx][is_nz];
-
-    if (is_nz) {
-#if LV_MAP_PROB
-      int is_eob = av1_read_record_bin(
-          counts, r, fc->eob_flag_cdf[txs_ctx][plane_type][eob_ctx], 2,
-          ACCT_STR);
-#else
-      int is_eob = aom_read(r, eob_flag[eob_ctx], ACCT_STR);
-#endif
-      if (counts) ++counts->eob_flag[txs_ctx][plane_type][eob_ctx][is_eob];
-      if (is_eob) break;
-    }
+static INLINE int rec_eob_pos(const int eob_token, const int extra) {
+  int eob = k_eob_group_start[eob_token];
+  if (eob > 2) {
+    eob += extra;
   }
-  return AOMMIN(seg_eob, c + 1);
+  return eob;
 }
 
-#if CONFIG_CTX1D
-static INLINE int read_nz_map_vert(aom_reader *r, tran_low_t *tcoeffs,
-                                   int plane, const int16_t *scan,
-                                   const int16_t *iscan, TX_SIZE tx_size,
-                                   TX_TYPE tx_type, FRAME_CONTEXT *fc,
-                                   FRAME_COUNTS *counts) {
-  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  int16_t eob_ls[MAX_HVTX_SIZE];
-  int eob = 0;
-#if !LV_MAP_PROB
-  aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type];
-#endif
-  for (int col = 0; col < width; ++col) {
-    int el_ctx = get_empty_line_ctx(col, eob_ls);
-#if LV_MAP_PROB
-    int empty_line = av1_read_record_bin(
-        counts, r, fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2,
-        ACCT_STR);
-#else
-    int empty_line = aom_read(
-        r, fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx], ACCT_STR);
-#endif
-    if (counts)
-      ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][empty_line];
-    if (!empty_line) {
-      int row;
-      for (row = 0; row < height; ++row) {
-        if (row + 1 != height) {
-          int coeff_idx = row * width + col;
-          int scan_idx = iscan[coeff_idx];
-          int coeff_ctx =
-              get_nz_map_ctx(tcoeffs, scan_idx, scan, bwl, height, tx_type);
-#if LV_MAP_PROB
-          int is_nz = av1_read_record_bin(
-              counts, r, fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2,
-              ACCT_STR);
-#else
-          int is_nz = aom_read(r, nz_map[coeff_ctx], ACCT_STR);
-#endif
-          if (counts) ++counts->nz_map[txs_ctx][plane_type][coeff_ctx][is_nz];
-          tcoeffs[coeff_idx] = is_nz;
-          if (is_nz) {
-            eob = AOMMAX(eob, iscan[coeff_idx] + 1);
-            if (row + 1 != height) {
-              int eob_ctx = get_hv_eob_ctx(col, row, eob_ls);
-#if LV_MAP_PROB
-              int is_eob = av1_read_record_bin(
-                  counts, r,
-                  fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2,
-                  ACCT_STR);
-#else
-              int is_eob = aom_read(
-                  r, fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx],
-                  ACCT_STR);
-#endif
-              if (counts)
-                ++counts
-                      ->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx][is_eob];
-              if (is_eob) break;
-            }
-          }
-        } else {
-          int coeff_idx = row * width + col;
-          tcoeffs[coeff_idx] = 1;
-          eob = AOMMAX(eob, iscan[coeff_idx] + 1);
-        }
+static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
+                          const qm_val_t *iqmatrix) {
+  int dqv = dequant[!!coeff_idx];
+  if (iqmatrix != NULL)
+    dqv =
+        ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+  return dqv;
+}
+
+static INLINE void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size,
+                                          int start_si, int end_si,
+                                          const int16_t *scan, int bwl,
+                                          uint8_t *levels,
+                                          base_cdf_arr base_cdf,
+                                          br_cdf_arr br_cdf) {
+  for (int c = end_si; c >= start_si; --c) {
+    const int pos = scan[c];
+    const int coeff_ctx = get_lower_levels_ctx_2d(levels, pos, bwl, tx_size);
+    const int nsymbs = 4;
+    int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR);
+    if (level > NUM_BASE_LEVELS) {
+      const int br_ctx = get_br_ctx_2d(levels, pos, bwl);
+      aom_cdf_prob *cdf = br_cdf[br_ctx];
+      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+        const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
+        level += k;
+        if (k < BR_CDF_SIZE - 1) break;
       }
-      eob_ls[col] = AOMMIN(height, row + 1);
-    } else {
-      eob_ls[col] = 0;
     }
+    levels[get_padded_idx(pos, bwl)] = level;
   }
-  return eob;
 }
 
-static INLINE int read_nz_map_horiz(aom_reader *r, tran_low_t *tcoeffs,
-                                    int plane, const int16_t *scan,
-                                    const int16_t *iscan, TX_SIZE tx_size,
-                                    TX_TYPE tx_type, FRAME_CONTEXT *fc,
-                                    FRAME_COUNTS *counts) {
-  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  int16_t eob_ls[MAX_HVTX_SIZE];
-  int eob = 0;
-#if !LV_MAP_PROB
-  aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type];
-#endif
-  for (int row = 0; row < height; ++row) {
-    int el_ctx = get_empty_line_ctx(row, eob_ls);
-#if LV_MAP_PROB
-    int empty_line = av1_read_record_bin(
-        counts, r, fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2,
-        ACCT_STR);
-#else
-    int empty_line = aom_read(
-        r, fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx], ACCT_STR);
-#endif
-    if (counts)
-      ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][empty_line];
-    if (!empty_line) {
-      int col;
-      for (col = 0; col < width; ++col) {
-        if (col + 1 != width) {
-          int coeff_idx = row * width + col;
-          int scan_idx = iscan[coeff_idx];
-          int coeff_ctx =
-              get_nz_map_ctx(tcoeffs, scan_idx, scan, bwl, height, tx_type);
-#if LV_MAP_PROB
-          int is_nz = av1_read_record_bin(
-              counts, r, fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2,
-              ACCT_STR);
-#else
-          int is_nz = aom_read(r, nz_map[coeff_ctx], ACCT_STR);
-#endif
-          if (counts) ++counts->nz_map[txs_ctx][plane_type][coeff_ctx][is_nz];
-          tcoeffs[coeff_idx] = is_nz;
-          if (is_nz) {
-            eob = AOMMAX(eob, iscan[coeff_idx] + 1);
-            int eob_ctx = get_hv_eob_ctx(row, col, eob_ls);
-#if LV_MAP_PROB
-            int is_eob = av1_read_record_bin(
-                counts, r,
-                fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2,
-                ACCT_STR);
-#else
-            int is_eob =
-                aom_read(r, fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx],
-                         ACCT_STR);
-#endif
-            if (counts)
-              ++counts->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx][is_eob];
-            if (is_eob) break;
-          }
-        } else {
-          int coeff_idx = row * width + col;
-          tcoeffs[coeff_idx] = 1;
-          eob = AOMMAX(eob, iscan[coeff_idx] + 1);
-        }
+static INLINE void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size,
+                                       TX_CLASS tx_class, int start_si,
+                                       int end_si, const int16_t *scan, int bwl,
+                                       uint8_t *levels, base_cdf_arr base_cdf,
+                                       br_cdf_arr br_cdf) {
+  for (int c = end_si; c >= start_si; --c) {
+    const int pos = scan[c];
+    const int coeff_ctx =
+        get_lower_levels_ctx(levels, pos, bwl, tx_size, tx_class);
+    const int nsymbs = 4;
+    int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR);
+    if (level > NUM_BASE_LEVELS) {
+      const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+      aom_cdf_prob *cdf = br_cdf[br_ctx];
+      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+        const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
+        level += k;
+        if (k < BR_CDF_SIZE - 1) break;
       }
-      eob_ls[row] = AOMMIN(width, col + 1);
-    } else {
-      eob_ls[row] = 0;
     }
+    levels[get_padded_idx(pos, bwl)] = level;
   }
-  return eob;
 }
-#endif
 
-uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
-                            aom_reader *r, int blk_row, int blk_col, int block,
-                            int plane, tran_low_t *tcoeffs, TXB_CTX *txb_ctx,
-                            TX_SIZE tx_size, int16_t *max_scan_line, int *eob) {
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  FRAME_COUNTS *counts = xd->counts;
-  TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  PLANE_TYPE plane_type = get_plane_type(plane);
-#if !LV_MAP_PROB
-  aom_prob *nz_map = ec_ctx->nz_map[txs_ctx][plane_type];
-  aom_prob *eob_flag = ec_ctx->eob_flag[txs_ctx][plane_type];
-#endif
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const int seg_eob = tx_size_2d[tx_size];
-  int c = 0;
-  int update_eob = -1;
-  const int16_t *const dequant = xd->plane[plane].seg_dequant[mbmi->segment_id];
+uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                            aom_reader *const r, const int blk_row,
+                            const int blk_col, const int plane,
+                            const TXB_CTX *const txb_ctx,
+                            const TX_SIZE tx_size) {
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+  const int32_t max_value = (1 << (7 + xd->bd)) - 1;
+  const int32_t min_value = -(1 << (7 + xd->bd));
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id];
+  tran_low_t *const tcoeffs = pd->dqcoeff_block + xd->cb_offset[plane];
   const int shift = av1_get_tx_scale(tx_size);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int height = tx_size_high[tx_size];
+  const int bwl = get_txb_bwl(tx_size);
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
   int cul_level = 0;
-  memset(tcoeffs, 0, sizeof(*tcoeffs) * seg_eob);
-
-#if LV_MAP_PROB
-  int all_zero = av1_read_record_bin(
-      counts, r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2,
-      ACCT_STR);
-#else
-  int all_zero =
-      aom_read(r, ec_ctx->txb_skip[txs_ctx][txb_ctx->txb_skip_ctx], ACCT_STR);
-#endif
-  if (xd->counts)
-    ++xd->counts->txb_skip[txs_ctx][txb_ctx->txb_skip_ctx][all_zero];
-
+  int dc_val = 0;
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, width);
+  const int all_zero = aom_read_symbol(
+      r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR);
+  eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+  uint16_t *const eob = &(eob_data->eob);
+  uint16_t *const max_scan_line = &(eob_data->max_scan_line);
+  *max_scan_line = 0;
   *eob = 0;
   if (all_zero) {
     *max_scan_line = 0;
-#if CONFIG_TXK_SEL
-    if (plane == 0) mbmi->txk_type[(blk_row << 4) + blk_col] = DCT_DCT;
-#endif
+    if (plane == 0) {
+      const int txk_type_idx =
+          av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+      mbmi->txk_type[txk_type_idx] = DCT_DCT;
+    }
     return 0;
   }
 
-  (void)blk_row;
-  (void)blk_col;
-#if CONFIG_TXK_SEL
-  av1_read_tx_type(cm, xd, blk_row, blk_col, block, plane,
-                   get_min_tx_size(tx_size), r);
-#endif
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-  const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-  const int16_t *scan = scan_order->scan;
-
-#if CONFIG_CTX1D
-  const int16_t *iscan = scan_order->iscan;
-  TX_CLASS tx_class = get_tx_class(tx_type);
-  if (tx_class == TX_CLASS_2D) {
-    *eob =
-        read_nz_map(r, tcoeffs, plane, scan, tx_size, tx_type, ec_ctx, counts);
-  } else {
-#if LV_MAP_PROB
-    const int eob_mode = av1_read_record_bin(
-        counts, r, ec_ctx->eob_mode_cdf[txs_ctx][plane_type][tx_class], 2,
-        ACCT_STR);
-#else
-    const int eob_mode =
-        aom_read(r, ec_ctx->eob_mode[txs_ctx][plane_type][tx_class], ACCT_STR);
-#endif
-    if (counts) ++counts->eob_mode[txs_ctx][plane_type][tx_class][eob_mode];
-    if (eob_mode == 0) {
-      *eob = read_nz_map(r, tcoeffs, plane, scan, tx_size, tx_type, ec_ctx,
-                         counts);
-    } else {
-      assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ);
-      if (tx_class == TX_CLASS_VERT)
-        *eob = read_nz_map_vert(r, tcoeffs, plane, scan, iscan, tx_size,
-                                tx_type, ec_ctx, counts);
-      else
-        *eob = read_nz_map_horiz(r, tcoeffs, plane, scan, iscan, tx_size,
-                                 tx_type, ec_ctx, counts);
-    }
+  memset(levels_buf, 0,
+         sizeof(*levels_buf) *
+             ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END));
+  if (plane == AOM_PLANE_Y) {
+    // only y plane's tx_type is transmitted
+    av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r);
+  }
+  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+                                          tx_size, cm->reduced_tx_set_used);
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+  const qm_val_t *iqmatrix =
+      IS_2D_TRANSFORM(tx_type)
+          ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size]
+          : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  const int16_t *const scan = scan_order->scan;
+  int eob_extra = 0;
+  int eob_pt = 1;
+
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+  switch (eob_multi_size) {
+    case 0:
+      eob_pt =
+          aom_read_symbol(r, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx],
+                          5, ACCT_STR) +
+          1;
+      break;
+    case 1:
+      eob_pt =
+          aom_read_symbol(r, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx],
+                          6, ACCT_STR) +
+          1;
+      break;
+    case 2:
+      eob_pt =
+          aom_read_symbol(r, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx],
+                          7, ACCT_STR) +
+          1;
+      break;
+    case 3:
+      eob_pt =
+          aom_read_symbol(r, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx],
+                          8, ACCT_STR) +
+          1;
+      break;
+    case 4:
+      eob_pt =
+          aom_read_symbol(r, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx],
+                          9, ACCT_STR) +
+          1;
+      break;
+    case 5:
+      eob_pt =
+          aom_read_symbol(r, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx],
+                          10, ACCT_STR) +
+          1;
+      break;
+    case 6:
+    default:
+      eob_pt = aom_read_symbol(
+                   r, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11,
+                   ACCT_STR) +
+               1;
+      break;
   }
-#else
-  *eob = read_nz_map(r, tcoeffs, plane, scan, tx_size, tx_type, ec_ctx, counts);
-#endif
-  *max_scan_line = *eob;
-
-  int i;
-  for (i = 0; i < NUM_BASE_LEVELS; ++i) {
-#if !LV_MAP_PROB
-    aom_prob *coeff_base = ec_ctx->coeff_base[txs_ctx][plane_type][i];
-#endif
-    update_eob = 0;
-    for (c = *eob - 1; c >= 0; --c) {
-      tran_low_t *v = &tcoeffs[scan[c]];
-      int sign;
-      int ctx;
-
-      if (*v <= i) continue;
-
-      ctx = get_base_ctx(tcoeffs, scan[c], bwl, height, i + 1);
-
-#if LV_MAP_PROB
-      if (av1_read_record_bin(
-              counts, r, ec_ctx->coeff_base_cdf[txs_ctx][plane_type][i][ctx], 2,
-              ACCT_STR))
-#else
-      if (aom_read(r, coeff_base[ctx], ACCT_STR))
-#endif
-      {
-        *v = i + 1;
-        cul_level += i + 1;
 
-        if (counts) ++counts->coeff_base[txs_ctx][plane_type][i][ctx][1];
+  if (k_eob_offset_bits[eob_pt] > 0) {
+    const int eob_ctx = eob_pt - 3;
+    int bit = aom_read_symbol(
+        r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR);
+    if (bit) {
+      eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1));
+    }
 
-        if (c == 0) {
-          int dc_sign_ctx = txb_ctx->dc_sign_ctx;
-#if LV_MAP_PROB
-          sign = av1_read_record_bin(
-              counts, r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], 2,
-              ACCT_STR);
-#else
-          sign =
-              aom_read(r, ec_ctx->dc_sign[plane_type][dc_sign_ctx], ACCT_STR);
-#endif
-          if (counts) ++counts->dc_sign[plane_type][dc_sign_ctx][sign];
-        } else {
-          sign = av1_read_record_bit(counts, r, ACCT_STR);
-        }
-        if (sign) *v = -(*v);
-        continue;
+    for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) {
+      bit = aom_read_bit(r, ACCT_STR);
+      if (bit) {
+        eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1 - i));
       }
-      *v = i + 2;
-      if (counts) ++counts->coeff_base[txs_ctx][plane_type][i][ctx][0];
-
-      // update the eob flag for coefficients with magnitude above 1.
-      update_eob = AOMMAX(update_eob, c);
     }
   }
-
-  for (c = update_eob; c >= 0; --c) {
-    tran_low_t *v = &tcoeffs[scan[c]];
-    int sign;
-    int idx;
-    int ctx;
-
-    if (*v <= NUM_BASE_LEVELS) continue;
-
-    if (c == 0) {
-      int dc_sign_ctx = txb_ctx->dc_sign_ctx;
-#if LV_MAP_PROB
-      sign = av1_read_record_bin(
-          counts, r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], 2, ACCT_STR);
-#else
-      sign = aom_read(r, ec_ctx->dc_sign[plane_type][dc_sign_ctx], ACCT_STR);
-#endif
-      if (counts) ++counts->dc_sign[plane_type][dc_sign_ctx][sign];
+  *eob = rec_eob_pos(eob_pt, eob_extra);
+
+  {
+    // Read the non-zero coefficient with scan index eob-1
+    // TODO(angiebird): Put this into a function
+    const int c = *eob - 1;
+    const int pos = scan[c];
+    const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, c);
+    const int nsymbs = 3;
+    aom_cdf_prob *cdf =
+        ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx];
+    int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1;
+    if (level > NUM_BASE_LEVELS) {
+      const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+        const int k = aom_read_symbol(
+            r,
+            ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx],
+            BR_CDF_SIZE, ACCT_STR);
+        level += k;
+        if (k < BR_CDF_SIZE - 1) break;
+      }
+    }
+    levels[get_padded_idx(pos, bwl)] = level;
+  }
+  if (*eob > 1) {
+    base_cdf_arr base_cdf = ec_ctx->coeff_base_cdf[txs_ctx][plane_type];
+    br_cdf_arr br_cdf =
+        ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type];
+    if (tx_class == TX_CLASS_2D) {
+      read_coeffs_reverse_2d(r, tx_size, 1, *eob - 1 - 1, scan, bwl, levels,
+                             base_cdf, br_cdf);
+      read_coeffs_reverse(r, tx_size, tx_class, 0, 0, scan, bwl, levels,
+                          base_cdf, br_cdf);
     } else {
-      sign = av1_read_record_bit(counts, r, ACCT_STR);
+      read_coeffs_reverse(r, tx_size, tx_class, 0, *eob - 1 - 1, scan, bwl,
+                          levels, base_cdf, br_cdf);
     }
+  }
 
-    ctx = get_br_ctx(tcoeffs, scan[c], bwl, height);
-
-#if BR_NODE
-    for (idx = 0; idx < BASE_RANGE_SETS; ++idx) {
-#if LV_MAP_PROB
-      if (av1_read_record_bin(
-              counts, r, ec_ctx->coeff_br_cdf[txs_ctx][plane_type][idx][ctx], 2,
-              ACCT_STR))
-#else   // LV_MAP_PROB
-      if (aom_read(r, ec_ctx->coeff_br[txs_ctx][plane_type][idx][ctx],
-                   ACCT_STR))
-#endif  // LV_MAP_PROB
-      {
-        int extra_bits = (1 << br_extra_bits[idx]) - 1;
-        //        int br_offset = aom_read_literal(r, extra_bits, ACCT_STR);
-        int br_offset = 0;
-        int tok;
-        if (counts) ++counts->coeff_br[txs_ctx][plane_type][idx][ctx][1];
-        for (tok = 0; tok < extra_bits; ++tok) {
-#if LV_MAP_PROB
-          if (av1_read_record_bin(
-                  counts, r, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], 2,
-                  ACCT_STR))
-#else
-          if (aom_read(r, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx],
-                       ACCT_STR))
-#endif
-          {
-            br_offset = tok;
-            if (counts) ++counts->coeff_lps[txs_ctx][plane_type][ctx][1];
-            break;
-          }
-          if (counts) ++counts->coeff_lps[txs_ctx][plane_type][ctx][0];
-        }
-        if (tok == extra_bits) br_offset = extra_bits;
-
-        int br_base = br_index_to_coeff[idx];
-
-        *v = NUM_BASE_LEVELS + 1 + br_base + br_offset;
-        cul_level += *v;
-        if (sign) *v = -(*v);
-        break;
+  int16_t num_zero_coeffs = 0;
+  for (int c = 0; c < *eob; ++c) {
+    const int pos = scan[c];
+    num_zero_coeffs = AOMMAX(num_zero_coeffs, pos);
+  }
+  memset(tcoeffs, 0, (num_zero_coeffs + 1) * sizeof(tcoeffs[0]));
+
+  for (int c = 0; c < *eob; ++c) {
+    const int pos = scan[c];
+    uint8_t sign;
+    tran_low_t level = levels[get_padded_idx(pos, bwl)];
+    if (level) {
+      *max_scan_line = AOMMAX(*max_scan_line, pos);
+      if (c == 0) {
+        const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+        sign = aom_read_symbol(r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
+                               2, ACCT_STR);
+      } else {
+        sign = aom_read_bit(r, ACCT_STR);
+      }
+      if (level >= MAX_BASE_BR_RANGE) {
+        level += read_golomb(xd, r);
       }
-      if (counts) ++counts->coeff_br[txs_ctx][plane_type][idx][ctx][0];
-    }
-
-    if (idx < BASE_RANGE_SETS) continue;
-#else
-    for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
-#if LV_MAP_PROB
-      if (av1_read_record_bin(counts, r,
-                              ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx],
-                              2, ACCT_STR))
-#else
-      if (aom_read(r, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx], ACCT_STR))
-#endif
-      {
-        *v = (idx + 1 + NUM_BASE_LEVELS);
-        if (sign) *v = -(*v);
-        cul_level += abs(*v);
 
-        if (counts) ++counts->coeff_lps[txs_ctx][plane_type][ctx][1];
-        break;
+      if (c == 0) dc_val = sign ? -level : level;
+
+      // Bitmasking to clamp level to valid range:
+      //   The valid range for 8/10/12 bit vdieo is at most 14/16/18 bit
+      level &= 0xfffff;
+      cul_level += level;
+      tran_low_t dq_coeff;
+      // Bitmasking to clamp dq_coeff to valid range:
+      //   The valid range for 8/10/12 bit video is at most 17/19/21 bit
+      dq_coeff = (tran_low_t)(
+          (int64_t)level * get_dqv(dequant, scan[c], iqmatrix) & 0xffffff);
+      dq_coeff = dq_coeff >> shift;
+      if (sign) {
+        dq_coeff = -dq_coeff;
       }
-      if (counts) ++counts->coeff_lps[txs_ctx][plane_type][ctx][0];
+      tcoeffs[pos] = clamp(dq_coeff, min_value, max_value);
     }
-    if (idx < COEFF_BASE_RANGE) continue;
-#endif
-
-    // decode 0-th order Golomb code
-    *v = read_golomb(xd, r, counts) + COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS;
-    if (sign) *v = -(*v);
-    cul_level += abs(*v);
-  }
-
-  for (c = 0; c < *eob; ++c) {
-    int16_t dqv = (c == 0) ? dequant[0] : dequant[1];
-    tran_low_t *v = &tcoeffs[scan[c]];
-#if CONFIG_SYMBOLRATE
-    av1_record_coeff(counts, abs(*v));
-#endif
-    int sign = (*v) < 0;
-    *v = (abs(*v) * dqv) >> shift;
-    if (sign) *v = -(*v);
   }
 
-  cul_level = AOMMIN(63, cul_level);
+  cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
 
   // DC value
-  set_dc_sign(&cul_level, tcoeffs[0]);
+  set_dc_sign(&cul_level, dc_val);
 
   return cul_level;
 }
 
-uint8_t av1_read_coeffs_txb_facade(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   aom_reader *r, int row, int col, int block,
-                                   int plane, tran_low_t *tcoeffs,
-                                   TX_SIZE tx_size, int16_t *max_scan_line,
-                                   int *eob) {
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct macroblockd_plane *pd = &xd->plane[plane];
+uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
+                                   MACROBLOCKD *const xd, aom_reader *const r,
+                                   const int row, const int col,
+                                   const int plane, const TX_SIZE tx_size) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
 
   const BLOCK_SIZE bsize = mbmi->sb_type;
-#if CONFIG_CHROMA_SUB8X8
-  const BLOCK_SIZE plane_bsize =
-      AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#elif CONFIG_CB4X4
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#else   // CONFIG_CB4X4
   const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
-#endif  // CONFIG_CB4X4
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
 
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + col,
               pd->left_context + row, &txb_ctx);
-  uint8_t cul_level =
-      av1_read_coeffs_txb(cm, xd, r, row, col, block, plane, tcoeffs, &txb_ctx,
-                          tx_size, max_scan_line, eob);
-#if CONFIG_ADAPT_SCAN
-  PLANE_TYPE plane_type = get_plane_type(plane);
-  TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, row, col, block, tx_size);
-  if (xd->counts && *eob > 0)
-    av1_update_scan_count_facade(cm, xd->counts, tx_size, tx_type, pd->dqcoeff,
-                                 *eob);
-#endif
-  av1_set_contexts(xd, pd, plane, tx_size, cul_level, col, row);
+  const uint8_t cul_level =
+      av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size);
+  av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, row);
   return cul_level;
 }
-
-#if !LV_MAP_PROB
-static void read_txb_probs(FRAME_CONTEXT *fc, const TX_SIZE tx_size,
-                           aom_reader *r, FRAME_COUNTS *counts) {
-#if !CONFIG_SYMBOLRATE
-  (void)counts;
-#endif
-  int plane, ctx, level;
-
-  if (av1_read_record_bit(counts, r, ACCT_STR) == 0) return;
-
-  for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
-    av1_diff_update_prob(r, &fc->txb_skip[tx_size][ctx], ACCT_STR);
-
-  for (plane = 0; plane < PLANE_TYPES; ++plane)
-    for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx)
-      av1_diff_update_prob(r, &fc->nz_map[tx_size][plane][ctx], ACCT_STR);
-
-  for (plane = 0; plane < PLANE_TYPES; ++plane)
-    for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
-      av1_diff_update_prob(r, &fc->eob_flag[tx_size][plane][ctx], ACCT_STR);
-
-  for (level = 0; level < NUM_BASE_LEVELS; ++level)
-    for (plane = 0; plane < PLANE_TYPES; ++plane)
-      for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx)
-        av1_diff_update_prob(r, &fc->coeff_base[tx_size][plane][level][ctx],
-                             ACCT_STR);
-
-  for (plane = 0; plane < PLANE_TYPES; ++plane)
-    for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx)
-      av1_diff_update_prob(r, &fc->coeff_lps[tx_size][plane][ctx], ACCT_STR);
-}
-
-void av1_read_txb_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, aom_reader *r,
-                        FRAME_COUNTS *counts) {
-  const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
-  TX_SIZE tx_size;
-  int ctx, plane;
-
-  for (plane = 0; plane < PLANE_TYPES; ++plane)
-    for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
-      av1_diff_update_prob(r, &fc->dc_sign[plane][ctx], ACCT_STR);
-
-  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
-    read_txb_probs(fc, tx_size, r, counts);
-}
-#endif  // !LV_MAP_PROB
diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h
index 1c6512e97..d0b3d8c7a 100644
--- a/third_party/aom/av1/decoder/decodetxb.h
+++ b/third_party/aom/av1/decoder/decodetxb.h
@@ -12,24 +12,21 @@
 #ifndef DECODETXB_H_
 #define DECODETXB_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "av1/common/blockd.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/common/txb_common.h"
 #include "aom_dsp/bitreader.h"
 
-uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
-                            aom_reader *r, int blk_row, int blk_col, int block,
-                            int plane, tran_low_t *tcoeffs, TXB_CTX *txb_ctx,
-                            TX_SIZE tx_size, int16_t *max_scan_line, int *eob);
+uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                            aom_reader *const r, const int blk_row,
+                            const int blk_col, const int plane,
+                            const TXB_CTX *const txb_ctx,
+                            const TX_SIZE tx_size);
 
-uint8_t av1_read_coeffs_txb_facade(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   aom_reader *r, int row, int col, int block,
-                                   int plane, tran_low_t *tcoeffs,
-                                   TX_SIZE tx_size, int16_t *max_scan_line,
-                                   int *eob);
-#if !LV_MAP_PROB
-void av1_read_txb_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, aom_reader *r,
-                        FRAME_COUNTS *counts);
-#endif  // !LV_MAP_PROB
+uint8_t av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
+                                   MACROBLOCKD *const xd, aom_reader *const r,
+                                   const int row, const int col,
+                                   const int plane, const TX_SIZE tx_size);
 #endif  //  DECODETXB_H_
diff --git a/third_party/aom/av1/decoder/detokenize.c b/third_party/aom/av1/decoder/detokenize.c
index a59a7bac1..9d54bd13d 100644
--- a/third_party/aom/av1/decoder/detokenize.c
+++ b/third_party/aom/av1/decoder/detokenize.c
@@ -9,245 +9,18 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_config.h"
-#if !CONFIG_PVQ
+#include "config/aom_config.h"
+
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
-#endif  // !CONFIG_PVQ
-
 #include "av1/common/blockd.h"
 #include "av1/decoder/detokenize.h"
 
 #define ACCT_STR __func__
 
-#if !CONFIG_PVQ || CONFIG_VAR_TX
 #include "av1/common/common.h"
 #include "av1/common/entropy.h"
 #include "av1/common/idct.h"
-#endif
-
-#include "av1/decoder/symbolrate.h"
-
-#if !CONFIG_PVQ || CONFIG_VAR_TX
-#define EOB_CONTEXT_NODE 0
-#define ZERO_CONTEXT_NODE 1
-#define ONE_CONTEXT_NODE 2
-#define LOW_VAL_CONTEXT_NODE 0
-#define TWO_CONTEXT_NODE 1
-#define THREE_CONTEXT_NODE 2
-#define HIGH_LOW_CONTEXT_NODE 3
-#define CAT_ONE_CONTEXT_NODE 4
-#define CAT_THREEFOUR_CONTEXT_NODE 5
-#define CAT_THREE_CONTEXT_NODE 6
-#define CAT_FIVE_CONTEXT_NODE 7
-
-#define INCREMENT_COUNT(token)                   \
-  do {                                           \
-    if (counts) ++coef_counts[band][ctx][token]; \
-  } while (0)
-
-#if CONFIG_NEW_MULTISYMBOL
-#define READ_COEFF(counts, prob_name, cdf_name, num, r) \
-  read_coeff(counts, cdf_name, num, r);
-static INLINE int read_coeff(FRAME_COUNTS *counts,
-                             const aom_cdf_prob *const *cdf, int n,
-                             aom_reader *r) {
-#if !CONFIG_SYMBOLRATE
-  (void)counts;
-#endif
-  int val = 0;
-  int i = 0;
-  int count = 0;
-  while (count < n) {
-    const int size = AOMMIN(n - count, 4);
-    val |= av1_read_record_cdf(counts, r, cdf[i++], 1 << size, ACCT_STR)
-           << count;
-    count += size;
-  }
-  return val;
-}
-#else
-#define READ_COEFF(counts, prob_name, cdf_name, num, r) \
-  read_coeff(counts, prob_name, num, r);
-static INLINE int read_coeff(FRAME_COUNTS *counts, const aom_prob *probs, int n,
-                             aom_reader *r) {
-#if !CONFIG_SYMBOLRATE
-  (void)counts;
-#endif
-  int i, val = 0;
-  for (i = 0; i < n; ++i)
-    val = (val << 1) | av1_read_record(counts, r, probs[i], ACCT_STR);
-  return val;
-}
-
-#endif
-
-static int token_to_value(FRAME_COUNTS *counts, aom_reader *const r, int token,
-                          TX_SIZE tx_size, int bit_depth) {
-#if !CONFIG_HIGHBITDEPTH
-  assert(bit_depth == 8);
-#endif  // !CONFIG_HIGHBITDEPTH
-
-  switch (token) {
-    case ZERO_TOKEN:
-    case ONE_TOKEN:
-    case TWO_TOKEN:
-    case THREE_TOKEN:
-    case FOUR_TOKEN: return token;
-    case CATEGORY1_TOKEN:
-      return CAT1_MIN_VAL +
-             READ_COEFF(counts, av1_cat1_prob, av1_cat1_cdf, 1, r);
-    case CATEGORY2_TOKEN:
-      return CAT2_MIN_VAL +
-             READ_COEFF(counts, av1_cat2_prob, av1_cat2_cdf, 2, r);
-    case CATEGORY3_TOKEN:
-      return CAT3_MIN_VAL +
-             READ_COEFF(counts, av1_cat3_prob, av1_cat3_cdf, 3, r);
-    case CATEGORY4_TOKEN:
-      return CAT4_MIN_VAL +
-             READ_COEFF(counts, av1_cat4_prob, av1_cat4_cdf, 4, r);
-    case CATEGORY5_TOKEN:
-      return CAT5_MIN_VAL +
-             READ_COEFF(counts, av1_cat5_prob, av1_cat5_cdf, 5, r);
-    case CATEGORY6_TOKEN: {
-      const int skip_bits = (int)sizeof(av1_cat6_prob) -
-                            av1_get_cat6_extrabits_size(tx_size, bit_depth);
-      return CAT6_MIN_VAL + READ_COEFF(counts, av1_cat6_prob + skip_bits,
-                                       av1_cat6_cdf, 18 - skip_bits, r);
-    }
-    default:
-      assert(0);  // Invalid token.
-      return -1;
-  }
-}
-
-static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff,
-                        TX_SIZE tx_size, TX_TYPE tx_type, const int16_t *dq,
-#if CONFIG_NEW_QUANT
-                        dequant_val_type_nuq *dq_val,
-#else
-#if CONFIG_AOM_QM
-                        qm_val_t *iqm[2][TX_SIZES_ALL],
-#endif  // CONFIG_AOM_QM
-#endif  // CONFIG_NEW_QUANT
-                        int ctx, const int16_t *scan, const int16_t *nb,
-                        int16_t *max_scan_line, aom_reader *r) {
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  const int max_eob = tx_size_2d[tx_size];
-  const int ref = is_inter_block(&xd->mi[0]->mbmi);
-#if CONFIG_AOM_QM && !CONFIG_NEW_QUANT
-  const qm_val_t *iqmatrix = iqm[!ref][tx_size];
-#endif  // CONFIG_AOM_QM
-  (void)tx_type;
-  int band, c = 0;
-  const TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size];
-  aom_cdf_prob(*coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
-      ec_ctx->coef_head_cdfs[tx_size_ctx][type][ref];
-  aom_cdf_prob(*coef_tail_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
-      ec_ctx->coef_tail_cdfs[tx_size_ctx][type][ref];
-  int val = 0;
-
-  uint8_t token_cache[MAX_TX_SQUARE];
-  const uint8_t *band_translate = get_band_translate(tx_size);
-  int dq_shift;
-  int v, token;
-  int32_t dqv = dq[0];
-#if CONFIG_NEW_QUANT
-  const tran_low_t *dqv_val = &dq_val[0][0];
-#endif  // CONFIG_NEW_QUANT
-
-  dq_shift = av1_get_tx_scale(tx_size);
-
-  band = *band_translate++;
-
-  int more_data = 1;
-  while (more_data) {
-    int comb_token;
-    int last_pos = (c + 1 == max_eob);
-    int first_pos = (c == 0);
-
-#if CONFIG_NEW_QUANT
-    dqv_val = &dq_val[band][0];
-#endif  // CONFIG_NEW_QUANT
-
-    comb_token = last_pos ? 2 * av1_read_record_bit(xd->counts, r, ACCT_STR) + 2
-                          : av1_read_record_symbol(
-                                xd->counts, r, coef_head_cdfs[band][ctx],
-                                HEAD_TOKENS + first_pos, ACCT_STR) +
-                                !first_pos;
-    if (first_pos) {
-      if (comb_token == 0) return 0;
-    }
-    token = comb_token >> 1;
-
-    while (!token) {
-      *max_scan_line = AOMMAX(*max_scan_line, scan[c]);
-      token_cache[scan[c]] = 0;
-#if CONFIG_SYMBOLRATE
-      av1_record_coeff(xd->counts, 0);
-#endif
-      ++c;
-      dqv = dq[1];
-      ctx = get_coef_context(nb, token_cache, c);
-      band = *band_translate++;
-
-      last_pos = (c + 1 == max_eob);
-
-      comb_token =
-          last_pos
-              ? 2 * av1_read_record_bit(xd->counts, r, ACCT_STR) + 2
-              : av1_read_record_symbol(xd->counts, r, coef_head_cdfs[band][ctx],
-                                       HEAD_TOKENS, ACCT_STR) +
-                    1;
-      token = comb_token >> 1;
-    }
-
-    more_data = comb_token & 1;
-
-    if (token > ONE_TOKEN)
-      token += av1_read_record_symbol(xd->counts, r, coef_tail_cdfs[band][ctx],
-                                      TAIL_TOKENS, ACCT_STR);
-#if CONFIG_NEW_QUANT
-    dqv_val = &dq_val[band][0];
-#endif  // CONFIG_NEW_QUANT
-
-    *max_scan_line = AOMMAX(*max_scan_line, scan[c]);
-    token_cache[scan[c]] = av1_pt_energy_class[token];
-
-    val = token_to_value(xd->counts, r, token, tx_size, xd->bd);
-#if CONFIG_SYMBOLRATE
-    av1_record_coeff(xd->counts, val);
-#endif
-
-#if CONFIG_NEW_QUANT
-    v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val);
-    v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
-#else
-#if CONFIG_AOM_QM
-    // Apply quant matrix only for 2D transforms
-    if (IS_2D_TRANSFORM(tx_type) && iqmatrix != NULL)
-      dqv = ((iqmatrix[scan[c]] * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >>
-            AOM_QM_BITS;
-#endif
-    v = (val * dqv) >> dq_shift;
-#endif
-
-    v = (int)check_range(av1_read_record_bit(xd->counts, r, ACCT_STR) ? -v : v,
-                         xd->bd);
-
-    dqcoeff[scan[c]] = v;
-
-    ++c;
-    more_data &= (c < max_eob);
-    if (!more_data) break;
-    dqv = dq[1];
-    ctx = get_coef_context(nb, token_cache, c);
-    band = *band_translate++;
-  }
-
-  return c;
-}
-#endif  // !CONFIG_PVQ
 
 static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) {
   uint8_t color_order[PALETTE_MAX_SIZE];
@@ -263,7 +36,6 @@ static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) {
   color_map[0] = av1_read_uniform(r, n);
   assert(color_map[0] < n);
 
-#if CONFIG_PALETTE_THROUGHPUT
   // Run wavefront on the palette map index decoding.
   for (int i = 1; i < rows + cols - 1; ++i) {
     for (int j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) {
@@ -283,21 +55,6 @@ static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) {
              (plane_block_width - cols));
     }
   }
-#else
-  for (int i = 0; i < rows; ++i) {
-    for (int j = (i == 0 ? 1 : 0); j < cols; ++j) {
-      const int color_ctx = av1_get_palette_color_index_context(
-          color_map, plane_block_width, i, j, n, color_order, NULL);
-      const int color_idx = aom_read_symbol(
-          r, color_map_cdf[n - PALETTE_MIN_SIZE][color_ctx], n, ACCT_STR);
-      assert(color_idx >= 0 && color_idx < n);
-      color_map[i * plane_block_width + j] = color_order[color_idx];
-    }
-    memset(color_map + i * plane_block_width + cols,
-           color_map[i * plane_block_width + cols - 1],
-           (plane_block_width - cols));  // Copy last column to extra columns.
-  }
-#endif  // CONFIG_PALETTE_THROUGHPUT
   // Copy last row to extra rows.
   for (int i = rows; i < plane_block_height; ++i) {
     memcpy(color_map + i * plane_block_width,
@@ -305,97 +62,17 @@ static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) {
   }
 }
 
-static void get_palette_params(const MACROBLOCKD *const xd, int plane,
-                               BLOCK_SIZE bsize, Av1ColorMapParam *params) {
-  assert(plane == 0 || plane == 1);
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  params->color_map = xd->plane[plane].color_index_map;
-  params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
-                          : xd->tile_ctx->palette_y_color_index_cdf;
-  params->n_colors = pmi->palette_size[plane];
-  av1_get_block_dimensions(bsize, plane, xd, &params->plane_width,
-                           &params->plane_height, &params->rows, &params->cols);
-}
-
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-static void get_mrc_params(const MACROBLOCKD *const xd, TX_SIZE tx_size,
-                           Av1ColorMapParam *params) {
-  memset(params, 0, sizeof(*params));
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const int is_inter = is_inter_block(mbmi);
-  params->color_map = xd->mrc_mask;
-  params->map_cdf = is_inter ? xd->tile_ctx->mrc_mask_inter_cdf
-                             : xd->tile_ctx->mrc_mask_intra_cdf;
-  params->n_colors = 2;
-  params->plane_width = tx_size_wide[tx_size];
-  params->rows = tx_size_high[tx_size];
-  params->cols = tx_size_wide[tx_size];
-}
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-
 void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
                                aom_reader *r) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   assert(plane == 0 || plane == 1);
-  assert(mbmi->sb_type >= BLOCK_8X8);
-  Av1ColorMapParam color_map_params;
-  memset(&color_map_params, 0, sizeof(color_map_params));
-  get_palette_params(xd, plane, mbmi->sb_type, &color_map_params);
-  decode_color_map_tokens(&color_map_params, r);
-}
-
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-static void decode_mrc_tokens(MACROBLOCKD *const xd, TX_TYPE tx_size,
-                              aom_reader *r) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const int is_inter = is_inter_block(mbmi);
-  if ((is_inter && !SIGNAL_MRC_MASK_INTER) ||
-      (!is_inter && !SIGNAL_MRC_MASK_INTRA))
-    return;
-  Av1ColorMapParam color_map_params;
-  get_mrc_params(xd, tx_size, &color_map_params);
-  decode_color_map_tokens(&color_map_params, r);
-}
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-
-#if !CONFIG_PVQ || CONFIG_VAR_TX
-int av1_decode_block_tokens(AV1_COMMON *cm, MACROBLOCKD *const xd, int plane,
-                            const SCAN_ORDER *sc, int x, int y, TX_SIZE tx_size,
-                            TX_TYPE tx_type, int16_t *max_scan_line,
-                            aom_reader *r, int seg_id) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int16_t *const dequant = pd->seg_dequant[seg_id];
-  const int ctx =
-      get_entropy_context(tx_size, pd->above_context + x, pd->left_context + y);
-#if CONFIG_NEW_QUANT
-  const int ref = is_inter_block(&xd->mi[0]->mbmi);
-  int dq =
-      get_dq_profile_from_ctx(xd->qindex[seg_id], ctx, ref, pd->plane_type);
-#endif  //  CONFIG_NEW_QUANT
-
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-  if (tx_type == MRC_DCT) decode_mrc_tokens(xd, tx_size, r);
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-
-  const int eob =
-      decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, tx_type, dequant,
-#if CONFIG_NEW_QUANT
-                   pd->seg_dequant_nuq[seg_id][dq],
-#else
-#if CONFIG_AOM_QM
-                   pd->seg_iqmatrix[seg_id],
-#endif  // CONFIG_AOM_QM
-#endif  // CONFIG_NEW_QUANT
-                   ctx, sc->scan, sc->neighbors, max_scan_line, r);
-  av1_set_contexts(xd, pd, plane, tx_size, eob > 0, x, y);
-#if CONFIG_ADAPT_SCAN
-  if (xd->counts)
-    av1_update_scan_count_facade(cm, xd->counts, tx_size, tx_type, pd->dqcoeff,
-                                 eob);
-#else
-  (void)cm;
-#endif
-  return eob;
+  Av1ColorMapParam params;
+  params.color_map =
+      xd->plane[plane].color_index_map + xd->color_index_map_offset[plane];
+  params.map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
+                         : xd->tile_ctx->palette_y_color_index_cdf;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  params.n_colors = mbmi->palette_mode_info.palette_size[plane];
+  av1_get_block_dimensions(mbmi->sb_type, plane, xd, &params.plane_width,
+                           &params.plane_height, &params.rows, &params.cols);
+  decode_color_map_tokens(&params, r);
 }
-#endif  // !CONFIG_PVQ
diff --git a/third_party/aom/av1/decoder/detokenize.h b/third_party/aom/av1/decoder/detokenize.h
index eb31d58c6..ec85bf7ea 100644
--- a/third_party/aom/av1/decoder/detokenize.h
+++ b/third_party/aom/av1/decoder/detokenize.h
@@ -12,10 +12,9 @@
 #ifndef AV1_DECODER_DETOKENIZE_H_
 #define AV1_DECODER_DETOKENIZE_H_
 
-#include "./aom_config.h"
-#if !CONFIG_PVQ || CONFIG_VAR_TX
+#include "config/aom_config.h"
+
 #include "av1/common/scan.h"
-#endif  // !CONFIG_PVQ || CONFIG_VAR_TX
 #include "av1/decoder/decoder.h"
 
 #ifdef __cplusplus
@@ -24,12 +23,6 @@ extern "C" {
 
 void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r);
 
-#if !CONFIG_PVQ || CONFIG_VAR_TX
-int av1_decode_block_tokens(AV1_COMMON *cm, MACROBLOCKD *const xd, int plane,
-                            const SCAN_ORDER *sc, int x, int y, TX_SIZE tx_size,
-                            TX_TYPE tx_type, int16_t *max_scan_line,
-                            aom_reader *r, int seg_id);
-#endif  // !CONFIG_PVQ
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/decoder/dsubexp.c b/third_party/aom/av1/decoder/dsubexp.c
deleted file mode 100644
index 5171f1144..000000000
--- a/third_party/aom/av1/decoder/dsubexp.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/common/entropy.h"
-
-#include "av1/decoder/dsubexp.h"
-
-static int inv_recenter_nonneg(int v, int m) {
-  if (v > 2 * m) return v;
-
-  return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
-}
-
-#define decode_uniform(r, ACCT_STR_NAME) \
-  decode_uniform_(r ACCT_STR_ARG(ACCT_STR_NAME))
-#define decode_term_subexp(r, ACCT_STR_NAME) \
-  decode_term_subexp_(r ACCT_STR_ARG(ACCT_STR_NAME))
-
-static int decode_uniform_(aom_reader *r ACCT_STR_PARAM) {
-  const int l = 8;
-  const int m = (1 << l) - 190;
-  const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME);
-  return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
-}
-
-static int inv_remap_prob(int v, int m) {
-  /* clang-format off */
-  static uint8_t inv_map_table[MAX_PROB - 1] = {
-      7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176, 189,
-    202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,  10,  11,
-     12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,
-     28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
-     44,  45,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  60,
-     61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  73,  74,  75,  76,
-     77,  78,  79,  80,  81,  82,  83,  84,  86,  87,  88,  89,  90,  91,  92,
-     93,  94,  95,  96,  97,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
-    109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125,
-    126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141,
-    142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157,
-    158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
-    174, 175, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190,
-    191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
-    207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222,
-    223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
-    239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253
-  }; /* clang-format on */
-  assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0])));
-  v = inv_map_table[v];
-  m--;
-  if ((m << 1) <= MAX_PROB) {
-    return 1 + inv_recenter_nonneg(v, m);
-  } else {
-    return MAX_PROB - inv_recenter_nonneg(v, MAX_PROB - 1 - m);
-  }
-}
-
-static int decode_term_subexp_(aom_reader *r ACCT_STR_PARAM) {
-  if (!aom_read_bit(r, ACCT_STR_NAME))
-    return aom_read_literal(r, 4, ACCT_STR_NAME);
-  if (!aom_read_bit(r, ACCT_STR_NAME))
-    return aom_read_literal(r, 4, ACCT_STR_NAME) + 16;
-  if (!aom_read_bit(r, ACCT_STR_NAME))
-    return aom_read_literal(r, 5, ACCT_STR_NAME) + 32;
-  return decode_uniform(r, ACCT_STR_NAME) + 64;
-}
-
-void av1_diff_update_prob_(aom_reader *r, aom_prob *p ACCT_STR_PARAM) {
-  if (aom_read(r, DIFF_UPDATE_PROB, ACCT_STR_NAME)) {
-    const int delp = decode_term_subexp(r, ACCT_STR_NAME);
-    *p = (aom_prob)inv_remap_prob(delp, *p);
-  }
-}
diff --git a/third_party/aom/av1/decoder/dsubexp.h b/third_party/aom/av1/decoder/dsubexp.h
deleted file mode 100644
index 4bc38578c..000000000
--- a/third_party/aom/av1/decoder/dsubexp.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_DECODER_DSUBEXP_H_
-#define AV1_DECODER_DSUBEXP_H_
-
-#include "aom_dsp/bitreader.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if CONFIG_ACCOUNTING
-#define av1_diff_update_prob(r, p, str) av1_diff_update_prob_(r, p, str)
-#else
-#define av1_diff_update_prob(r, p, str) av1_diff_update_prob_(r, p)
-#endif
-
-void av1_diff_update_prob_(aom_reader *r, aom_prob *p ACCT_STR_PARAM);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-#endif  // AV1_DECODER_DSUBEXP_H_
diff --git a/third_party/aom/av1/decoder/dthread.c b/third_party/aom/av1/decoder/dthread.c
index 7f16b233c..ff03502e6 100644
--- a/third_party/aom/av1/decoder/dthread.c
+++ b/third_party/aom/av1/decoder/dthread.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_mem/aom_mem.h"
 #include "av1/common/reconinter.h"
 #include "av1/decoder/dthread.h"
@@ -157,12 +158,8 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker,
   av1_frameworker_unlock_stats(src_worker);
 
   dst_cm->bit_depth = src_cm->bit_depth;
-#if CONFIG_HIGHBITDEPTH
   dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
-#endif
-#if CONFIG_EXT_REFS
-// TODO(zoeliu): To handle parallel decoding
-#endif  // CONFIG_EXT_REFS
+  // TODO(zoeliu): To handle parallel decoding
   dst_cm->prev_frame =
       src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame;
   dst_cm->last_width =
@@ -180,14 +177,10 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker,
 
   memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
          (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
-  dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
-#if CONFIG_LOOPFILTER_LEVEL
+  dst_cm->lf.sharpness_level = src_cm->lf.sharpness_level;
   dst_cm->lf.filter_level[0] = src_cm->lf.filter_level[0];
   dst_cm->lf.filter_level[1] = src_cm->lf.filter_level[1];
-#else
-  dst_cm->lf.filter_level = src_cm->lf.filter_level;
-#endif
-  memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, TOTAL_REFS_PER_FRAME);
+  memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, REF_FRAMES);
   memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
   dst_cm->seg = src_cm->seg;
   memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h
index c17053d9c..33d89006e 100644
--- a/third_party/aom/av1/decoder/dthread.h
+++ b/third_party/aom/av1/decoder/dthread.h
@@ -12,7 +12,8 @@
 #ifndef AV1_DECODER_DTHREAD_H_
 #define AV1_DECODER_DTHREAD_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_util/aom_thread.h"
 #include "aom/internal/aom_codec_internal.h"
 
@@ -22,6 +23,13 @@ extern "C" {
 
 struct AV1Common;
 struct AV1Decoder;
+struct ThreadData;
+
+typedef struct DecWorkerData {
+  struct ThreadData *td;
+  const uint8_t *data_end;
+  struct aom_internal_error_info error_info;
+} DecWorkerData;
 
 // WorkerData for the FrameWorker thread. It contains all the information of
 // the worker and decode structures for decoding a frame.
diff --git a/third_party/aom/av1/decoder/generic_decoder.c b/third_party/aom/av1/decoder/generic_decoder.c
deleted file mode 100644
index 0c7d71b9f..000000000
--- a/third_party/aom/av1/decoder/generic_decoder.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include <stdio.h>
-
-#include "aom_dsp/bitreader.h"
-#include "av1/common/generic_code.h"
-#include "av1/common/odintrin.h"
-#include "pvq_decoder.h"
-
-/** Decodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts
- * the cdf accordingly.
- *
- * @param [in,out] r     multi-symbol entropy decoder
- * @param [in,out] cdf   CDF of the variable (Q15)
- * @param [in]     n     number of values possible
- * @param [in,out] count number of symbols encoded with that cdf so far
- * @param [in]     rate  adaptation rate shift (smaller is faster)
- * @return decoded variable
- */
-int aom_decode_cdf_adapt_q15_(aom_reader *r, uint16_t *cdf, int n,
- int *count, int rate ACCT_STR_PARAM) {
-  int val;
-  int i;
-  if (*count == 0) {
-    int ft;
-    ft = cdf[n - 1];
-    for (i = 0; i < n; i++) {
-      cdf[i] = AOM_ICDF(cdf[i]*32768/ft);
-    }
-  }
-  val = aom_read_cdf(r, cdf, n, ACCT_STR_NAME);
-  aom_cdf_adapt_q15(val, cdf, n, count, rate);
-  return val;
-}
-
-/** Encodes a random variable using a "generic" model, assuming that the
- * distribution is one-sided (zero and up), has a single mode, and decays
- * exponentially past the model.
- *
- * @param [in,out] r     multi-symbol entropy decoder
- * @param [in,out] model generic probability model
- * @param [in]     x     variable being encoded
- * @param [in,out] ExQ16 expectation of x (adapted)
- * @param [in]     integration integration period of ExQ16 (leaky average over
- * 1<<integration samples)
- *
- * @retval decoded variable x
- */
-int generic_decode_(aom_reader *r, generic_encoder *model,
- int *ex_q16, int integration ACCT_STR_PARAM) {
-  int lg_q1;
-  int shift;
-  int id;
-  uint16_t *cdf;
-  int xs;
-  int lsb;
-  int x;
-  lsb = 0;
-  lg_q1 = log_ex(*ex_q16);
-  /* If expectation is too large, shift x to ensure that
-     all we have past xs=15 is the exponentially decaying tail
-     of the distribution. */
-  shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
-  /* Choose the cdf to use: we have two per "octave" of ExQ16. */
-  id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
-  cdf = model->cdf[id];
-  xs = aom_read_symbol_pvq(r, cdf, 16, ACCT_STR_NAME);
-  if (xs == 15) {
-    int e;
-    unsigned decay;
-    /* Estimate decay based on the assumption that the distribution is close
-       to Laplacian for large values. We should probably have an adaptive
-       estimate instead. Note: The 2* is a kludge that's not fully understood
-       yet. */
-    OD_ASSERT(*ex_q16 < INT_MAX >> 1);
-    e = ((2**ex_q16 >> 8) + (1 << shift >> 1)) >> shift;
-    decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256)));
-    xs += aom_laplace_decode_special(r, decay, ACCT_STR_NAME);
-  }
-  if (shift != 0) {
-    int special;
-    /* Because of the rounding, there's only half the number of possibilities
-       for xs=0 */
-    special = xs == 0;
-    if (shift - special > 0) {
-      lsb = aom_read_literal(r, shift - special, ACCT_STR_NAME);
-    }
-    lsb -= !special << (shift - 1);
-  }
-  x = (xs << shift) + lsb;
-  generic_model_update(ex_q16, x, integration);
-  OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG,
-   "dec: %d %d %d %d %d %x", *ex_q16, x, shift, id, xs, dec->rng));
-  return x;
-}
diff --git a/third_party/aom/av1/decoder/inspection.c b/third_party/aom/av1/decoder/inspection.c
index 98c51d4ba..e6c89298a 100644
--- a/third_party/aom/av1/decoder/inspection.c
+++ b/third_party/aom/av1/decoder/inspection.c
@@ -11,12 +11,7 @@
 #include "av1/decoder/decoder.h"
 #include "av1/decoder/inspection.h"
 #include "av1/common/enums.h"
-#if CONFIG_CDEF
 #include "av1/common/cdef.h"
-#endif
-#if CONFIG_CFL
-#include "av1/common/cfl.h"
-#endif
 
 static void ifd_init_mi_rc(insp_frame_data *fd, int mi_cols, int mi_rows) {
   fd->mi_cols = mi_cols;
@@ -48,25 +43,29 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) {
   fd->show_frame = cm->show_frame;
   fd->frame_type = cm->frame_type;
   fd->base_qindex = cm->base_qindex;
-  fd->tile_mi_cols = cm->tile_width;
-  fd->tile_mi_rows = cm->tile_height;
+  // Set width and height of the first tile until generic support can be added
+  TileInfo tile_info;
+  av1_tile_set_row(&tile_info, cm, 0);
+  av1_tile_set_col(&tile_info, cm, 0);
+  fd->tile_mi_cols = tile_info.mi_col_end - tile_info.mi_col_start;
+  fd->tile_mi_rows = tile_info.mi_row_end - tile_info.mi_row_start;
+  fd->delta_q_present_flag = cm->delta_q_present_flag;
+  fd->delta_q_res = cm->delta_q_res;
 #if CONFIG_ACCOUNTING
   fd->accounting = &pbi->accounting;
 #endif
-#if CONFIG_CDEF
-// TODO(negge): copy per frame CDEF data
-#endif
+  // TODO(negge): copy per frame CDEF data
   int i, j;
   for (i = 0; i < MAX_SEGMENTS; i++) {
     for (j = 0; j < 2; j++) {
-      fd->y_dequant[i][j] = cm->y_dequant[i][j];
-      fd->uv_dequant[i][j] = cm->uv_dequant[i][j];
+      fd->y_dequant[i][j] = cm->y_dequant_QTX[i][j];
+      fd->u_dequant[i][j] = cm->u_dequant_QTX[i][j];
+      fd->v_dequant[i][j] = cm->v_dequant_QTX[i][j];
     }
   }
   for (j = 0; j < cm->mi_rows; j++) {
     for (i = 0; i < cm->mi_cols; i++) {
-      const MB_MODE_INFO *mbmi =
-          &cm->mi_grid_visible[j * cm->mi_stride + i]->mbmi;
+      const MB_MODE_INFO *mbmi = cm->mi_grid_visible[j * cm->mi_stride + i];
       insp_mi_data *mi = &fd->mi_grid[j * cm->mi_cols + i];
       // Segment
       mi->segment_id = mbmi->segment_id;
@@ -90,24 +89,19 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) {
       mi->sb_type = mbmi->sb_type;
       // Skip Flag
       mi->skip = mbmi->skip;
-#if CONFIG_DUAL_FILTER
       mi->filter[0] = av1_extract_interp_filter(mbmi->interp_filters, 0);
       mi->filter[1] = av1_extract_interp_filter(mbmi->interp_filters, 1);
-#else
-      mi->filter = av1_extract_interp_filter(mbmi->interp_filters, 0);
-#endif
+      mi->dual_filter_type = mi->filter[0] * 3 + mi->filter[1];
       // Transform
-      mi->tx_type = mbmi->tx_type;
+      // TODO(anyone): extract tx type info from mbmi->txk_type[].
+      mi->tx_type = DCT_DCT;
       mi->tx_size = mbmi->tx_size;
 
-#if CONFIG_CDEF
       mi->cdef_level =
           cm->cdef_strengths[mbmi->cdef_strength] / CDEF_SEC_STRENGTHS;
       mi->cdef_strength =
           cm->cdef_strengths[mbmi->cdef_strength] % CDEF_SEC_STRENGTHS;
       mi->cdef_strength += mi->cdef_strength == 3;
-#endif
-#if CONFIG_CFL
       if (mbmi->uv_mode == UV_CFL_PRED) {
         mi->cfl_alpha_idx = mbmi->cfl_alpha_idx;
         mi->cfl_alpha_sign = mbmi->cfl_alpha_signs;
@@ -115,7 +109,8 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) {
         mi->cfl_alpha_idx = 0;
         mi->cfl_alpha_sign = 0;
       }
-#endif
+      // delta_q
+      mi->current_qindex = mbmi->current_qindex;
     }
   }
   return 1;
diff --git a/third_party/aom/av1/decoder/inspection.h b/third_party/aom/av1/decoder/inspection.h
index 06a94b737..bb604f684 100644
--- a/third_party/aom/av1/decoder/inspection.h
+++ b/third_party/aom/av1/decoder/inspection.h
@@ -20,7 +20,9 @@ extern "C" {
 #include "av1/decoder/accounting.h"
 #endif
 
+#ifndef AOM_AOMDX_H_
 typedef void (*aom_inspect_cb)(void *decoder, void *data);
+#endif
 
 typedef struct insp_mv insp_mv;
 
@@ -33,27 +35,21 @@ typedef struct insp_mi_data insp_mi_data;
 
 struct insp_mi_data {
   insp_mv mv[2];
-  int8_t ref_frame[2];
-  int8_t mode;
-  int8_t uv_mode;
-  int8_t sb_type;
-  int8_t skip;
-  int8_t segment_id;
-#if CONFIG_DUAL_FILTER
-  int8_t filter[2];
-#else
-  int8_t filter;
-#endif
-  int8_t tx_type;
-  int8_t tx_size;
-#if CONFIG_CDEF
-  int8_t cdef_level;
-  int8_t cdef_strength;
-#endif
-#if CONFIG_CFL
-  int8_t cfl_alpha_idx;
-  int8_t cfl_alpha_sign;
-#endif
+  int16_t ref_frame[2];
+  int16_t mode;
+  int16_t uv_mode;
+  int16_t sb_type;
+  int16_t skip;
+  int16_t segment_id;
+  int16_t dual_filter_type;
+  int16_t filter[2];
+  int16_t tx_type;
+  int16_t tx_size;
+  int16_t cdef_level;
+  int16_t cdef_strength;
+  int16_t cfl_alpha_idx;
+  int16_t cfl_alpha_sign;
+  int16_t current_qindex;
 };
 
 typedef struct insp_frame_data insp_frame_data;
@@ -71,10 +67,11 @@ struct insp_frame_data {
   int tile_mi_rows;
   int tile_mi_cols;
   int16_t y_dequant[MAX_SEGMENTS][2];
-  int16_t uv_dequant[MAX_SEGMENTS][2];
-#if CONFIG_CDEF
-// TODO(negge): add per frame CDEF data
-#endif
+  int16_t u_dequant[MAX_SEGMENTS][2];
+  int16_t v_dequant[MAX_SEGMENTS][2];
+  // TODO(negge): add per frame CDEF data
+  int delta_q_present_flag;
+  int delta_q_res;
 };
 
 void ifd_init(insp_frame_data *fd, int frame_width, int frame_height);
diff --git a/third_party/aom/av1/decoder/laplace_decoder.c b/third_party/aom/av1/decoder/laplace_decoder.c
deleted file mode 100644
index 5cc080ea7..000000000
--- a/third_party/aom/av1/decoder/laplace_decoder.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-/* clang-format off */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include <stdio.h>
-
-#include "aom_dsp/bitreader.h"
-#include "av1/common/pvq.h"
-#include "pvq_decoder.h"
-
-#define aom_decode_pvq_split(r, adapt, sum, ctx, ACCT_STR_NAME) \
-  aom_decode_pvq_split_(r, adapt, sum, ctx ACCT_STR_ARG(ACCT_STR_NAME))
-
-static int aom_decode_pvq_split_(aom_reader *r, od_pvq_codeword_ctx *adapt,
- int sum, int ctx ACCT_STR_PARAM) {
-  int shift;
-  int count;
-  int msbs;
-  int fctx;
-  count = 0;
-  if (sum == 0) return 0;
-  shift = OD_MAXI(0, OD_ILOG(sum) - 3);
-  fctx = 7*ctx + (sum >> shift) - 1;
-  msbs = aom_read_symbol_pvq(r, adapt->pvq_split_cdf[fctx], (sum >> shift) + 1,
-      ACCT_STR_NAME);
-  if (shift) count = aom_read_literal(r, shift, ACCT_STR_NAME);
-  count += msbs << shift;
-  if (count > sum) {
-    count = sum;
-#if !CONFIG_ANS
-    r->ec.error = 1;
-#else
-# error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-  }
-  return count;
-}
-
-void aom_decode_band_pvq_splits(aom_reader *r, od_pvq_codeword_ctx *adapt,
- od_coeff *y, int n, int k, int level) {
-  int mid;
-  int count_right;
-  if (n == 1) {
-    y[0] = k;
-  }
-  else if (k == 0) {
-    OD_CLEAR(y, n);
-  }
-  else if (k == 1 && n <= 16) {
-    int cdf_id;
-    int pos;
-    cdf_id = od_pvq_k1_ctx(n, level == 0);
-    OD_CLEAR(y, n);
-    pos = aom_read_symbol_pvq(r, adapt->pvq_k1_cdf[cdf_id], n, "pvq:k1");
-    y[pos] = 1;
-  }
-  else {
-    mid = n >> 1;
-    count_right = aom_decode_pvq_split(r, adapt, k, od_pvq_size_ctx(n),
-     "pvq:split");
-    aom_decode_band_pvq_splits(r, adapt, y, mid, k - count_right, level + 1);
-    aom_decode_band_pvq_splits(r, adapt, y + mid, n - mid, count_right,
-     level + 1);
-  }
-}
-
-/** Decodes the tail of a Laplace-distributed variable, i.e. it doesn't
- * do anything special for the zero case.
- *
- * @param [dec] range decoder
- * @param [decay] decay factor of the distribution, i.e. pdf ~= decay^x
- *
- * @retval decoded variable x
- */
-int aom_laplace_decode_special_(aom_reader *r, unsigned decay ACCT_STR_PARAM) {
-  int pos;
-  int shift;
-  int xs;
-  int sym;
-  const uint16_t *cdf;
-  shift = 0;
-  /* We don't want a large decay value because that would require too many
-     symbols. */
-  while (decay > 235) {
-    decay = (decay*decay + 128) >> 8;
-    shift++;
-  }
-  decay = OD_MINI(decay, 254);
-  decay = OD_MAXI(decay, 2);
-  cdf = EXP_CDF_TABLE[(decay + 1) >> 1];
-  OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d\n", decay));
-  xs = 0;
-  do {
-    sym = OD_MINI(xs, 15);
-    {
-      int i;
-      OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d", xs, shift, sym));
-      for (i = 0; i < 16; i++) {
-        OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i]));
-      }
-      OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "\n"));
-    }
-    sym = aom_read_cdf(r, cdf, 16, ACCT_STR_NAME);
-    xs += sym;
-  } while (sym >= 15);
-  if (shift) pos = (xs << shift) + aom_read_literal(r, shift, ACCT_STR_NAME);
-  else pos = xs;
-  return pos;
-}
diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c
new file mode 100644
index 000000000..482b6415e
--- /dev/null
+++ b/third_party/aom/av1/decoder/obu.c
@@ -0,0 +1,907 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_ports/mem_ops.h"
+
+#include "av1/common/common.h"
+#include "av1/common/timing.h"
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/obu.h"
+
+// Picture prediction structures (0-12 are predefined) in scalability metadata.
+typedef enum {
+  SCALABILITY_L1T2 = 0,
+  SCALABILITY_L1T3 = 1,
+  SCALABILITY_L2T1 = 2,
+  SCALABILITY_L2T2 = 3,
+  SCALABILITY_L2T3 = 4,
+  SCALABILITY_S2T1 = 5,
+  SCALABILITY_S2T2 = 6,
+  SCALABILITY_S2T3 = 7,
+  SCALABILITY_L2T1h = 8,
+  SCALABILITY_L2T2h = 9,
+  SCALABILITY_L2T3h = 10,
+  SCALABILITY_S2T1h = 11,
+  SCALABILITY_S2T2h = 12,
+  SCALABILITY_S2T3h = 13,
+  SCALABILITY_SS = 14
+} SCALABILITY_STRUCTURES;
+
+// Returns 1 when OBU type is valid, and 0 otherwise.
+static int valid_obu_type(int obu_type) {
+  int valid_type = 0;
+  switch (obu_type) {
+    case OBU_SEQUENCE_HEADER:
+    case OBU_TEMPORAL_DELIMITER:
+    case OBU_FRAME_HEADER:
+    case OBU_TILE_GROUP:
+    case OBU_METADATA:
+    case OBU_FRAME:
+    case OBU_REDUNDANT_FRAME_HEADER:
+    case OBU_TILE_LIST:
+    case OBU_PADDING: valid_type = 1; break;
+    default: break;
+  }
+  return valid_type;
+}
+
+// Parses OBU header and stores values in 'header'.
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
+                                       int is_annexb, ObuHeader *header) {
+  if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
+
+  const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
+  if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->size = 1;
+
+  if (aom_rb_read_bit(rb) != 0) {
+    // Forbidden bit. Must not be set.
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
+
+  if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->has_extension = aom_rb_read_bit(rb);
+  header->has_size_field = aom_rb_read_bit(rb);
+
+  if (!header->has_size_field && !is_annexb) {
+    // section 5 obu streams must have obu_size field set.
+    return AOM_CODEC_UNSUP_BITSTREAM;
+  }
+
+  if (aom_rb_read_bit(rb) != 0) {
+    // obu_reserved_1bit must be set to 0.
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  if (header->has_extension) {
+    if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
+
+    header->size += 1;
+    header->temporal_layer_id = aom_rb_read_literal(rb, 3);
+    header->spatial_layer_id = aom_rb_read_literal(rb, 2);
+    if (aom_rb_read_literal(rb, 3) != 0) {
+      // extension_header_reserved_3bits must be set to 0.
+      return AOM_CODEC_CORRUPT_FRAME;
+    }
+  }
+
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb) {
+  if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
+
+  // TODO(tomfinegan): Set the error handler here and throughout this file, and
+  // confirm parsing work done via aom_read_bit_buffer is successful.
+  struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL,
+                                    NULL };
+  aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header);
+  if (parse_result == AOM_CODEC_OK) *consumed = header->size;
+  return parse_result;
+}
+
+aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
+    int operating_point_idc, unsigned int *number_spatial_layers,
+    unsigned int *number_temporal_layers) {
+  // derive number of spatial/temporal layers from operating_point_idc
+
+  if (!number_spatial_layers || !number_temporal_layers)
+    return AOM_CODEC_INVALID_PARAM;
+
+  if (operating_point_idc == 0) {
+    *number_temporal_layers = 1;
+    *number_spatial_layers = 1;
+  } else {
+    *number_spatial_layers = 0;
+    *number_temporal_layers = 0;
+    for (int j = 0; j < MAX_NUM_SPATIAL_LAYERS; j++) {
+      *number_spatial_layers +=
+          (operating_point_idc >> (j + MAX_NUM_TEMPORAL_LAYERS)) & 0x1;
+    }
+    for (int j = 0; j < MAX_NUM_TEMPORAL_LAYERS; j++) {
+      *number_temporal_layers += (operating_point_idc >> j) & 0x1;
+    }
+  }
+
+  return AOM_CODEC_OK;
+}
+
+static int is_obu_in_current_operating_point(AV1Decoder *pbi,
+                                             ObuHeader obu_header) {
+  if (!pbi->current_operating_point) {
+    return 1;
+  }
+
+  if ((pbi->current_operating_point >> obu_header.temporal_layer_id) & 0x1 &&
+      (pbi->current_operating_point >> (obu_header.spatial_layer_id + 8)) &
+          0x1) {
+    return 1;
+  }
+  return 0;
+}
+
+static uint32_t read_temporal_delimiter_obu() { return 0; }
+
+// Returns a boolean that indicates success.
+static int read_bitstream_level(BitstreamLevel *bl,
+                                struct aom_read_bit_buffer *rb) {
+  const uint8_t seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);
+  if (!is_valid_seq_level_idx(seq_level_idx)) return 0;
+  bl->major = (seq_level_idx >> LEVEL_MINOR_BITS) + LEVEL_MAJOR_MIN;
+  bl->minor = seq_level_idx & ((1 << LEVEL_MINOR_BITS) - 1);
+  return 1;
+}
+
+// On success, sets pbi->sequence_header_ready to 1 and returns the number of
+// bytes read from 'rb'.
+// On failure, sets pbi->common.error.error_code and returns 0.
+static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
+                                         struct aom_read_bit_buffer *rb) {
+  AV1_COMMON *const cm = &pbi->common;
+  const uint32_t saved_bit_offset = rb->bit_offset;
+
+  // Verify rb has been configured to report errors.
+  assert(rb->error_handler);
+
+  cm->profile = av1_read_profile(rb);
+  if (cm->profile > PROFILE_2) {
+    cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+    return 0;
+  }
+
+  SequenceHeader *const seq_params = &cm->seq_params;
+
+  // Still picture or not
+  seq_params->still_picture = aom_rb_read_bit(rb);
+  seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb);
+  // Video must have reduced_still_picture_hdr = 0
+  if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) {
+    cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+    return 0;
+  }
+
+  if (seq_params->reduced_still_picture_hdr) {
+    cm->timing_info_present = 0;
+    seq_params->decoder_model_info_present_flag = 0;
+    seq_params->display_model_info_present_flag = 0;
+    seq_params->operating_points_cnt_minus_1 = 0;
+    seq_params->operating_point_idc[0] = 0;
+    if (!read_bitstream_level(&seq_params->level[0], rb)) {
+      cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+      return 0;
+    }
+    seq_params->tier[0] = 0;
+    cm->op_params[0].decoder_model_param_present_flag = 0;
+    cm->op_params[0].display_model_param_present_flag = 0;
+  } else {
+    cm->timing_info_present = aom_rb_read_bit(rb);  // timing_info_present_flag
+    if (cm->timing_info_present) {
+      av1_read_timing_info_header(cm, rb);
+
+      seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb);
+      if (seq_params->decoder_model_info_present_flag)
+        av1_read_decoder_model_info(cm, rb);
+    } else {
+      seq_params->decoder_model_info_present_flag = 0;
+    }
+    seq_params->display_model_info_present_flag = aom_rb_read_bit(rb);
+    seq_params->operating_points_cnt_minus_1 =
+        aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
+    for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
+      seq_params->operating_point_idc[i] =
+          aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
+      if (!read_bitstream_level(&seq_params->level[i], rb)) {
+        cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+        return 0;
+      }
+      // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7
+      // is equivalent to level 3.3.
+      if (seq_params->level[i].major > 3)
+        seq_params->tier[i] = aom_rb_read_bit(rb);
+      else
+        seq_params->tier[i] = 0;
+      if (seq_params->decoder_model_info_present_flag) {
+        cm->op_params[i].decoder_model_param_present_flag = aom_rb_read_bit(rb);
+        if (cm->op_params[i].decoder_model_param_present_flag)
+          av1_read_op_parameters_info(cm, rb, i);
+      } else {
+        cm->op_params[i].decoder_model_param_present_flag = 0;
+      }
+      if (cm->timing_info_present &&
+          (cm->timing_info.equal_picture_interval ||
+           cm->op_params[i].decoder_model_param_present_flag)) {
+        cm->op_params[i].bitrate = max_level_bitrate(
+            cm->profile, major_minor_to_seq_level_idx(seq_params->level[i]),
+            seq_params->tier[i]);
+        // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
+        // the check
+        if (cm->op_params[i].bitrate == 0)
+          aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                             "AV1 does not support this combination of "
+                             "profile, level, and tier.");
+        // Buffer size in bits/s is bitrate in bits/s * 1 s
+        cm->op_params[i].buffer_size = cm->op_params[i].bitrate;
+      }
+      if (cm->timing_info_present && cm->timing_info.equal_picture_interval &&
+          !cm->op_params[i].decoder_model_param_present_flag) {
+        // When the decoder_model_parameters are not sent for this op, set
+        // the default ones that can be used with the resource availability mode
+        cm->op_params[i].decoder_buffer_delay = 70000;
+        cm->op_params[i].encoder_buffer_delay = 20000;
+        cm->op_params[i].low_delay_mode_flag = 0;
+      }
+
+      if (seq_params->display_model_info_present_flag) {
+        cm->op_params[i].display_model_param_present_flag = aom_rb_read_bit(rb);
+        if (cm->op_params[i].display_model_param_present_flag) {
+          cm->op_params[i].initial_display_delay =
+              aom_rb_read_literal(rb, 4) + 1;
+          if (cm->op_params[i].initial_display_delay > 10)
+            aom_internal_error(
+                &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                "AV1 does not support more than 10 decoded frames delay");
+        } else {
+          cm->op_params[i].initial_display_delay = 10;
+        }
+      } else {
+        cm->op_params[i].display_model_param_present_flag = 0;
+        cm->op_params[i].initial_display_delay = 10;
+      }
+    }
+  }
+  // This decoder supports all levels.  Choose operating point provided by
+  // external means
+  int operating_point = pbi->operating_point;
+  if (operating_point < 0 ||
+      operating_point > seq_params->operating_points_cnt_minus_1)
+    operating_point = 0;
+  pbi->current_operating_point =
+      seq_params->operating_point_idc[operating_point];
+  if (aom_get_num_layers_from_operating_point_idc(
+          pbi->current_operating_point, &cm->number_spatial_layers,
+          &cm->number_temporal_layers) != AOM_CODEC_OK) {
+    cm->error.error_code = AOM_CODEC_ERROR;
+    return 0;
+  }
+
+  read_sequence_header(cm, rb);
+
+  av1_read_color_config(cm, rb, pbi->allow_lowbitdepth);
+
+  cm->film_grain_params_present = aom_rb_read_bit(rb);
+
+  if (av1_check_trailing_bits(pbi, rb) != 0) {
+    // cm->error.error_code is already set.
+    return 0;
+  }
+
+  pbi->sequence_header_ready = 1;
+
+  return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
+}
+
+static uint32_t read_frame_header_obu(AV1Decoder *pbi,
+                                      struct aom_read_bit_buffer *rb,
+                                      const uint8_t *data,
+                                      const uint8_t **p_data_end,
+                                      int trailing_bits_present) {
+  av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end,
+                                     trailing_bits_present);
+  return (uint32_t)(pbi->uncomp_hdr_size);
+}
+
+static int32_t read_tile_group_header(AV1Decoder *pbi,
+                                      struct aom_read_bit_buffer *rb,
+                                      int *start_tile, int *end_tile,
+                                      int tile_start_implicit) {
+  AV1_COMMON *const cm = &pbi->common;
+  uint32_t saved_bit_offset = rb->bit_offset;
+  int tile_start_and_end_present_flag = 0;
+  const int num_tiles = pbi->common.tile_rows * pbi->common.tile_cols;
+
+  if (!pbi->common.large_scale_tile && num_tiles > 1) {
+    tile_start_and_end_present_flag = aom_rb_read_bit(rb);
+  }
+  if (pbi->common.large_scale_tile || num_tiles == 1 ||
+      !tile_start_and_end_present_flag) {
+    *start_tile = 0;
+    *end_tile = num_tiles - 1;
+    return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
+  }
+  if (tile_start_implicit && tile_start_and_end_present_flag) {
+    aom_internal_error(
+        &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0");
+    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    return -1;
+  }
+  *start_tile =
+      aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+  *end_tile = aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+
+  return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
+}
+
+static uint32_t read_one_tile_group_obu(
+    AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int is_first_tg,
+    const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end,
+    int *is_last_tg, int tile_start_implicit) {
+  AV1_COMMON *const cm = &pbi->common;
+  int start_tile, end_tile;
+  int32_t header_size, tg_payload_size;
+
+  header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile,
+                                       tile_start_implicit);
+  if (header_size == -1) return 0;
+  if (start_tile > end_tile) return header_size;
+  data += header_size;
+  av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile,
+                                 end_tile, is_first_tg);
+
+  tg_payload_size = (uint32_t)(*p_data_end - data);
+
+  // TODO(shan):  For now, assume all tile groups received in order
+  *is_last_tg = end_tile == cm->tile_rows * cm->tile_cols - 1;
+  return header_size + tg_payload_size;
+}
+
+// Only called while large_scale_tile = 1.
+static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
+                                              struct aom_read_bit_buffer *rb,
+                                              const uint8_t *data,
+                                              const uint8_t *data_end,
+                                              const uint8_t **p_data_end,
+                                              int *frame_decoding_finished) {
+  AV1_COMMON *const cm = &pbi->common;
+  uint32_t tile_list_payload_size = 0;
+  const int num_tiles = cm->tile_cols * cm->tile_rows;
+  const int start_tile = 0;
+  const int end_tile = num_tiles - 1;
+  int i = 0;
+
+  // Process the tile list info.
+  pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+  pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+  pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16);
+  if (pbi->tile_count_minus_1 > 511) {
+    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    return 0;
+  }
+
+  // Allocate output frame buffer for the tile list.
+  // TODO(yunqing): for now, copy each tile's decoded YUV data directly to the
+  // output buffer. This needs to be modified according to the application
+  // requirement.
+  const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
+  const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
+  const int ssy = cm->subsampling_y;
+  const int ssx = cm->subsampling_x;
+  const int num_planes = av1_num_planes(cm);
+  const size_t yplane_tile_size = tile_height_in_pixels * tile_width_in_pixels;
+  const size_t uvplane_tile_size =
+      (num_planes > 1)
+          ? (tile_height_in_pixels >> ssy) * (tile_width_in_pixels >> ssx)
+          : 0;
+  const size_t tile_size = (cm->use_highbitdepth ? 2 : 1) *
+                           (yplane_tile_size + 2 * uvplane_tile_size);
+  pbi->tile_list_size = tile_size * (pbi->tile_count_minus_1 + 1);
+
+  if (pbi->tile_list_size > pbi->buffer_sz) {
+    if (pbi->tile_list_output != NULL) aom_free(pbi->tile_list_output);
+    pbi->tile_list_output = NULL;
+
+    pbi->tile_list_output = (uint8_t *)aom_memalign(32, pbi->tile_list_size);
+    if (pbi->tile_list_output == NULL)
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate the tile list output buffer");
+    pbi->buffer_sz = pbi->tile_list_size;
+  }
+
+  uint32_t tile_list_info_bytes = 4;
+  tile_list_payload_size += tile_list_info_bytes;
+  data += tile_list_info_bytes;
+  uint8_t *output = pbi->tile_list_output;
+
+  for (i = 0; i <= pbi->tile_count_minus_1; i++) {
+    // Process 1 tile.
+    // Reset the bit reader.
+    rb->bit_offset = 0;
+    rb->bit_buffer = data;
+
+    // Read out the tile info.
+    uint32_t tile_info_bytes = 5;
+    // Set reference for each tile.
+    int ref_idx = aom_rb_read_literal(rb, 8);
+    if (ref_idx >= MAX_EXTERNAL_REFERENCES) {
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return 0;
+    }
+    av1_set_reference_dec(cm, 0, 1, &pbi->ext_refs.refs[ref_idx]);
+
+    pbi->dec_tile_row = aom_rb_read_literal(rb, 8);
+    pbi->dec_tile_col = aom_rb_read_literal(rb, 8);
+    if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 ||
+        pbi->dec_tile_row >= cm->tile_rows ||
+        pbi->dec_tile_col >= cm->tile_cols) {
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return 0;
+    }
+
+    pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1;
+    data += tile_info_bytes;
+    if ((size_t)(data_end - data) < pbi->coded_tile_data_size) {
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return 0;
+    }
+
+    av1_decode_tg_tiles_and_wrapup(pbi, data, data + pbi->coded_tile_data_size,
+                                   p_data_end, start_tile, end_tile, 0);
+    uint32_t tile_payload_size = (uint32_t)(*p_data_end - data);
+
+    tile_list_payload_size += tile_info_bytes + tile_payload_size;
+
+    // Update data ptr for next tile decoding.
+    data = *p_data_end;
+    assert(data <= data_end);
+
+    // Copy decoded tile to the tile list output buffer.
+    YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm);
+    const int mi_row = pbi->dec_tile_row * cm->tile_height;
+    const int mi_col = pbi->dec_tile_col * cm->tile_width;
+    const int is_hbd = (cur_frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+    uint8_t *bufs[MAX_MB_PLANE] = { NULL, NULL, NULL };
+    int strides[MAX_MB_PLANE] = { 0, 0, 0 };
+    int plane;
+
+    for (plane = 0; plane < num_planes; ++plane) {
+      int shift_x = plane > 0 ? ssx : 0;
+      int shift_y = plane > 0 ? ssy : 0;
+
+      bufs[plane] = cur_frame->buffers[plane];
+      strides[plane] =
+          (plane > 0) ? cur_frame->strides[1] : cur_frame->strides[0];
+      if (is_hbd) {
+        bufs[plane] = (uint8_t *)CONVERT_TO_SHORTPTR(cur_frame->buffers[plane]);
+        strides[plane] =
+            (plane > 0) ? 2 * cur_frame->strides[1] : 2 * cur_frame->strides[0];
+      }
+
+      bufs[plane] += mi_row * (MI_SIZE >> shift_y) * strides[plane] +
+                     mi_col * (MI_SIZE >> shift_x);
+
+      int w, h;
+      w = (plane > 0 && shift_x > 0) ? ((tile_width_in_pixels + 1) >> shift_x)
+                                     : tile_width_in_pixels;
+      w *= (1 + is_hbd);
+      h = (plane > 0 && shift_y > 0) ? ((tile_height_in_pixels + 1) >> shift_y)
+                                     : tile_height_in_pixels;
+      int j;
+
+      for (j = 0; j < h; ++j) {
+        memcpy(output, bufs[plane], w);
+        bufs[plane] += strides[plane];
+        output += w;
+      }
+    }
+  }
+
+  *frame_decoding_finished = 1;
+  return tile_list_payload_size;
+}
+
+static void read_metadata_itut_t35(const uint8_t *data, size_t sz) {
+  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+  for (size_t i = 0; i < sz; i++) {
+    aom_rb_read_literal(&rb, 8);
+  }
+}
+
+static void read_metadata_hdr_cll(const uint8_t *data, size_t sz) {
+  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+  aom_rb_read_literal(&rb, 16);  // max_cll
+  aom_rb_read_literal(&rb, 16);  // max_fall
+}
+
+static void read_metadata_hdr_mdcv(const uint8_t *data, size_t sz) {
+  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+  for (int i = 0; i < 3; i++) {
+    aom_rb_read_literal(&rb, 16);  // primary_i_chromaticity_x
+    aom_rb_read_literal(&rb, 16);  // primary_i_chromaticity_y
+  }
+
+  aom_rb_read_literal(&rb, 16);  // white_point_chromaticity_x
+  aom_rb_read_literal(&rb, 16);  // white_point_chromaticity_y
+
+  aom_rb_read_unsigned_literal(&rb, 32);  // luminance_max
+  aom_rb_read_unsigned_literal(&rb, 32);  // luminance_min
+}
+
+static void scalability_structure(struct aom_read_bit_buffer *rb) {
+  int spatial_layers_cnt = aom_rb_read_literal(rb, 2);
+  int spatial_layer_dimensions_present_flag = aom_rb_read_literal(rb, 1);
+  int spatial_layer_description_present_flag = aom_rb_read_literal(rb, 1);
+  int temporal_group_description_present_flag = aom_rb_read_literal(rb, 1);
+  aom_rb_read_literal(rb, 3);  // reserved
+
+  if (spatial_layer_dimensions_present_flag) {
+    int i;
+    for (i = 0; i < spatial_layers_cnt + 1; i++) {
+      aom_rb_read_literal(rb, 16);
+      aom_rb_read_literal(rb, 16);
+    }
+  }
+  if (spatial_layer_description_present_flag) {
+    int i;
+    for (i = 0; i < spatial_layers_cnt + 1; i++) {
+      aom_rb_read_literal(rb, 8);
+    }
+  }
+  if (temporal_group_description_present_flag) {
+    int i, j, temporal_group_size;
+    temporal_group_size = aom_rb_read_literal(rb, 8);
+    for (i = 0; i < temporal_group_size; i++) {
+      aom_rb_read_literal(rb, 3);
+      aom_rb_read_literal(rb, 1);
+      aom_rb_read_literal(rb, 1);
+      int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3);
+      for (j = 0; j < temporal_group_ref_cnt; j++) {
+        aom_rb_read_literal(rb, 8);
+      }
+    }
+  }
+}
+
+static void read_metadata_scalability(const uint8_t *data, size_t sz) {
+  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+  int scalability_mode_idc = aom_rb_read_literal(&rb, 8);
+  if (scalability_mode_idc == SCALABILITY_SS) {
+    scalability_structure(&rb);
+  }
+}
+
+static void read_metadata_timecode(const uint8_t *data, size_t sz) {
+  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+  aom_rb_read_literal(&rb, 5);                     // counting_type f(5)
+  int full_timestamp_flag = aom_rb_read_bit(&rb);  // full_timestamp_flag f(1)
+  aom_rb_read_bit(&rb);                            // discontinuity_flag (f1)
+  aom_rb_read_bit(&rb);                            // cnt_dropped_flag f(1)
+  aom_rb_read_literal(&rb, 9);                     // n_frames f(9)
+  if (full_timestamp_flag) {
+    aom_rb_read_literal(&rb, 6);  // seconds_value f(6)
+    aom_rb_read_literal(&rb, 6);  // minutes_value f(6)
+    aom_rb_read_literal(&rb, 5);  // hours_value f(5)
+  } else {
+    int seconds_flag = aom_rb_read_bit(&rb);  // seconds_flag f(1)
+    if (seconds_flag) {
+      aom_rb_read_literal(&rb, 6);              // seconds_value f(6)
+      int minutes_flag = aom_rb_read_bit(&rb);  // minutes_flag f(1)
+      if (minutes_flag) {
+        aom_rb_read_literal(&rb, 6);            // minutes_value f(6)
+        int hours_flag = aom_rb_read_bit(&rb);  // hours_flag f(1)
+        if (hours_flag) {
+          aom_rb_read_literal(&rb, 5);  // hours_value f(5)
+        }
+      }
+    }
+  }
+  // time_offset_length f(5)
+  int time_offset_length = aom_rb_read_literal(&rb, 5);
+  if (time_offset_length) {
+    aom_rb_read_literal(&rb, time_offset_length);  // f(time_offset_length)
+  }
+}
+
+static size_t read_metadata(const uint8_t *data, size_t sz) {
+  size_t type_length;
+  uint64_t type_value;
+  OBU_METADATA_TYPE metadata_type;
+  if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) {
+    return sz;
+  }
+  metadata_type = (OBU_METADATA_TYPE)type_value;
+  if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) {
+    read_metadata_itut_t35(data + type_length, sz - type_length);
+  } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) {
+    read_metadata_hdr_cll(data + type_length, sz - type_length);
+  } else if (metadata_type == OBU_METADATA_TYPE_HDR_MDCV) {
+    read_metadata_hdr_mdcv(data + type_length, sz - type_length);
+  } else if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) {
+    read_metadata_scalability(data + type_length, sz - type_length);
+  } else if (metadata_type == OBU_METADATA_TYPE_TIMECODE) {
+    read_metadata_timecode(data + type_length, sz - type_length);
+  }
+
+  return sz;
+}
+
+static aom_codec_err_t read_obu_size(const uint8_t *data,
+                                     size_t bytes_available,
+                                     size_t *const obu_size,
+                                     size_t *const length_field_size) {
+  uint64_t u_obu_size = 0;
+  if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
+      0) {
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
+  *obu_size = (size_t)u_obu_size;
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+                                             size_t bytes_available,
+                                             int is_annexb,
+                                             ObuHeader *obu_header,
+                                             size_t *const payload_size,
+                                             size_t *const bytes_read) {
+  size_t length_field_size = 0, obu_size = 0;
+  aom_codec_err_t status;
+
+  if (is_annexb) {
+    // Size field comes before the OBU header, and includes the OBU header
+    status =
+        read_obu_size(data, bytes_available, &obu_size, &length_field_size);
+
+    if (status != AOM_CODEC_OK) return status;
+  }
+
+  struct aom_read_bit_buffer rb = { data + length_field_size,
+                                    data + bytes_available, 0, NULL, NULL };
+
+  status = read_obu_header(&rb, is_annexb, obu_header);
+  if (status != AOM_CODEC_OK) return status;
+
+  if (is_annexb) {
+    // Derive the payload size from the data we've already read
+    if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
+
+    *payload_size = obu_size - obu_header->size;
+  } else {
+    // Size field comes after the OBU header, and is just the payload size
+    status = read_obu_size(data + obu_header->size,
+                           bytes_available - obu_header->size, payload_size,
+                           &length_field_size);
+    if (status != AOM_CODEC_OK) return status;
+  }
+
+  *bytes_read = length_field_size + obu_header->size;
+  return AOM_CODEC_OK;
+}
+
+#define EXT_TILE_DEBUG 0
+// On success, returns a boolean that indicates whether the decoding of the
+// current frame is finished. On failure, sets cm->error.error_code and
+// returns -1.
+int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
+                               const uint8_t *data_end,
+                               const uint8_t **p_data_end) {
+  AV1_COMMON *const cm = &pbi->common;
+  int frame_decoding_finished = 0;
+  int is_first_tg_obu_received = 1;
+  int frame_header_size = 0;
+  int seq_header_received = 0;
+  size_t seq_header_size = 0;
+  ObuHeader obu_header;
+  memset(&obu_header, 0, sizeof(obu_header));
+  pbi->seen_frame_header = 0;
+
+  if (data_end < data) {
+    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    return -1;
+  }
+
+  // Reset pbi->camera_frame_header_ready to 0 if cm->large_scale_tile = 0.
+  if (!cm->large_scale_tile) pbi->camera_frame_header_ready = 0;
+
+  // decode frame as a series of OBUs
+  while (!frame_decoding_finished && !cm->error.error_code) {
+    struct aom_read_bit_buffer rb;
+    size_t payload_size = 0;
+    size_t decoded_payload_size = 0;
+    size_t obu_payload_offset = 0;
+    size_t bytes_read = 0;
+    const size_t bytes_available = data_end - data;
+
+    if (bytes_available == 0 && !pbi->seen_frame_header) {
+      *p_data_end = data;
+      cm->error.error_code = AOM_CODEC_OK;
+      break;
+    }
+
+    aom_codec_err_t status =
+        aom_read_obu_header_and_size(data, bytes_available, cm->is_annexb,
+                                     &obu_header, &payload_size, &bytes_read);
+
+    if (status != AOM_CODEC_OK) {
+      cm->error.error_code = status;
+      return -1;
+    }
+
+    // Record obu size header information.
+    pbi->obu_size_hdr.data = data + obu_header.size;
+    pbi->obu_size_hdr.size = bytes_read - obu_header.size;
+
+    // Note: aom_read_obu_header_and_size() takes care of checking that this
+    // doesn't cause 'data' to advance past 'data_end'.
+    data += bytes_read;
+
+    if ((size_t)(data_end - data) < payload_size) {
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return -1;
+    }
+
+    cm->temporal_layer_id = obu_header.temporal_layer_id;
+    cm->spatial_layer_id = obu_header.spatial_layer_id;
+
+    if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
+        obu_header.type != OBU_SEQUENCE_HEADER &&
+        obu_header.type != OBU_PADDING) {
+      // don't decode obu if it's not in current operating mode
+      if (!is_obu_in_current_operating_point(pbi, obu_header)) {
+        data += payload_size;
+        continue;
+      }
+    }
+
+    av1_init_read_bit_buffer(pbi, &rb, data, data_end);
+
+    switch (obu_header.type) {
+      case OBU_TEMPORAL_DELIMITER:
+        decoded_payload_size = read_temporal_delimiter_obu();
+        pbi->seen_frame_header = 0;
+        break;
+      case OBU_SEQUENCE_HEADER:
+        if (!seq_header_received) {
+          decoded_payload_size = read_sequence_header_obu(pbi, &rb);
+          if (cm->error.error_code != AOM_CODEC_OK) return -1;
+
+          seq_header_size = decoded_payload_size;
+          seq_header_received = 1;
+        } else {
+          // Seeing another sequence header, skip as all sequence headers are
+          // required to be identical except for the contents of
+          // operating_parameters_info and the amount of trailing bits.
+          // TODO(yaowu): verifying redundant sequence headers are identical.
+          decoded_payload_size = seq_header_size;
+        }
+        break;
+      case OBU_FRAME_HEADER:
+      case OBU_REDUNDANT_FRAME_HEADER:
+      case OBU_FRAME:
+        // Only decode first frame header received
+        if (!pbi->seen_frame_header ||
+            (cm->large_scale_tile && !pbi->camera_frame_header_ready)) {
+          pbi->seen_frame_header = 1;
+          frame_header_size = read_frame_header_obu(
+              pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME);
+          if (cm->large_scale_tile) pbi->camera_frame_header_ready = 1;
+        }
+        decoded_payload_size = frame_header_size;
+        pbi->frame_header_size = (size_t)frame_header_size;
+
+        if (cm->show_existing_frame) {
+          frame_decoding_finished = 1;
+          pbi->seen_frame_header = 0;
+          break;
+        }
+
+#if !EXT_TILE_DEBUG
+        // In large scale tile coding, decode the common camera frame header
+        // before any tile list OBU.
+        if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) {
+          frame_decoding_finished = 1;
+          // Skip the rest of the frame data.
+          decoded_payload_size = payload_size;
+          // Update data_end.
+          *p_data_end = data_end;
+          break;
+        }
+#endif  // EXT_TILE_DEBUG
+
+        if (obu_header.type != OBU_FRAME) break;
+        obu_payload_offset = frame_header_size;
+        AOM_FALLTHROUGH_INTENDED;  // fall through to read tile group.
+      case OBU_TILE_GROUP:
+        if (!pbi->seen_frame_header) {
+          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          return -1;
+        }
+        if ((size_t)(data_end - data) < obu_payload_offset) {
+          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          return -1;
+        }
+        decoded_payload_size += read_one_tile_group_obu(
+            pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset,
+            data + payload_size, p_data_end, &frame_decoding_finished,
+            obu_header.type == OBU_FRAME);
+        is_first_tg_obu_received = 0;
+        if (frame_decoding_finished) pbi->seen_frame_header = 0;
+        break;
+      case OBU_METADATA:
+        decoded_payload_size = read_metadata(data, payload_size);
+        break;
+      case OBU_TILE_LIST:
+        // This OBU type is purely for the large scale tile coding mode.
+        // The common camera frame header has to be already decoded.
+        if (!pbi->camera_frame_header_ready) {
+          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          return -1;
+        }
+
+        cm->large_scale_tile = 1;
+        av1_set_single_tile_decoding_mode(cm);
+        decoded_payload_size =
+            read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size,
+                                          p_data_end, &frame_decoding_finished);
+        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        break;
+      case OBU_PADDING:
+      default:
+        // Skip unrecognized OBUs
+        decoded_payload_size = payload_size;
+        break;
+    }
+
+    // Check that the signalled OBU size matches the actual amount of data read
+    if (decoded_payload_size > payload_size) {
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return -1;
+    }
+
+    // If there are extra padding bytes, they should all be zero
+    while (decoded_payload_size < payload_size) {
+      uint8_t padding_byte = data[decoded_payload_size++];
+      if (padding_byte != 0) {
+        cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+        return -1;
+      }
+    }
+
+    data += payload_size;
+  }
+
+  return frame_decoding_finished;
+}
+#undef EXT_TILE_DEBUG
diff --git a/third_party/aom/av1/decoder/obu.h b/third_party/aom/av1/decoder/obu.h
new file mode 100644
index 000000000..5f2197058
--- /dev/null
+++ b/third_party/aom/av1/decoder/obu.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_DECODER_OBU_H
+#define AV1_DECODER_OBU_H
+
+#include "aom/aom_codec.h"
+#include "av1/decoder/decoder.h"
+
+typedef struct {
+  size_t size;  // Size (1 or 2 bytes) of the OBU header (including the
+                // optional OBU extension header) in the bitstream.
+  OBU_TYPE type;
+  int has_size_field;
+  int has_extension;
+  // The following fields come from the OBU extension header and therefore are
+  // only used if has_extension is true.
+  int temporal_layer_id;
+  int spatial_layer_id;
+} ObuHeader;
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb);
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+                                             size_t bytes_available,
+                                             int is_annexb,
+                                             ObuHeader *obu_header,
+                                             size_t *const payload_size,
+                                             size_t *const bytes_read);
+
+// Try to decode one frame from a buffer.
+// Returns 1 if we decoded a frame,
+//         0 if we didn't decode a frame but that's okay
+//           (eg, if there was a frame but we skipped it),
+//     or -1 on error
+int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
+                               const uint8_t *data_end,
+                               const uint8_t **p_data_end);
+
+aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
+    int operating_point_idc, unsigned int *num_spatial_layers,
+    unsigned int *num_temporal_layers);
+
+#endif
diff --git a/third_party/aom/av1/decoder/pvq_decoder.c b/third_party/aom/av1/decoder/pvq_decoder.c
deleted file mode 100644
index d9a8e8056..000000000
--- a/third_party/aom/av1/decoder/pvq_decoder.c
+++ /dev/null
@@ -1,378 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "./aom_config.h"
-#include "aom_dsp/bitreader.h"
-#include "aom_dsp/entcode.h"
-#include "aom_dsp/entdec.h"
-#include "av1/common/odintrin.h"
-#include "av1/common/partition.h"
-#include "av1/common/pvq_state.h"
-#include "av1/decoder/decint.h"
-#include "av1/decoder/pvq_decoder.h"
-#include "aom_ports/system_state.h"
-
-int aom_read_symbol_pvq_(aom_reader *r, aom_cdf_prob *cdf, int nsymbs
- ACCT_STR_PARAM) {
-  if (cdf[0] == 0)
-    aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs));
-  return aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME);
-}
-
-static void aom_decode_pvq_codeword(aom_reader *r, od_pvq_codeword_ctx *ctx,
- od_coeff *y, int n, int k) {
-  int i;
-  aom_decode_band_pvq_splits(r, ctx, y, n, k, 0);
-  for (i = 0; i < n; i++) {
-    if (y[i] && aom_read_bit(r, "pvq:sign")) y[i] = -y[i];
-  }
-}
-
-/** Inverse of neg_interleave; decodes the interleaved gain.
- *
- * @param [in]      x      quantized/interleaved gain to decode
- * @param [in]      ref    quantized gain of the reference
- * @return                 original quantized gain value
- */
-static int neg_deinterleave(int x, int ref) {
-  if (x < 2*ref-1) {
-    if (x & 1) return ref - 1 - (x >> 1);
-    else return ref + (x >> 1);
-  }
-  else return x+1;
-}
-
-/** Synthesizes one parition of coefficient values from a PVQ-encoded
- * vector.
- *
- * @param [out]     xcoeff  output coefficient partition (x in math doc)
- * @param [in]      ypulse  PVQ-encoded values (y in math doc); in the noref
- *                          case, this vector has n entries, in the
- *                          reference case it contains n-1 entries
- *                          (the m-th entry is not included)
- * @param [in]      ref     reference vector (prediction)
- * @param [in]      n       number of elements in this partition
- * @param [in]      gr      gain of the reference vector (prediction)
- * @param [in]      noref   indicates presence or lack of prediction
- * @param [in]      g       decoded quantized vector gain
- * @param [in]      theta   decoded theta (prediction error)
- * @param [in]      qm      QM with magnitude compensation
- * @param [in]      qm_inv  Inverse of QM with magnitude compensation
- */
-static void pvq_synthesis(od_coeff *xcoeff, od_coeff *ypulse, od_val16 *r16,
- int n, od_val32 gr, int noref, od_val32 g, od_val32 theta, const int16_t *qm_inv,
- int shift) {
-  int s;
-  int m;
-  /* Sign of the Householder reflection vector */
-  s = 0;
-  /* Direction of the Householder reflection vector */
-  m = noref ? 0 : od_compute_householder(r16, n, gr, &s, shift);
-  od_pvq_synthesis_partial(xcoeff, ypulse, r16, n, noref, g, theta, m, s,
-   qm_inv);
-}
-
-typedef struct {
-  od_coeff *ref;
-  int nb_coeffs;
-  int allow_flip;
-} cfl_ctx;
-
-/** Decodes a single vector of integers (eg, a partition within a
- *  coefficient block) encoded using PVQ
- *
- * @param [in,out] ec          range encoder
- * @param [in]     q0          scale/quantizer
- * @param [in]     n           number of coefficients in partition
- * @param [in,out] model       entropy decoder state
- * @param [in,out] adapt       adaptation context
- * @param [in,out] exg         ExQ16 expectation of decoded gain value
- * @param [in,out] ext         ExQ16 expectation of decoded theta value
- * @param [in]     ref         'reference' (prediction) vector
- * @param [out]    out         decoded partition
- * @param [out]    noref       boolean indicating absence of reference
- * @param [in]     beta        per-band activity masking beta param
- * @param [in]     is_keyframe whether we're encoding a keyframe
- * @param [in]     pli         plane index
- * @param [in]     cdf_ctx     selects which cdf context to use
- * @param [in,out] skip_rest   whether to skip further bands in each direction
- * @param [in]     band        index of the band being decoded
- * @param [in]     band        index of the band being decoded
- * @param [out]    skip        skip flag with range [0,1]
- * @param [in]     qm          QM with magnitude compensation
- * @param [in]     qm_inv      Inverse of QM with magnitude compensation
- */
-static void pvq_decode_partition(aom_reader *r,
-                                 int q0,
-                                 int n,
-                                 generic_encoder model[3],
-                                 od_adapt_ctx *adapt,
-                                 int *exg,
-                                 int *ext,
-                                 od_coeff *ref,
-                                 od_coeff *out,
-                                 int *noref,
-                                 od_val16 beta,
-                                 int is_keyframe,
-                                 int pli,
-                                 int cdf_ctx,
-                                 cfl_ctx *cfl,
-                                 int has_skip,
-                                 int *skip_rest,
-                                 int band,
-                                 int *skip,
-                                 const int16_t *qm,
-                                 const int16_t *qm_inv) {
-  int k;
-  od_val32 qcg;
-  int itheta;
-  od_val32 theta;
-  od_val32 gr;
-  od_val32 gain_offset;
-  od_coeff y[MAXN];
-  int qg;
-  int id;
-  int i;
-  od_val16 ref16[MAXN];
-  int rshift;
-  theta = 0;
-  gr = 0;
-  gain_offset = 0;
-  /* Skip is per-direction. For band=0, we can use any of the flags. */
-  if (skip_rest[(band + 2) % 3]) {
-    qg = 0;
-    if (is_keyframe) {
-      itheta = -1;
-      *noref = 1;
-    }
-    else {
-      itheta = 0;
-      *noref = 0;
-    }
-  }
-  else {
-    /* Jointly decode gain, itheta and noref for small values. Then we handle
-       larger gain. */
-    id = aom_read_symbol_pvq(r, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
-     8 + 7*has_skip, "pvq:gaintheta");
-    if (!is_keyframe && id >= 10) id++;
-    if (is_keyframe && id >= 8) id++;
-    if (id >= 8) {
-      id -= 8;
-      skip_rest[0] = skip_rest[1] = skip_rest[2] = 1;
-    }
-    qg = id & 1;
-    itheta = (id >> 1) - 1;
-    *noref = (itheta == -1);
-  }
-  /* The CfL flip bit is only decoded on the first band that has noref=0. */
-  if (cfl->allow_flip && !*noref) {
-    int flip;
-    flip = aom_read_bit(r, "cfl:flip");
-    if (flip) {
-      for (i = 0; i < cfl->nb_coeffs; i++) cfl->ref[i] = -cfl->ref[i];
-    }
-    cfl->allow_flip = 0;
-  }
-  if (qg > 0) {
-    int tmp;
-    tmp = *exg;
-    qg = 1 + generic_decode(r, &model[!*noref], &tmp, 2, "pvq:gain");
-    OD_IIR_DIADIC(*exg, qg << 16, 2);
-  }
-  *skip = 0;
-#if defined(OD_FLOAT_PVQ)
-  rshift = 0;
-#else
-  /* Shift needed to make the reference fit in 15 bits, so that the Householder
-     vector can fit in 16 bits. */
-  rshift = OD_MAXI(0, od_vector_log_mag(ref, n) - 14);
-#endif
-  for (i = 0; i < n; i++) {
-#if defined(OD_FLOAT_PVQ)
-    ref16[i] = ref[i]*(double)qm[i]*OD_QM_SCALE_1;
-#else
-    ref16[i] = OD_SHR_ROUND(ref[i]*qm[i], OD_QM_SHIFT + rshift);
-#endif
-  }
-  if(!*noref){
-    /* we have a reference; compute its gain */
-    od_val32 cgr;
-    int icgr;
-    int cfl_enabled;
-    cfl_enabled = pli != 0 && is_keyframe && !OD_DISABLE_CFL;
-    cgr = od_pvq_compute_gain(ref16, n, q0, &gr, beta, rshift);
-    if (cfl_enabled) cgr = OD_CGAIN_SCALE;
-#if defined(OD_FLOAT_PVQ)
-    icgr = (int)floor(.5 + cgr);
-#else
-    icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
-#endif
-    /* quantized gain is interleave encoded when there's a reference;
-       deinterleave it now */
-    if (is_keyframe) qg = neg_deinterleave(qg, icgr);
-    else {
-      qg = neg_deinterleave(qg, icgr + 1) - 1;
-      if (qg == 0) *skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
-    }
-    if (qg == icgr && itheta == 0 && !cfl_enabled) *skip = OD_PVQ_SKIP_COPY;
-    gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
-    qcg = OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset;
-    /* read and decode first-stage PVQ error theta */
-    if (itheta > 1) {
-      int tmp;
-      tmp = *ext;
-      itheta = 2 + generic_decode(r, &model[2], &tmp, 2, "pvq:theta");
-      OD_IIR_DIADIC(*ext, itheta << 16, 2);
-    }
-    theta = od_pvq_compute_theta(itheta, od_pvq_compute_max_theta(qcg, beta));
-  }
-  else{
-    itheta = 0;
-    if (!is_keyframe) qg++;
-    qcg = OD_SHL(qg, OD_CGAIN_SHIFT);
-    if (qg == 0) *skip = OD_PVQ_SKIP_ZERO;
-  }
-
-  k = od_pvq_compute_k(qcg, itheta, *noref, n, beta);
-  if (k != 0) {
-    /* when noref==0, y is actually size n-1 */
-    aom_decode_pvq_codeword(r, &adapt->pvq.pvq_codeword_ctx, y,
-     n - !*noref, k);
-  }
-  else {
-    OD_CLEAR(y, n);
-  }
-  if (*skip) {
-    if (*skip == OD_PVQ_SKIP_COPY) OD_COPY(out, ref, n);
-    else OD_CLEAR(out, n);
-  }
-  else {
-    od_val32 g;
-    g = od_gain_expand(qcg, q0, beta);
-    pvq_synthesis(out, y, ref16, n, gr, *noref, g, theta, qm_inv, rshift);
-  }
-  /* If OD_PVQ_SKIP_ZERO or OD_PVQ_SKIP_COPY, set skip to 1 for visualization */
-  if (*skip) *skip = 1;
-}
-
-/** Decodes a coefficient block (except for DC) encoded using PVQ
- *
- * @param [in,out] dec         daala decoder context
- * @param [in]     ref         'reference' (prediction) vector
- * @param [out]    out         decoded partition
- * @param [in]     q0          quantizer
- * @param [in]     pli         plane index
- * @param [in]     bs          log of the block size minus two
- * @param [in]     beta        per-band activity masking beta param
- * @param [in]     is_keyframe whether we're encoding a keyframe
- * @param [out]    flags       bitmask of the per band skip and noref flags
- * @param [in]     ac_dc_coded skip flag for the block (range 0-3)
- * @param [in]     qm          QM with magnitude compensation
- * @param [in]     qm_inv      Inverse of QM with magnitude compensation
- */
-void od_pvq_decode(daala_dec_ctx *dec,
-                   od_coeff *ref,
-                   od_coeff *out,
-                   int q0,
-                   int pli,
-                   int bs,
-                   const od_val16 *beta,
-                   int is_keyframe,
-                   unsigned int *flags,
-                   PVQ_SKIP_TYPE ac_dc_coded,
-                   const int16_t *qm,
-                   const int16_t *qm_inv){
-
-  int noref[PVQ_MAX_PARTITIONS];
-  int skip[PVQ_MAX_PARTITIONS];
-  int *exg;
-  int *ext;
-  int nb_bands;
-  int i;
-  const int *off;
-  int size[PVQ_MAX_PARTITIONS];
-  generic_encoder *model;
-  int skip_rest[3] = {0};
-  cfl_ctx cfl;
-  const unsigned char *pvq_qm;
-  int use_masking;
-
-  aom_clear_system_state();
-
-  /*Default to skip=1 and noref=0 for all bands.*/
-  for (i = 0; i < PVQ_MAX_PARTITIONS; i++) {
-    noref[i] = 0;
-    skip[i] = 1;
-  }
-
-  use_masking = dec->use_activity_masking;
-
-  if (use_masking)
-    pvq_qm = &dec->state.pvq_qm_q4[pli][0];
-  else
-    pvq_qm = 0;
-
-  exg = &dec->state.adapt->pvq.pvq_exg[pli][bs][0];
-  ext = dec->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS;
-  model = dec->state.adapt->pvq.pvq_param_model;
-  nb_bands = OD_BAND_OFFSETS[bs][0];
-  off = &OD_BAND_OFFSETS[bs][1];
-  out[0] = ac_dc_coded & DC_CODED;
-  if (ac_dc_coded < AC_CODED) {
-    if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;
-    else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i];
-  }
-  else {
-    for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i];
-    cfl.ref = ref;
-    cfl.nb_coeffs = off[nb_bands];
-    cfl.allow_flip = pli != 0 && is_keyframe;
-    for (i = 0; i < nb_bands; i++) {
-      int q;
-
-      if (use_masking)
-        q = OD_MAXI(1, q0 * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4);
-      else
-        q = OD_MAXI(1, q0);
-
-      pvq_decode_partition(dec->r, q, size[i],
-       model, dec->state.adapt, exg + i, ext + i, ref + off[i], out + off[i],
-       &noref[i], beta[i], is_keyframe, pli,
-       (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i,
-       &cfl, i == 0 && (i < nb_bands - 1), skip_rest, i, &skip[i],
-       qm + off[i], qm_inv + off[i]);
-      if (i == 0 && !skip_rest[0] && bs > 0) {
-        int skip_dir;
-        int j;
-        skip_dir = aom_read_symbol(dec->r,
-         &dec->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7,
-         "pvq:skiprest");
-        for (j = 0; j < 3; j++) skip_rest[j] = !!(skip_dir & (1 << j));
-      }
-    }
-  }
-  *flags = 0;
-  for (i = nb_bands - 1; i >= 0; i--) {
-    *flags <<= 1;
-    *flags |= noref[i]&1;
-    *flags <<= 1;
-    *flags |= skip[i]&1;
-  }
-}
diff --git a/third_party/aom/av1/decoder/pvq_decoder.h b/third_party/aom/av1/decoder/pvq_decoder.h
deleted file mode 100644
index 98970663b..000000000
--- a/third_party/aom/av1/decoder/pvq_decoder.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#if !defined(_pvq_decoder_H)
-# define _pvq_decoder_H (1)
-# include "aom_dsp/bitreader.h"
-# include "aom_dsp/entdec.h"
-# include "av1/common/pvq.h"
-# include "av1/decoder/decint.h"
-
-#define aom_read_symbol_pvq(r, cdf, nsymbs, ACCT_STR_NAME) \
-  aom_read_symbol_pvq_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
-
-int aom_read_symbol_pvq_(aom_reader *r, aom_cdf_prob *cdf, int nsymbs
-  ACCT_STR_PARAM);
-
-void aom_decode_band_pvq_splits(aom_reader *r, od_pvq_codeword_ctx *adapt,
- od_coeff *y, int n, int k, int level);
-
-#define aom_laplace_decode_special(r, decay, ACCT_STR_NAME) \
-  aom_laplace_decode_special_(r, decay ACCT_STR_ARG(ACCT_STR_NAME))
-
-int aom_laplace_decode_special_(aom_reader *r, unsigned decay ACCT_STR_PARAM);
-
-void od_pvq_decode(daala_dec_ctx *dec, od_coeff *ref, od_coeff *out, int q0,
-    int pli, int bs, const od_val16 *beta, int is_keyframe,
-    unsigned int *flags, PVQ_SKIP_TYPE ac_dc_coded, const int16_t *qm,
-    const int16_t *qm_inv);
-
-#endif
diff --git a/third_party/aom/av1/decoder/symbolrate.h b/third_party/aom/av1/decoder/symbolrate.h
deleted file mode 100644
index 023287732..000000000
--- a/third_party/aom/av1/decoder/symbolrate.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/bitreader.h"
-
-#ifndef AV1_DECODER_SYMBOLRATE_H_
-#define AV1_DECODER_SYMBOLRATE_H_
-
-#if CONFIG_SYMBOLRATE
-static INLINE void av1_dump_symbol_rate(struct AV1Common *cm) {
-  const FRAME_COUNTS *counts = &cm->counts;
-  printf("%d %d %d %d\n", counts->coeff_num[0], counts->coeff_num[1],
-         counts->symbol_num[0], counts->symbol_num[1]);
-}
-static INLINE int av1_read_record_symbol(FRAME_COUNTS *counts, aom_reader *r,
-                                         aom_cdf_prob *cdf, int nsymbs,
-                                         const char *str) {
-  (void)str;
-  if (counts) ++counts->symbol_num[0];
-  return aom_read_symbol(r, cdf, nsymbs, str);
-}
-
-#if CONFIG_LV_MAP
-static INLINE int av1_read_record_bin(FRAME_COUNTS *counts, aom_reader *r,
-                                      aom_cdf_prob *cdf, int nsymbs,
-                                      const char *str) {
-  (void)str;
-  if (counts) ++counts->symbol_num[0];
-  return aom_read_bin(r, cdf, nsymbs, str);
-}
-#endif
-
-static INLINE int av1_read_record(FRAME_COUNTS *counts, aom_reader *r, int prob,
-                                  const char *str) {
-  (void)str;
-  if (counts) ++counts->symbol_num[0];
-  return aom_read(r, prob, str);
-}
-
-static INLINE int av1_read_record_cdf(FRAME_COUNTS *counts, aom_reader *r,
-                                      const aom_cdf_prob *cdf, int nsymbs,
-                                      const char *str) {
-  (void)str;
-  if (counts) ++counts->symbol_num[0];
-  return aom_read_cdf(r, cdf, nsymbs, str);
-}
-
-static INLINE int av1_read_record_bit(FRAME_COUNTS *counts, aom_reader *r,
-                                      const char *str) {
-  (void)str;
-  if (counts) ++counts->symbol_num[1];
-  return aom_read_bit(r, str);
-}
-
-static INLINE void av1_record_coeff(FRAME_COUNTS *counts, tran_low_t qcoeff) {
-  assert(qcoeff >= 0);
-  if (counts) ++counts->coeff_num[qcoeff != 0];
-}
-#else  // CONFIG_SYMBOLRATE
-
-#define av1_read_record_symbol(counts, r, cdf, nsymbs, ACCT_STR_NAME) \
-  aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME)
-
-#if CONFIG_LV_MAP
-#define av1_read_record_bin(counts, r, cdf, nsymbs, ACCT_STR_NAME) \
-  aom_read_bin(r, cdf, nsymbs, ACCT_STR_NAME)
-#endif
-
-#define av1_read_record(counts, r, prob, ACCT_STR_NAME) \
-  aom_read(r, prob, ACCT_STR_NAME)
-
-#define av1_read_record_cdf(counts, r, cdf, nsymbs, ACCT_STR_NAME) \
-  aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME)
-
-#define av1_read_record_bit(counts, r, ACCT_STR_NAME) \
-  aom_read_bit(r, ACCT_STR_NAME)
-
-#endif  // CONFIG_SYMBOLRATE
-
-#endif  // AV1_DECODER_SYMBOLRATE_H_
diff --git a/third_party/aom/av1/encoder/ab_partition_model_weights.h b/third_party/aom/av1/encoder/ab_partition_model_weights.h
new file mode 100644
index 000000000..5b918fae2
--- /dev/null
+++ b/third_party/aom/av1/encoder/ab_partition_model_weights.h
@@ -0,0 +1,1318 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
+#define AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define FEATURE_SIZE 10
+#define LABEL_SIZE 16
+// nn model for ab partition pruning, 128x128.
+static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = {
+  -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f,  -0.469759f,
+  0.426152f,  0.489798f,  0.469865f,  0.773821f,  0.088517f,  0.074585f,
+  0.838754f,  0.048449f,  -0.007584f, 0.638968f,  0.233305f,  -0.319236f,
+  -0.257124f, -0.170869f, 0.137180f,  0.114852f,  -0.721241f, -0.947962f,
+  -0.411298f, 0.494306f,  -0.060435f, -0.648421f, -0.126624f, 0.072686f,
+  -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f,  -0.189925f,
+  0.134361f,  -0.258070f, -0.177558f, 0.158049f,  0.168668f,  -0.062919f,
+  0.341986f,  0.038100f,  -0.435577f, -0.321255f, 0.203213f,  0.213061f,
+  0.533304f,  0.359296f,  -0.079558f, 0.004637f,  0.663904f,  0.043779f,
+  0.383018f,  1.136559f,  -0.084155f, 0.333057f,  -0.199011f, 0.152059f,
+  -0.078419f, -0.167752f, -0.093651f, 0.083171f,  -0.190143f, 0.086195f,
+  -0.280632f, -0.160663f, -0.017298f, 0.122628f,  -0.138116f, 0.062927f,
+  0.222462f,  0.626979f,  0.426928f,  0.117170f,  -0.240457f, 0.053750f,
+  0.038017f,  0.007359f,  -0.017595f, 0.101407f,  0.332891f,  0.074933f,
+  0.306498f,  0.219380f,  -0.151638f, -0.247976f, 0.343405f,  0.121256f,
+  0.049173f,  0.171474f,  -0.139608f, -1.016599f, -0.345553f, -0.901138f,
+  0.243401f,  0.059928f,  -0.089396f, -0.195565f, 0.364705f,  -0.020400f,
+  -1.383672f, 0.413018f,  0.536950f,  -0.020904f, -1.335306f, -0.732290f,
+  0.102885f,  0.315290f,  -0.208521f, -0.081811f, 0.182300f,  0.125712f,
+  -0.593833f, -0.220639f, -0.314155f, 0.188327f,  0.118503f,  0.524427f,
+  -1.083859f, -1.130640f, 0.390352f,  -0.045591f, 0.113160f,  -0.009149f,
+  -0.096183f, 0.115829f,  0.377752f,  0.318396f,  -0.591983f, 0.004797f,
+  -0.497377f, -0.342248f, 0.079546f,  -0.025249f, -0.295972f, 0.615501f,
+  -0.464372f, 0.418315f,  -0.173556f, 0.105217f,  0.298073f,  0.082478f,
+  0.033223f,  0.977341f,  -0.372982f, -0.052337f, 0.154124f,  0.396787f,
+  0.536654f,  -0.139061f, -0.223702f, 0.229666f,  -0.846766f, 0.107723f,
+  0.563839f,  -0.483141f, 0.304813f,  -0.765283f, 0.070964f,  0.151101f,
+  0.275188f,  0.490303f,  1.175892f,  0.085377f,  -0.191200f, 0.544532f,
+  -0.365075f, 0.167546f,  0.052183f,  -0.220529f, -0.212227f, -0.144988f,
+  -0.273356f, -0.062023f, 0.103993f,  -0.238493f, -0.161204f, -0.054611f,
+  -0.166672f, 0.128327f,  0.461751f,  -0.545822f, 0.739798f,  0.594386f,
+  -0.163192f, -0.332501f, 0.363834f,  -0.065043f, 0.474812f,  -0.138811f,
+  0.170924f,  -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f,
+  0.340591f,  0.041783f,  0.055419f,  0.015155f,  -0.981830f, -1.355237f,
+  0.347516f,  1.155327f,  0.081319f,  0.274163f,  -0.327230f, -0.113478f,
+  0.556552f,  -0.055986f, 0.217318f,  -0.445351f, 0.325759f,  0.526547f,
+  -0.657434f, -0.572214f, -0.037087f, 0.081384f,  0.064518f,  0.014892f,
+  0.215279f,  1.834504f,  -0.242107f, 0.079810f,  0.129558f,  0.079588f,
+  -0.035189f, -0.221745f, -0.163414f, 0.043978f,  -1.028662f, -0.623609f,
+  1.130336f,  0.664661f,  -0.063975f, -0.415863f, 0.018581f,  0.157758f,
+  0.200570f,  0.063420f,  0.901039f,  -0.746286f, 0.196230f,  -0.290592f,
+  0.042373f,  -0.502500f, 0.183638f,  0.103394f,  -0.298858f, 0.145436f,
+  0.196916f,  0.108319f,  -0.448572f, -0.881385f, 0.302497f,  0.121679f,
+  -0.021327f, 0.025150f,  0.481306f,  -0.359634f, 0.350257f,  -0.228647f,
+  -0.669860f, 0.260025f,  -0.034182f, 0.619247f,  -0.158826f, -0.405864f,
+  0.674112f,  -0.027885f, -0.325274f, -0.241492f, 0.036024f,  -0.437685f,
+  -0.091458f, -0.109295f, -0.350676f, 0.044706f,  0.297059f,  0.016290f,
+  1.121203f,  1.289062f,  -1.299476f, -1.129221f, 0.103752f,  0.131302f,
+  -0.263265f, 0.222155f,  -0.229908f, 0.013922f,  -0.226001f, -0.248383f,
+  -0.004415f, -0.020958f, 0.055634f,  0.086200f,  0.114556f,  -0.184061f,
+  -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f,  0.023781f,
+  -0.264460f, 0.157026f,  -0.235228f, -0.102564f, 0.043463f,  -0.187823f,
+  -0.257500f, -0.199049f, -0.242210f, 0.030448f,  0.221604f,  0.151804f,
+  -0.100404f, -0.073931f, 0.144749f,  -0.001572f, -1.438079f, -0.233716f,
+  0.733422f,  1.727080f,  -0.036397f, 0.027551f,  0.425321f,  0.085703f,
+  0.031186f,  0.032333f,  -0.675130f, 1.437733f,  -0.202392f, -0.525003f,
+  0.087048f,  0.328194f,  -0.079989f, -0.391088f, -0.238732f, -0.120660f,
+  -0.139600f, 0.154665f,  0.026202f,  -0.233501f, -0.009046f, -0.149187f,
+  -0.199646f, 0.115375f,  0.209762f,  -0.014875f, 0.124038f,  -0.119985f,
+  1.079625f,  -0.461513f, 0.614114f,  0.021003f,  0.439449f,  -0.824834f,
+  -0.299701f, 0.193817f,  -0.870551f, -1.262313f, -0.079517f, 0.341570f,
+  0.305310f,  -0.089721f, -0.317314f, -0.075631f, 0.127172f,  -0.208635f,
+  1.191922f,  0.163141f,  0.564285f,  0.286352f,  0.480865f,  0.173094f,
+  -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f,  0.090258f,
+  -0.016099f, 0.193230f,  0.188061f,  0.398144f,  0.722781f,  0.769949f,
+  0.025442f,  -0.162016f, 0.070192f,  -0.056946f, -0.100957f, -0.219934f,
+  -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f,  -0.017493f,
+  0.527446f,  0.083605f,  0.588318f,  0.878215f,  0.028747f,  -0.146479f,
+  -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f,  -0.101340f,
+  -0.027733f, -0.282611f, 0.265366f,  0.082362f,  -0.265420f, -0.131124f,
+  0.166303f,  0.040194f,  -0.100710f, 0.579151f,  -0.530136f, 0.163422f,
+  -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f,  -0.090302f,
+  1.723272f,  0.552370f,  -0.295954f, -0.439095f, -0.266730f, 0.027936f,
+  0.539616f,  -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f,
+  -0.121620f, -0.205517f, -0.180144f, -0.264208f, 0.151500f,  -0.159378f,
+  0.029145f,  -0.050892f, -0.223407f, -0.246239f, 0.043152f,  -0.018460f,
+  0.169972f,  -0.187769f, -0.034670f, -0.238330f, 0.288070f,  -0.093243f,
+  -0.437105f, -0.573376f, 0.660073f,  0.285727f,  0.408470f,  0.158475f,
+  0.032699f,  0.056280f,  -0.237176f, -0.083003f, 0.105598f,  -0.169522f,
+  -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f,
+  0.029124f,  0.009580f,  -0.252034f, 0.103087f,  1.156561f,  0.603848f,
+  -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f,  0.076095f,
+  1.490819f,  0.415893f,  -0.277788f, -0.115787f, 0.093750f,  0.270726f,
+  -0.395983f, -0.353742f, 0.034605f,  0.005342f,  0.184537f,  0.086445f,
+  0.156417f,  1.476367f,  0.122587f,  0.002145f,  0.431057f,  -0.381184f,
+  -1.646457f, -0.014009f, -0.671224f, 0.193726f,  -0.019247f, -0.031267f,
+  -0.046208f, 0.298733f,  0.064734f,  0.616984f,  0.039381f,  0.182722f,
+  -0.116670f, 0.233093f,  -1.214374f, -0.817970f, -0.064394f, -0.584783f,
+  0.077697f,  -0.266720f, 0.130875f,  -0.235295f, -0.265754f, -0.159999f,
+  -0.250114f, -0.183017f, 0.194403f,  -0.105808f, -0.169215f, -0.240866f,
+  -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f,
+  -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f,
+  0.881460f,  -0.678603f, 0.008666f,  -0.252053f, -0.341035f, -0.175290f,
+  0.183012f,  0.385991f,  0.079888f,  -0.014039f, -0.148653f, 0.671778f,
+  -0.130219f, 1.086467f,  0.129267f,  -0.040400f, -0.201221f, -0.077005f,
+  0.015890f,  0.000781f,  0.137764f,  1.389546f,  0.172152f,  0.047279f,
+  -0.042783f, 0.127740f,  0.141467f,  -0.335738f, -1.396392f, 0.031496f,
+  0.357385f,  0.343602f,  -0.714553f, 0.311014f,  0.132845f,  0.061149f,
+  0.006796f,  0.568106f,  -0.255949f, 0.104134f,  -0.993447f, 0.298135f,
+  -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f,
+  0.068481f,  0.036240f,  -0.495801f, 0.180574f,  -0.766129f, 0.886967f,
+  -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f,
+  0.019016f,  -0.015837f, 0.600197f,  0.429773f,  0.315026f,  0.319667f,
+  0.214617f,  -0.017316f, 0.270257f,  -0.040524f, 0.695803f,  -0.015223f,
+  -1.554965f, 0.356997f,  -1.472428f, 0.024637f,  -0.562958f, 0.870351f,
+  0.193635f,  0.036063f,  0.328638f,  0.200274f,  -1.634707f, 0.110534f,
+  0.420104f,  -0.072042f, -0.006404f, 0.171680f,
+};
+
+static const float av1_ab_partition_nn_bias_128_layer0[64] = {
+  0.643147f,  -1.348826f, 0.431627f,  0.000000f,  0.102717f,  -0.772628f,
+  -0.034351f, -0.761977f, -0.638397f, 0.541969f,  -0.391311f, 0.563076f,
+  0.148553f,  0.267217f,  -0.788092f, 0.544573f,  -0.546280f, 0.000000f,
+  -0.446945f, 0.127732f,  0.270624f,  -0.219435f, -1.220203f, 0.324584f,
+  0.110885f,  0.276547f,  0.179726f,  -0.375160f, 0.026401f,  -0.032595f,
+  0.000000f,  -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f,
+  0.476453f,  -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f,
+  0.915351f,  -0.209962f, 0.000000f,  -0.025731f, 0.218288f,  0.000000f,
+  0.047726f,  -0.813077f, -1.263281f, 0.239087f,  0.278614f,  -0.030753f,
+  0.000000f,  0.346744f,  -0.948543f, -1.174211f, 0.216377f,  0.498913f,
+  0.853918f,  0.002504f,  -0.190403f, 0.452050f,
+};
+
+static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = {
+  0.179769f,  1.499417f,  -0.445135f, -0.142278f, -0.337661f, 0.682064f,
+  -0.203213f, 0.302171f,  0.226877f,  -0.422169f, 1.687586f,  0.783773f,
+  0.220995f,  0.253482f,  0.370435f,  -1.342775f, 0.337229f,  -0.271473f,
+  0.291796f,  1.362227f,  -1.751397f, -0.086178f, 0.725496f,  -0.118597f,
+  0.227963f,  -0.501577f, 0.223849f,  -0.122421f, -0.123437f, -0.051045f,
+  -0.020115f, 0.212711f,  0.246025f,  0.088120f,  -0.168995f, 1.740190f,
+  -0.195098f, 0.680339f,  -0.589572f, -0.075244f, 0.878766f,  0.064092f,
+  -3.548527f, 0.001660f,  0.107926f,  -0.169501f, -0.455212f, 0.123045f,
+  -1.836998f, 0.330365f,  1.301475f,  0.454761f,  -0.576552f, -0.190761f,
+  0.208459f,  0.618483f,  1.383364f,  0.970718f,  0.390174f,  0.406252f,
+  -0.564519f, -0.312062f, 1.345712f,  -0.151873f, 0.109290f,  0.408847f,
+  0.391243f,  0.152024f,  0.181764f,  -0.036263f, -0.160466f, 0.153595f,
+  0.049163f,  -0.753012f, -1.804062f, 0.347475f,  -2.746580f, 0.575618f,
+  0.261799f,  0.210505f,  -0.302054f, -0.109872f, 0.199506f,  -1.182971f,
+  0.723668f,  0.177758f,  -0.338202f, 0.254396f,  -0.220023f, 0.043504f,
+  0.669866f,  -0.040816f, -0.402730f, 0.017990f,  0.215523f,  -0.216816f,
+  0.454826f,  -0.726067f, -0.018750f, -0.928679f, 0.154315f,  -0.465641f,
+  0.144566f,  -0.030064f, -0.054667f, -0.154055f, 0.625384f,  1.323795f,
+  -0.159496f, 0.097072f,  -0.463197f, -0.057938f, 0.750290f,  -0.233061f,
+  0.412631f,  -0.535223f, -0.151423f, -0.154583f, 0.024721f,  -0.494448f,
+  0.230594f,  -0.980138f, -0.653968f, 0.126079f,  0.051814f,  -0.053219f,
+  -0.421708f, -0.228853f, 0.237885f,  0.888157f,  0.059655f,  0.241295f,
+  0.210443f,  0.228238f,  0.119127f,  -0.051989f, -0.355408f, 0.182215f,
+  0.244277f,  -0.104577f, -0.558035f, -0.023270f, 0.054571f,  0.700646f,
+  -0.223006f, 0.115523f,  0.023391f,  0.437264f,  0.709477f,  -0.531212f,
+  -0.094731f, 0.328161f,  -0.105418f, -0.133511f, 0.497168f,  -0.030948f,
+  -0.407132f, -0.043943f, 0.155505f,  0.251945f,  0.205010f,  0.167160f,
+  0.083654f,  -0.636810f, 0.401315f,  -0.398414f, 0.290046f,  0.206846f,
+  0.042218f,  0.168150f,  0.843181f,  -0.671242f, -0.202392f, -0.073301f,
+  0.142895f,  0.237466f,  0.212145f,  -0.091828f, 0.187038f,  -0.720841f,
+  -0.616069f, -0.238021f, 0.065365f,  0.434119f,  0.179023f,  -0.040107f,
+  -0.430734f, -0.297368f, 0.575954f,  0.382619f,  -0.709787f, -0.320810f,
+  0.242342f,  -0.047614f, 0.705216f,  0.098077f,  0.357179f,  0.046017f,
+  0.115074f,  -0.412305f, -0.272304f, 0.048096f,  -0.803811f, 0.275000f,
+  0.642198f,  0.180286f,  -0.087178f, -0.112707f, -0.394443f, 0.201989f,
+  0.241759f,  -1.038870f, 0.728124f,  0.800559f,  -1.296268f, 0.198612f,
+  -0.053478f, 0.414344f,  -0.510529f, 0.124179f,  -2.219115f, -0.074583f,
+  -0.143055f, 0.001697f,  0.810811f,  -0.657140f, 0.186818f,  -0.936414f,
+  0.539578f,  -0.308244f, -0.126624f, -0.204767f, 0.091145f,  -0.049340f,
+  0.252014f,  0.394582f,  0.018764f,  -0.060377f, -0.019133f, 0.064083f,
+  0.069211f,  -0.526693f, 0.209850f,  -0.481466f, -0.468302f, -0.100407f,
+  0.241018f,  -1.037781f, 0.038539f,  -2.113840f, -0.974895f, 0.163187f,
+  0.425132f,  -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f,
+  -0.745175f, -0.177077f, 0.217658f,  0.381431f,  -0.052338f, 0.087176f,
+  -0.165972f, 0.085937f,  0.472564f,  -0.796627f, -2.453307f, 0.569664f,
+  -0.233010f, -0.192134f, 0.064339f,  -0.111411f, -0.262469f, -0.410022f,
+  0.519993f,  -0.684620f, 0.393460f,  -0.277753f, -0.153624f, 0.528984f,
+  -0.415558f, -0.445863f, 0.588512f,  -0.142439f, -0.132127f, 0.199776f,
+  -0.579284f, 0.119488f,  -0.033590f, -0.503846f, -0.674979f, 0.335125f,
+  0.020519f,  0.233973f,  -0.297998f, -0.051511f, 0.518626f,  -0.412782f,
+  -0.074045f, 0.130523f,  0.465751f,  -0.117795f, 2.535813f,  0.352108f,
+  -0.499228f, 0.379784f,  0.056699f,  0.173142f,  -0.076519f, -0.026666f,
+  0.017834f,  0.492333f,  0.093364f,  0.037867f,  -0.165420f, -0.356429f,
+  -0.562334f, 0.057656f,  -0.307544f, 0.085857f,  -0.559851f, 0.107230f,
+  -0.398633f, 0.152618f,  -0.216835f, -0.024539f, 0.026044f,  -0.249519f,
+  -0.563594f, -0.746025f, 0.025265f,  -0.298888f, -0.185243f, 0.058794f,
+  0.233696f,  -0.115223f, 0.144617f,  -0.864390f, 0.619944f,  -0.023980f,
+  0.019481f,  0.225252f,  0.416552f,  -0.115993f, 0.935387f,  0.744386f,
+  0.053353f,  -0.052582f, -0.065650f, 0.228488f,  -0.032042f, -0.371252f,
+  -0.003638f, -0.736984f, -0.203776f, 0.030922f,  -0.065577f, -0.031643f,
+  -0.049253f, -0.054640f, 0.787134f,  0.545414f,  -0.140297f, -0.124274f,
+  -0.110011f, -0.029552f, 0.657005f,  0.214973f,  -0.374300f, 0.251642f,
+  0.276591f,  0.030566f,  -0.145470f, 0.350579f,  -0.356436f, -0.052694f,
+  -0.063966f, -0.751008f, -1.042392f, 0.328892f,  -0.425058f, -0.421571f,
+  -0.571889f, -1.141472f, -0.125216f, 0.212713f,  -0.485170f, -0.088791f,
+  0.124589f,  0.023237f,  0.077635f,  0.020901f,  -0.271402f, -0.321424f,
+  -0.513946f, -0.867872f, -0.284593f, 0.106276f,  0.220192f,  -0.143532f,
+  -0.014648f, 0.073402f,  0.327256f,  -0.139803f, 0.168763f,  0.048199f,
+  -0.122526f, 0.111713f,  -0.134257f, 0.810364f,  -0.085222f, -0.259221f,
+  -0.239349f, 0.044448f,  0.205031f,  0.413113f,  -0.107720f, -0.018816f,
+  -0.247741f, -0.004963f, 0.041170f,  -0.158019f, 0.134839f,  0.129502f,
+  0.800488f,  -1.041584f, -0.129336f, 0.170834f,  0.566586f,  -0.230443f,
+  0.437937f,  -0.149922f, -0.046665f, -0.094646f, 0.200070f,  0.072943f,
+  -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f,  -0.444731f,
+  -0.100877f, 0.545196f,  -1.786626f, -0.482946f, 0.500509f,  -0.843257f,
+  0.200374f,  0.045103f,  -0.575718f, -0.164335f, -0.232522f, -0.021825f,
+  -0.139490f, 0.356058f,  -0.352075f, 0.061751f,  -0.200616f, -1.180921f,
+  -0.181355f, -0.137459f, 0.247574f,  0.181541f,  0.184314f,  -0.961482f,
+  0.493615f,  0.910261f,  -2.279238f, 0.648631f,  -0.055526f, -0.037137f,
+  0.038643f,  0.136609f,  -0.819373f, -0.040840f, -0.265989f, 0.006877f,
+  0.454651f,  -0.595323f, -0.099500f, -0.263717f, 0.150456f,  0.245077f,
+  -0.268666f, 0.162232f,  -0.516451f, -0.024501f, 0.188046f,  -0.002262f,
+  0.261319f,  0.004173f,  0.746982f,  0.174761f,  0.470447f,  -0.159558f,
+  -0.385240f, 0.023084f,  -0.133520f, -0.220607f, -0.018731f, -0.373558f,
+  -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f,
+  -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f,  -0.153361f,
+  0.334394f,  -0.569472f, -0.198118f, 0.255922f,  0.104717f,  -0.065179f,
+  0.111879f,  -0.447237f, 1.373623f,  -0.190191f, -0.063311f, 0.337529f,
+  -0.138800f, 0.057009f,  -0.137006f, 0.641378f,  0.883147f,  -0.679655f,
+  0.267717f,  -0.351602f, -0.135225f, 0.229398f,  -0.513225f, -1.120345f,
+  0.528786f,  -0.051081f, 0.086653f,  0.140141f,  -0.563969f, 0.333402f,
+  -0.174745f, 0.321093f,  -0.438641f, -0.005131f, 0.247415f,  0.110120f,
+  -0.076308f, -0.083244f, 0.838944f,  -0.113043f, -0.013258f, -0.175028f,
+  -0.179941f, 0.272676f,  -0.047946f, -0.088076f, -0.450031f, 0.053929f,
+  -0.083549f, -0.089952f, -0.186253f, 0.257483f,  0.011019f,  0.586435f,
+  0.060580f,  -0.052078f, 0.090277f,  -0.780869f, 0.969811f,  -0.025349f,
+  -0.281917f, 0.014857f,  0.231863f,  -0.228601f, -0.003861f, 0.226550f,
+  0.141825f,  -0.102171f, -0.010387f, 0.220378f,  -2.561975f, -0.497071f,
+  -0.315117f, 0.371981f,  0.138247f,  0.625031f,  -0.308133f, -0.217876f,
+  0.005615f,  -0.860179f, 0.747491f,  0.006356f,  -0.057024f, -0.483189f,
+  0.055592f,  -0.316834f, 0.069858f,  0.218788f,  -0.200044f, 0.227588f,
+  0.215496f,  -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f,
+  -0.152512f, -0.332995f, 0.129053f,  0.178668f,  -0.302694f, 0.030678f,
+  0.925896f,  0.964375f,  0.169021f,  -0.218657f, -0.627204f, 0.206437f,
+  -0.521336f, 0.176206f,  0.142733f,  0.139248f,  0.411682f,  0.181544f,
+  0.224850f,  -0.935547f, -0.558208f, 0.348096f,  0.342129f,  -0.389340f,
+  -0.236308f, -0.132099f, 0.073642f,  0.089391f,  -0.306901f, -0.397842f,
+  0.444282f,  0.074623f,  -0.051075f, -0.106617f, -0.184037f, -0.239046f,
+  -0.138761f, 0.120794f,  -0.647577f, -0.336471f, 0.527899f,  -0.164234f,
+  -0.028354f, 1.083678f,  -0.251534f, -0.145903f, -0.182783f, 0.070976f,
+  -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f,
+  -1.152632f, 0.383685f,  -0.105895f, -0.096829f, 0.118382f,  0.047447f,
+  -0.019051f, 0.310180f,  -0.162793f, -0.029574f, 0.058054f,  -0.636017f,
+  0.490639f,  0.158347f,  -0.385701f, -0.147057f, 1.285825f,  -1.276083f,
+  -0.021795f, -0.101600f, 0.163254f,  0.267160f,  -2.317864f, -0.098598f,
+  -0.296337f, -0.309017f, 0.164127f,  -0.270012f, -0.071187f, -0.262270f,
+  0.075415f,  -0.368328f, 0.186728f,  -0.158031f, 0.481663f,  0.515950f,
+  -0.162551f, 0.497981f,  0.262196f,  0.168479f,  0.726066f,  -0.243856f,
+  -0.058998f, 0.140168f,  0.053242f,  -0.624623f, -0.249480f, 0.055197f,
+  -1.376804f, 0.417571f,  0.203784f,  0.174370f,  -0.155531f, -0.029400f,
+  -0.491473f, 0.079811f,  -0.080123f, 1.345900f,  0.637077f,  0.434862f,
+  -1.787438f, 0.005756f,  -0.362706f, 0.179458f,  -0.288263f, 0.516788f,
+  -0.921248f, 0.043794f,  -0.137729f, -0.196171f, -0.046295f, -0.793781f,
+  -0.156532f, -0.132566f, 0.517989f,  -0.154321f, -0.054174f, -0.077900f,
+  -0.373316f, -0.117718f, 0.188986f,  -0.476188f, -0.245312f, 0.181439f,
+  -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f,
+  -0.135429f, 0.125766f,  -0.081314f, -0.350894f, -0.163165f, -1.936507f,
+  -0.205966f, 0.031472f,  0.744446f,  -0.006680f, -0.837551f, 0.605862f,
+  -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f,  -0.183586f,
+  -0.010307f, 0.099373f,  -0.228278f, 0.175236f,  -0.000133f, 0.104491f,
+  -1.540545f, -0.570971f, -0.252885f, 0.483036f,  0.052531f,  0.260214f,
+  -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f,  -1.775975f,
+  -0.298634f, 0.323626f,  -0.373579f, -0.872977f, 0.619574f,  0.026862f,
+  -0.122531f, -0.084698f, -2.436297f, 0.483996f,  -0.203640f, -0.302157f,
+  -0.150666f, -0.238320f, 0.089250f,  0.236485f,  -0.668654f, -0.122863f,
+  0.491152f,  -0.226444f, -0.181248f, 0.120158f,  0.294027f,  0.250056f,
+  0.307601f,  0.357875f,  -1.746455f, -0.175670f, 0.385447f,  -0.108808f,
+  -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f,
+  0.607555f,  -0.489426f, 0.150624f,  0.598114f,  -0.128816f, -0.445793f,
+  -0.066524f, -0.254380f, 0.227106f,  -0.406495f, -0.121632f, -0.275960f,
+  -0.136494f, 0.339457f,  -1.318132f, -0.417572f, -2.614077f, 0.324603f,
+  -0.001211f, 0.375192f,  -0.473448f, -0.162510f, 0.099329f,  -0.277965f,
+  0.101221f,  -0.060263f, 0.121867f,  -1.042140f, 0.440851f,  0.078898f,
+  -0.209007f, -0.243699f, 0.715197f,  -0.093997f, 0.086022f,  -0.178203f,
+  -2.275496f, -0.098413f, 0.199352f,  -0.526791f, -0.162086f, -0.197806f,
+  -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f,  0.416236f,
+  0.064082f,  0.197655f,  0.340871f,  -0.186645f, -0.291498f, 0.433938f,
+  -1.110063f, 0.003751f,  0.392738f,  0.069360f,  0.102088f,  -0.302128f,
+  -1.518457f, 0.106939f,  0.404527f,  -0.306868f, -0.286928f, 0.729276f,
+  -0.531710f, 0.745048f,  -0.168837f, -1.953886f, -0.258828f, -0.190252f,
+  0.241877f,  -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f,
+  -0.489957f, 0.100850f,  0.323999f,  -0.802837f, -0.462408f, -0.079350f,
+  -0.029374f, 0.131213f,  -0.825032f, 0.040202f,  0.351821f,  0.002869f,
+  -0.132516f, -0.471264f, -0.297002f, 0.263913f,  0.033478f,  0.146161f,
+  0.533229f,  -0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f,
+  0.005151f,  0.018584f,  -0.029771f, -0.396038f, -0.159236f, 0.038691f,
+  -1.197056f, 0.146302f,  0.226840f,  -0.852126f, 0.031214f,  0.108880f,
+  0.562000f,  -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f,
+  0.515073f,  -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f,
+  0.347673f,  0.623379f,  0.722067f,  -0.492458f, -0.513263f, 0.585167f,
+  0.721518f,  -0.693499f, 0.343725f,  -0.273861f, -0.040230f, -0.785664f,
+  -0.157500f, -0.308445f, 0.054062f,  0.600131f,  -0.860887f, 0.434470f,
+  -0.191382f, -0.306150f, -0.243965f, 0.705444f,  0.007789f,  -0.146154f,
+  -0.054499f, -0.073500f, -1.067364f, 0.404936f,  -2.864590f, 0.182323f,
+  0.326126f,  0.102405f,  -0.135800f, 1.128095f,  -0.012267f, -0.023996f,
+  -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f,  -0.498361f,
+  0.083560f,  -0.210074f, 0.019225f,  -0.201614f, -0.904760f, 0.181421f,
+  0.586384f,  -0.177706f, 0.065471f,  0.168552f,  0.054705f,  0.045241f,
+  0.048057f,  -0.410957f, -2.188854f, -0.169812f, 0.015521f,  0.176856f,
+  -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f,  0.010454f,
+  0.823643f,  -0.119781f, -0.098359f, 0.093119f,
+};
+
+static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+  -0.433195f, -0.120488f, -0.116721f, 0.112134f,  0.118170f, -0.259769f,
+  -0.077530f, 0.394044f,  0.279167f,  -0.317988f, 0.189538f, 0.314776f,
+  0.325655f,  -0.107123f, 0.591049f,  0.358744f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_128 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      64,  // num_hidden_nodes
+  },
+  {
+      av1_ab_partition_nn_weights_128_layer0,
+      av1_ab_partition_nn_weights_128_layer1,
+  },
+  {
+      av1_ab_partition_nn_bias_128_layer0,
+      av1_ab_partition_nn_bias_128_layer1,
+  },
+};
+
+// nn model for ab partition pruning, 64x64.
+static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = {
+  -0.495347f, -0.049498f, -0.026804f, 0.030474f,  -0.289308f, -0.264193f,
+  -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f,
+  -0.038217f, 0.014872f,  -0.289728f, -0.233577f, -0.415875f, -0.343615f,
+  -0.442543f, -0.482492f, 0.073510f,  0.007503f,  2.162329f,  -0.362849f,
+  2.145915f,  -0.883135f, 0.185636f,  -0.062859f, -0.465574f, -0.486205f,
+  -0.056710f, -0.330642f, -0.321860f, 0.042321f,  -0.348965f, 0.003542f,
+  -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f,
+  0.246622f,  0.199651f,  -0.663420f, -0.154152f, -1.220383f, 0.047138f,
+  0.816811f,  0.083247f,  -0.218839f, 0.038143f,  -0.063436f, 0.015517f,
+  -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f,
+  0.050425f,  -0.221723f, -0.256942f, -0.287285f, 0.144011f,  -0.033245f,
+  0.083649f,  0.119428f,  -0.056706f, -0.117805f, 0.021866f,  -0.257300f,
+  -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f,  -0.347247f,
+  0.042539f,  -0.302697f, 1.652316f,  0.000701f,  -0.482843f, -0.160332f,
+  -0.450099f, 0.212399f,  -4.715360f, -5.336774f, -5.375758f, -6.048339f,
+  0.085956f,  -0.037767f, 1.052409f,  -0.931924f, -2.221907f, 0.268946f,
+  0.015512f,  1.237094f,  -1.092185f, 0.418247f,  -0.082143f, -0.076914f,
+  -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f,
+  -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f,
+  -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f,  0.019839f,
+  0.451127f,  0.004376f,  1.410392f,  3.255835f,  -0.344815f, 0.145202f,
+  0.204132f,  0.171948f,  -0.527736f, -0.110353f, 0.901448f,  0.003238f,
+  -3.822090f, 0.235462f,  1.024823f,  -0.821244f, 0.876056f,  2.553762f,
+  -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f,
+  -0.246040f, 0.039430f,  -0.071769f, -0.118847f, -0.304053f, -0.281541f,
+  -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f,
+  0.084721f,  0.168089f,  -0.272169f, -0.204998f, -0.008303f, -0.173998f,
+  0.079376f,  -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f,
+  0.066176f,  -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f,
+  -0.385845f, 0.119769f,  -0.006567f, -0.382126f, -0.214221f, 0.038449f,
+  -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f,  -0.114423f,
+  -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f,
+  -0.279011f, -0.008132f, 0.208463f,  0.020569f,  -0.206803f, -0.213408f,
+  -0.206131f, -0.290245f, 0.069701f,  -0.000371f, -0.307572f, -0.451785f,
+  -0.300838f, -0.453186f, -0.301691f, 0.046327f,  -0.312668f, 0.058272f,
+  -0.303131f, -0.376252f, 0.108384f,  -0.086623f, -0.100630f, -0.027330f,
+  -0.003969f, 0.089502f,  -0.200722f, -0.107889f, 0.061843f,  -0.008478f,
+  -0.265057f, -0.271132f, -0.073562f, 0.129337f,  -0.283698f, -0.353414f,
+  0.076420f,  -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f,
+  -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f,
+  -0.011932f, -0.585700f, 0.253212f,  -1.061900f, -0.205116f, -0.336407f,
+  -0.762199f, 0.577737f,  0.230832f,  0.434440f,  -0.096713f, 0.038552f,
+  -0.147800f, -0.213553f, 0.041740f,  -0.281907f, -0.026154f, -0.082356f,
+  -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f,
+  -0.391963f, -0.467392f, 0.027453f,  -0.394761f, -0.045544f, 0.076052f,
+  0.483985f,  0.067093f,  0.141361f,  0.576772f,  0.859718f,  2.566515f,
+  -0.025476f, 0.769738f,  -0.680235f, -1.683309f, -2.394131f, -0.000714f,
+  -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f,
+  0.551148f,  1.777227f,  -0.461630f, 0.043093f,  0.012293f,  -0.255841f,
+  -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f,
+  -0.297266f, -0.128699f, -0.149555f, 0.016534f,  -0.375498f, -0.346759f,
+  -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f,
+  -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f,
+  -0.048421f, -0.144133f, 0.889073f,  0.012606f,  3.007608f,  -0.602584f,
+  -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f,  -1.867208f,
+  -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f,  0.119141f,
+  -0.230715f, 0.083247f,  0.020367f,  -0.128629f, -0.217455f, -0.159640f,
+  1.815952f,  -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f,
+  0.662971f,  0.486475f,  0.159746f,  -0.018932f, 3.692397f,  1.384353f,
+  -0.401984f, -0.248380f, -0.140861f, 0.215248f,  -0.023711f, 0.059679f,
+  -0.072260f, 0.004271f,  0.039545f,  -0.347971f, -0.081851f, -0.474896f,
+  -0.181572f, 0.066736f,  -0.157822f, -0.163760f, -0.171113f, -0.089935f,
+  -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f,
+  -0.102701f, -0.312336f, 0.149831f,  0.007229f,  -0.155700f, -0.173611f,
+  4.074261f,  1.342306f,  -1.272712f, 1.570899f,  -0.545093f, -0.317605f,
+  -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f,
+  -0.239130f, -0.067211f, 0.041957f,  -0.039234f, -1.003587f, -0.094412f,
+  0.532512f,  -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f,
+  0.419466f,  0.492122f,  -0.004368f, -0.022096f, -1.115132f, 0.150886f,
+  2.396852f,  2.660000f,  -0.376537f, 0.468628f,  0.149413f,  -0.074898f,
+  -0.067154f, 0.021245f,  0.127857f,  0.294189f,  0.508056f,  0.390232f,
+  -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f,
+  -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f,
+  -0.398724f, -0.372068f, -0.234279f, 0.017799f,  -0.424760f, -0.646717f,
+  -0.047568f, 2.924664f,  -0.644165f, 0.359349f,  -0.294800f, 0.591746f,
+  -0.404710f, -0.092358f, -0.250729f, 0.030829f,  -0.147149f, -0.476023f,
+  -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f,
+  -0.377052f, -0.449899f, -0.056452f, 0.138081f,  -0.085350f, -0.308391f,
+  0.106661f,  0.176234f,  0.258869f,  -0.230172f, -0.233029f, -0.241208f,
+  -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f,
+  -0.158114f, -0.223167f, -0.026689f, 0.051863f,  0.212834f,  -0.304714f,
+  -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f,
+  0.280815f,  -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f,
+  -0.380595f, 0.109504f,  -0.111141f, -0.437685f, -0.094459f, 0.144206f,
+  -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f,
+  -0.418429f, -0.183240f, 0.031319f,  -0.095785f, -0.315447f, 0.069404f,
+  -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f,
+  -0.178198f, 0.177208f,  0.134688f,  -0.081933f, -0.229452f, -0.208872f,
+  0.026287f,  -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f,
+  -0.267238f, -0.494125f, -0.056255f, 0.053715f,  -0.487754f, 0.014818f,
+  0.087383f,  -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f,
+  -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f,  0.010186f,
+  -0.001587f, 0.086735f,  -2.465718f, 1.482185f,  1.621193f,  -2.081680f,
+  1.386553f,  -3.204335f, -0.267111f, -0.004508f, 0.164712f,  0.274147f,
+  1.724306f,  -2.273659f, 0.749574f,  -0.891905f, 0.105965f,  -0.030428f,
+  -0.416018f, -0.300762f, 0.122911f,  -0.316908f, -0.292504f, 0.138666f,
+  -0.161327f, -0.042143f, -0.249128f, 0.149210f,  -0.088987f, -0.654101f,
+  -1.501843f, 0.216777f,  0.955914f,  0.524158f,  -1.642561f, -1.643626f,
+  0.864797f,  -0.425451f, -2.115764f, -0.012502f, 0.065172f,  1.297270f,
+  0.018845f,  1.167276f,  -0.470970f, -0.244995f, 0.374782f,  -1.811056f,
+  -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f,
+  -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f,
+  -0.370683f, 0.172816f,  -0.265069f, 0.194321f,  -0.273478f, 0.037442f,
+  -0.235552f, -0.078625f, -0.447541f, 0.016836f,  -0.271123f, -0.171481f,
+  -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f,
+  -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f,
+  0.230343f,  -0.034318f, -0.022687f, -0.047090f,
+};
+
+static const float av1_ab_partition_nn_bias_64_layer0[64] = {
+  -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f,  -0.150944f,
+  -0.075727f, -0.208414f, 1.054996f,  0.713758f,  -0.300051f, -0.151482f,
+  -2.443570f, 0.430590f,  -0.129001f, -0.160733f, -0.230547f, -0.143228f,
+  -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f,
+  0.578161f,  -0.220318f, -0.210107f, -3.111584f, 0.604419f,  -0.232622f,
+  -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f,  -2.535531f,
+  -0.209783f, -0.211189f, -2.766337f, 0.000000f,  0.450177f,  -1.754884f,
+  3.262664f,  -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f,
+  -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f,
+  -0.305430f, 0.739171f,  0.991277f,  -0.088150f, 0.086313f,  -0.023379f,
+  -0.125366f, -0.063576f, -0.212169f, -0.047463f,
+};
+
+static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = {
+  -0.036800f, 0.528721f,  0.490767f,   0.144409f,  1.103640f,  0.361910f,
+  -0.180069f, 0.068033f,  -14.868382f, 0.359013f,  0.322567f,  -0.199212f,
+  0.906164f,  -0.488254f, 0.149653f,   -0.216394f, -0.099347f, 0.004936f,
+  -0.111391f, 0.074848f,  -0.041709f,  0.147627f,  -0.018905f, 0.096116f,
+  0.184817f,  -0.016241f, 0.115739f,   2.376754f,  0.637097f,  0.052954f,
+  0.136428f,  0.225267f,  -0.181873f,  -0.142876f, 0.684048f,  0.658791f,
+  0.105795f,  0.241705f,  1.381114f,   -0.209379f, 1.145949f,  0.795293f,
+  -9.361877f, 0.198302f,  0.539600f,   0.092317f,  -0.081695f, 0.200777f,
+  0.102334f,  0.081583f,  0.060948f,   -0.025110f, 0.160951f,  -0.020170f,
+  0.234006f,  -0.029369f, 0.375036f,   0.270209f,  -0.556529f, 1.402949f,
+  0.101777f,  -0.027331f, 0.004502f,   -0.153166f, -0.116651f, 0.151573f,
+  -0.022187f, 0.144044f,  -0.108719f,  -0.129942f, -0.270321f, 0.227363f,
+  1.892330f,  -0.661052f, -0.219398f,  -0.229417f, -0.856438f, -1.196988f,
+  -0.081774f, 0.078847f,  -0.207057f,  -0.048947f, 0.152073f,  -0.243056f,
+  -0.233329f, -0.288689f, -0.158333f,  -0.141177f, -0.715436f, 0.016947f,
+  -0.093752f, 0.204984f,  -1.209782f,  0.155683f,  0.092239f,  0.146495f,
+  0.813146f,  -0.027757f, 0.330982f,   2.173948f,  -0.028867f, -0.141815f,
+  0.292708f,  -0.204794f, 0.014496f,   1.032799f,  1.312155f,  0.107020f,
+  0.824752f,  -0.013945f, 0.184829f,   -0.041633f, 0.215300f,  -0.476088f,
+  -0.053213f, 0.126862f,  -0.020777f,  0.082893f,  -0.223727f, -0.923063f,
+  0.466529f,  0.082140f,  -0.845758f,  -1.140791f, -0.262033f, 0.138491f,
+  0.151717f,  -0.182479f, -0.131128f,  0.055411f,  0.106771f,  0.125552f,
+  0.297184f,  -0.257403f, -0.059884f,  -0.274903f, 2.694357f,  -0.108244f,
+  0.025377f,  0.043092f,  -0.558317f,  3.517159f,  -0.270833f, -0.240676f,
+  0.205100f,  -0.057068f, -0.140445f,  -0.193449f, -0.030061f, -0.286762f,
+  -0.467523f, -0.012647f, 0.190564f,   0.022394f,  -0.101479f, 0.339684f,
+  -0.902743f, -0.169578f, -0.178029f,  -0.041836f, -3.952108f, -0.028298f,
+  -0.221137f, -0.733895f, -0.223895f,  0.039012f,  0.687867f,  0.021423f,
+  0.113063f,  0.676087f,  -0.961000f,  -0.064847f, 0.712856f,  -0.192765f,
+  -0.001132f, 0.016689f,  -0.236020f,  -0.766186f, -0.175729f, 0.012879f,
+  -0.251064f, -0.105523f, -0.039212f,  -0.347584f, 0.304352f,  -0.034174f,
+  -0.364258f, -0.685252f, -0.266115f,  -0.247345f, -0.155905f, 0.152283f,
+  -0.156315f, 0.174082f,  -0.757654f,  0.102303f,  -2.192316f, -0.245815f,
+  0.119882f,  -0.086542f, 1.987246f,   -1.353163f, -0.374813f, -0.233504f,
+  -1.980895f, 0.692093f,  -0.168351f,  0.172700f,  -0.009052f, -0.015734f,
+  0.106679f,  -0.060472f, -0.256813f,  -0.074874f, -0.207488f, -0.329515f,
+  -0.418268f, -0.017940f, -0.036081f,  0.064719f,  -1.488016f, 0.020591f,
+  -0.176325f, -0.141074f, 0.944494f,   0.150237f,  -0.249805f, -0.277280f,
+  0.012686f,  0.132483f,  0.116123f,   0.013737f,  -0.116091f, 0.750340f,
+  3.251343f,  -0.188864f, 1.096992f,   0.058467f,  -0.041433f, -0.037937f,
+  -0.133294f, -0.137908f, -0.171132f,  0.106362f,  0.069383f,  -0.052662f,
+  -0.177883f, -0.408049f, 0.680221f,   -0.117035f, -0.904240f, -1.395228f,
+  0.154527f,  0.134427f,  0.022767f,   -0.158886f, -0.230316f, 0.161096f,
+  0.362213f,  -0.235060f, -0.941620f,  0.055912f,  -0.049458f, -0.166632f,
+  0.481418f,  0.930146f,  0.041108f,   0.033674f,  1.372066f,  -1.847709f,
+  0.003324f,  0.259534f,  0.177014f,   -0.202761f, -0.262017f, -0.190852f,
+  -0.102839f, 0.028338f,  0.187193f,   -0.041684f, 0.123973f,  -0.198576f,
+  -0.110369f, -1.431400f, 0.208369f,   -0.302370f, -0.248549f, 0.062985f,
+  0.673409f,  0.036662f,  -0.711340f,  -0.120584f, -0.189789f, 0.098812f,
+  2.947819f,  0.216567f,  -0.414472f,  -0.181742f, 1.873779f,  -0.222726f,
+  -0.782870f, 0.007889f,  0.015062f,   -0.554328f, 0.182928f,  -0.191430f,
+  0.123636f,  -0.215460f, -0.225245f,  0.251516f,  -0.013025f, -1.359595f,
+  -0.750602f, 0.342667f,  -0.141899f,  -0.687493f, -0.072639f, 0.048018f,
+  -0.242107f, -0.031917f, -0.287472f,  -0.046088f, 0.832197f,  -0.016576f,
+  -1.553349f, -0.216341f, 0.023077f,   -0.410867f, 4.243743f,  -0.514878f,
+  -0.066007f, -0.160696f, -0.262678f,  -0.648790f, -0.430586f, 0.199940f,
+  -0.202496f, -0.222241f, -0.016406f,  -0.121473f, 0.000828f,  -0.081584f,
+  -0.152641f, -0.190166f, 0.644400f,   0.040196f,  -0.302104f, -1.143654f,
+  -0.160327f, -0.320780f, -0.187006f,  0.037311f,  0.440618f,  -0.070733f,
+  -0.117785f, 1.527539f,  -0.419310f,  0.001300f,  1.389956f,  -0.036366f,
+  -0.269203f, 0.612265f,  2.721897f,   -0.086836f, -0.446999f, 0.012525f,
+  -0.078317f, -0.287052f, -0.111188f,  -0.085181f, -0.164667f, -0.010466f,
+  -0.569722f, -0.018888f, -0.101663f,  -1.147130f, -0.465204f, 0.114524f,
+  -2.192402f, -0.221325f, 0.375748f,   0.206284f,  -0.261548f, -0.246257f,
+  -0.143004f, -0.069981f, -0.057306f,  -0.116481f, -0.435903f, -0.314970f,
+  0.013210f,  -0.010175f, 4.630571f,   -0.473226f, -0.197199f, -0.028204f,
+  0.122907f,  2.475548f,  0.025011f,   -0.092603f, -0.127561f, -0.151330f,
+  -0.077295f, 0.245016f,  -0.045005f,  0.183396f,  -0.330556f, -0.384887f,
+  0.356374f,  -0.016618f, -0.463353f,  -1.291546f, -0.071986f, -0.311599f,
+  0.072385f,  -0.430786f, -2.094788f,  0.202733f,  -0.910109f, -1.336543f,
+  -0.086800f, -0.096413f, 1.544383f,   0.031860f,  -0.796211f, 0.762786f,
+  3.250022f,  -0.441798f, -0.698537f,  0.062839f,  0.033525f,  -0.362996f,
+  0.027022f,  -1.131264f, -0.228926f,  0.053885f,  -0.338628f, 0.155037f,
+  -0.046844f, -0.888172f, -0.241767f,  0.084965f,  -0.617743f, -0.049896f,
+  -0.036894f, -0.304783f, -0.002639f,  0.137957f,  0.052121f,  -0.131161f,
+  -0.117200f, -0.253380f, -0.205561f,  -0.302450f, -0.047397f, -0.330518f,
+  3.613420f,  -1.525951f, -0.026738f,  0.209150f,  -2.103534f, 2.019689f,
+  -0.366199f, -0.095260f, 0.027417f,   -0.242512f, 0.162579f,  0.052113f,
+  -0.293851f, -0.068138f, -0.005799f,  -0.344696f, -0.114824f, -0.431107f,
+  -0.120058f, -1.139926f, -1.048379f,  0.036446f,  -0.323020f, -0.432945f,
+  0.454151f,  -0.140058f, 0.050649f,   -0.094900f, -0.017278f, -0.238719f,
+  1.193153f,  0.120447f,  -0.496061f,  0.917431f,  2.936126f,  -0.115521f,
+  -0.347397f, -0.435325f, -0.004383f,  -0.211864f, 0.162383f,  -1.040726f,
+  0.089537f,  -0.128579f, -0.133505f,  0.107129f,  -0.435657f, -0.180388f,
+  0.043650f,  0.018709f,  -0.773242f,  -0.687192f, -0.120633f, -0.063626f,
+  0.029912f,  0.113972f,  -0.403502f,  -0.127640f, -0.269625f, 0.129794f,
+  -0.188539f, 0.041641f,  0.029769f,   -0.198374f, 1.401407f,  0.353887f,
+  -0.219925f, 0.260515f,  1.157034f,   -2.992044f, -0.097618f, -0.064417f,
+  -0.203626f, -0.008217f, -0.112339f,  -0.227407f, -0.155118f, 0.247705f,
+  -0.012304f, -0.248447f, -0.913463f,  -0.064788f, -0.214619f, -0.251761f,
+  -0.386861f, -0.040574f, -0.163219f,  -0.100700f, 1.488274f,  -0.071684f,
+  -0.033626f, -0.006497f, -0.246945f,  -0.145221f, -3.747390f, 0.149609f,
+  -0.263326f, -0.297385f, -1.039896f,  -0.083174f, -0.025473f, -0.235586f,
+  -0.001087f, 0.254286f,  0.265106f,   0.007325f,  0.199239f,  0.134103f,
+  -0.578211f, -0.259801f, -0.062373f,  2.368348f,  0.560556f,  -0.252260f,
+  0.889997f,  -0.447872f, -0.059218f,  -0.095315f, -0.061667f, 0.183580f,
+  -0.157479f, 0.055387f,  -0.831734f,  0.007606f,  -1.104906f, 0.301180f,
+  -0.117115f, 0.212959f,  4.727223f,   -0.243833f, -0.397495f, -0.025021f,
+  -0.367587f, -2.082058f, -0.217699f,  0.148111f,  0.252430f,  0.111088f,
+  -0.260692f, 0.095124f,  -0.407774f,  -0.322169f, 0.002927f,  0.126169f,
+  -1.272325f, -0.279772f, -0.373680f,  -0.485177f, -0.605458f, 0.021225f,
+  -0.092031f, -0.226585f, 1.895162f,   0.037866f,  -0.275475f, 1.614360f,
+  -0.014972f, -0.277679f, -3.449082f,  -0.092060f, -0.747873f, 0.020716f,
+  2.776178f,  -0.049963f, 0.183999f,   -0.295259f, -0.028868f, 0.221895f,
+  0.001265f,  0.336823f,  0.219372f,   0.112824f,  0.408132f,  -0.017940f,
+  -0.311666f, 1.489606f,  -0.058093f,  -0.305659f, -0.491933f, -0.143847f,
+  0.166115f,  0.042867f,  -0.123447f,  -0.087099f, -0.305395f, -0.365079f,
+  -0.755801f, -0.160649f, 0.736260f,   -0.008611f, 0.095836f,  -0.017345f,
+  5.697515f,  -0.498971f, -0.125280f,  0.199907f,  0.300053f,  0.605026f,
+  -0.228225f, -0.259523f, 0.016384f,   0.146973f,  0.210258f,  0.226766f,
+  -0.075178f, -0.050924f, 0.188496f,   -0.415266f, -0.484880f, -0.236384f,
+  0.071931f,  -0.331863f, -0.601243f,  -0.232479f, -0.285272f, 0.123789f,
+  -1.341333f, 0.037082f,  -0.315202f,  -1.587215f, -0.271576f, 0.003216f,
+  -4.437186f, -0.256205f, -0.576589f,  -0.114147f, 2.153916f,  -0.369618f,
+  0.271415f,  0.145036f,  -0.158731f,  -0.240938f, -0.187369f, 0.036325f,
+  0.254771f,  0.211488f,  -0.240297f,  0.098417f,  -0.415011f, 2.334793f,
+  -0.127252f, 0.020069f,  -0.168755f,  -0.448922f, -0.219207f, 0.016232f,
+  -0.221935f, -0.269500f, -0.100636f,  0.102545f,  -0.809376f, -0.054979f,
+  0.360713f,  -0.326541f, 0.112933f,   0.138073f,  4.229404f,  -0.763801f,
+  -0.305429f, 0.199955f,  -1.787713f,  0.272866f,  0.109895f,  0.138466f,
+  -0.250259f, -0.167162f, -0.212588f,  -0.217589f, -0.067125f, -0.077490f,
+  -0.208970f, -0.006863f, -0.671146f,  -0.298320f, -0.165509f, 0.044597f,
+  -1.408624f, -0.213957f, -0.220947f,  0.129718f,  1.316777f,  -0.098928f,
+  -0.008121f, -0.558293f, -0.297290f,  -0.218873f, -4.346638f, -0.228174f,
+  -0.204710f, -0.388864f, 2.697919f,   0.025260f,  0.857020f,  0.009921f,
+  0.036915f,  -0.320275f, -0.087937f,  0.022636f,  0.236667f,  0.135496f,
+  -0.059616f, -0.192955f, 0.009470f,   2.139589f,  -0.200449f, 0.129818f,
+  1.017444f,  -0.608299f, 0.257914f,   -0.134306f, -0.033327f, 0.002855f,
+  -0.338598f, 0.015559f,  0.117362f,   -0.166760f, 0.086903f,  -0.167666f,
+  0.193523f,  0.033852f,  -1.147686f,  0.489468f,  -0.006969f, 0.125630f,
+  1.557907f,  -1.604449f, -0.071114f,  0.096178f,  0.007065f,  0.200013f,
+  0.213393f,  0.168466f,  -0.100568f,  -0.117861f, -0.161542f, -0.072561f,
+  -1.069871f, -0.470138f, -0.352578f,  -1.503513f, -0.001394f, -0.380109f,
+  0.065089f,  -0.281668f, 0.988953f,   -0.002778f, -0.659026f, -0.470692f,
+  -0.407292f, 0.011710f,  -1.362085f,  0.184738f,  -0.135786f, -1.374241f,
+  4.487930f,  -0.067274f, -0.956404f,  -0.233995f, 0.224527f,  -0.454556f,
+  0.037900f,  -0.281658f, 0.208224f,   -0.254753f, 0.045740f,  0.051444f,
+  -0.388281f, 0.257112f,  -0.485030f,  -0.082659f, 0.148103f,  -1.007456f,
+  -0.022295f, 0.036984f,  -0.369401f,  -0.076943f, -0.007636f, -0.293022f,
+  0.470466f,  0.199012f,  -2.158182f,  0.036577f,  -0.014725f, -0.229516f,
+  2.236929f,  0.030945f,  -0.400045f,  0.109348f,  0.214691f,  -0.891516f,
+  -0.251379f, -0.217358f, 0.013733f,   0.205573f,  -0.151725f, -0.191782f,
+  -0.339630f, -0.163905f, -0.119191f,  -0.032516f, 0.503015f,  0.025772f,
+  0.029094f,  -1.146153f, 0.216723f,   -0.330023f, 0.064695f,  -0.262521f,
+  0.425612f,  -0.093080f, -0.489648f,  1.051293f,  -0.092332f, 0.095557f,
+  -0.874132f, 0.218483f,  -0.127648f,  -1.605802f, 2.763617f,  -0.186734f,
+  -1.243166f, -0.193514f, -0.173748f,  0.337822f,  0.183873f,  -0.251594f,
+  -0.211582f, 0.144081f,  0.029620f,   -0.024853f, -0.385140f, 0.467341f,
+  -0.928316f, -0.195442f, 0.917783f,   0.357084f,  0.174445f,  -0.073659f,
+  -0.012811f, -0.115420f, -0.181147f,  -0.364449f, -0.567395f, -0.012969f,
+  -1.680714f, 0.065323f,  0.198063f,   -0.244201f, 1.428545f,  -0.432539f,
+  -0.208931f, -0.091205f, 0.957125f,   0.813519f,  -0.262677f, 0.246852f,
+  0.015536f,  0.055026f,  0.067054f,   0.262103f,  -0.358115f, -0.095206f,
+  -0.267522f, -0.402710f, -0.680397f,  -0.123627f, -0.385590f, -1.504680f,
+  -0.169513f, -0.215338f, 0.043633f,   -0.079052f, -0.464410f, 0.122894f,
+  -0.278231f, -2.456445f, -0.159917f,  -0.015597f, -0.735449f, -0.078854f,
+  -0.400290f, -1.153870f, 3.657228f,   -0.287093f, -1.174355f, -0.102001f,
+  -0.288281f, 0.185209f,  -0.145228f,  -0.200449f, -0.099914f, -0.138354f,
+  0.254428f,  -0.161751f, -0.118206f,  0.296043f,  -0.482613f, 0.080932f,
+  1.097605f,  -0.010190f, 0.232439f,   0.447617f,  -0.133508f, 0.115763f,
+  -0.388589f, 0.174695f,  -0.236014f,  0.006284f,  -1.374129f, 0.092015f,
+  -0.241419f, -0.231667f, 2.763950f,   -0.922932f, -0.061605f, 0.208740f,
+  -1.597190f, 1.353325f,  -0.198528f,  0.250498f,  -0.013950f, -0.203861f,
+  -0.254563f, 0.081931f,  -0.413369f,  0.011844f,  0.080961f,  -0.231161f,
+  -1.234909f, -0.440843f, -0.174980f,  -0.315283f, -0.337474f, -0.123243f,
+  -0.310001f, -0.271028f, 0.364179f,   0.022845f,  -0.535517f, -0.772936f,
+  -0.188435f, 0.039667f,  -0.807463f,  0.266550f,  -0.288857f, -1.630789f,
+  1.280155f,  0.065712f,  -0.279960f,  -0.300056f, 0.258440f,  -0.073781f,
+  0.213878f,  0.042196f,  0.021360f,   0.211698f,  -0.003751f, -0.192673f,
+  -0.137008f, 0.247878f,  -0.470604f,  0.073164f,  1.523241f,  0.734755f,
+  -0.114126f, -0.193834f, -0.025759f,  0.263183f,
+};
+
+static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+  -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f,
+  -0.148074f, 0.923430f,  -0.364770f, 0.203550f,  0.401216f,  0.938246f,
+  -0.872737f, 0.718723f,  0.703398f,  2.560015f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_64 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      64,  // num_hidden_nodes
+  },
+  {
+      av1_ab_partition_nn_weights_64_layer0,
+      av1_ab_partition_nn_weights_64_layer1,
+  },
+  {
+      av1_ab_partition_nn_bias_64_layer0,
+      av1_ab_partition_nn_bias_64_layer1,
+  },
+};
+
+// nn model for ab partition pruning, 32x32.
+static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = {
+  -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f,
+  -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f,
+  0.344916f,  -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f,
+  0.411575f,  -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f,
+  -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f,
+  0.225887f,  -0.000493f, 2.682241f,  0.871204f,  0.059014f,  0.803542f,
+  -1.407028f, -1.154669f, 1.388148f,  -0.293348f, -0.003669f, -0.009607f,
+  1.330030f,  -0.337841f, 2.118617f,  1.033059f,  -0.084788f, 0.212904f,
+  0.082405f,  -0.070579f, -0.494005f, -0.173392f, 0.039546f,  -0.463865f,
+  0.077163f,  -0.434066f, 0.030835f,  -0.427139f, -0.560520f, -0.031606f,
+  -0.368541f, -0.027458f, 0.370574f,  0.461418f,  1.087682f,  -0.572137f,
+  -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f,
+  -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f,  -0.156744f,
+  -0.267922f, 0.171216f,  0.110556f,  0.002954f,  -0.200327f, -0.187663f,
+  3.691601f,  1.234152f,  0.186315f,  -0.125370f, -0.211235f, -0.554432f,
+  -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f,  0.012896f,
+  -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f,  0.016307f,
+  0.384673f,  -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f,
+  -0.169709f, 0.421681f,  -0.033360f, -0.072817f, 0.003647f,  -0.110632f,
+  -0.158651f, -0.095136f, 0.223759f,  0.165767f,  -0.269129f, -0.196075f,
+  -0.023183f, -0.293420f, 0.014875f,  0.018688f,  -0.153407f, -0.172009f,
+  -0.259947f, -0.124015f, 0.173653f,  -0.089103f, -0.021001f, -0.334230f,
+  0.027177f,  0.103371f,  -0.183860f, -0.204051f, -0.023721f, -0.192297f,
+  -0.143771f, -0.247106f, 0.218116f,  -0.013240f, 2.831783f,  1.483928f,
+  -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f,  0.234684f,
+  -0.119150f, -0.075182f, -0.330463f, 0.071503f,  -0.254924f, -0.360071f,
+  -0.037022f, 0.063261f,  -0.148759f, -0.238254f, -0.462018f, -0.027166f,
+  0.065318f,  -0.235743f, -0.257194f, -0.094784f, 0.022423f,  0.055925f,
+  0.086672f,  -0.021010f, 0.009965f,  -0.001648f, -0.104917f, -0.387443f,
+  -0.102673f, -0.281706f, 0.145923f,  -0.233391f, -0.378365f, -0.145584f,
+  -0.077751f, -0.121166f, 1.134565f,  -0.097500f, -0.749202f, -0.544566f,
+  -1.361374f, -0.102494f, 1.089275f,  0.375299f,  -0.105091f, 0.037641f,
+  -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f,
+  -0.339326f, -0.128217f, -0.282905f, 0.014937f,  1.067185f,  -0.171764f,
+  0.484458f,  0.396706f,  -0.557055f, -0.891596f, -0.257839f, -0.720879f,
+  -0.218449f, -0.004755f, 1.572857f,  0.006229f,  1.962895f,  -0.029746f,
+  -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f,
+  -1.263078f, -0.304560f, 1.072374f,  2.556429f,  0.312850f,  0.257488f,
+  -0.634264f, 0.156769f,  -0.188943f, 0.040295f,  -0.389915f, 0.085250f,
+  -0.248525f, 0.045667f,  -0.776115f, -0.274680f, -0.448145f, -0.566161f,
+  -1.285316f, 0.079060f,  0.389124f,  -0.510401f, -0.015299f, -0.664661f,
+  0.099901f,  -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f,
+  -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f,
+  -0.064569f, -0.156516f, 0.543522f,  -0.005924f, 0.161432f,  0.974793f,
+  0.273712f,  1.104850f,  -0.290312f, 0.313417f,  -0.125370f, 0.136234f,
+  -0.191227f, -0.165054f, 0.011872f,  -0.298871f, 0.095740f,  0.142760f,
+  -0.215771f, -0.031437f, 0.101041f,  -0.085620f, 0.435387f,  0.002786f,
+  1.971375f,  0.018392f,  -1.771940f, -0.401433f, 0.808263f,  -3.350013f,
+  2.296952f,  -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f,
+  -0.276088f, -0.455907f, 0.266021f,  0.087348f,  -0.146566f, 0.040492f,
+  -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f,
+  -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f,  0.002185f,
+  -4.225019f, 0.344025f,  0.728796f,  -0.262936f, 1.383924f,  1.577300f,
+  -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f,
+  -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f,
+  0.031970f,  -0.373402f, -0.396079f, 0.045566f,  0.072595f,  -0.222681f,
+  -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f,
+  -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f,
+  -0.205936f, -0.316275f, 0.103729f,  -0.197893f, -0.128029f, -0.218796f,
+  -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f,
+  -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f,
+  -0.270504f, 0.234505f,  0.272144f,  0.266938f,  -0.392395f, -0.011717f,
+  -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f,
+  -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f,
+  -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f,
+  -0.119432f, -0.222351f, 0.000450f,  0.208724f,  -0.510526f, -0.144656f,
+  -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f,
+  0.043714f,  -0.235414f, 0.115594f,  -0.195616f, -0.106693f, -0.124242f,
+  0.083990f,  0.049110f,  -0.196130f, -0.059860f, -0.464235f, -0.516443f,
+  -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f,
+  -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f,
+  -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f,
+  -0.023459f, -0.222538f, 0.028849f,  -0.088038f, -0.301550f, -0.273566f,
+  0.067295f,  -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f,
+  -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f,
+  -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f,
+  -0.086147f, -0.430088f, 0.058466f,  -0.152129f, -0.058411f, -0.236392f,
+  -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f,
+  -0.324501f, 0.000490f,  -0.282167f, -0.073163f, -0.281452f, 0.047932f,
+  -0.175500f, 0.165220f,  -0.276212f, 0.062153f,  -0.217054f, -0.255487f,
+  -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f,
+  -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f,
+  -0.158325f, 0.151907f,  -0.266835f, -0.144697f, -0.193960f, -0.046587f,
+  -0.220028f, -0.247355f, 0.135584f,  0.016511f,  0.367705f,  -1.855877f,
+  0.435622f,  0.444710f,  -3.372301f, -3.030489f, 1.013267f,  0.380951f,
+  -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f,
+  -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f,
+  -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f,
+  -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f,
+  -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f,  1.189112f,
+  1.458468f,  -0.005876f, -0.927475f, 0.062038f,  -1.170818f, 0.338227f,
+  -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f,
+  -0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f,
+  -0.310094f, 0.062721f,  0.251422f,  -0.014350f, -1.282910f, 1.619560f,
+  1.180566f,  -0.032163f, -1.322951f, -0.603601f, 1.443710f,  0.654650f,
+  -0.393227f, 0.003536f,  0.029725f,  -0.108925f, -0.053911f, 0.133977f,
+  -0.036145f, -0.168438f, 0.046989f,  -0.331463f, -0.176983f, -0.311922f,
+  -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f,
+  -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f,  -0.032867f,
+  -0.039424f, -0.063670f, 0.193808f,  -0.303514f, -0.013376f, -0.057761f,
+  0.187922f,  0.006938f,  0.031810f,  0.180594f,  -1.198427f, 2.820662f,
+  0.154986f,  -0.375518f, 0.116925f,  -0.795782f, -0.085139f, -0.079365f,
+  -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f,
+  -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f,
+  -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f,
+  -2.453587f, -0.045568f, -0.296932f, 0.613061f,  -0.320284f, 0.191620f,
+  -0.827145f, -0.225277f, 0.275800f,  1.696635f,
+};
+
+static const float av1_ab_partition_nn_bias_32_layer0[64] = {
+  -0.176206f, 0.660189f,  -0.186156f, -2.481963f, -1.564218f, -0.280424f,
+  0.732684f,  -0.135581f, -2.193132f, -0.172771f, 0.605001f,  -0.060392f,
+  -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f,
+  0.632779f,  0.005585f,  1.310169f,  1.392136f,  -0.563860f, -0.051053f,
+  0.660998f,  -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f,
+  -0.177726f, 1.200859f,  -0.178902f, -0.172620f, -0.184476f, -0.175559f,
+  0.538503f,  -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f,
+  -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f,  -0.020116f,
+  -0.208096f, 0.000000f,  1.246166f,  -0.225421f, -0.181555f, 0.861761f,
+  1.172429f,  -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f,
+  -1.384604f, -0.201713f, -0.271948f, 0.372351f,
+};
+
+static const float av1_ab_partition_nn_weights_32_layer1[64 * 16] = {
+  -0.037828f,  1.529029f,  0.004927f,  1.475763f,  0.627172f,  0.325872f,
+  -0.990757f,  0.129476f,  0.889958f,  -0.082031f, 0.332133f,  0.074422f,
+  -0.176212f,  -0.074355f, 0.774378f,  0.110987f,  -0.155469f, 0.253310f,
+  0.882538f,   0.253605f,  0.332436f,  -5.389474f, 0.278470f,  0.168644f,
+  0.914611f,   0.154165f,  0.809262f,  -0.174734f, 0.923673f,  0.064716f,
+  -0.070228f,  -0.228735f, 0.002312f,  0.112222f,  -0.045502f, -0.046004f,
+  0.514101f,   0.306480f,  0.021232f,  -0.015955f, -0.288260f, 0.189177f,
+  -0.104158f,  0.103273f,  0.096910f,  -0.086328f, 1.327289f,  -0.154247f,
+  0.056676f,   -0.243327f, -0.646676f, 0.177221f,  -0.086761f, 0.729729f,
+  -14.710893f, -0.044881f, 0.339003f,  -0.134737f, 0.073621f,  -0.162913f,
+  1.215237f,   0.140723f,  0.138630f,  1.241719f,  0.204092f,  -0.463080f,
+  -0.176086f,  1.125868f,  1.034814f,  0.225455f,  -0.203421f, -0.078787f,
+  -0.527498f,  0.012491f,  -0.563307f, -0.170792f, 0.002679f,  0.116153f,
+  0.211348f,   -0.191900f, -0.212505f, 0.263445f,  -0.074679f, -0.081441f,
+  -0.815405f,  2.448215f,  0.781299f,  0.149542f,  -1.045162f, 0.043014f,
+  0.217381f,   -0.094500f, -0.090427f, 0.025784f,  -0.228906f, -2.741798f,
+  0.230475f,   -0.256112f, -0.103297f, 0.159121f,  -0.229793f, -0.014883f,
+  -0.104131f,  -0.123816f, 0.164148f,  -0.052279f, -0.071845f, -0.041197f,
+  0.208527f,   -0.234197f, -0.542336f, 0.020053f,  0.088870f,  0.014346f,
+  2.502164f,   -0.010244f, -0.267792f, 0.844394f,  2.711486f,  -0.015262f,
+  -0.868053f,  -0.295704f, 0.222289f,  -0.000286f, -0.352098f, -0.079000f,
+  0.021267f,   -0.721739f, -0.240558f, -0.384775f, 0.065974f,  -2.161058f,
+  0.195889f,   0.268966f,  -0.009329f, 0.014949f,  0.314943f,  0.235885f,
+  0.072591f,   -0.127120f, 0.150784f,  0.105697f,  -1.297403f, -0.207509f,
+  -0.217688f,  -0.076752f, 0.170952f,  -0.294235f, 0.449973f,  -1.712690f,
+  0.860989f,   0.054757f,  -0.812627f, -0.105316f, -0.736230f, -0.133192f,
+  -3.741608f,  0.495660f,  -0.288936f, 4.654852f,  -0.021305f, -0.308916f,
+  0.049205f,   -0.259996f, 0.114248f,  -0.252647f, -0.253180f, -0.449314f,
+  0.022979f,   0.063281f,  -0.196154f, 0.078295f,  -0.322317f, -0.145142f,
+  0.300573f,   0.048385f,  -0.254787f, 0.123939f,  -1.263088f, -0.228565f,
+  -0.389061f,  0.391084f,  2.322438f,  0.075009f,  0.225743f,  -0.198808f,
+  -0.280538f,  -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f,
+  -0.102756f,  -1.760965f, 0.019149f,  -0.867342f, 0.347141f,  0.031588f,
+  0.302572f,   -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f,
+  -0.108561f,  -0.167077f, -2.851509f, -0.307116f, 0.202720f,  -0.160280f,
+  -0.215525f,  0.064355f,  -0.427220f, 1.516230f,  0.634453f,  0.099400f,
+  -1.013887f,  -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f,
+  -0.160953f,  0.399036f,  -0.030685f, -0.113619f, -0.184704f, 0.040519f,
+  -0.588252f,  -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f,
+  -0.253959f,  -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f,
+  -0.175087f,  -0.055171f, 1.642014f,  -0.192559f, -0.288147f, 0.610311f,
+  4.688195f,   -0.128728f, -0.914869f, -0.108286f, 0.013789f,  0.092125f,
+  0.019770f,   -0.178386f, 0.074164f,  -1.152658f, -0.216738f, -0.277286f,
+  0.012381f,   0.418259f,  -0.680727f, -0.221917f, -0.485946f, 0.101672f,
+  2.009457f,   0.054302f,  1.019838f,  -0.116170f, 0.165134f,  -0.112567f,
+  0.852632f,   -0.385796f, -0.108666f, 0.053181f,  -0.311797f, -0.372875f,
+  -0.675717f,  2.409268f,  -0.514720f, -0.214245f, -0.646596f, 0.009756f,
+  0.203993f,   0.093617f,  -0.301290f, 0.253551f,  -0.128909f, -1.448442f,
+  -0.186823f,  -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f,
+  -0.212084f,  -0.137326f, 0.012505f,  0.087850f,  -0.200413f, -0.394119f,
+  -0.132224f,  0.146917f,  0.155746f,  0.198725f,  -0.322541f, 0.196391f,
+  -0.945500f,  0.036736f,  -0.155646f, -0.677341f, 1.130545f,  -0.339554f,
+  0.411628f,   -0.355813f, -0.249843f, 0.213694f,  -2.035607f, 0.055694f,
+  -0.111669f,  0.408696f,  -0.067043f, -0.048182f, 0.398110f,  -0.067542f,
+  1.459801f,   0.236833f,  -0.178806f, 0.168758f,  0.492387f,  0.099691f,
+  -0.776680f,  -0.172865f, 0.204225f,  0.193982f,  0.575685f,  -0.062248f,
+  0.011486f,   0.058571f,  -0.493391f, 0.026893f,  -0.900467f, 3.793129f,
+  -0.634613f,  -0.064660f, -0.048262f, 0.361905f,  0.033641f,  0.245171f,
+  -0.064671f,  0.034954f,  0.204358f,  -0.904023f, -0.052714f, -0.250134f,
+  0.136700f,   0.000734f,  -0.371720f, 0.226483f,  0.217958f,  0.060559f,
+  0.180111f,   0.000970f,  0.079556f,  -0.096775f, 0.093855f,  -0.026224f,
+  -0.243664f,  0.004290f,  0.123281f,  -0.239476f, 1.230374f,  -0.107826f,
+  -0.101982f,  -0.153917f, 5.464427f,  0.304375f,  -0.809957f, 0.090564f,
+  -0.278416f,  -0.245555f, -2.078421f, 0.243093f,  -0.127666f, 0.052451f,
+  -0.126662f,  -0.783505f, 0.025149f,  -1.422675f, -0.207769f, -0.362547f,
+  0.115310f,   0.133390f,  1.264754f,  -0.027055f, -0.485312f, -0.240717f,
+  -0.239722f,  0.146818f,  -1.265043f, -0.235553f, 0.267104f,  -0.021357f,
+  -0.435949f,  -0.309371f, 0.049920f,  1.302721f,  -0.233978f, -0.097551f,
+  -0.240631f,  -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f,
+  -0.029361f,  2.703590f,  -0.430659f, 0.067927f,  -0.387520f, -0.370630f,
+  -0.229236f,  0.085653f,  -0.370956f, -0.065556f, -0.187859f, 0.068309f,
+  -0.109299f,  -0.259898f, -0.103644f, -0.271199f, -0.209350f, 0.140993f,
+  -0.196713f,  -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f,
+  5.963707f,   -0.201157f, 0.726377f,  -0.011076f, 0.010553f,  -0.102918f,
+  -2.230088f,  -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f,
+  -0.094735f,  -1.381839f, 0.587298f,  -0.173048f, 0.721360f,  0.241900f,
+  0.764302f,   -0.023609f, -1.173755f, 0.103912f,  -0.185363f, 0.078435f,
+  -2.245062f,  -0.127269f, 0.202234f,  0.158975f,  -0.260909f, 0.098608f,
+  -0.348247f,  1.732502f,  -0.412298f, -0.269602f, -0.425771f, -0.146243f,
+  -0.530730f,  0.125716f,  -1.004419f, 0.145109f,  -0.059289f, 1.096304f,
+  0.012891f,   0.045033f,  -0.306875f, 0.003514f,  -0.176110f, 0.037544f,
+  -0.441537f,  -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f,
+  -0.128894f,  -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f,
+  1.173404f,   0.088312f,  -0.393568f, -0.175134f, 6.529819f,  -0.326652f,
+  -0.631917f,  -0.393476f, 0.057781f,  -0.217748f, -1.781139f, -0.012614f,
+  -0.212621f,  -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f,
+  -0.608744f,  -0.265146f, 0.238517f,  0.066882f,  -2.916806f, 0.054642f,
+  0.282590f,   0.075248f,  0.010188f,  -0.133486f, 0.985945f,  -0.045849f,
+  -0.347564f,  0.057320f,  -0.417920f, 0.063664f,  0.387062f,  -2.692059f,
+  -0.535549f,  0.263736f,  0.327889f,  -0.070273f, -0.775254f, 0.147250f,
+  3.309425f,   -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f,
+  0.022907f,   0.138421f,  -0.112159f, -0.288447f, -0.010799f, 0.056049f,
+  -0.036527f,  0.021525f,  0.106649f,  -0.291883f, 0.088424f,  -0.057773f,
+  -0.086031f,  0.015277f,  -0.318505f, -0.269049f, -1.008913f, -0.224785f,
+  -0.025820f,  -0.649037f, 0.706381f,  0.096410f,  0.643776f,  -0.046743f,
+  -0.009654f,  -0.024246f, 1.469255f,  -0.183536f, -0.370046f, -0.048442f,
+  -0.376527f,  -0.431264f, -0.245109f, -0.093951f, 0.203683f,  -0.099872f,
+  0.087210f,   0.160692f,  -3.527694f, -0.068891f, -0.228994f, -0.231817f,
+  -0.241949f,  0.193613f,  0.979597f,  -0.091259f, 0.414424f,  -0.047341f,
+  -0.209582f,  -0.295134f, -0.016824f, 0.460327f,  -0.072671f, 0.246234f,
+  0.235896f,   0.127238f,  -1.068683f, 0.035648f,  2.254888f,  0.180105f,
+  -0.260098f,  -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f,
+  -0.237916f,  0.031103f,  -0.274063f, -0.049384f, -0.044917f, 0.102477f,
+  -0.342148f,  -0.257558f, -0.346300f, 0.115333f,  -0.115456f, 0.208354f,
+  -0.359301f,  -0.167395f, 1.146514f,  -0.177861f, -0.098658f, -0.444570f,
+  6.759993f,   -0.369772f, -0.831118f, 0.001866f,  -0.073298f, -0.072095f,
+  0.811902f,   -0.431997f, -0.286587f, -0.269500f, 0.111492f,  -0.525364f,
+  -0.351785f,  -2.463474f, -1.852659f, 0.135325f,  0.138267f,  0.100643f,
+  -2.373278f,  -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f,
+  -0.716424f,  -0.031674f, 0.011147f,  0.057405f,  -0.215873f, -0.094401f,
+  0.573528f,   -1.223820f, 0.414852f,  -0.059053f, -0.076488f, -0.287168f,
+  -0.842640f,  0.174084f,  -0.567186f, 0.336629f,  -0.062514f, 2.075448f,
+  -0.061680f,  -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f,
+  -0.049616f,  -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f,
+  0.141501f,   -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f,
+  -1.521661f,  -0.122639f, -0.015760f, -0.718912f, 5.877828f,  0.146916f,
+  0.151767f,   0.220785f,  -0.032298f, 0.230902f,  0.663943f,  -0.252613f,
+  0.057718f,   -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f,
+  -1.031206f,  -0.104136f, 0.389897f,  0.127602f,  -2.667789f, -0.212366f,
+  -0.506262f,  -0.009115f, -0.213202f, 0.076167f,  -1.629405f, 0.055129f,
+  0.375393f,   -0.150272f, -0.241515f, -0.326497f, 0.100069f,  0.410703f,
+  0.340622f,   0.042437f,  -0.349945f, 0.041176f,  -1.178950f, 0.030992f,
+  0.933908f,   -0.035844f, -0.098660f, 1.030584f,  -0.092043f, -0.355739f,
+  -0.305562f,  0.036161f,  -0.049558f, -0.033225f, -0.403856f, -0.088276f,
+  0.215493f,   -0.149105f, -0.013363f, 0.025886f,  -0.101306f, -0.205781f,
+  -1.072487f,  -0.076019f, 0.077555f,  0.131003f,  1.267763f,  -0.008954f,
+  -0.327617f,  -0.246539f, 6.664081f,  -0.404403f, -1.442489f, 0.191301f,
+  -0.336361f,  0.181156f,  0.833108f,  0.007879f,  -0.194464f, -1.029408f,
+  -0.036268f,  -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f,
+  -0.065990f,  0.203160f,  -0.291788f, 0.000680f,  0.587011f,  -0.241289f,
+  0.037034f,   0.000552f,  1.072308f,  -0.387230f, -0.230050f, 0.292322f,
+  -0.720001f,  0.034109f,  -0.467260f, 2.211644f,  -1.839191f, -0.048797f,
+  -0.083469f,  -0.334686f, -0.269056f, 0.051295f,  1.319904f,  -0.035603f,
+  -0.018457f,  -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f,
+  -0.305469f,  -0.099011f, 0.014225f,  -0.452772f, 0.170331f,  -0.389312f,
+  -0.115084f,  -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f,
+  -0.125137f,  0.067228f,  -1.329271f, -0.117874f, -0.132499f, -0.218376f,
+  -0.588325f,  -0.320024f, 0.085695f,  -0.235047f, -0.217790f, 0.103015f,
+  -0.698644f,  0.017766f,  -0.058299f, 0.199411f,  -0.122485f, -0.563949f,
+  -0.349011f,  -0.557045f, -0.131165f, 0.002281f,  0.118559f,  -0.210302f,
+  -1.153815f,  0.116738f,  -0.236007f, -0.003487f, -0.006885f, -0.244816f,
+  0.953222f,   0.093748f,  0.266869f,  0.241869f,  -0.860832f, -0.387012f,
+  -0.338986f,  2.097515f,  -1.942512f, -0.298021f, 0.543911f,  -0.043214f,
+  0.082125f,   -0.120242f, 0.712231f,  0.213327f,  -0.301687f, -0.544011f,
+  -0.392131f,  0.004302f,  0.004825f,  -0.317440f, -0.107518f, -0.293407f,
+  -0.159111f,  -0.080367f, 0.132663f,  -0.017726f, -0.237521f, -0.190297f,
+  -0.361633f,  0.200518f,  -0.538296f, -0.027975f, -0.381704f, -0.016963f,
+  0.630105f,   -0.190997f, -0.287840f, -0.603488f, 3.605598f,  -0.276614f,
+  -1.346383f,  0.186912f,  -0.047575f, -0.189232f, -1.519072f, 0.097816f,
+  -0.223722f,  0.304924f,  -0.213022f, -1.052433f, -0.322283f, -1.706734f,
+  -2.458027f,  0.237976f,  0.171050f,  -0.103139f, -0.278689f, 0.329824f,
+  -0.262448f,  -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f,
+  0.091018f,   -0.386471f, -0.723940f, 0.064956f,  -0.057652f, 1.321024f,
+  -1.397418f,  -0.143136f, 0.272468f,  -0.030749f, 0.037324f,  0.069316f,
+  -0.904925f,  -0.333693f, -0.117709f, 2.279598f,  -0.428065f, -0.131157f,
+  -0.014288f,  -0.402862f, -0.666090f, 0.017070f,  -0.028333f, 0.002481f,
+  0.197156f,   -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f,
+  -0.905007f,  -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f,
+  -0.089948f,  -0.936827f, 1.437569f,  -0.388908f, 0.126170f,  0.186162f,
+  -0.018819f,  -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f,
+  -0.230436f,  -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f,
+  0.378157f,   0.113377f,  -0.850610f, 0.080245f,  -0.087305f, -0.002852f,
+  0.044408f,   -0.188172f, -1.891998f, 0.092189f,  0.125325f,  -0.105090f,
+  -0.848510f,  -0.396308f, -0.384130f, 2.007509f,  -1.480787f, -0.126946f,
+  0.314767f,   0.000195f,  -0.285628f, -0.110442f, -0.293948f, 0.258559f,
+  -0.417603f,  1.570705f,  0.092459f,  -0.340974f, -0.284754f, -0.007801f,
+  -0.324610f,  -0.004734f, -0.207716f, -0.057175f, 0.055467f,  -0.210830f,
+  -0.113005f,  -0.299177f, 0.068074f,  0.017929f,  -2.897598f, -0.260074f,
+  -0.014422f,  -0.206467f, 1.246997f,  -0.372863f, -0.214160f, -0.114035f,
+  5.805862f,   0.003611f,  -1.340990f, -0.021085f, -0.260431f, -0.002720f,
+  -1.251640f,  -0.353531f, -0.304009f, -0.153376f,
+};
+
+static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+  -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f,
+  0.001427f,  0.523607f,  0.225068f,  -0.055273f, 1.019519f,  1.181880f,
+  -0.010198f, 0.130597f,  1.276752f,  2.028188f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_32 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      64,  // num_hidden_nodes
+  },
+  {
+      av1_ab_partition_nn_weights_32_layer0,
+      av1_ab_partition_nn_weights_32_layer1,
+  },
+  {
+      av1_ab_partition_nn_bias_32_layer0,
+      av1_ab_partition_nn_bias_32_layer1,
+  },
+};
+
+// nn model for ab partition pruning, 16x16.
+static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = {
+  0.151902f,  0.007947f,  -1.788454f, 0.431869f,  -2.971387f, 0.923566f,
+  1.632542f,  -1.665136f, -0.338632f, -5.075884f, 0.398267f,  0.030467f,
+  2.263534f,  -0.045532f, -1.066128f, 0.915139f,  -0.560500f, -3.293125f,
+  2.072793f,  -1.011414f, 0.122716f,  -0.060169f, -0.388860f, 0.031019f,
+  -0.381861f, 0.001551f,  -0.328472f, 0.038296f,  -0.060398f, -0.375556f,
+  0.209226f,  0.014764f,  -1.443469f, -0.345486f, 2.409269f,  1.524846f,
+  -0.640666f, 1.322139f,  -2.074771f, -0.580944f, -0.203960f, -0.072893f,
+  0.329701f,  0.115339f,  -1.339542f, 0.249024f,  -0.421545f, -0.409151f,
+  -0.258293f, 0.836288f,  -0.073685f, -0.009624f, 0.895712f,  0.320639f,
+  0.451002f,  -1.544558f, 0.193709f,  -1.389012f, 1.305451f,  0.089795f,
+  0.050338f,  -0.017433f, -0.304667f, 0.500729f,  0.504346f,  0.073757f,
+  0.582649f,  -0.993623f, 1.766766f,  -3.067265f, -0.415774f, -0.006036f,
+  -1.245281f, 0.253205f,  -0.591245f, -0.626238f, 0.551852f,  0.593755f,
+  0.491023f,  1.099384f,  -0.348448f, 0.054564f,  -0.451422f, -0.375781f,
+  -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f,
+  -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f,
+  0.068066f,  -0.374920f, 0.057536f,  -0.189748f, 0.058375f,  -0.267749f,
+  -0.147286f, -0.246153f, 0.006183f,  -0.202029f, -0.059128f, 0.116852f,
+  0.134719f,  -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f,
+  -0.264499f, 0.155816f,  -0.107255f, -0.056983f, -0.209771f, -0.099070f,
+  0.007313f,  -0.254124f, -0.231964f, -0.275972f, 0.032098f,  -0.264564f,
+  -0.208743f, 0.155599f,  -0.121511f, -0.156145f, -0.162315f, -0.059788f,
+  -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f,
+  -0.154114f, 0.017032f,  -0.017364f, -0.233247f, 0.009918f,  -0.179289f,
+  -0.190722f, 0.147106f,  -0.063910f, -0.396872f, -0.263123f, -0.003850f,
+  -0.040718f, -0.324699f, 0.118660f,  -0.170727f, -0.316788f, 0.100886f,
+  -0.202842f, 0.045371f,  0.150561f,  -0.057054f, -0.308150f, 0.028346f,
+  -0.381473f, -0.195365f, 0.026221f,  -0.281795f, 0.087204f,  0.047689f,
+  -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f,
+  -0.340273f, 0.048086f,  0.046103f,  -0.121527f, 0.021697f,  0.054109f,
+  -0.002768f, -0.008461f, -2.297240f, 0.124651f,  3.621661f,  -0.057120f,
+  -1.151656f, 2.296894f,  -3.678720f, -0.290240f, 0.087683f,  -0.186389f,
+  0.007656f,  -0.090236f, -0.245217f, 0.110389f,  -0.251719f, -0.029084f,
+  -0.128203f, -0.100005f, -0.032779f, 0.007281f,  -0.366596f, -0.267870f,
+  -0.215620f, 0.047687f,  0.010303f,  0.097980f,  -0.191569f, -0.341162f,
+  0.119249f,  0.026279f,  -2.161546f, 0.459591f,  1.290566f,  1.791797f,
+  -0.409835f, 0.127081f,  -1.156367f, 0.198286f,  0.099561f,  -0.067445f,
+  -0.034352f, 0.017966f,  -0.277380f, -0.057220f, -0.174198f, -0.014164f,
+  0.146090f,  -0.357530f, 0.097644f,  -0.000932f, 0.446603f,  -0.066793f,
+  2.448620f,  0.937617f,  -1.232922f, 0.313183f,  0.816827f,  -0.275115f,
+  -0.245205f, -0.126895f, 0.156668f,  -0.186977f, -0.273505f, 0.013315f,
+  0.168629f,  -0.089084f, 0.006166f,  -0.116107f, -0.199316f, -0.024010f,
+  -0.242303f, 0.011612f,  -0.218485f, -0.229661f, -0.123922f, 0.136699f,
+  0.006732f,  -0.148718f, -0.164225f, 0.116063f,  1.587898f,  0.690519f,
+  0.360566f,  0.009739f,  -0.678702f, -0.046003f, 0.126984f,  0.605212f,
+  1.240663f,  -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f,
+  -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f,
+  -0.242255f, 0.137424f,  -0.307490f, -0.084637f, -0.023812f, -0.196582f,
+  -0.078695f, 0.038257f,  -0.012110f, -0.263521f, 0.009839f,  -0.109125f,
+  -0.226036f, 0.060712f,  0.093671f,  0.153143f,  0.039116f,  -0.290891f,
+  0.227057f,  -0.204633f, -0.207539f, -0.148242f, 0.046204f,  -0.231268f,
+  -0.209315f, -0.307579f, -0.436556f, 0.023475f,  0.131793f,  -0.038301f,
+  1.650584f,  0.392570f,  1.446576f,  1.254380f,  -0.516867f, -0.057116f,
+  0.149320f,  0.414424f,  -0.246309f, 0.003877f,  -0.480238f, -1.037035f,
+  -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f,  0.940609f,
+  -1.113370f, -0.018554f, 0.141064f,  -0.182504f, 1.270707f,  0.414904f,
+  -0.216036f, 0.203831f,  0.450716f,  -0.452909f, 0.139358f,  -0.027143f,
+  1.956892f,  1.643732f,  -0.867839f, -0.620520f, -0.334607f, -0.519982f,
+  0.205023f,  0.661159f,  -0.000809f, 0.049033f,  -0.348579f, -0.200338f,
+  -0.362144f, -0.346590f, -0.230096f, 0.180746f,  -0.149954f, -0.253429f,
+  -0.378170f, -0.040724f, -0.041597f, 0.243659f,  -0.472181f, 0.015401f,
+  -0.180376f, 0.153139f,  -0.247738f, -0.010485f, -0.157158f, 0.016825f,
+  -0.238925f, -0.265798f, -0.318374f, 0.142352f,  -0.210520f, 0.051928f,
+  -0.352190f, -0.179052f, -0.185498f, 0.025540f,  -0.111667f, -0.235187f,
+  -0.215454f, 0.010931f,  -0.238372f, -0.126659f, 0.075691f,  -0.091167f,
+  -2.462379f, -0.007950f, -0.637990f, 0.285554f,  -0.051275f, 0.282279f,
+  -0.744083f, -0.570646f, 0.592198f,  1.421332f,  -0.256027f, -0.140315f,
+  0.160247f,  -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f,
+  -0.071228f, 0.055864f,  -1.084764f, -0.263409f, 0.779266f,  0.228187f,
+  0.375013f,  0.121204f,  -0.656948f, 0.533561f,  0.272671f,  -0.015423f,
+  -0.124180f, -0.009127f, 2.934838f,  -0.150998f, 1.163152f,  0.081997f,
+  -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f,  0.024046f,
+  -1.451709f, 0.332558f,  0.990504f,  0.376290f,  -1.466773f, -0.448439f,
+  -2.929108f, -4.255188f, 0.065238f,  0.019950f,  1.372393f,  0.444052f,
+  -2.538772f, 1.579767f,  -0.464911f, -1.866114f, 1.053958f,  0.434467f,
+  -0.125964f, 0.034671f,  0.077116f,  -0.138466f, -0.413395f, -0.223453f,
+  -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f,  0.037459f,
+  -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f,
+  -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f,  -0.328036f,
+  -0.169790f, 0.036506f,  0.052572f,  -0.183570f, -0.073617f, -0.244959f,
+  0.266498f,  0.032846f,  -1.902106f, 0.486078f,  2.414993f,  0.975182f,
+  -0.382875f, 1.647810f,  -2.197017f, -0.890107f, 0.221287f,  0.010889f,
+  3.817042f,  0.572728f,  0.092466f,  0.473337f,  -1.634659f, -1.069455f,
+  1.486776f,  -1.023850f, 0.088184f,  0.008842f,  0.518202f,  0.270259f,
+  1.757191f,  -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f,
+  -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f,
+  -0.267836f, -0.319354f, -0.274975f, 0.068970f,  -0.406467f, 0.044074f,
+  -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f,  -0.177674f,
+  -0.090675f, -0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f,
+  -0.312272f, -0.222217f, -0.100548f, 0.106260f,  -0.034655f, 0.135109f,
+  -0.021276f, 0.018177f,  -0.353097f, -0.011128f, 0.061136f,  -0.511662f,
+  -0.223236f, -0.308841f, 0.118789f,  -0.154628f, -0.053178f, -0.055973f,
+  0.013175f,  -0.368337f, -0.090863f, -0.116920f, 0.178990f,  -0.025278f,
+  -0.190553f, -0.238092f, 0.303943f,  -0.024944f, 0.719373f,  0.384332f,
+  -0.378480f, -0.423316f, 0.709922f,  0.758514f,  -1.559023f, -2.503173f,
+  0.068652f,  -0.234741f, -0.182932f, 0.037878f,  0.020684f,  -0.174142f,
+  -0.182300f, -0.052796f, -0.219145f, 0.113028f,  -1.041826f, 0.035317f,
+  0.919904f,  -0.676011f, 0.652297f,  1.456447f,  -0.166904f, -0.861823f,
+  0.895827f,  0.429821f,  -0.180376f, -0.076587f, -0.273945f, -0.288990f,
+  -0.206692f, -0.080745f, -0.085444f, 0.186953f,  -0.050135f, 0.044243f,
+  -0.391706f, -0.160498f, -0.292268f, 0.164060f,  0.412649f,  0.211611f,
+  -0.327294f, -0.919399f, 0.320297f,  0.385284f,  -0.088848f, -0.072556f,
+  -0.384813f, -0.176267f, -0.065918f, 0.134724f,  -0.231104f, -0.337707f,
+  -0.195442f, -0.263569f, 0.098090f,  -0.341411f, -0.189211f, -0.439276f,
+  -0.404046f, 0.262491f,  -0.311093f, -0.086454f, -0.013400f, -0.061447f,
+  -0.026945f, -0.112036f, -0.322985f, 0.078500f,  -0.230205f, -0.344535f,
+  -0.021087f, 0.110220f,  -0.128671f, 0.044219f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer0[64] = {
+  2.936406f,  -0.396539f, -0.110456f, -1.254954f, 0.785350f,  0.516290f,
+  -0.172341f, 0.254386f,  -0.192465f, -0.106751f, -0.055518f, -0.094994f,
+  0.000000f,  -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f,
+  0.457446f,  -0.125051f, -0.107712f, 0.714607f,  -0.140809f, -1.788650f,
+  -0.087199f, 0.000000f,  -1.290050f, 0.443930f,  -0.110634f, -0.109380f,
+  -0.188213f, -1.414179f, 1.193579f,  0.388775f,  -0.873193f, -0.110050f,
+  -0.072565f, -0.117050f, -0.119132f, 0.456959f,  -0.132069f, 0.131974f,
+  1.160474f,  1.746465f,  0.442628f,  -0.188849f, -0.207794f, -0.108364f,
+  -0.856655f, -2.141620f, 0.335476f,  -0.105508f, -0.212162f, -0.109319f,
+  -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f,  -0.023908f,
+  0.123809f,  -0.109797f, 0.200510f,  -0.147542f,
+};
+
+static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = {
+  -6.823716f, 1.406568f,  -0.144009f, 2.228765f,  0.838336f,  0.738107f,
+  -0.319014f, -0.148756f, 0.240862f,  -0.111089f, -0.004241f, 0.025758f,
+  -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f,  0.252994f,
+  -0.289443f, 0.194932f,  0.057467f,  0.724735f,  0.014063f,  1.361352f,
+  0.025191f,  0.024274f,  0.231462f,  -7.227959f, -0.094515f, 0.039946f,
+  0.412719f,  0.812318f,  3.038903f,  -0.286289f, 0.647482f,  -0.115114f,
+  0.053590f,  0.066069f,  0.153134f,  0.996250f,  -0.125700f, 0.951365f,
+  -6.243494f, -4.827697f, 0.566320f,  0.239515f,  -0.099702f, 0.054546f,
+  1.847330f,  3.680076f,  -3.049829f, -0.127709f, 0.068469f,  -0.017794f,
+  0.223864f,  -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f,
+  -0.552073f, 0.043311f,  0.218668f,  0.033209f,  -3.199210f, 0.193079f,
+  0.321406f,  0.718307f,  -0.181418f, -0.459612f, -1.981170f, 0.968496f,
+  -0.029757f, -0.130065f, 0.043782f,  0.072394f,  -0.088686f, 0.025322f,
+  0.129882f,  0.101324f,  0.335707f,  0.072714f,  -2.079774f, 0.203997f,
+  0.239321f,  -0.301757f, 0.257845f,  1.288382f,  -0.031275f, -0.234194f,
+  0.310722f,  2.045469f,  0.034716f,  0.135638f,  -0.251388f, 0.320071f,
+  -1.065301f, -0.322731f, -0.545028f, 0.226276f,  0.090799f,  0.019289f,
+  0.048950f,  -1.079300f, 0.231938f,  0.083683f,  4.762127f,  0.145037f,
+  -0.145549f, 0.075592f,  0.172336f,  0.108175f,  0.333751f,  1.090501f,
+  1.056114f,  0.047073f,  0.182052f,  -0.081587f, 0.089900f,  0.339286f,
+  2.049988f,  0.073585f,  0.537355f,  -0.243322f, -0.010179f, -0.052601f,
+  -0.174915f, 0.117793f,  2.222990f,  -2.520837f, -0.092699f, 1.199887f,
+  0.138720f,  0.679918f,  -0.463155f, -0.659496f, -0.109913f, -0.003398f,
+  0.114633f,  -0.128377f, 0.092970f,  -0.107489f, -0.191078f, 0.185182f,
+  0.216980f,  -0.019343f, 3.443133f,  0.287953f,  0.099314f,  0.985958f,
+  0.157268f,  -0.606516f, 0.049418f,  -0.221809f, -0.453081f, -0.344796f,
+  -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f,
+  -1.011192f, 0.022795f,  0.186363f,  -0.076356f, -0.050932f, -0.165098f,
+  0.168177f,  -0.101596f, -5.270886f, 2.553943f,  -0.440870f, -0.017494f,
+  0.215208f,  -0.017032f, 1.495915f,  -4.304677f, 0.762211f,  0.182937f,
+  0.254406f,  -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f,
+  0.737697f,  -0.234989f, 0.168095f,  0.245118f,  -0.077262f, 0.195718f,
+  0.753302f,  -1.637869f, 0.126227f,  0.982129f,  -0.121444f, -0.295570f,
+  -1.215799f, 0.147867f,  -0.068496f, 0.132726f,  -0.005772f, -0.181774f,
+  0.126513f,  0.204723f,  -0.366123f, 0.103906f,  -0.148053f, -0.075272f,
+  0.243884f,  -0.104828f, 0.198988f,  0.501034f,  -0.112671f, 0.111421f,
+  0.167508f,  -0.117803f, -0.738624f, 2.046292f,  0.124011f,  0.057983f,
+  -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f,
+  0.122417f,  0.060291f,  -0.129033f, -0.843086f, 0.268241f,  -0.399927f,
+  1.585888f,  1.816393f,  -0.631427f, 0.127826f,  0.088105f,  0.073488f,
+  0.717694f,  -1.497362f, 2.608528f,  0.066896f,  -0.079230f, 0.223436f,
+  -0.010530f, 0.175310f,  1.120365f,  0.034391f,  0.835312f,  0.071652f,
+  -0.080615f, 0.111395f,  0.162742f,  0.079927f,  -3.859582f, -0.638431f,
+  -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f,  0.931940f,
+  -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f,  0.102793f,
+  -0.048546f, 0.063545f,  0.023864f,  -0.190863f, 1.934257f,  -0.136286f,
+  -0.107916f, -0.637468f, 0.066449f,  1.089693f,  -0.214047f, -0.265780f,
+  0.899660f,  -0.130333f, 0.288311f,  -0.049024f, 0.090202f,  0.487969f,
+  0.339704f,  0.858479f,  0.841253f,  -0.184100f, -0.637070f, -0.125071f,
+  -0.077650f, -0.087877f, 0.202268f,  -0.027300f, 2.842862f,  -0.100698f,
+  -0.259080f, 0.260556f,  0.157912f,  -0.070364f, 0.467190f,  1.200037f,
+  1.419317f,  -0.033588f, -0.227824f, 0.292617f,  0.228574f,  0.213839f,
+  -1.091099f, -0.022258f, -1.294681f, 0.136118f,  0.081652f,  -0.185359f,
+  -0.039706f, 0.191407f,  -2.053219f, -0.261934f, 0.047812f,  -0.029536f,
+  -0.823869f, -1.090534f, -0.755890f, 0.441035f,  -0.167945f, 0.231441f,
+  -0.135013f, -0.260762f, 0.256872f,  0.130339f,  -0.243751f, 0.189760f,
+  -0.288454f, 0.145363f,  0.338490f,  0.403898f,  -0.022814f, -1.263598f,
+  -0.101315f, 0.860135f,  0.136511f,  0.028942f,  0.574047f,  2.656370f,
+  0.037587f,  -0.188690f, -0.125312f, 1.100435f,  -1.080402f, 0.380905f,
+  0.004635f,  0.097144f,  -0.214309f, 0.085552f,  -0.285066f, -0.705134f,
+  -0.054704f, -0.319951f, 5.486626f,  0.958158f,  -1.380585f, 0.223340f,
+  -0.169167f, -0.170697f, -0.216748f, 0.324232f,  2.684204f,  -0.008490f,
+  -0.211052f, -0.201190f, 0.123466f,  -0.000234f, 0.579907f,  0.096938f,
+  -0.042745f, 0.201855f,  0.157195f,  -0.261440f, 0.029699f,  -0.046599f,
+  1.618216f,  -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f,
+  0.579699f,  -0.100392f, 0.150694f,  0.061794f,  0.200425f,  -0.062515f,
+  -0.179122f, 0.250112f,  -0.344675f, -0.118359f, -0.095670f, 0.152311f,
+  3.662276f,  -0.154921f, -0.312991f, 0.972008f,  -0.308596f, -0.190426f,
+  0.133889f,  -0.238673f, -0.094726f, 1.683835f,  -0.215629f, -0.198890f,
+  -0.035278f, -0.367973f, -0.822435f, 0.240848f,  -0.194656f, 0.034655f,
+  -0.079424f, 0.146670f,  0.026646f,  -0.034507f, 0.059467f,  -0.153109f,
+  -0.431033f, 2.552991f,  -1.894091f, -0.180462f, -0.306839f, -0.025648f,
+  1.026326f,  -3.096230f, 1.346935f,  0.033633f,  -0.181827f, 0.094376f,
+  0.001696f,  -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f,
+  0.281795f,  -0.127251f, 0.180776f,  0.067763f,  0.697124f,  -1.040779f,
+  0.111280f,  0.188351f,  -0.340234f, -0.207790f, -0.720075f, -0.137409f,
+  -0.070310f, -0.032918f, -0.060787f, 0.131484f,  -0.077845f, -0.258652f,
+  0.056911f,  -0.062034f, 0.007663f,  -0.185100f, 1.340361f,  0.014096f,
+  -0.124602f, 0.194241f,  0.128383f,  0.360465f,  0.082979f,  -0.050475f,
+  -0.519294f, 3.323262f,  0.067014f,  0.221203f,  -0.085082f, -0.228606f,
+  -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f,
+  -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f,  1.790253f,
+  -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f,
+  2.251166f,  -0.146007f, 0.138527f,  -0.003134f, 0.103665f,  0.006928f,
+  -0.240253f, -0.227464f, 0.578437f,  -0.214724f, 0.503085f,  0.158093f,
+  0.033091f,  0.008061f,  4.815371f,  2.132264f,  0.281850f,  -2.288560f,
+  -0.145012f, 1.296832f,  -0.362401f, -0.403252f, 0.109873f,  0.185746f,
+  0.244764f,  0.172367f,  -0.185588f, 0.139801f,  -0.178254f, 0.068629f,
+  0.358488f,  -0.153969f, -6.433524f, 0.225983f,  -0.138123f, -0.095971f,
+  -0.036089f, -1.400083f, 0.265908f,  0.257787f,  0.181144f,  -1.647228f,
+  -0.136289f, -0.074206f, 0.122988f,  -0.088895f, -1.266717f, 0.006010f,
+  0.536681f,  0.263061f,  -0.032207f, -0.155136f, 0.086431f,  0.441950f,
+  -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f,  0.117667f,
+  -0.000408f, 0.225719f,  -2.199698f, 0.141447f,  -1.459051f, 0.051315f,
+  0.203228f,  0.354432f,  -0.005775f, -0.028073f, -0.965817f, 0.231083f,
+  -0.666884f, 0.026283f,  -0.317486f, 0.210754f,  0.123897f,  0.223827f,
+  4.214405f,  1.457334f,  -0.253945f, -1.306733f, -0.391235f, 0.451154f,
+  -1.553888f, -0.353429f, 0.069533f,  0.159278f,  -0.173836f, -0.004952f,
+  -0.137033f, 0.127012f,  0.143600f,  0.051587f,  -0.070549f, 0.066509f,
+  -5.776547f, 0.180021f,  -0.189183f, -1.288504f, -0.233575f, -1.473873f,
+  0.140940f,  0.144451f,  -0.104534f, 2.089873f,  -0.168168f, 0.110726f,
+  0.132134f,  -0.215223f, -1.682754f, 0.157757f,  -0.146163f, 0.064882f,
+  0.117313f,  -0.038780f, -0.124720f, -0.501697f, 0.092047f,  -0.233992f,
+  3.324976f,  0.516601f,  1.294202f,  0.119989f,  0.061055f,  0.043420f,
+  -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f,
+  -0.282998f, -0.282705f, 0.073798f,  0.169851f,  0.135651f,  0.182677f,
+  -0.040220f, 0.132462f,  -0.303120f, -0.230113f, 6.165739f,  -0.258596f,
+  0.024127f,  -1.388283f, -0.006042f, 0.572600f,  0.348411f,  -0.387376f,
+  -0.075845f, 0.122319f,  -0.029616f, 0.077873f,  0.154763f,  0.049073f,
+  0.018597f,  0.102688f,  -0.204165f, 0.020734f,  -1.389133f, -0.032854f,
+  -0.147561f, 0.853944f,  0.132100f,  -3.259659f, 0.243745f,  0.181529f,
+  -0.738414f, 1.509994f,  0.023470f,  -0.005329f, 0.066115f,  -1.345081f,
+  -1.455402f, -0.172023f, -0.194625f, 0.071885f,  -0.201742f, -0.262402f,
+  0.077601f,  -0.048938f, 0.257993f,  -0.504029f, -2.032415f, 1.158880f,
+  0.448647f,  -0.025633f, 0.117586f,  -0.072275f, -0.673744f, -3.854342f,
+  -0.983843f, 0.047766f,  -0.017193f, -0.215775f, -0.158743f, -0.232042f,
+  -0.509112f, 0.148812f,  0.130122f,  0.006486f,  -0.099016f, 0.022514f,
+  -0.486850f, -0.059623f, 4.012731f,  0.025454f,  0.029059f,  -0.783546f,
+  -0.295260f, 0.322521f,  -0.473201f, -0.172100f, -0.100087f, -0.076516f,
+  -0.258367f, -0.112897f, 0.269364f,  -0.065912f, 0.169022f,  -0.178783f,
+  -0.095114f, 0.122089f,  -2.790099f, -0.100431f, -0.087963f, -0.009431f,
+  -0.087819f, -2.774399f, -0.100757f, 0.013005f,  -0.964533f, 3.236665f,
+  -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f,
+  -1.799262f, -0.365269f, 0.108611f,  0.037994f,  0.024747f,  -1.073639f,
+  -0.203158f, -0.935006f, 1.880891f,  1.578385f,  0.726272f,  -0.024546f,
+  -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f,  0.075451f,
+  0.182899f,  0.092215f,  -0.207347f, -0.030111f, 0.054316f,  0.192481f,
+  0.594639f,  -0.247694f, 0.547471f,  -0.032094f, -0.065000f, 0.007198f,
+  1.605377f,  -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f,
+  0.919365f,  0.599980f,  0.125545f,  0.265813f,  0.246884f,  0.095385f,
+  -0.260374f, -0.202916f, -0.042770f, 0.234967f,  -0.233139f, -0.326994f,
+  -1.375256f, 0.121766f,  0.077433f,  -1.103569f, 0.019497f,  -1.029185f,
+  0.253905f,  0.206569f,  0.187334f,  -0.237089f, -0.294351f, 0.164137f,
+  0.149696f,  -0.749787f, -0.413433f, 0.976587f,  1.027976f,  -0.285264f,
+  0.209273f,  -0.124762f, 0.050884f,  0.250764f,  -0.082031f, -0.646520f,
+  4.116680f,  0.437336f,  0.671684f,  0.129509f,  -0.078462f, 0.014072f,
+  -0.678232f, 0.094831f,  1.125624f,  0.207070f,  -0.154750f, -0.025780f,
+  -0.103030f, 0.118019f,  -0.908186f, -0.263546f, -1.555324f, -0.236887f,
+  -0.217854f, -0.051790f, 0.017915f,  0.171001f,  1.355562f,  0.094603f,
+  -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f,
+  -0.298901f, 0.038162f,  0.251899f,  0.039612f,  -0.022935f, -0.232308f,
+  -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f,
+  -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f,
+  0.185269f,  1.465082f,  0.040240f,  0.112665f,  0.144329f,  -0.286112f,
+  -0.617649f, 0.916177f,  0.221044f,  -0.079867f, 0.170251f,  -0.093638f,
+  -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f,  1.241179f,
+  0.355922f,  -0.170848f, -0.189168f, 0.080225f,  -1.357793f, 0.190890f,
+  0.976800f,  -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f,
+  -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f,
+  -0.049715f, -0.178005f, 3.029985f,  -1.141546f, 0.080066f,  -1.932316f,
+  -0.641137f, -0.189564f, 0.935080f,  0.136119f,  0.015558f,  -0.179331f,
+  0.204571f,  0.020350f,  0.009362f,  0.108478f,  0.037076f,  -0.049009f,
+  0.081090f,  -0.180202f, 1.455561f,  -0.081559f, 0.059361f,  0.484971f,
+  0.160923f,  -2.170744f, -0.013204f, 0.126561f,  -0.407122f, 1.223661f,
+  0.044262f,  0.118044f,  0.058274f,  -1.747100f, -0.171318f, 0.971374f,
+  0.306995f,  -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f,
+  -0.106479f, -0.907933f, 1.121231f,  1.673840f,  -0.421458f, -0.021146f,
+  -0.254838f, 0.097632f,  0.235109f,  -2.901782f, 0.289518f,  -0.355459f,
+  -0.068264f, -0.179121f, 0.068560f,  -0.047570f, -0.522523f, -0.228963f,
+  -1.037158f, -0.163723f, 0.280563f,  -0.000868f, -0.197220f, -0.239329f,
+  1.985274f,  -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f,
+  -0.792024f, -0.114290f, 0.060969f,  0.104106f,  -0.252123f, -0.150400f,
+  -0.133277f, 0.267147f,  0.274413f,  0.223744f,  -0.180223f, -0.345415f,
+  -0.104883f, 0.119210f,  -0.095041f, -0.301635f, 0.013175f,  -2.128121f,
+  -0.147208f, -0.151509f, -0.692013f, 3.418555f,  -0.016541f, 0.171511f,
+  0.107159f,  -1.516672f, 0.127408f,  0.687035f,  -0.906486f, -0.145463f,
+  -0.169382f, -0.143906f, 0.125091f,  -0.960645f, -0.180869f, -0.716908f,
+  2.840951f,  1.904919f,  -0.416268f, -0.425181f, -0.194697f, -0.075932f,
+  -0.950604f, -1.599800f, 0.943671f,  -0.022744f, -0.270492f, 0.080843f,
+  -0.372916f, 0.047838f,  -0.100300f, -0.026600f, 0.011733f,  -0.226051f,
+  0.172790f,  -0.172982f, 0.041258f,  -0.299379f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+  -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f,
+  0.748430f,  0.203096f,  0.059317f, 0.418219f,  0.841294f,  0.402693f,
+  -0.658522f, 0.723479f,  0.544264f, 1.035225f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_16 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+      64,  // num_hidden_nodes
+  },
+  {
+      av1_ab_partition_nn_weights_16_layer0,
+      av1_ab_partition_nn_weights_16_layer1,
+  },
+  {
+      av1_ab_partition_nn_bias_16_layer0,
+      av1_ab_partition_nn_bias_16_layer1,
+  },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
index 054b0e062..c5a6bc831 100644
--- a/third_party/aom/av1/encoder/aq_complexity.c
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -39,21 +39,29 @@ static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
   { -3.0, -2.0, -1.0, 100.00, 100.0 }
 };
 
-#define DEFAULT_COMPLEXITY 64
-
 static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
   // Approximate base quatizer (truncated to int)
-  const int base_quant = av1_ac_quant(q_index, 0, bit_depth) / 4;
+  const int base_quant = av1_ac_quant_Q3(q_index, 0, bit_depth) / 4;
   return (base_quant > 10) + (base_quant > 25);
 }
 
 void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   struct segmentation *const seg = &cm->seg;
+  int resolution_change =
+      cm->prev_frame && (cm->width != cm->prev_frame->width ||
+                         cm->height != cm->prev_frame->height);
 
   // Make SURE use of floating point in this function is safe.
   aom_clear_system_state();
 
+  if (resolution_change) {
+    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    av1_clearall_segfeatures(seg);
+    av1_disable_segmentation(seg);
+    return;
+  }
+
   if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
@@ -74,9 +82,6 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
 
     av1_enable_segmentation(seg);
 
-    // Select delta coding method.
-    seg->abs_delta = SEGMENT_DELTADATA;
-
     // Default segment "Q" feature is disabled so it defaults to the baseline Q.
     av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
 
@@ -107,13 +112,13 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
 
 #define DEFAULT_LV_THRESH 10.0
 #define MIN_DEFAULT_LV_THRESH 8.0
-#define VAR_STRENGTH_STEP 0.25
 // Select a segment for the current block.
 // The choice of segment for a block depends on the ratio of the projected
 // bits for the block vs a target average and its spatial complexity.
 void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
                             int mi_row, int mi_col, int projected_rate) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
 
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
   const int xmis = AOMMIN(cm->mi_cols - mi_col, mi_size_wide[bs]);
@@ -126,9 +131,10 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
     segment = DEFAULT_AQ2_SEG;
   } else {
     // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
-    // It is converted to bits * 256 units.
-    const int64_t num = (int64_t)cpi->rc.sb64_target_rate * xmis * ymis * 256;
-    const int denom = cm->mib_size * cm->mib_size;
+    // It is converted to bits << AV1_PROB_COST_SHIFT units.
+    const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis)
+                        << AV1_PROB_COST_SHIFT;
+    const int denom = cm->seq_params.mib_size * cm->seq_params.mib_size;
     const int target_rate = (int)(num / denom);
     double logvar;
     double low_var_thresh;
@@ -139,7 +145,7 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
                                                     MIN_DEFAULT_LV_THRESH)
                                            : DEFAULT_LV_THRESH;
 
-    av1_setup_src_planes(mb, cpi->source, mi_row, mi_col);
+    av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes);
     logvar = av1_log_block_var(cpi, mb, bs);
 
     segment = AQ_C_SEGMENTS - 1;  // Just in case no break out below.
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
index 8f61c7eb8..a1fe37d4a 100644
--- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -320,7 +320,7 @@ void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) {
   double fraction_low = 0.0;
   int low_content_frame = 0;
 
-  MODE_INFO **mi;
+  MB_MODE_INFO **mi;
   RATE_CONTROL *const rc = &cpi->rc;
   const int rows = cm->mi_rows, cols = cm->mi_cols;
   int cnt1 = 0, cnt2 = 0;
@@ -330,12 +330,12 @@ void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) {
     mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
 
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0
-                            ? mi[0]->mbmi.mv[0].as_mv.row
-                            : -1 * mi[0]->mbmi.mv[0].as_mv.row;
-      int16_t abs_mvc = mi[0]->mbmi.mv[0].as_mv.col >= 0
-                            ? mi[0]->mbmi.mv[0].as_mv.col
-                            : -1 * mi[0]->mbmi.mv[0].as_mv.col;
+      int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0
+                            ? mi[0]->mv[0].as_mv.row
+                            : -1 * mi[0]->mv[0].as_mv.row;
+      int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0
+                            ? mi[0]->mv[0].as_mv.col
+                            : -1 * mi[0]->mv[0].as_mv.col;
 
       // Calculate the motion of the background.
       if (abs_mvr <= 16 && abs_mvc <= 16) {
@@ -389,8 +389,10 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
   int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
   int xmis, ymis, x, y;
   memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
-  sb_cols = (cm->mi_cols + cm->mib_size - 1) / cm->mib_size;
-  sb_rows = (cm->mi_rows + cm->mib_size - 1) / cm->mib_size;
+  sb_cols =
+      (cm->mi_cols + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size;
+  sb_rows =
+      (cm->mi_rows + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size;
   sbs_in_frame = sb_cols * sb_rows;
   // Number of target blocks to get the q delta (segment 1).
   block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
@@ -406,8 +408,8 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
     // Get the mi_row/mi_col corresponding to superblock index i.
     int sb_row_index = (i / sb_cols);
     int sb_col_index = i - sb_row_index * sb_cols;
-    int mi_row = sb_row_index * cm->mib_size;
-    int mi_col = sb_col_index * cm->mib_size;
+    int mi_row = sb_row_index * cm->seq_params.mib_size;
+    int mi_col = sb_col_index * cm->seq_params.mib_size;
     int qindex_thresh =
         cpi->oxcf.content == AOM_CONTENT_SCREEN
             ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
@@ -416,14 +418,14 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
     assert(mi_col >= 0 && mi_col < cm->mi_cols);
     bl_index = mi_row * cm->mi_cols + mi_col;
     // Loop through all MI blocks in superblock and update map.
-    xmis = AOMMIN(cm->mi_cols - mi_col, cm->mib_size);
-    ymis = AOMMIN(cm->mi_rows - mi_row, cm->mib_size);
+    xmis = AOMMIN(cm->mi_cols - mi_col, cm->seq_params.mib_size);
+    ymis = AOMMIN(cm->mi_rows - mi_row, cm->seq_params.mib_size);
     for (y = 0; y < ymis; y++) {
       for (x = 0; x < xmis; x++) {
         const int bl_index2 = bl_index + y * cm->mi_cols + x;
         // If the block is as a candidate for clean up then mark it
         // for possible boost/refresh (segment 1). The segment id may get
-        // reset to 0 later if block gets coded anything other than ZEROMV.
+        // reset to 0 later if block gets coded anything other than GLOBALMV.
         if (cr->map[bl_index2] == 0) {
           if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++;
         } else if (cr->map[bl_index2] < 0) {
@@ -479,6 +481,16 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   struct segmentation *const seg = &cm->seg;
   const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
+  int resolution_change =
+      cm->prev_frame && (cm->width != cm->prev_frame->width ||
+                         cm->height != cm->prev_frame->height);
+  if (resolution_change) {
+    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    av1_clearall_segfeatures(seg);
+    aom_clear_system_state();
+    av1_disable_segmentation(seg);
+    return;
+  }
   if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
   // Don't apply refresh on key frame or enhancement layer frames.
   if (!apply_cyclic_refresh || cm->frame_type == KEY_FRAME) {
@@ -509,8 +521,6 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
     // Clear down the segment map.
     av1_enable_segmentation(&cm->seg);
     av1_clearall_segfeatures(seg);
-    // Select delta coding method.
-    seg->abs_delta = SEGMENT_DELTADATA;
 
     // Note: setting temporal_update has no effect, as the seg-map coding method
     // (temporal or spatial) is determined in
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
index 84d967215..29a311447 100644
--- a/third_party/aom/av1/encoder/aq_variance.c
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -19,6 +19,7 @@
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/segmentation.h"
+#include "av1/encoder/dwt.h"
 #include "aom_ports/system_state.h"
 
 #define ENERGY_MIN (-4)
@@ -34,10 +35,8 @@ static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
 #define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
 
 DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
-#if CONFIG_HIGHBITDEPTH
 DECLARE_ALIGNED(16, static const uint16_t,
                 av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
-#endif
 
 unsigned int av1_vaq_segment_id(int energy) {
   ENERGY_IN_BOUNDS(energy);
@@ -49,6 +48,16 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
   struct segmentation *seg = &cm->seg;
   int i;
 
+  int resolution_change =
+      cm->prev_frame && (cm->width != cm->prev_frame->width ||
+                         cm->height != cm->prev_frame->height);
+  if (resolution_change) {
+    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    av1_clearall_segfeatures(seg);
+    aom_clear_system_state();
+    av1_disable_segmentation(seg);
+    return;
+  }
   if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
@@ -57,8 +66,6 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
     av1_enable_segmentation(seg);
     av1_clearall_segfeatures(seg);
 
-    seg->abs_delta = SEGMENT_DELTADATA;
-
     aom_clear_system_state();
 
     for (i = 0; i < MAX_SEGMENTS; ++i) {
@@ -74,11 +81,6 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
         qindex_delta = -cm->base_qindex + 1;
       }
 
-      // No need to enable SEG_LVL_ALT_Q for this segment.
-      if (rate_ratio[i] == 1.0) {
-        continue;
-      }
-
       av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
       av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
     }
@@ -108,7 +110,6 @@ static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
                                  const uint8_t *b8, int b_stride, int w, int h,
                                  uint64_t *sse, uint64_t *sum) {
@@ -139,7 +140,6 @@ static void aq_highbd_8_variance(const uint8_t *a8, int a_stride,
   *sse = (unsigned int)sse_long;
   *sum = (int)sum_long;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
                                    BLOCK_SIZE bs) {
@@ -154,7 +154,6 @@ static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
     const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
     const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
     int avg;
-#if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
                            CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh,
@@ -165,14 +164,9 @@ static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
       aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0,
                   bw, bh, &sse, &avg);
     }
-#else
-    aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0,
-                bw, bh, &sse, &avg);
-#endif  // CONFIG_HIGHBITDEPTH
     var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh));
     return (unsigned int)((uint64_t)var * 256) / (bw * bh);
   } else {
-#if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       var =
           cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
@@ -181,10 +175,6 @@ static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
       var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
                                av1_all_zeros, 0, &sse);
     }
-#else
-    var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
-                             av1_all_zeros, 0, &sse);
-#endif  // CONFIG_HIGHBITDEPTH
     return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
   }
 }
@@ -205,3 +195,53 @@ int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
   energy = av1_log_block_var(cpi, x, bs) - energy_midpoint;
   return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
 }
+
+unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int stride = x->plane[0].src.stride;
+  uint8_t *buf = x->plane[0].src.buf;
+  const int bw = MI_SIZE * mi_size_wide[bs];
+  const int bh = MI_SIZE * mi_size_high[bs];
+  int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+
+  int var = 0;
+  for (int r = 0; r < bh; r += 8)
+    for (int c = 0; c < bw; c += 8) {
+      var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd);
+    }
+
+  return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
+}
+
+double av1_log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  unsigned int haar_sad = haar_ac_energy(x, bs);
+  aom_clear_system_state();
+  return log(haar_sad + 1.0);
+}
+
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs) {
+  double energy, energy_midpoint;
+  aom_clear_system_state();
+  energy_midpoint = (cpi->oxcf.pass == 2) ? cpi->twopass.frame_avg_haar_energy
+                                          : DEFAULT_E_MIDPOINT;
+  energy = av1_log_block_wavelet_energy(x, bs) - energy_midpoint;
+  return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
+}
+
+int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
+                                         int block_var_level) {
+  ENERGY_IN_BOUNDS(block_var_level);
+
+  const int rate_level = SEGMENT_ID(block_var_level);
+  const AV1_COMMON *const cm = &cpi->common;
+  int qindex_delta =
+      av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+                                 rate_ratio[rate_level], cm->bit_depth);
+
+  if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+    qindex_delta = -cm->base_qindex + 1;
+  }
+
+  return qindex_delta;
+}
diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h
index 05725c5de..b1a8bc38a 100644
--- a/third_party/aom/av1/encoder/aq_variance.h
+++ b/third_party/aom/av1/encoder/aq_variance.h
@@ -23,6 +23,10 @@ void av1_vaq_frame_setup(AV1_COMP *cpi);
 
 int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
 double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
+                                         int block_var_level);
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/encoder/arm/neon/error_neon.c b/third_party/aom/av1/encoder/arm/neon/error_neon.c
deleted file mode 100644
index fe5233f89..000000000
--- a/third_party/aom/av1/encoder/arm/neon/error_neon.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./av1_rtcd.h"
-
-int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
-                                int block_size) {
-  int64x2_t error = vdupq_n_s64(0);
-
-  assert(block_size >= 8);
-  assert((block_size % 8) == 0);
-
-  do {
-    const int16x8_t c = vld1q_s16(coeff);
-    const int16x8_t d = vld1q_s16(dqcoeff);
-    const int16x8_t diff = vsubq_s16(c, d);
-    const int16x4_t diff_lo = vget_low_s16(diff);
-    const int16x4_t diff_hi = vget_high_s16(diff);
-    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
-    // accumulating them in 64-bits.
-    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
-    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
-    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
-    error = vaddq_s64(error, err2);
-    coeff += 8;
-    dqcoeff += 8;
-    block_size -= 8;
-  } while (block_size != 0);
-
-  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
-}
diff --git a/third_party/aom/av1/common/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
index c9c7f437e..b92b3469f 100644
--- a/third_party/aom/av1/common/av1_fwd_txfm1d.c
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
@@ -10,16 +10,16 @@
  */
 
 #include <stdlib.h>
-#include "aom_dsp/inv_txfm.h"
-#include "av1/common/av1_fwd_txfm1d.h"
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "av1/encoder/av1_fwd_txfm1d.h"
 
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
 void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf,
                       int32_t size, int8_t bit);
 
 #define range_check(stage, input, buf, size, bit) \
   range_check_func(stage, input, buf, size, bit)
-#else
+#else  // CONFIG_COEFFICIENT_RANGE_CHECKING
+
 #define range_check(stage, input, buf, size, bit) \
   {                                               \
     (void)stage;                                  \
@@ -28,10 +28,9 @@ void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf,
     (void)size;                                   \
     (void)bit;                                    \
   }
-#endif
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
 
-// TODO(angiebird): Make 1-d txfm functions static
-void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
   const int32_t size = 4;
   const int32_t *cospi;
@@ -54,13 +53,13 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
-  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
@@ -74,7 +73,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
   range_check(stage, input, bf1, size, stage_range[stage]);
 }
 
-void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
   const int32_t size = 8;
   const int32_t *cospi;
@@ -101,7 +100,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0] + bf0[3];
@@ -109,20 +108,20 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
   bf1[2] = -bf0[2] + bf0[1];
   bf1[3] = -bf0[3] + bf0[0];
   bf1[4] = bf0[4];
-  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
   bf1[7] = bf0[7];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = step;
   bf1 = output;
-  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
   bf1[4] = bf0[4] + bf0[5];
   bf1[5] = -bf0[5] + bf0[4];
   bf1[6] = -bf0[6] + bf0[7];
@@ -131,17 +130,17 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
@@ -159,8 +158,8 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
   range_check(stage, input, bf1, size, stage_range[stage]);
 }
 
-void av1_fdct16_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
   const int32_t size = 16;
   const int32_t *cospi;
 
@@ -194,7 +193,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output,
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0] + bf0[7];
@@ -207,17 +206,17 @@ void av1_fdct16_new(const int32_t *input, int32_t *output,
   bf1[7] = -bf0[7] + bf0[0];
   bf1[8] = bf0[8];
   bf1[9] = bf0[9];
-  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0] + bf0[3];
@@ -225,8 +224,8 @@ void av1_fdct16_new(const int32_t *input, int32_t *output,
   bf1[2] = -bf0[2] + bf0[1];
   bf1[3] = -bf0[3] + bf0[0];
   bf1[4] = bf0[4];
-  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
   bf1[7] = bf0[7];
   bf1[8] = bf0[8] + bf0[11];
   bf1[9] = bf0[9] + bf0[10];
@@ -240,40 +239,40 @@ void av1_fdct16_new(const int32_t *input, int32_t *output,
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
-  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
   bf1[4] = bf0[4] + bf0[5];
   bf1[5] = -bf0[5] + bf0[4];
   bf1[6] = -bf0[6] + bf0[7];
   bf1[7] = bf0[7] + bf0[6];
   bf1[8] = bf0[8];
-  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
   bf1[11] = bf0[11];
   bf1[12] = bf0[12];
-  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
   bf1[15] = bf0[15];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
   bf1[8] = bf0[8] + bf0[9];
   bf1[9] = -bf0[9] + bf0[8];
   bf1[10] = -bf0[10] + bf0[11];
@@ -286,7 +285,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output,
 
   // stage 6
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
@@ -297,14 +296,14 @@ void av1_fdct16_new(const int32_t *input, int32_t *output,
   bf1[5] = bf0[5];
   bf1[6] = bf0[6];
   bf1[7] = bf0[7];
-  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
-  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
@@ -330,8 +329,8 @@ void av1_fdct16_new(const int32_t *input, int32_t *output,
   range_check(stage, input, bf1, size, stage_range[stage]);
 }
 
-void av1_fdct32_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
   const int32_t size = 32;
   const int32_t *cospi;
 
@@ -381,7 +380,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0] + bf0[15];
@@ -404,14 +403,14 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
   bf1[17] = bf0[17];
   bf1[18] = bf0[18];
   bf1[19] = bf0[19];
-  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
-  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
-  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
   bf1[28] = bf0[28];
   bf1[29] = bf0[29];
   bf1[30] = bf0[30];
@@ -420,7 +419,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
 
   // stage 3
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0] + bf0[7];
@@ -433,10 +432,10 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
   bf1[7] = -bf0[7] + bf0[0];
   bf1[8] = bf0[8];
   bf1[9] = bf0[9];
-  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
   bf1[16] = bf0[16] + bf0[23];
@@ -459,7 +458,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0] + bf0[3];
@@ -467,8 +466,8 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
   bf1[2] = -bf0[2] + bf0[1];
   bf1[3] = -bf0[3] + bf0[0];
   bf1[4] = bf0[4];
-  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
   bf1[7] = bf0[7];
   bf1[8] = bf0[8] + bf0[11];
   bf1[9] = bf0[9] + bf0[10];
@@ -480,42 +479,42 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
   bf1[15] = bf0[15] + bf0[12];
   bf1[16] = bf0[16];
   bf1[17] = bf0[17];
-  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
-  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
-  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
   bf1[22] = bf0[22];
   bf1[23] = bf0[23];
   bf1[24] = bf0[24];
   bf1[25] = bf0[25];
-  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
-  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = step;
   bf1 = output;
-  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
   bf1[4] = bf0[4] + bf0[5];
   bf1[5] = -bf0[5] + bf0[4];
   bf1[6] = -bf0[6] + bf0[7];
   bf1[7] = bf0[7] + bf0[6];
   bf1[8] = bf0[8];
-  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
   bf1[11] = bf0[11];
   bf1[12] = bf0[12];
-  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
   bf1[15] = bf0[15];
   bf1[16] = bf0[16] + bf0[19];
   bf1[17] = bf0[17] + bf0[18];
@@ -537,17 +536,17 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
 
   // stage 6
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
   bf1[8] = bf0[8] + bf0[9];
   bf1[9] = -bf0[9] + bf0[8];
   bf1[10] = -bf0[10] + bf0[11];
@@ -557,26 +556,26 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
   bf1[14] = -bf0[14] + bf0[15];
   bf1[15] = bf0[15] + bf0[14];
   bf1[16] = bf0[16];
-  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
-  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
   bf1[19] = bf0[19];
   bf1[20] = bf0[20];
-  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
   bf1[23] = bf0[23];
   bf1[24] = bf0[24];
-  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
   bf1[27] = bf0[27];
   bf1[28] = bf0[28];
-  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
-  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
   bf1[31] = bf0[31];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0];
@@ -587,14 +586,14 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
   bf1[5] = bf0[5];
   bf1[6] = bf0[6];
   bf1[7] = bf0[7];
-  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
-  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
   bf1[16] = bf0[16] + bf0[17];
   bf1[17] = -bf0[17] + bf0[16];
   bf1[18] = -bf0[18] + bf0[19];
@@ -615,7 +614,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
 
   // stage 8
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
@@ -634,22 +633,22 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
   bf1[13] = bf0[13];
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
-  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
-  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
-  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
-  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
-  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
-  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
-  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
-  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
-  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
+  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
@@ -691,72 +690,67 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
   range_check(stage, input, bf1, size, stage_range[stage]);
 }
 
-void av1_fadst4_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range) {
-  const int32_t size = 4;
-  const int32_t *cospi;
-
-  int32_t stage = 0;
-  int32_t *bf0, *bf1;
-  int32_t step[4];
-
-  // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
+void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
+  int bit = cos_bit;
+  const int32_t *sinpi = sinpi_arr(bit);
+  int32_t x0, x1, x2, x3;
+  int32_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // stage 0
+  range_check(0, input, input, 4, stage_range[0]);
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
 
-  // stage 1;
-  stage++;
-  bf1 = output;
-  bf1[0] = input[3];
-  bf1[1] = input[0];
-  bf1[2] = input[1];
-  bf1[3] = input[2];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  // stage 1
+  s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]);
+  s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]);
+  s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]);
+  s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]);
+  s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]);
+  s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]);
+  s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]);
+  s7 = range_check_value(x0 + x1, stage_range[1]);
 
   // stage 2
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(-cospi[8], bf0[1], cospi[56], bf0[0], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(-cospi[40], bf0[3], cospi[24], bf0[2], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  s7 = range_check_value(s7 - x3, stage_range[2]);
 
   // stage 3
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0] + bf0[2];
-  bf1[1] = bf0[1] + bf0[3];
-  bf1[2] = -bf0[2] + bf0[0];
-  bf1[3] = -bf0[3] + bf0[1];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  x0 = range_check_value(s0 + s2, bit + stage_range[3]);
+  x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]);
+  x2 = range_check_value(s1 - s3, bit + stage_range[3]);
+  x3 = range_check_value(s4, bit + stage_range[3]);
 
   // stage 4
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  x0 = range_check_value(x0 + s5, bit + stage_range[4]);
+  x2 = range_check_value(x2 + s6, bit + stage_range[4]);
 
   // stage 5
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0];
-  bf1[1] = -bf0[2];
-  bf1[2] = bf0[3];
-  bf1[3] = -bf0[1];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  s0 = range_check_value(x0 + x3, bit + stage_range[5]);
+  s1 = range_check_value(x1, bit + stage_range[5]);
+  s2 = range_check_value(x2 - x3, bit + stage_range[5]);
+  s3 = range_check_value(x2 - x0, bit + stage_range[5]);
+
+  // stage 6
+  s3 = range_check_value(s3 + x3, bit + stage_range[6]);
+
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = round_shift(s0, bit);
+  output[1] = round_shift(s1, bit);
+  output[2] = round_shift(s2, bit);
+  output[3] = round_shift(s3, bit);
+  range_check(6, input, output, 4, stage_range[6]);
 }
 
-void av1_fadst8_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
   const int32_t size = 8;
   const int32_t *cospi;
 
@@ -769,107 +763,108 @@ void av1_fadst8_new(const int32_t *input, int32_t *output,
 
   // stage 1;
   stage++;
+  assert(output != input);
   bf1 = output;
-  bf1[0] = input[7];
-  bf1[1] = input[0];
-  bf1[2] = input[5];
-  bf1[3] = input[2];
-  bf1[4] = input[3];
-  bf1[5] = input[4];
-  bf1[6] = input[1];
-  bf1[7] = input[6];
+  bf1[0] = input[0];
+  bf1[1] = -input[7];
+  bf1[2] = -input[3];
+  bf1[3] = input[4];
+  bf1[4] = -input[1];
+  bf1[5] = input[6];
+  bf1[6] = input[2];
+  bf1[7] = -input[5];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
-  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(-cospi[4], bf0[1], cospi[60], bf0[0], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(-cospi[20], bf0[3], cospi[44], bf0[2], cos_bit[stage]);
-  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(-cospi[36], bf0[5], cospi[28], bf0[4], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(-cospi[52], bf0[7], cospi[12], bf0[6], cos_bit[stage]);
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 3
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[4];
-  bf1[1] = bf0[1] + bf0[5];
-  bf1[2] = bf0[2] + bf0[6];
-  bf1[3] = bf0[3] + bf0[7];
-  bf1[4] = -bf0[4] + bf0[0];
-  bf1[5] = -bf0[5] + bf0[1];
-  bf1[6] = -bf0[6] + bf0[2];
-  bf1[7] = -bf0[7] + bf0[3];
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = bf0[0] - bf0[2];
+  bf1[3] = bf0[1] - bf0[3];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = bf0[4] - bf0[6];
+  bf1[7] = bf0[5] - bf0[7];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
-  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[2];
-  bf1[1] = bf0[1] + bf0[3];
-  bf1[2] = -bf0[2] + bf0[0];
-  bf1[3] = -bf0[3] + bf0[1];
-  bf1[4] = bf0[4] + bf0[6];
-  bf1[5] = bf0[5] + bf0[7];
-  bf1[6] = -bf0[6] + bf0[4];
-  bf1[7] = -bf0[7] + bf0[5];
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = bf0[0] - bf0[4];
+  bf1[5] = bf0[1] - bf0[5];
+  bf1[6] = bf0[2] - bf0[6];
+  bf1[7] = bf0[3] - bf0[7];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 6
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
-  bf1[4] = bf0[4];
-  bf1[5] = bf0[5];
-  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
+  bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
+  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
+  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
+  bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
+  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0];
-  bf1[1] = -bf0[4];
-  bf1[2] = bf0[6];
-  bf1[3] = -bf0[2];
-  bf1[4] = bf0[3];
-  bf1[5] = -bf0[7];
-  bf1[6] = bf0[5];
-  bf1[7] = -bf0[1];
+  bf1[0] = bf0[1];
+  bf1[1] = bf0[6];
+  bf1[2] = bf0[3];
+  bf1[3] = bf0[4];
+  bf1[4] = bf0[5];
+  bf1[5] = bf0[2];
+  bf1[6] = bf0[7];
+  bf1[7] = bf0[0];
   range_check(stage, input, bf1, size, stage_range[stage]);
 }
 
-void av1_fadst16_new(const int32_t *input, int32_t *output,
-                     const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                     const int8_t *stage_range) {
   const int32_t size = 16;
   const int32_t *cospi;
 
@@ -882,686 +877,241 @@ void av1_fadst16_new(const int32_t *input, int32_t *output,
 
   // stage 1;
   stage++;
+  assert(output != input);
   bf1 = output;
-  bf1[0] = input[15];
-  bf1[1] = input[0];
-  bf1[2] = input[13];
-  bf1[3] = input[2];
-  bf1[4] = input[11];
-  bf1[5] = input[4];
-  bf1[6] = input[9];
-  bf1[7] = input[6];
-  bf1[8] = input[7];
-  bf1[9] = input[8];
-  bf1[10] = input[5];
-  bf1[11] = input[10];
-  bf1[12] = input[3];
-  bf1[13] = input[12];
-  bf1[14] = input[1];
-  bf1[15] = input[14];
+  bf1[0] = input[0];
+  bf1[1] = -input[15];
+  bf1[2] = -input[7];
+  bf1[3] = input[8];
+  bf1[4] = -input[3];
+  bf1[5] = input[12];
+  bf1[6] = input[4];
+  bf1[7] = -input[11];
+  bf1[8] = -input[1];
+  bf1[9] = input[14];
+  bf1[10] = input[6];
+  bf1[11] = -input[9];
+  bf1[12] = input[2];
+  bf1[13] = -input[13];
+  bf1[14] = -input[5];
+  bf1[15] = input[10];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(-cospi[2], bf0[1], cospi[62], bf0[0], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(-cospi[10], bf0[3], cospi[54], bf0[2], cos_bit[stage]);
-  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(-cospi[18], bf0[5], cospi[46], bf0[4], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(-cospi[26], bf0[7], cospi[38], bf0[6], cos_bit[stage]);
-  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
-  bf1[9] = half_btf(-cospi[34], bf0[9], cospi[30], bf0[8], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[42], bf0[11], cospi[22], bf0[10], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(-cospi[50], bf0[13], cospi[14], bf0[12], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(-cospi[58], bf0[15], cospi[6], bf0[14], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 3
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0] + bf0[8];
-  bf1[1] = bf0[1] + bf0[9];
-  bf1[2] = bf0[2] + bf0[10];
-  bf1[3] = bf0[3] + bf0[11];
-  bf1[4] = bf0[4] + bf0[12];
-  bf1[5] = bf0[5] + bf0[13];
-  bf1[6] = bf0[6] + bf0[14];
-  bf1[7] = bf0[7] + bf0[15];
-  bf1[8] = -bf0[8] + bf0[0];
-  bf1[9] = -bf0[9] + bf0[1];
-  bf1[10] = -bf0[10] + bf0[2];
-  bf1[11] = -bf0[11] + bf0[3];
-  bf1[12] = -bf0[12] + bf0[4];
-  bf1[13] = -bf0[13] + bf0[5];
-  bf1[14] = -bf0[14] + bf0[6];
-  bf1[15] = -bf0[15] + bf0[7];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 4
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
-  bf1[2] = bf0[2];
-  bf1[3] = bf0[3];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
   bf1[4] = bf0[4];
   bf1[5] = bf0[5];
-  bf1[6] = bf0[6];
-  bf1[7] = bf0[7];
-  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
-  bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
-  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
-  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 5
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0] + bf0[4];
-  bf1[1] = bf0[1] + bf0[5];
-  bf1[2] = bf0[2] + bf0[6];
-  bf1[3] = bf0[3] + bf0[7];
-  bf1[4] = -bf0[4] + bf0[0];
-  bf1[5] = -bf0[5] + bf0[1];
-  bf1[6] = -bf0[6] + bf0[2];
-  bf1[7] = -bf0[7] + bf0[3];
-  bf1[8] = bf0[8] + bf0[12];
-  bf1[9] = bf0[9] + bf0[13];
-  bf1[10] = bf0[10] + bf0[14];
-  bf1[11] = bf0[11] + bf0[15];
-  bf1[12] = -bf0[12] + bf0[8];
-  bf1[13] = -bf0[13] + bf0[9];
-  bf1[14] = -bf0[14] + bf0[10];
-  bf1[15] = -bf0[15] + bf0[11];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 6
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = bf0[2];
-  bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
-  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
   bf1[8] = bf0[8];
   bf1[9] = bf0[9];
-  bf1[10] = bf0[10];
-  bf1[11] = bf0[11];
-  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
-  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
+  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
+  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
-  // stage 7
+  // stage 3
   stage++;
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0] + bf0[2];
   bf1[1] = bf0[1] + bf0[3];
-  bf1[2] = -bf0[2] + bf0[0];
-  bf1[3] = -bf0[3] + bf0[1];
+  bf1[2] = bf0[0] - bf0[2];
+  bf1[3] = bf0[1] - bf0[3];
   bf1[4] = bf0[4] + bf0[6];
   bf1[5] = bf0[5] + bf0[7];
-  bf1[6] = -bf0[6] + bf0[4];
-  bf1[7] = -bf0[7] + bf0[5];
+  bf1[6] = bf0[4] - bf0[6];
+  bf1[7] = bf0[5] - bf0[7];
   bf1[8] = bf0[8] + bf0[10];
   bf1[9] = bf0[9] + bf0[11];
-  bf1[10] = -bf0[10] + bf0[8];
-  bf1[11] = -bf0[11] + bf0[9];
+  bf1[10] = bf0[8] - bf0[10];
+  bf1[11] = bf0[9] - bf0[11];
   bf1[12] = bf0[12] + bf0[14];
   bf1[13] = bf0[13] + bf0[15];
-  bf1[14] = -bf0[14] + bf0[12];
-  bf1[15] = -bf0[15] + bf0[13];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 8
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
-  bf1[4] = bf0[4];
-  bf1[5] = bf0[5];
-  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
-  bf1[8] = bf0[8];
-  bf1[9] = bf0[9];
-  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
-  bf1[12] = bf0[12];
-  bf1[13] = bf0[13];
-  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 9
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0];
-  bf1[1] = -bf0[8];
-  bf1[2] = bf0[12];
-  bf1[3] = -bf0[4];
-  bf1[4] = bf0[6];
-  bf1[5] = -bf0[14];
-  bf1[6] = bf0[10];
-  bf1[7] = -bf0[2];
-  bf1[8] = bf0[3];
-  bf1[9] = -bf0[11];
-  bf1[10] = bf0[15];
-  bf1[11] = -bf0[7];
-  bf1[12] = bf0[5];
-  bf1[13] = -bf0[13];
-  bf1[14] = bf0[9];
-  bf1[15] = -bf0[1];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-}
-
-void av1_fadst32_new(const int32_t *input, int32_t *output,
-                     const int8_t *cos_bit, const int8_t *stage_range) {
-  const int32_t size = 32;
-  const int32_t *cospi;
-
-  int32_t stage = 0;
-  int32_t *bf0, *bf1;
-  int32_t step[32];
-
-  // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
-
-  // stage 1;
-  stage++;
-  bf1 = output;
-  bf1[0] = input[31];
-  bf1[1] = input[0];
-  bf1[2] = input[29];
-  bf1[3] = input[2];
-  bf1[4] = input[27];
-  bf1[5] = input[4];
-  bf1[6] = input[25];
-  bf1[7] = input[6];
-  bf1[8] = input[23];
-  bf1[9] = input[8];
-  bf1[10] = input[21];
-  bf1[11] = input[10];
-  bf1[12] = input[19];
-  bf1[13] = input[12];
-  bf1[14] = input[17];
-  bf1[15] = input[14];
-  bf1[16] = input[15];
-  bf1[17] = input[16];
-  bf1[18] = input[13];
-  bf1[19] = input[18];
-  bf1[20] = input[11];
-  bf1[21] = input[20];
-  bf1[22] = input[9];
-  bf1[23] = input[22];
-  bf1[24] = input[7];
-  bf1[25] = input[24];
-  bf1[26] = input[5];
-  bf1[27] = input[26];
-  bf1[28] = input[3];
-  bf1[29] = input[28];
-  bf1[30] = input[1];
-  bf1[31] = input[30];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 2
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(-cospi[1], bf0[1], cospi[63], bf0[0], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(-cospi[5], bf0[3], cospi[59], bf0[2], cos_bit[stage]);
-  bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(-cospi[9], bf0[5], cospi[55], bf0[4], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(-cospi[13], bf0[7], cospi[51], bf0[6], cos_bit[stage]);
-  bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
-  bf1[9] = half_btf(-cospi[17], bf0[9], cospi[47], bf0[8], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[21], bf0[11], cospi[43], bf0[10], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(-cospi[25], bf0[13], cospi[39], bf0[12], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(-cospi[29], bf0[15], cospi[35], bf0[14], cos_bit[stage]);
-  bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
-  bf1[17] = half_btf(-cospi[33], bf0[17], cospi[31], bf0[16], cos_bit[stage]);
-  bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
-  bf1[19] = half_btf(-cospi[37], bf0[19], cospi[27], bf0[18], cos_bit[stage]);
-  bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
-  bf1[21] = half_btf(-cospi[41], bf0[21], cospi[23], bf0[20], cos_bit[stage]);
-  bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
-  bf1[23] = half_btf(-cospi[45], bf0[23], cospi[19], bf0[22], cos_bit[stage]);
-  bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
-  bf1[25] = half_btf(-cospi[49], bf0[25], cospi[15], bf0[24], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
-  bf1[27] = half_btf(-cospi[53], bf0[27], cospi[11], bf0[26], cos_bit[stage]);
-  bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
-  bf1[29] = half_btf(-cospi[57], bf0[29], cospi[7], bf0[28], cos_bit[stage]);
-  bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
-  bf1[31] = half_btf(-cospi[61], bf0[31], cospi[3], bf0[30], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 3
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0] + bf0[16];
-  bf1[1] = bf0[1] + bf0[17];
-  bf1[2] = bf0[2] + bf0[18];
-  bf1[3] = bf0[3] + bf0[19];
-  bf1[4] = bf0[4] + bf0[20];
-  bf1[5] = bf0[5] + bf0[21];
-  bf1[6] = bf0[6] + bf0[22];
-  bf1[7] = bf0[7] + bf0[23];
-  bf1[8] = bf0[8] + bf0[24];
-  bf1[9] = bf0[9] + bf0[25];
-  bf1[10] = bf0[10] + bf0[26];
-  bf1[11] = bf0[11] + bf0[27];
-  bf1[12] = bf0[12] + bf0[28];
-  bf1[13] = bf0[13] + bf0[29];
-  bf1[14] = bf0[14] + bf0[30];
-  bf1[15] = bf0[15] + bf0[31];
-  bf1[16] = -bf0[16] + bf0[0];
-  bf1[17] = -bf0[17] + bf0[1];
-  bf1[18] = -bf0[18] + bf0[2];
-  bf1[19] = -bf0[19] + bf0[3];
-  bf1[20] = -bf0[20] + bf0[4];
-  bf1[21] = -bf0[21] + bf0[5];
-  bf1[22] = -bf0[22] + bf0[6];
-  bf1[23] = -bf0[23] + bf0[7];
-  bf1[24] = -bf0[24] + bf0[8];
-  bf1[25] = -bf0[25] + bf0[9];
-  bf1[26] = -bf0[26] + bf0[10];
-  bf1[27] = -bf0[27] + bf0[11];
-  bf1[28] = -bf0[28] + bf0[12];
-  bf1[29] = -bf0[29] + bf0[13];
-  bf1[30] = -bf0[30] + bf0[14];
-  bf1[31] = -bf0[31] + bf0[15];
+  bf1[14] = bf0[12] - bf0[14];
+  bf1[15] = bf0[13] - bf0[15];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = bf0[4];
-  bf1[5] = bf0[5];
-  bf1[6] = bf0[6];
-  bf1[7] = bf0[7];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
   bf1[8] = bf0[8];
   bf1[9] = bf0[9];
   bf1[10] = bf0[10];
   bf1[11] = bf0[11];
-  bf1[12] = bf0[12];
-  bf1[13] = bf0[13];
-  bf1[14] = bf0[14];
-  bf1[15] = bf0[15];
-  bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
-  bf1[17] = half_btf(-cospi[4], bf0[17], cospi[60], bf0[16], cos_bit[stage]);
-  bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
-  bf1[19] = half_btf(-cospi[20], bf0[19], cospi[44], bf0[18], cos_bit[stage]);
-  bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
-  bf1[21] = half_btf(-cospi[36], bf0[21], cospi[28], bf0[20], cos_bit[stage]);
-  bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
-  bf1[23] = half_btf(-cospi[52], bf0[23], cospi[12], bf0[22], cos_bit[stage]);
-  bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[60], bf0[25], cospi[4], bf0[24], cos_bit[stage]);
-  bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[44], bf0[27], cospi[20], bf0[26], cos_bit[stage]);
-  bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[28], bf0[29], cospi[36], bf0[28], cos_bit[stage]);
-  bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[12], bf0[31], cospi[52], bf0[30], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
+  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
+  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
+  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 5
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[8];
-  bf1[1] = bf0[1] + bf0[9];
-  bf1[2] = bf0[2] + bf0[10];
-  bf1[3] = bf0[3] + bf0[11];
-  bf1[4] = bf0[4] + bf0[12];
-  bf1[5] = bf0[5] + bf0[13];
-  bf1[6] = bf0[6] + bf0[14];
-  bf1[7] = bf0[7] + bf0[15];
-  bf1[8] = -bf0[8] + bf0[0];
-  bf1[9] = -bf0[9] + bf0[1];
-  bf1[10] = -bf0[10] + bf0[2];
-  bf1[11] = -bf0[11] + bf0[3];
-  bf1[12] = -bf0[12] + bf0[4];
-  bf1[13] = -bf0[13] + bf0[5];
-  bf1[14] = -bf0[14] + bf0[6];
-  bf1[15] = -bf0[15] + bf0[7];
-  bf1[16] = bf0[16] + bf0[24];
-  bf1[17] = bf0[17] + bf0[25];
-  bf1[18] = bf0[18] + bf0[26];
-  bf1[19] = bf0[19] + bf0[27];
-  bf1[20] = bf0[20] + bf0[28];
-  bf1[21] = bf0[21] + bf0[29];
-  bf1[22] = bf0[22] + bf0[30];
-  bf1[23] = bf0[23] + bf0[31];
-  bf1[24] = -bf0[24] + bf0[16];
-  bf1[25] = -bf0[25] + bf0[17];
-  bf1[26] = -bf0[26] + bf0[18];
-  bf1[27] = -bf0[27] + bf0[19];
-  bf1[28] = -bf0[28] + bf0[20];
-  bf1[29] = -bf0[29] + bf0[21];
-  bf1[30] = -bf0[30] + bf0[22];
-  bf1[31] = -bf0[31] + bf0[23];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 6
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = bf0[2];
-  bf1[3] = bf0[3];
-  bf1[4] = bf0[4];
-  bf1[5] = bf0[5];
-  bf1[6] = bf0[6];
-  bf1[7] = bf0[7];
-  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
-  bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
-  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
-  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
-  bf1[16] = bf0[16];
-  bf1[17] = bf0[17];
-  bf1[18] = bf0[18];
-  bf1[19] = bf0[19];
-  bf1[20] = bf0[20];
-  bf1[21] = bf0[21];
-  bf1[22] = bf0[22];
-  bf1[23] = bf0[23];
-  bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
-  bf1[25] = half_btf(-cospi[8], bf0[25], cospi[56], bf0[24], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
-  bf1[27] = half_btf(-cospi[40], bf0[27], cospi[24], bf0[26], cos_bit[stage]);
-  bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[56], bf0[29], cospi[8], bf0[28], cos_bit[stage]);
-  bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[24], bf0[31], cospi[40], bf0[30], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-
-  // stage 7
-  stage++;
-  bf0 = step;
-  bf1 = output;
   bf1[0] = bf0[0] + bf0[4];
   bf1[1] = bf0[1] + bf0[5];
   bf1[2] = bf0[2] + bf0[6];
   bf1[3] = bf0[3] + bf0[7];
-  bf1[4] = -bf0[4] + bf0[0];
-  bf1[5] = -bf0[5] + bf0[1];
-  bf1[6] = -bf0[6] + bf0[2];
-  bf1[7] = -bf0[7] + bf0[3];
+  bf1[4] = bf0[0] - bf0[4];
+  bf1[5] = bf0[1] - bf0[5];
+  bf1[6] = bf0[2] - bf0[6];
+  bf1[7] = bf0[3] - bf0[7];
   bf1[8] = bf0[8] + bf0[12];
   bf1[9] = bf0[9] + bf0[13];
   bf1[10] = bf0[10] + bf0[14];
   bf1[11] = bf0[11] + bf0[15];
-  bf1[12] = -bf0[12] + bf0[8];
-  bf1[13] = -bf0[13] + bf0[9];
-  bf1[14] = -bf0[14] + bf0[10];
-  bf1[15] = -bf0[15] + bf0[11];
-  bf1[16] = bf0[16] + bf0[20];
-  bf1[17] = bf0[17] + bf0[21];
-  bf1[18] = bf0[18] + bf0[22];
-  bf1[19] = bf0[19] + bf0[23];
-  bf1[20] = -bf0[20] + bf0[16];
-  bf1[21] = -bf0[21] + bf0[17];
-  bf1[22] = -bf0[22] + bf0[18];
-  bf1[23] = -bf0[23] + bf0[19];
-  bf1[24] = bf0[24] + bf0[28];
-  bf1[25] = bf0[25] + bf0[29];
-  bf1[26] = bf0[26] + bf0[30];
-  bf1[27] = bf0[27] + bf0[31];
-  bf1[28] = -bf0[28] + bf0[24];
-  bf1[29] = -bf0[29] + bf0[25];
-  bf1[30] = -bf0[30] + bf0[26];
-  bf1[31] = -bf0[31] + bf0[27];
+  bf1[12] = bf0[8] - bf0[12];
+  bf1[13] = bf0[9] - bf0[13];
+  bf1[14] = bf0[10] - bf0[14];
+  bf1[15] = bf0[11] - bf0[15];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
-  // stage 8
+  // stage 6
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
-  bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
-  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
-  bf1[8] = bf0[8];
-  bf1[9] = bf0[9];
-  bf1[10] = bf0[10];
-  bf1[11] = bf0[11];
-  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
-  bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
-  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
-  bf1[16] = bf0[16];
-  bf1[17] = bf0[17];
-  bf1[18] = bf0[18];
-  bf1[19] = bf0[19];
-  bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
-  bf1[21] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[20], cos_bit[stage]);
-  bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
-  bf1[23] = half_btf(cospi[48], bf0[23], cospi[16], bf0[22], cos_bit[stage]);
-  bf1[24] = bf0[24];
-  bf1[25] = bf0[25];
-  bf1[26] = bf0[26];
-  bf1[27] = bf0[27];
-  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
-  bf1[29] = half_btf(-cospi[16], bf0[29], cospi[48], bf0[28], cos_bit[stage]);
-  bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[48], bf0[31], cospi[16], bf0[30], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
+  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
+  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
+  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
+  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
+  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
+  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
+  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
-  // stage 9
+  // stage 7
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0] + bf0[2];
-  bf1[1] = bf0[1] + bf0[3];
-  bf1[2] = -bf0[2] + bf0[0];
-  bf1[3] = -bf0[3] + bf0[1];
-  bf1[4] = bf0[4] + bf0[6];
-  bf1[5] = bf0[5] + bf0[7];
-  bf1[6] = -bf0[6] + bf0[4];
-  bf1[7] = -bf0[7] + bf0[5];
-  bf1[8] = bf0[8] + bf0[10];
-  bf1[9] = bf0[9] + bf0[11];
-  bf1[10] = -bf0[10] + bf0[8];
-  bf1[11] = -bf0[11] + bf0[9];
-  bf1[12] = bf0[12] + bf0[14];
-  bf1[13] = bf0[13] + bf0[15];
-  bf1[14] = -bf0[14] + bf0[12];
-  bf1[15] = -bf0[15] + bf0[13];
-  bf1[16] = bf0[16] + bf0[18];
-  bf1[17] = bf0[17] + bf0[19];
-  bf1[18] = -bf0[18] + bf0[16];
-  bf1[19] = -bf0[19] + bf0[17];
-  bf1[20] = bf0[20] + bf0[22];
-  bf1[21] = bf0[21] + bf0[23];
-  bf1[22] = -bf0[22] + bf0[20];
-  bf1[23] = -bf0[23] + bf0[21];
-  bf1[24] = bf0[24] + bf0[26];
-  bf1[25] = bf0[25] + bf0[27];
-  bf1[26] = -bf0[26] + bf0[24];
-  bf1[27] = -bf0[27] + bf0[25];
-  bf1[28] = bf0[28] + bf0[30];
-  bf1[29] = bf0[29] + bf0[31];
-  bf1[30] = -bf0[30] + bf0[28];
-  bf1[31] = -bf0[31] + bf0[29];
+  bf1[0] = bf0[0] + bf0[8];
+  bf1[1] = bf0[1] + bf0[9];
+  bf1[2] = bf0[2] + bf0[10];
+  bf1[3] = bf0[3] + bf0[11];
+  bf1[4] = bf0[4] + bf0[12];
+  bf1[5] = bf0[5] + bf0[13];
+  bf1[6] = bf0[6] + bf0[14];
+  bf1[7] = bf0[7] + bf0[15];
+  bf1[8] = bf0[0] - bf0[8];
+  bf1[9] = bf0[1] - bf0[9];
+  bf1[10] = bf0[2] - bf0[10];
+  bf1[11] = bf0[3] - bf0[11];
+  bf1[12] = bf0[4] - bf0[12];
+  bf1[13] = bf0[5] - bf0[13];
+  bf1[14] = bf0[6] - bf0[14];
+  bf1[15] = bf0[7] - bf0[15];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
-  // stage 10
+  // stage 8
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
-  bf1[4] = bf0[4];
-  bf1[5] = bf0[5];
-  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
-  bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
-  bf1[8] = bf0[8];
-  bf1[9] = bf0[9];
-  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
-  bf1[12] = bf0[12];
-  bf1[13] = bf0[13];
-  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
-  bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
-  bf1[16] = bf0[16];
-  bf1[17] = bf0[17];
-  bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
-  bf1[19] = half_btf(-cospi[32], bf0[19], cospi[32], bf0[18], cos_bit[stage]);
-  bf1[20] = bf0[20];
-  bf1[21] = bf0[21];
-  bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
-  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[22], cos_bit[stage]);
-  bf1[24] = bf0[24];
-  bf1[25] = bf0[25];
-  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
-  bf1[27] = half_btf(-cospi[32], bf0[27], cospi[32], bf0[26], cos_bit[stage]);
-  bf1[28] = bf0[28];
-  bf1[29] = bf0[29];
-  bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
-  bf1[31] = half_btf(-cospi[32], bf0[31], cospi[32], bf0[30], cos_bit[stage]);
+  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
+  bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
+  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
+  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
+  bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
+  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
+  bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
+  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
+  bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
+  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
+  bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
+  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
+  bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
+  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
+  bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
-  // stage 11
+  // stage 9
   stage++;
   bf0 = step;
   bf1 = output;
-  bf1[0] = bf0[0];
-  bf1[1] = -bf0[16];
-  bf1[2] = bf0[24];
-  bf1[3] = -bf0[8];
-  bf1[4] = bf0[12];
-  bf1[5] = -bf0[28];
-  bf1[6] = bf0[20];
-  bf1[7] = -bf0[4];
-  bf1[8] = bf0[6];
-  bf1[9] = -bf0[22];
-  bf1[10] = bf0[30];
-  bf1[11] = -bf0[14];
-  bf1[12] = bf0[10];
-  bf1[13] = -bf0[26];
-  bf1[14] = bf0[18];
-  bf1[15] = -bf0[2];
-  bf1[16] = bf0[3];
-  bf1[17] = -bf0[19];
-  bf1[18] = bf0[27];
-  bf1[19] = -bf0[11];
-  bf1[20] = bf0[15];
-  bf1[21] = -bf0[31];
-  bf1[22] = bf0[23];
-  bf1[23] = -bf0[7];
-  bf1[24] = bf0[5];
-  bf1[25] = -bf0[21];
-  bf1[26] = bf0[29];
-  bf1[27] = -bf0[13];
-  bf1[28] = bf0[9];
-  bf1[29] = -bf0[25];
-  bf1[30] = bf0[17];
-  bf1[31] = -bf0[1];
+  bf1[0] = bf0[1];
+  bf1[1] = bf0[14];
+  bf1[2] = bf0[3];
+  bf1[3] = bf0[12];
+  bf1[4] = bf0[5];
+  bf1[5] = bf0[10];
+  bf1[6] = bf0[7];
+  bf1[7] = bf0[8];
+  bf1[8] = bf0[9];
+  bf1[9] = bf0[6];
+  bf1[10] = bf0[11];
+  bf1[11] = bf0[4];
+  bf1[12] = bf0[13];
+  bf1[13] = bf0[2];
+  bf1[14] = bf0[15];
+  bf1[15] = bf0[0];
   range_check(stage, input, bf1, size, stage_range[stage]);
 }
 
-#if CONFIG_EXT_TX
-void av1_fidentity4_c(const int32_t *input, int32_t *output,
-                      const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
   (void)cos_bit;
   for (int i = 0; i < 4; ++i)
-    output[i] = (int32_t)dct_const_round_shift(input[i] * Sqrt2);
+    output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
   range_check(0, input, output, 4, stage_range[0]);
 }
 
-void av1_fidentity8_c(const int32_t *input, int32_t *output,
-                      const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
   (void)cos_bit;
   for (int i = 0; i < 8; ++i) output[i] = input[i] * 2;
   range_check(0, input, output, 8, stage_range[0]);
 }
 
-void av1_fidentity16_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
   (void)cos_bit;
   for (int i = 0; i < 16; ++i)
-    output[i] = (int32_t)dct_const_round_shift(input[i] * 2 * Sqrt2);
+    output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
   range_check(0, input, output, 16, stage_range[0]);
 }
 
-void av1_fidentity32_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
   (void)cos_bit;
   for (int i = 0; i < 32; ++i) output[i] = input[i] * 4;
   range_check(0, input, output, 32, stage_range[0]);
 }
 
-#if CONFIG_TX64X64
-void av1_fidentity64_c(const int32_t *input, int32_t *output,
-                       const int8_t *cos_bit, const int8_t *stage_range) {
-  (void)cos_bit;
-  for (int i = 0; i < 64; ++i)
-    output[i] = (int32_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
-  range_check(0, input, output, 64, stage_range[0]);
-}
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
-
-#if CONFIG_TX64X64
-void av1_fdct64_new(const int32_t *input, int32_t *output,
-                    const int8_t *cos_bit, const int8_t *stage_range) {
+void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
   const int32_t size = 64;
   const int32_t *cospi;
 
@@ -1574,7 +1124,6 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
 
   // stage 1;
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf1 = output;
   bf1[0] = input[0] + input[63];
   bf1[1] = input[1] + input[62];
@@ -1644,7 +1193,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
 
   // stage 2
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0] + bf0[31];
@@ -1687,22 +1236,22 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[37] = bf0[37];
   bf1[38] = bf0[38];
   bf1[39] = bf0[39];
-  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
-  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
-  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
-  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
-  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
-  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
-  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
-  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
-  bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit[stage]);
-  bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit[stage]);
-  bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit[stage]);
-  bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit[stage]);
-  bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit[stage]);
-  bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit[stage]);
-  bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit[stage]);
-  bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit[stage]);
+  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+  bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
+  bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
+  bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
+  bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
+  bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
+  bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
+  bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
+  bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
   bf1[56] = bf0[56];
   bf1[57] = bf0[57];
   bf1[58] = bf0[58];
@@ -1715,7 +1264,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
 
   // stage 3
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0] + bf0[15];
@@ -1738,14 +1287,14 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[17] = bf0[17];
   bf1[18] = bf0[18];
   bf1[19] = bf0[19];
-  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
-  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
-  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
   bf1[28] = bf0[28];
   bf1[29] = bf0[29];
   bf1[30] = bf0[30];
@@ -1786,7 +1335,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
 
   // stage 4
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0] + bf0[7];
@@ -1799,10 +1348,10 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[7] = -bf0[7] + bf0[0];
   bf1[8] = bf0[8];
   bf1[9] = bf0[9];
-  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
   bf1[16] = bf0[16] + bf0[23];
@@ -1825,14 +1374,14 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[33] = bf0[33];
   bf1[34] = bf0[34];
   bf1[35] = bf0[35];
-  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
-  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
-  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
-  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
-  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
-  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
-  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
-  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
+  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
+  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
+  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
+  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
+  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
+  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
+  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
   bf1[44] = bf0[44];
   bf1[45] = bf0[45];
   bf1[46] = bf0[46];
@@ -1841,14 +1390,14 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[49] = bf0[49];
   bf1[50] = bf0[50];
   bf1[51] = bf0[51];
-  bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit[stage]);
-  bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit[stage]);
-  bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit[stage]);
-  bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit[stage]);
-  bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit[stage]);
-  bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit[stage]);
-  bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit[stage]);
-  bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
+  bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
+  bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
+  bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
+  bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
+  bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
+  bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
+  bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
   bf1[60] = bf0[60];
   bf1[61] = bf0[61];
   bf1[62] = bf0[62];
@@ -1857,7 +1406,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
 
   // stage 5
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0] + bf0[3];
@@ -1865,8 +1414,8 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[2] = -bf0[2] + bf0[1];
   bf1[3] = -bf0[3] + bf0[0];
   bf1[4] = bf0[4];
-  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
   bf1[7] = bf0[7];
   bf1[8] = bf0[8] + bf0[11];
   bf1[9] = bf0[9] + bf0[10];
@@ -1878,18 +1427,18 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[15] = bf0[15] + bf0[12];
   bf1[16] = bf0[16];
   bf1[17] = bf0[17];
-  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
-  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
-  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
   bf1[22] = bf0[22];
   bf1[23] = bf0[23];
   bf1[24] = bf0[24];
   bf1[25] = bf0[25];
-  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
-  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
   bf1[32] = bf0[32] + bf0[39];
@@ -1928,24 +1477,24 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
 
   // stage 6
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
-  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
   bf1[4] = bf0[4] + bf0[5];
   bf1[5] = -bf0[5] + bf0[4];
   bf1[6] = -bf0[6] + bf0[7];
   bf1[7] = bf0[7] + bf0[6];
   bf1[8] = bf0[8];
-  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
   bf1[11] = bf0[11];
   bf1[12] = bf0[12];
-  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
   bf1[15] = bf0[15];
   bf1[16] = bf0[16] + bf0[19];
   bf1[17] = bf0[17] + bf0[18];
@@ -1965,51 +1514,51 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[31] = bf0[31] + bf0[28];
   bf1[32] = bf0[32];
   bf1[33] = bf0[33];
-  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
-  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
-  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
-  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
+  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
+  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
+  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
   bf1[38] = bf0[38];
   bf1[39] = bf0[39];
   bf1[40] = bf0[40];
   bf1[41] = bf0[41];
-  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
-  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
-  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
-  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
+  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
+  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
+  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
   bf1[46] = bf0[46];
   bf1[47] = bf0[47];
   bf1[48] = bf0[48];
   bf1[49] = bf0[49];
-  bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit[stage]);
-  bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit[stage]);
-  bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit[stage]);
-  bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
+  bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
+  bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
+  bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
   bf1[54] = bf0[54];
   bf1[55] = bf0[55];
   bf1[56] = bf0[56];
   bf1[57] = bf0[57];
-  bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit[stage]);
-  bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit[stage]);
-  bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit[stage]);
-  bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
+  bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
+  bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
+  bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
   bf1[62] = bf0[62];
   bf1[63] = bf0[63];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 7
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0];
   bf1[1] = bf0[1];
   bf1[2] = bf0[2];
   bf1[3] = bf0[3];
-  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
-  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
-  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
-  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
   bf1[8] = bf0[8] + bf0[9];
   bf1[9] = -bf0[9] + bf0[8];
   bf1[10] = -bf0[10] + bf0[11];
@@ -2019,20 +1568,20 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[14] = -bf0[14] + bf0[15];
   bf1[15] = bf0[15] + bf0[14];
   bf1[16] = bf0[16];
-  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
-  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
   bf1[19] = bf0[19];
   bf1[20] = bf0[20];
-  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
   bf1[23] = bf0[23];
   bf1[24] = bf0[24];
-  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
   bf1[27] = bf0[27];
   bf1[28] = bf0[28];
-  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
-  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
   bf1[31] = bf0[31];
   bf1[32] = bf0[32] + bf0[35];
   bf1[33] = bf0[33] + bf0[34];
@@ -2070,7 +1619,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
 
   // stage 8
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
@@ -2081,14 +1630,14 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[5] = bf0[5];
   bf1[6] = bf0[6];
   bf1[7] = bf0[7];
-  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
-  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
-  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
-  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
-  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
-  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
-  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
-  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
   bf1[16] = bf0[16] + bf0[17];
   bf1[17] = -bf0[17] + bf0[16];
   bf1[18] = -bf0[18] + bf0[19];
@@ -2106,42 +1655,42 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[30] = -bf0[30] + bf0[31];
   bf1[31] = bf0[31] + bf0[30];
   bf1[32] = bf0[32];
-  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
-  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
+  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
   bf1[35] = bf0[35];
   bf1[36] = bf0[36];
-  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
-  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
+  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
   bf1[39] = bf0[39];
   bf1[40] = bf0[40];
-  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
-  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
+  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
   bf1[43] = bf0[43];
   bf1[44] = bf0[44];
-  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
-  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
+  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
   bf1[47] = bf0[47];
   bf1[48] = bf0[48];
-  bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit[stage]);
-  bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
+  bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
   bf1[51] = bf0[51];
   bf1[52] = bf0[52];
-  bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit[stage]);
-  bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
+  bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
   bf1[55] = bf0[55];
   bf1[56] = bf0[56];
-  bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit[stage]);
-  bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
+  bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
   bf1[59] = bf0[59];
   bf1[60] = bf0[60];
-  bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit[stage]);
-  bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
+  bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
   bf1[63] = bf0[63];
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 9
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0];
@@ -2160,22 +1709,22 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[13] = bf0[13];
   bf1[14] = bf0[14];
   bf1[15] = bf0[15];
-  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
-  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
-  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
-  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
-  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
-  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
-  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
-  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
-  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
-  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
-  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
-  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
-  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
-  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
-  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
-  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
+  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
   bf1[32] = bf0[32] + bf0[33];
   bf1[33] = -bf0[33] + bf0[32];
   bf1[34] = -bf0[34] + bf0[35];
@@ -2212,7 +1761,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
 
   // stage 10
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
+  cospi = cospi_arr(cos_bit);
   bf0 = output;
   bf1 = step;
   bf1[0] = bf0[0];
@@ -2247,43 +1796,42 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[29] = bf0[29];
   bf1[30] = bf0[30];
   bf1[31] = bf0[31];
-  bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit[stage]);
-  bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit[stage]);
-  bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit[stage]);
-  bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit[stage]);
-  bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit[stage]);
-  bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit[stage]);
-  bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit[stage]);
-  bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit[stage]);
-  bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit[stage]);
-  bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit[stage]);
-  bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit[stage]);
-  bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit[stage]);
-  bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit[stage]);
-  bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit[stage]);
-  bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit[stage]);
-  bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit[stage]);
-  bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit[stage]);
-  bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit[stage]);
-  bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit[stage]);
-  bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit[stage]);
-  bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit[stage]);
-  bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit[stage]);
-  bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit[stage]);
-  bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit[stage]);
-  bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit[stage]);
-  bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit[stage]);
-  bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit[stage]);
-  bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit[stage]);
-  bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit[stage]);
-  bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit[stage]);
-  bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit[stage]);
-  bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit[stage]);
+  bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
+  bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
+  bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
+  bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
+  bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
+  bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
+  bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
+  bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
+  bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
+  bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
+  bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
+  bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
+  bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
+  bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
+  bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
+  bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
+  bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
+  bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
+  bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
+  bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
+  bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
+  bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
+  bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
+  bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
+  bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
+  bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
+  bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
+  bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
+  bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
+  bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
+  bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
+  bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
   range_check(stage, input, bf1, size, stage_range[stage]);
 
   // stage 11
   stage++;
-  cospi = cospi_arr(cos_bit[stage]);
   bf0 = step;
   bf1 = output;
   bf1[0] = bf0[0];
@@ -2352,4 +1900,3 @@ void av1_fdct64_new(const int32_t *input, int32_t *output,
   bf1[63] = bf0[63];
   range_check(stage, input, bf1, size, stage_range[stage]);
 }
-#endif  // CONFIG_TX64X64
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
new file mode 100644
index 000000000..9472af8e6
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_FWD_TXFM1D_H_
+#define AV1_FWD_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                   const int8_t *stage_range);
+void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                   const int8_t *stage_range);
+void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range);
+void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range);
+void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range);
+void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range);
+void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range);
+void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                     const int8_t *stage_range);
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range);
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range);
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range);
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range);
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AV1_FWD_TXFM1D_H_
diff --git a/third_party/aom/av1/common/ncobmc_kernels.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
index 358b7b7c8..174689a14 100644
--- a/third_party/aom/av1/common/ncobmc_kernels.h
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
@@ -9,14 +9,11 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include <stdio.h>
+#ifndef AV1_FWD_TXFM2D_CFG_H_
+#define AV1_FWD_TXFM2D_CFG_H_
 #include "av1/common/enums.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/common.h"
-
-#ifndef AV1_COMMON_NCOBMC_KERNELS_H_
-#define AV1_COMMON_NCOBMC_KERNELS_H_
-
-void get_default_ncobmc_kernels(AV1_COMMON *cm);
-
-#endif  // AV1_COMMON_NCOBMC_KERNELS_H_
+#include "av1/encoder/av1_fwd_txfm1d.h"
+extern const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL];
+extern const int8_t fwd_cos_bit_col[5][5];
+extern const int8_t fwd_cos_bit_row[5][5];
+#endif  // AV1_FWD_TXFM2D_CFG_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm2d.c b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
new file mode 100644
index 000000000..f25a667cf
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+
+static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT4: return av1_fdct4_new;
+    case TXFM_TYPE_DCT8: return av1_fdct8_new;
+    case TXFM_TYPE_DCT16: return av1_fdct16_new;
+    case TXFM_TYPE_DCT32: return av1_fdct32_new;
+    case TXFM_TYPE_DCT64: return av1_fdct64_new;
+    case TXFM_TYPE_ADST4: return av1_fadst4_new;
+    case TXFM_TYPE_ADST8: return av1_fadst8_new;
+    case TXFM_TYPE_ADST16: return av1_fadst16_new;
+    case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c;
+    case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c;
+    case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c;
+    case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c;
+    default: assert(0); return NULL;
+  }
+}
+
+void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+                             const TXFM_2D_FLIP_CFG *cfg, int bd) {
+  // Take the shift from the larger dimension in the rectangular case.
+  const int8_t *shift = cfg->shift;
+  // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
+  for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
+    stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1;
+  }
+
+  // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
+  for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+    stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1;
+  }
+}
+
+static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+                                const int stride, const TXFM_2D_FLIP_CFG *cfg,
+                                int32_t *buf, int bd) {
+  int c, r;
+  // Note when assigning txfm_size_col, we use the txfm_size from the
+  // row configuration and vice versa. This is intentionally done to
+  // accurately perform rectangular transforms. When the transform is
+  // rectangular, the number of columns will be the same as the
+  // txfm_size stored in the row cfg struct. It will make no difference
+  // for square transforms.
+  const int txfm_size_col = tx_size_wide[cfg->tx_size];
+  const int txfm_size_row = tx_size_high[cfg->tx_size];
+  // Take the shift from the larger dimension in the rectangular case.
+  const int8_t *shift = cfg->shift;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+  int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+  assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
+  assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
+  av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd);
+
+  const int8_t cos_bit_col = cfg->cos_bit_col;
+  const int8_t cos_bit_row = cfg->cos_bit_row;
+  const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+  // use output buffer as temp buffer
+  int32_t *temp_in = output;
+  int32_t *temp_out = output + txfm_size_row;
+
+  // Columns
+  for (c = 0; c < txfm_size_col; ++c) {
+    if (cfg->ud_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c];
+    } else {
+      for (r = 0; r < txfm_size_row; ++r)
+        // flip upside down
+        temp_in[r] = input[(txfm_size_row - r - 1) * stride + c];
+    }
+    av1_round_shift_array(temp_in, txfm_size_row, -shift[0]);
+    txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+    if (cfg->lr_flip == 0) {
+      for (r = 0; r < txfm_size_row; ++r)
+        buf[r * txfm_size_col + c] = temp_out[r];
+    } else {
+      for (r = 0; r < txfm_size_row; ++r)
+        // flip from left to right
+        buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
+    }
+  }
+
+  // Rows
+  for (r = 0; r < txfm_size_row; ++r) {
+    txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col,
+                  cos_bit_row, stage_range_row);
+    av1_round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]);
+    if (abs(rect_type) == 1) {
+      // Multiply everything by Sqrt2 if the transform is rectangular and the
+      // size difference is a factor of 2.
+      for (c = 0; c < txfm_size_col; ++c) {
+        output[r * txfm_size_col + c] = round_shift(
+            (int64_t)output[r * txfm_size_col + c] * NewSqrt2, NewSqrt2Bits);
+      }
+    }
+  }
+}
+
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride,
+                          TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride,
+                          TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[8 * 4];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride,
+                           TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride,
+                           TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[16 * 8];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride,
+                            TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
+                            TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[32 * 16];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
+                           TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
+                           TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[16 * 4];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride,
+                           TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride,
+                           TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[32 * 8];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride,
+                          TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[4 * 4];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride,
+                          TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[8 * 8];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride,
+                            TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[16 * 16];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride,
+                            TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[32 * 32];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
+                            TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[64 * 64];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+
+  // Zero out top-right 32x32 area.
+  for (int row = 0; row < 32; ++row) {
+    memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+  }
+  // Zero out the bottom 64x32 area.
+  memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));
+  // Re-pack non-zero coeffs in the first 32x32 indices.
+  for (int row = 1; row < 32; ++row) {
+    memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+  }
+}
+
+void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
+                            TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+  // Zero out the bottom 32x32 area.
+  memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
+  // Note: no repacking needed here.
+}
+
+void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
+                            TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[64 * 32];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+
+  // Zero out right 32x32 area.
+  for (int row = 0; row < 32; ++row) {
+    memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+  }
+  // Re-pack non-zero coeffs in the first 32x32 indices.
+  for (int row = 1; row < 32; ++row) {
+    memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+  }
+}
+
+void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride,
+                            TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+  // Zero out the bottom 16x32 area.
+  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+  // Note: no repacking needed here.
+}
+
+void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride,
+                            TX_TYPE tx_type, int bd) {
+  int32_t txfm_buf[64 * 16];
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+  // Zero out right 32x16 area.
+  for (int row = 0; row < 16; ++row) {
+    memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+  }
+  // Re-pack non-zero coeffs in the first 32x16 indices.
+  for (int row = 1; row < 16; ++row) {
+    memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+  }
+}
+
+static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 };
+static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 };
+static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 };
+static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 };
+
+const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
+  fwd_shift_4x4,   fwd_shift_8x8,   fwd_shift_16x16, fwd_shift_32x32,
+  fwd_shift_64x64, fwd_shift_4x8,   fwd_shift_8x4,   fwd_shift_8x16,
+  fwd_shift_16x8,  fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
+  fwd_shift_64x32, fwd_shift_4x16,  fwd_shift_16x4,  fwd_shift_8x32,
+  fwd_shift_32x8,  fwd_shift_16x64, fwd_shift_64x16,
+};
+
+const int8_t fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/]
+                            [MAX_TXWH_IDX /*txh_idx*/] = {
+                              { 13, 13, 13, 0, 0 },
+                              { 13, 13, 13, 12, 0 },
+                              { 13, 13, 13, 12, 13 },
+                              { 0, 13, 13, 12, 13 },
+                              { 0, 0, 13, 12, 13 }
+                            };
+
+const int8_t fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/]
+                            [MAX_TXWH_IDX /*txh_idx*/] = {
+                              { 13, 13, 12, 0, 0 },
+                              { 13, 13, 13, 12, 0 },
+                              { 13, 13, 12, 13, 12 },
+                              { 0, 12, 13, 12, 11 },
+                              { 0, 0, 12, 11, 10 }
+                            };
+
+static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 };
+static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 };
+static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 };
+static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 };
+static const int8_t fdct64_range_mult2[12] = { 0,  2,  4,  6,  8,  10,
+                                               11, 11, 11, 11, 11, 11 };
+
+static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 };
+static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 };
+static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 };
+
+static const int8_t max_fwd_range_mult2_col[5] = { 3, 5, 7, 9, 11 };
+
+static const int8_t fidtx4_range_mult2[1] = { 1 };
+static const int8_t fidtx8_range_mult2[1] = { 2 };
+static const int8_t fidtx16_range_mult2[1] = { 3 };
+static const int8_t fidtx32_range_mult2[1] = { 4 };
+
+#if 0
+const int8_t fwd_idtx_range_row[MAX_TXWH_IDX /*txw_idx*/]
+                               [MAX_TXWH_IDX /*txh_idx*/] = { { 2, 4, 5, 0, 0 },
+                                                              { 3, 4, 5, 6, 0 },
+                                                              { 4, 5, 6, 7, 8 },
+                                                              { 0, 5, 6, 7, 8 },
+                                                              { 0, 0, 7, 8,
+                                                                9 } };
+#endif
+
+const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {
+  fdct4_range_mult2,  fdct8_range_mult2,   fdct16_range_mult2,
+  fdct32_range_mult2, fdct64_range_mult2,  fadst4_range_mult2,
+  fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2,
+  fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2
+};
+
+static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) {
+  const int txh_idx = get_txh_idx(cfg->tx_size);
+  av1_zero(cfg->stage_range_col);
+  av1_zero(cfg->stage_range_row);
+
+  if (cfg->txfm_type_col != TXFM_TYPE_INVALID) {
+    int stage_num_col = cfg->stage_num_col;
+    const int8_t *range_mult2_col =
+        fwd_txfm_range_mult2_list[cfg->txfm_type_col];
+    for (int i = 0; i < stage_num_col; ++i)
+      cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1;
+  }
+
+  if (cfg->txfm_type_row != TXFM_TYPE_INVALID) {
+    int stage_num_row = cfg->stage_num_row;
+    const int8_t *range_mult2_row =
+        fwd_txfm_range_mult2_list[cfg->txfm_type_row];
+    for (int i = 0; i < stage_num_row; ++i)
+      cfg->stage_range_row[i] =
+          (max_fwd_range_mult2_col[txh_idx] + range_mult2_row[i] + 1) >> 1;
+  }
+}
+
+void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+                          TXFM_2D_FLIP_CFG *cfg) {
+  assert(cfg != NULL);
+  cfg->tx_size = tx_size;
+  set_flip_cfg(tx_type, cfg);
+  const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+  const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+  const int txw_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
+  const int txh_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0];
+  cfg->shift = fwd_txfm_shift_ls[tx_size];
+  cfg->cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  cfg->cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
+  cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
+  cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
+  cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
+  set_fwd_txfm_non_scale_range(cfg);
+}
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
index 033b4ba1a..1c5bdeb25 100644
--- a/third_party/aom/av1/encoder/av1_quantize.c
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -10,7 +10,9 @@
  */
 
 #include <math.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/quantize.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
@@ -24,413 +26,6 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/rd.h"
 
-#if CONFIG_NEW_QUANT
-static INLINE int quantize_coeff_nuq(
-    const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
-    const int16_t dequant, const tran_low_t *cuml_bins_ptr,
-    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr) {
-  const int coeff = coeffv;
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int i, q;
-  int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-  for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < cuml_bins_ptr[i]) {
-      q = i;
-      break;
-    }
-  }
-  if (i == NUQ_KNOTS) {
-    tmp -= cuml_bins_ptr[NUQ_KNOTS - 1];
-    q = NUQ_KNOTS + (((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16);
-  }
-  if (q) {
-    *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
-    *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
-    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
-  } else {
-    *qcoeff_ptr = 0;
-    *dqcoeff_ptr = 0;
-  }
-  return (q != 0);
-}
-
-static INLINE int quantize_coeff_bigtx_nuq(
-    const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
-    const int16_t dequant, const tran_low_t *cuml_bins_ptr,
-    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, int logsizeby16) {
-  const int coeff = coeffv;
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int i, q;
-  int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-  for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
-      q = i;
-      break;
-    }
-  }
-  if (i == NUQ_KNOTS) {
-    tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16);
-    q = NUQ_KNOTS +
-        (((((tmp * quant) >> 16) + tmp) * quant_shift) >> (16 - logsizeby16));
-  }
-  if (q) {
-    *dqcoeff_ptr = ROUND_POWER_OF_TWO(
-        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
-    // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >>
-    // (logsizeby16);
-    *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
-    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
-  } else {
-    *qcoeff_ptr = 0;
-    *dqcoeff_ptr = 0;
-  }
-  return (q != 0);
-}
-
-static INLINE int quantize_coeff_fp_nuq(
-    const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
-    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
-    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
-  const int coeff = coeffv;
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int i, q;
-  int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-  for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < cuml_bins_ptr[i]) {
-      q = i;
-      break;
-    }
-  }
-  if (i == NUQ_KNOTS) {
-    q = NUQ_KNOTS +
-        ((((int64_t)tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16);
-  }
-  if (q) {
-    *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
-    *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
-    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
-  } else {
-    *qcoeff_ptr = 0;
-    *dqcoeff_ptr = 0;
-  }
-  return (q != 0);
-}
-
-static INLINE int quantize_coeff_bigtx_fp_nuq(
-    const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
-    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
-    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) {
-  const int coeff = coeffv;
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int i, q;
-  int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-  for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
-      q = i;
-      break;
-    }
-  }
-  if (i == NUQ_KNOTS) {
-    q = NUQ_KNOTS +
-        ((((int64_t)tmp -
-           ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) *
-          quant) >>
-         (16 - logsizeby16));
-  }
-  if (q) {
-    *dqcoeff_ptr = ROUND_POWER_OF_TWO(
-        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
-    // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >>
-    // (logsizeby16);
-    *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
-    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
-  } else {
-    *qcoeff_ptr = 0;
-    *dqcoeff_ptr = 0;
-  }
-  return (q != 0);
-}
-
-void quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                     int skip_block, const int16_t quant,
-                     const int16_t quant_shift, const int16_t dequant,
-                     const tran_low_t *cuml_bins_ptr,
-                     const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
-                     tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
-                           cuml_bins_ptr, dequant_val, qcoeff_ptr, dqcoeff_ptr))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                        int skip_block, const int16_t quant,
-                        const int16_t dequant, const tran_low_t *cuml_bins_ptr,
-                        const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
-                        tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant, cuml_bins_ptr,
-                              dequant_val, qcoeff_ptr, dqcoeff_ptr))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                           int skip_block, const int16_t quant,
-                           const int16_t quant_shift, const int16_t dequant,
-                           const tran_low_t *cuml_bins_ptr,
-                           const tran_low_t *dequant_val,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
-                                 cuml_bins_ptr, dequant_val, qcoeff_ptr,
-                                 dqcoeff_ptr, av1_get_tx_scale(TX_32X32)))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                              int skip_block, const int16_t quant,
-                              const int16_t dequant,
-                              const tran_low_t *cuml_bins_ptr,
-                              const tran_low_t *dequant_val,
-                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                              uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant,
-                                    cuml_bins_ptr, dequant_val, qcoeff_ptr,
-                                    dqcoeff_ptr, av1_get_tx_scale(TX_32X32)))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_TX64X64
-void quantize_dc_64x64_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                           int skip_block, const int16_t quant,
-                           const int16_t quant_shift, const int16_t dequant,
-                           const tran_low_t *cuml_bins_ptr,
-                           const tran_low_t *dequant_val,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
-                                 cuml_bins_ptr, dequant_val, qcoeff_ptr,
-                                 dqcoeff_ptr, av1_get_tx_scale(TX_64X64)))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void quantize_dc_64x64_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                              int skip_block, const int16_t quant,
-                              const int16_t dequant,
-                              const tran_low_t *cuml_bins_ptr,
-                              const tran_low_t *dequant_val,
-                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                              uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant,
-                                    cuml_bins_ptr, dequant_val, qcoeff_ptr,
-                                    dqcoeff_ptr, av1_get_tx_scale(TX_64X64)))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif  // CONFIG_TX64X64
-
-void quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                    int skip_block, const int16_t *quant_ptr,
-                    const int16_t *quant_shift_ptr, const int16_t *dequant_ptr,
-                    const cuml_bins_type_nuq *cuml_bins_ptr,
-                    const dequant_val_type_nuq *dequant_val,
-                    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                    uint16_t *eob_ptr, const int16_t *scan,
-                    const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (quantize_coeff_nuq(coeff_ptr[rc], quant_ptr[rc != 0],
-                             quant_shift_ptr[rc != 0], dequant_ptr[rc != 0],
-                             cuml_bins_ptr[band[i]], dequant_val[band[i]],
-                             &qcoeff_ptr[rc], &dqcoeff_ptr[rc]))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-void quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                       int skip_block, const int16_t *quant_ptr,
-                       const int16_t *dequant_ptr,
-                       const cuml_bins_type_nuq *cuml_bins_ptr,
-                       const dequant_val_type_nuq *dequant_val,
-                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                       uint16_t *eob_ptr, const int16_t *scan,
-                       const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant_ptr[rc != 0],
-                                dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
-                                dequant_val[band[i]], &qcoeff_ptr[rc],
-                                &dqcoeff_ptr[rc]))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-void quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                          int skip_block, const int16_t *quant_ptr,
-                          const int16_t *quant_shift_ptr,
-                          const int16_t *dequant_ptr,
-                          const cuml_bins_type_nuq *cuml_bins_ptr,
-                          const dequant_val_type_nuq *dequant_val,
-                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan,
-                          const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (quantize_coeff_bigtx_nuq(
-              coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
-              dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
-              dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
-              av1_get_tx_scale(TX_32X32)))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-void quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *quant_ptr,
-                             const int16_t *dequant_ptr,
-                             const cuml_bins_type_nuq *cuml_bins_ptr,
-                             const dequant_val_type_nuq *dequant_val,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             uint16_t *eob_ptr, const int16_t *scan,
-                             const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (quantize_coeff_bigtx_fp_nuq(
-              coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
-              cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
-              &dqcoeff_ptr[rc], av1_get_tx_scale(TX_32X32)))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_TX64X64
-void quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                          int skip_block, const int16_t *quant_ptr,
-                          const int16_t *quant_shift_ptr,
-                          const int16_t *dequant_ptr,
-                          const cuml_bins_type_nuq *cuml_bins_ptr,
-                          const dequant_val_type_nuq *dequant_val,
-                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan,
-                          const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (quantize_coeff_bigtx_nuq(
-              coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
-              dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
-              dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
-              av1_get_tx_scale(TX_64X64)))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-void quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *quant_ptr,
-                             const int16_t *dequant_ptr,
-                             const cuml_bins_type_nuq *cuml_bins_ptr,
-                             const dequant_val_type_nuq *dequant_val,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             uint16_t *eob_ptr, const int16_t *scan,
-                             const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (quantize_coeff_bigtx_fp_nuq(
-              coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
-              cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
-              &dqcoeff_ptr[rc], av1_get_tx_scale(TX_64X64)))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_NEW_QUANT
-
 void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -439,8 +34,8 @@ void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
 }
 
 static void quantize_fp_helper_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
@@ -450,12 +45,45 @@ static void quantize_fp_helper_c(
   // quantization process is completed.
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)iscan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
+  if (qm_ptr == NULL && iqm_ptr == NULL) {
+    const int rounding0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+    {  // rc == 0
+      const int coeff = coeff_ptr[0];
+      const int coeff_sign = (coeff >> 31);
+      int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      if ((abs_coeff << (1 + log_scale)) >= (int32_t)(dequant_ptr[0])) {
+        abs_coeff = clamp64(abs_coeff + rounding0, INT16_MIN, INT16_MAX);
+        const int tmp32 = (int)((abs_coeff * quant_ptr[0]) >> (16 - log_scale));
+        if (tmp32) {
+          qcoeff_ptr[0] = (tmp32 ^ coeff_sign) - coeff_sign;
+          const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[0]) >> log_scale;
+          dqcoeff_ptr[0] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+          eob = 0;
+        }
+      }
+    }
+    const int rounding1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+    const int32_t thresh1 = (int32_t)(dequant_ptr[1]);
+    for (i = 1; i < n_coeffs; i++) {
+      const int coeff = coeff_ptr[i];
+      const int coeff_sign = (coeff >> 31);
+      int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      if ((abs_coeff << (1 + log_scale)) >= thresh1) {
+        abs_coeff = clamp64(abs_coeff + rounding1, INT16_MIN, INT16_MAX);
+        const int tmp32 = (int)((abs_coeff * quant_ptr[1]) >> (16 - log_scale));
+        if (tmp32) {
+          qcoeff_ptr[i] = (tmp32 ^ coeff_sign) - coeff_sign;
+          const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[1]) >> log_scale;
+          dqcoeff_ptr[i] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+          eob = AOMMAX(iscan[i], eob);
+        }
+      }
+    }
+  } else {
     // Quantization pass: All coefficients with index >= zero_flag are
     // skippable. Note: zero_flag can be zero.
     for (i = 0; i < n_coeffs; i++) {
@@ -476,7 +104,8 @@ static void quantize_fp_helper_c(
         tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
                       (16 - log_scale + AOM_QM_BITS));
         qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale);
+        const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+        dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
       }
 
       if (tmp32) eob = i;
@@ -486,15 +115,14 @@ static void quantize_fp_helper_c(
 }
 
 static void highbd_quantize_fp_helper_c(
-    const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
     const qm_val_t *iqm_ptr, int log_scale) {
   int i;
   int eob = -1;
-  const int scale = 1 << log_scale;
   const int shift = 16 - log_scale;
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
@@ -502,10 +130,7 @@ static void highbd_quantize_fp_helper_c(
   (void)quant_shift_ptr;
   (void)iscan;
 
-  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
+  if (qm_ptr || iqm_ptr) {
     // Quantization pass: All coefficients with index >= zero_flag are
     // skippable. Note: zero_flag can be zero.
     for (i = 0; i < count; i++) {
@@ -517,150 +142,170 @@ static void highbd_quantize_fp_helper_c(
           (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
           AOM_QM_BITS;
       const int coeff_sign = (coeff >> 31);
+      const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      int abs_qcoeff = 0;
+      if (abs_coeff * wt >=
+          (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+        const int64_t tmp =
+            abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+        abs_qcoeff =
+            (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+        dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+        if (abs_qcoeff) eob = i;
+      } else {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+      }
+    }
+  } else {
+    const int log_scaled_round_arr[2] = {
+      ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+      ROUND_POWER_OF_TWO(round_ptr[1], log_scale),
+    };
+    for (i = 0; i < count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int rc01 = (rc != 0);
+      const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp = abs_coeff + (round_ptr[rc != 0] >> log_scale);
-      const int abs_qcoeff =
-          (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / scale;
-      if (abs_qcoeff) eob = i;
+      const int log_scaled_round = log_scaled_round_arr[rc01];
+      if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) {
+        const int quant = quant_ptr[rc01];
+        const int dequant = dequant_ptr[rc01];
+        const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
+        const int abs_qcoeff = (int)((tmp * quant) >> shift);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+        if (abs_qcoeff) eob = i;
+        dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+      } else {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+      }
     }
   }
   *eob_ptr = eob + 1;
 }
 
 void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                       int skip_block, const int16_t *zbin_ptr,
-                       const int16_t *round_ptr, const int16_t *quant_ptr,
-                       const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                       tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                       uint16_t *eob_ptr, const int16_t *scan,
-                       const int16_t *iscan) {
-  quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
-                       quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
-                       dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0);
+                       const int16_t *zbin_ptr, const int16_t *round_ptr,
+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                       const int16_t *scan, const int16_t *iscan) {
+  quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                       eob_ptr, scan, iscan, NULL, NULL, 0);
 }
 
 void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *zbin_ptr,
-                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
                              const int16_t *quant_shift_ptr,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
-  quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
-                       quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
-                       dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1);
+  quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                       eob_ptr, scan, iscan, NULL, NULL, 1);
 }
 
-#if CONFIG_TX64X64
 void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *zbin_ptr,
-                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
                              const int16_t *quant_shift_ptr,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
-  quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
-                       quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
-                       dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2);
+  quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                       eob_ptr, scan, iscan, NULL, NULL, 2);
 }
-#endif  // CONFIG_TX64X64
 
 void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                            const MACROBLOCKD_PLANE *pd,
                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                             const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
-  // obsolete skip_block
-  const int skip_block = 0;
-#if CONFIG_AOM_QM
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
   if (qm_ptr != NULL && iqm_ptr != NULL) {
-    quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
-                         p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                         pd->dequant, eob_ptr, sc->scan, sc->iscan, qm_ptr,
-                         iqm_ptr, qparam->log_scale);
+    quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+                         p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                         dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                         sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
   } else {
-#endif
     switch (qparam->log_scale) {
       case 0:
         if (n_coeffs < 16) {
           // TODO(jingning): Need SIMD implementation for smaller block size
           // quantization.
           quantize_fp_helper_c(
-              coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
-              p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant,
-              eob_ptr, sc->scan, sc->iscan, NULL, NULL, qparam->log_scale);
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+              p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, NULL, NULL, 0);
         } else {
-          av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
-                          p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                          pd->dequant, eob_ptr, sc->scan, sc->iscan);
+          av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+                          p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                          dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                          sc->iscan);
         }
         break;
       case 1:
-        av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                              p->round_fp, p->quant_fp, p->quant_shift,
-                              qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                              sc->scan, sc->iscan);
+        av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                              dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                              sc->iscan);
         break;
-#if CONFIG_TX64X64
       case 2:
-        av1_quantize_fp_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                              p->round_fp, p->quant_fp, p->quant_shift,
-                              qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                              sc->scan, sc->iscan);
+        av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                              dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                              sc->iscan);
         break;
-#endif  // CONFIG_TX64X64
       default: assert(0);
     }
-#if CONFIG_AOM_QM
   }
-#endif
 }
 
 void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                           const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr,
-                           uint16_t *eob_ptr, const SCAN_ORDER *sc,
-                           const QUANT_PARAM *qparam) {
+                           tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                           const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
   // obsolete skip_block
   const int skip_block = 0;
-#if CONFIG_AOM_QM
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
   if (qm_ptr != NULL && iqm_ptr != NULL) {
-    quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
-                        p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                        pd->dequant, eob_ptr, sc->scan, sc->iscan, qm_ptr,
-                        iqm_ptr, qparam->log_scale);
+    quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+                        p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
+                        qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+                        sc->scan, sc->iscan, qm_ptr, iqm_ptr,
+                        qparam->log_scale);
   } else {
-#endif  // CONFIG_AOM_QM
-
     switch (qparam->log_scale) {
       case 0:
-        aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
-                       p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                       pd->dequant, eob_ptr, sc->scan, sc->iscan);
+        aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+                       p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
+                       qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+                       sc->scan, sc->iscan);
         break;
       case 1:
-        aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
-                             p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                             pd->dequant, eob_ptr, sc->scan, sc->iscan);
+        aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+                             p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
+                             qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+                             sc->scan, sc->iscan);
         break;
-#if CONFIG_TX64X64
       case 2:
-        aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
-                             p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                             pd->dequant, eob_ptr, sc->scan, sc->iscan);
+        aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+                             p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
+                             qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+                             sc->scan, sc->iscan);
         break;
-#endif  // CONFIG_TX64X64
       default: assert(0);
     }
-#if CONFIG_AOM_QM
   }
-#endif
 }
 
 static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
@@ -689,7 +334,8 @@ static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
     tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS));
     qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
     dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-    dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale);
+    const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+    dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
     if (tmp32) eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -697,237 +343,97 @@ static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
 
 void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                            const MACROBLOCKD_PLANE *pd,
                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                             const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
   // obsolete skip_block
   const int skip_block = 0;
   (void)sc;
-  assert(qparam->log_scale >= 0 && qparam->log_scale < (2 + CONFIG_TX64X64));
-#if CONFIG_AOM_QM
+  assert(qparam->log_scale >= 0 && qparam->log_scale < (3));
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
-#else
-  const qm_val_t *qm_ptr = NULL;
-  const qm_val_t *iqm_ptr = NULL;
-#endif
-  quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round, p->quant_fp[0],
-              qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr, qm_ptr, iqm_ptr,
-              qparam->log_scale);
-}
-
-#if CONFIG_NEW_QUANT
-void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               const MACROBLOCK_PLANE *p,
-                               tran_low_t *qcoeff_ptr,
-                               const MACROBLOCKD_PLANE *pd,
-                               tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                               const SCAN_ORDER *sc,
-                               const QUANT_PARAM *qparam) {
-  // obsolete skip_block
-  const int skip_block = 0;
-  const uint8_t *band = get_band_translate(qparam->tx_size);
-  int dq = qparam->dq;
-
-  switch (qparam->log_scale) {
-    case 0:
-      quantize_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift,
-                   pd->dequant,
-                   (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                   (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                   qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-    case 1:
-      quantize_32x32_nuq(coeff_ptr, n_coeffs, skip_block, p->quant,
-                         p->quant_shift, pd->dequant,
-                         (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                         (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                         qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-#if CONFIG_TX64X64
-    case 2:
-      quantize_64x64_nuq(coeff_ptr, n_coeffs, skip_block, p->quant,
-                         p->quant_shift, pd->dequant,
-                         (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                         (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                         qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-#endif  // CONFIG_TX64X64
-    default: assert(0);
-  }
-}
-
-void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                const MACROBLOCK_PLANE *p,
-                                tran_low_t *qcoeff_ptr,
-                                const MACROBLOCKD_PLANE *pd,
-                                tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                const SCAN_ORDER *sc,
-                                const QUANT_PARAM *qparam) {
-  // obsolete skip_block
-  const int skip_block = 0;
-  const uint8_t *band = get_band_translate(qparam->tx_size);
-  int dq = qparam->dq;
-
-  switch (qparam->log_scale) {
-    case 0:
-      quantize_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
-                      (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                      (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                      qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-    case 1:
-      quantize_32x32_fp_nuq(
-          coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
-          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
-          dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-#if CONFIG_TX64X64
-    case 2:
-      quantize_64x64_fp_nuq(
-          coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
-          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
-          dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-#endif  // CONFIG_TX64X64
-    default: assert(0);
-  }
+  quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX,
+              p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0],
+              eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale);
 }
 
-void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                const MACROBLOCK_PLANE *p,
-                                tran_low_t *qcoeff_ptr,
-                                const MACROBLOCKD_PLANE *pd,
-                                tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                const SCAN_ORDER *sc,
-                                const QUANT_PARAM *qparam) {
-  // obsolete skip_block
-  const int skip_block = 0;
-  int dq = qparam->dq;
-  (void)sc;
-
-  switch (qparam->log_scale) {
-    case 0:
-      quantize_dc_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0],
-                         pd->dequant[0], p->cuml_bins_nuq[dq][0],
-                         pd->dequant_val_nuq[dq][0], qcoeff_ptr, dqcoeff_ptr,
-                         eob_ptr);
-      break;
-    case 1:
-      quantize_dc_32x32_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0],
-                               pd->dequant[0], p->cuml_bins_nuq[dq][0],
-                               pd->dequant_val_nuq[dq][0], qcoeff_ptr,
-                               dqcoeff_ptr, eob_ptr);
-      break;
-#if CONFIG_TX64X64
-    case 2:
-      quantize_dc_64x64_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0],
-                               pd->dequant[0], p->cuml_bins_nuq[dq][0],
-                               pd->dequant_val_nuq[dq][0], qcoeff_ptr,
-                               dqcoeff_ptr, eob_ptr);
-      break;
-#endif  // CONFIG_TX64X64
-    default: assert(0);
-  }
-}
-#endif  // CONFIG_NEW_QUANT
-
 void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
                                    intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                    tran_low_t *qcoeff_ptr,
-                                   const MACROBLOCKD_PLANE *pd,
                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                                    const SCAN_ORDER *sc,
                                    const QUANT_PARAM *qparam) {
-  // obsolete skip_block
-  const int skip_block = 0;
-#if CONFIG_AOM_QM
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
   if (qm_ptr != NULL && iqm_ptr != NULL) {
     highbd_quantize_fp_helper_c(
-        coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, p->quant_fp,
-        p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
-        sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+        coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+        p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+        sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
   } else {
-#endif  // CONFIG_AOM_QM
-
     if (n_coeffs < 16) {
       // TODO(jingning): Need SIMD implementation for smaller block size
       // quantization.
-      av1_highbd_quantize_fp_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                               p->round_fp, p->quant_fp, p->quant_shift,
-                               qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                               sc->scan, sc->iscan, qparam->log_scale);
+      av1_highbd_quantize_fp_c(
+          coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+          p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+          sc->scan, sc->iscan, qparam->log_scale);
       return;
     }
-
-    av1_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                           p->round_fp, p->quant_fp, p->quant_shift, qcoeff_ptr,
-                           dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
+    av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+                           p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                           dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
                            sc->iscan, qparam->log_scale);
-#if CONFIG_AOM_QM
   }
-#endif
 }
 
 void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                   tran_low_t *qcoeff_ptr,
-                                  const MACROBLOCKD_PLANE *pd,
                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                                   const SCAN_ORDER *sc,
                                   const QUANT_PARAM *qparam) {
   // obsolete skip_block
   const int skip_block = 0;
-#if CONFIG_AOM_QM
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
   if (qm_ptr != NULL && iqm_ptr != NULL) {
-    highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                               p->round, p->quant, p->quant_shift, qcoeff_ptr,
-                               dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
-                               sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+    highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+                               p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
+                               qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+                               sc->scan, sc->iscan, qm_ptr, iqm_ptr,
+                               qparam->log_scale);
   } else {
-#endif  // CONFIG_AOM_QM
-
     switch (qparam->log_scale) {
       case 0:
         if (LIKELY(n_coeffs >= 8)) {
-          aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                                p->round, p->quant, p->quant_shift, qcoeff_ptr,
-                                dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
-                                sc->iscan);
+          aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+                                p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
+                                qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+                                eob_ptr, sc->scan, sc->iscan);
         } else {
           // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
           // quantization
-          aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                                  p->round, p->quant, p->quant_shift,
-                                  qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                                  sc->scan, sc->iscan);
+          aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+                                  p->round_QTX, p->quant_QTX,
+                                  p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+                                  p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
         }
         break;
       case 1:
-        aom_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                                    p->round, p->quant, p->quant_shift,
-                                    qcoeff_ptr, dqcoeff_ptr, pd->dequant,
-                                    eob_ptr, sc->scan, sc->iscan);
+        aom_highbd_quantize_b_32x32(
+            coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX,
+            p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+            p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
         break;
-#if CONFIG_TX64X64
       case 2:
-        aom_highbd_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                                    p->round, p->quant, p->quant_shift,
-                                    qcoeff_ptr, dqcoeff_ptr, pd->dequant,
-                                    eob_ptr, sc->scan, sc->iscan);
+        aom_highbd_quantize_b_64x64(
+            coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX,
+            p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+            p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
         break;
-#endif  // CONFIG_TX64X64
       default: assert(0);
     }
-#if CONFIG_AOM_QM
   }
-#endif
 }
 
 static INLINE void highbd_quantize_dc(
@@ -954,7 +460,8 @@ static INLINE void highbd_quantize_dc(
     const int dequant =
         (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
 
-    dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / (1 << log_scale);
+    const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+    dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
     if (abs_qcoeff) eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -963,550 +470,33 @@ static INLINE void highbd_quantize_dc(
 void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
                                    intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                    tran_low_t *qcoeff_ptr,
-                                   const MACROBLOCKD_PLANE *pd,
                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                                    const SCAN_ORDER *sc,
                                    const QUANT_PARAM *qparam) {
   // obsolete skip_block
   const int skip_block = 0;
-#if CONFIG_AOM_QM
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
-#else
-  const qm_val_t *qm_ptr = NULL;
-  const qm_val_t *iqm_ptr = NULL;
-#endif  // CONFIG_AOM_QM
-
   (void)sc;
 
-  highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
-                     p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
-                     eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale);
-}
-
-#if CONFIG_NEW_QUANT
-static INLINE int highbd_quantize_coeff_nuq(
-    const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
-    const int16_t dequant, const tran_low_t *cuml_bins_ptr,
-    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr) {
-  const int coeff = coeffv;
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int i, q;
-  int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
-  for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < cuml_bins_ptr[i]) {
-      q = i;
-      break;
-    }
-  }
-  if (i == NUQ_KNOTS) {
-    tmp -= cuml_bins_ptr[NUQ_KNOTS - 1];
-    q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16);
-  }
-  if (q) {
-    *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
-    *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
-    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
-  } else {
-    *qcoeff_ptr = 0;
-    *dqcoeff_ptr = 0;
-  }
-  return (q != 0);
-}
-
-static INLINE int highbd_quantize_coeff_fp_nuq(
-    const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
-    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
-    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
-  const int coeff = coeffv;
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int i, q;
-  int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
-  for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < cuml_bins_ptr[i]) {
-      q = i;
-      break;
-    }
-  }
-  if (i == NUQ_KNOTS) {
-    q = NUQ_KNOTS + (int)(((tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16);
-  }
-  if (q) {
-    *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
-    *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
-    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
-  } else {
-    *qcoeff_ptr = 0;
-    *dqcoeff_ptr = 0;
-  }
-  return (q != 0);
-}
-
-static INLINE int highbd_quantize_coeff_bigtx_fp_nuq(
-    const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
-    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
-    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) {
-  const int coeff = coeffv;
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int i, q;
-  int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
-  for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
-      q = i;
-      break;
-    }
-  }
-  if (i == NUQ_KNOTS) {
-    q = NUQ_KNOTS +
-        (int)(((tmp -
-                ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) *
-               quant) >>
-              (16 - logsizeby16));
-  }
-  if (q) {
-    *dqcoeff_ptr = ROUND_POWER_OF_TWO(
-        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
-    *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
-    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
-  } else {
-    *qcoeff_ptr = 0;
-    *dqcoeff_ptr = 0;
-  }
-  return (q != 0);
-}
-
-static INLINE int highbd_quantize_coeff_bigtx_nuq(
-    const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
-    const int16_t dequant, const tran_low_t *cuml_bins_ptr,
-    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, int logsizeby16) {
-  const int coeff = coeffv;
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int i, q;
-  int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
-  for (i = 0; i < NUQ_KNOTS; i++) {
-    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) {
-      q = i;
-      break;
-    }
-  }
-  if (i == NUQ_KNOTS) {
-    tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16);
-    q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >>
-                          (16 - logsizeby16));
-  }
-  if (q) {
-    *dqcoeff_ptr = ROUND_POWER_OF_TWO(
-        av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16);
-    *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
-    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
-  } else {
-    *qcoeff_ptr = 0;
-    *dqcoeff_ptr = 0;
-  }
-  return (q != 0);
-}
-
-void highbd_quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block, const int16_t quant,
-                            const int16_t quant_shift, const int16_t dequant,
-                            const tran_low_t *cuml_bins_ptr,
-                            const tran_low_t *dequant_val,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (highbd_quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
-                                  cuml_bins_ptr, dequant_val, qcoeff_ptr,
-                                  dqcoeff_ptr))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void highbd_quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               int skip_block, const int16_t quant,
-                               const int16_t dequant,
-                               const tran_low_t *cuml_bins_ptr,
-                               const tran_low_t *dequant_val,
-                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                               uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (highbd_quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant,
-                                     cuml_bins_ptr, dequant_val, qcoeff_ptr,
-                                     dqcoeff_ptr))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void highbd_quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                           int skip_block, const int16_t *quant_ptr,
-                           const int16_t *quant_shift_ptr,
-                           const int16_t *dequant_ptr,
-                           const cuml_bins_type_nuq *cuml_bins_ptr,
-                           const dequant_val_type_nuq *dequant_val,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           uint16_t *eob_ptr, const int16_t *scan,
-                           const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (highbd_quantize_coeff_nuq(
-              coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
-              dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
-              dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc]))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-void highbd_quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                 int skip_block, const int16_t *quant_ptr,
-                                 const int16_t *quant_shift_ptr,
-                                 const int16_t *dequant_ptr,
-                                 const cuml_bins_type_nuq *cuml_bins_ptr,
-                                 const dequant_val_type_nuq *dequant_val,
-                                 tran_low_t *qcoeff_ptr,
-                                 tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                 const int16_t *scan, const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (highbd_quantize_coeff_bigtx_nuq(
-              coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
-              dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
-              dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
-              av1_get_tx_scale(TX_32X32)))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-void highbd_quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr,
-                                    intptr_t n_coeffs, int skip_block,
-                                    const int16_t *quant_ptr,
-                                    const int16_t *dequant_ptr,
-                                    const cuml_bins_type_nuq *cuml_bins_ptr,
-                                    const dequant_val_type_nuq *dequant_val,
-                                    tran_low_t *qcoeff_ptr,
-                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                    const int16_t *scan, const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (highbd_quantize_coeff_bigtx_fp_nuq(
-              coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
-              cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
-              &dqcoeff_ptr[rc], av1_get_tx_scale(TX_32X32)))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_TX64X64
-void highbd_quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                 int skip_block, const int16_t *quant_ptr,
-                                 const int16_t *quant_shift_ptr,
-                                 const int16_t *dequant_ptr,
-                                 const cuml_bins_type_nuq *cuml_bins_ptr,
-                                 const dequant_val_type_nuq *dequant_val,
-                                 tran_low_t *qcoeff_ptr,
-                                 tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                 const int16_t *scan, const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (highbd_quantize_coeff_bigtx_nuq(
-              coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
-              dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
-              dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc],
-              av1_get_tx_scale(TX_64X64)))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
+  highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX,
+                     p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr,
+                     p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr,
+                     qparam->log_scale);
 }
 
-void highbd_quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr,
-                                    intptr_t n_coeffs, int skip_block,
-                                    const int16_t *quant_ptr,
-                                    const int16_t *dequant_ptr,
-                                    const cuml_bins_type_nuq *cuml_bins_ptr,
-                                    const dequant_val_type_nuq *dequant_val,
-                                    tran_low_t *qcoeff_ptr,
-                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                    const int16_t *scan, const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (highbd_quantize_coeff_bigtx_fp_nuq(
-              coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
-              cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
-              &dqcoeff_ptr[rc], av1_get_tx_scale(TX_64X64)))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-#endif  // CONFIG_TX64X64
-
-void highbd_quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                              int skip_block, const int16_t *quant_ptr,
-                              const int16_t *dequant_ptr,
-                              const cuml_bins_type_nuq *cuml_bins_ptr,
-                              const dequant_val_type_nuq *dequant_val,
+void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
+                              const int16_t *zbin_ptr, const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                              uint16_t *eob_ptr, const int16_t *scan,
-                              const uint8_t *band) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    int i;
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      if (highbd_quantize_coeff_fp_nuq(
-              coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
-              cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
-              &dqcoeff_ptr[rc]))
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-void highbd_quantize_dc_32x32_nuq(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t quant, const int16_t quant_shift, const int16_t dequant,
-    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
-    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (highbd_quantize_coeff_bigtx_nuq(
-            coeff_ptr[rc], quant, quant_shift, dequant, cuml_bins_ptr,
-            dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_32X32)))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void highbd_quantize_dc_32x32_fp_nuq(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr,
-    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (highbd_quantize_coeff_bigtx_fp_nuq(
-            coeff_ptr[rc], quant, dequant, cuml_bins_ptr, dequant_val,
-            qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_32X32)))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_TX64X64
-void highbd_quantize_dc_64x64_nuq(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t quant, const int16_t quant_shift, const int16_t dequant,
-    const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
-    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (highbd_quantize_coeff_bigtx_nuq(
-            coeff_ptr[rc], quant, quant_shift, dequant, cuml_bins_ptr,
-            dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_64X64)))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void highbd_quantize_dc_64x64_fp_nuq(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr,
-    const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
-  int eob = -1;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-  if (!skip_block) {
-    const int rc = 0;
-    if (highbd_quantize_coeff_bigtx_fp_nuq(
-            coeff_ptr[rc], quant, dequant, cuml_bins_ptr, dequant_val,
-            qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_64X64)))
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif  // CONFIG_TX64X64
-
-void av1_highbd_quantize_b_nuq_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
-    const QUANT_PARAM *qparam) {
-  // obsolete skip_block
-  const int skip_block = 0;
-  const uint8_t *band = get_band_translate(qparam->tx_size);
-  const int dq = qparam->dq;
-
-  switch (qparam->log_scale) {
-    case 0:
-      highbd_quantize_nuq(coeff_ptr, n_coeffs, skip_block, p->quant,
-                          p->quant_shift, pd->dequant,
-                          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-                          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
-                          qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-    case 1:
-      highbd_quantize_32x32_nuq(
-          coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift,
-          pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
-          dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-#if CONFIG_TX64X64
-    case 2:
-      highbd_quantize_64x64_nuq(
-          coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift,
-          pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
-          dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-#endif  // CONFIG_TX64X64
-    default: assert(0);
-  }
-}
-
-void av1_highbd_quantize_fp_nuq_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
-    const QUANT_PARAM *qparam) {
-  // obsolete skip_block
-  const int skip_block = 0;
-  const uint8_t *band = get_band_translate(qparam->tx_size);
-  const int dq = qparam->dq;
-
-  switch (qparam->log_scale) {
-    case 0:
-      highbd_quantize_fp_nuq(
-          coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
-          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
-          dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-    case 1:
-      highbd_quantize_32x32_fp_nuq(
-          coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
-          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
-          dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-#if CONFIG_TX64X64
-    case 2:
-      highbd_quantize_64x64_fp_nuq(
-          coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant,
-          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
-          (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr,
-          dqcoeff_ptr, eob_ptr, sc->scan, band);
-      break;
-#endif  // CONFIG_TX64X64
-    default: assert(0);
-  }
-}
-
-void av1_highbd_quantize_dc_nuq_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
-    const QUANT_PARAM *qparam) {
-  // obsolete skip_block
-  const int skip_block = 0;
-  const int dq = qparam->dq;
-  (void)sc;
-
-  switch (qparam->log_scale) {
-    case 0:
-      highbd_quantize_dc_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0],
-                                pd->dequant[0], p->cuml_bins_nuq[dq][0],
-                                pd->dequant_val_nuq[dq][0], qcoeff_ptr,
-                                dqcoeff_ptr, eob_ptr);
-      break;
-    case 1:
-      highbd_quantize_dc_32x32_fp_nuq(
-          coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], pd->dequant[0],
-          p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0], qcoeff_ptr,
-          dqcoeff_ptr, eob_ptr);
-      break;
-#if CONFIG_TX64X64
-    case 2:
-      highbd_quantize_dc_64x64_fp_nuq(
-          coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], pd->dequant[0],
-          p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0], qcoeff_ptr,
-          dqcoeff_ptr, eob_ptr);
-      break;
-#endif  // CONFIG_TX64X64
-    default: assert(0);
-  }
-}
-#endif  // CONFIG_NEW_QUANT
-
-void av1_highbd_quantize_fp_c(
-    const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan, int log_scale) {
-  highbd_quantize_fp_helper_c(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
-                              quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
-                              NULL, NULL, log_scale);
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan, const int16_t *iscan,
+                              int log_scale) {
+  highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                              quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+                              dequant_ptr, eob_ptr, scan, iscan, NULL, NULL,
+                              log_scale);
 }
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -1520,8 +510,7 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) {
 }
 
 static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) {
-  const int quant = av1_dc_quant(q, 0, bit_depth);
-#if CONFIG_HIGHBITDEPTH
+  const int quant = av1_dc_quant_Q3(q, 0, bit_depth);
   switch (bit_depth) {
     case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
     case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80);
@@ -1530,16 +519,13 @@ static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) {
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
   }
-#else
-  (void)bit_depth;
-  return q == 0 ? 64 : (quant < 148 ? 84 : 80);
-#endif
 }
 
 void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
-                         int uv_dc_delta_q, int uv_ac_delta_q,
-                         QUANTS *const quants, Dequants *const deq) {
-  int i, q, quant;
+                         int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+                         int v_ac_delta_q, QUANTS *const quants,
+                         Dequants *const deq) {
+  int i, q, quant_Q3, quant_QTX;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
     const int qzbin_factor = get_qzbin_factor(q, bit_depth);
@@ -1547,41 +533,51 @@ void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
 
     for (i = 0; i < 2; ++i) {
       int qrounding_factor_fp = 64;
-      // y
-      quant = i == 0 ? av1_dc_quant(q, y_dc_delta_q, bit_depth)
-                     : av1_ac_quant(q, 0, bit_depth);
-      invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
-      quants->y_quant_fp[q][i] = (1 << 16) / quant;
-      quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
-      quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
-      quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
-      deq->y_dequant[q][i] = quant;
-
-      // uv
-      quant = i == 0 ? av1_dc_quant(q, uv_dc_delta_q, bit_depth)
-                     : av1_ac_quant(q, uv_ac_delta_q, bit_depth);
-      invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i],
-                   quant);
-      quants->uv_quant_fp[q][i] = (1 << 16) / quant;
-      quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
-      quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
-      quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
-      deq->uv_dequant[q][i] = quant;
-    }
-
-#if CONFIG_NEW_QUANT
-    int dq;
-    for (dq = 0; dq < QUANT_PROFILES; dq++) {
-      for (i = 0; i < COEF_BANDS; i++) {
-        const int y_quant = deq->y_dequant[q][i != 0];
-        const int uvquant = deq->uv_dequant[q][i != 0];
-        av1_get_dequant_val_nuq(y_quant, i, deq->y_dequant_val_nuq[dq][q][i],
-                                quants->y_cuml_bins_nuq[dq][q][i], dq);
-        av1_get_dequant_val_nuq(uvquant, i, deq->uv_dequant_val_nuq[dq][q][i],
-                                quants->uv_cuml_bins_nuq[dq][q][i], dq);
-      }
+      // y quantizer setup with original coeff shift of Q3
+      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, y_dc_delta_q, bit_depth)
+                        : av1_ac_quant_Q3(q, 0, bit_depth);
+      // y quantizer with TX scale
+      quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth)
+                         : av1_ac_quant_QTX(q, 0, bit_depth);
+      invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i],
+                   quant_QTX);
+      quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX;
+      quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+      quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+      quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+      deq->y_dequant_QTX[q][i] = quant_QTX;
+      deq->y_dequant_Q3[q][i] = quant_Q3;
+
+      // u quantizer setup with original coeff shift of Q3
+      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, u_dc_delta_q, bit_depth)
+                        : av1_ac_quant_Q3(q, u_ac_delta_q, bit_depth);
+      // u quantizer with TX scale
+      quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth)
+                         : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth);
+      invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i],
+                   quant_QTX);
+      quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX;
+      quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+      quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+      quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+      deq->u_dequant_QTX[q][i] = quant_QTX;
+      deq->u_dequant_Q3[q][i] = quant_Q3;
+
+      // v quantizer setup with original coeff shift of Q3
+      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, v_dc_delta_q, bit_depth)
+                        : av1_ac_quant_Q3(q, v_ac_delta_q, bit_depth);
+      // v quantizer with TX scale
+      quant_QTX = i == 0 ? av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth)
+                         : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth);
+      invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i],
+                   quant_QTX);
+      quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX;
+      quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+      quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+      quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+      deq->v_dequant_QTX[q][i] = quant_QTX;
+      deq->v_dequant_Q3[q][i] = quant_Q3;
     }
-#endif  // CONFIG_NEW_QUANT
 
     for (i = 2; i < 8; i++) {  // 8: SIMD width
       quants->y_quant[q][i] = quants->y_quant[q][1];
@@ -1590,15 +586,25 @@ void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
       quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
       quants->y_zbin[q][i] = quants->y_zbin[q][1];
       quants->y_round[q][i] = quants->y_round[q][1];
-      deq->y_dequant[q][i] = deq->y_dequant[q][1];
-
-      quants->uv_quant[q][i] = quants->uv_quant[q][1];
-      quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
-      quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
-      quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
-      quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
-      quants->uv_round[q][i] = quants->uv_round[q][1];
-      deq->uv_dequant[q][i] = deq->uv_dequant[q][1];
+      deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1];
+      deq->y_dequant_Q3[q][i] = deq->y_dequant_Q3[q][1];
+
+      quants->u_quant[q][i] = quants->u_quant[q][1];
+      quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1];
+      quants->u_round_fp[q][i] = quants->u_round_fp[q][1];
+      quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1];
+      quants->u_zbin[q][i] = quants->u_zbin[q][1];
+      quants->u_round[q][i] = quants->u_round[q][1];
+      deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1];
+      deq->u_dequant_Q3[q][i] = deq->u_dequant_Q3[q][1];
+      quants->v_quant[q][i] = quants->u_quant[q][1];
+      quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1];
+      quants->v_round_fp[q][i] = quants->v_round_fp[q][1];
+      quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1];
+      quants->v_zbin[q][i] = quants->v_zbin[q][1];
+      quants->v_round[q][i] = quants->v_round[q][1];
+      deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1];
+      deq->v_dequant_Q3[q][i] = deq->v_dequant_Q3[q][1];
     }
   }
 }
@@ -1607,8 +613,9 @@ void av1_init_quantizer(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   QUANTS *const quants = &cpi->quants;
   Dequants *const dequants = &cpi->dequants;
-  av1_build_quantizer(cm->bit_depth, cm->y_dc_delta_q, cm->uv_dc_delta_q,
-                      cm->uv_ac_delta_q, quants, dequants);
+  av1_build_quantizer(cm->bit_depth, cm->y_dc_delta_q, cm->u_dc_delta_q,
+                      cm->u_ac_delta_q, cm->v_dc_delta_q, cm->v_ac_delta_q,
+                      quants, dequants);
 }
 
 void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
@@ -1617,79 +624,68 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
   MACROBLOCKD *const xd = &x->e_mbd;
   const QUANTS *const quants = &cpi->quants;
 
-#if CONFIG_EXT_DELTA_Q
-  int current_q_index =
-      AOMMAX(0, AOMMIN(QINDEX_RANGE - 1,
-                       cpi->oxcf.deltaq_mode != NO_DELTA_Q
-                           ? cm->base_qindex + xd->delta_qindex
-                           : cm->base_qindex));
-#else
-  int current_q_index = AOMMAX(
-      0, AOMMIN(QINDEX_RANGE - 1,
-                cm->delta_q_present_flag ? cm->base_qindex + xd->delta_qindex
-                                         : cm->base_qindex));
-#endif
-  const int qindex = av1_get_qindex(&cm->seg, segment_id, current_q_index);
+  int current_qindex = AOMMAX(
+      0, AOMMIN(QINDEX_RANGE - 1, cpi->oxcf.deltaq_mode != NO_DELTA_Q
+                                      ? cm->base_qindex + xd->delta_qindex
+                                      : cm->base_qindex));
+  const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
   const int rdmult = av1_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
-  int i;
-#if CONFIG_AOM_QM
-  int minqm = cm->min_qmlevel;
-  int maxqm = cm->max_qmlevel;
-  // Quant matrix only depends on the base QP so there is only one set per frame
   int qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0)
                     ? NUM_QM_LEVELS - 1
-                    : aom_get_qmlevel(cm->base_qindex, minqm, maxqm);
-#endif
-#if CONFIG_NEW_QUANT
-  int dq;
-#endif
+                    : cm->qm_y;
 
   // Y
-  x->plane[0].quant = quants->y_quant[qindex];
-  x->plane[0].quant_fp = quants->y_quant_fp[qindex];
-  x->plane[0].round_fp = quants->y_round_fp[qindex];
-  x->plane[0].quant_shift = quants->y_quant_shift[qindex];
-  x->plane[0].zbin = quants->y_zbin[qindex];
-  x->plane[0].round = quants->y_round[qindex];
-#if CONFIG_AOM_QM
+  x->plane[0].quant_QTX = quants->y_quant[qindex];
+  x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex];
+  x->plane[0].round_fp_QTX = quants->y_round_fp[qindex];
+  x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex];
+  x->plane[0].zbin_QTX = quants->y_zbin[qindex];
+  x->plane[0].round_QTX = quants->y_round[qindex];
+  x->plane[0].dequant_QTX = cpi->dequants.y_dequant_QTX[qindex];
   memcpy(&xd->plane[0].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][0],
          sizeof(cm->gqmatrix[qmlevel][0]));
   memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0],
          sizeof(cm->giqmatrix[qmlevel][0]));
-#endif
-  xd->plane[0].dequant = cpi->dequants.y_dequant[qindex];
-#if CONFIG_NEW_QUANT
-  for (dq = 0; dq < QUANT_PROFILES; dq++) {
-    x->plane[0].cuml_bins_nuq[dq] = quants->y_cuml_bins_nuq[dq][qindex];
-    xd->plane[0].dequant_val_nuq[dq] =
-        cpi->dequants.y_dequant_val_nuq[dq][qindex];
-  }
-#endif  // CONFIG_NEW_QUANT
-
-  // UV
-  for (i = 1; i < 3; i++) {
-    x->plane[i].quant = quants->uv_quant[qindex];
-    x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
-    x->plane[i].round_fp = quants->uv_round_fp[qindex];
-    x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
-    x->plane[i].zbin = quants->uv_zbin[qindex];
-    x->plane[i].round = quants->uv_round[qindex];
-#if CONFIG_AOM_QM
-    memcpy(&xd->plane[i].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1],
+  xd->plane[0].dequant_Q3 = cpi->dequants.y_dequant_Q3[qindex];
+
+  // U
+  qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0)
+                ? NUM_QM_LEVELS - 1
+                : cm->qm_u;
+  {
+    x->plane[1].quant_QTX = quants->u_quant[qindex];
+    x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex];
+    x->plane[1].round_fp_QTX = quants->u_round_fp[qindex];
+    x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex];
+    x->plane[1].zbin_QTX = quants->u_zbin[qindex];
+    x->plane[1].round_QTX = quants->u_round[qindex];
+    x->plane[1].dequant_QTX = cpi->dequants.u_dequant_QTX[qindex];
+    memcpy(&xd->plane[1].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1],
            sizeof(cm->gqmatrix[qmlevel][1]));
-    memcpy(&xd->plane[i].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1],
+    memcpy(&xd->plane[1].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1],
            sizeof(cm->giqmatrix[qmlevel][1]));
-#endif
-    xd->plane[i].dequant = cpi->dequants.uv_dequant[qindex];
-#if CONFIG_NEW_QUANT
-    for (dq = 0; dq < QUANT_PROFILES; dq++) {
-      x->plane[i].cuml_bins_nuq[dq] = quants->uv_cuml_bins_nuq[dq][qindex];
-      xd->plane[i].dequant_val_nuq[dq] =
-          cpi->dequants.uv_dequant_val_nuq[dq][qindex];
-    }
-#endif  // CONFIG_NEW_QUANT
+    x->plane[1].dequant_QTX = cpi->dequants.u_dequant_QTX[qindex];
+    xd->plane[1].dequant_Q3 = cpi->dequants.u_dequant_Q3[qindex];
+  }
+  // V
+  qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0)
+                ? NUM_QM_LEVELS - 1
+                : cm->qm_v;
+  {
+    x->plane[2].quant_QTX = quants->v_quant[qindex];
+    x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex];
+    x->plane[2].round_fp_QTX = quants->v_round_fp[qindex];
+    x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex];
+    x->plane[2].zbin_QTX = quants->v_zbin[qindex];
+    x->plane[2].round_QTX = quants->v_round[qindex];
+    x->plane[2].dequant_QTX = cpi->dequants.v_dequant_QTX[qindex];
+    memcpy(&xd->plane[2].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][2],
+           sizeof(cm->gqmatrix[qmlevel][2]));
+    memcpy(&xd->plane[2].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][2],
+           sizeof(cm->giqmatrix[qmlevel][2]));
+    x->plane[2].dequant_QTX = cpi->dequants.v_dequant_QTX[qindex];
+    xd->plane[2].dequant_Q3 = cpi->dequants.v_dequant_Q3[qindex];
   }
-
   x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
   x->qindex = qindex;
 
@@ -1701,16 +697,27 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
 void av1_frame_init_quantizer(AV1_COMP *cpi) {
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
+  av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
 }
 
 void av1_set_quantizer(AV1_COMMON *cm, int q) {
   // quantizer has to be reinitialized with av1_init_quantizer() if any
   // delta_q changes.
-  cm->base_qindex = q;
+  cm->base_qindex = AOMMAX(cm->delta_q_present_flag, q);
   cm->y_dc_delta_q = 0;
-  cm->uv_dc_delta_q = 0;
-  cm->uv_ac_delta_q = 0;
+  cm->u_dc_delta_q = 0;
+  cm->u_ac_delta_q = 0;
+  cm->v_dc_delta_q = 0;
+  cm->v_ac_delta_q = 0;
+  cm->qm_y = aom_get_qmlevel(cm->base_qindex, cm->min_qmlevel, cm->max_qmlevel);
+  cm->qm_u = aom_get_qmlevel(cm->base_qindex + cm->u_ac_delta_q,
+                             cm->min_qmlevel, cm->max_qmlevel);
+
+  if (!cm->separate_uv_delta_q)
+    cm->qm_v = cm->qm_u;
+  else
+    cm->qm_v = aom_get_qmlevel(cm->base_qindex + cm->v_ac_delta_q,
+                               cm->min_qmlevel, cm->max_qmlevel);
 }
 
 // Table that converts 0-63 Q-range values passed in outside to the Qindex
diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h
index e5fc8b528..eaf8374de 100644
--- a/third_party/aom/av1/encoder/av1_quantize.h
+++ b/third_party/aom/av1/encoder/av1_quantize.h
@@ -12,7 +12,8 @@
 #ifndef AV1_ENCODER_QUANTIZE_H_
 #define AV1_ENCODER_QUANTIZE_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "av1/common/quant_common.h"
 #include "av1/common/scan.h"
 #include "av1/encoder/block.h"
@@ -23,33 +24,22 @@ extern "C" {
 
 typedef struct QUANT_PARAM {
   int log_scale;
-#if CONFIG_NEW_QUANT
   TX_SIZE tx_size;
-  int dq;
-#endif  // CONFIG_NEW_QUANT
-#if CONFIG_AOM_QM
   const qm_val_t *qmatrix;
   const qm_val_t *iqmatrix;
-#endif  // CONFIG_AOM_QM
 } QUANT_PARAM;
 
 typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                  const MACROBLOCK_PLANE *p,
                                  tran_low_t *qcoeff_ptr,
-                                 const MACROBLOCKD_PLANE *pd,
                                  tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                                  const SCAN_ORDER *sc,
                                  const QUANT_PARAM *qparam);
 
+// The QUANTS structure is used only for internal quantizer setup in
+// av1_quantize.c.
+// All of its fields use the same coefficient shift/scaling at TX.
 typedef struct {
-#if CONFIG_NEW_QUANT
-  DECLARE_ALIGNED(
-      16, tran_low_t,
-      y_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]);
-  DECLARE_ALIGNED(
-      16, tran_low_t,
-      uv_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]);
-#endif  // CONFIG_NEW_QUANT
   // 0: dc 1: ac 2-8: ac repeated to SIMD width
   DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
@@ -59,25 +49,36 @@ typedef struct {
   // TODO(jingning): in progress of re-working the quantization. will decide
   // if we want to deprecate the current use of y_quant.
   DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);
-
-  DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]);
+
+  DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]);
 } QUANTS;
 
+// The Dequants structure is used only for internal quantizer setup in
+// av1_quantize.c.
+// Fields are sufffixed according to whether or not they're expressed in
+// the same coefficient shift/precision as TX or a fixed Q3 format.
 typedef struct {
-  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);   // 8: SIMD width
-  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);  // 8: SIMD width
-#if CONFIG_NEW_QUANT
-  DECLARE_ALIGNED(16, dequant_val_type_nuq,
-                  y_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
-  DECLARE_ALIGNED(16, dequant_val_type_nuq,
-                  uv_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
-#endif  // CONFIG_NEW_QUANT
+  DECLARE_ALIGNED(16, int16_t,
+                  y_dequant_QTX[QINDEX_RANGE][8]);  // 8: SIMD width
+  DECLARE_ALIGNED(16, int16_t,
+                  u_dequant_QTX[QINDEX_RANGE][8]);  // 8: SIMD width
+  DECLARE_ALIGNED(16, int16_t,
+                  v_dequant_QTX[QINDEX_RANGE][8]);              // 8: SIMD width
+  DECLARE_ALIGNED(16, int16_t, y_dequant_Q3[QINDEX_RANGE][8]);  // 8: SIMD width
+  DECLARE_ALIGNED(16, int16_t, u_dequant_Q3[QINDEX_RANGE][8]);  // 8: SIMD width
+  DECLARE_ALIGNED(16, int16_t, v_dequant_Q3[QINDEX_RANGE][8]);  // 8: SIMD width
 } Dequants;
 
 struct AV1_COMP;
@@ -89,8 +90,9 @@ void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x,
                                int segment_id);
 
 void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
-                         int uv_dc_delta_q, int uv_ac_delta_q,
-                         QUANTS *const quants, Dequants *const deq);
+                         int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+                         int v_ac_delta_q, QUANTS *const quants,
+                         Dequants *const deq);
 
 void av1_init_quantizer(struct AV1_COMP *cpi);
 
@@ -105,51 +107,22 @@ void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
 
 void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                            const MACROBLOCKD_PLANE *pd,
                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                             const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
 
 void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                           const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr,
-                           uint16_t *eob_ptr, const SCAN_ORDER *sc,
-                           const QUANT_PARAM *qparam);
+                           tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                           const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
 
 void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                            const MACROBLOCKD_PLANE *pd,
                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                             const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
 
-#if CONFIG_NEW_QUANT
-void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                const MACROBLOCK_PLANE *p,
-                                tran_low_t *qcoeff_ptr,
-                                const MACROBLOCKD_PLANE *pd,
-                                tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                const SCAN_ORDER *sc,
-                                const QUANT_PARAM *qparam);
-
-void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               const MACROBLOCK_PLANE *p,
-                               tran_low_t *qcoeff_ptr,
-                               const MACROBLOCKD_PLANE *pd,
-                               tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                               const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
-
-void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                const MACROBLOCK_PLANE *p,
-                                tran_low_t *qcoeff_ptr,
-                                const MACROBLOCKD_PLANE *pd,
-                                tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                const SCAN_ORDER *sc,
-                                const QUANT_PARAM *qparam);
-#endif  // CONFIG_NEW_QUANT
-
 void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
                                    intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                    tran_low_t *qcoeff_ptr,
-                                   const MACROBLOCKD_PLANE *pd,
                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                                    const SCAN_ORDER *sc,
                                    const QUANT_PARAM *qparam);
@@ -157,7 +130,6 @@ void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
 void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                   tran_low_t *qcoeff_ptr,
-                                  const MACROBLOCKD_PLANE *pd,
                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                                   const SCAN_ORDER *sc,
                                   const QUANT_PARAM *qparam);
@@ -165,31 +137,10 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
 void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
                                    intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                    tran_low_t *qcoeff_ptr,
-                                   const MACROBLOCKD_PLANE *pd,
                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                                    const SCAN_ORDER *sc,
                                    const QUANT_PARAM *qparam);
 
-#if CONFIG_NEW_QUANT
-void av1_highbd_quantize_fp_nuq_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
-    const QUANT_PARAM *qparam);
-
-void av1_highbd_quantize_b_nuq_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
-    const QUANT_PARAM *qparam);
-
-void av1_highbd_quantize_dc_nuq_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
-    const QUANT_PARAM *qparam);
-#endif  // CONFIG_NEW_QUANT
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/encoder/bgsprite.c b/third_party/aom/av1/encoder/bgsprite.c
deleted file mode 100644
index ae2cb1d40..000000000
--- a/third_party/aom/av1/encoder/bgsprite.c
+++ /dev/null
@@ -1,1257 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#define _POSIX_C_SOURCE 200112L  // rand_r()
-#include <assert.h>
-#include <float.h>
-#include <limits.h>
-#include <math.h>
-#include <stdlib.h>
-#include <time.h>
-
-#include "av1/encoder/bgsprite.h"
-
-#include "aom_mem/aom_mem.h"
-#include "./aom_scale_rtcd.h"
-#include "av1/common/mv.h"
-#include "av1/common/warped_motion.h"
-#include "av1/encoder/encoder.h"
-#include "av1/encoder/global_motion.h"
-#include "av1/encoder/mathutils.h"
-#include "av1/encoder/temporal_filter.h"
-
-/* Blending Modes:
- * 0 = Median
- * 1 = Mean
- */
-#define BGSPRITE_BLENDING_MODE 1
-
-// Enable removal of outliers from mean blending mode.
-#if BGSPRITE_BLENDING_MODE == 1
-#define BGSPRITE_MEAN_REMOVE_OUTLIERS 0
-#endif  // BGSPRITE_BLENDING_MODE == 1
-
-/* Interpolation for panorama alignment sampling:
- * 0 = Nearest neighbor
- * 1 = Bilinear
- */
-#define BGSPRITE_INTERPOLATION 0
-
-// Enable turning off bgsprite from firstpass metrics in define_gf_group.
-#define BGSPRITE_ENABLE_METRICS 1
-
-// Enable foreground/backgrond segmentation and combine with temporal filter.
-#define BGSPRITE_ENABLE_SEGMENTATION 1
-
-// Enable alignment using global motion.
-#define BGSPRITE_ENABLE_GME 0
-
-// Block size for foreground mask.
-#define BGSPRITE_MASK_BLOCK_SIZE 4
-
-typedef struct {
-#if CONFIG_HIGHBITDEPTH
-  uint16_t y;
-  uint16_t u;
-  uint16_t v;
-#else
-  uint8_t y;
-  uint8_t u;
-  uint8_t v;
-#endif  // CONFIG_HIGHBITDEPTH
-  uint8_t exists;
-} YuvPixel;
-
-typedef struct {
-  int curr_model;
-  double mean[2];
-  double var[2];
-  int age[2];
-  double u_mean[2];
-  double v_mean[2];
-
-#if CONFIG_HIGHBITDEPTH
-  uint16_t y;
-  uint16_t u;
-  uint16_t v;
-#else
-  uint8_t y;
-  uint8_t u;
-  uint8_t v;
-#endif  // CONFIG_HIGHBITDEPTH
-  double final_var;
-} YuvPixelGaussian;
-
-// Maps to convert from matrix form to param vector form.
-static const int params_to_matrix_map[] = { 2, 3, 0, 4, 5, 1, 6, 7 };
-static const int matrix_to_params_map[] = { 2, 5, 0, 1, 3, 4, 6, 7 };
-
-// Convert the parameter array to a 3x3 matrix form.
-static void params_to_matrix(const double *const params, double *target) {
-  for (int i = 0; i < MAX_PARAMDIM - 1; i++) {
-    assert(params_to_matrix_map[i] < MAX_PARAMDIM - 1);
-    target[i] = params[params_to_matrix_map[i]];
-  }
-  target[8] = 1;
-}
-
-// Convert a 3x3 matrix to a parameter array form.
-static void matrix_to_params(const double *const matrix, double *target) {
-  for (int i = 0; i < MAX_PARAMDIM - 1; i++) {
-    assert(matrix_to_params_map[i] < MAX_PARAMDIM - 1);
-    target[i] = matrix[matrix_to_params_map[i]];
-  }
-}
-
-#define TRANSFORM_MAT_DIM 3
-
-// Do matrix multiplication on params.
-static void multiply_params(double *const m1, double *const m2,
-                            double *target) {
-  double m1_matrix[MAX_PARAMDIM];
-  double m2_matrix[MAX_PARAMDIM];
-  double result[MAX_PARAMDIM];
-
-  params_to_matrix(m1, m1_matrix);
-  params_to_matrix(m2, m2_matrix);
-  multiply_mat(m2_matrix, m1_matrix, result, TRANSFORM_MAT_DIM,
-               TRANSFORM_MAT_DIM, TRANSFORM_MAT_DIM);
-  matrix_to_params(result, target);
-}
-
-// Finds x and y limits of a single transformed image.
-// Width and height are the size of the input video.
-static void find_frame_limit(int width, int height,
-                             const double *const transform, int *x_min,
-                             int *x_max, int *y_min, int *y_max) {
-  double transform_matrix[MAX_PARAMDIM];
-  double xy_matrix[3] = { 0, 0, 1 };
-  double uv_matrix[3] = { 0 };
-// Macro used to update frame limits based on transformed coordinates.
-#define UPDATELIMITS(u, v, x_min, x_max, y_min, y_max) \
-  {                                                    \
-    if ((int)ceil(u) > *x_max) {                       \
-      *x_max = (int)ceil(u);                           \
-    }                                                  \
-    if ((int)floor(u) < *x_min) {                      \
-      *x_min = (int)floor(u);                          \
-    }                                                  \
-    if ((int)ceil(v) > *y_max) {                       \
-      *y_max = (int)ceil(v);                           \
-    }                                                  \
-    if ((int)floor(v) < *y_min) {                      \
-      *y_min = (int)floor(v);                          \
-    }                                                  \
-  }
-
-  params_to_matrix(transform, transform_matrix);
-  xy_matrix[0] = 0;
-  xy_matrix[1] = 0;
-  multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM,
-               TRANSFORM_MAT_DIM, 1);
-  *x_max = (int)ceil(uv_matrix[0]);
-  *x_min = (int)floor(uv_matrix[0]);
-  *y_max = (int)ceil(uv_matrix[1]);
-  *y_min = (int)floor(uv_matrix[1]);
-
-  xy_matrix[0] = width - 1;
-  xy_matrix[1] = 0;
-  multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM,
-               TRANSFORM_MAT_DIM, 1);
-  UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max);
-
-  xy_matrix[0] = width - 1;
-  xy_matrix[1] = height - 1;
-  multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM,
-               TRANSFORM_MAT_DIM, 1);
-  UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max);
-
-  xy_matrix[0] = 0;
-  xy_matrix[1] = height - 1;
-  multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM,
-               TRANSFORM_MAT_DIM, 1);
-  UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max);
-
-#undef UPDATELIMITS
-}
-
-// Finds x and y limits for arrays. Also finds the overall max and minimums
-static void find_limits(int width, int height, const double **const params,
-                        int num_frames, int *x_min, int *x_max, int *y_min,
-                        int *y_max, int *pano_x_min, int *pano_x_max,
-                        int *pano_y_min, int *pano_y_max) {
-  *pano_x_max = INT_MIN;
-  *pano_x_min = INT_MAX;
-  *pano_y_max = INT_MIN;
-  *pano_y_min = INT_MAX;
-  for (int i = 0; i < num_frames; ++i) {
-    find_frame_limit(width, height, (const double *const)params[i], &x_min[i],
-                     &x_max[i], &y_min[i], &y_max[i]);
-    if (x_max[i] > *pano_x_max) {
-      *pano_x_max = x_max[i];
-    }
-    if (x_min[i] < *pano_x_min) {
-      *pano_x_min = x_min[i];
-    }
-    if (y_max[i] > *pano_y_max) {
-      *pano_y_max = y_max[i];
-    }
-    if (y_min[i] < *pano_y_min) {
-      *pano_y_min = y_min[i];
-    }
-  }
-}
-
-// Inverts a 3x3 matrix that is in the parameter form.
-static void invert_params(const double *const params, double *target) {
-  double temp[MAX_PARAMDIM] = { 0 };
-  params_to_matrix(params, temp);
-
-  // Find determinant of matrix (expansion by minors).
-  const double det = temp[0] * ((temp[4] * temp[8]) - (temp[5] * temp[7])) -
-                     temp[1] * ((temp[3] * temp[8]) - (temp[5] * temp[6])) +
-                     temp[2] * ((temp[3] * temp[7]) - (temp[4] * temp[6]));
-  assert(det != 0);
-
-  // inverse is transpose of cofactor * 1/det.
-  double inverse[MAX_PARAMDIM] = { 0 };
-  inverse[0] = (temp[4] * temp[8] - temp[7] * temp[5]) / det;
-  inverse[1] = (temp[2] * temp[7] - temp[1] * temp[8]) / det;
-  inverse[2] = (temp[1] * temp[5] - temp[2] * temp[4]) / det;
-  inverse[3] = (temp[5] * temp[6] - temp[3] * temp[8]) / det;
-  inverse[4] = (temp[0] * temp[8] - temp[2] * temp[6]) / det;
-  inverse[5] = (temp[3] * temp[2] - temp[0] * temp[5]) / det;
-  inverse[6] = (temp[3] * temp[7] - temp[6] * temp[4]) / det;
-  inverse[7] = (temp[6] * temp[1] - temp[0] * temp[7]) / det;
-  inverse[8] = (temp[0] * temp[4] - temp[3] * temp[1]) / det;
-
-  matrix_to_params(inverse, target);
-}
-
-static void build_image_stack(YV12_BUFFER_CONFIG **const frames,
-                              const int num_frames, const double **const params,
-                              const int *const x_min, const int *const x_max,
-                              const int *const y_min, const int *const y_max,
-                              int pano_x_min, int pano_y_min,
-                              YuvPixel ***img_stack) {
-  // Re-sample images onto panorama (pre-filtering).
-  const int x_offset = -pano_x_min;
-  const int y_offset = -pano_y_min;
-  const int frame_width = frames[0]->y_width;
-  const int frame_height = frames[0]->y_height;
-  for (int i = 0; i < num_frames; ++i) {
-    // Find transforms from panorama coordinate system back to single image
-    // coordinate system for sampling.
-    int transformed_width = x_max[i] - x_min[i] + 1;
-    int transformed_height = y_max[i] - y_min[i] + 1;
-
-    double transform_matrix[MAX_PARAMDIM];
-    double transform_params[MAX_PARAMDIM - 1];
-    invert_params(params[i], transform_params);
-    params_to_matrix(transform_params, transform_matrix);
-
-#if CONFIG_HIGHBITDEPTH
-    const uint16_t *y_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->y_buffer);
-    const uint16_t *u_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->u_buffer);
-    const uint16_t *v_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->v_buffer);
-#endif  // CONFIG_HIGHBITDEPTH
-
-    for (int y = 0; y < transformed_height; ++y) {
-      for (int x = 0; x < transformed_width; ++x) {
-        // Do transform.
-        double xy_matrix[3] = { x + x_min[i], y + y_min[i], 1 };
-        double uv_matrix[3] = { 0 };
-        multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM,
-                     TRANSFORM_MAT_DIM, 1);
-
-        // Coordinates used for nearest neighbor interpolation.
-        int image_x = (int)round(uv_matrix[0]);
-        int image_y = (int)round(uv_matrix[1]);
-
-        // Temporary values for bilinear interpolation
-        double interpolated_yvalue = 0.0;
-        double interpolated_uvalue = 0.0;
-        double interpolated_vvalue = 0.0;
-        double interpolated_fraction = 0.0;
-        int interpolation_count = 0;
-
-#if BGSPRITE_INTERPOLATION == 1
-        // Coordintes used for bilinear interpolation.
-        double x_base;
-        double y_base;
-        double x_decimal = modf(uv_matrix[0], &x_base);
-        double y_decimal = modf(uv_matrix[1], &y_base);
-
-        if ((x_decimal > 0.2 && x_decimal < 0.8) ||
-            (y_decimal > 0.2 && y_decimal < 0.8)) {
-          for (int u = 0; u < 2; ++u) {
-            for (int v = 0; v < 2; ++v) {
-              int interp_x = (int)x_base + u;
-              int interp_y = (int)y_base + v;
-              if (interp_x >= 0 && interp_x < frame_width && interp_y >= 0 &&
-                  interp_y < frame_height) {
-                interpolation_count++;
-
-                interpolated_fraction +=
-                    fabs(u - x_decimal) * fabs(v - y_decimal);
-                int ychannel_idx = interp_y * frames[i]->y_stride + interp_x;
-                int uvchannel_idx = (interp_y >> frames[i]->subsampling_y) *
-                                        frames[i]->uv_stride +
-                                    (interp_x >> frames[i]->subsampling_x);
-#if CONFIG_HIGHBITDEPTH
-                if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) {
-                  interpolated_yvalue += (1 - fabs(u - x_decimal)) *
-                                         (1 - fabs(v - y_decimal)) *
-                                         y_buffer16[ychannel_idx];
-                  interpolated_uvalue += (1 - fabs(u - x_decimal)) *
-                                         (1 - fabs(v - y_decimal)) *
-                                         u_buffer16[uvchannel_idx];
-                  interpolated_vvalue += (1 - fabs(u - x_decimal)) *
-                                         (1 - fabs(v - y_decimal)) *
-                                         v_buffer16[uvchannel_idx];
-                } else {
-#endif  // CONFIG_HIGHBITDEPTH
-                  interpolated_yvalue += (1 - fabs(u - x_decimal)) *
-                                         (1 - fabs(v - y_decimal)) *
-                                         frames[i]->y_buffer[ychannel_idx];
-                  interpolated_uvalue += (1 - fabs(u - x_decimal)) *
-                                         (1 - fabs(v - y_decimal)) *
-                                         frames[i]->u_buffer[uvchannel_idx];
-                  interpolated_vvalue += (1 - fabs(u - x_decimal)) *
-                                         (1 - fabs(v - y_decimal)) *
-                                         frames[i]->v_buffer[uvchannel_idx];
-#if CONFIG_HIGHBITDEPTH
-                }
-#endif  // CONFIG_HIGHBITDEPTH
-              }
-            }
-          }
-        }
-#endif  // BGSPRITE_INTERPOLATION == 1
-
-        if (BGSPRITE_INTERPOLATION && interpolation_count > 2) {
-          if (interpolation_count != 4) {
-            interpolated_yvalue /= interpolated_fraction;
-            interpolated_uvalue /= interpolated_fraction;
-            interpolated_vvalue /= interpolated_fraction;
-          }
-          int pano_x = x + x_min[i] + x_offset;
-          int pano_y = y + y_min[i] + y_offset;
-
-#if CONFIG_HIGHBITDEPTH
-          if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) {
-            img_stack[pano_y][pano_x][i].y = (uint16_t)interpolated_yvalue;
-            img_stack[pano_y][pano_x][i].u = (uint16_t)interpolated_uvalue;
-            img_stack[pano_y][pano_x][i].v = (uint16_t)interpolated_vvalue;
-            img_stack[pano_y][pano_x][i].exists = 1;
-          } else {
-#endif  // CONFIG_HIGHBITDEPTH
-            img_stack[pano_y][pano_x][i].y = (uint8_t)interpolated_yvalue;
-            img_stack[pano_y][pano_x][i].u = (uint8_t)interpolated_uvalue;
-            img_stack[pano_y][pano_x][i].v = (uint8_t)interpolated_vvalue;
-            img_stack[pano_y][pano_x][i].exists = 1;
-#if CONFIG_HIGHBITDEPTH
-          }
-#endif  // CONFIG_HIGHBITDEPTH
-        } else if (image_x >= 0 && image_x < frame_width && image_y >= 0 &&
-                   image_y < frame_height) {
-          // Place in panorama stack.
-          int pano_x = x + x_min[i] + x_offset;
-          int pano_y = y + y_min[i] + y_offset;
-
-          int ychannel_idx = image_y * frames[i]->y_stride + image_x;
-          int uvchannel_idx =
-              (image_y >> frames[i]->subsampling_y) * frames[i]->uv_stride +
-              (image_x >> frames[i]->subsampling_x);
-#if CONFIG_HIGHBITDEPTH
-          if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) {
-            img_stack[pano_y][pano_x][i].y = y_buffer16[ychannel_idx];
-            img_stack[pano_y][pano_x][i].u = u_buffer16[uvchannel_idx];
-            img_stack[pano_y][pano_x][i].v = v_buffer16[uvchannel_idx];
-            img_stack[pano_y][pano_x][i].exists = 1;
-          } else {
-#endif  // CONFIG_HIGHBITDEPTH
-            img_stack[pano_y][pano_x][i].y = frames[i]->y_buffer[ychannel_idx];
-            img_stack[pano_y][pano_x][i].u = frames[i]->u_buffer[uvchannel_idx];
-            img_stack[pano_y][pano_x][i].v = frames[i]->v_buffer[uvchannel_idx];
-            img_stack[pano_y][pano_x][i].exists = 1;
-#if CONFIG_HIGHBITDEPTH
-          }
-#endif  // CONFIG_HIGHBITDEPTH
-        }
-      }
-    }
-  }
-}
-
-#if BGSPRITE_BLENDING_MODE == 0
-// swaps two YuvPixels.
-static void swap_yuv(YuvPixel *a, YuvPixel *b) {
-  const YuvPixel temp = *b;
-  *b = *a;
-  *a = temp;
-}
-
-// Partitions array to find pivot index in qselect.
-static int partition(YuvPixel arr[], int left, int right, int pivot_idx) {
-  YuvPixel pivot = arr[pivot_idx];
-
-  // Move pivot to the end.
-  swap_yuv(&arr[pivot_idx], &arr[right]);
-
-  int p_idx = left;
-  for (int i = left; i < right; ++i) {
-    if (arr[i].y <= pivot.y) {
-      swap_yuv(&arr[i], &arr[p_idx]);
-      p_idx++;
-    }
-  }
-
-  swap_yuv(&arr[p_idx], &arr[right]);
-
-  return p_idx;
-}
-
-// Returns the kth element in array, partially sorted in place (quickselect).
-static YuvPixel qselect(YuvPixel arr[], int left, int right, int k) {
-  if (left >= right) {
-    return arr[left];
-  }
-  unsigned int seed = (int)time(NULL);
-  int pivot_idx = left + rand_r(&seed) % (right - left + 1);
-  pivot_idx = partition(arr, left, right, pivot_idx);
-
-  if (k == pivot_idx) {
-    return arr[k];
-  } else if (k < pivot_idx) {
-    return qselect(arr, left, pivot_idx - 1, k);
-  } else {
-    return qselect(arr, pivot_idx + 1, right, k);
-  }
-}
-
-// Blends image stack together using a temporal median.
-static void blend_median(const int width, const int height,
-                         const int num_frames, const YuvPixel ***image_stack,
-                         YuvPixel **blended_img) {
-  // Allocate stack of pixels
-  YuvPixel *pixel_stack = aom_calloc(num_frames, sizeof(*pixel_stack));
-
-  // Apply median filtering using quickselect.
-  for (int y = 0; y < height; ++y) {
-    for (int x = 0; x < width; ++x) {
-      int count = 0;
-      for (int i = 0; i < num_frames; ++i) {
-        if (image_stack[y][x][i].exists) {
-          pixel_stack[count] = image_stack[y][x][i];
-          ++count;
-        }
-      }
-      if (count == 0) {
-        // Just make the pixel black.
-        // TODO(toddnguyen): Color the pixel with nearest neighbor
-        blended_img[y][x].exists = 0;
-      } else {
-        const int median_idx = (int)floor(count / 2);
-        YuvPixel median = qselect(pixel_stack, 0, count - 1, median_idx);
-
-        // Make the median value the 0th index for UV subsampling later
-        blended_img[y][x] = median;
-        blended_img[y][x].exists = 1;
-      }
-    }
-  }
-
-  aom_free(pixel_stack);
-}
-#endif  // BGSPRITE_BLENDING_MODE == 0
-
-#if BGSPRITE_BLENDING_MODE == 1
-// Blends image stack together using a temporal mean.
-static void blend_mean(const int width, const int height, const int num_frames,
-                       const YuvPixel ***image_stack, YuvPixel **blended_img,
-                       int highbitdepth) {
-  for (int y = 0; y < height; ++y) {
-    for (int x = 0; x < width; ++x) {
-      // Find
-      uint32_t y_sum = 0;
-      uint32_t u_sum = 0;
-      uint32_t v_sum = 0;
-      uint32_t count = 0;
-      for (int i = 0; i < num_frames; ++i) {
-        if (image_stack[y][x][i].exists) {
-          y_sum += image_stack[y][x][i].y;
-          u_sum += image_stack[y][x][i].u;
-          v_sum += image_stack[y][x][i].v;
-          ++count;
-        }
-      }
-
-#if BGSPRITE_MEAN_REMOVE_OUTLIERS
-      if (count > 1) {
-        double stdev = 0;
-        double y_mean = (double)y_sum / count;
-        for (int i = 0; i < num_frames; ++i) {
-          if (image_stack[y][x][i].exists) {
-            stdev += pow(y_mean - image_stack[y][x][i].y, 2);
-          }
-        }
-        stdev = sqrt(stdev / count);
-
-        uint32_t inlier_y_sum = 0;
-        uint32_t inlier_u_sum = 0;
-        uint32_t inlier_v_sum = 0;
-        uint32_t inlier_count = 0;
-        for (int i = 0; i < num_frames; ++i) {
-          if (image_stack[y][x][i].exists &&
-              fabs(image_stack[y][x][i].y - y_mean) <= 1.5 * stdev) {
-            inlier_y_sum += image_stack[y][x][i].y;
-            inlier_u_sum += image_stack[y][x][i].u;
-            inlier_v_sum += image_stack[y][x][i].v;
-            ++inlier_count;
-          }
-        }
-        count = inlier_count;
-        y_sum = inlier_y_sum;
-        u_sum = inlier_u_sum;
-        v_sum = inlier_v_sum;
-      }
-#endif  // BGSPRITE_MEAN_REMOVE_OUTLIERS
-
-      if (count != 0) {
-        blended_img[y][x].exists = 1;
-#if CONFIG_HIGHBITDEPTH
-        if (highbitdepth) {
-          blended_img[y][x].y = (uint16_t)OD_DIVU(y_sum, count);
-          blended_img[y][x].u = (uint16_t)OD_DIVU(u_sum, count);
-          blended_img[y][x].v = (uint16_t)OD_DIVU(v_sum, count);
-        } else {
-#endif  // CONFIG_HIGHBITDEPTH
-          (void)highbitdepth;
-          blended_img[y][x].y = (uint8_t)OD_DIVU(y_sum, count);
-          blended_img[y][x].u = (uint8_t)OD_DIVU(u_sum, count);
-          blended_img[y][x].v = (uint8_t)OD_DIVU(v_sum, count);
-#if CONFIG_HIGHBITDEPTH
-        }
-#endif  // CONFIG_HIGHBITDEPTH
-      } else {
-        blended_img[y][x].exists = 0;
-      }
-    }
-  }
-}
-#endif  // BGSPRITE_BLENDING_MODE == 1
-
-#if BGSPRITE_ENABLE_SEGMENTATION
-// Builds dual-mode single gaussian model from image stack.
-static void build_gaussian(const YuvPixel ***image_stack, const int num_frames,
-                           const int width, const int height,
-                           const int x_block_width, const int y_block_height,
-                           const int block_size, YuvPixelGaussian **gauss) {
-  const double initial_variance = 10.0;
-  const double s_theta = 2.0;
-
-  // Add images to dual-mode single gaussian model
-  for (int y_block = 0; y_block < y_block_height; ++y_block) {
-    for (int x_block = 0; x_block < x_block_width; ++x_block) {
-      // Process all blocks.
-      YuvPixelGaussian *model = &gauss[y_block][x_block];
-
-      // Process all frames.
-      for (int i = 0; i < num_frames; ++i) {
-        // Add block to the Gaussian model.
-        double max_variance[2] = { 0.0, 0.0 };
-        double temp_y_mean = 0.0;
-        double temp_u_mean = 0.0;
-        double temp_v_mean = 0.0;
-
-        // Find mean/variance of a block of pixels.
-        int temp_count = 0;
-        for (int sub_y = 0; sub_y < block_size; ++sub_y) {
-          for (int sub_x = 0; sub_x < block_size; ++sub_x) {
-            const int y = y_block * block_size + sub_y;
-            const int x = x_block * block_size + sub_x;
-            if (y < height && x < width && image_stack[y][x][i].exists) {
-              ++temp_count;
-              temp_y_mean += (double)image_stack[y][x][i].y;
-              temp_u_mean += (double)image_stack[y][x][i].u;
-              temp_v_mean += (double)image_stack[y][x][i].v;
-
-              const double variance_0 =
-                  pow((double)image_stack[y][x][i].y - model->mean[0], 2);
-              const double variance_1 =
-                  pow((double)image_stack[y][x][i].y - model->mean[1], 2);
-
-              if (variance_0 > max_variance[0]) {
-                max_variance[0] = variance_0;
-              }
-              if (variance_1 > max_variance[1]) {
-                max_variance[1] = variance_1;
-              }
-            }
-          }
-        }
-
-        // If pixels exist in the block, add to the model.
-        if (temp_count > 0) {
-          assert(temp_count <= block_size * block_size);
-          temp_y_mean /= temp_count;
-          temp_u_mean /= temp_count;
-          temp_v_mean /= temp_count;
-
-          // Switch the background model to the oldest model.
-          if (model->age[0] > model->age[1]) {
-            model->curr_model = 0;
-          } else if (model->age[1] > model->age[0]) {
-            model->curr_model = 1;
-          }
-
-          // If model is empty, initialize model.
-          if (model->age[model->curr_model] == 0) {
-            model->mean[model->curr_model] = temp_y_mean;
-            model->u_mean[model->curr_model] = temp_u_mean;
-            model->v_mean[model->curr_model] = temp_v_mean;
-            model->var[model->curr_model] = initial_variance;
-            model->age[model->curr_model] = 1;
-          } else {
-            // Constants for current model and foreground model (0 or 1).
-            const int opposite = 1 - model->curr_model;
-            const int current = model->curr_model;
-            const double j = i;
-
-            // Put block into the appropriate model.
-            if (pow(temp_y_mean - model->mean[current], 2) <
-                s_theta * model->var[current]) {
-              // Add block to the current background model
-              model->age[current] += 1;
-              const double prev_weight = 1 / j;
-              const double curr_weight = (j - 1) / j;
-              model->mean[current] = prev_weight * model->mean[current] +
-                                     curr_weight * temp_y_mean;
-              model->u_mean[current] = prev_weight * model->u_mean[current] +
-                                       curr_weight * temp_u_mean;
-              model->v_mean[current] = prev_weight * model->v_mean[current] +
-                                       curr_weight * temp_v_mean;
-              model->var[current] = prev_weight * model->var[current] +
-                                    curr_weight * max_variance[current];
-            } else {
-              // Block does not fit into current background candidate. Add to
-              // foreground candidate and reinitialize if necessary.
-              const double var_fg = pow(temp_y_mean - model->mean[opposite], 2);
-
-              if (var_fg <= s_theta * model->var[opposite]) {
-                model->age[opposite] += 1;
-                const double prev_weight = 1 / j;
-                const double curr_weight = (j - 1) / j;
-                model->mean[opposite] = prev_weight * model->mean[opposite] +
-                                        curr_weight * temp_y_mean;
-                model->u_mean[opposite] =
-                    prev_weight * model->u_mean[opposite] +
-                    curr_weight * temp_u_mean;
-                model->v_mean[opposite] =
-                    prev_weight * model->v_mean[opposite] +
-                    curr_weight * temp_v_mean;
-                model->var[opposite] = prev_weight * model->var[opposite] +
-                                       curr_weight * max_variance[opposite];
-              } else if (model->age[opposite] == 0 ||
-                         var_fg > s_theta * model->var[opposite]) {
-                model->mean[opposite] = temp_y_mean;
-                model->u_mean[opposite] = temp_u_mean;
-                model->v_mean[opposite] = temp_v_mean;
-                model->var[opposite] = initial_variance;
-                model->age[opposite] = 1;
-              } else {
-                // This case should never happen.
-                assert(0);
-              }
-            }
-          }
-        }
-      }
-
-      // Select the oldest candidate as the background model.
-      if (model->age[0] == 0 && model->age[1] == 0) {
-        model->y = 0;
-        model->u = 0;
-        model->v = 0;
-        model->final_var = 0;
-      } else if (model->age[0] > model->age[1]) {
-        model->y = (uint8_t)model->mean[0];
-        model->u = (uint8_t)model->u_mean[0];
-        model->v = (uint8_t)model->v_mean[0];
-        model->final_var = model->var[0];
-      } else {
-        model->y = (uint8_t)model->mean[1];
-        model->u = (uint8_t)model->u_mean[1];
-        model->v = (uint8_t)model->v_mean[1];
-        model->final_var = model->var[1];
-      }
-    }
-  }
-}
-
-// Builds foreground mask based on reference image and gaussian model.
-// In mask[][], 1 is foreground and 0 is background.
-static void build_mask(const int x_min, const int y_min, const int x_offset,
-                       const int y_offset, const int x_block_width,
-                       const int y_block_height, const int block_size,
-                       const YuvPixelGaussian **gauss,
-                       YV12_BUFFER_CONFIG *const reference,
-                       YV12_BUFFER_CONFIG *const panorama, uint8_t **mask) {
-  const int crop_x_offset = x_min + x_offset;
-  const int crop_y_offset = y_min + y_offset;
-  const double d_theta = 4.0;
-
-  for (int y_block = 0; y_block < y_block_height; ++y_block) {
-    for (int x_block = 0; x_block < x_block_width; ++x_block) {
-      // Create mask to determine if ARF is background for foreground.
-      const YuvPixelGaussian *model = &gauss[y_block][x_block];
-      double temp_y_mean = 0.0;
-      int temp_count = 0;
-
-      for (int sub_y = 0; sub_y < block_size; ++sub_y) {
-        for (int sub_x = 0; sub_x < block_size; ++sub_x) {
-          // x and y are panorama coordinates.
-          const int y = y_block * block_size + sub_y;
-          const int x = x_block * block_size + sub_x;
-
-          const int arf_y = y - crop_y_offset;
-          const int arf_x = x - crop_x_offset;
-
-          if (arf_y >= 0 && arf_y < panorama->y_height && arf_x >= 0 &&
-              arf_x < panorama->y_width) {
-            ++temp_count;
-            const int ychannel_idx = arf_y * panorama->y_stride + arf_x;
-            temp_y_mean += (double)reference->y_buffer[ychannel_idx];
-          }
-        }
-      }
-      if (temp_count > 0) {
-        assert(temp_count <= block_size * block_size);
-        temp_y_mean /= temp_count;
-
-        if (pow(temp_y_mean - model->y, 2) > model->final_var * d_theta) {
-          // Mark block as foreground.
-          mask[y_block][x_block] = 1;
-        }
-      }
-    }
-  }
-}
-#endif  // BGSPRITE_ENABLE_SEGMENTATION
-
-// Resamples blended_img into panorama, including UV subsampling.
-static void resample_panorama(YuvPixel **blended_img, const int center_idx,
-                              const int *const x_min, const int *const y_min,
-                              int pano_x_min, int pano_x_max, int pano_y_min,
-                              int pano_y_max, YV12_BUFFER_CONFIG *panorama) {
-  const int width = pano_x_max - pano_x_min + 1;
-  const int height = pano_y_max - pano_y_min + 1;
-  const int x_offset = -pano_x_min;
-  const int y_offset = -pano_y_min;
-  const int crop_x_offset = x_min[center_idx] + x_offset;
-  const int crop_y_offset = y_min[center_idx] + y_offset;
-#if CONFIG_HIGHBITDEPTH
-  if (panorama->flags & YV12_FLAG_HIGHBITDEPTH) {
-    // Use median Y value.
-    uint16_t *pano_y_buffer16 = CONVERT_TO_SHORTPTR(panorama->y_buffer);
-    uint16_t *pano_u_buffer16 = CONVERT_TO_SHORTPTR(panorama->u_buffer);
-    uint16_t *pano_v_buffer16 = CONVERT_TO_SHORTPTR(panorama->v_buffer);
-
-    for (int y = 0; y < panorama->y_height; ++y) {
-      for (int x = 0; x < panorama->y_width; ++x) {
-        const int ychannel_idx = y * panorama->y_stride + x;
-        if (blended_img[y + crop_y_offset][x + crop_x_offset].exists) {
-          pano_y_buffer16[ychannel_idx] =
-              blended_img[y + crop_y_offset][x + crop_x_offset].y;
-        } else {
-          pano_y_buffer16[ychannel_idx] = 0;
-        }
-      }
-    }
-
-    // UV subsampling with median UV values
-    for (int y = 0; y < panorama->uv_height; ++y) {
-      for (int x = 0; x < panorama->uv_width; ++x) {
-        uint32_t avg_count = 0;
-        uint32_t u_sum = 0;
-        uint32_t v_sum = 0;
-
-        // Look at surrounding pixels for subsampling
-        for (int s_x = 0; s_x < panorama->subsampling_x + 1; ++s_x) {
-          for (int s_y = 0; s_y < panorama->subsampling_y + 1; ++s_y) {
-            int y_sample = crop_y_offset + (y << panorama->subsampling_y) + s_y;
-            int x_sample = crop_x_offset + (x << panorama->subsampling_x) + s_x;
-            if (y_sample > 0 && y_sample < height && x_sample > 0 &&
-                x_sample < width && blended_img[y_sample][x_sample].exists) {
-              u_sum += blended_img[y_sample][x_sample].u;
-              v_sum += blended_img[y_sample][x_sample].v;
-              avg_count++;
-            }
-          }
-        }
-
-        const int uvchannel_idx = y * panorama->uv_stride + x;
-        if (avg_count != 0) {
-          pano_u_buffer16[uvchannel_idx] = (uint16_t)OD_DIVU(u_sum, avg_count);
-          pano_v_buffer16[uvchannel_idx] = (uint16_t)OD_DIVU(v_sum, avg_count);
-        } else {
-          pano_u_buffer16[uvchannel_idx] = 0;
-          pano_v_buffer16[uvchannel_idx] = 0;
-        }
-      }
-    }
-  } else {
-#endif  // CONFIG_HIGHBITDEPTH
-    // Use blended Y value.
-    for (int y = 0; y < panorama->y_height; ++y) {
-      for (int x = 0; x < panorama->y_width; ++x) {
-        const int ychannel_idx = y * panorama->y_stride + x;
-        // Use filtered background.
-        if (blended_img[y + crop_y_offset][x + crop_x_offset].exists) {
-          panorama->y_buffer[ychannel_idx] =
-              blended_img[y + crop_y_offset][x + crop_x_offset].y;
-        } else {
-          panorama->y_buffer[ychannel_idx] = 0;
-        }
-      }
-    }
-
-    // UV subsampling with blended UV values.
-    for (int y = 0; y < panorama->uv_height; ++y) {
-      for (int x = 0; x < panorama->uv_width; ++x) {
-        uint16_t avg_count = 0;
-        uint16_t u_sum = 0;
-        uint16_t v_sum = 0;
-
-        // Look at surrounding pixels for subsampling.
-        for (int s_x = 0; s_x < panorama->subsampling_x + 1; ++s_x) {
-          for (int s_y = 0; s_y < panorama->subsampling_y + 1; ++s_y) {
-            int y_sample = crop_y_offset + (y << panorama->subsampling_y) + s_y;
-            int x_sample = crop_x_offset + (x << panorama->subsampling_x) + s_x;
-            if (y_sample > 0 && y_sample < height && x_sample > 0 &&
-                x_sample < width && blended_img[y_sample][x_sample].exists) {
-              u_sum += blended_img[y_sample][x_sample].u;
-              v_sum += blended_img[y_sample][x_sample].v;
-              avg_count++;
-            }
-          }
-        }
-
-        const int uvchannel_idx = y * panorama->uv_stride + x;
-        if (avg_count != 0) {
-          panorama->u_buffer[uvchannel_idx] =
-              (uint8_t)OD_DIVU(u_sum, avg_count);
-          panorama->v_buffer[uvchannel_idx] =
-              (uint8_t)OD_DIVU(v_sum, avg_count);
-        } else {
-          panorama->u_buffer[uvchannel_idx] = 0;
-          panorama->v_buffer[uvchannel_idx] = 0;
-        }
-      }
-    }
-#if CONFIG_HIGHBITDEPTH
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-}
-
-#if BGSPRITE_ENABLE_SEGMENTATION
-// Combines temporal filter output and bgsprite output to make final ARF output
-static void combine_arf(YV12_BUFFER_CONFIG *const temporal_arf,
-                        YV12_BUFFER_CONFIG *const bgsprite,
-                        uint8_t **const mask, const int block_size,
-                        const int x_offset, const int y_offset,
-                        YV12_BUFFER_CONFIG *target) {
-  const int height = temporal_arf->y_height;
-  const int width = temporal_arf->y_width;
-
-  YuvPixel **blended_img = aom_malloc(height * sizeof(*blended_img));
-  for (int i = 0; i < height; ++i) {
-    blended_img[i] = aom_malloc(width * sizeof(**blended_img));
-  }
-
-  const int block_2_height = (height / BGSPRITE_MASK_BLOCK_SIZE) +
-                             (height % BGSPRITE_MASK_BLOCK_SIZE != 0 ? 1 : 0);
-  const int block_2_width = (width / BGSPRITE_MASK_BLOCK_SIZE) +
-                            (width % BGSPRITE_MASK_BLOCK_SIZE != 0 ? 1 : 0);
-
-  for (int block_y = 0; block_y < block_2_height; ++block_y) {
-    for (int block_x = 0; block_x < block_2_width; ++block_x) {
-      int count = 0;
-      int total = 0;
-      for (int sub_y = 0; sub_y < BGSPRITE_MASK_BLOCK_SIZE; ++sub_y) {
-        for (int sub_x = 0; sub_x < BGSPRITE_MASK_BLOCK_SIZE; ++sub_x) {
-          const int img_y = block_y * BGSPRITE_MASK_BLOCK_SIZE + sub_y;
-          const int img_x = block_x * BGSPRITE_MASK_BLOCK_SIZE + sub_x;
-          const int mask_y = (y_offset + img_y) / block_size;
-          const int mask_x = (x_offset + img_x) / block_size;
-
-          if (img_y < height && img_x < width) {
-            if (mask[mask_y][mask_x]) {
-              ++count;
-            }
-            ++total;
-          }
-        }
-      }
-
-      const double threshold = 0.30;
-      const int amount = (int)(threshold * total);
-      for (int sub_y = 0; sub_y < BGSPRITE_MASK_BLOCK_SIZE; ++sub_y) {
-        for (int sub_x = 0; sub_x < BGSPRITE_MASK_BLOCK_SIZE; ++sub_x) {
-          const int y = block_y * BGSPRITE_MASK_BLOCK_SIZE + sub_y;
-          const int x = block_x * BGSPRITE_MASK_BLOCK_SIZE + sub_x;
-          if (y < height && x < width) {
-            blended_img[y][x].exists = 1;
-            const int ychannel_idx = y * temporal_arf->y_stride + x;
-            const int uvchannel_idx =
-                (y >> temporal_arf->subsampling_y) * temporal_arf->uv_stride +
-                (x >> temporal_arf->subsampling_x);
-
-            if (count > amount) {
-// Foreground; use temporal arf.
-#if CONFIG_HIGHBITDEPTH
-              if (temporal_arf->flags & YV12_FLAG_HIGHBITDEPTH) {
-                uint16_t *pano_y_buffer16 =
-                    CONVERT_TO_SHORTPTR(temporal_arf->y_buffer);
-                uint16_t *pano_u_buffer16 =
-                    CONVERT_TO_SHORTPTR(temporal_arf->u_buffer);
-                uint16_t *pano_v_buffer16 =
-                    CONVERT_TO_SHORTPTR(temporal_arf->v_buffer);
-                blended_img[y][x].y = pano_y_buffer16[ychannel_idx];
-                blended_img[y][x].u = pano_u_buffer16[uvchannel_idx];
-                blended_img[y][x].v = pano_v_buffer16[uvchannel_idx];
-              } else {
-#endif  // CONFIG_HIGHBITDEPTH
-                blended_img[y][x].y = temporal_arf->y_buffer[ychannel_idx];
-                blended_img[y][x].u = temporal_arf->u_buffer[uvchannel_idx];
-                blended_img[y][x].v = temporal_arf->v_buffer[uvchannel_idx];
-#if CONFIG_HIGHBITDEPTH
-              }
-#endif  // CONFIG_HIGHBITDEPTH
-            } else {
-// Background; use bgsprite arf.
-#if CONFIG_HIGHBITDEPTH
-              if (bgsprite->flags & YV12_FLAG_HIGHBITDEPTH) {
-                uint16_t *pano_y_buffer16 =
-                    CONVERT_TO_SHORTPTR(bgsprite->y_buffer);
-                uint16_t *pano_u_buffer16 =
-                    CONVERT_TO_SHORTPTR(bgsprite->u_buffer);
-                uint16_t *pano_v_buffer16 =
-                    CONVERT_TO_SHORTPTR(bgsprite->v_buffer);
-                blended_img[y][x].y = pano_y_buffer16[ychannel_idx];
-                blended_img[y][x].u = pano_u_buffer16[uvchannel_idx];
-                blended_img[y][x].v = pano_v_buffer16[uvchannel_idx];
-              } else {
-#endif  // CONFIG_HIGHBITDEPTH
-                blended_img[y][x].y = bgsprite->y_buffer[ychannel_idx];
-                blended_img[y][x].u = bgsprite->u_buffer[uvchannel_idx];
-                blended_img[y][x].v = bgsprite->v_buffer[uvchannel_idx];
-#if CONFIG_HIGHBITDEPTH
-              }
-#endif  // CONFIG_HIGHBITDEPTH
-            }
-          }
-        }
-      }
-    }
-  }
-
-  const int x_min = 0;
-  const int y_min = 0;
-  resample_panorama(blended_img, 0, &x_min, &y_min, 0, width - 1, 0, height - 1,
-                    target);
-
-  for (int i = 0; i < height; ++i) {
-    aom_free(blended_img[i]);
-  }
-  aom_free(blended_img);
-}
-#endif  // BGSPRITE_ENABLE_SEGMENTATION
-
-// Stitches images together to create ARF and stores it in 'panorama'.
-static void stitch_images(AV1_COMP *cpi, YV12_BUFFER_CONFIG **const frames,
-                          const int num_frames, const int distance,
-                          const int center_idx, const double **const params,
-                          const int *const x_min, const int *const x_max,
-                          const int *const y_min, const int *const y_max,
-                          int pano_x_min, int pano_x_max, int pano_y_min,
-                          int pano_y_max, YV12_BUFFER_CONFIG *panorama) {
-  const int width = pano_x_max - pano_x_min + 1;
-  const int height = pano_y_max - pano_y_min + 1;
-
-  // Create pano_stack[y][x][num_frames] stack of pixel values
-  YuvPixel ***pano_stack = aom_malloc(height * sizeof(*pano_stack));
-  for (int i = 0; i < height; ++i) {
-    pano_stack[i] = aom_malloc(width * sizeof(**pano_stack));
-    for (int j = 0; j < width; ++j) {
-      pano_stack[i][j] = aom_calloc(num_frames, sizeof(***pano_stack));
-    }
-  }
-
-  build_image_stack(frames, num_frames, params, x_min, x_max, y_min, y_max,
-                    pano_x_min, pano_y_min, pano_stack);
-
-  // Create blended_img[y][x] of combined panorama pixel values.
-  YuvPixel **blended_img = aom_malloc(height * sizeof(*blended_img));
-  for (int i = 0; i < height; ++i) {
-    blended_img[i] = aom_malloc(width * sizeof(**blended_img));
-  }
-
-// Blending and saving result in blended_img.
-#if BGSPRITE_BLENDING_MODE == 1
-  blend_mean(width, height, num_frames, (const YuvPixel ***)pano_stack,
-             blended_img, panorama->flags & YV12_FLAG_HIGHBITDEPTH);
-#else   // BGSPRITE_BLENDING_MODE != 1
-  blend_median(width, height, num_frames, (const YuvPixel ***)pano_stack,
-               blended_img);
-#endif  // BGSPRITE_BLENDING_MODE == 1
-
-  // NOTE(toddnguyen): Right now the ARF in the cpi struct is fixed size at
-  // the same size as the frames. For now, we crop the generated panorama.
-  assert(panorama->y_width <= width && panorama->y_height <= height);
-
-  // Resamples the blended_img into the panorama buffer.
-  YV12_BUFFER_CONFIG bgsprite;
-  memset(&bgsprite, 0, sizeof(bgsprite));
-  aom_alloc_frame_buffer(&bgsprite, frames[0]->y_width, frames[0]->y_height,
-                         frames[0]->subsampling_x, frames[0]->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                         frames[0]->flags & YV12_FLAG_HIGHBITDEPTH,
-#endif
-                         frames[0]->border, 0);
-  aom_yv12_copy_frame(frames[0], &bgsprite);
-  bgsprite.bit_depth = frames[0]->bit_depth;
-  resample_panorama(blended_img, center_idx, x_min, y_min, pano_x_min,
-                    pano_x_max, pano_y_min, pano_y_max, &bgsprite);
-
-#if BGSPRITE_ENABLE_SEGMENTATION
-  YV12_BUFFER_CONFIG temporal_bgsprite;
-  memset(&temporal_bgsprite, 0, sizeof(temporal_bgsprite));
-  aom_alloc_frame_buffer(&temporal_bgsprite, frames[0]->y_width,
-                         frames[0]->y_height, frames[0]->subsampling_x,
-                         frames[0]->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                         frames[0]->flags & YV12_FLAG_HIGHBITDEPTH,
-#endif
-                         frames[0]->border, 0);
-  aom_yv12_copy_frame(frames[0], &temporal_bgsprite);
-  temporal_bgsprite.bit_depth = frames[0]->bit_depth;
-
-  av1_temporal_filter(cpi, &bgsprite, &temporal_bgsprite, distance);
-
-  // Block size constants for gaussian model.
-  const int N_1 = 2;
-  const int y_block_height = (height / N_1) + (height % N_1 != 0 ? 1 : 0);
-  const int x_block_width = (width / N_1) + (height % N_1 != 0 ? 1 : 0);
-  YuvPixelGaussian **gauss = aom_malloc(y_block_height * sizeof(*gauss));
-  for (int i = 0; i < y_block_height; ++i) {
-    gauss[i] = aom_calloc(x_block_width, sizeof(**gauss));
-  }
-
-  // Build Gaussian model.
-  build_gaussian((const YuvPixel ***)pano_stack, num_frames, width, height,
-                 x_block_width, y_block_height, N_1, gauss);
-
-  // Select background model and build foreground mask.
-  uint8_t **mask = aom_malloc(y_block_height * sizeof(*mask));
-  for (int i = 0; i < y_block_height; ++i) {
-    mask[i] = aom_calloc(x_block_width, sizeof(**mask));
-  }
-
-  const int x_offset = -pano_x_min;
-  const int y_offset = -pano_y_min;
-  build_mask(x_min[center_idx], y_min[center_idx], x_offset, y_offset,
-             x_block_width, y_block_height, N_1,
-             (const YuvPixelGaussian **)gauss,
-             (YV12_BUFFER_CONFIG * const) frames[center_idx], panorama, mask);
-
-  YV12_BUFFER_CONFIG temporal_arf;
-  memset(&temporal_arf, 0, sizeof(temporal_arf));
-  aom_alloc_frame_buffer(&temporal_arf, frames[0]->y_width, frames[0]->y_height,
-                         frames[0]->subsampling_x, frames[0]->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                         frames[0]->flags & YV12_FLAG_HIGHBITDEPTH,
-#endif
-                         frames[0]->border, 0);
-  aom_yv12_copy_frame(frames[0], &temporal_arf);
-  temporal_arf.bit_depth = frames[0]->bit_depth;
-  av1_temporal_filter(cpi, NULL, &temporal_arf, distance);
-
-  combine_arf(&temporal_arf, &temporal_bgsprite, mask, N_1, x_offset, y_offset,
-              panorama);
-
-  aom_free_frame_buffer(&temporal_arf);
-  aom_free_frame_buffer(&temporal_bgsprite);
-  for (int i = 0; i < y_block_height; ++i) {
-    aom_free(gauss[i]);
-    aom_free(mask[i]);
-  }
-  aom_free(gauss);
-  aom_free(mask);
-#else   // !BGSPRITE_ENABLE_SEGMENTATION
-  av1_temporal_filter(cpi, &bgsprite, panorama, distance);
-#endif  // BGSPRITE_ENABLE_SEGMENTATION
-
-  aom_free_frame_buffer(&bgsprite);
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; ++j) {
-      aom_free(pano_stack[i][j]);
-    }
-    aom_free(pano_stack[i]);
-    aom_free(blended_img[i]);
-  }
-  aom_free(pano_stack);
-  aom_free(blended_img);
-}
-
-int av1_background_sprite(AV1_COMP *cpi, int distance) {
-#if BGSPRITE_ENABLE_METRICS
-  // Do temporal filter if firstpass stats disable bgsprite.
-  if (!cpi->bgsprite_allowed) {
-    return 1;
-  }
-#endif  // BGSPRITE_ENABLE_METRICS
-
-  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
-  static const double identity_params[MAX_PARAMDIM - 1] = {
-    0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
-  };
-
-  const int frames_after_arf =
-      av1_lookahead_depth(cpi->lookahead) - distance - 1;
-  int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
-  int frames_bwd;
-
-  // Define the forward and backwards filter limits for this arnr group.
-  if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf;
-  if (frames_fwd > distance) frames_fwd = distance;
-  frames_bwd = frames_fwd;
-
-#if CONFIG_EXT_REFS
-  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-  if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) {
-    cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 1;
-    frames_fwd = 0;
-    frames_bwd = 0;
-  } else {
-    cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 0;
-  }
-#endif  // CONFIG_EXT_REFS
-
-  const int start_frame = distance + frames_fwd;
-  const int frames_to_stitch = frames_bwd + 1 + frames_fwd;
-
-  // Get frames to be included in background sprite.
-  for (int frame = 0; frame < frames_to_stitch; ++frame) {
-    const int which_buffer = start_frame - frame;
-    struct lookahead_entry *buf =
-        av1_lookahead_peek(cpi->lookahead, which_buffer);
-    frames[frames_to_stitch - 1 - frame] = &buf->img;
-  }
-
-  // Allocate empty arrays for parameters between frames.
-  double **params = aom_malloc(frames_to_stitch * sizeof(*params));
-  for (int i = 0; i < frames_to_stitch; ++i) {
-    params[i] = aom_malloc(sizeof(identity_params));
-    memcpy(params[i], identity_params, sizeof(identity_params));
-  }
-
-// Use global motion to find affine transformations between frames.
-// params[i] will have the transform from frame[i] to frame[i-1].
-// params[0] will have the identity matrix (has no previous frame).
-#if BGSPRITE_ENABLE_GME
-  TransformationType model = AFFINE;
-  int inliers_by_motion[RANSAC_NUM_MOTIONS];
-  for (int frame = 0; frame < frames_to_stitch - 1; ++frame) {
-    const int global_motion_ret = compute_global_motion_feature_based(
-        model, frames[frame + 1], frames[frame],
-#if CONFIG_HIGHBITDEPTH
-        cpi->common.bit_depth,
-#endif  // CONFIG_HIGHBITDEPTH
-        inliers_by_motion, params[frame + 1], RANSAC_NUM_MOTIONS);
-
-    // Quit if global motion had an error.
-    if (global_motion_ret == 0) {
-      for (int i = 0; i < frames_to_stitch; ++i) {
-        aom_free(params[i]);
-      }
-      aom_free(params);
-      return 1;
-    }
-  }
-#endif  // BGSPRITE_ENABLE_GME
-
-  // Compound the transformation parameters.
-  for (int i = 1; i < frames_to_stitch; ++i) {
-    multiply_params(params[i - 1], params[i], params[i]);
-  }
-
-  // Compute frame limits for final stitched images.
-  int pano_x_max = INT_MIN;
-  int pano_x_min = INT_MAX;
-  int pano_y_max = INT_MIN;
-  int pano_y_min = INT_MAX;
-  int *x_max = aom_malloc(frames_to_stitch * sizeof(*x_max));
-  int *x_min = aom_malloc(frames_to_stitch * sizeof(*x_min));
-  int *y_max = aom_malloc(frames_to_stitch * sizeof(*y_max));
-  int *y_min = aom_malloc(frames_to_stitch * sizeof(*y_min));
-
-  find_limits(frames[0]->y_width, frames[0]->y_height,
-              (const double **const)params, frames_to_stitch, x_min, x_max,
-              y_min, y_max, &pano_x_min, &pano_x_max, &pano_y_min, &pano_y_max);
-
-  // Center panorama on the ARF.
-  const int center_idx = frames_bwd;
-  assert(center_idx >= 0 && center_idx < frames_to_stitch);
-
-  // Recompute transformations to adjust to center image.
-  // Invert center image's transform.
-  double inverse[MAX_PARAMDIM - 1] = { 0 };
-  invert_params(params[center_idx], inverse);
-
-  // Multiply the inverse to all transformation parameters.
-  for (int i = 0; i < frames_to_stitch; ++i) {
-    multiply_params(inverse, params[i], params[i]);
-  }
-
-  // Recompute frame limits for new adjusted center.
-  find_limits(frames[0]->y_width, frames[0]->y_height,
-              (const double **const)params, frames_to_stitch, x_min, x_max,
-              y_min, y_max, &pano_x_min, &pano_x_max, &pano_y_min, &pano_y_max);
-
-  // Stitch Images and apply bgsprite filter.
-  stitch_images(cpi, frames, frames_to_stitch, distance, center_idx,
-                (const double **const)params, x_min, x_max, y_min, y_max,
-                pano_x_min, pano_x_max, pano_y_min, pano_y_max,
-                &cpi->alt_ref_buffer);
-
-  // Free memory.
-  for (int i = 0; i < frames_to_stitch; ++i) {
-    aom_free(params[i]);
-  }
-  aom_free(params);
-  aom_free(x_max);
-  aom_free(x_min);
-  aom_free(y_max);
-  aom_free(y_min);
-
-  return 0;
-}
-
-#undef _POSIX_C_SOURCE
-#undef BGSPRITE_BLENDING_MODE
-#undef BGSPRITE_INTERPOLATION
-#undef BGSPRITE_ENABLE_METRICS
-#undef BGSPRITE_ENABLE_SEGMENTATION
-#undef BGSPRITE_ENABLE_GME
-#undef BGSPRITE_MASK_BLOCK_SIZE
-#undef TRANSFORM_MAT_DIM
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
index 08f605f10..cdd7c2492 100644
--- a/third_party/aom/av1/encoder/bitstream.c
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -24,9 +24,8 @@
 #include "aom_util/debug_util.h"
 #endif  // CONFIG_BITSTREAM_DEBUG
 
-#if CONFIG_CDEF
 #include "av1/common/cdef.h"
-#endif  // CONFIG_CDEF
+#include "av1/common/cfl.h"
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
 #include "av1/common/entropymv.h"
@@ -34,38 +33,21 @@
 #include "av1/common/odintrin.h"
 #include "av1/common/pred_common.h"
 #include "av1/common/reconinter.h"
-#if CONFIG_EXT_INTRA
 #include "av1/common/reconintra.h"
-#endif  // CONFIG_EXT_INTRA
 #include "av1/common/seg_common.h"
 #include "av1/common/tile_common.h"
 
-#if CONFIG_LV_MAP
-#include "av1/encoder/encodetxb.h"
-#endif  // CONFIG_LV_MAP
 #include "av1/encoder/bitstream.h"
 #include "av1/encoder/cost.h"
 #include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
 #include "av1/encoder/mcomp.h"
-#if CONFIG_PALETTE_DELTA_ENCODING
 #include "av1/encoder/palette.h"
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 #include "av1/encoder/segmentation.h"
-#include "av1/encoder/subexp.h"
 #include "av1/encoder/tokenize.h"
-#if CONFIG_PVQ
-#include "av1/encoder/pvq_encoder.h"
-#endif
 
 #define ENC_MISMATCH_DEBUG 0
 
-#if CONFIG_COMPOUND_SINGLEREF
-static struct av1_token
-    inter_singleref_comp_mode_encodings[INTER_SINGLEREF_COMP_MODES];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-// TODO(anybody) : remove this flag when PVQ supports pallete coding tool
-#if !CONFIG_PVQ || CONFIG_EXT_INTRA
 static INLINE void write_uniform(aom_writer *w, int n, int v) {
   const int l = get_unsigned_bits(n);
   const int m = (1 << l) - n;
@@ -77,110 +59,38 @@ static INLINE void write_uniform(aom_writer *w, int n, int v) {
     aom_write_literal(w, (v - m) & 1, 1);
   }
 }
-#endif  // !CONFIG_PVQ || CONFIG_EXT_INTRA
-
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-static struct av1_token intra_filter_encodings[INTRA_FILTERS];
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_INTERINTRA
-static struct av1_token interintra_mode_encodings[INTERINTRA_MODES];
-#endif
-#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-static struct av1_token compound_type_encodings[COMPOUND_TYPES];
-#endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-#if CONFIG_LOOP_RESTORATION
-static struct av1_token switchable_restore_encodings[RESTORE_SWITCHABLE_TYPES];
+
 static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
                                              MACROBLOCKD *xd,
+                                             const RestorationUnitInfo *rui,
                                              aom_writer *const w, int plane,
-                                             int rtile_idx);
-#endif  // CONFIG_LOOP_RESTORATION
-#if CONFIG_OBU
-static void write_uncompressed_header_obu(AV1_COMP *cpi,
-                                          struct aom_write_bit_buffer *wb);
-#else
-static void write_uncompressed_header_frame(AV1_COMP *cpi,
-                                            struct aom_write_bit_buffer *wb);
-#endif
-
-static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data);
-
-#if !CONFIG_OBU || CONFIG_EXT_TILE
-static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
-                       const uint32_t data_size, const uint32_t max_tile_size,
-                       const uint32_t max_tile_col_size,
-                       int *const tile_size_bytes,
-                       int *const tile_col_size_bytes);
-#endif
-void av1_encode_token_init(void) {
-#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-  av1_tokens_from_tree(intra_filter_encodings, av1_intra_filter_tree);
-#endif  // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-#if CONFIG_INTERINTRA
-  av1_tokens_from_tree(interintra_mode_encodings, av1_interintra_mode_tree);
-#endif  // CONFIG_INTERINTRA
-#if CONFIG_COMPOUND_SINGLEREF
-  av1_tokens_from_tree(inter_singleref_comp_mode_encodings,
-                       av1_inter_singleref_comp_mode_tree);
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-  av1_tokens_from_tree(compound_type_encodings, av1_compound_type_tree);
-#endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-#if CONFIG_LOOP_RESTORATION
-  av1_tokens_from_tree(switchable_restore_encodings,
-                       av1_switchable_restore_tree);
-#endif  // CONFIG_LOOP_RESTORATION
-}
+                                             FRAME_COUNTS *counts);
 
-static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx,
-                                const MODE_INFO *mi, const MODE_INFO *above_mi,
-                                const MODE_INFO *left_mi, int block,
+static void write_intra_mode_kf(FRAME_CONTEXT *frame_ctx,
+                                const MB_MODE_INFO *mi,
+                                const MB_MODE_INFO *above_mi,
+                                const MB_MODE_INFO *left_mi,
                                 PREDICTION_MODE mode, aom_writer *w) {
-#if CONFIG_INTRABC
-  assert(!is_intrabc_block(&mi->mbmi));
-#endif  // CONFIG_INTRABC
-  aom_write_symbol(w, mode,
-                   get_y_mode_cdf(frame_ctx, mi, above_mi, left_mi, block),
+  assert(!is_intrabc_block(mi));
+  (void)mi;
+  aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),
                    INTRA_MODES);
-  (void)cm;
 }
 
 static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
                              FRAME_CONTEXT *ec_ctx, const int16_t mode_ctx) {
   const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
 
-#if CONFIG_NEW_MULTISYMBOL
   aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2);
-#else
-  aom_write(w, mode != NEWMV, ec_ctx->newmv_prob[newmv_ctx]);
-#endif
 
   if (mode != NEWMV) {
-    if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
-      assert(mode == ZEROMV);
-      return;
-    }
+    const int16_t zeromv_ctx =
+        (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+    aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2);
 
-    const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
-#if CONFIG_NEW_MULTISYMBOL
-    aom_write_symbol(w, mode != ZEROMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2);
-#else
-    aom_write(w, mode != ZEROMV, ec_ctx->zeromv_prob[zeromv_ctx]);
-#endif
-
-    if (mode != ZEROMV) {
+    if (mode != GLOBALMV) {
       int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
-
-      if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6;
-      if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7;
-      if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8;
-#if CONFIG_NEW_MULTISYMBOL
       aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2);
-#else
-      aom_write(w, mode != NEARESTMV, ec_ctx->refmv_prob[refmv_ctx]);
-#endif
     }
   }
 }
@@ -191,24 +101,16 @@ static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
 
   assert(mbmi->ref_mv_idx < 3);
 
-#if CONFIG_COMPOUND_SINGLEREF
-  if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
-      mbmi->mode == SR_NEW_NEWMV) {
-#else   // !CONFIG_COMPOUND_SINGLEREF
-  if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
-#endif  // CONFIG_COMPOUND_SINGLEREF
+  const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+  if (new_mv) {
     int idx;
     for (idx = 0; idx < 2; ++idx) {
       if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx =
             av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
 
-#if CONFIG_NEW_MULTISYMBOL
         aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx],
                          2);
-#else
-        aom_write(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_prob[drl_ctx]);
-#endif
         if (mbmi->ref_mv_idx == idx) return;
       }
     }
@@ -222,12 +124,8 @@ static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
       if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx =
             av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
-#if CONFIG_NEW_MULTISYMBOL
         aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1),
                          ec_ctx->drl_cdf[drl_ctx], 2);
-#else
-        aom_write(w, mbmi->ref_mv_idx != (idx - 1), ec_ctx->drl_prob[drl_ctx]);
-#endif
         if (mbmi->ref_mv_idx == (idx - 1)) return;
       }
     }
@@ -235,52 +133,22 @@ static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
   }
 }
 
-static void write_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                      aom_writer *w, PREDICTION_MODE mode,
+static void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,
+                                      PREDICTION_MODE mode,
                                       const int16_t mode_ctx) {
   assert(is_inter_compound_mode(mode));
-  (void)cm;
   aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode),
                    xd->tile_ctx->inter_compound_mode_cdf[mode_ctx],
                    INTER_COMPOUND_MODES);
 }
 
-#if CONFIG_COMPOUND_SINGLEREF
-static void write_inter_singleref_comp_mode(MACROBLOCKD *xd, aom_writer *w,
-                                            PREDICTION_MODE mode,
-                                            const int16_t mode_ctx) {
-  assert(is_inter_singleref_comp_mode(mode));
-  aom_cdf_prob *const inter_singleref_comp_cdf =
-      xd->tile_ctx->inter_singleref_comp_mode_cdf[mode_ctx];
-
-  aom_write_symbol(w, INTER_SINGLEREF_COMP_OFFSET(mode),
-                   inter_singleref_comp_cdf, INTER_SINGLEREF_COMP_MODES);
-}
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-static void encode_unsigned_max(struct aom_write_bit_buffer *wb, int data,
-                                int max) {
-  aom_wb_write_literal(wb, data, get_unsigned_bits(max));
-}
-
-#if CONFIG_VAR_TX
-static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                const MB_MODE_INFO *mbmi, TX_SIZE tx_size,
-                                int depth, int blk_row, int blk_col,
-                                aom_writer *w) {
-#if CONFIG_NEW_MULTISYMBOL
+static void write_tx_size_vartx(MACROBLOCKD *xd, const MB_MODE_INFO *mbmi,
+                                TX_SIZE tx_size, int depth, int blk_row,
+                                int blk_col, aom_writer *w) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  (void)cm;
-#endif
-  const int tx_row = blk_row >> 1;
-  const int tx_col = blk_col >> 1;
   const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
   const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
 
-  int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
-                                   xd->left_txfm_context + blk_row,
-                                   mbmi->sb_type, tx_size);
-
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   if (depth == MAX_VARTX_DEPTH) {
@@ -289,31 +157,25 @@ static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd,
     return;
   }
 
-#if CONFIG_RECT_TX_EXT
-  if (tx_size == mbmi->inter_tx_size[tx_row][tx_col] ||
-      mbmi->tx_size == quarter_txsize_lookup[mbmi->sb_type]) {
-#else
-  if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) {
-#endif
-#if CONFIG_NEW_MULTISYMBOL
+  const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+                                         xd->left_txfm_context + blk_row,
+                                         mbmi->sb_type, tx_size);
+  const int txb_size_index =
+      av1_get_txb_size_index(mbmi->sb_type, blk_row, blk_col);
+  const int write_txfm_partition =
+      tx_size == mbmi->inter_tx_size[txb_size_index];
+  if (write_txfm_partition) {
     aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2);
-#else
-    aom_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
-#endif
 
     txfm_partition_update(xd->above_txfm_context + blk_col,
                           xd->left_txfm_context + blk_row, tx_size, tx_size);
     // TODO(yuec): set correct txfm partition update for qttx
   } else {
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsl = tx_size_wide_unit[sub_txs];
-    int i;
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
 
-#if CONFIG_NEW_MULTISYMBOL
     aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2);
-#else
-    aom_write(w, 1, cm->fc->txfm_partition_prob[ctx]);
-#endif
 
     if (sub_txs == TX_4X4) {
       txfm_partition_update(xd->above_txfm_context + blk_col,
@@ -321,185 +183,115 @@ static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd,
       return;
     }
 
-    assert(bsl > 0);
-    for (i = 0; i < 4; ++i) {
-      int offsetr = blk_row + (i >> 1) * bsl;
-      int offsetc = blk_col + (i & 0x01) * bsl;
-      write_tx_size_vartx(cm, xd, mbmi, sub_txs, depth + 1, offsetr, offsetc,
-                          w);
-    }
+    assert(bsw > 0 && bsh > 0);
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh)
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        int offsetr = blk_row + row;
+        int offsetc = blk_col + col;
+        write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w);
+      }
   }
 }
 
-#if !CONFIG_NEW_MULTISYMBOL
-static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w,
-                                        FRAME_COUNTS *counts, int probwt) {
-  int k;
-  for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
-    av1_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[k],
-                              counts->txfm_partition[k], probwt);
-}
-#endif  // CONFIG_NEW_MULTISYMBOL
-#endif  // CONFIG_VAR_TX
-
-static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                                   aom_writer *w) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+static void write_selected_tx_size(const MACROBLOCKD *xd, aom_writer *w) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   const BLOCK_SIZE bsize = mbmi->sb_type;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  (void)cm;
   if (block_signals_txsize(bsize)) {
     const TX_SIZE tx_size = mbmi->tx_size;
-    const int is_inter = is_inter_block(mbmi);
     const int tx_size_ctx = get_tx_size_context(xd);
-    const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
-                                         : intra_tx_size_cat_lookup[bsize];
-    const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
-    const int depth = tx_size_to_depth(coded_tx_size);
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
+    const int depth = tx_size_to_depth(tx_size, bsize);
+    const int max_depths = bsize_to_max_depth(bsize);
+    const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+
+    assert(depth >= 0 && depth <= max_depths);
+    assert(!is_inter_block(mbmi));
     assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
 
     aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
-                     tx_size_cat + 2);
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size)
-#if CONFIG_NEW_MULTISYMBOL
-      aom_write_symbol(w, tx_size == quarter_txsize_lookup[bsize],
-                       cm->fc->quarter_tx_size_cdf, 2);
-#else
-      aom_write(w, tx_size == quarter_txsize_lookup[bsize],
-                cm->fc->quarter_tx_size_prob);
-#endif
-#endif
+                     max_depths + 1);
   }
 }
 
-#if !CONFIG_NEW_MULTISYMBOL
-static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w,
-                                    FRAME_COUNTS *counts) {
-  int i;
-  const int probwt = cm->num_tg;
-  for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
-    av1_cond_prob_diff_update(w, &cm->fc->newmv_prob[i], counts->newmv_mode[i],
-                              probwt);
-  for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
-    av1_cond_prob_diff_update(w, &cm->fc->zeromv_prob[i],
-                              counts->zeromv_mode[i], probwt);
-  for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
-    av1_cond_prob_diff_update(w, &cm->fc->refmv_prob[i], counts->refmv_mode[i],
-                              probwt);
-  for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i],
-                              probwt);
-}
-#endif
-
 static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                      int segment_id, const MODE_INFO *mi, aom_writer *w) {
+                      int segment_id, const MB_MODE_INFO *mi, aom_writer *w) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
-    const int skip = mi->mbmi.skip;
-#if CONFIG_NEW_MULTISYMBOL
-    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+    const int skip = mi->skip;
     const int ctx = av1_get_skip_context(xd);
+    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
     aom_write_symbol(w, skip, ec_ctx->skip_cdfs[ctx], 2);
-#else
-    aom_write(w, skip, av1_get_skip_prob(cm, xd));
-#endif
     return skip;
   }
 }
 
+static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                           int segment_id, const MB_MODE_INFO *mi,
+                           aom_writer *w) {
+  if (!cm->skip_mode_flag) return 0;
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+    return 0;
+  }
+  const int skip_mode = mi->skip_mode;
+  if (!is_comp_ref_allowed(mi->sb_type)) {
+    assert(!skip_mode);
+    return 0;
+  }
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
+      segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+    // These features imply single-reference mode, while skip mode implies
+    // compound reference. Hence, the two are mutually exclusive.
+    // In other words, skip_mode is implicitly 0 here.
+    assert(!skip_mode);
+    return 0;
+  }
+  const int ctx = av1_get_skip_mode_context(xd);
+  aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2);
+  return skip_mode;
+}
+
 static void write_is_inter(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                            int segment_id, aom_writer *w, const int is_inter) {
   if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
-#if CONFIG_NEW_MULTISYMBOL
-    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+    if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+      assert(is_inter);
+      return;
+    }
     const int ctx = av1_get_intra_inter_context(xd);
+    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
     aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2);
-#else
-    aom_write(w, is_inter, av1_get_intra_inter_prob(cm, xd));
-#endif
   }
 }
 
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 static void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                              const MODE_INFO *mi, aom_writer *w) {
-  const MB_MODE_INFO *mbmi = &mi->mbmi;
-
-#if !CONFIG_GLOBAL_MOTION
-  // The cm parameter is only used with global_motion or with
-  // motion_var and warped_motion. In other cases, explicitly ignore
-  // it to avoid a compiler warning.
-  (void)cm;
-#endif
-  MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
-#if CONFIG_GLOBAL_MOTION
-      0, cm->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-      xd,
-#endif
-      mi);
-  if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return;
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  if (last_motion_mode_allowed == NCOBMC_ADAPT_WEIGHT) {
-    aom_write_symbol(w, mbmi->motion_mode,
-                     xd->tile_ctx->ncobmc_cdf[mbmi->sb_type],
-                     OBMC_FAMILY_MODES);
-  } else if (last_motion_mode_allowed == OBMC_CAUSAL) {
-    aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,
-                     xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2);
-  } else {
-#else
-  if (last_motion_mode_allowed == OBMC_CAUSAL) {
-#if CONFIG_NEW_MULTISYMBOL
-    aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,
-                     xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2);
-#else
-    aom_write(w, mbmi->motion_mode == OBMC_CAUSAL,
-              cm->fc->obmc_prob[mbmi->sb_type]);
-#endif
-  } else {
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-    aom_write_symbol(w, mbmi->motion_mode,
-                     xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
-                     MOTION_MODES);
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-  }
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-}
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-static void write_ncobmc_mode(MACROBLOCKD *xd, const MODE_INFO *mi,
-                              aom_writer *w) {
-  const MB_MODE_INFO *mbmi = &mi->mbmi;
-  ADAPT_OVERLAP_BLOCK ao_block = adapt_overlap_block_lookup[mbmi->sb_type];
-  if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT) return;
-
-  aom_write_symbol(w, mbmi->ncobmc_mode[0],
-                   xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES);
-  if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) {
-    aom_write_symbol(w, mbmi->ncobmc_mode[1],
-                     xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES);
+                              const MB_MODE_INFO *mbmi, aom_writer *w) {
+  MOTION_MODE last_motion_mode_allowed =
+      cm->switchable_motion_mode
+          ? motion_mode_allowed(cm->global_motion, xd, mbmi,
+                                cm->allow_warped_motion)
+          : SIMPLE_TRANSLATION;
+  assert(mbmi->motion_mode <= last_motion_mode_allowed);
+  switch (last_motion_mode_allowed) {
+    case SIMPLE_TRANSLATION: break;
+    case OBMC_CAUSAL:
+      aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,
+                       xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2);
+      break;
+    default:
+      aom_write_symbol(w, mbmi->motion_mode,
+                       xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
+                       MOTION_MODES);
   }
 }
-#endif
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
-static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                               int delta_qindex, aom_writer *w) {
+static void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex,
+                               aom_writer *w) {
   int sign = delta_qindex < 0;
   int abs = sign ? -delta_qindex : delta_qindex;
   int rem_bits, thr;
   int smallval = abs < DELTA_Q_SMALL ? 1 : 0;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  (void)cm;
 
   aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf,
                    DELTA_Q_PROBS + 1);
@@ -515,32 +307,23 @@ static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   }
 }
 
-#if CONFIG_EXT_DELTA_Q
 static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-#if CONFIG_LOOPFILTER_LEVEL
-                                int lf_id,
-#endif
-                                int delta_lflevel, aom_writer *w) {
+                                int lf_id, int delta_lflevel, aom_writer *w) {
   int sign = delta_lflevel < 0;
   int abs = sign ? -delta_lflevel : delta_lflevel;
   int rem_bits, thr;
   int smallval = abs < DELTA_LF_SMALL ? 1 : 0;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  (void)cm;
 
-#if CONFIG_LOOPFILTER_LEVEL
   if (cm->delta_lf_multi) {
-    assert(lf_id >= 0 && lf_id < FRAME_LF_COUNT);
+    assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
+                                                         : FRAME_LF_COUNT - 2));
     aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL),
                      ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1);
   } else {
     aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf,
                      DELTA_LF_PROBS + 1);
   }
-#else
-  aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf,
-                   DELTA_LF_PROBS + 1);
-#endif  // CONFIG_LOOPFILTER_LEVEL
 
   if (!smallval) {
     rem_bits = OD_ILOG_NZ(abs - 1) - 1;
@@ -552,22 +335,7 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
     aom_write_bit(w, sign);
   }
 }
-#endif  // CONFIG_EXT_DELTA_Q
-
-#if !CONFIG_NEW_MULTISYMBOL
-static void update_skip_probs(AV1_COMMON *cm, aom_writer *w,
-                              FRAME_COUNTS *counts) {
-  int k;
-  const int probwt = cm->num_tg;
-  for (k = 0; k < SKIP_CONTEXTS; ++k) {
-    av1_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k],
-                              probwt);
-  }
-}
-#endif
 
-// TODO(anybody) : remove this flag when PVQ supports pallete coding tool
-#if !CONFIG_PVQ
 static void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,
                             int num) {
   const TOKENEXTRA *p = *tp;
@@ -580,423 +348,142 @@ static void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,
   }
   *tp = p;
 }
-#endif  // !CONFIG_PVQ
 
-#if !CONFIG_PVQ
-#if CONFIG_SUPERTX
-static void update_supertx_probs(AV1_COMMON *cm, int probwt, aom_writer *w) {
-  const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
-                             av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
-  int i, j;
-  int savings = 0;
-  int do_update = 0;
-  for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
-    for (j = TX_8X8; j < TX_SIZES; ++j) {
-      savings += av1_cond_prob_diff_update_savings(
-          &cm->fc->supertx_prob[i][j], cm->counts.supertx[i][j], probwt);
-    }
-  }
-  do_update = savings > savings_thresh;
-  aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
-  if (do_update) {
-    for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
-      for (j = TX_8X8; j < TX_SIZES; ++j) {
-        av1_cond_prob_diff_update(w, &cm->fc->supertx_prob[i][j],
-                                  cm->counts.supertx[i][j], probwt);
-      }
-    }
-  }
-}
-#endif  // CONFIG_SUPERTX
-
-#if !CONFIG_LV_MAP
-#if CONFIG_NEW_MULTISYMBOL
-static INLINE void write_coeff_extra(const aom_cdf_prob *const *cdf, int val,
-                                     int n, aom_writer *w) {
-  // Code the extra bits from LSB to MSB in groups of 4
-  int i = 0;
-  int count = 0;
-  while (count < n) {
-    const int size = AOMMIN(n - count, 4);
-    const int mask = (1 << size) - 1;
-    aom_write_cdf(w, val & mask, cdf[i++], 1 << size);
-    val >>= size;
-    count += size;
-  }
-}
-#else
-static INLINE void write_coeff_extra(const aom_prob *pb, int value,
-                                     int num_bits, int skip_bits, aom_writer *w,
-                                     TOKEN_STATS *token_stats) {
-  // Code the extra bits from MSB to LSB 1 bit at a time
-  int index;
-  for (index = skip_bits; index < num_bits; ++index) {
-    const int shift = num_bits - index - 1;
-    const int bb = (value >> shift) & 1;
-    aom_write_record(w, bb, pb[index], token_stats);
-  }
-}
-#endif  // CONFIG_NEW_MULTISYMBOL
-
-static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
-                           const TOKENEXTRA *const stop,
-                           aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                           TX_TYPE tx_type, int is_inter,
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                           TOKEN_STATS *token_stats) {
-  const TOKENEXTRA *p = *tp;
-#if CONFIG_VAR_TX
-  int count = 0;
-  const int seg_eob = tx_size_2d[tx_size];
-#endif
-
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-  if (tx_type == MRC_DCT && ((is_inter && SIGNAL_MRC_MASK_INTER) ||
-                             (!is_inter && SIGNAL_MRC_MASK_INTRA))) {
-    int rows = tx_size_high[tx_size];
-    int cols = tx_size_wide[tx_size];
-    assert(tx_size == TX_32X32);
-    assert(p < stop);
-    pack_map_tokens(w, &p, 2, rows * cols);
-  }
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-
-  while (p < stop && p->token != EOSB_TOKEN) {
-    const int token = p->token;
-    const int eob_val = p->eob_val;
-    if (token == BLOCK_Z_TOKEN) {
-      aom_write_symbol(w, 0, *p->head_cdf, HEAD_TOKENS + 1);
-      p++;
-#if CONFIG_VAR_TX
-      break;
-#endif
-      continue;
-    }
-
-    const av1_extra_bit *const extra_bits = &av1_extra_bits[token];
-    if (eob_val == LAST_EOB) {
-      // Just code a flag indicating whether the value is >1 or 1.
-      aom_write_bit(w, token != ONE_TOKEN);
-    } else {
-      int comb_symb = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + p->first_val;
-      aom_write_symbol(w, comb_symb, *p->head_cdf, HEAD_TOKENS + p->first_val);
-    }
-    if (token > ONE_TOKEN) {
-      aom_write_symbol(w, token - TWO_TOKEN, *p->tail_cdf, TAIL_TOKENS);
-    }
-
-    if (extra_bits->base_val) {
-      const int bit_string = p->extra;
-      const int bit_string_length = extra_bits->len;  // Length of extra bits to
-      const int is_cat6 = (extra_bits->base_val == CAT6_MIN_VAL);
-      // be written excluding
-      // the sign bit.
-      int skip_bits = is_cat6
-                          ? (int)sizeof(av1_cat6_prob) -
-                                av1_get_cat6_extrabits_size(tx_size, bit_depth)
-                          : 0;
-
-      assert(!(bit_string >> (bit_string_length - skip_bits + 1)));
-      if (bit_string_length > 0)
-#if CONFIG_NEW_MULTISYMBOL
-        write_coeff_extra(extra_bits->cdf, bit_string >> 1,
-                          bit_string_length - skip_bits, w);
-#else
-        write_coeff_extra(extra_bits->prob, bit_string >> 1, bit_string_length,
-                          skip_bits, w, token_stats);
-#endif
-
-      aom_write_bit_record(w, bit_string & 1, token_stats);
-    }
-    ++p;
-
-#if CONFIG_VAR_TX
-    ++count;
-    if (eob_val == EARLY_EOB || count == seg_eob) break;
-#endif
-  }
-
-  *tp = p;
-}
-#endif  // !CONFIG_LV_MAP
-#else   // !CONFIG_PVQ
-static PVQ_INFO *get_pvq_block(PVQ_QUEUE *pvq_q) {
-  PVQ_INFO *pvq;
-
-  assert(pvq_q->curr_pos <= pvq_q->last_pos);
-  assert(pvq_q->curr_pos < pvq_q->buf_len);
-
-  pvq = pvq_q->buf + pvq_q->curr_pos;
-  ++pvq_q->curr_pos;
-
-  return pvq;
-}
-
-static void pack_pvq_tokens(aom_writer *w, MACROBLOCK *const x,
-                            MACROBLOCKD *const xd, int plane, BLOCK_SIZE bsize,
-                            const TX_SIZE tx_size) {
-  PVQ_INFO *pvq;
-  int idx, idy;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  od_adapt_ctx *adapt;
-  int max_blocks_wide;
-  int max_blocks_high;
-  int step = (1 << tx_size);
-
-#if CONFIG_CHROMA_SUB8X8
-  const BLOCK_SIZE plane_bsize =
-      AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#elif CONFIG_CB4X4
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#else
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
-#endif
-
-  adapt = x->daala_enc.state.adapt;
-
-  max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-  max_blocks_high = max_block_high(xd, plane_bsize, plane);
-
-  for (idy = 0; idy < max_blocks_high; idy += step) {
-    for (idx = 0; idx < max_blocks_wide; idx += step) {
-      const int is_keyframe = 0;
-      const int encode_flip = 0;
-      const int flip = 0;
-      int i;
-      const int has_dc_skip = 1;
-      int *exg = &adapt->pvq.pvq_exg[plane][tx_size][0];
-      int *ext = adapt->pvq.pvq_ext + tx_size * PVQ_MAX_PARTITIONS;
-      generic_encoder *model = adapt->pvq.pvq_param_model;
-
-      pvq = get_pvq_block(x->pvq_q);
-
-      // encode block skip info
-      aom_write_symbol(w, pvq->ac_dc_coded,
-                       adapt->skip_cdf[2 * tx_size + (plane != 0)], 4);
-
-      // AC coeffs coded?
-      if (pvq->ac_dc_coded & AC_CODED) {
-        assert(pvq->bs == tx_size);
-        for (i = 0; i < pvq->nb_bands; i++) {
-          if (i == 0 ||
-              (!pvq->skip_rest && !(pvq->skip_dir & (1 << ((i - 1) % 3))))) {
-            pvq_encode_partition(
-                w, pvq->qg[i], pvq->theta[i], pvq->y + pvq->off[i],
-                pvq->size[i], pvq->k[i], model, adapt, exg + i, ext + i,
-                (plane != 0) * OD_TXSIZES * PVQ_MAX_PARTITIONS +
-                    pvq->bs * PVQ_MAX_PARTITIONS + i,
-                is_keyframe, i == 0 && (i < pvq->nb_bands - 1), pvq->skip_rest,
-                encode_flip, flip);
-          }
-          if (i == 0 && !pvq->skip_rest && pvq->bs > 0) {
-            aom_write_symbol(
-                w, pvq->skip_dir,
-                &adapt->pvq
-                     .pvq_skip_dir_cdf[(plane != 0) + 2 * (pvq->bs - 1)][0],
-                7);
-          }
-        }
-      }
-      // Encode residue of DC coeff, if exist.
-      if (!has_dc_skip || (pvq->ac_dc_coded & DC_CODED)) {
-        generic_encode(w, &adapt->model_dc[plane],
-                       abs(pvq->dq_dc_residue) - has_dc_skip,
-                       &adapt->ex_dc[plane][pvq->bs][0], 2);
-      }
-      if ((pvq->ac_dc_coded & DC_CODED)) {
-        aom_write_bit(w, pvq->dq_dc_residue < 0);
-      }
-    }
-  }  // for (idy = 0;
-}
-#endif  // !CONFIG_PVG
-
-#if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE
-#if CONFIG_LV_MAP
-static void pack_txb_tokens(aom_writer *w,
-#if CONFIG_LV_MAP
-                            AV1_COMMON *cm,
-#endif  // CONFIG_LV_MAP
+static void pack_txb_tokens(aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x,
                             const TOKENEXTRA **tp,
-                            const TOKENEXTRA *const tok_end,
-#if CONFIG_PVQ || CONFIG_LV_MAP
-                            MACROBLOCK *const x,
-#endif
-                            MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane,
+                            const TOKENEXTRA *const tok_end, MACROBLOCKD *xd,
+                            MB_MODE_INFO *mbmi, int plane,
                             BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth,
                             int block, int blk_row, int blk_col,
                             TX_SIZE tx_size, TOKEN_STATS *token_stats) {
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  const int tx_row = blk_row >> (1 - pd->subsampling_y);
-  const int tx_col = blk_col >> (1 - pd->subsampling_x);
-  TX_SIZE plane_tx_size;
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
-  plane_tx_size =
-      plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
-            : mbmi->inter_tx_size[tx_row][tx_col];
-
-  if (tx_size == plane_tx_size) {
-    TOKEN_STATS tmp_token_stats;
-    init_token_stats(&tmp_token_stats);
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
 
-#if !CONFIG_PVQ
+  if (tx_size == plane_tx_size || plane) {
     tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
-    uint16_t eob = x->mbmi_ext->eobs[plane][block];
+    const uint16_t eob = x->mbmi_ext->eobs[plane][block];
     TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
                         x->mbmi_ext->dc_sign_ctx[plane][block] };
-    av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, block, plane, tx_size,
-                         tcoeff, eob, &txb_ctx);
-#else
-    pack_pvq_tokens(w, x, xd, plane, bsize, tx_size);
-#endif
+    av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff,
+                         eob, &txb_ctx);
 #if CONFIG_RD_DEBUG
-    token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost;
-    token_stats->cost += tmp_token_stats.cost;
-#endif
-  } else {
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsl = tx_size_wide_unit[sub_txs];
-    int i;
-
-    assert(bsl > 0);
-
-    for (i = 0; i < 4; ++i) {
-      const int offsetr = blk_row + (i >> 1) * bsl;
-      const int offsetc = blk_col + (i & 0x01) * bsl;
-      const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
-
-      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-
-      pack_txb_tokens(w,
-#if CONFIG_LV_MAP
-                      cm,
-#endif
-                      tp, tok_end,
-#if CONFIG_PVQ || CONFIG_LV_MAP
-                      x,
-#endif
-                      xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr,
-                      offsetc, sub_txs, token_stats);
-      block += step;
-    }
-  }
-}
-#else  // CONFIG_LV_MAP
-static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp,
-                            const TOKENEXTRA *const tok_end,
-#if CONFIG_PVQ
-                            MACROBLOCK *const x,
-#endif
-                            MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane,
-                            BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth,
-                            int block, int blk_row, int blk_col,
-                            TX_SIZE tx_size, TOKEN_STATS *token_stats) {
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  const int tx_row = blk_row >> (1 - pd->subsampling_y);
-  const int tx_col = blk_col >> (1 - pd->subsampling_x);
-  TX_SIZE plane_tx_size;
-  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-  TX_TYPE tx_type = av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y, xd,
-                                    blk_row, blk_col, block, tx_size);
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-
-  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-
-  plane_tx_size =
-      plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
-            : mbmi->inter_tx_size[tx_row][tx_col];
-
-  if (tx_size == plane_tx_size) {
     TOKEN_STATS tmp_token_stats;
     init_token_stats(&tmp_token_stats);
-#if !CONFIG_PVQ
-    pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size,
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                   tx_type, is_inter_block(mbmi),
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                   &tmp_token_stats);
-#else
-    pack_pvq_tokens(w, x, xd, plane, bsize, tx_size);
-#endif
-#if CONFIG_RD_DEBUG
     token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost;
     token_stats->cost += tmp_token_stats.cost;
 #endif
   } else {
-#if CONFIG_RECT_TX_EXT
-    int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize];
-    const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size];
-#else
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-#endif
-    const int bsl = tx_size_wide_unit[sub_txs];
-    int i;
-
-    assert(bsl > 0);
-
-    for (i = 0; i < 4; ++i) {
-#if CONFIG_RECT_TX_EXT
-      int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs];
-      const int offsetr =
-          is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0)
-                  : blk_row + (i >> 1) * bsl;
-      const int offsetc =
-          is_qttx ? (is_wide_tx ? 0 : i * tx_size_wide_unit[sub_txs])
-                  : blk_col + (i & 0x01) * bsl;
-#else
-      const int offsetr = blk_row + (i >> 1) * bsl;
-      const int offsetc = blk_col + (i & 0x01) * bsl;
-#endif
-      const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
-
-      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-
-      pack_txb_tokens(w, tp, tok_end,
-#if CONFIG_PVQ
-                      x,
-#endif
-                      xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr,
-                      offsetc, sub_txs, token_stats);
-      block += step;
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int step = bsh * bsw;
+
+    assert(bsw > 0 && bsh > 0);
+
+    for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
+      for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw) {
+        const int offsetr = blk_row + r;
+        const int offsetc = blk_col + c;
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+        pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize,
+                        bit_depth, block, offsetr, offsetc, sub_txs,
+                        token_stats);
+        block += step;
+      }
+    }
+  }
+}
+
+static INLINE void set_spatial_segment_id(const AV1_COMMON *const cm,
+                                          uint8_t *segment_ids,
+                                          BLOCK_SIZE bsize, int mi_row,
+                                          int mi_col, int segment_id) {
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
+  const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
+  int x, y;
+
+  for (y = 0; y < ymis; ++y)
+    for (x = 0; x < xmis; ++x)
+      segment_ids[mi_offset + y * cm->mi_cols + x] = segment_id;
+}
+
+int av1_neg_interleave(int x, int ref, int max) {
+  assert(x < max);
+  const int diff = x - ref;
+  if (!ref) return x;
+  if (ref >= (max - 1)) return -x + max - 1;
+  if (2 * ref < max) {
+    if (abs(diff) <= ref) {
+      if (diff > 0)
+        return (diff << 1) - 1;
+      else
+        return ((-diff) << 1);
+    }
+    return x;
+  } else {
+    if (abs(diff) < (max - ref)) {
+      if (diff > 0)
+        return (diff << 1) - 1;
+      else
+        return ((-diff) << 1);
     }
+    return (max - x) - 1;
   }
 }
-#endif  // CONFIG_LV_MAP
-#endif  // CONFIG_VAR_TX
 
-static void write_segment_id(aom_writer *w, const struct segmentation *seg,
-                             struct segmentation_probs *segp, int segment_id) {
-  if (seg->enabled && seg->update_map) {
-    aom_write_symbol(w, segment_id, segp->tree_cdf, MAX_SEGMENTS);
+static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi,
+                             aom_writer *w, const struct segmentation *seg,
+                             struct segmentation_probs *segp, int mi_row,
+                             int mi_col, int skip) {
+  if (!seg->enabled || !seg->update_map) return;
+
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  int cdf_num;
+  const int pred = av1_get_spatial_seg_pred(cm, xd, mi_row, mi_col, &cdf_num);
+
+  if (skip) {
+    // Still need to transmit tx size for intra blocks even if skip is
+    // true. Changing segment_id may make the tx size become invalid, e.g
+    // changing from lossless to lossy.
+    assert(is_inter_block(mbmi) || !cpi->has_lossless_segment);
+
+    set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row,
+                           mi_col, pred);
+    set_spatial_segment_id(cm, cpi->segmentation_map, mbmi->sb_type, mi_row,
+                           mi_col, pred);
+    /* mbmi is read only but we need to update segment_id */
+    ((MB_MODE_INFO *)mbmi)->segment_id = pred;
+    return;
   }
+
+  const int coded_id =
+      av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1);
+  aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
+  aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS);
+  set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row,
+                         mi_col, mbmi->segment_id);
 }
 
-#if CONFIG_NEW_MULTISYMBOL
 #define WRITE_REF_BIT(bname, pname) \
-  aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(cm, xd), 2)
-#define WRITE_REF_BIT2(bname, pname) \
   aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2)
-#else
-#define WRITE_REF_BIT(bname, pname) \
-  aom_write(w, bname, av1_get_pred_prob_##pname(cm, xd))
-#define WRITE_REF_BIT2(bname, pname) \
-  aom_write(w, bname, av1_get_pred_prob_##pname(cm, xd))
-#endif
 
 // This function encodes the reference frame
 static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                              aom_writer *w) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   const int is_compound = has_second_ref(mbmi);
   const int segment_id = mbmi->segment_id;
 
@@ -1006,75 +493,40 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
     assert(!is_compound);
     assert(mbmi->ref_frame[0] ==
            get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+  } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
+             segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+    assert(!is_compound);
+    assert(mbmi->ref_frame[0] == LAST_FRAME);
   } else {
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
       if (is_comp_ref_allowed(mbmi->sb_type))
-#if CONFIG_NEW_MULTISYMBOL
-        aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(cm, xd), 2);
-#else
-        aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd));
-#endif  // CONFIG_NEW_MULTISYMBOL
+        aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2);
     } else {
       assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE));
     }
 
     if (is_compound) {
-#if CONFIG_EXT_COMP_REFS
       const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
                                                     ? UNIDIR_COMP_REFERENCE
                                                     : BIDIR_COMP_REFERENCE;
-#if USE_UNI_COMP_REFS
-#if CONFIG_VAR_REFS
-      if ((L_OR_L2(cm) || L3_OR_G(cm)) && BWD_OR_ALT(cm))
-        if (L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm) || BWD_AND_ALT(cm))
-#endif  // CONFIG_VAR_REFS
-#if CONFIG_NEW_MULTISYMBOL
-          aom_write_symbol(w, comp_ref_type,
-                           av1_get_comp_reference_type_cdf(xd), 2);
-#else
-      aom_write(w, comp_ref_type, av1_get_comp_reference_type_prob(cm, xd));
-#endif
-#if CONFIG_VAR_REFS
-        else
-          assert(comp_ref_type == BIDIR_COMP_REFERENCE);
-      else
-        assert(comp_ref_type == UNIDIR_COMP_REFERENCE);
-#endif  // CONFIG_VAR_REFS
-#else   // !USE_UNI_COMP_REFS
-      // NOTE: uni-directional comp refs disabled
-      assert(comp_ref_type == BIDIR_COMP_REFERENCE);
-#endif  // USE_UNI_COMP_REFS
+      aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd),
+                       2);
 
       if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
         const int bit = mbmi->ref_frame[0] == BWDREF_FRAME;
-#if CONFIG_VAR_REFS
-        if ((L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm)) && BWD_AND_ALT(cm))
-#endif  // CONFIG_VAR_REFS
-          WRITE_REF_BIT2(bit, uni_comp_ref_p);
+        WRITE_REF_BIT(bit, uni_comp_ref_p);
 
         if (!bit) {
           assert(mbmi->ref_frame[0] == LAST_FRAME);
-#if CONFIG_VAR_REFS
-          if (L_AND_L2(cm) && (L_AND_L3(cm) || L_AND_G(cm))) {
-#endif  // CONFIG_VAR_REFS
-            const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME ||
-                             mbmi->ref_frame[1] == GOLDEN_FRAME;
-            WRITE_REF_BIT2(bit1, uni_comp_ref_p1);
-            if (bit1) {
-#if CONFIG_VAR_REFS
-              if (L_AND_L3(cm) && L_AND_G(cm)) {
-#endif  // CONFIG_VAR_REFS
-                const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME;
-                WRITE_REF_BIT2(bit2, uni_comp_ref_p2);
-#if CONFIG_VAR_REFS
-              }
-#endif  // CONFIG_VAR_REFS
-            }
-#if CONFIG_VAR_REFS
+          const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME ||
+                           mbmi->ref_frame[1] == GOLDEN_FRAME;
+          WRITE_REF_BIT(bit1, uni_comp_ref_p1);
+          if (bit1) {
+            const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME;
+            WRITE_REF_BIT(bit2, uni_comp_ref_p2);
           }
-#endif  // CONFIG_VAR_REFS
         } else {
           assert(mbmi->ref_frame[1] == ALTREF_FRAME);
         }
@@ -1083,213 +535,81 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
       }
 
       assert(comp_ref_type == BIDIR_COMP_REFERENCE);
-#endif  // CONFIG_EXT_COMP_REFS
 
-#if CONFIG_EXT_REFS
       const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
                        mbmi->ref_frame[0] == LAST3_FRAME);
-#if CONFIG_VAR_REFS
-      // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree
-      if (L_OR_L2(cm) && L3_OR_G(cm))
-#endif  // CONFIG_VAR_REFS
-        WRITE_REF_BIT(bit, comp_ref_p);
+      WRITE_REF_BIT(bit, comp_ref_p);
 
       if (!bit) {
-#if CONFIG_VAR_REFS
-        // Test need to explicitly code (L) vs (L2) branch node in tree
-        if (L_AND_L2(cm)) {
-#endif  // CONFIG_VAR_REFS
-          const int bit1 = mbmi->ref_frame[0] == LAST_FRAME;
-          WRITE_REF_BIT(bit1, comp_ref_p1);
-#if CONFIG_VAR_REFS
-        }
-#endif  // CONFIG_VAR_REFS
+        const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME;
+        WRITE_REF_BIT(bit1, comp_ref_p1);
       } else {
-#if CONFIG_VAR_REFS
-        // Test need to explicitly code (L3) vs (G) branch node in tree
-        if (L3_AND_G(cm)) {
-#endif  // CONFIG_VAR_REFS
-          const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
-          WRITE_REF_BIT(bit2, comp_ref_p2);
-#if CONFIG_VAR_REFS
-        }
-#endif  // CONFIG_VAR_REFS
+        const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
+        WRITE_REF_BIT(bit2, comp_ref_p2);
       }
 
-#if CONFIG_VAR_REFS
-      // Test need to explicitly code (BWD,ALT2) vs (ALT) branch node in tree
-      if (BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm)) {
-#endif  // CONFIG_VAR_REFS
-        const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
-        WRITE_REF_BIT(bit_bwd, comp_bwdref_p);
-
-        if (!bit_bwd) {
-#if CONFIG_VAR_REFS
-          // Test need to explicitly code (BWD,ALT2) vs (ALT) branch node in
-          // tree
-          if (BWD_AND_ALT2(cm))
-#endif  // CONFIG_VAR_REFS
-            WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1);
-        }
-#if CONFIG_VAR_REFS
+      const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
+      WRITE_REF_BIT(bit_bwd, comp_bwdref_p);
+
+      if (!bit_bwd) {
+        WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1);
       }
-#endif  // CONFIG_VAR_REFS
 
-#else   // !CONFIG_EXT_REFS
-      const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME;
-      WRITE_REF_BIT(bit, comp_ref_p);
-#endif  // CONFIG_EXT_REFS
     } else {
-#if CONFIG_EXT_REFS
       const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME &&
                         mbmi->ref_frame[0] >= BWDREF_FRAME);
-#if CONFIG_VAR_REFS
-      // Test need to explicitly code (L,L2,L3,G) vs (BWD,ALT2,ALT) branch node
-      // in tree
-      if ((L_OR_L2(cm) || L3_OR_G(cm)) &&
-          (BWD_OR_ALT2(cm) || ALTREF_IS_VALID(cm)))
-#endif  // CONFIG_VAR_REFS
-        WRITE_REF_BIT(bit0, single_ref_p1);
+      WRITE_REF_BIT(bit0, single_ref_p1);
 
       if (bit0) {
-#if CONFIG_VAR_REFS
-        // Test need to explicitly code (BWD,ALT2) vs (ALT) branch node in tree
-        if (BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm)) {
-#endif  // CONFIG_VAR_REFS
-          const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
-          WRITE_REF_BIT(bit1, single_ref_p2);
-
-          if (!bit1) {
-#if CONFIG_VAR_REFS
-            // Test need to explicitly code (BWD) vs (ALT2) branch node in tree
-            if (BWD_AND_ALT2(cm))
-#endif  // CONFIG_VAR_REFS
-              WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6);
-          }
-#if CONFIG_VAR_REFS
+        const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
+        WRITE_REF_BIT(bit1, single_ref_p2);
+
+        if (!bit1) {
+          WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6);
         }
-#endif  // CONFIG_VAR_REFS
       } else {
         const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
                           mbmi->ref_frame[0] == GOLDEN_FRAME);
-#if CONFIG_VAR_REFS
-        // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree
-        if (L_OR_L2(cm) && L3_OR_G(cm))
-#endif  // CONFIG_VAR_REFS
-          WRITE_REF_BIT(bit2, single_ref_p3);
+        WRITE_REF_BIT(bit2, single_ref_p3);
 
         if (!bit2) {
-#if CONFIG_VAR_REFS
-          // Test need to explicitly code (L) vs (L2) branch node in tree
-          if (L_AND_L2(cm)) {
-#endif  // CONFIG_VAR_REFS
-            const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
-            WRITE_REF_BIT(bit3, single_ref_p4);
-#if CONFIG_VAR_REFS
-          }
-#endif  // CONFIG_VAR_REFS
+          const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
+          WRITE_REF_BIT(bit3, single_ref_p4);
         } else {
-#if CONFIG_VAR_REFS
-          // Test need to explicitly code (L3) vs (G) branch node in tree
-          if (L3_AND_G(cm)) {
-#endif  // CONFIG_VAR_REFS
-            const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;
-            WRITE_REF_BIT(bit4, single_ref_p5);
-#if CONFIG_VAR_REFS
-          }
-#endif  // CONFIG_VAR_REFS
+          const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;
+          WRITE_REF_BIT(bit4, single_ref_p5);
         }
       }
-#else   // !CONFIG_EXT_REFS
-      const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
-      WRITE_REF_BIT(bit0, single_ref_p1);
-
-      if (bit0) {
-        const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
-        WRITE_REF_BIT(bit1, single_ref_p2);
-      }
-#endif  // CONFIG_EXT_REFS
     }
   }
 }
 
-#if CONFIG_FILTER_INTRA
-static void write_filter_intra_mode_info(const AV1_COMMON *const cm,
+static void write_filter_intra_mode_info(const AV1_COMMON *cm,
                                          const MACROBLOCKD *xd,
                                          const MB_MODE_INFO *const mbmi,
-                                         int mi_row, int mi_col,
                                          aom_writer *w) {
-  if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0) {
-    aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[0],
-              cm->fc->filter_intra_probs[0]);
-    if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
-      const FILTER_INTRA_MODE mode =
-          mbmi->filter_intra_mode_info.filter_intra_mode[0];
-      write_uniform(w, FILTER_INTRA_MODES, mode);
-    }
-  }
-
-#if CONFIG_CB4X4
-  if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type,
-                           xd->plane[1].subsampling_x,
-                           xd->plane[1].subsampling_y))
-    return;
-#else
-  (void)xd;
-  (void)mi_row;
-  (void)mi_col;
-#endif  // CONFIG_CB4X4
-
-  if (mbmi->uv_mode == UV_DC_PRED &&
-      mbmi->palette_mode_info.palette_size[1] == 0) {
-    aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[1],
-              cm->fc->filter_intra_probs[1]);
-    if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) {
+  if (av1_filter_intra_allowed(cm, mbmi)) {
+    aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra,
+                     xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2);
+    if (mbmi->filter_intra_mode_info.use_filter_intra) {
       const FILTER_INTRA_MODE mode =
-          mbmi->filter_intra_mode_info.filter_intra_mode[1];
-      write_uniform(w, FILTER_INTRA_MODES, mode);
+          mbmi->filter_intra_mode_info.filter_intra_mode;
+      aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf,
+                       FILTER_INTRA_MODES);
     }
   }
 }
-#endif  // CONFIG_FILTER_INTRA
 
-#if CONFIG_EXT_INTRA
-static void write_intra_angle_info(const MACROBLOCKD *xd,
-                                   FRAME_CONTEXT *const ec_ctx, aom_writer *w) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-#if CONFIG_INTRA_INTERP
-  const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
-  int p_angle;
-#endif  // CONFIG_INTRA_INTERP
-
-  (void)ec_ctx;
-  if (!av1_use_angle_delta(bsize)) return;
-
-  if (av1_is_directional_mode(mbmi->mode, bsize)) {
-    write_uniform(w, 2 * MAX_ANGLE_DELTA + 1,
-                  MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
-#if CONFIG_INTRA_INTERP
-    p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
-    if (av1_is_intra_filter_switchable(p_angle)) {
-      aom_write_symbol(w, mbmi->intra_filter,
-                       ec_ctx->intra_filter_cdf[intra_filter_ctx],
-                       INTRA_FILTERS);
-    }
-#endif  // CONFIG_INTRA_INTERP
-  }
-
-  if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize)) {
-    write_uniform(w, 2 * MAX_ANGLE_DELTA + 1,
-                  MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
-  }
+static void write_angle_delta(aom_writer *w, int angle_delta,
+                              aom_cdf_prob *cdf) {
+  aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf,
+                   2 * MAX_ANGLE_DELTA + 1);
 }
-#endif  // CONFIG_EXT_INTRA
 
 static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,
                                    aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
   if (!av1_is_interp_needed(xd)) {
@@ -1299,36 +619,19 @@ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,
     return;
   }
   if (cm->interp_filter == SWITCHABLE) {
-#if CONFIG_DUAL_FILTER
     int dir;
     for (dir = 0; dir < 2; ++dir) {
-      if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
-          (mbmi->ref_frame[1] > INTRA_FRAME &&
-           has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
-        const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-        InterpFilter filter =
-            av1_extract_interp_filter(mbmi->interp_filters, dir);
-        aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
-                         SWITCHABLE_FILTERS);
-        ++cpi->interp_filter_selected[0][filter];
-      } else {
-        assert(av1_extract_interp_filter(mbmi->interp_filters, dir) ==
-               EIGHTTAP_REGULAR);
-      }
-    }
-#else
-    {
-      const int ctx = av1_get_pred_context_switchable_interp(xd);
-      InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, 0);
+      const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+      InterpFilter filter =
+          av1_extract_interp_filter(mbmi->interp_filters, dir);
       aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
                        SWITCHABLE_FILTERS);
       ++cpi->interp_filter_selected[0][filter];
+      if (cm->seq_params.enable_dual_filter == 0) return;
     }
-#endif  // CONFIG_DUAL_FILTER
   }
 }
 
-#if CONFIG_PALETTE_DELTA_ENCODING
 // Transmit color values with delta encoding. Write the first value as
 // literal, and the deltas between each value and the previous one. "min_val" is
 // the smallest possible value of the deltas.
@@ -1446,207 +749,90 @@ static void write_palette_colors_uv(const MACROBLOCKD *const xd,
     }
   }
 }
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 
 static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                                    const MODE_INFO *const mi, aom_writer *w) {
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const MODE_INFO *const above_mi = xd->above_mi;
-  const MODE_INFO *const left_mi = xd->left_mi;
+                                    const MB_MODE_INFO *const mbmi, int mi_row,
+                                    int mi_col, aom_writer *w) {
+  const int num_planes = av1_num_planes(cm);
   const BLOCK_SIZE bsize = mbmi->sb_type;
+  assert(av1_allow_palette(cm->allow_screen_content_tools, bsize));
   const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-
-  assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_LARGEST);
-  const int block_palette_idx = bsize - BLOCK_8X8;
+  const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
 
   if (mbmi->mode == DC_PRED) {
     const int n = pmi->palette_size[0];
-    int palette_y_mode_ctx = 0;
-    if (above_mi) {
-      palette_y_mode_ctx +=
-          (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-    }
-    if (left_mi) {
-      palette_y_mode_ctx +=
-          (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-    }
-#if CONFIG_NEW_MULTISYMBOL
+    const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd);
     aom_write_symbol(
         w, n > 0,
-        xd->tile_ctx->palette_y_mode_cdf[block_palette_idx][palette_y_mode_ctx],
-        2);
-#else
-    aom_write(
-        w, n > 0,
-        av1_default_palette_y_mode_prob[block_palette_idx][palette_y_mode_ctx]);
-#endif
+        xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2);
     if (n > 0) {
       aom_write_symbol(w, n - PALETTE_MIN_SIZE,
-                       xd->tile_ctx->palette_y_size_cdf[block_palette_idx],
+                       xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
                        PALETTE_SIZES);
-#if CONFIG_PALETTE_DELTA_ENCODING
       write_palette_colors_y(xd, pmi, cm->bit_depth, w);
-#else
-      for (int i = 0; i < n; ++i) {
-        assert(pmi->palette_colors[i] < (1 << cm->bit_depth));
-        aom_write_literal(w, pmi->palette_colors[i], cm->bit_depth);
-      }
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
     }
   }
 
-  if (mbmi->uv_mode == UV_DC_PRED) {
+  const int uv_dc_pred =
+      num_planes > 1 && mbmi->uv_mode == UV_DC_PRED &&
+      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+                          xd->plane[1].subsampling_y);
+  if (uv_dc_pred) {
     const int n = pmi->palette_size[1];
     const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
-#if CONFIG_NEW_MULTISYMBOL
     aom_write_symbol(w, n > 0,
                      xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2);
-#else
-    aom_write(w, n > 0, av1_default_palette_uv_mode_prob[palette_uv_mode_ctx]);
-#endif
     if (n > 0) {
       aom_write_symbol(w, n - PALETTE_MIN_SIZE,
-                       xd->tile_ctx->palette_uv_size_cdf[block_palette_idx],
+                       xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
                        PALETTE_SIZES);
-#if CONFIG_PALETTE_DELTA_ENCODING
       write_palette_colors_uv(xd, pmi, cm->bit_depth, w);
-#else
-      for (int i = 0; i < n; ++i) {
-        assert(pmi->palette_colors[PALETTE_MAX_SIZE + i] <
-               (1 << cm->bit_depth));
-        assert(pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] <
-               (1 << cm->bit_depth));
-        aom_write_literal(w, pmi->palette_colors[PALETTE_MAX_SIZE + i],
-                          cm->bit_depth);
-        aom_write_literal(w, pmi->palette_colors[2 * PALETTE_MAX_SIZE + i],
-                          cm->bit_depth);
-      }
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
     }
   }
 }
 
 void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
-#if CONFIG_SUPERTX
-                       const int supertx_enabled,
-#endif
-#if CONFIG_TXK_SEL
-                       int blk_row, int blk_col, int block, int plane,
-                       TX_SIZE tx_size,
-#endif
+                       int blk_row, int blk_col, int plane, TX_SIZE tx_size,
                        aom_writer *w) {
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   const int is_inter = is_inter_block(mbmi);
-#if !CONFIG_TXK_SEL
-#if CONFIG_VAR_TX
-  const TX_SIZE tx_size = is_inter ? mbmi->min_tx_size : mbmi->tx_size;
-#else
-  const TX_SIZE tx_size = mbmi->tx_size;
-#endif  // CONFIG_VAR_TX
-#endif  // !CONFIG_TXK_SEL
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
-#if !CONFIG_TXK_SEL
-  TX_TYPE tx_type = mbmi->tx_type;
-#else
   // Only y plane's tx_type is transmitted
   if (plane > 0) return;
   PLANE_TYPE plane_type = get_plane_type(plane);
-  TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-#endif
-
-  if (!FIXED_TX_TYPE) {
-#if CONFIG_EXT_TX
-    const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
-    const BLOCK_SIZE bsize = mbmi->sb_type;
-    if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) >
-            1 &&
-        ((!cm->seg.enabled && cm->base_qindex > 0) ||
-         (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
-        !mbmi->skip &&
-#if CONFIG_SUPERTX
-        !supertx_enabled &&
-#endif  // CONFIG_SUPERTX
-        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-#if CONFIG_MRC_TX
-      if (tx_type == MRC_DCT)
-        assert(mbmi->valid_mrc_mask && "Invalid MRC mask");
-#endif  // CONFIG_MRC_TX
-      const TxSetType tx_set_type = get_ext_tx_set_type(
-          tx_size, bsize, is_inter, cm->reduced_tx_set_used);
-      const int eset =
-          get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
-      // eset == 0 should correspond to a set with only DCT_DCT and there
-      // is no need to send the tx_type
-      assert(eset > 0);
-      assert(av1_ext_tx_used[tx_set_type][tx_type]);
-#if !CONFIG_LGT_FROM_PRED
-      if (is_inter) {
-        aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type],
-                         ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
-                         av1_num_ext_tx_set[tx_set_type]);
-      } else if (ALLOW_INTRA_EXT_TX) {
-        aom_write_symbol(
-            w, av1_ext_tx_ind[tx_set_type][tx_type],
-            ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
-            av1_num_ext_tx_set[tx_set_type]);
-      }
-#else
-      // only signal tx_type when lgt is not allowed or not selected
-      if (is_inter) {
-        if (LGT_FROM_PRED_INTER) {
-          if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used)
-            aom_write(w, mbmi->use_lgt, ec_ctx->inter_lgt_prob[square_tx_size]);
-          if (!mbmi->use_lgt)
-            aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type],
-                             ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
-                             av1_num_ext_tx_set[tx_set_type]);
-        } else {
-          aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type],
-                           ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
-                           av1_num_ext_tx_set[tx_set_type]);
-        }
-      } else if (ALLOW_INTRA_EXT_TX) {
-        if (LGT_FROM_PRED_INTRA) {
-          if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used)
-            aom_write(w, mbmi->use_lgt,
-                      ec_ctx->intra_lgt_prob[square_tx_size][mbmi->mode]);
-          if (!mbmi->use_lgt)
-            aom_write_symbol(
-                w, av1_ext_tx_ind[tx_set_type][tx_type],
-                ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
-                av1_num_ext_tx_set[tx_set_type]);
-        } else {
-          aom_write_symbol(
-              w, av1_ext_tx_ind[tx_set_type][tx_type],
-              ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
-              av1_num_ext_tx_set[tx_set_type]);
-        }
-      }
-#endif  // CONFIG_LGT_FROM_PRED
-    }
-#else  // CONFIG_EXT_TX
-    if (tx_size < TX_32X32 &&
-        ((!cm->seg.enabled && cm->base_qindex > 0) ||
-         (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
-        !mbmi->skip &&
-#if CONFIG_SUPERTX
-        !supertx_enabled &&
-#endif  // CONFIG_SUPERTX
-        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-      if (is_inter) {
-        aom_write_symbol(w, av1_ext_tx_ind[tx_type],
-                         ec_ctx->inter_ext_tx_cdf[tx_size], TX_TYPES);
-      } else {
-        aom_write_symbol(
-            w, av1_ext_tx_ind[tx_type],
-            ec_ctx->intra_ext_tx_cdf[tx_size]
-                                    [intra_mode_to_tx_type_context[mbmi->mode]],
-            TX_TYPES);
-      }
+  TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size,
+                                    cm->reduced_tx_set_used);
+
+  const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+  if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
+      ((!cm->seg.enabled && cm->base_qindex > 0) ||
+       (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+      !mbmi->skip &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    const TxSetType tx_set_type =
+        av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+    const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+    // eset == 0 should correspond to a set with only DCT_DCT and there
+    // is no need to send the tx_type
+    assert(eset > 0);
+    assert(av1_ext_tx_used[tx_set_type][tx_type]);
+    if (is_inter) {
+      aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type],
+                       ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+                       av1_num_ext_tx_set[tx_set_type]);
+    } else {
+      PREDICTION_MODE intra_dir;
+      if (mbmi->filter_intra_mode_info.use_filter_intra)
+        intra_dir =
+            fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode];
+      else
+        intra_dir = mbmi->mode;
+      aom_write_symbol(
+          w, av1_ext_tx_ind[tx_set_type][tx_type],
+          ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir],
+          av1_num_ext_tx_set[tx_set_type]);
     }
-#endif  // CONFIG_EXT_TX
   }
 }
 
@@ -1658,14 +844,12 @@ static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
 
 static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx,
                                 UV_PREDICTION_MODE uv_mode,
-                                PREDICTION_MODE y_mode, aom_writer *w) {
-#if !CONFIG_CFL
-  uv_mode = get_uv_mode(uv_mode);
-#endif
-  aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[y_mode], UV_INTRA_MODES);
+                                PREDICTION_MODE y_mode,
+                                CFL_ALLOWED_TYPE cfl_allowed, aom_writer *w) {
+  aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode],
+                   UV_INTRA_MODES - !cfl_allowed);
 }
 
-#if CONFIG_CFL
 static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx,
                              int joint_sign, aom_writer *w) {
   aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS);
@@ -1679,23 +863,85 @@ static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx,
     aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE);
   }
 }
-#endif
+
+static void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, aom_writer *w,
+                       int skip, int mi_col, int mi_row) {
+  if (cm->coded_lossless || cm->allow_intrabc) {
+    // Initialize to indicate no CDEF for safety.
+    cm->cdef_bits = 0;
+    cm->cdef_strengths[0] = 0;
+    cm->nb_cdef_strengths = 1;
+    cm->cdef_uv_strengths[0] = 0;
+    return;
+  }
+
+  const int m = ~((1 << (6 - MI_SIZE_LOG2)) - 1);
+  const MB_MODE_INFO *mbmi =
+      cm->mi_grid_visible[(mi_row & m) * cm->mi_stride + (mi_col & m)];
+  // Initialise when at top left part of the superblock
+  if (!(mi_row & (cm->seq_params.mib_size - 1)) &&
+      !(mi_col & (cm->seq_params.mib_size - 1))) {  // Top left?
+    xd->cdef_preset[0] = xd->cdef_preset[1] = xd->cdef_preset[2] =
+        xd->cdef_preset[3] = -1;
+  }
+
+  // Emit CDEF param at first non-skip coding block
+  const int mask = 1 << (6 - MI_SIZE_LOG2);
+  const int index = cm->seq_params.sb_size == BLOCK_128X128
+                        ? !!(mi_col & mask) + 2 * !!(mi_row & mask)
+                        : 0;
+  if (xd->cdef_preset[index] == -1 && !skip) {
+    aom_write_literal(w, mbmi->cdef_strength, cm->cdef_bits);
+    xd->cdef_preset[index] = mbmi->cdef_strength;
+  }
+}
+
+static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
+                                   const struct segmentation *const seg,
+                                   struct segmentation_probs *const segp,
+                                   int mi_row, int mi_col, int skip,
+                                   int preskip) {
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (seg->update_map) {
+    if (preskip) {
+      if (!seg->segid_preskip) return;
+    } else {
+      if (seg->segid_preskip) return;
+      if (skip) {
+        write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 1);
+        if (seg->temporal_update) ((MB_MODE_INFO *)mbmi)->seg_id_predicted = 0;
+        return;
+      }
+    }
+    if (seg->temporal_update) {
+      const int pred_flag = mbmi->seg_id_predicted;
+      aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd);
+      aom_write_symbol(w, pred_flag, pred_cdf, 2);
+      if (!pred_flag) {
+        write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+      }
+      if (pred_flag) {
+        set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type,
+                               mi_row, mi_col, mbmi->segment_id);
+      }
+    } else {
+      write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+    }
+  }
+}
 
 static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
-                                const int mi_col,
-#if CONFIG_SUPERTX
-                                int supertx_enabled,
-#endif
-                                aom_writer *w) {
+                                const int mi_col, aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  const MODE_INFO *mi = xd->mi[0];
-
   const struct segmentation *const seg = &cm->seg;
   struct segmentation_probs *const segp = &ec_ctx->seg;
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const PREDICTION_MODE mode = mbmi->mode;
   const int segment_id = mbmi->segment_id;
@@ -1704,595 +950,323 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
   const int is_inter = is_inter_block(mbmi);
   const int is_compound = has_second_ref(mbmi);
   int skip, ref;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
   (void)mi_row;
   (void)mi_col;
 
-  if (seg->update_map) {
-    if (seg->temporal_update) {
-      const int pred_flag = mbmi->seg_id_predicted;
-#if CONFIG_NEW_MULTISYMBOL
-      aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd);
-      aom_write_symbol(w, pred_flag, pred_cdf, 2);
-#else
-      aom_prob pred_prob = av1_get_pred_prob_seg_id(segp, xd);
-      aom_write(w, pred_flag, pred_prob);
-#endif
-      if (!pred_flag) write_segment_id(w, seg, segp, segment_id);
-    } else {
-      write_segment_id(w, seg, segp, segment_id);
-    }
-  }
+  write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1);
+
+  write_skip_mode(cm, xd, segment_id, mbmi, w);
+
+  assert(IMPLIES(mbmi->skip_mode, mbmi->skip));
+  skip = mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
+
+  write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0);
+
+  write_cdef(cm, xd, w, skip, mi_col, mi_row);
 
-#if CONFIG_SUPERTX
-  if (supertx_enabled)
-    skip = mbmi->skip;
-  else
-    skip = write_skip(cm, xd, segment_id, mi, w);
-#else
-  skip = write_skip(cm, xd, segment_id, mi, w);
-#endif  // CONFIG_SUPERTX
   if (cm->delta_q_present_flag) {
     int super_block_upper_left =
-        ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0);
-    if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) {
-      assert(mbmi->current_q_index > 0);
+        ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
+        ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+    if ((bsize != cm->seq_params.sb_size || skip == 0) &&
+        super_block_upper_left) {
+      assert(mbmi->current_qindex > 0);
       int reduced_delta_qindex =
-          (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res;
-      write_delta_qindex(cm, xd, reduced_delta_qindex, w);
-      xd->prev_qindex = mbmi->current_q_index;
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
+          (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
+      write_delta_qindex(xd, reduced_delta_qindex, w);
+      xd->current_qindex = mbmi->current_qindex;
       if (cm->delta_lf_present_flag) {
         if (cm->delta_lf_multi) {
-          for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) {
+          const int frame_lf_count =
+              av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+          for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
             int reduced_delta_lflevel =
-                (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) /
+                (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
                 cm->delta_lf_res;
             write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w);
-            xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id];
+            xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
           }
         } else {
           int reduced_delta_lflevel =
-              (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+              (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
               cm->delta_lf_res;
           write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w);
-          xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
+          xd->delta_lf_from_base = mbmi->delta_lf_from_base;
         }
       }
-#else
-      if (cm->delta_lf_present_flag) {
-        int reduced_delta_lflevel =
-            (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
-            cm->delta_lf_res;
-        write_delta_lflevel(cm, xd, reduced_delta_lflevel, w);
-        xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
-      }
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif  // CONFIG_EXT_DELTA_Q
     }
   }
 
-#if CONFIG_SUPERTX
-  if (!supertx_enabled)
-#endif  // CONFIG_SUPERTX
-    write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+  if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
 
-  if (cm->tx_mode == TX_MODE_SELECT &&
-#if CONFIG_CB4X4 && CONFIG_VAR_TX && !CONFIG_RECT_TX
-      (bsize >= BLOCK_8X8 || (bsize > BLOCK_4X4 && is_inter)) &&
-#else
-      block_signals_txsize(bsize) &&
-#endif
-#if CONFIG_SUPERTX
-      !supertx_enabled &&
-#endif  // CONFIG_SUPERTX
-      !(is_inter && skip) && !xd->lossless[segment_id]) {
-#if CONFIG_VAR_TX
-    if (is_inter) {  // This implies skip flag is 0.
-      const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, bsize, 0);
-      const int bh = tx_size_high_unit[max_tx_size];
-      const int bw = tx_size_wide_unit[max_tx_size];
-      const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
-      const int height = block_size_high[bsize] >> tx_size_wide_log2[0];
-      int init_depth =
-          (height != width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT;
-      int idx, idy;
-      for (idy = 0; idy < height; idy += bh)
-        for (idx = 0; idx < width; idx += bw)
-          write_tx_size_vartx(cm, xd, mbmi, max_tx_size, init_depth, idy, idx,
-                              w);
-#if CONFIG_RECT_TX_EXT
-      if (is_quarter_tx_allowed(xd, mbmi, is_inter_block(mbmi)) &&
-          quarter_txsize_lookup[bsize] != max_tx_size &&
-          (mbmi->tx_size == quarter_txsize_lookup[bsize] ||
-           mbmi->tx_size == max_tx_size)) {
-#if CONFIG_NEW_MULTISYMBOL
-        aom_write_symbol(w, mbmi->tx_size != max_tx_size,
-                         cm->fc->quarter_tx_size_cdf, 2);
-#else
-        aom_write(w, mbmi->tx_size != max_tx_size,
-                  cm->fc->quarter_tx_size_prob);
-#endif
-      }
-#endif
-    } else {
-      set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd);
-      write_selected_tx_size(cm, xd, w);
-    }
-  } else {
-    set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd);
-#else
-    write_selected_tx_size(cm, xd, w);
-#endif
-  }
+  if (mbmi->skip_mode) return;
 
   if (!is_inter) {
-    if (bsize >= BLOCK_8X8 || unify_bsize) {
-      write_intra_mode(ec_ctx, bsize, mode, w);
-    } else {
-      int idx, idy;
-      const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
-      const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
-      for (idy = 0; idy < 2; idy += num_4x4_h) {
-        for (idx = 0; idx < 2; idx += num_4x4_w) {
-          const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
-          write_intra_mode(ec_ctx, bsize, b_mode, w);
-        }
-      }
+    write_intra_mode(ec_ctx, bsize, mode, w);
+    const int use_angle_delta = av1_use_angle_delta(bsize);
+
+    if (use_angle_delta && av1_is_directional_mode(mode)) {
+      write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
+                        ec_ctx->angle_delta_cdf[mode - V_PRED]);
     }
-#if CONFIG_CB4X4
-    if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                            xd->plane[1].subsampling_y)) {
-      write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w);
-#else  // !CONFIG_CB4X4
-    write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w);
-#endif  // CONFIG_CB4X4
 
-#if CONFIG_CFL
-      if (mbmi->uv_mode == UV_CFL_PRED) {
+    if (!cm->seq_params.monochrome &&
+        is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+                            xd->plane[1].subsampling_y)) {
+      const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+      write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
+      if (uv_mode == UV_CFL_PRED)
         write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
+      if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
+        write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
+                          ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
       }
-#endif
-
-#if CONFIG_CB4X4
     }
-#endif
 
-#if CONFIG_EXT_INTRA
-    write_intra_angle_info(xd, ec_ctx, w);
-#endif  // CONFIG_EXT_INTRA
     if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
-      write_palette_mode_info(cm, xd, mi, w);
-#if CONFIG_FILTER_INTRA
-    if (bsize >= BLOCK_8X8 || unify_bsize)
-      write_filter_intra_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
-#endif  // CONFIG_FILTER_INTRA
+      write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
+
+    write_filter_intra_mode_info(cm, xd, mbmi, w);
   } else {
     int16_t mode_ctx;
-    write_ref_frames(cm, xd, w);
 
-#if CONFIG_COMPOUND_SINGLEREF
-    if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
-      // NOTE: Handle single ref comp mode
-      if (!is_compound)
-        aom_write(w, is_inter_singleref_comp_mode(mode),
-                  av1_get_inter_mode_prob(cm, xd));
-    }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-#if CONFIG_COMPOUND_SINGLEREF
-    if (is_compound || is_inter_singleref_comp_mode(mode))
-#else   // !CONFIG_COMPOUND_SINGLEREF
-    if (is_compound)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
-    else
+    av1_collect_neighbors_ref_counts(xd);
 
-      mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
-                                           mbmi->ref_frame, bsize, -1);
+    write_ref_frames(cm, xd, w);
+
+    mode_ctx =
+        av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
 
     // If segment skip is not enabled code the mode.
     if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
-      if (bsize >= BLOCK_8X8 || unify_bsize) {
-        if (is_inter_compound_mode(mode))
-          write_inter_compound_mode(cm, xd, w, mode, mode_ctx);
-#if CONFIG_COMPOUND_SINGLEREF
-        else if (is_inter_singleref_comp_mode(mode))
-          write_inter_singleref_comp_mode(xd, w, mode, mode_ctx);
-#endif  // CONFIG_COMPOUND_SINGLEREF
-        else if (is_inter_singleref_mode(mode))
-          write_inter_mode(w, mode, ec_ctx, mode_ctx);
-
-        if (mode == NEWMV || mode == NEW_NEWMV ||
-#if CONFIG_COMPOUND_SINGLEREF
-            mbmi->mode == SR_NEW_NEWMV ||
-#endif  // CONFIG_COMPOUND_SINGLEREF
-            have_nearmv_in_inter_mode(mode))
-          write_drl_idx(ec_ctx, mbmi, mbmi_ext, w);
-        else
-          assert(mbmi->ref_mv_idx == 0);
-      }
+      if (is_inter_compound_mode(mode))
+        write_inter_compound_mode(xd, w, mode, mode_ctx);
+      else if (is_inter_singleref_mode(mode))
+        write_inter_mode(w, mode, ec_ctx, mode_ctx);
+
+      if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode))
+        write_drl_idx(ec_ctx, mbmi, mbmi_ext, w);
+      else
+        assert(mbmi->ref_mv_idx == 0);
     }
 
-#if !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION
-    write_mb_interp_filter(cpi, xd, w);
-#endif  // !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION
-
-    if (bsize < BLOCK_8X8 && !unify_bsize) {
-#if CONFIG_COMPOUND_SINGLEREF
-      /// NOTE: Single ref comp mode does not support sub8x8.
-      assert(is_compound || !is_inter_singleref_comp_mode(mbmi->mode));
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
-      const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
-      int idx, idy;
-      for (idy = 0; idy < 2; idy += num_4x4_h) {
-        for (idx = 0; idx < 2; idx += num_4x4_w) {
-          const int j = idy * 2 + idx;
-          const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
-          if (!is_compound)
-            mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
-                                                 mbmi->ref_frame, bsize, j);
-          if (is_inter_compound_mode(b_mode))
-            write_inter_compound_mode(cm, xd, w, b_mode, mode_ctx);
-          else if (is_inter_singleref_mode(b_mode))
-            write_inter_mode(w, b_mode, ec_ctx, mode_ctx);
-
-          if (b_mode == NEWMV || b_mode == NEW_NEWMV) {
-            for (ref = 0; ref < 1 + is_compound; ++ref) {
-              int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-              int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                                        mbmi_ext->ref_mv_stack[rf_type], ref,
-                                        mbmi->ref_mv_idx);
-              nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
-              av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
-                            &mi->bmi[j].ref_mv[ref].as_mv, nmvc, allow_hp);
-            }
-          } else if (b_mode == NEAREST_NEWMV || b_mode == NEAR_NEWMV) {
-            int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-            int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                                      mbmi_ext->ref_mv_stack[rf_type], 1,
-                                      mbmi->ref_mv_idx);
-            nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
-            av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[1].as_mv,
-                          &mi->bmi[j].ref_mv[1].as_mv, nmvc, allow_hp);
-          } else if (b_mode == NEW_NEARESTMV || b_mode == NEW_NEARMV) {
-            int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-            int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                                      mbmi_ext->ref_mv_stack[rf_type], 0,
-                                      mbmi->ref_mv_idx);
-            nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
-            av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv,
-                          &mi->bmi[j].ref_mv[0].as_mv, nmvc, allow_hp);
-          }
-        }
-      }
-    } else {
-      if (mode == NEWMV || mode == NEW_NEWMV) {
-        int_mv ref_mv;
-        for (ref = 0; ref < 1 + is_compound; ++ref) {
-          int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-          int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                                    mbmi_ext->ref_mv_stack[rf_type], ref,
-                                    mbmi->ref_mv_idx);
-          nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
-          ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0];
-          av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
-                        allow_hp);
-        }
-      } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
-        int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-        int nmv_ctx =
-            av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                        mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
-        nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
-        av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv,
-                      &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv, nmvc,
-                      allow_hp);
-      } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
-        int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-        int nmv_ctx =
-            av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                        mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
-        nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
-        av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv,
-                      &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv, nmvc,
+    if (mode == NEWMV || mode == NEW_NEWMV) {
+      for (ref = 0; ref < 1 + is_compound; ++ref) {
+        nmv_context *nmvc = &ec_ctx->nmvc;
+        const int_mv ref_mv = av1_get_ref_mv(x, ref);
+        av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
                       allow_hp);
-#if CONFIG_COMPOUND_SINGLEREF
-      } else if (  //  mode == SR_NEAREST_NEWMV ||
-          mode == SR_NEAR_NEWMV || mode == SR_ZERO_NEWMV ||
-          mode == SR_NEW_NEWMV) {
-        int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-        int nmv_ctx =
-            av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                        mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
-        nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
-        int_mv ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0];
-        if (mode == SR_NEW_NEWMV)
-          av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc,
-                        allow_hp);
-        av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc,
-                      allow_hp);
-#endif  // CONFIG_COMPOUND_SINGLEREF
       }
+    } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+      nmv_context *nmvc = &ec_ctx->nmvc;
+      const int_mv ref_mv = av1_get_ref_mv(x, 1);
+      av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
+    } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+      nmv_context *nmvc = &ec_ctx->nmvc;
+      const int_mv ref_mv = av1_get_ref_mv(x, 0);
+      av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
     }
 
-#if CONFIG_INTERINTRA
     if (cpi->common.reference_mode != COMPOUND_REFERENCE &&
-#if CONFIG_SUPERTX
-        !supertx_enabled &&
-#endif  // CONFIG_SUPERTX
-        cpi->common.allow_interintra_compound && is_interintra_allowed(mbmi)) {
+        cpi->common.seq_params.enable_interintra_compound &&
+        is_interintra_allowed(mbmi)) {
       const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
       const int bsize_group = size_group_lookup[bsize];
-#if CONFIG_NEW_MULTISYMBOL
       aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2);
-#else
-      aom_write(w, interintra, cm->fc->interintra_prob[bsize_group]);
-#endif
       if (interintra) {
         aom_write_symbol(w, mbmi->interintra_mode,
                          ec_ctx->interintra_mode_cdf[bsize_group],
                          INTERINTRA_MODES);
         if (is_interintra_wedge_used(bsize)) {
-#if CONFIG_NEW_MULTISYMBOL
           aom_write_symbol(w, mbmi->use_wedge_interintra,
                            ec_ctx->wedge_interintra_cdf[bsize], 2);
-#else
-          aom_write(w, mbmi->use_wedge_interintra,
-                    cm->fc->wedge_interintra_prob[bsize]);
-#endif
           if (mbmi->use_wedge_interintra) {
-            aom_write_literal(w, mbmi->interintra_wedge_index,
-                              get_wedge_bits_lookup(bsize));
+            aom_write_symbol(w, mbmi->interintra_wedge_index,
+                             ec_ctx->wedge_idx_cdf[bsize], 16);
             assert(mbmi->interintra_wedge_sign == 0);
           }
         }
       }
     }
-#endif  // CONFIG_INTERINTRA
-
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-#if CONFIG_SUPERTX
-    if (!supertx_enabled)
-#endif  // CONFIG_SUPERTX
-      if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mi, w);
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-    write_ncobmc_mode(xd, mi, w);
-#endif
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
-    if (
-#if CONFIG_COMPOUND_SINGLEREF
-        is_inter_anyref_comp_mode(mbmi->mode) &&
-#else   // !CONFIG_COMPOUND_SINGLEREF
-        cpi->common.reference_mode != SINGLE_REFERENCE &&
-        is_inter_compound_mode(mbmi->mode) &&
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_MOTION_VAR
-        mbmi->motion_mode == SIMPLE_TRANSLATION &&
-#endif  // CONFIG_MOTION_VAR
-        is_any_masked_compound_used(bsize)) {
-#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-      if (cm->allow_masked_compound) {
-#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
-        if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize))
-          aom_write_bit(w, mbmi->interinter_compound_type == COMPOUND_AVERAGE);
-        else
-#endif  // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
-          aom_write_symbol(w, mbmi->interinter_compound_type,
-                           ec_ctx->compound_type_cdf[bsize], COMPOUND_TYPES);
-#if CONFIG_WEDGE
-        if (is_interinter_compound_used(COMPOUND_WEDGE, bsize) &&
-            mbmi->interinter_compound_type == COMPOUND_WEDGE) {
-          aom_write_literal(w, mbmi->wedge_index, get_wedge_bits_lookup(bsize));
-          aom_write_bit(w, mbmi->wedge_sign);
+
+    if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w);
+
+    // First write idx to indicate current compound inter prediction mode group
+    // Group A (0): jnt_comp, compound_average
+    // Group B (1): interintra, compound_diffwtd, wedge
+    if (has_second_ref(mbmi)) {
+      const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+                                       cm->seq_params.enable_masked_compound;
+
+      if (masked_compound_used) {
+        const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
+        aom_write_symbol(w, mbmi->comp_group_idx,
+                         ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2);
+      } else {
+        assert(mbmi->comp_group_idx == 0);
+      }
+
+      if (mbmi->comp_group_idx == 0) {
+        if (mbmi->compound_idx)
+          assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE);
+
+        if (cm->seq_params.enable_jnt_comp) {
+          const int comp_index_ctx = get_comp_index_context(cm, xd);
+          aom_write_symbol(w, mbmi->compound_idx,
+                           ec_ctx->compound_index_cdf[comp_index_ctx], 2);
+        } else {
+          assert(mbmi->compound_idx == 1);
         }
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-        if (mbmi->interinter_compound_type == COMPOUND_SEG) {
-          aom_write_literal(w, mbmi->mask_type, MAX_SEG_MASK_BITS);
+      } else {
+        assert(cpi->common.reference_mode != SINGLE_REFERENCE &&
+               is_inter_compound_mode(mbmi->mode) &&
+               mbmi->motion_mode == SIMPLE_TRANSLATION);
+        assert(masked_compound_used);
+        // compound_diffwtd, wedge
+        assert(mbmi->interinter_comp.type == COMPOUND_WEDGE ||
+               mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+
+        if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
+          aom_write_symbol(w, mbmi->interinter_comp.type - 1,
+                           ec_ctx->compound_type_cdf[bsize],
+                           COMPOUND_TYPES - 1);
+
+        if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+          assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+          aom_write_symbol(w, mbmi->interinter_comp.wedge_index,
+                           ec_ctx->wedge_idx_cdf[bsize], 16);
+          aom_write_bit(w, mbmi->interinter_comp.wedge_sign);
+        } else {
+          assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+          aom_write_literal(w, mbmi->interinter_comp.mask_type,
+                            MAX_DIFFWTD_MASK_BITS);
         }
-#endif  // CONFIG_COMPOUND_SEGMENT
       }
-#endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
     }
 
-#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
     write_mb_interp_filter(cpi, xd, w);
-#endif  // CONFIG_DUAL_FILTE || CONFIG_WARPED_MOTION
   }
+}
 
-#if !CONFIG_TXK_SEL
-  av1_write_tx_type(cm, xd,
-#if CONFIG_SUPERTX
-                    supertx_enabled,
-#endif
-                    w);
-#endif  // !CONFIG_TXK_SEL
+static void write_intrabc_info(MACROBLOCKD *xd,
+                               const MB_MODE_INFO_EXT *mbmi_ext,
+                               aom_writer *w) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  int use_intrabc = is_intrabc_block(mbmi);
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2);
+  if (use_intrabc) {
+    assert(mbmi->mode == DC_PRED);
+    assert(mbmi->uv_mode == UV_DC_PRED);
+    assert(mbmi->motion_mode == SIMPLE_TRANSLATION);
+    int_mv dv_ref = mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv;
+    av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc);
+  }
 }
 
-static void write_mb_modes_kf(AV1_COMMON *cm, MACROBLOCKD *xd,
-#if CONFIG_INTRABC
+static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
                               const MB_MODE_INFO_EXT *mbmi_ext,
-#endif  // CONFIG_INTRABC
                               const int mi_row, const int mi_col,
                               aom_writer *w) {
+  AV1_COMMON *const cm = &cpi->common;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   const struct segmentation *const seg = &cm->seg;
   struct segmentation_probs *const segp = &ec_ctx->seg;
-  const MODE_INFO *const mi = xd->mi[0];
-  const MODE_INFO *const above_mi = xd->above_mi;
-  const MODE_INFO *const left_mi = xd->left_mi;
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   const BLOCK_SIZE bsize = mbmi->sb_type;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-  (void)mi_row;
-  (void)mi_col;
+  const PREDICTION_MODE mode = mbmi->mode;
+
+  if (seg->segid_preskip && seg->update_map)
+    write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
 
-  if (seg->update_map) write_segment_id(w, seg, segp, mbmi->segment_id);
+  const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w);
+
+  if (!seg->segid_preskip && seg->update_map)
+    write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, skip);
+
+  write_cdef(cm, xd, w, skip, mi_col, mi_row);
 
-  const int skip = write_skip(cm, xd, mbmi->segment_id, mi, w);
   if (cm->delta_q_present_flag) {
     int super_block_upper_left =
-        ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0);
-    if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) {
-      assert(mbmi->current_q_index > 0);
+        ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
+        ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+    if ((bsize != cm->seq_params.sb_size || skip == 0) &&
+        super_block_upper_left) {
+      assert(mbmi->current_qindex > 0);
       int reduced_delta_qindex =
-          (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res;
-      write_delta_qindex(cm, xd, reduced_delta_qindex, w);
-      xd->prev_qindex = mbmi->current_q_index;
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
+          (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
+      write_delta_qindex(xd, reduced_delta_qindex, w);
+      xd->current_qindex = mbmi->current_qindex;
       if (cm->delta_lf_present_flag) {
         if (cm->delta_lf_multi) {
-          for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) {
+          const int frame_lf_count =
+              av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+          for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
             int reduced_delta_lflevel =
-                (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) /
+                (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
                 cm->delta_lf_res;
             write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w);
-            xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id];
+            xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
           }
         } else {
           int reduced_delta_lflevel =
-              (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+              (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
               cm->delta_lf_res;
           write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w);
-          xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
+          xd->delta_lf_from_base = mbmi->delta_lf_from_base;
         }
       }
-#else
-      if (cm->delta_lf_present_flag) {
-        int reduced_delta_lflevel =
-            (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
-            cm->delta_lf_res;
-        write_delta_lflevel(cm, xd, reduced_delta_lflevel, w);
-        xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
-      }
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif  // CONFIG_EXT_DELTA_Q
     }
   }
 
-  int enable_tx_size = cm->tx_mode == TX_MODE_SELECT &&
-                       block_signals_txsize(bsize) &&
-                       !xd->lossless[mbmi->segment_id];
-
-#if CONFIG_INTRABC
-  if (av1_allow_intrabc(bsize, cm)) {
-    int use_intrabc = is_intrabc_block(mbmi);
-    aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2);
-    if (use_intrabc) {
-      assert(mbmi->mode == DC_PRED);
-      assert(mbmi->uv_mode == UV_DC_PRED);
-      if (enable_tx_size && !mbmi->skip) write_selected_tx_size(cm, xd, w);
-      int_mv dv_ref = mbmi_ext->ref_mvs[INTRA_FRAME][0];
-      av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc);
-#if CONFIG_EXT_TX && !CONFIG_TXK_SEL
-      av1_write_tx_type(cm, xd,
-#if CONFIG_SUPERTX
-                        0,
-#endif
-                        w);
-#endif  // CONFIG_EXT_TX && !CONFIG_TXK_SEL
-      return;
-    }
+  if (av1_allow_intrabc(cm)) {
+    write_intrabc_info(xd, mbmi_ext, w);
+    if (is_intrabc_block(mbmi)) return;
   }
-#endif  // CONFIG_INTRABC
-  if (enable_tx_size) write_selected_tx_size(cm, xd, w);
 
-  if (bsize >= BLOCK_8X8 || unify_bsize) {
-    write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, 0, mbmi->mode, w);
-  } else {
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
-    const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
-    int idx, idy;
-
-    for (idy = 0; idy < 2; idy += num_4x4_h) {
-      for (idx = 0; idx < 2; idx += num_4x4_w) {
-        const int block = idy * 2 + idx;
-        write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, block,
-                            mi->bmi[block].as_mode, w);
-      }
-    }
+  write_intra_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w);
+
+  const int use_angle_delta = av1_use_angle_delta(bsize);
+  if (use_angle_delta && av1_is_directional_mode(mode)) {
+    write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
+                      ec_ctx->angle_delta_cdf[mode - V_PRED]);
   }
 
-#if CONFIG_CB4X4
-  if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+  if (!cm->seq_params.monochrome &&
+      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
                           xd->plane[1].subsampling_y)) {
-    write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w);
-#else  // !CONFIG_CB4X4
-  write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w);
-#endif  // CONFIG_CB4X4
-
-#if CONFIG_CFL
-    if (mbmi->uv_mode == UV_CFL_PRED) {
+    const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+    write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
+    if (uv_mode == UV_CFL_PRED)
       write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
+    if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
+      write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
+                        ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
     }
-#endif
-
-#if CONFIG_CB4X4
   }
-#endif
-#if CONFIG_EXT_INTRA
-  write_intra_angle_info(xd, ec_ctx, w);
-#endif  // CONFIG_EXT_INTRA
+
   if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
-    write_palette_mode_info(cm, xd, mi, w);
-#if CONFIG_FILTER_INTRA
-  if (bsize >= BLOCK_8X8 || unify_bsize)
-    write_filter_intra_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
-#endif  // CONFIG_FILTER_INTRA
-
-#if !CONFIG_TXK_SEL
-  av1_write_tx_type(cm, xd,
-#if CONFIG_SUPERTX
-                    0,
-#endif
-                    w);
-#endif  // !CONFIG_TXK_SEL
-}
+    write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
 
-#if CONFIG_SUPERTX
-#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
-                              mi_row, mi_col)                              \
-  write_modes_b(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col)
-#else
-#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
-                              mi_row, mi_col)                              \
-  write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col)
-#endif  // CONFIG_SUPERTX
+  write_filter_intra_mode_info(cm, xd, mbmi, w);
+}
 
 #if CONFIG_RD_DEBUG
 static void dump_mode_info(MODE_INFO *mi) {
-  printf("\nmi->mbmi.mi_row == %d\n", mi->mbmi.mi_row);
-  printf("&& mi->mbmi.mi_col == %d\n", mi->mbmi.mi_col);
-  printf("&& mi->mbmi.sb_type == %d\n", mi->mbmi.sb_type);
-  printf("&& mi->mbmi.tx_size == %d\n", mi->mbmi.tx_size);
-  if (mi->mbmi.sb_type >= BLOCK_8X8) {
-    printf("&& mi->mbmi.mode == %d\n", mi->mbmi.mode);
-  } else {
-    printf("&& mi->bmi[0].as_mode == %d\n", mi->bmi[0].as_mode);
-  }
+  printf("\nmi->mi_row == %d\n", mi->mi_row);
+  printf("&& mi->mi_col == %d\n", mi->mi_col);
+  printf("&& mi->sb_type == %d\n", mi->sb_type);
+  printf("&& mi->tx_size == %d\n", mi->tx_size);
+  printf("&& mi->mode == %d\n", mi->mode);
 }
 static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
                                    int plane) {
   if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) {
-#if CONFIG_VAR_TX
     int r, c;
-#endif
     printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n",
            plane, rd_stats->txb_coeff_cost[plane], token_stats->cost);
-#if CONFIG_VAR_TX
     printf("rd txb_coeff_cost_map\n");
     for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
       for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
@@ -2308,7 +1282,6 @@ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
       }
       printf("\n");
     }
-#endif
     return 1;
   }
   return 0;
@@ -2319,128 +1292,139 @@ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
 static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-  MODE_INFO *m;
   xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
-  m = xd->mi[0];
-  if (is_inter_block(&m->mbmi)) {
-#define FRAME_TO_CHECK 1
+  const MB_MODE_INFO *const *mbmi = xd->mi[0];
+  if (is_inter_block(mbmi)) {
+#define FRAME_TO_CHECK 11
     if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
-      const MB_MODE_INFO *const mbmi = &m->mbmi;
       const BLOCK_SIZE bsize = mbmi->sb_type;
 
       int_mv mv[2];
-      int is_comp_ref = has_second_ref(&m->mbmi);
+      int is_comp_ref = has_second_ref(mbmi);
       int ref;
 
       for (ref = 0; ref < 1 + is_comp_ref; ++ref)
-        mv[ref].as_mv = m->mbmi.mv[ref].as_mv;
+        mv[ref].as_mv = mbmi->mv[ref].as_mv;
 
       if (!is_comp_ref) {
-#if CONFIG_COMPOUND_SINGLEREF
-        if (is_inter_singleref_comp_mode(m->mbmi.mode))
-          mv[1].as_mv = m->mbmi.mv[1].as_mv;
-        else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-          mv[1].as_int = 0;
+        mv[1].as_int = 0;
       }
 
       MACROBLOCK *const x = &cpi->td.mb;
       const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-      const int16_t mode_ctx = av1_mode_context_analyzer(
-          mbmi_ext->mode_context, mbmi->ref_frame, bsize, -1);
+      const int16_t mode_ctx =
+          is_comp_ref ? mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]
+                      : av1_mode_context_analyzer(mbmi_ext->mode_context,
+                                                  mbmi->ref_frame);
+
       const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
       int16_t zeromv_ctx = -1;
       int16_t refmv_ctx = -1;
+
       if (mbmi->mode != NEWMV) {
-        zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
-        if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
-          assert(mbmi->mode == ZEROMV);
-        }
-        if (mbmi->mode != ZEROMV) {
+        zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+        if (mbmi->mode != GLOBALMV)
           refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
-          if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6;
-          if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7;
-          if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8;
-        }
       }
 
-      int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
       printf(
           "=== ENCODER ===: "
-          "Frame=%d, (mi_row,mi_col)=(%d,%d), mode=%d, bsize=%d, "
+          "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, "
           "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, "
-          "ref[1]=%d, motion_mode=%d, inter_mode_ctx=%d, mode_ctx=%d, "
-          "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d\n",
-          cm->current_video_frame, mi_row, mi_col, mbmi->mode, bsize,
-          cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, mv[1].as_mv.row,
-          mv[1].as_mv.col, mbmi->ref_frame[0], mbmi->ref_frame[1],
-          mbmi->motion_mode, mbmi_ext->mode_context[ref_frame_type], mode_ctx,
-          newmv_ctx, zeromv_ctx, refmv_ctx);
+          "ref[1]=%d, motion_mode=%d, mode_ctx=%d, "
+          "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n",
+          cm->current_video_frame, mi_row, mi_col, mbmi->skip_mode, mbmi->mode,
+          bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col,
+          mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0],
+          mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx,
+          zeromv_ctx, refmv_ctx, mbmi->tx_size);
     }
   }
 }
 #endif  // ENC_MISMATCH_DEBUG
 
 static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile,
-                         aom_writer *w,
-#if CONFIG_SUPERTX
-                         int supertx_enabled,
-#endif
-                         int mi_row, int mi_col) {
+                         aom_writer *w, int mi_row, int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-  MODE_INFO *m;
   int bh, bw;
   xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
-  m = xd->mi[0];
+  MB_MODE_INFO *m = xd->mi[0];
 
-  assert(m->mbmi.sb_type <= cm->sb_size ||
-         (m->mbmi.sb_type >= BLOCK_SIZES && m->mbmi.sb_type < BLOCK_SIZES_ALL));
+  assert(m->sb_type <= cm->seq_params.sb_size ||
+         (m->sb_type >= BLOCK_SIZES && m->sb_type < BLOCK_SIZES_ALL));
 
-  bh = mi_size_high[m->mbmi.sb_type];
-  bw = mi_size_wide[m->mbmi.sb_type];
+  bh = mi_size_high[m->sb_type];
+  bw = mi_size_wide[m->sb_type];
 
   cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
 
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
-#if CONFIG_DEPENDENT_HORZTILES
-                 cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                 cm->mi_rows, cm->mi_cols);
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+  xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
   if (frame_is_intra_only(cm)) {
-    write_mb_modes_kf(cm, xd,
-#if CONFIG_INTRABC
-                      cpi->td.mb.mbmi_ext,
-#endif  // CONFIG_INTRABC
-                      mi_row, mi_col, w);
+    write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext, mi_row, mi_col, w);
   } else {
-#if CONFIG_VAR_TX
-    xd->above_txfm_context =
-        cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2);
-    xd->left_txfm_context = xd->left_txfm_context_buffer +
-                            ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2);
-#endif
-#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION
     // has_subpel_mv_component needs the ref frame buffers set up to look
     // up if they are scaled. has_subpel_mv_component is in turn needed by
     // write_switchable_interp_filter, which is called by pack_inter_mode_mvs.
-    set_ref_ptrs(cm, xd, m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
-#if CONFIG_COMPOUND_SINGLEREF
-    if (!has_second_ref(&m->mbmi) && is_inter_singleref_comp_mode(m->mbmi.mode))
-      xd->block_refs[1] = xd->block_refs[0];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#endif  // CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION
+    set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]);
 
 #if ENC_MISMATCH_DEBUG
     enc_dump_logs(cpi, mi_row, mi_col);
 #endif  // ENC_MISMATCH_DEBUG
 
-    pack_inter_mode_mvs(cpi, mi_row, mi_col,
-#if CONFIG_SUPERTX
-                        supertx_enabled,
-#endif
-                        w);
+    pack_inter_mode_mvs(cpi, mi_row, mi_col, w);
+  }
+}
+
+static void write_inter_txb_coeff(AV1_COMMON *const cm, MACROBLOCK *const x,
+                                  MB_MODE_INFO *const mbmi, aom_writer *w,
+                                  const TOKENEXTRA **tok,
+                                  const TOKENEXTRA *const tok_end,
+                                  TOKEN_STATS *token_stats, const int row,
+                                  const int col, int *block, const int plane) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsizec =
+      scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
+
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+  const int step =
+      tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+  const int bkw = tx_size_wide_unit[max_tx_size];
+  const int bkh = tx_size_high_unit[max_tx_size];
+
+  const BLOCK_SIZE max_unit_bsize =
+      get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
+  int blk_row, blk_col;
+
+  const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+  const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+
+  const int unit_height =
+      AOMMIN(mu_blocks_high + (row >> pd->subsampling_y), num_4x4_h);
+  const int unit_width =
+      AOMMIN(mu_blocks_wide + (col >> pd->subsampling_x), num_4x4_w);
+  for (blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+       blk_row += bkh) {
+    for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+         blk_col += bkw) {
+      pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize,
+                      cm->bit_depth, *block, blk_row, blk_col, max_tx_size,
+                      token_stats);
+      *block += step;
+    }
   }
 }
 
@@ -2449,167 +1433,48 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
                            const TOKENEXTRA *const tok_end, int mi_row,
                            int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   const int mi_offset = mi_row * cm->mi_stride + mi_col;
-  MODE_INFO *const m = *(cm->mi_grid_visible + mi_offset);
-  MB_MODE_INFO *const mbmi = &m->mbmi;
+  MB_MODE_INFO *const mbmi = *(cm->mi_grid_visible + mi_offset);
   int plane;
   int bh, bw;
-#if CONFIG_PVQ || CONFIG_LV_MAP
   MACROBLOCK *const x = &cpi->td.mb;
   (void)tok;
   (void)tok_end;
-#endif
   xd->mi = cm->mi_grid_visible + mi_offset;
 
-  assert(mbmi->sb_type <= cm->sb_size ||
+  assert(mbmi->sb_type <= cm->seq_params.sb_size ||
          (mbmi->sb_type >= BLOCK_SIZES && mbmi->sb_type < BLOCK_SIZES_ALL));
 
   bh = mi_size_high[mbmi->sb_type];
   bw = mi_size_wide[mbmi->sb_type];
   cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
 
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
-#if CONFIG_DEPENDENT_HORZTILES
-                 cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                 cm->mi_rows, cm->mi_cols);
-
-// TODO(anybody) : remove this flag when PVQ supports pallete coding tool
-#if !CONFIG_PVQ
-  for (plane = 0; plane <= 1; ++plane) {
-    const uint8_t palette_size_plane =
-        mbmi->palette_mode_info.palette_size[plane];
-    if (palette_size_plane > 0) {
-#if CONFIG_INTRABC
-      assert(mbmi->use_intrabc == 0);
-#endif
-      int rows, cols;
-      assert(mbmi->sb_type >= BLOCK_8X8);
-      av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows,
-                               &cols);
-      assert(*tok < tok_end);
-      pack_map_tokens(w, tok, palette_size_plane, rows * cols);
-#if !CONFIG_LV_MAP
-      assert(*tok < tok_end + mbmi->skip);
-#endif  // !CONFIG_LV_MAP
-    }
-  }
-#endif  // !CONFIG_PVQ
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
 
-#if CONFIG_COEF_INTERLEAVE
   if (!mbmi->skip) {
-    const struct macroblockd_plane *const pd_y = &xd->plane[0];
-    const struct macroblockd_plane *const pd_c = &xd->plane[1];
-    const TX_SIZE tx_log2_y = mbmi->tx_size;
-    const TX_SIZE tx_log2_c = av1_get_uv_tx_size(mbmi, pd_c);
-    const int tx_sz_y = (1 << tx_log2_y);
-    const int tx_sz_c = (1 << tx_log2_c);
-
-    const BLOCK_SIZE plane_bsize_y =
-        get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_y);
-    const BLOCK_SIZE plane_bsize_c =
-        get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_c);
-
-    const int num_4x4_w_y = num_4x4_blocks_wide_lookup[plane_bsize_y];
-    const int num_4x4_w_c = num_4x4_blocks_wide_lookup[plane_bsize_c];
-    const int num_4x4_h_y = num_4x4_blocks_high_lookup[plane_bsize_y];
-    const int num_4x4_h_c = num_4x4_blocks_high_lookup[plane_bsize_c];
-
-    const int max_4x4_w_y = get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge,
-                                             pd_y->subsampling_x);
-    const int max_4x4_h_y = get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge,
-                                             pd_y->subsampling_y);
-    const int max_4x4_w_c = get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge,
-                                             pd_c->subsampling_x);
-    const int max_4x4_h_c = get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge,
-                                             pd_c->subsampling_y);
-
-    // The max_4x4_w/h may be smaller than tx_sz under some corner cases,
-    // i.e. when the SB is splitted by tile boundaries.
-    const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y;
-    const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y;
-    const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c;
-    const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c;
-    const int tu_num_y = tu_num_w_y * tu_num_h_y;
-    const int tu_num_c = tu_num_w_c * tu_num_h_c;
-
-    int tu_idx_y = 0, tu_idx_c = 0;
-    TOKEN_STATS token_stats;
-    init_token_stats(&token_stats);
-
-    assert(*tok < tok_end);
-
-    while (tu_idx_y < tu_num_y) {
-      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_y, &token_stats);
-      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
-      (*tok)++;
-      tu_idx_y++;
-
-      if (tu_idx_c < tu_num_c) {
-        pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
-        assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
-        (*tok)++;
-
-        pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
-        assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
-        (*tok)++;
-
-        tu_idx_c++;
-      }
-    }
-
-    // In 422 case, it's possilbe that Chroma has more TUs than Luma
-    while (tu_idx_c < tu_num_c) {
-      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
-      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
-      (*tok)++;
-
-      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
-      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
-      (*tok)++;
-
-      tu_idx_c++;
-    }
-  }
-#else  // CONFIG_COEF_INTERLEAVE
-  if (!mbmi->skip) {
-#if !CONFIG_PVQ && !CONFIG_LV_MAP
-    assert(*tok < tok_end);
-#endif
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-#if CONFIG_CB4X4
-      if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type,
-                               xd->plane[plane].subsampling_x,
-                               xd->plane[plane].subsampling_y)) {
-#if !CONFIG_LV_MAP
-        (*tok)++;
-#endif  // !CONFIG_LV_MAP
-        continue;
-      }
-#endif
-#if CONFIG_VAR_TX
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      BLOCK_SIZE bsize = mbmi->sb_type;
-#if CONFIG_CHROMA_SUB8X8
-      const BLOCK_SIZE plane_bsize =
-          AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#elif CONFIG_CB4X4
-      const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#else
-      const BLOCK_SIZE plane_bsize =
-          get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
-#endif
-
+    if (!is_inter_block(mbmi))
+      av1_write_coeffs_mb(cm, x, mi_row, mi_col, w, mbmi->sb_type);
+
+    if (is_inter_block(mbmi)) {
+      int block[MAX_MB_PLANE] = { 0 };
+      const BLOCK_SIZE plane_bsize = mbmi->sb_type;
+      assert(plane_bsize == get_plane_block_size(mbmi->sb_type,
+                                                 xd->plane[0].subsampling_x,
+                                                 xd->plane[0].subsampling_y));
       const int num_4x4_w =
           block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
       const int num_4x4_h =
-          block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+          block_size_high[plane_bsize] >> tx_size_high_log2[0];
       int row, col;
       TOKEN_STATS token_stats;
       init_token_stats(&token_stats);
 
-      const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd);
+      const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+      assert(max_unit_bsize ==
+             get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
+                                  xd->plane[0].subsampling_y));
       int mu_blocks_wide =
           block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
       int mu_blocks_high =
@@ -2618,37 +1483,16 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
       mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide);
       mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high);
 
-      if (is_inter_block(mbmi)) {
-        const TX_SIZE max_tx_size = get_vartx_max_txsize(
-            mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y);
-        int block = 0;
-        const int step =
-            tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
-        const int bkw = tx_size_wide_unit[max_tx_size];
-        const int bkh = tx_size_high_unit[max_tx_size];
-        assert(bkw <= mu_blocks_wide);
-        assert(bkh <= mu_blocks_high);
-        for (row = 0; row < num_4x4_h; row += mu_blocks_high) {
-          const int unit_height = AOMMIN(mu_blocks_high + row, num_4x4_h);
-          for (col = 0; col < num_4x4_w; col += mu_blocks_wide) {
-            int blk_row, blk_col;
-            const int unit_width = AOMMIN(mu_blocks_wide + col, num_4x4_w);
-            for (blk_row = row; blk_row < unit_height; blk_row += bkh) {
-              for (blk_col = col; blk_col < unit_width; blk_col += bkw) {
-                pack_txb_tokens(w,
-#if CONFIG_LV_MAP
-                                cm,
-#endif
-                                tok, tok_end,
-#if CONFIG_PVQ || CONFIG_LV_MAP
-                                x,
-#endif
-                                xd, mbmi, plane, plane_bsize, cm->bit_depth,
-                                block, blk_row, blk_col, max_tx_size,
-                                &token_stats);
-                block += step;
-              }
+      for (row = 0; row < num_4x4_h; row += mu_blocks_high) {
+        for (col = 0; col < num_4x4_w; col += mu_blocks_wide) {
+          for (plane = 0; plane < num_planes && is_inter_block(mbmi); ++plane) {
+            const struct macroblockd_plane *const pd = &xd->plane[plane];
+            if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type,
+                                     pd->subsampling_x, pd->subsampling_y)) {
+              continue;
             }
+            write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats,
+                                  row, col, &block[plane], plane);
           }
         }
 #if CONFIG_RD_DEBUG
@@ -2658,607 +1502,196 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
           assert(0);
         }
 #endif  // CONFIG_RD_DEBUG
-      } else {
-#if CONFIG_LV_MAP
-        av1_write_coeffs_mb(cm, x, w, plane);
-#else
-        const TX_SIZE tx = av1_get_tx_size(plane, xd);
-        const int bkw = tx_size_wide_unit[tx];
-        const int bkh = tx_size_high_unit[tx];
-        int blk_row, blk_col;
-
-        for (row = 0; row < num_4x4_h; row += mu_blocks_high) {
-          for (col = 0; col < num_4x4_w; col += mu_blocks_wide) {
-            const int unit_height = AOMMIN(mu_blocks_high + row, num_4x4_h);
-            const int unit_width = AOMMIN(mu_blocks_wide + col, num_4x4_w);
-
-            for (blk_row = row; blk_row < unit_height; blk_row += bkh) {
-              for (blk_col = col; blk_col < unit_width; blk_col += bkw) {
-#if !CONFIG_PVQ
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                TX_TYPE tx_type =
-                    av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y, xd,
-                                    blk_row, blk_col, 0, tx);
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx,
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                               tx_type, is_inter_block(mbmi),
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                               &token_stats);
-#else
-                pack_pvq_tokens(w, x, xd, plane, bsize, tx);
-#endif
-              }
-            }
-          }
-        }
-#endif  // CONFIG_LV_MAP
-      }
-#else
-      const TX_SIZE tx = av1_get_tx_size(plane, xd);
-      TOKEN_STATS token_stats;
-#if !CONFIG_PVQ
-      init_token_stats(&token_stats);
-#if CONFIG_LV_MAP
-      (void)tx;
-      av1_write_coeffs_mb(cm, x, w, plane);
-#else  // CONFIG_LV_MAP
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-      TX_TYPE tx_type = av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y,
-                                        xd, blk_row, blk_col, 0, tx);
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx,
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                     tx_type, is_inter_block(mbmi),
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                     &token_stats);
-#endif  // CONFIG_LV_MAP
-
-#else
-      (void)token_stats;
-      pack_pvq_tokens(w, x, xd, plane, mbmi->sb_type, tx);
-#endif
-#if CONFIG_RD_DEBUG
-      if (is_inter_block(mbmi) && mbmi->sb_type >= BLOCK_8X8 &&
-          rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
-        dump_mode_info(m);
-        assert(0);
       }
-#endif  // CONFIG_RD_DEBUG
-#endif  // CONFIG_VAR_TX
-
-#if !CONFIG_PVQ && !CONFIG_LV_MAP
-      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
-      (*tok)++;
-#endif
     }
   }
-#endif  // CONFIG_COEF_INTERLEAVE
 }
 
-#if CONFIG_MOTION_VAR && NC_MODE_INFO
-static void write_tokens_sb(AV1_COMP *cpi, const TileInfo *const tile,
-                            aom_writer *w, const TOKENEXTRA **tok,
-                            const TOKENEXTRA *const tok_end, int mi_row,
-                            int mi_col, BLOCK_SIZE bsize) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int hbs = mi_size_wide[bsize] / 2;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
-
-  partition = get_partition(cm, mi_row, mi_col, bsize);
-  subsize = get_subsize(bsize, partition);
-
-  if (subsize < BLOCK_8X8 && !unify_bsize) {
-    write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-  } else {
-    switch (partition) {
-      case PARTITION_NONE:
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-        break;
-      case PARTITION_HORZ:
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-        if (mi_row + hbs < cm->mi_rows)
-          write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
-        break;
-      case PARTITION_VERT:
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-        if (mi_col + hbs < cm->mi_cols)
-          write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
-        break;
-      case PARTITION_SPLIT:
-        write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
-        write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs,
-                        subsize);
-        write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col,
-                        subsize);
-        write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
-                        subsize);
-        break;
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-#error NC_MODE_INFO+MOTION_VAR not yet supported for new HORZ/VERT_AB partitions
-#endif
-      case PARTITION_HORZ_A:
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
-        break;
-      case PARTITION_HORZ_B:
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
-        break;
-      case PARTITION_VERT_A:
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
-        break;
-      case PARTITION_VERT_B:
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
-        write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
-        break;
-#endif  // CONFIG_EXT_PARTITION_TYPES
-      default: assert(0);
+static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
+                          aom_writer *w, const TOKENEXTRA **tok,
+                          const TOKENEXTRA *const tok_end, int mi_row,
+                          int mi_col) {
+  write_mbmi_b(cpi, tile, w, mi_row, mi_col);
+
+  AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) {
+    const uint8_t palette_size_plane =
+        mbmi->palette_mode_info.palette_size[plane];
+    assert(!mbmi->skip_mode || !palette_size_plane);
+    if (palette_size_plane > 0) {
+      assert(mbmi->use_intrabc == 0);
+      assert(av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type));
+      int rows, cols;
+      av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows,
+                               &cols);
+      assert(*tok < tok_end);
+      pack_map_tokens(w, tok, palette_size_plane, rows * cols);
     }
   }
-}
-#endif
 
-static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
-                          aom_writer *w, const TOKENEXTRA **tok,
-                          const TOKENEXTRA *const tok_end,
-#if CONFIG_SUPERTX
-                          int supertx_enabled,
-#endif
-                          int mi_row, int mi_col) {
-  write_mbmi_b(cpi, tile, w,
-#if CONFIG_SUPERTX
-               supertx_enabled,
-#endif
-               mi_row, mi_col);
+  BLOCK_SIZE bsize = mbmi->sb_type;
+  int is_inter_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
+  int skip = mbmi->skip;
+  int segment_id = mbmi->segment_id;
+  if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
+      !(is_inter_tx && skip) && !xd->lossless[segment_id]) {
+    if (is_inter_tx) {  // This implies skip flag is 0.
+      const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+      const int txbh = tx_size_high_unit[max_tx_size];
+      const int txbw = tx_size_wide_unit[max_tx_size];
+      const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+      const int height = block_size_high[bsize] >> tx_size_high_log2[0];
+      int idx, idy;
+      for (idy = 0; idy < height; idy += txbh)
+        for (idx = 0; idx < width; idx += txbw)
+          write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w);
+    } else {
+      write_selected_tx_size(xd, w);
+      set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, 0, xd);
+    }
+  } else {
+    set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h,
+                  skip && is_inter_block(mbmi), xd);
+  }
 
-#if CONFIG_MOTION_VAR && NC_MODE_INFO
-  (void)tok;
-  (void)tok_end;
-#else
-#if !CONFIG_PVQ && CONFIG_SUPERTX
-  if (!supertx_enabled)
-#endif
-    write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-#endif
+  write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
 }
 
 static void write_partition(const AV1_COMMON *const cm,
                             const MACROBLOCKD *const xd, int hbs, int mi_row,
                             int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize,
                             aom_writer *w) {
+  const int is_partition_point = bsize >= BLOCK_8X8;
+
+  if (!is_partition_point) return;
+
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
-  const int is_partition_point = bsize >= BLOCK_8X8;
-  const int ctx = is_partition_point
-                      ? partition_plane_context(xd, mi_row, mi_col,
-#if CONFIG_UNPOISON_PARTITION_CTX
-                                                has_rows, has_cols,
-#endif
-                                                bsize)
-                      : 0;
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  (void)cm;
 
-  if (!is_partition_point) return;
+  if (!has_rows && !has_cols) {
+    assert(p == PARTITION_SPLIT);
+    return;
+  }
 
   if (has_rows && has_cols) {
-#if CONFIG_EXT_PARTITION_TYPES
-    const int num_partition_types =
-        (mi_width_log2_lookup[bsize] > mi_width_log2_lookup[BLOCK_8X8])
-            ? EXT_PARTITION_TYPES
-            : PARTITION_TYPES;
-#else
-    const int num_partition_types = PARTITION_TYPES;
-#endif
-    aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], num_partition_types);
+    aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx],
+                     partition_cdf_length(bsize));
   } else if (!has_rows && has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
     assert(bsize > BLOCK_8X8);
     aom_cdf_prob cdf[2];
-    partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx]);
+    partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize);
     aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
-  } else if (has_rows && !has_cols) {
+  } else {
+    assert(has_rows && !has_cols);
     assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
     assert(bsize > BLOCK_8X8);
     aom_cdf_prob cdf[2];
-    partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx]);
+    partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize);
     aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
-  } else {
-    assert(p == PARTITION_SPLIT);
   }
 }
 
-#if CONFIG_SUPERTX
-#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,   \
-                               mi_row, mi_col, bsize)                         \
-  write_modes_sb(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col, \
-                 bsize)
-#else
-#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
-                               mi_row, mi_col, bsize)                       \
-  write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, bsize)
-#endif  // CONFIG_SUPERTX
-
 static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
                            aom_writer *const w, const TOKENEXTRA **tok,
-                           const TOKENEXTRA *const tok_end,
-#if CONFIG_SUPERTX
-                           int supertx_enabled,
-#endif
-                           int mi_row, int mi_col, BLOCK_SIZE bsize) {
+                           const TOKENEXTRA *const tok_end, int mi_row,
+                           int mi_col, BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   const int hbs = mi_size_wide[bsize] / 2;
-#if CONFIG_EXT_PARTITION_TYPES
   const int quarter_step = mi_size_wide[bsize] / 4;
   int i;
-#if CONFIG_EXT_PARTITION_TYPES_AB
-  const int qbs = mi_size_wide[bsize] / 4;
-#endif  // CONFIG_EXT_PARTITION_TYPES_AB
-#endif  // CONFIG_EXT_PARTITION_TYPES
   const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
-  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-
-#if CONFIG_SUPERTX
-  const int mi_offset = mi_row * cm->mi_stride + mi_col;
-  MB_MODE_INFO *mbmi;
-  const int pack_token = !supertx_enabled;
-  TX_SIZE supertx_size;
-#endif
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
-  write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
-#if CONFIG_SUPERTX
-  mbmi = &cm->mi_grid_visible[mi_offset]->mbmi;
-  xd->mi = cm->mi_grid_visible + mi_offset;
-  set_mi_row_col(xd, tile, mi_row, mi_size_high[bsize], mi_col,
-                 mi_size_wide[bsize],
-#if CONFIG_DEPENDENT_HORZTILES
-                 cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                 cm->mi_rows, cm->mi_cols);
-  if (!supertx_enabled && !frame_is_intra_only(cm) &&
-      partition != PARTITION_NONE && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
-      !xd->lossless[0]) {
-    aom_prob prob;
-    supertx_size = max_txsize_lookup[bsize];
-    prob = cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
-                               [supertx_size];
-    supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size);
-    aom_write(w, supertx_enabled, prob);
-  }
-#endif  // CONFIG_SUPERTX
-  if (subsize < BLOCK_8X8 && !unify_bsize) {
-    write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row,
-                          mi_col);
-  } else {
-    switch (partition) {
-      case PARTITION_NONE:
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col);
-        break;
-      case PARTITION_HORZ:
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col);
-        if (mi_row + hbs < cm->mi_rows)
-          write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                                mi_row + hbs, mi_col);
-        break;
-      case PARTITION_VERT:
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col);
-        if (mi_col + hbs < cm->mi_cols)
-          write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                                mi_row, mi_col + hbs);
-        break;
-      case PARTITION_SPLIT:
-        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                               mi_row, mi_col, subsize);
-        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                               mi_row, mi_col + hbs, subsize);
-        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                               mi_row + hbs, mi_col, subsize);
-        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                               mi_row + hbs, mi_col + hbs, subsize);
-        break;
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-      case PARTITION_HORZ_A:
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row + qbs, mi_col);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row + hbs, mi_col);
-        break;
-      case PARTITION_HORZ_B:
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row + hbs, mi_col);
-        if (mi_row + 3 * qbs < cm->mi_rows)
-          write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                                mi_row + 3 * qbs, mi_col);
-        break;
-      case PARTITION_VERT_A:
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col + qbs);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col + hbs);
-        break;
-      case PARTITION_VERT_B:
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col + hbs);
-        if (mi_col + 3 * qbs < cm->mi_cols)
-          write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                                mi_row, mi_col + 3 * qbs);
-        break;
-#else
-      case PARTITION_HORZ_A:
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col + hbs);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row + hbs, mi_col);
-        break;
-      case PARTITION_HORZ_B:
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row + hbs, mi_col);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row + hbs, mi_col + hbs);
-        break;
-      case PARTITION_VERT_A:
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row + hbs, mi_col);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col + hbs);
-        break;
-      case PARTITION_VERT_B:
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row, mi_col + hbs);
-        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                              mi_row + hbs, mi_col + hbs);
-        break;
-#endif
-      case PARTITION_HORZ_4:
-        for (i = 0; i < 4; ++i) {
-          int this_mi_row = mi_row + i * quarter_step;
-          if (i > 0 && this_mi_row >= cm->mi_rows) break;
-
-          write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                                this_mi_row, mi_col);
-        }
-        break;
-      case PARTITION_VERT_4:
-        for (i = 0; i < 4; ++i) {
-          int this_mi_col = mi_col + i * quarter_step;
-          if (i > 0 && this_mi_col >= cm->mi_cols) break;
-
-          write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                                mi_row, this_mi_col);
+  const int num_planes = av1_num_planes(cm);
+  for (int plane = 0; plane < num_planes; ++plane) {
+    int rcol0, rcol1, rrow0, rrow1, tile_tl_idx;
+    if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+                                           &rcol0, &rcol1, &rrow0, &rrow1,
+                                           &tile_tl_idx)) {
+      const int rstride = cm->rst_info[plane].horz_units_per_tile;
+      for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+        for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+          const int runit_idx = tile_tl_idx + rcol + rrow * rstride;
+          const RestorationUnitInfo *rui =
+              &cm->rst_info[plane].unit_info[runit_idx];
+          loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane,
+                                           cpi->td.counts);
         }
-        break;
-#endif  // CONFIG_EXT_PARTITION_TYPES
-      default: assert(0);
+      }
     }
   }
-#if CONFIG_SUPERTX
-  if (partition != PARTITION_NONE && supertx_enabled && pack_token) {
-    int skip;
-    const int bsw = mi_size_wide[bsize];
-    const int bsh = mi_size_high[bsize];
-
-    xd->mi = cm->mi_grid_visible + mi_offset;
-    supertx_size = mbmi->tx_size;
-    set_mi_row_col(xd, tile, mi_row, bsh, mi_col, bsw,
-#if CONFIG_DEPENDENT_HORZTILES
-                   cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                   cm->mi_rows, cm->mi_cols);
 
-    assert(IMPLIES(!cm->seg.enabled, mbmi->segment_id_supertx == 0));
-    assert(mbmi->segment_id_supertx < MAX_SEGMENTS);
-
-    skip = write_skip(cm, xd, mbmi->segment_id_supertx, xd->mi[0], w);
-
-    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
+  switch (partition) {
+    case PARTITION_NONE:
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      break;
+    case PARTITION_HORZ:
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      if (mi_row + hbs < cm->mi_rows)
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      break;
+    case PARTITION_VERT:
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      if (mi_col + hbs < cm->mi_cols)
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      break;
+    case PARTITION_SPLIT:
+      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize);
+      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize);
+      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
+                     subsize);
+      break;
+    case PARTITION_HORZ_A:
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      break;
+    case PARTITION_HORZ_B:
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+      break;
+    case PARTITION_VERT_A:
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      break;
+    case PARTITION_VERT_B:
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+      break;
+    case PARTITION_HORZ_4:
+      for (i = 0; i < 4; ++i) {
+        int this_mi_row = mi_row + i * quarter_step;
+        if (i > 0 && this_mi_row >= cm->mi_rows) break;
 
-#if CONFIG_EXT_TX
-    if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) > 1 &&
-        !skip) {
-      const int eset =
-          get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used);
-      const int tx_set_type =
-          get_ext_tx_set_type(supertx_size, bsize, 1, cm->reduced_tx_set_used);
-      if (eset > 0) {
-        aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][mbmi->tx_type],
-                         ec_ctx->inter_ext_tx_cdf[eset][supertx_size],
-                         av1_num_ext_tx_set[tx_set_type]);
+        write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col);
       }
-    }
-#else
-    if (supertx_size < TX_32X32 && !skip) {
-      aom_write_symbol(w, mbmi->tx_type, ec_ctx->inter_ext_tx_cdf[supertx_size],
-                       TX_TYPES);
-    }
-#endif  // CONFIG_EXT_TX
+      break;
+    case PARTITION_VERT_4:
+      for (i = 0; i < 4; ++i) {
+        int this_mi_col = mi_col + i * quarter_step;
+        if (i > 0 && this_mi_col >= cm->mi_cols) break;
 
-    if (!skip) {
-      assert(*tok < tok_end);
-      for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-        TX_TYPE tx_type = av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y,
-                                          xd, blk_row, blk_col, block, tx_size);
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-        const struct macroblockd_plane *const pd = &xd->plane[plane];
-        const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size];
-        const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi_txb_size, pd);
-
-        const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-        const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-
-        int row, col;
-        const TX_SIZE tx = av1_get_tx_size(plane, xd);
-        BLOCK_SIZE txb_size = txsize_to_bsize[tx];
-
-        const int stepr = tx_size_high_unit[txb_size];
-        const int stepc = tx_size_wide_unit[txb_size];
-
-        TOKEN_STATS token_stats;
-        token_stats.cost = 0;
-        for (row = 0; row < max_blocks_high; row += stepr)
-          for (col = 0; col < max_blocks_wide; col += stepc)
-            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx,
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                           tx_type, is_inter_block(mbmi),
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                           &token_stats);
-        assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
-        (*tok)++;
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col);
       }
-    }
-#if CONFIG_VAR_TX
-    xd->above_txfm_context = cm->above_txfm_context + mi_col;
-    xd->left_txfm_context =
-        xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
-    set_txfm_ctxs(xd->mi[0]->mbmi.tx_size, bsw, bsh, skip, xd);
-#endif
+      break;
+    default: assert(0);
   }
-#endif  // CONFIG_SUPERTX
 
-// update partition context
-#if CONFIG_EXT_PARTITION_TYPES
+  // update partition context
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
-#else
-  if (bsize >= BLOCK_8X8 &&
-      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
-    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
-#endif  // CONFIG_EXT_PARTITION_TYPES
-
-#if CONFIG_LPF_SB
-  // send filter level for each superblock (64x64)
-  if (bsize == cm->sb_size) {
-    if (mi_row == 0 && mi_col == 0) {
-      aom_write_literal(w, cm->mi_grid_visible[0]->mbmi.filt_lvl, 6);
-      cm->mi_grid_visible[0]->mbmi.reuse_sb_lvl = 0;
-      cm->mi_grid_visible[0]->mbmi.delta = 0;
-      cm->mi_grid_visible[0]->mbmi.sign = 0;
-    } else {
-      int prev_mi_row, prev_mi_col;
-      if (mi_col - MAX_MIB_SIZE < 0) {
-        prev_mi_row = mi_row - MAX_MIB_SIZE;
-        prev_mi_col = mi_col;
-      } else {
-        prev_mi_row = mi_row;
-        prev_mi_col = mi_col - MAX_MIB_SIZE;
-      }
-      MB_MODE_INFO *curr_mbmi =
-          &cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi;
-      MB_MODE_INFO *prev_mbmi =
-          &cm->mi_grid_visible[prev_mi_row * cm->mi_stride + prev_mi_col]->mbmi;
-
-      const uint8_t curr_lvl = curr_mbmi->filt_lvl;
-      const uint8_t prev_lvl = prev_mbmi->filt_lvl;
-
-      const int reuse_prev_lvl = curr_lvl == prev_lvl;
-      const int reuse_ctx = prev_mbmi->reuse_sb_lvl;
-      curr_mbmi->reuse_sb_lvl = reuse_prev_lvl;
-      aom_write_symbol(w, reuse_prev_lvl,
-                       xd->tile_ctx->lpf_reuse_cdf[reuse_ctx], 2);
-
-      if (reuse_prev_lvl) {
-        curr_mbmi->delta = 0;
-        curr_mbmi->sign = 0;
-      } else {
-        const unsigned int delta = abs(curr_lvl - prev_lvl) / LPF_STEP;
-        const int delta_ctx = prev_mbmi->delta;
-        curr_mbmi->delta = delta;
-        aom_write_symbol(w, delta, xd->tile_ctx->lpf_delta_cdf[delta_ctx],
-                         DELTA_RANGE);
-
-        if (delta) {
-          const int sign = curr_lvl > prev_lvl;
-          const int sign_ctx = prev_mbmi->sign;
-          curr_mbmi->sign = sign;
-          aom_write_symbol(w, sign,
-                           xd->tile_ctx->lpf_sign_cdf[reuse_ctx][sign_ctx], 2);
-        } else {
-          curr_mbmi->sign = 0;
-        }
-      }
-    }
-  }
-#endif
-
-#if CONFIG_CDEF
-  if (bsize == cm->sb_size && cm->cdef_bits != 0 && !cm->all_lossless) {
-    int width_step = mi_size_wide[BLOCK_64X64];
-    int height_step = mi_size_high[BLOCK_64X64];
-    int width, height;
-    for (height = 0; (height < mi_size_high[cm->sb_size]) &&
-                     (mi_row + height < cm->mi_rows);
-         height += height_step) {
-      for (width = 0; (width < mi_size_wide[cm->sb_size]) &&
-                      (mi_col + width < cm->mi_cols);
-           width += width_step) {
-        if (!sb_all_skip(cm, mi_row + height, mi_col + width))
-          aom_write_literal(
-              w,
-              cm->mi_grid_visible[(mi_row + height) * cm->mi_stride +
-                                  (mi_col + width)]
-                  ->mbmi.cdef_strength,
-              cm->cdef_bits);
-      }
-    }
-  }
-#endif
-#if CONFIG_LOOP_RESTORATION
-  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    int rcol0, rcol1, rrow0, rrow1, nhtiles;
-    if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
-                                           &rcol0, &rcol1, &rrow0, &rrow1,
-                                           &nhtiles)) {
-      for (int rrow = rrow0; rrow < rrow1; ++rrow) {
-        for (int rcol = rcol0; rcol < rcol1; ++rcol) {
-          int rtile_idx = rcol + rrow * nhtiles;
-          loop_restoration_write_sb_coeffs(cm, xd, w, plane, rtile_idx);
-        }
-      }
-    }
-  }
-#endif
 }
 
 static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
@@ -3272,78 +1705,46 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
   const int mi_col_end = tile->mi_col_end;
   int mi_row, mi_col;
 
-#if CONFIG_DEPENDENT_HORZTILES
-  if (!cm->dependent_horz_tiles || mi_row_start == 0 ||
-      tile->tg_horz_boundary) {
-    av1_zero_above_context(cm, mi_col_start, mi_col_end);
-  }
-#else
-  av1_zero_above_context(cm, mi_col_start, mi_col_end);
-#endif
-#if CONFIG_PVQ
-  assert(cpi->td.mb.pvq_q->curr_pos == 0);
-#endif
+  av1_zero_above_context(cm, mi_col_start, mi_col_end, tile->tile_row);
+  av1_init_above_context(cm, xd, tile->tile_row);
+
   if (cpi->common.delta_q_present_flag) {
-    xd->prev_qindex = cpi->common.base_qindex;
-#if CONFIG_EXT_DELTA_Q
+    xd->current_qindex = cpi->common.base_qindex;
     if (cpi->common.delta_lf_present_flag) {
-#if CONFIG_LOOPFILTER_LEVEL
-      for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id)
-        xd->prev_delta_lf[lf_id] = 0;
-#endif  // CONFIG_LOOPFILTER_LEVEL
-      xd->prev_delta_lf_from_base = 0;
+      av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
     }
-#endif  // CONFIG_EXT_DELTA_Q
   }
 
-  for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->mib_size) {
+  for (mi_row = mi_row_start; mi_row < mi_row_end;
+       mi_row += cm->seq_params.mib_size) {
     av1_zero_left_context(xd);
 
-    for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->mib_size) {
-      write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, 0, mi_row, mi_col,
-                             cm->sb_size);
-#if CONFIG_MOTION_VAR && NC_MODE_INFO
-      write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, cm->sb_size);
-#endif
+    for (mi_col = mi_col_start; mi_col < mi_col_end;
+         mi_col += cm->seq_params.mib_size) {
+      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
+                     cm->seq_params.sb_size);
     }
   }
-#if CONFIG_PVQ
-  // Check that the number of PVQ blocks encoded and written to the bitstream
-  // are the same
-  assert(cpi->td.mb.pvq_q->curr_pos == cpi->td.mb.pvq_q->last_pos);
-  // Reset curr_pos in case we repack the bitstream
-  cpi->td.mb.pvq_q->curr_pos = 0;
-#endif
 }
 
-#if CONFIG_LOOP_RESTORATION
 static void encode_restoration_mode(AV1_COMMON *cm,
                                     struct aom_write_bit_buffer *wb) {
-  int p;
-  RestorationInfo *rsi = &cm->rst_info[0];
-  switch (rsi->frame_restoration_type) {
-    case RESTORE_NONE:
-      aom_wb_write_bit(wb, 0);
-      aom_wb_write_bit(wb, 0);
-      break;
-    case RESTORE_WIENER:
-      aom_wb_write_bit(wb, 1);
-      aom_wb_write_bit(wb, 0);
-      break;
-    case RESTORE_SGRPROJ:
-      aom_wb_write_bit(wb, 1);
-      aom_wb_write_bit(wb, 1);
-      break;
-    case RESTORE_SWITCHABLE:
-      aom_wb_write_bit(wb, 0);
-      aom_wb_write_bit(wb, 1);
-      break;
-    default: assert(0);
-  }
-  for (p = 1; p < MAX_MB_PLANE; ++p) {
-    rsi = &cm->rst_info[p];
+  assert(!cm->all_lossless);
+  if (!cm->seq_params.enable_restoration) return;
+  if (cm->allow_intrabc) return;
+  const int num_planes = av1_num_planes(cm);
+  int all_none = 1, chroma_none = 1;
+  for (int p = 0; p < num_planes; ++p) {
+    RestorationInfo *rsi = &cm->rst_info[p];
+    if (rsi->frame_restoration_type != RESTORE_NONE) {
+      all_none = 0;
+      chroma_none &= p == 0;
+    }
     switch (rsi->frame_restoration_type) {
-      case RESTORE_NONE: aom_wb_write_bit(wb, 0); break;
+      case RESTORE_NONE:
+        aom_wb_write_bit(wb, 0);
+        aom_wb_write_bit(wb, 0);
+        break;
       case RESTORE_WIENER:
         aom_wb_write_bit(wb, 1);
         aom_wb_write_bit(wb, 0);
@@ -3352,40 +1753,52 @@ static void encode_restoration_mode(AV1_COMMON *cm,
         aom_wb_write_bit(wb, 1);
         aom_wb_write_bit(wb, 1);
         break;
+      case RESTORE_SWITCHABLE:
+        aom_wb_write_bit(wb, 0);
+        aom_wb_write_bit(wb, 1);
+        break;
       default: assert(0);
     }
   }
-  if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
-      cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
-      cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
-    rsi = &cm->rst_info[0];
-    aom_wb_write_bit(wb, rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX);
-    if (rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX) {
-      aom_wb_write_bit(
-          wb, rsi->restoration_tilesize != (RESTORATION_TILESIZE_MAX >> 1));
+  if (!all_none) {
+    assert(cm->seq_params.sb_size == BLOCK_64X64 ||
+           cm->seq_params.sb_size == BLOCK_128X128);
+    const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+
+    RestorationInfo *rsi = &cm->rst_info[0];
+
+    assert(rsi->restoration_unit_size >= sb_size);
+    assert(RESTORATION_UNITSIZE_MAX == 256);
+
+    if (sb_size == 64) {
+      aom_wb_write_bit(wb, rsi->restoration_unit_size > 64);
+    }
+    if (rsi->restoration_unit_size > 64) {
+      aom_wb_write_bit(wb, rsi->restoration_unit_size > 128);
     }
   }
-  int s = AOMMIN(cm->subsampling_x, cm->subsampling_y);
-  if (s && (cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
-            cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) {
-    aom_wb_write_bit(wb,
-                     cm->rst_info[1].restoration_tilesize !=
-                         cm->rst_info[0].restoration_tilesize);
-    assert(cm->rst_info[1].restoration_tilesize ==
-               cm->rst_info[0].restoration_tilesize ||
-           cm->rst_info[1].restoration_tilesize ==
-               (cm->rst_info[0].restoration_tilesize >> s));
-    assert(cm->rst_info[2].restoration_tilesize ==
-           cm->rst_info[1].restoration_tilesize);
-  } else if (!s) {
-    assert(cm->rst_info[1].restoration_tilesize ==
-           cm->rst_info[0].restoration_tilesize);
-    assert(cm->rst_info[2].restoration_tilesize ==
-           cm->rst_info[1].restoration_tilesize);
+
+  if (num_planes > 1) {
+    int s = AOMMIN(cm->subsampling_x, cm->subsampling_y);
+    if (s && !chroma_none) {
+      aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size !=
+                               cm->rst_info[0].restoration_unit_size);
+      assert(cm->rst_info[1].restoration_unit_size ==
+                 cm->rst_info[0].restoration_unit_size ||
+             cm->rst_info[1].restoration_unit_size ==
+                 (cm->rst_info[0].restoration_unit_size >> s));
+      assert(cm->rst_info[2].restoration_unit_size ==
+             cm->rst_info[1].restoration_unit_size);
+    } else if (!s) {
+      assert(cm->rst_info[1].restoration_unit_size ==
+             cm->rst_info[0].restoration_unit_size);
+      assert(cm->rst_info[2].restoration_unit_size ==
+             cm->rst_info[1].restoration_unit_size);
+    }
   }
 }
 
-static void write_wiener_filter(int wiener_win, WienerInfo *wiener_info,
+static void write_wiener_filter(int wiener_win, const WienerInfo *wiener_info,
                                 WienerInfo *ref_wiener_info, aom_writer *wb) {
   if (wiener_win == WIENER_WIN)
     aom_write_primitive_refsubexpfin(
@@ -3428,78 +1841,106 @@ static void write_wiener_filter(int wiener_win, WienerInfo *wiener_info,
   memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
 }
 
-static void write_sgrproj_filter(SgrprojInfo *sgrproj_info,
+static void write_sgrproj_filter(const SgrprojInfo *sgrproj_info,
                                  SgrprojInfo *ref_sgrproj_info,
                                  aom_writer *wb) {
   aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS);
-  aom_write_primitive_refsubexpfin(wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1,
-                                   SGRPROJ_PRJ_SUBEXP_K,
-                                   ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
-                                   sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
-  aom_write_primitive_refsubexpfin(wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1,
-                                   SGRPROJ_PRJ_SUBEXP_K,
-                                   ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
-                                   sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+  const sgr_params_type *params = &sgr_params[sgrproj_info->ep];
+
+  if (params->r[0] == 0) {
+    assert(sgrproj_info->xqd[0] == 0);
+    aom_write_primitive_refsubexpfin(
+        wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+        sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+  } else if (params->r[1] == 0) {
+    aom_write_primitive_refsubexpfin(
+        wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+        sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+  } else {
+    aom_write_primitive_refsubexpfin(
+        wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+        sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+    aom_write_primitive_refsubexpfin(
+        wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+        sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+  }
+
   memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
 }
 
 static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
                                              MACROBLOCKD *xd,
+                                             const RestorationUnitInfo *rui,
                                              aom_writer *const w, int plane,
-                                             int rtile_idx) {
+                                             FRAME_COUNTS *counts) {
   const RestorationInfo *rsi = cm->rst_info + plane;
-  if (rsi->frame_restoration_type == RESTORE_NONE) return;
+  RestorationType frame_rtype = rsi->frame_restoration_type;
+  if (frame_rtype == RESTORE_NONE) return;
+
+  (void)counts;
+  assert(!cm->all_lossless);
 
   const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
   WienerInfo *wiener_info = xd->wiener_info + plane;
   SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane;
+  RestorationType unit_rtype = rui->restoration_type;
 
-  if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
-    assert(plane == 0);
-    av1_write_token(
-        w, av1_switchable_restore_tree, cm->fc->switchable_restore_prob,
-        &switchable_restore_encodings[rsi->restoration_type[rtile_idx]]);
-    if (rsi->restoration_type[rtile_idx] == RESTORE_WIENER) {
-      write_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info,
-                          w);
-    } else if (rsi->restoration_type[rtile_idx] == RESTORE_SGRPROJ) {
-      write_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, w);
+  if (frame_rtype == RESTORE_SWITCHABLE) {
+    aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf,
+                     RESTORE_SWITCHABLE_TYPES);
+#if CONFIG_ENTROPY_STATS
+    ++counts->switchable_restore[unit_rtype];
+#endif
+    switch (unit_rtype) {
+      case RESTORE_WIENER:
+        write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w);
+        break;
+      case RESTORE_SGRPROJ:
+        write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w);
+        break;
+      default: assert(unit_rtype == RESTORE_NONE); break;
     }
-  } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
-    aom_write(w, rsi->restoration_type[rtile_idx] != RESTORE_NONE,
-              RESTORE_NONE_WIENER_PROB);
-    if (rsi->restoration_type[rtile_idx] != RESTORE_NONE) {
-      write_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info,
-                          w);
+  } else if (frame_rtype == RESTORE_WIENER) {
+    aom_write_symbol(w, unit_rtype != RESTORE_NONE,
+                     xd->tile_ctx->wiener_restore_cdf, 2);
+#if CONFIG_ENTROPY_STATS
+    ++counts->wiener_restore[unit_rtype != RESTORE_NONE];
+#endif
+    if (unit_rtype != RESTORE_NONE) {
+      write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w);
     }
-  } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
-    aom_write(w, rsi->restoration_type[rtile_idx] != RESTORE_NONE,
-              RESTORE_NONE_SGRPROJ_PROB);
-    if (rsi->restoration_type[rtile_idx] != RESTORE_NONE) {
-      write_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, w);
+  } else if (frame_rtype == RESTORE_SGRPROJ) {
+    aom_write_symbol(w, unit_rtype != RESTORE_NONE,
+                     xd->tile_ctx->sgrproj_restore_cdf, 2);
+#if CONFIG_ENTROPY_STATS
+    ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE];
+#endif
+    if (unit_rtype != RESTORE_NONE) {
+      write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w);
     }
   }
 }
 
-#endif  // CONFIG_LOOP_RESTORATION
-
 static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+  assert(!cm->coded_lossless);
+  if (cm->allow_intrabc) return;
+  const int num_planes = av1_num_planes(cm);
   int i;
   struct loopfilter *lf = &cm->lf;
 
-// Encode the loop filter level and type
-#if !CONFIG_LPF_SB
-#if CONFIG_LOOPFILTER_LEVEL
+  // Encode the loop filter level and type
   aom_wb_write_literal(wb, lf->filter_level[0], 6);
   aom_wb_write_literal(wb, lf->filter_level[1], 6);
-  if (lf->filter_level[0] || lf->filter_level[1]) {
-    aom_wb_write_literal(wb, lf->filter_level_u, 6);
-    aom_wb_write_literal(wb, lf->filter_level_v, 6);
-  }
-#else
-  aom_wb_write_literal(wb, lf->filter_level, 6);
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif  // CONFIG_LPF_SB
+  if (num_planes > 1) {
+    if (lf->filter_level[0] || lf->filter_level[1]) {
+      aom_wb_write_literal(wb, lf->filter_level_u, 6);
+      aom_wb_write_literal(wb, lf->filter_level_v, 6);
+    }
+  }
   aom_wb_write_literal(wb, lf->sharpness_level, 3);
 
   // Write out loop filter deltas applied at the MB level based on mode or
@@ -3508,48 +1949,58 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
 
   if (lf->mode_ref_delta_enabled) {
     aom_wb_write_bit(wb, lf->mode_ref_delta_update);
+
     if (lf->mode_ref_delta_update) {
-      for (i = 0; i < TOTAL_REFS_PER_FRAME; i++) {
+      const int prime_idx = cm->primary_ref_frame;
+      const int buf_idx =
+          prime_idx == PRIMARY_REF_NONE ? -1 : cm->frame_refs[prime_idx].idx;
+      int8_t last_ref_deltas[REF_FRAMES];
+      if (prime_idx == PRIMARY_REF_NONE || buf_idx < 0) {
+        av1_set_default_ref_deltas(last_ref_deltas);
+      } else {
+        memcpy(last_ref_deltas, cm->buffer_pool->frame_bufs[buf_idx].ref_deltas,
+               REF_FRAMES);
+      }
+      for (i = 0; i < REF_FRAMES; i++) {
         const int delta = lf->ref_deltas[i];
-        const int changed = delta != lf->last_ref_deltas[i];
+        const int changed = delta != last_ref_deltas[i];
         aom_wb_write_bit(wb, changed);
-        if (changed) {
-          lf->last_ref_deltas[i] = delta;
-          aom_wb_write_inv_signed_literal(wb, delta, 6);
-        }
+        if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
       }
 
+      int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
+      if (prime_idx == PRIMARY_REF_NONE || buf_idx < 0) {
+        av1_set_default_mode_deltas(last_mode_deltas);
+      } else {
+        memcpy(last_mode_deltas,
+               cm->buffer_pool->frame_bufs[buf_idx].mode_deltas,
+               MAX_MODE_LF_DELTAS);
+      }
       for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
         const int delta = lf->mode_deltas[i];
-        const int changed = delta != lf->last_mode_deltas[i];
+        const int changed = delta != last_mode_deltas[i];
         aom_wb_write_bit(wb, changed);
-        if (changed) {
-          lf->last_mode_deltas[i] = delta;
-          aom_wb_write_inv_signed_literal(wb, delta, 6);
-        }
+        if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
       }
     }
   }
 }
 
-#if CONFIG_CDEF
 static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+  assert(!cm->coded_lossless);
+  if (!cm->seq_params.enable_cdef) return;
+  if (cm->allow_intrabc) return;
+  const int num_planes = av1_num_planes(cm);
   int i;
-#if CONFIG_CDEF_SINGLEPASS
   aom_wb_write_literal(wb, cm->cdef_pri_damping - 3, 2);
   assert(cm->cdef_pri_damping == cm->cdef_sec_damping);
-#else
-  aom_wb_write_literal(wb, cm->cdef_pri_damping - 5, 1);
-  aom_wb_write_literal(wb, cm->cdef_sec_damping - 3, 2);
-#endif
   aom_wb_write_literal(wb, cm->cdef_bits, 2);
   for (i = 0; i < cm->nb_cdef_strengths; i++) {
     aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS);
-    if (cm->subsampling_x == cm->subsampling_y)
+    if (num_planes > 1)
       aom_wb_write_literal(wb, cm->cdef_uv_strengths[i], CDEF_STRENGTH_BITS);
   }
 }
-#endif
 
 static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) {
   if (delta_q != 0) {
@@ -3562,63 +2013,71 @@ static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) {
 
 static void encode_quantization(const AV1_COMMON *const cm,
                                 struct aom_write_bit_buffer *wb) {
+  const int num_planes = av1_num_planes(cm);
+
   aom_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
   write_delta_q(wb, cm->y_dc_delta_q);
-  write_delta_q(wb, cm->uv_dc_delta_q);
-  write_delta_q(wb, cm->uv_ac_delta_q);
-#if CONFIG_AOM_QM
+  if (num_planes > 1) {
+    int diff_uv_delta = (cm->u_dc_delta_q != cm->v_dc_delta_q) ||
+                        (cm->u_ac_delta_q != cm->v_ac_delta_q);
+    if (cm->separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta);
+    write_delta_q(wb, cm->u_dc_delta_q);
+    write_delta_q(wb, cm->u_ac_delta_q);
+    if (diff_uv_delta) {
+      write_delta_q(wb, cm->v_dc_delta_q);
+      write_delta_q(wb, cm->v_ac_delta_q);
+    }
+  }
   aom_wb_write_bit(wb, cm->using_qmatrix);
   if (cm->using_qmatrix) {
-    aom_wb_write_literal(wb, cm->min_qmlevel, QM_LEVEL_BITS);
-    aom_wb_write_literal(wb, cm->max_qmlevel, QM_LEVEL_BITS);
+    aom_wb_write_literal(wb, cm->qm_y, QM_LEVEL_BITS);
+    aom_wb_write_literal(wb, cm->qm_u, QM_LEVEL_BITS);
+    if (!cm->separate_uv_delta_q)
+      assert(cm->qm_u == cm->qm_v);
+    else
+      aom_wb_write_literal(wb, cm->qm_v, QM_LEVEL_BITS);
   }
-#endif
 }
 
 static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
                                 struct aom_write_bit_buffer *wb) {
   int i, j;
-  const struct segmentation *seg = &cm->seg;
+  struct segmentation *seg = &cm->seg;
 
   aom_wb_write_bit(wb, seg->enabled);
   if (!seg->enabled) return;
 
-  // Segmentation map
-  if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
-    aom_wb_write_bit(wb, seg->update_map);
-  } else {
+  // Write update flags
+  if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
     assert(seg->update_map == 1);
-  }
-  if (seg->update_map) {
-    // Select the coding strategy (temporal or spatial)
-    av1_choose_segmap_coding_method(cm, xd);
-
-    // Write out the chosen coding method.
-    if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
+    seg->temporal_update = 0;
+    assert(seg->update_data == 1);
+  } else {
+    aom_wb_write_bit(wb, seg->update_map);
+    if (seg->update_map) {
+      // Select the coding strategy (temporal or spatial)
+      av1_choose_segmap_coding_method(cm, xd);
       aom_wb_write_bit(wb, seg->temporal_update);
-    } else {
-      assert(seg->temporal_update == 0);
     }
+    aom_wb_write_bit(wb, seg->update_data);
   }
 
   // Segmentation data
-  aom_wb_write_bit(wb, seg->update_data);
   if (seg->update_data) {
-    aom_wb_write_bit(wb, seg->abs_delta);
-
     for (i = 0; i < MAX_SEGMENTS; i++) {
       for (j = 0; j < SEG_LVL_MAX; j++) {
         const int active = segfeature_active(seg, i, j);
         aom_wb_write_bit(wb, active);
         if (active) {
-          const int data = get_segdata(seg, i, j);
           const int data_max = av1_seg_feature_data_max(j);
+          const int data_min = -data_max;
+          const int ubits = get_unsigned_bits(data_max);
+          const int data = clamp(get_segdata(seg, i, j), data_min, data_max);
 
           if (av1_is_segfeature_signed(j)) {
-            encode_unsigned_max(wb, abs(data), data_max);
-            aom_wb_write_bit(wb, data < 0);
+            aom_wb_write_inv_signed_literal(wb, data, ubits);
           } else {
-            encode_unsigned_max(wb, data, data_max);
+            aom_wb_write_literal(wb, data, ubits);
           }
         }
       }
@@ -3628,26 +2087,11 @@ static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
 
 static void write_tx_mode(AV1_COMMON *cm, TX_MODE *mode,
                           struct aom_write_bit_buffer *wb) {
-  if (cm->all_lossless) {
+  if (cm->coded_lossless) {
     *mode = ONLY_4X4;
     return;
   }
-#if CONFIG_VAR_TX_NO_TX_MODE
-  (void)wb;
-  *mode = TX_MODE_SELECT;
-  return;
-#else
-#if CONFIG_TX64X64
-  aom_wb_write_bit(wb, *mode == TX_MODE_SELECT);
-  if (*mode != TX_MODE_SELECT) {
-    aom_wb_write_literal(wb, AOMMIN(*mode, ALLOW_32X32), 2);
-    if (*mode >= ALLOW_32X32) aom_wb_write_bit(wb, *mode == ALLOW_64X64);
-  }
-#else
   aom_wb_write_bit(wb, *mode == TX_MODE_SELECT);
-  if (*mode != TX_MODE_SELECT) aom_wb_write_literal(wb, *mode, 2);
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_VAR_TX_NO_TX_MODE
 }
 
 static void write_frame_interp_filter(InterpFilter filter,
@@ -3672,14 +2116,7 @@ static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) {
       // Only one filter is used. So set the filter at frame level
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
         if (count[i]) {
-#if CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION)
-#if CONFIG_WARPED_MOTION
-          if (i == EIGHTTAP_REGULAR || WARP_WM_NEIGHBORS_WITH_OBMC)
-#else
-          if (i == EIGHTTAP_REGULAR || WARP_GM_NEIGHBORS_WITH_OBMC)
-#endif  // CONFIG_WARPED_MOTION
-#endif  // CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION)
-            cm->interp_filter = i;
+          if (i == EIGHTTAP_REGULAR) cm->interp_filter = i;
           break;
         }
       }
@@ -3687,8 +2124,6 @@ static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) {
   }
 }
 
-#if CONFIG_MAX_TILE
-
 // Same function as write_uniform but writing to uncompresses header wb
 static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) {
   const int l = get_unsigned_bits(n);
@@ -3704,10 +2139,10 @@ static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) {
 
 static void write_tile_info_max_tile(const AV1_COMMON *const cm,
                                      struct aom_write_bit_buffer *wb) {
-  int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
-  int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
-  int width_sb = width_mi >> MAX_MIB_SIZE_LOG2;
-  int height_sb = height_mi >> MAX_MIB_SIZE_LOG2;
+  int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
+  int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+  int width_sb = width_mi >> cm->seq_params.mib_size_log2;
+  int height_sb = height_mi >> cm->seq_params.mib_size_log2;
   int size_sb, i;
 
   aom_wb_write_bit(wb, cm->uniform_tile_spacing_flag);
@@ -3736,7 +2171,8 @@ static void write_tile_info_max_tile(const AV1_COMMON *const cm,
     // columns
     for (i = 0; i < cm->tile_cols; i++) {
       size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
-      wb_write_uniform(wb, AOMMIN(width_sb, MAX_TILE_WIDTH_SB), size_sb - 1);
+      wb_write_uniform(wb, AOMMIN(width_sb, cm->max_tile_width_sb),
+                       size_sb - 1);
       width_sb -= size_sb;
     }
     assert(width_sb == 0);
@@ -3751,72 +2187,45 @@ static void write_tile_info_max_tile(const AV1_COMMON *const cm,
     assert(height_sb == 0);
   }
 }
-#endif
 
 static void write_tile_info(const AV1_COMMON *const cm,
+                            struct aom_write_bit_buffer *saved_wb,
                             struct aom_write_bit_buffer *wb) {
-#if CONFIG_EXT_TILE
-  if (cm->large_scale_tile) {
-    const int tile_width =
-        ALIGN_POWER_OF_TWO(cm->tile_width, cm->mib_size_log2) >>
-        cm->mib_size_log2;
-    const int tile_height =
-        ALIGN_POWER_OF_TWO(cm->tile_height, cm->mib_size_log2) >>
-        cm->mib_size_log2;
-
-    assert(tile_width > 0);
-    assert(tile_height > 0);
-
-// Write the tile sizes
-#if CONFIG_EXT_PARTITION
-    if (cm->sb_size == BLOCK_128X128) {
-      assert(tile_width <= 32);
-      assert(tile_height <= 32);
-      aom_wb_write_literal(wb, tile_width - 1, 5);
-      aom_wb_write_literal(wb, tile_height - 1, 5);
-    } else {
-#endif  // CONFIG_EXT_PARTITION
-      assert(tile_width <= 64);
-      assert(tile_height <= 64);
-      aom_wb_write_literal(wb, tile_width - 1, 6);
-      aom_wb_write_literal(wb, tile_height - 1, 6);
-#if CONFIG_EXT_PARTITION
-    }
-#endif  // CONFIG_EXT_PARTITION
-  } else {
-#endif  // CONFIG_EXT_TILE
-
-#if CONFIG_MAX_TILE
-    write_tile_info_max_tile(cm, wb);
-#else
-  int min_log2_tile_cols, max_log2_tile_cols, ones;
-  av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+  write_tile_info_max_tile(cm, wb);
 
-  // columns
-  ones = cm->log2_tile_cols - min_log2_tile_cols;
-  while (ones--) aom_wb_write_bit(wb, 1);
+  *saved_wb = *wb;
+  if (cm->tile_rows * cm->tile_cols > 1) {
+    // tile id used for cdf update
+    aom_wb_write_literal(wb, 0, cm->log2_tile_cols + cm->log2_tile_rows);
+    // Number of bytes in tile size - 1
+    aom_wb_write_literal(wb, 3, 2);
+  }
+}
 
-  if (cm->log2_tile_cols < max_log2_tile_cols) aom_wb_write_bit(wb, 0);
+static void write_ext_tile_info(const AV1_COMMON *const cm,
+                                struct aom_write_bit_buffer *saved_wb,
+                                struct aom_write_bit_buffer *wb) {
+  // This information is stored as a separate byte.
+  int mod = wb->bit_offset % CHAR_BIT;
+  if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod);
+  assert(aom_wb_is_byte_aligned(wb));
 
-  // rows
-  aom_wb_write_bit(wb, cm->log2_tile_rows != 0);
-  if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->log2_tile_rows != 1);
-#endif
-#if CONFIG_DEPENDENT_HORZTILES
-    if (cm->tile_rows > 1) aom_wb_write_bit(wb, cm->dependent_horz_tiles);
-#endif
-#if CONFIG_EXT_TILE
+  *saved_wb = *wb;
+  if (cm->tile_rows * cm->tile_cols > 1) {
+    // Note that the last item in the uncompressed header is the data
+    // describing tile configuration.
+    // Number of bytes in tile column size - 1
+    aom_wb_write_literal(wb, 0, 2);
+    // Number of bytes in tile size - 1
+    aom_wb_write_literal(wb, 0, 2);
   }
-#endif  // CONFIG_EXT_TILE
-
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-  aom_wb_write_bit(wb, cm->loop_filter_across_tiles_enabled);
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
 }
 
-#if CONFIG_EXT_REFS
 #if USE_GF16_MULTI_LAYER
 static int get_refresh_mask_gf16(AV1_COMP *cpi) {
+  if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common))
+    return 0xFF;
+
   int refresh_mask = 0;
 
   if (cpi->refresh_last_frame || cpi->refresh_golden_frame ||
@@ -3829,11 +2238,12 @@ static int get_refresh_mask_gf16(AV1_COMP *cpi) {
   return refresh_mask;
 }
 #endif  // USE_GF16_MULTI_LAYER
-#endif  // CONFIG_EXT_REFS
 
 static int get_refresh_mask(AV1_COMP *cpi) {
+  if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common))
+    return 0xFF;
+
   int refresh_mask = 0;
-#if CONFIG_EXT_REFS
 #if USE_GF16_MULTI_LAYER
   if (cpi->rc.baseline_gf_interval == 16) return get_refresh_mask_gf16(cpi);
 #endif  // USE_GF16_MULTI_LAYER
@@ -3847,13 +2257,12 @@ static int get_refresh_mask(AV1_COMP *cpi) {
   //     shifted and become the new virtual indexes for LAST2_FRAME and
   //     LAST3_FRAME.
   refresh_mask |=
-      (cpi->refresh_last_frame << cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]);
+      (cpi->refresh_last_frame << cpi->ref_fb_idx[LAST_REF_FRAMES - 1]);
 
-  refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->bwd_fb_idx);
-  refresh_mask |= (cpi->refresh_alt2_ref_frame << cpi->alt2_fb_idx);
-#else   // !CONFIG_EXT_REFS
-  refresh_mask |= (cpi->refresh_last_frame << cpi->lst_fb_idx);
-#endif  // CONFIG_EXT_REFS
+  refresh_mask |=
+      (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1]);
+  refresh_mask |=
+      (cpi->refresh_alt2_ref_frame << cpi->ref_fb_idx[ALTREF2_FRAME - 1]);
 
   if (av1_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
@@ -3866,26 +2275,19 @@ static int get_refresh_mask(AV1_COMP *cpi) {
     // Note: This is highly specific to the use of ARF as a forward reference,
     // and this needs to be generalized as other uses are implemented
     // (like RTC/temporal scalability).
-    return refresh_mask | (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+    return refresh_mask |
+           (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]);
   } else {
-#if CONFIG_EXT_REFS
-    const int arf_idx = cpi->alt_fb_idx;
-#else   // !CONFIG_EXT_REFS
-    int arf_idx = cpi->alt_fb_idx;
-    if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
-      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-      arf_idx = gf_group->arf_update_idx[gf_group->index];
-    }
-#endif  // CONFIG_EXT_REFS
-    return refresh_mask | (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
+    const int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
+    return refresh_mask |
+           (cpi->refresh_golden_frame << cpi->ref_fb_idx[GOLDEN_FRAME - 1]) |
            (cpi->refresh_alt_ref_frame << arf_idx);
   }
 }
 
-#if CONFIG_EXT_TILE
 static INLINE int find_identical_tile(
     const int tile_row, const int tile_col,
-    TileBufferEnc (*const tile_buffers)[1024]) {
+    TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) {
   const MV32 candidate_offset[1] = { { 1, 0 } };
   const uint8_t *const cur_tile_data =
       tile_buffers[tile_row][tile_col].data + 4;
@@ -3933,329 +2335,10 @@ static INLINE int find_identical_tile(
   // No identical tile found
   return 0;
 }
-#endif  // CONFIG_EXT_TILE
-
-#if !CONFIG_OBU || CONFIG_EXT_TILE
-static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
-                            unsigned int *max_tile_size,
-                            unsigned int *max_tile_col_size) {
-  const AV1_COMMON *const cm = &cpi->common;
-  aom_writer mode_bc;
-  int tile_row, tile_col;
-  TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
-  TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
-  uint32_t total_size = 0;
-  const int tile_cols = cm->tile_cols;
-  const int tile_rows = cm->tile_rows;
-  unsigned int tile_size = 0;
-  const int have_tiles = tile_cols * tile_rows > 1;
-  struct aom_write_bit_buffer wb = { dst, 0 };
-  const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
-  uint32_t compressed_hdr_size;
-  // Fixed size tile groups for the moment
-  const int num_tg_hdrs = cm->num_tg;
-  const int tg_size =
-#if CONFIG_EXT_TILE
-      (cm->large_scale_tile)
-          ? 1
-          :
-#endif  // CONFIG_EXT_TILE
-          (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
-  int tile_count = 0;
-  int tg_count = 1;
-  int tile_size_bytes = 4;
-  int tile_col_size_bytes;
-  uint32_t uncompressed_hdr_size = 0;
-  struct aom_write_bit_buffer tg_params_wb;
-  struct aom_write_bit_buffer tile_size_bytes_wb;
-  uint32_t saved_offset;
-  int mtu_size = cpi->oxcf.mtu;
-  int curr_tg_data_size = 0;
-  int hdr_size;
-
-  *max_tile_size = 0;
-  *max_tile_col_size = 0;
-
-// All tile size fields are output on 4 bytes. A call to remux_tiles will
-// later compact the data if smaller headers are adequate.
-
-#if CONFIG_EXT_TILE
-  if (cm->large_scale_tile) {
-    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-      TileInfo tile_info;
-      const int is_last_col = (tile_col == tile_cols - 1);
-      const uint32_t col_offset = total_size;
-
-      av1_tile_set_col(&tile_info, cm, tile_col);
-
-      // The last column does not have a column header
-      if (!is_last_col) total_size += 4;
-
-      for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-        TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
-        const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
-        const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
-        const int data_offset = have_tiles ? 4 : 0;
-        const int tile_idx = tile_row * tile_cols + tile_col;
-        TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
-        av1_tile_set_row(&tile_info, cm, tile_row);
-
-        buf->data = dst + total_size;
-
-        // Is CONFIG_EXT_TILE = 1, every tile in the row has a header,
-        // even for the last one, unless no tiling is used at all.
-        total_size += data_offset;
-        // Initialise tile context from the frame context
-        this_tile->tctx = *cm->fc;
-        cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
-#if CONFIG_PVQ
-        cpi->td.mb.pvq_q = &this_tile->pvq_q;
-        cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
-#endif  // CONFIG_PVQ
-#if CONFIG_ANS
-        mode_bc.size = 1 << cpi->common.ans_window_size_log2;
-#endif
-        aom_start_encode(&mode_bc, buf->data + data_offset);
-        write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
-        assert(tok == tok_end);
-        aom_stop_encode(&mode_bc);
-        tile_size = mode_bc.pos;
-#if CONFIG_PVQ
-        cpi->td.mb.pvq_q = NULL;
-#endif
-        buf->size = tile_size;
-
-        // Record the maximum tile size we see, so we can compact headers later.
-        *max_tile_size = AOMMAX(*max_tile_size, tile_size);
-
-        if (have_tiles) {
-          // tile header: size of this tile, or copy offset
-          uint32_t tile_header = tile_size;
-          const int tile_copy_mode =
-              ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256)
-                  ? 1
-                  : 0;
-
-          // If tile_copy_mode = 1, check if this tile is a copy tile.
-          // Very low chances to have copy tiles on the key frames, so don't
-          // search on key frames to reduce unnecessary search.
-          if (cm->frame_type != KEY_FRAME && tile_copy_mode) {
-            const int idendical_tile_offset =
-                find_identical_tile(tile_row, tile_col, tile_buffers);
-
-            if (idendical_tile_offset > 0) {
-              tile_size = 0;
-              tile_header = idendical_tile_offset | 0x80;
-              tile_header <<= 24;
-            }
-          }
-
-          mem_put_le32(buf->data, tile_header);
-        }
-
-        total_size += tile_size;
-      }
-
-      if (!is_last_col) {
-        uint32_t col_size = total_size - col_offset - 4;
-        mem_put_le32(dst + col_offset, col_size);
-
-        // If it is not final packing, record the maximum tile column size we
-        // see, otherwise, check if the tile size is out of the range.
-        *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
-      }
-    }
-  } else {
-#endif  // CONFIG_EXT_TILE
-    write_uncompressed_header_frame(cpi, &wb);
-
-#if CONFIG_EXT_REFS
-    if (cm->show_existing_frame) {
-      total_size = aom_wb_bytes_written(&wb);
-      return (uint32_t)total_size;
-    }
-#endif  // CONFIG_EXT_REFS
-
-    // Write the tile length code
-    tile_size_bytes_wb = wb;
-    aom_wb_write_literal(&wb, 3, 2);
-
-    /* Write a placeholder for the number of tiles in each tile group */
-    tg_params_wb = wb;
-    saved_offset = wb.bit_offset;
-    if (have_tiles) {
-      aom_wb_overwrite_literal(&wb, 3, n_log2_tiles);
-      aom_wb_overwrite_literal(&wb, (1 << n_log2_tiles) - 1, n_log2_tiles);
-    }
-
-    if (!use_compressed_header(cm)) {
-      uncompressed_hdr_size = aom_wb_bytes_written(&wb);
-      compressed_hdr_size = 0;
-    } else {
-      /* Write a placeholder for the compressed header length */
-      struct aom_write_bit_buffer comp_hdr_len_wb = wb;
-      aom_wb_write_literal(&wb, 0, 16);
-
-      uncompressed_hdr_size = aom_wb_bytes_written(&wb);
-      compressed_hdr_size =
-          write_compressed_header(cpi, dst + uncompressed_hdr_size);
-      aom_wb_overwrite_literal(&comp_hdr_len_wb, (int)(compressed_hdr_size),
-                               16);
-    }
-
-    hdr_size = uncompressed_hdr_size + compressed_hdr_size;
-    total_size += hdr_size;
-
-    for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-      TileInfo tile_info;
-      const int is_last_row = (tile_row == tile_rows - 1);
-      av1_tile_set_row(&tile_info, cm, tile_row);
-
-      for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-        const int tile_idx = tile_row * tile_cols + tile_col;
-        TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
-        TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
-        const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
-        const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
-        const int is_last_col = (tile_col == tile_cols - 1);
-        const int is_last_tile = is_last_col && is_last_row;
-
-        if ((!mtu_size && tile_count > tg_size) ||
-            (mtu_size && tile_count && curr_tg_data_size >= mtu_size)) {
-          // New tile group
-          tg_count++;
-          // We've exceeded the packet size
-          if (tile_count > 1) {
-            /* The last tile exceeded the packet size. The tile group size
-               should therefore be tile_count-1.
-               Move the last tile and insert headers before it
-             */
-            uint32_t old_total_size = total_size - tile_size - 4;
-            memmove(dst + old_total_size + hdr_size, dst + old_total_size,
-                    (tile_size + 4) * sizeof(uint8_t));
-            // Copy uncompressed header
-            memmove(dst + old_total_size, dst,
-                    uncompressed_hdr_size * sizeof(uint8_t));
-            // Write the number of tiles in the group into the last uncompressed
-            // header before the one we've just inserted
-            aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count,
-                                     n_log2_tiles);
-            aom_wb_overwrite_literal(&tg_params_wb, tile_count - 2,
-                                     n_log2_tiles);
-            // Update the pointer to the last TG params
-            tg_params_wb.bit_offset = saved_offset + 8 * old_total_size;
-            // Copy compressed header
-            memmove(dst + old_total_size + uncompressed_hdr_size,
-                    dst + uncompressed_hdr_size,
-                    compressed_hdr_size * sizeof(uint8_t));
-            total_size += hdr_size;
-            tile_count = 1;
-            curr_tg_data_size = hdr_size + tile_size + 4;
-          } else {
-            // We exceeded the packet size in just one tile
-            // Copy uncompressed header
-            memmove(dst + total_size, dst,
-                    uncompressed_hdr_size * sizeof(uint8_t));
-            // Write the number of tiles in the group into the last uncompressed
-            // header
-            aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count,
-                                     n_log2_tiles);
-            aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1,
-                                     n_log2_tiles);
-            tg_params_wb.bit_offset = saved_offset + 8 * total_size;
-            // Copy compressed header
-            memmove(dst + total_size + uncompressed_hdr_size,
-                    dst + uncompressed_hdr_size,
-                    compressed_hdr_size * sizeof(uint8_t));
-            total_size += hdr_size;
-            tile_count = 0;
-            curr_tg_data_size = hdr_size;
-          }
-        }
-        tile_count++;
-        av1_tile_set_col(&tile_info, cm, tile_col);
-
-#if CONFIG_DEPENDENT_HORZTILES
-        av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col);
-#endif
-        buf->data = dst + total_size;
-
-        // The last tile does not have a header.
-        if (!is_last_tile) total_size += 4;
-
-        // Initialise tile context from the frame context
-        this_tile->tctx = *cm->fc;
-        cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
-#if CONFIG_PVQ
-        cpi->td.mb.pvq_q = &this_tile->pvq_q;
-        cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
-#endif  // CONFIG_PVQ
-#if CONFIG_ANS
-        mode_bc.size = 1 << cpi->common.ans_window_size_log2;
-#endif  // CONFIG_ANS
-#if CONFIG_LOOP_RESTORATION
-        for (int p = 0; p < MAX_MB_PLANE; ++p) {
-          set_default_wiener(cpi->td.mb.e_mbd.wiener_info + p);
-          set_default_sgrproj(cpi->td.mb.e_mbd.sgrproj_info + p);
-        }
-#endif  // CONFIG_LOOP_RESTORATION
-
-        aom_start_encode(&mode_bc, dst + total_size);
-        write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
-#if !CONFIG_LV_MAP
-#if !CONFIG_PVQ
-        assert(tok == tok_end);
-#endif  // !CONFIG_PVQ
-#endif  // !CONFIG_LV_MAP
-        aom_stop_encode(&mode_bc);
-        tile_size = mode_bc.pos;
-#if CONFIG_PVQ
-        cpi->td.mb.pvq_q = NULL;
-#endif
-
-        assert(tile_size > 0);
-
-        curr_tg_data_size += tile_size + 4;
-        buf->size = tile_size;
-
-        if (!is_last_tile) {
-          *max_tile_size = AOMMAX(*max_tile_size, tile_size);
-          // size of this tile
-          mem_put_le32(buf->data, tile_size);
-        }
-
-        total_size += tile_size;
-      }
-    }
-    // Write the final tile group size
-    if (n_log2_tiles) {
-      aom_wb_overwrite_literal(
-          &tg_params_wb, (tile_cols * tile_rows) - tile_count, n_log2_tiles);
-      aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
-    }
-    // Remux if possible. TODO (Thomas Davies): do this for more than one tile
-    // group
-    if (have_tiles && tg_count == 1) {
-      int data_size =
-          total_size - (uncompressed_hdr_size + compressed_hdr_size);
-      data_size =
-          remux_tiles(cm, dst + uncompressed_hdr_size + compressed_hdr_size,
-                      data_size, *max_tile_size, *max_tile_col_size,
-                      &tile_size_bytes, &tile_col_size_bytes);
-      total_size = data_size + uncompressed_hdr_size + compressed_hdr_size;
-      aom_wb_overwrite_literal(&tile_size_bytes_wb, tile_size_bytes - 1, 2);
-    }
-
-#if CONFIG_EXT_TILE
-  }
-#endif  // CONFIG_EXT_TILE
-  return (uint32_t)total_size;
-}
-#endif
 
 static void write_render_size(const AV1_COMMON *cm,
                               struct aom_write_bit_buffer *wb) {
-  const int scaling_active = !av1_resize_unscaled(cm);
+  const int scaling_active = av1_resize_scaled(cm);
   aom_wb_write_bit(wb, scaling_active);
   if (scaling_active) {
     aom_wb_write_literal(wb, cm->render_width - 1, 16);
@@ -4263,31 +2346,42 @@ static void write_render_size(const AV1_COMMON *cm,
   }
 }
 
-#if CONFIG_FRAME_SUPERRES
 static void write_superres_scale(const AV1_COMMON *const cm,
                                  struct aom_write_bit_buffer *wb) {
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  if (!seq_params->enable_superres) {
+    assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
+    return;
+  }
+
   // First bit is whether to to scale or not
   if (cm->superres_scale_denominator == SCALE_NUMERATOR) {
     aom_wb_write_bit(wb, 0);  // no scaling
   } else {
     aom_wb_write_bit(wb, 1);  // scaling, write scale factor
+    assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN);
+    assert(cm->superres_scale_denominator <
+           SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS));
     aom_wb_write_literal(
         wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN,
         SUPERRES_SCALE_BITS);
   }
 }
-#endif  // CONFIG_FRAME_SUPERRES
 
-static void write_frame_size(const AV1_COMMON *cm,
+static void write_frame_size(const AV1_COMMON *cm, int frame_size_override,
                              struct aom_write_bit_buffer *wb) {
-#if CONFIG_FRAME_SUPERRES
-  aom_wb_write_literal(wb, cm->superres_upscaled_width - 1, 16);
-  aom_wb_write_literal(wb, cm->superres_upscaled_height - 1, 16);
+  const int coded_width = cm->superres_upscaled_width - 1;
+  const int coded_height = cm->superres_upscaled_height - 1;
+
+  if (frame_size_override) {
+    const SequenceHeader *seq_params = &cm->seq_params;
+    int num_bits_width = seq_params->num_bits_width;
+    int num_bits_height = seq_params->num_bits_height;
+    aom_wb_write_literal(wb, coded_width, num_bits_width);
+    aom_wb_write_literal(wb, coded_height, num_bits_height);
+  }
+
   write_superres_scale(cm, wb);
-#else
-  aom_wb_write_literal(wb, cm->width - 1, 16);
-  aom_wb_write_literal(wb, cm->height - 1, 16);
-#endif  // CONFIG_FRAME_SUPERRES
   write_render_size(cm, wb);
 }
 
@@ -4301,209 +2395,426 @@ static void write_frame_size_with_refs(AV1_COMP *cpi,
     YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
 
     if (cfg != NULL) {
-#if CONFIG_FRAME_SUPERRES
       found = cm->superres_upscaled_width == cfg->y_crop_width &&
               cm->superres_upscaled_height == cfg->y_crop_height;
-#else
-      found =
-          cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height;
-#endif
       found &= cm->render_width == cfg->render_width &&
                cm->render_height == cfg->render_height;
     }
     aom_wb_write_bit(wb, found);
     if (found) {
-#if CONFIG_FRAME_SUPERRES
       write_superres_scale(cm, wb);
-#endif  // CONFIG_FRAME_SUPERRES
       break;
     }
   }
 
-  if (!found) write_frame_size(cm, wb);
+  if (!found) {
+    int frame_size_override = 1;  // Always equal to 1 in this function
+    write_frame_size(cm, frame_size_override, wb);
+  }
 }
 
 static void write_profile(BITSTREAM_PROFILE profile,
                           struct aom_write_bit_buffer *wb) {
-  switch (profile) {
-    case PROFILE_0: aom_wb_write_literal(wb, 0, 2); break;
-    case PROFILE_1: aom_wb_write_literal(wb, 2, 2); break;
-    case PROFILE_2: aom_wb_write_literal(wb, 1, 2); break;
-    case PROFILE_3: aom_wb_write_literal(wb, 6, 3); break;
-    default: assert(0);
-  }
+  assert(profile >= PROFILE_0 && profile < MAX_PROFILES);
+  aom_wb_write_literal(wb, profile, PROFILE_BITS);
 }
 
-static void write_bitdepth_colorspace_sampling(
-    AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
-  if (cm->profile >= PROFILE_2) {
-    assert(cm->bit_depth > AOM_BITS_8);
+static void write_bitdepth(AV1_COMMON *const cm,
+                           struct aom_write_bit_buffer *wb) {
+  // Profile 0/1: [0] for 8 bit, [1]  10-bit
+  // Profile   2: [0] for 8 bit, [10] 10-bit, [11] - 12-bit
+  aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_8 ? 0 : 1);
+  if (cm->profile == PROFILE_2 && cm->bit_depth != AOM_BITS_8) {
     aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_10 ? 0 : 1);
   }
-#if CONFIG_COLORSPACE_HEADERS
-  aom_wb_write_literal(wb, cm->color_space, 5);
-  aom_wb_write_literal(wb, cm->transfer_function, 5);
-#else
-  aom_wb_write_literal(wb, cm->color_space, 3);
-#endif
-  if (cm->color_space != AOM_CS_SRGB) {
+}
+
+static void write_color_config(AV1_COMMON *const cm,
+                               struct aom_write_bit_buffer *wb) {
+  write_bitdepth(cm, wb);
+  const int is_monochrome = cm->seq_params.monochrome;
+  // monochrome bit
+  if (cm->profile != PROFILE_1)
+    aom_wb_write_bit(wb, is_monochrome);
+  else
+    assert(!is_monochrome);
+  if (cm->color_primaries == AOM_CICP_CP_UNSPECIFIED &&
+      cm->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED &&
+      cm->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) {
+    aom_wb_write_bit(wb, 0);  // No color description present
+  } else {
+    aom_wb_write_bit(wb, 1);  // Color description present
+    aom_wb_write_literal(wb, cm->color_primaries, 8);
+    aom_wb_write_literal(wb, cm->transfer_characteristics, 8);
+    aom_wb_write_literal(wb, cm->matrix_coefficients, 8);
+  }
+  if (is_monochrome) {
     // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
     aom_wb_write_bit(wb, cm->color_range);
-    if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
-      assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
-      aom_wb_write_bit(wb, cm->subsampling_x);
-      aom_wb_write_bit(wb, cm->subsampling_y);
-      aom_wb_write_bit(wb, 0);  // unused
-    } else {
+    return;
+  }
+  if (cm->color_primaries == AOM_CICP_CP_BT_709 &&
+      cm->transfer_characteristics == AOM_CICP_TC_SRGB &&
+      cm->matrix_coefficients ==
+          AOM_CICP_MC_IDENTITY) {  // it would be better to remove this
+                                   // dependency too
+    assert(cm->subsampling_x == 0 && cm->subsampling_y == 0);
+    assert(cm->profile == PROFILE_1 ||
+           (cm->profile == PROFILE_2 && cm->bit_depth == AOM_BITS_12));
+  } else {
+    // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+    aom_wb_write_bit(wb, cm->color_range);
+    if (cm->profile == PROFILE_0) {
+      // 420 only
       assert(cm->subsampling_x == 1 && cm->subsampling_y == 1);
+    } else if (cm->profile == PROFILE_1) {
+      // 444 only
+      assert(cm->subsampling_x == 0 && cm->subsampling_y == 0);
+    } else if (cm->profile == PROFILE_2) {
+      if (cm->bit_depth == AOM_BITS_12) {
+        // 420, 444 or 422
+        aom_wb_write_bit(wb, cm->subsampling_x);
+        if (cm->subsampling_x == 0) {
+          assert(cm->subsampling_y == 0 &&
+                 "4:4:0 subsampling not allowed in AV1");
+        } else {
+          aom_wb_write_bit(wb, cm->subsampling_y);
+        }
+      } else {
+        // 422 only
+        assert(cm->subsampling_x == 1 && cm->subsampling_y == 0);
+      }
+    }
+    if (cm->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+      assert(cm->subsampling_x == 0 && cm->subsampling_y == 0);
     }
-#if CONFIG_COLORSPACE_HEADERS
     if (cm->subsampling_x == 1 && cm->subsampling_y == 1) {
       aom_wb_write_literal(wb, cm->chroma_sample_position, 2);
     }
-#endif
-  } else {
-    assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3);
-    aom_wb_write_bit(wb, 0);  // unused
   }
+  aom_wb_write_bit(wb, cm->separate_uv_delta_q);
 }
 
-#if CONFIG_REFERENCE_BUFFER
-void write_sequence_header(AV1_COMMON *const cm,
-                           struct aom_write_bit_buffer *wb) {
-  SequenceHeader *seq_params = &cm->seq_params;
-  /* Placeholder for actually writing to the bitstream */
-  seq_params->frame_id_numbers_present_flag =
-#if CONFIG_EXT_TILE
-      cm->large_scale_tile ? 0 :
-#endif  // CONFIG_EXT_TILE
-                           FRAME_ID_NUMBERS_PRESENT_FLAG;
-  seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7;
-  seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2;
-
-  aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag);
-  if (seq_params->frame_id_numbers_present_flag) {
-    aom_wb_write_literal(wb, seq_params->frame_id_length_minus7, 4);
-    aom_wb_write_literal(wb, seq_params->delta_frame_id_length_minus2, 4);
+static void write_timing_info_header(AV1_COMMON *const cm,
+                                     struct aom_write_bit_buffer *wb) {
+  aom_wb_write_unsigned_literal(wb, cm->timing_info.num_units_in_display_tick,
+                                32);  // Number of units in tick
+  aom_wb_write_unsigned_literal(wb, cm->timing_info.time_scale,
+                                32);  // Time scale
+  aom_wb_write_bit(
+      wb,
+      cm->timing_info.equal_picture_interval);  // Equal picture interval bit
+  if (cm->timing_info.equal_picture_interval) {
+    aom_wb_write_uvlc(
+        wb,
+        cm->timing_info.num_ticks_per_picture - 1);  // ticks per picture
+  }
+}
+
+static void write_decoder_model_info(AV1_COMMON *const cm,
+                                     struct aom_write_bit_buffer *wb) {
+  aom_wb_write_literal(
+      wb, cm->buffer_model.encoder_decoder_buffer_delay_length - 1, 5);
+  aom_wb_write_unsigned_literal(wb, cm->buffer_model.num_units_in_decoding_tick,
+                                32);  // Number of units in decoding tick
+  aom_wb_write_literal(wb, cm->buffer_model.buffer_removal_delay_length - 1, 5);
+  aom_wb_write_literal(wb, cm->buffer_model.frame_presentation_delay_length - 1,
+                       5);
+}
+
+static void write_dec_model_op_parameters(AV1_COMMON *const cm,
+                                          struct aom_write_bit_buffer *wb,
+                                          int op_num) {
+  if (op_num > MAX_NUM_OPERATING_POINTS)
+    aom_internal_error(
+        &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        "Encoder does not support %d decoder model operating points", op_num);
+
+  //  aom_wb_write_bit(wb, cm->op_params[op_num].has_parameters);
+  //  if (!cm->op_params[op_num].has_parameters) return;
+
+  aom_wb_write_literal(wb, cm->op_params[op_num].decoder_buffer_delay,
+                       cm->buffer_model.encoder_decoder_buffer_delay_length);
+
+  aom_wb_write_literal(wb, cm->op_params[op_num].encoder_buffer_delay,
+                       cm->buffer_model.encoder_decoder_buffer_delay_length);
+
+  aom_wb_write_bit(wb, cm->op_params[op_num].low_delay_mode_flag);
+
+  cm->op_frame_timing[op_num].buffer_removal_delay =
+      0;  // reset the decoded frame counter
+}
+
+static void write_tu_pts_info(AV1_COMMON *const cm,
+                              struct aom_write_bit_buffer *wb) {
+  aom_wb_write_unsigned_literal(
+      wb, (uint32_t)cm->tu_presentation_delay,
+      cm->buffer_model.frame_presentation_delay_length);
+}
+
+static void write_film_grain_params(AV1_COMP *cpi,
+                                    struct aom_write_bit_buffer *wb) {
+  AV1_COMMON *const cm = &cpi->common;
+  aom_film_grain_t *pars = &cm->film_grain_params;
+
+  cm->cur_frame->film_grain_params = *pars;
+
+  aom_wb_write_bit(wb, pars->apply_grain);
+  if (!pars->apply_grain) return;
+
+  aom_wb_write_literal(wb, pars->random_seed, 16);
+
+  pars->random_seed += 3245;  // For film grain test vectors purposes
+  if (!pars->random_seed)     // Random seed should not be zero
+    pars->random_seed += 1735;
+  if (cm->frame_type == INTER_FRAME)
+    aom_wb_write_bit(wb, pars->update_parameters);
+  else
+    pars->update_parameters = 1;
+  if (!pars->update_parameters) {
+    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+    int ref_frame, ref_idx, buf_idx;
+    for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) {
+      ref_idx = get_ref_frame_map_idx(cpi, ref_frame);
+      assert(ref_idx != INVALID_IDX);
+      buf_idx = cm->ref_frame_map[ref_idx];
+      if (frame_bufs[buf_idx].film_grain_params_present &&
+          memcmp(pars, &frame_bufs[buf_idx].film_grain_params, sizeof(*pars))) {
+        break;
+      }
+    }
+    assert(ref_frame < REF_FRAMES);
+    aom_wb_write_literal(wb, ref_idx, 3);
+    return;
+  }
+
+  // Scaling functions parameters
+  aom_wb_write_literal(wb, pars->num_y_points, 4);  // max 14
+  for (int i = 0; i < pars->num_y_points; i++) {
+    aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8);
+    aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8);
+  }
+
+  if (!cm->seq_params.monochrome)
+    aom_wb_write_bit(wb, pars->chroma_scaling_from_luma);
+  else
+    pars->chroma_scaling_from_luma = 0;  // for monochrome override to 0
+
+  if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
+      ((cm->subsampling_x == 1) && (cm->subsampling_y == 1) &&
+       (pars->num_y_points == 0))) {
+    pars->num_cb_points = 0;
+    pars->num_cr_points = 0;
+  } else {
+    aom_wb_write_literal(wb, pars->num_cb_points, 4);  // max 10
+    for (int i = 0; i < pars->num_cb_points; i++) {
+      aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8);
+      aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8);
+    }
+
+    aom_wb_write_literal(wb, pars->num_cr_points, 4);  // max 10
+    for (int i = 0; i < pars->num_cr_points; i++) {
+      aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8);
+      aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8);
+    }
   }
+
+  aom_wb_write_literal(wb, pars->scaling_shift - 8, 2);  // 8 + value
+
+  // AR coefficients
+  // Only sent if the corresponsing scaling function has
+  // more than 0 points
+
+  aom_wb_write_literal(wb, pars->ar_coeff_lag, 2);
+
+  int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+  int num_pos_chroma = num_pos_luma;
+  if (pars->num_y_points > 0) ++num_pos_chroma;
+
+  if (pars->num_y_points)
+    for (int i = 0; i < num_pos_luma; i++)
+      aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8);
+
+  if (pars->num_cb_points || pars->chroma_scaling_from_luma)
+    for (int i = 0; i < num_pos_chroma; i++)
+      aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8);
+
+  if (pars->num_cr_points || pars->chroma_scaling_from_luma)
+    for (int i = 0; i < num_pos_chroma; i++)
+      aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8);
+
+  aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2);  // 8 + value
+
+  aom_wb_write_literal(wb, pars->grain_scale_shift, 2);
+
+  if (pars->num_cb_points) {
+    aom_wb_write_literal(wb, pars->cb_mult, 8);
+    aom_wb_write_literal(wb, pars->cb_luma_mult, 8);
+    aom_wb_write_literal(wb, pars->cb_offset, 9);
+  }
+
+  if (pars->num_cr_points) {
+    aom_wb_write_literal(wb, pars->cr_mult, 8);
+    aom_wb_write_literal(wb, pars->cr_luma_mult, 8);
+    aom_wb_write_literal(wb, pars->cr_offset, 9);
+  }
+
+  aom_wb_write_bit(wb, pars->overlap_flag);
+
+  aom_wb_write_bit(wb, pars->clip_to_restricted_range);
 }
-#endif  // CONFIG_REFERENCE_BUFFER
 
-static void write_sb_size(const AV1_COMMON *cm,
+static void write_sb_size(SequenceHeader *seq_params,
                           struct aom_write_bit_buffer *wb) {
-  (void)cm;
+  (void)seq_params;
   (void)wb;
-  assert(cm->mib_size == mi_size_wide[cm->sb_size]);
-  assert(cm->mib_size == 1 << cm->mib_size_log2);
-#if CONFIG_EXT_PARTITION
-  assert(cm->sb_size == BLOCK_128X128 || cm->sb_size == BLOCK_64X64);
-  aom_wb_write_bit(wb, cm->sb_size == BLOCK_128X128 ? 1 : 0);
-#else
-  assert(cm->sb_size == BLOCK_64X64);
-#endif  // CONFIG_EXT_PARTITION
+  assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]);
+  assert(seq_params->mib_size == 1 << seq_params->mib_size_log2);
+  assert(seq_params->sb_size == BLOCK_128X128 ||
+         seq_params->sb_size == BLOCK_64X64);
+  aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0);
 }
 
-static void write_compound_tools(const AV1_COMMON *cm,
-                                 struct aom_write_bit_buffer *wb) {
-  (void)cm;
-  (void)wb;
-#if CONFIG_INTERINTRA
-  if (!frame_is_intra_only(cm) && cm->reference_mode != COMPOUND_REFERENCE) {
-    aom_wb_write_bit(wb, cm->allow_interintra_compound);
-  } else {
-    assert(cm->allow_interintra_compound == 0);
-  }
-#endif  // CONFIG_INTERINTRA
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!frame_is_intra_only(cm)) {
-#else   // !CONFIG_COMPOUND_SINGLEREF
-  if (!frame_is_intra_only(cm) && cm->reference_mode != SINGLE_REFERENCE) {
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    aom_wb_write_bit(wb, cm->allow_masked_compound);
-  } else {
-    assert(cm->allow_masked_compound == 0);
+void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) {
+  AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *seq_params = &cm->seq_params;
+
+  int max_frame_width = cpi->oxcf.forced_max_frame_width
+                            ? cpi->oxcf.forced_max_frame_width
+                            : cpi->oxcf.width;
+  int max_frame_height = cpi->oxcf.forced_max_frame_height
+                             ? cpi->oxcf.forced_max_frame_height
+                             : cpi->oxcf.height;
+  const int num_bits_width =
+      (max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1;
+  const int num_bits_height =
+      (max_frame_height > 1) ? get_msb(max_frame_height - 1) + 1 : 1;
+  assert(num_bits_width <= 16);
+  assert(num_bits_height <= 16);
+
+  seq_params->num_bits_width = num_bits_width;
+  seq_params->num_bits_height = num_bits_height;
+  seq_params->max_frame_width = max_frame_width;
+  seq_params->max_frame_height = max_frame_height;
+
+  aom_wb_write_literal(wb, num_bits_width - 1, 4);
+  aom_wb_write_literal(wb, num_bits_height - 1, 4);
+  aom_wb_write_literal(wb, max_frame_width - 1, num_bits_width);
+  aom_wb_write_literal(wb, max_frame_height - 1, num_bits_height);
+
+  /* Placeholder for actually writing to the bitstream */
+  if (!seq_params->reduced_still_picture_hdr) {
+    seq_params->frame_id_numbers_present_flag =
+        cm->large_scale_tile ? 0 : cm->error_resilient_mode;
+    seq_params->frame_id_length = FRAME_ID_LENGTH;
+    seq_params->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
+
+    aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag);
+    if (seq_params->frame_id_numbers_present_flag) {
+      // We must always have delta_frame_id_length < frame_id_length,
+      // in order for a frame to be referenced with a unique delta.
+      // Avoid wasting bits by using a coding that enforces this restriction.
+      aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4);
+      aom_wb_write_literal(
+          wb,
+          seq_params->frame_id_length - seq_params->delta_frame_id_length - 1,
+          3);
+    }
+  }
+
+  write_sb_size(seq_params, wb);
+
+  aom_wb_write_bit(wb, seq_params->enable_filter_intra);
+  aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter);
+
+  if (!seq_params->reduced_still_picture_hdr) {
+    aom_wb_write_bit(wb, seq_params->enable_interintra_compound);
+    aom_wb_write_bit(wb, seq_params->enable_masked_compound);
+    aom_wb_write_bit(wb, seq_params->enable_warped_motion);
+    aom_wb_write_bit(wb, seq_params->enable_dual_filter);
+
+    aom_wb_write_bit(wb, seq_params->enable_order_hint);
+
+    if (seq_params->enable_order_hint) {
+      aom_wb_write_bit(wb, seq_params->enable_jnt_comp);
+      aom_wb_write_bit(wb, seq_params->enable_ref_frame_mvs);
+    }
+    if (seq_params->force_screen_content_tools == 2) {
+      aom_wb_write_bit(wb, 1);
+    } else {
+      aom_wb_write_bit(wb, 0);
+      aom_wb_write_bit(wb, seq_params->force_screen_content_tools);
+    }
+    if (seq_params->force_screen_content_tools > 0) {
+      if (seq_params->force_integer_mv == 2) {
+        aom_wb_write_bit(wb, 1);
+      } else {
+        aom_wb_write_bit(wb, 0);
+        aom_wb_write_bit(wb, seq_params->force_integer_mv);
+      }
+    } else {
+      assert(seq_params->force_integer_mv == 2);
+    }
+    if (seq_params->enable_order_hint)
+      aom_wb_write_literal(wb, seq_params->order_hint_bits_minus_1, 3);
   }
-#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
+
+  aom_wb_write_bit(wb, seq_params->enable_superres);
+  aom_wb_write_bit(wb, seq_params->enable_cdef);
+  aom_wb_write_bit(wb, seq_params->enable_restoration);
 }
 
-#if CONFIG_GLOBAL_MOTION
 static void write_global_motion_params(const WarpedMotionParams *params,
                                        const WarpedMotionParams *ref_params,
                                        struct aom_write_bit_buffer *wb,
                                        int allow_hp) {
-  TransformationType type = params->wmtype;
-  int trans_bits;
-  int trans_prec_diff;
+  const TransformationType type = params->wmtype;
 
   aom_wb_write_bit(wb, type != IDENTITY);
   if (type != IDENTITY) {
-#if GLOBAL_TRANS_TYPES > 4
-    aom_wb_write_literal(wb, type - 1, GLOBAL_TYPE_BITS);
-#else
     aom_wb_write_bit(wb, type == ROTZOOM);
     if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION);
-#endif  // GLOBAL_TRANS_TYPES > 4
-  }
-
-  switch (type) {
-    case HOMOGRAPHY:
-    case HORTRAPEZOID:
-    case VERTRAPEZOID:
-      if (type != HORTRAPEZOID)
-        aom_wb_write_signed_primitive_refsubexpfin(
-            wb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
-            (ref_params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF),
-            (params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF));
-      if (type != VERTRAPEZOID)
-        aom_wb_write_signed_primitive_refsubexpfin(
-            wb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
-            (ref_params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF),
-            (params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF));
-    // fallthrough intended
-    case AFFINE:
-    case ROTZOOM:
-      aom_wb_write_signed_primitive_refsubexpfin(
-          wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-          (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
-              (1 << GM_ALPHA_PREC_BITS),
-          (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
-      if (type != VERTRAPEZOID)
-        aom_wb_write_signed_primitive_refsubexpfin(
-            wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-            (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF),
-            (params->wmmat[3] >> GM_ALPHA_PREC_DIFF));
-      if (type >= AFFINE) {
-        if (type != HORTRAPEZOID)
-          aom_wb_write_signed_primitive_refsubexpfin(
-              wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-              (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF),
-              (params->wmmat[4] >> GM_ALPHA_PREC_DIFF));
-        aom_wb_write_signed_primitive_refsubexpfin(
-            wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-            (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
-                (1 << GM_ALPHA_PREC_BITS),
-            (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
-                (1 << GM_ALPHA_PREC_BITS));
-      }
-    // fallthrough intended
-    case TRANSLATION:
-      trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
-                                         : GM_ABS_TRANS_BITS;
-      trans_prec_diff = (type == TRANSLATION)
-                            ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
-                            : GM_TRANS_PREC_DIFF;
-      aom_wb_write_signed_primitive_refsubexpfin(
-          wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
-          (ref_params->wmmat[0] >> trans_prec_diff),
-          (params->wmmat[0] >> trans_prec_diff));
-      aom_wb_write_signed_primitive_refsubexpfin(
-          wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
-          (ref_params->wmmat[1] >> trans_prec_diff),
-          (params->wmmat[1] >> trans_prec_diff));
-      break;
-    case IDENTITY: break;
-    default: assert(0);
+  }
+
+  if (type >= ROTZOOM) {
+    aom_wb_write_signed_primitive_refsubexpfin(
+        wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+        (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+            (1 << GM_ALPHA_PREC_BITS),
+        (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+    aom_wb_write_signed_primitive_refsubexpfin(
+        wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+        (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+        (params->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+  }
+
+  if (type >= AFFINE) {
+    aom_wb_write_signed_primitive_refsubexpfin(
+        wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+        (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+        (params->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+    aom_wb_write_signed_primitive_refsubexpfin(
+        wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+        (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+            (1 << GM_ALPHA_PREC_BITS),
+        (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+  }
+
+  if (type >= TRANSLATION) {
+    const int trans_bits = (type == TRANSLATION)
+                               ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+                               : GM_ABS_TRANS_BITS;
+    const int trans_prec_diff = (type == TRANSLATION)
+                                    ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+                                    : GM_TRANS_PREC_DIFF;
+    aom_wb_write_signed_primitive_refsubexpfin(
+        wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+        (ref_params->wmmat[0] >> trans_prec_diff),
+        (params->wmmat[0] >> trans_prec_diff));
+    aom_wb_write_signed_primitive_refsubexpfin(
+        wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+        (ref_params->wmmat[1] >> trans_prec_diff),
+        (params->wmmat[1] >> trans_prec_diff));
   }
 }
 
@@ -4513,8 +2824,8 @@ static void write_global_motion(AV1_COMP *cpi,
   int frame;
   for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
     const WarpedMotionParams *ref_params =
-        cm->error_resilient_mode ? &default_warp_params
-                                 : &cm->prev_frame->global_motion[frame];
+        cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+                       : &default_warp_params;
     write_global_motion_params(&cm->global_motion[frame], ref_params, wb,
                                cm->allow_high_precision_mv);
     // TODO(sarahparker, debargha): The logic in the commented out code below
@@ -4541,820 +2852,452 @@ static void write_global_motion(AV1_COMP *cpi,
            */
   }
 }
-#endif
 
-#if !CONFIG_OBU
-static void write_uncompressed_header_frame(AV1_COMP *cpi,
-                                            struct aom_write_bit_buffer *wb) {
+static void check_frame_refs_short_signaling(AV1_COMP *const cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-
-  aom_wb_write_literal(wb, AOM_FRAME_MARKER, 2);
-
-  write_profile(cm->profile, wb);
+  if (!cm->frame_refs_short_signaling) return;
 
-#if CONFIG_EXT_TILE
-  aom_wb_write_literal(wb, cm->large_scale_tile, 1);
-#endif  // CONFIG_EXT_TILE
-
-#if CONFIG_EXT_REFS
-  // NOTE: By default all coded frames to be used as a reference
-  cm->is_reference_frame = 1;
-
-  if (cm->show_existing_frame) {
-    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
-    const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
-
-    if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                         "Buffer %d does not contain a reconstructed frame",
-                         frame_to_show);
-    }
-    ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
-
-    aom_wb_write_bit(wb, 1);  // show_existing_frame
-    aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
-
-#if CONFIG_REFERENCE_BUFFER
-    if (cm->seq_params.frame_id_numbers_present_flag) {
-      int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
-      int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
-      aom_wb_write_literal(wb, display_frame_id, frame_id_len);
-      /* Add a zero byte to prevent emulation of superframe marker */
-      /* Same logic as when when terminating the entropy coder */
-      /* Consider to have this logic only one place */
-      aom_wb_write_literal(wb, 0, 8);
+  // Check whether all references are distinct frames.
+  int buf_markers[FRAME_BUFFERS] = { 0 };
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+    if (buf_idx != INVALID_IDX) {
+      assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS);
+      buf_markers[buf_idx] = 1;
     }
-#endif  // CONFIG_REFERENCE_BUFFER
-
-    return;
-  } else {
-#endif                        // CONFIG_EXT_REFS
-    aom_wb_write_bit(wb, 0);  // show_existing_frame
-#if CONFIG_EXT_REFS
   }
-#endif  // CONFIG_EXT_REFS
 
-  aom_wb_write_bit(wb, cm->frame_type);
-  aom_wb_write_bit(wb, cm->show_frame);
-  if (cm->frame_type != KEY_FRAME)
-    if (!cm->show_frame) aom_wb_write_bit(wb, cm->intra_only);
-  aom_wb_write_bit(wb, cm->error_resilient_mode);
-
-  if (frame_is_intra_only(cm)) {
-#if CONFIG_REFERENCE_BUFFER
-    write_sequence_header(cm, wb);
-#endif  // CONFIG_REFERENCE_BUFFER
-  }
-#if CONFIG_REFERENCE_BUFFER
-  cm->invalid_delta_frame_id_minus1 = 0;
-  if (cm->seq_params.frame_id_numbers_present_flag) {
-    int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
-    aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
+  int num_refs = 0;
+  for (int buf_idx = 0; buf_idx < FRAME_BUFFERS; ++buf_idx) {
+    num_refs += buf_markers[buf_idx];
   }
-#endif  // CONFIG_REFERENCE_BUFFER
-  if (cm->frame_type == KEY_FRAME) {
-    write_bitdepth_colorspace_sampling(cm, wb);
-    write_frame_size(cm, wb);
-    write_sb_size(cm, wb);
-
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-    assert(cpi->common.ans_window_size_log2 >= 8);
-    assert(cpi->common.ans_window_size_log2 < 24);
-    aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4);
-#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-    aom_wb_write_bit(wb, cm->allow_screen_content_tools);
-#if CONFIG_AMVR
-    if (cm->allow_screen_content_tools) {
-      if (cm->seq_mv_precision_level == 2) {
-        aom_wb_write_bit(wb, 1);
-      } else {
-        aom_wb_write_bit(wb, 0);
-        aom_wb_write_bit(wb, cm->seq_mv_precision_level == 0);
-      }
-    }
-#endif
-  } else {
-    if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools);
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-    if (!cm->error_resilient_mode) {
-      if (cm->intra_only) {
-        aom_wb_write_bit(wb,
-                         cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
-      } else {
-        aom_wb_write_bit(wb,
-                         cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE);
-        if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE)
-          aom_wb_write_bit(wb,
-                           cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
-      }
-    }
-#endif
-#if CONFIG_EXT_REFS
-    cpi->refresh_frame_mask = get_refresh_mask(cpi);
-#endif  // CONFIG_EXT_REFS
-
-    if (cm->intra_only) {
-      write_bitdepth_colorspace_sampling(cm, wb);
-
-#if CONFIG_EXT_REFS
-      aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
-#else
-      aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
-#endif  // CONFIG_EXT_REFS
-      write_frame_size(cm, wb);
-
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-      assert(cpi->common.ans_window_size_log2 >= 8);
-      assert(cpi->common.ans_window_size_log2 < 24);
-      aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4);
-#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-    } else {
-      MV_REFERENCE_FRAME ref_frame;
-
-#if CONFIG_EXT_REFS
-      aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
-#else
-      aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
-#endif  // CONFIG_EXT_REFS
-
-#if CONFIG_EXT_REFS
-      if (!cpi->refresh_frame_mask) {
-        // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
-        //       will not be used as a reference
-        cm->is_reference_frame = 0;
-      }
-#endif  // CONFIG_EXT_REFS
-
-      for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-        assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
-        aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
-                             REF_FRAMES_LOG2);
-#if !CONFIG_FRAME_SIGN_BIAS
-        aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
-#endif  // !CONFIG_FRAME_SIGN_BIAS
-#if CONFIG_REFERENCE_BUFFER
-        if (cm->seq_params.frame_id_numbers_present_flag) {
-          int i = get_ref_frame_map_idx(cpi, ref_frame);
-          int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
-          int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2;
-          int delta_frame_id_minus1 =
-              ((cm->current_frame_id - cm->ref_frame_id[i] +
-                (1 << frame_id_len)) %
-               (1 << frame_id_len)) -
-              1;
-          if (delta_frame_id_minus1 < 0 ||
-              delta_frame_id_minus1 >= (1 << diff_len))
-            cm->invalid_delta_frame_id_minus1 = 1;
-          aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len);
-        }
-#endif  // CONFIG_REFERENCE_BUFFER
-      }
 
-#if CONFIG_FRAME_SIGN_BIAS
-#define FRAME_SIGN_BIAS_DEBUG 0
-#if FRAME_SIGN_BIAS_DEBUG
-      {
-        printf("\n\nENCODER: Frame=%d, show_frame=%d:", cm->current_video_frame,
-               cm->show_frame);
-        for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-          printf(" sign_bias[%d]=%d", ref_frame,
-                 cm->ref_frame_sign_bias[ref_frame]);
-        }
-        printf("\n");
-      }
-#endif  // FRAME_SIGN_BIAS_DEBUG
-#undef FRAME_SIGN_BIAS_DEBUG
-#endif  // CONFIG_FRAME_SIGN_BIAS
-
-#if CONFIG_FRAME_SIZE
-      if (cm->error_resilient_mode == 0) {
-        write_frame_size_with_refs(cpi, wb);
-      } else {
-        write_frame_size(cm, wb);
-      }
-#else
-      write_frame_size_with_refs(cpi, wb);
-#endif
-
-#if CONFIG_AMVR
-      if (cm->seq_mv_precision_level == 2) {
-        aom_wb_write_bit(wb, cm->cur_frame_mv_precision_level == 0);
-      }
-#endif
-      aom_wb_write_bit(wb, cm->allow_high_precision_mv);
-
-      fix_interp_filter(cm, cpi->td.counts);
-      write_frame_interp_filter(cm->interp_filter, wb);
-#if CONFIG_TEMPMV_SIGNALING
-      if (frame_might_use_prev_frame_mvs(cm)) {
-        aom_wb_write_bit(wb, cm->use_prev_frame_mvs);
-      }
-#endif
-    }
+  // We only turn on frame_refs_short_signaling when all references are
+  // distinct.
+  if (num_refs < INTER_REFS_PER_FRAME) {
+    // It indicates that there exist more than one reference frame pointing to
+    // the same reference buffer, i.e. two or more references are duplicate.
+    cm->frame_refs_short_signaling = 0;
+    return;
   }
 
-#if CONFIG_FRAME_MARKER
-  if (cm->show_frame == 0) {
-    int arf_offset = AOMMIN(
-        (MAX_GF_INTERVAL - 1),
-        cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]);
-#if CONFIG_EXT_REFS
-    int brf_offset =
-        cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index];
+  // Check whether the encoder side ref frame choices are aligned with that to
+  // be derived at the decoder side.
+  RefBuffer frame_refs_copy[INTER_REFS_PER_FRAME];
 
-    arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
-#endif
-    aom_wb_write_literal(wb, arf_offset, 4);
-  }
-#endif
+  // Backup the frame refs info
+  memcpy(frame_refs_copy, cm->frame_refs,
+         INTER_REFS_PER_FRAME * sizeof(RefBuffer));
 
-#if CONFIG_REFERENCE_BUFFER
-  if (cm->seq_params.frame_id_numbers_present_flag) {
-    cm->refresh_mask =
-        cm->frame_type == KEY_FRAME ? 0xFF : get_refresh_mask(cpi);
-  }
-#endif  // CONFIG_REFERENCE_BUFFER
+  const int lst_map_idx = get_ref_frame_map_idx(cpi, LAST_FRAME);
+  const int gld_map_idx = get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
 
-  if (!cm->error_resilient_mode) {
-    aom_wb_write_bit(
-        wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD);
-  }
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
-#endif
-  encode_loopfilter(cm, wb);
-  encode_quantization(cm, wb);
-  encode_segmentation(cm, xd, wb);
-  {
-    int i;
-    struct segmentation *const seg = &cm->seg;
-    int segment_quantizer_active = 0;
-    for (i = 0; i < MAX_SEGMENTS; i++) {
-      if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) {
-        segment_quantizer_active = 1;
-      }
-    }
+  // Set up the frame refs mapping indexes according to the
+  // frame_refs_short_signaling policy.
+  av1_set_frame_refs(cm, lst_map_idx, gld_map_idx);
 
-    if (cm->delta_q_present_flag)
-      assert(segment_quantizer_active == 0 && cm->base_qindex > 0);
-    if (segment_quantizer_active == 0 && cm->base_qindex > 0) {
-      aom_wb_write_bit(wb, cm->delta_q_present_flag);
-      if (cm->delta_q_present_flag) {
-        aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2);
-        xd->prev_qindex = cm->base_qindex;
-#if CONFIG_EXT_DELTA_Q
-        assert(seg->abs_delta == SEGMENT_DELTADATA);
-        aom_wb_write_bit(wb, cm->delta_lf_present_flag);
-        if (cm->delta_lf_present_flag) {
-          aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2);
-          xd->prev_delta_lf_from_base = 0;
-#if CONFIG_LOOPFILTER_LEVEL
-          aom_wb_write_bit(wb, cm->delta_lf_multi);
-          for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id)
-            xd->prev_delta_lf[lf_id] = 0;
-#endif  // CONFIG_LOOPFILTER_LEVEL
-        }
-#endif  // CONFIG_EXT_DELTA_Q
-      }
+  // We only turn on frame_refs_short_signaling when the encoder side decision
+  // on ref frames is identical to that at the decoder side.
+  for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) {
+    // Compare the buffer index between two reference frames indexed
+    // respectively by the encoder and the decoder side decisions.
+    if (cm->frame_refs[ref_idx].idx != frame_refs_copy[ref_idx].idx) {
+      cm->frame_refs_short_signaling = 0;
+      break;
     }
   }
-#if CONFIG_CDEF
-  if (!cm->all_lossless) {
-    encode_cdef(cm, wb);
-  }
-#endif
-#if CONFIG_LOOP_RESTORATION
-  encode_restoration_mode(cm, wb);
-#endif  // CONFIG_LOOP_RESTORATION
-  write_tx_mode(cm, &cm->tx_mode, wb);
-
-  if (cpi->allow_comp_inter_inter) {
-    const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
-#if !CONFIG_REF_ADAPT
-    const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
-#endif  // !CONFIG_REF_ADAPT
 
-    aom_wb_write_bit(wb, use_hybrid_pred);
-#if !CONFIG_REF_ADAPT
-    if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred);
-#endif  // !CONFIG_REF_ADAPT
+#if 0   // For debug
+  printf("\nFrame=%d: \n", cm->current_video_frame);
+  printf("***frame_refs_short_signaling=%d\n", cm->frame_refs_short_signaling);
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    printf("enc_ref(map_idx=%d, buf_idx=%d)=%d, vs. "
+        "dec_ref(map_idx=%d, buf_idx=%d)=%d\n",
+        get_ref_frame_map_idx(cpi, ref_frame),
+        get_ref_frame_buf_idx(cpi, ref_frame), ref_frame,
+        cm->frame_refs[ref_frame - LAST_FRAME].map_idx,
+        cm->frame_refs[ref_frame - LAST_FRAME].idx, ref_frame);
   }
-  write_compound_tools(cm, wb);
-
-#if CONFIG_EXT_TX
-  aom_wb_write_bit(wb, cm->reduced_tx_set_used);
-#endif  // CONFIG_EXT_TX
-
-#if CONFIG_ADAPT_SCAN
-  aom_wb_write_bit(wb, cm->use_adapt_scan);
-#endif
-
-#if CONFIG_GLOBAL_MOTION
-  if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
-#endif  // CONFIG_GLOBAL_MOTION
+#endif  // 0
 
-  write_tile_info(cm, wb);
+  // Restore the frame refs info if frame_refs_short_signaling is off.
+  if (!cm->frame_refs_short_signaling)
+    memcpy(cm->frame_refs, frame_refs_copy,
+           INTER_REFS_PER_FRAME * sizeof(RefBuffer));
 }
 
-#else
 // New function based on HLS R18
 static void write_uncompressed_header_obu(AV1_COMP *cpi,
+                                          struct aom_write_bit_buffer *saved_wb,
                                           struct aom_write_bit_buffer *wb) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
-#if CONFIG_EXT_TILE
-  aom_wb_write_literal(wb, cm->large_scale_tile, 1);
-#endif  // CONFIG_EXT_TILE
-
-#if CONFIG_EXT_REFS
   // NOTE: By default all coded frames to be used as a reference
   cm->is_reference_frame = 1;
+  cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type;
 
-  if (cm->show_existing_frame) {
-    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
-    const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+  if (cm->seq_params.still_picture) {
+    assert(cm->show_existing_frame == 0);
+    assert(cm->show_frame == 1);
+    assert(cm->frame_type == KEY_FRAME);
+  }
+  if (!cm->seq_params.reduced_still_picture_hdr) {
+    if (cm->show_existing_frame) {
+      RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+      const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
 
-    if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
-                         "Buffer %d does not contain a reconstructed frame",
-                         frame_to_show);
+      if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                           "Buffer %d does not contain a reconstructed frame",
+                           frame_to_show);
+      }
+      ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+
+      aom_wb_write_bit(wb, 1);  // show_existing_frame
+      aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+
+      if (cm->seq_params.decoder_model_info_present_flag &&
+          cm->timing_info.equal_picture_interval == 0) {
+        write_tu_pts_info(cm, wb);
+      }
+      if (cm->seq_params.frame_id_numbers_present_flag) {
+        int frame_id_len = cm->seq_params.frame_id_length;
+        int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+        aom_wb_write_literal(wb, display_frame_id, frame_id_len);
+      }
+
+      if (cm->reset_decoder_state &&
+          frame_bufs[frame_to_show].frame_type != KEY_FRAME) {
+        aom_internal_error(
+            &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+            "show_existing_frame to reset state on KEY_FRAME only");
+      }
+
+      return;
+    } else {
+      aom_wb_write_bit(wb, 0);  // show_existing_frame
     }
-    ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
 
-    aom_wb_write_bit(wb, 1);  // show_existing_frame
-    aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+    aom_wb_write_literal(wb, cm->frame_type, 2);
 
-#if CONFIG_REFERENCE_BUFFER
-    if (cm->seq_params.frame_id_numbers_present_flag) {
-      int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
-      int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
-      aom_wb_write_literal(wb, display_frame_id, frame_id_len);
-      /* Add a zero byte to prevent emulation of superframe marker */
-      /* Same logic as when when terminating the entropy coder */
-      /* Consider to have this logic only one place */
-      aom_wb_write_literal(wb, 0, 8);
+    aom_wb_write_bit(wb, cm->show_frame);
+    if (cm->show_frame) {
+      if (cm->seq_params.decoder_model_info_present_flag &&
+          cm->timing_info.equal_picture_interval == 0)
+        write_tu_pts_info(cm, wb);
+    } else {
+      aom_wb_write_bit(wb, cm->showable_frame);
+    }
+    if (frame_is_sframe(cm)) {
+      assert(cm->error_resilient_mode);
+    } else if (!(cm->frame_type == KEY_FRAME && cm->show_frame)) {
+      aom_wb_write_bit(wb, cm->error_resilient_mode);
     }
-#endif  // CONFIG_REFERENCE_BUFFER
+  }
+  aom_wb_write_bit(wb, cm->disable_cdf_update);
 
-    return;
+  if (cm->seq_params.force_screen_content_tools == 2) {
+    aom_wb_write_bit(wb, cm->allow_screen_content_tools);
   } else {
-#endif  // CONFIG_EXT_REFS
-    aom_wb_write_bit(wb, 0);  // show_existing_frame
-#if CONFIG_EXT_REFS
+    assert(cm->allow_screen_content_tools ==
+           cm->seq_params.force_screen_content_tools);
   }
-#endif  // CONFIG_EXT_REFS
 
-  cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type;
-  aom_wb_write_literal(wb, cm->frame_type, 2);
+  if (cm->allow_screen_content_tools) {
+    if (cm->seq_params.force_integer_mv == 2) {
+      aom_wb_write_bit(wb, cm->cur_frame_force_integer_mv);
+    } else {
+      assert(cm->cur_frame_force_integer_mv == cm->seq_params.force_integer_mv);
+    }
+  } else {
+    assert(cm->cur_frame_force_integer_mv == 0);
+  }
 
-  if (cm->intra_only) cm->frame_type = INTRA_ONLY_FRAME;
+  cm->invalid_delta_frame_id_minus_1 = 0;
+  int frame_size_override_flag = 0;
+  cm->frame_refs_short_signaling = 0;
 
-  aom_wb_write_bit(wb, cm->show_frame);
-  aom_wb_write_bit(wb, cm->error_resilient_mode);
+  if (cm->seq_params.reduced_still_picture_hdr) {
+    assert(cm->width == cm->seq_params.max_frame_width &&
+           cm->height == cm->seq_params.max_frame_height);
+  } else {
+    if (cm->seq_params.frame_id_numbers_present_flag) {
+      int frame_id_len = cm->seq_params.frame_id_length;
+      aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
+    }
 
-#if CONFIG_REFERENCE_BUFFER
-  cm->invalid_delta_frame_id_minus1 = 0;
-  if (cm->seq_params.frame_id_numbers_present_flag) {
-    int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
-    aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
+    if (cm->width > cm->seq_params.max_frame_width ||
+        cm->height > cm->seq_params.max_frame_height) {
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Frame dimensions are larger than the maximum values");
+    }
+
+    frame_size_override_flag =
+        frame_is_sframe(cm) ? 1
+                            : (cm->width != cm->seq_params.max_frame_width ||
+                               cm->height != cm->seq_params.max_frame_height);
+    if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag);
+
+    if (cm->seq_params.enable_order_hint)
+      aom_wb_write_literal(wb, cm->frame_offset,
+                           cm->seq_params.order_hint_bits_minus_1 + 1);
+
+    if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
+      aom_wb_write_literal(wb, cm->primary_ref_frame, PRIMARY_REF_BITS);
+    }
+  }
+
+  if (cm->seq_params.decoder_model_info_present_flag) {
+    aom_wb_write_bit(wb, cm->buffer_removal_delay_present);
+    if (cm->buffer_removal_delay_present) {
+      for (int op_num = 0;
+           op_num < cm->seq_params.operating_points_cnt_minus_1 + 1; op_num++) {
+        if (cm->op_params[op_num].decoder_model_param_present_flag) {
+          if (((cm->seq_params.operating_point_idc[op_num] >>
+                cm->temporal_layer_id) &
+                   0x1 &&
+               (cm->seq_params.operating_point_idc[op_num] >>
+                (cm->spatial_layer_id + 8)) &
+                   0x1) ||
+              cm->seq_params.operating_point_idc[op_num] == 0) {
+            aom_wb_write_literal(
+                wb, (uint32_t)cm->op_frame_timing[op_num].buffer_removal_delay,
+                cm->buffer_model.buffer_removal_delay_length);
+            cm->op_frame_timing[op_num].buffer_removal_delay++;
+          }
+        }
+      }
+    }
   }
-#endif  // CONFIG_REFERENCE_BUFFER
+  cpi->refresh_frame_mask = get_refresh_mask(cpi);
   if (cm->frame_type == KEY_FRAME) {
-    write_frame_size(cm, wb);
-    write_sb_size(cm, wb);
-
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-    assert(cpi->common.ans_window_size_log2 >= 8);
-    assert(cpi->common.ans_window_size_log2 < 24);
-    aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4);
-#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-    aom_wb_write_bit(wb, cm->allow_screen_content_tools);
-#if CONFIG_AMVR
-    if (cm->allow_screen_content_tools) {
-      if (cm->seq_mv_precision_level == 2) {
-        aom_wb_write_bit(wb, 1);
-      } else {
-        aom_wb_write_bit(wb, 0);
-        aom_wb_write_bit(wb, cm->seq_mv_precision_level == 0);
-      }
+    if (!cm->show_frame) {  // unshown keyframe (forward keyframe)
+      aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+    } else {
+      assert(cpi->refresh_frame_mask == 0xFF);
     }
-#endif
-  } else if (cm->frame_type == INTRA_ONLY_FRAME) {
-    if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools);
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-    if (!cm->error_resilient_mode) {
-      if (cm->intra_only) {
-        aom_wb_write_bit(wb,
-                         cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
+  } else {
+    if (cm->frame_type == INTRA_ONLY_FRAME) {
+      assert(cpi->refresh_frame_mask != 0xFF);
+      int updated_fb = -1;
+      for (int i = 0; i < REF_FRAMES; i++) {
+        // If more than one frame is refreshed, it doesn't matter which one
+        // we pick, so pick the first.
+        if (cpi->refresh_frame_mask & (1 << i)) {
+          updated_fb = i;
+          break;
+        }
       }
-    }
-#endif
-#if CONFIG_EXT_REFS
-    cpi->refresh_frame_mask = get_refresh_mask(cpi);
-#endif  // CONFIG_EXT_REFS
-
-    if (cm->intra_only) {
-#if CONFIG_EXT_REFS
+      assert(updated_fb >= 0);
+      cm->fb_of_context_type[cm->frame_context_idx] = updated_fb;
       aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
-#else
-      aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
-#endif  // CONFIG_EXT_REFS
-      write_frame_size(cm, wb);
-
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-      assert(cpi->common.ans_window_size_log2 >= 8);
-      assert(cpi->common.ans_window_size_log2 < 24);
-      aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4);
-#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-    }
-  } else if (cm->frame_type == INTER_FRAME) {
-    MV_REFERENCE_FRAME ref_frame;
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-    if (!cm->error_resilient_mode) {
-      aom_wb_write_bit(wb, cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE);
-      if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE)
-        aom_wb_write_bit(wb,
-                         cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
-    }
-#endif
+    } else if (cm->frame_type == INTER_FRAME || frame_is_sframe(cm)) {
+      if (cm->frame_type == INTER_FRAME) {
+        aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+      } else {
+        assert(frame_is_sframe(cm) && cpi->refresh_frame_mask == 0xFF);
+      }
+      int updated_fb = -1;
+      for (int i = 0; i < REF_FRAMES; i++) {
+        // If more than one frame is refreshed, it doesn't matter which one
+        // we pick, so pick the first.
+        if (cpi->refresh_frame_mask & (1 << i)) {
+          updated_fb = i;
+          break;
+        }
+      }
+      // large scale tile sometimes won't refresh any fbs
+      if (updated_fb >= 0) {
+        cm->fb_of_context_type[cm->frame_context_idx] = updated_fb;
+      }
 
-#if CONFIG_EXT_REFS
-    cpi->refresh_frame_mask = get_refresh_mask(cpi);
-    aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
-#else
-    aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
-#endif  // CONFIG_EXT_REFS
-
-#if CONFIG_EXT_REFS
-    if (!cpi->refresh_frame_mask) {
-      // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
-      //       will not be used as a reference
-      cm->is_reference_frame = 0;
-    }
-#endif  // CONFIG_EXT_REFS
-
-    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-      assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
-      aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
-                           REF_FRAMES_LOG2);
-#if !CONFIG_FRAME_SIGN_BIAS
-      aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
-#endif  // !CONFIG_FRAME_SIGN_BIAS
-#if CONFIG_REFERENCE_BUFFER
-      if (cm->seq_params.frame_id_numbers_present_flag) {
-        int i = get_ref_frame_map_idx(cpi, ref_frame);
-        int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
-        int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2;
-        int delta_frame_id_minus1 =
-            ((cm->current_frame_id - cm->ref_frame_id[i] +
-              (1 << frame_id_len)) %
-             (1 << frame_id_len)) -
-            1;
-        if (delta_frame_id_minus1 < 0 ||
-            delta_frame_id_minus1 >= (1 << diff_len))
-          cm->invalid_delta_frame_id_minus1 = 1;
-        aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len);
+      if (!cpi->refresh_frame_mask) {
+        // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
+        //       will not be used as a reference
+        cm->is_reference_frame = 0;
       }
-#endif  // CONFIG_REFERENCE_BUFFER
     }
+  }
 
-#if CONFIG_FRAME_SIZE
-    if (cm->error_resilient_mode == 0) {
-      write_frame_size_with_refs(cpi, wb);
-    } else {
-      write_frame_size(cm, wb);
-    }
-#else
-    write_frame_size_with_refs(cpi, wb);
-#endif
+  if (!frame_is_intra_only(cm) || cpi->refresh_frame_mask != 0xFF) {
+    // Write all ref frame order hints if error_resilient_mode == 1
+    if (cm->error_resilient_mode && cm->seq_params.enable_order_hint) {
+      RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+      for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+        // Get buffer index
+        const int buf_idx = cm->ref_frame_map[ref_idx];
+        assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS);
 
-#if CONFIG_AMVR
-    if (cm->seq_mv_precision_level == 2) {
-      aom_wb_write_bit(wb, cm->cur_frame_mv_precision_level == 0);
+        // Write order hint to bit stream
+        aom_wb_write_literal(wb, frame_bufs[buf_idx].cur_frame_offset,
+                             cm->seq_params.order_hint_bits_minus_1 + 1);
+      }
     }
-#endif
-    aom_wb_write_bit(wb, cm->allow_high_precision_mv);
+  }
 
-    fix_interp_filter(cm, cpi->td.counts);
-    write_frame_interp_filter(cm->interp_filter, wb);
-#if CONFIG_TEMPMV_SIGNALING
-    if (frame_might_use_prev_frame_mvs(cm)) {
-      aom_wb_write_bit(wb, cm->use_prev_frame_mvs);
-    }
-#endif
-  } else if (cm->frame_type == S_FRAME) {
-    MV_REFERENCE_FRAME ref_frame;
-
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-    if (!cm->error_resilient_mode) {
-      aom_wb_write_bit(wb, cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE);
-      if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE)
-        aom_wb_write_bit(wb,
-                         cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
-    }
-#endif
+  if (cm->frame_type == KEY_FRAME) {
+    write_frame_size(cm, frame_size_override_flag, wb);
+    assert(!av1_superres_scaled(cm) || !cm->allow_intrabc);
+    if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
+      aom_wb_write_bit(wb, cm->allow_intrabc);
+    // all eight fbs are refreshed, pick one that will live long enough
+    cm->fb_of_context_type[REGULAR_FRAME] = 0;
+  } else {
+    if (cm->frame_type == INTRA_ONLY_FRAME) {
+      write_frame_size(cm, frame_size_override_flag, wb);
+      assert(!av1_superres_scaled(cm) || !cm->allow_intrabc);
+      if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
+        aom_wb_write_bit(wb, cm->allow_intrabc);
+    } else if (cm->frame_type == INTER_FRAME || frame_is_sframe(cm)) {
+      MV_REFERENCE_FRAME ref_frame;
 
-#if CONFIG_EXT_REFS
-    if (!cpi->refresh_frame_mask) {
-      // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
-      //       will not be used as a reference
-      cm->is_reference_frame = 0;
-    }
-#endif  // CONFIG_EXT_REFS
-
-    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-      assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
-      aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
-                           REF_FRAMES_LOG2);
-      assert(cm->ref_frame_sign_bias[ref_frame] == 0);
-#if CONFIG_REFERENCE_BUFFER
-      if (cm->seq_params.frame_id_numbers_present_flag) {
-        int i = get_ref_frame_map_idx(cpi, ref_frame);
-        int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
-        int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2;
-        int delta_frame_id_minus1 =
-            ((cm->current_frame_id - cm->ref_frame_id[i] +
-              (1 << frame_id_len)) %
-             (1 << frame_id_len)) -
-            1;
-        if (delta_frame_id_minus1 < 0 ||
-            delta_frame_id_minus1 >= (1 << diff_len))
-          cm->invalid_delta_frame_id_minus1 = 1;
-        aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len);
+      // NOTE: Error resilient mode turns off frame_refs_short_signaling
+      //       automatically.
+#define FRAME_REFS_SHORT_SIGNALING 0
+#if FRAME_REFS_SHORT_SIGNALING
+      cm->frame_refs_short_signaling = cm->seq_params.enable_order_hint;
+#endif  // FRAME_REFS_SHORT_SIGNALING
+
+      if (cm->frame_refs_short_signaling) {
+        // NOTE(zoeliu@google.com):
+        //   An example solution for encoder-side implementation on frame refs
+        //   short signaling, which is only turned on when the encoder side
+        //   decision on ref frames is identical to that at the decoder side.
+        check_frame_refs_short_signaling(cpi);
       }
-#endif  // CONFIG_REFERENCE_BUFFER
-    }
 
-#if CONFIG_FRAME_SIZE
-    if (cm->error_resilient_mode == 0) {
-      write_frame_size_with_refs(cpi, wb);
-    } else {
-      write_frame_size(cm, wb);
-    }
-#else
-    write_frame_size_with_refs(cpi, wb);
-#endif
+      if (cm->seq_params.enable_order_hint)
+        aom_wb_write_bit(wb, cm->frame_refs_short_signaling);
 
-    aom_wb_write_bit(wb, cm->allow_high_precision_mv);
+      if (cm->frame_refs_short_signaling) {
+        const int lst_ref = get_ref_frame_map_idx(cpi, LAST_FRAME);
+        aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2);
 
-    fix_interp_filter(cm, cpi->td.counts);
-    write_frame_interp_filter(cm->interp_filter, wb);
-#if CONFIG_TEMPMV_SIGNALING
-    if (frame_might_use_prev_frame_mvs(cm)) {
-      aom_wb_write_bit(wb, cm->use_prev_frame_mvs);
-    }
-#endif
-  }
+        const int gld_ref = get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+        aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2);
+      }
 
-#if CONFIG_MFMV
-  if (cm->show_frame == 0) {
-    int arf_offset = AOMMIN(
-        (MAX_GF_INTERVAL - 1),
-        cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]);
-#if CONFIG_EXT_REFS
-    int brf_offset =
-        cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index];
+      for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+        assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+        if (!cm->frame_refs_short_signaling)
+          aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+                               REF_FRAMES_LOG2);
+        if (cm->seq_params.frame_id_numbers_present_flag) {
+          int i = get_ref_frame_map_idx(cpi, ref_frame);
+          int frame_id_len = cm->seq_params.frame_id_length;
+          int diff_len = cm->seq_params.delta_frame_id_length;
+          int delta_frame_id_minus_1 =
+              ((cm->current_frame_id - cm->ref_frame_id[i] +
+                (1 << frame_id_len)) %
+               (1 << frame_id_len)) -
+              1;
+          if (delta_frame_id_minus_1 < 0 ||
+              delta_frame_id_minus_1 >= (1 << diff_len))
+            cm->invalid_delta_frame_id_minus_1 = 1;
+          aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len);
+        }
+      }
 
-    arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
-#endif
-    aom_wb_write_literal(wb, arf_offset, 4);
-  }
-#endif
+      if (!cm->error_resilient_mode && frame_size_override_flag) {
+        write_frame_size_with_refs(cpi, wb);
+      } else {
+        write_frame_size(cm, frame_size_override_flag, wb);
+      }
 
-#if CONFIG_REFERENCE_BUFFER
-  if (cm->seq_params.frame_id_numbers_present_flag) {
-    cm->refresh_mask =
-        cm->frame_type == KEY_FRAME ? 0xFF : get_refresh_mask(cpi);
+      if (cm->cur_frame_force_integer_mv) {
+        cm->allow_high_precision_mv = 0;
+      } else {
+        aom_wb_write_bit(wb, cm->allow_high_precision_mv);
+      }
+      fix_interp_filter(cm, cpi->td.counts);
+      write_frame_interp_filter(cm->interp_filter, wb);
+      aom_wb_write_bit(wb, cm->switchable_motion_mode);
+      if (frame_might_allow_ref_frame_mvs(cm)) {
+        aom_wb_write_bit(wb, cm->allow_ref_frame_mvs);
+      } else {
+        assert(cm->allow_ref_frame_mvs == 0);
+      }
+    }
   }
-#endif  // CONFIG_REFERENCE_BUFFER
 
-  if (!cm->error_resilient_mode) {
+  const int might_bwd_adapt =
+      !(cm->seq_params.reduced_still_picture_hdr) && !(cm->disable_cdf_update);
+  if (cm->large_scale_tile)
+    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+  if (might_bwd_adapt) {
     aom_wb_write_bit(
-        wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD);
+        wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
   }
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
-#endif
-  encode_loopfilter(cm, wb);
+
+  write_tile_info(cm, saved_wb, wb);
   encode_quantization(cm, wb);
   encode_segmentation(cm, xd, wb);
-  {
-    int i;
-    struct segmentation *const seg = &cm->seg;
-    int segment_quantizer_active = 0;
-    for (i = 0; i < MAX_SEGMENTS; i++) {
-      if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) {
-        segment_quantizer_active = 1;
-      }
-    }
 
-    if (cm->delta_q_present_flag)
-      assert(segment_quantizer_active == 0 && cm->base_qindex > 0);
-    if (segment_quantizer_active == 0 && cm->base_qindex > 0) {
-      aom_wb_write_bit(wb, cm->delta_q_present_flag);
-      if (cm->delta_q_present_flag) {
-        aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2);
-        xd->prev_qindex = cm->base_qindex;
-#if CONFIG_EXT_DELTA_Q
-        assert(seg->abs_delta == SEGMENT_DELTADATA);
+  if (cm->delta_q_present_flag) assert(cm->base_qindex > 0);
+  if (cm->base_qindex > 0) {
+    aom_wb_write_bit(wb, cm->delta_q_present_flag);
+    if (cm->delta_q_present_flag) {
+      aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2);
+      xd->current_qindex = cm->base_qindex;
+      if (cm->allow_intrabc)
+        assert(cm->delta_lf_present_flag == 0);
+      else
         aom_wb_write_bit(wb, cm->delta_lf_present_flag);
-        if (cm->delta_lf_present_flag) {
-          aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2);
-#if CONFIG_LOOPFILTER_LEVEL
-          for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id)
-            xd->prev_delta_lf[lf_id] = 0;
-#endif  // CONFIG_LOOPFILTER_LEVEL
-          xd->prev_delta_lf_from_base = 0;
-        }
-#endif  // CONFIG_EXT_DELTA_Q
+      if (cm->delta_lf_present_flag) {
+        aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2);
+        aom_wb_write_bit(wb, cm->delta_lf_multi);
+        av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
       }
     }
   }
-#if CONFIG_CDEF
-  if (!cm->all_lossless) {
-    encode_cdef(cm, wb);
+
+  if (cm->all_lossless) {
+    assert(!av1_superres_scaled(cm));
+  } else {
+    if (!cm->coded_lossless) {
+      encode_loopfilter(cm, wb);
+      encode_cdef(cm, wb);
+    }
+    encode_restoration_mode(cm, wb);
   }
-#endif
-#if CONFIG_LOOP_RESTORATION
-  encode_restoration_mode(cm, wb);
-#endif  // CONFIG_LOOP_RESTORATION
+
   write_tx_mode(cm, &cm->tx_mode, wb);
 
   if (cpi->allow_comp_inter_inter) {
     const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
-#if !CONFIG_REF_ADAPT
-    const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
-#endif  // !CONFIG_REF_ADAPT
 
     aom_wb_write_bit(wb, use_hybrid_pred);
-#if !CONFIG_REF_ADAPT
-    if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred);
-#endif  // !CONFIG_REF_ADAPT
   }
-  write_compound_tools(cm, wb);
-
-#if CONFIG_EXT_TX
-  aom_wb_write_bit(wb, cm->reduced_tx_set_used);
-#endif  // CONFIG_EXT_TX
-
-#if CONFIG_GLOBAL_MOTION
-  if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
-#endif  // CONFIG_GLOBAL_MOTION
-
-  write_tile_info(cm, wb);
-}
-#endif  // CONFIG_OBU
-
-static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) {
-  AV1_COMMON *const cm = &cpi->common;
-#if CONFIG_SUPERTX
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-#endif  // CONFIG_SUPERTX
-  FRAME_CONTEXT *const fc = cm->fc;
-  aom_writer *header_bc;
-  int i;
-#if !CONFIG_NEW_MULTISYMBOL
-  FRAME_COUNTS *counts = cpi->td.counts;
-  int j;
-#endif
-
-  const int probwt = cm->num_tg;
-  (void)probwt;
-  (void)i;
-  (void)fc;
-
-  aom_writer real_header_bc;
-  header_bc = &real_header_bc;
-#if CONFIG_ANS
-  header_bc->size = 1 << cpi->common.ans_window_size_log2;
-#endif
-  aom_start_encode(header_bc, data);
 
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-  if (cm->tx_mode == TX_MODE_SELECT)
-    av1_cond_prob_diff_update(header_bc, &cm->fc->quarter_tx_size_prob,
-                              cm->counts.quarter_tx_size, probwt);
-#endif
-#if CONFIG_LV_MAP
-  av1_write_txb_probs(cpi, header_bc);
-#endif  // CONFIG_LV_MAP
+  if (cm->is_skip_mode_allowed) aom_wb_write_bit(wb, cm->skip_mode_flag);
 
-#if CONFIG_VAR_TX && !CONFIG_NEW_MULTISYMBOL
-  if (cm->tx_mode == TX_MODE_SELECT)
-    update_txfm_partition_probs(cm, header_bc, counts, probwt);
-#endif
-
-#if !CONFIG_NEW_MULTISYMBOL
-  update_skip_probs(cm, header_bc, counts);
-#endif
-
-  if (!frame_is_intra_only(cm)) {
-#if !CONFIG_NEW_MULTISYMBOL
-    update_inter_mode_probs(cm, header_bc, counts);
-#endif
-#if CONFIG_INTERINTRA
-    if (cm->reference_mode != COMPOUND_REFERENCE &&
-        cm->allow_interintra_compound) {
-#if !CONFIG_NEW_MULTISYMBOL
-      for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
-        if (is_interintra_allowed_bsize_group(i)) {
-          av1_cond_prob_diff_update(header_bc, &fc->interintra_prob[i],
-                                    cm->counts.interintra[i], probwt);
-        }
-      }
-#endif
-#if CONFIG_WEDGE && !CONFIG_NEW_MULTISYMBOL
-#if CONFIG_EXT_PARTITION_TYPES
-      int block_sizes_to_update = BLOCK_SIZES_ALL;
-#else
-      int block_sizes_to_update = BLOCK_SIZES;
-#endif
-      for (i = 0; i < block_sizes_to_update; i++) {
-        if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
-          av1_cond_prob_diff_update(header_bc, &fc->wedge_interintra_prob[i],
-                                    cm->counts.wedge_interintra[i], probwt);
-      }
-#endif  // CONFIG_WEDGE && CONFIG_NEW_MULTISYMBOL
-    }
-#endif  // CONFIG_INTERINTRA
-
-#if !CONFIG_NEW_MULTISYMBOL
-    for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-      av1_cond_prob_diff_update(header_bc, &fc->intra_inter_prob[i],
-                                counts->intra_inter[i], probwt);
-#endif
+  if (frame_might_allow_warped_motion(cm))
+    aom_wb_write_bit(wb, cm->allow_warped_motion);
+  else
+    assert(!cm->allow_warped_motion);
 
-#if !CONFIG_NEW_MULTISYMBOL
-    if (cpi->allow_comp_inter_inter) {
-      const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
-      if (use_hybrid_pred)
-        for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-          av1_cond_prob_diff_update(header_bc, &fc->comp_inter_prob[i],
-                                    counts->comp_inter[i], probwt);
-    }
+  aom_wb_write_bit(wb, cm->reduced_tx_set_used);
 
-    if (cm->reference_mode != COMPOUND_REFERENCE) {
-      for (i = 0; i < REF_CONTEXTS; i++) {
-        for (j = 0; j < (SINGLE_REFS - 1); j++) {
-          av1_cond_prob_diff_update(header_bc, &fc->single_ref_prob[i][j],
-                                    counts->single_ref[i][j], probwt);
-        }
-      }
-    }
+  if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
 
-    if (cm->reference_mode != SINGLE_REFERENCE) {
-#if CONFIG_EXT_COMP_REFS
-      for (i = 0; i < COMP_REF_TYPE_CONTEXTS; i++)
-        av1_cond_prob_diff_update(header_bc, &fc->comp_ref_type_prob[i],
-                                  counts->comp_ref_type[i], probwt);
-
-      for (i = 0; i < UNI_COMP_REF_CONTEXTS; i++)
-        for (j = 0; j < (UNIDIR_COMP_REFS - 1); j++)
-          av1_cond_prob_diff_update(header_bc, &fc->uni_comp_ref_prob[i][j],
-                                    counts->uni_comp_ref[i][j], probwt);
-#endif  // CONFIG_EXT_COMP_REFS
-
-      for (i = 0; i < REF_CONTEXTS; i++) {
-#if CONFIG_EXT_REFS
-        for (j = 0; j < (FWD_REFS - 1); j++) {
-          av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
-                                    counts->comp_ref[i][j], probwt);
-        }
-        for (j = 0; j < (BWD_REFS - 1); j++) {
-          av1_cond_prob_diff_update(header_bc, &fc->comp_bwdref_prob[i][j],
-                                    counts->comp_bwdref[i][j], probwt);
-        }
-#else
-        for (j = 0; j < (COMP_REFS - 1); j++) {
-          av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
-                                    counts->comp_ref[i][j], probwt);
-        }
-#endif  // CONFIG_EXT_REFS
-      }
+  if (cm->film_grain_params_present && (cm->show_frame || cm->showable_frame)) {
+    int flip_back_update_parameters_flag = 0;
+    if (cm->frame_type != INTER_FRAME &&
+        cm->film_grain_params.update_parameters == 0) {
+      cm->film_grain_params.update_parameters = 1;
+      flip_back_update_parameters_flag = 1;
     }
-#endif  // CONFIG_NEW_MULTISYMBOL
-
-#if CONFIG_COMPOUND_SINGLEREF
-    for (i = 0; i < COMP_INTER_MODE_CONTEXTS; i++)
-      av1_cond_prob_diff_update(header_bc, &fc->comp_inter_mode_prob[i],
-                                counts->comp_inter_mode[i], probwt);
-#endif  // CONFIG_COMPOUND_SINGLEREF
+    write_film_grain_params(cpi, wb);
 
-#if !CONFIG_NEW_MULTISYMBOL
-    av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc, counts->mv);
-#endif
-#if CONFIG_SUPERTX
-    if (!xd->lossless[0]) update_supertx_probs(cm, probwt, header_bc);
-#endif  // CONFIG_SUPERTX
+    if (flip_back_update_parameters_flag)
+      cm->film_grain_params.update_parameters = 0;
   }
-  aom_stop_encode(header_bc);
-  assert(header_bc->pos <= 0xffff);
-  return header_bc->pos;
+
+  if (cm->large_scale_tile) write_ext_tile_info(cm, saved_wb, wb);
 }
 
-#if !CONFIG_OBU || CONFIG_EXT_TILE
 static int choose_size_bytes(uint32_t size, int spare_msbs) {
   // Choose the number of bytes required to represent size, without
   // using the 'spare_msbs' number of most significant bits.
@@ -5394,116 +3337,112 @@ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
   int tsb;
   int tcsb;
 
-#if CONFIG_EXT_TILE
   if (cm->large_scale_tile) {
     // The top bit in the tile size field indicates tile copy mode, so we
     // have 1 less bit to code the tile size
     tsb = choose_size_bytes(max_tile_size, 1);
     tcsb = choose_size_bytes(max_tile_col_size, 0);
   } else {
-#endif  // CONFIG_EXT_TILE
     tsb = choose_size_bytes(max_tile_size, 0);
     tcsb = 4;  // This is ignored
     (void)max_tile_col_size;
-#if CONFIG_EXT_TILE
   }
-#endif  // CONFIG_EXT_TILE
 
   assert(tsb > 0);
   assert(tcsb > 0);
 
   *tile_size_bytes = tsb;
   *tile_col_size_bytes = tcsb;
+  if (tsb == 4 && tcsb == 4) return data_size;
 
-  if (tsb == 4 && tcsb == 4) {
-    return data_size;
-  } else {
-    uint32_t wpos = 0;
-    uint32_t rpos = 0;
-
-#if CONFIG_EXT_TILE
-    if (cm->large_scale_tile) {
-      int tile_row;
-      int tile_col;
-
-      for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
-        // All but the last column has a column header
-        if (tile_col < cm->tile_cols - 1) {
-          uint32_t tile_col_size = mem_get_le32(dst + rpos);
-          rpos += 4;
-
-          // Adjust the tile column size by the number of bytes removed
-          // from the tile size fields.
-          tile_col_size -= (4 - tsb) * cm->tile_rows;
-
-          mem_put_varsize(dst + wpos, tcsb, tile_col_size);
-          wpos += tcsb;
-        }
+  uint32_t wpos = 0;
+  uint32_t rpos = 0;
 
-        for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
-          // All, including the last row has a header
-          uint32_t tile_header = mem_get_le32(dst + rpos);
-          rpos += 4;
-
-          // If this is a copy tile, we need to shift the MSB to the
-          // top bit of the new width, and there is no data to copy.
-          if (tile_header >> 31 != 0) {
-            if (tsb < 4) tile_header >>= 32 - 8 * tsb;
-            mem_put_varsize(dst + wpos, tsb, tile_header);
-            wpos += tsb;
-          } else {
-            mem_put_varsize(dst + wpos, tsb, tile_header);
-            wpos += tsb;
+  if (cm->large_scale_tile) {
+    int tile_row;
+    int tile_col;
 
-            memmove(dst + wpos, dst + rpos, tile_header);
-            rpos += tile_header;
-            wpos += tile_header;
-          }
-        }
+    for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+      // All but the last column has a column header
+      if (tile_col < cm->tile_cols - 1) {
+        uint32_t tile_col_size = mem_get_le32(dst + rpos);
+        rpos += 4;
+
+        // Adjust the tile column size by the number of bytes removed
+        // from the tile size fields.
+        tile_col_size -= (4 - tsb) * cm->tile_rows;
+
+        mem_put_varsize(dst + wpos, tcsb, tile_col_size);
+        wpos += tcsb;
       }
-    } else {
-#endif  // CONFIG_EXT_TILE
-      const int n_tiles = cm->tile_cols * cm->tile_rows;
-      int n;
 
-      for (n = 0; n < n_tiles; n++) {
-        int tile_size;
+      for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+        // All, including the last row has a header
+        uint32_t tile_header = mem_get_le32(dst + rpos);
+        rpos += 4;
 
-        if (n == n_tiles - 1) {
-          tile_size = data_size - rpos;
+        // If this is a copy tile, we need to shift the MSB to the
+        // top bit of the new width, and there is no data to copy.
+        if (tile_header >> 31 != 0) {
+          if (tsb < 4) tile_header >>= 32 - 8 * tsb;
+          mem_put_varsize(dst + wpos, tsb, tile_header);
+          wpos += tsb;
         } else {
-          tile_size = mem_get_le32(dst + rpos);
-          rpos += 4;
-          mem_put_varsize(dst + wpos, tsb, tile_size);
+          mem_put_varsize(dst + wpos, tsb, tile_header);
           wpos += tsb;
-        }
 
-        memmove(dst + wpos, dst + rpos, tile_size);
-
-        rpos += tile_size;
-        wpos += tile_size;
+          tile_header += AV1_MIN_TILE_SIZE_BYTES;
+          memmove(dst + wpos, dst + rpos, tile_header);
+          rpos += tile_header;
+          wpos += tile_header;
+        }
       }
-#if CONFIG_EXT_TILE
     }
-#endif  // CONFIG_EXT_TILE
 
     assert(rpos > wpos);
     assert(rpos == data_size);
 
     return wpos;
   }
+  const int n_tiles = cm->tile_cols * cm->tile_rows;
+  int n;
+
+  for (n = 0; n < n_tiles; n++) {
+    int tile_size;
+
+    if (n == n_tiles - 1) {
+      tile_size = data_size - rpos;
+    } else {
+      tile_size = mem_get_le32(dst + rpos);
+      rpos += 4;
+      mem_put_varsize(dst + wpos, tsb, tile_size);
+      tile_size += AV1_MIN_TILE_SIZE_BYTES;
+      wpos += tsb;
+    }
+
+    memmove(dst + wpos, dst + rpos, tile_size);
+
+    rpos += tile_size;
+    wpos += tile_size;
+  }
+
+  assert(rpos > wpos);
+  assert(rpos == data_size);
+
+  return wpos;
 }
-#endif
 
-#if CONFIG_OBU
-static uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
-                                 uint8_t *const dst) {
+uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
+                          uint8_t *const dst) {
   struct aom_write_bit_buffer wb = { dst, 0 };
   uint32_t size = 0;
 
-  aom_wb_write_literal(&wb, (int)obu_type, 5);
-  aom_wb_write_literal(&wb, 0, 2);
+  aom_wb_write_literal(&wb, 0, 1);  // forbidden bit.
+  aom_wb_write_literal(&wb, (int)obu_type, 4);
   aom_wb_write_literal(&wb, obu_extension ? 1 : 0, 1);
+  aom_wb_write_literal(&wb, 1, 1);  // obu_has_payload_length_field
+  aom_wb_write_literal(&wb, 0, 1);  // reserved
+
   if (obu_extension) {
     aom_wb_write_literal(&wb, obu_extension & 0xFF, 8);
   }
@@ -5512,87 +3451,156 @@ static uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
   return size;
 }
 
-static uint32_t write_temporal_delimiter_obu() { return 0; }
+int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size,
+                        uint8_t *dest) {
+  const uint32_t obu_size = obu_payload_size;
+  const uint32_t offset = obu_header_size;
+  size_t coded_obu_size = 0;
 
-static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
-  AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
-  struct aom_write_bit_buffer wb = { dst, 0 };
-  uint32_t size = 0;
+  if (aom_uleb_encode(obu_size, sizeof(obu_size), dest + offset,
+                      &coded_obu_size) != 0) {
+    return AOM_CODEC_ERROR;
+  }
 
-  write_profile(cm->profile, &wb);
+  return AOM_CODEC_OK;
+}
 
-  aom_wb_write_literal(&wb, 0, 4);
+static size_t obu_memmove(uint32_t obu_header_size, uint32_t obu_payload_size,
+                          uint8_t *data) {
+  const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size);
+  const uint32_t move_dst_offset =
+      (uint32_t)length_field_size + obu_header_size;
+  const uint32_t move_src_offset = obu_header_size;
+  const uint32_t move_size = obu_payload_size;
+  memmove(data + move_dst_offset, data + move_src_offset, move_size);
+  return length_field_size;
+}
 
-  seq_params->frame_id_numbers_present_flag = FRAME_ID_NUMBERS_PRESENT_FLAG;
-  aom_wb_write_literal(&wb, seq_params->frame_id_numbers_present_flag, 1);
-  if (seq_params->frame_id_numbers_present_flag) {
-    seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7;
-    seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2;
-    aom_wb_write_literal(&wb, seq_params->frame_id_length_minus7, 4);
-    aom_wb_write_literal(&wb, seq_params->delta_frame_id_length_minus2, 4);
+static void add_trailing_bits(struct aom_write_bit_buffer *wb) {
+  if (aom_wb_is_byte_aligned(wb)) {
+    aom_wb_write_literal(wb, 0x80, 8);
+  } else {
+    // assumes that the other bits are already 0s
+    aom_wb_write_bit(wb, 1);
   }
+}
 
-  // color_config
-  write_bitdepth_colorspace_sampling(cm, &wb);
-
-  size = aom_wb_bytes_written(&wb);
-  return size;
+static void write_bitstream_level(BitstreamLevel bl,
+                                  struct aom_write_bit_buffer *wb) {
+  uint8_t seq_level_idx = major_minor_to_seq_level_idx(bl);
+  assert(is_valid_seq_level_idx(seq_level_idx));
+  aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS);
 }
 
-static uint32_t write_frame_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
+static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
   AV1_COMMON *const cm = &cpi->common;
   struct aom_write_bit_buffer wb = { dst, 0 };
-  uint32_t total_size = 0;
-  uint32_t compressed_hdr_size, uncompressed_hdr_size;
+  uint32_t size = 0;
 
-  write_uncompressed_header_obu(cpi, &wb);
+  write_profile(cm->profile, &wb);
 
-  if (cm->show_existing_frame) {
-    total_size = aom_wb_bytes_written(&wb);
-    return total_size;
+  // Still picture or not
+  aom_wb_write_bit(&wb, cm->seq_params.still_picture);
+  assert(IMPLIES(!cm->seq_params.still_picture,
+                 !cm->seq_params.reduced_still_picture_hdr));
+  // whether to use reduced still picture header
+  aom_wb_write_bit(&wb, cm->seq_params.reduced_still_picture_hdr);
+
+  if (cm->seq_params.reduced_still_picture_hdr) {
+    assert(cm->timing_info_present == 0);
+    assert(cm->seq_params.decoder_model_info_present_flag == 0);
+    assert(cm->seq_params.display_model_info_present_flag == 0);
+    write_bitstream_level(cm->seq_params.level[0], &wb);
+  } else {
+    aom_wb_write_bit(&wb, cm->timing_info_present);  // timing info present flag
+
+    if (cm->timing_info_present) {
+      // timing_info
+      write_timing_info_header(cm, &wb);
+      aom_wb_write_bit(&wb, cm->seq_params.decoder_model_info_present_flag);
+      if (cm->seq_params.decoder_model_info_present_flag) {
+        write_decoder_model_info(cm, &wb);
+      }
+    }
+    aom_wb_write_bit(&wb, cm->seq_params.display_model_info_present_flag);
+    aom_wb_write_literal(&wb, cm->seq_params.operating_points_cnt_minus_1,
+                         OP_POINTS_CNT_MINUS_1_BITS);
+    int i;
+    for (i = 0; i < cm->seq_params.operating_points_cnt_minus_1 + 1; i++) {
+      aom_wb_write_literal(&wb, cm->seq_params.operating_point_idc[i],
+                           OP_POINTS_IDC_BITS);
+      write_bitstream_level(cm->seq_params.level[i], &wb);
+      if (cm->seq_params.level[i].major > 3)
+        aom_wb_write_bit(&wb, cm->seq_params.tier[i]);
+      if (cm->seq_params.decoder_model_info_present_flag) {
+        aom_wb_write_bit(&wb,
+                         cm->op_params[i].decoder_model_param_present_flag);
+        if (cm->op_params[i].decoder_model_param_present_flag)
+          write_dec_model_op_parameters(cm, &wb, i);
+      }
+      if (cm->seq_params.display_model_info_present_flag) {
+        aom_wb_write_bit(&wb,
+                         cm->op_params[i].display_model_param_present_flag);
+        if (cm->op_params[i].display_model_param_present_flag) {
+          assert(cm->op_params[i].initial_display_delay <= 10);
+          aom_wb_write_literal(&wb, cm->op_params[i].initial_display_delay - 1,
+                               4);
+        }
+      }
+    }
   }
+  write_sequence_header(cpi, &wb);
 
-  // write the tile length code  (Always 4 bytes for now)
-  aom_wb_write_literal(&wb, 3, 2);
+  write_color_config(cm, &wb);
 
-  if (!use_compressed_header(cm)) {
-    uncompressed_hdr_size = aom_wb_bytes_written(&wb);
-    compressed_hdr_size = 0;
-  } else {
-    // placeholder for the compressed header length
-    struct aom_write_bit_buffer compr_hdr_len_wb = wb;
-    aom_wb_write_literal(&wb, 0, 16);
+  aom_wb_write_bit(&wb, cm->film_grain_params_present);
 
-    uncompressed_hdr_size = aom_wb_bytes_written(&wb);
-    compressed_hdr_size =
-        write_compressed_header(cpi, dst + uncompressed_hdr_size);
-    aom_wb_overwrite_literal(&compr_hdr_len_wb, (int)(compressed_hdr_size), 16);
-  }
+  add_trailing_bits(&wb);
 
-  total_size = uncompressed_hdr_size + compressed_hdr_size;
-  return total_size;
+  size = aom_wb_bytes_written(&wb);
+  return size;
+}
+
+static uint32_t write_frame_header_obu(AV1_COMP *cpi,
+                                       struct aom_write_bit_buffer *saved_wb,
+                                       uint8_t *const dst,
+                                       int append_trailing_bits) {
+  struct aom_write_bit_buffer wb = { dst, 0 };
+  write_uncompressed_header_obu(cpi, saved_wb, &wb);
+  if (append_trailing_bits) add_trailing_bits(&wb);
+  return aom_wb_bytes_written(&wb);
 }
 
 static uint32_t write_tile_group_header(uint8_t *const dst, int startTile,
-                                        int endTile, int tiles_log2) {
+                                        int endTile, int tiles_log2,
+                                        int tile_start_and_end_present_flag) {
   struct aom_write_bit_buffer wb = { dst, 0 };
   uint32_t size = 0;
 
-  aom_wb_write_literal(&wb, startTile, tiles_log2);
-  aom_wb_write_literal(&wb, endTile, tiles_log2);
+  if (!tiles_log2) return size;
+
+  aom_wb_write_bit(&wb, tile_start_and_end_present_flag);
+
+  if (tile_start_and_end_present_flag) {
+    aom_wb_write_literal(&wb, startTile, tiles_log2);
+    aom_wb_write_literal(&wb, endTile, tiles_log2);
+  }
 
   size = aom_wb_bytes_written(&wb);
   return size;
 }
 
+typedef struct {
+  uint8_t *frame_header;
+  size_t obu_header_byte_offset;
+  size_t total_length;
+} FrameHeaderInfo;
+
 static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
-                                       unsigned int *max_tile_size,
-                                       unsigned int *max_tile_col_size,
-                                       uint8_t *const frame_header_obu_location,
-                                       uint32_t frame_header_obu_size,
-                                       int insert_frame_header_obu_flag) {
-  const AV1_COMMON *const cm = &cpi->common;
+                                       struct aom_write_bit_buffer *saved_wb,
+                                       uint8_t obu_extension_header,
+                                       const FrameHeaderInfo *fh_info) {
+  AV1_COMMON *const cm = &cpi->common;
   aom_writer mode_bc;
   int tile_row, tile_col;
   TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
@@ -5601,29 +3609,53 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
   const int tile_cols = cm->tile_cols;
   const int tile_rows = cm->tile_rows;
   unsigned int tile_size = 0;
+  unsigned int max_tile_size = 0;
+  unsigned int max_tile_col_size = 0;
   const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
   // Fixed size tile groups for the moment
   const int num_tg_hdrs = cm->num_tg;
   const int tg_size =
-#if CONFIG_EXT_TILE
       (cm->large_scale_tile)
           ? 1
-          :
-#endif  // CONFIG_EXT_TILE
-          (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
+          : (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
   int tile_count = 0;
   int curr_tg_data_size = 0;
   uint8_t *data = dst;
   int new_tg = 1;
-#if CONFIG_EXT_TILE
   const int have_tiles = tile_cols * tile_rows > 1;
-#endif
+  int first_tg = 1;
 
-  *max_tile_size = 0;
-  *max_tile_col_size = 0;
+  cm->largest_tile_id = 0;
 
-#if CONFIG_EXT_TILE
   if (cm->large_scale_tile) {
+    // For large_scale_tile case, we always have only one tile group, so it can
+    // be written as an OBU_FRAME.
+    const OBU_TYPE obu_type = OBU_FRAME;
+    const uint32_t tg_hdr_size = write_obu_header(obu_type, 0, data);
+    data += tg_hdr_size;
+
+    const uint32_t frame_header_size =
+        write_frame_header_obu(cpi, saved_wb, data, 0);
+    data += frame_header_size;
+    total_size += frame_header_size;
+
+#define EXT_TILE_DEBUG 0
+#if EXT_TILE_DEBUG
+    {
+      char fn[20] = "./fh";
+      fn[4] = cm->current_video_frame / 100 + '0';
+      fn[5] = (cm->current_video_frame % 100) / 10 + '0';
+      fn[6] = (cm->current_video_frame % 10) + '0';
+      fn[7] = '\0';
+      av1_print_uncompressed_frame_header(data - frame_header_size,
+                                          frame_header_size, fn);
+    }
+#endif  // EXT_TILE_DEBUG
+#undef EXT_TILE_DEBUG
+
+    int tile_size_bytes = 0;
+    int tile_col_size_bytes = 0;
+
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
       TileInfo tile_info;
       const int is_last_col = (tile_col == tile_cols - 1);
@@ -5643,7 +3675,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
         TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
         av1_tile_set_row(&tile_info, cm, tile_row);
 
-        buf->data = dst + total_size;
+        buf->data = dst + total_size + tg_hdr_size;
 
         // Is CONFIG_EXT_TILE = 1, every tile in the row has a header,
         // even for the last one, unless no tiling is used at all.
@@ -5651,29 +3683,25 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
         // Initialise tile context from the frame context
         this_tile->tctx = *cm->fc;
         cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
-#if CONFIG_PVQ
-        cpi->td.mb.pvq_q = &this_tile->pvq_q;
-        cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
-#endif  // CONFIG_PVQ
-#if CONFIG_ANS
-        mode_bc.size = 1 << cpi->common.ans_window_size_log2;
-#endif
+        mode_bc.allow_update_cdf = !cm->large_scale_tile;
+        mode_bc.allow_update_cdf =
+            mode_bc.allow_update_cdf && !cm->disable_cdf_update;
         aom_start_encode(&mode_bc, buf->data + data_offset);
         write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
         assert(tok == tok_end);
         aom_stop_encode(&mode_bc);
         tile_size = mode_bc.pos;
-#if CONFIG_PVQ
-        cpi->td.mb.pvq_q = NULL;
-#endif
         buf->size = tile_size;
 
         // Record the maximum tile size we see, so we can compact headers later.
-        *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+        if (tile_size > max_tile_size) {
+          max_tile_size = tile_size;
+          cm->largest_tile_id = tile_cols * tile_row + tile_col;
+        }
 
         if (have_tiles) {
           // tile header: size of this tile, or copy offset
-          uint32_t tile_header = tile_size;
+          uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES;
           const int tile_copy_mode =
               ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256)
                   ? 1
@@ -5683,12 +3711,12 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
           // Very low chances to have copy tiles on the key frames, so don't
           // search on key frames to reduce unnecessary search.
           if (cm->frame_type != KEY_FRAME && tile_copy_mode) {
-            const int idendical_tile_offset =
+            const int identical_tile_offset =
                 find_identical_tile(tile_row, tile_col, tile_buffers);
 
-            if (idendical_tile_offset > 0) {
+            if (identical_tile_offset > 0) {
               tile_size = 0;
-              tile_header = idendical_tile_offset | 0x80;
+              tile_header = identical_tile_offset | 0x80;
               tile_header <<= 24;
             }
           }
@@ -5701,263 +3729,287 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
 
       if (!is_last_col) {
         uint32_t col_size = total_size - col_offset - 4;
-        mem_put_le32(dst + col_offset, col_size);
+        mem_put_le32(dst + col_offset + tg_hdr_size, col_size);
 
-        // If it is not final packing, record the maximum tile column size we
-        // see, otherwise, check if the tile size is out of the range.
-        *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
+        // Record the maximum tile column size we see.
+        max_tile_col_size = AOMMAX(max_tile_col_size, col_size);
       }
     }
-  } else {
-#endif  // CONFIG_EXT_TILE
 
-    for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-      TileInfo tile_info;
-      const int is_last_row = (tile_row == tile_rows - 1);
-      av1_tile_set_row(&tile_info, cm, tile_row);
+    if (have_tiles) {
+      total_size = remux_tiles(cm, data, total_size - frame_header_size,
+                               max_tile_size, max_tile_col_size,
+                               &tile_size_bytes, &tile_col_size_bytes);
+      total_size += frame_header_size;
+    }
+
+    // In EXT_TILE case, only use 1 tile group. Follow the obu syntax, write
+    // current tile group size before tile data(include tile column header).
+    // Tile group size doesn't include the bytes storing tg size.
+    total_size += tg_hdr_size;
+    const uint32_t obu_payload_size = total_size - tg_hdr_size;
+    const size_t length_field_size =
+        obu_memmove(tg_hdr_size, obu_payload_size, dst);
+    if (write_uleb_obu_size(tg_hdr_size, obu_payload_size, dst) !=
+        AOM_CODEC_OK) {
+      assert(0);
+    }
+    total_size += (uint32_t)length_field_size;
+    saved_wb->bit_buffer += length_field_size;
 
-      for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-        const int tile_idx = tile_row * tile_cols + tile_col;
-        TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
-        TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
-        const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
-        const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
-        const int is_last_col = (tile_col == tile_cols - 1);
-        const int is_last_tile = is_last_col && is_last_row;
-        int is_last_tile_in_tg = 0;
-
-        if (new_tg) {
-          if (insert_frame_header_obu_flag && tile_idx) {
-            // insert a copy of frame header OBU (including 4-byte size),
-            // except before the first tile group
-            data = dst + total_size;
-            memmove(data, frame_header_obu_location, frame_header_obu_size);
-            total_size += frame_header_obu_size;
-          }
-          data = dst + total_size;
-          // A new tile group begins at this tile.  Write the obu header and
-          // tile group header
-          curr_tg_data_size = write_obu_header(OBU_TILE_GROUP, 0, data + 4);
-          if (n_log2_tiles)
-            curr_tg_data_size += write_tile_group_header(
-                data + curr_tg_data_size + 4, tile_idx,
-                AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1),
-                n_log2_tiles);
-          total_size += curr_tg_data_size + 4;
-          new_tg = 0;
-          tile_count = 0;
-        }
-        tile_count++;
-        av1_tile_set_col(&tile_info, cm, tile_col);
+    // Now fill in the gaps in the uncompressed header.
+    if (have_tiles) {
+      assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
+      aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2);
 
-        if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) {
-          is_last_tile_in_tg = 1;
-          new_tg = 1;
-        } else {
-          is_last_tile_in_tg = 0;
+      assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+      aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+    }
+    return total_size;
+  }
+
+  uint32_t obu_header_size = 0;
+  uint8_t *tile_data_start = dst + total_size;
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    TileInfo tile_info;
+    av1_tile_set_row(&tile_info, cm, tile_row);
+
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      const int tile_idx = tile_row * tile_cols + tile_col;
+      TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+      TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+      const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
+      const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
+      int is_last_tile_in_tg = 0;
+
+      if (new_tg) {
+        data = dst + total_size;
+
+        // A new tile group begins at this tile.  Write the obu header and
+        // tile group header
+        const OBU_TYPE obu_type =
+            (num_tg_hdrs == 1) ? OBU_FRAME : OBU_TILE_GROUP;
+        curr_tg_data_size =
+            write_obu_header(obu_type, obu_extension_header, data);
+        obu_header_size = curr_tg_data_size;
+
+        if (num_tg_hdrs == 1) {
+          curr_tg_data_size += write_frame_header_obu(
+              cpi, saved_wb, data + curr_tg_data_size, 0);
         }
+        curr_tg_data_size += write_tile_group_header(
+            data + curr_tg_data_size, tile_idx,
+            AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1),
+            n_log2_tiles, cm->num_tg > 1);
+        total_size += curr_tg_data_size;
+        tile_data_start += curr_tg_data_size;
+        new_tg = 0;
+        tile_count = 0;
+      }
+      tile_count++;
+      av1_tile_set_col(&tile_info, cm, tile_col);
 
-#if CONFIG_DEPENDENT_HORZTILES
-        av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col);
-#endif
-        buf->data = dst + total_size;
+      if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) {
+        is_last_tile_in_tg = 1;
+        new_tg = 1;
+      } else {
+        is_last_tile_in_tg = 0;
+      }
 
-        // The last tile of the tile group does not have a header.
-        if (!is_last_tile_in_tg) total_size += 4;
+      buf->data = dst + total_size;
 
-        // Initialise tile context from the frame context
-        this_tile->tctx = *cm->fc;
-        cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
-#if CONFIG_PVQ
-        cpi->td.mb.pvq_q = &this_tile->pvq_q;
-        cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
-#endif  // CONFIG_PVQ
-#if CONFIG_ANS
-        mode_bc.size = 1 << cpi->common.ans_window_size_log2;
-#endif  // CONFIG_ANS
-        aom_start_encode(&mode_bc, dst + total_size);
-        write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
-#if !CONFIG_LV_MAP
-#if !CONFIG_PVQ
-        assert(tok == tok_end);
-#endif  // !CONFIG_PVQ
-#endif  // !CONFIG_LV_MAP
-        aom_stop_encode(&mode_bc);
-        tile_size = mode_bc.pos;
-#if CONFIG_PVQ
-        cpi->td.mb.pvq_q = NULL;
-#endif
-        assert(tile_size > 0);
+      // The last tile of the tile group does not have a header.
+      if (!is_last_tile_in_tg) total_size += 4;
 
-        curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 0 : 4));
-        buf->size = tile_size;
+      // Initialise tile context from the frame context
+      this_tile->tctx = *cm->fc;
+      cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+      mode_bc.allow_update_cdf = 1;
+      mode_bc.allow_update_cdf =
+          mode_bc.allow_update_cdf && !cm->disable_cdf_update;
+      const int num_planes = av1_num_planes(cm);
+      av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes);
+
+      aom_start_encode(&mode_bc, dst + total_size);
+      write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+      aom_stop_encode(&mode_bc);
+      tile_size = mode_bc.pos;
+      assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
 
-        if (!is_last_tile) {
-          *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+      curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 0 : 4));
+      buf->size = tile_size;
+      if (tile_size > max_tile_size) {
+        cm->largest_tile_id = tile_cols * tile_row + tile_col;
+        max_tile_size = tile_size;
+      }
+
+      if (!is_last_tile_in_tg) {
+        // size of this tile
+        mem_put_le32(buf->data, tile_size - AV1_MIN_TILE_SIZE_BYTES);
+      } else {
+        // write current tile group size
+        const uint32_t obu_payload_size = curr_tg_data_size - obu_header_size;
+        const size_t length_field_size =
+            obu_memmove(obu_header_size, obu_payload_size, data);
+        if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+            AOM_CODEC_OK) {
+          assert(0);
         }
-        if (!is_last_tile_in_tg) {
-          // size of this tile
-          mem_put_le32(buf->data, tile_size);
-        } else {
-          // write current tile group size
-          mem_put_le32(data, curr_tg_data_size);
+        curr_tg_data_size += (int)length_field_size;
+        total_size += (uint32_t)length_field_size;
+        tile_data_start += length_field_size;
+        if (num_tg_hdrs == 1) {
+          // if this tg is combined with the frame header then update saved
+          // frame header base offset accroding to length field size
+          saved_wb->bit_buffer += length_field_size;
         }
 
-        total_size += tile_size;
+        if (!first_tg && cm->error_resilient_mode) {
+          // Make room for a duplicate Frame Header OBU.
+          memmove(data + fh_info->total_length, data, curr_tg_data_size);
+
+          // Insert a copy of the Frame Header OBU.
+          memcpy(data, fh_info->frame_header, fh_info->total_length);
+
+          // Force context update tile to be the first tile in error
+          // resiliant mode as the duplicate frame headers will have
+          // context_update_tile_id set to 0
+          cm->largest_tile_id = 0;
+
+          // Rewrite the OBU header to change the OBU type to Redundant Frame
+          // Header.
+          write_obu_header(OBU_REDUNDANT_FRAME_HEADER, obu_extension_header,
+                           &data[fh_info->obu_header_byte_offset]);
+
+          data += fh_info->total_length;
+
+          curr_tg_data_size += (int)(fh_info->total_length);
+          total_size += (uint32_t)(fh_info->total_length);
+        }
+        first_tg = 0;
       }
+
+      total_size += tile_size;
     }
-#if CONFIG_EXT_TILE
   }
-#endif  // CONFIG_EXT_TILE
-  return (uint32_t)total_size;
-}
 
-#endif
+  if (have_tiles) {
+    // Fill in context_update_tile_id indicating the tile to use for the
+    // cdf update. The encoder currently sets it to the largest tile
+    // (but is up to the encoder)
+    aom_wb_overwrite_literal(saved_wb, cm->largest_tile_id,
+                             cm->log2_tile_cols + cm->log2_tile_rows);
+    // If more than one tile group. tile_size_bytes takes the default value 4
+    // and does not need to be set. For a single tile group it is set in the
+    // section below.
+    if (num_tg_hdrs == 1) {
+      int tile_size_bytes = 4, unused;
+      const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst);
+      const uint32_t tile_data_size = total_size - tile_data_offset;
+
+      total_size =
+          remux_tiles(cm, tile_data_start, tile_data_size, max_tile_size,
+                      max_tile_col_size, &tile_size_bytes, &unused);
+      total_size += tile_data_offset;
+      assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
 
-void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
+      aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+
+      // Update the OBU length if remux_tiles() reduced the size.
+      uint64_t payload_size;
+      size_t length_field_size;
+      int res =
+          aom_uleb_decode(dst + obu_header_size, total_size - obu_header_size,
+                          &payload_size, &length_field_size);
+      assert(res == 0);
+      (void)res;
+
+      const uint64_t new_payload_size =
+          total_size - obu_header_size - length_field_size;
+      if (new_payload_size != payload_size) {
+        size_t new_length_field_size;
+        res = aom_uleb_encode(new_payload_size, length_field_size,
+                              dst + obu_header_size, &new_length_field_size);
+        assert(res == 0);
+        if (new_length_field_size < length_field_size) {
+          const size_t src_offset = obu_header_size + length_field_size;
+          const size_t dst_offset = obu_header_size + new_length_field_size;
+          memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size);
+          total_size -= (int)(length_field_size - new_length_field_size);
+        }
+      }
+    }
+  }
+  return total_size;
+}
+
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
   uint8_t *data = dst;
   uint32_t data_size;
-#if CONFIG_EXT_TILE
-  AV1_COMMON *const cm = &cpi->common;
-  uint32_t compressed_hdr_size = 0;
-  uint32_t uncompressed_hdr_size;
-  struct aom_write_bit_buffer saved_wb;
-  struct aom_write_bit_buffer wb = { data, 0 };
-  const int have_tiles = cm->tile_cols * cm->tile_rows > 1;
-  int tile_size_bytes;
-  int tile_col_size_bytes;
-#endif  // CONFIG_EXT_TILE
-  unsigned int max_tile_size;
-  unsigned int max_tile_col_size;
-#if CONFIG_OBU
-#if !CONFIG_EXT_TILE
   AV1_COMMON *const cm = &cpi->common;
-#endif
-  uint32_t obu_size;
-  uint8_t *frame_header_location;
-  uint32_t frame_header_size;
-#endif
+  uint32_t obu_header_size = 0;
+  uint32_t obu_payload_size = 0;
+  FrameHeaderInfo fh_info = { NULL, 0, 0 };
+  const uint8_t obu_extension_header =
+      cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0;
 
 #if CONFIG_BITSTREAM_DEBUG
   bitstream_queue_reset_write();
 #endif
 
-#if CONFIG_OBU
-  // write temporal delimiter obu, preceded by 4-byte size
-  obu_size = write_obu_header(OBU_TD, 0, data + 4);
-  obu_size += write_temporal_delimiter_obu(/*data + 4 + obu_size*/);
-  mem_put_le32(data, obu_size);
-  data += obu_size + 4;
+  // The TD is now written outside the frame encode loop
 
   // write sequence header obu if KEY_FRAME, preceded by 4-byte size
   if (cm->frame_type == KEY_FRAME) {
-    obu_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data + 4);
-    obu_size += write_sequence_header_obu(cpi, data + 4 + obu_size);
-    mem_put_le32(data, obu_size);
-    data += obu_size + 4;
-  }
+    obu_header_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data);
 
-  // write frame header obu, preceded by 4-byte size
-  frame_header_location = data + 4;
-  obu_size = write_obu_header(OBU_FRAME_HEADER, 0, frame_header_location);
-  frame_header_size = write_frame_header_obu(cpi, data + 4 + obu_size);
-  obu_size += frame_header_size;
-  mem_put_le32(data, obu_size);
-  data += obu_size + 4;
+    obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size);
+    const size_t length_field_size =
+        obu_memmove(obu_header_size, obu_payload_size, data);
+    if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
 
-  if (cm->show_existing_frame) {
-    data_size = 0;
-  } else {
-    //  Each tile group obu will be preceded by 4-byte size of the tile group
-    //  obu
-    data_size =
-        write_tiles_in_tg_obus(cpi, data, &max_tile_size, &max_tile_col_size,
-                               frame_header_location - 4, obu_size + 4,
-                               1 /* cm->error_resilient_mode */);
+    data += obu_header_size + obu_payload_size + length_field_size;
   }
 
-#endif
-
-#if CONFIG_EXT_TILE
-  if (cm->large_scale_tile) {
-    // Write the uncompressed header
-    write_uncompressed_header_frame(cpi, &wb);
-
-#if CONFIG_EXT_REFS
-    if (cm->show_existing_frame) {
-      *size = aom_wb_bytes_written(&wb);
-      return;
-    }
-#endif  // CONFIG_EXT_REFS
-
-    // We do not know these in advance. Output placeholder bit.
-    saved_wb = wb;
-    // Write tile size magnitudes
-    if (have_tiles) {
-      // Note that the last item in the uncompressed header is the data
-      // describing tile configuration.
-      // Number of bytes in tile column size - 1
-      aom_wb_write_literal(&wb, 0, 2);
+  const int write_frame_header = (cm->num_tg > 1 || cm->show_existing_frame);
+  struct aom_write_bit_buffer saved_wb;
+  if (write_frame_header) {
+    // Write Frame Header OBU.
+    fh_info.frame_header = data;
+    obu_header_size =
+        write_obu_header(OBU_FRAME_HEADER, obu_extension_header, data);
+    obu_payload_size =
+        write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1);
 
-      // Number of bytes in tile size - 1
-      aom_wb_write_literal(&wb, 0, 2);
+    const size_t length_field_size =
+        obu_memmove(obu_header_size, obu_payload_size, data);
+    if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
     }
 
-    if (!use_compressed_header(cm)) {
-      uncompressed_hdr_size = (uint32_t)aom_wb_bytes_written(&wb);
-      aom_clear_system_state();
-      compressed_hdr_size = 0;
-    } else {
-      // Size of compressed header
-      aom_wb_write_literal(&wb, 0, 16);
-      uncompressed_hdr_size = (uint32_t)aom_wb_bytes_written(&wb);
-      aom_clear_system_state();
-      // Write the compressed header
-      compressed_hdr_size =
-          write_compressed_header(cpi, data + uncompressed_hdr_size);
-    }
-    data += uncompressed_hdr_size + compressed_hdr_size;
+    fh_info.obu_header_byte_offset = 0;
+    fh_info.total_length =
+        obu_header_size + obu_payload_size + length_field_size;
+    data += fh_info.total_length;
 
-    // Write the encoded tile data
-    data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size);
-  } else {
-#endif  // CONFIG_EXT_TILE
-#if !CONFIG_OBU
-    data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size);
-#endif
-#if CONFIG_EXT_TILE
+    // Since length_field_size is determined adaptively after frame header
+    // encoding, saved_wb must be adjusted accordingly.
+    saved_wb.bit_buffer += length_field_size;
   }
-#endif  // CONFIG_EXT_TILE
-#if CONFIG_EXT_TILE
-  if (cm->large_scale_tile) {
-    if (have_tiles) {
-      data_size =
-          remux_tiles(cm, data, data_size, max_tile_size, max_tile_col_size,
-                      &tile_size_bytes, &tile_col_size_bytes);
-    }
-
-    data += data_size;
 
-    // Now fill in the gaps in the uncompressed header.
-    if (have_tiles) {
-      assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
-      aom_wb_write_literal(&saved_wb, tile_col_size_bytes - 1, 2);
-
-      assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
-      aom_wb_write_literal(&saved_wb, tile_size_bytes - 1, 2);
-    }
-    // TODO(jbb): Figure out what to do if compressed_hdr_size > 16 bits.
-    assert(compressed_hdr_size <= 0xffff);
-    aom_wb_write_literal(&saved_wb, compressed_hdr_size, 16);
+  if (cm->show_existing_frame) {
+    data_size = 0;
   } else {
-#endif  // CONFIG_EXT_TILE
-    data += data_size;
-#if CONFIG_EXT_TILE
-  }
-#endif  // CONFIG_EXT_TILE
-#if CONFIG_ANS && ANS_REVERSE
-  // Avoid aliasing the superframe index
-  *data++ = 0;
-#endif
+    //  Each tile group obu will be preceded by 4-byte size of the tile group
+    //  obu
+    data_size = write_tiles_in_tg_obus(cpi, data, &saved_wb,
+                                       obu_extension_header, &fh_info);
+  }
+  data += data_size;
   *size = data - dst;
+  return AOM_CODEC_OK;
 }
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
index 76eb85116..2047b6833 100644
--- a/third_party/aom/av1/encoder/bitstream.h
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -20,34 +20,24 @@ extern "C" {
 
 struct aom_write_bit_buffer;
 
-#if CONFIG_REFERENCE_BUFFER
-void write_sequence_header(AV1_COMMON *const cm,
-                           struct aom_write_bit_buffer *wb);
-#endif
+void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb);
+
+uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
+                          uint8_t *const dst);
 
-void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
+int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size,
+                        uint8_t *dest);
 
-void av1_encode_token_init(void);
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
 
 static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) {
-#if CONFIG_EXT_REFS
   // Do not swap gf and arf indices for internal overlay frames
   return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref &&
          !cpi->rc.is_src_frame_ext_arf;
-#else
-  return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
-         cpi->rc.is_src_frame_alt_ref;
-#endif  // CONFIG_EXT_REFS
 }
 
 void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
-#if CONFIG_SUPERTX
-                       const int supertx_enabled,
-#endif
-#if CONFIG_TXK_SEL
-                       int blk_row, int blk_col, int block, int plane,
-                       TX_SIZE tx_size,
-#endif
+                       int blk_row, int blk_col, int plane, TX_SIZE tx_size,
                        aom_writer *w);
 
 #ifdef __cplusplus
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
index 8b6627825..13fc11c31 100644
--- a/third_party/aom/av1/encoder/block.h
+++ b/third_party/aom/av1/encoder/block.h
@@ -14,9 +14,6 @@
 
 #include "av1/common/entropymv.h"
 #include "av1/common/entropy.h"
-#if CONFIG_PVQ
-#include "av1/encoder/encint.h"
-#endif
 #include "av1/common/mvref_common.h"
 #include "av1/encoder/hash.h"
 #if CONFIG_DIST_8X8
@@ -27,12 +24,6 @@
 extern "C" {
 #endif
 
-#if CONFIG_PVQ
-// Maximum possible # of tx blocks in luma plane, which is currently 256,
-// since there can be 16x16 of 4x4 tx.
-#define MAX_PVQ_BLOCKS_IN_SB (MAX_SB_SQUARE >> 2 * OD_LOG_BSIZE0)
-#endif
-
 typedef struct {
   unsigned int sse;
   int sum;
@@ -41,53 +32,39 @@ typedef struct {
 
 typedef struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]);
-#if CONFIG_PVQ
-  DECLARE_ALIGNED(16, int16_t, src_int16[MAX_SB_SQUARE]);
-#endif
   tran_low_t *qcoeff;
   tran_low_t *coeff;
   uint16_t *eobs;
-#if CONFIG_LV_MAP
   uint8_t *txb_entropy_ctx;
-#endif
   struct buf_2d src;
 
   // Quantizer setings
-  const int16_t *quant_fp;
-  const int16_t *round_fp;
-  const int16_t *quant;
-  const int16_t *quant_shift;
-  const int16_t *zbin;
-  const int16_t *round;
-#if CONFIG_NEW_QUANT
-  const cuml_bins_type_nuq *cuml_bins_nuq[QUANT_PROFILES];
-#endif  // CONFIG_NEW_QUANT
+  // These are used/accessed only in the quantization process
+  // RDO does not / must not depend on any of these values
+  // All values below share the coefficient scale/shift used in TX
+  const int16_t *quant_fp_QTX;
+  const int16_t *round_fp_QTX;
+  const int16_t *quant_QTX;
+  const int16_t *quant_shift_QTX;
+  const int16_t *zbin_QTX;
+  const int16_t *round_QTX;
+  const int16_t *dequant_QTX;
 } MACROBLOCK_PLANE;
 
-typedef int av1_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
-                          [TAIL_TOKENS];
-
-#if CONFIG_LV_MAP
 typedef struct {
   int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
-  int nz_map_cost[SIG_COEF_CONTEXTS][2];
-  int eob_cost[EOB_COEF_CONTEXTS][2];
+  int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
+  int base_cost[SIG_COEF_CONTEXTS][4];
+  int eob_extra_cost[EOB_COEF_CONTEXTS][2];
   int dc_sign_cost[DC_SIGN_CONTEXTS][2];
-  int base_cost[NUM_BASE_LEVELS][COEFF_BASE_CONTEXTS][2];
-#if BR_NODE
   int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1];
-  int br_cost[BASE_RANGE_SETS][LEVEL_CONTEXTS][2];
-#else   // BR_NODE
-  int lps_cost[LEVEL_CONTEXTS][2];
-#endif  // BR_NODE
-#if CONFIG_CTX1D
-  int eob_mode_cost[TX_CLASSES][2];
-  int empty_line_cost[TX_CLASSES][EMPTY_LINE_CONTEXTS][2];
-  int hv_eob_cost[TX_CLASSES][HV_EOB_CONTEXTS][2];
-#endif
 } LV_MAP_COEFF_COST;
 
 typedef struct {
+  int eob_cost[2][11];
+} LV_MAP_EOB_COST;
+
+typedef struct {
   tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE];
   uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
   uint8_t txb_skip_ctx[MAX_MB_PLANE]
@@ -95,20 +72,17 @@ typedef struct {
   int dc_sign_ctx[MAX_MB_PLANE]
                  [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
 } CB_COEFF_BUFFER;
-#endif
 
 typedef struct {
-  int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   int16_t mode_context[MODE_CTX_REF_FRAMES];
-#if CONFIG_LV_MAP
   // TODO(angiebird): Reduce the buffer size according to sb_type
   tran_low_t *tcoeff[MAX_MB_PLANE];
   uint16_t *eobs[MAX_MB_PLANE];
   uint8_t *txb_skip_ctx[MAX_MB_PLANE];
   int *dc_sign_ctx[MAX_MB_PLANE];
-#endif
   uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
   CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  int_mv global_mvs[REF_FRAMES];
   int16_t compound_mode_context[MODE_CTX_REF_FRAMES];
 } MB_MODE_INFO_EXT;
 
@@ -120,39 +94,119 @@ typedef struct {
 } MvLimits;
 
 typedef struct {
-  uint8_t best_palette_color_map[MAX_SB_SQUARE];
-  float kmeans_data_buf[2 * MAX_SB_SQUARE];
+  uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
+  int kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
 } PALETTE_BUFFER;
 
 typedef struct {
-  TX_TYPE tx_type;
   TX_SIZE tx_size;
-#if CONFIG_VAR_TX
-  TX_SIZE min_tx_size;
-  TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
-  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
-#endif  // CONFIG_VAR_TX
-#if CONFIG_TXK_SEL
-  TX_TYPE txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
-#endif  // CONFIG_TXK_SEL
+  TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
   RD_STATS rd_stats;
   uint32_t hash_value;
-} TX_RD_INFO;
+} MB_RD_INFO;
 
 #define RD_RECORD_BUFFER_LEN 8
 typedef struct {
-  TX_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN];  // Circular buffer.
+  MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN];  // Circular buffer.
   int index_start;
   int num;
-  CRC_CALCULATOR crc_calculator;  // Hash function.
-} TX_RD_RECORD;
+  CRC32C crc_calculator;  // Hash function.
+} MB_RD_RECORD;
+
+typedef struct {
+  int64_t dist;
+  int64_t sse;
+  int rate;
+  uint16_t eob;
+  TX_TYPE tx_type;
+  uint16_t entropy_context;
+  uint8_t txb_entropy_ctx;
+  uint8_t valid;
+  uint8_t fast;  // This is not being used now.
+} TXB_RD_INFO;
+
+#define TX_SIZE_RD_RECORD_BUFFER_LEN 256
+typedef struct {
+  uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN];
+  TXB_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN];
+  int index_start;
+  int num;
+} TXB_RD_RECORD;
+
+typedef struct tx_size_rd_info_node {
+  TXB_RD_INFO *rd_info_array;  // Points to array of size TX_TYPES.
+  struct tx_size_rd_info_node *children[4];
+} TXB_RD_INFO_NODE;
+
+// Region size for mode decision sampling in the first pass of partition
+// search(two_pass_partition_search speed feature), in units of mi size(4).
+// Used by the mode_pruning_based_on_two_pass_partition_search speed feature.
+#define FIRST_PARTITION_PASS_SAMPLE_REGION 8
+#define FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2 3
+#define FIRST_PARTITION_PASS_STATS_TABLES                     \
+  (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2) * \
+      (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2)
+#define FIRST_PARTITION_PASS_STATS_STRIDE \
+  (MAX_MIB_SIZE_LOG2 - FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2)
+
+static INLINE int av1_first_partition_pass_stats_index(int mi_row, int mi_col) {
+  const int row =
+      (mi_row & MAX_MIB_MASK) >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2;
+  const int col =
+      (mi_col & MAX_MIB_MASK) >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2;
+  return (row << FIRST_PARTITION_PASS_STATS_STRIDE) + col;
+}
+
+typedef struct {
+  uint8_t ref0_counts[REF_FRAMES];  // Counters for ref_frame[0].
+  uint8_t ref1_counts[REF_FRAMES];  // Counters for ref_frame[1].
+  int sample_counts;                // Number of samples collected.
+} FIRST_PARTITION_PASS_STATS;
+
+#define MAX_INTERP_FILTER_STATS 64
+typedef struct {
+  InterpFilters filters;
+  int_mv mv[2];
+  int8_t ref_frames[2];
+} INTERPOLATION_FILTER_STATS;
 
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
   struct macroblock_plane plane[MAX_MB_PLANE];
 
-  // Save the transform RD search info.
-  TX_RD_RECORD tx_rd_record;
+  // Determine if one would go with reduced complexity transform block
+  // search model to select prediction modes, or full complexity model
+  // to select transform kernel.
+  int rd_model;
+
+  // Indicate if the encoder is running in the first pass partition search.
+  // In that case, apply certain speed features therein to reduce the overhead
+  // cost in the first pass search.
+  int cb_partition_scan;
+
+  FIRST_PARTITION_PASS_STATS
+  first_partition_pass_stats[FIRST_PARTITION_PASS_STATS_TABLES];
+
+  // [comp_idx][saved stat_idx]
+  INTERPOLATION_FILTER_STATS interp_filter_stats[2][MAX_INTERP_FILTER_STATS];
+  int interp_filter_stats_idx[2];
+
+  // Activate constrained coding block partition search range.
+  int use_cb_search_range;
+
+  // Inter macroblock RD search info.
+  MB_RD_RECORD mb_rd_record;
+
+  // Inter transform block RD search info. for square TX sizes.
+  TXB_RD_RECORD txb_rd_record_8X8[(MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)];
+  TXB_RD_RECORD txb_rd_record_16X16[(MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)];
+  TXB_RD_RECORD txb_rd_record_32X32[(MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)];
+  TXB_RD_RECORD txb_rd_record_64X64[(MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)];
+
+  // Intra transform block RD search info. for square TX sizes.
+  TXB_RD_RECORD txb_rd_record_intra;
 
   MACROBLOCKD e_mbd;
   MB_MODE_INFO_EXT *mbmi_ext;
@@ -173,34 +227,29 @@ struct macroblock {
   int *m_search_count_ptr;
   int *ex_search_count_ptr;
 
-#if CONFIG_VAR_TX
   unsigned int txb_split_count;
-#endif
 
   // These are set to their default values at the beginning, and then adjusted
   // further in the encoding process.
   BLOCK_SIZE min_partition_size;
   BLOCK_SIZE max_partition_size;
 
-  int mv_best_ref_index[TOTAL_REFS_PER_FRAME];
-  unsigned int max_mv_context[TOTAL_REFS_PER_FRAME];
+  unsigned int max_mv_context[REF_FRAMES];
   unsigned int source_variance;
-  unsigned int pred_sse[TOTAL_REFS_PER_FRAME];
-  int pred_mv_sad[TOTAL_REFS_PER_FRAME];
+  unsigned int pred_sse[REF_FRAMES];
+  int pred_mv_sad[REF_FRAMES];
 
   int *nmvjointcost;
-  int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS];
-  int *nmvcost[NMV_CONTEXTS][2];
-  int *nmvcost_hp[NMV_CONTEXTS][2];
-  int **mv_cost_stack[NMV_CONTEXTS];
+  int nmv_vec_cost[MV_JOINTS];
+  int *nmvcost[2];
+  int *nmvcost_hp[2];
+  int **mv_cost_stack;
   int **mvcost;
 
-#if CONFIG_MOTION_VAR
   int32_t *wsrc_buf;
   int32_t *mask_buf;
   uint8_t *above_pred_buf;
   uint8_t *left_pred_buf;
-#endif  // CONFIG_MOTION_VAR
 
   PALETTE_BUFFER *palette_buffer;
 
@@ -208,108 +257,80 @@ struct macroblock {
   // from extending outside the UMV borders
   MvLimits mv_limits;
 
-#if CONFIG_VAR_TX
-  uint8_t blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
-  uint8_t blk_skip_drl[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
-#endif
+  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t blk_skip_drl[MAX_MIB_SIZE * MAX_MIB_SIZE];
 
   int skip;
-
-#if CONFIG_CB4X4
   int skip_chroma_rd;
-#endif
+  int skip_cost[SKIP_CONTEXTS][2];
+
+  int skip_mode;  // 0: off; 1: on
+  int skip_mode_cost[SKIP_CONTEXTS][2];
+
+  int compound_idx;
 
-#if CONFIG_LV_MAP
   LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
+  LV_MAP_EOB_COST eob_costs[7][2];
   uint16_t cb_offset;
-#endif
-
-  av1_coeff_cost token_head_costs[TX_SIZES];
-  av1_coeff_cost token_tail_costs[TX_SIZES];
 
   // mode costs
+  int intra_inter_cost[INTRA_INTER_CONTEXTS][2];
+
   int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
   int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
-  int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2];
+  int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2];
   int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
   int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
 
+  int comp_inter_cost[COMP_INTER_CONTEXTS][2];
+  int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2];
+  int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS]
+                        [CDF_SIZE(COMP_REFERENCE_TYPES)];
+  int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
+                       [CDF_SIZE(2)];
+  // Cost for signaling ref_frame[0] (LAST_FRAME, LAST2_FRAME, LAST3_FRAME or
+  // GOLDEN_FRAME) in bidir-comp mode.
+  int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2];
+  // Cost for signaling ref_frame[1] (ALTREF_FRAME, ALTREF2_FRAME, or
+  // BWDREF_FRAME) in bidir-comp mode.
+  int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2];
   int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
-  int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES];
-#if CONFIG_COMPOUND_SINGLEREF
-  int inter_singleref_comp_mode_cost[INTER_MODE_CONTEXTS]
-                                    [INTER_SINGLEREF_COMP_MODES];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_INTERINTRA
+  int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1];
+  int wedge_idx_cost[BLOCK_SIZES_ALL][16];
+  int interintra_cost[BLOCK_SIZE_GROUPS][2];
+  int wedge_interintra_cost[BLOCK_SIZES_ALL][2];
   int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
-#endif  // CONFIG_INTERINTRA
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
   int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES];
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
   int motion_mode_cost1[BLOCK_SIZES_ALL][2];
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  int motion_mode_cost2[BLOCK_SIZES_ALL][OBMC_FAMILY_MODES];
-#endif
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT
-  int ncobmc_mode_cost[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES];
-#endif  // CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  int intra_uv_mode_cost[INTRA_MODES][UV_INTRA_MODES];
+  int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+  int filter_intra_cost[BLOCK_SIZES_ALL][2];
+  int filter_intra_mode_cost[FILTER_INTRA_MODES];
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-#if CONFIG_EXT_PARTITION_TYPES
-  int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX]
-                    [EXT_PARTITION_TYPES];
-#else
-  int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX]
-                    [PARTITION_TYPES];
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_MRC_TX
-  int mrc_mask_inter_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
-                         [PALETTE_COLORS];
-  int mrc_mask_intra_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
-                         [PALETTE_COLORS];
-#endif  // CONFIG_MRC_TX
-  int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
-  int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
+  int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+  int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+  int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
   int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
                           [PALETTE_COLORS];
   int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
                            [PALETTE_COLORS];
-#if CONFIG_CFL
+  int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+  int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2];
   // The rate associated with each alpha codeword
   int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
-#endif  // CONFIG_CFL
   int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
-#if CONFIG_EXT_TX
-#if CONFIG_LGT_FROM_PRED
-  int intra_lgt_cost[LGT_SIZES][INTRA_MODES][2];
-  int inter_lgt_cost[LGT_SIZES][2];
-#endif
+  int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2];
   int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
   int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
                          [TX_TYPES];
-#else
-  int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
-  int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
-#endif  // CONFIG_EXT_TX
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-  int intra_filter_cost[INTRA_FILTERS + 1][INTRA_FILTERS];
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_LOOP_RESTORATION
+  int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
   int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
-#endif  // CONFIG_LOOP_RESTORATION
-#if CONFIG_INTRABC
+  int wiener_restore_cost[2];
+  int sgrproj_restore_cost[2];
   int intrabc_cost[2];
-#endif  // CONFIG_INTRABC
-
-  int optimize;
 
   // Used to store sub partition's choices.
-  MV pred_mv[TOTAL_REFS_PER_FRAME];
+  MV pred_mv[REF_FRAMES];
 
   // Store the best motion vector during motion search
   int_mv best_mv;
@@ -320,38 +341,65 @@ struct macroblock {
   int use_default_intra_tx_type;
   // use default transform and skip transform type search for inter modes
   int use_default_inter_tx_type;
-#if CONFIG_PVQ
-  int rate;
-  // 1 if neither AC nor DC is coded. Only used during RDO.
-  int pvq_skip[MAX_MB_PLANE];
-  PVQ_QUEUE *pvq_q;
-
-  // Storage for PVQ tx block encodings in a superblock.
-  // There can be max 16x16 of 4x4 blocks (and YUV) encode by PVQ
-  // 256 is the max # of 4x4 blocks in a SB (64x64), which comes from:
-  // 1) Since PVQ is applied to each trasnform-ed block
-  // 2) 4x4 is the smallest tx size in AV1
-  // 3) AV1 allows using smaller tx size than block (i.e. partition) size
-  // TODO(yushin) : The memory usage could be improved a lot, since this has
-  // storage for 10 bands and 128 coefficients for every 4x4 block,
-  PVQ_INFO pvq[MAX_PVQ_BLOCKS_IN_SB][MAX_MB_PLANE];
-  daala_enc_ctx daala_enc;
-  int pvq_speed;
-  int pvq_coded;  // Indicates whether pvq_info needs be stored to tokenize
-#endif
 #if CONFIG_DIST_8X8
   int using_dist_8x8;
   aom_tune_metric tune_metric;
-#if CONFIG_CB4X4
-#if CONFIG_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, decoded_8x8[8 * 8]);
-#else
-  DECLARE_ALIGNED(16, uint8_t, decoded_8x8[8 * 8]);
-#endif
-#endif  // CONFIG_CB4X4
+  DECLARE_ALIGNED(16, int16_t, pred_luma[MAX_SB_SQUARE]);
 #endif  // CONFIG_DIST_8X8
+  int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
+  int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
+  // Bit flags for pruning tx type search, tx split, etc.
+  int tx_search_prune[EXT_TX_SET_TYPES];
+  int must_find_valid_partition;
+  int tx_split_prune_flag;  // Flag to skip tx split RD search.
 };
 
+static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
+  static const char LUT[BLOCK_SIZES_ALL] = {
+    0,  // BLOCK_4X4
+    1,  // BLOCK_4X8
+    1,  // BLOCK_8X4
+    0,  // BLOCK_8X8
+    1,  // BLOCK_8X16
+    1,  // BLOCK_16X8
+    0,  // BLOCK_16X16
+    1,  // BLOCK_16X32
+    1,  // BLOCK_32X16
+    0,  // BLOCK_32X32
+    1,  // BLOCK_32X64
+    1,  // BLOCK_64X32
+    0,  // BLOCK_64X64
+    0,  // BLOCK_64X128
+    0,  // BLOCK_128X64
+    0,  // BLOCK_128X128
+    1,  // BLOCK_4X16
+    1,  // BLOCK_16X4
+    1,  // BLOCK_8X32
+    1,  // BLOCK_32X8
+    1,  // BLOCK_16X64
+    1,  // BLOCK_64X16
+  };
+
+  return LUT[bsize];
+}
+
+static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd,
+                                     const MB_MODE_INFO *mbmi) {
+  return is_rect_tx_allowed_bsize(mbmi->sb_type) &&
+         !xd->lossless[mbmi->segment_id];
+}
+
+static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) {
+  TX_SIZE ctx_size = max_txsize_rect_lookup[bsize];
+  int depth = 0;
+  while (tx_size != ctx_size) {
+    depth++;
+    ctx_size = sub_tx_size_map[ctx_size];
+    assert(depth <= MAX_TX_DEPTH);
+  }
+  return depth;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c
index 113ceb29d..66dedd9ed 100644
--- a/third_party/aom/av1/encoder/blockiness.c
+++ b/third_party/aom/av1/encoder/blockiness.c
@@ -9,9 +9,10 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "av1/common/common.h"
 #include "av1/common/filter.h"
 #include "aom/aom_integer.h"
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
index 4bbf0e5fb..d6e556b93 100644
--- a/third_party/aom/av1/encoder/context_tree.c
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -13,32 +13,18 @@
 #include "av1/encoder/encoder.h"
 
 static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
-#if CONFIG_CB4X4
-  BLOCK_4X4,
-#endif
-  BLOCK_8X8,     BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
-#if CONFIG_EXT_PARTITION
-  BLOCK_128X128,
-#endif  // CONFIG_EXT_PARTITION
+  BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
 };
 
 static void alloc_mode_context(AV1_COMMON *cm, int num_pix,
-#if CONFIG_EXT_PARTITION_TYPES
-                               PARTITION_TYPE partition,
-#endif
                                PICK_MODE_CONTEXT *ctx) {
+  const int num_planes = av1_num_planes(cm);
   int i;
   const int num_blk = num_pix / 16;
   ctx->num_4x4_blk = num_blk;
 
-#if CONFIG_EXT_PARTITION_TYPES
-  ctx->partition = partition;
-#endif
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-#if CONFIG_VAR_TX
-    CHECK_MEM_ERROR(cm, ctx->blk_skip[i], aom_calloc(num_blk, sizeof(uint8_t)));
-#endif
+  CHECK_MEM_ERROR(cm, ctx->blk_skip, aom_calloc(num_blk, sizeof(uint8_t)));
+  for (i = 0; i < num_planes; ++i) {
     CHECK_MEM_ERROR(cm, ctx->coeff[i],
                     aom_memalign(32, num_pix * sizeof(*ctx->coeff[i])));
     CHECK_MEM_ERROR(cm, ctx->qcoeff[i],
@@ -47,148 +33,94 @@ static void alloc_mode_context(AV1_COMMON *cm, int num_pix,
                     aom_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i])));
     CHECK_MEM_ERROR(cm, ctx->eobs[i],
                     aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
-#if CONFIG_LV_MAP
     CHECK_MEM_ERROR(
         cm, ctx->txb_entropy_ctx[i],
         aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
-#endif
-
-#if CONFIG_PVQ
-    CHECK_MEM_ERROR(cm, ctx->pvq_ref_coeff[i],
-                    aom_memalign(32, num_pix * sizeof(*ctx->pvq_ref_coeff[i])));
-#endif
   }
 
-  for (i = 0; i < 2; ++i) {
-    CHECK_MEM_ERROR(
-        cm, ctx->color_index_map[i],
-        aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+  if (num_pix <= MAX_PALETTE_SQUARE) {
+    for (i = 0; i < 2; ++i) {
+      CHECK_MEM_ERROR(
+          cm, ctx->color_index_map[i],
+          aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+    }
   }
-#if CONFIG_MRC_TX
-  CHECK_MEM_ERROR(cm, ctx->mrc_mask,
-                  aom_memalign(32, num_pix * sizeof(*ctx->mrc_mask)));
-#endif  // CONFIG_MRC_TX
 }
 
-static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
+static void free_mode_context(PICK_MODE_CONTEXT *ctx, const int num_planes) {
   int i;
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-#if CONFIG_VAR_TX
-    aom_free(ctx->blk_skip[i]);
-    ctx->blk_skip[i] = 0;
-#endif
+  aom_free(ctx->blk_skip);
+  ctx->blk_skip = 0;
+  for (i = 0; i < num_planes; ++i) {
     aom_free(ctx->coeff[i]);
     ctx->coeff[i] = 0;
     aom_free(ctx->qcoeff[i]);
     ctx->qcoeff[i] = 0;
     aom_free(ctx->dqcoeff[i]);
     ctx->dqcoeff[i] = 0;
-#if CONFIG_PVQ
-    aom_free(ctx->pvq_ref_coeff[i]);
-    ctx->pvq_ref_coeff[i] = 0;
-#endif
     aom_free(ctx->eobs[i]);
     ctx->eobs[i] = 0;
-#if CONFIG_LV_MAP
     aom_free(ctx->txb_entropy_ctx[i]);
     ctx->txb_entropy_ctx[i] = 0;
-#endif
   }
 
   for (i = 0; i < 2; ++i) {
     aom_free(ctx->color_index_map[i]);
     ctx->color_index_map[i] = 0;
   }
-#if CONFIG_MRC_TX
-  aom_free(ctx->mrc_mask);
-  ctx->mrc_mask = 0;
-#endif  // CONFIG_MRC_TX
 }
 
-static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, int num_pix) {
-#if CONFIG_EXT_PARTITION_TYPES
-  alloc_mode_context(cm, num_pix, PARTITION_NONE, &tree->none);
-  alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ, &tree->horizontal[0]);
-  alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->vertical[0]);
-  alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->horizontal[1]);
-  alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->vertical[1]);
-
-  alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_A, &tree->horizontala[0]);
-  alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_A, &tree->horizontala[1]);
-  alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ_A, &tree->horizontala[2]);
-  alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ_B, &tree->horizontalb[0]);
-  alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_B, &tree->horizontalb[1]);
-  alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_B, &tree->horizontalb[2]);
-  alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_A, &tree->verticala[0]);
-  alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_A, &tree->verticala[1]);
-  alloc_mode_context(cm, num_pix / 2, PARTITION_VERT_A, &tree->verticala[2]);
-  alloc_mode_context(cm, num_pix / 2, PARTITION_VERT_B, &tree->verticalb[0]);
-  alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_B, &tree->verticalb[1]);
-  alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_B, &tree->verticalb[2]);
-  for (int i = 0; i < 4; ++i) {
-    alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_4,
-                       &tree->horizontal4[i]);
-    alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_4, &tree->vertical4[i]);
-  }
-#if CONFIG_SUPERTX
-  alloc_mode_context(cm, num_pix, PARTITION_HORZ, &tree->horizontal_supertx);
-  alloc_mode_context(cm, num_pix, PARTITION_VERT, &tree->vertical_supertx);
-  alloc_mode_context(cm, num_pix, PARTITION_SPLIT, &tree->split_supertx);
-  alloc_mode_context(cm, num_pix, PARTITION_HORZ_A, &tree->horizontala_supertx);
-  alloc_mode_context(cm, num_pix, PARTITION_HORZ_B, &tree->horizontalb_supertx);
-  alloc_mode_context(cm, num_pix, PARTITION_VERT_A, &tree->verticala_supertx);
-  alloc_mode_context(cm, num_pix, PARTITION_VERT_B, &tree->verticalb_supertx);
-#endif  // CONFIG_SUPERTX
-#else
+static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, int num_pix,
+                                int is_leaf) {
   alloc_mode_context(cm, num_pix, &tree->none);
+
+  if (is_leaf) return;
+
   alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0]);
   alloc_mode_context(cm, num_pix / 2, &tree->vertical[0]);
-#if CONFIG_SUPERTX
-  alloc_mode_context(cm, num_pix, &tree->horizontal_supertx);
-  alloc_mode_context(cm, num_pix, &tree->vertical_supertx);
-  alloc_mode_context(cm, num_pix, &tree->split_supertx);
-#endif
 
-  if (num_pix > 16) {
-    alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1]);
-    alloc_mode_context(cm, num_pix / 2, &tree->vertical[1]);
-  } else {
-    memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1]));
-    memset(&tree->vertical[1], 0, sizeof(tree->vertical[1]));
+  alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1]);
+  alloc_mode_context(cm, num_pix / 2, &tree->vertical[1]);
+
+  alloc_mode_context(cm, num_pix / 4, &tree->horizontala[0]);
+  alloc_mode_context(cm, num_pix / 4, &tree->horizontala[1]);
+  alloc_mode_context(cm, num_pix / 2, &tree->horizontala[2]);
+
+  alloc_mode_context(cm, num_pix / 2, &tree->horizontalb[0]);
+  alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[1]);
+  alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[2]);
+
+  alloc_mode_context(cm, num_pix / 4, &tree->verticala[0]);
+  alloc_mode_context(cm, num_pix / 4, &tree->verticala[1]);
+  alloc_mode_context(cm, num_pix / 2, &tree->verticala[2]);
+
+  alloc_mode_context(cm, num_pix / 2, &tree->verticalb[0]);
+  alloc_mode_context(cm, num_pix / 4, &tree->verticalb[1]);
+  alloc_mode_context(cm, num_pix / 4, &tree->verticalb[2]);
+
+  for (int i = 0; i < 4; ++i) {
+    alloc_mode_context(cm, num_pix / 4, &tree->horizontal4[i]);
+    alloc_mode_context(cm, num_pix / 4, &tree->vertical4[i]);
   }
-#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
-static void free_tree_contexts(PC_TREE *tree) {
-#if CONFIG_EXT_PARTITION_TYPES
+static void free_tree_contexts(PC_TREE *tree, const int num_planes) {
   int i;
   for (i = 0; i < 3; i++) {
-    free_mode_context(&tree->horizontala[i]);
-    free_mode_context(&tree->horizontalb[i]);
-    free_mode_context(&tree->verticala[i]);
-    free_mode_context(&tree->verticalb[i]);
+    free_mode_context(&tree->horizontala[i], num_planes);
+    free_mode_context(&tree->horizontalb[i], num_planes);
+    free_mode_context(&tree->verticala[i], num_planes);
+    free_mode_context(&tree->verticalb[i], num_planes);
   }
   for (i = 0; i < 4; ++i) {
-    free_mode_context(&tree->horizontal4[i]);
-    free_mode_context(&tree->vertical4[i]);
+    free_mode_context(&tree->horizontal4[i], num_planes);
+    free_mode_context(&tree->vertical4[i], num_planes);
   }
-#endif  // CONFIG_EXT_PARTITION_TYPES
-  free_mode_context(&tree->none);
-  free_mode_context(&tree->horizontal[0]);
-  free_mode_context(&tree->horizontal[1]);
-  free_mode_context(&tree->vertical[0]);
-  free_mode_context(&tree->vertical[1]);
-#if CONFIG_SUPERTX
-  free_mode_context(&tree->horizontal_supertx);
-  free_mode_context(&tree->vertical_supertx);
-  free_mode_context(&tree->split_supertx);
-#if CONFIG_EXT_PARTITION_TYPES
-  free_mode_context(&tree->horizontala_supertx);
-  free_mode_context(&tree->horizontalb_supertx);
-  free_mode_context(&tree->verticala_supertx);
-  free_mode_context(&tree->verticalb_supertx);
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#endif  // CONFIG_SUPERTX
+  free_mode_context(&tree->none, num_planes);
+  free_mode_context(&tree->horizontal[0], num_planes);
+  free_mode_context(&tree->horizontal[1], num_planes);
+  free_mode_context(&tree->vertical[0], num_planes);
+  free_mode_context(&tree->vertical[1], num_planes);
 }
 
 // This function sets up a tree of contexts such that at each square
@@ -197,65 +129,25 @@ static void free_tree_contexts(PC_TREE *tree) {
 // represents the state of our search.
 void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
   int i, j;
-#if CONFIG_CB4X4
-#if CONFIG_EXT_PARTITION
   const int tree_nodes_inc = 1024;
-#else
-  const int tree_nodes_inc = 256;
-#endif  // CONFIG_EXT_PARTITION
   const int leaf_factor = 4;
-#else
-  const int tree_nodes_inc = 0;
-  const int leaf_factor = 1;
-#endif
-#if CONFIG_EXT_PARTITION
   const int leaf_nodes = 256 * leaf_factor;
   const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
-#else
-  const int leaf_nodes = 64 * leaf_factor;
-  const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1;
-#endif  // CONFIG_EXT_PARTITION
   int pc_tree_index = 0;
   PC_TREE *this_pc;
   int square_index = 1;
   int nodes;
 
-#if !CONFIG_CB4X4
-  aom_free(td->leaf_tree);
-  CHECK_MEM_ERROR(cm, td->leaf_tree,
-                  aom_calloc(leaf_nodes, sizeof(*td->leaf_tree)));
-  PICK_MODE_CONTEXT *this_leaf = &td->leaf_tree[0];
-#endif
   aom_free(td->pc_tree);
   CHECK_MEM_ERROR(cm, td->pc_tree,
                   aom_calloc(tree_nodes, sizeof(*td->pc_tree)));
   this_pc = &td->pc_tree[0];
 
-#if !CONFIG_CB4X4
-  // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
-  // context so we only need to allocate 1 for each 8x8 block.
-  for (i = 0; i < leaf_nodes; ++i) {
-#if CONFIG_EXT_PARTITION_TYPES
-    alloc_mode_context(cm, 4, PARTITION_NONE, &td->leaf_tree[i]);
-#else
-    alloc_mode_context(cm, 16, &td->leaf_tree[i]);
-#endif
-  }
-#endif
-
   // Sets up all the leaf nodes in the tree.
   for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
     PC_TREE *const tree = &td->pc_tree[pc_tree_index];
     tree->block_size = square[0];
-#if CONFIG_CB4X4
-    alloc_tree_contexts(cm, tree, 16);
-#else
-    alloc_tree_contexts(cm, tree, 4);
-#endif
-#if !CONFIG_CB4X4
-    tree->leaf_split[0] = this_leaf++;
-    for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0];
-#endif
+    alloc_tree_contexts(cm, tree, 16, 1);
   }
 
   // Each node has 4 leaf nodes, fill each block_size level of the tree
@@ -263,11 +155,7 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
   for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
     for (i = 0; i < nodes; ++i) {
       PC_TREE *const tree = &td->pc_tree[pc_tree_index];
-#if CONFIG_CB4X4
-      alloc_tree_contexts(cm, tree, 16 << (2 * square_index));
-#else
-      alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
-#endif
+      alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0);
       tree->block_size = square[square_index];
       for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
       ++pc_tree_index;
@@ -286,35 +174,41 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
   }
 }
 
-void av1_free_pc_tree(ThreadData *td) {
-#if CONFIG_CB4X4
-#if CONFIG_EXT_PARTITION
+void av1_free_pc_tree(ThreadData *td, const int num_planes) {
   const int tree_nodes_inc = 1024;
-#else
-  const int tree_nodes_inc = 256;
-#endif  // CONFIG_EXT_PARTITION
-#else
-  const int tree_nodes_inc = 0;
-#endif
 
-#if CONFIG_EXT_PARTITION
   const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
-#else
-  const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1;
-#endif  // CONFIG_EXT_PARTITION
   int i;
-  for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]);
+  for (i = 0; i < tree_nodes; ++i)
+    free_tree_contexts(&td->pc_tree[i], num_planes);
   aom_free(td->pc_tree);
   td->pc_tree = NULL;
-#if !CONFIG_CB4X4
-  const int leaf_factor = 1;
-#if CONFIG_EXT_PARTITION
-  const int leaf_nodes = 256 * leaf_factor;
-#else
-  const int leaf_nodes = 64 * leaf_factor;
-#endif  // CONFIG_EXT_PARTITION
-  for (i = 0; i < leaf_nodes; ++i) free_mode_context(&td->leaf_tree[i]);
-  aom_free(td->leaf_tree);
-  td->leaf_tree = NULL;
-#endif
+}
+
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+                           PICK_MODE_CONTEXT *src_ctx) {
+  dst_ctx->mic = src_ctx->mic;
+  dst_ctx->mbmi_ext = src_ctx->mbmi_ext;
+
+  dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk;
+  dst_ctx->skip = src_ctx->skip;
+  dst_ctx->skippable = src_ctx->skippable;
+  dst_ctx->best_mode_index = src_ctx->best_mode_index;
+
+  memcpy(dst_ctx->blk_skip, src_ctx->blk_skip,
+         sizeof(uint8_t) * src_ctx->num_4x4_blk);
+
+  dst_ctx->hybrid_pred_diff = src_ctx->hybrid_pred_diff;
+  dst_ctx->comp_pred_diff = src_ctx->comp_pred_diff;
+  dst_ctx->single_pred_diff = src_ctx->single_pred_diff;
+
+  dst_ctx->rate = src_ctx->rate;
+  dst_ctx->dist = src_ctx->dist;
+  dst_ctx->rdcost = src_ctx->rdcost;
+  dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
+
+  memcpy(dst_ctx->pred_mv, src_ctx->pred_mv, sizeof(MV) * REF_FRAMES);
+  dst_ctx->pred_interp_filter = src_ctx->pred_interp_filter;
+
+  dst_ctx->partition = src_ctx->partition;
 }
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
index 38052ba27..c05f48a7a 100644
--- a/third_party/aom/av1/encoder/context_tree.h
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -23,28 +23,29 @@ struct AV1_COMP;
 struct AV1Common;
 struct ThreadData;
 
+typedef enum {
+  // Search all the partition types in this plane.
+  SEARCH_FULL_PLANE = 0,
+  // Only search none_partition coding block.
+  NONE_PARTITION_PLANE = 1,
+  // Search all the partition types in this plane except split.
+  SEARCH_SAME_PLANE = 2,
+  // Skip search partition on this plane. Go split directly.
+  SPLIT_PLANE = 3,
+} CB_TREE_SEARCH;
+
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
-  MODE_INFO mic;
+  MB_MODE_INFO mic;
   MB_MODE_INFO_EXT mbmi_ext;
   uint8_t *color_index_map[2];
-#if CONFIG_MRC_TX
-  uint8_t *mrc_mask;
-#endif  // CONFIG_MRC_TX
-#if CONFIG_VAR_TX
-  uint8_t *blk_skip[MAX_MB_PLANE];
-#endif
+  uint8_t *blk_skip;
 
   tran_low_t *coeff[MAX_MB_PLANE];
   tran_low_t *qcoeff[MAX_MB_PLANE];
   tran_low_t *dqcoeff[MAX_MB_PLANE];
-#if CONFIG_PVQ
-  tran_low_t *pvq_ref_coeff[MAX_MB_PLANE];
-#endif
   uint16_t *eobs[MAX_MB_PLANE];
-#if CONFIG_LV_MAP
   uint8_t *txb_entropy_ctx[MAX_MB_PLANE];
-#endif
 
   int num_4x4_blk;
   int skip;
@@ -60,16 +61,27 @@ typedef struct {
   // scope of refactoring.
   int rate;
   int64_t dist;
+  int64_t rdcost;
+  int rd_mode_is_ready;  // Flag to indicate whether rd pick mode decision has
+                         // been made.
 
   // motion vector cache for adaptive motion search control in partition
   // search loop
-  MV pred_mv[TOTAL_REFS_PER_FRAME];
+  MV pred_mv[REF_FRAMES];
   InterpFilter pred_interp_filter;
-#if CONFIG_EXT_PARTITION_TYPES
   PARTITION_TYPE partition;
-#endif
 } PICK_MODE_CONTEXT;
 
+typedef struct {
+  int valid;
+  int split;
+  int skip;
+  int64_t rdcost;
+  int sub_block_split[4];
+  int sub_block_skip[4];
+  int64_t sub_block_rdcost[4];
+} PC_TREE_STATS;
+
 typedef struct PC_TREE {
   int index;
   PARTITION_TYPE partitioning;
@@ -77,34 +89,21 @@ typedef struct PC_TREE {
   PICK_MODE_CONTEXT none;
   PICK_MODE_CONTEXT horizontal[2];
   PICK_MODE_CONTEXT vertical[2];
-#if CONFIG_EXT_PARTITION_TYPES
   PICK_MODE_CONTEXT horizontala[3];
   PICK_MODE_CONTEXT horizontalb[3];
   PICK_MODE_CONTEXT verticala[3];
   PICK_MODE_CONTEXT verticalb[3];
   PICK_MODE_CONTEXT horizontal4[4];
   PICK_MODE_CONTEXT vertical4[4];
-#endif
-  // TODO(jingning): remove leaf_split[] when cb4x4 experiment flag is removed.
-  union {
-    struct PC_TREE *split[4];
-    PICK_MODE_CONTEXT *leaf_split[4];
-  };
-#if CONFIG_SUPERTX
-  PICK_MODE_CONTEXT horizontal_supertx;
-  PICK_MODE_CONTEXT vertical_supertx;
-  PICK_MODE_CONTEXT split_supertx;
-#if CONFIG_EXT_PARTITION_TYPES
-  PICK_MODE_CONTEXT horizontala_supertx;
-  PICK_MODE_CONTEXT horizontalb_supertx;
-  PICK_MODE_CONTEXT verticala_supertx;
-  PICK_MODE_CONTEXT verticalb_supertx;
-#endif
-#endif
+  CB_TREE_SEARCH cb_search_range;
+  struct PC_TREE *split[4];
+  PC_TREE_STATS pc_tree_stats;
 } PC_TREE;
 
 void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td);
-void av1_free_pc_tree(struct ThreadData *td);
+void av1_free_pc_tree(struct ThreadData *td, const int num_planes);
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+                           PICK_MODE_CONTEXT *src_ctx);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/encoder/corner_match.c b/third_party/aom/av1/encoder/corner_match.c
index 3827b65fa..29e934deb 100644
--- a/third_party/aom/av1/encoder/corner_match.c
+++ b/third_party/aom/av1/encoder/corner_match.c
@@ -13,7 +13,8 @@
 #include <memory.h>
 #include <math.h>
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "av1/encoder/corner_match.h"
 
 #define SEARCH_SZ 9
diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c
index e33df53e4..323e2aed5 100644
--- a/third_party/aom/av1/encoder/cost.c
+++ b/third_party/aom/av1/encoder/cost.c
@@ -13,65 +13,26 @@
 #include "av1/encoder/cost.h"
 #include "av1/common/entropy.h"
 
-/* round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT))
-   Begins with a bogus entry for simpler addressing. */
-const uint16_t av1_prob_cost[256] = {
-  4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325, 2260,
-  2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780, 1748, 1718,
-  1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470, 1449, 1429, 1409,
-  1390, 1371, 1353, 1335, 1318, 1301, 1284, 1268, 1252, 1236, 1221, 1206, 1192,
-  1177, 1163, 1149, 1136, 1123, 1110, 1097, 1084, 1072, 1059, 1047, 1036, 1024,
-  1013, 1001, 990,  979,  968,  958,  947,  937,  927,  917,  907,  897,  887,
-  878,  868,  859,  850,  841,  832,  823,  814,  806,  797,  789,  780,  772,
-  764,  756,  748,  740,  732,  724,  717,  709,  702,  694,  687,  680,  673,
-  665,  658,  651,  644,  637,  631,  624,  617,  611,  604,  598,  591,  585,
-  578,  572,  566,  560,  554,  547,  541,  535,  530,  524,  518,  512,  506,
-  501,  495,  489,  484,  478,  473,  467,  462,  456,  451,  446,  441,  435,
-  430,  425,  420,  415,  410,  405,  400,  395,  390,  385,  380,  375,  371,
-  366,  361,  356,  352,  347,  343,  338,  333,  329,  324,  320,  316,  311,
-  307,  302,  298,  294,  289,  285,  281,  277,  273,  268,  264,  260,  256,
-  252,  248,  244,  240,  236,  232,  228,  224,  220,  216,  212,  209,  205,
-  201,  197,  194,  190,  186,  182,  179,  175,  171,  168,  164,  161,  157,
-  153,  150,  146,  143,  139,  136,  132,  129,  125,  122,  119,  115,  112,
-  109,  105,  102,  99,   95,   92,   89,   86,   82,   79,   76,   73,   70,
-  66,   63,   60,   57,   54,   51,   48,   45,   42,   38,   35,   32,   29,
-  26,   23,   20,   18,   15,   12,   9,    6,    3
+// round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)); i = 128~255.
+const uint16_t av1_prob_cost[128] = {
+  512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+  430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361,
+  356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294,
+  289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232,
+  228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175,
+  171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122,
+  119, 115, 112, 109, 105, 102, 99,  95,  92,  89,  86,  82,  79,  76,  73,
+  70,  66,  63,  60,  57,  54,  51,  48,  45,  42,  38,  35,  32,  29,  26,
+  23,  20,  18,  15,  12,  9,   6,   3,
 };
 
-static void cost(int *costs, aom_tree tree, const aom_prob *probs, int i,
-                 int c) {
-  const aom_prob prob = probs[i / 2];
-  int b;
-
-  assert(prob != 0);
-  for (b = 0; b <= 1; ++b) {
-    const int cc = c + av1_cost_bit(prob, b);
-    const aom_tree_index ii = tree[i + b];
-
-    if (ii <= 0)
-      costs[-ii] = cc;
-    else
-      cost(costs, tree, probs, ii, cc);
-  }
-}
-
-void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree) {
-  cost(costs, tree, probs, 0, 0);
-}
-
-void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree) {
-  assert(tree[0] <= 0 && tree[1] > 0);
-
-  costs[-tree[0]] = av1_cost_bit(probs[0], 0);
-  cost(costs, tree, probs, 2, 0);
-}
-
 void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
                               const int *inv_map) {
   int i;
   aom_cdf_prob prev_cdf = 0;
   for (i = 0;; ++i) {
-    const aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf;
+    aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf;
+    p15 = (p15 < EC_MIN_PROB) ? EC_MIN_PROB : p15;
     prev_cdf = AOM_ICDF(cdf[i]);
 
     if (inv_map)
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
index e60632005..5de7765c5 100644
--- a/third_party/aom/av1/encoder/cost.h
+++ b/third_party/aom/av1/encoder/cost.h
@@ -19,17 +19,11 @@
 extern "C" {
 #endif
 
-extern const uint16_t av1_prob_cost[256];
+extern const uint16_t av1_prob_cost[128];
 
 // The factor to scale from cost in bits to cost in av1_prob_cost units.
 #define AV1_PROB_COST_SHIFT 9
 
-#define av1_cost_zero(prob) (av1_prob_cost[prob])
-
-#define av1_cost_one(prob) av1_cost_zero(256 - (prob))
-
-#define av1_cost_bit(prob, bit) av1_cost_zero((bit) ? 256 - (prob) : (prob))
-
 // Cost of coding an n bit literal, using 128 (i.e. 50%) probability
 // for each bit.
 #define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT))
@@ -38,31 +32,11 @@ extern const uint16_t av1_prob_cost[256];
 static INLINE int av1_cost_symbol(aom_cdf_prob p15) {
   assert(0 < p15 && p15 < CDF_PROB_TOP);
   const int shift = CDF_PROB_BITS - 1 - get_msb(p15);
-  return av1_cost_zero(get_prob(p15 << shift, CDF_PROB_TOP)) +
-         av1_cost_literal(shift);
-}
-
-static INLINE unsigned int cost_branch256(const unsigned int ct[2],
-                                          aom_prob p) {
-  return ct[0] * av1_cost_zero(p) + ct[1] * av1_cost_one(p);
-}
-
-static INLINE int treed_cost(aom_tree tree, const aom_prob *probs, int bits,
-                             int len) {
-  int cost = 0;
-  aom_tree_index i = 0;
-
-  do {
-    const int bit = (bits >> --len) & 1;
-    cost += av1_cost_bit(probs[i >> 1], bit);
-    i = tree[i + bit];
-  } while (len);
-
-  return cost;
+  const int prob = get_prob(p15 << shift, CDF_PROB_TOP);
+  assert(prob >= 128);
+  return av1_prob_cost[prob - 128] + av1_cost_literal(shift);
 }
 
-void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree);
-void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree);
 void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
                               const int *inv_map);
 
diff --git a/third_party/aom/av1/encoder/daala_compat_enc.c b/third_party/aom/av1/encoder/daala_compat_enc.c
deleted file mode 100644
index c60e2d3d7..000000000
--- a/third_party/aom/av1/encoder/daala_compat_enc.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "encint.h"
-
-void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf) {
-#if !CONFIG_ANS
-  od_ec_enc_checkpoint(&rbuf->ec, &enc->w.ec);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-  OD_COPY(&rbuf->adapt, enc->state.adapt, 1);
-}
-
-void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf) {
-#if !CONFIG_ANS
-  od_ec_enc_rollback(&enc->w.ec, &rbuf->ec);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-  OD_COPY(enc->state.adapt, &rbuf->adapt, 1);
-}
diff --git a/third_party/aom/av1/encoder/dct.c b/third_party/aom/av1/encoder/dct.c
deleted file mode 100644
index a04d46b72..000000000
--- a/third_party/aom/av1/encoder/dct.c
+++ /dev/null
@@ -1,2797 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <math.h>
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
-#include "aom_dsp/fwd_txfm.h"
-#include "aom_ports/mem.h"
-#include "av1/common/blockd.h"
-#include "av1/common/av1_fwd_txfm1d.h"
-#include "av1/common/av1_fwd_txfm1d_cfg.h"
-#include "av1/common/idct.h"
-#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
-    CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
-#include "av1/common/daala_tx.h"
-#endif
-
-static INLINE void range_check(const tran_low_t *input, const int size,
-                               const int bit) {
-#if 0  // CONFIG_COEFFICIENT_RANGE_CHECKING
-// TODO(angiebird): the range_check is not used because the bit range
-// in fdct# is not correct. Since we are going to merge in a new version
-// of fdct# from nextgenv2, we won't fix the incorrect bit range now.
-  int i;
-  for (i = 0; i < size; ++i) {
-    assert(abs(input[i]) < (1 << bit));
-  }
-#else
-  (void)input;
-  (void)size;
-  (void)bit;
-#endif
-}
-
-static void fdct4(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t temp;
-  tran_low_t step[4];
-
-  // stage 0
-  range_check(input, 4, 14);
-
-  // stage 1
-  output[0] = input[0] + input[3];
-  output[1] = input[1] + input[2];
-  output[2] = input[1] - input[2];
-  output[3] = input[0] - input[3];
-
-  range_check(output, 4, 15);
-
-  // stage 2
-  temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
-  step[0] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
-  step[1] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
-  step[2] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
-  step[3] = (tran_low_t)fdct_round_shift(temp);
-
-  range_check(step, 4, 16);
-
-  // stage 3
-  output[0] = step[0];
-  output[1] = step[2];
-  output[2] = step[1];
-  output[3] = step[3];
-
-  range_check(output, 4, 16);
-}
-
-static void fdct8(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t temp;
-  tran_low_t step[8];
-
-  // stage 0
-  range_check(input, 8, 13);
-
-  // stage 1
-  output[0] = input[0] + input[7];
-  output[1] = input[1] + input[6];
-  output[2] = input[2] + input[5];
-  output[3] = input[3] + input[4];
-  output[4] = input[3] - input[4];
-  output[5] = input[2] - input[5];
-  output[6] = input[1] - input[6];
-  output[7] = input[0] - input[7];
-
-  range_check(output, 8, 14);
-
-  // stage 2
-  step[0] = output[0] + output[3];
-  step[1] = output[1] + output[2];
-  step[2] = output[1] - output[2];
-  step[3] = output[0] - output[3];
-  step[4] = output[4];
-  temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
-  step[5] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
-  step[6] = (tran_low_t)fdct_round_shift(temp);
-  step[7] = output[7];
-
-  range_check(step, 8, 15);
-
-  // stage 3
-  temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
-  output[0] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
-  output[1] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
-  output[2] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
-  output[3] = (tran_low_t)fdct_round_shift(temp);
-  output[4] = step[4] + step[5];
-  output[5] = step[4] - step[5];
-  output[6] = step[7] - step[6];
-  output[7] = step[7] + step[6];
-
-  range_check(output, 8, 16);
-
-  // stage 4
-  step[0] = output[0];
-  step[1] = output[1];
-  step[2] = output[2];
-  step[3] = output[3];
-  temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
-  step[4] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
-  step[5] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
-  step[6] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
-  step[7] = (tran_low_t)fdct_round_shift(temp);
-
-  range_check(step, 8, 16);
-
-  // stage 5
-  output[0] = step[0];
-  output[1] = step[4];
-  output[2] = step[2];
-  output[3] = step[6];
-  output[4] = step[1];
-  output[5] = step[5];
-  output[6] = step[3];
-  output[7] = step[7];
-
-  range_check(output, 8, 16);
-}
-
-static void fdct16(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t temp;
-  tran_low_t step[16];
-
-  // stage 0
-  range_check(input, 16, 13);
-
-  // stage 1
-  output[0] = input[0] + input[15];
-  output[1] = input[1] + input[14];
-  output[2] = input[2] + input[13];
-  output[3] = input[3] + input[12];
-  output[4] = input[4] + input[11];
-  output[5] = input[5] + input[10];
-  output[6] = input[6] + input[9];
-  output[7] = input[7] + input[8];
-  output[8] = input[7] - input[8];
-  output[9] = input[6] - input[9];
-  output[10] = input[5] - input[10];
-  output[11] = input[4] - input[11];
-  output[12] = input[3] - input[12];
-  output[13] = input[2] - input[13];
-  output[14] = input[1] - input[14];
-  output[15] = input[0] - input[15];
-
-  range_check(output, 16, 14);
-
-  // stage 2
-  step[0] = output[0] + output[7];
-  step[1] = output[1] + output[6];
-  step[2] = output[2] + output[5];
-  step[3] = output[3] + output[4];
-  step[4] = output[3] - output[4];
-  step[5] = output[2] - output[5];
-  step[6] = output[1] - output[6];
-  step[7] = output[0] - output[7];
-  step[8] = output[8];
-  step[9] = output[9];
-  temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64;
-  step[10] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64;
-  step[11] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[12] * cospi_16_64 + output[11] * cospi_16_64;
-  step[12] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[13] * cospi_16_64 + output[10] * cospi_16_64;
-  step[13] = (tran_low_t)fdct_round_shift(temp);
-  step[14] = output[14];
-  step[15] = output[15];
-
-  range_check(step, 16, 15);
-
-  // stage 3
-  output[0] = step[0] + step[3];
-  output[1] = step[1] + step[2];
-  output[2] = step[1] - step[2];
-  output[3] = step[0] - step[3];
-  output[4] = step[4];
-  temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64;
-  output[5] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[6] * cospi_16_64 + step[5] * cospi_16_64;
-  output[6] = (tran_low_t)fdct_round_shift(temp);
-  output[7] = step[7];
-  output[8] = step[8] + step[11];
-  output[9] = step[9] + step[10];
-  output[10] = step[9] - step[10];
-  output[11] = step[8] - step[11];
-  output[12] = step[15] - step[12];
-  output[13] = step[14] - step[13];
-  output[14] = step[14] + step[13];
-  output[15] = step[15] + step[12];
-
-  range_check(output, 16, 16);
-
-  // stage 4
-  temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
-  step[0] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
-  step[1] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
-  step[2] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
-  step[3] = (tran_low_t)fdct_round_shift(temp);
-  step[4] = output[4] + output[5];
-  step[5] = output[4] - output[5];
-  step[6] = output[7] - output[6];
-  step[7] = output[7] + output[6];
-  step[8] = output[8];
-  temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64;
-  step[9] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64;
-  step[10] = (tran_low_t)fdct_round_shift(temp);
-  step[11] = output[11];
-  step[12] = output[12];
-  temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64;
-  step[13] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[14] * cospi_8_64 + output[9] * cospi_24_64;
-  step[14] = (tran_low_t)fdct_round_shift(temp);
-  step[15] = output[15];
-
-  range_check(step, 16, 16);
-
-  // stage 5
-  output[0] = step[0];
-  output[1] = step[1];
-  output[2] = step[2];
-  output[3] = step[3];
-  temp = step[4] * cospi_28_64 + step[7] * cospi_4_64;
-  output[4] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[5] * cospi_12_64 + step[6] * cospi_20_64;
-  output[5] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64;
-  output[6] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64;
-  output[7] = (tran_low_t)fdct_round_shift(temp);
-  output[8] = step[8] + step[9];
-  output[9] = step[8] - step[9];
-  output[10] = step[11] - step[10];
-  output[11] = step[11] + step[10];
-  output[12] = step[12] + step[13];
-  output[13] = step[12] - step[13];
-  output[14] = step[15] - step[14];
-  output[15] = step[15] + step[14];
-
-  range_check(output, 16, 16);
-
-  // stage 6
-  step[0] = output[0];
-  step[1] = output[1];
-  step[2] = output[2];
-  step[3] = output[3];
-  step[4] = output[4];
-  step[5] = output[5];
-  step[6] = output[6];
-  step[7] = output[7];
-  temp = output[8] * cospi_30_64 + output[15] * cospi_2_64;
-  step[8] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[9] * cospi_14_64 + output[14] * cospi_18_64;
-  step[9] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[10] * cospi_22_64 + output[13] * cospi_10_64;
-  step[10] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[11] * cospi_6_64 + output[12] * cospi_26_64;
-  step[11] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64;
-  step[12] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64;
-  step[13] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64;
-  step[14] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64;
-  step[15] = (tran_low_t)fdct_round_shift(temp);
-
-  range_check(step, 16, 16);
-
-  // stage 7
-  output[0] = step[0];
-  output[1] = step[8];
-  output[2] = step[4];
-  output[3] = step[12];
-  output[4] = step[2];
-  output[5] = step[10];
-  output[6] = step[6];
-  output[7] = step[14];
-  output[8] = step[1];
-  output[9] = step[9];
-  output[10] = step[5];
-  output[11] = step[13];
-  output[12] = step[3];
-  output[13] = step[11];
-  output[14] = step[7];
-  output[15] = step[15];
-
-  range_check(output, 16, 16);
-}
-
-static void fdct32(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t temp;
-  tran_low_t step[32];
-
-  // stage 0
-  range_check(input, 32, 14);
-
-  // stage 1
-  output[0] = input[0] + input[31];
-  output[1] = input[1] + input[30];
-  output[2] = input[2] + input[29];
-  output[3] = input[3] + input[28];
-  output[4] = input[4] + input[27];
-  output[5] = input[5] + input[26];
-  output[6] = input[6] + input[25];
-  output[7] = input[7] + input[24];
-  output[8] = input[8] + input[23];
-  output[9] = input[9] + input[22];
-  output[10] = input[10] + input[21];
-  output[11] = input[11] + input[20];
-  output[12] = input[12] + input[19];
-  output[13] = input[13] + input[18];
-  output[14] = input[14] + input[17];
-  output[15] = input[15] + input[16];
-  output[16] = input[15] - input[16];
-  output[17] = input[14] - input[17];
-  output[18] = input[13] - input[18];
-  output[19] = input[12] - input[19];
-  output[20] = input[11] - input[20];
-  output[21] = input[10] - input[21];
-  output[22] = input[9] - input[22];
-  output[23] = input[8] - input[23];
-  output[24] = input[7] - input[24];
-  output[25] = input[6] - input[25];
-  output[26] = input[5] - input[26];
-  output[27] = input[4] - input[27];
-  output[28] = input[3] - input[28];
-  output[29] = input[2] - input[29];
-  output[30] = input[1] - input[30];
-  output[31] = input[0] - input[31];
-
-  range_check(output, 32, 15);
-
-  // stage 2
-  step[0] = output[0] + output[15];
-  step[1] = output[1] + output[14];
-  step[2] = output[2] + output[13];
-  step[3] = output[3] + output[12];
-  step[4] = output[4] + output[11];
-  step[5] = output[5] + output[10];
-  step[6] = output[6] + output[9];
-  step[7] = output[7] + output[8];
-  step[8] = output[7] - output[8];
-  step[9] = output[6] - output[9];
-  step[10] = output[5] - output[10];
-  step[11] = output[4] - output[11];
-  step[12] = output[3] - output[12];
-  step[13] = output[2] - output[13];
-  step[14] = output[1] - output[14];
-  step[15] = output[0] - output[15];
-  step[16] = output[16];
-  step[17] = output[17];
-  step[18] = output[18];
-  step[19] = output[19];
-  temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64;
-  step[20] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64;
-  step[21] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64;
-  step[22] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64;
-  step[23] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[24] * cospi_16_64 + output[23] * cospi_16_64;
-  step[24] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[25] * cospi_16_64 + output[22] * cospi_16_64;
-  step[25] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[26] * cospi_16_64 + output[21] * cospi_16_64;
-  step[26] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[27] * cospi_16_64 + output[20] * cospi_16_64;
-  step[27] = (tran_low_t)fdct_round_shift(temp);
-  step[28] = output[28];
-  step[29] = output[29];
-  step[30] = output[30];
-  step[31] = output[31];
-
-  range_check(step, 32, 16);
-
-  // stage 3
-  output[0] = step[0] + step[7];
-  output[1] = step[1] + step[6];
-  output[2] = step[2] + step[5];
-  output[3] = step[3] + step[4];
-  output[4] = step[3] - step[4];
-  output[5] = step[2] - step[5];
-  output[6] = step[1] - step[6];
-  output[7] = step[0] - step[7];
-  output[8] = step[8];
-  output[9] = step[9];
-  temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64;
-  output[10] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64;
-  output[11] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[12] * cospi_16_64 + step[11] * cospi_16_64;
-  output[12] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[13] * cospi_16_64 + step[10] * cospi_16_64;
-  output[13] = (tran_low_t)fdct_round_shift(temp);
-  output[14] = step[14];
-  output[15] = step[15];
-  output[16] = step[16] + step[23];
-  output[17] = step[17] + step[22];
-  output[18] = step[18] + step[21];
-  output[19] = step[19] + step[20];
-  output[20] = step[19] - step[20];
-  output[21] = step[18] - step[21];
-  output[22] = step[17] - step[22];
-  output[23] = step[16] - step[23];
-  output[24] = step[31] - step[24];
-  output[25] = step[30] - step[25];
-  output[26] = step[29] - step[26];
-  output[27] = step[28] - step[27];
-  output[28] = step[28] + step[27];
-  output[29] = step[29] + step[26];
-  output[30] = step[30] + step[25];
-  output[31] = step[31] + step[24];
-
-  range_check(output, 32, 17);
-
-  // stage 4
-  step[0] = output[0] + output[3];
-  step[1] = output[1] + output[2];
-  step[2] = output[1] - output[2];
-  step[3] = output[0] - output[3];
-  step[4] = output[4];
-  temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
-  step[5] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
-  step[6] = (tran_low_t)fdct_round_shift(temp);
-  step[7] = output[7];
-  step[8] = output[8] + output[11];
-  step[9] = output[9] + output[10];
-  step[10] = output[9] - output[10];
-  step[11] = output[8] - output[11];
-  step[12] = output[15] - output[12];
-  step[13] = output[14] - output[13];
-  step[14] = output[14] + output[13];
-  step[15] = output[15] + output[12];
-  step[16] = output[16];
-  step[17] = output[17];
-  temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64;
-  step[18] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64;
-  step[19] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64;
-  step[20] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64;
-  step[21] = (tran_low_t)fdct_round_shift(temp);
-  step[22] = output[22];
-  step[23] = output[23];
-  step[24] = output[24];
-  step[25] = output[25];
-  temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64;
-  step[26] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64;
-  step[27] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[28] * cospi_8_64 + output[19] * cospi_24_64;
-  step[28] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[29] * cospi_8_64 + output[18] * cospi_24_64;
-  step[29] = (tran_low_t)fdct_round_shift(temp);
-  step[30] = output[30];
-  step[31] = output[31];
-
-  range_check(step, 32, 18);
-
-  // stage 5
-  temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
-  output[0] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
-  output[1] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
-  output[2] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
-  output[3] = (tran_low_t)fdct_round_shift(temp);
-  output[4] = step[4] + step[5];
-  output[5] = step[4] - step[5];
-  output[6] = step[7] - step[6];
-  output[7] = step[7] + step[6];
-  output[8] = step[8];
-  temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64;
-  output[9] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64;
-  output[10] = (tran_low_t)fdct_round_shift(temp);
-  output[11] = step[11];
-  output[12] = step[12];
-  temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64;
-  output[13] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[14] * cospi_8_64 + step[9] * cospi_24_64;
-  output[14] = (tran_low_t)fdct_round_shift(temp);
-  output[15] = step[15];
-  output[16] = step[16] + step[19];
-  output[17] = step[17] + step[18];
-  output[18] = step[17] - step[18];
-  output[19] = step[16] - step[19];
-  output[20] = step[23] - step[20];
-  output[21] = step[22] - step[21];
-  output[22] = step[22] + step[21];
-  output[23] = step[23] + step[20];
-  output[24] = step[24] + step[27];
-  output[25] = step[25] + step[26];
-  output[26] = step[25] - step[26];
-  output[27] = step[24] - step[27];
-  output[28] = step[31] - step[28];
-  output[29] = step[30] - step[29];
-  output[30] = step[30] + step[29];
-  output[31] = step[31] + step[28];
-
-  range_check(output, 32, 18);
-
-  // stage 6
-  step[0] = output[0];
-  step[1] = output[1];
-  step[2] = output[2];
-  step[3] = output[3];
-  temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
-  step[4] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
-  step[5] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
-  step[6] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
-  step[7] = (tran_low_t)fdct_round_shift(temp);
-  step[8] = output[8] + output[9];
-  step[9] = output[8] - output[9];
-  step[10] = output[11] - output[10];
-  step[11] = output[11] + output[10];
-  step[12] = output[12] + output[13];
-  step[13] = output[12] - output[13];
-  step[14] = output[15] - output[14];
-  step[15] = output[15] + output[14];
-  step[16] = output[16];
-  temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64;
-  step[17] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64;
-  step[18] = (tran_low_t)fdct_round_shift(temp);
-  step[19] = output[19];
-  step[20] = output[20];
-  temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64;
-  step[21] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64;
-  step[22] = (tran_low_t)fdct_round_shift(temp);
-  step[23] = output[23];
-  step[24] = output[24];
-  temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64;
-  step[25] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[26] * cospi_20_64 + output[21] * cospi_12_64;
-  step[26] = (tran_low_t)fdct_round_shift(temp);
-  step[27] = output[27];
-  step[28] = output[28];
-  temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64;
-  step[29] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[30] * cospi_4_64 + output[17] * cospi_28_64;
-  step[30] = (tran_low_t)fdct_round_shift(temp);
-  step[31] = output[31];
-
-  range_check(step, 32, 18);
-
-  // stage 7
-  output[0] = step[0];
-  output[1] = step[1];
-  output[2] = step[2];
-  output[3] = step[3];
-  output[4] = step[4];
-  output[5] = step[5];
-  output[6] = step[6];
-  output[7] = step[7];
-  temp = step[8] * cospi_30_64 + step[15] * cospi_2_64;
-  output[8] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[9] * cospi_14_64 + step[14] * cospi_18_64;
-  output[9] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[10] * cospi_22_64 + step[13] * cospi_10_64;
-  output[10] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[11] * cospi_6_64 + step[12] * cospi_26_64;
-  output[11] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64;
-  output[12] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64;
-  output[13] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64;
-  output[14] = (tran_low_t)fdct_round_shift(temp);
-  temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64;
-  output[15] = (tran_low_t)fdct_round_shift(temp);
-  output[16] = step[16] + step[17];
-  output[17] = step[16] - step[17];
-  output[18] = step[19] - step[18];
-  output[19] = step[19] + step[18];
-  output[20] = step[20] + step[21];
-  output[21] = step[20] - step[21];
-  output[22] = step[23] - step[22];
-  output[23] = step[23] + step[22];
-  output[24] = step[24] + step[25];
-  output[25] = step[24] - step[25];
-  output[26] = step[27] - step[26];
-  output[27] = step[27] + step[26];
-  output[28] = step[28] + step[29];
-  output[29] = step[28] - step[29];
-  output[30] = step[31] - step[30];
-  output[31] = step[31] + step[30];
-
-  range_check(output, 32, 18);
-
-  // stage 8
-  step[0] = output[0];
-  step[1] = output[1];
-  step[2] = output[2];
-  step[3] = output[3];
-  step[4] = output[4];
-  step[5] = output[5];
-  step[6] = output[6];
-  step[7] = output[7];
-  step[8] = output[8];
-  step[9] = output[9];
-  step[10] = output[10];
-  step[11] = output[11];
-  step[12] = output[12];
-  step[13] = output[13];
-  step[14] = output[14];
-  step[15] = output[15];
-  temp = output[16] * cospi_31_64 + output[31] * cospi_1_64;
-  step[16] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[17] * cospi_15_64 + output[30] * cospi_17_64;
-  step[17] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[18] * cospi_23_64 + output[29] * cospi_9_64;
-  step[18] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[19] * cospi_7_64 + output[28] * cospi_25_64;
-  step[19] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[20] * cospi_27_64 + output[27] * cospi_5_64;
-  step[20] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[21] * cospi_11_64 + output[26] * cospi_21_64;
-  step[21] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[22] * cospi_19_64 + output[25] * cospi_13_64;
-  step[22] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[23] * cospi_3_64 + output[24] * cospi_29_64;
-  step[23] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64;
-  step[24] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64;
-  step[25] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64;
-  step[26] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64;
-  step[27] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64;
-  step[28] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[29] * cospi_23_64 + output[18] * -cospi_9_64;
-  step[29] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64;
-  step[30] = (tran_low_t)fdct_round_shift(temp);
-  temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64;
-  step[31] = (tran_low_t)fdct_round_shift(temp);
-
-  range_check(step, 32, 18);
-
-  // stage 9
-  output[0] = step[0];
-  output[1] = step[16];
-  output[2] = step[8];
-  output[3] = step[24];
-  output[4] = step[4];
-  output[5] = step[20];
-  output[6] = step[12];
-  output[7] = step[28];
-  output[8] = step[2];
-  output[9] = step[18];
-  output[10] = step[10];
-  output[11] = step[26];
-  output[12] = step[6];
-  output[13] = step[22];
-  output[14] = step[14];
-  output[15] = step[30];
-  output[16] = step[1];
-  output[17] = step[17];
-  output[18] = step[9];
-  output[19] = step[25];
-  output[20] = step[5];
-  output[21] = step[21];
-  output[22] = step[13];
-  output[23] = step[29];
-  output[24] = step[3];
-  output[25] = step[19];
-  output[26] = step[11];
-  output[27] = step[27];
-  output[28] = step[7];
-  output[29] = step[23];
-  output[30] = step[15];
-  output[31] = step[31];
-
-  range_check(output, 32, 18);
-}
-
-#ifndef AV1_DCT_GTEST
-static void fadst4(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t x0, x1, x2, x3;
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  x0 = input[0];
-  x1 = input[1];
-  x2 = input[2];
-  x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_4_9 * x0;
-  s2 = sinpi_2_9 * x1;
-  s3 = sinpi_1_9 * x1;
-  s4 = sinpi_3_9 * x2;
-  s5 = sinpi_4_9 * x3;
-  s6 = sinpi_2_9 * x3;
-  s7 = x0 + x1 - x3;
-
-  x0 = s0 + s2 + s5;
-  x1 = sinpi_3_9 * s7;
-  x2 = s1 - s3 + s6;
-  x3 = s4;
-
-  s0 = x0 + x3;
-  s1 = x1;
-  s2 = x2 - x3;
-  s3 = x2 - x0 + x3;
-
-  // 1-D transform scaling factor is sqrt(2).
-  output[0] = (tran_low_t)fdct_round_shift(s0);
-  output[1] = (tran_low_t)fdct_round_shift(s1);
-  output[2] = (tran_low_t)fdct_round_shift(s2);
-  output[3] = (tran_low_t)fdct_round_shift(s3);
-}
-
-static void fadst8(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_high_t x0 = input[7];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[5];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[3];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[1];
-  tran_high_t x7 = input[6];
-
-  // stage 1
-  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
-  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
-
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = fdct_round_shift(s0 - s4);
-  x5 = fdct_round_shift(s1 - s5);
-  x6 = fdct_round_shift(s2 - s6);
-  x7 = fdct_round_shift(s3 - s7);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
-  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
-  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
-  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
-
-  x0 = fdct_round_shift(s0 + s2);
-  x1 = fdct_round_shift(s1 + s3);
-  x2 = fdct_round_shift(s0 - s2);
-  x3 = fdct_round_shift(s1 - s3);
-  x4 = fdct_round_shift(s4 + s6);
-  x5 = fdct_round_shift(s5 + s7);
-  x6 = fdct_round_shift(s4 - s6);
-  x7 = fdct_round_shift(s5 - s7);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = fdct_round_shift(s2);
-  x3 = fdct_round_shift(s3);
-  x6 = fdct_round_shift(s6);
-  x7 = fdct_round_shift(s7);
-
-  output[0] = (tran_low_t)x0;
-  output[1] = (tran_low_t)-x4;
-  output[2] = (tran_low_t)x6;
-  output[3] = (tran_low_t)-x2;
-  output[4] = (tran_low_t)x3;
-  output[5] = (tran_low_t)-x7;
-  output[6] = (tran_low_t)x5;
-  output[7] = (tran_low_t)-x1;
-}
-
-static void fadst16(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-  tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
-  tran_high_t x0 = input[15];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[13];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[11];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[9];
-  tran_high_t x7 = input[6];
-  tran_high_t x8 = input[7];
-  tran_high_t x9 = input[8];
-  tran_high_t x10 = input[5];
-  tran_high_t x11 = input[10];
-  tran_high_t x12 = input[3];
-  tran_high_t x13 = input[12];
-  tran_high_t x14 = input[1];
-  tran_high_t x15 = input[14];
-
-  // stage 1
-  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
-  x0 = s0 + s8;
-  x1 = s1 + s9;
-  x2 = s2 + s10;
-  x3 = s3 + s11;
-  x4 = s4 + s12;
-  x5 = s5 + s13;
-  x6 = s6 + s14;
-  x7 = s7 + s15;
-
-  x8 = fdct_round_shift(s0 - s8);
-  x9 = fdct_round_shift(s1 - s9);
-  x10 = fdct_round_shift(s2 - s10);
-  x11 = fdct_round_shift(s3 - s11);
-  x12 = fdct_round_shift(s4 - s12);
-  x13 = fdct_round_shift(s5 - s13);
-  x14 = fdct_round_shift(s6 - s14);
-  x15 = fdct_round_shift(s7 - s15);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = fdct_round_shift(s0 - s4);
-  x5 = fdct_round_shift(s1 - s5);
-  x6 = fdct_round_shift(s2 - s6);
-  x7 = fdct_round_shift(s3 - s7);
-
-  x8 = s8 + s12;
-  x9 = s9 + s13;
-  x10 = s10 + s14;
-  x11 = s11 + s15;
-  x12 = fdct_round_shift(s8 - s12);
-  x13 = fdct_round_shift(s9 - s13);
-  x14 = fdct_round_shift(s10 - s14);
-  x15 = fdct_round_shift(s11 - s15);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
-  x0 = fdct_round_shift(s0 + s2);
-  x1 = fdct_round_shift(s1 + s3);
-  x2 = fdct_round_shift(s0 - s2);
-  x3 = fdct_round_shift(s1 - s3);
-
-  x4 = fdct_round_shift(s4 + s6);
-  x5 = fdct_round_shift(s5 + s7);
-  x6 = fdct_round_shift(s4 - s6);
-  x7 = fdct_round_shift(s5 - s7);
-
-  x8 = fdct_round_shift(s8 + s10);
-  x9 = fdct_round_shift(s9 + s11);
-  x10 = fdct_round_shift(s8 - s10);
-  x11 = fdct_round_shift(s9 - s11);
-
-  x12 = fdct_round_shift(s12 + s14);
-  x13 = fdct_round_shift(s13 + s15);
-  x14 = fdct_round_shift(s12 - s14);
-  x15 = fdct_round_shift(s13 - s15);
-
-  // stage 4
-  s2 = (-cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (-x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (-x10 + x11);
-  s14 = (-cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = fdct_round_shift(s2);
-  x3 = fdct_round_shift(s3);
-  x6 = fdct_round_shift(s6);
-  x7 = fdct_round_shift(s7);
-  x10 = fdct_round_shift(s10);
-  x11 = fdct_round_shift(s11);
-  x14 = fdct_round_shift(s14);
-  x15 = fdct_round_shift(s15);
-
-  output[0] = (tran_low_t)x0;
-  output[1] = (tran_low_t)-x8;
-  output[2] = (tran_low_t)x12;
-  output[3] = (tran_low_t)-x4;
-  output[4] = (tran_low_t)x6;
-  output[5] = (tran_low_t)x14;
-  output[6] = (tran_low_t)x10;
-  output[7] = (tran_low_t)x2;
-  output[8] = (tran_low_t)x3;
-  output[9] = (tran_low_t)x11;
-  output[10] = (tran_low_t)x15;
-  output[11] = (tran_low_t)x7;
-  output[12] = (tran_low_t)x5;
-  output[13] = (tran_low_t)-x13;
-  output[14] = (tran_low_t)x9;
-  output[15] = (tran_low_t)-x1;
-}
-
-// For use in lieu of ADST
-static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[16];
-  for (i = 0; i < 16; ++i) {
-    output[16 + i] = input[i] * 4;
-  }
-  // Multiply input by sqrt(2)
-  for (i = 0; i < 16; ++i) {
-    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
-  }
-  fdct16(inputhalf, output);
-  // Note overall scaling factor is 4 times orthogonal
-}
-
-#if CONFIG_MRC_TX
-static void get_masked_residual32(const int16_t **input, int *input_stride,
-                                  const uint8_t *pred, int pred_stride,
-                                  int16_t *masked_input,
-                                  TxfmParam *txfm_param) {
-  int n_masked_vals = 0;
-  uint8_t *mrc_mask;
-  uint8_t mask_tmp[32 * 32];
-  if ((txfm_param->is_inter && SIGNAL_MRC_MASK_INTER) ||
-      (!txfm_param->is_inter && SIGNAL_MRC_MASK_INTRA)) {
-    mrc_mask = txfm_param->mask;
-    n_masked_vals = get_mrc_diff_mask(*input, *input_stride, mrc_mask, 32, 32,
-                                      32, txfm_param->is_inter);
-  } else {
-    mrc_mask = mask_tmp;
-    n_masked_vals = get_mrc_pred_mask(pred, pred_stride, mrc_mask, 32, 32, 32,
-                                      txfm_param->is_inter);
-  }
-
-  // Do not use MRC_DCT if mask is invalid. DCT_DCT will be used instead.
-  if (!is_valid_mrc_mask(n_masked_vals, 32, 32)) {
-    *txfm_param->valid_mask = 0;
-    return;
-  }
-  int32_t sum = 0;
-  int16_t avg;
-  // Get the masked average of the prediction
-  for (int i = 0; i < 32; ++i) {
-    for (int j = 0; j < 32; ++j) {
-      sum += mrc_mask[i * 32 + j] * (*input)[i * (*input_stride) + j];
-    }
-  }
-  avg = sum / n_masked_vals;
-  // Replace all of the unmasked pixels in the prediction with the average
-  // of the masked pixels
-  for (int i = 0; i < 32; ++i) {
-    for (int j = 0; j < 32; ++j)
-      masked_input[i * 32 + j] =
-          (mrc_mask[i * 32 + j]) ? (*input)[i * (*input_stride) + j] : avg;
-  }
-  *input = masked_input;
-  *input_stride = 32;
-  *txfm_param->valid_mask = 1;
-}
-#endif  // CONFIG_MRC_TX
-
-#if CONFIG_LGT || CONFIG_LGT_FROM_PRED
-static void flgt4(const tran_low_t *input, tran_low_t *output,
-                  const tran_high_t *lgtmtx) {
-  if (!lgtmtx) assert(0);
-#if CONFIG_LGT_FROM_PRED
-  // For DCT/ADST, use butterfly implementations
-  if (lgtmtx[0] == DCT4) {
-    fdct4(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST4) {
-    fadst4(input, output);
-    return;
-  }
-#endif  // CONFIG_LGT_FROM_PRED
-
-  // evaluate s[j] = sum of all lgtmtx[j][i]*input[i] over i=1,...,4
-  tran_high_t s[4] = { 0 };
-  for (int i = 0; i < 4; ++i)
-    for (int j = 0; j < 4; ++j) s[j] += lgtmtx[j * 4 + i] * input[i];
-
-  for (int i = 0; i < 4; ++i) output[i] = (tran_low_t)fdct_round_shift(s[i]);
-}
-
-static void flgt8(const tran_low_t *input, tran_low_t *output,
-                  const tran_high_t *lgtmtx) {
-  if (!lgtmtx) assert(0);
-#if CONFIG_LGT_FROM_PRED
-  // For DCT/ADST, use butterfly implementations
-  if (lgtmtx[0] == DCT8) {
-    fdct8(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST8) {
-    fadst8(input, output);
-    return;
-  }
-#endif  // CONFIG_LGT_FROM_PRED
-
-  // evaluate s[j] = sum of all lgtmtx[j][i]*input[i] over i=1,...,8
-  tran_high_t s[8] = { 0 };
-  for (int i = 0; i < 8; ++i)
-    for (int j = 0; j < 8; ++j) s[j] += lgtmtx[j * 8 + i] * input[i];
-
-  for (int i = 0; i < 8; ++i) output[i] = (tran_low_t)fdct_round_shift(s[i]);
-}
-#endif  // CONFIG_LGT || CONFIG_LGT_FROM_PRED
-
-#if CONFIG_LGT_FROM_PRED
-static void flgt16up(const tran_low_t *input, tran_low_t *output,
-                     const tran_high_t *lgtmtx) {
-  if (lgtmtx[0] == DCT16) {
-    fdct16(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST16) {
-    fadst16(input, output);
-    return;
-  } else if (lgtmtx[0] == DCT32) {
-    fdct32(input, output);
-    return;
-  } else if (lgtmtx[0] == ADST32) {
-    fhalfright32(input, output);
-    return;
-  } else {
-    assert(0);
-  }
-}
-
-typedef void (*FlgtFunc)(const tran_low_t *input, tran_low_t *output,
-                         const tran_high_t *lgtmtx);
-
-static FlgtFunc flgt_func[4] = { flgt4, flgt8, flgt16up, flgt16up };
-
-typedef void (*GetLgtFunc)(const TxfmParam *txfm_param, int is_col,
-                           const tran_high_t *lgtmtx[], int ntx);
-
-static GetLgtFunc get_lgt_func[4] = { get_lgt4_from_pred, get_lgt8_from_pred,
-                                      get_lgt16up_from_pred,
-                                      get_lgt16up_from_pred };
-
-// this inline function corresponds to the up scaling before the first
-// transform in the av1_fht* functions
-static INLINE tran_low_t fwd_upscale_wrt_txsize(const tran_high_t val,
-                                                const TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_4X4: return (tran_low_t)val << 4;
-    case TX_8X8:
-    case TX_4X16:
-    case TX_16X4:
-    case TX_8X32:
-    case TX_32X8: return (tran_low_t)val << 2;
-    case TX_4X8:
-    case TX_8X4:
-    case TX_8X16:
-    case TX_16X8: return (tran_low_t)fdct_round_shift(val * 4 * Sqrt2);
-    default: assert(0); break;
-  }
-  return 0;
-}
-
-// This inline function corresponds to the bit shift after the second
-// transform in the av1_fht* functions
-static INLINE tran_low_t fwd_downscale_wrt_txsize(const tran_low_t val,
-                                                  const TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_4X4: return (val + 1) >> 2;
-    case TX_4X8:
-    case TX_8X4:
-    case TX_8X8:
-    case TX_4X16:
-    case TX_16X4: return (val + (val < 0)) >> 1;
-    case TX_8X16:
-    case TX_16X8: return val;
-    case TX_8X32:
-    case TX_32X8: return ROUND_POWER_OF_TWO_SIGNED(val, 2);
-    default: assert(0); break;
-  }
-  return 0;
-}
-
-void flgt2d_from_pred_c(const int16_t *input, tran_low_t *output, int stride,
-                        TxfmParam *txfm_param) {
-  const TX_SIZE tx_size = txfm_param->tx_size;
-  const int w = tx_size_wide[tx_size];
-  const int h = tx_size_high[tx_size];
-  const int wlog2 = tx_size_wide_log2[tx_size];
-  const int hlog2 = tx_size_high_log2[tx_size];
-  assert(w <= 8 || h <= 8);
-
-  int i, j;
-  tran_low_t out[256];  // max size: 8x32 and 32x8
-  tran_low_t temp_in[32], temp_out[32];
-  const tran_high_t *lgtmtx_col[1];
-  const tran_high_t *lgtmtx_row[1];
-  get_lgt_func[hlog2 - 2](txfm_param, 1, lgtmtx_col, w);
-  get_lgt_func[wlog2 - 2](txfm_param, 0, lgtmtx_row, h);
-
-  // For forward transforms, to be consistent with av1_fht functions, we apply
-  // short transform first and long transform second.
-  if (w < h) {
-    // Row transforms
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j)
-        temp_in[j] = fwd_upscale_wrt_txsize(input[i * stride + j], tx_size);
-      flgt_func[wlog2 - 2](temp_in, temp_out, lgtmtx_row[0]);
-      // right shift of 2 bits here in fht8x16 and fht16x8
-      for (j = 0; j < w; ++j)
-        out[j * h + i] = (tx_size == TX_16X8 || tx_size == TX_8X16)
-                             ? ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2)
-                             : temp_out[j];
-    }
-    // Column transforms
-    for (i = 0; i < w; ++i) {
-      for (j = 0; j < h; ++j) temp_in[j] = out[j + i * h];
-      flgt_func[hlog2 - 2](temp_in, temp_out, lgtmtx_col[0]);
-      for (j = 0; j < h; ++j)
-        output[j * w + i] = fwd_downscale_wrt_txsize(temp_out[j], tx_size);
-    }
-  } else {
-    // Column transforms
-    for (i = 0; i < w; ++i) {
-      for (j = 0; j < h; ++j)
-        temp_in[j] = fwd_upscale_wrt_txsize(input[j * stride + i], tx_size);
-      flgt_func[hlog2 - 2](temp_in, temp_out, lgtmtx_col[0]);
-      // fht8x16 and fht16x8 have right shift of 2 bits here
-      for (j = 0; j < h; ++j)
-        out[j * w + i] = (tx_size == TX_16X8 || tx_size == TX_8X16)
-                             ? ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2)
-                             : temp_out[j];
-    }
-    // Row transforms
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) temp_in[j] = out[j + i * w];
-      flgt_func[wlog2 - 2](temp_in, temp_out, lgtmtx_row[0]);
-      for (j = 0; j < w; ++j)
-        output[j + i * w] = fwd_downscale_wrt_txsize(temp_out[j], tx_size);
-    }
-  }
-}
-#endif  // CONFIG_LGT_FROM_PRED
-
-#if CONFIG_EXT_TX
-// TODO(sarahparker) these functions will be removed once the highbitdepth
-// codepath works properly for rectangular transforms. They have almost
-// identical versions in av1_fwd_txfm1d.c, but those are currently only
-// being used for square transforms.
-static void fidtx4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 4; ++i) {
-    output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2);
-  }
-}
-
-static void fidtx8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 8; ++i) {
-    output[i] = input[i] * 2;
-  }
-}
-
-static void fidtx16(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    output[i] = (tran_low_t)fdct_round_shift(input[i] * 2 * Sqrt2);
-  }
-}
-
-static void fidtx32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 32; ++i) {
-    output[i] = input[i] * 4;
-  }
-}
-
-static void copy_block(const int16_t *src, int src_stride, int l, int w,
-                       int16_t *dest, int dest_stride) {
-  int i;
-  for (i = 0; i < l; ++i) {
-    memcpy(dest + dest_stride * i, src + src_stride * i, w * sizeof(int16_t));
-  }
-}
-
-static void fliplr(int16_t *dest, int stride, int l, int w) {
-  int i, j;
-  for (i = 0; i < l; ++i) {
-    for (j = 0; j < w / 2; ++j) {
-      const int16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[i * stride + w - 1 - j];
-      dest[i * stride + w - 1 - j] = tmp;
-    }
-  }
-}
-
-static void flipud(int16_t *dest, int stride, int l, int w) {
-  int i, j;
-  for (j = 0; j < w; ++j) {
-    for (i = 0; i < l / 2; ++i) {
-      const int16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
-      dest[(l - 1 - i) * stride + j] = tmp;
-    }
-  }
-}
-
-static void fliplrud(int16_t *dest, int stride, int l, int w) {
-  int i, j;
-  for (i = 0; i < l / 2; ++i) {
-    for (j = 0; j < w; ++j) {
-      const int16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + w - 1 - j];
-      dest[(l - 1 - i) * stride + w - 1 - j] = tmp;
-    }
-  }
-}
-
-static void copy_fliplr(const int16_t *src, int src_stride, int l, int w,
-                        int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, w, dest, dest_stride);
-  fliplr(dest, dest_stride, l, w);
-}
-
-static void copy_flipud(const int16_t *src, int src_stride, int l, int w,
-                        int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, w, dest, dest_stride);
-  flipud(dest, dest_stride, l, w);
-}
-
-static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w,
-                          int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, w, dest, dest_stride);
-  fliplrud(dest, dest_stride, l, w);
-}
-
-static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w,
-                             int16_t *buff, TX_TYPE tx_type) {
-  switch (tx_type) {
-#if CONFIG_MRC_TX
-    case MRC_DCT:
-#endif  // CONFIG_MRC_TX
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-    case IDTX:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST: break;
-    case FLIPADST_DCT:
-    case FLIPADST_ADST:
-    case V_FLIPADST:
-      copy_flipud(*src, *src_stride, l, w, buff, w);
-      *src = buff;
-      *src_stride = w;
-      break;
-    case DCT_FLIPADST:
-    case ADST_FLIPADST:
-    case H_FLIPADST:
-      copy_fliplr(*src, *src_stride, l, w, buff, w);
-      *src = buff;
-      *src_stride = w;
-      break;
-    case FLIPADST_FLIPADST:
-      copy_fliplrud(*src, *src_stride, l, w, buff, w);
-      *src = buff;
-      *src_stride = w;
-      break;
-    default: assert(0); break;
-  }
-}
-#endif  // CONFIG_EXT_TX
-
-void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
-                  TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-#if !CONFIG_DAALA_DCT4
-  if (tx_type == DCT_DCT) {
-    aom_fdct4x4_c(input, output, stride);
-    return;
-  }
-#endif
-  {
-    static const transform_2d FHT[] = {
-#if CONFIG_DAALA_DCT4
-      { daala_fdct4, daala_fdct4 },  // DCT_DCT
-      { daala_fdst4, daala_fdct4 },  // ADST_DCT
-      { daala_fdct4, daala_fdst4 },  // DCT_ADST
-      { daala_fdst4, daala_fdst4 },  // ADST_ADST
-#if CONFIG_EXT_TX
-      { daala_fdst4, daala_fdct4 },  // FLIPADST_DCT
-      { daala_fdct4, daala_fdst4 },  // DCT_FLIPADST
-      { daala_fdst4, daala_fdst4 },  // FLIPADST_FLIPADST
-      { daala_fdst4, daala_fdst4 },  // ADST_FLIPADST
-      { daala_fdst4, daala_fdst4 },  // FLIPADST_ADST
-      { daala_idtx4, daala_idtx4 },  // IDTX
-      { daala_fdct4, daala_idtx4 },  // V_DCT
-      { daala_idtx4, daala_fdct4 },  // H_DCT
-      { daala_fdst4, daala_idtx4 },  // V_ADST
-      { daala_idtx4, daala_fdst4 },  // H_ADST
-      { daala_fdst4, daala_idtx4 },  // V_FLIPADST
-      { daala_idtx4, daala_fdst4 },  // H_FLIPADST
-#endif
-#else
-      { fdct4, fdct4 },    // DCT_DCT
-      { fadst4, fdct4 },   // ADST_DCT
-      { fdct4, fadst4 },   // DCT_ADST
-      { fadst4, fadst4 },  // ADST_ADST
-#if CONFIG_EXT_TX
-      { fadst4, fdct4 },   // FLIPADST_DCT
-      { fdct4, fadst4 },   // DCT_FLIPADST
-      { fadst4, fadst4 },  // FLIPADST_FLIPADST
-      { fadst4, fadst4 },  // ADST_FLIPADST
-      { fadst4, fadst4 },  // FLIPADST_ADST
-      { fidtx4, fidtx4 },  // IDTX
-      { fdct4, fidtx4 },   // V_DCT
-      { fidtx4, fdct4 },   // H_DCT
-      { fadst4, fidtx4 },  // V_ADST
-      { fidtx4, fadst4 },  // H_ADST
-      { fadst4, fidtx4 },  // V_FLIPADST
-      { fidtx4, fadst4 },  // H_FLIPADST
-#endif
-#endif
-    };
-    const transform_2d ht = FHT[tx_type];
-    tran_low_t out[4 * 4];
-    int i, j;
-    tran_low_t temp_in[4], temp_out[4];
-
-#if CONFIG_EXT_TX
-    int16_t flipped_input[4 * 4];
-    maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-    // Choose LGT adaptive to the prediction. We may apply different LGTs for
-    // different rows/columns, indicated by the pointers to 2D arrays
-    const tran_high_t *lgtmtx_col[1];
-    const tran_high_t *lgtmtx_row[1];
-    int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
-    int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
-#endif
-
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      /* A C99-safe upshift by 4 for both Daala and VPx TX. */
-      for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
-#if !CONFIG_DAALA_DCT4
-      if (i == 0 && temp_in[0]) temp_in[0] += 1;
-#endif
-#if CONFIG_LGT
-      if (use_lgt_col)
-        flgt4(temp_in, temp_out, lgtmtx_col[0]);
-      else
-#endif
-        ht.cols(temp_in, temp_out);
-      for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
-#if CONFIG_LGT
-      if (use_lgt_row)
-        flgt4(temp_in, temp_out, lgtmtx_row[0]);
-      else
-#endif
-        ht.rows(temp_in, temp_out);
-#if CONFIG_DAALA_DCT4
-      /* Daala TX has orthonormal scaling; shift down by only 1 to achieve
-         the usual VPx coefficient left-shift of 3. */
-      for (j = 0; j < 4; ++j) output[j + i * 4] = temp_out[j] >> 1;
-#else
-      for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
-#endif
-    }
-  }
-}
-
-void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
-                  TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct8, fdct4 },    // DCT_DCT
-    { fadst8, fdct4 },   // ADST_DCT
-    { fdct8, fadst4 },   // DCT_ADST
-    { fadst8, fadst4 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst8, fdct4 },   // FLIPADST_DCT
-    { fdct8, fadst4 },   // DCT_FLIPADST
-    { fadst8, fadst4 },  // FLIPADST_FLIPADST
-    { fadst8, fadst4 },  // ADST_FLIPADST
-    { fadst8, fadst4 },  // FLIPADST_ADST
-    { fidtx8, fidtx4 },  // IDTX
-    { fdct8, fidtx4 },   // V_DCT
-    { fidtx8, fdct4 },   // H_DCT
-    { fadst8, fidtx4 },  // V_ADST
-    { fidtx8, fadst4 },  // H_ADST
-    { fadst8, fidtx4 },  // V_FLIPADST
-    { fidtx8, fadst4 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 4;
-  const int n2 = 8;
-  tran_low_t out[8 * 4];
-  tran_low_t temp_in[8], temp_out[8];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[8 * 4];
-  maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
-  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // Rows
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-#if CONFIG_LGT
-    if (use_lgt_row)
-      flgt4(temp_in, temp_out, lgtmtx_row[0]);
-    else
-#endif
-      ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
-  }
-
-  // Columns
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-#if CONFIG_LGT
-    if (use_lgt_col)
-      flgt8(temp_in, temp_out, lgtmtx_col[0]);
-    else
-#endif
-      ht.cols(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
-      output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
-                  TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct4, fdct8 },    // DCT_DCT
-    { fadst4, fdct8 },   // ADST_DCT
-    { fdct4, fadst8 },   // DCT_ADST
-    { fadst4, fadst8 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst4, fdct8 },   // FLIPADST_DCT
-    { fdct4, fadst8 },   // DCT_FLIPADST
-    { fadst4, fadst8 },  // FLIPADST_FLIPADST
-    { fadst4, fadst8 },  // ADST_FLIPADST
-    { fadst4, fadst8 },  // FLIPADST_ADST
-    { fidtx4, fidtx8 },  // IDTX
-    { fdct4, fidtx8 },   // V_DCT
-    { fidtx4, fdct8 },   // H_DCT
-    { fadst4, fidtx8 },  // V_ADST
-    { fidtx4, fadst8 },  // H_ADST
-    { fadst4, fidtx8 },  // V_FLIPADST
-    { fidtx4, fadst8 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 4;
-  const int n2 = 8;
-  tran_low_t out[8 * 4];
-  tran_low_t temp_in[8], temp_out[8];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[8 * 4];
-  maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
-  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // Columns
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-#if CONFIG_LGT
-    if (use_lgt_col)
-      flgt4(temp_in, temp_out, lgtmtx_col[0]);
-    else
-#endif
-      ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
-  }
-
-  // Rows
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-#if CONFIG_LGT
-    if (use_lgt_row)
-      flgt8(temp_in, temp_out, lgtmtx_row[0]);
-    else
-#endif
-      ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
-      output[j + i * n2] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct16, fdct4 },    // DCT_DCT
-    { fadst16, fdct4 },   // ADST_DCT
-    { fdct16, fadst4 },   // DCT_ADST
-    { fadst16, fadst4 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst16, fdct4 },   // FLIPADST_DCT
-    { fdct16, fadst4 },   // DCT_FLIPADST
-    { fadst16, fadst4 },  // FLIPADST_FLIPADST
-    { fadst16, fadst4 },  // ADST_FLIPADST
-    { fadst16, fadst4 },  // FLIPADST_ADST
-    { fidtx16, fidtx4 },  // IDTX
-    { fdct16, fidtx4 },   // V_DCT
-    { fidtx16, fdct4 },   // H_DCT
-    { fadst16, fidtx4 },  // V_ADST
-    { fidtx16, fadst4 },  // H_ADST
-    { fadst16, fidtx4 },  // V_FLIPADST
-    { fidtx16, fadst4 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 4;
-  const int n4 = 16;
-  tran_low_t out[16 * 4];
-  tran_low_t temp_in[16], temp_out[16];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[16 * 4];
-  maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // Rows
-  for (i = 0; i < n4; ++i) {
-    for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
-#if CONFIG_LGT
-    if (use_lgt_row)
-      flgt4(temp_in, temp_out, lgtmtx_row[0]);
-    else
-#endif
-      ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
-  }
-
-  // Columns
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n4; ++j)
-      output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct4, fdct16 },    // DCT_DCT
-    { fadst4, fdct16 },   // ADST_DCT
-    { fdct4, fadst16 },   // DCT_ADST
-    { fadst4, fadst16 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst4, fdct16 },   // FLIPADST_DCT
-    { fdct4, fadst16 },   // DCT_FLIPADST
-    { fadst4, fadst16 },  // FLIPADST_FLIPADST
-    { fadst4, fadst16 },  // ADST_FLIPADST
-    { fadst4, fadst16 },  // FLIPADST_ADST
-    { fidtx4, fidtx16 },  // IDTX
-    { fdct4, fidtx16 },   // V_DCT
-    { fidtx4, fdct16 },   // H_DCT
-    { fadst4, fidtx16 },  // V_ADST
-    { fidtx4, fadst16 },  // H_ADST
-    { fadst4, fidtx16 },  // V_FLIPADST
-    { fidtx4, fadst16 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 4;
-  const int n4 = 16;
-  tran_low_t out[16 * 4];
-  tran_low_t temp_in[16], temp_out[16];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[16 * 4];
-  maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
-#endif
-
-  // Columns
-  for (i = 0; i < n4; ++i) {
-    for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
-#if CONFIG_LGT
-    if (use_lgt_col)
-      flgt4(temp_in, temp_out, lgtmtx_col[0]);
-    else
-#endif
-      ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
-  }
-
-  // Rows
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n4; ++j)
-      output[j + i * n4] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct16, fdct8 },    // DCT_DCT
-    { fadst16, fdct8 },   // ADST_DCT
-    { fdct16, fadst8 },   // DCT_ADST
-    { fadst16, fadst8 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst16, fdct8 },   // FLIPADST_DCT
-    { fdct16, fadst8 },   // DCT_FLIPADST
-    { fadst16, fadst8 },  // FLIPADST_FLIPADST
-    { fadst16, fadst8 },  // ADST_FLIPADST
-    { fadst16, fadst8 },  // FLIPADST_ADST
-    { fidtx16, fidtx8 },  // IDTX
-    { fdct16, fidtx8 },   // V_DCT
-    { fidtx16, fdct8 },   // H_DCT
-    { fadst16, fidtx8 },  // V_ADST
-    { fidtx16, fadst8 },  // H_ADST
-    { fadst16, fidtx8 },  // V_FLIPADST
-    { fidtx16, fadst8 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 8;
-  const int n2 = 16;
-  tran_low_t out[16 * 8];
-  tran_low_t temp_in[16], temp_out[16];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[16 * 8];
-  maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // Rows
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-#if CONFIG_LGT
-    if (use_lgt_row)
-      flgt8(temp_in, temp_out, lgtmtx_row[0]);
-    else
-#endif
-      ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-
-  // Columns
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct8, fdct16 },    // DCT_DCT
-    { fadst8, fdct16 },   // ADST_DCT
-    { fdct8, fadst16 },   // DCT_ADST
-    { fadst8, fadst16 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst8, fdct16 },   // FLIPADST_DCT
-    { fdct8, fadst16 },   // DCT_FLIPADST
-    { fadst8, fadst16 },  // FLIPADST_FLIPADST
-    { fadst8, fadst16 },  // ADST_FLIPADST
-    { fadst8, fadst16 },  // FLIPADST_ADST
-    { fidtx8, fidtx16 },  // IDTX
-    { fdct8, fidtx16 },   // V_DCT
-    { fidtx8, fdct16 },   // H_DCT
-    { fadst8, fidtx16 },  // V_ADST
-    { fidtx8, fadst16 },  // H_ADST
-    { fadst8, fidtx16 },  // V_FLIPADST
-    { fidtx8, fadst16 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 8;
-  const int n2 = 16;
-  tran_low_t out[16 * 8];
-  tran_low_t temp_in[16], temp_out[16];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[16 * 8];
-  maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
-#endif
-
-  // Columns
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-#if CONFIG_LGT
-    if (use_lgt_col)
-      flgt8(temp_in, temp_out, lgtmtx_col[0]);
-    else
-#endif
-      ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-
-  // Rows
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct32, fdct8 },         // DCT_DCT
-    { fhalfright32, fdct8 },   // ADST_DCT
-    { fdct32, fadst8 },        // DCT_ADST
-    { fhalfright32, fadst8 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fhalfright32, fdct8 },   // FLIPADST_DCT
-    { fdct32, fadst8 },        // DCT_FLIPADST
-    { fhalfright32, fadst8 },  // FLIPADST_FLIPADST
-    { fhalfright32, fadst8 },  // ADST_FLIPADST
-    { fhalfright32, fadst8 },  // FLIPADST_ADST
-    { fidtx32, fidtx8 },       // IDTX
-    { fdct32, fidtx8 },        // V_DCT
-    { fidtx32, fdct8 },        // H_DCT
-    { fhalfright32, fidtx8 },  // V_ADST
-    { fidtx32, fadst8 },       // H_ADST
-    { fhalfright32, fidtx8 },  // V_FLIPADST
-    { fidtx32, fadst8 },       // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 8;
-  const int n4 = 32;
-  tran_low_t out[32 * 8];
-  tran_low_t temp_in[32], temp_out[32];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 8];
-  maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[1];
-  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-  // Rows
-  for (i = 0; i < n4; ++i) {
-    for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
-#if CONFIG_LGT
-    if (use_lgt_row)
-      flgt8(temp_in, temp_out, lgtmtx_row[0]);
-    else
-#endif
-      ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
-  }
-
-  // Columns
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n4; ++j)
-      output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
-                   TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct8, fdct32 },         // DCT_DCT
-    { fadst8, fdct32 },        // ADST_DCT
-    { fdct8, fhalfright32 },   // DCT_ADST
-    { fadst8, fhalfright32 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst8, fdct32 },        // FLIPADST_DCT
-    { fdct8, fhalfright32 },   // DCT_FLIPADST
-    { fadst8, fhalfright32 },  // FLIPADST_FLIPADST
-    { fadst8, fhalfright32 },  // ADST_FLIPADST
-    { fadst8, fhalfright32 },  // FLIPADST_ADST
-    { fidtx8, fidtx32 },       // IDTX
-    { fdct8, fidtx32 },        // V_DCT
-    { fidtx8, fdct32 },        // H_DCT
-    { fadst8, fidtx32 },       // V_ADST
-    { fidtx8, fhalfright32 },  // H_ADST
-    { fadst8, fidtx32 },       // V_FLIPADST
-    { fidtx8, fhalfright32 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 8;
-  const int n4 = 32;
-  tran_low_t out[32 * 8];
-  tran_low_t temp_in[32], temp_out[32];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 8];
-  maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[1];
-  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
-#endif
-
-  // Columns
-  for (i = 0; i < n4; ++i) {
-    for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
-#if CONFIG_LGT
-    if (use_lgt_col)
-      flgt8(temp_in, temp_out, lgtmtx_col[0]);
-    else
-#endif
-      ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j];
-  }
-
-  // Rows
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n4; ++j)
-      output[j + i * n4] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-  // Note: overall scale factor of transform is 8 times unitary
-}
-
-void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct32, fdct16 },         // DCT_DCT
-    { fhalfright32, fdct16 },   // ADST_DCT
-    { fdct32, fadst16 },        // DCT_ADST
-    { fhalfright32, fadst16 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fhalfright32, fdct16 },   // FLIPADST_DCT
-    { fdct32, fadst16 },        // DCT_FLIPADST
-    { fhalfright32, fadst16 },  // FLIPADST_FLIPADST
-    { fhalfright32, fadst16 },  // ADST_FLIPADST
-    { fhalfright32, fadst16 },  // FLIPADST_ADST
-    { fidtx32, fidtx16 },       // IDTX
-    { fdct32, fidtx16 },        // V_DCT
-    { fidtx32, fdct16 },        // H_DCT
-    { fhalfright32, fidtx16 },  // V_ADST
-    { fidtx32, fadst16 },       // H_ADST
-    { fhalfright32, fidtx16 },  // V_FLIPADST
-    { fidtx32, fadst16 },       // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 16;
-  const int n2 = 32;
-  tran_low_t out[32 * 16];
-  tran_low_t temp_in[32], temp_out[32];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 16];
-  maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
-#endif
-
-  // Rows
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-  }
-
-  // Columns
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
-  }
-  // Note: overall scale factor of transform is 4 times unitary
-}
-
-void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct16, fdct32 },         // DCT_DCT
-    { fadst16, fdct32 },        // ADST_DCT
-    { fdct16, fhalfright32 },   // DCT_ADST
-    { fadst16, fhalfright32 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst16, fdct32 },        // FLIPADST_DCT
-    { fdct16, fhalfright32 },   // DCT_FLIPADST
-    { fadst16, fhalfright32 },  // FLIPADST_FLIPADST
-    { fadst16, fhalfright32 },  // ADST_FLIPADST
-    { fadst16, fhalfright32 },  // FLIPADST_ADST
-    { fidtx16, fidtx32 },       // IDTX
-    { fdct16, fidtx32 },        // V_DCT
-    { fidtx16, fdct32 },        // H_DCT
-    { fadst16, fidtx32 },       // V_ADST
-    { fidtx16, fhalfright32 },  // H_ADST
-    { fadst16, fidtx32 },       // V_FLIPADST
-    { fidtx16, fhalfright32 },  // H_FLIPADST
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  const int n = 16;
-  const int n2 = 32;
-  tran_low_t out[32 * 16];
-  tran_low_t temp_in[32], temp_out[32];
-  int i, j;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 16];
-  maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
-#endif
-
-  // Columns
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] =
-          (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-  }
-
-  // Rows
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
-  }
-  // Note: overall scale factor of transform is 4 times unitary
-}
-
-void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
-                  TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-#if !CONFIG_DAALA_DCT8
-  if (tx_type == DCT_DCT) {
-    aom_fdct8x8_c(input, output, stride);
-    return;
-  }
-#endif
-  {
-    static const transform_2d FHT[] = {
-#if CONFIG_DAALA_DCT8
-      { daala_fdct8, daala_fdct8 },  // DCT_DCT
-      { daala_fdst8, daala_fdct8 },  // ADST_DCT
-      { daala_fdct8, daala_fdst8 },  // DCT_ADST
-      { daala_fdst8, daala_fdst8 },  // ADST_ADST
-#if CONFIG_EXT_TX
-      { daala_fdst8, daala_fdct8 },  // FLIPADST_DCT
-      { daala_fdct8, daala_fdst8 },  // DCT_FLIPADST
-      { daala_fdst8, daala_fdst8 },  // FLIPADST_FLIPADST
-      { daala_fdst8, daala_fdst8 },  // ADST_FLIPADST
-      { daala_fdst8, daala_fdst8 },  // FLIPADST_ADST
-      { daala_idtx8, daala_idtx8 },  // IDTX
-      { daala_fdct8, daala_idtx8 },  // V_DCT
-      { daala_idtx8, daala_fdct8 },  // H_DCT
-      { daala_fdst8, daala_idtx8 },  // V_ADST
-      { daala_idtx8, daala_fdst8 },  // H_ADST
-      { daala_fdst8, daala_idtx8 },  // V_FLIPADST
-      { daala_idtx8, daala_fdst8 },  // H_FLIPADST
-#endif
-#else
-      { fdct8, fdct8 },    // DCT_DCT
-      { fadst8, fdct8 },   // ADST_DCT
-      { fdct8, fadst8 },   // DCT_ADST
-      { fadst8, fadst8 },  // ADST_ADST
-#if CONFIG_EXT_TX
-      { fadst8, fdct8 },   // FLIPADST_DCT
-      { fdct8, fadst8 },   // DCT_FLIPADST
-      { fadst8, fadst8 },  // FLIPADST_FLIPADST
-      { fadst8, fadst8 },  // ADST_FLIPADST
-      { fadst8, fadst8 },  // FLIPADST_ADST
-      { fidtx8, fidtx8 },  // IDTX
-      { fdct8, fidtx8 },   // V_DCT
-      { fidtx8, fdct8 },   // H_DCT
-      { fadst8, fidtx8 },  // V_ADST
-      { fidtx8, fadst8 },  // H_ADST
-      { fadst8, fidtx8 },  // V_FLIPADST
-      { fidtx8, fadst8 },  // H_FLIPADST
-#endif
-#endif
-    };
-    const transform_2d ht = FHT[tx_type];
-    tran_low_t out[64];
-    int i, j;
-    tran_low_t temp_in[8], temp_out[8];
-
-#if CONFIG_EXT_TX
-    int16_t flipped_input[8 * 8];
-    maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type);
-#endif
-
-#if CONFIG_LGT
-    const tran_high_t *lgtmtx_col[1];
-    const tran_high_t *lgtmtx_row[1];
-    int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
-    int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
-#endif
-
-    // Columns
-    for (i = 0; i < 8; ++i) {
-#if CONFIG_DAALA_DCT8
-      for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 16;
-#else
-      for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
-#endif
-#if CONFIG_LGT
-      if (use_lgt_col)
-        flgt8(temp_in, temp_out, lgtmtx_col[0]);
-      else
-#endif
-        ht.cols(temp_in, temp_out);
-      for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
-#if CONFIG_LGT
-      if (use_lgt_row)
-        flgt8(temp_in, temp_out, lgtmtx_row[0]);
-      else
-#endif
-        ht.rows(temp_in, temp_out);
-#if CONFIG_DAALA_DCT8
-      for (j = 0; j < 8; ++j)
-        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
-      for (j = 0; j < 8; ++j)
-        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#endif
-    }
-  }
-}
-
-/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
-   pixel. */
-void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
-  int i;
-  tran_high_t a1, b1, c1, d1, e1;
-  const int16_t *ip_pass0 = input;
-  const tran_low_t *ip = NULL;
-  tran_low_t *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip_pass0[0 * stride];
-    b1 = ip_pass0[1 * stride];
-    c1 = ip_pass0[2 * stride];
-    d1 = ip_pass0[3 * stride];
-
-    a1 += b1;
-    d1 = d1 - c1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= c1;
-    d1 += b1;
-    op[0] = (tran_low_t)a1;
-    op[4] = (tran_low_t)c1;
-    op[8] = (tran_low_t)d1;
-    op[12] = (tran_low_t)b1;
-
-    ip_pass0++;
-    op++;
-  }
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0];
-    b1 = ip[1];
-    c1 = ip[2];
-    d1 = ip[3];
-
-    a1 += b1;
-    d1 -= c1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= c1;
-    d1 += b1;
-    op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
-    op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
-    op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
-    op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
-
-    ip += 4;
-    op += 4;
-  }
-}
-
-void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-#if CONFIG_DAALA_DCT16
-    { daala_fdct16, daala_fdct16 },  // DCT_DCT
-    { daala_fdst16, daala_fdct16 },  // ADST_DCT
-    { daala_fdct16, daala_fdst16 },  // DCT_ADST
-    { daala_fdst16, daala_fdst16 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { daala_fdst16, daala_fdct16 },  // FLIPADST_DCT
-    { daala_fdct16, daala_fdst16 },  // DCT_FLIPADST
-    { daala_fdst16, daala_fdst16 },  // FLIPADST_FLIPADST
-    { daala_fdst16, daala_fdst16 },  // ADST_FLIPADST
-    { daala_fdst16, daala_fdst16 },  // FLIPADST_ADST
-    { daala_idtx16, daala_idtx16 },  // IDTX
-    { daala_fdct16, daala_idtx16 },  // V_DCT
-    { daala_idtx16, daala_fdct16 },  // H_DCT
-    { daala_fdst16, daala_idtx16 },  // V_ADST
-    { daala_idtx16, daala_fdst16 },  // H_ADST
-    { daala_fdst16, daala_idtx16 },  // V_FLIPADST
-    { daala_idtx16, daala_fdst16 },  // H_FLIPADST
-#endif
-#else
-    { fdct16, fdct16 },    // DCT_DCT
-    { fadst16, fdct16 },   // ADST_DCT
-    { fdct16, fadst16 },   // DCT_ADST
-    { fadst16, fadst16 },  // ADST_ADST
-#if CONFIG_EXT_TX
-    { fadst16, fdct16 },   // FLIPADST_DCT
-    { fdct16, fadst16 },   // DCT_FLIPADST
-    { fadst16, fadst16 },  // FLIPADST_FLIPADST
-    { fadst16, fadst16 },  // ADST_FLIPADST
-    { fadst16, fadst16 },  // FLIPADST_ADST
-    { fidtx16, fidtx16 },  // IDTX
-    { fdct16, fidtx16 },   // V_DCT
-    { fidtx16, fdct16 },   // H_DCT
-    { fadst16, fidtx16 },  // V_ADST
-    { fidtx16, fadst16 },  // H_ADST
-    { fadst16, fidtx16 },  // V_FLIPADST
-    { fidtx16, fadst16 },  // H_FLIPADST
-#endif
-#endif
-  };
-  const transform_2d ht = FHT[tx_type];
-  tran_low_t out[256];
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-#if CONFIG_EXT_TX
-  int16_t flipped_input[16 * 16];
-  maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type);
-#endif
-
-  // Columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_DCT16
-      temp_in[j] = input[j * stride + i] * 16;
-#else
-      temp_in[j] = input[j * stride + i] * 4;
-#endif
-    }
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_DCT16
-      out[j * 16 + i] = temp_out[j];
-#else
-      out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
-#endif
-    }
-  }
-
-  // Rows
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-#if CONFIG_DAALA_DCT16
-      output[j + i * 16] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-#else
-      output[j + i * 16] = temp_out[j];
-#endif
-    }
-  }
-}
-
-void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  av1_fwht4x4_c(input, output, stride);
-}
-
-void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-#if CONFIG_DAALA_DCT32
-    { daala_fdct32, daala_fdct32 },  // DCT_DCT
-#if CONFIG_EXT_TX
-    { daala_fdst32, daala_fdct32 },  // ADST_DCT
-    { daala_fdct32, daala_fdst32 },  // DCT_ADST
-    { daala_fdst32, daala_fdst32 },  // ADST_ADST
-    { daala_fdst32, daala_fdct32 },  // FLIPADST_DCT
-    { daala_fdct32, daala_fdst32 },  // DCT_FLIPADST
-    { daala_fdst32, daala_fdst32 },  // FLIPADST_FLIPADST
-    { daala_fdst32, daala_fdst32 },  // ADST_FLIPADST
-    { daala_fdst32, daala_fdst32 },  // FLIPADST_ADST
-    { daala_idtx32, daala_idtx32 },  // IDTX
-    { daala_fdct32, daala_idtx32 },  // V_DCT
-    { daala_idtx32, daala_fdct32 },  // H_DCT
-    { daala_fdst32, daala_idtx32 },  // V_ADST
-    { daala_idtx32, daala_fdst32 },  // H_ADST
-    { daala_fdst32, daala_idtx32 },  // V_FLIPADST
-    { daala_idtx32, daala_fdst32 },  // H_FLIPADST
-#endif
-#else
-    { fdct32, fdct32 },              // DCT_DCT
-#if CONFIG_EXT_TX
-    { fhalfright32, fdct32 },        // ADST_DCT
-    { fdct32, fhalfright32 },        // DCT_ADST
-    { fhalfright32, fhalfright32 },  // ADST_ADST
-    { fhalfright32, fdct32 },        // FLIPADST_DCT
-    { fdct32, fhalfright32 },        // DCT_FLIPADST
-    { fhalfright32, fhalfright32 },  // FLIPADST_FLIPADST
-    { fhalfright32, fhalfright32 },  // ADST_FLIPADST
-    { fhalfright32, fhalfright32 },  // FLIPADST_ADST
-    { fidtx32, fidtx32 },            // IDTX
-    { fdct32, fidtx32 },             // V_DCT
-    { fidtx32, fdct32 },             // H_DCT
-    { fhalfright32, fidtx32 },       // V_ADST
-    { fidtx32, fhalfright32 },       // H_ADST
-    { fhalfright32, fidtx32 },       // V_FLIPADST
-    { fidtx32, fhalfright32 },       // H_FLIPADST
-#endif
-#endif
-#if CONFIG_MRC_TX
-    { fdct32, fdct32 },  // MRC_TX
-#endif                   // CONFIG_MRC_TX
-  };
-  const transform_2d ht = FHT[tx_type];
-  tran_low_t out[1024];
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 32];
-  maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
-#endif
-
-#if CONFIG_MRC_TX
-  if (tx_type == MRC_DCT) {
-    int16_t masked_input[32 * 32];
-    get_masked_residual32(&input, &stride, txfm_param->dst, txfm_param->stride,
-                          masked_input, txfm_param);
-  }
-#endif  // CONFIG_MRC_TX
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-#if CONFIG_DAALA_DCT32
-      temp_in[j] = input[j * stride + i] * 16;
-#else
-      temp_in[j] = input[j * stride + i] * 4;
-#endif
-    }
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-#if CONFIG_DAALA_DCT32
-      out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-#else
-      out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
-#endif
-    }
-  }
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      output[j + i * 32] = temp_out[j];
-    }
-  }
-}
-
-#if CONFIG_TX64X64
-#if !CONFIG_DAALA_DCT64
-#if CONFIG_EXT_TX
-static void fidtx64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 64; ++i)
-    output[i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
-}
-
-// For use in lieu of ADST
-static void fhalfright64(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[32];
-  for (i = 0; i < 32; ++i) {
-    output[32 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
-  }
-  // Multiply input by sqrt(2)
-  for (i = 0; i < 32; ++i) {
-    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 32] * Sqrt2);
-  }
-  fdct32(inputhalf, output);
-  // Note overall scaling factor is 2 times unitary
-}
-#endif  // CONFIG_EXT_TX
-
-static void fdct64_col(const tran_low_t *input, tran_low_t *output) {
-  int32_t in[64], out[64];
-  int i;
-  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
-  av1_fdct64_new(in, out, fwd_cos_bit_col_dct_64, fwd_stage_range_col_dct_64);
-  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
-}
-
-static void fdct64_row(const tran_low_t *input, tran_low_t *output) {
-  int32_t in[64], out[64];
-  int i;
-  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
-  av1_fdct64_new(in, out, fwd_cos_bit_row_dct_64, fwd_stage_range_row_dct_64);
-  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
-}
-#endif
-
-void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-#if CONFIG_DAALA_DCT64
-    { daala_fdct64, daala_fdct64 },  // DCT_DCT
-#if CONFIG_EXT_TX
-    { daala_fdst64, daala_fdct64 },  // ADST_DCT
-    { daala_fdct64, daala_fdst64 },  // DCT_ADST
-    { daala_fdst64, daala_fdst64 },  // ADST_ADST
-    { daala_fdst64, daala_fdct64 },  // FLIPADST_DCT
-    { daala_fdct64, daala_fdst64 },  // DCT_FLIPADST
-    { daala_fdst64, daala_fdst64 },  // FLIPADST_FLIPADST
-    { daala_fdst64, daala_fdst64 },  // ADST_FLIPADST
-    { daala_fdst64, daala_fdst64 },  // FLIPADST_ADST
-    { daala_idtx64, daala_idtx64 },  // IDTX
-    { daala_fdct64, daala_idtx64 },  // V_DCT
-    { daala_idtx64, daala_fdct64 },  // H_DCT
-    { daala_fdst64, daala_idtx64 },  // V_ADST
-    { daala_idtx64, daala_fdst64 },  // H_ADST
-    { daala_fdst64, daala_idtx64 },  // V_FLIPADST
-    { daala_idtx64, daala_fdst64 },  // H_FLIPADST
-#endif                               // CONFIG_EXT_TX
-#else
-    { fdct64_col, fdct64_row },      // DCT_DCT
-#if CONFIG_EXT_TX
-    { fhalfright64, fdct64_row },    // ADST_DCT
-    { fdct64_col, fhalfright64 },    // DCT_ADST
-    { fhalfright64, fhalfright64 },  // ADST_ADST
-    { fhalfright64, fdct64_row },    // FLIPADST_DCT
-    { fdct64_col, fhalfright64 },    // DCT_FLIPADST
-    { fhalfright64, fhalfright64 },  // FLIPADST_FLIPADST
-    { fhalfright64, fhalfright64 },  // ADST_FLIPADST
-    { fhalfright64, fhalfright64 },  // FLIPADST_ADST
-    { fidtx64, fidtx64 },            // IDTX
-    { fdct64_col, fidtx64 },         // V_DCT
-    { fidtx64, fdct64_row },         // H_DCT
-    { fhalfright64, fidtx64 },       // V_ADST
-    { fidtx64, fhalfright64 },       // H_ADST
-    { fhalfright64, fidtx64 },       // V_FLIPADST
-    { fidtx64, fhalfright64 },       // H_FLIPADST
-#endif  // CONFIG_EXT_TX
-#endif  // CONFIG_DAALA_DCT64
-  };
-  const transform_2d ht = FHT[tx_type];
-  tran_low_t out[4096];
-  int i, j;
-  tran_low_t temp_in[64], temp_out[64];
-#if CONFIG_EXT_TX
-  int16_t flipped_input[64 * 64];
-  maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type);
-#endif
-
-  // Columns
-  for (i = 0; i < 64; ++i) {
-#if CONFIG_DAALA_DCT64
-    for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i] * 16;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 64; ++j)
-      out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 3;
-
-#else
-    for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 64; ++j)
-      out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-#endif
-  }
-
-  // Rows
-  for (i = 0; i < 64; ++i) {
-    for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 64; ++j)
-#if CONFIG_DAALA_DCT64
-      output[j + i * 64] = temp_out[j];
-#else
-      output[j + i * 64] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-#endif
-  }
-}
-
-void av1_fht64x32_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct32, fdct64_row },  // DCT_DCT
-#if CONFIG_EXT_TX
-    { fhalfright32, fdct64_row },    // ADST_DCT
-    { fdct32, fhalfright64 },        // DCT_ADST
-    { fhalfright32, fhalfright64 },  // ADST_ADST
-    { fhalfright32, fdct64_row },    // FLIPADST_DCT
-    { fdct32, fhalfright64 },        // DCT_FLIPADST
-    { fhalfright32, fhalfright64 },  // FLIPADST_FLIPADST
-    { fhalfright32, fhalfright64 },  // ADST_FLIPADST
-    { fhalfright32, fhalfright64 },  // FLIPADST_ADST
-    { fidtx32, fidtx64 },            // IDTX
-    { fdct32, fidtx64 },             // V_DCT
-    { fidtx32, fdct64_row },         // H_DCT
-    { fhalfright32, fidtx64 },       // V_ADST
-    { fidtx32, fhalfright64 },       // H_ADST
-    { fhalfright32, fidtx64 },       // V_FLIPADST
-    { fidtx32, fhalfright64 },       // H_FLIPADST
-#endif                               // CONFIG_EXT_TX
-  };
-  const transform_2d ht = FHT[tx_type];
-  tran_low_t out[2048];
-  int i, j;
-  tran_low_t temp_in[64], temp_out[64];
-  const int n = 32;
-  const int n2 = 64;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 64];
-  maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
-#endif
-
-  // Columns
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-
-  // Rows
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
-      output[j + i * n2] =
-          (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-}
-
-void av1_fht32x64_c(const int16_t *input, tran_low_t *output, int stride,
-                    TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif  // CONFIG_MRC_TX
-#if CONFIG_DCT_ONLY
-  assert(tx_type == DCT_DCT);
-#endif
-  static const transform_2d FHT[] = {
-    { fdct64_row, fdct32 },  // DCT_DCT
-#if CONFIG_EXT_TX
-    { fhalfright64, fdct32 },        // ADST_DCT
-    { fdct64_row, fhalfright32 },    // DCT_ADST
-    { fhalfright64, fhalfright32 },  // ADST_ADST
-    { fhalfright64, fdct32 },        // FLIPADST_DCT
-    { fdct64_row, fhalfright32 },    // DCT_FLIPADST
-    { fhalfright64, fhalfright32 },  // FLIPADST_FLIPADST
-    { fhalfright64, fhalfright32 },  // ADST_FLIPADST
-    { fhalfright64, fhalfright32 },  // FLIPADST_ADST
-    { fidtx64, fidtx32 },            // IDTX
-    { fdct64_row, fidtx32 },         // V_DCT
-    { fidtx64, fdct32 },             // H_DCT
-    { fhalfright64, fidtx32 },       // V_ADST
-    { fidtx64, fhalfright32 },       // H_ADST
-    { fhalfright64, fidtx32 },       // V_FLIPADST
-    { fidtx64, fhalfright32 },       // H_FLIPADST
-#endif                               // CONFIG_EXT_TX
-  };
-  const transform_2d ht = FHT[tx_type];
-  tran_low_t out[32 * 64];
-  int i, j;
-  tran_low_t temp_in[64], temp_out[64];
-  const int n = 32;
-  const int n2 = 64;
-#if CONFIG_EXT_TX
-  int16_t flipped_input[32 * 64];
-  maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
-#endif
-
-  // Rows
-  for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
-      temp_in[j] = (tran_low_t)fdct_round_shift(input[i * stride + j] * Sqrt2);
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-
-  // Columns
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
-      output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
-  }
-}
-#endif  // CONFIG_TX64X64
-
-#if CONFIG_EXT_TX
-// Forward identity transform.
-void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
-                    int bsx, int bsy, TX_TYPE tx_type) {
-  int r, c;
-  const int pels = bsx * bsy;
-  const int shift = 3 - ((pels > 256) + (pels > 1024));
-  if (tx_type == IDTX) {
-    for (r = 0; r < bsy; ++r) {
-      for (c = 0; c < bsx; ++c) coeff[c] = src_diff[c] * (1 << shift);
-      src_diff += stride;
-      coeff += bsx;
-    }
-  }
-}
-#endif  // CONFIG_EXT_TX
-#endif  // !AV1_DCT_GTEST
diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c
new file mode 100644
index 000000000..0a57ebcfb
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.c
@@ -0,0 +1,144 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/dwt.h"
+
+// Note: block length must be even for this implementation
+static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass,
+                            tran_low_t *highpass) {
+  int n;
+  tran_low_t r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++) * 2;
+    *b++ = *x - ((r + x[1] + 1) >> 1);
+    x++;
+  }
+  *a = (r = *x++) * 2;
+  *b = *x - r;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass,
+                            tran_low_t *highpass) {
+  int n;
+  tran_low_t r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++);
+    *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2;
+    x++;
+  }
+  *a = (r = *x++);
+  *b = (*x - r + 1) >> 1;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
+                                          uint8_t *x, int pitch_x,
+                                          tran_low_t *c, int pitch_c,
+                                          int dwt_scale_bits, int hbd) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  tran_low_t buffer[2 * DWT_MAX_LENGTH];
+
+  if (hbd) {
+    uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j++) {
+        c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
+      }
+    }
+  } else {
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j++) {
+        c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits;
+      }
+    }
+  }
+
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    if ((nh < 2) || (nw < 2)) return;
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
+      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+    }
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j];
+      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i];
+    }
+  }
+}
+
+void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
+                               int hbd) {
+  dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd);
+}
+
+int av1_haar_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
+  int acsad = 0;
+
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) {
+      if (r >= bh / 2 || c >= bw / 2) acsad += abs(output[r * stride + c]);
+    }
+  return acsad;
+}
+
+uint64_t av1_dct_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
+  uint64_t acsad = 0;
+
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) {
+      if (r > 0 || c > 0) acsad += abs(output[r * stride + c]);
+    }
+
+  return acsad;
+}
+
+uint32_t av1_variance(uint8_t *input, int bw, int bh, int stride) {
+  int sum = 0;
+  uint32_t sse = 0;
+
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) {
+      sum += input[r * stride + c];
+      sse += input[r * stride + c] * input[r * stride + c];
+    }
+  return sse - (uint32_t)(((int64_t)sum * sum) / (bw * bh));
+}
+
+int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd) {
+  tran_low_t output[64];
+
+  av1_fdwt8x8_uint8_input_c(input, output, stride, hbd);
+  return av1_haar_ac_sad(output, 8, 8, 8);
+}
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
new file mode 100644
index 000000000..9a86db2f1
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -0,0 +1,9 @@
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+
+#define DWT_MAX_LENGTH 64
+
+void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride);
+void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
+                               int hbd);
+int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd);
diff --git a/third_party/aom/av1/encoder/encint.h b/third_party/aom/av1/encoder/encint.h
deleted file mode 100644
index 30ea8521f..000000000
--- a/third_party/aom/av1/encoder/encint.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-/* clang-format off */
-
-#if !defined(_encint_H)
-# define _encint_H (1)
-
-typedef struct daala_enc_ctx od_enc_ctx;
-typedef struct od_params_ctx od_params_ctx;
-typedef struct od_rollback_buffer od_rollback_buffer;
-
-# include "aom_dsp/entenc.h"
-# include "av1/common/odintrin.h"
-# include "av1/common/pvq_state.h"
-
-struct daala_enc_ctx{
-  /* Stores context-adaptive CDFs for PVQ. */
-  od_state state;
-  /* AOM entropy encoder. */
-  aom_writer w;
-  int use_activity_masking;
-  /* Mode of quantization matrice : FLAT (0) or HVS (1) */
-  int qm;
-  /*Normalized PVQ lambda for use where we've already performed
-     quantization.*/
-  double pvq_norm_lambda;
-  double pvq_norm_lambda_dc;
-};
-
-// from daalaenc.h
-/**The encoder context.*/
-typedef struct daala_enc_ctx daala_enc_ctx;
-
-/** Holds important encoder information so we can roll back decisions */
-struct od_rollback_buffer {
-  od_ec_enc ec;
-  od_adapt_ctx adapt;
-};
-
-void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf);
-void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf);
-
-#endif
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
index f79a678fb..027b80a16 100644
--- a/third_party/aom/av1/encoder/encodeframe.c
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -13,9 +13,9 @@
 #include <math.h>
 #include <stdio.h>
 
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/binary_codes_writer.h"
@@ -23,6 +23,11 @@
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/system_state.h"
 
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
 #include "av1/common/common.h"
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
@@ -36,105 +41,55 @@
 #include "av1/common/seg_common.h"
 #include "av1/common/tile_common.h"
 
+#include "av1/encoder/ab_partition_model_weights.h"
 #include "av1/encoder/aq_complexity.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/aq_variance.h"
-#if CONFIG_SUPERTX
-#include "av1/encoder/cost.h"
-#endif
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
 #include "av1/common/warped_motion.h"
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-#if CONFIG_GLOBAL_MOTION
 #include "av1/encoder/global_motion.h"
-#endif  // CONFIG_GLOBAL_MOTION
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encodemv.h"
-#if CONFIG_LV_MAP
 #include "av1/encoder/encodetxb.h"
-#endif
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/extend.h"
+#include "av1/encoder/ml.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
-#if CONFIG_PVQ
-#include "av1/common/pvq.h"
-#include "av1/encoder/pvq_encoder.h"
-#endif
-#if CONFIG_HIGHBITDEPTH
-#define IF_HBD(...) __VA_ARGS__
-#else
-#define IF_HBD(...)
-#endif  // CONFIG_HIGHBITDEPTH
-
-static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
-                              TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
-                              int mi_col, BLOCK_SIZE bsize, int *rate);
-
-#if CONFIG_SUPERTX
-static int check_intra_b(PICK_MODE_CONTEXT *ctx);
-
-static int check_intra_sb(const AV1_COMP *cpi, const TileInfo *const tile,
-                          int mi_row, int mi_col, BLOCK_SIZE bsize,
-                          PC_TREE *pc_tree);
-static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
-                               int mi_row_ori, int mi_col_ori, int mi_row_pred,
-                               int mi_col_pred, int plane,
-                               BLOCK_SIZE bsize_pred, int b_sub8x8, int block);
-static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
-                            PC_TREE *pc_tree);
-static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
-                               const TileInfo *const tile, int mi_row,
-                               int mi_col, int mi_row_ori, int mi_col_ori,
-                               RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                               BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
-                               int dst_stride[3], PC_TREE *pc_tree);
-static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td,
-                                    const TileInfo *const tile, int mi_row,
-                                    int mi_col, BLOCK_SIZE bsize,
-                                    RUN_TYPE dry_run, PC_TREE *pc_tree);
-static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
-                          const TileInfo *const tile, int mi_row, int mi_col,
-                          BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist,
-                          TX_TYPE *best_tx, PC_TREE *pc_tree);
-#endif  // CONFIG_SUPERTX
+
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                              ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              int *rate);
 
 // This is used as a reference when computing the source variance for the
 //  purposes of activity masking.
 // Eventually this should be replaced by custom no-reference routines,
 //  which will be faster.
 static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-#if CONFIG_EXT_PARTITION
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
-#endif  // CONFIG_EXT_PARTITION
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128
 };
 
-#if CONFIG_HIGHBITDEPTH
 static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = {
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-#if CONFIG_EXT_PARTITION
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
-#endif  // CONFIG_EXT_PARTITION
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128
 };
 
 static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
@@ -146,7 +101,6 @@ static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
   128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
   128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
   128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
-#if CONFIG_EXT_PARTITION
   128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
   128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
   128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
@@ -155,7 +109,6 @@ static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
   128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
   128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
   128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4
-#endif  // CONFIG_EXT_PARTITION
 };
 
 static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
@@ -168,8 +121,6 @@ static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
   128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
   128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
   128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
-  128 * 16,
-#if CONFIG_EXT_PARTITION
   128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
   128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
   128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
@@ -179,10 +130,17 @@ static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
   128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
   128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
   128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
-  128 * 16
-#endif  // CONFIG_EXT_PARTITION
+  128 * 16, 128 * 16
 };
-#endif  // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_FP_MB_STATS
+static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES_ALL] = {
+  1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 1, 1, 1, 2, 2, 4
+};
+static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES_ALL] = {
+  1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 1, 1, 2, 1, 4, 2
+};
+#endif  // CONFIG_FP_MB_STATS
 
 unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
                                            const struct buf_2d *ref,
@@ -193,7 +151,6 @@ unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
-#if CONFIG_HIGHBITDEPTH
 unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
                                                 const struct buf_2d *ref,
                                                 BLOCK_SIZE bs, int bd) {
@@ -218,7 +175,6 @@ unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
   }
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi,
                                                    const struct buf_2d *ref,
@@ -266,24 +222,21 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
                                            MACROBLOCK *const x, int mi_row,
                                            int mi_col, BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
 
   set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
 
-  set_skip_context(xd, mi_row, mi_col);
-#if CONFIG_VAR_TX
-  xd->above_txfm_context =
-      cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2);
-  xd->left_txfm_context = xd->left_txfm_context_buffer +
-                          ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2);
-  xd->max_tx_size = max_txsize_lookup[bsize];
-#endif
+  set_skip_context(xd, mi_row, mi_col, num_planes);
+  xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
   // Set up destination pointers.
   av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
-                       mi_col);
+                       mi_col, 0, num_planes);
 
   // Set up limit values for MV components.
   // Mv beyond the range do not produce new/different prediction block.
@@ -293,18 +246,15 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
   x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND;
   x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND;
 
-  set_plane_n4(xd, mi_width, mi_height);
+  set_plane_n4(xd, mi_width, mi_height, num_planes);
 
   // Set up distance of MB to edge of frame in 1/8th pel units.
   assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
-  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
-#if CONFIG_DEPENDENT_HORZTILES
-                 cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                 cm->mi_rows, cm->mi_cols);
+  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_rows,
+                 cm->mi_cols);
 
   // Set up source buffers.
-  av1_setup_src_planes(x, cpi->source, mi_row, mi_col);
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
 
   // R/D setup.
   x->rdmult = cpi->rd.RDMULT;
@@ -323,292 +273,111 @@ static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
 
   set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
 
-  mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_CFL
-  xd->cfl->mi_row = mi_row;
-  xd->cfl->mi_col = mi_col;
-#endif
+  mbmi = xd->mi[0];
+  xd->cfl.mi_row = mi_row;
+  xd->cfl.mi_col = mi_col;
+
+  mbmi->segment_id = 0;
 
   // Setup segment ID.
   if (seg->enabled) {
-    if (!cpi->vaq_refresh) {
+    if (seg->enabled && !cpi->vaq_refresh) {
       const uint8_t *const map =
           seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-      mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      mbmi->segment_id =
+          map ? get_segment_id(cm, map, bsize, mi_row, mi_col) : 0;
     }
     av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
-  } else {
-    mbmi->segment_id = 0;
   }
-
-#if CONFIG_SUPERTX
-  mbmi->segment_id_supertx = MAX_SEGMENTS;
-#endif  // CONFIG_SUPERTX
-}
-
-#if CONFIG_SUPERTX
-static void set_offsets_supertx(const AV1_COMP *const cpi, ThreadData *td,
-                                const TileInfo *const tile, int mi_row,
-                                int mi_col, BLOCK_SIZE bsize) {
-  MACROBLOCK *const x = &td->mb;
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-#if CONFIG_DEPENDENT_HORZTILES
-  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col, cm->dependent_horz_tiles);
-#else
-  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
-#endif
-
-  // Set up distance of MB to edge of frame in 1/8th pel units.
-  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
-  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
-#if CONFIG_DEPENDENT_HORZTILES
-                 cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                 cm->mi_rows, cm->mi_cols);
 }
 
-static void set_offsets_extend(const AV1_COMP *const cpi, ThreadData *td,
-                               const TileInfo *const tile, int mi_row_pred,
-                               int mi_col_pred, int mi_row_ori, int mi_col_ori,
-                               BLOCK_SIZE bsize_pred) {
-  // Used in supertx
-  // (mi_row_ori, mi_col_ori, bsize_ori): region for mv
-  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
-  MACROBLOCK *const x = &td->mb;
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int mi_width = mi_size_wide[bsize_pred];
-  const int mi_height = mi_size_high[bsize_pred];
-
-#if CONFIG_DEPENDENT_HORZTILES
-  set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori,
-                        cm->dependent_horz_tiles);
-#else
-  set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori);
-#endif
-
-  // Set up limit values for MV components.
-  // Mv beyond the range do not produce new/different prediction block.
-  x->mv_limits.row_min =
-      -(((mi_row_pred + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
-  x->mv_limits.col_min =
-      -(((mi_col_pred + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
-  x->mv_limits.row_max =
-      (cm->mi_rows - mi_row_pred) * MI_SIZE + AOM_INTERP_EXTEND;
-  x->mv_limits.col_max =
-      (cm->mi_cols - mi_col_pred) * MI_SIZE + AOM_INTERP_EXTEND;
-
-// Set up distance of MB to edge of frame in 1/8th pel units.
-#if !CONFIG_CB4X4
-  assert(!(mi_col_pred & (mi_width - mi_size_wide[BLOCK_8X8])) &&
-         !(mi_row_pred & (mi_height - mi_size_high[BLOCK_8X8])));
-#endif
-  set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width,
-#if CONFIG_DEPENDENT_HORZTILES
-                 cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                 cm->mi_rows, cm->mi_cols);
-  xd->up_available = (mi_row_ori > tile->mi_row_start);
-  xd->left_available = (mi_col_ori > tile->mi_col_start);
-
-  // R/D setup.
-  x->rdmult = cpi->rd.RDMULT;
-}
-
-static void set_segment_id_supertx(const AV1_COMP *const cpi,
-                                   MACROBLOCK *const x, const int mi_row,
-                                   const int mi_col, const BLOCK_SIZE bsize) {
-  const AV1_COMMON *cm = &cpi->common;
-  const struct segmentation *seg = &cm->seg;
-  const int miw = AOMMIN(mi_size_wide[bsize], cm->mi_cols - mi_col);
-  const int mih = AOMMIN(mi_size_high[bsize], cm->mi_rows - mi_row);
-  const int mi_offset = mi_row * cm->mi_stride + mi_col;
-  MODE_INFO **const mip = cm->mi_grid_visible + mi_offset;
-  int r, c;
-  int seg_id_supertx = MAX_SEGMENTS;
-
-  if (!seg->enabled) {
-    seg_id_supertx = 0;
-  } else {
-    // Find the minimum segment_id
-    for (r = 0; r < mih; r++)
-      for (c = 0; c < miw; c++)
-        seg_id_supertx =
-            AOMMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id, seg_id_supertx);
-    assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS);
-
-    // Initialize plane quantisers
-    av1_init_plane_quantizers(cpi, x, seg_id_supertx);
-  }
-
-  // Assign the the segment_id back to segment_id_supertx
-  for (r = 0; r < mih; r++)
-    for (c = 0; c < miw; c++)
-      mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx;
-}
-#endif  // CONFIG_SUPERTX
-
-#if CONFIG_DUAL_FILTER
-static void reset_intmv_filter_type(const AV1_COMMON *const cm, MACROBLOCKD *xd,
-                                    MB_MODE_INFO *mbmi) {
+static void reset_intmv_filter_type(MB_MODE_INFO *mbmi) {
   InterpFilter filters[2];
-  InterpFilter default_filter = av1_unswitchable_filter(cm->interp_filter);
 
   for (int dir = 0; dir < 2; ++dir) {
-    filters[dir] = ((!has_subpel_mv_component(xd->mi[0], xd, dir) &&
-                     (mbmi->ref_frame[1] == NONE_FRAME ||
-                      !has_subpel_mv_component(xd->mi[0], xd, dir + 2)))
-                        ? default_filter
-                        : av1_extract_interp_filter(mbmi->interp_filters, dir));
+    filters[dir] = av1_extract_interp_filter(mbmi->interp_filters, dir);
   }
   mbmi->interp_filters = av1_make_interp_filters(filters[0], filters[1]);
 }
 
-static void update_filter_type_count(FRAME_COUNTS *counts,
+static void update_filter_type_count(uint8_t allow_update_cdf,
+                                     FRAME_COUNTS *counts,
                                      const MACROBLOCKD *xd,
                                      const MB_MODE_INFO *mbmi) {
   int dir;
   for (dir = 0; dir < 2; ++dir) {
-    if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
-        (mbmi->ref_frame[1] > INTRA_FRAME &&
-         has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
-      const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-      InterpFilter filter =
-          av1_extract_interp_filter(mbmi->interp_filters, dir);
-      ++counts->switchable_interp[ctx][filter];
+    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+    ++counts->switchable_interp[ctx][filter];
+    if (allow_update_cdf) {
       update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
                  SWITCHABLE_FILTERS);
     }
   }
 }
-#endif
-#if CONFIG_GLOBAL_MOTION
+
 static void update_global_motion_used(PREDICTION_MODE mode, BLOCK_SIZE bsize,
                                       const MB_MODE_INFO *mbmi,
                                       RD_COUNTS *rdc) {
-  if (mode == ZEROMV || mode == ZERO_ZEROMV) {
-    const int num_4x4s =
-        num_4x4_blocks_wide_lookup[bsize] * num_4x4_blocks_high_lookup[bsize];
+  if (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) {
+    const int num_4x4s = mi_size_wide[bsize] * mi_size_high[bsize];
     int ref;
     for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
       rdc->global_motion_used[mbmi->ref_frame[ref]] += num_4x4s;
     }
   }
 }
-#endif  // CONFIG_GLOBAL_MOTION
 
-static void reset_tx_size(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
                           const TX_MODE tx_mode) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   if (xd->lossless[mbmi->segment_id]) {
     mbmi->tx_size = TX_4X4;
   } else if (tx_mode != TX_MODE_SELECT) {
-    mbmi->tx_size =
-        tx_size_from_tx_mode(mbmi->sb_type, tx_mode, is_inter_block(mbmi));
-  }
-}
-
-static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv,
-                                 int8_t rf_type) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-
-  const int bw = xd->n8_w << MI_SIZE_LOG2;
-  const int bh = xd->n8_h << MI_SIZE_LOG2;
-  int ref_mv_idx = mbmi->ref_mv_idx;
-  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  CANDIDATE_MV *const curr_ref_mv_stack = mbmi_ext->ref_mv_stack[rf_type];
-
-  if (has_second_ref(mbmi)) {
-    // Special case: NEAR_NEWMV and NEW_NEARMV modes use 1 + mbmi->ref_mv_idx
-    // (like NEARMV) instead
-    if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) ref_mv_idx += 1;
-
-    if (compound_ref0_mode(mbmi->mode) == NEWMV) {
-      int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].this_mv;
-      clamp_mv_ref(&this_mv.as_mv, bw, bh, xd);
-      mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
-      mbmi->pred_mv[0] = this_mv;
-      mi_pred_mv[0] = this_mv;
-    }
-    if (compound_ref1_mode(mbmi->mode) == NEWMV) {
-      int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].comp_mv;
-      clamp_mv_ref(&this_mv.as_mv, bw, bh, xd);
-      mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
-      mbmi->pred_mv[1] = this_mv;
-      mi_pred_mv[1] = this_mv;
-    }
-#if CONFIG_COMPOUND_SINGLEREF
-  } else if (is_inter_singleref_comp_mode(mbmi->mode)) {
-    // Special case: SR_NEAR_NEWMV uses 1 + mbmi->ref_mv_idx
-    // (like NEARMV) instead
-    if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx += 1;
-
-    if (compound_ref0_mode(mbmi->mode) == NEWMV ||
-        compound_ref1_mode(mbmi->mode) == NEWMV) {
-      int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].this_mv;
-      clamp_mv_ref(&this_mv.as_mv, bw, bh, xd);
-      mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
-      mbmi->pred_mv[0] = this_mv;
-      mi_pred_mv[0] = this_mv;
-    }
-#endif  // CONFIG_COMPOUND_SINGLEREF
+    mbmi->tx_size = tx_size_from_tx_mode(mbmi->sb_type, tx_mode);
   } else {
-    if (mbmi->mode == NEWMV) {
-      int i;
-      for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
-        int_mv this_mv = (i == 0) ? curr_ref_mv_stack[ref_mv_idx].this_mv
-                                  : curr_ref_mv_stack[ref_mv_idx].comp_mv;
-        clamp_mv_ref(&this_mv.as_mv, bw, bh, xd);
-        mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0] = this_mv;
-        mbmi->pred_mv[i] = this_mv;
-        mi_pred_mv[i] = this_mv;
-      }
-    }
+    BLOCK_SIZE bsize = mbmi->sb_type;
+    TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize);
+    mbmi->tx_size = (TX_SIZE)TXSIZEMAX(mbmi->tx_size, min_tx_size);
+  }
+  if (is_inter_block(mbmi)) {
+    memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
   }
+  memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
+  av1_zero(x->blk_skip);
+  x->skip = 0;
 }
 
-static void update_state(const AV1_COMP *const cpi, ThreadData *td,
-                         PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
-                         BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                         ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row,
+                         int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
   int i, x_idx, y;
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   RD_COUNTS *const rdc = &td->rd_counts;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
-  MODE_INFO *mi = &ctx->mic;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  MODE_INFO *mi_addr = xd->mi[0];
+  MB_MODE_INFO *mi = &ctx->mic;
+  MB_MODE_INFO *const mi_addr = xd->mi[0];
   const struct segmentation *const seg = &cm->seg;
-  const int bw = mi_size_wide[mi->mbmi.sb_type];
-  const int bh = mi_size_high[mi->mbmi.sb_type];
+  const int bw = mi_size_wide[mi->sb_type];
+  const int bh = mi_size_high[mi->sb_type];
   const int mis = cm->mi_stride;
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
-  const int unify_bsize = CONFIG_CB4X4;
 
-  int8_t rf_type;
-
-#if !CONFIG_SUPERTX
-  assert(mi->mbmi.sb_type == bsize);
-#endif
+  assert(mi->sb_type == bsize);
 
   *mi_addr = *mi;
   *x->mbmi_ext = ctx->mbmi_ext;
 
-#if CONFIG_DUAL_FILTER
-  reset_intmv_filter_type(cm, xd, mbmi);
-#endif
+  reset_intmv_filter_type(mi_addr);
 
-  rf_type = av1_ref_frame_type(mbmi->ref_frame);
-  if (x->mbmi_ext->ref_mv_count[rf_type] > 1 &&
-      (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) {
-    set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type);
-  }
+  memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+
+  x->skip = ctx->skip;
 
   // If segmentation in use
   if (seg->enabled) {
@@ -616,34 +385,29 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
     if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
       const uint8_t *const map =
           seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-      mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
-      reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode);
+      mi_addr->segment_id =
+          map ? get_segment_id(cm, map, bsize, mi_row, mi_col) : 0;
+      reset_tx_size(x, mi_addr, cm->tx_mode);
     }
     // Else for cyclic refresh mode update the segment map, set the segment id
     // and then update the quantizer.
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-      av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col,
-                                        bsize, ctx->rate, ctx->dist, x->skip);
-      reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode);
+      av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize,
+                                        ctx->rate, ctx->dist, x->skip);
+      reset_tx_size(x, mi_addr, cm->tx_mode);
     }
+    if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
+      mi_addr->uv_mode = UV_DC_PRED;
   }
 
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
+  for (i = 0; i < num_planes; ++i) {
     p[i].coeff = ctx->coeff[i];
     p[i].qcoeff = ctx->qcoeff[i];
     pd[i].dqcoeff = ctx->dqcoeff[i];
-#if CONFIG_PVQ
-    pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
-#endif
     p[i].eobs = ctx->eobs[i];
-#if CONFIG_LV_MAP
     p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
-#endif  // CONFIG_LV_MAP
   }
   for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
-#if CONFIG_MRC_TX
-  xd->mrc_mask = ctx->mrc_mask;
-#endif  // CONFIG_MRC_TX
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
   for (y = 0; y < mi_height; y++)
@@ -653,26 +417,7 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
         xd->mi[x_idx + y * mis] = mi_addr;
       }
 
-#if !CONFIG_EXT_DELTA_Q
-  if (cpi->oxcf.aq_mode > NO_AQ && cpi->oxcf.aq_mode < DELTA_AQ)
-    av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
-#else
-  if (cpi->oxcf.aq_mode)
-    av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
-#endif
-
-  if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8 && !unify_bsize) {
-    mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
-    mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
-  }
-
-  x->skip = ctx->skip;
-
-#if CONFIG_VAR_TX
-  for (i = 0; i < 1; ++i)
-    memcpy(x->blk_skip[i], ctx->blk_skip[i],
-           sizeof(uint8_t) * ctx->num_4x4_blk);
-#endif
+  if (cpi->oxcf.aq_mode) av1_init_plane_quantizers(cpi, x, mi_addr->segment_id);
 
   if (dry_run) return;
 
@@ -687,18 +432,16 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
         THR_H_PRED /*H_PRED*/,
         THR_D45_PRED /*D45_PRED*/,
         THR_D135_PRED /*D135_PRED*/,
-        THR_D117_PRED /*D117_PRED*/,
-        THR_D153_PRED /*D153_PRED*/,
-        THR_D207_PRED /*D207_PRED*/,
-        THR_D63_PRED /*D63_PRED*/,
-        THR_SMOOTH, /*SMOOTH_PRED*/
-#if CONFIG_SMOOTH_HV
+        THR_D113_PRED /*D113_PRED*/,
+        THR_D157_PRED /*D157_PRED*/,
+        THR_D203_PRED /*D203_PRED*/,
+        THR_D67_PRED /*D67_PRED*/,
+        THR_SMOOTH,   /*SMOOTH_PRED*/
         THR_SMOOTH_V, /*SMOOTH_V_PRED*/
         THR_SMOOTH_H, /*SMOOTH_H_PRED*/
-#endif                // CONFIG_SMOOTH_HV
-        THR_TM /*TM_PRED*/,
+        THR_PAETH /*PAETH_PRED*/,
       };
-      ++mode_chosen_counts[kf_mode_index[mbmi->mode]];
+      ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
     } else {
       // Note how often each mode chosen as best
       ++mode_chosen_counts[ctx->best_mode_index];
@@ -706,188 +449,17 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
   }
 #endif
   if (!frame_is_intra_only(cm)) {
-    if (is_inter_block(mbmi)) {
-      av1_update_mv_count(td);
-#if CONFIG_GLOBAL_MOTION
-      if (bsize >= BLOCK_8X8) {
-        // TODO(sarahparker): global motion stats need to be handled per-tile
-        // to be compatible with tile-based threading.
-        update_global_motion_used(mbmi->mode, bsize, mbmi, rdc);
-      } else {
-        const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
-        const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
-        int idx, idy;
-        for (idy = 0; idy < 2; idy += num_4x4_h) {
-          for (idx = 0; idx < 2; idx += num_4x4_w) {
-            const int j = idy * 2 + idx;
-            update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc);
-          }
-        }
-      }
-#endif  // CONFIG_GLOBAL_MOTION
-      if (cm->interp_filter == SWITCHABLE
-#if CONFIG_WARPED_MOTION
-          && mbmi->motion_mode != WARPED_CAUSAL
-#endif  // CONFIG_WARPED_MOTION
-#if CONFIG_GLOBAL_MOTION
-          && !is_nontrans_global_motion(xd)
-#endif  // CONFIG_GLOBAL_MOTION
-              ) {
-#if CONFIG_DUAL_FILTER
-        update_filter_type_count(td->counts, xd, mbmi);
-#else
-        const int switchable_ctx = av1_get_pred_context_switchable_interp(xd);
-        const InterpFilter filter =
-            av1_extract_interp_filter(mbmi->interp_filters, 0);
-        ++td->counts->switchable_interp[switchable_ctx][filter];
-#endif
-      }
-    }
-
-    rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
-    rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
-    rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
-  }
-
-  const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
-  av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
-}
-
-#if CONFIG_SUPERTX
-static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td,
-                                 PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
-                                 BLOCK_SIZE bsize, RUN_TYPE dry_run) {
-  int y, x_idx;
-#if CONFIG_VAR_TX
-  int i;
-#endif
-  const AV1_COMMON *const cm = &cpi->common;
-  RD_COUNTS *const rdc = &td->rd_counts;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *mi = &ctx->mic;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  MODE_INFO *mi_addr = xd->mi[0];
-  const struct segmentation *const seg = &cm->seg;
-  const int mis = cm->mi_stride;
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-  const int unify_bsize = CONFIG_CB4X4;
-  int8_t rf_type;
-
-  *mi_addr = *mi;
-  *x->mbmi_ext = ctx->mbmi_ext;
-  assert(is_inter_block(mbmi));
-  assert(mbmi->tx_size == ctx->mic.mbmi.tx_size);
-
-#if CONFIG_DUAL_FILTER
-  reset_intmv_filter_type(cm, xd, mbmi);
-#endif
-
-  rf_type = av1_ref_frame_type(mbmi->ref_frame);
-  if (x->mbmi_ext->ref_mv_count[rf_type] > 1 &&
-      (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) {
-    set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type);
-  }
-
-  // If segmentation in use
-  if (seg->enabled) {
-    if (cpi->vaq_refresh) {
-      const int energy =
-          bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
-      mi_addr->mbmi.segment_id = av1_vaq_segment_id(energy);
-    } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-      // For cyclic refresh mode, now update the segment map
-      // and set the segment id.
-      av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col,
-                                        bsize, ctx->rate, ctx->dist, 1);
-    } else {
-      // Otherwise just set the segment id based on the current segment map
-      const uint8_t *const map =
-          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-      mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+    if (is_inter_block(mi_addr)) {
+      // TODO(sarahparker): global motion stats need to be handled per-tile
+      // to be compatible with tile-based threading.
+      update_global_motion_used(mi_addr->mode, bsize, mi_addr, rdc);
     }
-    mi_addr->mbmi.segment_id_supertx = MAX_SEGMENTS;
-  }
-  // Restore the coding context of the MB to that that was in place
-  // when the mode was picked for it
-  for (y = 0; y < mi_height; y++)
-    for (x_idx = 0; x_idx < mi_width; x_idx++)
-      if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
-          (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
-        xd->mi[x_idx + y * mis] = mi_addr;
-      }
-
-#if !CONFIG_CB4X4
-  if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
-    mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
-    mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
-  }
-#endif
-
-  x->skip = ctx->skip;
-
-#if CONFIG_VAR_TX
-  for (i = 0; i < 1; ++i)
-    memcpy(x->blk_skip[i], ctx->blk_skip[i],
-           sizeof(uint8_t) * ctx->num_4x4_blk);
-
-  if (!is_inter_block(mbmi) || mbmi->skip)
-    mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
-#endif  // CONFIG_VAR_TX
 
-#if CONFIG_VAR_TX
-  {
-    const TX_SIZE mtx = mbmi->tx_size;
-    const int num_4x4_blocks_wide = tx_size_wide_unit[mtx] >> 1;
-    const int num_4x4_blocks_high = tx_size_high_unit[mtx] >> 1;
-    int idy, idx;
-    mbmi->inter_tx_size[0][0] = mtx;
-    for (idy = 0; idy < num_4x4_blocks_high; ++idy)
-      for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
-        mbmi->inter_tx_size[idy][idx] = mtx;
-  }
-#endif  // CONFIG_VAR_TX
-  // Turn motion variation off for supertx
-  mbmi->motion_mode = SIMPLE_TRANSLATION;
-
-  if (dry_run) return;
-
-  if (!frame_is_intra_only(cm)) {
-    av1_update_mv_count(td);
-
-#if CONFIG_GLOBAL_MOTION
-    if (is_inter_block(mbmi)) {
-      if (bsize >= BLOCK_8X8) {
-        // TODO(sarahparker): global motion stats need to be handled per-tile
-        // to be compatible with tile-based threading.
-        update_global_motion_used(mbmi->mode, bsize, mbmi, rdc);
-      } else {
-        const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
-        const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
-        int idx, idy;
-        for (idy = 0; idy < 2; idy += num_4x4_h) {
-          for (idx = 0; idx < 2; idx += num_4x4_w) {
-            const int j = idy * 2 + idx;
-            update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc);
-          }
-        }
-      }
-    }
-#endif  // CONFIG_GLOBAL_MOTION
-
-    if (cm->interp_filter == SWITCHABLE
-#if CONFIG_GLOBAL_MOTION
-        && !is_nontrans_global_motion(xd)
-#endif  // CONFIG_GLOBAL_MOTION
-            ) {
-#if CONFIG_DUAL_FILTER
-      update_filter_type_count(td->counts, xd, mbmi);
-#else
-      const int pred_ctx = av1_get_pred_context_switchable_interp(xd);
-      ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter];
-#endif
+    if (cm->interp_filter == SWITCHABLE &&
+        mi_addr->motion_mode != WARPED_CAUSAL &&
+        !is_nontrans_global_motion(xd, xd->mi[0])) {
+      update_filter_type_count(tile_data->allow_update_cdf, td->counts, xd,
+                               mi_addr);
     }
 
     rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
@@ -895,572 +467,114 @@ static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td,
     rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
   }
 
-  const int x_mis = AOMMIN(mi_width, cm->mi_cols - mi_col);
-  const int y_mis = AOMMIN(mi_height, cm->mi_rows - mi_row);
+  const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
   av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
 }
 
-static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td,
-                                    const TileInfo *const tile, int mi_row,
-                                    int mi_col, BLOCK_SIZE bsize,
-                                    RUN_TYPE dry_run, PC_TREE *pc_tree) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *const p = x->plane;
-  struct macroblockd_plane *const pd = xd->plane;
-  int hbs = mi_size_wide[bsize] / 2;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-  PARTITION_TYPE partition = pc_tree->partitioning;
-  BLOCK_SIZE subsize = get_subsize(bsize, partition);
-  int i;
-#if CONFIG_EXT_PARTITION_TYPES
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
-#endif
-  PICK_MODE_CONTEXT *pmc = NULL;
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
-
-  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
-    x->mb_energy = av1_block_energy(cpi, x, bsize);
-
-  switch (partition) {
-    case PARTITION_NONE:
-      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
-      update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col, subsize,
-                           dry_run);
-      break;
-    case PARTITION_VERT:
-      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
-      update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col,
-                           subsize, dry_run);
-      if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) {
-        set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
-        update_state_supertx(cpi, td, &pc_tree->vertical[1], mi_row,
-                             mi_col + hbs, subsize, dry_run);
-      }
-      pmc = &pc_tree->vertical_supertx;
-      break;
-    case PARTITION_HORZ:
-      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
-      update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col,
-                           subsize, dry_run);
-      if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) {
-        set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
-        update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs,
-                             mi_col, subsize, dry_run);
-      }
-      pmc = &pc_tree->horizontal_supertx;
-      break;
-    case PARTITION_SPLIT:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
-        update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col,
-                             subsize, dry_run);
-      } else {
-        set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
-        update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, dry_run,
-                                pc_tree->split[0]);
-        set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
-        update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize,
-                                dry_run, pc_tree->split[1]);
-        set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
-        update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize,
-                                dry_run, pc_tree->split[2]);
-        set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize);
-        update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs,
-                                subsize, dry_run, pc_tree->split[3]);
-      }
-      pmc = &pc_tree->split_supertx;
-      break;
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-#error HORZ/VERT_A/B partitions not yet updated in superres code
-#endif
-    case PARTITION_HORZ_A:
-      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
-      update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col,
-                           bsize2, dry_run);
-      set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
-      update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row,
-                           mi_col + hbs, bsize2, dry_run);
-      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
-      update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs,
-                           mi_col, subsize, dry_run);
-      pmc = &pc_tree->horizontala_supertx;
-      break;
-    case PARTITION_HORZ_B:
-      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
-      update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col,
-                           subsize, dry_run);
-      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
-      update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs,
-                           mi_col, bsize2, dry_run);
-      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
-      update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs,
-                           mi_col + hbs, bsize2, dry_run);
-      pmc = &pc_tree->horizontalb_supertx;
-      break;
-    case PARTITION_VERT_A:
-      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
-      update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col,
-                           bsize2, dry_run);
-      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
-      update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs,
-                           mi_col, bsize2, dry_run);
-      set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
-      update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row,
-                           mi_col + hbs, subsize, dry_run);
-      pmc = &pc_tree->verticala_supertx;
-      break;
-    case PARTITION_VERT_B:
-      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
-      update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col,
-                           subsize, dry_run);
-      set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
-      update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row,
-                           mi_col + hbs, bsize2, dry_run);
-      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
-      update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs,
-                           mi_col + hbs, bsize2, dry_run);
-      pmc = &pc_tree->verticalb_supertx;
-      break;
-#endif  // CONFIG_EXT_PARTITION_TYPES
-    default: assert(0);
-  }
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    if (pmc != NULL) {
-      p[i].coeff = pmc->coeff[i];
-      p[i].qcoeff = pmc->qcoeff[i];
-      pd[i].dqcoeff = pmc->dqcoeff[i];
-      p[i].eobs = pmc->eobs[i];
-    } else {
-      // These should never be used
-      p[i].coeff = NULL;
-      p[i].qcoeff = NULL;
-      pd[i].dqcoeff = NULL;
-      p[i].eobs = NULL;
-    }
-  }
-}
-
-static void update_supertx_param(ThreadData *td, PICK_MODE_CONTEXT *ctx,
-                                 int best_tx, TX_SIZE supertx_size) {
-  MACROBLOCK *const x = &td->mb;
-#if CONFIG_VAR_TX
-  int i;
-
-  for (i = 0; i < 1; ++i)
-    memcpy(ctx->blk_skip[i], x->blk_skip[i],
-           sizeof(uint8_t) * ctx->num_4x4_blk);
-  ctx->mic.mbmi.min_tx_size = get_min_tx_size(supertx_size);
-#endif  // CONFIG_VAR_TX
-  ctx->mic.mbmi.tx_size = supertx_size;
-  ctx->skip = x->skip;
-  ctx->mic.mbmi.tx_type = best_tx;
-}
-
-static void update_supertx_param_sb(const AV1_COMP *const cpi, ThreadData *td,
-                                    int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                    int best_tx, TX_SIZE supertx_size,
-                                    PC_TREE *pc_tree) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int hbs = mi_size_wide[bsize] / 2;
-  PARTITION_TYPE partition = pc_tree->partitioning;
-  BLOCK_SIZE subsize = get_subsize(bsize, partition);
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-  int i;
-#endif
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
-
-  switch (partition) {
-    case PARTITION_NONE:
-      update_supertx_param(td, &pc_tree->none, best_tx, supertx_size);
-      break;
-    case PARTITION_VERT:
-      update_supertx_param(td, &pc_tree->vertical[0], best_tx, supertx_size);
-      if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize))
-        update_supertx_param(td, &pc_tree->vertical[1], best_tx, supertx_size);
-      break;
-    case PARTITION_HORZ:
-      update_supertx_param(td, &pc_tree->horizontal[0], best_tx, supertx_size);
-      if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize))
-        update_supertx_param(td, &pc_tree->horizontal[1], best_tx,
-                             supertx_size);
-      break;
-    case PARTITION_SPLIT:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        update_supertx_param(td, pc_tree->leaf_split[0], best_tx, supertx_size);
-      } else {
-        update_supertx_param_sb(cpi, td, mi_row, mi_col, subsize, best_tx,
-                                supertx_size, pc_tree->split[0]);
-        update_supertx_param_sb(cpi, td, mi_row, mi_col + hbs, subsize, best_tx,
-                                supertx_size, pc_tree->split[1]);
-        update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col, subsize, best_tx,
-                                supertx_size, pc_tree->split[2]);
-        update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col + hbs, subsize,
-                                best_tx, supertx_size, pc_tree->split[3]);
-      }
-      break;
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-#error HORZ/VERT_A/B partitions not yet updated in superres code
-#endif
-    case PARTITION_HORZ_A:
-      for (i = 0; i < 3; i++)
-        update_supertx_param(td, &pc_tree->horizontala[i], best_tx,
-                             supertx_size);
-      break;
-    case PARTITION_HORZ_B:
-      for (i = 0; i < 3; i++)
-        update_supertx_param(td, &pc_tree->horizontalb[i], best_tx,
-                             supertx_size);
-      break;
-    case PARTITION_VERT_A:
-      for (i = 0; i < 3; i++)
-        update_supertx_param(td, &pc_tree->verticala[i], best_tx, supertx_size);
-      break;
-    case PARTITION_VERT_B:
-      for (i = 0; i < 3; i++)
-        update_supertx_param(td, &pc_tree->verticalb[i], best_tx, supertx_size);
-      break;
-#endif  // CONFIG_EXT_PARTITION_TYPES
-    default: assert(0);
-  }
-}
-#endif  // CONFIG_SUPERTX
-
-#if CONFIG_MOTION_VAR && NC_MODE_INFO
-static void set_mode_info_b(const AV1_COMP *const cpi,
-                            const TileInfo *const tile, ThreadData *td,
-                            int mi_row, int mi_col, BLOCK_SIZE bsize,
-                            PICK_MODE_CONTEXT *ctx) {
-  MACROBLOCK *const x = &td->mb;
-  set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
-  update_state(cpi, td, ctx, mi_row, mi_col, bsize, 1);
-}
-
-static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td,
-                             const TileInfo *const tile, TOKENEXTRA **tp,
-                             int mi_row, int mi_col, BLOCK_SIZE bsize,
-                             PC_TREE *pc_tree) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int hbs = mi_size_wide[bsize] / 2;
-  const PARTITION_TYPE partition = pc_tree->partitioning;
-  BLOCK_SIZE subsize = get_subsize(bsize, partition);
-#if CONFIG_EXT_PARTITION_TYPES
-  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
-  const int quarter_step = mi_size_wide[bsize] / 4;
-#endif
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-  assert(bsize >= BLOCK_8X8);
-#endif
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
-
-  switch (partition) {
-    case PARTITION_NONE:
-      set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, &pc_tree->none);
-      break;
-    case PARTITION_VERT:
-      set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
-                      &pc_tree->vertical[0]);
-      if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) {
-        set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize,
-                        &pc_tree->vertical[1]);
-      }
-      break;
-    case PARTITION_HORZ:
-      set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
-                      &pc_tree->horizontal[0]);
-      if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) {
-        set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize,
-                        &pc_tree->horizontal[1]);
-      }
-      break;
-    case PARTITION_SPLIT:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
-                        pc_tree->leaf_split[0]);
-      } else {
-        set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col, subsize,
-                         pc_tree->split[0]);
-        set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, subsize,
-                         pc_tree->split[1]);
-        set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, subsize,
-                         pc_tree->split[2]);
-        set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, subsize,
-                         pc_tree->split[3]);
-      }
-      break;
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-#error NC_MODE_INFO+MOTION_VAR not yet supported for new HORZ/VERT_AB partitions
-#endif
-    case PARTITION_HORZ_A:
-      set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2,
-                      &pc_tree->horizontala[0]);
-      set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2,
-                      &pc_tree->horizontala[1]);
-      set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize,
-                      &pc_tree->horizontala[2]);
-      break;
-    case PARTITION_HORZ_B:
-      set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
-                      &pc_tree->horizontalb[0]);
-      set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2,
-                      &pc_tree->horizontalb[1]);
-      set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2,
-                      &pc_tree->horizontalb[2]);
-      break;
-    case PARTITION_VERT_A:
-      set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2,
-                      &pc_tree->verticala[0]);
-      set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2,
-                      &pc_tree->verticala[1]);
-      set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize,
-                      &pc_tree->verticala[2]);
-      break;
-    case PARTITION_VERT_B:
-      set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize,
-                      &pc_tree->verticalb[0]);
-      set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2,
-                      &pc_tree->verticalb[1]);
-      set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2,
-                      &pc_tree->verticalb[2]);
-      break;
-    case PARTITION_HORZ_4:
-      for (int i = 0; i < 4; ++i) {
-        int this_mi_row = mi_row + i * quarter_step;
-        if (i > 0 && this_mi_row >= cm->mi_rows) break;
-
-        set_mode_info_b(cpi, tile, td, this_mi_row, mi_col, subsize,
-                        &pc_tree->horizontal4[i]);
-      }
-      break;
-    case PARTITION_VERT_4:
-      for (int i = 0; i < 4; ++i) {
-        int this_mi_col = mi_col + i * quarter_step;
-        if (i > 0 && this_mi_col >= cm->mi_cols) break;
-
-        set_mode_info_b(cpi, tile, td, mi_row, this_mi_col, subsize,
-                        &pc_tree->vertical4[i]);
-      }
-      break;
-#endif  // CONFIG_EXT_PARTITION_TYPES
-    default: assert(0 && "Invalid partition type."); break;
-  }
-}
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-static void av1_get_ncobmc_mode_rd(const AV1_COMP *const cpi,
-                                   MACROBLOCK *const x, MACROBLOCKD *const xd,
-                                   int bsize, const int mi_row,
-                                   const int mi_col, NCOBMC_MODE *mode) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-
-  assert(bsize >= BLOCK_8X8);
-
-  reset_xd_boundary(xd, mi_row, mi_height, mi_col, mi_width, cm->mi_rows,
-                    cm->mi_cols);
-
-  // set up source buffers before calling the mode searching function
-  av1_setup_src_planes(x, cpi->source, mi_row, mi_col);
-
-  *mode = get_ncobmc_mode(cpi, x, xd, mi_row, mi_col, bsize);
-}
-static void get_ncobmc_intrpl_pred(const AV1_COMP *const cpi, ThreadData *td,
-                                   int mi_row, int mi_col, BLOCK_SIZE bsize) {
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-  const int hbs = AOMMAX(mi_size_wide[bsize] / 2, mi_size_high[bsize] / 2);
-  const BLOCK_SIZE sqr_blk = bsize_2_sqr_bsize[bsize];
-
-  if (mi_width > mi_height) {
-    // horizontal partition
-    av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col,
-                           &mbmi->ncobmc_mode[0]);
-    xd->mi += hbs;
-    av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col + hbs,
-                           &mbmi->ncobmc_mode[1]);
-  } else if (mi_height > mi_width) {
-    // vertical partition
-    av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col,
-                           &mbmi->ncobmc_mode[0]);
-    xd->mi += hbs * xd->mi_stride;
-    av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row + hbs, mi_col,
-                           &mbmi->ncobmc_mode[1]);
-  } else {
-    av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col,
-                           &mbmi->ncobmc_mode[0]);
-  }
-  // restore the info
-  av1_setup_src_planes(x, cpi->source, mi_row, mi_col);
-  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
-}
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
-
 void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
-                          int mi_row, int mi_col) {
-  uint8_t *const buffers[3] = { src->y_buffer, src->u_buffer, src->v_buffer };
-  const int widths[3] = { src->y_crop_width, src->uv_crop_width,
-                          src->uv_crop_width };
-  const int heights[3] = { src->y_crop_height, src->uv_crop_height,
-                           src->uv_crop_height };
-  const int strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
-  int i;
-
+                          int mi_row, int mi_col, const int num_planes) {
   // Set current frame pointer.
   x->e_mbd.cur_buf = src;
 
-  for (i = 0; i < MAX_MB_PLANE; i++)
-    setup_pred_plane(&x->plane[i].src, x->e_mbd.mi[0]->mbmi.sb_type, buffers[i],
-                     widths[i], heights[i], strides[i], mi_row, mi_col, NULL,
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) {
+    const int is_uv = i > 0;
+    setup_pred_plane(&x->plane[i].src, x->e_mbd.mi[0]->sb_type, src->buffers[i],
+                     src->crop_widths[is_uv], src->crop_heights[is_uv],
+                     src->strides[is_uv], mi_row, mi_col, NULL,
                      x->e_mbd.plane[i].subsampling_x,
                      x->e_mbd.plane[i].subsampling_y);
+  }
 }
 
 static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
                               int8_t segment_id) {
-  int segment_qindex;
   const AV1_COMMON *const cm = &cpi->common;
   av1_init_plane_quantizers(cpi, x, segment_id);
   aom_clear_system_state();
-  segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+  int segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
   return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
 }
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-static void dist_8x8_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8,
-                                    BLOCK_SIZE bsize, int bw, int bh,
-                                    int mi_row, int mi_col) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblockd_plane *const pd = &xd->plane[0];
-  const int dst_stride = pd->dst.stride;
-  uint8_t *dst = pd->dst.buf;
-
-  assert(bsize < BLOCK_8X8);
-
-  if (bsize < BLOCK_8X8) {
-    int i, j;
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      uint16_t *dst8x8_16 = (uint16_t *)dst8x8;
-      uint16_t *dst_sub8x8 = &dst8x8_16[((mi_row & 1) * 8 + (mi_col & 1)) << 2];
-
-      for (j = 0; j < bh; ++j)
-        for (i = 0; i < bw; ++i)
-          dst_sub8x8[j * 8 + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
-    } else {
-#endif
-      uint8_t *dst_sub8x8 = &dst8x8[((mi_row & 1) * 8 + (mi_col & 1)) << 2];
+static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
+  const AV1_COMMON *const cm = &cpi->common;
 
-      for (j = 0; j < bh; ++j)
-        for (i = 0; i < bw; ++i)
-          dst_sub8x8[j * 8 + i] = dst[j * dst_stride + i];
-#if CONFIG_HIGHBITDEPTH
-    }
-#endif
-  }
+  return av1_compute_rd_mult(
+      cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q);
 }
-#endif
 
 static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
                              MACROBLOCK *const x, int mi_row, int mi_col,
-                             RD_STATS *rd_cost,
-#if CONFIG_SUPERTX
-                             int *totalrate_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                             PARTITION_TYPE partition,
-#endif
+                             RD_STATS *rd_cost, PARTITION_TYPE partition,
                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                              int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
+  MB_MODE_INFO *ctx_mbmi = &ctx->mic;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
   const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+  const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode;
   int i, orig_rdmult;
 
   aom_clear_system_state();
 
-#if CONFIG_PVQ
-  x->pvq_speed = 1;
-  x->pvq_coded = 0;
-#endif
-
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-  mbmi = &xd->mi[0]->mbmi;
-  mbmi->sb_type = bsize;
+
+  mbmi = xd->mi[0];
+
+  if (ctx->rd_mode_is_ready) {
+    assert(ctx_mbmi->sb_type == bsize);
+    assert(ctx_mbmi->partition == partition);
+    *mbmi = *ctx_mbmi;
+    rd_cost->rate = ctx->rate;
+    rd_cost->dist = ctx->dist;
+    rd_cost->rdcost = ctx->rdcost;
+  } else {
+    mbmi->sb_type = bsize;
+    mbmi->partition = partition;
+  }
+
 #if CONFIG_RD_DEBUG
   mbmi->mi_row = mi_row;
   mbmi->mi_col = mi_col;
 #endif
-#if CONFIG_SUPERTX
-  // We set tx_size here as skip blocks would otherwise not set it.
-  // tx_size needs to be set at this point as supertx_enable in
-  // write_modes_sb is computed based on this, and if the garbage in memory
-  // just happens to be the supertx_size, then the packer will code this
-  // block as a supertx block, even if rdopt did not pick it as such.
-  mbmi->tx_size = max_txsize_lookup[bsize];
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-  mbmi->partition = partition;
-#endif
 
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
+  for (i = 0; i < num_planes; ++i) {
     p[i].coeff = ctx->coeff[i];
     p[i].qcoeff = ctx->qcoeff[i];
     pd[i].dqcoeff = ctx->dqcoeff[i];
-#if CONFIG_PVQ
-    pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
-#endif
     p[i].eobs = ctx->eobs[i];
-#if CONFIG_LV_MAP
     p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
-#endif
   }
 
   for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
-#if CONFIG_MRC_TX
-  xd->mrc_mask = ctx->mrc_mask;
-#endif  // CONFIG_MRC_TX
 
-  ctx->skippable = 0;
+  if (!ctx->rd_mode_is_ready) {
+    ctx->skippable = 0;
 
-  // Set to zero to make sure we do not use the previous encoded frame stats
-  mbmi->skip = 0;
+    // Set to zero to make sure we do not use the previous encoded frame stats
+    mbmi->skip = 0;
+
+    // Reset skip mode flag.
+    mbmi->skip_mode = 0;
+  }
 
-#if CONFIG_CB4X4
   x->skip_chroma_rd =
       !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
                            xd->plane[1].subsampling_y);
-#endif
 
-#if CONFIG_HIGHBITDEPTH
+  if (ctx->rd_mode_is_ready) {
+    x->skip = ctx->skip;
+    *x->mbmi_ext = ctx->mbmi_ext;
+    return;
+  }
+
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     x->source_variance = av1_high_get_sby_perpixel_variance(
         cpi, &x->plane[0].src, bsize, xd->bd);
@@ -1468,10 +582,6 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
     x->source_variance =
         av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
   }
-#else
-  x->source_variance =
-      av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
-#endif  // CONFIG_HIGHBITDEPTH
 
   // Save rdmult before it might be changed, so it can be restored later.
   orig_rdmult = x->rdmult;
@@ -1481,8 +591,6 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
       const int energy =
           bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
       mbmi->segment_id = av1_vaq_segment_id(energy);
-      // Re-initialise quantiser
-      av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
     }
     x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
   } else if (aq_mode == COMPLEXITY_AQ) {
@@ -1493,29 +601,20 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
       x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
   }
 
+  if (deltaq_mode > 0) x->rdmult = set_deltaq_rdmult(cpi, xd);
+
   // Find best coding mode & reconstruct the MB so it is available
   // as a predictor for MBs that follow in the SB
   if (frame_is_intra_only(cm)) {
-    av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
-#if CONFIG_SUPERTX
-    *totalrate_nocoef = 0;
-#endif  // CONFIG_SUPERTX
+    av1_rd_pick_intra_mode_sb(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx,
+                              best_rd);
   } else {
     if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
                                          rd_cost, bsize, ctx, best_rd);
-#if CONFIG_SUPERTX
-      *totalrate_nocoef = rd_cost->rate;
-#endif  // CONFIG_SUPERTX
     } else {
       av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
-#if CONFIG_SUPERTX
-                                totalrate_nocoef,
-#endif  // CONFIG_SUPERTX
                                 bsize, ctx, best_rd);
-#if CONFIG_SUPERTX
-      assert(*totalrate_nocoef >= 0);
-#endif  // CONFIG_SUPERTX
     }
   }
 
@@ -1523,9 +622,7 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
   if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) &&
       (bsize >= BLOCK_16X16) &&
       (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
-#if CONFIG_EXT_REFS
        cpi->refresh_alt2_ref_frame ||
-#endif  // CONFIG_EXT_REFS
        (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
     av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
   }
@@ -1538,363 +635,630 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
 
   ctx->rate = rd_cost->rate;
   ctx->dist = rd_cost->dist;
+  ctx->rdcost = rd_cost->rdcost;
 }
 
-static void update_inter_mode_stats(FRAME_COUNTS *counts, PREDICTION_MODE mode,
-                                    int16_t mode_context) {
+static void update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+                                    PREDICTION_MODE mode, int16_t mode_context,
+                                    uint8_t allow_update_cdf) {
+  (void)counts;
+
   int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
   if (mode == NEWMV) {
+#if CONFIG_ENTROPY_STATS
     ++counts->newmv_mode[mode_ctx][0];
+#endif
+    if (allow_update_cdf) update_cdf(fc->newmv_cdf[mode_ctx], 0, 2);
     return;
   } else {
+#if CONFIG_ENTROPY_STATS
     ++counts->newmv_mode[mode_ctx][1];
+#endif
+    if (allow_update_cdf) update_cdf(fc->newmv_cdf[mode_ctx], 1, 2);
 
-    if (mode_context & (1 << ALL_ZERO_FLAG_OFFSET)) {
-      return;
-    }
-
-    mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
-    if (mode == ZEROMV) {
+    mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+    if (mode == GLOBALMV) {
+#if CONFIG_ENTROPY_STATS
       ++counts->zeromv_mode[mode_ctx][0];
+#endif
+      if (allow_update_cdf) update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2);
       return;
     } else {
+#if CONFIG_ENTROPY_STATS
       ++counts->zeromv_mode[mode_ctx][1];
+#endif
+      if (allow_update_cdf) update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2);
       mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+#if CONFIG_ENTROPY_STATS
+      ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
+#endif
+      if (allow_update_cdf)
+        update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2);
+    }
+  }
+}
+
+static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+                               FRAME_COUNTS *counts, uint8_t allow_update_cdf) {
+  FRAME_CONTEXT *fc = xd->tile_ctx;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+  (void)counts;
 
-      if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
-      if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
-      if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
+  if (mbmi->mode == DC_PRED) {
+    const int n = pmi->palette_size[0];
+    const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
 
-      ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
+#if CONFIG_ENTROPY_STATS
+    ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0];
+#endif
+    if (allow_update_cdf)
+      update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx],
+                 n > 0, 2);
+    if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+      if (allow_update_cdf) {
+        update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx],
+                   n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+      }
+    }
+  }
+
+  if (mbmi->uv_mode == UV_DC_PRED) {
+    const int n = pmi->palette_size[1];
+    const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0];
+#endif
+    if (allow_update_cdf)
+      update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2);
+
+    if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+      if (allow_update_cdf) {
+        update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx],
+                   n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+      }
     }
   }
 }
 
-static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
-                         int mi_col
-#if CONFIG_SUPERTX
-                         ,
-                         int supertx_enabled
+static void sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+                            MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+                            const MB_MODE_INFO *above_mi,
+                            const MB_MODE_INFO *left_mi, const int intraonly,
+                            const int mi_row, const int mi_col,
+                            uint8_t allow_update_cdf) {
+  FRAME_CONTEXT *fc = xd->tile_ctx;
+  const PREDICTION_MODE y_mode = mbmi->mode;
+  const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+  (void)counts;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+
+  if (intraonly) {
+#if CONFIG_ENTROPY_STATS
+    const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+    const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+    const int above_ctx = intra_mode_context[above];
+    const int left_ctx = intra_mode_context[left];
+    ++counts->kf_y_mode[above_ctx][left_ctx][y_mode];
+#endif  // CONFIG_ENTROPY_STATS
+    if (allow_update_cdf)
+      update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
+  } else {
+#if CONFIG_ENTROPY_STATS
+    ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+#endif  // CONFIG_ENTROPY_STATS
+    if (allow_update_cdf)
+      update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
+  }
+
+  if (av1_filter_intra_allowed(cm, mbmi)) {
+    const int use_filter_intra_mode =
+        mbmi->filter_intra_mode_info.use_filter_intra;
+#if CONFIG_ENTROPY_STATS
+    ++counts->filter_intra[mbmi->sb_type][use_filter_intra_mode];
+    if (use_filter_intra_mode) {
+      ++counts
+            ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode];
+    }
+#endif  // CONFIG_ENTROPY_STATS
+    if (allow_update_cdf) {
+      update_cdf(fc->filter_intra_cdfs[mbmi->sb_type], use_filter_intra_mode,
+                 2);
+      if (use_filter_intra_mode) {
+        update_cdf(fc->filter_intra_mode_cdf,
+                   mbmi->filter_intra_mode_info.filter_intra_mode,
+                   FILTER_INTRA_MODES);
+      }
+    }
+  }
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->angle_delta[mbmi->mode - V_PRED]
+                         [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA];
+#endif
+    if (allow_update_cdf) {
+      update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
+                 mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
+                 2 * MAX_ANGLE_DELTA + 1);
+    }
+  }
+
+  if (!is_chroma_reference(mi_row, mi_col, bsize,
+                           xd->plane[AOM_PLANE_U].subsampling_x,
+                           xd->plane[AOM_PLANE_U].subsampling_y))
+    return;
+
+#if CONFIG_ENTROPY_STATS
+  ++counts->uv_mode[is_cfl_allowed(xd)][y_mode][uv_mode];
+#endif  // CONFIG_ENTROPY_STATS
+  if (allow_update_cdf) {
+    const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
+    update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
+               UV_INTRA_MODES - !cfl_allowed);
+  }
+  if (uv_mode == UV_CFL_PRED) {
+    const int joint_sign = mbmi->cfl_alpha_signs;
+    const int idx = mbmi->cfl_alpha_idx;
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->cfl_sign[joint_sign];
+#endif
+    if (allow_update_cdf)
+      update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
+    if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+      aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+      ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)];
+#endif
+      if (allow_update_cdf)
+        update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
+    }
+    if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+      aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+      ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)];
 #endif
-                         ) {
+      if (allow_update_cdf)
+        update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
+    }
+  }
+  if (av1_is_directional_mode(get_uv_mode(uv_mode)) &&
+      av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->angle_delta[uv_mode - UV_V_PRED]
+                         [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
+#endif
+    if (allow_update_cdf) {
+      update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED],
+                 mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
+                 2 * MAX_ANGLE_DELTA + 1);
+    }
+  }
+  if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
+    update_palette_cdf(xd, mbmi, counts, allow_update_cdf);
+}
+
+static void update_stats(const AV1_COMMON *const cm, TileDataEnc *tile_data,
+                         ThreadData *td, int mi_row, int mi_col) {
   MACROBLOCK *x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const MODE_INFO *const mi = xd->mi[0];
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   FRAME_CONTEXT *fc = xd->tile_ctx;
+  const uint8_t allow_update_cdf = tile_data->allow_update_cdf;
 
   // delta quant applies to both intra and inter
-  int super_block_upper_left =
-      ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0);
+  const int super_block_upper_left =
+      ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
+      ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+
+  const int seg_ref_active =
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+
+  if (cm->skip_mode_flag && !seg_ref_active && is_comp_ref_allowed(bsize)) {
+    const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+#if CONFIG_ENTROPY_STATS
+    td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++;
+#endif
+    if (allow_update_cdf)
+      update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
+  }
+
+  if (!mbmi->skip_mode) {
+    if (!seg_ref_active) {
+      const int skip_ctx = av1_get_skip_context(xd);
+#if CONFIG_ENTROPY_STATS
+      td->counts->skip[skip_ctx][mbmi->skip]++;
+#endif
+      if (allow_update_cdf) update_cdf(fc->skip_cdfs[skip_ctx], mbmi->skip, 2);
+    }
+  }
 
-  if (cm->delta_q_present_flag && (bsize != cm->sb_size || !mbmi->skip) &&
+  if (cm->delta_q_present_flag &&
+      (bsize != cm->seq_params.sb_size || !mbmi->skip) &&
       super_block_upper_left) {
-    const int dq = (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res;
+#if CONFIG_ENTROPY_STATS
+    const int dq =
+        (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
     const int absdq = abs(dq);
-    int i;
-    for (i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
+    for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
       td->counts->delta_q[i][1]++;
     }
     if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
-    xd->prev_qindex = mbmi->current_q_index;
-#if CONFIG_EXT_DELTA_Q
-#if CONFIG_LOOPFILTER_LEVEL
+#endif
+    xd->current_qindex = mbmi->current_qindex;
     if (cm->delta_lf_present_flag) {
       if (cm->delta_lf_multi) {
-        for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) {
+        const int frame_lf_count =
+            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+#if CONFIG_ENTROPY_STATS
           const int delta_lf =
-              (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) /
-              cm->delta_lf_res;
+              (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / cm->delta_lf_res;
           const int abs_delta_lf = abs(delta_lf);
-          for (i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+          for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
             td->counts->delta_lf_multi[lf_id][i][1]++;
           }
           if (abs_delta_lf < DELTA_LF_SMALL)
             td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
-          xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id];
+#endif
+          xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
         }
       } else {
+#if CONFIG_ENTROPY_STATS
         const int delta_lf =
-            (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+            (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
             cm->delta_lf_res;
         const int abs_delta_lf = abs(delta_lf);
-        for (i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+        for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
           td->counts->delta_lf[i][1]++;
         }
         if (abs_delta_lf < DELTA_LF_SMALL)
           td->counts->delta_lf[abs_delta_lf][0]++;
-        xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
-      }
-    }
-#else
-    if (cm->delta_lf_present_flag) {
-      const int dlf =
-          (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
-          cm->delta_lf_res;
-      const int absdlf = abs(dlf);
-      for (i = 0; i < AOMMIN(absdlf, DELTA_LF_SMALL); ++i) {
-        td->counts->delta_lf[i][1]++;
+#endif
+        xd->delta_lf_from_base = mbmi->delta_lf_from_base;
       }
-      if (absdlf < DELTA_LF_SMALL) td->counts->delta_lf[absdlf][0]++;
-      xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
     }
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif
   }
+
+  if (!is_inter_block(mbmi)) {
+    sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi,
+                    frame_is_intra_only(cm), mi_row, mi_col,
+                    tile_data->allow_update_cdf);
+  }
+
+  if (av1_allow_intrabc(cm)) {
+    if (allow_update_cdf)
+      update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2);
+#if CONFIG_ENTROPY_STATS
+    ++td->counts->intrabc[is_intrabc_block(mbmi)];
+#endif  // CONFIG_ENTROPY_STATS
+  }
+
   if (!frame_is_intra_only(cm)) {
-    FRAME_COUNTS *const counts = td->counts;
     RD_COUNTS *rdc = &td->rd_counts;
+
+    FRAME_COUNTS *const counts = td->counts;
+
+    if (mbmi->skip_mode) {
+      rdc->skip_mode_used_flag = 1;
+      if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+        assert(has_second_ref(mbmi));
+        rdc->compound_ref_used_flag = 1;
+      }
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+      return;
+    }
+
     const int inter_block = is_inter_block(mbmi);
-    const int seg_ref_active =
-        segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+
     if (!seg_ref_active) {
-#if CONFIG_SUPERTX
-      if (!supertx_enabled)
-#endif
-        counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
-#if CONFIG_NEW_MULTISYMBOL
-      update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
-                 inter_block, 2);
+#if CONFIG_ENTROPY_STATS
+      counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
 #endif
+      if (allow_update_cdf) {
+        update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
+                   inter_block, 2);
+      }
       // If the segment reference feature is enabled we have only a single
       // reference frame allowed for the segment so exclude it from
       // the reference frame counts used to work out probabilities.
       if (inter_block) {
         const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
-#if CONFIG_EXT_REFS
         const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
-#endif  // CONFIG_EXT_REFS
+
+        av1_collect_neighbors_ref_counts(xd);
 
         if (cm->reference_mode == REFERENCE_MODE_SELECT) {
           if (has_second_ref(mbmi))
             // This flag is also updated for 4x4 blocks
             rdc->compound_ref_used_flag = 1;
-          else
-            // This flag is also updated for 4x4 blocks
-            rdc->single_ref_used_flag = 1;
-          if (is_comp_ref_allowed(mbmi->sb_type)) {
-            counts->comp_inter[av1_get_reference_mode_context(cm, xd)]
+          if (is_comp_ref_allowed(bsize)) {
+#if CONFIG_ENTROPY_STATS
+            counts->comp_inter[av1_get_reference_mode_context(xd)]
                               [has_second_ref(mbmi)]++;
-#if CONFIG_NEW_MULTISYMBOL
-            update_cdf(av1_get_reference_mode_cdf(cm, xd), has_second_ref(mbmi),
-                       2);
-#endif  // CONFIG_NEW_MULTISYMBOL
+#endif  // CONFIG_ENTROPY_STATS
+            if (allow_update_cdf) {
+              update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi),
+                         2);
+            }
           }
         }
 
         if (has_second_ref(mbmi)) {
-#if CONFIG_EXT_COMP_REFS
           const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
                                                         ? UNIDIR_COMP_REFERENCE
                                                         : BIDIR_COMP_REFERENCE;
-#if !USE_UNI_COMP_REFS
-          // TODO(zoeliu): Temporarily turn off uni-directional comp refs
-          assert(comp_ref_type == BIDIR_COMP_REFERENCE);
-#endif  // !USE_UNI_COMP_REFS
+          if (allow_update_cdf) {
+            update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
+                       COMP_REFERENCE_TYPES);
+          }
+#if CONFIG_ENTROPY_STATS
           counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
                                [comp_ref_type]++;
+#endif  // CONFIG_ENTROPY_STATS
 
           if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
             const int bit = (ref0 == BWDREF_FRAME);
+            if (allow_update_cdf)
+              update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
             counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0]
                                 [bit]++;
+#endif  // CONFIG_ENTROPY_STATS
             if (!bit) {
               const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
+              if (allow_update_cdf)
+                update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
               counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
                                   [bit1]++;
+#endif  // CONFIG_ENTROPY_STATS
               if (bit1) {
+                if (allow_update_cdf) {
+                  update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
+                             ref1 == GOLDEN_FRAME, 2);
+                }
+#if CONFIG_ENTROPY_STATS
                 counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)]
                                     [2][ref1 == GOLDEN_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
               }
             }
           } else {
-#endif  // CONFIG_EXT_COMP_REFS
-#if CONFIG_EXT_REFS
             const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
-
-            counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0][bit]++;
+            if (allow_update_cdf)
+              update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+            counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
+#endif  // CONFIG_ENTROPY_STATS
             if (!bit) {
-              counts->comp_ref[av1_get_pred_context_comp_ref_p1(cm, xd)][1]
-                              [ref0 == LAST_FRAME]++;
+              if (allow_update_cdf) {
+                update_cdf(av1_get_pred_cdf_comp_ref_p1(xd),
+                           ref0 == LAST2_FRAME, 2);
+              }
+#if CONFIG_ENTROPY_STATS
+              counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
+                              [ref0 == LAST2_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
             } else {
-              counts->comp_ref[av1_get_pred_context_comp_ref_p2(cm, xd)][2]
+              if (allow_update_cdf) {
+                update_cdf(av1_get_pred_cdf_comp_ref_p2(xd),
+                           ref0 == GOLDEN_FRAME, 2);
+              }
+#if CONFIG_ENTROPY_STATS
+              counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
                               [ref0 == GOLDEN_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
             }
-
-            counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0]
+            if (allow_update_cdf) {
+              update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd),
+                         ref1 == ALTREF_FRAME, 2);
+            }
+#if CONFIG_ENTROPY_STATS
+            counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
                                [ref1 == ALTREF_FRAME]++;
-            if (ref1 != ALTREF_FRAME)
-              counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(cm, xd)]
-                                 [1][ref1 == ALTREF2_FRAME]++;
-#else   // !CONFIG_EXT_REFS
-          counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0]
-                          [ref0 == GOLDEN_FRAME]++;
-#endif  // CONFIG_EXT_REFS
-#if CONFIG_EXT_COMP_REFS
+#endif  // CONFIG_ENTROPY_STATS
+            if (ref1 != ALTREF_FRAME) {
+              if (allow_update_cdf) {
+                update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
+                           ref1 == ALTREF2_FRAME, 2);
+              }
+#if CONFIG_ENTROPY_STATS
+              counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
+                                 [ref1 == ALTREF2_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+            }
           }
-#endif  // CONFIG_EXT_COMP_REFS
         } else {
-#if CONFIG_EXT_REFS
           const int bit = (ref0 >= BWDREF_FRAME);
-
+          if (allow_update_cdf)
+            update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
           counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+#endif  // CONFIG_ENTROPY_STATS
           if (bit) {
             assert(ref0 <= ALTREF_FRAME);
+            if (allow_update_cdf) {
+              update_cdf(av1_get_pred_cdf_single_ref_p2(xd),
+                         ref0 == ALTREF_FRAME, 2);
+            }
+#if CONFIG_ENTROPY_STATS
             counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
                               [ref0 == ALTREF_FRAME]++;
-            if (ref0 != ALTREF_FRAME)
+#endif  // CONFIG_ENTROPY_STATS
+            if (ref0 != ALTREF_FRAME) {
+              if (allow_update_cdf) {
+                update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
+                           ref0 == ALTREF2_FRAME, 2);
+              }
+#if CONFIG_ENTROPY_STATS
               counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
                                 [ref0 == ALTREF2_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
+            }
           } else {
             const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+            if (allow_update_cdf)
+              update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
             counts
                 ->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+#endif  // CONFIG_ENTROPY_STATS
             if (!bit1) {
+              if (allow_update_cdf) {
+                update_cdf(av1_get_pred_cdf_single_ref_p4(xd),
+                           ref0 != LAST_FRAME, 2);
+              }
+#if CONFIG_ENTROPY_STATS
               counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
                                 [ref0 != LAST_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
             } else {
+              if (allow_update_cdf) {
+                update_cdf(av1_get_pred_cdf_single_ref_p5(xd),
+                           ref0 != LAST3_FRAME, 2);
+              }
+#if CONFIG_ENTROPY_STATS
               counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
                                 [ref0 != LAST3_FRAME]++;
+#endif  // CONFIG_ENTROPY_STATS
             }
           }
-#else   // !CONFIG_EXT_REFS
-          counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0]
-                            [ref0 != LAST_FRAME]++;
-          if (ref0 != LAST_FRAME) {
-            counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
-                              [ref0 != GOLDEN_FRAME]++;
-          }
-#endif  // CONFIG_EXT_REFS
         }
 
-#if CONFIG_COMPOUND_SINGLEREF
-        if (!has_second_ref(mbmi))
-          counts->comp_inter_mode[av1_get_inter_mode_context(xd)]
-                                 [is_inter_singleref_comp_mode(mbmi->mode)]++;
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-#if CONFIG_INTERINTRA
-        if (cm->reference_mode != COMPOUND_REFERENCE &&
-#if CONFIG_SUPERTX
-            !supertx_enabled &&
-#endif
-            cm->allow_interintra_compound && is_interintra_allowed(mbmi)) {
+        if (cm->seq_params.enable_interintra_compound &&
+            is_interintra_allowed(mbmi)) {
           const int bsize_group = size_group_lookup[bsize];
           if (mbmi->ref_frame[1] == INTRA_FRAME) {
+#if CONFIG_ENTROPY_STATS
             counts->interintra[bsize_group][1]++;
-#if CONFIG_NEW_MULTISYMBOL
-            update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
 #endif
+            if (allow_update_cdf)
+              update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
+#if CONFIG_ENTROPY_STATS
             counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
-            update_cdf(fc->interintra_mode_cdf[bsize_group],
-                       mbmi->interintra_mode, INTERINTRA_MODES);
+#endif
+            if (allow_update_cdf) {
+              update_cdf(fc->interintra_mode_cdf[bsize_group],
+                         mbmi->interintra_mode, INTERINTRA_MODES);
+            }
             if (is_interintra_wedge_used(bsize)) {
+#if CONFIG_ENTROPY_STATS
               counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
-#if CONFIG_NEW_MULTISYMBOL
-              update_cdf(fc->wedge_interintra_cdf[bsize],
-                         mbmi->use_wedge_interintra, 2);
 #endif
+              if (allow_update_cdf) {
+                update_cdf(fc->wedge_interintra_cdf[bsize],
+                           mbmi->use_wedge_interintra, 2);
+              }
+              if (mbmi->use_wedge_interintra) {
+#if CONFIG_ENTROPY_STATS
+                counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
+#endif
+                if (allow_update_cdf) {
+                  update_cdf(fc->wedge_idx_cdf[bsize],
+                             mbmi->interintra_wedge_index, 16);
+                }
+              }
             }
           } else {
+#if CONFIG_ENTROPY_STATS
             counts->interintra[bsize_group][0]++;
-#if CONFIG_NEW_MULTISYMBOL
-            update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
 #endif
+            if (allow_update_cdf)
+              update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
           }
         }
-#endif  // CONFIG_INTERINTRA
 
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-#if CONFIG_WARPED_MOTION
         set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+        const MOTION_MODE motion_allowed =
+            cm->switchable_motion_mode
+                ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+                                      cm->allow_warped_motion)
+                : SIMPLE_TRANSLATION;
+        if (mbmi->ref_frame[1] != INTRA_FRAME) {
+          if (motion_allowed == WARPED_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+            counts->motion_mode[bsize][mbmi->motion_mode]++;
 #endif
-        const MOTION_MODE motion_allowed = motion_mode_allowed(
-#if CONFIG_GLOBAL_MOTION
-            0, xd->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-            xd,
-#endif
-            mi);
-#if CONFIG_SUPERTX
-        if (!supertx_enabled)
-#endif  // CONFIG_SUPERTX
-          if (mbmi->ref_frame[1] != INTRA_FRAME)
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-          {
-            if (motion_allowed == WARPED_CAUSAL) {
-              counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++;
-              update_cdf(fc->motion_mode_cdf[mbmi->sb_type], mbmi->motion_mode,
+            if (allow_update_cdf) {
+              update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
                          MOTION_MODES);
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-            } else if (motion_allowed == NCOBMC_ADAPT_WEIGHT) {
-              counts->ncobmc[mbmi->sb_type][mbmi->motion_mode]++;
-              update_cdf(fc->ncobmc_cdf[mbmi->sb_type], mbmi->motion_mode,
-                         OBMC_FAMILY_MODES);
-            } else if (motion_allowed == OBMC_CAUSAL) {
-              counts->obmc[mbmi->sb_type][mbmi->motion_mode == OBMC_CAUSAL]++;
-              update_cdf(fc->obmc_cdf[mbmi->sb_type], mbmi->motion_mode, 2);
             }
-#else
-            } else if (motion_allowed == OBMC_CAUSAL) {
-              counts->obmc[mbmi->sb_type][mbmi->motion_mode == OBMC_CAUSAL]++;
-#if CONFIG_NEW_MULTISYMBOL
-              update_cdf(fc->obmc_cdf[mbmi->sb_type],
-                         mbmi->motion_mode == OBMC_CAUSAL, 2);
+          } else if (motion_allowed == OBMC_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+            counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
 #endif
+            if (allow_update_cdf) {
+              update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL,
+                         2);
             }
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-          }
-#else
-          if (motion_allowed > SIMPLE_TRANSLATION) {
-            counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++;
-            update_cdf(fc->motion_mode_cdf[mbmi->sb_type], mbmi->motion_mode,
-                       MOTION_MODES);
-          }
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-        if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT) {
-          ADAPT_OVERLAP_BLOCK ao_block =
-              adapt_overlap_block_lookup[mbmi->sb_type];
-          ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[0]];
-          update_cdf(fc->ncobmc_mode_cdf[ao_block], mbmi->ncobmc_mode[0],
-                     MAX_NCOBMC_MODES);
-          if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) {
-            ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[1]];
-            update_cdf(fc->ncobmc_mode_cdf[ao_block], mbmi->ncobmc_mode[1],
-                       MAX_NCOBMC_MODES);
           }
         }
+
+        if (has_second_ref(mbmi)) {
+          assert(cm->reference_mode != SINGLE_REFERENCE &&
+                 is_inter_compound_mode(mbmi->mode) &&
+                 mbmi->motion_mode == SIMPLE_TRANSLATION);
+
+          const int masked_compound_used =
+              is_any_masked_compound_used(bsize) &&
+              cm->seq_params.enable_masked_compound;
+          if (masked_compound_used) {
+            const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+#if CONFIG_ENTROPY_STATS
+            ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
 #endif
+            if (allow_update_cdf) {
+              update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
+                         mbmi->comp_group_idx, 2);
+            }
+          }
 
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
-        if (
-#if CONFIG_COMPOUND_SINGLEREF
-            is_inter_anyref_comp_mode(mbmi->mode)
-#else   // !CONFIG_COMPOUND_SINGLEREF
-            cm->reference_mode != SINGLE_REFERENCE &&
-            is_inter_compound_mode(mbmi->mode)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-            && mbmi->motion_mode == SIMPLE_TRANSLATION
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-            ) {
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
-          if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+          if (mbmi->comp_group_idx == 0) {
+            const int comp_index_ctx = get_comp_index_context(cm, xd);
+#if CONFIG_ENTROPY_STATS
+            ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
 #endif
-            counts
-                ->compound_interinter[bsize][mbmi->interinter_compound_type]++;
-            update_cdf(fc->compound_type_cdf[bsize],
-                       mbmi->interinter_compound_type, COMPOUND_TYPES);
-#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
+            if (allow_update_cdf) {
+              update_cdf(fc->compound_index_cdf[comp_index_ctx],
+                         mbmi->compound_idx, 2);
+            }
+          } else {
+            assert(masked_compound_used);
+            if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+              ++counts->compound_type[bsize][mbmi->interinter_comp.type - 1];
+#endif
+              if (allow_update_cdf) {
+                update_cdf(fc->compound_type_cdf[bsize],
+                           mbmi->interinter_comp.type - 1, COMPOUND_TYPES - 1);
+              }
+            }
           }
+        }
+        if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+          if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+            counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
 #endif
-#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
+            if (allow_update_cdf) {
+              update_cdf(fc->wedge_idx_cdf[bsize],
+                         mbmi->interinter_comp.wedge_index, 16);
+            }
+          }
         }
       }
     }
@@ -1903,37 +1267,33 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       int16_t mode_ctx;
       const PREDICTION_MODE mode = mbmi->mode;
+
+      mode_ctx =
+          av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
       if (has_second_ref(mbmi)) {
-        mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+#if CONFIG_ENTROPY_STATS
         ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
-        update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
-                   INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
-#if CONFIG_COMPOUND_SINGLEREF
-      } else if (is_inter_singleref_comp_mode(mode)) {
-        mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
-        ++counts->inter_singleref_comp_mode[mode_ctx]
-                                           [INTER_SINGLEREF_COMP_OFFSET(mode)];
-#endif  // CONFIG_COMPOUND_SINGLEREF
+#endif
+        if (allow_update_cdf)
+          update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
+                     INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
       } else {
-        mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
-                                             mbmi->ref_frame, bsize, -1);
-        update_inter_mode_stats(counts, mode, mode_ctx);
+        update_inter_mode_stats(fc, counts, mode, mode_ctx, allow_update_cdf);
       }
 
       int mode_allowed = (mbmi->mode == NEWMV);
       mode_allowed |= (mbmi->mode == NEW_NEWMV);
-#if CONFIG_COMPOUND_SINGLEREF
-      mode_allowed |= (mbmi->mode == SR_NEW_NEWMV);
-#endif  // CONFIG_COMPOUND_SINGLEREF
       if (mode_allowed) {
         uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
         int idx;
 
         for (idx = 0; idx < 2; ++idx) {
           if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+#if CONFIG_ENTROPY_STATS
             uint8_t drl_ctx =
                 av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
             ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+#endif
 
             if (mbmi->ref_mv_idx == idx) break;
           }
@@ -1946,47 +1306,35 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
 
         for (idx = 1; idx < 3; ++idx) {
           if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+#if CONFIG_ENTROPY_STATS
             uint8_t drl_ctx =
                 av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
             ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+#endif
 
             if (mbmi->ref_mv_idx == idx - 1) break;
           }
         }
       }
     }
-#if CONFIG_INTRABC
-  } else {
-    if (av1_allow_intrabc(bsize, cm)) {
-      FRAME_COUNTS *const counts = td->counts;
-      ++counts->intrabc[mbmi->use_intrabc];
-    } else {
-      assert(!mbmi->use_intrabc);
-    }
-#endif
   }
 }
 
 typedef struct {
-  ENTROPY_CONTEXT a[2 * MAX_MIB_SIZE * MAX_MB_PLANE];
-  ENTROPY_CONTEXT l[2 * MAX_MIB_SIZE * MAX_MB_PLANE];
+  ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
+  ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
   PARTITION_CONTEXT sa[MAX_MIB_SIZE];
   PARTITION_CONTEXT sl[MAX_MIB_SIZE];
-#if CONFIG_VAR_TX
   TXFM_CONTEXT *p_ta;
   TXFM_CONTEXT *p_tl;
-  TXFM_CONTEXT ta[2 * MAX_MIB_SIZE];
-  TXFM_CONTEXT tl[2 * MAX_MIB_SIZE];
-#endif
+  TXFM_CONTEXT ta[MAX_MIB_SIZE];
+  TXFM_CONTEXT tl[MAX_MIB_SIZE];
 } RD_SEARCH_MACROBLOCK_CONTEXT;
 
 static void restore_context(MACROBLOCK *x,
                             const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row,
-                            int mi_col,
-#if CONFIG_PVQ
-                            od_rollback_buffer *rdo_buf,
-#endif
-                            BLOCK_SIZE bsize) {
+                            int mi_col, BLOCK_SIZE bsize,
+                            const int num_planes) {
   MACROBLOCKD *xd = &x->e_mbd;
   int p;
   const int num_4x4_blocks_wide =
@@ -1995,11 +1343,9 @@ static void restore_context(MACROBLOCK *x,
       block_size_high[bsize] >> tx_size_high_log2[0];
   int mi_width = mi_size_wide[bsize];
   int mi_height = mi_size_high[bsize];
-  for (p = 0; p < MAX_MB_PLANE; p++) {
-    int tx_col;
-    int tx_row;
-    tx_col = mi_col << (MI_SIZE_LOG2 - tx_size_wide_log2[0]);
-    tx_row = (mi_row & MAX_MIB_MASK) << (MI_SIZE_LOG2 - tx_size_high_log2[0]);
+  for (p = 0; p < num_planes; p++) {
+    int tx_col = mi_col;
+    int tx_row = mi_row & MAX_MIB_MASK;
     memcpy(xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x),
            ctx->a + num_4x4_blocks_wide * p,
            (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
@@ -2013,25 +1359,17 @@ static void restore_context(MACROBLOCK *x,
          sizeof(*xd->above_seg_context) * mi_width);
   memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), ctx->sl,
          sizeof(xd->left_seg_context[0]) * mi_height);
-#if CONFIG_VAR_TX
   xd->above_txfm_context = ctx->p_ta;
   xd->left_txfm_context = ctx->p_tl;
   memcpy(xd->above_txfm_context, ctx->ta,
-         sizeof(*xd->above_txfm_context) * (mi_width << TX_UNIT_WIDE_LOG2));
+         sizeof(*xd->above_txfm_context) * mi_width);
   memcpy(xd->left_txfm_context, ctx->tl,
-         sizeof(*xd->left_txfm_context) * (mi_height << TX_UNIT_HIGH_LOG2));
-#endif
-#if CONFIG_PVQ
-  od_encode_rollback(&x->daala_enc, rdo_buf);
-#endif
+         sizeof(*xd->left_txfm_context) * mi_height);
 }
 
 static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
-                         int mi_row, int mi_col,
-#if CONFIG_PVQ
-                         od_rollback_buffer *rdo_buf,
-#endif
-                         BLOCK_SIZE bsize) {
+                         int mi_row, int mi_col, BLOCK_SIZE bsize,
+                         const int num_planes) {
   const MACROBLOCKD *xd = &x->e_mbd;
   int p;
   const int num_4x4_blocks_wide =
@@ -2042,11 +1380,9 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
   int mi_height = mi_size_high[bsize];
 
   // buffer the above/left context information of the block in search.
-  for (p = 0; p < MAX_MB_PLANE; ++p) {
-    int tx_col;
-    int tx_row;
-    tx_col = mi_col << (MI_SIZE_LOG2 - tx_size_wide_log2[0]);
-    tx_row = (mi_row & MAX_MIB_MASK) << (MI_SIZE_LOG2 - tx_size_high_log2[0]);
+  for (p = 0; p < num_planes; ++p) {
+    int tx_col = mi_col;
+    int tx_row = mi_row & MAX_MIB_MASK;
     memcpy(ctx->a + num_4x4_blocks_wide * p,
            xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x),
            (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
@@ -2060,386 +1396,165 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
          sizeof(*xd->above_seg_context) * mi_width);
   memcpy(ctx->sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK),
          sizeof(xd->left_seg_context[0]) * mi_height);
-#if CONFIG_VAR_TX
   memcpy(ctx->ta, xd->above_txfm_context,
-         sizeof(*xd->above_txfm_context) * (mi_width << TX_UNIT_WIDE_LOG2));
+         sizeof(*xd->above_txfm_context) * mi_width);
   memcpy(ctx->tl, xd->left_txfm_context,
-         sizeof(*xd->left_txfm_context) * (mi_height << TX_UNIT_HIGH_LOG2));
+         sizeof(*xd->left_txfm_context) * mi_height);
   ctx->p_ta = xd->above_txfm_context;
   ctx->p_tl = xd->left_txfm_context;
-#endif
-#if CONFIG_PVQ
-  od_encode_checkpoint(&x->daala_enc, rdo_buf);
-#endif
 }
 
-static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile,
+static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
                      ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col,
                      RUN_TYPE dry_run, BLOCK_SIZE bsize,
-#if CONFIG_EXT_PARTITION_TYPES
-                     PARTITION_TYPE partition,
-#endif
-                     PICK_MODE_CONTEXT *ctx, int *rate) {
+                     PARTITION_TYPE partition, PICK_MODE_CONTEXT *ctx,
+                     int *rate) {
+  TileInfo *const tile = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
-#if (CONFIG_MOTION_VAR && CONFIG_NCOBMC) | CONFIG_EXT_DELTA_Q | \
-    CONFIG_NCOBMC_ADAPT_WEIGHT
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi;
-#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
-  int check_ncobmc;
-#endif
-#endif
 
   set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
-#if CONFIG_EXT_PARTITION_TYPES
-  x->e_mbd.mi[0]->mbmi.partition = partition;
-#endif
-  update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
-#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
-  mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_WARPED_MOTION
-  set_ref_ptrs(&cpi->common, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-#endif
-#endif
-
-#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
-  const MOTION_MODE motion_allowed = motion_mode_allowed(
-#if CONFIG_GLOBAL_MOTION
-      0, xd->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-      xd,
-#endif
-      xd->mi[0]);
-#endif  // CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
-
-#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
-  check_ncobmc = is_inter_block(mbmi) && motion_allowed >= OBMC_CAUSAL;
-  if (!dry_run && check_ncobmc) {
-    av1_check_ncobmc_rd(cpi, x, mi_row, mi_col);
-    av1_setup_dst_planes(x->e_mbd.plane, bsize,
-                         get_frame_new_buffer(&cpi->common), mi_row, mi_col);
-  }
-#endif
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->partition = partition;
+  update_state(cpi, tile_data, td, ctx, mi_row, mi_col, bsize, dry_run);
 
-#if CONFIG_LV_MAP
-  av1_set_coeff_buffer(cpi, x, mi_row, mi_col);
-#endif
+  if (!dry_run) av1_set_coeff_buffer(cpi, x, mi_row, mi_col);
 
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  if (dry_run == OUTPUT_ENABLED && !frame_is_intra_only(&cpi->common)) {
-    if (motion_allowed >= NCOBMC_ADAPT_WEIGHT && is_inter_block(mbmi)) {
-      get_ncobmc_intrpl_pred(cpi, td, mi_row, mi_col, bsize);
-      av1_check_ncobmc_adapt_weight_rd(cpi, x, mi_row, mi_col);
-    }
-    av1_setup_dst_planes(x->e_mbd.plane, bsize,
-                         get_frame_new_buffer(&cpi->common), mi_row, mi_col);
-  }
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-
-  encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, rate);
+  encode_superblock(cpi, tile_data, td, tp, dry_run, mi_row, mi_col, bsize,
+                    rate);
 
-#if CONFIG_LV_MAP
   if (dry_run == 0)
     x->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
-#endif
 
   if (!dry_run) {
-#if CONFIG_EXT_DELTA_Q
-    mbmi = &xd->mi[0]->mbmi;
-    if (bsize == cpi->common.sb_size && mbmi->skip == 1 &&
+    if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 &&
         cpi->common.delta_lf_present_flag) {
-#if CONFIG_LOOPFILTER_LEVEL
-      for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id)
-        mbmi->curr_delta_lf[lf_id] = xd->prev_delta_lf[lf_id];
-#endif  // CONFIG_LOOPFILTER_LEVEL
-      mbmi->current_delta_lf_from_base = xd->prev_delta_lf_from_base;
+      const int frame_lf_count = av1_num_planes(&cpi->common) > 1
+                                     ? FRAME_LF_COUNT
+                                     : FRAME_LF_COUNT - 2;
+      for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+        mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
+      mbmi->delta_lf_from_base = xd->delta_lf_from_base;
     }
-#endif
-#if CONFIG_SUPERTX
-    update_stats(&cpi->common, td, mi_row, mi_col, 0);
-#else
-    update_stats(&cpi->common, td, mi_row, mi_col);
-#endif
+    if (has_second_ref(mbmi)) {
+      if (mbmi->compound_idx == 0 ||
+          mbmi->interinter_comp.type == COMPOUND_AVERAGE)
+        mbmi->comp_group_idx = 0;
+      else
+        mbmi->comp_group_idx = 1;
+    }
+    update_stats(&cpi->common, tile_data, td, mi_row, mi_col);
   }
 }
 
 static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
-                      const TileInfo *const tile, TOKENEXTRA **tp, int mi_row,
+                      TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row,
                       int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
                       PC_TREE *pc_tree, int *rate) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int hbs = mi_size_wide[bsize] / 2;
-#if CONFIG_EXT_PARTITION_TYPES && CONFIG_EXT_PARTITION_TYPES_AB
-  const int qbs = mi_size_wide[bsize] / 4;
-#endif
   const int is_partition_root = bsize >= BLOCK_8X8;
   const int ctx = is_partition_root
-                      ? partition_plane_context(xd, mi_row, mi_col,
-#if CONFIG_UNPOISON_PARTITION_CTX
-                                                mi_row + hbs < cm->mi_rows,
-                                                mi_col + hbs < cm->mi_cols,
-#endif
-                                                bsize)
+                      ? partition_plane_context(xd, mi_row, mi_col, bsize)
                       : -1;
   const PARTITION_TYPE partition = pc_tree->partitioning;
-  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
-#if CONFIG_EXT_PARTITION_TYPES
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
   int quarter_step = mi_size_wide[bsize] / 4;
   int i;
-#if !CONFIG_EXT_PARTITION_TYPES_AB
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
-#endif
-#endif
-
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-  assert(bsize >= BLOCK_8X8);
-#endif
+  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
-  if (!dry_run && ctx >= 0) td->counts->partition[ctx][partition]++;
-
-#if CONFIG_SUPERTX
-  if (!frame_is_intra_only(cm) && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
-      partition != PARTITION_NONE && !xd->lossless[0]) {
-    int supertx_enabled;
-    TX_SIZE supertx_size = max_txsize_lookup[bsize];
-    supertx_enabled = check_supertx_sb(bsize, supertx_size, pc_tree);
-    if (supertx_enabled) {
-      const int mi_width = mi_size_wide[bsize];
-      const int mi_height = mi_size_high[bsize];
-      int x_idx, y_idx, i;
-      uint8_t *dst_buf[3];
-      int dst_stride[3];
-      set_skip_context(xd, mi_row, mi_col);
-      set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
-      update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, dry_run,
-                              pc_tree);
-
-      av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
-                           mi_col);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        dst_buf[i] = xd->plane[i].dst.buf;
-        dst_stride[i] = xd->plane[i].dst.stride;
-      }
-      predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, dry_run,
-                         bsize, bsize, dst_buf, dst_stride, pc_tree);
-
-      set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
-      set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
-
-      if (!x->skip) {
-        int this_rate = 0;
-        av1_encode_sb_supertx((AV1_COMMON *)cm, x, bsize);
-        av1_tokenize_sb_supertx(cpi, td, tp, dry_run, mi_row, mi_col, bsize,
-                                rate);
-        if (rate) *rate += this_rate;
-      } else {
-        xd->mi[0]->mbmi.skip = 1;
-        if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++;
-        av1_reset_skip_context(xd, mi_row, mi_col, bsize);
-      }
-      if (!dry_run) {
-        for (y_idx = 0; y_idx < mi_height; y_idx++)
-          for (x_idx = 0; x_idx < mi_width; x_idx++) {
-            if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width >
-                    x_idx &&
-                (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height >
-                    y_idx) {
-              xd->mi[x_idx + y_idx * cm->mi_stride]->mbmi.skip =
-                  xd->mi[0]->mbmi.skip;
-            }
-          }
-        td->counts->supertx[partition_supertx_context_lookup[partition]]
-                           [supertx_size][1]++;
-        td->counts->supertx_size[supertx_size]++;
+  if (!dry_run && ctx >= 0) {
+    const int has_rows = (mi_row + hbs) < cm->mi_rows;
+    const int has_cols = (mi_col + hbs) < cm->mi_cols;
+
+    if (has_rows && has_cols) {
 #if CONFIG_ENTROPY_STATS
-#if CONFIG_EXT_TX
-        if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) >
-                1 &&
-            !xd->mi[0]->mbmi.skip) {
-          const int eset =
-              get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used);
-          if (eset > 0) {
-            ++td->counts
-                  ->inter_ext_tx[eset][supertx_size][xd->mi[0]->mbmi.tx_type];
-          }
-        }
-#else
-        if (supertx_size < TX_32X32 && !xd->mi[0]->mbmi.skip) {
-          ++td->counts->inter_ext_tx[supertx_size][xd->mi[0]->mbmi.tx_type];
-        }
-#endif  // CONFIG_EXT_TX
-#endif  // CONFIG_ENTROPY_STATS
-      }
-#if CONFIG_EXT_PARTITION_TYPES
-      update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize,
-                                   partition);
-#else
-      if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
-        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+      td->counts->partition[ctx][partition]++;
 #endif
-#if CONFIG_VAR_TX
-      set_txfm_ctxs(supertx_size, mi_width, mi_height, xd->mi[0]->mbmi.skip,
-                    xd);
-#endif  // CONFIG_VAR_TX
-      return;
-    } else {
-      if (!dry_run) {
-        td->counts->supertx[partition_supertx_context_lookup[partition]]
-                           [supertx_size][0]++;
+
+      if (tile_data->allow_update_cdf) {
+        FRAME_CONTEXT *fc = xd->tile_ctx;
+        update_cdf(fc->partition_cdf[ctx], partition,
+                   partition_cdf_length(bsize));
       }
     }
   }
-#endif  // CONFIG_SUPERTX
 
   switch (partition) {
     case PARTITION_NONE:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
-#if CONFIG_EXT_PARTITION_TYPES
-               partition,
-#endif
-               &pc_tree->none, rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, &pc_tree->none, rate);
       break;
     case PARTITION_VERT:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
-#if CONFIG_EXT_PARTITION_TYPES
-               partition,
-#endif
-               &pc_tree->vertical[0], rate);
-      if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) {
-        encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
-#if CONFIG_EXT_PARTITION_TYPES
-                 partition,
-#endif
-                 &pc_tree->vertical[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, &pc_tree->vertical[0], rate);
+      if (mi_col + hbs < cm->mi_cols) {
+        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+                 partition, &pc_tree->vertical[1], rate);
       }
       break;
     case PARTITION_HORZ:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
-#if CONFIG_EXT_PARTITION_TYPES
-               partition,
-#endif
-               &pc_tree->horizontal[0], rate);
-      if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) {
-        encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
-#if CONFIG_EXT_PARTITION_TYPES
-                 partition,
-#endif
-                 &pc_tree->horizontal[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, &pc_tree->horizontal[0], rate);
+      if (mi_row + hbs < cm->mi_rows) {
+        encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+                 partition, &pc_tree->horizontal[1], rate);
       }
       break;
     case PARTITION_SPLIT:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
-#if CONFIG_EXT_PARTITION_TYPES
-                 partition,
-#endif
-                 pc_tree->leaf_split[0], rate);
-      } else {
-        encode_sb(cpi, td, tile, tp, mi_row, mi_col, dry_run, subsize,
-                  pc_tree->split[0], rate);
-        encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, dry_run, subsize,
-                  pc_tree->split[1], rate);
-        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, dry_run, subsize,
-                  pc_tree->split[2], rate);
-        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, dry_run,
-                  subsize, pc_tree->split[3], rate);
-      }
-      break;
-
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-    case PARTITION_HORZ_A:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run,
-               get_subsize(bsize, PARTITION_HORZ_4), partition,
-               &pc_tree->horizontala[0], rate);
-      encode_b(cpi, tile, td, tp, mi_row + qbs, mi_col, dry_run,
-               get_subsize(bsize, PARTITION_HORZ_4), partition,
-               &pc_tree->horizontala[1], rate);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
-               partition, &pc_tree->horizontala[2], rate);
-      break;
-    case PARTITION_HORZ_B:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
-               &pc_tree->horizontalb[0], rate);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run,
-               get_subsize(bsize, PARTITION_HORZ_4), partition,
-               &pc_tree->horizontalb[1], rate);
-      if (mi_row + 3 * qbs < cm->mi_rows)
-        encode_b(cpi, tile, td, tp, mi_row + 3 * qbs, mi_col, dry_run,
-                 get_subsize(bsize, PARTITION_HORZ_4), partition,
-                 &pc_tree->horizontalb[2], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+                pc_tree->split[0], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize,
+                pc_tree->split[1], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize,
+                pc_tree->split[2], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run,
+                subsize, pc_tree->split[3], rate);
       break;
-    case PARTITION_VERT_A:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run,
-               get_subsize(bsize, PARTITION_VERT_4), partition,
-               &pc_tree->verticala[0], rate);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + qbs, dry_run,
-               get_subsize(bsize, PARTITION_VERT_4), partition,
-               &pc_tree->verticala[1], rate);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
-               partition, &pc_tree->verticala[2], rate);
 
-      break;
-    case PARTITION_VERT_B:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
-               &pc_tree->verticalb[0], rate);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run,
-               get_subsize(bsize, PARTITION_VERT_4), partition,
-               &pc_tree->verticalb[1], rate);
-      if (mi_col + 3 * qbs < cm->mi_cols)
-        encode_b(cpi, tile, td, tp, mi_row, mi_col + 3 * qbs, dry_run,
-                 get_subsize(bsize, PARTITION_VERT_4), partition,
-                 &pc_tree->verticalb[2], rate);
-      break;
-#else
     case PARTITION_HORZ_A:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
-               &pc_tree->horizontala[0], rate);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+               partition, &pc_tree->horizontala[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
                partition, &pc_tree->horizontala[1], rate);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
                partition, &pc_tree->horizontala[2], rate);
       break;
     case PARTITION_HORZ_B:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
-               &pc_tree->horizontalb[0], rate);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, &pc_tree->horizontalb[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
                partition, &pc_tree->horizontalb[1], rate);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
-               partition, &pc_tree->horizontalb[2], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+               bsize2, partition, &pc_tree->horizontalb[2], rate);
       break;
     case PARTITION_VERT_A:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
-               &pc_tree->verticala[0], rate);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+               partition, &pc_tree->verticala[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
                partition, &pc_tree->verticala[1], rate);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
                partition, &pc_tree->verticala[2], rate);
 
       break;
     case PARTITION_VERT_B:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
-               &pc_tree->verticalb[0], rate);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, &pc_tree->verticalb[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
                partition, &pc_tree->verticalb[1], rate);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
-               partition, &pc_tree->verticalb[2], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+               bsize2, partition, &pc_tree->verticalb[2], rate);
       break;
-#endif
     case PARTITION_HORZ_4:
       for (i = 0; i < 4; ++i) {
         int this_mi_row = mi_row + i * quarter_step;
         if (i > 0 && this_mi_row >= cm->mi_rows) break;
 
-        encode_b(cpi, tile, td, tp, this_mi_row, mi_col, dry_run, subsize,
+        encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
                  partition, &pc_tree->horizontal4[i], rate);
       }
       break;
@@ -2448,20 +1563,14 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
         int this_mi_col = mi_col + i * quarter_step;
         if (i > 0 && this_mi_col >= cm->mi_cols) break;
 
-        encode_b(cpi, tile, td, tp, mi_row, this_mi_col, dry_run, subsize,
+        encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
                  partition, &pc_tree->vertical4[i], rate);
       }
       break;
-#endif  // CONFIG_EXT_PARTITION_TYPES
     default: assert(0 && "Invalid partition type."); break;
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
-#else
-  if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
-    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
-#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 // Check to see if the given partition size is allowed for a specified number
@@ -2483,19 +1592,19 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
   return bsize;
 }
 
-static void set_partial_sb_partition(const AV1_COMMON *const cm, MODE_INFO *mi,
-                                     int bh_in, int bw_in,
+static void set_partial_sb_partition(const AV1_COMMON *const cm,
+                                     MB_MODE_INFO *mi, int bh_in, int bw_in,
                                      int mi_rows_remaining,
                                      int mi_cols_remaining, BLOCK_SIZE bsize,
-                                     MODE_INFO **mib) {
+                                     MB_MODE_INFO **mib) {
   int bh = bh_in;
   int r, c;
-  for (r = 0; r < cm->mib_size; r += bh) {
+  for (r = 0; r < cm->seq_params.mib_size; r += bh) {
     int bw = bw_in;
-    for (c = 0; c < cm->mib_size; c += bw) {
+    for (c = 0; c < cm->seq_params.mib_size; c += bw) {
       const int index = r * cm->mi_stride + c;
       mib[index] = mi + index;
-      mib[index]->mbmi.sb_type = find_partition_size(
+      mib[index]->sb_type = find_partition_size(
           bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
     }
   }
@@ -2507,26 +1616,27 @@ static void set_partial_sb_partition(const AV1_COMMON *const cm, MODE_INFO *mi,
 // may not be allowed in which case this code attempts to choose the largest
 // allowable partition.
 static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
-                                   MODE_INFO **mib, int mi_row, int mi_col,
+                                   MB_MODE_INFO **mib, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &cpi->common;
   const int mi_rows_remaining = tile->mi_row_end - mi_row;
   const int mi_cols_remaining = tile->mi_col_end - mi_col;
   int block_row, block_col;
-  MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col;
+  MB_MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col;
   int bh = mi_size_high[bsize];
   int bw = mi_size_wide[bsize];
 
   assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
 
   // Apply the requested partition size to the SB if it is all "in image"
-  if ((mi_cols_remaining >= cm->mib_size) &&
-      (mi_rows_remaining >= cm->mib_size)) {
-    for (block_row = 0; block_row < cm->mib_size; block_row += bh) {
-      for (block_col = 0; block_col < cm->mib_size; block_col += bw) {
+  if ((mi_cols_remaining >= cm->seq_params.mib_size) &&
+      (mi_rows_remaining >= cm->seq_params.mib_size)) {
+    for (block_row = 0; block_row < cm->seq_params.mib_size; block_row += bh) {
+      for (block_col = 0; block_col < cm->seq_params.mib_size;
+           block_col += bw) {
         int index = block_row * cm->mi_stride + block_col;
         mib[index] = mi_upper_left + index;
-        mib[index]->mbmi.sb_type = bsize;
+        mib[index]->sb_type = bsize;
       }
     }
   } else {
@@ -2537,14 +1647,12 @@ static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
 }
 
 static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
-                             TileDataEnc *tile_data, MODE_INFO **mib,
+                             TileDataEnc *tile_data, MB_MODE_INFO **mib,
                              TOKENEXTRA **tp, int mi_row, int mi_col,
                              BLOCK_SIZE bsize, int *rate, int64_t *dist,
-#if CONFIG_SUPERTX
-                             int *rate_nocoef,
-#endif
                              int do_recon, PC_TREE *pc_tree) {
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2552,37 +1660,23 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
   const int hbs = bs / 2;
   int i;
   const int pl = (bsize >= BLOCK_8X8)
-                     ? partition_plane_context(xd, mi_row, mi_col,
-#if CONFIG_UNPOISON_PARTITION_CTX
-                                               mi_row + hbs < cm->mi_rows,
-                                               mi_col + hbs < cm->mi_cols,
-#endif
-                                               bsize)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
                      : 0;
   const PARTITION_TYPE partition =
       (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
                            : PARTITION_NONE;
-  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
   RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
   RD_STATS last_part_rdc, none_rdc, chosen_rdc;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
-  BLOCK_SIZE bs_type = mib[0]->mbmi.sb_type;
+  BLOCK_SIZE bs_type = mib[0]->sb_type;
   int do_partition_search = 1;
   PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
-  const int unify_bsize = CONFIG_CB4X4;
-#if CONFIG_SUPERTX
-  int last_part_rate_nocoef = INT_MAX;
-  int none_rate_nocoef = INT_MAX;
-  int chosen_rate_nocoef = INT_MAX;
-#endif
-#if CONFIG_PVQ
-  od_rollback_buffer pre_rdo_buf;
-#endif
+
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
-  assert(num_4x4_blocks_wide_lookup[bsize] ==
-         num_4x4_blocks_high_lookup[bsize]);
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
 
   av1_invalid_rd_stats(&last_part_rdc);
   av1_invalid_rd_stats(&none_rdc);
@@ -2590,17 +1684,10 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
 
   pc_tree->partitioning = partition;
 
-#if CONFIG_VAR_TX
-  xd->above_txfm_context =
-      cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2);
-  xd->left_txfm_context = xd->left_txfm_context_buffer +
-                          ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2);
-#endif
-#if !CONFIG_PVQ
-  save_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-  save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif
+  xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
 
   if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
     set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
@@ -2612,12 +1699,12 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
       cpi->sf.adjust_partitioning_from_last_frame) {
     // Check if any of the sub blocks are further split.
     if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
-      sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
+      sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT);
       splits_below = 1;
       for (i = 0; i < 4; i++) {
         int jj = i >> 1, ii = i & 0x01;
-        MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs];
-        if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
+        MB_MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs];
+        if (this_mi && this_mi->sb_type >= sub_subsize) {
           splits_below = 0;
         }
       }
@@ -2629,28 +1716,15 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
         mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
-#if CONFIG_SUPERTX
-                       &none_rate_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                       PARTITION_NONE,
-#endif
-                       bsize, ctx_none, INT64_MAX);
+                       PARTITION_NONE, bsize, ctx_none, INT64_MAX);
 
       if (none_rdc.rate < INT_MAX) {
         none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
         none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
-#if CONFIG_SUPERTX
-        none_rate_nocoef += x->partition_cost[pl][PARTITION_NONE];
-#endif
       }
 
-#if !CONFIG_PVQ
-      restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-      restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif
-      mib[0]->mbmi.sb_type = bs_type;
+      restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+      mib[0]->sb_type = bs_type;
       pc_tree->partitioning = partition;
     }
   }
@@ -2658,127 +1732,65 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
   switch (partition) {
     case PARTITION_NONE:
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-#if CONFIG_SUPERTX
-                       &last_part_rate_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                       PARTITION_NONE,
-#endif
-                       bsize, ctx_none, INT64_MAX);
+                       PARTITION_NONE, bsize, ctx_none, INT64_MAX);
       break;
     case PARTITION_HORZ:
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-#if CONFIG_SUPERTX
-                       &last_part_rate_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                       PARTITION_HORZ,
-#endif
-                       subsize, &pc_tree->horizontal[0], INT64_MAX);
+                       PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
+                       INT64_MAX);
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_row + hbs < cm->mi_rows) {
         RD_STATS tmp_rdc;
-#if CONFIG_SUPERTX
-        int rt_nocoef = 0;
-#endif
         PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
         av1_init_rd_stats(&tmp_rdc);
-        update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
-        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
-                          NULL);
+        update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+                          mi_col, subsize, NULL);
         rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
-#if CONFIG_SUPERTX
-                         &rt_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                         PARTITION_HORZ,
-#endif
-                         subsize, &pc_tree->horizontal[1], INT64_MAX);
+                         PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+                         INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
-#if CONFIG_SUPERTX
-          last_part_rate_nocoef = INT_MAX;
-#endif
           break;
         }
         last_part_rdc.rate += tmp_rdc.rate;
         last_part_rdc.dist += tmp_rdc.dist;
         last_part_rdc.rdcost += tmp_rdc.rdcost;
-#if CONFIG_SUPERTX
-        last_part_rate_nocoef += rt_nocoef;
-#endif
       }
       break;
     case PARTITION_VERT:
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-#if CONFIG_SUPERTX
-                       &last_part_rate_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                       PARTITION_VERT,
-#endif
-                       subsize, &pc_tree->vertical[0], INT64_MAX);
+                       PARTITION_VERT, subsize, &pc_tree->vertical[0],
+                       INT64_MAX);
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_col + hbs < cm->mi_cols) {
         RD_STATS tmp_rdc;
-#if CONFIG_SUPERTX
-        int rt_nocoef = 0;
-#endif
         PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0];
         av1_init_rd_stats(&tmp_rdc);
-        update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
-        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
-                          NULL);
+        update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+                          mi_col, subsize, NULL);
         rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
-#if CONFIG_SUPERTX
-                         &rt_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                         PARTITION_VERT,
-#endif
-                         subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
-                         INT64_MAX);
+                         PARTITION_VERT, subsize,
+                         &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
-#if CONFIG_SUPERTX
-          last_part_rate_nocoef = INT_MAX;
-#endif
           break;
         }
         last_part_rdc.rate += tmp_rdc.rate;
         last_part_rdc.dist += tmp_rdc.dist;
         last_part_rdc.rdcost += tmp_rdc.rdcost;
-#if CONFIG_SUPERTX
-        last_part_rate_nocoef += rt_nocoef;
-#endif
       }
       break;
     case PARTITION_SPLIT:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-#if CONFIG_SUPERTX
-                         &last_part_rate_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                         PARTITION_SPLIT,
-#endif
-                         subsize, pc_tree->leaf_split[0], INT64_MAX);
-        break;
-      }
       last_part_rdc.rate = 0;
       last_part_rdc.dist = 0;
       last_part_rdc.rdcost = 0;
-#if CONFIG_SUPERTX
-      last_part_rate_nocoef = 0;
-#endif
       for (i = 0; i < 4; i++) {
         int x_idx = (i & 1) * hbs;
         int y_idx = (i >> 1) * hbs;
         int jj = i >> 1, ii = i & 0x01;
         RD_STATS tmp_rdc;
-#if CONFIG_SUPERTX
-        int rt_nocoef;
-#endif
         if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
           continue;
 
@@ -2786,33 +1798,21 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
         rd_use_partition(cpi, td, tile_data,
                          mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
-                         &tmp_rdc.dist,
-#if CONFIG_SUPERTX
-                         &rt_nocoef,
-#endif
-                         i != 3, pc_tree->split[i]);
+                         &tmp_rdc.dist, i != 3, pc_tree->split[i]);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
-#if CONFIG_SUPERTX
-          last_part_rate_nocoef = INT_MAX;
-#endif
           break;
         }
         last_part_rdc.rate += tmp_rdc.rate;
         last_part_rdc.dist += tmp_rdc.dist;
-#if CONFIG_SUPERTX
-        last_part_rate_nocoef += rt_nocoef;
-#endif
       }
       break;
-#if CONFIG_EXT_PARTITION_TYPES
     case PARTITION_VERT_A:
     case PARTITION_VERT_B:
     case PARTITION_HORZ_A:
     case PARTITION_HORZ_B:
     case PARTITION_HORZ_4:
     case PARTITION_VERT_4: assert(0 && "Cannot handle extended partiton types");
-#endif  //  CONFIG_EXT_PARTITION_TYPES
     default: assert(0); break;
   }
 
@@ -2820,9 +1820,6 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
     last_part_rdc.rate += x->partition_cost[pl][partition];
     last_part_rdc.rdcost =
         RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
-#if CONFIG_SUPERTX
-    last_part_rate_nocoef += x->partition_cost[pl][partition];
-#endif
   }
 
   if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame &&
@@ -2830,17 +1827,11 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
       partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
       (mi_row + bs < cm->mi_rows || mi_row + hbs == cm->mi_rows) &&
       (mi_col + bs < cm->mi_cols || mi_col + hbs == cm->mi_cols)) {
-    BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
+    BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
     chosen_rdc.rate = 0;
     chosen_rdc.dist = 0;
-#if CONFIG_SUPERTX
-    chosen_rate_nocoef = 0;
-#endif
-#if !CONFIG_PVQ
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
     pc_tree->partitioning = PARTITION_SPLIT;
 
     // Split partition.
@@ -2848,175 +1839,108 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
       int x_idx = (i & 1) * hbs;
       int y_idx = (i >> 1) * hbs;
       RD_STATS tmp_rdc;
-#if CONFIG_SUPERTX
-      int rt_nocoef = 0;
-#endif
-#if CONFIG_PVQ
-      od_rollback_buffer buf;
-#endif
+
       if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
-#if !CONFIG_PVQ
-      save_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-      save_context(x, &x_ctx, mi_row, mi_col, &buf, bsize);
-#endif
+      save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
       pc_tree->split[i]->partitioning = PARTITION_NONE;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
-                       &tmp_rdc,
-#if CONFIG_SUPERTX
-                       &rt_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                       PARTITION_SPLIT,
-#endif
-                       split_subsize, &pc_tree->split[i]->none, INT64_MAX);
+                       &tmp_rdc, PARTITION_SPLIT, split_subsize,
+                       &pc_tree->split[i]->none, INT64_MAX);
 
-#if !CONFIG_PVQ
-      restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-      restore_context(x, &x_ctx, mi_row, mi_col, &buf, bsize);
-#endif
+      restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
       if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
         av1_invalid_rd_stats(&chosen_rdc);
-#if CONFIG_SUPERTX
-        chosen_rate_nocoef = INT_MAX;
-#endif
         break;
       }
 
       chosen_rdc.rate += tmp_rdc.rate;
       chosen_rdc.dist += tmp_rdc.dist;
-#if CONFIG_SUPERTX
-      chosen_rate_nocoef += rt_nocoef;
-#endif
 
       if (i != 3)
-        encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx,
+        encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
                   OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
 
       chosen_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
-#if CONFIG_SUPERTX
-      chosen_rate_nocoef += x->partition_cost[pl][PARTITION_SPLIT];
-#endif
     }
     if (chosen_rdc.rate < INT_MAX) {
       chosen_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
       chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
-#if CONFIG_SUPERTX
-      chosen_rate_nocoef += x->partition_cost[pl][PARTITION_NONE];
-#endif
     }
   }
 
   // If last_part is better set the partitioning to that.
   if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
-    mib[0]->mbmi.sb_type = bsize;
+    mib[0]->sb_type = bsize;
     if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
     chosen_rdc = last_part_rdc;
-#if CONFIG_SUPERTX
-    chosen_rate_nocoef = last_part_rate_nocoef;
-#endif
   }
   // If none was better set the partitioning to that.
   if (none_rdc.rdcost < chosen_rdc.rdcost) {
     if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
     chosen_rdc = none_rdc;
-#if CONFIG_SUPERTX
-    chosen_rate_nocoef = none_rate_nocoef;
-#endif
   }
 
-#if !CONFIG_PVQ
-  restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-  restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif
+  restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
 
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
-  if (bsize == cm->sb_size)
+  if (bsize == cm->seq_params.sb_size)
     assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
 
   if (do_recon) {
-    if (bsize == cm->sb_size) {
+    if (bsize == cm->seq_params.sb_size) {
       // NOTE: To get estimate for rate due to the tokens, use:
       // int rate_coeffs = 0;
-      // encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
       //           bsize, pc_tree, &rate_coeffs);
-      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+      x->cb_offset = 0;
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
                 pc_tree, NULL);
     } else {
-      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
                 pc_tree, NULL);
     }
   }
 
   *rate = chosen_rdc.rate;
   *dist = chosen_rdc.dist;
-#if CONFIG_SUPERTX
-  *rate_nocoef = chosen_rate_nocoef;
-#endif
 }
 
 /* clang-format off */
 static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  BLOCK_2X2,   BLOCK_2X2,   BLOCK_2X2,    //    2x2,    2x4,     4x2
-#endif
                             BLOCK_4X4,    //                     4x4
   BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,    //    4x8,    8x4,     8x8
   BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,    //   8x16,   16x8,   16x16
   BLOCK_8X8,   BLOCK_8X8,   BLOCK_16X16,  //  16x32,  32x16,   32x32
   BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,  //  32x64,  64x32,   64x64
-#if CONFIG_EXT_PARTITION
   BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,  // 64x128, 128x64, 128x128
-#endif  // CONFIG_EXT_PARTITION
   BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,    //   4x16,   16x4,    8x32
   BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,  //   32x8,  16x64,   64x16
-#if CONFIG_EXT_PARTITION
-  BLOCK_16X16, BLOCK_16X16                // 32x128, 128x32
-#endif  // CONFIG_EXT_PARTITION
 };
 
 static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  BLOCK_4X4,     BLOCK_4X4,       BLOCK_4X4,    //    2x2,    2x4,     4x2
-#endif
                                   BLOCK_8X8,    //                     4x4
   BLOCK_16X16,   BLOCK_16X16,   BLOCK_16X16,    //    4x8,    8x4,     8x8
   BLOCK_32X32,   BLOCK_32X32,   BLOCK_32X32,    //   8x16,   16x8,   16x16
   BLOCK_64X64,   BLOCK_64X64,   BLOCK_64X64,    //  16x32,  32x16,   32x32
   BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST,  //  32x64,  64x32,   64x64
-#if CONFIG_EXT_PARTITION
   BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST,  // 64x128, 128x64, 128x128
-#endif  // CONFIG_EXT_PARTITION
   BLOCK_16X16,   BLOCK_16X16,   BLOCK_32X32,    //   4x16,   16x4,    8x32
   BLOCK_32X32,   BLOCK_LARGEST, BLOCK_LARGEST,  //   32x8,  16x64,   64x16
-#if CONFIG_EXT_PARTITION
-  BLOCK_LARGEST, BLOCK_LARGEST                  // 32x128, 128x32
-#endif  // CONFIG_EXT_PARTITION
 };
 
 // Next square block size less or equal than current block size.
 static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  BLOCK_2X2,   BLOCK_2X2,     BLOCK_2X2,    //    2x2,    2x4,     4x2
-#endif
                               BLOCK_4X4,    //                     4x4
   BLOCK_4X4,   BLOCK_4X4,     BLOCK_8X8,    //    4x8,    8x4,     8x8
   BLOCK_8X8,   BLOCK_8X8,     BLOCK_16X16,  //   8x16,   16x8,   16x16
   BLOCK_16X16, BLOCK_16X16,   BLOCK_32X32,  //  16x32,  32x16,   32x32
   BLOCK_32X32, BLOCK_32X32,   BLOCK_64X64,  //  32x64,  64x32,   64x64
-#if CONFIG_EXT_PARTITION
   BLOCK_64X64, BLOCK_64X64, BLOCK_128X128,  // 64x128, 128x64, 128x128
-#endif  // CONFIG_EXT_PARTITION
   BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,      //   4x16,   16x4,    8x32
   BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,    //   32x8,  16x64,   64x16
-#if CONFIG_EXT_PARTITION
-  BLOCK_32X32, BLOCK_32X32                  // 32x128, 128x32
-#endif  // CONFIG_EXT_PARTITION
 };
 /* clang-format on */
 
@@ -3029,17 +1953,17 @@ static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = {
 // function so repeat calls can accumulate a min and max of more than one
 // superblock.
 static void get_sb_partition_size_range(const AV1_COMMON *const cm,
-                                        MACROBLOCKD *xd, MODE_INFO **mib,
+                                        MACROBLOCKD *xd, MB_MODE_INFO **mib,
                                         BLOCK_SIZE *min_block_size,
                                         BLOCK_SIZE *max_block_size) {
   int i, j;
   int index = 0;
 
   // Check the sb_type for each block that belongs to this region.
-  for (i = 0; i < cm->mib_size; ++i) {
-    for (j = 0; j < cm->mib_size; ++j) {
-      MODE_INFO *mi = mib[index + j];
-      BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : BLOCK_4X4;
+  for (i = 0; i < cm->seq_params.mib_size; ++i) {
+    for (j = 0; j < cm->seq_params.mib_size; ++j) {
+      MB_MODE_INFO *mi = mib[index + j];
+      BLOCK_SIZE sb_type = mi ? mi->sb_type : BLOCK_4X4;
       *min_block_size = AOMMIN(*min_block_size, sb_type);
       *max_block_size = AOMMAX(*max_block_size, sb_type);
     }
@@ -3047,6 +1971,68 @@ static void get_sb_partition_size_range(const AV1_COMMON *const cm,
   }
 }
 
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+static int active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_rows;
+  int is_active_h_edge = 0;
+
+  // For two pass account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    const TWO_PASS *const twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+
+    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+    bottom_edge = AOMMAX(top_edge, bottom_edge);
+  }
+
+  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+    is_active_h_edge = 1;
+  }
+  return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_cols;
+  int is_active_v_edge = 0;
+
+  // For two pass account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    const TWO_PASS *const twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+
+    right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+    right_edge = AOMMAX(left_edge, right_edge);
+  }
+
+  if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+      ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+    is_active_v_edge = 1;
+  }
+  return is_active_v_edge;
+}
+
+// Checks to see if a super block is at the edge of the active image.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+static int active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
+  return active_h_edge(cpi, mi_row, cpi->common.seq_params.mib_size) ||
+         active_v_edge(cpi, mi_col, cpi->common.seq_params.mib_size);
+}
+
 // Look at neighboring blocks and set a min and max partition size based on
 // what they chose.
 static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
@@ -3054,7 +2040,7 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
                                     int mi_col, BLOCK_SIZE *min_block_size,
                                     BLOCK_SIZE *max_block_size) {
   AV1_COMMON *const cm = &cpi->common;
-  MODE_INFO **mi = xd->mi;
+  MB_MODE_INFO **mi = xd->mi;
   const int left_in_image = xd->left_available && mi[-1];
   const int above_in_image = xd->up_available && mi[-xd->mi_stride];
   const int mi_rows_remaining = tile->mi_row_end - mi_row;
@@ -3073,18 +2059,19 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
     // passed in values for min and max as a starting point.
     // Find the min and max partition used in previous frame at this location
     if (cm->frame_type != KEY_FRAME) {
-      MODE_INFO **prev_mi =
+      MB_MODE_INFO **prev_mi =
           &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
       get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size);
     }
     // Find the min and max partition sizes used in the left superblock
     if (left_in_image) {
-      MODE_INFO **left_sb_mi = &mi[-cm->mib_size];
+      MB_MODE_INFO **left_sb_mi = &mi[-cm->seq_params.mib_size];
       get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size);
     }
     // Find the min and max partition sizes used in the above suprblock.
     if (above_in_image) {
-      MODE_INFO **above_sb_mi = &mi[-xd->mi_stride * cm->mib_size];
+      MB_MODE_INFO **above_sb_mi =
+          &mi[-xd->mi_stride * cm->seq_params.mib_size];
       get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size);
     }
 
@@ -3103,7 +2090,7 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
   // Test for blocks at the edge of the active image.
   // This may be the actual edge of the image or where there are formatting
   // bars.
-  if (av1_active_edge_sb(cpi, mi_row, mi_col)) {
+  if (active_edge_sb(cpi, mi_row, mi_col)) {
     min_size = BLOCK_4X4;
   } else {
     min_size = AOMMIN(cpi->sf.rd_auto_partition_min_limit, min_size);
@@ -3116,8 +2103,8 @@ static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
     min_size = AOMMIN(min_size, next_square_size[max_size]);
   }
 
-  *min_block_size = AOMMIN(min_size, cm->sb_size);
-  *max_block_size = AOMMIN(max_size, cm->sb_size);
+  *min_block_size = AOMMIN(min_size, cm->seq_params.sb_size);
+  *max_block_size = AOMMIN(max_size, cm->seq_params.sb_size);
 }
 
 // TODO(jingning) refactor functions setting partition search range
@@ -3131,15 +2118,15 @@ static void set_partition_range(const AV1_COMMON *const cm,
   int idx, idy;
 
   const int idx_str = cm->mi_stride * mi_row + mi_col;
-  MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str];
-  BLOCK_SIZE min_size = cm->sb_size;  // default values
+  MB_MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str];
+  BLOCK_SIZE min_size = cm->seq_params.sb_size;  // default values
   BLOCK_SIZE max_size = BLOCK_4X4;
 
   if (prev_mi) {
     for (idy = 0; idy < mi_height; ++idy) {
       for (idx = 0; idx < mi_width; ++idx) {
-        const MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx];
-        const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize;
+        const MB_MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx];
+        const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
         min_size = AOMMIN(min_size, bs);
         max_size = AOMMAX(max_size, bs);
       }
@@ -3148,8 +2135,8 @@ static void set_partition_range(const AV1_COMMON *const cm,
 
   if (xd->left_available) {
     for (idy = 0; idy < mi_height; ++idy) {
-      const MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1];
-      const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize;
+      const MB_MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1];
+      const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
       min_size = AOMMIN(min_size, bs);
       max_size = AOMMAX(max_size, bs);
     }
@@ -3157,8 +2144,8 @@ static void set_partition_range(const AV1_COMMON *const cm,
 
   if (xd->up_available) {
     for (idx = 0; idx < mi_width; ++idx) {
-      const MODE_INFO *const mi = xd->mi[idx - cm->mi_stride];
-      const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize;
+      const MB_MODE_INFO *const mi = xd->mi[idx - cm->mi_stride];
+      const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
       min_size = AOMMIN(min_size, bs);
       max_size = AOMMAX(max_size, bs);
     }
@@ -3169,8 +2156,8 @@ static void set_partition_range(const AV1_COMMON *const cm,
     max_size = max_partition_size[max_size];
   }
 
-  *min_bs = AOMMIN(min_size, cm->sb_size);
-  *max_bs = AOMMIN(max_size, cm->sb_size);
+  *min_bs = AOMMIN(min_size, cm->seq_params.sb_size);
+  *max_bs = AOMMIN(max_size, cm->seq_params.sb_size);
 }
 
 static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
@@ -3184,24 +2171,18 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
 #if CONFIG_FP_MB_STATS
 const int qindex_skip_threshold_lookup[BLOCK_SIZES] = {
   0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120,
-#if CONFIG_EXT_PARTITION
   // TODO(debargha): What are the correct numbers here?
   130, 130, 150
-#endif  // CONFIG_EXT_PARTITION
 };
 const int qindex_split_threshold_lookup[BLOCK_SIZES] = {
   0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120,
-#if CONFIG_EXT_PARTITION
   // TODO(debargha): What are the correct numbers here?
   160, 160, 240
-#endif  // CONFIG_EXT_PARTITION
 };
 const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = {
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6,
-#if CONFIG_EXT_PARTITION
   // TODO(debargha): What are the correct numbers here?
   8, 8, 10
-#endif  // CONFIG_EXT_PARTITION
 };
 
 typedef enum {
@@ -3237,7 +2218,6 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
 }
 #endif
 
-#if CONFIG_EXT_PARTITION_TYPES
 // Try searching for an encoding for the given subblock. Returns zero if the
 // rdcost is already too high (to tell the caller not to bother searching for
 // encodings of further subblocks)
@@ -3246,20 +2226,11 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
                            int is_first, int is_last, int mi_row, int mi_col,
                            BLOCK_SIZE subsize, RD_STATS *best_rdc,
                            RD_STATS *sum_rdc, RD_STATS *this_rdc,
-#if CONFIG_SUPERTX
-                           int64_t best_rd, int *sum_rate_nocoef,
-                           int *this_rate_nocoef, int *abort_flag,
-#endif
                            PARTITION_TYPE partition,
                            PICK_MODE_CONTEXT *prev_ctx,
                            PICK_MODE_CONTEXT *this_ctx) {
-#if CONFIG_SUPERTX
-#define RTS_X_RATE_NOCOEF_ARG ((is_first) ? sum_rate_nocoef : this_rate_nocoef),
-#define RTS_MAX_RDCOST INT64_MAX
-#else
 #define RTS_X_RATE_NOCOEF_ARG
 #define RTS_MAX_RDCOST best_rdc->rdcost
-#endif
 
   MACROBLOCK *const x = &td->mb;
 
@@ -3276,32 +2247,22 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
                    RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
                    rdcost_remaining);
 
-#if CONFIG_SUPERTX
-  if (is_first) *abort_flag = sum_rdc->rdcost >= best_rd;
-#endif
-
   if (!is_first) {
     if (this_rdc->rate == INT_MAX) {
       sum_rdc->rdcost = INT64_MAX;
-#if CONFIG_SUPERTX
-      *sum_rate_nocoef = INT_MAX;
-#endif
     } else {
       sum_rdc->rate += this_rdc->rate;
       sum_rdc->dist += this_rdc->dist;
       sum_rdc->rdcost += this_rdc->rdcost;
-#if CONFIG_SUPERTX
-      *sum_rate_nocoef += *this_rate_nocoef;
-#endif
     }
   }
 
   if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0;
 
   if (!is_last) {
-    update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1);
-    encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
-                      NULL);
+    update_state(cpi, tile_data, td, this_ctx, mi_row, mi_col, subsize, 1);
+    encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
+                      subsize, NULL);
   }
 
   return 1;
@@ -3310,41 +2271,19 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
 #undef RTS_MAX_RDCOST
 }
 
-static void rd_test_partition3(
-    const AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
-    TOKENEXTRA **tp, PC_TREE *pc_tree, RD_STATS *best_rdc,
-    PICK_MODE_CONTEXT ctxs[3], PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
-    BLOCK_SIZE bsize, PARTITION_TYPE partition,
-#if CONFIG_SUPERTX
-    int64_t best_rd, int *best_rate_nocoef, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
-#endif
-    int mi_row0, int mi_col0, BLOCK_SIZE subsize0, int mi_row1, int mi_col1,
-    BLOCK_SIZE subsize1, int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
+static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
+                               TileDataEnc *tile_data, TOKENEXTRA **tp,
+                               PC_TREE *pc_tree, RD_STATS *best_rdc,
+                               PICK_MODE_CONTEXT ctxs[3],
+                               PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+                               BLOCK_SIZE bsize, PARTITION_TYPE partition,
+                               int mi_row0, int mi_col0, BLOCK_SIZE subsize0,
+                               int mi_row1, int mi_col1, BLOCK_SIZE subsize1,
+                               int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   RD_STATS sum_rdc, this_rdc;
-#if CONFIG_UNPOISON_PARTITION_CTX
-  const AV1_COMMON *const cm = &cpi->common;
-  const int hbs = mi_size_wide[bsize] / 2;
-  const int has_rows = mi_row + hbs < cm->mi_rows;
-  const int has_cols = mi_col + hbs < cm->mi_cols;
-#endif  // CONFIG_UNPOISON_PARTITION_CTX
-#if CONFIG_SUPERTX || CONFIG_EXT_PARTITION_TYPES_AB
-  const AV1_COMMON *const cm = &cpi->common;
-#endif
-#if CONFIG_SUPERTX
-  TileInfo *const tile_info = &tile_data->tile_info;
-  int sum_rate_nocoef, this_rate_nocoef;
-  int abort_flag;
-  const int supertx_allowed = !frame_is_intra_only(cm) &&
-                              bsize <= MAX_SUPERTX_BLOCK_SIZE &&
-                              !xd->lossless[0];
-
-#define RTP_STX_TRY_ARGS \
-  best_rd, &sum_rate_nocoef, &this_rate_nocoef, &abort_flag,
-#else
 #define RTP_STX_TRY_ARGS
-#endif
 
   if (!rd_try_subblock(cpi, td, tile_data, tp, 1, 0, mi_row0, mi_col0, subsize0,
                        best_rdc, &sum_rdc, &this_rdc,
@@ -3356,131 +2295,586 @@ static void rd_test_partition3(
                        RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1]))
     return;
 
-// With the new layout of mixed partitions for PARTITION_HORZ_B and
-// PARTITION_VERT_B, the last subblock might start past halfway through the
-// main block, so we might signal it even though the subblock lies strictly
-// outside the image. In that case, we won't spend any bits coding it and the
-// difference (obviously) doesn't contribute to the error.
-#if CONFIG_EXT_PARTITION_TYPES_AB
-  const int try_block2 = mi_row2 < cm->mi_rows && mi_col2 < cm->mi_cols;
-#else
+  // With the new layout of mixed partitions for PARTITION_HORZ_B and
+  // PARTITION_VERT_B, the last subblock might start past halfway through the
+  // main block, so we might signal it even though the subblock lies strictly
+  // outside the image. In that case, we won't spend any bits coding it and the
+  // difference (obviously) doesn't contribute to the error.
   const int try_block2 = 1;
-#endif
   if (try_block2 &&
       !rd_try_subblock(cpi, td, tile_data, tp, 0, 1, mi_row2, mi_col2, subsize2,
                        best_rdc, &sum_rdc, &this_rdc,
                        RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2]))
     return;
 
-#if CONFIG_SUPERTX
-  if (supertx_allowed && !abort_flag && sum_rdc.rdcost < INT64_MAX) {
-    TX_SIZE supertx_size = max_txsize_lookup[bsize];
-    const PARTITION_TYPE best_partition = pc_tree->partitioning;
-    pc_tree->partitioning = partition;
-    sum_rdc.rate += av1_cost_bit(
-        cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
-                            [supertx_size],
-        0);
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-
-    if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
-      TX_TYPE best_tx = DCT_DCT;
-      RD_STATS tmp_rdc = { sum_rate_nocoef, 0, 0 };
-
-      restore_context(x, x_ctx, mi_row, mi_col, bsize);
-
-      rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate,
-                    &tmp_rdc.dist, &best_tx, pc_tree);
-
-      tmp_rdc.rate += av1_cost_bit(
-          cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
-                              [supertx_size],
-          1);
-      tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
-      if (tmp_rdc.rdcost < sum_rdc.rdcost) {
-        sum_rdc = tmp_rdc;
-        update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
-                                supertx_size, pc_tree);
-      }
-    }
-
-    pc_tree->partitioning = best_partition;
-  }
-#endif
-
   if (sum_rdc.rdcost >= best_rdc->rdcost) return;
 
-  int pl = partition_plane_context(xd, mi_row, mi_col,
-#if CONFIG_UNPOISON_PARTITION_CTX
-                                   has_rows, has_cols,
-#endif
-                                   bsize);
+  int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
   sum_rdc.rate += x->partition_cost[pl][partition];
   sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-#if CONFIG_SUPERTX
-  sum_rate_nocoef += x->partition_cost[pl][partition];
-#endif
 
   if (sum_rdc.rdcost >= best_rdc->rdcost) return;
 
-#if CONFIG_SUPERTX
-  *best_rate_nocoef = sum_rate_nocoef;
-  assert(*best_rate_nocoef >= 0);
-#endif
   *best_rdc = sum_rdc;
   pc_tree->partitioning = partition;
 
 #undef RTP_STX_TRY_ARGS
 }
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+#if CONFIG_DIST_8X8
 static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                            uint8_t *y_src_8x8) {
+                            uint8_t *src_plane_8x8[MAX_MB_PLANE],
+                            uint8_t *dst_plane_8x8[MAX_MB_PLANE]) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   int64_t dist_8x8, dist_8x8_uv, total_dist;
   const int src_stride = x->plane[0].src.stride;
-  uint8_t *decoded_8x8;
   int plane;
 
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
-  else
-#endif
-    decoded_8x8 = (uint8_t *)x->decoded_8x8;
-
-  dist_8x8 = av1_dist_8x8(cpi, x, y_src_8x8, src_stride, decoded_8x8, 8,
-                          BLOCK_8X8, 8, 8, 8, 8, x->qindex)
-             << 4;
+  const int dst_stride = xd->plane[0].dst.stride;
+  dist_8x8 =
+      av1_dist_8x8(cpi, x, src_plane_8x8[0], src_stride, dst_plane_8x8[0],
+                   dst_stride, BLOCK_8X8, 8, 8, 8, 8, x->qindex)
+      << 4;
 
   // Compute chroma distortion for a luma 8x8 block
   dist_8x8_uv = 0;
 
-  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-    const int src_stride_uv = x->plane[plane].src.stride;
-    const int dst_stride_uv = xd->plane[plane].dst.stride;
-    // uv buff pointers now (i.e. the last sub8x8 block) is the same
-    // to those at the first sub8x8 block because
-    // uv buff pointer is set only once at first sub8x8 block in a 8x8.
-    uint8_t *src_uv = x->plane[plane].src.buf;
-    uint8_t *dst_uv = xd->plane[plane].dst.buf;
-    unsigned sse;
-#if CONFIG_CHROMA_SUB8X8
-    const BLOCK_SIZE plane_bsize =
-        AOMMAX(BLOCK_4X4, get_plane_block_size(BLOCK_8X8, &xd->plane[plane]));
-#else
-    const BLOCK_SIZE plane_bsize =
-        get_plane_block_size(BLOCK_8X8, &xd->plane[plane]);
-#endif
-    cpi->fn_ptr[plane_bsize].vf(src_uv, src_stride_uv, dst_uv, dst_stride_uv,
-                                &sse);
-    dist_8x8_uv += (int64_t)sse << 4;
+  if (num_planes > 1) {
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+      unsigned sse;
+      const int src_stride_uv = x->plane[plane].src.stride;
+      const int dst_stride_uv = xd->plane[plane].dst.stride;
+      const int ssx = xd->plane[plane].subsampling_x;
+      const int ssy = xd->plane[plane].subsampling_y;
+      const BLOCK_SIZE plane_bsize = get_plane_block_size(BLOCK_8X8, ssx, ssy);
+
+      cpi->fn_ptr[plane_bsize].vf(src_plane_8x8[plane], src_stride_uv,
+                                  dst_plane_8x8[plane], dst_stride_uv, &sse);
+      dist_8x8_uv += (int64_t)sse << 4;
+    }
   }
 
   return total_dist = dist_8x8 + dist_8x8_uv;
 }
-#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
+#endif  // CONFIG_DIST_8X8
+
+static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+  pc_tree->partitioning = PARTITION_NONE;
+  pc_tree->cb_search_range = SEARCH_FULL_PLANE;
+
+  if (bsize >= BLOCK_8X8) {
+    BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    for (int idx = 0; idx < 4; ++idx)
+      reset_partition(pc_tree->split[idx], subsize);
+  }
+}
+
+static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
+                                  TileDataEnc *tile_data, TOKENEXTRA **tp,
+                                  int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                  RD_STATS *rd_cost, int64_t best_rd,
+                                  PC_TREE *pc_tree, int64_t *none_rd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_step = mi_size_wide[bsize] / 2;
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  const TOKENEXTRA *const tp_orig = *tp;
+  PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+  int tmp_partition_cost[PARTITION_TYPES];
+  BLOCK_SIZE subsize;
+  RD_STATS this_rdc, sum_rdc, best_rdc, pn_rdc;
+  const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+  int do_square_split = bsize_at_least_8x8;
+  const int pl = bsize_at_least_8x8
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+  const int *partition_cost =
+      pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
+  const int num_planes = av1_num_planes(cm);
+
+  int64_t split_rd[4] = { 0, 0, 0, 0 };
+
+  // Override skipping rectangular partition operations for edge blocks
+  const int has_rows = (mi_row + mi_step < cm->mi_rows);
+  const int has_cols = (mi_col + mi_step < cm->mi_cols);
+
+  if (none_rd) *none_rd = 0;
+
+  int partition_none_allowed = has_rows && has_cols;
+
+  (void)*tp_orig;
+  (void)split_rd;
+
+  av1_zero(pc_tree->pc_tree_stats);
+  pc_tree->pc_tree_stats.valid = 1;
+
+  // Override partition costs at the edges of the frame in the same
+  // way as in read_partition (see decodeframe.c)
+  if (!(has_rows && has_cols)) {
+    assert(bsize_at_least_8x8 && pl >= 0);
+    const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl];
+    for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = INT_MAX;
+    if (has_cols) {
+      // At the bottom, the two possibilities are HORZ and SPLIT
+      aom_cdf_prob bot_cdf[2];
+      partition_gather_vert_alike(bot_cdf, partition_cdf, bsize);
+      static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
+      av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map);
+    } else if (has_rows) {
+      // At the right, the two possibilities are VERT and SPLIT
+      aom_cdf_prob rhs_cdf[2];
+      partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize);
+      static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
+      av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map);
+    } else {
+      // At the bottom right, we always split
+      tmp_partition_cost[PARTITION_SPLIT] = 0;
+    }
+
+    partition_cost = tmp_partition_cost;
+  }
+
+#ifndef NDEBUG
+  // Nothing should rely on the default value of this array (which is just
+  // leftover from encoding the previous block. Setting it to magic number
+  // when debugging.
+  memset(x->blk_skip, 234, sizeof(x->blk_skip));
+#endif  // NDEBUG
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  av1_init_rd_stats(&this_rdc);
+  av1_init_rd_stats(&sum_rdc);
+  av1_invalid_rd_stats(&best_rdc);
+  best_rdc.rdcost = best_rd;
+
+  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+    x->mb_energy = av1_block_energy(cpi, x, bsize);
+
+  xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  // PARTITION_NONE
+  if (partition_none_allowed) {
+    if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
+
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+                     PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost);
+
+    pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost;
+    pc_tree->pc_tree_stats.skip = ctx_none->skip;
+
+    if (none_rd) *none_rd = this_rdc.rdcost;
+    if (this_rdc.rate != INT_MAX) {
+      if (bsize_at_least_8x8) {
+        const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+                                ? partition_cost[PARTITION_NONE]
+                                : 0;
+        this_rdc.rate += pt_cost;
+        this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+      }
+
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        // Adjust dist breakout threshold according to the partition size.
+        const int64_t dist_breakout_thr =
+            cpi->sf.partition_search_breakout_dist_thr >>
+            ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+             (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
+        const int rate_breakout_thr =
+            cpi->sf.partition_search_breakout_rate_thr *
+            num_pels_log2_lookup[bsize];
+
+        best_rdc = this_rdc;
+        if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
+
+        pc_tree->cb_search_range = SEARCH_FULL_PLANE;
+
+        // If all y, u, v transform blocks in this partition are skippable, and
+        // the dist & rate are within the thresholds, the partition search is
+        // terminated for current branch of the partition search tree.
+        // The dist & rate thresholds are set to 0 at speed 0 to disable the
+        // early termination at that speed.
+        if (!x->e_mbd.lossless[xd->mi[0]->segment_id] &&
+            (ctx_none->skippable && best_rdc.dist < dist_breakout_thr &&
+             best_rdc.rate < rate_breakout_thr)) {
+          do_square_split = 0;
+        }
+      }
+    }
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  }
+
+  // store estimated motion vector
+  if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
+
+  int64_t temp_best_rdcost = best_rdc.rdcost;
+  pn_rdc = best_rdc;
+
+#if CONFIG_DIST_8X8
+  uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE];
+
+  if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
+    for (int i = 0; i < MAX_MB_PLANE; i++) {
+      src_plane_8x8[i] = x->plane[i].src.buf;
+      dst_plane_8x8[i] = xd->plane[i].dst.buf;
+    }
+  }
+#endif  // CONFIG_DIST_8X8
+
+  // PARTITION_SPLIT
+  if (do_square_split) {
+    int reached_last_index = 0;
+    subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    int idx;
+
+    for (idx = 0; idx < 4 && sum_rdc.rdcost < temp_best_rdcost; ++idx) {
+      const int x_idx = (idx & 1) * mi_step;
+      const int y_idx = (idx >> 1) * mi_step;
+
+      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+        continue;
+
+      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+      pc_tree->split[idx]->index = idx;
+      int64_t *p_split_rd = &split_rd[idx];
+      rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+                            mi_col + x_idx, subsize, &this_rdc,
+                            temp_best_rdcost - sum_rdc.rdcost,
+                            pc_tree->split[idx], p_split_rd);
+
+      pc_tree->pc_tree_stats.sub_block_rdcost[idx] = this_rdc.rdcost;
+      pc_tree->pc_tree_stats.sub_block_skip[idx] =
+          pc_tree->split[idx]->none.skip;
+
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+        break;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+      }
+    }
+    reached_last_index = (idx == 4);
+
+#if CONFIG_DIST_8X8
+    if (x->using_dist_8x8 && reached_last_index &&
+        sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
+      sum_rdc.dist = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+    }
+#endif  // CONFIG_DIST_8X8
+
+    if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
+      sum_rdc.rate += partition_cost[PARTITION_SPLIT];
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
+        pc_tree->partitioning = PARTITION_SPLIT;
+      }
+    }
+
+    int has_split = 0;
+    if (pc_tree->partitioning == PARTITION_SPLIT) {
+      for (int cb_idx = 0; cb_idx <= AOMMIN(idx, 3); ++cb_idx) {
+        if (pc_tree->split[cb_idx]->partitioning == PARTITION_SPLIT)
+          ++has_split;
+      }
+
+      if (has_split >= 3 || sum_rdc.rdcost < (pn_rdc.rdcost >> 1)) {
+        pc_tree->cb_search_range = SPLIT_PLANE;
+      }
+    }
+
+    if (pc_tree->partitioning == PARTITION_NONE) {
+      pc_tree->cb_search_range = SEARCH_SAME_PLANE;
+      if (pn_rdc.dist <= sum_rdc.dist)
+        pc_tree->cb_search_range = NONE_PARTITION_PLANE;
+    }
+
+    if (pn_rdc.rate == INT_MAX) pc_tree->cb_search_range = NONE_PARTITION_PLANE;
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  }  // if (do_split)
+
+  pc_tree->pc_tree_stats.split = pc_tree->partitioning == PARTITION_SPLIT;
+  if (do_square_split) {
+    for (int i = 0; i < 4; ++i) {
+      pc_tree->pc_tree_stats.sub_block_split[i] =
+          pc_tree->split[i]->partitioning == PARTITION_SPLIT;
+    }
+  }
+
+  // TODO(jbb): This code added so that we avoid static analysis
+  // warning related to the fact that best_rd isn't used after this
+  // point.  This code should be refactored so that the duplicate
+  // checks occur in some sub function and thus are used...
+  (void)best_rd;
+  *rd_cost = best_rdc;
+
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+      pc_tree->index != 3) {
+    if (bsize == cm->seq_params.sb_size) {
+      restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+    } else {
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+
+#if CONFIG_DIST_8X8
+  if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
+      best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
+    encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+              pc_tree, NULL);
+  }
+#endif  // CONFIG_DIST_8X8
+
+  if (bsize == cm->seq_params.sb_size) {
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
+  } else {
+    assert(tp_orig == *tp);
+  }
+}
+
+#define FEATURE_SIZE 19
+static const float two_pass_split_partition_weights_128[FEATURE_SIZE + 1] = {
+  2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f,
+  0.125296f, -1.134961f, 0.862757f,  -0.418799f, -0.637666f,
+  0.016232f, 0.345013f,  0.018823f,  -0.393394f, -1.130700f,
+  0.695357f, 0.112569f,  -0.341975f, -0.513882f, 5.7488966f,
+};
+
+static const float two_pass_split_partition_weights_64[FEATURE_SIZE + 1] = {
+  2.990993f,  0.423273f,  -0.926544f, 0.454646f,  -0.292698f,
+  -1.311632f, -0.284432f, 0.717141f,  -0.419257f, -0.574760f,
+  -0.674444f, 0.669047f,  -0.374255f, 0.380624f,  -0.804036f,
+  0.264021f,  0.004163f,  1.896802f,  0.924287f,  0.13490619f,
+};
+
+static const float two_pass_split_partition_weights_32[FEATURE_SIZE + 1] = {
+  2.795181f,  -0.136943f, -0.924842f, 0.405330f,  -0.463505f,
+  -0.584076f, -0.831472f, 0.382985f,  -0.597544f, -0.138915f,
+  -1.354350f, 0.466035f,  -0.553961f, 0.213202f,  -1.166429f,
+  0.010776f,  -0.096236f, 2.335084f,  1.699857f,  -0.58178353f,
+};
+
+static const float two_pass_split_partition_weights_16[FEATURE_SIZE + 1] = {
+  1.987888f,  -0.431100f, -1.687703f, 0.262602f,  -0.425298f,
+  -0.463870f, -1.493457f, 0.470917f,  -0.528457f, -0.087700f,
+  -1.815092f, 0.152883f,  -0.337908f, 0.093679f,  -1.548267f,
+  -0.042387f, -0.000861f, 2.556746f,  1.619192f,  0.03643292f,
+};
+
+static const float two_pass_split_partition_weights_8[FEATURE_SIZE + 1] = {
+  2.188344f,  -0.817528f, -2.119219f, 0.000000f,  -0.348167f,
+  -0.658074f, -1.960362f, 0.000000f,  -0.403080f, 0.282699f,
+  -2.061088f, 0.000000f,  -0.431919f, -0.127960f, -1.099550f,
+  0.000000f,  0.121622f,  2.017455f,  2.058228f,  -0.15475988f,
+};
+
+static const float two_pass_none_partition_weights_128[FEATURE_SIZE + 1] = {
+  -1.006689f, 0.777908f,  4.461072f,  -0.395782f, -0.014610f,
+  -0.853863f, 0.729997f,  -0.420477f, 0.282429f,  -1.194595f,
+  3.181220f,  -0.511416f, 0.117084f,  -1.149348f, 1.507990f,
+  -0.477212f, 0.202963f,  -1.469581f, 0.624461f,  -0.89081228f,
+};
+
+static const float two_pass_none_partition_weights_64[FEATURE_SIZE + 1] = {
+  -1.241117f, 0.844878f,  5.638803f,  -0.489780f, -0.108796f,
+  -4.576821f, 1.540624f,  -0.477519f, 0.227791f,  -1.443968f,
+  1.586911f,  -0.505125f, 0.140764f,  -0.464194f, 1.466658f,
+  -0.641166f, 0.195412f,  1.427905f,  2.080007f,  -1.98272777f,
+};
+
+static const float two_pass_none_partition_weights_32[FEATURE_SIZE + 1] = {
+  -2.130825f, 0.476023f,  5.907343f,  -0.516002f, -0.097471f,
+  -2.662754f, 0.614858f,  -0.576728f, 0.085261f,  -0.031901f,
+  0.727842f,  -0.600034f, 0.079326f,  0.324328f,  0.504502f,
+  -0.547105f, -0.037670f, 0.304995f,  0.369018f,  -2.66299987f,
+};
+
+static const float two_pass_none_partition_weights_16[FEATURE_SIZE + 1] = {
+  -1.626410f, 0.872047f,  5.414965f,  -0.554781f, -0.084514f,
+  -3.020550f, 0.467632f,  -0.382280f, 0.199568f,  0.426220f,
+  0.829426f,  -0.467100f, 0.153098f,  0.662994f,  0.327545f,
+  -0.560106f, -0.141610f, 0.403372f,  0.523991f,  -3.02891231f,
+};
+
+static const float two_pass_none_partition_weights_8[FEATURE_SIZE + 1] = {
+  -1.463349f, 0.375376f,  4.751430f, 0.000000f, -0.184451f,
+  -1.655447f, 0.443214f,  0.000000f, 0.127961f, 0.152435f,
+  0.083288f,  0.000000f,  0.143105f, 0.438012f, 0.073238f,
+  0.000000f,  -0.278137f, 0.186134f, 0.073737f, -1.6494962f,
+};
+
+// split_score indicates confidence of picking split partition;
+// none_score indicates confidence of picking none partition;
+static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
+                                          BLOCK_SIZE bsize, int *split_score,
+                                          int *none_score) {
+  if (!pc_tree_stats->valid) return 0;
+  const float *split_weights = NULL;
+  const float *none_weights = NULL;
+  switch (bsize) {
+    case BLOCK_4X4: break;
+    case BLOCK_8X8:
+      split_weights = two_pass_split_partition_weights_8;
+      none_weights = two_pass_none_partition_weights_8;
+      break;
+    case BLOCK_16X16:
+      split_weights = two_pass_split_partition_weights_16;
+      none_weights = two_pass_none_partition_weights_16;
+      break;
+    case BLOCK_32X32:
+      split_weights = two_pass_split_partition_weights_32;
+      none_weights = two_pass_none_partition_weights_32;
+      break;
+    case BLOCK_64X64:
+      split_weights = two_pass_split_partition_weights_64;
+      none_weights = two_pass_none_partition_weights_64;
+      break;
+    case BLOCK_128X128:
+      split_weights = two_pass_split_partition_weights_128;
+      none_weights = two_pass_none_partition_weights_128;
+      break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!split_weights || !none_weights) return 0;
+
+  aom_clear_system_state();
+
+  float features[FEATURE_SIZE];
+  int feature_index = 0;
+  features[feature_index++] = (float)pc_tree_stats->split;
+  features[feature_index++] = (float)pc_tree_stats->skip;
+  const int rdcost = (int)AOMMIN(INT_MAX, pc_tree_stats->rdcost);
+  const int rd_valid = rdcost > 0 && rdcost < 1000000000;
+  features[feature_index++] = (float)rd_valid;
+  for (int i = 0; i < 4; ++i) {
+    features[feature_index++] = (float)pc_tree_stats->sub_block_split[i];
+    features[feature_index++] = (float)pc_tree_stats->sub_block_skip[i];
+    const int sub_rdcost =
+        (int)AOMMIN(INT_MAX, pc_tree_stats->sub_block_rdcost[i]);
+    const int sub_rd_valid = sub_rdcost > 0 && sub_rdcost < 1000000000;
+    features[feature_index++] = (float)sub_rd_valid;
+    // Ratio between the sub-block RD and the whole-block RD.
+    float rd_ratio = 1.0f;
+    if (rd_valid && sub_rd_valid && sub_rdcost < rdcost)
+      rd_ratio = (float)sub_rdcost / (float)rdcost;
+    features[feature_index++] = rd_ratio;
+  }
+  assert(feature_index == FEATURE_SIZE);
+
+  float score_1 = split_weights[FEATURE_SIZE];
+  float score_2 = none_weights[FEATURE_SIZE];
+  for (int i = 0; i < FEATURE_SIZE; ++i) {
+    score_1 += features[i] * split_weights[i];
+    score_2 += features[i] * none_weights[i];
+  }
+  *split_score = (int)(score_1 * 100);
+  *none_score = (int)(score_2 * 100);
+  return 1;
+}
+#undef FEATURE_SIZE
+
+// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
+                                  int64_t best_rd, int64_t horz_rd[2],
+                                  int64_t vert_rd[2], int64_t split_rd[4],
+                                  int *const horza_partition_allowed,
+                                  int *const horzb_partition_allowed,
+                                  int *const verta_partition_allowed,
+                                  int *const vertb_partition_allowed) {
+  if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+  const NN_CONFIG *nn_config = NULL;
+  switch (bsize) {
+    case BLOCK_8X8: nn_config = NULL; break;
+    case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break;
+    case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break;
+    case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break;
+    case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!nn_config) return;
+
+  aom_clear_system_state();
+
+  // Generate features.
+  float features[10];
+  int feature_index = 0;
+  features[feature_index++] = (float)part_ctx;
+  features[feature_index++] = (float)var_ctx;
+  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+  int sub_block_rdcost[8] = { 0 };
+  int rd_index = 0;
+  for (int i = 0; i < 2; ++i) {
+    if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)horz_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)vert_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 4; ++i) {
+    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)split_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 8; ++i) {
+    // Ratio between the sub-block RD and the whole-block RD.
+    float rd_ratio = 1.0f;
+    if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+      rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+    features[feature_index++] = rd_ratio;
+  }
+  assert(feature_index == 10);
+
+  // Calculate scores using the NN model.
+  float score[16] = { 0.0f };
+  av1_nn_predict(features, nn_config, score);
+  int int_score[16];
+  int max_score = -1000;
+  for (int i = 0; i < 16; ++i) {
+    int_score[i] = (int)(100 * score[i]);
+    max_score = AOMMAX(int_score[i], max_score);
+  }
+
+  // Make decisions based on the model scores.
+  int thresh = max_score;
+  switch (bsize) {
+    case BLOCK_16X16: thresh -= 150; break;
+    case BLOCK_32X32: thresh -= 100; break;
+    default: break;
+  }
+  *horza_partition_allowed = 0;
+  *horzb_partition_allowed = 0;
+  *verta_partition_allowed = 0;
+  *vertb_partition_allowed = 0;
+  for (int i = 0; i < 16; ++i) {
+    if (int_score[i] >= thresh) {
+      if ((i >> 0) & 1) *horza_partition_allowed = 1;
+      if ((i >> 1) & 1) *horzb_partition_allowed = 1;
+      if ((i >> 2) & 1) *verta_partition_allowed = 1;
+      if ((i >> 3) & 1) *vertb_partition_allowed = 1;
+    }
+  }
+}
 
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
@@ -3488,12 +2882,10 @@ static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x,
 static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
                               TileDataEnc *tile_data, TOKENEXTRA **tp,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
-                              RD_STATS *rd_cost,
-#if CONFIG_SUPERTX
-                              int *rate_nocoef,
-#endif
-                              int64_t best_rd, PC_TREE *pc_tree) {
+                              RD_STATS *rd_cost, int64_t best_rd,
+                              PC_TREE *pc_tree, int64_t *none_rd) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -3501,114 +2893,87 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
   const TOKENEXTRA *const tp_orig = *tp;
   PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
-#if CONFIG_UNPOISON_PARTITION_CTX
-  const int hbs = mi_size_wide[bsize] / 2;
-  const int has_rows = mi_row + hbs < cm->mi_rows;
-  const int has_cols = mi_col + hbs < cm->mi_cols;
-#else
   int tmp_partition_cost[PARTITION_TYPES];
-#endif
   BLOCK_SIZE subsize;
   RD_STATS this_rdc, sum_rdc, best_rdc;
   const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
   int do_square_split = bsize_at_least_8x8;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
   const int pl = bsize_at_least_8x8
-                     ? partition_plane_context(xd, mi_row, mi_col,
-#if CONFIG_UNPOISON_PARTITION_CTX
-                                               has_rows, has_cols,
-#endif
-                                               bsize)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
                      : 0;
-#else
-  const int unify_bsize = 0;
-  const int pl = partition_plane_context(xd, mi_row, mi_col,
-#if CONFIG_UNPOISON_PARTITION_CTX
-                                         has_rows, has_cols,
-#endif
-                                         bsize);
-#endif  // CONFIG_CB4X4
   const int *partition_cost =
       pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
-#if CONFIG_SUPERTX
-  int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX;
-  int abort_flag;
-  const int supertx_allowed = !frame_is_intra_only(cm) && bsize >= BLOCK_8X8 &&
-                              bsize <= MAX_SUPERTX_BLOCK_SIZE &&
-                              !xd->lossless[0];
-#endif  // CONFIG_SUPERTX
 
   int do_rectangular_split = 1;
-#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
-#endif
+  int64_t split_rd[4] = { 0, 0, 0, 0 };
+  int64_t horz_rd[2] = { 0, 0 };
+  int64_t vert_rd[2] = { 0, 0 };
+
+  int split_ctx_is_ready[2] = { 0, 0 };
+  int horz_ctx_is_ready = 0;
+  int vert_ctx_is_ready = 0;
+  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
 
   // Override skipping rectangular partition operations for edge blocks
-  const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
-  const int force_vert_split = (mi_col + mi_step >= cm->mi_cols);
+  const int has_rows = (mi_row + mi_step < cm->mi_rows);
+  const int has_cols = (mi_col + mi_step < cm->mi_cols);
   const int xss = x->e_mbd.plane[1].subsampling_x;
   const int yss = x->e_mbd.plane[1].subsampling_y;
 
   BLOCK_SIZE min_size = x->min_partition_size;
   BLOCK_SIZE max_size = x->max_partition_size;
 
+  if (none_rd) *none_rd = 0;
+
 #if CONFIG_FP_MB_STATS
   unsigned int src_diff_var = UINT_MAX;
   int none_complexity = 0;
 #endif
 
-  int partition_none_allowed = !force_horz_split && !force_vert_split;
-  int partition_horz_allowed =
-      !force_vert_split && yss <= xss && bsize_at_least_8x8;
-  int partition_vert_allowed =
-      !force_horz_split && xss <= yss && bsize_at_least_8x8;
-
-#if CONFIG_PVQ
-  od_rollback_buffer pre_rdo_buf;
-#endif
+  int partition_none_allowed = has_rows && has_cols;
+  int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
+  int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
 
   (void)*tp_orig;
 
-#if !CONFIG_UNPOISON_PARTITION_CTX
-  if (force_horz_split || force_vert_split) {
-    tmp_partition_cost[PARTITION_NONE] = INT_MAX;
-
-    if (!force_vert_split) {  // force_horz_split only
-      tmp_partition_cost[PARTITION_VERT] = INT_MAX;
-      tmp_partition_cost[PARTITION_HORZ] =
-          av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 0);
-      tmp_partition_cost[PARTITION_SPLIT] =
-          av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 1);
-    } else if (!force_horz_split) {  // force_vert_split only
-      tmp_partition_cost[PARTITION_HORZ] = INT_MAX;
-      tmp_partition_cost[PARTITION_VERT] =
-          av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 0);
-      tmp_partition_cost[PARTITION_SPLIT] =
-          av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 1);
-    } else {  // force_ horz_split && force_vert_split horz_split
-      tmp_partition_cost[PARTITION_HORZ] = INT_MAX;
-      tmp_partition_cost[PARTITION_VERT] = INT_MAX;
+  // Override partition costs at the edges of the frame in the same
+  // way as in read_partition (see decodeframe.c)
+  if (!(has_rows && has_cols)) {
+    assert(bsize_at_least_8x8 && pl >= 0);
+    const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl];
+    for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = INT_MAX;
+    if (has_cols) {
+      // At the bottom, the two possibilities are HORZ and SPLIT
+      aom_cdf_prob bot_cdf[2];
+      partition_gather_vert_alike(bot_cdf, partition_cdf, bsize);
+      static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
+      av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map);
+    } else if (has_rows) {
+      // At the right, the two possibilities are VERT and SPLIT
+      aom_cdf_prob rhs_cdf[2];
+      partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize);
+      static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
+      av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map);
+    } else {
+      // At the bottom right, we always split
       tmp_partition_cost[PARTITION_SPLIT] = 0;
     }
 
     partition_cost = tmp_partition_cost;
   }
-#endif
 
-#if CONFIG_VAR_TX
 #ifndef NDEBUG
   // Nothing should rely on the default value of this array (which is just
   // leftover from encoding the previous block. Setting it to magic number
   // when debugging.
-  memset(x->blk_skip[0], 234, sizeof(x->blk_skip[0]));
+  memset(x->blk_skip, 234, sizeof(x->blk_skip));
 #endif  // NDEBUG
-#endif  // CONFIG_VAR_TX
 
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
 
   av1_init_rd_stats(&this_rdc);
-  av1_init_rd_stats(&sum_rdc);
   av1_invalid_rd_stats(&best_rdc);
   best_rdc.rdcost = best_rd;
 
@@ -3634,26 +2999,70 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
     // Note: Further partitioning is NOT allowed when bsize == min_size already.
     const int partition_allowed = (bsize <= max_size && bsize > min_size);
     partition_none_allowed &= no_partition_allowed;
-    partition_horz_allowed &= partition_allowed || force_horz_split;
-    partition_vert_allowed &= partition_allowed || force_vert_split;
+    partition_horz_allowed &= partition_allowed || !has_rows;
+    partition_vert_allowed &= partition_allowed || !has_cols;
     do_square_split &= bsize > min_size;
   }
   if (cpi->sf.use_square_partition_only) {
-    partition_horz_allowed &= force_horz_split;
-    partition_vert_allowed &= force_vert_split;
+    partition_horz_allowed &= !has_rows;
+    partition_vert_allowed &= !has_cols;
+  }
+
+  if (bsize > BLOCK_4X4 && x->use_cb_search_range &&
+      cpi->sf.auto_min_max_partition_size == 0) {
+    int split_score = 0;
+    int none_score = 0;
+    const int score_valid = ml_prune_2pass_split_partition(
+        &pc_tree->pc_tree_stats, bsize, &split_score, &none_score);
+    if (score_valid) {
+      {
+        const int only_split_thresh = 300;
+        const int no_none_thresh = 250;
+        const int no_split_thresh = 0;
+        if (split_score > only_split_thresh) {
+          partition_none_allowed = 0;
+          partition_horz_allowed = 0;
+          partition_vert_allowed = 0;
+        } else if (split_score > no_none_thresh) {
+          partition_none_allowed = 0;
+        }
+        if (split_score < no_split_thresh) do_square_split = 0;
+      }
+      {
+        const int no_split_thresh = 120;
+        const int no_none_thresh = -120;
+        if (none_score > no_split_thresh && partition_none_allowed)
+          do_square_split = 0;
+        if (none_score < no_none_thresh) partition_none_allowed = 0;
+      }
+    } else {
+      if (pc_tree->cb_search_range == SPLIT_PLANE) {
+        partition_none_allowed = 0;
+        partition_horz_allowed = 0;
+        partition_vert_allowed = 0;
+      }
+      if (pc_tree->cb_search_range == SEARCH_SAME_PLANE) do_square_split = 0;
+      if (pc_tree->cb_search_range == NONE_PARTITION_PLANE) {
+        do_square_split = 0;
+        partition_horz_allowed = 0;
+        partition_vert_allowed = 0;
+      }
+    }
+
+    // Fall back to default values in case all partition modes are rejected.
+    if (partition_none_allowed == 0 && do_square_split == 0 &&
+        partition_horz_allowed == 0 && partition_vert_allowed == 0) {
+      do_square_split = bsize_at_least_8x8;
+      partition_none_allowed = has_rows && has_cols;
+      partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
+      partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+    }
   }
 
-#if CONFIG_VAR_TX
-  xd->above_txfm_context =
-      cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2);
-  xd->left_txfm_context = xd->left_txfm_context_buffer +
-                          ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2);
-#endif
-#if !CONFIG_PVQ
-  save_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-  save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif
+  xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
 
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
@@ -3712,16 +3121,17 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   }
 #endif
 
+BEGIN_PARTITION_SEARCH:
+  if (x->must_find_valid_partition) {
+    partition_none_allowed = has_rows && has_cols;
+    partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
+    partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+  }
   // PARTITION_NONE
   if (partition_none_allowed) {
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
-#if CONFIG_SUPERTX
-                     &this_rate_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                     PARTITION_NONE,
-#endif
-                     bsize, ctx_none, best_rdc.rdcost);
+                     PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost);
+    if (none_rd) *none_rd = this_rdc.rdcost;
     if (this_rdc.rate != INT_MAX) {
       if (bsize_at_least_8x8) {
         const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
@@ -3729,9 +3139,6 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
                                 : 0;
         this_rdc.rate += pt_cost;
         this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
-#if CONFIG_SUPERTX
-        this_rate_nocoef += pt_cost;
-#endif
       }
 
       if (this_rdc.rdcost < best_rdc.rdcost) {
@@ -3739,16 +3146,12 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
         const int64_t dist_breakout_thr =
             cpi->sf.partition_search_breakout_dist_thr >>
             ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
-             (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]));
+             (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
         const int rate_breakout_thr =
             cpi->sf.partition_search_breakout_rate_thr *
             num_pels_log2_lookup[bsize];
 
         best_rdc = this_rdc;
-#if CONFIG_SUPERTX
-        best_rate_nocoef = this_rate_nocoef;
-        assert(best_rate_nocoef >= 0);
-#endif
         if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
 
         // If all y, u, v transform blocks in this partition are skippable, and
@@ -3756,7 +3159,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
         // terminated for current branch of the partition search tree.
         // The dist & rate thresholds are set to 0 at speed 0 to disable the
         // early termination at that speed.
-        if (!x->e_mbd.lossless[xd->mi[0]->mbmi.segment_id] &&
+        if (!x->e_mbd.lossless[xd->mi[0]->segment_id] &&
             (ctx_none->skippable && best_rdc.dist < dist_breakout_thr &&
              best_rdc.rate < rate_breakout_thr)) {
           do_square_split = 0;
@@ -3810,202 +3213,88 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif
       }
     }
-#if !CONFIG_PVQ
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif
-#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-    if (!x->skip_chroma_rd) {
-      cfl_clear_sub8x8_val(xd->cfl);
-    }
-#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
   // store estimated motion vector
   if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
 
-#if CONFIG_SUPERTX
-  int64_t temp_best_rdcost = INT64_MAX;
-#else
-  int64_t temp_best_rdcost = best_rdc.rdcost;
-#endif
+#if CONFIG_DIST_8X8
+  uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE];
+
+  if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
+    for (int i = 0; i < num_planes; i++) {
+      src_plane_8x8[i] = x->plane[i].src.buf;
+      dst_plane_8x8[i] = xd->plane[i].dst.buf;
+    }
+  }
+#endif  // CONFIG_DIST_8X8
 
   // PARTITION_SPLIT
-  // TODO(jingning): use the motion vectors given by the above search as
-  // the starting point of motion search in the following partition type check.
   if (do_square_split) {
+    av1_init_rd_stats(&sum_rdc);
     int reached_last_index = 0;
-    subsize = get_subsize(bsize, PARTITION_SPLIT);
-    if (bsize == BLOCK_8X8 && !unify_bsize) {
-      if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
-        pc_tree->leaf_split[0]->pred_interp_filter =
-            av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0);
-
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
-#if CONFIG_SUPERTX
-                       &sum_rate_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                       PARTITION_SPLIT,
-#endif
-                       subsize, pc_tree->leaf_split[0], temp_best_rdcost);
-      if (sum_rdc.rate == INT_MAX) {
-        sum_rdc.rdcost = INT64_MAX;
-#if CONFIG_SUPERTX
-        sum_rate_nocoef = INT_MAX;
-#endif
-      }
-#if CONFIG_SUPERTX
-      if (supertx_allowed && sum_rdc.rdcost < INT64_MAX) {
-        TX_SIZE supertx_size = max_txsize_lookup[bsize];
-        const PARTITION_TYPE best_partition = pc_tree->partitioning;
-
-        pc_tree->partitioning = PARTITION_SPLIT;
-
-        sum_rdc.rate += av1_cost_bit(
-            cm->fc->supertx_prob[partition_supertx_context_lookup
-                                     [PARTITION_SPLIT]][supertx_size],
-            0);
-        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+    subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    int idx;
 
-        if (is_inter_mode(pc_tree->leaf_split[0]->mic.mbmi.mode)) {
-          TX_TYPE best_tx = DCT_DCT;
-          RD_STATS tmp_rdc;
-          av1_init_rd_stats(&tmp_rdc);
-          tmp_rdc.rate = sum_rate_nocoef;
-
-          restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-
-          rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
-                        &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
-
-          tmp_rdc.rate += av1_cost_bit(
-              cm->fc->supertx_prob[partition_supertx_context_lookup
-                                       [PARTITION_SPLIT]][supertx_size],
-              1);
-          tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
-          if (tmp_rdc.rdcost < sum_rdc.rdcost) {
-            sum_rdc = tmp_rdc;
-            update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
-                                    supertx_size, pc_tree);
-          }
-        }
+    for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
+      const int x_idx = (idx & 1) * mi_step;
+      const int y_idx = (idx >> 1) * mi_step;
 
-        pc_tree->partitioning = best_partition;
-      }
-#endif  // CONFIG_SUPERTX
-      reached_last_index = 1;
-    } else {
-      int idx;
-      for (idx = 0; idx < 4 && sum_rdc.rdcost < temp_best_rdcost; ++idx) {
-        const int x_idx = (idx & 1) * mi_step;
-        const int y_idx = (idx >> 1) * mi_step;
-
-        if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
-          continue;
-
-        if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-
-        pc_tree->split[idx]->index = idx;
-        rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
-                          mi_col + x_idx, subsize, &this_rdc,
-#if CONFIG_SUPERTX
-                          &this_rate_nocoef,
-#endif
-                          temp_best_rdcost - sum_rdc.rdcost,
-                          pc_tree->split[idx]);
-
-        if (this_rdc.rate == INT_MAX) {
-          sum_rdc.rdcost = INT64_MAX;
-#if CONFIG_SUPERTX
-          sum_rate_nocoef = INT_MAX;
-#endif  // CONFIG_SUPERTX
-          break;
-        } else {
-          sum_rdc.rate += this_rdc.rate;
-          sum_rdc.dist += this_rdc.dist;
-          sum_rdc.rdcost += this_rdc.rdcost;
-#if CONFIG_SUPERTX
-          sum_rate_nocoef += this_rate_nocoef;
-#endif  // CONFIG_SUPERTX
-        }
-      }
-      reached_last_index = (idx == 4);
-
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (x->using_dist_8x8 && reached_last_index &&
-          sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
-        const int src_stride = x->plane[0].src.stride;
-        int64_t dist_8x8;
-        dist_8x8 =
-            dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4 * src_stride - 4);
-        sum_rdc.dist = dist_8x8;
-        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      }
-#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
+      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+        continue;
 
-#if CONFIG_SUPERTX
-      if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) {
-        TX_SIZE supertx_size = max_txsize_lookup[bsize];
-        const PARTITION_TYPE best_partition = pc_tree->partitioning;
+      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
 
-        pc_tree->partitioning = PARTITION_SPLIT;
+      pc_tree->split[idx]->index = idx;
+      int64_t *p_split_rd = &split_rd[idx];
+      rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
+                        subsize, &this_rdc, best_rdc.rdcost - sum_rdc.rdcost,
+                        pc_tree->split[idx], p_split_rd);
 
-        sum_rdc.rate += av1_cost_bit(
-            cm->fc->supertx_prob[partition_supertx_context_lookup
-                                     [PARTITION_SPLIT]][supertx_size],
-            0);
-        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+        break;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
 
-        if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
-          TX_TYPE best_tx = DCT_DCT;
-          RD_STATS tmp_rdc;
-          av1_init_rd_stats(&tmp_rdc);
-          tmp_rdc.rate = sum_rate_nocoef;
-
-          restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-
-          rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
-                        &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
-
-          tmp_rdc.rate += av1_cost_bit(
-              cm->fc->supertx_prob[partition_supertx_context_lookup
-                                       [PARTITION_SPLIT]][supertx_size],
-              1);
-          tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
-          if (tmp_rdc.rdcost < sum_rdc.rdcost) {
-            sum_rdc = tmp_rdc;
-            update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
-                                    supertx_size, pc_tree);
+        if (idx <= 1 && (bsize <= BLOCK_8X8 ||
+                         pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
+          MB_MODE_INFO *const mbmi = &(pc_tree->split[idx]->none.mic);
+          PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+          // Neither palette mode nor cfl predicted
+          if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+            if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1;
           }
         }
-
-        pc_tree->partitioning = best_partition;
       }
-#endif  // CONFIG_SUPERTX
     }
+    reached_last_index = (idx == 4);
 
-#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-    if (!reached_last_index && sum_rdc.rdcost >= best_rdc.rdcost)
-      cfl_clear_sub8x8_val(xd->cfl);
-#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+#if CONFIG_DIST_8X8
+    if (x->using_dist_8x8 && reached_last_index &&
+        sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
+      int64_t dist_8x8;
+      dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
+#ifdef DEBUG_DIST_8X8
+      // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
+      if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/)
+        assert(sum_rdc.dist == dist_8x8);
+#endif  // DEBUG_DIST_8X8
+      sum_rdc.dist = dist_8x8;
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+    }
+#endif  // CONFIG_DIST_8X8
 
     if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-#if CONFIG_SUPERTX
-      sum_rate_nocoef += partition_cost[PARTITION_SPLIT];
-#endif  // CONFIG_SUPERTX
 
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
-#if CONFIG_SUPERTX
-        best_rate_nocoef = sum_rate_nocoef;
-        assert(best_rate_nocoef >= 0);
-#else
-        temp_best_rdcost = best_rdc.rdcost;
-#endif  // CONFIG_SUPERTX
         pc_tree->partitioning = PARTITION_SPLIT;
       }
     } else if (cpi->sf.less_rectangular_check) {
@@ -4013,473 +3302,362 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
       // gives better rd cost
       do_rectangular_split &= !partition_none_allowed;
     }
-#if !CONFIG_PVQ
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }  // if (do_split)
 
   // PARTITION_HORZ
   if (partition_horz_allowed &&
-      (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) {
-    subsize = get_subsize(bsize, PARTITION_HORZ);
+      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
+    av1_init_rd_stats(&sum_rdc);
+    subsize = get_partition_subsize(bsize, PARTITION_HORZ);
     if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->horizontal[0].pred_interp_filter =
-          av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0);
+          av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
 
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
-#if CONFIG_SUPERTX
-                     &sum_rate_nocoef,
-#endif  // CONFIG_SUPERTX
-#if CONFIG_EXT_PARTITION_TYPES
-                     PARTITION_HORZ,
-#endif
-                     subsize, &pc_tree->horizontal[0], best_rdc.rdcost);
+                     PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
+                     best_rdc.rdcost);
+    horz_rd[0] = sum_rdc.rdcost;
 
-#if CONFIG_SUPERTX
-    abort_flag =
-        (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) ||
-        (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
-#endif
-    if (sum_rdc.rdcost < temp_best_rdcost && !force_horz_split &&
-        (bsize > BLOCK_8X8 || unify_bsize)) {
+    if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) {
       PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
-      update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
-      encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
-                        NULL);
+      MB_MODE_INFO *const mbmi = &(pc_tree->horizontal[0].mic);
+      PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      // Neither palette mode nor cfl predicted
+      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+        if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1;
+      }
+      update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
+      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
+                        subsize, NULL);
 
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
 
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->horizontal[1].pred_interp_filter =
-            av1_extract_interp_filter(ctx_h->mic.mbmi.interp_filters, 0);
+            av1_extract_interp_filter(ctx_h->mic.interp_filters, 0);
 
-#if CONFIG_SUPERTX
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
-                       &this_rate_nocoef,
-#if CONFIG_EXT_PARTITION_TYPES
-                       PARTITION_HORZ,
-#endif
-                       subsize, &pc_tree->horizontal[1], INT64_MAX);
-#else
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
-#if CONFIG_EXT_PARTITION_TYPES
-                       PARTITION_HORZ,
-#endif
-                       subsize, &pc_tree->horizontal[1],
+                       PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
                        best_rdc.rdcost - sum_rdc.rdcost);
-#endif  // CONFIG_SUPERTX
+      horz_rd[1] = this_rdc.rdcost;
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+#if CONFIG_DIST_8X8
       if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
-        update_state(cpi, td, &pc_tree->horizontal[1], mi_row + mi_step, mi_col,
-                     subsize, DRY_RUN_NORMAL);
-        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row + mi_step, mi_col,
-                          subsize, NULL);
+        update_state(cpi, tile_data, td, &pc_tree->horizontal[1],
+                     mi_row + mi_step, mi_col, subsize, DRY_RUN_NORMAL);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL,
+                          mi_row + mi_step, mi_col, subsize, NULL);
       }
-#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
+#endif  // CONFIG_DIST_8X8
 
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
-#if CONFIG_SUPERTX
-        sum_rate_nocoef = INT_MAX;
-#endif  // CONFIG_SUPERTX
       } else {
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
-#if CONFIG_SUPERTX
-        sum_rate_nocoef += this_rate_nocoef;
-#endif  // CONFIG_SUPERTX
       }
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+#if CONFIG_DIST_8X8
       if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
           bsize == BLOCK_8X8) {
-        const int src_stride = x->plane[0].src.stride;
         int64_t dist_8x8;
-        dist_8x8 = dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4 * src_stride);
+        dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
+#ifdef DEBUG_DIST_8X8
+        // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
+        if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/)
+          assert(sum_rdc.dist == dist_8x8);
+#endif  // DEBUG_DIST_8X8
         sum_rdc.dist = dist_8x8;
         sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       }
-#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
-    }
-
-#if CONFIG_SUPERTX
-    if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
-      TX_SIZE supertx_size = max_txsize_lookup[bsize];
-      const PARTITION_TYPE best_partition = pc_tree->partitioning;
-
-      pc_tree->partitioning = PARTITION_HORZ;
-
-      sum_rdc.rate += av1_cost_bit(
-          cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
-                              [supertx_size],
-          0);
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-
-      if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
-        TX_TYPE best_tx = DCT_DCT;
-        RD_STATS tmp_rdc;
-        av1_init_rd_stats(&tmp_rdc);
-        tmp_rdc.rate = sum_rate_nocoef;
-
-        restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-
-        rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate,
-                      &tmp_rdc.dist, &best_tx, pc_tree);
-
-        tmp_rdc.rate += av1_cost_bit(
-            cm->fc
-                ->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
-                              [supertx_size],
-            1);
-        tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
-        if (tmp_rdc.rdcost < sum_rdc.rdcost) {
-          sum_rdc = tmp_rdc;
-          update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
-                                  supertx_size, pc_tree);
-        }
-      }
-
-      pc_tree->partitioning = best_partition;
+#endif  // CONFIG_DIST_8X8
     }
-#endif  // CONFIG_SUPERTX
 
-#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-    cfl_clear_sub8x8_val(xd->cfl);
-#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_HORZ];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-#if CONFIG_SUPERTX
-      sum_rate_nocoef += partition_cost[PARTITION_HORZ];
-#endif  // CONFIG_SUPERTX
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
-#if CONFIG_SUPERTX
-        best_rate_nocoef = sum_rate_nocoef;
-        assert(best_rate_nocoef >= 0);
-#endif  // CONFIG_SUPERTX
         pc_tree->partitioning = PARTITION_HORZ;
       }
     }
-#if !CONFIG_PVQ
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
   // PARTITION_VERT
   if (partition_vert_allowed &&
-      (do_rectangular_split || av1_active_v_edge(cpi, mi_col, mi_step))) {
-    subsize = get_subsize(bsize, PARTITION_VERT);
+      (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) {
+    av1_init_rd_stats(&sum_rdc);
+    subsize = get_partition_subsize(bsize, PARTITION_VERT);
 
     if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
 
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->vertical[0].pred_interp_filter =
-          av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0);
+          av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
 
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
-#if CONFIG_SUPERTX
-                     &sum_rate_nocoef,
-#endif  // CONFIG_SUPERTX
-#if CONFIG_EXT_PARTITION_TYPES
-                     PARTITION_VERT,
-#endif
-                     subsize, &pc_tree->vertical[0], best_rdc.rdcost);
-#if CONFIG_SUPERTX
-    abort_flag =
-        (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) ||
-        (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
-    const int64_t vert_max_rdcost = INT64_MAX;
-#else
+                     PARTITION_VERT, subsize, &pc_tree->vertical[0],
+                     best_rdc.rdcost);
+    vert_rd[0] = sum_rdc.rdcost;
     const int64_t vert_max_rdcost = best_rdc.rdcost;
-#endif  // CONFIG_SUPERTX
-    if (sum_rdc.rdcost < vert_max_rdcost && !force_vert_split &&
-        (bsize > BLOCK_8X8 || unify_bsize)) {
-      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
-      encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
-                        NULL);
+    if (sum_rdc.rdcost < vert_max_rdcost && has_cols) {
+      MB_MODE_INFO *const mbmi = &(pc_tree->vertical[0].mic);
+      PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      // Neither palette mode nor cfl predicted
+      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+        if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1;
+      }
+      update_state(cpi, tile_data, td, &pc_tree->vertical[0], mi_row, mi_col,
+                   subsize, 1);
+      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
+                        subsize, NULL);
 
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
 
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->vertical[1].pred_interp_filter =
-            av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0);
+            av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
 
-#if CONFIG_SUPERTX
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
-                       &this_rate_nocoef,
-#if CONFIG_EXT_PARTITION_TYPES
-                       PARTITION_VERT,
-#endif
-                       subsize, &pc_tree->vertical[1],
-                       INT64_MAX - sum_rdc.rdcost);
-#else
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
-#if CONFIG_EXT_PARTITION_TYPES
-                       PARTITION_VERT,
-#endif
-                       subsize, &pc_tree->vertical[1],
+                       PARTITION_VERT, subsize, &pc_tree->vertical[1],
                        best_rdc.rdcost - sum_rdc.rdcost);
-#endif  // CONFIG_SUPERTX
+      vert_rd[1] = this_rdc.rdcost;
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+#if CONFIG_DIST_8X8
       if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
-        update_state(cpi, td, &pc_tree->vertical[1], mi_row, mi_col + mi_step,
-                     subsize, DRY_RUN_NORMAL);
-        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col + mi_step,
-                          subsize, NULL);
+        update_state(cpi, tile_data, td, &pc_tree->vertical[1], mi_row,
+                     mi_col + mi_step, subsize, DRY_RUN_NORMAL);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+                          mi_col + mi_step, subsize, NULL);
       }
-#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
+#endif  // CONFIG_DIST_8X8
 
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
-#if CONFIG_SUPERTX
-        sum_rate_nocoef = INT_MAX;
-#endif  // CONFIG_SUPERTX
       } else {
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
-#if CONFIG_SUPERTX
-        sum_rate_nocoef += this_rate_nocoef;
-#endif  // CONFIG_SUPERTX
       }
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+#if CONFIG_DIST_8X8
       if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
           bsize == BLOCK_8X8) {
         int64_t dist_8x8;
-        dist_8x8 = dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4);
+        dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
+#ifdef DEBUG_DIST_8X8
+        // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
+        if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 &&
+            0 /* !CONFIG_CFL */)
+          assert(sum_rdc.dist == dist_8x8);
+#endif  // DEBUG_DIST_8X8
         sum_rdc.dist = dist_8x8;
         sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       }
-#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
-    }
-#if CONFIG_SUPERTX
-    if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
-      TX_SIZE supertx_size = max_txsize_lookup[bsize];
-      const PARTITION_TYPE best_partition = pc_tree->partitioning;
-
-      pc_tree->partitioning = PARTITION_VERT;
-
-      sum_rdc.rate += av1_cost_bit(
-          cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
-                              [supertx_size],
-          0);
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-
-      if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
-        TX_TYPE best_tx = DCT_DCT;
-        RD_STATS tmp_rdc;
-        av1_init_rd_stats(&tmp_rdc);
-        tmp_rdc.rate = sum_rate_nocoef;
-
-        restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-
-        rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate,
-                      &tmp_rdc.dist, &best_tx, pc_tree);
-
-        tmp_rdc.rate += av1_cost_bit(
-            cm->fc
-                ->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
-                              [supertx_size],
-            1);
-        tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
-        if (tmp_rdc.rdcost < sum_rdc.rdcost) {
-          sum_rdc = tmp_rdc;
-          update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
-                                  supertx_size, pc_tree);
-        }
-      }
-
-      pc_tree->partitioning = best_partition;
+#endif  // CONFIG_DIST_8X8
     }
-#endif  // CONFIG_SUPERTX
-
-#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-    cfl_clear_sub8x8_val(xd->cfl);
-#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_VERT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-#if CONFIG_SUPERTX
-      sum_rate_nocoef += partition_cost[PARTITION_VERT];
-#endif  // CONFIG_SUPERTX
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
-#if CONFIG_SUPERTX
-        best_rate_nocoef = sum_rate_nocoef;
-        assert(best_rate_nocoef >= 0);
-#endif  // CONFIG_SUPERTX
         pc_tree->partitioning = PARTITION_VERT;
       }
     }
-#if !CONFIG_PVQ
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
   const int ext_partition_allowed =
       do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
 
-#if CONFIG_EXT_PARTITION && CONFIG_EXT_PARTITION_TYPES_AB
-  // Don't allow A/B partitions on 128x128 blocks for now (support for
-  // 128x32 and 32x128 blocks doesn't yet exist).
-  const int ab_partition_allowed =
-      ext_partition_allowed && bsize < BLOCK_128X128;
-#else
-  const int ab_partition_allowed = ext_partition_allowed;
-#endif
+  // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
+  // PARTITION_VERT_4 for this block. This is almost the same as
+  // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks,
+  // so we require that bsize is not BLOCK_128X128.
+  const int partition4_allowed =
+      ext_partition_allowed && bsize != BLOCK_128X128;
+
+  // The standard AB partitions are allowed whenever ext-partition-types are
+  // allowed
+  int horzab_partition_allowed = ext_partition_allowed;
+  int vertab_partition_allowed = ext_partition_allowed;
+
+  if (cpi->sf.prune_ext_partition_types_search_level) {
+    if (cpi->sf.prune_ext_partition_types_search_level == 1) {
+      horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                   (pc_tree->partitioning == PARTITION_NONE &&
+                                    x->source_variance < 32) ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+      vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+                                   (pc_tree->partitioning == PARTITION_NONE &&
+                                    x->source_variance < 32) ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+    } else {
+      horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+      vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+                                   pc_tree->partitioning == PARTITION_SPLIT);
+    }
+    horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0);
+    horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0);
+    vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0);
+    vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0);
+    split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0);
+    split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0);
+    split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0);
+    split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0);
+  }
+  int horza_partition_allowed = horzab_partition_allowed;
+  int horzb_partition_allowed = horzab_partition_allowed;
+  if (cpi->sf.prune_ext_partition_types_search_level) {
+    const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
+    const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
+    switch (cpi->sf.prune_ext_partition_types_search_level) {
+      case 1:
+        horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdc.rdcost);
+        horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdc.rdcost);
+        break;
+      case 2:
+      default:
+        horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdc.rdcost);
+        horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdc.rdcost);
+        break;
+    }
+  }
+
+  int verta_partition_allowed = vertab_partition_allowed;
+  int vertb_partition_allowed = vertab_partition_allowed;
+  if (cpi->sf.prune_ext_partition_types_search_level) {
+    const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
+    const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
+    switch (cpi->sf.prune_ext_partition_types_search_level) {
+      case 1:
+        verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdc.rdcost);
+        vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdc.rdcost);
+        break;
+      case 2:
+      default:
+        verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdc.rdcost);
+        vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdc.rdcost);
+        break;
+    }
+  }
+
+  if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed &&
+      partition_horz_allowed && partition_vert_allowed) {
+    ml_prune_ab_partition(bsize, pc_tree->partitioning,
+                          get_unsigned_bits(x->source_variance),
+                          best_rdc.rdcost, horz_rd, vert_rd, split_rd,
+                          &horza_partition_allowed, &horzb_partition_allowed,
+                          &verta_partition_allowed, &vertb_partition_allowed);
+  }
 
   // PARTITION_HORZ_A
-  if (partition_horz_allowed && ab_partition_allowed) {
-#if CONFIG_EXT_PARTITION_TYPES_AB
-    rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A,
-#if CONFIG_SUPERTX
-        best_rd, &best_rate_nocoef, &x_ctx,
-#endif
-        mi_row, mi_col, get_subsize(bsize, PARTITION_HORZ_4),
-        mi_row + mi_step / 2, mi_col, get_subsize(bsize, PARTITION_HORZ_4),
-        mi_row + mi_step, mi_col, get_subsize(bsize, PARTITION_HORZ));
-#else
-    subsize = get_subsize(bsize, PARTITION_HORZ_A);
+  if (partition_horz_allowed && horza_partition_allowed) {
+    subsize = get_partition_subsize(bsize, PARTITION_HORZ_A);
+    pc_tree->horizontala[0].rd_mode_is_ready = 0;
+    pc_tree->horizontala[1].rd_mode_is_ready = 0;
+    pc_tree->horizontala[2].rd_mode_is_ready = 0;
+    if (split_ctx_is_ready[0]) {
+      av1_copy_tree_context(&pc_tree->horizontala[0], &pc_tree->split[0]->none);
+      pc_tree->horizontala[0].mic.partition = PARTITION_HORZ_A;
+      pc_tree->horizontala[0].rd_mode_is_ready = 1;
+      if (split_ctx_is_ready[1]) {
+        av1_copy_tree_context(&pc_tree->horizontala[1],
+                              &pc_tree->split[1]->none);
+        pc_tree->horizontala[1].mic.partition = PARTITION_HORZ_A;
+        pc_tree->horizontala[1].rd_mode_is_ready = 1;
+      }
+    }
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
-                       PARTITION_HORZ_A,
-#if CONFIG_SUPERTX
-                       best_rd, &best_rate_nocoef, &x_ctx,
-#endif
-                       mi_row, mi_col, bsize2, mi_row, mi_col + mi_step, bsize2,
-                       mi_row + mi_step, mi_col, subsize);
-#endif
-#if !CONFIG_PVQ
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif  // !CONFIG_PVQ
+                       PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row,
+                       mi_col + mi_step, bsize2, mi_row + mi_step, mi_col,
+                       subsize);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
   // PARTITION_HORZ_B
-  if (partition_horz_allowed && ab_partition_allowed) {
-#if CONFIG_EXT_PARTITION_TYPES_AB
-    rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B,
-#if CONFIG_SUPERTX
-        best_rd, &best_rate_nocoef, &x_ctx,
-#endif
-        mi_row, mi_col, get_subsize(bsize, PARTITION_HORZ), mi_row + mi_step,
-        mi_col, get_subsize(bsize, PARTITION_HORZ_4), mi_row + 3 * mi_step / 2,
-        mi_col, get_subsize(bsize, PARTITION_HORZ_4));
-#else
-    subsize = get_subsize(bsize, PARTITION_HORZ_B);
+  if (partition_horz_allowed && horzb_partition_allowed) {
+    subsize = get_partition_subsize(bsize, PARTITION_HORZ_B);
+    pc_tree->horizontalb[0].rd_mode_is_ready = 0;
+    pc_tree->horizontalb[1].rd_mode_is_ready = 0;
+    pc_tree->horizontalb[2].rd_mode_is_ready = 0;
+    if (horz_ctx_is_ready) {
+      av1_copy_tree_context(&pc_tree->horizontalb[0], &pc_tree->horizontal[0]);
+      pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B;
+      pc_tree->horizontalb[0].rd_mode_is_ready = 1;
+    }
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
-                       PARTITION_HORZ_B,
-#if CONFIG_SUPERTX
-                       best_rd, &best_rate_nocoef, &x_ctx,
-#endif
-                       mi_row, mi_col, subsize, mi_row + mi_step, mi_col,
-                       bsize2, mi_row + mi_step, mi_col + mi_step, bsize2);
-#endif
-#if !CONFIG_PVQ
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif  // !CONFIG_PVQ
+                       PARTITION_HORZ_B, mi_row, mi_col, subsize,
+                       mi_row + mi_step, mi_col, bsize2, mi_row + mi_step,
+                       mi_col + mi_step, bsize2);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
+
   // PARTITION_VERT_A
-  if (partition_vert_allowed && ab_partition_allowed) {
-#if CONFIG_EXT_PARTITION_TYPES_AB
-    rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A,
-#if CONFIG_SUPERTX
-        best_rd, &best_rate_nocoef, &x_ctx,
-#endif
-        mi_row, mi_col, get_subsize(bsize, PARTITION_VERT_4), mi_row,
-        mi_col + mi_step / 2, get_subsize(bsize, PARTITION_VERT_4), mi_row,
-        mi_col + mi_step, get_subsize(bsize, PARTITION_VERT));
-#else
-    subsize = get_subsize(bsize, PARTITION_VERT_A);
+  if (partition_vert_allowed && verta_partition_allowed) {
+    subsize = get_partition_subsize(bsize, PARTITION_VERT_A);
+    pc_tree->verticala[0].rd_mode_is_ready = 0;
+    pc_tree->verticala[1].rd_mode_is_ready = 0;
+    pc_tree->verticala[2].rd_mode_is_ready = 0;
+    if (split_ctx_is_ready[0]) {
+      av1_copy_tree_context(&pc_tree->verticala[0], &pc_tree->split[0]->none);
+      pc_tree->verticala[0].mic.partition = PARTITION_VERT_A;
+      pc_tree->verticala[0].rd_mode_is_ready = 1;
+    }
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
-                       PARTITION_VERT_A,
-#if CONFIG_SUPERTX
-                       best_rd, &best_rate_nocoef, &x_ctx,
-#endif
-                       mi_row, mi_col, bsize2, mi_row + mi_step, mi_col, bsize2,
-                       mi_row, mi_col + mi_step, subsize);
-#endif
-#if !CONFIG_PVQ
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif  // !CONFIG_PVQ
+                       PARTITION_VERT_A, mi_row, mi_col, bsize2,
+                       mi_row + mi_step, mi_col, bsize2, mi_row,
+                       mi_col + mi_step, subsize);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
   // PARTITION_VERT_B
-  if (partition_vert_allowed && ab_partition_allowed) {
-#if CONFIG_EXT_PARTITION_TYPES_AB
-    rd_test_partition3(
-        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb,
-        ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B,
-#if CONFIG_SUPERTX
-        best_rd, &best_rate_nocoef, &x_ctx,
-#endif
-        mi_row, mi_col, get_subsize(bsize, PARTITION_VERT), mi_row,
-        mi_col + mi_step, get_subsize(bsize, PARTITION_VERT_4), mi_row,
-        mi_col + 3 * mi_step / 2, get_subsize(bsize, PARTITION_VERT_4));
-#else
-    subsize = get_subsize(bsize, PARTITION_VERT_B);
+  if (partition_vert_allowed && vertb_partition_allowed) {
+    subsize = get_partition_subsize(bsize, PARTITION_VERT_B);
+    pc_tree->verticalb[0].rd_mode_is_ready = 0;
+    pc_tree->verticalb[1].rd_mode_is_ready = 0;
+    pc_tree->verticalb[2].rd_mode_is_ready = 0;
+    if (vert_ctx_is_ready) {
+      av1_copy_tree_context(&pc_tree->verticalb[0], &pc_tree->vertical[0]);
+      pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B;
+      pc_tree->verticalb[0].rd_mode_is_ready = 1;
+    }
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
-                       PARTITION_VERT_B,
-#if CONFIG_SUPERTX
-                       best_rd, &best_rate_nocoef, &x_ctx,
-#endif
-                       mi_row, mi_col, subsize, mi_row, mi_col + mi_step,
-                       bsize2, mi_row + mi_step, mi_col + mi_step, bsize2);
-#endif
-#if !CONFIG_PVQ
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif  // !CONFIG_PVQ
+                       PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row,
+                       mi_col + mi_step, bsize2, mi_row + mi_step,
+                       mi_col + mi_step, bsize2);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
 
-#if CONFIG_EXT_PARTITION
-  const int can_partition_4 = (bsize == BLOCK_128X128 || bsize == BLOCK_64X64 ||
-                               bsize == BLOCK_32X32 || bsize == BLOCK_16X16);
-#else
-  const int can_partition_4 =
-      (bsize == BLOCK_64X64 || bsize == BLOCK_32X32 || bsize == BLOCK_16X16);
-#endif  // CONFIG_EXT_PARTITION
-
   // PARTITION_HORZ_4
-  // TODO(david.barker): For this and PARTITION_VERT_4,
-  // * Add support for BLOCK_16X16 once we support 2x8 and 8x2 blocks for the
-  //   chroma plane
-  // * Add support for supertx
-  if (can_partition_4 && partition_horz_allowed && !force_horz_split &&
-      (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) {
+  int partition_horz4_allowed = partition4_allowed && partition_horz_allowed;
+  if (cpi->sf.prune_ext_partition_types_search_level == 2) {
+    partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                pc_tree->partitioning == PARTITION_HORZ_A ||
+                                pc_tree->partitioning == PARTITION_HORZ_B ||
+                                pc_tree->partitioning == PARTITION_SPLIT ||
+                                pc_tree->partitioning == PARTITION_NONE);
+  }
+  if (partition_horz4_allowed && has_rows &&
+      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
+    av1_init_rd_stats(&sum_rdc);
     const int quarter_step = mi_size_high[bsize] / 4;
     PICK_MODE_CONTEXT *ctx_prev = ctx_none;
 
-    subsize = get_subsize(bsize, PARTITION_HORZ_4);
+    subsize = get_partition_subsize(bsize, PARTITION_HORZ_4);
 
     for (int i = 0; i < 4; ++i) {
       int this_mi_row = mi_row + i * quarter_step;
@@ -4488,6 +3666,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 
       PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
 
+      ctx_this->rd_mode_is_ready = 0;
       if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3),
                            this_mi_row, mi_col, subsize, &best_rdc, &sum_rdc,
                            &this_rdc, PARTITION_HORZ_4, ctx_prev, ctx_this))
@@ -4504,19 +3683,25 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
         pc_tree->partitioning = PARTITION_HORZ_4;
       }
     }
-#if !CONFIG_PVQ
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }
+
   // PARTITION_VERT_4
-  if (can_partition_4 && partition_vert_allowed && !force_vert_split &&
-      (do_rectangular_split || av1_active_v_edge(cpi, mi_row, mi_step))) {
+  int partition_vert4_allowed = partition4_allowed && partition_vert_allowed;
+  if (cpi->sf.prune_ext_partition_types_search_level == 2) {
+    partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+                                pc_tree->partitioning == PARTITION_VERT_A ||
+                                pc_tree->partitioning == PARTITION_VERT_B ||
+                                pc_tree->partitioning == PARTITION_SPLIT ||
+                                pc_tree->partitioning == PARTITION_NONE);
+  }
+  if (partition_vert4_allowed && has_cols &&
+      (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step))) {
+    av1_init_rd_stats(&sum_rdc);
     const int quarter_step = mi_size_wide[bsize] / 4;
     PICK_MODE_CONTEXT *ctx_prev = ctx_none;
 
-    subsize = get_subsize(bsize, PARTITION_VERT_4);
+    subsize = get_partition_subsize(bsize, PARTITION_VERT_4);
 
     for (int i = 0; i < 4; ++i) {
       int this_mi_col = mi_col + i * quarter_step;
@@ -4525,6 +3710,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 
       PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
 
+      ctx_this->rd_mode_is_ready = 0;
       if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), mi_row,
                            this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
                            PARTITION_VERT_4, ctx_prev, ctx_this))
@@ -4541,13 +3727,15 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
         pc_tree->partitioning = PARTITION_VERT_4;
       }
     }
-#if !CONFIG_PVQ
-    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
-#else
-    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
-#endif
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  }
+
+  if (bsize == cm->seq_params.sb_size && best_rdc.rate == INT_MAX) {
+    // Did not find a valid partition, go back and search again, with less
+    // constraint on which partition types to search.
+    x->must_find_valid_partition = 1;
+    goto BEGIN_PARTITION_SEARCH;
   }
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
   // TODO(jbb): This code added so that we avoid static analysis
   // warning related to the fact that best_rd isn't used after this
@@ -4556,44 +3744,27 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   (void)best_rd;
   *rd_cost = best_rdc;
 
-#if CONFIG_SUPERTX
-  *rate_nocoef = best_rate_nocoef;
-#endif  // CONFIG_SUPERTX
-
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       pc_tree->index != 3) {
-    if (bsize == cm->sb_size) {
-#if CONFIG_MOTION_VAR && NC_MODE_INFO
-      set_mode_info_sb(cpi, td, tile_info, tp, mi_row, mi_col, bsize, pc_tree);
-#endif
-
-#if CONFIG_LV_MAP
+    if (bsize == cm->seq_params.sb_size) {
       x->cb_offset = 0;
-#endif
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-      set_sb_mi_boundaries(cm, xd, mi_row, mi_col);
-#endif
-      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
                 pc_tree, NULL);
     } else {
-      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
                 pc_tree, NULL);
     }
   }
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+#if CONFIG_DIST_8X8
   if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
       best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
-    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+    encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
               pc_tree, NULL);
   }
-#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
+#endif  // CONFIG_DIST_8X8
 
-  if (bsize == cm->sb_size) {
-#if !CONFIG_PVQ && !CONFIG_LV_MAP
-    assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip));
-#endif
+  if (bsize == cm->seq_params.sb_size) {
     assert(best_rdc.rate < INT_MAX);
     assert(best_rdc.dist < INT64_MAX);
   } else {
@@ -4601,71 +3772,62 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   }
 }
 
+// Set all the counters as max.
+static void init_first_partition_pass_stats_tables(
+    FIRST_PARTITION_PASS_STATS *stats) {
+  for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
+    memset(stats[i].ref0_counts, 0xff, sizeof(stats[i].ref0_counts));
+    memset(stats[i].ref1_counts, 0xff, sizeof(stats[i].ref1_counts));
+    stats[i].sample_counts = INT_MAX;
+  }
+}
+
+// Minimum number of samples to trigger the
+// mode_pruning_based_on_two_pass_partition_search feature.
+#define FIRST_PARTITION_PASS_MIN_SAMPLES 16
+
 static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
                              TileDataEnc *tile_data, int mi_row,
                              TOKENEXTRA **tp) {
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   const TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   SPEED_FEATURES *const sf = &cpi->sf;
   int mi_col;
-#if CONFIG_EXT_PARTITION
   const int leaf_nodes = 256;
-#else
-  const int leaf_nodes = 64;
-#endif  // CONFIG_EXT_PARTITION
 
   // Initialize the left context for the new SB row
   av1_zero_left_context(xd);
 
   // Reset delta for every tile
-  if (cm->delta_q_present_flag)
-    if (mi_row == tile_info->mi_row_start) xd->prev_qindex = cm->base_qindex;
-#if CONFIG_EXT_DELTA_Q
-  if (cm->delta_lf_present_flag) {
-#if CONFIG_LOOPFILTER_LEVEL
-    if (mi_row == tile_info->mi_row_start)
-      for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id)
-        xd->prev_delta_lf[lf_id] = 0;
-#endif  // CONFIG_LOOPFILTER_LEVEL
-    if (mi_row == tile_info->mi_row_start) xd->prev_delta_lf_from_base = 0;
+  if (mi_row == tile_info->mi_row_start) {
+    if (cm->delta_q_present_flag) xd->current_qindex = cm->base_qindex;
+    if (cm->delta_lf_present_flag) {
+      av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+    }
   }
-#endif
 
   // Code each SB in the row
   for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
-       mi_col += cm->mib_size) {
+       mi_col += cm->seq_params.mib_size) {
     const struct segmentation *const seg = &cm->seg;
     int dummy_rate;
     int64_t dummy_dist;
     RD_STATS dummy_rdc;
-#if CONFIG_SUPERTX
-    int dummy_rate_nocoef;
-#endif  // CONFIG_SUPERTX
     int i;
     int seg_skip = 0;
 
     const int idx_str = cm->mi_stride * mi_row + mi_col;
-    MODE_INFO **mi = cm->mi_grid_visible + idx_str;
-    PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+    MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+    PC_TREE *const pc_root =
+        td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2];
 
-#if CONFIG_LV_MAP && LV_MAP_PROB
-    av1_fill_coeff_costs(&td->mb, xd->tile_ctx);
-#else
-    av1_fill_token_costs_from_cdf(x->token_head_costs,
-                                  x->e_mbd.tile_ctx->coef_head_cdfs);
-    av1_fill_token_costs_from_cdf(x->token_tail_costs,
-                                  x->e_mbd.tile_ctx->coef_tail_cdfs);
-#endif
+    av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
     av1_fill_mode_rates(cm, x, xd->tile_ctx);
 
     if (sf->adaptive_pred_interp_filter) {
-#if !CONFIG_CB4X4
-      for (i = 0; i < leaf_nodes; ++i)
-        td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
-#endif
-
       for (i = 0; i < leaf_nodes; ++i) {
         td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
         td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
@@ -4674,29 +3836,43 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
       }
     }
 
-    x->tx_rd_record.num = x->tx_rd_record.index_start = 0;
+    x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
+
+    av1_zero(x->txb_rd_record_8X8);
+    av1_zero(x->txb_rd_record_16X16);
+    av1_zero(x->txb_rd_record_32X32);
+    av1_zero(x->txb_rd_record_64X64);
+    av1_zero(x->txb_rd_record_intra);
+
     av1_zero(x->pred_mv);
     pc_root->index = 0;
 
     if (seg->enabled) {
       const uint8_t *const map =
           seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-      int segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
+      int segment_id =
+          map ? get_segment_id(cm, map, cm->seq_params.sb_size, mi_row, mi_col)
+              : 0;
       seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
     }
-#if CONFIG_AMVR
-    xd->cur_frame_mv_precision_level = cm->cur_frame_mv_precision_level;
-#endif
+    xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv;
 
     if (cm->delta_q_present_flag) {
-      // Test mode for delta quantization
-      int sb_row = mi_row >> 3;
-      int sb_col = mi_col >> 3;
-      int sb_stride = (cm->width + MAX_SB_SIZE - 1) >> MAX_SB_SIZE_LOG2;
-      int index = ((sb_row * sb_stride + sb_col + 8) & 31) - 16;
-
-      // Ensure divisibility of delta_qindex by delta_q_res
-      int offset_qindex = (index < 0 ? -index - 8 : index - 8);
+      // Delta-q modulation based on variance
+      av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+
+      int offset_qindex;
+      if (DELTAQ_MODULATION == 1) {
+        const int block_wavelet_energy_level =
+            av1_block_wavelet_energy_level(cpi, x, cm->seq_params.sb_size);
+        offset_qindex = av1_compute_deltaq_from_energy_level(
+            cpi, block_wavelet_energy_level);
+      } else {
+        const int block_var_level =
+            av1_block_energy(cpi, x, cm->seq_params.sb_size);
+        offset_qindex =
+            av1_compute_deltaq_from_energy_level(cpi, block_var_level);
+      }
       int qmask = ~(cm->delta_q_res - 1);
       int current_qindex = clamp(cm->base_qindex + offset_qindex,
                                  cm->delta_q_res, 256 - cm->delta_q_res);
@@ -4707,136 +3883,163 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
       assert(current_qindex > 0);
 
       xd->delta_qindex = current_qindex - cm->base_qindex;
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
-      xd->mi[0]->mbmi.current_q_index = current_qindex;
-#if !CONFIG_EXT_DELTA_Q
-      xd->mi[0]->mbmi.segment_id = 0;
-#endif  // CONFIG_EXT_DELTA_Q
-      av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
-#if CONFIG_EXT_DELTA_Q
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
+      xd->mi[0]->current_qindex = current_qindex;
+      av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
       if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) {
         int j, k;
         int lfmask = ~(cm->delta_lf_res - 1);
-        int current_delta_lf_from_base = offset_qindex / 2;
-        current_delta_lf_from_base =
-            ((current_delta_lf_from_base + cm->delta_lf_res / 2) & lfmask);
+        int delta_lf_from_base = offset_qindex / 2;
+        delta_lf_from_base =
+            ((delta_lf_from_base + cm->delta_lf_res / 2) & lfmask);
 
         // pre-set the delta lf for loop filter. Note that this value is set
         // before mi is assigned for each block in current superblock
-        for (j = 0; j < AOMMIN(cm->mib_size, cm->mi_rows - mi_row); j++) {
-          for (k = 0; k < AOMMIN(cm->mib_size, cm->mi_cols - mi_col); k++) {
+        for (j = 0; j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row);
+             j++) {
+          for (k = 0; k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col);
+               k++) {
             cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
-                .mbmi.current_delta_lf_from_base =
-                clamp(current_delta_lf_from_base, 0, MAX_LOOP_FILTER);
-#if CONFIG_LOOPFILTER_LEVEL
-            for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) {
+                .delta_lf_from_base =
+                clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+            const int frame_lf_count =
+                av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+            for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
               cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
-                  .mbmi.curr_delta_lf[lf_id] = current_delta_lf_from_base;
+                  .delta_lf[lf_id] =
+                  clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
             }
-#endif  // CONFIG_LOOPFILTER_LEVEL
           }
         }
       }
-#endif  // CONFIG_EXT_DELTA_Q
     }
 
     x->source_variance = UINT_MAX;
     if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
       BLOCK_SIZE bsize;
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
-      bsize = seg_skip ? cm->sb_size : sf->always_this_block_size;
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
+      bsize = seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size;
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size,
-                       &dummy_rate, &dummy_dist,
-#if CONFIG_SUPERTX
-                       &dummy_rate_nocoef,
-#endif  // CONFIG_SUPERTX
-                       1, pc_root);
+      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                       cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
+                       pc_root);
     } else if (cpi->partition_search_skippable_frame) {
       BLOCK_SIZE bsize;
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
       bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size,
-                       &dummy_rate, &dummy_dist,
-#if CONFIG_SUPERTX
-                       &dummy_rate_nocoef,
-#endif  // CONFIG_SUPERTX
-                       1, pc_root);
+      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                       cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
+                       pc_root);
     } else {
       // If required set upper and lower partition size limits
       if (sf->auto_min_max_partition_size) {
-        set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
+        set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
         rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
                                 &x->min_partition_size, &x->max_partition_size);
       }
-      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
-                        &dummy_rdc,
-#if CONFIG_SUPERTX
-                        &dummy_rate_nocoef,
-#endif  // CONFIG_SUPERTX
-                        INT64_MAX, pc_root);
+
+      reset_partition(pc_root, cm->seq_params.sb_size);
+      x->use_cb_search_range = 0;
+      init_first_partition_pass_stats_tables(x->first_partition_pass_stats);
+      if (cpi->sf.two_pass_partition_search &&
+          mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows &&
+          mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols &&
+          cm->frame_type != KEY_FRAME) {
+        x->cb_partition_scan = 1;
+        // Reset the stats tables.
+        if (sf->mode_pruning_based_on_two_pass_partition_search)
+          av1_zero(x->first_partition_pass_stats);
+        rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+                              cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
+                              pc_root, NULL);
+        x->cb_partition_scan = 0;
+
+        x->source_variance = UINT_MAX;
+        if (sf->adaptive_pred_interp_filter) {
+          for (i = 0; i < leaf_nodes; ++i) {
+            td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+            td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+            td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+            td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+          }
+        }
+
+        x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
+        av1_zero(x->txb_rd_record_8X8);
+        av1_zero(x->txb_rd_record_16X16);
+        av1_zero(x->txb_rd_record_32X32);
+        av1_zero(x->txb_rd_record_64X64);
+        av1_zero(x->txb_rd_record_intra);
+        av1_zero(x->pred_mv);
+        pc_root->index = 0;
+
+        for (int idy = 0; idy < mi_size_high[cm->seq_params.sb_size]; ++idy) {
+          for (int idx = 0; idx < mi_size_wide[cm->seq_params.sb_size]; ++idx) {
+            const int offset = cm->mi_stride * (mi_row + idy) + (mi_col + idx);
+            cm->mi_grid_visible[offset] = 0;
+          }
+        }
+
+        x->use_cb_search_range = 1;
+
+        if (sf->mode_pruning_based_on_two_pass_partition_search) {
+          for (i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
+            FIRST_PARTITION_PASS_STATS *const stat =
+                &x->first_partition_pass_stats[i];
+            if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
+              // If there are not enough samples collected, make all available.
+              memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts));
+              memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts));
+            } else if (sf->selective_ref_frame < 2) {
+              // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the
+              // initial partition scan, so we don't eliminate them.
+              stat->ref0_counts[ALTREF2_FRAME] = 0xff;
+              stat->ref1_counts[ALTREF2_FRAME] = 0xff;
+              stat->ref0_counts[BWDREF_FRAME] = 0xff;
+              stat->ref1_counts[BWDREF_FRAME] = 0xff;
+            }
+          }
+        }
+
+        rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+                          cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
+                          pc_root, NULL);
+      } else {
+        rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+                          cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
+                          pc_root, NULL);
+      }
+    }
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+    // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
+    if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
+        cm->tile_rows == 1) {
+      av1_inter_mode_data_fit(x->rdmult);
     }
+#endif
   }
 }
 
 static void init_encode_frame_mb_context(AV1_COMP *cpi) {
-  MACROBLOCK *const x = &cpi->td.mb;
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
   // Copy data over into macro block data structures.
-  av1_setup_src_planes(x, cpi->source, 0, 0);
+  av1_setup_src_planes(x, cpi->source, 0, 0, num_planes);
 
-  av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
+  av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y, num_planes);
 }
 
-#if !CONFIG_REF_ADAPT
-static int check_dual_ref_flags(AV1_COMP *cpi) {
-  const int ref_flags = cpi->ref_frame_flags;
-
-  if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
-    return 0;
-  } else {
-    return (!!(ref_flags & AOM_GOLD_FLAG) + !!(ref_flags & AOM_LAST_FLAG) +
-#if CONFIG_EXT_REFS
-            !!(ref_flags & AOM_LAST2_FLAG) + !!(ref_flags & AOM_LAST3_FLAG) +
-            !!(ref_flags & AOM_BWD_FLAG) + !!(ref_flags & AOM_ALT2_FLAG) +
-#endif  // CONFIG_EXT_REFS
-            !!(ref_flags & AOM_ALT_FLAG)) >= 2;
-  }
-}
-#endif  // !CONFIG_REF_ADAPT
-
-#if !CONFIG_VAR_TX
-static void reset_skip_tx_size(AV1_COMMON *cm, TX_SIZE max_tx_size) {
-  int mi_row, mi_col;
-  const int mis = cm->mi_stride;
-  MODE_INFO **mi_ptr = cm->mi_grid_visible;
-
-  for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) {
-    for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
-      if (txsize_sqr_up_map[mi_ptr[mi_col]->mbmi.tx_size] > max_tx_size)
-        mi_ptr[mi_col]->mbmi.tx_size = max_tx_size;
-    }
-  }
-}
-#endif
-
 static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
   if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME;
-#if CONFIG_EXT_REFS
   // We will not update the golden frame with an internal overlay frame
   else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) ||
            cpi->rc.is_src_frame_ext_arf)
-#else
-  else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame)
-#endif  // CONFIG_EXT_REFS
     return ALTREF_FRAME;
-  else if (cpi->refresh_golden_frame ||
-#if CONFIG_EXT_REFS
-           cpi->refresh_alt2_ref_frame ||
-#endif  // CONFIG_EXT_REFS
+  else if (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
            cpi->refresh_alt_ref_frame)
     return GOLDEN_FRAME;
   else
@@ -4846,22 +4049,19 @@ static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
 }
 
 static TX_MODE select_tx_mode(const AV1_COMP *cpi) {
-  if (cpi->common.all_lossless) return ONLY_4X4;
-#if CONFIG_VAR_TX_NO_TX_MODE
-  return TX_MODE_SELECT;
-#else
+  if (cpi->common.coded_lossless) return ONLY_4X4;
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
-    return ALLOW_32X32 + CONFIG_TX64X64;
+    return TX_MODE_LARGEST;
   else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
-           cpi->sf.tx_size_search_method == USE_TX_8X8)
+           cpi->sf.tx_size_search_method == USE_FAST_RD)
     return TX_MODE_SELECT;
   else
     return cpi->common.tx_mode;
-#endif  // CONFIG_VAR_TX_NO_TX_MODE
 }
 
 void av1_init_tile_data(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   const int tile_cols = cm->tile_cols;
   const int tile_rows = cm->tile_rows;
   int tile_col, tile_row;
@@ -4886,29 +4086,23 @@ void av1_init_tile_data(AV1_COMP *cpi) {
             tile_data->mode_map[i][j] = j;
           }
         }
-#if CONFIG_PVQ
-        // This will be dynamically increased as more pvq block is encoded.
-        tile_data->pvq_q.buf_len = 1000;
-        CHECK_MEM_ERROR(
-            cm, tile_data->pvq_q.buf,
-            aom_malloc(tile_data->pvq_q.buf_len * sizeof(PVQ_INFO)));
-        tile_data->pvq_q.curr_pos = 0;
-#endif
       }
   }
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      TileInfo *const tile_info =
-          &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+      TileDataEnc *const tile_data =
+          &cpi->tile_data[tile_row * tile_cols + tile_col];
+      TileInfo *const tile_info = &tile_data->tile_info;
       av1_tile_init(tile_info, cm, tile_row, tile_col);
 
       cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
       pre_tok = cpi->tile_tok[tile_row][tile_col];
-      tile_tok = allocated_tokens(*tile_info);
-#if CONFIG_PVQ
-      cpi->tile_data[tile_row * tile_cols + tile_col].pvq_q.curr_pos = 0;
-#endif
+      tile_tok = allocated_tokens(
+          *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
+      tile_data->allow_update_cdf = !cm->large_scale_tile;
+      tile_data->allow_update_cdf =
+          tile_data->allow_update_cdf && !cm->disable_cdf_update;
     }
   }
 }
@@ -4922,134 +4116,35 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
   TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
   int mi_row;
 
-#if CONFIG_DEPENDENT_HORZTILES
-  if ((!cm->dependent_horz_tiles) || (tile_row == 0) ||
-      tile_info->tg_horz_boundary) {
-    av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end);
-  }
-#else
-  av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end);
-#endif
+  av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end,
+                         tile_row);
+  av1_init_above_context(cm, &td->mb.e_mbd, tile_row);
 
   // Set up pointers to per thread motion search counters.
   this_tile->m_search_count = 0;   // Count of motion search hits.
   this_tile->ex_search_count = 0;  // Exhaustive mesh search hits.
   td->mb.m_search_count_ptr = &this_tile->m_search_count;
   td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
-
-#if CONFIG_PVQ
-  td->mb.pvq_q = &this_tile->pvq_q;
-
-  // TODO(yushin) : activity masking info needs be signaled by a bitstream
-  td->mb.daala_enc.use_activity_masking = AV1_PVQ_ENABLE_ACTIVITY_MASKING;
-
-  if (td->mb.daala_enc.use_activity_masking)
-    td->mb.daala_enc.qm = OD_HVS_QM;  // Hard coded. Enc/dec required to sync.
-  else
-    td->mb.daala_enc.qm = OD_FLAT_QM;  // Hard coded. Enc/dec required to sync.
-
-  {
-    // FIXME: Multiple segments support
-    int segment_id = 0;
-    int rdmult = set_segment_rdmult(cpi, &td->mb, segment_id);
-    int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-#if CONFIG_HIGHBITDEPTH
-    const int quantizer_shift = td->mb.e_mbd.bd - 8;
-#else
-    const int quantizer_shift = 0;
-#endif  // CONFIG_HIGHBITDEPTH
-    int64_t q_ac = OD_MAXI(
-        1, av1_ac_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift);
-    int64_t q_dc = OD_MAXI(
-        1, av1_dc_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift);
-    /* td->mb.daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA; */
-    td->mb.daala_enc.pvq_norm_lambda =
-        (double)rdmult * (64 / 16) / (q_ac * q_ac * (1 << RDDIV_BITS));
-    td->mb.daala_enc.pvq_norm_lambda_dc =
-        (double)rdmult * (64 / 16) / (q_dc * q_dc * (1 << RDDIV_BITS));
-    // printf("%f\n", td->mb.daala_enc.pvq_norm_lambda);
-  }
-  od_init_qm(td->mb.daala_enc.state.qm, td->mb.daala_enc.state.qm_inv,
-             td->mb.daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
-
-  if (td->mb.daala_enc.use_activity_masking) {
-    int pli;
-    int use_masking = td->mb.daala_enc.use_activity_masking;
-    int segment_id = 0;
-    int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-
-    for (pli = 0; pli < MAX_MB_PLANE; pli++) {
-      int i;
-      int q;
-
-      q = qindex;
-      if (q <= OD_DEFAULT_QMS[use_masking][0][pli].interp_q << OD_COEFF_SHIFT) {
-        od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q,
-                     &OD_DEFAULT_QMS[use_masking][0][pli], NULL);
-      } else {
-        i = 0;
-        while (OD_DEFAULT_QMS[use_masking][i + 1][pli].qm_q4 != NULL &&
-               q > OD_DEFAULT_QMS[use_masking][i + 1][pli].interp_q
-                       << OD_COEFF_SHIFT) {
-          i++;
-        }
-        od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q,
-                     &OD_DEFAULT_QMS[use_masking][i][pli],
-                     &OD_DEFAULT_QMS[use_masking][i + 1][pli]);
-      }
-    }
-  }
-
-#if !CONFIG_ANS
-  od_ec_enc_init(&td->mb.daala_enc.w.ec, 65025);
-  od_ec_enc_reset(&td->mb.daala_enc.w.ec);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-#endif  // #if CONFIG_PVQ
-
   this_tile->tctx = *cm->fc;
   td->mb.e_mbd.tile_ctx = &this_tile->tctx;
 
-#if CONFIG_CFL
-  MACROBLOCKD *const xd = &td->mb.e_mbd;
-  xd->cfl = &this_tile->cfl;
-  cfl_init(xd->cfl, cm);
-#endif
-
-#if CONFIG_PVQ
-  td->mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
-#endif  // CONFIG_PVQ
+  cfl_init(&td->mb.e_mbd.cfl, cm);
 
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-  if (!cm->loop_filter_across_tiles_enabled)
-    av1_setup_across_tile_boundary_info(cm, tile_info);
-#endif
+  av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator);
 
-  av1_crc_calculator_init(&td->mb.tx_rd_record.crc_calculator, 24, 0x5D6DCB);
+  td->intrabc_used_this_tile = 0;
 
   for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
-       mi_row += cm->mib_size) {
+       mi_row += cm->seq_params.mib_size) {
     encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
   }
 
   cpi->tok_count[tile_row][tile_col] =
       (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
-  assert(cpi->tok_count[tile_row][tile_col] <= allocated_tokens(*tile_info));
-#if CONFIG_PVQ
-#if !CONFIG_ANS
-  od_ec_enc_clear(&td->mb.daala_enc.w.ec);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-
-  td->mb.pvq_q->last_pos = td->mb.pvq_q->curr_pos;
-  // rewind current position so that bitstream can be written
-  // from the 1st pvq block
-  td->mb.pvq_q->curr_pos = 0;
-
-  td->mb.pvq_q = NULL;
-#endif
+  assert(cpi->tok_count[tile_row][tile_col] <=
+         allocated_tokens(*tile_info,
+                          cm->seq_params.mib_size_log2 + MI_SIZE_LOG2,
+                          av1_num_planes(cm)));
 }
 
 static void encode_tiles(AV1_COMP *cpi) {
@@ -5058,9 +4153,12 @@ static void encode_tiles(AV1_COMP *cpi) {
 
   av1_init_tile_data(cpi);
 
-  for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row)
-    for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col)
+  for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
       av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
+      cpi->intrabc_used |= cpi->td.intrabc_used_this_tile;
+    }
+  }
 }
 
 #if CONFIG_FP_MB_STATS
@@ -5077,52 +4175,34 @@ static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
 }
 #endif
 
-#if CONFIG_GLOBAL_MOTION
 #define GLOBAL_TRANS_TYPES_ENC 3  // highest motion model to search
 static int gm_get_params_cost(const WarpedMotionParams *gm,
                               const WarpedMotionParams *ref_gm, int allow_hp) {
-  assert(gm->wmtype < GLOBAL_TRANS_TYPES);
   int params_cost = 0;
   int trans_bits, trans_prec_diff;
   switch (gm->wmtype) {
-    case HOMOGRAPHY:
-    case HORTRAPEZOID:
-    case VERTRAPEZOID:
-      if (gm->wmtype != HORTRAPEZOID)
-        params_cost += aom_count_signed_primitive_refsubexpfin(
-            GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
-            (ref_gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF),
-            (gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF));
-      if (gm->wmtype != VERTRAPEZOID)
-        params_cost += aom_count_signed_primitive_refsubexpfin(
-            GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
-            (ref_gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF),
-            (gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF));
-    // Fallthrough intended
     case AFFINE:
     case ROTZOOM:
       params_cost += aom_count_signed_primitive_refsubexpfin(
           GM_ALPHA_MAX + 1, SUBEXPFIN_K,
           (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),
           (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
-      if (gm->wmtype != VERTRAPEZOID)
+      params_cost += aom_count_signed_primitive_refsubexpfin(
+          GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+          (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+          (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+      if (gm->wmtype >= AFFINE) {
         params_cost += aom_count_signed_primitive_refsubexpfin(
             GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-            (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
-            (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
-      if (gm->wmtype >= AFFINE) {
-        if (gm->wmtype != HORTRAPEZOID)
-          params_cost += aom_count_signed_primitive_refsubexpfin(
-              GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-              (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
-              (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+            (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+            (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
         params_cost += aom_count_signed_primitive_refsubexpfin(
             GM_ALPHA_MAX + 1, SUBEXPFIN_K,
             (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
                 (1 << GM_ALPHA_PREC_BITS),
             (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
       }
-    // Fallthrough intended
+      AOM_FALLTHROUGH_INTENDED;
     case TRANSLATION:
       trans_bits = (gm->wmtype == TRANSLATION)
                        ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
@@ -5138,7 +4218,7 @@ static int gm_get_params_cost(const WarpedMotionParams *gm,
           (1 << trans_bits) + 1, SUBEXPFIN_K,
           (ref_gm->wmmat[1] >> trans_prec_diff),
           (gm->wmmat[1] >> trans_prec_diff));
-    // Fallthrough intended
+      AOM_FALLTHROUGH_INTENDED;
     case IDENTITY: break;
     default: assert(0);
   }
@@ -5152,26 +4232,16 @@ static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm,
   switch (sf->gm_search_type) {
     case GM_FULL_SEARCH: return 1;
     case GM_REDUCED_REF_SEARCH:
-#if CONFIG_EXT_REFS
       return !(frame == LAST2_FRAME || frame == LAST3_FRAME);
-#else
-      return (num_refs_using_gm < 2);
-#endif  // CONFIG_EXT_REFS
     case GM_DISABLE_SEARCH: return 0;
     default: assert(0);
   }
   return 1;
 }
-#endif  // CONFIG_GLOBAL_MOTION
 
-// TODO(anybody) : remove this flag when PVQ supports pallete coding tool
-#if !CONFIG_PVQ
 // Estimate if the source frame is screen content, based on the portion of
 // blocks that have no more than 4 (experimentally selected) luma colors.
-static int is_screen_content(const uint8_t *src,
-#if CONFIG_HIGHBITDEPTH
-                             int use_hbd, int bd,
-#endif  // CONFIG_HIGHBITDEPTH
+static int is_screen_content(const uint8_t *src, int use_hbd, int bd,
                              int stride, int width, int height) {
   assert(src != NULL);
   int counts = 0;
@@ -5180,20 +4250,198 @@ static int is_screen_content(const uint8_t *src,
   const int limit = 4;
   for (int r = 0; r + blk_h <= height; r += blk_h) {
     for (int c = 0; c + blk_w <= width; c += blk_w) {
+      int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
       const int n_colors =
-#if CONFIG_HIGHBITDEPTH
           use_hbd ? av1_count_colors_highbd(src + r * stride + c, stride, blk_w,
-                                            blk_h, bd)
-                  :
-#endif  // CONFIG_HIGHBITDEPTH
-                  av1_count_colors(src + r * stride + c, stride, blk_w, blk_h);
+                                            blk_h, bd, count_buf)
+                  : av1_count_colors(src + r * stride + c, stride, blk_w, blk_h,
+                                     count_buf);
       if (n_colors > 1 && n_colors <= limit) counts++;
     }
   }
   // The threshold is 10%.
   return counts * blk_h * blk_w * 10 > width * height;
 }
-#endif  // !CONFIG_PVQ
+
+// Enforce the number of references for each arbitrary frame limited to
+// (INTER_REFS_PER_FRAME - 1)
+static void enforce_max_ref_frames(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  static const int flag_list[REF_FRAMES] = { 0,
+                                             AOM_LAST_FLAG,
+                                             AOM_LAST2_FLAG,
+                                             AOM_LAST3_FLAG,
+                                             AOM_GOLD_FLAG,
+                                             AOM_BWD_FLAG,
+                                             AOM_ALT2_FLAG,
+                                             AOM_ALT_FLAG };
+  MV_REFERENCE_FRAME ref_frame;
+  int total_valid_refs = 0;
+
+  (void)flag_list;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) total_valid_refs++;
+  }
+
+  // NOTE(zoeliu): When all the possible reference frames are availble, we
+  // reduce the number of reference frames by 1, following the rules of:
+  // (1) Retain GOLDEN_FARME/ALTEF_FRAME;
+  // (2) Check the earliest 2 remaining reference frames, and remove the one
+  //     with the lower quality factor, otherwise if both have been coded at
+  //     the same quality level, remove the earliest reference frame.
+
+  if (total_valid_refs == INTER_REFS_PER_FRAME) {
+    unsigned int min_ref_offset = UINT_MAX;
+    unsigned int second_min_ref_offset = UINT_MAX;
+    MV_REFERENCE_FRAME earliest_ref_frames[2] = { LAST3_FRAME, LAST2_FRAME };
+    int earliest_buf_idxes[2] = { 0 };
+
+    // Locate the earliest two reference frames except GOLDEN/ALTREF.
+    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      // Retain GOLDEN/ALTERF
+      if (ref_frame == GOLDEN_FRAME || ref_frame == ALTREF_FRAME) continue;
+
+      const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
+      if (buf_idx >= 0) {
+        const unsigned int ref_offset =
+            cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+
+        if (min_ref_offset == UINT_MAX) {
+          min_ref_offset = ref_offset;
+          earliest_ref_frames[0] = ref_frame;
+          earliest_buf_idxes[0] = buf_idx;
+        } else {
+          if (get_relative_dist(cm, ref_offset, min_ref_offset) < 0) {
+            second_min_ref_offset = min_ref_offset;
+            earliest_ref_frames[1] = earliest_ref_frames[0];
+            earliest_buf_idxes[1] = earliest_buf_idxes[0];
+
+            min_ref_offset = ref_offset;
+            earliest_ref_frames[0] = ref_frame;
+            earliest_buf_idxes[0] = buf_idx;
+          } else if (second_min_ref_offset == UINT_MAX ||
+                     get_relative_dist(cm, ref_offset, second_min_ref_offset) <
+                         0) {
+            second_min_ref_offset = ref_offset;
+            earliest_ref_frames[1] = ref_frame;
+            earliest_buf_idxes[1] = buf_idx;
+          }
+        }
+      }
+    }
+    // Check the coding quality factors of the two earliest reference frames.
+    RATE_FACTOR_LEVEL ref_rf_level[2];
+    double ref_rf_deltas[2];
+    for (int i = 0; i < 2; ++i) {
+      ref_rf_level[i] = cpi->frame_rf_level[earliest_buf_idxes[i]];
+      ref_rf_deltas[i] = rate_factor_deltas[ref_rf_level[i]];
+    }
+    (void)ref_rf_level;
+    (void)ref_rf_deltas;
+
+#define USE_RF_LEVEL_TO_ENFORCE 1
+#if USE_RF_LEVEL_TO_ENFORCE
+    // If both earliest two reference frames are coded using the same rate-
+    // factor, disable the earliest reference frame; Otherwise disable the
+    // reference frame that uses a lower rate-factor delta.
+    const MV_REFERENCE_FRAME ref_frame_to_disable =
+        (ref_rf_deltas[0] <= ref_rf_deltas[1]) ? earliest_ref_frames[0]
+                                               : earliest_ref_frames[1];
+#else
+    // Always disable the earliest reference frame
+    const MV_REFERENCE_FRAME ref_frame_to_disable = earliest_ref_frames[0];
+#endif  // USE_RF_LEVEL_TO_ENFORCE
+#undef USE_RF_LEVEL_TO_ENFORCE
+
+    switch (ref_frame_to_disable) {
+      case LAST_FRAME: cpi->ref_frame_flags &= ~AOM_LAST_FLAG; break;
+      case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break;
+      case LAST3_FRAME: cpi->ref_frame_flags &= ~AOM_LAST3_FLAG; break;
+      case BWDREF_FRAME: cpi->ref_frame_flags &= ~AOM_BWD_FLAG; break;
+      case ALTREF2_FRAME: cpi->ref_frame_flags &= ~AOM_ALT2_FLAG; break;
+      default: break;
+    }
+  }
+}
+
+static INLINE int av1_refs_are_one_sided(const AV1_COMMON *cm) {
+  assert(!frame_is_intra_only(cm));
+
+  int one_sided_refs = 1;
+  for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
+    const int buf_idx = cm->frame_refs[ref].idx;
+    if (buf_idx == INVALID_IDX) continue;
+
+    const int ref_offset =
+        cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+    if (get_relative_dist(cm, ref_offset, (int)cm->frame_offset) > 0) {
+      one_sided_refs = 0;  // bwd reference
+      break;
+    }
+  }
+  return one_sided_refs;
+}
+
+static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm,
+                                             int ref_offset[2]) {
+  ref_offset[0] = ref_offset[1] = 0;
+  if (!cm->is_skip_mode_allowed) return;
+
+  const int buf_idx_0 = cm->frame_refs[cm->ref_frame_idx_0].idx;
+  const int buf_idx_1 = cm->frame_refs[cm->ref_frame_idx_1].idx;
+  assert(buf_idx_0 != INVALID_IDX && buf_idx_1 != INVALID_IDX);
+
+  ref_offset[0] = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
+  ref_offset[1] = cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset;
+}
+
+static int check_skip_mode_enabled(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  av1_setup_skip_mode_allowed(cm);
+  if (!cm->is_skip_mode_allowed) return 0;
+
+  // Turn off skip mode if the temporal distances of the reference pair to the
+  // current frame are different by more than 1 frame.
+  const int cur_offset = (int)cm->frame_offset;
+  int ref_offset[2];
+  get_skip_mode_ref_offsets(cm, ref_offset);
+  const int cur_to_ref0 = get_relative_dist(cm, cur_offset, ref_offset[0]);
+  const int cur_to_ref1 = abs(get_relative_dist(cm, cur_offset, ref_offset[1]));
+  if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0;
+
+  // High Latency: Turn off skip mode if all refs are fwd.
+  if (cpi->all_one_sided_refs && cpi->oxcf.lag_in_frames > 0) return 0;
+
+  static const int flag_list[REF_FRAMES] = { 0,
+                                             AOM_LAST_FLAG,
+                                             AOM_LAST2_FLAG,
+                                             AOM_LAST3_FLAG,
+                                             AOM_GOLD_FLAG,
+                                             AOM_BWD_FLAG,
+                                             AOM_ALT2_FLAG,
+                                             AOM_ALT_FLAG };
+  const int ref_frame[2] = { cm->ref_frame_idx_0 + LAST_FRAME,
+                             cm->ref_frame_idx_1 + LAST_FRAME };
+  if (!(cpi->ref_frame_flags & flag_list[ref_frame[0]]) ||
+      !(cpi->ref_frame_flags & flag_list[ref_frame[1]]))
+    return 0;
+
+  return 1;
+}
+
+// Function to decide if we can skip the global motion parameter computation
+// for a particular ref frame
+static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) {
+  if ((ref_frame == LAST3_FRAME || ref_frame == LAST2_FRAME) &&
+      cm->global_motion[GOLDEN_FRAME].wmtype != IDENTITY) {
+    return get_relative_dist(
+               cm, cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME],
+               cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0;
+  }
+  return 0;
+}
 
 static void encode_frame_internal(AV1_COMP *cpi) {
   ThreadData *const td = &cpi->td;
@@ -5202,16 +4450,9 @@ static void encode_frame_internal(AV1_COMP *cpi) {
   MACROBLOCKD *const xd = &x->e_mbd;
   RD_COUNTS *const rdc = &cpi->td.rd_counts;
   int i;
-#if CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS
-  const int last_fb_buf_idx = get_ref_frame_buf_idx(cpi, LAST_FRAME);
-#endif  // CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS
-
-#if CONFIG_ADAPT_SCAN
-  av1_deliver_eob_threshold(cm, xd);
-#endif
 
-  x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size);
-  x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size);
+  x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size);
+  x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size);
 #if CONFIG_DIST_8X8
   x->using_dist_8x8 = cpi->oxcf.using_dist_8x8;
   x->tune_metric = cpi->oxcf.tuning;
@@ -5225,23 +4466,29 @@ static void encode_frame_internal(AV1_COMP *cpi) {
   av1_zero(rdc->comp_pred_diff);
 
   if (frame_is_intra_only(cm)) {
-// TODO(anybody) : remove this flag when PVQ supports pallete coding tool
-#if !CONFIG_PVQ
-    cm->allow_screen_content_tools =
-        cpi->oxcf.content == AOM_CONTENT_SCREEN ||
-        is_screen_content(cpi->source->y_buffer,
-#if CONFIG_HIGHBITDEPTH
-                          cpi->source->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
-#endif  // CONFIG_HIGHBITDEPTH
-                          cpi->source->y_stride, cpi->source->y_width,
-                          cpi->source->y_height);
-#else
-    cm->allow_screen_content_tools = 0;
-#endif  // !CONFIG_PVQ
+    if (cm->seq_params.force_screen_content_tools == 2) {
+      cm->allow_screen_content_tools =
+          cpi->oxcf.content == AOM_CONTENT_SCREEN ||
+          is_screen_content(cpi->source->y_buffer,
+                            cpi->source->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+                            cpi->source->y_stride, cpi->source->y_width,
+                            cpi->source->y_height);
+    } else {
+      cm->allow_screen_content_tools =
+          cm->seq_params.force_screen_content_tools;
+    }
   }
 
-#if CONFIG_HASH_ME
-  if (cpi->oxcf.pass != 1 && cpi->common.allow_screen_content_tools) {
+  // Allow intrabc when screen content tools are enabled.
+  cm->allow_intrabc = cm->allow_screen_content_tools;
+  // Reset the flag.
+  cpi->intrabc_used = 0;
+  // Need to disable intrabc when superres is selected
+  if (av1_superres_scaled(cm)) {
+    cm->allow_intrabc = 0;
+  }
+
+  if (cpi->oxcf.pass != 1 && av1_use_hash_me(cm)) {
     // add to hash table
     const int pic_width = cpi->source->y_crop_width;
     const int pic_height = cpi->source->y_crop_height;
@@ -5295,6 +4542,13 @@ static void encode_frame_internal(AV1_COMP *cpi) {
         &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
         pic_width, pic_height, 64);
 
+    av1_generate_block_hash_value(cpi->source, 128, block_hash_values[1],
+                                  block_hash_values[0], is_block_same[1],
+                                  is_block_same[0]);
+    av1_add_to_hash_map_by_row_with_precal_data(
+        &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
+        pic_width, pic_height, 128);
+
     for (k = 0; k < 2; k++) {
       for (j = 0; j < 2; j++) {
         aom_free(block_hash_values[k][j]);
@@ -5305,18 +4559,71 @@ static void encode_frame_internal(AV1_COMP *cpi) {
       }
     }
   }
-#endif
 
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  alloc_ncobmc_pred_buffer(xd);
-#endif
+  for (i = 0; i < MAX_SEGMENTS; ++i) {
+    const int qindex = cm->seg.enabled
+                           ? av1_get_qindex(&cm->seg, i, cm->base_qindex)
+                           : cm->base_qindex;
+    xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
+                      cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
+                      cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
+    if (xd->lossless[i]) cpi->has_lossless_segment = 1;
+    xd->qindex[i] = qindex;
+    if (xd->lossless[i]) {
+      cpi->optimize_seg_arr[i] = 0;
+    } else {
+      cpi->optimize_seg_arr[i] = cpi->optimize_speed_feature;
+    }
+  }
+  cm->coded_lossless = is_coded_lossless(cm, xd);
+  cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
+
+  cm->tx_mode = select_tx_mode(cpi);
+
+  // Fix delta q resolution for the moment
+  cm->delta_q_res = DEFAULT_DELTA_Q_RES;
+  // Set delta_q_present_flag before it is used for the first time
+  cm->delta_lf_res = DEFAULT_DELTA_LF_RES;
+  cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
+  cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF;
+  cm->delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+  // update delta_q_present_flag and delta_lf_present_flag based on base_qindex
+  cm->delta_q_present_flag &= cm->base_qindex > 0;
+  cm->delta_lf_present_flag &= cm->base_qindex > 0;
+
+  av1_frame_init_quantizer(cpi);
+
+  av1_initialize_rd_consts(cpi);
+  av1_initialize_me_consts(cpi, x, cm->base_qindex);
+  init_encode_frame_mb_context(cpi);
+
+  if (cm->prev_frame)
+    cm->last_frame_seg_map = cm->prev_frame->seg_map;
+  else
+    cm->last_frame_seg_map = NULL;
+  cm->current_frame_seg_map = cm->cur_frame->seg_map;
+  if (cm->allow_intrabc || cm->coded_lossless) {
+    av1_set_default_ref_deltas(cm->lf.ref_deltas);
+    av1_set_default_mode_deltas(cm->lf.mode_deltas);
+  } else if (cm->prev_frame) {
+    memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+    memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
+  }
+  memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES);
+  memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+
+  // Special case: set prev_mi to NULL when the previous mode info
+  // context cannot be used.
+  cm->prev_mi = cm->allow_ref_frame_mvs ? cm->prev_mip : NULL;
+
+  x->txb_split_count = 0;
+  av1_zero(x->blk_skip_drl);
 
-#if CONFIG_GLOBAL_MOTION
   av1_zero(rdc->global_motion_used);
   av1_zero(cpi->gmparams_cost);
   if (cpi->common.frame_type == INTER_FRAME && cpi->source &&
       !cpi->global_motion_search_done) {
-    YV12_BUFFER_CONFIG *ref_buf[TOTAL_REFS_PER_FRAME];
+    YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
     int frame;
     double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)];
     const double *params_this_motion;
@@ -5327,32 +4634,31 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     };
     int num_refs_using_gm = 0;
 
-    for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+    for (frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
       ref_buf[frame] = get_ref_frame_buffer(cpi, frame);
       int pframe;
       cm->global_motion[frame] = default_warp_params;
       const WarpedMotionParams *ref_params =
-          cm->error_resilient_mode ? &default_warp_params
-                                   : &cm->prev_frame->global_motion[frame];
+          cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+                         : &default_warp_params;
       // check for duplicate buffer
-      for (pframe = LAST_FRAME; pframe < frame; ++pframe) {
+      for (pframe = ALTREF_FRAME; pframe > frame; --pframe) {
         if (ref_buf[frame] == ref_buf[pframe]) break;
       }
-      if (pframe < frame) {
+      if (pframe > frame) {
         memcpy(&cm->global_motion[frame], &cm->global_motion[pframe],
                sizeof(WarpedMotionParams));
       } else if (ref_buf[frame] &&
                  ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
                  ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
-                 do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame)) {
+                 do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame) &&
+                 !(cpi->sf.selective_ref_gm && skip_gm_frame(cm, frame))) {
         TransformationType model;
-        const int64_t ref_frame_error = av1_frame_error(
-#if CONFIG_HIGHBITDEPTH
-            xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
-#endif  // CONFIG_HIGHBITDEPTH
-            ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride,
-            cpi->source->y_buffer, cpi->source->y_width, cpi->source->y_height,
-            cpi->source->y_stride);
+        const int64_t ref_frame_error =
+            av1_frame_error(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+                            ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride,
+                            cpi->source->y_buffer, cpi->source->y_width,
+                            cpi->source->y_height, cpi->source->y_stride);
 
         if (ref_frame_error == 0) continue;
 
@@ -5366,10 +4672,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
           }
 
           compute_global_motion_feature_based(
-              model, cpi->source, ref_buf[frame],
-#if CONFIG_HIGHBITDEPTH
-              cpi->common.bit_depth,
-#endif  // CONFIG_HIGHBITDEPTH
+              model, cpi->source, ref_buf[frame], cpi->common.bit_depth,
               inliers_by_motion, params_by_motion, RANSAC_NUM_MOTIONS);
 
           for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
@@ -5381,9 +4684,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
             if (tmp_wm_params.wmtype != IDENTITY) {
               const int64_t warp_error = refine_integerized_param(
                   &tmp_wm_params, tmp_wm_params.wmtype,
-#if CONFIG_HIGHBITDEPTH
                   xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
-#endif  // CONFIG_HIGHBITDEPTH
                   ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
                   ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
                   cpi->source->y_buffer, cpi->source->y_width,
@@ -5418,7 +4719,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {
           if (!is_enough_erroradvantage(
                   (double)best_warp_error / ref_frame_error,
                   gm_get_params_cost(&cm->global_motion[frame], ref_params,
-                                     cm->allow_high_precision_mv))) {
+                                     cm->allow_high_precision_mv),
+                  cpi->sf.gm_erroradv_type)) {
             cm->global_motion[frame] = default_warp_params;
           }
           if (cm->global_motion[frame].wmtype != IDENTITY) break;
@@ -5435,91 +4737,14 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     cpi->global_motion_search_done = 1;
   }
   memcpy(cm->cur_frame->global_motion, cm->global_motion,
-         TOTAL_REFS_PER_FRAME * sizeof(WarpedMotionParams));
-#endif  // CONFIG_GLOBAL_MOTION
-
-  for (i = 0; i < MAX_SEGMENTS; ++i) {
-    const int qindex = cm->seg.enabled
-                           ? av1_get_qindex(&cm->seg, i, cm->base_qindex)
-                           : cm->base_qindex;
-    xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
-                      cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
-    xd->qindex[i] = qindex;
-  }
-  cm->all_lossless = all_lossless(cm, xd);
-  if (!cm->seg.enabled && xd->lossless[0]) x->optimize = 0;
+         REF_FRAMES * sizeof(WarpedMotionParams));
 
-  cm->tx_mode = select_tx_mode(cpi);
-
-  // Fix delta q resolution for the moment
-  cm->delta_q_res = DEFAULT_DELTA_Q_RES;
-// Set delta_q_present_flag before it is used for the first time
-#if CONFIG_EXT_DELTA_Q
-  cm->delta_lf_res = DEFAULT_DELTA_LF_RES;
-  // update delta_q_present_flag and delta_lf_present_flag based on base_qindex
-  cm->delta_q_present_flag &= cm->base_qindex > 0;
-  cm->delta_lf_present_flag &= cm->base_qindex > 0;
-#else
-  cm->delta_q_present_flag =
-      cpi->oxcf.aq_mode == DELTA_AQ && cm->base_qindex > 0;
-#endif  // CONFIG_EXT_DELTA_Q
-
-  av1_frame_init_quantizer(cpi);
-
-  av1_initialize_rd_consts(cpi);
-  av1_initialize_me_consts(cpi, x, cm->base_qindex);
-  init_encode_frame_mb_context(cpi);
-
-#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
-  // NOTE(zoeliu): As cm->prev_frame can take neither a frame of
-  //               show_exisiting_frame=1, nor can it take a frame not used as
-  //               a reference, it is probable that by the time it is being
-  //               referred to, the frame buffer it originally points to may
-  //               already get expired and have been reassigned to the current
-  //               newly coded frame. Hence, we need to check whether this is
-  //               the case, and if yes, we have 2 choices:
-  //               (1) Simply disable the use of previous frame mvs; or
-  //               (2) Have cm->prev_frame point to one reference frame buffer,
-  //                   e.g. LAST_FRAME.
-  if (!enc_is_ref_frame_buf(cpi, cm->prev_frame)) {
-    // Reassign the LAST_FRAME buffer to cm->prev_frame.
-    cm->prev_frame = last_fb_buf_idx != INVALID_IDX
-                         ? &cm->buffer_pool->frame_bufs[last_fb_buf_idx]
-                         : NULL;
-  }
-#endif  // CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
-
-#if CONFIG_TEMPMV_SIGNALING
-  cm->use_prev_frame_mvs &= frame_can_use_prev_frame_mvs(cm);
-#else
-  if (cm->prev_frame) {
-    cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
-#if CONFIG_FRAME_SUPERRES
-                             cm->width == cm->last_width &&
-                             cm->height == cm->last_height &&
-#else
-                             cm->width == cm->prev_frame->buf.y_crop_width &&
-                             cm->height == cm->prev_frame->buf.y_crop_height &&
-#endif  // CONFIG_FRAME_SUPERRES
-                             !cm->intra_only && cm->last_show_frame;
-  } else {
-    cm->use_prev_frame_mvs = 0;
-  }
-#endif  // CONFIG_TEMPMV_SIGNALING
-
-  // Special case: set prev_mi to NULL when the previous mode info
-  // context cannot be used.
-  cm->prev_mi =
-      cm->use_prev_frame_mvs ? cm->prev_mip + cm->mi_stride + 1 : NULL;
+  av1_setup_motion_field(cm);
 
-#if CONFIG_VAR_TX
-  x->txb_split_count = 0;
-  av1_zero(x->blk_skip_drl);
-#endif
+  cpi->all_one_sided_refs =
+      frame_is_intra_only(cm) ? 0 : av1_refs_are_one_sided(cm);
 
-#if CONFIG_MFMV
-  av1_setup_motion_field(cm);
-#endif  // CONFIG_MFMV
+  cm->skip_mode_flag = check_skip_mode_enabled(cpi);
 
   {
     struct aom_usec_timer emr_timer;
@@ -5532,7 +4757,9 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     }
 #endif
 
-    av1_setup_frame_boundary_info(cm);
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+    av1_inter_mode_data_init();
+#endif
 
     // If allowed, encoding tiles in parallel with one thread handling one tile.
     // TODO(geza.lore): The multi-threaded encoder is not safe with more than
@@ -5543,109 +4770,72 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     else
       encode_tiles(cpi);
 
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+#if INTER_MODE_RD_TEST
+    if (cpi->sf.inter_mode_rd_model_estimation) {
+      av1_inter_mode_data_show(cm);
+    }
+#endif
+#endif
+
     aom_usec_timer_mark(&emr_timer);
     cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer);
   }
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  free_ncobmc_pred_buffer(xd);
-#endif
-
-#if 0
-  // Keep record of the total distortion this time around for future use
-  cpi->last_frame_distortion = cpi->frame_distortion;
-#endif
-}
 
-static void make_consistent_compound_tools(AV1_COMMON *cm) {
-  (void)cm;
-#if CONFIG_INTERINTRA
-  if (frame_is_intra_only(cm) || cm->reference_mode == COMPOUND_REFERENCE)
-    cm->allow_interintra_compound = 0;
-#endif  // CONFIG_INTERINTRA
-#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-#if CONFIG_COMPOUND_SINGLEREF
-  if (frame_is_intra_only(cm))
-#else   // !CONFIG_COMPOUND_SINGLEREF
-  if (frame_is_intra_only(cm) || cm->reference_mode == SINGLE_REFERENCE)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    cm->allow_masked_compound = 0;
-#endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+  // If intrabc is allowed but never selected, reset the allow_intrabc flag.
+  if (cm->allow_intrabc && !cpi->intrabc_used) cm->allow_intrabc = 0;
+  if (cm->allow_intrabc) cm->delta_lf_present_flag = 0;
 }
 
 void av1_encode_frame(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-#if CONFIG_EXT_TX
+  const int num_planes = av1_num_planes(cm);
   // Indicates whether or not to use a default reduced set for ext-tx
   // rather than the potential full set of 16 transforms
   cm->reduced_tx_set_used = 0;
-#endif  // CONFIG_EXT_TX
-#if CONFIG_ADAPT_SCAN
-  cm->use_adapt_scan = 1;
-  // TODO(angiebird): call av1_init_scan_order only when use_adapt_scan
-  // switches from 1 to 0
-  if (cm->use_adapt_scan == 0) av1_init_scan_order(cm);
-#endif
 
-#if CONFIG_FRAME_MARKER
   if (cm->show_frame == 0) {
     int arf_offset = AOMMIN(
         (MAX_GF_INTERVAL - 1),
         cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]);
-#if CONFIG_EXT_REFS
     int brf_offset =
         cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index];
     arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
-#endif  // CONFIG_EXT_REFS
     cm->frame_offset = cm->current_video_frame + arf_offset;
   } else {
     cm->frame_offset = cm->current_video_frame;
   }
-  av1_setup_frame_buf_refs(cm);
-#if CONFIG_FRAME_SIGN_BIAS
-  av1_setup_frame_sign_bias(cm);
-#endif  // CONFIG_FRAME_SIGN_BIAS
-#endif  // CONFIG_FRAME_MARKER
-
-  // In the longer term the encoder should be generalized to match the
-  // decoder such that we allow compound where one of the 3 buffers has a
-  // different sign bias and that buffer is then the fixed ref. However, this
-  // requires further work in the rd loop. For now the only supported encoder
-  // side behavior is where the ALT ref buffer has opposite sign bias to
-  // the other two.
-  if (!frame_is_intra_only(cm)) {
-#if !CONFIG_ONE_SIDED_COMPOUND
-    if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
-         cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
-        (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
-         cm->ref_frame_sign_bias[LAST_FRAME])) {
-      cpi->allow_comp_inter_inter = 0;
-    } else {
-#endif  // !CONFIG_ONE_SIDED_COMPOUND
-      cpi->allow_comp_inter_inter = 1;
-#if CONFIG_EXT_REFS
-      cm->comp_fwd_ref[0] = LAST_FRAME;
-      cm->comp_fwd_ref[1] = LAST2_FRAME;
-      cm->comp_fwd_ref[2] = LAST3_FRAME;
-      cm->comp_fwd_ref[3] = GOLDEN_FRAME;
-      cm->comp_bwd_ref[0] = BWDREF_FRAME;
-      cm->comp_bwd_ref[1] = ALTREF2_FRAME;
-      cm->comp_bwd_ref[2] = ALTREF_FRAME;
-#else                           // !CONFIG_EXT_REFS
-    cm->comp_fixed_ref = ALTREF_FRAME;
-    cm->comp_var_ref[0] = LAST_FRAME;
-    cm->comp_var_ref[1] = GOLDEN_FRAME;
-#endif                          // CONFIG_EXT_REFS
-#if !CONFIG_ONE_SIDED_COMPOUND  // Normative in encoder
+  cm->frame_offset %= (1 << (cm->seq_params.order_hint_bits_minus_1 + 1));
+
+  // Make sure segment_id is no larger than last_active_segid.
+  if (cm->seg.enabled && cm->seg.update_map) {
+    const int mi_rows = cm->mi_rows;
+    const int mi_cols = cm->mi_cols;
+    const int last_active_segid = cm->seg.last_active_segid;
+    uint8_t *map = cpi->segmentation_map;
+    for (int mi_row = 0; mi_row < mi_rows; ++mi_row) {
+      for (int mi_col = 0; mi_col < mi_cols; ++mi_col) {
+        map[mi_col] = AOMMIN(map[mi_col], last_active_segid);
+      }
+      map += mi_cols;
     }
-#endif  // !CONFIG_ONE_SIDED_COMPOUND
-  } else {
-    cpi->allow_comp_inter_inter = 0;
   }
 
+  av1_setup_frame_buf_refs(cm);
+  if (cpi->sf.selective_ref_frame >= 2) enforce_max_ref_frames(cpi);
+  av1_setup_frame_sign_bias(cm);
+
+#if CONFIG_MISMATCH_DEBUG
+  mismatch_reset_frame(num_planes);
+#else
+  (void)num_planes;
+#endif
+
+  cpi->allow_comp_inter_inter = !frame_is_intra_only(cm);
+
   if (cpi->sf.frame_parameter_update) {
     int i;
     RD_OPT *const rd_opt = &cpi->rd;
-    FRAME_COUNTS *counts = cpi->td.counts;
     RD_COUNTS *const rdc = &cpi->td.rd_counts;
 
     // This code does a single RD pass over the whole frame assuming
@@ -5662,39 +4852,20 @@ void av1_encode_frame(AV1_COMP *cpi) {
     int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
     const int is_alt_ref = frame_type == ALTREF_FRAME;
 
-/* prediction (compound, single or hybrid) mode selection */
-#if CONFIG_REF_ADAPT
-    // NOTE(zoeliu): "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames
+    /* prediction (compound, single or hybrid) mode selection */
+    // NOTE: "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames
     if (is_alt_ref || !cpi->allow_comp_inter_inter)
       cm->reference_mode = SINGLE_REFERENCE;
     else
       cm->reference_mode = REFERENCE_MODE_SELECT;
-#else
-#if CONFIG_BGSPRITE
-    (void)is_alt_ref;
-    if (!cpi->allow_comp_inter_inter)
-#else
-    if (is_alt_ref || !cpi->allow_comp_inter_inter)
-#endif  // CONFIG_BGSPRITE
-      cm->reference_mode = SINGLE_REFERENCE;
-    else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
-             mode_thrs[COMPOUND_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT] &&
-             check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
-      cm->reference_mode = COMPOUND_REFERENCE;
-    else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT])
-      cm->reference_mode = SINGLE_REFERENCE;
-    else
-      cm->reference_mode = REFERENCE_MODE_SELECT;
-#endif  // CONFIG_REF_ADAPT
 
-#if CONFIG_DUAL_FILTER
     cm->interp_filter = SWITCHABLE;
-#endif
+    if (cm->large_scale_tile) cm->interp_filter = EIGHTTAP_REGULAR;
 
-    make_consistent_compound_tools(cm);
+    cm->switchable_motion_mode = 1;
 
-    rdc->single_ref_used_flag = 0;
     rdc->compound_ref_used_flag = 0;
+    rdc->skip_mode_used_flag = 0;
 
     encode_frame_internal(cpi);
 
@@ -5705,406 +4876,124 @@ void av1_encode_frame(AV1_COMP *cpi) {
       // Use a flag that includes 4x4 blocks
       if (rdc->compound_ref_used_flag == 0) {
         cm->reference_mode = SINGLE_REFERENCE;
-        av1_zero(counts->comp_inter);
-#if !CONFIG_REF_ADAPT
-        // Use a flag that includes 4x4 blocks
-      } else if (rdc->single_ref_used_flag == 0) {
-        cm->reference_mode = COMPOUND_REFERENCE;
-        av1_zero(counts->comp_inter);
-#endif  // !CONFIG_REF_ADAPT
-      }
-    }
-    make_consistent_compound_tools(cm);
-
-#if CONFIG_VAR_TX
-#if CONFIG_RECT_TX_EXT
-    if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0 &&
-        counts->quarter_tx_size[1] == 0)
-#else
-    if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0)
-#endif
-      cm->tx_mode = ALLOW_32X32 + CONFIG_TX64X64;
-#else
-#if CONFIG_RECT_TX_EXT && CONFIG_EXT_TX
-    if (cm->tx_mode == TX_MODE_SELECT && counts->quarter_tx_size[1] == 0)
-#else
-    if (cm->tx_mode == TX_MODE_SELECT)
-#endif
-    {
-#if CONFIG_TX64X64
-      int count4x4 = 0;
-      int count8x8_8x8p = 0, count8x8_lp = 0;
-      int count16x16_16x16p = 0, count16x16_lp = 0;
-      int count32x32_32x32p = 0, count32x32_lp = 0;
-      int count64x64_64x64p = 0;
-      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-        int depth;
-        // counts->tx_size[max_depth][context_idx][this_depth_level]
-        depth = tx_size_to_depth(TX_4X4);
-        count4x4 += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth];
-        count4x4 += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth];
-        count4x4 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth];
-        count4x4 += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth];
-
-        depth = tx_size_to_depth(TX_8X8);
-        count8x8_8x8p += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth];
-        count8x8_lp += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth];
-        count8x8_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth];
-        count8x8_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth];
-
-        depth = tx_size_to_depth(TX_16X16);
-        count16x16_16x16p +=
-            counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth];
-        count16x16_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth];
-        count16x16_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth];
-
-        depth = tx_size_to_depth(TX_32X32);
-        count32x32_32x32p +=
-            counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth];
-        count32x32_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth];
-
-        depth = tx_size_to_depth(TX_64X64);
-        count64x64_64x64p +=
-            counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth];
-      }
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-      count4x4 += counts->tx_size_implied[TX_4X4][TX_4X4];
-      count4x4 += counts->tx_size_implied[TX_8X8][TX_4X4];
-      count4x4 += counts->tx_size_implied[TX_16X16][TX_4X4];
-      count4x4 += counts->tx_size_implied[TX_32X32][TX_4X4];
-      count8x8_8x8p += counts->tx_size_implied[TX_8X8][TX_8X8];
-      count8x8_lp += counts->tx_size_implied[TX_16X16][TX_8X8];
-      count8x8_lp += counts->tx_size_implied[TX_32X32][TX_8X8];
-      count8x8_lp += counts->tx_size_implied[TX_64X64][TX_8X8];
-      count16x16_16x16p += counts->tx_size_implied[TX_16X16][TX_16X16];
-      count16x16_lp += counts->tx_size_implied[TX_32X32][TX_16X16];
-      count16x16_lp += counts->tx_size_implied[TX_64X64][TX_16X16];
-      count32x32_32x32p += counts->tx_size_implied[TX_32X32][TX_32X32];
-      count32x32_lp += counts->tx_size_implied[TX_64X64][TX_32X32];
-      count64x64_64x64p += counts->tx_size_implied[TX_64X64][TX_64X64];
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
-      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
-          count32x32_lp == 0 && count32x32_32x32p == 0 &&
-#if CONFIG_SUPERTX
-          cm->counts.supertx_size[TX_16X16] == 0 &&
-          cm->counts.supertx_size[TX_32X32] == 0 &&
-          cm->counts.supertx_size[TX_64X64] == 0 &&
-#endif
-          count64x64_64x64p == 0) {
-        cm->tx_mode = ALLOW_8X8;
-        reset_skip_tx_size(cm, TX_8X8);
-      } else if (count8x8_8x8p == 0 && count8x8_lp == 0 &&
-                 count16x16_16x16p == 0 && count16x16_lp == 0 &&
-                 count32x32_32x32p == 0 && count32x32_lp == 0 &&
-#if CONFIG_SUPERTX
-                 cm->counts.supertx_size[TX_8X8] == 0 &&
-                 cm->counts.supertx_size[TX_16X16] == 0 &&
-                 cm->counts.supertx_size[TX_32X32] == 0 &&
-                 cm->counts.supertx_size[TX_64X64] == 0 &&
-#endif
-                 count64x64_64x64p == 0) {
-        cm->tx_mode = ONLY_4X4;
-        reset_skip_tx_size(cm, TX_4X4);
-      } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 &&
-                 count32x32_lp == 0) {
-        cm->tx_mode = ALLOW_64X64;
-      } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 &&
-#if CONFIG_SUPERTX
-                 cm->counts.supertx_size[TX_64X64] == 0 &&
-#endif
-                 count64x64_64x64p == 0) {
-        cm->tx_mode = ALLOW_32X32;
-        reset_skip_tx_size(cm, TX_32X32);
-      } else if (count4x4 == 0 && count8x8_lp == 0 && count32x32_lp == 0 &&
-                 count32x32_32x32p == 0 &&
-#if CONFIG_SUPERTX
-                 cm->counts.supertx_size[TX_32X32] == 0 &&
-                 cm->counts.supertx_size[TX_64X64] == 0 &&
-#endif
-                 count64x64_64x64p == 0) {
-        cm->tx_mode = ALLOW_16X16;
-        reset_skip_tx_size(cm, TX_16X16);
-      }
-
-#else  // CONFIG_TX64X64
-
-      int count4x4 = 0;
-      int count8x8_lp = 0, count8x8_8x8p = 0;
-      int count16x16_16x16p = 0, count16x16_lp = 0;
-      int count32x32 = 0;
-      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-        int depth;
-        // counts->tx_size[max_depth][context_idx][this_depth_level]
-        depth = tx_size_to_depth(TX_4X4);
-        count4x4 += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth];
-        count4x4 += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth];
-        count4x4 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth];
-
-        depth = tx_size_to_depth(TX_8X8);
-        count8x8_8x8p += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth];
-        count8x8_lp += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth];
-        count8x8_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth];
-
-        depth = tx_size_to_depth(TX_16X16);
-        count16x16_16x16p +=
-            counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth];
-        count16x16_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth];
-
-        depth = tx_size_to_depth(TX_32X32);
-        count32x32 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth];
-      }
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-      count4x4 += counts->tx_size_implied[TX_4X4][TX_4X4];
-      count4x4 += counts->tx_size_implied[TX_8X8][TX_4X4];
-      count4x4 += counts->tx_size_implied[TX_16X16][TX_4X4];
-      count4x4 += counts->tx_size_implied[TX_32X32][TX_4X4];
-      count8x8_8x8p += counts->tx_size_implied[TX_8X8][TX_8X8];
-      count8x8_lp += counts->tx_size_implied[TX_16X16][TX_8X8];
-      count8x8_lp += counts->tx_size_implied[TX_32X32][TX_8X8];
-      count16x16_16x16p += counts->tx_size_implied[TX_16X16][TX_16X16];
-      count16x16_lp += counts->tx_size_implied[TX_32X32][TX_16X16];
-      count32x32 += counts->tx_size_implied[TX_32X32][TX_32X32];
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
-      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
-#if CONFIG_SUPERTX
-          cm->counts.supertx_size[TX_16X16] == 0 &&
-          cm->counts.supertx_size[TX_32X32] == 0 &&
-#endif  // CONFIG_SUPERTX
-          count32x32 == 0) {
-        cm->tx_mode = ALLOW_8X8;
-        reset_skip_tx_size(cm, TX_8X8);
-      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
-                 count8x8_lp == 0 && count16x16_lp == 0 &&
-#if CONFIG_SUPERTX
-                 cm->counts.supertx_size[TX_8X8] == 0 &&
-                 cm->counts.supertx_size[TX_16X16] == 0 &&
-                 cm->counts.supertx_size[TX_32X32] == 0 &&
-#endif  // CONFIG_SUPERTX
-                 count32x32 == 0) {
-        cm->tx_mode = ONLY_4X4;
-        reset_skip_tx_size(cm, TX_4X4);
-      } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
-        cm->tx_mode = ALLOW_32X32;
-      } else if (count32x32 == 0 && count8x8_lp == 0 &&
-#if CONFIG_SUPERTX
-                 cm->counts.supertx_size[TX_32X32] == 0 &&
-#endif  // CONFIG_SUPERTX
-                 count4x4 == 0) {
-        cm->tx_mode = ALLOW_16X16;
-        reset_skip_tx_size(cm, TX_16X16);
-      }
-#endif  // CONFIG_TX64X64
-    }
-#endif
-  } else {
-    make_consistent_compound_tools(cm);
-    encode_frame_internal(cpi);
-  }
-}
-
-static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,
-                            const MODE_INFO *mi, const MODE_INFO *above_mi,
-                            const MODE_INFO *left_mi, const int intraonly,
-                            const int mi_row, const int mi_col) {
-  FRAME_CONTEXT *fc = xd->tile_ctx;
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const PREDICTION_MODE y_mode = mbmi->mode;
-  const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
-  (void)counts;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int unify_bsize = CONFIG_CB4X4;
-
-  if (bsize < BLOCK_8X8 && !unify_bsize) {
-    int idx, idy;
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
-    const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
-    for (idy = 0; idy < 2; idy += num_4x4_h)
-      for (idx = 0; idx < 2; idx += num_4x4_w) {
-        const int bidx = idy * 2 + idx;
-        const PREDICTION_MODE bmode = mi->bmi[bidx].as_mode;
-        if (intraonly) {
 #if CONFIG_ENTROPY_STATS
-          const PREDICTION_MODE a = av1_above_block_mode(mi, above_mi, bidx);
-          const PREDICTION_MODE l = av1_left_block_mode(mi, left_mi, bidx);
-          ++counts->kf_y_mode[a][l][bmode];
+        av1_zero(cpi->td.counts->comp_inter);
 #endif  // CONFIG_ENTROPY_STATS
-          update_cdf(get_y_mode_cdf(fc, mi, above_mi, left_mi, bidx), bmode,
-                     INTRA_MODES);
-        } else {
-#if CONFIG_ENTROPY_STATS
-          ++counts->y_mode[0][bmode];
-#endif  // CONFIG_ENTROPY_STATS
-          update_cdf(fc->y_mode_cdf[0], bmode, INTRA_MODES);
-        }
       }
-  } else {
-    if (intraonly) {
-#if CONFIG_ENTROPY_STATS
-      const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, 0);
-      const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, 0);
-      ++counts->kf_y_mode[above][left][y_mode];
-#endif  // CONFIG_ENTROPY_STATS
-      update_cdf(get_y_mode_cdf(fc, mi, above_mi, left_mi, 0), y_mode,
-                 INTRA_MODES);
-    } else {
-#if CONFIG_ENTROPY_STATS
-      ++counts->y_mode[size_group_lookup[bsize]][y_mode];
-#endif  // CONFIG_ENTROPY_STATS
-      update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
     }
-
-#if CONFIG_FILTER_INTRA
-    if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0) {
-      const int use_filter_intra_mode =
-          mbmi->filter_intra_mode_info.use_filter_intra_mode[0];
-      ++counts->filter_intra[0][use_filter_intra_mode];
-    }
-    if (mbmi->uv_mode == UV_DC_PRED
-#if CONFIG_CB4X4
-        &&
-        is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                            xd->plane[1].subsampling_y)
-#endif
-        && mbmi->palette_mode_info.palette_size[1] == 0) {
-      const int use_filter_intra_mode =
-          mbmi->filter_intra_mode_info.use_filter_intra_mode[1];
-      ++counts->filter_intra[1][use_filter_intra_mode];
+    // Re-check on the skip mode status as reference mode may have been changed.
+    if (frame_is_intra_only(cm) || cm->reference_mode == SINGLE_REFERENCE) {
+      cm->is_skip_mode_allowed = 0;
+      cm->skip_mode_flag = 0;
     }
-#endif  // CONFIG_FILTER_INTRA
-#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-    if (av1_is_directional_mode(mbmi->mode, bsize)) {
-      const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
-      const int p_angle =
-          mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
-      if (av1_is_intra_filter_switchable(p_angle))
-        ++counts->intra_filter[intra_filter_ctx][mbmi->intra_filter];
+    if (cm->skip_mode_flag && rdc->skip_mode_used_flag == 0)
+      cm->skip_mode_flag = 0;
+
+    if (!cm->large_scale_tile) {
+      if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0)
+        cm->tx_mode = TX_MODE_LARGEST;
     }
-#endif  // CONFIG_INTRA_INTERP && CONFIG_INTRA_INTERP
+  } else {
+    encode_frame_internal(cpi);
   }
-
-#if CONFIG_CB4X4
-  if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                           xd->plane[1].subsampling_y))
-    return;
-#else
-  (void)mi_row;
-  (void)mi_col;
-  (void)xd;
-#endif
-#if CONFIG_ENTROPY_STATS
-  ++counts->uv_mode[y_mode][uv_mode];
-#endif  // CONFIG_ENTROPY_STATS
-  update_cdf(fc->uv_mode_cdf[y_mode], uv_mode, UV_INTRA_MODES);
 }
 
-#if CONFIG_VAR_TX
 static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
                               FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
-                              int blk_row, int blk_col) {
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const int tx_row = blk_row >> 1;
-  const int tx_col = blk_col >> 1;
-  const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
-  const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+                              int blk_row, int blk_col,
+                              uint8_t allow_update_cdf) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
   int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
                                    xd->left_txfm_context + blk_row,
                                    mbmi->sb_type, tx_size);
-  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
   assert(tx_size > TX_4X4);
 
   if (depth == MAX_VARTX_DEPTH) {
-// Don't add to counts in this case
-#if CONFIG_RECT_TX_EXT
-    if (tx_size == plane_tx_size)
-#endif
-      mbmi->tx_size = tx_size;
+    // Don't add to counts in this case
+    mbmi->tx_size = tx_size;
     txfm_partition_update(xd->above_txfm_context + blk_col,
                           xd->left_txfm_context + blk_row, tx_size, tx_size);
     return;
   }
 
-#if CONFIG_RECT_TX_EXT
-  if (tx_size == plane_tx_size ||
-      mbmi->tx_size == quarter_txsize_lookup[mbmi->sb_type])
-#else
-  if (tx_size == plane_tx_size)
-#endif
-  {
+  if (tx_size == plane_tx_size) {
+#if CONFIG_ENTROPY_STATS
     ++counts->txfm_partition[ctx][0];
-#if CONFIG_RECT_TX_EXT
-    if (tx_size == plane_tx_size)
 #endif
-      mbmi->tx_size = tx_size;
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
+    mbmi->tx_size = tx_size;
     txfm_partition_update(xd->above_txfm_context + blk_col,
                           xd->left_txfm_context + blk_row, tx_size, tx_size);
   } else {
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bs = tx_size_wide_unit[sub_txs];
-    int i;
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
 
+#if CONFIG_ENTROPY_STATS
     ++counts->txfm_partition[ctx][1];
+#endif
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
     ++x->txb_split_count;
 
     if (sub_txs == TX_4X4) {
-      mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
       mbmi->tx_size = TX_4X4;
       txfm_partition_update(xd->above_txfm_context + blk_col,
                             xd->left_txfm_context + blk_row, TX_4X4, tx_size);
       return;
     }
 
-    for (i = 0; i < 4; ++i) {
-      int offsetr = (i >> 1) * bs;
-      int offsetc = (i & 0x01) * bs;
-      update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
-                        blk_col + offsetc);
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        int offsetr = row;
+        int offsetc = col;
+
+        update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+                          blk_col + offsetc, allow_update_cdf);
+      }
     }
   }
 }
 
 static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
                                       BLOCK_SIZE plane_bsize, int mi_row,
-                                      int mi_col, FRAME_COUNTS *td_counts) {
+                                      int mi_col, FRAME_COUNTS *td_counts,
+                                      uint8_t allow_update_cdf) {
   MACROBLOCKD *xd = &x->e_mbd;
   const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-  const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
-  TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize, 0);
+  const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
   int idx, idy;
-  int init_depth =
-      (mi_height != mi_width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT;
-
-#if CONFIG_INTRABC
-  // Intrabc doesn't support var-tx yet. So no need to update tx partition
-  // info., except for the split count (otherwise common->tx_mode may be
-  // modified, causing mismatch).
-  if (is_intrabc_block(&x->e_mbd.mi[0]->mbmi)) {
-    if (x->e_mbd.mi[0]->mbmi.tx_size != max_tx_size) ++x->txb_split_count;
-    return;
-  }
-#endif  // CONFIG_INTRABC
 
-  xd->above_txfm_context =
-      cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2);
-  xd->left_txfm_context = xd->left_txfm_context_buffer +
-                          ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2);
+  xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
   for (idy = 0; idy < mi_height; idy += bh)
     for (idx = 0; idx < mi_width; idx += bw)
-      update_txfm_count(x, xd, td_counts, max_tx_size, init_depth, idy, idx);
+      update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
+                        allow_update_cdf);
 }
 
 static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
                              int blk_col) {
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const int tx_row = blk_row >> 1;
-  const int tx_col = blk_col >> 1;
-  const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
-  const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
-  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -6114,23 +5003,23 @@ static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
                           xd->left_txfm_context + blk_row, tx_size, tx_size);
 
   } else {
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsl = tx_size_wide_unit[sub_txs];
-    int i;
-
     if (tx_size == TX_8X8) {
-      mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
       mbmi->tx_size = TX_4X4;
       txfm_partition_update(xd->above_txfm_context + blk_col,
                             xd->left_txfm_context + blk_row, TX_4X4, tx_size);
       return;
     }
-
-    assert(bsl > 0);
-    for (i = 0; i < 4; ++i) {
-      int offsetr = (i >> 1) * bsl;
-      int offsetc = (i & 0x01) * bsl;
-      set_txfm_context(xd, sub_txs, blk_row + offsetr, blk_col + offsetc);
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+        set_txfm_context(xd, sub_txs, offsetr, offsetc);
+      }
     }
   }
 }
@@ -6140,214 +5029,94 @@ static void tx_partition_set_contexts(const AV1_COMMON *const cm,
                                       int mi_row, int mi_col) {
   const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
   const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
-  TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize, 0);
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
   int idx, idy;
 
-  xd->above_txfm_context =
-      cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2);
-  xd->left_txfm_context = xd->left_txfm_context_buffer +
-                          ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2);
+  xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
   for (idy = 0; idy < mi_height; idy += bh)
     for (idx = 0; idx < mi_width; idx += bw)
       set_txfm_context(xd, max_tx_size, idy, idx);
 }
-#endif
 
-void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
-#if CONFIG_TXK_SEL
-                              int blk_row, int blk_col, int block, int plane,
-#endif
-                              BLOCK_SIZE bsize, TX_SIZE tx_size,
-                              FRAME_COUNTS *counts) {
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  int is_inter = is_inter_block(mbmi);
-  FRAME_CONTEXT *fc = xd->tile_ctx;
-#if !CONFIG_ENTROPY_STATS
-  (void)counts;
-#endif  // !CONFIG_ENTROPY_STATS
-
-#if !CONFIG_TXK_SEL
-  TX_TYPE tx_type = mbmi->tx_type;
-#else
-  (void)blk_row;
-  (void)blk_col;
-  // Only y plane's tx_type is updated
-  if (plane > 0) return;
-  TX_TYPE tx_type =
-      av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, block, tx_size);
-#endif
-#if CONFIG_EXT_TX
-  if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
-      cm->base_qindex > 0 && !mbmi->skip &&
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    const int eset =
-        get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
-    if (eset > 0) {
-#if !CONFIG_LGT_FROM_PRED
-      const TxSetType tx_set_type = get_ext_tx_set_type(
-          tx_size, bsize, is_inter, cm->reduced_tx_set_used);
-      if (is_inter) {
-        update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
-                   av1_ext_tx_ind[tx_set_type][tx_type],
-                   av1_num_ext_tx_set[tx_set_type]);
-#if CONFIG_ENTROPY_STATS
-        ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-      } else {
-#if CONFIG_ENTROPY_STATS
-        ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode]
-                              [tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-        update_cdf(
-            fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][mbmi->mode],
-            av1_ext_tx_ind[tx_set_type][tx_type],
-            av1_num_ext_tx_set[tx_set_type]);
-      }
-#else
-      (void)tx_type;
-      (void)fc;
-      if (is_inter) {
-        if (LGT_FROM_PRED_INTER) {
-          if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used)
-            ++counts->inter_lgt[txsize_sqr_map[tx_size]][mbmi->use_lgt];
-#if CONFIG_ENTROPY_STATS
-          if (!mbmi->use_lgt)
-            ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type];
-          else
-#endif  // CONFIG_ENTROPY_STATS
-            mbmi->tx_type = DCT_DCT;
-        } else {
-#if CONFIG_ENTROPY_STATS
-          ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-        }
-      } else {
-        if (LGT_FROM_PRED_INTRA) {
-          if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used)
-            ++counts->intra_lgt[txsize_sqr_map[tx_size]][mbmi->mode]
-                               [mbmi->use_lgt];
-#if CONFIG_ENTROPY_STATS
-          if (!mbmi->use_lgt)
-            ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode]
-                                  [tx_type];
-          else
-#endif  // CONFIG_ENTROPY_STATS
-            mbmi->tx_type = DCT_DCT;
-        } else {
-#if CONFIG_ENTROPY_STATS
-          ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode]
-                                [tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-        }
-      }
-#endif  // CONFIG_LGT_FROM_PRED
-    }
-  }
-#else
-  (void)bsize;
-  if (tx_size < TX_32X32 &&
-      ((!cm->seg.enabled && cm->base_qindex > 0) ||
-       (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
-      !mbmi->skip &&
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    if (is_inter) {
-#if CONFIG_ENTROPY_STATS
-      ++counts->inter_ext_tx[tx_size][tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-      update_cdf(fc->inter_ext_tx_cdf[tx_size], av1_ext_tx_ind[tx_type],
-                 TX_TYPES);
-    } else {
-#if CONFIG_ENTROPY_STATS
-      ++counts->intra_ext_tx[tx_size][intra_mode_to_tx_type_context[mbmi->mode]]
-                            [tx_type];
-#endif  // CONFIG_ENTROPY_STATS
-      update_cdf(
-          fc->intra_ext_tx_cdf[tx_size]
-                              [intra_mode_to_tx_type_context[mbmi->mode]],
-          av1_ext_tx_ind[tx_type], TX_TYPES);
-    }
-  }
-#endif  // CONFIG_EXT_TX
-}
-
-static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
-                              TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
-                              int mi_col, BLOCK_SIZE bsize, int *rate) {
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                              ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              int *rate) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO **mi_8x8 = xd->mi;
-  MODE_INFO *mi = mi_8x8[0];
-  MB_MODE_INFO *mbmi = &mi->mbmi;
+  MB_MODE_INFO **mi_4x4 = xd->mi;
+  MB_MODE_INFO *mbmi = mi_4x4[0];
   const int seg_skip =
       segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
   const int mis = cm->mi_stride;
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
   const int is_inter = is_inter_block(mbmi);
-#if CONFIG_CB4X4
-  const BLOCK_SIZE block_size = bsize;
-#else
-  const BLOCK_SIZE block_size = AOMMAX(bsize, BLOCK_8X8);
-#endif
 
-#if CONFIG_PVQ
-  x->pvq_speed = 0;
-  x->pvq_coded = (dry_run == OUTPUT_ENABLED) ? 1 : 0;
-#endif
+  if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
+      x->cb_partition_scan) {
+    for (int row = mi_row; row < mi_row + mi_width;
+         row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+      for (int col = mi_col; col < mi_col + mi_height;
+           col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+        const int index = av1_first_partition_pass_stats_index(row, col);
+        FIRST_PARTITION_PASS_STATS *const stats =
+            &x->first_partition_pass_stats[index];
+        // Increase the counter of data samples.
+        ++stats->sample_counts;
+        // Increase the counter for ref_frame[0] and ref_frame[1].
+        if (stats->ref0_counts[mbmi->ref_frame[0]] < 255)
+          ++stats->ref0_counts[mbmi->ref_frame[0]];
+        if (mbmi->ref_frame[1] >= 0 &&
+            stats->ref1_counts[mbmi->ref_frame[0]] < 255)
+          ++stats->ref1_counts[mbmi->ref_frame[1]];
+      }
+    }
+  }
 
   if (!is_inter) {
-#if CONFIG_CFL
-    xd->cfl->store_y = 1;
-#endif  // CONFIG_CFL
-    int plane;
+    xd->cfl.is_chroma_reference = is_chroma_reference(
+        mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
+    xd->cfl.store_y = store_cfl_required(cm, xd);
     mbmi->skip = 1;
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      av1_encode_intra_block_plane((AV1_COMMON *)cm, x, block_size, plane, 1,
+    for (int plane = 0; plane < num_planes; ++plane) {
+      av1_encode_intra_block_plane(cpi, x, bsize, plane,
+                                   cpi->optimize_seg_arr[mbmi->segment_id],
                                    mi_row, mi_col);
     }
-#if CONFIG_CFL
-    xd->cfl->store_y = 0;
-#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-    if (is_chroma_reference(mi_row, mi_col, bsize, xd->cfl->subsampling_x,
-                            xd->cfl->subsampling_y) &&
-        !xd->cfl->are_parameters_computed) {
-      cfl_clear_sub8x8_val(xd->cfl);
-    }
-#endif  // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-#endif  // CONFIG_CFL
-    if (!dry_run) {
-      sum_intra_stats(td->counts, xd, mi, xd->above_mi, xd->left_mi,
-                      frame_is_intra_only(cm), mi_row, mi_col);
-    }
 
-// TODO(anybody) : remove this flag when PVQ supports pallete coding tool
-#if !CONFIG_PVQ
-    if (bsize >= BLOCK_8X8) {
-      for (plane = 0; plane <= 1; ++plane) {
+    // If there is at least one lossless segment, force the skip for intra
+    // block to be 0, in order to avoid the segment_id to be changed by in
+    // write_segment_id().
+    if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map &&
+        cpi->has_lossless_segment)
+      mbmi->skip = 0;
+
+    xd->cfl.store_y = 0;
+    if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) {
+      for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) {
         if (mbmi->palette_mode_info.palette_size[plane] > 0) {
-          if (!dry_run)
-            av1_tokenize_color_map(x, plane, 0, t, bsize, mbmi->tx_size,
-                                   PALETTE_MAP);
-          else if (dry_run == DRY_RUN_COSTCOEFFS)
-            rate += av1_cost_color_map(x, plane, 0, bsize, mbmi->tx_size,
-                                       PALETTE_MAP);
+          if (!dry_run) {
+            av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size,
+                                   PALETTE_MAP, tile_data->allow_update_cdf,
+                                   td->counts);
+          } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+            rate +=
+                av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP);
+          }
         }
       }
     }
-#endif  // !CONFIG_PVQ
 
-#if CONFIG_VAR_TX
-    mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
-#endif
-#if CONFIG_LV_MAP
-    av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col);
-#else   // CONFIG_LV_MAP
-    av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col);
-#endif  // CONFIG_LV_MAP
+    av1_update_txb_context(cpi, td, dry_run, bsize, rate, mi_row, mi_col,
+                           tile_data->allow_update_cdf);
   } else {
     int ref;
     const int is_compound = has_second_ref(mbmi);
@@ -6355,123 +5124,66 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
-#if CONFIG_INTRABC
       assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
-#else
-      assert(cfg != NULL);
-#endif  // !CONFIG_INTRABC
       av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
-                           &xd->block_refs[ref]->sf);
+                           &xd->block_refs[ref]->sf, num_planes);
     }
-#if CONFIG_COMPOUND_SINGLEREF
-    // Single ref compound mode
-    if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) {
-      xd->block_refs[1] = xd->block_refs[0];
-      YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[0]);
-#if CONFIG_INTRABC
-      assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
-#else
-      assert(cfg != NULL);
-#endif  // !CONFIG_INTRABC
-      av1_setup_pre_planes(xd, 1, cfg, mi_row, mi_col, &xd->block_refs[1]->sf);
-    }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, block_size);
 
-#if !CONFIG_NCOBMC_ADAPT_WEIGHT
-#if CONFIG_MOTION_VAR
-    if (mbmi->motion_mode == OBMC_CAUSAL) {
-#if CONFIG_NCOBMC
-      if (dry_run == OUTPUT_ENABLED)
-        av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-      else
-#endif
-        av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-    }
-#endif  // CONFIG_MOTION_VAR
-#else
-    if (mbmi->motion_mode == OBMC_CAUSAL) {
+    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+    if (mbmi->motion_mode == OBMC_CAUSAL)
       av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-    } else if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT &&
-               dry_run == OUTPUT_ENABLED) {
-      int p;
-      for (p = 0; p < MAX_MB_PLANE; ++p) {
-        get_pred_from_intrpl_buf(xd, mi_row, mi_col, block_size, p);
+
+#if CONFIG_MISMATCH_DEBUG
+    if (dry_run == OUTPUT_ENABLED) {
+      for (int plane = 0; plane < num_planes; ++plane) {
+        const struct macroblockd_plane *pd = &xd->plane[plane];
+        int pixel_c, pixel_r;
+        mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+                        pd->subsampling_x, pd->subsampling_y);
+        if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                                 pd->subsampling_y))
+          continue;
+        mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset,
+                                  plane, pixel_c, pixel_r, pd->width,
+                                  pd->height,
+                                  xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
       }
     }
-#endif
-
-    av1_encode_sb((AV1_COMMON *)cm, x, block_size, mi_row, mi_col);
-#if CONFIG_VAR_TX
-    if (mbmi->skip) mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
-    av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, block_size,
-                          rate);
 #else
-#if CONFIG_LV_MAP
-    av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col);
-#else   // CONFIG_LV_MAP
-    av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col);
-#endif  // CONFIG_LV_MAP
+    (void)num_planes;
 #endif
-  }
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
-    dist_8x8_set_sub8x8_dst(x, (uint8_t *)x->decoded_8x8, bsize,
-                            block_size_wide[bsize], block_size_high[bsize],
-                            mi_row, mi_col);
+    av1_encode_sb(cpi, x, bsize, mi_row, mi_col, dry_run);
+    av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, bsize, rate,
+                          tile_data->allow_update_cdf);
   }
-#endif
 
   if (!dry_run) {
-#if CONFIG_VAR_TX
-    TX_SIZE tx_size =
-        is_inter && !mbmi->skip ? mbmi->min_tx_size : mbmi->tx_size;
-#else
-    TX_SIZE tx_size = mbmi->tx_size;
-#endif
+    if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi))
+      td->intrabc_used_this_tile = 1;
     if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id] &&
-#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
-        mbmi->sb_type > BLOCK_4X4 &&
-#else
-        mbmi->sb_type >= BLOCK_8X8 &&
-#endif
-        !(is_inter && (mbmi->skip || seg_skip))) {
-#if CONFIG_VAR_TX
+        mbmi->sb_type > BLOCK_4X4 && !(is_inter && (mbmi->skip || seg_skip))) {
       if (is_inter) {
-        tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts);
+        tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts,
+                                  tile_data->allow_update_cdf);
       } else {
-        const int tx_size_ctx = get_tx_size_context(xd);
-        const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
-                                             : intra_tx_size_cat_lookup[bsize];
-        const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
-        const int depth = tx_size_to_depth(coded_tx_size);
-        ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
-        if (tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count;
-      }
-#else
-      const int tx_size_ctx = get_tx_size_context(xd);
-      const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
-                                           : intra_tx_size_cat_lookup[bsize];
-      const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
-      const int depth = tx_size_to_depth(coded_tx_size);
-
-      ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
+        if (mbmi->tx_size != max_txsize_rect_lookup[bsize])
+          ++x->txb_split_count;
+        if (block_signals_txsize(bsize)) {
+          const int tx_size_ctx = get_tx_size_context(xd);
+          const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+          const int depth = tx_size_to_depth(mbmi->tx_size, bsize);
+          const int max_depths = bsize_to_max_depth(bsize);
+
+          if (tile_data->allow_update_cdf)
+            update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+                       depth, max_depths + 1);
+#if CONFIG_ENTROPY_STATS
+          ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth];
 #endif
-
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-      if (is_quarter_tx_allowed(xd, mbmi, is_inter) &&
-          quarter_txsize_lookup[bsize] != max_txsize_rect_lookup[bsize] &&
-          (mbmi->tx_size == quarter_txsize_lookup[bsize] ||
-           mbmi->tx_size == max_txsize_rect_lookup[bsize])) {
-        ++td->counts
-              ->quarter_tx_size[mbmi->tx_size == quarter_txsize_lookup[bsize]];
+        }
       }
-#endif
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-      assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
+      assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
     } else {
       int i, j;
       TX_SIZE intra_tx_size;
@@ -6480,43 +5192,22 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
         if (xd->lossless[mbmi->segment_id]) {
           intra_tx_size = TX_4X4;
         } else {
-          intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1);
+          intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
         }
       } else {
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-        intra_tx_size = tx_size;
-#else
-        intra_tx_size = (bsize >= BLOCK_8X8) ? tx_size : TX_4X4;
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
+        intra_tx_size = mbmi->tx_size;
       }
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-      ++td->counts->tx_size_implied[max_txsize_lookup[bsize]]
-                                   [txsize_sqr_up_map[tx_size]];
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
 
       for (j = 0; j < mi_height; j++)
         for (i = 0; i < mi_width; i++)
           if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows)
-            mi_8x8[mis * j + i]->mbmi.tx_size = intra_tx_size;
+            mi_4x4[mis * j + i]->tx_size = intra_tx_size;
 
-#if CONFIG_VAR_TX
-      mbmi->min_tx_size = get_min_tx_size(intra_tx_size);
       if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count;
-#endif
     }
-
-#if !CONFIG_TXK_SEL
-    av1_update_tx_type_count(cm, xd, bsize, tx_size, td->counts);
-#endif
   }
 
-#if CONFIG_VAR_TX
-  if (cm->tx_mode == TX_MODE_SELECT &&
-#if CONFIG_CB4X4
-      mbmi->sb_type > BLOCK_4X4 &&
-#else
-      mbmi->sb_type >= BLOCK_8X8 &&
-#endif
+  if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type) &&
       is_inter && !(mbmi->skip || seg_skip) &&
       !xd->lossless[mbmi->segment_id]) {
     if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
@@ -6527,1137 +5218,20 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
       if (xd->lossless[mbmi->segment_id]) {
         tx_size = TX_4X4;
       } else {
-        tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, is_inter);
+        tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
       }
     } else {
       tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
     }
     mbmi->tx_size = tx_size;
-    set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, (mbmi->skip || seg_skip), xd);
-  }
-#endif  // CONFIG_VAR_TX
-#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8
-  CFL_CTX *const cfl = xd->cfl;
-#if CONFIG_DEBUG
-  if (is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
-                          cfl->subsampling_y) &&
-      !cfl->are_parameters_computed) {
-    cfl_clear_sub8x8_val(cfl);
-  }
-#endif  // CONFIG_DEBUG
+    set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h,
+                  (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd);
+  }
+  CFL_CTX *const cfl = &xd->cfl;
   if (is_inter_block(mbmi) &&
       !is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
-                           cfl->subsampling_y)) {
+                           cfl->subsampling_y) &&
+      is_cfl_allowed(xd)) {
     cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
   }
-#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8
-}
-
-#if CONFIG_SUPERTX
-static int check_intra_b(PICK_MODE_CONTEXT *ctx) {
-  if (!is_inter_mode((&ctx->mic)->mbmi.mode)) return 1;
-  if (ctx->mic.mbmi.ref_frame[1] == INTRA_FRAME) return 1;
-  return 0;
-}
-
-static int check_intra_sb(const AV1_COMP *const cpi, const TileInfo *const tile,
-                          int mi_row, int mi_col, BLOCK_SIZE bsize,
-                          PC_TREE *pc_tree) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int hbs = mi_size_wide[bsize] / 2;
-  const PARTITION_TYPE partition = pc_tree->partitioning;
-  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
-#if CONFIG_EXT_PARTITION_TYPES
-  int i;
-#endif
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-
-#if !CONFIG_CB4X4
-  assert(bsize >= BLOCK_8X8);
-#endif
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return 1;
-
-  switch (partition) {
-    case PARTITION_NONE: return check_intra_b(&pc_tree->none); break;
-    case PARTITION_VERT:
-      if (check_intra_b(&pc_tree->vertical[0])) return 1;
-      if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) {
-        if (check_intra_b(&pc_tree->vertical[1])) return 1;
-      }
-      break;
-    case PARTITION_HORZ:
-      if (check_intra_b(&pc_tree->horizontal[0])) return 1;
-      if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) {
-        if (check_intra_b(&pc_tree->horizontal[1])) return 1;
-      }
-      break;
-    case PARTITION_SPLIT:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        if (check_intra_b(pc_tree->leaf_split[0])) return 1;
-      } else {
-        if (check_intra_sb(cpi, tile, mi_row, mi_col, subsize,
-                           pc_tree->split[0]))
-          return 1;
-        if (check_intra_sb(cpi, tile, mi_row, mi_col + hbs, subsize,
-                           pc_tree->split[1]))
-          return 1;
-        if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col, subsize,
-                           pc_tree->split[2]))
-          return 1;
-        if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col + hbs, subsize,
-                           pc_tree->split[3]))
-          return 1;
-      }
-      break;
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-#error HORZ/VERT_A/B partitions not yet updated in superres code
-#endif
-    case PARTITION_HORZ_A:
-      for (i = 0; i < 3; i++) {
-        if (check_intra_b(&pc_tree->horizontala[i])) return 1;
-      }
-      break;
-    case PARTITION_HORZ_B:
-      for (i = 0; i < 3; i++) {
-        if (check_intra_b(&pc_tree->horizontalb[i])) return 1;
-      }
-      break;
-    case PARTITION_VERT_A:
-      for (i = 0; i < 3; i++) {
-        if (check_intra_b(&pc_tree->verticala[i])) return 1;
-      }
-      break;
-    case PARTITION_VERT_B:
-      for (i = 0; i < 3; i++) {
-        if (check_intra_b(&pc_tree->verticalb[i])) return 1;
-      }
-      break;
-#endif  // CONFIG_EXT_PARTITION_TYPES
-    default: assert(0);
-  }
-  return 0;
-}
-
-static int check_supertx_b(TX_SIZE supertx_size, PICK_MODE_CONTEXT *ctx) {
-  return ctx->mic.mbmi.tx_size == supertx_size;
-}
-
-static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
-                            PC_TREE *pc_tree) {
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-
-  partition = pc_tree->partitioning;
-  subsize = get_subsize(bsize, partition);
-  switch (partition) {
-    case PARTITION_NONE: return check_supertx_b(supertx_size, &pc_tree->none);
-    case PARTITION_VERT:
-      return check_supertx_b(supertx_size, &pc_tree->vertical[0]);
-    case PARTITION_HORZ:
-      return check_supertx_b(supertx_size, &pc_tree->horizontal[0]);
-    case PARTITION_SPLIT:
-      if (bsize == BLOCK_8X8 && !unify_bsize)
-        return check_supertx_b(supertx_size, pc_tree->leaf_split[0]);
-      else
-        return check_supertx_sb(subsize, supertx_size, pc_tree->split[0]);
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-#error HORZ/VERT_A/B partitions not yet updated in superres code
-#endif
-    case PARTITION_HORZ_A:
-      return check_supertx_b(supertx_size, &pc_tree->horizontala[0]);
-    case PARTITION_HORZ_B:
-      return check_supertx_b(supertx_size, &pc_tree->horizontalb[0]);
-    case PARTITION_VERT_A:
-      return check_supertx_b(supertx_size, &pc_tree->verticala[0]);
-    case PARTITION_VERT_B:
-      return check_supertx_b(supertx_size, &pc_tree->verticalb[0]);
-#endif  // CONFIG_EXT_PARTITION_TYPES
-    default: assert(0); return 0;
-  }
-}
-
-static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
-                               int mi_row_ori, int mi_col_ori, int mi_row_pred,
-                               int mi_col_pred, int plane,
-                               BLOCK_SIZE bsize_pred, int b_sub8x8, int block) {
-  // Used in supertx
-  // (mi_row_ori, mi_col_ori): location for mv
-  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *mi_8x8 = xd->mi[0];
-  MODE_INFO *mi = mi_8x8;
-  MB_MODE_INFO *mbmi = &mi->mbmi;
-  int ref;
-  const int is_compound = has_second_ref(mbmi);
-
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
-    av1_setup_pre_planes(xd, ref, cfg, mi_row_pred, mi_col_pred,
-                         &xd->block_refs[ref]->sf);
-  }
-
-#if CONFIG_COMPOUND_SINGLEREF
-  // Single ref compound mode
-  if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) {
-    xd->block_refs[1] = xd->block_refs[0];
-    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[0]);
-    av1_setup_pre_planes(xd, 1, cfg, mi_row_pred, mi_col_pred,
-                         &xd->block_refs[1]->sf);
-  }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-  if (!b_sub8x8)
-    av1_build_inter_predictor_sb_extend(cm, xd, mi_row_ori, mi_col_ori,
-                                        mi_row_pred, mi_col_pred, plane,
-                                        bsize_pred);
-  else
-    av1_build_inter_predictor_sb_sub8x8_extend(cm, xd, mi_row_ori, mi_col_ori,
-                                               mi_row_pred, mi_col_pred, plane,
-                                               bsize_pred, block);
-}
-
-static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td,
-                             const TileInfo *const tile, int block,
-                             int mi_row_ori, int mi_col_ori, int mi_row_pred,
-                             int mi_col_pred, int mi_row_top, int mi_col_top,
-                             int plane, uint8_t *dst_buf, int dst_stride,
-                             BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred,
-                             RUN_TYPE dry_run, int b_sub8x8) {
-  // Used in supertx
-  // (mi_row_ori, mi_col_ori): location for mv
-  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
-  // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
-  // block: sub location of sub8x8 blocks
-  // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
-  // bextend: 1: region to predict is an extension of ori; 0: not
-
-  MACROBLOCK *const x = &td->mb;
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int r = (mi_row_pred - mi_row_top) * MI_SIZE;
-  int c = (mi_col_pred - mi_col_top) * MI_SIZE;
-  const int mi_width_top = mi_size_wide[bsize_top];
-  const int mi_height_top = mi_size_high[bsize_top];
-
-  if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
-      mi_row_pred >= mi_row_top + mi_height_top ||
-      mi_col_pred >= mi_col_top + mi_width_top || mi_row_pred >= cm->mi_rows ||
-      mi_col_pred >= cm->mi_cols)
-    return;
-
-  set_offsets_extend(cpi, td, tile, mi_row_pred, mi_col_pred, mi_row_ori,
-                     mi_col_ori, bsize_pred);
-  xd->plane[plane].dst.stride = dst_stride;
-  xd->plane[plane].dst.buf =
-      dst_buf + (r >> xd->plane[plane].subsampling_y) * dst_stride +
-      (c >> xd->plane[plane].subsampling_x);
-
-  predict_superblock(cpi, td, mi_row_ori, mi_col_ori, mi_row_pred, mi_col_pred,
-                     plane, bsize_pred, b_sub8x8, block);
-
-  if (!dry_run && (plane == 0) && (block == 0 || !b_sub8x8))
-    update_stats(&cpi->common, td, mi_row_pred, mi_col_pred, 1);
-}
-
-static void extend_dir(const AV1_COMP *const cpi, ThreadData *td,
-                       const TileInfo *const tile, int block, BLOCK_SIZE bsize,
-                       BLOCK_SIZE top_bsize, int mi_row_ori, int mi_col_ori,
-                       int mi_row, int mi_col, int mi_row_top, int mi_col_top,
-                       int plane, uint8_t *dst_buf, int dst_stride, int dir) {
-  // dir: 0-lower, 1-upper, 2-left, 3-right
-  //      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
-  MACROBLOCKD *xd = &td->mb.e_mbd;
-  const int mi_width = mi_size_wide[bsize];
-  const int mi_height = mi_size_high[bsize];
-  int xss = xd->plane[1].subsampling_x;
-  int yss = xd->plane[1].subsampling_y;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-  int b_sub8x8 = (bsize < BLOCK_8X8) && !unify_bsize ? 1 : 0;
-  int wide_unit, high_unit;
-  int i, j;
-  int ext_offset = 0;
-
-  BLOCK_SIZE extend_bsize;
-  int mi_row_pred, mi_col_pred;
-
-  if (dir == 0 || dir == 1) {  // lower and upper
-    extend_bsize =
-        (mi_width == mi_size_wide[BLOCK_8X8] || bsize < BLOCK_8X8 || xss < yss)
-            ? BLOCK_8X8
-            : BLOCK_16X8;
-
-#if CONFIG_CB4X4
-    if (bsize < BLOCK_8X8) {
-      extend_bsize = BLOCK_4X4;
-      ext_offset = mi_size_wide[BLOCK_8X8];
-    }
-#endif
-    wide_unit = mi_size_wide[extend_bsize];
-    high_unit = mi_size_high[extend_bsize];
-
-    mi_row_pred = mi_row + ((dir == 0) ? mi_height : -(mi_height + ext_offset));
-    mi_col_pred = mi_col;
-
-    for (j = 0; j < mi_height + ext_offset; j += high_unit)
-      for (i = 0; i < mi_width + ext_offset; i += wide_unit)
-        predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori,
-                         mi_row_pred + j, mi_col_pred + i, mi_row_top,
-                         mi_col_top, plane, dst_buf, dst_stride, top_bsize,
-                         extend_bsize, 1, b_sub8x8);
-  } else if (dir == 2 || dir == 3) {  // left and right
-    extend_bsize =
-        (mi_height == mi_size_high[BLOCK_8X8] || bsize < BLOCK_8X8 || yss < xss)
-            ? BLOCK_8X8
-            : BLOCK_8X16;
-#if CONFIG_CB4X4
-    if (bsize < BLOCK_8X8) {
-      extend_bsize = BLOCK_4X4;
-      ext_offset = mi_size_wide[BLOCK_8X8];
-    }
-#endif
-    wide_unit = mi_size_wide[extend_bsize];
-    high_unit = mi_size_high[extend_bsize];
-
-    mi_row_pred = mi_row;
-    mi_col_pred = mi_col + ((dir == 3) ? mi_width : -(mi_width + ext_offset));
-
-    for (j = 0; j < mi_height + ext_offset; j += high_unit)
-      for (i = 0; i < mi_width + ext_offset; i += wide_unit)
-        predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori,
-                         mi_row_pred + j, mi_col_pred + i, mi_row_top,
-                         mi_col_top, plane, dst_buf, dst_stride, top_bsize,
-                         extend_bsize, 1, b_sub8x8);
-  } else {
-    extend_bsize = BLOCK_8X8;
-#if CONFIG_CB4X4
-    if (bsize < BLOCK_8X8) {
-      extend_bsize = BLOCK_4X4;
-      ext_offset = mi_size_wide[BLOCK_8X8];
-    }
-#endif
-    wide_unit = mi_size_wide[extend_bsize];
-    high_unit = mi_size_high[extend_bsize];
-
-    mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height
-                                                   : -(mi_height + ext_offset));
-    mi_col_pred =
-        mi_col + ((dir == 6 || dir == 7) ? mi_width : -(mi_width + ext_offset));
-
-    for (j = 0; j < mi_height + ext_offset; j += high_unit)
-      for (i = 0; i < mi_width + ext_offset; i += wide_unit)
-        predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori,
-                         mi_row_pred + j, mi_col_pred + i, mi_row_top,
-                         mi_col_top, plane, dst_buf, dst_stride, top_bsize,
-                         extend_bsize, 1, b_sub8x8);
-  }
-}
-
-static void extend_all(const AV1_COMP *const cpi, ThreadData *td,
-                       const TileInfo *const tile, int block, BLOCK_SIZE bsize,
-                       BLOCK_SIZE top_bsize, int mi_row_ori, int mi_col_ori,
-                       int mi_row, int mi_col, int mi_row_top, int mi_col_top,
-                       int plane, uint8_t *dst_buf, int dst_stride) {
-  assert(block >= 0 && block < 4);
-  for (int i = 0; i < 8; ++i) {
-    extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row_ori, mi_col_ori,
-               mi_row, mi_col, mi_row_top, mi_col_top, plane, dst_buf,
-               dst_stride, i);
-  }
-}
-
-// This function generates prediction for multiple blocks, between which
-// discontinuity around boundary is reduced by smoothing masks. The basic
-// smoothing mask is a soft step function along horz/vert direction. In more
-// complicated case when a block is split into 4 subblocks, the basic mask is
-// first applied to neighboring subblocks (2 pairs) in horizontal direction and
-// then applied to the 2 masked prediction mentioned above in vertical direction
-// If the block is split into more than one level, at every stage, masked
-// prediction is stored in dst_buf[] passed from higher level.
-static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
-                               const TileInfo *const tile, int mi_row,
-                               int mi_col, int mi_row_top, int mi_col_top,
-                               RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                               BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
-                               int dst_stride[3], PC_TREE *pc_tree) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int hbs = mi_size_wide[bsize] / 2;
-  const int is_partition_root = bsize >= BLOCK_8X8;
-  const int ctx = is_partition_root
-                      ? partition_plane_context(xd, mi_row, mi_col,
-#if CONFIG_UNPOISON_PARTITION_CTX
-                                                mi_row + hbs < cm->mi_rows,
-                                                mi_col + hbs < cm->mi_cols,
-#endif
-                                                bsize)
-                      : -1;
-  const PARTITION_TYPE partition = pc_tree->partitioning;
-  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
-#if CONFIG_EXT_PARTITION_TYPES
-  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
-#endif
-
-  int i;
-  uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
-  int dst_stride1[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
-  int dst_stride2[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
-  int dst_stride3[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-  assert(bsize >= BLOCK_8X8);
-#endif
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
-
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    int len = sizeof(uint16_t);
-    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
-    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len);
-    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len);
-    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
-    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len);
-    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len);
-    dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
-    dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len);
-    dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len);
-  } else {
-#endif  // CONFIG_HIGHBITDEPTH
-    dst_buf1[0] = tmp_buf1;
-    dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE;
-    dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE;
-    dst_buf2[0] = tmp_buf2;
-    dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE;
-    dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE;
-    dst_buf3[0] = tmp_buf3;
-    dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE;
-    dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE;
-#if CONFIG_HIGHBITDEPTH
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-
-  if (!dry_run && ctx >= 0 && bsize < top_bsize) {
-    // Explicitly cast away const.
-    FRAME_COUNTS *const frame_counts = (FRAME_COUNTS *)&cm->counts;
-    frame_counts->partition[ctx][partition]++;
-  }
-
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    xd->plane[i].dst.buf = dst_buf[i];
-    xd->plane[i].dst.stride = dst_stride[i];
-  }
-
-  switch (partition) {
-    case PARTITION_NONE:
-      assert(bsize < top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; ++i) {
-        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i],
-                         top_bsize, bsize, dry_run, 0);
-        extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row,
-                   mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                   dst_stride[i]);
-      }
-      break;
-    case PARTITION_HORZ:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        for (i = 0; i < MAX_MB_PLANE; ++i) {
-          // First half
-          predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i],
-                           top_bsize, BLOCK_8X8, dry_run, 1);
-          if (bsize < top_bsize)
-            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                       dst_stride[i]);
-
-          // Second half
-          predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, i, dst_buf1[i],
-                           dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1);
-          if (bsize < top_bsize)
-            extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
-                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i],
-                       dst_stride1[i]);
-        }
-
-        // Smooth
-        xd->plane[0].dst.buf = dst_buf[0];
-        xd->plane[0].dst.stride = dst_stride[0];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
-            0);
-      } else {
-        for (i = 0; i < MAX_MB_PLANE; ++i) {
-#if CONFIG_CB4X4
-          const struct macroblockd_plane *pd = &xd->plane[i];
-          int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
-              subsize, pd->subsampling_x, pd->subsampling_y);
-
-          if (handle_chroma_sub8x8) {
-            int mode_offset_row = CONFIG_CHROMA_SUB8X8 ? hbs : 0;
-
-            predict_b_extend(cpi, td, tile, 0, mi_row + mode_offset_row, mi_col,
-                             mi_row, mi_col, mi_row_top, mi_col_top, i,
-                             dst_buf[i], dst_stride[i], top_bsize, bsize,
-                             dry_run, 0);
-            if (bsize < top_bsize)
-              extend_all(cpi, td, tile, 0, bsize, top_bsize,
-                         mi_row + mode_offset_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i]);
-          } else {
-#endif
-            // First half
-            predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                             mi_row_top, mi_col_top, i, dst_buf[i],
-                             dst_stride[i], top_bsize, subsize, dry_run, 0);
-            if (bsize < top_bsize)
-              extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                         dst_stride[i]);
-            else
-              extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                         dst_stride[i], 0);
-            xd->plane[i].dst.buf = dst_buf[i];
-            xd->plane[i].dst.stride = dst_stride[i];
-
-            if (mi_row + hbs < cm->mi_rows) {
-              // Second half
-              predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col,
-                               mi_row + hbs, mi_col, mi_row_top, mi_col_top, i,
-                               dst_buf1[i], dst_stride1[i], top_bsize, subsize,
-                               dry_run, 0);
-              if (bsize < top_bsize)
-                extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
-                           mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top,
-                           i, dst_buf1[i], dst_stride1[i]);
-              else
-                extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
-                           mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top,
-                           i, dst_buf1[i], dst_stride1[i], 1);
-              // Smooth
-              xd->plane[i].dst.buf = dst_buf[i];
-              xd->plane[i].dst.stride = dst_stride[i];
-              av1_build_masked_inter_predictor_complex(
-                  xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
-                  mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-                  PARTITION_HORZ, i);
-            }
-#if CONFIG_CB4X4
-          }
-#endif
-        }
-      }
-      break;
-    case PARTITION_VERT:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        for (i = 0; i < MAX_MB_PLANE; ++i) {
-          // First half
-          predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i],
-                           top_bsize, BLOCK_8X8, dry_run, 1);
-          if (bsize < top_bsize)
-            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                       dst_stride[i]);
-
-          // Second half
-          predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, i, dst_buf1[i],
-                           dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1);
-          if (bsize < top_bsize)
-            extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
-                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i],
-                       dst_stride1[i]);
-        }
-
-        // Smooth
-        xd->plane[0].dst.buf = dst_buf[0];
-        xd->plane[0].dst.stride = dst_stride[0];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
-            0);
-      } else {
-        for (i = 0; i < MAX_MB_PLANE; ++i) {
-#if CONFIG_CB4X4
-          const struct macroblockd_plane *pd = &xd->plane[i];
-          int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
-              subsize, pd->subsampling_x, pd->subsampling_y);
-
-          if (handle_chroma_sub8x8) {
-            int mode_offset_col = CONFIG_CHROMA_SUB8X8 ? hbs : 0;
-
-            predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + mode_offset_col,
-                             mi_row, mi_col, mi_row_top, mi_col_top, i,
-                             dst_buf[i], dst_stride[i], top_bsize, bsize,
-                             dry_run, 0);
-            if (bsize < top_bsize)
-              extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row,
-                         mi_col + mode_offset_col, mi_row, mi_col, mi_row_top,
-                         mi_col_top, i, dst_buf[i], dst_stride[i]);
-          } else {
-#endif
-            predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                             mi_row_top, mi_col_top, i, dst_buf[i],
-                             dst_stride[i], top_bsize, subsize, dry_run, 0);
-            if (bsize < top_bsize)
-              extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                         dst_stride[i]);
-            else
-              extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                         dst_stride[i], 3);
-            xd->plane[i].dst.buf = dst_buf[i];
-            xd->plane[i].dst.stride = dst_stride[i];
-
-            if (mi_col + hbs < cm->mi_cols) {
-              predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
-                               mi_col + hbs, mi_row_top, mi_col_top, i,
-                               dst_buf1[i], dst_stride1[i], top_bsize, subsize,
-                               dry_run, 0);
-              if (bsize < top_bsize)
-                extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
-                           mi_col + hbs, mi_row, mi_col + hbs, mi_row_top,
-                           mi_col_top, i, dst_buf1[i], dst_stride1[i]);
-              else
-                extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row,
-                           mi_col + hbs, mi_row, mi_col + hbs, mi_row_top,
-                           mi_col_top, i, dst_buf1[i], dst_stride1[i], 2);
-
-              // smooth
-              xd->plane[i].dst.buf = dst_buf[i];
-              xd->plane[i].dst.stride = dst_stride[i];
-              av1_build_masked_inter_predictor_complex(
-                  xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
-                  mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-                  PARTITION_VERT, i);
-            }
-#if CONFIG_CB4X4
-          }
-#endif
-        }
-      }
-      break;
-    case PARTITION_SPLIT:
-      if (bsize == BLOCK_8X8 && !unify_bsize) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i],
-                           top_bsize, BLOCK_8X8, dry_run, 1);
-          predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, i, dst_buf1[i],
-                           dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1);
-          predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, i, dst_buf2[i],
-                           dst_stride2[i], top_bsize, BLOCK_8X8, dry_run, 1);
-          predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col,
-                           mi_row_top, mi_col_top, i, dst_buf3[i],
-                           dst_stride3[i], top_bsize, BLOCK_8X8, dry_run, 1);
-
-          if (bsize < top_bsize) {
-            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                       dst_stride[i]);
-            extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
-                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i],
-                       dst_stride1[i]);
-            extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
-                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf2[i],
-                       dst_stride2[i]);
-            extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col,
-                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf3[i],
-                       dst_stride3[i]);
-          }
-        }
-#if CONFIG_CB4X4
-      } else if (bsize == BLOCK_8X8) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          const struct macroblockd_plane *pd = &xd->plane[i];
-          int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
-              subsize, pd->subsampling_x, pd->subsampling_y);
-
-          if (handle_chroma_sub8x8) {
-            int mode_offset_row =
-                CONFIG_CHROMA_SUB8X8 && mi_row + hbs < cm->mi_rows ? hbs : 0;
-            int mode_offset_col =
-                CONFIG_CHROMA_SUB8X8 && mi_col + hbs < cm->mi_cols ? hbs : 0;
-
-            predict_b_extend(cpi, td, tile, 0, mi_row + mode_offset_row,
-                             mi_col + mode_offset_col, mi_row, mi_col,
-                             mi_row_top, mi_col_top, i, dst_buf[i],
-                             dst_stride[i], top_bsize, BLOCK_8X8, dry_run, 0);
-            if (bsize < top_bsize)
-              extend_all(cpi, td, tile, 0, BLOCK_8X8, top_bsize,
-                         mi_row + mode_offset_row, mi_col + mode_offset_col,
-                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                         dst_stride[i]);
-          } else {
-            predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                             mi_row_top, mi_col_top, i, dst_buf[i],
-                             dst_stride[i], top_bsize, subsize, dry_run, 0);
-            if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-              predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
-                               mi_col + hbs, mi_row_top, mi_col_top, i,
-                               dst_buf1[i], dst_stride1[i], top_bsize, subsize,
-                               dry_run, 0);
-            if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
-              predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col,
-                               mi_row + hbs, mi_col, mi_row_top, mi_col_top, i,
-                               dst_buf2[i], dst_stride2[i], top_bsize, subsize,
-                               dry_run, 0);
-            if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-              predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
-                               mi_row + hbs, mi_col + hbs, mi_row_top,
-                               mi_col_top, i, dst_buf3[i], dst_stride3[i],
-                               top_bsize, subsize, dry_run, 0);
-
-            if (bsize < top_bsize) {
-              extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
-                         dst_stride[i]);
-              if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-                extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
-                           mi_col + hbs, mi_row, mi_col + hbs, mi_row_top,
-                           mi_col_top, i, dst_buf1[i], dst_stride1[i]);
-              if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
-                extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
-                           mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top,
-                           i, dst_buf2[i], dst_stride2[i]);
-              if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-                extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
-                           mi_col + hbs, mi_row + hbs, mi_col + hbs, mi_row_top,
-                           mi_col_top, i, dst_buf3[i], dst_stride3[i]);
-            }
-          }
-        }
-#endif
-      } else {
-        predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top,
-                           mi_col_top, dry_run, subsize, top_bsize, dst_buf,
-                           dst_stride, pc_tree->split[0]);
-        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-          predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs, mi_row_top,
-                             mi_col_top, dry_run, subsize, top_bsize, dst_buf1,
-                             dst_stride1, pc_tree->split[1]);
-        if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
-          predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col, mi_row_top,
-                             mi_col_top, dry_run, subsize, top_bsize, dst_buf2,
-                             dst_stride2, pc_tree->split[2]);
-        if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-          predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs,
-                             mi_row_top, mi_col_top, dry_run, subsize,
-                             top_bsize, dst_buf3, dst_stride3,
-                             pc_tree->split[3]);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-#if CONFIG_CB4X4
-        const struct macroblockd_plane *pd = &xd->plane[i];
-        int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
-            subsize, pd->subsampling_x, pd->subsampling_y);
-        if (handle_chroma_sub8x8) continue;  // Skip <4x4 chroma smoothing
-#else
-        if (bsize == BLOCK_8X8 && i != 0)
-          continue;  // Skip <4x4 chroma smoothing
-#endif
-
-        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
-          av1_build_masked_inter_predictor_complex(
-              xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
-              mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-              PARTITION_VERT, i);
-          if (mi_row + hbs < cm->mi_rows) {
-            av1_build_masked_inter_predictor_complex(
-                xd, dst_buf2[i], dst_stride2[i], dst_buf3[i], dst_stride3[i],
-                mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-                PARTITION_VERT, i);
-            av1_build_masked_inter_predictor_complex(
-                xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
-                mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-                PARTITION_HORZ, i);
-          }
-        } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
-          av1_build_masked_inter_predictor_complex(
-              xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
-              mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-              PARTITION_HORZ, i);
-        }
-      }
-      break;
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION_TYPES_AB
-#error HORZ/VERT_A/B partitions not yet updated in superres code
-#endif
-    case PARTITION_HORZ_A:
-      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                       mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       bsize2, dry_run, 0, 0);
-      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
-                 mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
-
-      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
-                       mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
-                       dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
-      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
-                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
-
-      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
-                       mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
-                       top_bsize, subsize, dry_run, 0, 0);
-      if (bsize < top_bsize)
-        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
-                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
-      else
-        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
-                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 1);
-
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
-            i);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
-            i);
-      }
-
-      break;
-    case PARTITION_VERT_A:
-
-      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                       mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       bsize2, dry_run, 0, 0);
-      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
-                 mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
-
-      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
-                       mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                       top_bsize, bsize2, dry_run, 0, 0);
-      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
-                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
-
-      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
-                       mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
-                       dst_stride2, top_bsize, subsize, dry_run, 0, 0);
-      if (bsize < top_bsize)
-        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
-                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
-      else
-        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
-                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 2);
-
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
-            i);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
-            i);
-      }
-      break;
-    case PARTITION_HORZ_B:
-
-      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                       mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       subsize, dry_run, 0, 0);
-      if (bsize < top_bsize)
-        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
-      else
-        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
-
-      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
-                       mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                       top_bsize, bsize2, dry_run, 0, 0);
-      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
-                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
-
-      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
-                       mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
-                       dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
-      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
-                 mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
-                 dst_stride2);
-
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
-            mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-            PARTITION_VERT, i);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
-            i);
-      }
-      break;
-    case PARTITION_VERT_B:
-
-      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                       mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       subsize, dry_run, 0, 0);
-      if (bsize < top_bsize)
-        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
-      else
-        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
-
-      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
-                       mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
-                       dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
-      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
-                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
-
-      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
-                       mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
-                       dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
-      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
-                 mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
-                 dst_stride2);
-
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
-            mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-            PARTITION_HORZ, i);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        av1_build_masked_inter_predictor_complex(
-            xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
-            mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
-            i);
-      }
-      break;
-#endif  // CONFIG_EXT_PARTITION_TYPES
-    default: assert(0);
-  }
-
-#if CONFIG_EXT_PARTITION_TYPES
-  if (bsize < top_bsize)
-    update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
-#else
-  if (bsize < top_bsize && (partition != PARTITION_SPLIT || bsize == BLOCK_8X8))
-    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
-#endif  // CONFIG_EXT_PARTITION_TYPES
-}
-
-static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
-                          const TileInfo *const tile, int mi_row, int mi_col,
-                          BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist,
-                          TX_TYPE *best_tx, PC_TREE *pc_tree) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int plane, pnskip, skippable, skippable_uv, rate_uv, this_rate,
-      base_rate = *tmp_rate;
-  int64_t sse, pnsse, sse_uv, this_dist, dist_uv;
-  uint8_t *dst_buf[3];
-  int dst_stride[3];
-  TX_SIZE tx_size;
-  MB_MODE_INFO *mbmi;
-  TX_TYPE tx_type, best_tx_nostx;
-  int tmp_rate_tx = 0, skip_tx = 0;
-  int64_t tmp_dist_tx = 0, rd_tx, bestrd_tx = INT64_MAX;
-
-  set_skip_context(xd, mi_row, mi_col);
-  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
-  update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 1, pc_tree);
-  av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
-                       mi_col);
-  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
-    dst_buf[plane] = xd->plane[plane].dst.buf;
-    dst_stride[plane] = xd->plane[plane].dst.stride;
-  }
-  predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 1, bsize,
-                     bsize, dst_buf, dst_stride, pc_tree);
-
-  set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
-  set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
-
-  mbmi = &xd->mi[0]->mbmi;
-  best_tx_nostx = mbmi->tx_type;
-
-  *best_tx = DCT_DCT;
-
-  // chroma
-  skippable_uv = 1;
-  rate_uv = 0;
-  dist_uv = 0;
-  sse_uv = 0;
-  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-#if CONFIG_VAR_TX
-    ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
-    ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
-    const struct macroblockd_plane *const pd = &xd->plane[plane];
-    RD_STATS this_rd_stats;
-    av1_init_rd_stats(&this_rd_stats);
-
-    tx_size = max_txsize_lookup[bsize];
-    tx_size =
-        uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y];
-    av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
-
-    av1_subtract_plane(x, bsize, plane);
-    av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, plane, 0,
-                      get_plane_block_size(bsize, pd), &ctxa[0], &ctxl[0],
-                      &this_rd_stats);
-
-    this_rate = this_rd_stats.rate;
-    this_dist = this_rd_stats.dist;
-    pnsse = this_rd_stats.sse;
-    pnskip = this_rd_stats.skip;
-#else
-    tx_size = max_txsize_lookup[bsize];
-    tx_size =
-        uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y];
-    av1_subtract_plane(x, bsize, plane);
-    av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip,
-                                 &pnsse, INT64_MAX, plane, bsize, tx_size, 0);
-#endif  // CONFIG_VAR_TX
-
-    rate_uv += this_rate;
-    dist_uv += this_dist;
-    sse_uv += pnsse;
-    skippable_uv &= pnskip;
-  }
-
-  // luma
-  tx_size = max_txsize_lookup[bsize];
-  av1_subtract_plane(x, bsize, 0);
-#if CONFIG_EXT_TX
-  int ext_tx_set = get_ext_tx_set(tx_size, bsize, 1, cm->reduced_tx_set_used);
-  const TxSetType tx_set_type =
-      get_ext_tx_set_type(tx_size, bsize, 1, cm->reduced_tx_set_used);
-#endif  // CONFIG_EXT_TX
-  for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
-#if CONFIG_VAR_TX
-    ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
-    ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
-    const struct macroblockd_plane *const pd = &xd->plane[0];
-    RD_STATS this_rd_stats;
-#endif  // CONFIG_VAR_TX
-
-#if CONFIG_EXT_TX
-    if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
-#else
-    if (tx_size >= TX_32X32 && tx_type != DCT_DCT) continue;
-#endif  // CONFIG_EXT_TX
-    mbmi->tx_type = tx_type;
-
-#if CONFIG_VAR_TX
-    av1_init_rd_stats(&this_rd_stats);
-    av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
-    av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, 0, 0, bsize, &ctxa[0], &ctxl[0],
-                      &this_rd_stats);
-
-    this_rate = this_rd_stats.rate;
-    this_dist = this_rd_stats.dist;
-    pnsse = this_rd_stats.sse;
-    pnskip = this_rd_stats.skip;
-#else
-    av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip,
-                                 &pnsse, INT64_MAX, 0, bsize, tx_size, 0);
-#endif  // CONFIG_VAR_TX
-
-#if CONFIG_EXT_TX
-    if (get_ext_tx_types(tx_size, bsize, 1, cm->reduced_tx_set_used) > 1 &&
-        !xd->lossless[xd->mi[0]->mbmi.segment_id] && this_rate != INT_MAX) {
-      if (ext_tx_set > 0)
-        this_rate +=
-            x->inter_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->tx_type];
-    }
-#else
-    if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-        this_rate != INT_MAX) {
-      this_rate += x->inter_tx_type_costs[tx_size][mbmi->tx_type];
-    }
-#endif  // CONFIG_EXT_TX
-    *tmp_rate = rate_uv + this_rate;
-    *tmp_dist = dist_uv + this_dist;
-    sse = sse_uv + pnsse;
-    skippable = skippable_uv && pnskip;
-    if (skippable) {
-      *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-      x->skip = 1;
-    } else {
-      if (RDCOST(x->rdmult, *tmp_rate, *tmp_dist) < RDCOST(x->rdmult, 0, sse)) {
-        *tmp_rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
-        x->skip = 0;
-      } else {
-        *tmp_dist = sse;
-        *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-        x->skip = 1;
-      }
-    }
-    *tmp_rate += base_rate;
-    rd_tx = RDCOST(x->rdmult, *tmp_rate, *tmp_dist);
-    if (rd_tx < bestrd_tx * 0.99 || tx_type == DCT_DCT) {
-      *best_tx = tx_type;
-      bestrd_tx = rd_tx;
-      tmp_rate_tx = *tmp_rate;
-      tmp_dist_tx = *tmp_dist;
-      skip_tx = x->skip;
-    }
-  }
-  *tmp_rate = tmp_rate_tx;
-  *tmp_dist = tmp_dist_tx;
-  x->skip = skip_tx;
-#if CONFIG_VAR_TX
-  for (plane = 0; plane < 1; ++plane)
-    memset(x->blk_skip[plane], x->skip,
-           sizeof(uint8_t) * pc_tree->none.num_4x4_blk);
-#endif  // CONFIG_VAR_TX
-  xd->mi[0]->mbmi.tx_type = best_tx_nostx;
 }
-#endif  // CONFIG_SUPERTX
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
index b54e54d25..62141dba4 100644
--- a/third_party/aom/av1/encoder/encodeframe.h
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -20,6 +20,8 @@
 extern "C" {
 #endif
 
+#define DELTAQ_MODULATION 0  // 0: variance based, 1: wavelet AC energy based
+
 struct macroblock;
 struct yv12_buffer_config;
 struct AV1_COMP;
@@ -27,7 +29,7 @@ struct ThreadData;
 
 void av1_setup_src_planes(struct macroblock *x,
                           const struct yv12_buffer_config *src, int mi_row,
-                          int mi_col);
+                          int mi_col, const int num_planes);
 
 void av1_encode_frame(struct AV1_COMP *cpi);
 
@@ -35,12 +37,6 @@ void av1_init_tile_data(struct AV1_COMP *cpi);
 void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
                      int tile_col);
 
-void av1_update_tx_type_count(const struct AV1Common *cm, MACROBLOCKD *xd,
-#if CONFIG_TXK_SEL
-                              int blk_row, int blk_col, int block, int plane,
-#endif
-                              BLOCK_SIZE bsize, TX_SIZE tx_size,
-                              FRAME_COUNTS *counts);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
index f35ce8a4f..cea8db6f9 100644
--- a/third_party/aom/av1/encoder/encodemb.c
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -9,15 +9,20 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/bitwriter.h"
 #include "aom_dsp/quantize.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
 #include "av1/common/idct.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/reconintra.h"
@@ -25,22 +30,10 @@
 
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/encodemb.h"
-#if CONFIG_LV_MAP
 #include "av1/encoder/encodetxb.h"
-#endif
 #include "av1/encoder/hybrid_fwd_txfm.h"
 #include "av1/encoder/rd.h"
-#include "av1/encoder/tokenize.h"
-
-#if CONFIG_PVQ
-#include "av1/encoder/encint.h"
-#include "av1/common/partition.h"
-#include "av1/encoder/pvq_encoder.h"
-#endif
-
-#if CONFIG_CFL
-#include "av1/common/cfl.h"
-#endif
+#include "av1/encoder/rdopt.h"
 
 // Check if one needs to use c version subtraction.
 static int check_subtract_block_size(int w, int h) { return w < 4 || h < 4; }
@@ -49,31 +42,23 @@ static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
                            int16_t *diff, ptrdiff_t diff_stride,
                            const uint8_t *src8, ptrdiff_t src_stride,
                            const uint8_t *pred8, ptrdiff_t pred_stride) {
-#if !CONFIG_HIGHBITDEPTH
-  (void)xd;
-#endif
-
   if (check_subtract_block_size(rows, cols)) {
-#if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8,
                                   src_stride, pred8, pred_stride, xd->bd);
       return;
     }
-#endif  // CONFIG_HIGHBITDEPTH
     aom_subtract_block_c(rows, cols, diff, diff_stride, src8, src_stride, pred8,
                          pred_stride);
 
     return;
   }
 
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
                               pred8, pred_stride, xd->bd);
     return;
   }
-#endif  // CONFIG_HIGHBITDEPTH
   aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
                      pred_stride);
 }
@@ -101,7 +86,8 @@ void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
 void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
   const int bw = block_size_wide[plane_bsize];
   const int bh = block_size_high[plane_bsize];
   const MACROBLOCKD *xd = &x->e_mbd;
@@ -110,325 +96,26 @@ void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
                  pd->dst.buf, pd->dst.stride);
 }
 
-// Shifting negative values is undefined behaviour in C99,
-// and could mislead the optimizer, who might assume the shifted is positive.
-// This also avoids ubsan warnings.
-// In practise, this gets inlined by the optimizer to a single instruction.
-static INLINE int signed_shift_right(int x, int shift) {
-  if (x >= 0)
-    return x >> shift;
-  else
-    return -((-x) >> shift);
-}
-
-#if !CONFIG_LV_MAP
-// These numbers are empirically obtained.
-static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
-  { 10, 7 }, { 8, 5 },
-};
-
-static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane,
-                             int blk_row, int blk_col, int block,
-                             TX_SIZE tx_size, int ctx) {
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+                   int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                   const TXB_CTX *const txb_ctx, int fast_mode,
+                   int *rate_cost) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *const p = &mb->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const PLANE_TYPE plane_type = pd->plane_type;
   const int eob = p->eobs[block];
-  assert(mb->qindex > 0);
-  assert((!plane_type && !plane) || (plane_type && plane));
-  assert(eob <= tx_size_2d[tx_size]);
-  const int ref = is_inter_block(&xd->mi[0]->mbmi);
-  const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const int16_t *const dequant_ptr = pd->dequant;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-  const SCAN_ORDER *const scan_order =
-      get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
-  const int16_t *const scan = scan_order->scan;
-  const int16_t *const nb = scan_order->neighbors;
-  const int shift = av1_get_tx_scale(tx_size);
-#if CONFIG_AOM_QM
-  int seg_id = xd->mi[0]->mbmi.segment_id;
-  // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
-  const qm_val_t *iqmatrix =
-      IS_2D_TRANSFORM(tx_type)
-          ? pd->seg_iqmatrix[seg_id][!ref][tx_size]
-          : cm->giqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size];
-#endif
-#if CONFIG_NEW_QUANT
-  int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
-  const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
-#endif  // CONFIG_NEW_QUANT
-  int64_t rd_cost0, rd_cost1;
-  int16_t t0, t1;
-  int i, final_eob = 0;
-  const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
-  int(*head_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
-      mb->token_head_costs[txsize_sqr_map[tx_size]][plane_type][ref];
-  int(*tail_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
-      mb->token_tail_costs[txsize_sqr_map[tx_size]][plane_type][ref];
-  const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
-  int64_t rate0, rate1;
-  int64_t eob_cost0, eob_cost1;
-  tran_low_t before_best_eob_qc = 0;
-  tran_low_t before_best_eob_dqc = 0;
-
-  uint8_t token_cache[MAX_TX_SQUARE];
-  for (i = 0; i < eob; i++) {
-    const int rc = scan[i];
-    token_cache[rc] = av1_pt_energy_class[av1_get_token(qcoeff[rc])];
-  }
-
-  /* Record the r-d cost */
-  int64_t accu_rate = 0;
-  // Initialized to the worst possible error for the largest transform size.
-  // This ensures that it never goes negative.
-  int64_t accu_error = ((int64_t)1) << 50;
-  rate0 = head_token_costs[0][ctx][0];
-  int64_t best_block_rd_cost = RDCOST(rdmult, rate0, accu_error);
-
-  // int64_t best_block_rd_cost_all0 = best_block_rd_cost;
-  const int seg_eob =
-      av1_get_tx_eob(&cm->seg, xd->mi[0]->mbmi.segment_id, tx_size);
-  for (i = 0; i < eob; i++) {
-    const int rc = scan[i];
-    const int x = qcoeff[rc];
-    const int sz = -(x < 0);
-    const int band_cur = band_translate[i];
-    const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
-    const int eob_val =
-        (i + 1 == eob) ? (i + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
-    const int is_first = (i == 0);
-
-    if (x == 0) {
-      // no need to search when x == 0
-      accu_rate += av1_get_coeff_token_cost(
-          ZERO_TOKEN, eob_val, is_first, head_token_costs[band_cur][ctx_cur],
-          tail_token_costs[band_cur][ctx_cur]);
-      // accu_error does not change when x==0
-    } else {
-      /*  Computing distortion
-       */
-      // compute the distortion for the first candidate
-      // and the distortion for quantizing to 0.
-      int dx0 = abs(coeff[rc]) * (1 << shift);
-      dx0 >>= xd->bd - 8;
-
-      const int64_t d0 = (int64_t)dx0 * dx0;
-      const int x_a = x - 2 * sz - 1;
-      int dqv;
-#if CONFIG_AOM_QM
-      int iwt;
-      dqv = dequant_ptr[rc != 0];
-      if (iqmatrix != NULL) {
-        iwt = iqmatrix[rc];
-        dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-      }
-#else
-      dqv = dequant_ptr[rc != 0];
-#endif
+  const int segment_id = xd->mi[0]->segment_id;
 
-      int dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
-      dx = signed_shift_right(dx, xd->bd - 8);
-      const int64_t d2 = (int64_t)dx * dx;
-
-      /* compute the distortion for the second candidate
-       * x_a = x - 2 * sz + 1;
-       */
-      int64_t d2_a;
-      if (x_a != 0) {
-#if CONFIG_NEW_QUANT
-        dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) -
-             (coeff[rc] * (1 << shift));
-        dx >>= xd->bd - 8;
-#else   // CONFIG_NEW_QUANT
-        dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
-#endif  // CONFIG_NEW_QUANT
-        d2_a = (int64_t)dx * dx;
-      } else {
-        d2_a = d0;
-      }
-      // Computing RD cost
-      int64_t base_bits;
-      // rate cost of x
-      base_bits = av1_get_token_cost(x, &t0, cat6_bits);
-      rate0 = base_bits +
-              av1_get_coeff_token_cost(t0, eob_val, is_first,
-                                       head_token_costs[band_cur][ctx_cur],
-                                       tail_token_costs[band_cur][ctx_cur]);
-      // rate cost of x_a
-      base_bits = av1_get_token_cost(x_a, &t1, cat6_bits);
-      if (t1 == ZERO_TOKEN && eob_val) {
-        rate1 = base_bits;
-      } else {
-        rate1 = base_bits +
-                av1_get_coeff_token_cost(t1, eob_val, is_first,
-                                         head_token_costs[band_cur][ctx_cur],
-                                         tail_token_costs[band_cur][ctx_cur]);
-      }
-
-      int64_t next_bits0 = 0, next_bits1 = 0;
-      if (i < eob - 1) {
-        int ctx_next;
-        const int band_next = band_translate[i + 1];
-        const int token_next = av1_get_token(qcoeff[scan[i + 1]]);
-        const int eob_val_next =
-            (i + 2 == eob) ? (i + 2 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
-
-        token_cache[rc] = av1_pt_energy_class[t0];
-        ctx_next = get_coef_context(nb, token_cache, i + 1);
-        next_bits0 = av1_get_coeff_token_cost(
-            token_next, eob_val_next, 0, head_token_costs[band_next][ctx_next],
-            tail_token_costs[band_next][ctx_next]);
-
-        token_cache[rc] = av1_pt_energy_class[t1];
-        ctx_next = get_coef_context(nb, token_cache, i + 1);
-        next_bits1 = av1_get_coeff_token_cost(
-            token_next, eob_val_next, 0, head_token_costs[band_next][ctx_next],
-            tail_token_costs[band_next][ctx_next]);
-      }
-
-      rd_cost0 = RDCOST(rdmult, (rate0 + next_bits0), d2);
-      rd_cost1 = RDCOST(rdmult, (rate1 + next_bits1), d2_a);
-      const int best_x = (rd_cost1 < rd_cost0);
-
-      const int eob_v = (i + 1 == seg_eob) ? LAST_EOB : EARLY_EOB;
-      int64_t next_eob_bits0, next_eob_bits1;
-      int best_eob_x;
-      next_eob_bits0 = av1_get_coeff_token_cost(
-          t0, eob_v, is_first, head_token_costs[band_cur][ctx_cur],
-          tail_token_costs[band_cur][ctx_cur]);
-      eob_cost0 =
-          RDCOST(rdmult, (accu_rate + next_eob_bits0), (accu_error + d2 - d0));
-      eob_cost1 = eob_cost0;
-      if (x_a != 0) {
-        next_eob_bits1 = av1_get_coeff_token_cost(
-            t1, eob_v, is_first, head_token_costs[band_cur][ctx_cur],
-            tail_token_costs[band_cur][ctx_cur]);
-        eob_cost1 = RDCOST(rdmult, (accu_rate + next_eob_bits1),
-                           (accu_error + d2_a - d0));
-        best_eob_x = (eob_cost1 < eob_cost0);
-      } else {
-        best_eob_x = 0;
-      }
-
-      const int dqc = dqcoeff[rc];
-      int dqc_a = 0;
-      if (best_x || best_eob_x) {
-        if (x_a != 0) {
-#if CONFIG_NEW_QUANT
-          dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv,
-                                           dequant_val[band_translate[i]]);
-          dqc_a = shift ? ROUND_POWER_OF_TWO(dqc_a, shift) : dqc_a;
-          if (sz) dqc_a = -dqc_a;
-#else
-          if (x_a < 0)
-            dqc_a = -((-x_a * dqv) >> shift);
-          else
-            dqc_a = (x_a * dqv) >> shift;
-#endif  // CONFIG_NEW_QUANT
-        } else {
-          dqc_a = 0;
-        }  // if (x_a != 0)
-      }
-
-      // record the better quantized value
-      if (best_x) {
-        assert(d2_a <= d0);
-        qcoeff[rc] = x_a;
-        dqcoeff[rc] = dqc_a;
-        accu_rate += rate1;
-        accu_error += d2_a - d0;
-        token_cache[rc] = av1_pt_energy_class[t1];
-      } else {
-        assert(d2 <= d0);
-        accu_rate += rate0;
-        accu_error += d2 - d0;
-        token_cache[rc] = av1_pt_energy_class[t0];
-      }
-      assert(accu_error >= 0);
-
-      // determine whether to move the eob position to i+1
-      const int use_a = (x_a != 0) && (best_eob_x);
-      const int64_t best_eob_cost_i = use_a ? eob_cost1 : eob_cost0;
-      if (best_eob_cost_i < best_block_rd_cost) {
-        best_block_rd_cost = best_eob_cost_i;
-        final_eob = i + 1;
-        if (use_a) {
-          before_best_eob_qc = x_a;
-          before_best_eob_dqc = dqc_a;
-        } else {
-          before_best_eob_qc = x;
-          before_best_eob_dqc = dqc;
-        }
-      }
-    }  // if (x==0)
-  }    // for (i)
-
-  assert(final_eob <= eob);
-  if (final_eob > 0) {
-    assert(before_best_eob_qc != 0);
-    i = final_eob - 1;
-    int rc = scan[i];
-    qcoeff[rc] = before_best_eob_qc;
-    dqcoeff[rc] = before_best_eob_dqc;
-  }
-
-  for (i = final_eob; i < eob; i++) {
-    int rc = scan[i];
-    qcoeff[rc] = 0;
-    dqcoeff[rc] = 0;
+  if (eob == 0 || !cpi->optimize_seg_arr[segment_id] ||
+      xd->lossless[segment_id]) {
+    *rate_cost = av1_cost_skip_txb(mb, txb_ctx, plane, tx_size);
+    return eob;
   }
 
-  p->eobs[block] = final_eob;
-  return final_eob;
-}
-#endif  // !CONFIG_LV_MAP
-
-int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row,
-                   int blk_col, int block, BLOCK_SIZE plane_bsize,
-                   TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
-                   const ENTROPY_CONTEXT *l, int fast_mode) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  struct macroblock_plane *const p = &mb->plane[plane];
-  const int eob = p->eobs[block];
-  assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
-  if (eob == 0) return eob;
-  if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return eob;
-
-#if CONFIG_PVQ
-  (void)cm;
-  (void)tx_size;
-  (void)a;
-  (void)l;
-  return eob;
-#endif
-
-#if !CONFIG_LV_MAP
-  (void)plane_bsize;
-  (void)blk_row;
-  (void)blk_col;
   (void)fast_mode;
-#if CONFIG_VAR_TX
-  int ctx = get_entropy_context(tx_size, a, l);
-#else
-  int ctx = combine_entropy_contexts(*a, *l);
-#endif  // CONFIG_VAR_TX
-  return optimize_b_greedy(cm, mb, plane, blk_row, blk_col, block, tx_size,
-                           ctx);
-#else   // !CONFIG_LV_MAP
-  TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
-  return av1_optimize_txb(cm, mb, plane, blk_row, blk_col, block, tx_size,
-                          &txb_ctx, fast_mode);
-#endif  // !CONFIG_LV_MAP
+  return av1_optimize_txb_new(cpi, mb, plane, block, tx_size, tx_type, txb_ctx,
+                              rate_cost, cpi->oxcf.sharpness);
 }
 
-#if !CONFIG_PVQ
 typedef enum QUANT_FUNC {
   QUANT_FUNC_LOWBD = 0,
   QUANT_FUNC_HIGHBD = 1,
@@ -437,394 +124,231 @@ typedef enum QUANT_FUNC {
 
 static AV1_QUANT_FACADE
     quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
-#if !CONFIG_NEW_QUANT
       { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
       { av1_quantize_b_facade, av1_highbd_quantize_b_facade },
       { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
-#else   // !CONFIG_NEW_QUANT
-      { av1_quantize_fp_nuq_facade, av1_highbd_quantize_fp_nuq_facade },
-      { av1_quantize_b_nuq_facade, av1_highbd_quantize_b_nuq_facade },
-      { av1_quantize_dc_nuq_facade, av1_highbd_quantize_dc_nuq_facade },
-#endif  // !CONFIG_NEW_QUANT
       { NULL, NULL }
     };
-#endif  // !CONFIG_PVQ
-
-#if !CONFIG_TXMG && !CONFIG_PVQ
-typedef void (*fwdTxfmFunc)(const int16_t *diff, tran_low_t *coeff, int stride,
-                            TxfmParam *txfm_param);
-static const fwdTxfmFunc fwd_txfm_func[2] = { av1_fwd_txfm,
-                                              av1_highbd_fwd_txfm };
-#endif
 
 void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
                      int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
-                     TX_SIZE tx_size, int ctx,
+                     TX_SIZE tx_size, TX_TYPE tx_type,
                      AV1_XFORM_QUANT xform_quant_idx) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-#if !(CONFIG_PVQ || CONFIG_DIST_8X8)
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-#else
-  struct macroblock_plane *const p = &x->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-#endif
-  PLANE_TYPE plane_type = get_plane_type(plane);
-  TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-
-#if (CONFIG_AOM_QM || CONFIG_NEW_QUANT) && !CONFIG_PVQ
-  const int is_inter = is_inter_block(mbmi);
-#endif
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
 
-  const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = block_size_wide[plane_bsize];
-#if CONFIG_AOM_QM && !CONFIG_PVQ
   int seg_id = mbmi->segment_id;
+  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
   // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
   const qm_val_t *qmatrix =
-      IS_2D_TRANSFORM(tx_type) ? pd->seg_qmatrix[seg_id][!is_inter][tx_size]
-                               : cm->gqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size];
+      IS_2D_TRANSFORM(tx_type) ? pd->seg_qmatrix[seg_id][qm_tx_size]
+                               : cm->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
   const qm_val_t *iqmatrix =
       IS_2D_TRANSFORM(tx_type)
-          ? pd->seg_iqmatrix[seg_id][!is_inter][tx_size]
-          : cm->giqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size];
-#endif
+          ? pd->seg_iqmatrix[seg_id][qm_tx_size]
+          : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
 
-  TxfmParam txfm_param;
-
-#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX
-  uint8_t *dst;
-  const int dst_stride = pd->dst.stride;
-#if CONFIG_PVQ || CONFIG_DIST_8X8
-  int16_t *pred;
-  const int txw = tx_size_wide[tx_size];
-  const int txh = tx_size_high[tx_size];
-  int i, j;
-#endif
-#endif
-
-#if !CONFIG_PVQ
-  const int tx2d_size = tx_size_2d[tx_size];
+  const int src_offset = (blk_row * diff_stride + blk_col);
+  const int16_t *src_diff = &p->src_diff[src_offset << tx_size_wide_log2[0]];
   QUANT_PARAM qparam;
-  const int16_t *src_diff;
-
-  src_diff =
-      &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
   qparam.log_scale = av1_get_tx_scale(tx_size);
-#if CONFIG_NEW_QUANT
   qparam.tx_size = tx_size;
-  qparam.dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type);
-#endif  // CONFIG_NEW_QUANT
-#if CONFIG_AOM_QM
   qparam.qmatrix = qmatrix;
   qparam.iqmatrix = iqmatrix;
-#endif  // CONFIG_AOM_QM
-#else
-  tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
-  int skip = 1;
-  PVQ_INFO *pvq_info = NULL;
-  uint8_t *src;
-  int16_t *src_int16;
-  const int src_stride = p->src.stride;
-
-  (void)ctx;
-  (void)scan_order;
-  (void)qcoeff;
-
-  if (x->pvq_coded) {
-    assert(block < MAX_PVQ_BLOCKS_IN_SB);
-    pvq_info = &x->pvq[block][plane];
-  }
-  src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
-  src_int16 =
-      &p->src_int16[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
-
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (j = 0; j < txh; j++)
-      for (i = 0; i < txw; i++)
-        src_int16[diff_stride * j + i] =
-            CONVERT_TO_SHORTPTR(src)[src_stride * j + i];
-  } else {
-#endif  // CONFIG_HIGHBITDEPTH
-    for (j = 0; j < txh; j++)
-      for (i = 0; i < txw; i++)
-        src_int16[diff_stride * j + i] = src[src_stride * j + i];
-#if CONFIG_HIGHBITDEPTH
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-#endif
-
-#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX
-  dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-#endif  // CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED ||
-        // CONFIG_MRC_TX
-
-#if CONFIG_PVQ || CONFIG_DIST_8X8
-  if (CONFIG_PVQ
-#if CONFIG_DIST_8X8
-      || x->using_dist_8x8
-#endif  // CONFIG_DIST_8X8
-      ) {
-    pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
-
-// copy uint8 orig and predicted block to int16 buffer
-// in order to use existing VP10 transform functions
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      for (j = 0; j < txh; j++)
-        for (i = 0; i < txw; i++)
-          pred[diff_stride * j + i] =
-              CONVERT_TO_SHORTPTR(dst)[dst_stride * j + i];
-    } else {
-#endif  // CONFIG_HIGHBITDEPTH
-      for (j = 0; j < txh; j++)
-        for (i = 0; i < txw; i++)
-          pred[diff_stride * j + i] = dst[dst_stride * j + i];
-#if CONFIG_HIGHBITDEPTH
-    }
-#endif  // CONFIG_HIGHBITDEPTH
-  }
-#endif  // CONFIG_PVQ || CONFIG_DIST_8X8
-
-  (void)ctx;
-
+  TxfmParam txfm_param;
   txfm_param.tx_type = tx_type;
   txfm_param.tx_size = tx_size;
   txfm_param.lossless = xd->lossless[mbmi->segment_id];
-#if CONFIG_MRC_TX || CONFIG_LGT
-  txfm_param.is_inter = is_inter_block(mbmi);
-#endif
-#if CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED
-  txfm_param.dst = dst;
-  txfm_param.stride = dst_stride;
-#if CONFIG_MRC_TX
-  txfm_param.valid_mask = &mbmi->valid_mrc_mask;
-#if SIGNAL_ANY_MRC_MASK
-  txfm_param.mask = BLOCK_OFFSET(xd->mrc_mask, block);
-#endif  // SIGNAL_ANY_MRC_MASK
-#endif  // CONFIG_MRC_TX
-#if CONFIG_LGT_FROM_PRED
-  txfm_param.mode = mbmi->mode;
-  txfm_param.use_lgt = mbmi->use_lgt;
-#endif  // CONFIG_LGT_FROM_PRED
-#endif  // CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED
-
-#if !CONFIG_PVQ
+  txfm_param.tx_set_type = av1_get_ext_tx_set_type(
+      txfm_param.tx_size, is_inter_block(mbmi), cm->reduced_tx_set_used);
+
   txfm_param.bd = xd->bd;
-  const int is_hbd = get_bitdepth_data_path_index(xd);
+  txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
 
-#if CONFIG_TXMG
-  av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);
-#else   // CONFIG_TXMG
-  fwd_txfm_func[is_hbd](src_diff, coeff, diff_stride, &txfm_param);
-#endif  // CONFIG_TXMG
+  av1_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);
 
   if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+    const int n_coeffs = av1_get_max_eob(tx_size);
     if (LIKELY(!x->skip_block)) {
-      quant_func_list[xform_quant_idx][is_hbd](
-          coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam);
+      quant_func_list[xform_quant_idx][txfm_param.is_hbd](
+          coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, &qparam);
     } else {
-      av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+      av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
     }
   }
-#if CONFIG_LV_MAP
-  p->txb_entropy_ctx[block] =
-      (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
-#endif  // CONFIG_LV_MAP
-  return;
-#else  // CONFIG_PVQ
-  (void)xform_quant_idx;
-#if CONFIG_HIGHBITDEPTH
-  txfm_param.bd = xd->bd;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    av1_highbd_fwd_txfm(src_int16, coeff, diff_stride, &txfm_param);
-    av1_highbd_fwd_txfm(pred, ref_coeff, diff_stride, &txfm_param);
+  // NOTE: optimize_b_following is ture means av1_optimze_b will be called
+  // When the condition of doing optimize_b is changed,
+  // this flag need update simultaneously
+  const int optimize_b_following =
+      (xform_quant_idx != AV1_XFORM_QUANT_FP) || (txfm_param.lossless);
+  if (optimize_b_following) {
+    p->txb_entropy_ctx[block] =
+        (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
   } else {
-#endif
-    av1_fwd_txfm(src_int16, coeff, diff_stride, &txfm_param);
-    av1_fwd_txfm(pred, ref_coeff, diff_stride, &txfm_param);
-#if CONFIG_HIGHBITDEPTH
-  }
-#endif
-
-  // PVQ for inter mode block
-  if (!x->skip_block) {
-    PVQ_SKIP_TYPE ac_dc_coded =
-        av1_pvq_encode_helper(x,
-                              coeff,        // target original vector
-                              ref_coeff,    // reference vector
-                              dqcoeff,      // de-quantized vector
-                              eob,          // End of Block marker
-                              pd->dequant,  // aom's quantizers
-                              plane,        // image plane
-                              tx_size,      // block size in log_2 - 2
-                              tx_type,
-                              &x->rate,  // rate measured
-                              x->pvq_speed,
-                              pvq_info);  // PVQ info for a block
-    skip = ac_dc_coded == PVQ_SKIP;
+    p->txb_entropy_ctx[block] = 0;
   }
-  x->pvq_skip[plane] = skip;
-
-  if (!skip) mbmi->skip = 0;
-#endif  // #if !CONFIG_PVQ
+  return;
 }
 
 static void encode_block(int plane, int block, int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg,
+                         int mi_row, int mi_col, RUN_TYPE dry_run) {
+  (void)mi_row;
+  (void)mi_col;
+  (void)dry_run;
   struct encode_b_args *const args = arg;
-  AV1_COMMON *cm = args->cm;
+  const AV1_COMMON *const cm = &args->cpi->common;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int ctx;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-  uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
   uint8_t *dst;
-#if !CONFIG_PVQ
   ENTROPY_CONTEXT *a, *l;
-#endif
-#if CONFIG_VAR_TX
-  int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-#endif
+  int dummy_rate_cost = 0;
+
+  const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
   dst = &pd->dst
              .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
 
-#if !CONFIG_PVQ
   a = &args->ta[blk_col];
   l = &args->tl[blk_row];
-#if CONFIG_VAR_TX
-  ctx = get_entropy_context(tx_size, a, l);
-#else
-  ctx = combine_entropy_contexts(*a, *l);
-#endif
-#else
-  ctx = 0;
-#endif  // CONFIG_PVQ
-
-#if CONFIG_VAR_TX
   // Assert not magic number (uninitialized).
-  assert(x->blk_skip[plane][blk_row * bw + blk_col] != 234);
-
-  if (x->blk_skip[plane][blk_row * bw + blk_col] == 0)
-#endif
-  {
-    av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                    ctx, AV1_XFORM_QUANT_FP);
-  }
-#if CONFIG_VAR_TX
-  else {
+  assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234);
+
+  if ((plane != 0 || x->blk_skip[blk_row * bw + blk_col] == 0) &&
+      !mbmi->skip_mode) {
+    TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
+                                      tx_size, cm->reduced_tx_set_used);
+    if (args->enable_optimize_b) {
+      av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+                      tx_size, tx_type, AV1_XFORM_QUANT_FP);
+      TXB_CTX txb_ctx;
+      get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+      av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
+                     &dummy_rate_cost);
+    } else {
+      av1_xform_quant(
+          cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+    }
+  } else {
     p->eobs[block] = 0;
+    p->txb_entropy_ctx[block] = 0;
   }
-#endif
-
-#if !CONFIG_PVQ
-  av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, a,
-                 l, 0);
 
   av1_set_txb_context(x, plane, block, tx_size, a, l);
 
-  if (p->eobs[block]) *(args->skip) = 0;
+  if (p->eobs[block]) {
+    *(args->skip) = 0;
 
-  if (p->eobs[block] != 0)
-#else
-  (void)ctx;
-  if (!x->pvq_skip[plane]) *(args->skip) = 0;
+    TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
+                                      tx_size, cm->reduced_tx_set_used);
+    av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+                                pd->dst.stride, p->eobs[block],
+                                cm->reduced_tx_set_used);
+  }
 
-  if (!x->pvq_skip[plane])
-#endif
-  {
-#if CONFIG_LGT_FROM_PRED
-    PREDICTION_MODE mode = xd->mi[0]->mbmi.mode;
-#endif  // CONFIG_LGT_FROM_PRED
-    TX_TYPE tx_type =
-        av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, block, tx_size);
-    av1_inverse_transform_block(xd, dqcoeff,
-#if CONFIG_LGT_FROM_PRED
-                                mode,
+  if (p->eobs[block] == 0 && plane == 0) {
+  // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0
+  // case. It is possible that certain collision in hash index would cause
+  // the assertion failure. To further optimize the rate-distortion
+  // performance, we need to re-visit this part and enable this assert
+  // again.
+#if 0
+    if (args->cpi->oxcf.aq_mode == NO_AQ &&
+        args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+      // TODO(jingning,angiebird,huisu@google.com): enable txk_check when
+      // enable_optimize_b is true to detect potential RD bug.
+      const uint8_t disable_txk_check = args->enable_optimize_b;
+      if (!disable_txk_check) {
+        assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row,
+                                                     blk_col)] == DCT_DCT);
+      }
+    }
 #endif
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                mrc_mask,
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                tx_type, tx_size, dst, pd->dst.stride,
-                                p->eobs[block]);
+    update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                     DCT_DCT);
   }
+
+#if CONFIG_MISMATCH_DEBUG
+  if (dry_run == OUTPUT_ENABLED) {
+    int pixel_c, pixel_r;
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int blk_w = block_size_wide[bsize];
+    int blk_h = block_size_high[bsize];
+    mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
+                    pd->subsampling_x, pd->subsampling_y);
+    mismatch_record_block_tx(dst, pd->dst.stride, cm->frame_offset, plane,
+                             pixel_c, pixel_r, blk_w, blk_h,
+                             xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+  }
+#endif
 }
 
-#if CONFIG_VAR_TX
 static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                               void *arg) {
+                               void *arg, int mi_row, int mi_col,
+                               RUN_TYPE dry_run) {
+  (void)mi_row;
+  (void)mi_col;
   struct encode_b_args *const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int tx_row = blk_row >> (1 - pd->subsampling_y);
-  const int tx_col = blk_col >> (1 - pd->subsampling_x);
-  TX_SIZE plane_tx_size;
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
-  plane_tx_size =
-      plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
-            : mbmi->inter_tx_size[tx_row][tx_col];
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
+  if (!plane) {
+    assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] &&
+           tx_size_high[tx_size] >= tx_size_high[plane_tx_size]);
+  }
 
-  if (tx_size == plane_tx_size) {
-    encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+  if (tx_size == plane_tx_size || plane) {
+    encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg,
+                 mi_row, mi_col, dry_run);
   } else {
     assert(tx_size < TX_SIZES_ALL);
-#if CONFIG_RECT_TX_EXT
-    int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize];
-    const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size];
-    if (is_qttx) assert(blk_row == 0 && blk_col == 0 && block == 0);
-#else
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
     assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
     assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
-#endif
     // This is the square transform block partition entry point.
-    int bsl = tx_size_wide_unit[sub_txs];
-    int i;
-    assert(bsl > 0);
-
-    for (i = 0; i < 4; ++i) {
-#if CONFIG_RECT_TX_EXT
-      int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs];
-      const int offsetr =
-          is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0)
-                  : blk_row + ((i >> 1) * bsl);
-      const int offsetc =
-          is_qttx ? (is_wide_tx ? 0 : i * tx_size_wide_unit[sub_txs])
-                  : blk_col + ((i & 0x01) * bsl);
-#else
-      const int offsetr = blk_row + ((i >> 1) * bsl);
-      const int offsetc = blk_col + ((i & 0x01) * bsl);
-#endif
-      int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int step = bsh * bsw;
+    assert(bsw > 0 && bsh > 0);
 
-      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
 
-      encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
-                         arg);
-      block += step;
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+        encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
+                           arg, mi_row, mi_col, dry_run);
+        block += step;
+      }
     }
   }
 }
-#endif
 
 typedef struct encode_block_pass1_args {
   AV1_COMMON *cm;
@@ -843,57 +367,25 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   TxfmParam txfm_param;
   uint8_t *dst;
-  int ctx = 0;
   dst = &pd->dst
              .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
-
   av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  ctx, AV1_XFORM_QUANT_B);
-#if CONFIG_PVQ
-  if (!x->pvq_skip[plane]) {
-    int tx_blk_size;
-    int i, j;
-    // transform block size in pixels
-    tx_blk_size = tx_size_wide[tx_size];
-
-// Since av1 does not have separate function which does inverse transform
-// but av1_inv_txfm_add_*x*() also does addition of predicted image to
-// inverse transformed image,
-// pass blank dummy image to av1_inv_txfm_add_*x*(), i.e. set dst as zeros
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      for (j = 0; j < tx_blk_size; j++)
-        for (i = 0; i < tx_blk_size; i++)
-          CONVERT_TO_SHORTPTR(dst)[j * pd->dst.stride + i] = 0;
-    } else {
-#endif  // CONFIG_HIGHBITDEPTH
-      for (j = 0; j < tx_blk_size; j++)
-        for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
-#if CONFIG_HIGHBITDEPTH
-    }
-#endif  // CONFIG_HIGHBITDEPTH
-  }
-#endif  // CONFIG_PVQ
+                  DCT_DCT, AV1_XFORM_QUANT_B);
 
-#if !CONFIG_PVQ
-  if (p->eobs[block] > 0)
-#endif
-  {
+  if (p->eobs[block] > 0) {
     txfm_param.bd = xd->bd;
+    txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
     txfm_param.tx_type = DCT_DCT;
+    txfm_param.tx_size = tx_size;
     txfm_param.eob = p->eobs[block];
-    txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    txfm_param.lossless = xd->lossless[xd->mi[0]->segment_id];
+    txfm_param.tx_set_type = av1_get_ext_tx_set_type(
+        txfm_param.tx_size, is_inter_block(xd->mi[0]), cm->reduced_tx_set_used);
+    if (txfm_param.is_hbd) {
       av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param);
       return;
     }
-#endif  //  CONFIG_HIGHBITDEPTH
-    if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-      av1_iwht4x4_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
-    } else {
-      av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
-    }
+    av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
   }
 }
 
@@ -904,20 +396,28 @@ void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) {
                                          encode_block_pass1, &args);
 }
 
-void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
-                   int mi_col) {
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                   int mi_row, int mi_col, RUN_TYPE dry_run) {
+  (void)dry_run;
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 };
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  struct encode_b_args arg = { cpi,
+                               x,
+                               &ctx,
+                               &mbmi->skip,
+                               NULL,
+                               NULL,
+                               cpi->optimize_seg_arr[mbmi->segment_id] };
   int plane;
 
   mbmi->skip = 1;
 
   if (x->skip) return;
 
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+  for (plane = 0; plane < num_planes; ++plane) {
     const int subsampling_x = xd->plane[plane].subsampling_x;
     const int subsampling_y = xd->plane[plane].subsampling_y;
 
@@ -925,41 +425,32 @@ void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
                              subsampling_y))
       continue;
 
-    bsize = scale_chroma_bsize(bsize, subsampling_x, subsampling_y);
-#else
-    (void)mi_row;
-    (void)mi_col;
-#endif
+    const BLOCK_SIZE bsizec =
+        scale_chroma_bsize(bsize, subsampling_x, subsampling_y);
 
-#if CONFIG_VAR_TX
     // TODO(jingning): Clean this up.
     const struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
     const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-    const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
-    const TX_SIZE max_tx_size = get_vartx_max_txsize(
-        mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y);
+    const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+
     const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
     const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
-    const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
+    const int bh = block_size_high[txb_size] >> tx_size_high_log2[0];
     int idx, idy;
     int block = 0;
     int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
-    av1_get_entropy_contexts(bsize, 0, pd, ctx.ta[plane], ctx.tl[plane]);
-#else
-    const struct macroblockd_plane *const pd = &xd->plane[plane];
-    const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-    av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
-#endif
+    av1_get_entropy_contexts(bsizec, pd, ctx.ta[plane], ctx.tl[plane]);
+
+    av1_subtract_plane(x, bsizec, plane);
 
-#if !CONFIG_PVQ
-    av1_subtract_plane(x, bsize, plane);
-#endif
     arg.ta = ctx.ta[plane];
     arg.tl = ctx.tl[plane];
 
-#if CONFIG_VAR_TX
-    const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd);
+    const BLOCK_SIZE max_unit_bsize =
+        get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
     int mu_blocks_wide =
         block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
     int mu_blocks_high =
@@ -976,67 +467,14 @@ void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
         for (blk_row = idy; blk_row < unit_height; blk_row += bh) {
           for (blk_col = idx; blk_col < unit_width; blk_col += bw) {
             encode_block_inter(plane, block, blk_row, blk_col, plane_bsize,
-                               max_tx_size, &arg);
+                               max_tx_size, &arg, mi_row, mi_col, dry_run);
             block += step;
           }
         }
       }
     }
-#else
-    av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
-                                           &arg);
-#endif
-  }
-}
-
-#if CONFIG_SUPERTX
-void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct optimize_ctx ctx;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 };
-  int plane;
-
-  mbmi->skip = 1;
-  if (x->skip) return;
-
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    const struct macroblockd_plane *const pd = &xd->plane[plane];
-#if CONFIG_VAR_TX
-    const TX_SIZE tx_size = TX_4X4;
-#else
-    const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-#endif
-    av1_subtract_plane(x, bsize, plane);
-    av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
-    arg.ta = ctx.ta[plane];
-    arg.tl = ctx.tl[plane];
-    av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
-                                           &arg);
   }
 }
-#endif  // CONFIG_SUPERTX
-
-#if !CONFIG_PVQ
-void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
-                         ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
-  (void)tx_size;
-  struct macroblock_plane *p = &x->plane[plane];
-
-#if !CONFIG_LV_MAP
-  *a = *l = p->eobs[block] > 0;
-#else   // !CONFIG_LV_MAP
-  *a = *l = p->txb_entropy_ctx[block];
-#endif  // !CONFIG_LV_MAP
-
-#if CONFIG_VAR_TX || CONFIG_LV_MAP
-  int i;
-  for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) a[i] = a[0];
-
-  for (i = 0; i < tx_size_high_unit[tx_size]; ++i) l[i] = l[0];
-#endif
-}
-#endif
 
 static void encode_block_intra_and_set_context(int plane, int block,
                                                int blk_row, int blk_col,
@@ -1044,260 +482,113 @@ static void encode_block_intra_and_set_context(int plane, int block,
                                                TX_SIZE tx_size, void *arg) {
   av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
                          arg);
-#if !CONFIG_PVQ
+
   struct encode_b_args *const args = arg;
   MACROBLOCK *x = args->x;
   ENTROPY_CONTEXT *a = &args->ta[blk_col];
   ENTROPY_CONTEXT *l = &args->tl[blk_row];
   av1_set_txb_context(x, plane, block, tx_size, a, l);
-#endif
 }
 
 void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                             void *arg) {
   struct encode_b_args *const args = arg;
-  AV1_COMMON *cm = args->cm;
+  const AV1_COMMON *const cm = &args->cpi->common;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-  uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
   PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
+  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+                                          tx_size, cm->reduced_tx_set_used);
   uint16_t *eob = &p->eobs[block];
   const int dst_stride = pd->dst.stride;
   uint8_t *dst =
       &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+  int dummy_rate_cost = 0;
 
-  av1_predict_intra_block_facade(cm, xd, plane, block, blk_col, blk_row,
-                                 tx_size);
-
-  av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+  av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
 
-  const ENTROPY_CONTEXT *a = &args->ta[blk_col];
-  const ENTROPY_CONTEXT *l = &args->tl[blk_row];
-  int ctx = combine_entropy_contexts(*a, *l);
-  if (args->enable_optimize_b) {
-    av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                    ctx, AV1_XFORM_QUANT_FP);
-    av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l, 0);
+  const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+  // Assert not magic number (uninitialized).
+  assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234);
+  if (plane == 0 && x->blk_skip[blk_row * bw + blk_col]) {
+    *eob = 0;
+    p->txb_entropy_ctx[block] = 0;
   } else {
-    av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                    ctx, AV1_XFORM_QUANT_B);
+    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+
+    const ENTROPY_CONTEXT *a = &args->ta[blk_col];
+    const ENTROPY_CONTEXT *l = &args->tl[blk_row];
+    if (args->enable_optimize_b) {
+      av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+                      tx_size, tx_type, AV1_XFORM_QUANT_FP);
+      TXB_CTX txb_ctx;
+      get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+      av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
+                     &dummy_rate_cost);
+    } else {
+      av1_xform_quant(
+          cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+    }
   }
 
-#if CONFIG_PVQ
-  // *(args->skip) == mbmi->skip
-  if (!x->pvq_skip[plane]) *(args->skip) = 0;
+  if (*eob) {
+    av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+                                dst_stride, *eob, cm->reduced_tx_set_used);
+  }
 
-  if (x->pvq_skip[plane]) return;
-#endif  // CONFIG_PVQ
-  av1_inverse_transform_block(xd, dqcoeff,
-#if CONFIG_LGT_FROM_PRED
-                              xd->mi[0]->mbmi.mode,
-#endif
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                              mrc_mask,
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                              tx_type, tx_size, dst, dst_stride, *eob);
-#if !CONFIG_PVQ
-  if (*eob) *(args->skip) = 0;
-#else
-// Note : *(args->skip) == mbmi->skip
+  if (*eob == 0 && plane == 0) {
+  // TODO(jingning): Temporarily disable txk_type check for eob=0 case.
+  // It is possible that certain collision in hash index would cause
+  // the assertion failure. To further optimize the rate-distortion
+  // performance, we need to re-visit this part and enable this assert
+  // again.
+#if 0
+    if (args->cpi->oxcf.aq_mode == NO_AQ
+        && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+      assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row,
+                                                   blk_col)] == DCT_DCT);
+    }
 #endif
-#if CONFIG_CFL
-  if (plane == AOM_PLANE_Y && xd->cfl->store_y) {
+    update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                     DCT_DCT);
+  }
+
+  // For intra mode, skipped blocks are so rare that transmitting skip=1 is
+  // very expensive.
+  *(args->skip) = 0;
+
+  if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
     cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
   }
-#endif  // CONFIG_CFL
 }
 
-void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
                                   BLOCK_SIZE bsize, int plane,
                                   int enable_optimize_b, int mi_row,
                                   int mi_col) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE] = { 0 };
-  ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE] = { 0 };
+  ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
+  ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
 
   struct encode_b_args arg = {
-    cm, x, NULL, &xd->mi[0]->mbmi.skip, ta, tl, enable_optimize_b
+    cpi, x, NULL, &(xd->mi[0]->skip), ta, tl, enable_optimize_b
   };
 
-#if CONFIG_CB4X4
   if (!is_chroma_reference(mi_row, mi_col, bsize,
                            xd->plane[plane].subsampling_x,
                            xd->plane[plane].subsampling_y))
     return;
-#else
-  (void)mi_row;
-  (void)mi_col;
-#endif
 
   if (enable_optimize_b) {
     const struct macroblockd_plane *const pd = &xd->plane[plane];
-    const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-    av1_get_entropy_contexts(bsize, tx_size, pd, ta, tl);
+    av1_get_entropy_contexts(bsize, pd, ta, tl);
   }
   av1_foreach_transformed_block_in_plane(
       xd, bsize, plane, encode_block_intra_and_set_context, &arg);
 }
-
-#if CONFIG_PVQ
-PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff,
-                                    tran_low_t *ref_coeff,
-                                    tran_low_t *const dqcoeff, uint16_t *eob,
-                                    const int16_t *quant, int plane,
-                                    TX_SIZE tx_size, TX_TYPE tx_type, int *rate,
-                                    int speed, PVQ_INFO *pvq_info) {
-  const int tx_blk_size = tx_size_wide[tx_size];
-  daala_enc_ctx *daala_enc = &x->daala_enc;
-  PVQ_SKIP_TYPE ac_dc_coded;
-  int coeff_shift = 3 - av1_get_tx_scale(tx_size);
-  int hbd_downshift = 0;
-  int rounding_mask;
-  int pvq_dc_quant;
-  int use_activity_masking = daala_enc->use_activity_masking;
-  int tell;
-  int has_dc_skip = 1;
-  int i;
-  int off = od_qm_offset(tx_size, plane ? 1 : 0);
-
-  DECLARE_ALIGNED(16, tran_low_t, coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
-
-  DECLARE_ALIGNED(16, int32_t, in_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
-  DECLARE_ALIGNED(16, int32_t, ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
-  DECLARE_ALIGNED(16, int32_t, out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
-
-  hbd_downshift = x->e_mbd.bd - 8;
-
-  assert(OD_COEFF_SHIFT >= 4);
-  // DC quantizer for PVQ
-  if (use_activity_masking)
-    pvq_dc_quant =
-        OD_MAXI(1,
-                (quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift) *
-                        daala_enc->state
-                            .pvq_qm_q4[plane][od_qm_get_index(tx_size, 0)] >>
-                    4);
-  else
-    pvq_dc_quant =
-        OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift);
-
-  *eob = 0;
-
-#if !CONFIG_ANS
-  tell = od_ec_enc_tell_frac(&daala_enc->w.ec);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-
-  // Change coefficient ordering for pvq encoding.
-  od_raster_to_coding_order(coeff_pvq, tx_blk_size, tx_type, coeff,
-                            tx_blk_size);
-  od_raster_to_coding_order(ref_coeff_pvq, tx_blk_size, tx_type, ref_coeff,
-                            tx_blk_size);
-
-  // copy int16 inputs to int32
-  for (i = 0; i < tx_blk_size * tx_blk_size; i++) {
-    ref_int32[i] =
-        AOM_SIGNED_SHL(ref_coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >>
-        hbd_downshift;
-    in_int32[i] = AOM_SIGNED_SHL(coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >>
-                  hbd_downshift;
-  }
-
-  if (abs(in_int32[0] - ref_int32[0]) < pvq_dc_quant * 141 / 256) { /* 0.55 */
-    out_int32[0] = 0;
-  } else {
-    out_int32[0] = OD_DIV_R0(in_int32[0] - ref_int32[0], pvq_dc_quant);
-  }
-
-  ac_dc_coded = od_pvq_encode(
-      daala_enc, ref_int32, in_int32, out_int32,
-      OD_MAXI(1,
-              quant[0] << (OD_COEFF_SHIFT - 3) >>
-                  hbd_downshift),  // scale/quantizer
-      OD_MAXI(1,
-              quant[1] << (OD_COEFF_SHIFT - 3) >>
-                  hbd_downshift),  // scale/quantizer
-      plane, tx_size, OD_PVQ_BETA[use_activity_masking][plane][tx_size],
-      0,  // is_keyframe,
-      daala_enc->state.qm + off, daala_enc->state.qm_inv + off,
-      speed,  // speed
-      pvq_info);
-
-  // Encode residue of DC coeff, if required.
-  if (!has_dc_skip || out_int32[0]) {
-    generic_encode(&daala_enc->w, &daala_enc->state.adapt->model_dc[plane],
-                   abs(out_int32[0]) - has_dc_skip,
-                   &daala_enc->state.adapt->ex_dc[plane][tx_size][0], 2);
-  }
-  if (out_int32[0]) {
-    aom_write_bit(&daala_enc->w, out_int32[0] < 0);
-  }
-
-  // need to save quantized residue of DC coeff
-  // so that final pvq bitstream writing can know whether DC is coded.
-  if (pvq_info) pvq_info->dq_dc_residue = out_int32[0];
-
-  out_int32[0] = out_int32[0] * pvq_dc_quant;
-  out_int32[0] += ref_int32[0];
-
-  // copy int32 result back to int16
-  assert(OD_COEFF_SHIFT > coeff_shift);
-  rounding_mask = (1 << (OD_COEFF_SHIFT - coeff_shift - 1)) - 1;
-  for (i = 0; i < tx_blk_size * tx_blk_size; i++) {
-    out_int32[i] = AOM_SIGNED_SHL(out_int32[i], hbd_downshift);
-    dqcoeff_pvq[i] = (out_int32[i] + (out_int32[i] < 0) + rounding_mask) >>
-                     (OD_COEFF_SHIFT - coeff_shift);
-  }
-
-  // Back to original coefficient order
-  od_coding_order_to_raster(dqcoeff, tx_blk_size, tx_type, dqcoeff_pvq,
-                            tx_blk_size);
-
-  *eob = tx_blk_size * tx_blk_size;
-
-#if !CONFIG_ANS
-  *rate = (od_ec_enc_tell_frac(&daala_enc->w.ec) - tell)
-          << (AV1_PROB_COST_SHIFT - OD_BITRES);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-  assert(*rate >= 0);
-
-  return ac_dc_coded;
-}
-
-void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k,
-                            od_coeff *y, int nb_bands, const int *off,
-                            int *size, int skip_rest, int skip_dir,
-                            int bs) {  // block size in log_2 -2
-  int i;
-  const int tx_blk_size = tx_size_wide[bs];
-
-  for (i = 0; i < nb_bands; i++) {
-    pvq_info->qg[i] = qg[i];
-    pvq_info->theta[i] = theta[i];
-    pvq_info->k[i] = k[i];
-    pvq_info->off[i] = off[i];
-    pvq_info->size[i] = size[i];
-  }
-
-  memcpy(pvq_info->y, y, tx_blk_size * tx_blk_size * sizeof(od_coeff));
-
-  pvq_info->nb_bands = nb_bands;
-  pvq_info->skip_rest = skip_rest;
-  pvq_info->skip_dir = skip_dir;
-  pvq_info->bs = bs;
-}
-#endif
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
index c817a94f0..673f87ea7 100644
--- a/third_party/aom/av1/encoder/encodemb.h
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -12,21 +12,23 @@
 #ifndef AV1_ENCODER_ENCODEMB_H_
 #define AV1_ENCODER_ENCODEMB_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
 #include "av1/encoder/block.h"
-
+#include "av1/encoder/tokenize.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 struct optimize_ctx {
-  ENTROPY_CONTEXT ta[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
-  ENTROPY_CONTEXT tl[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE];
 };
 
 struct encode_b_args {
-  AV1_COMMON *cm;
+  const struct AV1_COMP *cpi;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
   int8_t *skip;
@@ -43,52 +45,39 @@ typedef enum AV1_XFORM_QUANT {
   AV1_XFORM_QUANT_TYPES,
 } AV1_XFORM_QUANT;
 
-void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
-                   int mi_col);
-#if CONFIG_SUPERTX
-void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
-#endif  // CONFIG_SUPERTX
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                   int mi_row, int mi_col, RUN_TYPE dry_run);
 void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
 void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
                      int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
-                     TX_SIZE tx_size, int ctx, AV1_XFORM_QUANT xform_quant_idx);
+                     TX_SIZE tx_size, TX_TYPE tx_type,
+                     AV1_XFORM_QUANT xform_quant_idx);
 
-int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row,
-                   int blk_col, int block, BLOCK_SIZE plane_bsize,
-                   TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
-                   const ENTROPY_CONTEXT *l, int fast_mode);
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+                   int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                   const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost);
 
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
                       int blk_col, int blk_row, TX_SIZE tx_size);
 
 void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
-#if !CONFIG_PVQ
-void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
-                         ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l);
-#endif
+static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block,
+                                       TX_SIZE tx_size, ENTROPY_CONTEXT *a,
+                                       ENTROPY_CONTEXT *l) {
+  const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block];
+  memset(a, ctx, tx_size_wide_unit[tx_size] * sizeof(*a));
+  memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l));
+}
 
 void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
 
-void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
                                   BLOCK_SIZE bsize, int plane,
                                   int enable_optimize_b, int mi_row,
                                   int mi_col);
 
-#if CONFIG_PVQ
-PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff,
-                                    tran_low_t *ref_coeff,
-                                    tran_low_t *const dqcoeff, uint16_t *eob,
-                                    const int16_t *quant, int plane,
-                                    TX_SIZE tx_size, TX_TYPE tx_type, int *rate,
-                                    int speed, PVQ_INFO *pvq_info);
-
-void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k,
-                            od_coeff *y, int nb_bands, const int *off,
-                            int *size, int skip_rest, int skip_dir, int bs);
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
index f8a546999..944e2c53d 100644
--- a/third_party/aom/av1/encoder/encodemv.c
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -16,20 +16,9 @@
 
 #include "av1/encoder/cost.h"
 #include "av1/encoder/encodemv.h"
-#include "av1/encoder/subexp.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 
-static struct av1_token mv_joint_encodings[MV_JOINTS];
-static struct av1_token mv_class_encodings[MV_CLASSES];
-static struct av1_token mv_fp_encodings[MV_FP_SIZE];
-
-void av1_entropy_mv_init(void) {
-  av1_tokens_from_tree(mv_joint_encodings, av1_mv_joint_tree);
-  av1_tokens_from_tree(mv_class_encodings, av1_mv_class_tree);
-  av1_tokens_from_tree(mv_fp_encodings, av1_mv_fp_tree);
-}
-
 static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
                                 MvSubpelPrecision precision) {
   int offset;
@@ -42,38 +31,23 @@ static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
 
   assert(comp != 0);
 
-// Sign
-#if CONFIG_NEW_MULTISYMBOL
-  aom_write_bit(w, sign);
-#else
-  aom_write(w, sign, mvcomp->sign);
-#endif
+  // Sign
+  aom_write_symbol(w, sign, mvcomp->sign_cdf, 2);
 
   // Class
-  aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES);
+  aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES);
 
   // Integer bits
   if (mv_class == MV_CLASS_0) {
-#if CONFIG_NEW_MULTISYMBOL
     aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE);
-#else
-    aom_write(w, d, mvcomp->class0[0]);
-#endif
   } else {
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
-#if CONFIG_NEW_MULTISYMBOL
     for (i = 0; i < n; ++i)
-      aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[(i + 1) / 2], 2);
-#else
-    for (i = 0; i < n; ++i) aom_write(w, (d >> i) & 1, mvcomp->bits[i]);
-#endif
+      aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[i], 2);
   }
-// Fractional bits
-#if CONFIG_INTRABC || CONFIG_AMVR
-  if (precision > MV_SUBPEL_NONE)
-#endif  // CONFIG_INTRABC || CONFIG_AMVR
-  {
+  // Fractional bits
+  if (precision > MV_SUBPEL_NONE) {
     aom_write_symbol(
         w, fr,
         mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
@@ -82,13 +56,9 @@ static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
 
   // High precision bit
   if (precision > MV_SUBPEL_LOW_PRECISION)
-#if CONFIG_NEW_MULTISYMBOL
     aom_write_symbol(
         w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf,
         2);
-#else
-    aom_write(w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
-#endif
 }
 
 static void build_nmv_component_cost_table(int *mvcost,
@@ -100,24 +70,20 @@ static void build_nmv_component_cost_table(int *mvcost,
   int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
   int class0_hp_cost[2], hp_cost[2];
 
-  sign_cost[0] = av1_cost_zero(mvcomp->sign);
-  sign_cost[1] = av1_cost_one(mvcomp->sign);
-  av1_cost_tokens(class_cost, mvcomp->classes, av1_mv_class_tree);
-  av1_cost_tokens(class0_cost, mvcomp->class0, av1_mv_class0_tree);
+  av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL);
+  av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL);
+  av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL);
   for (i = 0; i < MV_OFFSET_BITS; ++i) {
-    bits_cost[i][0] = av1_cost_zero(mvcomp->bits[i]);
-    bits_cost[i][1] = av1_cost_one(mvcomp->bits[i]);
+    av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL);
   }
 
   for (i = 0; i < CLASS0_SIZE; ++i)
-    av1_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], av1_mv_fp_tree);
-  av1_cost_tokens(fp_cost, mvcomp->fp, av1_mv_fp_tree);
+    av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], NULL);
+  av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
 
   if (precision > MV_SUBPEL_LOW_PRECISION) {
-    class0_hp_cost[0] = av1_cost_zero(mvcomp->class0_hp);
-    class0_hp_cost[1] = av1_cost_one(mvcomp->class0_hp);
-    hp_cost[0] = av1_cost_zero(mvcomp->hp);
-    hp_cost[1] = av1_cost_one(mvcomp->hp);
+    av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL);
+    av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL);
   }
   mvcost[0] = 0;
   for (v = 1; v <= MV_MAX; ++v) {
@@ -134,10 +100,7 @@ static void build_nmv_component_cost_table(int *mvcost,
       const int b = c + CLASS0_BITS - 1; /* number of bits */
       for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)];
     }
-#if CONFIG_INTRABC || CONFIG_AMVR
-    if (precision > MV_SUBPEL_NONE)
-#endif  // CONFIG_INTRABC || CONFIG_AMVR
-    {
+    if (precision > MV_SUBPEL_NONE) {
       if (c == MV_CLASS_0) {
         cost += class0_fp_cost[d][f];
       } else {
@@ -156,50 +119,14 @@ static void build_nmv_component_cost_table(int *mvcost,
   }
 }
 
-#if !CONFIG_NEW_MULTISYMBOL
-static void update_mv(aom_writer *w, const unsigned int ct[2], aom_prob *cur_p,
-                      aom_prob upd_p) {
-  (void)upd_p;
-  // Just use the default maximum number of tile groups to avoid passing in the
-  // actual
-  // number
-  av1_cond_prob_diff_update(w, cur_p, ct, DEFAULT_MAX_NUM_TG);
-}
-
-void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w,
-                         nmv_context_counts *const nmv_counts) {
-  int i;
-  int nmv_ctx = 0;
-#if CONFIG_AMVR
-  if (cm->cur_frame_mv_precision_level) {
-    return;
-  }
-#endif
-  for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
-    nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx];
-    nmv_context_counts *const counts = &nmv_counts[nmv_ctx];
-
-    if (usehp) {
-      for (i = 0; i < 2; ++i) {
-        update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
-                  MV_UPDATE_PROB);
-        update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB);
-      }
-    }
-  }
-}
-#endif
-
 void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
                    nmv_context *mvctx, int usehp) {
   const MV diff = { mv->row - ref->row, mv->col - ref->col };
   const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
-#if CONFIG_AMVR
-  if (cpi->common.cur_frame_mv_precision_level) {
+  if (cpi->common.cur_frame_force_integer_mv) {
     usehp = MV_SUBPEL_NONE;
   }
-#endif
-  aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
+  aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS);
   if (mv_joint_vertical(j))
     encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
 
@@ -214,212 +141,81 @@ void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
   }
 }
 
-#if CONFIG_INTRABC
 void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
                    nmv_context *mvctx) {
+  // DV and ref DV should not have sub-pel.
+  assert((mv->col & 7) == 0);
+  assert((mv->row & 7) == 0);
+  assert((ref->col & 7) == 0);
+  assert((ref->row & 7) == 0);
   const MV diff = { mv->row - ref->row, mv->col - ref->col };
   const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
 
-  aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
+  aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS);
   if (mv_joint_vertical(j))
     encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE);
 
   if (mv_joint_horizontal(j))
     encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE);
 }
-#endif  // CONFIG_INTRABC
 
 void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
                               const nmv_context *ctx,
                               MvSubpelPrecision precision) {
-  av1_cost_tokens(mvjoint, ctx->joints, av1_mv_joint_tree);
+  av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL);
   build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision);
   build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision);
 }
 
-static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
-                    const int_mv mvs[2], const int_mv pred_mvs[2],
-                    nmv_context_counts *nmv_counts
-#if CONFIG_AMVR
-                    ,
-                    MvSubpelPrecision precision
-#endif
-                    ) {
-  int i;
-  PREDICTION_MODE mode = mbmi->mode;
-
-  if (mode == NEWMV || mode == NEW_NEWMV) {
-    for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
-      const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
-      const MV diff = { mvs[i].as_mv.row - ref->row,
-                        mvs[i].as_mv.col - ref->col };
-      int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-      int nmv_ctx =
-          av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                      mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
-      nmv_context_counts *counts = &nmv_counts[nmv_ctx];
-      (void)pred_mvs;
-#if CONFIG_AMVR
-      av1_inc_mv(&diff, counts, precision);
-#else
-      av1_inc_mv(&diff, counts, 1);
-#endif
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+                                 const MV_REFERENCE_FRAME *ref_frame,
+                                 int ref_mv_idx,
+                                 const MB_MODE_INFO_EXT *mbmi_ext) {
+  const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+  const CANDIDATE_MV *curr_ref_mv_stack =
+      mbmi_ext->ref_mv_stack[ref_frame_type];
+  int_mv ref_mv;
+  ref_mv.as_int = INVALID_MV;
+
+  if (ref_frame[1] > INTRA_FRAME) {
+    if (ref_idx == 0) {
+      ref_mv = curr_ref_mv_stack[ref_mv_idx].this_mv;
+    } else {
+      assert(ref_idx == 1);
+      ref_mv = curr_ref_mv_stack[ref_mv_idx].comp_mv;
     }
-  } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
-    const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv;
-    const MV diff = { mvs[1].as_mv.row - ref->row,
-                      mvs[1].as_mv.col - ref->col };
-    int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-    int nmv_ctx =
-        av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                    mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
-    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
-#if CONFIG_AMVR
-    av1_inc_mv(&diff, counts, precision);
-#else
-    av1_inc_mv(&diff, counts, 1);
-#endif
-  } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
-    const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
-    const MV diff = { mvs[0].as_mv.row - ref->row,
-                      mvs[0].as_mv.col - ref->col };
-    int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-    int nmv_ctx =
-        av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                    mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
-    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
-#if CONFIG_AMVR
-    av1_inc_mv(&diff, counts, precision);
-#else
-    av1_inc_mv(&diff, counts, 1);
-#endif
-#if CONFIG_COMPOUND_SINGLEREF
   } else {
-    assert(  // mode == SR_NEAREST_NEWMV ||
-        mode == SR_NEAR_NEWMV || mode == SR_ZERO_NEWMV || mode == SR_NEW_NEWMV);
-    const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
-    int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-    int nmv_ctx =
-        av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                    mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
-    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
-    (void)pred_mvs;
-    MV diff;
-    if (mode == SR_NEW_NEWMV) {
-      diff.row = mvs[0].as_mv.row - ref->row;
-      diff.col = mvs[0].as_mv.col - ref->col;
-      av1_inc_mv(&diff, counts, 1);
+    assert(ref_idx == 0);
+    if (ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type]) {
+      ref_mv = curr_ref_mv_stack[ref_mv_idx].this_mv;
+    } else {
+      ref_mv = mbmi_ext->global_mvs[ref_frame_type];
     }
-    diff.row = mvs[1].as_mv.row - ref->row;
-    diff.col = mvs[1].as_mv.col - ref->col;
-    av1_inc_mv(&diff, counts, 1);
-#endif  // CONFIG_COMPOUND_SINGLEREF
   }
+  return ref_mv;
 }
 
-static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2],
-                           const MB_MODE_INFO_EXT *mbmi_ext,
-                           nmv_context_counts *nmv_counts
-#if CONFIG_AMVR
-                           ,
-                           MvSubpelPrecision precision
-#endif
-                           ) {
-  int i;
-  PREDICTION_MODE mode = mi->bmi[block].as_mode;
-  const MB_MODE_INFO *mbmi = &mi->mbmi;
-
-  if (mode == NEWMV || mode == NEW_NEWMV) {
-    for (i = 0; i < 1 + has_second_ref(&mi->mbmi); ++i) {
-      const MV *ref = &mi->bmi[block].ref_mv[i].as_mv;
-      const MV diff = { mvs[i].as_mv.row - ref->row,
-                        mvs[i].as_mv.col - ref->col };
-      int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-      int nmv_ctx =
-          av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                      mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
-      nmv_context_counts *counts = &nmv_counts[nmv_ctx];
-#if CONFIG_AMVR
-      av1_inc_mv(&diff, counts, precision);
-#else
-      av1_inc_mv(&diff, counts, 1);
-#endif
-    }
-  } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
-    const MV *ref = &mi->bmi[block].ref_mv[1].as_mv;
-    const MV diff = { mvs[1].as_mv.row - ref->row,
-                      mvs[1].as_mv.col - ref->col };
-    int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-    int nmv_ctx =
-        av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                    mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
-    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
-#if CONFIG_AMVR
-    av1_inc_mv(&diff, counts, precision);
-#else
-    av1_inc_mv(&diff, counts, 1);
-#endif
-  } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
-    const MV *ref = &mi->bmi[block].ref_mv[0].as_mv;
-    const MV diff = { mvs[0].as_mv.row - ref->row,
-                      mvs[0].as_mv.col - ref->col };
-    int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-    int nmv_ctx =
-        av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                    mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
-    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
-#if CONFIG_AMVR
-    av1_inc_mv(&diff, counts, precision);
-#else
-    av1_inc_mv(&diff, counts, 1);
-#endif
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  int ref_mv_idx = mbmi->ref_mv_idx;
+  if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) {
+    assert(has_second_ref(mbmi));
+    ref_mv_idx += 1;
   }
+  return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx,
+                                   x->mbmi_ext);
 }
 
-void av1_update_mv_count(ThreadData *td) {
-  const MACROBLOCKD *xd = &td->mb.e_mbd;
-  const MODE_INFO *mi = xd->mi[0];
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext;
-#if CONFIG_CB4X4
-  const int unify_bsize = 1;
-#else
-  const int unify_bsize = 0;
-#endif
-#if CONFIG_AMVR
-  MvSubpelPrecision precision = 1;
-  if (xd->cur_frame_mv_precision_level) {
-    precision = MV_SUBPEL_NONE;
-  }
-#endif
-
-  if (mbmi->sb_type < BLOCK_8X8 && !unify_bsize) {
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
-    const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type];
-    int idx, idy;
-
-    for (idy = 0; idy < 2; idy += num_4x4_h) {
-      for (idx = 0; idx < 2; idx += num_4x4_w) {
-        const int i = idy * 2 + idx;
-
-        if (have_newmv_in_inter_mode(mi->bmi[i].as_mode))
-
-#if CONFIG_AMVR
-          inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, mbmi_ext, td->counts->mv,
-                         precision);
-#else
-          inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, mbmi_ext, td->counts->mv);
-#endif
-      }
-    }
-  } else {
-    if (have_newmv_in_inter_mode(mbmi->mode))
-
-#if CONFIG_AMVR
-      inc_mvs(mbmi, mbmi_ext, mbmi->mv, mbmi->pred_mv, td->counts->mv,
-              precision);
-#else
-      inc_mvs(mbmi, mbmi_ext, mbmi->mv, mbmi->pred_mv, td->counts->mv);
-#endif
-  }
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+                                      const MB_MODE_INFO_EXT *mbmi_ext,
+                                      MV_REFERENCE_FRAME ref_frame,
+                                      int_mv *nearest_mv, int_mv *near_mv,
+                                      int is_integer) {
+  const int ref_idx = 0;
+  MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+  *nearest_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 0, mbmi_ext);
+  lower_mv_precision(&nearest_mv->as_mv, allow_hp, is_integer);
+  *near_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 1, mbmi_ext);
+  lower_mv_precision(&near_mv->as_mv, allow_hp, is_integer);
 }
diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h
index 8689cec27..64e9e7162 100644
--- a/third_party/aom/av1/encoder/encodemv.h
+++ b/third_party/aom/av1/encoder/encodemv.h
@@ -18,13 +18,6 @@
 extern "C" {
 #endif
 
-void av1_entropy_mv_init(void);
-
-#if !CONFIG_NEW_MULTISYMBOL
-void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w,
-                         nmv_context_counts *const counts);
-#endif
-
 void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
                    nmv_context *mvctx, int usehp);
 
@@ -34,10 +27,18 @@ void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
 
 void av1_update_mv_count(ThreadData *td);
 
-#if CONFIG_INTRABC
 void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
                    nmv_context *mvctx);
-#endif  // CONFIG_INTRABC
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx);
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+                                 const MV_REFERENCE_FRAME *ref_frame,
+                                 int ref_mv_idx,
+                                 const MB_MODE_INFO_EXT *mbmi_ext);
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+                                      const MB_MODE_INFO_EXT *mbmi_ext,
+                                      MV_REFERENCE_FRAME ref_frame,
+                                      int_mv *nearest_mv, int_mv *near_mv,
+                                      int is_integer);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
index e9ab3c87f..196e18d8a 100644
--- a/third_party/aom/av1/encoder/encoder.c
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -13,12 +13,13 @@
 #include <math.h>
 #include <stdio.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
 
 #include "av1/common/alloccommon.h"
-#if CONFIG_CDEF
 #include "av1/common/cdef.h"
-#endif  // CONFIG_CDEF
 #include "av1/common/filter.h"
 #include "av1/common/idct.h"
 #include "av1/common/reconinter.h"
@@ -30,32 +31,17 @@
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/aq_variance.h"
 #include "av1/encoder/bitstream.h"
-#if CONFIG_BGSPRITE
-#include "av1/encoder/bgsprite.h"
-#endif  // CONFIG_BGSPRITE
-#if CONFIG_ANS
-#include "aom_dsp/buf_ans.h"
-#endif
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
-#if CONFIG_LV_MAP
 #include "av1/encoder/encodetxb.h"
-#endif
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/firstpass.h"
-#if CONFIG_HASH_ME
 #include "av1/encoder/hash_motion.h"
-#endif
 #include "av1/encoder/mbgraph.h"
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-#include "av1/common/ncobmc_kernels.h"
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
 #include "av1/encoder/picklpf.h"
-#if CONFIG_LOOP_RESTORATION
 #include "av1/encoder/pickrst.h"
-#endif  // CONFIG_LOOP_RESTORATION
 #include "av1/encoder/random.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
@@ -63,45 +49,41 @@
 #include "av1/encoder/speed_features.h"
 #include "av1/encoder/temporal_filter.h"
 
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-#include "./aom_scale_rtcd.h"
 #include "aom_dsp/psnr.h"
 #if CONFIG_INTERNAL_STATS
 #include "aom_dsp/ssim.h"
 #endif
+#include "av1/encoder/grain_test_vectors.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
 #include "aom_scale/aom_scale.h"
-#if CONFIG_BITSTREAM_DEBUG
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 #include "aom_util/debug_util.h"
-#endif  // CONFIG_BITSTREAM_DEBUG
+#endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
+
+// av1 uses 10,000,000 ticks/second as time stamp
+#define TICKS_PER_SEC 10000000LL
 
 #if CONFIG_ENTROPY_STATS
 FRAME_COUNTS aggregate_fc;
-// Aggregate frame counts per frame context type
-FRAME_COUNTS aggregate_fc_per_type[FRAME_CONTEXTS];
 #endif  // CONFIG_ENTROPY_STATS
 
 #define AM_SEGMENT_ID_INACTIVE 7
 #define AM_SEGMENT_ID_ACTIVE 0
 
-#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */
+// Whether to use high precision mv for altref computation.
+#define ALTREF_HIGH_PRECISION_MV 1
 
-#define ALTREF_HIGH_PRECISION_MV 1     // Whether to use high precision mv
-                                       //  for altref computation.
-#define HIGH_PRECISION_MV_QTHRESH 200  // Q threshold for high precision
-                                       // mv. Choose a very high value for
-                                       // now so that HIGH_PRECISION is always
-                                       // chosen.
+// Q threshold for high precision mv. Choose a very high value for now so that
+// HIGH_PRECISION is always chosen.
+#define HIGH_PRECISION_MV_QTHRESH 200
 
 // #define OUTPUT_YUV_REC
-#ifdef OUTPUT_YUV_DENOISED
-FILE *yuv_denoised_file = NULL;
-#endif
 #ifdef OUTPUT_YUV_SKINMAP
 FILE *yuv_skinmap_file = NULL;
 #endif
@@ -110,20 +92,6 @@ FILE *yuv_rec_file;
 #define FILE_NAME_LEN 100
 #endif
 
-#if 0
-FILE *framepsnr;
-FILE *kf_list;
-FILE *keyfile;
-#endif
-
-#if CONFIG_CFL
-CFL_CTX NULL_CFL;
-#endif
-
-#if CONFIG_INTERNAL_STATS
-typedef enum { Y, U, V, ALL } STAT_TYPE;
-#endif  // CONFIG_INTERNAL_STATS
-
 static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
   switch (mode) {
     case NORMAL:
@@ -180,7 +148,6 @@ static void apply_active_map(AV1_COMP *cpi) {
         if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
       av1_enable_segmentation(seg);
       av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
-#if CONFIG_LOOPFILTER_LEVEL
       av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
       av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
       av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
@@ -194,23 +161,12 @@ static void apply_active_map(AV1_COMP *cpi) {
                       -MAX_LOOP_FILTER);
       av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V,
                       -MAX_LOOP_FILTER);
-#else
-      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
-      // Setting the data to -MAX_LOOP_FILTER will result in the computed loop
-      // filter level being zero regardless of the value of seg->abs_delta.
-      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF,
-                      -MAX_LOOP_FILTER);
-#endif  // CONFIG_LOOPFILTER_LEVEL
     } else {
       av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
-#if CONFIG_LOOPFILTER_LEVEL
       av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
       av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
       av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
       av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
-#else
-      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
-#endif  // CONFIG_LOOPFILTER_LEVEL
       if (seg->enabled) {
         seg->update_data = 1;
         seg->update_map = 1;
@@ -277,54 +233,45 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
   }
 }
 
-static void set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv
-#if CONFIG_AMVR
-                                  ,
-                                  int cur_frame_mv_precision_level
-#endif
-                                  ) {
+static void set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv,
+                                  int cur_frame_force_integer_mv) {
   MACROBLOCK *const mb = &cpi->td.mb;
-  cpi->common.allow_high_precision_mv = allow_high_precision_mv;
-
-#if CONFIG_AMVR
-  if (cpi->common.allow_high_precision_mv &&
-      cur_frame_mv_precision_level == 0) {
-#else
-  if (cpi->common.allow_high_precision_mv) {
-#endif
-    int i;
-    for (i = 0; i < NMV_CONTEXTS; ++i) {
-      mb->mv_cost_stack[i] = mb->nmvcost_hp[i];
-    }
-  } else {
-    int i;
-    for (i = 0; i < NMV_CONTEXTS; ++i) {
-      mb->mv_cost_stack[i] = mb->nmvcost[i];
-    }
-  }
+  cpi->common.allow_high_precision_mv =
+      allow_high_precision_mv && cur_frame_force_integer_mv == 0;
+  const int copy_hp =
+      cpi->common.allow_high_precision_mv && cur_frame_force_integer_mv == 0;
+  int *(*src)[2] = copy_hp ? &mb->nmvcost_hp : &mb->nmvcost;
+  mb->mv_cost_stack = *src;
 }
 
 static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
-#if CONFIG_EXT_PARTITION
+  const AV1_COMMON *const cm = &cpi->common;
+
   if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64)
     return BLOCK_64X64;
-
-  if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
-    return BLOCK_128X128;
+#if CONFIG_FILEOPTIONS
+  if (cm->options && cm->options->ext_partition)
+#endif
+    if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
+      return BLOCK_128X128;
 
   assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
 
-  assert(IMPLIES(cpi->common.tile_cols > 1,
-                 cpi->common.tile_width % MAX_MIB_SIZE == 0));
-  assert(IMPLIES(cpi->common.tile_rows > 1,
-                 cpi->common.tile_height % MAX_MIB_SIZE == 0));
+// TODO(any): Possibly could improve this with a heuristic.
+#if CONFIG_FILEOPTIONS
+  if (cm->options && !cm->options->ext_partition) return BLOCK_64X64;
+#endif
+
+  // When superres / resize is on, 'cm->width / height' can change between
+  // calls, so we don't apply this heuristic there. Also, this heuristic gives
+  // compression gain for speed >= 2 only.
+  if (cpi->oxcf.superres_mode == SUPERRES_NONE &&
+      cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 2) {
+    return (cm->width >= 480 && cm->height >= 360) ? BLOCK_128X128
+                                                   : BLOCK_64X64;
+  }
 
-  // TODO(any): Possibly could improve this with a heuristic.
   return BLOCK_128X128;
-#else
-  (void)cpi;
-  return BLOCK_64X64;
-#endif  //  CONFIG_EXT_PARTITION
 }
 
 static void setup_frame(AV1_COMP *cpi) {
@@ -334,96 +281,82 @@ static void setup_frame(AV1_COMP *cpi) {
   // frames where the error_resilient_mode or intra_only flag is set. For
   // other inter-frames the encoder currently uses only two contexts;
   // context 1 for ALTREF frames and context 0 for the others.
-  if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+
+  cm->primary_ref_frame = PRIMARY_REF_NONE;
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+      cm->force_primary_ref_none) {
     av1_setup_past_independence(cm);
+    for (int i = 0; i < REF_FRAMES; i++) {
+      cm->fb_of_context_type[i] = -1;
+    }
+    cm->fb_of_context_type[REGULAR_FRAME] =
+        get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+    cm->frame_context_idx = REGULAR_FRAME;
   } else {
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-// Just use frame context from first signaled reference frame.
-// This will always be LAST_FRAME for now.
-#else
-#if CONFIG_EXT_REFS
     const GF_GROUP *gf_group = &cpi->twopass.gf_group;
     if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE)
       cm->frame_context_idx = EXT_ARF_FRAME;
     else if (cpi->refresh_alt_ref_frame)
       cm->frame_context_idx = ARF_FRAME;
-#else   // !CONFIG_EXT_REFS
-    if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME;
-#endif  // CONFIG_EXT_REFS
     else if (cpi->rc.is_src_frame_alt_ref)
       cm->frame_context_idx = OVERLAY_FRAME;
     else if (cpi->refresh_golden_frame)
       cm->frame_context_idx = GLD_FRAME;
-#if CONFIG_EXT_REFS
     else if (cpi->refresh_bwd_ref_frame)
       cm->frame_context_idx = BRF_FRAME;
-#endif  // CONFIG_EXT_REFS
     else
       cm->frame_context_idx = REGULAR_FRAME;
-#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
+    int wanted_fb = cm->fb_of_context_type[cm->frame_context_idx];
+    for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+      int fb = get_ref_frame_map_idx(cpi, ref_frame);
+      if (fb == wanted_fb) {
+        cm->primary_ref_frame = ref_frame - LAST_FRAME;
+      }
+    }
   }
 
   if (cm->frame_type == KEY_FRAME) {
     cpi->refresh_golden_frame = 1;
     cpi->refresh_alt_ref_frame = 1;
     av1_zero(cpi->interp_filter_selected);
-    set_sb_size(cm, select_sb_size(cpi));
-#if CONFIG_REFERENCE_BUFFER
+    set_sb_size(&cm->seq_params, select_sb_size(cpi));
     set_use_reference_buffer(cm, 0);
-#endif  // CONFIG_REFERENCE_BUFFER
+  } else if (frame_is_sframe(cm)) {
+    cpi->refresh_golden_frame = 1;
+    cpi->refresh_alt_ref_frame = 1;
+    av1_zero(cpi->interp_filter_selected);
+    set_sb_size(&cm->seq_params, select_sb_size(cpi));
   } else {
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-    if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
-        cm->frame_refs[0].idx < 0) {
-      *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
+    if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
+        cm->frame_refs[cm->primary_ref_frame].idx < 0) {
+      av1_setup_past_independence(cm);
+      cm->seg.update_map = 1;
+      cm->seg.update_data = 1;
     } else {
-      *cm->fc = cm->frame_contexts[cm->frame_refs[0].idx];
+      *cm->fc = cm->frame_contexts[cm->frame_refs[cm->primary_ref_frame].idx];
     }
-#else
-    *cm->fc = cm->frame_contexts[cm->frame_context_idx];
-#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
     av1_zero(cpi->interp_filter_selected[0]);
   }
-#if CONFIG_EXT_REFS
-#if CONFIG_ONE_SIDED_COMPOUND && \
-    !CONFIG_EXT_COMP_REFS  // No change to bitstream
-  if (cpi->sf.recode_loop == DISALLOW_RECODE) {
-    cpi->refresh_bwd_ref_frame = cpi->refresh_last_frame;
-    cpi->rc.is_bipred_frame = 1;
-  }
-#endif  // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
-      cm->frame_refs[0].idx < 0) {
-    // use default frame context values
-    cm->pre_fc = &cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
-  } else {
-    *cm->fc = cm->frame_contexts[cm->frame_refs[0].idx];
-    cm->pre_fc = &cm->frame_contexts[cm->frame_refs[0].idx];
-  }
-#else
-  cm->pre_fc = &cm->frame_contexts[cm->frame_context_idx];
-#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
 
+  cm->prev_frame = get_prev_frame(cm);
   cpi->vaq_refresh = 0;
 }
 
 static void enc_setup_mi(AV1_COMMON *cm) {
   int i;
-  cm->mi = cm->mip + cm->mi_stride + 1;
-  memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
-  cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+  cm->mi = cm->mip;
+  memset(cm->mip, 0, cm->mi_stride * cm->mi_rows * sizeof(*cm->mip));
+  cm->prev_mi = cm->prev_mip;
   // Clear top border row
   memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
   // Clear left border column
-  for (i = 1; i < cm->mi_rows + 1; ++i)
+  for (i = 0; i < cm->mi_rows; ++i)
     memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
-  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
-  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+  cm->mi_grid_visible = cm->mi_grid_base;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base;
 
   memset(cm->mi_grid_base, 0,
-         cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+         cm->mi_stride * cm->mi_rows * sizeof(*cm->mi_grid_base));
 }
 
 static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
@@ -433,10 +366,11 @@ static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
   if (!cm->prev_mip) return 1;
   cm->mi_alloc_size = mi_size;
 
-  cm->mi_grid_base = (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *));
+  cm->mi_grid_base =
+      (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *));
   if (!cm->mi_grid_base) return 1;
   cm->prev_mi_grid_base =
-      (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *));
+      (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *));
   if (!cm->prev_mi_grid_base) return 1;
 
   return 0;
@@ -456,19 +390,19 @@ static void enc_free_mi(AV1_COMMON *cm) {
 
 static void swap_mi_and_prev_mi(AV1_COMMON *cm) {
   // Current mip will be the prev_mip for the next frame.
-  MODE_INFO **temp_base = cm->prev_mi_grid_base;
-  MODE_INFO *temp = cm->prev_mip;
+  MB_MODE_INFO **temp_base = cm->prev_mi_grid_base;
+  MB_MODE_INFO *temp = cm->prev_mip;
   cm->prev_mip = cm->mip;
   cm->mip = temp;
 
   // Update the upper left visible macroblock ptrs.
-  cm->mi = cm->mip + cm->mi_stride + 1;
-  cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+  cm->mi = cm->mip;
+  cm->prev_mi = cm->prev_mip;
 
   cm->prev_mi_grid_base = cm->mi_grid_base;
   cm->mi_grid_base = temp_base;
-  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
-  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+  cm->mi_grid_visible = cm->mi_grid_base;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base;
 }
 
 void av1_initialize_enc(void) {
@@ -480,11 +414,7 @@ void av1_initialize_enc(void) {
     aom_scale_rtcd();
     av1_init_intra_predictors();
     av1_init_me_luts();
-#if !CONFIG_XIPHRC
     av1_rc_init_minq_luts();
-#endif
-    av1_entropy_mv_init();
-    av1_encode_token_init();
     av1_init_wedge_masks();
     init_done = 1;
   }
@@ -506,25 +436,47 @@ static void alloc_context_buffers_ext(AV1_COMP *cpi) {
                   aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
 }
 
-static void dealloc_compressor_data(AV1_COMP *cpi) {
+static void update_film_grain_parameters(struct AV1_COMP *cpi,
+                                         const AV1EncoderConfig *oxcf) {
   AV1_COMMON *const cm = &cpi->common;
+  cpi->oxcf = *oxcf;
 
-  dealloc_context_buffers_ext(cpi);
+  if (cm->film_grain_table) {
+    aom_film_grain_table_free(cm->film_grain_table);
+    aom_free(cm->film_grain_table);
+  }
+  cm->film_grain_table = 0;
+
+  if (oxcf->film_grain_test_vector) {
+    cm->film_grain_params_present = 1;
+    if (cm->frame_type == KEY_FRAME) {
+      memcpy(&cm->film_grain_params,
+             film_grain_test_vectors + oxcf->film_grain_test_vector - 1,
+             sizeof(cm->film_grain_params));
 
-#if CONFIG_PVQ
-  if (cpi->oxcf.pass != 1) {
-    const int tile_cols = cm->tile_cols;
-    const int tile_rows = cm->tile_rows;
-    int tile_col, tile_row;
-
-    for (tile_row = 0; tile_row < tile_rows; ++tile_row)
-      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-        TileDataEnc *tile_data =
-            &cpi->tile_data[tile_row * tile_cols + tile_col];
-        aom_free(tile_data->pvq_q.buf);
+      cm->film_grain_params.bit_depth = cm->bit_depth;
+      if (cm->color_range == AOM_CR_FULL_RANGE) {
+        cm->film_grain_params.clip_to_restricted_range = 0;
       }
+    }
+  } else if (oxcf->film_grain_table_filename) {
+    cm->film_grain_table = aom_malloc(sizeof(*cm->film_grain_table));
+    memset(cm->film_grain_table, 0, sizeof(aom_film_grain_table_t));
+
+    aom_film_grain_table_read(cm->film_grain_table,
+                              oxcf->film_grain_table_filename, &cm->error);
+  } else {
+    cm->film_grain_params_present = 0;
+    memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
   }
-#endif
+}
+
+static void dealloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  dealloc_context_buffers_ext(cpi);
+
   aom_free(cpi->tile_data);
   cpi->tile_data = NULL;
 
@@ -538,7 +490,6 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
   aom_free(cpi->active_map.map);
   cpi->active_map.map = NULL;
 
-#if CONFIG_MOTION_VAR
   aom_free(cpi->td.mb.above_pred_buf);
   cpi->td.mb.above_pred_buf = NULL;
 
@@ -550,26 +501,17 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
 
   aom_free(cpi->td.mb.mask_buf);
   cpi->td.mb.mask_buf = NULL;
-#endif
+
+  aom_free(cm->tpl_mvs);
+  cm->tpl_mvs = NULL;
 
   av1_free_ref_frame_buffers(cm->buffer_pool);
-#if CONFIG_LV_MAP
   av1_free_txb_buf(cpi);
-#endif
   av1_free_context_buffers(cm);
 
   aom_free_frame_buffer(&cpi->last_frame_uf);
-#if CONFIG_LOOP_RESTORATION
   av1_free_restoration_buffers(cm);
-  aom_free_frame_buffer(&cpi->last_frame_db);
   aom_free_frame_buffer(&cpi->trial_frame_rst);
-  aom_free(cpi->extra_rstbuf);
-  {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; ++i)
-      av1_free_restoration_struct(&cpi->rst_search[i]);
-  }
-#endif  // CONFIG_LOOP_RESTORATION
   aom_free_frame_buffer(&cpi->scaled_source);
   aom_free_frame_buffer(&cpi->scaled_last_source);
   aom_free_frame_buffer(&cpi->alt_ref_buffer);
@@ -578,32 +520,22 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
   aom_free(cpi->tile_tok[0][0]);
   cpi->tile_tok[0][0] = 0;
 
-  av1_free_pc_tree(&cpi->td);
+  av1_free_pc_tree(&cpi->td, num_planes);
 
   aom_free(cpi->td.mb.palette_buffer);
-
-#if CONFIG_ANS
-  aom_buf_ans_free(&cpi->buf_ans);
-#endif  // CONFIG_ANS
 }
 
 static void save_coding_context(AV1_COMP *cpi) {
   CODING_CONTEXT *const cc = &cpi->coding_context;
   AV1_COMMON *cm = &cpi->common;
-  int i;
 
   // Stores a snapshot of key state variables which can subsequently be
   // restored with a call to av1_restore_coding_context. These functions are
   // intended for use in a re-code loop in av1_compress_frame where the
   // quantizer value is adjusted between loop iterations.
-  for (i = 0; i < NMV_CONTEXTS; ++i) {
-    av1_copy(cc->nmv_vec_cost[i], cpi->td.mb.nmv_vec_cost[i]);
-    av1_copy(cc->nmv_costs, cpi->nmv_costs);
-    av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp);
-  }
-
-  av1_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
-  av1_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
+  av1_copy(cc->nmv_vec_cost, cpi->td.mb.nmv_vec_cost);
+  av1_copy(cc->nmv_costs, cpi->nmv_costs);
+  av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp);
 
   cc->fc = *cm->fc;
 }
@@ -611,18 +543,12 @@ static void save_coding_context(AV1_COMP *cpi) {
 static void restore_coding_context(AV1_COMP *cpi) {
   CODING_CONTEXT *const cc = &cpi->coding_context;
   AV1_COMMON *cm = &cpi->common;
-  int i;
 
   // Restore key state variables to the snapshot state stored in the
   // previous call to av1_save_coding_context.
-  for (i = 0; i < NMV_CONTEXTS; ++i) {
-    av1_copy(cpi->td.mb.nmv_vec_cost[i], cc->nmv_vec_cost[i]);
-    av1_copy(cpi->nmv_costs, cc->nmv_costs);
-    av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp);
-  }
-
-  av1_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
-  av1_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
+  av1_copy(cpi->td.mb.nmv_vec_cost, cc->nmv_vec_cost);
+  av1_copy(cpi->nmv_costs, cc->nmv_costs);
+  av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp);
 
   *cm->fc = cc->fc;
 }
@@ -673,7 +599,6 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
       qi_delta =
           av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth);
       av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
-#if CONFIG_LOOPFILTER_LEVEL
       av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
       av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
       av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
@@ -683,15 +608,8 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
       av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
       av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
       av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
-#else
-      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
-#endif  // CONFIG_LOOPFILTER_LEVEL
 
       av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
-
-      // Where relevant assume segment data is delta data
-      seg->abs_delta = SEGMENT_DELTADATA;
     }
   } else if (seg->enabled) {
     // All other frames if segmentation has been enabled
@@ -702,14 +620,12 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
       if (rc->source_alt_ref_active) {
         seg->update_map = 0;
         seg->update_data = 1;
-        seg->abs_delta = SEGMENT_DELTADATA;
 
         qi_delta =
             av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, cm->bit_depth);
         av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
         av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
 
-#if CONFIG_LOOPFILTER_LEVEL
         av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
         av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
         av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
@@ -719,10 +635,6 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
         av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
         av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
         av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
-#else
-        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
-        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
-#endif  // CONFIG_LOOPFILTER_LEVEL
 
         // Segment coding disabled for compred testing
         if (high_q || (cpi->static_mb_pct == 100)) {
@@ -777,16 +689,16 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
 
 static void update_reference_segmentation_map(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible;
-  uint8_t *cache_ptr = cm->last_frame_seg_map;
+  MB_MODE_INFO **mi_4x4_ptr = cm->mi_grid_visible;
+  uint8_t *cache_ptr = cm->current_frame_seg_map;
   int row, col;
 
   for (row = 0; row < cm->mi_rows; row++) {
-    MODE_INFO **mi_8x8 = mi_8x8_ptr;
+    MB_MODE_INFO **mi_4x4 = mi_4x4_ptr;
     uint8_t *cache = cache_ptr;
-    for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
-      cache[0] = mi_8x8[0]->mbmi.segment_id;
-    mi_8x8_ptr += cm->mi_stride;
+    for (col = 0; col < cm->mi_cols; col++, mi_4x4++, cache++)
+      cache[0] = mi_4x4[0]->segment_id;
+    mi_4x4_ptr += cm->mi_stride;
     cache_ptr += cm->mi_cols;
   }
 }
@@ -796,12 +708,9 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
 
   if (!cpi->lookahead)
-    cpi->lookahead = av1_lookahead_init(oxcf->width, oxcf->height,
-                                        cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                                        cm->use_highbitdepth,
-#endif
-                                        oxcf->lag_in_frames);
+    cpi->lookahead = av1_lookahead_init(
+        oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y,
+        cm->use_highbitdepth, oxcf->lag_in_frames);
   if (!cpi->lookahead)
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
@@ -809,11 +718,8 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
   // TODO(agrange) Check if ARF is enabled and skip allocation if not.
   if (aom_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height,
                                cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                               cm->use_highbitdepth,
-#endif
-                               AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
-                               NULL, NULL))
+                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                               cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
@@ -822,84 +728,49 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   if (aom_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                               cm->use_highbitdepth,
-#endif
-                               AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
-                               NULL, NULL))
+                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                               cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
-#if CONFIG_LOOP_RESTORATION
-  if (aom_realloc_frame_buffer(&cpi->last_frame_db, cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                               cm->use_highbitdepth,
-#endif
-                               AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
-                               NULL, NULL))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate last frame deblocked buffer");
   if (aom_realloc_frame_buffer(
-          &cpi->trial_frame_rst,
-#if CONFIG_FRAME_SUPERRES
-          cm->superres_upscaled_width, cm->superres_upscaled_height,
-#else
-          cm->width, cm->height,
-#endif  // CONFIG_FRAME_SUPERRES
-          cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-          cm->use_highbitdepth,
-#endif
-          AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+          &cpi->trial_frame_rst, cm->superres_upscaled_width,
+          cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y,
+          cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+          NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate trial restored frame buffer");
-  int extra_rstbuf_sz = RESTORATION_EXTBUF_SIZE;
-  if (extra_rstbuf_sz > 0) {
-    aom_free(cpi->extra_rstbuf);
-    CHECK_MEM_ERROR(cm, cpi->extra_rstbuf,
-                    (uint8_t *)aom_malloc(extra_rstbuf_sz));
-  } else {
-    cpi->extra_rstbuf = NULL;
-  }
-#endif  // CONFIG_LOOP_RESTORATION
 
   if (aom_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                               cm->use_highbitdepth,
-#endif
-                               AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
-                               NULL, NULL))
+                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                               cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
   if (aom_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                               cm->use_highbitdepth,
-#endif
-                               AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
-                               NULL, NULL))
+                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                               cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate scaled last source buffer");
 }
 
 static void alloc_compressor_data(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
 
   av1_alloc_context_buffers(cm, cm->width, cm->height);
 
-#if CONFIG_LV_MAP
   av1_alloc_txb_buf(cpi);
-#endif
 
   alloc_context_buffers_ext(cpi);
 
   aom_free(cpi->tile_tok[0][0]);
 
   {
-    unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
+    unsigned int tokens =
+        get_token_alloc(cm->mb_rows, cm->mb_cols, MAX_SB_SIZE_LOG2, num_planes);
     CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
                     aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
   }
@@ -909,18 +780,10 @@ static void alloc_compressor_data(AV1_COMP *cpi) {
 
 void av1_new_framerate(AV1_COMP *cpi, double framerate) {
   cpi->framerate = framerate < 0.1 ? 30 : framerate;
-#if CONFIG_XIPHRC
-  if (!cpi->od_rc.cur_frame) return;
-  cpi->od_rc.framerate = cpi->framerate;
-  od_enc_rc_resize(&cpi->od_rc);
-#else
   av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
-#endif
 }
 
-#if CONFIG_MAX_TILE
-
-static void set_tile_info_max_tile(AV1_COMP *cpi) {
+static void set_tile_info(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   int i, start_sb;
 
@@ -932,15 +795,15 @@ static void set_tile_info_max_tile(AV1_COMP *cpi) {
     cm->log2_tile_cols = AOMMAX(cpi->oxcf.tile_columns, cm->min_log2_tile_cols);
     cm->log2_tile_cols = AOMMIN(cm->log2_tile_cols, cm->max_log2_tile_cols);
   } else {
-    int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
-    int sb_cols = mi_cols >> MAX_MIB_SIZE_LOG2;
+    int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
+    int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
     int size_sb, j = 0;
     cm->uniform_tile_spacing_flag = 0;
     for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) {
       cm->tile_col_start_sb[i] = start_sb;
       size_sb = cpi->oxcf.tile_widths[j++];
       if (j >= cpi->oxcf.tile_width_count) j = 0;
-      start_sb += AOMMIN(size_sb, MAX_TILE_WIDTH_SB);
+      start_sb += AOMMIN(size_sb, cm->max_tile_width_sb);
     }
     cm->tile_cols = i;
     cm->tile_col_start_sb[i] = sb_cols;
@@ -952,8 +815,8 @@ static void set_tile_info_max_tile(AV1_COMP *cpi) {
     cm->log2_tile_rows = AOMMAX(cpi->oxcf.tile_rows, cm->min_log2_tile_rows);
     cm->log2_tile_rows = AOMMIN(cm->log2_tile_rows, cm->max_log2_tile_rows);
   } else {
-    int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
-    int sb_rows = mi_rows >> MAX_MIB_SIZE_LOG2;
+    int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+    int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
     int size_sb, j = 0;
     for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) {
       cm->tile_row_start_sb[i] = start_sb;
@@ -967,158 +830,174 @@ static void set_tile_info_max_tile(AV1_COMP *cpi) {
   av1_calculate_tile_rows(cm);
 }
 
-#endif
-
-static void set_tile_info(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-#if CONFIG_DEPENDENT_HORZTILES
-  int tile_row, tile_col, num_tiles_in_tg;
-  int tg_row_start, tg_col_start;
-#endif
-#if CONFIG_EXT_TILE
-  if (cpi->oxcf.large_scale_tile) {
-#if CONFIG_EXT_PARTITION
-    if (cpi->oxcf.superblock_size != AOM_SUPERBLOCK_SIZE_64X64) {
-      cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 32);
-      cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32);
-      cm->tile_width <<= MAX_MIB_SIZE_LOG2;
-      cm->tile_height <<= MAX_MIB_SIZE_LOG2;
-    } else {
-      cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64);
-      cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
-      cm->tile_width <<= MAX_MIB_SIZE_LOG2 - 1;
-      cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1;
-    }
-#else
-    cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64);
-    cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
-    cm->tile_width <<= MAX_MIB_SIZE_LOG2;
-    cm->tile_height <<= MAX_MIB_SIZE_LOG2;
-#endif  // CONFIG_EXT_PARTITION
-
-    cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
-    cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
-
-    assert(cm->tile_width >> MAX_MIB_SIZE <= 32);
-    assert(cm->tile_height >> MAX_MIB_SIZE <= 32);
-
-    // Get the number of tiles
-    cm->tile_cols = 1;
-    while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols;
-
-    cm->tile_rows = 1;
-    while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows;
-  } else {
-#endif  // CONFIG_EXT_TILE
-
-#if CONFIG_MAX_TILE
-    set_tile_info_max_tile(cpi);
-#else
-  int min_log2_tile_cols, max_log2_tile_cols;
-  av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
-
-  cm->log2_tile_cols =
-      clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
-  cm->log2_tile_rows = cpi->oxcf.tile_rows;
-
-  cm->tile_width =
-      get_tile_size(cm->mi_cols, cm->log2_tile_cols, &cm->tile_cols);
-  cm->tile_height =
-      get_tile_size(cm->mi_rows, cm->log2_tile_rows, &cm->tile_rows);
-#endif  // CONFIG_MAX_TILE
-#if CONFIG_EXT_TILE
-  }
-#endif  // CONFIG_EXT_TILE
-
-#if CONFIG_DEPENDENT_HORZTILES
-  cm->dependent_horz_tiles = cpi->oxcf.dependent_horz_tiles;
-#if CONFIG_EXT_TILE
-  if (cm->large_scale_tile) {
-    // May not needed since cpi->oxcf.dependent_horz_tiles is already adjusted.
-    cm->dependent_horz_tiles = 0;
-  } else {
-#endif  // CONFIG_EXT_TILE
-    if (cm->log2_tile_rows == 0) cm->dependent_horz_tiles = 0;
-#if CONFIG_EXT_TILE
-  }
-#endif  // CONFIG_EXT_TILE
-
-#if CONFIG_EXT_TILE
-  if (!cm->large_scale_tile) {
-#endif  // CONFIG_EXT_TILE
-    if (cpi->oxcf.mtu == 0) {
-      cm->num_tg = cpi->oxcf.num_tile_groups;
-    } else {
-      // Use a default value for the purposes of weighting costs in probability
-      // updates
-      cm->num_tg = DEFAULT_MAX_NUM_TG;
-    }
-    num_tiles_in_tg =
-        (cm->tile_cols * cm->tile_rows + cm->num_tg - 1) / cm->num_tg;
-    tg_row_start = 0;
-    tg_col_start = 0;
-    for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
-      for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
-        if ((tile_row * cm->tile_cols + tile_col) % num_tiles_in_tg == 0) {
-          tg_row_start = tile_row;
-          tg_col_start = tile_col;
-        }
-        cm->tile_group_start_row[tile_row][tile_col] = tg_row_start;
-        cm->tile_group_start_col[tile_row][tile_col] = tg_col_start;
-      }
-    }
-#if CONFIG_EXT_TILE
-  }
-#endif  // CONFIG_EXT_TILE
-#endif
-
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-  cm->loop_filter_across_tiles_enabled =
-      cpi->oxcf.loop_filter_across_tiles_enabled;
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
-}
-
 static void update_frame_size(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
   av1_set_mb_mi(cm, cm->width, cm->height);
   av1_init_context_buffers(cm);
-  av1_init_macroblockd(cm, xd,
-#if CONFIG_PVQ
-                       NULL,
-#endif
-#if CONFIG_CFL
-                       &NULL_CFL,
-#endif
-                       NULL);
+  av1_init_macroblockd(cm, xd, NULL);
   memset(cpi->mbmi_ext_base, 0,
          cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
   set_tile_info(cpi);
 }
 
 static void init_buffer_indices(AV1_COMP *cpi) {
-#if CONFIG_EXT_REFS
   int fb_idx;
-  for (fb_idx = 0; fb_idx < LAST_REF_FRAMES; ++fb_idx)
-    cpi->lst_fb_idxes[fb_idx] = fb_idx;
-  cpi->gld_fb_idx = LAST_REF_FRAMES;
-  cpi->bwd_fb_idx = LAST_REF_FRAMES + 1;
-  cpi->alt2_fb_idx = LAST_REF_FRAMES + 2;
-  cpi->alt_fb_idx = LAST_REF_FRAMES + 3;
-  cpi->ext_fb_idx = LAST_REF_FRAMES + 4;
+  for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
+    cpi->ref_fb_idx[fb_idx] = fb_idx;
   for (fb_idx = 0; fb_idx < MAX_EXT_ARFS + 1; ++fb_idx)
     cpi->arf_map[fb_idx] = LAST_REF_FRAMES + 2 + fb_idx;
-#else   // !CONFIG_EXT_REFS
-  cpi->lst_fb_idx = 0;
-  cpi->gld_fb_idx = 1;
-  cpi->alt_fb_idx = 2;
-#endif  // CONFIG_EXT_REFS
-#if CONFIG_AMVR
   cpi->rate_index = 0;
   cpi->rate_size = 0;
   cpi->cur_poc = -1;
-#endif
+}
+
+static INLINE int does_level_match(int width, int height, double fps,
+                                   int lvl_width, int lvl_height,
+                                   double lvl_fps, int lvl_dim_mult) {
+  const int64_t lvl_luma_pels = lvl_width * lvl_height;
+  const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps;
+  const int64_t luma_pels = width * height;
+  const double display_sample_rate = luma_pels * fps;
+  return luma_pels <= lvl_luma_pels &&
+         display_sample_rate <= lvl_display_sample_rate &&
+         width <= lvl_width * lvl_dim_mult &&
+         height <= lvl_height * lvl_dim_mult;
+}
+
+static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm,
+                                     const AV1EncoderConfig *oxcf) {
+  // TODO(any): This is a placeholder function that only addresses dimensions
+  // and max display sample rates.
+  // Need to add checks for max bit rate, max decoded luma sample rate, header
+  // rate, etc. that are not covered by this function.
+  (void)oxcf;
+  BitstreamLevel bl = { 9, 3 };
+  if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 512,
+                       288, 30.0, 4)) {
+    bl.major = 2;
+    bl.minor = 0;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              704, 396, 30.0, 4)) {
+    bl.major = 2;
+    bl.minor = 1;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              1088, 612, 30.0, 4)) {
+    bl.major = 3;
+    bl.minor = 0;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              1376, 774, 30.0, 4)) {
+    bl.major = 3;
+    bl.minor = 1;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              2048, 1152, 30.0, 3)) {
+    bl.major = 4;
+    bl.minor = 0;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              2048, 1152, 60.0, 3)) {
+    bl.major = 4;
+    bl.minor = 1;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              4096, 2176, 30.0, 2)) {
+    bl.major = 5;
+    bl.minor = 0;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              4096, 2176, 60.0, 2)) {
+    bl.major = 5;
+    bl.minor = 1;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              4096, 2176, 120.0, 2)) {
+    bl.major = 5;
+    bl.minor = 2;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              8192, 4352, 30.0, 2)) {
+    bl.major = 6;
+    bl.minor = 0;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              8192, 4352, 60.0, 2)) {
+    bl.major = 6;
+    bl.minor = 1;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              8192, 4352, 120.0, 2)) {
+    bl.major = 6;
+    bl.minor = 2;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              16384, 8704, 30.0, 2)) {
+    bl.major = 7;
+    bl.minor = 0;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              16384, 8704, 60.0, 2)) {
+    bl.major = 7;
+    bl.minor = 1;
+  } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                              16384, 8704, 120.0, 2)) {
+    bl.major = 7;
+    bl.minor = 2;
+  }
+  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+    seq->level[i] = bl;
+    seq->tier[i] = 0;  // setting main tier by default
+    // Set the maximum parameters for bitrate and buffer size for this profile,
+    // level, and tier
+    cm->op_params[i].bitrate = max_level_bitrate(
+        cm->profile, major_minor_to_seq_level_idx(seq->level[i]), seq->tier[i]);
+    // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
+    // check
+    if (cm->op_params[i].bitrate == 0)
+      aom_internal_error(
+          &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          "AV1 does not support this combination of profile, level, and tier.");
+    // Buffer size in bits/s is bitrate in bits/s * 1 s
+    cm->op_params[i].buffer_size = cm->op_params[i].bitrate;
+  }
+}
+
+static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
+                                  const AV1EncoderConfig *oxcf) {
+  seq->still_picture = (oxcf->limit == 1);
+  seq->reduced_still_picture_hdr = seq->still_picture;
+  seq->reduced_still_picture_hdr &= !oxcf->full_still_picture_hdr;
+  seq->force_screen_content_tools = 2;
+  seq->force_integer_mv = 2;
+  seq->enable_order_hint = oxcf->enable_order_hint;
+  seq->frame_id_numbers_present_flag = oxcf->large_scale_tile;
+  if (seq->still_picture && seq->reduced_still_picture_hdr) {
+    seq->enable_order_hint = 0;
+    seq->frame_id_numbers_present_flag = 0;
+    seq->force_screen_content_tools = 2;
+    seq->force_integer_mv = 2;
+  }
+  seq->order_hint_bits_minus_1 =
+      seq->enable_order_hint ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1 : -1;
+
+  seq->enable_dual_filter = oxcf->enable_dual_filter;
+  seq->enable_jnt_comp = oxcf->enable_jnt_comp;
+  seq->enable_jnt_comp &= seq->enable_order_hint;
+  seq->enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs;
+  seq->enable_ref_frame_mvs &= seq->enable_order_hint;
+  seq->enable_superres = oxcf->enable_superres;
+  seq->enable_cdef = oxcf->enable_cdef;
+  seq->enable_restoration = oxcf->enable_restoration;
+  seq->enable_warped_motion = oxcf->enable_warped_motion;
+  seq->enable_interintra_compound = 1;
+  seq->enable_masked_compound = 1;
+  seq->enable_intra_edge_filter = 1;
+  seq->enable_filter_intra = 1;
+
+  set_bitstream_level_tier(seq, cm, oxcf);
+
+  if (seq->operating_points_cnt_minus_1 == 0) {
+    seq->operating_point_idc[0] = 0;
+  } else {
+    // Set operating_point_idc[] such that for the i-th operating point the
+    // first (operating_points_cnt-i) spatial layers and the first temporal
+    // layer are decoded Note that highest quality operating point should come
+    // first
+    for (int i = 0; i < seq->operating_points_cnt_minus_1 + 1; i++)
+      seq->operating_point_idc[i] =
+          (~(~0u << (seq->operating_points_cnt_minus_1 + 1 - i)) << 8) | 1;
+  }
 }
 
 static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
@@ -1129,22 +1008,53 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
 
   cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
-#if CONFIG_HIGHBITDEPTH
   cm->use_highbitdepth = oxcf->use_highbitdepth;
-#endif
-  cm->color_space = oxcf->color_space;
-#if CONFIG_COLORSPACE_HEADERS
-  cm->transfer_function = oxcf->transfer_function;
+  cm->color_primaries = oxcf->color_primaries;
+  cm->transfer_characteristics = oxcf->transfer_characteristics;
+  cm->matrix_coefficients = oxcf->matrix_coefficients;
+  cm->seq_params.monochrome = oxcf->monochrome;
   cm->chroma_sample_position = oxcf->chroma_sample_position;
-#endif
   cm->color_range = oxcf->color_range;
+  cm->timing_info_present = oxcf->timing_info_present;
+  cm->timing_info.num_units_in_display_tick =
+      oxcf->timing_info.num_units_in_display_tick;
+  cm->timing_info.time_scale = oxcf->timing_info.time_scale;
+  cm->timing_info.equal_picture_interval =
+      oxcf->timing_info.equal_picture_interval;
+  cm->timing_info.num_ticks_per_picture =
+      oxcf->timing_info.num_ticks_per_picture;
+
+  cm->seq_params.display_model_info_present_flag =
+      oxcf->display_model_info_present_flag;
+  cm->seq_params.decoder_model_info_present_flag =
+      oxcf->decoder_model_info_present_flag;
+  if (oxcf->decoder_model_info_present_flag) {
+    // set the decoder model parameters in schedule mode
+    cm->buffer_model.num_units_in_decoding_tick =
+        oxcf->buffer_model.num_units_in_decoding_tick;
+    cm->buffer_removal_delay_present = 1;
+    set_aom_dec_model_info(&cm->buffer_model);
+    set_dec_model_op_parameters(&cm->op_params[0]);
+  } else if (cm->timing_info_present &&
+             cm->timing_info.equal_picture_interval &&
+             !cm->seq_params.decoder_model_info_present_flag) {
+    // set the decoder model parameters in resource availability mode
+    set_resource_availability_parameters(&cm->op_params[0]);
+  } else {
+    cm->op_params[0].initial_display_delay =
+        10;  // Default value (not signaled)
+  }
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
+  set_sb_size(&cm->seq_params,
+              select_sb_size(cpi));  // set sb size before allocations
   alloc_compressor_data(cpi);
 
+  update_film_grain_parameters(cpi, oxcf);
+
   // Single thread case: use counts in common.
-  cpi->td.counts = &cm->counts;
+  cpi->td.counts = &cpi->counts;
 
   // change includes all joint functionality
   av1_change_config(cpi, oxcf);
@@ -1173,16 +1083,15 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
       (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
 }
 
-#if CONFIG_HIGHBITDEPTH
-#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
   cpi->fn_ptr[BT].sdf = SDF;                                           \
   cpi->fn_ptr[BT].sdaf = SDAF;                                         \
   cpi->fn_ptr[BT].vf = VF;                                             \
   cpi->fn_ptr[BT].svf = SVF;                                           \
   cpi->fn_ptr[BT].svaf = SVAF;                                         \
-  cpi->fn_ptr[BT].sdx3f = SDX3F;                                       \
-  cpi->fn_ptr[BT].sdx8f = SDX8F;                                       \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;                                     \
+  cpi->fn_ptr[BT].jsdaf = JSDAF;                                       \
+  cpi->fn_ptr[BT].jsvaf = JSVAF;
 
 #define MAKE_BFP_SAD_WRAPPER(fnname)                                           \
   static unsigned int fnname##_bits8(const uint8_t *src_ptr,                   \
@@ -1220,47 +1129,6 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
            4;                                                                  \
   }
 
-#define MAKE_BFP_SAD3_WRAPPER(fnname)                                    \
-  static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,  \
-                             const uint8_t *ref_ptr, int ref_stride,     \
-                             unsigned int *sad_array) {                  \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-  }                                                                      \
-  static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
-                              const uint8_t *ref_ptr, int ref_stride,    \
-                              unsigned int *sad_array) {                 \
-    int i;                                                               \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-    for (i = 0; i < 3; i++) sad_array[i] >>= 2;                          \
-  }                                                                      \
-  static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
-                              const uint8_t *ref_ptr, int ref_stride,    \
-                              unsigned int *sad_array) {                 \
-    int i;                                                               \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-    for (i = 0; i < 3; i++) sad_array[i] >>= 4;                          \
-  }
-
-#define MAKE_BFP_SAD8_WRAPPER(fnname)                                    \
-  static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,  \
-                             const uint8_t *ref_ptr, int ref_stride,     \
-                             unsigned int *sad_array) {                  \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-  }                                                                      \
-  static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
-                              const uint8_t *ref_ptr, int ref_stride,    \
-                              unsigned int *sad_array) {                 \
-    int i;                                                               \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-    for (i = 0; i < 8; i++) sad_array[i] >>= 2;                          \
-  }                                                                      \
-  static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
-                              const uint8_t *ref_ptr, int ref_stride,    \
-                              unsigned int *sad_array) {                 \
-    int i;                                                               \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-    for (i = 0; i < 8; i++) sad_array[i] >>= 4;                          \
-  }
 #define MAKE_BFP_SAD4D_WRAPPER(fnname)                                        \
   static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,       \
                              const uint8_t *const ref_ptr[], int ref_stride,  \
@@ -1282,11 +1150,33 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
     for (i = 0; i < 4; i++) sad_array[i] >>= 4;                               \
   }
 
-#if CONFIG_EXT_PARTITION
+#define MAKE_BFP_JSADAVG_WRAPPER(fnname)                                    \
+  static unsigned int fnname##_bits8(                                       \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, const uint8_t *second_pred,                           \
+      const JNT_COMP_PARAMS *jcp_param) {                                   \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+                  jcp_param);                                               \
+  }                                                                         \
+  static unsigned int fnname##_bits10(                                      \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, const uint8_t *second_pred,                           \
+      const JNT_COMP_PARAMS *jcp_param) {                                   \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+                  jcp_param) >>                                             \
+           2;                                                               \
+  }                                                                         \
+  static unsigned int fnname##_bits12(                                      \
+      const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, const uint8_t *second_pred,                           \
+      const JNT_COMP_PARAMS *jcp_param) {                                   \
+    return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+                  jcp_param) >>                                             \
+           4;                                                               \
+  }
+
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
-MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad128x128x3)
-MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad128x128x8)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
@@ -1294,7 +1184,6 @@ MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
-#endif  // CONFIG_EXT_PARTITION
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
@@ -1309,49 +1198,32 @@ MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
-MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad32x32x3)
-MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad32x32x8)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
-MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad64x64x3)
-MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad64x64x8)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
-MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x16x3)
-MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x16x8)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
-MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x8x3)
-MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x8x8)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
-MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x16x3)
-MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x16x8)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
-MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x8x3)
-MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x8x8)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
-MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x4x8)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
-MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x8x8)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
-MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad4x4x3)
-MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x4x8)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
 
-#if CONFIG_EXT_PARTITION_TYPES
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
@@ -1370,15 +1242,29 @@ MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
-#if CONFIG_EXT_PARTITION
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x128)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x128_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x128x4d)
-MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x32)
-MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x32x4d)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
+
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x16_avg)
 
 #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
   cpi->fn_ptr[BT].msdf = MCSDF;       \
@@ -1409,11 +1295,9 @@ MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x32x4d)
            4;                                                            \
   }
 
-#if CONFIG_EXT_PARTITION
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128)
-#endif  // CONFIG_EXT_PARTITION
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64)
@@ -1427,21 +1311,13 @@ MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4)
-
-#if CONFIG_EXT_PARTITION_TYPES
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
-#if CONFIG_EXT_PARTITION
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x128)
-MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
-#if CONFIG_MOTION_VAR
 #define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
   cpi->fn_ptr[BT].osdf = OSDF;           \
   cpi->fn_ptr[BT].ovf = OVF;             \
@@ -1464,11 +1340,9 @@ MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x32)
     return fnname(ref, ref_stride, wsrc, msk) >> 4;                       \
   }
 
-#if CONFIG_EXT_PARTITION
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
-#endif  // CONFIG_EXT_PARTITION
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
@@ -1482,198 +1356,190 @@ MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
-
-#if CONFIG_EXT_PARTITION_TYPES
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
-#if CONFIG_EXT_PARTITION
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x128)
-MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#endif  // CONFIG_MOTION_VAR
 
 static void highbd_set_var_fns(AV1_COMP *const cpi) {
   AV1_COMMON *const cm = &cpi->common;
   if (cm->use_highbitdepth) {
     switch (cm->bit_depth) {
       case AOM_BITS_8:
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION
-        HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits8,
-                   aom_highbd_sad128x32_avg_bits8, aom_highbd_8_variance128x32,
-                   aom_highbd_8_sub_pixel_variance128x32,
-                   aom_highbd_8_sub_pixel_avg_variance128x32, NULL, NULL,
-                   aom_highbd_sad128x32x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits8,
-                   aom_highbd_sad32x128_avg_bits8, aom_highbd_8_variance32x128,
-                   aom_highbd_8_sub_pixel_variance32x128,
-                   aom_highbd_8_sub_pixel_avg_variance32x128, NULL, NULL,
-                   aom_highbd_sad32x128x4d_bits8)
-#endif  // CONFIG_EXT_PARTITION
-
         HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8,
                    aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16,
                    aom_highbd_8_sub_pixel_variance64x16,
-                   aom_highbd_8_sub_pixel_avg_variance64x16, NULL, NULL,
-                   aom_highbd_sad64x16x4d_bits8)
+                   aom_highbd_8_sub_pixel_avg_variance64x16,
+                   aom_highbd_sad64x16x4d_bits8,
+                   aom_highbd_jnt_sad64x16_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance64x16)
 
         HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8,
                    aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64,
                    aom_highbd_8_sub_pixel_variance16x64,
-                   aom_highbd_8_sub_pixel_avg_variance16x64, NULL, NULL,
-                   aom_highbd_sad16x64x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits8,
-                   aom_highbd_sad32x8_avg_bits8, aom_highbd_8_variance32x8,
-                   aom_highbd_8_sub_pixel_variance32x8,
-                   aom_highbd_8_sub_pixel_avg_variance32x8, NULL, NULL,
-                   aom_highbd_sad32x8x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits8,
-                   aom_highbd_sad8x32_avg_bits8, aom_highbd_8_variance8x32,
-                   aom_highbd_8_sub_pixel_variance8x32,
-                   aom_highbd_8_sub_pixel_avg_variance8x32, NULL, NULL,
-                   aom_highbd_sad8x32x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits8,
-                   aom_highbd_sad16x4_avg_bits8, aom_highbd_8_variance16x4,
-                   aom_highbd_8_sub_pixel_variance16x4,
-                   aom_highbd_8_sub_pixel_avg_variance16x4, NULL, NULL,
-                   aom_highbd_sad16x4x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits8,
-                   aom_highbd_sad4x16_avg_bits8, aom_highbd_8_variance4x16,
-                   aom_highbd_8_sub_pixel_variance4x16,
-                   aom_highbd_8_sub_pixel_avg_variance4x16, NULL, NULL,
-                   aom_highbd_sad4x16x4d_bits8)
-#endif
+                   aom_highbd_8_sub_pixel_avg_variance16x64,
+                   aom_highbd_sad16x64x4d_bits8,
+                   aom_highbd_jnt_sad16x64_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance16x64)
+
+        HIGHBD_BFP(
+            BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8,
+            aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8,
+            aom_highbd_8_sub_pixel_avg_variance32x8,
+            aom_highbd_sad32x8x4d_bits8, aom_highbd_jnt_sad32x8_avg_bits8,
+            aom_highbd_8_jnt_sub_pixel_avg_variance32x8)
+
+        HIGHBD_BFP(
+            BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8,
+            aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32,
+            aom_highbd_8_sub_pixel_avg_variance8x32,
+            aom_highbd_sad8x32x4d_bits8, aom_highbd_jnt_sad8x32_avg_bits8,
+            aom_highbd_8_jnt_sub_pixel_avg_variance8x32)
+
+        HIGHBD_BFP(
+            BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8,
+            aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4,
+            aom_highbd_8_sub_pixel_avg_variance16x4,
+            aom_highbd_sad16x4x4d_bits8, aom_highbd_jnt_sad16x4_avg_bits8,
+            aom_highbd_8_jnt_sub_pixel_avg_variance16x4)
+
+        HIGHBD_BFP(
+            BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8,
+            aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16,
+            aom_highbd_8_sub_pixel_avg_variance4x16,
+            aom_highbd_sad4x16x4d_bits8, aom_highbd_jnt_sad4x16_avg_bits8,
+            aom_highbd_8_jnt_sub_pixel_avg_variance4x16)
 
         HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8,
                    aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16,
                    aom_highbd_8_sub_pixel_variance32x16,
-                   aom_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL,
-                   aom_highbd_sad32x16x4d_bits8)
+                   aom_highbd_8_sub_pixel_avg_variance32x16,
+                   aom_highbd_sad32x16x4d_bits8,
+                   aom_highbd_jnt_sad32x16_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance32x16)
 
         HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8,
                    aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32,
                    aom_highbd_8_sub_pixel_variance16x32,
-                   aom_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL,
-                   aom_highbd_sad16x32x4d_bits8)
+                   aom_highbd_8_sub_pixel_avg_variance16x32,
+                   aom_highbd_sad16x32x4d_bits8,
+                   aom_highbd_jnt_sad16x32_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance16x32)
 
         HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8,
                    aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32,
                    aom_highbd_8_sub_pixel_variance64x32,
-                   aom_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL,
-                   aom_highbd_sad64x32x4d_bits8)
+                   aom_highbd_8_sub_pixel_avg_variance64x32,
+                   aom_highbd_sad64x32x4d_bits8,
+                   aom_highbd_jnt_sad64x32_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance64x32)
 
         HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8,
                    aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64,
                    aom_highbd_8_sub_pixel_variance32x64,
-                   aom_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL,
-                   aom_highbd_sad32x64x4d_bits8)
+                   aom_highbd_8_sub_pixel_avg_variance32x64,
+                   aom_highbd_sad32x64x4d_bits8,
+                   aom_highbd_jnt_sad32x64_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance32x64)
 
         HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8,
                    aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32,
                    aom_highbd_8_sub_pixel_variance32x32,
                    aom_highbd_8_sub_pixel_avg_variance32x32,
-                   aom_highbd_sad32x32x3_bits8, aom_highbd_sad32x32x8_bits8,
-                   aom_highbd_sad32x32x4d_bits8)
+                   aom_highbd_sad32x32x4d_bits8,
+                   aom_highbd_jnt_sad32x32_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance32x32)
 
         HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8,
                    aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64,
                    aom_highbd_8_sub_pixel_variance64x64,
                    aom_highbd_8_sub_pixel_avg_variance64x64,
-                   aom_highbd_sad64x64x3_bits8, aom_highbd_sad64x64x8_bits8,
-                   aom_highbd_sad64x64x4d_bits8)
+                   aom_highbd_sad64x64x4d_bits8,
+                   aom_highbd_jnt_sad64x64_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance64x64)
 
         HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8,
                    aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16,
                    aom_highbd_8_sub_pixel_variance16x16,
                    aom_highbd_8_sub_pixel_avg_variance16x16,
-                   aom_highbd_sad16x16x3_bits8, aom_highbd_sad16x16x8_bits8,
-                   aom_highbd_sad16x16x4d_bits8)
+                   aom_highbd_sad16x16x4d_bits8,
+                   aom_highbd_jnt_sad16x16_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance16x16)
 
         HIGHBD_BFP(
             BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8,
             aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8,
-            aom_highbd_8_sub_pixel_avg_variance16x8, aom_highbd_sad16x8x3_bits8,
-            aom_highbd_sad16x8x8_bits8, aom_highbd_sad16x8x4d_bits8)
+            aom_highbd_8_sub_pixel_avg_variance16x8,
+            aom_highbd_sad16x8x4d_bits8, aom_highbd_jnt_sad16x8_avg_bits8,
+            aom_highbd_8_jnt_sub_pixel_avg_variance16x8)
 
         HIGHBD_BFP(
             BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8,
             aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16,
-            aom_highbd_8_sub_pixel_avg_variance8x16, aom_highbd_sad8x16x3_bits8,
-            aom_highbd_sad8x16x8_bits8, aom_highbd_sad8x16x4d_bits8)
+            aom_highbd_8_sub_pixel_avg_variance8x16,
+            aom_highbd_sad8x16x4d_bits8, aom_highbd_jnt_sad8x16_avg_bits8,
+            aom_highbd_8_jnt_sub_pixel_avg_variance8x16)
 
-        HIGHBD_BFP(
-            BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8,
-            aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8,
-            aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits8,
-            aom_highbd_sad8x8x8_bits8, aom_highbd_sad8x8x4d_bits8)
+        HIGHBD_BFP(BLOCK_8X8, aom_highbd_sad8x8_bits8,
+                   aom_highbd_sad8x8_avg_bits8, aom_highbd_8_variance8x8,
+                   aom_highbd_8_sub_pixel_variance8x8,
+                   aom_highbd_8_sub_pixel_avg_variance8x8,
+                   aom_highbd_sad8x8x4d_bits8, aom_highbd_jnt_sad8x8_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance8x8)
 
         HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits8,
                    aom_highbd_sad8x4_avg_bits8, aom_highbd_8_variance8x4,
                    aom_highbd_8_sub_pixel_variance8x4,
-                   aom_highbd_8_sub_pixel_avg_variance8x4, NULL,
-                   aom_highbd_sad8x4x8_bits8, aom_highbd_sad8x4x4d_bits8)
+                   aom_highbd_8_sub_pixel_avg_variance8x4,
+                   aom_highbd_sad8x4x4d_bits8, aom_highbd_jnt_sad8x4_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance8x4)
 
         HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits8,
                    aom_highbd_sad4x8_avg_bits8, aom_highbd_8_variance4x8,
                    aom_highbd_8_sub_pixel_variance4x8,
-                   aom_highbd_8_sub_pixel_avg_variance4x8, NULL,
-                   aom_highbd_sad4x8x8_bits8, aom_highbd_sad4x8x4d_bits8)
+                   aom_highbd_8_sub_pixel_avg_variance4x8,
+                   aom_highbd_sad4x8x4d_bits8, aom_highbd_jnt_sad4x8_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance4x8)
 
-        HIGHBD_BFP(
-            BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8,
-            aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4,
-            aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits8,
-            aom_highbd_sad4x4x8_bits8, aom_highbd_sad4x4x4d_bits8)
-
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-        HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_8_variance2x2, NULL, NULL,
-                   NULL, NULL, NULL)
-        HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_8_variance4x2, NULL, NULL,
-                   NULL, NULL, NULL)
-        HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_8_variance2x4, NULL, NULL,
-                   NULL, NULL, NULL)
-#endif
+        HIGHBD_BFP(BLOCK_4X4, aom_highbd_sad4x4_bits8,
+                   aom_highbd_sad4x4_avg_bits8, aom_highbd_8_variance4x4,
+                   aom_highbd_8_sub_pixel_variance4x4,
+                   aom_highbd_8_sub_pixel_avg_variance4x4,
+                   aom_highbd_sad4x4x4d_bits8, aom_highbd_jnt_sad4x4_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance4x4)
 
-#if CONFIG_EXT_PARTITION
-        HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8,
-                   aom_highbd_sad128x128_avg_bits8,
-                   aom_highbd_8_variance128x128,
-                   aom_highbd_8_sub_pixel_variance128x128,
-                   aom_highbd_8_sub_pixel_avg_variance128x128,
-                   aom_highbd_sad128x128x3_bits8, aom_highbd_sad128x128x8_bits8,
-                   aom_highbd_sad128x128x4d_bits8)
+        HIGHBD_BFP(
+            BLOCK_128X128, aom_highbd_sad128x128_bits8,
+            aom_highbd_sad128x128_avg_bits8, aom_highbd_8_variance128x128,
+            aom_highbd_8_sub_pixel_variance128x128,
+            aom_highbd_8_sub_pixel_avg_variance128x128,
+            aom_highbd_sad128x128x4d_bits8, aom_highbd_jnt_sad128x128_avg_bits8,
+            aom_highbd_8_jnt_sub_pixel_avg_variance128x128)
 
         HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8,
                    aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64,
                    aom_highbd_8_sub_pixel_variance128x64,
-                   aom_highbd_8_sub_pixel_avg_variance128x64, NULL, NULL,
-                   aom_highbd_sad128x64x4d_bits8)
+                   aom_highbd_8_sub_pixel_avg_variance128x64,
+                   aom_highbd_sad128x64x4d_bits8,
+                   aom_highbd_jnt_sad128x64_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance128x64)
 
         HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8,
                    aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128,
                    aom_highbd_8_sub_pixel_variance64x128,
-                   aom_highbd_8_sub_pixel_avg_variance64x128, NULL, NULL,
-                   aom_highbd_sad64x128x4d_bits8)
-#endif  // CONFIG_EXT_PARTITION
+                   aom_highbd_8_sub_pixel_avg_variance64x128,
+                   aom_highbd_sad64x128x4d_bits8,
+                   aom_highbd_jnt_sad64x128_avg_bits8,
+                   aom_highbd_8_jnt_sub_pixel_avg_variance64x128)
 
-#if CONFIG_EXT_PARTITION
         HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
                     aom_highbd_8_masked_sub_pixel_variance128x128)
         HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8,
                     aom_highbd_8_masked_sub_pixel_variance128x64)
         HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8,
                     aom_highbd_8_masked_sub_pixel_variance64x128)
-#endif  // CONFIG_EXT_PARTITION
         HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8,
                     aom_highbd_8_masked_sub_pixel_variance64x64)
         HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8,
@@ -1700,35 +1566,18 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                     aom_highbd_8_masked_sub_pixel_variance8x4)
         HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8,
                     aom_highbd_8_masked_sub_pixel_variance4x4)
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION
-        HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance128x32)
-
-        HIGHBD_MBFP(BLOCK_32X128, aom_highbd_masked_sad32x128_bits8,
-                    aom_highbd_8_masked_sub_pixel_variance32x128)
-#endif  // CONFIG_EXT_PARTITION
-
         HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits8,
                     aom_highbd_8_masked_sub_pixel_variance64x16)
-
         HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits8,
                     aom_highbd_8_masked_sub_pixel_variance16x64)
-
         HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits8,
                     aom_highbd_8_masked_sub_pixel_variance32x8)
-
         HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits8,
                     aom_highbd_8_masked_sub_pixel_variance8x32)
-
         HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits8,
                     aom_highbd_8_masked_sub_pixel_variance16x4)
-
         HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits8,
                     aom_highbd_8_masked_sub_pixel_variance4x16)
-#endif
-#if CONFIG_MOTION_VAR
-#if CONFIG_EXT_PARTITION
         HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8,
                     aom_highbd_obmc_variance128x128,
                     aom_highbd_obmc_sub_pixel_variance128x128)
@@ -1738,7 +1587,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8,
                     aom_highbd_obmc_variance64x128,
                     aom_highbd_obmc_sub_pixel_variance64x128)
-#endif  // CONFIG_EXT_PARTITION
         HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8,
                     aom_highbd_obmc_variance64x64,
                     aom_highbd_obmc_sub_pixel_variance64x64)
@@ -1778,223 +1626,206 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8,
                     aom_highbd_obmc_variance4x4,
                     aom_highbd_obmc_sub_pixel_variance4x4)
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION
-        HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits8,
-                    aom_highbd_obmc_variance128x32,
-                    aom_highbd_obmc_sub_pixel_variance128x32)
-
-        HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits8,
-                    aom_highbd_obmc_variance32x128,
-                    aom_highbd_obmc_sub_pixel_variance32x128)
-#endif  // CONFIG_EXT_PARTITION
-
         HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits8,
                     aom_highbd_obmc_variance64x16,
                     aom_highbd_obmc_sub_pixel_variance64x16)
-
         HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits8,
                     aom_highbd_obmc_variance16x64,
                     aom_highbd_obmc_sub_pixel_variance16x64)
-
         HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits8,
                     aom_highbd_obmc_variance32x8,
                     aom_highbd_obmc_sub_pixel_variance32x8)
-
         HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits8,
                     aom_highbd_obmc_variance8x32,
                     aom_highbd_obmc_sub_pixel_variance8x32)
-
         HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits8,
                     aom_highbd_obmc_variance16x4,
                     aom_highbd_obmc_sub_pixel_variance16x4)
-
         HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits8,
                     aom_highbd_obmc_variance4x16,
                     aom_highbd_obmc_sub_pixel_variance4x16)
-#endif
-#endif  // CONFIG_MOTION_VAR
         break;
 
       case AOM_BITS_10:
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION
-        HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits10,
-                   aom_highbd_sad128x32_avg_bits10,
-                   aom_highbd_10_variance128x32,
-                   aom_highbd_10_sub_pixel_variance128x32,
-                   aom_highbd_10_sub_pixel_avg_variance128x32, NULL, NULL,
-                   aom_highbd_sad128x32x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits10,
-                   aom_highbd_sad32x128_avg_bits10,
-                   aom_highbd_10_variance32x128,
-                   aom_highbd_10_sub_pixel_variance32x128,
-                   aom_highbd_10_sub_pixel_avg_variance32x128, NULL, NULL,
-                   aom_highbd_sad32x128x4d_bits10)
-#endif  // CONFIG_EXT_PARTITION
-
         HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits10,
                    aom_highbd_sad64x16_avg_bits10, aom_highbd_10_variance64x16,
                    aom_highbd_10_sub_pixel_variance64x16,
-                   aom_highbd_10_sub_pixel_avg_variance64x16, NULL, NULL,
-                   aom_highbd_sad64x16x4d_bits10)
+                   aom_highbd_10_sub_pixel_avg_variance64x16,
+                   aom_highbd_sad64x16x4d_bits10,
+                   aom_highbd_jnt_sad64x16_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance64x16);
 
         HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10,
                    aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64,
                    aom_highbd_10_sub_pixel_variance16x64,
-                   aom_highbd_10_sub_pixel_avg_variance16x64, NULL, NULL,
-                   aom_highbd_sad16x64x4d_bits10)
+                   aom_highbd_10_sub_pixel_avg_variance16x64,
+                   aom_highbd_sad16x64x4d_bits10,
+                   aom_highbd_jnt_sad16x64_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance16x64);
 
         HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10,
                    aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8,
                    aom_highbd_10_sub_pixel_variance32x8,
-                   aom_highbd_10_sub_pixel_avg_variance32x8, NULL, NULL,
-                   aom_highbd_sad32x8x4d_bits10)
+                   aom_highbd_10_sub_pixel_avg_variance32x8,
+                   aom_highbd_sad32x8x4d_bits10,
+                   aom_highbd_jnt_sad32x8_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance32x8);
 
         HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10,
                    aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32,
                    aom_highbd_10_sub_pixel_variance8x32,
-                   aom_highbd_10_sub_pixel_avg_variance8x32, NULL, NULL,
-                   aom_highbd_sad8x32x4d_bits10)
+                   aom_highbd_10_sub_pixel_avg_variance8x32,
+                   aom_highbd_sad8x32x4d_bits10,
+                   aom_highbd_jnt_sad8x32_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance8x32);
 
         HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10,
                    aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4,
                    aom_highbd_10_sub_pixel_variance16x4,
-                   aom_highbd_10_sub_pixel_avg_variance16x4, NULL, NULL,
-                   aom_highbd_sad16x4x4d_bits10)
+                   aom_highbd_10_sub_pixel_avg_variance16x4,
+                   aom_highbd_sad16x4x4d_bits10,
+                   aom_highbd_jnt_sad16x4_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance16x4);
 
         HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10,
                    aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16,
                    aom_highbd_10_sub_pixel_variance4x16,
-                   aom_highbd_10_sub_pixel_avg_variance4x16, NULL, NULL,
-                   aom_highbd_sad4x16x4d_bits10)
-#endif
+                   aom_highbd_10_sub_pixel_avg_variance4x16,
+                   aom_highbd_sad4x16x4d_bits10,
+                   aom_highbd_jnt_sad4x16_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance4x16);
 
         HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10,
                    aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16,
                    aom_highbd_10_sub_pixel_variance32x16,
-                   aom_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL,
-                   aom_highbd_sad32x16x4d_bits10)
+                   aom_highbd_10_sub_pixel_avg_variance32x16,
+                   aom_highbd_sad32x16x4d_bits10,
+                   aom_highbd_jnt_sad32x16_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance32x16);
 
         HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10,
                    aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32,
                    aom_highbd_10_sub_pixel_variance16x32,
-                   aom_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL,
-                   aom_highbd_sad16x32x4d_bits10)
+                   aom_highbd_10_sub_pixel_avg_variance16x32,
+                   aom_highbd_sad16x32x4d_bits10,
+                   aom_highbd_jnt_sad16x32_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance16x32);
 
         HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10,
                    aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32,
                    aom_highbd_10_sub_pixel_variance64x32,
-                   aom_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL,
-                   aom_highbd_sad64x32x4d_bits10)
+                   aom_highbd_10_sub_pixel_avg_variance64x32,
+                   aom_highbd_sad64x32x4d_bits10,
+                   aom_highbd_jnt_sad64x32_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance64x32);
 
         HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10,
                    aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64,
                    aom_highbd_10_sub_pixel_variance32x64,
-                   aom_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL,
-                   aom_highbd_sad32x64x4d_bits10)
+                   aom_highbd_10_sub_pixel_avg_variance32x64,
+                   aom_highbd_sad32x64x4d_bits10,
+                   aom_highbd_jnt_sad32x64_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance32x64);
 
         HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10,
                    aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32,
                    aom_highbd_10_sub_pixel_variance32x32,
                    aom_highbd_10_sub_pixel_avg_variance32x32,
-                   aom_highbd_sad32x32x3_bits10, aom_highbd_sad32x32x8_bits10,
-                   aom_highbd_sad32x32x4d_bits10)
+                   aom_highbd_sad32x32x4d_bits10,
+                   aom_highbd_jnt_sad32x32_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance32x32);
 
         HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10,
                    aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64,
                    aom_highbd_10_sub_pixel_variance64x64,
                    aom_highbd_10_sub_pixel_avg_variance64x64,
-                   aom_highbd_sad64x64x3_bits10, aom_highbd_sad64x64x8_bits10,
-                   aom_highbd_sad64x64x4d_bits10)
+                   aom_highbd_sad64x64x4d_bits10,
+                   aom_highbd_jnt_sad64x64_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance64x64);
 
         HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10,
                    aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16,
                    aom_highbd_10_sub_pixel_variance16x16,
                    aom_highbd_10_sub_pixel_avg_variance16x16,
-                   aom_highbd_sad16x16x3_bits10, aom_highbd_sad16x16x8_bits10,
-                   aom_highbd_sad16x16x4d_bits10)
+                   aom_highbd_sad16x16x4d_bits10,
+                   aom_highbd_jnt_sad16x16_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance16x16);
 
         HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10,
                    aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8,
                    aom_highbd_10_sub_pixel_variance16x8,
                    aom_highbd_10_sub_pixel_avg_variance16x8,
-                   aom_highbd_sad16x8x3_bits10, aom_highbd_sad16x8x8_bits10,
-                   aom_highbd_sad16x8x4d_bits10)
+                   aom_highbd_sad16x8x4d_bits10,
+                   aom_highbd_jnt_sad16x8_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance16x8);
 
         HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10,
                    aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16,
                    aom_highbd_10_sub_pixel_variance8x16,
                    aom_highbd_10_sub_pixel_avg_variance8x16,
-                   aom_highbd_sad8x16x3_bits10, aom_highbd_sad8x16x8_bits10,
-                   aom_highbd_sad8x16x4d_bits10)
+                   aom_highbd_sad8x16x4d_bits10,
+                   aom_highbd_jnt_sad8x16_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance8x16);
 
         HIGHBD_BFP(
             BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10,
             aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8,
-            aom_highbd_10_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits10,
-            aom_highbd_sad8x8x8_bits10, aom_highbd_sad8x8x4d_bits10)
+            aom_highbd_10_sub_pixel_avg_variance8x8,
+            aom_highbd_sad8x8x4d_bits10, aom_highbd_jnt_sad8x8_avg_bits10,
+            aom_highbd_10_jnt_sub_pixel_avg_variance8x8);
 
-        HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits10,
-                   aom_highbd_sad8x4_avg_bits10, aom_highbd_10_variance8x4,
-                   aom_highbd_10_sub_pixel_variance8x4,
-                   aom_highbd_10_sub_pixel_avg_variance8x4, NULL,
-                   aom_highbd_sad8x4x8_bits10, aom_highbd_sad8x4x4d_bits10)
+        HIGHBD_BFP(
+            BLOCK_8X4, aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10,
+            aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4,
+            aom_highbd_10_sub_pixel_avg_variance8x4,
+            aom_highbd_sad8x4x4d_bits10, aom_highbd_jnt_sad8x4_avg_bits10,
+            aom_highbd_10_jnt_sub_pixel_avg_variance8x4);
 
-        HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits10,
-                   aom_highbd_sad4x8_avg_bits10, aom_highbd_10_variance4x8,
-                   aom_highbd_10_sub_pixel_variance4x8,
-                   aom_highbd_10_sub_pixel_avg_variance4x8, NULL,
-                   aom_highbd_sad4x8x8_bits10, aom_highbd_sad4x8x4d_bits10)
+        HIGHBD_BFP(
+            BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10,
+            aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8,
+            aom_highbd_10_sub_pixel_avg_variance4x8,
+            aom_highbd_sad4x8x4d_bits10, aom_highbd_jnt_sad4x8_avg_bits10,
+            aom_highbd_10_jnt_sub_pixel_avg_variance4x8);
 
         HIGHBD_BFP(
             BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10,
             aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4,
-            aom_highbd_10_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits10,
-            aom_highbd_sad4x4x8_bits10, aom_highbd_sad4x4x4d_bits10)
-
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-        HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_10_variance2x2, NULL, NULL,
-                   NULL, NULL, NULL)
-        HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_10_variance4x2, NULL, NULL,
-                   NULL, NULL, NULL)
-        HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_10_variance2x4, NULL, NULL,
-                   NULL, NULL, NULL)
-#endif
+            aom_highbd_10_sub_pixel_avg_variance4x4,
+            aom_highbd_sad4x4x4d_bits10, aom_highbd_jnt_sad4x4_avg_bits10,
+            aom_highbd_10_jnt_sub_pixel_avg_variance4x4);
+
+        HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10,
+                   aom_highbd_sad128x128_avg_bits10,
+                   aom_highbd_10_variance128x128,
+                   aom_highbd_10_sub_pixel_variance128x128,
+                   aom_highbd_10_sub_pixel_avg_variance128x128,
+                   aom_highbd_sad128x128x4d_bits10,
+                   aom_highbd_jnt_sad128x128_avg_bits10,
+                   aom_highbd_10_jnt_sub_pixel_avg_variance128x128);
 
-#if CONFIG_EXT_PARTITION
         HIGHBD_BFP(
-            BLOCK_128X128, aom_highbd_sad128x128_bits10,
-            aom_highbd_sad128x128_avg_bits10, aom_highbd_10_variance128x128,
-            aom_highbd_10_sub_pixel_variance128x128,
-            aom_highbd_10_sub_pixel_avg_variance128x128,
-            aom_highbd_sad128x128x3_bits10, aom_highbd_sad128x128x8_bits10,
-            aom_highbd_sad128x128x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10,
-                   aom_highbd_sad128x64_avg_bits10,
-                   aom_highbd_10_variance128x64,
-                   aom_highbd_10_sub_pixel_variance128x64,
-                   aom_highbd_10_sub_pixel_avg_variance128x64, NULL, NULL,
-                   aom_highbd_sad128x64x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10,
-                   aom_highbd_sad64x128_avg_bits10,
-                   aom_highbd_10_variance64x128,
-                   aom_highbd_10_sub_pixel_variance64x128,
-                   aom_highbd_10_sub_pixel_avg_variance64x128, NULL, NULL,
-                   aom_highbd_sad64x128x4d_bits10)
-#endif  // CONFIG_EXT_PARTITION
-
-#if CONFIG_EXT_PARTITION
+            BLOCK_128X64, aom_highbd_sad128x64_bits10,
+            aom_highbd_sad128x64_avg_bits10, aom_highbd_10_variance128x64,
+            aom_highbd_10_sub_pixel_variance128x64,
+            aom_highbd_10_sub_pixel_avg_variance128x64,
+            aom_highbd_sad128x64x4d_bits10, aom_highbd_jnt_sad128x64_avg_bits10,
+            aom_highbd_10_jnt_sub_pixel_avg_variance128x64);
+
+        HIGHBD_BFP(
+            BLOCK_64X128, aom_highbd_sad64x128_bits10,
+            aom_highbd_sad64x128_avg_bits10, aom_highbd_10_variance64x128,
+            aom_highbd_10_sub_pixel_variance64x128,
+            aom_highbd_10_sub_pixel_avg_variance64x128,
+            aom_highbd_sad64x128x4d_bits10, aom_highbd_jnt_sad64x128_avg_bits10,
+            aom_highbd_10_jnt_sub_pixel_avg_variance64x128);
+
         HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
                     aom_highbd_10_masked_sub_pixel_variance128x128)
         HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
                     aom_highbd_10_masked_sub_pixel_variance128x64)
         HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
                     aom_highbd_10_masked_sub_pixel_variance64x128)
-#endif  // CONFIG_EXT_PARTITION
         HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
                     aom_highbd_10_masked_sub_pixel_variance64x64)
         HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
@@ -2021,35 +1852,18 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                     aom_highbd_10_masked_sub_pixel_variance8x4)
         HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
                     aom_highbd_10_masked_sub_pixel_variance4x4)
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION
-        HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance128x32)
-
-        HIGHBD_MBFP(BLOCK_32X128, aom_highbd_masked_sad32x128_bits10,
-                    aom_highbd_10_masked_sub_pixel_variance32x128)
-#endif  // CONFIG_EXT_PARTITION
-
         HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits10,
                     aom_highbd_10_masked_sub_pixel_variance64x16)
-
         HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits10,
                     aom_highbd_10_masked_sub_pixel_variance16x64)
-
         HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits10,
                     aom_highbd_10_masked_sub_pixel_variance32x8)
-
         HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits10,
                     aom_highbd_10_masked_sub_pixel_variance8x32)
-
         HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits10,
                     aom_highbd_10_masked_sub_pixel_variance16x4)
-
         HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits10,
                     aom_highbd_10_masked_sub_pixel_variance4x16)
-#endif
-#if CONFIG_MOTION_VAR
-#if CONFIG_EXT_PARTITION
         HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10,
                     aom_highbd_10_obmc_variance128x128,
                     aom_highbd_10_obmc_sub_pixel_variance128x128)
@@ -2059,7 +1873,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10,
                     aom_highbd_10_obmc_variance64x128,
                     aom_highbd_10_obmc_sub_pixel_variance64x128)
-#endif  // CONFIG_EXT_PARTITION
         HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10,
                     aom_highbd_10_obmc_variance64x64,
                     aom_highbd_10_obmc_sub_pixel_variance64x64)
@@ -2099,16 +1912,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10,
                     aom_highbd_10_obmc_variance4x4,
                     aom_highbd_10_obmc_sub_pixel_variance4x4)
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION
-        HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits10,
-                    aom_highbd_10_obmc_variance128x32,
-                    aom_highbd_10_obmc_sub_pixel_variance128x32)
-
-        HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits10,
-                    aom_highbd_10_obmc_variance32x128,
-                    aom_highbd_10_obmc_sub_pixel_variance32x128)
-#endif  // CONFIG_EXT_PARTITION
 
         HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits10,
                     aom_highbd_10_obmc_variance64x16,
@@ -2133,189 +1936,188 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits10,
                     aom_highbd_10_obmc_variance4x16,
                     aom_highbd_10_obmc_sub_pixel_variance4x16)
-#endif
-#endif  // CONFIG_MOTION_VAR
         break;
 
       case AOM_BITS_12:
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION
-        HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits12,
-                   aom_highbd_sad128x32_avg_bits12,
-                   aom_highbd_12_variance128x32,
-                   aom_highbd_12_sub_pixel_variance128x32,
-                   aom_highbd_12_sub_pixel_avg_variance128x32, NULL, NULL,
-                   aom_highbd_sad128x32x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits12,
-                   aom_highbd_sad32x128_avg_bits12,
-                   aom_highbd_12_variance32x128,
-                   aom_highbd_12_sub_pixel_variance32x128,
-                   aom_highbd_12_sub_pixel_avg_variance32x128, NULL, NULL,
-                   aom_highbd_sad32x128x4d_bits12)
-#endif  // CONFIG_EXT_PARTITION
-
         HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits12,
                    aom_highbd_sad64x16_avg_bits12, aom_highbd_12_variance64x16,
                    aom_highbd_12_sub_pixel_variance64x16,
-                   aom_highbd_12_sub_pixel_avg_variance64x16, NULL, NULL,
-                   aom_highbd_sad64x16x4d_bits12)
+                   aom_highbd_12_sub_pixel_avg_variance64x16,
+                   aom_highbd_sad64x16x4d_bits12,
+                   aom_highbd_jnt_sad64x16_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance64x16);
 
         HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12,
                    aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64,
                    aom_highbd_12_sub_pixel_variance16x64,
-                   aom_highbd_12_sub_pixel_avg_variance16x64, NULL, NULL,
-                   aom_highbd_sad16x64x4d_bits12)
+                   aom_highbd_12_sub_pixel_avg_variance16x64,
+                   aom_highbd_sad16x64x4d_bits12,
+                   aom_highbd_jnt_sad16x64_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance16x64);
 
         HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12,
                    aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8,
                    aom_highbd_12_sub_pixel_variance32x8,
-                   aom_highbd_12_sub_pixel_avg_variance32x8, NULL, NULL,
-                   aom_highbd_sad32x8x4d_bits12)
+                   aom_highbd_12_sub_pixel_avg_variance32x8,
+                   aom_highbd_sad32x8x4d_bits12,
+                   aom_highbd_jnt_sad32x8_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance32x8);
 
         HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12,
                    aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32,
                    aom_highbd_12_sub_pixel_variance8x32,
-                   aom_highbd_12_sub_pixel_avg_variance8x32, NULL, NULL,
-                   aom_highbd_sad8x32x4d_bits12)
+                   aom_highbd_12_sub_pixel_avg_variance8x32,
+                   aom_highbd_sad8x32x4d_bits12,
+                   aom_highbd_jnt_sad8x32_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance8x32);
 
         HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12,
                    aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4,
                    aom_highbd_12_sub_pixel_variance16x4,
-                   aom_highbd_12_sub_pixel_avg_variance16x4, NULL, NULL,
-                   aom_highbd_sad16x4x4d_bits12)
+                   aom_highbd_12_sub_pixel_avg_variance16x4,
+                   aom_highbd_sad16x4x4d_bits12,
+                   aom_highbd_jnt_sad16x4_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance16x4);
 
         HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12,
                    aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16,
                    aom_highbd_12_sub_pixel_variance4x16,
-                   aom_highbd_12_sub_pixel_avg_variance4x16, NULL, NULL,
-                   aom_highbd_sad4x16x4d_bits12)
-#endif
+                   aom_highbd_12_sub_pixel_avg_variance4x16,
+                   aom_highbd_sad4x16x4d_bits12,
+                   aom_highbd_jnt_sad4x16_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance4x16);
 
         HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12,
                    aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16,
                    aom_highbd_12_sub_pixel_variance32x16,
-                   aom_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL,
-                   aom_highbd_sad32x16x4d_bits12)
+                   aom_highbd_12_sub_pixel_avg_variance32x16,
+                   aom_highbd_sad32x16x4d_bits12,
+                   aom_highbd_jnt_sad32x16_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance32x16);
 
         HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12,
                    aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32,
                    aom_highbd_12_sub_pixel_variance16x32,
-                   aom_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL,
-                   aom_highbd_sad16x32x4d_bits12)
+                   aom_highbd_12_sub_pixel_avg_variance16x32,
+                   aom_highbd_sad16x32x4d_bits12,
+                   aom_highbd_jnt_sad16x32_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance16x32);
 
         HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12,
                    aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32,
                    aom_highbd_12_sub_pixel_variance64x32,
-                   aom_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL,
-                   aom_highbd_sad64x32x4d_bits12)
+                   aom_highbd_12_sub_pixel_avg_variance64x32,
+                   aom_highbd_sad64x32x4d_bits12,
+                   aom_highbd_jnt_sad64x32_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance64x32);
 
         HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12,
                    aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64,
                    aom_highbd_12_sub_pixel_variance32x64,
-                   aom_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL,
-                   aom_highbd_sad32x64x4d_bits12)
+                   aom_highbd_12_sub_pixel_avg_variance32x64,
+                   aom_highbd_sad32x64x4d_bits12,
+                   aom_highbd_jnt_sad32x64_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance32x64);
 
         HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12,
                    aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32,
                    aom_highbd_12_sub_pixel_variance32x32,
                    aom_highbd_12_sub_pixel_avg_variance32x32,
-                   aom_highbd_sad32x32x3_bits12, aom_highbd_sad32x32x8_bits12,
-                   aom_highbd_sad32x32x4d_bits12)
+                   aom_highbd_sad32x32x4d_bits12,
+                   aom_highbd_jnt_sad32x32_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance32x32);
 
         HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12,
                    aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64,
                    aom_highbd_12_sub_pixel_variance64x64,
                    aom_highbd_12_sub_pixel_avg_variance64x64,
-                   aom_highbd_sad64x64x3_bits12, aom_highbd_sad64x64x8_bits12,
-                   aom_highbd_sad64x64x4d_bits12)
+                   aom_highbd_sad64x64x4d_bits12,
+                   aom_highbd_jnt_sad64x64_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance64x64);
 
         HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12,
                    aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16,
                    aom_highbd_12_sub_pixel_variance16x16,
                    aom_highbd_12_sub_pixel_avg_variance16x16,
-                   aom_highbd_sad16x16x3_bits12, aom_highbd_sad16x16x8_bits12,
-                   aom_highbd_sad16x16x4d_bits12)
+                   aom_highbd_sad16x16x4d_bits12,
+                   aom_highbd_jnt_sad16x16_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance16x16);
 
         HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12,
                    aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8,
                    aom_highbd_12_sub_pixel_variance16x8,
                    aom_highbd_12_sub_pixel_avg_variance16x8,
-                   aom_highbd_sad16x8x3_bits12, aom_highbd_sad16x8x8_bits12,
-                   aom_highbd_sad16x8x4d_bits12)
+                   aom_highbd_sad16x8x4d_bits12,
+                   aom_highbd_jnt_sad16x8_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance16x8);
 
         HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12,
                    aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16,
                    aom_highbd_12_sub_pixel_variance8x16,
                    aom_highbd_12_sub_pixel_avg_variance8x16,
-                   aom_highbd_sad8x16x3_bits12, aom_highbd_sad8x16x8_bits12,
-                   aom_highbd_sad8x16x4d_bits12)
+                   aom_highbd_sad8x16x4d_bits12,
+                   aom_highbd_jnt_sad8x16_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance8x16);
 
         HIGHBD_BFP(
             BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12,
             aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8,
-            aom_highbd_12_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits12,
-            aom_highbd_sad8x8x8_bits12, aom_highbd_sad8x8x4d_bits12)
+            aom_highbd_12_sub_pixel_avg_variance8x8,
+            aom_highbd_sad8x8x4d_bits12, aom_highbd_jnt_sad8x8_avg_bits12,
+            aom_highbd_12_jnt_sub_pixel_avg_variance8x8);
 
-        HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits12,
-                   aom_highbd_sad8x4_avg_bits12, aom_highbd_12_variance8x4,
-                   aom_highbd_12_sub_pixel_variance8x4,
-                   aom_highbd_12_sub_pixel_avg_variance8x4, NULL,
-                   aom_highbd_sad8x4x8_bits12, aom_highbd_sad8x4x4d_bits12)
+        HIGHBD_BFP(
+            BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12,
+            aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4,
+            aom_highbd_12_sub_pixel_avg_variance8x4,
+            aom_highbd_sad8x4x4d_bits12, aom_highbd_jnt_sad8x4_avg_bits12,
+            aom_highbd_12_jnt_sub_pixel_avg_variance8x4);
 
-        HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits12,
-                   aom_highbd_sad4x8_avg_bits12, aom_highbd_12_variance4x8,
-                   aom_highbd_12_sub_pixel_variance4x8,
-                   aom_highbd_12_sub_pixel_avg_variance4x8, NULL,
-                   aom_highbd_sad4x8x8_bits12, aom_highbd_sad4x8x4d_bits12)
+        HIGHBD_BFP(
+            BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12,
+            aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8,
+            aom_highbd_12_sub_pixel_avg_variance4x8,
+            aom_highbd_sad4x8x4d_bits12, aom_highbd_jnt_sad4x8_avg_bits12,
+            aom_highbd_12_jnt_sub_pixel_avg_variance4x8);
 
         HIGHBD_BFP(
             BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12,
             aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4,
-            aom_highbd_12_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits12,
-            aom_highbd_sad4x4x8_bits12, aom_highbd_sad4x4x4d_bits12)
-
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-        HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_12_variance2x2, NULL, NULL,
-                   NULL, NULL, NULL)
-        HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_12_variance4x2, NULL, NULL,
-                   NULL, NULL, NULL)
-        HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_12_variance2x4, NULL, NULL,
-                   NULL, NULL, NULL)
-#endif
+            aom_highbd_12_sub_pixel_avg_variance4x4,
+            aom_highbd_sad4x4x4d_bits12, aom_highbd_jnt_sad4x4_avg_bits12,
+            aom_highbd_12_jnt_sub_pixel_avg_variance4x4);
+
+        HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12,
+                   aom_highbd_sad128x128_avg_bits12,
+                   aom_highbd_12_variance128x128,
+                   aom_highbd_12_sub_pixel_variance128x128,
+                   aom_highbd_12_sub_pixel_avg_variance128x128,
+                   aom_highbd_sad128x128x4d_bits12,
+                   aom_highbd_jnt_sad128x128_avg_bits12,
+                   aom_highbd_12_jnt_sub_pixel_avg_variance128x128);
+
+        HIGHBD_BFP(
+            BLOCK_128X64, aom_highbd_sad128x64_bits12,
+            aom_highbd_sad128x64_avg_bits12, aom_highbd_12_variance128x64,
+            aom_highbd_12_sub_pixel_variance128x64,
+            aom_highbd_12_sub_pixel_avg_variance128x64,
+            aom_highbd_sad128x64x4d_bits12, aom_highbd_jnt_sad128x64_avg_bits12,
+            aom_highbd_12_jnt_sub_pixel_avg_variance128x64);
 
-#if CONFIG_EXT_PARTITION
         HIGHBD_BFP(
-            BLOCK_128X128, aom_highbd_sad128x128_bits12,
-            aom_highbd_sad128x128_avg_bits12, aom_highbd_12_variance128x128,
-            aom_highbd_12_sub_pixel_variance128x128,
-            aom_highbd_12_sub_pixel_avg_variance128x128,
-            aom_highbd_sad128x128x3_bits12, aom_highbd_sad128x128x8_bits12,
-            aom_highbd_sad128x128x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12,
-                   aom_highbd_sad128x64_avg_bits12,
-                   aom_highbd_12_variance128x64,
-                   aom_highbd_12_sub_pixel_variance128x64,
-                   aom_highbd_12_sub_pixel_avg_variance128x64, NULL, NULL,
-                   aom_highbd_sad128x64x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12,
-                   aom_highbd_sad64x128_avg_bits12,
-                   aom_highbd_12_variance64x128,
-                   aom_highbd_12_sub_pixel_variance64x128,
-                   aom_highbd_12_sub_pixel_avg_variance64x128, NULL, NULL,
-                   aom_highbd_sad64x128x4d_bits12)
-#endif  // CONFIG_EXT_PARTITION
-
-#if CONFIG_EXT_PARTITION
+            BLOCK_64X128, aom_highbd_sad64x128_bits12,
+            aom_highbd_sad64x128_avg_bits12, aom_highbd_12_variance64x128,
+            aom_highbd_12_sub_pixel_variance64x128,
+            aom_highbd_12_sub_pixel_avg_variance64x128,
+            aom_highbd_sad64x128x4d_bits12, aom_highbd_jnt_sad64x128_avg_bits12,
+            aom_highbd_12_jnt_sub_pixel_avg_variance64x128);
+
         HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
                     aom_highbd_12_masked_sub_pixel_variance128x128)
         HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
                     aom_highbd_12_masked_sub_pixel_variance128x64)
         HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
                     aom_highbd_12_masked_sub_pixel_variance64x128)
-#endif  // CONFIG_EXT_PARTITION
         HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
                     aom_highbd_12_masked_sub_pixel_variance64x64)
         HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
@@ -2342,36 +2144,18 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                     aom_highbd_12_masked_sub_pixel_variance8x4)
         HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
                     aom_highbd_12_masked_sub_pixel_variance4x4)
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION
-        HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance128x32)
-
-        HIGHBD_MBFP(BLOCK_32X128, aom_highbd_masked_sad32x128_bits12,
-                    aom_highbd_12_masked_sub_pixel_variance32x128)
-#endif  // CONFIG_EXT_PARTITION
-
         HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits12,
                     aom_highbd_12_masked_sub_pixel_variance64x16)
-
         HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits12,
                     aom_highbd_12_masked_sub_pixel_variance16x64)
-
         HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits12,
                     aom_highbd_12_masked_sub_pixel_variance32x8)
-
         HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits12,
                     aom_highbd_12_masked_sub_pixel_variance8x32)
-
         HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits12,
                     aom_highbd_12_masked_sub_pixel_variance16x4)
-
         HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits12,
                     aom_highbd_12_masked_sub_pixel_variance4x16)
-#endif
-
-#if CONFIG_MOTION_VAR
-#if CONFIG_EXT_PARTITION
         HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12,
                     aom_highbd_12_obmc_variance128x128,
                     aom_highbd_12_obmc_sub_pixel_variance128x128)
@@ -2381,7 +2165,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12,
                     aom_highbd_12_obmc_variance64x128,
                     aom_highbd_12_obmc_sub_pixel_variance64x128)
-#endif  // CONFIG_EXT_PARTITION
         HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12,
                     aom_highbd_12_obmc_variance64x64,
                     aom_highbd_12_obmc_sub_pixel_variance64x64)
@@ -2421,42 +2204,24 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12,
                     aom_highbd_12_obmc_variance4x4,
                     aom_highbd_12_obmc_sub_pixel_variance4x4)
-#if CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_EXT_PARTITION
-        HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits12,
-                    aom_highbd_12_obmc_variance128x32,
-                    aom_highbd_12_obmc_sub_pixel_variance128x32)
-
-        HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits12,
-                    aom_highbd_12_obmc_variance32x128,
-                    aom_highbd_12_obmc_sub_pixel_variance32x128)
-#endif  // CONFIG_EXT_PARTITION
-
         HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits12,
                     aom_highbd_12_obmc_variance64x16,
                     aom_highbd_12_obmc_sub_pixel_variance64x16)
-
         HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits12,
                     aom_highbd_12_obmc_variance16x64,
                     aom_highbd_12_obmc_sub_pixel_variance16x64)
-
         HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits12,
                     aom_highbd_12_obmc_variance32x8,
                     aom_highbd_12_obmc_sub_pixel_variance32x8)
-
         HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits12,
                     aom_highbd_12_obmc_variance8x32,
                     aom_highbd_12_obmc_sub_pixel_variance8x32)
-
         HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits12,
                     aom_highbd_12_obmc_variance16x4,
                     aom_highbd_12_obmc_sub_pixel_variance16x4)
-
         HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits12,
                     aom_highbd_12_obmc_variance4x16,
                     aom_highbd_12_obmc_sub_pixel_variance4x16)
-#endif
-#endif  // CONFIG_MOTION_VAR
         break;
 
       default:
@@ -2466,7 +2231,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 static void realloc_segmentation_maps(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
@@ -2487,40 +2251,59 @@ static void realloc_segmentation_maps(AV1_COMP *cpi) {
                   aom_calloc(cm->mi_rows * cm->mi_cols, 1));
 }
 
-void set_compound_tools(AV1_COMMON *cm) {
-  (void)cm;
-#if CONFIG_INTERINTRA
-  cm->allow_interintra_compound = 1;
-#endif  // CONFIG_INTERINTRA
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-  cm->allow_masked_compound = 1;
-#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-}
-
 void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   RATE_CONTROL *const rc = &cpi->rc;
   MACROBLOCK *const x = &cpi->td.mb;
 
   if (cm->profile != oxcf->profile) cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
-  cm->color_space = oxcf->color_space;
-#if CONFIG_COLORSPACE_HEADERS
-  cm->transfer_function = oxcf->transfer_function;
+  cm->color_primaries = oxcf->color_primaries;
+  cm->transfer_characteristics = oxcf->transfer_characteristics;
+  cm->matrix_coefficients = oxcf->matrix_coefficients;
+  cm->seq_params.monochrome = oxcf->monochrome;
   cm->chroma_sample_position = oxcf->chroma_sample_position;
-#endif
   cm->color_range = oxcf->color_range;
 
-  if (cm->profile <= PROFILE_1)
-    assert(cm->bit_depth == AOM_BITS_8);
-  else
-    assert(cm->bit_depth > AOM_BITS_8);
+  assert(IMPLIES(cm->profile <= PROFILE_1, cm->bit_depth <= AOM_BITS_10));
+
+  cm->timing_info_present = oxcf->timing_info_present;
+  cm->timing_info.num_units_in_display_tick =
+      oxcf->timing_info.num_units_in_display_tick;
+  cm->timing_info.time_scale = oxcf->timing_info.time_scale;
+  cm->timing_info.equal_picture_interval =
+      oxcf->timing_info.equal_picture_interval;
+  cm->timing_info.num_ticks_per_picture =
+      oxcf->timing_info.num_ticks_per_picture;
+
+  cm->seq_params.display_model_info_present_flag =
+      oxcf->display_model_info_present_flag;
+  cm->seq_params.decoder_model_info_present_flag =
+      oxcf->decoder_model_info_present_flag;
+  if (oxcf->decoder_model_info_present_flag) {
+    // set the decoder model parameters in schedule mode
+    cm->buffer_model.num_units_in_decoding_tick =
+        oxcf->buffer_model.num_units_in_decoding_tick;
+    cm->buffer_removal_delay_present = 1;
+    set_aom_dec_model_info(&cm->buffer_model);
+    set_dec_model_op_parameters(&cm->op_params[0]);
+  } else if (cm->timing_info_present &&
+             cm->timing_info.equal_picture_interval &&
+             !cm->seq_params.decoder_model_info_present_flag) {
+    // set the decoder model parameters in resource availability mode
+    set_resource_availability_parameters(&cm->op_params[0]);
+  } else {
+    cm->op_params[0].initial_display_delay =
+        10;  // Default value (not signaled)
+  }
+
+  update_film_grain_parameters(cpi, oxcf);
 
   cpi->oxcf = *oxcf;
+  cpi->common.options = oxcf->cfg;
   x->e_mbd.bd = (int)cm->bit_depth;
-#if CONFIG_GLOBAL_MOTION
   x->e_mbd.global_motion = cm->global_motion;
-#endif  // CONFIG_GLOBAL_MOTION
 
   if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
     rc->baseline_gf_interval = FIXED_GF_INTERVAL;
@@ -2530,30 +2313,21 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
 
   cpi->refresh_last_frame = 1;
   cpi->refresh_golden_frame = 0;
-#if CONFIG_EXT_REFS
   cpi->refresh_bwd_ref_frame = 0;
   cpi->refresh_alt2_ref_frame = 0;
-#endif  // CONFIG_EXT_REFS
-
-  cm->refresh_frame_context =
-      (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode)
-          ? REFRESH_FRAME_CONTEXT_FORWARD
-          : REFRESH_FRAME_CONTEXT_BACKWARD;
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
-#endif
+
+  cm->refresh_frame_context = (oxcf->frame_parallel_decoding_mode)
+                                  ? REFRESH_FRAME_CONTEXT_DISABLED
+                                  : REFRESH_FRAME_CONTEXT_BACKWARD;
+  if (oxcf->large_scale_tile)
+    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
   if (x->palette_buffer == NULL) {
     CHECK_MEM_ERROR(cm, x->palette_buffer,
                     aom_memalign(16, sizeof(*x->palette_buffer)));
   }
-  set_compound_tools(cm);
   av1_reset_segment_features(cm);
-#if CONFIG_AMVR
-  set_high_precision_mv(cpi, 0, 0);
-#else
-  set_high_precision_mv(cpi, 0);
-#endif
+  set_high_precision_mv(cpi, 1, 0);
 
   set_rc_buffer_sizes(rc, &cpi->oxcf);
 
@@ -2569,7 +2343,12 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   rc->worst_quality = cpi->oxcf.worst_allowed_q;
   rc->best_quality = cpi->oxcf.best_allowed_q;
 
-  cm->interp_filter = cpi->sf.default_interp_filter;
+  if (!oxcf->large_scale_tile)
+    cm->interp_filter = cpi->sf.default_interp_filter;
+  else
+    cm->interp_filter = EIGHTTAP_REGULAR;
+
+  cm->switchable_motion_mode = 1;
 
   if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
     cm->render_width = cpi->oxcf.render_width;
@@ -2581,10 +2360,17 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   cm->width = cpi->oxcf.width;
   cm->height = cpi->oxcf.height;
 
-  if (cpi->initial_width) {
-    if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) {
+  int sb_size = cm->seq_params.sb_size;
+  // Superblock size should not be updated after the first key frame.
+  if (!cpi->seq_params_locked) {
+    set_sb_size(&cm->seq_params, select_sb_size(cpi));
+  }
+
+  if (cpi->initial_width || sb_size != cm->seq_params.sb_size) {
+    if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
+        cm->seq_params.sb_size != sb_size) {
       av1_free_context_buffers(cm);
-      av1_free_pc_tree(&cpi->td);
+      av1_free_pc_tree(&cpi->td, num_planes);
       alloc_compressor_data(cpi);
       realloc_segmentation_maps(cpi);
       cpi->initial_width = cpi->initial_height = 0;
@@ -2595,32 +2381,24 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   cpi->alt_ref_source = NULL;
   rc->is_src_frame_alt_ref = 0;
 
-#if CONFIG_EXT_REFS
   rc->is_bwd_ref_frame = 0;
   rc->is_last_bipred_frame = 0;
   rc->is_bipred_frame = 0;
-#endif  // CONFIG_EXT_REFS
-
-#if 0
-  // Experimental RD Code
-  cpi->frame_distortion = 0;
-  cpi->last_frame_distortion = 0;
-#endif
 
   set_tile_info(cpi);
 
   cpi->ext_refresh_frame_flags_pending = 0;
   cpi->ext_refresh_frame_context_pending = 0;
 
-#if CONFIG_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
-#endif
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-  cpi->common.ans_window_size_log2 = cpi->oxcf.ans_window_size_log2;
-#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-#if CONFIG_AMVR
-  cm->seq_mv_precision_level = 2;
-#endif
+
+  // Init sequence level coding tools
+  // This should not be called after the first key frame.
+  if (!cpi->seq_params_locked) {
+    cm->seq_params.operating_points_cnt_minus_1 =
+        cm->number_spatial_layers > 1 ? cm->number_spatial_layers - 1 : 0;
+    init_seq_coding_tools(&cm->seq_params, cm, oxcf);
+  }
 }
 
 AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
@@ -2644,10 +2422,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
   cm->free_mi = enc_free_mi;
   cm->setup_mi = enc_setup_mi;
 
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  get_default_ncobmc_kernels(cm);
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-
   CHECK_MEM_ERROR(cm, cm->fc,
                   (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
   CHECK_MEM_ERROR(cm, cm->frame_contexts,
@@ -2663,38 +2437,18 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
   cpi->common.buffer_pool = pool;
 
   init_config(cpi, oxcf);
-#if CONFIG_XIPHRC
-  cpi->od_rc.framerate = cpi->framerate;
-  cpi->od_rc.frame_width = cm->render_width;
-  cpi->od_rc.frame_height = cm->render_height;
-  cpi->od_rc.keyframe_rate = oxcf->key_freq;
-  cpi->od_rc.goldenframe_rate = FIXED_GF_INTERVAL;
-  cpi->od_rc.altref_rate = 25;
-  cpi->od_rc.firstpass_quant = 1;
-  cpi->od_rc.bit_depth = cm->bit_depth;
-  cpi->od_rc.minq = oxcf->best_allowed_q;
-  cpi->od_rc.maxq = oxcf->worst_allowed_q;
-  if (cpi->oxcf.rc_mode == AOM_CQ) cpi->od_rc.minq = cpi->od_rc.quality;
-  cpi->od_rc.quality = cpi->oxcf.rc_mode == AOM_Q ? oxcf->cq_level : -1;
-  cpi->od_rc.periodic_boosts = oxcf->frame_periodic_boost;
-  od_enc_rc_init(&cpi->od_rc,
-                 cpi->oxcf.rc_mode == AOM_Q ? -1 : oxcf->target_bandwidth,
-                 oxcf->maximum_buffer_size_ms);
-#else
   av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
-#endif
 
   cm->current_video_frame = 0;
+  cpi->seq_params_locked = 0;
   cpi->partition_search_skippable_frame = 0;
   cpi->tile_data = NULL;
   cpi->last_show_frame_buf_idx = INVALID_IDX;
 
   realloc_segmentation_maps(cpi);
 
-  for (i = 0; i < NMV_CONTEXTS; ++i) {
-    memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
-    memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
-  }
+  memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
+  memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
 
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
        i++) {
@@ -2715,7 +2469,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
 #endif
 
   cpi->refresh_alt_ref_frame = 0;
-  cpi->multi_arf_last_grp_enabled = 0;
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
@@ -2753,17 +2506,14 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
 #endif
 #if CONFIG_ENTROPY_STATS
   av1_zero(aggregate_fc);
-  av1_zero_array(aggregate_fc_per_type, FRAME_CONTEXTS);
 #endif  // CONFIG_ENTROPY_STATS
 
   cpi->first_time_stamp_ever = INT64_MAX;
 
-  for (i = 0; i < NMV_CONTEXTS; ++i) {
-    cpi->td.mb.nmvcost[i][0] = &cpi->nmv_costs[i][0][MV_MAX];
-    cpi->td.mb.nmvcost[i][1] = &cpi->nmv_costs[i][1][MV_MAX];
-    cpi->td.mb.nmvcost_hp[i][0] = &cpi->nmv_costs_hp[i][0][MV_MAX];
-    cpi->td.mb.nmvcost_hp[i][1] = &cpi->nmv_costs_hp[i][1][MV_MAX];
-  }
+  cpi->td.mb.nmvcost[0] = &cpi->nmv_costs[0][MV_MAX];
+  cpi->td.mb.nmvcost[1] = &cpi->nmv_costs[1][MV_MAX];
+  cpi->td.mb.nmvcost_hp[0] = &cpi->nmv_costs_hp[0][MV_MAX];
+  cpi->td.mb.nmvcost_hp[1] = &cpi->nmv_costs_hp[1][MV_MAX];
 
 #ifdef OUTPUT_YUV_SKINMAP
   yuv_skinmap_file = fopen("skinmap.yuv", "ab");
@@ -2772,17 +2522,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
   yuv_rec_file = fopen("rec.yuv", "wb");
 #endif
 
-#if 0
-  framepsnr = fopen("framepsnr.stt", "a");
-  kf_list = fopen("kf_list.stt", "w");
-#endif
-
-#if CONFIG_XIPHRC
-  if (oxcf->pass == 2) {
-    cpi->od_rc.twopass_allframes_buf = oxcf->two_pass_stats_in.buf;
-    cpi->od_rc.twopass_allframes_buf_size = oxcf->two_pass_stats_in.sz;
-  }
-#else
   if (oxcf->pass == 1) {
     av1_init_first_pass(cpi);
   } else if (oxcf->pass == 2) {
@@ -2808,24 +2547,15 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
 
     av1_init_second_pass(cpi);
   }
-#endif
 
-#if CONFIG_MOTION_VAR
-#if CONFIG_HIGHBITDEPTH
-  int buf_scaler = 2;
-#else
-  int buf_scaler = 1;
-#endif
   CHECK_MEM_ERROR(
       cm, cpi->td.mb.above_pred_buf,
-      (uint8_t *)aom_memalign(16,
-                              buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                  sizeof(*cpi->td.mb.above_pred_buf)));
+      (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
+                                      sizeof(*cpi->td.mb.above_pred_buf)));
   CHECK_MEM_ERROR(
       cm, cpi->td.mb.left_pred_buf,
-      (uint8_t *)aom_memalign(16,
-                              buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                  sizeof(*cpi->td.mb.left_pred_buf)));
+      (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
+                                      sizeof(*cpi->td.mb.left_pred_buf)));
 
   CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
                   (int32_t *)aom_memalign(
@@ -2835,143 +2565,130 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
                   (int32_t *)aom_memalign(
                       16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
 
-#endif
-
   av1_set_speed_features_framesize_independent(cpi);
   av1_set_speed_features_framesize_dependent(cpi);
 
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
   cpi->fn_ptr[BT].sdf = SDF;                                    \
   cpi->fn_ptr[BT].sdaf = SDAF;                                  \
   cpi->fn_ptr[BT].vf = VF;                                      \
   cpi->fn_ptr[BT].svf = SVF;                                    \
   cpi->fn_ptr[BT].svaf = SVAF;                                  \
-  cpi->fn_ptr[BT].sdx3f = SDX3F;                                \
-  cpi->fn_ptr[BT].sdx8f = SDX8F;                                \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;                              \
+  cpi->fn_ptr[BT].jsdaf = JSDAF;                                \
+  cpi->fn_ptr[BT].jsvaf = JSVAF;
 
-#if CONFIG_EXT_PARTITION_TYPES
   BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
-      aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16, NULL, NULL,
-      aom_sad4x16x4d)
+      aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
+      aom_sad4x16x4d, aom_jnt_sad4x16_avg, aom_jnt_sub_pixel_avg_variance4x16)
 
   BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
-      aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4, NULL, NULL,
-      aom_sad16x4x4d)
+      aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
+      aom_sad16x4x4d, aom_jnt_sad16x4_avg, aom_jnt_sub_pixel_avg_variance16x4)
 
   BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
-      aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32, NULL, NULL,
-      aom_sad8x32x4d)
+      aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
+      aom_sad8x32x4d, aom_jnt_sad8x32_avg, aom_jnt_sub_pixel_avg_variance8x32)
 
   BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
-      aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8, NULL, NULL,
-      aom_sad32x8x4d)
+      aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
+      aom_sad32x8x4d, aom_jnt_sad32x8_avg, aom_jnt_sub_pixel_avg_variance32x8)
 
   BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
-      aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64, NULL, NULL,
-      aom_sad16x64x4d)
+      aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
+      aom_sad16x64x4d, aom_jnt_sad16x64_avg,
+      aom_jnt_sub_pixel_avg_variance16x64)
 
   BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
-      aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16, NULL, NULL,
-      aom_sad64x16x4d)
+      aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
+      aom_sad64x16x4d, aom_jnt_sad64x16_avg,
+      aom_jnt_sub_pixel_avg_variance64x16)
 
-#if CONFIG_EXT_PARTITION
-  BFP(BLOCK_32X128, aom_sad32x128, aom_sad32x128_avg, aom_variance32x128,
-      aom_sub_pixel_variance32x128, aom_sub_pixel_avg_variance32x128, NULL,
-      NULL, aom_sad32x128x4d)
-
-  BFP(BLOCK_128X32, aom_sad128x32, aom_sad128x32_avg, aom_variance128x32,
-      aom_sub_pixel_variance128x32, aom_sub_pixel_avg_variance128x32, NULL,
-      NULL, aom_sad128x32x4d)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-
-#if CONFIG_EXT_PARTITION
   BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
       aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
-      aom_sad128x128x3, aom_sad128x128x8, aom_sad128x128x4d)
+      aom_sad128x128x4d, aom_jnt_sad128x128_avg,
+      aom_jnt_sub_pixel_avg_variance128x128)
 
   BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
-      aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, NULL,
-      NULL, aom_sad128x64x4d)
+      aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
+      aom_sad128x64x4d, aom_jnt_sad128x64_avg,
+      aom_jnt_sub_pixel_avg_variance128x64)
 
   BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
-      aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, NULL,
-      NULL, aom_sad64x128x4d)
-#endif  // CONFIG_EXT_PARTITION
+      aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
+      aom_sad64x128x4d, aom_jnt_sad64x128_avg,
+      aom_jnt_sub_pixel_avg_variance64x128)
 
   BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
-      aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, NULL, NULL,
-      aom_sad32x16x4d)
+      aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
+      aom_sad32x16x4d, aom_jnt_sad32x16_avg,
+      aom_jnt_sub_pixel_avg_variance32x16)
 
   BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
-      aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, NULL, NULL,
-      aom_sad16x32x4d)
+      aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
+      aom_sad16x32x4d, aom_jnt_sad16x32_avg,
+      aom_jnt_sub_pixel_avg_variance16x32)
 
   BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
-      aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, NULL, NULL,
-      aom_sad64x32x4d)
+      aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
+      aom_sad64x32x4d, aom_jnt_sad64x32_avg,
+      aom_jnt_sub_pixel_avg_variance64x32)
 
   BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
-      aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, NULL, NULL,
-      aom_sad32x64x4d)
+      aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
+      aom_sad32x64x4d, aom_jnt_sad32x64_avg,
+      aom_jnt_sub_pixel_avg_variance32x64)
 
   BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
       aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
-      aom_sad32x32x3, aom_sad32x32x8, aom_sad32x32x4d)
+      aom_sad32x32x4d, aom_jnt_sad32x32_avg,
+      aom_jnt_sub_pixel_avg_variance32x32)
 
   BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
       aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
-      aom_sad64x64x3, aom_sad64x64x8, aom_sad64x64x4d)
+      aom_sad64x64x4d, aom_jnt_sad64x64_avg,
+      aom_jnt_sub_pixel_avg_variance64x64)
 
   BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
       aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
-      aom_sad16x16x3, aom_sad16x16x8, aom_sad16x16x4d)
+      aom_sad16x16x4d, aom_jnt_sad16x16_avg,
+      aom_jnt_sub_pixel_avg_variance16x16)
 
   BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
-      aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8, aom_sad16x8x3,
-      aom_sad16x8x8, aom_sad16x8x4d)
+      aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
+      aom_sad16x8x4d, aom_jnt_sad16x8_avg, aom_jnt_sub_pixel_avg_variance16x8)
 
   BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
-      aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, aom_sad8x16x3,
-      aom_sad8x16x8, aom_sad8x16x4d)
+      aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
+      aom_sad8x16x4d, aom_jnt_sad8x16_avg, aom_jnt_sub_pixel_avg_variance8x16)
 
   BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
-      aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x3,
-      aom_sad8x8x8, aom_sad8x8x4d)
+      aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
+      aom_jnt_sad8x8_avg, aom_jnt_sub_pixel_avg_variance8x8)
 
   BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
-      aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, NULL,
-      aom_sad8x4x8, aom_sad8x4x4d)
+      aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
+      aom_jnt_sad8x4_avg, aom_jnt_sub_pixel_avg_variance8x4)
 
   BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
-      aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, NULL,
-      aom_sad4x8x8, aom_sad4x8x4d)
+      aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
+      aom_jnt_sad4x8_avg, aom_jnt_sub_pixel_avg_variance4x8)
 
   BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
-      aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x3,
-      aom_sad4x4x8, aom_sad4x4x4d)
-
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  BFP(BLOCK_2X2, NULL, NULL, aom_variance2x2, NULL, NULL, NULL, NULL, NULL)
-  BFP(BLOCK_2X4, NULL, NULL, aom_variance2x4, NULL, NULL, NULL, NULL, NULL)
-  BFP(BLOCK_4X2, NULL, NULL, aom_variance4x2, NULL, NULL, NULL, NULL, NULL)
-#endif
+      aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
+      aom_jnt_sad4x4_avg, aom_jnt_sub_pixel_avg_variance4x4)
 
-#if CONFIG_MOTION_VAR
 #define OBFP(BT, OSDF, OVF, OSVF) \
   cpi->fn_ptr[BT].osdf = OSDF;    \
   cpi->fn_ptr[BT].ovf = OVF;      \
   cpi->fn_ptr[BT].osvf = OSVF;
 
-#if CONFIG_EXT_PARTITION
   OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
        aom_obmc_sub_pixel_variance128x128)
   OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
        aom_obmc_sub_pixel_variance128x64)
   OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
        aom_obmc_sub_pixel_variance64x128)
-#endif  // CONFIG_EXT_PARTITION
   OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
        aom_obmc_sub_pixel_variance64x64)
   OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
@@ -2998,46 +2715,27 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
        aom_obmc_sub_pixel_variance8x4)
   OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4,
        aom_obmc_sub_pixel_variance4x4)
-
-#if CONFIG_EXT_PARTITION_TYPES
   OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16,
        aom_obmc_sub_pixel_variance4x16)
-
   OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4,
        aom_obmc_sub_pixel_variance16x4)
-
   OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32,
        aom_obmc_sub_pixel_variance8x32)
-
   OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8,
        aom_obmc_sub_pixel_variance32x8)
-
   OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64,
        aom_obmc_sub_pixel_variance16x64)
-
   OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
        aom_obmc_sub_pixel_variance64x16)
 
-#if CONFIG_EXT_PARTITION
-  OBFP(BLOCK_32X128, aom_obmc_sad32x128, aom_obmc_variance32x128,
-       aom_obmc_sub_pixel_variance32x128)
-
-  OBFP(BLOCK_128X32, aom_obmc_sad128x32, aom_obmc_variance128x32,
-       aom_obmc_sub_pixel_variance128x32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#endif  // CONFIG_MOTION_VAR
-
 #define MBFP(BT, MCSDF, MCSVF)  \
   cpi->fn_ptr[BT].msdf = MCSDF; \
   cpi->fn_ptr[BT].msvf = MCSVF;
 
-#if CONFIG_EXT_PARTITION
   MBFP(BLOCK_128X128, aom_masked_sad128x128,
        aom_masked_sub_pixel_variance128x128)
   MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64)
   MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128)
-#endif  // CONFIG_EXT_PARTITION
   MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64)
   MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32)
   MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64)
@@ -3052,7 +2750,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
   MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4)
   MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4)
 
-#if CONFIG_EXT_PARTITION_TYPES
   MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16)
 
   MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4)
@@ -3065,16 +2762,7 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
 
   MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
 
-#if CONFIG_EXT_PARTITION
-  MBFP(BLOCK_32X128, aom_masked_sad32x128, aom_masked_sub_pixel_variance32x128)
-
-  MBFP(BLOCK_128X32, aom_masked_sad128x32, aom_masked_sub_pixel_variance128x32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-
-#if CONFIG_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
-#endif
 
   /* av1_init_quantizer() is first called here. Add check in
    * av1_frame_init_quantizer() so that av1_init_quantizer is only
@@ -3082,29 +2770,25 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
    * av1_init_quantizer() for every frame.
    */
   av1_init_quantizer(cpi);
-#if CONFIG_AOM_QM
-  aom_qm_init(cm);
-#endif
+  av1_qm_init(cm);
 
   av1_loop_filter_init(cm);
-#if CONFIG_FRAME_SUPERRES
   cm->superres_scale_denominator = SCALE_NUMERATOR;
   cm->superres_upscaled_width = oxcf->width;
   cm->superres_upscaled_height = oxcf->height;
-#endif  // CONFIG_FRAME_SUPERRES
-#if CONFIG_LOOP_RESTORATION
   av1_loop_restoration_precal();
-#endif  // CONFIG_LOOP_RESTORATION
 
   cm->error.setjmp = 0;
 
   return cpi;
 }
 
+#if CONFIG_INTERNAL_STATS
 #define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
 
 #define SNPRINT2(H, T, V) \
   snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+#endif  // CONFIG_INTERNAL_STATS
 
 void av1_remove_compressor(AV1_COMP *cpi) {
   AV1_COMMON *cm;
@@ -3114,14 +2798,14 @@ void av1_remove_compressor(AV1_COMP *cpi) {
   if (!cpi) return;
 
   cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
   if (cm->current_video_frame > 0) {
 #if CONFIG_ENTROPY_STATS
     if (cpi->oxcf.pass != 1) {
       fprintf(stderr, "Writing counts.stt\n");
       FILE *f = fopen("counts.stt", "wb");
       fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
-      fwrite(aggregate_fc_per_type, sizeof(aggregate_fc_per_type[0]),
-             FRAME_CONTEXTS, f);
       fclose(f);
     }
 #endif  // CONFIG_ENTROPY_STATS
@@ -3151,16 +2835,21 @@ void av1_remove_compressor(AV1_COMP *cpi) {
         snprintf(headings, sizeof(headings),
                  "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
                  "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
-                 "WstPsnr\tWstSsim\tWstFast\tWstHVS");
+                 "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+                 "AVPsrnY\tAPsnrCb\tAPsnrCr");
         snprintf(results, sizeof(results),
                  "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
                  "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
-                 "%7.3f\t%7.3f\t%7.3f\t%7.3f",
-                 dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
-                 cpi->psnr.stat[ALL] / cpi->count, total_psnr, total_ssim,
-                 total_ssim, cpi->fastssim.stat[ALL] / cpi->count,
-                 cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst,
-                 cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst);
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f",
+                 dr, cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr,
+                 cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr, total_ssim,
+                 total_ssim, cpi->fastssim.stat[STAT_ALL] / cpi->count,
+                 cpi->psnrhvs.stat[STAT_ALL] / cpi->count, cpi->psnr.worst,
+                 cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst,
+                 cpi->psnr.stat[STAT_Y] / cpi->count,
+                 cpi->psnr.stat[STAT_U] / cpi->count,
+                 cpi->psnr.stat[STAT_V] / cpi->count);
 
         if (cpi->b_calculate_blockiness) {
           SNPRINT(headings, "\t  Block\tWstBlck");
@@ -3184,19 +2873,7 @@ void av1_remove_compressor(AV1_COMP *cpi) {
 
       fclose(f);
     }
-
-#endif
-
-#if 0
-    {
-      printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
-      printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
-      printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame,
-             cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000,
-             cpi->time_compress_data / 1000,
-             (cpi->time_receive_data + cpi->time_compress_data) / 1000);
-    }
-#endif
+#endif  // CONFIG_INTERNAL_STATS
   }
 
   for (t = 0; t < cpi->num_workers; ++t) {
@@ -3209,21 +2886,22 @@ void av1_remove_compressor(AV1_COMP *cpi) {
     // Deallocate allocated thread data.
     if (t < cpi->num_workers - 1) {
       aom_free(thread_data->td->palette_buffer);
-#if CONFIG_MOTION_VAR
       aom_free(thread_data->td->above_pred_buf);
       aom_free(thread_data->td->left_pred_buf);
       aom_free(thread_data->td->wsrc_buf);
       aom_free(thread_data->td->mask_buf);
-#endif  // CONFIG_MOTION_VAR
       aom_free(thread_data->td->counts);
-      av1_free_pc_tree(thread_data->td);
+      av1_free_pc_tree(thread_data->td, num_planes);
       aom_free(thread_data->td);
     }
   }
   aom_free(cpi->tile_thr_data);
   aom_free(cpi->workers);
 
-  if (cpi->num_workers > 1) av1_loop_filter_dealloc(&cpi->lf_row_sync);
+  if (cpi->num_workers > 1) {
+    av1_loop_filter_dealloc(&cpi->lf_row_sync);
+    av1_loop_restoration_dealloc(&cpi->lr_row_sync, cpi->num_workers);
+  }
 
   dealloc_compressor_data(cpi);
 
@@ -3244,6 +2922,10 @@ void av1_remove_compressor(AV1_COMP *cpi) {
 #endif  // CONFIG_INTERNAL_STATS
 
   av1_remove_common(cm);
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    av1_hash_table_destroy(&cm->buffer_pool->frame_bufs[i].hash_table);
+  }
+  if (cpi->sf.use_hash_based_trellis) hbt_destroy();
   av1_free_ref_frame_buffers(cm->buffer_pool);
   aom_free(cpi);
 
@@ -3253,30 +2935,14 @@ void av1_remove_compressor(AV1_COMP *cpi) {
 #ifdef OUTPUT_YUV_REC
   fclose(yuv_rec_file);
 #endif
-#if 0
-
-  if (keyfile)
-    fclose(keyfile);
-
-  if (framepsnr)
-    fclose(framepsnr);
-
-  if (kf_list)
-    fclose(kf_list);
-
-#endif
 }
 
 static void generate_psnr_packet(AV1_COMP *cpi) {
   struct aom_codec_cx_pkt pkt;
   int i;
   PSNR_STATS psnr;
-#if CONFIG_HIGHBITDEPTH
   aom_calc_highbd_psnr(cpi->source, cpi->common.frame_to_show, &psnr,
                        cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
-#else
-  aom_calc_psnr(cpi->source, cpi->common.frame_to_show, &psnr);
-#endif
 
   for (i = 0; i < 4; ++i) {
     pkt.data.psnr.samples[i] = psnr.samples[i];
@@ -3290,22 +2956,25 @@ static void generate_psnr_packet(AV1_COMP *cpi) {
 int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) {
   if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1;
 
-  cpi->ref_frame_flags = ref_frame_flags;
+  cpi->ext_ref_frame_flags = ref_frame_flags;
   return 0;
 }
 
-void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags) {
-  cpi->ext_refresh_golden_frame = (ref_frame_flags & AOM_GOLD_FLAG) != 0;
-  cpi->ext_refresh_alt_ref_frame = (ref_frame_flags & AOM_ALT_FLAG) != 0;
-  cpi->ext_refresh_last_frame = (ref_frame_flags & AOM_LAST_FLAG) != 0;
+void av1_update_reference(AV1_COMP *cpi, int ref_frame_upd_flags) {
+  cpi->ext_refresh_last_frame = (ref_frame_upd_flags & AOM_LAST_FLAG) != 0;
+  cpi->ext_refresh_golden_frame = (ref_frame_upd_flags & AOM_GOLD_FLAG) != 0;
+  cpi->ext_refresh_alt_ref_frame = (ref_frame_upd_flags & AOM_ALT_FLAG) != 0;
+  cpi->ext_refresh_bwd_ref_frame = (ref_frame_upd_flags & AOM_BWD_FLAG) != 0;
+  cpi->ext_refresh_alt2_ref_frame = (ref_frame_upd_flags & AOM_ALT2_FLAG) != 0;
   cpi->ext_refresh_frame_flags_pending = 1;
 }
 
 int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
   if (cfg) {
-    aom_yv12_copy_frame(cfg, sd);
+    aom_yv12_copy_frame(cfg, sd, num_planes);
     return 0;
   } else {
     return -1;
@@ -3314,9 +2983,10 @@ int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
 
 int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
   if (cfg) {
-    aom_yv12_copy_frame(sd, cfg);
+    aom_yv12_copy_frame(sd, cfg, num_planes);
     return 0;
   } else {
     return -1;
@@ -3361,7 +3031,6 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
 }
 #endif
 
-#if CONFIG_EXT_REFS && !CONFIG_XIPHRC
 #if USE_GF16_MULTI_LAYER
 static void check_show_existing_frame_gf16(AV1_COMP *cpi) {
   const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
@@ -3374,7 +3043,7 @@ static void check_show_existing_frame_gf16(AV1_COMP *cpi) {
   } else if (cpi->rc.is_last_bipred_frame) {
     cpi->rc.is_last_bipred_frame = 0;
     cm->show_existing_frame = 1;
-    cpi->existing_fb_idx_to_show = cpi->bwd_fb_idx;
+    cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[BWDREF_FRAME - 1];
   } else if (next_frame_update_type == OVERLAY_UPDATE ||
              next_frame_update_type == INTNL_OVERLAY_UPDATE) {
     // Check the temporal filtering status for the next OVERLAY frame
@@ -3392,8 +3061,8 @@ static void check_show_existing_frame_gf16(AV1_COMP *cpi) {
       cm->show_existing_frame = 1;
       cpi->rc.is_src_frame_alt_ref = 1;
       cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE)
-                                         ? cpi->alt_fb_idx
-                                         : cpi->bwd_fb_idx;
+                                         ? cpi->ref_fb_idx[ALTREF_FRAME - 1]
+                                         : cpi->ref_fb_idx[BWDREF_FRAME - 1];
       cpi->is_arf_filter_off[which_arf] = 0;
     }
   }
@@ -3423,7 +3092,7 @@ static void check_show_existing_frame(AV1_COMP *cpi) {
     //       the last_fb_idxes[0] after reference frame buffer update
     cpi->rc.is_last_bipred_frame = 0;
     cm->show_existing_frame = 1;
-    cpi->existing_fb_idx_to_show = cpi->lst_fb_idxes[0];
+    cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[0];
   } else if (cpi->is_arf_filter_off[which_arf] &&
              (next_frame_update_type == OVERLAY_UPDATE ||
               next_frame_update_type == INTNL_OVERLAY_UPDATE)) {
@@ -3432,20 +3101,18 @@ static void check_show_existing_frame(AV1_COMP *cpi) {
     cm->show_existing_frame = 1;
     cpi->rc.is_src_frame_alt_ref = 1;
     cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE)
-                                       ? cpi->alt_fb_idx
-                                       : cpi->alt2_fb_idx;
+                                       ? cpi->ref_fb_idx[ALTREF_FRAME - 1]
+                                       : cpi->ref_fb_idx[ALTREF2_FRAME - 1];
     cpi->is_arf_filter_off[which_arf] = 0;
   }
   cpi->rc.is_src_frame_ext_arf = 0;
 }
-#endif  // CONFIG_EXT_REFS && !CONFIG_XIPHRC
 
 #ifdef OUTPUT_YUV_REC
 void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
   uint8_t *src = s->y_buffer;
   int h = cm->height;
   if (yuv_rec_file == NULL) return;
-#if CONFIG_HIGHBITDEPTH
   if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
     uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
 
@@ -3473,7 +3140,6 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
     fflush(yuv_rec_file);
     return;
   }
-#endif  // CONFIG_HIGHBITDEPTH
 
   do {
     fwrite(src, s->y_width, 1, yuv_rec_file);
@@ -3500,7 +3166,6 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
 }
 #endif  // OUTPUT_YUV_REC
 
-#if CONFIG_GLOBAL_MOTION
 #define GM_RECODE_LOOP_NUM4X4_FACTOR 192
 static int recode_loop_test_global_motion(AV1_COMP *cpi) {
   int i;
@@ -3515,12 +3180,13 @@ static int recode_loop_test_global_motion(AV1_COMP *cpi) {
       assert(cm->global_motion[i].wmtype == IDENTITY);
       cpi->gmparams_cost[i] = 0;
       recode = 1;
-      recode |= (rdc->global_motion_used[i] > 0);
+      // TODO(sarahparker): The earlier condition for recoding here was:
+      // "recode |= (rdc->global_motion_used[i] > 0);". Can we bring something
+      // similar to that back to speed up global motion?
     }
   }
   return recode;
 }
-#endif  // CONFIG_GLOBAL_MOTION
 
 // Function to test for conditions that indicate we should loop
 // back and recode a frame.
@@ -3602,15 +3268,15 @@ static void dump_ref_frame_images(AV1_COMP *cpi) {
 }
 #endif  // DUMP_REF_FRAME_IMAGES == 1
 
-#if CONFIG_EXT_REFS
 // This function is used to shift the virtual indices of last reference frames
 // as follows:
 // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
 // when the LAST_FRAME is updated.
 static INLINE void shift_last_ref_frames(AV1_COMP *cpi) {
+  // TODO(isbs): shift the scaled indices as well
   int ref_frame;
   for (ref_frame = LAST_REF_FRAMES - 1; ref_frame > 0; --ref_frame) {
-    cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1];
+    cpi->ref_fb_idx[ref_frame] = cpi->ref_fb_idx[ref_frame - 1];
 
     // [0] is allocated to the current coded frame. The statistics for the
     // reference frames start at [LAST_FRAME], i.e. [1].
@@ -3621,64 +3287,18 @@ static INLINE void shift_last_ref_frames(AV1_COMP *cpi) {
     }
   }
 }
-#endif  // CONFIG_EXT_REFS
-
-#if CONFIG_VAR_REFS
-static void enc_check_valid_ref_frames(AV1_COMP *const cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  MV_REFERENCE_FRAME ref_frame;
-
-  // TODO(zoeliu): To handle ALTREF_FRAME the same way as do with other
-  //               reference frames. Current encoder invalid ALTREF when ALTREF
-  //               is the same as LAST, but invalid all the other references
-  //               when they are the same as ALTREF.
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    int ref_buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
-    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME];
-
-    if (ref_buf_idx != INVALID_IDX) {
-      ref_buf->is_valid = 1;
-
-      MV_REFERENCE_FRAME ref;
-      for (ref = LAST_FRAME; ref < ref_frame; ++ref) {
-        int buf_idx = get_ref_frame_buf_idx(cpi, ref);
-        RefBuffer *const buf = &cm->frame_refs[ref - LAST_FRAME];
-        if (buf->is_valid && buf_idx == ref_buf_idx) {
-          if (ref_frame != ALTREF_FRAME || ref == LAST_FRAME) {
-            ref_buf->is_valid = 0;
-            break;
-          } else {
-            buf->is_valid = 0;
-          }
-        }
-      }
-    } else {
-      ref_buf->is_valid = 0;
-    }
-  }
-}
-#endif  // CONFIG_VAR_REFS
 
-#if CONFIG_EXT_REFS
 #if USE_GF16_MULTI_LAYER
 static void update_reference_frames_gf16(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   BufferPool *const pool = cm->buffer_pool;
 
   if (cm->frame_type == KEY_FRAME) {
-    for (int ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) {
+    for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
       ref_cnt_fb(pool->frame_bufs,
-                 &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]],
+                 &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]],
                  cm->new_fb_idx);
     }
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
-               cm->new_fb_idx);
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
-               cm->new_fb_idx);
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx],
-               cm->new_fb_idx);
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
-               cm->new_fb_idx);
   } else {
     if (cpi->refresh_last_frame || cpi->refresh_golden_frame ||
         cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
@@ -3703,7 +3323,6 @@ static void update_reference_frames_gf16(AV1_COMP *cpi) {
 #endif  // DUMP_REF_FRAME_IMAGES
 }
 #endif  // USE_GF16_MULTI_LAYER
-#endif  // CONFIG_EXT_REFS
 
 static void update_reference_frames(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
@@ -3712,30 +3331,28 @@ static void update_reference_frames(AV1_COMP *cpi) {
   //       for the purpose to verify no mismatch between encoder and decoder.
   if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
 
-#if CONFIG_EXT_REFS
 #if USE_GF16_MULTI_LAYER
   if (cpi->rc.baseline_gf_interval == 16) {
     update_reference_frames_gf16(cpi);
     return;
   }
 #endif  // USE_GF16_MULTI_LAYER
-#endif  // CONFIG_EXT_REFS
 
   BufferPool *const pool = cm->buffer_pool;
+
   // At this point the new frame has been encoded.
   // If any buffer copy / swapping is signaled it should be done here.
-  if (cm->frame_type == KEY_FRAME) {
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
-               cm->new_fb_idx);
-#if CONFIG_EXT_REFS
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
-               cm->new_fb_idx);
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx],
-               cm->new_fb_idx);
-#endif  // CONFIG_EXT_REFS
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
-               cm->new_fb_idx);
-  } else if (av1_preserve_existing_gf(cpi)) {
+
+  if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) {
+    for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]],
+                 cm->new_fb_idx);
+    }
+    return;
+  }
+
+  if (av1_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term in function
     // av1_bitstream.c::get_refresh_mask() we left it in the GF slot and, if
@@ -3746,19 +3363,17 @@ static void update_reference_frames(AV1_COMP *cpi) {
     // slot and, if we're updating the GF, the current frame becomes the new GF.
     int tmp;
 
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]],
                cm->new_fb_idx);
-    tmp = cpi->alt_fb_idx;
-    cpi->alt_fb_idx = cpi->gld_fb_idx;
-    cpi->gld_fb_idx = tmp;
+    tmp = cpi->ref_fb_idx[ALTREF_FRAME - 1];
+    cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
+    cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp;
 
-#if CONFIG_EXT_REFS
     // We need to modify the mapping accordingly
-    cpi->arf_map[0] = cpi->alt_fb_idx;
-#endif  // CONFIG_EXT_REFS
-// TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
-// cpi->interp_filter_selected[GOLDEN_FRAME]?
-#if CONFIG_EXT_REFS
+    cpi->arf_map[0] = cpi->ref_fb_idx[ALTREF_FRAME - 1];
+    // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
+    // cpi->interp_filter_selected[GOLDEN_FRAME]?
   } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) {
     // Deal with the special case for showing existing internal ALTREF_FRAME
     // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
@@ -3767,29 +3382,22 @@ static void update_reference_frames(AV1_COMP *cpi) {
     const int which_arf = gf_group->arf_ref_idx[gf_group->index];
     assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE);
 
-    const int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
+    const int tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
     shift_last_ref_frames(cpi);
 
-    cpi->lst_fb_idxes[0] = cpi->alt2_fb_idx;
-    cpi->alt2_fb_idx = tmp;
+    cpi->ref_fb_idx[LAST_FRAME - 1] = cpi->ref_fb_idx[ALTREF2_FRAME - 1];
+    cpi->ref_fb_idx[ALTREF2_FRAME - 1] = tmp;
     // We need to modify the mapping accordingly
-    cpi->arf_map[which_arf] = cpi->alt2_fb_idx;
+    cpi->arf_map[which_arf] = cpi->ref_fb_idx[ALTREF2_FRAME - 1];
 
     memcpy(cpi->interp_filter_selected[LAST_FRAME],
            cpi->interp_filter_selected[ALTREF2_FRAME],
            sizeof(cpi->interp_filter_selected[ALTREF2_FRAME]));
-#endif     // CONFIG_EXT_REFS
   } else { /* For non key/golden frames */
     // === ALTREF_FRAME ===
     if (cpi->refresh_alt_ref_frame) {
-      int arf_idx = cpi->alt_fb_idx;
+      int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
       int which_arf = 0;
-#if !CONFIG_EXT_REFS
-      if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
-        const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-        arf_idx = gf_group->arf_update_idx[gf_group->index];
-      }
-#endif  // !CONFIG_EXT_REFS
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
 
       memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
@@ -3799,21 +3407,19 @@ static void update_reference_frames(AV1_COMP *cpi) {
 
     // === GOLDEN_FRAME ===
     if (cpi->refresh_golden_frame) {
-      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]],
                  cm->new_fb_idx);
 
-#if !CONFIG_EXT_REFS
-      if (!cpi->rc.is_src_frame_alt_ref)
-#endif  // !CONFIG_EXT_REFS
-        memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
-               cpi->interp_filter_selected[0],
-               sizeof(cpi->interp_filter_selected[0]));
+      memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
     }
 
-#if CONFIG_EXT_REFS
     // === BWDREF_FRAME ===
     if (cpi->refresh_bwd_ref_frame) {
-      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->ref_fb_idx[BWDREF_FRAME - 1]],
                  cm->new_fb_idx);
 
       memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
@@ -3823,18 +3429,17 @@ static void update_reference_frames(AV1_COMP *cpi) {
 
     // === ALTREF2_FRAME ===
     if (cpi->refresh_alt2_ref_frame) {
-      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx],
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]],
                  cm->new_fb_idx);
 
       memcpy(cpi->interp_filter_selected[ALTREF2_FRAME],
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
     }
-#endif  // CONFIG_EXT_REFS
   }
 
   if (cpi->refresh_last_frame) {
-#if CONFIG_EXT_REFS
     // NOTE(zoeliu): We have two layers of mapping (1) from the per-frame
     // reference to the reference frame buffer virtual index; and then (2) from
     // the virtual index to the reference frame buffer physical index:
@@ -3842,7 +3447,7 @@ static void update_reference_frames(AV1_COMP *cpi) {
     // LAST_FRAME,      ..., LAST3_FRAME,     ..., ALTREF_FRAME
     //      |                     |                     |
     //      v                     v                     v
-    // lst_fb_idxes[0], ..., lst_fb_idxes[2], ..., alt_fb_idx
+    // ref_fb_idx[0],   ..., ref_fb_idx[2],   ..., ref_fb_idx[ALTREF_FRAME-1]
     //      |                     |                     |
     //      v                     v                     v
     // ref_frame_map[], ..., ref_frame_map[], ..., ref_frame_map[]
@@ -3864,61 +3469,42 @@ static void update_reference_frames(AV1_COMP *cpi) {
     // LAST_FRAME,      LAST2_FRAME,     LAST3_FRAME
     //      |                |                |
     //      v                v                v
-    // lst_fb_idxes[2], lst_fb_idxes[0], lst_fb_idxes[1]
-    int ref_frame;
+    // ref_fb_idx[2],   ref_fb_idx[0],   ref_fb_idx[1]
+    int tmp;
 
-    if (cm->frame_type == KEY_FRAME) {
-      for (ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) {
-        ref_cnt_fb(pool->frame_bufs,
-                   &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]],
-                   cm->new_fb_idx);
-      }
-    } else {
-      int tmp;
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->ref_fb_idx[LAST_REF_FRAMES - 1]],
+               cm->new_fb_idx);
 
-      ref_cnt_fb(pool->frame_bufs,
-                 &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]],
-                 cm->new_fb_idx);
+    tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
 
-      tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
+    shift_last_ref_frames(cpi);
+    cpi->ref_fb_idx[0] = tmp;
 
-      shift_last_ref_frames(cpi);
-      cpi->lst_fb_idxes[0] = tmp;
+    assert(cm->show_existing_frame == 0);
+    memcpy(cpi->interp_filter_selected[LAST_FRAME],
+           cpi->interp_filter_selected[0],
+           sizeof(cpi->interp_filter_selected[0]));
+
+    if (cpi->rc.is_last_bipred_frame) {
+      // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the
+      // LAST3_FRAME by updating the virtual indices.
+      //
+      // NOTE: The source frame for BWDREF does not have a holding position as
+      //       the OVERLAY frame for ALTREF's. Hence, to resolve the reference
+      //       virtual index reshuffling for BWDREF, the encoder always
+      //       specifies a LAST_BIPRED right before BWDREF and completes the
+      //       reshuffling job accordingly.
+      tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
 
-      assert(cm->show_existing_frame == 0);
-      memcpy(cpi->interp_filter_selected[LAST_FRAME],
-             cpi->interp_filter_selected[0],
-             sizeof(cpi->interp_filter_selected[0]));
+      shift_last_ref_frames(cpi);
+      cpi->ref_fb_idx[0] = cpi->ref_fb_idx[BWDREF_FRAME - 1];
+      cpi->ref_fb_idx[BWDREF_FRAME - 1] = tmp;
 
-      if (cpi->rc.is_last_bipred_frame) {
-        // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the
-        // LAST3_FRAME by updating the virtual indices.
-        //
-        // NOTE: The source frame for BWDREF does not have a holding position as
-        //       the OVERLAY frame for ALTREF's. Hence, to resolve the reference
-        //       virtual index reshuffling for BWDREF, the encoder always
-        //       specifies a LAST_BIPRED right before BWDREF and completes the
-        //       reshuffling job accordingly.
-        tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
-
-        shift_last_ref_frames(cpi);
-        cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx;
-        cpi->bwd_fb_idx = tmp;
-
-        memcpy(cpi->interp_filter_selected[LAST_FRAME],
-               cpi->interp_filter_selected[BWDREF_FRAME],
-               sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
-      }
-    }
-#else   // !CONFIG_EXT_REFS
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
-               cm->new_fb_idx);
-    if (!cpi->rc.is_src_frame_alt_ref) {
       memcpy(cpi->interp_filter_selected[LAST_FRAME],
-             cpi->interp_filter_selected[0],
-             sizeof(cpi->interp_filter_selected[0]));
+             cpi->interp_filter_selected[BWDREF_FRAME],
+             sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
     }
-#endif  // CONFIG_EXT_REFS
   }
 
 #if DUMP_REF_FRAME_IMAGES == 1
@@ -3937,19 +3523,11 @@ static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) {
 
 static void scale_references(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MV_REFERENCE_FRAME ref_frame;
   const AOM_REFFRAME ref_mask[INTER_REFS_PER_FRAME] = {
-    AOM_LAST_FLAG,
-#if CONFIG_EXT_REFS
-    AOM_LAST2_FLAG,
-    AOM_LAST3_FLAG,
-#endif  // CONFIG_EXT_REFS
-    AOM_GOLD_FLAG,
-#if CONFIG_EXT_REFS
-    AOM_BWD_FLAG,
-    AOM_ALT2_FLAG,
-#endif  // CONFIG_EXT_REFS
-    AOM_ALT_FLAG
+    AOM_LAST_FLAG, AOM_LAST2_FLAG, AOM_LAST3_FLAG, AOM_GOLD_FLAG,
+    AOM_BWD_FLAG,  AOM_ALT2_FLAG,  AOM_ALT_FLAG
   };
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
@@ -3964,7 +3542,6 @@ static void scale_references(AV1_COMP *cpi) {
         continue;
       }
 
-#if CONFIG_HIGHBITDEPTH
       if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
         RefCntBuffer *new_fb_ptr = NULL;
         int force_scaling = 0;
@@ -3983,35 +3560,11 @@ static void scale_references(AV1_COMP *cpi) {
                   cm->byte_alignment, NULL, NULL, NULL))
             aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
-          av1_resize_and_extend_frame(ref, &new_fb_ptr->buf,
-                                      (int)cm->bit_depth);
+          av1_resize_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth,
+                                      num_planes);
           cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
         }
-#else
-      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
-        RefCntBuffer *new_fb_ptr = NULL;
-        int force_scaling = 0;
-        int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
-        if (new_fb == INVALID_IDX) {
-          new_fb = get_free_fb(cm);
-          force_scaling = 1;
-        }
-        if (new_fb == INVALID_IDX) return;
-        new_fb_ptr = &pool->frame_bufs[new_fb];
-        if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
-            new_fb_ptr->buf.y_crop_height != cm->height) {
-          if (aom_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
-                                       cm->subsampling_x, cm->subsampling_y,
-                                       AOM_BORDER_IN_PIXELS, cm->byte_alignment,
-                                       NULL, NULL, NULL))
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                               "Failed to allocate frame buffer");
-          av1_resize_and_extend_frame(ref, &new_fb_ptr->buf);
-          cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
-          alloc_frame_mvs(cm, new_fb);
-        }
-#endif  // CONFIG_HIGHBITDEPTH
       } else {
         const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
         RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
@@ -4029,115 +3582,18 @@ static void scale_references(AV1_COMP *cpi) {
 static void release_scaled_references(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
   int i;
-  if (cpi->oxcf.pass == 0) {
-    // Only release scaled references under certain conditions:
-    // if reference will be updated, or if scaled reference has same resolution.
-    int refresh[INTER_REFS_PER_FRAME];
-    refresh[0] = (cpi->refresh_last_frame) ? 1 : 0;
-#if CONFIG_EXT_REFS
-    refresh[1] = refresh[2] = 0;
-    refresh[3] = (cpi->refresh_golden_frame) ? 1 : 0;
-    refresh[4] = (cpi->refresh_bwd_ref_frame) ? 1 : 0;
-    refresh[5] = (cpi->refresh_alt2_ref_frame) ? 1 : 0;
-    refresh[6] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
-#else   // !CONFIG_EXT_REFS
-    refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0;
-    refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
-#endif  // CONFIG_EXT_REFS
-    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-      const int idx = cpi->scaled_ref_idx[i - 1];
-      RefCntBuffer *const buf =
-          idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL;
-      const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i);
-      if (buf != NULL &&
-          (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width &&
-                              buf->buf.y_crop_height == ref->y_crop_height))) {
-        --buf->ref_count;
-        cpi->scaled_ref_idx[i - 1] = INVALID_IDX;
-      }
-    }
-  } else {
-    for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) {
-      const int idx = cpi->scaled_ref_idx[i];
-      RefCntBuffer *const buf =
-          idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL;
-      if (buf != NULL) {
-        --buf->ref_count;
-        cpi->scaled_ref_idx[i] = INVALID_IDX;
-      }
+  // TODO(isbs): only refresh the necessary frames, rather than all of them
+  for (i = 0; i < REF_FRAMES; ++i) {
+    const int idx = cpi->scaled_ref_idx[i];
+    RefCntBuffer *const buf =
+        idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL;
+    if (buf != NULL) {
+      --buf->ref_count;
+      cpi->scaled_ref_idx[i] = INVALID_IDX;
     }
   }
 }
 
-#if 0 && CONFIG_INTERNAL_STATS
-static void output_frame_level_debug_stats(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
-  int64_t recon_err;
-
-  aom_clear_system_state();
-
-  recon_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
-
-  if (cpi->twopass.total_left_stats.coded_error != 0.0)
-    fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
-       "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
-       "%10"PRId64" %10"PRId64" %10d "
-       "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
-        "%6d %6d %5d %5d %5d "
-        "%10"PRId64" %10.3lf"
-        "%10lf %8u %10"PRId64" %10d %10d %10d\n",
-        cpi->common.current_video_frame,
-        cm->width, cm->height,
-        cpi->rc.source_alt_ref_pending,
-        cpi->rc.source_alt_ref_active,
-        cpi->rc.this_frame_target,
-        cpi->rc.projected_frame_size,
-        cpi->rc.projected_frame_size / cpi->common.MBs,
-        (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
-        cpi->rc.vbr_bits_off_target,
-        cpi->rc.vbr_bits_off_target_fast,
-        cpi->twopass.extend_minq,
-        cpi->twopass.extend_minq_fast,
-        cpi->rc.total_target_vs_actual,
-        (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
-        cpi->rc.total_actual_bits, cm->base_qindex,
-        av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
-        (double)av1_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
-        av1_convert_qindex_to_q(cpi->twopass.active_worst_quality,
-                                cm->bit_depth),
-        cpi->rc.avg_q,
-        av1_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
-        cpi->refresh_last_frame, cpi->refresh_golden_frame,
-        cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
-        cpi->twopass.bits_left,
-        cpi->twopass.total_left_stats.coded_error,
-        cpi->twopass.bits_left /
-            (1 + cpi->twopass.total_left_stats.coded_error),
-        cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
-        cpi->twopass.kf_zeromotion_pct,
-        cpi->twopass.fr_content_type);
-
-  fclose(f);
-
-  if (0) {
-    FILE *const fmodes = fopen("Modes.stt", "a");
-    int i;
-
-    fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
-            cm->frame_type, cpi->refresh_golden_frame,
-            cpi->refresh_alt_ref_frame);
-
-    for (i = 0; i < MAX_MODES; ++i)
-      fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-
-    fprintf(fmodes, "\n");
-
-    fclose(fmodes);
-  }
-}
-#endif
-
 static void set_mv_search_params(AV1_COMP *cpi) {
   const AV1_COMMON *const cm = &cpi->common;
   const unsigned int max_mv_def = AOMMIN(cm->width, cm->height);
@@ -4164,18 +3620,16 @@ static void set_mv_search_params(AV1_COMP *cpi) {
 }
 
 static void set_size_independent_vars(AV1_COMP *cpi) {
-#if CONFIG_GLOBAL_MOTION
   int i;
   for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
     cpi->common.global_motion[i] = default_warp_params;
   }
   cpi->global_motion_search_done = 0;
-#endif  // CONFIG_GLOBAL_MOTION
   av1_set_speed_features_framesize_independent(cpi);
   av1_set_rd_speed_thresholds(cpi);
   av1_set_rd_speed_thresholds_sub8x8(cpi);
   cpi->common.interp_filter = cpi->sf.default_interp_filter;
-  if (!frame_is_intra_only(&cpi->common)) set_compound_tools(&cpi->common);
+  cpi->common.switchable_motion_mode = 1;
 }
 
 static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
@@ -4186,24 +3640,13 @@ static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
   // Setup variables that depend on the dimensions of the frame.
   av1_set_speed_features_framesize_dependent(cpi);
 
-// Decide q and q bounds.
-#if CONFIG_XIPHRC
-  int frame_type = cm->frame_type == KEY_FRAME ? OD_I_FRAME : OD_P_FRAME;
-  *q = od_enc_rc_select_quantizers_and_lambdas(
-      &cpi->od_rc, cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
-      frame_type, bottom_index, top_index);
-#else
+  // Decide q and q bounds.
   *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, bottom_index,
                                 top_index);
-#endif
 
   if (!frame_is_intra_only(cm)) {
-#if CONFIG_AMVR
     set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH,
-                          cpi->common.cur_frame_mv_precision_level);
-#else
-    set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
-#endif
+                          cpi->common.cur_frame_force_integer_mv);
   }
 
   // Configure experimental use of segmentation for enhanced coding of
@@ -4224,10 +3667,9 @@ static void init_motion_estimation(AV1_COMP *cpi) {
   }
 }
 
-#if CONFIG_LOOP_RESTORATION
 #define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
-static void set_restoration_tilesize(int width, int height, int sx, int sy,
-                                     RestorationInfo *rst) {
+static void set_restoration_unit_size(int width, int height, int sx, int sy,
+                                      RestorationInfo *rst) {
   (void)width;
   (void)height;
   (void)sx;
@@ -4238,17 +3680,13 @@ static void set_restoration_tilesize(int width, int height, int sx, int sy,
   int s = 0;
 #endif  // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
 
-  rst[0].restoration_tilesize = (RESTORATION_TILESIZE_MAX >> 1);
-  rst[1].restoration_tilesize = rst[0].restoration_tilesize >> s;
-  rst[2].restoration_tilesize = rst[1].restoration_tilesize;
-
-  rst[0].procunit_width = rst[0].procunit_height = RESTORATION_PROC_UNIT_SIZE;
-  rst[1].procunit_width = rst[2].procunit_width =
-      RESTORATION_PROC_UNIT_SIZE >> sx;
-  rst[1].procunit_height = rst[2].procunit_height =
-      RESTORATION_PROC_UNIT_SIZE >> sy;
+  if (width * height > 352 * 288)
+    rst[0].restoration_unit_size = RESTORATION_UNITSIZE_MAX;
+  else
+    rst[0].restoration_unit_size = (RESTORATION_UNITSIZE_MAX >> 1);
+  rst[1].restoration_unit_size = rst[0].restoration_unit_size >> s;
+  rst[2].restoration_unit_size = rst[1].restoration_unit_size;
 }
-#endif  // CONFIG_LOOP_RESTORATION
 
 static void init_ref_frame_bufs(AV1_COMMON *cm) {
   int i;
@@ -4258,31 +3696,23 @@ static void init_ref_frame_bufs(AV1_COMMON *cm) {
     cm->ref_frame_map[i] = INVALID_IDX;
     pool->frame_bufs[i].ref_count = 0;
   }
-#if CONFIG_HASH_ME
-  for (i = 0; i < FRAME_BUFFERS; ++i) {
-    av1_hash_table_init(&pool->frame_bufs[i].hash_table);
+  if (cm->seq_params.force_screen_content_tools) {
+    for (i = 0; i < FRAME_BUFFERS; ++i) {
+      av1_hash_table_init(&pool->frame_bufs[i].hash_table);
+    }
   }
-#endif
 }
 
-static void check_initial_width(AV1_COMP *cpi,
-#if CONFIG_HIGHBITDEPTH
-                                int use_highbitdepth,
-#endif
+static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
                                 int subsampling_x, int subsampling_y) {
   AV1_COMMON *const cm = &cpi->common;
 
-  if (!cpi->initial_width ||
-#if CONFIG_HIGHBITDEPTH
-      cm->use_highbitdepth != use_highbitdepth ||
-#endif
+  if (!cpi->initial_width || cm->use_highbitdepth != use_highbitdepth ||
       cm->subsampling_x != subsampling_x ||
       cm->subsampling_y != subsampling_y) {
     cm->subsampling_x = subsampling_x;
     cm->subsampling_y = subsampling_y;
-#if CONFIG_HIGHBITDEPTH
     cm->use_highbitdepth = use_highbitdepth;
-#endif
 
     alloc_raw_frame_buffers(cpi);
     init_ref_frame_bufs(cm);
@@ -4299,12 +3729,9 @@ static void check_initial_width(AV1_COMP *cpi,
 // Returns 1 if the assigned width or height was <= 0.
 static int set_size_literal(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *cm = &cpi->common;
-#if CONFIG_HIGHBITDEPTH
+  const int num_planes = av1_num_planes(cm);
   check_initial_width(cpi, cm->use_highbitdepth, cm->subsampling_x,
                       cm->subsampling_y);
-#else
-  check_initial_width(cpi, cm->subsampling_x, cm->subsampling_y);
-#endif  // CONFIG_HIGHBITDEPTH
 
   if (width <= 0 || height <= 0) return 1;
 
@@ -4314,7 +3741,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) {
   if (cpi->initial_width && cpi->initial_height &&
       (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
     av1_free_context_buffers(cm);
-    av1_free_pc_tree(&cpi->td);
+    av1_free_pc_tree(&cpi->td, num_planes);
     alloc_compressor_data(cpi);
     realloc_segmentation_maps(cpi);
     cpi->initial_width = cpi->initial_height = 0;
@@ -4326,6 +3753,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) {
 
 static void set_frame_size(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int ref_frame;
 
@@ -4333,52 +3761,42 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
     // There has been a change in the encoded frame size
     set_size_literal(cpi, width, height);
     set_mv_search_params(cpi);
+    // Recalculate 'all_lossless' in case super-resolution was (un)selected.
+    cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
   }
 
-#if !CONFIG_XIPHRC
   if (cpi->oxcf.pass == 2) {
     av1_set_target_rate(cpi, cm->width, cm->height);
   }
-#endif
 
   alloc_frame_mvs(cm, cm->new_fb_idx);
 
+  // Allocate above context buffers
+  if (cm->num_allocated_above_context_planes < av1_num_planes(cm) ||
+      cm->num_allocated_above_context_mi_col < cm->mi_cols ||
+      cm->num_allocated_above_contexts < cm->tile_rows) {
+    av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
+    if (av1_alloc_above_context_buffers(cm, cm->tile_rows))
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate context buffers");
+  }
+
   // Reset the frame pointers to the current frame size.
   if (aom_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                               cm->use_highbitdepth,
-#endif
-                               AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
-                               NULL, NULL))
+                               cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                               cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
 
-#if CONFIG_LOOP_RESTORATION
-  set_restoration_tilesize(
-#if CONFIG_FRAME_SUPERRES
-      cm->superres_upscaled_width, cm->superres_upscaled_height,
-#else
-      cm->width, cm->height,
-#endif  // CONFIG_FRAME_SUPERRES
-      cm->subsampling_x, cm->subsampling_y, cm->rst_info);
-  for (int i = 0; i < MAX_MB_PLANE; ++i)
+  const int frame_width = cm->superres_upscaled_width;
+  const int frame_height = cm->superres_upscaled_height;
+  set_restoration_unit_size(frame_width, frame_height, cm->subsampling_x,
+                            cm->subsampling_y, cm->rst_info);
+  for (int i = 0; i < num_planes; ++i)
     cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+
   av1_alloc_restoration_buffers(cm);
-  for (int i = 0; i < MAX_MB_PLANE; ++i) {
-    cpi->rst_search[i].restoration_tilesize =
-        cm->rst_info[i].restoration_tilesize;
-    cpi->rst_search[i].procunit_width = cm->rst_info[i].procunit_width;
-    cpi->rst_search[i].procunit_height = cm->rst_info[i].procunit_height;
-    av1_alloc_restoration_struct(cm, &cpi->rst_search[i],
-#if CONFIG_FRAME_SUPERRES
-                                 cm->superres_upscaled_width,
-                                 cm->superres_upscaled_height);
-#else
-                                 cm->width, cm->height);
-#endif  // CONFIG_FRAME_SUPERRES
-  }
-#endif                            // CONFIG_LOOP_RESTORATION
   alloc_util_frame_buffers(cpi);  // TODO(afergs): Remove? Gets called anyways.
   init_motion_estimation(cpi);
 
@@ -4391,36 +3809,18 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
     if (buf_idx != INVALID_IDX) {
       YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
       ref_buf->buf = buf;
-#if CONFIG_HIGHBITDEPTH
-      av1_setup_scale_factors_for_frame(
-          &ref_buf->sf, buf->y_crop_width, buf->y_crop_height, cm->width,
-          cm->height, (buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0);
-#else
       av1_setup_scale_factors_for_frame(&ref_buf->sf, buf->y_crop_width,
                                         buf->y_crop_height, cm->width,
                                         cm->height);
-#endif  // CONFIG_HIGHBITDEPTH
-      if (av1_is_scaled(&ref_buf->sf)) aom_extend_frame_borders(buf);
+      if (av1_is_scaled(&ref_buf->sf))
+        aom_extend_frame_borders(buf, num_planes);
     } else {
       ref_buf->buf = NULL;
     }
   }
 
-#if CONFIG_VAR_REFS
-  // Check duplicate reference frames
-  enc_check_valid_ref_frames(cpi);
-#endif  // CONFIG_VAR_REFS
-
-#if CONFIG_INTRABC
-#if CONFIG_HIGHBITDEPTH
-  av1_setup_scale_factors_for_frame(&xd->sf_identity, cm->width, cm->height,
-                                    cm->width, cm->height,
-                                    cm->use_highbitdepth);
-#else
-  av1_setup_scale_factors_for_frame(&xd->sf_identity, cm->width, cm->height,
+  av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
                                     cm->width, cm->height);
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_INTRABC
 
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
 }
@@ -4432,6 +3832,7 @@ static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
   if (oxcf->pass == 1) return SCALE_NUMERATOR;
   uint8_t new_denom = SCALE_NUMERATOR;
 
+  if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR;
   switch (oxcf->resize_mode) {
     case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
     case RESIZE_FIXED:
@@ -4446,15 +3847,19 @@ static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
   return new_denom;
 }
 
-#if CONFIG_FRAME_SUPERRES
-
 static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
   // Choose an arbitrary random number
   static unsigned int seed = 34567;
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
   if (oxcf->pass == 1) return SCALE_NUMERATOR;
   uint8_t new_denom = SCALE_NUMERATOR;
-  int bottom_index, top_index, q, qthresh;
+
+  // Make sure that superres mode of the frame is consistent with the
+  // sequence-level flag.
+  assert(IMPLIES(oxcf->superres_mode != SUPERRES_NONE,
+                 cpi->common.seq_params.enable_superres));
+  assert(IMPLIES(!cpi->common.seq_params.enable_superres,
+                 oxcf->superres_mode == SUPERRES_NONE));
 
   switch (oxcf->superres_mode) {
     case SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break;
@@ -4465,21 +3870,35 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
         new_denom = oxcf->superres_scale_denominator;
       break;
     case SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
-    case SUPERRES_QTHRESH:
-      qthresh = (cpi->common.frame_type == KEY_FRAME ? oxcf->superres_kf_qthresh
-                                                     : oxcf->superres_qthresh);
+    case SUPERRES_QTHRESH: {
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      const RATE_FACTOR_LEVEL rf_level = gf_group->rf_level[gf_group->index];
+      const double rate_factor_delta = rate_factor_deltas[rf_level];
+      const int qthresh = (rate_factor_delta <= 1.0)
+                              ? oxcf->superres_qthresh
+                              : oxcf->superres_kf_qthresh;
       av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height);
-      q = av1_rc_pick_q_and_bounds(cpi, cpi->oxcf.width, cpi->oxcf.height,
-                                   &bottom_index, &top_index);
+      int bottom_index, top_index;
+      const int q = av1_rc_pick_q_and_bounds(
+          cpi, cpi->oxcf.width, cpi->oxcf.height, &bottom_index, &top_index);
       if (q < qthresh) {
         new_denom = SCALE_NUMERATOR;
       } else {
-        new_denom = SCALE_NUMERATOR + 1 + ((q - qthresh) >> 3);
-        new_denom = AOMMIN(SCALE_NUMERATOR << 1, new_denom);
-        // printf("SUPERRES: q %d, qthresh %d: denom %d\n", q, qthresh,
-        // new_denom);
+        const uint8_t min_denom = SCALE_NUMERATOR + 1;
+        const uint8_t denom_step = (MAXQ - qthresh + 1) >> 3;
+
+        if (q == qthresh) {
+          new_denom = min_denom;
+        } else if (denom_step == 0) {
+          new_denom = SCALE_NUMERATOR << 1;
+        } else {
+          const uint8_t additional_denom = (q - qthresh) / denom_step;
+          new_denom =
+              AOMMIN(min_denom + additional_denom, SCALE_NUMERATOR << 1);
+        }
       }
       break;
+    }
     default: assert(0);
   }
   return new_denom;
@@ -4489,15 +3908,12 @@ static int dimension_is_ok(int orig_dim, int resized_dim, int denom) {
   return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2);
 }
 
-// TODO(now): Fix?
 static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) {
-  return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom) &&
-         (CONFIG_HORZONLY_FRAME_SUPERRES ||
-          dimension_is_ok(oheight, rsz->resize_height, rsz->superres_denom));
+  // Only need to check the width, as scaling is horizontal only.
+  (void)oheight;
+  return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom);
 }
 
-#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))
-
 static int validate_size_scales(RESIZE_MODE resize_mode,
                                 SUPERRES_MODE superres_mode, int owidth,
                                 int oheight, size_params_type *rsz) {
@@ -4548,24 +3964,17 @@ static int validate_size_scales(RESIZE_MODE resize_mode,
     } while (!dimensions_are_ok(owidth, oheight, rsz) &&
              (resize_denom > SCALE_NUMERATOR ||
               rsz->superres_denom > SCALE_NUMERATOR));
-  } else {  // We are allowed to alter neither resize scale nor superres scale.
+  } else {  // We are allowed to alter neither resize scale nor superres
+            // scale.
     return 0;
   }
   return dimensions_are_ok(owidth, oheight, rsz);
 }
-#undef DIVIDE_AND_ROUND
-#endif  // CONFIG_FRAME_SUPERRES
 
 // Calculates resize and superres params for next frame
 size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  size_params_type rsz = {
-    oxcf->width,
-    oxcf->height,
-#if CONFIG_FRAME_SUPERRES
-    SCALE_NUMERATOR
-#endif  // CONFIG_FRAME_SUPERRES
-  };
+  size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR };
   int resize_denom;
   if (oxcf->pass == 1) return rsz;
   if (cpi->resize_pending_width && cpi->resize_pending_height) {
@@ -4579,12 +3988,10 @@ size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) {
     av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height,
                               resize_denom);
   }
-#if CONFIG_FRAME_SUPERRES
   rsz.superres_denom = calculate_next_superres_scale(cpi);
   if (!validate_size_scales(oxcf->resize_mode, oxcf->superres_mode, oxcf->width,
                             oxcf->height, &rsz))
     assert(0 && "Invalid scale parameters");
-#endif  // CONFIG_FRAME_SUPERRES
   return rsz;
 }
 
@@ -4592,14 +3999,12 @@ static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) {
   int encode_width = rsz->resize_width;
   int encode_height = rsz->resize_height;
 
-#if CONFIG_FRAME_SUPERRES
   AV1_COMMON *cm = &cpi->common;
   cm->superres_upscaled_width = encode_width;
   cm->superres_upscaled_height = encode_height;
   cm->superres_scale_denominator = rsz->superres_denom;
   av1_calculate_scaled_superres_size(&encode_width, &encode_height,
                                      rsz->superres_denom);
-#endif  // CONFIG_FRAME_SUPERRES
   set_frame_size(cpi, encode_width, encode_height);
 }
 
@@ -4608,67 +4013,63 @@ static void setup_frame_size(AV1_COMP *cpi) {
   setup_frame_size_from_params(cpi, &rsz);
 }
 
-#if CONFIG_FRAME_SUPERRES
 static void superres_post_encode(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  if (!av1_superres_scaled(cm)) return;
 
-  if (av1_superres_unscaled(cm)) return;
+  assert(cpi->oxcf.enable_superres);
+  assert(!is_lossless_requested(&cpi->oxcf));
+  assert(!cm->all_lossless);
 
   av1_superres_upscale(cm, NULL);
 
   // If regular resizing is occurring the source will need to be downscaled to
   // match the upscaled superres resolution. Otherwise the original source is
   // used.
-  if (av1_resize_unscaled(cm)) {
+  if (!av1_resize_scaled(cm)) {
     cpi->source = cpi->unscaled_source;
     if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source;
   } else {
     assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width);
     assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height);
-    // Do downscale. cm->(width|height) has been updated by av1_superres_upscale
+    // Do downscale. cm->(width|height) has been updated by
+    // av1_superres_upscale
     if (aom_realloc_frame_buffer(
             &cpi->scaled_source, cm->superres_upscaled_width,
             cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-            cm->use_highbitdepth,
-#endif  // CONFIG_HIGHBITDEPTH
-            AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+            cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+            NULL, NULL, NULL))
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to reallocate scaled source buffer for superres");
     assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width);
     assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height);
-#if CONFIG_HIGHBITDEPTH
     av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source,
-                                (int)cm->bit_depth);
-#else
-    av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source);
-#endif  // CONFIG_HIGHBITDEPTH
+                                (int)cm->bit_depth, num_planes);
     cpi->source = &cpi->scaled_source;
   }
 }
-#endif  // CONFIG_FRAME_SUPERRES
 
 static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
-  struct loopfilter *lf = &cm->lf;
-  int no_loopfilter = 0;
 
-  if (is_lossless_requested(&cpi->oxcf)) no_loopfilter = 1;
+  assert(IMPLIES(is_lossless_requested(&cpi->oxcf),
+                 cm->coded_lossless && cm->all_lossless));
 
-#if CONFIG_EXT_TILE
-  // 0 loopfilter level is only necessary if individual tile
-  // decoding is required.
-  if (cm->single_tile_decoding) no_loopfilter = 1;
-#endif  // CONFIG_EXT_TILE
+  const int no_loopfilter = cm->coded_lossless || cm->large_scale_tile;
+  const int no_cdef =
+      !cm->seq_params.enable_cdef || cm->coded_lossless || cm->large_scale_tile;
+  const int no_restoration = !cm->seq_params.enable_restoration ||
+                             cm->all_lossless || cm->large_scale_tile;
+
+  struct loopfilter *lf = &cm->lf;
 
   if (no_loopfilter) {
-#if CONFIG_LOOPFILTER_LEVEL
     lf->filter_level[0] = 0;
     lf->filter_level[1] = 0;
-#else
-    lf->filter_level = 0;
-#endif
   } else {
     struct aom_usec_timer timer;
 
@@ -4682,79 +4083,60 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
     cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer);
   }
 
-#if !CONFIG_LPF_SB
-#if CONFIG_LOOPFILTER_LEVEL
-  if (lf->filter_level[0] || lf->filter_level[1])
-#else
-  if (lf->filter_level > 0)
-#endif
-#endif  // CONFIG_LPF_SB
-  {
-#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
-#if CONFIG_LPF_SB
-    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0, 0,
-                          0);
-#else
-#if CONFIG_LOOPFILTER_LEVEL
-    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level[0],
-                          lf->filter_level[1], 0, 0);
-    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_u,
-                          lf->filter_level_u, 1, 0);
-    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_v,
-                          lf->filter_level_v, 2, 0);
-
-#else
-    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif  // CONFIG_LPF_SB
+  if (lf->filter_level[0] || lf->filter_level[1]) {
+#if LOOP_FILTER_BITMASK
+    av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0);
 #else
     if (cpi->num_workers > 1)
-      av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
-                               lf->filter_level, 0, 0, cpi->workers,
-                               cpi->num_workers, &cpi->lf_row_sync);
+      av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0,
+                               cpi->workers, cpi->num_workers,
+                               &cpi->lf_row_sync);
     else
-      av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+      av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0);
 #endif
   }
 
-#if CONFIG_STRIPED_LOOP_RESTORATION
-  av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm);
-#endif
+  if (!no_restoration)
+    av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 0);
 
-#if CONFIG_CDEF
-  if (is_lossless_requested(&cpi->oxcf)) {
+  if (no_cdef) {
     cm->cdef_bits = 0;
     cm->cdef_strengths[0] = 0;
     cm->nb_cdef_strengths = 1;
+    cm->cdef_uv_strengths[0] = 0;
   } else {
     // Find CDEF parameters
     av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd,
-                    cpi->oxcf.speed > 0);
+                    cpi->sf.fast_cdef_search);
 
     // Apply the filter
     av1_cdef_frame(cm->frame_to_show, cm, xd);
   }
-#endif
 
-#if CONFIG_FRAME_SUPERRES
   superres_post_encode(cpi);
-#endif  // CONFIG_FRAME_SUPERRES
 
-#if CONFIG_LOOP_RESTORATION
-  aom_extend_frame_borders(cm->frame_to_show);
-  av1_pick_filter_restoration(cpi->source, cpi, cpi->sf.lpf_pick);
-  if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
-      cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
-      cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
-    av1_loop_restoration_frame(cm->frame_to_show, cm, cm->rst_info, 7, 0, NULL);
+  if (no_restoration) {
+    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+  } else {
+    av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 1);
+    av1_pick_filter_restoration(cpi->source, cpi);
+    if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+      if (cpi->num_workers > 1)
+        av1_loop_restoration_filter_frame_mt(cm->frame_to_show, cm, 0,
+                                             cpi->workers, cpi->num_workers,
+                                             &cpi->lr_row_sync, &cpi->lr_ctxt);
+      else
+        av1_loop_restoration_filter_frame(cm->frame_to_show, cm, 0,
+                                          &cpi->lr_ctxt);
+    }
   }
-#endif  // CONFIG_LOOP_RESTORATION
-  // TODO(debargha): Fix mv search range on encoder side
-  // aom_extend_frame_inner_borders(cm->frame_to_show);
-  aom_extend_frame_borders(cm->frame_to_show);
 }
 
-static void encode_without_recode_loop(AV1_COMP *cpi) {
+static int encode_without_recode_loop(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   int q = 0, bottom_index = 0, top_index = 0;  // Dummy variables.
 
@@ -4774,10 +4156,7 @@ static void encode_without_recode_loop(AV1_COMP *cpi) {
   if (cpi->unscaled_last_source != NULL)
     cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
                                              &cpi->scaled_last_source);
-#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION
   cpi->source->buf_8bit_valid = 0;
-#endif
-
   if (frame_is_intra_only(cm) == 0) {
     scale_references(cpi);
   }
@@ -4796,6 +4175,16 @@ static void encode_without_recode_loop(AV1_COMP *cpi) {
     av1_cyclic_refresh_setup(cpi);
   }
   apply_active_map(cpi);
+  if (cm->seg.enabled) {
+    if (!cm->seg.update_data && cm->prev_frame) {
+      segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+    } else {
+      calculate_segdata(&cm->seg);
+    }
+  } else {
+    memset(&cm->seg, 0, sizeof(cm->seg));
+  }
+  segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
 
   // transform / motion compensation build reconstruction frame
   av1_encode_frame(cpi);
@@ -4810,29 +4199,25 @@ static void encode_without_recode_loop(AV1_COMP *cpi) {
   // seen in the last encoder iteration.
   // update_base_skip_probs(cpi);
   aom_clear_system_state();
+  return AOM_CODEC_OK;
 }
 
-static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
-                                    uint8_t *dest) {
+static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int bottom_index, top_index;
   int loop_count = 0;
   int loop_at_this_size = 0;
   int loop = 0;
-#if !CONFIG_XIPHRC
   int overshoot_seen = 0;
   int undershoot_seen = 0;
-#endif
   int frame_over_shoot_limit;
   int frame_under_shoot_limit;
   int q = 0, q_low = 0, q_high = 0;
 
   set_size_independent_vars(cpi);
 
-#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION
   cpi->source->buf_8bit_valid = 0;
-#endif
 
   aom_clear_system_state();
   setup_frame_size(cpi);
@@ -4845,32 +4230,27 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
       // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
       set_mv_search_params(cpi);
 
-#if !CONFIG_XIPHRC
       // Reset the loop state for new frame size.
       overshoot_seen = 0;
       undershoot_seen = 0;
-#endif
 
       q_low = bottom_index;
       q_high = top_index;
 
       loop_at_this_size = 0;
-    }
 
-    // Decide frame size bounds first time through.
-    if (loop_count == 0) {
+      // Decide frame size bounds first time through.
       av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
                                        &frame_under_shoot_limit,
                                        &frame_over_shoot_limit);
     }
 
-#if CONFIG_GLOBAL_MOTION
-    // if frame was scaled calculate global_motion_search again if already done
+    // if frame was scaled calculate global_motion_search again if already
+    // done
     if (loop_count > 0 && cpi->source && cpi->global_motion_search_done)
       if (cpi->source->y_crop_width != cm->width ||
           cpi->source->y_crop_height != cm->height)
         cpi->global_motion_search_done = 0;
-#endif  // CONFIG_GLOBAL_MOTION
     cpi->source =
         av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
     if (cpi->unscaled_last_source != NULL)
@@ -4884,29 +4264,18 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
       scale_references(cpi);
     }
     av1_set_quantizer(cm, q);
+    // printf("Frame %d/%d: q = %d, frame_type = %d\n", cm->current_video_frame,
+    //        cm->show_frame, q, cm->frame_type);
 
     if (loop_count == 0) setup_frame(cpi);
 
-#if CONFIG_Q_ADAPT_PROBS
     // Base q-index may have changed, so we need to assign proper default coef
     // probs before every iteration.
-    if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
-      int i;
+    if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
+        cm->frame_refs[cm->primary_ref_frame].idx < 0) {
       av1_default_coef_probs(cm);
-      if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
-          cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
-        for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
-      } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-        if (cm->frame_refs[0].idx >= 0) {
-          cm->frame_contexts[cm->frame_refs[0].idx] = *cm->fc;
-        }
-#else
-        cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
-#endif
-      }
+      av1_setup_frame_contexts(cm);
     }
-#endif  // CONFIG_Q_ADAPT_PROBS
 
     // Variance adaptive and in frame q adjustment experiments are mutually
     // exclusive.
@@ -4915,6 +4284,16 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
     } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
       av1_setup_in_frame_q_adj(cpi);
     }
+    if (cm->seg.enabled) {
+      if (!cm->seg.update_data && cm->prev_frame) {
+        segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+      } else {
+        calculate_segdata(&cm->seg);
+      }
+    } else {
+      memset(&cm->seg, 0, sizeof(cm->seg));
+    }
+    segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
 
     // transform / motion compensation build reconstruction frame
     save_coding_context(cpi);
@@ -4931,7 +4310,9 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
     // to recode.
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
       restore_coding_context(cpi);
-      av1_pack_bitstream(cpi, dest, size);
+
+      if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+        return AOM_CODEC_ERROR;
 
       rc->projected_frame_size = (int)(*size) << 3;
       restore_coding_context(cpi);
@@ -4950,16 +4331,11 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
         int64_t high_err_target = cpi->ambient_err;
         int64_t low_err_target = cpi->ambient_err >> 1;
 
-#if CONFIG_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
           kf_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
         } else {
           kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
         }
-#else
-        kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
-#endif  // CONFIG_HIGHBITDEPTH
-
         // Prevent possible divide by zero error below for perfect KF
         kf_err += !kf_err;
 
@@ -4996,7 +4372,6 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
         // Is the projected frame size out of range and are we allowed
         // to attempt to recode.
         int last_q = q;
-#if !CONFIG_XIPHRC
         int retries = 0;
 
         // Frame size out of permitted range:
@@ -5062,7 +4437,6 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
 
           undershoot_seen = 1;
         }
-#endif
 
         // Clamp Q to upper and lower limits:
         q = clamp(q, q_low, q_high);
@@ -5078,11 +4452,9 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
         rc->projected_frame_size < rc->max_frame_bandwidth)
       loop = 0;
 
-#if CONFIG_GLOBAL_MOTION
     if (recode_loop_test_global_motion(cpi)) {
       loop = 1;
     }
-#endif  // CONFIG_GLOBAL_MOTION
 
     if (loop) {
       ++loop_count;
@@ -5093,86 +4465,90 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
 #endif
     }
   } while (loop);
+
+  return AOM_CODEC_OK;
 }
 
 static int get_ref_frame_flags(const AV1_COMP *cpi) {
   const int *const map = cpi->common.ref_frame_map;
 
-#if CONFIG_EXT_REFS
-  const int last2_is_last =
-      map[cpi->lst_fb_idxes[1]] == map[cpi->lst_fb_idxes[0]];
-  const int last3_is_last =
-      map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[0]];
-  const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[0]];
-#if CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS
-  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]];
-  const int last3_is_last2 =
-      map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]];
-  const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]];
-  const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]];
-#else   // !CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS
-  const int bwd_is_last = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[0]];
-  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]];
-
-  const int last3_is_last2 =
-      map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]];
-  const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]];
-  const int bwd_is_last2 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[1]];
-
-  const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]];
-  const int bwd_is_last3 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[2]];
-
-  const int bwd_is_gld = map[cpi->bwd_fb_idx] == map[cpi->gld_fb_idx];
-#endif  // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS
-
-  const int alt2_is_last = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[0]];
-  const int alt2_is_last2 = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[1]];
-  const int alt2_is_last3 = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[2]];
-  const int alt2_is_gld = map[cpi->alt2_fb_idx] == map[cpi->gld_fb_idx];
-  const int alt2_is_bwd = map[cpi->alt2_fb_idx] == map[cpi->bwd_fb_idx];
-
-  const int last2_is_alt = map[cpi->lst_fb_idxes[1]] == map[cpi->alt_fb_idx];
-  const int last3_is_alt = map[cpi->lst_fb_idxes[2]] == map[cpi->alt_fb_idx];
-  const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
-  const int bwd_is_alt = map[cpi->bwd_fb_idx] == map[cpi->alt_fb_idx];
-  const int alt2_is_alt = map[cpi->alt2_fb_idx] == map[cpi->alt_fb_idx];
-#else   // !CONFIG_EXT_REFS
-  const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
-  const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
-  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
-#endif  // CONFIG_EXT_REFS
-
-  int flags = AOM_REFFRAME_ALL;
-
-  if (gld_is_last || gld_is_alt) flags &= ~AOM_GOLD_FLAG;
+  // No.1 Priority: LAST_FRAME
+  const int last2_is_last = map[cpi->ref_fb_idx[1]] == map[cpi->ref_fb_idx[0]];
+  const int last3_is_last = map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[0]];
+  const int gld_is_last =
+      map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[0]];
+  const int bwd_is_last =
+      map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[0]];
+  const int alt2_is_last =
+      map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[0]];
+  const int alt_is_last =
+      map[cpi->ref_fb_idx[ALTREF_FRAME - 1]] == map[cpi->ref_fb_idx[0]];
+
+  // No.2 Priority: ALTREF_FRAME
+  const int last2_is_alt =
+      map[cpi->ref_fb_idx[1]] == map[cpi->ref_fb_idx[ALTREF_FRAME - 1]];
+  const int last3_is_alt =
+      map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[ALTREF_FRAME - 1]];
+  const int gld_is_alt = map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] ==
+                         map[cpi->ref_fb_idx[ALTREF_FRAME - 1]];
+  const int bwd_is_alt = map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] ==
+                         map[cpi->ref_fb_idx[ALTREF_FRAME - 1]];
+  const int alt2_is_alt = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] ==
+                          map[cpi->ref_fb_idx[ALTREF_FRAME - 1]];
+
+  // No.3 Priority: LAST2_FRAME
+  const int last3_is_last2 = map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[1]];
+  const int gld_is_last2 =
+      map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[1]];
+  const int bwd_is_last2 =
+      map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[1]];
+  const int alt2_is_last2 =
+      map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[1]];
+
+  // No.4 Priority: LAST3_FRAME
+  const int gld_is_last3 =
+      map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[2]];
+  const int bwd_is_last3 =
+      map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[2]];
+  const int alt2_is_last3 =
+      map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[2]];
+
+  // No.5 Priority: GOLDEN_FRAME
+  const int bwd_is_gld = map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] ==
+                         map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]];
+  const int alt2_is_gld = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] ==
+                          map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]];
+
+  // No.6 Priority: BWDREF_FRAME
+  const int alt2_is_bwd = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] ==
+                          map[cpi->ref_fb_idx[BWDREF_FRAME - 1]];
+
+  // No.7 Priority: ALTREF2_FRAME
+
+  // After av1_apply_encoding_flags() is called, cpi->ref_frame_flags might be
+  // adjusted according to external encoder flags.
+  int flags = cpi->ext_ref_frame_flags;
 
   if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG;
 
   if (alt_is_last) flags &= ~AOM_ALT_FLAG;
 
-#if CONFIG_EXT_REFS
   if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG;
 
-  if (last3_is_last || last3_is_last2 || last3_is_alt) flags &= ~AOM_LAST3_FLAG;
+  if (last3_is_last || last3_is_alt || last3_is_last2) flags &= ~AOM_LAST3_FLAG;
 
-  if (gld_is_last2 || gld_is_last3) flags &= ~AOM_GOLD_FLAG;
+  if (gld_is_last || gld_is_alt || gld_is_last2 || gld_is_last3)
+    flags &= ~AOM_GOLD_FLAG;
 
-#if CONFIG_ONE_SIDED_COMPOUND && \
-    !CONFIG_EXT_COMP_REFS  // Changes LL & HL bitstream
-  /* Allow biprediction between two identical frames (e.g. bwd_is_last = 1) */
-  if (bwd_is_alt && (flags & AOM_BWD_FLAG)) flags &= ~AOM_BWD_FLAG;
-#else   // !CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS
-  if ((bwd_is_last || bwd_is_last2 || bwd_is_last3 || bwd_is_gld ||
-       bwd_is_alt) &&
+  if ((bwd_is_last || bwd_is_alt || bwd_is_last2 || bwd_is_last3 ||
+       bwd_is_gld) &&
       (flags & AOM_BWD_FLAG))
     flags &= ~AOM_BWD_FLAG;
-#endif  // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS
 
-  if ((alt2_is_last || alt2_is_last2 || alt2_is_last3 || alt2_is_gld ||
-       alt2_is_bwd || alt2_is_alt) &&
+  if ((alt2_is_last || alt2_is_alt || alt2_is_last2 || alt2_is_last3 ||
+       alt2_is_gld || alt2_is_bwd) &&
       (flags & AOM_ALT2_FLAG))
     flags &= ~AOM_ALT2_FLAG;
-#endif  // CONFIG_EXT_REFS
 
   return flags;
 }
@@ -5182,6 +4558,9 @@ static void set_ext_overrides(AV1_COMP *cpi) {
   // av1_update_reference() and av1_update_entropy() calls
   // Note: The overrides are valid only for the next frame passed
   // to encode_frame_to_data_rate() function
+  if (cpi->ext_use_s_frame) cpi->common.frame_type = S_FRAME;
+  cpi->common.force_primary_ref_none = cpi->ext_use_primary_ref_none;
+
   if (cpi->ext_refresh_frame_context_pending) {
     cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context;
     cpi->ext_refresh_frame_context_pending = 0;
@@ -5190,54 +4569,23 @@ static void set_ext_overrides(AV1_COMP *cpi) {
     cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
     cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
     cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+    cpi->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame;
+    cpi->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame;
     cpi->ext_refresh_frame_flags_pending = 0;
   }
+  cpi->common.allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs;
+  cpi->common.error_resilient_mode = cpi->ext_use_error_resilient;
 }
 
-#if !CONFIG_FRAME_SIGN_BIAS
-static void set_arf_sign_bias(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  int arf_sign_bias;
-#if CONFIG_EXT_REFS
-  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-  // The arf_sign_bias will be one for internal ARFs'
-  arf_sign_bias = cpi->rc.source_alt_ref_active &&
-                  (!cpi->refresh_alt_ref_frame ||
-                   gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE);
-#else   // !CONFIG_EXT_REFS
-  if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    arf_sign_bias = cpi->rc.source_alt_ref_active &&
-                    (!cpi->refresh_alt_ref_frame ||
-                     (gf_group->rf_level[gf_group->index] == GF_ARF_LOW));
-  } else {
-    arf_sign_bias =
-        (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame);
-  }
-#endif  // CONFIG_EXT_REFS
-
-  cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
-#if CONFIG_EXT_REFS
-  cm->ref_frame_sign_bias[BWDREF_FRAME] = cm->ref_frame_sign_bias[ALTREF_FRAME];
-  cm->ref_frame_sign_bias[ALTREF2_FRAME] =
-      cm->ref_frame_sign_bias[ALTREF_FRAME];
-#endif  // CONFIG_EXT_REFS
-}
-#endif  // !CONFIG_FRAME_SIGN_BIAS
-
 static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
   InterpFilter ifilter;
-  int ref_total[TOTAL_REFS_PER_FRAME] = { 0 };
+  int ref_total[REF_FRAMES] = { 0 };
   MV_REFERENCE_FRAME ref;
   int mask = 0;
   int arf_idx = ALTREF_FRAME;
 
-#if CONFIG_EXT_REFS
   if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
       cpi->refresh_alt2_ref_frame)
-#else   // !CONFIG_EXT_REFS
-  if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
-#endif  // CONFIG_EXT_REFS
     return mask;
 
   for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
@@ -5247,25 +4595,21 @@ static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
   for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) {
     if ((ref_total[LAST_FRAME] &&
          cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
-#if CONFIG_EXT_REFS
         (ref_total[LAST2_FRAME] == 0 ||
          cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50 <
              ref_total[LAST2_FRAME]) &&
         (ref_total[LAST3_FRAME] == 0 ||
          cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50 <
              ref_total[LAST3_FRAME]) &&
-#endif  // CONFIG_EXT_REFS
         (ref_total[GOLDEN_FRAME] == 0 ||
          cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 <
              ref_total[GOLDEN_FRAME]) &&
-#if CONFIG_EXT_REFS
         (ref_total[BWDREF_FRAME] == 0 ||
          cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50 <
              ref_total[BWDREF_FRAME]) &&
         (ref_total[ALTREF2_FRAME] == 0 ||
          cpi->interp_filter_selected[ALTREF2_FRAME][ifilter] * 50 <
              ref_total[ALTREF2_FRAME]) &&
-#endif  // CONFIG_EXT_REFS
         (ref_total[ALTREF_FRAME] == 0 ||
          cpi->interp_filter_selected[arf_idx][ifilter] * 50 <
              ref_total[ALTREF_FRAME]))
@@ -5281,16 +4625,50 @@ static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
 static void dump_filtered_recon_frames(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show;
-  int h;
-  char file_name[256] = "/tmp/enc_filtered_recon.yuv";
-  FILE *f_recon = NULL;
 
-  if (recon_buf == NULL || !cm->show_frame) {
-    printf("Frame %d is not ready or no show to dump.\n",
+  if (recon_buf == NULL) {
+    printf("Frame %d is not ready.\n", cm->current_video_frame);
+    return;
+  }
+
+  static const int flag_list[REF_FRAMES] = { 0,
+                                             AOM_LAST_FLAG,
+                                             AOM_LAST2_FLAG,
+                                             AOM_LAST3_FLAG,
+                                             AOM_GOLD_FLAG,
+                                             AOM_BWD_FLAG,
+                                             AOM_ALT2_FLAG,
+                                             AOM_ALT_FLAG };
+  printf(
+      "\n***Frame=%d (frame_offset=%d, show_frame=%d, "
+      "show_existing_frame=%d) "
+      "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[",
+      cm->current_video_frame, cm->frame_offset, cm->show_frame,
+      cm->show_existing_frame);
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
+    const int ref_offset =
+        (buf_idx >= 0)
+            ? (int)cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset
+            : -1;
+    printf(
+        " %d(%c-%d-%4.2f)", ref_offset,
+        (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N',
+        (buf_idx >= 0) ? (int)cpi->frame_rf_level[buf_idx] : -1,
+        (buf_idx >= 0) ? rate_factor_deltas[cpi->frame_rf_level[buf_idx]] : -1);
+  }
+  printf(" ]\n");
+
+  if (!cm->show_frame) {
+    printf("Frame %d is a no show frame, so no image dump.\n",
            cm->current_video_frame);
     return;
   }
 
+  int h;
+  char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+  FILE *f_recon = NULL;
+
   if (cm->current_video_frame == 0) {
     if ((f_recon = fopen(file_name, "wb")) == NULL) {
       printf("Unable to open file %s to write.\n", file_name);
@@ -5303,13 +4681,14 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
     }
   }
   printf(
-      "\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, "
-      "source_alt_ref_active=%d, refresh_alt_ref_frame=%d, rf_level=%d, "
-      "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n",
+      "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
+      "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
+      "refresh_alt_ref_frame=%d, rf_level=%d, "
+      "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
       cm->current_video_frame, cpi->twopass.gf_group.index,
       cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
-      cm->show_existing_frame, cpi->rc.source_alt_ref_active,
-      cpi->refresh_alt_ref_frame,
+      cm->frame_offset, cm->show_frame, cm->show_existing_frame,
+      cpi->rc.source_alt_ref_active, cpi->refresh_alt_ref_frame,
       cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index],
       recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
 #if 0
@@ -5346,49 +4725,44 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
 }
 #endif  // DUMP_RECON_FRAMES
 
-static void make_update_tile_list_enc(AV1_COMP *cpi, const int tile_rows,
-                                      const int tile_cols,
-                                      FRAME_CONTEXT *ec_ctxs[]) {
-  int i;
-  for (i = 0; i < tile_rows * tile_cols; ++i)
-    ec_ctxs[i] = &cpi->tile_data[i].tctx;
-}
-
-static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
-                                      uint8_t *dest, int skip_adapt,
-                                      unsigned int *frame_flags) {
+static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+                                     int skip_adapt,
+                                     unsigned int *frame_flags) {
   AV1_COMMON *const cm = &cpi->common;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
-  FRAME_CONTEXT **tile_ctxs = aom_malloc(cm->tile_rows * cm->tile_cols *
-                                         sizeof(&cpi->tile_data[0].tctx));
-  aom_cdf_prob **cdf_ptrs =
-      aom_malloc(cm->tile_rows * cm->tile_cols *
-                 sizeof(&cpi->tile_data[0].tctx.partition_cdf[0][0]));
-#if CONFIG_XIPHRC
-  int frame_type;
-  int drop_this_frame = 0;
-#endif  // CONFIG_XIPHRC
+
   set_ext_overrides(cpi);
   aom_clear_system_state();
 
-#if !CONFIG_FRAME_SIGN_BIAS
-  // Set the arf sign bias for this frame.
-  set_arf_sign_bias(cpi);
-#endif  // !CONFIG_FRAME_SIGN_BIAS
-
-#if CONFIG_TEMPMV_SIGNALING
   // frame type has been decided outside of this function call
-  cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only;
-  cm->use_prev_frame_mvs =
-      !cpi->oxcf.disable_tempmv && !cm->cur_frame->intra_only;
-#endif
+  cm->cur_frame->intra_only = frame_is_intra_only(cm);
+  cm->cur_frame->frame_type = cm->frame_type;
+
+  // S_FRAMEs are always error resilient
+  cm->error_resilient_mode |= frame_is_sframe(cm);
+
+  cm->large_scale_tile = cpi->oxcf.large_scale_tile;
+  cm->single_tile_decoding = cpi->oxcf.single_tile_decoding;
+  if (cm->large_scale_tile) cm->seq_params.frame_id_numbers_present_flag = 0;
+
+  cm->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
+  // cm->allow_ref_frame_mvs needs to be written into the frame header while
+  // cm->large_scale_tile is 1, therefore, "cm->large_scale_tile=1" case is
+  // separated from frame_might_allow_ref_frame_mvs().
+  cm->allow_ref_frame_mvs &= !cm->large_scale_tile;
+
+  cm->allow_warped_motion =
+      cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm);
+
+  // Reset the frame packet stamp index.
+  if (cm->frame_type == KEY_FRAME) cm->current_video_frame = 0;
 
-#if CONFIG_EXT_REFS
   // NOTE:
   // (1) Move the setup of the ref_frame_flags upfront as it would be
   //     determined by the current frame properties;
-  // (2) The setup of the ref_frame_flags applies to both show_existing_frame's
+  // (2) The setup of the ref_frame_flags applies to both
+  // show_existing_frame's
   //     and the other cases.
   if (cm->current_video_frame > 0)
     cpi->ref_frame_flags = get_ref_frame_flags(cpi);
@@ -5415,12 +4789,20 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     cpi->rc.is_bipred_frame = 0;
 
     restore_coding_context(cpi);
+
     // Build the bitstream
-    av1_pack_bitstream(cpi, dest, size);
+    if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+      return AOM_CODEC_ERROR;
+
+    cpi->seq_params_locked = 1;
 
     // Set up frame to show to get ready for stats collection.
     cm->frame_to_show = get_frame_new_buffer(cm);
 
+    // Update current frame offset.
+    cm->frame_offset =
+        cm->buffer_pool->frame_bufs[cm->new_fb_idx].cur_frame_offset;
+
 #if DUMP_RECON_FRAMES == 1
     // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
     dump_filtered_recon_frames(cpi);
@@ -5432,9 +4814,11 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     //     update has been done previously when handling the LAST_BIPRED_FRAME
     //     right before BWDREF_FRAME (in the display order);
     // (2) For INTNL_OVERLAY as the show_existing_frame, the reference frame
-    //     update will be done when the following is called, which will exchange
+    //     update will be done when the following is called, which will
+    //     exchange
     //     the virtual indexes between LAST_FRAME and ALTREF2_FRAME, so that
-    //     LAST3 will get retired, LAST2 becomes LAST3, LAST becomes LAST2, and
+    //     LAST3 will get retired, LAST2 becomes LAST3, LAST becomes LAST2,
+    //     and
     //     ALTREF2_FRAME will serve as the new LAST_FRAME.
     update_reference_frames(cpi);
 
@@ -5452,23 +4836,13 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     // to do post-encoding update accordingly.
     if (cpi->rc.is_src_frame_alt_ref) {
       av1_set_target_rate(cpi, cm->width, cm->height);
-#if CONFIG_XIPHRC
-      frame_type = cm->frame_type == INTER_FRAME ? OD_P_FRAME : OD_I_FRAME;
-      drop_this_frame = od_enc_rc_update_state(
-          &cpi->od_rc, *size << 3, cpi->refresh_golden_frame,
-          cpi->refresh_alt_ref_frame, frame_type, cpi->droppable);
-#else
       av1_rc_postencode_update(cpi, *size);
-#endif
     }
 
     ++cm->current_video_frame;
 
-    aom_free(tile_ctxs);
-    aom_free(cdf_ptrs);
-    return;
+    return AOM_CODEC_OK;
   }
-#endif  // CONFIG_EXT_REFS
 
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
@@ -5477,7 +4851,7 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi);
 
   // Set various flags etc to special state if it is a key frame.
-  if (frame_is_intra_only(cm)) {
+  if (frame_is_intra_only(cm) || frame_is_sframe(cm)) {
     // Reset the loop filter deltas and segmentation map.
     av1_reset_segment_features(cm);
 
@@ -5489,19 +4863,6 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
 
     // The alternate reference frame cannot be active for a key frame.
     cpi->rc.source_alt_ref_active = 0;
-
-    cm->error_resilient_mode = oxcf->error_resilient_mode;
-
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-    // By default, encoder assumes decoder can use prev_mi.
-    if (cm->error_resilient_mode) {
-      cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
-      cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD;
-    } else if (cm->intra_only) {
-      // Only reset the current context.
-      cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT;
-    }
-#endif
   }
   if (cpi->oxcf.mtu == 0) {
     cm->num_tg = cpi->oxcf.num_tile_groups;
@@ -5511,33 +4872,15 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     cm->num_tg = DEFAULT_MAX_NUM_TG;
   }
 
-#if CONFIG_EXT_TILE
-  cm->large_scale_tile = cpi->oxcf.large_scale_tile;
-  cm->single_tile_decoding = cpi->oxcf.single_tile_decoding;
-#endif  // CONFIG_EXT_TILE
-
-#if CONFIG_XIPHRC
-  if (drop_this_frame) {
-    av1_rc_postencode_update_drop_frame(cpi);
-    ++cm->current_video_frame;
-    aom_free(tile_ctxs);
-    aom_free(cdf_ptrs);
-    return;
-  }
-#else
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame.
   if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR &&
       cm->frame_type != KEY_FRAME) {
     if (av1_rc_drop_frame(cpi)) {
       av1_rc_postencode_update_drop_frame(cpi);
-      ++cm->current_video_frame;
-      aom_free(tile_ctxs);
-      aom_free(cdf_ptrs);
-      return;
+      return AOM_CODEC_OK;
     }
   }
-#endif
 
   aom_clear_system_state();
 
@@ -5546,46 +4889,59 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
          MAX_MODES * sizeof(*cpi->mode_chosen_counts));
 #endif
 
-#if CONFIG_REFERENCE_BUFFER
   if (cm->seq_params.frame_id_numbers_present_flag) {
     /* Non-normative definition of current_frame_id ("frame counter" with
-    * wraparound) */
-    const int frame_id_length = FRAME_ID_LENGTH_MINUS7 + 7;
+     * wraparound) */
+    const int frame_id_length = FRAME_ID_LENGTH;
     if (cm->current_frame_id == -1) {
       int lsb, msb;
-/* quasi-random initialization of current_frame_id for a key frame */
-#if CONFIG_HIGHBITDEPTH
+      /* quasi-random initialization of current_frame_id for a key frame */
       if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
         lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff;
         msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff;
       } else {
-#endif
         lsb = cpi->source->y_buffer[0] & 0xff;
         msb = cpi->source->y_buffer[1] & 0xff;
-#if CONFIG_HIGHBITDEPTH
       }
-#endif
       cm->current_frame_id = ((msb << 8) + lsb) % (1 << frame_id_length);
+
+      // S_frame is meant for stitching different streams of different
+      // resolutions together, so current_frame_id must be the
+      // same across different streams of the same content current_frame_id
+      // should be the same and not random. 0x37 is a chosen number as start
+      // point
+      if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37;
     } else {
       cm->current_frame_id =
           (cm->current_frame_id + 1 + (1 << frame_id_length)) %
           (1 << frame_id_length);
     }
   }
-#endif  // CONFIG_REFERENCE_BUFFER
 
-#if CONFIG_EXT_DELTA_Q
-  cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
-  cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF;
-#if CONFIG_LOOPFILTER_LEVEL
-  cm->delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif
+  switch (cpi->oxcf.cdf_update_mode) {
+    case 0:  // No CDF update for any frames(4~6% compression loss).
+      cm->disable_cdf_update = 1;
+      break;
+    case 1:  // Enable CDF update for all frames.
+      cm->disable_cdf_update = 0;
+      break;
+    case 2:
+      // Strategically determine at which frames to do CDF update.
+      // Currently only enable CDF update for all-intra and no-show frames(1.5%
+      // compression loss).
+      // TODO(huisu@google.com): design schemes for various trade-offs between
+      // compression quality and decoding speed.
+      cm->disable_cdf_update =
+          (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
+      break;
+  }
+  cm->timing_info_present &= !cm->seq_params.reduced_still_picture_hdr;
 
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
-    encode_without_recode_loop(cpi);
+    if (encode_without_recode_loop(cpi) != AOM_CODEC_OK) return AOM_CODEC_ERROR;
   } else {
-    encode_with_recode_loop(cpi, size, dest);
+    if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK)
+      return AOM_CODEC_ERROR;
   }
 
   cm->last_tile_cols = cm->tile_cols;
@@ -5601,72 +4957,86 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
   // fixed interval. Note the reconstruction error if it is the frame before
   // the force key frame
   if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
-#if CONFIG_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
       cpi->ambient_err =
           aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
     } else {
       cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
     }
-#else
-    cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
-#endif  // CONFIG_HIGHBITDEPTH
   }
 
-  // If the encoder forced a KEY_FRAME decision
-  if (cm->frame_type == KEY_FRAME) {
+  // If the encoder forced a KEY_FRAME decision or if frame is an S_FRAME
+  if (cm->frame_type == KEY_FRAME || frame_is_sframe(cm)) {
     cpi->refresh_last_frame = 1;
   }
 
   cm->frame_to_show = get_frame_new_buffer(cm);
-  cm->frame_to_show->color_space = cm->color_space;
-#if CONFIG_COLORSPACE_HEADERS
-  cm->frame_to_show->transfer_function = cm->transfer_function;
+  cm->frame_to_show->color_primaries = cm->color_primaries;
+  cm->frame_to_show->transfer_characteristics = cm->transfer_characteristics;
+  cm->frame_to_show->matrix_coefficients = cm->matrix_coefficients;
+  cm->frame_to_show->monochrome = cm->seq_params.monochrome;
   cm->frame_to_show->chroma_sample_position = cm->chroma_sample_position;
-#endif
   cm->frame_to_show->color_range = cm->color_range;
   cm->frame_to_show->render_width = cm->render_width;
   cm->frame_to_show->render_height = cm->render_height;
 
-#if CONFIG_EXT_REFS
-// TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
-// off.
-#endif  // CONFIG_EXT_REFS
+  // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
+  // off.
 
   // Pick the loop filter level for the frame.
-  loopfilter_frame(cpi, cm);
+  if (!cm->allow_intrabc) {
+    loopfilter_frame(cpi, cm);
+  } else {
+    cm->lf.filter_level[0] = 0;
+    cm->lf.filter_level[1] = 0;
+    cm->cdef_bits = 0;
+    cm->cdef_strengths[0] = 0;
+    cm->nb_cdef_strengths = 1;
+    cm->cdef_uv_strengths[0] = 0;
+    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+  }
+
+  // TODO(debargha): Fix mv search range on encoder side
+  // aom_extend_frame_inner_borders(cm->frame_to_show, av1_num_planes(cm));
+  aom_extend_frame_borders(cm->frame_to_show, av1_num_planes(cm));
 
 #ifdef OUTPUT_YUV_REC
   aom_write_one_yuv_frame(cm, cm->frame_to_show);
 #endif
 
   // Build the bitstream
-  av1_pack_bitstream(cpi, dest, size);
+  if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+    return AOM_CODEC_ERROR;
 
-  if (skip_adapt) {
-    aom_free(tile_ctxs);
-    aom_free(cdf_ptrs);
-    return;
-  }
+  cpi->seq_params_locked = 1;
+
+  if (skip_adapt) return AOM_CODEC_OK;
 
-#if CONFIG_REFERENCE_BUFFER
   if (cm->seq_params.frame_id_numbers_present_flag) {
     int i;
-    /* Update reference frame id values based on the value of refresh_mask */
+    // Update reference frame id values based on the value of refresh_frame_mask
     for (i = 0; i < REF_FRAMES; i++) {
-      if ((cm->refresh_mask >> i) & 1) {
+      if ((cpi->refresh_frame_mask >> i) & 1) {
         cm->ref_frame_id[i] = cm->current_frame_id;
       }
     }
   }
-#endif  // CONFIG_REFERENCE_BUFFER
 
 #if DUMP_RECON_FRAMES == 1
   // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
-  if (cm->show_frame) dump_filtered_recon_frames(cpi);
+  dump_filtered_recon_frames(cpi);
 #endif  // DUMP_RECON_FRAMES
 
-  if (cm->seg.update_map) update_reference_segmentation_map(cpi);
+  if (cm->seg.enabled) {
+    if (cm->seg.update_map) {
+      update_reference_segmentation_map(cpi);
+    } else if (cm->last_frame_seg_map) {
+      memcpy(cm->current_frame_seg_map, cm->last_frame_seg_map,
+             cm->mi_cols * cm->mi_rows * sizeof(uint8_t));
+    }
+  }
 
   if (frame_is_intra_only(cm) == 0) {
     release_scaled_references(cpi);
@@ -5675,39 +5045,12 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
   update_reference_frames(cpi);
 
 #if CONFIG_ENTROPY_STATS
-  av1_accumulate_frame_counts(&aggregate_fc, &cm->counts);
-  assert(cm->frame_context_idx < FRAME_CONTEXTS);
-  av1_accumulate_frame_counts(&aggregate_fc_per_type[cm->frame_context_idx],
-                              &cm->counts);
+  av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts);
 #endif  // CONFIG_ENTROPY_STATS
-  if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
-#if CONFIG_LV_MAP
-    av1_adapt_coef_probs(cm);
-#endif  // CONFIG_LV_MAP
-    av1_adapt_intra_frame_probs(cm);
-    make_update_tile_list_enc(cpi, cm->tile_rows, cm->tile_cols, tile_ctxs);
-    av1_average_tile_coef_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs,
-                               cm->tile_rows * cm->tile_cols);
-    av1_average_tile_intra_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs,
-                                cm->tile_rows * cm->tile_cols);
-#if CONFIG_PVQ
-    av1_average_tile_pvq_cdfs(cpi->common.fc, tile_ctxs,
-                              cm->tile_rows * cm->tile_cols);
-#endif  // CONFIG_PVQ
-#if CONFIG_ADAPT_SCAN
-    av1_adapt_scan_order(cm);
-#endif  // CONFIG_ADAPT_SCAN
-  }
 
-  if (!frame_is_intra_only(cm)) {
-    if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
-      av1_adapt_inter_frame_probs(cm);
-      av1_adapt_mv_probs(cm, cm->allow_high_precision_mv);
-      av1_average_tile_inter_cdfs(&cpi->common, cpi->common.fc, tile_ctxs,
-                                  cdf_ptrs, cm->tile_rows * cm->tile_cols);
-      av1_average_tile_mv_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs,
-                               cm->tile_rows * cm->tile_cols);
-    }
+  if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+    *cm->fc = cpi->tile_data[cm->largest_tile_id].tctx;
+    av1_reset_cdf_symbol_counters(cm->fc);
   }
 
   if (cpi->refresh_golden_frame == 1)
@@ -5720,39 +5063,14 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
   else
     cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
 
-#if CONFIG_EXT_REFS
   if (cpi->refresh_bwd_ref_frame == 1)
     cpi->frame_flags |= FRAMEFLAGS_BWDREF;
   else
     cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
-#endif  // CONFIG_EXT_REFS
-
-#if !CONFIG_EXT_REFS
-  cpi->ref_frame_flags = get_ref_frame_flags(cpi);
-#endif  // !CONFIG_EXT_REFS
 
   cm->last_frame_type = cm->frame_type;
 
-#if CONFIG_XIPHRC
-  frame_type = cm->frame_type == KEY_FRAME ? OD_I_FRAME : OD_P_FRAME;
-
-  drop_this_frame =
-      od_enc_rc_update_state(&cpi->od_rc, *size << 3, cpi->refresh_golden_frame,
-                             cpi->refresh_alt_ref_frame, frame_type, 0);
-  if (drop_this_frame) {
-    av1_rc_postencode_update_drop_frame(cpi);
-    ++cm->current_video_frame;
-    aom_free(tile_ctxs);
-    aom_free(cdf_ptrs);
-    return;
-  }
-#else   // !CONFIG_XIPHRC
   av1_rc_postencode_update(cpi, *size);
-#endif  // CONFIG_XIPHRC
-
-#if 0
-  output_frame_level_debug_stats(cpi);
-#endif
 
   if (cm->frame_type == KEY_FRAME) {
     // Tell the caller that the frame was coded as a key frame
@@ -5768,90 +5086,79 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
   cm->lf.mode_ref_delta_update = 0;
 
   if (cm->show_frame) {
-#if CONFIG_EXT_REFS
-// TODO(zoeliu): We may only swamp mi and prev_mi for those frames that are
-// being used as reference.
-#endif  // CONFIG_EXT_REFS
+    // TODO(zoeliu): We may only swamp mi and prev_mi for those frames that
+    // are
+    // being used as reference.
     swap_mi_and_prev_mi(cm);
     // Don't increment frame counters if this was an altref buffer
     // update not a real frame
     ++cm->current_video_frame;
   }
 
-#if CONFIG_EXT_REFS
   // NOTE: Shall not refer to any frame not used as reference.
   if (cm->is_reference_frame) {
-#endif  // CONFIG_EXT_REFS
-    cm->prev_frame = cm->cur_frame;
     // keep track of the last coded dimensions
     cm->last_width = cm->width;
     cm->last_height = cm->height;
 
     // reset to normal state now that we are done.
     cm->last_show_frame = cm->show_frame;
-#if CONFIG_EXT_REFS
   }
-#endif  // CONFIG_EXT_REFS
 
-  aom_free(tile_ctxs);
-  aom_free(cdf_ptrs);
+  return AOM_CODEC_OK;
 }
 
-static void Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
-                        int skip_adapt, unsigned int *frame_flags) {
-#if CONFIG_XIPHRC
-  int64_t ip_count;
-  int frame_type, is_golden, is_altref;
-
-  /* Not updated during init so update it here */
-  if (cpi->oxcf.rc_mode == AOM_Q) cpi->od_rc.quality = cpi->oxcf.cq_level;
-
-  frame_type = od_frame_type(&cpi->od_rc, cpi->od_rc.cur_frame, &is_golden,
-                             &is_altref, &ip_count);
-
-  if (frame_type == OD_I_FRAME) {
-    frame_type = KEY_FRAME;
-    cpi->frame_flags &= FRAMEFLAGS_KEY;
-  } else if (frame_type == OD_P_FRAME) {
-    frame_type = INTER_FRAME;
-  }
-
-  if (is_altref) {
-    cpi->refresh_alt_ref_frame = 1;
-    cpi->rc.source_alt_ref_active = 1;
-  }
-
-  cpi->refresh_golden_frame = is_golden;
-  cpi->common.frame_type = frame_type;
-  if (is_golden) cpi->frame_flags &= FRAMEFLAGS_GOLDEN;
-#else
+static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+                       int skip_adapt, unsigned int *frame_flags) {
   if (cpi->oxcf.rc_mode == AOM_CBR) {
     av1_rc_get_one_pass_cbr_params(cpi);
   } else {
     av1_rc_get_one_pass_vbr_params(cpi);
   }
-#endif
-  encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags);
+  if (encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags) !=
+      AOM_CODEC_OK) {
+    return AOM_CODEC_ERROR;
+  }
+  check_show_existing_frame(cpi);
+  return AOM_CODEC_OK;
 }
 
-#if !CONFIG_XIPHRC
-static void Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
-                        unsigned int *frame_flags) {
-  encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags);
+static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+                       unsigned int *frame_flags) {
+#if CONFIG_MISMATCH_DEBUG
+  mismatch_move_frame_idx_w();
+#endif
+#if TXCOEFF_COST_TIMER
+  AV1_COMMON *cm = &cpi->common;
+  cm->txcoeff_cost_timer = 0;
+  cm->txcoeff_cost_count = 0;
+#endif
+
+  if (encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags) !=
+      AOM_CODEC_OK) {
+    return AOM_CODEC_ERROR;
+  }
+
+#if TXCOEFF_COST_TIMER
+  cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
+  fprintf(stderr,
+          "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
+          "in us\n",
+          cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
+          cm->cum_txcoeff_cost_timer);
+#endif
 
-#if CONFIG_EXT_REFS
-  // Do not do post-encoding update for those frames that do not have a spot in
-  // a gf group, but note that an OVERLAY frame always has a spot in a gf group,
+  // Do not do post-encoding update for those frames that do not have a spot
+  // in
+  // a gf group, but note that an OVERLAY frame always has a spot in a gf
+  // group,
   // even when show_existing_frame is used.
   if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref) {
     av1_twopass_postencode_update(cpi);
   }
   check_show_existing_frame(cpi);
-#else
-  av1_twopass_postencode_update(cpi);
-#endif  // CONFIG_EXT_REFS
+  return AOM_CODEC_OK;
 }
-#endif
 
 int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
@@ -5861,37 +5168,34 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
   int res = 0;
   const int subsampling_x = sd->subsampling_x;
   const int subsampling_y = sd->subsampling_y;
-#if CONFIG_HIGHBITDEPTH
   const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
-#endif
 
-#if CONFIG_HIGHBITDEPTH
   check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
-#else
-  check_initial_width(cpi, subsampling_x, subsampling_y);
-#endif  // CONFIG_HIGHBITDEPTH
 
   aom_usec_timer_start(&timer);
 
   if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
-#if CONFIG_HIGHBITDEPTH
-                         use_highbitdepth,
-#endif  // CONFIG_HIGHBITDEPTH
-                         frame_flags))
+                         use_highbitdepth, frame_flags))
     res = -1;
   aom_usec_timer_mark(&timer);
   cpi->time_receive_data += aom_usec_timer_elapsed(&timer);
 
-  if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) &&
+  if ((cm->profile == PROFILE_0) && !cm->seq_params.monochrome &&
       (subsampling_x != 1 || subsampling_y != 1)) {
     aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
-                       "Non-4:2:0 color format requires profile 1 or 3");
+                       "Non-4:2:0 color format requires profile 1 or 2");
     res = -1;
   }
-  if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) &&
-      (subsampling_x == 1 && subsampling_y == 1)) {
+  if ((cm->profile == PROFILE_1) &&
+      !(subsampling_x == 0 && subsampling_y == 0)) {
     aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
-                       "4:2:0 color format requires profile 0 or 2");
+                       "Profile 1 requires 4:4:4 color format");
+    res = -1;
+  }
+  if ((cm->profile == PROFILE_2) && (cm->bit_depth <= AOM_BITS_10) &&
+      !(subsampling_x == 1 && subsampling_y == 0)) {
+    aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+                       "Profile 2 bit-depth < 10 requires 4:2:2 color format");
     res = -1;
   }
 
@@ -5902,13 +5206,10 @@ static int frame_is_reference(const AV1_COMP *cpi) {
   const AV1_COMMON *cm = &cpi->common;
 
   return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame ||
-         cpi->refresh_golden_frame ||
-#if CONFIG_EXT_REFS
-         cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
-#endif  // CONFIG_EXT_REFS
-         cpi->refresh_alt_ref_frame || !cm->error_resilient_mode ||
-         cm->lf.mode_ref_delta_update || cm->seg.update_map ||
-         cm->seg.update_data;
+         cpi->refresh_golden_frame || cpi->refresh_bwd_ref_frame ||
+         cpi->refresh_alt2_ref_frame || cpi->refresh_alt_ref_frame ||
+         !cm->error_resilient_mode || cm->lf.mode_ref_delta_update ||
+         cm->seg.update_map || cm->seg.update_data;
 }
 
 static void adjust_frame_rate(AV1_COMP *cpi,
@@ -5968,7 +5269,6 @@ static int get_arf_src_index(AV1_COMP *cpi) {
   return arf_src_index;
 }
 
-#if CONFIG_EXT_REFS
 static int get_brf_src_index(AV1_COMP *cpi) {
   int brf_src_index = 0;
   const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
@@ -6002,7 +5302,6 @@ static int get_arf2_src_index(AV1_COMP *cpi) {
   }
   return arf2_src_index;
 }
-#endif  // CONFIG_EXT_REFS
 
 static void check_src_altref(AV1_COMP *cpi,
                              const struct lookahead_entry *source) {
@@ -6014,14 +5313,10 @@ static void check_src_altref(AV1_COMP *cpi,
   if (cpi->oxcf.pass == 2) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     rc->is_src_frame_alt_ref =
-#if CONFIG_EXT_REFS
         (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) ||
-#endif  // CONFIG_EXT_REFS
         (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
-#if CONFIG_EXT_REFS
     rc->is_src_frame_ext_arf =
         gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE;
-#endif  // CONFIG_EXT_REFS
   } else {
     rc->is_src_frame_alt_ref =
         cpi->alt_ref_source && (source == cpi->alt_ref_source);
@@ -6031,20 +5326,16 @@ static void check_src_altref(AV1_COMP *cpi,
     // Current frame is an ARF overlay frame.
     cpi->alt_ref_source = NULL;
 
-#if CONFIG_EXT_REFS
     if (rc->is_src_frame_ext_arf && !cpi->common.show_existing_frame) {
       // For INTNL_OVERLAY, when show_existing_frame == 0, they do need to
       // refresh the LAST_FRAME, i.e. LAST3 gets retired, LAST2 becomes LAST3,
       // LAST becomes LAST2, and INTNL_OVERLAY becomes LAST.
       cpi->refresh_last_frame = 1;
     } else {
-#endif  // CONFIG_EXT_REFS
       // Don't refresh the last buffer for an ARF overlay frame. It will
       // become the GF so preserve last as an alternative prediction option.
       cpi->refresh_last_frame = 0;
-#if CONFIG_EXT_REFS
     }
-#endif  // CONFIG_EXT_REFS
   }
 }
 
@@ -6055,10 +5346,10 @@ extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
 
 static void adjust_image_stat(double y, double u, double v, double all,
                               ImageStat *s) {
-  s->stat[Y] += y;
-  s->stat[U] += u;
-  s->stat[V] += v;
-  s->stat[ALL] += all;
+  s->stat[STAT_Y] += y;
+  s->stat[STAT_U] += u;
+  s->stat[STAT_V] += v;
+  s->stat[STAT_ALL] += all;
   s->worst = AOMMIN(s->worst, all);
 }
 
@@ -6073,12 +5364,10 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
 #endif
   cpi->bytes += frame_bytes;
 
-#if CONFIG_HIGHBITDEPTH
   if (cm->use_highbitdepth) {
     in_bit_depth = cpi->oxcf.input_bit_depth;
     bit_depth = cm->bit_depth;
   }
-#endif
   if (cm->show_frame) {
     const YV12_BUFFER_CONFIG *orig = cpi->source;
     const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
@@ -6089,28 +5378,20 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
       PSNR_STATS psnr;
       double frame_ssim2 = 0.0, weight = 0.0;
       aom_clear_system_state();
-// TODO(yaowu): unify these two versions into one.
-#if CONFIG_HIGHBITDEPTH
+      // TODO(yaowu): unify these two versions into one.
       aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
-#else
-      aom_calc_psnr(orig, recon, &psnr);
-#endif  // CONFIG_HIGHBITDEPTH
 
       adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
                         &cpi->psnr);
       cpi->total_sq_error += psnr.sse[0];
       cpi->total_samples += psnr.samples[0];
       samples = psnr.samples[0];
-// TODO(yaowu): unify these two versions into one.
-#if CONFIG_HIGHBITDEPTH
+      // TODO(yaowu): unify these two versions into one.
       if (cm->use_highbitdepth)
         frame_ssim2 =
             aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth);
       else
         frame_ssim2 = aom_calc_ssim(orig, recon, &weight);
-#else
-      frame_ssim2 = aom_calc_ssim(orig, recon, &weight);
-#endif  // CONFIG_HIGHBITDEPTH
 
       cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2);
       cpi->summed_quality += frame_ssim2 * weight;
@@ -6119,18 +5400,19 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
 #if 0
       {
         FILE *f = fopen("q_used.stt", "a");
+        double y2 = psnr.psnr[1];
+        double u2 = psnr.psnr[2];
+        double v2 = psnr.psnr[3];
+        double frame_psnr2 = psnr.psnr[0];
         fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
-                cpi->common.current_video_frame, y2, u2, v2,
+                cm->current_video_frame, y2, u2, v2,
                 frame_psnr2, frame_ssim2);
         fclose(f);
       }
 #endif
     }
     if (cpi->b_calculate_blockiness) {
-#if CONFIG_HIGHBITDEPTH
-      if (!cm->use_highbitdepth)
-#endif
-      {
+      if (!cm->use_highbitdepth) {
         const double frame_blockiness =
             av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
                                recon->y_stride, orig->y_width, orig->y_height);
@@ -6139,10 +5421,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
       }
 
       if (cpi->b_calculate_consistency) {
-#if CONFIG_HIGHBITDEPTH
-        if (!cm->use_highbitdepth)
-#endif
-        {
+        if (!cm->use_highbitdepth) {
           const double this_inconsistency = aom_get_ssim_metrics(
               orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
               orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1);
@@ -6167,7 +5446,6 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
 }
 #endif  // CONFIG_INTERNAL_STATS
 
-#if CONFIG_AMVR
 static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
                          const YV12_BUFFER_CONFIG *last_picture,
                          hash_table *last_hash_table) {
@@ -6203,14 +5481,28 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
       p_cur += (y_pos * stride_cur + x_pos);
       p_ref += (y_pos * stride_ref + x_pos);
 
-      for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
-        for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
-          if (p_cur[tmpX] != p_ref[tmpX]) {
-            match = 0;
+      if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+        uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
+        uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
+        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+            if (p16_cur[tmpX] != p16_ref[tmpX]) {
+              match = 0;
+            }
           }
+          p16_cur += stride_cur;
+          p16_ref += stride_ref;
+        }
+      } else {
+        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+            if (p_cur[tmpX] != p_ref[tmpX]) {
+              match = 0;
+            }
+          }
+          p_cur += stride_cur;
+          p_ref += stride_ref;
         }
-        p_cur += stride_cur;
-        p_ref += stride_ref;
       }
 
       if (match) {
@@ -6227,10 +5519,14 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
 
       av1_get_block_hash_value(
           cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
-          block_size, &hash_value_1, &hash_value_2);
-
-      if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) {
-        M++;
+          block_size, &hash_value_1, &hash_value_2,
+          (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH));
+      // Hashing does not work for highbitdepth currently.
+      // TODO(Roger): Make it work for highbitdepth.
+      if (av1_use_hash_me(&cpi->common)) {
+        if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) {
+          M++;
+        }
       }
     }
   }
@@ -6282,13 +5578,14 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
 
   return 0;
 }
-#endif
 
 int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest, int64_t *time_stamp,
-                            int64_t *time_end, int flush) {
+                            int64_t *time_end, int flush,
+                            const aom_rational_t *timebase) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   BufferPool *const pool = cm->buffer_pool;
   RATE_CONTROL *const rc = &cpi->rc;
   struct aom_usec_timer cmptimer;
@@ -6296,15 +5593,9 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   struct lookahead_entry *last_source = NULL;
   struct lookahead_entry *source = NULL;
   int arf_src_index;
-#if CONFIG_EXT_REFS
   int brf_src_index;
-#endif  // CONFIG_EXT_REFS
   int i;
 
-#if CONFIG_XIPHRC
-  cpi->od_rc.end_of_input = flush;
-#endif
-
 #if CONFIG_BITSTREAM_DEBUG
   assert(cpi->oxcf.max_threads == 0 &&
          "bitstream debug tool does not support multithreading");
@@ -6312,13 +5603,10 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame);
 #endif
 
+  cm->showable_frame = 0;
   aom_usec_timer_start(&cmptimer);
 
-#if CONFIG_AMVR
   set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0);
-#else
-  set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
-#endif
 
   // Is multi-arf enabled.
   // Note that at the moment multi_arf is only configured for 2 pass VBR
@@ -6327,24 +5615,36 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   else
     cpi->multi_arf_allowed = 0;
 
-// Normal defaults
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
-#endif
-  cm->refresh_frame_context =
-      (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode)
-          ? REFRESH_FRAME_CONTEXT_FORWARD
-          : REFRESH_FRAME_CONTEXT_BACKWARD;
+  // Normal defaults
+  cm->refresh_frame_context = oxcf->frame_parallel_decoding_mode
+                                  ? REFRESH_FRAME_CONTEXT_DISABLED
+                                  : REFRESH_FRAME_CONTEXT_BACKWARD;
+  if (oxcf->large_scale_tile)
+    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
   cpi->refresh_last_frame = 1;
   cpi->refresh_golden_frame = 0;
-#if CONFIG_EXT_REFS
   cpi->refresh_bwd_ref_frame = 0;
   cpi->refresh_alt2_ref_frame = 0;
-#endif  // CONFIG_EXT_REFS
   cpi->refresh_alt_ref_frame = 0;
 
-#if CONFIG_EXT_REFS && !CONFIG_XIPHRC
+  // TODO(zoeliu@gmail.com): To support forward-KEY_FRAME and set up the
+  //                         following flag accordingly.
+  cm->reset_decoder_state = 0;
+
+  // Don't allow a show_existing_frame to coincide with an error resilient or
+  // S-Frame
+  struct lookahead_entry *lookahead_src = NULL;
+  if (cm->current_video_frame > 0)
+    lookahead_src = av1_lookahead_peek(cpi->lookahead, 0);
+  if (lookahead_src != NULL &&
+      ((cpi->oxcf.error_resilient_mode |
+        ((lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT) != 0)) ||
+       (cpi->oxcf.s_frame_mode |
+        ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0)))) {
+    cm->show_existing_frame = 0;
+  }
+
   if (oxcf->pass == 2 && cm->show_existing_frame) {
     // Manage the source buffer and flush out the source frame that has been
     // coded already; Also get prepared for PSNR calculation if needed.
@@ -6352,6 +5652,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
       *size = 0;
       return -1;
     }
+    av1_apply_encoding_flags(cpi, source->flags);
     cpi->source = &source->img;
     // TODO(zoeliu): To track down to determine whether it's needed to adjust
     // the frame rate.
@@ -6361,7 +5662,8 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
     // We need to adjust frame rate for an overlay frame
     if (cpi->rc.is_src_frame_alt_ref) adjust_frame_rate(cpi, source);
 
-    // Find a free buffer for the new frame, releasing the reference previously
+    // Find a free buffer for the new frame, releasing the reference
+    // previously
     // held.
     if (cm->new_fb_idx != INVALID_IDX) {
       --pool->frame_bufs[cm->new_fb_idx].ref_count;
@@ -6379,7 +5681,8 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
     // We need to update the gf_group for show_existing overlay frame
     if (cpi->rc.is_src_frame_alt_ref) av1_rc_get_second_pass_params(cpi);
 
-    Pass2Encode(cpi, size, dest, frame_flags);
+    if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK)
+      return AOM_CODEC_ERROR;
 
     if (cpi->b_calculate_psnr) generate_psnr_packet(cpi);
 
@@ -6393,7 +5696,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
     cm->show_existing_frame = 0;
     return 0;
   }
-#endif  // CONFIG_EXT_REFS && !CONFIG_XIPHRC
 
   // Should we encode an arf frame.
   arf_src_index = get_arf_src_index(cpi);
@@ -6415,21 +5717,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
     assert(arf_src_index <= rc->frames_to_key);
 
     if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+      cm->showable_frame = 1;
       cpi->alt_ref_source = source;
 
       if (oxcf->arnr_max_frames > 0) {
-// Produce the filtered ARF frame.
-#if CONFIG_BGSPRITE
-        int bgsprite_ret = av1_background_sprite(cpi, arf_src_index);
-        // Do temporal filter if bgsprite not generated.
-        if (bgsprite_ret != 0)
-#endif  // CONFIG_BGSPRITE
-          av1_temporal_filter(cpi,
-#if CONFIG_BGSPRITE
-                              NULL, &cpi->alt_ref_buffer,
-#endif  // CONFIG_BGSPRITE
-                              arf_src_index);
-        aom_extend_frame_borders(&cpi->alt_ref_buffer);
+        // Produce the filtered ARF frame.
+        av1_temporal_filter(cpi, arf_src_index);
+        aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
         force_src_buffer = &cpi->alt_ref_buffer;
       }
 
@@ -6438,16 +5732,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
       cpi->refresh_alt_ref_frame = 1;
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 0;
-#if CONFIG_EXT_REFS
       cpi->refresh_bwd_ref_frame = 0;
       cpi->refresh_alt2_ref_frame = 0;
-#endif  // CONFIG_EXT_REFS
       rc->is_src_frame_alt_ref = 0;
     }
     rc->source_alt_ref_pending = 0;
   }
 
-#if CONFIG_EXT_REFS
   // Should we encode an arf2 frame.
   arf_src_index = get_arf2_src_index(cpi);
   if (arf_src_index) {
@@ -6468,16 +5759,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
     assert(arf_src_index <= rc->frames_to_key);
 
     if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+      cm->showable_frame = 1;
       cpi->alt_ref_source = source;
 
       if (oxcf->arnr_max_frames > 0) {
         // Produce the filtered ARF frame.
-        av1_temporal_filter(cpi,
-#if CONFIG_BGSPRITE
-                            NULL, NULL,
-#endif  // CONFIG_BGSPRITE
-                            arf_src_index);
-        aom_extend_frame_borders(&cpi->alt_ref_buffer);
+        av1_temporal_filter(cpi, arf_src_index);
+        aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
         force_src_buffer = &cpi->alt_ref_buffer;
       }
 
@@ -6499,6 +5787,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   if (brf_src_index) {
     assert(brf_src_index <= rc->frames_to_key);
     if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) {
+      cm->showable_frame = 1;
       cm->show_frame = 0;
       cm->intra_only = 0;
 
@@ -6511,7 +5800,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
       rc->is_bwd_ref_frame = 1;
     }
   }
-#endif  // CONFIG_EXT_REFS
 
   if (!source) {
     // Get last frame source.
@@ -6538,16 +5826,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
 
     *time_stamp = source->ts_start;
     *time_end = source->ts_end;
+    av1_apply_encoding_flags(cpi, source->flags);
     *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
   } else {
     *size = 0;
     if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
-#if CONFIG_XIPHRC
-      od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 1);
-#else
       av1_end_first_pass(cpi); /* get last stats packet */
-#endif
       cpi->twopass.first_pass_done = 1;
     }
     return -1;
@@ -6573,20 +5858,23 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
 
   if (cm->new_fb_idx == INVALID_IDX) return -1;
 
+  // Retain the RF_LEVEL for the current newly coded frame.
+  cpi->frame_rf_level[cm->new_fb_idx] =
+      cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
-#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION
   cm->cur_frame->buf.buf_8bit_valid = 0;
-#endif
-#if !CONFIG_EXT_REFS
-  if (cpi->multi_arf_allowed) {
-    if (cm->frame_type == KEY_FRAME) {
-      init_buffer_indices(cpi);
-    } else if (oxcf->pass == 2) {
-      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-      cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
-    }
+
+  if (cm->film_grain_table) {
+    cm->film_grain_params_present = aom_film_grain_table_lookup(
+        cm->film_grain_table, *time_stamp, *time_end, 0 /* erase */,
+        &cm->film_grain_params);
   }
-#endif  // !CONFIG_EXT_REFS
+  cm->cur_frame->film_grain_params_present = cm->film_grain_params_present;
+
+  // only one operating point supported now
+  cpi->common.tu_presentation_delay =
+      ticks_to_timebase_units(timebase, *time_stamp);
 
   // Start with a 0 size frame.
   *size = 0;
@@ -6594,87 +5882,62 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   cpi->frame_flags = *frame_flags;
 
   if (oxcf->pass == 2) {
-#if CONFIG_XIPHRC
-    if (od_enc_rc_2pass_in(&cpi->od_rc) < 0) return -1;
-  }
-#else
     av1_rc_get_second_pass_params(cpi);
   } else if (oxcf->pass == 1) {
     setup_frame_size(cpi);
   }
-#endif
 
   if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) {
-    for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i)
-      cpi->scaled_ref_idx[i] = INVALID_IDX;
+    for (i = 0; i < REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
   }
 
-#if CONFIG_AOM_QM
   cm->using_qmatrix = cpi->oxcf.using_qm;
   cm->min_qmlevel = cpi->oxcf.qm_minlevel;
   cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
-#endif
 
-#if CONFIG_REFERENCE_BUFFER
   if (cm->seq_params.frame_id_numbers_present_flag) {
     if (*time_stamp == 0) {
       cpi->common.current_frame_id = -1;
     }
   }
-#endif  // CONFIG_REFERENCE_BUFFER
-#if CONFIG_AMVR
+
   cpi->cur_poc++;
-  if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
-    if (cpi->common.seq_mv_precision_level == 2) {
+  if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools &&
+      !frame_is_intra_only(cm)) {
+    if (cpi->common.seq_params.force_integer_mv == 2) {
       struct lookahead_entry *previous_entry =
-          cpi->lookahead->buf + cpi->previsous_index;
-      cpi->common.cur_frame_mv_precision_level = is_integer_mv(
-          cpi, cpi->source, &previous_entry->img, cpi->previsou_hash_table);
+          av1_lookahead_peek(cpi->lookahead, cpi->previous_index);
+      if (!previous_entry)
+        cpi->common.cur_frame_force_integer_mv = 0;
+      else
+        cpi->common.cur_frame_force_integer_mv = is_integer_mv(
+            cpi, cpi->source, &previous_entry->img, cpi->previous_hash_table);
     } else {
-      cpi->common.cur_frame_mv_precision_level =
-          cpi->common.seq_mv_precision_level;
+      cpi->common.cur_frame_force_integer_mv =
+          cpi->common.seq_params.force_integer_mv;
     }
   } else {
-    cpi->common.cur_frame_mv_precision_level = 0;
+    cpi->common.cur_frame_force_integer_mv = 0;
   }
-#endif
 
-#if CONFIG_XIPHRC
-  if (oxcf->pass == 1) {
-    size_t tmp;
-    if (cpi->od_rc.cur_frame == 0) Pass0Encode(cpi, &tmp, dest, 1, frame_flags);
-    cpi->od_rc.firstpass_quant = cpi->od_rc.target_quantizer;
-    Pass0Encode(cpi, &tmp, dest, 0, frame_flags);
-    od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 0);
-  } else if (oxcf->pass == 2) {
-    Pass0Encode(cpi, size, dest, 0, frame_flags);
-  } else {
-    if (cpi->od_rc.cur_frame == 0) {
-      size_t tmp;
-      Pass0Encode(cpi, &tmp, dest, 1, frame_flags);
-    }
-    Pass0Encode(cpi, size, dest, 0, frame_flags);
-  }
-#else
   if (oxcf->pass == 1) {
     cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf);
     av1_first_pass(cpi, source);
   } else if (oxcf->pass == 2) {
-    Pass2Encode(cpi, size, dest, frame_flags);
+    if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK)
+      return AOM_CODEC_ERROR;
   } else {
     // One pass encode
-    Pass0Encode(cpi, size, dest, 0, frame_flags);
+    if (Pass0Encode(cpi, size, dest, 0, frame_flags) != AOM_CODEC_OK)
+      return AOM_CODEC_ERROR;
   }
-#endif
-#if CONFIG_HASH_ME
   if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
-#if CONFIG_AMVR
-    cpi->previsou_hash_table = &cm->cur_frame->hash_table;
+    cpi->previous_hash_table = &cm->cur_frame->hash_table;
     {
       int l;
       for (l = -MAX_PRE_FRAMES; l < cpi->lookahead->max_sz; l++) {
         if ((cpi->lookahead->buf + l) == source) {
-          cpi->previsous_index = l;
+          cpi->previous_index = l;
           break;
         }
       }
@@ -6684,17 +5947,26 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
                            "Failed to find last frame original buffer");
       }
     }
-#endif
   }
 
-#endif
+  if (!cm->large_scale_tile) {
+    cm->frame_contexts[cm->new_fb_idx] = *cm->fc;
+  }
 
-#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
-  cm->frame_contexts[cm->new_fb_idx] = *cm->fc;
-#else
-  if (!cm->error_resilient_mode)
-    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
-#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
+#define EXT_TILE_DEBUG 0
+#if EXT_TILE_DEBUG
+  if (cm->large_scale_tile && oxcf->pass == 2) {
+    char fn[20] = "./fc";
+    fn[4] = cm->current_video_frame / 100 + '0';
+    fn[5] = (cm->current_video_frame % 100) / 10 + '0';
+    fn[6] = (cm->current_video_frame % 10) + '0';
+    fn[7] = '\0';
+    av1_print_frame_contexts(cm->fc, fn);
+  }
+#endif  // EXT_TILE_DEBUG
+#undef EXT_TILE_DEBUG
+
+  cm->showable_frame = !cm->show_frame && cm->showable_frame;
 
   // No frame encoded, or frame was dropped, release scaled references.
   if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
@@ -6717,10 +5989,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   }
 #endif  // CONFIG_INTERNAL_STATS
 
-#if CONFIG_XIPHRC
-  cpi->od_rc.cur_frame++;
-#endif
-
   aom_clear_system_state();
 
   return 0;
@@ -6755,6 +6023,29 @@ int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
   return 0;
 }
 
+static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+                                       const YV12_BUFFER_CONFIG *b) {
+  return a->y_height == b->y_height && a->y_width == b->y_width &&
+         a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
+         a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
+         a->border == b->border &&
+         (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+             (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd) {
+  const int num_planes = av1_num_planes(cm);
+  if (!equal_dimensions_and_border(new_frame, sd))
+    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  else
+    aom_yv12_copy_frame(new_frame, sd, num_planes);
+
+  return cm->error.error_code;
+}
+
 int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
                           AOM_SCALING vert_mode) {
   int hr = 0, hs = 0, vr = 0, vs = 0;
@@ -6773,47 +6064,134 @@ int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
 
 int av1_get_quantizer(AV1_COMP *cpi) { return cpi->common.base_qindex; }
 
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) {
+  size_t output_size = 0;
+  size_t total_bytes_read = 0;
+  size_t remaining_size = *frame_size;
+  uint8_t *buff_ptr = buffer;
+
+  // go through each OBUs
+  while (total_bytes_read < *frame_size) {
+    uint8_t saved_obu_header[2];
+    uint64_t obu_payload_size;
+    size_t length_of_payload_size;
+    size_t length_of_obu_size;
+    uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 2 : 1;
+    size_t obu_bytes_read = obu_header_size;  // bytes read for current obu
+
+    // save the obu header (1 or 2 bytes)
+    memmove(saved_obu_header, buff_ptr, obu_header_size);
+    // clear the obu_has_size_field
+    saved_obu_header[0] = saved_obu_header[0] & (~0x2);
+
+    // get the payload_size and length of payload_size
+    if (aom_uleb_decode(buff_ptr + obu_header_size, remaining_size,
+                        &obu_payload_size, &length_of_payload_size) != 0) {
+      return AOM_CODEC_ERROR;
+    }
+    obu_bytes_read += length_of_payload_size;
+
+    // calculate the length of size of the obu header plus payload
+    length_of_obu_size =
+        aom_uleb_size_in_bytes((uint64_t)(obu_header_size + obu_payload_size));
+
+    // move the rest of data to new location
+    memmove(buff_ptr + length_of_obu_size + obu_header_size,
+            buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read);
+    obu_bytes_read += (size_t)obu_payload_size;
+
+    // write the new obu size
+    const uint64_t obu_size = obu_header_size + obu_payload_size;
+    size_t coded_obu_size;
+    if (aom_uleb_encode(obu_size, sizeof(obu_size), buff_ptr,
+                        &coded_obu_size) != 0) {
+      return AOM_CODEC_ERROR;
+    }
+
+    // write the saved (modified) obu_header following obu size
+    memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size);
+
+    total_bytes_read += obu_bytes_read;
+    remaining_size -= obu_bytes_read;
+    buff_ptr += length_of_obu_size + obu_size;
+    output_size += length_of_obu_size + (size_t)obu_size;
+  }
+
+  *frame_size = output_size;
+  return AOM_CODEC_OK;
+}
+
 void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
+  // TODO(yunqingwang): For what references to use, external encoding flags
+  // should be consistent with internal reference frame selection. Need to
+  // ensure that there is not conflict between the two. In AV1 encoder, the
+  // priority rank for 7 reference frames are: LAST, ALTREF, LAST2, LAST3,
+  // GOLDEN, BWDREF, ALTREF2. If only one reference frame is used, it must be
+  // LAST.
+  cpi->ext_ref_frame_flags = AOM_REFFRAME_ALL;
   if (flags &
-      (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF)) {
-    int ref = AOM_REFFRAME_ALL;
-
+      (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+       AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+       AOM_EFLAG_NO_REF_ARF2)) {
     if (flags & AOM_EFLAG_NO_REF_LAST) {
-      ref ^= AOM_LAST_FLAG;
-#if CONFIG_EXT_REFS
-      ref ^= AOM_LAST2_FLAG;
-      ref ^= AOM_LAST3_FLAG;
-#endif  // CONFIG_EXT_REFS
-    }
+      cpi->ext_ref_frame_flags = 0;
+    } else {
+      int ref = AOM_REFFRAME_ALL;
 
-    if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG;
+      if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG;
+      if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG;
 
-    if (flags & AOM_EFLAG_NO_REF_ARF) ref ^= AOM_ALT_FLAG;
+      if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG;
 
-    av1_use_as_reference(cpi, ref);
+      if (flags & AOM_EFLAG_NO_REF_ARF) {
+        ref ^= AOM_ALT_FLAG;
+        ref ^= AOM_BWD_FLAG;
+        ref ^= AOM_ALT2_FLAG;
+      } else {
+        if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG;
+        if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG;
+      }
+
+      av1_use_as_reference(cpi, ref);
+    }
   }
 
   if (flags &
-      (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
-       AOM_EFLAG_FORCE_GF | AOM_EFLAG_FORCE_ARF)) {
+      (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) {
     int upd = AOM_REFFRAME_ALL;
 
-    if (flags & AOM_EFLAG_NO_UPD_LAST) {
-      upd ^= AOM_LAST_FLAG;
-#if CONFIG_EXT_REFS
-      upd ^= AOM_LAST2_FLAG;
-      upd ^= AOM_LAST3_FLAG;
-#endif  // CONFIG_EXT_REFS
-    }
+    // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag.
+    if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG;
 
     if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG;
 
-    if (flags & AOM_EFLAG_NO_UPD_ARF) upd ^= AOM_ALT_FLAG;
+    if (flags & AOM_EFLAG_NO_UPD_ARF) {
+      upd ^= AOM_ALT_FLAG;
+      upd ^= AOM_BWD_FLAG;
+      upd ^= AOM_ALT2_FLAG;
+    }
 
     av1_update_reference(cpi, upd);
   }
 
+  cpi->ext_use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs &
+                               ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0);
+  cpi->ext_use_error_resilient = cpi->oxcf.error_resilient_mode |
+                                 ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0);
+  cpi->ext_use_s_frame =
+      cpi->oxcf.s_frame_mode | ((flags & AOM_EFLAG_SET_S_FRAME) != 0);
+  cpi->ext_use_primary_ref_none = (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0;
+
   if (flags & AOM_EFLAG_NO_UPD_ENTROPY) {
     av1_update_entropy(cpi, 0);
   }
 }
+
+int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n) {
+  return n * TICKS_PER_SEC * timebase->num / timebase->den;
+}
+
+int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) {
+  const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
+  return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
+}
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
index eb779a3cd..5212db2b1 100644
--- a/third_party/aom/av1/encoder/encoder.h
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -14,7 +14,8 @@
 
 #include <stdio.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aomcx.h"
 
 #include "av1/common/alloccommon.h"
@@ -22,11 +23,8 @@
 #include "av1/common/thread_common.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/common/resize.h"
+#include "av1/common/timing.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
-#if CONFIG_ANS
-#include "aom_dsp/ans.h"
-#include "aom_dsp/buf_ans.h"
-#endif
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encodemb.h"
@@ -38,9 +36,6 @@
 #include "av1/encoder/rd.h"
 #include "av1/encoder/speed_features.h"
 #include "av1/encoder/tokenize.h"
-#if CONFIG_XIPHRC
-#include "av1/encoder/ratectrl_xiph.h"
-#endif
 
 #if CONFIG_INTERNAL_STATS
 #include "aom_dsp/ssim.h"
@@ -54,19 +49,13 @@ extern "C" {
 #endif
 
 typedef struct {
-  int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS];
-  int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
-  int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
-
-  // 0 = Intra, Last, GF, ARF
-  int8_t last_ref_lf_deltas[TOTAL_REFS_PER_FRAME];
-  // 0 = ZERO_MV, MV
-  int8_t last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+  int nmv_vec_cost[MV_JOINTS];
+  int nmv_costs[2][MV_VALS];
+  int nmv_costs_hp[2][MV_VALS];
 
   FRAME_CONTEXT fc;
 } CODING_CONTEXT;
 
-#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
 typedef enum {
   // regular inter frame
   REGULAR_FRAME = 0,
@@ -76,14 +65,12 @@ typedef enum {
   OVERLAY_FRAME = 2,
   // golden frame
   GLD_FRAME = 3,
-#if CONFIG_EXT_REFS
   // backward reference frame
   BRF_FRAME = 4,
   // extra alternate reference frame
-  EXT_ARF_FRAME = 5
-#endif
+  EXT_ARF_FRAME = 5,
+  FRAME_CONTEXT_INDEXES
 } FRAME_CONTEXT_INDEX;
-#endif
 
 typedef enum {
   NORMAL = 0,
@@ -101,13 +88,9 @@ typedef enum {
 typedef enum {
   FRAMEFLAGS_KEY = 1 << 0,
   FRAMEFLAGS_GOLDEN = 1 << 1,
-#if CONFIG_EXT_REFS
   FRAMEFLAGS_BWDREF = 1 << 2,
   // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME
   FRAMEFLAGS_ALTREF = 1 << 3,
-#else   // !CONFIG_EXT_REFS
-  FRAMEFLAGS_ALTREF = 1 << 2,
-#endif  // CONFIG_EXT_REFS
 } FRAMETYPE_FLAGS;
 
 typedef enum {
@@ -115,26 +98,22 @@ typedef enum {
   VARIANCE_AQ = 1,
   COMPLEXITY_AQ = 2,
   CYCLIC_REFRESH_AQ = 3,
-#if !CONFIG_EXT_DELTA_Q
-  DELTA_AQ = 4,
-#endif
   AQ_MODE_COUNT  // This should always be the last member of the enum
 } AQ_MODE;
-#if CONFIG_EXT_DELTA_Q
 typedef enum {
   NO_DELTA_Q = 0,
   DELTA_Q_ONLY = 1,
   DELTA_Q_LF = 2,
   DELTAQ_MODE_COUNT  // This should always be the last member of the enum
 } DELTAQ_MODE;
-#endif
+
 typedef enum {
   RESIZE_NONE = 0,    // No frame resizing allowed.
   RESIZE_FIXED = 1,   // All frames are coded at the specified scale.
   RESIZE_RANDOM = 2,  // All frames are coded at a random scale.
   RESIZE_MODES
 } RESIZE_MODE;
-#if CONFIG_FRAME_SUPERRES
+
 typedef enum {
   SUPERRES_NONE = 0,     // No frame superres allowed
   SUPERRES_FIXED = 1,    // All frames are coded at the specified scale,
@@ -145,13 +124,14 @@ typedef enum {
                          // q_index
   SUPERRES_MODES
 } SUPERRES_MODE;
-#endif  // CONFIG_FRAME_SUPERRES
 
 typedef struct AV1EncoderConfig {
   BITSTREAM_PROFILE profile;
   aom_bit_depth_t bit_depth;     // Codec bit-depth.
   int width;                     // width of data passed to the compressor
   int height;                    // height of data passed to the compressor
+  int forced_max_frame_width;    // forced maximum width of frame (if != 0)
+  int forced_max_frame_height;   // forced maximum height of frame (if != 0)
   unsigned int input_bit_depth;  // Input bit depth.
   double init_framerate;         // set to passed in framerate
   int64_t target_bandwidth;      // bandwidth to be used in bits per second
@@ -159,6 +139,7 @@ typedef struct AV1EncoderConfig {
   int noise_sensitivity;  // pre processing blur: recommendation 0
   int sharpness;          // sharpening output: recommendation 0:
   int speed;
+  int dev_sf;
   // maximum allowed bitrate for any intra frame in % of bitrate target.
   unsigned int rc_max_intra_bitrate_pct;
   // maximum allowed bitrate for any inter frame in % of bitrate target.
@@ -172,8 +153,11 @@ typedef struct AV1EncoderConfig {
   // Key Framing Operations
   int auto_key;  // autodetect cut scenes and set the keyframes
   int key_freq;  // maximum distance to key frame.
-
+  int sframe_dist;
+  int sframe_mode;
+  int sframe_enabled;
   int lag_in_frames;  // how many frames lag before we start encoding
+  int fwd_kf_enabled;
 
   // ----------------------------------------------------------------
   // DATARATE CONTROL OPTIONS
@@ -199,36 +183,33 @@ typedef struct AV1EncoderConfig {
   int best_allowed_q;
   int cq_level;
   AQ_MODE aq_mode;  // Adaptive Quantization mode
-#if CONFIG_EXT_DELTA_Q
   DELTAQ_MODE deltaq_mode;
-#endif
-#if CONFIG_AOM_QM
+  int enable_cdef;
+  int enable_restoration;
+  int disable_trellis_quant;
   int using_qm;
+  int qm_y;
+  int qm_u;
+  int qm_v;
   int qm_minlevel;
   int qm_maxlevel;
-#endif
 #if CONFIG_DIST_8X8
   int using_dist_8x8;
 #endif
   unsigned int num_tile_groups;
   unsigned int mtu;
 
-#if CONFIG_TEMPMV_SIGNALING
-  unsigned int disable_tempmv;
-#endif
   // Internal frame size scaling.
   RESIZE_MODE resize_mode;
   uint8_t resize_scale_denominator;
   uint8_t resize_kf_scale_denominator;
 
-#if CONFIG_FRAME_SUPERRES
   // Frame Super-Resolution size scaling.
   SUPERRES_MODE superres_mode;
   uint8_t superres_scale_denominator;
   uint8_t superres_kf_scale_denominator;
   int superres_qthresh;
   int superres_kf_qthresh;
-#endif  // CONFIG_FRAME_SUPERRES
 
   // Enable feature to reduce the frame quantization every x frames.
   int frame_periodic_boost;
@@ -241,9 +222,7 @@ typedef struct AV1EncoderConfig {
   // ----------------------------------------------------------------
 
   int enable_auto_arf;
-#if CONFIG_EXT_REFS
   int enable_auto_brf;  // (b)ackward (r)ef (f)rame
-#endif                  // CONFIG_EXT_REFS
 
   /* Bitfield defining the error resiliency features to enable.
    * Can provide decodable frames after losses in previous
@@ -251,12 +230,16 @@ typedef struct AV1EncoderConfig {
    */
   unsigned int error_resilient_mode;
 
+  unsigned int s_frame_mode;
+
   /* Bitfield defining the parallel decoding mode where the
    * decoding in successive frames may be conducted in parallel
    * just by decoding the frame headers.
    */
   unsigned int frame_parallel_decoding_mode;
 
+  unsigned int limit;
+
   int arnr_max_frames;
   int arnr_strength;
 
@@ -265,18 +248,10 @@ typedef struct AV1EncoderConfig {
 
   int tile_columns;
   int tile_rows;
-#if CONFIG_MAX_TILE
   int tile_width_count;
   int tile_height_count;
   int tile_widths[MAX_TILE_COLS];
   int tile_heights[MAX_TILE_ROWS];
-#endif
-#if CONFIG_DEPENDENT_HORZTILES
-  int dependent_horz_tiles;
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-  int loop_filter_across_tiles_enabled;
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
 
   int max_threads;
 
@@ -289,34 +264,135 @@ typedef struct AV1EncoderConfig {
 
   aom_tune_metric tuning;
   aom_tune_content content;
-#if CONFIG_HIGHBITDEPTH
   int use_highbitdepth;
-#endif
-  aom_color_space_t color_space;
-  aom_transfer_function_t transfer_function;
+  aom_color_primaries_t color_primaries;
+  aom_transfer_characteristics_t transfer_characteristics;
+  aom_matrix_coefficients_t matrix_coefficients;
   aom_chroma_sample_position_t chroma_sample_position;
   int color_range;
   int render_width;
   int render_height;
-
-#if CONFIG_EXT_PARTITION
+  aom_timing_info_type_t timing_info_type;
+  int timing_info_present;
+  aom_timing_info_t timing_info;
+  int decoder_model_info_present_flag;
+  int display_model_info_present_flag;
+  int buffer_removal_delay_present;
+  aom_dec_model_info_t buffer_model;
+  aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
+  aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
+  int film_grain_test_vector;
+  const char *film_grain_table_filename;
+
+  uint8_t cdf_update_mode;
   aom_superblock_size_t superblock_size;
-#endif  // CONFIG_EXT_PARTITION
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-  int ans_window_size_log2;
-#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-#if CONFIG_EXT_TILE
   unsigned int large_scale_tile;
   unsigned int single_tile_decoding;
-#endif  // CONFIG_EXT_TILE
-
+  int monochrome;
+  unsigned int full_still_picture_hdr;
+  int enable_dual_filter;
   unsigned int motion_vector_unit_test;
+  const cfg_options_t *cfg;
+  int enable_order_hint;
+  int enable_jnt_comp;
+  int enable_ref_frame_mvs;
+  unsigned int allow_ref_frame_mvs;
+  int enable_warped_motion;
+  int allow_warped_motion;
+  int enable_superres;
+  unsigned int save_as_annexb;
 } AV1EncoderConfig;
 
 static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
   return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
 }
 
+typedef struct FRAME_COUNTS {
+// Note: This structure should only contain 'unsigned int' fields, or
+// aggregates built solely from 'unsigned int' fields/elements
+#if CONFIG_ENTROPY_STATS
+  unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES];
+  unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+  unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
+  unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+  unsigned int cfl_sign[CFL_JOINT_SIGNS];
+  unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE];
+  unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+  unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2];
+  unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+  unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+  unsigned int palette_y_color_index[PALETTE_SIZES]
+                                    [PALETTE_COLOR_INDEX_CONTEXTS]
+                                    [PALETTE_COLORS];
+  unsigned int palette_uv_color_index[PALETTE_SIZES]
+                                     [PALETTE_COLOR_INDEX_CONTEXTS]
+                                     [PALETTE_COLORS];
+  unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+  unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2];
+  unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+                        [EOB_COEF_CONTEXTS][2];
+  unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2];
+  unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS]
+                        [2];
+  unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2];
+  unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5];
+  unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6];
+  unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7];
+  unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8];
+  unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9];
+  unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10];
+  unsigned int eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11];
+  unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+                              [LEVEL_CONTEXTS][BR_CDF_SIZE];
+  unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+                               [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2];
+  unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+                                   [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1];
+  unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
+  unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2];
+  unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
+  unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
+  unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+  unsigned int wedge_idx[BLOCK_SIZES_ALL][16];
+  unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+  unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+  unsigned int wedge_interintra[BLOCK_SIZES_ALL][2];
+  unsigned int compound_type[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1];
+  unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES];
+  unsigned int obmc[BLOCK_SIZES_ALL][2];
+  unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+  unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+  unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2];
+  unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2];
+  unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2];
+  unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2];
+  unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2];
+  unsigned int intrabc[2];
+
+  unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+  unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
+  unsigned int skip_mode[SKIP_MODE_CONTEXTS][2];
+  unsigned int skip[SKIP_CONTEXTS][2];
+  unsigned int compound_index[COMP_INDEX_CONTEXTS][2];
+  unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2];
+  unsigned int delta_q[DELTA_Q_PROBS][2];
+  unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2];
+  unsigned int delta_lf[DELTA_LF_PROBS][2];
+
+  unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+  unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                           [TX_TYPES];
+  unsigned int filter_intra_mode[FILTER_INTRA_MODES];
+  unsigned int filter_intra[BLOCK_SIZES_ALL][2];
+  unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES];
+  unsigned int wiener_restore[2];
+  unsigned int sgrproj_restore[2];
+#endif  // CONFIG_ENTROPY_STATS
+
+  unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+                                [SWITCHABLE_FILTERS];
+} FRAME_COUNTS;
+
 // TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
 typedef struct TileDataEnc {
   TileInfo tile_info;
@@ -324,42 +400,31 @@ typedef struct TileDataEnc {
   int mode_map[BLOCK_SIZES_ALL][MAX_MODES];
   int m_search_count;
   int ex_search_count;
-#if CONFIG_PVQ
-  PVQ_QUEUE pvq_q;
-#endif
-#if CONFIG_CFL
   CFL_CTX cfl;
-#endif
   DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+  uint8_t allow_update_cdf;
 } TileDataEnc;
 
 typedef struct RD_COUNTS {
   int64_t comp_pred_diff[REFERENCE_MODES];
-#if CONFIG_GLOBAL_MOTION
   // Stores number of 4x4 blocks using global motion per reference frame.
-  int global_motion_used[TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_GLOBAL_MOTION
-  int single_ref_used_flag;
+  int global_motion_used[REF_FRAMES];
   int compound_ref_used_flag;
+  int skip_mode_used_flag;
 } RD_COUNTS;
 
 typedef struct ThreadData {
   MACROBLOCK mb;
   RD_COUNTS rd_counts;
   FRAME_COUNTS *counts;
-#if !CONFIG_CB4X4
-  PICK_MODE_CONTEXT *leaf_tree;
-#endif
   PC_TREE *pc_tree;
   PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
-#if CONFIG_MOTION_VAR
   int32_t *wsrc_buf;
   int32_t *mask_buf;
   uint8_t *above_pred_buf;
   uint8_t *left_pred_buf;
-#endif
-
   PALETTE_BUFFER *palette_buffer;
+  int intrabc_used_this_tile;
 } ThreadData;
 
 struct EncWorkerData;
@@ -370,14 +435,21 @@ typedef struct ActiveMap {
   unsigned char *map;
 } ActiveMap;
 
-#define NUM_STAT_TYPES 4  // types of stats: Y, U, V and ALL
+#if CONFIG_INTERNAL_STATS
+// types of stats
+typedef enum {
+  STAT_Y,
+  STAT_U,
+  STAT_V,
+  STAT_ALL,
+  NUM_STAT_TYPES  // This should always be the last member of the enum
+} StatType;
 
 typedef struct IMAGE_STAT {
   double stat[NUM_STAT_TYPES];
   double worst;
 } ImageStat;
-
-#undef NUM_STAT_TYPES
+#endif  // CONFIG_INTERNAL_STATS
 
 typedef struct {
   int ref_count;
@@ -392,16 +464,18 @@ typedef struct TileBufferEnc {
 typedef struct AV1_COMP {
   QUANTS quants;
   ThreadData td;
+  FRAME_COUNTS counts;
   MB_MODE_INFO_EXT *mbmi_ext_base;
-#if CONFIG_LV_MAP
   CB_COEFF_BUFFER *coeff_buffer_base;
-#endif
   Dequants dequants;
   AV1_COMMON common;
   AV1EncoderConfig oxcf;
   struct lookahead_ctx *lookahead;
   struct lookahead_entry *alt_ref_source;
 
+  int optimize_speed_feature;
+  int optimize_seg_arr[MAX_SEGMENTS];
+
   YV12_BUFFER_CONFIG *source;
   YV12_BUFFER_CONFIG *last_source;  // NULL for first frame and alt_ref frames
   YV12_BUFFER_CONFIG *unscaled_source;
@@ -411,58 +485,42 @@ typedef struct AV1_COMP {
 
   // For a still frame, this flag is set to 1 to skip partition search.
   int partition_search_skippable_frame;
-#if CONFIG_AMVR
   double csm_rate_array[32];
   double m_rate_array[32];
   int rate_size;
   int rate_index;
-  hash_table *previsou_hash_table;
-  int previsous_index;
+  hash_table *previous_hash_table;
+  int previous_index;
   int cur_poc;  // DebugInfo
-#endif
 
-  int scaled_ref_idx[TOTAL_REFS_PER_FRAME];
-#if CONFIG_EXT_REFS
-  int lst_fb_idxes[LAST_REF_FRAMES];
-#else
-  int lst_fb_idx;
-#endif  // CONFIG_EXT_REFS
-  int gld_fb_idx;
-#if CONFIG_EXT_REFS
-  int bwd_fb_idx;   // BWDREF_FRAME
-  int alt2_fb_idx;  // ALTREF2_FRAME
-#endif              // CONFIG_EXT_REFS
-  int alt_fb_idx;
-#if CONFIG_EXT_REFS
-  int ext_fb_idx;      // extra ref frame buffer index
+  int scaled_ref_idx[REF_FRAMES];
+  int ref_fb_idx[REF_FRAMES];
   int refresh_fb_idx;  // ref frame buffer index to refresh
-#endif                 // CONFIG_EXT_REFS
 
   int last_show_frame_buf_idx;  // last show frame buffer index
 
   int refresh_last_frame;
   int refresh_golden_frame;
-#if CONFIG_EXT_REFS
   int refresh_bwd_ref_frame;
   int refresh_alt2_ref_frame;
-#endif  // CONFIG_EXT_REFS
   int refresh_alt_ref_frame;
 
   int ext_refresh_frame_flags_pending;
   int ext_refresh_last_frame;
   int ext_refresh_golden_frame;
+  int ext_refresh_bwd_ref_frame;
+  int ext_refresh_alt2_ref_frame;
   int ext_refresh_alt_ref_frame;
 
   int ext_refresh_frame_context_pending;
   int ext_refresh_frame_context;
+  int ext_use_ref_frame_mvs;
+  int ext_use_error_resilient;
+  int ext_use_s_frame;
+  int ext_use_primary_ref_none;
 
   YV12_BUFFER_CONFIG last_frame_uf;
-#if CONFIG_LOOP_RESTORATION
-  YV12_BUFFER_CONFIG last_frame_db;
   YV12_BUFFER_CONFIG trial_frame_rst;
-  uint8_t *extra_rstbuf;  // Extra buffers used in restoration search
-  RestorationInfo rst_search[MAX_MB_PLANE];  // Used for encoder side search
-#endif                                       // CONFIG_LOOP_RESTORATION
 
   // Ambient reconstruction err target for force key frames
   int64_t ambient_err;
@@ -471,22 +529,17 @@ typedef struct AV1_COMP {
 
   CODING_CONTEXT coding_context;
 
-#if CONFIG_GLOBAL_MOTION
   int gmtype_cost[TRANS_TYPES];
-  int gmparams_cost[TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_GLOBAL_MOTION
+  int gmparams_cost[REF_FRAMES];
 
-  int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
-  int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
+  int nmv_costs[2][MV_VALS];
+  int nmv_costs_hp[2][MV_VALS];
 
   int64_t last_time_stamp_seen;
   int64_t last_end_time_stamp_seen;
   int64_t first_time_stamp_ever;
 
   RATE_CONTROL rc;
-#if CONFIG_XIPHRC
-  od_rc_state od_rc;
-#endif
   double framerate;
 
   // NOTE(zoeliu): Any inter frame allows maximum of REF_FRAMES inter
@@ -500,6 +553,8 @@ typedef struct AV1_COMP {
   int mbgraph_n_frames;  // number of frames filled in the above
   int static_mb_pct;     // % forced skip mbs by segmentation
   int ref_frame_flags;
+  int ext_ref_frame_flags;
+  RATE_FACTOR_LEVEL frame_rf_level[FRAME_BUFFERS];
 
   SPEED_FEATURES sf;
 
@@ -507,6 +562,7 @@ typedef struct AV1_COMP {
   int mv_step_param;
 
   int allow_comp_inter_inter;
+  int all_one_sided_refs;
 
   uint8_t *segmentation_map;
 
@@ -514,7 +570,6 @@ typedef struct AV1_COMP {
   ActiveMap active_map;
 
   fractional_mv_step_fp *find_fractional_mv_step;
-  av1_full_search_fn_t full_search_sad;  // It is currently unused.
   av1_diamond_search_fn_t diamond_search_sad;
   aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
   uint64_t time_receive_data;
@@ -581,8 +636,6 @@ typedef struct AV1_COMP {
   search_site_config ss_cfg;
 
   int multi_arf_allowed;
-  int multi_arf_enabled;
-  int multi_arf_last_grp_enabled;
 
   TileDataEnc *tile_data;
   int allocated_tiles;  // Keep track of memory allocated for tiles.
@@ -597,6 +650,11 @@ typedef struct AV1_COMP {
   int resize_buffer_underflow;
   int resize_count;
 
+  // Sequence parameters have been transmitted already and locked
+  // or not. Once locked av1_change_config cannot change the seq
+  // parameters.
+  int seq_params_locked;
+
   // VARIANCE_AQ segment map refresh
   int vaq_refresh;
 
@@ -604,11 +662,6 @@ typedef struct AV1_COMP {
   int num_workers;
   AVxWorker *workers;
   struct EncWorkerData *tile_thr_data;
-  AV1LfSync lf_row_sync;
-#if CONFIG_ANS
-  struct BufAnsCoder buf_ans;
-#endif
-#if CONFIG_EXT_REFS
   int refresh_frame_mask;
   int existing_fb_idx_to_show;
   int is_arf_filter_off[MAX_EXT_ARFS + 1];
@@ -616,22 +669,24 @@ typedef struct AV1_COMP {
   int arf_map[MAX_EXT_ARFS + 1];
   int arf_pos_in_gf[MAX_EXT_ARFS + 1];
   int arf_pos_for_ovrly[MAX_EXT_ARFS + 1];
-#endif  // CONFIG_EXT_REFS
-#if CONFIG_GLOBAL_MOTION
   int global_motion_search_done;
-#endif
-#if CONFIG_LV_MAP
   tran_low_t *tcoeff_buf[MAX_MB_PLANE];
-#endif
-
-#if CONFIG_EXT_REFS
   int extra_arf_allowed;
-  int bwd_ref_allowed;
-#endif  // CONFIG_EXT_REFS
+  // A flag to indicate if intrabc is ever used in current frame.
+  int intrabc_used;
+  int dv_cost[2][MV_VALS];
+  // TODO(huisu@google.com): we can update dv_joint_cost per SB.
+  int dv_joint_cost[MV_JOINTS];
+  int has_lossless_segment;
+
+  // For frame refs short signaling:
+  //   A mapping of each reference frame from its encoder side value to the
+  //   decoder side value obtained following the short signaling procedure.
+  int ref_conv[REF_FRAMES];
 
-#if CONFIG_BGSPRITE
-  int bgsprite_allowed;
-#endif  // CONFIG_BGSPRITE
+  AV1LfSync lf_row_sync;
+  AV1LrSync lr_row_sync;
+  AV1LrStruct lr_ctxt;
 } AV1_COMP;
 
 void av1_initialize_enc(void);
@@ -650,12 +705,17 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
 
 int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest, int64_t *time_stamp,
-                            int64_t *time_end, int flush);
+                            int64_t *time_end, int flush,
+                            const aom_rational_t *timebase);
 
 int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
 
 int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
 
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd);
+
 int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags);
 
 void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags);
@@ -675,6 +735,11 @@ int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
 
 int av1_get_quantizer(struct AV1_COMP *cpi);
 
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
+
+int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n);
+int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n);
+
 static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
   return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
          (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
@@ -682,22 +747,7 @@ static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
 
 static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi,
                                         MV_REFERENCE_FRAME ref_frame) {
-#if CONFIG_EXT_REFS
-  if (ref_frame >= LAST_FRAME && ref_frame <= LAST3_FRAME)
-    return cpi->lst_fb_idxes[ref_frame - 1];
-#else
-  if (ref_frame == LAST_FRAME) return cpi->lst_fb_idx;
-#endif  // CONFIG_EXT_REFS
-  else if (ref_frame == GOLDEN_FRAME)
-    return cpi->gld_fb_idx;
-#if CONFIG_EXT_REFS
-  else if (ref_frame == BWDREF_FRAME)
-    return cpi->bwd_fb_idx;
-  else if (ref_frame == ALTREF2_FRAME)
-    return cpi->alt2_fb_idx;
-#endif  // CONFIG_EXT_REFS
-  else
-    return cpi->alt_fb_idx;
+  return (ref_frame >= 1) ? cpi->ref_fb_idx[ref_frame - 1] : INVALID_IDX;
 }
 
 static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi,
@@ -707,16 +757,19 @@ static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi,
   return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
 }
 
-#if CONFIG_HASH_ME
-static INLINE hash_table *get_ref_frame_hash_map(const AV1_COMP *cpi,
-                                                 MV_REFERENCE_FRAME ref_frame) {
+// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
+static INLINE int av1_use_hash_me(const AV1_COMMON *const cm) {
+  return cm->allow_screen_content_tools;
+}
+
+static INLINE hash_table *av1_get_ref_frame_hash_map(
+    const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
   const AV1_COMMON *const cm = &cpi->common;
   const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
   return buf_idx != INVALID_IDX
              ? &cm->buffer_pool->frame_bufs[buf_idx].hash_table
              : NULL;
 }
-#endif
 
 static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
     const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
@@ -726,7 +779,6 @@ static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
                                 : NULL;
 }
 
-#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
 static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) {
   MV_REFERENCE_FRAME ref_frame;
   AV1_COMMON *const cm = &cpi->common;
@@ -737,48 +789,42 @@ static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) {
   }
   return (ref_frame <= ALTREF_FRAME);
 }
-#endif  // CONFIG_EXT_REFS
 
-static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) {
-  // We assume 3 planes all at full resolution. We assume up to 1 token per
-  // pixel, and then allow a head room of 1 EOSB token per 4x4 block per plane,
-  // plus EOSB_TOKEN per plane.
-  return mb_rows * mb_cols * (16 * 16 + 17) * 3;
+// Token buffer is only used for palette tokens.
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
+                                           int sb_size_log2,
+                                           const int num_planes) {
+  // Calculate the maximum number of max superblocks in the image.
+  const int shift = sb_size_log2 - 4;
+  const int sb_size = 1 << sb_size_log2;
+  const int sb_size_square = sb_size * sb_size;
+  const int sb_rows = ALIGN_POWER_OF_TWO(mb_rows, shift) >> shift;
+  const int sb_cols = ALIGN_POWER_OF_TWO(mb_cols, shift) >> shift;
+
+  // One palette token for each pixel. There can be palettes on two planes.
+  const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square;
+
+  return sb_rows * sb_cols * sb_palette_toks;
 }
 
 // Get the allocated token size for a tile. It does the same calculation as in
 // the frame token allocation.
-static INLINE unsigned int allocated_tokens(TileInfo tile) {
-#if CONFIG_CB4X4
+static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2,
+                                            int num_planes) {
   int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 2) >> 2;
   int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 2) >> 2;
-#else
-  int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1;
-  int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1;
-#endif
 
-  return get_token_alloc(tile_mb_rows, tile_mb_cols);
+  return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
 }
 
-#if CONFIG_TEMPMV_SIGNALING
-void av1_set_temporal_mv_prediction(AV1_COMP *cpi, int allow_tempmv_prediction);
-#endif
-
 void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
 
+#define ALT_MIN_LAG 3
 static INLINE int is_altref_enabled(const AV1_COMP *const cpi) {
-  return cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.enable_auto_arf;
+  return cpi->oxcf.lag_in_frames >= ALT_MIN_LAG && cpi->oxcf.enable_auto_arf;
 }
 
 // TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
-#if 0 && CONFIG_EXT_REFS
-static INLINE int is_bwdref_enabled(const AV1_COMP *const cpi) {
-  // NOTE(zoeliu): The enabling of bi-predictive frames depends on the use of
-  //               alt_ref, and now will be off when the alt_ref interval is
-  //               not sufficiently large.
-  return is_altref_enabled(cpi) && cpi->oxcf.enable_auto_brf;
-}
-#endif  // CONFIG_EXT_REFS
 
 static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                 MV_REFERENCE_FRAME ref0,
@@ -813,22 +859,14 @@ static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx,
   ubufs[new_uidx].ref_count++;
 }
 
-// Returns 1 if a frame is unscaled and 0 otherwise.
-static INLINE int av1_resize_unscaled(const AV1_COMMON *cm) {
-#if CONFIG_FRAME_SUPERRES
-  return cm->superres_upscaled_width == cm->render_width &&
-         cm->superres_upscaled_height == cm->render_height;
-#else
-  return cm->width == cm->render_width && cm->height == cm->render_height;
-#endif  // CONFIG_FRAME_SUPERRES
+// Returns 1 if a frame is scaled and 0 otherwise.
+static INLINE int av1_resize_scaled(const AV1_COMMON *cm) {
+  return !(cm->superres_upscaled_width == cm->render_width &&
+           cm->superres_upscaled_height == cm->render_height);
 }
 
-static INLINE int av1_frame_unscaled(const AV1_COMMON *cm) {
-#if CONFIG_FRAME_SUPERRES
-  return av1_superres_unscaled(cm) && av1_resize_unscaled(cm);
-#else
-  return av1_resize_unscaled(cm);
-#endif  // CONFIG_FRAME_SUPERRES
+static INLINE int av1_frame_scaled(const AV1_COMMON *cm) {
+  return !av1_superres_scaled(cm) && av1_resize_scaled(cm);
 }
 
 #ifdef __cplusplus
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
index 6209d6fa4..4d4802b46 100644
--- a/third_party/aom/av1/encoder/encodetxb.c
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -9,65 +9,81 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "av1/common/scan.h"
+#include "av1/encoder/encodetxb.h"
+
+#include "aom_ports/mem.h"
 #include "av1/common/blockd.h"
 #include "av1/common/idct.h"
 #include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
 #include "av1/encoder/bitstream.h"
-#include "av1/encoder/encodeframe.h"
 #include "av1/encoder/cost.h"
-#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/hash.h"
 #include "av1/encoder/rdopt.h"
-#include "av1/encoder/subexp.h"
 #include "av1/encoder/tokenize.h"
 
-#define TEST_OPTIMIZE_TXB 0
+static int hbt_needs_init = 1;
+static CRC32C crc_calculator;
+static const int HBT_EOB = 16;            // also the length in opt_qcoeff
+static const int HBT_TABLE_SIZE = 65536;  // 16 bit: holds 65536 'arrays'
+static const int HBT_ARRAY_LENGTH = 256;  // 8 bit: 256 entries
+// If removed in hbt_create_hashes or increased beyond int8_t, widen deltas type
+static const int HBT_KICKOUT = 3;
+
+typedef struct OptTxbQcoeff {
+  // Use larger type if larger/no kickout value is used in hbt_create_hashes
+  int8_t deltas[16];
+  uint32_t hbt_qc_hash;
+  uint32_t hbt_ctx_hash;
+  int init;
+  int rate_cost;
+} OptTxbQcoeff;
+
+OptTxbQcoeff *hbt_hash_table;
+
+typedef struct LevelDownStats {
+  int update;
+  tran_low_t low_qc;
+  tran_low_t low_dqc;
+  int64_t dist0;
+  int rate;
+  int rate_low;
+  int64_t dist;
+  int64_t dist_low;
+  int64_t rd;
+  int64_t rd_low;
+  int64_t nz_rd;
+  int64_t rd_diff;
+  int cost_diff;
+  int64_t dist_diff;
+  int new_eob;
+} LevelDownStats;
 
 void av1_alloc_txb_buf(AV1_COMP *cpi) {
-#if 0
-  AV1_COMMON *cm = &cpi->common;
-  int mi_block_size = 1 << MI_SIZE_LOG2;
-  // TODO(angiebird): Make sure cm->subsampling_x/y is set correctly, and then
-  // use precise buffer size according to cm->subsampling_x/y
-  int pixel_stride = mi_block_size * cm->mi_cols;
-  int pixel_height = mi_block_size * cm->mi_rows;
-  int i;
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    CHECK_MEM_ERROR(
-        cm, cpi->tcoeff_buf[i],
-        aom_malloc(sizeof(*cpi->tcoeff_buf[i]) * pixel_stride * pixel_height));
-  }
-#else
   AV1_COMMON *cm = &cpi->common;
-  int size = ((cm->mi_rows >> MAX_MIB_SIZE_LOG2) + 1) *
-             ((cm->mi_cols >> MAX_MIB_SIZE_LOG2) + 1);
+  int size = ((cm->mi_rows >> cm->seq_params.mib_size_log2) + 1) *
+             ((cm->mi_cols >> cm->seq_params.mib_size_log2) + 1);
 
   av1_free_txb_buf(cpi);
   // TODO(jingning): This should be further reduced.
   CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
-                  aom_malloc(sizeof(*cpi->coeff_buffer_base) * size));
-#endif
+                  aom_memalign(32, sizeof(*cpi->coeff_buffer_base) * size));
 }
 
-void av1_free_txb_buf(AV1_COMP *cpi) {
-#if 0
-  int i;
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    aom_free(cpi->tcoeff_buf[i]);
-  }
-#else
-  aom_free(cpi->coeff_buffer_base);
-#endif
-}
+void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); }
 
 void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
                           int mi_row, int mi_col) {
-  int stride = (cpi->common.mi_cols >> MAX_MIB_SIZE_LOG2) + 1;
-  int offset =
-      (mi_row >> MAX_MIB_SIZE_LOG2) * stride + (mi_col >> MAX_MIB_SIZE_LOG2);
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  int mib_size_log2 = cm->seq_params.mib_size_log2;
+  int stride = (cm->mi_cols >> mib_size_log2) + 1;
+  int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
   CB_COEFF_BUFFER *coeff_buf = &cpi->coeff_buffer_base[offset];
   const int txb_offset = x->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
-  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+  assert(x->cb_offset < (1 << num_pels_log2_lookup[cm->seq_params.sb_size]));
+  for (int plane = 0; plane < num_planes; ++plane) {
     x->mbmi_ext->tcoeff[plane] = coeff_buf->tcoeff[plane] + x->cb_offset;
     x->mbmi_ext->eobs[plane] = coeff_buf->eobs[plane] + txb_offset;
     x->mbmi_ext->txb_skip_ctx[plane] =
@@ -93,435 +109,147 @@ static void write_golomb(aom_writer *w, int level) {
   for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01);
 }
 
-static INLINE void write_nz_map(aom_writer *w, const tran_low_t *tcoeff,
-                                uint16_t eob, int plane, const int16_t *scan,
-                                TX_SIZE tx_size, TX_TYPE tx_type,
-                                FRAME_CONTEXT *fc) {
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int height = tx_size_high[tx_size];
-#if CONFIG_CTX1D
-  const int width = tx_size_wide[tx_size];
-  const int eob_offset = width + height;
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int seg_eob =
-      (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset;
-#else
-  const int seg_eob = tx_size_2d[tx_size];
-#endif
-#if !LV_MAP_PROB
-  aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type];
-  aom_prob *eob_flag = fc->eob_flag[txs_ctx][plane_type];
-#endif
-
-  for (int c = 0; c < eob; ++c) {
-    int coeff_ctx = get_nz_map_ctx(tcoeff, c, scan, bwl, height, tx_type);
-    int eob_ctx = get_eob_ctx(tcoeff, scan[c], txs_ctx, tx_type);
-
-    tran_low_t v = tcoeff[scan[c]];
-    int is_nz = (v != 0);
-
-    if (c == seg_eob - 1) break;
-
-#if LV_MAP_PROB
-    aom_write_bin(w, is_nz, fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2);
-#else
-    aom_write(w, is_nz, nz_map[coeff_ctx]);
-#endif
-
-    if (is_nz) {
-#if LV_MAP_PROB
-      aom_write_bin(w, c == (eob - 1),
-                    fc->eob_flag_cdf[txs_ctx][plane_type][eob_ctx], 2);
-#else
-      aom_write(w, c == (eob - 1), eob_flag[eob_ctx]);
-#endif
-    }
+static INLINE tran_low_t get_lower_coeff(tran_low_t qc) {
+  if (qc == 0) {
+    return 0;
   }
+  return qc > 0 ? qc - 1 : qc + 1;
 }
 
-#if CONFIG_CTX1D
-static INLINE void write_nz_map_vert(aom_writer *w, const tran_low_t *tcoeff,
-                                     uint16_t eob, int plane,
-                                     const int16_t *scan, const int16_t *iscan,
-                                     TX_SIZE tx_size, TX_TYPE tx_type,
-                                     FRAME_CONTEXT *fc) {
-  (void)eob;
-  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  int16_t eob_ls[MAX_HVTX_SIZE];
-  get_eob_vert(eob_ls, tcoeff, width, height);
-#if !LV_MAP_PROB
-  aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type];
-#endif
-  for (int c = 0; c < width; ++c) {
-    int16_t veob = eob_ls[c];
-    assert(veob <= height);
-    int el_ctx = get_empty_line_ctx(c, eob_ls);
-#if LV_MAP_PROB
-    aom_write_bin(w, veob == 0,
-                  fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2);
-#else
-    aom_write(w, veob == 0,
-              fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx]);
-#endif
-    if (veob) {
-      for (int r = 0; r < veob; ++r) {
-        if (r + 1 != height) {
-          int coeff_idx = r * width + c;
-          int scan_idx = iscan[coeff_idx];
-          int is_nz = tcoeff[coeff_idx] != 0;
-          int coeff_ctx =
-              get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type);
-#if LV_MAP_PROB
-          aom_write_bin(w, is_nz,
-                        fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2);
-#else
-          aom_write(w, is_nz, nz_map[coeff_ctx]);
-#endif
-          if (is_nz) {
-            int eob_ctx = get_hv_eob_ctx(c, r, eob_ls);
-#if LV_MAP_PROB
-            aom_write_bin(
-                w, r == veob - 1,
-                fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2);
-#else
-            aom_write(w, r == veob - 1,
-                      fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]);
-#endif
-          }
-        }
-      }
-    }
-  }
+static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int coeff_idx,
+                                           int dqv, int shift,
+                                           const qm_val_t *iqmatrix) {
+  int sign = qc < 0 ? -1 : 1;
+  if (iqmatrix != NULL)
+    dqv =
+        ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+  return sign * ((abs(qc) * dqv) >> shift);
 }
 
-static INLINE void write_nz_map_horiz(aom_writer *w, const tran_low_t *tcoeff,
-                                      uint16_t eob, int plane,
-                                      const int16_t *scan, const int16_t *iscan,
-                                      TX_SIZE tx_size, TX_TYPE tx_type,
-                                      FRAME_CONTEXT *fc) {
-  (void)scan;
-  (void)eob;
-  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  int16_t eob_ls[MAX_HVTX_SIZE];
-  get_eob_horiz(eob_ls, tcoeff, width, height);
-#if !LV_MAP_PROB
-  aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type];
-#endif
-  for (int r = 0; r < height; ++r) {
-    int16_t heob = eob_ls[r];
-    int el_ctx = get_empty_line_ctx(r, eob_ls);
-#if LV_MAP_PROB
-    aom_write_bin(w, heob == 0,
-                  fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2);
-#else
-    aom_write(w, heob == 0,
-              fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx]);
-#endif
-    if (heob) {
-      for (int c = 0; c < heob; ++c) {
-        if (c + 1 != width) {
-          int coeff_idx = r * width + c;
-          int scan_idx = iscan[coeff_idx];
-          int is_nz = tcoeff[coeff_idx] != 0;
-          int coeff_ctx =
-              get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type);
-#if LV_MAP_PROB
-          aom_write_bin(w, is_nz,
-                        fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2);
-#else
-          aom_write(w, is_nz, nz_map[coeff_ctx]);
-#endif
-          if (is_nz) {
-            int eob_ctx = get_hv_eob_ctx(r, c, eob_ls);
-#if LV_MAP_PROB
-            aom_write_bin(
-                w, c == heob - 1,
-                fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2);
-#else
-            aom_write(w, c == heob - 1,
-                      fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]);
-#endif
-          }
-        }
-      }
-    }
-  }
+static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
+                                     int shift) {
+  const int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
+  const int64_t error = diff * diff;
+  return error;
 }
-#endif
-
-void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
-                          aom_writer *w, int blk_row, int blk_col, int block,
-                          int plane, TX_SIZE tx_size, const tran_low_t *tcoeff,
-                          uint16_t eob, TXB_CTX *txb_ctx) {
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-  const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-  const int16_t *scan = scan_order->scan;
-  int c;
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int height = tx_size_high[tx_size];
-  uint16_t update_eob = 0;
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
-  (void)blk_row;
-  (void)blk_col;
-
-#if LV_MAP_PROB
-  aom_write_bin(w, eob == 0,
-                ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
+#if CONFIG_ENTROPY_STATS
+void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size,
+                            TX_CLASS tx_class, PLANE_TYPE plane,
+                            FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts,
+                            uint8_t allow_update_cdf) {
 #else
-  aom_write(w, eob == 0, ec_ctx->txb_skip[txs_ctx][txb_ctx->txb_skip_ctx]);
+void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+                            PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx,
+                            uint8_t allow_update_cdf) {
 #endif
+  int eob_extra;
+  const int eob_pt = get_eob_pos_token(eob, &eob_extra);
+  TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
 
-  if (eob == 0) return;
-#if CONFIG_TXK_SEL
-  av1_write_tx_type(cm, xd, blk_row, blk_col, block, plane,
-                    get_min_tx_size(tx_size), w);
-#endif
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
 
-#if CONFIG_CTX1D
-  TX_CLASS tx_class = get_tx_class(tx_type);
-  if (tx_class == TX_CLASS_2D) {
-    write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx);
-  } else {
-    const int width = tx_size_wide[tx_size];
-    const int eob_offset = width + height;
-    const int eob_mode = eob > eob_offset;
-#if LV_MAP_PROB
-    aom_write_bin(w, eob_mode,
-                  ec_ctx->eob_mode_cdf[txs_ctx][plane_type][tx_class], 2);
-#else
-    aom_write(w, eob_mode, ec_ctx->eob_mode[txs_ctx][plane_type][tx_class]);
+  switch (eob_multi_size) {
+    case 0:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
 #endif
-    if (eob_mode == 0) {
-      write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx);
-    } else {
-      const int16_t *iscan = scan_order->iscan;
-      assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ);
-      if (tx_class == TX_CLASS_VERT)
-        write_nz_map_vert(w, tcoeff, eob, plane, scan, iscan, tx_size, tx_type,
-                          ec_ctx);
-      else
-        write_nz_map_horiz(w, tcoeff, eob, plane, scan, iscan, tx_size, tx_type,
-                           ec_ctx);
-    }
-  }
-#else
-  write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx);
-#endif  // CONFIG_CTX1D
-
-  int i;
-  for (i = 0; i < NUM_BASE_LEVELS; ++i) {
-#if !LV_MAP_PROB
-    aom_prob *coeff_base = ec_ctx->coeff_base[txs_ctx][plane_type][i];
+      if (allow_update_cdf)
+        update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5);
+      break;
+    case 1:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
 #endif
-    update_eob = 0;
-    for (c = eob - 1; c >= 0; --c) {
-      tran_low_t v = tcoeff[scan[c]];
-      tran_low_t level = abs(v);
-      int sign = (v < 0) ? 1 : 0;
-      int ctx;
-
-      if (level <= i) continue;
-
-      ctx = get_base_ctx(tcoeff, scan[c], bwl, height, i + 1);
-
-      if (level == i + 1) {
-#if LV_MAP_PROB
-        aom_write_bin(w, 1, ec_ctx->coeff_base_cdf[txs_ctx][plane_type][i][ctx],
-                      2);
-#else
-        aom_write(w, 1, coeff_base[ctx]);
+      if (allow_update_cdf)
+        update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6);
+      break;
+    case 2:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
 #endif
-        if (c == 0) {
-#if LV_MAP_PROB
-          aom_write_bin(w, sign,
-                        ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx],
-                        2);
-#else
-          aom_write(w, sign, ec_ctx->dc_sign[plane_type][txb_ctx->dc_sign_ctx]);
+      if (allow_update_cdf)
+        update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7);
+      break;
+    case 3:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
 #endif
-        } else {
-          aom_write_bit(w, sign);
-        }
-        continue;
+      if (allow_update_cdf) {
+        update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1,
+                   8);
       }
-
-#if LV_MAP_PROB
-      aom_write_bin(w, 0, ec_ctx->coeff_base_cdf[txs_ctx][plane_type][i][ctx],
-                    2);
-#else
-      aom_write(w, 0, coeff_base[ctx]);
-#endif
-      update_eob = AOMMAX(update_eob, c);
-    }
-  }
-
-  for (c = update_eob; c >= 0; --c) {
-    tran_low_t v = tcoeff[scan[c]];
-    tran_low_t level = abs(v);
-    int sign = (v < 0) ? 1 : 0;
-    int idx;
-    int ctx;
-
-    if (level <= NUM_BASE_LEVELS) continue;
-
-    if (c == 0) {
-#if LV_MAP_PROB
-      aom_write_bin(w, sign,
-                    ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], 2);
-#else
-      aom_write(w, sign, ec_ctx->dc_sign[plane_type][txb_ctx->dc_sign_ctx]);
+      break;
+    case 4:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
 #endif
-    } else {
-      aom_write_bit(w, sign);
-    }
-
-    // level is above 1.
-    ctx = get_br_ctx(tcoeff, scan[c], bwl, height);
-
-#if BR_NODE
-    int base_range = level - 1 - NUM_BASE_LEVELS;
-    int br_set_idx = 0;
-    int br_base = 0;
-    int br_offset = 0;
-
-    if (base_range >= COEFF_BASE_RANGE)
-      br_set_idx = BASE_RANGE_SETS;
-    else
-      br_set_idx = coeff_to_br_index[base_range];
-
-    for (idx = 0; idx < BASE_RANGE_SETS; ++idx) {
-      aom_write_bin(w, idx == br_set_idx,
-                    ec_ctx->coeff_br_cdf[txs_ctx][plane_type][idx][ctx], 2);
-      if (idx == br_set_idx) {
-        br_base = br_index_to_coeff[br_set_idx];
-        br_offset = base_range - br_base;
-        int extra_bits = (1 << br_extra_bits[idx]) - 1;
-        for (int tok = 0; tok < extra_bits; ++tok) {
-          if (tok == br_offset) {
-            aom_write_bin(w, 1, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx],
-                          2);
-            break;
-          }
-          aom_write_bin(w, 0, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx],
-                        2);
-        }
-        //        aom_write_literal(w, br_offset, br_extra_bits[idx]);
-        break;
+      if (allow_update_cdf) {
+        update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1,
+                   9);
       }
-    }
-
-    if (br_set_idx < BASE_RANGE_SETS) continue;
-#else  // BR_NODE
-    for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
-      if (level == (idx + 1 + NUM_BASE_LEVELS)) {
-#if LV_MAP_PROB
-        aom_write_bin(w, 1, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], 2);
-#else
-        aom_write(w, 1, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx]);
+      break;
+    case 5:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
 #endif
-        break;
+      if (allow_update_cdf) {
+        update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1,
+                   10);
       }
-#if LV_MAP_PROB
-      aom_write_bin(w, 0, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], 2);
-#else
-      aom_write(w, 0, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx]);
-#endif
-    }
-    if (idx < COEFF_BASE_RANGE) continue;
-#endif  // BR_NODE
-
-    // use 0-th order Golomb code to handle the residual level.
-    write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS);
+      break;
+    case 6:
+    default:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+      if (allow_update_cdf) {
+        update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1,
+                   11);
+      }
+      break;
   }
-}
-
-void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
-                         aom_writer *w, int plane) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  BLOCK_SIZE bsize = mbmi->sb_type;
-  struct macroblockd_plane *pd = &xd->plane[plane];
 
-#if CONFIG_CHROMA_SUB8X8
-  const BLOCK_SIZE plane_bsize =
-      AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#elif CONFIG_CB4X4
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#else
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
-#endif
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
-  const int bkw = tx_size_wide_unit[tx_size];
-  const int bkh = tx_size_high_unit[tx_size];
-  const int step = tx_size_wide_unit[tx_size] * tx_size_high_unit[tx_size];
-  int row, col;
-  int block = 0;
-  for (row = 0; row < max_blocks_high; row += bkh) {
-    for (col = 0; col < max_blocks_wide; col += bkw) {
-      tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
-      uint16_t eob = x->mbmi_ext->eobs[plane][block];
-      TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
-                          x->mbmi_ext->dc_sign_ctx[plane][block] };
-      av1_write_coeffs_txb(cm, xd, w, row, col, block, plane, tx_size, tcoeff,
-                           eob, &txb_ctx);
-      block += step;
-    }
+  if (k_eob_offset_bits[eob_pt] > 0) {
+    int eob_ctx = eob_pt - 3;
+    int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+    int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+#if CONFIG_ENTROPY_STATS
+    counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++;
+#endif  // CONFIG_ENTROPY_STATS
+    if (allow_update_cdf)
+      update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2);
   }
 }
 
-static INLINE void get_base_ctx_set(const tran_low_t *tcoeffs,
-                                    int c,  // raster order
-                                    const int bwl, const int height,
-                                    int ctx_set[NUM_BASE_LEVELS]) {
-  const int row = c >> bwl;
-  const int col = c - (row << bwl);
-  const int stride = 1 << bwl;
-  int mag[NUM_BASE_LEVELS] = { 0 };
-  int idx;
-  tran_low_t abs_coeff;
-  int i;
-
-  for (idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) {
-    int ref_row = row + base_ref_offset[idx][0];
-    int ref_col = col + base_ref_offset[idx][1];
-    int pos = (ref_row << bwl) + ref_col;
-
-    if (ref_row < 0 || ref_col < 0 || ref_row >= height || ref_col >= stride)
-      continue;
-
-    abs_coeff = abs(tcoeffs[pos]);
-
-    for (i = 0; i < NUM_BASE_LEVELS; ++i) {
-      ctx_set[i] += abs_coeff > i;
-      if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0)
-        mag[i] |= abs_coeff > (i + 1);
-    }
-  }
+static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs,
+                        const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) {
+  int eob_extra;
+  const int eob_pt = get_eob_pos_token(eob, &eob_extra);
+  int eob_cost = 0;
+  const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+  eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1];
+
+  if (k_eob_offset_bits[eob_pt] > 0) {
+    const int eob_ctx = eob_pt - 3;
+    const int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+    const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+    eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit];
+    const int offset_bits = k_eob_offset_bits[eob_pt];
+    if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1);
+  }
+  return eob_cost;
+}
 
-  for (i = 0; i < NUM_BASE_LEVELS; ++i) {
-    ctx_set[i] = get_base_ctx_from_count_mag(row, col, ctx_set[i], mag[i]);
+static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
+                                    const int (*dc_sign_cost)[2],
+                                    int dc_sign_ctx) {
+  if (coeff_idx == 0) {
+    const int sign = (qc < 0) ? 1 : 0;
+    return dc_sign_cost[dc_sign_ctx][sign];
   }
-  return;
+  return av1_cost_literal(1);
 }
 
 static INLINE int get_br_cost(tran_low_t abs_qc, int ctx,
@@ -530,1440 +258,1522 @@ static INLINE int get_br_cost(tran_low_t abs_qc, int ctx,
   const tran_low_t max_level = 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE;
   (void)ctx;
   if (abs_qc >= min_level) {
-#if BR_NODE
-    if (abs_qc >= max_level)
+    if (abs_qc >= max_level) {
       return coeff_lps[COEFF_BASE_RANGE];  // COEFF_BASE_RANGE * cost0;
-    else
+    } else {
       return coeff_lps[(abs_qc - min_level)];  //  * cost0 + cost1;
-#else
-    const int cost0 = coeff_lps[0];
-    const int cost1 = coeff_lps[1];
-    if (abs_qc >= max_level)
-      return COEFF_BASE_RANGE * cost0;
-    else
-      return (abs_qc - min_level) * cost0 + cost1;
-#endif
-  } else {
-    return 0;
+    }
   }
+  return 0;
 }
 
-static INLINE int get_base_cost(tran_low_t abs_qc, int ctx,
-                                const int coeff_base[2], int base_idx) {
-  const int level = base_idx + 1;
-  (void)ctx;
-  if (abs_qc < level)
-    return 0;
-  else
-    return coeff_base[abs_qc == level];
-}
-
-int get_nz_eob_map_cost(const LV_MAP_COEFF_COST *coeff_costs,
-                        const tran_low_t *qcoeff, uint16_t eob, int plane,
-                        const int16_t *scan, TX_SIZE tx_size, TX_TYPE tx_type) {
-  (void)plane;
-  TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int height = tx_size_high[tx_size];
-#if CONFIG_CTX1D
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int width = tx_size_wide[tx_size];
-  const int eob_offset = width + height;
-  const int seg_eob =
-      (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset;
-#else
-  const int seg_eob = tx_size_2d[tx_size];
-#endif
-  int cost = 0;
-  for (int c = 0; c < eob; ++c) {
-    tran_low_t v = qcoeff[scan[c]];
-    int is_nz = (v != 0);
-    if (c + 1 != seg_eob) {
-      int coeff_ctx = get_nz_map_ctx(qcoeff, c, scan, bwl, height, tx_type);
-      cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz];
-      if (is_nz) {
-        int eob_ctx = get_eob_ctx(qcoeff, scan[c], txs_ctx, tx_type);
-        cost += coeff_costs->eob_cost[eob_ctx][c == (eob - 1)];
-      }
-    }
+static INLINE int get_golomb_cost(int abs_qc) {
+  if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
+    const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+    const int length = get_msb(r) + 1;
+    return av1_cost_literal(2 * length - 1);
   }
-  return cost;
+  return 0;
 }
 
-#if CONFIG_CTX1D
-static INLINE int get_nz_eob_map_cost_vert(const LV_MAP_COEFF_COST *coeff_costs,
-                                           const tran_low_t *qcoeff,
-                                           uint16_t eob, int plane,
-                                           const int16_t *scan,
-                                           const int16_t *iscan,
-                                           TX_SIZE tx_size, TX_TYPE tx_type) {
-  (void)tx_size;
-  (void)scan;
-  (void)eob;
-  (void)plane;
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  int16_t eob_ls[MAX_HVTX_SIZE];
-  get_eob_vert(eob_ls, qcoeff, width, height);
+static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
+                          const int is_eob, const TxbInfo *const txb_info,
+                          const LV_MAP_COEFF_COST *const txb_costs,
+                          const int coeff_ctx, const TX_CLASS tx_class) {
+  const TXB_CTX *const txb_ctx = txb_info->txb_ctx;
+  const int is_nz = (qc != 0);
+  const tran_low_t abs_qc = abs(qc);
   int cost = 0;
-  for (int c = 0; c < width; ++c) {
-    int16_t veob = eob_ls[c];
-    assert(veob <= height);
-    int el_ctx = get_empty_line_ctx(c, eob_ls);
-    cost += coeff_costs->empty_line_cost[tx_class][el_ctx][veob == 0];
-    if (veob) {
-      for (int r = 0; r < veob; ++r) {
-        if (r + 1 != height) {
-          int coeff_idx = r * width + c;
-          int scan_idx = iscan[coeff_idx];
-          int is_nz = qcoeff[coeff_idx] != 0;
-          int coeff_ctx =
-              get_nz_map_ctx(qcoeff, scan_idx, scan, bwl, height, tx_type);
-          cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz];
-          if (is_nz) {
-            int eob_ctx = get_hv_eob_ctx(c, r, eob_ls);
-            cost += coeff_costs->hv_eob_cost[tx_class][eob_ctx][r == veob - 1];
-          }
-        }
-      }
-    }
+  const int16_t *const scan = txb_info->scan_order->scan;
+  const int pos = scan[scan_idx];
+
+  if (is_eob) {
+    cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+  } else {
+    cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
   }
-  return cost;
-}
+  if (is_nz) {
+    cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost,
+                              txb_ctx->dc_sign_ctx);
 
-static INLINE int get_nz_eob_map_cost_horiz(
-    const LV_MAP_COEFF_COST *coeff_costs, const tran_low_t *qcoeff,
-    uint16_t eob, int plane, const int16_t *scan, const int16_t *iscan,
-    TX_SIZE tx_size, TX_TYPE tx_type) {
-  (void)tx_size;
-  (void)scan;
-  (void)eob;
-  (void)plane;
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  int16_t eob_ls[MAX_HVTX_SIZE];
-  get_eob_horiz(eob_ls, qcoeff, width, height);
-  int cost = 0;
-  for (int r = 0; r < height; ++r) {
-    int16_t heob = eob_ls[r];
-    assert(heob <= width);
-    int el_ctx = get_empty_line_ctx(r, eob_ls);
-    cost += coeff_costs->empty_line_cost[tx_class][el_ctx][heob == 0];
-    if (heob) {
-      for (int c = 0; c < heob; ++c) {
-        if (c + 1 != width) {
-          int coeff_idx = r * width + c;
-          int scan_idx = iscan[coeff_idx];
-          int is_nz = qcoeff[coeff_idx] != 0;
-          int coeff_ctx =
-              get_nz_map_ctx(qcoeff, scan_idx, scan, bwl, height, tx_type);
-          cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz];
-          if (is_nz) {
-            int eob_ctx = get_hv_eob_ctx(r, c, eob_ls);
-            cost += coeff_costs->hv_eob_cost[tx_class][eob_ctx][c == heob - 1];
-          }
-        }
-      }
+    if (abs_qc > NUM_BASE_LEVELS) {
+      const int ctx =
+          get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class);
+      cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]);
+      cost += get_golomb_cost(abs_qc);
     }
   }
   return cost;
 }
-#endif
 
-int av1_cost_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
-                        int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                        TXB_CTX *txb_ctx) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const struct macroblock_plane *p = &x->plane[plane];
-  const int eob = p->eobs[block];
-  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int c, cost;
-  int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+static INLINE int get_nz_map_ctx(const uint8_t *const levels,
+                                 const int coeff_idx, const int bwl,
+                                 const int height, const int scan_idx,
+                                 const int is_eob, const TX_SIZE tx_size,
+                                 const TX_CLASS tx_class) {
+  if (is_eob) {
+    if (scan_idx == 0) return 0;
+    if (scan_idx <= (height << bwl) / 8) return 1;
+    if (scan_idx <= (height << bwl) / 4) return 2;
+    return 3;
+  }
+  const int stats =
+      get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class);
+  return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class);
+}
 
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int height = tx_size_high[tx_size];
+static void get_dist_cost_stats(LevelDownStats *const stats, const int scan_idx,
+                                const int is_eob,
+                                const LV_MAP_COEFF_COST *const txb_costs,
+                                const TxbInfo *const txb_info,
+                                const TX_CLASS tx_class) {
+  const int16_t *const scan = txb_info->scan_order->scan;
+  const int coeff_idx = scan[scan_idx];
+  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
+  const uint8_t *const levels = txb_info->levels;
+  stats->new_eob = -1;
+  stats->update = 0;
+  stats->rd_low = 0;
+  stats->rd = 0;
+  stats->nz_rd = 0;
+  stats->dist_low = 0;
+  stats->rate_low = 0;
+  stats->low_qc = 0;
 
-  const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-  const int16_t *scan = scan_order->scan;
+  const tran_low_t tqc = txb_info->tcoeff[coeff_idx];
+  const int dqv = txb_info->dequant[coeff_idx != 0];
+  const int coeff_ctx =
+      get_nz_map_ctx(levels, coeff_idx, txb_info->bwl, txb_info->height,
+                     scan_idx, is_eob, txb_info->tx_size, tx_class);
+  const int qc_cost = get_coeff_cost(qc, scan_idx, is_eob, txb_info, txb_costs,
+                                     coeff_ctx, tx_class);
+  assert(qc != 0);
+  const tran_low_t dqc = qcoeff_to_dqcoeff(qc, coeff_idx, dqv, txb_info->shift,
+                                           txb_info->iqmatrix);
+  const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift);
 
-  LV_MAP_COEFF_COST *coeff_costs = &x->coeff_costs[txs_ctx][plane_type];
+  // distortion difference when coefficient is quantized to 0
+  const tran_low_t dqc0 =
+      qcoeff_to_dqcoeff(0, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
 
-  cost = 0;
+  stats->dist0 = get_coeff_dist(tqc, dqc0, txb_info->shift);
+  stats->dist = dqc_dist - stats->dist0;
+  stats->rate = qc_cost;
 
-  if (eob == 0) {
-    cost = coeff_costs->txb_skip_cost[txb_skip_ctx][1];
-    return cost;
-  }
-  cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+  stats->rd = RDCOST(txb_info->rdmult, stats->rate, stats->dist);
 
-#if CONFIG_TXK_SEL
-  cost += av1_tx_type_cost(cm, x, xd, mbmi->sb_type, plane, tx_size, tx_type);
-#endif
+  stats->low_qc = get_lower_coeff(qc);
 
-#if CONFIG_CTX1D
-  TX_CLASS tx_class = get_tx_class(tx_type);
-  if (tx_class == TX_CLASS_2D) {
-    cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan, tx_size,
-                                tx_type);
+  if (is_eob && stats->low_qc == 0) {
+    stats->rd_low = stats->rd;  // disable selection of low_qc in this case.
   } else {
-    const int width = tx_size_wide[tx_size];
-    const int eob_offset = width + height;
-    const int eob_mode = eob > eob_offset;
-    cost += coeff_costs->eob_mode_cost[tx_class][eob_mode];
-    if (eob_mode == 0) {
-      cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan,
-                                  tx_size, tx_type);
+    if (stats->low_qc == 0) {
+      stats->dist_low = 0;
     } else {
-      const int16_t *iscan = scan_order->iscan;
-      assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ);
-      if (tx_class == TX_CLASS_VERT)
-        cost += get_nz_eob_map_cost_vert(coeff_costs, qcoeff, eob, plane, scan,
-                                         iscan, tx_size, tx_type);
-      else
-        cost += get_nz_eob_map_cost_horiz(coeff_costs, qcoeff, eob, plane, scan,
-                                          iscan, tx_size, tx_type);
+      stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, coeff_idx, dqv,
+                                         txb_info->shift, txb_info->iqmatrix);
+      const int64_t low_dqc_dist =
+          get_coeff_dist(tqc, stats->low_dqc, txb_info->shift);
+      stats->dist_low = low_dqc_dist - stats->dist0;
     }
+    const int low_qc_cost =
+        get_coeff_cost(stats->low_qc, scan_idx, is_eob, txb_info, txb_costs,
+                       coeff_ctx, tx_class);
+    stats->rate_low = low_qc_cost;
+    stats->rd_low = RDCOST(txb_info->rdmult, stats->rate_low, stats->dist_low);
   }
-#else   // CONFIG_CTX1D
-  cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan, tx_size,
-                              tx_type);
-#endif  // CONFIG_CTX1D
-
-  for (c = 0; c < eob; ++c) {
-    tran_low_t v = qcoeff[scan[c]];
-    int is_nz = (v != 0);
-    int level = abs(v);
-
-    if (is_nz) {
-      int ctx_ls[NUM_BASE_LEVELS] = { 0 };
-      int sign = (v < 0) ? 1 : 0;
-
-      // sign bit cost
-      if (c == 0) {
-        int dc_sign_ctx = txb_ctx->dc_sign_ctx;
-        cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign];
-      } else {
-        cost += av1_cost_bit(128, sign);
-      }
-
-      get_base_ctx_set(qcoeff, scan[c], bwl, height, ctx_ls);
-
-      int i;
-      for (i = 0; i < NUM_BASE_LEVELS; ++i) {
-        if (level <= i) continue;
-
-        if (level == i + 1) {
-          cost += coeff_costs->base_cost[i][ctx_ls[i]][1];
-          continue;
-        }
-        cost += coeff_costs->base_cost[i][ctx_ls[i]][0];
-      }
-
-      if (level > NUM_BASE_LEVELS) {
-        int ctx;
-        ctx = get_br_ctx(qcoeff, scan[c], bwl, height);
-#if BR_NODE
-        int base_range = level - 1 - NUM_BASE_LEVELS;
-        if (base_range < COEFF_BASE_RANGE) {
-          cost += coeff_costs->lps_cost[ctx][base_range];
-        } else {
-          cost += coeff_costs->lps_cost[ctx][COEFF_BASE_RANGE];
-        }
-
-#else
-        for (int idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
-          if (level == (idx + 1 + NUM_BASE_LEVELS)) {
-            cost += coeff_costs->lps_cost[ctx][1];
-            break;
-          }
-          cost += coeff_costs->lps_cost[ctx][0];
-        }
-#endif
-        if (level >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
-          // residual cost
-          int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
-          int ri = r;
-          int length = 0;
-
-          while (ri) {
-            ri >>= 1;
-            ++length;
-          }
-
-          for (ri = 0; ri < length - 1; ++ri) cost += av1_cost_bit(128, 0);
+}
 
-          for (ri = length - 1; ri >= 0; --ri)
-            cost += av1_cost_bit(128, (r >> ri) & 0x01);
-        }
-      }
-    }
-  }
+static void get_dist_cost_stats_with_eob(
+    LevelDownStats *const stats, const int scan_idx,
+    const LV_MAP_COEFF_COST *const txb_costs, const TxbInfo *const txb_info,
+    const TX_CLASS tx_class) {
+  const int is_eob = 0;
+  get_dist_cost_stats(stats, scan_idx, is_eob, txb_costs, txb_info, tx_class);
 
-  return cost;
+  const int16_t *const scan = txb_info->scan_order->scan;
+  const int coeff_idx = scan[scan_idx];
+  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
+  const int coeff_ctx_temp = get_nz_map_ctx(
+      txb_info->levels, coeff_idx, txb_info->bwl, txb_info->height, scan_idx, 1,
+      txb_info->tx_size, tx_class);
+  const int qc_eob_cost = get_coeff_cost(qc, scan_idx, 1, txb_info, txb_costs,
+                                         coeff_ctx_temp, tx_class);
+  int64_t rd_eob = RDCOST(txb_info->rdmult, qc_eob_cost, stats->dist);
+  if (stats->low_qc != 0) {
+    const int low_qc_eob_cost =
+        get_coeff_cost(stats->low_qc, scan_idx, 1, txb_info, txb_costs,
+                       coeff_ctx_temp, tx_class);
+    int64_t rd_eob_low =
+        RDCOST(txb_info->rdmult, low_qc_eob_cost, stats->dist_low);
+    rd_eob = (rd_eob > rd_eob_low) ? rd_eob_low : rd_eob;
+  }
+
+  stats->nz_rd = AOMMIN(stats->rd_low, stats->rd) - rd_eob;
 }
 
-static INLINE int has_base(tran_low_t qc, int base_idx) {
-  const int level = base_idx + 1;
-  return abs(qc) >= level;
+static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc,
+                                 const TxbInfo *const txb_info) {
+  txb_info->qcoeff[coeff_idx] = qc;
+  txb_info->levels[get_padded_idx(coeff_idx, txb_info->bwl)] =
+      (uint8_t)clamp(abs(qc), 0, INT8_MAX);
 }
 
-static INLINE int has_br(tran_low_t qc) {
-  return abs(qc) >= 1 + NUM_BASE_LEVELS;
+static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc,
+                                const TxbInfo *const txb_info) {
+  update_qcoeff(coeff_idx, qc, txb_info);
+  const int dqv = txb_info->dequant[coeff_idx != 0];
+  txb_info->dqcoeff[coeff_idx] = qcoeff_to_dqcoeff(
+      qc, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
 }
 
-static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
-                                    const int (*dc_sign_cost)[2],
-                                    int dc_sign_ctx) {
-  const int sign = (qc < 0) ? 1 : 0;
-  // sign bit cost
-  if (coeff_idx == 0) {
-    return dc_sign_cost[dc_sign_ctx][sign];
-  } else {
-    return av1_cost_bit(128, sign);
-  }
-}
-static INLINE int get_golomb_cost(int abs_qc) {
-  if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
-    // residual cost
-    int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
-    int ri = r;
-    int length = 0;
-
-    while (ri) {
-      ri >>= 1;
-      ++length;
-    }
+void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
+                           const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  uint8_t *ls = levels;
 
-    return av1_cost_literal(2 * length - 1);
-  } else {
-    return 0;
-  }
-}
+  memset(levels - TX_PAD_TOP * stride, 0,
+         sizeof(*levels) * TX_PAD_TOP * stride);
+  memset(levels + stride * height, 0,
+         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
 
-void gen_txb_cache(TxbCache *txb_cache, TxbInfo *txb_info) {
-  // gen_nz_count_arr
-  const int16_t *scan = txb_info->scan_order->scan;
-  const int bwl = txb_info->bwl;
-  const int height = txb_info->height;
-  tran_low_t *qcoeff = txb_info->qcoeff;
-  const BASE_CTX_TABLE *base_ctx_table =
-      txb_info->coeff_ctx_table->base_ctx_table;
-  for (int c = 0; c < txb_info->eob; ++c) {
-    const int coeff_idx = scan[c];  // raster order
-    const int row = coeff_idx >> bwl;
-    const int col = coeff_idx - (row << bwl);
-#if REDUCE_CONTEXT_DEPENDENCY
-    int prev_coeff_idx;
-    int prev_row;
-    int prev_col;
-    if (c > MIN_SCAN_IDX_REDUCE_CONTEXT_DEPENDENCY) {
-      prev_coeff_idx = scan[c - 1];  // raster order
-      prev_row = prev_coeff_idx >> bwl;
-      prev_col = prev_coeff_idx - (prev_row << bwl);
-    } else {
-      prev_coeff_idx = -1;
-      prev_row = -1;
-      prev_col = -1;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      *ls++ = (uint8_t)clamp(abs(coeff[i * width + j]), 0, INT8_MAX);
     }
-    txb_cache->nz_count_arr[coeff_idx] =
-        get_nz_count(qcoeff, bwl, height, row, col, prev_row, prev_col);
-#else
-    txb_cache->nz_count_arr[coeff_idx] =
-        get_nz_count(qcoeff, bwl, height, row, col);
-#endif
-    const int nz_count = txb_cache->nz_count_arr[coeff_idx];
-    txb_cache->nz_ctx_arr[coeff_idx] =
-        get_nz_map_ctx_from_count(nz_count, coeff_idx, bwl, txb_info->tx_type);
-
-    // gen_base_count_mag_arr
-    if (!has_base(qcoeff[coeff_idx], 0)) continue;
-    int *base_mag = txb_cache->base_mag_arr[coeff_idx];
-    int count[NUM_BASE_LEVELS];
-    get_base_count_mag(base_mag, count, qcoeff, bwl, height, row, col);
-
-    for (int i = 0; i < NUM_BASE_LEVELS; ++i) {
-      if (!has_base(qcoeff[coeff_idx], i)) break;
-      txb_cache->base_count_arr[i][coeff_idx] = count[i];
-      const int level = i + 1;
-      txb_cache->base_ctx_arr[i][coeff_idx] =
-          base_ctx_table[row != 0][col != 0][base_mag[0] > level][count[i]];
+    for (int j = 0; j < TX_PAD_HOR; j++) {
+      *ls++ = 0;
     }
-
-    // gen_br_count_mag_arr
-    if (!has_br(qcoeff[coeff_idx])) continue;
-    int *br_count = txb_cache->br_count_arr + coeff_idx;
-    int *br_mag = txb_cache->br_mag_arr[coeff_idx];
-    *br_count = get_br_count_mag(br_mag, qcoeff, bwl, height, row, col,
-                                 NUM_BASE_LEVELS);
-    txb_cache->br_ctx_arr[coeff_idx] =
-        get_br_ctx_from_count_mag(row, col, *br_count, br_mag[0]);
-  }
-}
-
-static INLINE const int *get_level_prob(int level, int coeff_idx,
-                                        const TxbCache *txb_cache,
-                                        const LV_MAP_COEFF_COST *txb_costs) {
-  if (level == 0) {
-    const int ctx = txb_cache->nz_ctx_arr[coeff_idx];
-    return txb_costs->nz_map_cost[ctx];
-  } else if (level >= 1 && level < 1 + NUM_BASE_LEVELS) {
-    const int idx = level - 1;
-    const int ctx = txb_cache->base_ctx_arr[idx][coeff_idx];
-    return txb_costs->base_cost[idx][ctx];
-  } else if (level >= 1 + NUM_BASE_LEVELS &&
-             level < 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
-    const int ctx = txb_cache->br_ctx_arr[coeff_idx];
-    return txb_costs->lps_cost[ctx];
-  } else if (level >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
-    printf("get_level_prob does not support golomb\n");
-    assert(0);
-    return 0;
-  } else {
-    assert(0);
-    return 0;
   }
 }
 
-static INLINE tran_low_t get_lower_coeff(tran_low_t qc) {
-  if (qc == 0) {
-    return 0;
+void av1_get_nz_map_contexts_c(const uint8_t *const levels,
+                               const int16_t *const scan, const uint16_t eob,
+                               const TX_SIZE tx_size, const TX_CLASS tx_class,
+                               int8_t *const coeff_contexts) {
+  const int bwl = get_txb_bwl(tx_size);
+  const int height = get_txb_high(tx_size);
+  for (int i = 0; i < eob; ++i) {
+    const int pos = scan[i];
+    coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bwl, height, i,
+                                         i == eob - 1, tx_size, tx_class);
   }
-  return qc > 0 ? qc - 1 : qc + 1;
 }
 
-static INLINE void update_mag_arr(int *mag_arr, int abs_qc) {
-  if (mag_arr[0] == abs_qc) {
-    mag_arr[1] -= 1;
-    assert(mag_arr[1] >= 0);
-  }
-}
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+                          aom_writer *w, int blk_row, int blk_col, int plane,
+                          TX_SIZE tx_size, const tran_low_t *tcoeff,
+                          uint16_t eob, TXB_CTX *txb_ctx) {
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+                                          tx_size, cm->reduced_tx_set_used);
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  const int16_t *const scan = scan_order->scan;
+  int c;
+  const int bwl = get_txb_bwl(tx_size);
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, width);
+  DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
 
-static INLINE int get_mag_from_mag_arr(const int *mag_arr) {
-  int mag;
-  if (mag_arr[1] > 0) {
-    mag = mag_arr[0];
-  } else if (mag_arr[0] > 0) {
-    mag = mag_arr[0] - 1;
-  } else {
-    // no neighbor
-    assert(mag_arr[0] == 0 && mag_arr[1] == 0);
-    mag = 0;
+  aom_write_symbol(w, eob == 0,
+                   ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
+  if (plane == 0 && eob == 0) {
+    assert(tx_type == DCT_DCT);
   }
-  return mag;
-}
+  if (eob == 0) return;
 
-static int neighbor_level_down_update(int *new_count, int *new_mag, int count,
-                                      const int *mag, int coeff_idx,
-                                      tran_low_t abs_nb_coeff, int nb_coeff_idx,
-                                      int level, const TxbInfo *txb_info) {
-  *new_count = count;
-  *new_mag = get_mag_from_mag_arr(mag);
+  av1_txb_init_levels(tcoeff, width, height, levels);
 
-  int update = 0;
-  // check if br_count changes
-  if (abs_nb_coeff == level) {
-    update = 1;
-    *new_count -= 1;
-    assert(*new_count >= 0);
-  }
-  const int row = coeff_idx >> txb_info->bwl;
-  const int col = coeff_idx - (row << txb_info->bwl);
-  const int nb_row = nb_coeff_idx >> txb_info->bwl;
-  const int nb_col = nb_coeff_idx - (nb_row << txb_info->bwl);
-
-  // check if mag changes
-  if (nb_row >= row && nb_col >= col) {
-    if (abs_nb_coeff == mag[0]) {
-      assert(mag[1] > 0);
-      if (mag[1] == 1) {
-        // the nb is the only qc with max mag
-        *new_mag -= 1;
-        assert(*new_mag >= 0);
-        update = 1;
-      }
-    }
-  }
-  return update;
-}
+  av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w);
 
-static int try_neighbor_level_down_br(int coeff_idx, int nb_coeff_idx,
-                                      const TxbCache *txb_cache,
-                                      const LV_MAP_COEFF_COST *txb_costs,
-                                      const TxbInfo *txb_info) {
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  const tran_low_t abs_qc = abs(qc);
-  const int level = NUM_BASE_LEVELS + 1;
-  if (abs_qc < level) return 0;
-
-  const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
-  const tran_low_t abs_nb_coeff = abs(nb_coeff);
-  const int count = txb_cache->br_count_arr[coeff_idx];
-  const int *mag = txb_cache->br_mag_arr[coeff_idx];
-  int new_count;
-  int new_mag;
-  const int update =
-      neighbor_level_down_update(&new_count, &new_mag, count, mag, coeff_idx,
-                                 abs_nb_coeff, nb_coeff_idx, level, txb_info);
-  if (update) {
-    const int row = coeff_idx >> txb_info->bwl;
-    const int col = coeff_idx - (row << txb_info->bwl);
-    const int ctx = txb_cache->br_ctx_arr[coeff_idx];
-    const int org_cost = get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]);
-
-    const int new_ctx = get_br_ctx_from_count_mag(row, col, new_count, new_mag);
-    const int new_cost =
-        get_br_cost(abs_qc, new_ctx, txb_costs->lps_cost[new_ctx]);
-    const int cost_diff = -org_cost + new_cost;
-    return cost_diff;
-  } else {
-    return 0;
+  int eob_extra;
+  const int eob_pt = get_eob_pos_token(eob, &eob_extra);
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+  switch (eob_multi_size) {
+    case 0:
+      aom_write_symbol(w, eob_pt - 1,
+                       ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5);
+      break;
+    case 1:
+      aom_write_symbol(w, eob_pt - 1,
+                       ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6);
+      break;
+    case 2:
+      aom_write_symbol(w, eob_pt - 1,
+                       ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7);
+      break;
+    case 3:
+      aom_write_symbol(w, eob_pt - 1,
+                       ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8);
+      break;
+    case 4:
+      aom_write_symbol(w, eob_pt - 1,
+                       ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9);
+      break;
+    case 5:
+      aom_write_symbol(w, eob_pt - 1,
+                       ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10);
+      break;
+    default:
+      aom_write_symbol(w, eob_pt - 1,
+                       ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11);
+      break;
   }
-}
 
-static int try_neighbor_level_down_base(int coeff_idx, int nb_coeff_idx,
-                                        const TxbCache *txb_cache,
-                                        const LV_MAP_COEFF_COST *txb_costs,
-                                        const TxbInfo *txb_info) {
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  const tran_low_t abs_qc = abs(qc);
-  const BASE_CTX_TABLE *base_ctx_table =
-      txb_info->coeff_ctx_table->base_ctx_table;
-
-  int cost_diff = 0;
-  for (int base_idx = 0; base_idx < NUM_BASE_LEVELS; ++base_idx) {
-    const int level = base_idx + 1;
-    if (abs_qc < level) continue;
-
-    const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
-    const tran_low_t abs_nb_coeff = abs(nb_coeff);
-
-    const int count = txb_cache->base_count_arr[base_idx][coeff_idx];
-    const int *mag = txb_cache->base_mag_arr[coeff_idx];
-    int new_count;
-    int new_mag;
-    const int update =
-        neighbor_level_down_update(&new_count, &new_mag, count, mag, coeff_idx,
-                                   abs_nb_coeff, nb_coeff_idx, level, txb_info);
-    if (update) {
-      const int row = coeff_idx >> txb_info->bwl;
-      const int col = coeff_idx - (row << txb_info->bwl);
-      const int ctx = txb_cache->base_ctx_arr[base_idx][coeff_idx];
-      const int org_cost = get_base_cost(
-          abs_qc, ctx, txb_costs->base_cost[base_idx][ctx], base_idx);
-
-      const int new_ctx =
-          base_ctx_table[row != 0][col != 0][new_mag > level][new_count];
-      const int new_cost = get_base_cost(
-          abs_qc, new_ctx, txb_costs->base_cost[base_idx][new_ctx], base_idx);
-      cost_diff += -org_cost + new_cost;
+  if (k_eob_offset_bits[eob_pt] > 0) {
+    const int eob_ctx = eob_pt - 3;
+    int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+    int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+    aom_write_symbol(w, bit,
+                     ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2);
+    for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) {
+      eob_shift = k_eob_offset_bits[eob_pt] - 1 - i;
+      bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+      aom_write_bit(w, bit);
     }
   }
-  return cost_diff;
-}
 
-static int try_neighbor_level_down_nz(int coeff_idx, int nb_coeff_idx,
-                                      const TxbCache *txb_cache,
-                                      const LV_MAP_COEFF_COST *txb_costs,
-                                      TxbInfo *txb_info) {
-  // assume eob doesn't change
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  const tran_low_t abs_qc = abs(qc);
-  const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
-  const tran_low_t abs_nb_coeff = abs(nb_coeff);
-  if (abs_nb_coeff != 1) return 0;
-  const int16_t *iscan = txb_info->scan_order->iscan;
-  const int scan_idx = iscan[coeff_idx];
-  if (scan_idx == txb_info->seg_eob) return 0;
-  const int nb_scan_idx = iscan[nb_coeff_idx];
-  if (nb_scan_idx < scan_idx) {
-    const int count = txb_cache->nz_count_arr[coeff_idx];
-    assert(count > 0);
-    txb_info->qcoeff[nb_coeff_idx] = get_lower_coeff(nb_coeff);
-    const int new_ctx = get_nz_map_ctx_from_count(
-        count - 1, coeff_idx, txb_info->bwl, txb_info->tx_type);
-    txb_info->qcoeff[nb_coeff_idx] = nb_coeff;
-    const int ctx = txb_cache->nz_ctx_arr[coeff_idx];
-    const int is_nz = abs_qc > 0;
-    const int org_cost = txb_costs->nz_map_cost[ctx][is_nz];
-    const int new_cost = txb_costs->nz_map_cost[new_ctx][is_nz];
-    const int cost_diff = new_cost - org_cost;
-    return cost_diff;
-  } else {
-    return 0;
-  }
-}
+  av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
 
-static int try_self_level_down(tran_low_t *low_coeff, int coeff_idx,
-                               const TxbCache *txb_cache,
-                               const LV_MAP_COEFF_COST *txb_costs,
-                               TxbInfo *txb_info) {
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  if (qc == 0) {
-    *low_coeff = 0;
-    return 0;
-  }
-  const tran_low_t abs_qc = abs(qc);
-  *low_coeff = get_lower_coeff(qc);
-  int cost_diff;
-  if (*low_coeff == 0) {
-    const int scan_idx = txb_info->scan_order->iscan[coeff_idx];
-    const int *level_cost =
-        get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs);
-    const int *low_level_cost =
-        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs);
-    if (scan_idx < txb_info->seg_eob) {
-      // When level-0, we code the binary of abs_qc > level
-      // but when level-k k > 0 we code the binary of abs_qc == level
-      // That's why wee need this special treatment for level-0 map
-      // TODO(angiebird): make leve-0 consistent to other levels
-      cost_diff = -level_cost[1] + low_level_cost[0] - low_level_cost[1];
+  for (c = eob - 1; c >= 0; --c) {
+    const int pos = scan[c];
+    const int coeff_ctx = coeff_contexts[pos];
+    const tran_low_t v = tcoeff[pos];
+    const tran_low_t level = abs(v);
+
+    if (c == eob - 1) {
+      aom_write_symbol(
+          w, AOMMIN(level, 3) - 1,
+          ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3);
     } else {
-      cost_diff = -level_cost[1];
+      aom_write_symbol(w, AOMMIN(level, 3),
+                       ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx],
+                       4);
     }
-
-    if (scan_idx < txb_info->seg_eob) {
-      const int eob_ctx = get_eob_ctx(txb_info->qcoeff, coeff_idx,
-                                      txb_info->txs_ctx, txb_info->tx_type);
-      cost_diff -=
-          txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)];
+    if (level > NUM_BASE_LEVELS) {
+      // level is above 1.
+      const int base_range = level - 1 - NUM_BASE_LEVELS;
+      const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+        const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+        aom_write_symbol(
+            w, k,
+            ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx],
+            BR_CDF_SIZE);
+        if (k < BR_CDF_SIZE - 1) break;
+      }
     }
+  }
 
-    const int sign_cost = get_sign_bit_cost(
-        qc, coeff_idx, txb_costs->dc_sign_cost, txb_info->txb_ctx->dc_sign_ctx);
-    cost_diff -= sign_cost;
-  } else if (abs_qc <= NUM_BASE_LEVELS) {
-    const int *level_cost =
-        get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs);
-    const int *low_level_cost =
-        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs);
-    cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0];
-  } else if (abs_qc == NUM_BASE_LEVELS + 1) {
-    const int *level_cost =
-        get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs);
-    const int *low_level_cost =
-        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs);
-#if BR_NODE
-    cost_diff = -level_cost[0] + low_level_cost[1] - low_level_cost[0];
-#else
-    cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0];
-#endif
-  } else if (abs_qc < 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
-    const int *level_cost =
-        get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs);
-    const int *low_level_cost =
-        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs);
-
-#if BR_NODE
-    cost_diff = -level_cost[abs_qc - 1 - NUM_BASE_LEVELS] +
-                low_level_cost[abs(*low_coeff) - 1 - NUM_BASE_LEVELS];
-#else
-    cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0];
-#endif
-  } else if (abs_qc == 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
-    const int *low_level_cost =
-        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs);
-#if BR_NODE
-    cost_diff = -get_golomb_cost(abs_qc) - low_level_cost[COEFF_BASE_RANGE] +
-                low_level_cost[COEFF_BASE_RANGE - 1];
-#else
-    cost_diff =
-        -get_golomb_cost(abs_qc) + low_level_cost[1] - low_level_cost[0];
-#endif
-  } else {
-    assert(abs_qc > 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE);
-    const tran_low_t abs_low_coeff = abs(*low_coeff);
-    cost_diff = -get_golomb_cost(abs_qc) + get_golomb_cost(abs_low_coeff);
+  // Loop to code all signs in the transform block,
+  // starting with the sign of DC (if applicable)
+  for (c = 0; c < eob; ++c) {
+    const tran_low_t v = tcoeff[scan[c]];
+    const tran_low_t level = abs(v);
+    const int sign = (v < 0) ? 1 : 0;
+    if (level) {
+      if (c == 0) {
+        aom_write_symbol(
+            w, sign, ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], 2);
+      } else {
+        aom_write_bit(w, sign);
+      }
+      if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+        write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS);
+    }
   }
-  return cost_diff;
 }
 
-#define COST_MAP_SIZE 5
-#define COST_MAP_OFFSET 2
+typedef struct encode_txb_args {
+  const AV1_COMMON *cm;
+  MACROBLOCK *x;
+  aom_writer *w;
+} ENCODE_TXB_ARGS;
 
-static INLINE int check_nz_neighbor(tran_low_t qc) { return abs(qc) == 1; }
-
-static INLINE int check_base_neighbor(tran_low_t qc) {
-  return abs(qc) <= 1 + NUM_BASE_LEVELS;
+static void write_coeffs_txb_wrap(const AV1_COMMON *cm, MACROBLOCK *x,
+                                  aom_writer *w, int plane, int block,
+                                  int blk_row, int blk_col, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+  uint16_t eob = x->mbmi_ext->eobs[plane][block];
+  TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
+                      x->mbmi_ext->dc_sign_ctx[plane][block] };
+  av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, eob,
+                       &txb_ctx);
 }
 
-static INLINE int check_br_neighbor(tran_low_t qc) {
-  return abs(qc) > BR_MAG_OFFSET;
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
+                         int mi_col, aom_writer *w, BLOCK_SIZE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int num_planes = av1_num_planes(cm);
+  int block[MAX_MB_PLANE] = { 0 };
+  int row, col;
+  assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+                                       xd->plane[0].subsampling_y));
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+  mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+  mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+  for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+    for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+      for (int plane = 0; plane < num_planes; ++plane) {
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
+        if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                                 pd->subsampling_y))
+          continue;
+        const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+        const int stepr = tx_size_high_unit[tx_size];
+        const int stepc = tx_size_wide_unit[tx_size];
+        const int step = stepr * stepc;
+
+        const int unit_height = ROUND_POWER_OF_TWO(
+            AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
+        const int unit_width = ROUND_POWER_OF_TWO(
+            AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
+        for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+             blk_row += stepr) {
+          for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+               blk_col += stepc) {
+            write_coeffs_txb_wrap(cm, x, w, plane, block[plane], blk_row,
+                                  blk_col, tx_size);
+            block[plane] += step;
+          }
+        }
+      }
+    }
+  }
 }
 
-#define FAST_OPTIMIZE_TXB 1
+// TODO(angiebird): use this function whenever it's possible
+static int get_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
+                            const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+                            TX_TYPE tx_type) {
+  if (plane > 0) return 0;
 
-#if FAST_OPTIMIZE_TXB
-#define ALNB_REF_OFFSET_NUM 2
-static int alnb_ref_offset[ALNB_REF_OFFSET_NUM][2] = {
-  { -1, 0 }, { 0, -1 },
-};
-#define NB_REF_OFFSET_NUM 4
-static int nb_ref_offset[NB_REF_OFFSET_NUM][2] = {
-  { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 },
-};
-#endif  // FAST_OPTIMIZE_TXB
+  const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
 
-// TODO(angiebird): add static to this function once it's called
-int try_level_down(int coeff_idx, const TxbCache *txb_cache,
-                   const LV_MAP_COEFF_COST *txb_costs, TxbInfo *txb_info,
-                   int (*cost_map)[COST_MAP_SIZE], int fast_mode) {
-#if !FAST_OPTIMIZE_TXB
-  (void)fast_mode;
-#endif
-  if (cost_map) {
-    for (int i = 0; i < COST_MAP_SIZE; ++i) av1_zero(cost_map[i]);
-  }
-
-  tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  tran_low_t low_coeff;
-  if (qc == 0) return 0;
-  int accu_cost_diff = 0;
-
-  const int16_t *iscan = txb_info->scan_order->iscan;
-  const int eob = txb_info->eob;
-  const int scan_idx = iscan[coeff_idx];
-  if (scan_idx < eob) {
-    const int cost_diff = try_self_level_down(&low_coeff, coeff_idx, txb_cache,
-                                              txb_costs, txb_info);
-    if (cost_map)
-      cost_map[0 + COST_MAP_OFFSET][0 + COST_MAP_OFFSET] = cost_diff;
-    accu_cost_diff += cost_diff;
-  }
-
-  const int row = coeff_idx >> txb_info->bwl;
-  const int col = coeff_idx - (row << txb_info->bwl);
-  if (check_nz_neighbor(qc)) {
-#if FAST_OPTIMIZE_TXB
-    int(*ref_offset)[2];
-    int ref_num;
-    if (fast_mode) {
-      ref_offset = alnb_ref_offset;
-      ref_num = ALNB_REF_OFFSET_NUM;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(mbmi);
+  if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
+      !xd->lossless[xd->mi[0]->segment_id]) {
+    const int ext_tx_set =
+        get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+    if (is_inter) {
+      if (ext_tx_set > 0)
+        return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
     } else {
-      ref_offset = sig_ref_offset;
-      ref_num = SIG_REF_OFFSET_NUM;
-    }
-#else
-    int(*ref_offset)[2] = sig_ref_offset;
-    const int ref_num = SIG_REF_OFFSET_NUM;
-#endif
-    for (int i = 0; i < ref_num; ++i) {
-      const int nb_row = row - ref_offset[i][0];
-      const int nb_col = col - ref_offset[i][1];
-      const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
-
-      if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height ||
-          nb_col >= txb_info->stride)
-        continue;
-
-      const int nb_scan_idx = iscan[nb_coeff_idx];
-      if (nb_scan_idx < eob) {
-        const int cost_diff = try_neighbor_level_down_nz(
-            nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info);
-        if (cost_map)
-          cost_map[nb_row - row + COST_MAP_OFFSET]
-                  [nb_col - col + COST_MAP_OFFSET] += cost_diff;
-        accu_cost_diff += cost_diff;
+      if (ext_tx_set > 0) {
+        PREDICTION_MODE intra_dir;
+        if (mbmi->filter_intra_mode_info.use_filter_intra)
+          intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+                                             .filter_intra_mode];
+        else
+          intra_dir = mbmi->mode;
+        return x->intra_tx_type_costs[ext_tx_set][square_tx_size][intra_dir]
+                                     [tx_type];
       }
     }
   }
+  return 0;
+}
 
-  if (check_base_neighbor(qc)) {
-#if FAST_OPTIMIZE_TXB
-    int(*ref_offset)[2];
-    int ref_num;
-    if (fast_mode) {
-      ref_offset = nb_ref_offset;
-      ref_num = NB_REF_OFFSET_NUM;
-    } else {
-      ref_offset = base_ref_offset;
-      ref_num = BASE_CONTEXT_POSITION_NUM;
-    }
-#else
-    int(*ref_offset)[2] = base_ref_offset;
-    int ref_num = BASE_CONTEXT_POSITION_NUM;
-#endif
-    for (int i = 0; i < ref_num; ++i) {
-      const int nb_row = row - ref_offset[i][0];
-      const int nb_col = col - ref_offset[i][1];
-      const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
-
-      if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height ||
-          nb_col >= txb_info->stride)
-        continue;
-
-      const int nb_scan_idx = iscan[nb_coeff_idx];
-      if (nb_scan_idx < eob) {
-        const int cost_diff = try_neighbor_level_down_base(
-            nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info);
-        if (cost_map)
-          cost_map[nb_row - row + COST_MAP_OFFSET]
-                  [nb_col - col + COST_MAP_OFFSET] += cost_diff;
-        accu_cost_diff += cost_diff;
+static AOM_FORCE_INLINE int warehouse_efficients_txb(
+    const AV1_COMMON *const cm, const MACROBLOCK *x, const int plane,
+    const int block, const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
+    const struct macroblock_plane *p, const int eob,
+    const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+    const MACROBLOCKD *const xd, const TX_TYPE tx_type,
+    const TX_CLASS tx_class) {
+  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+  const int bwl = get_txb_bwl(tx_size);
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  const int16_t *const scan = scan_order->scan;
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, width);
+  DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const LV_MAP_EOB_COST *const eob_costs =
+      &x->eob_costs[eob_multi_size][plane_type];
+  int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+  av1_txb_init_levels(qcoeff, width, height, levels);
+
+  cost += get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+
+  cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+  av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+  const int(*lps_cost)[COEFF_BASE_RANGE + 1] = coeff_costs->lps_cost;
+  int c = eob - 1;
+  {
+    const int pos = scan[c];
+    const tran_low_t v = qcoeff[pos];
+    const int sign = v >> 31;
+    const int level = (v ^ sign) - sign;
+    const int coeff_ctx = coeff_contexts[pos];
+    cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1];
+
+    if (v) {
+      // sign bit cost
+      if (level > NUM_BASE_LEVELS) {
+        const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
+        const int base_range =
+            AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+        cost += lps_cost[ctx][base_range];
+        cost += get_golomb_cost(level);
+      }
+      if (c) {
+        cost += av1_cost_literal(1);
+      } else {
+        const int sign01 = (sign ^ sign) - sign;
+        const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+        cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+        return cost;
       }
     }
   }
-
-  if (check_br_neighbor(qc)) {
-#if FAST_OPTIMIZE_TXB
-    int(*ref_offset)[2];
-    int ref_num;
-    if (fast_mode) {
-      ref_offset = nb_ref_offset;
-      ref_num = NB_REF_OFFSET_NUM;
-    } else {
-      ref_offset = br_ref_offset;
-      ref_num = BR_CONTEXT_POSITION_NUM;
-    }
-#else
-    int(*ref_offset)[2] = br_ref_offset;
-    const int ref_num = BR_CONTEXT_POSITION_NUM;
-#endif
-    for (int i = 0; i < ref_num; ++i) {
-      const int nb_row = row - ref_offset[i][0];
-      const int nb_col = col - ref_offset[i][1];
-      const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
-
-      if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height ||
-          nb_col >= txb_info->stride)
-        continue;
-
-      const int nb_scan_idx = iscan[nb_coeff_idx];
-      if (nb_scan_idx < eob) {
-        const int cost_diff = try_neighbor_level_down_br(
-            nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info);
-        if (cost_map)
-          cost_map[nb_row - row + COST_MAP_OFFSET]
-                  [nb_col - col + COST_MAP_OFFSET] += cost_diff;
-        accu_cost_diff += cost_diff;
+  const int(*base_cost)[4] = coeff_costs->base_cost;
+  for (c = eob - 2; c >= 1; --c) {
+    const int pos = scan[c];
+    const int coeff_ctx = coeff_contexts[pos];
+    const tran_low_t v = qcoeff[pos];
+    const int level = abs(v);
+    const int cost0 = base_cost[coeff_ctx][AOMMIN(level, 3)];
+    if (v) {
+      // sign bit cost
+      cost += av1_cost_literal(1);
+      if (level > NUM_BASE_LEVELS) {
+        const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
+        const int base_range =
+            AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+        cost += lps_cost[ctx][base_range];
+        cost += get_golomb_cost(level);
       }
     }
+    cost += cost0;
   }
+  if (c == 0) {
+    const int pos = scan[c];
+    const tran_low_t v = qcoeff[pos];
+    const int coeff_ctx = coeff_contexts[pos];
+    const int sign = v >> 31;
+    const int level = (v ^ sign) - sign;
+    cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
 
-  return accu_cost_diff;
-}
-
-static int get_low_coeff_cost(int coeff_idx, const TxbCache *txb_cache,
-                              const LV_MAP_COEFF_COST *txb_costs,
-                              const TxbInfo *txb_info) {
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  const int abs_qc = abs(qc);
-  assert(abs_qc <= 1);
-  int cost = 0;
-  const int scan_idx = txb_info->scan_order->iscan[coeff_idx];
-  if (scan_idx < txb_info->seg_eob) {
-    const int *level_cost = get_level_prob(0, coeff_idx, txb_cache, txb_costs);
-    cost += level_cost[qc != 0];
-  }
-
-  if (qc != 0) {
-    const int base_idx = 0;
-    const int ctx = txb_cache->base_ctx_arr[base_idx][coeff_idx];
-    cost += get_base_cost(abs_qc, ctx, txb_costs->base_cost[base_idx][ctx],
-                          base_idx);
-    if (scan_idx < txb_info->seg_eob) {
-      const int eob_ctx = get_eob_ctx(txb_info->qcoeff, coeff_idx,
-                                      txb_info->txs_ctx, txb_info->tx_type);
-      cost += txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)];
+    if (v) {
+      // sign bit cost
+      const int sign01 = (sign ^ sign) - sign;
+      const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+      cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+      if (level > NUM_BASE_LEVELS) {
+        const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
+        const int base_range =
+            AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+        cost += lps_cost[ctx][base_range];
+        cost += get_golomb_cost(level);
+      }
     }
-    cost += get_sign_bit_cost(qc, coeff_idx, txb_costs->dc_sign_cost,
-                              txb_info->txb_ctx->dc_sign_ctx);
   }
   return cost;
 }
 
-static INLINE void set_eob(TxbInfo *txb_info, int eob) {
-  txb_info->eob = eob;
-  txb_info->seg_eob = AOMMIN(eob, tx_size_2d[txb_info->tx_size] - 1);
+int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
+                        const int plane, const int blk_row, const int blk_col,
+                        const int block, const TX_SIZE tx_size,
+                        const TXB_CTX *const txb_ctx) {
+  const struct macroblock_plane *p = &x->plane[plane];
+  const int eob = p->eobs[block];
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs =
+      &x->coeff_costs[txs_ctx][plane_type];
+  if (eob == 0) {
+    return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+  }
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+                                          tx_size, cm->reduced_tx_set_used);
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+#define WAREHOUSE_EFFICIENTS_TXB_CASE(tx_class_literal)                        \
+  case tx_class_literal:                                                       \
+    return warehouse_efficients_txb(cm, x, plane, block, tx_size, txb_ctx, p,  \
+                                    eob, plane_type, coeff_costs, xd, tx_type, \
+                                    tx_class_literal);
+  switch (tx_class) {
+    WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_2D);
+    WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_HORIZ);
+    WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_VERT);
+#undef WAREHOUSE_EFFICIENTS_TXB_CASE
+    default: assert(false); return 0;
+  }
 }
 
-// TODO(angiebird): add static to this function once it's called
-int try_change_eob(int *new_eob, int coeff_idx, const TxbCache *txb_cache,
-                   const LV_MAP_COEFF_COST *txb_costs, TxbInfo *txb_info,
-                   int fast_mode) {
-  assert(txb_info->eob > 0);
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  const int abs_qc = abs(qc);
-  if (abs_qc != 1) {
-    *new_eob = -1;
-    return 0;
-  }
-  const int16_t *iscan = txb_info->scan_order->iscan;
-  const int16_t *scan = txb_info->scan_order->scan;
-  const int scan_idx = iscan[coeff_idx];
-  *new_eob = 0;
-  int cost_diff = 0;
-  cost_diff -= get_low_coeff_cost(coeff_idx, txb_cache, txb_costs, txb_info);
-  // int coeff_cost =
-  //     get_coeff_cost(qc, scan_idx, txb_info, txb_probs);
-  // if (-cost_diff != coeff_cost) {
-  //   printf("-cost_diff %d coeff_cost %d\n", -cost_diff, coeff_cost);
-  //   get_low_coeff_cost(coeff_idx, txb_cache, txb_probs, txb_info);
-  //   get_coeff_cost(qc, scan_idx, txb_info, txb_probs);
-  // }
-  for (int si = scan_idx - 1; si >= 0; --si) {
-    const int ci = scan[si];
-    if (txb_info->qcoeff[ci] != 0) {
-      *new_eob = si + 1;
-      break;
+static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+                        const LV_MAP_EOB_COST *txb_eob_costs, int *rate_cost) {
+  int update = 0;
+  if (txb_info->eob == 0) return update;
+  const int16_t *const scan = txb_info->scan_order->scan;
+  // forward optimize the nz_map`
+  const int init_eob = txb_info->eob;
+  const TX_CLASS tx_class = tx_type_to_class[txb_info->tx_type];
+  const int eob_cost =
+      get_eob_cost(init_eob, txb_eob_costs, txb_costs, tx_class);
+
+  // backward optimize the level-k map
+  int accu_rate = eob_cost;
+  int64_t accu_dist = 0;
+  int64_t prev_eob_rd_cost = INT64_MAX;
+  int64_t cur_eob_rd_cost = 0;
+
+  {
+    const int si = init_eob - 1;
+    const int coeff_idx = scan[si];
+    LevelDownStats stats;
+    get_dist_cost_stats(&stats, si, si == init_eob - 1, txb_costs, txb_info,
+                        tx_class);
+    if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
+      update = 1;
+      update_coeff(coeff_idx, stats.low_qc, txb_info);
+      accu_rate += stats.rate_low;
+      accu_dist += stats.dist_low;
     } else {
-      cost_diff -= get_low_coeff_cost(ci, txb_cache, txb_costs, txb_info);
+      accu_rate += stats.rate;
+      accu_dist += stats.dist;
     }
   }
 
-  const int org_eob = txb_info->eob;
-  set_eob(txb_info, *new_eob);
-  cost_diff += try_level_down(coeff_idx, txb_cache, txb_costs, txb_info, NULL,
-                              fast_mode);
-  set_eob(txb_info, org_eob);
+  int si = init_eob - 2;
+  int8_t has_nz_tail = 0;
+  // eob is not fixed
+  for (; si >= 0 && has_nz_tail < 2; --si) {
+    assert(si != init_eob - 1);
+    const int coeff_idx = scan[si];
+    tran_low_t qc = txb_info->qcoeff[coeff_idx];
 
-  if (*new_eob > 0) {
-    // Note that get_eob_ctx does NOT actually account for qcoeff, so we don't
-    // need to lower down the qcoeff here
-    const int eob_ctx = get_eob_ctx(txb_info->qcoeff, scan[*new_eob - 1],
-                                    txb_info->txs_ctx, txb_info->tx_type);
-    cost_diff -= txb_costs->eob_cost[eob_ctx][0];
-    cost_diff += txb_costs->eob_cost[eob_ctx][1];
-  } else {
-    const int txb_skip_ctx = txb_info->txb_ctx->txb_skip_ctx;
-    cost_diff -= txb_costs->txb_skip_cost[txb_skip_ctx][0];
-    cost_diff += txb_costs->txb_skip_cost[txb_skip_ctx][1];
-  }
-  return cost_diff;
-}
+    if (qc == 0) {
+      const int coeff_ctx =
+          get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
+                               txb_info->tx_size, tx_class);
+      accu_rate += txb_costs->base_cost[coeff_ctx][0];
+    } else {
+      LevelDownStats stats;
+      get_dist_cost_stats_with_eob(&stats, si, txb_costs, txb_info, tx_class);
+      // check if it is better to make this the last significant coefficient
+      int cur_eob_rate =
+          get_eob_cost(si + 1, txb_eob_costs, txb_costs, tx_class);
+      cur_eob_rd_cost = RDCOST(txb_info->rdmult, cur_eob_rate, 0);
+      prev_eob_rd_cost =
+          RDCOST(txb_info->rdmult, accu_rate, accu_dist) + stats.nz_rd;
+      if (cur_eob_rd_cost <= prev_eob_rd_cost) {
+        update = 1;
+        for (int j = si + 1; j < txb_info->eob; j++) {
+          const int coeff_pos_j = scan[j];
+          update_coeff(coeff_pos_j, 0, txb_info);
+        }
+        txb_info->eob = si + 1;
+
+        // rerun cost calculation due to change of eob
+        accu_rate = cur_eob_rate;
+        accu_dist = 0;
+        get_dist_cost_stats(&stats, si, 1, txb_costs, txb_info, tx_class);
+        if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
+          update = 1;
+          update_coeff(coeff_idx, stats.low_qc, txb_info);
+          accu_rate += stats.rate_low;
+          accu_dist += stats.dist_low;
+        } else {
+          accu_rate += stats.rate;
+          accu_dist += stats.dist;
+        }
 
-static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int dqv, int shift) {
-  int sgn = qc < 0 ? -1 : 1;
-  return sgn * ((abs(qc) * dqv) >> shift);
-}
+        // reset non zero tail when new eob is found
+        has_nz_tail = 0;
+      } else {
+        int bUpdCoeff = 0;
+        if (stats.rd_low < stats.rd) {
+          if ((si < txb_info->eob - 1)) {
+            bUpdCoeff = 1;
+            update = 1;
+          }
+        } else {
+          ++has_nz_tail;
+        }
 
-// TODO(angiebird): add static to this function it's called
-void update_level_down(int coeff_idx, TxbCache *txb_cache, TxbInfo *txb_info) {
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  const int abs_qc = abs(qc);
-  if (qc == 0) return;
-  const tran_low_t low_coeff = get_lower_coeff(qc);
-  txb_info->qcoeff[coeff_idx] = low_coeff;
-  const int dqv = txb_info->dequant[coeff_idx != 0];
-  txb_info->dqcoeff[coeff_idx] =
-      qcoeff_to_dqcoeff(low_coeff, dqv, txb_info->shift);
-
-  const int row = coeff_idx >> txb_info->bwl;
-  const int col = coeff_idx - (row << txb_info->bwl);
-  const int eob = txb_info->eob;
-  const int16_t *iscan = txb_info->scan_order->iscan;
-  for (int i = 0; i < SIG_REF_OFFSET_NUM; ++i) {
-    const int nb_row = row - sig_ref_offset[i][0];
-    const int nb_col = col - sig_ref_offset[i][1];
-
-    if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height &&
-          nb_col < txb_info->stride))
-      continue;
-
-    const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
-    const int nb_scan_idx = iscan[nb_coeff_idx];
-    if (nb_scan_idx < eob) {
-      const int scan_idx = iscan[coeff_idx];
-      if (scan_idx < nb_scan_idx) {
-        const int level = 1;
-        if (abs_qc == level) {
-          txb_cache->nz_count_arr[nb_coeff_idx] -= 1;
-          assert(txb_cache->nz_count_arr[nb_coeff_idx] >= 0);
+        if (bUpdCoeff) {
+          update_coeff(coeff_idx, stats.low_qc, txb_info);
+          accu_rate += stats.rate_low;
+          accu_dist += stats.dist_low;
+        } else {
+          accu_rate += stats.rate;
+          accu_dist += stats.dist;
         }
-        const int count = txb_cache->nz_count_arr[nb_coeff_idx];
-        txb_cache->nz_ctx_arr[nb_coeff_idx] = get_nz_map_ctx_from_count(
-            count, nb_coeff_idx, txb_info->bwl, txb_info->tx_type);
-        // int ref_ctx = get_nz_map_ctx(txb_info->qcoeff, nb_coeff_idx,
-        // txb_info->bwl, tx_type);
-        // if (ref_ctx != txb_cache->nz_ctx_arr[nb_coeff_idx])
-        //   printf("nz ctx %d ref_ctx %d\n",
-        //   txb_cache->nz_ctx_arr[nb_coeff_idx], ref_ctx);
       }
     }
-  }
+  }  // for (si)
+
+  // eob is fixed
+  for (; si >= 0; --si) {
+    assert(si != init_eob - 1);
+    const int coeff_idx = scan[si];
+    tran_low_t qc = txb_info->qcoeff[coeff_idx];
+
+    if (qc == 0) {
+      const int coeff_ctx =
+          get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
+                               txb_info->tx_size, tx_class);
+      accu_rate += txb_costs->base_cost[coeff_ctx][0];
+    } else {
+      LevelDownStats stats;
+      get_dist_cost_stats(&stats, si, 0, txb_costs, txb_info, tx_class);
 
-  const BASE_CTX_TABLE *base_ctx_table =
-      txb_info->coeff_ctx_table->base_ctx_table;
-  for (int i = 0; i < BASE_CONTEXT_POSITION_NUM; ++i) {
-    const int nb_row = row - base_ref_offset[i][0];
-    const int nb_col = col - base_ref_offset[i][1];
-    const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
-
-    if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height &&
-          nb_col < txb_info->stride))
-      continue;
-
-    const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
-    if (!has_base(nb_coeff, 0)) continue;
-    const int nb_scan_idx = iscan[nb_coeff_idx];
-    if (nb_scan_idx < eob) {
-      if (row >= nb_row && col >= nb_col)
-        update_mag_arr(txb_cache->base_mag_arr[nb_coeff_idx], abs_qc);
-      const int mag =
-          get_mag_from_mag_arr(txb_cache->base_mag_arr[nb_coeff_idx]);
-      for (int base_idx = 0; base_idx < NUM_BASE_LEVELS; ++base_idx) {
-        if (!has_base(nb_coeff, base_idx)) continue;
-        const int level = base_idx + 1;
-        if (abs_qc == level) {
-          txb_cache->base_count_arr[base_idx][nb_coeff_idx] -= 1;
-          assert(txb_cache->base_count_arr[base_idx][nb_coeff_idx] >= 0);
+      int bUpdCoeff = 0;
+      if (stats.rd_low < stats.rd) {
+        if ((si < txb_info->eob - 1)) {
+          bUpdCoeff = 1;
+          update = 1;
         }
-        const int count = txb_cache->base_count_arr[base_idx][nb_coeff_idx];
-        txb_cache->base_ctx_arr[base_idx][nb_coeff_idx] =
-            base_ctx_table[nb_row != 0][nb_col != 0][mag > level][count];
-        // int ref_ctx = get_base_ctx(txb_info->qcoeff, nb_coeff_idx,
-        // txb_info->bwl, level);
-        // if (ref_ctx != txb_cache->base_ctx_arr[base_idx][nb_coeff_idx]) {
-        //   printf("base ctx %d ref_ctx %d\n",
-        //   txb_cache->base_ctx_arr[base_idx][nb_coeff_idx], ref_ctx);
-        // }
+      }
+      if (bUpdCoeff) {
+        update_coeff(coeff_idx, stats.low_qc, txb_info);
+        accu_rate += stats.rate_low;
+        accu_dist += stats.dist_low;
+      } else {
+        accu_rate += stats.rate;
+        accu_dist += stats.dist;
       }
     }
-  }
-
-  for (int i = 0; i < BR_CONTEXT_POSITION_NUM; ++i) {
-    const int nb_row = row - br_ref_offset[i][0];
-    const int nb_col = col - br_ref_offset[i][1];
-    const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
+  }  // for (si)
 
-    if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height &&
-          nb_col < txb_info->stride))
-      continue;
+  int non_zero_blk_rate =
+      txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][0];
+  prev_eob_rd_cost =
+      RDCOST(txb_info->rdmult, accu_rate + non_zero_blk_rate, accu_dist);
 
-    const int nb_scan_idx = iscan[nb_coeff_idx];
-    const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
-    if (!has_br(nb_coeff)) continue;
-    if (nb_scan_idx < eob) {
-      const int level = 1 + NUM_BASE_LEVELS;
-      if (abs_qc == level) {
-        txb_cache->br_count_arr[nb_coeff_idx] -= 1;
-        assert(txb_cache->br_count_arr[nb_coeff_idx] >= 0);
-      }
-      if (row >= nb_row && col >= nb_col)
-        update_mag_arr(txb_cache->br_mag_arr[nb_coeff_idx], abs_qc);
-      const int count = txb_cache->br_count_arr[nb_coeff_idx];
-      const int mag = get_mag_from_mag_arr(txb_cache->br_mag_arr[nb_coeff_idx]);
-      txb_cache->br_ctx_arr[nb_coeff_idx] =
-          get_br_ctx_from_count_mag(nb_row, nb_col, count, mag);
-      // int ref_ctx = get_level_ctx(txb_info->qcoeff, nb_coeff_idx,
-      // txb_info->bwl);
-      // if (ref_ctx != txb_cache->br_ctx_arr[nb_coeff_idx]) {
-      //   printf("base ctx %d ref_ctx %d\n",
-      //   txb_cache->br_ctx_arr[nb_coeff_idx], ref_ctx);
-      // }
+  int zero_blk_rate =
+      txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][1];
+  int64_t zero_blk_rd_cost = RDCOST(txb_info->rdmult, zero_blk_rate, 0);
+  if (zero_blk_rd_cost <= prev_eob_rd_cost) {
+    update = 1;
+    for (int j = 0; j < txb_info->eob; j++) {
+      const int coeff_pos_j = scan[j];
+      update_coeff(coeff_pos_j, 0, txb_info);
     }
+    txb_info->eob = 0;
+  }
+
+  // record total rate cost
+  *rate_cost = zero_blk_rd_cost <= prev_eob_rd_cost
+                   ? zero_blk_rate
+                   : accu_rate + non_zero_blk_rate;
+
+  if (txb_info->eob > 0) {
+    *rate_cost += txb_info->tx_type_cost;
   }
+
+  return update;
 }
 
-static int get_coeff_cost(tran_low_t qc, int scan_idx, TxbInfo *txb_info,
-                          const LV_MAP_COEFF_COST *txb_costs) {
-  const TXB_CTX *txb_ctx = txb_info->txb_ctx;
-  const int is_nz = (qc != 0);
-  const tran_low_t abs_qc = abs(qc);
-  int cost = 0;
+// These numbers are empirically obtained.
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+  { 17, 13 },
+  { 16, 10 },
+};
+
+void hbt_init() {
+  hbt_hash_table =
+      aom_malloc(sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
+  memset(hbt_hash_table, 0,
+         sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
+  av1_crc32c_calculator_init(&crc_calculator);  // 31 bit: qc & ctx
+
+  hbt_needs_init = 0;
+}
+
+void hbt_destroy() { aom_free(hbt_hash_table); }
+
+int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
+                  TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+                  const LV_MAP_EOB_COST *txb_eob_costs,
+                  const struct macroblock_plane *p, int block, int fast_mode,
+                  int *rate_cost) {
+  (void)fast_mode;
   const int16_t *scan = txb_info->scan_order->scan;
+  int prev_eob = txb_info->eob;
+  assert(HBT_EOB <= 16);  // Lengthen array if allowing longer eob.
+  int32_t prev_coeff[16];
+  for (int i = 0; i < prev_eob; i++) {
+    prev_coeff[i] = txb_info->qcoeff[scan[i]];
+  }
+  for (int i = prev_eob; i < HBT_EOB; i++) {
+    prev_coeff[i] = 0;  // For compiler piece of mind.
+  }
+
+  av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
+                      txb_info->levels);
 
-  if (scan_idx < txb_info->seg_eob) {
-    int coeff_ctx =
-        get_nz_map_ctx(txb_info->qcoeff, scan_idx, scan, txb_info->bwl,
-                       txb_info->height, txb_info->tx_type);
-    cost += txb_costs->nz_map_cost[coeff_ctx][is_nz];
+  const int update =
+      optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
+
+  // Overwrite old entry
+  uint16_t hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
+  uint16_t hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
+  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+      .rate_cost = *rate_cost;
+  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index].init = 1;
+  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+      .hbt_qc_hash = hbt_qc_hash;
+  hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+      .hbt_ctx_hash = hbt_ctx_hash;
+  assert(prev_eob >= txb_info->eob);  // eob can't get longer
+  for (int i = 0; i < txb_info->eob; i++) {
+    // Record how coeff changed. Convention: towards zero is negative.
+    if (txb_info->qcoeff[scan[i]] > 0)
+      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+          .deltas[i] = txb_info->qcoeff[scan[i]] - prev_coeff[i];
+    else
+      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+          .deltas[i] = prev_coeff[i] - txb_info->qcoeff[scan[i]];
+  }
+  for (int i = txb_info->eob; i < prev_eob; i++) {
+    // If eob got shorter, record that all after it changed to zero.
+    if (prev_coeff[i] > 0)
+      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+          .deltas[i] = -prev_coeff[i];
+    else
+      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+          .deltas[i] = prev_coeff[i];
+  }
+  for (int i = prev_eob; i < HBT_EOB; i++) {
+    // Record 'no change' after optimized coefficients run out.
+    hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+        .deltas[i] = 0;
   }
 
-  if (is_nz) {
-    cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost,
-                              txb_ctx->dc_sign_ctx);
+  if (update) {
+    p->eobs[block] = txb_info->eob;
+    p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
+        txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
+  }
+  return txb_info->eob;
+}
 
-    int ctx_ls[NUM_BASE_LEVELS] = { 0 };
-    get_base_ctx_set(txb_info->qcoeff, scan[scan_idx], txb_info->bwl,
-                     txb_info->height, ctx_ls);
+int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index,
+                 TxbInfo *txb_info, const struct macroblock_plane *p, int block,
+                 int *rate_cost) {
+  const int16_t *scan = txb_info->scan_order->scan;
+  int new_eob = 0;
+  int update = 0;
 
-    int i;
-    for (i = 0; i < NUM_BASE_LEVELS; ++i) {
-      cost += get_base_cost(abs_qc, ctx_ls[i],
-                            txb_costs->base_cost[i][ctx_ls[i]], i);
-    }
+  for (int i = 0; i < txb_info->eob; i++) {
+    // Delta convention is negatives go towards zero, so only apply those ones.
+    if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+            .deltas[i] < 0) {
+      if (txb_info->qcoeff[scan[i]] > 0)
+        txb_info->qcoeff[scan[i]] +=
+            hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+                .deltas[i];
+      else
+        txb_info->qcoeff[scan[i]] -=
+            hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+                .deltas[i];
 
-    if (abs_qc > NUM_BASE_LEVELS) {
-      int ctx = get_br_ctx(txb_info->qcoeff, scan[scan_idx], txb_info->bwl,
-                           txb_info->height);
-      cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]);
-      cost += get_golomb_cost(abs_qc);
+      update = 1;
+      update_coeff(scan[i], txb_info->qcoeff[scan[i]], txb_info);
     }
+    if (txb_info->qcoeff[scan[i]]) new_eob = i + 1;
+  }
 
-    if (scan_idx < txb_info->seg_eob) {
-      int eob_ctx = get_eob_ctx(txb_info->qcoeff, scan[scan_idx],
-                                txb_info->txs_ctx, txb_info->tx_type);
-      cost += txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)];
-    }
+  // Rate_cost can be calculated here instead (av1_cost_coeffs_txb), but
+  // it is expensive and gives little benefit as long as qc_hash is high bit
+  *rate_cost =
+      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+          .rate_cost;
+
+  if (update) {
+    txb_info->eob = new_eob;
+    p->eobs[block] = txb_info->eob;
+    p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
+        txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
   }
-  return cost;
+
+  return txb_info->eob;
 }
 
-#if TEST_OPTIMIZE_TXB
-#define ALL_REF_OFFSET_NUM 17
-static int all_ref_offset[ALL_REF_OFFSET_NUM][2] = {
-  { 0, 0 },  { -2, -1 }, { -2, 0 }, { -2, 1 }, { -1, -2 }, { -1, -1 },
-  { -1, 0 }, { -1, 1 },  { 0, -2 }, { 0, -1 }, { 1, -2 },  { 1, -1 },
-  { 1, 0 },  { 2, 0 },   { 0, 1 },  { 0, 2 },  { 1, 1 },
-};
+int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
+                     TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+                     const LV_MAP_EOB_COST *txb_eob_costs,
+                     const struct macroblock_plane *p, int block, int fast_mode,
+                     int *rate_cost) {
+  // Check for qcoeff match
+  int hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
+  int hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
+
+  if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+              .hbt_qc_hash == hbt_qc_hash &&
+      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+              .hbt_ctx_hash == hbt_ctx_hash &&
+      hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+          .init) {
+    return hbt_hash_hit(hbt_table_index, hbt_array_index, txb_info, p, block,
+                        rate_cost);
+  } else {
+    return hbt_hash_miss(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
+                         txb_eob_costs, p, block, fast_mode, rate_cost);
+  }
+}
+
+int hbt_create_hashes(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+                      const LV_MAP_EOB_COST *txb_eob_costs,
+                      const struct macroblock_plane *p, int block,
+                      int fast_mode, int *rate_cost) {
+  // Initialize hash table if needed.
+  if (hbt_needs_init) {
+    hbt_init();
+  }
+
+  //// Hash creation
+  uint8_t txb_hash_data[256];  // Asserts below to ensure enough space.
+  const int16_t *scan = txb_info->scan_order->scan;
+  uint8_t chunk = 0;
+  int hash_data_index = 0;
+
+  // Make qc_hash.
+  int packing_index = 0;  // needed for packing.
+  for (int i = 0; i < txb_info->eob; i++) {
+    tran_low_t prechunk = txb_info->qcoeff[scan[i]];
+
+    // Softening: Improves speed. Aligns with signed deltas.
+    if (prechunk < 0) prechunk *= -1;
+
+    // Early kick out: Don't apply feature if there are large coeffs:
+    // If this kickout value is removed or raised beyond int8_t,
+    // widen deltas type in OptTxbQcoeff struct.
+    assert((int8_t)HBT_KICKOUT == HBT_KICKOUT);  // If not, widen types.
+    if (prechunk > HBT_KICKOUT) {
+      av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
+                          txb_info->levels);
+
+      const int update =
+          optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
+
+      if (update) {
+        p->eobs[block] = txb_info->eob;
+        p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
+            txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
+      }
+      return txb_info->eob;
+    }
+
+    // Since coeffs are 0 to 3, only 2 bits are needed: pack into bytes
+    if (packing_index == 0) txb_hash_data[hash_data_index] = 0;
+    chunk = prechunk << packing_index;
+    packing_index += 2;
+    txb_hash_data[hash_data_index] |= chunk;
 
-static int try_level_down_ref(int coeff_idx, const LV_MAP_COEFF_COST *txb_costs,
-                              TxbInfo *txb_info,
-                              int (*cost_map)[COST_MAP_SIZE]) {
-  if (cost_map) {
-    for (int i = 0; i < COST_MAP_SIZE; ++i) av1_zero(cost_map[i]);
-  }
-  tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  if (qc == 0) return 0;
-  int row = coeff_idx >> txb_info->bwl;
-  int col = coeff_idx - (row << txb_info->bwl);
-  int org_cost = 0;
-  for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) {
-    int nb_row = row - all_ref_offset[i][0];
-    int nb_col = col - all_ref_offset[i][1];
-    int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
-    int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx];
-    if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 &&
-        nb_row < txb_info->height && nb_col < txb_info->stride) {
-      tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
-      int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs);
-      if (cost_map)
-        cost_map[nb_row - row + COST_MAP_OFFSET]
-                [nb_col - col + COST_MAP_OFFSET] -= cost;
-      org_cost += cost;
+    // Full byte:
+    if (packing_index == 8) {
+      packing_index = 0;
+      hash_data_index++;
     }
   }
-  txb_info->qcoeff[coeff_idx] = get_lower_coeff(qc);
-  int new_cost = 0;
-  for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) {
-    int nb_row = row - all_ref_offset[i][0];
-    int nb_col = col - all_ref_offset[i][1];
-    int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
-    int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx];
-    if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 &&
-        nb_row < txb_info->height && nb_col < txb_info->stride) {
-      tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
-      int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs);
-      if (cost_map)
-        cost_map[nb_row - row + COST_MAP_OFFSET]
-                [nb_col - col + COST_MAP_OFFSET] += cost;
-      new_cost += cost;
+  // Needed when packing_index != 0, to include final byte.
+  hash_data_index++;
+  assert(hash_data_index <= 64);
+  // 31 bit qc_hash: index to array
+  uint32_t hbt_qc_hash =
+      av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
+
+  // Make ctx_hash.
+  hash_data_index = 0;
+  tran_low_t prechunk;
+
+  for (int i = 0; i < txb_info->eob; i++) {
+    // Save as magnitudes towards or away from zero.
+    if (txb_info->tcoeff[scan[i]] >= 0)
+      prechunk = txb_info->tcoeff[scan[i]] - txb_info->dqcoeff[scan[i]];
+    else
+      prechunk = txb_info->dqcoeff[scan[i]] - txb_info->tcoeff[scan[i]];
+
+    chunk = prechunk & 0xff;
+    txb_hash_data[hash_data_index++] = chunk;
+  }
+
+  // Extra ctx data:
+  // Include dequants.
+  txb_hash_data[hash_data_index++] = txb_info->dequant[0] & 0xff;
+  txb_hash_data[hash_data_index++] = txb_info->dequant[1] & 0xff;
+  chunk = txb_info->txb_ctx->txb_skip_ctx & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  chunk = txb_info->txb_ctx->dc_sign_ctx & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  // eob
+  chunk = txb_info->eob & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  // rdmult (int64)
+  chunk = txb_info->rdmult & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  // tx_type
+  chunk = txb_info->tx_type & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  // base_eob_cost
+  for (int i = 1; i < 3; i++) {  // i = 0 are softened away
+    for (int j = 0; j < SIG_COEF_CONTEXTS_EOB; j++) {
+      chunk = (txb_costs->base_eob_cost[j][i] & 0xff00) >> 8;
+      txb_hash_data[hash_data_index++] = chunk;
     }
   }
-  txb_info->qcoeff[coeff_idx] = qc;
-  return new_cost - org_cost;
-}
-
-static void test_level_down(int coeff_idx, const TxbCache *txb_cache,
-                            const LV_MAP_COEFF_COST *txb_costs,
-                            TxbInfo *txb_info) {
-  int cost_map[COST_MAP_SIZE][COST_MAP_SIZE];
-  int ref_cost_map[COST_MAP_SIZE][COST_MAP_SIZE];
-  const int cost_diff =
-      try_level_down(coeff_idx, txb_cache, txb_costs, txb_info, cost_map, 0);
-  const int cost_diff_ref =
-      try_level_down_ref(coeff_idx, txb_costs, txb_info, ref_cost_map);
-  if (cost_diff != cost_diff_ref) {
-    printf("qc %d cost_diff %d cost_diff_ref %d\n", txb_info->qcoeff[coeff_idx],
-           cost_diff, cost_diff_ref);
-    for (int r = 0; r < COST_MAP_SIZE; ++r) {
-      for (int c = 0; c < COST_MAP_SIZE; ++c) {
-        printf("%d:%d ", cost_map[r][c], ref_cost_map[r][c]);
-      }
-      printf("\n");
+  // eob_cost
+  for (int i = 0; i < 11; i++) {
+    for (int j = 0; j < 2; j++) {
+      chunk = (txb_eob_costs->eob_cost[j][i] & 0xff00) >> 8;
+      txb_hash_data[hash_data_index++] = chunk;
+    }
+  }
+  // dc_sign_cost
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < DC_SIGN_CONTEXTS; j++) {
+      chunk = (txb_costs->dc_sign_cost[j][i] & 0xff00) >> 8;
+      txb_hash_data[hash_data_index++] = chunk;
     }
   }
+
+  assert(hash_data_index <= 256);
+  // 31 bit ctx_hash: used to index table
+  uint32_t hbt_ctx_hash =
+      av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
+  //// End hash creation
+
+  return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
+                          txb_eob_costs, p, block, fast_mode, rate_cost);
 }
-#endif
 
-// TODO(angiebird): make this static once it's called
-int get_txb_cost(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs) {
-  int cost = 0;
-  int txb_skip_ctx = txb_info->txb_ctx->txb_skip_ctx;
-  const int16_t *scan = txb_info->scan_order->scan;
-  if (txb_info->eob == 0) {
-    cost = txb_costs->txb_skip_cost[txb_skip_ctx][1];
-    return cost;
-  }
-  cost = txb_costs->txb_skip_cost[txb_skip_ctx][0];
-  for (int c = 0; c < txb_info->eob; ++c) {
-    tran_low_t qc = txb_info->qcoeff[scan[c]];
-    int coeff_cost = get_coeff_cost(qc, c, txb_info, txb_costs);
-    cost += coeff_cost;
+static AOM_FORCE_INLINE int get_coeff_cost_simple(
+    int ci, tran_low_t abs_qc, int coeff_ctx,
+    const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class,
+    const uint8_t *levels) {
+  // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
+  // and not the last (scan_idx != eob - 1)
+  assert(ci > 0);
+  int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+  if (abs_qc) {
+    cost += av1_cost_literal(1);
+    if (abs_qc > NUM_BASE_LEVELS) {
+      const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
+      cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
+      cost += get_golomb_cost(abs_qc);
+    }
   }
   return cost;
 }
 
-#if TEST_OPTIMIZE_TXB
-void test_try_change_eob(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
-                         TxbCache *txb_cache) {
-  int eob = txb_info->eob;
-  const int16_t *scan = txb_info->scan_order->scan;
-  if (eob > 0) {
-    int last_si = eob - 1;
-    int last_ci = scan[last_si];
-    int last_coeff = txb_info->qcoeff[last_ci];
-    if (abs(last_coeff) == 1) {
-      int new_eob;
-      int cost_diff =
-          try_change_eob(&new_eob, last_ci, txb_cache, txb_costs, txb_info, 0);
-      int org_eob = txb_info->eob;
-      int cost = get_txb_cost(txb_info, txb_costs);
-
-      txb_info->qcoeff[last_ci] = get_lower_coeff(last_coeff);
-      set_eob(txb_info, new_eob);
-      int new_cost = get_txb_cost(txb_info, txb_costs);
-      set_eob(txb_info, org_eob);
-      txb_info->qcoeff[last_ci] = last_coeff;
-
-      int ref_cost_diff = -cost + new_cost;
-      if (cost_diff != ref_cost_diff)
-        printf("org_eob %d new_eob %d cost_diff %d ref_cost_diff %d\n", org_eob,
-               new_eob, cost_diff, ref_cost_diff);
+static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
+                                         int sign, int coeff_ctx,
+                                         int dc_sign_ctx,
+                                         const LV_MAP_COEFF_COST *txb_costs,
+                                         int bwl, TX_CLASS tx_class,
+                                         const uint8_t *levels) {
+  int cost = 0;
+  if (is_last) {
+    cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+  } else {
+    cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+  }
+  if (abs_qc != 0) {
+    if (ci == 0) {
+      cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+    } else {
+      cost += av1_cost_literal(1);
+    }
+    if (abs_qc > NUM_BASE_LEVELS) {
+      const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
+      cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
+      cost += get_golomb_cost(abs_qc);
     }
   }
+  return cost;
 }
-#endif
 
-static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
-                                     int shift) {
-  const int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
-  const int64_t error = diff * diff;
-  return error;
+static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv,
+                                  int shift, tran_low_t *qc_low,
+                                  tran_low_t *dqc_low) {
+  tran_low_t abs_qc_low = abs_qc - 1;
+  *qc_low = (-sign ^ abs_qc_low) + sign;
+  assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low);
+  tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+  *dqc_low = (-sign ^ abs_dqc_low) + sign;
+  assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low);
 }
 
-typedef struct LevelDownStats {
-  int update;
-  tran_low_t low_qc;
-  tran_low_t low_dqc;
-  int64_t rd_diff;
-  int cost_diff;
-  int64_t dist_diff;
-  int new_eob;
-} LevelDownStats;
-
-void try_level_down_facade(LevelDownStats *stats, int scan_idx,
-                           const TxbCache *txb_cache,
-                           const LV_MAP_COEFF_COST *txb_costs,
-                           TxbInfo *txb_info, int fast_mode) {
-  const int16_t *scan = txb_info->scan_order->scan;
-  const int coeff_idx = scan[scan_idx];
-  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
-  stats->new_eob = -1;
-  stats->update = 0;
+static INLINE void update_coeff_general(
+    int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size,
+    TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift,
+    int dc_sign_ctx, const int16_t *dequant, const int16_t *scan,
+    const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels) {
+  const int dqv = dequant[si != 0];
+  const int ci = scan[si];
+  const tran_low_t qc = qcoeff[ci];
+  const int is_last = si == (eob - 1);
+  const int coeff_ctx = get_lower_levels_ctx_general(
+      is_last, si, bwl, height, levels, ci, tx_size, tx_class);
   if (qc == 0) {
-    return;
+    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+  } else {
+    const int sign = (qc < 0) ? 1 : 0;
+    const tran_low_t abs_qc = abs(qc);
+    const tran_low_t tqc = tcoeff[ci];
+    const tran_low_t dqc = dqcoeff[ci];
+    const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+    const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
+    const int rate =
+        get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx,
+                               dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+    const int64_t rd = RDCOST(rdmult, rate, dist);
+
+    tran_low_t qc_low, dqc_low;
+    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+    const tran_low_t abs_qc_low = abs_qc - 1;
+    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
+    const int rate_low =
+        get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
+                               dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+    const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+    if (rd_low < rd) {
+      qcoeff[ci] = qc_low;
+      dqcoeff[ci] = dqc_low;
+      levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
+      *accu_rate += rate_low;
+      *accu_dist += dist_low - dist0;
+    } else {
+      *accu_rate += rate;
+      *accu_dist += dist - dist0;
+    }
   }
+}
 
-  const tran_low_t tqc = txb_info->tcoeff[coeff_idx];
-  const int dqv = txb_info->dequant[coeff_idx != 0];
-
-  const tran_low_t dqc = qcoeff_to_dqcoeff(qc, dqv, txb_info->shift);
-  const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift);
-
-  stats->low_qc = get_lower_coeff(qc);
-  stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, dqv, txb_info->shift);
-  const int64_t low_dqc_dist =
-      get_coeff_dist(tqc, stats->low_dqc, txb_info->shift);
-
-  stats->dist_diff = -dqc_dist + low_dqc_dist;
-  stats->cost_diff = 0;
-  stats->new_eob = txb_info->eob;
-  if (scan_idx == txb_info->eob - 1 && abs(qc) == 1) {
-    stats->cost_diff = try_change_eob(&stats->new_eob, coeff_idx, txb_cache,
-                                      txb_costs, txb_info, fast_mode);
+static AOM_FORCE_INLINE void update_coeff_simple(
+    int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+    int bwl, int64_t rdmult, int shift, const int16_t *dequant,
+    const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
+    const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
+    uint8_t *levels) {
+  const int dqv = dequant[1];
+  (void)eob;
+  // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
+  // and not the last (scan_idx != eob - 1)
+  assert(si != eob - 1);
+  assert(si > 0);
+  const int ci = scan[si];
+  const tran_low_t qc = qcoeff[ci];
+  const int coeff_ctx =
+      get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
+  if (qc == 0) {
+    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
   } else {
-    stats->cost_diff = try_level_down(coeff_idx, txb_cache, txb_costs, txb_info,
-                                      NULL, fast_mode);
-#if TEST_OPTIMIZE_TXB
-    test_level_down(coeff_idx, txb_cache, txb_costs, txb_info);
-#endif
+    const tran_low_t abs_qc = abs(qc);
+    const tran_low_t tqc = tcoeff[ci];
+    const tran_low_t dqc = dqcoeff[ci];
+    const int rate = get_coeff_cost_simple(ci, abs_qc, coeff_ctx, txb_costs,
+                                           bwl, tx_class, levels);
+    if (abs(dqc) < abs(tqc)) {
+      *accu_rate += rate;
+      return;
+    }
+    const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+    const int64_t rd = RDCOST(rdmult, rate, dist);
+
+    const int sign = (qc < 0) ? 1 : 0;
+    tran_low_t qc_low, dqc_low;
+    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+    const tran_low_t abs_qc_low = abs_qc - 1;
+    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
+    const int rate_low = get_coeff_cost_simple(
+        ci, abs_qc_low, coeff_ctx, txb_costs, bwl, tx_class, levels);
+    const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+    if (rd_low < rd) {
+      qcoeff[ci] = qc_low;
+      dqcoeff[ci] = dqc_low;
+      levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
+      *accu_rate += rate_low;
+    } else {
+      *accu_rate += rate;
+    }
   }
-  stats->rd_diff = RDCOST(txb_info->rdmult, stats->cost_diff, stats->dist_diff);
-  if (stats->rd_diff < 0) stats->update = 1;
-  return;
 }
 
-static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
-                        TxbCache *txb_cache, int dry_run, int fast_mode) {
-  int update = 0;
-  if (txb_info->eob == 0) return update;
-  int cost_diff = 0;
-  int64_t dist_diff = 0;
-  int64_t rd_diff = 0;
-  const int max_eob = tx_size_2d[txb_info->tx_size];
-
-#if TEST_OPTIMIZE_TXB
-  int64_t sse;
-  int64_t org_dist =
-      av1_block_error_c(txb_info->tcoeff, txb_info->dqcoeff, max_eob, &sse) *
-      (1 << (2 * txb_info->shift));
-  int org_cost = get_txb_cost(txb_info, txb_costs);
-#endif
-
-  tran_low_t *org_qcoeff = txb_info->qcoeff;
-  tran_low_t *org_dqcoeff = txb_info->dqcoeff;
+static AOM_FORCE_INLINE void update_coeff_eob(
+    int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
+    int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height,
+    int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
+    const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
+    const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness) {
+  const int dqv = dequant[si != 0];
+  assert(si != *eob - 1);
+  const int ci = scan[si];
+  const tran_low_t qc = qcoeff[ci];
+  const int coeff_ctx =
+      get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
+  if (qc == 0) {
+    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+  } else {
+    int lower_level = 0;
+    const tran_low_t abs_qc = abs(qc);
+    const tran_low_t tqc = tcoeff[ci];
+    const tran_low_t dqc = dqcoeff[ci];
+    const int sign = (qc < 0) ? 1 : 0;
+    const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
+    int64_t dist = get_coeff_dist(tqc, dqc, shift) - dist0;
+    int rate =
+        get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
+                               txb_costs, bwl, tx_class, levels);
+    int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
+
+    tran_low_t qc_low, dqc_low;
+    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+    const tran_low_t abs_qc_low = abs_qc - 1;
+    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
+    const int rate_low =
+        get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx,
+                               txb_costs, bwl, tx_class, levels);
+    const int64_t rd_low =
+        RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+
+    int lower_level_new_eob = 0;
+    const int new_eob = si + 1;
+    uint8_t tmp_levels[3];
+    for (int ni = 0; ni < *nz_num; ++ni) {
+      const int last_ci = nz_ci[ni];
+      tmp_levels[ni] = levels[get_padded_idx(last_ci, bwl)];
+      levels[get_padded_idx(last_ci, bwl)] = 0;
+    }
 
-  tran_low_t tmp_qcoeff[MAX_TX_SQUARE];
-  tran_low_t tmp_dqcoeff[MAX_TX_SQUARE];
-  const int org_eob = txb_info->eob;
-  if (dry_run) {
-    memcpy(tmp_qcoeff, org_qcoeff, sizeof(org_qcoeff[0]) * max_eob);
-    memcpy(tmp_dqcoeff, org_dqcoeff, sizeof(org_dqcoeff[0]) * max_eob);
-    txb_info->qcoeff = tmp_qcoeff;
-    txb_info->dqcoeff = tmp_dqcoeff;
-  }
+    const int coeff_ctx_new_eob = get_lower_levels_ctx_general(
+        1, si, bwl, height, levels, ci, tx_size, tx_class);
+    const int new_eob_cost =
+        get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
+    int rate_coeff_eob =
+        new_eob_cost + get_coeff_cost_general(1, ci, abs_qc, sign,
+                                              coeff_ctx_new_eob, dc_sign_ctx,
+                                              txb_costs, bwl, tx_class, levels);
+    int64_t dist_new_eob = dist;
+    int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);
+
+    if (abs_qc_low > 0) {
+      const int rate_coeff_eob_low =
+          new_eob_cost +
+          get_coeff_cost_general(1, ci, abs_qc_low, sign, coeff_ctx_new_eob,
+                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+      const int64_t dist_new_eob_low = dist_low;
+      const int64_t rd_new_eob_low =
+          RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
+      if (rd_new_eob_low < rd_new_eob) {
+        lower_level_new_eob = 1;
+        rd_new_eob = rd_new_eob_low;
+        rate_coeff_eob = rate_coeff_eob_low;
+        dist_new_eob = dist_new_eob_low;
+      }
+    }
 
-  const int16_t *scan = txb_info->scan_order->scan;
+    if (rd_low < rd) {
+      lower_level = 1;
+      rd = rd_low;
+      rate = rate_low;
+      dist = dist_low;
+    }
 
-  // forward optimize the nz_map
-  const int cur_eob = txb_info->eob;
-  for (int si = 0; si < cur_eob; ++si) {
-    const int coeff_idx = scan[si];
-    tran_low_t qc = txb_info->qcoeff[coeff_idx];
-    if (abs(qc) == 1) {
-      LevelDownStats stats;
-      try_level_down_facade(&stats, si, txb_cache, txb_costs, txb_info,
-                            fast_mode);
-      if (stats.update) {
-        update = 1;
-        cost_diff += stats.cost_diff;
-        dist_diff += stats.dist_diff;
-        rd_diff += stats.rd_diff;
-        update_level_down(coeff_idx, txb_cache, txb_info);
-        set_eob(txb_info, stats.new_eob);
+    if (sharpness == 0 && rd_new_eob < rd) {
+      for (int ni = 0; ni < *nz_num; ++ni) {
+        int last_ci = nz_ci[ni];
+        // levels[get_padded_idx(last_ci, bwl)] = 0;
+        qcoeff[last_ci] = 0;
+        dqcoeff[last_ci] = 0;
+      }
+      *eob = new_eob;
+      *nz_num = 0;
+      *accu_rate = rate_coeff_eob;
+      *accu_dist = dist_new_eob;
+      lower_level = lower_level_new_eob;
+    } else {
+      for (int ni = 0; ni < *nz_num; ++ni) {
+        const int last_ci = nz_ci[ni];
+        levels[get_padded_idx(last_ci, bwl)] = tmp_levels[ni];
       }
+      *accu_rate += rate;
+      *accu_dist += dist;
     }
-  }
 
-  // backward optimize the level-k map
-  int eob_fix = 0;
-  for (int si = txb_info->eob - 1; si >= 0; --si) {
-    const int coeff_idx = scan[si];
-    if (eob_fix == 1 && txb_info->qcoeff[coeff_idx] == 1) {
-      // when eob is fixed, there is not need to optimize again when
-      // abs(qc) == 1
-      continue;
+    if (lower_level) {
+      qcoeff[ci] = qc_low;
+      dqcoeff[ci] = dqc_low;
+      levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
     }
-    LevelDownStats stats;
-    try_level_down_facade(&stats, si, txb_cache, txb_costs, txb_info,
-                          fast_mode);
-    if (stats.update) {
-#if TEST_OPTIMIZE_TXB
-// printf("si %d low_qc %d cost_diff %d dist_diff %ld rd_diff %ld eob %d new_eob
-// %d\n", si, stats.low_qc, stats.cost_diff, stats.dist_diff, stats.rd_diff,
-// txb_info->eob, stats.new_eob);
-#endif
-      update = 1;
-      cost_diff += stats.cost_diff;
-      dist_diff += stats.dist_diff;
-      rd_diff += stats.rd_diff;
-      update_level_down(coeff_idx, txb_cache, txb_info);
-      set_eob(txb_info, stats.new_eob);
+    if (qcoeff[ci]) {
+      nz_ci[*nz_num] = ci;
+      ++*nz_num;
     }
-    if (eob_fix == 0 && txb_info->qcoeff[coeff_idx] != 0) eob_fix = 1;
-    if (si > txb_info->eob) si = txb_info->eob;
-  }
-#if TEST_OPTIMIZE_TXB
-  int64_t new_dist =
-      av1_block_error_c(txb_info->tcoeff, txb_info->dqcoeff, max_eob, &sse) *
-      (1 << (2 * txb_info->shift));
-  int new_cost = get_txb_cost(txb_info, txb_costs);
-  int64_t ref_dist_diff = new_dist - org_dist;
-  int ref_cost_diff = new_cost - org_cost;
-  if (cost_diff != ref_cost_diff || dist_diff != ref_dist_diff)
-    printf(
-        "overall rd_diff %ld\ncost_diff %d ref_cost_diff%d\ndist_diff %ld "
-        "ref_dist_diff %ld\neob %d new_eob %d\n\n",
-        rd_diff, cost_diff, ref_cost_diff, dist_diff, ref_dist_diff, org_eob,
-        txb_info->eob);
-#endif
-  if (dry_run) {
-    txb_info->qcoeff = org_qcoeff;
-    txb_info->dqcoeff = org_dqcoeff;
-    set_eob(txb_info, org_eob);
   }
-  return update;
 }
 
-// These numbers are empirically obtained.
-static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
-  { 17, 13 }, { 16, 10 },
-};
+static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
+                               int nz_num, int *nz_ci, int64_t rdmult,
+                               int skip_cost, int non_skip_cost,
+                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                               int sharpness) {
+  const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist);
+  const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0);
+  if (sharpness == 0 && rd_new_eob < rd) {
+    for (int i = 0; i < nz_num; ++i) {
+      const int ci = nz_ci[i];
+      qcoeff[ci] = 0;
+      dqcoeff[ci] = 0;
+      // no need to set up levels because this is the last step
+      // levels[get_padded_idx(ci, bwl)] = 0;
+    }
+    *accu_rate = 0;
+    *eob = 0;
+  }
+}
+
+int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                         int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                         const TXB_CTX *const txb_ctx, int *rate_cost,
+                         int sharpness) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
+  const int16_t *dequant = p->dequant_QTX;
+  const int bwl = get_txb_bwl(tx_size);
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  assert(width == (1 << bwl));
+  const int is_inter = is_inter_block(mbmi);
+  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const LV_MAP_EOB_COST *txb_eob_costs =
+      &x->eob_costs[eob_multi_size][plane_type];
+
+  const int shift = av1_get_tx_scale(tx_size);
+  const int64_t rdmult =
+      ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
+       2) >>
+      (sharpness + (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
+                        ? 7 - mbmi->segment_id
+                        : 2));
+
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, width);
+
+  av1_txb_init_levels(qcoeff, width, height, levels);
+
+  // TODO(angirbird): check iqmatrix
+
+  const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
+  const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+  int eob = p->eobs[block];
+  const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
+  int accu_rate = eob_cost;
+  int64_t accu_dist = 0;
+  int si = eob - 1;
+  const int ci = scan[si];
+  const tran_low_t qc = qcoeff[ci];
+  const tran_low_t abs_qc = abs(qc);
+  const int sign = qc < 0;
+  const int max_nz_num = 2;
+  int nz_num = 1;
+  int nz_ci[3] = { ci, 0, 0 };
+  if (abs_qc >= 2) {
+    update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
+                         bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
+                         dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+                         levels);
+    --si;
+  } else {
+    assert(abs_qc == 1);
+    const int coeff_ctx = get_lower_levels_ctx_general(
+        1, si, bwl, height, levels, ci, tx_size, tx_class);
+    accu_rate += get_coeff_cost_general(1, ci, abs_qc, sign, coeff_ctx,
+                                        txb_ctx->dc_sign_ctx, txb_costs, bwl,
+                                        tx_class, levels);
+    const tran_low_t tqc = tcoeff[ci];
+    const tran_low_t dqc = dqcoeff[ci];
+    const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+    const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
+    accu_dist += dist - dist0;
+    --si;
+  }
+
+#define UPDATE_COEFF_EOB_CASE(tx_class_literal)                            \
+  case tx_class_literal:                                                   \
+    for (; si >= 0 && nz_num <= max_nz_num; --si) {                        \
+      update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si,   \
+                       tx_size, tx_class_literal, bwl, height,             \
+                       txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
+                       txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff,  \
+                       levels, sharpness);                                 \
+    }                                                                      \
+    break;
+  switch (tx_class) {
+    UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
+    UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ);
+    UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_EOB_CASE
+    default: assert(false);
+  }
+
+  if (si == -1 && nz_num <= max_nz_num) {
+    update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
+                non_skip_cost, qcoeff, dqcoeff, sharpness);
+  }
+
+#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal)                             \
+  case tx_class_literal:                                                       \
+    for (; si >= 1; --si) {                                                    \
+      update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \
+                          rdmult, shift, dequant, scan, txb_costs, tcoeff,     \
+                          qcoeff, dqcoeff, levels);                            \
+    }                                                                          \
+    break;
+  switch (tx_class) {
+    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);
+    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ);
+    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_SIMPLE_CASE
+    default: assert(false);
+  }
+
+  // DC position
+  if (si == 0) {
+    // no need to update accu_dist because it's not used after this point
+    int64_t dummy_dist = 0;
+    update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class,
+                         bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
+                         dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+                         levels);
+  }
+
+  const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+  if (eob == 0)
+    accu_rate += skip_cost;
+  else
+    accu_rate += non_skip_cost + tx_type_cost;
+
+  p->eobs[block] = eob;
+  p->txb_entropy_ctx[block] =
+      av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]);
+
+  *rate_cost = accu_rate;
+  return eob;
+}
 
-int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
+// This function is deprecated, but we keep it here because hash trellis
+// is not integrated with av1_optimize_txb_new yet
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                      int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                     TXB_CTX *txb_ctx, int fast_mode) {
+                     TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) {
+  const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+                                          tx_size, cm->reduced_tx_set_used);
+  const MB_MODE_INFO *mbmi = xd->mi[0];
   const struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   const int eob = p->eobs[block];
   tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
-  const int16_t *dequant = pd->dequant;
-  const int seg_eob = AOMMIN(eob, tx_size_2d[tx_size] - 1);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int stride = 1 << bwl;
-  const int height = tx_size_high[tx_size];
+  const int16_t *dequant = p->dequant_QTX;
+  const int seg_eob = av1_get_max_eob(tx_size);
+  const int bwl = get_txb_bwl(tx_size);
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
   const int is_inter = is_inter_block(mbmi);
-  const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-  const LV_MAP_COEFF_COST txb_costs = x->coeff_costs[txs_ctx][plane_type];
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const LV_MAP_EOB_COST txb_eob_costs =
+      x->eob_costs[eob_multi_size][plane_type];
 
   const int shift = av1_get_tx_scale(tx_size);
   const int64_t rdmult =
-      (x->rdmult * plane_rd_mult[is_inter][plane_type] + 2) >> 2;
-
-  TxbInfo txb_info = { qcoeff,
-                       dqcoeff,
-                       tcoeff,
-                       dequant,
-                       shift,
-                       tx_size,
-                       txs_ctx,
-                       tx_type,
-                       bwl,
-                       stride,
-                       height,
-                       eob,
-                       seg_eob,
-                       scan_order,
-                       txb_ctx,
-                       rdmult,
-                       &cm->coeff_ctx_table };
-
-  TxbCache txb_cache;
-  gen_txb_cache(&txb_cache, &txb_info);
+      ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
+       2) >>
+      2;
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, width);
+  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+  const qm_val_t *iqmatrix =
+      IS_2D_TRANSFORM(tx_type)
+          ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size]
+          : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+  assert(width == (1 << bwl));
+  const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+  TxbInfo txb_info = {
+    qcoeff,   levels,       dqcoeff,    tcoeff,  dequant, shift,
+    tx_size,  txs_ctx,      tx_type,    bwl,     width,   height,
+    eob,      seg_eob,      scan_order, txb_ctx, rdmult,  &cm->coeff_ctx_table,
+    iqmatrix, tx_type_cost,
+  };
+
+  // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls
+  // by storing the coefficient deltas in a hash table.
+  // Currently disabled in speedfeatures.c
+  if (eob <= HBT_EOB && eob > 0 && cpi->sf.use_hash_based_trellis) {
+    return hbt_create_hashes(&txb_info, txb_costs, &txb_eob_costs, p, block,
+                             fast_mode, rate_cost);
+  }
+
+  av1_txb_init_levels(qcoeff, width, height, levels);
 
   const int update =
-      optimize_txb(&txb_info, &txb_costs, &txb_cache, 0, fast_mode);
-  if (update) p->eobs[block] = txb_info.eob;
+      optimize_txb(&txb_info, txb_costs, &txb_eob_costs, rate_cost);
+
+  if (update) {
+    p->eobs[block] = txb_info.eob;
+    p->txb_entropy_ctx[block] =
+        av1_get_txb_entropy_context(qcoeff, scan_order, txb_info.eob);
+  }
   return txb_info.eob;
 }
+
 int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
                                 const SCAN_ORDER *scan_order, int eob) {
-  const int16_t *scan = scan_order->scan;
+  const int16_t *const scan = scan_order->scan;
   int cul_level = 0;
   int c;
 
   if (eob == 0) return 0;
   for (c = 0; c < eob; ++c) {
     cul_level += abs(qcoeff[scan[c]]);
+    if (cul_level > COEFF_CONTEXT_MASK) break;
   }
 
   cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
@@ -1981,167 +1791,72 @@ void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col,
   ThreadData *const td = args->td;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   const uint16_t eob = p->eobs[block];
   const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   const PLANE_TYPE plane_type = pd->plane_type;
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-  const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-  (void)plane_bsize;
-
-  int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob);
-  av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row);
-}
-
-static INLINE void av1_update_nz_eob_counts(FRAME_CONTEXT *fc,
-                                            FRAME_COUNTS *counts, uint16_t eob,
-                                            const tran_low_t *tcoeff, int plane,
-                                            TX_SIZE tx_size, TX_TYPE tx_type,
-                                            const int16_t *scan) {
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int height = tx_size_high[tx_size];
-  TX_SIZE txsize_ctx = get_txsize_context(tx_size);
-#if CONFIG_CTX1D
-  const int width = tx_size_wide[tx_size];
-  const int eob_offset = width + height;
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int seg_eob =
-      (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset;
-#else
-  const int seg_eob = tx_size_2d[tx_size];
-#endif
-  unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] =
-      &counts->nz_map[txsize_ctx][plane_type];
-  for (int c = 0; c < eob; ++c) {
-    tran_low_t v = tcoeff[scan[c]];
-    int is_nz = (v != 0);
-    int coeff_ctx = get_nz_map_ctx(tcoeff, c, scan, bwl, height, tx_type);
-    int eob_ctx = get_eob_ctx(tcoeff, scan[c], txsize_ctx, tx_type);
-
-    if (c == seg_eob - 1) break;
-
-    ++(*nz_map_count)[coeff_ctx][is_nz];
-#if LV_MAP_PROB
-    update_bin(fc->nz_map_cdf[txsize_ctx][plane_type][coeff_ctx], is_nz, 2);
-#endif
-
-    if (is_nz) {
-      ++counts->eob_flag[txsize_ctx][plane_type][eob_ctx][c == (eob - 1)];
-#if LV_MAP_PROB
-      update_bin(fc->eob_flag_cdf[txsize_ctx][plane_type][eob_ctx],
-                 c == (eob - 1), 2);
-#endif
-    }
-  }
+  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+                                          tx_size, cm->reduced_tx_set_used);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  const int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob);
+  av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col,
+                   blk_row);
 }
 
-#if CONFIG_CTX1D
-static INLINE void av1_update_nz_eob_counts_vert(
-    FRAME_CONTEXT *fc, FRAME_COUNTS *counts, uint16_t eob,
-    const tran_low_t *tcoeff, int plane, TX_SIZE tx_size, TX_TYPE tx_type,
-    const int16_t *scan, const int16_t *iscan) {
-  (void)eob;
-  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  int16_t eob_ls[MAX_HVTX_SIZE];
-  get_eob_vert(eob_ls, tcoeff, width, height);
-  unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] =
-      &counts->nz_map[txs_ctx][plane_type];
-  for (int c = 0; c < width; ++c) {
-    int16_t veob = eob_ls[c];
-    assert(veob <= height);
-    int el_ctx = get_empty_line_ctx(c, eob_ls);
-    ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][veob == 0];
-#if LV_MAP_PROB
-    update_bin(fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx],
-               veob == 0, 2);
-#endif
-    if (veob) {
-      for (int r = 0; r < veob; ++r) {
-        if (r + 1 != height) {
-          int coeff_idx = r * width + c;
-          int scan_idx = iscan[coeff_idx];
-          int is_nz = tcoeff[coeff_idx] != 0;
-          int coeff_ctx =
-              get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type);
-          ++(*nz_map_count)[coeff_ctx][is_nz];
-#if LV_MAP_PROB
-          update_bin(fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], is_nz, 2);
-#endif
-          if (is_nz) {
-            int eob_ctx = get_hv_eob_ctx(c, r, eob_ls);
-            ++counts->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]
-                            [r == veob - 1];
-#if LV_MAP_PROB
-            update_bin(fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx],
-                       r == veob - 1, 2);
-#endif
-          }
+static void update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                 int blk_row, int blk_col, int plane,
+                                 TX_SIZE tx_size, FRAME_COUNTS *counts,
+                                 uint8_t allow_update_cdf) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  int is_inter = is_inter_block(mbmi);
+  FRAME_CONTEXT *fc = xd->tile_ctx;
+#if !CONFIG_ENTROPY_STATS
+  (void)counts;
+#endif  // !CONFIG_ENTROPY_STATS
+
+  // Only y plane's tx_type is updated
+  if (plane > 0) return;
+  TX_TYPE tx_type = av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, tx_size,
+                                    cm->reduced_tx_set_used);
+  if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
+      cm->base_qindex > 0 && !mbmi->skip &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+    if (eset > 0) {
+      const TxSetType tx_set_type =
+          av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+      if (is_inter) {
+        if (allow_update_cdf) {
+          update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
+                     av1_ext_tx_ind[tx_set_type][tx_type],
+                     av1_num_ext_tx_set[tx_set_type]);
         }
-      }
-    }
-  }
-}
-
-static INLINE void av1_update_nz_eob_counts_horiz(
-    FRAME_CONTEXT *fc, FRAME_COUNTS *counts, uint16_t eob,
-    const tran_low_t *tcoeff, int plane, TX_SIZE tx_size, TX_TYPE tx_type,
-    const int16_t *scan, const int16_t *iscan) {
-  (void)eob;
-  (void)scan;
-  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_CLASS tx_class = get_tx_class(tx_type);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int width = tx_size_wide[tx_size];
-  const int height = tx_size_high[tx_size];
-  int16_t eob_ls[MAX_HVTX_SIZE];
-  get_eob_horiz(eob_ls, tcoeff, width, height);
-  unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] =
-      &counts->nz_map[txs_ctx][plane_type];
-  for (int r = 0; r < height; ++r) {
-    int16_t heob = eob_ls[r];
-    int el_ctx = get_empty_line_ctx(r, eob_ls);
-    ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][heob == 0];
-#if LV_MAP_PROB
-    update_bin(fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx],
-               heob == 0, 2);
-#endif
-    if (heob) {
-      for (int c = 0; c < heob; ++c) {
-        if (c + 1 != width) {
-          int coeff_idx = r * width + c;
-          int scan_idx = iscan[coeff_idx];
-          int is_nz = tcoeff[coeff_idx] != 0;
-          int coeff_ctx =
-              get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type);
-          ++(*nz_map_count)[coeff_ctx][is_nz];
-#if LV_MAP_PROB
-          update_bin(fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], is_nz, 2);
-#endif
-          if (is_nz) {
-            int eob_ctx = get_hv_eob_ctx(r, c, eob_ls);
-            ++counts->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]
-                            [c == heob - 1];
-#if LV_MAP_PROB
-            update_bin(fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx],
-                       c == heob - 1, 2);
-#endif
-          }
+#if CONFIG_ENTROPY_STATS
+        ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]]
+                              [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif  // CONFIG_ENTROPY_STATS
+      } else {
+        PREDICTION_MODE intra_dir;
+        if (mbmi->filter_intra_mode_info.use_filter_intra)
+          intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+                                             .filter_intra_mode];
+        else
+          intra_dir = mbmi->mode;
+#if CONFIG_ENTROPY_STATS
+        ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir]
+                              [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif  // CONFIG_ENTROPY_STATS
+        if (allow_update_cdf) {
+          update_cdf(
+              fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir],
+              av1_ext_tx_ind[tx_set_type][tx_type],
+              av1_num_ext_tx_set[tx_set_type]);
         }
       }
     }
   }
 }
-#endif  // CONFIG_CTX1D
 
 void av1_update_and_record_txb_context(int plane, int block, int blk_row,
                                        int blk_col, BLOCK_SIZE plane_bsize,
@@ -2154,461 +1869,164 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  int eob = p->eobs[block], update_eob = 0;
-  const PLANE_TYPE plane_type = pd->plane_type;
-  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
-  const int segment_id = mbmi->segment_id;
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-  const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-  const int16_t *scan = scan_order->scan;
-  const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
-  int c, i;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int eob = p->eobs[block];
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col,
               pd->left_context + blk_row, &txb_ctx);
-  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int height = tx_size_high[tx_size];
-  int cul_level = 0;
-
-  TX_SIZE txsize_ctx = get_txsize_context(tx_size);
+  const int bwl = get_txb_bwl(tx_size);
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  const uint8_t allow_update_cdf = args->allow_update_cdf;
+  const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#if CONFIG_ENTROPY_STATS
+  int cdf_idx = cm->coef_cdf_category;
+#endif  // CONFIG_ENTROPY_STATS
+
+#if CONFIG_ENTROPY_STATS
+  ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif  // CONFIG_ENTROPY_STATS
+  if (allow_update_cdf) {
+    update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0,
+               2);
+  }
 
-  memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
-
-  ++td->counts->txb_skip[txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
-#if LV_MAP_PROB
-  update_bin(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0,
-             2);
-#endif
   x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx;
-
   x->mbmi_ext->eobs[plane][block] = eob;
 
   if (eob == 0) {
-    av1_set_contexts(xd, pd, plane, tx_size, 0, blk_col, blk_row);
+    av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row);
     return;
   }
 
-#if CONFIG_TXK_SEL
-  av1_update_tx_type_count(cm, xd, blk_row, blk_col, block, plane,
-                           mbmi->sb_type, get_min_tx_size(tx_size), td->counts);
-#endif
-
-#if CONFIG_CTX1D
-  TX_CLASS tx_class = get_tx_class(tx_type);
-  if (tx_class == TX_CLASS_2D) {
-    av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size,
-                             tx_type, scan);
-  } else {
-    const int width = tx_size_wide[tx_size];
-    const int eob_offset = width + height;
-    const int eob_mode = eob > eob_offset;
-    const TX_SIZE txs_ctx = get_txsize_context(tx_size);
-    ++td->counts->eob_mode[txs_ctx][plane_type][tx_class][eob_mode];
-#if LV_MAP_PROB
-    update_bin(ec_ctx->eob_mode_cdf[txs_ctx][plane_type][tx_class], eob_mode,
-               2);
-#endif
-    if (eob_mode == 0) {
-      av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size,
-                               tx_type, scan);
-    } else {
-      const int16_t *iscan = scan_order->iscan;
-      assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ);
-      if (tx_class == TX_CLASS_VERT)
-        av1_update_nz_eob_counts_vert(ec_ctx, td->counts, eob, tcoeff, plane,
-                                      tx_size, tx_type, scan, iscan);
-      else
-        av1_update_nz_eob_counts_horiz(ec_ctx, td->counts, eob, tcoeff, plane,
-                                       tx_size, tx_type, scan, iscan);
-    }
-  }
-#else   // CONFIG_CTX1D
-  av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size,
-                           tx_type, scan);
-#endif  // CONFIG_CTX1D
-
-  // Reverse process order to handle coefficient level and sign.
-  for (i = 0; i < NUM_BASE_LEVELS; ++i) {
-    update_eob = 0;
-    for (c = eob - 1; c >= 0; --c) {
-      tran_low_t v = qcoeff[scan[c]];
-      tran_low_t level = abs(v);
-      int ctx;
-
-      if (level <= i) continue;
-
-      ctx = get_base_ctx(tcoeff, scan[c], bwl, height, i + 1);
+  tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+  const int segment_id = mbmi->segment_id;
+  const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
 
-      if (level == i + 1) {
-        ++td->counts->coeff_base[txsize_ctx][plane_type][i][ctx][1];
-#if LV_MAP_PROB
-        update_bin(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][i][ctx], 1,
-                   2);
-#endif
-        if (c == 0) {
-          int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, width);
+  av1_txb_init_levels(tcoeff, width, height, levels);
+  update_tx_type_count(cm, xd, blk_row, blk_col, plane, tx_size, td->counts,
+                       allow_update_cdf);
 
-          ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0];
-#if LV_MAP_PROB
-          update_bin(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], v < 0, 2);
-#endif
-          x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
-        }
-        cul_level += level;
-        continue;
-      }
-      ++td->counts->coeff_base[txsize_ctx][plane_type][i][ctx][0];
-#if LV_MAP_PROB
-      update_bin(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][i][ctx], 0, 2);
+  const PLANE_TYPE plane_type = pd->plane_type;
+  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+                                          tx_size, cm->reduced_tx_set_used);
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  const int16_t *const scan = scan_order->scan;
+#if CONFIG_ENTROPY_STATS
+  av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+                         td->counts, allow_update_cdf);
+#else
+  av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx,
+                         allow_update_cdf);
 #endif
-      update_eob = AOMMAX(update_eob, c);
-    }
-  }
-
-  for (c = update_eob; c >= 0; --c) {
-    tran_low_t v = qcoeff[scan[c]];
-    tran_low_t level = abs(v);
-    int idx;
-    int ctx;
 
-    if (level <= NUM_BASE_LEVELS) continue;
+  DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+  av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
 
-    cul_level += level;
-    if (c == 0) {
-      int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+  for (int c = eob - 1; c >= 0; --c) {
+    const int pos = scan[c];
+    const int coeff_ctx = coeff_contexts[pos];
+    const tran_low_t v = qcoeff[pos];
+    const tran_low_t level = abs(v);
 
-      ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0];
-#if LV_MAP_PROB
-      update_bin(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], v < 0, 2);
-#endif
-      x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
+    if (allow_update_cdf) {
+      if (c == eob - 1) {
+        assert(coeff_ctx < 4);
+        update_cdf(
+            ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
+            AOMMIN(level, 3) - 1, 3);
+      } else {
+        update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx],
+                   AOMMIN(level, 3), 4);
+      }
     }
-
-    // level is above 1.
-    ctx = get_br_ctx(tcoeff, scan[c], bwl, height);
-
-#if BR_NODE
-    int base_range = level - 1 - NUM_BASE_LEVELS;
-    int br_set_idx = base_range < COEFF_BASE_RANGE
-                         ? coeff_to_br_index[base_range]
-                         : BASE_RANGE_SETS;
-
-    for (idx = 0; idx < BASE_RANGE_SETS; ++idx) {
-      if (idx == br_set_idx) {
-        int br_base = br_index_to_coeff[br_set_idx];
-        int br_offset = base_range - br_base;
-        ++td->counts->coeff_br[txsize_ctx][plane_type][idx][ctx][1];
-#if LV_MAP_PROB
-        update_bin(ec_ctx->coeff_br_cdf[txsize_ctx][plane_type][idx][ctx], 1,
-                   2);
-#endif
-        int extra_bits = (1 << br_extra_bits[idx]) - 1;
-        for (int tok = 0; tok < extra_bits; ++tok) {
-          if (br_offset == tok) {
-            ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][1];
-#if LV_MAP_PROB
-            update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 1,
-                       2);
-#endif
-            break;
-          }
-          ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][0];
-#if LV_MAP_PROB
-          update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 0, 2);
+    {
+      if (c == eob - 1) {
+        assert(coeff_ctx < 4);
+#if CONFIG_ENTROPY_STATS
+        ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
+                                          [coeff_ctx][AOMMIN(level, 3) - 1];
+      } else {
+        ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type]
+                                      [coeff_ctx][AOMMIN(level, 3)];
 #endif
-        }
-        break;
       }
-      ++td->counts->coeff_br[txsize_ctx][plane_type][idx][ctx][0];
-#if LV_MAP_PROB
-      update_bin(ec_ctx->coeff_br_cdf[txsize_ctx][plane_type][idx][ctx], 0, 2);
-#endif
     }
-#else  // BR_NODE
-    for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
-      if (level == (idx + 1 + NUM_BASE_LEVELS)) {
-        ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][1];
-#if LV_MAP_PROB
-        update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 1, 2);
+    if (level > NUM_BASE_LEVELS) {
+      const int base_range = level - 1 - NUM_BASE_LEVELS;
+      const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+        const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+        if (allow_update_cdf) {
+          update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)]
+                                         [plane_type][br_ctx],
+                     k, BR_CDF_SIZE);
+        }
+        for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+#if CONFIG_ENTROPY_STATS
+          ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type][lps]
+                                 [br_ctx][lps == k];
+#endif  // CONFIG_ENTROPY_STATS
+          if (lps == k) break;
+        }
+#if CONFIG_ENTROPY_STATS
+        ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+                                     [plane_type][br_ctx][k];
 #endif
-        break;
+        if (k < BR_CDF_SIZE - 1) break;
       }
-      ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][0];
-#if LV_MAP_PROB
-      update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 0, 2);
-#endif
     }
-    if (idx < COEFF_BASE_RANGE) continue;
-#endif  // BR_NODE
-    // use 0-th order Golomb code to handle the residual level.
   }
 
-  cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
+  // Update the context needed to code the DC sign (if applicable)
+  if (tcoeff[0] != 0) {
+    const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+    const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+#if CONFIG_ENTROPY_STATS
+    ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+#endif  // CONFIG_ENTROPY_STATS
+    if (allow_update_cdf)
+      update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
+    x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
+  }
 
-  // DC value
-  set_dc_sign(&cul_level, tcoeff[0]);
-  av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row);
-
-#if CONFIG_ADAPT_SCAN
-  // Since dqcoeff is not available here, we pass qcoeff into
-  // av1_update_scan_count_facade(). The update behavior should be the same
-  // because av1_update_scan_count_facade() only cares if coefficients are zero
-  // or not.
-  av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type,
-                               qcoeff, eob);
-#endif
+  const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+  av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col,
+                   blk_row);
 }
 
 void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
                             RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
-                            int mi_row, int mi_col) {
+                            int mi_row, int mi_col, uint8_t allow_update_cdf) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const int ctx = av1_get_skip_context(xd);
-  const int skip_inc =
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
-  struct tokenize_b_args arg = { cpi, td, NULL, 0 };
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct tokenize_b_args arg = { cpi, td, NULL, 0, allow_update_cdf };
   (void)rate;
   (void)mi_row;
   (void)mi_col;
   if (mbmi->skip) {
-    if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
-    av1_reset_skip_context(xd, mi_row, mi_col, bsize);
+    av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
     return;
   }
 
   if (!dry_run) {
-    td->counts->skip[ctx][0] += skip_inc;
     av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
-                                  av1_update_and_record_txb_context, &arg);
+                                  av1_update_and_record_txb_context, &arg,
+                                  num_planes);
   } else if (dry_run == DRY_RUN_NORMAL) {
     av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
-                                  av1_update_txb_context_b, &arg);
+                                  av1_update_txb_context_b, &arg, num_planes);
   } else {
     printf("DRY_RUN_COSTCOEFFS is not supported yet\n");
     assert(0);
   }
 }
-
-static void find_new_prob(unsigned int *branch_cnt, aom_prob *oldp,
-                          int *savings, int *update, aom_writer *const bc) {
-  const aom_prob upd = DIFF_UPDATE_PROB;
-  int u = 0;
-  aom_prob newp = get_binary_prob(branch_cnt[0], branch_cnt[1]);
-  int s = av1_prob_diff_update_savings_search(branch_cnt, *oldp, &newp, upd, 1);
-
-  if (s > 0 && newp != *oldp) u = 1;
-
-  if (u)
-    *savings += s - (int)(av1_cost_zero(upd));  // TODO(jingning): 1?
-  else
-    *savings -= (int)(av1_cost_zero(upd));
-
-  if (update) {
-    ++update[u];
-    return;
-  }
-
-  aom_write(bc, u, upd);
-  if (u) {
-    /* send/use new probability */
-    av1_write_prob_diff_update(bc, newp, *oldp);
-    *oldp = newp;
-  }
-}
-
-static void write_txb_probs(aom_writer *const bc, AV1_COMP *cpi,
-                            TX_SIZE tx_size) {
-  FRAME_CONTEXT *fc = cpi->common.fc;
-  FRAME_COUNTS *counts = cpi->td.counts;
-  int savings = 0;
-  int update[2] = { 0, 0 };
-  int plane, ctx, level;
-
-  for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) {
-    find_new_prob(counts->txb_skip[tx_size][ctx], &fc->txb_skip[tx_size][ctx],
-                  &savings, update, bc);
-  }
-
-  for (plane = 0; plane < PLANE_TYPES; ++plane) {
-    for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
-      find_new_prob(counts->nz_map[tx_size][plane][ctx],
-                    &fc->nz_map[tx_size][plane][ctx], &savings, update, bc);
-    }
-  }
-
-  for (plane = 0; plane < PLANE_TYPES; ++plane) {
-    for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) {
-      find_new_prob(counts->eob_flag[tx_size][plane][ctx],
-                    &fc->eob_flag[tx_size][plane][ctx], &savings, update, bc);
-    }
-  }
-
-  for (level = 0; level < NUM_BASE_LEVELS; ++level) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane) {
-      for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) {
-        find_new_prob(counts->coeff_base[tx_size][plane][level][ctx],
-                      &fc->coeff_base[tx_size][plane][level][ctx], &savings,
-                      update, bc);
-      }
-    }
-  }
-
-  for (plane = 0; plane < PLANE_TYPES; ++plane) {
-    for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
-      find_new_prob(counts->coeff_lps[tx_size][plane][ctx],
-                    &fc->coeff_lps[tx_size][plane][ctx], &savings, update, bc);
-    }
-  }
-
-  // Decide if to update the model for this tx_size
-  if (update[1] == 0 || savings < 0) {
-    aom_write_bit(bc, 0);
-    return;
-  }
-  aom_write_bit(bc, 1);
-
-  for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) {
-    find_new_prob(counts->txb_skip[tx_size][ctx], &fc->txb_skip[tx_size][ctx],
-                  &savings, NULL, bc);
-  }
-
-  for (plane = 0; plane < PLANE_TYPES; ++plane) {
-    for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
-      find_new_prob(counts->nz_map[tx_size][plane][ctx],
-                    &fc->nz_map[tx_size][plane][ctx], &savings, NULL, bc);
-    }
-  }
-
-  for (plane = 0; plane < PLANE_TYPES; ++plane) {
-    for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) {
-      find_new_prob(counts->eob_flag[tx_size][plane][ctx],
-                    &fc->eob_flag[tx_size][plane][ctx], &savings, NULL, bc);
-    }
-  }
-
-  for (level = 0; level < NUM_BASE_LEVELS; ++level) {
-    for (plane = 0; plane < PLANE_TYPES; ++plane) {
-      for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) {
-        find_new_prob(counts->coeff_base[tx_size][plane][level][ctx],
-                      &fc->coeff_base[tx_size][plane][level][ctx], &savings,
-                      NULL, bc);
-      }
-    }
-  }
-
-  for (plane = 0; plane < PLANE_TYPES; ++plane) {
-    for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
-      find_new_prob(counts->coeff_lps[tx_size][plane][ctx],
-                    &fc->coeff_lps[tx_size][plane][ctx], &savings, NULL, bc);
-    }
-  }
-}
-
-void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w) {
-  const TX_MODE tx_mode = cpi->common.tx_mode;
-  const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
-  TX_SIZE tx_size;
-  int ctx, plane;
-
-#if LV_MAP_PROB
-  return;
-#endif
-
-  for (plane = 0; plane < PLANE_TYPES; ++plane)
-    for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
-      av1_cond_prob_diff_update(w, &cpi->common.fc->dc_sign[plane][ctx],
-                                cpi->td.counts->dc_sign[plane][ctx], 1);
-
-  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
-    write_txb_probs(w, cpi, tx_size);
-}
-
-#if CONFIG_TXK_SEL
-int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                            int block, int blk_row, int blk_col,
-                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                            const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
-                            int use_fast_coef_costing, RD_STATS *rd_stats) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  TX_TYPE txk_start = DCT_DCT;
-  TX_TYPE txk_end = TX_TYPES - 1;
-  TX_TYPE best_tx_type = txk_start;
-  int64_t best_rd = INT64_MAX;
-  uint8_t best_eob = 0;
-  const int coeff_ctx = combine_entropy_contexts(*a, *l);
-  RD_STATS best_rd_stats;
-  TX_TYPE tx_type;
-
-  av1_invalid_rd_stats(&best_rd_stats);
-
-  for (tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
-    if (plane == 0) mbmi->txk_type[(blk_row << 4) + blk_col] = tx_type;
-    TX_TYPE ref_tx_type = av1_get_tx_type(get_plane_type(plane), xd, blk_row,
-                                          blk_col, block, tx_size);
-    if (tx_type != ref_tx_type) {
-      // use av1_get_tx_type() to check if the tx_type is valid for the current
-      // mode if it's not, we skip it here.
-      continue;
-    }
-
-#if CONFIG_EXT_TX
-    const int is_inter = is_inter_block(mbmi);
-    const TxSetType tx_set_type =
-        get_ext_tx_set_type(get_min_tx_size(tx_size), mbmi->sb_type, is_inter,
-                            cm->reduced_tx_set_used);
-    if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
-#endif  // CONFIG_EXT_TX
-
-    RD_STATS this_rd_stats;
-    av1_invalid_rd_stats(&this_rd_stats);
-    av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                    coeff_ctx, AV1_XFORM_QUANT_FP);
-    av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l, 1);
-    av1_dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size,
-                   &this_rd_stats.dist, &this_rd_stats.sse,
-                   OUTPUT_HAS_PREDICTED_PIXELS);
-    const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-    this_rd_stats.rate =
-        av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size,
-                        scan_order, a, l, use_fast_coef_costing);
-    int rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
-
-    if (rd < best_rd) {
-      best_rd = rd;
-      best_rd_stats = this_rd_stats;
-      best_tx_type = tx_type;
-      best_eob = x->plane[plane].txb_entropy_ctx[block];
-    }
-  }
-
-  av1_merge_rd_stats(rd_stats, &best_rd_stats);
-
-  if (best_eob == 0 && is_inter_block(mbmi)) best_tx_type = DCT_DCT;
-
-  if (plane == 0) mbmi->txk_type[(blk_row << 4) + blk_col] = best_tx_type;
-  x->plane[plane].txb_entropy_ctx[block] = best_eob;
-
-  if (!is_inter_block(mbmi)) {
-    // intra mode needs decoded result such that the next transform block
-    // can use it for prediction.
-    av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                    coeff_ctx, AV1_XFORM_QUANT_FP);
-    av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l, 1);
-
-    av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
-                                       x->plane[plane].eobs[block]);
-  }
-  return best_rd;
-}
-#endif  // CONFIG_TXK_SEL
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
index 76a04bb41..aa847ad62 100644
--- a/third_party/aom/av1/encoder/encodetxb.h
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -12,7 +12,8 @@
 #ifndef ENCODETXB_H_
 #define ENCODETXB_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "av1/common/blockd.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/common/txb_common.h"
@@ -25,6 +26,7 @@ extern "C" {
 
 typedef struct TxbInfo {
   tran_low_t *qcoeff;
+  uint8_t *levels;  // absolute values and clamped to 255.
   tran_low_t *dqcoeff;
   const tran_low_t *tcoeff;
   const int16_t *dequant;
@@ -33,7 +35,7 @@ typedef struct TxbInfo {
   TX_SIZE txs_ctx;
   TX_TYPE tx_type;
   int bwl;
-  int stride;
+  int width;
   int height;
   int eob;
   int seg_eob;
@@ -41,51 +43,27 @@ typedef struct TxbInfo {
   TXB_CTX *txb_ctx;
   int64_t rdmult;
   const LV_MAP_CTX_TABLE *coeff_ctx_table;
+  const qm_val_t *iqmatrix;
+  int tx_type_cost;
 } TxbInfo;
 
-typedef struct TxbCache {
-  int nz_count_arr[MAX_TX_SQUARE];
-  int nz_ctx_arr[MAX_TX_SQUARE];
-  int base_count_arr[NUM_BASE_LEVELS][MAX_TX_SQUARE];
-  int base_mag_arr[MAX_TX_SQUARE]
-                  [2];  // [0]: max magnitude [1]: num of max magnitude
-  int base_ctx_arr[NUM_BASE_LEVELS][MAX_TX_SQUARE];
-
-  int br_count_arr[MAX_TX_SQUARE];
-  int br_mag_arr[MAX_TX_SQUARE]
-                [2];  // [0]: max magnitude [1]: num of max magnitude
-  int br_ctx_arr[MAX_TX_SQUARE];
-} TxbCache;
-
-typedef struct TxbProbs {
-  const aom_prob *dc_sign_prob;
-  const aom_prob *nz_map;
-  aom_prob (*coeff_base)[COEFF_BASE_CONTEXTS];
-  const aom_prob *coeff_lps;
-  const aom_prob *eob_flag;
-  const aom_prob *txb_skip;
-#if BR_NODE
-  const aom_prob *coeff_br;
-#endif
-} TxbProbs;
-
 void av1_alloc_txb_buf(AV1_COMP *cpi);
 void av1_free_txb_buf(AV1_COMP *cpi);
-int av1_cost_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
-                        int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                        TXB_CTX *txb_ctx);
+int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
+                        const int plane, const int blk_row, const int blk_col,
+                        const int block, const TX_SIZE tx_size,
+                        const TXB_CTX *const txb_ctx);
 void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
-                          aom_writer *w, int blk_row, int blk_col, int block,
-                          int plane, TX_SIZE tx_size, const tran_low_t *tcoeff,
+                          aom_writer *w, int blk_row, int blk_col, int plane,
+                          TX_SIZE tx_size, const tran_low_t *tcoeff,
                           uint16_t eob, TXB_CTX *txb_ctx);
-void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
-                         aom_writer *w, int plane);
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
+                         int mi_col, aom_writer *w, BLOCK_SIZE bsize);
 int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
                                 const SCAN_ORDER *scan_order, int eob);
 void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
                             RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
-                            const int mi_row, const int mi_col);
-void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w);
+                            int mi_row, int mi_col, uint8_t allow_update_cdf);
 
 void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col,
                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
@@ -98,16 +76,10 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
 void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
                           int mi_row, int mi_col);
 
-#if CONFIG_TXK_SEL
-int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                            int block, int blk_row, int blk_col,
-                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                            const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
-                            int use_fast_coef_costing, RD_STATS *rd_stats);
-#endif
-int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
-                     int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                     TXB_CTX *txb_ctx, int fast_mode);
+void hbt_destroy();
+int av1_optimize_txb_new(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                         int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                         const TXB_CTX *txb_ctx, int *rate_cost, int sharpness);
 #ifdef __cplusplus
 }
 #endif
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
index edc9b1d61..404af2e7c 100644
--- a/third_party/aom/av1/encoder/ethread.c
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -18,15 +18,13 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
   for (int i = 0; i < REFERENCE_MODES; i++)
     td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
 
-#if CONFIG_GLOBAL_MOTION
-  for (int i = 0; i < TOTAL_REFS_PER_FRAME; i++)
+  for (int i = 0; i < REF_FRAMES; i++)
     td->rd_counts.global_motion_used[i] +=
         td_t->rd_counts.global_motion_used[i];
-#endif  // CONFIG_GLOBAL_MOTION
 
   td->rd_counts.compound_ref_used_flag |=
       td_t->rd_counts.compound_ref_used_flag;
-  td->rd_counts.single_ref_used_flag |= td_t->rd_counts.single_ref_used_flag;
+  td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
 }
 
 static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
@@ -53,7 +51,7 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const int tile_cols = cm->tile_cols;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  const int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols);
+  int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols);
   int i;
 
   av1_init_tile_data(cpi);
@@ -81,29 +79,19 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
                         aom_memalign(32, sizeof(*thread_data->td)));
         av1_zero(*thread_data->td);
 
-// Set up pc_tree.
-#if !CONFIG_CB4X4
-        thread_data->td->leaf_tree = NULL;
-#endif
+        // Set up pc_tree.
         thread_data->td->pc_tree = NULL;
         av1_setup_pc_tree(cm, thread_data->td);
 
-#if CONFIG_MOTION_VAR
-#if CONFIG_HIGHBITDEPTH
-        int buf_scaler = 2;
-#else
-        int buf_scaler = 1;
-#endif
         CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
                         (uint8_t *)aom_memalign(
-                            16,
-                            buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                sizeof(*thread_data->td->above_pred_buf)));
+                            16, MAX_MB_PLANE * MAX_SB_SQUARE *
+                                    sizeof(*thread_data->td->above_pred_buf)));
         CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
                         (uint8_t *)aom_memalign(
-                            16,
-                            buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                sizeof(*thread_data->td->left_pred_buf)));
+                            16, MAX_MB_PLANE * MAX_SB_SQUARE *
+                                    sizeof(*thread_data->td->left_pred_buf)));
+
         CHECK_MEM_ERROR(
             cm, thread_data->td->wsrc_buf,
             (int32_t *)aom_memalign(
@@ -112,7 +100,6 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
             cm, thread_data->td->mask_buf,
             (int32_t *)aom_memalign(
                 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
-#endif
         // Allocate frame counters in thread data.
         CHECK_MEM_ERROR(cm, thread_data->td->counts,
                         aom_calloc(1, sizeof(*thread_data->td->counts)));
@@ -133,6 +120,8 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
 
       winterface->sync(worker);
     }
+  } else {
+    num_workers = AOMMIN(num_workers, cpi->num_workers);
   }
 
   for (i = 0; i < num_workers; i++) {
@@ -148,16 +137,13 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
       thread_data->td->rd_counts = cpi->td.rd_counts;
-#if CONFIG_MOTION_VAR
       thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
       thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
       thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
       thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
-#endif
     }
-    if (thread_data->td->counts != &cpi->common.counts) {
-      memcpy(thread_data->td->counts, &cpi->common.counts,
-             sizeof(cpi->common.counts));
+    if (thread_data->td->counts != &cpi->counts) {
+      memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
     }
 
     if (i < num_workers - 1)
@@ -187,14 +173,24 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
   for (i = 0; i < num_workers; i++) {
     AVxWorker *const worker = &cpi->workers[i];
     EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
-
+    cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile;
     // Accumulate counters.
     if (i < cpi->num_workers - 1) {
-      av1_accumulate_frame_counts(&cm->counts, thread_data->td->counts);
+      av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
       accumulate_rd_opt(&cpi->td, thread_data->td);
-#if CONFIG_VAR_TX
       cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
-#endif
     }
   }
 }
+
+// Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int'
+// members, so we treat it as an array, and sum over the whole length.
+void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
+                                 const FRAME_COUNTS *counts) {
+  unsigned int *const acc = (unsigned int *)acc_counts;
+  const unsigned int *const cnt = (const unsigned int *)counts;
+
+  const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int);
+
+  for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i];
+}
diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h
index 6c30a3e5c..b6b1fed4e 100644
--- a/third_party/aom/av1/encoder/ethread.h
+++ b/third_party/aom/av1/encoder/ethread.h
@@ -27,6 +27,9 @@ typedef struct EncWorkerData {
 
 void av1_encode_tiles_mt(struct AV1_COMP *cpi);
 
+void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
+                                 const struct FRAME_COUNTS *counts);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c
index 007694a38..e9621a574 100644
--- a/third_party/aom/av1/encoder/extend.c
+++ b/third_party/aom/av1/encoder/extend.c
@@ -57,7 +57,6 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
                                          uint8_t *dst8, int dst_pitch, int w,
                                          int h, int extend_top, int extend_left,
@@ -100,7 +99,6 @@ static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
     dst_ptr2 += dst_pitch;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                YV12_BUFFER_CONFIG *dst) {
@@ -124,7 +122,6 @@ void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
   const int eb_uv = eb_y >> uv_height_subsampling;
   const int er_uv = er_y >> uv_width_subsampling;
 
-#if CONFIG_HIGHBITDEPTH
   if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
                                  dst->y_stride, src->y_crop_width,
@@ -139,7 +136,6 @@ void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
         src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
     return;
   }
-#endif  // CONFIG_HIGHBITDEPTH
 
   copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
                         dst->y_stride, src->y_crop_width, src->y_crop_height,
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
index 2a4200887..113c068c1 100644
--- a/third_party/aom/av1/encoder/firstpass.c
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -13,8 +13,8 @@
 #include <math.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
-#include "./aom_scale_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
@@ -27,9 +27,7 @@
 #include "av1/common/entropymv.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"  // av1_setup_dst_planes()
-#if CONFIG_LV_MAP
 #include "av1/common/txb_common.h"
-#endif
 #include "av1/encoder/aq_variance.h"
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/block.h"
@@ -41,6 +39,7 @@
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/rd.h"
+#include "av1/encoder/dwt.h"
 
 #define OUTPUT_FPF 0
 #define ARF_STATS_OUTPUT 0
@@ -143,6 +142,7 @@ static void zero_stats(FIRSTPASS_STATS *section) {
   section->frame = 0.0;
   section->weight = 0.0;
   section->intra_error = 0.0;
+  section->frame_avg_wavelet_energy = 0.0;
   section->coded_error = 0.0;
   section->sr_coded_error = 0.0;
   section->pcnt_inter = 0.0;
@@ -169,6 +169,7 @@ static void accumulate_stats(FIRSTPASS_STATS *section,
   section->frame += frame->frame;
   section->weight += frame->weight;
   section->intra_error += frame->intra_error;
+  section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy;
   section->coded_error += frame->coded_error;
   section->sr_coded_error += frame->sr_coded_error;
   section->pcnt_inter += frame->pcnt_inter;
@@ -195,6 +196,7 @@ static void subtract_stats(FIRSTPASS_STATS *section,
   section->frame -= frame->frame;
   section->weight -= frame->weight;
   section->intra_error -= frame->intra_error;
+  section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
   section->coded_error -= frame->coded_error;
   section->sr_coded_error -= frame->sr_coded_error;
   section->pcnt_inter -= frame->pcnt_inter;
@@ -305,7 +307,6 @@ static unsigned int get_prediction_error(BLOCK_SIZE bsize,
   return sse;
 }
 
-#if CONFIG_HIGHBITDEPTH
 static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
                                                       int bd) {
   switch (bd) {
@@ -345,7 +346,6 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
   fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
   return sse;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 // Refine the motion search range according to the frame dimension
 // for first pass test.
@@ -361,10 +361,10 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
                                      const MV *ref_mv, MV *best_mv,
                                      int *best_motion_err) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MV tmp_mv = { 0, 0 };
+  MV tmp_mv = kZeroMv;
   MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
   int num00, tmp_err, n;
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
   const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
 
@@ -376,11 +376,9 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
 
   // Override the default variance function to use MSE.
   v_fn_ptr.vf = get_block_variance_fn(bsize);
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
   }
-#endif  // CONFIG_HIGHBITDEPTH
 
   // Center the initial step/diamond search on best mv.
   tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
@@ -459,7 +457,6 @@ static void set_first_pass_params(AV1_COMP *cpi) {
   cpi->rc.frames_to_key = INT_MAX;
 }
 
-#if CONFIG_EXT_REFS
 static double raw_motion_error_stdev(int *raw_motion_err_list,
                                      int raw_motion_err_counts) {
   int64_t sum_raw_err = 0;
@@ -482,7 +479,6 @@ static double raw_motion_error_stdev(int *raw_motion_err_list,
   raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts);
   return raw_err_stdev;
 }
-#endif  // CONFIG_EXT_REFS
 
 #define UL_INTRA_THRESH 50
 #define INVALID_ROW -1
@@ -490,6 +486,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   int mb_row, mb_col;
   MACROBLOCK *const x = &cpi->td.mb;
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   TileInfo tile;
   struct macroblock_plane *const p = x->plane;
@@ -500,6 +497,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
 
   int recon_yoffset, recon_uvoffset;
   int64_t intra_error = 0;
+  int64_t frame_avg_wavelet_energy = 0;
   int64_t coded_error = 0;
   int64_t sr_coded_error = 0;
 
@@ -515,9 +513,8 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   int image_data_start_row = INVALID_ROW;
   int new_mv_count = 0;
   int sum_in_vectors = 0;
-  MV lastmv = { 0, 0 };
+  MV lastmv = kZeroMv;
   TWO_PASS *twopass = &cpi->twopass;
-  const MV zero_mv = { 0, 0 };
   int recon_y_stride, recon_uv_stride, uv_mb_height;
 
   YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
@@ -529,18 +526,12 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   BufferPool *const pool = cm->buffer_pool;
   const int qindex = find_fp_qindex(cm->bit_depth);
   const int mb_scale = mi_size_wide[BLOCK_16X16];
-#if CONFIG_PVQ
-  PVQ_QUEUE pvq_q;
-  od_adapt_ctx pvq_context;
-#endif
 
-#if CONFIG_EXT_REFS
   int *raw_motion_err_list;
   int raw_motion_err_counts = 0;
   CHECK_MEM_ERROR(
       cm, raw_motion_err_list,
       aom_calloc(cm->mb_rows * cm->mb_cols, sizeof(*raw_motion_err_list)));
-#endif  // CONFIG_EXT_REFS
   // First pass code requires valid last and new frame buffers.
   assert(new_yv12 != NULL);
   assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
@@ -555,7 +546,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
 
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
-  x->e_mbd.mi[0]->mbmi.sb_type = BLOCK_16X16;
+  x->e_mbd.mi[0]->sb_type = BLOCK_16X16;
 
   intra_factor = 0.0;
   brightness_factor = 0.0;
@@ -564,80 +555,34 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   set_first_pass_params(cpi);
   av1_set_quantizer(cm, qindex);
 
-  av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+  av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y,
+                         num_planes);
 
-  av1_setup_src_planes(x, cpi->source, 0, 0);
-  av1_setup_dst_planes(xd->plane, cm->sb_size, new_yv12, 0, 0);
+  av1_setup_src_planes(x, cpi->source, 0, 0, num_planes);
+  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, new_yv12, 0, 0, 0,
+                       num_planes);
 
   if (!frame_is_intra_only(cm)) {
-    av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+    av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL, num_planes);
   }
 
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
 
-#if CONFIG_CFL
   // Don't store luma on the fist pass since chroma is not computed
-  xd->cfl->store_y = 0;
-#endif  // CONFIG_CFL
+  xd->cfl.store_y = 0;
   av1_frame_init_quantizer(cpi);
 
-#if CONFIG_PVQ
-  // For pass 1 of 2-pass encoding, init here for PVQ for now.
-  {
-    pvq_q.buf_len = 5000;
-    CHECK_MEM_ERROR(cm, pvq_q.buf,
-                    aom_malloc(pvq_q.buf_len * sizeof(PVQ_INFO)));
-    pvq_q.curr_pos = 0;
-    x->pvq_coded = 0;
-
-    x->pvq_q = &pvq_q;
-
-    // TODO(yushin): Since this init step is also called in 2nd pass,
-    // or 1-pass encoding, consider factoring out it as a function.
-    // TODO(yushin)
-    // If activity masking is enabled, change below to OD_HVS_QM
-    x->daala_enc.qm = OD_FLAT_QM;  // Hard coded. Enc/dec required to sync.
-    x->daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA;
-    x->daala_enc.pvq_norm_lambda_dc = OD_PVQ_LAMBDA;
-
-    od_init_qm(x->daala_enc.state.qm, x->daala_enc.state.qm_inv,
-               x->daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
-#if !CONFIG_ANS
-    od_ec_enc_init(&x->daala_enc.w.ec, 65025);
-    od_ec_enc_reset(&x->daala_enc.w.ec);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-  }
-#endif
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
+  for (i = 0; i < num_planes; ++i) {
     p[i].coeff = ctx->coeff[i];
     p[i].qcoeff = ctx->qcoeff[i];
     pd[i].dqcoeff = ctx->dqcoeff[i];
-#if CONFIG_PVQ
-    pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
-#endif
     p[i].eobs = ctx->eobs[i];
-#if CONFIG_LV_MAP
     p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
-#endif
   }
 
   av1_init_mv_probs(cm);
-#if CONFIG_LV_MAP
   av1_init_lv_map(cm);
-#endif
-#if CONFIG_ADAPT_SCAN
-  av1_init_scan_order(cm);
-  av1_deliver_eob_threshold(cm, xd);
-#endif
-  av1_convolve_init(cm);
-#if CONFIG_PVQ
-  od_adapt_ctx_reset(&pvq_context, 0);
-  x->daala_enc.state.adapt = &pvq_context;
-#endif  // CONFIG_PVQ
   av1_initialize_rd_consts(cpi);
 
   // Tiling is ignored in the first pass.
@@ -648,7 +593,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
 
   for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
-    MV best_ref_mv = { 0, 0 };
+    MV best_ref_mv = kZeroMv;
 
     // Reset above block coeffs.
     xd->up_available = (mb_row != 0);
@@ -674,31 +619,28 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
 
       aom_clear_system_state();
 
+      const int idx_str = xd->mi_stride * mb_row * mb_scale + mb_col * mb_scale;
+      xd->mi = cm->mi_grid_visible + idx_str;
+      xd->mi[0] = cm->mi + idx_str;
       xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
       xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
       xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
       xd->left_available = (mb_col != 0);
-      xd->mi[0]->mbmi.sb_type = bsize;
-      xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+      xd->mi[0]->sb_type = bsize;
+      xd->mi[0]->ref_frame[0] = INTRA_FRAME;
       set_mi_row_col(xd, &tile, mb_row * mb_scale, mi_size_high[bsize],
-                     mb_col * mb_scale, mi_size_wide[bsize],
-#if CONFIG_DEPENDENT_HORZTILES
-                     cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                     cm->mi_rows, cm->mi_cols);
+                     mb_col * mb_scale, mi_size_wide[bsize], cm->mi_rows,
+                     cm->mi_cols);
 
-      set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize]);
+      set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes);
 
       // Do intra 16x16 prediction.
-      xd->mi[0]->mbmi.segment_id = 0;
-#if CONFIG_SUPERTX
-      xd->mi[0]->mbmi.segment_id_supertx = 0;
-#endif  // CONFIG_SUPERTX
-      xd->lossless[xd->mi[0]->mbmi.segment_id] = (qindex == 0);
-      xd->mi[0]->mbmi.mode = DC_PRED;
-      xd->mi[0]->mbmi.tx_size =
+      xd->mi[0]->segment_id = 0;
+      xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
+      xd->mi[0]->mode = DC_PRED;
+      xd->mi[0]->tx_size =
           use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
-      av1_encode_intra_block_plane(cm, x, bsize, 0, 0, mb_row * 2, mb_col * 2);
+      av1_encode_intra_block_plane(cpi, x, bsize, 0, 0, mb_row * 2, mb_col * 2);
       this_error = aom_get_mb_ss(x->plane[0].src_diff);
 
       // Keep a record of blocks that have almost no intra error residual
@@ -712,7 +654,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
         image_data_start_row = mb_row;
       }
 
-#if CONFIG_HIGHBITDEPTH
       if (cm->use_highbitdepth) {
         switch (cm->bit_depth) {
           case AOM_BITS_8: break;
@@ -725,7 +666,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
             return;
         }
       }
-#endif  // CONFIG_HIGHBITDEPTH
 
       aom_clear_system_state();
       log_intra = log(this_error + 1.0);
@@ -734,14 +674,10 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
       else
         intra_factor += 1.0;
 
-#if CONFIG_HIGHBITDEPTH
       if (cm->use_highbitdepth)
         level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
       else
         level_sample = x->plane[0].src.buf[0];
-#else
-      level_sample = x->plane[0].src.buf[0];
-#endif
       if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
         brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
       else
@@ -759,6 +695,15 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
       // Accumulate the intra error.
       intra_error += (int64_t)this_error;
 
+      int stride = x->plane[0].src.stride;
+      uint8_t *buf = x->plane[0].src.buf;
+      for (int r8 = 0; r8 < 2; ++r8)
+        for (int c8 = 0; c8 < 2; ++c8) {
+          int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+          frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
+              buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
+        }
+
 #if CONFIG_FP_MB_STATS
       if (cpi->use_fp_mb_stats) {
         // initialization
@@ -775,11 +720,10 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
       if (!frame_is_intra_only(cm)) {  // Do a motion search
         int tmp_err, motion_error, raw_motion_error;
         // Assume 0,0 motion with no mv overhead.
-        MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
+        MV mv = kZeroMv, tmp_mv = kZeroMv;
         struct buf_2d unscaled_last_source_buf_2d;
 
         xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-#if CONFIG_HIGHBITDEPTH
         if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
           motion_error = highbd_get_prediction_error(
               bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
@@ -787,10 +731,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
           motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                               &xd->plane[0].pre[0]);
         }
-#else
-        motion_error =
-            get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
-#endif  // CONFIG_HIGHBITDEPTH
 
         // Compute the motion error of the 0,0 motion using the last source
         // frame as the reference. Skip the further motion search on
@@ -799,7 +739,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
             cpi->unscaled_last_source->y_buffer + recon_yoffset;
         unscaled_last_source_buf_2d.stride =
             cpi->unscaled_last_source->y_stride;
-#if CONFIG_HIGHBITDEPTH
         if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
           raw_motion_error = highbd_get_prediction_error(
               bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
@@ -807,10 +746,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
           raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                                   &unscaled_last_source_buf_2d);
         }
-#else
-        raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                &unscaled_last_source_buf_2d);
-#endif  // CONFIG_HIGHBITDEPTH
 
         // TODO(pengchong): Replace the hard-coded threshold
         if (raw_motion_error > 25) {
@@ -822,7 +757,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
           // 0,0 based search as well.
           if (!is_zero_mv(&best_ref_mv)) {
             tmp_err = INT_MAX;
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
+            first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err);
 
             if (tmp_err < motion_error) {
               motion_error = tmp_err;
@@ -836,7 +771,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
             int gf_motion_error;
 
             xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-#if CONFIG_HIGHBITDEPTH
             if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
               gf_motion_error = highbd_get_prediction_error(
                   bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
@@ -844,12 +778,8 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
               gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                                      &xd->plane[0].pre[0]);
             }
-#else
-            gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                   &xd->plane[0].pre[0]);
-#endif  // CONFIG_HIGHBITDEPTH
 
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
+            first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv,
                                      &gf_motion_error);
 
             if (gf_motion_error < motion_error && gf_motion_error < this_error)
@@ -913,11 +843,11 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
           mv.row *= 8;
           mv.col *= 8;
           this_error = motion_error;
-          xd->mi[0]->mbmi.mode = NEWMV;
-          xd->mi[0]->mbmi.mv[0].as_mv = mv;
-          xd->mi[0]->mbmi.tx_size = TX_4X4;
-          xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
-          xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME;
+          xd->mi[0]->mode = NEWMV;
+          xd->mi[0]->mv[0].as_mv = mv;
+          xd->mi[0]->tx_size = TX_4X4;
+          xd->mi[0]->ref_frame[0] = LAST_FRAME;
+          xd->mi[0]->ref_frame[1] = NONE_FRAME;
           av1_build_inter_predictors_sby(cm, xd, mb_row * mb_scale,
                                          mb_col * mb_scale, NULL, bsize);
           av1_encode_sby_pass1(cm, x, bsize);
@@ -1006,9 +936,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
             }
           }
         }
-#if CONFIG_EXT_REFS
         raw_motion_err_list[raw_motion_err_counts++] = raw_motion_error;
-#endif  // CONFIG_EXT_REFS
       } else {
         sr_coded_error += (int64_t)this_error;
       }
@@ -1031,25 +959,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
 
     aom_clear_system_state();
   }
-#if CONFIG_EXT_REFS
   const double raw_err_stdev =
       raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts);
   aom_free(raw_motion_err_list);
-#endif  // CONFIG_EXT_REFS
-
-#if CONFIG_PVQ
-#if !CONFIG_ANS
-  od_ec_enc_clear(&x->daala_enc.w.ec);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-
-  x->pvq_q->last_pos = x->pvq_q->curr_pos;
-  x->pvq_q->curr_pos = 0;
-  x->pvq_q = NULL;
-
-  aom_free(pvq_q.buf);
-#endif
 
   // Clamp the image start to rows/2. This number of rows is discarded top
   // and bottom as dead data so rows / 2 means the frame is blank.
@@ -1083,6 +995,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
     fps.coded_error = (double)(coded_error >> 8) + min_err;
     fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
     fps.intra_error = (double)(intra_error >> 8) + min_err;
+    fps.frame_avg_wavelet_energy = (double)frame_avg_wavelet_energy;
     fps.count = 1.0;
     fps.pcnt_inter = (double)intercount / num_mbs;
     fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
@@ -1090,9 +1003,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
     fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
     fps.inactive_zone_rows = (double)image_data_start_row;
     fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
-#if CONFIG_EXT_REFS
     fps.raw_error_stdev = raw_err_stdev;
-#endif  // CONFIG_EXT_REFS
 
     if (mvcount > 0) {
       fps.MVr = (double)sum_mvr / mvcount;
@@ -1144,41 +1055,29 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
        ((twopass->this_frame_stats.intra_error /
          DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
     if (gld_yv12 != NULL) {
-#if CONFIG_EXT_REFS
-      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
-                 cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]);
-#else
-      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
-                 cm->ref_frame_map[cpi->lst_fb_idx]);
-#endif  // CONFIG_EXT_REFS
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]],
+                 cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]]);
     }
     twopass->sr_update_lag = 1;
   } else {
     ++twopass->sr_update_lag;
   }
 
-  aom_extend_frame_borders(new_yv12);
+  aom_extend_frame_borders(new_yv12, num_planes);
 
-// The frame we just compressed now becomes the last frame.
-#if CONFIG_EXT_REFS
+  // The frame we just compressed now becomes the last frame.
   ref_cnt_fb(pool->frame_bufs,
-             &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]],
-             cm->new_fb_idx);
-#else
-  ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+             &cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]],
              cm->new_fb_idx);
-#endif  // CONFIG_EXT_REFS
 
   // Special case for the first frame. Copy into the GF buffer as a second
   // reference.
-  if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) {
-#if CONFIG_EXT_REFS
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
-               cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]);
-#else
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
-               cm->ref_frame_map[cpi->lst_fb_idx]);
-#endif  // CONFIG_EXT_REFS
+  if (cm->current_video_frame == 0 &&
+      cpi->ref_fb_idx[GOLDEN_FRAME - 1] != INVALID_IDX) {
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]],
+               cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]]);
   }
 
   // Use this to see what the first pass reconstruction looks like.
@@ -1234,7 +1133,7 @@ static int get_twopass_worst_quality(const AV1_COMP *cpi,
                             : cpi->common.MBs;
     const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
     const double av_err_per_mb = section_err / active_mbs;
-    const double speed_term = 1.0 + 0.04 * oxcf->speed;
+    const double speed_term = 1.0;
     double ediv_size_correction;
     const int target_norm_bits_per_mb =
         (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
@@ -1662,21 +1561,6 @@ static int calculate_boost_bits(int frame_count, int boost,
                 0);
 }
 
-#if !CONFIG_EXT_REFS
-// Current limit on maximum number of active arfs in a GF/ARF group.
-#define MAX_ACTIVE_ARFS 2
-#define ARF_SLOT1 2
-#define ARF_SLOT2 3
-// This function indirects the choice of buffers for arfs.
-// At the moment the values are fixed but this may change as part of
-// the integration process with other codec features that swap buffers around.
-static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
-  arf_buffer_indices[0] = ARF_SLOT1;
-  arf_buffer_indices[1] = ARF_SLOT2;
-}
-#endif  // !CONFIG_EXT_REFS
-
-#if CONFIG_EXT_REFS
 #if USE_GF16_MULTI_LAYER
 // === GF Group of 16 ===
 #define GF_INTERVAL_16 16
@@ -2146,10 +2030,8 @@ static void define_gf_group_structure_16(AV1_COMP *cpi) {
       gf_group->bidir_pred_enabled[frame_index] = 0;
       for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx)
         gf_group->ref_fb_idx_map[frame_index][ref_idx] = ref_idx;
-      gf_group->refresh_idx[frame_index] =
-          cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME];
-      gf_group->refresh_flag[frame_index] =
-          cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME];
+      gf_group->refresh_idx[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1];
+      gf_group->refresh_flag[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1];
 
       continue;
     }
@@ -2247,19 +2129,16 @@ static void define_gf_group_structure_16(AV1_COMP *cpi) {
   }
 }
 #endif  // USE_GF16_MULTI_LAYER
-#endif  // CONFIG_EXT_REFS
 
 static void define_gf_group_structure(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
 
-#if CONFIG_EXT_REFS
 #if USE_GF16_MULTI_LAYER
   if (rc->baseline_gf_interval == 16) {
     define_gf_group_structure_16(cpi);
     return;
   }
 #endif  // USE_GF16_MULTI_LAYER
-#endif  // CONFIG_EXT_REFS
 
   TWO_PASS *const twopass = &cpi->twopass;
   GF_GROUP *const gf_group = &twopass->gf_group;
@@ -2267,7 +2146,6 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
   int frame_index = 0;
   const int key_frame = cpi->common.frame_type == KEY_FRAME;
 
-#if CONFIG_EXT_REFS
   // The use of bi-predictive frames are only enabled when following 3
   // conditions are met:
   // (1) ALTREF is enabled;
@@ -2275,7 +2153,7 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
   // (3) The bi-predictive group interval is strictly smaller than the
   //     golden group interval.
   const int is_bipred_enabled =
-      cpi->bwd_ref_allowed && rc->source_alt_ref_pending &&
+      cpi->extra_arf_allowed && rc->source_alt_ref_pending &&
       rc->bipred_group_interval &&
       rc->bipred_group_interval <=
           (rc->baseline_gf_interval - rc->source_alt_ref_pending);
@@ -2288,14 +2166,6 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
   int subgroup_interval[MAX_EXT_ARFS + 1];
   int is_sg_bipred_enabled = is_bipred_enabled;
   int accumulative_subgroup_interval = 0;
-#else
-  int mid_frame_idx;
-  unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
-#endif  // CONFIG_EXT_REFS
-
-#if !CONFIG_EXT_REFS
-  get_arf_buffer_indices(arf_buffer_indices);
-#endif  // !CONFIG_EXT_REFS
 
   // For key frames the frame target rate is already set and it
   // is also the golden frame.
@@ -2308,25 +2178,16 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
       gf_group->update_type[frame_index] = GF_UPDATE;
       gf_group->rf_level[frame_index] = GF_ARF_STD;
     }
-#if CONFIG_EXT_REFS
     gf_group->arf_update_idx[frame_index] = 0;
     gf_group->arf_ref_idx[frame_index] = 0;
-#else
-    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
-    gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
-#endif  // CONFIG_EXT_REFS
   }
 
-#if CONFIG_EXT_REFS
   gf_group->bidir_pred_enabled[frame_index] = 0;
   gf_group->brf_src_offset[frame_index] = 0;
-#endif  // CONFIG_EXT_REFS
 
   frame_index++;
 
-#if CONFIG_EXT_REFS
   bipred_frame_index++;
-#endif  // CONFIG_EXT_REFS
 
   // === [frame_index == 1] ===
   if (rc->source_alt_ref_pending) {
@@ -2335,21 +2196,13 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
     gf_group->arf_src_offset[frame_index] =
         (unsigned char)(rc->baseline_gf_interval - 1);
 
-#if CONFIG_EXT_REFS
     gf_group->arf_update_idx[frame_index] = 0;
     gf_group->arf_ref_idx[frame_index] = 0;
 
     gf_group->bidir_pred_enabled[frame_index] = 0;
     gf_group->brf_src_offset[frame_index] = 0;
-// NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames.
-#else
-    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
-    gf_group->arf_ref_idx[frame_index] =
-        arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
-                           rc->source_alt_ref_active];
-#endif  // CONFIG_EXT_REFS
-
-#if CONFIG_EXT_REFS
+    // NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames.
+
     // Work out the ARFs' positions in this gf group
     // NOTE(weitinglin): ALT_REFs' are indexed inversely, but coded in display
     // order (except for the original ARF). In the example of three ALT_REF's,
@@ -2370,11 +2223,9 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
     subgroup_interval[cpi->num_extra_arfs] =
         cpi->arf_pos_for_ovrly[cpi->num_extra_arfs] - frame_index -
         (cpi->num_extra_arfs == 0 ? 1 : 2);
-#endif  // CONFIG_EXT_REFS
 
     ++frame_index;
 
-#if CONFIG_EXT_REFS
     // Insert an extra ARF
     // === [frame_index == 2] ===
     if (cpi->num_extra_arfs) {
@@ -2387,43 +2238,12 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
       ++frame_index;
     }
     accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs];
-#else   // !CONFIG_EXT_ARFS
-    if (cpi->multi_arf_enabled) {
-      // Set aside a slot for a level 1 arf.
-      gf_group->update_type[frame_index] = ARF_UPDATE;
-      gf_group->rf_level[frame_index] = GF_ARF_LOW;
-      gf_group->arf_src_offset[frame_index] =
-          (unsigned char)((rc->baseline_gf_interval >> 1) - 1);
-      gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1];
-      gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
-      ++frame_index;
-    }
-#endif  // CONFIG_EXT_ARFS
   }
 
-#if !CONFIG_EXT_REFS
-  // Define middle frame
-  mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
-#endif  // !CONFIG_EXT_REFS
-
   for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
-#if !CONFIG_EXT_REFS
-    int arf_idx = 0;
-
-    if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
-      if (frame_index <= mid_frame_idx) arf_idx = 1;
-    }
-#endif  // !CONFIG_EXT_REFS
-
-#if CONFIG_EXT_REFS
     gf_group->arf_update_idx[frame_index] = which_arf;
     gf_group->arf_ref_idx[frame_index] = which_arf;
-#else
-    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
-    gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
-#endif  // CONFIG_EXT_REFS
 
-#if CONFIG_EXT_REFS
     // If we are going to have ARFs, check whether we can have BWDREF in this
     // subgroup, and further, whether we can have ARF subgroup which contains
     // the BWDREF subgroup but contained within the GF group:
@@ -2472,18 +2292,14 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
         bipred_group_end = 1;
       }
     } else {
-#endif  // CONFIG_EXT_REFS
       gf_group->update_type[frame_index] = LF_UPDATE;
       gf_group->rf_level[frame_index] = INTER_NORMAL;
-#if CONFIG_EXT_REFS
       gf_group->bidir_pred_enabled[frame_index] = 0;
       gf_group->brf_src_offset[frame_index] = 0;
     }
-#endif  // CONFIG_EXT_REFS
 
     ++frame_index;
 
-#if CONFIG_EXT_REFS
     // Check if we need to update the ARF.
     if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 &&
         frame_index > cpi->arf_pos_for_ovrly[which_arf]) {
@@ -2503,25 +2319,19 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
         ++frame_index;
       }
     }
-#endif  // CONFIG_EXT_REFS
   }
 
-// NOTE: We need to configure the frame at the end of the sequence + 1 that will
-//       be the start frame for the next group. Otherwise prior to the call to
-//       av1_rc_get_second_pass_params() the data will be undefined.
-#if CONFIG_EXT_REFS
+  // NOTE: We need to configure the frame at the end of the sequence + 1 that
+  // will
+  //       be the start frame for the next group. Otherwise prior to the call to
+  //       av1_rc_get_second_pass_params() the data will be undefined.
   gf_group->arf_update_idx[frame_index] = 0;
   gf_group->arf_ref_idx[frame_index] = 0;
-#else
-  gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
-  gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
-#endif  // CONFIG_EXT_REFS
 
   if (rc->source_alt_ref_pending) {
     gf_group->update_type[frame_index] = OVERLAY_UPDATE;
     gf_group->rf_level[frame_index] = INTER_NORMAL;
 
-#if CONFIG_EXT_REFS
     cpi->arf_pos_in_gf[0] = 1;
     if (cpi->num_extra_arfs) {
       // Overwrite the update_type for extra-ARF's corresponding internal
@@ -2534,21 +2344,13 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
         gf_group->rf_level[cpi->arf_pos_for_ovrly[i]] = INTER_NORMAL;
       }
     }
-#else
-    // Final setup for second arf and its overlay.
-    if (cpi->multi_arf_enabled) {
-      gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
-    }
-#endif  // CONFIG_EXT_REFS
   } else {
     gf_group->update_type[frame_index] = GF_UPDATE;
     gf_group->rf_level[frame_index] = GF_ARF_STD;
   }
 
-#if CONFIG_EXT_REFS
   gf_group->bidir_pred_enabled[frame_index] = 0;
   gf_group->brf_src_offset[frame_index] = 0;
-#endif  // CONFIG_EXT_REFS
 }
 
 static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
@@ -2566,18 +2368,11 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
   int64_t total_group_bits = gf_group_bits;
   double modified_err = 0.0;
   double err_fraction;
-  int mid_boost_bits = 0;
-#if CONFIG_EXT_REFS
   int ext_arf_boost[MAX_EXT_ARFS];
-#else
-  int mid_frame_idx;
-#endif  // CONFIG_EXT_REFS
 
   define_gf_group_structure(cpi);
 
-#if CONFIG_EXT_REFS
   av1_zero_array(ext_arf_boost, MAX_EXT_ARFS);
-#endif  // CONFIG_EXT_REFS
 
   key_frame = cpi->common.frame_type == KEY_FRAME;
 
@@ -2607,24 +2402,14 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
 
     ++frame_index;
 
-#if CONFIG_EXT_REFS
     // Skip all the extra-ARF's right after ARF at the starting segment of
     // the current GF group.
     if (cpi->num_extra_arfs) {
       while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
         ++frame_index;
     }
-#else   // !CONFIG_EXT_ARFS
-    // Set aside a slot for a level 1 arf.
-    if (cpi->multi_arf_enabled) ++frame_index;
-#endif  // CONFIG_EXT_ARFS
   }
 
-#if !CONFIG_EXT_REFS
-  // Define middle frame
-  mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
-#endif  // !CONFIG_EXT_REFS
-
   // Allocate bits to the other frames in the group.
   for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
     if (EOF == input_stats(twopass, &frame_stats)) break;
@@ -2638,15 +2423,9 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
 
     target_frame_size = (int)((double)total_group_bits * err_fraction);
 
-    if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
-      mid_boost_bits += (target_frame_size >> 4);
-      target_frame_size -= (target_frame_size >> 4);
-    }
-
     target_frame_size =
         clamp(target_frame_size, 0, AOMMIN(max_bits, (int)total_group_bits));
 
-#if CONFIG_EXT_REFS
     if (gf_group->update_type[frame_index] == BRF_UPDATE) {
       // Boost up the allocated bits on BWDREF_FRAME
       gf_group->bit_allocation[frame_index] =
@@ -2662,28 +2441,22 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
     } else {
       assert(gf_group->update_type[frame_index] == LF_UPDATE ||
              gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
-#endif  // CONFIG_EXT_REFS
       gf_group->bit_allocation[frame_index] = target_frame_size;
-#if CONFIG_EXT_REFS
     }
-#endif  // CONFIG_EXT_REFS
 
     ++frame_index;
 
-#if CONFIG_EXT_REFS
     // Skip all the extra-ARF's.
     if (cpi->num_extra_arfs) {
       while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
         ++frame_index;
     }
-#endif  // CONFIG_EXT_REFS
   }
 
   // NOTE: We need to configure the frame at the end of the sequence + 1 that
   //       will be the start frame for the next group. Otherwise prior to the
   //       call to av1_rc_get_second_pass_params() the data will be undefined.
   if (rc->source_alt_ref_pending) {
-#if CONFIG_EXT_REFS
     if (cpi->num_extra_arfs) {
       // NOTE: For bit allocation, move the allocated bits associated with
       //       INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE.
@@ -2702,18 +2475,7 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
         gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]] = 0;
       }
     }
-#else
-    // Final setup for second arf and its overlay.
-    if (cpi->multi_arf_enabled) {
-      gf_group->bit_allocation[2] =
-          gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
-      gf_group->bit_allocation[mid_frame_idx] = 0;
-    }
-#endif  // CONFIG_EXT_REFS
   }
-
-  // Note whether multi-arf was enabled this group for next time.
-  cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled;
 }
 
 // Analyse and define a gf/arf group.
@@ -2761,10 +2523,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   const int is_key_frame = frame_is_intra_only(cm);
   const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
 
-#if CONFIG_EXT_REFS
   cpi->extra_arf_allowed = 1;
-  cpi->bwd_ref_allowed = 1;
-#endif  // CONFIG_EXT_REFS
 
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
@@ -2826,15 +2585,9 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     }
   }
 
-#if CONFIG_EXT_REFS || CONFIG_BGSPRITE
   double avg_sr_coded_error = 0;
   double avg_raw_err_stdev = 0;
   int non_zero_stdev_count = 0;
-#endif  // CONFIG_EXT_REFS || CONFIG_BGSPRITE
-#if CONFIG_BGSPRITE
-  double avg_pcnt_second_ref = 0;
-  int non_zero_pcnt_second_ref_count = 0;
-#endif
 
   i = 0;
   while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
@@ -2859,20 +2612,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     accumulate_frame_motion_stats(
         &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
         &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-#if CONFIG_EXT_REFS || CONFIG_BGSPRITE
     // sum up the metric values of current gf group
     avg_sr_coded_error += next_frame.sr_coded_error;
     if (fabs(next_frame.raw_error_stdev) > 0.000001) {
       non_zero_stdev_count++;
       avg_raw_err_stdev += next_frame.raw_error_stdev;
     }
-#endif  // CONFIG_EXT_REFS || CONFIG_BGSPRITE
-#if CONFIG_BGSPRITE
-    if (this_frame->pcnt_second_ref) {
-      avg_pcnt_second_ref += this_frame->pcnt_second_ref;
-    }
-    non_zero_pcnt_second_ref_count++;
-#endif  // CONFIG_BGSPRITE
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
@@ -2912,18 +2657,14 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
              (abs_mv_in_out_accumulator > 3.0) ||
              (mv_in_out_accumulator < -2.0) ||
              ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
-#if CONFIG_EXT_REFS
       // If GF group interval is < 12, we force it to be 8. Otherwise,
       // if it is >= 12, we keep it as is.
       // NOTE: 'i' is 1 more than the GF group interval candidate that is being
       //       checked.
       if (i == (8 + 1) || i >= (12 + 1)) {
-#endif  // CONFIG_EXT_REFS
         boost_score = old_boost_score;
         break;
-#if CONFIG_EXT_REFS
       }
-#endif  // CONFIG_EXT_REFS
     }
 
     *this_frame = next_frame;
@@ -2934,12 +2675,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // Was the group length constrained by the requirement for a new KF?
   rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
 
-#if CONFIG_EXT_REFS || CONFIG_BGSPRITE
   const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
                                                              : cpi->common.MBs;
   assert(num_mbs > 0);
   if (i) avg_sr_coded_error /= i;
-#endif  // CONFIG_EXT_REFS || CONFIG_BGSPRITE
 
   // Should we use the alternate reference frame.
   if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
@@ -2948,24 +2687,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     rc->gfu_boost =
         calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
     rc->source_alt_ref_pending = 1;
-
-    // Test to see if multi arf is appropriate.
-    cpi->multi_arf_enabled =
-        (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
-         (zero_motion_accumulator < 0.995))
-            ? 1
-            : 0;
-#if CONFIG_BGSPRITE
-    if (non_zero_pcnt_second_ref_count) {
-      avg_pcnt_second_ref /= non_zero_pcnt_second_ref_count;
-    }
-
-    cpi->bgsprite_allowed = 1;
-    if (abs_mv_in_out_accumulator > 0.30 || decay_accumulator < 0.90 ||
-        avg_sr_coded_error / num_mbs < 20 || avg_pcnt_second_ref < 0.30) {
-      cpi->bgsprite_allowed = 0;
-    }
-#endif  // CONFIG_BGSPRITE
   } else {
     rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
     rc->source_alt_ref_pending = 0;
@@ -2973,7 +2694,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
   // Set the interval until the next gf.
   rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
-#if CONFIG_EXT_REFS
   if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
 
   // Disable extra altrefs and backward refs for "still" gf group:
@@ -2981,13 +2701,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   //   avg_sr_coded_error:      average of the SSE per pixel of each frame;
   //   avg_raw_err_stdev:       average of the standard deviation of (0,0)
   //                            motion error per block of each frame.
-  assert(num_mbs > 0);
   const int disable_bwd_extarf =
       (zero_motion_accumulator > MIN_ZERO_MOTION &&
        avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
        avg_raw_err_stdev < MAX_RAW_ERR_VAR);
 
-  if (disable_bwd_extarf) cpi->extra_arf_allowed = cpi->bwd_ref_allowed = 0;
+  if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
 
   if (!cpi->extra_arf_allowed) {
     cpi->num_extra_arfs = 0;
@@ -2998,15 +2717,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
   // Currently at maximum two extra ARFs' are allowed
   assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
-#endif  // CONFIG_EXT_REFS
 
   rc->frames_till_gf_update_due = rc->baseline_gf_interval;
 
-#if CONFIG_EXT_REFS
   rc->bipred_group_interval = BFG_INTERVAL;
   // The minimum bi-predictive frame group interval is 2.
   if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0;
-#endif  // CONFIG_EXT_REFS
 
   // Reset the file position.
   reset_fpf_position(twopass, start_pos);
@@ -3226,7 +2942,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // Clear the alt ref active flag and last group multi arf flags as they
   // can never be set for a key frame.
   rc->source_alt_ref_active = 0;
-  cpi->multi_arf_last_grp_enabled = 0;
 
   // KF is always a GF so clear frames till next gf counter.
   rc->frames_till_gf_update_due = 0;
@@ -3397,6 +3112,8 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // Work out how many bits to allocate for the key frame itself.
   kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
                                  twopass->kf_group_bits);
+  // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
+  //        kf_bits, twopass->kf_zeromotion_pct);
 
   // Work out the fraction of the kf group bits reserved for the inter frames
   // within the group after discounting the bits for the kf itself.
@@ -3433,17 +3150,9 @@ void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, int gf_frame_index) {
   int ref_fb_idx_prev[REF_FRAMES];
   int ref_fb_idx_curr[REF_FRAMES];
 
-  ref_fb_idx_prev[LAST_FRAME - LAST_FRAME] =
-      cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME];
-  ref_fb_idx_prev[LAST2_FRAME - LAST_FRAME] =
-      cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME];
-  ref_fb_idx_prev[LAST3_FRAME - LAST_FRAME] =
-      cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME];
-  ref_fb_idx_prev[GOLDEN_FRAME - LAST_FRAME] = cpi->gld_fb_idx;
-  ref_fb_idx_prev[BWDREF_FRAME - LAST_FRAME] = cpi->bwd_fb_idx;
-  ref_fb_idx_prev[ALTREF2_FRAME - LAST_FRAME] = cpi->alt2_fb_idx;
-  ref_fb_idx_prev[ALTREF_FRAME - LAST_FRAME] = cpi->alt_fb_idx;
-  ref_fb_idx_prev[REF_FRAMES - LAST_FRAME] = cpi->ext_fb_idx;
+  for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+    ref_fb_idx_prev[ref_frame] = cpi->ref_fb_idx[ref_frame];
+  }
 
   // Update map index for each reference frame
   for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) {
@@ -3451,17 +3160,9 @@ void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, int gf_frame_index) {
     ref_fb_idx_curr[ref_idx] = ref_fb_idx_prev[ref_frame - LAST_FRAME];
   }
 
-  cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] =
-      ref_fb_idx_curr[LAST_FRAME - LAST_FRAME];
-  cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] =
-      ref_fb_idx_curr[LAST2_FRAME - LAST_FRAME];
-  cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] =
-      ref_fb_idx_curr[LAST3_FRAME - LAST_FRAME];
-  cpi->gld_fb_idx = ref_fb_idx_curr[GOLDEN_FRAME - LAST_FRAME];
-  cpi->bwd_fb_idx = ref_fb_idx_curr[BWDREF_FRAME - LAST_FRAME];
-  cpi->alt2_fb_idx = ref_fb_idx_curr[ALTREF2_FRAME - LAST_FRAME];
-  cpi->alt_fb_idx = ref_fb_idx_curr[ALTREF_FRAME - LAST_FRAME];
-  cpi->ext_fb_idx = ref_fb_idx_curr[REF_FRAMES - LAST_FRAME];
+  for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+    cpi->ref_fb_idx[ref_frame] = ref_fb_idx_curr[ref_frame];
+  }
 }
 
 // Define the reference buffers that will be updated post encode.
@@ -3487,26 +3188,36 @@ static void configure_buffer_updates_16(AV1_COMP *cpi) {
   // Update refresh index
   switch (gf_group->refresh_idx[gf_group->index]) {
     case LAST_FRAME:
-      cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME];
+      cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST_FRAME - LAST_FRAME];
       break;
 
     case LAST2_FRAME:
-      cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME];
+      cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST2_FRAME - LAST_FRAME];
       break;
 
     case LAST3_FRAME:
-      cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME];
+      cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST3_FRAME - LAST_FRAME];
       break;
 
-    case GOLDEN_FRAME: cpi->refresh_fb_idx = cpi->gld_fb_idx; break;
+    case GOLDEN_FRAME:
+      cpi->refresh_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
+      break;
 
-    case BWDREF_FRAME: cpi->refresh_fb_idx = cpi->bwd_fb_idx; break;
+    case BWDREF_FRAME:
+      cpi->refresh_fb_idx = cpi->ref_fb_idx[BWDREF_FRAME - 1];
+      break;
 
-    case ALTREF2_FRAME: cpi->refresh_fb_idx = cpi->alt2_fb_idx; break;
+    case ALTREF2_FRAME:
+      cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF2_FRAME - 1];
+      break;
 
-    case ALTREF_FRAME: cpi->refresh_fb_idx = cpi->alt_fb_idx; break;
+    case ALTREF_FRAME:
+      cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
+      break;
 
-    case REF_FRAMES: cpi->refresh_fb_idx = cpi->ext_fb_idx; break;
+    case REF_FRAMES:
+      cpi->refresh_fb_idx = cpi->ref_fb_idx[REF_FRAMES - 1];
+      break;
 
     default: assert(0); break;
   }
@@ -3579,7 +3290,6 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
   // cpi->rc.is_$Source_Type to make this function as it is in the comment?
 
   cpi->rc.is_src_frame_alt_ref = 0;
-#if CONFIG_EXT_REFS
   cpi->rc.is_bwd_ref_frame = 0;
   cpi->rc.is_last_bipred_frame = 0;
   cpi->rc.is_bipred_frame = 0;
@@ -3592,22 +3302,21 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
     return;
   }
 #endif  // USE_GF16_MULTI_LAYER
-#endif  // CONFIG_EXT_REFS
 
   switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
-    case KF_UPDATE: cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 1;
-#if CONFIG_EXT_REFS
+    case KF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
       cpi->refresh_bwd_ref_frame = 1;
       cpi->refresh_alt2_ref_frame = 1;
-#endif  // CONFIG_EXT_REFS
       cpi->refresh_alt_ref_frame = 1;
       break;
 
-    case LF_UPDATE: cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0;
-#if CONFIG_EXT_REFS
+    case LF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
       cpi->refresh_bwd_ref_frame = 0;
       cpi->refresh_alt2_ref_frame = 0;
-#endif  // CONFIG_EXT_REFS
       cpi->refresh_alt_ref_frame = 0;
       break;
 
@@ -3616,35 +3325,30 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
       //               needed.
       cpi->refresh_last_frame = 1;
       cpi->refresh_golden_frame = 1;
-#if CONFIG_EXT_REFS
       cpi->refresh_bwd_ref_frame = 0;
       cpi->refresh_alt2_ref_frame = 0;
-#endif  // CONFIG_EXT_REFS
       cpi->refresh_alt_ref_frame = 0;
       break;
 
     case OVERLAY_UPDATE:
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 1;
-#if CONFIG_EXT_REFS
       cpi->refresh_bwd_ref_frame = 0;
       cpi->refresh_alt2_ref_frame = 0;
-#endif  // CONFIG_EXT_REFS
       cpi->refresh_alt_ref_frame = 0;
 
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
 
-    case ARF_UPDATE: cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0;
-#if CONFIG_EXT_REFS
+    case ARF_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
       // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
       cpi->refresh_bwd_ref_frame = 0;
       cpi->refresh_alt2_ref_frame = 0;
-#endif  // CONFIG_EXT_REFS
       cpi->refresh_alt_ref_frame = 1;
       break;
 
-#if CONFIG_EXT_REFS
     case BRF_UPDATE:
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 0;
@@ -3693,7 +3397,6 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
       cpi->refresh_alt2_ref_frame = 1;
       cpi->refresh_alt_ref_frame = 0;
       break;
-#endif  // CONFIG_EXT_REFS
 
     default: assert(0); break;
   }
@@ -3734,11 +3437,8 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
 
   // If this is an arf frame then we dont want to read the stats file or
   // advance the input pointer as we already have what we need.
-  if (gf_group->update_type[gf_group->index] == ARF_UPDATE
-#if CONFIG_EXT_REFS
-      || gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
-#endif  // CONFIG_EXT_REFS
-      ) {
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
     configure_buffer_updates(cpi);
     target_rate = gf_group->bit_allocation[gf_group->index];
     target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
@@ -3850,6 +3550,8 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
     // applied when combining MB error values for the frame.
     twopass->mb_av_energy =
         log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+    twopass->frame_avg_haar_energy =
+        log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0);
   }
 
   // Update the total stats remaining structure.
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
index 9ac542bf3..4ff0f73b0 100644
--- a/third_party/aom/av1/encoder/firstpass.h
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -42,7 +42,6 @@ typedef struct {
 } FIRSTPASS_MB_STATS;
 #endif
 
-#if CONFIG_EXT_REFS
 // Length of the bi-predictive frame group (BFG)
 // NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
 //       number of bi-predictive frames.
@@ -64,7 +63,6 @@ typedef struct {
 #define MAX_SR_CODED_ERROR 40
 #define MAX_RAW_ERR_VAR 2000
 #define MIN_MV_IN_OUT 0.4
-#endif  // CONFIG_EXT_REFS
 
 #define VLOW_MOTION_THRESHOLD 950
 
@@ -72,6 +70,7 @@ typedef struct {
   double frame;
   double weight;
   double intra_error;
+  double frame_avg_wavelet_energy;
   double coded_error;
   double sr_coded_error;
   double pcnt_inter;
@@ -91,10 +90,8 @@ typedef struct {
   double new_mv_count;
   double duration;
   double count;
-#if CONFIG_EXT_REFS || CONFIG_BGSPRITE
   // standard deviation for (0, 0) motion prediction error
   double raw_error_stdev;
-#endif  // CONFIG_EXT_REFS
 } FIRSTPASS_STATS;
 
 typedef enum {
@@ -103,16 +100,12 @@ typedef enum {
   GF_UPDATE = 2,
   ARF_UPDATE = 3,
   OVERLAY_UPDATE = 4,
-#if CONFIG_EXT_REFS
   BRF_UPDATE = 5,            // Backward Reference Frame
   LAST_BIPRED_UPDATE = 6,    // Last Bi-predictive Frame
   BIPRED_UPDATE = 7,         // Bi-predictive Frame, but not the last one
   INTNL_OVERLAY_UPDATE = 8,  // Internal Overlay Frame
   INTNL_ARF_UPDATE = 9,      // Internal Altref Frame (candidate for ALTREF2)
   FRAME_UPDATE_TYPES = 10
-#else   // !CONFIG_EXT_REFS
-  FRAME_UPDATE_TYPES = 5
-#endif  // CONFIG_EXT_REFS
 } FRAME_UPDATE_TYPE;
 
 #define FC_ANIMATION_THRESH 0.15
@@ -129,13 +122,11 @@ typedef struct {
   unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
-#if CONFIG_EXT_REFS
   unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char ref_fb_idx_map[(MAX_LAG_BUFFERS * 2) + 1][REF_FRAMES];
   unsigned char refresh_idx[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char refresh_flag[(MAX_LAG_BUFFERS * 2) + 1];
-#endif  // CONFIG_EXT_REFS
   int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
 } GF_GROUP;
 
@@ -153,6 +144,7 @@ typedef struct {
   double modified_error_max;
   double modified_error_left;
   double mb_av_energy;
+  double frame_avg_haar_energy;
 
 #if CONFIG_FP_MB_STATS
   uint8_t *frame_mb_stats_buf;
@@ -198,7 +190,6 @@ void av1_rc_get_second_pass_params(struct AV1_COMP *cpi);
 // Post encode update of the rate control parameters for 2-pass
 void av1_twopass_postencode_update(struct AV1_COMP *cpi);
 
-#if CONFIG_EXT_REFS
 #if USE_GF16_MULTI_LAYER
 void av1_ref_frame_map_idx_updates(struct AV1_COMP *cpi, int gf_frame_index);
 #endif  // USE_GF16_MULTI_LAYER
@@ -213,7 +204,6 @@ static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
   else
     return 0;
 }
-#endif  // CONFIG_EXT_REFS
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/encoder/generic_encoder.c b/third_party/aom/av1/encoder/generic_encoder.c
deleted file mode 100644
index a31bb9ef6..000000000
--- a/third_party/aom/av1/encoder/generic_encoder.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include <stdio.h>
-
-#include "aom_dsp/bitwriter.h"
-#include "av1/common/generic_code.h"
-#include "av1/common/odintrin.h"
-#include "pvq_encoder.h"
-
-/** Encodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts
- * the cdf accordingly.
- *
- * @param [in,out] w     multi-symbol entropy encoder
- * @param [in]     val   variable being encoded
- * @param [in,out] cdf   CDF of the variable (Q15)
- * @param [in]     n     number of values possible
- * @param [in,out] count number of symbols encoded with that cdf so far
- * @param [in]     rate  adaptation rate shift (smaller is faster)
- */
-void aom_encode_cdf_adapt_q15(aom_writer *w, int val, uint16_t *cdf, int n,
- int *count, int rate) {
-  int i;
-  if (*count == 0) {
-    /* On the first call, we normalize the cdf to (32768 - n). This should
-       eventually be moved to the state init, but for now it makes it much
-       easier to experiment and convert symbols to the Q15 adaptation.*/
-    int ft;
-    ft = cdf[n - 1];
-    for (i = 0; i < n; i++) {
-      cdf[i] = AOM_ICDF(cdf[i]*32768/ft);
-    }
-  }
-  aom_write_cdf(w, val, cdf, n);
-  aom_cdf_adapt_q15(val, cdf, n, count, rate);
-}
-
-/** Encodes a random variable using a "generic" model, assuming that the
- * distribution is one-sided (zero and up), has a single mode, and decays
- * exponentially past the model.
- *
- * @param [in,out] w     multi-symbol entropy encoder
- * @param [in,out] model generic probability model
- * @param [in]     x     variable being encoded
- * @param [in,out] ExQ16 expectation of x (adapted)
- * @param [in]     integration integration period of ExQ16 (leaky average over
- * 1<<integration samples)
- */
-void generic_encode(aom_writer *w, generic_encoder *model, int x,
- int *ex_q16, int integration) {
-  int lg_q1;
-  int shift;
-  int id;
-  uint16_t *cdf;
-  int xs;
-  lg_q1 = log_ex(*ex_q16);
-  OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG,
-   "%d %d", *ex_q16, lg_q1));
-  /* If expectation is too large, shift x to ensure that
-     all we have past xs=15 is the exponentially decaying tail
-     of the distribution */
-  shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
-  /* Choose the cdf to use: we have two per "octave" of ExQ16 */
-  id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
-  cdf = model->cdf[id];
-  xs = (x + (1 << shift >> 1)) >> shift;
-  aom_write_symbol_pvq(w, OD_MINI(15, xs), cdf, 16);
-  if (xs >= 15) {
-    int e;
-    unsigned decay;
-    /* Estimate decay based on the assumption that the distribution is close
-       to Laplacian for large values. We should probably have an adaptive
-       estimate instead. Note: The 2* is a kludge that's not fully understood
-       yet. */
-    OD_ASSERT(*ex_q16 < INT_MAX >> 1);
-    e = ((2**ex_q16 >> 8) + (1 << shift >> 1)) >> shift;
-    decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256)));
-    /* Encode the tail of the distribution assuming exponential decay. */
-    aom_laplace_encode_special(w, xs - 15, decay);
-  }
-  if (shift != 0) {
-    int special;
-    /* Because of the rounding, there's only half the number of possibilities
-       for xs=0. */
-    special = xs == 0;
-    if (shift - special > 0) {
-      aom_write_literal(w, x - (xs << shift) + (!special << (shift - 1)),
-       shift - special);
-    }
-  }
-  generic_model_update(ex_q16, x, integration);
-  OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG,
-   "enc: %d %d %d %d %d %x", *ex_q16, x, shift, id, xs, enc->rng));
-}
-
-/** Estimates the cost of encoding a value with generic_encode().
- *
- * @param [in,out] model generic probability model
- * @param [in]     x     variable being encoded
- * @param [in,out] ExQ16 expectation of x (adapted)
- * @return number of bits (approximation)
- */
-double generic_encode_cost(generic_encoder *model, int x, int *ex_q16) {
-  int lg_q1;
-  int shift;
-  int id;
-  uint16_t *cdf;
-  int xs;
-  int extra;
-  lg_q1 = log_ex(*ex_q16);
-  /* If expectation is too large, shift x to ensure that
-       all we have past xs=15 is the exponentially decaying tail
-       of the distribution */
-  shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
-  /* Choose the cdf to use: we have two per "octave" of ExQ16 */
-  id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
-  cdf = model->cdf[id];
-  xs = (x + (1 << shift >> 1)) >> shift;
-  extra = 0;
-  if (shift) extra = shift - (xs == 0);
-  xs = OD_MINI(15, xs);
-  /* Shortcut: assume it's going to cost 2 bits for the Laplace coder. */
-  if (xs == 15) extra += 2;
-  return
-      extra - OD_LOG2((double)(cdf[xs] - (xs == 0 ? 0 : cdf[xs - 1]))/cdf[15]);
-}
-
-/*Estimates the cost of encoding a value with a given CDF.*/
-double od_encode_cdf_cost(int val, uint16_t *cdf, int n) {
-  int total_prob;
-  int prev_prob;
-  double val_prob;
-  OD_ASSERT(n > 0);
-  total_prob = cdf[n - 1];
-  if (val == 0) {
-    prev_prob = 0;
-  }
-  else {
-    prev_prob = cdf[val - 1];
-  }
-  val_prob = (cdf[val] - prev_prob) / (double)total_prob;
-  return -OD_LOG2(val_prob);
-}
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
index 4d44e9a6f..f07d1bc00 100644
--- a/third_party/aom/av1/encoder/global_motion.c
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -32,12 +32,14 @@
 // Border over which to compute the global motion
 #define ERRORADV_BORDER 0
 
-#define ERRORADV_MAX_THRESH 0.995
-#define ERRORADV_COST_PRODUCT_THRESH 26000
+static const double erroradv_tr[] = { 0.75, 0.70, 0.65 };
+static const double erroradv_prod_tr[] = { 22000, 20000, 18000 };
 
-int is_enough_erroradvantage(double best_erroradvantage, int params_cost) {
-  return best_erroradvantage < ERRORADV_MAX_THRESH &&
-         best_erroradvantage * params_cost < ERRORADV_COST_PRODUCT_THRESH;
+int is_enough_erroradvantage(double best_erroradvantage, int params_cost,
+                             int erroradv_type) {
+  assert(erroradv_type < GM_ERRORADV_TR_TYPES);
+  return best_erroradvantage < erroradv_tr[erroradv_type] &&
+         best_erroradvantage * params_cost < erroradv_prod_tr[erroradv_type];
 }
 
 static void convert_to_params(const double *params, int32_t *model) {
@@ -76,6 +78,7 @@ static void convert_to_params(const double *params, int32_t *model) {
 void convert_model_to_params(const double *params, WarpedMotionParams *model) {
   convert_to_params(params, model->wmmat);
   model->wmtype = get_gmtype(model);
+  model->invalid = 0;
 }
 
 // Adds some offset to a global motion parameter and handles
@@ -110,32 +113,31 @@ static int32_t add_param_offset(int param_index, int32_t param_value,
 
 static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) {
   switch (wmtype) {
-    case IDENTITY: wm->wmmat[0] = 0; wm->wmmat[1] = 0;
+    case IDENTITY:
+      wm->wmmat[0] = 0;
+      wm->wmmat[1] = 0;
+      AOM_FALLTHROUGH_INTENDED;
     case TRANSLATION:
       wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS;
       wm->wmmat[3] = 0;
-    case ROTZOOM: wm->wmmat[4] = -wm->wmmat[3]; wm->wmmat[5] = wm->wmmat[2];
+      AOM_FALLTHROUGH_INTENDED;
+    case ROTZOOM:
+      wm->wmmat[4] = -wm->wmmat[3];
+      wm->wmmat[5] = wm->wmmat[2];
+      AOM_FALLTHROUGH_INTENDED;
     case AFFINE: wm->wmmat[6] = wm->wmmat[7] = 0; break;
-    case HORTRAPEZOID: wm->wmmat[6] = wm->wmmat[4] = 0; break;
-    case VERTRAPEZOID: wm->wmmat[7] = wm->wmmat[3] = 0; break;
-    case HOMOGRAPHY: break;
     default: assert(0);
   }
   wm->wmtype = wmtype;
 }
 
 int64_t refine_integerized_param(WarpedMotionParams *wm,
-                                 TransformationType wmtype,
-#if CONFIG_HIGHBITDEPTH
-                                 int use_hbd, int bd,
-#endif  // CONFIG_HIGHBITDEPTH
+                                 TransformationType wmtype, int use_hbd, int bd,
                                  uint8_t *ref, int r_width, int r_height,
                                  int r_stride, uint8_t *dst, int d_width,
                                  int d_height, int d_stride, int n_refinements,
                                  int64_t best_frame_error) {
-  static const int max_trans_model_params[TRANS_TYPES] = {
-    0, 2, 4, 6, 8, 8, 8
-  };
+  static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
   const int border = ERRORADV_BORDER;
   int i = 0, p;
   int n_params = max_trans_model_params[wmtype];
@@ -147,35 +149,26 @@ int64_t refine_integerized_param(WarpedMotionParams *wm,
   int32_t best_param;
 
   force_wmtype(wm, wmtype);
-  best_error = av1_warp_error(
-      wm,
-#if CONFIG_HIGHBITDEPTH
-      use_hbd, bd,
-#endif  // CONFIG_HIGHBITDEPTH
-      ref, r_width, r_height, r_stride, dst + border * d_stride + border,
-      border, border, d_width - 2 * border, d_height - 2 * border, d_stride, 0,
-      0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_frame_error);
+  best_error = av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+                              dst + border * d_stride + border, border, border,
+                              d_width - 2 * border, d_height - 2 * border,
+                              d_stride, 0, 0, best_frame_error);
   best_error = AOMMIN(best_error, best_frame_error);
   step = 1 << (n_refinements - 1);
   for (i = 0; i < n_refinements; i++, step >>= 1) {
     for (p = 0; p < n_params; ++p) {
       int step_dir = 0;
       // Skip searches for parameters that are forced to be 0
-      if (wmtype == HORTRAPEZOID && (p == 4 || p == 6)) continue;
-      if (wmtype == VERTRAPEZOID && (p == 3 || p == 7)) continue;
       param = param_mat + p;
       curr_param = *param;
       best_param = curr_param;
       // look to the left
       *param = add_param_offset(p, curr_param, -step);
-      step_error = av1_warp_error(
-          wm,
-#if CONFIG_HIGHBITDEPTH
-          use_hbd, bd,
-#endif  // CONFIG_HIGHBITDEPTH
-          ref, r_width, r_height, r_stride, dst + border * d_stride + border,
-          border, border, d_width - 2 * border, d_height - 2 * border, d_stride,
-          0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_error);
+      step_error =
+          av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+                         dst + border * d_stride + border, border, border,
+                         d_width - 2 * border, d_height - 2 * border, d_stride,
+                         0, 0, best_error);
       if (step_error < best_error) {
         best_error = step_error;
         best_param = *param;
@@ -184,14 +177,11 @@ int64_t refine_integerized_param(WarpedMotionParams *wm,
 
       // look to the right
       *param = add_param_offset(p, curr_param, step);
-      step_error = av1_warp_error(
-          wm,
-#if CONFIG_HIGHBITDEPTH
-          use_hbd, bd,
-#endif  // CONFIG_HIGHBITDEPTH
-          ref, r_width, r_height, r_stride, dst + border * d_stride + border,
-          border, border, d_width - 2 * border, d_height - 2 * border, d_stride,
-          0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_error);
+      step_error =
+          av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+                         dst + border * d_stride + border, border, border,
+                         d_width - 2 * border, d_height - 2 * border, d_stride,
+                         0, 0, best_error);
       if (step_error < best_error) {
         best_error = step_error;
         best_param = *param;
@@ -203,15 +193,11 @@ int64_t refine_integerized_param(WarpedMotionParams *wm,
       // for the biggest step size
       while (step_dir) {
         *param = add_param_offset(p, best_param, step * step_dir);
-        step_error = av1_warp_error(
-            wm,
-#if CONFIG_HIGHBITDEPTH
-            use_hbd, bd,
-#endif  // CONFIG_HIGHBITDEPTH
-            ref, r_width, r_height, r_stride, dst + border * d_stride + border,
-            border, border, d_width - 2 * border, d_height - 2 * border,
-            d_stride, 0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS,
-            best_error);
+        step_error =
+            av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+                           dst + border * d_stride + border, border, border,
+                           d_width - 2 * border, d_height - 2 * border,
+                           d_stride, 0, 0, best_error);
         if (step_error < best_error) {
           best_error = step_error;
           best_param = *param;
@@ -229,9 +215,6 @@ int64_t refine_integerized_param(WarpedMotionParams *wm,
 
 static INLINE RansacFunc get_ransac_type(TransformationType type) {
   switch (type) {
-    case HOMOGRAPHY: return ransac_homography;
-    case HORTRAPEZOID: return ransac_hortrapezoid;
-    case VERTRAPEZOID: return ransac_vertrapezoid;
     case AFFINE: return ransac_affine;
     case ROTZOOM: return ransac_rotzoom;
     case TRANSLATION: return ransac_translation;
@@ -239,7 +222,6 @@ static INLINE RansacFunc get_ransac_type(TransformationType type) {
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm,
                                         int bit_depth) {
   int i, j;
@@ -257,14 +239,13 @@ static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm,
   }
   return buf_8bit;
 }
-#endif
 
-int compute_global_motion_feature_based(
-    TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
-#if CONFIG_HIGHBITDEPTH
-    int bit_depth,
-#endif
-    int *num_inliers_by_motion, double *params_by_motion, int num_motions) {
+int compute_global_motion_feature_based(TransformationType type,
+                                        YV12_BUFFER_CONFIG *frm,
+                                        YV12_BUFFER_CONFIG *ref, int bit_depth,
+                                        int *num_inliers_by_motion,
+                                        double *params_by_motion,
+                                        int num_motions) {
   int i;
   int num_frm_corners, num_ref_corners;
   int num_correspondences;
@@ -274,7 +255,6 @@ int compute_global_motion_feature_based(
   unsigned char *ref_buffer = ref->y_buffer;
   RansacFunc ransac = get_ransac_type(type);
 
-#if CONFIG_HIGHBITDEPTH
   if (frm->flags & YV12_FLAG_HIGHBITDEPTH) {
     // The frame buffer is 16-bit, so we need to convert to 8 bits for the
     // following code. We cache the result until the frame is released.
@@ -283,7 +263,6 @@ int compute_global_motion_feature_based(
   if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
     ref_buffer = downconvert_frame(ref, bit_depth);
   }
-#endif
 
   // compute interest points in images using FAST features
   num_frm_corners = fast_corner_detect(frm_buffer, frm->y_width, frm->y_height,
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
index 7fca5327f..2c15753fd 100644
--- a/third_party/aom/av1/encoder/global_motion.h
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -24,16 +24,14 @@ extern "C" {
 
 void convert_model_to_params(const double *params, WarpedMotionParams *model);
 
-int is_enough_erroradvantage(double erroradv, int params_cost);
+int is_enough_erroradvantage(double best_erroradvantage, int params_cost,
+                             int erroradv_type);
 
 // Returns the av1_warp_error between "dst" and the result of applying the
 // motion params that result from fine-tuning "wm" to "ref". Note that "wm" is
 // modified in place.
 int64_t refine_integerized_param(WarpedMotionParams *wm,
-                                 TransformationType wmtype,
-#if CONFIG_HIGHBITDEPTH
-                                 int use_hbd, int bd,
-#endif  // CONFIG_HIGHBITDEPTH
+                                 TransformationType wmtype, int use_hbd, int bd,
                                  uint8_t *ref, int r_width, int r_height,
                                  int r_stride, uint8_t *dst, int d_width,
                                  int d_height, int d_stride, int n_refinements,
@@ -54,12 +52,12 @@ int64_t refine_integerized_param(WarpedMotionParams *wm,
   number of inlier feature points for each motion. Params for which the
   num_inliers entry is 0 should be ignored by the caller.
 */
-int compute_global_motion_feature_based(
-    TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
-#if CONFIG_HIGHBITDEPTH
-    int bit_depth,
-#endif
-    int *num_inliers_by_motion, double *params_by_motion, int num_motions);
+int compute_global_motion_feature_based(TransformationType type,
+                                        YV12_BUFFER_CONFIG *frm,
+                                        YV12_BUFFER_CONFIG *ref, int bit_depth,
+                                        int *num_inliers_by_motion,
+                                        double *params_by_motion,
+                                        int num_motions);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h
new file mode 100644
index 000000000..45632da9b
--- /dev/null
+++ b/third_party/aom/av1/encoder/grain_test_vectors.h
@@ -0,0 +1,781 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_GRAIN_TEST_VECTORS_H_
+#define AV1_GRAIN_TEST_VECTORS_H_
+
+/* Test vectors for emulation of different film grain types.
+ * Note that bit depth would be derived from the bitstream and
+ * not signaled in film grain metadata. The parameters are valid
+ * for any bit depth.
+ */
+static aom_film_grain_t film_grain_test_vectors[16] = {
+  /* Test 1 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      { { 16, 0 },
+        { 25, 136 },
+        { 33, 144 },
+        { 41, 160 },
+        { 48, 168 },
+        { 56, 136 },
+        { 67, 128 },
+        { 82, 144 },
+        { 97, 152 },
+        { 113, 144 },
+        { 128, 176 },
+        { 143, 168 },
+        { 158, 176 },
+        { 178, 184 } },
+      14 /* num_points_y */,
+      { { 16, 0 },
+        { 20, 64 },
+        { 28, 88 },
+        { 60, 104 },
+        { 90, 136 },
+        { 105, 160 },
+        { 134, 168 },
+        { 168, 208 } },
+      8 /* num_cb_points */,
+      { { 16, 0 },
+        { 28, 96 },
+        { 56, 80 },
+        { 66, 96 },
+        { 80, 104 },
+        { 108, 96 },
+        { 122, 112 },
+        { 137, 112 },
+        { 169, 176 } },
+      9 /* num_cr_points */,
+      11 /* scaling_shift */,
+      2 /* ar_coeff_lag */,
+      { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+      { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+      { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+      8 /* ar_coeff_shift */,
+      247 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      18 /* cb_offset */,
+      229 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      54 /* cr_offset */,
+      0 /* overlap_flag */,
+      1 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /* chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 2 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      { { 0, 96 }, { 255, 96 } },
+      2 /* num_points_y */,
+      { { 0, 64 }, { 255, 64 } },
+      2 /* num_cb_points */,
+      { { 0, 64 }, { 255, 64 } },
+      2 /* num_cr_points */,
+      11 /* scaling_shift */,
+      3 /* ar_coeff_lag */,
+      {
+          4, 1,   3, 0,   1,  -3, 8,  -3, 7,  -23, 1, -25,
+          0, -10, 6, -17, -4, 53, 36, 5,  -5, -17, 8, 66,
+      },
+      {
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0,
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+      },
+      {
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0,
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+      },
+      7 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      0 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 3 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      { { 0, 192 }, { 255, 192 } },
+      2 /* num_points_y */,
+      { { 0, 128 }, { 255, 128 } },
+      2 /* num_cb_points */,
+      { { 0, 128 }, { 255, 128 } },
+      2 /* num_cr_points */,
+      11 /* scaling_shift */,
+      3 /* ar_coeff_lag */,
+      {
+          4, 1,   3, 0,   1,  -3, 8,  -3, 7,  -23, 1, -25,
+          0, -10, 6, -17, -4, 53, 36, 5,  -5, -17, 8, 66,
+      },
+      {
+          4,   -7, 2,  4,   12, -12, 5,   -8, 6,  8,   -19, -16, 19,
+          -10, -2, 17, -42, 58, -2,  -13, 9,  14, -36, 67,  0,
+      },
+      {
+          4,   -7, 2,  4,   12, -12, 5,   -8, 6,  8,   -19, -16, 19,
+          -10, -2, 17, -42, 58, -2,  -13, 9,  14, -36, 67,  0,
+      },
+      7 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      1 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      1 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 4 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      {
+          { 16, 0 },
+          { 24, 137 },
+          { 53, 146 },
+          { 63, 155 },
+          { 78, 155 },
+          { 107, 150 },
+          { 122, 147 },
+          { 136, 147 },
+          { 166, 153 },
+      },
+      9 /* num_points_y */,
+      {
+          { 16, 0 },
+          { 20, 72 },
+          { 27, 82 },
+          { 33, 91 },
+          { 69, 121 },
+          { 95, 143 },
+          { 108, 154 },
+          { 134, 169 },
+          { 147, 177 },
+      },
+      9 /* num_cb_points */,
+      {
+          { 16, 0 },
+          { 24, 95 },
+          { 54, 93 },
+          { 65, 94 },
+          { 79, 98 },
+          { 109, 107 },
+          { 124, 119 },
+          { 139, 136 },
+          { 169, 170 },
+      },
+      9 /* num_cr_points */,
+      11 /* scaling_shift */,
+      3 /* ar_coeff_lag */,
+      {
+          7,  -9,  2, 4,   7, -12, 7,  -18, 18, -30, -27, -42,
+          13, -20, 7, -18, 6, 107, 55, -2,  -4, -9,  -22, 113,
+      },
+      {
+          -3, -1, -4,  3,   -6,  -2,  3,  1,  -4, -10, -10, -5, -5,
+          -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66,  0,
+      },
+      {
+          0,  4, -3, 13,  0,  1,   -3, 0,  -3, -10, -68, -4, -2,
+          -5, 2, -3, -20, 62, -31, 0,  -4, -1, -8,  -29, 0,
+      },
+      8 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      0 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 5 */
+  {
+      1 /* apply_grain */,
+      0 /* update_parameters */,
+      { { 0, 64 }, { 255, 64 } },
+      2 /* num_points_y */,
+      {
+          { 0, 96 },
+          { 32, 90 },
+          { 64, 83 },
+          { 96, 76 },
+          { 128, 68 },
+          { 159, 59 },
+          { 191, 48 },
+          { 223, 34 },
+          { 255, 0 },
+      },
+      9 /* num_cb_points */,
+      {
+          { 0, 0 },
+          { 32, 34 },
+          { 64, 48 },
+          { 96, 59 },
+          { 128, 68 },
+          { 159, 76 },
+          { 191, 83 },
+          { 223, 90 },
+          { 255, 96 },
+      },
+      9 /* num_cr_points */,
+      11 /* scaling_shift */,
+      3 /* ar_coeff_lag */,
+      {
+          4, 1,   3, 0,   1,  -3, 8,  -3, 7,  -23, 1, -25,
+          0, -10, 6, -17, -4, 53, 36, 5,  -5, -17, 8, 66,
+      },
+      {
+          -2, 2,  -5, 7,   -6, 4,   -2, -1, 1,  -2,  0,  -2, 2,
+          -3, -5, 13, -13, 6,  -14, 8,  -1, 18, -36, 58, 0,
+      },
+      {
+          -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2,
+          0,  1,  0,  -7, 50, -8, -2, 2, 2,  2, -4,  0,
+      },
+      7 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      1 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      1063 /* random_seed */
+  },
+  /* Test 6 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      {
+          { 0, 96 },
+          { 20, 92 },
+          { 39, 88 },
+          { 59, 84 },
+          { 78, 80 },
+          { 98, 75 },
+          { 118, 70 },
+          { 137, 65 },
+          { 157, 60 },
+          { 177, 53 },
+          { 196, 46 },
+          { 216, 38 },
+          { 235, 27 },
+          { 255, 0 },
+      },
+      14 /* num_points_y */,
+      { { 0, 0 } },
+      0 /* num_cb_points */,
+      { { 0, 0 } },
+      0 /* num_cr_points */,
+      11 /* scaling_shift */,
+      3 /* ar_coeff_lag */,
+      {
+          4, 1,   3, 0,   1,  -3, 8,  -3, 7,  -23, 1, -25,
+          0, -10, 6, -17, -4, 53, 36, 5,  -5, -17, 8, 66,
+      },
+      {
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      },
+      {
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      },
+      7 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      1 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      2754 /* random_seed */
+  },
+  /* Test 7 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      {
+          { 0, 0 },
+          { 20, 27 },
+          { 39, 38 },
+          { 59, 46 },
+          { 78, 53 },
+          { 98, 60 },
+          { 118, 65 },
+          { 137, 70 },
+          { 157, 75 },
+          { 177, 80 },
+          { 196, 84 },
+          { 216, 88 },
+          { 235, 92 },
+          { 255, 96 },
+      },
+      14 /* num_points_y */,
+      { { 0, 0 }, { 255, 0 } },
+      2 /* num_cb_points */,
+      { { 0, 0 }, { 255, 0 } },
+      2 /* num_cr_points */,
+      11 /* scaling_shift */,
+      3 /* ar_coeff_lag */,
+      {
+          4, 1,   3, 0,   1,  -3, 8,  -3, 7,  -23, 1, -25,
+          0, -10, 6, -17, -4, 53, 36, 5,  -5, -17, 8, 66,
+      },
+      {
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      },
+      {
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      },
+      7 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      1 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 8 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      { { 0, 96 }, { 255, 96 } },
+      2 /* num_points_y */,
+      { { 0, 62 }, { 255, 62 } },
+      2 /* num_cb_points */,
+      { { 0, 62 }, { 255, 62 } },
+      2 /* num_cr_points */,
+      11 /* scaling_shift */,
+      3 /* ar_coeff_lag */,
+      {
+          4, 1,   3, 0,   1,  -3, 8,  -3, 7,  -23, 1, -25,
+          0, -10, 6, -17, -4, 53, 36, 5,  -5, -17, 8, 66,
+      },
+      {
+          0,  -2, -2, 8,   5,  -1, 1,   -1, 5,  16,  -33, -9,  6,
+          -1, -3, 10, -47, 63, 0,  -15, 3,  11, -42, 75,  -69,
+      },
+      {
+          1,  -1, -1, 9,   5,  0, 1,   -1, 5,  15,  -32, -10, 8,
+          -2, -4, 11, -46, 62, 1, -16, 3,  13, -43, 75,  -55,
+      },
+      7 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      0 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 9 */
+  {
+      1 /* apply_grain */,
+      0 /* update_parameters */,
+      { { 0, 48 }, { 255, 48 } },
+      2 /* num_points_y */,
+      { { 0, 32 }, { 255, 32 } },
+      2 /* num_cb_points */,
+      { { 0, 32 }, { 255, 32 } },
+      2 /* num_cr_points */,
+      10 /* scaling_shift */,
+      2 /* ar_coeff_lag */,
+      { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+      8 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      0 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 10 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      { { 0, 48 }, { 255, 48 } },
+      2 /* num_points_y */,
+      { { 0, 32 }, { 255, 32 } },
+      2 /* num_cb_points */,
+      { { 0, 32 }, { 255, 32 } },
+      2 /* num_cr_points */,
+      10 /* scaling_shift */,
+      2 /* ar_coeff_lag */,
+      { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+      { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+      { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+      8 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      0 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 11 */
+  {
+      1 /* apply_grain */,
+      0 /* update_parameters */,
+      { { 0, 32 }, { 255, 32 } },
+      2 /* num_points_y */,
+      {
+          { 0, 48 },
+          { 32, 45 },
+          { 64, 42 },
+          { 96, 38 },
+          { 128, 34 },
+          { 159, 29 },
+          { 191, 24 },
+          { 223, 17 },
+          { 255, 0 },
+      },
+      9 /* num_cb_points */,
+      {
+          { 0, 0 },
+          { 32, 17 },
+          { 64, 24 },
+          { 96, 29 },
+          { 128, 34 },
+          { 159, 38 },
+          { 191, 42 },
+          { 223, 45 },
+          { 255, 48 },
+      },
+      9 /* num_cr_points */,
+      10 /* scaling_shift */,
+      3 /* ar_coeff_lag */,
+      {
+          7,  -9,  2, 4,   7, -12, 7,  -18, 18, -30, -27, -42,
+          13, -20, 7, -18, 6, 107, 55, -2,  -4, -9,  -22, 113,
+      },
+      {
+          -3, -1, -4,  3,   -6,  -2,  3,  1,  -4, -10, -10, -5, -5,
+          -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66,  0,
+      },
+      {
+          0,  4, -3, 13,  0,  1,   -3, 0,  -3, -10, -68, -4, -2,
+          -5, 2, -3, -20, 62, -31, 0,  -4, -1, -8,  -29, 0,
+      },
+      8 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      1 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      1357 /* random_seed */
+  },
+  /* Test 12 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      {
+          { 16, 0 },
+          { 24, 49 },
+          { 39, 69 },
+          { 46, 84 },
+          { 53, 91 },
+          { 63, 100 },
+          { 78, 114 },
+          { 92, 134 },
+          { 164, 139 },
+      },
+      9 /* num_points_y */,
+      {
+          { 16, 0 },
+          { 20, 31 },
+          { 26, 42 },
+          { 33, 54 },
+          { 40, 65 },
+          { 47, 72 },
+          { 56, 85 },
+          { 84, 123 },
+          { 152, 157 },
+      },
+      9 /* num_cb_points */,
+      {
+          { 16, 0 },
+          { 25, 14 },
+          { 39, 33 },
+          { 47, 40 },
+          { 54, 47 },
+          { 64, 62 },
+          { 79, 76 },
+          { 94, 83 },
+          { 167, 101 },
+      },
+      9 /* num_cr_points */,
+      10 /* scaling_shift */,
+      2 /* ar_coeff_lag */,
+      { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+      { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+      { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+      8 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      0 /* overlap_flag */,
+      0 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 13 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      {
+          { 0, 48 },
+          { 20, 46 },
+          { 39, 44 },
+          { 59, 42 },
+          { 78, 40 },
+          { 98, 38 },
+          { 118, 35 },
+          { 137, 33 },
+          { 157, 30 },
+          { 177, 27 },
+          { 196, 23 },
+          { 216, 19 },
+          { 235, 13 },
+          { 255, 0 },
+      },
+      14 /* num_points_y */,
+      { { 0, 0 }, { 255, 0 } },
+      0 /* num_cb_points */,
+      { { 0, 0 }, { 255, 0 } },
+      0 /* num_cr_points */,
+      10 /* scaling_shift */,
+      2 /* ar_coeff_lag */,
+      { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      8 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      0 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 14 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      {
+          { 0, 0 },
+          { 20, 13 },
+          { 39, 19 },
+          { 59, 23 },
+          { 78, 27 },
+          { 98, 30 },
+          { 118, 33 },
+          { 137, 35 },
+          { 157, 38 },
+          { 177, 40 },
+          { 196, 42 },
+          { 216, 44 },
+          { 235, 46 },
+          { 255, 48 },
+      },
+      14 /* num_points_y */,
+      { { 0, 0 }, { 255, 0 } },
+      0 /* num_cb_points */,
+      { { 0, 0 }, { 255, 0 } },
+      0 /* num_cr_points */,
+      10 /* scaling_shift */,
+      2 /* ar_coeff_lag */,
+      { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      8 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      1 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 15 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      { { 0, 96 }, { 255, 96 } },
+      1 /* num_points_y */,
+      { { 0, 96 }, { 255, 96 } },
+      0 /* num_cb_points */,
+      { { 0, 96 }, { 255, 96 } },
+      0 /* num_cr_points */,
+      11 /* scaling_shift */,
+      2 /* ar_coeff_lag */,
+      { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 },
+      { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 },
+      { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 },
+      7 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      0 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      1 /*chroma_scaling_from_luma*/,
+      0 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+  /* Test 16 */
+  {
+      1 /* apply_grain */,
+      1 /* update_parameters */,
+      {
+          { 16, 0 },
+          { 58, 126 },
+          { 87, 120 },
+          { 97, 122 },
+          { 112, 125 },
+          { 126, 131 },
+          { 141, 139 },
+          { 199, 153 },
+      },
+      8 /* num_points_y */,
+      {
+          { 16, 0 },
+          { 59, 68 },
+          { 66, 76 },
+          { 73, 82 },
+          { 79, 85 },
+          { 86, 86 },
+          { 151, 95 },
+          { 192, 101 },
+      },
+      8 /* num_cb_points */,
+      {
+          { 16, 0 },
+          { 59, 64 },
+          { 89, 80 },
+          { 99, 86 },
+          { 114, 90 },
+          { 129, 93 },
+          { 144, 97 },
+          { 203, 85 },
+      },
+      8 /* num_cr_points */,
+      10 /* scaling_shift */,
+      3 /* ar_coeff_lag */,
+      {
+          4, 1,   3, 0,   1,  -3, 8,  -3, 7,  -23, 1, -25,
+          0, -10, 6, -17, -4, 53, 36, 5,  -5, -17, 8, 66,
+      },
+      {
+          0,  -2, -2, 8,   5,  -1, 1,   -1, 5,  16,  -33, -9,  6,
+          -1, -3, 10, -47, 63, 0,  -15, 3,  11, -42, 75,  -69,
+      },
+      {
+          1,  -1, -1, 9,   5,  0, 1,   -1, 5,  15,  -32, -10, 8,
+          -2, -4, 11, -46, 62, 1, -16, 3,  13, -43, 75,  -55,
+      },
+      7 /* ar_coeff_shift */,
+      128 /* cb_mult */,
+      192 /* cb_luma_mult */,
+      256 /* cb_offset */,
+      128 /* cr_mult */,
+      192 /* cr_luma_mult */,
+      256 /* cr_offset */,
+      1 /* overlap_flag */,
+      0 /* clip_to_restricted_range */,
+      8 /* bit_depth */,
+      0 /*chroma_scaling_from_luma*/,
+      2 /* grain_scale_shift*/,
+      45231 /* random_seed */
+  },
+};
+#endif  // AV1_GRAIN_TEST_VECTORS_H_
diff --git a/third_party/aom/av1/encoder/hash.c b/third_party/aom/av1/encoder/hash.c
index 89c5bd8a3..180115d9f 100644
--- a/third_party/aom/av1/encoder/hash.c
+++ b/third_party/aom/av1/encoder/hash.c
@@ -22,7 +22,7 @@ static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator,
   }
 }
 
-void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
+static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
   p_crc_calculator->remainder = 0;
 }
 
@@ -61,9 +61,65 @@ void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
   crc_calculator_init_table(p_crc_calculator);
 }
 
-uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
-                           int length) {
+uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length) {
+  CRC_CALCULATOR *p_crc_calculator = (CRC_CALCULATOR *)crc_calculator;
   crc_calculator_reset(p_crc_calculator);
   crc_calculator_process_data(p_crc_calculator, p, length);
   return crc_calculator_get_crc(p_crc_calculator);
 }
+
+/* CRC-32C (iSCSI) polynomial in reversed bit order. */
+#define POLY 0x82f63b78
+
+/* Construct table for software CRC-32C calculation. */
+void av1_crc32c_calculator_init(CRC32C *p_crc32c) {
+  uint32_t crc;
+
+  for (int n = 0; n < 256; n++) {
+    crc = n;
+    crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+    crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+    crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+    crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+    crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+    crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+    crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+    crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+    p_crc32c->table[0][n] = crc;
+  }
+  for (int n = 0; n < 256; n++) {
+    crc = p_crc32c->table[0][n];
+    for (int k = 1; k < 8; k++) {
+      crc = p_crc32c->table[0][crc & 0xff] ^ (crc >> 8);
+      p_crc32c->table[k][n] = crc;
+    }
+  }
+}
+
+/* Table-driven software version as a fall-back.  This is about 15 times slower
+ than using the hardware instructions.  This assumes little-endian integers,
+ as is the case on Intel processors that the assembler code here is for. */
+uint32_t av1_get_crc32c_value_c(CRC32C *p, uint8_t *buf, size_t len) {
+  const uint8_t *next = (const uint8_t *)(buf);
+  uint64_t crc;
+
+  crc = 0 ^ 0xffffffff;
+  while (len && ((uintptr_t)next & 7) != 0) {
+    crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+    len--;
+  }
+  while (len >= 8) {
+    crc ^= *(uint64_t *)next;
+    crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^
+          p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^
+          p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^
+          p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56];
+    next += 8;
+    len -= 8;
+  }
+  while (len) {
+    crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+    len--;
+  }
+  return (uint32_t)crc ^ 0xffffffff;
+}
diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h
index a0fd54fb6..8b6227540 100644
--- a/third_party/aom/av1/encoder/hash.h
+++ b/third_party/aom/av1/encoder/hash.h
@@ -12,7 +12,8 @@
 #ifndef AV1_ENCODER_HASH_H_
 #define AV1_ENCODER_HASH_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 #ifdef __cplusplus
@@ -31,9 +32,16 @@ typedef struct _crc_calculator {
 // calling av1_get_crc_value().
 void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
                              uint32_t truncPoly);
+uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length);
+
+// CRC32C: POLY = 0x82f63b78;
+typedef struct _CRC32C {
+  /* Table for a quadword-at-a-time software crc. */
+  uint32_t table[8][256];
+} CRC32C;
 
-uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
-                           int length);
+// init table for software version crc32c
+void av1_crc32c_calculator_init(CRC32C *p_crc32c);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
index 2378597ad..5a8f8cbba 100644
--- a/third_party/aom/av1/encoder/hash_motion.c
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -1,7 +1,9 @@
 #include <assert.h>
+
+#include "config/av1_rtcd.h"
+
 #include "av1/encoder/hash.h"
 #include "av1/encoder/hash_motion.h"
-#include "./av1_rtcd.h"
 
 static const int crc_bits = 16;
 static const int block_size_bits = 3;
@@ -16,7 +18,7 @@ static void hash_table_clear_all(hash_table *p_hash_table) {
   int max_addr = 1 << (crc_bits + block_size_bits);
   for (int i = 0; i < max_addr; i++) {
     if (p_hash_table->p_lookup_table[i] != NULL) {
-      vector_destroy(p_hash_table->p_lookup_table[i]);
+      aom_vector_destroy(p_hash_table->p_lookup_table[i]);
       aom_free(p_hash_table->p_lookup_table[i]);
       p_hash_table->p_lookup_table[i] = NULL;
     }
@@ -37,11 +39,30 @@ static void get_pixels_in_1D_char_array_by_block_2x2(uint8_t *y_src, int stride,
   }
 }
 
+static void get_pixels_in_1D_short_array_by_block_2x2(uint16_t *y_src,
+                                                      int stride,
+                                                      uint16_t *p_pixels_in1D) {
+  uint16_t *p_pel = y_src;
+  int index = 0;
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 2; j++) {
+      p_pixels_in1D[index++] = p_pel[j];
+    }
+    p_pel += stride;
+  }
+}
+
 static int is_block_2x2_row_same_value(uint8_t *p) {
   if (p[0] != p[1] || p[2] != p[3]) {
     return 0;
   }
+  return 1;
+}
 
+static int is_block16_2x2_row_same_value(uint16_t *p) {
+  if (p[0] != p[1] || p[2] != p[3]) {
+    return 0;
+  }
   return 1;
 }
 
@@ -49,7 +70,13 @@ static int is_block_2x2_col_same_value(uint8_t *p) {
   if ((p[0] != p[2]) || (p[1] != p[3])) {
     return 0;
   }
+  return 1;
+}
 
+static int is_block16_2x2_col_same_value(uint16_t *p) {
+  if ((p[0] != p[2]) || (p[1] != p[3])) {
+    return 0;
+  }
   return 1;
 }
 
@@ -63,6 +90,7 @@ static int hash_block_size_to_index(int block_size) {
     case 16: return 2;
     case 32: return 3;
     case 64: return 4;
+    case 128: return 5;
     default: return -1;
   }
 }
@@ -100,11 +128,13 @@ static void hash_table_add_to_table(hash_table *p_hash_table,
   if (p_hash_table->p_lookup_table[hash_value] == NULL) {
     p_hash_table->p_lookup_table[hash_value] =
         aom_malloc(sizeof(p_hash_table->p_lookup_table[0][0]));
-    vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
-                 sizeof(curr_block_hash[0]));
-    vector_push_back(p_hash_table->p_lookup_table[hash_value], curr_block_hash);
+    aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
+                     sizeof(curr_block_hash[0]));
+    aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+                         curr_block_hash);
   } else {
-    vector_push_back(p_hash_table->p_lookup_table[hash_value], curr_block_hash);
+    aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+                         curr_block_hash);
   }
 }
 
@@ -119,7 +149,7 @@ int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value) {
 Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
                                      uint32_t hash_value) {
   assert(av1_hash_table_count(p_hash_table, hash_value) > 0);
-  return vector_begin(p_hash_table->p_lookup_table[hash_value]);
+  return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]);
 }
 
 int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
@@ -127,8 +157,9 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
   if (p_hash_table->p_lookup_table[hash_value1] == NULL) {
     return 0;
   }
-  Iterator iterator = vector_begin(p_hash_table->p_lookup_table[hash_value1]);
-  Iterator last = vector_end(p_hash_table->p_lookup_table[hash_value1]);
+  Iterator iterator =
+      aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]);
+  Iterator last = aom_vector_end(p_hash_table->p_lookup_table[hash_value1]);
   for (; !iterator_equals(&iterator, &last); iterator_increment(&iterator)) {
     if ((*(block_hash *)iterator_get(&iterator)).hash_value2 == hash_value2) {
       return 1;
@@ -146,25 +177,45 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
   const int y_end = picture->y_crop_height - height + 1;
 
   const int length = width * 2;
-  uint8_t p[4];
-
-  int pos = 0;
-  for (int y_pos = 0; y_pos < y_end; y_pos++) {
-    for (int x_pos = 0; x_pos < x_end; x_pos++) {
-      get_pixels_in_1D_char_array_by_block_2x2(
-          picture->y_buffer + y_pos * picture->y_stride + x_pos,
-          picture->y_stride, p);
-      pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p);
-      pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
-
-      pic_block_hash[0][pos] =
-          av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0]));
-      pic_block_hash[1][pos] =
-          av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0]));
-
-      pos++;
+  if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t p[4];
+    int pos = 0;
+    for (int y_pos = 0; y_pos < y_end; y_pos++) {
+      for (int x_pos = 0; x_pos < x_end; x_pos++) {
+        get_pixels_in_1D_short_array_by_block_2x2(
+            CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride +
+                x_pos,
+            picture->y_stride, p);
+        pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p);
+        pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p);
+
+        pic_block_hash[0][pos] = av1_get_crc_value(
+            &crc_calculator1, (uint8_t *)p, length * sizeof(p[0]));
+        pic_block_hash[1][pos] = av1_get_crc_value(
+            &crc_calculator2, (uint8_t *)p, length * sizeof(p[0]));
+        pos++;
+      }
+      pos += width - 1;
+    }
+  } else {
+    uint8_t p[4];
+    int pos = 0;
+    for (int y_pos = 0; y_pos < y_end; y_pos++) {
+      for (int x_pos = 0; x_pos < x_end; x_pos++) {
+        get_pixels_in_1D_char_array_by_block_2x2(
+            picture->y_buffer + y_pos * picture->y_stride + x_pos,
+            picture->y_stride, p);
+        pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p);
+        pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
+
+        pic_block_hash[0][pos] =
+            av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0]));
+        pic_block_hash[1][pos] =
+            av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0]));
+        pos++;
+      }
+      pos += width - 1;
     }
-    pos += width - 1;
   }
 }
 
@@ -222,14 +273,14 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
   }
 
   if (block_size >= 4) {
-    const int size_minus1 = block_size - 1;
+    const int size_minus_1 = block_size - 1;
     pos = 0;
     for (int y_pos = 0; y_pos < y_end; y_pos++) {
       for (int x_pos = 0; x_pos < x_end; x_pos++) {
         dst_pic_block_same_info[2][pos] =
             (!dst_pic_block_same_info[0][pos] &&
              !dst_pic_block_same_info[1][pos]) ||
-            (((x_pos & size_minus1) == 0) && ((y_pos & size_minus1) == 0));
+            (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0));
         pos++;
       }
       pos += block_size - 1;
@@ -276,13 +327,25 @@ int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
   const int stride = picture->y_stride;
   const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
 
-  for (int i = 0; i < block_size; i++) {
-    for (int j = 1; j < block_size; j++) {
-      if (p[j] != p[0]) {
-        return 0;
+  if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
+    for (int i = 0; i < block_size; i++) {
+      for (int j = 1; j < block_size; j++) {
+        if (p16[j] != p16[0]) {
+          return 0;
+        }
       }
+      p16 += stride;
+    }
+  } else {
+    for (int i = 0; i < block_size; i++) {
+      for (int j = 1; j < block_size; j++) {
+        if (p[j] != p[0]) {
+          return 0;
+        }
+      }
+      p += stride;
     }
-    p += stride;
   }
 
   return 1;
@@ -293,26 +356,38 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
   const int stride = picture->y_stride;
   const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
 
-  for (int i = 0; i < block_size; i++) {
-    for (int j = 1; j < block_size; j++) {
-      if (p[j * stride + i] != p[i]) {
-        return 0;
+  if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
+    for (int i = 0; i < block_size; i++) {
+      for (int j = 1; j < block_size; j++) {
+        if (p16[j * stride + i] != p16[i]) {
+          return 0;
+        }
+      }
+    }
+  } else {
+    for (int i = 0; i < block_size; i++) {
+      for (int j = 1; j < block_size; j++) {
+        if (p[j * stride + i] != p[i]) {
+          return 0;
+        }
       }
     }
   }
-
   return 1;
 }
 
 // global buffer for hash value calculation of a block
 // used only in av1_get_block_hash_value()
-static uint32_t hash_value_buffer[2][2][1024];  // [first hash/second hash]
-                                                // [two buffers used ping-pong]
-                                                // [num of 2x2 blocks in 64x64]
+#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
+// [first hash/second hash]
+// [two buffers used ping-pong]
+// [num of 2x2 blocks in 128x128]
+static uint32_t hash_value_buffer[2][2][AOM_BUFFER_SIZE_FOR_BLOCK_HASH];
 
 void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
-                              uint32_t *hash_value1, uint32_t *hash_value2) {
-  uint8_t pixel_to_hash[4];
+                              uint32_t *hash_value1, uint32_t *hash_value2,
+                              int use_highbitdepth) {
   uint32_t to_hash[4];
   const int add_value = hash_block_size_to_index(block_size) << crc_bits;
   assert(add_value >= 0);
@@ -320,16 +395,34 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
 
   // 2x2 subblock hash values in current CU
   int sub_block_in_width = (block_size >> 1);
-  for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
-    for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
-      int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
-      get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
-                                               stride, pixel_to_hash);
-
-      hash_value_buffer[0][0][pos] = av1_get_crc_value(
-          &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
-      hash_value_buffer[1][0][pos] = av1_get_crc_value(
-          &crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
+  if (use_highbitdepth) {
+    uint16_t pixel_to_hash[4];
+    uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src);
+    for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+      for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+        int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+        get_pixels_in_1D_short_array_by_block_2x2(
+            y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
+        assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        hash_value_buffer[0][0][pos] = av1_get_crc_value(
+            &crc_calculator1, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash));
+        hash_value_buffer[1][0][pos] = av1_get_crc_value(
+            &crc_calculator2, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash));
+      }
+    }
+  } else {
+    uint8_t pixel_to_hash[4];
+    for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+      for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+        int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+        get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
+                                                 stride, pixel_to_hash);
+        assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        hash_value_buffer[0][0][pos] = av1_get_crc_value(
+            &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
+        hash_value_buffer[1][0][pos] = av1_get_crc_value(
+            &crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
+      }
     }
   }
 
@@ -349,6 +442,10 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
       for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) {
         int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1);
 
+        assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        assert(srcPos + src_sub_block_in_width + 1 <
+               AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
         to_hash[0] = hash_value_buffer[0][src_idx][srcPos];
         to_hash[1] = hash_value_buffer[0][src_idx][srcPos + 1];
         to_hash[2] =
@@ -378,3 +475,5 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
   *hash_value1 = (hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value;
   *hash_value2 = hash_value_buffer[1][dst_idx][0];
 }
+
+#undef AOM_BUFFER_SIZE_FOR_BLOCK_HASH
diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h
index 26e1ac46e..8deb92eb6 100644
--- a/third_party/aom/av1/encoder/hash_motion.h
+++ b/third_party/aom/av1/encoder/hash_motion.h
@@ -12,7 +12,8 @@
 #ifndef AV1_ENCODER_HASH_MOTION_H_
 #define AV1_ENCODER_HASH_MOTION_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_scale/yv12config.h"
 #include "third_party/vector/vector.h"
@@ -29,7 +30,9 @@ typedef struct _block_hash {
   uint32_t hash_value2;
 } block_hash;
 
-typedef struct _hash_table { Vector **p_lookup_table; } hash_table;
+typedef struct _hash_table {
+  Vector **p_lookup_table;
+} hash_table;
 
 void av1_hash_table_init(hash_table *p_hash_table);
 void av1_hash_table_destroy(hash_table *p_hash_table);
@@ -63,7 +66,8 @@ int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
 int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
                                  int block_size, int x_start, int y_start);
 void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
-                              uint32_t *hash_value1, uint32_t *hash_value2);
+                              uint32_t *hash_value1, uint32_t *hash_value2,
+                              int use_highbitdepth);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
index 6ddeb2b77..0922557d0 100644
--- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -9,228 +9,73 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "av1/common/idct.h"
 #include "av1/encoder/hybrid_fwd_txfm.h"
 
-#if CONFIG_CHROMA_2X2
-static void fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff,
-                         int diff_stride, TxfmParam *txfm_param) {
-  tran_high_t a1 = src_diff[0];
-  tran_high_t b1 = src_diff[1];
-  tran_high_t c1 = src_diff[diff_stride];
-  tran_high_t d1 = src_diff[1 + diff_stride];
-
-  tran_high_t a2 = a1 + c1;
-  tran_high_t b2 = b1 + d1;
-  tran_high_t c2 = a1 - c1;
-  tran_high_t d2 = b1 - d1;
-
-  a1 = a2 + b2;
-  b1 = a2 - b2;
-  c1 = c2 + d2;
-  d1 = c2 - d2;
-
-  coeff[0] = (tran_low_t)(4 * a1);
-  coeff[1] = (tran_low_t)(4 * b1);
-  coeff[2] = (tran_low_t)(4 * c1);
-  coeff[3] = (tran_low_t)(4 * d1);
-
-  (void)txfm_param;
-}
-#endif
-
-static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
-                         int diff_stride, TxfmParam *txfm_param) {
-  if (txfm_param->lossless) {
-    assert(txfm_param->tx_type == DCT_DCT);
-    av1_fwht4x4(src_diff, coeff, diff_stride);
-    return;
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+   pixel. */
+void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  int i;
+  tran_high_t a1, b1, c1, d1, e1;
+  const int16_t *ip_pass0 = input;
+  const tran_low_t *ip = NULL;
+  tran_low_t *op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip_pass0[0 * stride];
+    b1 = ip_pass0[1 * stride];
+    c1 = ip_pass0[2 * stride];
+    d1 = ip_pass0[3 * stride];
+
+    a1 += b1;
+    d1 = d1 - c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = (tran_low_t)a1;
+    op[4] = (tran_low_t)c1;
+    op[8] = (tran_low_t)d1;
+    op[12] = (tran_low_t)b1;
+
+    ip_pass0++;
+    op++;
   }
-
-#if CONFIG_LGT || CONFIG_DAALA_DCT4
-  // only C version has LGTs
-  av1_fht4x4_c(src_diff, coeff, diff_stride, txfm_param);
-#else
-  av1_fht4x4(src_diff, coeff, diff_stride, txfm_param);
-#endif
-}
-
-static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
-                         int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_fht4x8_c(src_diff, coeff, diff_stride, txfm_param);
-#else
-  av1_fht4x8(src_diff, coeff, diff_stride, txfm_param);
-#endif
-}
-
-static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
-                         int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_fht8x4_c(src_diff, coeff, diff_stride, txfm_param);
-#else
-  av1_fht8x4(src_diff, coeff, diff_stride, txfm_param);
-#endif
-}
-
-static void fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
-                          int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_fht8x16_c(src_diff, coeff, diff_stride, txfm_param);
-#else
-  av1_fht8x16(src_diff, coeff, diff_stride, txfm_param);
-#endif
-}
-
-static void fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
-                          int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_fht16x8_c(src_diff, coeff, diff_stride, txfm_param);
-#else
-  av1_fht16x8(src_diff, coeff, diff_stride, txfm_param);
-#endif
-}
-
-static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
-                           int diff_stride, TxfmParam *txfm_param) {
-  av1_fht16x32(src_diff, coeff, diff_stride, txfm_param);
-}
-
-static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
-                           int diff_stride, TxfmParam *txfm_param) {
-  av1_fht32x16(src_diff, coeff, diff_stride, txfm_param);
-}
-
-static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
-                         int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_LGT || CONFIG_DAALA_DCT8
-  av1_fht8x8_c(src_diff, coeff, diff_stride, txfm_param);
-#else
-  av1_fht8x8(src_diff, coeff, diff_stride, txfm_param);
-#endif
-}
-
-static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
-                           int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_DAALA_DCT16
-  av1_fht16x16_c(src_diff, coeff, diff_stride, txfm_param);
-#else
-  av1_fht16x16(src_diff, coeff, diff_stride, txfm_param);
-#endif  // CONFIG_DAALA_DCT16
-}
-
-static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
-                           int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_MRC_TX
-  // MRC_DCT currently only has a C implementation
-  if (txfm_param->tx_type == MRC_DCT) {
-    av1_fht32x32_c(src_diff, coeff, diff_stride, txfm_param);
-    return;
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0];
+    b1 = ip[1];
+    c1 = ip[2];
+    d1 = ip[3];
+
+    a1 += b1;
+    d1 -= c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+    op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+    op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+    op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+    ip += 4;
+    op += 4;
   }
-#endif  // CONFIG_MRC_TX
-  av1_fht32x32(src_diff, coeff, diff_stride, txfm_param);
-}
-
-#if CONFIG_TX64X64
-static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
-                           int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_EXT_TX
-  if (txfm_param->tx_type == IDTX)
-    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, 64, txfm_param->tx_type);
-  else
-#endif
-    av1_fht64x64(src_diff, coeff, diff_stride, txfm_param);
-}
-
-static void fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff,
-                           int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_EXT_TX
-  if (txfm_param->tx_type == IDTX)
-    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, 64, txfm_param->tx_type);
-  else
-#endif
-    av1_fht32x64(src_diff, coeff, diff_stride, txfm_param);
-}
-
-static void fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff,
-                           int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_EXT_TX
-  if (txfm_param->tx_type == IDTX)
-    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, 32, txfm_param->tx_type);
-  else
-#endif
-    av1_fht64x32(src_diff, coeff, diff_stride, txfm_param);
-}
-#endif  // CONFIG_TX64X64
-
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-static void fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff,
-                          int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_fht16x4_c(src_diff, coeff, diff_stride, txfm_param);
-#else
-  av1_fht16x4(src_diff, coeff, diff_stride, txfm_param);
-#endif
-}
-
-static void fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff,
-                          int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_fht4x16_c(src_diff, coeff, diff_stride, txfm_param);
-#else
-  av1_fht4x16(src_diff, coeff, diff_stride, txfm_param);
-#endif
-}
-
-static void fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff,
-                          int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_fht32x8_c(src_diff, coeff, diff_stride, txfm_param);
-#else
-  av1_fht32x8(src_diff, coeff, diff_stride, txfm_param);
-#endif
 }
 
-static void fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff,
-                          int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_LGT
-  av1_fht8x32_c(src_diff, coeff, diff_stride, txfm_param);
-#else
-  av1_fht8x32(src_diff, coeff, diff_stride, txfm_param);
-#endif
-}
-#endif
-
-#if CONFIG_CHROMA_2X2
-static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff,
-                                int diff_stride, TxfmParam *txfm_param) {
-  tran_high_t a1 = src_diff[0];
-  tran_high_t b1 = src_diff[1];
-  tran_high_t c1 = src_diff[diff_stride];
-  tran_high_t d1 = src_diff[1 + diff_stride];
-
-  tran_high_t a2 = a1 + c1;
-  tran_high_t b2 = b1 + d1;
-  tran_high_t c2 = a1 - c1;
-  tran_high_t d2 = b1 - d1;
-
-  a1 = a2 + b2;
-  b1 = a2 - b2;
-  c1 = c2 + d2;
-  d1 = c2 - d2;
-
-  coeff[0] = (tran_low_t)(4 * a1);
-  coeff[1] = (tran_low_t)(4 * b1);
-  coeff[2] = (tran_low_t)(4 * c1);
-  coeff[3] = (tran_low_t)(4 * d1);
-
-  (void)txfm_param;
+void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  av1_fwht4x4_c(input, output, stride);
 }
-#endif
 
 static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TxfmParam *txfm_param) {
@@ -243,22 +88,6 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
     return;
   }
   switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      // fallthrough intended
-      av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-      // fallthrough intended
-      av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
     // use the c version for anything including identity for now
     case V_DCT:
     case H_DCT:
@@ -267,11 +96,11 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
     case V_FLIPADST:
     case H_FLIPADST:
     case IDTX:
-      // fallthrough intended
       av1_fwd_txfm2d_4x4_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
       break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0);
+    default:
+      av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd);
+      break;
   }
 }
 
@@ -317,28 +146,40 @@ static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
                          txfm_param->bd);
 }
 
+static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  int32_t *dst_coeff = (int32_t *)coeff;
+  av1_fwd_txfm2d_16x4_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+                        txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  int32_t *dst_coeff = (int32_t *)coeff;
+  av1_fwd_txfm2d_4x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+                        txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  int32_t *dst_coeff = (int32_t *)coeff;
+  av1_fwd_txfm2d_32x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+                        txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  int32_t *dst_coeff = (int32_t *)coeff;
+  av1_fwd_txfm2d_8x32_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+                        txfm_param->bd);
+}
+
 static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
   switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      // fallthrough intended
-      av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-      // fallthrough intended
-      av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
     // use the c version for anything including identity for now
     case V_DCT:
     case H_DCT:
@@ -347,11 +188,11 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
     case V_FLIPADST:
     case H_FLIPADST:
     case IDTX:
-      // fallthrough intended
       av1_fwd_txfm2d_8x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
       break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0);
+    default:
+      av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+      break;
   }
 }
 
@@ -361,22 +202,6 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
   switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      // fallthrough intended
-      av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-      // fallthrough intended
-      av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
     // use the c version for anything including identity for now
     case V_DCT:
     case H_DCT:
@@ -385,11 +210,11 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
     case V_FLIPADST:
     case H_FLIPADST:
     case IDTX:
-      // fallthrough intended
       av1_fwd_txfm2d_16x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
       break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0);
+    default:
+      av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+      break;
   }
 }
 
@@ -399,22 +224,6 @@ static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
   switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      // fallthrough intended
-      av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-      // fallthrough intended
-      av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
     // use the c version for anything including identity for now
     case V_DCT:
     case H_DCT:
@@ -423,206 +232,72 @@ static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
     case V_FLIPADST:
     case H_FLIPADST:
     case IDTX:
-      // fallthrough intended
       av1_fwd_txfm2d_32x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
       break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0);
+    default:
+      av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
+      break;
   }
 }
 
-#if CONFIG_TX64X64
 static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
   int32_t *dst_coeff = (int32_t *)coeff;
-  const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-#if CONFIG_EXT_TX
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-      // TODO(sarahparker)
-      // I've deleted the 64x64 implementations that existed in lieu
-      // of adst, flipadst and identity for simplicity but will bring back
-      // in a later change. This shouldn't impact performance since
-      // DCT_DCT is the only extended type currently allowed for 64x64,
-      // as dictated by get_ext_tx_set_type in blockd.h.
-      av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
-      break;
-    case IDTX:
-      av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 32, 64, tx_type);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
+  av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
 }
 
 static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
   int32_t *dst_coeff = (int32_t *)coeff;
-  const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-#if CONFIG_EXT_TX
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-      // TODO(sarahparker)
-      // I've deleted the 64x64 implementations that existed in lieu
-      // of adst, flipadst and identity for simplicity but will bring back
-      // in a later change. This shouldn't impact performance since
-      // DCT_DCT is the only extended type currently allowed for 64x64,
-      // as dictated by get_ext_tx_set_type in blockd.h.
-      av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
-      break;
-    case IDTX:
-      av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 64, 32, tx_type);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
+  av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+
+static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  int32_t *dst_coeff = (int32_t *)coeff;
+  const int bd = txfm_param->bd;
+  av1_fwd_txfm2d_16x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+
+static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  int32_t *dst_coeff = (int32_t *)coeff;
+  const int bd = txfm_param->bd;
+  av1_fwd_txfm2d_64x16_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
 }
+
 static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
   int32_t *dst_coeff = (int32_t *)coeff;
-  const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
-  switch (tx_type) {
-    case DCT_DCT:
-      av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-#if CONFIG_EXT_TX
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-      // TODO(sarahparker)
-      // I've deleted the 64x64 implementations that existed in lieu
-      // of adst, flipadst and identity for simplicity but will bring back
-      // in a later change. This shouldn't impact performance since
-      // DCT_DCT is the only extended type currently allowed for 64x64,
-      // as dictated by get_ext_tx_set_type in blockd.h.
-      av1_fwd_txfm2d_64x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
-      break;
-    case IDTX:
-      av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 64, 64, tx_type);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
+  av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
 }
-#endif  // CONFIG_TX64X64
 
 void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
                   TxfmParam *txfm_param) {
-  const TX_SIZE tx_size = txfm_param->tx_size;
-#if CONFIG_LGT_FROM_PRED
-  if (txfm_param->use_lgt) {
-    // if use_lgt is 1, it will override tx_type
-    assert(is_lgt_allowed(txfm_param->mode, tx_size));
-    flgt2d_from_pred_c(src_diff, coeff, diff_stride, txfm_param);
-    return;
-  }
-#endif  // CONFIG_LGT_FROM_PRED
-  switch (tx_size) {
-#if CONFIG_TX64X64
-    case TX_64X64:
-      fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_32X64:
-      fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_64X32:
-      fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param);
-      break;
-#endif  // CONFIG_TX64X64
-    case TX_32X32:
-      fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_16X16:
-      fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_8X8: fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); break;
-    case TX_4X8: fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); break;
-    case TX_8X4: fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); break;
-    case TX_8X16:
-      fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_16X8:
-      fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_16X32:
-      fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_32X16:
-      fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_4X4: fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); break;
-#if CONFIG_CHROMA_2X2
-    case TX_2X2: fwd_txfm_2x2(src_diff, coeff, diff_stride, txfm_param); break;
-#endif
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    case TX_4X16:
-      fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_16X4:
-      fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_8X32:
-      fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param);
-      break;
-    case TX_32X8:
-      fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param);
-      break;
-#endif
-    default: assert(0); break;
-  }
+  if (txfm_param->bd == 8)
+    av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+  else
+    av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff,
+                          int diff_stride, TxfmParam *txfm_param) {
+  av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
 }
 
 void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
   const TX_SIZE tx_size = txfm_param->tx_size;
   switch (tx_size) {
-#if CONFIG_TX64X64
     case TX_64X64:
       highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param);
       break;
@@ -632,7 +307,12 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
     case TX_64X32:
       highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param);
       break;
-#endif  // CONFIG_TX64X64
+    case TX_16X64:
+      highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param);
+      break;
+    case TX_64X16:
+      highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param);
+      break;
     case TX_32X32:
       highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param);
       break;
@@ -663,11 +343,18 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
     case TX_4X4:
       highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param);
       break;
-#if CONFIG_CHROMA_2X2
-    case TX_2X2:
-      highbd_fwd_txfm_2x2(src_diff, coeff, diff_stride, txfm_param);
+    case TX_4X16:
+      highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param);
+      break;
+    case TX_16X4:
+      highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param);
+      break;
+    case TX_8X32:
+      highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param);
+      break;
+    case TX_32X8:
+      highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param);
       break;
-#endif
     default: assert(0); break;
   }
 }
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
index b25ffb8d8..6155b255a 100644
--- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
@@ -12,7 +12,7 @@
 #ifndef AV1_ENCODER_HYBRID_FWD_TXFM_H_
 #define AV1_ENCODER_HYBRID_FWD_TXFM_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/third_party/aom/av1/encoder/k_means_template.h b/third_party/aom/av1/encoder/k_means_template.h
index 3a433d9b5..9e526b88b 100644
--- a/third_party/aom/av1/encoder/k_means_template.h
+++ b/third_party/aom/av1/encoder/k_means_template.h
@@ -23,25 +23,23 @@
 #define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
 #define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
 
-static float RENAME(calc_dist)(const float *p1, const float *p2) {
-  float dist = 0;
-  int i;
-  for (i = 0; i < AV1_K_MEANS_DIM; ++i) {
-    const float diff = p1[i] - p2[i];
+static int RENAME(calc_dist)(const int *p1, const int *p2) {
+  int dist = 0;
+  for (int i = 0; i < AV1_K_MEANS_DIM; ++i) {
+    const int diff = p1[i] - p2[i];
     dist += diff * diff;
   }
   return dist;
 }
 
-void RENAME(av1_calc_indices)(const float *data, const float *centroids,
+void RENAME(av1_calc_indices)(const int *data, const int *centroids,
                               uint8_t *indices, int n, int k) {
-  int i, j;
-  for (i = 0; i < n; ++i) {
-    float min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids);
+  for (int i = 0; i < n; ++i) {
+    int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids);
     indices[i] = 0;
-    for (j = 1; j < k; ++j) {
-      const float this_dist = RENAME(calc_dist)(
-          data + i * AV1_K_MEANS_DIM, centroids + j * AV1_K_MEANS_DIM);
+    for (int j = 1; j < k; ++j) {
+      const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
+                                              centroids + j * AV1_K_MEANS_DIM);
       if (this_dist < min_dist) {
         min_dist = this_dist;
         indices[i] = j;
@@ -50,19 +48,16 @@ void RENAME(av1_calc_indices)(const float *data, const float *centroids,
   }
 }
 
-static void RENAME(calc_centroids)(const float *data, float *centroids,
+static void RENAME(calc_centroids)(const int *data, int *centroids,
                                    const uint8_t *indices, int n, int k) {
-  int i, j, index;
-  int count[PALETTE_MAX_SIZE];
+  int i, j;
+  int count[PALETTE_MAX_SIZE] = { 0 };
   unsigned int rand_state = (unsigned int)data[0];
-
   assert(n <= 32768);
-
-  memset(count, 0, sizeof(count[0]) * k);
   memset(centroids, 0, sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
 
   for (i = 0; i < n; ++i) {
-    index = indices[i];
+    const int index = indices[i];
     assert(index < k);
     ++count[index];
     for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
@@ -76,43 +71,35 @@ static void RENAME(calc_centroids)(const float *data, float *centroids,
              data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM,
              sizeof(centroids[0]) * AV1_K_MEANS_DIM);
     } else {
-      const float norm = 1.0f / count[i];
-      for (j = 0; j < AV1_K_MEANS_DIM; ++j)
-        centroids[i * AV1_K_MEANS_DIM + j] *= norm;
+      for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+        centroids[i * AV1_K_MEANS_DIM + j] =
+            DIVIDE_AND_ROUND(centroids[i * AV1_K_MEANS_DIM + j], count[i]);
+      }
     }
   }
-
-  // Round to nearest integers.
-  for (i = 0; i < k * AV1_K_MEANS_DIM; ++i) {
-    centroids[i] = roundf(centroids[i]);
-  }
 }
 
-static float RENAME(calc_total_dist)(const float *data, const float *centroids,
-                                     const uint8_t *indices, int n, int k) {
-  float dist = 0;
-  int i;
+static int64_t RENAME(calc_total_dist)(const int *data, const int *centroids,
+                                       const uint8_t *indices, int n, int k) {
+  int64_t dist = 0;
   (void)k;
-
-  for (i = 0; i < n; ++i)
+  for (int i = 0; i < n; ++i) {
     dist += RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
                               centroids + indices[i] * AV1_K_MEANS_DIM);
-
+  }
   return dist;
 }
 
-void RENAME(av1_k_means)(const float *data, float *centroids, uint8_t *indices,
+void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices,
                          int n, int k, int max_itr) {
-  int i;
-  float this_dist;
-  float pre_centroids[2 * PALETTE_MAX_SIZE];
+  int pre_centroids[2 * PALETTE_MAX_SIZE];
   uint8_t pre_indices[MAX_SB_SQUARE];
 
   RENAME(av1_calc_indices)(data, centroids, indices, n, k);
-  this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+  int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
 
-  for (i = 0; i < max_itr; ++i) {
-    const float pre_dist = this_dist;
+  for (int i = 0; i < max_itr; ++i) {
+    const int64_t pre_dist = this_dist;
     memcpy(pre_centroids, centroids,
            sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
     memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
@@ -132,6 +119,5 @@ void RENAME(av1_k_means)(const float *data, float *centroids, uint8_t *indices,
       break;
   }
 }
-
 #undef RENAME_
 #undef RENAME
diff --git a/third_party/aom/av1/encoder/laplace_encoder.c b/third_party/aom/av1/encoder/laplace_encoder.c
deleted file mode 100644
index 54ffc88fb..000000000
--- a/third_party/aom/av1/encoder/laplace_encoder.c
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include <stdio.h>
-
-#include "aom_dsp/bitwriter.h"
-#include "av1/common/odintrin.h"
-#include "av1/common/pvq.h"
-#include "pvq_encoder.h"
-
-static void aom_encode_pvq_split(aom_writer *w, od_pvq_codeword_ctx *adapt,
- int count, int sum, int ctx) {
-  int shift;
-  int rest;
-  int fctx;
-  if (sum == 0) return;
-  shift = OD_MAXI(0, OD_ILOG(sum) - 3);
-  if (shift) {
-    rest = count & ((1 << shift) - 1);
-    count >>= shift;
-    sum >>= shift;
-  }
-  fctx = 7*ctx + sum - 1;
-  aom_write_symbol_pvq(w, count, adapt->pvq_split_cdf[fctx], sum + 1);
-  if (shift) aom_write_literal(w, rest, shift);
-}
-
-void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt,
- const int *y, int n, int k, int level) {
-  int mid;
-  int i;
-  int count_right;
-  if (n <= 1 || k == 0) return;
-  if (k == 1 && n <= 16) {
-    int cdf_id;
-    int pos;
-    cdf_id = od_pvq_k1_ctx(n, level == 0);
-    for (pos = 0; !y[pos]; pos++);
-    OD_ASSERT(pos < n);
-    aom_write_symbol_pvq(w, pos, adapt->pvq_k1_cdf[cdf_id], n);
-  }
-  else {
-    mid = n >> 1;
-    count_right = k;
-    for (i = 0; i < mid; i++) count_right -= abs(y[i]);
-    aom_encode_pvq_split(w, adapt, count_right, k, od_pvq_size_ctx(n));
-    aom_encode_band_pvq_splits(w, adapt, y, mid, k - count_right, level + 1);
-    aom_encode_band_pvq_splits(w, adapt, y + mid, n - mid, count_right,
-     level + 1);
-  }
-}
-
-/** Encodes the tail of a Laplace-distributed variable, i.e. it doesn't
- * do anything special for the zero case.
- *
- * @param [in,out] enc     range encoder
- * @param [in]     x       variable to encode (has to be positive)
- * @param [in]     decay   decay factor of the distribution in Q8 format,
- * i.e. pdf ~= decay^x
- */
-void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay) {
-  int shift;
-  int xs;
-  int sym;
-  const uint16_t *cdf;
-  shift = 0;
-  /* We don't want a large decay value because that would require too many
-     symbols. */
-  while (decay > 235) {
-    decay = (decay*decay + 128) >> 8;
-    shift++;
-  }
-  decay = OD_MINI(decay, 254);
-  decay = OD_MAXI(decay, 2);
-  xs = x >> shift;
-  cdf = EXP_CDF_TABLE[(decay + 1) >> 1];
-  OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d", decay));
-  do {
-    sym = OD_MINI(xs, 15);
-    {
-      int i;
-      OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d %d %d\n", x, xs, shift,
-       sym, max));
-      for (i = 0; i < 16; i++) {
-        OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i]));
-      }
-      OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "\n"));
-    }
-    aom_write_cdf(w, sym, cdf, 16);
-    xs -= 15;
-  } while (sym >= 15);
-  if (shift) aom_write_literal(w, x & ((1 << shift) - 1), shift);
-}
diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c
index 591ca6152..1bf8ecbac 100644
--- a/third_party/aom/av1/encoder/lookahead.c
+++ b/third_party/aom/av1/encoder/lookahead.c
@@ -11,10 +11,9 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "av1/common/common.h"
-
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/lookahead.h"
@@ -42,14 +41,9 @@ void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
   }
 }
 
-struct lookahead_ctx *av1_lookahead_init(unsigned int width,
-                                         unsigned int height,
-                                         unsigned int subsampling_x,
-                                         unsigned int subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                                         int use_highbitdepth,
-#endif
-                                         unsigned int depth) {
+struct lookahead_ctx *av1_lookahead_init(
+    unsigned int width, unsigned int height, unsigned int subsampling_x,
+    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth) {
   struct lookahead_ctx *ctx = NULL;
 
   // Clamp the lookahead queue depth
@@ -68,10 +62,7 @@ struct lookahead_ctx *av1_lookahead_init(unsigned int width,
     if (!ctx->buf) goto bail;
     for (i = 0; i < depth; i++)
       if (aom_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x,
-                                 subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                                 use_highbitdepth,
-#endif
+                                 subsampling_y, use_highbitdepth,
                                  AOM_BORDER_IN_PIXELS, legacy_byte_alignment))
         goto bail;
   }
@@ -84,10 +75,7 @@ bail:
 #define USE_PARTIAL_COPY 0
 
 int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
-                       int64_t ts_start, int64_t ts_end,
-#if CONFIG_HIGHBITDEPTH
-                       int use_highbitdepth,
-#endif
+                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
                        aom_enc_frame_flags_t flags) {
   struct lookahead_entry *buf;
 #if USE_PARTIAL_COPY
@@ -160,10 +148,7 @@ int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
       YV12_BUFFER_CONFIG new_img;
       memset(&new_img, 0, sizeof(new_img));
       if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
-                                 subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                                 use_highbitdepth,
-#endif
+                                 subsampling_y, use_highbitdepth,
                                  AOM_BORDER_IN_PIXELS, 0))
         return 1;
       aom_free_frame_buffer(&buf->img);
diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h
index 19f75d7e4..3897c2a6a 100644
--- a/third_party/aom/av1/encoder/lookahead.h
+++ b/third_party/aom/av1/encoder/lookahead.h
@@ -44,14 +44,9 @@ struct lookahead_ctx {
  * The lookahead stage is a queue of frame buffers on which some analysis
  * may be done when buffers are enqueued.
  */
-struct lookahead_ctx *av1_lookahead_init(unsigned int width,
-                                         unsigned int height,
-                                         unsigned int subsampling_x,
-                                         unsigned int subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                                         int use_highbitdepth,
-#endif
-                                         unsigned int depth);
+struct lookahead_ctx *av1_lookahead_init(
+    unsigned int width, unsigned int height, unsigned int subsampling_x,
+    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth);
 
 /**\brief Destroys the lookahead stage
  */
@@ -73,10 +68,7 @@ void av1_lookahead_destroy(struct lookahead_ctx *ctx);
  * \param[in] active_map  Map that specifies which macroblock is active
  */
 int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
-                       int64_t ts_start, int64_t ts_end,
-#if CONFIG_HIGHBITDEPTH
-                       int use_highbitdepth,
-#endif
+                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
                        aom_enc_frame_flags_t flags);
 
 /**\brief Get the next source buffer to encode
diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c
index 7d2510af9..472173634 100644
--- a/third_party/aom/av1/encoder/mbgraph.c
+++ b/third_party/aom/av1/encoder/mbgraph.c
@@ -11,8 +11,8 @@
 
 #include <limits.h>
 
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
@@ -47,32 +47,28 @@ static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
   av1_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
                  cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv);
 
-// Try sub-pixel MC
-// if (bestsme > error_thresh && bestsme < INT_MAX)
-#if CONFIG_AMVR
-  if (cpi->common.cur_frame_mv_precision_level == 1) {
+  // Try sub-pixel MC
+  // if (bestsme > error_thresh && bestsme < INT_MAX)
+  if (cpi->common.cur_frame_force_integer_mv == 1) {
     x->best_mv.as_mv.row *= 8;
     x->best_mv.as_mv.col *= 8;
   } else {
-#else
-  {
-#endif
     int distortion;
     unsigned int sse;
-    cpi->find_fractional_mv_step(x, ref_mv, cpi->common.allow_high_precision_mv,
-                                 x->errorperbit, &v_fn_ptr, 0,
-                                 mv_sf->subpel_iters_per_step,
-                                 cond_cost_list(cpi, cost_list), NULL, NULL,
-                                 &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0);
+    cpi->find_fractional_mv_step(
+        x, &cpi->common, mb_row, mb_col, ref_mv,
+        cpi->common.allow_high_precision_mv, x->errorperbit, &v_fn_ptr, 0,
+        mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL,
+        NULL, &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0);
   }
 
-  if (has_second_ref(&xd->mi[0]->mbmi))
-    xd->mi[0]->mbmi.mode = NEW_NEWMV;
+  if (has_second_ref(xd->mi[0]))
+    xd->mi[0]->mode = NEW_NEWMV;
   else
-    xd->mi[0]->mbmi.mode = NEWMV;
+    xd->mi[0]->mode = NEWMV;
 
-  xd->mi[0]->mbmi.mv[0] = x->best_mv;
-  xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME;
+  xd->mi[0]->mv[0] = x->best_mv;
+  xd->mi[0]->ref_frame[1] = NONE_FRAME;
 
   av1_build_inter_predictors_sby(&cpi->common, xd, mb_row, mb_col, NULL,
                                  BLOCK_16X16);
@@ -108,7 +104,7 @@ static int do_16x16_motion_search(AV1_COMP *cpi, const MV *ref_mv, int mb_row,
   // If the current best reference mv is not centered on 0,0 then do a 0,0
   // based search as well.
   if (ref_mv->row != 0 || ref_mv->col != 0) {
-    MV zero_ref_mv = { 0, 0 };
+    MV zero_ref_mv = kZeroMv;
 
     tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, mb_row, mb_col);
     if (tmp_err < err) {
@@ -144,14 +140,14 @@ static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) {
 
   // calculate SATD for each intra prediction mode;
   // we're intentionally not doing 4x4, we just want a rough estimate
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+  for (mode = DC_PRED; mode <= PAETH_PRED; mode++) {
     unsigned int err;
 
-    xd->mi[0]->mbmi.mode = mode;
-    av1_predict_intra_block(cm, xd, 16, 16, BLOCK_16X16, mode,
-                            x->plane[0].src.buf, x->plane[0].src.stride,
-                            xd->plane[0].dst.buf, xd->plane[0].dst.stride, 0, 0,
-                            0);
+    xd->mi[0]->mode = mode;
+    av1_predict_intra_block(cm, xd, 16, 16, TX_16X16, mode, 0, 0,
+                            FILTER_INTRA_MODES, x->plane[0].src.buf,
+                            x->plane[0].src.stride, xd->plane[0].dst.buf,
+                            xd->plane[0].dst.stride, 0, 0, 0);
     err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                        xd->plane[0].dst.buf, xd->plane[0].dst.stride);
 
@@ -231,8 +227,8 @@ static void update_mbgraph_frame_stats(AV1_COMP *cpi,
 
   int mb_col, mb_row, offset = 0;
   int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
-  MV gld_top_mv = { 0, 0 };
-  MODE_INFO mi_local;
+  MV gld_top_mv = kZeroMv;
+  MB_MODE_INFO mi_local;
 
   av1_zero(mi_local);
   // Set up limit values for motion vectors to prevent them extending outside
@@ -244,9 +240,9 @@ static void update_mbgraph_frame_stats(AV1_COMP *cpi,
   xd->plane[0].pre[0].stride = buf->y_stride;
   xd->plane[1].dst.stride = buf->uv_stride;
   xd->mi[0] = &mi_local;
-  mi_local.mbmi.sb_type = BLOCK_16X16;
-  mi_local.mbmi.ref_frame[0] = LAST_FRAME;
-  mi_local.mbmi.ref_frame[1] = NONE_FRAME;
+  mi_local.sb_type = BLOCK_16X16;
+  mi_local.ref_frame[0] = LAST_FRAME;
+  mi_local.ref_frame[1] = NONE_FRAME;
 
   for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
     MV gld_left_mv = gld_top_mv;
diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h
index 758e2ad15..3e0a4fa9b 100644
--- a/third_party/aom/av1/encoder/mbgraph.h
+++ b/third_party/aom/av1/encoder/mbgraph.h
@@ -23,10 +23,12 @@ typedef struct {
       int_mv mv;
       PREDICTION_MODE mode;
     } m;
-  } ref[TOTAL_REFS_PER_FRAME];
+  } ref[REF_FRAMES];
 } MBGRAPH_MB_STATS;
 
-typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS;
+typedef struct {
+  MBGRAPH_MB_STATS *mb_stats;
+} MBGRAPH_FRAME_STATS;
 
 struct AV1_COMP;
 
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
index 6c8503da0..c4572a341 100644
--- a/third_party/aom/av1/encoder/mcomp.c
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -13,8 +13,8 @@
 #include <math.h>
 #include <stdio.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
@@ -22,9 +22,11 @@
 
 #include "av1/common/common.h"
 #include "av1/common/mvref_common.h"
+#include "av1/common/onyxc_int.h"
 #include "av1/common/reconinter.h"
 
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/encodemv.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/rdopt.h"
 
@@ -54,10 +56,9 @@ void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv) {
   if (mv_limits->row_max > row_max) mv_limits->row_max = row_max;
 }
 
-static void av1_set_subpel_mv_search_range(const MvLimits *mv_limits,
-                                           int *col_min, int *col_max,
-                                           int *row_min, int *row_max,
-                                           const MV *ref_mv) {
+static void set_subpel_mv_search_range(const MvLimits *mv_limits, int *col_min,
+                                       int *col_max, int *row_min, int *row_max,
+                                       const MV *ref_mv) {
   const int max_mv = MAX_FULL_PEL_VAL * 8;
   const int minc = AOMMAX(mv_limits->col_min * 8, ref_mv->col - max_mv);
   const int maxc = AOMMIN(mv_limits->col_max * 8, ref_mv->col + max_mv);
@@ -172,57 +173,64 @@ void av1_init3smotion_compensation(search_site_config *cfg, int stride) {
 static INLINE int sp(int x) { return x & 7; }
 
 static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
-  return &buf[(r >> 3) * stride + (c >> 3)];
+  const int offset = (r >> 3) * stride + (c >> 3);
+  return buf + offset;
 }
 
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c)                                             \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                 \
-    MV this_mv = { r, c };                                                \
-    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);    \
-    if (second_pred == NULL)                                              \
-      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r),  \
-                         src_address, src_stride, &sse);                  \
-    else if (mask)                                                        \
-      thismse = vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
-                          src_address, src_stride, second_pred, mask,     \
-                          mask_stride, invert_mask, &sse);                \
-    else                                                                  \
-      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
-                          src_address, src_stride, &sse, second_pred);    \
-    v += thismse;                                                         \
-    if (v < besterr) {                                                    \
-      besterr = v;                                                        \
-      br = r;                                                             \
-      bc = c;                                                             \
-      *distortion = thismse;                                              \
-      *sse1 = sse;                                                        \
-    }                                                                     \
-  } else {                                                                \
-    v = INT_MAX;                                                          \
+#define CHECK_BETTER(v, r, c)                                                \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
+    MV this_mv = { r, c };                                                   \
+    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);       \
+    if (second_pred == NULL) {                                               \
+      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r),     \
+                         src_address, src_stride, &sse);                     \
+    } else if (mask) {                                                       \
+      thismse = vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r),    \
+                          src_address, src_stride, second_pred, mask,        \
+                          mask_stride, invert_mask, &sse);                   \
+    } else {                                                                 \
+      if (xd->jcp_param.use_jnt_comp_avg)                                    \
+        thismse = vfp->jsvaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+                             src_address, src_stride, &sse, second_pred,     \
+                             &xd->jcp_param);                                \
+      else                                                                   \
+        thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r),  \
+                            src_address, src_stride, &sse, second_pred);     \
+    }                                                                        \
+    v += thismse;                                                            \
+    if (v < besterr) {                                                       \
+      besterr = v;                                                           \
+      br = r;                                                                \
+      bc = c;                                                                \
+      *distortion = thismse;                                                 \
+      *sse1 = sse;                                                           \
+    }                                                                        \
+  } else {                                                                   \
+    v = INT_MAX;                                                             \
   }
 
 #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
 
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER1(v, r, c)                                              \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
-    MV this_mv = { r, c };                                                  \
-    thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,        \
-                                   pre(y, y_stride, r, c), y_stride, sp(c), \
-                                   sp(r), second_pred, mask, mask_stride,   \
-                                   invert_mask, w, h, &sse);                \
-    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);      \
-    v += thismse;                                                           \
-    if (v < besterr) {                                                      \
-      besterr = v;                                                          \
-      br = r;                                                               \
-      bc = c;                                                               \
-      *distortion = thismse;                                                \
-      *sse1 = sse;                                                          \
-    }                                                                       \
-  } else {                                                                  \
-    v = INT_MAX;                                                            \
+#define CHECK_BETTER1(v, r, c)                                             \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                  \
+    MV this_mv = { r, c };                                                 \
+    thismse = upsampled_pref_error(                                        \
+        xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,    \
+        pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \
+        mask_stride, invert_mask, w, h, &sse);                             \
+    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);     \
+    v += thismse;                                                          \
+    if (v < besterr) {                                                     \
+      besterr = v;                                                         \
+      br = r;                                                              \
+      bc = c;                                                              \
+      *distortion = thismse;                                               \
+      *sse1 = sse;                                                         \
+    }                                                                      \
+  } else {                                                                 \
+    v = INT_MAX;                                                           \
   }
 
 #define FIRST_LEVEL_CHECKS                                       \
@@ -294,33 +302,33 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
     }                                              \
   }
 
-#define SETUP_SUBPEL_SEARCH                                                 \
-  const uint8_t *const src_address = x->plane[0].src.buf;                   \
-  const int src_stride = x->plane[0].src.stride;                            \
-  const MACROBLOCKD *xd = &x->e_mbd;                                        \
-  unsigned int besterr = INT_MAX;                                           \
-  unsigned int sse;                                                         \
-  unsigned int whichdir;                                                    \
-  int thismse;                                                              \
-  MV *bestmv = &x->best_mv.as_mv;                                           \
-  const unsigned int halfiters = iters_per_step;                            \
-  const unsigned int quarteriters = iters_per_step;                         \
-  const unsigned int eighthiters = iters_per_step;                          \
-  const int y_stride = xd->plane[0].pre[0].stride;                          \
-  const int offset = bestmv->row * y_stride + bestmv->col;                  \
-  const uint8_t *const y = xd->plane[0].pre[0].buf;                         \
-                                                                            \
-  int br = bestmv->row * 8;                                                 \
-  int bc = bestmv->col * 8;                                                 \
-  int hstep = 4;                                                            \
-  int minc, maxc, minr, maxr;                                               \
-  int tr = br;                                                              \
-  int tc = bc;                                                              \
-                                                                            \
-  av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \
-                                 ref_mv);                                   \
-                                                                            \
-  bestmv->row *= 8;                                                         \
+#define SETUP_SUBPEL_SEARCH                                             \
+  const uint8_t *const src_address = x->plane[0].src.buf;               \
+  const int src_stride = x->plane[0].src.stride;                        \
+  const MACROBLOCKD *xd = &x->e_mbd;                                    \
+  unsigned int besterr = INT_MAX;                                       \
+  unsigned int sse;                                                     \
+  unsigned int whichdir;                                                \
+  int thismse;                                                          \
+  MV *bestmv = &x->best_mv.as_mv;                                       \
+  const unsigned int halfiters = iters_per_step;                        \
+  const unsigned int quarteriters = iters_per_step;                     \
+  const unsigned int eighthiters = iters_per_step;                      \
+  const int y_stride = xd->plane[0].pre[0].stride;                      \
+  const int offset = bestmv->row * y_stride + bestmv->col;              \
+  const uint8_t *const y = xd->plane[0].pre[0].buf;                     \
+                                                                        \
+  int br = bestmv->row * 8;                                             \
+  int bc = bestmv->col * 8;                                             \
+  int hstep = 4;                                                        \
+  int minc, maxc, minr, maxr;                                           \
+  int tr = br;                                                          \
+  int tc = bc;                                                          \
+                                                                        \
+  set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \
+                             ref_mv);                                   \
+                                                                        \
+  bestmv->row *= 8;                                                     \
   bestmv->col *= 8;
 
 static unsigned int setup_center_error(
@@ -331,25 +339,34 @@ static unsigned int setup_center_error(
     int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost,
     int *mvcost[2], unsigned int *sse1, int *distortion) {
   unsigned int besterr;
-#if CONFIG_HIGHBITDEPTH
   if (second_pred != NULL) {
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
-      if (mask)
+      if (mask) {
         aom_highbd_comp_mask_pred(comp_pred16, second_pred, w, h, y + offset,
                                   y_stride, mask, mask_stride, invert_mask);
-      else
-        aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
-                                 y_stride);
+      } else {
+        if (xd->jcp_param.use_jnt_comp_avg)
+          aom_highbd_jnt_comp_avg_pred(comp_pred16, second_pred, w, h,
+                                       y + offset, y_stride, &xd->jcp_param);
+        else
+          aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+                                   y_stride);
+      }
       besterr =
           vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
     } else {
       DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
-      if (mask)
+      if (mask) {
         aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride,
                            mask, mask_stride, invert_mask);
-      else
-        aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+      } else {
+        if (xd->jcp_param.use_jnt_comp_avg)
+          aom_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
+                                y_stride, &xd->jcp_param);
+        else
+          aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+      }
       besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
     }
   } else {
@@ -357,22 +374,6 @@ static unsigned int setup_center_error(
   }
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-#else
-  (void)xd;
-  if (second_pred != NULL) {
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
-    if (mask)
-      aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride,
-                         mask, mask_stride, invert_mask);
-    else
-      aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
-    besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
-  } else {
-    besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
-  }
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-#endif  // CONFIG_HIGHBITDEPTH
   return besterr;
 }
 
@@ -401,11 +402,13 @@ static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
 }
 
 int av1_find_best_sub_pixel_tree_pruned_evenmore(
-    MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *ref_mv, int allow_hp, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
     unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
+    int mask_stride, int invert_mask, int w, int h,
+    int use_accurate_subpel_search) {
   SETUP_SUBPEL_SEARCH;
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                src_address, src_stride, y, y_stride,
@@ -418,7 +421,10 @@ int av1_find_best_sub_pixel_tree_pruned_evenmore(
   (void)allow_hp;
   (void)forced_stop;
   (void)hstep;
-  (void)use_upsampled_ref;
+  (void)use_accurate_subpel_search;
+  (void)cm;
+  (void)mi_row;
+  (void)mi_col;
 
   if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
       cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
@@ -468,13 +474,18 @@ int av1_find_best_sub_pixel_tree_pruned_evenmore(
 }
 
 int av1_find_best_sub_pixel_tree_pruned_more(
-    MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *ref_mv, int allow_hp, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
     unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
+    int mask_stride, int invert_mask, int w, int h,
+    int use_accurate_subpel_search) {
   SETUP_SUBPEL_SEARCH;
-  (void)use_upsampled_ref;
+  (void)use_accurate_subpel_search;
+  (void)cm;
+  (void)mi_row;
+  (void)mi_col;
 
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                src_address, src_stride, y, y_stride,
@@ -531,13 +542,18 @@ int av1_find_best_sub_pixel_tree_pruned_more(
 }
 
 int av1_find_best_sub_pixel_tree_pruned(
-    MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *ref_mv, int allow_hp, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
     unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
+    int mask_stride, int invert_mask, int w, int h,
+    int use_accurate_subpel_search) {
   SETUP_SUBPEL_SEARCH;
-  (void)use_upsampled_ref;
+  (void)use_accurate_subpel_search;
+  (void)cm;
+  (void)mi_row;
+  (void)mi_col;
 
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                src_address, src_stride, y, y_stride,
@@ -624,7 +640,8 @@ static const MV search_step_table[12] = {
 };
 /* clang-format on */
 
-static int upsampled_pref_error(const MACROBLOCKD *xd,
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                int mi_row, int mi_col, const MV *const mv,
                                 const aom_variance_fn_ptr_t *vfp,
                                 const uint8_t *const src, const int src_stride,
                                 const uint8_t *const y, int y_stride,
@@ -633,73 +650,105 @@ static int upsampled_pref_error(const MACROBLOCKD *xd,
                                 int mask_stride, int invert_mask, int w, int h,
                                 unsigned int *sse) {
   unsigned int besterr;
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
     if (second_pred != NULL) {
-      if (mask)
+      if (mask) {
         aom_highbd_comp_mask_upsampled_pred(
-            pred16, second_pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride,
-            mask, mask_stride, invert_mask, xd->bd);
-      else
-        aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h,
-                                           subpel_x_q3, subpel_y_q3, y,
-                                           y_stride, xd->bd);
+            xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, subpel_x_q3,
+            subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd);
+      } else {
+        if (xd->jcp_param.use_jnt_comp_avg)
+          aom_highbd_jnt_comp_avg_upsampled_pred(
+              xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h,
+              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param);
+        else
+          aom_highbd_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16,
+                                             second_pred, w, h, subpel_x_q3,
+                                             subpel_y_q3, y, y_stride, xd->bd);
+      }
     } else {
-      aom_highbd_upsampled_pred(pred16, w, h, subpel_x_q3, subpel_y_q3, y,
-                                y_stride, xd->bd);
+      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h,
+                                subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd);
     }
 
     besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse);
   } else {
     DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
-#else
-  DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
-  (void)xd;
-#endif  // CONFIG_HIGHBITDEPTH
     if (second_pred != NULL) {
-      if (mask)
-        aom_comp_mask_upsampled_pred(pred, second_pred, w, h, subpel_x_q3,
-                                     subpel_y_q3, y, y_stride, mask,
-                                     mask_stride, invert_mask);
-      else
-        aom_comp_avg_upsampled_pred(pred, second_pred, w, h, subpel_x_q3,
-                                    subpel_y_q3, y, y_stride);
+      if (mask) {
+        aom_comp_mask_upsampled_pred(
+            xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
+            subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask);
+      } else {
+        if (xd->jcp_param.use_jnt_comp_avg)
+          aom_jnt_comp_avg_upsampled_pred(
+              xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
+              subpel_y_q3, y, y_stride, &xd->jcp_param);
+        else
+          aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
+                                      second_pred, w, h, subpel_x_q3,
+                                      subpel_y_q3, y, y_stride);
+      }
     } else {
-      aom_upsampled_pred(pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride);
+      aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
+                         subpel_y_q3, y, y_stride);
     }
 
     besterr = vfp->vf(pred, w, src, src_stride, sse);
-#if CONFIG_HIGHBITDEPTH
   }
-#endif
   return besterr;
 }
 
 static unsigned int upsampled_setup_center_error(
-    const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
-    int error_per_bit, const aom_variance_fn_ptr_t *vfp,
-    const uint8_t *const src, const int src_stride, const uint8_t *const y,
-    int y_stride, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost,
-    int *mvcost[2], unsigned int *sse1, int *distortion) {
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *bestmv, const MV *ref_mv, int error_per_bit,
+    const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
+    const int src_stride, const uint8_t *const y, int y_stride,
+    const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
+    int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2],
+    unsigned int *sse1, int *distortion) {
   unsigned int besterr = upsampled_pref_error(
-      xd, vfp, src, src_stride, y + offset, y_stride, 0, 0, second_pred, mask,
-      mask_stride, invert_mask, w, h, sse1);
+      xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, y + offset,
+      y_stride, 0, 0, second_pred, mask, mask_stride, invert_mask, w, h, sse1);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
   return besterr;
 }
 
+// when use_accurate_subpel_search == 0
+static INLINE unsigned int estimate_upsampled_pref_error(
+    MACROBLOCKD *xd, const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
+    const int src_stride, const uint8_t *const pre, int y_stride,
+    int subpel_x_q3, int subpel_y_q3, const uint8_t *second_pred,
+    const uint8_t *mask, int mask_stride, int invert_mask, unsigned int *sse) {
+  if (second_pred == NULL) {
+    return vfp->svf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+                    sse);
+  } else if (mask) {
+    return vfp->msvf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+                     second_pred, mask, mask_stride, invert_mask, sse);
+  } else {
+    if (xd->jcp_param.use_jnt_comp_avg)
+      return vfp->jsvaf(pre, y_stride, subpel_x_q3, subpel_y_q3, src,
+                        src_stride, sse, second_pred, &xd->jcp_param);
+    else
+      return vfp->svaf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+                       sse, second_pred);
+  }
+}
+
 int av1_find_best_sub_pixel_tree(
-    MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *ref_mv, int allow_hp, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
     unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
+    int mask_stride, int invert_mask, int w, int h,
+    int use_accurate_subpel_search) {
   const uint8_t *const src_address = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
-  const MACROBLOCKD *xd = &x->e_mbd;
+  MACROBLOCKD *xd = &x->e_mbd;
   unsigned int besterr = INT_MAX;
   unsigned int sse;
   unsigned int thismse;
@@ -720,8 +769,7 @@ int av1_find_best_sub_pixel_tree(
   int kr, kc;
   int minc, maxc, minr, maxr;
 
-  av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
-                                 ref_mv);
+  set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv);
 
   if (!allow_hp)
     if (round == 3) round = 2;
@@ -729,12 +777,11 @@ int av1_find_best_sub_pixel_tree(
   bestmv->row *= 8;
   bestmv->col *= 8;
 
-  // use_upsampled_ref can be 0 or 1
-  if (use_upsampled_ref)
+  if (use_accurate_subpel_search)
     besterr = upsampled_setup_center_error(
-        xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
-        y_stride, second_pred, mask, mask_stride, invert_mask, w, h, offset,
-        mvjcost, mvcost, sse1, distortion);
+        xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address,
+        src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w,
+        h, offset, mvjcost, mvcost, sse1, distortion);
   else
     besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                  src_address, src_stride, y, y_stride,
@@ -751,23 +798,16 @@ int av1_find_best_sub_pixel_tree(
       if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
         MV this_mv = { tr, tc };
 
-        if (use_upsampled_ref) {
-          thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
-                                         pre(y, y_stride, tr, tc), y_stride,
-                                         sp(tc), sp(tr), second_pred, mask,
-                                         mask_stride, invert_mask, w, h, &sse);
+        if (use_accurate_subpel_search) {
+          thismse = upsampled_pref_error(
+              xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
+              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
+              mask, mask_stride, invert_mask, w, h, &sse);
         } else {
-          const uint8_t *const pre_address = pre(y, y_stride, tr, tc);
-          if (second_pred == NULL)
-            thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
-                               src_address, src_stride, &sse);
-          else if (mask)
-            thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
-                                src_address, src_stride, second_pred, mask,
-                                mask_stride, invert_mask, &sse);
-          else
-            thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
-                                src_address, src_stride, &sse, second_pred);
+          thismse = estimate_upsampled_pref_error(
+              xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
+              y_stride, sp(tc), sp(tr), second_pred, mask, mask_stride,
+              invert_mask, &sse);
         }
 
         cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
@@ -793,24 +833,16 @@ int av1_find_best_sub_pixel_tree(
     if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
       MV this_mv = { tr, tc };
 
-      if (use_upsampled_ref) {
-        thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
-                                       pre(y, y_stride, tr, tc), y_stride,
-                                       sp(tc), sp(tr), second_pred, mask,
-                                       mask_stride, invert_mask, w, h, &sse);
+      if (use_accurate_subpel_search) {
+        thismse = upsampled_pref_error(
+            xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
+            pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
+            mask, mask_stride, invert_mask, w, h, &sse);
       } else {
-        const uint8_t *const pre_address = pre(y, y_stride, tr, tc);
-
-        if (second_pred == NULL)
-          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
-                             src_stride, &sse);
-        else if (mask)
-          thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
-                              src_address, src_stride, second_pred, mask,
-                              mask_stride, invert_mask, &sse);
-        else
-          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
-                              src_address, src_stride, &sse, second_pred);
+        thismse = estimate_upsampled_pref_error(
+            xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
+            y_stride, sp(tc), sp(tr), second_pred, mask, mask_stride,
+            invert_mask, &sse);
       }
 
       cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
@@ -835,7 +867,7 @@ int av1_find_best_sub_pixel_tree(
     }
 
     if (iters_per_step > 1 && best_idx != -1) {
-      if (use_upsampled_ref) {
+      if (use_accurate_subpel_search) {
         SECOND_LEVEL_CHECKS_BEST(1);
       } else {
         SECOND_LEVEL_CHECKS_BEST(0);
@@ -861,63 +893,51 @@ int av1_find_best_sub_pixel_tree(
 #undef PRE
 #undef CHECK_BETTER
 
-#if CONFIG_WARPED_MOTION
 unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x,
                                      BLOCK_SIZE bsize, int mi_row, int mi_col,
                                      const MV *this_mv) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mi[0];
-  MB_MODE_INFO *mbmi = &mi->mbmi;
   const uint8_t *const src = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
   uint8_t *const dst = xd->plane[0].dst.buf;
   const int dst_stride = xd->plane[0].dst.stride;
   const aom_variance_fn_ptr_t *vfp = &cpi->fn_ptr[bsize];
-  const MV ref_mv = x->mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
+  const int_mv ref_mv = av1_get_ref_mv(x, 0);
   unsigned int mse;
   unsigned int sse;
 
   av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
   mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
-  mse +=
-      mv_err_cost(this_mv, &ref_mv, x->nmvjointcost, x->mvcost, x->errorperbit);
+  mse += mv_err_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost,
+                     x->errorperbit);
   return mse;
 }
 
 // Refine MV in a small range
-#if WARPED_MOTION_SORT_SAMPLES
 unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                  int *pts0, int *pts_inref0, int *pts_mv0,
+                                  int *pts0, int *pts_inref0,
                                   int total_samples) {
-#else
-unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
-                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                  int *pts, int *pts_inref) {
-#endif  // WARPED_MOTION_SORT_SAMPLES
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mi[0];
-  MB_MODE_INFO *mbmi = &mi->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 },
                             { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } };
-  const MV ref_mv = x->mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
+  const int_mv ref_mv = av1_get_ref_mv(x, 0);
   int16_t br = mbmi->mv[0].as_mv.row;
   int16_t bc = mbmi->mv[0].as_mv.col;
   int16_t *tr = &mbmi->mv[0].as_mv.row;
   int16_t *tc = &mbmi->mv[0].as_mv.col;
   WarpedMotionParams best_wm_params = mbmi->wm_params[0];
-#if WARPED_MOTION_SORT_SAMPLES
   int best_num_proj_ref = mbmi->num_proj_ref[0];
-#endif  // WARPED_MOTION_SORT_SAMPLES
   unsigned int bestmse;
   int minc, maxc, minr, maxr;
   const int start = cm->allow_high_precision_mv ? 0 : 4;
   int ite;
 
-  av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
-                                 &ref_mv);
+  set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
+                             &ref_mv.as_mv);
 
   // Calculate the center position's error
   assert(bc >= minc && bc <= maxc && br >= minr && br <= maxr);
@@ -937,15 +957,13 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
 
       if (*tc >= minc && *tc <= maxc && *tr >= minr && *tr <= maxr) {
         MV this_mv = { *tr, *tc };
-#if WARPED_MOTION_SORT_SAMPLES
         int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
 
         memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
         memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
         if (total_samples > 1)
           mbmi->num_proj_ref[0] =
-              sortSamples(pts_mv0, &this_mv, pts, pts_inref, total_samples);
-#endif  // WARPED_MOTION_SORT_SAMPLES
+              selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
 
         if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, *tr,
                              *tc, &mbmi->wm_params[0], mi_row, mi_col)) {
@@ -955,9 +973,7 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
           if (thismse < bestmse) {
             best_idx = idx;
             best_wm_params = mbmi->wm_params[0];
-#if WARPED_MOTION_SORT_SAMPLES
             best_num_proj_ref = mbmi->num_proj_ref[0];
-#endif  // WARPED_MOTION_SORT_SAMPLES
             bestmse = thismse;
           }
         }
@@ -975,12 +991,9 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
   *tr = br;
   *tc = bc;
   mbmi->wm_params[0] = best_wm_params;
-#if WARPED_MOTION_SORT_SAMPLES
   mbmi->num_proj_ref[0] = best_num_proj_ref;
-#endif  // WARPED_MOTION_SORT_SAMPLES
   return bestmse;
 }
-#endif  // CONFIG_WARPED_MOTION
 
 static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col,
                                int range) {
@@ -1386,11 +1399,19 @@ int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
   const MV mv = { best_mv->row * 8, best_mv->col * 8 };
   unsigned int unused;
 
-  return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
-                   what->buf, what->stride, &unused, second_pred) +
-         (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
-                                   x->errorperbit)
-                     : 0);
+  if (xd->jcp_param.use_jnt_comp_avg)
+    return vfp->jsvaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
+                      what->buf, what->stride, &unused, second_pred,
+                      &xd->jcp_param) +
+           (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+                                     x->errorperbit)
+                       : 0);
+  else
+    return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
+                     what->buf, what->stride, &unused, second_pred) +
+           (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+                                     x->errorperbit)
+                       : 0);
 }
 
 int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv,
@@ -1785,205 +1806,6 @@ int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg,
   return bestsad;
 }
 
-static int vector_match(int16_t *ref, int16_t *src, int bwl) {
-  int best_sad = INT_MAX;
-  int this_sad;
-  int d;
-  int center, offset = 0;
-  int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
-  for (d = 0; d <= bw; d += 16) {
-    this_sad = aom_vector_var(&ref[d], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      offset = d;
-    }
-  }
-  center = offset;
-
-  for (d = -8; d <= 8; d += 16) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw) continue;
-    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-  offset = center;
-
-  for (d = -4; d <= 4; d += 8) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw) continue;
-    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-  offset = center;
-
-  for (d = -2; d <= 2; d += 4) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw) continue;
-    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-  offset = center;
-
-  for (d = -1; d <= 1; d += 2) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw) continue;
-    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-
-  return (center - (bw >> 1));
-}
-
-static const MV search_pos[4] = {
-  { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
-};
-
-unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
-                                           BLOCK_SIZE bsize, int mi_row,
-                                           int mi_col) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
-  DECLARE_ALIGNED(16, int16_t, hbuf[2 * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, int16_t, vbuf[2 * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, int16_t, src_hbuf[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, int16_t, src_vbuf[MAX_SB_SQUARE]);
-  int idx;
-  const int src_stride = x->plane[0].src.stride;
-  const int ref_stride = xd->plane[0].pre[0].stride;
-  uint8_t const *ref_buf, *src_buf;
-  MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv;
-  unsigned int best_sad, tmp_sad, sad_arr[4];
-  MV this_mv;
-  const YV12_BUFFER_CONFIG *scaled_ref_frame =
-      av1_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
-
-  if (scaled_ref_frame) {
-    int i;
-    // Swap out the reference frame for a version that's been scaled to
-    // match the resolution of the current frame, allowing the existing
-    // motion search code to be used without additional modifications.
-    for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
-    av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
-  }
-
-#if CONFIG_HIGHBITDEPTH
-  {
-    unsigned int this_sad;
-    tmp_mv->row = 0;
-    tmp_mv->col = 0;
-    this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
-                                      xd->plane[0].pre[0].buf, ref_stride);
-
-    if (scaled_ref_frame) {
-      int i;
-      for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
-    }
-    return this_sad;
-  }
-#endif
-
-  const int bw = 4 << b_width_log2_lookup[bsize];
-  const int bh = 4 << b_height_log2_lookup[bsize];
-  const int search_width = bw << 1;
-  const int search_height = bh << 1;
-  const int norm_factor = 3 + (bw >> 5);
-
-  // Set up prediction 1-D reference set
-  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
-  for (idx = 0; idx < search_width; idx += 16) {
-    aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
-    ref_buf += 16;
-  }
-
-  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
-  for (idx = 0; idx < search_height; ++idx) {
-    vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor;
-    ref_buf += ref_stride;
-  }
-
-  // Set up src 1-D reference set
-  for (idx = 0; idx < bw; idx += 16) {
-    src_buf = x->plane[0].src.buf + idx;
-    aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
-  }
-
-  src_buf = x->plane[0].src.buf;
-  for (idx = 0; idx < bh; ++idx) {
-    src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor;
-    src_buf += src_stride;
-  }
-
-  // Find the best match per 1-D search
-  tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
-  tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
-
-  this_mv = *tmp_mv;
-  src_buf = x->plane[0].src.buf;
-  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
-  best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
-
-  {
-    const uint8_t *const pos[4] = {
-      ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride,
-    };
-
-    cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, sad_arr);
-  }
-
-  for (idx = 0; idx < 4; ++idx) {
-    if (sad_arr[idx] < best_sad) {
-      best_sad = sad_arr[idx];
-      tmp_mv->row = search_pos[idx].row + this_mv.row;
-      tmp_mv->col = search_pos[idx].col + this_mv.col;
-    }
-  }
-
-  if (sad_arr[0] < sad_arr[3])
-    this_mv.row -= 1;
-  else
-    this_mv.row += 1;
-
-  if (sad_arr[1] < sad_arr[2])
-    this_mv.col -= 1;
-  else
-    this_mv.col += 1;
-
-  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
-
-  tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
-  if (best_sad > tmp_sad) {
-    *tmp_mv = this_mv;
-    best_sad = tmp_sad;
-  }
-
-  tmp_mv->row *= 8;
-  tmp_mv->col *= 8;
-
-  if (scaled_ref_frame) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
-  }
-
-  return best_sad;
-}
-
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
@@ -2110,197 +1932,6 @@ static int full_pixel_exhaustive(const AV1_COMP *const cpi, MACROBLOCK *x,
   return bestsme;
 }
 
-int av1_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const aom_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
-  int r, c;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min);
-  const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max);
-  const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min);
-  const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max);
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  int best_sad =
-      fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
-                  in_what->stride) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-  *best_mv = *ref_mv;
-
-  for (r = row_min; r < row_max; ++r) {
-    for (c = col_min; c < col_max; ++c) {
-      const MV mv = { r, c };
-      const int sad =
-          fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
-                      in_what->stride) +
-          mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-      if (sad < best_sad) {
-        best_sad = sad;
-        *best_mv = mv;
-      }
-    }
-  }
-  return best_sad;
-}
-
-int av1_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const aom_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
-  int r;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min);
-  const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max);
-  const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min);
-  const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max);
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  unsigned int best_sad =
-      fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
-                  in_what->stride) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-  *best_mv = *ref_mv;
-
-  for (r = row_min; r < row_max; ++r) {
-    int c = col_min;
-    const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
-
-    if (fn_ptr->sdx3f != NULL) {
-      while ((c + 2) < col_max) {
-        int i;
-        DECLARE_ALIGNED(16, uint32_t, sads[3]);
-
-        fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
-                      sads);
-
-        for (i = 0; i < 3; ++i) {
-          unsigned int sad = sads[i];
-          if (sad < best_sad) {
-            const MV mv = { r, c };
-            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
-          }
-          ++check_here;
-          ++c;
-        }
-      }
-    }
-
-    while (c < col_max) {
-      unsigned int sad =
-          fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride);
-      if (sad < best_sad) {
-        const MV mv = { r, c };
-        sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-        if (sad < best_sad) {
-          best_sad = sad;
-          *best_mv = mv;
-        }
-      }
-      ++check_here;
-      ++c;
-    }
-  }
-
-  return best_sad;
-}
-
-int av1_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const aom_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
-  int r;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min);
-  const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max);
-  const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min);
-  const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max);
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  unsigned int best_sad =
-      fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
-                  in_what->stride) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-  *best_mv = *ref_mv;
-
-  for (r = row_min; r < row_max; ++r) {
-    int c = col_min;
-    const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
-
-    if (fn_ptr->sdx8f != NULL) {
-      while ((c + 7) < col_max) {
-        int i;
-        DECLARE_ALIGNED(16, uint32_t, sads[8]);
-
-        fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
-                      sads);
-
-        for (i = 0; i < 8; ++i) {
-          unsigned int sad = sads[i];
-          if (sad < best_sad) {
-            const MV mv = { r, c };
-            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
-          }
-          ++check_here;
-          ++c;
-        }
-      }
-    }
-
-    if (fn_ptr->sdx3f != NULL) {
-      while ((c + 2) < col_max) {
-        int i;
-        DECLARE_ALIGNED(16, uint32_t, sads[3]);
-
-        fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
-                      sads);
-
-        for (i = 0; i < 3; ++i) {
-          unsigned int sad = sads[i];
-          if (sad < best_sad) {
-            const MV mv = { r, c };
-            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
-          }
-          ++check_here;
-          ++c;
-        }
-      }
-    }
-
-    while (c < col_max) {
-      unsigned int sad =
-          fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride);
-      if (sad < best_sad) {
-        const MV mv = { r, c };
-        sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-        if (sad < best_sad) {
-          best_sad = sad;
-          *best_mv = mv;
-        }
-      }
-      ++check_here;
-      ++c;
-    }
-  }
-
-  return best_sad;
-}
-
 int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit,
                             int search_range,
                             const aom_variance_fn_ptr_t *fn_ptr,
@@ -2394,16 +2025,23 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
 
   clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max,
            x->mv_limits.row_min, x->mv_limits.row_max);
-  if (mask)
+  if (mask) {
     best_sad = fn_ptr->msdf(what->buf, what->stride,
                             get_buf_from_mv(in_what, best_mv), in_what->stride,
                             second_pred, mask, mask_stride, invert_mask) +
                mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
-  else
-    best_sad =
-        fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
-                     in_what->stride, second_pred) +
-        mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
+  } else {
+    if (xd->jcp_param.use_jnt_comp_avg)
+      best_sad = fn_ptr->jsdaf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, best_mv),
+                               in_what->stride, second_pred, &xd->jcp_param) +
+                 mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
+    else
+      best_sad = fn_ptr->sdaf(what->buf, what->stride,
+                              get_buf_from_mv(in_what, best_mv),
+                              in_what->stride, second_pred) +
+                 mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
+  }
 
   for (i = 0; i < search_range; ++i) {
     int best_site = -1;
@@ -2414,14 +2052,20 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
 
       if (is_mv_in(&x->mv_limits, &mv)) {
         unsigned int sad;
-        if (mask)
+        if (mask) {
           sad = fn_ptr->msdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &mv), in_what->stride,
                              second_pred, mask, mask_stride, invert_mask);
-        else
-          sad = fn_ptr->sdaf(what->buf, what->stride,
-                             get_buf_from_mv(in_what, &mv), in_what->stride,
-                             second_pred);
+        } else {
+          if (xd->jcp_param.use_jnt_comp_avg)
+            sad = fn_ptr->jsdaf(what->buf, what->stride,
+                                get_buf_from_mv(in_what, &mv), in_what->stride,
+                                second_pred, &xd->jcp_param);
+          else
+            sad = fn_ptr->sdaf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &mv), in_what->stride,
+                               second_pred);
+        }
         if (sad < best_sad) {
           sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
           if (sad < best_sad) {
@@ -2454,45 +2098,10 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
          (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref;
 }
 
-#if CONFIG_HASH_ME
-#define MAX_HASH_MV_TABLE_SIZE 5
-static void add_to_sort_table(block_hash block_hashes[MAX_HASH_MV_TABLE_SIZE],
-                              int costs[MAX_HASH_MV_TABLE_SIZE], int *existing,
-                              int max_size, block_hash curr_block,
-                              int curr_cost) {
-  if (*existing < max_size) {
-    block_hashes[*existing] = curr_block;
-    costs[*existing] = curr_cost;
-    (*existing)++;
-  } else {
-    int max_cost = 0;
-    int max_cost_idx = 0;
-    for (int i = 0; i < max_size; i++) {
-      if (costs[i] > max_cost) {
-        max_cost = costs[i];
-        max_cost_idx = i;
-      }
-    }
-
-    if (curr_cost < max_cost) {
-      block_hashes[max_cost_idx] = curr_block;
-      costs[max_cost_idx] = curr_cost;
-    }
-  }
-}
-#endif
-
-#if CONFIG_HASH_ME
 int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                           MV *mvp_full, int step_param, int error_per_bit,
                           int *cost_list, const MV *ref_mv, int var_max, int rd,
                           int x_pos, int y_pos, int intra) {
-#else
-int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                          MV *mvp_full, int step_param, int error_per_bit,
-                          int *cost_list, const MV *ref_mv, int var_max,
-                          int rd) {
-#endif
   const SPEED_FEATURES *const sf = &cpi->sf;
   const SEARCH_METHODS method = sf->mv.search_method;
   const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
@@ -2539,7 +2148,7 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
       if (is_exhaustive_allowed(cpi, x)) {
         int exhuastive_thr = sf->exhaustive_searches_thresh;
         exhuastive_thr >>=
-            10 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+            10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
 
         // Threshold variance for an exhaustive full search.
         if (var > exhuastive_thr) {
@@ -2556,44 +2165,37 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
         }
       }
       break;
-
-      break;
     default: assert(0 && "Invalid search method.");
   }
 
   if (method != NSTEP && rd && var < var_max)
     var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
 
-#if CONFIG_HASH_ME
   do {
-    if (!cpi->common.allow_screen_content_tools) {
-      break;
-    }
+    if (!av1_use_hash_me(&cpi->common)) break;
+
     // already single ME
     // get block size and original buffer of current block
     const int block_height = block_size_high[bsize];
     const int block_width = block_size_wide[bsize];
     if (block_height == block_width && x_pos >= 0 && y_pos >= 0) {
       if (block_width == 4 || block_width == 8 || block_width == 16 ||
-          block_width == 32 || block_width == 64) {
+          block_width == 32 || block_width == 64 || block_width == 128) {
         uint8_t *what = x->plane[0].src.buf;
         const int what_stride = x->plane[0].src.stride;
-        block_hash block_hashes[MAX_HASH_MV_TABLE_SIZE];
-        int costs[MAX_HASH_MV_TABLE_SIZE];
-        int existing = 0;
-        int i;
         uint32_t hash_value1, hash_value2;
         MV best_hash_mv;
         int best_hash_cost = INT_MAX;
 
         // for the hashMap
         hash_table *ref_frame_hash =
-            intra ? &cpi->common.cur_frame->hash_table
-                  : get_ref_frame_hash_map(cpi,
-                                           x->e_mbd.mi[0]->mbmi.ref_frame[0]);
+            intra
+                ? &cpi->common.cur_frame->hash_table
+                : av1_get_ref_frame_hash_map(cpi, x->e_mbd.mi[0]->ref_frame[0]);
 
-        av1_get_block_hash_value(what, what_stride, block_width, &hash_value1,
-                                 &hash_value2);
+        av1_get_block_hash_value(
+            what, what_stride, block_width, &hash_value1, &hash_value2,
+            x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
 
         const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
         // for intra, at lest one matching can be found, itself.
@@ -2603,44 +2205,31 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 
         Iterator iterator =
             av1_hash_get_first_iterator(ref_frame_hash, hash_value1);
-        for (i = 0; i < count; i++, iterator_increment(&iterator)) {
+        for (int i = 0; i < count; i++, iterator_increment(&iterator)) {
           block_hash ref_block_hash = *(block_hash *)(iterator_get(&iterator));
           if (hash_value2 == ref_block_hash.hash_value2) {
-            // for intra, make sure the prediction is from valid area
-            // not predict from current block.
-            // TODO(roger): check if the constrain is necessary
-            if (intra &&
-                ref_block_hash.y + block_height >
-                    ((y_pos >> MAX_SB_SIZE_LOG2) << MAX_SB_SIZE_LOG2) &&
-                ref_block_hash.x + block_width >
-                    ((x_pos >> MAX_SB_SIZE_LOG2) << MAX_SB_SIZE_LOG2)) {
-              continue;
+            // For intra, make sure the prediction is from valid area.
+            if (intra) {
+              const int mi_col = x_pos / MI_SIZE;
+              const int mi_row = y_pos / MI_SIZE;
+              const MV dv = { 8 * (ref_block_hash.y - y_pos),
+                              8 * (ref_block_hash.x - x_pos) };
+              if (!av1_is_dv_valid(dv, &cpi->common, &x->e_mbd, mi_row, mi_col,
+                                   bsize, cpi->common.seq_params.mib_size_log2))
+                continue;
+            }
+            MV hash_mv;
+            hash_mv.col = ref_block_hash.x - x_pos;
+            hash_mv.row = ref_block_hash.y - y_pos;
+            if (!is_mv_in(&x->mv_limits, &hash_mv)) continue;
+            const int refCost =
+                av1_get_mvpred_var(x, &hash_mv, ref_mv, fn_ptr, 1);
+            if (refCost < best_hash_cost) {
+              best_hash_cost = refCost;
+              best_hash_mv = hash_mv;
             }
-            int refCost =
-                abs(ref_block_hash.x - x_pos) + abs(ref_block_hash.y - y_pos);
-            add_to_sort_table(block_hashes, costs, &existing,
-                              MAX_HASH_MV_TABLE_SIZE, ref_block_hash, refCost);
-          }
-        }
-
-        if (existing == 0) {
-          break;
-        }
-
-        for (i = 0; i < existing; i++) {
-          MV hash_mv;
-          hash_mv.col = block_hashes[i].x - x_pos;
-          hash_mv.row = block_hashes[i].y - y_pos;
-          if (!is_mv_in(&x->mv_limits, &hash_mv)) {
-            continue;
-          }
-          int currHashCost = av1_get_mvpred_var(x, &hash_mv, ref_mv, fn_ptr, 1);
-          if (currHashCost < best_hash_cost) {
-            best_hash_cost = currHashCost;
-            best_hash_mv = hash_mv;
           }
         }
-
         if (best_hash_cost < var) {
           x->second_best_mv = x->best_mv;
           x->best_mv.as_mv = best_hash_mv;
@@ -2649,12 +2238,10 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
       }
     }
   } while (0);
-#endif
 
   return var;
 }
 
-#if CONFIG_MOTION_VAR
 /* returns subpixel variance error function */
 #define DIST(r, c) \
   vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, mask, &sse)
@@ -2687,20 +2274,21 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
 
 #undef CHECK_BETTER1
-#define CHECK_BETTER1(v, r, c)                                              \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
-    thismse =                                                               \
-        upsampled_obmc_pref_error(xd, mask, vfp, z, pre(y, y_stride, r, c), \
-                                  y_stride, sp(c), sp(r), w, h, &sse);      \
-    if ((v = MVC(r, c) + thismse) < besterr) {                              \
-      besterr = v;                                                          \
-      br = r;                                                               \
-      bc = c;                                                               \
-      *distortion = thismse;                                                \
-      *sse1 = sse;                                                          \
-    }                                                                       \
-  } else {                                                                  \
-    v = INT_MAX;                                                            \
+#define CHECK_BETTER1(v, r, c)                                                \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                     \
+    MV this_mv = { r, c };                                                    \
+    thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv,     \
+                                        mask, vfp, z, pre(y, y_stride, r, c), \
+                                        y_stride, sp(c), sp(r), w, h, &sse);  \
+    if ((v = MVC(r, c) + thismse) < besterr) {                                \
+      besterr = v;                                                            \
+      br = r;                                                                 \
+      bc = c;                                                                 \
+      *distortion = thismse;                                                  \
+      *sse1 = sse;                                                            \
+    }                                                                         \
+  } else {                                                                    \
+    v = INT_MAX;                                                              \
   }
 
 static unsigned int setup_obmc_center_error(
@@ -2715,60 +2303,55 @@ static unsigned int setup_obmc_center_error(
   return besterr;
 }
 
-static int upsampled_obmc_pref_error(const MACROBLOCKD *xd, const int32_t *mask,
-                                     const aom_variance_fn_ptr_t *vfp,
-                                     const int32_t *const wsrc,
-                                     const uint8_t *const y, int y_stride,
-                                     int subpel_x_q3, int subpel_y_q3, int w,
-                                     int h, unsigned int *sse) {
+static int upsampled_obmc_pref_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp,
+    const int32_t *const wsrc, const uint8_t *const y, int y_stride,
+    int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse) {
   unsigned int besterr;
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
-    aom_highbd_upsampled_pred(pred16, w, h, subpel_x_q3, subpel_y_q3, y,
-                              y_stride, xd->bd);
+    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h,
+                              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd);
 
     besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse);
   } else {
     DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
-#else
-  DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
-  (void)xd;
-#endif  // CONFIG_HIGHBITDEPTH
-    aom_upsampled_pred(pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride);
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
+                       subpel_y_q3, y, y_stride);
 
     besterr = vfp->ovf(pred, w, wsrc, mask, sse);
-#if CONFIG_HIGHBITDEPTH
   }
-#endif
   return besterr;
 }
 
 static unsigned int upsampled_setup_obmc_center_error(
-    const MACROBLOCKD *xd, const int32_t *mask, const MV *bestmv,
-    const MV *ref_mv, int error_per_bit, const aom_variance_fn_ptr_t *vfp,
-    const int32_t *const wsrc, const uint8_t *const y, int y_stride, int w,
-    int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
-    int *distortion) {
-  unsigned int besterr = upsampled_obmc_pref_error(
-      xd, mask, vfp, wsrc, y + offset, y_stride, 0, 0, w, h, sse1);
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
+    const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
+    const uint8_t *const y, int y_stride, int w, int h, int offset,
+    int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) {
+  unsigned int besterr =
+      upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc,
+                                y + offset, y_stride, 0, 0, w, h, sse1);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
   return besterr;
 }
 
 int av1_find_best_obmc_sub_pixel_tree_up(
-    MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
-    int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
-    int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, int is_second, int use_upsampled_ref) {
+    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
+    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+    int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+    int is_second, int use_accurate_subpel_search) {
   const int32_t *wsrc = x->wsrc_buf;
   const int32_t *mask = x->mask_buf;
   const int *const z = wsrc;
   const int *const src_address = z;
   MACROBLOCKD *xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[0];
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   unsigned int besterr = INT_MAX;
   unsigned int sse;
   unsigned int thismse;
@@ -2794,8 +2377,7 @@ int av1_find_best_obmc_sub_pixel_tree_up(
 
   int minc, maxc, minr, maxr;
 
-  av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
-                                 ref_mv);
+  set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv);
 
   y = pd->pre[is_second].buf;
   y_stride = pd->pre[is_second].stride;
@@ -2806,11 +2388,11 @@ int av1_find_best_obmc_sub_pixel_tree_up(
 
   bestmv->row *= 8;
   bestmv->col *= 8;
-  // use_upsampled_ref can be 0 or 1
-  if (use_upsampled_ref)
+  // use_accurate_subpel_search can be 0 or 1
+  if (use_accurate_subpel_search)
     besterr = upsampled_setup_obmc_center_error(
-        xd, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, w, h,
-        offset, mvjcost, mvcost, sse1, distortion);
+        xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y,
+        y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion);
   else
     besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
                                       z, y, y_stride, offset, mvjcost, mvcost,
@@ -2823,15 +2405,13 @@ int av1_find_best_obmc_sub_pixel_tree_up(
       tc = bc + search_step[idx].col;
       if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
         MV this_mv = { tr, tc };
-        const uint8_t *const pre_address = pre(y, y_stride, tr, tc);
-
-        if (use_upsampled_ref) {
-          thismse =
-              upsampled_obmc_pref_error(xd, mask, vfp, src_address, pre_address,
-                                        y_stride, sp(tc), sp(tr), w, h, &sse);
+        if (use_accurate_subpel_search) {
+          thismse = upsampled_obmc_pref_error(
+              xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
+              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse);
         } else {
-          thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr),
-                              src_address, mask, &sse);
+          thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
+                              sp(tr), src_address, mask, &sse);
         }
 
         cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
@@ -2856,10 +2436,10 @@ int av1_find_best_obmc_sub_pixel_tree_up(
     if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
       MV this_mv = { tr, tc };
 
-      if (use_upsampled_ref) {
-        thismse = upsampled_obmc_pref_error(xd, mask, vfp, src_address,
-                                            pre(y, y_stride, tr, tc), y_stride,
-                                            sp(tc), sp(tr), w, h, &sse);
+      if (use_accurate_subpel_search) {
+        thismse = upsampled_obmc_pref_error(
+            xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
+            pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse);
       } else {
         thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr),
                             src_address, mask, &sse);
@@ -2887,7 +2467,7 @@ int av1_find_best_obmc_sub_pixel_tree_up(
     }
 
     if (iters_per_step > 1 && best_idx != -1) {
-      if (use_upsampled_ref) {
+      if (use_accurate_subpel_search) {
         SECOND_LEVEL_CHECKS_BEST(1);
       } else {
         SECOND_LEVEL_CHECKS_BEST(0);
@@ -3123,89 +2703,98 @@ int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
   }
   return bestsme;
 }
-#endif  // CONFIG_MOTION_VAR
 
 // Note(yunqingwang): The following 2 functions are only used in the motion
 // vector unit test, which return extreme motion vectors allowed by the MV
 // limits.
-#define COMMON_MV_TEST     \
-  SETUP_SUBPEL_SEARCH;     \
-                           \
-  (void)error_per_bit;     \
-  (void)vfp;               \
-  (void)src_address;       \
-  (void)src_stride;        \
-  (void)y;                 \
-  (void)y_stride;          \
-  (void)second_pred;       \
-  (void)w;                 \
-  (void)h;                 \
-  (void)use_upsampled_ref; \
-  (void)offset;            \
-  (void)mvjcost;           \
-  (void)mvcost;            \
-  (void)sse1;              \
-  (void)distortion;        \
-                           \
-  (void)halfiters;         \
-  (void)quarteriters;      \
-  (void)eighthiters;       \
-  (void)whichdir;          \
-  (void)forced_stop;       \
-  (void)hstep;             \
-                           \
-  (void)tr;                \
-  (void)tc;                \
-  (void)sse;               \
-  (void)thismse;           \
+#define COMMON_MV_TEST              \
+  SETUP_SUBPEL_SEARCH;              \
+                                    \
+  (void)error_per_bit;              \
+  (void)vfp;                        \
+  (void)src_address;                \
+  (void)src_stride;                 \
+  (void)y;                          \
+  (void)y_stride;                   \
+  (void)second_pred;                \
+  (void)w;                          \
+  (void)h;                          \
+  (void)use_accurate_subpel_search; \
+  (void)offset;                     \
+  (void)mvjcost;                    \
+  (void)mvcost;                     \
+  (void)sse1;                       \
+  (void)distortion;                 \
+                                    \
+  (void)halfiters;                  \
+  (void)quarteriters;               \
+  (void)eighthiters;                \
+  (void)whichdir;                   \
+  (void)forced_stop;                \
+  (void)hstep;                      \
+                                    \
+  (void)tr;                         \
+  (void)tc;                         \
+  (void)sse;                        \
+  (void)thismse;                    \
   (void)cost_list;
 // Return the maximum MV.
-int av1_return_max_sub_pixel_mv(
-    MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
+int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm,
+                                int mi_row, int mi_col, const MV *ref_mv,
+                                int allow_hp, int error_per_bit,
+                                const aom_variance_fn_ptr_t *vfp,
+                                int forced_stop, int iters_per_step,
+                                int *cost_list, int *mvjcost, int *mvcost[2],
+                                int *distortion, unsigned int *sse1,
+                                const uint8_t *second_pred, const uint8_t *mask,
+                                int mask_stride, int invert_mask, int w, int h,
+                                int use_accurate_subpel_search) {
   COMMON_MV_TEST;
   (void)mask;
   (void)mask_stride;
   (void)invert_mask;
   (void)minr;
   (void)minc;
+
+  (void)cm;
+  (void)mi_row;
+  (void)mi_col;
+
   bestmv->row = maxr;
   bestmv->col = maxc;
   besterr = 0;
-// In the sub-pel motion search, if hp is not used, then the last bit of mv
-// has to be 0.
-#if CONFIG_AMVR
+  // In the sub-pel motion search, if hp is not used, then the last bit of mv
+  // has to be 0.
   lower_mv_precision(bestmv, allow_hp, 0);
-#else
-  lower_mv_precision(bestmv, allow_hp);
-#endif
   return besterr;
 }
 // Return the minimum MV.
-int av1_return_min_sub_pixel_mv(
-    MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
+int av1_return_min_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm,
+                                int mi_row, int mi_col, const MV *ref_mv,
+                                int allow_hp, int error_per_bit,
+                                const aom_variance_fn_ptr_t *vfp,
+                                int forced_stop, int iters_per_step,
+                                int *cost_list, int *mvjcost, int *mvcost[2],
+                                int *distortion, unsigned int *sse1,
+                                const uint8_t *second_pred, const uint8_t *mask,
+                                int mask_stride, int invert_mask, int w, int h,
+                                int use_accurate_subpel_search) {
   COMMON_MV_TEST;
   (void)maxr;
   (void)maxc;
   (void)mask;
   (void)mask_stride;
   (void)invert_mask;
+
+  (void)cm;
+  (void)mi_row;
+  (void)mi_col;
+
   bestmv->row = minr;
   bestmv->col = minc;
   besterr = 0;
-// In the sub-pel motion search, if hp is not used, then the last bit of mv
-// has to be 0.
-#if CONFIG_AMVR
+  // In the sub-pel motion search, if hp is not used, then the last bit of mv
+  // has to be 0.
   lower_mv_precision(bestmv, allow_hp, 0);
-#else
-  lower_mv_precision(bestmv, allow_hp);
-#endif
   return besterr;
 }
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
index 2c53075cc..539e8f4e4 100644
--- a/third_party/aom/av1/encoder/mcomp.h
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -69,10 +69,9 @@ struct SPEED_FEATURES;
 
 int av1_init_search_range(int size);
 
-int av1_refining_search_sad(struct macroblock *x, struct mv *ref_mv,
-                            int sad_per_bit, int distance,
-                            const aom_variance_fn_ptr_t *fn_ptr,
-                            const struct mv *center_mv);
+int av1_refining_search_sad(struct macroblock *x, MV *ref_mv, int sad_per_bit,
+                            int distance, const aom_variance_fn_ptr_t *fn_ptr,
+                            const MV *center_mv);
 
 // Runs sequence of diamond searches in smaller steps for RD.
 int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
@@ -81,24 +80,20 @@ int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
                            const aom_variance_fn_ptr_t *fn_ptr,
                            const MV *ref_mv, MV *dst_mv);
 
-// Perform integral projection based motion estimation.
-unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi,
-                                           MACROBLOCK *x, BLOCK_SIZE bsize,
-                                           int mi_row, int mi_col);
-
 int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
                    int sad_per_bit, int do_init_search, int *cost_list,
                    const aom_variance_fn_ptr_t *vfp, int use_mvcost,
                    const MV *center_mv);
 
 typedef int(fractional_mv_step_fp)(
-    MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *ref_mv, int allow_hp, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
     int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1, const uint8_t *second_pred,
     const uint8_t *mask, int mask_stride, int invert_mask, int w, int h,
-    int use_upsampled_ref);
+    int use_accurate_subpel_search);
 
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
@@ -123,52 +118,33 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
                              int invert_mask, const MV *center_mv,
                              const uint8_t *second_pred);
 
-struct AV1_COMP;
-
-#if CONFIG_HASH_ME
 int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full, int step_param,
                           int error_per_bit, int *cost_list, const MV *ref_mv,
                           int var_max, int rd, int x_pos, int y_pos, int intra);
-#else
-int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                          BLOCK_SIZE bsize, MV *mvp_full, int step_param,
-                          int error_per_bit, int *cost_list, const MV *ref_mv,
-                          int var_max, int rd);
-#endif
 
-#if CONFIG_MOTION_VAR
 int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
                                 MV *mvp_full, int step_param, int sadpb,
                                 int further_steps, int do_refine,
                                 const aom_variance_fn_ptr_t *fn_ptr,
                                 const MV *ref_mv, MV *dst_mv, int is_second);
 int av1_find_best_obmc_sub_pixel_tree_up(
-    MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
-    int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
-    int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, int is_second, int use_upsampled_ref);
-#endif  // CONFIG_MOTION_VAR
-#ifdef __cplusplus
-}  // extern "C"
-#endif
+    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
+    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+    int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+    int is_second, int use_accurate_subpel_search);
 
-#if CONFIG_WARPED_MOTION
 unsigned int av1_compute_motion_cost(const struct AV1_COMP *cpi,
                                      MACROBLOCK *const x, BLOCK_SIZE bsize,
                                      int mi_row, int mi_col, const MV *this_mv);
-#if WARPED_MOTION_SORT_SAMPLES
 unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
                                   MACROBLOCK *const x, BLOCK_SIZE bsize,
                                   int mi_row, int mi_col, int *pts0,
-                                  int *pts_inref0, int *pts_mv0,
-                                  int total_samples);
-#else
-unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
-                                  MACROBLOCK *const x, BLOCK_SIZE bsize,
-                                  int mi_row, int mi_col, int *pts,
-                                  int *pts_inref);
-#endif  // WARPED_MOTION_SORT_SAMPLES
-#endif  // CONFIG_WARPED_MOTION
+                                  int *pts_inref0, int total_samples);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // AV1_ENCODER_MCOMP_H_
diff --git a/third_party/aom/av1/encoder/mips/msa/error_msa.c b/third_party/aom/av1/encoder/mips/msa/error_msa.c
index 8d13af7ad..2e86dee43 100644
--- a/third_party/aom/av1/encoder/mips/msa/error_msa.c
+++ b/third_party/aom/av1/encoder/mips/msa/error_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 
 #define BLOCK_ERROR_BLOCKSIZE_MSA(BSize)                                     \
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c
deleted file mode 100644
index 4b0364d6c..000000000
--- a/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c
+++ /dev/null
@@ -1,436 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/common/enums.h"
-#include "av1/encoder/mips/msa/fdct_msa.h"
-#include "aom_dsp/mips/fwd_txfm_msa.h"
-
-static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
-                                   const int32_t *const0, int16_t *int_buf) {
-  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
-  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
-  v4i32 k0, k1, k2, k3;
-
-  /* load input data */
-  r0 = LD_SH(input);
-  r15 = LD_SH(input + 15 * stride);
-  r7 = LD_SH(input + 7 * stride);
-  r8 = LD_SH(input + 8 * stride);
-  SLLI_4V(r0, r15, r7, r8, 2);
-
-  /* stage 1 */
-  LD_SW2(const0, 4, k0, k1);
-  LD_SW2(const0 + 8, 4, k2, k3);
-  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
-
-  r3 = LD_SH(input + 3 * stride);
-  r4 = LD_SH(input + 4 * stride);
-  r11 = LD_SH(input + 11 * stride);
-  r12 = LD_SH(input + 12 * stride);
-  SLLI_4V(r3, r4, r11, r12, 2);
-
-  LD_SW2(const0 + 4 * 4, 4, k0, k1);
-  LD_SW2(const0 + 4 * 6, 4, k2, k3);
-  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
-
-  /* stage 2 */
-  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
-  ST_SH2(tp0, tp2, int_buf, 8);
-  ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);
-
-  LD_SW2(const0 + 4 * 8, 4, k0, k1);
-  k2 = LD_SW(const0 + 4 * 10);
-  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
-
-  ST_SH2(h0, h1, int_buf + 8 * 8, 8);
-  ST_SH2(h3, h2, int_buf + 12 * 8, 8);
-
-  r9 = LD_SH(input + 9 * stride);
-  r6 = LD_SH(input + 6 * stride);
-  r1 = LD_SH(input + stride);
-  r14 = LD_SH(input + 14 * stride);
-  SLLI_4V(r9, r6, r1, r14, 2);
-
-  LD_SW2(const0 + 4 * 11, 4, k0, k1);
-  LD_SW2(const0 + 4 * 13, 4, k2, k3);
-  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
-
-  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
-
-  r13 = LD_SH(input + 13 * stride);
-  r2 = LD_SH(input + 2 * stride);
-  r5 = LD_SH(input + 5 * stride);
-  r10 = LD_SH(input + 10 * stride);
-  SLLI_4V(r13, r2, r5, r10, 2);
-
-  LD_SW2(const0 + 4 * 15, 4, k0, k1);
-  LD_SW2(const0 + 4 * 17, 4, k2, k3);
-  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
-
-  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
-
-  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
-  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
-}
-
-static void fadst16_step2_msa_helper(int16_t *int_buf, const int32_t *const0,
-                                     int16_t *out, int16_t *out_ptr) {
-  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
-  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
-  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
-  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
-  v4i32 k0, k1, k2, k3;
-
-  LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
-  LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
-  LD_SW2(const0 + 4 * 19, 4, k0, k1);
-  k2 = LD_SW(const0 + 4 * 21);
-  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
-
-  tp0 = LD_SH(int_buf + 4 * 8);
-  tp1 = LD_SH(int_buf + 5 * 8);
-  tp3 = LD_SH(int_buf + 10 * 8);
-  tp2 = LD_SH(int_buf + 14 * 8);
-  LD_SW2(const0 + 4 * 22, 4, k0, k1);
-  k2 = LD_SW(const0 + 4 * 24);
-  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
-  out4 = -out4;
-  ST_SH(out4, (out + 3 * 16));
-  ST_SH(out5, (out_ptr + 4 * 16));
-
-  h1 = LD_SH(int_buf + 9 * 8);
-  h3 = LD_SH(int_buf + 12 * 8);
-  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
-  out13 = -out13;
-  ST_SH(out12, (out + 2 * 16));
-  ST_SH(out13, (out_ptr + 5 * 16));
-
-  tp0 = LD_SH(int_buf);
-  tp1 = LD_SH(int_buf + 8);
-  tp2 = LD_SH(int_buf + 2 * 8);
-  tp3 = LD_SH(int_buf + 6 * 8);
-
-  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
-  out1 = -out1;
-  ST_SH(out0, (out));
-  ST_SH(out1, (out_ptr + 7 * 16));
-
-  h0 = LD_SH(int_buf + 8 * 8);
-  h2 = LD_SH(int_buf + 13 * 8);
-
-  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
-  out8 = -out8;
-  ST_SH(out8, (out + 16));
-  ST_SH(out9, (out_ptr + 6 * 16));
-
-  /* stage 4 */
-  LD_SW2(const0 + 4 * 25, 4, k0, k1);
-  LD_SW2(const0 + 4 * 27, 4, k2, k3);
-  MADD_SHORT(h10, h11, k1, k2, out2, out3);
-  ST_SH(out2, (out + 7 * 16));
-  ST_SH(out3, (out_ptr));
-
-  MADD_SHORT(out6, out7, k0, k3, out6, out7);
-  ST_SH(out6, (out + 4 * 16));
-  ST_SH(out7, (out_ptr + 3 * 16));
-
-  MADD_SHORT(out10, out11, k0, k3, out10, out11);
-  ST_SH(out10, (out + 6 * 16));
-  ST_SH(out11, (out_ptr + 16));
-
-  MADD_SHORT(out14, out15, k1, k2, out14, out15);
-  ST_SH(out14, (out + 5 * 16));
-  ST_SH(out15, (out_ptr + 2 * 16));
-}
-
-static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
-                                   int16_t *out) {
-  fadst16_step2_msa_helper(int_buf, const0, out, out + 128);
-}
-
-static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
-  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
-  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
-
-  /* load input data */
-  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
-  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
-                     r7);
-  FDCT_POSTPROC_2V_NEG_H(r0, r1);
-  FDCT_POSTPROC_2V_NEG_H(r2, r3);
-  FDCT_POSTPROC_2V_NEG_H(r4, r5);
-  FDCT_POSTPROC_2V_NEG_H(r6, r7);
-  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
-  out += 64;
-
-  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
-  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
-                     r12, r13, r14, r15);
-  FDCT_POSTPROC_2V_NEG_H(r8, r9);
-  FDCT_POSTPROC_2V_NEG_H(r10, r11);
-  FDCT_POSTPROC_2V_NEG_H(r12, r13);
-  FDCT_POSTPROC_2V_NEG_H(r14, r15);
-  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
-  out += 64;
-
-  /* load input data */
-  input += 128;
-  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
-  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
-                     r7);
-  FDCT_POSTPROC_2V_NEG_H(r0, r1);
-  FDCT_POSTPROC_2V_NEG_H(r2, r3);
-  FDCT_POSTPROC_2V_NEG_H(r4, r5);
-  FDCT_POSTPROC_2V_NEG_H(r6, r7);
-  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
-  out += 64;
-
-  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
-  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
-                     r12, r13, r14, r15);
-  FDCT_POSTPROC_2V_NEG_H(r8, r9);
-  FDCT_POSTPROC_2V_NEG_H(r10, r11);
-  FDCT_POSTPROC_2V_NEG_H(r12, r13);
-  FDCT_POSTPROC_2V_NEG_H(r14, r15);
-  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
-}
-
-static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
-                                   int16_t *int_buf) {
-  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
-  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
-  v4i32 k0, k1, k2, k3;
-
-  /* load input data */
-  r0 = LD_SH(input);
-  r7 = LD_SH(input + 7 * 8);
-  r8 = LD_SH(input + 8 * 8);
-  r15 = LD_SH(input + 15 * 8);
-
-  /* stage 1 */
-  LD_SW2(const0, 4, k0, k1);
-  LD_SW2(const0 + 4 * 2, 4, k2, k3);
-  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
-
-  r3 = LD_SH(input + 3 * 8);
-  r4 = LD_SH(input + 4 * 8);
-  r11 = LD_SH(input + 11 * 8);
-  r12 = LD_SH(input + 12 * 8);
-
-  LD_SW2(const0 + 4 * 4, 4, k0, k1);
-  LD_SW2(const0 + 4 * 6, 4, k2, k3);
-  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
-
-  /* stage 2 */
-  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
-  ST_SH2(tp0, tp1, int_buf, 4 * 8);
-  ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);
-
-  LD_SW2(const0 + 4 * 8, 4, k0, k1);
-  k2 = LD_SW(const0 + 4 * 10);
-  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
-  ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
-  ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);
-
-  r1 = LD_SH(input + 8);
-  r6 = LD_SH(input + 6 * 8);
-  r9 = LD_SH(input + 9 * 8);
-  r14 = LD_SH(input + 14 * 8);
-
-  LD_SW2(const0 + 4 * 11, 4, k0, k1);
-  LD_SW2(const0 + 4 * 13, 4, k2, k3);
-  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
-  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
-
-  r2 = LD_SH(input + 2 * 8);
-  r5 = LD_SH(input + 5 * 8);
-  r10 = LD_SH(input + 10 * 8);
-  r13 = LD_SH(input + 13 * 8);
-
-  LD_SW2(const0 + 4 * 15, 4, k0, k1);
-  LD_SW2(const0 + 4 * 17, 4, k2, k3);
-  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
-  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
-  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
-  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
-}
-
-static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,
-                                   int16_t *out) {
-  fadst16_step2_msa_helper(int_buf, const0, out, out + 8);
-}
-
-static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
-  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
-  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
-
-  /* load input data */
-  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
-          l7, l15);
-  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
-                     r7);
-  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
-                     r12, r13, r14, r15);
-  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
-  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
-  out += 16 * 8;
-
-  /* load input data */
-  input += 128;
-  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
-          l7, l15);
-  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
-                     r7);
-  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
-                     r12, r13, r14, r15);
-  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
-  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
-}
-
-static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
-  int16_t *temp = intermediate;
-  int16_t *out = output;
-  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
-  v8i16 in12, in13, in14, in15;
-
-  LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
-  temp = intermediate + 8;
-  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                     in10, in11, in12, in13, in14, in15);
-  FDCT_POSTPROC_2V_NEG_H(in0, in1);
-  FDCT_POSTPROC_2V_NEG_H(in2, in3);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  FDCT_POSTPROC_2V_NEG_H(in6, in7);
-  FDCT_POSTPROC_2V_NEG_H(in8, in9);
-  FDCT_POSTPROC_2V_NEG_H(in10, in11);
-  FDCT_POSTPROC_2V_NEG_H(in12, in13);
-  FDCT_POSTPROC_2V_NEG_H(in14, in15);
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
-               tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
-  temp = intermediate;
-  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
-  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
-                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
-  temp = intermediate;
-  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
-  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
-               in4, in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
-                     tmp1, in1, tmp2, in2, tmp3, in3);
-  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
-  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
-                     tmp5, in5, tmp6, in6, tmp7, in7);
-  out = output + 8;
-  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
-}
-
-void av1_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride,
-                      int32_t tx_type) {
-  DECLARE_ALIGNED(32, int16_t, tmp[256]);
-  DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
-  DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
-  int32_t i;
-  int16_t *ptmpbuf = &tmp_buf[0];
-  int16_t *trans = &trans_buf[0];
-  const int32_t const_arr[29 * 4] = {
-    52707308,    52707308,    52707308,    52707308,    -1072430300,
-    -1072430300, -1072430300, -1072430300, 795618043,   795618043,
-    795618043,   795618043,   -721080468,  -721080468,  -721080468,
-    -721080468,  459094491,   459094491,   459094491,   459094491,
-    -970646691,  -970646691,  -970646691,  -970646691,  1010963856,
-    1010963856,  1010963856,  1010963856,  -361743294,  -361743294,
-    -361743294,  -361743294,  209469125,   209469125,   209469125,
-    209469125,   -1053094788, -1053094788, -1053094788, -1053094788,
-    1053160324,  1053160324,  1053160324,  1053160324,  639644520,
-    639644520,   639644520,   639644520,   -862444000,  -862444000,
-    -862444000,  -862444000,  1062144356,  1062144356,  1062144356,
-    1062144356,  -157532337,  -157532337,  -157532337,  -157532337,
-    260914709,   260914709,   260914709,   260914709,   -1041559667,
-    -1041559667, -1041559667, -1041559667, 920985831,   920985831,
-    920985831,   920985831,   -551995675,  -551995675,  -551995675,
-    -551995675,  596522295,   596522295,   596522295,   596522295,
-    892853362,   892853362,   892853362,   892853362,   -892787826,
-    -892787826,  -892787826,  -892787826,  410925857,   410925857,
-    410925857,   410925857,   -992012162,  -992012162,  -992012162,
-    -992012162,  992077698,   992077698,   992077698,   992077698,
-    759246145,   759246145,   759246145,   759246145,   -759180609,
-    -759180609,  -759180609,  -759180609,  -759222975,  -759222975,
-    -759222975,  -759222975,  759288511,   759288511,   759288511,
-    759288511
-  };
-
-  switch (tx_type) {
-    case DCT_DCT:
-      /* column transform */
-      for (i = 0; i < 2; ++i) {
-        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
-      }
-
-      /* row transform */
-      for (i = 0; i < 2; ++i) {
-        fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
-      }
-      break;
-    case ADST_DCT:
-      /* column transform */
-      for (i = 0; i < 2; ++i) {
-        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
-        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
-      }
-
-      /* row transform */
-      for (i = 0; i < 2; ++i) {
-        postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
-      }
-      break;
-    case DCT_ADST:
-      /* column transform */
-      for (i = 0; i < 2; ++i) {
-        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
-      }
-
-      fadst16_transpose_postproc_msa(tmp, trans);
-
-      /* row transform */
-      for (i = 0; i < 2; ++i) {
-        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
-        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
-      }
-
-      fadst16_transpose_msa(tmp, output);
-      break;
-    case ADST_ADST:
-      /* column transform */
-      for (i = 0; i < 2; ++i) {
-        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
-        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
-      }
-
-      fadst16_transpose_postproc_msa(tmp, trans);
-
-      /* row transform */
-      for (i = 0; i < 2; ++i) {
-        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
-        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
-      }
-
-      fadst16_transpose_msa(tmp, output);
-      break;
-    default: assert(0); break;
-  }
-}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c
index da1ac74f0..085c08bfb 100644
--- a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c
+++ b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c
@@ -12,7 +12,6 @@
 #include <assert.h>
 
 #include "av1/common/enums.h"
-#include "av1/encoder/mips/msa/fdct_msa.h"
 
 void av1_fwht4x4_msa(const int16_t *input, int16_t *output,
                      int32_t src_stride) {
@@ -45,54 +44,3 @@ void av1_fwht4x4_msa(const int16_t *input, int16_t *output,
   ST4x2_UB(in1, output + 8, 4);
   ST4x2_UB(in2, output + 12, 4);
 }
-
-void av1_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride,
-                    int32_t tx_type) {
-  v8i16 in0, in1, in2, in3;
-
-  LD_SH4(input, stride, in0, in1, in2, in3);
-
-  /* fdct4 pre-process */
-  {
-    v8i16 temp, mask;
-    v16i8 zero = { 0 };
-    v16i8 one = __msa_ldi_b(1);
-
-    mask = (v8i16)__msa_sldi_b(zero, one, 15);
-    SLLI_4V(in0, in1, in2, in3, 4);
-    temp = __msa_ceqi_h(in0, 0);
-    temp = (v8i16)__msa_xori_b((v16u8)temp, 255);
-    temp = mask & temp;
-    in0 += temp;
-  }
-
-  switch (tx_type) {
-    case DCT_DCT:
-      AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
-      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-      AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
-      break;
-    case ADST_DCT:
-      AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
-      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-      AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
-      break;
-    case DCT_ADST:
-      AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
-      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-      AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
-      break;
-    case ADST_ADST:
-      AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
-      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-      AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
-      break;
-    default: assert(0); break;
-  }
-
-  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
-  SRA_4V(in0, in1, in2, in3, 2);
-  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
-  ST_SH2(in0, in2, output, 8);
-}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c
deleted file mode 100644
index 4cbf60a11..000000000
--- a/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "av1/common/enums.h"
-#include "av1/encoder/mips/msa/fdct_msa.h"
-
-void av1_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride,
-                    int32_t tx_type) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-
-  LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
-  SLLI_4V(in0, in1, in2, in3, 2);
-  SLLI_4V(in4, in5, in6, in7, 2);
-
-  switch (tx_type) {
-    case DCT_DCT:
-      AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
-                         in3, in4, in5, in6, in7);
-      AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      break;
-    case ADST_DCT:
-      AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
-                         in3, in4, in5, in6, in7);
-      AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      break;
-    case DCT_ADST:
-      AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
-                         in3, in4, in5, in6, in7);
-      AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      break;
-    case ADST_ADST:
-      AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
-                         in3, in4, in5, in6, in7);
-      AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-                in5, in6, in7);
-      break;
-    default: assert(0); break;
-  }
-
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
-  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
-}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct_msa.h b/third_party/aom/av1/encoder/mips/msa/fdct_msa.h
deleted file mode 100644
index 52bcf790c..000000000
--- a/third_party/aom/av1/encoder/mips/msa/fdct_msa.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_
-#define AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_
-
-#include "aom_dsp/mips/fwd_txfm_msa.h"
-#include "aom_dsp/mips/txfm_macros_msa.h"
-#include "aom_ports/mem.h"
-
-#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,  \
-                  out3, out4, out5, out6, out7)                              \
-  {                                                                          \
-    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                       \
-    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                        \
-    v8i16 coeff0_m = { cospi_2_64,  cospi_6_64,  cospi_10_64, cospi_14_64,   \
-                       cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
-    v8i16 coeff1_m = { cospi_8_64,  -cospi_8_64,  cospi_16_64, -cospi_16_64, \
-                       cospi_24_64, -cospi_24_64, 0,           0 };          \
-                                                                             \
-    SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                          \
-    cnst2_m = -cnst0_m;                                                      \
-    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);       \
-    SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                          \
-    cnst4_m = -cnst2_m;                                                      \
-    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);       \
-                                                                             \
-    ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                   \
-    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m,  \
-                          cnst2_m, cnst3_m, in7, in0, in4, in3);             \
-                                                                             \
-    SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                          \
-    cnst2_m = -cnst0_m;                                                      \
-    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);       \
-    SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                          \
-    cnst4_m = -cnst2_m;                                                      \
-    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);       \
-                                                                             \
-    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                   \
-                                                                             \
-    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m,  \
-                          cnst2_m, cnst3_m, in5, in2, in6, in1);             \
-    BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                   \
-    out7 = -s0_m;                                                            \
-    out0 = s1_m;                                                             \
-                                                                             \
-    SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m);  \
-                                                                             \
-    ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);       \
-    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                               \
-    cnst1_m = cnst0_m;                                                       \
-                                                                             \
-    ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                   \
-    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m,  \
-                          cnst3_m, cnst1_m, out1, out6, s0_m, s1_m);         \
-                                                                             \
-    SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                          \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                               \
-                                                                             \
-    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                                 \
-    out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                   \
-    out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                   \
-    out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                   \
-    out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                   \
-                                                                             \
-    out1 = -out1;                                                            \
-    out3 = -out3;                                                            \
-    out5 = -out5;                                                            \
-  }
-
-#define AOM_FADST4(in0, in1, in2, in3, out0, out1, out2, out3)              \
-  {                                                                         \
-    v4i32 s0_m, s1_m, s2_m, s3_m, constant_m;                               \
-    v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m;                               \
-                                                                            \
-    UNPCK_R_SH_SW(in0, in0_r_m);                                            \
-    UNPCK_R_SH_SW(in1, in1_r_m);                                            \
-    UNPCK_R_SH_SW(in2, in2_r_m);                                            \
-    UNPCK_R_SH_SW(in3, in3_r_m);                                            \
-                                                                            \
-    constant_m = __msa_fill_w(sinpi_4_9);                                   \
-    MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m);             \
-                                                                            \
-    constant_m = __msa_fill_w(sinpi_1_9);                                   \
-    s0_m += in0_r_m * constant_m;                                           \
-    s1_m -= in1_r_m * constant_m;                                           \
-                                                                            \
-    constant_m = __msa_fill_w(sinpi_2_9);                                   \
-    s0_m += in1_r_m * constant_m;                                           \
-    s1_m += in3_r_m * constant_m;                                           \
-                                                                            \
-    s2_m = in0_r_m + in1_r_m - in3_r_m;                                     \
-                                                                            \
-    constant_m = __msa_fill_w(sinpi_3_9);                                   \
-    MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m);             \
-                                                                            \
-    in0_r_m = s0_m + s3_m;                                                  \
-    s2_m = s1_m - s3_m;                                                     \
-    s3_m = s1_m - s0_m + s3_m;                                              \
-                                                                            \
-    SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS);              \
-    PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \
-                out0, out1, out2, out3);                                    \
-  }
-#endif  // AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_
diff --git a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
index 4ec679642..531ae090a 100644
--- a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
+++ b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 
 static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride,
diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c
new file mode 100644
index 000000000..3a27e5845
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/encoder/ml.h"
+
+void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
+                    float *output) {
+  int num_input_nodes = nn_config->num_inputs;
+  int buf_index = 0;
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+  const float *input_nodes = features;
+
+  // Propagate hidden layers.
+  const int num_layers = nn_config->num_hidden_layers;
+  assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+  for (int layer = 0; layer < num_layers; ++layer) {
+    const float *weights = nn_config->weights[layer];
+    const float *bias = nn_config->bias[layer];
+    float *output_nodes = buf[buf_index];
+    const int num_output_nodes = nn_config->num_hidden_nodes[layer];
+    assert(num_output_nodes < NN_MAX_NODES_PER_LAYER);
+    for (int node = 0; node < num_output_nodes; ++node) {
+      float val = 0.0f;
+      for (int i = 0; i < num_input_nodes; ++i)
+        val += weights[i] * input_nodes[i];
+      val += bias[node];
+      // ReLU as activation function.
+      val = val > 0.0f ? val : 0.0f;  // Could use AOMMAX().
+      output_nodes[node] = val;
+      weights += num_input_nodes;
+    }
+    num_input_nodes = num_output_nodes;
+    input_nodes = output_nodes;
+    buf_index = 1 - buf_index;
+  }
+
+  // Final output layer.
+  const float *weights = nn_config->weights[num_layers];
+  for (int node = 0; node < nn_config->num_outputs; ++node) {
+    const float *bias = nn_config->bias[num_layers];
+    float val = 0.0f;
+    for (int i = 0; i < num_input_nodes; ++i)
+      val += weights[i] * input_nodes[i];
+    output[node] = val + bias[node];
+    weights += num_input_nodes;
+  }
+}
diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h
new file mode 100644
index 000000000..614cb60bb
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_ML_H_
+#define AV1_ENCODER_ML_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NN_MAX_HIDDEN_LAYERS 10
+#define NN_MAX_NODES_PER_LAYER 128
+
+typedef struct {
+  int num_inputs;         // Number of input nodes, i.e. features.
+  int num_outputs;        // Number of output nodes.
+  int num_hidden_layers;  // Number of hidden layers, maximum 10.
+  // Number of nodes for each hidden layer.
+  int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS];
+  // Weight parameters, indexed by layer.
+  const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
+  // Bias parameters, indexed by layer.
+  const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
+} NN_CONFIG;
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
+                    float *output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_ENCODER_RD_H_
diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c
index f34b82544..e61cd02ce 100644
--- a/third_party/aom/av1/encoder/palette.c
+++ b/third_party/aom/av1/encoder/palette.c
@@ -23,16 +23,14 @@
 #include "av1/encoder/k_means_template.h"
 #undef AV1_K_MEANS_DIM
 
-static int float_comparer(const void *a, const void *b) {
-  const float fa = *(const float *)a;
-  const float fb = *(const float *)b;
-  return (fa > fb) - (fa < fb);
+static int int_comparer(const void *a, const void *b) {
+  return (*(int *)a - *(int *)b);
 }
 
-int av1_remove_duplicates(float *centroids, int num_centroids) {
+int av1_remove_duplicates(int *centroids, int num_centroids) {
   int num_unique;  // number of unique centroids
   int i;
-  qsort(centroids, num_centroids, sizeof(*centroids), float_comparer);
+  qsort(centroids, num_centroids, sizeof(*centroids), int_comparer);
   // Remove duplicates.
   num_unique = 1;
   for (i = 1; i < num_centroids; ++i) {
@@ -43,7 +41,6 @@ int av1_remove_duplicates(float *centroids, int num_centroids) {
   return num_unique;
 }
 
-#if CONFIG_PALETTE_DELTA_ENCODING
 static int delta_encode_cost(const int *colors, int num, int bit_depth,
                              int min_val) {
   if (num <= 0) return 0;
@@ -116,15 +113,11 @@ int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
   }
   return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits);
 }
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 
 int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
-#if CONFIG_PALETTE_DELTA_ENCODING
                              uint16_t *color_cache, int n_cache,
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
                              int bit_depth) {
   const int n = pmi->palette_size[0];
-#if CONFIG_PALETTE_DELTA_ENCODING
   int out_cache_colors[PALETTE_MAX_SIZE];
   uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
   const int n_out_cache =
@@ -132,19 +125,13 @@ int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
                             cache_color_found, out_cache_colors);
   const int total_bits =
       n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1);
-  return total_bits * av1_cost_bit(128, 0);
-#else
-  return bit_depth * n * av1_cost_bit(128, 0);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
+  return av1_cost_literal(total_bits);
 }
 
 int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
-#if CONFIG_PALETTE_DELTA_ENCODING
                               uint16_t *color_cache, int n_cache,
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
                               int bit_depth) {
   const int n = pmi->palette_size[1];
-#if CONFIG_PALETTE_DELTA_ENCODING
   int total_bits = 0;
   // U channel palette color cost.
   int out_cache_colors[PALETTE_MAX_SIZE];
@@ -163,8 +150,5 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
       2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
   const int bits_using_raw = bit_depth * n;
   total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw);
-  return total_bits * av1_cost_bit(128, 0);
-#else
-  return 2 * bit_depth * n * av1_cost_bit(128, 0);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
+  return av1_cost_literal(total_bits);
 }
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
index efd89f66f..bbdd50784 100644
--- a/third_party/aom/av1/encoder/palette.h
+++ b/third_party/aom/av1/encoder/palette.h
@@ -20,22 +20,22 @@ extern "C" {
 
 #define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim
 
-void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const float *data,
-                                             const float *centroids,
+void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const int *data,
+                                             const int *centroids,
                                              uint8_t *indices, int n, int k);
-void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const float *data,
-                                             const float *centroids,
+void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data,
+                                             const int *centroids,
                                              uint8_t *indices, int n, int k);
-void AV1_K_MEANS_RENAME(av1_k_means, 1)(const float *data, float *centroids,
+void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids,
                                         uint8_t *indices, int n, int k,
                                         int max_itr);
-void AV1_K_MEANS_RENAME(av1_k_means, 2)(const float *data, float *centroids,
+void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids,
                                         uint8_t *indices, int n, int k,
                                         int max_itr);
 
 // Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim',
 // calculate the centroid 'indices' for the data points.
-static INLINE void av1_calc_indices(const float *data, const float *centroids,
+static INLINE void av1_calc_indices(const int *data, const int *centroids,
                                     uint8_t *indices, int n, int k, int dim) {
   if (dim == 1) {
     AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k);
@@ -50,7 +50,7 @@ static INLINE void av1_calc_indices(const float *data, const float *centroids,
 // dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get
 // updated 'centroids' and the centroid 'indices' for elements in 'data'.
 // Note: the output centroids are rounded off to nearest integers.
-static INLINE void av1_k_means(const float *data, float *centroids,
+static INLINE void av1_k_means(const int *data, int *centroids,
                                uint8_t *indices, int n, int k, int dim,
                                int max_itr) {
   if (dim == 1) {
@@ -66,9 +66,8 @@ static INLINE void av1_k_means(const float *data, float *centroids,
 // puts these unique centroids in first 'k' indices of 'centroids' array.
 // Ideally, the centroids should be rounded to integers before calling this
 // method.
-int av1_remove_duplicates(float *centroids, int num_centroids);
+int av1_remove_duplicates(int *centroids, int num_centroids);
 
-#if CONFIG_PALETTE_DELTA_ENCODING
 // Given a color cache and a set of base colors, find if each cache color is
 // present in the base colors, record the binary results in "cache_color_found".
 // Record the colors that are not in the color cache in "out_cache_colors".
@@ -80,20 +79,14 @@ int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
 // assign zero_count with the number of deltas being 0.
 int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
                                  int bit_depth, int *zero_count, int *min_bits);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 
 // Return the rate cost for transmitting luma palette color values.
 int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
-#if CONFIG_PALETTE_DELTA_ENCODING
-                             uint16_t *color_cache, int n_cache,
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
-                             int bit_depth);
+                             uint16_t *color_cache, int n_cache, int bit_depth);
 
 // Return the rate cost for transmitting chroma palette color values.
 int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
-#if CONFIG_PALETTE_DELTA_ENCODING
                               uint16_t *color_cache, int n_cache,
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
                               int bit_depth);
 
 #ifdef __cplusplus
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
index accc97e57..4f6265617 100644
--- a/third_party/aom/av1/encoder/pickcdef.c
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -12,7 +12,8 @@
 #include <math.h>
 #include <string.h>
 
-#include "./aom_scale_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "av1/common/cdef.h"
 #include "av1/common/onyxc_int.h"
@@ -23,7 +24,7 @@
 #define REDUCED_TOTAL_STRENGTHS (REDUCED_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
 #define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
 
-static int priconv[REDUCED_PRI_STRENGTHS] = { 0, 1, 2, 3, 4, 7, 12, 25 };
+static int priconv[REDUCED_PRI_STRENGTHS] = { 0, 1, 2, 3, 5, 7, 10, 13 };
 
 /* Search for the best strength to add as an option, knowing we
    already selected nb_strengths options. */
@@ -68,16 +69,11 @@ static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
                                 uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
                                 int fast) {
   uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
-#if !CONFIG_CDEF_SINGLEPASS
-  const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
-#endif
   int i, j;
   uint64_t best_tot_mse = (uint64_t)1 << 63;
   int best_id0 = 0;
   int best_id1 = 0;
-#if CONFIG_CDEF_SINGLEPASS
   const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
-#endif
   memset(tot_mse, 0, sizeof(tot_mse));
   for (i = 0; i < sb_count; i++) {
     int gi;
@@ -204,10 +200,9 @@ static INLINE uint64_t dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
   svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
   dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
   return (uint64_t)floor(
-      .5 +
-      (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
-          (svar + dvar + (400 << 2 * coeff_shift)) /
-          (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar)));
+      .5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
+               (svar + dvar + (400 << 2 * coeff_shift)) /
+               (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar)));
 }
 
 static INLINE uint64_t mse_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
@@ -290,7 +285,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
   int fbr, fbc;
   uint16_t *src[3];
   uint16_t *ref_coeff[3];
-  cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
+  static cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
   int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
   int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
   int stride[3];
@@ -310,32 +305,27 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
   int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
   int *selected_strength = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
   uint64_t(*mse[2])[TOTAL_STRENGTHS];
-#if CONFIG_CDEF_SINGLEPASS
   int pri_damping = 3 + (cm->base_qindex >> 6);
-#else
-  int pri_damping = 6;
-#endif
   int sec_damping = 3 + (cm->base_qindex >> 6);
   int i;
   int nb_strengths;
   int nb_strength_bits;
   int quantizer;
   double lambda;
-  int nplanes = 3;
+  const int num_planes = av1_num_planes(cm);
   const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
   DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
   uint16_t *in;
-  DECLARE_ALIGNED(32, uint16_t, tmp_dst[CDEF_BLOCKSIZE * CDEF_BLOCKSIZE]);
-  int chroma_cdef = xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
-                    xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
+  DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
   quantizer =
-      av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
+      av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
   lambda = .12 * quantizer * quantizer / 256.;
 
-  av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0);
+  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
+                       num_planes);
   mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
   mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
-  for (pli = 0; pli < nplanes; pli++) {
+  for (pli = 0; pli < num_planes; pli++) {
     uint8_t *ref_buffer;
     int ref_stride;
     switch (pli) {
@@ -371,20 +361,16 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
 
     for (r = 0; r < frame_height; ++r) {
       for (c = 0; c < frame_width; ++c) {
-#if CONFIG_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
           src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(
               xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
           ref_coeff[pli][r * stride[pli] + c] =
               CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c];
         } else {
-#endif
           src[pli][r * stride[pli] + c] =
               xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
           ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c];
-#if CONFIG_HIGHBITDEPTH
         }
-#endif
       }
     }
   }
@@ -397,13 +383,33 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
       int dirinit = 0;
       nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
       nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
-      cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
-                          MI_SIZE_64X64 * fbc]
-          ->mbmi.cdef_strength = -1;
+      int hb_step = 1;
+      int vb_step = 1;
+      BLOCK_SIZE bs = BLOCK_64X64;
+      MB_MODE_INFO *const mbmi =
+          cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
+                              MI_SIZE_64X64 * fbc];
+      if (((fbc & 1) &&
+           (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64)) ||
+          ((fbr & 1) &&
+           (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_64X128)))
+        continue;
+      if (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64 ||
+          mbmi->sb_type == BLOCK_64X128)
+        bs = mbmi->sb_type;
+      if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
+        nhb = AOMMIN(MI_SIZE_128X128, cm->mi_cols - MI_SIZE_64X64 * fbc);
+        hb_step = 2;
+      }
+      if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
+        nvb = AOMMIN(MI_SIZE_128X128, cm->mi_rows - MI_SIZE_64X64 * fbr);
+        vb_step = 2;
+      }
+      // No filtering if the entire filter block is skipped
       if (sb_all_skip(cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) continue;
       cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64,
-                                        fbc * MI_SIZE_64X64, dlist, 1);
-      for (pli = 0; pli < nplanes; pli++) {
+                                        fbc * MI_SIZE_64X64, dlist, bs);
+      for (pli = 0; pli < num_planes; pli++) {
         for (i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE;
         for (gi = 0; gi < total_strengths; gi++) {
           int threshold;
@@ -411,7 +417,6 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
           int sec_strength;
           threshold = gi / CDEF_SEC_STRENGTHS;
           if (fast) threshold = priconv[threshold];
-          if (pli > 0 && !chroma_cdef) threshold = 0;
           /* We avoid filtering the pixels for which some of the pixels to
              average
              are outside the frame. We could change the filter instead, but it
@@ -419,11 +424,10 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
           int yoff = CDEF_VBORDER * (fbr != 0);
           int xoff = CDEF_HBORDER * (fbc != 0);
           int ysize = (nvb << mi_high_l2[pli]) +
-                      CDEF_VBORDER * (fbr != nvfb - 1) + yoff;
+                      CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff;
           int xsize = (nhb << mi_wide_l2[pli]) +
-                      CDEF_HBORDER * (fbc != nhfb - 1) + xoff;
+                      CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff;
           sec_strength = gi % CDEF_SEC_STRENGTHS;
-#if CONFIG_CDEF_SINGLEPASS
           copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
                        src[pli],
                        (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
@@ -433,19 +437,6 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
                          dir, &dirinit, var, pli, dlist, cdef_count, threshold,
                          sec_strength + (sec_strength == 3), pri_damping,
                          sec_damping, coeff_shift);
-#else
-          if (sec_strength == 0)
-            copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
-                         src[pli],
-                         (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
-                         (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
-                         stride[pli], ysize, xsize);
-          cdef_filter_fb(sec_strength ? NULL : (uint8_t *)in, CDEF_BSTRIDE,
-                         tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var,
-                         pli, dlist, cdef_count, threshold,
-                         sec_strength + (sec_strength == 3), sec_damping,
-                         pri_damping, coeff_shift, sec_strength != 0, 1);
-#endif
           curr_mse = compute_cdef_dist(
               ref_coeff[pli] +
                   (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
@@ -470,7 +461,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
     int best_lev0[CDEF_MAX_STRENGTHS];
     int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
     nb_strengths = 1 << i;
-    if (nplanes >= 3)
+    if (num_planes >= 3)
       tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths,
                                            mse, sb_count, fast);
     else
@@ -500,14 +491,14 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
     best_gi = 0;
     for (gi = 0; gi < cm->nb_cdef_strengths; gi++) {
       uint64_t curr = mse[0][i][cm->cdef_strengths[gi]];
-      if (nplanes >= 3) curr += mse[1][i][cm->cdef_uv_strengths[gi]];
+      if (num_planes >= 3) curr += mse[1][i][cm->cdef_uv_strengths[gi]];
       if (curr < best_mse) {
         best_gi = gi;
         best_mse = curr;
       }
     }
     selected_strength[i] = best_gi;
-    cm->mi_grid_visible[sb_index[i]]->mbmi.cdef_strength = best_gi;
+    cm->mi_grid_visible[sb_index[i]]->cdef_strength = best_gi;
   }
 
   if (fast) {
@@ -526,7 +517,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
   cm->cdef_sec_damping = sec_damping;
   aom_free(mse[0]);
   aom_free(mse[1]);
-  for (pli = 0; pli < nplanes; pli++) {
+  for (pli = 0; pli < num_planes; pli++) {
     aom_free(src[pli]);
     aom_free(ref_coeff[pli]);
   }
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
index d8b6f9074..5f802a707 100644
--- a/third_party/aom/av1/encoder/picklpf.c
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -12,7 +12,7 @@
 #include <assert.h>
 #include <limits.h>
 
-#include "./aom_scale_rtcd.h"
+#include "config/aom_scale_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/psnr.h"
@@ -27,74 +27,6 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/picklpf.h"
 
-#if CONFIG_LPF_SB
-#if CONFIG_HIGHBITDEPTH
-static int compute_sb_y_sse_highbd(const YV12_BUFFER_CONFIG *src,
-                                   const YV12_BUFFER_CONFIG *frame,
-                                   AV1_COMMON *const cm, int mi_row,
-                                   int mi_col) {
-  int sse = 0;
-  const int mi_row_start = AOMMAX(0, mi_row - FILT_BOUNDARY_MI_OFFSET);
-  const int mi_col_start = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET);
-  const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
-  const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
-  const int mi_row_end = AOMMIN(mi_row_range, cm->mi_rows);
-  const int mi_col_end = AOMMIN(mi_col_range, cm->mi_cols);
-
-  const int row = mi_row_start * MI_SIZE;
-  const int col = mi_col_start * MI_SIZE;
-  const uint16_t *src_y =
-      CONVERT_TO_SHORTPTR(src->y_buffer) + row * src->y_stride + col;
-  const uint16_t *frame_y =
-      CONVERT_TO_SHORTPTR(frame->y_buffer) + row * frame->y_stride + col;
-  const int row_end = (mi_row_end - mi_row_start) * MI_SIZE;
-  const int col_end = (mi_col_end - mi_col_start) * MI_SIZE;
-
-  int x, y;
-  for (y = 0; y < row_end; ++y) {
-    for (x = 0; x < col_end; ++x) {
-      const int diff = src_y[x] - frame_y[x];
-      sse += diff * diff;
-    }
-    src_y += src->y_stride;
-    frame_y += frame->y_stride;
-  }
-  return sse;
-}
-#endif
-
-static int compute_sb_y_sse(const YV12_BUFFER_CONFIG *src,
-                            const YV12_BUFFER_CONFIG *frame,
-                            AV1_COMMON *const cm, int mi_row, int mi_col) {
-  int sse = 0;
-  const int mi_row_start = AOMMAX(0, mi_row - FILT_BOUNDARY_MI_OFFSET);
-  const int mi_col_start = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET);
-  const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
-  const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
-  const int mi_row_end = AOMMIN(mi_row_range, cm->mi_rows);
-  const int mi_col_end = AOMMIN(mi_col_range, cm->mi_cols);
-
-  const int row = mi_row_start * MI_SIZE;
-  const int col = mi_col_start * MI_SIZE;
-  const uint8_t *src_y = src->y_buffer + row * src->y_stride + col;
-  const uint8_t *frame_y = frame->y_buffer + row * frame->y_stride + col;
-  const int row_end = (mi_row_end - mi_row_start) * MI_SIZE;
-  const int col_end = (mi_col_end - mi_col_start) * MI_SIZE;
-
-  int x, y;
-  for (y = 0; y < row_end; ++y) {
-    for (x = 0; x < col_end; ++x) {
-      const int diff = src_y[x] - frame_y[x];
-      sse += diff * diff;
-    }
-    src_y += src->y_stride;
-    frame_y += frame->y_stride;
-  }
-  return sse;
-}
-#endif  // CONFIG_LPF_SB
-
-#if !CONFIG_LPF_SB
 static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
                             YV12_BUFFER_CONFIG *dst_bc, int plane) {
   switch (plane) {
@@ -104,7 +36,6 @@ static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
     default: assert(plane >= 0 && plane <= 2); break;
   }
 }
-#endif  // CONFIG_LPF_SB
 
 int av1_get_max_filter_level(const AV1_COMP *cpi) {
   if (cpi->oxcf.pass == 2) {
@@ -115,195 +46,58 @@ int av1_get_max_filter_level(const AV1_COMP *cpi) {
   }
 }
 
-#if CONFIG_LPF_SB
-// TODO(chengchen): reduce memory usage by copy superblock instead of frame
-static int try_filter_superblock(const YV12_BUFFER_CONFIG *sd,
-                                 AV1_COMP *const cpi, int filt_level,
-                                 int partial_frame, int mi_row, int mi_col) {
-  AV1_COMMON *const cm = &cpi->common;
-  int filt_err;
-
-#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
-  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,
-                        partial_frame, mi_row, mi_col);
-#else
-  if (cpi->num_workers > 1)
-    av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
-                             filt_level, 1, partial_frame, cpi->workers,
-                             cpi->num_workers, &cpi->lf_row_sync);
-  else
-    av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
-                          1, partial_frame);
-#endif
-
-#if CONFIG_HIGHBITDEPTH
-  if (cm->use_highbitdepth) {
-    filt_err =
-        compute_sb_y_sse_highbd(sd, cm->frame_to_show, cm, mi_row, mi_col);
-  } else {
-    filt_err = compute_sb_y_sse(sd, cm->frame_to_show, cm, mi_row, mi_col);
-  }
-#else
-  filt_err = compute_sb_y_sse(sd, cm->frame_to_show, cm, mi_row, mi_col);
-#endif  // CONFIG_HIGHBITDEPTH
-
-  // TODO(chengchen): Copy the superblock only
-  // Re-instate the unfiltered frame
-  aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-
-  return filt_err;
-}
-
-static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
-                               int partial_frame, double *best_cost_ret,
-                               int mi_row, int mi_col, int last_lvl) {
-  assert(partial_frame == 1);
-  assert(last_lvl >= 0);
-
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *x = &cpi->td.mb;
-
-  int min_filter_level = AOMMAX(0, last_lvl - MAX_LPF_OFFSET);
-  int max_filter_level =
-      AOMMIN(av1_get_max_filter_level(cpi), last_lvl + MAX_LPF_OFFSET);
-
-  // search a larger range for the start superblock
-  if (mi_row == 0 && mi_col == 0) {
-    min_filter_level = 0;
-    max_filter_level = av1_get_max_filter_level(cpi);
-  }
-
-  // TODO(chengchen): Copy for superblock only
-  // Make a copy of the unfiltered / processed recon buffer
-  aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
-
-  int estimate_err =
-      try_filter_superblock(sd, cpi, last_lvl, partial_frame, mi_row, mi_col);
-
-  int best_err = estimate_err;
-  int filt_best = last_lvl;
-
-  int i;
-  for (i = min_filter_level; i <= max_filter_level; i += LPF_STEP) {
-    if (i == last_lvl) continue;
-
-    int filt_err =
-        try_filter_superblock(sd, cpi, i, partial_frame, mi_row, mi_col);
-
-    if (filt_err < best_err) {
-      best_err = filt_err;
-      filt_best = i;
-    }
-  }
-
-  // If previous sb filter level has similar filtering performance as current
-  // best filter level, use previous level such that we can only send one bit
-  // to indicate current filter level is the same as the previous.
-  int threshold = 400;
-
-  // ratio = the filtering area / a superblock size
-  int ratio = 1;
-  if (mi_row + MAX_MIB_SIZE > cm->mi_rows) {
-    ratio *= (cm->mi_rows - mi_row);
-  } else {
-    if (mi_row == 0) {
-      ratio *= (MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET);
-    } else {
-      ratio *= MAX_MIB_SIZE;
-    }
-  }
-  if (mi_col + MAX_MIB_SIZE > cm->mi_cols) {
-    ratio *= (cm->mi_cols - mi_col);
-  } else {
-    if (mi_col == 0) {
-      ratio *= (MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET);
-    } else {
-      ratio *= MAX_MIB_SIZE;
-    }
-  }
-  threshold = threshold * ratio / (MAX_MIB_SIZE * MAX_MIB_SIZE);
-
-  const int diff = abs(estimate_err - best_err);
-
-  const int percent_thresh = (int)((double)estimate_err * 0.01);
-  threshold = AOMMAX(threshold, percent_thresh);
-  if (diff < threshold) {
-    best_err = estimate_err;
-    filt_best = last_lvl;
-  }
-
-  // Compute rdcost to determine whether to reuse previous filter lvl
-  if (filt_best != last_lvl) {
-  }
-
-  if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err);
-  return filt_best;
-}
-
-#else  // CONFIG_LPF_SB
 static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
                                 AV1_COMP *const cpi, int filt_level,
-                                int partial_frame
-#if CONFIG_LOOPFILTER_LEVEL
-                                ,
-                                int plane, int dir
-#endif
-                                ) {
+                                int partial_frame, int plane, int dir) {
   AV1_COMMON *const cm = &cpi->common;
   int64_t filt_err;
 
-#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
-#if CONFIG_LOOPFILTER_LEVEL
   assert(plane >= 0 && plane <= 2);
   int filter_level[2] = { filt_level, filt_level };
   if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1];
   if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0];
 
-  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd,
-                        filter_level[0], filter_level[1], plane, partial_frame);
-#else
-  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,
-                        partial_frame);
-#endif  // CONFIG_LOOPFILTER_LEVEL
+  // set base filters for use of get_filter_level when in DELTA_Q_LF mode
+  switch (plane) {
+    case 0:
+      cm->lf.filter_level[0] = filter_level[0];
+      cm->lf.filter_level[1] = filter_level[1];
+      break;
+    case 1: cm->lf.filter_level_u = filter_level[0]; break;
+    case 2: cm->lf.filter_level_v = filter_level[0]; break;
+  }
+
+      // TODO(any): please enable multi-thread and remove the flag when loop
+      // filter mask is compatible with multi-thread.
+#if LOOP_FILTER_BITMASK
+  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+                        plane + 1, partial_frame);
 #else
   if (cpi->num_workers > 1)
-    av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
-                             filt_level, 1, partial_frame, cpi->workers,
+    av1_loop_filter_frame_mt(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+                             plane + 1, partial_frame, cpi->workers,
                              cpi->num_workers, &cpi->lf_row_sync);
   else
-    av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
-                          1, partial_frame);
+    av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+                          plane + 1, partial_frame);
 #endif
 
   int highbd = 0;
-#if CONFIG_HIGHBITDEPTH
   highbd = cm->use_highbitdepth;
-#endif  // CONFIG_HIGHBITDEPTH
 
-#if CONFIG_LOOPFILTER_LEVEL
   filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane, highbd);
 
   // Re-instate the unfiltered frame
   yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane);
-#else
-  filt_err = aom_get_sse_plane(sd, cm->frame_to_show, 0, highbd);
-
-  // Re-instate the unfiltered frame
-  yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, 0);
-#endif  // CONFIG_LOOPFILTER_LEVEL
 
   return filt_err;
 }
 
 static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
-                               int partial_frame, double *best_cost_ret
-#if CONFIG_LOOPFILTER_LEVEL
-                               ,
-                               int plane, int dir
-#endif
-                               ) {
+                               int partial_frame,
+                               const int *last_frame_filter_level,
+                               double *best_cost_ret, int plane, int dir) {
   const AV1_COMMON *const cm = &cpi->common;
-  const struct loopfilter *const lf = &cm->lf;
   const int min_filter_level = 0;
   const int max_filter_level = av1_get_max_filter_level(cpi);
   int filt_direction = 0;
@@ -311,39 +105,24 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
   int filt_best;
   MACROBLOCK *x = &cpi->td.mb;
 
-// Start the search at the previous frame filter level unless it is now out of
-// range.
-#if CONFIG_LOOPFILTER_LEVEL
+  // Start the search at the previous frame filter level unless it is now out of
+  // range.
   int lvl;
   switch (plane) {
-    case 0: lvl = (dir == 1) ? lf->filter_level[1] : lf->filter_level[0]; break;
-    case 1: lvl = lf->filter_level_u; break;
-    case 2: lvl = lf->filter_level_v; break;
+    case 0: lvl = last_frame_filter_level[dir]; break;
+    case 1: lvl = last_frame_filter_level[2]; break;
+    case 2: lvl = last_frame_filter_level[3]; break;
     default: assert(plane >= 0 && plane <= 2); return 0;
   }
   int filt_mid = clamp(lvl, min_filter_level, max_filter_level);
-#else
-  int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
-#endif  // CONFIG_LOOPFILTER_LEVEL
   int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
   // Sum squared error at each filter level
   int64_t ss_err[MAX_LOOP_FILTER + 1];
 
   // Set each entry to -1
   memset(ss_err, 0xFF, sizeof(ss_err));
-
-#if CONFIG_LOOPFILTER_LEVEL
   yv12_copy_plane(cm->frame_to_show, &cpi->last_frame_uf, plane);
-#else
-  //  Make a copy of the unfiltered / processed recon buffer
-  aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
-#endif  // CONFIG_LOOPFILTER_LEVEL
-
-#if CONFIG_LOOPFILTER_LEVEL
   best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir);
-#else
-  best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame);
-#endif  // CONFIG_LOOPFILTER_LEVEL
   filt_best = filt_mid;
   ss_err[filt_mid] = best_err;
 
@@ -363,12 +142,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     if (filt_direction <= 0 && filt_low != filt_mid) {
       // Get Low filter error score
       if (ss_err[filt_low] < 0) {
-#if CONFIG_LOOPFILTER_LEVEL
         ss_err[filt_low] =
             try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir);
-#else
-        ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
-#endif  // CONFIG_LOOPFILTER_LEVEL
       }
       // If value is close to the best so far then bias towards a lower loop
       // filter value.
@@ -384,12 +159,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     // Now look at filt_high
     if (filt_direction >= 0 && filt_high != filt_mid) {
       if (ss_err[filt_high] < 0) {
-#if CONFIG_LOOPFILTER_LEVEL
         ss_err[filt_high] =
             try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir);
-#else
-        ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
-#endif  // CONFIG_LOOPFILTER_LEVEL
       }
       // If value is significantly better than previous best, bias added against
       // raising filter value
@@ -415,33 +186,36 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
   if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err);
   return filt_best;
 }
-#endif  // CONFIG_LPF_SB
 
 void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                            LPF_PICK_METHOD method) {
   AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   struct loopfilter *const lf = &cm->lf;
+  (void)sd;
 
-  lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness;
+  lf->sharpness_level = 0;
 
   if (method == LPF_PICK_MINIMAL_LPF) {
-#if CONFIG_LOOPFILTER_LEVEL
     lf->filter_level[0] = 0;
     lf->filter_level[1] = 0;
-#else
-    lf->filter_level = 0;
-#endif
   } else if (method >= LPF_PICK_FROM_Q) {
     const int min_filter_level = 0;
     const int max_filter_level = av1_get_max_filter_level(cpi);
-    const int q = av1_ac_quant(cm->base_qindex, 0, cm->bit_depth);
-// These values were determined by linear fitting the result of the
-// searched level, filt_guess = q * 0.316206 + 3.87252
-#if CONFIG_HIGHBITDEPTH
+    const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth);
+    // These values were determined by linear fitting the result of the
+    // searched level for 8 bit depth:
+    // Keyframes: filt_guess = q * 0.06699 - 1.60817
+    // Other frames: filt_guess = q * 0.02295 + 2.48225
+    //
+    // And high bit depth separately:
+    // filt_guess = q * 0.316206 + 3.87252
     int filt_guess;
     switch (cm->bit_depth) {
       case AOM_BITS_8:
-        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+        filt_guess = (cm->frame_type == KEY_FRAME)
+                         ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
+                         : ROUND_POWER_OF_TWO(q * 6017 + 650707, 18);
         break;
       case AOM_BITS_10:
         filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
@@ -455,58 +229,36 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                "or AOM_BITS_12");
         return;
     }
-#else
-    int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
-#endif  // CONFIG_HIGHBITDEPTH
-    if (cm->frame_type == KEY_FRAME) filt_guess -= 4;
-#if CONFIG_LOOPFILTER_LEVEL
+    if (cm->bit_depth != AOM_BITS_8 && cm->frame_type == KEY_FRAME)
+      filt_guess -= 4;
+    // TODO(chengchen): retrain the model for Y, U, V filter levels
     lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level);
     lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level);
-#else
-    lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
-#endif
+    lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level);
+    lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level);
   } else {
-#if CONFIG_LPF_SB
-    int mi_row, mi_col;
-    // TODO(chengchen): init last_lvl using previous frame's info?
-    int last_lvl = 0;
-    // TODO(chengchen): if the frame size makes the last superblock very small,
-    // consider merge it to the previous superblock to save bits.
-    // Example, if frame size 1080x720, then in the last row of superblock,
-    // there're (FILT_BOUNDAR_OFFSET + 16) pixels.
-    for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MAX_MIB_SIZE) {
-      for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
-        int lvl =
-            search_filter_level(sd, cpi, 1, NULL, mi_row, mi_col, last_lvl);
-
-        av1_loop_filter_sb_level_init(cm, mi_row, mi_col, lvl);
-
-        // For the superblock at row start, its previous filter level should be
-        // the one above it, not the one at the end of last row
-        if (mi_col + MAX_MIB_SIZE >= cm->mi_cols) {
-          last_lvl = cm->mi_grid_visible[mi_row * cm->mi_stride]->mbmi.filt_lvl;
-        } else {
-          last_lvl = lvl;
-        }
-      }
+    const int last_frame_filter_level[4] = { lf->filter_level[0],
+                                             lf->filter_level[1],
+                                             lf->filter_level_u,
+                                             lf->filter_level_v };
+
+    lf->filter_level[0] = lf->filter_level[1] =
+        search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                            last_frame_filter_level, NULL, 0, 2);
+    lf->filter_level[0] =
+        search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                            last_frame_filter_level, NULL, 0, 0);
+    lf->filter_level[1] =
+        search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                            last_frame_filter_level, NULL, 0, 1);
+
+    if (num_planes > 1) {
+      lf->filter_level_u =
+          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                              last_frame_filter_level, NULL, 1, 0);
+      lf->filter_level_v =
+          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                              last_frame_filter_level, NULL, 2, 0);
     }
-#else  // CONFIG_LPF_SB
-#if CONFIG_LOOPFILTER_LEVEL
-    lf->filter_level[0] = lf->filter_level[1] = search_filter_level(
-        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 2);
-    lf->filter_level[0] = search_filter_level(
-        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 0);
-    lf->filter_level[1] = search_filter_level(
-        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 1);
-
-    lf->filter_level_u = search_filter_level(
-        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 1, 0);
-    lf->filter_level_v = search_filter_level(
-        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 2, 0);
-#else
-    lf->filter_level =
-        search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL);
-#endif  // CONFIG_LOOPFILTER_LEVEL
-#endif  // CONFIG_LPF_SB
   }
 }
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
index a2262b6fc..93ea09690 100644
--- a/third_party/aom/av1/encoder/pickrst.c
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -14,7 +14,7 @@
 #include <limits.h>
 #include <math.h>
 
-#include "./aom_scale_rtcd.h"
+#include "config/aom_scale_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/binary_codes_writer.h"
@@ -40,150 +40,156 @@ static const RestorationType force_restore_type = RESTORE_TYPES;
 // Number of Wiener iterations
 #define NUM_WIENER_ITERS 5
 
-typedef double (*search_restore_type)(const YV12_BUFFER_CONFIG *src,
-                                      AV1_COMP *cpi, int partial_frame,
-                                      int plane, RestorationInfo *info,
-                                      RestorationType *rest_level,
-                                      int64_t *best_tile_cost,
-                                      YV12_BUFFER_CONFIG *dst_frame);
+// Penalty factor for use of dual sgr
+#define DUAL_SGR_PENALTY_MULT 0.01
 
 const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 };
 
-static int64_t sse_restoration_tile(const YV12_BUFFER_CONFIG *src,
-                                    const YV12_BUFFER_CONFIG *dst,
-                                    const AV1_COMMON *cm, int h_start,
-                                    int width, int v_start, int height,
-                                    int components_pattern) {
-  int64_t filt_err = 0;
-  (void)cm;
-  // Y and UV components cannot be mixed
-  assert(components_pattern == 1 || components_pattern == 2 ||
-         components_pattern == 4 || components_pattern == 6);
-#if CONFIG_HIGHBITDEPTH
-  if (cm->use_highbitdepth) {
-    if ((components_pattern >> AOM_PLANE_Y) & 1) {
-      filt_err +=
-          aom_highbd_get_y_sse_part(src, dst, h_start, width, v_start, height);
-    }
-    if ((components_pattern >> AOM_PLANE_U) & 1) {
-      filt_err +=
-          aom_highbd_get_u_sse_part(src, dst, h_start, width, v_start, height);
-    }
-    if ((components_pattern >> AOM_PLANE_V) & 1) {
-      filt_err +=
-          aom_highbd_get_v_sse_part(src, dst, h_start, width, v_start, height);
-    }
-    return filt_err;
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-  if ((components_pattern >> AOM_PLANE_Y) & 1) {
-    filt_err += aom_get_y_sse_part(src, dst, h_start, width, v_start, height);
-  }
-  if ((components_pattern >> AOM_PLANE_U) & 1) {
-    filt_err += aom_get_u_sse_part(src, dst, h_start, width, v_start, height);
-  }
-  if ((components_pattern >> AOM_PLANE_V) & 1) {
-    filt_err += aom_get_v_sse_part(src, dst, h_start, width, v_start, height);
-  }
-  return filt_err;
+typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a,
+                                      const YV12_BUFFER_CONFIG *b);
+typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
+                                           const YV12_BUFFER_CONFIG *b,
+                                           int hstart, int width, int vstart,
+                                           int height);
+
+#define NUM_EXTRACTORS (3 * (1 + 1))
+
+static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = {
+  aom_get_y_sse_part,        aom_get_u_sse_part,
+  aom_get_v_sse_part,        aom_highbd_get_y_sse_part,
+  aom_highbd_get_u_sse_part, aom_highbd_get_v_sse_part,
+};
+
+static int64_t sse_restoration_unit(const RestorationTileLimits *limits,
+                                    const YV12_BUFFER_CONFIG *src,
+                                    const YV12_BUFFER_CONFIG *dst, int plane,
+                                    int highbd) {
+  return sse_part_extractors[3 * highbd + plane](
+      src, dst, limits->h_start, limits->h_end - limits->h_start,
+      limits->v_start, limits->v_end - limits->v_start);
 }
 
-static int64_t sse_restoration_frame(AV1_COMMON *const cm,
-                                     const YV12_BUFFER_CONFIG *src,
-                                     const YV12_BUFFER_CONFIG *dst,
-                                     int components_pattern) {
-  int64_t filt_err = 0;
-#if CONFIG_HIGHBITDEPTH
-  if (cm->use_highbitdepth) {
-    if ((components_pattern >> AOM_PLANE_Y) & 1) {
-      filt_err += aom_highbd_get_y_sse(src, dst);
-    }
-    if ((components_pattern >> AOM_PLANE_U) & 1) {
-      filt_err += aom_highbd_get_u_sse(src, dst);
-    }
-    if ((components_pattern >> AOM_PLANE_V) & 1) {
-      filt_err += aom_highbd_get_v_sse(src, dst);
-    }
-    return filt_err;
-  }
-#else
-  (void)cm;
-#endif  // CONFIG_HIGHBITDEPTH
-  if ((components_pattern >> AOM_PLANE_Y) & 1) {
-    filt_err = aom_get_y_sse(src, dst);
-  }
-  if ((components_pattern >> AOM_PLANE_U) & 1) {
-    filt_err += aom_get_u_sse(src, dst);
-  }
-  if ((components_pattern >> AOM_PLANE_V) & 1) {
-    filt_err += aom_get_v_sse(src, dst);
-  }
-  return filt_err;
+typedef struct {
+  // The best coefficients for Wiener or Sgrproj restoration
+  WienerInfo wiener;
+  SgrprojInfo sgrproj;
+
+  // The sum of squared errors for this rtype.
+  int64_t sse[RESTORE_SWITCHABLE_TYPES];
+
+  // The rtype to use for this unit given a frame rtype as
+  // index. Indices: WIENER, SGRPROJ, SWITCHABLE.
+  RestorationType best_rtype[RESTORE_TYPES - 1];
+} RestUnitSearchInfo;
+
+typedef struct {
+  const YV12_BUFFER_CONFIG *src;
+  YV12_BUFFER_CONFIG *dst;
+
+  const AV1_COMMON *cm;
+  const MACROBLOCK *x;
+  int plane;
+  int plane_width;
+  int plane_height;
+  RestUnitSearchInfo *rusi;
+
+  // Speed features
+  const SPEED_FEATURES *sf;
+
+  uint8_t *dgd_buffer;
+  int dgd_stride;
+  const uint8_t *src_buffer;
+  int src_stride;
+
+  // sse and bits are initialised by reset_rsc in search_rest_type
+  int64_t sse;
+  int64_t bits;
+  int tile_y0, tile_stripe0;
+
+  // sgrproj and wiener are initialised by rsc_on_tile when starting the first
+  // tile in the frame.
+  SgrprojInfo sgrproj;
+  WienerInfo wiener;
+  AV1PixelRect tile_rect;
+} RestSearchCtxt;
+
+static void rsc_on_tile(int tile_row, int tile_col, void *priv) {
+  (void)tile_col;
+
+  RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+  set_default_sgrproj(&rsc->sgrproj);
+  set_default_wiener(&rsc->wiener);
+
+  rsc->tile_stripe0 =
+      (tile_row == 0) ? 0 : rsc->cm->rst_end_stripe[tile_row - 1];
 }
 
-static int64_t try_restoration_tile(const YV12_BUFFER_CONFIG *src,
-                                    AV1_COMP *const cpi, RestorationInfo *rsi,
-                                    int components_pattern, int partial_frame,
-                                    int tile_idx,
-                                    YV12_BUFFER_CONFIG *dst_frame) {
-  AV1_COMMON *const cm = &cpi->common;
-  int64_t filt_err;
-  int tile_width, tile_height, nhtiles, nvtiles;
-  int ntiles, width, height;
-
-  // Y and UV components cannot be mixed
-  assert(components_pattern == 1 || components_pattern == 2 ||
-         components_pattern == 4 || components_pattern == 6);
-
-  if (components_pattern == 1) {  // Y only
-    width = src->y_crop_width;
-    height = src->y_crop_height;
-  } else {  // Color
-    width = src->uv_crop_width;
-    height = src->uv_crop_height;
-  }
-  ntiles = av1_get_rest_ntiles(
-      width, height, cm->rst_info[components_pattern > 1].restoration_tilesize,
-      &tile_width, &tile_height, &nhtiles, &nvtiles);
-  (void)ntiles;
-
-  av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern,
-                             partial_frame, dst_frame);
-  RestorationTileLimits limits = av1_get_rest_tile_limits(
-      tile_idx, nhtiles, nvtiles, tile_width, tile_height, width,
-#if CONFIG_STRIPED_LOOP_RESTORATION
-      height, components_pattern > 1 ? cm->subsampling_y : 0);
-#else
-      height);
-#endif
-  filt_err = sse_restoration_tile(
-      src, dst_frame, cm, limits.h_start, limits.h_end - limits.h_start,
-      limits.v_start, limits.v_end - limits.v_start, components_pattern);
-
-  return filt_err;
+static void reset_rsc(RestSearchCtxt *rsc) {
+  rsc->sse = 0;
+  rsc->bits = 0;
 }
 
-static int64_t try_restoration_frame(const YV12_BUFFER_CONFIG *src,
-                                     AV1_COMP *const cpi, RestorationInfo *rsi,
-                                     int components_pattern, int partial_frame,
-                                     YV12_BUFFER_CONFIG *dst_frame) {
-  AV1_COMMON *const cm = &cpi->common;
-  int64_t filt_err;
-  av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern,
-                             partial_frame, dst_frame);
-  filt_err = sse_restoration_frame(cm, src, dst_frame, components_pattern);
-  return filt_err;
+static void init_rsc(const YV12_BUFFER_CONFIG *src, const AV1_COMMON *cm,
+                     const MACROBLOCK *x, const SPEED_FEATURES *sf, int plane,
+                     RestUnitSearchInfo *rusi, YV12_BUFFER_CONFIG *dst,
+                     RestSearchCtxt *rsc) {
+  rsc->src = src;
+  rsc->dst = dst;
+  rsc->cm = cm;
+  rsc->x = x;
+  rsc->plane = plane;
+  rsc->rusi = rusi;
+  rsc->sf = sf;
+
+  const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+  const int is_uv = plane != AOM_PLANE_Y;
+  rsc->plane_width = src->crop_widths[is_uv];
+  rsc->plane_height = src->crop_heights[is_uv];
+  rsc->src_buffer = src->buffers[plane];
+  rsc->src_stride = src->strides[is_uv];
+  rsc->dgd_buffer = dgd->buffers[plane];
+  rsc->dgd_stride = dgd->strides[is_uv];
+  rsc->tile_rect = av1_whole_frame_rect(cm, is_uv);
+  assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]);
+  assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]);
+}
+
+static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
+                                    const RestorationTileLimits *limits,
+                                    const AV1PixelRect *tile_rect,
+                                    const RestorationUnitInfo *rui) {
+  const AV1_COMMON *const cm = rsc->cm;
+  const int plane = rsc->plane;
+  const int is_uv = plane > 0;
+  const RestorationInfo *rsi = &cm->rst_info[plane];
+  RestorationLineBuffers rlbs;
+  const int bit_depth = cm->bit_depth;
+  const int highbd = cm->use_highbitdepth;
+
+  const YV12_BUFFER_CONFIG *fts = cm->frame_to_show;
+  // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be
+  // also used in encoder.
+  const int optimized_lr = 0;
+
+  av1_loop_restoration_filter_unit(
+      limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0,
+      is_uv && cm->subsampling_x, is_uv && cm->subsampling_y, highbd, bit_depth,
+      fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
+      rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr);
+
+  return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
 }
 
 static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
                                     int src_stride, const uint8_t *dat8,
                                     int dat_stride, int use_highbitdepth,
-                                    int32_t *flt1, int flt1_stride,
-                                    int32_t *flt2, int flt2_stride, int *xqd) {
+                                    int32_t *flt0, int flt0_stride,
+                                    int32_t *flt1, int flt1_stride, int *xqd,
+                                    const sgr_params_type *params) {
   int i, j;
   int64_t err = 0;
   int xq[2];
-  decode_xq(xqd, xq);
+  decode_xq(xqd, xq, params);
   if (!use_highbitdepth) {
     const uint8_t *src = src8;
     const uint8_t *dat = dat8;
@@ -191,9 +197,9 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
       for (j = 0; j < width; ++j) {
         const int32_t u =
             (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
-        const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u;
-        const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u;
-        const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+        int32_t v = u << SGRPROJ_PRJ_BITS;
+        if (params->r[0] > 0) v += xq[0] * (flt0[i * flt0_stride + j] - u);
+        if (params->r[1] > 0) v += xq[1] * (flt1[i * flt1_stride + j] - u);
         const int32_t e =
             ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
             src[i * src_stride + j];
@@ -203,17 +209,67 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
   } else {
     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j < width; ++j) {
-        const int32_t u =
-            (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
-        const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u;
-        const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u;
-        const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
-        const int32_t e =
-            ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
-            src[i * src_stride + j];
-        err += e * e;
+    const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+    if (params->r[0] > 0 && params->r[1] > 0) {
+      int xq0 = xq[0];
+      int xq1 = xq[1];
+      for (i = 0; i < height; ++i) {
+        for (j = 0; j < width; ++j) {
+          const int32_t d = dat[j];
+          const int32_t s = src[j];
+          const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+          int32_t v0 = flt0[j] - u;
+          int32_t v1 = flt1[j] - u;
+          int32_t v = half;
+          v += xq0 * v0;
+          v += xq1 * v1;
+          const int32_t e =
+              (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+          err += e * e;
+        }
+        dat += dat_stride;
+        flt0 += flt0_stride;
+        flt1 += flt1_stride;
+        src += src_stride;
+      }
+    } else if (params->r[0] > 0 || params->r[1] > 0) {
+      int exq;
+      int32_t *flt;
+      int flt_stride;
+      if (params->r[0] > 0) {
+        exq = xq[0];
+        flt = flt0;
+        flt_stride = flt0_stride;
+      } else {
+        exq = xq[1];
+        flt = flt1;
+        flt_stride = flt1_stride;
+      }
+      for (i = 0; i < height; ++i) {
+        for (j = 0; j < width; ++j) {
+          const int32_t d = dat[j];
+          const int32_t s = src[j];
+          const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+          int32_t v = half;
+          v += exq * (flt[j] - u);
+          const int32_t e =
+              (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+          err += e * e;
+        }
+        dat += dat_stride;
+        flt += flt_stride;
+        src += src_stride;
+      }
+    } else {
+      for (i = 0; i < height; ++i) {
+        for (j = 0; j < width; ++j) {
+          const int32_t d = dat[j];
+          const int32_t s = src[j];
+          const int32_t e = d - s;
+          err += e * e;
+        }
+        dat += dat_stride;
+        src += src_stride;
       }
     }
   }
@@ -223,11 +279,12 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
 #define USE_SGRPROJ_REFINEMENT_SEARCH 1
 static int64_t finer_search_pixel_proj_error(
     const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt1,
-    int flt1_stride, int32_t *flt2, int flt2_stride, int start_step, int *xqd) {
-  int64_t err = get_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                     dat_stride, use_highbitdepth, flt1,
-                                     flt1_stride, flt2, flt2_stride, xqd);
+    const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0,
+    int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd,
+    const sgr_params_type *params) {
+  int64_t err = get_pixel_proj_error(
+      src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
+      flt0_stride, flt1, flt1_stride, xqd, params);
   (void)start_step;
 #if USE_SGRPROJ_REFINEMENT_SEARCH
   int64_t err2;
@@ -235,13 +292,17 @@ static int64_t finer_search_pixel_proj_error(
   int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 };
   for (int s = start_step; s >= 1; s >>= 1) {
     for (int p = 0; p < 2; ++p) {
+      if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) {
+        continue;
+      }
       int skip = 0;
       do {
         if (xqd[p] - s >= tap_min[p]) {
           xqd[p] -= s;
-          err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                      dat_stride, use_highbitdepth, flt1,
-                                      flt1_stride, flt2, flt2_stride, xqd);
+          err2 =
+              get_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                   dat_stride, use_highbitdepth, flt0,
+                                   flt0_stride, flt1, flt1_stride, xqd, params);
           if (err2 > err) {
             xqd[p] += s;
           } else {
@@ -257,9 +318,10 @@ static int64_t finer_search_pixel_proj_error(
       do {
         if (xqd[p] + s <= tap_max[p]) {
           xqd[p] += s;
-          err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                      dat_stride, use_highbitdepth, flt1,
-                                      flt1_stride, flt2, flt2_stride, xqd);
+          err2 =
+              get_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                   dat_stride, use_highbitdepth, flt0,
+                                   flt0_stride, flt1, flt1_stride, xqd, params);
           if (err2 > err) {
             xqd[p] -= s;
           } else {
@@ -277,10 +339,11 @@ static int64_t finer_search_pixel_proj_error(
 }
 
 static void get_proj_subspace(const uint8_t *src8, int width, int height,
-                              int src_stride, uint8_t *dat8, int dat_stride,
-                              int use_highbitdepth, int32_t *flt1,
-                              int flt1_stride, int32_t *flt2, int flt2_stride,
-                              int *xq) {
+                              int src_stride, const uint8_t *dat8,
+                              int dat_stride, int use_highbitdepth,
+                              int32_t *flt0, int flt0_stride, int32_t *flt1,
+                              int flt1_stride, int *xq,
+                              const sgr_params_type *params) {
   int i, j;
   double H[2][2] = { { 0, 0 }, { 0, 0 } };
   double C[2] = { 0, 0 };
@@ -301,8 +364,10 @@ static void get_proj_subspace(const uint8_t *src8, int width, int height,
         const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
         const double s =
             (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
-        const double f1 = (double)flt1[i * flt1_stride + j] - u;
-        const double f2 = (double)flt2[i * flt2_stride + j] - u;
+        const double f1 =
+            (params->r[0] > 0) ? (double)flt0[i * flt0_stride + j] - u : 0;
+        const double f2 =
+            (params->r[1] > 0) ? (double)flt1[i * flt1_stride + j] - u : 0;
         H[0][0] += f1 * f1;
         H[1][1] += f2 * f2;
         H[0][1] += f1 * f2;
@@ -318,8 +383,10 @@ static void get_proj_subspace(const uint8_t *src8, int width, int height,
         const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
         const double s =
             (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
-        const double f1 = (double)flt1[i * flt1_stride + j] - u;
-        const double f2 = (double)flt2[i * flt2_stride + j] - u;
+        const double f1 =
+            (params->r[0] > 0) ? (double)flt0[i * flt0_stride + j] - u : 0;
+        const double f2 =
+            (params->r[1] > 0) ? (double)flt1[i * flt1_stride + j] - u : 0;
         H[0][0] += f1 * f1;
         H[1][1] += f2 * f2;
         H[0][1] += f1 * f2;
@@ -334,99 +401,103 @@ static void get_proj_subspace(const uint8_t *src8, int width, int height,
   H[1][0] = H[0][1];
   C[0] /= size;
   C[1] /= size;
-  Det = (H[0][0] * H[1][1] - H[0][1] * H[1][0]);
-  if (Det < 1e-8) return;  // ill-posed, return default values
-  x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / Det;
-  x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / Det;
-  xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS));
-  xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS));
+  if (params->r[0] == 0) {
+    // H matrix is now only the scalar H[1][1]
+    // C vector is now only the scalar C[1]
+    Det = H[1][1];
+    if (Det < 1e-8) return;  // ill-posed, return default values
+    x[0] = 0;
+    x[1] = C[1] / Det;
+
+    xq[0] = 0;
+    xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS));
+  } else if (params->r[1] == 0) {
+    // H matrix is now only the scalar H[0][0]
+    // C vector is now only the scalar C[0]
+    Det = H[0][0];
+    if (Det < 1e-8) return;  // ill-posed, return default values
+    x[0] = C[0] / Det;
+    x[1] = 0;
+
+    xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS));
+    xq[1] = 0;
+  } else {
+    Det = (H[0][0] * H[1][1] - H[0][1] * H[1][0]);
+    if (Det < 1e-8) return;  // ill-posed, return default values
+    x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / Det;
+    x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / Det;
+
+    xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS));
+    xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS));
+  }
 }
 
-void encode_xq(int *xq, int *xqd) {
-  xqd[0] = xq[0];
-  xqd[0] = clamp(xqd[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
-  xqd[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1];
-  xqd[1] = clamp(xqd[1], SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
+void encode_xq(int *xq, int *xqd, const sgr_params_type *params) {
+  if (params->r[0] == 0) {
+    xqd[0] = 0;
+    xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1,
+                   SGRPROJ_PRJ_MAX1);
+  } else if (params->r[1] == 0) {
+    xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+    xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0], SGRPROJ_PRJ_MIN1,
+                   SGRPROJ_PRJ_MAX1);
+  } else {
+    xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+    xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1,
+                   SGRPROJ_PRJ_MAX1);
+  }
 }
 
-static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
-                                          int dat_stride, const uint8_t *src8,
-                                          int src_stride, int use_highbitdepth,
-                                          int bit_depth, int pu_width,
-                                          int pu_height, int *eps, int *xqd,
-                                          int32_t *rstbuf) {
-  int32_t *flt1 = rstbuf;
-  int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+// Apply the self-guided filter across an entire restoration unit.
+static void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width,
+                      int height, int dat_stride, int use_highbd, int bit_depth,
+                      int pu_width, int pu_height, int32_t *flt0, int32_t *flt1,
+                      int flt_stride) {
+  for (int i = 0; i < height; i += pu_height) {
+    const int h = AOMMIN(pu_height, height - i);
+    int32_t *flt0_row = flt0 + i * flt_stride;
+    int32_t *flt1_row = flt1 + i * flt_stride;
+    const uint8_t *dat8_row = dat8 + i * dat_stride;
+
+    // Iterate over the stripe in blocks of width pu_width
+    for (int j = 0; j < width; j += pu_width) {
+      const int w = AOMMIN(pu_width, width - j);
+      av1_selfguided_restoration(dat8_row + j, w, h, dat_stride, flt0_row + j,
+                                 flt1_row + j, flt_stride, sgr_params_idx,
+                                 bit_depth, use_highbd);
+    }
+  }
+}
+
+static SgrprojInfo search_selfguided_restoration(
+    const uint8_t *dat8, int width, int height, int dat_stride,
+    const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth,
+    int pu_width, int pu_height, int32_t *rstbuf) {
+  int32_t *flt0 = rstbuf;
+  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   int ep, bestep = 0;
-  int64_t err, besterr = -1;
+  int64_t besterr = -1;
   int exqd[2], bestxqd[2] = { 0, 0 };
-  int flt1_stride = ((width + 7) & ~7) + 8;
-  int flt2_stride = ((width + 7) & ~7) + 8;
+  int flt_stride = ((width + 7) & ~7) + 8;
   assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
          pu_width == RESTORATION_PROC_UNIT_SIZE);
   assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
          pu_height == RESTORATION_PROC_UNIT_SIZE);
-#if !CONFIG_HIGHBITDEPTH
-  (void)bit_depth;
-#endif
 
   for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
     int exq[2];
-#if CONFIG_HIGHBITDEPTH
-    if (use_highbitdepth) {
-      uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
-      for (int i = 0; i < height; i += pu_height)
-        for (int j = 0; j < width; j += pu_width) {
-          const int w = AOMMIN(pu_width, width - j);
-          const int h = AOMMIN(pu_height, height - i);
-          uint16_t *dat_p = dat + i * dat_stride + j;
-          int32_t *flt1_p = flt1 + i * flt1_stride + j;
-          int32_t *flt2_p = flt2 + i * flt2_stride + j;
-#if USE_HIGHPASS_IN_SGRPROJ
-          av1_highpass_filter_highbd(dat_p, w, h, dat_stride, flt1_p,
-                                     flt1_stride, sgr_params[ep].corner,
-                                     sgr_params[ep].edge);
-#else
-          av1_selfguided_restoration_highbd(
-              dat_p, w, h, dat_stride, flt1_p, flt1_stride, bit_depth,
-              sgr_params[ep].r1, sgr_params[ep].e1);
-#endif  // USE_HIGHPASS_IN_SGRPROJ
-          av1_selfguided_restoration_highbd(
-              dat_p, w, h, dat_stride, flt2_p, flt2_stride, bit_depth,
-              sgr_params[ep].r2, sgr_params[ep].e2);
-        }
-    } else {
-#endif
-      for (int i = 0; i < height; i += pu_height)
-        for (int j = 0; j < width; j += pu_width) {
-          const int w = AOMMIN(pu_width, width - j);
-          const int h = AOMMIN(pu_height, height - i);
-          uint8_t *dat_p = dat8 + i * dat_stride + j;
-          int32_t *flt1_p = flt1 + i * flt1_stride + j;
-          int32_t *flt2_p = flt2 + i * flt2_stride + j;
-#if USE_HIGHPASS_IN_SGRPROJ
-          av1_highpass_filter(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
-                              sgr_params[ep].corner, sgr_params[ep].edge);
-#else
-        av1_selfguided_restoration(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
-                                   sgr_params[ep].r1, sgr_params[ep].e1);
-#endif  // USE_HIGHPASS_IN_SGRPROJ
-          av1_selfguided_restoration(dat_p, w, h, dat_stride, flt2_p,
-                                     flt2_stride, sgr_params[ep].r2,
-                                     sgr_params[ep].e2);
-        }
-#if CONFIG_HIGHBITDEPTH
-    }
-#endif
+    apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
+              pu_width, pu_height, flt0, flt1, flt_stride);
     aom_clear_system_state();
+    const sgr_params_type *const params = &sgr_params[ep];
     get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
-                      use_highbitdepth, flt1, flt1_stride, flt2, flt2_stride,
-                      exq);
+                      use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq,
+                      params);
     aom_clear_system_state();
-    encode_xq(exq, exqd);
-    err = finer_search_pixel_proj_error(
+    encode_xq(exq, exqd, params);
+    int64_t err = finer_search_pixel_proj_error(
         src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth,
-        flt1, flt1_stride, flt2, flt2_stride, 2, exqd);
+        flt0, flt_stride, flt1, flt_stride, 2, exqd, params);
     if (besterr == -1 || err < besterr) {
       bestep = ep;
       besterr = err;
@@ -434,273 +505,86 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
       bestxqd[1] = exqd[1];
     }
   }
-  *eps = bestep;
-  xqd[0] = bestxqd[0];
-  xqd[1] = bestxqd[1];
+
+  SgrprojInfo ret;
+  ret.ep = bestep;
+  ret.xqd[0] = bestxqd[0];
+  ret.xqd[1] = bestxqd[1];
+  return ret;
 }
 
 static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
                               SgrprojInfo *ref_sgrproj_info) {
   int bits = SGRPROJ_PARAMS_BITS;
-  bits += aom_count_primitive_refsubexpfin(
-      SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
-      ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
-      sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
-  bits += aom_count_primitive_refsubexpfin(
-      SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
-      ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
-      sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+  const sgr_params_type *params = &sgr_params[sgrproj_info->ep];
+  if (params->r[0] > 0)
+    bits += aom_count_primitive_refsubexpfin(
+        SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+        sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+  if (params->r[1] > 0)
+    bits += aom_count_primitive_refsubexpfin(
+        SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+        sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
   return bits;
 }
 
-struct rest_search_ctxt {
-  const YV12_BUFFER_CONFIG *src;
-  AV1_COMP *cpi;
-  uint8_t *dgd_buffer;
-  const uint8_t *src_buffer;
-  int dgd_stride;
-  int src_stride;
-  int partial_frame;
-  RestorationInfo *info;
-  RestorationType *type;
-  int64_t *best_tile_cost;
-  int plane;
-  int plane_width;
-  int plane_height;
-  int nrtiles_x;
-  int nrtiles_y;
-  YV12_BUFFER_CONFIG *dst_frame;
-};
-
-// Fill in ctxt. Returns the number of restoration tiles for this plane
-static INLINE int init_rest_search_ctxt(
-    const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int partial_frame, int plane,
-    RestorationInfo *info, RestorationType *type, int64_t *best_tile_cost,
-    YV12_BUFFER_CONFIG *dst_frame, struct rest_search_ctxt *ctxt) {
-  AV1_COMMON *const cm = &cpi->common;
-  ctxt->src = src;
-  ctxt->cpi = cpi;
-  ctxt->partial_frame = partial_frame;
-  ctxt->info = info;
-  ctxt->type = type;
-  ctxt->best_tile_cost = best_tile_cost;
-  ctxt->plane = plane;
-  ctxt->dst_frame = dst_frame;
-
-  const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
-  if (plane == AOM_PLANE_Y) {
-    ctxt->plane_width = src->y_crop_width;
-    ctxt->plane_height = src->y_crop_height;
-    ctxt->src_buffer = src->y_buffer;
-    ctxt->src_stride = src->y_stride;
-    ctxt->dgd_buffer = dgd->y_buffer;
-    ctxt->dgd_stride = dgd->y_stride;
-    assert(ctxt->plane_width == dgd->y_crop_width);
-    assert(ctxt->plane_height == dgd->y_crop_height);
-    assert(ctxt->plane_width == src->y_crop_width);
-    assert(ctxt->plane_height == src->y_crop_height);
-  } else {
-    ctxt->plane_width = src->uv_crop_width;
-    ctxt->plane_height = src->uv_crop_height;
-    ctxt->src_stride = src->uv_stride;
-    ctxt->dgd_stride = dgd->uv_stride;
-    ctxt->src_buffer = plane == AOM_PLANE_U ? src->u_buffer : src->v_buffer;
-    ctxt->dgd_buffer = plane == AOM_PLANE_U ? dgd->u_buffer : dgd->v_buffer;
-    assert(ctxt->plane_width == dgd->uv_crop_width);
-    assert(ctxt->plane_height == dgd->uv_crop_height);
-  }
-
-  return av1_get_rest_ntiles(ctxt->plane_width, ctxt->plane_height,
-                             cm->rst_info[plane].restoration_tilesize, NULL,
-                             NULL, &ctxt->nrtiles_x, &ctxt->nrtiles_y);
-}
-
-typedef void (*rtile_visitor_t)(const struct rest_search_ctxt *search_ctxt,
-                                int rtile_idx,
-                                const RestorationTileLimits *limits, void *arg);
-
-static void foreach_rtile_in_tile(const struct rest_search_ctxt *ctxt,
-                                  int tile_row, int tile_col,
-                                  rtile_visitor_t fun, void *arg) {
-  const AV1_COMMON *const cm = &ctxt->cpi->common;
-  const RestorationInfo *rsi = ctxt->cpi->rst_search;
-  TileInfo tile_info;
-
-  av1_tile_set_row(&tile_info, cm, tile_row);
-  av1_tile_set_col(&tile_info, cm, tile_col);
-
-  int tile_col_start = tile_info.mi_col_start * MI_SIZE;
-  int tile_col_end = tile_info.mi_col_end * MI_SIZE;
-  int tile_row_start = tile_info.mi_row_start * MI_SIZE;
-  int tile_row_end = tile_info.mi_row_end * MI_SIZE;
-  if (ctxt->plane > 0) {
-    tile_col_start = ROUND_POWER_OF_TWO(tile_col_start, cm->subsampling_x);
-    tile_col_end = ROUND_POWER_OF_TWO(tile_col_end, cm->subsampling_x);
-    tile_row_start = ROUND_POWER_OF_TWO(tile_row_start, cm->subsampling_y);
-    tile_row_end = ROUND_POWER_OF_TWO(tile_row_end, cm->subsampling_y);
-  }
+static void search_sgrproj(const RestorationTileLimits *limits,
+                           const AV1PixelRect *tile, int rest_unit_idx,
+                           void *priv, int32_t *tmpbuf,
+                           RestorationLineBuffers *rlbs) {
+  (void)rlbs;
+  RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+  RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
-#if CONFIG_FRAME_SUPERRES
-  // If upscaling is enabled, the tile limits need scaling to match the
-  // upscaled frame where the restoration tiles live. To do this, scale up the
-  // top-left and bottom-right of the tile.
-  if (!av1_superres_unscaled(cm)) {
-    av1_calculate_unscaled_superres_size(&tile_col_start, &tile_row_start,
-                                         cm->superres_scale_denominator);
-    av1_calculate_unscaled_superres_size(&tile_col_end, &tile_row_end,
-                                         cm->superres_scale_denominator);
-    // Make sure we don't fall off the bottom-right of the frame.
-    tile_col_end = AOMMIN(tile_col_end, ctxt->plane_width);
-    tile_row_end = AOMMIN(tile_row_end, ctxt->plane_height);
-  }
-#endif  // CONFIG_FRAME_SUPERRES
-
-  const int rtile_size = rsi->restoration_tilesize;
-  const int rtile_col0 = (tile_col_start + rtile_size - 1) / rtile_size;
-  const int rtile_col1 =
-      AOMMIN((tile_col_end + rtile_size - 1) / rtile_size, ctxt->nrtiles_x);
-  const int rtile_row0 = (tile_row_start + rtile_size - 1) / rtile_size;
-  const int rtile_row1 =
-      AOMMIN((tile_row_end + rtile_size - 1) / rtile_size, ctxt->nrtiles_y);
-
-  const int rtile_width = AOMMIN(tile_col_end - tile_col_start, rtile_size);
-  const int rtile_height = AOMMIN(tile_row_end - tile_row_start, rtile_size);
-
-  for (int rtile_row = rtile_row0; rtile_row < rtile_row1; ++rtile_row) {
-    for (int rtile_col = rtile_col0; rtile_col < rtile_col1; ++rtile_col) {
-      const int rtile_idx = rtile_row * ctxt->nrtiles_x + rtile_col;
-      RestorationTileLimits limits = av1_get_rest_tile_limits(
-          rtile_idx, ctxt->nrtiles_x, ctxt->nrtiles_y, rtile_width,
-          rtile_height, ctxt->plane_width,
-#if CONFIG_STRIPED_LOOP_RESTORATION
-          ctxt->plane_height, ctxt->plane > 0 ? cm->subsampling_y : 0);
-#else
-          ctxt->plane_height);
-#endif
-      fun(ctxt, rtile_idx, &limits, arg);
-    }
-  }
-}
+  const MACROBLOCK *const x = rsc->x;
+  const AV1_COMMON *const cm = rsc->cm;
+  const int highbd = cm->use_highbitdepth;
+  const int bit_depth = cm->bit_depth;
 
-static void search_sgrproj_for_rtile(const struct rest_search_ctxt *ctxt,
-                                     int rtile_idx,
-                                     const RestorationTileLimits *limits,
-                                     void *arg) {
-  const MACROBLOCK *const x = &ctxt->cpi->td.mb;
-  const AV1_COMMON *const cm = &ctxt->cpi->common;
-  RestorationInfo *rsi = ctxt->cpi->rst_search;
-  SgrprojInfo *sgrproj_info = ctxt->info->sgrproj_info;
-
-  SgrprojInfo *ref_sgrproj_info = (SgrprojInfo *)arg;
-
-  int64_t err =
-      sse_restoration_tile(ctxt->src, cm->frame_to_show, cm, limits->h_start,
-                           limits->h_end - limits->h_start, limits->v_start,
-                           limits->v_end - limits->v_start, (1 << ctxt->plane));
-  // #bits when a tile is not restored
-  int bits = av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 0);
-  double cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err);
-  ctxt->best_tile_cost[rtile_idx] = INT64_MAX;
-
-  RestorationInfo *plane_rsi = &rsi[ctxt->plane];
-  SgrprojInfo *rtile_sgrproj_info = &plane_rsi->sgrproj_info[rtile_idx];
   uint8_t *dgd_start =
-      ctxt->dgd_buffer + limits->v_start * ctxt->dgd_stride + limits->h_start;
+      rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start;
   const uint8_t *src_start =
-      ctxt->src_buffer + limits->v_start * ctxt->src_stride + limits->h_start;
+      rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start;
 
-  search_selfguided_restoration(
-      dgd_start, limits->h_end - limits->h_start,
-      limits->v_end - limits->v_start, ctxt->dgd_stride, src_start,
-      ctxt->src_stride,
-#if CONFIG_HIGHBITDEPTH
-      cm->use_highbitdepth, cm->bit_depth,
-#else
-      0, 8,
-#endif  // CONFIG_HIGHBITDEPTH
-      rsi[ctxt->plane].procunit_width, rsi[ctxt->plane].procunit_height,
-      &rtile_sgrproj_info->ep, rtile_sgrproj_info->xqd,
-      cm->rst_internal.tmpbuf);
-  plane_rsi->restoration_type[rtile_idx] = RESTORE_SGRPROJ;
-  err = try_restoration_tile(ctxt->src, ctxt->cpi, rsi, (1 << ctxt->plane),
-                             ctxt->partial_frame, rtile_idx, ctxt->dst_frame);
-  bits =
-      count_sgrproj_bits(&plane_rsi->sgrproj_info[rtile_idx], ref_sgrproj_info)
-      << AV1_PROB_COST_SHIFT;
-  bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 1);
-  double cost_sgrproj = RDCOST_DBL(x->rdmult, (bits >> 4), err);
-  if (cost_sgrproj >= cost_norestore) {
-    ctxt->type[rtile_idx] = RESTORE_NONE;
-  } else {
-    ctxt->type[rtile_idx] = RESTORE_SGRPROJ;
-    *ref_sgrproj_info = sgrproj_info[rtile_idx] =
-        plane_rsi->sgrproj_info[rtile_idx];
-    ctxt->best_tile_cost[rtile_idx] = err;
-  }
-  plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE;
-}
+  const int is_uv = rsc->plane > 0;
+  const int ss_x = is_uv && cm->subsampling_x;
+  const int ss_y = is_uv && cm->subsampling_y;
+  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
+  const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
 
-static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
-                             int partial_frame, int plane,
-                             RestorationInfo *info, RestorationType *type,
-                             int64_t *best_tile_cost,
-                             YV12_BUFFER_CONFIG *dst_frame) {
-  struct rest_search_ctxt ctxt;
-  const int nrtiles =
-      init_rest_search_ctxt(src, cpi, partial_frame, plane, info, type,
-                            best_tile_cost, dst_frame, &ctxt);
-
-  RestorationInfo *plane_rsi = &cpi->rst_search[plane];
-  plane_rsi->frame_restoration_type = RESTORE_SGRPROJ;
-  for (int rtile_idx = 0; rtile_idx < nrtiles; ++rtile_idx) {
-    plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE;
-  }
-
-  // Compute best Sgrproj filters for each rtile, one (encoder/decoder)
-  // tile at a time.
-  const AV1_COMMON *const cm = &cpi->common;
-#if CONFIG_HIGHBITDEPTH
-  if (cm->use_highbitdepth)
-    extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
-                        ctxt.plane_height, ctxt.dgd_stride, SGRPROJ_BORDER_HORZ,
-                        SGRPROJ_BORDER_VERT);
-  else
-#endif
-    extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
-                 ctxt.dgd_stride, SGRPROJ_BORDER_HORZ, SGRPROJ_BORDER_VERT);
-
-  for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
-    for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
-      SgrprojInfo ref_sgrproj_info;
-      set_default_sgrproj(&ref_sgrproj_info);
-      foreach_rtile_in_tile(&ctxt, tile_row, tile_col, search_sgrproj_for_rtile,
-                            &ref_sgrproj_info);
-    }
-  }
-
-  // Cost for Sgrproj filtering
-  SgrprojInfo ref_sgrproj_info;
-  set_default_sgrproj(&ref_sgrproj_info);
-  SgrprojInfo *sgrproj_info = info->sgrproj_info;
-
-  int bits = frame_level_restore_bits[plane_rsi->frame_restoration_type]
-             << AV1_PROB_COST_SHIFT;
-  for (int rtile_idx = 0; rtile_idx < nrtiles; ++rtile_idx) {
-    bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB,
-                         type[rtile_idx] != RESTORE_NONE);
-    plane_rsi->sgrproj_info[rtile_idx] = sgrproj_info[rtile_idx];
-    if (type[rtile_idx] == RESTORE_SGRPROJ) {
-      bits += count_sgrproj_bits(&plane_rsi->sgrproj_info[rtile_idx],
-                                 &ref_sgrproj_info)
-              << AV1_PROB_COST_SHIFT;
-      ref_sgrproj_info = plane_rsi->sgrproj_info[rtile_idx];
-    }
-    plane_rsi->restoration_type[rtile_idx] = type[rtile_idx];
-  }
-  int64_t err = try_restoration_frame(src, cpi, cpi->rst_search, (1 << plane),
-                                      partial_frame, dst_frame);
-  double cost_sgrproj = RDCOST_DBL(cpi->td.mb.rdmult, (bits >> 4), err);
-  return cost_sgrproj;
+  rusi->sgrproj = search_selfguided_restoration(
+      dgd_start, limits->h_end - limits->h_start,
+      limits->v_end - limits->v_start, rsc->dgd_stride, src_start,
+      rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height,
+      tmpbuf);
+
+  RestorationUnitInfo rui;
+  rui.restoration_type = RESTORE_SGRPROJ;
+  rui.sgrproj_info = rusi->sgrproj;
+
+  rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui);
+
+  const int64_t bits_none = x->sgrproj_restore_cost[0];
+  const int64_t bits_sgr = x->sgrproj_restore_cost[1] +
+                           (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj)
+                            << AV1_PROB_COST_SHIFT);
+
+  double cost_none =
+      RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]);
+  double cost_sgr =
+      RDCOST_DBL(x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ]);
+  if (rusi->sgrproj.ep < 10)
+    cost_sgr *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level);
+
+  RestorationType rtype =
+      (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE;
+  rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype;
+
+  rsc->sse += rusi->sse[rtype];
+  rsc->bits += (cost_sgr < cost_none) ? bits_sgr : bits_none;
+  if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj;
 }
 
 static double find_average(const uint8_t *src, int h_start, int h_end,
@@ -758,7 +642,6 @@ static void compute_stats(int wiener_win, const uint8_t *dgd,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 static double find_average_highbd(const uint16_t *src, int h_start, int h_end,
                                   int v_start, int v_end, int stride) {
   uint64_t sum = 0;
@@ -771,10 +654,10 @@ static double find_average_highbd(const uint16_t *src, int h_start, int h_end,
   return avg;
 }
 
-static void compute_stats_highbd(int wiener_win, const uint8_t *dgd8,
-                                 const uint8_t *src8, int h_start, int h_end,
-                                 int v_start, int v_end, int dgd_stride,
-                                 int src_stride, double *M, double *H) {
+static AOM_FORCE_INLINE void compute_stats_highbd(
+    int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start,
+    int h_end, int v_start, int v_end, int dgd_stride, int src_stride,
+    double *M, double *H) {
   int i, j, k, l;
   double Y[WIENER_WIN2];
   const int wiener_win2 = wiener_win * wiener_win;
@@ -798,13 +681,15 @@ static void compute_stats_highbd(int wiener_win, const uint8_t *dgd8,
       }
       assert(idx == wiener_win2);
       for (k = 0; k < wiener_win2; ++k) {
-        M[k] += Y[k] * X;
-        H[k * wiener_win2 + k] += Y[k] * Y[k];
+        double Yk = Y[k];
+        M[k] += Yk * X;
+        double *H2 = &H[k * wiener_win2];
+        H2[k] += Yk * Yk;
         for (l = k + 1; l < wiener_win2; ++l) {
           // H is a symmetric matrix, so we only need to fill out the upper
           // triangle here. We can copy it down to the lower triangle outside
           // the (i, j) loops.
-          H[k * wiener_win2 + l] += Y[k] * Y[l];
+          H2[l] += Yk * Y[l];
         }
       }
     }
@@ -815,7 +700,6 @@ static void compute_stats_highbd(int wiener_win, const uint8_t *dgd8,
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 static INLINE int wrap_index(int i, int wiener_win) {
   const int wiener_halfwin1 = (wiener_win >> 1) + 1;
@@ -1059,37 +943,37 @@ static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info,
 }
 
 #define USE_WIENER_REFINEMENT_SEARCH 1
-static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
-                                        AV1_COMP *cpi, RestorationInfo *rsi,
-                                        int start_step, int plane,
-                                        int wiener_win, int tile_idx,
-                                        int partial_frame,
-                                        YV12_BUFFER_CONFIG *dst_frame) {
+static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
+                                        const RestorationTileLimits *limits,
+                                        const AV1PixelRect *tile,
+                                        RestorationUnitInfo *rui,
+                                        int wiener_win) {
   const int plane_off = (WIENER_WIN - wiener_win) >> 1;
-  int64_t err = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
-                                     tile_idx, dst_frame);
-  (void)start_step;
+  int64_t err = try_restoration_unit(rsc, limits, tile, rui);
 #if USE_WIENER_REFINEMENT_SEARCH
   int64_t err2;
   int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV,
                     WIENER_FILT_TAP2_MINV };
   int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV,
                     WIENER_FILT_TAP2_MAXV };
+
+  WienerInfo *plane_wiener = &rui->wiener_info;
+
   // printf("err  pre = %"PRId64"\n", err);
+  const int start_step = 4;
   for (int s = start_step; s >= 1; s >>= 1) {
     for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
       int skip = 0;
       do {
-        if (rsi[plane].wiener_info[tile_idx].hfilter[p] - s >= tap_min[p]) {
-          rsi[plane].wiener_info[tile_idx].hfilter[p] -= s;
-          rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] -= s;
-          rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] += 2 * s;
-          err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
-                                      tile_idx, dst_frame);
+        if (plane_wiener->hfilter[p] - s >= tap_min[p]) {
+          plane_wiener->hfilter[p] -= s;
+          plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+          plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
+          err2 = try_restoration_unit(rsc, limits, tile, rui);
           if (err2 > err) {
-            rsi[plane].wiener_info[tile_idx].hfilter[p] += s;
-            rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] += s;
-            rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] -= 2 * s;
+            plane_wiener->hfilter[p] += s;
+            plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+            plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
           } else {
             err = err2;
             skip = 1;
@@ -1101,16 +985,15 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
       } while (1);
       if (skip) break;
       do {
-        if (rsi[plane].wiener_info[tile_idx].hfilter[p] + s <= tap_max[p]) {
-          rsi[plane].wiener_info[tile_idx].hfilter[p] += s;
-          rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] += s;
-          rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] -= 2 * s;
-          err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
-                                      tile_idx, dst_frame);
+        if (plane_wiener->hfilter[p] + s <= tap_max[p]) {
+          plane_wiener->hfilter[p] += s;
+          plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+          plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
+          err2 = try_restoration_unit(rsc, limits, tile, rui);
           if (err2 > err) {
-            rsi[plane].wiener_info[tile_idx].hfilter[p] -= s;
-            rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] -= s;
-            rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] += 2 * s;
+            plane_wiener->hfilter[p] -= s;
+            plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+            plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
           } else {
             err = err2;
             // At the highest step size continue moving in the same direction
@@ -1123,16 +1006,15 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
     for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
       int skip = 0;
       do {
-        if (rsi[plane].wiener_info[tile_idx].vfilter[p] - s >= tap_min[p]) {
-          rsi[plane].wiener_info[tile_idx].vfilter[p] -= s;
-          rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] -= s;
-          rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] += 2 * s;
-          err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
-                                      tile_idx, dst_frame);
+        if (plane_wiener->vfilter[p] - s >= tap_min[p]) {
+          plane_wiener->vfilter[p] -= s;
+          plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+          plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
+          err2 = try_restoration_unit(rsc, limits, tile, rui);
           if (err2 > err) {
-            rsi[plane].wiener_info[tile_idx].vfilter[p] += s;
-            rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] += s;
-            rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] -= 2 * s;
+            plane_wiener->vfilter[p] += s;
+            plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+            plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
           } else {
             err = err2;
             skip = 1;
@@ -1144,16 +1026,15 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
       } while (1);
       if (skip) break;
       do {
-        if (rsi[plane].wiener_info[tile_idx].vfilter[p] + s <= tap_max[p]) {
-          rsi[plane].wiener_info[tile_idx].vfilter[p] += s;
-          rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] += s;
-          rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] -= 2 * s;
-          err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
-                                      tile_idx, dst_frame);
+        if (plane_wiener->vfilter[p] + s <= tap_max[p]) {
+          plane_wiener->vfilter[p] += s;
+          plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+          plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
+          err2 = try_restoration_unit(rsc, limits, tile, rui);
           if (err2 > err) {
-            rsi[plane].wiener_info[tile_idx].vfilter[p] -= s;
-            rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] -= s;
-            rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] += 2 * s;
+            plane_wiener->vfilter[p] -= s;
+            plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+            plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
           } else {
             err = err2;
             // At the highest step size continue moving in the same direction
@@ -1169,372 +1050,264 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
   return err;
 }
 
-static void search_wiener_for_rtile(const struct rest_search_ctxt *ctxt,
-                                    int rtile_idx,
-                                    const RestorationTileLimits *limits,
-                                    void *arg) {
-  const MACROBLOCK *const x = &ctxt->cpi->td.mb;
-  const AV1_COMMON *const cm = &ctxt->cpi->common;
-  RestorationInfo *rsi = ctxt->cpi->rst_search;
+static void search_wiener(const RestorationTileLimits *limits,
+                          const AV1PixelRect *tile_rect, int rest_unit_idx,
+                          void *priv, int32_t *tmpbuf,
+                          RestorationLineBuffers *rlbs) {
+  (void)tmpbuf;
+  (void)rlbs;
+  RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+  RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
   const int wiener_win =
-      (ctxt->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+      (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
 
   double M[WIENER_WIN2];
   double H[WIENER_WIN2 * WIENER_WIN2];
   double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN];
 
-  WienerInfo *ref_wiener_info = (WienerInfo *)arg;
-
-  int64_t err =
-      sse_restoration_tile(ctxt->src, cm->frame_to_show, cm, limits->h_start,
-                           limits->h_end - limits->h_start, limits->v_start,
-                           limits->v_end - limits->v_start, (1 << ctxt->plane));
-  // #bits when a tile is not restored
-  int bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0);
-  double cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err);
-  ctxt->best_tile_cost[rtile_idx] = INT64_MAX;
-
-#if CONFIG_HIGHBITDEPTH
-  if (cm->use_highbitdepth)
-    compute_stats_highbd(wiener_win, ctxt->dgd_buffer, ctxt->src_buffer,
+  const AV1_COMMON *const cm = rsc->cm;
+  if (cm->use_highbitdepth) {
+    compute_stats_highbd(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
                          limits->h_start, limits->h_end, limits->v_start,
-                         limits->v_end, ctxt->dgd_stride, ctxt->src_stride, M,
-                         H);
-  else
-#endif  // CONFIG_HIGHBITDEPTH
-    compute_stats(wiener_win, ctxt->dgd_buffer, ctxt->src_buffer,
-                  limits->h_start, limits->h_end, limits->v_start,
-                  limits->v_end, ctxt->dgd_stride, ctxt->src_stride, M, H);
+                         limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
+  } else {
+    compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start,
+                  limits->h_end, limits->v_start, limits->v_end,
+                  rsc->dgd_stride, rsc->src_stride, M, H);
+  }
 
-  ctxt->type[rtile_idx] = RESTORE_WIENER;
+  const MACROBLOCK *const x = rsc->x;
+  const int64_t bits_none = x->wiener_restore_cost[0];
 
   if (!wiener_decompose_sep_sym(wiener_win, M, H, vfilterd, hfilterd)) {
-    ctxt->type[rtile_idx] = RESTORE_NONE;
+    rsc->bits += bits_none;
+    rsc->sse += rusi->sse[RESTORE_NONE];
+    rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+    rusi->sse[RESTORE_WIENER] = INT64_MAX;
     return;
   }
 
-  RestorationInfo *plane_rsi = &rsi[ctxt->plane];
-  WienerInfo *rtile_wiener_info = &plane_rsi->wiener_info[rtile_idx];
-  quantize_sym_filter(wiener_win, vfilterd, rtile_wiener_info->vfilter);
-  quantize_sym_filter(wiener_win, hfilterd, rtile_wiener_info->hfilter);
+  RestorationUnitInfo rui;
+  memset(&rui, 0, sizeof(rui));
+  rui.restoration_type = RESTORE_WIENER;
+  quantize_sym_filter(wiener_win, vfilterd, rui.wiener_info.vfilter);
+  quantize_sym_filter(wiener_win, hfilterd, rui.wiener_info.hfilter);
 
   // Filter score computes the value of the function x'*A*x - x'*b for the
   // learned filter and compares it against identity filer. If there is no
   // reduction in the function, the filter is reverted back to identity
-  double score = compute_score(wiener_win, M, H, rtile_wiener_info->vfilter,
-                               rtile_wiener_info->hfilter);
-  if (score > 0.0) {
-    ctxt->type[rtile_idx] = RESTORE_NONE;
+  if (compute_score(wiener_win, M, H, rui.wiener_info.vfilter,
+                    rui.wiener_info.hfilter) > 0) {
+    rsc->bits += bits_none;
+    rsc->sse += rusi->sse[RESTORE_NONE];
+    rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+    rusi->sse[RESTORE_WIENER] = INT64_MAX;
     return;
   }
+
   aom_clear_system_state();
 
-  plane_rsi->restoration_type[rtile_idx] = RESTORE_WIENER;
-  err = finer_tile_search_wiener(ctxt->src, ctxt->cpi, rsi, 4, ctxt->plane,
-                                 wiener_win, rtile_idx, ctxt->partial_frame,
-                                 ctxt->dst_frame);
+  rusi->sse[RESTORE_WIENER] =
+      finer_tile_search_wiener(rsc, limits, tile_rect, &rui, wiener_win);
+  rusi->wiener = rui.wiener_info;
+
   if (wiener_win != WIENER_WIN) {
-    assert(rtile_wiener_info->vfilter[0] == 0 &&
-           rtile_wiener_info->vfilter[WIENER_WIN - 1] == 0);
-    assert(rtile_wiener_info->hfilter[0] == 0 &&
-           rtile_wiener_info->hfilter[WIENER_WIN - 1] == 0);
-  }
-  bits = count_wiener_bits(wiener_win, rtile_wiener_info, ref_wiener_info)
-         << AV1_PROB_COST_SHIFT;
-  bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1);
-  double cost_wiener = RDCOST_DBL(x->rdmult, (bits >> 4), err);
-  if (cost_wiener >= cost_norestore) {
-    ctxt->type[rtile_idx] = RESTORE_NONE;
-  } else {
-    ctxt->type[rtile_idx] = RESTORE_WIENER;
-    *ref_wiener_info = ctxt->info->wiener_info[rtile_idx] = *rtile_wiener_info;
-    ctxt->best_tile_cost[rtile_idx] = err;
+    assert(rui.wiener_info.vfilter[0] == 0 &&
+           rui.wiener_info.vfilter[WIENER_WIN - 1] == 0);
+    assert(rui.wiener_info.hfilter[0] == 0 &&
+           rui.wiener_info.hfilter[WIENER_WIN - 1] == 0);
   }
-  plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE;
-}
 
-static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
-                            int partial_frame, int plane, RestorationInfo *info,
-                            RestorationType *type, int64_t *best_tile_cost,
-                            YV12_BUFFER_CONFIG *dst_frame) {
-  struct rest_search_ctxt ctxt;
-  const int nrtiles =
-      init_rest_search_ctxt(src, cpi, partial_frame, plane, info, type,
-                            best_tile_cost, dst_frame, &ctxt);
-
-  RestorationInfo *plane_rsi = &cpi->rst_search[plane];
-  plane_rsi->frame_restoration_type = RESTORE_WIENER;
-  for (int tile_idx = 0; tile_idx < nrtiles; ++tile_idx) {
-    plane_rsi->restoration_type[tile_idx] = RESTORE_NONE;
-  }
+  const int64_t bits_wiener =
+      x->wiener_restore_cost[1] +
+      (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener)
+       << AV1_PROB_COST_SHIFT);
 
-  AV1_COMMON *const cm = &cpi->common;
-// Construct a (WIENER_HALFWIN)-pixel border around the frame
-// Note use this border to gather stats even though the actual filter
-// may use less border on the top/bottom of a processing unit.
-#if CONFIG_HIGHBITDEPTH
-  if (cm->use_highbitdepth)
-    extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
-                        ctxt.plane_height, ctxt.dgd_stride, WIENER_HALFWIN,
-                        WIENER_HALFWIN);
-  else
-#endif
-    extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
-                 ctxt.dgd_stride, WIENER_HALFWIN, WIENER_HALFWIN);
-
-  // Compute best Wiener filters for each rtile, one (encoder/decoder)
-  // tile at a time.
-  for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
-    for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
-      WienerInfo ref_wiener_info;
-      set_default_wiener(&ref_wiener_info);
-
-      foreach_rtile_in_tile(&ctxt, tile_row, tile_col, search_wiener_for_rtile,
-                            &ref_wiener_info);
-    }
-  }
+  double cost_none =
+      RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]);
+  double cost_wiener =
+      RDCOST_DBL(x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER]);
 
-  // cost for Wiener filtering
-  WienerInfo ref_wiener_info;
-  set_default_wiener(&ref_wiener_info);
-  int bits = frame_level_restore_bits[plane_rsi->frame_restoration_type]
-             << AV1_PROB_COST_SHIFT;
-  WienerInfo *wiener_info = info->wiener_info;
-  const int wiener_win =
-      (plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
-
-  for (int tile_idx = 0; tile_idx < nrtiles; ++tile_idx) {
-    bits +=
-        av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE);
-    plane_rsi->wiener_info[tile_idx] = wiener_info[tile_idx];
-
-    if (type[tile_idx] == RESTORE_WIENER) {
-      bits += count_wiener_bits(wiener_win, &plane_rsi->wiener_info[tile_idx],
-                                &ref_wiener_info)
-              << AV1_PROB_COST_SHIFT;
-      ref_wiener_info = plane_rsi->wiener_info[tile_idx];
-    }
-    plane_rsi->restoration_type[tile_idx] = type[tile_idx];
-  }
-  int64_t err = try_restoration_frame(src, cpi, cpi->rst_search, 1 << plane,
-                                      partial_frame, dst_frame);
-  double cost_wiener = RDCOST_DBL(cpi->td.mb.rdmult, (bits >> 4), err);
+  RestorationType rtype =
+      (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE;
+  rusi->best_rtype[RESTORE_WIENER - 1] = rtype;
 
-  return cost_wiener;
+  rsc->sse += rusi->sse[rtype];
+  rsc->bits += (cost_wiener < cost_none) ? bits_wiener : bits_none;
+  if (cost_wiener < cost_none) rsc->wiener = rusi->wiener;
 }
 
-static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
-                               int partial_frame, int plane,
-                               RestorationInfo *info, RestorationType *type,
-                               int64_t *best_tile_cost,
-                               YV12_BUFFER_CONFIG *dst_frame) {
-  int64_t err;
-  double cost_norestore;
-  int bits;
-  MACROBLOCK *x = &cpi->td.mb;
-  AV1_COMMON *const cm = &cpi->common;
-  int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
-  int width, height;
-  if (plane == AOM_PLANE_Y) {
-    width = src->y_crop_width;
-    height = src->y_crop_height;
-  } else {
-    width = src->uv_crop_width;
-    height = src->uv_crop_height;
-  }
-  const int ntiles = av1_get_rest_ntiles(
-      width, height, cm->rst_info[plane].restoration_tilesize, &tile_width,
-      &tile_height, &nhtiles, &nvtiles);
-  (void)info;
-  (void)dst_frame;
-  (void)partial_frame;
-
-  info->frame_restoration_type = RESTORE_NONE;
-  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
-    RestorationTileLimits limits = av1_get_rest_tile_limits(
-        tile_idx, nhtiles, nvtiles, tile_width, tile_height, width,
-#if CONFIG_STRIPED_LOOP_RESTORATION
-        height, plane != AOM_PLANE_Y ? cm->subsampling_y : 0);
-#else
-        height);
-#endif
-    err = sse_restoration_tile(src, cm->frame_to_show, cm, limits.h_start,
-                               limits.h_end - limits.h_start, limits.v_start,
-                               limits.v_end - limits.v_start, 1 << plane);
-    type[tile_idx] = RESTORE_NONE;
-    best_tile_cost[tile_idx] = err;
-  }
-  // RD cost associated with no restoration
-  err = sse_restoration_frame(cm, src, cm->frame_to_show, (1 << plane));
-  bits = frame_level_restore_bits[RESTORE_NONE] << AV1_PROB_COST_SHIFT;
-  cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err);
-  return cost_norestore;
-}
+static void search_norestore(const RestorationTileLimits *limits,
+                             const AV1PixelRect *tile_rect, int rest_unit_idx,
+                             void *priv, int32_t *tmpbuf,
+                             RestorationLineBuffers *rlbs) {
+  (void)tile_rect;
+  (void)tmpbuf;
+  (void)rlbs;
 
-struct switchable_rest_search_ctxt {
-  SgrprojInfo sgrproj_info;
-  WienerInfo wiener_info;
-  RestorationType *const *restore_types;
-  int64_t *const *tile_cost;
-  double cost_switchable;
-};
+  RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+  RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
-static void search_switchable_for_rtile(const struct rest_search_ctxt *ctxt,
-                                        int rtile_idx,
-                                        const RestorationTileLimits *limits,
-                                        void *arg) {
-  const MACROBLOCK *x = &ctxt->cpi->td.mb;
-  RestorationInfo *rsi = &ctxt->cpi->common.rst_info[ctxt->plane];
-  struct switchable_rest_search_ctxt *swctxt =
-      (struct switchable_rest_search_ctxt *)arg;
+  const int highbd = rsc->cm->use_highbitdepth;
+  rusi->sse[RESTORE_NONE] = sse_restoration_unit(
+      limits, rsc->src, rsc->cm->frame_to_show, rsc->plane, highbd);
+
+  rsc->sse += rusi->sse[RESTORE_NONE];
+}
 
+static void search_switchable(const RestorationTileLimits *limits,
+                              const AV1PixelRect *tile_rect, int rest_unit_idx,
+                              void *priv, int32_t *tmpbuf,
+                              RestorationLineBuffers *rlbs) {
   (void)limits;
+  (void)tile_rect;
+  (void)tmpbuf;
+  (void)rlbs;
+  RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+  RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
-  double best_cost =
-      RDCOST_DBL(x->rdmult, (x->switchable_restore_cost[RESTORE_NONE] >> 4),
-                 swctxt->tile_cost[RESTORE_NONE][rtile_idx]);
-  rsi->restoration_type[rtile_idx] = RESTORE_NONE;
-  for (RestorationType r = 1; r < RESTORE_SWITCHABLE_TYPES; r++) {
-    if (force_restore_type != RESTORE_TYPES)
-      if (r != force_restore_type) continue;
-    int tilebits = 0;
-    if (swctxt->restore_types[r][rtile_idx] != r) continue;
-    if (r == RESTORE_WIENER)
-      tilebits += count_wiener_bits(
-          (ctxt->plane == AOM_PLANE_Y ? WIENER_WIN : WIENER_WIN - 2),
-          &rsi->wiener_info[rtile_idx], &swctxt->wiener_info);
-    else if (r == RESTORE_SGRPROJ)
-      tilebits += count_sgrproj_bits(&rsi->sgrproj_info[rtile_idx],
-                                     &swctxt->sgrproj_info);
-    tilebits <<= AV1_PROB_COST_SHIFT;
-    tilebits += x->switchable_restore_cost[r];
-    double cost =
-        RDCOST_DBL(x->rdmult, tilebits >> 4, swctxt->tile_cost[r][rtile_idx]);
-
-    if (cost < best_cost) {
-      rsi->restoration_type[rtile_idx] = r;
-      best_cost = cost;
+  const MACROBLOCK *const x = rsc->x;
+
+  const int wiener_win =
+      (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+  double best_cost = 0;
+  int64_t best_bits = 0;
+  RestorationType best_rtype = RESTORE_NONE;
+
+  for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
+    // Check for the condition that wiener or sgrproj search could not
+    // find a solution or the solution was worse than RESTORE_NONE.
+    // In either case the best_rtype will be set as RESTORE_NONE. These
+    // should be skipped from the test below.
+    if (r > RESTORE_NONE) {
+      if (rusi->best_rtype[r - 1] == RESTORE_NONE) continue;
     }
-  }
-  if (rsi->restoration_type[rtile_idx] == RESTORE_WIENER)
-    swctxt->wiener_info = rsi->wiener_info[rtile_idx];
-  else if (rsi->restoration_type[rtile_idx] == RESTORE_SGRPROJ)
-    swctxt->sgrproj_info = rsi->sgrproj_info[rtile_idx];
-  if (force_restore_type != RESTORE_TYPES)
-    assert(rsi->restoration_type[rtile_idx] == force_restore_type ||
-           rsi->restoration_type[rtile_idx] == RESTORE_NONE);
-  swctxt->cost_switchable += best_cost;
-}
 
-static double search_switchable_restoration(
-    const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int partial_frame, int plane,
-    RestorationType *const restore_types[RESTORE_SWITCHABLE_TYPES],
-    int64_t *const tile_cost[RESTORE_SWITCHABLE_TYPES], RestorationInfo *rsi) {
-  const AV1_COMMON *const cm = &cpi->common;
-  struct rest_search_ctxt ctxt;
-  init_rest_search_ctxt(src, cpi, partial_frame, plane, NULL, NULL, NULL, NULL,
-                        &ctxt);
-  struct switchable_rest_search_ctxt swctxt;
-  swctxt.restore_types = restore_types;
-  swctxt.tile_cost = tile_cost;
-
-  rsi->frame_restoration_type = RESTORE_SWITCHABLE;
-  int bits = frame_level_restore_bits[rsi->frame_restoration_type]
-             << AV1_PROB_COST_SHIFT;
-  swctxt.cost_switchable = RDCOST_DBL(cpi->td.mb.rdmult, bits >> 4, 0);
-
-  for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
-    for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
-      set_default_sgrproj(&swctxt.sgrproj_info);
-      set_default_wiener(&swctxt.wiener_info);
-      foreach_rtile_in_tile(&ctxt, tile_row, tile_col,
-                            search_switchable_for_rtile, &swctxt);
+    const int64_t sse = rusi->sse[r];
+    int64_t coeff_pcost = 0;
+    switch (r) {
+      case RESTORE_NONE: coeff_pcost = 0; break;
+      case RESTORE_WIENER:
+        coeff_pcost =
+            count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener);
+        break;
+      case RESTORE_SGRPROJ:
+        coeff_pcost = count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj);
+        break;
+      default: assert(0); break;
+    }
+    const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT;
+    const int64_t bits = x->switchable_restore_cost[r] + coeff_bits;
+    double cost = RDCOST_DBL(x->rdmult, bits >> 4, sse);
+    if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10)
+      cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level);
+    if (r == 0 || cost < best_cost) {
+      best_cost = cost;
+      best_bits = bits;
+      best_rtype = r;
     }
   }
 
-  return swctxt.cost_switchable;
+  rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype;
+
+  rsc->sse += rusi->sse[best_rtype];
+  rsc->bits += best_bits;
+  if (best_rtype == RESTORE_WIENER) rsc->wiener = rusi->wiener;
+  if (best_rtype == RESTORE_SGRPROJ) rsc->sgrproj = rusi->sgrproj;
 }
 
-void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
-                                 LPF_PICK_METHOD method) {
-  static search_restore_type search_restore_fun[RESTORE_SWITCHABLE_TYPES] = {
-    search_norestore, search_wiener, search_sgrproj,
+static void copy_unit_info(RestorationType frame_rtype,
+                           const RestUnitSearchInfo *rusi,
+                           RestorationUnitInfo *rui) {
+  assert(frame_rtype > 0);
+  rui->restoration_type = rusi->best_rtype[frame_rtype - 1];
+  if (rui->restoration_type == RESTORE_WIENER)
+    rui->wiener_info = rusi->wiener;
+  else
+    rui->sgrproj_info = rusi->sgrproj;
+}
+
+static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) {
+  static const rest_unit_visitor_t funs[RESTORE_TYPES] = {
+    search_norestore, search_wiener, search_sgrproj, search_switchable
   };
-  AV1_COMMON *const cm = &cpi->common;
-  double cost_restore[RESTORE_TYPES];
-  int64_t *tile_cost[RESTORE_SWITCHABLE_TYPES];
-  RestorationType *restore_types[RESTORE_SWITCHABLE_TYPES];
-  double best_cost_restore;
-  RestorationType r, best_restore;
-  const int ywidth = src->y_crop_width;
-  const int yheight = src->y_crop_height;
-  const int uvwidth = src->uv_crop_width;
-  const int uvheight = src->uv_crop_height;
-
-  const int ntiles_y =
-      av1_get_rest_ntiles(ywidth, yheight, cm->rst_info[0].restoration_tilesize,
-                          NULL, NULL, NULL, NULL);
-  const int ntiles_uv = av1_get_rest_ntiles(
-      uvwidth, uvheight, cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL,
-      NULL);
-
-  // Assume ntiles_uv is never larger that ntiles_y and so the same arrays work.
-  for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) {
-    tile_cost[r] = (int64_t *)aom_malloc(sizeof(*tile_cost[0]) * ntiles_y);
-    restore_types[r] =
-        (RestorationType *)aom_malloc(sizeof(*restore_types[0]) * ntiles_y);
-  }
 
-  for (int plane = AOM_PLANE_Y; plane <= AOM_PLANE_V; ++plane) {
-    for (r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
-      cost_restore[r] = DBL_MAX;
-      if (force_restore_type != RESTORE_TYPES)
-        if (r != RESTORE_NONE && r != force_restore_type) continue;
-      cost_restore[r] =
-          search_restore_fun[r](src, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                                plane, &cm->rst_info[plane], restore_types[r],
-                                tile_cost[r], &cpi->trial_frame_rst);
-    }
-    if (plane == AOM_PLANE_Y)
-      cost_restore[RESTORE_SWITCHABLE] = search_switchable_restoration(
-          src, cpi, method == LPF_PICK_FROM_SUBIMAGE, plane, restore_types,
-          tile_cost, &cm->rst_info[plane]);
-    else
-      cost_restore[RESTORE_SWITCHABLE] = DBL_MAX;
-    best_cost_restore = DBL_MAX;
-    best_restore = 0;
-    for (r = 0; r < RESTORE_TYPES; ++r) {
-      if (force_restore_type != RESTORE_TYPES)
-        if (r != RESTORE_NONE && r != force_restore_type) continue;
-      if (cost_restore[r] < best_cost_restore) {
-        best_restore = r;
-        best_cost_restore = cost_restore[r];
+  reset_rsc(rsc);
+  rsc_on_tile(LR_TILE_ROW, LR_TILE_COL, rsc);
+  av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc,
+                                 &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL);
+  return RDCOST_DBL(rsc->x->rdmult, rsc->bits >> 4, rsc->sse);
+}
+
+static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) {
+  const RestorationInfo *rsi = &cm->rst_info[plane];
+  return rsi->units_per_tile;
+}
+
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  assert(!cm->all_lossless);
+
+  int ntiles[2];
+  for (int is_uv = 0; is_uv < 2; ++is_uv)
+    ntiles[is_uv] = rest_tiles_in_plane(cm, is_uv);
+
+  assert(ntiles[1] <= ntiles[0]);
+  RestUnitSearchInfo *rusi =
+      (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * ntiles[0]);
+
+  // If the restoration unit dimensions are not multiples of
+  // rsi->restoration_unit_size then some elements of the rusi array may be
+  // left uninitialised when we reach copy_unit_info(...). This is not a
+  // problem, as these elements are ignored later, but in order to quiet
+  // Valgrind's warnings we initialise the array below.
+  memset(rusi, 0, sizeof(*rusi) * ntiles[0]);
+
+  RestSearchCtxt rsc;
+  const int plane_start = AOM_PLANE_Y;
+  const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y;
+  for (int plane = plane_start; plane <= plane_end; ++plane) {
+    init_rsc(src, &cpi->common, &cpi->td.mb, &cpi->sf, plane, rusi,
+             &cpi->trial_frame_rst, &rsc);
+
+    const int plane_ntiles = ntiles[plane > 0];
+    const RestorationType num_rtypes =
+        (plane_ntiles > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+
+    double best_cost = 0;
+    RestorationType best_rtype = RESTORE_NONE;
+
+    const int highbd = rsc.cm->use_highbitdepth;
+    extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
+                 rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
+                 highbd);
+
+    for (RestorationType r = 0; r < num_rtypes; ++r) {
+      if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) &&
+          (r != force_restore_type))
+        continue;
+
+      double cost = search_rest_type(&rsc, r);
+
+      if (r == 0 || cost < best_cost) {
+        best_cost = cost;
+        best_rtype = r;
       }
     }
-    cm->rst_info[plane].frame_restoration_type = best_restore;
+
+    cm->rst_info[plane].frame_restoration_type = best_rtype;
     if (force_restore_type != RESTORE_TYPES)
-      assert(best_restore == force_restore_type ||
-             best_restore == RESTORE_NONE);
-    if (best_restore != RESTORE_SWITCHABLE) {
-      const int nt = (plane == AOM_PLANE_Y ? ntiles_y : ntiles_uv);
-      memcpy(cm->rst_info[plane].restoration_type, restore_types[best_restore],
-             nt * sizeof(restore_types[best_restore][0]));
+      assert(best_rtype == force_restore_type || best_rtype == RESTORE_NONE);
+
+    if (best_rtype != RESTORE_NONE) {
+      for (int u = 0; u < plane_ntiles; ++u) {
+        copy_unit_info(best_rtype, &rusi[u], &cm->rst_info[plane].unit_info[u]);
+      }
     }
   }
-  /*
-  printf("Frame %d/%d restore types: %d %d %d\n", cm->current_video_frame,
-         cm->show_frame, cm->rst_info[0].frame_restoration_type,
-         cm->rst_info[1].frame_restoration_type,
-         cm->rst_info[2].frame_restoration_type);
-  printf("Frame %d/%d frame_restore_type %d : %f %f %f %f\n",
-         cm->current_video_frame, cm->show_frame,
-         cm->rst_info[0].frame_restoration_type, cost_restore[0],
-         cost_restore[1], cost_restore[2], cost_restore[3]);
-         */
-
-  for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) {
-    aom_free(tile_cost[r]);
-    aom_free(restore_types[r]);
-  }
+
+  aom_free(rusi);
 }
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
index f6096ed1d..179b89ff9 100644
--- a/third_party/aom/av1/encoder/pickrst.h
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -20,8 +20,7 @@ extern "C" {
 struct yv12_buffer_config;
 struct AV1_COMP;
 
-void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
-                                 LPF_PICK_METHOD method);
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h
new file mode 100644
index 000000000..ef333b6d8
--- /dev/null
+++ b/third_party/aom/av1/encoder/pustats.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_PUSTATS_H_
+#define AV1_ENCODER_PUSTATS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define NUM_FEATURES 20
+#define NUM_HIDDEN_LAYERS 2
+#define HIDDEN_LAYERS_0_NODES 10
+#define HIDDEN_LAYERS_1_NODES 10
+#define LOGITS_NODES 1
+
+static const float
+    av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES *
+                                          HIDDEN_LAYERS_0_NODES] = {
+      13.8498f,  19.6630f,   13.3036f,  5.2448f,   -18.0270f,  21.6671f,
+      -0.2135f,  -0.0060f,   0.1211f,   -0.3549f,  -0.3550f,   0.0190f,
+      0.0167f,   -0.1192f,   0.2003f,   8.6663f,   32.0264f,   9.9558f,
+      9.0935f,   -110.4994f, 51.8056f,  64.8041f,  58.5392f,   53.0189f,
+      -61.6300f, 4.7540f,    -0.0140f,  0.0185f,   -15.8050f,  0.0790f,
+      0.0707f,   0.0784f,    0.0766f,   -0.3030f,  0.0392f,    49.3312f,
+      63.3326f,  61.4025f,   54.2723f,  -62.2769f, -147.1736f, -84.9432f,
+      -82.5422f, -70.4857f,  46.7622f,  -1.0285f,  -0.4809f,   0.0068f,
+      1.0888f,   -0.0515f,   -0.0384f,  -0.0232f,  -0.0396f,   0.2429f,
+      0.2040f,   -144.4016f, -88.0868f, -80.3134f, -70.6685f,  66.8528f,
+      -53.8097f, -45.4011f,  -52.8680f, -58.7226f, 99.7830f,   2.3728f,
+      0.0229f,   0.0002f,    -0.3288f,  -0.0563f,  -0.0550f,   -0.0552f,
+      -0.0563f,  0.2214f,    0.0139f,   -60.8965f, -45.5251f,  -50.4188f,
+      -51.5623f, 85.7369f,   77.3415f,  47.4930f,  53.8120f,   58.2311f,
+      -45.9650f, -2.4938f,   0.1639f,   -0.5270f,  -75.4622f,  -0.0026f,
+      0.0031f,   0.0047f,    0.0015f,   0.0092f,   0.0654f,    75.6402f,
+      54.7447f,  54.8156f,   52.6834f,  -9.1246f,  -34.0108f,  -35.6423f,
+      -34.2911f, -38.5444f,  72.1123f,  10.9750f,  -0.1595f,   0.1983f,
+      22.5724f,  -0.0556f,   -0.0618f,  -0.0571f,  -0.0608f,   0.2439f,
+      -0.0805f,  -32.5107f,  -28.9688f, -33.7284f, -48.1365f,  61.5297f,
+      39.2492f,  -35.1928f,  -11.5000f, 7.7038f,   -94.2469f,  13.5586f,
+      0.7541f,   0.0105f,    4.4041f,   0.1799f,   0.1339f,    0.1567f,
+      -0.6668f,  -0.7384f,   0.2185f,   17.1700f,  -26.4601f,  -1.8970f,
+      38.9635f,  -30.1916f,  31.8139f,  14.6157f,  10.0565f,   3.3340f,
+      -40.6985f, -2.1186f,   0.0116f,   0.0962f,   0.7115f,    -1.4071f,
+      -1.3701f,  -1.4728f,   -1.3404f,  -1.7286f,  5.5632f,    28.4998f,
+      5.4087f,   16.2668f,   11.8693f,  -39.4153f, 106.3281f,  38.3075f,
+      39.4933f,  47.3805f,   -15.0514f, -21.2421f, -0.2358f,   -0.0024f,
+      0.3505f,   -0.0429f,   -0.0377f,  -0.0322f,  -0.0344f,   0.2020f,
+      0.1417f,   99.6711f,   35.3896f,  43.1117f,  59.8879f,   -17.8250f,
+      -16.6976f, 18.5100f,   6.3383f,   25.3020f,  -55.8824f,  25.1027f,
+      -0.9926f,  -0.0738f,   -1.4892f,  0.0269f,   -0.0051f,   -5.8168f,
+      -0.0579f,  -0.1500f,   0.7224f,   8.3066f,   -3.8805f,   -12.1482f,
+      14.3492f,  -20.8118f,
+    };
+
+static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
+    {
+      17.6566f,  62.2217f, -107.2644f, -56.2255f, 68.2252f,
+      -37.5662f, 9.587f,   18.5206f,   69.6873f,  4.3903f,
+    };
+
+static const float
+    av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+                                          HIDDEN_LAYERS_1_NODES] = {
+      -0.0494f, 0.3505f,   -0.0461f, -1.3451f, 0.0198f,  -0.0746f, -0.2217f,
+      -0.9525f, 0.0633f,   -0.0737f, -0.3568f, 1.8569f,  -0.0189f, -1.8269f,
+      0.6281f,  -1.3266f,  -0.9202f, 2.8978f,  -0.6437f, -0.8709f, -1.5066f,
+      -1.0582f, -1.9509f,  -0.0417f, -0.1315f, -0.3368f, 0.0014f,  -0.5734f,
+      -1.4640f, -1.6042f,  3.3911f,  -1.6815f, -1.9026f, -4.8702f, -0.1012f,
+      -1.4517f, -3.2156f,  0.8448f,  0.2331f,  -0.1593f, 2.6627f,  -0.8451f,
+      -1.7382f, 0.9303f,   2.3003f,  -0.0659f, 0.5772f,  0.4253f,  0.2083f,
+      0.3649f,  -0.9198f,  -0.2183f, -0.5381f, -1.0831f, 2.0359f,  0.0040f,
+      -0.0871f, -0.1715f,  2.2453f,  0.5099f,  -0.5900f, -0.6313f, -1.3028f,
+      -1.7257f, 1.4130f,   -0.7189f, -0.4336f, 1.9266f,  1.7495f,  -0.3321f,
+      0.2827f,  0.4015f,   -0.5044f, -1.0420f, -0.1258f, -0.0342f, -0.1190f,
+      -3.1263f, 0.7485f,   -0.3161f, -0.2224f, 2.5533f,  -0.2121f, -1.3389f,
+      0.5556f,  -0.9407f,  -0.7456f, 1.4137f,  -0.0353f, -0.0521f, 2.4382f,
+      0.1493f,  -11.5631f, -1.6178f, 3.5538f,  -3.6538f, -0.5972f, -3.0038f,
+      -2.1640f, 0.5754f,
+    };
+
+static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
+    {
+      69.1995f, 41.7369f, -1.4885f, -35.785f, 26.1678f,
+      58.4472f, 36.2223f, 66.327f,  50.8867f, 2.8306f,
+    };
+
+static const float
+    av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+      1.811f,  0.9009f, 0.0694f, -0.9985f, -0.039f,
+      0.2076f, 0.5643f, 0.5408f, 0.6071f,  0.277f,
+    };
+
+static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = {
+  39.5529f,
+};
+
+static const NN_CONFIG av1_pustats_rate_nnconfig = {
+  NUM_FEATURES,                                      // num_inputs
+  LOGITS_NODES,                                      // num_outputs
+  NUM_HIDDEN_LAYERS,                                 // num_hidden_layers
+  { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES },  // num_hidden_nodes
+  {
+      av1_pustats_rate_hiddenlayer_0_kernel,
+      av1_pustats_rate_hiddenlayer_1_kernel,
+      av1_pustats_rate_logits_kernel,
+  },
+  {
+      av1_pustats_rate_hiddenlayer_0_bias,
+      av1_pustats_rate_hiddenlayer_1_bias,
+      av1_pustats_rate_logits_bias,
+  },
+};
+
+static const float
+    av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES *
+                                          HIDDEN_LAYERS_0_NODES] = {
+      -39.0787f,  -212.9998f, -174.2088f, -264.1454f, 292.7151f,  -60.8750f,
+      -5.9915f,   0.0712f,    -60.2312f,  -0.2020f,   -0.2135f,   -0.1663f,
+      -0.0711f,   0.2267f,    0.9152f,    -36.1294f,  -159.9320f, -222.9809f,
+      -270.2556f, 300.7162f,  159.9224f,  -172.5735f, -7.6852f,   54.3985f,
+      110.6721f,  19.2907f,   -15.1039f,  -0.0457f,   0.3289f,    0.4529f,
+      -8.2222f,   1.3213f,    -0.8378f,   -0.2605f,   3.9600f,    17.3407f,
+      113.1116f,  34.6326f,   11.6688f,   109.3541f,  240.8123f,  45.0615f,
+      80.7443f,   39.2500f,   -21.0931f,  -27.1989f,  -0.4264f,   -0.1345f,
+      1.6269f,    -0.0716f,   0.0989f,    -0.1382f,   0.0248f,    0.0913f,
+      4.3903f,    244.1014f,  32.2567f,   58.6171f,   62.2273f,   -2.8647f,
+      -227.5659f, 16.0031f,   -70.5256f,  23.8071f,   290.7356f,  13.6094f,
+      -2.1842f,   0.0104f,    -2.8760f,   0.3708f,    0.8501f,    -3.2964f,
+      -0.2088f,   -0.4474f,   1.2248f,    40.5180f,   -130.7891f, -188.1583f,
+      -174.0906f, 205.9622f,  0.3425f,    0.2531f,    0.2822f,    0.0488f,
+      0.1416f,    -0.0433f,   -0.1195f,   -0.0413f,   -0.0708f,   -0.0787f,
+      -0.0889f,   -0.4022f,   -0.5055f,   -0.4715f,   0.2315f,    0.1021f,
+      -0.3676f,   -0.3499f,   -0.0715f,   0.1913f,    205.7521f,  125.2265f,
+      92.0640f,   77.5566f,   -164.4280f, -19.3715f,  -0.1346f,   -0.4060f,
+      0.5042f,    -0.2395f,   -0.1329f,   -0.1397f,   0.2175f,    0.2895f,
+      5.5019f,    198.9799f,  114.0018f,  94.9015f,   86.8434f,   -183.4237f,
+      121.5626f,  94.8945f,   65.0803f,   93.6487f,   -346.5279f, -47.6168f,
+      0.0633f,    0.0135f,    -0.0692f,   -0.1015f,   -0.1146f,   -0.1341f,
+      -0.1175f,   0.4186f,    0.1505f,    130.7402f,  107.8443f,  62.8497f,
+      65.3501f,   -312.7407f, 282.8321f,  98.1531f,   75.6648f,   25.8733f,
+      -176.9298f, -37.2695f,  -0.3760f,   0.0017f,    0.1030f,    -0.1483f,
+      0.0787f,    -0.0962f,   0.4109f,    -0.2292f,   9.1681f,    274.3607f,
+      60.9538f,   75.9405f,   68.3776f,   -167.3098f, -335.1045f, -69.2583f,
+      -76.3441f,  -16.5793f,  218.5244f,  28.2405f,   0.9169f,    -0.0026f,
+      -0.8077f,   -1.5756f,   -0.0804f,   0.1404f,    1.2656f,    0.0272f,
+      -0.2529f,   -340.8659f, -112.7778f, -58.3890f,  -4.1224f,   108.1709f,
+      -180.7382f, -93.7114f,  -77.8686f,  -131.8134f, 353.3893f,  4.8233f,
+      0.0205f,    0.0000f,    -1.1654f,   -0.0161f,   -0.0255f,   -0.0358f,
+      -0.0412f,   0.1103f,    0.1041f,    -188.9934f, -110.1792f, -88.6301f,
+      -93.7226f,  336.9746f,
+    };
+
+static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
+    { -175.6918f, 43.4519f,  154.196f, -81.1015f,  -0.0758f,
+      136.5695f,  110.8713f, 142.029f, -153.0901f, -145.2688f };
+
+static const float
+    av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+                                          HIDDEN_LAYERS_1_NODES] = {
+      -0.1727f, -0.2859f,  -0.3757f, -0.4260f,  -0.5441f, -0.0666f, -0.3792f,
+      -0.1335f, -0.1521f,  -0.0821f, -3.1590f,  0.2711f,  0.5889f,  0.0878f,
+      0.4693f,  0.7773f,   -9.2989f, 0.0414f,   0.4485f,  22.8958f, -3.7024f,
+      -2.4672f, -43.2908f, 0.0956f,  0.4431f,   2.3429f,  1.7183f,  0.3985f,
+      -0.2275f, -3.1583f,  -0.3485f, 0.3280f,   0.3763f,  0.2069f,  0.4231f,
+      0.7366f,  -6.9527f,  0.0713f,  0.1359f,   16.6500f, -1.7655f, -0.1651f,
+      0.1280f,  -0.2678f,  -0.2120f, 1.6243f,   1.8773f,  -0.7543f, -0.3292f,
+      -0.7627f, -0.2001f,  -0.1125f, -0.8100f,  -0.1866f, 0.0567f,  -0.4002f,
+      3.2429f,  0.6427f,   -0.3759f, -11.6518f, -2.2893f, 0.7708f,  -1.8637f,
+      1.7148f,  0.3124f,   -0.7129f, -0.4927f,  0.1964f,  -0.2570f, -25.0783f,
+      2.5061f,  0.1457f,   -1.1239f, 0.0570f,   -0.2526f, -0.0669f, 0.6791f,
+      1.1531f,  -0.7246f,  -0.3180f, -0.0015f,  -0.0061f, -0.1626f, -0.0181f,
+      0.1271f,  -0.0140f,  -0.6027f, 0.0736f,   -0.0157f, 1.2420f,  -6.4055f,
+      0.2128f,  -0.0386f,  0.3446f,  0.1840f,   -0.7208f, -1.6979f, -0.0442f,
+      0.3230f,  -1.9745f,
+    };
+
+static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
+    { 0.f,      70.3414f, 9.6036f,   -118.1096f, 49.2507f,
+      95.1849f, 81.8015f, 167.0967f, -337.7945f, 169.8344f };
+
+static const float
+    av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+      -0.3627f, 1.2272f,  0.2201f, -1.7406f, -0.6885f,
+      0.8487f,  -0.2761f, 0.7731f, -5.2096f, -0.7351f,
+    };
+
+static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
+  48.2331f,
+};
+
+static const NN_CONFIG av1_pustats_dist_nnconfig = {
+  NUM_FEATURES,                                      // num_inputs
+  LOGITS_NODES,                                      // num_outputs
+  NUM_HIDDEN_LAYERS,                                 // num_hidden_layers
+  { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES },  // num_hidden_nodes
+  {
+      av1_pustats_dist_hiddenlayer_0_kernel,
+      av1_pustats_dist_hiddenlayer_1_kernel,
+      av1_pustats_dist_logits_kernel,
+  },
+  {
+      av1_pustats_dist_hiddenlayer_0_bias,
+      av1_pustats_dist_hiddenlayer_1_bias,
+      av1_pustats_dist_logits_bias,
+  },
+};
+
+#undef NUM_FEATURES
+#undef NUM_HIDDEN_LAYERS
+#undef HIDDEN_LAYERS_0_NODES
+#undef HIDDEN_LAYERS_1_NODES
+#undef LOGITS_NODES
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_ENCODER_PUSTATS_H_
diff --git a/third_party/aom/av1/encoder/pvq_encoder.c b/third_party/aom/av1/encoder/pvq_encoder.c
deleted file mode 100644
index 9d5133012..000000000
--- a/third_party/aom/av1/encoder/pvq_encoder.c
+++ /dev/null
@@ -1,988 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include "aom_dsp/entcode.h"
-#include "aom_dsp/entenc.h"
-#include "av1/common/blockd.h"
-#include "av1/common/odintrin.h"
-#include "av1/common/partition.h"
-#include "av1/common/pvq_state.h"
-#include "av1/encoder/encodemb.h"
-#include "av1/encoder/pvq_encoder.h"
-#include "aom_ports/system_state.h"
-
-/*Shift to ensure that the upper bound (i.e. for the max blocksize) of the
-   dot-product of the 1st band of chroma with the luma ref doesn't overflow.*/
-#define OD_CFL_FLIP_SHIFT (OD_LIMIT_BSIZE_MAX + 0)
-
-void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf,
-    int nsymbs) {
-  if (cdf[0] == 0)
-    aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs));
-  aom_write_symbol(w, symb, cdf, nsymbs);
-}
-
-static void aom_encode_pvq_codeword(aom_writer *w, od_pvq_codeword_ctx *adapt,
- const od_coeff *in, int n, int k) {
-  int i;
-  aom_encode_band_pvq_splits(w, adapt, in, n, k, 0);
-  for (i = 0; i < n; i++) if (in[i]) aom_write_bit(w, in[i] < 0);
-}
-
-/* Computes 1/sqrt(i) using a table for small values. */
-static double od_rsqrt_table(int i) {
-  static double table[16] = {
-    1.000000, 0.707107, 0.577350, 0.500000,
-    0.447214, 0.408248, 0.377964, 0.353553,
-    0.333333, 0.316228, 0.301511, 0.288675,
-    0.277350, 0.267261, 0.258199, 0.250000};
-  if (i <= 16) return table[i-1];
-  else return 1./sqrt(i);
-}
-
-/*Computes 1/sqrt(start+2*i+1) using a lookup table containing the results
-   where 0 <= i < table_size.*/
-static double od_custom_rsqrt_dynamic_table(const double* table,
- const int table_size, const double start, const int i) {
-  if (i < table_size) return table[i];
-  else return od_rsqrt_table((int)(start + 2*i + 1));
-}
-
-/*Fills tables used in od_custom_rsqrt_dynamic_table for a given start.*/
-static void od_fill_dynamic_rsqrt_table(double *table, const int table_size,
- const double start) {
-  int i;
-  for (i = 0; i < table_size; i++)
-    table[i] = od_rsqrt_table((int)(start + 2*i + 1));
-}
-
-/** Find the codepoint on the given PSphere closest to the desired
- * vector. Double-precision PVQ search just to make sure our tests
- * aren't limited by numerical accuracy.
- *
- * @param [in]      xcoeff  input vector to quantize (x in the math doc)
- * @param [in]      n       number of dimensions
- * @param [in]      k       number of pulses
- * @param [out]     ypulse  optimal codevector found (y in the math doc)
- * @param [out]     g2      multiplier for the distortion (typically squared
- *                          gain units)
- * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
- * @param [in]      prev_k  number of pulses already in ypulse that we should
- *                          reuse for the search (or 0 for a new search)
- * @return                  cosine distance between x and y (between 0 and 1)
- */
-double pvq_search_rdo_double_c(const od_val16 *xcoeff, int n, int k,
- od_coeff *ypulse, double g2, double pvq_norm_lambda, int prev_k) {
-  int i, j;
-  double xy;
-  double yy;
-  /* TODO - This blows our 8kB stack space budget and should be fixed when
-   converting PVQ to fixed point. */
-  double x[MAXN];
-  double xx;
-  double lambda;
-  double norm_1;
-  int rdo_pulses;
-  double delta_rate;
-  xx = xy = yy = 0;
-  for (j = 0; j < n; j++) {
-    x[j] = fabs((float)xcoeff[j]);
-    xx += x[j]*x[j];
-  }
-  norm_1 = 1./sqrt(1e-30 + xx);
-  lambda = pvq_norm_lambda/(1e-30 + g2);
-  i = 0;
-  if (prev_k > 0 && prev_k <= k) {
-    /* We reuse pulses from a previous search so we don't have to search them
-       again. */
-    for (j = 0; j < n; j++) {
-      ypulse[j] = abs(ypulse[j]);
-      xy += x[j]*ypulse[j];
-      yy += ypulse[j]*ypulse[j];
-      i += ypulse[j];
-    }
-  }
-  else if (k > 2) {
-    double l1_norm;
-    double l1_inv;
-    l1_norm = 0;
-    for (j = 0; j < n; j++) l1_norm += x[j];
-    l1_inv = 1./OD_MAXF(l1_norm, 1e-100);
-    for (j = 0; j < n; j++) {
-      double tmp;
-      tmp = k*x[j]*l1_inv;
-      ypulse[j] = OD_MAXI(0, (int)floor(tmp));
-      xy += x[j]*ypulse[j];
-      yy += ypulse[j]*ypulse[j];
-      i += ypulse[j];
-    }
-  }
-  else OD_CLEAR(ypulse, n);
-
-  /* Only use RDO on the last few pulses. This not only saves CPU, but using
-     RDO on all pulses actually makes the results worse for reasons I don't
-     fully understand. */
-  rdo_pulses = 1 + k/4;
-  /* Rough assumption for now, the last position costs about 3 bits more than
-     the first. */
-  delta_rate = 3./n;
-  /* Search one pulse at a time */
-  for (; i < k - rdo_pulses; i++) {
-    int pos;
-    double best_xy;
-    double best_yy;
-    pos = 0;
-    best_xy = -10;
-    best_yy = 1;
-    for (j = 0; j < n; j++) {
-      double tmp_xy;
-      double tmp_yy;
-      tmp_xy = xy + x[j];
-      tmp_yy = yy + 2*ypulse[j] + 1;
-      tmp_xy *= tmp_xy;
-      if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) {
-        best_xy = tmp_xy;
-        best_yy = tmp_yy;
-        pos = j;
-      }
-    }
-    xy = xy + x[pos];
-    yy = yy + 2*ypulse[pos] + 1;
-    ypulse[pos]++;
-  }
-  /* Search last pulses with RDO. Distortion is D = (x-y)^2 = x^2 - 2*x*y + y^2
-     and since x^2 and y^2 are constant, we just maximize x*y, plus a
-     lambda*rate term. Note that since x and y aren't normalized here,
-     we need to divide by sqrt(x^2)*sqrt(y^2). */
-  for (; i < k; i++) {
-    double rsqrt_table[4];
-    int rsqrt_table_size = 4;
-    int pos;
-    double best_cost;
-    pos = 0;
-    best_cost = -1e5;
-    /*Fill the small rsqrt lookup table with inputs relative to yy.
-      Specifically, the table of n values is filled with
-       rsqrt(yy + 1), rsqrt(yy + 2 + 1) .. rsqrt(yy + 2*(n-1) + 1).*/
-    od_fill_dynamic_rsqrt_table(rsqrt_table, rsqrt_table_size, yy);
-    for (j = 0; j < n; j++) {
-      double tmp_xy;
-      double tmp_yy;
-      tmp_xy = xy + x[j];
-      /*Calculate rsqrt(yy + 2*ypulse[j] + 1) using an optimized method.*/
-      tmp_yy = od_custom_rsqrt_dynamic_table(rsqrt_table, rsqrt_table_size,
-       yy, ypulse[j]);
-      tmp_xy = 2*tmp_xy*norm_1*tmp_yy - lambda*j*delta_rate;
-      if (j == 0 || tmp_xy > best_cost) {
-        best_cost = tmp_xy;
-        pos = j;
-      }
-    }
-    xy = xy + x[pos];
-    yy = yy + 2*ypulse[pos] + 1;
-    ypulse[pos]++;
-  }
-  for (i = 0; i < n; i++) {
-    if (xcoeff[i] < 0) ypulse[i] = -ypulse[i];
-  }
-  return xy/(1e-100 + sqrt(xx*yy));
-}
-
-/** Encodes the gain so that the return value increases with the
- * distance |x-ref|, so that we can encode a zero when x=ref. The
- * value x=0 is not covered because it is only allowed in the noref
- * case.
- *
- * @param [in]      x      quantized gain to encode
- * @param [in]      ref    quantized gain of the reference
- * @return                 interleave-encoded quantized gain value
- */
-static int neg_interleave(int x, int ref) {
-  if (x < ref) return -2*(x - ref) - 1;
-  else if (x < 2*ref) return 2*(x - ref);
-  else return x-1;
-}
-
-int od_vector_is_null(const od_coeff *x, int len) {
-  int i;
-  for (i = 0; i < len; i++) if (x[i]) return 0;
-  return 1;
-}
-
-static double od_pvq_rate(int qg, int icgr, int theta, int ts,
-  const od_adapt_ctx *adapt, const od_coeff *y0, int k, int n, int speed) {
-  double rate;
-  if (k == 0) rate = 0;
-  else if (speed > 0) {
-    int i;
-    int sum;
-    double f;
-    /* Compute "center of mass" of the pulse vector. */
-    sum = 0;
-    for (i = 0; i < n - (theta != -1); i++) sum += i*abs(y0[i]);
-    f = sum/(double)(k*n);
-    /* Estimates the number of bits it will cost to encode K pulses in
-       N dimensions based on hand-tuned fit for bitrate vs K, N and
-       "center of mass". */
-    rate = (1 + .4*f)*n*OD_LOG2(1 + OD_MAXF(0, log(n*2*(1*f + .025))*k/n)) + 3;
-  }
-  else {
-    aom_writer w;
-    od_pvq_codeword_ctx cd;
-    int tell;
-#if !CONFIG_ANS
-    od_ec_enc_init(&w.ec, 1000);
-#else
-# error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-    OD_COPY(&cd, &adapt->pvq.pvq_codeword_ctx, 1);
-#if !CONFIG_ANS
-    tell = od_ec_enc_tell_frac(&w.ec);
-#else
-# error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-    aom_encode_pvq_codeword(&w, &cd, y0, n - (theta != -1), k);
-#if !CONFIG_ANS
-    rate = (od_ec_enc_tell_frac(&w.ec)-tell)/8.;
-    od_ec_enc_clear(&w.ec);
-#else
-# error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-  }
-  if (qg > 0 && theta >= 0) {
-    /* Approximate cost of entropy-coding theta */
-    rate += .9*OD_LOG2(ts);
-    if (qg == icgr) rate -= .5;
-  }
-  return rate;
-}
-
-#define MAX_PVQ_ITEMS (20)
-/* This stores the information about a PVQ search candidate, so we can sort
-   based on K. */
-typedef struct {
-  int gain;
-  int k;
-  od_val32 qtheta;
-  int theta;
-  int ts;
-  od_val32 qcg;
-} pvq_search_item;
-
-int items_compare(pvq_search_item *a, pvq_search_item *b) {
-  /* Break ties in K with gain to ensure a stable sort.
-     Otherwise, the order depends on qsort implementation. */
-  return a->k == b->k ? a->gain - b->gain : a->k - b->k;
-}
-
-/** Perform PVQ quantization with prediction, trying several
- * possible gains and angles. See draft-valin-videocodec-pvq and
- * http://jmvalin.ca/slides/pvq.pdf for more details.
- *
- * @param [out]    out         coefficients after quantization
- * @param [in]     x0          coefficients before quantization
- * @param [in]     r0          reference, aka predicted coefficients
- * @param [in]     n           number of dimensions
- * @param [in]     q0          quantization step size
- * @param [out]    y           pulse vector (i.e. selected PVQ codevector)
- * @param [out]    itheta      angle between input and reference (-1 if noref)
- * @param [out]    vk          total number of pulses
- * @param [in]     beta        per-band activity masking beta param
- * @param [out]    skip_diff   distortion cost of skipping this block
- *                             (accumulated)
- * @param [in]     is_keyframe whether we're encoding a keyframe
- * @param [in]     pli         plane index
- * @param [in]     adapt       probability adaptation context
- * @param [in]     qm          QM with magnitude compensation
- * @param [in]     qm_inv      Inverse of QM with magnitude compensation
- * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
- * @param [in]     speed       Make search faster by making approximations
- * @return         gain        index of the quatized gain
-*/
-static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0,
-    int n, int q0, od_coeff *y, int *itheta, int *vk,
-    od_val16 beta, double *skip_diff, int is_keyframe, int pli,
-    const od_adapt_ctx *adapt, const int16_t *qm, const int16_t *qm_inv,
-    double pvq_norm_lambda, int speed) {
-  od_val32 g;
-  od_val32 gr;
-  od_coeff y_tmp[MAXN + 3];
-  int i;
-  /* Number of pulses. */
-  int k;
-  /* Companded gain of x and reference, normalized to q. */
-  od_val32 cg;
-  od_val32 cgr;
-  int icgr;
-  int qg;
-  /* Best RDO cost (D + lamdba*R) so far. */
-  double best_cost;
-  double dist0;
-  /* Distortion (D) that corresponds to the best RDO cost. */
-  double best_dist;
-  double dist;
-  /* Sign of Householder reflection. */
-  int s;
-  /* Dimension on which Householder reflects. */
-  int m;
-  od_val32 theta;
-  double corr;
-  int best_k;
-  od_val32 best_qtheta;
-  od_val32 gain_offset;
-  int noref;
-  double skip_dist;
-  int cfl_enabled;
-  int skip;
-  double gain_weight;
-  od_val16 x16[MAXN];
-  od_val16 r16[MAXN];
-  int xshift;
-  int rshift;
-  /* Give more weight to gain error when calculating the total distortion. */
-  gain_weight = 1.0;
-  OD_ASSERT(n > 1);
-  corr = 0;
-#if !defined(OD_FLOAT_PVQ)
-  /* Shift needed to make x fit in 16 bits even after rotation.
-     This shift value is not normative (it can be changed without breaking
-     the bitstream) */
-  xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15);
-  /* Shift needed to make the reference fit in 15 bits, so that the Householder
-     vector can fit in 16 bits.
-     This shift value *is* normative, and has to match the decoder. */
-  rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14);
-#else
-  xshift = 0;
-  rshift = 0;
-#endif
-  for (i = 0; i < n; i++) {
-#if defined(OD_FLOAT_PVQ)
-    /*This is slightly different from the original float PVQ code,
-       where the qm was applied in the accumulation in od_pvq_compute_gain and
-       the vectors were od_coeffs, not od_val16 (i.e. double).*/
-    x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1;
-    r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1;
-#else
-    x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift);
-    r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift);
-#endif
-    corr += OD_MULT16_16(x16[i], r16[i]);
-  }
-  cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL;
-  cg  = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift);
-  cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift);
-  if (cfl_enabled) cgr = OD_CGAIN_SCALE;
-  /* gain_offset is meant to make sure one of the quantized gains has
-     exactly the same gain as the reference. */
-#if defined(OD_FLOAT_PVQ)
-  icgr = (int)floor(.5 + cgr);
-#else
-  icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
-#endif
-  gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
-  /* Start search with null case: gain=0, no pulse. */
-  qg = 0;
-  dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
-  best_dist = dist;
-  best_cost = dist + pvq_norm_lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0,
-    n, speed);
-  noref = 1;
-  best_k = 0;
-  *itheta = -1;
-  OD_CLEAR(y, n);
-  best_qtheta = 0;
-  m = 0;
-  s = 1;
-  corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift));
-  corr = OD_MAXF(OD_MINF(corr, 1.), -1.);
-  if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
-  else {
-    skip_dist = gain_weight*(cg - cgr)*(cg - cgr)
-     + cgr*(double)cg*(2 - 2*corr);
-    skip_dist *= OD_CGAIN_SCALE_2;
-  }
-  if (!is_keyframe) {
-    /* noref, gain=0 isn't allowed, but skip is allowed. */
-    od_val32 scgr;
-    scgr = OD_MAXF(0,gain_offset);
-    if (icgr == 0) {
-      best_dist = gain_weight*(cg - scgr)*(cg - scgr)
-       + scgr*(double)cg*(2 - 2*corr);
-      best_dist *= OD_CGAIN_SCALE_2;
-    }
-    best_cost = best_dist + pvq_norm_lambda*od_pvq_rate(0, icgr, 0, 0, adapt,
-     NULL, 0, n, speed);
-    best_qtheta = 0;
-    *itheta = 0;
-    noref = 0;
-  }
-  dist0 = best_dist;
-  if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) {
-    od_val16 xr[MAXN];
-    int gain_bound;
-    int prev_k;
-    pvq_search_item items[MAX_PVQ_ITEMS];
-    int idx;
-    int nitems;
-    double cos_dist;
-    idx = 0;
-    gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT);
-    /* Perform theta search only if prediction is useful. */
-    theta = OD_ROUND32(OD_THETA_SCALE*acos(corr));
-    m = od_compute_householder(r16, n, gr, &s, rshift);
-    od_apply_householder(xr, x16, r16, n);
-    prev_k = 0;
-    for (i = m; i < n - 1; i++) xr[i] = xr[i + 1];
-    /* Compute all candidate PVQ searches within a reasonable range of gain
-       and theta. */
-    for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) {
-      int j;
-      od_val32 qcg;
-      int ts;
-      int theta_lower;
-      int theta_upper;
-      /* Quantized companded gain */
-      qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset;
-      /* Set angular resolution (in ra) to match the encoded gain */
-      ts = od_pvq_compute_max_theta(qcg, beta);
-      theta_lower = OD_MAXI(0, (int)floor(.5 +
-       theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2);
-      theta_upper = OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts));
-      /* Include the angles within a reasonable range. */
-      for (j = theta_lower; j <= theta_upper; j++) {
-        od_val32 qtheta;
-        qtheta = od_pvq_compute_theta(j, ts);
-        k = od_pvq_compute_k(qcg, j, 0, n, beta);
-        items[idx].gain = i;
-        items[idx].theta = j;
-        items[idx].k = k;
-        items[idx].qcg = qcg;
-        items[idx].qtheta = qtheta;
-        items[idx].ts = ts;
-        idx++;
-        OD_ASSERT(idx < MAX_PVQ_ITEMS);
-      }
-    }
-    nitems = idx;
-    cos_dist = 0;
-    /* Sort PVQ search candidates in ascending order of pulses K so that
-       we can reuse all the previously searched pulses across searches. */
-    qsort(items, nitems, sizeof(items[0]),
-     (int (*)(const void *, const void *))items_compare);
-    /* Search for the best gain/theta in order. */
-    for (idx = 0; idx < nitems; idx++) {
-      int j;
-      od_val32 qcg;
-      int ts;
-      double cost;
-      double dist_theta;
-      double sin_prod;
-      od_val32 qtheta;
-      /* Quantized companded gain */
-      qcg = items[idx].qcg;
-      i = items[idx].gain;
-      j = items[idx].theta;
-      /* Set angular resolution (in ra) to match the encoded gain */
-      ts = items[idx].ts;
-      /* Search for the best angle within a reasonable range. */
-      qtheta = items[idx].qtheta;
-      k = items[idx].k;
-      /* Compute the minimal possible distortion by not taking the PVQ
-         cos_dist into account. */
-      dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1;
-      dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
-      dist *= OD_CGAIN_SCALE_2;
-      /* If we have no hope of beating skip (including a 1-bit worst-case
-         penalty), stop now. */
-      if (dist > dist0 + 1.0*pvq_norm_lambda && k != 0) continue;
-      sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)*
-       OD_TRIG_SCALE_1;
-      /* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since
-         that's the factor by which cos_dist is multiplied to get the
-         distortion metric. */
-      if (k == 0) {
-        cos_dist = 0;
-        OD_CLEAR(y_tmp, n-1);
-      }
-      else if (k != prev_k) {
-        cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp,
-         qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
-      }
-      prev_k = k;
-      /* See Jmspeex' Journal of Dubious Theoretical Results. */
-      dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1
-       + sin_prod*(2 - 2*cos_dist);
-      dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
-      dist *= OD_CGAIN_SCALE_2;
-      /* Do approximate RDO. */
-      cost = dist + pvq_norm_lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp,
-       k, n, speed);
-      if (cost < best_cost) {
-        best_cost = cost;
-        best_dist = dist;
-        qg = i;
-        best_k = k;
-        best_qtheta = qtheta;
-        *itheta = j;
-        noref = 0;
-        OD_COPY(y, y_tmp, n - 1);
-      }
-    }
-  }
-  /* Don't bother with no-reference version if there's a reasonable
-     correlation. */
-  if (n <= OD_MAX_PVQ_SIZE && (corr < .5
-        || cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) {
-    int gain_bound;
-    int prev_k;
-    gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT);
-    prev_k = 0;
-    /* Search for the best gain (haven't determined reasonable range yet). */
-    for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) {
-      double cos_dist;
-      double cost;
-      od_val32 qcg;
-      qcg = OD_SHL(i, OD_CGAIN_SHIFT);
-      k = od_pvq_compute_k(qcg, -1, 1, n, beta);
-      /* Compute the minimal possible distortion by not taking the PVQ
-         cos_dist into account. */
-      dist = gain_weight*(qcg - cg)*(qcg - cg);
-      dist *= OD_CGAIN_SCALE_2;
-      if (dist > dist0 && k != 0) continue;
-      cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp,
-       qcg*(double)cg*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
-      prev_k = k;
-      /* See Jmspeex' Journal of Dubious Theoretical Results. */
-      dist = gain_weight*(qcg - cg)*(qcg - cg)
-       + qcg*(double)cg*(2 - 2*cos_dist);
-      dist *= OD_CGAIN_SCALE_2;
-      /* Do approximate RDO. */
-      cost = dist + pvq_norm_lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k,
-       n, speed);
-      if (cost <= best_cost) {
-        best_cost = cost;
-        best_dist = dist;
-        qg = i;
-        noref = 1;
-        best_k = k;
-        *itheta = -1;
-        OD_COPY(y, y_tmp, n);
-      }
-    }
-  }
-  k = best_k;
-  theta = best_qtheta;
-  skip = 0;
-  if (noref) {
-    if (qg == 0) skip = OD_PVQ_SKIP_ZERO;
-  }
-  else {
-    if (!is_keyframe && qg == 0) {
-      skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
-    }
-    if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY;
-  }
-  /* Synthesize like the decoder would. */
-  if (skip) {
-    if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n);
-    else OD_CLEAR(out, n);
-  }
-  else {
-    if (noref) gain_offset = 0;
-    g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta);
-    od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s,
-     qm_inv);
-  }
-  *vk = k;
-  *skip_diff += skip_dist - best_dist;
-  /* Encode gain differently depending on whether we use prediction or not.
-     Special encoding on inter frames where qg=0 is allowed for noref=0
-     but not noref=1.*/
-  if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr);
-  else return noref ? qg - 1 : neg_interleave(qg + 1, icgr + 1);
-}
-
-/** Encodes a single vector of integers (eg, a partition within a
- *  coefficient block) using PVQ
- *
- * @param [in,out] w          multi-symbol entropy encoder
- * @param [in]     qg         quantized gain
- * @param [in]     theta      quantized post-prediction theta
- * @param [in]     in         coefficient vector to code
- * @param [in]     n          number of coefficients in partition
- * @param [in]     k          number of pulses in partition
- * @param [in,out] model      entropy encoder state
- * @param [in,out] adapt      adaptation context
- * @param [in,out] exg        ExQ16 expectation of gain value
- * @param [in,out] ext        ExQ16 expectation of theta value
- * @param [in]     cdf_ctx    selects which cdf context to use
- * @param [in]     is_keyframe whether we're encoding a keyframe
- * @param [in]     code_skip  whether the "skip rest" flag is allowed
- * @param [in]     skip_rest  when set, we skip all higher bands
- * @param [in]     encode_flip whether we need to encode the CfL flip flag now
- * @param [in]     flip       value of the CfL flip flag
- */
-void pvq_encode_partition(aom_writer *w,
-                                 int qg,
-                                 int theta,
-                                 const od_coeff *in,
-                                 int n,
-                                 int k,
-                                 generic_encoder model[3],
-                                 od_adapt_ctx *adapt,
-                                 int *exg,
-                                 int *ext,
-                                 int cdf_ctx,
-                                 int is_keyframe,
-                                 int code_skip,
-                                 int skip_rest,
-                                 int encode_flip,
-                                 int flip) {
-  int noref;
-  int id;
-  noref = (theta == -1);
-  id = (qg > 0) + 2*OD_MINI(theta + 1,3) + 8*code_skip*skip_rest;
-  if (is_keyframe) {
-    OD_ASSERT(id != 8);
-    if (id >= 8) id--;
-  }
-  else {
-    OD_ASSERT(id != 10);
-    if (id >= 10) id--;
-  }
-  /* Jointly code gain, theta and noref for small values. Then we handle
-     larger gain and theta values. For noref, theta = -1. */
-  aom_write_symbol_pvq(w, id, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
-   8 + 7*code_skip);
-  if (encode_flip) {
-    /* We could eventually do some smarter entropy coding here, but it would
-       have to be good enough to overcome the overhead of the entropy coder.
-       An early attempt using a "toogle" flag with simple adaptation wasn't
-       worth the trouble. */
-    aom_write_bit(w, flip);
-  }
-  if (qg > 0) {
-    int tmp;
-    tmp = *exg;
-    generic_encode(w, &model[!noref], qg - 1, &tmp, 2);
-    OD_IIR_DIADIC(*exg, qg << 16, 2);
-  }
-  if (theta > 1) {
-    int tmp;
-    tmp = *ext;
-    generic_encode(w, &model[2], theta - 2, &tmp, 2);
-    OD_IIR_DIADIC(*ext, theta << 16, 2);
-  }
-  aom_encode_pvq_codeword(w, &adapt->pvq.pvq_codeword_ctx, in,
-   n - (theta != -1), k);
-}
-
-/** Quantizes a scalar with rate-distortion optimization (RDO)
- * @param [in] x      unquantized value
- * @param [in] q      quantization step size
- * @param [in] delta0 rate increase for encoding a 1 instead of a 0
- * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
- * @retval quantized value
- */
-int od_rdo_quant(od_coeff x, int q, double delta0, double pvq_norm_lambda) {
-  int n;
-  /* Optimal quantization threshold is 1/2 + lambda*delta_rate/2. See
-     Jmspeex' Journal of Dubious Theoretical Results for details. */
-  n = OD_DIV_R0(abs(x), q);
-  if ((double)abs(x)/q < (double)n/2 + pvq_norm_lambda*delta0/(2*n)) {
-    return 0;
-  }
-  else {
-    return OD_DIV_R0(x, q);
-  }
-}
-
-/** Encode a coefficient block (excepting DC) using PVQ
- *
- * @param [in,out] enc     daala encoder context
- * @param [in]     ref     'reference' (prediction) vector
- * @param [in]     in      coefficient block to quantize and encode
- * @param [out]    out     quantized coefficient block
- * @param [in]     q0      scale/quantizer
- * @param [in]     pli     plane index
- * @param [in]     bs      log of the block size minus two
- * @param [in]     beta    per-band activity masking beta param
- * @param [in]     is_keyframe whether we're encoding a keyframe
- * @param [in]     qm      QM with magnitude compensation
- * @param [in]     qm_inv  Inverse of QM with magnitude compensation
- * @param [in]     speed   Make search faster by making approximations
- * @param [in]     pvq_info If null, conisdered as RDO search mode
- * @return         Returns block skip info indicating whether DC/AC are coded.
- *                 bit0: DC is coded, bit1: AC is coded (1 means coded)
- *
- */
-PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc,
-                   od_coeff *ref,
-                   const od_coeff *in,
-                   od_coeff *out,
-                   int q_dc,
-                   int q_ac,
-                   int pli,
-                   int bs,
-                   const od_val16 *beta,
-                   int is_keyframe,
-                   const int16_t *qm,
-                   const int16_t *qm_inv,
-                   int speed,
-                   PVQ_INFO *pvq_info){
-  int theta[PVQ_MAX_PARTITIONS];
-  int qg[PVQ_MAX_PARTITIONS];
-  int k[PVQ_MAX_PARTITIONS];
-  od_coeff y[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
-  int *exg;
-  int *ext;
-  int nb_bands;
-  int i;
-  const int *off;
-  int size[PVQ_MAX_PARTITIONS];
-  generic_encoder *model;
-  double skip_diff;
-  int tell;
-  uint16_t *skip_cdf;
-  od_rollback_buffer buf;
-  int dc_quant;
-  int flip;
-  int cfl_encoded;
-  int skip_rest;
-  int skip_dir;
-  int skip_theta_value;
-  const unsigned char *pvq_qm;
-  double dc_rate;
-  int use_masking;
-  PVQ_SKIP_TYPE ac_dc_coded;
-
-  aom_clear_system_state();
-
-  use_masking = enc->use_activity_masking;
-
-  if (use_masking)
-    pvq_qm = &enc->state.pvq_qm_q4[pli][0];
-  else
-    pvq_qm = 0;
-
-  exg = &enc->state.adapt->pvq.pvq_exg[pli][bs][0];
-  ext = enc->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS;
-  skip_cdf = enc->state.adapt->skip_cdf[2*bs + (pli != 0)];
-  model = enc->state.adapt->pvq.pvq_param_model;
-  nb_bands = OD_BAND_OFFSETS[bs][0];
-  off = &OD_BAND_OFFSETS[bs][1];
-
-  if (use_masking)
-    dc_quant = OD_MAXI(1, q_dc * pvq_qm[od_qm_get_index(bs, 0)] >> 4);
-  else
-    dc_quant = OD_MAXI(1, q_dc);
-
-  tell = 0;
-  for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i];
-  skip_diff = 0;
-  flip = 0;
-  /*If we are coding a chroma block of a keyframe, we are doing CfL.*/
-  if (pli != 0 && is_keyframe) {
-    od_val32 xy;
-    xy = 0;
-    /*Compute the dot-product of the first band of chroma with the luma ref.*/
-    for (i = off[0]; i < off[1]; i++) {
-#if defined(OD_FLOAT_PVQ)
-      xy += ref[i]*(double)qm[i]*OD_QM_SCALE_1*
-       (double)in[i]*(double)qm[i]*OD_QM_SCALE_1;
-#else
-      od_val32 rq;
-      od_val32 inq;
-      rq = ref[i]*qm[i];
-      inq = in[i]*qm[i];
-      xy += OD_SHR(rq*(int64_t)inq, OD_SHL(OD_QM_SHIFT + OD_CFL_FLIP_SHIFT,
-       1));
-#endif
-    }
-    /*If cos(theta) < 0, then |theta| > pi/2 and we should negate the ref.*/
-    if (xy < 0) {
-      flip = 1;
-      for(i = off[0]; i < off[nb_bands]; i++) ref[i] = -ref[i];
-    }
-  }
-  for (i = 0; i < nb_bands; i++) {
-    int q;
-
-    if (use_masking)
-      q = OD_MAXI(1, q_ac * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4);
-    else
-      q = OD_MAXI(1, q_ac);
-
-    qg[i] = pvq_theta(out + off[i], in + off[i], ref + off[i], size[i],
-     q, y + off[i], &theta[i], &k[i], beta[i], &skip_diff, is_keyframe,
-     pli, enc->state.adapt, qm + off[i], qm_inv + off[i],
-     enc->pvq_norm_lambda, speed);
-  }
-  od_encode_checkpoint(enc, &buf);
-  if (is_keyframe) out[0] = 0;
-  else {
-    int n;
-    n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
-    if (n == 0) {
-      out[0] = 0;
-    } else {
-      int tell2;
-      od_rollback_buffer dc_buf;
-
-      dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[3]) - OD_ICDF(skip_cdf[2]))/
-       (double)(OD_ICDF(skip_cdf[2]) - OD_ICDF(skip_cdf[1])));
-      dc_rate += 1;
-
-#if !CONFIG_ANS
-      tell2 = od_ec_enc_tell_frac(&enc->w.ec);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-      od_encode_checkpoint(enc, &dc_buf);
-      generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
-       n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
-#if !CONFIG_ANS
-      tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-      dc_rate += tell2/8.0;
-      od_encode_rollback(enc, &dc_buf);
-
-      out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
-       enc->pvq_norm_lambda);
-    }
-  }
-#if !CONFIG_ANS
-  tell = od_ec_enc_tell_frac(&enc->w.ec);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-  /* Code as if we're not skipping. */
-  aom_write_symbol(&enc->w, 2 + (out[0] != 0), skip_cdf, 4);
-  ac_dc_coded = AC_CODED + (out[0] != 0);
-  cfl_encoded = 0;
-  skip_rest = 1;
-  skip_theta_value = is_keyframe ? -1 : 0;
-  for (i = 1; i < nb_bands; i++) {
-    if (theta[i] != skip_theta_value || qg[i]) skip_rest = 0;
-  }
-  skip_dir = 0;
-  if (nb_bands > 1) {
-    for (i = 0; i < 3; i++) {
-      int j;
-      int tmp;
-      tmp = 1;
-      // ToDo(yaowu): figure out better stop condition without gcc warning.
-      for (j = i + 1; j < nb_bands && j < PVQ_MAX_PARTITIONS; j += 3) {
-        if (theta[j] != skip_theta_value || qg[j]) tmp = 0;
-      }
-      skip_dir |= tmp << i;
-    }
-  }
-  if (theta[0] == skip_theta_value && qg[0] == 0 && skip_rest) nb_bands = 0;
-
-  /* NOTE: There was no other better place to put this function. */
-  if (pvq_info)
-    av1_store_pvq_enc_info(pvq_info, qg, theta, k, y, nb_bands, off, size,
-      skip_rest, skip_dir, bs);
-
-  for (i = 0; i < nb_bands; i++) {
-    int encode_flip;
-    /* Encode CFL flip bit just after the first time it's used. */
-    encode_flip = pli != 0 && is_keyframe && theta[i] != -1 && !cfl_encoded;
-    if (i == 0 || (!skip_rest && !(skip_dir & (1 << ((i - 1)%3))))) {
-      pvq_encode_partition(&enc->w, qg[i], theta[i], y + off[i],
-       size[i], k[i], model, enc->state.adapt, exg + i, ext + i,
-       (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i,
-       is_keyframe, i == 0 && (i < nb_bands - 1), skip_rest, encode_flip, flip);
-    }
-    if (i == 0 && !skip_rest && bs > 0) {
-      aom_write_symbol(&enc->w, skip_dir,
-       &enc->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7);
-    }
-    if (encode_flip) cfl_encoded = 1;
-  }
-#if !CONFIG_ANS
-  tell = od_ec_enc_tell_frac(&enc->w.ec) - tell;
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-  /* Account for the rate of skipping the AC, based on the same DC decision
-     we made when trying to not skip AC. */
-  {
-    double skip_rate;
-    if (out[0] != 0) {
-      skip_rate = -OD_LOG2((OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/
-     (double)OD_ICDF(skip_cdf[3]));
-    }
-    else {
-      skip_rate = -OD_LOG2(OD_ICDF(skip_cdf[0])/
-     (double)OD_ICDF(skip_cdf[3]));
-    }
-    tell -= (int)floor(.5+8*skip_rate);
-  }
-  if (nb_bands == 0 || skip_diff <= enc->pvq_norm_lambda/8*tell) {
-    if (is_keyframe) out[0] = 0;
-    else {
-      int n;
-      n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
-      if (n == 0) {
-        out[0] = 0;
-      } else {
-        int tell2;
-        od_rollback_buffer dc_buf;
-
-        dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/
-         (double)OD_ICDF(skip_cdf[0]));
-        dc_rate += 1;
-
-#if !CONFIG_ANS
-        tell2 = od_ec_enc_tell_frac(&enc->w.ec);
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-        od_encode_checkpoint(enc, &dc_buf);
-        generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
-         n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
-#if !CONFIG_ANS
-        tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
-#else
-#error "CONFIG_PVQ currently requires !CONFIG_ANS."
-#endif
-        dc_rate += tell2/8.0;
-        od_encode_rollback(enc, &dc_buf);
-
-        out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
-         enc->pvq_norm_lambda);
-      }
-    }
-    /* We decide to skip, roll back everything as it was before. */
-    od_encode_rollback(enc, &buf);
-    aom_write_symbol(&enc->w, out[0] != 0, skip_cdf, 4);
-    ac_dc_coded = (out[0] != 0);
-    if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;
-    else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i];
-  }
-  if (pvq_info)
-    pvq_info->ac_dc_coded = ac_dc_coded;
-  return ac_dc_coded;
-}
diff --git a/third_party/aom/av1/encoder/pvq_encoder.h b/third_party/aom/av1/encoder/pvq_encoder.h
deleted file mode 100644
index b84c8961b..000000000
--- a/third_party/aom/av1/encoder/pvq_encoder.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* clang-format off */
-
-#if !defined(_pvq_encoder_H)
-# define _pvq_encoder_H (1)
-# include "aom_dsp/bitwriter.h"
-# include "aom_dsp/entenc.h"
-# include "av1/common/blockd.h"
-# include "av1/common/pvq.h"
-# include "av1/encoder/encint.h"
-
-void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf,
-    int nsymbs);
-
-void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt,
- const int *y, int n, int k, int level);
-
-void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay);
-
-void pvq_encode_partition(aom_writer *w,
-                                 int qg,
-                                 int theta,
-                                 const od_coeff *in,
-                                 int n,
-                                 int k,
-                                 generic_encoder model[3],
-                                 od_adapt_ctx *adapt,
-                                 int *exg,
-                                 int *ext,
-                                 int cdf_ctx,
-                                 int is_keyframe,
-                                 int code_skip,
-                                 int skip_rest,
-                                 int encode_flip,
-                                 int flip);
-
-PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, od_coeff *ref,
-    const od_coeff *in, od_coeff *out, int q_dc, int q_ac, int pli, int bs,
-    const od_val16 *beta, int is_keyframe,
-    const int16_t *qm, const int16_t *qm_inv, int speed,
-    PVQ_INFO *pvq_info);
-
-#endif
diff --git a/third_party/aom/av1/encoder/ransac.c b/third_party/aom/av1/encoder/ransac.c
index 6d2eb4183..781f528eb 100644
--- a/third_party/aom/av1/encoder/ransac.c
+++ b/third_party/aom/av1/encoder/ransac.c
@@ -80,60 +80,6 @@ static void project_points_double_affine(double *mat, double *points,
   }
 }
 
-static void project_points_double_hortrapezoid(double *mat, double *points,
-                                               double *proj, const int n,
-                                               const int stride_points,
-                                               const int stride_proj) {
-  int i;
-  double x, y, Z, Z_inv;
-  for (i = 0; i < n; ++i) {
-    x = *(points++), y = *(points++);
-    Z_inv = mat[7] * y + 1;
-    assert(fabs(Z_inv) > 0.000001);
-    Z = 1. / Z_inv;
-    *(proj++) = (mat[2] * x + mat[3] * y + mat[0]) * Z;
-    *(proj++) = (mat[5] * y + mat[1]) * Z;
-    points += stride_points - 2;
-    proj += stride_proj - 2;
-  }
-}
-
-static void project_points_double_vertrapezoid(double *mat, double *points,
-                                               double *proj, const int n,
-                                               const int stride_points,
-                                               const int stride_proj) {
-  int i;
-  double x, y, Z, Z_inv;
-  for (i = 0; i < n; ++i) {
-    x = *(points++), y = *(points++);
-    Z_inv = mat[6] * x + 1;
-    assert(fabs(Z_inv) > 0.000001);
-    Z = 1. / Z_inv;
-    *(proj++) = (mat[2] * x + mat[0]) * Z;
-    *(proj++) = (mat[4] * x + mat[5] * y + mat[1]) * Z;
-    points += stride_points - 2;
-    proj += stride_proj - 2;
-  }
-}
-
-static void project_points_double_homography(double *mat, double *points,
-                                             double *proj, const int n,
-                                             const int stride_points,
-                                             const int stride_proj) {
-  int i;
-  double x, y, Z, Z_inv;
-  for (i = 0; i < n; ++i) {
-    x = *(points++), y = *(points++);
-    Z_inv = mat[6] * x + mat[7] * y + 1;
-    assert(fabs(Z_inv) > 0.000001);
-    Z = 1. / Z_inv;
-    *(proj++) = (mat[2] * x + mat[3] * y + mat[0]) * Z;
-    *(proj++) = (mat[4] * x + mat[5] * y + mat[1]) * Z;
-    points += stride_points - 2;
-    proj += stride_proj - 2;
-  }
-}
-
 static void normalize_homography(double *pts, int n, double *T) {
   double *p = pts;
   double mean[2] = { 0, 0 };
@@ -193,22 +139,6 @@ static void denormalize_homography(double *params, double *T1, double *T2) {
   multiply_mat(iT2, params2, params, 3, 3, 3);
 }
 
-static void denormalize_homography_reorder(double *params, double *T1,
-                                           double *T2) {
-  double params_denorm[MAX_PARAMDIM];
-  memcpy(params_denorm, params, sizeof(*params) * 8);
-  params_denorm[8] = 1.0;
-  denormalize_homography(params_denorm, T1, T2);
-  params[0] = params_denorm[2];
-  params[1] = params_denorm[5];
-  params[2] = params_denorm[0];
-  params[3] = params_denorm[1];
-  params[4] = params_denorm[3];
-  params[5] = params_denorm[4];
-  params[6] = params_denorm[6];
-  params[7] = params_denorm[7];
-}
-
 static void denormalize_affine_reorder(double *params, double *T1, double *T2) {
   double params_denorm[MAX_PARAMDIM];
   params_denorm[0] = params[0];
@@ -377,217 +307,6 @@ static int find_affine(int np, double *pts1, double *pts2, double *mat) {
   return 0;
 }
 
-static int find_vertrapezoid(int np, double *pts1, double *pts2, double *mat) {
-  const int np3 = np * 3;
-  double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14);
-  double *U = a + np3 * 7;
-  double S[7], V[7 * 7], H[9];
-  int i, mini;
-  double sx, sy, dx, dy;
-  double T1[9], T2[9];
-
-  normalize_homography(pts1, np, T1);
-  normalize_homography(pts2, np, T2);
-
-  for (i = 0; i < np; ++i) {
-    dx = *(pts2++);
-    dy = *(pts2++);
-    sx = *(pts1++);
-    sy = *(pts1++);
-
-    a[i * 3 * 7 + 0] = a[i * 3 * 7 + 1] = 0;
-    a[i * 3 * 7 + 2] = -sx;
-    a[i * 3 * 7 + 3] = -sy;
-    a[i * 3 * 7 + 4] = -1;
-    a[i * 3 * 7 + 5] = dy * sx;
-    a[i * 3 * 7 + 6] = dy;
-
-    a[(i * 3 + 1) * 7 + 0] = sx;
-    a[(i * 3 + 1) * 7 + 1] = 1;
-    a[(i * 3 + 1) * 7 + 2] = a[(i * 3 + 1) * 7 + 3] = a[(i * 3 + 1) * 7 + 4] =
-        0;
-    a[(i * 3 + 1) * 7 + 5] = -dx * sx;
-    a[(i * 3 + 1) * 7 + 6] = -dx;
-
-    a[(i * 3 + 2) * 7 + 0] = -dy * sx;
-    a[(i * 3 + 2) * 7 + 1] = -dy;
-    a[(i * 3 + 2) * 7 + 2] = dx * sx;
-    a[(i * 3 + 2) * 7 + 3] = dx * sy;
-    a[(i * 3 + 2) * 7 + 4] = dx;
-    a[(i * 3 + 2) * 7 + 5] = a[(i * 3 + 2) * 7 + 6] = 0;
-  }
-  if (SVD(U, S, V, a, np3, 7)) {
-    aom_free(a);
-    return 1;
-  } else {
-    double minS = 1e12;
-    mini = -1;
-    for (i = 0; i < 7; ++i) {
-      if (S[i] < minS) {
-        minS = S[i];
-        mini = i;
-      }
-    }
-  }
-  H[1] = H[7] = 0;
-  for (i = 0; i < 1; i++) H[i] = V[i * 7 + mini];
-  for (; i < 6; i++) H[i + 1] = V[i * 7 + mini];
-  for (; i < 7; i++) H[i + 2] = V[i * 7 + mini];
-
-  denormalize_homography_reorder(H, T1, T2);
-  aom_free(a);
-  if (H[8] == 0.0) {
-    return 1;
-  } else {
-    // normalize
-    double f = 1.0 / H[8];
-    for (i = 0; i < 8; i++) mat[i] = f * H[i];
-  }
-  return 0;
-}
-
-static int find_hortrapezoid(int np, double *pts1, double *pts2, double *mat) {
-  const int np3 = np * 3;
-  double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14);
-  double *U = a + np3 * 7;
-  double S[7], V[7 * 7], H[9];
-  int i, mini;
-  double sx, sy, dx, dy;
-  double T1[9], T2[9];
-
-  normalize_homography(pts1, np, T1);
-  normalize_homography(pts2, np, T2);
-
-  for (i = 0; i < np; ++i) {
-    dx = *(pts2++);
-    dy = *(pts2++);
-    sx = *(pts1++);
-    sy = *(pts1++);
-
-    a[i * 3 * 7 + 0] = a[i * 3 * 7 + 1] = a[i * 3 * 7 + 2] = 0;
-    a[i * 3 * 7 + 3] = -sy;
-    a[i * 3 * 7 + 4] = -1;
-    a[i * 3 * 7 + 5] = dy * sy;
-    a[i * 3 * 7 + 6] = dy;
-
-    a[(i * 3 + 1) * 7 + 0] = sx;
-    a[(i * 3 + 1) * 7 + 1] = sy;
-    a[(i * 3 + 1) * 7 + 2] = 1;
-    a[(i * 3 + 1) * 7 + 3] = a[(i * 3 + 1) * 7 + 4] = 0;
-    a[(i * 3 + 1) * 7 + 5] = -dx * sy;
-    a[(i * 3 + 1) * 7 + 6] = -dx;
-
-    a[(i * 3 + 2) * 7 + 0] = -dy * sx;
-    a[(i * 3 + 2) * 7 + 1] = -dy * sy;
-    a[(i * 3 + 2) * 7 + 2] = -dy;
-    a[(i * 3 + 2) * 7 + 3] = dx * sy;
-    a[(i * 3 + 2) * 7 + 4] = dx;
-    a[(i * 3 + 2) * 7 + 5] = a[(i * 3 + 2) * 7 + 6] = 0;
-  }
-
-  if (SVD(U, S, V, a, np3, 7)) {
-    aom_free(a);
-    return 1;
-  } else {
-    double minS = 1e12;
-    mini = -1;
-    for (i = 0; i < 7; ++i) {
-      if (S[i] < minS) {
-        minS = S[i];
-        mini = i;
-      }
-    }
-  }
-  H[3] = H[6] = 0;
-  for (i = 0; i < 3; i++) H[i] = V[i * 7 + mini];
-  for (; i < 5; i++) H[i + 1] = V[i * 7 + mini];
-  for (; i < 7; i++) H[i + 2] = V[i * 7 + mini];
-
-  denormalize_homography_reorder(H, T1, T2);
-  aom_free(a);
-  if (H[8] == 0.0) {
-    return 1;
-  } else {
-    // normalize
-    double f = 1.0 / H[8];
-    for (i = 0; i < 8; i++) mat[i] = f * H[i];
-  }
-  return 0;
-}
-
-static int find_homography(int np, double *pts1, double *pts2, double *mat) {
-  // Implemented from Peter Kovesi's normalized implementation
-  const int np3 = np * 3;
-  double *a = (double *)aom_malloc(sizeof(*a) * np3 * 18);
-  double *U = a + np3 * 9;
-  double S[9], V[9 * 9], H[9];
-  int i, mini;
-  double sx, sy, dx, dy;
-  double T1[9], T2[9];
-
-  normalize_homography(pts1, np, T1);
-  normalize_homography(pts2, np, T2);
-
-  for (i = 0; i < np; ++i) {
-    dx = *(pts2++);
-    dy = *(pts2++);
-    sx = *(pts1++);
-    sy = *(pts1++);
-
-    a[i * 3 * 9 + 0] = a[i * 3 * 9 + 1] = a[i * 3 * 9 + 2] = 0;
-    a[i * 3 * 9 + 3] = -sx;
-    a[i * 3 * 9 + 4] = -sy;
-    a[i * 3 * 9 + 5] = -1;
-    a[i * 3 * 9 + 6] = dy * sx;
-    a[i * 3 * 9 + 7] = dy * sy;
-    a[i * 3 * 9 + 8] = dy;
-
-    a[(i * 3 + 1) * 9 + 0] = sx;
-    a[(i * 3 + 1) * 9 + 1] = sy;
-    a[(i * 3 + 1) * 9 + 2] = 1;
-    a[(i * 3 + 1) * 9 + 3] = a[(i * 3 + 1) * 9 + 4] = a[(i * 3 + 1) * 9 + 5] =
-        0;
-    a[(i * 3 + 1) * 9 + 6] = -dx * sx;
-    a[(i * 3 + 1) * 9 + 7] = -dx * sy;
-    a[(i * 3 + 1) * 9 + 8] = -dx;
-
-    a[(i * 3 + 2) * 9 + 0] = -dy * sx;
-    a[(i * 3 + 2) * 9 + 1] = -dy * sy;
-    a[(i * 3 + 2) * 9 + 2] = -dy;
-    a[(i * 3 + 2) * 9 + 3] = dx * sx;
-    a[(i * 3 + 2) * 9 + 4] = dx * sy;
-    a[(i * 3 + 2) * 9 + 5] = dx;
-    a[(i * 3 + 2) * 9 + 6] = a[(i * 3 + 2) * 9 + 7] = a[(i * 3 + 2) * 9 + 8] =
-        0;
-  }
-
-  if (SVD(U, S, V, a, np3, 9)) {
-    aom_free(a);
-    return 1;
-  } else {
-    double minS = 1e12;
-    mini = -1;
-    for (i = 0; i < 9; ++i) {
-      if (S[i] < minS) {
-        minS = S[i];
-        mini = i;
-      }
-    }
-  }
-
-  for (i = 0; i < 9; i++) H[i] = V[i * 9 + mini];
-  denormalize_homography_reorder(H, T1, T2);
-  aom_free(a);
-  if (H[8] == 0.0) {
-    return 1;
-  } else {
-    // normalize
-    double f = 1.0 / H[8];
-    for (i = 0; i < 8; i++) mat[i] = f * H[i];
-  }
-  return 0;
-}
-
 static int get_rand_indices(int npoints, int minpts, int *indices,
                             unsigned int *seed) {
   int i, j;
@@ -860,11 +579,6 @@ static int is_degenerate_affine(double *p) {
   return is_collinear3(p, p + 2, p + 4);
 }
 
-static int is_degenerate_homography(double *p) {
-  return is_collinear3(p, p + 2, p + 4) || is_collinear3(p, p + 2, p + 6) ||
-         is_collinear3(p, p + 4, p + 6) || is_collinear3(p + 2, p + 4, p + 6);
-}
-
 int ransac_translation(int *matched_points, int npoints,
                        int *num_inliers_by_motion, double *params_by_motion,
                        int num_desired_motions) {
@@ -887,30 +601,3 @@ int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
                 params_by_motion, num_desired_motions, 3, is_degenerate_affine,
                 find_affine, project_points_double_affine);
 }
-
-int ransac_homography(int *matched_points, int npoints,
-                      int *num_inliers_by_motion, double *params_by_motion,
-                      int num_desired_motions) {
-  return ransac(matched_points, npoints, num_inliers_by_motion,
-                params_by_motion, num_desired_motions, 4,
-                is_degenerate_homography, find_homography,
-                project_points_double_homography);
-}
-
-int ransac_hortrapezoid(int *matched_points, int npoints,
-                        int *num_inliers_by_motion, double *params_by_motion,
-                        int num_desired_motions) {
-  return ransac(matched_points, npoints, num_inliers_by_motion,
-                params_by_motion, num_desired_motions, 4,
-                is_degenerate_homography, find_hortrapezoid,
-                project_points_double_hortrapezoid);
-}
-
-int ransac_vertrapezoid(int *matched_points, int npoints,
-                        int *num_inliers_by_motion, double *params_by_motion,
-                        int num_desired_motions) {
-  return ransac(matched_points, npoints, num_inliers_by_motion,
-                params_by_motion, num_desired_motions, 4,
-                is_degenerate_homography, find_vertrapezoid,
-                project_points_double_vertrapezoid);
-}
diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h
index f611add36..1019055ed 100644
--- a/third_party/aom/av1/encoder/ransac.h
+++ b/third_party/aom/av1/encoder/ransac.h
@@ -25,17 +25,8 @@ typedef int (*RansacFunc)(int *matched_points, int npoints,
 
 /* Each of these functions fits a motion model from a set of
    corresponding points in 2 frames using RANSAC. */
-int ransac_homography(int *matched_points, int npoints,
-                      int *num_inliers_by_motion, double *params_by_motion,
-                      int num_motions);
 int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
                   double *params_by_motion, int num_motions);
-int ransac_hortrapezoid(int *matched_points, int npoints,
-                        int *num_inliers_by_motion, double *params_by_motion,
-                        int num_motions);
-int ransac_vertrapezoid(int *matched_points, int npoints,
-                        int *num_inliers_by_motion, double *params_by_motion,
-                        int num_motions);
 int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
                    double *params_by_motion, int num_motions);
 int ransac_translation(int *matched_points, int npoints,
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
index a90cb880e..ac9392fa1 100644
--- a/third_party/aom/av1/encoder/ratectrl.c
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -44,7 +44,6 @@
 #define MAX_BPB_FACTOR 50
 
 #define FRAME_OVERHEAD_BITS 200
-#if CONFIG_HIGHBITDEPTH
 #define ASSIGN_MINQ_TABLE(bit_depth, name)                   \
   do {                                                       \
     switch (bit_depth) {                                     \
@@ -58,13 +57,6 @@
         name = NULL;                                         \
     }                                                        \
   } while (0)
-#else
-#define ASSIGN_MINQ_TABLE(bit_depth, name) \
-  do {                                     \
-    (void)bit_depth;                       \
-    name = name##_8;                       \
-  } while (0)
-#endif
 
 // Tables relating active max Q to active min Q
 static int kf_low_motion_minq_8[QINDEX_RANGE];
@@ -74,7 +66,6 @@ static int arfgf_high_motion_minq_8[QINDEX_RANGE];
 static int inter_minq_8[QINDEX_RANGE];
 static int rtc_minq_8[QINDEX_RANGE];
 
-#if CONFIG_HIGHBITDEPTH
 static int kf_low_motion_minq_10[QINDEX_RANGE];
 static int kf_high_motion_minq_10[QINDEX_RANGE];
 static int arfgf_low_motion_minq_10[QINDEX_RANGE];
@@ -87,7 +78,6 @@ static int arfgf_low_motion_minq_12[QINDEX_RANGE];
 static int arfgf_high_motion_minq_12[QINDEX_RANGE];
 static int inter_minq_12[QINDEX_RANGE];
 static int rtc_minq_12[QINDEX_RANGE];
-#endif
 
 static int gf_high = 2000;
 static int gf_low = 400;
@@ -97,7 +87,6 @@ static int kf_low = 400;
 // How many times less pixels there are to encode given the current scaling.
 // Temporary replacement for rcf_mult and rate_thresh_mult.
 static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) {
-  (void)cpi;
   return (double)(cpi->oxcf.width * cpi->oxcf.height) / (width * height);
 }
 
@@ -140,33 +129,27 @@ void av1_rc_init_minq_luts(void) {
   init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
                  arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
                  inter_minq_8, rtc_minq_8, AOM_BITS_8);
-#if CONFIG_HIGHBITDEPTH
   init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
                  arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
                  inter_minq_10, rtc_minq_10, AOM_BITS_10);
   init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
                  arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
                  inter_minq_12, rtc_minq_12, AOM_BITS_12);
-#endif
 }
 
 // These functions use formulaic calculations to make playing with the
 // quantizer tables easier. If necessary they can be replaced by lookup
 // tables if and when things settle down in the experimental bitstream
 double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
-// Convert the index to a real Q value (scaled down to match old Q values)
-#if CONFIG_HIGHBITDEPTH
+  // Convert the index to a real Q value (scaled down to match old Q values)
   switch (bit_depth) {
-    case AOM_BITS_8: return av1_ac_quant(qindex, 0, bit_depth) / 4.0;
-    case AOM_BITS_10: return av1_ac_quant(qindex, 0, bit_depth) / 16.0;
-    case AOM_BITS_12: return av1_ac_quant(qindex, 0, bit_depth) / 64.0;
+    case AOM_BITS_8: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 4.0;
+    case AOM_BITS_10: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 16.0;
+    case AOM_BITS_12: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 64.0;
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1.0;
   }
-#else
-  return av1_ac_quant(qindex, 0, bit_depth) / 4.0;
-#endif
 }
 
 int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
@@ -196,12 +179,8 @@ int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
   const int min_frame_target =
       AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
-// Clip the frame target to the minimum setup value.
-#if CONFIG_EXT_REFS
+  // Clip the frame target to the minimum setup value.
   if (cpi->rc.is_src_frame_alt_ref) {
-#else
-  if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
-#endif  // CONFIG_EXT_REFS
     // If there is an active ARF at this location use the minimum
     // bits on this frame even if it is a constructed arf.
     // The active maximum quantizer insures that an appropriate
@@ -239,14 +218,10 @@ static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
   const AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
 
-// Non-viewable frames are a special case and are treated as pure overhead.
-#if CONFIG_EXT_REFS
+  // Non-viewable frames are a special case and are treated as pure overhead.
   // TODO(zoeliu): To further explore whether we should treat BWDREF_FRAME
   //               differently, since it is a no-show frame.
   if (!cm->show_frame && !rc->is_bwd_ref_frame)
-#else
-  if (!cm->show_frame)
-#endif  // CONFIG_EXT_REFS
     rc->bits_off_target -= encoded_frame_size;
   else
     rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
@@ -590,11 +565,9 @@ static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
     active_worst_quality =
         curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] * 2;
   } else {
-    if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame ||
-#if CONFIG_EXT_REFS
-                                      cpi->refresh_alt2_ref_frame ||
-#endif  // CONFIG_EXT_REFS
-                                      cpi->refresh_alt_ref_frame)) {
+    if (!rc->is_src_frame_alt_ref &&
+        (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+         cpi->refresh_alt_ref_frame)) {
       active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4
                                              : rc->last_q[INTER_FRAME];
     } else {
@@ -931,26 +904,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
 }
 
 int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
-  static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
-    1.00,  // INTER_NORMAL
-#if CONFIG_EXT_REFS
-    0.80,  // INTER_LOW
-    1.50,  // INTER_HIGH
-    1.25,  // GF_ARF_LOW
-#else
-    1.00,  // INTER_HIGH
-    1.50,  // GF_ARF_LOW
-#endif     // CONFIG_EXT_REFS
-    2.00,  // GF_ARF_STD
-    2.00,  // KF_STD
+  static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = {
+    INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME
   };
-  static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] =
-#if CONFIG_EXT_REFS
-      { INTER_FRAME, INTER_FRAME, INTER_FRAME,
-        INTER_FRAME, INTER_FRAME, KEY_FRAME };
-#else
-      { INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME };
-#endif  // CONFIG_EXT_REFS
   const AV1_COMMON *const cm = &cpi->common;
   int qdelta =
       av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
@@ -1020,11 +976,9 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
       active_best_quality +=
           av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
     }
-  } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame ||
-#if CONFIG_EXT_REFS
-                                           cpi->refresh_alt2_ref_frame ||
-#endif  // CONFIG_EXT_REFS
-                                           cpi->refresh_alt_ref_frame)) {
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+              cpi->refresh_alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
@@ -1044,11 +998,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
       active_best_quality = active_best_quality * 15 / 16;
 
     } else if (oxcf->rc_mode == AOM_Q) {
-#if CONFIG_EXT_REFS
       if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) {
-#else
-      if (!cpi->refresh_alt_ref_frame) {
-#endif  // CONFIG_EXT_REFS
         active_best_quality = cq_level;
       } else {
         active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
@@ -1080,11 +1030,9 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
   if ((cpi->oxcf.rc_mode != AOM_Q) &&
       (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
     if (frame_is_intra_only(cm) ||
-        (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame ||
-#if CONFIG_EXT_REFS
-                                       cpi->refresh_alt2_ref_frame ||
-#endif  // CONFIG_EXT_REFS
-                                       cpi->refresh_alt_ref_frame))) {
+        (!rc->is_src_frame_alt_ref &&
+         (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+          cpi->refresh_alt_ref_frame))) {
       active_best_quality -=
           (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
       active_worst_quality += (cpi->twopass.extend_maxq / 2);
@@ -1106,7 +1054,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
   }
 
   // Modify active_best_quality for downscaled normal frames.
-  if (!av1_frame_unscaled(cm) && !frame_is_kf_gf_arf(cpi)) {
+  if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
     int qdelta = av1_compute_qdelta_by_rate(
         rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth);
     active_best_quality =
@@ -1193,7 +1141,7 @@ static void rc_set_frame_target(AV1_COMP *cpi, int target, int width,
   rc->this_frame_target = target;
 
   // Modify frame size target when down-scaled.
-  if (!av1_frame_unscaled(cm))
+  if (av1_frame_scaled(cm))
     rc->this_frame_target =
         (int)(rc->this_frame_target * resize_rate_factor(cpi, width, height));
 
@@ -1217,21 +1165,13 @@ static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
 static void update_golden_frame_stats(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
 
-#if CONFIG_EXT_REFS
   // Update the Golden frame usage counts.
   // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
   //                   only the virtual indices for the reference frame will be
   //                   updated and cpi->refresh_golden_frame will still be zero.
   if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
-#else   // !CONFIG_EXT_REFS
-  // Update the Golden frame usage counts.
-  if (cpi->refresh_golden_frame) {
-#endif  // CONFIG_EXT_REFS
-
-#if CONFIG_EXT_REFS
     // We will not use internal overlay frames to replace the golden frame
     if (!rc->is_src_frame_ext_arf)
-#endif  // CONFIG_EXT_REFS
       // this frame refreshes means next frames don't unless specified by user
       rc->frames_since_golden = 0;
 
@@ -1248,11 +1188,7 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
     // Decrement count down till next gf
     if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
 
-#if CONFIG_EXT_REFS
   } else if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) {
-#else
-  } else if (!cpi->refresh_alt_ref_frame) {
-#endif  // CONFIG_EXT_REFS
     // Decrement count down till next gf
     if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
 
@@ -1282,10 +1218,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
   } else {
     if (!rc->is_src_frame_alt_ref &&
-        !(cpi->refresh_golden_frame ||
-#if CONFIG_EXT_REFS
-          cpi->refresh_alt2_ref_frame ||
-#endif  // CONFIG_EXT_REFS
+        !(cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
           cpi->refresh_alt_ref_frame)) {
       rc->last_q[INTER_FRAME] = qindex;
       rc->avg_frame_qindex[INTER_FRAME] =
@@ -1307,10 +1240,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
   // This is used to help set quality in forced key frames to reduce popping
   if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) ||
       (!rc->constrained_gf_group &&
-       (cpi->refresh_alt_ref_frame ||
-#if CONFIG_EXT_REFS
-        cpi->refresh_alt2_ref_frame ||
-#endif  // CONFIG_EXT_REFS
+       (cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
         (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
     rc->last_boosted_qindex = qindex;
   }
@@ -1320,7 +1250,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
 
   // Rolling monitors of whether we are over or underspending used to help
   // regulate min and Max Q in two pass.
-  if (!av1_frame_unscaled(cm))
+  if (av1_frame_scaled(cm))
     rc->this_frame_target =
         (int)(rc->this_frame_target /
               resize_rate_factor(cpi, cm->width, cm->height));
@@ -1337,14 +1267,10 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
 
   // Actual bits spent
   rc->total_actual_bits += rc->projected_frame_size;
-#if CONFIG_EXT_REFS
   // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
   //               differently here for rc->avg_frame_bandwidth.
   rc->total_target_bits +=
       (cm->show_frame || rc->is_bwd_ref_frame) ? rc->avg_frame_bandwidth : 0;
-#else
-  rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
-#endif  // CONFIG_EXT_REFS
 
   rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
 
@@ -1358,13 +1284,9 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
 
   if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
 
-#if CONFIG_EXT_REFS
   // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
   //               differently here for rc->avg_frame_bandwidth.
   if (cm->show_frame || rc->is_bwd_ref_frame) {
-#else
-  if (cm->show_frame) {
-#endif  // CONFIG_EXT_REFS
     rc->frames_since_key++;
     rc->frames_to_key--;
   }
@@ -1417,6 +1339,10 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int target;
+  int altref_enabled = is_altref_enabled(cpi);
+  int sframe_dist = cpi->oxcf.sframe_dist;
+  int sframe_mode = cpi->oxcf.sframe_mode;
+  int sframe_enabled = cpi->oxcf.sframe_enabled;
   // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
   if (!cpi->refresh_alt_ref_frame &&
       (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
@@ -1429,6 +1355,37 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
     rc->source_alt_ref_active = 0;
   } else {
     cm->frame_type = INTER_FRAME;
+    if (sframe_enabled) {
+      if (altref_enabled) {
+        if (sframe_mode == 1) {
+          // sframe_mode == 1: insert sframe if it matches altref frame.
+
+          if (cm->current_video_frame % sframe_dist == 0 &&
+              cm->frame_type != KEY_FRAME && cm->current_video_frame != 0 &&
+              cpi->refresh_alt_ref_frame) {
+            cm->frame_type = S_FRAME;
+          }
+        } else {
+          // sframe_mode != 1: if sframe will be inserted at the next available
+          // altref frame
+
+          if (cm->current_video_frame % sframe_dist == 0 &&
+              cm->frame_type != KEY_FRAME && cm->current_video_frame != 0) {
+            rc->sframe_due = 1;
+          }
+
+          if (rc->sframe_due && cpi->refresh_alt_ref_frame) {
+            cm->frame_type = S_FRAME;
+            rc->sframe_due = 0;
+          }
+        }
+      } else {
+        if (cm->current_video_frame % sframe_dist == 0 &&
+            cm->frame_type != KEY_FRAME && cm->current_video_frame != 0) {
+          cm->frame_type = S_FRAME;
+        }
+      }
+    }
   }
   if (rc->frames_till_gf_update_due == 0) {
     rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
@@ -1444,6 +1401,10 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
     rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
     rc->gfu_boost = DEFAULT_GF_BOOST;
   }
+
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    av1_cyclic_refresh_update_parameters(cpi);
+
   if (cm->frame_type == KEY_FRAME)
     target = calc_iframe_target_size_one_pass_vbr(cpi);
   else
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
index 8b410e778..81157ce72 100644
--- a/third_party/aom/av1/encoder/ratectrl.h
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -28,7 +28,6 @@ extern "C" {
 #define MAX_GF_INTERVAL 16
 #define FIXED_GF_INTERVAL 8  // Used in some testing modes only
 
-#if CONFIG_EXT_REFS
 typedef enum {
   INTER_NORMAL = 0,
   INTER_LOW = 1,
@@ -38,23 +37,20 @@ typedef enum {
   KF_STD = 5,
   RATE_FACTOR_LEVELS = 6
 } RATE_FACTOR_LEVEL;
-#else
-typedef enum {
-  INTER_NORMAL = 0,
-  INTER_HIGH = 1,
-  GF_ARF_LOW = 2,
-  GF_ARF_STD = 3,
-  KF_STD = 4,
-  RATE_FACTOR_LEVELS = 5
-} RATE_FACTOR_LEVEL;
-#endif  // CONFIG_EXT_REFS
+
+static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+  1.00,  // INTER_NORMAL
+  0.80,  // INTER_LOW
+  1.50,  // INTER_HIGH
+  1.25,  // GF_ARF_LOW
+  2.00,  // GF_ARF_STD
+  2.00,  // KF_STD
+};
 
 typedef struct {
   int resize_width;
   int resize_height;
-#if CONFIG_FRAME_SUPERRES
   uint8_t superres_denom;
-#endif  // CONFIG_FRAME_SUPERRES
 } size_params_type;
 
 typedef struct {
@@ -88,8 +84,8 @@ typedef struct {
   int source_alt_ref_pending;
   int source_alt_ref_active;
   int is_src_frame_alt_ref;
+  int sframe_due;
 
-#if CONFIG_EXT_REFS
   // Length of the bi-predictive frame group interval
   int bipred_group_interval;
 
@@ -99,7 +95,6 @@ typedef struct {
   int is_last_bipred_frame;
   int is_bipred_frame;
   int is_src_frame_ext_arf;
-#endif  // CONFIG_EXT_REFS
 
   int avg_frame_bandwidth;  // Average frame size target for clip
   int min_frame_bandwidth;  // Minimum allocation used for any frame
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.c b/third_party/aom/av1/encoder/ratectrl_xiph.c
index b9f827528..e69de29bb 100644
--- a/third_party/aom/av1/encoder/ratectrl_xiph.c
+++ b/third_party/aom/av1/encoder/ratectrl_xiph.c
@@ -1,1244 +0,0 @@
-/*
- * Copyright (c) 2001-2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include "av1/common/odintrin.h"
-#include "av1/encoder/ratectrl_xiph.h"
-
-#define OD_Q57(v) ((int64_t)((uint64_t)(v) << 57))
-#define OD_F_Q45(v) ((int64_t)(((v) * ((int64_t)1 << 45))))
-#define OD_F_Q12(v) ((int32_t)(((v) * ((int32_t)1 << 12))))
-
-/*A rough lookup table for tan(x), 0 <= x < pi/2.
-  The values are Q12 fixed-point and spaced at 5 degree intervals.
-  These decisions are somewhat arbitrary, but sufficient for the 2nd order
-   Bessel follower below.
-  Values of x larger than 85 degrees are extrapolated from the last interval,
-   which is way off, but "good enough".*/
-static uint16_t OD_ROUGH_TAN_LOOKUP[18] = { 0,     358,   722,  1098, 1491,
-                                            1910,  2365,  2868, 3437, 4096,
-                                            4881,  5850,  7094, 8784, 11254,
-                                            15286, 23230, 46817 };
-
-/*alpha is Q24 in the range [0,0.5).
-  The return values is 5.12.*/
-static int od_warp_alpha(int alpha) {
-  int i;
-  int d;
-  int t0;
-  int t1;
-  i = alpha * 36 >> 24;
-  if (i >= 17) i = 16;
-  t0 = OD_ROUGH_TAN_LOOKUP[i];
-  t1 = OD_ROUGH_TAN_LOOKUP[i + 1];
-  d = alpha * 36 - (i << 24);
-  return (int)((((int64_t)t0 << 32) + ((t1 - t0) << 8) * (int64_t)d) >> 32);
-}
-
-static const int64_t OD_ATANH_LOG2[32] = {
-  0x32B803473F7AD0F4LL, 0x2F2A71BD4E25E916LL, 0x2E68B244BB93BA06LL,
-  0x2E39FB9198CE62E4LL, 0x2E2E683F68565C8FLL, 0x2E2B850BE2077FC1LL,
-  0x2E2ACC58FE7B78DBLL, 0x2E2A9E2DE52FD5F2LL, 0x2E2A92A338D53EECLL,
-  0x2E2A8FC08F5E19B6LL, 0x2E2A8F07E51A485ELL, 0x2E2A8ED9BA8AF388LL,
-  0x2E2A8ECE2FE7384ALL, 0x2E2A8ECB4D3E4B1ALL, 0x2E2A8ECA94940FE8LL,
-  0x2E2A8ECA6669811DLL, 0x2E2A8ECA5ADEDD6ALL, 0x2E2A8ECA57FC347ELL,
-  0x2E2A8ECA57438A43LL, 0x2E2A8ECA57155FB4LL, 0x2E2A8ECA5709D510LL,
-  0x2E2A8ECA5706F267LL, 0x2E2A8ECA570639BDLL, 0x2E2A8ECA57060B92LL,
-  0x2E2A8ECA57060008LL, 0x2E2A8ECA5705FD25LL, 0x2E2A8ECA5705FC6CLL,
-  0x2E2A8ECA5705FC3ELL, 0x2E2A8ECA5705FC33LL, 0x2E2A8ECA5705FC30LL,
-  0x2E2A8ECA5705FC2FLL, 0x2E2A8ECA5705FC2FLL
-};
-
-static int od_ilog64(int64_t v) {
-  static const unsigned char OD_DEBRUIJN_IDX64[64] = {
-    0,  1,  2,  7,  3,  13, 8,  19, 4,  25, 14, 28, 9,  34, 20, 40,
-    5,  17, 26, 38, 15, 46, 29, 48, 10, 31, 35, 54, 21, 50, 41, 57,
-    63, 6,  12, 18, 24, 27, 33, 39, 16, 37, 45, 47, 30, 53, 49, 56,
-    62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58
-  };
-  int ret;
-  v |= v >> 1;
-  v |= v >> 2;
-  v |= v >> 4;
-  v |= v >> 8;
-  v |= v >> 16;
-  v |= v >> 32;
-  ret = (int)v & 1;
-  v = (v >> 1) + 1;
-  ret += OD_DEBRUIJN_IDX64[v * UINT64_C(0x218A392CD3D5DBF) >> 58 & 0x3F];
-  return ret;
-}
-
-/*Computes the binary exponential of logq57.
-  input: a log base 2 in Q57 format
-  output: a 64 bit integer in Q0 (no fraction) */
-static int64_t od_bexp64(int64_t logq57) {
-  int64_t w;
-  int64_t z;
-  int ipart;
-  ipart = (int)(logq57 >> 57);
-  if (ipart < 0) return 0;
-  if (ipart >= 63) return 0x7FFFFFFFFFFFFFFFLL;
-  z = logq57 - OD_Q57(ipart);
-  if (z) {
-    int64_t mask;
-    int64_t wlo;
-    int i;
-    /*C doesn't give us 64x64->128 muls, so we use CORDIC.
-      This is not particularly fast, but it's not being used in time-critical
-       code; it is very accurate.*/
-    /*z is the fractional part of the log in Q62 format.
-      We need 1 bit of headroom since the magnitude can get larger than 1
-       during the iteration, and a sign bit.*/
-    z <<= 5;
-    /*w is the exponential in Q61 format (since it also needs headroom and can
-       get as large as 2.0); we could get another bit if we dropped the sign,
-       but we'll recover that bit later anyway.
-      Ideally this should start out as
-        \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}}
-       but in order to guarantee convergence we have to repeat iterations 4,
-        13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/
-    w = 0x26A3D0E401DD846DLL;
-    for (i = 0;; i++) {
-      mask = -(z < 0);
-      w += ((w >> (i + 1)) + mask) ^ mask;
-      z -= (OD_ATANH_LOG2[i] + mask) ^ mask;
-      /*Repeat iteration 4.*/
-      if (i >= 3) break;
-      z *= 2;
-    }
-    for (;; i++) {
-      mask = -(z < 0);
-      w += ((w >> (i + 1)) + mask) ^ mask;
-      z -= (OD_ATANH_LOG2[i] + mask) ^ mask;
-      /*Repeat iteration 13.*/
-      if (i >= 12) break;
-      z *= 2;
-    }
-    for (; i < 32; i++) {
-      mask = -(z < 0);
-      w += ((w >> (i + 1)) + mask) ^ mask;
-      z = (z - ((OD_ATANH_LOG2[i] + mask) ^ mask)) * 2;
-    }
-    wlo = 0;
-    /*Skip the remaining iterations unless we really require that much
-       precision.
-      We could have bailed out earlier for smaller iparts, but that would
-       require initializing w from a table, as the limit doesn't converge to
-       61-bit precision until n=30.*/
-    if (ipart > 30) {
-      /*For these iterations, we just update the low bits, as the high bits
-         can't possibly be affected.
-        OD_ATANH_LOG2 has also converged (it actually did so one iteration
-         earlier, but that's no reason for an extra special case).*/
-      for (;; i++) {
-        mask = -(z < 0);
-        wlo += ((w >> i) + mask) ^ mask;
-        z -= (OD_ATANH_LOG2[31] + mask) ^ mask;
-        /*Repeat iteration 40.*/
-        if (i >= 39) break;
-        z <<= 1;
-      }
-      for (; i < 61; i++) {
-        mask = -(z < 0);
-        wlo += ((w >> i) + mask) ^ mask;
-        z = (z - ((OD_ATANH_LOG2[31] + mask) ^ mask)) << 1;
-      }
-    }
-    w = (w << 1) + wlo;
-  } else {
-    w = (int64_t)1 << 62;
-  }
-  if (ipart < 62) {
-    w = ((w >> (61 - ipart)) + 1) >> 1;
-  }
-  return w;
-}
-
-/*Computes the binary log of w
-  input: a 64-bit integer in Q0 (no fraction)
-  output: a 64-bit log in Q57 */
-static int64_t od_blog64(int64_t w) {
-  int64_t z;
-  int ipart;
-  if (w <= 0) return -1;
-  ipart = od_ilog64(w) - 1;
-  if (ipart > 61) {
-    w >>= ipart - 61;
-  } else {
-    w <<= 61 - ipart;
-  }
-  z = 0;
-  if (w & (w - 1)) {
-    int64_t x;
-    int64_t y;
-    int64_t u;
-    int64_t mask;
-    int i;
-    /*C doesn't give us 64x64->128 muls, so we use CORDIC.
-      This is not particularly fast, but it's not being used in time-critical
-       code; it is very accurate.*/
-    /*z is the fractional part of the log in Q61 format.*/
-    /*x and y are the cosh() and sinh(), respectively, in Q61 format.
-      We are computing z = 2*atanh(y/x) = 2*atanh((w - 1)/(w + 1)).*/
-    x = w + ((int64_t)1 << 61);
-    y = w - ((int64_t)1 << 61);
-    for (i = 0; i < 4; i++) {
-      mask = -(y < 0);
-      z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask;
-      u = x >> (i + 1);
-      x -= ((y >> (i + 1)) + mask) ^ mask;
-      y -= (u + mask) ^ mask;
-    }
-    /*Repeat iteration 4.*/
-    for (i--; i < 13; i++) {
-      mask = -(y < 0);
-      z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask;
-      u = x >> (i + 1);
-      x -= ((y >> (i + 1)) + mask) ^ mask;
-      y -= (u + mask) ^ mask;
-    }
-    /*Repeat iteration 13.*/
-    for (i--; i < 32; i++) {
-      mask = -(y < 0);
-      z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask;
-      u = x >> (i + 1);
-      x -= ((y >> (i + 1)) + mask) ^ mask;
-      y -= (u + mask) ^ mask;
-    }
-    /*OD_ATANH_LOG2 has converged.*/
-    for (; i < 40; i++) {
-      mask = -(y < 0);
-      z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask;
-      u = x >> (i + 1);
-      x -= ((y >> (i + 1)) + mask) ^ mask;
-      y -= (u + mask) ^ mask;
-    }
-    /*Repeat iteration 40.*/
-    for (i--; i < 62; i++) {
-      mask = -(y < 0);
-      z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask;
-      u = x >> (i + 1);
-      x -= ((y >> (i + 1)) + mask) ^ mask;
-      y -= (u + mask) ^ mask;
-    }
-    z = (z + 8) >> 4;
-  }
-  return OD_Q57(ipart) + z;
-}
-
-/*Convenience function converts Q57 value to a clamped 32-bit Q24 value
-  in: input in Q57 format.
-  Return: same number in Q24 */
-static int32_t od_q57_to_q24(int64_t in) {
-  int64_t ret;
-  ret = (in + ((int64_t)1 << 32)) >> 33;
-  /*0x80000000 is automatically converted to unsigned on 32-bit systems.
-    -0x7FFFFFFF-1 is needed to avoid "promoting" the whole expression to
-    unsigned.*/
-  return (int32_t)OD_CLAMPI(-0x7FFFFFFF - 1, ret, 0x7FFFFFFF);
-}
-
-/*Binary exponential of log_scale with 24-bit fractional precision and
-   saturation.
-  log_scale: A binary logarithm in Q57 format.
-  Return: The binary exponential in Q24 format, saturated to 2**31-1 if
-   log_scale was too large.*/
-static int32_t od_bexp64_q24(int64_t log_scale) {
-  if (log_scale < OD_Q57(8)) {
-    int64_t ret;
-    ret = od_bexp64(log_scale + OD_Q57(24));
-    return ret < 0x7FFFFFFF ? (int32_t)ret : 0x7FFFFFFF;
-  }
-  return 0x7FFFFFFF;
-}
-
-/*Re-initialize Bessel filter coefficients with the specified delay.
-  This does not alter the x/y state, but changes the reaction time of the
-   filter.
-  Altering the time constant of a reactive filter without alterning internal
-   state is something that has to be done carefuly, but our design operates at
-   high enough delays and with small enough time constant changes to make it
-   safe.*/
-static void od_iir_bessel2_reinit(od_iir_bessel2 *f, int delay) {
-  int alpha;
-  int64_t one48;
-  int64_t warp;
-  int64_t k1;
-  int64_t k2;
-  int64_t d;
-  int64_t a;
-  int64_t ik2;
-  int64_t b1;
-  int64_t b2;
-  /*This borrows some code from an unreleased version of Postfish.
-    See the recipe at http://unicorn.us.com/alex/2polefilters.html for details
-     on deriving the filter coefficients.*/
-  /*alpha is Q24*/
-  alpha = (1 << 24) / delay;
-  one48 = (int64_t)1 << 48;
-  /*warp is 7.12*/
-  warp = OD_MAXI(od_warp_alpha(alpha), 1);
-  /*k1 is 9.12*/
-  k1 = 3 * warp;
-  /*k2 is 16.24.*/
-  k2 = k1 * warp;
-  /*d is 16.15.*/
-  d = ((((1 << 12) + k1) << 12) + k2 + 256) >> 9;
-  /*a is 0.32, since d is larger than both 1.0 and k2.*/
-  a = (k2 << 23) / d;
-  /*ik2 is 25.24.*/
-  ik2 = one48 / k2;
-  /*b1 is Q56; in practice, the integer ranges between -2 and 2.*/
-  b1 = 2 * a * (ik2 - (1 << 24));
-  /*b2 is Q56; in practice, the integer ranges between -2 and 2.*/
-  b2 = (one48 << 8) - ((4 * a) << 24) - b1;
-  /*All of the filter parameters are Q24.*/
-  f->c[0] = (int32_t)((b1 + ((int64_t)1 << 31)) >> 32);
-  f->c[1] = (int32_t)((b2 + ((int64_t)1 << 31)) >> 32);
-  f->g = (int32_t)((a + 128) >> 8);
-}
-
-/*Initialize a 2nd order low-pass Bessel filter with the corresponding delay
-   and initial value.
-  value is Q24.*/
-static void od_iir_bessel2_init(od_iir_bessel2 *f, int delay, int32_t value) {
-  od_iir_bessel2_reinit(f, delay);
-  f->y[1] = f->y[0] = f->x[1] = f->x[0] = value;
-}
-
-static int64_t od_iir_bessel2_update(od_iir_bessel2 *f, int32_t x) {
-  int64_t c0;
-  int64_t c1;
-  int64_t g;
-  int64_t x0;
-  int64_t x1;
-  int64_t y0;
-  int64_t y1;
-  int64_t ya;
-  c0 = f->c[0];
-  c1 = f->c[1];
-  g = f->g;
-  x0 = f->x[0];
-  x1 = f->x[1];
-  y0 = f->y[0];
-  y1 = f->y[1];
-  ya = ((x + x0 * 2 + x1) * g + y0 * c0 + y1 * c1 + (1 << 23)) >> 24;
-  f->x[1] = (int32_t)x0;
-  f->x[0] = x;
-  f->y[1] = (int32_t)y0;
-  f->y[0] = (int32_t)ya;
-  return ya;
-}
-
-static void od_enc_rc_reset(od_rc_state *rc) {
-  int64_t npixels;
-  int64_t ibpp;
-  rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate);
-  /*Insane framerates or frame sizes mean insane bitrates.
-    Let's not get carried away.*/
-  if (rc->bits_per_frame > 0x400000000000LL) {
-    rc->bits_per_frame = (int64_t)0x400000000000LL;
-  } else {
-    if (rc->bits_per_frame < 32) {
-      rc->bits_per_frame = 32;
-    }
-  }
-  rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12);
-  rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay;
-  /*Start with a buffer fullness and fullness target of 50% */
-  rc->reservoir_target = (rc->reservoir_max + 1) >> 1;
-  rc->reservoir_fullness = rc->reservoir_target;
-  /*Pick exponents and initial scales for quantizer selection.*/
-  npixels = rc->frame_width * (int64_t)rc->frame_height;
-  rc->log_npixels = od_blog64(npixels);
-  ibpp = npixels / rc->bits_per_frame;
-  /*All of these initial scale/exp values are from Theora, and have not yet
-     been adapted to Daala, so they're certainly wrong.
-    The B-frame values especially are simply copies of the P-frame values.*/
-  if (ibpp < 1) {
-    rc->exp[OD_I_FRAME] = 59;
-    rc->log_scale[OD_I_FRAME] = od_blog64(1997) - OD_Q57(OD_COEFF_SHIFT);
-  } else if (ibpp < 2) {
-    rc->exp[OD_I_FRAME] = 55;
-    rc->log_scale[OD_I_FRAME] = od_blog64(1604) - OD_Q57(OD_COEFF_SHIFT);
-  } else {
-    rc->exp[OD_I_FRAME] = 48;
-    rc->log_scale[OD_I_FRAME] = od_blog64(834) - OD_Q57(OD_COEFF_SHIFT);
-  }
-  if (ibpp < 4) {
-    rc->exp[OD_P_FRAME] = 100;
-    rc->log_scale[OD_P_FRAME] = od_blog64(2249) - OD_Q57(OD_COEFF_SHIFT);
-  } else if (ibpp < 8) {
-    rc->exp[OD_P_FRAME] = 95;
-    rc->log_scale[OD_P_FRAME] = od_blog64(1751) - OD_Q57(OD_COEFF_SHIFT);
-  } else {
-    rc->exp[OD_P_FRAME] = 73;
-    rc->log_scale[OD_P_FRAME] = od_blog64(1260) - OD_Q57(OD_COEFF_SHIFT);
-  }
-  /*Golden P-frames both use the same log_scale and exp modeling
-     values as regular P-frames and the same scale follower.
-    For convenience in the rate calculation code, we maintain a copy of
-    the scale and exp values in OD_GOLDEN_P_FRAME.*/
-  rc->exp[OD_GOLDEN_P_FRAME] = rc->exp[OD_P_FRAME];
-  rc->log_scale[OD_GOLDEN_P_FRAME] = rc->log_scale[OD_P_FRAME];
-  rc->exp[OD_ALTREF_P_FRAME] = rc->exp[OD_P_FRAME];
-  rc->log_scale[OD_ALTREF_P_FRAME] = rc->log_scale[OD_P_FRAME];
-  /*We clamp the actual I and B frame delays to a minimum of 10 to work within
-     the range of values where later incrementing the delay works as designed.
-    10 is not an exact choice, but rather a good working trade-off.*/
-  rc->inter_p_delay = 10;
-  rc->inter_delay_target = rc->reservoir_frame_delay >> 1;
-  memset(rc->frame_count, 0, sizeof(rc->frame_count));
-  /*Drop-frame tracking is concerned with more than just the basic three frame
-     types.
-    It needs to track boosted and cut subtypes (of which there is only one
-     right now, OD_GOLDEN_P_FRAME). */
-  rc->prev_drop_count[OD_I_FRAME] = 0;
-  rc->log_drop_scale[OD_I_FRAME] = OD_Q57(0);
-  rc->prev_drop_count[OD_P_FRAME] = 0;
-  rc->log_drop_scale[OD_P_FRAME] = OD_Q57(0);
-  rc->prev_drop_count[OD_GOLDEN_P_FRAME] = 0;
-  rc->log_drop_scale[OD_GOLDEN_P_FRAME] = OD_Q57(0);
-  rc->prev_drop_count[OD_ALTREF_P_FRAME] = 0;
-  rc->log_drop_scale[OD_ALTREF_P_FRAME] = OD_Q57(0);
-  /*Set up second order followers, initialized according to corresponding
-     time constants.*/
-  od_iir_bessel2_init(&rc->scalefilter[OD_I_FRAME], 4,
-                      od_q57_to_q24(rc->log_scale[OD_I_FRAME]));
-  od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], rc->inter_p_delay,
-                      od_q57_to_q24(rc->log_scale[OD_P_FRAME]));
-  od_iir_bessel2_init(&rc->vfrfilter[OD_I_FRAME], 4,
-                      od_bexp64_q24(rc->log_drop_scale[OD_I_FRAME]));
-  od_iir_bessel2_init(&rc->vfrfilter[OD_P_FRAME], 4,
-                      od_bexp64_q24(rc->log_drop_scale[OD_P_FRAME]));
-  od_iir_bessel2_init(&rc->vfrfilter[OD_GOLDEN_P_FRAME], 4,
-                      od_bexp64_q24(rc->log_drop_scale[OD_GOLDEN_P_FRAME]));
-  od_iir_bessel2_init(&rc->vfrfilter[OD_ALTREF_P_FRAME], 4,
-                      od_bexp64_q24(rc->log_drop_scale[OD_ALTREF_P_FRAME]));
-}
-
-int od_enc_rc_resize(od_rc_state *rc) {
-  /*If encoding has not yet begun, reset the buffer state.*/
-  if (rc->cur_frame == 0) {
-    od_enc_rc_reset(rc);
-  } else {
-    int idt;
-    /*Otherwise, update the bounds on the buffer, but not the current
-       fullness.*/
-    rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate);
-    /*Insane framerates or frame sizes mean insane bitrates.
-      Let's not get carried away.*/
-    if (rc->bits_per_frame > 0x400000000000LL) {
-      rc->bits_per_frame = (int64_t)0x400000000000LL;
-    } else {
-      if (rc->bits_per_frame < 32) {
-        rc->bits_per_frame = 32;
-      }
-    }
-    rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12);
-    rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay;
-    rc->reservoir_target =
-        ((rc->reservoir_max + 1) >> 1) +
-        ((rc->bits_per_frame + 2) >> 2) *
-            OD_MINI(rc->keyframe_rate, rc->reservoir_frame_delay);
-    /*Update the INTER-frame scale filter delay.
-      We jump to it immediately if we've already seen enough frames; otherwise
-       it is simply set as the new target.*/
-    rc->inter_delay_target = idt = OD_MAXI(rc->reservoir_frame_delay >> 1, 10);
-    if (idt < OD_MINI(rc->inter_p_delay, rc->frame_count[OD_P_FRAME])) {
-      od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], idt,
-                          rc->scalefilter[OD_P_FRAME].y[0]);
-      rc->inter_p_delay = idt;
-    }
-  }
-  return 0;
-}
-
-int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms) {
-  if (rc->framerate <= 0) return 1;
-  if (rc->target_bitrate > 0) {
-    /*State has already been initialized; rather than reinitialize,
-      adjust the buffering for the new target rate. */
-    rc->target_bitrate = bitrate;
-    return od_enc_rc_resize(rc);
-  }
-  rc->target_quantizer = 0;
-  rc->target_bitrate = bitrate;
-  rc->rate_bias = 0;
-  if (bitrate > 0) {
-    /* The buffer size is clamped between [12, 256], this interval is short
-       enough to
-       allow reaction, but long enough to allow looking into the next GOP
-       (avoiding
-       the case where the last frames before an I-frame get starved).
-       The 12 frame minimum gives us some chance to distribute bit estimation
-       errors in the worst case. The 256 frame maximum means we'll require 8-10
-       seconds
-       of pre-buffering at 24-30 fps, which is not unreasonable.*/
-    rc->reservoir_frame_delay =
-        (int)OD_MINI((delay_ms / 1000) * rc->framerate, 256);
-    rc->drop_frames = 1;
-    rc->cap_overflow = 1;
-    rc->cap_underflow = 0;
-    rc->twopass_state = 0;
-    od_enc_rc_reset(rc);
-  }
-  return 0;
-}
-
-/*Scale the number of frames by the number of expected drops/duplicates.*/
-static int od_rc_scale_drop(od_rc_state *rc, int frame_type, int nframes) {
-  if (rc->prev_drop_count[frame_type] > 0 ||
-      rc->log_drop_scale[frame_type] > OD_Q57(0)) {
-    int64_t dup_scale;
-    dup_scale = od_bexp64(((rc->log_drop_scale[frame_type] +
-                            od_blog64(rc->prev_drop_count[frame_type] + 1)) >>
-                           1) +
-                          OD_Q57(8));
-    if (dup_scale < nframes << 8) {
-      int dup_scalei;
-      dup_scalei = (int)dup_scale;
-      if (dup_scalei > 0) {
-        nframes = ((nframes << 8) + dup_scalei - 1) / dup_scalei;
-      }
-    } else {
-      nframes = !!nframes;
-    }
-  }
-  return nframes;
-}
-
-/*Closed form version of frame determination code.
-  Used by rate control to predict frame types and subtypes into the future.
-  No side effects, may be called any number of times.
-  Note that it ignores end-of-file conditions; one-pass planning *should*
-   ignore end-of-file. */
-int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden,
-                  int *is_altref, int64_t *ip_count) {
-  int frame_type;
-  if (coding_frame_count == 0) {
-    *is_golden = 1;
-    *is_altref = 1;
-    *ip_count = 0;
-    frame_type = OD_I_FRAME;
-  } else {
-    int keyrate = rc->keyframe_rate;
-    if (rc->closed_gop) {
-      int ip_per_gop;
-      int gop_n;
-      int gop_i;
-      ip_per_gop = (keyrate - 1) / 2;
-      gop_n = coding_frame_count / keyrate;
-      gop_i = coding_frame_count - gop_n * keyrate;
-      *ip_count = gop_n * ip_per_gop + (gop_i > 0) + (gop_i - 1);
-      frame_type = gop_i == 0 ? OD_I_FRAME : OD_P_FRAME;
-    } else {
-      int ip_per_gop;
-      int gop_n;
-      int gop_i;
-      ip_per_gop = (keyrate);
-      gop_n = (coding_frame_count - 1) / keyrate;
-      gop_i = coding_frame_count - gop_n * keyrate - 1;
-      *ip_count = (coding_frame_count > 0) + gop_n * ip_per_gop + (gop_i);
-      frame_type = gop_i / 1 < ip_per_gop - 1 ? OD_P_FRAME : OD_I_FRAME;
-    }
-  }
-  *is_golden =
-      (*ip_count % rc->goldenframe_rate) == 0 || frame_type == OD_I_FRAME;
-  *is_altref = (*ip_count % rc->altref_rate) == 0 || frame_type == OD_I_FRAME;
-  return frame_type;
-}
-
-/*Count frames types forward from the current frame up to but not including
-   the last I-frame in reservoir_frame_delay.
-  If reservoir_frame_delay contains no I-frames (or the current frame is the
-   only I-frame), count all reservoir_frame_delay frames.
-  Returns the number of frames counted.
-  Right now, this implementation is simple, brute-force, and expensive.
-  It is also easy to understand and debug.
-  TODO: replace with a virtual FIFO that keeps running totals as
-   repeating the counting over-and-over will have a performance impact on
-   whole-file 2pass usage.*/
-static int frame_type_count(od_rc_state *rc, int nframes[OD_FRAME_NSUBTYPES]) {
-  int i;
-  int j;
-  int acc[OD_FRAME_NSUBTYPES];
-  int count;
-  int reservoir_frames;
-  int reservoir_frame_delay;
-  memset(nframes, 0, OD_FRAME_NSUBTYPES * sizeof(*nframes));
-  memset(acc, 0, sizeof(acc));
-  count = 0;
-  reservoir_frames = 0;
-#if 1
-  /*Go ahead and count past end-of-stream.
-    We won't nail the exact bitrate on short files that end with a partial
-     GOP, but we also won't [potentially] destroy the quality of the last few
-     frames in that same case when we suddenly find out the stream is ending
-     before the original planning horizon.*/
-  reservoir_frame_delay = rc->reservoir_frame_delay;
-#else
-  /*Don't count past the end of the stream (once we know where end-of-stream
-     is).*/
-  reservoir_frame_delay =
-      rc->end_of_input ? rc->input_size + 1 : rc->reservoir_frame_delay;
-#endif
-  for (i = 0; i < reservoir_frame_delay; i++) {
-    int frame_type;
-    int is_golden;
-    int is_altref;
-    int64_t dummy;
-    frame_type =
-        od_frame_type(rc, rc->cur_frame + i, &is_golden, &is_altref, &dummy);
-    switch (frame_type) {
-      case OD_I_FRAME: {
-        for (j = 0; j < OD_FRAME_NSUBTYPES; j++) nframes[j] += acc[j];
-        reservoir_frames += count;
-        memset(acc, 0, sizeof(acc));
-        acc[OD_I_FRAME] = 1;
-        count = 1;
-        break;
-      }
-      case OD_P_FRAME: {
-        if (is_golden) {
-          ++acc[OD_GOLDEN_P_FRAME];
-          ++count;
-        } else if (is_altref) {
-          ++acc[OD_ALTREF_P_FRAME];
-          ++count;
-        } else {
-          ++acc[OD_P_FRAME];
-          ++count;
-        }
-        break;
-      }
-    }
-  }
-  /*If there were no I-frames at all, or only the first frame was an I-frame,
-     the accumulators never flushed and still contain the counts for the
-     entire buffer.
-    In both these cases, we return these counts.
-    Otherwise, we discard what remains in the accumulators as they contain
-     the counts from and past the last I-frame.*/
-  if (reservoir_frames == 0) {
-    for (i = 0; i < OD_FRAME_NSUBTYPES; i++) nframes[i] = acc[i];
-    reservoir_frames += count;
-  }
-  return reservoir_frames;
-}
-
-static int convert_to_ac_quant(int q, int bit_depth) {
-  return lrint(av1_convert_qindex_to_q(q, bit_depth));
-}
-
-int od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc,
-                                            int is_golden_frame,
-                                            int is_altref_frame, int frame_type,
-                                            int *bottom_idx, int *top_idx) {
-  int frame_subtype;
-  int64_t log_cur_scale;
-  int lossy_quantizer_min;
-  int lossy_quantizer_max;
-  double mqp_i = OD_MQP_I;
-  double mqp_p = OD_MQP_P;
-  double mqp_gp = OD_MQP_GP;
-  double mqp_ap = OD_MQP_AP;
-  int reservoir_frames;
-  int nframes[OD_FRAME_NSUBTYPES];
-  int32_t mqp_Q12[OD_FRAME_NSUBTYPES];
-  int64_t dqp_Q45[OD_FRAME_NSUBTYPES];
-  /*Verify the closed-form frame type determination code matches what the
-     input queue set.*/
-  /*One pseudo-non-closed-form caveat:
-    Once we've seen end-of-input, the batched frame determination code
-     suppresses the last open-GOP's I-frame (since it would only be
-     useful for the next GOP, which doesn't exist).
-     Thus, don't check one the input queue is drained.*/
-  if (!rc->end_of_input) {
-    int closed_form_type;
-    int closed_form_golden;
-    int closed_form_altref;
-    int64_t closed_form_cur_frame;
-    closed_form_type =
-        od_frame_type(rc, rc->cur_frame, &closed_form_golden,
-                      &closed_form_altref, &closed_form_cur_frame);
-    OD_UNUSED(closed_form_type);
-    OD_UNUSED(is_altref_frame);
-    assert(closed_form_type == frame_type);
-    assert(closed_form_cur_frame == rc->cur_frame);
-    assert(closed_form_altref == is_altref_frame);
-    assert(closed_form_golden == is_golden_frame);
-  }
-
-  log_cur_scale = (int64_t)rc->scalefilter[frame_type].y[0] << 33;
-
-  /*Count the various types and classes of frames.*/
-  reservoir_frames = frame_type_count(rc, nframes);
-  nframes[OD_I_FRAME] = od_rc_scale_drop(rc, OD_I_FRAME, nframes[OD_I_FRAME]);
-  nframes[OD_P_FRAME] = od_rc_scale_drop(rc, OD_P_FRAME, nframes[OD_P_FRAME]);
-  nframes[OD_GOLDEN_P_FRAME] =
-      od_rc_scale_drop(rc, OD_GOLDEN_P_FRAME, nframes[OD_GOLDEN_P_FRAME]);
-  nframes[OD_ALTREF_P_FRAME] =
-      od_rc_scale_drop(rc, OD_ALTREF_P_FRAME, nframes[OD_ALTREF_P_FRAME]);
-
-  switch (rc->twopass_state) {
-    default: break;
-    case 1: {
-      /*Pass 1 mode: use a fixed qi value.*/
-      return rc->firstpass_quant;
-    } break;
-    case 2: {
-      int i;
-      int64_t scale_sum[OD_FRAME_NSUBTYPES];
-      int qti;
-      /*Pass 2 mode: we know exactly how much of each frame type there is in
-         the current buffer window, and have estimates for the scales.*/
-      for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
-        nframes[i] = rc->nframes[i];
-        nframes[i] = rc->nframes[i];
-        scale_sum[i] = rc->scale_sum[i];
-      }
-      /*If we're not using the same frame type as in pass 1 (because someone
-         changed the keyframe interval), remove that scale estimate.
-        We'll add in a replacement for the correct frame type below.*/
-      qti = rc->cur_metrics.frame_type;
-      if (qti != frame_type) {
-        nframes[qti]--;
-        scale_sum[qti] -= od_bexp64_q24(rc->cur_metrics.log_scale);
-      }
-      /*Compute log_scale estimates for each frame type from the pass-1 scales
-         we measured in the current window.*/
-      for (qti = 0; qti < OD_FRAME_NSUBTYPES; qti++) {
-        rc->log_scale[qti] = nframes[qti] > 0
-                                 ? od_blog64(scale_sum[qti]) -
-                                       od_blog64(nframes[qti]) - OD_Q57(24)
-                                 : -rc->log_npixels;
-      }
-      /*If we're not using the same frame type as in pass 1, add a scale
-         estimate for the corresponding frame using the current low-pass
-         filter value.
-        This is mostly to ensure we have a valid estimate even when pass 1 had
-         no frames of this type in the buffer window.
-        TODO: We could also plan ahead and figure out how many keyframes we'll
-         be forced to add in the current buffer window.*/
-      qti = rc->cur_metrics.frame_type;
-      if (qti != frame_type) {
-        int64_t scale;
-        scale = rc->log_scale[frame_type] < OD_Q57(23)
-                    ? od_bexp64(rc->log_scale[frame_type] + OD_Q57(24))
-                    : 0x7FFFFFFFFFFFLL;
-        scale *= nframes[frame_type];
-        nframes[frame_type]++;
-        scale += od_bexp64_q24(log_cur_scale >> 33);
-        rc->log_scale[frame_type] =
-            od_blog64(scale) - od_blog64(nframes[qti]) - OD_Q57(24);
-      } else {
-        log_cur_scale = (int64_t)rc->cur_metrics.log_scale << 33;
-      }
-    } break;
-  }
-
-  /*Quantizer selection sticks to the codable, lossy portion of the quantizer
-    range.*/
-  lossy_quantizer_min = convert_to_ac_quant(rc->minq, rc->bit_depth);
-  lossy_quantizer_max = convert_to_ac_quant(rc->maxq, rc->bit_depth);
-  frame_subtype = frame_type;
-  /*Stash quantizer modulation by frame type.*/
-  mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i);
-  mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p);
-  mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp);
-  mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap);
-  dqp_Q45[OD_I_FRAME] = OD_F_Q45(OD_DQP_I);
-  dqp_Q45[OD_P_FRAME] = OD_F_Q45(OD_DQP_P);
-  dqp_Q45[OD_GOLDEN_P_FRAME] = OD_F_Q45(OD_DQP_GP);
-  dqp_Q45[OD_ALTREF_P_FRAME] = OD_F_Q45(OD_DQP_AP);
-  /*Is rate control active?*/
-  if (rc->target_bitrate <= 0) {
-    /*Rate control is not active; derive quantizer directly from
-      quality parameter and frame type. */
-    /*Can't use the OD_LOSSLESS macro, as it uses state.quantizer to intuit,
-      and we've not set it yet.*/
-    if (rc->quality == 0) {
-      /*Lossless coding requested.*/
-      rc->base_quantizer = 0;
-      rc->target_quantizer = 0;
-    } else {
-      int64_t log_quantizer;
-
-      /* Adjust the modulation constants using the last frame's quantizer. */
-      double mqp_delta = (255 - rc->target_quantizer) / 2000.0f;
-      mqp_i -= mqp_delta;
-      mqp_p += mqp_delta;
-      mqp_gp -= mqp_delta;
-      mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i);
-      mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p);
-      mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp);
-      mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap);
-
-      if (rc->quality == -1) {
-        /*A quality of -1 means quality was unset; use a default.*/
-        rc->base_quantizer = convert_to_ac_quant(10, rc->bit_depth);
-      } else {
-        rc->base_quantizer = convert_to_ac_quant(rc->quality, rc->bit_depth);
-      }
-
-      if (rc->periodic_boosts && !is_golden_frame) {
-        int pattern_rate = (rc->goldenframe_rate >> 1);
-        int dist_to_golden = rc->cur_frame % pattern_rate;
-        int dist_away_golden = pattern_rate - dist_to_golden;
-        int boost = dist_to_golden;
-        if (dist_away_golden > dist_to_golden) boost = dist_away_golden;
-        boost -= pattern_rate;
-        boost *= (rc->base_quantizer) / OD_PERIODIC_BOOST_DIV;
-        rc->base_quantizer = rc->base_quantizer + boost;
-      }
-
-      /*As originally written, qp modulation is applied to the coded quantizer.
-        Because we now have and use a more precise target quantizer for various
-        calculation, that needs to be modulated as well.
-        Calculate what is, effectively, a fractional coded quantizer. */
-      /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/
-      log_quantizer = od_blog64(rc->base_quantizer) - OD_Q57(OD_COEFF_SHIFT);
-      /*log_quantizer to Q21.*/
-      log_quantizer >>= 36;
-      /*scale log quantizer, result is Q33.*/
-      log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12;
-      /*Add Q33 offset to Q33 log_quantizer.*/
-      log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12;
-      /*Modulate quantizer according to frame type; result is Q45.*/
-      log_quantizer *= mqp_Q12[frame_subtype];
-      /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/
-      log_quantizer += dqp_Q45[frame_subtype];
-      /*Back to log2 quantizer in Q57.*/
-      log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) *
-                          OD_LOG_QUANTIZER_EXP_Q12 +
-                      OD_Q57(OD_COEFF_SHIFT);
-      /*Convert Q57 log2 quantizer to unclamped linear target quantizer value.*/
-      rc->target_quantizer = od_bexp64(log_quantizer);
-    }
-  } else {
-    int clamp;
-    int64_t rate_bias;
-    int64_t rate_total;
-    int base_quantizer;
-    int64_t log_quantizer;
-    int qlo;
-    int qhi;
-    int i;
-    /*We clamp the allowed amount of qi change (after initialization).*/
-    clamp = rc->cur_frame > 0;
-    /*Figure out how to re-distribute bits so that we hit our fullness target
-       before the last keyframe in our current buffer window (after the current
-       frame), or the end of the buffer window, whichever comes first.*/
-    /*Single pass only right now.*/
-    /*If we've been missing our target, add a penalty term.*/
-    rate_bias = (rc->rate_bias / (rc->cur_frame + 1000)) * reservoir_frames;
-    /*rate_total is the total bits available over the next
-       reservoir_frames frames.*/
-    rate_total = rc->reservoir_fullness - rc->reservoir_target + rate_bias +
-                 reservoir_frames * rc->bits_per_frame;
-    /*Find a target quantizer that meets our rate target for the specific mix
-       of frame types we'll have over the next frame_delay frames.
-      We model the rate<->quantizer relationship as:
-       rate = scale*(quantizer**-exp)
-      In this case, we have our desired rate, an exponent selected in setup,
-       and a scale that's been measured over our frame history, so we're
-       solving for the quantizer.
-      Exponentiation with arbitrary exponents is expensive, so we work in
-       the binary log domain (binary exp and log aren't too bad):
-       rate = e2(log2_scale - log2_quantizer * exp)
-      There's no easy closed form solution, so we bisection search for it.*/
-    /*We do not currently allow rate control to select lossless encoding.*/
-    qlo = 1;
-    /*If there's a quality specified, it's used to select the
-       coarsest base quantizer we can select.
-      Otherwise we can use up to and including the coarsest codable
-       quantizer.*/
-    if (rc->quality > 0)
-      qhi = convert_to_ac_quant(rc->quality, rc->bit_depth);
-    else
-      qhi = lossy_quantizer_max;
-    base_quantizer = (qlo + qhi) >> 1;
-    while (qlo < qhi) {
-      volatile int64_t log_base_quantizer;
-      int64_t diff;
-      int64_t bits;
-      /*Count bits contributed by each frame type using the model.*/
-      bits = 0;
-      log_base_quantizer = od_blog64(base_quantizer);
-      for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
-        /*Modulate base quantizer by frame type.*/
-        /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/
-        log_quantizer = log_base_quantizer - OD_Q57(OD_COEFF_SHIFT);
-        /*log_quantizer to Q21.*/
-        log_quantizer >>= 36;
-        /*scale log quantizer, result is Q33.*/
-        log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12;
-        /*Add Q33 offset to Q33 log_quantizer.*/
-        log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12;
-        /*Modulate quantizer according to frame type; result is Q45.*/
-        log_quantizer *= mqp_Q12[i];
-        /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/
-        log_quantizer += dqp_Q45[i];
-        /*Back to log2 quantizer in Q57.*/
-        log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) *
-                            OD_LOG_QUANTIZER_EXP_Q12 +
-                        OD_Q57(OD_COEFF_SHIFT);
-        /*Clamp modulated quantizer values.*/
-        log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer,
-                                  od_blog64(lossy_quantizer_max));
-        /* All the fields here are Q57 except for the exponent which is Q6.*/
-        bits += nframes[i] * od_bexp64(rc->log_scale[i] + rc->log_npixels -
-                                       (log_quantizer >> 6) * rc->exp[i]);
-      }
-      diff = bits - rate_total;
-      if (diff > 0) {
-        qlo = base_quantizer + 1;
-      } else if (diff < 0) {
-        qhi = base_quantizer - 1;
-      } else {
-        break;
-      }
-      base_quantizer = (qlo + qhi) >> 1;
-    }
-    /*If this was not one of the initial frames, limit the change in base
-       quantizer to within [0.8*Q,1.2*Q], where Q is the previous frame's
-       base quantizer.*/
-    if (clamp) {
-      base_quantizer = OD_CLAMPI((rc->base_quantizer * 0x0CCCD + 0x8000) >> 16,
-                                 base_quantizer,
-                                 (rc->base_quantizer * 0x13333 + 0x8000) >> 16);
-    }
-    /*Modulate chosen base quantizer to produce target quantizer.*/
-    log_quantizer = od_blog64(base_quantizer);
-    /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/
-    log_quantizer -= OD_Q57(OD_COEFF_SHIFT);
-    /*log_quantizer to Q21.*/
-    log_quantizer >>= 36;
-    /*scale log quantizer, result is Q33.*/
-    log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12;
-    /*Add Q33 offset to Q33 log_quantizer.*/
-    log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12;
-    /*Modulate quantizer according to frame type; result is Q45.*/
-    log_quantizer *= mqp_Q12[frame_subtype];
-    /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/
-    log_quantizer += dqp_Q45[frame_subtype];
-    /*Back to log2 quantizer in Q57.*/
-    log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) *
-                        OD_LOG_QUANTIZER_EXP_Q12 +
-                    OD_Q57(OD_COEFF_SHIFT);
-    /*Clamp modulated quantizer values.*/
-    log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer,
-                              od_blog64(lossy_quantizer_max));
-    /*The above allocation looks only at the total rate we'll accumulate in
-       the next reservoir_frame_delay frames.
-      However we could overflow the bit reservoir on the very next frame, so
-       check for that here if we're not using a soft target.*/
-    if (rc->cap_overflow) {
-      int64_t margin;
-      int64_t soft_limit;
-      int64_t log_soft_limit;
-      int64_t log_scale_pixels;
-      int64_t exp;
-      int64_t log_qexp;
-      /*Allow 3% of the buffer for prediction error.
-        This should be plenty, and we don't mind if we go a bit over; we only
-         want to keep these bits from being completely wasted.*/
-      margin = (rc->reservoir_max + 31) >> 5;
-      /*We want to use at least this many bits next frame.*/
-      soft_limit = rc->reservoir_fullness + rc->bits_per_frame -
-                   (rc->reservoir_max - margin);
-      log_soft_limit = od_blog64(soft_limit);
-      /*If we're predicting we won't use that many bits...*/
-      log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels;
-      exp = rc->exp[frame_subtype];
-      log_qexp = (log_quantizer >> 6) * exp;
-      if (log_scale_pixels - log_qexp < log_soft_limit) {
-        /*Scale the adjustment based on how far into the margin we are.*/
-        log_qexp += ((log_scale_pixels - log_soft_limit - log_qexp) >> 32) *
-                    (OD_MINI(margin, soft_limit) << 32) / margin;
-        log_quantizer = (((log_qexp + (exp >> 1)) / exp) << 6);
-      }
-    }
-    /*We just checked we don't overflow the reservoir next frame, now check
-       we don't underflow and bust the budget (when not using a soft target).
-      Disabled when a quality bound is set; if we saturate quantizer to the
-       maximum possible size when we have a limiting max quality, the
-       resulting lambda can cause strange behavior.*/
-    if (rc->quality == -1) {
-      int64_t exp;
-      int64_t log_qexp;
-      int64_t log_scale_pixels;
-      int64_t log_hard_limit;
-      /*Compute the maximum number of bits we can use in the next frame.
-        Allow 50% of the rate for a single frame for prediction error.
-        This may not be enough for keyframes or sudden changes in
-         complexity.*/
-      log_hard_limit =
-          od_blog64(rc->reservoir_fullness + (rc->bits_per_frame >> 1));
-      /*If we're predicting we'll use more than this...*/
-      log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels;
-      exp = rc->exp[frame_subtype];
-      log_qexp = (log_quantizer >> 6) * exp;
-      if (log_scale_pixels - log_qexp > log_hard_limit) {
-        /*Force the target to hit our limit exactly.*/
-        log_qexp = log_scale_pixels - log_hard_limit;
-        log_quantizer = (log_qexp + (exp >> 1)) / exp << 6;
-        /*If that target is unreasonable, oh well; we'll have to drop.*/
-        log_quantizer = OD_MAXI(log_quantizer, od_blog64(lossy_quantizer_max));
-      }
-    }
-    /*Compute a final estimate of the number of bits we plan to use, update
-       the running rate bias measurement.*/
-    {
-      int64_t log_qexp;
-      int64_t log_scale_pixels;
-      log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels;
-      log_qexp = (log_quantizer >> 6) * rc->exp[frame_subtype];
-      rc->rate_bias += od_bexp64(log_scale_pixels - log_qexp);
-    }
-    rc->target_quantizer = od_bexp64(log_quantizer);
-    /*The various cappings and adjustments may have altered the log_quantizer
-       target significantly.
-      We can either update the base quantizer to be consistent with the
-       target or let it track separately.
-      Theora behavior effectively keeps them consistent, as it regenerates
-       the effective base quantizer from the target each frame rather than
-       saving both.
-      For Daala, it's easier to allow them to track separately.
-      For now, allow them to track separately and see how it behaves.*/
-    rc->base_quantizer = base_quantizer;
-  }
-  *bottom_idx = lossy_quantizer_min;
-  *top_idx = lossy_quantizer_max;
-  rc->target_quantizer = av1_qindex_from_ac(
-      OD_CLAMPI(lossy_quantizer_min, rc->target_quantizer, lossy_quantizer_max),
-      rc->bit_depth);
-  return rc->target_quantizer;
-}
-
-int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame,
-                           int is_altref_frame, int frame_type, int droppable) {
-  int dropped;
-  dropped = 0;
-  /*Update rate control only if rate control is active.*/
-  if (rc->target_bitrate > 0) {
-    int64_t log_scale;
-    int frame_subtype;
-    frame_subtype = frame_type;
-    /*Track non-golden and golden P frame drops separately.*/
-    if (is_golden_frame && frame_type == OD_P_FRAME)
-      frame_subtype = OD_GOLDEN_P_FRAME;
-    else if (is_altref_frame && frame_type == OD_P_FRAME)
-      frame_subtype = OD_ALTREF_P_FRAME;
-    if (bits <= 0) {
-      /*We didn't code any blocks in this frame.*/
-      log_scale = OD_Q57(-64);
-      bits = 0;
-      ++rc->prev_drop_count[frame_subtype];
-    } else {
-      int64_t log_bits;
-      int64_t log_qexp;
-      /*Compute the estimated scale factor for this frame type.*/
-      log_bits = od_blog64(bits);
-      log_qexp = od_blog64(rc->target_quantizer);
-      log_qexp = (log_qexp >> 6) * (rc->exp[frame_type]);
-      log_scale = OD_MINI(log_bits - rc->log_npixels + log_qexp, OD_Q57(16));
-    }
-
-    switch (rc->twopass_state) {
-      case 1: {
-        int golden, altref;
-        int64_t ipc;
-        rc->cur_metrics.frame_type =
-            od_frame_type(rc, rc->cur_frame, &golden, &altref, &ipc);
-        /*Pass 1 mode: save the metrics for this frame.*/
-        rc->cur_metrics.log_scale = od_q57_to_q24(log_scale);
-      } break;
-      case 2: {
-        /*Pass 2 mode:*/
-        int m_frame_type = rc->cur_metrics.frame_type;
-        rc->nframes[m_frame_type]--;
-        rc->scale_sum[m_frame_type] -= od_bexp64_q24(rc->cur_metrics.log_scale);
-      } break;
-    }
-
-    if (bits > 0) {
-      od_iir_bessel2 *f;
-      /*If this is the first example of the given frame type we've
-         seen, we immediately replace the default scale factor guess
-         with the estimate we just computed using the first frame.*/
-      if (rc->frame_count[frame_type] == 0) {
-        f = rc->scalefilter + frame_type;
-        f->y[1] = f->y[0] = f->x[1] = f->x[0] = od_q57_to_q24(log_scale);
-        rc->log_scale[frame_type] = log_scale;
-      } else {
-        /*Lengthen the time constant for the inter filters as we collect more
-           frame statistics, until we reach our target.*/
-        if (frame_type != OD_I_FRAME &&
-            rc->inter_p_delay < rc->inter_delay_target &&
-            rc->frame_count[frame_type] >= rc->inter_p_delay) {
-          od_iir_bessel2_reinit(&rc->scalefilter[frame_type],
-                                ++rc->inter_p_delay);
-        }
-        /*Update the low-pass scale filter for this frame type
-           regardless of whether or not we drop this frame.*/
-        rc->log_scale[frame_type] =
-            od_iir_bessel2_update(rc->scalefilter + frame_type,
-                                  od_q57_to_q24(log_scale))
-            << 33;
-      }
-      /*If this frame busts our budget, it must be dropped.*/
-      if (droppable && rc->reservoir_fullness + rc->bits_per_frame < bits) {
-        ++rc->prev_drop_count[frame_subtype];
-        bits = 0;
-        dropped = 1;
-      } else {
-        uint32_t drop_count;
-        /*Update a low-pass filter to estimate the "real" frame rate taking
-           drops into account.
-          This is only done if the frame is coded, as it needs the final
-           count of dropped frames.*/
-        drop_count = rc->prev_drop_count[frame_subtype] + 1;
-        if (drop_count > 0x7F) {
-          drop_count = 0x7FFFFFFF;
-        } else {
-          drop_count <<= 24;
-        }
-        rc->log_drop_scale[frame_subtype] =
-            od_blog64(od_iir_bessel2_update(rc->vfrfilter + frame_subtype,
-                                            drop_count)) -
-            OD_Q57(24);
-        /*Zero the drop count for this frame.
-          It will be increased if we drop frames.*/
-        rc->prev_drop_count[frame_subtype] = 0;
-      }
-      /*Increment the frame count for filter adaptation purposes.*/
-      if (!rc->twopass_state) rc->frame_count[frame_type]++;
-    }
-    rc->reservoir_fullness += rc->bits_per_frame - bits;
-    /*If we're too quick filling the buffer and overflow is capped,
-      that rate is lost forever.*/
-    if (rc->cap_overflow && rc->reservoir_fullness > rc->reservoir_max) {
-      rc->reservoir_fullness = rc->reservoir_max;
-    }
-    /*If we're too quick draining the buffer and underflow is capped,
-      don't try to make up that rate later.*/
-    if (rc->cap_underflow && rc->reservoir_fullness < 0) {
-      rc->reservoir_fullness = 0;
-    }
-    /*Adjust the bias for the real bits we've used.*/
-    rc->rate_bias -= bits;
-  }
-  return dropped;
-}
-
-static INLINE void od_rc_buffer_val(od_rc_state *rc, int64_t val, int bytes) {
-  while (bytes-- > 0) {
-    rc->twopass_buffer[rc->twopass_buffer_bytes++] = (uint8_t)(val & 0xFF);
-    val >>= 8;
-  }
-}
-
-static INLINE int64_t od_rc_unbuffer_val(od_rc_state *rc, int bytes) {
-  int64_t ret = 0;
-  int shift = 0;
-  while (bytes-- > 0) {
-    ret |= ((int64_t)rc->twopass_buffer[rc->twopass_buffer_bytes++]) << shift;
-    shift += 8;
-  }
-  return ret;
-}
-
-int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list,
-                        int summary) {
-  int i;
-  struct aom_codec_cx_pkt pkt;
-  rc->twopass_buffer = rc->firstpass_buffer;
-  rc->twopass_buffer_bytes = 0;
-  if (!rc->twopass_state) {
-    rc->twopass_state = 1;
-    for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
-      rc->frame_count[i] = 0;
-      rc->exp[i] = 0;
-      rc->scale_sum[i] = 0;
-    }
-  }
-  if (summary) {
-    od_rc_buffer_val(rc, OD_RC_2PASS_MAGIC, 4);
-    od_rc_buffer_val(rc, OD_RC_2PASS_VERSION, 1);
-    for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
-      od_rc_buffer_val(rc, rc->frame_count[i], 4);
-      od_rc_buffer_val(rc, rc->exp[i], 4);
-      od_rc_buffer_val(rc, rc->scale_sum[i], 8);
-    }
-  } else {
-    int frame_type = rc->cur_metrics.frame_type;
-    rc->scale_sum[frame_type] += od_bexp64_q24(rc->cur_metrics.log_scale);
-    rc->frame_count[frame_type]++;
-    od_rc_buffer_val(rc, rc->cur_metrics.frame_type, 1);
-    od_rc_buffer_val(rc, rc->cur_metrics.log_scale, 4);
-  }
-  pkt.data.twopass_stats.buf = rc->firstpass_buffer;
-  pkt.data.twopass_stats.sz = rc->twopass_buffer_bytes;
-  pkt.kind = AOM_CODEC_STATS_PKT;
-  aom_codec_pkt_list_add(pkt_list, &pkt);
-  return 0;
-}
-
-int od_enc_rc_2pass_in(od_rc_state *rc) {
-  /* Enable pass 2 mode if this is the first call. */
-  if (rc->twopass_state == 0) {
-    uint32_t i, total_frames = 0;
-
-    if (!rc->twopass_allframes_buf ||
-        rc->twopass_allframes_buf_size < OD_RC_2PASS_MIN)
-      return -1;
-
-    /* Find summary packet at the end */
-    rc->twopass_buffer = rc->twopass_allframes_buf;
-    rc->twopass_buffer +=
-        rc->twopass_allframes_buf_size - OD_RC_2PASS_SUMMARY_SZ;
-    rc->twopass_buffer_bytes = 0;
-
-    if (od_rc_unbuffer_val(rc, 4) != OD_RC_2PASS_MAGIC) return -1;
-    if (od_rc_unbuffer_val(rc, 1) != OD_RC_2PASS_VERSION) return -1;
-
-    for (i = 0; i < OD_FRAME_NSUBTYPES; i++) {
-      rc->frame_count[i] = od_rc_unbuffer_val(rc, 4);
-      rc->exp[i] = od_rc_unbuffer_val(rc, 4);
-      rc->scale_sum[i] = od_rc_unbuffer_val(rc, 8);
-      rc->nframes[i] = rc->frame_count[i];
-      total_frames += rc->frame_count[i];
-    }
-
-    if (total_frames < 1) return -1;
-
-    if (total_frames * OD_RC_2PASS_PACKET_SZ > rc->twopass_allframes_buf_size)
-      return -1;
-
-    od_enc_rc_reset(rc);
-
-    /* Everything looks ok */
-    rc->twopass_buffer = rc->twopass_allframes_buf;
-    rc->twopass_state = 2;
-    rc->twopass_buffer_bytes = 0;
-  }
-
-  rc->cur_metrics.frame_type = od_rc_unbuffer_val(rc, 1);
-  rc->cur_metrics.log_scale = od_rc_unbuffer_val(rc, 4);
-
-  return 0;
-}
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.h b/third_party/aom/av1/encoder/ratectrl_xiph.h
index a4a9052fa..e69de29bb 100644
--- a/third_party/aom/av1/encoder/ratectrl_xiph.h
+++ b/third_party/aom/av1/encoder/ratectrl_xiph.h
@@ -1,200 +0,0 @@
-/*
- * Copyright (c) 2001-2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#if !defined(_ratectrl_xiph_H)
-#define _ratectrl_xiph_H (1)
-
-#include "av1/encoder/ratectrl.h"
-#include "aom/internal/aom_codec_internal.h"
-
-/*Frame types.*/
-#define OD_I_FRAME (0)
-#define OD_P_FRAME (1)
-#define OD_GOLDEN_P_FRAME (2)
-#define OD_ALTREF_P_FRAME (3)
-
-#define OD_FRAME_NSUBTYPES (OD_ALTREF_P_FRAME + 1)
-
-/* Periodic boost (in between golden frames) strength - lower is more */
-#define OD_PERIODIC_BOOST_DIV (10)
-
-/* Constants for frame QP modulation <- tweak these
- * Adjusts how the rate control system decides the quantizers per frame
- * (sub)type */
-#define OD_MQP_I (0.98)
-#define OD_MQP_P (1.06)
-#define OD_MQP_GP (0.99)
-#define OD_MQP_AP (0.92)
-#define OD_DQP_I (-2)
-#define OD_DQP_P (0)
-#define OD_DQP_GP (-2)
-#define OD_DQP_AP (-2)
-
-/*Fractional_coded_quantizer ~=
-   log2(quantizer / (1 << OD_COEFF_SHIFT))*6.307 + 6.235*/
-/*Base/scale factor for linear quantizer to fractional coded quantizer
-   conversion (6.307 * 2^12) */
-#define OD_LOG_QUANTIZER_BASE_Q12 (0x0064EB)
-/*Inverse of above scale factor.*/
-#define OD_LOG_QUANTIZER_EXP_Q12 (0x000289)
-/*Offset for linear quantizer to fractional coded quantizer
-   conversion (6.235 * 2^45) */
-#define OD_LOG_QUANTIZER_OFFSET_Q45 (0x0000C7851EB851ECLL)
-
-#define OD_RC_2PASS_MAGIC (0x53015641) /* [A, V, 1, S] in little endian */
-#define OD_RC_2PASS_SUMMARY_SZ (4 + 1 + (4 + 4 + 8) * OD_FRAME_NSUBTYPES)
-#define OD_RC_2PASS_PACKET_SZ (1 + 4)
-#define OD_RC_2PASS_MIN (OD_RC_2PASS_PACKET_SZ + OD_RC_2PASS_SUMMARY_SZ)
-#define OD_RC_2PASS_VERSION (1)
-
-/*A 2nd order low-pass Bessel follower.
-  We use this for rate control because it has fast reaction time, but is
-   critically damped.*/
-typedef struct od_iir_bessel2 {
-  int32_t c[2];
-  int64_t g;
-  int32_t x[2];
-  int32_t y[2];
-} od_iir_bessel2;
-
-/* The 2-pass metrics associated with a single frame. */
-typedef struct od_frame_metrics {
-  /*The log base 2 of the scale factor for this frame in Q24 format.*/
-  int64_t log_scale;
-  /*The frame type from pass 1.*/
-  unsigned frame_type : 1;
-} od_frame_metrics;
-
-/*Rate control setup and working state information.*/
-typedef struct od_rc_state {
-  /* Image format */
-  int frame_width;
-  int frame_height;
-  int bit_depth;
-
-  /* Framerate */
-  double framerate;
-  /* Keyframe rate */
-  int keyframe_rate;
-  /* Golden frame period */
-  int goldenframe_rate;
-  /* Altref frame period */
-  int altref_rate;
-  /*The target bit-rate in bits per second.*/
-  int64_t target_bitrate;
-  /* Quality level for non-bitrate-targeting */
-  int quality;
-  /* Copied from oxcf->frame_periodic_boost */
-  int periodic_boosts;
-  /* Max Q */
-  int maxq;
-  /* Min Q */
-  int minq;
-  /* Quantizer to use for the first pass */
-  int firstpass_quant;
-
-  /* 2-pass metrics */
-  od_frame_metrics cur_metrics;
-
-  /* 2-pass state */
-  int64_t scale_sum[OD_FRAME_NSUBTYPES];
-  int nframes[OD_FRAME_NSUBTYPES];
-
-  /* 2-pass bytestream reader/writer context */
-  uint8_t *twopass_buffer;
-  int twopass_buffer_bytes;
-
-  /* Pass 1 stats packet storage */
-  uint8_t firstpass_buffer[OD_RC_2PASS_SUMMARY_SZ];
-
-  /* Every state packet from the first pass in a single buffer */
-  uint8_t *twopass_allframes_buf;
-  size_t twopass_allframes_buf_size;
-
-  /* Actual returned quantizer */
-  int target_quantizer;
-  /*The full-precision, unmodulated quantizer upon which
-    our modulated quantizers are based.*/
-  int base_quantizer;
-
-  /* Increments by 1 for each frame. */
-  int64_t cur_frame;
-
-  /* End of input flag */
-  int end_of_input;
-  /* Closed GOP flag */
-  int closed_gop;
-  /*The number of frames over which to distribute the reservoir usage.*/
-  int reservoir_frame_delay;
-  /*Will we drop frames to meet bitrate target?*/
-  unsigned char drop_frames;
-  /*Do we respect the maximum reservoir fullness?*/
-  unsigned char cap_overflow;
-  /*Can the reservoir go negative?*/
-  unsigned char cap_underflow;
-  /*Two-pass mode state.
-    0 => 1-pass encoding.
-    1 => 1st pass of 2-pass encoding.
-    2 => 2nd pass of 2-pass encoding.*/
-  int twopass_state;
-  /*The log of the number of pixels in a frame in Q57 format.*/
-  int64_t log_npixels;
-  /*The target average bits per frame.*/
-  int64_t bits_per_frame;
-  /*The current bit reservoir fullness (bits available to be used).*/
-  int64_t reservoir_fullness;
-  /*The target buffer fullness.
-    This is where we'd like to be by the last keyframe the appears in the next
-     buf_delay frames.*/
-  int64_t reservoir_target;
-  /*The maximum buffer fullness (total size of the buffer).*/
-  int64_t reservoir_max;
-  /*The log of estimated scale factor for the rate model in Q57 format.*/
-  int64_t log_scale[OD_FRAME_NSUBTYPES];
-  /*The exponent used in the rate model in Q8 format.*/
-  unsigned exp[OD_FRAME_NSUBTYPES];
-  /*The log of an estimated scale factor used to obtain the real framerate, for
-     VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
-  int64_t log_drop_scale[OD_FRAME_NSUBTYPES];
-  /*The total drop count from the previous frame.*/
-  uint32_t prev_drop_count[OD_FRAME_NSUBTYPES];
-  /*Second-order lowpass filters to track scale and VFR/drops.*/
-  od_iir_bessel2 scalefilter[OD_FRAME_NSUBTYPES];
-  od_iir_bessel2 vfrfilter[OD_FRAME_NSUBTYPES];
-  int frame_count[OD_FRAME_NSUBTYPES];
-  int inter_p_delay;
-  int inter_delay_target;
-  /*The total accumulated estimation bias.*/
-  int64_t rate_bias;
-} od_rc_state;
-
-int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms);
-
-int od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc,
-                                            int is_golden_frame,
-                                            int is_altref_frame, int frame_type,
-                                            int *bottom_idx, int *top_idx);
-
-/* Returns 1 if the frame should be dropped */
-int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame,
-                           int is_altref_frame, int frame_type, int droppable);
-
-int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden,
-                  int *is_altref, int64_t *ip_count);
-
-int od_enc_rc_resize(od_rc_state *rc);
-
-int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list,
-                        int summary);
-
-int od_enc_rc_2pass_in(od_rc_state *rc);
-
-#endif
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
index 5dd485334..17f23e5ec 100644
--- a/third_party/aom/av1/encoder/rd.c
+++ b/third_party/aom/av1/encoder/rd.c
@@ -13,7 +13,7 @@
 #include <math.h>
 #include <stdio.h>
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
@@ -36,9 +36,7 @@
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
-#if CONFIG_LV_MAP
 #include "av1/encoder/encodetxb.h"
-#endif
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
@@ -54,114 +52,96 @@
 // This table is used to correct for block size.
 // The factors here are << 2 (2 = x0.5, 32 = x8 etc).
 static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  2,  2,  2,
-#endif
-  2,  3,  3,  4, 6,  6,  8, 12, 12, 16, 24, 24, 32,
-#if CONFIG_EXT_PARTITION
-  48, 48, 64,
-#endif  // CONFIG_EXT_PARTITION
-  4,  4,  8,  8, 16, 16,
-#if CONFIG_EXT_PARTITION
-  32, 32
-#endif  // CONFIG_EXT_PARTITION
+  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16
 };
 
-#if CONFIG_EXT_TX
 static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][EXT_TX_SIZES] =
     {
-#if CONFIG_CHROMA_2X2
-      { 1, 1, 1, 1, 1 },  // unused
-      { 0, 1, 1, 0, 0 },
-      { 0, 0, 0, 1, 0 },
-#if CONFIG_MRC_TX
-      { 0, 0, 0, 0, 1 },
-#endif  // CONFIG_MRC_TX
-#else   // CONFIG_CHROMA_2X2
       { 1, 1, 1, 1 },  // unused
       { 1, 1, 0, 0 },
       { 0, 0, 1, 0 },
-#if CONFIG_MRC_TX
-      { 0, 0, 0, 1 },
-#endif  // CONFIG_MRC_TX
-#endif  // CONFIG_CHROMA_2X2
     };
 
 static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][EXT_TX_SIZES] =
     {
-#if CONFIG_CHROMA_2X2
-      { 1, 1, 1, 1, 1 },  // unused
-      { 0, 1, 1, 0, 0 }, { 0, 0, 0, 1, 0 }, { 0, 0, 0, 0, 1 },
-#if CONFIG_MRC_TX
-      { 0, 0, 0, 0, 1 },
-#endif  // CONFIG_MRC_TX
-#else   // CONFIG_CHROMA_2X2
       { 1, 1, 1, 1 },  // unused
-      { 1, 1, 0, 0 }, { 0, 0, 1, 0 }, { 0, 0, 0, 1 },
-#if CONFIG_MRC_TX
+      { 1, 1, 0, 0 },
+      { 0, 0, 1, 0 },
       { 0, 0, 0, 1 },
-#endif  // CONFIG_MRC_TX
-#endif  // CONFIG_CHROMA_2X2
     };
-#endif  // CONFIG_EXT_TX
+
+static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
+                                                      EXT_TX_SETS_INTER)] = {
+  {
+      // Intra
+      EXT_TX_SET_DCTONLY,
+      EXT_TX_SET_DTT4_IDTX_1DDCT,
+      EXT_TX_SET_DTT4_IDTX,
+  },
+  {
+      // Inter
+      EXT_TX_SET_DCTONLY,
+      EXT_TX_SET_ALL16,
+      EXT_TX_SET_DTT9_IDTX_1DDCT,
+      EXT_TX_SET_DCT_IDTX,
+  },
+};
 
 void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
                          FRAME_CONTEXT *fc) {
   int i, j;
 
-  if (cm->frame_type == KEY_FRAME) {
-    for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i)
-      av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i],
+  for (i = 0; i < PARTITION_CONTEXTS; ++i)
+    av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i], NULL);
+
+  if (cm->skip_mode_flag) {
+    for (i = 0; i < SKIP_CONTEXTS; ++i) {
+      av1_cost_tokens_from_cdf(x->skip_mode_cost[i], fc->skip_mode_cdfs[i],
                                NULL);
-#if CONFIG_UNPOISON_PARTITION_CTX
-    for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) {
-      aom_prob p = fc->partition_prob[i][PARTITION_VERT];
-      assert(p > 0);
-      x->partition_cost[i][PARTITION_NONE] = INT_MAX;
-      x->partition_cost[i][PARTITION_HORZ] = INT_MAX;
-      x->partition_cost[i][PARTITION_VERT] = av1_cost_bit(p, 0);
-      x->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1);
-    }
-    for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) {
-      aom_prob p = fc->partition_prob[i][PARTITION_HORZ];
-      assert(p > 0);
-      x->partition_cost[i][PARTITION_NONE] = INT_MAX;
-      x->partition_cost[i][PARTITION_HORZ] = av1_cost_bit(p, 0);
-      x->partition_cost[i][PARTITION_VERT] = INT_MAX;
-      x->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1);
     }
-    x->partition_cost[PARTITION_CONTEXTS][PARTITION_NONE] = INT_MAX;
-    x->partition_cost[PARTITION_CONTEXTS][PARTITION_HORZ] = INT_MAX;
-    x->partition_cost[PARTITION_CONTEXTS][PARTITION_VERT] = INT_MAX;
-    x->partition_cost[PARTITION_CONTEXTS][PARTITION_SPLIT] = 0;
-#endif  // CONFIG_UNPOISON_PARTITION_CTX
   }
 
-#if CONFIG_KF_CTX
+  for (i = 0; i < SKIP_CONTEXTS; ++i) {
+    av1_cost_tokens_from_cdf(x->skip_cost[i], fc->skip_cdfs[i], NULL);
+  }
+
   for (i = 0; i < KF_MODE_CONTEXTS; ++i)
     for (j = 0; j < KF_MODE_CONTEXTS; ++j)
       av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL);
-#else
-  for (i = 0; i < INTRA_MODES; ++i)
-    for (j = 0; j < INTRA_MODES; ++j)
-      av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL);
-#endif
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
     av1_cost_tokens_from_cdf(x->mbmode_cost[i], fc->y_mode_cdf[i], NULL);
-  for (i = 0; i < INTRA_MODES; ++i)
-    av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i], fc->uv_mode_cdf[i],
-                             NULL);
+  for (i = 0; i < CFL_ALLOWED_TYPES; ++i)
+    for (j = 0; j < INTRA_MODES; ++j)
+      av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i][j],
+                               fc->uv_mode_cdf[i][j], NULL);
+
+  av1_cost_tokens_from_cdf(x->filter_intra_mode_cost, fc->filter_intra_mode_cdf,
+                           NULL);
+  for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+    if (av1_filter_intra_allowed_bsize(cm, i))
+      av1_cost_tokens_from_cdf(x->filter_intra_cost[i],
+                               fc->filter_intra_cdfs[i], NULL);
+  }
 
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
     av1_cost_tokens_from_cdf(x->switchable_interp_costs[i],
                              fc->switchable_interp_cdf[i], NULL);
 
-  for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) {
+  for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) {
     av1_cost_tokens_from_cdf(x->palette_y_size_cost[i],
                              fc->palette_y_size_cdf[i], NULL);
     av1_cost_tokens_from_cdf(x->palette_uv_size_cost[i],
                              fc->palette_uv_size_cdf[i], NULL);
+    for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) {
+      av1_cost_tokens_from_cdf(x->palette_y_mode_cost[i][j],
+                               fc->palette_y_mode_cdf[i][j], NULL);
+    }
+  }
+
+  for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) {
+    av1_cost_tokens_from_cdf(x->palette_uv_mode_cost[i],
+                             fc->palette_uv_mode_cdf[i], NULL);
   }
 
   for (i = 0; i < PALETTE_SIZES; ++i) {
@@ -172,60 +152,38 @@ void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
                                fc->palette_uv_color_index_cdf[i][j], NULL);
     }
   }
-#if CONFIG_MRC_TX
-  for (i = 0; i < PALETTE_SIZES; ++i) {
-    for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
-      av1_cost_tokens_from_cdf(x->mrc_mask_inter_cost[i][j],
-                               fc->mrc_mask_inter_cdf[i][j], NULL);
-      av1_cost_tokens_from_cdf(x->mrc_mask_intra_cost[i][j],
-                               fc->mrc_mask_intra_cdf[i][j], NULL);
-    }
-  }
-#endif  // CONFIG_MRC_TX
 
-#if CONFIG_CFL
   int sign_cost[CFL_JOINT_SIGNS];
   av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL);
   for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
-    const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
-    const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
     int *cost_u = x->cfl_cost[joint_sign][CFL_PRED_U];
     int *cost_v = x->cfl_cost[joint_sign][CFL_PRED_V];
-    if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO)
+    if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) {
       memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u));
-    else
+    } else {
+      const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
       av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL);
-    if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO)
+    }
+    if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) {
       memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v));
-    else
+    } else {
+      const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
       av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL);
+    }
     for (int u = 0; u < CFL_ALPHABET_SIZE; u++)
       cost_u[u] += sign_cost[joint_sign];
   }
-#endif  // CONFIG_CFL
 
-  for (i = 0; i < MAX_TX_DEPTH; ++i)
+  for (i = 0; i < MAX_TX_CATS; ++i)
     for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
       av1_cost_tokens_from_cdf(x->tx_size_cost[i][j], fc->tx_size_cdf[i][j],
                                NULL);
 
-#if CONFIG_EXT_TX
-#if CONFIG_LGT_FROM_PRED
-  if (LGT_FROM_PRED_INTRA) {
-    for (i = 0; i < LGT_SIZES; ++i) {
-      for (j = 0; j < INTRA_MODES; ++j) {
-        x->intra_lgt_cost[i][j][0] = av1_cost_bit(fc->intra_lgt_prob[i][j], 0);
-        x->intra_lgt_cost[i][j][1] = av1_cost_bit(fc->intra_lgt_prob[i][j], 1);
-      }
-    }
-  }
-  if (LGT_FROM_PRED_INTER) {
-    for (i = 0; i < LGT_SIZES; ++i) {
-      x->inter_lgt_cost[i][0] = av1_cost_bit(fc->inter_lgt_prob[i], 0);
-      x->inter_lgt_cost[i][1] = av1_cost_bit(fc->inter_lgt_prob[i], 1);
-    }
+  for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) {
+    av1_cost_tokens_from_cdf(x->txfm_partition_cost[i],
+                             fc->txfm_partition_cdf[i], NULL);
   }
-#endif  // CONFIG_LGT_FROM_PRED
+
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
     int s;
     for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
@@ -245,125 +203,124 @@ void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
       }
     }
   }
-#else
-  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
-    for (j = 0; j < TX_TYPES; ++j)
-      av1_cost_tokens_from_cdf(x->intra_tx_type_costs[i][j],
-                               fc->intra_ext_tx_cdf[i][j], av1_ext_tx_inv);
-  }
-  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
-    av1_cost_tokens_from_cdf(x->inter_tx_type_costs[i], fc->inter_ext_tx_cdf[i],
-                             av1_ext_tx_inv);
-  }
-#endif  // CONFIG_EXT_TX
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-  for (i = 0; i < INTRA_FILTERS + 1; ++i)
-    av1_cost_tokens_from_cdf(x->intra_filter_cost[i], fc->intra_filter_cdf[i],
+  for (i = 0; i < DIRECTIONAL_MODES; ++i) {
+    av1_cost_tokens_from_cdf(x->angle_delta_cost[i], fc->angle_delta_cdf[i],
                              NULL);
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_LOOP_RESTORATION
-  av1_cost_tokens(x->switchable_restore_cost, fc->switchable_restore_prob,
-                  av1_switchable_restore_tree);
-#endif  // CONFIG_LOOP_RESTORATION
-#if CONFIG_INTRABC
+  }
+  av1_cost_tokens_from_cdf(x->switchable_restore_cost,
+                           fc->switchable_restore_cdf, NULL);
+  av1_cost_tokens_from_cdf(x->wiener_restore_cost, fc->wiener_restore_cdf,
+                           NULL);
+  av1_cost_tokens_from_cdf(x->sgrproj_restore_cost, fc->sgrproj_restore_cdf,
+                           NULL);
   av1_cost_tokens_from_cdf(x->intrabc_cost, fc->intrabc_cdf, NULL);
-#endif  // CONFIG_INTRABC
 
   if (!frame_is_intra_only(cm)) {
+    for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
+      av1_cost_tokens_from_cdf(x->comp_inter_cost[i], fc->comp_inter_cdf[i],
+                               NULL);
+    }
+
+    for (i = 0; i < REF_CONTEXTS; ++i) {
+      for (j = 0; j < SINGLE_REFS - 1; ++j) {
+        av1_cost_tokens_from_cdf(x->single_ref_cost[i][j],
+                                 fc->single_ref_cdf[i][j], NULL);
+      }
+    }
+
+    for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) {
+      av1_cost_tokens_from_cdf(x->comp_ref_type_cost[i],
+                               fc->comp_ref_type_cdf[i], NULL);
+    }
+
+    for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) {
+      for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) {
+        av1_cost_tokens_from_cdf(x->uni_comp_ref_cost[i][j],
+                                 fc->uni_comp_ref_cdf[i][j], NULL);
+      }
+    }
+
+    for (i = 0; i < REF_CONTEXTS; ++i) {
+      for (j = 0; j < FWD_REFS - 1; ++j) {
+        av1_cost_tokens_from_cdf(x->comp_ref_cost[i][j], fc->comp_ref_cdf[i][j],
+                                 NULL);
+      }
+    }
+
+    for (i = 0; i < REF_CONTEXTS; ++i) {
+      for (j = 0; j < BWD_REFS - 1; ++j) {
+        av1_cost_tokens_from_cdf(x->comp_bwdref_cost[i][j],
+                                 fc->comp_bwdref_cdf[i][j], NULL);
+      }
+    }
+
+    for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
+      av1_cost_tokens_from_cdf(x->intra_inter_cost[i], fc->intra_inter_cdf[i],
+                               NULL);
+    }
+
     for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
-#if CONFIG_NEW_MULTISYMBOL
       av1_cost_tokens_from_cdf(x->newmv_mode_cost[i], fc->newmv_cdf[i], NULL);
-#else
-      x->newmv_mode_cost[i][0] = av1_cost_bit(fc->newmv_prob[i], 0);
-      x->newmv_mode_cost[i][1] = av1_cost_bit(fc->newmv_prob[i], 1);
-#endif
     }
 
-    for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) {
-#if CONFIG_NEW_MULTISYMBOL
+    for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) {
       av1_cost_tokens_from_cdf(x->zeromv_mode_cost[i], fc->zeromv_cdf[i], NULL);
-#else
-      x->zeromv_mode_cost[i][0] = av1_cost_bit(fc->zeromv_prob[i], 0);
-      x->zeromv_mode_cost[i][1] = av1_cost_bit(fc->zeromv_prob[i], 1);
-#endif
     }
 
     for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
-#if CONFIG_NEW_MULTISYMBOL
       av1_cost_tokens_from_cdf(x->refmv_mode_cost[i], fc->refmv_cdf[i], NULL);
-#else
-      x->refmv_mode_cost[i][0] = av1_cost_bit(fc->refmv_prob[i], 0);
-      x->refmv_mode_cost[i][1] = av1_cost_bit(fc->refmv_prob[i], 1);
-#endif
     }
 
     for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
-#if CONFIG_NEW_MULTISYMBOL
       av1_cost_tokens_from_cdf(x->drl_mode_cost0[i], fc->drl_cdf[i], NULL);
-#else
-      x->drl_mode_cost0[i][0] = av1_cost_bit(fc->drl_prob[i], 0);
-      x->drl_mode_cost0[i][1] = av1_cost_bit(fc->drl_prob[i], 1);
-#endif
     }
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
       av1_cost_tokens_from_cdf(x->inter_compound_mode_cost[i],
                                fc->inter_compound_mode_cdf[i], NULL);
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
     for (i = 0; i < BLOCK_SIZES_ALL; ++i)
       av1_cost_tokens_from_cdf(x->compound_type_cost[i],
                                fc->compound_type_cdf[i], NULL);
-#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-#if CONFIG_COMPOUND_SINGLEREF
-    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-      av1_cost_tokens_from_cdf(x->inter_singleref_comp_mode_cost[i],
-                               fc->inter_singleref_comp_mode_cdf[i], NULL);
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_INTERINTRA
-    for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+    for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+      if (get_interinter_wedge_bits(i)) {
+        av1_cost_tokens_from_cdf(x->wedge_idx_cost[i], fc->wedge_idx_cdf[i],
+                                 NULL);
+      }
+    }
+    for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+      av1_cost_tokens_from_cdf(x->interintra_cost[i], fc->interintra_cdf[i],
+                               NULL);
       av1_cost_tokens_from_cdf(x->interintra_mode_cost[i],
                                fc->interintra_mode_cdf[i], NULL);
-#endif  // CONFIG_INTERINTRA
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+    }
+    for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+      av1_cost_tokens_from_cdf(x->wedge_interintra_cost[i],
+                               fc->wedge_interintra_cdf[i], NULL);
+    }
     for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
       av1_cost_tokens_from_cdf(x->motion_mode_cost[i], fc->motion_mode_cdf[i],
                                NULL);
     }
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
     for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-      av1_cost_tokens_from_cdf(x->motion_mode_cost2[i], fc->ncobmc_cdf[i],
-                               NULL);
-#endif
-#if CONFIG_NEW_MULTISYMBOL || CONFIG_NCOBMC_ADAPT_WEIGHT
       av1_cost_tokens_from_cdf(x->motion_mode_cost1[i], fc->obmc_cdf[i], NULL);
-#else
-      x->motion_mode_cost1[i][0] = av1_cost_bit(fc->obmc_prob[i], 0);
-      x->motion_mode_cost1[i][1] = av1_cost_bit(fc->obmc_prob[i], 1);
-#endif
     }
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT
-    for (i = ADAPT_OVERLAP_BLOCK_8X8; i < ADAPT_OVERLAP_BLOCKS; ++i) {
-      av1_cost_tokens_from_cdf(x->ncobmc_mode_cost[i], fc->ncobmc_mode_cdf[i],
+    for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) {
+      av1_cost_tokens_from_cdf(x->comp_idx_cost[i], fc->compound_index_cdf[i],
                                NULL);
     }
-#endif  // CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+    for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) {
+      av1_cost_tokens_from_cdf(x->comp_group_idx_cost[i],
+                               fc->comp_group_idx_cdf[i], NULL);
+    }
   }
 }
 
 // Values are now correlated to quantizer.
 static int sad_per_bit16lut_8[QINDEX_RANGE];
 static int sad_per_bit4lut_8[QINDEX_RANGE];
-
-#if CONFIG_HIGHBITDEPTH
 static int sad_per_bit16lut_10[QINDEX_RANGE];
 static int sad_per_bit4lut_10[QINDEX_RANGE];
 static int sad_per_bit16lut_12[QINDEX_RANGE];
 static int sad_per_bit4lut_12[QINDEX_RANGE];
-#endif
 
 static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
                             aom_bit_depth_t bit_depth) {
@@ -381,31 +338,26 @@ static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
 void av1_init_me_luts(void) {
   init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
                   AOM_BITS_8);
-#if CONFIG_HIGHBITDEPTH
   init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
                   AOM_BITS_10);
   init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
                   AOM_BITS_12);
-#endif
 }
 
 static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
                                          8,  8,  4,  4,  2,  2,  1,  0 };
 static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
   128, 144, 128, 128, 144,
-#if CONFIG_EXT_REFS
   // TODO(zoeliu): To adjust further following factor values.
   128, 128, 128,
   // TODO(weitinglin): We should investigate if the values should be the same
   //                   as the value used by OVERLAY frame
   144,  // INTNL_OVERLAY_UPDATE
   128   // INTNL_ARF_UPDATE
-#endif  // CONFIG_EXT_REFS
 };
 
 int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
-  const int64_t q = av1_dc_quant(qindex, 0, cpi->common.bit_depth);
-#if CONFIG_HIGHBITDEPTH
+  const int64_t q = av1_dc_quant_Q3(qindex, 0, cpi->common.bit_depth);
   int64_t rdmult = 0;
   switch (cpi->common.bit_depth) {
     case AOM_BITS_8: rdmult = 88 * q * q / 24; break;
@@ -415,9 +367,6 @@ int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
   }
-#else
-  int64_t rdmult = 88 * q * q / 24;
-#endif  // CONFIG_HIGHBITDEPTH
   if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
@@ -432,25 +381,19 @@ int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
 
 static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
   double q;
-#if CONFIG_HIGHBITDEPTH
   switch (bit_depth) {
-    case AOM_BITS_8: q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0; break;
-    case AOM_BITS_10: q = av1_dc_quant(qindex, 0, AOM_BITS_10) / 16.0; break;
-    case AOM_BITS_12: q = av1_dc_quant(qindex, 0, AOM_BITS_12) / 64.0; break;
+    case AOM_BITS_8: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_8) / 4.0; break;
+    case AOM_BITS_10: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_10) / 16.0; break;
+    case AOM_BITS_12: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_12) / 64.0; break;
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
   }
-#else
-  (void)bit_depth;
-  q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0;
-#endif  // CONFIG_HIGHBITDEPTH
   // TODO(debargha): Adjust the function below.
   return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
 }
 
 void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) {
-#if CONFIG_HIGHBITDEPTH
   switch (cpi->common.bit_depth) {
     case AOM_BITS_8:
       x->sadperbit16 = sad_per_bit16lut_8[qindex];
@@ -467,11 +410,6 @@ void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) {
     default:
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
   }
-#else
-  (void)cpi;
-  x->sadperbit16 = sad_per_bit16lut_8[qindex];
-  x->sadperbit4 = sad_per_bit4lut_8[qindex];
-#endif  // CONFIG_HIGHBITDEPTH
 }
 
 static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) {
@@ -490,195 +428,89 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) {
       const int t = q * rd_thresh_block_size_factor[bsize];
       const int thresh_max = INT_MAX / t;
 
-#if CONFIG_CB4X4
       for (i = 0; i < MAX_MODES; ++i)
         rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
                                                  ? rd->thresh_mult[i] * t / 4
                                                  : INT_MAX;
-#else
-      if (bsize >= BLOCK_8X8) {
-        for (i = 0; i < MAX_MODES; ++i)
-          rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
-                                                   ? rd->thresh_mult[i] * t / 4
-                                                   : INT_MAX;
-      } else {
-        for (i = 0; i < MAX_REFS; ++i)
-          rd->threshes[segment_id][bsize][i] =
-              rd->thresh_mult_sub8x8[i] < thresh_max
-                  ? rd->thresh_mult_sub8x8[i] * t / 4
-                  : INT_MAX;
-      }
-#endif
     }
   }
 }
 
-void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref,
-                    int ref_mv_idx) {
-  MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
-  int8_t rf_type = av1_ref_frame_type(x->e_mbd.mi[0]->mbmi.ref_frame);
-  int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                            mbmi_ext->ref_mv_stack[rf_type], ref, ref_mv_idx);
-  (void)ref_frame;
-  x->mvcost = x->mv_cost_stack[nmv_ctx];
-  x->nmvjointcost = x->nmv_vec_cost[nmv_ctx];
+void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx) {
+  (void)ref;
+  (void)ref_mv_idx;
+  x->mvcost = x->mv_cost_stack;
+  x->nmvjointcost = x->nmv_vec_cost;
 }
 
-#if CONFIG_LV_MAP
-#if !LV_MAP_PROB
-static void get_rate_cost(aom_prob p, int cost[2]) {
-  cost[0] = av1_cost_bit(p, 0);
-  cost[1] = av1_cost_bit(p, 1);
-}
-#endif  // !LV_MAP_PROB
-
-void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc) {
+void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
+                          const int num_planes) {
+  const int nplanes = AOMMIN(num_planes, PLANE_TYPES);
+  for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) {
+    for (int plane = 0; plane < nplanes; ++plane) {
+      LV_MAP_EOB_COST *pcost = &x->eob_costs[eob_multi_size][plane];
+
+      for (int ctx = 0; ctx < 2; ++ctx) {
+        aom_cdf_prob *pcdf;
+        switch (eob_multi_size) {
+          case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break;
+          case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break;
+          case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break;
+          case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break;
+          case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break;
+          case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break;
+          case 6:
+          default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break;
+        }
+        av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL);
+      }
+    }
+  }
   for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
-    for (int plane = 0; plane < PLANE_TYPES; ++plane) {
+    for (int plane = 0; plane < nplanes; ++plane) {
       LV_MAP_COEFF_COST *pcost = &x->coeff_costs[tx_size][plane];
 
-#if LV_MAP_PROB
       for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
         av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx],
                                  fc->txb_skip_cdf[tx_size][ctx], NULL);
 
+      for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx)
+        av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx],
+                                 fc->coeff_base_eob_cdf[tx_size][plane][ctx],
+                                 NULL);
       for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx)
-        av1_cost_tokens_from_cdf(pcost->nz_map_cost[ctx],
-                                 fc->nz_map_cdf[tx_size][plane][ctx], NULL);
+        av1_cost_tokens_from_cdf(pcost->base_cost[ctx],
+                                 fc->coeff_base_cdf[tx_size][plane][ctx], NULL);
 
       for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
-        av1_cost_tokens_from_cdf(pcost->eob_cost[ctx],
-                                 fc->eob_flag_cdf[tx_size][plane][ctx], NULL);
+        av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx],
+                                 fc->eob_extra_cdf[tx_size][plane][ctx], NULL);
 
       for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
         av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx],
                                  fc->dc_sign_cdf[plane][ctx], NULL);
 
-      for (int layer = 0; layer < NUM_BASE_LEVELS; ++layer)
-        for (int ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx)
-          av1_cost_tokens_from_cdf(
-              pcost->base_cost[layer][ctx],
-              fc->coeff_base_cdf[tx_size][plane][layer][ctx], NULL);
-
-#if BR_NODE
-      for (int br = 0; br < BASE_RANGE_SETS; ++br)
-        for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx)
-          av1_cost_tokens_from_cdf(pcost->br_cost[br][ctx],
-                                   fc->coeff_br_cdf[tx_size][plane][br][ctx],
-                                   NULL);
-
       for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
-        int lps_rate[2];
-        av1_cost_tokens_from_cdf(lps_rate,
-                                 fc->coeff_lps_cdf[tx_size][plane][ctx], NULL);
-
-        for (int base_range = 0; base_range < COEFF_BASE_RANGE + 1;
-             ++base_range) {
-          int br_set_idx = base_range < COEFF_BASE_RANGE
-                               ? coeff_to_br_index[base_range]
-                               : BASE_RANGE_SETS;
-
-          pcost->lps_cost[ctx][base_range] = 0;
-
-          for (int idx = 0; idx < BASE_RANGE_SETS; ++idx) {
-            if (idx == br_set_idx) {
-              pcost->lps_cost[ctx][base_range] += pcost->br_cost[idx][ctx][1];
-
-              int br_base = br_index_to_coeff[br_set_idx];
-              int br_offset = base_range - br_base;
-              int extra_bits = (1 << br_extra_bits[idx]) - 1;
-              for (int tok = 0; tok < extra_bits; ++tok) {
-                if (tok == br_offset) {
-                  pcost->lps_cost[ctx][base_range] += lps_rate[1];
-                  break;
-                } else {
-                  pcost->lps_cost[ctx][base_range] += lps_rate[0];
-                }
-              }
-              break;
-            } else {
-              pcost->lps_cost[ctx][base_range] += pcost->br_cost[idx][ctx][0];
-            }
-          }
-          // load the base range cost
-        }
-      }
-#else   // BR_NODE
-      for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx)
-        av1_cost_tokens_from_cdf(pcost->lps_cost[ctx],
-                                 fc->coeff_lps_cdf[tx_size][plane][ctx], NULL);
-#endif  // BR_NODE
-#if CONFIG_CTX1D
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
-        av1_cost_tokens_from_cdf(pcost->eob_mode_cost[tx_class],
-                                 fc->eob_mode_cdf[tx_size][plane][tx_class],
+        int br_rate[BR_CDF_SIZE];
+        int prev_cost = 0;
+        int i, j;
+        av1_cost_tokens_from_cdf(br_rate, fc->coeff_br_cdf[tx_size][plane][ctx],
                                  NULL);
-
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
-        for (int ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx)
-          av1_cost_tokens_from_cdf(
-              pcost->empty_line_cost[tx_class][ctx],
-              fc->empty_line_cdf[tx_size][plane][tx_class][ctx], NULL);
-
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
-        for (int ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx)
-          av1_cost_tokens_from_cdf(
-              pcost->hv_eob_cost[tx_class][ctx],
-              fc->hv_eob_cdf[tx_size][plane][tx_class][ctx], NULL);
-#endif  // CONFIG_CTX1D
-#else   // LV_MAP_PROB
-      for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
-        get_rate_cost(fc->txb_skip[tx_size][ctx], pcost->txb_skip_cost[ctx]);
-
-      for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx)
-        get_rate_cost(fc->nz_map[tx_size][plane][ctx], pcost->nz_map_cost[ctx]);
-
-      for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
-        get_rate_cost(fc->eob_flag[tx_size][plane][ctx], pcost->eob_cost[ctx]);
-
-      for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
-        get_rate_cost(fc->dc_sign[plane][ctx], pcost->dc_sign_cost[ctx]);
-
-      for (int layer = 0; layer < NUM_BASE_LEVELS; ++layer)
-        for (int ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx)
-          get_rate_cost(fc->coeff_base[tx_size][plane][layer][ctx],
-                        pcost->base_cost[layer][ctx]);
-
-      for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx)
-        get_rate_cost(fc->coeff_lps[tx_size][plane][ctx], pcost->lps_cost[ctx]);
-
-#if CONFIG_CTX1D
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
-        get_rate_cost(fc->eob_mode[tx_size][plane][tx_class],
-                      pcost->eob_mode_cost[tx_class]);
-
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
-        for (int ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx)
-          get_rate_cost(fc->empty_line[tx_size][plane][tx_class][ctx],
-                        pcost->empty_line_cost[tx_class][ctx]);
-
-      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
-        for (int ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx)
-          get_rate_cost(fc->hv_eob[tx_size][plane][tx_class][ctx],
-                        pcost->hv_eob_cost[tx_class][ctx]);
-#endif  // CONFIG_CTX1D
-#endif  // LV_MAP_PROB
-    }
-  }
-}
-#endif  // CONFIG_LV_MAP
-
-void av1_fill_token_costs_from_cdf(av1_coeff_cost *cost,
-                                   coeff_cdf_model (*cdf)[PLANE_TYPES]) {
-  for (int tx = 0; tx < TX_SIZES; ++tx) {
-    for (int pt = 0; pt < PLANE_TYPES; ++pt) {
-      for (int rt = 0; rt < REF_TYPES; ++rt) {
-        for (int band = 0; band < COEF_BANDS; ++band) {
-          for (int ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) {
-            av1_cost_tokens_from_cdf(cost[tx][pt][rt][band][ctx],
-                                     cdf[tx][pt][rt][band][ctx], NULL);
+        // printf("br_rate: ");
+        // for(j = 0; j < BR_CDF_SIZE; j++)
+        //  printf("%4d ", br_rate[j]);
+        // printf("\n");
+        for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) {
+          for (j = 0; j < BR_CDF_SIZE - 1; j++) {
+            pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j];
           }
+          prev_cost += br_rate[j];
         }
+        pcost->lps_cost[ctx][i] = prev_cost;
+        // printf("lps_cost: %d %d %2d : ", tx_size, plane, ctx);
+        // for (i = 0; i <= COEFF_BASE_RANGE; i++)
+        //  printf("%5d ", pcost->lps_cost[ctx][i]);
+        // printf("\n");
       }
     }
   }
@@ -688,7 +520,6 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
   RD_OPT *const rd = &cpi->rd;
-  int nmv_ctx;
 
   aom_clear_system_state();
 
@@ -698,56 +529,35 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) {
 
   set_block_thresholds(cm, rd);
 
-  for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
-#if CONFIG_AMVR
-    if (cm->cur_frame_mv_precision_level) {
-      av1_build_nmv_cost_table(x->nmv_vec_cost[nmv_ctx], x->nmvcost[nmv_ctx],
-                               &cm->fc->nmvc[nmv_ctx], MV_SUBPEL_NONE);
-    } else {
-      av1_build_nmv_cost_table(
-          x->nmv_vec_cost[nmv_ctx],
-          cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx]
-                                      : x->nmvcost[nmv_ctx],
-          &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv);
-    }
-
-#else
+  if (cm->cur_frame_force_integer_mv) {
+    av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &cm->fc->nmvc,
+                             MV_SUBPEL_NONE);
+  } else {
     av1_build_nmv_cost_table(
-        x->nmv_vec_cost[nmv_ctx],
-        cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx]
-                                    : x->nmvcost[nmv_ctx],
-        &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv);
-#endif
+        x->nmv_vec_cost,
+        cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc,
+        cm->allow_high_precision_mv);
   }
-  x->mvcost = x->mv_cost_stack[0];
-  x->nmvjointcost = x->nmv_vec_cost[0];
 
-#if CONFIG_INTRABC
+  x->mvcost = x->mv_cost_stack;
+  x->nmvjointcost = x->nmv_vec_cost;
+
   if (frame_is_intra_only(cm) && cm->allow_screen_content_tools &&
       cpi->oxcf.pass != 1) {
-    av1_build_nmv_cost_table(
-        x->nmv_vec_cost[0],
-        cm->allow_high_precision_mv ? x->nmvcost_hp[0] : x->nmvcost[0],
-        &cm->fc->ndvc, MV_SUBPEL_NONE);
+    int *dvcost[2] = { &cpi->dv_cost[0][MV_MAX], &cpi->dv_cost[1][MV_MAX] };
+    av1_build_nmv_cost_table(cpi->dv_joint_cost, dvcost, &cm->fc->ndvc,
+                             MV_SUBPEL_NONE);
   }
-#endif
 
-#if CONFIG_GLOBAL_MOTION
   if (cpi->oxcf.pass != 1) {
     for (int i = 0; i < TRANS_TYPES; ++i)
-#if GLOBAL_TRANS_TYPES > 4
-      cpi->gmtype_cost[i] = (1 + (i > 0 ? GLOBAL_TYPE_BITS : 0))
-                            << AV1_PROB_COST_SHIFT;
-#else
       // IDENTITY: 1 bit
       // TRANSLATION: 3 bits
       // ROTZOOM: 2 bits
       // AFFINE: 3 bits
       cpi->gmtype_cost[i] = (1 + (i > 0 ? (i == ROTZOOM ? 1 : 2) : 0))
                             << AV1_PROB_COST_SHIFT;
-#endif  // GLOBAL_TRANS_TYPES > 4
   }
-#endif  // CONFIG_GLOBAL_MOTION
 }
 
 static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
@@ -840,288 +650,32 @@ void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
   }
 }
 
-static void get_entropy_contexts_plane(
-    BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd,
-    ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
-    ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
+static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
+                                       const struct macroblockd_plane *pd,
+                                       ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+                                       ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
   const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
   const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];
   const ENTROPY_CONTEXT *const above = pd->above_context;
   const ENTROPY_CONTEXT *const left = pd->left_context;
 
-#if CONFIG_LV_MAP
   memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
   memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
-  return;
-#endif  // CONFIG_LV_MAP
-
-  int i;
-
-#if CONFIG_CHROMA_2X2
-  switch (tx_size) {
-    case TX_2X2:
-      memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
-      memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
-      break;
-    case TX_4X4:
-      for (i = 0; i < num_4x4_w; i += 2)
-        t_above[i] = !!*(const uint16_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 2)
-        t_left[i] = !!*(const uint16_t *)&left[i];
-      break;
-    case TX_8X8:
-      for (i = 0; i < num_4x4_w; i += 4)
-        t_above[i] = !!*(const uint32_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 4)
-        t_left[i] = !!*(const uint32_t *)&left[i];
-      break;
-    case TX_16X16:
-      for (i = 0; i < num_4x4_w; i += 8)
-        t_above[i] = !!*(const uint64_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 8)
-        t_left[i] = !!*(const uint64_t *)&left[i];
-      break;
-    case TX_32X32:
-      for (i = 0; i < num_4x4_w; i += 16)
-        t_above[i] =
-            !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
-      for (i = 0; i < num_4x4_h; i += 16)
-        t_left[i] =
-            !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
-      break;
-#if CONFIG_TX64X64
-    case TX_32X64:
-      for (i = 0; i < num_4x4_w; i += 16)
-        t_above[i] =
-            !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
-      for (i = 0; i < num_4x4_h; i += 32)
-        t_left[i] =
-            !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8] |
-               *(const uint64_t *)&left[i + 16] |
-               *(const uint64_t *)&left[i + 24]);
-      break;
-    case TX_64X32:
-      for (i = 0; i < num_4x4_w; i += 32)
-        t_above[i] =
-            !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8] |
-               *(const uint64_t *)&above[i + 16] |
-               *(const uint64_t *)&above[i + 24]);
-      for (i = 0; i < num_4x4_h; i += 16)
-        t_left[i] =
-            !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
-      break;
-    case TX_64X64:
-      for (i = 0; i < num_4x4_w; i += 32)
-        t_above[i] =
-            !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8] |
-               *(const uint64_t *)&above[i + 16] |
-               *(const uint64_t *)&above[i + 24]);
-      for (i = 0; i < num_4x4_h; i += 32)
-        t_left[i] =
-            !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8] |
-               *(const uint64_t *)&left[i + 16] |
-               *(const uint64_t *)&left[i + 24]);
-      break;
-#endif  // CONFIG_TX64X64
-    case TX_4X8:
-      for (i = 0; i < num_4x4_w; i += 2)
-        t_above[i] = !!*(const uint16_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 4)
-        t_left[i] = !!*(const uint32_t *)&left[i];
-      break;
-    case TX_8X4:
-      for (i = 0; i < num_4x4_w; i += 4)
-        t_above[i] = !!*(const uint32_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 2)
-        t_left[i] = !!*(const uint16_t *)&left[i];
-      break;
-    case TX_8X16:
-      for (i = 0; i < num_4x4_w; i += 4)
-        t_above[i] = !!*(const uint32_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 8)
-        t_left[i] = !!*(const uint64_t *)&left[i];
-      break;
-    case TX_16X8:
-      for (i = 0; i < num_4x4_w; i += 8)
-        t_above[i] = !!*(const uint64_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 4)
-        t_left[i] = !!*(const uint32_t *)&left[i];
-      break;
-    case TX_16X32:
-      for (i = 0; i < num_4x4_w; i += 8)
-        t_above[i] = !!*(const uint64_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 16)
-        t_left[i] =
-            !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
-      break;
-    case TX_32X16:
-      for (i = 0; i < num_4x4_w; i += 16)
-        t_above[i] =
-            !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
-      for (i = 0; i < num_4x4_h; i += 8)
-        t_left[i] = !!*(const uint64_t *)&left[i];
-      break;
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    case TX_4X16:
-      for (i = 0; i < num_4x4_w; i += 2)
-        t_above[i] = !!*(const uint16_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 8)
-        t_left[i] = !!*(const uint64_t *)&left[i];
-      break;
-    case TX_16X4:
-      for (i = 0; i < num_4x4_w; i += 8)
-        t_above[i] = !!*(const uint64_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 2)
-        t_left[i] = !!*(const uint16_t *)&left[i];
-      break;
-    case TX_8X32:
-      for (i = 0; i < num_4x4_w; i += 4)
-        t_above[i] = !!*(const uint32_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 16)
-        t_left[i] =
-            !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
-      break;
-    case TX_32X8:
-      for (i = 0; i < num_4x4_w; i += 16)
-        t_above[i] =
-            !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
-      for (i = 0; i < num_4x4_h; i += 4)
-        t_left[i] = !!*(const uint32_t *)&left[i];
-      break;
-#endif
-
-    default: assert(0 && "Invalid transform size."); break;
-  }
-  return;
-#endif  // CONFIG_CHROMA_2X2
-
-  switch (tx_size) {
-    case TX_4X4:
-      memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
-      memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
-      break;
-    case TX_8X8:
-      for (i = 0; i < num_4x4_w; i += 2)
-        t_above[i] = !!*(const uint16_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 2)
-        t_left[i] = !!*(const uint16_t *)&left[i];
-      break;
-    case TX_16X16:
-      for (i = 0; i < num_4x4_w; i += 4)
-        t_above[i] = !!*(const uint32_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 4)
-        t_left[i] = !!*(const uint32_t *)&left[i];
-      break;
-    case TX_32X32:
-      for (i = 0; i < num_4x4_w; i += 8)
-        t_above[i] = !!*(const uint64_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 8)
-        t_left[i] = !!*(const uint64_t *)&left[i];
-      break;
-#if CONFIG_TX64X64
-    case TX_32X64:
-      for (i = 0; i < num_4x4_w; i += 8)
-        t_above[i] = !!*(const uint64_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 16)
-        t_left[i] =
-            !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
-      break;
-    case TX_64X32:
-      for (i = 0; i < num_4x4_w; i += 16)
-        t_above[i] =
-            !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
-      for (i = 0; i < num_4x4_h; i += 8)
-        t_left[i] = !!*(const uint64_t *)&left[i];
-      break;
-    case TX_64X64:
-      for (i = 0; i < num_4x4_w; i += 16)
-        t_above[i] =
-            !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
-      for (i = 0; i < num_4x4_h; i += 16)
-        t_left[i] =
-            !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
-      break;
-#endif  // CONFIG_TX64X64
-    case TX_4X8:
-      memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
-      for (i = 0; i < num_4x4_h; i += 2)
-        t_left[i] = !!*(const uint16_t *)&left[i];
-      break;
-    case TX_8X4:
-      for (i = 0; i < num_4x4_w; i += 2)
-        t_above[i] = !!*(const uint16_t *)&above[i];
-      memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
-      break;
-    case TX_8X16:
-      for (i = 0; i < num_4x4_w; i += 2)
-        t_above[i] = !!*(const uint16_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 4)
-        t_left[i] = !!*(const uint32_t *)&left[i];
-      break;
-    case TX_16X8:
-      for (i = 0; i < num_4x4_w; i += 4)
-        t_above[i] = !!*(const uint32_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 2)
-        t_left[i] = !!*(const uint16_t *)&left[i];
-      break;
-    case TX_16X32:
-      for (i = 0; i < num_4x4_w; i += 4)
-        t_above[i] = !!*(const uint32_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 8)
-        t_left[i] = !!*(const uint64_t *)&left[i];
-      break;
-    case TX_32X16:
-      for (i = 0; i < num_4x4_w; i += 8)
-        t_above[i] = !!*(const uint64_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 4)
-        t_left[i] = !!*(const uint32_t *)&left[i];
-      break;
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    case TX_4X16:
-      memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
-      for (i = 0; i < num_4x4_h; i += 4)
-        t_left[i] = !!*(const uint32_t *)&left[i];
-      break;
-    case TX_16X4:
-      for (i = 0; i < num_4x4_w; i += 4)
-        t_above[i] = !!*(const uint32_t *)&above[i];
-      memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
-      break;
-    case TX_8X32:
-      for (i = 0; i < num_4x4_w; i += 2)
-        t_above[i] = !!*(const uint16_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 8)
-        t_left[i] = !!*(const uint64_t *)&left[i];
-      break;
-    case TX_32X8:
-      for (i = 0; i < num_4x4_w; i += 8)
-        t_above[i] = !!*(const uint64_t *)&above[i];
-      for (i = 0; i < num_4x4_h; i += 2)
-        t_left[i] = !!*(const uint16_t *)&left[i];
-      break;
-#endif
-    default: assert(0 && "Invalid transform size."); break;
-  }
 }
 
-void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+void av1_get_entropy_contexts(BLOCK_SIZE bsize,
                               const struct macroblockd_plane *pd,
-                              ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
-                              ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
-#if CONFIG_CHROMA_SUB8X8
+                              ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+                              ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
   const BLOCK_SIZE plane_bsize =
-      AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#else
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#endif
-  get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left);
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left);
 }
 
 void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
                  int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
   int i;
   int zero_seen = 0;
-  int best_index = 0;
   int best_sad = INT_MAX;
   int this_sad = INT_MAX;
   int max_mv = 0;
@@ -1129,11 +683,15 @@ void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
   uint8_t *ref_y_ptr;
   MV pred_mv[MAX_MV_REF_CANDIDATES + 1];
   int num_mv_refs = 0;
-
-  pred_mv[num_mv_refs++] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
-  if (x->mbmi_ext->ref_mvs[ref_frame][0].as_int !=
-      x->mbmi_ext->ref_mvs[ref_frame][1].as_int) {
-    pred_mv[num_mv_refs++] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv;
+  const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+  const int_mv ref_mv =
+      av1_get_ref_mv_from_stack(0, ref_frames, 0, x->mbmi_ext);
+  const int_mv ref_mv1 =
+      av1_get_ref_mv_from_stack(0, ref_frames, 1, x->mbmi_ext);
+
+  pred_mv[num_mv_refs++] = ref_mv.as_mv;
+  if (ref_mv.as_int != ref_mv1.as_int) {
+    pred_mv[num_mv_refs++] = ref_mv1.as_mv;
   }
   if (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size)
     pred_mv[num_mv_refs++] = x->pred_mv[ref_frame];
@@ -1158,12 +716,10 @@ void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
     // Note if it is the best so far.
     if (this_sad < best_sad) {
       best_sad = this_sad;
-      best_index = i;
     }
   }
 
   // Note the index of the mv that worked best in the reference list.
-  x->mv_best_ref_index[ref_frame] = best_index;
   x->max_mv_context[ref_frame] = max_mv;
   x->pred_mv_sad[ref_frame] = best_sad;
 }
@@ -1172,7 +728,8 @@ void av1_setup_pred_block(const MACROBLOCKD *xd,
                           struct buf_2d dst[MAX_MB_PLANE],
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const struct scale_factors *scale,
-                          const struct scale_factors *scale_uv) {
+                          const struct scale_factors *scale_uv,
+                          const int num_planes) {
   int i;
 
   dst[0].buf = src->y_buffer;
@@ -1181,8 +738,8 @@ void av1_setup_pred_block(const MACROBLOCKD *xd,
   dst[2].buf = src->v_buffer;
   dst[1].stride = dst[2].stride = src->uv_stride;
 
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    setup_pred_plane(dst + i, xd->mi[0]->mbmi.sb_type, dst[i].buf,
+  for (i = 0; i < num_planes; ++i) {
+    setup_pred_plane(dst + i, xd->mi[0]->sb_type, dst[i].buf,
                      i ? src->uv_crop_width : src->y_crop_width,
                      i ? src->uv_crop_height : src->y_crop_height,
                      dst[i].stride, mi_row, mi_col, i ? scale_uv : scale,
@@ -1192,7 +749,7 @@ void av1_setup_pred_block(const MACROBLOCKD *xd,
 
 int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
                             int stride) {
-  const int bw = b_width_log2_lookup[plane_bsize];
+  const int bw = mi_size_wide_log2[plane_bsize];
   const int y = 4 * (raster_block >> bw);
   const int x = 4 * (raster_block & ((1 << bw) - 1));
   return y * stride + x;
@@ -1214,43 +771,24 @@ YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
              : NULL;
 }
 
-#if CONFIG_DUAL_FILTER
 int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
                             const MACROBLOCKD *xd) {
   if (cm->interp_filter == SWITCHABLE) {
-    const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+    const MB_MODE_INFO *const mbmi = xd->mi[0];
     int inter_filter_cost = 0;
     int dir;
 
     for (dir = 0; dir < 2; ++dir) {
-      if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
-          (mbmi->ref_frame[1] > INTRA_FRAME &&
-           has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
-        const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-        const InterpFilter filter =
-            av1_extract_interp_filter(mbmi->interp_filters, dir);
-        inter_filter_cost += x->switchable_interp_costs[ctx][filter];
-      }
+      const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+      const InterpFilter filter =
+          av1_extract_interp_filter(mbmi->interp_filters, dir);
+      inter_filter_cost += x->switchable_interp_costs[ctx][filter];
     }
     return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
   } else {
     return 0;
   }
 }
-#else
-int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
-                            const MACROBLOCKD *xd) {
-  if (cm->interp_filter == SWITCHABLE) {
-    const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-    const int ctx = av1_get_pred_context_switchable_interp(xd);
-    const InterpFilter filter =
-        av1_extract_interp_filter(mbmi->interp_filters, 0);
-    return SWITCHABLE_INTERP_RATE_FACTOR *
-           x->switchable_interp_costs[ctx][filter];
-  }
-  return 0;
-}
-#endif
 
 void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   int i;
@@ -1262,22 +800,18 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
 
   if (sf->adaptive_rd_thresh) {
     rd->thresh_mult[THR_NEARESTMV] = 300;
-#if CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTL2] = 300;
     rd->thresh_mult[THR_NEARESTL3] = 300;
     rd->thresh_mult[THR_NEARESTB] = 300;
     rd->thresh_mult[THR_NEARESTA2] = 300;
-#endif  // CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTA] = 300;
     rd->thresh_mult[THR_NEARESTG] = 300;
   } else {
     rd->thresh_mult[THR_NEARESTMV] = 0;
-#if CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTL2] = 0;
     rd->thresh_mult[THR_NEARESTL3] = 0;
     rd->thresh_mult[THR_NEARESTB] = 0;
     rd->thresh_mult[THR_NEARESTA2] = 0;
-#endif  // CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTA] = 0;
     rd->thresh_mult[THR_NEARESTG] = 0;
   }
@@ -1285,92 +819,35 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_DC] += 1000;
 
   rd->thresh_mult[THR_NEWMV] += 1000;
-#if CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEWL2] += 1000;
   rd->thresh_mult[THR_NEWL3] += 1000;
   rd->thresh_mult[THR_NEWB] += 1000;
   rd->thresh_mult[THR_NEWA2] = 1000;
-#endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEWA] += 1000;
   rd->thresh_mult[THR_NEWG] += 1000;
 
   rd->thresh_mult[THR_NEARMV] += 1000;
-#if CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEARL2] += 1000;
   rd->thresh_mult[THR_NEARL3] += 1000;
   rd->thresh_mult[THR_NEARB] += 1000;
   rd->thresh_mult[THR_NEARA2] = 1000;
-#endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEARA] += 1000;
   rd->thresh_mult[THR_NEARG] += 1000;
 
-  rd->thresh_mult[THR_ZEROMV] += 2000;
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_ZEROL2] += 2000;
-  rd->thresh_mult[THR_ZEROL3] += 2000;
-  rd->thresh_mult[THR_ZEROB] += 2000;
-  rd->thresh_mult[THR_ZEROA2] = 2000;
-#endif  // CONFIG_EXT_REFS
-  rd->thresh_mult[THR_ZEROG] += 2000;
-  rd->thresh_mult[THR_ZEROA] += 2000;
-
-  rd->thresh_mult[THR_TM] += 1000;
-
-#if CONFIG_COMPOUND_SINGLEREF
-  rd->thresh_mult[THR_SR_NEAREST_NEARMV] += 1200;
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_SR_NEAREST_NEARL2] += 1200;
-  rd->thresh_mult[THR_SR_NEAREST_NEARL3] += 1200;
-  rd->thresh_mult[THR_SR_NEAREST_NEARB] += 1200;
-#endif  // CONFIG_EXT_REFS
-  rd->thresh_mult[THR_SR_NEAREST_NEARA] += 1200;
-  rd->thresh_mult[THR_SR_NEAREST_NEARG] += 1200;
-
-  /*
-  rd->thresh_mult[THR_SR_NEAREST_NEWMV] += 1200;
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_SR_NEAREST_NEWL2] += 1200;
-  rd->thresh_mult[THR_SR_NEAREST_NEWL3] += 1200;
-  rd->thresh_mult[THR_SR_NEAREST_NEWB] += 1200;
-#endif  // CONFIG_EXT_REFS
-  rd->thresh_mult[THR_SR_NEAREST_NEWA] += 1200;
-  rd->thresh_mult[THR_SR_NEAREST_NEWG] += 1200;*/
-
-  rd->thresh_mult[THR_SR_NEAR_NEWMV] += 1500;
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_SR_NEAR_NEWL2] += 1500;
-  rd->thresh_mult[THR_SR_NEAR_NEWL3] += 1500;
-  rd->thresh_mult[THR_SR_NEAR_NEWB] += 1500;
-#endif  // CONFIG_EXT_REFS
-  rd->thresh_mult[THR_SR_NEAR_NEWA] += 1500;
-  rd->thresh_mult[THR_SR_NEAR_NEWG] += 1500;
-
-  rd->thresh_mult[THR_SR_ZERO_NEWMV] += 2000;
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_SR_ZERO_NEWL2] += 2000;
-  rd->thresh_mult[THR_SR_ZERO_NEWL3] += 2000;
-  rd->thresh_mult[THR_SR_ZERO_NEWB] += 2000;
-#endif  // CONFIG_EXT_REFS
-  rd->thresh_mult[THR_SR_ZERO_NEWA] += 2000;
-  rd->thresh_mult[THR_SR_ZERO_NEWG] += 2000;
-
-  rd->thresh_mult[THR_SR_NEW_NEWMV] += 1700;
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_SR_NEW_NEWL2] += 1700;
-  rd->thresh_mult[THR_SR_NEW_NEWL3] += 1700;
-  rd->thresh_mult[THR_SR_NEW_NEWB] += 1700;
-#endif  // CONFIG_EXT_REFS
-  rd->thresh_mult[THR_SR_NEW_NEWA] += 1700;
-  rd->thresh_mult[THR_SR_NEW_NEWG] += 1700;
-#endif  // CONFIG_COMPOUND_SINGLEREF
+  rd->thresh_mult[THR_GLOBALMV] += 2000;
+  rd->thresh_mult[THR_GLOBALL2] += 2000;
+  rd->thresh_mult[THR_GLOBALL3] += 2000;
+  rd->thresh_mult[THR_GLOBALB] += 2000;
+  rd->thresh_mult[THR_GLOBALA2] = 2000;
+  rd->thresh_mult[THR_GLOBALG] += 2000;
+  rd->thresh_mult[THR_GLOBALA] += 2000;
+
+  rd->thresh_mult[THR_PAETH] += 1000;
 
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
-#if CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
-#endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000;
-#if CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
@@ -1380,13 +857,10 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] += 1000;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] += 1000;
 
-#if CONFIG_EXT_COMP_REFS
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] += 1000;
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] += 2000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] += 2000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] += 2000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] += 2000;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
@@ -1394,16 +868,15 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROLA] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2500;
 
-#if CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500;
   rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500;
   rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROL2A] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] += 2500;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500;
@@ -1411,8 +884,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROL3A] += 2500;
-#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 2500;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500;
@@ -1420,16 +892,15 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROGA] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2500;
 
-#if CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
   rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
   rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROLB] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2500;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500;
@@ -1437,7 +908,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWL2B] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARL2B] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROL2B] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] += 2500;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500;
@@ -1445,7 +916,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROL3B] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] += 2500;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500;
@@ -1453,7 +924,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROGB] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] += 2500;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARLA2] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1500;
@@ -1461,7 +932,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWLA2] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARLA2] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWLA2] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROLA2] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] += 2500;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] += 1500;
@@ -1469,7 +940,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARL2A2] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWL2A2] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROL2A2] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] += 2500;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] += 1500;
@@ -1477,7 +948,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARL3A2] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWL3A2] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROL3A2] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] += 2500;
 
   rd->thresh_mult[THR_COMP_NEAR_NEARGA2] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] += 1500;
@@ -1485,124 +956,55 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAR_NEWGA2] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEARGA2] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROGA2] += 2500;
-
-#if CONFIG_EXT_COMP_REFS
-  rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROLL2] += 2500;
-
-  rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROLL3] += 2500;
-
-  rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARLG] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROLG] += 2500;
-
-  rd->thresh_mult[THR_COMP_NEAR_NEARBA] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 1500;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARBA] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2000;
-  rd->thresh_mult[THR_COMP_ZERO_ZEROBA] += 2500;
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500;
 
   rd->thresh_mult[THR_H_PRED] += 2000;
   rd->thresh_mult[THR_V_PRED] += 2000;
   rd->thresh_mult[THR_D135_PRED] += 2500;
-  rd->thresh_mult[THR_D207_PRED] += 2500;
-  rd->thresh_mult[THR_D153_PRED] += 2500;
-  rd->thresh_mult[THR_D63_PRED] += 2500;
-  rd->thresh_mult[THR_D117_PRED] += 2500;
+  rd->thresh_mult[THR_D203_PRED] += 2500;
+  rd->thresh_mult[THR_D157_PRED] += 2500;
+  rd->thresh_mult[THR_D67_PRED] += 2500;
+  rd->thresh_mult[THR_D113_PRED] += 2500;
   rd->thresh_mult[THR_D45_PRED] += 2500;
 
-  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARL] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEWL] += 2000;
-
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL2] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL2] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARL2] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEWL2] += 2000;
-
-  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL3] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL3] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARL3] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEWL3] += 2000;
-#endif  // CONFIG_EXT_REFS
-
-  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROG] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARG] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEWG] += 2000;
-
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROB] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTB] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARB] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEWB] += 2000;
-
-  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA2] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA2] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARA2] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEWA2] += 2000;
-#endif  // CONFIG_EXT_REFS
-
-  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEARA] += 1500;
-  rd->thresh_mult[THR_COMP_INTERINTRA_NEWA] += 2000;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] += 3200;
+
+  rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1600;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 2000;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] += 3200;
+
+  rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1600;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 2000;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] += 3200;
+
+  rd->thresh_mult[THR_COMP_NEAR_NEARBA] += 1600;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 2000;
+  rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200;
+  rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200;
 }
 
 void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
-  static const int thresh_mult[MAX_REFS] = {
-#if CONFIG_EXT_REFS
-    2500,
-    2500,
-    2500,
-    2500,
-    2500,
-    2500,
-    2500,
-    4500,
-    4500,
-    4500,
-    4500,
-    4500,
-    4500,
-    4500,
-    4500,
-    4500,
-    4500,
-    4500,
-    4500,
-    2500
-#else  // !CONFIG_EXT_REFS
-    2500,
-    2500,
-    2500,
-    4500,
-    4500,
-    2500
-#endif  // CONFIG_EXT_REFS
-  };
+  static const int thresh_mult[MAX_REFS] = { 2500, 2500, 2500, 2500, 2500,
+                                             2500, 2500, 4500, 4500, 4500,
+                                             4500, 4500, 4500, 4500, 4500,
+                                             4500, 4500, 4500, 4500, 2500 };
   RD_OPT *const rd = &cpi->rd;
   memcpy(rd->thresh_mult_sub8x8, thresh_mult, sizeof(thresh_mult));
 }
@@ -1611,15 +1013,12 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
                                int (*factor_buf)[MAX_MODES], int rd_thresh,
                                int bsize, int best_mode_index) {
   if (rd_thresh > 0) {
-#if CONFIG_CB4X4
     const int top_mode = MAX_MODES;
-#else
-    const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
-#endif
     int mode;
     for (mode = 0; mode < top_mode; ++mode) {
       const BLOCK_SIZE min_size = AOMMAX(bsize - 1, BLOCK_4X4);
-      const BLOCK_SIZE max_size = AOMMIN(bsize + 2, (int)cm->sb_size);
+      const BLOCK_SIZE max_size =
+          AOMMIN(bsize + 2, (int)cm->seq_params.sb_size);
       BLOCK_SIZE bs;
       for (bs = min_size; bs <= max_size; ++bs) {
         int *const fact = &factor_buf[bs][mode];
@@ -1635,8 +1034,7 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
 
 int av1_get_intra_cost_penalty(int qindex, int qdelta,
                                aom_bit_depth_t bit_depth) {
-  const int q = av1_dc_quant(qindex, qdelta, bit_depth);
-#if CONFIG_HIGHBITDEPTH
+  const int q = av1_dc_quant_Q3(qindex, qdelta, bit_depth);
   switch (bit_depth) {
     case AOM_BITS_8: return 20 * q;
     case AOM_BITS_10: return 5 * q;
@@ -1645,7 +1043,4 @@ int av1_get_intra_cost_penalty(int qindex, int qdelta,
       assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
       return -1;
   }
-#else
-  return 20 * q;
-#endif  // CONFIG_HIGHBITDEPTH
 }
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
index 35ada8e6c..281b676b0 100644
--- a/third_party/aom/av1/encoder/rd.h
+++ b/third_party/aom/av1/encoder/rd.h
@@ -14,9 +14,6 @@
 
 #include <limits.h>
 
-#if CONFIG_ANS
-#include "aom_dsp/ans.h"
-#endif  // CONFIG_ANS
 #include "av1/common/blockd.h"
 
 #include "av1/encoder/block.h"
@@ -30,9 +27,9 @@ extern "C" {
 #define RDDIV_BITS 7
 #define RD_EPB_SHIFT 6
 
-#define RDCOST(RM, R, D)                                          \
-  (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), AV1_PROB_COST_SHIFT) + \
-   (D << RDDIV_BITS))
+#define RDCOST(RM, R, D)                                            \
+  (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
+   ((D) * (1 << RDDIV_BITS)))
 
 #define RDCOST_DBL(RM, R, D)                                       \
   (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
@@ -50,102 +47,43 @@ extern "C" {
 // const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code.
 typedef enum {
   THR_NEARESTMV,
-#if CONFIG_EXT_REFS
   THR_NEARESTL2,
   THR_NEARESTL3,
   THR_NEARESTB,
   THR_NEARESTA2,
-#endif  // CONFIG_EXT_REFS
   THR_NEARESTA,
   THR_NEARESTG,
 
   THR_DC,
 
   THR_NEWMV,
-#if CONFIG_EXT_REFS
   THR_NEWL2,
   THR_NEWL3,
   THR_NEWB,
   THR_NEWA2,
-#endif  // CONFIG_EXT_REFS
   THR_NEWA,
   THR_NEWG,
 
   THR_NEARMV,
-#if CONFIG_EXT_REFS
   THR_NEARL2,
   THR_NEARL3,
   THR_NEARB,
   THR_NEARA2,
-#endif  // CONFIG_EXT_REFS
   THR_NEARA,
   THR_NEARG,
 
-  THR_ZEROMV,
-#if CONFIG_EXT_REFS
-  THR_ZEROL2,
-  THR_ZEROL3,
-  THR_ZEROB,
-  THR_ZEROA2,
-#endif  // CONFIG_EXT_REFS
-  THR_ZEROA,
-  THR_ZEROG,
-
-#if CONFIG_COMPOUND_SINGLEREF
-  THR_SR_NEAREST_NEARMV,
-#if CONFIG_EXT_REFS
-  THR_SR_NEAREST_NEARL2,
-  THR_SR_NEAREST_NEARL3,
-  THR_SR_NEAREST_NEARB,
-#endif  // CONFIG_EXT_REFS
-  THR_SR_NEAREST_NEARG,
-  THR_SR_NEAREST_NEARA,
-
-  /*
-  THR_SR_NEAREST_NEWMV,
-#if CONFIG_EXT_REFS
-  THR_SR_NEAREST_NEWL2,
-  THR_SR_NEAREST_NEWL3,
-  THR_SR_NEAREST_NEWB,
-#endif  // CONFIG_EXT_REFS
-  THR_SR_NEAREST_NEWG,
-  THR_SR_NEAREST_NEWA,*/
-
-  THR_SR_NEAR_NEWMV,
-#if CONFIG_EXT_REFS
-  THR_SR_NEAR_NEWL2,
-  THR_SR_NEAR_NEWL3,
-  THR_SR_NEAR_NEWB,
-#endif  // CONFIG_EXT_REFS
-  THR_SR_NEAR_NEWG,
-  THR_SR_NEAR_NEWA,
-
-  THR_SR_ZERO_NEWMV,
-#if CONFIG_EXT_REFS
-  THR_SR_ZERO_NEWL2,
-  THR_SR_ZERO_NEWL3,
-  THR_SR_ZERO_NEWB,
-#endif  // CONFIG_EXT_REFS
-  THR_SR_ZERO_NEWG,
-  THR_SR_ZERO_NEWA,
-
-  THR_SR_NEW_NEWMV,
-#if CONFIG_EXT_REFS
-  THR_SR_NEW_NEWL2,
-  THR_SR_NEW_NEWL3,
-  THR_SR_NEW_NEWB,
-#endif  // CONFIG_EXT_REFS
-  THR_SR_NEW_NEWG,
-  THR_SR_NEW_NEWA,
-#endif  // CONFIG_COMPOUND_SINGLEREF
+  THR_GLOBALMV,
+  THR_GLOBALL2,
+  THR_GLOBALL3,
+  THR_GLOBALB,
+  THR_GLOBALA2,
+  THR_GLOBALA,
+  THR_GLOBALG,
 
   THR_COMP_NEAREST_NEARESTLA,
-#if CONFIG_EXT_REFS
   THR_COMP_NEAREST_NEARESTL2A,
   THR_COMP_NEAREST_NEARESTL3A,
-#endif  // CONFIG_EXT_REFS
   THR_COMP_NEAREST_NEARESTGA,
-#if CONFIG_EXT_REFS
   THR_COMP_NEAREST_NEARESTLB,
   THR_COMP_NEAREST_NEARESTL2B,
   THR_COMP_NEAREST_NEARESTL3B,
@@ -154,21 +92,16 @@ typedef enum {
   THR_COMP_NEAREST_NEARESTL2A2,
   THR_COMP_NEAREST_NEARESTL3A2,
   THR_COMP_NEAREST_NEARESTGA2,
-#if CONFIG_EXT_COMP_REFS
   THR_COMP_NEAREST_NEARESTLL2,
   THR_COMP_NEAREST_NEARESTLL3,
   THR_COMP_NEAREST_NEARESTLG,
   THR_COMP_NEAREST_NEARESTBA,
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
 
-  THR_TM,
+  THR_PAETH,
 
   THR_SMOOTH,
-#if CONFIG_SMOOTH_HV
   THR_SMOOTH_V,
   THR_SMOOTH_H,
-#endif  // CONFIG_SMOOTH_HV
 
   THR_COMP_NEAR_NEARLA,
   THR_COMP_NEW_NEARESTLA,
@@ -176,16 +109,15 @@ typedef enum {
   THR_COMP_NEW_NEARLA,
   THR_COMP_NEAR_NEWLA,
   THR_COMP_NEW_NEWLA,
-  THR_COMP_ZERO_ZEROLA,
+  THR_COMP_GLOBAL_GLOBALLA,
 
-#if CONFIG_EXT_REFS
   THR_COMP_NEAR_NEARL2A,
   THR_COMP_NEW_NEARESTL2A,
   THR_COMP_NEAREST_NEWL2A,
   THR_COMP_NEW_NEARL2A,
   THR_COMP_NEAR_NEWL2A,
   THR_COMP_NEW_NEWL2A,
-  THR_COMP_ZERO_ZEROL2A,
+  THR_COMP_GLOBAL_GLOBALL2A,
 
   THR_COMP_NEAR_NEARL3A,
   THR_COMP_NEW_NEARESTL3A,
@@ -193,8 +125,7 @@ typedef enum {
   THR_COMP_NEW_NEARL3A,
   THR_COMP_NEAR_NEWL3A,
   THR_COMP_NEW_NEWL3A,
-  THR_COMP_ZERO_ZEROL3A,
-#endif  // CONFIG_EXT_REFS
+  THR_COMP_GLOBAL_GLOBALL3A,
 
   THR_COMP_NEAR_NEARGA,
   THR_COMP_NEW_NEARESTGA,
@@ -202,16 +133,15 @@ typedef enum {
   THR_COMP_NEW_NEARGA,
   THR_COMP_NEAR_NEWGA,
   THR_COMP_NEW_NEWGA,
-  THR_COMP_ZERO_ZEROGA,
+  THR_COMP_GLOBAL_GLOBALGA,
 
-#if CONFIG_EXT_REFS
   THR_COMP_NEAR_NEARLB,
   THR_COMP_NEW_NEARESTLB,
   THR_COMP_NEAREST_NEWLB,
   THR_COMP_NEW_NEARLB,
   THR_COMP_NEAR_NEWLB,
   THR_COMP_NEW_NEWLB,
-  THR_COMP_ZERO_ZEROLB,
+  THR_COMP_GLOBAL_GLOBALLB,
 
   THR_COMP_NEAR_NEARL2B,
   THR_COMP_NEW_NEARESTL2B,
@@ -219,7 +149,7 @@ typedef enum {
   THR_COMP_NEW_NEARL2B,
   THR_COMP_NEAR_NEWL2B,
   THR_COMP_NEW_NEWL2B,
-  THR_COMP_ZERO_ZEROL2B,
+  THR_COMP_GLOBAL_GLOBALL2B,
 
   THR_COMP_NEAR_NEARL3B,
   THR_COMP_NEW_NEARESTL3B,
@@ -227,7 +157,7 @@ typedef enum {
   THR_COMP_NEW_NEARL3B,
   THR_COMP_NEAR_NEWL3B,
   THR_COMP_NEW_NEWL3B,
-  THR_COMP_ZERO_ZEROL3B,
+  THR_COMP_GLOBAL_GLOBALL3B,
 
   THR_COMP_NEAR_NEARGB,
   THR_COMP_NEW_NEARESTGB,
@@ -235,7 +165,7 @@ typedef enum {
   THR_COMP_NEW_NEARGB,
   THR_COMP_NEAR_NEWGB,
   THR_COMP_NEW_NEWGB,
-  THR_COMP_ZERO_ZEROGB,
+  THR_COMP_GLOBAL_GLOBALGB,
 
   THR_COMP_NEAR_NEARLA2,
   THR_COMP_NEW_NEARESTLA2,
@@ -243,7 +173,7 @@ typedef enum {
   THR_COMP_NEW_NEARLA2,
   THR_COMP_NEAR_NEWLA2,
   THR_COMP_NEW_NEWLA2,
-  THR_COMP_ZERO_ZEROLA2,
+  THR_COMP_GLOBAL_GLOBALLA2,
 
   THR_COMP_NEAR_NEARL2A2,
   THR_COMP_NEW_NEARESTL2A2,
@@ -251,7 +181,7 @@ typedef enum {
   THR_COMP_NEW_NEARL2A2,
   THR_COMP_NEAR_NEWL2A2,
   THR_COMP_NEW_NEWL2A2,
-  THR_COMP_ZERO_ZEROL2A2,
+  THR_COMP_GLOBAL_GLOBALL2A2,
 
   THR_COMP_NEAR_NEARL3A2,
   THR_COMP_NEW_NEARESTL3A2,
@@ -259,7 +189,7 @@ typedef enum {
   THR_COMP_NEW_NEARL3A2,
   THR_COMP_NEAR_NEWL3A2,
   THR_COMP_NEW_NEWL3A2,
-  THR_COMP_ZERO_ZEROL3A2,
+  THR_COMP_GLOBAL_GLOBALL3A2,
 
   THR_COMP_NEAR_NEARGA2,
   THR_COMP_NEW_NEARESTGA2,
@@ -267,16 +197,24 @@ typedef enum {
   THR_COMP_NEW_NEARGA2,
   THR_COMP_NEAR_NEWGA2,
   THR_COMP_NEW_NEWGA2,
-  THR_COMP_ZERO_ZEROGA2,
+  THR_COMP_GLOBAL_GLOBALGA2,
+
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D203_PRED,
+  THR_D157_PRED,
+  THR_D67_PRED,
+  THR_D113_PRED,
+  THR_D45_PRED,
 
-#if CONFIG_EXT_COMP_REFS
   THR_COMP_NEAR_NEARLL2,
   THR_COMP_NEW_NEARESTLL2,
   THR_COMP_NEAREST_NEWLL2,
   THR_COMP_NEW_NEARLL2,
   THR_COMP_NEAR_NEWLL2,
   THR_COMP_NEW_NEWLL2,
-  THR_COMP_ZERO_ZEROLL2,
+  THR_COMP_GLOBAL_GLOBALLL2,
 
   THR_COMP_NEAR_NEARLL3,
   THR_COMP_NEW_NEARESTLL3,
@@ -284,7 +222,7 @@ typedef enum {
   THR_COMP_NEW_NEARLL3,
   THR_COMP_NEAR_NEWLL3,
   THR_COMP_NEW_NEWLL3,
-  THR_COMP_ZERO_ZEROLL3,
+  THR_COMP_GLOBAL_GLOBALLL3,
 
   THR_COMP_NEAR_NEARLG,
   THR_COMP_NEW_NEARESTLG,
@@ -292,7 +230,7 @@ typedef enum {
   THR_COMP_NEW_NEARLG,
   THR_COMP_NEAR_NEWLG,
   THR_COMP_NEW_NEWLG,
-  THR_COMP_ZERO_ZEROLG,
+  THR_COMP_GLOBAL_GLOBALLG,
 
   THR_COMP_NEAR_NEARBA,
   THR_COMP_NEW_NEARESTBA,
@@ -300,79 +238,25 @@ typedef enum {
   THR_COMP_NEW_NEARBA,
   THR_COMP_NEAR_NEWBA,
   THR_COMP_NEW_NEWBA,
-  THR_COMP_ZERO_ZEROBA,
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
+  THR_COMP_GLOBAL_GLOBALBA,
 
-  THR_H_PRED,
-  THR_V_PRED,
-  THR_D135_PRED,
-  THR_D207_PRED,
-  THR_D153_PRED,
-  THR_D63_PRED,
-  THR_D117_PRED,
-  THR_D45_PRED,
-
-  THR_COMP_INTERINTRA_ZEROL,
-  THR_COMP_INTERINTRA_NEARESTL,
-  THR_COMP_INTERINTRA_NEARL,
-  THR_COMP_INTERINTRA_NEWL,
-
-#if CONFIG_EXT_REFS
-  THR_COMP_INTERINTRA_ZEROL2,
-  THR_COMP_INTERINTRA_NEARESTL2,
-  THR_COMP_INTERINTRA_NEARL2,
-  THR_COMP_INTERINTRA_NEWL2,
-
-  THR_COMP_INTERINTRA_ZEROL3,
-  THR_COMP_INTERINTRA_NEARESTL3,
-  THR_COMP_INTERINTRA_NEARL3,
-  THR_COMP_INTERINTRA_NEWL3,
-#endif  // CONFIG_EXT_REFS
-
-  THR_COMP_INTERINTRA_ZEROG,
-  THR_COMP_INTERINTRA_NEARESTG,
-  THR_COMP_INTERINTRA_NEARG,
-  THR_COMP_INTERINTRA_NEWG,
-
-#if CONFIG_EXT_REFS
-  THR_COMP_INTERINTRA_ZEROB,
-  THR_COMP_INTERINTRA_NEARESTB,
-  THR_COMP_INTERINTRA_NEARB,
-  THR_COMP_INTERINTRA_NEWB,
-
-  THR_COMP_INTERINTRA_ZEROA2,
-  THR_COMP_INTERINTRA_NEARESTA2,
-  THR_COMP_INTERINTRA_NEARA2,
-  THR_COMP_INTERINTRA_NEWA2,
-#endif  // CONFIG_EXT_REFS
-
-  THR_COMP_INTERINTRA_ZEROA,
-  THR_COMP_INTERINTRA_NEARESTA,
-  THR_COMP_INTERINTRA_NEARA,
-  THR_COMP_INTERINTRA_NEWA,
   MAX_MODES
 } THR_MODES;
 
 typedef enum {
   THR_LAST,
-#if CONFIG_EXT_REFS
   THR_LAST2,
   THR_LAST3,
   THR_BWDR,
   THR_ALTR2,
-#endif  // CONFIG_EXT_REFS
   THR_GOLD,
   THR_ALTR,
 
   THR_COMP_LA,
-#if CONFIG_EXT_REFS
   THR_COMP_L2A,
   THR_COMP_L3A,
-#endif  // CONFIG_EXT_REFS
   THR_COMP_GA,
 
-#if CONFIG_EXT_REFS
   THR_COMP_LB,
   THR_COMP_L2B,
   THR_COMP_L3B,
@@ -382,7 +266,6 @@ typedef enum {
   THR_COMP_L2A2,
   THR_COMP_L3A2,
   THR_COMP_GA2,
-#endif  // CONFIG_EXT_REFS
 
   THR_INTRA,
 
@@ -399,7 +282,7 @@ typedef struct RD_OPT {
 
   int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES];
 
-  int64_t prediction_type_threshes[TOTAL_REFS_PER_FRAME][REFERENCE_MODES];
+  int64_t prediction_type_threshes[REF_FRAMES][REFERENCE_MODES];
 
   int RDMULT;
 } RD_OPT;
@@ -417,16 +300,16 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
   rd_stats->invalid_rate = 0;
   rd_stats->ref_rdcost = INT64_MAX;
 #if CONFIG_RD_DEBUG
+  // This may run into problems when monochrome video is
+  // encoded, as there will only be 1 plane
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats->txb_coeff_cost[plane] = 0;
-#if CONFIG_VAR_TX
     {
       int r, c;
       for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
         for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c)
           rd_stats->txb_coeff_cost_map[plane][r][c] = 0;
     }
-#endif
   }
 #endif
 }
@@ -444,16 +327,16 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
   rd_stats->invalid_rate = 1;
   rd_stats->ref_rdcost = INT64_MAX;
 #if CONFIG_RD_DEBUG
+  // This may run into problems when monochrome video is
+  // encoded, as there will only be 1 plane
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats->txb_coeff_cost[plane] = INT_MAX;
-#if CONFIG_VAR_TX
     {
       int r, c;
       for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
         for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c)
           rd_stats->txb_coeff_cost_map[plane][r][c] = INT_MAX;
     }
-#endif
   }
 #endif
 }
@@ -464,14 +347,17 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
   int plane;
 #endif
   rd_stats_dst->rate += rd_stats_src->rate;
+  if (!rd_stats_dst->zero_rate)
+    rd_stats_dst->zero_rate = rd_stats_src->zero_rate;
   rd_stats_dst->dist += rd_stats_src->dist;
   rd_stats_dst->sse += rd_stats_src->sse;
   rd_stats_dst->skip &= rd_stats_src->skip;
   rd_stats_dst->invalid_rate &= rd_stats_src->invalid_rate;
 #if CONFIG_RD_DEBUG
+  // This may run into problems when monochrome video is
+  // encoded, as there will only be 1 plane
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
-#if CONFIG_VAR_TX
     {
       // TODO(angiebird): optimize this part
       int r, c;
@@ -484,21 +370,10 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
         }
       assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]);
     }
-#endif
   }
 #endif
 }
 
-static INLINE int av1_get_coeff_token_cost(int token, int eob_val, int is_first,
-                                           const int *head_cost_table,
-                                           const int *tail_cost_table) {
-  if (eob_val == LAST_EOB) return av1_cost_zero(128);
-  const int comb_symb = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + is_first;
-  int cost = head_cost_table[comb_symb];
-  if (token > ONE_TOKEN) cost += tail_cost_table[token - TWO_TOKEN];
-  return cost;
-}
-
 struct TileInfo;
 struct TileDataEnc;
 struct AV1_COMP;
@@ -528,13 +403,12 @@ YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi,
 
 void av1_init_me_luts(void);
 
-void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref,
-                    int ref_mv_idx);
+void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx);
 
-void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+void av1_get_entropy_contexts(BLOCK_SIZE bsize,
                               const struct macroblockd_plane *pd,
-                              ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
-                              ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]);
+                              ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+                              ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]);
 
 void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
 
@@ -562,7 +436,8 @@ void av1_setup_pred_block(const MACROBLOCKD *xd,
                           struct buf_2d dst[MAX_MB_PLANE],
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const struct scale_factors *scale,
-                          const struct scale_factors *scale_uv);
+                          const struct scale_factors *scale_uv,
+                          const int num_planes);
 
 int av1_get_intra_cost_penalty(int qindex, int qdelta,
                                aom_bit_depth_t bit_depth);
@@ -570,12 +445,8 @@ int av1_get_intra_cost_penalty(int qindex, int qdelta,
 void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
                          FRAME_CONTEXT *fc);
 
-#if CONFIG_LV_MAP
-void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc);
-#endif
-
-void av1_fill_token_costs_from_cdf(av1_coeff_cost *cost,
-                                   coeff_cdf_model (*cdf)[PLANE_TYPES]);
+void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
+                          const int num_planes);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
index 607db9b86..6f4fced87 100644
--- a/third_party/aom/av1/encoder/rdopt.c
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -12,18 +12,17 @@
 #include <assert.h>
 #include <math.h>
 
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/blend.h"
 #include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
 
-#if CONFIG_CFL
 #include "av1/common/cfl.h"
-#endif
 #include "av1/common/common.h"
 #include "av1/common/common_data.h"
 #include "av1/common/entropy.h"
@@ -37,12 +36,8 @@
 #include "av1/common/reconintra.h"
 #include "av1/common/scan.h"
 #include "av1/common/seg_common.h"
-#if CONFIG_LV_MAP
 #include "av1/common/txb_common.h"
-#endif
-#if CONFIG_WARPED_MOTION
 #include "av1/common/warped_motion.h"
-#endif  // CONFIG_WARPED_MOTION
 
 #include "av1/encoder/aq_variance.h"
 #include "av1/encoder/av1_quantize.h"
@@ -50,105 +45,37 @@
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
-#if CONFIG_LV_MAP
 #include "av1/encoder/encodetxb.h"
-#endif
 #include "av1/encoder/hybrid_fwd_txfm.h"
 #include "av1/encoder/mcomp.h"
+#include "av1/encoder/ml.h"
 #include "av1/encoder/palette.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/random.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
-#if CONFIG_PVQ
-#include "av1/encoder/pvq_encoder.h"
-#include "av1/common/pvq.h"
-#endif  // CONFIG_PVQ
-#if CONFIG_DUAL_FILTER
+#include "av1/encoder/tx_prune_model_weights.h"
+
+// Set this macro as 1 to collect data about tx size selection.
+#define COLLECT_TX_SIZE_DATA 0
+#if COLLECT_TX_SIZE_DATA
+static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
+#endif
+
 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
-#if USE_EXTRA_FILTER
-static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
-  { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, { 1, 0 }, { 1, 1 },
-  { 1, 2 }, { 1, 3 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
-  { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 },
+static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
+  0x00000000, 0x00010000, 0x00020000,  // y = 0
+  0x00000001, 0x00010001, 0x00020001,  // y = 1
+  0x00000002, 0x00010002, 0x00020002,  // y = 2
 };
-#else   // USE_EXTRA_FILTER
-static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
-  { 0, 0 }, { 0, 1 }, { 0, 2 }, { 1, 0 }, { 1, 1 },
-  { 1, 2 }, { 2, 0 }, { 2, 1 }, { 2, 2 },
-};
-#endif  // USE_EXTRA_FILTER
-#endif  // CONFIG_DUAL_FILTER
-
-#if CONFIG_EXT_REFS
-
-#define LAST_FRAME_MODE_MASK                                          \
-  ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |     \
-   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
-   (1 << ALTREF_FRAME))
-#define LAST2_FRAME_MODE_MASK                                         \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) |      \
-   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
-   (1 << ALTREF_FRAME))
-#define LAST3_FRAME_MODE_MASK                                         \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |      \
-   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
-   (1 << ALTREF_FRAME))
-#define GOLDEN_FRAME_MODE_MASK                                       \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |     \
-   (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
-   (1 << ALTREF_FRAME))
-#define BWDREF_FRAME_MODE_MASK                                       \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |     \
-   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF2_FRAME) | \
-   (1 << ALTREF_FRAME))
-#define ALTREF2_FRAME_MODE_MASK                                     \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |    \
-   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | \
-   (1 << ALTREF_FRAME))
-#define ALTREF_FRAME_MODE_MASK                                      \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |    \
-   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | \
-   (1 << ALTREF2_FRAME))
-
-#else  // !CONFIG_EXT_REFS
-
-#define LAST_FRAME_MODE_MASK \
-  ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
-#define GOLDEN_FRAME_MODE_MASK \
-  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
-#define ALTREF_FRAME_MODE_MASK \
-  ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | (1 << INTRA_FRAME))
-
-#endif  // CONFIG_EXT_REFS
-
-#if CONFIG_EXT_REFS
-#if CONFIG_EXT_COMP_REFS
+
 #define SECOND_REF_FRAME_MASK                                         \
   ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \
    (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01)
-#else  // !CONFIG_EXT_COMP_REFS
-#define SECOND_REF_FRAME_MASK \
-  ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | 0x01)
-#endif  // CONFIG_EXT_COMP_REFS
-#else   // !CONFIG_EXT_REFS
-#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
-#endif  // CONFIG_EXT_REFS
-
-#define MIN_EARLY_TERM_INDEX 3
-#define NEW_MV_DISCOUNT_FACTOR 8
 
-#if CONFIG_EXT_INTRA
 #define ANGLE_SKIP_THRESH 10
-#define FILTER_FAST_SEARCH 1
-#endif  // CONFIG_EXT_INTRA
-
-// Setting this to 1 will disable trellis optimization within the
-// transform search. Trellis optimization will still be applied
-// in the final encode.
-#ifndef DISABLE_TRELLISQ_SEARCH
-#define DISABLE_TRELLISQ_SEARCH 0
-#endif
 
 static const double ADST_FLIP_SVM[8] = {
   /* vertical */
@@ -162,122 +89,72 @@ typedef struct {
   MV_REFERENCE_FRAME ref_frame[2];
 } MODE_DEFINITION;
 
-typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame[2];
+} REF_DEFINITION;
+
+typedef enum {
+  FTXS_NONE = 0,
+  FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0,
+  FTXS_DISABLE_TRELLIS_OPT = 1 << 1,
+  FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
+} FAST_TX_SEARCH_MODE;
 
 struct rdcost_block_args {
   const AV1_COMP *cpi;
   MACROBLOCK *x;
-  ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE];
-  ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE];
+  ENTROPY_CONTEXT t_above[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT t_left[MAX_MIB_SIZE];
   RD_STATS rd_stats;
   int64_t this_rd;
   int64_t best_rd;
   int exit_early;
   int use_fast_coef_costing;
+  FAST_TX_SEARCH_MODE ftxs_mode;
 };
 
 #define LAST_NEW_MV_INDEX 6
 static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
-#if CONFIG_EXT_REFS
   { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
   { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
   { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
   { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
-#endif  // CONFIG_EXT_REFS
   { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
   { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
 
   { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
 
   { NEWMV, { LAST_FRAME, NONE_FRAME } },
-#if CONFIG_EXT_REFS
   { NEWMV, { LAST2_FRAME, NONE_FRAME } },
   { NEWMV, { LAST3_FRAME, NONE_FRAME } },
   { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
   { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
-#endif  // CONFIG_EXT_REFS
   { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
   { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
 
   { NEARMV, { LAST_FRAME, NONE_FRAME } },
-#if CONFIG_EXT_REFS
   { NEARMV, { LAST2_FRAME, NONE_FRAME } },
   { NEARMV, { LAST3_FRAME, NONE_FRAME } },
   { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
   { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
-#endif  // CONFIG_EXT_REFS
   { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
   { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
 
-  { ZEROMV, { LAST_FRAME, NONE_FRAME } },
-#if CONFIG_EXT_REFS
-  { ZEROMV, { LAST2_FRAME, NONE_FRAME } },
-  { ZEROMV, { LAST3_FRAME, NONE_FRAME } },
-  { ZEROMV, { BWDREF_FRAME, NONE_FRAME } },
-  { ZEROMV, { ALTREF2_FRAME, NONE_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { ZEROMV, { GOLDEN_FRAME, NONE_FRAME } },
-  { ZEROMV, { ALTREF_FRAME, NONE_FRAME } },
-
-// TODO(zoeliu): May need to reconsider the order on the modes to check
-
-#if CONFIG_COMPOUND_SINGLEREF
-  // Single ref comp mode
-  { SR_NEAREST_NEARMV, { LAST_FRAME, NONE_FRAME } },
-#if CONFIG_EXT_REFS
-  { SR_NEAREST_NEARMV, { LAST2_FRAME, NONE_FRAME } },
-  { SR_NEAREST_NEARMV, { LAST3_FRAME, NONE_FRAME } },
-  { SR_NEAREST_NEARMV, { BWDREF_FRAME, NONE_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { SR_NEAREST_NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
-  { SR_NEAREST_NEARMV, { ALTREF_FRAME, NONE_FRAME } },
-
-  /*
-  { SR_NEAREST_NEWMV, { LAST_FRAME, NONE_FRAME } },
-#if CONFIG_EXT_REFS
-  { SR_NEAREST_NEWMV, { LAST2_FRAME, NONE_FRAME } },
-  { SR_NEAREST_NEWMV, { LAST3_FRAME, NONE_FRAME } },
-  { SR_NEAREST_NEWMV, { BWDREF_FRAME, NONE_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { SR_NEAREST_NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
-  { SR_NEAREST_NEWMV, { ALTREF_FRAME, NONE_FRAME } },*/
-
-  { SR_NEAR_NEWMV, { LAST_FRAME, NONE_FRAME } },
-#if CONFIG_EXT_REFS
-  { SR_NEAR_NEWMV, { LAST2_FRAME, NONE_FRAME } },
-  { SR_NEAR_NEWMV, { LAST3_FRAME, NONE_FRAME } },
-  { SR_NEAR_NEWMV, { BWDREF_FRAME, NONE_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { SR_NEAR_NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
-  { SR_NEAR_NEWMV, { ALTREF_FRAME, NONE_FRAME } },
-
-  { SR_ZERO_NEWMV, { LAST_FRAME, NONE_FRAME } },
-#if CONFIG_EXT_REFS
-  { SR_ZERO_NEWMV, { LAST2_FRAME, NONE_FRAME } },
-  { SR_ZERO_NEWMV, { LAST3_FRAME, NONE_FRAME } },
-  { SR_ZERO_NEWMV, { BWDREF_FRAME, NONE_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { SR_ZERO_NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
-  { SR_ZERO_NEWMV, { ALTREF_FRAME, NONE_FRAME } },
-
-  { SR_NEW_NEWMV, { LAST_FRAME, NONE_FRAME } },
-#if CONFIG_EXT_REFS
-  { SR_NEW_NEWMV, { LAST2_FRAME, NONE_FRAME } },
-  { SR_NEW_NEWMV, { LAST3_FRAME, NONE_FRAME } },
-  { SR_NEW_NEWMV, { BWDREF_FRAME, NONE_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { SR_NEW_NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
-  { SR_NEW_NEWMV, { ALTREF_FRAME, NONE_FRAME } },
-#endif  // CONFIG_COMPOUND_SINGLEREF
+  { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
+  { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
+  { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
+  { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
+  { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
+  { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },
+
+  // TODO(zoeliu): May need to reconsider the order on the modes to check
 
   { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
-#if CONFIG_EXT_REFS
   { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
   { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_REFS
   { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-#if CONFIG_EXT_REFS
   { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
   { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
@@ -287,21 +164,16 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
 
-#if CONFIG_EXT_COMP_REFS
   { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
   { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
   { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
 
-  { TM_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
 
   { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
-#if CONFIG_SMOOTH_HV
   { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
   { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
-#endif  // CONFIG_SMOOTH_HV
 
   { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
@@ -309,16 +181,15 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
-  { ZERO_ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },
 
-#if CONFIG_EXT_REFS
   { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
   { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
   { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
   { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
   { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { ZERO_ZEROMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },
 
   { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
@@ -326,8 +197,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
   { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
   { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { ZERO_ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_REFS
+  { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },
 
   { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
@@ -335,16 +205,15 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
   { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
   { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },
 
-#if CONFIG_EXT_REFS
   { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
   { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
-  { ZERO_ZEROMV, { LAST_FRAME, BWDREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
 
   { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
@@ -352,7 +221,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { ZERO_ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } },
 
   { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
@@ -360,7 +229,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { ZERO_ZEROMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } },
 
   { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
@@ -368,7 +237,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } },
 
   { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
@@ -376,7 +245,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
   { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
-  { ZERO_ZEROMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } },
 
   { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
@@ -384,7 +253,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
   { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
-  { ZERO_ZEROMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } },
 
   { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
@@ -392,7 +261,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
   { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
-  { ZERO_ZEROMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } },
 
   { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
@@ -400,16 +269,24 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
   { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
-  { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+  { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
+  { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
 
-#if CONFIG_EXT_COMP_REFS
   { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
   { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
   { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
-  { ZERO_ZEROMV, { LAST_FRAME, LAST2_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } },
 
   { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
@@ -417,7 +294,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
   { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
-  { ZERO_ZEROMV, { LAST_FRAME, LAST3_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } },
 
   { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
@@ -425,7 +302,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
   { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { ZERO_ZEROMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } },
 
   { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
@@ -433,89 +310,400 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
   { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
-  { ZERO_ZEROMV, { BWDREF_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
+  { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
+};
 
-  { H_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { V_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D207_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D153_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D63_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D117_PRED, { INTRA_FRAME, NONE_FRAME } },
-  { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
+static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = {
+  7,    // DC_PRED,
+  134,  // V_PRED,
+  133,  // H_PRED,
+  140,  // D45_PRED,
+  135,  // D135_PRED,
+  139,  // D113_PRED,
+  137,  // D157_PRED,
+  136,  // D203_PRED,
+  138,  // D67_PRED,
+  46,   // SMOOTH_PRED,
+  47,   // SMOOTH_V_PRED,
+  48,   // SMOOTH_H_PRED,
+  45,   // PAETH_PRED,
+};
+
+/* clang-format off */
+static const int16_t single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM]
+                                             [REF_FRAMES] = {
+  // NEARESTMV,
+  { -1, 0, 1, 2, 6, 3, 4, 5, },
+  // NEARMV,
+  { -1, 15, 16, 17, 21, 18, 19, 20, },
+  // GLOBALMV,
+  { -1, 22, 23, 24, 27, 25, 26, 28, },
+  // NEWMV,
+  { -1, 8, 9, 10, 14, 11, 12, 13, },
+};
+/* clang-format on */
 
-  { ZEROMV, { LAST_FRAME, INTRA_FRAME } },
-  { NEARESTMV, { LAST_FRAME, INTRA_FRAME } },
-  { NEARMV, { LAST_FRAME, INTRA_FRAME } },
-  { NEWMV, { LAST_FRAME, INTRA_FRAME } },
-
-#if CONFIG_EXT_REFS
-  { ZEROMV, { LAST2_FRAME, INTRA_FRAME } },
-  { NEARESTMV, { LAST2_FRAME, INTRA_FRAME } },
-  { NEARMV, { LAST2_FRAME, INTRA_FRAME } },
-  { NEWMV, { LAST2_FRAME, INTRA_FRAME } },
-
-  { ZEROMV, { LAST3_FRAME, INTRA_FRAME } },
-  { NEARESTMV, { LAST3_FRAME, INTRA_FRAME } },
-  { NEARMV, { LAST3_FRAME, INTRA_FRAME } },
-  { NEWMV, { LAST3_FRAME, INTRA_FRAME } },
-#endif  // CONFIG_EXT_REFS
-
-  { ZEROMV, { GOLDEN_FRAME, INTRA_FRAME } },
-  { NEARESTMV, { GOLDEN_FRAME, INTRA_FRAME } },
-  { NEARMV, { GOLDEN_FRAME, INTRA_FRAME } },
-  { NEWMV, { GOLDEN_FRAME, INTRA_FRAME } },
-
-#if CONFIG_EXT_REFS
-  { ZEROMV, { BWDREF_FRAME, INTRA_FRAME } },
-  { NEARESTMV, { BWDREF_FRAME, INTRA_FRAME } },
-  { NEARMV, { BWDREF_FRAME, INTRA_FRAME } },
-  { NEWMV, { BWDREF_FRAME, INTRA_FRAME } },
-
-  { ZEROMV, { ALTREF2_FRAME, INTRA_FRAME } },
-  { NEARESTMV, { ALTREF2_FRAME, INTRA_FRAME } },
-  { NEARMV, { ALTREF2_FRAME, INTRA_FRAME } },
-  { NEWMV, { ALTREF2_FRAME, INTRA_FRAME } },
-#endif  // CONFIG_EXT_REFS
-
-  { ZEROMV, { ALTREF_FRAME, INTRA_FRAME } },
-  { NEARESTMV, { ALTREF_FRAME, INTRA_FRAME } },
-  { NEARMV, { ALTREF_FRAME, INTRA_FRAME } },
-  { NEWMV, { ALTREF_FRAME, INTRA_FRAME } },
+/* clang-format off */
+static const int16_t comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES]
+                                     [REF_FRAMES] = {
+  // NEAREST_NEARESTMV,
+  {
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, 41, 42, 43, 33, 37, 29, },
+      { -1, -1, -1, -1, -1, 34, 38, 30, },
+      { -1, -1, -1, -1, -1, 35, 39, 31, },
+      { -1, -1, -1, -1, -1, 36, 40, 32, },
+      { -1, -1, -1, -1, -1, -1, -1, 44, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEAR_NEARMV,
+  {
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, 141, 148, 155, 77, 105, 49, },
+      { -1, -1, -1, -1, -1, 84, 112, 56, },
+      { -1, -1, -1, -1, -1, 91, 119, 63, },
+      { -1, -1, -1, -1, -1, 98, 126, 70, },
+      { -1, -1, -1, -1, -1, -1, -1, 162, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEAREST_NEWMV,
+  {
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, 143, 150, 157, 79, 107, 51, },
+      { -1, -1, -1, -1, -1, 86, 114, 58, },
+      { -1, -1, -1, -1, -1, 93, 121, 65, },
+      { -1, -1, -1, -1, -1, 100, 128, 72, },
+      { -1, -1, -1, -1, -1, -1, -1, 164, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEW_NEARESTMV,
+  {
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, 142, 149, 156, 78, 106, 50, },
+      { -1, -1, -1, -1, -1, 85, 113, 57, },
+      { -1, -1, -1, -1, -1, 92, 120, 64, },
+      { -1, -1, -1, -1, -1, 99, 127, 71, },
+      { -1, -1, -1, -1, -1, -1, -1, 163, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEAR_NEWMV,
+  {
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, 145, 152, 159, 81, 109, 53, },
+      { -1, -1, -1, -1, -1, 88, 116, 60, },
+      { -1, -1, -1, -1, -1, 95, 123, 67, },
+      { -1, -1, -1, -1, -1, 102, 130, 74, },
+      { -1, -1, -1, -1, -1, -1, -1, 166, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEW_NEARMV,
+  {
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, 144, 151, 158, 80, 108, 52, },
+      { -1, -1, -1, -1, -1, 87, 115, 59, },
+      { -1, -1, -1, -1, -1, 94, 122, 66, },
+      { -1, -1, -1, -1, -1, 101, 129, 73, },
+      { -1, -1, -1, -1, -1, -1, -1, 165, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // GLOBAL_GLOBALMV,
+  {
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, 147, 154, 161, 83, 111, 55, },
+      { -1, -1, -1, -1, -1, 90, 118, 62, },
+      { -1, -1, -1, -1, -1, 97, 125, 69, },
+      { -1, -1, -1, -1, -1, 104, 132, 76, },
+      { -1, -1, -1, -1, -1, -1, -1, 168, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEW_NEWMV,
+  {
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, 146, 153, 160, 82, 110, 54, },
+      { -1, -1, -1, -1, -1, 89, 117, 61, },
+      { -1, -1, -1, -1, -1, 96, 124, 68, },
+      { -1, -1, -1, -1, -1, 103, 131, 75, },
+      { -1, -1, -1, -1, -1, -1, -1, 167, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+      { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
 };
+/* clang-format on */
+
+static int get_prediction_mode_idx(PREDICTION_MODE this_mode,
+                                   MV_REFERENCE_FRAME ref_frame,
+                                   MV_REFERENCE_FRAME second_ref_frame) {
+  if (this_mode < INTRA_MODE_END) {
+    assert(ref_frame == INTRA_FRAME);
+    assert(second_ref_frame == NONE_FRAME);
+    return intra_to_mode_idx[this_mode - INTRA_MODE_START];
+  }
+  if (this_mode >= SINGLE_INTER_MODE_START &&
+      this_mode < SINGLE_INTER_MODE_END) {
+    assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+    assert(second_ref_frame == NONE_FRAME);
+    return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START]
+                                   [ref_frame];
+  }
+  if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) {
+    assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+    assert((second_ref_frame > INTRA_FRAME) &&
+           (second_ref_frame <= ALTREF_FRAME));
+    return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame]
+                                 [second_ref_frame];
+  }
+  assert(0);
+  return -1;
+}
 
 static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
-  DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, TM_PRED,
-#if CONFIG_SMOOTH_HV
-  SMOOTH_V_PRED, SMOOTH_H_PRED,
-#endif  // CONFIG_SMOOTH_HV
-  D135_PRED,     D207_PRED,     D153_PRED, D63_PRED,    D117_PRED, D45_PRED,
+  DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, PAETH_PRED,
+  SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED,   D157_PRED,
+  D67_PRED,      D113_PRED,     D45_PRED,
 };
 
-#if CONFIG_CFL
 static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
-  UV_DC_PRED,       UV_CFL_PRED,      UV_H_PRED,
-  UV_V_PRED,        UV_SMOOTH_PRED,   UV_TM_PRED,
-#if CONFIG_SMOOTH_HV
-  UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
-#endif  // CONFIG_SMOOTH_HV
-  UV_D135_PRED,     UV_D207_PRED,     UV_D153_PRED,
-  UV_D63_PRED,      UV_D117_PRED,     UV_D45_PRED,
+  UV_DC_PRED,     UV_CFL_PRED,   UV_H_PRED,        UV_V_PRED,
+  UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
+  UV_D135_PRED,   UV_D203_PRED,  UV_D157_PRED,     UV_D67_PRED,
+  UV_D113_PRED,   UV_D45_PRED,
 };
-#else
-#define uv_rd_search_mode_order intra_rd_search_mode_order
-#endif  // CONFIG_CFL
+
+typedef struct InterModeSearchState {
+  int64_t best_rd;
+  MB_MODE_INFO best_mbmode;
+  int best_rate_y;
+  int best_rate_uv;
+  int best_mode_skippable;
+  int best_skip2;
+  int best_mode_index;
+  int skip_intra_modes;
+  int num_available_refs;
+  int64_t dist_refs[REF_FRAMES];
+  int dist_order_refs[REF_FRAMES];
+  int64_t mode_threshold[MAX_MODES];
+  PREDICTION_MODE best_intra_mode;
+  int64_t best_intra_rd;
+  int angle_stats_ready;
+  uint8_t directional_mode_skip_mask[INTRA_MODES];
+  unsigned int best_pred_sse;
+  int rate_uv_intra[TX_SIZES_ALL];
+  int rate_uv_tokenonly[TX_SIZES_ALL];
+  int64_t dist_uvs[TX_SIZES_ALL];
+  int skip_uvs[TX_SIZES_ALL];
+  UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL];
+  PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL];
+  int8_t uv_angle_delta[TX_SIZES_ALL];
+  int64_t best_pred_rd[REFERENCE_MODES];
+  int64_t best_pred_diff[REFERENCE_MODES];
+  // Save a set of single_newmv for each checked ref_mv.
+  int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES];
+  int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES];
+  int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES];
+  int64_t modelled_rd[MB_MODE_COUNT][REF_FRAMES];
+} InterModeSearchState;
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+
+typedef struct InterModeRdModel {
+  int ready;
+  double a;
+  double b;
+  double dist_mean;
+  int skip_count;
+  int non_skip_count;
+  int fp_skip_count;
+  int bracket_idx;
+} InterModeRdModel;
+
+InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+static int inter_mode_data_idx[4];
+static int64_t inter_mode_data_sse[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
+static int64_t inter_mode_data_dist[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
+static int inter_mode_data_residue_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
+static int inter_mode_data_all_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
+static int64_t inter_mode_data_ref_best_rd[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
+
+int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
+  if (bsize == BLOCK_8X8) return 1;
+  if (bsize == BLOCK_16X16) return 2;
+  if (bsize == BLOCK_32X32) return 3;
+  return -1;
+}
+
+void av1_inter_mode_data_init() {
+  for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
+    const int block_idx = inter_mode_data_block_idx(i);
+    if (block_idx != -1) inter_mode_data_idx[block_idx] = 0;
+    InterModeRdModel *md = &inter_mode_rd_models[i];
+    md->ready = 0;
+    md->skip_count = 0;
+    md->non_skip_count = 0;
+    md->fp_skip_count = 0;
+    md->bracket_idx = 0;
+  }
+}
+
+void av1_inter_mode_data_show(const AV1_COMMON *cm) {
+  printf("frame_offset %d\n", cm->frame_offset);
+  for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
+    const int block_idx = inter_mode_data_block_idx(i);
+    if (block_idx != -1) inter_mode_data_idx[block_idx] = 0;
+    InterModeRdModel *md = &inter_mode_rd_models[i];
+    if (md->ready) {
+      printf("bsize %d non_skip_count %d skip_count %d fp_skip_count %d\n", i,
+             md->non_skip_count, md->skip_count, md->fp_skip_count);
+    }
+  }
+}
+
+static int64_t get_est_rd(BLOCK_SIZE bsize, int rdmult, int64_t sse,
+                          int curr_cost) {
+  aom_clear_system_state();
+  InterModeRdModel *md = &inter_mode_rd_models[bsize];
+  if (md->ready) {
+    const double est_ld = md->a * sse + md->b;
+    const double est_residue_cost = (sse - md->dist_mean) / est_ld;
+    const int64_t est_cost = (int64_t)round(est_residue_cost) + curr_cost;
+    const int64_t int64_dist_mean = (int64_t)round(md->dist_mean);
+    const int64_t est_rd = RDCOST(rdmult, est_cost, int64_dist_mean);
+    return est_rd;
+  }
+  return 0;
+}
+
+#define DATA_BRACKETS 7
+static const int data_num_threshold[DATA_BRACKETS] = {
+  200, 400, 800, 1600, 3200, 6400, INT32_MAX
+};
+
+void av1_inter_mode_data_fit(int rdmult) {
+  aom_clear_system_state();
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    const int block_idx = inter_mode_data_block_idx(bsize);
+    InterModeRdModel *md = &inter_mode_rd_models[bsize];
+    if (block_idx == -1) continue;
+    int data_num = inter_mode_data_idx[block_idx];
+    if (data_num < data_num_threshold[md->bracket_idx]) {
+      continue;
+    }
+    double my = 0;
+    double mx = 0;
+    double dx = 0;
+    double dxy = 0;
+    double dist_mean = 0;
+    const int train_num = data_num;
+    for (int i = 0; i < train_num; ++i) {
+      const double sse = (double)inter_mode_data_sse[block_idx][i];
+      const double dist = (double)inter_mode_data_dist[block_idx][i];
+      const double residue_cost = inter_mode_data_residue_cost[block_idx][i];
+      const double ld = (sse - dist) / residue_cost;
+      dist_mean += dist;
+      my += ld;
+      mx += sse;
+      dx += sse * sse;
+      dxy += sse * ld;
+    }
+    dist_mean = dist_mean / data_num;
+    my = my / train_num;
+    mx = mx / train_num;
+    dx = sqrt(dx / train_num);
+    dxy = dxy / train_num;
+
+    md->dist_mean = dist_mean;
+    md->a = (dxy - mx * my) / (dx * dx - mx * mx);
+    md->b = my - md->a * mx;
+    ++md->bracket_idx;
+    md->ready = 1;
+    assert(md->bracket_idx < DATA_BRACKETS);
+
+    (void)rdmult;
+#if 0
+    int skip_count = 0;
+    int fp_skip_count = 0;
+    double avg_error = 0;
+    const int test_num = data_num;
+    for (int i = 0; i < data_num; ++i) {
+      const int64_t sse = inter_mode_data_sse[block_idx][i];
+      const int64_t dist = inter_mode_data_dist[block_idx][i];
+      const int64_t residue_cost = inter_mode_data_residue_cost[block_idx][i];
+      const int64_t all_cost = inter_mode_data_all_cost[block_idx][i];
+      const int64_t est_rd =
+          get_est_rd(bsize, rdmult, sse, all_cost - residue_cost);
+      const int64_t real_rd = RDCOST(rdmult, all_cost, dist);
+      const int64_t ref_best_rd = inter_mode_data_ref_best_rd[block_idx][i];
+      if (est_rd > ref_best_rd) {
+        ++skip_count;
+        if (real_rd < ref_best_rd) {
+          ++fp_skip_count;
+        }
+      }
+      avg_error += abs(est_rd - real_rd) * 100. / real_rd;
+    }
+    avg_error /= test_num;
+    printf("test_num %d bsize %d avg_error %f skip_count %d fp_skip_count %d\n",
+           test_num, bsize, avg_error, skip_count, fp_skip_count);
+#endif
+  }
+}
+
+static void inter_mode_data_push(BLOCK_SIZE bsize, int64_t sse, int64_t dist,
+                                 int residue_cost, int all_cost,
+                                 int64_t ref_best_rd) {
+  if (residue_cost == 0 || sse == dist) return;
+  const int block_idx = inter_mode_data_block_idx(bsize);
+  if (block_idx == -1) return;
+  if (inter_mode_data_idx[block_idx] < INTER_MODE_RD_DATA_OVERALL_SIZE) {
+    const int data_idx = inter_mode_data_idx[block_idx];
+    inter_mode_data_sse[block_idx][data_idx] = sse;
+    inter_mode_data_dist[block_idx][data_idx] = dist;
+    inter_mode_data_residue_cost[block_idx][data_idx] = residue_cost;
+    inter_mode_data_all_cost[block_idx][data_idx] = all_cost;
+    inter_mode_data_ref_best_rd[block_idx][data_idx] = ref_best_rd;
+    ++inter_mode_data_idx[block_idx];
+  }
+}
+#endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
 
 static INLINE int write_uniform_cost(int n, int v) {
   const int l = get_unsigned_bits(n);
   const int m = (1 << l) - n;
   if (l == 0) return 0;
   if (v < m)
-    return (l - 1) * av1_cost_bit(128, 0);
+    return av1_cost_literal(l - 1);
   else
-    return l * av1_cost_bit(128, 0);
+    return av1_cost_literal(l);
+}
+
+// Similar to store_cfl_required(), but for use during the RDO process,
+// where we haven't yet determined whether this block uses CfL.
+static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm,
+                                                      const MACROBLOCK *x) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  if (cm->seq_params.monochrome || x->skip_chroma_rd) return CFL_DISALLOWED;
+
+  if (!xd->cfl.is_chroma_reference) {
+    // For non-chroma-reference blocks, we should always store the luma pixels,
+    // in case the corresponding chroma-reference block uses CfL.
+    // Note that this can only happen for block sizes which are <8 on
+    // their shortest side, as otherwise they would be chroma reference
+    // blocks.
+    return CFL_ALLOWED;
+  }
+
+  // For chroma reference blocks, we should store data in the encoder iff we're
+  // allowed to try out CfL.
+  return is_cfl_allowed(xd);
 }
 
 // constants for prune 1 and prune 2 decision boundaries
@@ -524,6 +712,10 @@ static INLINE int write_uniform_cost(int n, int v) {
 #define FAST_EXT_TX_CORR_MARGIN 0.5
 #define FAST_EXT_TX_EDST_MARGIN 0.3
 
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                           RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                           int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode);
+
 static unsigned pixel_dist_visible_only(
     const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
     const int src_stride, const uint8_t *dst, const int dst_stride,
@@ -531,15 +723,10 @@ static unsigned pixel_dist_visible_only(
     int visible_cols) {
   unsigned sse;
 
-  if (txb_rows == visible_rows && txb_cols == visible_cols
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-      && tx_bsize < BLOCK_SIZES
-#endif
-      ) {
+  if (txb_rows == visible_rows && txb_cols == visible_cols) {
     cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
     return sse;
   }
-#if CONFIG_HIGHBITDEPTH
   const MACROBLOCKD *xd = &x->e_mbd;
 
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -547,9 +734,6 @@ static unsigned pixel_dist_visible_only(
                                              visible_cols, visible_rows);
     return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
   }
-#else
-  (void)x;
-#endif  // CONFIG_HIGHBITDEPTH
   sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
                          visible_rows);
   return sse;
@@ -588,10 +772,9 @@ static uint64_t cdef_dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
   const uint64_t c1 = (400 * a << 2 * coeff_shift);
   const uint64_t c2 = (b * 20000 * a * a << 4 * coeff_shift);
 
-  dist =
-      (uint64_t)floor(.5 +
-                      (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * (svar + dvar + c1) /
-                          (sqrt(svar * (double)dvar + c2)));
+  dist = (uint64_t)floor(.5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
+                                  (svar + dvar + c1) /
+                                  (sqrt(svar * (double)dvar + c2)));
 
   // Calibrate dist to have similar rate for the same QP with MSE only
   // distortion (as in master branch)
@@ -729,11 +912,9 @@ static double od_compute_dist_common(int activity_masking, uint16_t *x,
 static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w,
                               int bsize_h, int qindex) {
   assert(bsize_w >= 8 && bsize_h >= 8);
-#if CONFIG_PVQ
-  int activity_masking = 1;
-#else
+
   int activity_masking = 0;
-#endif
+
   int i, j;
   DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
   DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
@@ -760,11 +941,9 @@ static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w,
 static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w,
                                    int bsize_h, int qindex) {
   assert(bsize_w >= 8 && bsize_h >= 8);
-#if CONFIG_PVQ
-  int activity_masking = 1;
-#else
+
   int activity_masking = 0;
-#endif
+
   DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]);
   DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
   DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
@@ -806,7 +985,6 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
 
   if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
       x->tune_metric == AOM_TUNE_DAALA_DIST) {
-#if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       for (j = 0; j < bsh; j++)
         for (i = 0; i < bsw; i++)
@@ -834,7 +1012,6 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
         }
       }
     } else {
-#endif
       for (j = 0; j < bsh; j++)
         for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
 
@@ -858,9 +1035,7 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
               rec[j * bsw + i] = src[j * src_stride + i];
         }
       }
-#if CONFIG_HIGHBITDEPTH
     }
-#endif  // CONFIG_HIGHBITDEPTH
   }
 
   if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
@@ -874,10 +1049,8 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
                                  bsw, coeff_shift);
       }
     }
-#if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
       d = ((uint64_t)d) >> 2 * coeff_shift;
-#endif
   } else {
     // Otherwise, MSE by default
     d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
@@ -887,10 +1060,10 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
   return d;
 }
 
-static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
-                                 int src_stride, const int16_t *diff,
-                                 int diff_stride, int bsw, int bsh,
-                                 int visible_w, int visible_h, int qindex) {
+static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
+                             int src_stride, const int16_t *diff,
+                             int diff_stride, int bsw, int bsh, int visible_w,
+                             int visible_h, int qindex) {
   int64_t d = 0;
   int i, j;
   const MACROBLOCKD *xd = &x->e_mbd;
@@ -905,18 +1078,14 @@ static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
 
   if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
       x->tune_metric == AOM_TUNE_DAALA_DIST) {
-#if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       for (j = 0; j < bsh; j++)
         for (i = 0; i < bsw; i++)
           orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
     } else {
-#endif
       for (j = 0; j < bsh; j++)
         for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
-#if CONFIG_HIGHBITDEPTH
     }
-#endif  // CONFIG_HIGHBITDEPTH
 
     if ((bsw == visible_w) && (bsh == visible_h)) {
       for (j = 0; j < bsh; j++)
@@ -971,7 +1140,8 @@ static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
 static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
                                          const uint8_t *src, int src_stride,
                                          const uint8_t *dst, int dst_stride,
-                                         double *hordist, double *verdist) {
+                                         int need_4th, double *hordist,
+                                         double *verdist) {
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
@@ -980,7 +1150,6 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
   if (f_index < 0) {
     const int w_shift = bw == 8 ? 1 : 2;
     const int h_shift = bh == 8 ? 1 : 2;
-#if CONFIG_HIGHBITDEPTH
     if (cpi->common.use_highbitdepth) {
       const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
       const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
@@ -992,17 +1161,13 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
               (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
         }
     } else {
-#endif  // CONFIG_HIGHBITDEPTH
-
       for (int i = 0; i < bh; ++i)
         for (int j = 0; j < bw; ++j) {
           const int index = (j >> w_shift) + ((i >> h_shift) << 2);
           esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
                         (src[j + i * src_stride] - dst[j + i * dst_stride]);
         }
-#if CONFIG_HIGHBITDEPTH
     }
-#endif  // CONFIG_HIGHBITDEPTH
   } else {
     cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[0]);
     cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
@@ -1051,13 +1216,22 @@ static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
     hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;
     hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
     hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
+    if (need_4th) {
+      hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip;
+    }
     verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;
     verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
     verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
+    if (need_4th) {
+      verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip;
+    }
   } else {
     hordist[0] = verdist[0] = 0.25;
     hordist[1] = verdist[1] = 0.25;
     hordist[2] = verdist[2] = 0.25;
+    if (need_4th) {
+      hordist[3] = verdist[3] = 0.25;
+    }
   }
 }
 
@@ -1067,7 +1241,7 @@ static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize,
   int prune_bitmask = 0;
   double svm_proj_h = 0, svm_proj_v = 0;
   double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 };
-  get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride,
+  get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride, 0,
                                hdist, vdist);
 
   svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] +
@@ -1087,7 +1261,6 @@ static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize,
   return prune_bitmask;
 }
 
-#if CONFIG_EXT_TX
 static void get_horver_correlation(const int16_t *diff, int stride, int w,
                                    int h, double *hcorr, double *vcorr) {
   // Returns hor/ver correlation coefficient
@@ -1132,7 +1305,7 @@ static void get_horver_correlation(const int16_t *diff, int stride, int w,
   }
 }
 
-int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) {
+static int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) {
   double hcorr, vcorr;
   int prune_bitmask = 0;
   get_horver_correlation(diff, stride, w, h, &hcorr, &vcorr);
@@ -1164,14 +1337,13 @@ static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
   if (dct_idtx) {
     av1_subtract_plane(x, bsize, 0);
     const struct macroblock_plane *const p = &x->plane[0];
-    const int bw = 4 << (b_width_log2_lookup[bsize]);
-    const int bh = 4 << (b_height_log2_lookup[bsize]);
+    const int bw = block_size_wide[bsize];
+    const int bh = block_size_high[bsize];
     prune |= dct_vs_idtx(p->src_diff, bw, bw, bh);
   }
 
   return prune;
 }
-#endif  // CONFIG_EXT_TX
 
 // Performance drop: 0.3%, Speed improvement: 5%
 static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
@@ -1182,61 +1354,342 @@ static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
                           pd->dst.stride);
 }
 
-#if CONFIG_EXT_TX
 // 1D Transforms used in inter set, this needs to be changed if
 // ext_tx_used_inter is changed
 static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = {
-  { 1, 0, 0, 0 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 0, 0, 1 },
-#if CONFIG_MRC_TX
+  { 1, 0, 0, 0 },
+  { 1, 1, 1, 1 },
+  { 1, 1, 1, 1 },
   { 1, 0, 0, 1 },
-#endif  // CONFIG_MRC_TX
 };
-#endif  // CONFIG_EXT_TX
 
-static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
-                          const MACROBLOCKD *const xd, int tx_set) {
-#if CONFIG_EXT_TX
-  const int *tx_set_1D = tx_set >= 0 ? ext_tx_used_inter_1D[tx_set] : NULL;
-#else
-  const int tx_set_1D[TX_TYPES_1D] = { 0 };
-#endif  // CONFIG_EXT_TX
+static void get_energy_distribution_finer(const int16_t *diff, int stride,
+                                          int bw, int bh, float *hordist,
+                                          float *verdist) {
+  // First compute downscaled block energy values (esq); downscale factors
+  // are defined by w_shift and h_shift.
+  unsigned int esq[256];
+  const int w_shift = bw <= 8 ? 0 : 1;
+  const int h_shift = bh <= 8 ? 0 : 1;
+  const int esq_w = bw <= 8 ? bw : bw / 2;
+  const int esq_h = bh <= 8 ? bh : bh / 2;
+  const int esq_sz = esq_w * esq_h;
+  int i, j;
+  memset(esq, 0, esq_sz * sizeof(esq[0]));
+  for (i = 0; i < bh; i++) {
+    unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+    const int16_t *cur_diff_row = diff + i * stride;
+    for (j = 0; j < bw; j++) {
+      cur_esq_row[j >> w_shift] += cur_diff_row[j] * cur_diff_row[j];
+    }
+  }
 
+  uint64_t total = 0;
+  for (i = 0; i < esq_sz; i++) total += esq[i];
+
+  // Output hordist and verdist arrays are normalized 1D projections of esq
+  if (total == 0) {
+    float hor_val = 1.0f / esq_w;
+    for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val;
+    float ver_val = 1.0f / esq_h;
+    for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val;
+    return;
+  }
+
+  const float e_recip = 1.0f / (float)total;
+  memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0]));
+  memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0]));
+  const unsigned int *cur_esq_row;
+  for (i = 0; i < esq_h - 1; i++) {
+    cur_esq_row = esq + i * esq_w;
+    for (j = 0; j < esq_w - 1; j++) {
+      hordist[j] += (float)cur_esq_row[j];
+      verdist[i] += (float)cur_esq_row[j];
+    }
+    verdist[i] += (float)cur_esq_row[j];
+  }
+  cur_esq_row = esq + i * esq_w;
+  for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j];
+
+  for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip;
+  for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip;
+}
+
+// Similar to get_horver_correlation, but also takes into account first
+// row/column, when computing horizontal/vertical correlation.
+static void get_horver_correlation_full(const int16_t *diff, int stride, int w,
+                                        int h, float *hcorr, float *vcorr) {
+  const float num_hor = (float)(h * (w - 1));
+  const float num_ver = (float)((h - 1) * w);
+  int i, j;
+
+  // The following notation is used:
+  // x - current pixel
+  // y - left neighbor pixel
+  // z - top neighbor pixel
+  int64_t xy_sum = 0, xz_sum = 0;
+  int64_t xhor_sum = 0, xver_sum = 0, y_sum = 0, z_sum = 0;
+  int64_t x2hor_sum = 0, x2ver_sum = 0, y2_sum = 0, z2_sum = 0;
+
+  int16_t x, y, z;
+  for (j = 1; j < w; ++j) {
+    x = diff[j];
+    y = diff[j - 1];
+    xy_sum += x * y;
+    xhor_sum += x;
+    y_sum += y;
+    x2hor_sum += x * x;
+    y2_sum += y * y;
+  }
+  for (i = 1; i < h; ++i) {
+    x = diff[i * stride];
+    z = diff[(i - 1) * stride];
+    xz_sum += x * z;
+    xver_sum += x;
+    z_sum += z;
+    x2ver_sum += x * x;
+    z2_sum += z * z;
+    for (j = 1; j < w; ++j) {
+      x = diff[i * stride + j];
+      y = diff[i * stride + j - 1];
+      z = diff[(i - 1) * stride + j];
+      xy_sum += x * y;
+      xz_sum += x * z;
+      xhor_sum += x;
+      xver_sum += x;
+      y_sum += y;
+      z_sum += z;
+      x2hor_sum += x * x;
+      x2ver_sum += x * x;
+      y2_sum += y * y;
+      z2_sum += z * z;
+    }
+  }
+  const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+  const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+  const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+  const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+  const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+  const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+  *hcorr = *vcorr = 1;
+  if (xhor_var_n > 0 && y_var_n > 0) {
+    *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+    *hcorr = *hcorr < 0 ? 0 : *hcorr;
+  }
+  if (xver_var_n > 0 && z_var_n > 0) {
+    *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+    *vcorr = *vcorr < 0 ? 0 : *vcorr;
+  }
+}
+
+// Transforms raw scores into a probability distribution across 16 TX types
+static void score_2D_transform_pow8(float *scores_2D, float shift) {
+  float sum = 0.0f;
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    float v, v2, v4;
+    v = AOMMAX(scores_2D[i] + shift, 0.0f);
+    v2 = v * v;
+    v4 = v2 * v2;
+    scores_2D[i] = v4 * v4;
+    sum += scores_2D[i];
+  }
+  for (i = 0; i < 16; i++) scores_2D[i] /= sum;
+}
+
+// These thresholds were calibrated to provide a certain number of TX types
+// pruned by the model on average, i.e. selecting a threshold with index i
+// will lead to pruning i+1 TX types on average
+static const float *prune_2D_adaptive_thresholds[] = {
+  // TX_4X4
+  (float[]){ 0.02014f, 0.02722f, 0.03430f, 0.04114f, 0.04724f, 0.05212f,
+             0.05627f, 0.06018f, 0.06409f, 0.06824f, 0.07312f, 0.07849f,
+             0.08606f, 0.09827f },
+  // TX_8X8
+  (float[]){ 0.00745f, 0.01355f, 0.02039f, 0.02795f, 0.03625f, 0.04407f,
+             0.05042f, 0.05579f, 0.06067f, 0.06604f, 0.07239f, 0.08093f,
+             0.09363f, 0.11682f },
+  // TX_16X16
+  (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
+             0.06897f, 0.07629f, 0.08875f, 0.11169f },
+  // TX_32X32
+  NULL,
+  // TX_64X64
+  NULL,
+  // TX_4X8
+  (float[]){ 0.01282f, 0.02087f, 0.02844f, 0.03601f, 0.04285f, 0.04871f,
+             0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f,
+             0.09119f, 0.10828f },
+  // TX_8X4
+  (float[]){ 0.01184f, 0.01941f, 0.02722f, 0.03503f, 0.04187f, 0.04822f,
+             0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f,
+             0.09167f, 0.10974f },
+  // TX_8X16
+  (float[]){ 0.00525f, 0.01135f, 0.01819f, 0.02576f, 0.03357f, 0.04114f,
+             0.04773f, 0.05383f, 0.05920f, 0.06506f, 0.07190f, 0.08118f,
+             0.09509f, 0.12097f },
+  // TX_16X8
+  (float[]){ 0.00525f, 0.01160f, 0.01819f, 0.02527f, 0.03308f, 0.04065f,
+             0.04773f, 0.05383f, 0.05969f, 0.06531f, 0.07214f, 0.08118f,
+             0.09485f, 0.12048f },
+  // TX_16X32
+  (float[]){ 0.01257f, 0.02576f, 0.03723f, 0.04578f, 0.05212f, 0.05798f,
+             0.06506f, 0.07385f, 0.08606f, 0.10925f },
+  // TX_32X16
+  (float[]){ 0.01233f, 0.02527f, 0.03699f, 0.04602f, 0.05286f, 0.05896f,
+             0.06531f, 0.07336f, 0.08582f, 0.11072f },
+  // TX_32X64
+  NULL,
+  // TX_64X32
+  NULL,
+  // TX_4X16
+  NULL,
+  // TX_16X4
+  NULL,
+  // TX_8X32
+  NULL,
+  // TX_32X8
+  NULL,
+  // TX_16X64
+  NULL,
+  // TX_64X16
+  NULL,
+};
+
+static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                       int blk_row, int blk_col, TxSetType tx_set_type,
+                       TX_TYPE_PRUNE_MODE prune_mode) {
+  static const int tx_type_table_2D[16] = {
+    DCT_DCT,      DCT_ADST,      DCT_FLIPADST,      V_DCT,
+    ADST_DCT,     ADST_ADST,     ADST_FLIPADST,     V_ADST,
+    FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
+    H_DCT,        H_ADST,        H_FLIPADST,        IDTX
+  };
+  if (tx_set_type != EXT_TX_SET_ALL16 &&
+      tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT)
+    return 0;
+  const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
+  const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
+  if (!nn_config_hor || !nn_config_ver) return 0;  // Model not established yet.
+
+  aom_clear_system_state();
+  float hfeatures[16], vfeatures[16];
+  float hscores[4], vscores[4];
+  float scores_2D[16];
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  const int hfeatures_num = bw <= 8 ? bw : bw / 2;
+  const int vfeatures_num = bh <= 8 ? bh : bh / 2;
+  assert(hfeatures_num <= 16);
+  assert(vfeatures_num <= 16);
+
+  const struct macroblock_plane *const p = &x->plane[0];
+  const int diff_stride = block_size_wide[bsize];
+  const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+  get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
+                                vfeatures);
+  get_horver_correlation_full(diff, diff_stride, bw, bh,
+                              &hfeatures[hfeatures_num - 1],
+                              &vfeatures[vfeatures_num - 1]);
+  av1_nn_predict(hfeatures, nn_config_hor, hscores);
+  av1_nn_predict(vfeatures, nn_config_ver, vscores);
+
+  float score_2D_average = 0.0f;
+  for (int i = 0; i < 4; i++) {
+    float *cur_scores_2D = scores_2D + i * 4;
+    cur_scores_2D[0] = vscores[i] * hscores[0];
+    cur_scores_2D[1] = vscores[i] * hscores[1];
+    cur_scores_2D[2] = vscores[i] * hscores[2];
+    cur_scores_2D[3] = vscores[i] * hscores[3];
+    score_2D_average += cur_scores_2D[0] + cur_scores_2D[1] + cur_scores_2D[2] +
+                        cur_scores_2D[3];
+  }
+  score_2D_average /= 16;
+  score_2D_transform_pow8(scores_2D, (20 - score_2D_average));
+
+  // Always keep the TX type with the highest score, prune all others with
+  // score below score_thresh.
+  int max_score_i = 0;
+  float max_score = 0.0f;
+  for (int i = 0; i < 16; i++) {
+    if (scores_2D[i] > max_score &&
+        av1_ext_tx_used[tx_set_type][tx_type_table_2D[i]]) {
+      max_score = scores_2D[i];
+      max_score_i = i;
+    }
+  }
+
+  int pruning_aggressiveness = 0;
+  if (prune_mode == PRUNE_2D_ACCURATE) {
+    if (tx_set_type == EXT_TX_SET_ALL16)
+      pruning_aggressiveness = 6;
+    else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
+      pruning_aggressiveness = 4;
+  } else if (prune_mode == PRUNE_2D_FAST) {
+    if (tx_set_type == EXT_TX_SET_ALL16)
+      pruning_aggressiveness = 10;
+    else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
+      pruning_aggressiveness = 7;
+  }
+  const float score_thresh =
+      prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1];
+
+  int prune_bitmask = 0;
+  for (int i = 0; i < 16; i++) {
+    if (scores_2D[i] < score_thresh && i != max_score_i)
+      prune_bitmask |= (1 << tx_type_table_2D[i]);
+  }
+  return prune_bitmask;
+}
+
+static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
+                     const MACROBLOCKD *const xd, int tx_set_type) {
+  av1_zero(x->tx_search_prune);
+  x->tx_split_prune_flag = 0;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  if (!is_inter_block(mbmi) || cpi->sf.tx_type_search.prune_mode == NO_PRUNE ||
+      x->use_default_inter_tx_type || xd->lossless[mbmi->segment_id] ||
+      x->cb_partition_scan)
+    return;
+  int tx_set = ext_tx_set_index[1][tx_set_type];
+  assert(tx_set >= 0);
+  const int *tx_set_1D = ext_tx_used_inter_1D[tx_set];
   switch (cpi->sf.tx_type_search.prune_mode) {
-    case NO_PRUNE: return 0; break;
+    case NO_PRUNE: return;
     case PRUNE_ONE:
-      if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D]))
-        return 0;
-      return prune_one_for_sby(cpi, bsize, x, xd);
+      if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) return;
+      x->tx_search_prune[tx_set_type] = prune_one_for_sby(cpi, bsize, x, xd);
       break;
-#if CONFIG_EXT_TX
     case PRUNE_TWO:
-      if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
-        if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return 0;
-        return prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
+      if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
+        if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return;
+        x->tx_search_prune[tx_set_type] =
+            prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
+      }
+      if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) {
+        x->tx_search_prune[tx_set_type] =
+            prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
       }
-      if ((tx_set >= 0) && !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D]))
-        return prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
-      return prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
+      x->tx_search_prune[tx_set_type] =
+          prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
       break;
-#endif  // CONFIG_EXT_TX
+    case PRUNE_2D_ACCURATE:
+    case PRUNE_2D_FAST: break;
+    default: assert(0);
   }
-  assert(0);
-  return 0;
 }
 
-static int do_tx_type_search(TX_TYPE tx_type, int prune) {
-// TODO(sarahparker) implement for non ext tx
-#if CONFIG_EXT_TX
-  return !(((prune >> vtx_tab[tx_type]) & 1) |
-           ((prune >> (htx_tab[tx_type] + 8)) & 1));
-#else
-  // temporary to avoid compiler warnings
-  (void)vtx_tab;
-  (void)htx_tab;
-  (void)tx_type;
-  (void)prune;
-  return 1;
-#endif  // CONFIG_EXT_TX
+static int do_tx_type_search(TX_TYPE tx_type, int prune,
+                             TX_TYPE_PRUNE_MODE mode) {
+  // TODO(sarahparker) implement for non ext tx
+  if (mode >= PRUNE_2D_ACCURATE) {
+    return !((prune >> tx_type) & 1);
+  } else {
+    return !(((prune >> vtx_tab[tx_type]) & 1) |
+             ((prune >> (htx_tab[tx_type] + 8)) & 1));
+  }
 }
 
 static void model_rd_from_sse(const AV1_COMP *const cpi,
@@ -1245,16 +1698,12 @@ static void model_rd_from_sse(const AV1_COMP *const cpi,
                               int64_t *dist) {
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int dequant_shift =
-#if CONFIG_HIGHBITDEPTH
-      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
-#endif  // CONFIG_HIGHBITDEPTH
-                                                    3;
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
 
   // Fast approximate the modelling function.
   if (cpi->sf.simple_model_rd_from_var) {
     const int64_t square_error = sse;
-    int quantizer = (pd->dequant[1] >> dequant_shift);
-
+    int quantizer = (pd->dequant_Q3[1] >> dequant_shift);
     if (quantizer < 120)
       *rate = (int)((square_error * (280 - quantizer)) >>
                     (16 - AV1_PROB_COST_SHIFT));
@@ -1263,22 +1712,48 @@ static void model_rd_from_sse(const AV1_COMP *const cpi,
     *dist = (square_error * quantizer) >> 8;
   } else {
     av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize],
-                                 pd->dequant[1] >> dequant_shift, rate, dist);
+                                 pd->dequant_Q3[1] >> dequant_shift, rate,
+                                 dist);
   }
-
   *dist <<= 4;
 }
 
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  int64_t total_sse = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblock_plane *const p = &x->plane[plane];
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
+                                               pd->subsampling_y);
+    unsigned int sse;
+
+    if (x->skip_chroma_rd && plane) continue;
+
+    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                       &sse);
+    total_sse += sse;
+  }
+  total_sse <<= 4;
+  return total_sse;
+}
+#endif
+
 static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                             MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
                             int plane_to, int *out_rate_sum,
                             int64_t *out_dist_sum, int *skip_txfm_sb,
-                            int64_t *skip_sse_sb) {
+                            int64_t *skip_sse_sb, int *plane_rate,
+                            int64_t *plane_sse, int64_t *plane_dist) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
   int plane;
-  const int ref = xd->mi[0]->mbmi.ref_frame[0];
+  const int ref = xd->mi[0]->ref_frame[0];
 
   int64_t rate_sum = 0;
   int64_t dist_sum = 0;
@@ -1289,19 +1764,13 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
   for (plane = plane_from; plane <= plane_to; ++plane) {
     struct macroblock_plane *const p = &x->plane[plane];
     struct macroblockd_plane *const pd = &xd->plane[plane];
-#if CONFIG_CHROMA_SUB8X8
-    const BLOCK_SIZE bs = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#else
-    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-#endif  // CONFIG_CHROMA_SUB8X8
-
+    const BLOCK_SIZE bs =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
     unsigned int sse;
     int rate;
     int64_t dist;
 
-#if CONFIG_CB4X4
     if (x->skip_chroma_rd && plane) continue;
-#endif  // CONFIG_CB4X4
 
     // TODO(geza): Write direct sse functions that do not compute
     // variance as well.
@@ -1316,14 +1785,54 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
 
     rate_sum += rate;
     dist_sum += dist;
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
   }
 
-  *skip_txfm_sb = total_sse == 0;
-  *skip_sse_sb = total_sse << 4;
+  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
   *out_rate_sum = (int)rate_sum;
   *out_dist_sum = dist_sum;
 }
 
+static void check_block_skip(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                             MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+                             int plane_to, int *skip_txfm_sb) {
+  *skip_txfm_sb = 1;
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    struct macroblock_plane *const p = &x->plane[plane];
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE bs =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    unsigned int sse;
+
+    if (x->skip_chroma_rd && plane) continue;
+
+    // Since fast HBD variance functions scale down sse by 4 bit, we first use
+    // fast vf implementation to rule out blocks with non-zero scaled sse. Then,
+    // only if the source is HBD and the scaled sse is 0, accurate sse
+    // computation is applied to determine if the sse is really 0. This step is
+    // necessary for HBD lossless coding.
+    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                       &sse);
+    if (sse) {
+      *skip_txfm_sb = 0;
+      return;
+    } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint64_t sse64 = aom_highbd_sse_odd_size(
+          p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+          block_size_wide[bs], block_size_high[bs]);
+
+      if (sse64) {
+        *skip_txfm_sb = 0;
+        return;
+      }
+    }
+  }
+  return;
+}
+
 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
@@ -1339,20 +1848,6 @@ int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
   return error;
 }
 
-int64_t av1_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
-                             int block_size) {
-  int i;
-  int64_t error = 0;
-
-  for (i = 0; i < block_size; i++) {
-    const int diff = coeff[i] - dqcoeff[i];
-    error += diff * diff;
-  }
-
-  return error;
-}
-
-#if CONFIG_HIGHBITDEPTH
 int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff, intptr_t block_size,
                                  int64_t *ssz, int bd) {
@@ -1373,236 +1868,13 @@ int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
   *ssz = sqcoeff;
   return error;
 }
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_PVQ
-// Without PVQ, av1_block_error_c() return two kind of errors,
-// 1) reconstruction (i.e. decoded) error and
-// 2) Squared sum of transformed residue (i.e. 'coeff')
-// However, if PVQ is enabled, coeff does not keep the transformed residue
-// but instead a transformed original is kept.
-// Hence, new parameter ref vector (i.e. transformed predicted signal)
-// is required to derive the residue signal,
-// i.e. coeff - ref = residue (all transformed).
-
-#if CONFIG_HIGHBITDEPTH
-static int64_t av1_highbd_block_error2_c(const tran_low_t *coeff,
-                                         const tran_low_t *dqcoeff,
-                                         const tran_low_t *ref,
-                                         intptr_t block_size, int64_t *ssz,
-                                         int bd) {
-  int64_t error;
-  int64_t sqcoeff;
-  int shift = 2 * (bd - 8);
-  int rounding = shift > 0 ? 1 << (shift - 1) : 0;
-  // Use the existing sse codes for calculating distortion of decoded signal:
-  // i.e. (orig - decoded)^2
-  // For high bit depth, throw away ssz until a 32-bit version of
-  // av1_block_error_fp is written.
-  int64_t ssz_trash;
-  error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash);
-  // prediction residue^2 = (orig - ref)^2
-  sqcoeff = av1_block_error(coeff, ref, block_size, &ssz_trash);
-  error = (error + rounding) >> shift;
-  sqcoeff = (sqcoeff + rounding) >> shift;
-  *ssz = sqcoeff;
-  return error;
-}
-#else
-// TODO(yushin) : Since 4x4 case does not need ssz, better to refactor into
-// a separate function that does not do the extra computations for ssz.
-static int64_t av1_block_error2_c(const tran_low_t *coeff,
-                                  const tran_low_t *dqcoeff,
-                                  const tran_low_t *ref, intptr_t block_size,
-                                  int64_t *ssz) {
-  int64_t error;
-  int64_t ssz_trash;
-  // Use the existing sse codes for calculating distortion of decoded signal:
-  // i.e. (orig - decoded)^2
-  error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash);
-  // prediction residue^2 = (orig - ref)^2
-  *ssz = av1_block_error(coeff, ref, block_size, &ssz_trash);
-  return error;
-}
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_PVQ
-
-#if !CONFIG_PVQ || CONFIG_VAR_TX
-#if !CONFIG_LV_MAP
-static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
-                       int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
-                       const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
-                       int use_fast_coef_costing) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const struct macroblock_plane *p = &x->plane[plane];
-  const struct macroblockd_plane *pd = &xd->plane[plane];
-  const PLANE_TYPE type = pd->plane_type;
-  const uint16_t *band_count = &band_count_table[tx_size][1];
-  const int eob = p->eobs[block];
-  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  const TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size];
-  uint8_t token_cache[MAX_TX_SQUARE];
-  int pt = combine_entropy_contexts(*a, *l);
-  int c, cost;
-  const int16_t *scan = scan_order->scan;
-  const int16_t *nb = scan_order->neighbors;
-  const int ref = is_inter_block(mbmi);
-  int(*head_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
-      x->token_head_costs[tx_size_ctx][type][ref];
-  int(*tail_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
-      x->token_tail_costs[tx_size_ctx][type][ref];
-  const int seg_eob = av1_get_tx_eob(&cm->seg, mbmi->segment_id, tx_size);
-  int eob_val;
-
-#if CONFIG_HIGHBITDEPTH
-  const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
-#else
-  const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8);
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if !CONFIG_VAR_TX && !CONFIG_SUPERTX
-  // Check for consistency of tx_size with mode info
-  assert(tx_size == av1_get_tx_size(plane, xd));
-#endif  // !CONFIG_VAR_TX && !CONFIG_SUPERTX
-  (void)cm;
-
-  if (eob == 0) {
-    // block zero
-    cost = (*head_token_costs)[pt][0];
-  } else {
-    if (use_fast_coef_costing) {
-      int band_left = *band_count++;
-
-      // dc token
-      int v = qcoeff[0];
-      int16_t prev_t;
-      cost = av1_get_token_cost(v, &prev_t, cat6_bits);
-      eob_val = (eob == 1) ? EARLY_EOB : NO_EOB;
-      cost += av1_get_coeff_token_cost(
-          prev_t, eob_val, 1, (*head_token_costs)[pt], (*tail_token_costs)[pt]);
-
-      token_cache[0] = av1_pt_energy_class[prev_t];
-      ++head_token_costs;
-      ++tail_token_costs;
-
-      // ac tokens
-      for (c = 1; c < eob; c++) {
-        const int rc = scan[c];
-        int16_t t;
-
-        v = qcoeff[rc];
-        cost += av1_get_token_cost(v, &t, cat6_bits);
-        eob_val =
-            (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
-        cost += av1_get_coeff_token_cost(t, eob_val, 0,
-                                         (*head_token_costs)[!prev_t],
-                                         (*tail_token_costs)[!prev_t]);
-        prev_t = t;
-        if (!--band_left) {
-          band_left = *band_count++;
-          ++head_token_costs;
-          ++tail_token_costs;
-        }
-      }
-    } else {  // !use_fast_coef_costing
-      int band_left = *band_count++;
-
-      // dc token
-      int v = qcoeff[0];
-      int16_t tok;
-      cost = av1_get_token_cost(v, &tok, cat6_bits);
-      eob_val = (eob == 1) ? EARLY_EOB : NO_EOB;
-      cost += av1_get_coeff_token_cost(tok, eob_val, 1, (*head_token_costs)[pt],
-                                       (*tail_token_costs)[pt]);
-
-      token_cache[0] = av1_pt_energy_class[tok];
-      ++head_token_costs;
-      ++tail_token_costs;
-
-      // ac tokens
-      for (c = 1; c < eob; c++) {
-        const int rc = scan[c];
-
-        v = qcoeff[rc];
-        cost += av1_get_token_cost(v, &tok, cat6_bits);
-        pt = get_coef_context(nb, token_cache, c);
-        eob_val =
-            (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
-        cost += av1_get_coeff_token_cost(
-            tok, eob_val, 0, (*head_token_costs)[pt], (*tail_token_costs)[pt]);
-        token_cache[rc] = av1_pt_energy_class[tok];
-        if (!--band_left) {
-          band_left = *band_count++;
-          ++head_token_costs;
-          ++tail_token_costs;
-        }
-      }
-    }
-  }
-
-  return cost;
-}
-#endif  // !CONFIG_LV_MAP
-
-int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
-                    int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                    const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a,
-                    const ENTROPY_CONTEXT *l, int use_fast_coef_costing) {
-  const AV1_COMMON *const cm = &cpi->common;
-#if !CONFIG_LV_MAP
-  (void)blk_row;
-  (void)blk_col;
-#if CONFIG_MRC_TX
-  const MACROBLOCKD *xd = &x->e_mbd;
-  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const TX_TYPE tx_type = av1_get_tx_type(xd->plane[plane].plane_type, xd,
-                                          blk_row, blk_col, block, tx_size);
-  const int is_inter = is_inter_block(mbmi);
-  if (tx_type == MRC_DCT && ((is_inter && SIGNAL_MRC_MASK_INTER) ||
-                             (!is_inter && SIGNAL_MRC_MASK_INTRA))) {
-    const int mrc_mask_cost =
-        av1_cost_color_map(x, plane, block, mbmi->sb_type, tx_size, MRC_MAP);
-    return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l,
-                       use_fast_coef_costing) +
-           mrc_mask_cost;
-  }
-#endif
-  return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l,
-                     use_fast_coef_costing);
-#else  // !CONFIG_LV_MAP
-  (void)scan_order;
-  (void)use_fast_coef_costing;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const struct macroblockd_plane *pd = &xd->plane[plane];
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-#if CONFIG_CHROMA_SUB8X8
-  const BLOCK_SIZE plane_bsize =
-      AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#elif CONFIG_CB4X4
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#else   // CONFIG_CB4X4
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
-#endif  // CONFIG_CB4X4
-
-  TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
-  return av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block, tx_size,
-                             &txb_ctx);
-#endif  // !CONFIG_LV_MAP
-}
-#endif  // !CONFIG_PVQ || CONFIG_VAR_TX
 
 // Get transform block visible dimensions cropped to the MI units.
 static void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
                                BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
                                BLOCK_SIZE tx_bsize, int *width, int *height,
                                int *visible_width, int *visible_height) {
-#if !(CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX))
   assert(tx_bsize <= plane_bsize);
-#endif
   int txb_height = block_size_high[tx_bsize];
   int txb_width = block_size_wide[tx_bsize];
   const int block_height = block_size_high[plane_bsize];
@@ -1659,234 +1931,900 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
 
 // Compute the pixel domain distortion from diff on all visible 4x4s in the
 // transform block.
-static int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
-                               const int16_t *diff, const int diff_stride,
-                               int blk_row, int blk_col,
-                               const BLOCK_SIZE plane_bsize,
-                               const BLOCK_SIZE tx_bsize) {
+static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
+                                      int blk_row, int blk_col,
+                                      const BLOCK_SIZE plane_bsize,
+                                      const BLOCK_SIZE tx_bsize) {
   int visible_rows, visible_cols;
   const MACROBLOCKD *xd = &x->e_mbd;
-#if CONFIG_DIST_8X8
-  int txb_height = block_size_high[tx_bsize];
-  int txb_width = block_size_wide[tx_bsize];
-  const int src_stride = x->plane[plane].src.stride;
-  const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0];
-  const uint8_t *src = &x->plane[plane].src.buf[src_idx];
-#endif
-
   get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
                      NULL, &visible_cols, &visible_rows);
-
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *diff = x->plane[plane].src_diff;
 #if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8)
-    return av1_dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width,
-                             txb_height, visible_cols, visible_rows, x->qindex);
-  else
+  int txb_height = block_size_high[tx_bsize];
+  int txb_width = block_size_wide[tx_bsize];
+  if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8) {
+    const int src_stride = x->plane[plane].src.stride;
+    const int src_idx = (blk_row * src_stride + blk_col)
+                        << tx_size_wide_log2[0];
+    const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+    return dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width,
+                         txb_height, visible_cols, visible_rows, x->qindex);
+  }
 #endif
-    return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols,
-                                  visible_rows);
+  diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]);
+  return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
 }
 
-int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) {
-  int val_count[256];
-  memset(val_count, 0, sizeof(val_count));
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                     int *val_count) {
+  const int max_pix_val = 1 << 8;
+  memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
   for (int r = 0; r < rows; ++r) {
     for (int c = 0; c < cols; ++c) {
-      ++val_count[src[r * stride + c]];
+      const int this_val = src[r * stride + c];
+      assert(this_val < max_pix_val);
+      ++val_count[this_val];
     }
   }
   int n = 0;
-  for (int i = 0; i < 256; ++i) {
+  for (int i = 0; i < max_pix_val; ++i) {
     if (val_count[i]) ++n;
   }
   return n;
 }
 
-#if CONFIG_HIGHBITDEPTH
 int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
-                            int bit_depth) {
+                            int bit_depth, int *val_count) {
   assert(bit_depth <= 12);
+  const int max_pix_val = 1 << bit_depth;
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  int val_count[1 << 12];
-  memset(val_count, 0, (1 << 12) * sizeof(val_count[0]));
+  memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
   for (int r = 0; r < rows; ++r) {
     for (int c = 0; c < cols; ++c) {
-      ++val_count[src[r * stride + c]];
+      const int this_val = src[r * stride + c];
+      assert(this_val < max_pix_val);
+      if (this_val >= max_pix_val) return 0;
+      ++val_count[this_val];
     }
   }
   int n = 0;
-  for (int i = 0; i < (1 << bit_depth); ++i) {
+  for (int i = 0; i < max_pix_val; ++i) {
     if (val_count[i]) ++n;
   }
   return n;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
-void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                    BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
-                    TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,
-                    OUTPUT_STATUS output_status) {
+static void inverse_transform_block_facade(MACROBLOCKD *xd, int plane,
+                                           int block, int blk_row, int blk_col,
+                                           int eob, int reduced_tx_set) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+                                          tx_size, reduced_tx_set);
+  const int dst_stride = pd->dst.stride;
+  uint8_t *dst =
+      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+  av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+                              dst_stride, eob, reduced_tx_set);
+}
+
+static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash);
+
+static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row,
+                                   int blk_col, BLOCK_SIZE plane_bsize,
+                                   TX_SIZE tx_size) {
+  int16_t tmp_data[64 * 64];
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *diff = x->plane[plane].src_diff;
+  const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col;
+  const int txb_w = tx_size_wide[tx_size];
+  const int txb_h = tx_size_high[tx_size];
+  uint8_t *hash_data = (uint8_t *)cur_diff_row;
+  if (txb_w != diff_stride) {
+    int16_t *cur_hash_row = tmp_data;
+    for (int i = 0; i < txb_h; i++) {
+      memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w);
+      cur_hash_row += txb_w;
+      cur_diff_row += diff_stride;
+    }
+    hash_data = (uint8_t *)tmp_data;
+  }
+  CRC32C *crc = &x->mb_rd_record.crc_calculator;
+  const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h);
+  return (hash << 5) + tx_size;
+}
+
+static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
+                                        TX_SIZE tx_size, int64_t *out_dist,
+                                        int64_t *out_sse) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
-#if CONFIG_DIST_8X8
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-#else   // CONFIG_DIST_8X8
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-#endif  // CONFIG_DIST_8X8
+  // Transform domain distortion computation is more efficient as it does
+  // not involve an inverse transform, but it is less accurate.
+  const int buffer_length = av1_get_max_eob(tx_size);
+  int64_t this_sse;
+  // TX-domain results need to shift down to Q2/D10 to match pixel
+  // domain distortion values which are in Q2^2
+  int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 
-  if (cpi->sf.use_transform_domain_distortion
-#if CONFIG_DIST_8X8
-      && !x->using_dist_8x8
-#endif
-      ) {
-    // Transform domain distortion computation is more efficient as it does
-    // not involve an inverse transform, but it is less accurate.
-    const int buffer_length = tx_size_2d[tx_size];
-    int64_t this_sse;
-    int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
-    tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-    tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-#if CONFIG_PVQ
-    tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
-
-#if CONFIG_HIGHBITDEPTH
-    const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
-    *out_dist = av1_highbd_block_error2_c(coeff, dqcoeff, ref_coeff,
-                                          buffer_length, &this_sse, bd);
-#else
-    *out_dist =
-        av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length, &this_sse);
-#endif  // CONFIG_HIGHBITDEPTH
-#else   // !CONFIG_PVQ
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length,
-                                         &this_sse, xd->bd);
-    else
-#endif
-      *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
-#endif  // CONFIG_PVQ
-    *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
-    *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
+                                       xd->bd);
+  else
+    *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+
+  *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
+  *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
+}
+
+static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
+                                           int plane, BLOCK_SIZE plane_bsize,
+                                           int block, int blk_row, int blk_col,
+                                           TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const uint16_t eob = p->eobs[block];
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  const int bsw = block_size_wide[tx_bsize];
+  const int bsh = block_size_high[tx_bsize];
+  const int src_stride = x->plane[plane].src.stride;
+  const int dst_stride = xd->plane[plane].dst.stride;
+  // Scale the transform block index to pixel unit.
+  const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0];
+  const int dst_idx = (blk_row * dst_stride + blk_col) << tx_size_wide_log2[0];
+  const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+  const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
+  const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+
+  assert(cpi != NULL);
+  assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
+
+  uint8_t *recon;
+  DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    recon = CONVERT_TO_BYTEPTR(recon16);
+    av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride,
+                                   CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw,
+                                   bsh, NULL, NULL, 0, 0, NULL, xd->bd);
   } else {
-    const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
-#if !CONFIG_PVQ || CONFIG_DIST_8X8
-    const int bsw = block_size_wide[tx_bsize];
-    const int bsh = block_size_high[tx_bsize];
-#endif
-    const int src_stride = x->plane[plane].src.stride;
-    const int dst_stride = xd->plane[plane].dst.stride;
-    // Scale the transform block index to pixel unit.
-    const int src_idx = (blk_row * src_stride + blk_col)
-                        << tx_size_wide_log2[0];
-    const int dst_idx = (blk_row * dst_stride + blk_col)
-                        << tx_size_wide_log2[0];
-    const uint8_t *src = &x->plane[plane].src.buf[src_idx];
-    const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
-    const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-    const uint16_t eob = p->eobs[block];
+    recon = (uint8_t *)recon16;
+    av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL,
+                            NULL, 0, 0, NULL);
+  }
 
-    assert(cpi != NULL);
-    assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size,
+                                    cpi->common.reduced_tx_set_used);
+  av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
+                              MAX_TX_SIZE, eob,
+                              cpi->common.reduced_tx_set_used);
+#if CONFIG_DIST_8X8
+  if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
+    // Save decoded pixels for inter block in pd->pred to avoid
+    // block_8x8_rd_txfm_daala_dist() need to produce them
+    // by calling av1_inverse_transform_block() again.
+    const int pred_stride = block_size_wide[plane_bsize];
+    const int pred_idx = (blk_row * pred_stride + blk_col)
+                         << tx_size_wide_log2[0];
+    int16_t *pred = &x->pred_luma[pred_idx];
+    int i, j;
 
-    {
-      const int diff_stride = block_size_wide[plane_bsize];
-      const int diff_idx = (blk_row * diff_stride + blk_col)
-                           << tx_size_wide_log2[0];
-      const int16_t *diff = &p->src_diff[diff_idx];
-      *out_sse = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col,
-                                 plane_bsize, tx_bsize);
-#if CONFIG_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        *out_sse = ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2);
-#endif  // CONFIG_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      for (j = 0; j < bsh; j++)
+        for (i = 0; i < bsw; i++)
+          pred[j * pred_stride + i] =
+              CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i];
+    } else {
+      for (j = 0; j < bsh; j++)
+        for (i = 0; i < bsw; i++)
+          pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
     }
-    *out_sse *= 16;
+  }
+#endif  // CONFIG_DIST_8X8
+  return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
+                         blk_row, blk_col, plane_bsize, tx_bsize);
+}
 
-    if (eob) {
-      if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
-        *out_dist = pixel_dist(cpi, x, plane, src, src_stride, dst, dst_stride,
-                               blk_row, blk_col, plane_bsize, tx_bsize);
-      } else {
-#if CONFIG_HIGHBITDEPTH
-        uint8_t *recon;
-        DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
-
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-          recon = CONVERT_TO_BYTEPTR(recon16);
-        else
-          recon = (uint8_t *)recon16;
-#else
-        DECLARE_ALIGNED(16, uint8_t, recon[MAX_TX_SQUARE]);
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if !CONFIG_PVQ
-#if CONFIG_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          aom_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0,
-                                   NULL, 0, bsw, bsh, xd->bd);
-        } else {
-#endif  // CONFIG_HIGHBITDEPTH
-          aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, NULL,
-                            0, bsw, bsh);
-#if CONFIG_HIGHBITDEPTH
-        }
-#endif  // CONFIG_HIGHBITDEPTH
-#else
-        (void)dst;
-#endif  // !CONFIG_PVQ
-
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-        uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-        const PLANE_TYPE plane_type = get_plane_type(plane);
-        TX_TYPE tx_type =
-            av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-        av1_inverse_transform_block(xd, dqcoeff,
-#if CONFIG_LGT_FROM_PRED
-                                    xd->mi[0]->mbmi.mode,
-#endif
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                    mrc_mask,
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                    tx_type, tx_size, recon, MAX_TX_SIZE, eob);
+static double get_mean(const int16_t *diff, int stride, int w, int h) {
+  double sum = 0.0;
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; ++i) {
+      sum += diff[j * stride + i];
+    }
+  }
+  assert(w > 0 && h > 0);
+  return sum / (w * h);
+}
+
+static double get_sse_norm(const int16_t *diff, int stride, int w, int h) {
+  double sum = 0.0;
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; ++i) {
+      const int err = diff[j * stride + i];
+      sum += err * err;
+    }
+  }
+  assert(w > 0 && h > 0);
+  return sum / (w * h);
+}
+
+static double get_sad_norm(const int16_t *diff, int stride, int w, int h) {
+  double sum = 0.0;
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; ++i) {
+      sum += abs(diff[j * stride + i]);
+    }
+  }
+  assert(w > 0 && h > 0);
+  return sum / (w * h);
+}
+
+static void get_2x2_normalized_sses_and_sads(
+    const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src,
+    int src_stride, const uint8_t *const dst, int dst_stride,
+    const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr,
+    double *const sad_norm_arr) {
+  const BLOCK_SIZE tx_bsize_half =
+      get_partition_subsize(tx_bsize, PARTITION_SPLIT);
+  if (tx_bsize_half == BLOCK_INVALID) {  // manually calculate stats
+    const int half_width = block_size_wide[tx_bsize] / 2;
+    const int half_height = block_size_high[tx_bsize] / 2;
+    for (int row = 0; row < 2; ++row) {
+      for (int col = 0; col < 2; ++col) {
+        const int16_t *const this_src_diff =
+            src_diff + row * half_height * diff_stride + col * half_width;
+        sse_norm_arr[row * 2 + col] =
+            get_sse_norm(this_src_diff, diff_stride, half_width, half_height);
+        sad_norm_arr[row * 2 + col] =
+            get_sad_norm(this_src_diff, diff_stride, half_width, half_height);
+      }
+    }
+  } else {  // use function pointers to calculate stats
+    const int half_width = block_size_wide[tx_bsize_half];
+    const int half_height = block_size_high[tx_bsize_half];
+    const int num_samples_half = half_width * half_height;
+    for (int row = 0; row < 2; ++row) {
+      for (int col = 0; col < 2; ++col) {
+        const uint8_t *const this_src =
+            src + row * half_height * src_stride + col * half_width;
+        const uint8_t *const this_dst =
+            dst + row * half_height * dst_stride + col * half_width;
+
+        unsigned int this_sse;
+        cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
+                                      dst_stride, &this_sse);
+        sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
+
+        const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf(
+            this_src, src_stride, this_dst, dst_stride);
+        sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
+      }
+    }
+  }
+}
+
+#if CONFIG_COLLECT_RD_STATS
+// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+// 0: Do not collect any RD stats
+// 1: Collect RD stats for transform units
+// 2: Collect RD stats for partition units
+static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    const RD_STATS *const rd_stats, int blk_row,
+                                    int blk_col, BLOCK_SIZE plane_bsize,
+                                    TX_SIZE tx_size, TX_TYPE tx_type) {
+  if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+  // Generate small sample to restrict output size.
+  static unsigned int seed = 21743;
+  if (lcg_rand16(&seed) % 100 > 0) return;
+
+  const char output_file[] = "tu_stats.txt";
+  FILE *fout = fopen(output_file, "a");
+  if (!fout) return;
 
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int plane = 0;
+  struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int txw = tx_size_wide[tx_size];
+  const int txh = tx_size_high[tx_size];
+  const int dequant_shift =
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int q_step = pd->dequant_Q3[1] >> dequant_shift;
+  const double num_samples = txw * txh;
+
+  const double rate_norm = (double)rd_stats->rate / num_samples;
+  const double dist_norm = (double)rd_stats->dist / num_samples;
+
+  fprintf(fout, "%g %g", rate_norm, dist_norm);
+
+  const int src_stride = p->src.stride;
+  const uint8_t *const src =
+      &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+  const int dst_stride = pd->dst.stride;
+  const uint8_t *const dst =
+      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+  unsigned int sse;
+  cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+  const double sse_norm = (double)sse / num_samples;
+
+  const unsigned int sad =
+      cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
+  const double sad_norm = (double)sad / num_samples;
+
+  fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *const src_diff =
+      &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+
+  double sse_norm_arr[4], sad_norm_arr[4];
+  get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst,
+                                   dst_stride, src_diff, diff_stride,
+                                   sse_norm_arr, sad_norm_arr);
+  for (int i = 0; i < 4; ++i) {
+    fprintf(fout, " %g", sse_norm_arr[i]);
+  }
+  for (int i = 0; i < 4; ++i) {
+    fprintf(fout, " %g", sad_norm_arr[i]);
+  }
+
+  const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+  const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+
+  fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size],
+          tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col);
+
+  int model_rate;
+  int64_t model_dist;
+  model_rd_from_sse(cpi, xd, tx_bsize, plane, sse, &model_rate, &model_dist);
+  const double model_rate_norm = (double)model_rate / num_samples;
+  const double model_dist_norm = (double)model_dist / num_samples;
+  fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
+
+  const double mean = get_mean(src_diff, diff_stride, txw, txh);
+  double hor_corr, vert_corr;
+  get_horver_correlation(src_diff, diff_stride, txw, txh, &hor_corr,
+                         &vert_corr);
+  fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+  double hdist[4] = { 0 }, vdist[4] = { 0 };
+  get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride,
+                               1, hdist, vdist);
+  fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+          hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+  fprintf(fout, "\n");
+  fclose(fout);
+}
+
+#if CONFIG_COLLECT_RD_STATS == 2
+static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                     const RD_STATS *const rd_stats,
+                                     BLOCK_SIZE plane_bsize) {
+  if (rd_stats->invalid_rate) return;
+  if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+  // Generate small sample to restrict output size.
+  static unsigned int seed = 95014;
+  if (lcg_rand16(&seed) % 100 > 0) return;
+
+  const char output_file[] = "pu_stats.txt";
+  FILE *fout = fopen(output_file, "a");
+  if (!fout) return;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int plane = 0;
+  struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int bw = block_size_wide[plane_bsize];
+  const int bh = block_size_high[plane_bsize];
+  const int dequant_shift =
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int q_step = pd->dequant_Q3[1] >> dequant_shift;
+  const double num_samples = bw * bh;
+
+  const double rate_norm = (double)rd_stats->rate / num_samples;
+  const double dist_norm = (double)rd_stats->dist / num_samples;
+
+  fprintf(fout, "%g %g", rate_norm, dist_norm);
+
+  const int src_stride = p->src.stride;
+  const uint8_t *const src = p->src.buf;
+  const int dst_stride = pd->dst.stride;
+  const uint8_t *const dst = pd->dst.buf;
+  unsigned int sse;
+  cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+  const double sse_norm = (double)sse / num_samples;
+
+  const unsigned int sad =
+      cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
+  const double sad_norm = (double)sad / num_samples;
+
+  fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *const src_diff = p->src_diff;
+
+  double sse_norm_arr[4], sad_norm_arr[4];
+  get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
+                                   dst_stride, src_diff, diff_stride,
+                                   sse_norm_arr, sad_norm_arr);
+  for (int i = 0; i < 4; ++i) {
+    fprintf(fout, " %g", sse_norm_arr[i]);
+  }
+  for (int i = 0; i < 4; ++i) {
+    fprintf(fout, " %g", sad_norm_arr[i]);
+  }
+
+  fprintf(fout, " %d %d %d", q_step, bw, bh);
+
+  int model_rate;
+  int64_t model_dist;
+  model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, &model_dist);
+  const double model_rate_norm = (double)model_rate / num_samples;
+  const double model_dist_norm = (double)model_dist / num_samples;
+  fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
+
+  const double mean = get_mean(src_diff, diff_stride, bw, bh);
+  double hor_corr, vert_corr;
+  get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
+  fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+  double hdist[4] = { 0 }, vdist[4] = { 0 };
+  get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst,
+                               dst_stride, 1, hdist, vdist);
+  fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+          hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+  fprintf(fout, "\n");
+  fclose(fout);
+}
+#endif  // CONFIG_COLLECT_RD_STATS == 2
+#endif  // CONFIG_COLLECT_RD_STATS
+
+static void model_rd_with_dnn(const AV1_COMP *const cpi,
+                              const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                              int plane, unsigned int *rsse, int *rate,
+                              int64_t *dist) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  const int log_numpels = num_pels_log2_lookup[plane_bsize];
+  const int num_samples = (1 << log_numpels);
+
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int bw = block_size_wide[plane_bsize];
+  const int bh = block_size_high[plane_bsize];
+  const int dequant_shift =
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int q_step = pd->dequant_Q3[1] >> dequant_shift;
+
+  const int src_stride = p->src.stride;
+  const uint8_t *const src = p->src.buf;
+  const int dst_stride = pd->dst.stride;
+  const uint8_t *const dst = pd->dst.buf;
+  unsigned int sse;
+  cpi->fn_ptr[plane_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+  const double sse_norm = (double)sse / num_samples;
+
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *const src_diff = p->src_diff;
+
+  double sse_norm_arr[4], sad_norm_arr[4];
+  get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
+                                   dst_stride, src_diff, diff_stride,
+                                   sse_norm_arr, sad_norm_arr);
+  const double mean = get_mean(src_diff, diff_stride, bw, bh);
+  const double variance = sse_norm - mean * mean;
+  const double q_sqr = (double)(q_step * q_step);
+  const double q_sqr_by_variance = q_sqr / variance;
+  double hor_corr, vert_corr;
+  get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
+  double hdist[4] = { 0 }, vdist[4] = { 0 };
+  get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst,
+                               dst_stride, 1, hdist, vdist);
+
+  float features[20];
+  features[0] = (float)hdist[0];
+  features[1] = (float)hdist[1];
+  features[2] = (float)hdist[2];
+  features[3] = (float)hdist[3];
+  features[4] = (float)hor_corr;
+  features[5] = (float)log_numpels;
+  features[6] = (float)mean;
+  features[7] = (float)q_sqr;
+  features[8] = (float)q_sqr_by_variance;
+  features[9] = (float)sse_norm_arr[0];
+  features[10] = (float)sse_norm_arr[1];
+  features[11] = (float)sse_norm_arr[2];
+  features[12] = (float)sse_norm_arr[3];
+  features[13] = (float)sse_norm_arr[3];
+  features[14] = (float)variance;
+  features[15] = (float)vdist[0];
+  features[16] = (float)vdist[1];
+  features[17] = (float)vdist[2];
+  features[18] = (float)vdist[3];
+  features[19] = (float)vert_corr;
+
+  float rate_f, dist_f;
+  av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_f);
+  av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f);
+  const int rate_i = (int)(AOMMAX(0.0, rate_f * (1 << log_numpels)) + 0.5);
+  const int64_t dist_i =
+      (int64_t)(AOMMAX(0.0, dist_f * (1 << log_numpels)) + 0.5);
+  if (rate) *rate = rate_i;
+  if (dist) *dist = dist_i;
+  if (rsse) *rsse = sse;
+  return;
+}
+
+void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                              MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+                              int plane_to, int *out_rate_sum,
+                              int64_t *out_dist_sum, int *skip_txfm_sb,
+                              int64_t *skip_sse_sb, int *plane_rate,
+                              int64_t *plane_sse, int64_t *plane_dist) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  const int ref = xd->mi[0]->ref_frame[0];
+
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  int64_t total_sse = 0;
+
+  x->pred_sse[ref] = 0;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    unsigned int sse;
+    int rate;
+    int64_t dist;
+
+    if (x->skip_chroma_rd && plane) continue;
+
+    model_rd_with_dnn(cpi, x, bsize, plane, &sse, &rate, &dist);
+
+    if (plane == 0) x->pred_sse[ref] = sse;
+
+    total_sse += sse;
+    rate_sum += rate;
+    dist_sum += dist;
+
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+  *out_rate_sum = (int)rate_sum;
+  *out_dist_sum = dist_sum;
+}
+
+static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                               int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               const TXB_CTX *const txb_ctx,
+                               FAST_TX_SEARCH_MODE ftxs_mode,
+                               int use_fast_coef_costing, int64_t ref_best_rd,
+                               RD_STATS *best_rd_stats) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(mbmi);
+  int64_t best_rd = INT64_MAX;
+  uint16_t best_eob = 0;
+  TX_TYPE best_tx_type = DCT_DCT;
+  TX_TYPE last_tx_type = TX_TYPES;
+  const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
+  // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff
+  // of the best tx_type
+  DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]);
+  tran_low_t *orig_dqcoeff = pd->dqcoeff;
+  tran_low_t *best_dqcoeff = this_dqcoeff;
+  const int txk_type_idx =
+      av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+  av1_invalid_rd_stats(best_rd_stats);
+
+  TXB_RD_INFO *intra_txb_rd_info = NULL;
+  uint16_t cur_joint_ctx = 0;
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+  const int within_border =
+      mi_row >= xd->tile.mi_row_start &&
+      (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
+      mi_col >= xd->tile.mi_col_start &&
+      (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
+  if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
+      !is_inter && plane == 0 &&
+      tx_size_wide[tx_size] == tx_size_high[tx_size]) {
+    const uint32_t intra_hash =
+        get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
+    const int intra_hash_idx =
+        find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash);
+    intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
+
+    cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
+    if (intra_hash_idx > 0 &&
+        intra_txb_rd_info->entropy_context == cur_joint_ctx &&
+        x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
+      mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type;
+      const TX_TYPE ref_tx_type =
+          av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col,
+                          tx_size, cpi->common.reduced_tx_set_used);
+      if (ref_tx_type == intra_txb_rd_info->tx_type) {
+        best_rd_stats->rate = intra_txb_rd_info->rate;
+        best_rd_stats->dist = intra_txb_rd_info->dist;
+        best_rd_stats->sse = intra_txb_rd_info->sse;
+        best_rd_stats->skip = intra_txb_rd_info->eob == 0;
+        x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
+        x->plane[plane].txb_entropy_ctx[block] =
+            intra_txb_rd_info->txb_entropy_ctx;
+        best_rd = RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist);
+        best_eob = intra_txb_rd_info->eob;
+        best_tx_type = intra_txb_rd_info->tx_type;
+        update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                         best_tx_type);
+        goto RECON_INTRA;
+      }
+    }
+  }
+
+  int rate_cost = 0;
+  TX_TYPE txk_start = DCT_DCT;
+  TX_TYPE txk_end = TX_TYPES - 1;
+  if (!(!is_inter && x->use_default_intra_tx_type) &&
+      !(is_inter && x->use_default_inter_tx_type))
+    if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan)
+      if (plane == 0) txk_end = DCT_DCT;
+
+  uint8_t best_txb_ctx = 0;
+  const TxSetType tx_set_type =
+      av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+  int prune = 0;
+  const int do_prune = plane == 0 && !fast_tx_search && txk_end != DCT_DCT &&
+                       !(!is_inter && x->use_default_intra_tx_type) &&
+                       !(is_inter && x->use_default_inter_tx_type) &&
+                       cpi->sf.tx_type_search.prune_mode > NO_PRUNE;
+  if (do_prune && is_inter) {
+    if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) {
+      prune = prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col,
+                          tx_set_type, cpi->sf.tx_type_search.prune_mode);
+    } else {
+      prune = x->tx_search_prune[tx_set_type];
+    }
+  }
+
+  TX_TYPE uv_tx_type = DCT_DCT;
+  if (plane) {
+    // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y
+    uv_tx_type = txk_start = txk_end =
+        av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size,
+                        cm->reduced_tx_set_used);
+  }
+  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
+    txk_start = txk_end = DCT_DCT;
+  }
+
+  int8_t allowed_tx_mask[TX_TYPES] = { 0 };  // 1: allow; 0: skip.
+  int allowed_tx_num = 0;
+  if (fast_tx_search) {
+    allowed_tx_mask[DCT_DCT] = 1;
+    allowed_tx_mask[H_DCT] = 1;
+    allowed_tx_mask[V_DCT] = 1;
+  } else {
+    memset(allowed_tx_mask + txk_start, 1, txk_end - txk_start + 1);
+  }
+  for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
+    if (do_prune) {
+      if (!do_tx_type_search(tx_type, prune, cpi->sf.tx_type_search.prune_mode))
+        allowed_tx_mask[tx_type] = 0;
+    }
+    if (plane == 0 && allowed_tx_mask[tx_type]) {
+      if (!av1_ext_tx_used[tx_set_type][tx_type])
+        allowed_tx_mask[tx_type] = 0;
+      else if (!is_inter && x->use_default_intra_tx_type &&
+               tx_type != get_default_tx_type(0, xd, tx_size))
+        allowed_tx_mask[tx_type] = 0;
+      else if (is_inter && x->use_default_inter_tx_type &&
+               tx_type != get_default_tx_type(0, xd, tx_size))
+        allowed_tx_mask[tx_type] = 0;
+    }
+    allowed_tx_num += allowed_tx_mask[tx_type];
+  }
+  // Need to have at least one transform type allowed.
+  if (allowed_tx_num == 0) {
+    allowed_tx_mask[plane ? uv_tx_type : DCT_DCT] = 1;
+  }
+
+  int use_transform_domain_distortion =
+      (cpi->sf.use_transform_domain_distortion > 0) &&
+      // Any 64-pt transforms only preserves half the coefficients.
+      // Therefore transform domain distortion is not valid for these
+      // transform sizes.
+      txsize_sqr_up_map[tx_size] != TX_64X64;
 #if CONFIG_DIST_8X8
-        if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
-          // Save decoded pixels for inter block in pd->pred to avoid
-          // block_8x8_rd_txfm_daala_dist() need to produce them
-          // by calling av1_inverse_transform_block() again.
-          const int pred_stride = block_size_wide[plane_bsize];
-          const int pred_idx = (blk_row * pred_stride + blk_col)
-                               << tx_size_wide_log2[0];
-          int16_t *pred = &pd->pred[pred_idx];
-          int i, j;
-
-#if CONFIG_HIGHBITDEPTH
-          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-            for (j = 0; j < bsh; j++)
-              for (i = 0; i < bsw; i++)
-                pred[j * pred_stride + i] =
-                    CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i];
-          } else {
+  if (x->using_dist_8x8) use_transform_domain_distortion = 0;
 #endif
-            for (j = 0; j < bsh; j++)
-              for (i = 0; i < bsw; i++)
-                pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
-#if CONFIG_HIGHBITDEPTH
-          }
-#endif  // CONFIG_HIGHBITDEPTH
-        }
-#endif  // CONFIG_DIST_8X8
-        *out_dist =
-            pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
-                       blk_row, blk_col, plane_bsize, tx_bsize);
+
+  int calc_pixel_domain_distortion_final =
+      cpi->sf.use_transform_domain_distortion == 1 &&
+      use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD &&
+      !x->cb_partition_scan;
+  if (calc_pixel_domain_distortion_final && allowed_tx_num <= 1)
+    calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
+
+  const uint16_t *eobs_ptr = x->plane[plane].eobs;
+
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  int64_t block_sse =
+      pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+  block_sse *= 16;
+
+  for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
+    if (!allowed_tx_mask[tx_type]) continue;
+    if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
+    RD_STATS this_rd_stats;
+    av1_invalid_rd_stats(&this_rd_stats);
+
+    if (!cpi->optimize_seg_arr[mbmi->segment_id]) {
+      av1_xform_quant(
+          cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+      rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block,
+                                  tx_size, txb_ctx, use_fast_coef_costing);
+    } else {
+      av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+                      tx_size, tx_type, AV1_XFORM_QUANT_FP);
+      if (cpi->sf.optimize_b_precheck && best_rd < INT64_MAX &&
+          eobs_ptr[block] >= 4) {
+        // Calculate distortion quickly in transform domain.
+        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                             &this_rd_stats.sse);
+        rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block,
+                                    tx_size, txb_ctx, use_fast_coef_costing);
+        const int64_t rd_estimate =
+            AOMMIN(RDCOST(x->rdmult, rate_cost, this_rd_stats.dist),
+                   RDCOST(x->rdmult, 0, this_rd_stats.sse));
+        if (rd_estimate - (rd_estimate >> 3) > AOMMIN(best_rd, ref_best_rd))
+          continue;
       }
-      *out_dist *= 16;
+      av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1,
+                     &rate_cost);
+    }
+    if (eobs_ptr[block] == 0) {
+      // When eob is 0, pixel domain distortion is more efficient and accurate.
+      this_rd_stats.dist = this_rd_stats.sse = block_sse;
+    } else if (use_transform_domain_distortion) {
+      dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                           &this_rd_stats.sse);
     } else {
-      *out_dist = *out_sse;
+      this_rd_stats.dist = dist_block_px_domain(
+          cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+      this_rd_stats.sse = block_sse;
+    }
+
+    this_rd_stats.rate = rate_cost;
+
+    const int64_t rd =
+        RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+
+    if (rd < best_rd) {
+      best_rd = rd;
+      *best_rd_stats = this_rd_stats;
+      best_tx_type = tx_type;
+      best_txb_ctx = x->plane[plane].txb_entropy_ctx[block];
+      best_eob = x->plane[plane].eobs[block];
+      last_tx_type = best_tx_type;
+
+      // Swap qcoeff and dqcoeff buffers
+      tran_low_t *const tmp_dqcoeff = best_dqcoeff;
+      best_dqcoeff = pd->dqcoeff;
+      pd->dqcoeff = tmp_dqcoeff;
+    }
+
+#if CONFIG_COLLECT_RD_STATS == 1
+    if (plane == 0) {
+      PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col,
+                              plane_bsize, tx_size, tx_type);
+    }
+#endif  // CONFIG_COLLECT_RD_STATS == 1
+
+    if (cpi->sf.adaptive_txb_search_level) {
+      if ((best_rd - (best_rd >> cpi->sf.adaptive_txb_search_level)) >
+          ref_best_rd) {
+        break;
+      }
+    }
+
+    // Skip transform type search when we found the block has been quantized to
+    // all zero and at the same time, it has better rdcost than doing transform.
+    if (cpi->sf.tx_type_search.skip_tx_search && !best_eob) break;
+  }
+
+  assert(best_rd != INT64_MAX);
+
+  best_rd_stats->skip = best_eob == 0;
+  if (best_eob == 0) best_tx_type = DCT_DCT;
+  if (plane == 0) {
+    update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                     best_tx_type);
+  }
+  x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx;
+  x->plane[plane].eobs[block] = best_eob;
+
+  pd->dqcoeff = best_dqcoeff;
+
+  if (calc_pixel_domain_distortion_final && best_eob) {
+    best_rd_stats->dist = dist_block_px_domain(
+        cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+    best_rd_stats->sse = block_sse;
+  }
+
+  if (intra_txb_rd_info != NULL) {
+    intra_txb_rd_info->valid = 1;
+    intra_txb_rd_info->entropy_context = cur_joint_ctx;
+    intra_txb_rd_info->rate = best_rd_stats->rate;
+    intra_txb_rd_info->dist = best_rd_stats->dist;
+    intra_txb_rd_info->sse = best_rd_stats->sse;
+    intra_txb_rd_info->eob = best_eob;
+    intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx;
+    if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type;
+  }
+
+RECON_INTRA:
+  if (!is_inter && best_eob &&
+      (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] ||
+       blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) {
+    // intra mode needs decoded result such that the next transform block
+    // can use it for prediction.
+    // if the last search tx_type is the best tx_type, we don't need to
+    // do this again
+    if (best_tx_type != last_tx_type) {
+      if (!cpi->optimize_seg_arr[mbmi->segment_id]) {
+        av1_xform_quant(
+            cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+            best_tx_type,
+            USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+      } else {
+        av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+                        tx_size, best_tx_type, AV1_XFORM_QUANT_FP);
+        av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, 1,
+                       &rate_cost);
+      }
+    }
+
+    inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
+                                   x->plane[plane].eobs[block],
+                                   cm->reduced_tx_set_used);
+
+    // This may happen because of hash collision. The eob stored in the hash
+    // table is non-zero, but the real eob is zero. We need to make sure tx_type
+    // is DCT_DCT in this case.
+    if (plane == 0 && x->plane[plane].eobs[block] == 0 &&
+        best_tx_type != DCT_DCT) {
+      update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                       DCT_DCT);
     }
   }
+  pd->dqcoeff = orig_dqcoeff;
+
+  return best_rd;
 }
 
 static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
@@ -1894,7 +2832,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   const AV1_COMP *cpi = args->cpi;
   ENTROPY_CONTEXT *a = args->t_above + blk_col;
   ENTROPY_CONTEXT *l = args->t_left + blk_row;
@@ -1909,122 +2847,44 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   // (new distortion metric) are different.
   // Exception is: dist-8x8 is enabled but still MSE is used,
   // i.e. "--tune=" encoder option is not used.
+  int bw = block_size_wide[plane_bsize];
+  int bh = block_size_high[plane_bsize];
   int disable_early_skip =
-      x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 &&
+      x->using_dist_8x8 && plane == AOM_PLANE_Y && bw >= 8 && bh >= 8 &&
       (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) &&
       x->tune_metric != AOM_TUNE_PSNR;
 #endif  // CONFIG_DIST_8X8
 
-#if !CONFIG_SUPERTX && !CONFIG_VAR_TX
-  assert(tx_size == av1_get_tx_size(plane, xd));
-#endif  // !CONFIG_SUPERTX
-
   av1_init_rd_stats(&this_rd_stats);
 
   if (args->exit_early) return;
 
   if (!is_inter_block(mbmi)) {
-    av1_predict_intra_block_facade(cm, xd, plane, block, blk_col, blk_row,
-                                   tx_size);
+    av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
     av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
   }
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+  search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing,
+                  args->best_rd - args->this_rd, &this_rd_stats);
 
-#if !CONFIG_TXK_SEL
-  // full forward transform and quantization
-  const int coeff_ctx = combine_entropy_contexts(*a, *l);
-#if DISABLE_TRELLISQ_SEARCH
-  av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  coeff_ctx, AV1_XFORM_QUANT_B);
-#else
-  av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  coeff_ctx, AV1_XFORM_QUANT_FP);
-
-  const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
-  tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
-  const int buffer_length = tx_size_2d[tx_size];
-  int64_t tmp_dist;
-  int64_t tmp;
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    tmp_dist =
-        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd);
-  else
-#endif
-    tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp);
-  tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift);
-
-  if (
-#if CONFIG_DIST_8X8
-      disable_early_skip ||
-#endif
-      RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) {
-    av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l, 1);
-  } else {
-    args->exit_early = 1;
-    return;
-  }
-#endif  // DISABLE_TRELLISQ_SEARCH
-
-#if CONFIG_MRC_TX
-  if (mbmi->tx_type == MRC_DCT && !mbmi->valid_mrc_mask) {
-    args->exit_early = 1;
-    return;
-  }
-#endif  // CONFIG_MRC_TX
-
-  if (!is_inter_block(mbmi)) {
-    struct macroblock_plane *const p = &x->plane[plane];
-    av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
-                                       p->eobs[block]);
-    av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
-                   tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
-                   OUTPUT_HAS_DECODED_PIXELS);
-  } else {
-    av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
-                   tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
-                   OUTPUT_HAS_PREDICTED_PIXELS);
-  }
-#if CONFIG_CFL
-  if (plane == AOM_PLANE_Y && xd->cfl->store_y) {
-#if CONFIG_CHROMA_SUB8X8
+  if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
     assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8);
-#else
-    assert(!is_inter_block(mbmi));
-#endif  // CONFIG_CHROMA_SUB8X8
     cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
   }
-#endif  // CONFIG_CFL
-  rd = RDCOST(x->rdmult, 0, this_rd_stats.dist);
-  if (args->this_rd + rd > args->best_rd) {
-    args->exit_early = 1;
-    return;
-  }
-#if !CONFIG_PVQ
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-  const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-  this_rd_stats.rate =
-      av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size,
-                      scan_order, a, l, args->use_fast_coef_costing);
-#else   // !CONFIG_PVQ
-  this_rd_stats.rate = x->rate;
-#endif  // !CONFIG_PVQ
-#else   // !CONFIG_TXK_SEL
-  av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
-                      tx_size, a, l, args->use_fast_coef_costing,
-                      &this_rd_stats);
-#endif  // !CONFIG_TXK_SEL
-
-#if !CONFIG_PVQ
+
 #if CONFIG_RD_DEBUG
   av1_update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col,
                             this_rd_stats.rate);
 #endif  // CONFIG_RD_DEBUG
   av1_set_txb_context(x, plane, block, tx_size, a, l);
-#endif  // !CONFIG_PVQ
+
+  if (plane == 0) {
+    x->blk_skip[blk_row *
+                    (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) +
+                blk_col] = (x->plane[plane].eobs[block] == 0);
+  }
 
   rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
   rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse);
@@ -2032,11 +2892,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   // TODO(jingning): temporarily enabled only for luma component
   rd = AOMMIN(rd1, rd2);
 
-#if !CONFIG_PVQ
   this_rd_stats.skip &= !x->plane[plane].eobs[block];
-#else
-  this_rd_stats.skip &= x->pvq_skip[plane];
-#endif  // !CONFIG_PVQ
+
   av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
 
   args->this_rd += rd;
@@ -2057,12 +2914,12 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[0];
   const struct macroblock_plane *const p = &x->plane[0];
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
   const uint8_t *src = &p->src.buf[0];
   const uint8_t *dst = &pd->dst.buf[0];
-  const int16_t *pred = &pd->pred[0];
+  const int16_t *pred = &x->pred_luma[0];
   int bw = block_size_wide[bsize];
   int bh = block_size_high[bsize];
   int visible_w = bw;
@@ -2070,7 +2927,7 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   int i, j;
   int64_t rd, rd1, rd2;
-  unsigned int tmp1, tmp2;
+  int64_t sse = INT64_MAX, dist = INT64_MAX;
   int qindex = x->qindex;
 
   assert((bw & 0x07) == 0);
@@ -2079,53 +2936,51 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w,
                      &visible_h);
 
-#if CONFIG_HIGHBITDEPTH
-  uint8_t *pred8;
-  DECLARE_ALIGNED(16, uint16_t, pred16[MAX_TX_SQUARE]);
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    pred8 = CONVERT_TO_BYTEPTR(pred16);
-  else
-    pred8 = (uint8_t *)pred16;
-#else
-  DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]);
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (j = 0; j < bh; j++)
-      for (i = 0; i < bw; i++)
-        CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i];
-  } else {
-#endif
-    for (j = 0; j < bh; j++)
-      for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i];
-#if CONFIG_HIGHBITDEPTH
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-
-  tmp1 = (unsigned)av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw,
-                                bh, visible_w, visible_h, qindex);
-  tmp2 = (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize,
-                                bw, bh, visible_w, visible_h, qindex);
+  const int diff_stride = block_size_wide[bsize];
+  const int16_t *diff = p->src_diff;
+  sse = dist_8x8_diff(x, src, src_stride, diff, diff_stride, bw, bh, visible_w,
+                      visible_h, qindex);
+  sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+  sse *= 16;
 
   if (!is_inter_block(mbmi)) {
-    if (x->tune_metric == AOM_TUNE_PSNR) {
-      assert(args->rd_stats.sse == tmp1 * 16);
-      assert(args->rd_stats.dist == tmp2 * 16);
-    }
-    args->rd_stats.sse = (int64_t)tmp1 * 16;
-    args->rd_stats.dist = (int64_t)tmp2 * 16;
+    dist = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, bw, bh,
+                        visible_w, visible_h, qindex);
+    dist *= 16;
   } else {
-    // For inter mode, the decoded pixels are provided in pd->pred,
+    // For inter mode, the decoded pixels are provided in x->pred_luma,
     // while the predicted pixels are in dst.
-    if (x->tune_metric == AOM_TUNE_PSNR) {
-      assert(args->rd_stats.sse == tmp2 * 16);
-      assert(args->rd_stats.dist == tmp1 * 16);
+    uint8_t *pred8;
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      pred8 = CONVERT_TO_BYTEPTR(pred16);
+    else
+      pred8 = (uint8_t *)pred16;
+
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      for (j = 0; j < bh; j++)
+        for (i = 0; i < bw; i++)
+          CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i];
+    } else {
+      for (j = 0; j < bh; j++)
+        for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i];
     }
-    args->rd_stats.sse = (int64_t)tmp2 * 16;
-    args->rd_stats.dist = (int64_t)tmp1 * 16;
+
+    dist = av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw, bh,
+                        visible_w, visible_h, qindex);
+    dist *= 16;
+  }
+
+#ifdef DEBUG_DIST_8X8
+  if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) {
+    assert(args->rd_stats.sse == sse);
+    assert(args->rd_stats.dist == dist);
   }
+#endif  // DEBUG_DIST_8X8
+
+  args->rd_stats.sse = sse;
+  args->rd_stats.dist = dist;
 
   rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist);
   rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse);
@@ -2141,7 +2996,8 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
                              RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
                              BLOCK_SIZE bsize, TX_SIZE tx_size,
-                             int use_fast_coef_casting) {
+                             int use_fast_coef_casting,
+                             FAST_TX_SEARCH_MODE ftxs_mode) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   struct rdcost_block_args args;
@@ -2150,18 +3006,21 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
   args.cpi = cpi;
   args.best_rd = ref_best_rd;
   args.use_fast_coef_costing = use_fast_coef_casting;
+  args.ftxs_mode = ftxs_mode;
   av1_init_rd_stats(&args.rd_stats);
 
-  if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
+  if (plane == 0) xd->mi[0]->tx_size = tx_size;
 
-  av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
+  av1_get_entropy_contexts(bsize, pd, args.t_above, args.t_left);
 
   av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
                                          &args);
 #if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && !args.exit_early && plane == 0 &&
-      bsize >= BLOCK_8X8 &&
-      (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
+  int bw = block_size_wide[bsize];
+  int bh = block_size_high[bsize];
+
+  if (x->using_dist_8x8 && !args.exit_early && plane == 0 && bw >= 8 &&
+      bh >= 8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
     dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args);
 #endif
 
@@ -2172,183 +3031,48 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
   }
 }
 
-#if CONFIG_SUPERTX
-void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
-                                  int64_t *distortion, int *skippable,
-                                  int64_t *sse, int64_t ref_best_rd, int plane,
-                                  BLOCK_SIZE bsize, TX_SIZE tx_size,
-                                  int use_fast_coef_casting) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  struct rdcost_block_args args;
-  av1_zero(args);
-  args.cpi = cpi;
-  args.x = x;
-  args.best_rd = ref_best_rd;
-  args.use_fast_coef_costing = use_fast_coef_casting;
-
-#if CONFIG_EXT_TX
-  assert(tx_size < TX_SIZES);
-#endif  // CONFIG_EXT_TX
-
-  if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
-
-  av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
-
-  block_rd_txfm(plane, 0, 0, 0, get_plane_block_size(bsize, pd), tx_size,
-                &args);
-
-  if (args.exit_early) {
-    *rate = INT_MAX;
-    *distortion = INT64_MAX;
-    *sse = INT64_MAX;
-    *skippable = 0;
-  } else {
-    *distortion = args.rd_stats.dist;
-    *rate = args.rd_stats.rate;
-    *sse = args.rd_stats.sse;
-    *skippable = !x->plane[plane].eobs[0];
-  }
-}
-#endif  // CONFIG_SUPERTX
-
-static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+static int tx_size_cost(const AV1_COMMON *const cm, const MACROBLOCK *const x,
                         BLOCK_SIZE bsize, TX_SIZE tx_size) {
-  const AV1_COMMON *const cm = &cpi->common;
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
 
   if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type)) {
-    const int is_inter = is_inter_block(mbmi);
-    const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
-                                         : intra_tx_size_cat_lookup[bsize];
-    const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
-    const int depth = tx_size_to_depth(coded_tx_size);
+    const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+    const int depth = tx_size_to_depth(tx_size, bsize);
     const int tx_size_ctx = get_tx_size_context(xd);
     int r_tx_size = x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-    if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size)
-      r_tx_size += av1_cost_bit(cm->fc->quarter_tx_size_prob,
-                                tx_size == quarter_txsize_lookup[bsize]);
-#endif
     return r_tx_size;
   } else {
     return 0;
   }
 }
 
-#if CONFIG_LGT_FROM_PRED
-int av1_lgt_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
-                 const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
-                 TX_SIZE tx_size, int use_lgt) {
-  if (plane > 0) return 0;
-  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const int is_inter = is_inter_block(mbmi);
-
-  assert(is_lgt_allowed(mbmi->mode, tx_size));
-  if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
-      !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-    const int ext_tx_set =
-        get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
-    if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 &&
-        ALLOW_INTRA_EXT_TX)
-      return x->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode][use_lgt];
-    if (LGT_FROM_PRED_INTRA && is_inter && ext_tx_set > 0)
-      return x->inter_lgt_cost[txsize_sqr_map[tx_size]][use_lgt];
-  }
-  return 0;
-}
-#endif  // CONFIG_LGT_FROM_PRED
-
-// TODO(angiebird): use this function whenever it's possible
-int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
-                     const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
-                     TX_SIZE tx_size, TX_TYPE tx_type) {
-  if (plane > 0) return 0;
-
-#if CONFIG_LGT_FROM_PRED
-  assert(!xd->mi[0]->mbmi.use_lgt);
-#endif
-#if CONFIG_VAR_TX
-  tx_size = get_min_tx_size(tx_size);
-#endif
-
-  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const int is_inter = is_inter_block(mbmi);
-#if CONFIG_EXT_TX
-  if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
-      !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-    const int ext_tx_set =
-        get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
-    if (is_inter) {
-      if (ext_tx_set > 0)
-        return x
-            ->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]][tx_type];
-    } else {
-      if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
-        return x->intra_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]]
-                                     [mbmi->mode][tx_type];
-    }
-  }
-#else
-  (void)bsize;
-  (void)cm;
-  if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-      !FIXED_TX_TYPE) {
-    if (is_inter) {
-      return x->inter_tx_type_costs[tx_size][tx_type];
-    } else {
-      return x->intra_tx_type_costs[tx_size]
-                                   [intra_mode_to_tx_type_context[mbmi->mode]]
-                                   [tx_type];
-    }
-  }
-#endif  // CONFIG_EXT_TX
-  return 0;
-}
 static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                         RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
-                        TX_TYPE tx_type, TX_SIZE tx_size) {
+                        TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   int64_t rd = INT64_MAX;
-  aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+  const int skip_ctx = av1_get_skip_context(xd);
   int s0, s1;
   const int is_inter = is_inter_block(mbmi);
   const int tx_select =
-      cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8;
-
-  const int r_tx_size = tx_size_cost(cpi, x, bs, tx_size);
+      cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type);
+  int ctx = txfm_partition_context(
+      xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
+  const int r_tx_size = is_inter ? x->txfm_partition_cost[ctx][0]
+                                 : tx_size_cost(cm, x, bs, tx_size);
 
-#if CONFIG_PVQ
-  assert(tx_size >= TX_4X4);
-#endif  // CONFIG_PVQ
-  assert(skip_prob > 0);
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
   assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
 
-  s0 = av1_cost_bit(skip_prob, 0);
-  s1 = av1_cost_bit(skip_prob, 1);
+  s0 = x->skip_cost[skip_ctx][0];
+  s1 = x->skip_cost[skip_ctx][1];
 
-  mbmi->tx_type = tx_type;
   mbmi->tx_size = tx_size;
-  txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, tx_size,
-                   cpi->sf.use_fast_coef_costing);
+  txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOM_PLANE_Y, bs, tx_size,
+                   cpi->sf.use_fast_coef_costing, ftxs_mode);
   if (rd_stats->rate == INT_MAX) return INT64_MAX;
-#if !CONFIG_TXK_SEL
-  int plane = 0;
-#if CONFIG_LGT_FROM_PRED
-  if (is_lgt_allowed(mbmi->mode, tx_size))
-    rd_stats->rate +=
-        av1_lgt_cost(cm, x, xd, bs, plane, tx_size, mbmi->use_lgt);
-  if (!mbmi->use_lgt)
-    rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type);
-#else
-  rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type);
-#endif  // CONFIG_LGT_FROM_PRED
-#endif
 
   if (rd_stats->skip) {
     if (is_inter) {
@@ -2363,545 +3087,136 @@ static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   if (tx_select) rd_stats->rate += r_tx_size;
 
-  if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-      !(rd_stats->skip))
+  if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip))
     rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
 
   return rd;
 }
 
-static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
-                            TX_TYPE tx_type, TX_SIZE tx_size) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
-  const int is_inter = is_inter_block(mbmi);
-  int prune = 0;
-  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
-    // passing -1 in for tx_type indicates that all 1D
-    // transforms should be considered for pruning
-    prune = prune_tx_types(cpi, bs, x, xd, -1);
-
-#if CONFIG_MRC_TX
-  // MRC_DCT only implemented for TX_32X32 so only include this tx in
-  // the search for TX_32X32
-  if (tx_type == MRC_DCT &&
-      ((is_inter && !USE_MRC_INTER) || (!is_inter && !USE_MRC_INTRA) ||
-       tx_size != TX_32X32))
-    return 1;
-#endif  // CONFIG_MRC_TX
-#if CONFIG_LGT_FROM_PRED
-  if (mbmi->use_lgt && mbmi->ref_mv_idx > 0) return 1;
-#endif  // CONFIG_LGT_FROM_PRED
-  if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1;
-  if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size))
-    return 1;
-  if (!is_inter && x->use_default_intra_tx_type &&
-      tx_type != get_default_tx_type(0, xd, 0, tx_size))
-    return 1;
-  if (is_inter && x->use_default_inter_tx_type &&
-      tx_type != get_default_tx_type(0, xd, 0, tx_size))
-    return 1;
-  if (max_tx_size >= TX_32X32 && tx_size == TX_4X4) return 1;
-#if CONFIG_EXT_TX
-  const AV1_COMMON *const cm = &cpi->common;
-  const TxSetType tx_set_type =
-      get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used);
-  if (!av1_ext_tx_used[tx_set_type][tx_type]) return 1;
-  if (is_inter) {
-    if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
-      if (!do_tx_type_search(tx_type, prune)) return 1;
-    }
-  } else {
-    if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
-      if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) return 1;
-    }
-  }
-#else   // CONFIG_EXT_TX
-  if (tx_size >= TX_32X32 && tx_type != DCT_DCT) return 1;
-  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
-      !do_tx_type_search(tx_type, prune))
-    return 1;
-#endif  // CONFIG_EXT_TX
-  return 0;
-}
-
-#if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA)
 static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
                                    MACROBLOCK *x, int *r, int64_t *d, int *s,
                                    int64_t *sse, int64_t ref_best_rd) {
   RD_STATS rd_stats;
-  int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs, DCT_DCT,
-                        max_txsize_lookup[bs]);
+  x->rd_model = LOW_TXFM_RD;
+  int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs,
+                        max_txsize_rect_lookup[bs], FTXS_NONE);
+  x->rd_model = FULL_TXFM_RD;
   *r = rd_stats.rate;
   *d = rd_stats.dist;
   *s = rd_stats.skip;
   *sse = rd_stats.sse;
   return rd;
 }
-#endif  // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
 
 static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
                                    RD_STATS *rd_stats, int64_t ref_best_rd,
                                    BLOCK_SIZE bs) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  TX_TYPE tx_type, best_tx_type = DCT_DCT;
-  int64_t this_rd, best_rd = INT64_MAX;
-  aom_prob skip_prob = av1_get_skip_prob(cm, xd);
-  int s0 = av1_cost_bit(skip_prob, 0);
-  int s1 = av1_cost_bit(skip_prob, 1);
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const int is_inter = is_inter_block(mbmi);
-  int prune = 0;
-  const int plane = 0;
-#if CONFIG_LGT_FROM_PRED
-  int is_lgt_best = 0;
-  int search_lgt = is_inter
-                       ? LGT_FROM_PRED_INTER && !x->use_default_inter_tx_type &&
-                             !cpi->sf.tx_type_search.prune_mode > NO_PRUNE
-                       : LGT_FROM_PRED_INTRA && !x->use_default_intra_tx_type &&
-                             ALLOW_INTRA_EXT_TX;
-#endif  // CONFIG_LGT_FROM_PRED
-  av1_invalid_rd_stats(rd_stats);
-
-  mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
-#if CONFIG_VAR_TX
-  mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
-#endif  // CONFIG_VAR_TX
-#if CONFIG_EXT_TX
-  int ext_tx_set =
-      get_ext_tx_set(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used);
+  mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode);
   const TxSetType tx_set_type =
-      get_ext_tx_set_type(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used);
-#endif  // CONFIG_EXT_TX
-
-  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
-#if CONFIG_EXT_TX
-    prune = prune_tx_types(cpi, bs, x, xd, ext_tx_set);
-#else
-    prune = prune_tx_types(cpi, bs, x, xd, 0);
-#endif  // CONFIG_EXT_TX
-#if CONFIG_EXT_TX
-  if (get_ext_tx_types(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used) >
-          1 &&
-      !xd->lossless[mbmi->segment_id]) {
-#if CONFIG_PVQ
-    od_rollback_buffer pre_buf, post_buf;
-
-    od_encode_checkpoint(&x->daala_enc, &pre_buf);
-    od_encode_checkpoint(&x->daala_enc, &post_buf);
-#endif  // CONFIG_PVQ
-
-    for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
-      if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
-      RD_STATS this_rd_stats;
-      if (is_inter) {
-        if (x->use_default_inter_tx_type &&
-            tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
-          continue;
-        if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
-          if (!do_tx_type_search(tx_type, prune)) continue;
-        }
-      } else {
-        if (x->use_default_intra_tx_type &&
-            tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
-          continue;
-        if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
-          if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
-        }
-      }
-
-      mbmi->tx_type = tx_type;
-
-      txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
-                       mbmi->tx_size, cpi->sf.use_fast_coef_costing);
-#if CONFIG_PVQ
-      od_encode_rollback(&x->daala_enc, &pre_buf);
-#endif  // CONFIG_PVQ
-      if (this_rd_stats.rate == INT_MAX) continue;
-      av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type);
-
-      if (this_rd_stats.skip)
-        this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse);
-      else
-        this_rd =
-            RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist);
-      if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] &&
-          !this_rd_stats.skip)
-        this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse));
-
-      if (this_rd < best_rd) {
-        best_rd = this_rd;
-        best_tx_type = mbmi->tx_type;
-        *rd_stats = this_rd_stats;
-#if CONFIG_PVQ
-        od_encode_checkpoint(&x->daala_enc, &post_buf);
-#endif  // CONFIG_PVQ
-      }
-    }
-#if CONFIG_PVQ
-    od_encode_rollback(&x->daala_enc, &post_buf);
-#endif  // CONFIG_PVQ
-#if CONFIG_LGT_FROM_PRED
-    // search LGT
-    if (search_lgt && is_lgt_allowed(mbmi->mode, mbmi->tx_size) &&
-        !cm->reduced_tx_set_used) {
-      RD_STATS this_rd_stats;
-      mbmi->use_lgt = 1;
-      txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
-                       mbmi->tx_size, cpi->sf.use_fast_coef_costing);
-      if (this_rd_stats.rate != INT_MAX) {
-        av1_lgt_cost(cm, x, xd, bs, plane, mbmi->tx_size, 1);
-        if (this_rd_stats.skip)
-          this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse);
-        else
-          this_rd =
-              RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist);
-        if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] &&
-            !this_rd_stats.skip)
-          this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse));
-        if (this_rd < best_rd) {
-          best_rd = this_rd;
-          is_lgt_best = 1;
-          *rd_stats = this_rd_stats;
-        }
-      }
-      mbmi->use_lgt = 0;
-    }
-#endif  // CONFIG_LGT_FROM_PRED
-  } else {
-    mbmi->tx_type = DCT_DCT;
-    txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
-                     cpi->sf.use_fast_coef_costing);
-  }
-#else   // CONFIG_EXT_TX
-  if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id]) {
-    for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
-      RD_STATS this_rd_stats;
-      if (!is_inter && x->use_default_intra_tx_type &&
-          tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
-        continue;
-      if (is_inter && x->use_default_inter_tx_type &&
-          tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
-        continue;
-      mbmi->tx_type = tx_type;
-      txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
-                       mbmi->tx_size, cpi->sf.use_fast_coef_costing);
-      if (this_rd_stats.rate == INT_MAX) continue;
-
-      av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type);
-      if (is_inter) {
-        if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
-            !do_tx_type_search(tx_type, prune))
-          continue;
-      }
-      if (this_rd_stats.skip)
-        this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse);
-      else
-        this_rd =
-            RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist);
-      if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip)
-        this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse));
-
-      if (this_rd < best_rd) {
-        best_rd = this_rd;
-        best_tx_type = mbmi->tx_type;
-        *rd_stats = this_rd_stats;
-      }
-    }
-  } else {
-    mbmi->tx_type = DCT_DCT;
-    txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
-                     cpi->sf.use_fast_coef_costing);
-  }
-#endif  // CONFIG_EXT_TX
-  mbmi->tx_type = best_tx_type;
-#if CONFIG_LGT_FROM_PRED
-  mbmi->use_lgt = is_lgt_best;
-#endif  // CONFIG_LGT_FROM_PRED
+      av1_get_ext_tx_set_type(mbmi->tx_size, is_inter, cm->reduced_tx_set_used);
+  prune_tx(cpi, bs, x, xd, tx_set_type);
+  txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOM_PLANE_Y, bs,
+                   mbmi->tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+  // Reset the pruning flags.
+  av1_zero(x->tx_search_prune);
+  x->tx_split_prune_flag = 0;
 }
 
 static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     RD_STATS *rd_stats, int64_t ref_best_rd,
                                     BLOCK_SIZE bs) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
 
   mbmi->tx_size = TX_4X4;
-  mbmi->tx_type = DCT_DCT;
-#if CONFIG_VAR_TX
-  mbmi->min_tx_size = get_min_tx_size(TX_4X4);
-#endif  // CONFIG_VAR_TX
-
   txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
-                   cpi->sf.use_fast_coef_costing);
+                   cpi->sf.use_fast_coef_costing, FTXS_NONE);
 }
 
-#if CONFIG_TXK_SEL || CONFIG_VAR_TX
 static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
   int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * tx_size_wide_log2[0]);
   return num_blk;
 }
-#endif  // CONFIG_TXK_SEL || CONFIG_VAR_TX
+
+static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
+                                 const SPEED_FEATURES *sf) {
+  if (sf->tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH;
+
+  if (sf->tx_size_search_lgr_block) {
+    if (mi_width > mi_size_wide[BLOCK_64X64] ||
+        mi_height > mi_size_high[BLOCK_64X64])
+      return MAX_VARTX_DEPTH;
+  }
+
+  if (is_inter) {
+    return (mi_height != mi_width) ? sf->inter_tx_size_search_init_depth_rect
+                                   : sf->inter_tx_size_search_init_depth_sqr;
+  } else {
+    return (mi_height != mi_width) ? sf->intra_tx_size_search_init_depth_rect
+                                   : sf->intra_tx_size_search_init_depth_sqr;
+  }
+}
 
 static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
                                         MACROBLOCK *x, RD_STATS *rd_stats,
                                         int64_t ref_best_rd, BLOCK_SIZE bs) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   int64_t rd = INT64_MAX;
   int n;
-  int start_tx, end_tx;
-  int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
-  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
-  TX_SIZE best_tx_size = max_tx_size;
-  TX_TYPE best_tx_type = DCT_DCT;
-#if CONFIG_LGT_FROM_PRED
-  int breakout = 0;
-  int is_lgt_best = 0;
-  mbmi->use_lgt = 0;
-#endif  // CONFIG_LGT_FROM_PRED
-#if CONFIG_TXK_SEL
-  TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
-#endif  // CONFIG_TXK_SEL
+  int start_tx;
+  int depth;
+  int64_t best_rd = INT64_MAX;
+  const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
+  TX_SIZE best_tx_size = max_rect_tx_size;
+  TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  const int n4 = bsize_to_num_blk(bs);
   const int tx_select = cm->tx_mode == TX_MODE_SELECT;
-  const int is_inter = is_inter_block(mbmi);
-#if CONFIG_PVQ
-  od_rollback_buffer buf;
-  od_encode_checkpoint(&x->daala_enc, &buf);
-#endif  // CONFIG_PVQ
 
   av1_invalid_rd_stats(rd_stats);
 
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-  int evaluate_rect_tx = 0;
   if (tx_select) {
-    evaluate_rect_tx = is_rect_tx_allowed(xd, mbmi);
+    start_tx = max_rect_tx_size;
+    depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
+                                  is_inter_block(mbmi), &cpi->sf);
   } else {
-    const TX_SIZE chosen_tx_size =
-        tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
-    evaluate_rect_tx = is_rect_tx(chosen_tx_size);
-    assert(IMPLIES(evaluate_rect_tx, is_rect_tx_allowed(xd, mbmi)));
-  }
-  if (evaluate_rect_tx) {
-    TX_TYPE tx_start = DCT_DCT;
-    TX_TYPE tx_end = TX_TYPES;
-#if CONFIG_TXK_SEL
-    // The tx_type becomes dummy when lv_map is on. The tx_type search will be
-    // performed in av1_search_txk_type()
-    tx_end = DCT_DCT + 1;
-#endif
-    TX_TYPE tx_type;
-    for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
-      if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
-      const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs];
-      RD_STATS this_rd_stats;
-      const TxSetType tx_set_type = get_ext_tx_set_type(
-          rect_tx_size, bs, is_inter, cm->reduced_tx_set_used);
-      if (av1_ext_tx_used[tx_set_type][tx_type]) {
-        rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type,
-                      rect_tx_size);
-        ref_best_rd = AOMMIN(rd, ref_best_rd);
-        if (rd < best_rd) {
-#if CONFIG_TXK_SEL
-          memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256);
-#endif
-          best_tx_type = tx_type;
-          best_tx_size = rect_tx_size;
-          best_rd = rd;
-          *rd_stats = this_rd_stats;
-        }
-      }
-#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
-      const int is_inter = is_inter_block(mbmi);
-      if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
-#endif  // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
-    }
-#if CONFIG_LGT_FROM_PRED
-    const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs];
-    if (is_lgt_allowed(mbmi->mode, rect_tx_size) && !cm->reduced_tx_set_used) {
-      RD_STATS this_rd_stats;
-      mbmi->use_lgt = 1;
-      rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, rect_tx_size);
-      if (rd < best_rd) {
-        is_lgt_best = 1;
-        best_tx_size = rect_tx_size;
-        best_rd = rd;
-        *rd_stats = this_rd_stats;
-      }
-      mbmi->use_lgt = 0;
-    }
-#endif  // CONFIG_LGT_FROM_PRED
-  }
-
-#if CONFIG_RECT_TX_EXT
-  // test 1:4/4:1 tx
-  int evaluate_quarter_tx = 0;
-  if (is_quarter_tx_allowed(xd, mbmi, is_inter)) {
-    if (tx_select) {
-      evaluate_quarter_tx = 1;
-    } else {
-      const TX_SIZE chosen_tx_size =
-          tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
-      evaluate_quarter_tx = chosen_tx_size == quarter_txsize_lookup[bs];
-    }
-  }
-  if (evaluate_quarter_tx) {
-    TX_TYPE tx_start = DCT_DCT;
-    TX_TYPE tx_end = TX_TYPES;
-#if CONFIG_TXK_SEL
-    // The tx_type becomes dummy when lv_map is on. The tx_type search will be
-    // performed in av1_search_txk_type()
-    tx_end = DCT_DCT + 1;
-#endif
-    TX_TYPE tx_type;
-    for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
-      if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
-      const TX_SIZE tx_size = quarter_txsize_lookup[bs];
-      RD_STATS this_rd_stats;
-      const TxSetType tx_set_type =
-          get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used);
-      if (av1_ext_tx_used[tx_set_type][tx_type]) {
-        rd =
-            txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, tx_size);
-        if (rd < best_rd) {
-#if CONFIG_TXK_SEL
-          memcpy(best_txk_type, mbmi->txk_type,
-                 sizeof(best_txk_type[0]) * num_blk);
-#endif
-          best_tx_type = tx_type;
-#if CONFIG_LGT_FROM_PRED
-          is_lgt_best = 0;
-#endif
-          best_tx_size = tx_size;
-          best_rd = rd;
-          *rd_stats = this_rd_stats;
-        }
-      }
-#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
-      const int is_inter = is_inter_block(mbmi);
-      if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
-#endif  // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
-    }
-#if CONFIG_LGT_FROM_PRED
-    if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) {
-      const TX_SIZE tx_size = quarter_txsize_lookup[bs];
-      RD_STATS this_rd_stats;
-      mbmi->use_lgt = 1;
-      rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, tx_size);
-      if (rd < best_rd) {
-        is_lgt_best = 1;
-        best_tx_size = tx_size;
-        best_rd = rd;
-        *rd_stats = this_rd_stats;
-      }
-      mbmi->use_lgt = 0;
-    }
-#endif  // CONFIG_LGT_FROM_PRED
+    const TX_SIZE chosen_tx_size = tx_size_from_tx_mode(bs, cm->tx_mode);
+    start_tx = chosen_tx_size;
+    depth = MAX_TX_DEPTH;
   }
-#endif  // CONFIG_RECT_TX_EXT
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
 
-  if (tx_select) {
-    start_tx = max_tx_size;
-    end_tx = (max_tx_size >= TX_32X32) ? TX_8X8 : TX_4X4;
-  } else {
-    const TX_SIZE chosen_tx_size =
-        tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
-    start_tx = chosen_tx_size;
-    end_tx = chosen_tx_size;
-  }
-
-  last_rd = INT64_MAX;
-  for (n = start_tx; n >= end_tx; --n) {
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-    if (is_rect_tx(n)) break;
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
-    TX_TYPE tx_start = DCT_DCT;
-    TX_TYPE tx_end = TX_TYPES;
-#if CONFIG_TXK_SEL
-    // The tx_type becomes dummy when lv_map is on. The tx_type search will be
-    // performed in av1_search_txk_type()
-    tx_end = DCT_DCT + 1;
-#endif
-    TX_TYPE tx_type;
-    for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
-      RD_STATS this_rd_stats;
-      if (skip_txfm_search(cpi, x, bs, tx_type, n)) continue;
-      rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, n);
-#if CONFIG_PVQ
-      od_encode_rollback(&x->daala_enc, &buf);
-#endif  // CONFIG_PVQ
-      // Early termination in transform size search.
-      if (cpi->sf.tx_size_search_breakout &&
-          (rd == INT64_MAX ||
-           (this_rd_stats.skip == 1 && tx_type != DCT_DCT && n < start_tx) ||
-           (n < (int)max_tx_size && rd > last_rd))) {
-#if CONFIG_LGT_FROM_PRED
-        breakout = 1;
-#endif
-        break;
-      }
+  prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16);
 
-      last_rd = rd;
-      ref_best_rd = AOMMIN(rd, ref_best_rd);
-      if (rd < best_rd) {
-#if CONFIG_TXK_SEL
-        memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256);
-#endif
-        best_tx_type = tx_type;
-#if CONFIG_LGT_FROM_PRED
-        is_lgt_best = 0;
-#endif
-        best_tx_size = n;
-        best_rd = rd;
-        *rd_stats = this_rd_stats;
-      }
-#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
-      const int is_inter = is_inter_block(mbmi);
-      if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
-#endif  // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
-    }
-#if CONFIG_LGT_FROM_PRED
-    mbmi->use_lgt = 1;
-    if (is_lgt_allowed(mbmi->mode, n) && !skip_txfm_search(cpi, x, bs, 0, n) &&
-        !breakout) {
-      RD_STATS this_rd_stats;
-      rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, n);
-      if (rd < best_rd) {
-        is_lgt_best = 1;
-        best_tx_size = n;
-        best_rd = rd;
-        *rd_stats = this_rd_stats;
-      }
-    }
-    mbmi->use_lgt = 0;
-#endif  // CONFIG_LGT_FROM_PRED
+  for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) {
+    RD_STATS this_rd_stats;
+    if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD;
+    rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE);
+    x->rd_model = FULL_TXFM_RD;
+
+    if (rd < best_rd) {
+      memcpy(best_txk_type, mbmi->txk_type,
+             sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+      memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
+      best_tx_size = n;
+      best_rd = rd;
+      *rd_stats = this_rd_stats;
+    }
+    if (n == TX_4X4) break;
   }
   mbmi->tx_size = best_tx_size;
-  mbmi->tx_type = best_tx_type;
-#if CONFIG_LGT_FROM_PRED
-  mbmi->use_lgt = is_lgt_best;
-  assert(!is_lgt_best || is_lgt_allowed(mbmi->mode, mbmi->tx_size));
-#endif  // CONFIG_LGT_FROM_PRED
-#if CONFIG_TXK_SEL
-  memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * 256);
-#endif
-
-#if CONFIG_VAR_TX
-  mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
-#endif  // CONFIG_VAR_TX
+  memcpy(mbmi->txk_type, best_txk_type,
+         sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+  memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
 
-#if !CONFIG_EXT_TX
-  if (mbmi->tx_size >= TX_32X32) assert(mbmi->tx_type == DCT_DCT);
-#endif  // !CONFIG_EXT_TX
-#if CONFIG_PVQ
-  if (best_rd != INT64_MAX) {
-    txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, best_tx_type, best_tx_size);
-  }
-#endif  // CONFIG_PVQ
+  // Reset the pruning flags.
+  av1_zero(x->tx_search_prune);
+  x->tx_split_prune_flag = 0;
 }
 
 static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -2910,9 +3225,9 @@ static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   MACROBLOCKD *xd = &x->e_mbd;
   av1_init_rd_stats(rd_stats);
 
-  assert(bs == xd->mi[0]->mbmi.sb_type);
+  assert(bs == xd->mi[0]->sb_type);
 
-  if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+  if (xd->lossless[xd->mi[0]->segment_id]) {
     choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
   } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
     choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
@@ -2921,18 +3236,117 @@ static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   }
 }
 
+// Return the rate cost for luma prediction mode info. of intra blocks.
+static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                  const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
+                                  int mode_cost) {
+  int total_rate = mode_cost;
+  const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
+  const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
+  const int use_intrabc = mbmi->use_intrabc;
+  // Can only activate one mode.
+  assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
+          use_filter_intra) <= 1);
+  const int try_palette =
+      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
+  if (try_palette && mbmi->mode == DC_PRED) {
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+    const int mode_ctx = av1_get_palette_mode_ctx(xd);
+    total_rate += x->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
+    if (use_palette) {
+      const uint8_t *const color_map = xd->plane[0].color_index_map;
+      int block_width, block_height, rows, cols;
+      av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                               &cols);
+      const int plt_size = mbmi->palette_mode_info.palette_size[0];
+      int palette_mode_cost =
+          x->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+      palette_mode_cost +=
+          av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
+                                   n_cache, cpi->common.bit_depth);
+      palette_mode_cost +=
+          av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
+    total_rate += x->filter_intra_cost[mbmi->sb_type][use_filter_intra];
+    if (use_filter_intra) {
+      total_rate += x->filter_intra_mode_cost[mbmi->filter_intra_mode_info
+                                                  .filter_intra_mode];
+    }
+  }
+  if (av1_is_directional_mode(mbmi->mode)) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate += x->angle_delta_cost[mbmi->mode - V_PRED]
+                                       [MAX_ANGLE_DELTA +
+                                        mbmi->angle_delta[PLANE_TYPE_Y]];
+    }
+  }
+  if (av1_allow_intrabc(&cpi->common))
+    total_rate += x->intrabc_cost[use_intrabc];
+  return total_rate;
+}
+
+// Return the rate cost for chroma prediction mode info. of intra blocks.
+static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                   const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
+                                   int mode_cost) {
+  int total_rate = mode_cost;
+  const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
+  const UV_PREDICTION_MODE mode = mbmi->uv_mode;
+  // Can only activate one mode.
+  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+
+  const int try_palette =
+      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
+  if (try_palette && mode == UV_DC_PRED) {
+    const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+    total_rate +=
+        x->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
+    if (use_palette) {
+      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+      const int plt_size = pmi->palette_size[1];
+      const MACROBLOCKD *xd = &x->e_mbd;
+      const uint8_t *const color_map = xd->plane[1].color_index_map;
+      int palette_mode_cost =
+          x->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+      palette_mode_cost += av1_palette_color_cost_uv(pmi, color_cache, n_cache,
+                                                     cpi->common.bit_depth);
+      palette_mode_cost +=
+          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_is_directional_mode(get_uv_mode(mode))) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate +=
+          x->angle_delta_cost[mode - V_PRED][mbmi->angle_delta[PLANE_TYPE_UV] +
+                                             MAX_ANGLE_DELTA];
+    }
+  }
+  return total_rate;
+}
+
 static int conditional_skipintra(PREDICTION_MODE mode,
                                  PREDICTION_MODE best_intra_mode) {
-  if (mode == D117_PRED && best_intra_mode != V_PRED &&
+  if (mode == D113_PRED && best_intra_mode != V_PRED &&
       best_intra_mode != D135_PRED)
     return 1;
-  if (mode == D63_PRED && best_intra_mode != V_PRED &&
+  if (mode == D67_PRED && best_intra_mode != V_PRED &&
       best_intra_mode != D45_PRED)
     return 1;
-  if (mode == D207_PRED && best_intra_mode != H_PRED &&
+  if (mode == D203_PRED && best_intra_mode != H_PRED &&
       best_intra_mode != D45_PRED)
     return 1;
-  if (mode == D153_PRED && best_intra_mode != H_PRED &&
+  if (mode == D157_PRED && best_intra_mode != H_PRED &&
       best_intra_mode != D135_PRED)
     return 1;
   return 0;
@@ -2943,48 +3357,42 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
                                BLOCK_SIZE bsize, int mode_cost) {
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
   RD_STATS this_rd_stats;
   int row, col;
   int64_t temp_sse, this_rd;
-  const TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cpi->common.tx_mode, 0);
+  TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
   const int stepr = tx_size_high_unit[tx_size];
   const int stepc = tx_size_wide_unit[tx_size];
   const int max_blocks_wide = max_block_wide(xd, bsize, 0);
   const int max_blocks_high = max_block_high(xd, bsize, 0);
   mbmi->tx_size = tx_size;
   // Prediction.
-  const int step = stepr * stepc;
-  int block = 0;
   for (row = 0; row < max_blocks_high; row += stepr) {
     for (col = 0; col < max_blocks_wide; col += stepc) {
-      av1_predict_intra_block_facade(cm, xd, 0, block, col, row, tx_size);
-      block += step;
+      av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size);
     }
   }
   // RD estimation.
   model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate,
-                  &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse);
-#if CONFIG_EXT_INTRA
-  if (av1_is_directional_mode(mbmi->mode, bsize) &&
-      av1_use_angle_delta(bsize)) {
-    mode_cost += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
-                                    MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
-  }
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  if (mbmi->mode == DC_PRED) {
-    const aom_prob prob = cpi->common.fc->filter_intra_probs[0];
-    if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
-      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode[0];
-      mode_cost += (av1_cost_bit(prob, 1) +
-                    write_uniform_cost(FILTER_INTRA_MODES, mode));
+                  &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL,
+                  NULL, NULL);
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+    mode_cost +=
+        x->angle_delta_cost[mbmi->mode - V_PRED]
+                           [MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y]];
+  }
+  if (mbmi->mode == DC_PRED &&
+      av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) {
+    if (mbmi->filter_intra_mode_info.use_filter_intra) {
+      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode;
+      mode_cost += x->filter_intra_cost[mbmi->sb_type][1] +
+                   x->filter_intra_mode_cost[mode];
     } else {
-      mode_cost += av1_cost_bit(prob, 0);
+      mode_cost += x->filter_intra_cost[mbmi->sb_type][0];
     }
   }
-#endif  // CONFIG_FILTER_INTRA
   this_rd =
       RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
   return this_rd;
@@ -3014,42 +3422,99 @@ static void extend_palette_color_map(uint8_t *const color_map, int orig_width,
   }
 }
 
-#if CONFIG_PALETTE_DELTA_ENCODING
 // Bias toward using colors in the cache.
 // TODO(huisu): Try other schemes to improve compression.
 static void optimize_palette_colors(uint16_t *color_cache, int n_cache,
-                                    int n_colors, int stride,
-                                    float *centroids) {
+                                    int n_colors, int stride, int *centroids) {
   if (n_cache <= 0) return;
   for (int i = 0; i < n_colors * stride; i += stride) {
-    float min_diff = fabsf(centroids[i] - color_cache[0]);
+    int min_diff = abs(centroids[i] - (int)color_cache[0]);
     int idx = 0;
     for (int j = 1; j < n_cache; ++j) {
-      float this_diff = fabsf(centroids[i] - color_cache[j]);
+      const int this_diff = abs(centroids[i] - color_cache[j]);
       if (this_diff < min_diff) {
         min_diff = this_diff;
         idx = j;
       }
     }
-    if (min_diff < 1.5) centroids[i] = color_cache[idx];
+    if (min_diff <= 1) centroids[i] = color_cache[idx];
   }
 }
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 
-static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                     BLOCK_SIZE bsize, int palette_ctx,
-                                     int dc_mode_cost, MB_MODE_INFO *best_mbmi,
-                                     uint8_t *best_palette_color_map,
-                                     int64_t *best_rd, int64_t *best_model_rd,
-                                     int *rate, int *rate_tokenonly,
-                                     int64_t *distortion, int *skippable) {
+// Given the base colors as specified in centroids[], calculate the RD cost
+// of palette mode.
+static void palette_rd_y(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
+    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
+    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    int *rate, int *rate_tokenonly, int *rate_overhead, int64_t *distortion,
+    int *skippable, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip) {
+  optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
+  int k = av1_remove_duplicates(centroids, n);
+  if (k < PALETTE_MIN_SIZE) {
+    // Too few unique colors to create a palette. And DC_PRED will work
+    // well for that case anyway. So skip.
+    return;
+  }
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  if (cpi->common.use_highbitdepth)
+    for (int i = 0; i < k; ++i)
+      pmi->palette_colors[i] =
+          clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth);
+  else
+    for (int i = 0; i < k; ++i)
+      pmi->palette_colors[i] = clip_pixel(centroids[i]);
+  pmi->palette_size[0] = k;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  av1_calc_indices(data, centroids, color_map, rows * cols, k, 1);
+  extend_palette_color_map(color_map, cols, rows, block_width, block_height);
+  const int palette_mode_cost =
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
+  int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost);
+  if (*best_model_rd != INT64_MAX &&
+      this_model_rd > *best_model_rd + (*best_model_rd >> 1))
+    return;
+  if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+  RD_STATS tokenonly_rd_stats;
+  super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+  if (tokenonly_rd_stats.rate == INT_MAX) return;
+  int this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
+  int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
+    tokenonly_rd_stats.rate -=
+        tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size);
+  }
+  if (this_rd < *best_rd) {
+    *best_rd = this_rd;
+    memcpy(best_palette_color_map, color_map,
+           block_width * block_height * sizeof(color_map[0]));
+    *best_mbmi = *mbmi;
+    memcpy(blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+    *rate_overhead = this_rate - tokenonly_rd_stats.rate;
+    if (rate) *rate = this_rate;
+    if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+    if (distortion) *distortion = tokenonly_rd_stats.dist;
+    if (skippable) *skippable = tokenonly_rd_stats.skip;
+  }
+}
+
+static int rd_pick_palette_intra_sby(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
+    int64_t *distortion, int *skippable, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip) {
   int rate_overhead = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mic->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
-  assert(bsize >= BLOCK_8X8);
-  int this_rate, colors, n;
+  assert(av1_allow_palette(cpi->common.allow_screen_content_tools, bsize));
+  int colors, n;
   const int src_stride = x->plane[0].src.stride;
   const uint8_t *const src = x->plane[0].src.buf;
   uint8_t *const color_map = xd->plane[0].color_index_map;
@@ -3057,37 +3522,26 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
   av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
                            &cols);
 
-  assert(cpi->common.allow_screen_content_tools);
-
-#if CONFIG_HIGHBITDEPTH
+  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
   if (cpi->common.use_highbitdepth)
     colors = av1_count_colors_highbd(src, src_stride, rows, cols,
-                                     cpi->common.bit_depth);
+                                     cpi->common.bit_depth, count_buf);
   else
-#endif  // CONFIG_HIGHBITDEPTH
-    colors = av1_count_colors(src, src_stride, rows, cols);
-#if CONFIG_FILTER_INTRA
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
-#endif  // CONFIG_FILTER_INTRA
+    colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
 
   if (colors > 1 && colors <= 64) {
-    int r, c, i, k, palette_mode_cost;
+    int r, c, i;
     const int max_itr = 50;
-    float *const data = x->palette_buffer->kmeans_data_buf;
-    float centroids[PALETTE_MAX_SIZE];
-    float lb, ub, val;
-    RD_STATS tokenonly_rd_stats;
-    int64_t this_rd, this_model_rd;
-    PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-#if CONFIG_HIGHBITDEPTH
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[PALETTE_MAX_SIZE];
+    int lb, ub, val;
     uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
     if (cpi->common.use_highbitdepth)
       lb = ub = src16[0];
     else
-#endif  // CONFIG_HIGHBITDEPTH
       lb = ub = src[0];
 
-#if CONFIG_HIGHBITDEPTH
     if (cpi->common.use_highbitdepth) {
       for (r = 0; r < rows; ++r) {
         for (c = 0; c < cols; ++c) {
@@ -3100,7 +3554,6 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
         }
       }
     } else {
-#endif  // CONFIG_HIGHBITDEPTH
       for (r = 0; r < rows; ++r) {
         for (c = 0; c < cols; ++c) {
           val = src[r * src_stride + c];
@@ -3111,99 +3564,57 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
             ub = val;
         }
       }
-#if CONFIG_HIGHBITDEPTH
     }
-#endif  // CONFIG_HIGHBITDEPTH
 
     mbmi->mode = DC_PRED;
-#if CONFIG_FILTER_INTRA
-    mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
-#endif  // CONFIG_FILTER_INTRA
-
-    if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0;
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
 
-#if CONFIG_PALETTE_DELTA_ENCODING
     uint16_t color_cache[2 * PALETTE_MAX_SIZE];
     const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 
-    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
-         --n) {
+    // Find the dominant colors, stored in top_colors[].
+    int top_colors[PALETTE_MAX_SIZE] = { 0 };
+    for (i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
+      int max_count = 0;
+      for (int j = 0; j < (1 << cpi->common.bit_depth); ++j) {
+        if (count_buf[j] > max_count) {
+          max_count = count_buf[j];
+          top_colors[i] = j;
+        }
+      }
+      assert(max_count > 0);
+      count_buf[top_colors[i]] = 0;
+    }
+
+    // Try the dominant colors directly.
+    // TODO(huisu@google.com): Try to avoid duplicate computation in cases
+    // where the dominant colors and the k-means results are similar.
+    for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) {
+      for (i = 0; i < n; ++i) centroids[i] = top_colors[i];
+      palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                   color_cache, n_cache, best_mbmi, best_palette_color_map,
+                   best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead,
+                   distortion, skippable, ctx, best_blk_skip);
+    }
+
+    // K-means clustering.
+    for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) {
       if (colors == PALETTE_MIN_SIZE) {
         // Special case: These colors automatically become the centroids.
         assert(colors == n);
         assert(colors == 2);
         centroids[0] = lb;
         centroids[1] = ub;
-        k = 2;
       } else {
         for (i = 0; i < n; ++i) {
           centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
         }
         av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr);
-#if CONFIG_PALETTE_DELTA_ENCODING
-        optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
-        k = av1_remove_duplicates(centroids, n);
-        if (k < PALETTE_MIN_SIZE) {
-          // Too few unique colors to create a palette. And DC_PRED will work
-          // well for that case anyway. So skip.
-          continue;
-        }
-      }
-
-#if CONFIG_HIGHBITDEPTH
-      if (cpi->common.use_highbitdepth)
-        for (i = 0; i < k; ++i)
-          pmi->palette_colors[i] =
-              clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth);
-      else
-#endif  // CONFIG_HIGHBITDEPTH
-        for (i = 0; i < k; ++i)
-          pmi->palette_colors[i] = clip_pixel((int)centroids[i]);
-      pmi->palette_size[0] = k;
-
-      av1_calc_indices(data, centroids, color_map, rows * cols, k, 1);
-      extend_palette_color_map(color_map, cols, rows, block_width,
-                               block_height);
-      palette_mode_cost =
-          dc_mode_cost +
-          x->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] +
-          write_uniform_cost(k, color_map[0]) +
-          av1_cost_bit(
-              av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx],
-              1);
-      palette_mode_cost += av1_palette_color_cost_y(pmi,
-#if CONFIG_PALETTE_DELTA_ENCODING
-                                                    color_cache, n_cache,
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
-                                                    cpi->common.bit_depth);
-      palette_mode_cost +=
-          av1_cost_color_map(x, 0, 0, bsize, mbmi->tx_size, PALETTE_MAP);
-      this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost);
-      if (*best_model_rd != INT64_MAX &&
-          this_model_rd > *best_model_rd + (*best_model_rd >> 1))
-        continue;
-      if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
-      super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
-      if (tokenonly_rd_stats.rate == INT_MAX) continue;
-      this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
-      this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-      if (!xd->lossless[mbmi->segment_id] &&
-          block_signals_txsize(mbmi->sb_type)) {
-        tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
-      }
-      if (this_rd < *best_rd) {
-        *best_rd = this_rd;
-        memcpy(best_palette_color_map, color_map,
-               block_width * block_height * sizeof(color_map[0]));
-        *best_mbmi = *mbmi;
-        rate_overhead = this_rate - tokenonly_rd_stats.rate;
-        if (rate) *rate = this_rate;
-        if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
-        if (distortion) *distortion = tokenonly_rd_stats.dist;
-        if (skippable) *skippable = tokenonly_rd_stats.skip;
       }
+      palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                   color_cache, n_cache, best_mbmi, best_palette_color_map,
+                   best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead,
+                   distortion, skippable, ctx, best_blk_skip);
     }
   }
 
@@ -3215,663 +3626,30 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
   return rate_overhead;
 }
 
-static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
-    const AV1_COMP *const cpi, MACROBLOCK *x, int row, int col,
-    PREDICTION_MODE *best_mode, const int *bmode_costs, ENTROPY_CONTEXT *a,
-    ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion,
-    BLOCK_SIZE bsize, TX_SIZE tx_size, int *y_skip, int64_t rd_thresh) {
-  const AV1_COMMON *const cm = &cpi->common;
-  PREDICTION_MODE mode;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  assert(!is_inter_block(&xd->mi[0]->mbmi));
-  int64_t best_rd = rd_thresh;
-  struct macroblock_plane *p = &x->plane[0];
-  struct macroblockd_plane *pd = &xd->plane[0];
-  const int src_stride = p->src.stride;
-  const int dst_stride = pd->dst.stride;
-  const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4];
-  uint8_t *dst_init = &pd->dst.buf[row * 4 * dst_stride + col * 4];
-#if CONFIG_CHROMA_2X2
-  // TODO(jingning): This is a temporal change. The whole function should be
-  // out when cb4x4 is enabled.
-  ENTROPY_CONTEXT ta[4], tempa[4];
-  ENTROPY_CONTEXT tl[4], templ[4];
-#else
-  ENTROPY_CONTEXT ta[2], tempa[2];
-  ENTROPY_CONTEXT tl[2], templ[2];
-#endif  // CONFIG_CHROMA_2X2
-
-  const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize];
-  const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize];
-  const int tx_width_unit = tx_size_wide_unit[tx_size];
-  const int tx_height_unit = tx_size_high_unit[tx_size];
-  const int pred_block_width = block_size_wide[bsize];
-  const int pred_block_height = block_size_high[bsize];
-  const int tx_width = tx_size_wide[tx_size];
-  const int tx_height = tx_size_high[tx_size];
-  const int pred_width_in_transform_blocks = pred_block_width / tx_width;
-  const int pred_height_in_transform_blocks = pred_block_height / tx_height;
-  int idx, idy;
-  int best_can_skip = 0;
-  uint8_t best_dst[8 * 8];
-#if CONFIG_HIGHBITDEPTH
-  uint16_t best_dst16[8 * 8];
-#endif  // CONFIG_HIGHBITDEPTH
-  const int is_lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-  const int sub_bsize = bsize;
-#else
-  const int sub_bsize = BLOCK_4X4;
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
-
-#if CONFIG_PVQ
-  od_rollback_buffer pre_buf, post_buf;
-  od_encode_checkpoint(&x->daala_enc, &pre_buf);
-  od_encode_checkpoint(&x->daala_enc, &post_buf);
-#endif  // CONFIG_PVQ
-
-  assert(bsize < BLOCK_8X8);
-  assert(tx_width < 8 || tx_height < 8);
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-  if (is_lossless)
-    assert(tx_width == 4 && tx_height == 4);
-  else
-    assert(tx_width == pred_block_width && tx_height == pred_block_height);
-#else
-  assert(tx_width == 4 && tx_height == 4);
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
-
-  memcpy(ta, a, pred_width_in_transform_blocks * sizeof(a[0]));
-  memcpy(tl, l, pred_height_in_transform_blocks * sizeof(l[0]));
-
-  xd->mi[0]->mbmi.tx_size = tx_size;
-
-  xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0;
-
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-#if CONFIG_PVQ
-    od_encode_checkpoint(&x->daala_enc, &pre_buf);
-#endif
-    for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
-      int64_t this_rd;
-      int ratey = 0;
-      int64_t distortion = 0;
-      int rate = bmode_costs[mode];
-      int can_skip = 1;
-
-      if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] &
-            (1 << mode)))
-        continue;
-
-      // Only do the oblique modes if the best so far is
-      // one of the neighboring directional modes
-      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
-        if (conditional_skipintra(mode, *best_mode)) continue;
-      }
-
-      memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0]));
-      memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0]));
-
-      for (idy = 0; idy < pred_height_in_transform_blocks; ++idy) {
-        for (idx = 0; idx < pred_width_in_transform_blocks; ++idx) {
-          const int block_raster_idx = (row + idy) * 2 + (col + idx);
-          const int block =
-              av1_raster_order_to_block_index(tx_size, block_raster_idx);
-          const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
-          uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
-#if !CONFIG_PVQ
-          int16_t *const src_diff = av1_raster_block_offset_int16(
-              BLOCK_8X8, block_raster_idx, p->src_diff);
-#endif
-          int skip;
-          assert(block < 4);
-          assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
-                         idx == 0 && idy == 0));
-          assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
-                         block == 0 || block == 2));
-          xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
-          av1_predict_intra_block(
-              cm, xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode,
-              dst, dst_stride, dst, dst_stride, col + idx, row + idy, 0);
-#if !CONFIG_PVQ
-          aom_highbd_subtract_block(tx_height, tx_width, src_diff, 8, src,
-                                    src_stride, dst, dst_stride, xd->bd);
-#endif
-          if (is_lossless) {
-            TX_TYPE tx_type =
-                av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size);
-            const SCAN_ORDER *scan_order =
-                get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
-            const int coeff_ctx =
-                combine_entropy_contexts(tempa[idx], templ[idy]);
-#if !CONFIG_PVQ
-            av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
-                            tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
-            ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size,
-                                     scan_order, tempa + idx, templ + idy,
-                                     cpi->sf.use_fast_coef_costing);
-            skip = (p->eobs[block] == 0);
-            can_skip &= skip;
-            tempa[idx] = !skip;
-            templ[idy] = !skip;
-#if CONFIG_EXT_TX
-            if (tx_size == TX_8X4) {
-              tempa[idx + 1] = tempa[idx];
-            } else if (tx_size == TX_4X8) {
-              templ[idy + 1] = templ[idy];
-            }
-#endif  // CONFIG_EXT_TX
-#else
-            (void)scan_order;
-
-            av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
-                            tx_size, coeff_ctx, AV1_XFORM_QUANT_B);
-
-            ratey += x->rate;
-            skip = x->pvq_skip[0];
-            tempa[idx] = !skip;
-            templ[idy] = !skip;
-            can_skip &= skip;
-#endif
-            if (RDCOST(x->rdmult, ratey, distortion) >= best_rd)
-              goto next_highbd;
-#if CONFIG_PVQ
-            if (!skip)
-#endif
-              av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
-#if CONFIG_LGT_FROM_PRED
-                                          mode,
-#endif
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                          BLOCK_OFFSET(xd->mrc_mask, block),
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                          DCT_DCT, tx_size, dst, dst_stride,
-                                          p->eobs[block]);
-          } else {
-            int64_t dist;
-            unsigned int tmp;
-            TX_TYPE tx_type =
-                av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size);
-            const SCAN_ORDER *scan_order =
-                get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
-            const int coeff_ctx =
-                combine_entropy_contexts(tempa[idx], templ[idy]);
-#if !CONFIG_PVQ
-#if DISABLE_TRELLISQ_SEARCH
-            av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
-                            tx_size, coeff_ctx, AV1_XFORM_QUANT_B);
-#else
-            av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
-                            tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
-            av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size,
-                           tempa + idx, templ + idy, 1);
-#endif  // DISABLE_TRELLISQ_SEARCH
-            ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size,
-                                     scan_order, tempa + idx, templ + idy,
-                                     cpi->sf.use_fast_coef_costing);
-            skip = (p->eobs[block] == 0);
-            can_skip &= skip;
-            tempa[idx] = !skip;
-            templ[idy] = !skip;
-#if CONFIG_EXT_TX
-            if (tx_size == TX_8X4) {
-              tempa[idx + 1] = tempa[idx];
-            } else if (tx_size == TX_4X8) {
-              templ[idy + 1] = templ[idy];
-            }
-#endif  // CONFIG_EXT_TX
-#else
-            (void)scan_order;
-
-            av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
-                            tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
-            ratey += x->rate;
-            skip = x->pvq_skip[0];
-            tempa[idx] = !skip;
-            templ[idy] = !skip;
-            can_skip &= skip;
-#endif
-#if CONFIG_PVQ
-            if (!skip)
-#endif
-              av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
-#if CONFIG_LGT_FROM_PRED
-                                          mode,
-#endif
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                          BLOCK_OFFSET(xd->mrc_mask, block),
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                          tx_type, tx_size, dst, dst_stride,
-                                          p->eobs[block]);
-            cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
-            dist = (int64_t)tmp << 4;
-            distortion += dist;
-            if (RDCOST(x->rdmult, ratey, distortion) >= best_rd)
-              goto next_highbd;
-          }
-        }
-      }
-
-      rate += ratey;
-      this_rd = RDCOST(x->rdmult, rate, distortion);
-
-      if (this_rd < best_rd) {
-        *bestrate = rate;
-        *bestratey = ratey;
-        *bestdistortion = distortion;
-        best_rd = this_rd;
-        best_can_skip = can_skip;
-        *best_mode = mode;
-        memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0]));
-        memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0]));
-#if CONFIG_PVQ
-        od_encode_checkpoint(&x->daala_enc, &post_buf);
-#endif
-        for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) {
-          memcpy(best_dst16 + idy * 8,
-                 CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
-                 pred_width_in_transform_blocks * 4 * sizeof(uint16_t));
-        }
-      }
-    next_highbd : {}
-#if CONFIG_PVQ
-      od_encode_rollback(&x->daala_enc, &pre_buf);
-#endif
-    }
-
-    if (best_rd >= rd_thresh) return best_rd;
-
-#if CONFIG_PVQ
-    od_encode_rollback(&x->daala_enc, &post_buf);
-#endif
-
-    if (y_skip) *y_skip &= best_can_skip;
-
-    for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) {
-      memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
-             best_dst16 + idy * 8,
-             pred_width_in_transform_blocks * 4 * sizeof(uint16_t));
-    }
-
-    return best_rd;
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_PVQ
-  od_encode_checkpoint(&x->daala_enc, &pre_buf);
-#endif  // CONFIG_PVQ
-
-  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
-    int64_t this_rd;
-    int ratey = 0;
-    int64_t distortion = 0;
-    int rate = bmode_costs[mode];
-    int can_skip = 1;
-
-    if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] &
-          (1 << mode))) {
-      continue;
-    }
-
-    // Only do the oblique modes if the best so far is
-    // one of the neighboring directional modes
-    if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
-      if (conditional_skipintra(mode, *best_mode)) continue;
-    }
-
-    memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0]));
-    memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0]));
-
-    for (idy = 0; idy < pred_height_in_4x4_blocks; idy += tx_height_unit) {
-      for (idx = 0; idx < pred_width_in_4x4_blocks; idx += tx_width_unit) {
-        const int block_raster_idx = (row + idy) * 2 + (col + idx);
-        int block = av1_raster_order_to_block_index(tx_size, block_raster_idx);
-        const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
-        uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
-#if !CONFIG_PVQ
-        int16_t *const src_diff = av1_raster_block_offset_int16(
-            BLOCK_8X8, block_raster_idx, p->src_diff);
-#endif  // !CONFIG_PVQ
-        int skip;
-        assert(block < 4);
-        assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
-                       idx == 0 && idy == 0));
-        assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
-                       block == 0 || block == 2));
-        xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
-        av1_predict_intra_block(cm, xd, pd->width, pd->height,
-                                txsize_to_bsize[tx_size], mode, dst, dst_stride,
-                                dst, dst_stride,
-#if CONFIG_CB4X4
-                                2 * (col + idx), 2 * (row + idy),
-#else
-                                col + idx, row + idy,
-#endif  // CONFIG_CB4X4
-                                0);
-#if !CONFIG_PVQ
-        aom_subtract_block(tx_height, tx_width, src_diff, 8, src, src_stride,
-                           dst, dst_stride);
-#endif  // !CONFIG_PVQ
-        TX_TYPE tx_type =
-            av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size);
-        const SCAN_ORDER *scan_order =
-            get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
-        const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]);
-#if CONFIG_CB4X4
-        block = 4 * block;
-#endif  // CONFIG_CB4X4
-#if !CONFIG_PVQ
-#if DISABLE_TRELLISQ_SEARCH
-        av1_xform_quant(cm, x, 0, block,
-#if CONFIG_CB4X4
-                        2 * (row + idy), 2 * (col + idx),
-#else
-                        row + idy, col + idx,
-#endif  // CONFIG_CB4X4
-                        BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_B);
-#else
-        const AV1_XFORM_QUANT xform_quant =
-            is_lossless ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
-        av1_xform_quant(cm, x, 0, block,
-#if CONFIG_CB4X4
-                        2 * (row + idy), 2 * (col + idx),
-#else
-                        row + idy, col + idx,
-#endif  // CONFIG_CB4X4
-                        BLOCK_8X8, tx_size, coeff_ctx, xform_quant);
-
-        av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, tempa + idx,
-                       templ + idy, 1);
-#endif  // DISABLE_TRELLISQ_SEARCH
-        ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, scan_order,
-                                 tempa + idx, templ + idy,
-                                 cpi->sf.use_fast_coef_costing);
-        skip = (p->eobs[block] == 0);
-        can_skip &= skip;
-        tempa[idx] = !skip;
-        templ[idy] = !skip;
-#if CONFIG_EXT_TX
-        if (tx_size == TX_8X4) {
-          tempa[idx + 1] = tempa[idx];
-        } else if (tx_size == TX_4X8) {
-          templ[idy + 1] = templ[idy];
-        }
-#endif  // CONFIG_EXT_TX
-#else
-        (void)scan_order;
-
-        av1_xform_quant(cm, x, 0, block,
-#if CONFIG_CB4X4
-                        2 * (row + idy), 2 * (col + idx),
-#else
-                        row + idy, col + idx,
-#endif  // CONFIG_CB4X4
-                        BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
-
-        ratey += x->rate;
-        skip = x->pvq_skip[0];
-        tempa[idx] = !skip;
-        templ[idy] = !skip;
-        can_skip &= skip;
-#endif  // !CONFIG_PVQ
-
-        if (!is_lossless) {  // To use the pixel domain distortion, we need to
-                             // calculate inverse txfm *before* calculating RD
-                             // cost. Compared to calculating the distortion in
-                             // the frequency domain, the overhead of encoding
-                             // effort is low.
-#if CONFIG_PVQ
-          if (!skip)
-#endif  // CONFIG_PVQ
-            av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
-#if CONFIG_LGT_FROM_PRED
-                                        mode,
-#endif
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                        BLOCK_OFFSET(xd->mrc_mask, block),
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                        tx_type, tx_size, dst, dst_stride,
-                                        p->eobs[block]);
-          unsigned int tmp;
-          cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
-          const int64_t dist = (int64_t)tmp << 4;
-          distortion += dist;
-        }
-
-        if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) goto next;
-
-        if (is_lossless) {  // Calculate inverse txfm *after* RD cost.
-#if CONFIG_PVQ
-          if (!skip)
-#endif  // CONFIG_PVQ
-            av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
-#if CONFIG_LGT_FROM_PRED
-                                        mode,
-#endif
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                        BLOCK_OFFSET(xd->mrc_mask, block),
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                                        DCT_DCT, tx_size, dst, dst_stride,
-                                        p->eobs[block]);
-        }
-      }
-    }
-
-    rate += ratey;
-    this_rd = RDCOST(x->rdmult, rate, distortion);
-
-    if (this_rd < best_rd) {
-      *bestrate = rate;
-      *bestratey = ratey;
-      *bestdistortion = distortion;
-      best_rd = this_rd;
-      best_can_skip = can_skip;
-      *best_mode = mode;
-      memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0]));
-      memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0]));
-#if CONFIG_PVQ
-      od_encode_checkpoint(&x->daala_enc, &post_buf);
-#endif  // CONFIG_PVQ
-      for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy)
-        memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
-               pred_width_in_transform_blocks * 4);
-    }
-  next : {}
-#if CONFIG_PVQ
-    od_encode_rollback(&x->daala_enc, &pre_buf);
-#endif  // CONFIG_PVQ
-  }     // mode decision loop
-
-  if (best_rd >= rd_thresh) return best_rd;
-
-#if CONFIG_PVQ
-  od_encode_rollback(&x->daala_enc, &post_buf);
-#endif  // CONFIG_PVQ
-
-  if (y_skip) *y_skip &= best_can_skip;
-
-  for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy)
-    memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
-           pred_width_in_transform_blocks * 4);
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
-                                            MACROBLOCK *mb, int *rate,
-                                            int *rate_y, int64_t *distortion,
-                                            int *y_skip, int64_t best_rd) {
-  const MACROBLOCKD *const xd = &mb->e_mbd;
-  MODE_INFO *const mic = xd->mi[0];
-  const MODE_INFO *above_mi = xd->above_mi;
-  const MODE_INFO *left_mi = xd->left_mi;
-  MB_MODE_INFO *const mbmi = &mic->mbmi;
-  assert(!is_inter_block(mbmi));
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize];
-  const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize];
-  int idx, idy;
-  int cost = 0;
-  int64_t total_distortion = 0;
-  int tot_rate_y = 0;
-  int64_t total_rd = 0;
-  const int *bmode_costs = mb->mbmode_cost[0];
-  const int is_lossless = xd->lossless[mbmi->segment_id];
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-  const TX_SIZE tx_size = is_lossless ? TX_4X4 : max_txsize_rect_lookup[bsize];
-#else
-  const TX_SIZE tx_size = TX_4X4;
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
-
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-  mbmi->intra_filter = INTRA_FILTER_LINEAR;
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
-#endif  // CONFIG_FILTER_INTRA
-
-  // TODO(any): Add search of the tx_type to improve rd performance at the
-  // expense of speed.
-  mbmi->tx_type = DCT_DCT;
-  mbmi->tx_size = tx_size;
-#if CONFIG_LGT_FROM_PRED
-  mbmi->use_lgt = 0;
-#endif
-
-  if (y_skip) *y_skip = 1;
-
-  // Pick modes for each prediction sub-block (of size 4x4, 4x8, or 8x4) in this
-  // 8x8 coding block.
-  for (idy = 0; idy < 2; idy += pred_height_in_4x4_blocks) {
-    for (idx = 0; idx < 2; idx += pred_width_in_4x4_blocks) {
-      PREDICTION_MODE best_mode = DC_PRED;
-      int r = INT_MAX, ry = INT_MAX;
-      int64_t d = INT64_MAX, this_rd = INT64_MAX;
-      int j;
-      const int pred_block_idx = idy * 2 + idx;
-      if (cpi->common.frame_type == KEY_FRAME) {
-        const PREDICTION_MODE A =
-            av1_above_block_mode(mic, above_mi, pred_block_idx);
-        const PREDICTION_MODE L =
-            av1_left_block_mode(mic, left_mi, pred_block_idx);
-
-#if CONFIG_KF_CTX
-        const int above_ctx = intra_mode_context[A];
-        const int left_ctx = intra_mode_context[L];
-        bmode_costs = mb->y_mode_costs[above_ctx][left_ctx];
-#else
-        bmode_costs = mb->y_mode_costs[A][L];
-#endif
-      }
-      this_rd = rd_pick_intra_sub_8x8_y_subblock_mode(
-          cpi, mb, idy, idx, &best_mode, bmode_costs,
-          xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r,
-          &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd);
-#if CONFIG_DIST_8X8
-      if (!cpi->oxcf.using_dist_8x8)
-#endif
-        if (this_rd >= best_rd - total_rd) return INT64_MAX;
-
-      total_rd += this_rd;
-      cost += r;
-      total_distortion += d;
-      tot_rate_y += ry;
-
-      mic->bmi[pred_block_idx].as_mode = best_mode;
-      for (j = 1; j < pred_height_in_4x4_blocks; ++j)
-        mic->bmi[pred_block_idx + j * 2].as_mode = best_mode;
-      for (j = 1; j < pred_width_in_4x4_blocks; ++j)
-        mic->bmi[pred_block_idx + j].as_mode = best_mode;
-
-      if (total_rd >= best_rd) return INT64_MAX;
-    }
-  }
-  mbmi->mode = mic->bmi[3].as_mode;
-
-#if CONFIG_DIST_8X8
-  if (cpi->oxcf.using_dist_8x8) {
-    const struct macroblock_plane *p = &mb->plane[0];
-    const struct macroblockd_plane *pd = &xd->plane[0];
-    const int src_stride = p->src.stride;
-    const int dst_stride = pd->dst.stride;
-    uint8_t *src = p->src.buf;
-    uint8_t *dst = pd->dst.buf;
-
-    // Daala-defined distortion computed for the block of 8x8 pixels
-    total_distortion = av1_dist_8x8(cpi, mb, src, src_stride, dst, dst_stride,
-                                    BLOCK_8X8, 8, 8, 8, 8, mb->qindex)
-                       << 4;
-  }
-#endif  // CONFIG_DIST_8X8
-  // Add in the cost of the transform type
-  if (!is_lossless) {
-    int rate_tx_type = 0;
-#if CONFIG_EXT_TX
-    if (get_ext_tx_types(tx_size, bsize, 0, cpi->common.reduced_tx_set_used) >
-        1) {
-      const int eset =
-          get_ext_tx_set(tx_size, bsize, 0, cpi->common.reduced_tx_set_used);
-#if CONFIG_LGT_FROM_PRED
-      if (LGT_FROM_PRED_INTRA && is_lgt_allowed(mbmi->mode, tx_size))
-        rate_tx_type += mb->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode]
-                                          [mbmi->use_lgt];
-      if (!LGT_FROM_PRED_INTRA || !mbmi->use_lgt)
-#endif  // CONFIG_LGT_FROM_PRED
-        rate_tx_type += mb->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]]
-                                               [mbmi->mode][mbmi->tx_type];
-    }
-#else
-    rate_tx_type =
-        mb->intra_tx_type_costs[txsize_sqr_map[tx_size]]
-                               [intra_mode_to_tx_type_context[mbmi->mode]]
-                               [mbmi->tx_type];
-#endif  // CONFIG_EXT_TX
-    assert(mbmi->tx_size == tx_size);
-    cost += rate_tx_type;
-    tot_rate_y += rate_tx_type;
-  }
-
-  *rate = cost;
-  *rate_y = tot_rate_y;
-  *distortion = total_distortion;
-
-  return RDCOST(mb->rdmult, cost, total_distortion);
-}
-
-#if CONFIG_FILTER_INTRA
 // Return 1 if an filter intra mode is selected; return 0 otherwise.
 static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     int *rate, int *rate_tokenonly,
                                     int64_t *distortion, int *skippable,
                                     BLOCK_SIZE bsize, int mode_cost,
                                     int64_t *best_rd, int64_t *best_model_rd,
-                                    uint16_t skip_mask) {
+                                    PICK_MODE_CONTEXT *ctx) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mi[0];
-  MB_MODE_INFO *mbmi = &mic->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   int filter_intra_selected_flag = 0;
   FILTER_INTRA_MODE mode;
-  TX_SIZE best_tx_size = TX_4X4;
+  TX_SIZE best_tx_size = TX_8X8;
   FILTER_INTRA_MODE_INFO filter_intra_mode_info;
-  TX_TYPE best_tx_type;
-#if CONFIG_LGT_FROM_PRED
-  int use_lgt_when_selected;
-#endif
-
+  TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+  (void)ctx;
   av1_zero(filter_intra_mode_info);
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 1;
+  mbmi->filter_intra_mode_info.use_filter_intra = 1;
   mbmi->mode = DC_PRED;
   mbmi->palette_mode_info.palette_size[0] = 0;
 
   for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
-    int this_rate;
     int64_t this_rd, this_model_rd;
     RD_STATS tokenonly_rd_stats;
-    if (skip_mask & (1 << mode)) continue;
-    mbmi->filter_intra_mode_info.filter_intra_mode[0] = mode;
+    mbmi->filter_intra_mode_info.filter_intra_mode = mode;
     this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
     if (*best_model_rd != INT64_MAX &&
         this_model_rd > *best_model_rd + (*best_model_rd >> 1))
@@ -3879,19 +3657,19 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
     if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
     super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
     if (tokenonly_rd_stats.rate == INT_MAX) continue;
-    this_rate = tokenonly_rd_stats.rate +
-                av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 1) +
-                write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost;
+    const int this_rate =
+        tokenonly_rd_stats.rate +
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
 
     if (this_rd < *best_rd) {
       *best_rd = this_rd;
-      best_tx_size = mic->mbmi.tx_size;
+      best_tx_size = mbmi->tx_size;
       filter_intra_mode_info = mbmi->filter_intra_mode_info;
-      best_tx_type = mic->mbmi.tx_type;
-#if CONFIG_LGT_FROM_PRED
-      use_lgt_when_selected = mic->mbmi.use_lgt;
-#endif
+      memcpy(best_txk_type, mbmi->txk_type,
+             sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
       *rate = this_rate;
       *rate_tokenonly = tokenonly_rd_stats.rate;
       *distortion = tokenonly_rd_stats.dist;
@@ -3903,43 +3681,31 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
   if (filter_intra_selected_flag) {
     mbmi->mode = DC_PRED;
     mbmi->tx_size = best_tx_size;
-#if CONFIG_LGT_FROM_PRED
-    mbmi->use_lgt = use_lgt_when_selected;
-#endif
-    mbmi->filter_intra_mode_info.use_filter_intra_mode[0] =
-        filter_intra_mode_info.use_filter_intra_mode[0];
-    mbmi->filter_intra_mode_info.filter_intra_mode[0] =
-        filter_intra_mode_info.filter_intra_mode[0];
-    mbmi->tx_type = best_tx_type;
+    mbmi->filter_intra_mode_info = filter_intra_mode_info;
+    memcpy(mbmi->txk_type, best_txk_type,
+           sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
     return 1;
   } else {
     return 0;
   }
 }
-#endif  // CONFIG_FILTER_INTRA
 
-#if CONFIG_EXT_INTRA
 // Run RD calculation with given luma intra prediction angle., and return
 // the RD cost. Update the best mode info. if the RD cost is the best so far.
 static int64_t calc_rd_given_intra_angle(
     const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
     int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
     RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
-    TX_TYPE *best_tx_type,
-#if CONFIG_LGT_FROM_PRED
-    int *use_lgt_when_selected,
-#endif
-#if CONFIG_INTRA_INTERP
-    INTRA_FILTER *best_filter,
-#endif  // CONFIG_INTRA_INTERP
-    int64_t *best_rd, int64_t *best_model_rd) {
+    int64_t *best_rd, int64_t *best_model_rd, TX_TYPE *best_txk_type,
+    uint8_t *best_blk_skip) {
   int this_rate;
   RD_STATS tokenonly_rd_stats;
   int64_t this_rd, this_model_rd;
-  MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+  const int n4 = bsize_to_num_blk(bsize);
   assert(!is_inter_block(mbmi));
 
-  mbmi->angle_delta[0] = angle_delta;
+  mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
   this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
   if (*best_model_rd != INT64_MAX &&
       this_model_rd > *best_model_rd + (*best_model_rd >> 1))
@@ -3948,22 +3714,19 @@ static int64_t calc_rd_given_intra_angle(
   super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in);
   if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
 
-  this_rate = tokenonly_rd_stats.rate + mode_cost +
-              write_uniform_cost(2 * max_angle_delta + 1,
-                                 mbmi->angle_delta[0] + max_angle_delta);
+  this_rate =
+      tokenonly_rd_stats.rate + mode_cost +
+      x->angle_delta_cost[mbmi->mode - V_PRED]
+                         [max_angle_delta + mbmi->angle_delta[PLANE_TYPE_Y]];
   this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
 
   if (this_rd < *best_rd) {
+    memcpy(best_txk_type, mbmi->txk_type,
+           sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+    memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
     *best_rd = this_rd;
-    *best_angle_delta = mbmi->angle_delta[0];
+    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y];
     *best_tx_size = mbmi->tx_size;
-#if CONFIG_INTRA_INTERP
-    *best_filter = mbmi->intra_filter;
-#endif  // CONFIG_INTRA_INTERP
-    *best_tx_type = mbmi->tx_type;
-#if CONFIG_LGT_FROM_PRED
-    *use_lgt_when_selected = mbmi->use_lgt;
-#endif
     *rate = this_rate;
     rd_stats->rate = tokenonly_rd_stats.rate;
     rd_stats->dist = tokenonly_rd_stats.dist;
@@ -3980,131 +3743,60 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        int64_t best_rd,
                                        int64_t *best_model_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mi[0];
-  MB_MODE_INFO *mbmi = &mic->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
   int i, angle_delta, best_angle_delta = 0;
   int first_try = 1;
-#if CONFIG_INTRA_INTERP
-  int p_angle;
-  const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
-  INTRA_FILTER filter, best_filter = INTRA_FILTER_LINEAR;
-#endif  // CONFIG_INTRA_INTERP
   int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
-  TX_SIZE best_tx_size = mic->mbmi.tx_size;
-  TX_TYPE best_tx_type = mbmi->tx_type;
-#if CONFIG_LGT_FROM_PRED
-  int use_lgt_when_selected = mbmi->use_lgt;
-#endif
+  TX_SIZE best_tx_size = mbmi->tx_size;
+  const int n4 = bsize_to_num_blk(bsize);
+  TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
 
   for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
 
   for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-#if CONFIG_INTRA_INTERP
-    for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
-      if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue;
-      mic->mbmi.intra_filter = filter;
-#endif  // CONFIG_INTRA_INTERP
-      for (i = 0; i < 2; ++i) {
-        best_rd_in = (best_rd == INT64_MAX)
-                         ? INT64_MAX
-                         : (best_rd + (best_rd >> (first_try ? 3 : 5)));
-        this_rd = calc_rd_given_intra_angle(
-            cpi, x, bsize,
-#if CONFIG_INTRA_INTERP
-            mode_cost + x->intra_filter_cost[intra_filter_ctx][filter],
-#else
-          mode_cost,
-#endif  // CONFIG_INTRA_INTERP
-            best_rd_in, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
-            rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
-#if CONFIG_LGT_FROM_PRED
-            &use_lgt_when_selected,
-#endif
-#if CONFIG_INTRA_INTERP
-            &best_filter,
-#endif  // CONFIG_INTRA_INTERP
-            &best_rd, best_model_rd);
-        rd_cost[2 * angle_delta + i] = this_rd;
-        if (first_try && this_rd == INT64_MAX) return best_rd;
-        first_try = 0;
-        if (angle_delta == 0) {
-          rd_cost[1] = this_rd;
-          break;
-        }
+    for (i = 0; i < 2; ++i) {
+      best_rd_in = (best_rd == INT64_MAX)
+                       ? INT64_MAX
+                       : (best_rd + (best_rd >> (first_try ? 3 : 5)));
+      this_rd = calc_rd_given_intra_angle(
+          cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta,
+          MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
+          &best_rd, best_model_rd, best_txk_type, best_blk_skip);
+      rd_cost[2 * angle_delta + i] = this_rd;
+      if (first_try && this_rd == INT64_MAX) return best_rd;
+      first_try = 0;
+      if (angle_delta == 0) {
+        rd_cost[1] = this_rd;
+        break;
       }
-#if CONFIG_INTRA_INTERP
     }
-#endif  // CONFIG_INTRA_INTERP
   }
 
   assert(best_rd != INT64_MAX);
   for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
     int64_t rd_thresh;
-#if CONFIG_INTRA_INTERP
-    for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
-      if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue;
-      mic->mbmi.intra_filter = filter;
-#endif  // CONFIG_INTRA_INTERP
-      for (i = 0; i < 2; ++i) {
-        int skip_search = 0;
-        rd_thresh = best_rd + (best_rd >> 5);
-        if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
-            rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
-          skip_search = 1;
-        if (!skip_search) {
-          calc_rd_given_intra_angle(
-              cpi, x, bsize,
-#if CONFIG_INTRA_INTERP
-              mode_cost + x->intra_filter_cost[intra_filter_ctx][filter],
-#else
-            mode_cost,
-#endif  // CONFIG_INTRA_INTERP
-              best_rd, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
-              rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
-#if CONFIG_LGT_FROM_PRED
-              &use_lgt_when_selected,
-#endif
-#if CONFIG_INTRA_INTERP
-              &best_filter,
-#endif  // CONFIG_INTRA_INTERP
-              &best_rd, best_model_rd);
-        }
-      }
-#if CONFIG_INTRA_INTERP
-    }
-#endif  // CONFIG_INTRA_INTERP
-  }
-
-#if CONFIG_INTRA_INTERP
-  if (FILTER_FAST_SEARCH && rd_stats->rate < INT_MAX) {
-    p_angle = mode_to_angle_map[mbmi->mode] + best_angle_delta * ANGLE_STEP;
-    if (av1_is_intra_filter_switchable(p_angle)) {
-      for (filter = INTRA_FILTER_LINEAR + 1; filter < INTRA_FILTERS; ++filter) {
-        mic->mbmi.intra_filter = filter;
-        this_rd = calc_rd_given_intra_angle(
-            cpi, x, bsize,
-            mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], best_rd,
-            best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
-            &best_angle_delta, &best_tx_size, &best_tx_type,
-#if CONFIG_LGT_FROM_PRED
-            &use_lgt_when_selected,
-#endif
-            &best_filter, &best_rd, best_model_rd);
+    for (i = 0; i < 2; ++i) {
+      int skip_search = 0;
+      rd_thresh = best_rd + (best_rd >> 5);
+      if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+          rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+        skip_search = 1;
+      if (!skip_search) {
+        calc_rd_given_intra_angle(
+            cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta,
+            MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
+            &best_rd, best_model_rd, best_txk_type, best_blk_skip);
       }
     }
   }
-#endif  // CONFIG_INTRA_INTERP
 
   mbmi->tx_size = best_tx_size;
-  mbmi->angle_delta[0] = best_angle_delta;
-#if CONFIG_INTRA_INTERP
-  mic->mbmi.intra_filter = best_filter;
-#endif  // CONFIG_INTRA_INTERP
-  mbmi->tx_type = best_tx_type;
-#if CONFIG_LGT_FROM_PRED
-  mbmi->use_lgt = use_lgt_when_selected;
-#endif
+  mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
+  memcpy(mbmi->txk_type, best_txk_type,
+         sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+  memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
   return best_rd;
 }
 
@@ -4173,7 +3865,7 @@ static void angle_estimation(const uint8_t *src, int src_stride, int rows,
   uint64_t hist_sum = 0;
   for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
   for (i = 0; i < INTRA_MODES; ++i) {
-    if (av1_is_directional_mode(i, bsize)) {
+    if (av1_is_directional_mode(i)) {
       const uint8_t angle_bin = mode_to_angle_bin[i];
       uint64_t score = 2 * hist[angle_bin];
       int weight = 2;
@@ -4191,7 +3883,6 @@ static void angle_estimation(const uint8_t *src, int src_stride, int rows,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
                                     int rows, int cols, BLOCK_SIZE bsize,
                                     uint8_t *directional_mode_skip_mask) {
@@ -4229,7 +3920,7 @@ static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
   uint64_t hist_sum = 0;
   for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
   for (i = 0; i < INTRA_MODES; ++i) {
-    if (av1_is_directional_mode(i, bsize)) {
+    if (av1_is_directional_mode(i)) {
       const uint8_t angle_bin = mode_to_angle_bin[i];
       uint64_t score = 2 * hist[angle_bin];
       int weight = 2;
@@ -4246,119 +3937,102 @@ static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_EXT_INTRA
+
+// Given selected prediction mode, search for the best tx type and size.
+static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, const int *bmode_costs,
+                            int64_t *best_rd, int *rate, int *rate_tokenonly,
+                            int64_t *distortion, int *skippable,
+                            MB_MODE_INFO *best_mbmi, PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  RD_STATS rd_stats;
+  super_block_yrd(cpi, x, &rd_stats, bsize, *best_rd);
+  if (rd_stats.rate == INT_MAX) return;
+  int this_rate_tokenonly = rd_stats.rate;
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
+    // super_block_yrd above includes the cost of the tx_size in the
+    // tokenonly rate, but for intra blocks, tx_size is always coded
+    // (prediction granularity), so we account for it in the full rate,
+    // not the tokenonly rate.
+    this_rate_tokenonly -= tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size);
+  }
+  const int this_rate =
+      rd_stats.rate +
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+  const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist);
+  if (this_rd < *best_rd) {
+    *best_mbmi = *mbmi;
+    *best_rd = this_rd;
+    *rate = this_rate;
+    *rate_tokenonly = this_rate_tokenonly;
+    *distortion = rd_stats.dist;
+    *skippable = rd_stats.skip;
+    memcpy(ctx->blk_skip, x->blk_skip,
+           sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+  }
+}
 
 // This function is used only for intra_only frames
 static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                       int *rate, int *rate_tokenonly,
                                       int64_t *distortion, int *skippable,
-                                      BLOCK_SIZE bsize, int64_t best_rd) {
+                                      BLOCK_SIZE bsize, int64_t best_rd,
+                                      PICK_MODE_CONTEXT *ctx) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mic->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
-  MB_MODE_INFO best_mbmi = *mbmi;
   int64_t best_model_rd = INT64_MAX;
-#if CONFIG_EXT_INTRA
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
-#if CONFIG_INTRA_INTERP
-  const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
-#endif  // CONFIG_INTRA_INTERP
   int is_directional_mode;
   uint8_t directional_mode_skip_mask[INTRA_MODES];
   const int src_stride = x->plane[0].src.stride;
   const uint8_t *src = x->plane[0].src.buf;
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
   int beat_best_rd = 0;
-  uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1;
-#endif  // CONFIG_FILTER_INTRA
   const int *bmode_costs;
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  int palette_y_mode_ctx = 0;
   const int try_palette =
       av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
   uint8_t *best_palette_color_map =
       try_palette ? x->palette_buffer->best_palette_color_map : NULL;
-  const MODE_INFO *above_mi = xd->above_mi;
-  const MODE_INFO *left_mi = xd->left_mi;
-  const PREDICTION_MODE A = av1_above_block_mode(mic, above_mi, 0);
-  const PREDICTION_MODE L = av1_left_block_mode(mic, left_mi, 0);
-  const PREDICTION_MODE FINAL_MODE_SEARCH = TM_PRED + 1;
-#if CONFIG_PVQ
-  od_rollback_buffer pre_buf, post_buf;
-
-  od_encode_checkpoint(&x->daala_enc, &pre_buf);
-  od_encode_checkpoint(&x->daala_enc, &post_buf);
-#endif  // CONFIG_PVQ
-
-#if CONFIG_KF_CTX
+  const MB_MODE_INFO *above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *left_mi = xd->left_mbmi;
+  const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+  const PREDICTION_MODE L = av1_left_block_mode(left_mi);
   const int above_ctx = intra_mode_context[A];
   const int left_ctx = intra_mode_context[L];
   bmode_costs = x->y_mode_costs[above_ctx][left_ctx];
-#else
-  bmode_costs = x->y_mode_costs[A][L];
-#endif
 
-#if CONFIG_EXT_INTRA
-  mbmi->angle_delta[0] = 0;
-#if CONFIG_HIGHBITDEPTH
+  mbmi->angle_delta[PLANE_TYPE_Y] = 0;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     highbd_angle_estimation(src, src_stride, rows, cols, bsize,
                             directional_mode_skip_mask);
   else
-#endif  // CONFIG_HIGHBITDEPTH
     angle_estimation(src, src_stride, rows, cols, bsize,
                      directional_mode_skip_mask);
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
-#endif  // CONFIG_FILTER_INTRA
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
   pmi->palette_size[0] = 0;
-  if (try_palette) {
-    if (above_mi) {
-      palette_y_mode_ctx +=
-          (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-    }
-    if (left_mi) {
-      palette_y_mode_ctx +=
-          (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-    }
-  }
 
   if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
     x->use_default_intra_tx_type = 1;
   else
     x->use_default_intra_tx_type = 0;
 
+  MB_MODE_INFO best_mbmi = *mbmi;
   /* Y Search for intra prediction mode */
-  for (int mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) {
+  for (int mode_idx = DC_PRED; mode_idx < INTRA_MODES; ++mode_idx) {
     RD_STATS this_rd_stats;
     int this_rate, this_rate_tokenonly, s;
     int64_t this_distortion, this_rd, this_model_rd;
-    if (mode_idx == FINAL_MODE_SEARCH) {
-      if (x->use_default_intra_tx_type == 0) break;
-      mbmi->mode = best_mbmi.mode;
-      x->use_default_intra_tx_type = 0;
-    } else {
-      assert(mode_idx < INTRA_MODES);
-      mbmi->mode = intra_rd_search_mode_order[mode_idx];
-    }
-#if CONFIG_PVQ
-    od_encode_rollback(&x->daala_enc, &pre_buf);
-#endif  // CONFIG_PVQ
-#if CONFIG_EXT_INTRA
-    mbmi->angle_delta[0] = 0;
-#endif  // CONFIG_EXT_INTRA
+    mbmi->mode = intra_rd_search_mode_order[mode_idx];
+    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
     this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]);
     if (best_model_rd != INT64_MAX &&
         this_model_rd > best_model_rd + (best_model_rd >> 1))
       continue;
     if (this_model_rd < best_model_rd) best_model_rd = this_model_rd;
-#if CONFIG_EXT_INTRA
-    is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize);
+    is_directional_mode = av1_is_directional_mode(mbmi->mode);
     if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
     if (is_directional_mode && av1_use_angle_delta(bsize)) {
       this_rd_stats.rate = INT_MAX;
@@ -4367,97 +4041,61 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     } else {
       super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
     }
-#else
-    super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
-#endif  // CONFIG_EXT_INTRA
     this_rate_tokenonly = this_rd_stats.rate;
     this_distortion = this_rd_stats.dist;
     s = this_rd_stats.skip;
 
     if (this_rate_tokenonly == INT_MAX) continue;
 
-    this_rate = this_rate_tokenonly + bmode_costs[mbmi->mode];
-
     if (!xd->lossless[mbmi->segment_id] &&
         block_signals_txsize(mbmi->sb_type)) {
       // super_block_yrd above includes the cost of the tx_size in the
       // tokenonly rate, but for intra blocks, tx_size is always coded
       // (prediction granularity), so we account for it in the full rate,
       // not the tokenonly rate.
-      this_rate_tokenonly -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
-    }
-    if (try_palette && mbmi->mode == DC_PRED) {
-      this_rate +=
-          av1_cost_bit(av1_default_palette_y_mode_prob[bsize - BLOCK_8X8]
-                                                      [palette_y_mode_ctx],
-                       0);
-    }
-#if CONFIG_FILTER_INTRA
-    if (mbmi->mode == DC_PRED)
-      this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 0);
-#endif  // CONFIG_FILTER_INTRA
-#if CONFIG_EXT_INTRA
-    if (is_directional_mode) {
-#if CONFIG_INTRA_INTERP
-      const int p_angle =
-          mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
-      if (av1_is_intra_filter_switchable(p_angle))
-        this_rate += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
-#endif  // CONFIG_INTRA_INTERP
-      if (av1_use_angle_delta(bsize)) {
-        this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
-                                        MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
-      }
-    }
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_INTRABC
-    if (bsize >= BLOCK_8X8 && cpi->common.allow_screen_content_tools)
-      this_rate += x->intrabc_cost[0];
-#endif  // CONFIG_INTRABC
-    this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
-#if CONFIG_FILTER_INTRA
-    if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) {
-      filter_intra_mode_skip_mask ^= (1 << mbmi->mode);
+      this_rate_tokenonly -=
+          tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size);
     }
-#endif  // CONFIG_FILTER_INTRA
-
+    this_rate =
+        this_rd_stats.rate +
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+    this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
     if (this_rd < best_rd) {
       best_mbmi = *mbmi;
       best_rd = this_rd;
-#if CONFIG_FILTER_INTRA
       beat_best_rd = 1;
-#endif  // CONFIG_FILTER_INTRA
       *rate = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
       *distortion = this_distortion;
       *skippable = s;
-#if CONFIG_PVQ
-      od_encode_checkpoint(&x->daala_enc, &post_buf);
-#endif  // CONFIG_PVQ
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
     }
   }
 
-#if CONFIG_PVQ
-  od_encode_rollback(&x->daala_enc, &post_buf);
-#endif  // CONFIG_PVQ
-
   if (try_palette) {
-    rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx,
-                              bmode_costs[DC_PRED], &best_mbmi,
+    rd_pick_palette_intra_sby(cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi,
                               best_palette_color_map, &best_rd, &best_model_rd,
-                              rate, rate_tokenonly, distortion, skippable);
+                              rate, rate_tokenonly, distortion, skippable, ctx,
+                              ctx->blk_skip);
   }
 
-#if CONFIG_FILTER_INTRA
-  if (beat_best_rd) {
+  if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
     if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
                                  skippable, bsize, bmode_costs[DC_PRED],
-                                 &best_rd, &best_model_rd,
-                                 filter_intra_mode_skip_mask)) {
+                                 &best_rd, &best_model_rd, ctx)) {
       best_mbmi = *mbmi;
     }
   }
-#endif  // CONFIG_FILTER_INTRA
+
+  // If previous searches use only the default tx type, do an extra search for
+  // the best tx type.
+  if (x->use_default_intra_tx_type) {
+    *mbmi = best_mbmi;
+    x->use_default_intra_tx_type = 0;
+    intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, rate_tokenonly,
+                    distortion, skippable, &best_mbmi, ctx);
+  }
 
   *mbmi = best_mbmi;
   return best_rd;
@@ -4469,33 +4107,29 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                             RD_STATS *rd_stats, BLOCK_SIZE bsize,
                             int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]);
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U];
+  const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
   int plane;
   int is_cost_valid = 1;
   av1_init_rd_stats(rd_stats);
 
   if (ref_best_rd < 0) is_cost_valid = 0;
 
-#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
   if (x->skip_chroma_rd) return is_cost_valid;
 
-  bsize = scale_chroma_bsize(bsize, xd->plane[1].subsampling_x,
-                             xd->plane[1].subsampling_y);
-#endif  // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
+  bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
 
-#if !CONFIG_PVQ
   if (is_inter_block(mbmi) && is_cost_valid) {
     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
       av1_subtract_plane(x, bsize, plane);
   }
-#endif  // !CONFIG_PVQ
 
   if (is_cost_valid) {
     for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
       RD_STATS pn_rd_stats;
       txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, plane, bsize,
-                       uv_tx_size, cpi->sf.use_fast_coef_costing);
+                       uv_tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
       if (pn_rd_stats.rate == INT_MAX) {
         is_cost_valid = 0;
         break;
@@ -4517,283 +4151,222 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   return is_cost_valid;
 }
 
-#if CONFIG_VAR_TX
-void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
-                       int blk_row, int blk_col, int plane, int block,
-                       int plane_bsize, const ENTROPY_CONTEXT *a,
-                       const ENTROPY_CONTEXT *l, RD_STATS *rd_stats) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
+static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+                          int blk_row, int blk_col, int plane, int block,
+                          int plane_bsize, const ENTROPY_CONTEXT *a,
+                          const ENTROPY_CONTEXT *l, RD_STATS *rd_stats,
+                          FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost,
+                          TXB_RD_INFO *rd_info_array) {
   const struct macroblock_plane *const p = &x->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-
-#if CONFIG_TXK_SEL
-  av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
-                      tx_size, a, l, 0, rd_stats);
-  return;
-#endif
-
-  int64_t tmp;
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-  uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-  PLANE_TYPE plane_type = get_plane_type(plane);
-  TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
-  const SCAN_ORDER *const scan_order =
-      get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
-  BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
-  int bh = block_size_high[txm_bsize];
-  int bw = block_size_wide[txm_bsize];
-  int src_stride = p->src.stride;
-  uint8_t *src =
-      &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
-  uint8_t *dst =
-      &pd->dst
-           .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
-#if CONFIG_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, rec_buffer16[MAX_TX_SQUARE]);
-  uint8_t *rec_buffer;
-#else
-  DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]);
-#endif  // CONFIG_HIGHBITDEPTH
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int16_t *diff =
-      &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
-  int txb_coeff_cost;
-
-  assert(tx_size < TX_SIZES_ALL);
-
-  int coeff_ctx = get_entropy_context(tx_size, a, l);
-
-  tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col,
-                        plane_bsize, txm_bsize);
-
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
-#endif  // CONFIG_HIGHBITDEPTH
-  rd_stats->sse += tmp << 4;
-
-  if (rd_stats->invalid_rate) {
-    rd_stats->dist += tmp << 4;
-    rd_stats->rate += rd_stats->zero_rate;
-    rd_stats->skip = 1;
-    return;
-  }
-
-// TODO(any): Use av1_dist_block to compute distortion
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16);
-    aom_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL,
-                             0, NULL, 0, bw, bh, xd->bd);
-  } else {
-    rec_buffer = (uint8_t *)rec_buffer16;
-    aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0,
-                      NULL, 0, bw, bh);
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+  const uint16_t cur_joint_ctx =
+      (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx;
+
+  const int txk_type_idx =
+      av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+  // Look up RD and terminate early in case when we've already processed exactly
+  // the same residual with exactly the same entropy context.
+  if (rd_info_array != NULL && rd_info_array->valid &&
+      rd_info_array->entropy_context == cur_joint_ctx) {
+    if (plane == 0)
+      x->e_mbd.mi[0]->txk_type[txk_type_idx] = rd_info_array->tx_type;
+    const TX_TYPE ref_tx_type =
+        av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col,
+                        tx_size, cpi->common.reduced_tx_set_used);
+    if (ref_tx_type == rd_info_array->tx_type) {
+      rd_stats->rate += rd_info_array->rate;
+      rd_stats->dist += rd_info_array->dist;
+      rd_stats->sse += rd_info_array->sse;
+      rd_stats->skip &= rd_info_array->eob == 0;
+      p->eobs[block] = rd_info_array->eob;
+      p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
+      return;
+    }
   }
-#else
-  aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL,
-                    0, bw, bh);
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if DISABLE_TRELLISQ_SEARCH
-  av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  coeff_ctx, AV1_XFORM_QUANT_B);
-
-#else
-  av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  coeff_ctx, AV1_XFORM_QUANT_FP);
-
-  const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  const int buffer_length = tx_size_2d[tx_size];
-  int64_t tmp_dist, tmp_sse;
-#if CONFIG_DIST_8X8
-  int disable_early_skip =
-      x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 &&
-      (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) &&
-      x->tune_metric != AOM_TUNE_PSNR;
-#endif  // CONFIG_DIST_8X8
 
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    tmp_dist =
-        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp_sse, xd->bd);
-  else
-#endif
-    tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp_sse);
-
-  tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift);
-
-#if CONFIG_MRC_TX
-  if (tx_type == MRC_DCT && !xd->mi[0]->mbmi.valid_mrc_mask) {
-    av1_invalid_rd_stats(rd_stats);
-    return;
-  }
-#endif  // CONFIG_MRC_TX
-  if (
-#if CONFIG_DIST_8X8
-      disable_early_skip ||
-#endif
-      RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) {
-    av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l, 1);
-  } else {
-    rd_stats->rate += rd_stats->zero_rate;
-    rd_stats->dist += tmp << 4;
-    rd_stats->skip = 1;
-    rd_stats->invalid_rate = 1;
-    return;
+  RD_STATS this_rd_stats;
+  search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  &txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
+
+  av1_merge_rd_stats(rd_stats, &this_rd_stats);
+
+  // Save RD results for possible reuse in future.
+  if (rd_info_array != NULL) {
+    rd_info_array->valid = 1;
+    rd_info_array->entropy_context = cur_joint_ctx;
+    rd_info_array->rate = this_rd_stats.rate;
+    rd_info_array->dist = this_rd_stats.dist;
+    rd_info_array->sse = this_rd_stats.sse;
+    rd_info_array->eob = p->eobs[block];
+    rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block];
+    if (plane == 0) {
+      rd_info_array->tx_type = x->e_mbd.mi[0]->txk_type[txk_type_idx];
+    }
+  }
+}
+
+static void get_mean_and_dev(const int16_t *data, int stride, int bw, int bh,
+                             float *mean, float *dev) {
+  int x_sum = 0;
+  uint64_t x2_sum = 0;
+  for (int i = 0; i < bh; ++i) {
+    for (int j = 0; j < bw; ++j) {
+      const int val = data[j];
+      x_sum += val;
+      x2_sum += val * val;
+    }
+    data += stride;
+  }
+
+  const int num = bw * bh;
+  const float e_x = (float)x_sum / num;
+  const float e_x2 = (float)((double)x2_sum / num);
+  const float diff = e_x2 - e_x * e_x;
+  *dev = (diff > 0) ? sqrtf(diff) : 0;
+  *mean = e_x;
+}
+
+static void get_mean_and_dev_float(const float *data, int stride, int bw,
+                                   int bh, float *mean, float *dev) {
+  float x_sum = 0;
+  float x2_sum = 0;
+  for (int i = 0; i < bh; ++i) {
+    for (int j = 0; j < bw; ++j) {
+      const float val = data[j];
+      x_sum += val;
+      x2_sum += val * val;
+    }
+    data += stride;
+  }
+
+  const int num = bw * bh;
+  const float e_x = x_sum / num;
+  const float e_x2 = x2_sum / num;
+  const float diff = e_x2 - e_x * e_x;
+  *dev = (diff > 0) ? sqrtf(diff) : 0;
+  *mean = e_x;
+}
+
+// Feature used by the model to predict tx split: the mean and standard
+// deviation values of the block and sub-blocks.
+static void get_mean_dev_features(const int16_t *data, int stride, int bw,
+                                  int bh, int levels, float *feature) {
+  int feature_idx = 0;
+  int width = bw;
+  int height = bh;
+  const int16_t *const data_ptr = &data[0];
+  for (int lv = 0; lv < levels; ++lv) {
+    if (width < 2 || height < 2) break;
+    float mean_buf[16];
+    float dev_buf[16];
+    int blk_idx = 0;
+    for (int row = 0; row < bh; row += height) {
+      for (int col = 0; col < bw; col += width) {
+        float mean, dev;
+        get_mean_and_dev(data_ptr + row * stride + col, stride, width, height,
+                         &mean, &dev);
+        feature[feature_idx++] = mean;
+        feature[feature_idx++] = dev;
+        mean_buf[blk_idx] = mean;
+        dev_buf[blk_idx++] = dev;
+      }
+    }
+    if (blk_idx > 1) {
+      float mean, dev;
+      // Deviation of means.
+      get_mean_and_dev_float(mean_buf, 1, 1, blk_idx, &mean, &dev);
+      feature[feature_idx++] = dev;
+      // Mean of deviations.
+      get_mean_and_dev_float(dev_buf, 1, 1, blk_idx, &mean, &dev);
+      feature[feature_idx++] = mean;
+    }
+    // Reduce the block size when proceeding to the next level.
+    if (height == width) {
+      height = height >> 1;
+      width = width >> 1;
+    } else if (height > width) {
+      height = height >> 1;
+    } else {
+      width = width >> 1;
+    }
   }
-#endif  // DISABLE_TRELLISQ_SEARCH
+}
 
-  const int eob = p->eobs[block];
+static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row,
+                               int blk_col, TX_SIZE tx_size) {
+  const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size];
+  if (!nn_config) return -1;
 
-  av1_inverse_transform_block(xd, dqcoeff,
-#if CONFIG_LGT_FROM_PRED
-                              xd->mi[0]->mbmi.mode,
-#endif
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                              mrc_mask,
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                              tx_type, tx_size, rec_buffer, MAX_TX_SIZE, eob);
-  if (eob > 0) {
-#if CONFIG_DIST_8X8
-    if (x->using_dist_8x8 && plane == 0 && (bw < 8 && bh < 8)) {
-      // Save sub8x8 luma decoded pixels
-      // since 8x8 luma decoded pixels are not available for daala-dist
-      // after recursive split of BLOCK_8x8 is done.
-      const int pred_stride = block_size_wide[plane_bsize];
-      const int pred_idx = (blk_row * pred_stride + blk_col)
-                           << tx_size_wide_log2[0];
-      int16_t *decoded = &pd->pred[pred_idx];
-      int i, j;
+  const int diff_stride = block_size_wide[bsize];
+  const int16_t *diff =
+      x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  aom_clear_system_state();
 
-#if CONFIG_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        for (j = 0; j < bh; j++)
-          for (i = 0; i < bw; i++)
-            decoded[j * pred_stride + i] =
-                CONVERT_TO_SHORTPTR(rec_buffer)[j * MAX_TX_SIZE + i];
-      } else {
-#endif
-        for (j = 0; j < bh; j++)
-          for (i = 0; i < bw; i++)
-            decoded[j * pred_stride + i] = rec_buffer[j * MAX_TX_SIZE + i];
-#if CONFIG_HIGHBITDEPTH
-      }
-#endif  // CONFIG_HIGHBITDEPTH
-    }
-#endif  // CONFIG_DIST_8X8
-    tmp = pixel_dist(cpi, x, plane, src, src_stride, rec_buffer, MAX_TX_SIZE,
-                     blk_row, blk_col, plane_bsize, txm_bsize);
-  }
-  rd_stats->dist += tmp * 16;
-  txb_coeff_cost = av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block,
-                                   tx_size, scan_order, a, l, 0);
-  rd_stats->rate += txb_coeff_cost;
-  rd_stats->skip &= (eob == 0);
+  float features[64] = { 0.0f };
+  get_mean_dev_features(diff, diff_stride, bw, bh, 2, features);
 
-#if CONFIG_RD_DEBUG
-  av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
-                            txb_coeff_cost);
-#endif  // CONFIG_RD_DEBUG
+  float score = 0.0f;
+  av1_nn_predict(features, nn_config, &score);
+  if (score > 8.0f) return 100;
+  if (score < -8.0f) return 0;
+  score = 1.0f / (1.0f + (float)exp(-score));
+  return (int)(score * 100);
 }
 
+// Search for the best tx partition/type for a given luma block.
 static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
-                            int blk_col, int plane, int block, TX_SIZE tx_size,
-                            int depth, BLOCK_SIZE plane_bsize,
-                            ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
-                            TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
-                            RD_STATS *rd_stats, int64_t ref_best_rd,
-                            int *is_cost_valid) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  struct macroblock_plane *const p = &x->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int tx_row = blk_row >> (1 - pd->subsampling_y);
-  const int tx_col = blk_col >> (1 - pd->subsampling_x);
-  TX_SIZE(*const inter_tx_size)
-  [MAX_MIB_SIZE] =
-      (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
-  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-  const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-  int64_t this_rd = INT64_MAX;
-  ENTROPY_CONTEXT *pta = ta + blk_col;
-  ENTROPY_CONTEXT *ptl = tl + blk_row;
-  int i;
-  int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
-                                   mbmi->sb_type, tx_size);
-  int64_t sum_rd = INT64_MAX;
-  int tmp_eob = 0;
-  int zero_blk_rate;
-  RD_STATS sum_rd_stats;
-#if CONFIG_TXK_SEL
-  TX_TYPE best_tx_type = TX_TYPES;
-  int txk_idx = (blk_row << 4) + blk_col;
-#endif
-#if CONFIG_RECT_TX_EXT
-  TX_SIZE quarter_txsize = quarter_txsize_lookup[mbmi->sb_type];
-  int check_qttx = is_quarter_tx_allowed(xd, mbmi, is_inter_block(mbmi)) &&
-                   tx_size == max_txsize_rect_lookup[mbmi->sb_type] &&
-                   quarter_txsize != tx_size;
-  int is_qttx_picked = 0;
-  int eobs_qttx[2] = { 0, 0 };
-  int skip_qttx[2] = { 0, 0 };
-  int block_offset_qttx = check_qttx
-                              ? tx_size_wide_unit[quarter_txsize] *
-                                    tx_size_high_unit[quarter_txsize]
-                              : 0;
-  int blk_row_offset, blk_col_offset;
-  int is_wide_qttx =
-      tx_size_wide_unit[quarter_txsize] > tx_size_high_unit[quarter_txsize];
-  blk_row_offset = is_wide_qttx ? tx_size_high_unit[quarter_txsize] : 0;
-  blk_col_offset = is_wide_qttx ? 0 : tx_size_wide_unit[quarter_txsize];
-#endif
-
-  av1_init_rd_stats(&sum_rd_stats);
-
+                            int blk_col, int block, TX_SIZE tx_size, int depth,
+                            BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+                            ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
+                            TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
+                            int64_t ref_best_rd, int *is_cost_valid,
+                            FAST_TX_SEARCH_MODE ftxs_mode,
+                            TXB_RD_INFO_NODE *rd_info_node) {
   assert(tx_size < TX_SIZES_ALL);
-
+  av1_init_rd_stats(rd_stats);
   if (ref_best_rd < 0) {
     *is_cost_valid = 0;
     return;
   }
 
-  av1_init_rd_stats(rd_stats);
-
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
-#if CONFIG_LV_MAP
-  TX_SIZE txs_ctx = get_txsize_context(tx_size);
-  TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, plane, pta, ptl, &txb_ctx);
-
-#if LV_MAP_PROB
-  zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(plane)]
-                      .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
-#else
-  zero_blk_rate =
-      av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_ctx.txb_skip_ctx], 1);
-#endif  // LV_MAP_PROB
-#else
-  TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size];
-  int coeff_ctx = get_entropy_context(tx_size, pta, ptl);
-  zero_blk_rate =
-      x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0];
-#endif
-
-  rd_stats->ref_rdcost = ref_best_rd;
-  rd_stats->zero_rate = zero_blk_rate;
-  if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
-    inter_tx_size[0][0] = tx_size;
-    av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
-                      plane_bsize, pta, ptl, rd_stats);
-    if (rd_stats->rate == INT_MAX) return;
+  const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+  ENTROPY_CONTEXT *pta = ta + blk_col;
+  ENTROPY_CONTEXT *ptl = tl + blk_row;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+                                         mbmi->sb_type, tx_size);
+  struct macroblock_plane *const p = &x->plane[0];
+
+  const int try_no_split = 1;
+  int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
+
+  int64_t no_split_rd = INT64_MAX;
+  int no_split_txb_entropy_ctx = 0;
+  TX_TYPE no_split_tx_type = TX_TYPES;
+  // TX no split
+  if (try_no_split) {
+    const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+    TXB_CTX txb_ctx;
+    get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
+    const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+                                  .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+
+    rd_stats->ref_rdcost = ref_best_rd;
+    rd_stats->zero_rate = zero_blk_rate;
+    const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
+    mbmi->inter_tx_size[index] = tx_size;
+    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta,
+                  ptl, rd_stats, ftxs_mode, ref_best_rd,
+                  rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
+    assert(rd_stats->rate < INT_MAX);
 
     if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
              RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
@@ -4806,187 +4379,111 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
       rd_stats->rate = zero_blk_rate;
       rd_stats->dist = rd_stats->sse;
       rd_stats->skip = 1;
-      x->blk_skip[plane][blk_row * bw + blk_col] = 1;
+      x->blk_skip[blk_row * bw + blk_col] = 1;
       p->eobs[block] = 0;
-#if CONFIG_TXK_SEL
-      mbmi->txk_type[txk_idx] = DCT_DCT;
-#endif
+      update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                       DCT_DCT);
     } else {
-      x->blk_skip[plane][blk_row * bw + blk_col] = 0;
+      x->blk_skip[blk_row * bw + blk_col] = 0;
       rd_stats->skip = 0;
     }
 
     if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-      rd_stats->rate +=
-          av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
-#if CONFIG_RECT_TX_EXT
-    if (check_qttx) {
-      assert(blk_row == 0 && blk_col == 0);
-      rd_stats->rate += av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 0);
+      rd_stats->rate += x->txfm_partition_cost[ctx][0];
+    no_split_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+    if (cpi->sf.adaptive_txb_search_level &&
+        (no_split_rd -
+         (no_split_rd >> (1 + cpi->sf.adaptive_txb_search_level))) >
+            ref_best_rd) {
+      *is_cost_valid = 0;
+      return;
     }
-#endif
-    this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-#if CONFIG_LV_MAP
-    tmp_eob = p->txb_entropy_ctx[block];
-#else
-    tmp_eob = p->eobs[block];
-#endif
-
-#if CONFIG_TXK_SEL
-    best_tx_type = mbmi->txk_type[txk_idx];
-#endif
-
-#if CONFIG_RECT_TX_EXT
-    if (check_qttx) {
-      assert(blk_row == 0 && blk_col == 0 && block == 0 && plane == 0);
 
-      RD_STATS rd_stats_tmp, rd_stats_qttx;
-      int64_t rd_qttx;
+    no_split_txb_entropy_ctx = p->txb_entropy_ctx[block];
+    const int txk_type_idx =
+        av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+    no_split_tx_type = mbmi->txk_type[txk_type_idx];
 
-      av1_init_rd_stats(&rd_stats_qttx);
-      av1_init_rd_stats(&rd_stats_tmp);
-
-      av1_tx_block_rd_b(cpi, x, quarter_txsize, 0, 0, plane, 0, plane_bsize,
-                        pta, ptl, &rd_stats_qttx);
-      if (rd_stats->rate == INT_MAX) return;
-
-      tx_size_ctx = txsize_sqr_map[quarter_txsize];
-      coeff_ctx = get_entropy_context(quarter_txsize, pta, ptl);
-      zero_blk_rate =
-          x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0];
-      if ((RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist) >=
-               RDCOST(x->rdmult, zero_blk_rate, rd_stats_qttx.sse) ||
-           rd_stats_qttx.skip == 1) &&
-          !xd->lossless[mbmi->segment_id]) {
-#if CONFIG_RD_DEBUG
-        av1_update_txb_coeff_cost(&rd_stats_qttx, plane, quarter_txsize, 0, 0,
-                                  zero_blk_rate - rd_stats_qttx.rate);
-#endif  // CONFIG_RD_DEBUG
-        rd_stats_qttx.rate = zero_blk_rate;
-        rd_stats_qttx.dist = rd_stats_qttx.sse;
-        rd_stats_qttx.skip = 1;
-        x->blk_skip[plane][blk_row * bw + blk_col] = 1;
-        skip_qttx[0] = 1;
-        p->eobs[block] = 0;
-      } else {
-        x->blk_skip[plane][blk_row * bw + blk_col] = 0;
-        skip_qttx[0] = 0;
-        rd_stats->skip = 0;
-      }
-
-      // Second tx block
-      av1_tx_block_rd_b(cpi, x, quarter_txsize, blk_row_offset, blk_col_offset,
-                        plane, block_offset_qttx, plane_bsize, pta, ptl,
-                        &rd_stats_tmp);
-
-      if (rd_stats->rate == INT_MAX) return;
-
-#if !CONFIG_PVQ
-      av1_set_txb_context(x, plane, 0, quarter_txsize, pta, ptl);
-#endif  // !CONFIG_PVQ
-      coeff_ctx = get_entropy_context(quarter_txsize, pta + blk_col_offset,
-                                      ptl + blk_row_offset);
-      zero_blk_rate =
-          x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0];
-      if ((RDCOST(x->rdmult, rd_stats_tmp.rate, rd_stats_tmp.dist) >=
-               RDCOST(x->rdmult, zero_blk_rate, rd_stats_tmp.sse) ||
-           rd_stats_tmp.skip == 1) &&
-          !xd->lossless[mbmi->segment_id]) {
-#if CONFIG_RD_DEBUG
-        av1_update_txb_coeff_cost(&rd_stats_tmp, plane, quarter_txsize, 0, 0,
-                                  zero_blk_rate - rd_stats_tmp.rate);
-#endif  // CONFIG_RD_DEBUG
-        rd_stats_tmp.rate = zero_blk_rate;
-        rd_stats_tmp.dist = rd_stats_tmp.sse;
-        rd_stats_tmp.skip = 1;
-        x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 1;
-        skip_qttx[1] = 1;
-        p->eobs[block_offset_qttx] = 0;
-      } else {
-        x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 0;
-        skip_qttx[1] = 0;
-        rd_stats_tmp.skip = 0;
-      }
-
-      av1_merge_rd_stats(&rd_stats_qttx, &rd_stats_tmp);
+    if (cpi->sf.txb_split_cap)
+      if (p->eobs[block] == 0) try_split = 0;
+  }
 
-      if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) {
-        rd_stats_qttx.rate +=
-            av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
-      }
-      rd_stats_qttx.rate +=
-          av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 1);
-      rd_qttx = RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist);
-#if CONFIG_LV_MAP
-      eobs_qttx[0] = p->txb_entropy_ctx[0];
-      eobs_qttx[1] = p->txb_entropy_ctx[block_offset_qttx];
-#else
-      eobs_qttx[0] = p->eobs[0];
-      eobs_qttx[1] = p->eobs[block_offset_qttx];
-#endif
-      if (rd_qttx < this_rd) {
-        is_qttx_picked = 1;
-        this_rd = rd_qttx;
-        rd_stats->rate = rd_stats_qttx.rate;
-        rd_stats->dist = rd_stats_qttx.dist;
-        rd_stats->sse = rd_stats_qttx.sse;
-        rd_stats->skip = rd_stats_qttx.skip;
-        rd_stats->rdcost = rd_stats_qttx.rdcost;
-      }
-      av1_get_entropy_contexts(plane_bsize, 0, pd, ta, tl);
+  if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) {
+    const int threshold = cpi->sf.tx_type_search.ml_tx_split_thresh;
+    if (threshold >= 0) {
+      const int split_score =
+          ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size);
+      if (split_score >= 0 && split_score < threshold) try_split = 0;
     }
-#endif
   }
 
-  if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH
-#if CONFIG_MRC_TX
-      // If the tx type we are trying is MRC_DCT, we cannot partition the
-      // transform into anything smaller than TX_32X32
-      && mbmi->tx_type != MRC_DCT
-#endif  // CONFIG_MRC_TX
-      ) {
+#if COLLECT_TX_SIZE_DATA
+  // Do not skip tx_split when collecting tx size data.
+  try_split = 1;
+#endif
+
+  // TX split
+  int64_t split_rd = INT64_MAX;
+  RD_STATS split_rd_stats;
+  av1_init_rd_stats(&split_rd_stats);
+  if (try_split) {
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsl = tx_size_wide_unit[sub_txs];
-    int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int sub_step = bsw * bsh;
     RD_STATS this_rd_stats;
     int this_cost_valid = 1;
     int64_t tmp_rd = 0;
 #if CONFIG_DIST_8X8
-    int sub8x8_eob[4];
+    int sub8x8_eob[4] = { 0, 0, 0, 0 };
+    struct macroblockd_plane *const pd = &xd->plane[0];
 #endif
-    sum_rd_stats.rate =
-        av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
+    split_rd_stats.rate = x->txfm_partition_cost[ctx][1];
 
     assert(tx_size < TX_SIZES_ALL);
 
-    ref_best_rd = AOMMIN(this_rd, ref_best_rd);
-
-    for (i = 0; i < 4 && this_cost_valid; ++i) {
-      int offsetr = blk_row + (i >> 1) * bsl;
-      int offsetc = blk_col + (i & 0x01) * bsl;
-
-      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+    ref_best_rd = AOMMIN(no_split_rd, ref_best_rd);
+
+    int blk_idx = 0;
+    for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
+      for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) {
+        const int offsetr = blk_row + r;
+        const int offsetc = blk_col + c;
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+        assert(blk_idx < 4);
+        select_tx_block(
+            cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize,
+            ta, tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd,
+            &this_cost_valid, ftxs_mode,
+            (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
 
-      select_tx_block(cpi, x, offsetr, offsetc, plane, block, sub_txs,
-                      depth + 1, plane_bsize, ta, tl, tx_above, tx_left,
-                      &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid);
 #if CONFIG_DIST_8X8
-      if (x->using_dist_8x8 && plane == 0 && tx_size == TX_8X8) {
-        sub8x8_eob[i] = p->eobs[block];
-      }
+        if (!x->using_dist_8x8)
+#endif
+          if (!this_cost_valid) goto LOOP_EXIT;
+#if CONFIG_DIST_8X8
+        if (x->using_dist_8x8 && tx_size == TX_8X8) {
+          sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block];
+        }
 #endif  // CONFIG_DIST_8X8
-      av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats);
+        av1_merge_rd_stats(&split_rd_stats, &this_rd_stats);
 
-      tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist);
+        tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist);
 #if CONFIG_DIST_8X8
-      if (!x->using_dist_8x8)
+        if (!x->using_dist_8x8)
 #endif
-        if (this_rd < tmp_rd) break;
-      block += sub_step;
+          if (no_split_rd < tmp_rd) {
+            this_cost_valid = 0;
+            goto LOOP_EXIT;
+          }
+        block += sub_step;
+      }
     }
+
+  LOOP_EXIT : {}
+
 #if CONFIG_DIST_8X8
-    if (x->using_dist_8x8 && this_cost_valid && plane == 0 &&
-        tx_size == TX_8X8) {
+    if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) {
       const int src_stride = p->src.stride;
       const int dst_stride = pd->dst.stride;
 
@@ -4997,34 +4494,33 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                .buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
 
       int64_t dist_8x8;
-      int qindex = x->qindex;
+      const int qindex = x->qindex;
       const int pred_stride = block_size_wide[plane_bsize];
       const int pred_idx = (blk_row * pred_stride + blk_col)
                            << tx_size_wide_log2[0];
-      int16_t *pred = &pd->pred[pred_idx];
-      int j;
+      const int16_t *pred = &x->pred_luma[pred_idx];
+      int i, j;
       int row, col;
 
-#if CONFIG_HIGHBITDEPTH
       uint8_t *pred8;
       DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]);
-#else
-      DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]);
-#endif  // CONFIG_HIGHBITDEPTH
 
       dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
                               BLOCK_8X8, 8, 8, 8, 8, qindex) *
                  16;
-      sum_rd_stats.sse = dist_8x8;
 
-#if CONFIG_HIGHBITDEPTH
+#ifdef DEBUG_DIST_8X8
+      if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
+        assert(sum_rd_stats.sse == dist_8x8);
+#endif  // DEBUG_DIST_8X8
+
+      split_rd_stats.sse = dist_8x8;
+
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
         pred8 = CONVERT_TO_BYTEPTR(pred8_16);
       else
         pred8 = (uint8_t *)pred8_16;
-#endif
 
-#if CONFIG_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
         for (row = 0; row < 2; ++row) {
           for (col = 0; col < 2; ++col) {
@@ -5047,7 +4543,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
           }
         }
       } else {
-#endif
         for (row = 0; row < 2; ++row) {
           for (col = 0; col < 2; ++col) {
             int idx = row * 2 + col;
@@ -5066,87 +4561,99 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
             }
           }
         }
-#if CONFIG_HIGHBITDEPTH
       }
-#endif  // CONFIG_HIGHBITDEPTH
       dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8,
                               8, 8, 8, qindex) *
                  16;
-      sum_rd_stats.dist = dist_8x8;
-      tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist);
-    }
-#endif  // CONFIG_DIST_8X8
-    if (this_cost_valid) sum_rd = tmp_rd;
-  }
 
-  if (this_rd < sum_rd) {
-    int idx, idy;
-#if CONFIG_RECT_TX_EXT
-    TX_SIZE tx_size_selected = is_qttx_picked ? quarter_txsize : tx_size;
-#else
-    TX_SIZE tx_size_selected = tx_size;
-#endif
+#ifdef DEBUG_DIST_8X8
+      if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
+        assert(sum_rd_stats.dist == dist_8x8);
+#endif  // DEBUG_DIST_8X8
 
-#if CONFIG_RECT_TX_EXT
-    if (is_qttx_picked) {
-      assert(blk_row == 0 && blk_col == 0 && plane == 0);
-#if CONFIG_LV_MAP
-      p->txb_entropy_ctx[0] = eobs_qttx[0];
-      p->txb_entropy_ctx[block_offset_qttx] = eobs_qttx[1];
-#else
-      p->eobs[0] = eobs_qttx[0];
-      p->eobs[block_offset_qttx] = eobs_qttx[1];
-#endif
-    } else {
-#endif
-#if CONFIG_LV_MAP
-      p->txb_entropy_ctx[block] = tmp_eob;
-#else
-    p->eobs[block] = tmp_eob;
-#endif
-#if CONFIG_RECT_TX_EXT
+      split_rd_stats.dist = dist_8x8;
+      tmp_rd = RDCOST(x->rdmult, split_rd_stats.rate, split_rd_stats.dist);
     }
-#endif
+#endif  // CONFIG_DIST_8X8
+    if (this_cost_valid) split_rd = tmp_rd;
+  }
 
-#if !CONFIG_PVQ
-    av1_set_txb_context(x, plane, block, tx_size_selected, pta, ptl);
-#if CONFIG_RECT_TX_EXT
-    if (is_qttx_picked)
-      av1_set_txb_context(x, plane, block_offset_qttx, tx_size_selected,
-                          pta + blk_col_offset, ptl + blk_row_offset);
-#endif  // CONFIG_RECT_TX_EXT
-#endif  // !CONFIG_PVQ
+#if COLLECT_TX_SIZE_DATA
+  do {
+    if (tx_size <= TX_4X4 || depth >= MAX_VARTX_DEPTH) break;
 
+#if 0
+    // Randomly select blocks to collect data to reduce output file size.
+    const int rnd_val = rand() % 2;
+    if (rnd_val) break;
+#endif
+
+    const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+    const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+    const int within_border =
+        mi_row >= xd->tile.mi_row_start &&
+        (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
+        mi_col >= xd->tile.mi_col_start &&
+        (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
+    if (!within_border) break;
+
+    FILE *fp = fopen(av1_tx_size_data_output_file, "a");
+    if (!fp) break;
+
+    // Split decision, RD cost, block type(inter/intra), q-index, rdmult,
+    // and block size.
+    const int split_selected = sum_rd < this_rd;
+    const int is_inter = 1;
+    const int txb_w = tx_size_wide[tx_size];
+    const int txb_h = tx_size_high[tx_size];
+    fprintf(fp, "%d,%lld,%lld,%d,%d,%d,%d,%d,", split_selected,
+            (long long)this_rd, (long long)sum_rd, cpi->common.base_qindex,
+            x->rdmult, is_inter, txb_w, txb_h);
+
+    // Residue signal.
+    const int diff_stride = block_size_wide[plane_bsize];
+    const int16_t *src_diff =
+        &p->src_diff[(blk_row * diff_stride + blk_col) * 4];
+    for (int r = 0; r < txb_h; ++r) {
+      for (int c = 0; c < txb_w; ++c) {
+        fprintf(fp, "%d,", src_diff[c]);
+      }
+      src_diff += diff_stride;
+    }
+    fprintf(fp, "\n");
+
+    fclose(fp);
+  } while (0);
+#endif  // COLLECT_TX_SIZE_DATA
+
+  if (no_split_rd < split_rd) {
+    const TX_SIZE tx_size_selected = tx_size;
+    p->txb_entropy_ctx[block] = no_split_txb_entropy_ctx;
+    av1_set_txb_context(x, 0, block, tx_size_selected, pta, ptl);
     txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
                           tx_size);
-    inter_tx_size[0][0] = tx_size_selected;
-    for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
-      for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
-        inter_tx_size[idy][idx] = tx_size_selected;
-    mbmi->tx_size = tx_size_selected;
-#if CONFIG_TXK_SEL
-    mbmi->txk_type[txk_idx] = best_tx_type;
-#endif
-    if (this_rd == INT64_MAX) *is_cost_valid = 0;
-#if CONFIG_RECT_TX_EXT
-    if (is_qttx_picked) {
-      x->blk_skip[plane][0] = skip_qttx[0];
-      x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = skip_qttx[1];
-    } else {
-#endif
-      x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
-#if CONFIG_RECT_TX_EXT
+    for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+      for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+        const int index =
+            av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx);
+        mbmi->inter_tx_size[index] = tx_size_selected;
+      }
     }
-#endif
+    mbmi->tx_size = tx_size_selected;
+    update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                     no_split_tx_type);
+    x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip;
   } else {
-    *rd_stats = sum_rd_stats;
-    if (sum_rd == INT64_MAX) *is_cost_valid = 0;
+    *rd_stats = split_rd_stats;
+    if (split_rd == INT64_MAX) *is_cost_valid = 0;
   }
 }
 
-static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
-                            RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                            int64_t ref_best_rd) {
+static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                                   int64_t ref_best_rd,
+                                   FAST_TX_SEARCH_MODE ftxs_mode,
+                                   TXB_RD_INFO_NODE *rd_info_tree) {
   MACROBLOCKD *const xd = &x->e_mbd;
   int is_cost_valid = 1;
   int64_t this_rd = 0;
@@ -5157,48 +4664,57 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
 
   if (is_cost_valid) {
     const struct macroblockd_plane *const pd = &xd->plane[0];
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-    const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-    const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int mi_width = mi_size_wide[plane_bsize];
+    const int mi_height = mi_size_high[plane_bsize];
     const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
     const int bh = tx_size_high_unit[max_tx_size];
     const int bw = tx_size_wide_unit[max_tx_size];
     int idx, idy;
     int block = 0;
-    int init_depth =
-        (mi_height != mi_width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT;
     int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
-    ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
-    ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
-    TXFM_CONTEXT tx_above[MAX_MIB_SIZE * 2];
-    TXFM_CONTEXT tx_left[MAX_MIB_SIZE * 2];
+    ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+    ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+    TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+    TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
 
     RD_STATS pn_rd_stats;
+    const int init_depth =
+        get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
     av1_init_rd_stats(&pn_rd_stats);
 
-    av1_get_entropy_contexts(bsize, 0, pd, ctxa, ctxl);
+    av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
     memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
     memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
 
     for (idy = 0; idy < mi_height; idy += bh) {
       for (idx = 0; idx < mi_width; idx += bw) {
-        select_tx_block(cpi, x, idy, idx, 0, block, max_tx_size, init_depth,
+        select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth,
                         plane_bsize, ctxa, ctxl, tx_above, tx_left,
-                        &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid);
-        if (pn_rd_stats.rate == INT_MAX) {
+                        &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid,
+                        ftxs_mode, rd_info_tree);
+        if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
           av1_invalid_rd_stats(rd_stats);
           return;
         }
         av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-        this_rd += AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
-                          RDCOST(x->rdmult, 0, pn_rd_stats.sse));
+        this_rd +=
+            AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
+                   RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
         block += step;
+        if (rd_info_tree != NULL) rd_info_tree += 1;
       }
     }
   }
-
-  this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
-                   RDCOST(x->rdmult, 0, rd_stats->sse));
+  int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse);
+  this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+  if (zero_rd < this_rd) {
+    this_rd = zero_rd;
+    rd_stats->rate = rd_stats->zero_rate;
+    rd_stats->dist = rd_stats->sse;
+    rd_stats->skip = 1;
+  }
   if (this_rd > ref_best_rd) is_cost_valid = 0;
 
   if (!is_cost_valid) {
@@ -5209,541 +4725,711 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
 
 static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                                       int64_t ref_best_rd, TX_TYPE tx_type) {
-  const AV1_COMMON *const cm = &cpi->common;
+                                       int64_t ref_best_rd,
+                                       TXB_RD_INFO_NODE *rd_info_tree) {
+  const int fast_tx_search = cpi->sf.tx_size_search_method > USE_FULL_RD;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const int is_inter = is_inter_block(mbmi);
-  aom_prob skip_prob = av1_get_skip_prob(cm, xd);
-  int s0 = av1_cost_bit(skip_prob, 0);
-  int s1 = av1_cost_bit(skip_prob, 1);
+  const int skip_ctx = av1_get_skip_context(xd);
+  int s0 = x->skip_cost[skip_ctx][0];
+  int s1 = x->skip_cost[skip_ctx][1];
   int64_t rd;
-  int row, col;
-  const int max_blocks_high = max_block_high(xd, bsize, 0);
-  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
 
-  mbmi->tx_type = tx_type;
-  inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd);
-  mbmi->min_tx_size = get_min_tx_size(mbmi->inter_tx_size[0][0]);
+  // TODO(debargha): enable this as a speed feature where the
+  // select_inter_block_yrd() function above will use a simplified search
+  // such as not using full optimize, but the inter_block_yrd() function
+  // will use more complex search given that the transform partitions have
+  // already been decided.
+
+  int64_t rd_thresh = ref_best_rd;
+  if (fast_tx_search && rd_thresh < INT64_MAX) {
+    if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
+  }
+  assert(rd_thresh > 0);
 
+  FAST_TX_SEARCH_MODE ftxs_mode =
+      fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
+  select_inter_block_yrd(cpi, x, rd_stats, bsize, rd_thresh, ftxs_mode,
+                         rd_info_tree);
   if (rd_stats->rate == INT_MAX) return INT64_MAX;
 
-  for (row = 0; row < max_blocks_high / 2; ++row)
-    for (col = 0; col < max_blocks_wide / 2; ++col)
-      mbmi->min_tx_size = AOMMIN(
-          mbmi->min_tx_size, get_min_tx_size(mbmi->inter_tx_size[row][col]));
-
-#if !CONFIG_TXK_SEL
-#if CONFIG_EXT_TX
-  if (get_ext_tx_types(mbmi->min_tx_size, bsize, is_inter,
-                       cm->reduced_tx_set_used) > 1 &&
-      !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-    const int ext_tx_set = get_ext_tx_set(mbmi->min_tx_size, bsize, is_inter,
-                                          cm->reduced_tx_set_used);
-#if CONFIG_LGT_FROM_PRED
-    if (is_lgt_allowed(mbmi->mode, mbmi->min_tx_size)) {
-      if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 &&
-          ALLOW_INTRA_EXT_TX)
-        rd_stats->rate += x->intra_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]]
-                                           [mbmi->mode][mbmi->use_lgt];
-      if (LGT_FROM_PRED_INTER && is_inter && ext_tx_set > 0)
-        rd_stats->rate +=
-            x->inter_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]][mbmi->use_lgt];
-    }
-    if (!mbmi->use_lgt) {
-#endif  // CONFIG_LGT_FROM_PRED
-      if (is_inter) {
-        if (ext_tx_set > 0)
-          rd_stats->rate +=
-              x->inter_tx_type_costs[ext_tx_set]
-                                    [txsize_sqr_map[mbmi->min_tx_size]]
-                                    [mbmi->tx_type];
-      } else {
-        if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
-          rd_stats->rate +=
-              x->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode]
-                                    [mbmi->tx_type];
-      }
-    }
-#if CONFIG_LGT_FROM_PRED
+  // If fast_tx_search is true, only DCT and 1D DCT were tested in
+  // select_inter_block_yrd() above. Do a better search for tx type with
+  // tx sizes already decided.
+  if (fast_tx_search) {
+    if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE))
+      return INT64_MAX;
   }
-#endif
-#else
-  if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id])
-    rd_stats->rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type];
-#endif  // CONFIG_EXT_TX
-#endif  // CONFIG_TXK_SEL
 
   if (rd_stats->skip)
     rd = RDCOST(x->rdmult, s1, rd_stats->sse);
   else
     rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
 
-  if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-      !(rd_stats->skip))
+  if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip))
     rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
 
   return rd;
 }
 
-static uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  const int rows = block_size_high[bsize];
-  const int cols = block_size_wide[bsize];
-  const int diff_stride = cols;
-  const struct macroblock_plane *const p = &x->plane[0];
-  const int16_t *diff = &p->src_diff[0];
-  uint8_t hash_data[MAX_SB_SQUARE];
-  for (int r = 0; r < rows; ++r) {
-    for (int c = 0; c < cols; ++c) {
-      hash_data[cols * r + c] = clip_pixel(diff[c] + 128);
+// Finds rd cost for a y block, given the transform size partitions
+static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+                         int blk_col, int block, TX_SIZE tx_size,
+                         BLOCK_SIZE plane_bsize, int depth,
+                         ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx,
+                         TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+                         int64_t ref_best_rd, RD_STATS *rd_stats,
+                         FAST_TX_SEARCH_MODE ftxs_mode) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+
+  assert(tx_size < TX_SIZES_ALL);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+      plane_bsize, blk_row, blk_col)];
+
+  int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+                                   mbmi->sb_type, tx_size);
+
+  av1_init_rd_stats(rd_stats);
+  if (tx_size == plane_tx_size) {
+    ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+    ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+    const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+    TXB_CTX txb_ctx;
+    get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx);
+
+    const int zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(0)]
+                                  .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+    rd_stats->zero_rate = zero_blk_rate;
+    rd_stats->ref_rdcost = ref_best_rd;
+    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, ta,
+                  tl, rd_stats, ftxs_mode, ref_best_rd, NULL);
+    const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+    if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+            RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+        rd_stats->skip == 1) {
+      rd_stats->rate = zero_blk_rate;
+      rd_stats->dist = rd_stats->sse;
+      rd_stats->skip = 1;
+      x->blk_skip[blk_row * mi_width + blk_col] = 1;
+      x->plane[0].eobs[block] = 0;
+      x->plane[0].txb_entropy_ctx[block] = 0;
+      update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                       DCT_DCT);
+    } else {
+      rd_stats->skip = 0;
+      x->blk_skip[blk_row * mi_width + blk_col] = 0;
     }
-    diff += diff_stride;
+    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+      rd_stats->rate += x->txfm_partition_cost[ctx][0];
+    av1_set_txb_context(x, 0, block, tx_size, ta, tl);
+    txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+                          tx_size);
+  } else {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int step = bsh * bsw;
+    RD_STATS pn_rd_stats;
+    int64_t this_rd = 0;
+    assert(bsw > 0 && bsh > 0);
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+        av1_init_rd_stats(&pn_rd_stats);
+        tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize,
+                     depth + 1, above_ctx, left_ctx, tx_above, tx_left,
+                     ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+        if (pn_rd_stats.rate == INT_MAX) {
+          av1_invalid_rd_stats(rd_stats);
+          return;
+        }
+        av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+        this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist);
+        block += step;
+      }
+    }
+
+    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+      rd_stats->rate += x->txfm_partition_cost[ctx][1];
   }
-  return (av1_get_crc_value(&x->tx_rd_record.crc_calculator, hash_data,
-                            rows * cols)
-          << 7) +
-         bsize;
+}
+
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                           RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                           int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int is_cost_valid = 1;
+  int64_t this_rd = 0;
+
+  if (ref_best_rd < 0) is_cost_valid = 0;
+
+  av1_init_rd_stats(rd_stats);
+
+  if (is_cost_valid) {
+    const struct macroblockd_plane *const pd = &xd->plane[0];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int mi_width = mi_size_wide[plane_bsize];
+    const int mi_height = mi_size_high[plane_bsize];
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+    const int bh = tx_size_high_unit[max_tx_size];
+    const int bw = tx_size_wide_unit[max_tx_size];
+    const int init_depth =
+        get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
+    int idx, idy;
+    int block = 0;
+    int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+    ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+    ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+    TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+    TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+    RD_STATS pn_rd_stats;
+
+    av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+    memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+    memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bw) {
+        av1_init_rd_stats(&pn_rd_stats);
+        tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, plane_bsize,
+                     init_depth, ctxa, ctxl, tx_above, tx_left,
+                     ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+        if (pn_rd_stats.rate == INT_MAX) {
+          av1_invalid_rd_stats(rd_stats);
+          return 0;
+        }
+        av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+        this_rd +=
+            AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
+                   RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+        block += step;
+      }
+    }
+  }
+  int64_t zero_rd = RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse);
+  this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+  if (zero_rd < this_rd) {
+    this_rd = zero_rd;
+    rd_stats->rate = rd_stats->zero_rate;
+    rd_stats->dist = rd_stats->sse;
+    rd_stats->skip = 1;
+  }
+  if (this_rd > ref_best_rd) is_cost_valid = 0;
+
+  if (!is_cost_valid) {
+    // reset cost value
+    av1_invalid_rd_stats(rd_stats);
+  }
+  return is_cost_valid;
+}
+
+static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  const int rows = block_size_high[bsize];
+  const int cols = block_size_wide[bsize];
+  const int16_t *diff = x->plane[0].src_diff;
+  const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
+                                             (uint8_t *)diff, 2 * rows * cols);
+  return (hash << 5) + bsize;
 }
 
 static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x,
                             const RD_STATS *const rd_stats,
-                            TX_RD_INFO *const tx_rd_info) {
+                            MB_RD_RECORD *tx_rd_record) {
+  int index;
+  if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
+    index =
+        (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
+    ++tx_rd_record->num;
+  } else {
+    index = tx_rd_record->index_start;
+    tx_rd_record->index_start =
+        (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+  }
+  MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index];
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   tx_rd_info->hash_value = hash;
-  tx_rd_info->tx_type = mbmi->tx_type;
   tx_rd_info->tx_size = mbmi->tx_size;
-#if CONFIG_VAR_TX
-  tx_rd_info->min_tx_size = mbmi->min_tx_size;
-  memcpy(tx_rd_info->blk_skip, x->blk_skip[0],
+  memcpy(tx_rd_info->blk_skip, x->blk_skip,
          sizeof(tx_rd_info->blk_skip[0]) * n4);
-  for (int idy = 0; idy < xd->n8_h; ++idy)
-    for (int idx = 0; idx < xd->n8_w; ++idx)
-      tx_rd_info->inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
-#endif  // CONFIG_VAR_TX
-#if CONFIG_TXK_SEL
+  av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
   av1_copy(tx_rd_info->txk_type, mbmi->txk_type);
-#endif  // CONFIG_TXK_SEL
   tx_rd_info->rd_stats = *rd_stats;
 }
 
-static void fetch_tx_rd_info(int n4, const TX_RD_INFO *const tx_rd_info,
+static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info,
                              RD_STATS *const rd_stats, MACROBLOCK *const x) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  mbmi->tx_type = tx_rd_info->tx_type;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   mbmi->tx_size = tx_rd_info->tx_size;
-#if CONFIG_VAR_TX
-  mbmi->min_tx_size = tx_rd_info->min_tx_size;
-  memcpy(x->blk_skip[0], tx_rd_info->blk_skip,
+  memcpy(x->blk_skip, tx_rd_info->blk_skip,
          sizeof(tx_rd_info->blk_skip[0]) * n4);
-  for (int idy = 0; idy < xd->n8_h; ++idy)
-    for (int idx = 0; idx < xd->n8_w; ++idx)
-      mbmi->inter_tx_size[idy][idx] = tx_rd_info->inter_tx_size[idy][idx];
-#endif  // CONFIG_VAR_TX
-#if CONFIG_TXK_SEL
+  av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size);
   av1_copy(mbmi->txk_type, tx_rd_info->txk_type);
-#endif  // CONFIG_TXK_SEL
   *rd_stats = tx_rd_info->rd_stats;
 }
 
+static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
+                                const uint32_t hash) {
+  // Linear search through the circular buffer to find matching hash.
+  int index;
+  for (int i = cur_record->num - 1; i >= 0; i--) {
+    index = (cur_record->index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN;
+    if (cur_record->hash_vals[index] == hash) return index;
+  }
+
+  // If not found - add new RD info into the buffer and return its index
+  if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) {
+    index = (cur_record->index_start + cur_record->num) %
+            TX_SIZE_RD_RECORD_BUFFER_LEN;
+    cur_record->num++;
+  } else {
+    index = cur_record->index_start;
+    cur_record->index_start =
+        (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN;
+  }
+
+  cur_record->hash_vals[index] = hash;
+  av1_zero(cur_record->tx_rd_info[index]);
+  return index;
+}
+
+// Go through all TX blocks that could be used in TX size search, compute
+// residual hash values for them and find matching RD info that stores previous
+// RD search results for these TX blocks. The idea is to prevent repeated
+// rate/distortion computations that happen because of the combination of
+// partition and TX size search. The resulting RD info records are returned in
+// the form of a quadtree for easier access in actual TX size search.
+static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+                                   int mi_col, TXB_RD_INFO_NODE *dst_rd_info) {
+  TXB_RD_RECORD *rd_records_table[4] = { x->txb_rd_record_8X8,
+                                         x->txb_rd_record_16X16,
+                                         x->txb_rd_record_32X32,
+                                         x->txb_rd_record_64X64 };
+  const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize];
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+
+  // Hashing is performed only for square TX sizes larger than TX_4X4
+  if (max_square_tx_size < TX_8X8) return 0;
+
+  const int bw_mi = mi_size_wide[bsize];
+  const int diff_stride = bw;
+  const struct macroblock_plane *const p = &x->plane[0];
+  const int16_t *diff = &p->src_diff[0];
+
+  // Coordinates of the top-left corner of current block within the superblock
+  // measured in pixels:
+  const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
+  const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
+  int cur_rd_info_idx = 0;
+  int cur_tx_depth = 0;
+  uint8_t parent_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
+  uint8_t child_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
+  TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize];
+  while (cur_tx_depth <= MAX_VARTX_DEPTH) {
+    const int cur_tx_bw = tx_size_wide[cur_tx_size];
+    const int cur_tx_bh = tx_size_high[cur_tx_size];
+    if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
+    const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size];
+    for (int row = 0; row < bh; row += cur_tx_bh) {
+      for (int col = 0; col < bw; col += cur_tx_bw) {
+        if (cur_tx_bw != cur_tx_bh) {
+          // Use dummy nodes for all rectangular transforms within the
+          // TX size search tree.
+          dst_rd_info[cur_rd_info_idx].rd_info_array = NULL;
+        } else {
+          // Get spatial location of this TX block within the superblock
+          // (measured in cur_tx_bsize units).
+          const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh;
+          const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw;
+
+          int16_t hash_data[MAX_SB_SQUARE];
+          int16_t *cur_hash_row = hash_data;
+          const int16_t *cur_diff_row = diff + row * diff_stride + col;
+          for (int i = 0; i < cur_tx_bh; i++) {
+            memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw);
+            cur_hash_row += cur_tx_bw;
+            cur_diff_row += diff_stride;
+          }
+          const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
+                                                (uint8_t *)hash_data,
+                                                2 * cur_tx_bw * cur_tx_bh);
+
+          // Find corresponding RD info based on the hash value.
+          const int rd_record_idx =
+              row_in_sb * (MAX_MIB_SIZE >> (cur_tx_size + 1 - TX_8X8)) +
+              col_in_sb;
+
+          int idx = find_tx_size_rd_info(
+              &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx], hash);
+          dst_rd_info[cur_rd_info_idx].rd_info_array =
+              &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx]
+                   .tx_rd_info[idx];
+        }
+
+        // Update the output quadtree RD info structure.
+        av1_zero(dst_rd_info[cur_rd_info_idx].children);
+        const int this_mi_row = row / MI_SIZE;
+        const int this_mi_col = col / MI_SIZE;
+        if (cur_tx_depth > 0) {  // Set up child pointers.
+          const int mi_index = this_mi_row * bw_mi + this_mi_col;
+          const int child_idx = child_idx_buf[mi_index];
+          assert(child_idx < 4);
+          dst_rd_info[parent_idx_buf[mi_index]].children[child_idx] =
+              &dst_rd_info[cur_rd_info_idx];
+        }
+        if (cur_tx_depth < MAX_VARTX_DEPTH) {  // Set up parent and child idx.
+          const int tx_bh_mi = cur_tx_bh / MI_SIZE;
+          const int tx_bw_mi = cur_tx_bw / MI_SIZE;
+          for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; ++i) {
+            memset(parent_idx_buf + i * bw_mi + this_mi_col, cur_rd_info_idx,
+                   tx_bw_mi);
+          }
+          int child_idx = 0;
+          const int next_tx_bh_mi = tx_size_wide_unit[next_tx_size];
+          const int next_tx_bw_mi = tx_size_wide_unit[next_tx_size];
+          for (int i = this_mi_row; i < this_mi_row + tx_bh_mi;
+               i += next_tx_bh_mi) {
+            for (int j = this_mi_col; j < this_mi_col + tx_bw_mi;
+                 j += next_tx_bw_mi) {
+              assert(child_idx < 4);
+              child_idx_buf[i * bw_mi + j] = child_idx++;
+            }
+          }
+        }
+        ++cur_rd_info_idx;
+      }
+    }
+    cur_tx_size = next_tx_size;
+    ++cur_tx_depth;
+  }
+  return 1;
+}
+
+// origin_threshold * 128 / 100
+static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
+  {
+      64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
+      68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
+  },
+  {
+      88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
+      68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
+  },
+  {
+      90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
+      74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
+  },
+};
+
+// lookup table for predict_skip_flag
+// int max_tx_size = max_txsize_rect_lookup[bsize];
+// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
+//   max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
+static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
+  TX_4X4,   TX_4X8,   TX_8X4,   TX_8X8,   TX_8X16,  TX_16X8,
+  TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
+  TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16,  TX_16X4,
+  TX_8X8,   TX_8X8,   TX_16X16, TX_16X16,
+};
+
 // Uses simple features on top of DCT coefficients to quickly predict
 // whether optimal RD decision is to skip encoding the residual.
-static int predict_skip_flag_8bit(const MACROBLOCK *x, BLOCK_SIZE bsize) {
-  if (bsize > BLOCK_16X16) return 0;
-  // Tuned for target false-positive rate of 5% for all block sizes:
-  const uint32_t threshold_table[] = { 50, 50, 50, 55, 47, 47, 53, 22, 22, 37 };
-  const struct macroblock_plane *const p = &x->plane[0];
+// The sse value is stored in dist.
+static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
+                             int reduced_tx_set) {
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
-  tran_low_t DCT_coefs[32 * 32];
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
+
+  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
+  const int64_t mse = *dist / bw / bh;
+  // Normalized quantizer takes the transform upscaling factor (8 for tx size
+  // smaller than 32) into account.
+  const int16_t normalized_dc_q = dc_q >> 3;
+  const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
+  // Predict not to skip when mse is larger than threshold.
+  if (mse > mse_thresh) return 0;
+
+  const int max_tx_size = max_predict_sf_tx_size[bsize];
+  const int tx_h = tx_size_high[max_tx_size];
+  const int tx_w = tx_size_wide[max_tx_size];
+  DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
   TxfmParam param;
   param.tx_type = DCT_DCT;
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-  param.tx_size = max_txsize_rect_lookup[bsize];
-#else
-  param.tx_size = max_txsize_lookup[bsize];
-#endif
-  param.bd = 8;
+  param.tx_size = max_tx_size;
+  param.bd = xd->bd;
+  param.is_hbd = get_bitdepth_data_path_index(xd);
   param.lossless = 0;
-  av1_fwd_txfm(p->src_diff, DCT_coefs, bw, &param);
-
-  uint32_t dc = (uint32_t)av1_dc_quant(x->qindex, 0, AOM_BITS_8);
-  uint32_t ac = (uint32_t)av1_ac_quant(x->qindex, 0, AOM_BITS_8);
-  uint32_t max_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[0])) / dc;
-  for (int i = 1; i < bw * bh; i++) {
-    uint32_t cur_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[i])) / ac;
-    if (cur_quantized_coef > max_quantized_coef)
-      max_quantized_coef = cur_quantized_coef;
+  param.tx_set_type = av1_get_ext_tx_set_type(
+      param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+  const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
+  const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
+  const int16_t *src_diff = x->plane[0].src_diff;
+  const int n_coeff = tx_w * tx_h;
+  const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+  const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
+  const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
+  for (int row = 0; row < bh; row += tx_h) {
+    for (int col = 0; col < bw; col += tx_w) {
+      av1_fwd_txfm(src_diff + col, coefs, bw, &param);
+      // Operating on TX domain, not pixels; we want the QTX quantizers
+      const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
+      if (dc_coef >= dc_thresh) return 0;
+      for (int i = 1; i < n_coeff; ++i) {
+        const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
+        if (ac_coef >= ac_thresh) return 0;
+      }
+    }
+    src_diff += tx_h * bw;
   }
-
-  return max_quantized_coef < threshold_table[AOMMAX(bsize - BLOCK_4X4, 0)];
+  return 1;
 }
 
 // Used to set proper context for early termination with skip = 1.
-static void set_skip_flag(const AV1_COMP *cpi, MACROBLOCK *x,
-                          RD_STATS *rd_stats, int bsize) {
+static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
+                          int64_t dist) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const int n4 = bsize_to_num_blk(bsize);
-#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
   const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
-#else
-  const TX_SIZE tx_size = max_txsize_lookup[bsize];
-#endif
-  mbmi->tx_type = DCT_DCT;
-  for (int idy = 0; idy < xd->n8_h; ++idy)
-    for (int idx = 0; idx < xd->n8_w; ++idx)
-      mbmi->inter_tx_size[idy][idx] = tx_size;
+  memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
+  memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
   mbmi->tx_size = tx_size;
-  mbmi->min_tx_size = get_min_tx_size(tx_size);
-  memset(x->blk_skip[0], 1, sizeof(uint8_t) * n4);
+  memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4);
   rd_stats->skip = 1;
 
   // Rate.
-  const int tx_size_ctx = txsize_sqr_map[tx_size];
-  ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
-  ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
-  av1_get_entropy_contexts(bsize, 0, &xd->plane[0], ctxa, ctxl);
-  int coeff_ctx = get_entropy_context(tx_size, ctxa, ctxl);
-  int rate = x->token_head_costs[tx_size_ctx][PLANE_TYPE_Y][1][0][coeff_ctx][0];
+  const int tx_size_ctx = get_txsize_entropy_ctx(tx_size);
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
+  TXB_CTX txb_ctx;
+  // Because plane is 0, plane_bsize equal to bsize
+  get_txb_ctx(bsize, tx_size, 0, ctxa, ctxl, &txb_ctx);
+  int rate = x->coeff_costs[tx_size_ctx][PLANE_TYPE_Y]
+                 .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
   if (tx_size > TX_4X4) {
     int ctx = txfm_partition_context(
         xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
-    rate += av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
+    rate += x->txfm_partition_cost[ctx][0];
   }
-#if !CONFIG_TXK_SEL
-#if CONFIG_EXT_TX
-  const AV1_COMMON *cm = &cpi->common;
-  const int ext_tx_set = get_ext_tx_set(max_txsize_lookup[bsize], bsize, 1,
-                                        cm->reduced_tx_set_used);
-  if (get_ext_tx_types(mbmi->min_tx_size, bsize, 1, cm->reduced_tx_set_used) >
-          1 &&
-      !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-    if (ext_tx_set > 0)
-      rate +=
-          x->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[mbmi->min_tx_size]]
-                                [mbmi->tx_type];
-  }
-#else
-  if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id])
-    rd_stats->rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type];
-#endif  // CONFIG_EXT_TX
-#endif  // CONFIG_TXK_SEL
   rd_stats->rate = rate;
-
-  // Distortion.
-  int64_t tmp = pixel_diff_dist(x, 0, x->plane[0].src_diff,
-                                block_size_wide[bsize], 0, 0, bsize, bsize);
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
-#endif  // CONFIG_HIGHBITDEPTH
-  rd_stats->dist = rd_stats->sse = (tmp << 4);
+    dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+  rd_stats->dist = rd_stats->sse = (dist << 4);
 }
 
 static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
-                               RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                               int64_t ref_best_rd) {
+                               RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
+                               int mi_col, int64_t ref_best_rd) {
   const AV1_COMMON *cm = &cpi->common;
-  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   int64_t rd = INT64_MAX;
   int64_t best_rd = INT64_MAX;
-  TX_TYPE tx_type, best_tx_type = DCT_DCT;
   const int is_inter = is_inter_block(mbmi);
-  TX_SIZE best_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
-  TX_SIZE best_tx = max_txsize_lookup[bsize];
-  TX_SIZE best_min_tx_size = TX_SIZES_ALL;
-  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
-  TX_TYPE txk_start = DCT_DCT;
-#if CONFIG_TXK_SEL
-  TX_TYPE txk_end = DCT_DCT + 1;
-#else
-  TX_TYPE txk_end = TX_TYPES;
-#endif
   const int n4 = bsize_to_num_blk(bsize);
-  int idx, idy;
-  int prune = 0;
-#if CONFIG_EXT_TX
-  const TxSetType tx_set_type = get_ext_tx_set_type(
-      max_tx_size, bsize, is_inter, cm->reduced_tx_set_used);
-  const int ext_tx_set =
-      get_ext_tx_set(max_tx_size, bsize, is_inter, cm->reduced_tx_set_used);
-#endif  // CONFIG_EXT_TX
+  // Get the tx_size 1 level down
+  const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]];
+  const TxSetType tx_set_type =
+      av1_get_ext_tx_set_type(min_tx_size, is_inter, cm->reduced_tx_set_used);
+  const int within_border =
+      mi_row >= xd->tile.mi_row_start &&
+      (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
+      mi_col >= xd->tile.mi_col_start &&
+      (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
 
   av1_invalid_rd_stats(rd_stats);
 
-#if CONFIG_LGT_FROM_PRED
-  mbmi->use_lgt = 0;
-  int search_lgt = is_inter
-                       ? LGT_FROM_PRED_INTER &&
-                             (!cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
-                       : LGT_FROM_PRED_INTRA && ALLOW_INTRA_EXT_TX;
-#endif  // CONFIG_LGT_FROM_PRED
+  if (cpi->sf.model_based_prune_tx_search_level && ref_best_rd != INT64_MAX) {
+    int model_rate;
+    int64_t model_dist;
+    int model_skip;
+    model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist,
+                    &model_skip, NULL, NULL, NULL, NULL);
+    const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
+    // If the modeled rd is a lot worse than the best so far, breakout.
+    // TODO(debargha, urvang): Improve the model and make the check below
+    // tighter.
+    assert(cpi->sf.model_based_prune_tx_search_level >= 0 &&
+           cpi->sf.model_based_prune_tx_search_level <= 2);
+    if (!model_skip &&
+        model_rd / (5 - cpi->sf.model_based_prune_tx_search_level) >
+            ref_best_rd)
+      return;
+  }
 
   const uint32_t hash = get_block_residue_hash(x, bsize);
-  TX_RD_RECORD *tx_rd_record = &x->tx_rd_record;
+  MB_RD_RECORD *mb_rd_record = &x->mb_rd_record;
 
-  if (ref_best_rd != INT64_MAX) {
-    for (int i = 0; i < tx_rd_record->num; ++i) {
-      const int index = (tx_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
+  if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_mb_rd_hash) {
+    for (int i = 0; i < mb_rd_record->num; ++i) {
+      const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
       // If there is a match in the tx_rd_record, fetch the RD decision and
       // terminate early.
-      if (tx_rd_record->tx_rd_info[index].hash_value == hash) {
-        TX_RD_INFO *tx_rd_info = &tx_rd_record->tx_rd_info[index];
+      if (mb_rd_record->tx_rd_info[index].hash_value == hash) {
+        MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[index];
         fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
         return;
       }
     }
   }
 
-// If we predict that skip is the optimal RD decision - set the respective
-// context and terminate early.
-#if CONFIG_HIGHBITDEPTH
-  if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH))
-#endif  // CONFIG_HIGHBITDEPTH
-  {
-    if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction &&
-        predict_skip_flag_8bit(x, bsize)) {
-      set_skip_flag(cpi, x, rd_stats, bsize);
-      return;
-    }
+  // If we predict that skip is the optimal RD decision - set the respective
+  // context and terminate early.
+  int64_t dist;
+  if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction &&
+      predict_skip_flag(x, bsize, &dist, cm->reduced_tx_set_used)) {
+    set_skip_flag(x, rd_stats, bsize, dist);
+    // Save the RD search results into tx_rd_record.
+    if (within_border) save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+    return;
   }
 
-  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
-#if CONFIG_EXT_TX
-    prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set);
-#else
-    prune = prune_tx_types(cpi, bsize, x, xd, 0);
-#endif  // CONFIG_EXT_TX
+  // Precompute residual hashes and find existing or add new RD records to
+  // store and reuse rate and distortion values to speed up TX size search.
+  TXB_RD_INFO_NODE matched_rd_info[16 + 64 + 256];
+  int found_rd_info = 0;
+  if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) {
+    found_rd_info =
+        find_tx_size_rd_records(x, bsize, mi_row, mi_col, matched_rd_info);
+  }
+
+  prune_tx(cpi, bsize, x, xd, tx_set_type);
 
   int found = 0;
 
-  for (tx_type = txk_start; tx_type < txk_end; ++tx_type) {
-    RD_STATS this_rd_stats;
-    av1_init_rd_stats(&this_rd_stats);
-#if CONFIG_MRC_TX
-    // MRC_DCT only implemented for TX_32X32 so only include this tx in
-    // the search for TX_32X32
-    if (tx_type == MRC_DCT &&
-        (max_tx_size != TX_32X32 || (is_inter && !USE_MRC_INTER) ||
-         (!is_inter && !USE_MRC_INTRA)))
-      continue;
-#endif  // CONFIG_MRC_TX
-#if CONFIG_EXT_TX
-    if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
-    if (is_inter) {
-      if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
-        if (!do_tx_type_search(tx_type, prune)) continue;
-      }
-    } else {
-      if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
-        if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
-      }
-    }
-#else   // CONFIG_EXT_TX
-    if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
-        !do_tx_type_search(tx_type, prune))
-      continue;
-#endif  // CONFIG_EXT_TX
-    if (is_inter && x->use_default_inter_tx_type &&
-        tx_type != get_default_tx_type(0, xd, 0, max_tx_size))
-      continue;
+  RD_STATS this_rd_stats;
+  av1_init_rd_stats(&this_rd_stats);
 
-    if (xd->lossless[mbmi->segment_id])
-      if (tx_type != DCT_DCT) continue;
+  rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
+                               found_rd_info ? matched_rd_info : NULL);
 
-    rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
-                                 tx_type);
-    ref_best_rd = AOMMIN(rd, ref_best_rd);
-    if (rd < best_rd) {
-      best_rd = rd;
-      *rd_stats = this_rd_stats;
-      best_tx_type = mbmi->tx_type;
-      best_tx = mbmi->tx_size;
-      best_min_tx_size = mbmi->min_tx_size;
-      memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
-      found = 1;
-      for (idy = 0; idy < xd->n8_h; ++idy)
-        for (idx = 0; idx < xd->n8_w; ++idx)
-          best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
-    }
+  ref_best_rd = AOMMIN(rd, ref_best_rd);
+  if (rd < best_rd) {
+    *rd_stats = this_rd_stats;
+    found = 1;
   }
 
+  // Reset the pruning flags.
+  av1_zero(x->tx_search_prune);
+  x->tx_split_prune_flag = 0;
+
   // We should always find at least one candidate unless ref_best_rd is less
   // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type
   // might have failed to find something better)
   assert(IMPLIES(!found, ref_best_rd != INT64_MAX));
   if (!found) return;
 
-#if CONFIG_LGT_FROM_PRED
-  if (search_lgt && is_lgt_allowed(mbmi->mode, max_tx_size) &&
-      !cm->reduced_tx_set_used) {
-    RD_STATS this_rd_stats;
-    mbmi->use_lgt = 1;
-    rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, 0);
-    if (rd < best_rd) {
-      best_rd = rd;
-      *rd_stats = this_rd_stats;
-      best_tx = mbmi->tx_size;
-      best_min_tx_size = mbmi->min_tx_size;
-      memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
-      for (idy = 0; idy < xd->n8_h; ++idy)
-        for (idx = 0; idx < xd->n8_w; ++idx)
-          best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
-    } else {
-      mbmi->use_lgt = 0;
-    }
-  }
-#endif  // CONFIG_LGT_FROM_PRED
-  // We found a candidate transform to use. Copy our results from the "best"
-  // array into mbmi.
-  mbmi->tx_type = best_tx_type;
-  for (idy = 0; idy < xd->n8_h; ++idy)
-    for (idx = 0; idx < xd->n8_w; ++idx)
-      mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx];
-  mbmi->tx_size = best_tx;
-  mbmi->min_tx_size = best_min_tx_size;
-  memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
-
   // Save the RD search results into tx_rd_record.
-  int index;
-  if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
-    index =
-        (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
-    ++tx_rd_record->num;
-  } else {
-    index = tx_rd_record->index_start;
-    tx_rd_record->index_start =
-        (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
-  }
-  save_tx_rd_info(n4, hash, x, rd_stats, &tx_rd_record->tx_rd_info[index]);
+  if (within_border && cpi->sf.use_mb_rd_hash)
+    save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
 }
 
-static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
-                        int blk_col, int plane, int block, TX_SIZE tx_size,
-                        BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
-                        ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) {
+static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+                          int blk_col, int plane, int block, TX_SIZE tx_size,
+                          BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
+                          ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats,
+                          FAST_TX_SEARCH_MODE ftxs_mode) {
+  assert(plane > 0);
+  assert(tx_size < TX_SIZES_ALL);
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  const int tx_row = blk_row >> (1 - pd->subsampling_y);
-  const int tx_col = blk_col >> (1 - pd->subsampling_x);
-  TX_SIZE plane_tx_size;
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-
-  assert(tx_size < TX_SIZES_ALL);
-
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
-  plane_tx_size =
-      plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
-            : mbmi->inter_tx_size[tx_row][tx_col];
-
-  if (tx_size == plane_tx_size) {
-    ENTROPY_CONTEXT *ta = above_ctx + blk_col;
-    ENTROPY_CONTEXT *tl = left_ctx + blk_row;
-    av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
-                      plane_bsize, ta, tl, rd_stats);
-#if !CONFIG_PVQ
-    av1_set_txb_context(x, plane, block, tx_size, ta, tl);
-#endif  // !CONFIG_PVQ
-  } else {
-    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    const int bsl = tx_size_wide_unit[sub_txs];
-    int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
-    int i;
-
-    assert(bsl > 0);
-
-    for (i = 0; i < 4; ++i) {
-      int offsetr = blk_row + (i >> 1) * bsl;
-      int offsetc = blk_col + (i & 0x01) * bsl;
-
-      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-
-      tx_block_rd(cpi, x, offsetr, offsetc, plane, block, sub_txs, plane_bsize,
-                  above_ctx, left_ctx, rd_stats);
-      block += step;
-    }
-  }
+  ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+  ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+  tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize,
+                ta, tl, rd_stats, ftxs_mode, INT64_MAX, NULL);
+  av1_set_txb_context(x, plane, block, tx_size, ta, tl);
 }
 
 // Return value 0: early termination triggered, no valid rd cost available;
 //              1: rd cost values are valid.
 static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
                             RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                            int64_t ref_best_rd) {
+                            int64_t ref_best_rd,
+                            FAST_TX_SEARCH_MODE ftxs_mode) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   int plane;
   int is_cost_valid = 1;
-  int64_t this_rd;
+  int64_t this_rd = 0;
 
   if (ref_best_rd < 0) is_cost_valid = 0;
 
   av1_init_rd_stats(rd_stats);
 
-#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
   if (x->skip_chroma_rd) return is_cost_valid;
-  bsize = scale_chroma_bsize(mbmi->sb_type, xd->plane[1].subsampling_x,
-                             xd->plane[1].subsampling_y);
-#endif  // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
-
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-  if (is_rect_tx(mbmi->tx_size)) {
-    return super_block_uvrd(cpi, x, rd_stats, bsize, ref_best_rd);
-  }
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
+  const BLOCK_SIZE bsizec = scale_chroma_bsize(
+      bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
 
   if (is_inter_block(mbmi) && is_cost_valid) {
     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
-      av1_subtract_plane(x, bsize, plane);
+      av1_subtract_plane(x, bsizec, plane);
   }
 
-  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-    const struct macroblockd_plane *const pd = &xd->plane[plane];
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-    const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-    const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
-    const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
-    const int bh = tx_size_high_unit[max_tx_size];
-    const int bw = tx_size_wide_unit[max_tx_size];
-    int idx, idy;
-    int block = 0;
-    const int step = bh * bw;
-    ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
-    ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
-    RD_STATS pn_rd_stats;
-    av1_init_rd_stats(&pn_rd_stats);
-
-    av1_get_entropy_contexts(bsize, 0, pd, ta, tl);
-
-    for (idy = 0; idy < mi_height; idy += bh) {
-      for (idx = 0; idx < mi_width; idx += bw) {
-        tx_block_rd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize,
-                    ta, tl, &pn_rd_stats);
-        block += step;
+  if (is_cost_valid) {
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const BLOCK_SIZE plane_bsize =
+          get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
+      const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+      const int mi_height =
+          block_size_high[plane_bsize] >> tx_size_high_log2[0];
+      const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+      const int bh = tx_size_high_unit[max_tx_size];
+      const int bw = tx_size_wide_unit[max_tx_size];
+      int idx, idy;
+      int block = 0;
+      const int step = bh * bw;
+      ENTROPY_CONTEXT ta[MAX_MIB_SIZE];
+      ENTROPY_CONTEXT tl[MAX_MIB_SIZE];
+      RD_STATS pn_rd_stats;
+      av1_init_rd_stats(&pn_rd_stats);
+      av1_get_entropy_contexts(bsizec, pd, ta, tl);
+
+      for (idy = 0; idy < mi_height; idy += bh) {
+        for (idx = 0; idx < mi_width; idx += bw) {
+          tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size,
+                        plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode);
+          block += step;
+        }
       }
-    }
 
-    if (pn_rd_stats.rate == INT_MAX) {
-      is_cost_valid = 0;
-      break;
-    }
+      if (pn_rd_stats.rate == INT_MAX) {
+        is_cost_valid = 0;
+        break;
+      }
 
-    av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
 
-    this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
-                     RDCOST(x->rdmult, 0, rd_stats->sse));
+      this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
+                       RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse));
 
-    if (this_rd > ref_best_rd) {
-      is_cost_valid = 0;
-      break;
+      if (this_rd > ref_best_rd) {
+        is_cost_valid = 0;
+        break;
+      }
     }
   }
 
@@ -5754,7 +5440,6 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
 
   return is_cost_valid;
 }
-#endif  // CONFIG_VAR_TX
 
 static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        int dc_mode_cost,
@@ -5764,11 +5449,12 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        int *rate_tokenonly, int64_t *distortion,
                                        int *skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
+  assert(
+      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type));
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const BLOCK_SIZE bsize = mbmi->sb_type;
-  assert(bsize >= BLOCK_8X8);
   int this_rate;
   int64_t this_rd;
   int colors_u, colors_v, colors;
@@ -5780,42 +5466,32 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
   int plane_block_width, plane_block_height, rows, cols;
   av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
                            &plane_block_height, &rows, &cols);
-  if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return;
 
   mbmi->uv_mode = UV_DC_PRED;
-#if CONFIG_FILTER_INTRA
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
-#endif  // CONFIG_FILTER_INTRA
 
-#if CONFIG_HIGHBITDEPTH
+  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
   if (cpi->common.use_highbitdepth) {
     colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
-                                       cpi->common.bit_depth);
+                                       cpi->common.bit_depth, count_buf);
     colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
-                                       cpi->common.bit_depth);
+                                       cpi->common.bit_depth, count_buf);
   } else {
-#endif  // CONFIG_HIGHBITDEPTH
-    colors_u = av1_count_colors(src_u, src_stride, rows, cols);
-    colors_v = av1_count_colors(src_v, src_stride, rows, cols);
-#if CONFIG_HIGHBITDEPTH
+    colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
+    colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
   }
-#endif  // CONFIG_HIGHBITDEPTH
 
-#if CONFIG_PALETTE_DELTA_ENCODING
   uint16_t color_cache[2 * PALETTE_MAX_SIZE];
   const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
 
   colors = colors_u > colors_v ? colors_u : colors_v;
   if (colors > 1 && colors <= 64) {
     int r, c, n, i, j;
     const int max_itr = 50;
-    float lb_u, ub_u, val_u;
-    float lb_v, ub_v, val_v;
-    float *const data = x->palette_buffer->kmeans_data_buf;
-    float centroids[2 * PALETTE_MAX_SIZE];
+    int lb_u, ub_u, val_u;
+    int lb_v, ub_v, val_v;
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[2 * PALETTE_MAX_SIZE];
 
-#if CONFIG_HIGHBITDEPTH
     uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
     uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
     if (cpi->common.use_highbitdepth) {
@@ -5824,32 +5500,25 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
       lb_v = src_v16[0];
       ub_v = src_v16[0];
     } else {
-#endif  // CONFIG_HIGHBITDEPTH
       lb_u = src_u[0];
       ub_u = src_u[0];
       lb_v = src_v[0];
       ub_v = src_v[0];
-#if CONFIG_HIGHBITDEPTH
     }
-#endif  // CONFIG_HIGHBITDEPTH
 
     for (r = 0; r < rows; ++r) {
       for (c = 0; c < cols; ++c) {
-#if CONFIG_HIGHBITDEPTH
         if (cpi->common.use_highbitdepth) {
           val_u = src_u16[r * src_stride + c];
           val_v = src_v16[r * src_stride + c];
           data[(r * cols + c) * 2] = val_u;
           data[(r * cols + c) * 2 + 1] = val_v;
         } else {
-#endif  // CONFIG_HIGHBITDEPTH
           val_u = src_u[r * src_stride + c];
           val_v = src_v[r * src_stride + c];
           data[(r * cols + c) * 2] = val_u;
           data[(r * cols + c) * 2 + 1] = val_v;
-#if CONFIG_HIGHBITDEPTH
         }
-#endif  // CONFIG_HIGHBITDEPTH
         if (val_u < lb_u)
           lb_u = val_u;
         else if (val_u > ub_u)
@@ -5868,34 +5537,30 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
         centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
       }
       av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
-#if CONFIG_PALETTE_DELTA_ENCODING
       optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
       // Sort the U channel colors in ascending order.
       for (i = 0; i < 2 * (n - 1); i += 2) {
         int min_idx = i;
-        float min_val = centroids[i];
+        int min_val = centroids[i];
         for (j = i + 2; j < 2 * n; j += 2)
           if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
         if (min_idx != i) {
-          float temp_u = centroids[i], temp_v = centroids[i + 1];
+          int temp_u = centroids[i], temp_v = centroids[i + 1];
           centroids[i] = centroids[min_idx];
           centroids[i + 1] = centroids[min_idx + 1];
           centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
         }
       }
       av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
       extend_palette_color_map(color_map, cols, rows, plane_block_width,
                                plane_block_height);
       pmi->palette_size[1] = n;
       for (i = 1; i < 3; ++i) {
         for (j = 0; j < n; ++j) {
-#if CONFIG_HIGHBITDEPTH
           if (cpi->common.use_highbitdepth)
             pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
                 (int)centroids[j * 2 + i - 1], cpi->common.bit_depth);
           else
-#endif  // CONFIG_HIGHBITDEPTH
             pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
                 clip_pixel((int)centroids[j * 2 + i - 1]);
         }
@@ -5903,19 +5568,8 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
 
       super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
       if (tokenonly_rd_stats.rate == INT_MAX) continue;
-      this_rate =
-          tokenonly_rd_stats.rate + dc_mode_cost +
-          x->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] +
-          write_uniform_cost(n, color_map[0]) +
-          av1_cost_bit(
-              av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 1);
-      this_rate += av1_palette_color_cost_uv(pmi,
-#if CONFIG_PALETTE_DELTA_ENCODING
-                                             color_cache, n_cache,
-#endif  // CONFIG_PALETTE_DELTA_ENCODING
-                                             cpi->common.bit_depth);
-      this_rate +=
-          av1_cost_color_map(x, 1, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+      this_rate = tokenonly_rd_stats.rate +
+                  intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
       this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
       if (this_rd < *best_rd) {
         *best_rd = this_rd;
@@ -5937,68 +5591,13 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
   }
 }
 
-#if CONFIG_FILTER_INTRA
-// Return 1 if an filter intra mode is selected; return 0 otherwise.
-static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                     int *rate, int *rate_tokenonly,
-                                     int64_t *distortion, int *skippable,
-                                     BLOCK_SIZE bsize, int64_t *best_rd) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  int filter_intra_selected_flag = 0;
-  int this_rate;
-  int64_t this_rd;
-  FILTER_INTRA_MODE mode;
-  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
-  RD_STATS tokenonly_rd_stats;
-
-  av1_zero(filter_intra_mode_info);
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1;
-  mbmi->uv_mode = UV_DC_PRED;
-  mbmi->palette_mode_info.palette_size[1] = 0;
-
-  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
-    mbmi->filter_intra_mode_info.filter_intra_mode[1] = mode;
-    if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd))
-      continue;
-
-    this_rate = tokenonly_rd_stats.rate +
-                av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) +
-                x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
-                write_uniform_cost(FILTER_INTRA_MODES, mode);
-    this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-    if (this_rd < *best_rd) {
-      *best_rd = this_rd;
-      *rate = this_rate;
-      *rate_tokenonly = tokenonly_rd_stats.rate;
-      *distortion = tokenonly_rd_stats.dist;
-      *skippable = tokenonly_rd_stats.skip;
-      filter_intra_mode_info = mbmi->filter_intra_mode_info;
-      filter_intra_selected_flag = 1;
-    }
-  }
-
-  if (filter_intra_selected_flag) {
-    mbmi->uv_mode = UV_DC_PRED;
-    mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
-        filter_intra_mode_info.use_filter_intra_mode[1];
-    mbmi->filter_intra_mode_info.filter_intra_mode[1] =
-        filter_intra_mode_info.filter_intra_mode[1];
-    return 1;
-  } else {
-    return 0;
-  }
-}
-#endif  // CONFIG_FILTER_INTRA
-
-#if CONFIG_EXT_INTRA
 // Run RD calculation with given chroma intra prediction angle., and return
 // the RD cost. Update the best mode info. if the RD cost is the best so far.
 static int64_t pick_intra_angle_routine_sbuv(
     const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
     int *best_angle_delta, int64_t *best_rd) {
-  MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
   assert(!is_inter_block(mbmi));
   int this_rate;
   int64_t this_rd;
@@ -6006,11 +5605,12 @@ static int64_t pick_intra_angle_routine_sbuv(
 
   if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
     return INT64_MAX;
-  this_rate = tokenonly_rd_stats.rate + rate_overhead;
+  this_rate = tokenonly_rd_stats.rate +
+              intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
   this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
   if (this_rd < *best_rd) {
     *best_rd = this_rd;
-    *best_angle_delta = mbmi->angle_delta[1];
+    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
     *rate = this_rate;
     rd_stats->rate = tokenonly_rd_stats.rate;
     rd_stats->dist = tokenonly_rd_stats.dist;
@@ -6026,7 +5626,7 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     int64_t best_rd, int *rate,
                                     RD_STATS *rd_stats) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
   int i, angle_delta, best_angle_delta = 0;
   int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
@@ -6041,7 +5641,7 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
       best_rd_in = (best_rd == INT64_MAX)
                        ? INT64_MAX
                        : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
-      mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta;
+      mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
       this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
                                               best_rd_in, rate, rd_stats,
                                               &best_angle_delta, &best_rd);
@@ -6064,7 +5664,7 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
           rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
         skip_search = 1;
       if (!skip_search) {
-        mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta;
+        mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
         pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
                                       rate, rd_stats, &best_angle_delta,
                                       &best_rd);
@@ -6072,202 +5672,137 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
   }
 
-  mbmi->angle_delta[1] = best_angle_delta;
+  mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
   return rd_stats->rate != INT_MAX;
 }
-#endif  // CONFIG_EXT_INTRA
-
-#if CONFIG_CFL
-static int64_t cfl_alpha_dist_lbd(const int16_t *pred_buf_q3,
-                                  const uint8_t *src, int src_stride, int width,
-                                  int height, int dc_pred, int alpha_q3,
-                                  int64_t *dist_neg_out) {
-  int64_t dist = 0;
-  int diff;
 
-  if (alpha_q3 == 0) {
-    for (int j = 0; j < height; j++) {
-      for (int i = 0; i < width; i++) {
-        diff = src[i] - dc_pred;
-        dist += diff * diff;
-      }
-      src += src_stride;
-    }
-
-    if (dist_neg_out) *dist_neg_out = dist;
-
-    return dist;
-  }
-
-  int64_t dist_neg = 0;
-  for (int j = 0; j < height; j++) {
-    for (int i = 0; i < width; i++) {
-      const int uv = src[i];
-      const int scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]);
-
-      diff = uv - clip_pixel(scaled_luma + dc_pred);
-      dist += diff * diff;
+#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
+  (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
+static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
+                             TX_SIZE tx_size, int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
 
-      diff = uv - clip_pixel(-scaled_luma + dc_pred);
-      dist_neg += diff * diff;
-    }
-    pred_buf_q3 += MAX_SB_SIZE;
-    src += src_stride;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_DEBUG
+  assert(is_cfl_allowed(xd));
+  const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
+  const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, ssx, ssy);
+  (void)plane_bsize;
+  assert(plane_bsize < BLOCK_SIZES_ALL);
+  if (!xd->lossless[mbmi->segment_id]) {
+    assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
+    assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
   }
+#endif  // CONFIG_DEBUG
 
-  if (dist_neg_out) *dist_neg_out = dist_neg;
-
-  return dist;
-}
-#if CONFIG_HIGHBITDEPTH
-static int64_t cfl_alpha_dist_hbd(const int16_t *pred_buf_q3,
-                                  const uint16_t *src, int src_stride,
-                                  int width, int height, int dc_pred,
-                                  int alpha_q3, int bit_depth,
-                                  int64_t *dist_neg_out) {
-  const int shift = 2 * (bit_depth - 8);
-  const int rounding = shift > 0 ? (1 << shift) >> 1 : 0;
-  int64_t dist = 0;
-  int diff;
-
-  if (alpha_q3 == 0) {
-    for (int j = 0; j < height; j++) {
-      for (int i = 0; i < width; i++) {
-        diff = src[i] - dc_pred;
-        dist += diff * diff;
+  xd->cfl.use_dc_pred_cache = 1;
+  const int64_t mode_rd =
+      RDCOST(x->rdmult,
+             x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0);
+  int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+  int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+#if CONFIG_DEBUG
+  int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+#endif  // CONFIG_DEBUG
+
+  for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
+    RD_STATS rd_stats;
+    av1_init_rd_stats(&rd_stats);
+    for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+      best_rd_uv[joint_sign][plane] = INT64_MAX;
+      best_c[joint_sign][plane] = 0;
+    }
+    // Collect RD stats for an alpha value of zero in this plane.
+    // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid.
+    for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) {
+      const int joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i);
+      if (i == CFL_SIGN_NEG) {
+        mbmi->cfl_alpha_idx = 0;
+        mbmi->cfl_alpha_signs = joint_sign;
+        txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, plane + 1, bsize, tx_size,
+                         cpi->sf.use_fast_coef_costing, FTXS_NONE);
+        if (rd_stats.rate == INT_MAX) break;
+      }
+      const int alpha_rate = x->cfl_cost[joint_sign][plane][0];
+      best_rd_uv[joint_sign][plane] =
+          RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
+#if CONFIG_DEBUG
+      best_rate_uv[joint_sign][plane] = rd_stats.rate;
+#endif  // CONFIG_DEBUG
+    }
+  }
+
+  int best_joint_sign = -1;
+
+  for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
+    for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) {
+      int progress = 0;
+      for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
+        int flag = 0;
+        RD_STATS rd_stats;
+        if (c > 2 && progress < c) break;
+        av1_init_rd_stats(&rd_stats);
+        for (int i = 0; i < CFL_SIGNS; i++) {
+          const int joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i);
+          if (i == 0) {
+            mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c;
+            mbmi->cfl_alpha_signs = joint_sign;
+            txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, plane + 1, bsize,
+                             tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+            if (rd_stats.rate == INT_MAX) break;
+          }
+          const int alpha_rate = x->cfl_cost[joint_sign][plane][c];
+          int64_t this_rd =
+              RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
+          if (this_rd >= best_rd_uv[joint_sign][plane]) continue;
+          best_rd_uv[joint_sign][plane] = this_rd;
+          best_c[joint_sign][plane] = c;
+#if CONFIG_DEBUG
+          best_rate_uv[joint_sign][plane] = rd_stats.rate;
+#endif  // CONFIG_DEBUG
+          flag = 2;
+          if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue;
+          this_rd += mode_rd + best_rd_uv[joint_sign][!plane];
+          if (this_rd >= best_rd) continue;
+          best_rd = this_rd;
+          best_joint_sign = joint_sign;
+        }
+        progress += flag;
       }
-      src += src_stride;
-    }
-    dist = (dist + rounding) >> shift;
-
-    if (dist_neg_out) *dist_neg_out = dist;
-
-    return dist;
-  }
-
-  int64_t dist_neg = 0;
-  for (int j = 0; j < height; j++) {
-    for (int i = 0; i < width; i++) {
-      const int uv = src[i];
-      const int scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]);
-
-      diff = uv - clip_pixel_highbd(scaled_luma + dc_pred, bit_depth);
-      dist += diff * diff;
-
-      diff = uv - clip_pixel_highbd(-scaled_luma + dc_pred, bit_depth);
-      dist_neg += diff * diff;
     }
-    pred_buf_q3 += MAX_SB_SIZE;
-    src += src_stride;
-  }
-
-  if (dist_neg_out) *dist_neg_out = (dist_neg + rounding) >> shift;
-
-  return (dist + rounding) >> shift;
-}
-#endif  // CONFIG_HIGHBITDEPTH
-static int64_t cfl_alpha_dist(const int16_t *pred_buf_q3, const uint8_t *src,
-                              int src_stride, int width, int height,
-                              int dc_pred, int alpha_q3, int use_hbd,
-                              int bit_depth, int64_t *dist_neg_out) {
-#if CONFIG_HIGHBITDEPTH
-  if (use_hbd) {
-    const uint16_t *src_16 = CONVERT_TO_SHORTPTR(src);
-    return cfl_alpha_dist_hbd(pred_buf_q3, src_16, src_stride, width, height,
-                              dc_pred, alpha_q3, bit_depth, dist_neg_out);
   }
-#endif  // CONFIG_HIGHBITDEPTH
-  (void)use_hbd;
-  (void)bit_depth;
-  return cfl_alpha_dist_lbd(pred_buf_q3, src, src_stride, width, height,
-                            dc_pred, alpha_q3, dist_neg_out);
-}
-
-static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) {
-  const struct macroblock_plane *const p_u = &x->plane[AOM_PLANE_U];
-  const struct macroblock_plane *const p_v = &x->plane[AOM_PLANE_V];
-  const uint8_t *const src_u = p_u->src.buf;
-  const uint8_t *const src_v = p_v->src.buf;
-  const int src_stride_u = p_u->src.stride;
-  const int src_stride_v = p_v->src.stride;
-
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-
-  CFL_CTX *const cfl = xd->cfl;
-  cfl_compute_parameters(xd, tx_size);
-  const int width = cfl->uv_width;
-  const int height = cfl->uv_height;
-  const int dc_pred_u = cfl->dc_pred[CFL_PRED_U];
-  const int dc_pred_v = cfl->dc_pred[CFL_PRED_V];
-  const int16_t *pred_buf_q3 = cfl->pred_buf_q3;
-  const int use_hbd = get_bitdepth_data_path_index(xd);
-
-  int64_t sse[CFL_PRED_PLANES][CFL_MAGS_SIZE];
-  sse[CFL_PRED_U][0] =
-      cfl_alpha_dist(pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u,
-                     0, use_hbd, xd->bd, NULL);
-  sse[CFL_PRED_V][0] =
-      cfl_alpha_dist(pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v,
-                     0, use_hbd, xd->bd, NULL);
-
-  for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
-    const int m = c * 2 + 1;
-    const int abs_alpha_q3 = c + 1;
-    sse[CFL_PRED_U][m] = cfl_alpha_dist(
-        pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u,
-        abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_U][m + 1]);
-    sse[CFL_PRED_V][m] = cfl_alpha_dist(
-        pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v,
-        abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_V][m + 1]);
-  }
-
-  int64_t dist;
-  int64_t cost;
-  int64_t best_cost = INT64_MAX;
-  int best_rate = 0;
 
-  // Compute least squares parameter of the entire block
+  int best_rate_overhead = INT_MAX;
   int ind = 0;
-  int signs = 0;
-
-  for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
-    const int sign_u = CFL_SIGN_U(joint_sign);
-    const int sign_v = CFL_SIGN_V(joint_sign);
-    const int size_u = (sign_u == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE;
-    const int size_v = (sign_v == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE;
-    for (int u = 0; u < size_u; u++) {
-      const int idx_u = (sign_u == CFL_SIGN_ZERO) ? 0 : u * 2 + 1;
-      for (int v = 0; v < size_v; v++) {
-        const int idx_v = (sign_v == CFL_SIGN_ZERO) ? 0 : v * 2 + 1;
-        dist = sse[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] +
-               sse[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)];
-        dist *= 16;
-        const int rate = x->cfl_cost[joint_sign][CFL_PRED_U][u] +
-                         x->cfl_cost[joint_sign][CFL_PRED_V][v];
-        cost = RDCOST(x->rdmult, rate, dist);
-        if (cost < best_cost) {
-          best_cost = cost;
-          best_rate = rate;
-          ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
-          signs = joint_sign;
-        }
-      }
-    }
+  if (best_joint_sign >= 0) {
+    const int u = best_c[best_joint_sign][CFL_PRED_U];
+    const int v = best_c[best_joint_sign][CFL_PRED_V];
+    ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
+    best_rate_overhead = x->cfl_cost[best_joint_sign][CFL_PRED_U][u] +
+                         x->cfl_cost[best_joint_sign][CFL_PRED_V][v];
+#if CONFIG_DEBUG
+    xd->cfl.rate = x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] +
+                   best_rate_overhead +
+                   best_rate_uv[best_joint_sign][CFL_PRED_U] +
+                   best_rate_uv[best_joint_sign][CFL_PRED_V];
+#endif  // CONFIG_DEBUG
+  } else {
+    best_joint_sign = 0;
   }
 
   mbmi->cfl_alpha_idx = ind;
-  mbmi->cfl_alpha_signs = signs;
-  return best_rate;
+  mbmi->cfl_alpha_signs = best_joint_sign;
+  xd->cfl.use_dc_pred_cache = 0;
+  xd->cfl.dc_pred_is_cached[0] = 0;
+  xd->cfl.dc_pred_is_cached[1] = 0;
+  return best_rate_overhead;
 }
-#endif  // CONFIG_CFL
 
 static void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
   mbmi->uv_mode = UV_DC_PRED;
   mbmi->palette_mode_info.palette_size[1] = 0;
-#if CONFIG_FILTER_INTRA
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
-#endif  // CONFIG_FILTER_INTRA
 }
 
 static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -6275,83 +5810,53 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        int64_t *distortion, int *skippable,
                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
   MB_MODE_INFO best_mbmi = *mbmi;
   int64_t best_rd = INT64_MAX, this_rd;
-#if CONFIG_PVQ
-  od_rollback_buffer buf;
-  od_encode_checkpoint(&x->daala_enc, &buf);
-#endif  // CONFIG_PVQ
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const int try_palette =
-      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
 
   for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
     int this_rate;
     RD_STATS tokenonly_rd_stats;
     UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx];
-#if CONFIG_EXT_INTRA
-    const int is_directional_mode =
-        av1_is_directional_mode(get_uv_mode(mode), mbmi->sb_type);
-#endif  // CONFIG_EXT_INTRA
+    const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode));
     if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
           (1 << mode)))
       continue;
 
     mbmi->uv_mode = mode;
-#if CONFIG_CFL
     int cfl_alpha_rate = 0;
     if (mode == UV_CFL_PRED) {
+      if (!is_cfl_allowed(xd)) continue;
       assert(!is_directional_mode);
-      const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]);
-      cfl_alpha_rate = cfl_rd_pick_alpha(x, uv_tx_size);
+      const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+      cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd);
+      if (cfl_alpha_rate == INT_MAX) continue;
     }
-#endif
-#if CONFIG_EXT_INTRA
-    mbmi->angle_delta[1] = 0;
+    mbmi->angle_delta[PLANE_TYPE_UV] = 0;
     if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) {
-      const int rate_overhead = x->intra_uv_mode_cost[mbmi->mode][mode] +
-                                write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0);
+      const int rate_overhead =
+          x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
       if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
                                     &this_rate, &tokenonly_rd_stats))
         continue;
     } else {
-#endif  // CONFIG_EXT_INTRA
       if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
-#if CONFIG_PVQ
-        od_encode_rollback(&x->daala_enc, &buf);
-#endif  // CONFIG_PVQ
         continue;
       }
-#if CONFIG_EXT_INTRA
     }
-#endif  // CONFIG_EXT_INTRA
-    this_rate =
-        tokenonly_rd_stats.rate + x->intra_uv_mode_cost[mbmi->mode][mode];
-
-#if CONFIG_CFL
+    const int mode_cost =
+        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] +
+        cfl_alpha_rate;
+    this_rate = tokenonly_rd_stats.rate +
+                intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
     if (mode == UV_CFL_PRED) {
-      this_rate += cfl_alpha_rate;
+      assert(is_cfl_allowed(xd));
+#if CONFIG_DEBUG
+      if (!xd->lossless[mbmi->segment_id])
+        assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost);
+#endif  // CONFIG_DEBUG
     }
-#endif
-#if CONFIG_EXT_INTRA
-    if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) {
-      this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
-                                      MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
-    }
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-    if (mbmi->sb_type >= BLOCK_8X8 && mode == UV_DC_PRED)
-      this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 0);
-#endif  // CONFIG_FILTER_INTRA
-    if (try_palette && mode == UV_DC_PRED)
-      this_rate += av1_cost_bit(
-          av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0);
-
-#if CONFIG_PVQ
-    od_encode_rollback(&x->daala_enc, &buf);
-#endif  // CONFIG_PVQ
     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
 
     if (this_rd < best_rd) {
@@ -6364,22 +5869,17 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
   }
 
+  const int try_palette =
+      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
   if (try_palette) {
     uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
-    rd_pick_palette_intra_sbuv(cpi, x,
-                               x->intra_uv_mode_cost[mbmi->mode][UV_DC_PRED],
-                               best_palette_color_map, &best_mbmi, &best_rd,
-                               rate, rate_tokenonly, distortion, skippable);
+    rd_pick_palette_intra_sbuv(
+        cpi, x,
+        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED],
+        best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly,
+        distortion, skippable);
   }
 
-#if CONFIG_FILTER_INTRA
-  if (mbmi->sb_type >= BLOCK_8X8) {
-    if (rd_pick_filter_intra_sbuv(cpi, x, rate, rate_tokenonly, distortion,
-                                  skippable, bsize, &best_rd))
-      best_mbmi = *mbmi;
-  }
-#endif  // CONFIG_FILTER_INTRA
-
   *mbmi = best_mbmi;
   // Make sure we actually chose a mode
   assert(best_rd < INT64_MAX);
@@ -6391,13 +5891,14 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
                                  int *rate_uv, int *rate_uv_tokenonly,
                                  int64_t *dist_uv, int *skip_uv,
                                  UV_PREDICTION_MODE *mode_uv) {
+  const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
   // Use an estimated rd for uv_intra based on DC_PRED if the
   // appropriate speed flag is set.
   init_sbuv_mode(mbmi);
-#if CONFIG_CB4X4
-#if !CONFIG_CHROMA_2X2
   if (x->skip_chroma_rd) {
     *rate_uv = 0;
     *rate_uv_tokenonly = 0;
@@ -6406,31 +5907,20 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
     *mode_uv = UV_DC_PRED;
     return;
   }
+  xd->cfl.is_chroma_reference = is_chroma_reference(
+      mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
   bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
                              xd->plane[AOM_PLANE_U].subsampling_y);
-#endif  // !CONFIG_CHROMA_2X2
-#if CONFIG_CFL
   // Only store reconstructed luma when there's chroma RDO. When there's no
   // chroma RDO, the reconstructed luma will be stored in encode_superblock().
-  xd->cfl->store_y = !x->skip_chroma_rd;
-#endif  // CONFIG_CFL
-#else
-  bsize = bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize;
-#if CONFIG_CFL
-  xd->cfl->store_y = 1;
-#endif  // CONFIG_CFL
-#endif  // CONFIG_CB4X4
-#if CONFIG_CFL
-  if (xd->cfl->store_y) {
-    // Perform one extra call to txfm_rd_in_plane(), with the values chosen
-    // during luma RDO, so we can store reconstructed luma values
-    RD_STATS this_rd_stats;
-    txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
-                     mbmi->sb_type, mbmi->tx_size,
-                     cpi->sf.use_fast_coef_costing);
-    xd->cfl->store_y = 0;
+  xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+  if (xd->cfl.store_y) {
+    // Restore reconstructed luma values.
+    av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y,
+                                 cpi->optimize_seg_arr[mbmi->segment_id],
+                                 mi_row, mi_col);
+    xd->cfl.store_y = 0;
   }
-#endif  // CONFIG_CFL
   rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
                           bsize, max_tx_size);
   *mode_uv = mbmi->uv_mode;
@@ -6441,16 +5931,10 @@ static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
   if (is_inter_compound_mode(mode)) {
     return x
         ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
-#if CONFIG_COMPOUND_SINGLEREF
-  } else if (is_inter_singleref_comp_mode(mode)) {
-    return x->inter_singleref_comp_mode_cost[mode_context]
-                                            [INTER_SINGLEREF_COMP_OFFSET(mode)];
-#endif  // CONFIG_COMPOUND_SINGLEREF
   }
 
   int mode_cost = 0;
   int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
-  int16_t is_all_zero_mv = mode_context & (1 << ALL_ZERO_FLAG_OFFSET);
 
   assert(is_inter_mode(mode));
 
@@ -6459,43 +5943,34 @@ static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
     return mode_cost;
   } else {
     mode_cost = x->newmv_mode_cost[mode_ctx][1];
-    mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
-
-    if (is_all_zero_mv) return mode_cost;
+    mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
 
-    if (mode == ZEROMV) {
+    if (mode == GLOBALMV) {
       mode_cost += x->zeromv_mode_cost[mode_ctx][0];
       return mode_cost;
     } else {
       mode_cost += x->zeromv_mode_cost[mode_ctx][1];
       mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
-
-      if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
-      if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
-      if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
-
       mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
       return mode_cost;
     }
   }
 }
 
-#if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
-static int get_interinter_compound_type_bits(BLOCK_SIZE bsize,
-                                             COMPOUND_TYPE comp_type) {
-  (void)bsize;
-  switch (comp_type) {
+static int get_interinter_compound_mask_rate(const MACROBLOCK *const x,
+                                             const MB_MODE_INFO *const mbmi) {
+  switch (mbmi->interinter_comp.type) {
     case COMPOUND_AVERAGE: return 0;
-#if CONFIG_WEDGE
-    case COMPOUND_WEDGE: return get_interinter_wedge_bits(bsize);
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-    case COMPOUND_SEG: return 1;
-#endif  // CONFIG_COMPOUND_SEGMENT
+    case COMPOUND_WEDGE:
+      return get_interinter_wedge_bits(mbmi->sb_type) > 0
+                 ? av1_cost_literal(1) +
+                       x->wedge_idx_cost[mbmi->sb_type]
+                                        [mbmi->interinter_comp.wedge_index]
+                 : 0;
+    case COMPOUND_DIFFWTD: return av1_cost_literal(1);
     default: assert(0); return 0;
   }
 }
-#endif  // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
 
 typedef struct {
   int eobs;
@@ -6508,13 +5983,8 @@ typedef struct {
   int_mv pred_mv[2];
   int_mv ref_mv[2];
 
-#if CONFIG_CHROMA_2X2
-  ENTROPY_CONTEXT ta[4];
-  ENTROPY_CONTEXT tl[4];
-#else
   ENTROPY_CONTEXT ta[2];
   ENTROPY_CONTEXT tl[2];
-#endif  // CONFIG_CHROMA_2X2
 } SEG_RDSTAT;
 
 typedef struct {
@@ -6527,12 +5997,7 @@ typedef struct {
   int64_t sse;
   int segment_yrate;
   PREDICTION_MODE modes[4];
-#if CONFIG_COMPOUND_SINGLEREF
-  SEG_RDSTAT rdstat[4][INTER_MODES + INTER_SINGLEREF_COMP_MODES +
-                       INTER_COMPOUND_MODES];
-#else   // !CONFIG_COMPOUND_SINGLEREF
   SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
-#endif  // CONFIG_COMPOUND_SINGLEREF
   int mvthresh;
 } BEST_SEG_INFO;
 
@@ -6543,149 +6008,103 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
          (mv->col >> 3) > mv_limits->col_max;
 }
 
-// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion.
-// TODO(aconverse): Find out if this is still productive then clean up or remove
-static int check_best_zero_mv(
-    const AV1_COMP *const cpi, const MACROBLOCK *const x,
-    const int16_t mode_context[TOTAL_REFS_PER_FRAME],
-    const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME],
-    int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode,
-    const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block,
-    int mi_row, int mi_col) {
-  int_mv zeromv[2] = { {.as_int = 0 } };
-#if CONFIG_GLOBAL_MOTION
-  int comp_pred_mode = ref_frames[1] > INTRA_FRAME;
-#endif
-  (void)mi_row;
-  (void)mi_col;
-  (void)cpi;
-#if CONFIG_GLOBAL_MOTION
-  if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) {
-    for (int cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) {
-      zeromv[cur_frm].as_int =
-          gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]],
-                               cpi->common.allow_high_precision_mv, bsize,
-                               mi_col, mi_row, block
-#if CONFIG_AMVR
-                               ,
-                               cpi->common.cur_frame_mv_precision_level
-#endif
-                               )
-              .as_int;
-    }
+static INLINE int get_single_mode(int this_mode, int ref_idx,
+                                  int is_comp_pred) {
+  int single_mode;
+  if (is_comp_pred) {
+    single_mode =
+        ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode);
+  } else {
+    single_mode = this_mode;
   }
-#endif  // CONFIG_GLOBAL_MOTION
-
-  if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
-      frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
-      (ref_frames[1] <= INTRA_FRAME ||
-       frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int)) {
-    int16_t rfc =
-        av1_mode_context_analyzer(mode_context, ref_frames, bsize, block);
-    int c1 = cost_mv_ref(x, NEARMV, rfc);
-    int c2 = cost_mv_ref(x, NEARESTMV, rfc);
-    int c3 = cost_mv_ref(x, ZEROMV, rfc);
+  return single_mode;
+}
 
+/* If the current mode shares the same mv with other modes with higher prority,
+ * skip this mode. This priority order is nearest > global > near. */
+static int skip_repeated_mv(const AV1_COMMON *const cm,
+                            const MACROBLOCK *const x, int this_mode,
+                            const MV_REFERENCE_FRAME ref_frames[2]) {
+  const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
+  const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  if (!is_comp_pred) {
     if (this_mode == NEARMV) {
-      if (c1 > c3) return 0;
-    } else if (this_mode == NEARESTMV) {
-      if (c2 > c3) return 0;
-    } else {
-      assert(this_mode == ZEROMV);
-      if (ref_frames[1] <= INTRA_FRAME) {
-        if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
-            (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
-          return 0;
-      } else {
-        if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
-             frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
-            (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
-             frame_mv[NEARMV][ref_frames[1]].as_int == 0))
-          return 0;
+      if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) {
+        // NEARMV has the same motion vector as NEARESTMV
+        return 1;
+      }
+      if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 &&
+          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+        // NEARMV has the same motion vector as GLOBALMV
+        return 1;
       }
     }
-  } else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
-              this_mode == ZERO_ZEROMV) &&
-             frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
-             frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) {
-    int16_t rfc = compound_mode_context[ref_frames[0]];
-    int c2 = cost_mv_ref(x, NEAREST_NEARESTMV, rfc);
-    int c3 = cost_mv_ref(x, ZERO_ZEROMV, rfc);
-    int c5 = cost_mv_ref(x, NEAR_NEARMV, rfc);
-
-    if (this_mode == NEAREST_NEARESTMV) {
-      if (c2 > c3) return 0;
-    } else if (this_mode == NEAR_NEARMV) {
-      if (c5 > c3) return 0;
-    } else {
-      assert(this_mode == ZERO_ZEROMV);
-      if ((c3 >= c2 && frame_mv[NEAREST_NEARESTMV][ref_frames[0]].as_int == 0 &&
-           frame_mv[NEAREST_NEARESTMV][ref_frames[1]].as_int == 0) ||
-          (c3 >= c5 && frame_mv[NEAR_NEARMV][ref_frames[0]].as_int == 0 &&
-           frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0))
-        return 0;
+    if (this_mode == GLOBALMV) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 &&
+          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+        // GLOBALMV has the same motion vector as NEARESTMV
+        return 1;
+      }
+    }
+  } else {
+    for (int i = 0; i < 2; ++i) {
+      const int single_mode = get_single_mode(this_mode, i, is_comp_pred);
+      if (single_mode == NEARMV) {
+        if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) {
+          // NEARMV has the same motion vector as NEARESTMV in compound mode
+          return 1;
+        }
+      }
+    }
+    if (this_mode == NEAR_NEARMV) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 &&
+          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION &&
+          cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) {
+        // NEAR_NEARMV has the same motion vector as GLOBAL_GLOBALMV
+        return 1;
+      }
+    }
+    if (this_mode == GLOBAL_GLOBALMV) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 &&
+          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION &&
+          cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) {
+        // GLOBAL_GLOBALMV has the same motion vector as NEARST_NEARSTMV
+        return 1;
+      }
     }
   }
-  return 1;
+  return 0;
 }
 
 static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
-                                BLOCK_SIZE bsize, int_mv *frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-                                int_mv *frame_comp_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-                                int mi_row, int mi_col,
-                                int_mv *ref_mv_sub8x8[2], const uint8_t *mask,
-                                int mask_stride, int *rate_mv,
-                                const int block) {
+                                BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row,
+                                int mi_col, int_mv *ref_mv_sub8x8[2],
+                                const uint8_t *mask, int mask_stride,
+                                int *rate_mv, const int block) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   const int pw = block_size_wide[bsize];
   const int ph = block_size_high[bsize];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-// This function should only ever be called for compound modes
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi)) {
-    assert(is_inter_singleref_comp_mode(mbmi->mode));
-    assert(frame_comp_mv);
-  }
-  assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode));
-  const int refs[2] = { mbmi->ref_frame[0],
-                        has_second_ref(mbmi) ? mbmi->ref_frame[1]
-                                             : mbmi->ref_frame[0] };
-#else
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  // This function should only ever be called for compound modes
   assert(has_second_ref(mbmi));
   const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
-#endif  // CONFIG_COMPOUND_SINGLEREF
   int_mv ref_mv[2];
   int ite, ref;
-  struct scale_factors sf;
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
   // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
   const int ic = block & 1;
   const int ir = (block - ic) >> 1;
   struct macroblockd_plane *const pd = &xd->plane[0];
   const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
   const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
-#if CONFIG_GLOBAL_MOTION
   int is_global[2];
-#if CONFIG_COMPOUND_SINGLEREF
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
-#else
-  for (ref = 0; ref < 2; ++ref)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  {
-    WarpedMotionParams *const wm =
-        &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]];
-    is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype);
-  }
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi)) is_global[1] = is_global[0];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#endif  // CONFIG_GLOBAL_MOTION
-#else   // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-  (void)block;
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+  for (ref = 0; ref < 2; ++ref) {
+    const WarpedMotionParams *const wm =
+        &xd->global_motion[xd->mi[0]->ref_frame[ref]];
+    is_global[ref] = is_global_mv_block(xd->mi[0], wm->wmtype);
+  }
 
   // Do joint motion search in compound mode to get more accurate mv.
   struct buf_2d backup_yv12[2][MAX_MB_PLANE];
@@ -6695,82 +6114,14 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
     av1_get_scaled_ref_frame(cpi, refs[1])
   };
 
-// Prediction buffer from second frame.
-#if CONFIG_HIGHBITDEPTH
+  // Prediction buffer from second frame.
   DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
   uint8_t *second_pred;
-#else
-  DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_CB4X4
   (void)ref_mv_sub8x8;
-#endif  // CONFIG_CB4X4
-
-#if CONFIG_COMPOUND_SINGLEREF
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
-#else
-  for (ref = 0; ref < 2; ++ref)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  {
-#if !CONFIG_CB4X4
-    if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL)
-      ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int;
-    else
-#endif  // !CONFIG_CB4X4
-      ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
-
-    if (scaled_ref_frame[ref]) {
-      int i;
-      // Swap out the reference frame for a version that's been scaled to
-      // match the resolution of the current frame, allowing the existing
-      // motion search code to be used without additional modifications.
-      for (i = 0; i < MAX_MB_PLANE; i++)
-        backup_yv12[ref][i] = xd->plane[i].pre[ref];
-      av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
-                           NULL);
-    }
-  }
 
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi)) {
-    assert(is_inter_singleref_comp_mode(mbmi->mode));
-    // NOTE: For single ref comp mode, set up the 2nd set of ref_mv/pre_planes
-    //       all from the 1st reference frame, i.e. refs[0].
-    ref_mv[1] = x->mbmi_ext->ref_mvs[refs[0]][0];
-    if (scaled_ref_frame[0]) {
-      int i;
-      // Swap out the reference frame for a version that's been scaled to
-      // match the resolution of the current frame, allowing the existing
-      // motion search code to be used without additional modifications.
-      for (i = 0; i < MAX_MB_PLANE; i++)
-        backup_yv12[1][i] = xd->plane[i].pre[1];
-      av1_setup_pre_planes(xd, 1, scaled_ref_frame[0], mi_row, mi_col, NULL);
-    }
-  }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-// Since we have scaled the reference frames to match the size of the current
-// frame we must use a unit scaling factor during mode selection.
-#if CONFIG_HIGHBITDEPTH
-  av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
-                                    cm->height, cm->use_highbitdepth);
-#else
-  av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
-                                    cm->height);
-#endif  // CONFIG_HIGHBITDEPTH
-
-// Allow joint search multiple times iteratively for each reference frame
-// and break out of the search loop if it couldn't find a better mv.
-#if CONFIG_COMPOUND_SINGLEREF
-  const int num_ites =
-      (has_second_ref(mbmi) || mbmi->mode == SR_NEW_NEWMV) ? 4 : 1;
-  const int start_ite = has_second_ref(mbmi) ? 0 : 1;
-  for (ite = start_ite; ite < (start_ite + num_ites); ite++)
-#else
-  for (ite = 0; ite < 4; ite++)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  {
+  // Allow joint search multiple times iteratively for each reference frame
+  // and break out of the search loop if it couldn't find a better mv.
+  for (ite = 0; ite < 4; ite++) {
     struct buf_2d ref_yv12[2];
     int bestsme = INT_MAX;
     int sadpb = x->sadperbit16;
@@ -6782,84 +6133,78 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
                        // odd iterations search in the second. The predictor
                        // found for the 'other' reference frame is factored in.
     const int plane = 0;
-    ConvolveParams conv_params = get_conv_params(!id, 0, plane);
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+    ConvolveParams conv_params = get_conv_params(!id, 0, plane, xd->bd);
+    conv_params.use_jnt_comp_avg = 0;
     WarpTypesAllowed warp_types;
-#if CONFIG_GLOBAL_MOTION
     warp_types.global_warp_allowed = is_global[!id];
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
     warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
-#endif  // CONFIG_WARPED_MOTION
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
 
-    // Initialized here because of compiler problem in Visual Studio.
+    for (ref = 0; ref < 2; ++ref) {
+      ref_mv[ref] = av1_get_ref_mv(x, ref);
+      // Swap out the reference frame for a version that's been scaled to
+      // match the resolution of the current frame, allowing the existing
+      // motion search code to be used without additional modifications.
+      if (scaled_ref_frame[ref]) {
+        int i;
+        for (i = 0; i < num_planes; i++)
+          backup_yv12[ref][i] = xd->plane[i].pre[ref];
+        av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+                             NULL, num_planes);
+      }
+    }
+
+    assert(IMPLIES(scaled_ref_frame[0] != NULL,
+                   cm->width == scaled_ref_frame[0]->y_crop_width &&
+                       cm->height == scaled_ref_frame[0]->y_crop_height));
+    assert(IMPLIES(scaled_ref_frame[1] != NULL,
+                   cm->width == scaled_ref_frame[1]->y_crop_width &&
+                       cm->height == scaled_ref_frame[1]->y_crop_height));
+
+    // Initialize based on (possibly scaled) prediction buffers.
     ref_yv12[0] = xd->plane[plane].pre[0];
     ref_yv12[1] = xd->plane[plane].pre[1];
 
-// Get the prediction block from the 'other' reference frame.
-#if CONFIG_COMPOUND_SINGLEREF
-    MV *const the_other_mv = (has_second_ref(mbmi) || id)
-                                 ? &frame_mv[refs[!id]].as_mv
-                                 : &frame_comp_mv[refs[0]].as_mv;
-#endif  // CONFIG_COMPOUND_SINGLEREF
+    // Get the prediction block from the 'other' reference frame.
+    InterpFilters interp_filters = EIGHTTAP_REGULAR;
 
-#if CONFIG_HIGHBITDEPTH
+    // Since we have scaled the reference frames to match the size of the
+    // current frame we must use a unit scaling factor during mode selection.
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
       av1_highbd_build_inter_predictor(
           ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
-#if CONFIG_COMPOUND_SINGLEREF
-          the_other_mv,
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-          &frame_mv[refs[!id]].as_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-          &sf, pw, ph, 0, mbmi->interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-          &warp_types, p_col, p_row,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-          plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
+          &cur_mv[!id].as_mv, &cm->sf_identity, pw, ph, 0, interp_filters,
+          &warp_types, p_col, p_row, plane, MV_PRECISION_Q3, mi_col * MI_SIZE,
+          mi_row * MI_SIZE, xd, cm->allow_warped_motion);
     } else {
       second_pred = (uint8_t *)second_pred_alloc_16;
-#endif  // CONFIG_HIGHBITDEPTH
-      av1_build_inter_predictor(
-          ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
-#if CONFIG_COMPOUND_SINGLEREF
-          the_other_mv,
-#else   // !(CONFIG_COMPOUND_SINGLEREF)
-        &frame_mv[refs[!id]].as_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-          &sf, pw, ph, &conv_params, mbmi->interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-          &warp_types, p_col, p_row, plane, !id,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-          MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
-#if CONFIG_HIGHBITDEPTH
-    }
-#endif  // CONFIG_HIGHBITDEPTH
-
-    // Do compound motion search on the current reference frame.
+      av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
+                                second_pred, pw, &cur_mv[!id].as_mv,
+                                &cm->sf_identity, pw, ph, &conv_params,
+                                interp_filters, &warp_types, p_col, p_row,
+                                plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE,
+                                mi_row * MI_SIZE, xd, cm->allow_warped_motion);
+    }
+
+    const int order_idx = id != 0;
+    av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
+                               &xd->jcp_param.bck_offset,
+                               &xd->jcp_param.use_jnt_comp_avg, 1);
+
+    // Do full-pixel compound motion search on the current reference frame.
     if (id) xd->plane[plane].pre[0] = ref_yv12[id];
     av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
 
-// Use the mv result from the single mode as mv predictor.
-// Use the mv result from the single mode as mv predictor.
-#if CONFIG_COMPOUND_SINGLEREF
-    if (!has_second_ref(mbmi) && id)
-      *best_mv = frame_comp_mv[refs[0]].as_mv;
-    else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      *best_mv = frame_mv[refs[id]].as_mv;
+    // Use the mv result from the single mode as mv predictor.
+    // Use the mv result from the single mode as mv predictor.
+    *best_mv = cur_mv[id].as_mv;
 
     best_mv->col >>= 3;
     best_mv->row >>= 3;
 
-#if CONFIG_COMPOUND_SINGLEREF
-    if (!has_second_ref(mbmi))
-      av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
-    else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx);
+    av1_set_mvcost(
+        x, id,
+        mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
 
     // Small-range full-pixel motion search.
     bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
@@ -6877,42 +6222,44 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
     x->mv_limits = tmp_mv_limits;
 
-#if CONFIG_AMVR
-    if (cpi->common.cur_frame_mv_precision_level) {
+    // Restore the pointer to the first (possibly scaled) prediction buffer.
+    if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+
+    for (ref = 0; ref < 2; ++ref) {
+      if (scaled_ref_frame[ref]) {
+        // Swap back the original buffers for subpel motion search.
+        for (int i = 0; i < num_planes; i++) {
+          xd->plane[i].pre[ref] = backup_yv12[ref][i];
+        }
+        // Re-initialize based on unscaled prediction buffers.
+        ref_yv12[ref] = xd->plane[plane].pre[ref];
+      }
+    }
+
+    // Do sub-pixel compound motion search on the current reference frame.
+    if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+
+    if (cpi->common.cur_frame_force_integer_mv) {
       x->best_mv.as_mv.row *= 8;
       x->best_mv.as_mv.col *= 8;
     }
-    if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0)
-#else
-    if (bestsme < INT_MAX)
-#endif
-    {
+    if (bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0) {
       int dis; /* TODO: use dis in distortion calculation later. */
       unsigned int sse;
       bestsme = cpi->find_fractional_mv_step(
-          x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
-          x->errorperbit, &cpi->fn_ptr[bsize], 0,
-          cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
-          &dis, &sse, second_pred, mask, mask_stride, id, pw, ph,
-          cpi->sf.use_upsampled_references);
+          x, cm, mi_row, mi_col, &ref_mv[id].as_mv,
+          cpi->common.allow_high_precision_mv, x->errorperbit,
+          &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
+          x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask,
+          mask_stride, id, pw, ph, cpi->sf.use_accurate_subpel_search);
     }
 
-    // Restore the pointer to the first (possibly scaled) prediction buffer.
+    // Restore the pointer to the first prediction buffer.
     if (id) xd->plane[plane].pre[0] = ref_yv12[0];
 
     if (bestsme < last_besterr[id]) {
-#if CONFIG_COMPOUND_SINGLEREF
-      // NOTE: For single ref comp mode, frame_mv stores the first mv and
-      //       frame_comp_mv stores the second mv.
-      if (!has_second_ref(mbmi) && id)
-        frame_comp_mv[refs[0]].as_mv = *best_mv;
-      else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-        frame_mv[refs[id]].as_mv = *best_mv;
+      cur_mv[id].as_mv = *best_mv;
       last_besterr[id] = bestsme;
-#if CONFIG_COMPOUND_SINGLEREF
-      if (!has_second_ref(mbmi)) last_besterr[!id] = last_besterr[id];
-#endif  // CONFIG_COMPOUND_SINGLEREF
     } else {
       break;
     }
@@ -6920,216 +6267,124 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
   *rate_mv = 0;
 
-#if CONFIG_COMPOUND_SINGLEREF
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
-#else
-  for (ref = 0; ref < 2; ++ref)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  {
-    if (scaled_ref_frame[ref]) {
-      // Restore the prediction frame pointers to their unscaled versions.
-      int i;
-      for (i = 0; i < MAX_MB_PLANE; i++)
-        xd->plane[i].pre[ref] = backup_yv12[ref][i];
-    }
-
-#if CONFIG_COMPOUND_SINGLEREF
-    if (!has_second_ref(mbmi))
-      av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
-    else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx);
-
-#if CONFIG_COMPOUND_SINGLEREF
-    if (!has_second_ref(mbmi)) {
-      // NOTE: For single ref comp mode, i.e. !has_second_ref(mbmi) is true, the
-      //       first mv is stored in frame_mv[] and the second mv is stored in
-      //       frame_comp_mv[].
-      if (compound_ref0_mode(mbmi->mode) == NEWMV)  // SR_NEW_NEWMV
-        *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                                    &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
-                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-      assert(compound_ref1_mode(mbmi->mode) == NEWMV);
-      *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv,
-                                  &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
-                                  x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-    } else {
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if !CONFIG_CB4X4
-      if (bsize >= BLOCK_8X8)
-#endif  // !CONFIG_CB4X4
-        *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
-                                    &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
-                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-#if !CONFIG_CB4X4
-      else
-        *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
-                                    &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost,
-                                    x->mvcost, MV_COST_WEIGHT);
-#endif  // !CONFIG_CB4X4
-#if CONFIG_COMPOUND_SINGLEREF
-    }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  }
+  for (ref = 0; ref < 2; ++ref) {
+    av1_set_mvcost(
+        x, ref,
+        mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
 
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi)) {
-    if (scaled_ref_frame[0]) {
-      // Restore the prediction frame pointers to their unscaled versions.
-      int i;
-      for (i = 0; i < MAX_MB_PLANE; i++)
-        xd->plane[i].pre[1] = backup_yv12[1][i];
-    }
+    const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
+    *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
+                                x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   }
-#endif  // CONFIG_COMPOUND_SINGLEREF
 }
 
 static void estimate_ref_frame_costs(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
-    unsigned int *ref_costs_single,
-#if CONFIG_EXT_COMP_REFS
-    unsigned int (*ref_costs_comp)[TOTAL_REFS_PER_FRAME],
-#else
-    unsigned int *ref_costs_comp,
-#endif  // CONFIG_EXT_COMP_REFS
-    aom_prob *comp_mode_p) {
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x,
+    int segment_id, unsigned int *ref_costs_single,
+    unsigned int (*ref_costs_comp)[REF_FRAMES]) {
   int seg_ref_active =
       segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
   if (seg_ref_active) {
-    memset(ref_costs_single, 0,
-           TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_single));
-#if CONFIG_EXT_COMP_REFS
+    memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
     int ref_frame;
-    for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame)
+    for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
       memset(ref_costs_comp[ref_frame], 0,
-             TOTAL_REFS_PER_FRAME * sizeof((*ref_costs_comp)[0]));
-#else
-    memset(ref_costs_comp, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_comp));
-#endif  // CONFIG_EXT_COMP_REFS
-
-    *comp_mode_p = 128;
+             REF_FRAMES * sizeof((*ref_costs_comp)[0]));
   } else {
-    aom_prob intra_inter_p = av1_get_intra_inter_prob(cm, xd);
-    aom_prob comp_inter_p = 128;
-
-    if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-      comp_inter_p = av1_get_reference_mode_prob(cm, xd);
-      *comp_mode_p = comp_inter_p;
-    } else {
-      *comp_mode_p = 128;
-    }
-
-    ref_costs_single[INTRA_FRAME] = av1_cost_bit(intra_inter_p, 0);
-
-    if (cm->reference_mode != COMPOUND_REFERENCE) {
-      aom_prob ref_single_p1 = av1_get_pred_prob_single_ref_p1(cm, xd);
-      aom_prob ref_single_p2 = av1_get_pred_prob_single_ref_p2(cm, xd);
-#if CONFIG_EXT_REFS
-      aom_prob ref_single_p3 = av1_get_pred_prob_single_ref_p3(cm, xd);
-      aom_prob ref_single_p4 = av1_get_pred_prob_single_ref_p4(cm, xd);
-      aom_prob ref_single_p5 = av1_get_pred_prob_single_ref_p5(cm, xd);
-      aom_prob ref_single_p6 = av1_get_pred_prob_single_ref_p6(cm, xd);
-#endif  // CONFIG_EXT_REFS
-
-      unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
-
-      ref_costs_single[LAST_FRAME] =
-#if CONFIG_EXT_REFS
-          ref_costs_single[LAST2_FRAME] = ref_costs_single[LAST3_FRAME] =
-              ref_costs_single[BWDREF_FRAME] = ref_costs_single[ALTREF2_FRAME] =
-#endif  // CONFIG_EXT_REFS
-                  ref_costs_single[GOLDEN_FRAME] =
-                      ref_costs_single[ALTREF_FRAME] = base_cost;
-
-#if CONFIG_EXT_REFS
-      ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
-      ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p1, 0);
-      ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p1, 0);
-      ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 0);
-      ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
-      ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p1, 1);
-      ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
-
-      ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p3, 0);
-      ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p3, 0);
-      ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p3, 1);
-      ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p3, 1);
-
-      ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p2, 0);
-      ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p2, 0);
-      ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
-
-      ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p4, 0);
-      ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p4, 1);
-
-      ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p5, 0);
-      ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p5, 1);
-
-      ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p6, 0);
-      ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p6, 1);
-#else   // !CONFIG_EXT_REFS
-      ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
-      ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1);
-      ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
-
-      ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p2, 0);
-      ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
-#endif  // CONFIG_EXT_REFS
-    } else {
-      ref_costs_single[LAST_FRAME] = 512;
-#if CONFIG_EXT_REFS
-      ref_costs_single[LAST2_FRAME] = 512;
-      ref_costs_single[LAST3_FRAME] = 512;
-      ref_costs_single[BWDREF_FRAME] = 512;
-      ref_costs_single[ALTREF2_FRAME] = 512;
-#endif  // CONFIG_EXT_REFS
-      ref_costs_single[GOLDEN_FRAME] = 512;
-      ref_costs_single[ALTREF_FRAME] = 512;
-    }
+    int intra_inter_ctx = av1_get_intra_inter_context(xd);
+    ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0];
+    unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
+
+    for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+      ref_costs_single[i] = base_cost;
+
+    const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd);
+    const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd);
+    const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd);
+    const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd);
+    const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd);
+    const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd);
+
+    // Determine cost of a single ref frame, where frame types are represented
+    // by a tree:
+    // Level 0: add cost whether this ref is a forward or backward ref
+    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+
+    // Level 1: if this ref is forward ref,
+    // add cost whether it is last/last2 or last3/golden
+    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0];
+    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0];
+    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+
+    // Level 1: if this ref is backward ref
+    // then add cost whether this ref is altref or backward ref
+    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0];
+    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0];
+    ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1];
+
+    // Level 2: further add cost whether this ref is last or last2
+    ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0];
+    ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1];
+
+    // Level 2: last3 or golden
+    ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0];
+    ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1];
+
+    // Level 2: bwdref or altref2
+    ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0];
+    ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1];
 
     if (cm->reference_mode != SINGLE_REFERENCE) {
-      aom_prob ref_comp_p = av1_get_pred_prob_comp_ref_p(cm, xd);
-#if CONFIG_EXT_REFS
-      aom_prob ref_comp_p1 = av1_get_pred_prob_comp_ref_p1(cm, xd);
-      aom_prob ref_comp_p2 = av1_get_pred_prob_comp_ref_p2(cm, xd);
-      aom_prob bwdref_comp_p = av1_get_pred_prob_comp_bwdref_p(cm, xd);
-      aom_prob bwdref_comp_p1 = av1_get_pred_prob_comp_bwdref_p1(cm, xd);
-#endif  // CONFIG_EXT_REFS
-
-      unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
+      // Similar to single ref, determine cost of compound ref frames.
+      // cost_compound_refs = cost_first_ref + cost_second_ref
+      const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd);
+      const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd);
+      const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd);
+      const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd);
+      const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd);
 
-#if CONFIG_EXT_COMP_REFS
-      aom_prob comp_ref_type_p = av1_get_comp_reference_type_prob(cm, xd);
-      unsigned int ref_bicomp_costs[TOTAL_REFS_PER_FRAME] = { 0 };
+      const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+      unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 };
 
       ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
           ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
-#if USE_UNI_COMP_REFS
-              base_cost + av1_cost_bit(comp_ref_type_p, 1);
-#else
-              base_cost;
-#endif  // USE_UNI_COMP_REFS
+              base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1];
       ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
       ref_bicomp_costs[ALTREF_FRAME] = 0;
 
-      ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
-      ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0);
-      ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1);
-      ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
+      // cost of first ref frame
+      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
+      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
+      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
+      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
 
-      ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1);
-      ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0);
+      ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0];
+      ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1];
 
-      ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0);
-      ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1);
+      ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0];
+      ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1];
 
-      ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
-      ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
-      ref_bicomp_costs[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1);
+      // cost of second ref frame
+      ref_bicomp_costs[BWDREF_FRAME] +=
+          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+      ref_bicomp_costs[ALTREF2_FRAME] +=
+          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+      ref_bicomp_costs[ALTREF_FRAME] +=
+          x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
 
-      ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0);
-      ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1);
+      ref_bicomp_costs[BWDREF_FRAME] +=
+          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
+      ref_bicomp_costs[ALTREF2_FRAME] +=
+          x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
 
+      // cost: if one ref frame is forward ref, the other ref is backward ref
       int ref0, ref1;
       for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
         for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
@@ -7138,66 +6393,28 @@ static void estimate_ref_frame_costs(
         }
       }
 
-      aom_prob uni_comp_ref_p = av1_get_pred_prob_uni_comp_ref_p(cm, xd);
-      aom_prob uni_comp_ref_p1 = av1_get_pred_prob_uni_comp_ref_p1(cm, xd);
-      aom_prob uni_comp_ref_p2 = av1_get_pred_prob_uni_comp_ref_p2(cm, xd);
-
+      // cost: if both ref frames are the same side.
+      const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd);
+      const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
+      const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
       ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
-          base_cost + av1_cost_bit(comp_ref_type_p, 0) +
-          av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 0);
+          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
       ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
-          base_cost + av1_cost_bit(comp_ref_type_p, 0) +
-          av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) +
-          av1_cost_bit(uni_comp_ref_p2, 0);
+          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
       ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
-          base_cost + av1_cost_bit(comp_ref_type_p, 0) +
-          av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) +
-          av1_cost_bit(uni_comp_ref_p2, 1);
-
+          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
       ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
-          base_cost + av1_cost_bit(comp_ref_type_p, 0) +
-          av1_cost_bit(uni_comp_ref_p, 1);
-
-#else  // !CONFIG_EXT_COMP_REFS
-
-      ref_costs_comp[LAST_FRAME] =
-#if CONFIG_EXT_REFS
-          ref_costs_comp[LAST2_FRAME] = ref_costs_comp[LAST3_FRAME] =
-#endif  // CONFIG_EXT_REFS
-              ref_costs_comp[GOLDEN_FRAME] = base_cost;
-
-#if CONFIG_EXT_REFS
-      ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF2_FRAME] =
-          ref_costs_comp[ALTREF_FRAME] = 0;
-#endif  // CONFIG_EXT_REFS
-
-#if CONFIG_EXT_REFS
-      ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
-      ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0);
-      ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1);
-      ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
-
-      ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1);
-      ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0);
-
-      ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0);
-      ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1);
-
-      // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1
-      //               more bit.
-      ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
-      ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
-      ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1);
-
-      ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0);
-      ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1);
-#else   // !CONFIG_EXT_REFS
-      ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
-      ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
-#endif  // CONFIG_EXT_REFS
-#endif  // CONFIG_EXT_COMP_REFS
+          base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+          x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
     } else {
-#if CONFIG_EXT_COMP_REFS
       int ref0, ref1;
       for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
         for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
@@ -7207,17 +6424,6 @@ static void estimate_ref_frame_costs(
       ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
       ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512;
       ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512;
-#else  // !CONFIG_EXT_COMP_REFS
-      ref_costs_comp[LAST_FRAME] = 512;
-#if CONFIG_EXT_REFS
-      ref_costs_comp[LAST2_FRAME] = 512;
-      ref_costs_comp[LAST3_FRAME] = 512;
-      ref_costs_comp[BWDREF_FRAME] = 512;
-      ref_costs_comp[ALTREF2_FRAME] = 512;
-      ref_costs_comp[ALTREF_FRAME] = 512;
-#endif  // CONFIG_EXT_REFS
-      ref_costs_comp[GOLDEN_FRAME] = 512;
-#endif  // CONFIG_EXT_COMP_REFS
     }
   }
 }
@@ -7240,17 +6446,15 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
 }
 
-static void setup_buffer_inter(
+static void setup_buffer_ref_mvs_inter(
     const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
     BLOCK_SIZE block_size, int mi_row, int mi_col,
-    int_mv frame_nearest_mv[TOTAL_REFS_PER_FRAME],
-    int_mv frame_near_mv[TOTAL_REFS_PER_FRAME],
-    struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]) {
+    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
   const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mi = xd->mi[0];
-  int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
 
@@ -7258,35 +6462,20 @@ static void setup_buffer_inter(
 
   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   // use the UV scaling factors.
-  av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
+  av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
+                       num_planes);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
-  av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
-                   mbmi_ext->ref_mv_stack[ref_frame],
-                   mbmi_ext->compound_mode_context, candidates, mi_row, mi_col,
-                   NULL, NULL, mbmi_ext->mode_context);
-
-// Candidate refinement carried out at encoder and decoder
-#if CONFIG_AMVR
-  av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
-                        &frame_nearest_mv[ref_frame], &frame_near_mv[ref_frame],
-                        cm->cur_frame_mv_precision_level);
-#else
-  av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
-                        &frame_nearest_mv[ref_frame],
-                        &frame_near_mv[ref_frame]);
-#endif
-// Further refinement that is encode side only to test the top few candidates
-// in full and choose the best as the centre point for subsequent searches.
-// The current implementation doesn't support scaling.
-#if CONFIG_CB4X4
+  av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+                   mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+                   mi_col, mbmi_ext->mode_context);
+
+  // Further refinement that is encode side only to test the top few candidates
+  // in full and choose the best as the centre point for subsequent searches.
+  // The current implementation doesn't support scaling.
+  (void)block_size;
   av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
               block_size);
-#else
-  if (!av1_is_scaled(sf) && block_size >= BLOCK_8X8)
-    av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
-                block_size);
-#endif  // CONFIG_CB4X4
 }
 
 static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -7294,19 +6483,15 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                                  int ref_idx, int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   const AV1_COMMON *cm = &cpi->common;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const int num_planes = av1_num_planes(cm);
+  MB_MODE_INFO *mbmi = xd->mi[0];
   struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
   int bestsme = INT_MAX;
   int step_param;
   int sadpb = x->sadperbit16;
   MV mvp_full;
-#if CONFIG_COMPOUND_SINGLEREF
-  int ref =
-      has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0];
-#else   // !CONFIG_COMPOUND_SINGLEREF
   int ref = mbmi->ref_frame[ref_idx];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
 
   MvLimits tmp_mv_limits = x->mv_limits;
   int cost_list[5];
@@ -7314,25 +6499,21 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
   const YV12_BUFFER_CONFIG *scaled_ref_frame =
       av1_get_scaled_ref_frame(cpi, ref);
 
-  MV pred_mv[3];
-  pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
-  pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
-  pred_mv[2] = x->pred_mv[ref];
-
   if (scaled_ref_frame) {
-    int i;
     // Swap out the reference frame for a version that's been scaled to
     // match the resolution of the current frame, allowing the existing
-    // motion search code to be used without additional modifications.
-    for (i = 0; i < MAX_MB_PLANE; i++)
+    // full-pixel motion search code to be used without additional
+    // modifications.
+    for (int i = 0; i < num_planes; i++) {
       backup_yv12[i] = xd->plane[i].pre[ref_idx];
-
-    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
+    }
+    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+                         num_planes);
   }
 
-  av1_set_mv_search_range(&x->mv_limits, &ref_mv);
-
-  av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
+  av1_set_mvcost(
+      x, ref_idx,
+      mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
 
   // Work out the size of the first step in the mv step search.
   // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc.
@@ -7347,16 +6528,16 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
     step_param = cpi->mv_step_param;
   }
 
-  if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
+  if (cpi->sf.adaptive_motion_search && bsize < cm->seq_params.sb_size) {
     int boffset =
-        2 * (b_width_log2_lookup[cm->sb_size] -
-             AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+        2 * (mi_size_wide_log2[cm->seq_params.sb_size] -
+             AOMMIN(mi_size_high_log2[bsize], mi_size_wide_log2[bsize]));
     step_param = AOMMAX(step_param, boffset);
   }
 
   if (cpi->sf.adaptive_motion_search) {
-    int bwl = b_width_log2_lookup[bsize];
-    int bhl = b_height_log2_lookup[bsize];
+    int bwl = mi_size_wide_log2[bsize];
+    int bhl = mi_size_high_log2[bsize];
     int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
 
     if (tlevel < 5) {
@@ -7374,8 +6555,8 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
           x->best_mv.as_int = INVALID_MV;
 
           if (scaled_ref_frame) {
-            int j;
-            for (j = 0; j < MAX_MB_PLANE; ++j)
+            // Swap back the original buffers before returning.
+            for (int j = 0; j < num_planes; ++j)
               xd->plane[j].pre[ref_idx] = backup_yv12[j];
           }
           return;
@@ -7384,35 +6565,26 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
   }
 
+  // Note: MV limits are modified here. Always restore the original values
+  // after full-pixel motion search.
   av1_set_mv_search_range(&x->mv_limits, &ref_mv);
 
-#if CONFIG_MOTION_VAR
   if (mbmi->motion_mode != SIMPLE_TRANSLATION)
     mvp_full = mbmi->mv[0].as_mv;
   else
-#endif  // CONFIG_MOTION_VAR
-    mvp_full = pred_mv[x->mv_best_ref_index[ref]];
+    mvp_full = ref_mv;
 
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
   x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
 
-#if CONFIG_MOTION_VAR
   switch (mbmi->motion_mode) {
     case SIMPLE_TRANSLATION:
-#endif  // CONFIG_MOTION_VAR
-#if CONFIG_HASH_ME
       bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
                                       sadpb, cond_cost_list(cpi, cost_list),
                                       &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col),
                                       (MI_SIZE * mi_row), 0);
-#else
-  bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
-                                  cond_cost_list(cpi, cost_list), &ref_mv,
-                                  INT_MAX, 1);
-#endif
-#if CONFIG_MOTION_VAR
       break;
     case OBMC_CAUSAL:
       bestsme = av1_obmc_full_pixel_diamond(
@@ -7422,25 +6594,27 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
       break;
     default: assert(0 && "Invalid motion mode!\n");
   }
-#endif  // CONFIG_MOTION_VAR
+
+  if (scaled_ref_frame) {
+    // Swap back the original buffers for subpel motion search.
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[ref_idx] = backup_yv12[i];
+    }
+  }
 
   x->mv_limits = tmp_mv_limits;
 
-#if CONFIG_AMVR
-  if (cpi->common.cur_frame_mv_precision_level) {
+  if (cpi->common.cur_frame_force_integer_mv) {
     x->best_mv.as_mv.row *= 8;
     x->best_mv.as_mv.col *= 8;
   }
-  if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) {
-#else
-  if (bestsme < INT_MAX) {
-#endif
+  const int use_fractional_mv =
+      bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0;
+  if (use_fractional_mv) {
     int dis; /* TODO: use dis in distortion calculation later. */
-#if CONFIG_MOTION_VAR
     switch (mbmi->motion_mode) {
       case SIMPLE_TRANSLATION:
-#endif  // CONFIG_MOTION_VAR
-        if (cpi->sf.use_upsampled_references) {
+        if (cpi->sf.use_accurate_subpel_search) {
           int best_mv_var;
           const int try_second = x->second_best_mv.as_int != INVALID_MV &&
                                  x->second_best_mv.as_int != x->best_mv.as_int;
@@ -7448,8 +6622,8 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
           const int ph = block_size_high[bsize];
 
           best_mv_var = cpi->find_fractional_mv_step(
-              x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
-              &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+              x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+              x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
               cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
               x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
               0, 0, pw, ph, 1);
@@ -7472,8 +6646,9 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                 x->best_mv.as_mv.col * 8 <= maxc &&
                 x->best_mv.as_mv.col * 8 >= minc) {
               this_var = cpi->find_fractional_mv_step(
-                  x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
-                  &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+                  x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+                  x->errorperbit, &cpi->fn_ptr[bsize],
+                  cpi->sf.mv.subpel_force_stop,
                   cpi->sf.mv.subpel_iters_per_step,
                   cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
                   &dis, &x->pred_sse[ref], NULL, NULL, 0, 0, pw, ph, 1);
@@ -7483,45 +6658,35 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
           }
         } else {
           cpi->find_fractional_mv_step(
-              x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
-              &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+              x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+              x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
               cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
               x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
               0, 0, 0, 0, 0);
         }
-#if CONFIG_MOTION_VAR
         break;
       case OBMC_CAUSAL:
         av1_find_best_obmc_sub_pixel_tree_up(
-            x, &x->best_mv.as_mv, &ref_mv, cm->allow_high_precision_mv,
-            x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-            cpi->sf.mv.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis,
-            &x->pred_sse[ref], 0, cpi->sf.use_upsampled_references);
+            x, cm, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv,
+            cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
+            cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
+            x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0,
+            cpi->sf.use_accurate_subpel_search);
         break;
       default: assert(0 && "Invalid motion mode!\n");
     }
-#endif  // CONFIG_MOTION_VAR
   }
   *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmvjointcost,
                              x->mvcost, MV_COST_WEIGHT);
 
-#if CONFIG_MOTION_VAR
   if (cpi->sf.adaptive_motion_search && mbmi->motion_mode == SIMPLE_TRANSLATION)
-#else
-  if (cpi->sf.adaptive_motion_search)
-#endif  // CONFIG_MOTION_VAR
     x->pred_mv[ref] = x->best_mv.as_mv;
-
-  if (scaled_ref_frame) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      xd->plane[i].pre[ref_idx] = backup_yv12[i];
-  }
 }
 
-static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst) {
+static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst,
+                                   const int num_planes) {
   int i;
-  for (i = 0; i < MAX_MB_PLANE; i++) {
+  for (i = 0; i < num_planes; i++) {
     xd->plane[i].dst.buf = dst.plane[i];
     xd->plane[i].dst.stride = dst.stride[i];
   }
@@ -7535,106 +6700,50 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
   const int pw = block_size_wide[bsize];
   const int ph = block_size_high[bsize];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_COMPOUND_SINGLEREF
-  const int other_ref =
-      has_second_ref(mbmi) ? mbmi->ref_frame[!ref_idx] : mbmi->ref_frame[0];
-#else  // !CONFIG_COMPOUND_SINGLEREF
+  MB_MODE_INFO *mbmi = xd->mi[0];
   const int other_ref = mbmi->ref_frame[!ref_idx];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  struct scale_factors sf;
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
   struct macroblockd_plane *const pd = &xd->plane[0];
   // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
   const int ic = block & 1;
   const int ir = (block - ic) >> 1;
   const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
   const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
-#if CONFIG_GLOBAL_MOTION
-  WarpedMotionParams *const wm = &xd->global_motion[other_ref];
-  int is_global = is_global_mv_block(xd->mi[0], block, wm->wmtype);
-#endif  // CONFIG_GLOBAL_MOTION
-#else
-  (void)block;
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+  const WarpedMotionParams *const wm = &xd->global_motion[other_ref];
+  int is_global = is_global_mv_block(xd->mi[0], wm->wmtype);
 
-// This function should only ever be called for compound modes
-#if CONFIG_COMPOUND_SINGLEREF
-  assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode));
-#else   // !CONFIG_COMPOUND_SINGLEREF
+  // This function should only ever be called for compound modes
   assert(has_second_ref(mbmi));
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-  struct buf_2d backup_yv12[MAX_MB_PLANE];
-  const YV12_BUFFER_CONFIG *const scaled_ref_frame =
-      av1_get_scaled_ref_frame(cpi, other_ref);
 
-  if (scaled_ref_frame) {
-    int i;
-    // Swap out the reference frame for a version that's been scaled to
-    // match the resolution of the current frame, allowing the existing
-    // motion search code to be used without additional modifications.
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      backup_yv12[i] = xd->plane[i].pre[!ref_idx];
-    av1_setup_pre_planes(xd, !ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
-  }
-
-// Since we have scaled the reference frames to match the size of the current
-// frame we must use a unit scaling factor during mode selection.
-#if CONFIG_HIGHBITDEPTH
-  av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
-                                    cm->height, cm->use_highbitdepth);
-#else
-  av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
-                                    cm->height);
-#endif  // CONFIG_HIGHBITDEPTH
+  const int plane = 0;
+  struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx];
 
-  struct buf_2d ref_yv12;
+  struct scale_factors sf;
+  av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height,
+                                    cm->width, cm->height);
 
-  const int plane = 0;
-  ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane);
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+  ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane, xd->bd);
   WarpTypesAllowed warp_types;
-#if CONFIG_GLOBAL_MOTION
   warp_types.global_warp_allowed = is_global;
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
   warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
-#endif  // CONFIG_WARPED_MOTION
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
 
-  // Initialized here because of compiler problem in Visual Studio.
-  ref_yv12 = xd->plane[plane].pre[!ref_idx];
-
-// Get the prediction block from the 'other' reference frame.
-#if CONFIG_HIGHBITDEPTH
+  // Get the prediction block from the 'other' reference frame.
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     av1_highbd_build_inter_predictor(
         ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
-        0, mbmi->interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-        &warp_types, p_col, p_row,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-        plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
+        0, mbmi->interp_filters, &warp_types, p_col, p_row, plane,
+        MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd,
+        cm->allow_warped_motion);
   } else {
-#endif  // CONFIG_HIGHBITDEPTH
     av1_build_inter_predictor(
         ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
-        &conv_params, mbmi->interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-        &warp_types, p_col, p_row, plane, !ref_idx,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-        MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
-#if CONFIG_HIGHBITDEPTH
+        &conv_params, mbmi->interp_filters, &warp_types, p_col, p_row, plane,
+        !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd,
+        cm->allow_warped_motion);
   }
-#endif  // CONFIG_HIGHBITDEPTH
 
-  if (scaled_ref_frame) {
-    // Restore the prediction frame pointers to their unscaled versions.
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      xd->plane[i].pre[!ref_idx] = backup_yv12[i];
-  }
+  av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
+                             &xd->jcp_param.bck_offset,
+                             &xd->jcp_param.use_jnt_comp_avg, 1);
 }
 
 // Search for the best mv for one component of a compound,
@@ -7645,45 +6754,41 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
                                           const uint8_t *second_pred,
                                           const uint8_t *mask, int mask_stride,
                                           int *rate_mv, int ref_idx) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   const int pw = block_size_wide[bsize];
   const int ph = block_size_high[bsize];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_COMPOUND_SINGLEREF
-  const int ref =
-      has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0];
-#else
+  MB_MODE_INFO *mbmi = xd->mi[0];
   const int ref = mbmi->ref_frame[ref_idx];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  int_mv ref_mv = x->mbmi_ext->ref_mvs[ref][0];
+  const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
   struct macroblockd_plane *const pd = &xd->plane[0];
 
   struct buf_2d backup_yv12[MAX_MB_PLANE];
   const YV12_BUFFER_CONFIG *const scaled_ref_frame =
       av1_get_scaled_ref_frame(cpi, ref);
 
-// Check that this is either an interinter or an interintra block
-#if CONFIG_COMPOUND_SINGLEREF
-  assert(has_second_ref(mbmi) ||
-         // or a single ref comp pred mode
-         is_inter_singleref_comp_mode(mbmi->mode) ||
-         (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME));
-#else
-  assert(has_second_ref(mbmi) ||
-         (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME));
-#endif  // CONFIG_COMPOUND_SINGLEREF
+  // Check that this is either an interinter or an interintra block
+  assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi)));
+
+  // Store the first prediction buffer.
+  struct buf_2d orig_yv12;
+  if (ref_idx) {
+    orig_yv12 = pd->pre[0];
+    pd->pre[0] = pd->pre[ref_idx];
+  }
 
   if (scaled_ref_frame) {
     int i;
     // Swap out the reference frame for a version that's been scaled to
     // match the resolution of the current frame, allowing the existing
-    // motion search code to be used without additional modifications.
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      backup_yv12[i] = xd->plane[i].pre[ref_idx];
-    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
+    // full-pixel motion search code to be used without additional
+    // modifications.
+    for (i = 0; i < num_planes; i++) backup_yv12[i] = xd->plane[i].pre[ref_idx];
+    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+                         num_planes);
   }
 
-  struct buf_2d orig_yv12;
   int bestsme = INT_MAX;
   int sadpb = x->sadperbit16;
   MV *const best_mv = &x->best_mv.as_mv;
@@ -7691,12 +6796,6 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
   MvLimits tmp_mv_limits = x->mv_limits;
 
-  // Initialized here because of compiler problem in Visual Studio.
-  if (ref_idx) {
-    orig_yv12 = pd->pre[0];
-    pd->pre[0] = pd->pre[ref_idx];
-  }
-
   // Do compound motion search on the current reference frame.
   av1_set_mv_search_range(&x->mv_limits, &ref_mv.as_mv);
 
@@ -7706,12 +6805,9 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   best_mv->col >>= 3;
   best_mv->row >>= 3;
 
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi))
-    av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx);
-  else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
+  av1_set_mvcost(
+      x, ref_idx,
+      mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
 
   // Small-range full-pixel motion search.
   bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
@@ -7729,44 +6825,40 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
   x->mv_limits = tmp_mv_limits;
 
-#if CONFIG_AMVR
-  if (cpi->common.cur_frame_mv_precision_level) {
+  if (scaled_ref_frame) {
+    // Swap back the original buffers for subpel motion search.
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[ref_idx] = backup_yv12[i];
+    }
+  }
+
+  if (cpi->common.cur_frame_force_integer_mv) {
     x->best_mv.as_mv.row *= 8;
     x->best_mv.as_mv.col *= 8;
   }
-  if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) {
-#else
-  if (bestsme < INT_MAX) {
-#endif
+  const int use_fractional_mv =
+      bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0;
+  if (use_fractional_mv) {
     int dis; /* TODO: use dis in distortion calculation later. */
     unsigned int sse;
     bestsme = cpi->find_fractional_mv_step(
-        x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
+        x, cm, mi_row, mi_col, &ref_mv.as_mv,
+        cpi->common.allow_high_precision_mv, x->errorperbit,
         &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
         x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, mask_stride,
-        ref_idx, pw, ph, cpi->sf.use_upsampled_references);
+        ref_idx, pw, ph, cpi->sf.use_accurate_subpel_search);
   }
 
-  // Restore the pointer to the first (possibly scaled) prediction buffer.
+  // Restore the pointer to the first unscaled prediction buffer.
   if (ref_idx) pd->pre[0] = orig_yv12;
 
   if (bestsme < INT_MAX) *this_mv = *best_mv;
 
   *rate_mv = 0;
 
-  if (scaled_ref_frame) {
-    // Restore the prediction frame pointers to their unscaled versions.
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      xd->plane[i].pre[ref_idx] = backup_yv12[i];
-  }
-
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi))
-    av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx);
-  else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
+  av1_set_mvcost(
+      x, ref_idx,
+      mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
   *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost,
                               x->mvcost, MV_COST_WEIGHT);
 }
@@ -7774,51 +6866,23 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 // Wrapper for compound_single_motion_search, for the common case
 // where the second prediction is also an inter mode.
 static void compound_single_motion_search_interinter(
-    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-    int_mv *frame_comp_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
     int mi_row, int mi_col, const uint8_t *mask, int mask_stride, int *rate_mv,
     const int block, int ref_idx) {
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-
-// This function should only ever be called for compound modes
-#if CONFIG_COMPOUND_SINGLEREF
-  int is_singleref_comp_mode =
-      !has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode);
-  assert(has_second_ref(mbmi) || is_singleref_comp_mode);
-  if (is_singleref_comp_mode && ref_idx) assert(frame_comp_mv);
-#else   // !CONFIG_COMPOUND_SINGLEREF
-  assert(has_second_ref(mbmi));
-#endif  // CONFIG_COMPOUND_SINGLEREF
+  // This function should only ever be called for compound modes
+  assert(has_second_ref(xd->mi[0]));
 
-// Prediction buffer from second frame.
-#if CONFIG_HIGHBITDEPTH
+  // Prediction buffer from second frame.
   DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
   uint8_t *second_pred;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
   else
     second_pred = (uint8_t *)second_pred_alloc_16;
-#else
-  DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_COMPOUND_SINGLEREF
-  MV *this_mv = has_second_ref(mbmi)
-                    ? &frame_mv[mbmi->ref_frame[ref_idx]].as_mv
-                    : (ref_idx ? &frame_comp_mv[mbmi->ref_frame[0]].as_mv
-                               : &frame_mv[mbmi->ref_frame[0]].as_mv);
-  const MV *other_mv =
-      has_second_ref(mbmi)
-          ? &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv
-          : (ref_idx ? &frame_mv[mbmi->ref_frame[0]].as_mv
-                     : &frame_comp_mv[mbmi->ref_frame[0]].as_mv);
-#else   // !CONFIG_COMPOUND_SINGLEREF
-  MV *this_mv = &frame_mv[mbmi->ref_frame[ref_idx]].as_mv;
-  const MV *other_mv = &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv;
-#endif  // CONFIG_COMPOUND_SINGLEREF
+
+  MV *this_mv = &cur_mv[ref_idx].as_mv;
+  const MV *other_mv = &cur_mv[!ref_idx].as_mv;
 
   build_second_inter_pred(cpi, x, bsize, other_mv, mi_row, mi_col, block,
                           ref_idx, second_pred);
@@ -7828,58 +6892,33 @@ static void compound_single_motion_search_interinter(
                                 ref_idx);
 }
 
-#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
 static void do_masked_motion_search_indexed(
     const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
     int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) {
   // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   BLOCK_SIZE sb_type = mbmi->sb_type;
   const uint8_t *mask;
   const int mask_stride = block_size_wide[bsize];
 
   mask = av1_get_compound_type_mask(comp_data, sb_type);
 
-  int_mv frame_mv[TOTAL_REFS_PER_FRAME];
-#if CONFIG_COMPOUND_SINGLEREF
-  int_mv frame_comp_mv[TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  MV_REFERENCE_FRAME rf[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
-  assert(bsize >= BLOCK_8X8 || CONFIG_CB4X4);
-
-  frame_mv[rf[0]].as_int = cur_mv[0].as_int;
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi))
-    frame_comp_mv[rf[0]].as_int = cur_mv[1].as_int;
-  else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    frame_mv[rf[1]].as_int = cur_mv[1].as_int;
+  tmp_mv[0].as_int = cur_mv[0].as_int;
+  tmp_mv[1].as_int = cur_mv[1].as_int;
   if (which == 0 || which == 1) {
-    compound_single_motion_search_interinter(
-        cpi, x, bsize, frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-        has_second_ref(mbmi) ? NULL : frame_comp_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-        mi_row, mi_col, mask, mask_stride, rate_mv, 0, which);
+    compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mi_row,
+                                             mi_col, mask, mask_stride, rate_mv,
+                                             0, which);
   } else if (which == 2) {
-    joint_motion_search(cpi, x, bsize, frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-                        has_second_ref(mbmi) ? NULL : frame_comp_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-                        mi_row, mi_col, NULL, mask, mask_stride, rate_mv, 0);
-  }
-  tmp_mv[0].as_int = frame_mv[rf[0]].as_int;
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!has_second_ref(mbmi))
-    tmp_mv[1].as_int = frame_comp_mv[rf[0]].as_int;
-  else  // comp ref
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    tmp_mv[1].as_int = frame_mv[rf[1]].as_int;
-}
-#endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+    joint_motion_search(cpi, x, bsize, tmp_mv, mi_row, mi_col, NULL, mask,
+                        mask_stride, rate_mv, 0);
+  }
+}
 
+#define USE_DISCOUNT_NEWMV_TEST 0
+#if USE_DISCOUNT_NEWMV_TEST
 // In some situations we want to discount the apparent cost of a new motion
 // vector. Where there is a subtle motion field and especially where there is
 // low spatial complexity then it can be hard to cover the cost of a new motion
@@ -7887,17 +6926,42 @@ static void do_masked_motion_search_indexed(
 // However, once established that vector may be usable through the nearest and
 // near mv modes to reduce distortion in subsequent blocks and also improve
 // visual quality.
-static int discount_newmv_test(const AV1_COMP *const cpi, int this_mode,
-                               int_mv this_mv,
-                               int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME],
-                               int ref_frame) {
-  return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) &&
-          (this_mv.as_int != 0) &&
-          ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
-           (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
-          ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
-           (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
+#define NEW_MV_DISCOUNT_FACTOR 8
+static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
+                               int ref_mv_idx,
+                               const MV_REFERENCE_FRAME *ref_frame,
+                               const MB_MODE_INFO_EXT *mbmi_ext);
+static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x,
+                               int this_mode, int_mv this_mv) {
+  if (this_mode == NEWMV && this_mv.as_int != 0 &&
+      !cpi->rc.is_src_frame_alt_ref) {
+    // Only discount new_mv when nearst_mv and all near_mv are zero, and the
+    // new_mv is not equal to global_mv
+    const AV1_COMMON *const cm = &cpi->common;
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    const MB_MODE_INFO *const mbmi = xd->mi[0];
+    const MV_REFERENCE_FRAME tmp_ref_frames[2] = { mbmi->ref_frame[0],
+                                                   NONE_FRAME };
+    const uint8_t ref_frame_type = av1_ref_frame_type(tmp_ref_frames);
+    int_mv nearest_mv;
+    get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+    int ret = nearest_mv.as_int == 0;
+    for (int ref_mv_idx = 0;
+         ref_mv_idx < x->mbmi_ext->ref_mv_count[ref_frame_type]; ++ref_mv_idx) {
+      int_mv near_mv;
+      get_this_mv(&near_mv, NEARMV, 0, ref_mv_idx, tmp_ref_frames, x->mbmi_ext);
+      ret &= near_mv.as_int == 0;
+    }
+    if (cm->global_motion[tmp_ref_frames[0]].wmtype <= TRANSLATION) {
+      int_mv global_mv;
+      get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+      ret &= global_mv.as_int != this_mv.as_int;
+    }
+    return ret;
+  }
+  return 0;
 }
+#endif
 
 #define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
 #define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
@@ -7910,25 +6974,42 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
            xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
-#if CONFIG_WEDGE
 static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
                                const BLOCK_SIZE bsize, const uint8_t *pred0,
                                int stride0, const uint8_t *pred1, int stride1) {
+  static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
+    //                            4X4
+    BLOCK_INVALID,
+    // 4X8,        8X4,           8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+    // 8X16,       16X8,          16X16
+    BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+    // 16X32,      32X16,         32X32
+    BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+    // 32X64,      64X32,         64X64
+    BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+    // 64x128,     128x64,        128x128
+    BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+    // 4X16,       16X4,          8X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+    // 32X8,       16X64,         64X16
+    BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
+  };
   const struct macroblock_plane *const p = &x->plane[0];
   const uint8_t *src = p->src.buf;
   int src_stride = p->src.stride;
-  const int f_index = bsize - BLOCK_8X8;
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   uint32_t esq[2][4];
   int64_t tl, br;
 
-#if CONFIG_HIGHBITDEPTH
+  const BLOCK_SIZE f_index = split_qtr[bsize];
+  assert(f_index != BLOCK_INVALID);
+
   if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     pred0 = CONVERT_TO_BYTEPTR(pred0);
     pred1 = CONVERT_TO_BYTEPTR(pred1);
   }
-#endif  // CONFIG_HIGHBITDEPTH
 
   cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
   cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2, stride0,
@@ -7947,100 +7028,14 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
                           pred1 + bh / 2 * stride1 + bw / 2, stride0,
                           &esq[1][3]);
 
-  tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) -
-       (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]);
-  br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) -
-       (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]);
+  tl = ((int64_t)esq[0][0] + esq[0][1] + esq[0][2]) -
+       ((int64_t)esq[1][0] + esq[1][1] + esq[1][2]);
+  br = ((int64_t)esq[1][3] + esq[1][1] + esq[1][2]) -
+       ((int64_t)esq[0][3] + esq[0][1] + esq[0][2]);
   return (tl + br > 0);
 }
-#endif  // CONFIG_WEDGE
-
-#if !CONFIG_DUAL_FILTER
-static InterpFilter predict_interp_filter(
-    const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize,
-    const int mi_row, const int mi_col,
-    InterpFilter (*single_filter)[TOTAL_REFS_PER_FRAME]) {
-  InterpFilter best_filter = SWITCHABLE;
-  const AV1_COMMON *cm = &cpi->common;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  int bsl = mi_width_log2_lookup[bsize];
-  int pred_filter_search =
-      cpi->sf.cb_pred_filter_search
-          ? (((mi_row + mi_col) >> bsl) +
-             get_chessboard_index(cm->current_video_frame)) &
-                0x1
-          : 0;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const int is_comp_pred = has_second_ref(mbmi);
-  const int this_mode = mbmi->mode;
-  int refs[2] = { mbmi->ref_frame[0],
-                  (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
-  if (pred_filter_search) {
-    InterpFilter af = SWITCHABLE, lf = SWITCHABLE;
-    if (xd->up_available)
-      af = av1_extract_interp_filter(
-          xd->mi[-xd->mi_stride]->mbmi.interp_filters, 0);
-    if (xd->left_available)
-      lf = av1_extract_interp_filter(xd->mi[-1]->mbmi.interp_filters, 0);
-
-    if ((this_mode != NEWMV && this_mode != NEW_NEWMV) || (af == lf))
-      best_filter = af;
-  }
-  if (is_comp_pred) {
-    if (cpi->sf.adaptive_mode_search) {
-      switch (this_mode) {
-        case NEAREST_NEARESTMV:
-          if (single_filter[NEARESTMV][refs[0]] ==
-              single_filter[NEARESTMV][refs[1]])
-            best_filter = single_filter[NEARESTMV][refs[0]];
-          break;
-        case NEAR_NEARMV:
-          if (single_filter[NEARMV][refs[0]] == single_filter[NEARMV][refs[1]])
-            best_filter = single_filter[NEARMV][refs[0]];
-          break;
-        case ZERO_ZEROMV:
-          if (single_filter[ZEROMV][refs[0]] == single_filter[ZEROMV][refs[1]])
-            best_filter = single_filter[ZEROMV][refs[0]];
-          break;
-        case NEW_NEWMV:
-          if (single_filter[NEWMV][refs[0]] == single_filter[NEWMV][refs[1]])
-            best_filter = single_filter[NEWMV][refs[0]];
-          break;
-        case NEAREST_NEWMV:
-          if (single_filter[NEARESTMV][refs[0]] ==
-              single_filter[NEWMV][refs[1]])
-            best_filter = single_filter[NEARESTMV][refs[0]];
-          break;
-        case NEAR_NEWMV:
-          if (single_filter[NEARMV][refs[0]] == single_filter[NEWMV][refs[1]])
-            best_filter = single_filter[NEARMV][refs[0]];
-          break;
-        case NEW_NEARESTMV:
-          if (single_filter[NEWMV][refs[0]] ==
-              single_filter[NEARESTMV][refs[1]])
-            best_filter = single_filter[NEWMV][refs[0]];
-          break;
-        case NEW_NEARMV:
-          if (single_filter[NEWMV][refs[0]] == single_filter[NEARMV][refs[1]])
-            best_filter = single_filter[NEWMV][refs[0]];
-          break;
-        default:
-          if (single_filter[this_mode][refs[0]] ==
-              single_filter[this_mode][refs[1]])
-            best_filter = single_filter[this_mode][refs[0]];
-          break;
-      }
-    }
-  }
-  if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
-    best_filter = EIGHTTAP_REGULAR;
-  }
-  return best_filter;
-}
-#endif  // !CONFIG_DUAL_FILTER
 
 // Choose the best wedge index and sign
-#if CONFIG_WEDGE
 static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
                           const BLOCK_SIZE bsize, const uint8_t *const p0,
                           const uint8_t *const p1, int *const best_wedge_sign,
@@ -8058,12 +7053,8 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
   int wedge_types = (1 << get_wedge_bits_lookup(bsize));
   const uint8_t *mask;
   uint64_t sse;
-#if CONFIG_HIGHBITDEPTH
   const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-#else
-  const int bd_round = 0;
-#endif  // CONFIG_HIGHBITDEPTH
 
   DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
@@ -8072,7 +7063,6 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
 
   int64_t sign_limit;
 
-#if CONFIG_HIGHBITDEPTH
   if (hbd) {
     aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
                               CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
@@ -8080,9 +7070,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
                               CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
     aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
                               CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
-  } else  // NOLINT
-#endif    // CONFIG_HIGHBITDEPTH
-  {
+  } else {
     aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
     aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
     aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
@@ -8114,6 +7102,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
     sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+    rate += x->wedge_idx_cost[bsize][wedge_index];
     rd = RDCOST(x->rdmult, rate, dist);
 
     if (rd < best_rd) {
@@ -8123,7 +7112,8 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
     }
   }
 
-  return best_rd;
+  return best_rd -
+         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
 }
 
 // Choose the best wedge index the specified sign
@@ -8143,25 +7133,18 @@ static int64_t pick_wedge_fixed_sign(
   int wedge_types = (1 << get_wedge_bits_lookup(bsize));
   const uint8_t *mask;
   uint64_t sse;
-#if CONFIG_HIGHBITDEPTH
   const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-#else
-  const int bd_round = 0;
-#endif  // CONFIG_HIGHBITDEPTH
 
   DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
 
-#if CONFIG_HIGHBITDEPTH
   if (hbd) {
     aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
                               CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
     aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
                               CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
-  } else  // NOLINT
-#endif    // CONFIG_HIGHBITDEPTH
-  {
+  } else {
     aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
     aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
   }
@@ -8175,6 +7158,7 @@ static int64_t pick_wedge_fixed_sign(
     sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+    rate += x->wedge_idx_cost[bsize][wedge_index];
     rd = RDCOST(x->rdmult, rate, dist);
 
     if (rd < best_rd) {
@@ -8183,7 +7167,8 @@ static int64_t pick_wedge_fixed_sign(
     }
   }
 
-  return best_rd;
+  return best_rd -
+         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
 }
 
 static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
@@ -8192,7 +7177,7 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
                                      const uint8_t *const p0,
                                      const uint8_t *const p1) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const int bw = block_size_wide[bsize];
 
   int64_t rd;
@@ -8200,7 +7185,7 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
   int wedge_sign = 0;
 
   assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
-  assert(cpi->common.allow_masked_compound);
+  assert(cpi->common.seq_params.enable_masked_compound);
 
   if (cpi->sf.fast_wedge_sign_estimate) {
     wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
@@ -8209,19 +7194,17 @@ static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
     rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index);
   }
 
-  mbmi->wedge_sign = wedge_sign;
-  mbmi->wedge_index = wedge_index;
+  mbmi->interinter_comp.wedge_sign = wedge_sign;
+  mbmi->interinter_comp.wedge_index = wedge_index;
   return rd;
 }
-#endif  // CONFIG_WEDGE
 
-#if CONFIG_COMPOUND_SEGMENT
 static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
                                    MACROBLOCK *const x, const BLOCK_SIZE bsize,
                                    const uint8_t *const p0,
                                    const uint8_t *const p1) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const struct buf_2d *const src = &x->plane[0].src;
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
@@ -8230,20 +7213,15 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
   uint64_t sse;
   int64_t dist;
   int64_t rd0;
-  SEG_MASK_TYPE cur_mask_type;
+  DIFFWTD_MASK_TYPE cur_mask_type;
   int64_t best_rd = INT64_MAX;
-  SEG_MASK_TYPE best_mask_type = 0;
-#if CONFIG_HIGHBITDEPTH
+  DIFFWTD_MASK_TYPE best_mask_type = 0;
   const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
   const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
-#else
-  const int bd_round = 0;
-#endif  // CONFIG_HIGHBITDEPTH
   DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
 
-#if CONFIG_HIGHBITDEPTH
   if (hbd) {
     aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
                               CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
@@ -8251,26 +7229,22 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
                               CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
     aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
                               CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
-  } else  // NOLINT
-#endif    // CONFIG_HIGHBITDEPTH
-  {
+  } else {
     aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
     aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
     aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
   }
 
   // try each mask type and its inverse
-  for (cur_mask_type = 0; cur_mask_type < SEG_MASK_TYPES; cur_mask_type++) {
-// build mask and inverse
-#if CONFIG_HIGHBITDEPTH
+  for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
+    // build mask and inverse
     if (hbd)
-      build_compound_seg_mask_highbd(
+      av1_build_compound_diffwtd_mask_highbd(
           xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
-          CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd);
+          CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
     else
-#endif  // CONFIG_HIGHBITDEPTH
-      build_compound_seg_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, bw,
-                              bsize, bh, bw);
+      av1_build_compound_diffwtd_mask(xd->seg_mask, cur_mask_type, p0, bw, p1,
+                                      bw, bh, bw);
 
     // compute rd for mask
     sse = av1_wedge_sse_from_residuals(r1, d10, xd->seg_mask, N);
@@ -8286,35 +7260,31 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
   }
 
   // make final mask
-  mbmi->mask_type = best_mask_type;
-#if CONFIG_HIGHBITDEPTH
+  mbmi->interinter_comp.mask_type = best_mask_type;
   if (hbd)
-    build_compound_seg_mask_highbd(
-        xd->seg_mask, mbmi->mask_type, CONVERT_TO_BYTEPTR(p0), bw,
-        CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd);
+    av1_build_compound_diffwtd_mask_highbd(
+        xd->seg_mask, mbmi->interinter_comp.mask_type, CONVERT_TO_BYTEPTR(p0),
+        bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
   else
-#endif  // CONFIG_HIGHBITDEPTH
-    build_compound_seg_mask(xd->seg_mask, mbmi->mask_type, p0, bw, p1, bw,
-                            bsize, bh, bw);
+    av1_build_compound_diffwtd_mask(
+        xd->seg_mask, mbmi->interinter_comp.mask_type, p0, bw, p1, bw, bh, bw);
 
   return best_rd;
 }
-#endif  // CONFIG_COMPOUND_SEGMENT
 
-#if CONFIG_WEDGE && CONFIG_INTERINTRA
 static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
                                      const MACROBLOCK *const x,
                                      const BLOCK_SIZE bsize,
                                      const uint8_t *const p0,
                                      const uint8_t *const p1) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
 
   int64_t rd;
   int wedge_index = -1;
 
   assert(is_interintra_wedge_used(bsize));
-  assert(cpi->common.allow_interintra_compound);
+  assert(cpi->common.seq_params.enable_interintra_compound);
 
   rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index);
 
@@ -8322,22 +7292,15 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
   mbmi->interintra_wedge_index = wedge_index;
   return rd;
 }
-#endif  // CONFIG_WEDGE && CONFIG_INTERINTRA
 
-#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
 static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
                                     const BLOCK_SIZE bsize,
                                     const uint8_t *const p0,
                                     const uint8_t *const p1) {
-  const COMPOUND_TYPE compound_type =
-      x->e_mbd.mi[0]->mbmi.interinter_compound_type;
+  const COMPOUND_TYPE compound_type = x->e_mbd.mi[0]->interinter_comp.type;
   switch (compound_type) {
-#if CONFIG_WEDGE
     case COMPOUND_WEDGE: return pick_interinter_wedge(cpi, x, bsize, p0, p1);
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-    case COMPOUND_SEG: return pick_interinter_seg(cpi, x, bsize, p0, p1);
-#endif  // CONFIG_COMPOUND_SEGMENT
+    case COMPOUND_DIFFWTD: return pick_interinter_seg(cpi, x, bsize, p0, p1);
     default: assert(0); return 0;
   }
 }
@@ -8346,46 +7309,23 @@ static int interinter_compound_motion_search(
     const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
     const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   int_mv tmp_mv[2];
   int tmp_rate_mv = 0;
-  const INTERINTER_COMPOUND_DATA compound_data = {
-#if CONFIG_WEDGE
-    mbmi->wedge_index,
-    mbmi->wedge_sign,
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-    mbmi->mask_type,
-    xd->seg_mask,
-#endif  // CONFIG_COMPOUND_SEGMENT
-    mbmi->interinter_compound_type
-  };
-#if CONFIG_COMPOUND_SINGLEREF
-  // NOTE: Mode is needed to identify the compound mode prediction, regardless
-  //       of comp refs or single ref.
-  mbmi->mode = this_mode;
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-  if (this_mode == NEW_NEWMV
-#if CONFIG_COMPOUND_SINGLEREF
-      || this_mode == SR_NEW_NEWMV
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      ) {
-    do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize,
+  mbmi->interinter_comp.seg_mask = xd->seg_mask;
+  const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp;
+
+  if (this_mode == NEW_NEWMV) {
+    do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
                                     mi_row, mi_col, tmp_mv, &tmp_rate_mv, 2);
     mbmi->mv[0].as_int = tmp_mv[0].as_int;
     mbmi->mv[1].as_int = tmp_mv[1].as_int;
   } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
-    do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize,
+    do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
                                     mi_row, mi_col, tmp_mv, &tmp_rate_mv, 0);
     mbmi->mv[0].as_int = tmp_mv[0].as_int;
-  } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV
-#if CONFIG_COMPOUND_SINGLEREF
-             // || this_mode == SR_NEAREST_NEWMV
-             || this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV
-#endif  // CONFIG_COMPOUND_SINGLEREF
-             ) {
-    do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize,
+  } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+    do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
                                     mi_row, mi_col, tmp_mv, &tmp_rate_mv, 1);
     mbmi->mv[1].as_int = tmp_mv[1].as_int;
   }
@@ -8394,22 +7334,23 @@ static int interinter_compound_motion_search(
 
 static int64_t build_and_cost_compound_type(
     const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
-    const BLOCK_SIZE bsize, const int this_mode, int rs2, int rate_mv,
+    const BLOCK_SIZE bsize, const int this_mode, int *rs2, int rate_mv,
     BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1,
     int *strides, int mi_row, int mi_col) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   int rate_sum;
   int64_t dist_sum;
   int64_t best_rd_cur = INT64_MAX;
   int64_t rd = INT64_MAX;
   int tmp_skip_txfm_sb;
   int64_t tmp_skip_sse_sb;
-  const COMPOUND_TYPE compound_type = mbmi->interinter_compound_type;
+  const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
 
   best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1);
-  best_rd_cur += RDCOST(x->rdmult, rs2 + rate_mv, 0);
+  *rs2 += get_interinter_compound_mask_rate(x, mbmi);
+  best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
 
   if (have_newmv_in_inter_mode(this_mode) &&
       use_masked_motion_search(compound_type)) {
@@ -8417,80 +7358,74 @@ static int64_t build_and_cost_compound_type(
                                                      this_mode, mi_row, mi_col);
     av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
     model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
-                    &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
-    rd = RDCOST(x->rdmult, rs2 + *out_rate_mv + rate_sum, dist_sum);
+                    &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+    rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
     if (rd >= best_rd_cur) {
       mbmi->mv[0].as_int = cur_mv[0].as_int;
       mbmi->mv[1].as_int = cur_mv[1].as_int;
       *out_rate_mv = rate_mv;
-      av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0,
-#if CONFIG_SUPERTX
-                                               0, 0,
-#endif  // CONFIG_SUPERTX
-                                               preds0, strides, preds1,
-                                               strides);
+      av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+                                               preds1, strides);
     }
     av1_subtract_plane(x, bsize, 0);
     rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
                              &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
     if (rd != INT64_MAX)
-      rd = RDCOST(x->rdmult, rs2 + *out_rate_mv + rate_sum, dist_sum);
+      rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
     best_rd_cur = rd;
 
   } else {
-    av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0,
-#if CONFIG_SUPERTX
-                                             0, 0,
-#endif  // CONFIG_SUPERTX
-                                             preds0, strides, preds1, strides);
+    av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+                                             preds1, strides);
     av1_subtract_plane(x, bsize, 0);
     rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
                              &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
     if (rd != INT64_MAX)
-      rd = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum);
+      rd = RDCOST(x->rdmult, *rs2 + rate_mv + rate_sum, dist_sum);
     best_rd_cur = rd;
   }
   return best_rd_cur;
 }
-#endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
 
 typedef struct {
-#if CONFIG_MOTION_VAR
-  // Inter prediction buffers and respective strides
+  // OBMC secondary prediction buffers and respective strides
   uint8_t *above_pred_buf[MAX_MB_PLANE];
   int above_pred_stride[MAX_MB_PLANE];
   uint8_t *left_pred_buf[MAX_MB_PLANE];
   int left_pred_stride[MAX_MB_PLANE];
-#endif  // CONFIG_MOTION_VAR
   int_mv *single_newmv;
   // Pointer to array of motion vectors to use for each ref and their rates
   // Should point to first of 2 arrays in 2D array
   int *single_newmv_rate;
+  int *single_newmv_valid;
   // Pointer to array of predicted rate-distortion
   // Should point to first of 2 arrays in 2D array
-  int64_t (*modelled_rd)[TOTAL_REFS_PER_FRAME];
-  InterpFilter single_filter[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+  int64_t (*modelled_rd)[REF_FRAMES];
+  InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES];
+  int ref_frame_cost;
+  int single_comp_cost;
 } HandleInterModeArgs;
 
+static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv,
+                                     const AV1_COMMON *cm,
+                                     const MACROBLOCK *x) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  *out_mv = in_mv;
+  lower_mv_precision(&out_mv->as_mv, cm->allow_high_precision_mv,
+                     cm->cur_frame_force_integer_mv);
+  clamp_mv2(&out_mv->as_mv, xd);
+  return !mv_check_bounds(&x->mv_limits, &out_mv->as_mv);
+}
+
 static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                            const BLOCK_SIZE bsize,
-                            int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME],
-#if CONFIG_COMPOUND_SINGLEREF
-                            int_mv (*const mode_comp_mv)[TOTAL_REFS_PER_FRAME],
-#endif  // CONFIG_COMPOUND_SINGLEREF
+                            const BLOCK_SIZE bsize, int_mv *cur_mv,
                             const int mi_row, const int mi_col,
-                            int *const rate_mv, int_mv *const single_newmv,
+                            int *const rate_mv,
                             HandleInterModeArgs *const args) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
-  const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
-  int_mv *const frame_mv = mode_mv[this_mode];
-#if CONFIG_COMPOUND_SINGLEREF
-  int_mv *const frame_comp_mv = mode_comp_mv[this_mode];
-#endif  // CONFIG_COMPOUND_SINGLEREF
   const int refs[2] = { mbmi->ref_frame[0],
                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
   int i;
@@ -8498,392 +7433,338 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
   (void)args;
 
   if (is_comp_pred) {
-    for (i = 0; i < 2; ++i) {
-      single_newmv[refs[i]].as_int = args->single_newmv[refs[i]].as_int;
-    }
-
     if (this_mode == NEW_NEWMV) {
-      frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
-      frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+      cur_mv[0].as_int = args->single_newmv[refs[0]].as_int;
+      cur_mv[1].as_int = args->single_newmv[refs[1]].as_int;
 
       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
-        joint_motion_search(cpi, x, bsize, frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-                            NULL,  // int_mv *frame_comp_mv
-#endif                             // CONFIG_COMPOUND_SINGLEREF
-                            mi_row, mi_col, NULL, NULL, 0, rate_mv, 0);
+        joint_motion_search(cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, NULL,
+                            0, rate_mv, 0);
       } else {
         *rate_mv = 0;
         for (i = 0; i < 2; ++i) {
-          av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx);
-          *rate_mv += av1_mv_bit_cost(
-              &frame_mv[refs[i]].as_mv, &mbmi_ext->ref_mvs[refs[i]][0].as_mv,
-              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+          const int_mv ref_mv = av1_get_ref_mv(x, i);
+          av1_set_mvcost(x, i, mbmi->ref_mv_idx);
+          *rate_mv +=
+              av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, x->nmvjointcost,
+                              x->mvcost, MV_COST_WEIGHT);
         }
       }
     } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
-      frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+      cur_mv[1].as_int = args->single_newmv[refs[1]].as_int;
       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
-        frame_mv[refs[0]].as_int =
-            mode_mv[compound_ref0_mode(this_mode)][refs[0]].as_int;
-        compound_single_motion_search_interinter(cpi, x, bsize, frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-                                                 NULL,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-                                                 mi_row, mi_col, NULL, 0,
-                                                 rate_mv, 0, 1);
+        compound_single_motion_search_interinter(
+            cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1);
       } else {
-        av1_set_mvcost(x, refs[1], 1, mbmi->ref_mv_idx);
-        *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
-                                   &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+        av1_set_mvcost(x, 1,
+                       mbmi->ref_mv_idx + (this_mode == NEAR_NEWMV ? 1 : 0));
+        const int_mv ref_mv = av1_get_ref_mv(x, 1);
+        *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
     } else {
       assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
-      frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+      cur_mv[0].as_int = args->single_newmv[refs[0]].as_int;
       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
-        frame_mv[refs[1]].as_int =
-            mode_mv[compound_ref1_mode(this_mode)][refs[1]].as_int;
-        compound_single_motion_search_interinter(cpi, x, bsize, frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-                                                 NULL,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-                                                 mi_row, mi_col, NULL, 0,
-                                                 rate_mv, 0, 0);
+        compound_single_motion_search_interinter(
+            cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0);
       } else {
-        av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
-        *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                                   &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+        const int_mv ref_mv = av1_get_ref_mv(x, 0);
+        av1_set_mvcost(x, 0,
+                       mbmi->ref_mv_idx + (this_mode == NEW_NEARMV ? 1 : 0));
+        *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
     }
-#if CONFIG_COMPOUND_SINGLEREF
-  } else if (is_inter_singleref_comp_mode(this_mode)) {
-    // Single ref comp mode
-    const int mode0 = compound_ref0_mode(this_mode);
-
-    single_newmv[refs[0]].as_int = args->single_newmv[refs[0]].as_int;
-    frame_mv[refs[0]].as_int = (mode0 == NEWMV)
-                                   ? single_newmv[refs[0]].as_int
-                                   : mode_mv[mode0][refs[0]].as_int;
-    assert(compound_ref1_mode(this_mode) == NEWMV);
-    frame_comp_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
-
-    if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
-      if (this_mode == SR_NEW_NEWMV) {
-        joint_motion_search(cpi, x, bsize, frame_mv, frame_comp_mv, mi_row,
-                            mi_col, NULL, NULL, 0, rate_mv, 0);
-      } else {
-        assert(  // this_mode == SR_NEAREST_NEWMV ||
-            this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV);
-        compound_single_motion_search_interinter(cpi, x, bsize, frame_mv,
-                                                 frame_comp_mv, mi_row, mi_col,
-                                                 NULL, 0, rate_mv, 0, 1);
-      }
-    } else {
-      *rate_mv = 0;
-      av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
-      if (mode0 == NEWMV)
-        *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                                    &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
-                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-      *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv,
-                                  &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
-                                  x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-    }
-#endif  // CONFIG_COMPOUND_SINGLEREF
   } else {
-    if (is_comp_interintra_pred) {
-      x->best_mv = args->single_newmv[refs[0]];
-      *rate_mv = args->single_newmv_rate[refs[0]];
-    } else {
-      single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv);
-      args->single_newmv[refs[0]] = x->best_mv;
-      args->single_newmv_rate[refs[0]] = *rate_mv;
-    }
-
+    single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv);
     if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
 
-    frame_mv[refs[0]] = x->best_mv;
-    xd->mi[0]->bmi[0].as_mv[0] = x->best_mv;
+    args->single_newmv[refs[0]] = x->best_mv;
+    args->single_newmv_rate[refs[0]] = *rate_mv;
+    args->single_newmv_valid[refs[0]] = 1;
+
+    cur_mv[0].as_int = x->best_mv.as_int;
 
+#if USE_DISCOUNT_NEWMV_TEST
     // Estimate the rate implications of a new mv but discount this
     // under certain circumstances where we want to help initiate a weak
     // motion field, where the distortion gain for a single block may not
     // be enough to overcome the cost of a new mv.
-    if (discount_newmv_test(cpi, this_mode, x->best_mv, mode_mv, refs[0])) {
+    if (discount_newmv_test(cpi, x, this_mode, x->best_mv)) {
       *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1);
     }
+#endif
   }
 
   return 0;
 }
 
-int64_t interpolation_filter_search(
+static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2],
+                                int num_planes) {
+  const BUFFER_SET *buf0 = dst_bufs[0];
+  dst_bufs[0] = dst_bufs[1];
+  dst_bufs[1] = buf0;
+  restore_dst_buf(xd, *dst_bufs[0], num_planes);
+}
+
+// calculate the rdcost of given interpolation_filter
+static INLINE int64_t interpolation_filter_rd(
+    MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+    int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+    int *const switchable_rate, int *const skip_txfm_sb,
+    int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  int tmp_rate, tmp_skip_sb = 0;
+  int64_t tmp_dist, tmp_skip_sse = INT64_MAX;
+
+  const InterpFilters last_best = mbmi->interp_filters;
+  mbmi->interp_filters = filter_sets[filter_idx];
+  const int tmp_rs = av1_get_switchable_rate(cm, x, xd);
+  av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+  model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist,
+                  &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL);
+  int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+  if (tmp_rd < *rd) {
+    *rd = tmp_rd;
+    *switchable_rate = tmp_rs;
+    *skip_txfm_sb = tmp_skip_sb;
+    *skip_sse_sb = tmp_skip_sse;
+    swap_dst_buf(xd, dst_bufs, num_planes);
+    return 1;
+  }
+  mbmi->interp_filters = last_best;
+  return 0;
+}
+
+// check if there is saved result match with this search
+static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
+                                         MB_MODE_INFO *const mi) {
+  for (int i = 0; i < 2; ++i) {
+    if ((st->ref_frames[i] != mi->ref_frame[i]) ||
+        (st->mv[i].as_int != mi->mv[i].as_int)) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+static INLINE int find_interp_filter_in_stats(MACROBLOCK *x,
+                                              MB_MODE_INFO *const mbmi) {
+  const int comp_idx = mbmi->compound_idx;
+  const int offset = x->interp_filter_stats_idx[comp_idx];
+  for (int j = 0; j < offset; ++j) {
+    const INTERPOLATION_FILTER_STATS *st = &x->interp_filter_stats[comp_idx][j];
+    if (is_interp_filter_match(st, mbmi)) {
+      mbmi->interp_filters = st->filters;
+      return j;
+    }
+  }
+  return -1;  // no match result found
+}
+
+static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
+                                                  MB_MODE_INFO *const mbmi) {
+  const int comp_idx = mbmi->compound_idx;
+  const int offset = x->interp_filter_stats_idx[comp_idx];
+  if (offset < MAX_INTERP_FILTER_STATS) {
+    INTERPOLATION_FILTER_STATS stat = {
+      mbmi->interp_filters,
+      { mbmi->mv[0], mbmi->mv[1] },
+      { mbmi->ref_frame[0], mbmi->ref_frame[1] },
+    };
+    x->interp_filter_stats[comp_idx][offset] = stat;
+    x->interp_filter_stats_idx[comp_idx]++;
+  }
+}
+
+static int64_t interpolation_filter_search(
     MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
     int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
-    BUFFER_SET *const orig_dst,
-    InterpFilter (*const single_filter)[TOTAL_REFS_PER_FRAME],
+    BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES],
     int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
     int64_t *const skip_sse_sb) {
   const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  int i;
-  int tmp_rate;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int need_search =
+      av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
+  int i, tmp_rate;
   int64_t tmp_dist;
 
   (void)single_filter;
-
-  InterpFilter assign_filter = SWITCHABLE;
-
-  if (cm->interp_filter == SWITCHABLE) {
-#if !CONFIG_DUAL_FILTER
-    assign_filter = av1_is_interp_needed(xd)
-                        ? predict_interp_filter(cpi, x, bsize, mi_row, mi_col,
-                                                single_filter)
-                        : cm->interp_filter;
-#endif  // !CONFIG_DUAL_FILTER
-  } else {
-    assign_filter = cm->interp_filter;
+  int match_found = -1;
+  const InterpFilter assign_filter = cm->interp_filter;
+  if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
+    match_found = find_interp_filter_in_stats(x, mbmi);
+  }
+  if (!need_search || match_found == -1) {
+    set_default_interp_filters(mbmi, assign_filter);
   }
-
-  set_default_interp_filters(mbmi, assign_filter);
-
   *switchable_rate = av1_get_switchable_rate(cm, x, xd);
   av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
-  model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist,
-                  skip_txfm_sb, skip_sse_sb);
+  model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist,
+                  skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL);
   *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist);
 
-  if (assign_filter == SWITCHABLE) {
-    // do interp_filter search
-    if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd)) {
-#if CONFIG_DUAL_FILTER
-      const int filter_set_size = DUAL_FILTER_SET_SIZE;
-#else
-      const int filter_set_size = SWITCHABLE_FILTERS;
-#endif  // CONFIG_DUAL_FILTER
-      int best_in_temp = 0;
-      InterpFilters best_filters = mbmi->interp_filters;
-      restore_dst_buf(xd, *tmp_dst);
-      // EIGHTTAP_REGULAR mode is calculated beforehand
-      for (i = 1; i < filter_set_size; ++i) {
-        int tmp_skip_sb = 0;
-        int64_t tmp_skip_sse = INT64_MAX;
-        int tmp_rs;
-        int64_t tmp_rd;
-#if CONFIG_DUAL_FILTER
-        mbmi->interp_filters =
-            av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]);
-#else
-        mbmi->interp_filters = av1_broadcast_interp_filter((InterpFilter)i);
-#endif  // CONFIG_DUAL_FILTER
-        tmp_rs = av1_get_switchable_rate(cm, x, xd);
-        av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
-        model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
-                        &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
-        tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
-
-        if (tmp_rd < *rd) {
-          *rd = tmp_rd;
-          *switchable_rate = av1_get_switchable_rate(cm, x, xd);
-          best_filters = mbmi->interp_filters;
-          *skip_txfm_sb = tmp_skip_sb;
-          *skip_sse_sb = tmp_skip_sse;
-          best_in_temp = !best_in_temp;
-          if (best_in_temp) {
-            restore_dst_buf(xd, *orig_dst);
-          } else {
-            restore_dst_buf(xd, *tmp_dst);
-          }
-        }
-      }
-      if (best_in_temp) {
-        restore_dst_buf(xd, *tmp_dst);
-      } else {
-        restore_dst_buf(xd, *orig_dst);
+  if (assign_filter != SWITCHABLE || match_found != -1) {
+    return 0;
+  }
+  if (!need_search) {
+    assert(mbmi->interp_filters ==
+           av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
+    return 0;
+  }
+  // do interp_filter search
+  const int filter_set_size = DUAL_FILTER_SET_SIZE;
+  restore_dst_buf(xd, *tmp_dst, num_planes);
+  const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
+  if (cpi->sf.use_fast_interpolation_filter_search &&
+      cm->seq_params.enable_dual_filter) {
+    // default to (R,R): EIGHTTAP_REGULARxEIGHTTAP_REGULAR
+    int best_dual_mode = 0;
+    // Find best of {R}x{R,Sm,Sh}
+    // EIGHTTAP_REGULAR mode is calculated beforehand
+    for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
+      if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+                                  switchable_rate, skip_txfm_sb, skip_sse_sb,
+                                  dst_bufs, i)) {
+        best_dual_mode = i;
+      }
+    }
+    // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
+    for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
+         i += SWITCHABLE_FILTERS) {
+      interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+                              switchable_rate, skip_txfm_sb, skip_sse_sb,
+                              dst_bufs, i);
+    }
+  } else {
+    // EIGHTTAP_REGULAR mode is calculated beforehand
+    for (i = 1; i < filter_set_size; ++i) {
+      if (cm->seq_params.enable_dual_filter == 0) {
+        const int16_t filter_y = filter_sets[i] & 0xffff;
+        const int16_t filter_x = filter_sets[i] >> 16;
+        if (filter_x != filter_y) continue;
       }
-      mbmi->interp_filters = best_filters;
-    } else {
-      assert(mbmi->interp_filters ==
-             av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
+      interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+                              switchable_rate, skip_txfm_sb, skip_sse_sb,
+                              dst_bufs, i);
     }
   }
-
+  swap_dst_buf(xd, dst_bufs, num_planes);
+  // save search results
+  if (cpi->sf.skip_repeat_interpolation_filter_search) {
+    assert(match_found == -1);
+    save_interp_filter_search_stat(x, mbmi);
+  }
   return 0;
 }
 
-#if CONFIG_DUAL_FILTER
-static InterpFilters condition_interp_filters_on_mv(
-    InterpFilters interp_filters, const MACROBLOCKD *xd) {
-  InterpFilter filters[2];
-  for (int i = 0; i < 2; ++i)
-    filters[i] = (has_subpel_mv_component(xd->mi[0], xd, i))
-                     ? av1_extract_interp_filter(interp_filters, i)
-                     : EIGHTTAP_REGULAR;
-
-  return av1_make_interp_filters(filters[0], filters[1]);
-}
-#endif
-
 // TODO(afergs): Refactor the MBMI references in here - there's four
 // TODO(afergs): Refactor optional args - add them to a struct or remove
-static int64_t motion_mode_rd(
-    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
-    RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
-    int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row,
-    int mi_col, HandleInterModeArgs *const args, const int64_t ref_best_rd,
-    const int *refs, int rate_mv,
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-    // only used when WARPED_MOTION is on?
-    int_mv *const single_newmv, int rate2_bmc_nocoeff,
-    MB_MODE_INFO *best_bmc_mbmi, int rate_mv_bmc,
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-    int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) {
+static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                              BLOCK_SIZE bsize, RD_STATS *rd_stats,
+                              RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+                              int *disable_skip, int mi_row, int mi_col,
+                              HandleInterModeArgs *const args,
+                              int64_t ref_best_rd, const int *refs, int rate_mv,
+                              BUFFER_SET *orig_dst
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+                              ,
+                              int64_t *best_est_rd
+#endif
+) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mi[0];
-  MB_MODE_INFO *mbmi = &mi->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
-
-  (void)mode_mv;
-  (void)mi_row;
-  (void)mi_col;
-  (void)args;
-  (void)refs;
-  (void)rate_mv;
-  (void)is_comp_pred;
-  (void)this_mode;
-#if !CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
-  (void)single_newmv;
-#endif
-
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  MOTION_MODE motion_mode, last_motion_mode_allowed;
   int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0;
   RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
   MB_MODE_INFO base_mbmi, best_mbmi;
-#if CONFIG_VAR_TX
-  uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
-#endif  // CONFIG_VAR_TX
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
-#if CONFIG_WARPED_MOTION
-#if WARPED_MOTION_SORT_SAMPLES
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  int interintra_allowed = cm->seq_params.enable_interintra_compound &&
+                           is_interintra_allowed(mbmi) && mbmi->compound_idx;
   int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
-  int pts_mv0[SAMPLES_ARRAY_SIZE];
   int total_samples;
-#else
-  int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
-#endif  // WARPED_MOTION_SORT_SAMPLES
-#endif  // CONFIG_WARPED_MOTION
 
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+  (void)rate_mv;
+
   av1_invalid_rd_stats(&best_rd_stats);
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
-  if (cm->interp_filter == SWITCHABLE) rd_stats->rate += rs;
-#if CONFIG_WARPED_MOTION
   aom_clear_system_state();
-#if WARPED_MOTION_SORT_SAMPLES
-  mbmi->num_proj_ref[0] =
-      findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0, pts_mv0);
+  mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
   total_samples = mbmi->num_proj_ref[0];
-#else
-  mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
-#endif  // WARPED_MOTION_SORT_SAMPLES
-  best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
-#endif  // CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
   rate2_nocoeff = rd_stats->rate;
-  last_motion_mode_allowed = motion_mode_allowed(
-#if CONFIG_GLOBAL_MOTION
-      0, xd->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-      xd,
-#endif
-      mi);
   base_mbmi = *mbmi;
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+  MOTION_MODE last_motion_mode_allowed =
+      cm->switchable_motion_mode
+          ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+                                cm->allow_warped_motion)
+          : SIMPLE_TRANSLATION;
+  assert(mbmi->ref_frame[1] != INTRA_FRAME);
+  const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
 
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
   int64_t best_rd = INT64_MAX;
-  for (motion_mode = SIMPLE_TRANSLATION;
-       motion_mode <= last_motion_mode_allowed; motion_mode++) {
+
+  for (int mode_index = (int)SIMPLE_TRANSLATION;
+       mode_index <= (int)last_motion_mode_allowed + interintra_allowed;
+       mode_index++) {
     int64_t tmp_rd = INT64_MAX;
-    int tmp_rate;
-    int64_t tmp_dist;
-    int tmp_rate2 =
-        motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff : rate2_nocoeff;
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-    // We cannot estimate the rd cost for the motion mode NCOBMC_ADAPT_WEIGHT
-    // right now since it requires mvs from all neighboring blocks. We will
-    // check if this mode is beneficial after all the mv's in the current
-    // superblock are selected.
-    if (motion_mode == NCOBMC_ADAPT_WEIGHT) continue;
-#endif
+    int tmp_rate2 = rate2_nocoeff;
+    int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
+    int skip_txfm_sb = 0;
 
     *mbmi = base_mbmi;
-    mbmi->motion_mode = motion_mode;
-#if CONFIG_MOTION_VAR
-    if (mbmi->motion_mode == OBMC_CAUSAL) {
-      *mbmi = *best_bmc_mbmi;
+    if (is_interintra_mode) {
+      mbmi->motion_mode = SIMPLE_TRANSLATION;
+    } else {
+      mbmi->motion_mode = (MOTION_MODE)mode_index;
+      assert(mbmi->ref_frame[1] != INTRA_FRAME);
+    }
+
+    if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) {
+      // SIMPLE_TRANSLATION mode: no need to recalculate.
+      // The prediction is calculated before motion_mode_rd() is called in
+      // handle_inter_mode()
+    } else if (mbmi->motion_mode == OBMC_CAUSAL) {
       mbmi->motion_mode = OBMC_CAUSAL;
-      if (!is_comp_pred &&
-#if CONFIG_COMPOUND_SINGLEREF
-          !is_inter_singleref_comp_mode(this_mode) &&
-#endif  // CONFIG_COMPOUND_SINGLEREF
-          have_newmv_in_inter_mode(this_mode)) {
+      if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
         int tmp_rate_mv = 0;
 
         single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv);
         mbmi->mv[0].as_int = x->best_mv.as_int;
-        if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv,
-                                refs[0])) {
+#if USE_DISCOUNT_NEWMV_TEST
+        if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
           tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
         }
-        tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
-#if CONFIG_DUAL_FILTER
-        mbmi->interp_filters =
-            condition_interp_filters_on_mv(mbmi->interp_filters, xd);
-#endif  // CONFIG_DUAL_FILTER
-        av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
-      } else {
-        av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+#endif
+        tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
       }
+      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
       av1_build_obmc_inter_prediction(
           cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
           args->left_pred_buf, args->left_pred_stride);
-      model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
-                      &tmp_dist, skip_txfm_sb, skip_sse_sb);
-    }
-#endif  // CONFIG_MOTION_VAR
-
-#if CONFIG_WARPED_MOTION
-    if (mbmi->motion_mode == WARPED_CAUSAL) {
-#if WARPED_MOTION_SORT_SAMPLES
+    } else if (mbmi->motion_mode == WARPED_CAUSAL) {
       int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
-#endif  // WARPED_MOTION_SORT_SAMPLES
-      *mbmi = *best_bmc_mbmi;
       mbmi->motion_mode = WARPED_CAUSAL;
       mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
       mbmi->interp_filters = av1_broadcast_interp_filter(
           av1_unswitchable_filter(cm->interp_filter));
 
-#if WARPED_MOTION_SORT_SAMPLES
       memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
       memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
-      // Rank the samples by motion vector difference
+      // Select the samples according to motion vector difference
       if (mbmi->num_proj_ref[0] > 1) {
-        mbmi->num_proj_ref[0] = sortSamples(pts_mv0, &mbmi->mv[0].as_mv, pts,
-                                            pts_inref, mbmi->num_proj_ref[0]);
-        best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
+        mbmi->num_proj_ref[0] = selectSamples(
+            &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref[0], bsize);
       }
-#endif  // WARPED_MOTION_SORT_SAMPLES
 
       if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
                            mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
@@ -8892,144 +7773,299 @@ static int64_t motion_mode_rd(
         if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
           int tmp_rate_mv = 0;
           const int_mv mv0 = mbmi->mv[0];
-          WarpedMotionParams wm_params0 = mbmi->wm_params[0];
-#if WARPED_MOTION_SORT_SAMPLES
+          const WarpedMotionParams wm_params0 = mbmi->wm_params[0];
           int num_proj_ref0 = mbmi->num_proj_ref[0];
 
           // Refine MV in a small range.
           av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0,
-                               pts_mv0, total_samples);
-#else
-          // Refine MV in a small range.
-          av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts, pts_inref);
-#endif  // WARPED_MOTION_SORT_SAMPLES
+                               total_samples);
 
           // Keep the refined MV and WM parameters.
           if (mv0.as_int != mbmi->mv[0].as_int) {
             const int ref = refs[0];
-            const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
-
+            const int_mv ref_mv = av1_get_ref_mv(x, 0);
             tmp_rate_mv =
-                av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv, x->nmvjointcost,
-                                x->mvcost, MV_COST_WEIGHT);
+                av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv.as_mv,
+                                x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
             if (cpi->sf.adaptive_motion_search)
               x->pred_mv[ref] = mbmi->mv[0].as_mv;
 
-            single_newmv[ref] = mbmi->mv[0];
-
-            if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv,
-                                    refs[0])) {
+#if USE_DISCOUNT_NEWMV_TEST
+            if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
               tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
             }
-#if WARPED_MOTION_SORT_SAMPLES
-            best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
-#endif  // WARPED_MOTION_SORT_SAMPLES
-            tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
-#if CONFIG_DUAL_FILTER
-            mbmi->interp_filters =
-                condition_interp_filters_on_mv(mbmi->interp_filters, xd);
-#endif  // CONFIG_DUAL_FILTER
+#endif
+            tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
           } else {
             // Restore the old MV and WM parameters.
             mbmi->mv[0] = mv0;
             mbmi->wm_params[0] = wm_params0;
-#if WARPED_MOTION_SORT_SAMPLES
             mbmi->num_proj_ref[0] = num_proj_ref0;
-#endif  // WARPED_MOTION_SORT_SAMPLES
           }
         }
 
         av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
-        model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
-                        &tmp_dist, skip_txfm_sb, skip_sse_sb);
       } else {
         continue;
       }
+    } else if (is_interintra_mode) {
+      INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
+      int64_t rd, best_interintra_rd = INT64_MAX;
+      int rmode, rate_sum;
+      int64_t dist_sum;
+      int j;
+      int tmp_rate_mv = 0;
+      int tmp_skip_txfm_sb;
+      int bw = block_size_wide[bsize];
+      int64_t tmp_skip_sse_sb;
+      DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
+      DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
+      uint8_t *tmp_buf, *intrapred;
+      const int *const interintra_mode_cost =
+          x->interintra_mode_cost[size_group_lookup[bsize]];
+
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
+        intrapred = CONVERT_TO_BYTEPTR(intrapred_);
+      } else {
+        tmp_buf = tmp_buf_;
+        intrapred = intrapred_;
+      }
+      const int_mv mv0 = mbmi->mv[0];
+
+      mbmi->ref_frame[1] = NONE_FRAME;
+      xd->plane[0].dst.buf = tmp_buf;
+      xd->plane[0].dst.stride = bw;
+      av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+
+      restore_dst_buf(xd, *orig_dst, num_planes);
+      mbmi->ref_frame[1] = INTRA_FRAME;
+      mbmi->use_wedge_interintra = 0;
+      for (j = 0; j < INTERINTRA_MODES; ++j) {
+        mbmi->interintra_mode = (INTERINTRA_MODE)j;
+        rmode = interintra_mode_cost[mbmi->interintra_mode];
+        av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                  intrapred, bw);
+        av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+        model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+                        &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+        rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
+        if (rd < best_interintra_rd) {
+          best_interintra_rd = rd;
+          best_interintra_mode = mbmi->interintra_mode;
+        }
+      }
+      mbmi->interintra_mode = best_interintra_mode;
+      rmode = interintra_mode_cost[mbmi->interintra_mode];
+      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                                intrapred, bw);
+      av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+      av1_subtract_plane(x, bsize, 0);
+      rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                               &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+      if (rd != INT64_MAX)
+        rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum);
+      best_interintra_rd = rd;
+
+      if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) {
+        // restore ref_frame[1]
+        mbmi->ref_frame[1] = ref_frame_1;
+        continue;
+      }
+
+      if (is_interintra_wedge_used(bsize)) {
+        int64_t best_interintra_rd_nowedge = INT64_MAX;
+        int64_t best_interintra_rd_wedge = INT64_MAX;
+        int_mv tmp_mv;
+        InterpFilters backup_interp_filters = mbmi->interp_filters;
+        int rwedge = x->wedge_interintra_cost[bsize][0];
+        if (rd != INT64_MAX)
+          rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum + rwedge, dist_sum);
+        best_interintra_rd_nowedge = rd;
+
+        // Disable wedge search if source variance is small
+        if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
+          mbmi->use_wedge_interintra = 1;
+
+          rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
+                   x->wedge_interintra_cost[bsize][1];
+
+          best_interintra_rd_wedge =
+              pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+          best_interintra_rd_wedge +=
+              RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0);
+          // Refine motion vector.
+          if (have_newmv_in_inter_mode(mbmi->mode)) {
+            // get negative of mask
+            const uint8_t *mask = av1_get_contiguous_soft_mask(
+                mbmi->interintra_wedge_index, 1, bsize);
+            tmp_mv = av1_get_ref_mv(x, 0);
+            compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
+                                          mi_col, intrapred, mask, bw,
+                                          &tmp_rate_mv, 0);
+            mbmi->mv[0].as_int = tmp_mv.as_int;
+            av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
+                                           bsize);
+            model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+                            &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL,
+                            NULL);
+            rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge,
+                        dist_sum);
+            if (rd >= best_interintra_rd_wedge) {
+              tmp_mv.as_int = mv0.as_int;
+              tmp_rate_mv = rate_mv;
+              mbmi->interp_filters = backup_interp_filters;
+              av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+            }
+          } else {
+            tmp_mv.as_int = mv0.as_int;
+            tmp_rate_mv = rate_mv;
+            av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+          }
+          // Evaluate closer to true rd
+          av1_subtract_plane(x, bsize, 0);
+          rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                                   &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
+                                   INT64_MAX);
+          if (rd != INT64_MAX)
+            rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
+                        dist_sum);
+          best_interintra_rd_wedge = rd;
+          if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+            mbmi->use_wedge_interintra = 1;
+            mbmi->mv[0].as_int = tmp_mv.as_int;
+            tmp_rate2 += tmp_rate_mv - rate_mv;
+          } else {
+            mbmi->use_wedge_interintra = 0;
+            mbmi->mv[0].as_int = mv0.as_int;
+            mbmi->interp_filters = backup_interp_filters;
+          }
+        } else {
+          mbmi->use_wedge_interintra = 0;
+        }
+      }  // if (is_interintra_wedge_used(bsize))
+      restore_dst_buf(xd, *orig_dst, num_planes);
+      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
     }
-#endif  // CONFIG_WARPED_MOTION
+
+    if (!cpi->common.all_lossless)
+      check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
+
     x->skip = 0;
 
     rd_stats->dist = 0;
     rd_stats->sse = 0;
     rd_stats->skip = 1;
     rd_stats->rate = tmp_rate2;
-    if (last_motion_mode_allowed > SIMPLE_TRANSLATION) {
-#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
-      if (last_motion_mode_allowed == WARPED_CAUSAL)
-#endif  // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
+    if (av1_is_interp_needed(xd))
+      rd_stats->rate += av1_get_switchable_rate(cm, x, xd);
+    if (interintra_allowed) {
+      rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]]
+                                          [mbmi->ref_frame[1] == INTRA_FRAME];
+      if (mbmi->ref_frame[1] == INTRA_FRAME) {
+        rd_stats->rate += x->interintra_mode_cost[size_group_lookup[bsize]]
+                                                 [mbmi->interintra_mode];
+        if (is_interintra_wedge_used(bsize)) {
+          rd_stats->rate +=
+              x->wedge_interintra_cost[bsize][mbmi->use_wedge_interintra];
+          if (mbmi->use_wedge_interintra) {
+            rd_stats->rate +=
+                av1_cost_literal(get_interintra_wedge_bits(bsize));
+          }
+        }
+      }
+    }
+    if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) &&
+        (mbmi->ref_frame[1] != INTRA_FRAME)) {
+      if (last_motion_mode_allowed == WARPED_CAUSAL) {
         rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode];
-#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
-      else
+      } else {
         rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode];
-#endif  // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
-    }
-#if CONFIG_WARPED_MOTION
-    if (mbmi->motion_mode == WARPED_CAUSAL) {
-      rd_stats->rate -= rs;
+      }
     }
-#endif  // CONFIG_WARPED_MOTION
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-    if (!*skip_txfm_sb) {
+    if (!skip_txfm_sb) {
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+      int64_t est_rd = 0;
+      int est_skip = 0;
+      if (cpi->sf.inter_mode_rd_model_estimation) {
+        InterModeRdModel *md = &inter_mode_rd_models[mbmi->sb_type];
+        if (md->ready) {
+          const int64_t curr_sse = get_sse(cpi, x);
+          est_rd =
+              get_est_rd(mbmi->sb_type, x->rdmult, curr_sse, rd_stats->rate);
+          est_skip = est_rd * 0.8 > *best_est_rd;
+#if INTER_MODE_RD_TEST
+          if (est_rd < *best_est_rd) {
+            *best_est_rd = est_rd;
+          }
+#else   // INTER_MODE_RD_TEST
+          if (est_skip) {
+            ++md->skip_count;
+            mbmi->ref_frame[1] = ref_frame_1;
+            continue;
+          } else {
+            if (est_rd < *best_est_rd) {
+              *best_est_rd = est_rd;
+            }
+            ++md->non_skip_count;
+          }
+#endif  // INTER_MODE_RD_TEST
+        }
+      }
+#endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
+
       int64_t rdcosty = INT64_MAX;
       int is_cost_valid_uv = 0;
 
       // cost and distortion
       av1_subtract_plane(x, bsize, 0);
-#if CONFIG_VAR_TX
       if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
-        select_tx_type_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
+        // Motion mode
+        select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col,
+                           ref_best_rd);
+#if CONFIG_COLLECT_RD_STATS == 2
+        PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize);
+#endif  // CONFIG_COLLECT_RD_STATS == 2
       } else {
-        int idx, idy;
         super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
-        for (idy = 0; idy < xd->n8_h; ++idy)
-          for (idx = 0; idx < xd->n8_w; ++idx)
-            mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
-        memset(x->blk_skip[0], rd_stats_y->skip,
-               sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+        memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+        memset(x->blk_skip, rd_stats_y->skip,
+               sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
       }
-#else
-    /* clang-format off */
-      super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
-/* clang-format on */
-#endif  // CONFIG_VAR_TX
 
       if (rd_stats_y->rate == INT_MAX) {
         av1_invalid_rd_stats(rd_stats);
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-        if (mbmi->motion_mode != SIMPLE_TRANSLATION) {
+        if (mbmi->motion_mode != SIMPLE_TRANSLATION ||
+            mbmi->ref_frame[1] == INTRA_FRAME) {
+          mbmi->ref_frame[1] = ref_frame_1;
           continue;
         } else {
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-          restore_dst_buf(xd, *orig_dst);
+          restore_dst_buf(xd, *orig_dst, num_planes);
+          mbmi->ref_frame[1] = ref_frame_1;
           return INT64_MAX;
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
         }
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
       }
 
       av1_merge_rd_stats(rd_stats, rd_stats_y);
 
       rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
       rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse));
-/* clang-format off */
-#if CONFIG_VAR_TX
-      is_cost_valid_uv =
-          inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty);
-#else
-      is_cost_valid_uv =
-          super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty);
-#endif  // CONFIG_VAR_TX
-      if (!is_cost_valid_uv) {
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-        continue;
-#else
-        restore_dst_buf(xd, *orig_dst);
-        return INT64_MAX;
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+      if (num_planes > 1) {
+        /* clang-format off */
+        is_cost_valid_uv =
+            inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty,
+                             FTXS_NONE);
+        if (!is_cost_valid_uv) {
+          mbmi->ref_frame[1] = ref_frame_1;
+          continue;
+        }
+        /* clang-format on */
+        av1_merge_rd_stats(rd_stats, rd_stats_uv);
+      } else {
+        av1_init_rd_stats(rd_stats_uv);
       }
-      /* clang-format on */
-      av1_merge_rd_stats(rd_stats, rd_stats_uv);
 #if CONFIG_RD_DEBUG
       // record transform block coefficient cost
       // TODO(angiebird): So far rd_debug tool only detects discrepancy of
@@ -9038,812 +8074,766 @@ static int64_t motion_mode_rd(
       // other place when we need to compare non-coefficient cost.
       mbmi->rd_stats = *rd_stats;
 #endif  // CONFIG_RD_DEBUG
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+      const int skip_ctx = av1_get_skip_context(xd);
       if (rd_stats->skip) {
         rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
         rd_stats_y->rate = 0;
         rd_stats_uv->rate = 0;
-        rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+        rd_stats->rate += x->skip_cost[skip_ctx][1];
         mbmi->skip = 0;
         // here mbmi->skip temporarily plays a role as what this_skip2 does
       } else if (!xd->lossless[mbmi->segment_id] &&
                  (RDCOST(x->rdmult,
                          rd_stats_y->rate + rd_stats_uv->rate +
-                             av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
-                         rd_stats->dist) >=
-                  RDCOST(x->rdmult, av1_cost_bit(av1_get_skip_prob(cm, xd), 1),
-                         rd_stats->sse))) {
+                             x->skip_cost[skip_ctx][0],
+                         rd_stats->dist) >= RDCOST(x->rdmult,
+                                                   x->skip_cost[skip_ctx][1],
+                                                   rd_stats->sse))) {
         rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
-        rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+        rd_stats->rate += x->skip_cost[skip_ctx][1];
         rd_stats->dist = rd_stats->sse;
         rd_stats_y->rate = 0;
         rd_stats_uv->rate = 0;
         mbmi->skip = 1;
       } else {
-        rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+        rd_stats->rate += x->skip_cost[skip_ctx][0];
         mbmi->skip = 0;
       }
       *disable_skip = 0;
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+      if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
+          cm->tile_rows == 1) {
+#if INTER_MODE_RD_TEST
+        if (md->ready) {
+          int64_t real_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+          if (est_skip) {
+            ++md->skip_count;
+            if (real_rd < ref_best_rd) {
+              ++md->fp_skip_count;
+            }
+            // int fp_skip = real_rd < ref_best_rd;
+            // printf("est_skip %d fp_skip %d est_rd %ld best_est_rd %ld real_rd
+            // %ld ref_best_rd %ld\n",
+            //        est_skip, fp_skip, est_rd, *best_est_rd, real_rd,
+            //        ref_best_rd);
+          } else {
+            ++md->non_skip_count;
+          }
+        }
+#endif  // INTER_MODE_RD_TEST
+        inter_mode_data_push(mbmi->sb_type, rd_stats->sse, rd_stats->dist,
+                             rd_stats_y->rate + rd_stats_uv->rate +
+                                 x->skip_cost[skip_ctx][mbmi->skip],
+                             rd_stats->rate, ref_best_rd);
+      }
+#endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
+      int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+      if (curr_rd < ref_best_rd) {
+        ref_best_rd = curr_rd;
+      }
     } else {
       x->skip = 1;
       *disable_skip = 1;
-      mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1);
+      mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
 
-// The cost of skip bit needs to be added.
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+      // The cost of skip bit needs to be added.
       mbmi->skip = 0;
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-      rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+      rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1];
 
-      rd_stats->dist = *skip_sse_sb;
-      rd_stats->sse = *skip_sse_sb;
+      rd_stats->dist = 0;
+      rd_stats->sse = 0;
       rd_stats_y->rate = 0;
       rd_stats_uv->rate = 0;
       rd_stats->skip = 1;
     }
 
-#if CONFIG_GLOBAL_MOTION
-    if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) {
-      if (is_nontrans_global_motion(xd)) {
-        rd_stats->rate -= rs;
+    if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
+      if (is_nontrans_global_motion(xd, xd->mi[0])) {
         mbmi->interp_filters = av1_broadcast_interp_filter(
             av1_unswitchable_filter(cm->interp_filter));
       }
     }
-#endif  // CONFIG_GLOBAL_MOTION
 
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
     tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-    if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) {
+    if ((mbmi->motion_mode == SIMPLE_TRANSLATION &&
+         mbmi->ref_frame[1] != INTRA_FRAME) ||
+        (tmp_rd < best_rd)) {
       best_mbmi = *mbmi;
       best_rd = tmp_rd;
       best_rd_stats = *rd_stats;
       best_rd_stats_y = *rd_stats_y;
-      best_rd_stats_uv = *rd_stats_uv;
-#if CONFIG_VAR_TX
-      for (int i = 0; i < MAX_MB_PLANE; ++i)
-        memcpy(best_blk_skip[i], x->blk_skip[i],
-               sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
-#endif  // CONFIG_VAR_TX
+      if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
+      memcpy(best_blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
       best_xskip = x->skip;
       best_disable_skip = *disable_skip;
+      if (best_xskip) break;
     }
   }
+  mbmi->ref_frame[1] = ref_frame_1;
 
   if (best_rd == INT64_MAX) {
     av1_invalid_rd_stats(rd_stats);
-    restore_dst_buf(xd, *orig_dst);
+    restore_dst_buf(xd, *orig_dst, num_planes);
     return INT64_MAX;
   }
   *mbmi = best_mbmi;
   *rd_stats = best_rd_stats;
   *rd_stats_y = best_rd_stats_y;
-  *rd_stats_uv = best_rd_stats_uv;
-#if CONFIG_VAR_TX
-  for (int i = 0; i < MAX_MB_PLANE; ++i)
-    memcpy(x->blk_skip[i], best_blk_skip[i],
-           sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
-#endif  // CONFIG_VAR_TX
+  if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
+  memcpy(x->blk_skip, best_blk_skip,
+         sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
   x->skip = best_xskip;
   *disable_skip = best_disable_skip;
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
-  restore_dst_buf(xd, *orig_dst);
+  restore_dst_buf(xd, *orig_dst, num_planes);
+  return 0;
+}
+
+static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
+                            MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row,
+                            int mi_col, BUFFER_SET *const orig_dst) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+
+  int64_t total_sse = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblock_plane *const p = &x->plane[plane];
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int bw = block_size_wide[plane_bsize];
+    const int bh = block_size_high[plane_bsize];
+
+    av1_subtract_plane(x, bsize, plane);
+    int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh);
+    sse = sse << 4;
+    total_sse += sse;
+  }
+  const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+  rd_stats->dist = rd_stats->sse = total_sse;
+  rd_stats->rate = x->skip_mode_cost[skip_mode_ctx][1];
+  rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+
+  restore_dst_buf(xd, *orig_dst, num_planes);
   return 0;
 }
 
+#ifndef NDEBUG
+static INLINE int is_single_inter_mode(int this_mode) {
+  return this_mode >= SINGLE_INTER_MODE_START &&
+         this_mode < SINGLE_INTER_MODE_END;
+}
+#endif
+
+static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) {
+  assert(is_single_inter_mode(single_mode));
+  int ref_mv_offset;
+  if (single_mode == NEARESTMV) {
+    ref_mv_offset = 0;
+  } else if (single_mode == NEARMV) {
+    ref_mv_offset = ref_mv_idx + 1;
+  } else {
+    ref_mv_offset = -1;
+  }
+  return ref_mv_offset;
+}
+
+static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
+                               int ref_mv_idx,
+                               const MV_REFERENCE_FRAME *ref_frame,
+                               const MB_MODE_INFO_EXT *mbmi_ext) {
+  const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+  const int is_comp_pred = ref_frame[1] > INTRA_FRAME;
+  const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred);
+  assert(is_single_inter_mode(single_mode));
+  if (single_mode == NEWMV) {
+    this_mv->as_int = INVALID_MV;
+  } else if (single_mode == GLOBALMV) {
+    *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+  } else {
+    assert(single_mode == NEARMV || single_mode == NEARESTMV);
+    const int ref_mv_offset = get_ref_mv_offset(single_mode, ref_mv_idx);
+    if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) {
+      assert(ref_mv_offset >= 0);
+      if (ref_idx == 0) {
+        *this_mv =
+            mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv;
+      } else {
+        *this_mv =
+            mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv;
+      }
+    } else {
+      *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+    }
+  }
+}
+
+// This function update the non-new mv for the current prediction mode
+static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode,
+                               const AV1_COMMON *cm, const MACROBLOCK *x) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_comp_pred = has_second_ref(mbmi);
+  int ret = 1;
+  for (int i = 0; i < is_comp_pred + 1; ++i) {
+    int_mv this_mv;
+    get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, mbmi->ref_frame,
+                x->mbmi_ext);
+    const int single_mode = get_single_mode(this_mode, i, is_comp_pred);
+    if (single_mode == NEWMV) {
+      cur_mv[i] = this_mv;
+    } else {
+      ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x);
+    }
+  }
+  return ret;
+}
+
+static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
+                               const MB_MODE_INFO_EXT *mbmi_ext,
+                               int (*drl_mode_cost0)[2],
+                               int8_t ref_frame_type) {
+  int cost = 0;
+  if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+    for (int idx = 0; idx < 2; ++idx) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+        uint8_t drl_ctx =
+            av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+        cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx];
+        if (mbmi->ref_mv_idx == idx) return cost;
+      }
+    }
+    return cost;
+  }
+
+  if (have_nearmv_in_inter_mode(mbmi->mode)) {
+    for (int idx = 1; idx < 3; ++idx) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+        uint8_t drl_ctx =
+            av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+        cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)];
+        if (mbmi->ref_mv_idx == (idx - 1)) return cost;
+      }
+    }
+    return cost;
+  }
+  return cost;
+}
+
 static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize, RD_STATS *rd_stats,
                                  RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
-                                 int *disable_skip,
-                                 int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME],
-#if CONFIG_COMPOUND_SINGLEREF
-                                 int_mv (*mode_comp_mv)[TOTAL_REFS_PER_FRAME],
-#endif  // CONFIG_COMPOUND_SINGLEREF
-                                 int mi_row, int mi_col,
-                                 HandleInterModeArgs *args,
-                                 const int64_t ref_best_rd) {
+                                 int *disable_skip, int mi_row, int mi_col,
+                                 HandleInterModeArgs *args, int64_t ref_best_rd
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+                                 ,
+                                 int64_t *best_est_rd
+#endif
+) {
   const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mi[0];
-  MB_MODE_INFO *mbmi = &mi->mbmi;
+  MB_MODE_INFO *mbmi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const int is_comp_pred = has_second_ref(mbmi);
   const int this_mode = mbmi->mode;
-#if CONFIG_COMPOUND_SINGLEREF
-  const int is_singleref_comp_mode = is_inter_singleref_comp_mode(this_mode);
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  int_mv *frame_mv = mode_mv[this_mode];
-#if CONFIG_COMPOUND_SINGLEREF
-  // The comp mv for the compound mode in single ref
-  int_mv *frame_comp_mv = mode_comp_mv[this_mode];
-#endif  // CONFIG_COMPOUND_SINGLEREF
   int i;
   int refs[2] = { mbmi->ref_frame[0],
                   (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
-  int_mv cur_mv[2];
   int rate_mv = 0;
-  int pred_exists = 1;
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA
   const int bw = block_size_wide[bsize];
-#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-  int_mv single_newmv[TOTAL_REFS_PER_FRAME];
-#if CONFIG_INTERINTRA
-  const int *const interintra_mode_cost =
-      x->interintra_mode_cost[size_group_lookup[bsize]];
-#endif  // CONFIG_INTERINTRA
-  const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
-  uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-#if CONFIG_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-#else
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf_[MAX_MB_PLANE * MAX_SB_SQUARE]);
-#endif  // CONFIG_HIGHBITDEPTH
+  DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
   uint8_t *tmp_buf;
-
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  int rate2_bmc_nocoeff;
-  MB_MODE_INFO best_bmc_mbmi;
-  int rate_mv_bmc;
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
   int64_t rd = INT64_MAX;
   BUFFER_SET orig_dst, tmp_dst;
-  int rs = 0;
 
   int skip_txfm_sb = 0;
   int64_t skip_sse_sb = INT64_MAX;
   int16_t mode_ctx;
-#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR
-  // dummy fillers
-  mbmi->ncobmc_mode[0] = NO_OVERLAP;
-  mbmi->ncobmc_mode[1] = NO_OVERLAP;
-#endif
 
-#if CONFIG_INTERINTRA
-  int compmode_interintra_cost = 0;
-  mbmi->use_wedge_interintra = 0;
-#endif
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-  int compmode_interinter_cost = 0;
-  mbmi->interinter_compound_type = COMPOUND_AVERAGE;
-#endif
-#if CONFIG_LGT_FROM_PRED
-  mbmi->use_lgt = 0;
-#endif
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+  mbmi->comp_group_idx = 0;
+  mbmi->compound_idx = 1;
+  if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
 
-#if CONFIG_INTERINTRA
-  if (!cm->allow_interintra_compound && is_comp_interintra_pred)
-    return INT64_MAX;
-#endif  // CONFIG_INTERINTRA
-
-  // is_comp_interintra_pred implies !is_comp_pred
-  assert(!is_comp_interintra_pred || (!is_comp_pred));
-  // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type)
-  assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi));
-
-#if CONFIG_COMPOUND_SINGLEREF
-  if (is_comp_pred || is_singleref_comp_mode)
-#else   // !CONFIG_COMPOUND_SINGLEREF
-  if (is_comp_pred)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    mode_ctx = mbmi_ext->compound_mode_context[refs[0]];
-  else
-    mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
-                                         mbmi->ref_frame, bsize, -1);
+  mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
 
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
   else
-#endif  // CONFIG_HIGHBITDEPTH
     tmp_buf = tmp_buf_;
   // Make sure that we didn't leave the plane destination buffers set
   // to tmp_buf at the end of the last iteration
   assert(xd->plane[0].dst.buf != tmp_buf);
 
-#if CONFIG_WARPED_MOTION
   mbmi->num_proj_ref[0] = 0;
   mbmi->num_proj_ref[1] = 0;
-#endif  // CONFIG_WARPED_MOTION
 
   if (is_comp_pred) {
-    if (frame_mv[refs[0]].as_int == INVALID_MV ||
-        frame_mv[refs[1]].as_int == INVALID_MV)
-      return INT64_MAX;
-#if CONFIG_COMPOUND_SINGLEREF
-  } else if (is_singleref_comp_mode) {
-    if (frame_mv[refs[0]].as_int == INVALID_MV ||
-        frame_comp_mv[refs[0]].as_int == INVALID_MV)
-      return INT64_MAX;
-#endif  // CONFIG_COMPOUND_SINGLEREF
+    for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) {
+      const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred);
+      if (single_mode == NEWMV &&
+          args->single_newmv[mbmi->ref_frame[ref_idx]].as_int == INVALID_MV)
+        return INT64_MAX;
+    }
   }
 
   mbmi->motion_mode = SIMPLE_TRANSLATION;
-  if (have_newmv_in_inter_mode(this_mode)) {
-    const int64_t ret_val =
-        handle_newmv(cpi, x, bsize, mode_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-                     mode_comp_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-                     mi_row, mi_col, &rate_mv, single_newmv, args);
-    if (ret_val != 0)
-      return ret_val;
-    else
-      rd_stats->rate += rate_mv;
-  }
-  for (i = 0; i < is_comp_pred + 1; ++i) {
-    cur_mv[i] = frame_mv[refs[i]];
-    // Clip "next_nearest" so that it does not extend to far out of image
-    if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd);
-    if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
-    mbmi->mv[i].as_int = cur_mv[i].as_int;
-  }
-
-#if CONFIG_COMPOUND_SINGLEREF
-  if (!is_comp_pred && is_singleref_comp_mode) {
-    cur_mv[1] = frame_comp_mv[refs[0]];
-    // Clip "next_nearest" so that it does not extend to far out of image
-    if (this_mode != NEWMV) clamp_mv2(&cur_mv[1].as_mv, xd);
-    if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
-    mbmi->mv[1].as_int = cur_mv[1].as_int;
-  }
-#endif  // CONFIG_COMPOUND_SINGLEREF
+  const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+                                   cm->seq_params.enable_masked_compound;
+  int64_t ret_val = INT64_MAX;
+  const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+  rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
+  rd_stats->rate +=
+      get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+  const RD_STATS backup_rd_stats = *rd_stats;
+  const RD_STATS backup_rd_stats_y = *rd_stats_y;
+  const RD_STATS backup_rd_stats_uv = *rd_stats_uv;
+  const MB_MODE_INFO backup_mbmi = *mbmi;
+  INTERINTER_COMPOUND_DATA best_compound_data;
+  uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE];
+  RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+  int64_t best_rd = INT64_MAX;
+  int64_t best_ret_val = INT64_MAX;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  MB_MODE_INFO best_mbmi = *mbmi;
+  int64_t early_terminate = 0;
+  int plane_rate[MAX_MB_PLANE] = { 0 };
+  int64_t plane_sse[MAX_MB_PLANE] = { 0 };
+  int64_t plane_dist[MAX_MB_PLANE] = { 0 };
+  int64_t newmv_ret_val = INT64_MAX;
+  int_mv backup_mv[2] = { { 0 } };
+  int backup_rate_mv = 0;
+
+  int comp_idx;
+  const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp &
+                              (mbmi->mode != GLOBAL_GLOBALMV);
+  // If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
+  for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
+    int rs = 0;
+    int compmode_interinter_cost = 0;
+    early_terminate = 0;
+    *rd_stats = backup_rd_stats;
+    *rd_stats_y = backup_rd_stats_y;
+    *rd_stats_uv = backup_rd_stats_uv;
+    *mbmi = backup_mbmi;
+    mbmi->compound_idx = comp_idx;
 
-  if (this_mode == NEAREST_NEARESTMV) {
-    if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
-      cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
-      cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+    if (is_comp_pred && comp_idx == 0) {
+      mbmi->comp_group_idx = 0;
+      mbmi->compound_idx = 0;
 
-      for (i = 0; i < 2; ++i) {
-        clamp_mv2(&cur_mv[i].as_mv, xd);
-        if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
-        mbmi->mv[i].as_int = cur_mv[i].as_int;
+      const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+      const int comp_index_ctx = get_comp_index_context(cm, xd);
+      if (masked_compound_used) {
+        compmode_interinter_cost +=
+            x->comp_group_idx_cost[comp_group_idx_ctx][0];
       }
+      compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0];
     }
-  }
-
-  if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
-#if CONFIG_COMPOUND_SINGLEREF
-    if (this_mode == NEAREST_NEWMV ||  // this_mode == SR_NEAREST_NEWMV ||
-        this_mode == SR_NEAREST_NEARMV)
-#else   // !CONFIG_COMPOUND_SINGLEREF
-    if (this_mode == NEAREST_NEWMV)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    {
-      cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
 
-#if CONFIG_AMVR
-      lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv,
-                         cm->cur_frame_mv_precision_level);
-#else
-      lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
-#endif
-      clamp_mv2(&cur_mv[0].as_mv, xd);
-      if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
-      mbmi->mv[0].as_int = cur_mv[0].as_int;
+    int_mv cur_mv[2];
+    if (!build_cur_mv(cur_mv, this_mode, cm, x)) {
+      early_terminate = INT64_MAX;
+      continue;
     }
+    if (have_newmv_in_inter_mode(this_mode)) {
+      if (comp_idx == 0) {
+        cur_mv[0] = backup_mv[0];
+        cur_mv[1] = backup_mv[1];
+        rate_mv = backup_rate_mv;
+      }
 
-    if (this_mode == NEW_NEARESTMV) {
-      cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+      // when jnt_comp_skip_mv_search flag is on, new mv will be searched once
+      if (!(search_jnt_comp && cpi->sf.jnt_comp_skip_mv_search &&
+            comp_idx == 0)) {
+        newmv_ret_val =
+            handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col, &rate_mv, args);
 
-#if CONFIG_AMVR
-      lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv,
-                         cm->cur_frame_mv_precision_level);
-#else
-      lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
-#endif
-      clamp_mv2(&cur_mv[1].as_mv, xd);
-      if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
-      mbmi->mv[1].as_int = cur_mv[1].as_int;
-    }
-  }
-
-  if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
-    int ref_mv_idx = mbmi->ref_mv_idx + 1;
-    if (this_mode == NEAR_NEWMV ||
-#if CONFIG_COMPOUND_SINGLEREF
-        this_mode == SR_NEAR_NEWMV ||
-#endif  // CONFIG_COMPOUND_SINGLEREF
-        this_mode == NEAR_NEARMV) {
-      cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+        // Store cur_mv and rate_mv so that they can be restored in the next
+        // iteration of the loop
+        backup_mv[0] = cur_mv[0];
+        backup_mv[1] = cur_mv[1];
+        backup_rate_mv = rate_mv;
+      }
 
-#if CONFIG_AMVR
-      lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv,
-                         cm->cur_frame_mv_precision_level);
-#else
-      lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
-#endif
-      clamp_mv2(&cur_mv[0].as_mv, xd);
-      if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
-      mbmi->mv[0].as_int = cur_mv[0].as_int;
+      if (newmv_ret_val != 0) {
+        early_terminate = INT64_MAX;
+        continue;
+      } else {
+        rd_stats->rate += rate_mv;
+      }
+    }
+    for (i = 0; i < is_comp_pred + 1; ++i) {
+      mbmi->mv[i].as_int = cur_mv[i].as_int;
+    }
+
+    // Initialise tmp_dst and orig_dst buffers to prevent "may be used
+    // uninitialized" warnings in GCC when the stream is monochrome.
+    memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane));
+    memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride));
+    memset(orig_dst.plane, 0, sizeof(tmp_dst.plane));
+    memset(orig_dst.stride, 0, sizeof(tmp_dst.stride));
+
+    // do first prediction into the destination buffer. Do the next
+    // prediction into a temporary buffer. Then keep track of which one
+    // of these currently holds the best predictor, and use the other
+    // one for future predictions. In the end, copy from tmp_buf to
+    // dst if necessary.
+    for (i = 0; i < num_planes; i++) {
+      tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
+      tmp_dst.stride[i] = MAX_SB_SIZE;
+    }
+    for (i = 0; i < num_planes; i++) {
+      orig_dst.plane[i] = xd->plane[i].dst.buf;
+      orig_dst.stride[i] = xd->plane[i].dst.stride;
+    }
+
+    const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
+#if USE_DISCOUNT_NEWMV_TEST
+    // We don't include the cost of the second reference here, because there
+    // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
+    // words if you present them in that order, the second one is always known
+    // if the first is known.
+    //
+    // Under some circumstances we discount the cost of new mv mode to encourage
+    // initiation of a motion field.
+    if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
+      // discount_newmv_test only applies discount on NEWMV mode.
+      assert(this_mode == NEWMV);
+      rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx),
+                               cost_mv_ref(x, NEARESTMV, mode_ctx));
+    } else {
+      rd_stats->rate += ref_mv_cost;
     }
-
-    if (this_mode == NEW_NEARMV ||
-#if CONFIG_COMPOUND_SINGLEREF
-        this_mode == SR_NEAREST_NEARMV ||
-#endif  // CONFIG_COMPOUND_SINGLEREF
-        this_mode == NEAR_NEARMV) {
-#if CONFIG_COMPOUND_SINGLEREF
-      if (this_mode == SR_NEAREST_NEARMV)
-        cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
-      else
-#endif  // CONFIG_COMPOUND_SINGLEREF
-        cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
-
-#if CONFIG_AMVR
-      lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv,
-                         cm->cur_frame_mv_precision_level);
 #else
-      lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
+    rd_stats->rate += ref_mv_cost;
 #endif
-      clamp_mv2(&cur_mv[1].as_mv, xd);
-      if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
-      mbmi->mv[1].as_int = cur_mv[1].as_int;
+
+    if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+        mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+      early_terminate = INT64_MAX;
+      continue;
     }
-  }
 
-  // do first prediction into the destination buffer. Do the next
-  // prediction into a temporary buffer. Then keep track of which one
-  // of these currently holds the best predictor, and use the other
-  // one for future predictions. In the end, copy from tmp_buf to
-  // dst if necessary.
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
-    tmp_dst.stride[i] = MAX_SB_SIZE;
-  }
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    orig_dst.plane[i] = xd->plane[i].dst.buf;
-    orig_dst.stride[i] = xd->plane[i].dst.stride;
-  }
+    ret_val = interpolation_filter_search(
+        x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter,
+        &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
+    if (ret_val != 0) {
+      early_terminate = INT64_MAX;
+      restore_dst_buf(xd, orig_dst, num_planes);
+      continue;
+    } else if (cpi->sf.model_based_post_interp_filter_breakout &&
+               ref_best_rd != INT64_MAX && (rd / 6) > ref_best_rd) {
+      early_terminate = INT64_MAX;
+      restore_dst_buf(xd, orig_dst, num_planes);
+      if ((rd >> 4) > ref_best_rd) break;
+      continue;
+    }
 
-  // We don't include the cost of the second reference here, because there
-  // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
-  // words if you present them in that order, the second one is always known
-  // if the first is known.
-  //
-  // Under some circumstances we discount the cost of new mv mode to encourage
-  // initiation of a motion field.
-  if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv,
-                          refs[0])) {
-    rd_stats->rate += AOMMIN(
-        cost_mv_ref(x, this_mode, mode_ctx),
-        cost_mv_ref(x, is_comp_pred ? NEAREST_NEARESTMV : NEARESTMV, mode_ctx));
-  } else {
-    rd_stats->rate += cost_mv_ref(x, this_mode, mode_ctx);
-  }
+    if (is_comp_pred && comp_idx) {
+      int rate_sum, rs2;
+      int64_t dist_sum;
+      int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX;
+      int_mv best_mv[2];
+      int best_tmp_rate_mv = rate_mv;
+      int tmp_skip_txfm_sb;
+      int64_t tmp_skip_sse_sb;
+      DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]);
+      DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]);
+      uint8_t *preds0[1] = { pred0 };
+      uint8_t *preds1[1] = { pred1 };
+      int strides[1] = { bw };
+      int tmp_rate_mv;
+      const int num_pix = 1 << num_pels_log2_lookup[bsize];
+      COMPOUND_TYPE cur_type;
+      int best_compmode_interinter_cost = 0;
+      int can_use_previous = cm->allow_warped_motion;
+
+      best_mv[0].as_int = cur_mv[0].as_int;
+      best_mv[1].as_int = cur_mv[1].as_int;
 
-  if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
-      mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV)
-    return INT64_MAX;
+      if (masked_compound_used) {
+        // get inter predictors to use for masked compound modes
+        av1_build_inter_predictors_for_planes_single_buf(
+            xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides,
+            can_use_previous);
+        av1_build_inter_predictors_for_planes_single_buf(
+            xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides,
+            can_use_previous);
+      }
+
+      int best_comp_group_idx = 0;
+      int best_compound_idx = 1;
+      for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
+        if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
+        if (!is_interinter_compound_used(cur_type, bsize)) continue;
+        tmp_rate_mv = rate_mv;
+        best_rd_cur = INT64_MAX;
+        mbmi->interinter_comp.type = cur_type;
+        int masked_type_cost = 0;
+
+        const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+        const int comp_index_ctx = get_comp_index_context(cm, xd);
+        if (masked_compound_used) {
+          if (cur_type == COMPOUND_AVERAGE) {
+            mbmi->comp_group_idx = 0;
+            mbmi->compound_idx = 1;
+
+            masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0];
+            masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
+          } else {
+            mbmi->comp_group_idx = 1;
+            mbmi->compound_idx = 1;
 
-  int64_t ret_val = interpolation_filter_search(
-      x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter,
-      &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
-  if (ret_val != 0) return ret_val;
+            masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
+            masked_type_cost +=
+                x->compound_type_cost[bsize][mbmi->interinter_comp.type - 1];
+          }
+        } else {
+          mbmi->comp_group_idx = 0;
+          mbmi->compound_idx = 1;
 
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  best_bmc_mbmi = *mbmi;
-  rate2_bmc_nocoeff = rd_stats->rate;
-  if (cm->interp_filter == SWITCHABLE) rate2_bmc_nocoeff += rs;
-  rate_mv_bmc = rate_mv;
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+          masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
+        }
+        rs2 = masked_type_cost;
+
+        switch (cur_type) {
+          case COMPOUND_AVERAGE:
+            av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst,
+                                           bsize);
+            av1_subtract_plane(x, bsize, 0);
+            rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                                     &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
+                                     INT64_MAX);
+            if (rd != INT64_MAX)
+              best_rd_cur =
+                  RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum);
+            break;
+          case COMPOUND_WEDGE:
+            if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+                best_rd_compound / 3 < ref_best_rd) {
+              best_rd_cur = build_and_cost_compound_type(
+                  cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst,
+                  &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
+            }
+            break;
+          case COMPOUND_DIFFWTD:
+            if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+                best_rd_compound / 3 < ref_best_rd) {
+              best_rd_cur = build_and_cost_compound_type(
+                  cpi, x, cur_mv, bsize, this_mode, &rs2, rate_mv, &orig_dst,
+                  &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
+            }
+            break;
+          default: assert(0); return INT64_MAX;
+        }
 
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-#if CONFIG_COMPOUND_SINGLEREF
-  if (is_comp_pred || is_singleref_comp_mode)
-#else
-  if (is_comp_pred)
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  {
-    int rate_sum, rs2;
-    int64_t dist_sum;
-    int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX;
-    INTERINTER_COMPOUND_DATA best_compound_data;
-    int_mv best_mv[2];
-    int best_tmp_rate_mv = rate_mv;
-    int tmp_skip_txfm_sb;
-    int64_t tmp_skip_sse_sb;
-    DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]);
-    DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]);
-    uint8_t *preds0[1] = { pred0 };
-    uint8_t *preds1[1] = { pred1 };
-    int strides[1] = { bw };
-    int tmp_rate_mv;
-    int masked_compound_used = is_any_masked_compound_used(bsize);
-#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-    masked_compound_used = masked_compound_used && cm->allow_masked_compound;
-#endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-    COMPOUND_TYPE cur_type;
-    int best_compmode_interinter_cost = 0;
-
-    best_mv[0].as_int = cur_mv[0].as_int;
-    best_mv[1].as_int = cur_mv[1].as_int;
-    memset(&best_compound_data, 0, sizeof(best_compound_data));
-#if CONFIG_COMPOUND_SEGMENT
-    uint8_t tmp_mask_buf[2 * MAX_SB_SQUARE];
-    best_compound_data.seg_mask = tmp_mask_buf;
-#endif  // CONFIG_COMPOUND_SEGMENT
-
-#if CONFIG_COMPOUND_SINGLEREF
-    // TODO(zoeliu): To further check whether the following setups are needed.
-    // Single ref compound mode: Prepare the 2nd ref frame predictor the same as
-    // the 1st one.
-    if (!is_comp_pred && is_singleref_comp_mode) {
-      xd->block_refs[1] = xd->block_refs[0];
-      for (i = 0; i < MAX_MB_PLANE; i++)
-        xd->plane[i].pre[1] = xd->plane[i].pre[0];
-    }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-    if (masked_compound_used) {
-      // get inter predictors to use for masked compound modes
-      av1_build_inter_predictors_for_planes_single_buf(
-          xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides);
-      av1_build_inter_predictors_for_planes_single_buf(
-          xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides);
-    }
-
-    for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
-      if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
-      if (!is_interinter_compound_used(cur_type, bsize)) continue;
-      tmp_rate_mv = rate_mv;
-      best_rd_cur = INT64_MAX;
-      mbmi->interinter_compound_type = cur_type;
-      int masked_type_cost = 0;
-      if (masked_compound_used) {
-#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
-        if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize))
-          masked_type_cost += av1_cost_literal(1);
-        else
-#endif  // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
-          masked_type_cost +=
-              x->compound_type_cost[bsize][mbmi->interinter_compound_type];
-      }
-      rs2 = av1_cost_literal(get_interinter_compound_type_bits(
-                bsize, mbmi->interinter_compound_type)) +
-            masked_type_cost;
-
-      switch (cur_type) {
-        case COMPOUND_AVERAGE:
-          av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst,
-                                         bsize);
-          av1_subtract_plane(x, bsize, 0);
-          rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                                   &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
-                                   INT64_MAX);
-          if (rd != INT64_MAX)
-            best_rd_cur = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum);
+        if (best_rd_cur < best_rd_compound) {
+          best_comp_group_idx = mbmi->comp_group_idx;
+          best_compound_idx = mbmi->compound_idx;
           best_rd_compound = best_rd_cur;
-          break;
-#if CONFIG_WEDGE
-        case COMPOUND_WEDGE:
-          if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
-              best_rd_compound / 3 < ref_best_rd) {
-            best_rd_cur = build_and_cost_compound_type(
-                cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst,
-                &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
-          }
-          break;
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-        case COMPOUND_SEG:
-          if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
-              best_rd_compound / 3 < ref_best_rd) {
-            best_rd_cur = build_and_cost_compound_type(
-                cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst,
-                &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
-          }
-          break;
-#endif  // CONFIG_COMPOUND_SEGMENT
-        default: assert(0); return 0;
-      }
-
-      if (best_rd_cur < best_rd_compound) {
-        best_rd_compound = best_rd_cur;
-#if CONFIG_WEDGE
-        best_compound_data.wedge_index = mbmi->wedge_index;
-        best_compound_data.wedge_sign = mbmi->wedge_sign;
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-        best_compound_data.mask_type = mbmi->mask_type;
-        memcpy(best_compound_data.seg_mask, xd->seg_mask,
-               2 * MAX_SB_SQUARE * sizeof(uint8_t));
-#endif  // CONFIG_COMPOUND_SEGMENT
-        best_compound_data.interinter_compound_type =
-            mbmi->interinter_compound_type;
-        best_compmode_interinter_cost = rs2;
-        if (have_newmv_in_inter_mode(this_mode)) {
-          if (use_masked_motion_search(cur_type)) {
-            best_tmp_rate_mv = tmp_rate_mv;
-            best_mv[0].as_int = mbmi->mv[0].as_int;
-            best_mv[1].as_int = mbmi->mv[1].as_int;
-          } else {
-            best_mv[0].as_int = cur_mv[0].as_int;
-            best_mv[1].as_int = cur_mv[1].as_int;
+          best_compound_data = mbmi->interinter_comp;
+          memcpy(tmp_best_mask_buf, xd->seg_mask,
+                 2 * num_pix * sizeof(uint8_t));
+          best_compmode_interinter_cost = rs2;
+          if (have_newmv_in_inter_mode(this_mode)) {
+            if (use_masked_motion_search(cur_type)) {
+              best_tmp_rate_mv = tmp_rate_mv;
+              best_mv[0].as_int = mbmi->mv[0].as_int;
+              best_mv[1].as_int = mbmi->mv[1].as_int;
+            } else {
+              best_mv[0].as_int = cur_mv[0].as_int;
+              best_mv[1].as_int = cur_mv[1].as_int;
+            }
           }
         }
+        // reset to original mvs for next iteration
+        mbmi->mv[0].as_int = cur_mv[0].as_int;
+        mbmi->mv[1].as_int = cur_mv[1].as_int;
+      }
+      mbmi->comp_group_idx = best_comp_group_idx;
+      mbmi->compound_idx = best_compound_idx;
+      mbmi->interinter_comp = best_compound_data;
+      assert(IMPLIES(mbmi->comp_group_idx == 1,
+                     mbmi->interinter_comp.type != COMPOUND_AVERAGE));
+      memcpy(xd->seg_mask, tmp_best_mask_buf, 2 * num_pix * sizeof(uint8_t));
+      if (have_newmv_in_inter_mode(this_mode)) {
+        mbmi->mv[0].as_int = best_mv[0].as_int;
+        mbmi->mv[1].as_int = best_mv[1].as_int;
+        if (use_masked_motion_search(mbmi->interinter_comp.type)) {
+          rd_stats->rate += best_tmp_rate_mv - rate_mv;
+          rate_mv = best_tmp_rate_mv;
+        }
       }
-      // reset to original mvs for next iteration
-      mbmi->mv[0].as_int = cur_mv[0].as_int;
-      mbmi->mv[1].as_int = cur_mv[1].as_int;
-    }
-#if CONFIG_WEDGE
-    mbmi->wedge_index = best_compound_data.wedge_index;
-    mbmi->wedge_sign = best_compound_data.wedge_sign;
-#endif  // CONFIG_WEDGE
-#if CONFIG_COMPOUND_SEGMENT
-    mbmi->mask_type = best_compound_data.mask_type;
-    memcpy(xd->seg_mask, best_compound_data.seg_mask,
-           2 * MAX_SB_SQUARE * sizeof(uint8_t));
-#endif  // CONFIG_COMPOUND_SEGMENT
-    mbmi->interinter_compound_type =
-        best_compound_data.interinter_compound_type;
-    if (have_newmv_in_inter_mode(this_mode)) {
-      mbmi->mv[0].as_int = best_mv[0].as_int;
-      mbmi->mv[1].as_int = best_mv[1].as_int;
-      xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
-      xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
-      if (use_masked_motion_search(mbmi->interinter_compound_type)) {
-        rd_stats->rate += best_tmp_rate_mv - rate_mv;
-        rate_mv = best_tmp_rate_mv;
-      }
-    }
 
-    if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
-      restore_dst_buf(xd, orig_dst);
-      return INT64_MAX;
+      if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        early_terminate = INT64_MAX;
+        continue;
+      }
+      compmode_interinter_cost = best_compmode_interinter_cost;
     }
 
-    pred_exists = 0;
-
-    compmode_interinter_cost = best_compmode_interinter_cost;
-  }
-#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-
-#if CONFIG_INTERINTRA
-  if (is_comp_interintra_pred) {
-    INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
-    int64_t best_interintra_rd = INT64_MAX;
-    int rmode, rate_sum;
-    int64_t dist_sum;
-    int j;
-    int tmp_rate_mv = 0;
-    int tmp_skip_txfm_sb;
-    int64_t tmp_skip_sse_sb;
-    DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_SB_SQUARE]);
-    uint8_t *intrapred;
-
-#if CONFIG_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      intrapred = CONVERT_TO_BYTEPTR(intrapred_);
-    else
-#endif  // CONFIG_HIGHBITDEPTH
-      intrapred = intrapred_;
-
-    mbmi->ref_frame[1] = NONE_FRAME;
-    for (j = 0; j < MAX_MB_PLANE; j++) {
-      xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE;
-      xd->plane[j].dst.stride = bw;
+    if (is_comp_pred) {
+      int tmp_rate;
+      int64_t tmp_dist;
+      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize);
+      model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
+                      &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate,
+                      plane_sse, plane_dist);
+      rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist);
+    }
+
+    if (search_jnt_comp) {
+      // if 1/2 model rd is larger than best_rd in jnt_comp mode,
+      // use jnt_comp mode, save additional search
+      if ((rd >> 1) > best_rd) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      }
     }
-    av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, bsize);
-    restore_dst_buf(xd, orig_dst);
-    mbmi->ref_frame[1] = INTRA_FRAME;
-    mbmi->use_wedge_interintra = 0;
 
-    for (j = 0; j < INTERINTRA_MODES; ++j) {
-      mbmi->interintra_mode = (INTERINTRA_MODE)j;
-      rmode = interintra_mode_cost[mbmi->interintra_mode];
-      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst,
-                                                intrapred, bw);
-      av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-      model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
-                      &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
-      rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
-      if (rd < best_interintra_rd) {
-        best_interintra_rd = rd;
-        best_interintra_mode = mbmi->interintra_mode;
-      }
-    }
-    mbmi->interintra_mode = best_interintra_mode;
-    rmode = interintra_mode_cost[mbmi->interintra_mode];
-    av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst,
-                                              intrapred, bw);
-    av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-    av1_subtract_plane(x, bsize, 0);
-    rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                             &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
-    if (rd != INT64_MAX)
-      rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum);
-    best_interintra_rd = rd;
+    if (!is_comp_pred)
+      args->single_filter[this_mode][refs[0]] =
+          av1_extract_interp_filter(mbmi->interp_filters, 0);
 
-    if (ref_best_rd < INT64_MAX && best_interintra_rd > 2 * ref_best_rd) {
-      // Don't need to call restore_dst_buf here
-      return INT64_MAX;
-    }
-#if CONFIG_WEDGE
-    if (is_interintra_wedge_used(bsize)) {
-      int64_t best_interintra_rd_nowedge = INT64_MAX;
-      int64_t best_interintra_rd_wedge = INT64_MAX;
-      int_mv tmp_mv;
-      int rwedge = av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0);
-      if (rd != INT64_MAX)
-        rd = RDCOST(x->rdmult, rmode + rate_mv + rwedge + rate_sum, dist_sum);
-      best_interintra_rd_nowedge = best_interintra_rd;
-
-      // Disable wedge search if source variance is small
-      if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
-        mbmi->use_wedge_interintra = 1;
-
-        rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
-                 av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 1);
-
-        best_interintra_rd_wedge =
-            pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-
-        best_interintra_rd_wedge +=
-            RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0);
-        // Refine motion vector.
-        if (have_newmv_in_inter_mode(this_mode)) {
-          // get negative of mask
-          const uint8_t *mask = av1_get_contiguous_soft_mask(
-              mbmi->interintra_wedge_index, 1, bsize);
-          tmp_mv.as_int = x->mbmi_ext->ref_mvs[refs[0]][0].as_int;
-          compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
-                                        mi_col, intrapred, mask, bw,
-                                        &tmp_rate_mv, 0);
-          mbmi->mv[0].as_int = tmp_mv.as_int;
-          av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst,
-                                         bsize);
-          model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
-                          &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
-          rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
-                      dist_sum);
-          if (rd >= best_interintra_rd_wedge) {
-            tmp_mv.as_int = cur_mv[0].as_int;
-            tmp_rate_mv = rate_mv;
-          }
-        } else {
-          tmp_mv.as_int = cur_mv[0].as_int;
-          tmp_rate_mv = rate_mv;
-          av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
-        }
-        // Evaluate closer to true rd
-        av1_subtract_plane(x, bsize, 0);
-        rd =
-            estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                                &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
-        if (rd != INT64_MAX)
-          rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
-                      dist_sum);
-        best_interintra_rd_wedge = rd;
-        if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
-          mbmi->use_wedge_interintra = 1;
-          mbmi->mv[0].as_int = tmp_mv.as_int;
-          rd_stats->rate += tmp_rate_mv - rate_mv;
-          rate_mv = tmp_rate_mv;
-        } else {
-          mbmi->use_wedge_interintra = 0;
-          mbmi->mv[0].as_int = cur_mv[0].as_int;
+    if (args->modelled_rd != NULL) {
+      if (is_comp_pred) {
+        const int mode0 = compound_ref0_mode(this_mode);
+        const int mode1 = compound_ref1_mode(this_mode);
+        const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
+                                   args->modelled_rd[mode1][refs[1]]);
+        if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+          restore_dst_buf(xd, orig_dst, num_planes);
+          early_terminate = INT64_MAX;
+          continue;
         }
       } else {
-        mbmi->use_wedge_interintra = 0;
+        args->modelled_rd[this_mode][refs[0]] = rd;
       }
     }
-#endif  // CONFIG_WEDGE
 
-    pred_exists = 0;
-    compmode_interintra_cost =
-        av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1) +
-        interintra_mode_cost[mbmi->interintra_mode];
-    if (is_interintra_wedge_used(bsize)) {
-      compmode_interintra_cost += av1_cost_bit(
-          cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra);
-      if (mbmi->use_wedge_interintra) {
-        compmode_interintra_cost +=
-            av1_cost_literal(get_interintra_wedge_bits(bsize));
+    if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+      // if current pred_error modeled rd is substantially more than the best
+      // so far, do not bother doing full rd
+      if (rd / 2 > ref_best_rd) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        early_terminate = INT64_MAX;
+        continue;
       }
     }
-  } else if (is_interintra_allowed(mbmi)) {
-    compmode_interintra_cost =
-        av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0);
-  }
-#endif  // CONFIG_INTERINTRA
 
-  if (pred_exists == 0) {
-    int tmp_rate;
-    int64_t tmp_dist;
-    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize);
-    model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
-                    &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
-    rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist);
-  }
-
-  if (!is_comp_pred)
-    args->single_filter[this_mode][refs[0]] =
-        av1_extract_interp_filter(mbmi->interp_filters, 0);
+    rd_stats->rate += compmode_interinter_cost;
 
-  if (args->modelled_rd != NULL) {
-    if (is_comp_pred) {
-      const int mode0 = compound_ref0_mode(this_mode);
-      const int mode1 = compound_ref1_mode(this_mode);
-      const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
-                                 args->modelled_rd[mode1][refs[1]]);
-      if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
-        restore_dst_buf(xd, orig_dst);
-        return INT64_MAX;
+    if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
+      // TODO(chengchen): this speed feature introduces big loss.
+      // Need better estimation of rate distortion.
+      rd_stats->rate += rs;
+      rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2];
+      rd_stats_y->rate = plane_rate[0];
+      rd_stats_uv->rate = plane_rate[1] + plane_rate[2];
+      rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2];
+      rd_stats_y->sse = plane_sse[0];
+      rd_stats_uv->sse = plane_sse[1] + plane_sse[2];
+      rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2];
+      rd_stats_y->dist = plane_dist[0];
+      rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
+    } else {
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+      ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
+                               disable_skip, mi_row, mi_col, args, ref_best_rd,
+                               refs, rate_mv, &orig_dst, best_est_rd);
+#else
+      ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
+                               disable_skip, mi_row, mi_col, args, ref_best_rd,
+                               refs, rate_mv, &orig_dst);
+#endif
+    }
+    if (ret_val != INT64_MAX) {
+      if (search_jnt_comp) {
+        int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+        if (tmp_rd < best_rd) {
+          best_rd_stats = *rd_stats;
+          best_rd_stats_y = *rd_stats_y;
+          best_rd_stats_uv = *rd_stats_uv;
+          best_ret_val = ret_val;
+          best_rd = tmp_rd;
+          best_mbmi = *mbmi;
+          memcpy(best_blk_skip, x->blk_skip,
+                 sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
+        }
+        if (tmp_rd < ref_best_rd) {
+          ref_best_rd = tmp_rd;
+        }
       }
-    } else if (!is_comp_interintra_pred) {
-      args->modelled_rd[this_mode][refs[0]] = rd;
     }
-  }
-
-  if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
-    // if current pred_error modeled rd is substantially more than the best
-    // so far, do not bother doing full rd
-    if (rd / 2 > ref_best_rd) {
-      restore_dst_buf(xd, orig_dst);
-      return INT64_MAX;
+    if (!search_jnt_comp && ret_val != 0) {
+      restore_dst_buf(xd, orig_dst, num_planes);
+      return ret_val;
     }
+    restore_dst_buf(xd, orig_dst, num_planes);
   }
 
-#if CONFIG_INTERINTRA
-  rd_stats->rate += compmode_interintra_cost;
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  rate2_bmc_nocoeff += compmode_interintra_cost;
-#endif
-#endif
-#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
-  rd_stats->rate += compmode_interinter_cost;
-#endif
-
-  ret_val = motion_mode_rd(
-      cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, mode_mv,
-      mi_row, mi_col, args, ref_best_rd, refs, rate_mv,
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-      single_newmv, rate2_bmc_nocoeff, &best_bmc_mbmi, rate_mv_bmc,
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-      rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst);
+  // re-instate status of the best choice
+  if (is_comp_pred && best_ret_val != INT64_MAX) {
+    *rd_stats = best_rd_stats;
+    *rd_stats_y = best_rd_stats_y;
+    *rd_stats_uv = best_rd_stats_uv;
+    ret_val = best_ret_val;
+    *mbmi = best_mbmi;
+    assert(IMPLIES(mbmi->comp_group_idx == 1,
+                   mbmi->interinter_comp.type != COMPOUND_AVERAGE));
+    memcpy(x->blk_skip, best_blk_skip,
+           sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
+  }
+  if (early_terminate == INT64_MAX) return INT64_MAX;
   if (ret_val != 0) return ret_val;
-
-  return 0;  // The rate-distortion cost will be re-calculated by caller.
+  return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
 }
 
-#if CONFIG_INTRABC
 static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
                                        RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                        int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
-  if (!av1_allow_intrabc(bsize, cm)) return INT64_MAX;
+  if (!av1_allow_intrabc(cm)) return INT64_MAX;
+  const int num_planes = av1_num_planes(cm);
 
   MACROBLOCKD *const xd = &x->e_mbd;
   const TileInfo *tile = &xd->tile;
-  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *mbmi = xd->mi[0];
   const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE);
   const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE);
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
-  const int sb_row = mi_row / MAX_MIB_SIZE;
-  const int sb_col = mi_col / MAX_MIB_SIZE;
+  const int sb_row = mi_row >> cm->seq_params.mib_size_log2;
+  const int sb_col = mi_col >> cm->seq_params.mib_size_log2;
 
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
-  int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
-  av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
-                   mbmi_ext->ref_mv_stack[ref_frame],
-                   mbmi_ext->compound_mode_context, candidates, mi_row, mi_col,
-                   NULL, NULL, mbmi_ext->mode_context);
+  av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+                   mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+                   mi_col, mbmi_ext->mode_context);
 
   int_mv nearestmv, nearmv;
-  av1_find_best_ref_mvs(0, candidates, &nearestmv, &nearmv);
+  av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv,
+                                   0);
 
   int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
-  if (dv_ref.as_int == 0) av1_find_ref_dv(&dv_ref, mi_row, mi_col);
-  mbmi_ext->ref_mvs[INTRA_FRAME][0] = dv_ref;
+  if (dv_ref.as_int == 0)
+    av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row, mi_col);
+  // Ref DV should not have sub-pel.
+  assert((dv_ref.as_mv.col & 7) == 0);
+  assert((dv_ref.as_mv.row & 7) == 0);
+  mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref;
 
   struct buf_2d yv12_mb[MAX_MB_PLANE];
-  av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL);
-  for (int i = 0; i < MAX_MB_PLANE; ++i) {
+  av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL,
+                       num_planes);
+  for (int i = 0; i < num_planes; ++i) {
     xd->plane[i].pre[0] = yv12_mb[i];
   }
 
@@ -9853,11 +8843,11 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
     IBC_MOTION_DIRECTIONS
   };
 
-  MB_MODE_INFO *mbmi = &mi->mbmi;
   MB_MODE_INFO best_mbmi = *mbmi;
   RD_STATS best_rdcost = *rd_cost;
   int best_skip = x->skip;
 
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
   for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE;
        dir < IBC_MOTION_DIRECTIONS; ++dir) {
     const MvLimits tmp_mv_limits = x->mv_limits;
@@ -9866,16 +8856,18 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
         x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
         x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w;
         x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
-        x->mv_limits.row_max = (sb_row * MAX_MIB_SIZE - mi_row) * MI_SIZE - h;
+        x->mv_limits.row_max =
+            (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h;
         break;
       case IBC_MOTION_LEFT:
         x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
-        x->mv_limits.col_max = (sb_col * MAX_MIB_SIZE - mi_col) * MI_SIZE - w;
+        x->mv_limits.col_max =
+            (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w;
         // TODO(aconverse@google.com): Minimize the overlap between above and
         // left areas.
         x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
         int bottom_coded_mi_edge =
-            AOMMIN((sb_row + 1) * MAX_MIB_SIZE, tile->mi_row_end);
+            AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end);
         x->mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
         break;
       default: assert(0);
@@ -9898,66 +8890,67 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
     mvp_full.row >>= 3;
     int sadpb = x->sadperbit16;
     int cost_list[5];
-#if CONFIG_HASH_ME
     int bestsme = av1_full_pixel_search(
         cpi, x, bsize, &mvp_full, step_param, sadpb,
         cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
         (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1);
-#else
-    int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
-                                        sadpb, cond_cost_list(cpi, cost_list),
-                                        &dv_ref.as_mv, INT_MAX, 1);
-#endif
 
     x->mv_limits = tmp_mv_limits;
     if (bestsme == INT_MAX) continue;
     mvp_full = x->best_mv.as_mv;
-    MV dv = {.row = mvp_full.row * 8, .col = mvp_full.col * 8 };
+    MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 };
     if (mv_check_bounds(&x->mv_limits, &dv)) continue;
-    if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) continue;
+    if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
+                         cm->seq_params.mib_size_log2))
+      continue;
 
+    // DV should not have sub-pel.
+    assert((dv.col & 7) == 0);
+    assert((dv.row & 7) == 0);
     memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
     mbmi->use_intrabc = 1;
     mbmi->mode = DC_PRED;
     mbmi->uv_mode = UV_DC_PRED;
+    mbmi->motion_mode = SIMPLE_TRANSLATION;
     mbmi->mv[0].as_mv = dv;
     mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
     mbmi->skip = 0;
     x->skip = 0;
     av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
 
-    assert(x->mvcost == x->mv_cost_stack[0]);
+    int *dvcost[2] = { (int *)&cpi->dv_cost[0][MV_MAX],
+                       (int *)&cpi->dv_cost[1][MV_MAX] };
     // TODO(aconverse@google.com): The full motion field defining discount
     // in MV_COST_WEIGHT is too large. Explore other values.
-    int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost,
-                                  x->mvcost, MV_COST_WEIGHT_SUB);
+    int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost,
+                                  dvcost, MV_COST_WEIGHT_SUB);
     const int rate_mode = x->intrabc_cost[1];
     RD_STATS rd_stats, rd_stats_uv;
     av1_subtract_plane(x, bsize, 0);
-    super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
-    super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-    av1_merge_rd_stats(&rd_stats, &rd_stats_uv);
+    if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+      // Intrabc
+      select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
+    } else {
+      super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
+      memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+      memset(x->blk_skip, rd_stats.skip,
+             sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+    }
+    if (num_planes > 1) {
+      super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+      av1_merge_rd_stats(&rd_stats, &rd_stats_uv);
+    }
 #if CONFIG_RD_DEBUG
     mbmi->rd_stats = rd_stats;
 #endif
 
-#if CONFIG_VAR_TX
-    // TODO(aconverse@google.com): Evaluate allowing VAR TX on intrabc blocks
-    const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
-    const int height = block_size_high[bsize] >> tx_size_high_log2[0];
-    int idx, idy;
-    for (idy = 0; idy < height; ++idy)
-      for (idx = 0; idx < width; ++idx)
-        mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size;
-    mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
-#endif  // CONFIG_VAR_TX
-
-    const aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+    const int skip_ctx = av1_get_skip_context(xd);
 
     RD_STATS rdc_noskip;
     av1_init_rd_stats(&rdc_noskip);
     rdc_noskip.rate =
-        rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0);
+        rate_mode + rate_mv + rd_stats.rate + x->skip_cost[skip_ctx][0];
     rdc_noskip.dist = rd_stats.dist;
     rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist);
     if (rdc_noskip.rdcost < best_rd) {
@@ -9965,98 +8958,88 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
       best_mbmi = *mbmi;
       best_skip = x->skip;
       best_rdcost = rdc_noskip;
+      memcpy(best_blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
     }
 
-    x->skip = 1;
-    mbmi->skip = 1;
-    RD_STATS rdc_skip;
-    av1_init_rd_stats(&rdc_skip);
-    rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1);
-    rdc_skip.dist = rd_stats.sse;
-    rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist);
-    if (rdc_skip.rdcost < best_rd) {
-      best_rd = rdc_skip.rdcost;
-      best_mbmi = *mbmi;
-      best_skip = x->skip;
-      best_rdcost = rdc_skip;
+    if (!xd->lossless[mbmi->segment_id]) {
+      x->skip = 1;
+      mbmi->skip = 1;
+      RD_STATS rdc_skip;
+      av1_init_rd_stats(&rdc_skip);
+      rdc_skip.rate = rate_mode + rate_mv + x->skip_cost[skip_ctx][1];
+      rdc_skip.dist = rd_stats.sse;
+      rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist);
+      if (rdc_skip.rdcost < best_rd) {
+        best_rd = rdc_skip.rdcost;
+        best_mbmi = *mbmi;
+        best_skip = x->skip;
+        best_rdcost = rdc_skip;
+        memcpy(best_blk_skip, x->blk_skip,
+               sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+      }
     }
   }
   *mbmi = best_mbmi;
   *rd_cost = best_rdcost;
   x->skip = best_skip;
+  memcpy(x->blk_skip, best_blk_skip,
+         sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
   return best_rd;
 }
-#endif  // CONFIG_INTRABC
 
-void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
-                               RD_STATS *rd_cost, BLOCK_SIZE bsize,
+void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                               int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  struct macroblockd_plane *const pd = xd->plane;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int num_planes = av1_num_planes(cm);
   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
   int y_skip = 0, uv_skip = 0;
   int64_t dist_y = 0, dist_uv = 0;
   TX_SIZE max_uv_tx_size;
-  const int unify_bsize = CONFIG_CB4X4;
 
   ctx->skip = 0;
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
-#if CONFIG_INTRABC
   mbmi->use_intrabc = 0;
   mbmi->mv[0].as_int = 0;
-#endif  // CONFIG_INTRABC
-#if CONFIG_LGT_FROM_PRED
-  mbmi->use_lgt = 0;
-#endif
 
   const int64_t intra_yrd =
-      (bsize >= BLOCK_8X8 || unify_bsize)
-          ? rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
-                                   &y_skip, bsize, best_rd)
-          : rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                                         &dist_y, &y_skip, best_rd);
+      rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
+                             &y_skip, bsize, best_rd, ctx);
 
   if (intra_yrd < best_rd) {
-#if CONFIG_CFL
-#if CONFIG_CB4X4
     // Only store reconstructed luma when there's chroma RDO. When there's no
     // chroma RDO, the reconstructed luma will be stored in encode_superblock().
-    xd->cfl->store_y = !x->skip_chroma_rd;
-#else
-    xd->cfl->store_y = 1;
-#endif  // CONFIG_CB4X4
-    if (xd->cfl->store_y) {
-      // Perform one extra call to txfm_rd_in_plane(), with the values chosen
-      // during luma RDO, so we can store reconstructed luma values
-      RD_STATS this_rd_stats;
-      txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
-                       mbmi->sb_type, mbmi->tx_size,
-                       cpi->sf.use_fast_coef_costing);
-      xd->cfl->store_y = 0;
-    }
-#endif  // CONFIG_CFL
-    max_uv_tx_size = uv_txsize_lookup[bsize][mbmi->tx_size][pd[1].subsampling_x]
-                                     [pd[1].subsampling_y];
-    init_sbuv_mode(mbmi);
-#if CONFIG_CB4X4
-    if (!x->skip_chroma_rd)
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
-                              &uv_skip, bsize, max_uv_tx_size);
-#else
-    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
-                            &uv_skip, AOMMAX(BLOCK_8X8, bsize), max_uv_tx_size);
-#endif  // CONFIG_CB4X4
+    xd->cfl.is_chroma_reference = is_chroma_reference(
+        mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
+    xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+    if (xd->cfl.store_y) {
+      // Restore reconstructed luma values.
+      memcpy(x->blk_skip, ctx->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y,
+                                   cpi->optimize_seg_arr[mbmi->segment_id],
+                                   mi_row, mi_col);
+      xd->cfl.store_y = 0;
+    }
+    if (num_planes > 1) {
+      max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+      init_sbuv_mode(mbmi);
+      if (!x->skip_chroma_rd)
+        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
+                                &uv_skip, bsize, max_uv_tx_size);
+    }
 
     if (y_skip && (uv_skip || x->skip_chroma_rd)) {
       rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
-                      av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+                      x->skip_cost[av1_get_skip_context(xd)][1];
       rd_cost->dist = dist_y + dist_uv;
     } else {
       rd_cost->rate =
-          rate_y + rate_uv + av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+          rate_y + rate_uv + x->skip_cost[av1_get_skip_context(xd)][0];
       rd_cost->dist = dist_y + dist_uv;
     }
     rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
@@ -10064,125 +9047,47 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
     rd_cost->rate = INT_MAX;
   }
 
-#if CONFIG_INTRABC
   if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd)
     best_rd = rd_cost->rdcost;
   if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) {
-    ctx->skip = x->skip;  // FIXME where is the proper place to set this?!
+    ctx->skip = x->skip;
+    memcpy(ctx->blk_skip, x->blk_skip,
+           sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
     assert(rd_cost->rate != INT_MAX);
-    rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
   }
-#endif
   if (rd_cost->rate == INT_MAX) return;
 
   ctx->mic = *xd->mi[0];
   ctx->mbmi_ext = *x->mbmi_ext;
 }
 
-// Do we have an internal image edge (e.g. formatting bars).
-int av1_internal_image_edge(const AV1_COMP *cpi) {
-  return (cpi->oxcf.pass == 2) &&
-         ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
-          (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
-}
-
-// Checks to see if a super block is on a horizontal image edge.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
-  int top_edge = 0;
-  int bottom_edge = cpi->common.mi_rows;
-  int is_active_h_edge = 0;
-
-  // For two pass account for any formatting bars detected.
-  if (cpi->oxcf.pass == 2) {
-    const TWO_PASS *const twopass = &cpi->twopass;
-
-    // The inactive region is specified in MBs not mi units.
-    // The image edge is in the following MB row.
-    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
-
-    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
-    bottom_edge = AOMMAX(top_edge, bottom_edge);
-  }
-
-  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
-      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
-    is_active_h_edge = 1;
-  }
-  return is_active_h_edge;
-}
-
-// Checks to see if a super block is on a vertical image edge.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
-  int left_edge = 0;
-  int right_edge = cpi->common.mi_cols;
-  int is_active_v_edge = 0;
-
-  // For two pass account for any formatting bars detected.
-  if (cpi->oxcf.pass == 2) {
-    const TWO_PASS *const twopass = &cpi->twopass;
-
-    // The inactive region is specified in MBs not mi units.
-    // The image edge is in the following MB row.
-    left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
-
-    right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
-    right_edge = AOMMAX(left_edge, right_edge);
-  }
-
-  if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
-      ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
-    is_active_v_edge = 1;
-  }
-  return is_active_v_edge;
-}
-
-// Checks to see if a super block is at the edge of the active image.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-int av1_active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
-  return av1_active_h_edge(cpi, mi_row, cpi->common.mib_size) ||
-         av1_active_v_edge(cpi, mi_col, cpi->common.mib_size);
-}
-
 static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const BLOCK_SIZE bsize = mbmi->sb_type;
-  assert(bsize >= BLOCK_8X8);
   int src_stride = x->plane[1].src.stride;
   const uint8_t *const src_u = x->plane[1].src.buf;
   const uint8_t *const src_v = x->plane[2].src.buf;
-  float *const data = x->palette_buffer->kmeans_data_buf;
-  float centroids[2 * PALETTE_MAX_SIZE];
+  int *const data = x->palette_buffer->kmeans_data_buf;
+  int centroids[2 * PALETTE_MAX_SIZE];
   uint8_t *const color_map = xd->plane[1].color_index_map;
   int r, c;
-#if CONFIG_HIGHBITDEPTH
   const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
   const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
-#endif  // CONFIG_HIGHBITDEPTH
   int plane_block_width, plane_block_height, rows, cols;
   av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
                            &plane_block_height, &rows, &cols);
-  (void)cpi;
 
   for (r = 0; r < rows; ++r) {
     for (c = 0; c < cols; ++c) {
-#if CONFIG_HIGHBITDEPTH
       if (cpi->common.use_highbitdepth) {
         data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
         data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
       } else {
-#endif  // CONFIG_HIGHBITDEPTH
         data[(r * cols + c) * 2] = src_u[r * src_stride + c];
         data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
-#if CONFIG_HIGHBITDEPTH
       }
-#endif  // CONFIG_HIGHBITDEPTH
     }
   }
 
@@ -10198,451 +9103,361 @@ static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
                            plane_block_height);
 }
 
-#if CONFIG_FILTER_INTRA
-static void pick_filter_intra_interframe(
-    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
-    int mi_col, int *rate_uv_intra, int *rate_uv_tokenonly, int64_t *dist_uv,
-    int *skip_uv, UV_PREDICTION_MODE *mode_uv,
-    FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv,
-#if CONFIG_EXT_INTRA
-    int8_t *uv_angle_delta,
-#endif  // CONFIG_EXT_INTRA
-    PALETTE_MODE_INFO *pmi_uv, int palette_ctx, int skip_mask,
-    unsigned int *ref_costs_single, int64_t *best_rd, int64_t *best_intra_rd,
-    PREDICTION_MODE *best_intra_mode, int *best_mode_index, int *best_skip2,
-    int *best_mode_skippable,
-#if CONFIG_SUPERTX
-    int *returnrate_nocoef,
-#endif  // CONFIG_SUPERTX
-    int64_t *best_pred_rd, MB_MODE_INFO *best_mbmode, RD_STATS *rd_cost) {
+static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
+                                      const MACROBLOCKD *xd, int mi_row,
+                                      int mi_col, const uint8_t *above,
+                                      int above_stride, const uint8_t *left,
+                                      int left_stride);
+
+static const int ref_frame_flag_list[REF_FRAMES] = { 0,
+                                                     AOM_LAST_FLAG,
+                                                     AOM_LAST2_FLAG,
+                                                     AOM_LAST3_FLAG,
+                                                     AOM_GOLD_FLAG,
+                                                     AOM_BWD_FLAG,
+                                                     AOM_ALT2_FLAG,
+                                                     AOM_ALT_FLAG };
+
+static void rd_pick_skip_mode(RD_STATS *rd_cost,
+                              InterModeSearchState *search_state,
+                              const AV1_COMP *const cpi, MACROBLOCK *const x,
+                              BLOCK_SIZE bsize, int mi_row, int mi_col,
+                              struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const int try_palette =
-      av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
-  int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i;
-  int dc_mode_index;
-  const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
-  int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd;
-  int64_t distortion_uv, model_rd = INT64_MAX;
-  TX_SIZE uv_tx;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
 
-  for (i = 0; i < MAX_MODES; ++i)
-    if (av1_mode_order[i].mode == DC_PRED &&
-        av1_mode_order[i].ref_frame[0] == INTRA_FRAME)
-      break;
-  dc_mode_index = i;
-  assert(i < MAX_MODES);
+  x->compound_idx = 1;  // COMPOUND_AVERAGE
+  RD_STATS skip_mode_rd_stats;
+  av1_invalid_rd_stats(&skip_mode_rd_stats);
 
-  // TODO(huisu): use skip_mask for further speedup.
-  (void)skip_mask;
-  mbmi->mode = DC_PRED;
+  if (cm->ref_frame_idx_0 == INVALID_IDX ||
+      cm->ref_frame_idx_1 == INVALID_IDX) {
+    return;
+  }
+
+  const MV_REFERENCE_FRAME ref_frame = LAST_FRAME + cm->ref_frame_idx_0;
+  const MV_REFERENCE_FRAME second_ref_frame = LAST_FRAME + cm->ref_frame_idx_1;
+  const PREDICTION_MODE this_mode = NEAREST_NEARESTMV;
+  const int mode_index =
+      get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame);
+
+  if (mode_index == -1) {
+    return;
+  }
+
+  mbmi->mode = this_mode;
   mbmi->uv_mode = UV_DC_PRED;
-  mbmi->ref_frame[0] = INTRA_FRAME;
-  mbmi->ref_frame[1] = NONE_FRAME;
-  if (!rd_pick_filter_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
-                                &skippable, bsize, intra_mode_cost[mbmi->mode],
-                                &this_rd, &model_rd, 0)) {
+  mbmi->ref_frame[0] = ref_frame;
+  mbmi->ref_frame[1] = second_ref_frame;
+
+  assert(this_mode == NEAREST_NEARESTMV);
+  if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) {
     return;
   }
-  if (rate_y == INT_MAX) return;
-
-  uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
-                          [xd->plane[1].subsampling_y];
-  if (rate_uv_intra[uv_tx] == INT_MAX) {
-    choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx],
-                         &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
-                         &skip_uv[uv_tx], &mode_uv[uv_tx]);
-    if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi;
-    filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
-#if CONFIG_EXT_INTRA
-    uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
-#endif  // CONFIG_EXT_INTRA
-  }
-
-  rate_uv = rate_uv_tokenonly[uv_tx];
-  distortion_uv = dist_uv[uv_tx];
-  skippable = skippable && skip_uv[uv_tx];
-  mbmi->uv_mode = mode_uv[uv_tx];
-  if (cm->allow_screen_content_tools) {
-    pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
-    memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
-           pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
-           2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
-  }
-#if CONFIG_EXT_INTRA
-  mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
-#endif  // CONFIG_EXT_INTRA
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
-      filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
-  if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
-    mbmi->filter_intra_mode_info.filter_intra_mode[1] =
-        filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
-  }
-
-  rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
-          x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
-  if (try_palette && mbmi->mode == DC_PRED)
-    rate2 += av1_cost_bit(
-        av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
 
-  if (!xd->lossless[mbmi->segment_id]) {
-    // super_block_yrd above includes the cost of the tx_size in the
-    // tokenonly rate, but for intra blocks, tx_size is always coded
-    // (prediction granularity), so we account for it in the full rate,
-    // not the tokenonly rate.
-    rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
-  }
-
-  rate2 += av1_cost_bit(cm->fc->filter_intra_probs[0],
-                        mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
-  rate2 += write_uniform_cost(
-      FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]);
-#if CONFIG_EXT_INTRA
-  if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) &&
-      av1_use_angle_delta(bsize)) {
-    rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
-                                MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
-  }
-#endif  // CONFIG_EXT_INTRA
-  if (mbmi->mode == DC_PRED) {
-    rate2 +=
-        av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
-                     mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
-    if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
-      rate2 +=
-          write_uniform_cost(FILTER_INTRA_MODES,
-                             mbmi->filter_intra_mode_info.filter_intra_mode[1]);
-  }
-  distortion2 = distortion_y + distortion_uv;
-  av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, 0, 0, mi_row,
-                               mi_col);
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+  mbmi->comp_group_idx = 0;
+  mbmi->compound_idx = x->compound_idx;
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->ref_mv_idx = 0;
+  mbmi->skip_mode = mbmi->skip = 1;
 
-  rate2 += ref_costs_single[INTRA_FRAME];
+  set_default_interp_filters(mbmi, cm->interp_filter);
 
-  if (skippable) {
-    rate2 -= (rate_y + rate_uv);
-    rate_y = 0;
-    rate_uv = 0;
-    rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-  } else {
-    rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  for (int i = 0; i < num_planes; i++) {
+    xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+    xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
   }
-  this_rd = RDCOST(x->rdmult, rate2, distortion2);
 
-  if (this_rd < *best_intra_rd) {
-    *best_intra_rd = this_rd;
-    *best_intra_mode = mbmi->mode;
+  BUFFER_SET orig_dst;
+  for (int i = 0; i < num_planes; i++) {
+    orig_dst.plane[i] = xd->plane[i].dst.buf;
+    orig_dst.stride[i] = xd->plane[i].dst.stride;
   }
-  for (i = 0; i < REFERENCE_MODES; ++i)
-    best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
 
-  if (this_rd < *best_rd) {
-    *best_mode_index = dc_mode_index;
-    mbmi->mv[0].as_int = 0;
-    rd_cost->rate = rate2;
-#if CONFIG_SUPERTX
-    if (x->skip)
-      *returnrate_nocoef = rate2;
-    else
-      *returnrate_nocoef = rate2 - rate_y - rate_uv;
-    *returnrate_nocoef -= av1_cost_bit(av1_get_skip_prob(cm, xd), skippable);
-    *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
-                                       mbmi->ref_frame[0] != INTRA_FRAME);
-#endif  // CONFIG_SUPERTX
-    rd_cost->dist = distortion2;
-    rd_cost->rdcost = this_rd;
-    *best_rd = this_rd;
-    *best_mbmode = *mbmi;
-    *best_skip2 = 0;
-    *best_mode_skippable = skippable;
+  // Obtain the rdcost for skip_mode.
+  skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, mi_row, mi_col, &orig_dst);
+
+  // Compare the use of skip_mode with the best intra/inter mode obtained.
+  const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+  const int64_t best_intra_inter_mode_cost =
+      (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX)
+          ? RDCOST(x->rdmult,
+                   rd_cost->rate + x->skip_mode_cost[skip_mode_ctx][0],
+                   rd_cost->dist)
+          : INT64_MAX;
+
+  if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost) {
+    assert(mode_index != -1);
+    search_state->best_mbmode.skip_mode = 1;
+    search_state->best_mbmode = *mbmi;
+
+    search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip = 1;
+    search_state->best_mbmode.mode = NEAREST_NEARESTMV;
+    search_state->best_mbmode.ref_frame[0] = mbmi->ref_frame[0];
+    search_state->best_mbmode.ref_frame[1] = mbmi->ref_frame[1];
+    search_state->best_mbmode.mv[0].as_int = mbmi->mv[0].as_int;
+    search_state->best_mbmode.mv[1].as_int = mbmi->mv[1].as_int;
+    search_state->best_mbmode.ref_mv_idx = 0;
+
+    // Set up tx_size related variables for skip-specific loop filtering.
+    search_state->best_mbmode.tx_size =
+        block_signals_txsize(bsize) ? tx_size_from_tx_mode(bsize, cm->tx_mode)
+                                    : max_txsize_rect_lookup[bsize];
+    memset(search_state->best_mbmode.inter_tx_size,
+           search_state->best_mbmode.tx_size,
+           sizeof(search_state->best_mbmode.inter_tx_size));
+    set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n8_w, xd->n8_h,
+                  search_state->best_mbmode.skip && is_inter_block(mbmi), xd);
+
+    // Set up color-related variables for skip mode.
+    search_state->best_mbmode.uv_mode = UV_DC_PRED;
+    search_state->best_mbmode.palette_mode_info.palette_size[0] = 0;
+    search_state->best_mbmode.palette_mode_info.palette_size[1] = 0;
+
+    search_state->best_mbmode.comp_group_idx = 0;
+    search_state->best_mbmode.compound_idx = x->compound_idx;
+    search_state->best_mbmode.interinter_comp.type = COMPOUND_AVERAGE;
+    search_state->best_mbmode.motion_mode = SIMPLE_TRANSLATION;
+
+    search_state->best_mbmode.interintra_mode =
+        (INTERINTRA_MODE)(II_DC_PRED - 1);
+    search_state->best_mbmode.filter_intra_mode_info.use_filter_intra = 0;
+
+    set_default_interp_filters(&search_state->best_mbmode, cm->interp_filter);
+
+    search_state->best_mode_index = mode_index;
+
+    // Update rd_cost
+    rd_cost->rate = skip_mode_rd_stats.rate;
+    rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist;
+    rd_cost->rdcost = skip_mode_rd_stats.rdcost;
+
+    search_state->best_rd = rd_cost->rdcost;
+    search_state->best_skip2 = 1;
+    search_state->best_mode_skippable = (skip_mode_rd_stats.sse == 0);
+
+    x->skip = 1;
   }
 }
-#endif  // CONFIG_FILTER_INTRA
-
-#if CONFIG_MOTION_VAR
-static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
-                                      const MACROBLOCKD *xd, int mi_row,
-                                      int mi_col, const uint8_t *above,
-                                      int above_stride, const uint8_t *left,
-                                      int left_stride);
-#endif  // CONFIG_MOTION_VAR
 
-void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
-                               MACROBLOCK *x, int mi_row, int mi_col,
-                               RD_STATS *rd_cost,
-#if CONFIG_SUPERTX
-                               int *returnrate_nocoef,
-#endif  // CONFIG_SUPERTX
-                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                               int64_t best_rd_so_far) {
+// speed feature: fast intra/inter transform type search
+// Used for speed >= 2
+// When this speed feature is on, in rd mode search, only DCT is used.
+// After the mode is determined, this function is called, to select
+// transform types and get accurate rdcost.
+static void sf_refine_fast_tx_type_search(
+    const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
+    RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+    int best_mode_index, MB_MODE_INFO *best_mbmode,
+    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int best_rate_y,
+    int best_rate_uv, int *best_skip2) {
   const AV1_COMMON *const cm = &cpi->common;
-  const RD_OPT *const rd_opt = &cpi->rd;
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const int try_palette =
-      av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int num_planes = av1_num_planes(cm);
+
+  if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
+      ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
+        is_inter_mode(best_mbmode->mode)) ||
+       (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
+        !is_inter_mode(best_mbmode->mode)))) {
+    int skip_blk = 0;
+    RD_STATS rd_stats_y, rd_stats_uv;
+
+    x->use_default_inter_tx_type = 0;
+    x->use_default_intra_tx_type = 0;
+
+    *mbmi = *best_mbmode;
+
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+    // Select prediction reference frames.
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+      if (has_second_ref(mbmi))
+        xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+    }
+
+    if (is_inter_mode(mbmi->mode)) {
+      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+      if (mbmi->motion_mode == OBMC_CAUSAL)
+        av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+      av1_subtract_plane(x, bsize, 0);
+      if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+        // av1_rd_pick_inter_mode_sb
+        select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col,
+                           INT64_MAX);
+        assert(rd_stats_y.rate != INT_MAX);
+      } else {
+        super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+        memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+        memset(x->blk_skip, rd_stats_y.skip,
+               sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+      }
+      if (num_planes > 1) {
+        inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, FTXS_NONE);
+      } else {
+        av1_init_rd_stats(&rd_stats_uv);
+      }
+    } else {
+      super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+      if (num_planes > 1) {
+        super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+      } else {
+        av1_init_rd_stats(&rd_stats_uv);
+      }
+    }
+
+    if (RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate,
+               (rd_stats_y.dist + rd_stats_uv.dist)) >
+        RDCOST(x->rdmult, 0, (rd_stats_y.sse + rd_stats_uv.sse))) {
+      skip_blk = 1;
+      rd_stats_y.rate = x->skip_cost[av1_get_skip_context(xd)][1];
+      rd_stats_uv.rate = 0;
+      rd_stats_y.dist = rd_stats_y.sse;
+      rd_stats_uv.dist = rd_stats_uv.sse;
+    } else {
+      skip_blk = 0;
+      rd_stats_y.rate += x->skip_cost[av1_get_skip_context(xd)][0];
+    }
+
+    if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) >
+        RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate,
+               (rd_stats_y.dist + rd_stats_uv.dist))) {
+      best_mbmode->tx_size = mbmi->tx_size;
+      av1_copy(best_mbmode->inter_tx_size, mbmi->inter_tx_size);
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      av1_copy(best_mbmode->txk_type, mbmi->txk_type);
+      rd_cost->rate +=
+          (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv);
+      rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
+      rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+      *best_skip2 = skip_blk;
+    }
+  }
+}
+
+// Please add/modify parameter setting in this function, making it consistent
+// and easy to read and maintain.
+static void set_params_rd_pick_inter_mode(
+    const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+    BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2],
+    uint32_t mode_skip_mask[REF_FRAMES],
+    unsigned int ref_costs_single[REF_FRAMES],
+    unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
+    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const struct segmentation *const seg = &cm->seg;
-  PREDICTION_MODE this_mode;
-  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  const SPEED_FEATURES *const sf = &cpi->sf;
   unsigned char segment_id = mbmi->segment_id;
-  int comp_pred, i, k;
-  int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
-#if CONFIG_COMPOUND_SINGLEREF
-  int_mv frame_comp_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE];
-  int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
-  int single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
-  int64_t modelled_rd[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
-  static const int flag_list[TOTAL_REFS_PER_FRAME] = {
-    0,
-    AOM_LAST_FLAG,
-#if CONFIG_EXT_REFS
-    AOM_LAST2_FLAG,
-    AOM_LAST3_FLAG,
-#endif  // CONFIG_EXT_REFS
-    AOM_GOLD_FLAG,
-#if CONFIG_EXT_REFS
-    AOM_BWD_FLAG,
-    AOM_ALT2_FLAG,
-#endif  // CONFIG_EXT_REFS
-    AOM_ALT_FLAG
-  };
-  int64_t best_rd = best_rd_so_far;
-  int best_rate_y = INT_MAX, best_rate_uv = INT_MAX;
-  int64_t best_pred_diff[REFERENCE_MODES];
-  int64_t best_pred_rd[REFERENCE_MODES];
-  MB_MODE_INFO best_mbmode;
-  int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
-  int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-  int best_mode_skippable = 0;
-  int midx, best_mode_index = -1;
-  unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
-#if CONFIG_EXT_COMP_REFS
-  unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME];
-#else
-  unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_EXT_COMP_REFS
-  aom_prob comp_mode_p;
-  int64_t best_intra_rd = INT64_MAX;
-  unsigned int best_pred_sse = UINT_MAX;
-  PREDICTION_MODE best_intra_mode = DC_PRED;
-  int rate_uv_intra[TX_SIZES_ALL], rate_uv_tokenonly[TX_SIZES_ALL];
-  int64_t dist_uvs[TX_SIZES_ALL];
-  int skip_uvs[TX_SIZES_ALL];
-  UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL];
-  PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL];
-#if CONFIG_EXT_INTRA
-  int8_t uv_angle_delta[TX_SIZES_ALL];
-  int is_directional_mode, angle_stats_ready = 0;
-  uint8_t directional_mode_skip_mask[INTRA_MODES];
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-  int8_t dc_skipped = 1;
-  FILTER_INTRA_MODE_INFO filter_intra_mode_info_uv[TX_SIZES_ALL];
-#endif  // CONFIG_FILTER_INTRA
-  const int intra_cost_penalty = av1_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
-  const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
-  int best_skip2 = 0;
-  uint16_t ref_frame_skip_mask[2] = { 0 };
-  uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 };
-#if CONFIG_INTERINTRA
-  MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME;
-  int64_t best_single_inter_rd = INT64_MAX;
-#endif  // CONFIG_INTERINTRA
-  int mode_skip_start = sf->mode_skip_start + 1;
-  const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
-  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
-  int64_t mode_threshold[MAX_MODES];
-  int *mode_map = tile_data->mode_map[bsize];
-  const int mode_search_skip_flags = sf->mode_search_skip_flags;
-#if CONFIG_PVQ
-  od_rollback_buffer pre_buf;
-#endif  // CONFIG_PVQ
-
-  HandleInterModeArgs args = {
-#if CONFIG_MOTION_VAR
-    { NULL },
-    { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
-    { NULL },
-    { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
-#endif  // CONFIG_MOTION_VAR
-    NULL,
-    NULL,
-    NULL,
-    { { 0 } },
-  };
-
-  const int rows = block_size_high[bsize];
-  const int cols = block_size_wide[bsize];
-  int palette_ctx = 0;
-  const MODE_INFO *above_mi = xd->above_mi;
-  const MODE_INFO *left_mi = xd->left_mi;
-#if CONFIG_MOTION_VAR
   int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+                                   MAX_SB_SIZE >> 1 };
+  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+                                    MAX_SB_SIZE >> 1 };
   int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-#if CONFIG_HIGHBITDEPTH
+  for (int i = 0; i < MB_MODE_COUNT; ++i)
+    for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE;
+
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     int len = sizeof(uint16_t);
-    args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
-    args.above_pred_buf[1] =
+    args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
+    args->above_pred_buf[1] =
+        CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+    args->above_pred_buf[2] =
         CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
-    args.above_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(x->above_pred_buf + 2 * MAX_SB_SQUARE * len);
-    args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
-    args.left_pred_buf[1] =
+    args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
+    args->left_pred_buf[1] =
+        CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+    args->left_pred_buf[2] =
         CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
-    args.left_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(x->left_pred_buf + 2 * MAX_SB_SQUARE * len);
   } else {
-#endif  // CONFIG_HIGHBITDEPTH
-    args.above_pred_buf[0] = x->above_pred_buf;
-    args.above_pred_buf[1] = x->above_pred_buf + MAX_SB_SQUARE;
-    args.above_pred_buf[2] = x->above_pred_buf + 2 * MAX_SB_SQUARE;
-    args.left_pred_buf[0] = x->left_pred_buf;
-    args.left_pred_buf[1] = x->left_pred_buf + MAX_SB_SQUARE;
-    args.left_pred_buf[2] = x->left_pred_buf + 2 * MAX_SB_SQUARE;
-#if CONFIG_HIGHBITDEPTH
+    args->above_pred_buf[0] = x->above_pred_buf;
+    args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1);
+    args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE;
+    args->left_pred_buf[0] = x->left_pred_buf;
+    args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1);
+    args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE;
   }
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_MOTION_VAR
-
-  av1_zero(best_mbmode);
 
-  av1_zero(pmi_uv);
-  if (try_palette) {
-    if (above_mi)
-      palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-    if (left_mi)
-      palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-  }
-
-  estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
-                           &comp_mode_p);
-
-  for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < TX_SIZES_ALL; i++) rate_uv_intra[i] = INT_MAX;
-  for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
-  for (i = 0; i < MB_MODE_COUNT; ++i) {
-    for (k = 0; k < TOTAL_REFS_PER_FRAME; ++k) {
-      args.single_filter[i][k] = SWITCHABLE;
-    }
-  }
+  av1_collect_neighbors_ref_counts(xd);
 
-  rd_cost->rate = INT_MAX;
-#if CONFIG_SUPERTX
-  *returnrate_nocoef = INT_MAX;
-#endif  // CONFIG_SUPERTX
+  estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+                           ref_costs_comp);
 
+  MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     x->mbmi_ext->mode_context[ref_frame] = 0;
     x->mbmi_ext->compound_mode_context[ref_frame] = 0;
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+    if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) {
       assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
-      setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
-                         frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
-    }
-    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
-#if CONFIG_GLOBAL_MOTION
-    frame_mv[ZEROMV][ref_frame].as_int =
-        gm_get_motion_vector(&cm->global_motion[ref_frame],
-                             cm->allow_high_precision_mv, bsize, mi_col, mi_row,
-                             0
-#if CONFIG_AMVR
-                             ,
-                             cm->cur_frame_mv_precision_level
-#endif
-                             )
-            .as_int;
-#else   // CONFIG_GLOBAL_MOTION
-    frame_mv[ZEROMV][ref_frame].as_int = 0;
-#endif  // CONFIG_GLOBAL_MOTION
-    frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV;
-#if CONFIG_COMPOUND_SINGLEREF
-    frame_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV;
-    frame_comp_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV;
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_GLOBAL_MOTION
-    frame_mv[ZERO_ZEROMV][ref_frame].as_int =
-        gm_get_motion_vector(&cm->global_motion[ref_frame],
-                             cm->allow_high_precision_mv, bsize, mi_col, mi_row,
-                             0
-#if CONFIG_AMVR
-                             ,
-                             cm->cur_frame_mv_precision_level
-#endif
-                             )
-            .as_int;
-#else   // CONFIG_GLOBAL_MOTION
-    frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0;
-#endif  // CONFIG_GLOBAL_MOTION
+      setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+                                 yv12_mb);
+    }
   }
 
+  // TODO(zoeliu@google.com): To further optimize the obtaining of motion vector
+  // references for compound prediction, as not every pair of reference frames
+  // woud be examined for the RD evaluation.
   for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
-    MODE_INFO *const mi = xd->mi[0];
-    int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
     x->mbmi_ext->mode_context[ref_frame] = 0;
-    av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
-                     mbmi_ext->ref_mv_stack[ref_frame],
-                     mbmi_ext->compound_mode_context, candidates, mi_row,
-                     mi_col, NULL, NULL, mbmi_ext->mode_context);
-    if (mbmi_ext->ref_mv_count[ref_frame] < 2) {
-      MV_REFERENCE_FRAME rf[2];
-      av1_set_ref_frame(rf, ref_frame);
-      if (mbmi_ext->ref_mvs[rf[0]][0].as_int !=
-              frame_mv[ZEROMV][rf[0]].as_int ||
-          mbmi_ext->ref_mvs[rf[0]][1].as_int !=
-              frame_mv[ZEROMV][rf[0]].as_int ||
-          mbmi_ext->ref_mvs[rf[1]][0].as_int !=
-              frame_mv[ZEROMV][rf[1]].as_int ||
-          mbmi_ext->ref_mvs[rf[1]][1].as_int != frame_mv[ZEROMV][rf[1]].as_int)
-        mbmi_ext->mode_context[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET);
-    }
-  }
-
-#if CONFIG_MOTION_VAR
+    av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+                     mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+                     mi_col, mbmi_ext->mode_context);
+  }
+
   av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
 
   if (check_num_overlappable_neighbors(mbmi) &&
       is_motion_variation_allowed_bsize(bsize)) {
     av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
-                                        args.above_pred_buf, dst_width1,
-                                        dst_height1, args.above_pred_stride);
+                                        args->above_pred_buf, dst_width1,
+                                        dst_height1, args->above_pred_stride);
     av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
-                                       args.left_pred_buf, dst_width2,
-                                       dst_height2, args.left_pred_stride);
+                                       args->left_pred_buf, dst_width2,
+                                       dst_height2, args->left_pred_stride);
     av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
-                         mi_col);
-    calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0],
-                              args.above_pred_stride[0], args.left_pred_buf[0],
-                              args.left_pred_stride[0]);
+                         mi_col, 0, num_planes);
+    calc_target_weighted_pred(
+        cm, x, xd, mi_row, mi_col, args->above_pred_buf[0],
+        args->above_pred_stride[0], args->left_pred_buf[0],
+        args->left_pred_stride[0]);
   }
-#endif  // CONFIG_MOTION_VAR
 
+  int min_pred_mv_sad = INT_MAX;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+    min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+
+  for (int i = 0; i < 2; ++i) {
+    ref_frame_skip_mask[i] = 0;
+  }
+  memset(mode_skip_mask, 0, REF_FRAMES * sizeof(*mode_skip_mask));
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
+    if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])) {
       // Skip checking missing references in both single and compound reference
       // modes. Note that a mode will be skipped iff both reference frames
       // are masked out.
       ref_frame_skip_mask[0] |= (1 << ref_frame);
       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
     } else {
-      for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-        // Skip fixed mv modes for poor references
-        if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
-          mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
-          break;
-        }
+      // Skip fixed mv modes for poor references
+      if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) {
+        mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
       }
     }
     // If the segment reference frame feature is enabled....
@@ -10658,55 +9473,34 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
   // segment level feature is enabled for this segment. This is to
   // prevent the possibility that we end up unable to pick any mode.
   if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
-    // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+    // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame,
     // unless ARNR filtering is enabled in which case we want
     // an unfiltered alternative. We allow near/nearest as well
     // because they may result in zero-zero MVs but be cheaper.
     if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-      int_mv zeromv;
-      ref_frame_skip_mask[0] = (1 << LAST_FRAME) |
-#if CONFIG_EXT_REFS
-                               (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
-                               (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) |
-#endif  // CONFIG_EXT_REFS
-                               (1 << GOLDEN_FRAME);
+      ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << LAST2_FRAME) |
+                               (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) |
+                               (1 << ALTREF2_FRAME) | (1 << GOLDEN_FRAME);
       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
       // TODO(zoeliu): To further explore whether following needs to be done for
       //               BWDREF_FRAME as well.
       mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
-#if CONFIG_GLOBAL_MOTION
-      zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ALTREF_FRAME],
-                                           cm->allow_high_precision_mv, bsize,
-                                           mi_col, mi_row, 0
-#if CONFIG_AMVR
-                                           ,
-                                           cm->cur_frame_mv_precision_level
-#endif
-                                           )
-                          .as_int;
-#else
-      zeromv.as_int = 0;
-#endif  // CONFIG_GLOBAL_MOTION
-      if (frame_mv[NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
+      const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
+      int_mv near_mv, nearest_mv, global_mv;
+      get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+      get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+      get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+
+      if (near_mv.as_int != global_mv.as_int)
         mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
-      if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
+      if (nearest_mv.as_int != global_mv.as_int)
         mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
-      if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
-        mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV);
-      if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
-        mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV);
-#if CONFIG_COMPOUND_SINGLEREF
-      if (frame_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int ||
-          frame_comp_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int !=
-              zeromv.as_int)
-        mode_skip_mask[ALTREF_FRAME] |= (1 << SR_NEAREST_NEARMV);
-#endif  // CONFIG_COMPOUND_SINGLEREF
     }
   }
 
   if (cpi->rc.is_src_frame_alt_ref) {
     if (sf->alt_ref_search_fp) {
-      assert(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]);
+      assert(cpi->ref_frame_flags & ref_frame_flag_list[ALTREF_FRAME]);
       mode_skip_mask[ALTREF_FRAME] = 0;
       ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
@@ -10733,24 +9527,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
   mode_skip_mask[INTRA_FRAME] |=
       ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
 
-  for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
-  for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
-    mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
-
-  midx = sf->schedule_mode_search ? mode_skip_start : 0;
-  while (midx > 4) {
-    uint8_t end_pos = 0;
-    for (i = 5; i < midx; ++i) {
-      if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) {
-        uint8_t tmp = mode_map[i];
-        mode_map[i] = mode_map[i - 1];
-        mode_map[i - 1] = tmp;
-        end_pos = i;
-      }
-    }
-    midx = end_pos;
-  }
-
   if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
     x->use_default_intra_tx_type = 1;
   else
@@ -10760,528 +9536,705 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     x->use_default_inter_tx_type = 1;
   else
     x->use_default_inter_tx_type = 0;
-#if CONFIG_PVQ
-  od_encode_checkpoint(&x->daala_enc, &pre_buf);
-#endif  // CONFIG_PVQ
-  for (i = 0; i < MB_MODE_COUNT; ++i)
-    for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame)
-      modelled_rd[i][ref_frame] = INT64_MAX;
-
-  for (midx = 0; midx < MAX_MODES; ++midx) {
-    int mode_index;
-    int mode_excluded = 0;
+  if (cpi->sf.skip_repeat_interpolation_filter_search) {
+    x->interp_filter_stats_idx[0] = 0;
+    x->interp_filter_stats_idx[1] = 0;
+  }
+}
+
+static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
+                                RD_STATS *rd_cost, PICK_MODE_CONTEXT *ctx,
+                                BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
+                                PALETTE_MODE_INFO *const pmi,
+                                unsigned int *ref_costs_single,
+                                InterModeSearchState *search_state) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int rate2 = 0;
+  int64_t distortion2 = 0, best_rd_palette = search_state->best_rd, this_rd,
+          best_model_rd_palette = INT64_MAX;
+  int skippable = 0, rate_overhead_palette = 0;
+  RD_STATS rd_stats_y;
+  TX_SIZE uv_tx = TX_4X4;
+  uint8_t *const best_palette_color_map =
+      x->palette_buffer->best_palette_color_map;
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  MB_MODE_INFO best_mbmi_palette = *mbmi;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
+  const int rows = block_size_high[bsize];
+  const int cols = block_size_wide[bsize];
+
+  mbmi->mode = DC_PRED;
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  rate_overhead_palette = rd_pick_palette_intra_sby(
+      cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette,
+      best_palette_color_map, &best_rd_palette, &best_model_rd_palette, NULL,
+      NULL, NULL, NULL, ctx, best_blk_skip);
+  if (pmi->palette_size[0] == 0) return;
+
+  memcpy(x->blk_skip, best_blk_skip,
+         sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+
+  memcpy(color_map, best_palette_color_map,
+         rows * cols * sizeof(best_palette_color_map[0]));
+  super_block_yrd(cpi, x, &rd_stats_y, bsize, search_state->best_rd);
+  if (rd_stats_y.rate == INT_MAX) return;
+
+  skippable = rd_stats_y.skip;
+  distortion2 = rd_stats_y.dist;
+  rate2 = rd_stats_y.rate + rate_overhead_palette;
+  rate2 += ref_costs_single[INTRA_FRAME];
+  if (num_planes > 1) {
+    uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+    if (search_state->rate_uv_intra[uv_tx] == INT_MAX) {
+      choose_intra_uv_mode(
+          cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx],
+          &search_state->rate_uv_tokenonly[uv_tx],
+          &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx],
+          &search_state->mode_uv[uv_tx]);
+      search_state->pmi_uv[uv_tx] = *pmi;
+      search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV];
+    }
+    mbmi->uv_mode = search_state->mode_uv[uv_tx];
+    pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1];
+    if (pmi->palette_size[1] > 0) {
+      memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+             search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+             2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+    }
+    mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx];
+    skippable = skippable && search_state->skip_uvs[uv_tx];
+    distortion2 += search_state->dist_uvs[uv_tx];
+    rate2 += search_state->rate_uv_intra[uv_tx];
+  }
+
+  if (skippable) {
+    rate2 -= rd_stats_y.rate;
+    if (num_planes > 1) rate2 -= search_state->rate_uv_tokenonly[uv_tx];
+    rate2 += x->skip_cost[av1_get_skip_context(xd)][1];
+  } else {
+    rate2 += x->skip_cost[av1_get_skip_context(xd)][0];
+  }
+  this_rd = RDCOST(x->rdmult, rate2, distortion2);
+  if (this_rd < search_state->best_rd) {
+    search_state->best_mode_index = 3;
+    mbmi->mv[0].as_int = 0;
+    rd_cost->rate = rate2;
+    rd_cost->dist = distortion2;
+    rd_cost->rdcost = this_rd;
+    search_state->best_rd = this_rd;
+    search_state->best_mbmode = *mbmi;
+    search_state->best_skip2 = 0;
+    search_state->best_mode_skippable = skippable;
+    memcpy(ctx->blk_skip, x->blk_skip,
+           sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+  }
+}
+
+static void init_inter_mode_search_state(InterModeSearchState *search_state,
+                                         const AV1_COMP *cpi,
+                                         const TileDataEnc *tile_data,
+                                         const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                         int64_t best_rd_so_far) {
+  search_state->best_rd = best_rd_so_far;
+
+  av1_zero(search_state->best_mbmode);
+
+  search_state->best_rate_y = INT_MAX;
+
+  search_state->best_rate_uv = INT_MAX;
+
+  search_state->best_mode_skippable = 0;
+
+  search_state->best_skip2 = 0;
+
+  search_state->best_mode_index = -1;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const unsigned char segment_id = mbmi->segment_id;
+
+  search_state->skip_intra_modes = 0;
+
+  search_state->num_available_refs = 0;
+  memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs));
+  memset(search_state->dist_order_refs, -1,
+         sizeof(search_state->dist_order_refs));
+
+  for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i)
+    search_state->mode_threshold[i] = 0;
+  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+  for (int i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
+    search_state->mode_threshold[i] =
+        ((int64_t)rd_threshes[i] * tile_data->thresh_freq_fact[bsize][i]) >> 5;
+
+  search_state->best_intra_mode = DC_PRED;
+  search_state->best_intra_rd = INT64_MAX;
+
+  search_state->angle_stats_ready = 0;
+
+  search_state->best_pred_sse = UINT_MAX;
+
+  for (int i = 0; i < TX_SIZES_ALL; i++)
+    search_state->rate_uv_intra[i] = INT_MAX;
+
+  av1_zero(search_state->pmi_uv);
+
+  for (int i = 0; i < REFERENCE_MODES; ++i)
+    search_state->best_pred_rd[i] = INT64_MAX;
+
+  av1_zero(search_state->single_newmv);
+  av1_zero(search_state->single_newmv_rate);
+  av1_zero(search_state->single_newmv_valid);
+  for (int i = 0; i < MB_MODE_COUNT; ++i)
+    for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
+      search_state->modelled_rd[i][ref_frame] = INT64_MAX;
+}
+
+static int inter_mode_search_order_independent_skip(
+    const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index,
+    int mi_row, int mi_col, uint32_t *mode_skip_mask,
+    uint16_t *ref_frame_skip_mask) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const AV1_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const unsigned char segment_id = mbmi->segment_id;
+  const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
+  const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+
+  if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
+      !x->cb_partition_scan) {
+    const int mi_width = mi_size_wide[bsize];
+    const int mi_height = mi_size_high[bsize];
+    int found = 0;
+    // Search in the stats table to see if the ref frames have been used in the
+    // first pass of partition search.
+    for (int row = mi_row; row < mi_row + mi_width && !found;
+         row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+      for (int col = mi_col; col < mi_col + mi_height && !found;
+           col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+        const int index = av1_first_partition_pass_stats_index(row, col);
+        const FIRST_PARTITION_PASS_STATS *const stats =
+            &x->first_partition_pass_stats[index];
+        if (stats->ref0_counts[ref_frame[0]] &&
+            (ref_frame[1] < 0 || stats->ref1_counts[ref_frame[1]])) {
+          found = 1;
+          break;
+        }
+      }
+    }
+    if (!found) return 1;
+  }
+
+  if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) {
+    // Mode must by compatible
+    if (!is_interintra_allowed_mode(this_mode)) return 1;
+    if (!is_interintra_allowed_bsize(bsize)) return 1;
+  }
+
+  // This is only used in motion vector unit test.
+  if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME)
+    return 1;
+
+  if (ref_frame[0] == INTRA_FRAME) {
+    if (this_mode != DC_PRED) {
+      // Disable intra modes other than DC_PRED for blocks with low variance
+      // Threshold for intra skipping based on source variance
+      // TODO(debargha): Specialize the threshold for super block sizes
+      const unsigned int skip_intra_var_thresh = 64;
+      if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+          x->source_variance < skip_intra_var_thresh)
+        return 1;
+    }
+  } else {
+    if (!is_comp_ref_allowed(bsize) && ref_frame[1] > INTRA_FRAME) return 1;
+  }
+
+  const int comp_pred = ref_frame[1] > INTRA_FRAME;
+  if (comp_pred) {
+    if (!cpi->allow_comp_inter_inter) return 1;
+
+    // Skip compound inter modes if ARF is not available.
+    if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]])) return 1;
+
+    // Do not allow compound prediction if the segment level reference frame
+    // feature is in use as in this case there can only be one reference.
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
+  }
+
+  if (sf->selective_ref_frame) {
+    if (sf->selective_ref_frame >= 2 || x->cb_partition_scan) {
+      if (ref_frame[0] == ALTREF2_FRAME || ref_frame[1] == ALTREF2_FRAME)
+        if (get_relative_dist(
+                cm, cm->cur_frame->ref_frame_offset[ALTREF2_FRAME - LAST_FRAME],
+                cm->frame_offset) < 0)
+          return 1;
+      if (ref_frame[0] == BWDREF_FRAME || ref_frame[1] == BWDREF_FRAME)
+        if (get_relative_dist(
+                cm, cm->cur_frame->ref_frame_offset[BWDREF_FRAME - LAST_FRAME],
+                cm->frame_offset) < 0)
+          return 1;
+    }
+    if (ref_frame[0] == LAST3_FRAME || ref_frame[1] == LAST3_FRAME)
+      if (get_relative_dist(
+              cm, cm->cur_frame->ref_frame_offset[LAST3_FRAME - LAST_FRAME],
+              cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0)
+        return 1;
+    if (ref_frame[0] == LAST2_FRAME || ref_frame[1] == LAST2_FRAME)
+      if (get_relative_dist(
+              cm, cm->cur_frame->ref_frame_offset[LAST2_FRAME - LAST_FRAME],
+              cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0)
+        return 1;
+  }
+
+  // One-sided compound is used only when all reference frames are one-sided.
+  if (sf->selective_ref_frame && comp_pred && !cpi->all_one_sided_refs) {
+    unsigned int ref_offsets[2];
+    for (int i = 0; i < 2; ++i) {
+      const int buf_idx = cm->frame_refs[ref_frame[i] - LAST_FRAME].idx;
+      assert(buf_idx >= 0);
+      ref_offsets[i] = cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+    }
+    if ((get_relative_dist(cm, ref_offsets[0], cm->frame_offset) <= 0 &&
+         get_relative_dist(cm, ref_offsets[1], cm->frame_offset) <= 0) ||
+        (get_relative_dist(cm, ref_offsets[0], cm->frame_offset) > 0 &&
+         get_relative_dist(cm, ref_offsets[1], cm->frame_offset) > 0))
+      return 1;
+  }
+
+  if (mode_skip_mask[ref_frame[0]] & (1 << this_mode)) {
+    return 1;
+  }
+
+  if ((ref_frame_skip_mask[0] & (1 << ref_frame[0])) &&
+      (ref_frame_skip_mask[1] & (1 << AOMMAX(0, ref_frame[1])))) {
+    return 1;
+  }
+
+  if (skip_repeated_mv(cm, x, this_mode, ref_frame)) {
+    return 1;
+  }
+  return 0;
+}
+
+static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index,
+                             const AV1_COMMON *cm) {
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+  mbmi->ref_mv_idx = 0;
+  mbmi->mode = this_mode;
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->ref_frame[0] = av1_mode_order[mode_index].ref_frame[0];
+  mbmi->ref_frame[1] = av1_mode_order[mode_index].ref_frame[1];
+  pmi->palette_size[0] = 0;
+  pmi->palette_size[1] = 0;
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+  set_default_interp_filters(mbmi, cm->interp_filter);
+}
+
+static int handle_intra_mode(InterModeSearchState *search_state,
+                             const AV1_COMP *cpi, MACROBLOCK *x,
+                             BLOCK_SIZE bsize, int ref_frame_cost,
+                             const PICK_MODE_CONTEXT *ctx, int disable_skip,
+                             RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+                             RD_STATS *rd_stats_uv) {
+  const AV1_COMMON *cm = &cpi->common;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(mbmi->ref_frame[0] == INTRA_FRAME);
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int try_palette =
+      av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
+  const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
+  const int intra_cost_penalty = av1_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int rows = block_size_high[bsize];
+  const int cols = block_size_wide[bsize];
+  const int num_planes = av1_num_planes(cm);
+  av1_init_rd_stats(rd_stats);
+  av1_init_rd_stats(rd_stats_y);
+  av1_init_rd_stats(rd_stats_uv);
+  TX_SIZE uv_tx;
+  int is_directional_mode = av1_is_directional_mode(mbmi->mode);
+  if (is_directional_mode && av1_use_angle_delta(bsize)) {
+    int rate_dummy;
+    int64_t model_rd = INT64_MAX;
+    if (!search_state->angle_stats_ready) {
+      const int src_stride = x->plane[0].src.stride;
+      const uint8_t *src = x->plane[0].src.buf;
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+        highbd_angle_estimation(src, src_stride, rows, cols, bsize,
+                                search_state->directional_mode_skip_mask);
+      else
+        angle_estimation(src, src_stride, rows, cols, bsize,
+                         search_state->directional_mode_skip_mask);
+      search_state->angle_stats_ready = 1;
+    }
+    if (search_state->directional_mode_skip_mask[mbmi->mode]) return 0;
+    rd_stats_y->rate = INT_MAX;
+    rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize,
+                            intra_mode_cost[mbmi->mode], search_state->best_rd,
+                            &model_rd);
+  } else {
+    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+    super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd);
+  }
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  memcpy(best_blk_skip, x->blk_skip,
+         sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+
+  if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+    RD_STATS rd_stats_y_fi;
+    int filter_intra_selected_flag = 0;
+    TX_SIZE best_tx_size = mbmi->tx_size;
+    TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+    memcpy(best_txk_type, mbmi->txk_type,
+           sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+    FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
+    int64_t best_rd_tmp = INT64_MAX;
+    if (rd_stats_y->rate != INT_MAX) {
+      best_rd_tmp = RDCOST(x->rdmult,
+                           rd_stats_y->rate + x->filter_intra_cost[bsize][0] +
+                               intra_mode_cost[mbmi->mode],
+                           rd_stats_y->dist);
+    }
+
+    mbmi->filter_intra_mode_info.use_filter_intra = 1;
+    for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED;
+         fi_mode < FILTER_INTRA_MODES; ++fi_mode) {
+      int64_t this_rd_tmp;
+      mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
+
+      super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd);
+      if (rd_stats_y_fi.rate == INT_MAX) {
+        continue;
+      }
+      const int this_rate_tmp =
+          rd_stats_y_fi.rate +
+          intra_mode_info_cost_y(cpi, x, mbmi, bsize,
+                                 intra_mode_cost[mbmi->mode]);
+      this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
+
+      if (this_rd_tmp < best_rd_tmp) {
+        best_tx_size = mbmi->tx_size;
+        memcpy(best_txk_type, mbmi->txk_type,
+               sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+        memcpy(best_blk_skip, x->blk_skip,
+               sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+        best_fi_mode = fi_mode;
+        *rd_stats_y = rd_stats_y_fi;
+        filter_intra_selected_flag = 1;
+        best_rd_tmp = this_rd_tmp;
+      }
+    }
+
+    mbmi->tx_size = best_tx_size;
+    memcpy(mbmi->txk_type, best_txk_type,
+           sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+    memcpy(x->blk_skip, best_blk_skip,
+           sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+
+    if (filter_intra_selected_flag) {
+      mbmi->filter_intra_mode_info.use_filter_intra = 1;
+      mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
+    } else {
+      mbmi->filter_intra_mode_info.use_filter_intra = 0;
+    }
+  }
+
+  if (rd_stats_y->rate == INT_MAX) return 0;
+
+  if (num_planes > 1) {
+    uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+    if (search_state->rate_uv_intra[uv_tx] == INT_MAX) {
+      choose_intra_uv_mode(
+          cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx],
+          &search_state->rate_uv_tokenonly[uv_tx],
+          &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx],
+          &search_state->mode_uv[uv_tx]);
+      if (try_palette) search_state->pmi_uv[uv_tx] = *pmi;
+      search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV];
+    }
+
+    rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx];
+    rd_stats_uv->dist = search_state->dist_uvs[uv_tx];
+    rd_stats_uv->skip = search_state->skip_uvs[uv_tx];
+    rd_stats->skip = rd_stats_y->skip && rd_stats_uv->skip;
+    mbmi->uv_mode = search_state->mode_uv[uv_tx];
+    if (try_palette) {
+      pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1];
+      memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+             search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+             2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+    }
+    mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx];
+  }
+
+  rd_stats->rate =
+      rd_stats_y->rate +
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]);
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
+    // super_block_yrd above includes the cost of the tx_size in the
+    // tokenonly rate, but for intra blocks, tx_size is always coded
+    // (prediction granularity), so we account for it in the full rate,
+    // not the tokenonly rate.
+    rd_stats_y->rate -= tx_size_cost(cm, x, bsize, mbmi->tx_size);
+  }
+  if (num_planes > 1 && !x->skip_chroma_rd) {
+    const int uv_mode_cost =
+        x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mbmi->uv_mode];
+    rd_stats->rate +=
+        rd_stats_uv->rate +
+        intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
+  }
+  if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED)
+    rd_stats->rate += intra_cost_penalty;
+  rd_stats->dist = rd_stats_y->dist + rd_stats_uv->dist;
+
+  // Estimate the reference frame signaling cost and add it
+  // to the rolling cost variable.
+  rd_stats->rate += ref_frame_cost;
+  if (rd_stats->skip) {
+    // Back out the coefficient coding costs
+    rd_stats->rate -= (rd_stats_y->rate + rd_stats_uv->rate);
+    rd_stats_y->rate = 0;
+    rd_stats_uv->rate = 0;
+    // Cost the skip mb case
+    rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1];
+  } else {
+    // Add in the cost of the no skip flag.
+    rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][0];
+  }
+  // Calculate the final RD estimate for this mode.
+  int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+
+  // Keep record of best intra rd
+  if (this_rd < search_state->best_intra_rd) {
+    search_state->best_intra_rd = this_rd;
+    search_state->best_intra_mode = mbmi->mode;
+  }
+
+  if (sf->skip_intra_in_interframe) {
+    if (search_state->best_rd < (INT64_MAX / 2) &&
+        this_rd > (search_state->best_rd + (search_state->best_rd >> 1)))
+      search_state->skip_intra_modes = 1;
+  }
+
+  if (!disable_skip) {
+    for (int i = 0; i < REFERENCE_MODES; ++i)
+      search_state->best_pred_rd[i] =
+          AOMMIN(search_state->best_pred_rd[i], this_rd);
+  }
+  return 1;
+}
+
+void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
+                               MACROBLOCK *x, int mi_row, int mi_col,
+                               RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int try_palette =
+      av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const struct segmentation *const seg = &cm->seg;
+  PREDICTION_MODE this_mode;
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  unsigned char segment_id = mbmi->segment_id;
+  int i, k;
+  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+  unsigned int ref_costs_single[REF_FRAMES];
+  unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+  int *mode_map = tile_data->mode_map[bsize];
+  uint32_t mode_skip_mask[REF_FRAMES];
+  uint16_t ref_frame_skip_mask[2];
+
+  InterModeSearchState search_state;
+  init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
+                               best_rd_so_far);
+
+  HandleInterModeArgs args = {
+    { NULL },  { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+    { NULL },  { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 },
+    NULL,      NULL,
+    NULL,      NULL,
+    { { 0 } }, INT_MAX,
+    INT_MAX
+  };
+  for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+  av1_invalid_rd_stats(rd_cost);
+
+  // init params, set frame modes, speed features
+  set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col,
+                                ref_frame_skip_mask, mode_skip_mask,
+                                ref_costs_single, ref_costs_comp, yv12_mb);
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+  int64_t best_est_rd = INT64_MAX;
+#endif
+
+  for (int midx = 0; midx < MAX_MODES; ++midx) {
+    int mode_index = mode_map[midx];
     int64_t this_rd = INT64_MAX;
     int disable_skip = 0;
-    int compmode_cost = 0;
     int rate2 = 0, rate_y = 0, rate_uv = 0;
-    int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+    int64_t distortion2 = 0;
     int skippable = 0;
     int this_skip2 = 0;
-    int64_t total_sse = INT64_MAX;
     uint8_t ref_frame_type;
-#if CONFIG_PVQ
-    od_encode_rollback(&x->daala_enc, &pre_buf);
-#endif  // CONFIG_PVQ
-    mode_index = mode_map[midx];
+
     this_mode = av1_mode_order[mode_index].mode;
     ref_frame = av1_mode_order[mode_index].ref_frame[0];
     second_ref_frame = av1_mode_order[mode_index].ref_frame[1];
-    mbmi->ref_mv_idx = 0;
-
-    if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) {
-      // Mode must by compatible
-      if (!is_interintra_allowed_mode(this_mode)) continue;
-      if (!is_interintra_allowed_bsize(bsize)) continue;
-    }
-
-    if (is_inter_compound_mode(this_mode)) {
-      frame_mv[this_mode][ref_frame].as_int =
-          frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int;
-      frame_mv[this_mode][second_ref_frame].as_int =
-          frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int;
-#if CONFIG_COMPOUND_SINGLEREF
-    } else if (is_inter_singleref_comp_mode(this_mode)) {
-      frame_mv[this_mode][ref_frame].as_int =
-          frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int;
-      frame_comp_mv[this_mode][ref_frame].as_int =
-          frame_mv[compound_ref1_mode(this_mode)][ref_frame].as_int;
-#endif  // CONFIG_COMPOUND_SINGLEREF
-    }
-
-    // Look at the reference frame of the best mode so far and set the
-    // skip mask to look at a subset of the remaining modes.
-    if (midx == mode_skip_start && best_mode_index >= 0) {
-      switch (best_mbmode.ref_frame[0]) {
-        case INTRA_FRAME: break;
-        case LAST_FRAME:
-          ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
-          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-          break;
-#if CONFIG_EXT_REFS
-        case LAST2_FRAME:
-          ref_frame_skip_mask[0] |= LAST2_FRAME_MODE_MASK;
-          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-          break;
-        case LAST3_FRAME:
-          ref_frame_skip_mask[0] |= LAST3_FRAME_MODE_MASK;
-          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-          break;
-#endif  // CONFIG_EXT_REFS
-        case GOLDEN_FRAME:
-          ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
-          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-          break;
-#if CONFIG_EXT_REFS
-        case BWDREF_FRAME:
-          ref_frame_skip_mask[0] |= BWDREF_FRAME_MODE_MASK;
-          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-          break;
-        case ALTREF2_FRAME:
-          ref_frame_skip_mask[0] |= ALTREF2_FRAME_MODE_MASK;
-          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-          break;
-#endif  // CONFIG_EXT_REFS
-        case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALTREF_FRAME_MODE_MASK;
-#if CONFIG_EXT_REFS
-          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-#endif  // CONFIG_EXT_REFS
-          break;
-        case NONE_FRAME:
-        case TOTAL_REFS_PER_FRAME:
-          assert(0 && "Invalid Reference frame");
-          break;
-      }
-    }
 
-    if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
-        (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame))))
-      continue;
+    init_mbmi(mbmi, mode_index, cm);
 
-#if CONFIG_EXT_COMP_REFS
-// TODO(zoeliu): Following toggle between #if 0/1 and the bug will manifest
-// itself.
-#if 0
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame]) ||
-        (second_ref_frame > INTRA_FRAME &&
-         (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))))
-      printf("Frame=%d, bsize=%d, (mi_row,mi_col)=(%d,%d), ref_frame=%d, "
-             "second_ref_frame=%d\n", cm->current_video_frame, bsize, mi_row,
-             mi_col, ref_frame, second_ref_frame);
-
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue;
-    if (second_ref_frame > INTRA_FRAME &&
-        (!(cpi->ref_frame_flags & flag_list[second_ref_frame])))
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    if (inter_mode_search_order_independent_skip(cpi, x, bsize, mode_index,
+                                                 mi_row, mi_col, mode_skip_mask,
+                                                 ref_frame_skip_mask))
       continue;
-#endif  // 0
 
-#if !USE_UNI_COMP_REFS
-    // NOTE(zoeliu): Temporarily disable uni-directional comp refs
-    if (second_ref_frame > INTRA_FRAME) {
-      if (!((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME)))
+    if (ref_frame == INTRA_FRAME) {
+      if (sf->skip_intra_in_interframe && search_state.skip_intra_modes)
         continue;
     }
-    assert(second_ref_frame <= INTRA_FRAME ||
-           ((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME)));
-#endif  // !USE_UNI_COMP_REFS
-#endif  // CONFIG_EXT_COMP_REFS
 
-    if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue;
-
-    // Test best rd so far against threshold for trying this mode.
-    if (best_mode_skippable && sf->schedule_mode_search)
-      mode_threshold[mode_index] <<= 1;
-
-    if (best_rd < mode_threshold[mode_index]) continue;
-
-    // This is only used in motion vector unit test.
-    if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
-
-#if CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS  // Changes LL bitstream
-#if CONFIG_EXT_REFS
-    if (cpi->oxcf.pass == 0) {
-      // Complexity-compression trade-offs
-      // if (ref_frame == ALTREF_FRAME) continue;
-      // if (ref_frame == BWDREF_FRAME) continue;
-      if (second_ref_frame == ALTREF_FRAME) continue;
-      // if (second_ref_frame == BWDREF_FRAME) continue;
+    if (sf->drop_ref) {
+      if (ref_frame > INTRA_FRAME && second_ref_frame > INTRA_FRAME) {
+        if (search_state.num_available_refs > 2) {
+          if ((ref_frame == search_state.dist_order_refs[0] &&
+               second_ref_frame == search_state.dist_order_refs[1]) ||
+              (ref_frame == search_state.dist_order_refs[1] &&
+               second_ref_frame == search_state.dist_order_refs[0]))
+            continue;
+        }
+      }
     }
-#endif  // CONFIG_EXT_REFS
-#endif  // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS
-    comp_pred = second_ref_frame > INTRA_FRAME;
-    if (comp_pred) {
-      if (!cpi->allow_comp_inter_inter) continue;
 
-      // Skip compound inter modes if ARF is not available.
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+    if (search_state.best_rd < search_state.mode_threshold[mode_index])
+      continue;
 
-      // Do not allow compound prediction if the segment level reference frame
-      // feature is in use as in this case there can only be one reference.
-      if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
+    const int comp_pred = second_ref_frame > INTRA_FRAME;
+    const int ref_frame_cost = comp_pred
+                                   ? ref_costs_comp[ref_frame][second_ref_frame]
+                                   : ref_costs_single[ref_frame];
+    const int compmode_cost =
+        is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0;
+    const int real_compmode_cost =
+        cm->reference_mode == REFERENCE_MODE_SELECT ? compmode_cost : 0;
 
-      if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
-          best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME)
+    if (comp_pred) {
+      if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+          search_state.best_mode_index >= 0 &&
+          search_state.best_mbmode.ref_frame[0] == INTRA_FRAME)
         continue;
-
-      mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
-    } else {
-      if (ref_frame != INTRA_FRAME)
-        mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
     }
 
     if (ref_frame == INTRA_FRAME) {
       if (sf->adaptive_mode_search)
-        if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
+        if ((x->source_variance << num_pels_log2_lookup[bsize]) >
+            search_state.best_pred_sse)
           continue;
 
       if (this_mode != DC_PRED) {
-        // Disable intra modes other than DC_PRED for blocks with low variance
-        // Threshold for intra skipping based on source variance
-        // TODO(debargha): Specialize the threshold for super block sizes
-        const unsigned int skip_intra_var_thresh = 64;
-        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
-            x->source_variance < skip_intra_var_thresh)
-          continue;
         // Only search the oblique modes if the best so far is
         // one of the neighboring directional modes
-        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
-            (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
-          if (best_mode_index >= 0 && best_mbmode.ref_frame[0] > INTRA_FRAME)
+        if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+            (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+          if (search_state.best_mode_index >= 0 &&
+              search_state.best_mbmode.ref_frame[0] > INTRA_FRAME)
             continue;
         }
-        if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
-          if (conditional_skipintra(this_mode, best_intra_mode)) continue;
+        if (sf->mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+          if (conditional_skipintra(this_mode, search_state.best_intra_mode))
+            continue;
         }
       }
-#if CONFIG_GLOBAL_MOTION
-    } else if (cm->global_motion[ref_frame].wmtype == IDENTITY &&
-               (!comp_pred ||
-                cm->global_motion[second_ref_frame].wmtype == IDENTITY)) {
-#else   // CONFIG_GLOBAL_MOTION
-    } else {
-#endif  // CONFIG_GLOBAL_MOTION
-      const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame };
-      if (!check_best_zero_mv(cpi, x, mbmi_ext->mode_context,
-                              mbmi_ext->compound_mode_context, frame_mv,
-                              this_mode, ref_frames, bsize, -1, mi_row, mi_col))
-        continue;
     }
 
-    mbmi->mode = this_mode;
-    mbmi->uv_mode = UV_DC_PRED;
-    mbmi->ref_frame[0] = ref_frame;
-    mbmi->ref_frame[1] = second_ref_frame;
-    pmi->palette_size[0] = 0;
-    pmi->palette_size[1] = 0;
-#if CONFIG_FILTER_INTRA
-    mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
-    mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
-#endif  // CONFIG_FILTER_INTRA
-        // Evaluate all sub-pel filters irrespective of whether we can use
-        // them for this frame.
-
-    set_default_interp_filters(mbmi, cm->interp_filter);
-
-    mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
-    mbmi->motion_mode = SIMPLE_TRANSLATION;
-
-    x->skip = 0;
-    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
-
     // Select prediction reference frames.
-    for (i = 0; i < MAX_MB_PLANE; i++) {
+    for (i = 0; i < num_planes; i++) {
       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
       if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
     }
 
-#if CONFIG_COMPOUND_SINGLEREF
-    // Single ref compound mode
-    if (!comp_pred && is_inter_singleref_comp_mode(mbmi->mode)) {
-      xd->block_refs[1] = xd->block_refs[0];
-      for (i = 0; i < MAX_MB_PLANE; i++)
-        xd->plane[i].pre[1] = xd->plane[i].pre[0];
-    }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-#if CONFIG_INTERINTRA
-    mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
-#endif  // CONFIG_INTERINTRA
-
     if (ref_frame == INTRA_FRAME) {
-      RD_STATS rd_stats_y;
-      TX_SIZE uv_tx;
-      struct macroblockd_plane *const pd = &xd->plane[1];
-#if CONFIG_EXT_INTRA
-      is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize);
-      if (is_directional_mode && av1_use_angle_delta(bsize)) {
-        int rate_dummy;
-        int64_t model_rd = INT64_MAX;
-        if (!angle_stats_ready) {
-          const int src_stride = x->plane[0].src.stride;
-          const uint8_t *src = x->plane[0].src.buf;
-#if CONFIG_HIGHBITDEPTH
-          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-            highbd_angle_estimation(src, src_stride, rows, cols, bsize,
-                                    directional_mode_skip_mask);
-          else
-#endif  // CONFIG_HIGHBITDEPTH
-            angle_estimation(src, src_stride, rows, cols, bsize,
-                             directional_mode_skip_mask);
-          angle_stats_ready = 1;
-        }
-        if (directional_mode_skip_mask[mbmi->mode]) continue;
-        rd_stats_y.rate = INT_MAX;
-        rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rd_stats_y, bsize,
-                                intra_mode_cost[mbmi->mode], best_rd,
-                                &model_rd);
-      } else {
-        mbmi->angle_delta[0] = 0;
-        super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
-      }
-#else
-      super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
-#endif  // CONFIG_EXT_INTRA
-      rate_y = rd_stats_y.rate;
-      distortion_y = rd_stats_y.dist;
-      skippable = rd_stats_y.skip;
-
-      if (rate_y == INT_MAX) continue;
-
-#if CONFIG_FILTER_INTRA
-      if (mbmi->mode == DC_PRED) dc_skipped = 0;
-#endif  // CONFIG_FILTER_INTRA
-
-      uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][pd->subsampling_x]
-                              [pd->subsampling_y];
-      if (rate_uv_intra[uv_tx] == INT_MAX) {
-        choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx],
-                             &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
-                             &skip_uvs[uv_tx], &mode_uv[uv_tx]);
-        if (try_palette) pmi_uv[uv_tx] = *pmi;
-
-#if CONFIG_EXT_INTRA
-        uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-        filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
-#endif  // CONFIG_FILTER_INTRA
-      }
-
-      rate_uv = rate_uv_tokenonly[uv_tx];
-      distortion_uv = dist_uvs[uv_tx];
-      skippable = skippable && skip_uvs[uv_tx];
-      mbmi->uv_mode = mode_uv[uv_tx];
-      if (try_palette) {
-        pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
-        memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
-               pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
-               2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
-      }
-
-#if CONFIG_EXT_INTRA
-      mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-      mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
-          filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
-      if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
-        mbmi->filter_intra_mode_info.filter_intra_mode[1] =
-            filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
-      }
-#endif  // CONFIG_FILTER_INTRA
-
-#if CONFIG_CB4X4
-      rate2 = rate_y + intra_mode_cost[mbmi->mode];
-      if (!x->skip_chroma_rd)
-        rate2 += rate_uv + x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
-#else
-      rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
-              x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
-#endif  // CONFIG_CB4X4
-
-      if (try_palette && mbmi->mode == DC_PRED) {
-        rate2 += av1_cost_bit(
-            av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
-      }
-
-      if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
-        // super_block_yrd above includes the cost of the tx_size in the
-        // tokenonly rate, but for intra blocks, tx_size is always coded
-        // (prediction granularity), so we account for it in the full rate,
-        // not the tokenonly rate.
-        rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
-      }
-#if CONFIG_EXT_INTRA
-      if (is_directional_mode) {
-#if CONFIG_INTRA_INTERP
-        const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
-        const int p_angle =
-            mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
-        if (av1_is_intra_filter_switchable(p_angle))
-          rate2 += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
-#endif  // CONFIG_INTRA_INTERP
-        if (av1_use_angle_delta(bsize)) {
-          rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
-                                      MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
-        }
-      }
-      if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) &&
-          av1_use_angle_delta(bsize)) {
-        rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
-                                    MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
-      }
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-      if (mbmi->mode == DC_PRED) {
-        rate2 +=
-            av1_cost_bit(cm->fc->filter_intra_probs[0],
-                         mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
-        if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
-          rate2 += write_uniform_cost(
-              FILTER_INTRA_MODES,
-              mbmi->filter_intra_mode_info.filter_intra_mode[0]);
-        }
+      RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
+      const int ret = handle_intra_mode(
+          &search_state, cpi, x, bsize, ref_frame_cost, ctx, disable_skip,
+          &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
+      if (!ret) {
+        continue;
       }
-      if (mbmi->uv_mode == UV_DC_PRED) {
-        rate2 +=
-            av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
-                         mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
-        if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
-          rate2 += write_uniform_cost(
-              FILTER_INTRA_MODES,
-              mbmi->filter_intra_mode_info.filter_intra_mode[1]);
-      }
-#endif  // CONFIG_FILTER_INTRA
-      if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
-        rate2 += intra_cost_penalty;
-      distortion2 = distortion_y + distortion_uv;
+      rate2 = intra_rd_stats.rate;
+      distortion2 = intra_rd_stats.dist;
+      this_rd = RDCOST(x->rdmult, rate2, distortion2);
+      skippable = intra_rd_stats.skip;
+      rate_y = intra_rd_stats_y.rate;
     } else {
-      int_mv backup_ref_mv[2];
-
-      if (!is_comp_ref_allowed(bsize) && mbmi->ref_frame[1] > INTRA_FRAME)
-        continue;
-
-      backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0];
-      if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0];
-#if CONFIG_INTERINTRA
-      if (second_ref_frame == INTRA_FRAME) {
-        if (best_single_inter_ref != ref_frame) continue;
-        mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode];
-// TODO(debargha|geza.lore):
-// Should we use ext_intra modes for interintra?
-#if CONFIG_EXT_INTRA
-        mbmi->angle_delta[0] = 0;
-        mbmi->angle_delta[1] = 0;
-#if CONFIG_INTRA_INTERP
-        mbmi->intra_filter = INTRA_FILTER_LINEAR;
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-        mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
-        mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
-#endif  // CONFIG_FILTER_INTRA
-      }
-#endif  // CONFIG_INTERINTRA
+      mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+      mbmi->filter_intra_mode_info.use_filter_intra = 0;
       mbmi->ref_mv_idx = 0;
       ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-
-      if (comp_pred) {
-        if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
-          int ref_mv_idx = 0;
-          // Special case: NEAR_NEWMV and NEW_NEARMV modes use
-          // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
-          // mbmi->ref_mv_idx (like NEWMV)
-          if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
-            ref_mv_idx = 1;
-
-          if (compound_ref0_mode(mbmi->mode) == NEWMV) {
-            int_mv this_mv =
-                mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
-            clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                         xd->n8_h << MI_SIZE_LOG2, xd);
-            mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
-          }
-          if (compound_ref1_mode(mbmi->mode) == NEWMV) {
-            int_mv this_mv =
-                mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
-            clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                         xd->n8_h << MI_SIZE_LOG2, xd);
-            mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
-          }
-        }
-#if CONFIG_COMPOUND_SINGLEREF
-      } else if (is_inter_singleref_comp_mode(mbmi->mode)) {
-        if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
-          // TODO(zoeliu): To further investigate which ref_mv_idx should be
-          //               chosen for the mode of SR_NEAR_NEWMV.
-          int ref_mv_idx = 0;
-          // Special case: SR_NEAR_NEWMV mode use
-          // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
-          // mbmi->ref_mv_idx (like NEWMV)
-          if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1;
-
-          if (compound_ref0_mode(mbmi->mode) == NEWMV ||
-              compound_ref1_mode(mbmi->mode) == NEWMV) {
-            int_mv this_mv =
-                mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
-            clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                         xd->n8_h << MI_SIZE_LOG2, xd);
-            mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
-          }
-        }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      } else {
-        if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
-          int ref;
-          for (ref = 0; ref < 1 + comp_pred; ++ref) {
-            int_mv this_mv =
-                (ref == 0) ? mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv
-                           : mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
-            clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                         xd->n8_h << MI_SIZE_LOG2, xd);
-            mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
-          }
-        }
-      }
+      int64_t ref_best_rd = search_state.best_rd;
       {
         RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
         av1_init_rd_stats(&rd_stats);
         rd_stats.rate = rate2;
 
         // Point to variables that are maintained between loop iterations
-        args.single_newmv = single_newmv;
-        args.single_newmv_rate = single_newmv_rate;
-        args.modelled_rd = modelled_rd;
+        args.single_newmv = search_state.single_newmv[0];
+        args.single_newmv_rate = search_state.single_newmv_rate[0];
+        args.single_newmv_valid = search_state.single_newmv_valid[0];
+        args.modelled_rd = search_state.modelled_rd;
+        args.single_comp_cost = real_compmode_cost;
+        args.ref_frame_cost = ref_frame_cost;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+        this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
+                                    &rd_stats_uv, &disable_skip, mi_row, mi_col,
+                                    &args, ref_best_rd, &best_est_rd);
+#else
         this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
-                                    &rd_stats_uv, &disable_skip, frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-                                    frame_comp_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-                                    mi_row, mi_col, &args, best_rd);
+                                    &rd_stats_uv, &disable_skip, mi_row, mi_col,
+                                    &args, ref_best_rd);
+#endif
+        if (this_rd < ref_best_rd) {
+          ref_best_rd = this_rd;
+        }
 
         rate2 = rd_stats.rate;
         skippable = rd_stats.skip;
         distortion2 = rd_stats.dist;
-        total_sse = rd_stats.sse;
         rate_y = rd_stats_y.rate;
         rate_uv = rd_stats_uv.rate;
       }
 
-// TODO(jingning): This needs some refactoring to improve code quality
-// and reduce redundant steps.
-#if CONFIG_COMPOUND_SINGLEREF
-      if ((have_nearmv_in_inter_mode(mbmi->mode) &&
-           mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
-          ((mbmi->mode == NEWMV || mbmi->mode == SR_NEW_NEWMV ||
-            mbmi->mode == NEW_NEWMV) &&
-           mbmi_ext->ref_mv_count[ref_frame_type] > 1))
-#else   // !CONFIG_COMPOUND_SINGLEREF
+      // TODO(jingning): This needs some refactoring to improve code quality
+      // and reduce redundant steps.
       if ((have_nearmv_in_inter_mode(mbmi->mode) &&
            mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
           ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
-           mbmi_ext->ref_mv_count[ref_frame_type] > 1))
-#endif  // CONFIG_COMPOUND_SINGLEREF
-      {
-        int_mv backup_mv = frame_mv[NEARMV][ref_frame];
+           mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
         MB_MODE_INFO backup_mbmi = *mbmi;
         int backup_skip = x->skip;
         int64_t tmp_ref_rd = this_rd;
@@ -11290,40 +10243,14 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         // TODO(jingning): This should be deprecated shortly.
         int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
         int ref_set =
-            AOMMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
-
-        uint8_t drl_ctx =
-            av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx_offset);
-        // Dummy
-        int_mv backup_fmv[2];
-        backup_fmv[0] = frame_mv[NEWMV][ref_frame];
-        if (comp_pred) backup_fmv[1] = frame_mv[NEWMV][second_ref_frame];
-
-        rate2 += (rate2 < INT_MAX ? x->drl_mode_cost0[drl_ctx][0] : 0);
-
-        if (this_rd < INT64_MAX) {
-          if (RDCOST(x->rdmult, rate_y + rate_uv, distortion2) <
-              RDCOST(x->rdmult, 0, total_sse))
-            tmp_ref_rd = RDCOST(
-                x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
-                distortion2);
-          else
-            tmp_ref_rd =
-                RDCOST(x->rdmult,
-                       rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
-                           rate_y - rate_uv,
-                       total_sse);
-        }
-#if CONFIG_VAR_TX
-        for (i = 0; i < MAX_MB_PLANE; ++i)
-          memcpy(x->blk_skip_drl[i], x->blk_skip[i],
-                 sizeof(uint8_t) * ctx->num_4x4_blk);
-#endif  // CONFIG_VAR_TX
+            AOMMIN(MAX_REF_MV_SERCH - 1,
+                   mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
+        memcpy(x->blk_skip_drl, x->blk_skip,
+               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
 
         for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) {
           int64_t tmp_alt_rd = INT64_MAX;
           int dummy_disable_skip = 0;
-          int ref;
           int_mv cur_mv;
           RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv;
 
@@ -11333,80 +10260,19 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 
           mbmi->ref_mv_idx = 1 + ref_idx;
 
-          if (comp_pred) {
-            int ref_mv_idx = mbmi->ref_mv_idx;
-            // Special case: NEAR_NEWMV and NEW_NEARMV modes use
-            // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
-            // mbmi->ref_mv_idx (like NEWMV)
-            if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
-              ref_mv_idx = 1 + mbmi->ref_mv_idx;
-
-            if (compound_ref0_mode(mbmi->mode) == NEWMV) {
-              int_mv this_mv =
-                  mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
-              clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                           xd->n8_h << MI_SIZE_LOG2, xd);
-              mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
-            } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV) {
-              int_mv this_mv =
-                  mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
-              clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                           xd->n8_h << MI_SIZE_LOG2, xd);
-              mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
-            }
-
-            if (compound_ref1_mode(mbmi->mode) == NEWMV) {
-              int_mv this_mv =
-                  mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
-              clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                           xd->n8_h << MI_SIZE_LOG2, xd);
-              mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
-            } else if (compound_ref1_mode(mbmi->mode) == NEARESTMV) {
-              int_mv this_mv =
-                  mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
-              clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                           xd->n8_h << MI_SIZE_LOG2, xd);
-              mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
-            }
-#if CONFIG_COMPOUND_SINGLEREF
-          } else if (is_inter_singleref_comp_mode(mbmi->mode)) {
-            int ref_mv_idx = mbmi->ref_mv_idx;
-            // Special case: SR_NEAR_NEWMV mode use
-            // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
-            // mbmi->ref_mv_idx (like NEWMV)
-            if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1 + mbmi->ref_mv_idx;
-
-            // TODO(zoeliu): For the mode of SR_NEAREST_NEWMV, as it only runs
-            //               the "if", not the "else if",
-            //               mbmi_ext->ref_mvs[mbmi->ref_frame[0]] takes the
-            //               value for "NEWMV", instead of "NEARESTMV".
-            if (compound_ref0_mode(mbmi->mode) == NEWMV ||
-                compound_ref1_mode(mbmi->mode) == NEWMV) {
-              int_mv this_mv =
-                  mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
-              clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                           xd->n8_h << MI_SIZE_LOG2, xd);
-              mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
-            } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV ||
-                       compound_ref1_mode(mbmi->mode) == NEARESTMV) {
-              int_mv this_mv =
-                  mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
-              clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                           xd->n8_h << MI_SIZE_LOG2, xd);
-              mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
-            }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-          } else {
-            for (ref = 0; ref < 1 + comp_pred; ++ref) {
-              int_mv this_mv =
-                  (ref == 0)
-                      ? mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
-                            .this_mv
-                      : mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
-                            .comp_mv;
-              clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
-                           xd->n8_h << MI_SIZE_LOG2, xd);
-              mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
+          if (cpi->sf.reduce_inter_modes) {
+            if (mbmi->ref_frame[0] == LAST2_FRAME ||
+                mbmi->ref_frame[0] == LAST3_FRAME ||
+                mbmi->ref_frame[1] == LAST2_FRAME ||
+                mbmi->ref_frame[1] == LAST3_FRAME) {
+              if (mbmi_ext
+                      ->ref_mv_stack[ref_frame_type]
+                                    [mbmi->ref_mv_idx + idx_offset]
+                      .weight < REF_CAT_LEVEL) {
+                *mbmi = backup_mbmi;
+                x->skip = backup_skip;
+                continue;
+              }
             }
           }
 
@@ -11416,69 +10282,31 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
           clamp_mv2(&cur_mv.as_mv, xd);
 
           if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) {
-            int_mv dummy_single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
-            int dummy_single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
-
-            frame_mv[NEARMV][ref_frame] = cur_mv;
             av1_init_rd_stats(&tmp_rd_stats);
 
-            // Point to variables that are not maintained between iterations
-            args.single_newmv = dummy_single_newmv;
-            args.single_newmv_rate = dummy_single_newmv_rate;
             args.modelled_rd = NULL;
-            tmp_alt_rd = handle_inter_mode(cpi, x, bsize, &tmp_rd_stats,
-                                           &tmp_rd_stats_y, &tmp_rd_stats_uv,
-                                           &dummy_disable_skip, frame_mv,
-#if CONFIG_COMPOUND_SINGLEREF
-                                           frame_comp_mv,
-#endif  // CONFIG_COMPOUND_SINGLEREF
-                                           mi_row, mi_col, &args, best_rd);
-            // Prevent pointers from escaping local scope
-            args.single_newmv = NULL;
-            args.single_newmv_rate = NULL;
-          }
-
-          for (i = 0; i < mbmi->ref_mv_idx; ++i) {
-            uint8_t drl1_ctx = 0;
-            drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
-                                   i + idx_offset);
-            tmp_rd_stats.rate +=
-                (tmp_rd_stats.rate < INT_MAX ? x->drl_mode_cost0[drl1_ctx][1]
-                                             : 0);
-          }
-
-          if (mbmi_ext->ref_mv_count[ref_frame_type] >
-                  mbmi->ref_mv_idx + idx_offset + 1 &&
-              ref_idx < ref_set - 1) {
-            uint8_t drl1_ctx =
-                av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
-                            mbmi->ref_mv_idx + idx_offset);
-            tmp_rd_stats.rate +=
-                (tmp_rd_stats.rate < INT_MAX ? x->drl_mode_cost0[drl1_ctx][0]
-                                             : 0);
-          }
-
-          if (tmp_alt_rd < INT64_MAX) {
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+            args.single_newmv = search_state.single_newmv[mbmi->ref_mv_idx];
+            args.single_newmv_rate =
+                search_state.single_newmv_rate[mbmi->ref_mv_idx];
+            args.single_newmv_valid =
+                search_state.single_newmv_valid[mbmi->ref_mv_idx];
+            args.single_comp_cost = real_compmode_cost;
+            args.ref_frame_cost = ref_frame_cost;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
             tmp_alt_rd =
-                RDCOST(x->rdmult, tmp_rd_stats.rate, tmp_rd_stats.dist);
+                handle_inter_mode(cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y,
+                                  &tmp_rd_stats_uv, &dummy_disable_skip, mi_row,
+                                  mi_col, &args, ref_best_rd, &best_est_rd);
 #else
-            if (RDCOST(x->rdmult, tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate,
-                       tmp_rd_stats.dist) <
-                RDCOST(x->rdmult, 0, tmp_rd_stats.sse))
-              tmp_alt_rd =
-                  RDCOST(x->rdmult,
-                         tmp_rd_stats.rate +
-                             av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
-                         tmp_rd_stats.dist);
-            else
-              tmp_alt_rd =
-                  RDCOST(x->rdmult,
-                         tmp_rd_stats.rate +
-                             av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
-                             tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate,
-                         tmp_rd_stats.sse);
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+            tmp_alt_rd = handle_inter_mode(
+                cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv,
+                &dummy_disable_skip, mi_row, mi_col, &args, ref_best_rd);
+#endif
+
+            // Prevent pointers from escaping local scope
+            args.single_newmv = search_state.single_newmv[0];
+            args.single_newmv_rate = search_state.single_newmv_rate[0];
+            args.single_newmv_valid = search_state.single_newmv_valid[0];
           }
 
           if (tmp_ref_rd > tmp_alt_rd) {
@@ -11488,192 +10316,61 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
             skippable = tmp_rd_stats.skip;
             rate_y = tmp_rd_stats_y.rate;
             rate_uv = tmp_rd_stats_uv.rate;
-            total_sse = tmp_rd_stats.sse;
             this_rd = tmp_alt_rd;
             tmp_ref_rd = tmp_alt_rd;
             backup_mbmi = *mbmi;
             backup_skip = x->skip;
-#if CONFIG_VAR_TX
-            for (i = 0; i < MAX_MB_PLANE; ++i)
-              memcpy(x->blk_skip_drl[i], x->blk_skip[i],
-                     sizeof(uint8_t) * ctx->num_4x4_blk);
-#endif  // CONFIG_VAR_TX
+            memcpy(x->blk_skip_drl, x->blk_skip,
+                   sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
           } else {
             *mbmi = backup_mbmi;
             x->skip = backup_skip;
           }
         }
 
-        frame_mv[NEARMV][ref_frame] = backup_mv;
-        frame_mv[NEWMV][ref_frame] = backup_fmv[0];
-        if (comp_pred) frame_mv[NEWMV][second_ref_frame] = backup_fmv[1];
-#if CONFIG_VAR_TX
-        for (i = 0; i < MAX_MB_PLANE; ++i)
-          memcpy(x->blk_skip[i], x->blk_skip_drl[i],
-                 sizeof(uint8_t) * ctx->num_4x4_blk);
-#endif  // CONFIG_VAR_TX
+        memcpy(x->blk_skip, x->blk_skip_drl,
+               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
       }
-      mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0];
-      if (comp_pred) mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1];
-
       if (this_rd == INT64_MAX) continue;
 
-      if (is_comp_ref_allowed(mbmi->sb_type))
-        compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
-
-      if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost;
-    }
-
-    // Estimate the reference frame signaling cost and add it
-    // to the rolling cost variable.
-    if (comp_pred) {
-#if CONFIG_EXT_COMP_REFS
-      rate2 += ref_costs_comp[ref_frame][second_ref_frame];
-#else  // !CONFIG_EXT_COMP_REFS
-      rate2 += ref_costs_comp[ref_frame];
-#if CONFIG_EXT_REFS
-      rate2 += ref_costs_comp[second_ref_frame];
-#endif  // CONFIG_EXT_REFS
-#endif  // CONFIG_EXT_COMP_REFS
-    } else {
-      rate2 += ref_costs_single[ref_frame];
-    }
-
-#if CONFIG_COMPOUND_SINGLEREF
-    // Add the cost to signal single/comp mode in single ref.
-    if (!comp_pred && cm->reference_mode != COMPOUND_REFERENCE) {
-      aom_prob singleref_comp_mode_p = av1_get_inter_mode_prob(cm, xd);
-      rate2 += av1_cost_bit(singleref_comp_mode_p,
-                            is_inter_singleref_comp_mode(mbmi->mode));
-    }
-#endif  // CONFIG_COMPOUND_SINGLEREF
-
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-    if (ref_frame == INTRA_FRAME)
-#else
-    if (!disable_skip)
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-    {
-      if (skippable) {
-        // Back out the coefficient coding costs
-        rate2 -= (rate_y + rate_uv);
-        rate_y = 0;
-        rate_uv = 0;
-        // Cost the skip mb case
-        rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-      } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
-        if (RDCOST(x->rdmult, rate_y + rate_uv + rate_skip0, distortion2) <
-            RDCOST(x->rdmult, rate_skip1, total_sse)) {
-          // Add in the cost of the no skip flag.
-          rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
-        } else {
-          // FIXME(rbultje) make this work for splitmv also
-          rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-          distortion2 = total_sse;
-          assert(total_sse >= 0);
-          rate2 -= (rate_y + rate_uv);
-          this_skip2 = 1;
-          rate_y = 0;
-          rate_uv = 0;
-        }
-      } else {
-        // Add in the cost of the no skip flag.
-        rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
-      }
-
-      // Calculate the final RD estimate for this mode.
-      this_rd = RDCOST(x->rdmult, rate2, distortion2);
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-    } else {
       this_skip2 = mbmi->skip;
       this_rd = RDCOST(x->rdmult, rate2, distortion2);
       if (this_skip2) {
         rate_y = 0;
         rate_uv = 0;
       }
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-    }
-
-    if (ref_frame == INTRA_FRAME) {
-      // Keep record of best intra rd
-      if (this_rd < best_intra_rd) {
-        best_intra_rd = this_rd;
-        best_intra_mode = mbmi->mode;
-      }
-#if CONFIG_INTERINTRA
-    } else if (second_ref_frame == NONE_FRAME) {
-      if (this_rd < best_single_inter_rd) {
-        best_single_inter_rd = this_rd;
-        best_single_inter_ref = mbmi->ref_frame[0];
-      }
-#endif  // CONFIG_INTERINTRA
-    }
-
-    if (!disable_skip && ref_frame == INTRA_FRAME) {
-      for (i = 0; i < REFERENCE_MODES; ++i)
-        best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
     }
 
     // Did this mode help.. i.e. is it the new best mode
-    if (this_rd < best_rd || x->skip) {
+    if (this_rd < search_state.best_rd || x->skip) {
+      int mode_excluded = 0;
+      if (comp_pred) {
+        mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+      }
       if (!mode_excluded) {
         // Note index of best mode so far
-        best_mode_index = mode_index;
+        search_state.best_mode_index = mode_index;
 
         if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
         } else {
-          best_pred_sse = x->pred_sse[ref_frame];
+          search_state.best_pred_sse = x->pred_sse[ref_frame];
         }
 
         rd_cost->rate = rate2;
-#if CONFIG_SUPERTX
-        if (x->skip)
-          *returnrate_nocoef = rate2;
-        else
-          *returnrate_nocoef = rate2 - rate_y - rate_uv;
-        *returnrate_nocoef -= av1_cost_bit(
-            av1_get_skip_prob(cm, xd), disable_skip || skippable || this_skip2);
-        *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
-                                           mbmi->ref_frame[0] != INTRA_FRAME);
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-#if CONFIG_WARPED_MOTION
-        set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-#endif
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-        MODE_INFO *const mi = xd->mi[0];
-        const MOTION_MODE motion_allowed = motion_mode_allowed(
-#if CONFIG_GLOBAL_MOTION
-            0, xd->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-            xd,
-#endif
-            mi);
-        if (motion_allowed == WARPED_CAUSAL)
-          *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode];
-        else if (motion_allowed == OBMC_CAUSAL)
-          *returnrate_nocoef -= x->motion_mode_cost1[bsize][mbmi->motion_mode];
-#else
-        *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode];
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-#endif  // CONFIG_SUPERTX
         rd_cost->dist = distortion2;
         rd_cost->rdcost = this_rd;
-        best_rd = this_rd;
-        best_mbmode = *mbmi;
-        best_skip2 = this_skip2;
-        best_mode_skippable = skippable;
-        best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd),
-                                            this_skip2 || skippable);
-        best_rate_uv = rate_uv;
-#if CONFIG_VAR_TX
-        for (i = 0; i < MAX_MB_PLANE; ++i)
-          memcpy(ctx->blk_skip[i], x->blk_skip[i],
-                 sizeof(uint8_t) * ctx->num_4x4_blk);
-#endif  // CONFIG_VAR_TX
+        search_state.best_rd = this_rd;
+        search_state.best_mbmode = *mbmi;
+        search_state.best_skip2 = this_skip2;
+        search_state.best_mode_skippable = skippable;
+        search_state.best_rate_y =
+            rate_y +
+            x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
+        search_state.best_rate_uv = rate_uv;
+        memcpy(ctx->blk_skip, x->blk_skip,
+               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
       }
     }
 
@@ -11693,458 +10390,136 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2);
 
       if (!comp_pred) {
-        if (single_rd < best_pred_rd[SINGLE_REFERENCE])
-          best_pred_rd[SINGLE_REFERENCE] = single_rd;
+        if (single_rd < search_state.best_pred_rd[SINGLE_REFERENCE])
+          search_state.best_pred_rd[SINGLE_REFERENCE] = single_rd;
       } else {
-        if (single_rd < best_pred_rd[COMPOUND_REFERENCE])
-          best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+        if (single_rd < search_state.best_pred_rd[COMPOUND_REFERENCE])
+          search_state.best_pred_rd[COMPOUND_REFERENCE] = single_rd;
       }
-      if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
-        best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+      if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
+        search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
     }
 
-    if (x->skip && !comp_pred) break;
-  }
-
-  if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
-      ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
-        is_inter_mode(best_mbmode.mode)) ||
-       (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
-        !is_inter_mode(best_mbmode.mode)))) {
-    int skip_blk = 0;
-    RD_STATS rd_stats_y, rd_stats_uv;
-
-    x->use_default_inter_tx_type = 0;
-    x->use_default_intra_tx_type = 0;
-
-    *mbmi = best_mbmode;
-
-    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-
-    // Select prediction reference frames.
-    for (i = 0; i < MAX_MB_PLANE; i++) {
-      xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
-      if (has_second_ref(mbmi))
-        xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
-    }
+    if (sf->drop_ref) {
+      if (second_ref_frame == NONE_FRAME) {
+        const int idx = ref_frame - LAST_FRAME;
+        if (idx && distortion2 > search_state.dist_refs[idx]) {
+          search_state.dist_refs[idx] = distortion2;
+          search_state.dist_order_refs[idx] = ref_frame;
+        }
 
-#if CONFIG_COMPOUND_SINGLEREF
-    // Single ref compound mode
-    if (!has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode)) {
-      xd->block_refs[1] = xd->block_refs[0];
-      for (i = 0; i < MAX_MB_PLANE; i++)
-        xd->plane[i].pre[1] = xd->plane[i].pre[0];
-    }
-#endif  // CONFIG_COMPOUND_SINGLEREF
+        // Reach the last single ref prediction mode
+        if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) {
+          // bubble sort dist_refs and the order index
+          for (i = 0; i < REF_FRAMES; ++i) {
+            for (k = i + 1; k < REF_FRAMES; ++k) {
+              if (search_state.dist_refs[i] < search_state.dist_refs[k]) {
+                int64_t tmp_dist = search_state.dist_refs[i];
+                search_state.dist_refs[i] = search_state.dist_refs[k];
+                search_state.dist_refs[k] = tmp_dist;
+
+                int tmp_idx = search_state.dist_order_refs[i];
+                search_state.dist_order_refs[i] =
+                    search_state.dist_order_refs[k];
+                search_state.dist_order_refs[k] = tmp_idx;
+              }
+            }
+          }
 
-    if (is_inter_mode(mbmi->mode)) {
-      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
-#if CONFIG_MOTION_VAR
-      if (mbmi->motion_mode == OBMC_CAUSAL) {
-        av1_build_obmc_inter_prediction(
-            cm, xd, mi_row, mi_col, args.above_pred_buf, args.above_pred_stride,
-            args.left_pred_buf, args.left_pred_stride);
-      }
-#endif  // CONFIG_MOTION_VAR
-      av1_subtract_plane(x, bsize, 0);
-#if CONFIG_VAR_TX
-      if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
-        select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-        assert(rd_stats_y.rate != INT_MAX);
-      } else {
-        int idx, idy;
-        super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-        for (idy = 0; idy < xd->n8_h; ++idy)
-          for (idx = 0; idx < xd->n8_w; ++idx)
-            mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
-        memset(x->blk_skip[0], rd_stats_y.skip,
-               sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+          for (i = 0; i < REF_FRAMES; ++i) {
+            if (search_state.dist_refs[i] == -1) break;
+            search_state.num_available_refs = i;
+          }
+          search_state.num_available_refs++;
+        }
       }
-
-      inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-#else
-      super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-      super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-#endif  // CONFIG_VAR_TX
-    } else {
-      super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-      super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-    }
-
-    if (RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate,
-               (rd_stats_y.dist + rd_stats_uv.dist)) >
-        RDCOST(x->rdmult, 0, (rd_stats_y.sse + rd_stats_uv.sse))) {
-      skip_blk = 1;
-      rd_stats_y.rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-      rd_stats_uv.rate = 0;
-      rd_stats_y.dist = rd_stats_y.sse;
-      rd_stats_uv.dist = rd_stats_uv.sse;
-    } else {
-      skip_blk = 0;
-      rd_stats_y.rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
     }
 
-    if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) >
-        RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate,
-               (rd_stats_y.dist + rd_stats_uv.dist))) {
-#if CONFIG_VAR_TX
-      int idx, idy;
-#endif  // CONFIG_VAR_TX
-      best_mbmode.tx_type = mbmi->tx_type;
-      best_mbmode.tx_size = mbmi->tx_size;
-#if CONFIG_LGT_FROM_PRED
-      best_mbmode.use_lgt = mbmi->use_lgt;
-#endif
-#if CONFIG_VAR_TX
-      for (idy = 0; idy < xd->n8_h; ++idy)
-        for (idx = 0; idx < xd->n8_w; ++idx)
-          best_mbmode.inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
-
-      for (i = 0; i < MAX_MB_PLANE; ++i)
-        memcpy(ctx->blk_skip[i], x->blk_skip[i],
-               sizeof(uint8_t) * ctx->num_4x4_blk);
-
-      best_mbmode.min_tx_size = mbmi->min_tx_size;
-#endif  // CONFIG_VAR_TX
-      rd_cost->rate +=
-          (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv);
-      rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
-      rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
-      best_skip2 = skip_blk;
-    }
+    if (x->skip && !comp_pred) break;
   }
 
-  // Only try palette mode when the best mode so far is an intra mode.
-  if (try_palette && !is_inter_mode(best_mbmode.mode)) {
-    int rate2 = 0;
-#if CONFIG_SUPERTX
-    int best_rate_nocoef;
-#endif  // CONFIG_SUPERTX
-    int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd,
-            best_model_rd_palette = INT64_MAX;
-    int skippable = 0, rate_overhead_palette = 0;
-    RD_STATS rd_stats_y;
-    TX_SIZE uv_tx;
-    uint8_t *const best_palette_color_map =
-        x->palette_buffer->best_palette_color_map;
-    uint8_t *const color_map = xd->plane[0].color_index_map;
-    MB_MODE_INFO best_mbmi_palette = best_mbmode;
+  // In effect only when speed >= 2.
+  sf_refine_fast_tx_type_search(
+      cpi, x, mi_row, mi_col, rd_cost, bsize, ctx, search_state.best_mode_index,
+      &search_state.best_mbmode, yv12_mb, search_state.best_rate_y,
+      search_state.best_rate_uv, &search_state.best_skip2);
 
-    mbmi->mode = DC_PRED;
-    mbmi->uv_mode = UV_DC_PRED;
-    mbmi->ref_frame[0] = INTRA_FRAME;
-    mbmi->ref_frame[1] = NONE_FRAME;
-    rate_overhead_palette = rd_pick_palette_intra_sby(
-        cpi, x, bsize, palette_ctx, intra_mode_cost[DC_PRED],
-        &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
-        &best_model_rd_palette, NULL, NULL, NULL, NULL);
-    if (pmi->palette_size[0] == 0) goto PALETTE_EXIT;
-    memcpy(color_map, best_palette_color_map,
-           rows * cols * sizeof(best_palette_color_map[0]));
-    super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
-    if (rd_stats_y.rate == INT_MAX) goto PALETTE_EXIT;
-    uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
-                            [xd->plane[1].subsampling_y];
-    if (rate_uv_intra[uv_tx] == INT_MAX) {
-      choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx],
-                           &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
-                           &skip_uvs[uv_tx], &mode_uv[uv_tx]);
-      pmi_uv[uv_tx] = *pmi;
-#if CONFIG_EXT_INTRA
-      uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-      filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
-#endif  // CONFIG_FILTER_INTRA
-    }
-    mbmi->uv_mode = mode_uv[uv_tx];
-    pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
-    if (pmi->palette_size[1] > 0) {
-      memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
-             pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
-             2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
-    }
-#if CONFIG_EXT_INTRA
-    mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_FILTER_INTRA
-    mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
-        filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
-    if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
-      mbmi->filter_intra_mode_info.filter_intra_mode[1] =
-          filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
-    }
-#endif  // CONFIG_FILTER_INTRA
-    skippable = rd_stats_y.skip && skip_uvs[uv_tx];
-    distortion2 = rd_stats_y.dist + dist_uvs[uv_tx];
-    rate2 = rd_stats_y.rate + rate_overhead_palette + rate_uv_intra[uv_tx];
-    rate2 += ref_costs_single[INTRA_FRAME];
-
-    if (skippable) {
-      rate2 -= (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]);
-#if CONFIG_SUPERTX
-      best_rate_nocoef = rate2;
-#endif  // CONFIG_SUPERTX
-      rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-    } else {
-#if CONFIG_SUPERTX
-      best_rate_nocoef = rate2 - (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]);
-#endif  // CONFIG_SUPERTX
-      rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
-    }
-    this_rd = RDCOST(x->rdmult, rate2, distortion2);
-    if (this_rd < best_rd) {
-      best_mode_index = 3;
-      mbmi->mv[0].as_int = 0;
-      rd_cost->rate = rate2;
-#if CONFIG_SUPERTX
-      *returnrate_nocoef = best_rate_nocoef;
-#endif  // CONFIG_SUPERTX
-      rd_cost->dist = distortion2;
-      rd_cost->rdcost = this_rd;
-      best_rd = this_rd;
-      best_mbmode = *mbmi;
-      best_skip2 = 0;
-      best_mode_skippable = skippable;
-    }
-  }
-PALETTE_EXIT:
-
-#if CONFIG_FILTER_INTRA
-  // TODO(huisu): filter-intra is turned off in lossless mode for now to
-  // avoid a unit test failure
-  if (!xd->lossless[mbmi->segment_id] && pmi->palette_size[0] == 0 &&
-      !dc_skipped && best_mode_index >= 0 &&
-      best_intra_rd < (best_rd + (best_rd >> 3))) {
-    pick_filter_intra_interframe(
-        cpi, x, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly,
-        dist_uvs, skip_uvs, mode_uv, filter_intra_mode_info_uv,
-#if CONFIG_EXT_INTRA
-        uv_angle_delta,
-#endif  // CONFIG_EXT_INTRA
-        pmi_uv, palette_ctx, 0, ref_costs_single, &best_rd, &best_intra_rd,
-        &best_intra_mode, &best_mode_index, &best_skip2, &best_mode_skippable,
-#if CONFIG_SUPERTX
-        returnrate_nocoef,
-#endif  // CONFIG_SUPERTX
-        best_pred_rd, &best_mbmode, rd_cost);
-  }
-#endif  // CONFIG_FILTER_INTRA
-
-// The inter modes' rate costs are not calculated precisely in some cases.
-// Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
-// ZEROMV. Here, checks are added for those cases, and the mode decisions
-// are corrected.
-#if CONFIG_COMPOUND_SINGLEREF
-// NOTE: For SR_NEW_NEWMV, no need to check as the two mvs from the same ref
-//       are surely different from each other.
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  if (best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV) {
-    const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
-                                         best_mbmode.ref_frame[1] };
-    int comp_pred_mode = refs[1] > INTRA_FRAME;
-    int_mv zeromv[2];
-    const uint8_t rf_type = av1_ref_frame_type(best_mbmode.ref_frame);
-#if CONFIG_GLOBAL_MOTION
-    zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
-                                            cm->allow_high_precision_mv, bsize,
-                                            mi_col, mi_row, 0
-#if CONFIG_AMVR
-                                            ,
-                                            cm->cur_frame_mv_precision_level
-#endif
-                                            )
-                           .as_int;
-    zeromv[1].as_int =
-        comp_pred_mode
-            ? gm_get_motion_vector(&cm->global_motion[refs[1]],
-                                   cm->allow_high_precision_mv, bsize, mi_col,
-                                   mi_row, 0
-#if CONFIG_AMVR
-                                   ,
-                                   cm->cur_frame_mv_precision_level
-#endif
-                                   )
-                  .as_int
-            : 0;
-#else
-    zeromv[0].as_int = 0;
-    zeromv[1].as_int = 0;
-#endif  // CONFIG_GLOBAL_MOTION
-    if (!comp_pred_mode) {
-      int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
-                        ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
-                        : INT_MAX;
-
-      for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
-        int_mv cur_mv = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
-        if (cur_mv.as_int == best_mbmode.mv[0].as_int) {
-          best_mbmode.mode = NEARMV;
-          best_mbmode.ref_mv_idx = i;
-        }
-      }
-
-      if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int)
-        best_mbmode.mode = NEARESTMV;
-      else if (best_mbmode.mv[0].as_int == zeromv[0].as_int)
-        best_mbmode.mode = ZEROMV;
-    } else {
-      int_mv nearestmv[2];
-      int_mv nearmv[2];
-
-      if (mbmi_ext->ref_mv_count[rf_type] > 1) {
-        nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv;
-        nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv;
-      } else {
-        nearmv[0] = frame_mv[NEARMV][refs[0]];
-        nearmv[1] = frame_mv[NEARMV][refs[1]];
-      }
-      if (mbmi_ext->ref_mv_count[rf_type] >= 1) {
-        nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv;
-        nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv;
-      } else {
-        nearestmv[0] = frame_mv[NEARESTMV][refs[0]];
-        nearestmv[1] = frame_mv[NEARESTMV][refs[1]];
-      }
-
-      if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
-          nearestmv[1].as_int == best_mbmode.mv[1].as_int) {
-        best_mbmode.mode = NEAREST_NEARESTMV;
-      } else {
-        int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
-                          ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
-                          : INT_MAX;
-
-        for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
-          nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
-          nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
-
-          // Try switching to the NEAR_NEARMV mode
-          if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
-              nearmv[1].as_int == best_mbmode.mv[1].as_int) {
-            best_mbmode.mode = NEAR_NEARMV;
-            best_mbmode.ref_mv_idx = i;
-          }
-        }
+  // Only try palette mode when the best mode so far is an intra mode.
+  if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) {
+    search_palette_mode(cpi, x, rd_cost, ctx, bsize, mbmi, pmi,
+                        ref_costs_single, &search_state);
+  }
 
-        if (best_mbmode.mode == NEW_NEWMV &&
-            best_mbmode.mv[0].as_int == zeromv[0].as_int &&
-            best_mbmode.mv[1].as_int == zeromv[1].as_int)
-          best_mbmode.mode = ZERO_ZEROMV;
-      }
-    }
+  search_state.best_mbmode.skip_mode = 0;
+  if (cm->skip_mode_flag &&
+      !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+      is_comp_ref_allowed(bsize)) {
+    rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, mi_row, mi_col,
+                      yv12_mb);
   }
 
   // Make sure that the ref_mv_idx is only nonzero when we're
   // using a mode which can support ref_mv_idx
-  if (best_mbmode.ref_mv_idx != 0 &&
-#if CONFIG_COMPOUND_SINGLEREF
-      !(best_mbmode.mode == NEWMV || best_mbmode.mode == SR_NEW_NEWMV ||
-        best_mbmode.mode == NEW_NEWMV ||
-        have_nearmv_in_inter_mode(best_mbmode.mode)))
-#else   // !CONFIG_COMPOUND_SINGLEREF
-      !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV ||
-        have_nearmv_in_inter_mode(best_mbmode.mode)))
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  {
-    best_mbmode.ref_mv_idx = 0;
-  }
-
-  if (best_mbmode.ref_frame[0] > INTRA_FRAME &&
-      best_mbmode.ref_frame[1] <= INTRA_FRAME) {
-    int8_t ref_frame_type = av1_ref_frame_type(best_mbmode.ref_frame);
-    int16_t mode_ctx = mbmi_ext->mode_context[ref_frame_type];
-    if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
-      int_mv zeromv;
-#if CONFIG_GLOBAL_MOTION
-      const MV_REFERENCE_FRAME ref = best_mbmode.ref_frame[0];
-      zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ref],
-                                           cm->allow_high_precision_mv, bsize,
-                                           mi_col, mi_row, 0
-#if CONFIG_AMVR
-                                           ,
-                                           cm->cur_frame_mv_precision_level
-#endif
-                                           )
-                          .as_int;
-#else
-      zeromv.as_int = 0;
-#endif  // CONFIG_GLOBAL_MOTION
-      if (best_mbmode.mv[0].as_int == zeromv.as_int) {
-        best_mbmode.mode = ZEROMV;
-      }
-    }
+  if (search_state.best_mbmode.ref_mv_idx != 0 &&
+      !(search_state.best_mbmode.mode == NEWMV ||
+        search_state.best_mbmode.mode == NEW_NEWMV ||
+        have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) {
+    search_state.best_mbmode.ref_mv_idx = 0;
   }
 
-  if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
+  if (search_state.best_mode_index < 0 ||
+      search_state.best_rd >= best_rd_so_far) {
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;
     return;
   }
 
-  assert((cm->interp_filter == SWITCHABLE) ||
-         (cm->interp_filter ==
-          av1_extract_interp_filter(best_mbmode.interp_filters, 0)) ||
-         !is_inter_block(&best_mbmode));
-#if CONFIG_DUAL_FILTER
-  assert((cm->interp_filter == SWITCHABLE) ||
-         (cm->interp_filter ==
-          av1_extract_interp_filter(best_mbmode.interp_filters, 1)) ||
-         !is_inter_block(&best_mbmode));
-#endif  // CONFIG_DUAL_FILTER
+  assert(
+      (cm->interp_filter == SWITCHABLE) ||
+      (cm->interp_filter ==
+       av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 0)) ||
+      !is_inter_block(&search_state.best_mbmode));
+  assert(
+      (cm->interp_filter == SWITCHABLE) ||
+      (cm->interp_filter ==
+       av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 1)) ||
+      !is_inter_block(&search_state.best_mbmode));
 
   if (!cpi->rc.is_src_frame_alt_ref)
     av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
-                              sf->adaptive_rd_thresh, bsize, best_mode_index);
+                              sf->adaptive_rd_thresh, bsize,
+                              search_state.best_mode_index);
 
   // macroblock modes
-  *mbmi = best_mbmode;
-  x->skip |= best_skip2;
-
-// Note: this section is needed since the mode may have been forced to
-// ZEROMV by the all-zero mode handling of ref-mv.
-#if CONFIG_GLOBAL_MOTION
-  if (mbmi->mode == ZEROMV || mbmi->mode == ZERO_ZEROMV) {
-#if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR
-    // Correct the motion mode for ZEROMV
-    const MOTION_MODE last_motion_mode_allowed =
-        motion_mode_allowed(0, xd->global_motion,
-#if CONFIG_WARPED_MOTION
-                            xd,
-#endif
-                            xd->mi[0]);
-    if (mbmi->motion_mode > last_motion_mode_allowed)
-      mbmi->motion_mode = last_motion_mode_allowed;
-#endif  // CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR
-
-    // Correct the interpolation filter for ZEROMV
-    if (is_nontrans_global_motion(xd)) {
-      mbmi->interp_filters = av1_broadcast_interp_filter(
-          av1_unswitchable_filter(cm->interp_filter));
+  *mbmi = search_state.best_mbmode;
+  x->skip |= search_state.best_skip2;
+
+  // Note: this section is needed since the mode may have been forced to
+  // GLOBALMV by the all-zero mode handling of ref-mv.
+  if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) {
+    // Correct the interp filters for GLOBALMV
+    if (is_nontrans_global_motion(xd, xd->mi[0])) {
+      assert(mbmi->interp_filters ==
+             av1_broadcast_interp_filter(
+                 av1_unswitchable_filter(cm->interp_filter)));
     }
   }
-#endif  // CONFIG_GLOBAL_MOTION
-
-  for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
-    if (mbmi->mode != NEWMV)
-      mbmi->pred_mv[i].as_int = mbmi->mv[i].as_int;
-    else
-      mbmi->pred_mv[i].as_int = mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_int;
-  }
 
   for (i = 0; i < REFERENCE_MODES; ++i) {
-    if (best_pred_rd[i] == INT64_MAX)
-      best_pred_diff[i] = INT_MIN;
+    if (search_state.best_pred_rd[i] == INT64_MAX)
+      search_state.best_pred_diff[i] = INT_MIN;
     else
-      best_pred_diff[i] = best_rd - best_pred_rd[i];
+      search_state.best_pred_diff[i] =
+          search_state.best_rd - search_state.best_pred_rd[i];
   }
 
-  x->skip |= best_mode_skippable;
+  x->skip |= search_state.best_mode_skippable;
 
-  assert(best_mode_index >= 0);
+  assert(search_state.best_mode_index >= 0);
 
-  store_coding_context(x, ctx, best_mode_index, best_pred_diff,
-                       best_mode_skippable);
+  store_coding_context(x, ctx, search_state.best_mode_index,
+                       search_state.best_pred_diff,
+                       search_state.best_mode_skippable);
 
   if (pmi->palette_size[1] > 0) {
     assert(try_palette);
@@ -12160,18 +10535,14 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
                                         int64_t best_rd_so_far) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   unsigned char segment_id = mbmi->segment_id;
   const int comp_pred = 0;
   int i;
   int64_t best_pred_diff[REFERENCE_MODES];
-  unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
-#if CONFIG_EXT_COMP_REFS
-  unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME];
-#else
-  unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_EXT_COMP_REFS
-  aom_prob comp_mode_p;
+  unsigned int ref_costs_single[REF_FRAMES];
+  unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
   InterpFilter best_filter = SWITCHABLE;
   int64_t this_rd = INT64_MAX;
   int rate2 = 0;
@@ -12179,12 +10550,13 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
   (void)mi_row;
   (void)mi_col;
 
-  estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
-                           &comp_mode_p);
+  av1_collect_neighbors_ref_counts(xd);
 
-  for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
-  for (i = LAST_FRAME; i < TOTAL_REFS_PER_FRAME; ++i)
-    x->pred_mv_sad[i] = INT_MAX;
+  estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+                           ref_costs_comp);
+
+  for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+  for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX;
 
   rd_cost->rate = INT_MAX;
 
@@ -12192,58 +10564,35 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
 
   mbmi->palette_mode_info.palette_size[0] = 0;
   mbmi->palette_mode_info.palette_size[1] = 0;
-
-#if CONFIG_FILTER_INTRA
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
-  mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
-#endif  // CONFIG_FILTER_INTRA
-  mbmi->mode = ZEROMV;
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  mbmi->mode = GLOBALMV;
   mbmi->motion_mode = SIMPLE_TRANSLATION;
   mbmi->uv_mode = UV_DC_PRED;
-  mbmi->ref_frame[0] = LAST_FRAME;
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME))
+    mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+  else
+    mbmi->ref_frame[0] = LAST_FRAME;
   mbmi->ref_frame[1] = NONE_FRAME;
-#if CONFIG_GLOBAL_MOTION
   mbmi->mv[0].as_int =
       gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
-                           cm->allow_high_precision_mv, bsize, mi_col, mi_row, 0
-#if CONFIG_AMVR
-                           ,
-                           cm->cur_frame_mv_precision_level
-#endif
-                           )
+                           cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+                           cm->cur_frame_force_integer_mv)
           .as_int;
-#else   // CONFIG_GLOBAL_MOTION
-  mbmi->mv[0].as_int = 0;
-#endif  // CONFIG_GLOBAL_MOTION
   mbmi->tx_size = max_txsize_lookup[bsize];
   x->skip = 1;
 
   mbmi->ref_mv_idx = 0;
-  mbmi->pred_mv[0].as_int = 0;
-#if CONFIG_LGT_FROM_PRED
-  mbmi->use_lgt = 0;
-#endif
 
   mbmi->motion_mode = SIMPLE_TRANSLATION;
-#if CONFIG_MOTION_VAR
   av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
-#endif
-#if CONFIG_WARPED_MOTION
   if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
     int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
-#if WARPED_MOTION_SORT_SAMPLES
-    int pts_mv[SAMPLES_ARRAY_SIZE];
-    mbmi->num_proj_ref[0] =
-        findSamples(cm, xd, mi_row, mi_col, pts, pts_inref, pts_mv);
-    // Rank the samples by motion vector difference
-    if (mbmi->num_proj_ref[0] > 1)
-      mbmi->num_proj_ref[0] = sortSamples(pts_mv, &mbmi->mv[0].as_mv, pts,
-                                          pts_inref, mbmi->num_proj_ref[0]);
-#else
     mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
-#endif  // WARPED_MOTION_SORT_SAMPLES
+    // Select the samples according to motion vector difference
+    if (mbmi->num_proj_ref[0] > 1)
+      mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+                                            mbmi->num_proj_ref[0], bsize);
   }
-#endif
 
   set_default_interp_filters(mbmi, cm->interp_filter);
 
@@ -12270,7 +10619,7 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
   rate2 += av1_get_switchable_rate(cm, x, xd);
 
   if (cm->reference_mode == REFERENCE_MODE_SELECT)
-    rate2 += av1_cost_bit(comp_mode_p, comp_pred);
+    rate2 += comp_inter_cost[comp_pred];
 
   // Estimate the reference frame signaling cost and add it
   // to the rolling cost variable.
@@ -12292,15 +10641,13 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
           av1_extract_interp_filter(mbmi->interp_filters, 0)));
 
   av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
-                            cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
+                            cpi->sf.adaptive_rd_thresh, bsize, THR_GLOBALMV);
 
   av1_zero(best_pred_diff);
 
-  store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, 0);
+  store_coding_context(x, ctx, THR_GLOBALMV, best_pred_diff, 0);
 }
 
-#if CONFIG_MOTION_VAR
-
 struct calc_target_weighted_pred_ctxt {
   const MACROBLOCK *x;
   const uint8_t *tmp;
@@ -12308,28 +10655,22 @@ struct calc_target_weighted_pred_ctxt {
   int overlap;
 };
 
-static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd,
-                                                   int rel_mi_col,
-                                                   uint8_t nb_mi_width,
-                                                   MODE_INFO *nb_mi,
-                                                   void *fun_ctxt) {
+static INLINE void calc_target_weighted_pred_above(
+    MACROBLOCKD *xd, int rel_mi_col, uint8_t nb_mi_width, MB_MODE_INFO *nb_mi,
+    void *fun_ctxt, const int num_planes) {
   (void)nb_mi;
+  (void)num_planes;
 
   struct calc_target_weighted_pred_ctxt *ctxt =
       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
 
-#if CONFIG_HIGHBITDEPTH
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-#else
-  const int is_hbd = 0;
-#endif  // CONFIG_HIGHBITDEPTH
-
   const int bw = xd->n8_w << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
   int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE);
   const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
 
   if (!is_hbd) {
     for (int row = 0; row < ctxt->overlap; ++row) {
@@ -12343,7 +10684,6 @@ static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd,
       mask += bw;
       tmp += ctxt->tmp_stride;
     }
-#if CONFIG_HIGHBITDEPTH
   } else {
     const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
 
@@ -12358,32 +10698,25 @@ static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd,
       mask += bw;
       tmp16 += ctxt->tmp_stride;
     }
-#endif  // CONFIG_HIGHBITDEPTH
   }
 }
 
-static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd,
-                                                  int rel_mi_row,
-                                                  uint8_t nb_mi_height,
-                                                  MODE_INFO *nb_mi,
-                                                  void *fun_ctxt) {
+static INLINE void calc_target_weighted_pred_left(
+    MACROBLOCKD *xd, int rel_mi_row, uint8_t nb_mi_height, MB_MODE_INFO *nb_mi,
+    void *fun_ctxt, const int num_planes) {
   (void)nb_mi;
+  (void)num_planes;
 
   struct calc_target_weighted_pred_ctxt *ctxt =
       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
 
-#if CONFIG_HIGHBITDEPTH
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-#else
-  const int is_hbd = 0;
-#endif  // CONFIG_HIGHBITDEPTH
-
   const int bw = xd->n8_w << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
   int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
   const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
 
   if (!is_hbd) {
     for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
@@ -12398,7 +10731,6 @@ static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd,
       mask += bw;
       tmp += ctxt->tmp_stride;
     }
-#if CONFIG_HIGHBITDEPTH
   } else {
     const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
 
@@ -12414,7 +10746,6 @@ static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd,
       mask += bw;
       tmp16 += ctxt->tmp_stride;
     }
-#endif  // CONFIG_HIGHBITDEPTH
   }
 }
 
@@ -12461,18 +10792,14 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
                                       int mi_col, const uint8_t *above,
                                       int above_stride, const uint8_t *left,
                                       int left_stride) {
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   const int bw = xd->n8_w << MI_SIZE_LOG2;
   const int bh = xd->n8_h << MI_SIZE_LOG2;
   int32_t *mask_buf = x->mask_buf;
   int32_t *wsrc_buf = x->wsrc_buf;
 
-  const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
-#if CONFIG_HIGHBITDEPTH
   const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-#else
-  const int is_hbd = 0;
-#endif  // CONFIG_HIGHBITDEPTH
+  const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
 
   // plane 0 should not be subsampled
   assert(xd->plane[0].subsampling_x == 0);
@@ -12488,7 +10815,7 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
     struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride,
                                                    overlap };
     foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, mi_col,
-                                  max_neighbor_obmc[b_width_log2_lookup[bsize]],
+                                  max_neighbor_obmc[mi_size_wide_log2[bsize]],
                                   calc_target_weighted_pred_above, &ctxt);
   }
 
@@ -12504,7 +10831,7 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
     struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride,
                                                    overlap };
     foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, mi_row,
-                                 max_neighbor_obmc[b_height_log2_lookup[bsize]],
+                                 max_neighbor_obmc[mi_size_high_log2[bsize]],
                                  calc_target_weighted_pred_left, &ctxt);
   }
 
@@ -12518,7 +10845,6 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
       wsrc_buf += bw;
       src += x->plane[0].src.stride;
     }
-#if CONFIG_HIGHBITDEPTH
   } else {
     const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
 
@@ -12529,462 +10855,5 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
       wsrc_buf += bw;
       src += x->plane[0].src.stride;
     }
-#endif  // CONFIG_HIGHBITDEPTH
-  }
-}
-
-#if CONFIG_NCOBMC
-void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x,
-                         int mi_row, int mi_col) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  MB_MODE_INFO backup_mbmi;
-  BLOCK_SIZE bsize = mbmi->sb_type;
-  int ref, skip_blk, backup_skip = x->skip;
-  int64_t rd_causal;
-  RD_STATS rd_stats_y, rd_stats_uv;
-  int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
-  int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-
-  // Recompute the best causal predictor and rd
-  mbmi->motion_mode = SIMPLE_TRANSLATION;
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
-    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
-    assert(cfg != NULL);
-    av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
-                         &xd->block_refs[ref]->sf);
-  }
-  av1_setup_dst_planes(x->e_mbd.plane, bsize,
-                       get_frame_new_buffer(&cpi->common), mi_row, mi_col);
-
-  av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
-
-  av1_subtract_plane(x, bsize, 0);
-#if CONFIG_VAR_TX
-  if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
-    select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-  } else {
-    int idx, idy;
-    super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-    for (idy = 0; idy < xd->n8_h; ++idy)
-      for (idx = 0; idx < xd->n8_w; ++idx)
-        mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
-    memset(x->blk_skip[0], rd_stats_y.skip,
-           sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
-  }
-  inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-#else
-  super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-  super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-#endif
-  assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
-  if (rd_stats_y.skip && rd_stats_uv.skip) {
-    rd_stats_y.rate = rate_skip1;
-    rd_stats_uv.rate = 0;
-    rd_stats_y.dist = rd_stats_y.sse;
-    rd_stats_uv.dist = rd_stats_uv.sse;
-    skip_blk = 0;
-  } else if (RDCOST(x->rdmult,
-                    (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
-                    (rd_stats_y.dist + rd_stats_uv.dist)) >
-             RDCOST(x->rdmult, rate_skip1,
-                    (rd_stats_y.sse + rd_stats_uv.sse))) {
-    rd_stats_y.rate = rate_skip1;
-    rd_stats_uv.rate = 0;
-    rd_stats_y.dist = rd_stats_y.sse;
-    rd_stats_uv.dist = rd_stats_uv.sse;
-    skip_blk = 1;
-  } else {
-    rd_stats_y.rate += rate_skip0;
-    skip_blk = 0;
-  }
-  backup_skip = skip_blk;
-  backup_mbmi = *mbmi;
-  rd_causal = RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate),
-                     (rd_stats_y.dist + rd_stats_uv.dist));
-  rd_causal +=
-      RDCOST(x->rdmult, av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0);
-
-  // Check non-causal mode
-  mbmi->motion_mode = OBMC_CAUSAL;
-  av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-
-  av1_subtract_plane(x, bsize, 0);
-#if CONFIG_VAR_TX
-  if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
-    select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-  } else {
-    int idx, idy;
-    super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-    for (idy = 0; idy < xd->n8_h; ++idy)
-      for (idx = 0; idx < xd->n8_w; ++idx)
-        mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
-    memset(x->blk_skip[0], rd_stats_y.skip,
-           sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
-  }
-  inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-#else
-  super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-  super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-#endif
-  assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
-  if (rd_stats_y.skip && rd_stats_uv.skip) {
-    rd_stats_y.rate = rate_skip1;
-    rd_stats_uv.rate = 0;
-    rd_stats_y.dist = rd_stats_y.sse;
-    rd_stats_uv.dist = rd_stats_uv.sse;
-    skip_blk = 0;
-  } else if (RDCOST(x->rdmult,
-                    (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
-                    (rd_stats_y.dist + rd_stats_uv.dist)) >
-             RDCOST(x->rdmult, rate_skip1,
-                    (rd_stats_y.sse + rd_stats_uv.sse))) {
-    rd_stats_y.rate = rate_skip1;
-    rd_stats_uv.rate = 0;
-    rd_stats_y.dist = rd_stats_y.sse;
-    rd_stats_uv.dist = rd_stats_uv.sse;
-    skip_blk = 1;
-  } else {
-    rd_stats_y.rate += rate_skip0;
-    skip_blk = 0;
-  }
-
-  if (rd_causal >
-      RDCOST(x->rdmult,
-             rd_stats_y.rate + rd_stats_uv.rate +
-                 av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1),
-             (rd_stats_y.dist + rd_stats_uv.dist))) {
-    x->skip = skip_blk;
-  } else {
-    *mbmi = backup_mbmi;
-    x->skip = backup_skip;
-  }
-}
-#endif  // CONFIG_NCOBMC
-
-int64_t get_prediction_rd_cost(const struct AV1_COMP *cpi, struct macroblock *x,
-                               int mi_row, int mi_col, int *skip_blk,
-                               MB_MODE_INFO *backup_mbmi) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  BLOCK_SIZE bsize = mbmi->sb_type;
-#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
-  const MOTION_MODE motion_allowed = motion_mode_allowed(
-#if CONFIG_GLOBAL_MOTION
-      0, xd->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-      xd,
-#endif
-      xd->mi[0]);
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
-  RD_STATS rd_stats_y, rd_stats_uv;
-  int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
-  int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
-  int64_t this_rd;
-  int ref;
-
-#if CONFIG_CB4X4
-  x->skip_chroma_rd =
-      !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
-                           xd->plane[1].subsampling_y);
-#endif
-
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
-    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
-    assert(cfg != NULL);
-    av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
-                         &xd->block_refs[ref]->sf);
-  }
-  av1_setup_dst_planes(x->e_mbd.plane, bsize,
-                       get_frame_new_buffer(&cpi->common), mi_row, mi_col);
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT)
-#endif
-    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
-
-#if CONFIG_MOTION_VAR
-  if (mbmi->motion_mode == OBMC_CAUSAL) {
-#if CONFIG_NCOBMC
-    av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-#else
-    av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-#endif
-  }
-#endif  // CONFIG_MOTION_VAR
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT)
-    for (int plane = 0; plane < MAX_MB_PLANE; ++plane)
-      get_pred_from_intrpl_buf(xd, mi_row, mi_col, bsize, plane);
-#endif
-  av1_subtract_plane(x, bsize, 0);
-
-#if CONFIG_VAR_TX
-  if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
-    select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-  } else {
-    int idx, idy;
-    super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-    for (idy = 0; idy < xd->n8_h; ++idy)
-      for (idx = 0; idx < xd->n8_w; ++idx)
-        mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
-    memset(x->blk_skip[0], rd_stats_y.skip,
-           sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
-  }
-  inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-#else
-  super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
-  super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
-#endif
-  assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
-
-  if (rd_stats_y.skip && rd_stats_uv.skip) {
-    rd_stats_y.rate = rate_skip1;
-    rd_stats_uv.rate = 0;
-    rd_stats_y.dist = rd_stats_y.sse;
-    rd_stats_uv.dist = rd_stats_uv.sse;
-    *skip_blk = 1;
-  } else if (RDCOST(x->rdmult,
-                    (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
-                    (rd_stats_y.dist + rd_stats_uv.dist)) >
-             RDCOST(x->rdmult, rate_skip1,
-                    (rd_stats_y.sse + rd_stats_uv.sse))) {
-    rd_stats_y.rate = rate_skip1;
-    rd_stats_uv.rate = 0;
-    rd_stats_y.dist = rd_stats_y.sse;
-    rd_stats_uv.dist = rd_stats_uv.sse;
-    *skip_blk = 1;
-  } else {
-    rd_stats_y.rate += rate_skip0;
-    *skip_blk = 0;
-  }
-
-  if (backup_mbmi) *backup_mbmi = *mbmi;
-
-  this_rd = RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate),
-                   (rd_stats_y.dist + rd_stats_uv.dist));
-#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
-  if (motion_allowed == NCOBMC_ADAPT_WEIGHT) {
-    assert(mbmi->motion_mode <= NCOBMC_ADAPT_WEIGHT);
-    this_rd +=
-        RDCOST(x->rdmult, x->motion_mode_cost2[bsize][mbmi->motion_mode], 0);
-  } else if (motion_allowed == OBMC_CAUSAL) {
-    assert(mbmi->motion_mode <= OBMC_CAUSAL);
-    this_rd +=
-        RDCOST(x->rdmult, x->motion_mode_cost1[bsize][mbmi->motion_mode], 0);
-  } else {
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
-    this_rd +=
-        RDCOST(x->rdmult, x->motion_mode_cost[bsize][mbmi->motion_mode], 0);
-#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
-  }
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
-  return this_rd;
-}
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-void av1_check_ncobmc_adapt_weight_rd(const struct AV1_COMP *cpi,
-                                      struct macroblock *x, int mi_row,
-                                      int mi_col) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  BLOCK_SIZE bsize = mbmi->sb_type;
-#if CONFIG_VAR_TX
-  const int n4 = bsize_to_num_blk(bsize);
-  uint8_t st_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
-  uint8_t obmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
-  uint8_t ncobmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
-#endif
-  MB_MODE_INFO st_mbmi, obmc_mbmi, ncobmc_mbmi;
-  int st_skip, obmc_skip, ncobmc_skip;
-  int64_t st_rd, obmc_rd, ncobmc_rd;
-#if CONFIG_WARPED_MOTION
-  const AV1_COMMON *const cm = &cpi->common;
-  const int is_warp_motion = mbmi->motion_mode == WARPED_CAUSAL;
-  const int rs = RDCOST(x->rdmult, av1_get_switchable_rate(cm, x, xd), 0);
-  MB_MODE_INFO warp_mbmi;
-  int64_t warp_rd;
-  int warp_skip;
-#endif
-
-  // Recompute the rd for the motion mode decided in rd loop
-  mbmi->motion_mode = SIMPLE_TRANSLATION;
-  st_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &st_skip, &st_mbmi);
-#if CONFIG_WARPED_MOTION
-  st_rd += rs;
-#endif
-#if CONFIG_VAR_TX
-  memcpy(st_blk_skip, x->blk_skip[0], sizeof(st_blk_skip[0]) * n4);
-#endif
-
-  mbmi->motion_mode = OBMC_CAUSAL;
-  obmc_rd =
-      get_prediction_rd_cost(cpi, x, mi_row, mi_col, &obmc_skip, &obmc_mbmi);
-#if CONFIG_WARPED_MOTION
-  obmc_rd += rs;
-#endif
-#if CONFIG_VAR_TX
-  memcpy(obmc_blk_skip, x->blk_skip[0], sizeof(obmc_blk_skip[0]) * n4);
-#endif
-
-  // Compute the rd cost for ncobmc adaptive weight
-  mbmi->motion_mode = NCOBMC_ADAPT_WEIGHT;
-  ncobmc_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &ncobmc_skip,
-                                     &ncobmc_mbmi);
-#if CONFIG_WARPED_MOTION
-  ncobmc_rd += rs;
-#endif
-  // Calculate the ncobmc mode costs
-  {
-    ADAPT_OVERLAP_BLOCK aob = adapt_overlap_block_lookup[bsize];
-    ncobmc_rd +=
-        RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[0]], 0);
-    if (mi_size_wide[bsize] != mi_size_high[bsize])
-      ncobmc_rd +=
-          RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[1]], 0);
-  }
-#if CONFIG_VAR_TX
-  memcpy(ncobmc_blk_skip, x->blk_skip[0], sizeof(ncobmc_blk_skip[0]) * n4);
-#endif
-
-#if CONFIG_WARPED_MOTION
-  if (is_warp_motion) {
-    mbmi->motion_mode = WARPED_CAUSAL;
-    warp_rd =
-        get_prediction_rd_cost(cpi, x, mi_row, mi_col, &warp_skip, &warp_mbmi);
-  } else {
-    warp_rd = INT64_MAX;
-  }
-#endif
-
-#if CONFIG_WARPED_MOTION
-  if (AOMMIN(ncobmc_rd, warp_rd) < AOMMIN(st_rd, obmc_rd)) {
-    if (ncobmc_rd < warp_rd) {
-      x->skip = ncobmc_skip;
-      *mbmi = ncobmc_mbmi;
-#if CONFIG_VAR_TX
-      memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4);
-#endif
-    } else {
-      x->skip = warp_skip;
-      *mbmi = warp_mbmi;
-    }
-#else
-  if (ncobmc_rd < AOMMIN(st_rd, obmc_rd)) {
-    x->skip = ncobmc_skip;
-    *mbmi = ncobmc_mbmi;
-#if CONFIG_VAR_TX
-    memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4);
-#endif
-#endif  // CONFIG_WARPED_MOTION
-  } else {
-    if (obmc_rd < st_rd) {
-      *mbmi = obmc_mbmi;
-      x->skip = obmc_skip;
-#if CONFIG_VAR_TX
-      memcpy(x->blk_skip[0], obmc_blk_skip, sizeof(obmc_blk_skip[0]) * n4);
-#endif
-    } else {
-      *mbmi = st_mbmi;
-      x->skip = st_skip;
-#if CONFIG_VAR_TX
-      memcpy(x->blk_skip[0], st_blk_skip, sizeof(st_blk_skip[0]) * n4);
-#endif
-    }
-  }
-}
-
-int64_t get_ncobmc_error(MACROBLOCKD *xd, int pxl_row, int pxl_col,
-                         BLOCK_SIZE bsize, int plane, struct buf_2d *src) {
-  const int wide = AOMMIN(mi_size_wide[bsize] * MI_SIZE,
-                          (xd->sb_mi_bd.mi_col_end + 1) * MI_SIZE - pxl_col);
-  const int high = AOMMIN(mi_size_high[bsize] * MI_SIZE,
-                          (xd->sb_mi_bd.mi_row_end + 1) * MI_SIZE - pxl_row);
-  const int ss_x = xd->plane[plane].subsampling_x;
-  const int ss_y = xd->plane[plane].subsampling_y;
-  int row_offset = (pxl_row - xd->sb_mi_bd.mi_row_begin * MI_SIZE) >> ss_y;
-  int col_offset = (pxl_col - xd->sb_mi_bd.mi_col_begin * MI_SIZE) >> ss_x;
-  int dst_stride = xd->ncobmc_pred_buf_stride[plane];
-  int dst_offset = row_offset * dst_stride + col_offset;
-  int src_stride = src->stride;
-
-  int r, c;
-  int64_t tmp, error = 0;
-
-  for (r = 0; r < (high >> ss_y); ++r) {
-    for (c = 0; c < (wide >> ss_x); ++c) {
-      tmp = xd->ncobmc_pred_buf[plane][r * dst_stride + c + dst_offset] -
-            src->buf[r * src_stride + c];
-      error += tmp * tmp;
-    }
-  }
-  return error;
-}
-
-int get_ncobmc_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                    MACROBLOCKD *xd, int mi_row, int mi_col, int bsize) {
-  const AV1_COMMON *const cm = &cpi->common;
-  uint8_t *pred_buf[4][MAX_MB_PLANE];
-
-  // TODO(weitinglin): stride size needs to be fixed for high-bit depth
-  int pred_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-
-  // target block in pxl
-  int pxl_row = mi_row << MI_SIZE_LOG2;
-  int pxl_col = mi_col << MI_SIZE_LOG2;
-  int64_t error, best_error = INT64_MAX;
-  int plane, tmp_mode, best_mode = 0;
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    int len = sizeof(uint16_t);
-    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE,
-                            len);
-    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE,
-                            len);
-    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE,
-                            len);
-    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE,
-                            len);
-  } else {
-#endif  // CONFIG_HIGHBITDEPTH
-    ASSIGN_ALIGNED_PTRS(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE);
-    ASSIGN_ALIGNED_PTRS(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE);
-    ASSIGN_ALIGNED_PTRS(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE);
-    ASSIGN_ALIGNED_PTRS(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE);
-#if CONFIG_HIGHBITDEPTH
-  }
-#endif
-
-  av1_get_ext_blk_preds(cm, xd, bsize, mi_row, mi_col, pred_buf, pred_stride);
-  av1_get_ori_blk_pred(cm, xd, bsize, mi_row, mi_col, pred_buf[3], pred_stride);
-
-  for (tmp_mode = 0; tmp_mode < MAX_NCOBMC_MODES; ++tmp_mode) {
-    error = 0;
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf,
-                               pred_stride, tmp_mode);
-      error += get_ncobmc_error(xd, pxl_row, pxl_col, bsize, plane,
-                                &x->plane[plane].src);
-    }
-    if (error < best_error) {
-      best_mode = tmp_mode;
-      best_error = error;
-    }
-  }
-
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf,
-                             pred_stride, best_mode);
   }
-
-  return best_mode;
 }
-
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
index dbc7527fb..1fa3d68ce 100644
--- a/third_party/aom/av1/encoder/rdopt.h
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -13,16 +13,20 @@
 #define AV1_ENCODER_RDOPT_H_
 
 #include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
 
 #include "av1/encoder/block.h"
 #include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#define MAX_REF_MV_SERCH 3
+
 struct TileInfo;
-struct AV1_COMP;
 struct macroblock;
 struct RD_STATS;
 
@@ -35,7 +39,6 @@ static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
   (void)tx_size;
   rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
 
-#if CONFIG_VAR_TX
   {
     const int txb_h = tx_size_high_unit[tx_size];
     const int txb_w = tx_size_wide_unit[tx_size];
@@ -48,113 +51,86 @@ static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
   }
   assert(blk_row < TXB_COEFF_COST_MAP_SIZE);
   assert(blk_col < TXB_COEFF_COST_MAP_SIZE);
-#endif
 }
 #endif
 
-typedef enum OUTPUT_STATUS {
-  OUTPUT_HAS_PREDICTED_PIXELS,
-  OUTPUT_HAS_DECODED_PIXELS
-} OUTPUT_STATUS;
-
 // Returns the number of colors in 'src'.
-int av1_count_colors(const uint8_t *src, int stride, int rows, int cols);
-#if CONFIG_HIGHBITDEPTH
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                     int *val_count);
 // Same as av1_count_colors(), but for high-bitdepth mode.
 int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
-                            int bit_depth);
-#endif  // CONFIG_HIGHBITDEPTH
-
-void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                    BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
-                    TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,
-                    OUTPUT_STATUS output_status);
+                            int bit_depth, int *val_count);
 
 #if CONFIG_DIST_8X8
-int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
+int64_t av1_dist_8x8(const struct AV1_COMP *const cpi, const MACROBLOCK *x,
                      const uint8_t *src, int src_stride, const uint8_t *dst,
                      int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
                      int bsh, int visible_w, int visible_h, int qindex);
 #endif
 
-#if !CONFIG_PVQ || CONFIG_VAR_TX
-int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
-                    int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                    const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a,
-                    const ENTROPY_CONTEXT *l, int use_fast_coef_costing);
+static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
+                                    int plane, TX_SIZE tx_size) {
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs =
+      &x->coeff_costs[txs_ctx][plane_type];
+  return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+}
+
+static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                  int plane, int blk_row, int blk_col,
+                                  int block, TX_SIZE tx_size,
+                                  const TXB_CTX *const txb_ctx,
+                                  int use_fast_coef_costing) {
+#if TXCOEFF_COST_TIMER
+  struct aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+#endif
+  (void)use_fast_coef_costing;
+  const int cost = av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block,
+                                       tx_size, txb_ctx);
+#if TXCOEFF_COST_TIMER
+  AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common;
+  aom_usec_timer_mark(&timer);
+  const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+  tmp_cm->txcoeff_cost_timer += elapsed_time;
+  ++tmp_cm->txcoeff_cost_count;
 #endif
+  return cost;
+}
+
 void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
-                               struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
-                               PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+                               int mi_row, int mi_col, struct RD_STATS *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                               int64_t best_rd);
 
-unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
+unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
                                            const struct buf_2d *ref,
                                            BLOCK_SIZE bs);
-#if CONFIG_HIGHBITDEPTH
-unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
+unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
                                                 const struct buf_2d *ref,
                                                 BLOCK_SIZE bs, int bd);
-#endif
 
 void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi,
                                struct TileDataEnc *tile_data,
                                struct macroblock *x, int mi_row, int mi_col,
-                               struct RD_STATS *rd_cost,
-#if CONFIG_SUPERTX
-                               int *returnrate_nocoef,
-#endif  // CONFIG_SUPERTX
-                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                               int64_t best_rd_so_far);
+                               struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
 
 void av1_rd_pick_inter_mode_sb_seg_skip(
     const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
     struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
     BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
 
-int av1_internal_image_edge(const struct AV1_COMP *cpi);
-int av1_active_h_edge(const struct AV1_COMP *cpi, int mi_row, int mi_step);
-int av1_active_v_edge(const struct AV1_COMP *cpi, int mi_col, int mi_step);
-int av1_active_edge_sb(const struct AV1_COMP *cpi, int mi_row, int mi_col);
-
-#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
-void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x,
-                         int mi_row, int mi_col);
-#endif  // CONFIG_MOTION_VAR && CONFIG_NCOBMC
-
-#if CONFIG_SUPERTX
-#if CONFIG_VAR_TX
-void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
-                       int blk_row, int blk_col, int plane, int block,
-                       int plane_bsize, const ENTROPY_CONTEXT *a,
-                       const ENTROPY_CONTEXT *l, RD_STATS *rd_stats);
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+#define INTER_MODE_RD_TEST 0
+void av1_inter_mode_data_init();
+void av1_inter_mode_data_fit(int rdmult);
+void av1_inter_mode_data_show(const AV1_COMMON *cm);
 #endif
 
-void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
-                                  int64_t *distortion, int *skippable,
-                                  int64_t *sse, int64_t ref_best_rd, int plane,
-                                  BLOCK_SIZE bsize, TX_SIZE tx_size,
-                                  int use_fast_coef_casting);
-#endif  // CONFIG_SUPERTX
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
-                     const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
-                     TX_SIZE tx_size, TX_TYPE tx_type);
-
-int64_t get_prediction_rd_cost(const struct AV1_COMP *cpi, struct macroblock *x,
-                               int mi_row, int mi_col, int *skip_blk,
-                               MB_MODE_INFO *backup_mbmi);
-
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-void av1_check_ncobmc_adapt_weight_rd(const struct AV1_COMP *cpi,
-                                      struct macroblock *x, int mi_row,
-                                      int mi_col);
-int get_ncobmc_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                    MACROBLOCKD *xd, int mi_row, int mi_col, int bsize);
-
-#endif
-
 #endif  // AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c
index 4f01fbba4..2e9102745 100644
--- a/third_party/aom/av1/encoder/segmentation.c
+++ b/third_party/aom/av1/encoder/segmentation.c
@@ -18,26 +18,21 @@
 
 #include "av1/encoder/cost.h"
 #include "av1/encoder/segmentation.h"
-#include "av1/encoder/subexp.h"
 
 void av1_enable_segmentation(struct segmentation *seg) {
   seg->enabled = 1;
   seg->update_map = 1;
   seg->update_data = 1;
+  seg->temporal_update = 0;
 }
 
 void av1_disable_segmentation(struct segmentation *seg) {
   seg->enabled = 0;
   seg->update_map = 0;
   seg->update_data = 0;
+  seg->temporal_update = 0;
 }
 
-void av1_set_segment_data(struct segmentation *seg, int8_t *feature_data,
-                          unsigned char abs_delta) {
-  seg->abs_delta = abs_delta;
-
-  memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
-}
 void av1_disable_segfeature(struct segmentation *seg, int segment_id,
                             SEG_LVL_FEATURES feature_id) {
   seg->feature_mask[segment_id] &= ~(1 << feature_id);
@@ -48,76 +43,8 @@ void av1_clear_segdata(struct segmentation *seg, int segment_id,
   seg->feature_data[segment_id][feature_id] = 0;
 }
 
-// Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs(unsigned *segcounts,
-                               aom_prob *segment_tree_probs,
-                               const aom_prob *cur_tree_probs,
-                               const int probwt) {
-  // Work out probabilities of each segment
-  const unsigned cc[4] = { segcounts[0] + segcounts[1],
-                           segcounts[2] + segcounts[3],
-                           segcounts[4] + segcounts[5],
-                           segcounts[6] + segcounts[7] };
-  const unsigned ccc[2] = { cc[0] + cc[1], cc[2] + cc[3] };
-  int i;
-
-  segment_tree_probs[0] = get_binary_prob(ccc[0], ccc[1]);
-  segment_tree_probs[1] = get_binary_prob(cc[0], cc[1]);
-  segment_tree_probs[2] = get_binary_prob(cc[2], cc[3]);
-  segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]);
-  segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]);
-  segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
-  segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
-
-  for (i = 0; i < 7; i++) {
-    const unsigned *ct =
-        i == 0 ? ccc : i < 3 ? cc + (i & 2) : segcounts + (i - 3) * 2;
-    av1_prob_diff_update_savings_search(ct, cur_tree_probs[i],
-                                        &segment_tree_probs[i],
-                                        DIFF_UPDATE_PROB, probwt);
-  }
-}
-
-// Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap(unsigned *segcounts, aom_prob *probs) {
-  const int c01 = segcounts[0] + segcounts[1];
-  const int c23 = segcounts[2] + segcounts[3];
-  const int c45 = segcounts[4] + segcounts[5];
-  const int c67 = segcounts[6] + segcounts[7];
-  const int c0123 = c01 + c23;
-  const int c4567 = c45 + c67;
-
-  // Cost the top node of the tree
-  int cost = c0123 * av1_cost_zero(probs[0]) + c4567 * av1_cost_one(probs[0]);
-
-  // Cost subsequent levels
-  if (c0123 > 0) {
-    cost += c01 * av1_cost_zero(probs[1]) + c23 * av1_cost_one(probs[1]);
-
-    if (c01 > 0)
-      cost += segcounts[0] * av1_cost_zero(probs[3]) +
-              segcounts[1] * av1_cost_one(probs[3]);
-    if (c23 > 0)
-      cost += segcounts[2] * av1_cost_zero(probs[4]) +
-              segcounts[3] * av1_cost_one(probs[4]);
-  }
-
-  if (c4567 > 0) {
-    cost += c45 * av1_cost_zero(probs[2]) + c67 * av1_cost_one(probs[2]);
-
-    if (c45 > 0)
-      cost += segcounts[4] * av1_cost_zero(probs[5]) +
-              segcounts[5] * av1_cost_one(probs[5]);
-    if (c67 > 0)
-      cost += segcounts[6] * av1_cost_zero(probs[6]) +
-              segcounts[7] * av1_cost_one(probs[6]);
-  }
-
-  return cost;
-}
-
 static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                       const TileInfo *tile, MODE_INFO **mi,
+                       const TileInfo *tile, MB_MODE_INFO **mi,
                        unsigned *no_pred_segcounts,
                        unsigned (*temporal_predictor_count)[2],
                        unsigned *t_unpred_seg_counts, int bw, int bh,
@@ -127,29 +54,27 @@ static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd,
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
   xd->mi = mi;
-  segment_id = xd->mi[0]->mbmi.segment_id;
+  segment_id = xd->mi[0]->segment_id;
 
-  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw,
-#if CONFIG_DEPENDENT_HORZTILES
-                 cm->dependent_horz_tiles,
-#endif  // CONFIG_DEPENDENT_HORZTILES
-                 cm->mi_rows, cm->mi_cols);
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
 
   // Count the number of hits on each segment with no prediction
   no_pred_segcounts[segment_id]++;
 
   // Temporal prediction not allowed on key frames
   if (cm->frame_type != KEY_FRAME) {
-    const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
     // Test to see if the segment id matches the predicted value.
     const int pred_segment_id =
-        get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col);
+        cm->last_frame_seg_map
+            ? get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col)
+            : 0;
     const int pred_flag = pred_segment_id == segment_id;
     const int pred_context = av1_get_pred_context_seg_id(xd);
 
     // Store the prediction status for this mb and update counts
     // as appropriate
-    xd->mi[0]->mbmi.seg_id_predicted = pred_flag;
+    xd->mi[0]->seg_id_predicted = pred_flag;
     temporal_predictor_count[pred_context][pred_flag]++;
 
     // Update the "unpredicted" segment count
@@ -158,21 +83,15 @@ static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd,
 }
 
 static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                          const TileInfo *tile, MODE_INFO **mi,
+                          const TileInfo *tile, MB_MODE_INFO **mi,
                           unsigned *no_pred_segcounts,
                           unsigned (*temporal_predictor_count)[2],
                           unsigned *t_unpred_seg_counts, int mi_row, int mi_col,
                           BLOCK_SIZE bsize) {
   const int mis = cm->mi_stride;
   const int bs = mi_size_wide[bsize], hbs = bs / 2;
-#if CONFIG_EXT_PARTITION_TYPES
   PARTITION_TYPE partition;
-#if CONFIG_EXT_PARTITION_TYPES_AB
   const int qbs = bs / 4;
-#endif  // CONFIG_EXT_PARTITION_TYPES_AB
-#else
-  int bw, bh;
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
@@ -181,7 +100,6 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
              no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, \
              (cs_bw), (cs_bh), mi_row + (cs_rowoff), mi_col + (cs_coloff));
 
-#if CONFIG_EXT_PARTITION_TYPES
   if (bsize == BLOCK_8X8)
     partition = PARTITION_NONE;
   else
@@ -196,28 +114,6 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
       CSEGS(hbs, bs, 0, 0);
       CSEGS(hbs, bs, 0, hbs);
       break;
-#if CONFIG_EXT_PARTITION_TYPES_AB
-    case PARTITION_HORZ_A:
-      CSEGS(bs, qbs, 0, 0);
-      CSEGS(bs, qbs, qbs, 0);
-      CSEGS(bs, hbs, hbs, 0);
-      break;
-    case PARTITION_HORZ_B:
-      CSEGS(bs, hbs, 0, 0);
-      CSEGS(bs, qbs, hbs, 0);
-      if (mi_row + 3 * qbs < cm->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0);
-      break;
-    case PARTITION_VERT_A:
-      CSEGS(qbs, bs, 0, 0);
-      CSEGS(qbs, bs, 0, qbs);
-      CSEGS(hbs, bs, 0, hbs);
-      break;
-    case PARTITION_VERT_B:
-      CSEGS(hbs, bs, 0, 0);
-      CSEGS(qbs, bs, 0, hbs);
-      if (mi_col + 3 * qbs < cm->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs);
-      break;
-#else
     case PARTITION_HORZ_A:
       CSEGS(hbs, hbs, 0, 0);
       CSEGS(hbs, hbs, 0, hbs);
@@ -238,14 +134,24 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
       CSEGS(hbs, hbs, 0, hbs);
       CSEGS(hbs, hbs, hbs, hbs);
       break;
-#endif
+    case PARTITION_HORZ_4:
+      CSEGS(bs, qbs, 0, 0);
+      CSEGS(bs, qbs, qbs, 0);
+      CSEGS(bs, qbs, 2 * qbs, 0);
+      if (mi_row + 3 * qbs < cm->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0);
+      break;
+
+    case PARTITION_VERT_4:
+      CSEGS(qbs, bs, 0, 0);
+      CSEGS(qbs, bs, 0, qbs);
+      CSEGS(qbs, bs, 0, 2 * qbs);
+      if (mi_col + 3 * qbs < cm->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs);
+      break;
+
     case PARTITION_SPLIT: {
-      const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
       int n;
 
-      assert(num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type] < bs &&
-             num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type] < bs);
-
       for (n = 0; n < 4; n++) {
         const int mi_dc = hbs * (n & 1);
         const int mi_dr = hbs * (n >> 1);
@@ -257,34 +163,6 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
     } break;
     default: assert(0);
   }
-#else
-  bw = mi_size_wide[mi[0]->mbmi.sb_type];
-  bh = mi_size_high[mi[0]->mbmi.sb_type];
-
-  if (bw == bs && bh == bs) {
-    CSEGS(bs, bs, 0, 0);
-  } else if (bw == bs && bh < bs) {
-    CSEGS(bs, hbs, 0, 0);
-    CSEGS(bs, hbs, hbs, 0);
-  } else if (bw < bs && bh == bs) {
-    CSEGS(hbs, bs, 0, 0);
-    CSEGS(hbs, bs, 0, hbs);
-  } else {
-    const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
-    int n;
-
-    assert(bw < bs && bh < bs);
-
-    for (n = 0; n < 4; n++) {
-      const int mi_dc = hbs * (n & 1);
-      const int mi_dr = hbs * (n >> 1);
-
-      count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts,
-                    temporal_predictor_count, t_unpred_seg_counts,
-                    mi_row + mi_dr, mi_col + mi_dc, subsize);
-    }
-  }
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 #undef CSEGS
 }
@@ -292,83 +170,58 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
 void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) {
   struct segmentation *seg = &cm->seg;
   struct segmentation_probs *segp = &cm->fc->seg;
-
   int no_pred_cost;
   int t_pred_cost = INT_MAX;
-
   int tile_col, tile_row, mi_row, mi_col;
-  const int probwt = cm->num_tg;
-
-  unsigned(*temporal_predictor_count)[2] = cm->counts.seg.pred;
-  unsigned *no_pred_segcounts = cm->counts.seg.tree_total;
-  unsigned *t_unpred_seg_counts = cm->counts.seg.tree_mispred;
-
-  aom_prob no_pred_tree[SEG_TREE_PROBS];
-  aom_prob t_pred_tree[SEG_TREE_PROBS];
-#if !CONFIG_NEW_MULTISYMBOL
-  aom_prob t_nopred_prob[PREDICTION_PROBS];
-#endif
-
+  unsigned temporal_predictor_count[SEG_TEMPORAL_PRED_CTXS][2] = { { 0 } };
+  unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 };
+  unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
   (void)xd;
 
-  // We are about to recompute all the segment counts, so zero the accumulators.
-  av1_zero(cm->counts.seg);
-
   // First of all generate stats regarding how well the last segment map
   // predicts this one
   for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
     TileInfo tile_info;
     av1_tile_set_row(&tile_info, cm, tile_row);
     for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
-      MODE_INFO **mi_ptr;
+      MB_MODE_INFO **mi_ptr;
       av1_tile_set_col(&tile_info, cm, tile_col);
-#if CONFIG_DEPENDENT_HORZTILES
-      av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col);
-#endif
       mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride +
                tile_info.mi_col_start;
       for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
-           mi_row += cm->mib_size, mi_ptr += cm->mib_size * cm->mi_stride) {
-        MODE_INFO **mi = mi_ptr;
+           mi_row += cm->seq_params.mib_size,
+          mi_ptr += cm->seq_params.mib_size * cm->mi_stride) {
+        MB_MODE_INFO **mi = mi_ptr;
         for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-             mi_col += cm->mib_size, mi += cm->mib_size) {
+             mi_col += cm->seq_params.mib_size, mi += cm->seq_params.mib_size) {
           count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
                         temporal_predictor_count, t_unpred_seg_counts, mi_row,
-                        mi_col, cm->sb_size);
+                        mi_col, cm->seq_params.sb_size);
         }
       }
     }
   }
 
-  // Work out probability tree for coding segments without prediction
-  // and the cost.
-  calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs, probwt);
-  no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree);
-
-  // Key frames cannot use temporal prediction
-  if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
-    // Work out probability tree for coding those segments not
-    // predicted using the temporal method and the cost.
-    calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs,
-                       probwt);
-    t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree);
-#if !CONFIG_NEW_MULTISYMBOL
-    // Add in the cost of the signaling for each prediction context.
-    int i;
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      const int count0 = temporal_predictor_count[i][0];
-      const int count1 = temporal_predictor_count[i][1];
-
-      t_nopred_prob[i] = get_binary_prob(count0, count1);
-      av1_prob_diff_update_savings_search(
-          temporal_predictor_count[i], segp->pred_probs[i], &t_nopred_prob[i],
-          DIFF_UPDATE_PROB, probwt);
-
-      // Add in the predictor signaling cost
-      t_pred_cost += count0 * av1_cost_zero(t_nopred_prob[i]) +
-                     count1 * av1_cost_one(t_nopred_prob[i]);
+  int seg_id_cost[MAX_SEGMENTS];
+  av1_cost_tokens_from_cdf(seg_id_cost, segp->tree_cdf, NULL);
+  no_pred_cost = 0;
+  for (int i = 0; i < MAX_SEGMENTS; ++i)
+    no_pred_cost += no_pred_segcounts[i] * seg_id_cost[i];
+
+  // Frames without past dependency cannot use temporal prediction
+  if (cm->primary_ref_frame != PRIMARY_REF_NONE) {
+    int pred_flag_cost[SEG_TEMPORAL_PRED_CTXS][2];
+    for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i)
+      av1_cost_tokens_from_cdf(pred_flag_cost[i], segp->pred_cdf[i], NULL);
+    t_pred_cost = 0;
+    // Cost for signaling the prediction flag.
+    for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) {
+      for (int j = 0; j < 2; ++j)
+        t_pred_cost += temporal_predictor_count[i][j] * pred_flag_cost[i][j];
     }
-#endif
+    // Cost for signaling the unpredicted segment id.
+    for (int i = 0; i < MAX_SEGMENTS; ++i)
+      t_pred_cost += t_unpred_seg_counts[i] * seg_id_cost[i];
   }
 
   // Now choose which coding method to use.
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
index 1d24ed1d1..a207b0f26 100644
--- a/third_party/aom/av1/encoder/segmentation.h
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -27,19 +27,6 @@ void av1_disable_segfeature(struct segmentation *seg, int segment_id,
 void av1_clear_segdata(struct segmentation *seg, int segment_id,
                        SEG_LVL_FEATURES feature_id);
 
-// The values given for each segment can be either deltas (from the default
-// value chosen for the frame) or absolute values.
-//
-// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
-// SEGMENT_ALT_LF)
-// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
-// SEGMENT_ALT_LF)
-//
-// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
-// the absolute values given).
-void av1_set_segment_data(struct segmentation *seg, int8_t *feature_data,
-                          unsigned char abs_delta);
-
 void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd);
 
 void av1_reset_segment_features(AV1_COMMON *cm);
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
index 5608d031e..49740817c 100644
--- a/third_party/aom/av1/encoder/speed_features.c
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -17,6 +17,12 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 
+// Setting this to 1 will disable trellis optimization completely.
+// Setting this to 2 will disable trellis optimization within the
+// transform search. Trellis optimization will still be applied
+// in the final encode.
+#define DISABLE_TRELLISQ_SEARCH 0
+
 #define MAX_MESH_SPEED 5  // Max speed setting for mesh motion method
 static MESH_PATTERN
     good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
@@ -28,23 +34,21 @@ static MESH_PATTERN
       { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
     };
 static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = {
-  50, 25, 15, 5, 1, 1
+  50, 50, 25, 15, 5, 1
 };
 
-#if CONFIG_INTRABC
-// TODO(aconverse@google.com): These settings are pretty relaxed, tune them for
+// TODO(huisu@google.com): These settings are pretty relaxed, tune them for
 // each speed setting
 static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
   { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+  { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
   { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
   { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
   { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
   { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
-  { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
 };
 static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100,
                                                             25,  25,  10 };
-#endif
 
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
@@ -74,22 +78,18 @@ static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) {
   }
 }
 
+// Do we have an internal image edge (e.g. formatting bars).
+static int has_internal_image_edge(const AV1_COMP *cpi) {
+  return (cpi->oxcf.pass == 2) &&
+         ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
+          (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
+}
+
 static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
                                                        SPEED_FEATURES *sf,
                                                        int speed) {
   AV1_COMMON *const cm = &cpi->common;
 
-  if (speed >= 1) {
-    if (AOMMIN(cm->width, cm->height) >= 720) {
-      sf->disable_split_mask =
-          cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-      sf->partition_search_breakout_dist_thr = (1 << 23);
-    } else {
-      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
-      sf->partition_search_breakout_dist_thr = (1 << 21);
-    }
-  }
-
   if (speed >= 2) {
     if (AOMMIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask =
@@ -121,11 +121,11 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
   }
 
   // If this is a two pass clip that fits the criteria for animated or
-  // graphics content then reset disable_split_mask for speeds 1-4.
+  // graphics content then reset disable_split_mask for speeds 2+.
   // Also if the image edge is internal to the coded area.
-  if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
+  if ((speed >= 2) && (cpi->oxcf.pass == 2) &&
       ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
-       (av1_internal_image_edge(cpi)))) {
+       (has_internal_image_edge(cpi)))) {
     sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
   }
 
@@ -145,85 +145,83 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
   AV1_COMMON *const cm = &cpi->common;
   const int boosted = frame_is_boosted(cpi);
 
+  // Speed 0 for all speed features that give neutral coding performance change.
+  sf->reduce_inter_modes = 1;
+  sf->prune_ext_partition_types_search_level = 1;
+  sf->ml_prune_ab_partition = 1;
+  sf->adaptive_txb_search_level = 1;
+  sf->jnt_comp_skip_mv_search = 1;
+  sf->model_based_prune_tx_search_level = 1;
+  sf->model_based_post_interp_filter_breakout = 1;
+  sf->inter_mode_rd_model_estimation = 1;
+
   if (speed >= 1) {
-    sf->tx_type_search.fast_intra_tx_type_search = 1;
-    sf->tx_type_search.fast_inter_tx_type_search = 1;
+    sf->gm_erroradv_type = GM_ERRORADV_TR_1;
+    sf->selective_ref_frame = 1;
+    sf->inter_tx_size_search_init_depth_rect = 1;
+    sf->inter_tx_size_search_init_depth_sqr = 1;
+    sf->intra_tx_size_search_init_depth_rect = 1;
+    sf->intra_tx_size_search_init_depth_sqr = 1;
+    sf->tx_size_search_lgr_block = 1;
+    sf->two_pass_partition_search = 1;
+    sf->mode_pruning_based_on_two_pass_partition_search = 1;
+    sf->prune_ext_partition_types_search_level = 2;
+    sf->use_fast_interpolation_filter_search = 1;
+    sf->skip_repeat_interpolation_filter_search = 1;
+    sf->tx_type_search.skip_tx_search = 1;
+    sf->tx_type_search.ml_tx_split_thresh = 40;
+    sf->model_based_prune_tx_search_level = 0;
+    sf->model_based_post_interp_filter_breakout = 0;
+    // TODO(angiebird): Re-evaluate the impact of inter_mode_rd_model_estimation
+    // on speed 1
+    sf->inter_mode_rd_model_estimation = 0;
+    sf->adaptive_txb_search_level = 2;
+    sf->use_intra_txb_hash = 1;
+    sf->optimize_b_precheck = 1;
+    sf->dual_sgr_penalty_level = 1;
   }
 
   if (speed >= 2) {
-    if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
-        av1_internal_image_edge(cpi)) {
-      sf->use_square_partition_only = !frame_is_boosted(cpi);
-    } else {
-      sf->use_square_partition_only = !frame_is_intra_only(cm);
-    }
+    sf->gm_erroradv_type = GM_ERRORADV_TR_2;
 
-    sf->less_rectangular_check = 1;
+    sf->selective_ref_frame = 2;
+    sf->fast_cdef_search = 1;
 
     sf->use_rd_breakout = 1;
-    sf->adaptive_motion_search = 1;
-    sf->mv.auto_mv_step_size = 1;
     sf->adaptive_rd_thresh = 1;
+    sf->mv.auto_mv_step_size = 1;
     sf->mv.subpel_iters_per_step = 1;
-    sf->mode_skip_start = 10;
-    sf->adaptive_pred_interp_filter = 1;
-
-    sf->recode_loop = ALLOW_RECODE_KFARFGF;
-#if CONFIG_TX64X64
-    sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
-#if CONFIG_CFL
-    sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
-#else
-    sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V;
-#endif  // CONFIG_CFL
-#endif  // CONFIG_TX64X64
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-#if CONFIG_CFL
-    sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
-#else
-    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-#endif
-    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-#if CONFIG_CFL
-    sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
-#else
-    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-#endif
+    sf->disable_filter_search_var_thresh = 100;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
 
-    sf->tx_size_search_breakout = 1;
     sf->partition_search_breakout_rate_thr = 80;
-    sf->tx_type_search.prune_mode = PRUNE_ONE;
-    // Use transform domain distortion.
-    // Note var-tx expt always uses pixel domain distortion.
-    sf->use_transform_domain_distortion = 1;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+    sf->allow_partition_search_skip = 1;
     sf->disable_wedge_search_var_thresh = 100;
     sf->fast_wedge_sign_estimate = 1;
   }
 
   if (speed >= 3) {
-    sf->tx_size_search_method =
-        frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL;
-    sf->mode_search_skip_flags =
-        (cm->frame_type == KEY_FRAME)
-            ? 0
-            : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
-                  FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR;
-    sf->disable_filter_search_var_thresh = 100;
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
-    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->allow_partition_search_skip = 1;
-    sf->use_upsampled_references = 0;
+    sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
+    sf->less_rectangular_check = 1;
+    sf->mode_skip_start = 10;
+    sf->adaptive_pred_interp_filter = 1;
+    // adaptive_motion_search breaks encoder multi-thread tests.
+    // The values in x->pred_mv[] differ for single and multi-thread cases.
+    // See aomedia:1778.
+    // sf->adaptive_motion_search = 1;
+    sf->recode_loop = ALLOW_RECODE_KFARFGF;
+    sf->use_transform_domain_distortion = 1;
+    sf->use_accurate_subpel_search = 0;
     sf->adaptive_rd_thresh = 2;
-#if CONFIG_EXT_TX
-    sf->tx_type_search.prune_mode = PRUNE_TWO;
-#endif
-#if CONFIG_GLOBAL_MOTION
+    sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
     sf->gm_search_type = GM_DISABLE_SEARCH;
-#endif  // CONFIG_GLOBAL_MOTION
   }
 
   if (speed >= 4) {
-    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->tx_type_search.fast_intra_tx_type_search = 1;
+    sf->tx_type_search.fast_inter_tx_type_search = 1;
+    sf->use_square_partition_only = !boosted;
     sf->tx_size_search_method =
         frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
     sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
@@ -232,52 +230,44 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->cb_partition_search = !boosted;
     sf->cb_pred_filter_search = 1;
     sf->alt_ref_search_fp = 1;
-    sf->recode_loop = ALLOW_RECODE_KFMAXBW;
-    sf->adaptive_rd_thresh = 3;
     sf->mode_skip_start = 6;
-#if CONFIG_TX64X64
-    sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
-#if CONFIG_CFL
-    sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_CFL;
-#else
-    sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC;
-#endif  // CONFIG_CFL
-#endif  // CONFIG_TX64X64
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
-#if CONFIG_CFL
-    sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_CFL;
-#else
-    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
-#endif  // CONFIG_CFL
     sf->adaptive_interp_filter_search = 1;
   }
 
   if (speed >= 5) {
+    sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+    sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
     sf->use_square_partition_only = 1;
     sf->tx_size_search_method = USE_LARGESTALL;
     sf->mv.search_method = BIGDIA;
     sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
     sf->adaptive_rd_thresh = 4;
-    if (cm->frame_type != KEY_FRAME)
-      sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;
+    sf->mode_search_skip_flags =
+        (cm->frame_type == KEY_FRAME)
+            ? 0
+            : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+                  FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
+                  FLAG_EARLY_TERMINATE;
     sf->disable_filter_search_var_thresh = 200;
     sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
     sf->use_fast_coef_costing = 1;
     sf->partition_search_breakout_rate_thr = 300;
+    sf->use_transform_domain_distortion = 2;
   }
 
   if (speed >= 6) {
     int i;
-    sf->optimize_coefficients = 0;
+    sf->optimize_coefficients = NO_TRELLIS_OPT;
     sf->mv.search_method = HEX;
     sf->disable_filter_search_var_thresh = 500;
     for (i = 0; i < TX_SIZES; ++i) {
       sf->intra_y_mode_mask[i] = INTRA_DC;
-#if CONFIG_CFL
       sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
-#else
-      sf->intra_uv_mode_mask[i] = INTRA_DC;
-#endif  // CONFIG_CFL
     }
     sf->partition_search_breakout_rate_thr = 500;
     sf->mv.reduce_first_step_size = 1;
@@ -288,9 +278,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
     sf->default_max_partition_size = BLOCK_32X32;
     sf->default_min_partition_size = BLOCK_8X8;
-#if CONFIG_TX64X64
     sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
-#endif  // CONFIG_TX64X64
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->frame_parameter_update = 0;
     sf->mv.search_method = FAST_HEX;
@@ -298,13 +286,10 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
     sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
     sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
-#if CONFIG_EXT_PARTITION
     sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST;
     sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST;
     sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST;
-#endif  // CONFIG_EXT_PARTITION
     sf->partition_search_type = REFERENCE_PARTITION;
-    sf->default_min_partition_size = BLOCK_8X8;
     sf->reuse_inter_pred_sby = 1;
     sf->force_frame_boost =
         is_keyframe ||
@@ -324,31 +309,9 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
 void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
   SPEED_FEATURES *const sf = &cpi->sf;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  AV1_COMMON *const cm = &cpi->common;
   RD_OPT *const rd = &cpi->rd;
   int i;
 
-// Limit memory usage for high resolutions
-#if CONFIG_EXT_REFS
-  // TODO(zoeliu): Temporary solution to resolve the insufficient RAM issue for
-  //               ext-refs. Need to work with @yunqingwang to have a more
-  //               effective solution.
-  if (AOMMIN(cm->width, cm->height) > 720) {
-    // Turn off the use of upsampled references for HD resolution
-    sf->use_upsampled_references = 0;
-  } else if ((AOMMIN(cm->width, cm->height) > 540) &&
-             (oxcf->profile != PROFILE_0)) {
-    sf->use_upsampled_references = 0;
-  }
-#else
-  if (AOMMIN(cm->width, cm->height) > 1080) {
-    sf->use_upsampled_references = 0;
-  } else if ((AOMMIN(cm->width, cm->height) > 720) &&
-             (oxcf->profile != PROFILE_0)) {
-    sf->use_upsampled_references = 0;
-  }
-#endif  // CONFIG_EXT_REFS
-
   if (oxcf->mode == GOOD) {
     set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
   }
@@ -371,6 +334,52 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
     cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
 }
 
+static void set_dev_sf(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (speed & TXFM_CODING_SF) {
+    sf->inter_tx_size_search_init_depth_rect = 1;
+    sf->inter_tx_size_search_init_depth_sqr = 1;
+    sf->intra_tx_size_search_init_depth_rect = 1;
+    sf->intra_tx_size_search_init_depth_sqr = 1;
+    sf->tx_size_search_method = USE_FAST_RD;
+    sf->tx_type_search.fast_intra_tx_type_search = 1;
+    sf->tx_type_search.fast_inter_tx_type_search = 1;
+  }
+
+  if (speed & INTER_PRED_SF) {
+    sf->selective_ref_frame = 2;
+    // sf->adaptive_motion_search = 1;
+    sf->mv.auto_mv_step_size = 1;
+    sf->adaptive_rd_thresh = 1;
+    sf->mv.subpel_iters_per_step = 1;
+    sf->adaptive_pred_interp_filter = 1;
+  }
+
+  if (speed & INTRA_PRED_SF) {
+    sf->max_intra_bsize = BLOCK_32X32;
+  }
+
+  if (speed & PARTITION_SF) {
+    if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+        has_internal_image_edge(cpi)) {
+      sf->use_square_partition_only = !frame_is_boosted(cpi);
+    } else {
+      sf->use_square_partition_only = !frame_is_intra_only(cm);
+    }
+    sf->less_rectangular_check = 1;
+    sf->prune_ext_partition_types_search_level = 2;
+  }
+
+  if (speed & LOOP_FILTER_SF) {
+    sf->fast_cdef_search = 1;
+  }
+
+  if (speed & RD_SKIP_SF) {
+    sf->use_rd_breakout = 1;
+  }
+}
+
 void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
@@ -378,7 +387,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   int i;
 
-  (void)cm;
   // best quality defaults
   sf->frame_parameter_update = 1;
   sf->mv.search_method = NSTEP;
@@ -386,7 +394,19 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->mv.subpel_search_method = SUBPEL_TREE;
   sf->mv.subpel_iters_per_step = 2;
   sf->mv.subpel_force_stop = 0;
-  sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
+#if DISABLE_TRELLISQ_SEARCH == 2
+  sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+                                  ? FINAL_PASS_TRELLIS_OPT
+                                  : NO_TRELLIS_OPT;
+#elif DISABLE_TRELLISQ_SEARCH == 1
+  sf->optimize_coefficients = NO_TRELLIS_OPT;
+#else
+  if (is_lossless_requested(&cpi->oxcf))
+    sf->optimize_coefficients = NO_TRELLIS_OPT;
+  else
+    sf->optimize_coefficients = FULL_TRELLIS_OPT;
+#endif  // DISABLE_TRELLISQ_SEARCH
+  sf->gm_erroradv_type = GM_ERRORADV_TR_0;
   sf->mv.reduce_first_step_size = 0;
   sf->coeff_prob_appx_step = 1;
   sf->mv.auto_mv_step_size = 0;
@@ -394,6 +414,15 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->comp_inter_joint_search_thresh = BLOCK_4X4;
   sf->adaptive_rd_thresh = 0;
   sf->tx_size_search_method = USE_FULL_RD;
+  sf->inter_tx_size_search_init_depth_sqr = 0;
+  sf->inter_tx_size_search_init_depth_rect = 0;
+  sf->intra_tx_size_search_init_depth_rect = 0;
+  sf->intra_tx_size_search_init_depth_sqr = 0;
+  sf->tx_size_search_lgr_block = 0;
+  sf->model_based_prune_tx_search_level = 0;
+  sf->model_based_post_interp_filter_breakout = 0;
+  sf->reduce_inter_modes = 0;
+  sf->selective_ref_gm = 1;
   sf->adaptive_motion_search = 0;
   sf->adaptive_pred_interp_filter = 0;
   sf->adaptive_mode_search = 0;
@@ -401,10 +430,13 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->cb_partition_search = 0;
   sf->alt_ref_search_fp = 0;
   sf->partition_search_type = SEARCH_PARTITION;
-  sf->tx_type_search.prune_mode = NO_PRUNE;
+  sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE;
+  sf->tx_type_search.ml_tx_split_thresh = 30;
   sf->tx_type_search.use_skip_flag_prediction = 1;
   sf->tx_type_search.fast_intra_tx_type_search = 0;
   sf->tx_type_search.fast_inter_tx_type_search = 0;
+  sf->tx_type_search.skip_tx_search = 0;
+  sf->selective_ref_frame = 0;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
   sf->auto_min_max_partition_size = NOT_IN_USE;
@@ -420,17 +452,25 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->disable_filter_search_var_thresh = 0;
   sf->adaptive_interp_filter_search = 0;
   sf->allow_partition_search_skip = 0;
-  sf->use_upsampled_references = 1;
+  sf->use_accurate_subpel_search = 1;
   sf->disable_wedge_search_var_thresh = 0;
   sf->fast_wedge_sign_estimate = 0;
+  sf->drop_ref = 0;
+  sf->skip_intra_in_interframe = 1;
+  sf->txb_split_cap = 1;
+  sf->adaptive_txb_search_level = 0;
+  sf->two_pass_partition_search = 0;
+  sf->mode_pruning_based_on_two_pass_partition_search = 0;
+  sf->use_intra_txb_hash = 0;
+  sf->use_inter_txb_hash = 1;
+  sf->use_mb_rd_hash = 1;
+  sf->optimize_b_precheck = 0;
+  sf->jnt_comp_fast_tx_search = 0;
+  sf->jnt_comp_skip_mv_search = 0;
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
-#if CONFIG_CFL
     sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
-#else
-    sf->intra_uv_mode_mask[i] = INTRA_ALL;
-#endif  // CONFIG_CFL
   }
   sf->use_rd_breakout = 0;
   sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
@@ -448,22 +488,28 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   // Recode loop tolerance %.
   sf->recode_tolerance = 25;
   sf->default_interp_filter = SWITCHABLE;
-  sf->tx_size_search_breakout = 0;
   sf->partition_search_breakout_dist_thr = 0;
   sf->partition_search_breakout_rate_thr = 0;
   sf->simple_model_rd_from_var = 0;
+  sf->prune_ext_partition_types_search_level = 0;
+  sf->ml_prune_ab_partition = 0;
+  sf->fast_cdef_search = 0;
 
   // Set this at the appropriate speed levels
   sf->use_transform_domain_distortion = 0;
-#if CONFIG_GLOBAL_MOTION
   sf->gm_search_type = GM_FULL_SEARCH;
-#endif  // CONFIG_GLOBAL_MOTION
+  sf->use_fast_interpolation_filter_search = 0;
+  sf->skip_repeat_interpolation_filter_search = 0;
+  sf->use_hash_based_trellis = 0;
+
+  // Set decoder side speed feature to use less dual sgr modes
+  sf->dual_sgr_penalty_level = 0;
+
+  sf->inter_mode_rd_model_estimation = 0;
 
-  if (oxcf->mode == GOOD
-#if CONFIG_XIPHRC
-      || oxcf->pass == 1
-#endif
-      )
+  set_dev_sf(cpi, sf, oxcf->dev_sf);
+
+  if (oxcf->mode == GOOD)
     set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed);
 
   // sf->partition_search_breakout_dist_thr is set assuming max 64x64
@@ -472,7 +518,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
     sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6);
   }
 
-  cpi->full_search_sad = av1_full_search_sad;
   cpi->diamond_search_sad = av1_diamond_search_sad;
 
   sf->allow_exhaustive_searches = 1;
@@ -490,7 +535,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
     sf->mesh_patterns[i].interval =
         good_quality_mesh_patterns[speed][i].interval;
   }
-#if CONFIG_INTRABC
   if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) &&
       (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
        cpi->oxcf.content == AOM_CONTENT_SCREEN)) {
@@ -500,18 +544,15 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
     }
     sf->max_exaustive_pct = intrabc_max_mesh_pct[speed];
   }
-#endif  // CONFIG_INTRABC
 
-#if !CONFIG_XIPHRC
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
-  if (oxcf->pass == 1) sf->optimize_coefficients = 0;
-#endif
+  if (oxcf->pass == 1) sf->optimize_coefficients = NO_TRELLIS_OPT;
 
   // No recode for 1 pass.
   if (oxcf->pass == 0) {
     sf->recode_loop = DISALLOW_RECODE;
-    sf->optimize_coefficients = 0;
+    sf->optimize_coefficients = NO_TRELLIS_OPT;
   }
 
   if (sf->mv.subpel_search_method == SUBPEL_TREE) {
@@ -524,12 +565,11 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
     cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore;
   }
 
-#if !CONFIG_AOM_QM
-  x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
-#else
+  cpi->optimize_speed_feature =
+      oxcf->pass != 1 ? sf->optimize_coefficients : NO_TRELLIS_OPT;
   // FIXME: trellis not very efficient for quantisation matrices
-  x->optimize = 0;
-#endif
+  if (cm->using_qmatrix) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
+  if (oxcf->disable_trellis_quant) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
 
   x->min_partition_size = sf->default_min_partition_size;
   x->max_partition_size = sf->default_max_partition_size;
@@ -543,4 +583,8 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
     cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
   else if (cpi->oxcf.motion_vector_unit_test == 2)
     cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+
+#if CONFIG_DIST_8X8
+  if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
+#endif  // CONFIG_DIST_8X8
 }
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
index edd79cd16..59cb6be58 100644
--- a/third_party/aom/av1/encoder/speed_features.h
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -20,64 +20,51 @@ extern "C" {
 
 enum {
   INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) |
-              (1 << D135_PRED) | (1 << D117_PRED) | (1 << D153_PRED) |
-              (1 << D207_PRED) | (1 << D63_PRED) | (1 << SMOOTH_PRED) |
-#if CONFIG_SMOOTH_HV
-              (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) |
-#endif  // CONFIG_SMOOTH_HV
-              (1 << TM_PRED),
-#if CONFIG_CFL
-  UV_INTRA_ALL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) |
-                 (1 << UV_D45_PRED) | (1 << UV_D135_PRED) |
-                 (1 << UV_D117_PRED) | (1 << UV_D153_PRED) |
-                 (1 << UV_D207_PRED) | (1 << UV_D63_PRED) |
-                 (1 << UV_SMOOTH_PRED) |
-#if CONFIG_SMOOTH_HV
-                 (1 << UV_SMOOTH_V_PRED) | (1 << UV_SMOOTH_H_PRED) |
-#endif  // CONFIG_SMOOTH_HV
-                 (1 << UV_TM_PRED) | (1 << UV_CFL_PRED),
+              (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) |
+              (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) |
+              (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED),
+  UV_INTRA_ALL =
+      (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+      (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) |
+      (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) |
+      (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) |
+      (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
   UV_INTRA_DC = (1 << UV_DC_PRED),
   UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED),
-  UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_TM_PRED),
-  UV_INTRA_DC_TM_CFL =
-      (1 << UV_DC_PRED) | (1 << UV_TM_PRED) | (1 << UV_CFL_PRED),
+  UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED),
+  UV_INTRA_DC_PAETH_CFL =
+      (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
   UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED),
   UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) |
                         (1 << UV_H_PRED) | (1 << UV_CFL_PRED),
-  UV_INTRA_DC_TM_H_V = (1 << UV_DC_PRED) | (1 << UV_TM_PRED) |
-                       (1 << UV_V_PRED) | (1 << UV_H_PRED),
-  UV_INTRA_DC_TM_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_TM_PRED) |
-                           (1 << UV_V_PRED) | (1 << UV_H_PRED) |
-                           (1 << UV_CFL_PRED),
-#endif  // CONFIG_CFL
+  UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+                          (1 << UV_V_PRED) | (1 << UV_H_PRED),
+  UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+                              (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+                              (1 << UV_CFL_PRED),
   INTRA_DC = (1 << DC_PRED),
-  INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED),
+  INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED),
   INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
-  INTRA_DC_TM_H_V =
-      (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | (1 << H_PRED)
+  INTRA_DC_PAETH_H_V =
+      (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED)
 };
 
 enum {
-#if CONFIG_COMPOUND_SINGLEREF
-// TODO(zoeliu): To further consider following single ref comp modes:
-//               SR_NEAREST_NEARMV, SR_NEAREST_NEWMV, SR_NEAR_NEWMV,
-//               SR_ZERO_NEWMV, and SR_NEW_NEWMV.
-#endif  // CONFIG_COMPOUND_SINGLEREF
-  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV) |
-              (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | (1 << NEW_NEWMV) |
-              (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) |
-              (1 << NEW_NEARESTMV) | (1 << ZERO_ZEROMV),
+  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+              (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) |
+              (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) |
+              (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV),
   INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) |
                   (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV),
   INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) |
                       (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
                       (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
                       (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
-  INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) |
-                       (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) |
+  INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) |
+                       (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
                        (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV),
-  INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV) |
-                           (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) |
+  INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | (1 << NEWMV) |
+                           (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
                            (1 << NEW_NEWMV) | (1 << NEW_NEARESTMV) |
                            (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) |
                            (1 << NEAR_NEWMV),
@@ -86,8 +73,8 @@ enum {
                            (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
                            (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
                            (1 << NEAR_NEARMV),
-  INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
-                            (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) |
+  INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+                            (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
                             (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
                             (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
                             (1 << NEAR_NEARMV),
@@ -106,6 +93,17 @@ enum {
 };
 
 typedef enum {
+  TXFM_CODING_SF = 1,
+  INTER_PRED_SF = 2,
+  INTRA_PRED_SF = 4,
+  PARTITION_SF = 8,
+  LOOP_FILTER_SF = 16,
+  RD_SKIP_SF = 32,
+  RESERVE_2_SF = 64,
+  RESERVE_3_SF = 128,
+} DEV_SPEED_FEATURES;
+
+typedef enum {
   DIAMOND = 0,
   NSTEP = 1,
   HEX = 2,
@@ -141,8 +139,8 @@ typedef enum {
 
 typedef enum {
   USE_FULL_RD = 0,
+  USE_FAST_RD,
   USE_LARGESTALL,
-  USE_TX_8X8
 } TX_SIZE_SEARCH_METHOD;
 
 typedef enum {
@@ -190,10 +188,13 @@ typedef enum {
   NO_PRUNE = 0,
   // eliminates one tx type in vertical and horizontal direction
   PRUNE_ONE = 1,
-#if CONFIG_EXT_TX
   // eliminates two tx types in each direction
   PRUNE_TWO = 2,
-#endif
+  // adaptively prunes the least perspective tx types out of all 16
+  // (tuned to provide negligible quality loss)
+  PRUNE_2D_ACCURATE = 3,
+  // similar, but applies much more aggressive pruning to get better speed-up
+  PRUNE_2D_FAST = 4,
 } TX_TYPE_PRUNE_MODE;
 
 typedef struct {
@@ -204,6 +205,13 @@ typedef struct {
   // Use a skip flag prediction model to detect blocks with skip = 1 early
   // and avoid doing full TX type search for such blocks.
   int use_skip_flag_prediction;
+
+  // Threshold used by the ML based method to predict TX block split decisions.
+  int ml_tx_split_thresh;
+
+  // skip remaining transform type search when we found the rdcost of skip is
+  // better than applying transform
+  int skip_tx_search;
 } TX_TYPE_SEARCH;
 
 typedef enum {
@@ -261,13 +269,29 @@ typedef struct MESH_PATTERN {
   int interval;
 } MESH_PATTERN;
 
-#if CONFIG_GLOBAL_MOTION
 typedef enum {
   GM_FULL_SEARCH,
   GM_REDUCED_REF_SEARCH,
   GM_DISABLE_SEARCH
 } GM_SEARCH_TYPE;
-#endif  // CONFIG_GLOBAL_MOTION
+
+typedef enum {
+  GM_ERRORADV_TR_0,
+  GM_ERRORADV_TR_1,
+  GM_ERRORADV_TR_2,
+  GM_ERRORADV_TR_TYPES,
+} GM_ERRORADV_TYPE;
+
+typedef enum {
+  NO_TRELLIS_OPT,         // No trellis optimization
+  FULL_TRELLIS_OPT,       // Trellis optimization in all stages
+  FINAL_PASS_TRELLIS_OPT  // Trellis optimization in only the final encode pass
+} TRELLIS_OPT_TYPE;
+
+typedef enum {
+  FULL_TXFM_RD,
+  LOW_TXFM_RD,
+} TXFM_RD_MODEL;
 
 typedef struct SPEED_FEATURES {
   MV_SPEED_FEATURES mv;
@@ -277,8 +301,11 @@ typedef struct SPEED_FEATURES {
 
   RECODE_LOOP_TYPE recode_loop;
 
-  // Trellis (dynamic programming) optimization of quantized values (+1, 0).
-  int optimize_coefficients;
+  // Trellis (dynamic programming) optimization of quantized values
+  TRELLIS_OPT_TYPE optimize_coefficients;
+
+  // Global motion warp error threshold
+  GM_ERRORADV_TYPE gm_erroradv_type;
 
   // Always set to 0. If on it enables 0 cost background transmission
   // (except for the initial transmission of the segmentation). The feature is
@@ -287,6 +314,14 @@ typedef struct SPEED_FEATURES {
   // adds overhead.
   int static_segmentation;
 
+  // Limit the inter mode tested in the RD loop
+  int reduce_inter_modes;
+
+  // Do not compute the global motion parameters for a LAST2_FRAME or
+  // LAST3_FRAME if the GOLDEN_FRAME is closer and it has a non identity
+  // global model.
+  int selective_ref_gm;
+
   // If 1 we iterate finding a best reference for 2 ref frames together - via
   // a log search that iterates 4 times (check around mv for last for best
   // error of combined predictor then check around mv for alt). If 0 we
@@ -309,6 +344,17 @@ typedef struct SPEED_FEATURES {
   // for intra and model coefs for the rest.
   TX_SIZE_SEARCH_METHOD tx_size_search_method;
 
+  // Init search depth for square and rectangular transform partitions.
+  // Values:
+  // 0 - search full tree, 1: search 1 level, 2: search the highest level only
+  int inter_tx_size_search_init_depth_sqr;
+  int inter_tx_size_search_init_depth_rect;
+  int intra_tx_size_search_init_depth_sqr;
+  int intra_tx_size_search_init_depth_rect;
+  // If any dimension of a coding block size above 64, always search the
+  // largest transform only, since the largest transform block size is 64x64.
+  int tx_size_search_lgr_block;
+
   // After looking at the first set of modes (set by index here), skip
   // checking modes for reference frames that don't match the reference frame
   // of the best so far.
@@ -318,9 +364,51 @@ typedef struct SPEED_FEATURES {
 
   TX_TYPE_SEARCH tx_type_search;
 
+  // Skip split transform block partition when the collocated bigger block
+  // is selected as all zero coefficients.
+  int txb_split_cap;
+
+  // Shortcut the transform block partition and type search when the target
+  // rdcost is relatively lower.
+  // Values are 0 (not used) , or 1 - 2 with progressively increasing
+  // aggressiveness
+  int adaptive_txb_search_level;
+
+  // Prune level for tx_size_type search for inter based on rd model
+  // 0: no pruning
+  // 1-2: progressively increasing aggressiveness of pruning
+  int model_based_prune_tx_search_level;
+
+  // Model based breakout after interpolation filter search
+  // 0: no breakout
+  // 1: use model based rd breakout
+  int model_based_post_interp_filter_breakout;
+
   // Used if partition_search_type = FIXED_SIZE_PARTITION
   BLOCK_SIZE always_this_block_size;
 
+  // Drop less likely to be picked reference frames in the RD search.
+  // Has three levels for now: 0, 1 and 2, where higher levels prune more
+  // aggressively than lower ones. (0 means no pruning).
+  int selective_ref_frame;
+
+  // Prune extended partition types search
+  // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing
+  // aggressiveness of pruning in order.
+  int prune_ext_partition_types_search_level;
+
+  // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
+  int ml_prune_ab_partition;
+
+  int fast_cdef_search;
+
+  // 2-pass coding block partition search
+  int two_pass_partition_search;
+
+  // Use the mode decisions made in the initial partition search to prune mode
+  // candidates, e.g. ref frames.
+  int mode_pruning_based_on_two_pass_partition_search;
+
   // Skip rectangular partition test when partition type none gives better
   // rd than partition type split.
   int less_rectangular_check;
@@ -427,7 +515,7 @@ typedef struct SPEED_FEATURES {
   // by only looking at counts from 1/2 the bands.
   FAST_COEFF_UPDATE use_fast_coef_updates;
 
-  // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
+  // A binary mask indicating if NEARESTMV, NEARMV, GLOBALMV, NEWMV
   // modes are used in order from LSB to MSB for each BLOCK_SIZE.
   int inter_mode_mask[BLOCK_SIZES_ALL];
 
@@ -456,10 +544,6 @@ typedef struct SPEED_FEATURES {
   // default interp filter choice
   InterpFilter default_interp_filter;
 
-  // Early termination in transform size search, which only applies while
-  // tx_size_search_method is USE_FULL_RD.
-  int tx_size_search_breakout;
-
   // adaptive interp_filter search to allow skip of certain filter types.
   int adaptive_interp_filter_search;
 
@@ -476,16 +560,67 @@ typedef struct SPEED_FEATURES {
   // Fast approximation of av1_model_rd_from_var_lapndz
   int simple_model_rd_from_var;
 
-  // Do sub-pixel search in up-sampled reference frames
-  int use_upsampled_references;
+  // If true, sub-pixel search uses the exact convolve function used for final
+  // encoding and decoding; otherwise, it uses bilinear interpolation.
+  int use_accurate_subpel_search;
 
   // Whether to compute distortion in the image domain (slower but
   // more accurate), or in the transform domain (faster but less acurate).
+  // 0: use image domain
+  // 1: use transform domain in tx_type search, and use image domain for
+  // RD_STATS
+  // 2: use transform domain
   int use_transform_domain_distortion;
 
-#if CONFIG_GLOBAL_MOTION
   GM_SEARCH_TYPE gm_search_type;
-#endif  // CONFIG_GLOBAL_MOTION
+
+  // Do limited interpolation filter search for dual filters, since best choice
+  // usually includes EIGHTTAP_REGULAR.
+  int use_fast_interpolation_filter_search;
+
+  // Save results of interpolation_filter_search for a block
+  // Check mv and ref_frames before search, if they are same with previous
+  // saved results, it can be skipped.
+  int skip_repeat_interpolation_filter_search;
+
+  // Use a hash table to store previously computed optimized qcoeffs from
+  // expensive calls to optimize_txb.
+  int use_hash_based_trellis;
+
+  // flag to drop some ref frames in compound motion search
+  int drop_ref;
+
+  // flag to allow skipping intra mode for inter frame prediction
+  int skip_intra_in_interframe;
+
+  // Use hash table to store intra(keyframe only) txb transform search results
+  // to avoid repeated search on the same residue signal.
+  int use_intra_txb_hash;
+
+  // Use hash table to store inter txb transform search results
+  // to avoid repeated search on the same residue signal.
+  int use_inter_txb_hash;
+
+  // Use hash table to store macroblock RD search results
+  // to avoid repeated search on the same residue signal.
+  int use_mb_rd_hash;
+
+  // Calculate RD cost before doing optimize_b, and skip if the cost is large.
+  int optimize_b_precheck;
+
+  // Use model rd instead of transform search in jnt_comp
+  int jnt_comp_fast_tx_search;
+
+  // Skip mv search in jnt_comp
+  int jnt_comp_skip_mv_search;
+
+  // Decoder side speed feature to add penalty for use of dual-sgr filters.
+  // Takes values 0 - 10, 0 indicating no penalty and each additional level
+  // adding a penalty of 1%
+  int dual_sgr_penalty_level;
+
+  // Dynamically estimate final rd from prediction error and mode cost
+  int inter_mode_rd_model_estimation;
 } SPEED_FEATURES;
 
 struct AV1_COMP;
diff --git a/third_party/aom/av1/encoder/subexp.c b/third_party/aom/av1/encoder/subexp.c
deleted file mode 100644
index dc96d712a..000000000
--- a/third_party/aom/av1/encoder/subexp.c
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include "aom_dsp/bitwriter.h"
-
-#include "av1/common/common.h"
-#include "av1/common/entropy.h"
-#include "av1/encoder/cost.h"
-#include "av1/encoder/subexp.h"
-
-static const uint8_t update_bits[255] = {
-  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,
-  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  8,  8,  8,  8,  8,  8,
-  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-  8,  8,  8,  8,  8,  8,  8,  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
-  11, 11, 11, 11, 11, 11, 11, 0,
-};
-#define MIN_DELP_BITS 5
-
-static int recenter_nonneg(int v, int m) {
-  if (v > (m << 1))
-    return v;
-  else if (v >= m)
-    return ((v - m) << 1);
-  else
-    return ((m - v) << 1) - 1;
-}
-
-static int remap_prob(int v, int m) {
-  int i;
-  static const uint8_t map_table[MAX_PROB - 1] = {
-    // generated by:
-    //   map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM);
-    20,  21,  22,  23,  24,  25,  0,   26,  27,  28,  29,  30,  31,  32,  33,
-    34,  35,  36,  37,  1,   38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
-    48,  49,  2,   50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
-    3,   62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  4,   74,
-    75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  5,   86,  87,  88,
-    89,  90,  91,  92,  93,  94,  95,  96,  97,  6,   98,  99,  100, 101, 102,
-    103, 104, 105, 106, 107, 108, 109, 7,   110, 111, 112, 113, 114, 115, 116,
-    117, 118, 119, 120, 121, 8,   122, 123, 124, 125, 126, 127, 128, 129, 130,
-    131, 132, 133, 9,   134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
-    145, 10,  146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11,
-    158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12,  170, 171,
-    172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13,  182, 183, 184, 185,
-    186, 187, 188, 189, 190, 191, 192, 193, 14,  194, 195, 196, 197, 198, 199,
-    200, 201, 202, 203, 204, 205, 15,  206, 207, 208, 209, 210, 211, 212, 213,
-    214, 215, 216, 217, 16,  218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
-    228, 229, 17,  230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
-    18,  242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19,
-  };
-  v--;
-  m--;
-  if ((m << 1) <= MAX_PROB)
-    i = recenter_nonneg(v, m) - 1;
-  else
-    i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1;
-
-  i = map_table[i];
-  return i;
-}
-
-static int prob_diff_update_cost(aom_prob newp, aom_prob oldp) {
-  int delp = remap_prob(newp, oldp);
-  return update_bits[delp] << AV1_PROB_COST_SHIFT;
-}
-
-static void encode_uniform(aom_writer *w, int v) {
-  const int l = 8;
-  const int m = (1 << l) - 190;
-  if (v < m) {
-    aom_write_literal(w, v, l - 1);
-  } else {
-    aom_write_literal(w, m + ((v - m) >> 1), l - 1);
-    aom_write_literal(w, (v - m) & 1, 1);
-  }
-}
-
-static INLINE int write_bit_gte(aom_writer *w, int word, int test) {
-  aom_write_literal(w, word >= test, 1);
-  return word >= test;
-}
-
-static void encode_term_subexp(aom_writer *w, int word) {
-  if (!write_bit_gte(w, word, 16)) {
-    aom_write_literal(w, word, 4);
-  } else if (!write_bit_gte(w, word, 32)) {
-    aom_write_literal(w, word - 16, 4);
-  } else if (!write_bit_gte(w, word, 64)) {
-    aom_write_literal(w, word - 32, 5);
-  } else {
-    encode_uniform(w, word - 64);
-  }
-}
-
-void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldp) {
-  const int delp = remap_prob(newp, oldp);
-  encode_term_subexp(w, delp);
-}
-
-int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp,
-                                        aom_prob *bestp, aom_prob upd,
-                                        int probwt) {
-  const uint32_t old_b = cost_branch256(ct, oldp);
-  int bestsavings = 0;
-  aom_prob newp, bestnewp = oldp;
-  const int step = *bestp > oldp ? -1 : 1;
-  const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd);
-
-  if (old_b > (uint32_t)upd_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) {
-    for (newp = *bestp; newp != oldp; newp += step) {
-      const int new_b = cost_branch256(ct, newp);
-      const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
-      const int savings = (int)((int64_t)old_b - new_b - update_b * probwt);
-      if (savings > bestsavings) {
-        bestsavings = savings;
-        bestnewp = newp;
-      }
-    }
-  }
-  *bestp = bestnewp;
-  return bestsavings;
-}
-
-void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
-                               const unsigned int ct[2], int probwt) {
-  const aom_prob upd = DIFF_UPDATE_PROB;
-  aom_prob newp = get_binary_prob(ct[0], ct[1]);
-  const int savings =
-      av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt);
-  assert(newp >= 1);
-  if (savings > 0) {
-    aom_write(w, 1, upd);
-    av1_write_prob_diff_update(w, newp, *oldp);
-    *oldp = newp;
-  } else {
-    aom_write(w, 0, upd);
-  }
-}
-
-int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2],
-                                      int probwt) {
-  const aom_prob upd = DIFF_UPDATE_PROB;
-  aom_prob newp = get_binary_prob(ct[0], ct[1]);
-  const int savings =
-      av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt);
-  return savings;
-}
diff --git a/third_party/aom/av1/encoder/subexp.h b/third_party/aom/av1/encoder/subexp.h
deleted file mode 100644
index 580edabdb..000000000
--- a/third_party/aom/av1/encoder/subexp.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_ENCODER_SUBEXP_H_
-#define AV1_ENCODER_SUBEXP_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "aom_dsp/bitwriter.h"
-#include "aom_dsp/prob.h"
-
-void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldpm);
-
-void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
-                               const unsigned int ct[2], int probwt);
-
-int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp,
-                                        aom_prob *bestp, aom_prob upd,
-                                        int probwt);
-
-int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const aom_prob oldp,
-                                              aom_prob *bestp, aom_prob upd,
-                                              int stepsize, int probwt);
-
-int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2],
-                                      int probwt);
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AV1_ENCODER_SUBEXP_H_
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
index daa647689..250feab81 100644
--- a/third_party/aom/av1/encoder/temporal_filter.c
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -12,7 +12,8 @@
 #include <math.h>
 #include <limits.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "av1/common/alloccommon.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/common/quant_common.h"
@@ -35,26 +36,17 @@
 static void temporal_filter_predictors_mb_c(
     MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
     int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
-    uint8_t *pred, struct scale_factors *scale, int x, int y) {
+    uint8_t *pred, struct scale_factors *scale, int x, int y,
+    int can_use_previous) {
   const int which_mv = 0;
   const MV mv = { mv_row, mv_col };
   enum mv_precision mv_precision_uv;
   int uv_stride;
   // TODO(angiebird): change plane setting accordingly
-  ConvolveParams conv_params = get_conv_params(which_mv, which_mv, 0);
-
-#if USE_TEMPORALFILTER_12TAP
-  const InterpFilters interp_filters =
-      av1_broadcast_interp_filter(TEMPORALFILTER_12TAP);
-  (void)xd;
-#else
-  const InterpFilters interp_filters = xd->mi[0]->mbmi.interp_filters;
-#endif  // USE_TEMPORALFILTER_12TAP
-
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+  ConvolveParams conv_params = get_conv_params(which_mv, 0, 0, xd->bd);
+  const InterpFilters interp_filters = xd->mi[0]->interp_filters;
   WarpTypesAllowed warp_types;
   memset(&warp_types, 0, sizeof(WarpTypesAllowed));
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
 
   if (uv_block_width == 8) {
     uv_stride = (stride + 1) >> 1;
@@ -64,55 +56,36 @@ static void temporal_filter_predictors_mb_c(
     mv_precision_uv = MV_PRECISION_Q3;
   }
 
-#if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale,
                                      16, 16, which_mv, interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                                     &warp_types, x, y,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                                     0, MV_PRECISION_Q3, x, y, xd);
-
-    av1_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256],
-                                     uv_block_width, &mv, scale, uv_block_width,
-                                     uv_block_height, which_mv, interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                                     &warp_types, x, y,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                                     1, mv_precision_uv, x, y, xd);
-
-    av1_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512],
-                                     uv_block_width, &mv, scale, uv_block_width,
-                                     uv_block_height, which_mv, interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                                     &warp_types, x, y,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                                     2, mv_precision_uv, x, y, xd);
+                                     &warp_types, x, y, 0, MV_PRECISION_Q3, x,
+                                     y, xd, can_use_previous);
+
+    av1_highbd_build_inter_predictor(
+        u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale,
+        uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types,
+        x, y, 1, mv_precision_uv, x, y, xd, can_use_previous);
+
+    av1_highbd_build_inter_predictor(
+        v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale,
+        uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types,
+        x, y, 2, mv_precision_uv, x, y, xd, can_use_previous);
     return;
   }
-#endif  // CONFIG_HIGHBITDEPTH
   av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
-                            &conv_params, interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                            &warp_types, x, y, 0, 0,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                            MV_PRECISION_Q3, x, y, xd);
+                            &conv_params, interp_filters, &warp_types, x, y, 0,
+                            0, MV_PRECISION_Q3, x, y, xd, can_use_previous);
 
   av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width,
                             &mv, scale, uv_block_width, uv_block_height,
-                            &conv_params, interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                            &warp_types, x, y, 1, 0,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                            mv_precision_uv, x, y, xd);
+                            &conv_params, interp_filters, &warp_types, x, y, 1,
+                            0, mv_precision_uv, x, y, xd, can_use_previous);
 
   av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width,
                             &mv, scale, uv_block_width, uv_block_height,
-                            &conv_params, interp_filters,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                            &warp_types, x, y, 2, 0,
-#endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-                            mv_precision_uv, x, y, xd);
+                            &conv_params, interp_filters, &warp_types, x, y, 2,
+                            0, mv_precision_uv, x, y, xd, can_use_previous);
 }
 
 void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
@@ -176,7 +149,6 @@ void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void av1_highbd_temporal_filter_apply_c(
     uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8,
     unsigned int block_width, unsigned int block_height, int strength,
@@ -238,7 +210,6 @@ void av1_highbd_temporal_filter_apply_c(
     byte += stride - block_width;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
                                               uint8_t *arf_frame_buf,
@@ -255,7 +226,7 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
   int cost_list[5];
   MvLimits tmp_mv_limits = x->mv_limits;
 
-  MV best_ref_mv1 = { 0, 0 };
+  MV best_ref_mv1 = kZeroMv;
   MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
 
   // Save input state
@@ -276,8 +247,8 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
 
   av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
 
-  x->mvcost = x->mv_cost_stack[0];
-  x->nmvjointcost = x->nmv_vec_cost[0];
+  x->mvcost = x->mv_cost_stack;
+  x->nmvjointcost = x->nmv_vec_cost;
 
   // Use mv costing from x->mvcost directly
   av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
@@ -286,9 +257,8 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
 
   x->mv_limits = tmp_mv_limits;
 
-// Ignore mv costing by sending NULL pointer instead of cost array
-#if CONFIG_AMVR
-  if (cpi->common.cur_frame_mv_precision_level == 1) {
+  // Ignore mv costing by sending NULL pointer instead of cost array
+  if (cpi->common.cur_frame_force_integer_mv == 1) {
     const uint8_t *const src_address = x->plane[0].src.buf;
     const int src_stride = x->plane[0].src.stride;
     const uint8_t *const y = xd->plane[0].pre[0].buf;
@@ -301,17 +271,15 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
     bestsme = cpi->fn_ptr[BLOCK_16X16].vf(y + offset, y_stride, src_address,
                                           src_stride, &sse);
   } else {
-#endif
     bestsme = cpi->find_fractional_mv_step(
-        x, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+        x, &cpi->common, 0, 0, &best_ref_mv1,
+        cpi->common.allow_high_precision_mv, x->errorperbit,
         &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step,
         cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
         NULL, 0, 0, 0, 0, 0);
-#if CONFIG_AMVR
   }
-#endif
 
-  x->e_mbd.mi[0]->bmi[0].as_mv[0] = x->best_mv;
+  x->e_mbd.mi[0]->mv[0] = x->best_mv;
 
   // Restore input state
   x->plane[0].src = src;
@@ -321,13 +289,12 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
 }
 
 static void temporal_filter_iterate_c(AV1_COMP *cpi,
-#if CONFIG_BGSPRITE
-                                      YV12_BUFFER_CONFIG *target,
-#endif  // CONFIG_BGSPRITE
                                       YV12_BUFFER_CONFIG **frames,
                                       int frame_count, int alt_ref_index,
                                       int strength,
                                       struct scale_factors *scale) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   int byte;
   int frame;
   int mb_col, mb_row;
@@ -341,28 +308,22 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
   MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
   YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
   uint8_t *dst1, *dst2;
-#if CONFIG_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]);
-  DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]);
+  DECLARE_ALIGNED(32, uint16_t, predictor16[16 * 16 * 3]);
+  DECLARE_ALIGNED(32, uint8_t, predictor8[16 * 16 * 3]);
   uint8_t *predictor;
-#else
-  DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]);
-#endif
   const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
   const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
 
   // Save input state
   uint8_t *input_buffer[MAX_MB_PLANE];
   int i;
-#if CONFIG_HIGHBITDEPTH
   if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     predictor = CONVERT_TO_BYTEPTR(predictor16);
   } else {
     predictor = predictor8;
   }
-#endif
 
-  for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf;
+  for (i = 0; i < num_planes; i++) input_buffer[i] = mbd->plane[i].pre[0].buf;
 
   for (mb_row = 0; mb_row < mb_rows; mb_row++) {
     // Source frames are extended to 16 pixels. This is different than
@@ -399,8 +360,9 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
 
         if (frames[frame] == NULL) continue;
 
-        mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
-        mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0;
+        mbd->mi[0]->mv[0].as_mv.row = 0;
+        mbd->mi[0]->mv[0].as_mv.col = 0;
+        mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
 
         if (frame == alt_ref_index) {
           filter_weight = 2;
@@ -422,60 +384,51 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
               mbd, frames[frame]->y_buffer + mb_y_offset,
               frames[frame]->u_buffer + mb_uv_offset,
               frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
-              mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
-              mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale,
-              mb_col * 16, mb_row * 16);
+              mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row,
+              mbd->mi[0]->mv[0].as_mv.col, predictor, scale, mb_col * 16,
+              mb_row * 16, cm->allow_warped_motion);
 
-// Apply the filter (YUV)
-#if CONFIG_HIGHBITDEPTH
+          // Apply the filter (YUV)
           if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
             int adj_strength = strength + 2 * (mbd->bd - 8);
             av1_highbd_temporal_filter_apply(
                 f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
                 adj_strength, filter_weight, accumulator, count);
-            av1_highbd_temporal_filter_apply(
-                f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
-                mb_uv_width, mb_uv_height, adj_strength, filter_weight,
-                accumulator + 256, count + 256);
-            av1_highbd_temporal_filter_apply(
-                f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
-                mb_uv_width, mb_uv_height, adj_strength, filter_weight,
-                accumulator + 512, count + 512);
+            if (num_planes > 1) {
+              av1_highbd_temporal_filter_apply(
+                  f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
+                  mb_uv_width, mb_uv_height, adj_strength, filter_weight,
+                  accumulator + 256, count + 256);
+              av1_highbd_temporal_filter_apply(
+                  f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
+                  mb_uv_width, mb_uv_height, adj_strength, filter_weight,
+                  accumulator + 512, count + 512);
+            }
           } else {
-#endif  // CONFIG_HIGHBITDEPTH
             av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
                                         predictor, 16, 16, strength,
                                         filter_weight, accumulator, count);
-            av1_temporal_filter_apply_c(
-                f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
-                mb_uv_width, mb_uv_height, strength, filter_weight,
-                accumulator + 256, count + 256);
-            av1_temporal_filter_apply_c(
-                f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
-                mb_uv_width, mb_uv_height, strength, filter_weight,
-                accumulator + 512, count + 512);
-#if CONFIG_HIGHBITDEPTH
+            if (num_planes > 1) {
+              av1_temporal_filter_apply_c(
+                  f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
+                  mb_uv_width, mb_uv_height, strength, filter_weight,
+                  accumulator + 256, count + 256);
+              av1_temporal_filter_apply_c(
+                  f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
+                  mb_uv_width, mb_uv_height, strength, filter_weight,
+                  accumulator + 512, count + 512);
+            }
           }
-#endif  // CONFIG_HIGHBITDEPTH
         }
       }
 
-// Normalize filter output to produce AltRef frame
-#if CONFIG_HIGHBITDEPTH
+      // Normalize filter output to produce AltRef frame
       if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
         uint16_t *dst1_16;
         uint16_t *dst2_16;
-#if CONFIG_BGSPRITE
-        dst1 = target->y_buffer;
-#else
         dst1 = cpi->alt_ref_buffer.y_buffer;
-#endif  // CONFIG_BGSPRITE
         dst1_16 = CONVERT_TO_SHORTPTR(dst1);
-#if CONFIG_BGSPRITE
-        stride = target->y_stride;
-#else
         stride = cpi->alt_ref_buffer.y_stride;
-#endif  // CONFIG_BGSPRITE
         byte = mb_y_offset;
         for (i = 0, k = 0; i < 16; i++) {
           for (j = 0; j < 16; j++, k++) {
@@ -488,40 +441,31 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
 
           byte += stride - 16;
         }
-
-        dst1 = cpi->alt_ref_buffer.u_buffer;
-        dst2 = cpi->alt_ref_buffer.v_buffer;
-        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
-        dst2_16 = CONVERT_TO_SHORTPTR(dst2);
-        stride = cpi->alt_ref_buffer.uv_stride;
-        byte = mb_uv_offset;
-        for (i = 0, k = 256; i < mb_uv_height; i++) {
-          for (j = 0; j < mb_uv_width; j++, k++) {
-            int m = k + 256;
-
-            // U
-            dst1_16[byte] =
-                (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
-
-            // V
-            dst2_16[byte] =
-                (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
-
-            // move to next pixel
-            byte++;
+        if (num_planes > 1) {
+          dst1 = cpi->alt_ref_buffer.u_buffer;
+          dst2 = cpi->alt_ref_buffer.v_buffer;
+          dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+          dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+          stride = cpi->alt_ref_buffer.uv_stride;
+          byte = mb_uv_offset;
+          for (i = 0, k = 256; i < mb_uv_height; i++) {
+            for (j = 0; j < mb_uv_width; j++, k++) {
+              int m = k + 256;
+              // U
+              dst1_16[byte] =
+                  (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+              // V
+              dst2_16[byte] =
+                  (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+              // move to next pixel
+              byte++;
+            }
+            byte += stride - mb_uv_width;
           }
-
-          byte += stride - mb_uv_width;
         }
       } else {
-#endif  // CONFIG_HIGHBITDEPTH
-#if CONFIG_BGSPRITE
-        dst1 = target->y_buffer;
-        stride = target->y_stride;
-#else
-      dst1 = cpi->alt_ref_buffer.y_buffer;
-      stride = cpi->alt_ref_buffer.y_stride;
-#endif  // CONFIG_BGSPRITE
+        dst1 = cpi->alt_ref_buffer.y_buffer;
+        stride = cpi->alt_ref_buffer.y_stride;
         byte = mb_y_offset;
         for (i = 0, k = 0; i < 16; i++) {
           for (j = 0; j < 16; j++, k++) {
@@ -533,36 +477,27 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
           }
           byte += stride - 16;
         }
-#if CONFIG_BGSPRITE
-        dst1 = target->u_buffer;
-        dst2 = target->v_buffer;
-        stride = target->uv_stride;
-#else
-      dst1 = cpi->alt_ref_buffer.u_buffer;
-      dst2 = cpi->alt_ref_buffer.v_buffer;
-      stride = cpi->alt_ref_buffer.uv_stride;
-#endif  // CONFIG_BGSPRITE
-        byte = mb_uv_offset;
-        for (i = 0, k = 256; i < mb_uv_height; i++) {
-          for (j = 0; j < mb_uv_width; j++, k++) {
-            int m = k + 256;
-
-            // U
-            dst1[byte] =
-                (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
-
-            // V
-            dst2[byte] =
-                (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
-
-            // move to next pixel
-            byte++;
+        if (num_planes > 1) {
+          dst1 = cpi->alt_ref_buffer.u_buffer;
+          dst2 = cpi->alt_ref_buffer.v_buffer;
+          stride = cpi->alt_ref_buffer.uv_stride;
+          byte = mb_uv_offset;
+          for (i = 0, k = 256; i < mb_uv_height; i++) {
+            for (j = 0; j < mb_uv_width; j++, k++) {
+              int m = k + 256;
+              // U
+              dst1[byte] =
+                  (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+              // V
+              dst2[byte] =
+                  (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+              // move to next pixel
+              byte++;
+            }
+            byte += stride - mb_uv_width;
           }
-          byte += stride - mb_uv_width;
         }
-#if CONFIG_HIGHBITDEPTH
       }
-#endif  // CONFIG_HIGHBITDEPTH
       mb_y_offset += 16;
       mb_uv_offset += mb_uv_width;
     }
@@ -571,7 +506,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
   }
 
   // Restore input state
-  for (i = 0; i < MAX_MB_PLANE; i++) mbd->plane[i].pre[0].buf = input_buffer[i];
+  for (i = 0; i < num_planes; i++) mbd->plane[i].pre[0].buf = input_buffer[i];
 }
 
 // Apply buffer limits and context specific adjustments to arnr filter.
@@ -633,11 +568,7 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
   *arnr_strength = strength;
 }
 
-void av1_temporal_filter(AV1_COMP *cpi,
-#if CONFIG_BGSPRITE
-                         YV12_BUFFER_CONFIG *bg, YV12_BUFFER_CONFIG *target,
-#endif  // CONFIG_BGSPRITE
-                         int distance) {
+void av1_temporal_filter(AV1_COMP *cpi, int distance) {
   RATE_CONTROL *const rc = &cpi->rc;
   int frame;
   int frames_to_blur;
@@ -647,17 +578,14 @@ void av1_temporal_filter(AV1_COMP *cpi,
   int frames_to_blur_forward;
   struct scale_factors sf;
   YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
-#if CONFIG_EXT_REFS
   const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-#endif  // CONFIG_EXT_REFS
 
   // Apply context specific adjustments to the arnr filter parameters.
   adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
-// TODO(weitinglin): Currently, we enforce the filtering strength on
-//                   extra ARFs' to be zeros. We should investigate in which
-//                   case it is more beneficial to use non-zero strength
-//                   filtering.
-#if CONFIG_EXT_REFS
+  // TODO(weitinglin): Currently, we enforce the filtering strength on
+  //                   extra ARFs' to be zeros. We should investigate in which
+  //                   case it is more beneficial to use non-zero strength
+  //                   filtering.
   if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
     strength = 0;
     frames_to_blur = 1;
@@ -685,7 +613,7 @@ void av1_temporal_filter(AV1_COMP *cpi,
     cpi->is_arf_filter_off[which_arf] = 1;
   else
     cpi->is_arf_filter_off[which_arf] = 0;
-#endif  // CONFIG_EXT_REFS
+  cpi->common.showable_frame = cpi->is_arf_filter_off[which_arf];
 
   frames_to_blur_backward = (frames_to_blur / 2);
   frames_to_blur_forward = ((frames_to_blur - 1) / 2);
@@ -694,40 +622,20 @@ void av1_temporal_filter(AV1_COMP *cpi,
   // Setup frame pointers, NULL indicates frame not included in filter.
   for (frame = 0; frame < frames_to_blur; ++frame) {
     const int which_buffer = start_frame - frame;
-#if CONFIG_BGSPRITE
-    if (frame == frames_to_blur_backward && bg != NULL) {
-      // Insert bg into frames at ARF index.
-      frames[frames_to_blur - 1 - frame] = bg;
-    } else {
-#endif  // CONFIG_BGSPRITE
-      struct lookahead_entry *buf =
-          av1_lookahead_peek(cpi->lookahead, which_buffer);
-      frames[frames_to_blur - 1 - frame] = &buf->img;
-#if CONFIG_BGSPRITE
-    }
-#endif  // CONFIG_BGSPRITE
+    struct lookahead_entry *buf =
+        av1_lookahead_peek(cpi->lookahead, which_buffer);
+    frames[frames_to_blur - 1 - frame] = &buf->img;
   }
 
   if (frames_to_blur > 0) {
-// Setup scaling factors. Scaling on each of the arnr frames is not
-// supported.
-// ARF is produced at the native frame size and resized when coded.
-#if CONFIG_HIGHBITDEPTH
-    av1_setup_scale_factors_for_frame(
-        &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
-        frames[0]->y_crop_width, frames[0]->y_crop_height,
-        cpi->common.use_highbitdepth);
-#else
+    // Setup scaling factors. Scaling on each of the arnr frames is not
+    // supported.
+    // ARF is produced at the native frame size and resized when coded.
     av1_setup_scale_factors_for_frame(
         &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
         frames[0]->y_crop_width, frames[0]->y_crop_height);
-#endif  // CONFIG_HIGHBITDEPTH
   }
 
-  temporal_filter_iterate_c(cpi,
-#if CONFIG_BGSPRITE
-                            target,
-#endif  // CONFIG_BGSPRITE
-                            frames, frames_to_blur, frames_to_blur_backward,
-                            strength, &sf);
+  temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+                            frames_to_blur_backward, strength, &sf);
 }
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
index 7dd9fad58..bc0863a63 100644
--- a/third_party/aom/av1/encoder/temporal_filter.h
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -16,11 +16,7 @@
 extern "C" {
 #endif
 
-void av1_temporal_filter(AV1_COMP *cpi,
-#if CONFIG_BGSPRITE
-                         YV12_BUFFER_CONFIG *bg, YV12_BUFFER_CONFIG *target,
-#endif  // CONFIG_BGSPRITE
-                         int distance);
+void av1_temporal_filter(AV1_COMP *cpi, int distance);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c
index a2e24d66b..16a6a9a35 100644
--- a/third_party/aom/av1/encoder/tokenize.c
+++ b/third_party/aom/av1/encoder/tokenize.c
@@ -23,314 +23,13 @@
 
 #include "av1/encoder/cost.h"
 #include "av1/encoder/encoder.h"
-#if CONFIG_LV_MAP
-#include "av1/encoder/encodetxb.c"
-#endif
+#include "av1/encoder/encodetxb.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 
-static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
-  { 9, 63 }, { 9, 61 }, { 9, 59 }, { 9, 57 }, { 9, 55 }, { 9, 53 }, { 9, 51 },
-  { 9, 49 }, { 9, 47 }, { 9, 45 }, { 9, 43 }, { 9, 41 }, { 9, 39 }, { 9, 37 },
-  { 9, 35 }, { 9, 33 }, { 9, 31 }, { 9, 29 }, { 9, 27 }, { 9, 25 }, { 9, 23 },
-  { 9, 21 }, { 9, 19 }, { 9, 17 }, { 9, 15 }, { 9, 13 }, { 9, 11 }, { 9, 9 },
-  { 9, 7 },  { 9, 5 },  { 9, 3 },  { 9, 1 },  { 8, 31 }, { 8, 29 }, { 8, 27 },
-  { 8, 25 }, { 8, 23 }, { 8, 21 }, { 8, 19 }, { 8, 17 }, { 8, 15 }, { 8, 13 },
-  { 8, 11 }, { 8, 9 },  { 8, 7 },  { 8, 5 },  { 8, 3 },  { 8, 1 },  { 7, 15 },
-  { 7, 13 }, { 7, 11 }, { 7, 9 },  { 7, 7 },  { 7, 5 },  { 7, 3 },  { 7, 1 },
-  { 6, 7 },  { 6, 5 },  { 6, 3 },  { 6, 1 },  { 5, 3 },  { 5, 1 },  { 4, 1 },
-  { 3, 1 },  { 2, 1 },  { 1, 1 },  { 0, 0 },  { 1, 0 },  { 2, 0 },  { 3, 0 },
-  { 4, 0 },  { 5, 0 },  { 5, 2 },  { 6, 0 },  { 6, 2 },  { 6, 4 },  { 6, 6 },
-  { 7, 0 },  { 7, 2 },  { 7, 4 },  { 7, 6 },  { 7, 8 },  { 7, 10 }, { 7, 12 },
-  { 7, 14 }, { 8, 0 },  { 8, 2 },  { 8, 4 },  { 8, 6 },  { 8, 8 },  { 8, 10 },
-  { 8, 12 }, { 8, 14 }, { 8, 16 }, { 8, 18 }, { 8, 20 }, { 8, 22 }, { 8, 24 },
-  { 8, 26 }, { 8, 28 }, { 8, 30 }, { 9, 0 },  { 9, 2 },  { 9, 4 },  { 9, 6 },
-  { 9, 8 },  { 9, 10 }, { 9, 12 }, { 9, 14 }, { 9, 16 }, { 9, 18 }, { 9, 20 },
-  { 9, 22 }, { 9, 24 }, { 9, 26 }, { 9, 28 }, { 9, 30 }, { 9, 32 }, { 9, 34 },
-  { 9, 36 }, { 9, 38 }, { 9, 40 }, { 9, 42 }, { 9, 44 }, { 9, 46 }, { 9, 48 },
-  { 9, 50 }, { 9, 52 }, { 9, 54 }, { 9, 56 }, { 9, 58 }, { 9, 60 }, { 9, 62 }
-};
-const TOKENVALUE *av1_dct_cat_lt_10_value_tokens =
-    dct_cat_lt_10_value_tokens +
-    (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens)) /
-        2;
-// The corresponding costs of the extrabits for the tokens in the above table
-// are stored in the table below. The values are obtained from looking up the
-// entry for the specified extrabits in the table corresponding to the token
-// (as defined in cost element av1_extra_bits)
-// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1]
-static const int dct_cat_lt_10_value_cost[] = {
-  3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531, 3432, 3409, 3363, 3340, 3282,
-  3259, 3213, 3190, 3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894, 2795, 2772,
-  2726, 2703, 2645, 2622, 2576, 2553, 3197, 3116, 3058, 2977, 2881, 2800, 2742,
-  2661, 2615, 2534, 2476, 2395, 2299, 2218, 2160, 2079, 2566, 2427, 2334, 2195,
-  2023, 1884, 1791, 1652, 1893, 1696, 1453, 1256, 1229, 864,  512,  512,  512,
-  512,  0,    512,  512,  512,  512,  864,  1229, 1256, 1453, 1696, 1893, 1652,
-  1791, 1884, 2023, 2195, 2334, 2427, 2566, 2079, 2160, 2218, 2299, 2395, 2476,
-  2534, 2615, 2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197, 2553, 2576, 2622,
-  2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
-  3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432, 3531, 3554, 3600, 3623, 3681,
-  3704, 3750, 3773,
-};
-const int *av1_dct_cat_lt_10_value_cost =
-    dct_cat_lt_10_value_cost +
-    (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost)) / 2;
-
-// Array indices are identical to previously-existing CONTEXT_NODE indices
-/* clang-format off */
-const aom_tree_index av1_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
-  -EOB_TOKEN, 2,                       // 0  = EOB
-  -ZERO_TOKEN, 4,                      // 1  = ZERO
-  -ONE_TOKEN, 6,                       // 2  = ONE
-  8, 12,                               // 3  = LOW_VAL
-  -TWO_TOKEN, 10,                      // 4  = TWO
-  -THREE_TOKEN, -FOUR_TOKEN,           // 5  = THREE
-  14, 16,                              // 6  = HIGH_LOW
-  -CATEGORY1_TOKEN, -CATEGORY2_TOKEN,  // 7  = CAT_ONE
-  18, 20,                              // 8  = CAT_THREEFOUR
-  -CATEGORY3_TOKEN, -CATEGORY4_TOKEN,  // 9  = CAT_THREE
-  -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 10 = CAT_FIVE
-};
-/* clang-format on */
-
-static const int16_t zero_cost[] = { 0 };
-static const int16_t sign_cost[1] = { 512 };
-static const int16_t cat1_cost[1 << 1] = { 864, 1229 };
-static const int16_t cat2_cost[1 << 2] = { 1256, 1453, 1696, 1893 };
-static const int16_t cat3_cost[1 << 3] = { 1652, 1791, 1884, 2023,
-                                           2195, 2334, 2427, 2566 };
-static const int16_t cat4_cost[1 << 4] = { 2079, 2160, 2218, 2299, 2395, 2476,
-                                           2534, 2615, 2661, 2742, 2800, 2881,
-                                           2977, 3058, 3116, 3197 };
-static const int16_t cat5_cost[1 << 5] = {
-  2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963,
-  2986, 3044, 3067, 3113, 3136, 3190, 3213, 3259, 3282, 3340, 3363,
-  3409, 3432, 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773
-};
-const int16_t av1_cat6_low_cost[256] = {
-  3378, 3390, 3401, 3413, 3435, 3447, 3458, 3470, 3517, 3529, 3540, 3552, 3574,
-  3586, 3597, 3609, 3671, 3683, 3694, 3706, 3728, 3740, 3751, 3763, 3810, 3822,
-  3833, 3845, 3867, 3879, 3890, 3902, 3973, 3985, 3996, 4008, 4030, 4042, 4053,
-  4065, 4112, 4124, 4135, 4147, 4169, 4181, 4192, 4204, 4266, 4278, 4289, 4301,
-  4323, 4335, 4346, 4358, 4405, 4417, 4428, 4440, 4462, 4474, 4485, 4497, 4253,
-  4265, 4276, 4288, 4310, 4322, 4333, 4345, 4392, 4404, 4415, 4427, 4449, 4461,
-  4472, 4484, 4546, 4558, 4569, 4581, 4603, 4615, 4626, 4638, 4685, 4697, 4708,
-  4720, 4742, 4754, 4765, 4777, 4848, 4860, 4871, 4883, 4905, 4917, 4928, 4940,
-  4987, 4999, 5010, 5022, 5044, 5056, 5067, 5079, 5141, 5153, 5164, 5176, 5198,
-  5210, 5221, 5233, 5280, 5292, 5303, 5315, 5337, 5349, 5360, 5372, 4988, 5000,
-  5011, 5023, 5045, 5057, 5068, 5080, 5127, 5139, 5150, 5162, 5184, 5196, 5207,
-  5219, 5281, 5293, 5304, 5316, 5338, 5350, 5361, 5373, 5420, 5432, 5443, 5455,
-  5477, 5489, 5500, 5512, 5583, 5595, 5606, 5618, 5640, 5652, 5663, 5675, 5722,
-  5734, 5745, 5757, 5779, 5791, 5802, 5814, 5876, 5888, 5899, 5911, 5933, 5945,
-  5956, 5968, 6015, 6027, 6038, 6050, 6072, 6084, 6095, 6107, 5863, 5875, 5886,
-  5898, 5920, 5932, 5943, 5955, 6002, 6014, 6025, 6037, 6059, 6071, 6082, 6094,
-  6156, 6168, 6179, 6191, 6213, 6225, 6236, 6248, 6295, 6307, 6318, 6330, 6352,
-  6364, 6375, 6387, 6458, 6470, 6481, 6493, 6515, 6527, 6538, 6550, 6597, 6609,
-  6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831,
-  6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982
-};
-const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES] = {
-  100,   2263,  2739,  4902,  3160,  5323,  5799,  7962,  3678,  5841,  6317,
-  8480,  6738,  8901,  9377,  11540, 3678,  5841,  6317,  8480,  6738,  8901,
-  9377,  11540, 7256,  9419,  9895,  12058, 10316, 12479, 12955, 15118, 3678,
-  5841,  6317,  8480,  6738,  8901,  9377,  11540, 7256,  9419,  9895,  12058,
-  10316, 12479, 12955, 15118, 7256,  9419,  9895,  12058, 10316, 12479, 12955,
-  15118, 10834, 12997, 13473, 15636, 13894, 16057, 16533, 18696,
-#if CONFIG_HIGHBITDEPTH
-  4193,  6356,  6832,  8995,  7253,  9416,  9892,  12055, 7771,  9934,  10410,
-  12573, 10831, 12994, 13470, 15633, 7771,  9934,  10410, 12573, 10831, 12994,
-  13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771,
-  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151,
-  14409, 16572, 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048,
-  19211, 14927, 17090, 17566, 19729, 17987, 20150, 20626, 22789, 4193,  6356,
-  6832,  8995,  7253,  9416,  9892,  12055, 7771,  9934,  10410, 12573, 10831,
-  12994, 13470, 15633, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633,
-  11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771,  9934,  10410,
-  12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572,
-  17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927,
-  17090, 17566, 19729, 17987, 20150, 20626, 22789, 8286,  10449, 10925, 13088,
-  11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563,
-  19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605,
-  18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924,
-  17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
-  15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659,
-  23822, 22080, 24243, 24719, 26882, 4193,  6356,  6832,  8995,  7253,  9416,
-  9892,  12055, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 7771,
-  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151,
-  14409, 16572, 17048, 19211, 7771,  9934,  10410, 12573, 10831, 12994, 13470,
-  15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, 13512,
-  13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, 17987,
-  20150, 20626, 22789, 8286,  10449, 10925, 13088, 11346, 13509, 13985, 16148,
-  11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503,
-  16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665,
-  21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442,
-  17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244,
-  18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719,
-  26882, 8286,  10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027,
-  14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924,
-  17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
-  11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081,
-  20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665,
-  21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379,
-  14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759,
-  19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
-  23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120,
-  18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595,
-  24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397,
-  23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, 4193,  6356,  6832,
-  8995,  7253,  9416,  9892,  12055, 7771,  9934,  10410, 12573, 10831, 12994,
-  13470, 15633, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349,
-  13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771,  9934,  10410, 12573,
-  10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048,
-  19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090,
-  17566, 19729, 17987, 20150, 20626, 22789, 8286,  10449, 10925, 13088, 11346,
-  13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
-  11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081,
-  20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087,
-  17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442,
-  17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822,
-  22080, 24243, 24719, 26882, 8286,  10449, 10925, 13088, 11346, 13509, 13985,
-  16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027,
-  14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502,
-  20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
-  15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081,
-  20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243,
-  24719, 26882, 12379, 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957,
-  18120, 18596, 20759, 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759,
-  19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234,
-  27397, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698,
-  22174, 24337, 22595, 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595,
-  24758, 25234, 27397, 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975,
-  8286,  10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503,
-  16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087,
-  17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864,
-  14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244,
-  18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141,
-  23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542,
-  15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017,
-  21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819,
-  19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596,
-  20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758,
-  25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113,
-  25276, 25752, 27915, 26173, 28336, 28812, 30975, 12379, 14542, 15018, 17181,
-  15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
-  23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698,
-  22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017,
-  21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397,
-  19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752,
-  27915, 26173, 28336, 28812, 30975, 16472, 18635, 19111, 21274, 19532, 21695,
-  22171, 24334, 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 20050,
-  22213, 22689, 24852, 23110, 25273, 25749, 27912, 23628, 25791, 26267, 28430,
-  26688, 28851, 29327, 31490, 20050, 22213, 22689, 24852, 23110, 25273, 25749,
-  27912, 23628, 25791, 26267, 28430, 26688, 28851, 29327, 31490, 23628, 25791,
-  26267, 28430, 26688, 28851, 29327, 31490, 27206, 29369, 29845, 32008, 30266,
-  32429, 32905, 35068
-#endif
-};
-
-const uint8_t av1_cat6_skipped_bits_discount[8] = {
-  0, 3, 6, 9, 12, 18, 24, 30
-};
-
-#if CONFIG_NEW_MULTISYMBOL
-const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = {
-  { 0, 0, 0, zero_cost },                        // ZERO_TOKEN
-  { 0, 0, 1, sign_cost },                        // ONE_TOKEN
-  { 0, 0, 2, sign_cost },                        // TWO_TOKEN
-  { 0, 0, 3, sign_cost },                        // THREE_TOKEN
-  { 0, 0, 4, sign_cost },                        // FOUR_TOKEN
-  { av1_cat1_cdf, 1, CAT1_MIN_VAL, cat1_cost },  // CATEGORY1_TOKEN
-  { av1_cat2_cdf, 2, CAT2_MIN_VAL, cat2_cost },  // CATEGORY2_TOKEN
-  { av1_cat3_cdf, 3, CAT3_MIN_VAL, cat3_cost },  // CATEGORY3_TOKEN
-  { av1_cat4_cdf, 4, CAT4_MIN_VAL, cat4_cost },  // CATEGORY4_TOKEN
-  { av1_cat5_cdf, 5, CAT5_MIN_VAL, cat5_cost },  // CATEGORY5_TOKEN
-  { av1_cat6_cdf, 18, CAT6_MIN_VAL, 0 },         // CATEGORY6_TOKEN
-  { 0, 0, 0, zero_cost }                         // EOB_TOKEN
-};
-#else
-const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = {
-  { 0, 0, 0, zero_cost },                         // ZERO_TOKEN
-  { 0, 0, 1, sign_cost },                         // ONE_TOKEN
-  { 0, 0, 2, sign_cost },                         // TWO_TOKEN
-  { 0, 0, 3, sign_cost },                         // THREE_TOKEN
-  { 0, 0, 4, sign_cost },                         // FOUR_TOKEN
-  { av1_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost },  // CATEGORY1_TOKEN
-  { av1_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost },  // CATEGORY2_TOKEN
-  { av1_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost },  // CATEGORY3_TOKEN
-  { av1_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost },  // CATEGORY4_TOKEN
-  { av1_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost },  // CATEGORY5_TOKEN
-  { av1_cat6_prob, 18, CAT6_MIN_VAL, 0 },         // CATEGORY6_TOKEN
-  { 0, 0, 0, zero_cost }                          // EOB_TOKEN
-};
-#endif
-
-#if !CONFIG_PVQ || CONFIG_VAR_TX
-static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col,
-                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
-  struct tokenize_b_args *const args = arg;
-  const AV1_COMP *const cpi = args->cpi;
-  const AV1_COMMON *cm = &cpi->common;
-  ThreadData *const td = args->td;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  const PLANE_TYPE type = pd->plane_type;
-  const TX_TYPE tx_type =
-      av1_get_tx_type(type, xd, blk_row, blk_col, block, tx_size);
-  const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-  const int rate = av1_cost_coeffs(
-      cpi, x, plane, blk_row, blk_col, block, tx_size, scan_order,
-      pd->above_context + blk_col, pd->left_context + blk_row, 0);
-  args->this_rate += rate;
-  (void)plane_bsize;
-  av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col,
-                   blk_row);
-}
-
-static void set_entropy_context_b(int plane, int block, int blk_row,
-                                  int blk_col, BLOCK_SIZE plane_bsize,
-                                  TX_SIZE tx_size, void *arg) {
-  struct tokenize_b_args *const args = arg;
-  ThreadData *const td = args->td;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  (void)plane_bsize;
-  av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col,
-                   blk_row);
-}
-
-static INLINE void add_token(TOKENEXTRA **t,
-                             aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)],
-                             aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)],
-                             int eob_val, int first_val, int32_t extra,
-                             uint8_t token) {
-  (*t)->token = token;
-  (*t)->extra = extra;
-  (*t)->tail_cdf = tail_cdf;
-  (*t)->head_cdf = head_cdf;
-  (*t)->eob_val = eob_val;
-  (*t)->first_val = first_val;
-  (*t)++;
-
-  if (token == BLOCK_Z_TOKEN) {
-    update_cdf(*head_cdf, 0, HEAD_TOKENS + 1);
-  } else {
-    if (eob_val != LAST_EOB) {
-      const int symb = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + first_val;
-      update_cdf(*head_cdf, symb, HEAD_TOKENS + first_val);
-    }
-    if (token > ONE_TOKEN)
-      update_cdf(*tail_cdf, token - TWO_TOKEN, TAIL_TOKENS);
-  }
-}
-#endif  // !CONFIG_PVQ || CONFIG_VAR_TX
-
 static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
-                                 int calc_rate) {
+                                 int plane, int calc_rate, int allow_update_cdf,
+                                 FRAME_COUNTS *counts) {
   const uint8_t *const color_map = param->color_map;
   MapCdf map_cdf = param->map_cdf;
   ColorCost color_cost = param->color_cost;
@@ -338,28 +37,37 @@ static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
   const int rows = param->rows;
   const int cols = param->cols;
   const int n = param->n_colors;
-
+  const int palette_size_idx = n - PALETTE_MIN_SIZE;
   int this_rate = 0;
   uint8_t color_order[PALETTE_MAX_SIZE];
-#if CONFIG_PALETTE_THROUGHPUT
+
+  (void)plane;
+  (void)counts;
+
   for (int k = 1; k < rows + cols - 1; ++k) {
     for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
       int i = k - j;
-#else
-  for (int i = 0; i < rows; ++i) {
-    for (int j = (i == 0 ? 1 : 0); j < cols; ++j) {
-#endif  // CONFIG_PALETTE_THROUGHPUT
       int color_new_idx;
       const int color_ctx = av1_get_palette_color_index_context(
           color_map, plane_block_width, i, j, n, color_order, &color_new_idx);
       assert(color_new_idx >= 0 && color_new_idx < n);
       if (calc_rate) {
-        this_rate +=
-            (*color_cost)[n - PALETTE_MIN_SIZE][color_ctx][color_new_idx];
+        this_rate += (*color_cost)[palette_size_idx][color_ctx][color_new_idx];
       } else {
         (*t)->token = color_new_idx;
-        (*t)->color_map_cdf = map_cdf[n - PALETTE_MIN_SIZE][color_ctx];
+        (*t)->color_map_cdf = map_cdf[palette_size_idx][color_ctx];
         ++(*t);
+        if (allow_update_cdf)
+          update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n);
+#if CONFIG_ENTROPY_STATS
+        if (plane) {
+          ++counts->palette_uv_color_index[palette_size_idx][color_ctx]
+                                          [color_new_idx];
+        } else {
+          ++counts->palette_y_color_index[palette_size_idx][color_ctx]
+                                         [color_new_idx];
+        }
+#endif
       }
     }
   }
@@ -370,7 +78,7 @@ static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
 static void get_palette_params(const MACROBLOCK *const x, int plane,
                                BLOCK_SIZE bsize, Av1ColorMapParam *params) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
   const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   params->color_map = xd->plane[plane].color_index_map;
   params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
@@ -382,263 +90,62 @@ static void get_palette_params(const MACROBLOCK *const x, int plane,
                            &params->rows, &params->cols);
 }
 
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-static void get_mrc_params(const MACROBLOCK *const x, int block,
-                           TX_SIZE tx_size, Av1ColorMapParam *params) {
-  memset(params, 0, sizeof(*params));
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const int is_inter = is_inter_block(mbmi);
-  params->color_map = BLOCK_OFFSET(xd->mrc_mask, block);
-  params->map_cdf = is_inter ? xd->tile_ctx->mrc_mask_inter_cdf
-                             : xd->tile_ctx->mrc_mask_intra_cdf;
-  params->color_cost =
-      is_inter ? &x->mrc_mask_inter_cost : &x->mrc_mask_intra_cost;
-  params->n_colors = 2;
-  params->plane_width = tx_size_wide[tx_size];
-  params->rows = tx_size_high[tx_size];
-  params->cols = tx_size_wide[tx_size];
-}
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-
 static void get_color_map_params(const MACROBLOCK *const x, int plane,
-                                 int block, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                 BLOCK_SIZE bsize, TX_SIZE tx_size,
                                  COLOR_MAP_TYPE type,
                                  Av1ColorMapParam *params) {
-  (void)block;
   (void)tx_size;
   memset(params, 0, sizeof(*params));
   switch (type) {
     case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break;
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-    case MRC_MAP: get_mrc_params(x, block, tx_size, params); break;
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
     default: assert(0 && "Invalid color map type"); return;
   }
 }
 
-int av1_cost_color_map(const MACROBLOCK *const x, int plane, int block,
-                       BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type) {
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+                       TX_SIZE tx_size, COLOR_MAP_TYPE type) {
   assert(plane == 0 || plane == 1);
   Av1ColorMapParam color_map_params;
-  get_color_map_params(x, plane, block, bsize, tx_size, type,
-                       &color_map_params);
-  return cost_and_tokenize_map(&color_map_params, NULL, 1);
+  get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
+  return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL);
 }
 
-void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, int block,
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
                             TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
-                            COLOR_MAP_TYPE type) {
+                            COLOR_MAP_TYPE type, int allow_update_cdf,
+                            FRAME_COUNTS *counts) {
   assert(plane == 0 || plane == 1);
-#if CONFIG_MRC_TX
-  if (type == MRC_MAP) {
-    const int is_inter = is_inter_block(&x->e_mbd.mi[0]->mbmi);
-    if ((is_inter && !SIGNAL_MRC_MASK_INTER) ||
-        (!is_inter && !SIGNAL_MRC_MASK_INTRA))
-      return;
-  }
-#endif  // CONFIG_MRC_TX
   Av1ColorMapParam color_map_params;
-  get_color_map_params(x, plane, block, bsize, tx_size, type,
-                       &color_map_params);
+  get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
   // The first color index does not use context or entropy.
   (*t)->token = color_map_params.color_map[0];
   (*t)->color_map_cdf = NULL;
   ++(*t);
-  cost_and_tokenize_map(&color_map_params, t, 0);
-}
-
-#if CONFIG_PVQ
-static void add_pvq_block(AV1_COMMON *const cm, MACROBLOCK *const x,
-                          PVQ_INFO *pvq) {
-  PVQ_QUEUE *q = x->pvq_q;
-  if (q->curr_pos >= q->buf_len) {
-    int new_buf_len = 2 * q->buf_len + 1;
-    PVQ_INFO *new_buf;
-    CHECK_MEM_ERROR(cm, new_buf, aom_malloc(new_buf_len * sizeof(PVQ_INFO)));
-    memcpy(new_buf, q->buf, q->buf_len * sizeof(PVQ_INFO));
-    aom_free(q->buf);
-    q->buf = new_buf;
-    q->buf_len = new_buf_len;
-  }
-  OD_COPY(q->buf + q->curr_pos, pvq, 1);
-  ++q->curr_pos;
-}
-
-// NOTE: This does not actually generate tokens, instead we store the encoding
-// decisions made for PVQ in a queue that we will read from when
-// actually writing the bitstream in write_modes_b
-static void tokenize_pvq(int plane, int block, int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
-  struct tokenize_b_args *const args = arg;
-  const AV1_COMP *cpi = args->cpi;
-  const AV1_COMMON *const cm = &cpi->common;
-  ThreadData *const td = args->td;
-  MACROBLOCK *const x = &td->mb;
-  PVQ_INFO *pvq_info;
-
-  (void)block;
-  (void)blk_row;
-  (void)blk_col;
-  (void)plane_bsize;
-  (void)tx_size;
-
-  assert(block < MAX_PVQ_BLOCKS_IN_SB);
-  pvq_info = &x->pvq[block][plane];
-  add_pvq_block((AV1_COMMON * const) cm, x, pvq_info);
-}
-#endif  // CONFIG_PVQ
-
-static void tokenize_b(int plane, int block, int blk_row, int blk_col,
-                       BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
-#if !CONFIG_PVQ
-  struct tokenize_b_args *const args = arg;
-  const AV1_COMP *cpi = args->cpi;
-  const AV1_COMMON *const cm = &cpi->common;
-  ThreadData *const td = args->td;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  TOKENEXTRA **tp = args->tp;
-  uint8_t token_cache[MAX_TX_SQUARE];
-  struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  int pt; /* near block/prev token context index */
-  int c;
-  TOKENEXTRA *t = *tp; /* store tokens starting here */
-  const int eob = p->eobs[block];
-  const PLANE_TYPE type = pd->plane_type;
-  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-#if CONFIG_SUPERTX
-  const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx);
-#else
-  const int segment_id = mbmi->segment_id;
-#endif  // CONFIG_SUEPRTX
-  const int16_t *scan, *nb;
-  const TX_TYPE tx_type =
-      av1_get_tx_type(type, xd, blk_row, blk_col, block, tx_size);
-  const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-  const int ref = is_inter_block(mbmi);
-  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-  aom_cdf_prob(
-      *const coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
-      ec_ctx->coef_head_cdfs[txsize_sqr_map[tx_size]][type][ref];
-  aom_cdf_prob(
-      *const coef_tail_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
-      ec_ctx->coef_tail_cdfs[txsize_sqr_map[tx_size]][type][ref];
-  int eob_val;
-  int first_val = 1;
-  const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
-  const uint8_t *const band = get_band_translate(tx_size);
-  int16_t token;
-  EXTRABIT extra;
-  (void)plane_bsize;
-  pt = get_entropy_context(tx_size, pd->above_context + blk_col,
-                           pd->left_context + blk_row);
-  scan = scan_order->scan;
-  nb = scan_order->neighbors;
-  c = 0;
-
-#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-  if (tx_type == MRC_DCT)
-    av1_tokenize_color_map(x, plane, block, &t, plane_bsize, tx_size, MRC_MAP);
-#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-
-  if (eob == 0)
-    add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], 1,
-              1, 0, BLOCK_Z_TOKEN);
-
-  while (c < eob) {
-    int v = qcoeff[scan[c]];
-    first_val = (c == 0);
-
-    if (!v) {
-      add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt],
-                0, first_val, 0, ZERO_TOKEN);
-      token_cache[scan[c]] = 0;
-    } else {
-      eob_val =
-          (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
-      av1_get_token_extra(v, &token, &extra);
-      add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt],
-                eob_val, first_val, extra, (uint8_t)token);
-      token_cache[scan[c]] = av1_pt_energy_class[token];
-    }
-    ++c;
-    pt = get_coef_context(nb, token_cache, AOMMIN(c, eob - 1));
-  }
-
-#if CONFIG_COEF_INTERLEAVE
-  t->token = EOSB_TOKEN;
-  t++;
-#endif
-
-  *tp = t;
-
-#if CONFIG_ADAPT_SCAN
-  // Since dqcoeff is not available here, we pass qcoeff into
-  // av1_update_scan_count_facade(). The update behavior should be the same
-  // because av1_update_scan_count_facade() only cares if coefficients are zero
-  // or not.
-  av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type,
-                               qcoeff, c);
-#endif
-
-  av1_set_contexts(xd, pd, plane, tx_size, c > 0, blk_col, blk_row);
-#else   // !CONFIG_PVQ
-  tokenize_pvq(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
-#endif  // !CONFIG_PVQ
-}
-
-struct is_skippable_args {
-  uint16_t *eobs;
-  int *skippable;
-};
-static void is_skippable(int plane, int block, int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *argv) {
-  struct is_skippable_args *args = argv;
-  (void)plane;
-  (void)plane_bsize;
-  (void)tx_size;
-  (void)blk_row;
-  (void)blk_col;
-  args->skippable[0] &= (!args->eobs[block]);
+  cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf,
+                        counts);
 }
 
-// TODO(yaowu): rewrite and optimize this function to remove the usage of
-//              av1_foreach_transform_block() and simplify is_skippable().
-int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
-  int result = 1;
-  struct is_skippable_args args = { x->plane[plane].eobs, &result };
-  av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable,
-                                         &args);
-  return result;
-}
-
-#if CONFIG_VAR_TX
 void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
                     TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row,
                     int blk_col, int block, int plane, void *arg) {
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  const int tx_row = blk_row >> (1 - pd->subsampling_y);
-  const int tx_col = blk_col >> (1 - pd->subsampling_x);
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
-  TX_SIZE plane_tx_size;
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
-  plane_tx_size =
-      plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
-            : mbmi->inter_tx_size[tx_row][tx_col];
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
 
-  if (tx_size == plane_tx_size) {
-    plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
-#if CONFIG_LV_MAP
+  if (tx_size == plane_tx_size || plane) {
+    plane_bsize = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
+                                       pd->subsampling_y);
     if (!dry_run) {
       av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
                                         plane_bsize, tx_size, arg);
@@ -649,120 +156,71 @@ void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
       printf("DRY_RUN_COSTCOEFFS is not supported yet\n");
       assert(0);
     }
-#else
-    if (!dry_run)
-      tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
-    else if (dry_run == DRY_RUN_NORMAL)
-      set_entropy_context_b(plane, block, blk_row, blk_col, plane_bsize,
-                            tx_size, arg);
-    else if (dry_run == DRY_RUN_COSTCOEFFS)
-      cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
-#endif
   } else {
-#if CONFIG_RECT_TX_EXT
-    int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize];
-    const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size];
-#else
     // Half the block size in transform block unit.
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-#endif
-    const int bsl = tx_size_wide_unit[sub_txs];
-    int i;
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int step = bsw * bsh;
 
-    assert(bsl > 0);
+    assert(bsw > 0 && bsh > 0);
 
-    for (i = 0; i < 4; ++i) {
-#if CONFIG_RECT_TX_EXT
-      int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs];
-      const int offsetr =
-          is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0)
-                  : blk_row + ((i >> 1) * bsl);
-      const int offsetc =
-          is_qttx ? (is_wide_tx ? 0 : i * tx_size_wide_unit[sub_txs])
-                  : blk_col + ((i & 0x01) * bsl);
-#else
-      const int offsetr = blk_row + ((i >> 1) * bsl);
-      const int offsetc = blk_col + ((i & 0x01) * bsl);
-#endif
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
 
-      int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
-      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
-
-      tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc,
-                     block, plane, arg);
-      block += step;
+        tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc,
+                       block, plane, arg);
+        block += step;
+      }
     }
   }
 }
 
 void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
                            RUN_TYPE dry_run, int mi_row, int mi_col,
-                           BLOCK_SIZE bsize, int *rate) {
+                           BLOCK_SIZE bsize, int *rate,
+                           uint8_t allow_update_cdf) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_LV_MAP
+  MB_MODE_INFO *const mbmi = xd->mi[0];
   (void)t;
-#else
-  TOKENEXTRA *t_backup = *t;
-#endif
-  const int ctx = av1_get_skip_context(xd);
-  const int skip_inc =
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
-  struct tokenize_b_args arg = { cpi, td, t, 0 };
-  int plane;
+  struct tokenize_b_args arg = { cpi, td, t, 0, allow_update_cdf };
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
   if (mbmi->skip) {
-    if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
-    av1_reset_skip_context(xd, mi_row, mi_col, bsize);
-#if !CONFIG_LV_MAP
-    if (dry_run) *t = t_backup;
-#endif
+    av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
     return;
   }
 
-  if (!dry_run) td->counts->skip[ctx][0] += skip_inc;
-#if !CONFIG_LV_MAP
-  else
-    *t = t_backup;
-#endif
-
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-#if CONFIG_CB4X4
+  for (int plane = 0; plane < num_planes; ++plane) {
     if (!is_chroma_reference(mi_row, mi_col, bsize,
                              xd->plane[plane].subsampling_x,
                              xd->plane[plane].subsampling_y)) {
-#if !CONFIG_PVQ && !CONFIG_LV_MAP
-      if (!dry_run) {
-        (*t)->token = EOSB_TOKEN;
-        (*t)++;
-      }
-#endif
       continue;
     }
-#endif
     const struct macroblockd_plane *const pd = &xd->plane[plane];
-#if CONFIG_CHROMA_SUB8X8
+    const BLOCK_SIZE bsizec =
+        scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
     const BLOCK_SIZE plane_bsize =
-        AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
-#else
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#endif
+        get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
     const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-    const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
-    const TX_SIZE max_tx_size = get_vartx_max_txsize(
-        mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y);
+    const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
     const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
     int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
-    int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
+    int bh = block_size_high[txb_size] >> tx_size_high_log2[0];
     int idx, idy;
     int block = 0;
     int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
 
-    const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd);
+    const BLOCK_SIZE max_unit_bsize =
+        get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
     int mu_blocks_wide =
         block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
     int mu_blocks_high =
@@ -785,144 +243,6 @@ void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
         }
       }
     }
-#if !CONFIG_LV_MAP
-    if (!dry_run) {
-      (*t)->token = EOSB_TOKEN;
-      (*t)++;
-    }
-#endif
-  }
-  if (rate) *rate += arg.this_rate;
-}
-#endif  // CONFIG_VAR_TX
-
-void av1_tokenize_sb(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                     RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
-                     const int mi_row, const int mi_col) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const int ctx = av1_get_skip_context(xd);
-  const int skip_inc =
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
-  struct tokenize_b_args arg = { cpi, td, t, 0 };
-  if (mbmi->skip) {
-    if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
-    av1_reset_skip_context(xd, mi_row, mi_col, bsize);
-    return;
-  }
-
-  if (!dry_run) {
-#if CONFIG_COEF_INTERLEAVE
-    td->counts->skip[ctx][0] += skip_inc;
-    av1_foreach_transformed_block_interleave(xd, bsize, tokenize_b, &arg);
-#else
-    int plane;
-
-    td->counts->skip[ctx][0] += skip_inc;
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-#if CONFIG_CB4X4
-      if (!is_chroma_reference(mi_row, mi_col, bsize,
-                               xd->plane[plane].subsampling_x,
-                               xd->plane[plane].subsampling_y)) {
-#if !CONFIG_PVQ
-        (*t)->token = EOSB_TOKEN;
-        (*t)++;
-#endif
-        continue;
-      }
-#else
-      (void)mi_row;
-      (void)mi_col;
-#endif
-      av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
-                                             &arg);
-#if !CONFIG_PVQ
-      (*t)->token = EOSB_TOKEN;
-      (*t)++;
-#endif  // !CONFIG_PVQ
-    }
-#endif
-  }
-#if !CONFIG_PVQ
-  else if (dry_run == DRY_RUN_NORMAL) {
-    int plane;
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-#if CONFIG_CB4X4
-      if (!is_chroma_reference(mi_row, mi_col, bsize,
-                               xd->plane[plane].subsampling_x,
-                               xd->plane[plane].subsampling_y))
-        continue;
-#else
-      (void)mi_row;
-      (void)mi_col;
-#endif
-      av1_foreach_transformed_block_in_plane(xd, bsize, plane,
-                                             set_entropy_context_b, &arg);
-    }
-  } else if (dry_run == DRY_RUN_COSTCOEFFS) {
-    int plane;
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-#if CONFIG_CB4X4
-      if (!is_chroma_reference(mi_row, mi_col, bsize,
-                               xd->plane[plane].subsampling_x,
-                               xd->plane[plane].subsampling_y))
-        continue;
-#else
-      (void)mi_row;
-      (void)mi_col;
-#endif
-      av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b,
-                                             &arg);
-    }
-  }
-#endif  // !CONFIG_PVQ
-
-  if (rate) *rate += arg.this_rate;
-}
-
-#if CONFIG_SUPERTX
-void av1_tokenize_sb_supertx(const AV1_COMP *cpi, ThreadData *td,
-                             TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
-                             int mi_col, BLOCK_SIZE bsize, int *rate) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &td->mb.e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  TOKENEXTRA *t_backup = *t;
-  const int ctx = av1_get_skip_context(xd);
-  const int skip_inc =
-      !segfeature_active(&cm->seg, mbmi->segment_id_supertx, SEG_LVL_SKIP);
-  struct tokenize_b_args arg = { cpi, td, t, 0 };
-  if (mbmi->skip) {
-    if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
-    av1_reset_skip_context(xd, mi_row, mi_col, bsize);
-    if (dry_run) *t = t_backup;
-    return;
-  }
-
-  if (!dry_run) {
-    int plane;
-    td->counts->skip[ctx][0] += skip_inc;
-
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
-                                             &arg);
-      (*t)->token = EOSB_TOKEN;
-      (*t)++;
-    }
-  } else if (dry_run == DRY_RUN_NORMAL) {
-    int plane;
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane)
-      av1_foreach_transformed_block_in_plane(xd, bsize, plane,
-                                             set_entropy_context_b, &arg);
-    *t = t_backup;
-  } else if (dry_run == DRY_RUN_COSTCOEFFS) {
-    int plane;
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane)
-      av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b,
-                                             &arg);
   }
   if (rate) *rate += arg.this_rate;
 }
-#endif  // CONFIG_SUPERTX
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
index 20000e502..de1cbe99c 100644
--- a/third_party/aom/av1/encoder/tokenize.h
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -13,51 +13,29 @@
 #define AV1_ENCODER_TOKENIZE_H_
 
 #include "av1/common/entropy.h"
-
 #include "av1/encoder/block.h"
-#include "av1/encoder/treewriter.h"
+#include "aom_dsp/bitwriter.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define EOSB_TOKEN 127  // Not signalled, encoder only
-
-#if CONFIG_HIGHBITDEPTH
-typedef int32_t EXTRABIT;
-#else
-typedef int16_t EXTRABIT;
-#endif
-
-typedef struct {
-  int16_t token;
-  EXTRABIT extra;
-} TOKENVALUE;
-
 typedef struct {
-  aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)];
-  aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)];
   aom_cdf_prob *color_map_cdf;
-  int eob_val;
-  int first_val;
-  const aom_prob *context_tree;
-  EXTRABIT extra;
+  // TODO(yaowu: use packed enum type if appropriate)
   uint8_t token;
 } TOKENEXTRA;
 
-extern const aom_tree_index av1_coef_tree[];
-extern const aom_tree_index av1_coef_con_tree[];
-
-int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
-
 struct AV1_COMP;
 struct ThreadData;
+struct FRAME_COUNTS;
 
 struct tokenize_b_args {
   const struct AV1_COMP *cpi;
   struct ThreadData *td;
   TOKENEXTRA **tp;
   int this_rate;
+  uint8_t allow_update_cdf;
 };
 
 typedef enum {
@@ -69,78 +47,22 @@ typedef enum {
 // Note in all the tokenize functions rate if non NULL is incremented
 // with the coefficient token cost only if dry_run = DRY_RUN_COSTCOEFS,
 // otherwise rate is not incremented.
-#if CONFIG_VAR_TX
 void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
                            TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
-                           int mi_col, BLOCK_SIZE bsize, int *rate);
-#endif
+                           int mi_col, BLOCK_SIZE bsize, int *rate,
+                           uint8_t allow_update_cdf);
 
-int av1_cost_color_map(const MACROBLOCK *const x, int plane, int block,
-                       BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type);
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+                       TX_SIZE tx_size, COLOR_MAP_TYPE type);
 
-void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, int block,
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
                             TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
-                            COLOR_MAP_TYPE type);
-
-void av1_tokenize_sb(const struct AV1_COMP *cpi, struct ThreadData *td,
-                     TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                     int *rate, const int mi_row, const int mi_col);
-#if CONFIG_SUPERTX
-void av1_tokenize_sb_supertx(const struct AV1_COMP *cpi, struct ThreadData *td,
-                             TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
-                             int mi_col, BLOCK_SIZE bsize, int *rate);
-#endif
-
-extern const int16_t *av1_dct_value_cost_ptr;
-/* TODO: The Token field should be broken out into a separate char array to
- *  improve cache locality, since it's needed for costing when the rest of the
- *  fields are not.
- */
-extern const TOKENVALUE *av1_dct_value_tokens_ptr;
-extern const TOKENVALUE *av1_dct_cat_lt_10_value_tokens;
-extern const int *av1_dct_cat_lt_10_value_cost;
-extern const int16_t av1_cat6_low_cost[256];
-#if CONFIG_HIGHBITDEPTH
-#define CAT6_HIGH_COST_ENTRIES 1024
-#else
-#define CAT6_HIGH_COST_ENTRIES 64
-#endif
-extern const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES];
-extern const uint8_t av1_cat6_skipped_bits_discount[8];
-
-static INLINE void av1_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
-  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
-    *token = CATEGORY6_TOKEN;
-    if (v >= CAT6_MIN_VAL)
-      *extra = 2 * v - 2 * CAT6_MIN_VAL;
-    else
-      *extra = -2 * v - 2 * CAT6_MIN_VAL + 1;
-    return;
-  }
-  *token = av1_dct_cat_lt_10_value_tokens[v].token;
-  *extra = av1_dct_cat_lt_10_value_tokens[v].extra;
-}
-static INLINE int16_t av1_get_token(int v) {
-  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) return 10;
-  return av1_dct_cat_lt_10_value_tokens[v].token;
-}
-
-static INLINE int av1_get_token_cost(int v, int16_t *token, int cat6_bits) {
-  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
-    EXTRABIT extrabits;
-    *token = CATEGORY6_TOKEN;
-    extrabits = abs(v) - CAT6_MIN_VAL;
-    return av1_cat6_low_cost[extrabits & 0xff] +
-           av1_cat6_high_cost[extrabits >> 8] -
-           av1_cat6_skipped_bits_discount[18 - cat6_bits];
-  }
-  *token = av1_dct_cat_lt_10_value_tokens[v].token;
-  return av1_dct_cat_lt_10_value_cost[v];
-}
+                            COLOR_MAP_TYPE type, int allow_update_cdf,
+                            struct FRAME_COUNTS *counts);
 
 static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
                                  TX_SIZE tx_size) {
-  const int eob_max = tx_size_2d[tx_size];
+  const int eob_max = av1_get_max_eob(tx_size);
   return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
diff --git a/third_party/aom/av1/encoder/treewriter.c b/third_party/aom/av1/encoder/treewriter.c
deleted file mode 100644
index 50be72413..000000000
--- a/third_party/aom/av1/encoder/treewriter.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "av1/encoder/treewriter.h"
-
-static void tree2tok(struct av1_token *tokens, const aom_tree_index *tree,
-                     int i, int v, int l) {
-  v += v;
-  ++l;
-
-  do {
-    const aom_tree_index j = tree[i++];
-    if (j <= 0) {
-      tokens[-j].value = v;
-      tokens[-j].len = l;
-    } else {
-      tree2tok(tokens, tree, j, v, l);
-    }
-  } while (++v & 1);
-}
-
-void av1_tokens_from_tree(struct av1_token *tokens,
-                          const aom_tree_index *tree) {
-  tree2tok(tokens, tree, 0, 0, 0);
-}
-
-static unsigned int convert_distribution(unsigned int i, aom_tree tree,
-                                         unsigned int branch_ct[][2],
-                                         const unsigned int num_events[]) {
-  unsigned int left, right;
-
-  if (tree[i] <= 0)
-    left = num_events[-tree[i]];
-  else
-    left = convert_distribution(tree[i], tree, branch_ct, num_events);
-
-  if (tree[i + 1] <= 0)
-    right = num_events[-tree[i + 1]];
-  else
-    right = convert_distribution(tree[i + 1], tree, branch_ct, num_events);
-
-  branch_ct[i >> 1][0] = left;
-  branch_ct[i >> 1][1] = right;
-  return left + right;
-}
-
-void av1_tree_probs_from_distribution(aom_tree tree,
-                                      unsigned int branch_ct[/* n-1 */][2],
-                                      const unsigned int num_events[/* n */]) {
-  convert_distribution(0, tree, branch_ct, num_events);
-}
diff --git a/third_party/aom/av1/encoder/treewriter.h b/third_party/aom/av1/encoder/treewriter.h
deleted file mode 100644
index 9a4cb86cb..000000000
--- a/third_party/aom/av1/encoder/treewriter.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_ENCODER_TREEWRITER_H_
-#define AV1_ENCODER_TREEWRITER_H_
-
-#include "aom_dsp/bitwriter.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void av1_tree_probs_from_distribution(aom_tree tree,
-                                      unsigned int branch_ct[/* n - 1 */][2],
-                                      const unsigned int num_events[/* n */]);
-
-struct av1_token {
-  int value;
-  int len;
-};
-
-void av1_tokens_from_tree(struct av1_token *, const aom_tree_index *);
-
-static INLINE void av1_write_token(aom_writer *w, const aom_tree_index *tree,
-                                   const aom_prob *probs,
-                                   const struct av1_token *token) {
-  aom_write_tree(w, tree, probs, token->value, token->len, 0);
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AV1_ENCODER_TREEWRITER_H_
diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h
new file mode 100644
index 000000000..69063b801
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h
@@ -0,0 +1,2086 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#define AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// Tx type model for 4x4 block.
+static const float av1_tx_type_nn_weights_4x4_layer0[32] = {
+  0.72406f,  -0.40019f, 0.51795f,  -0.43881f, -0.49746f, -0.41780f, -0.39409f,
+  -0.16183f, -1.00135f, -0.41733f, -0.96534f, 0.93272f,  1.06229f,  0.04188f,
+  0.60919f,  0.92405f,  -0.39359f, 0.70570f,  0.75375f,  1.11966f,  -1.86360f,
+  -0.35421f, 0.18743f,  0.13346f,  -0.21262f, 0.07050f,  0.10533f,  -0.47402f,
+  1.33417f,  1.72899f,  1.17983f,  0.10552f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_layer0[8] = {
+  1.96273f, -0.69845f, -0.10999f, -1.11311f,
+  1.35101f, 0.43842f,  -0.29264f, -1.15376f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_layer1[32] = {
+  0.79770f,  0.08520f,  0.23298f,  0.05285f,  0.87506f,  -0.90784f, -0.06197f,
+  -1.00580f, 0.68639f,  -0.34881f, 0.15366f,  -1.64658f, 0.80755f,  -0.26293f,
+  0.10253f,  -0.23915f, 1.14696f,  -0.10928f, -1.61377f, 0.00863f,  0.98599f,
+  -0.43872f, 0.61196f,  -0.03787f, 1.01060f,  0.17643f,  -0.00208f, -0.15738f,
+  0.06517f,  0.72885f,  0.24387f,  1.28535f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_layer1[4] = {
+  1.23769f,
+  1.40308f,
+  0.09871f,
+  1.82070f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4 = {
+  4,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_4x4_layer0,
+      av1_tx_type_nn_weights_4x4_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_4x4_layer0,
+      av1_tx_type_nn_bias_4x4_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx type model for 4x8 block.
+static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = {
+  0.68355f,  -0.06887f, 0.68525f,  -0.86048f, -0.35906f, -0.28597f, -0.21108f,
+  0.12591f,  -1.13025f, -0.65695f, -0.25658f, 0.39155f,  0.89011f,  0.19258f,
+  0.28316f,  0.61172f,  0.52587f,  0.99182f,  0.75704f,  0.66788f,  -1.61814f,
+  -1.23483f, -0.62868f, -0.11902f, 0.33295f,  0.64796f,  0.92345f,  -0.71821f,
+  0.07575f,  0.34687f,  0.20518f,  -0.87850f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = {
+  1.14049f, -0.18583f, 1.92114f, -0.72057f,
+  1.32715f, 0.96713f,  1.09877f, -0.64345f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = {
+  0.71978f,  0.06896f,  1.48617f,  0.97124f,  -0.02487f, -0.95359f, 0.68983f,
+  -0.16313f, 0.51324f,  -0.33770f, 0.45938f,  -1.08238f, 0.72938f,  0.42300f,
+  0.85691f,  -0.03783f, 1.12617f,  -0.04034f, 0.36923f,  0.25638f,  1.10167f,
+  0.41633f,  0.72602f,  -0.14797f, 0.66888f,  0.11437f,  -0.99797f, -0.20725f,
+  1.01163f,  2.06308f,  1.23331f,  -0.15481f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = {
+  2.14443f,
+  1.98356f,
+  0.74616f,
+  2.58795f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = {
+  4,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_4x8_hor_layer0,
+      av1_tx_type_nn_weights_4x8_hor_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_4x8_hor_layer0,
+      av1_tx_type_nn_bias_4x8_hor_layer1,
+  },
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = {
+  0.88859f,  1.02796f,  1.15509f,  0.61719f,  0.85804f,  1.17581f,  0.93524f,
+  0.06546f,  0.08018f,  -0.78562f, -0.36614f, 0.14149f,  -0.30069f, -0.52647f,
+  -0.82789f, 0.60527f,  -1.74026f, -0.20271f, 0.09875f,  0.03708f,  0.09430f,
+  -0.24043f, -0.38433f, 1.21014f,  1.42443f,  0.69586f,  1.07812f,  1.21748f,
+  1.10989f,  0.93122f,  1.04127f,  0.39424f,  0.95592f,  0.12904f,  0.46330f,
+  0.49722f,  0.46303f,  0.36979f,  0.60227f,  0.39345f,  -2.01632f, -0.05706f,
+  0.07766f,  -0.01271f, -0.16577f, -0.21957f, -0.14800f, 0.24898f,  0.27662f,
+  0.42028f,  0.44748f,  1.14585f,  1.38805f,  0.46182f,  -0.22982f, -0.07324f,
+  0.29886f,  -0.46959f, -0.04228f, -0.01064f, 0.24260f,  -0.32282f, -0.23804f,
+  1.44466f,  -0.42190f, -0.36385f, 0.39746f,  0.38557f,  -0.09624f, -0.21540f,
+  0.57385f,  -0.72878f, -0.39677f, -0.00717f, 0.60499f,  1.33849f,  1.05337f,
+  1.11947f,  0.38487f,  0.86534f,  -0.33970f, 0.71140f,  0.20772f,  0.61132f,
+  0.06181f,  -0.20027f, 0.13736f,  -0.72321f, 0.64586f,  -0.56740f, -0.90912f,
+  -0.20452f, 0.15381f,  -0.84346f, 0.19550f,  0.63164f,  1.35441f,  0.63218f,
+  0.82883f,  0.38803f,  -0.23874f, -0.02962f, 0.23846f,  -0.06822f, -0.40159f,
+  -0.17850f, -0.69524f, 1.12299f,  -0.08286f, -0.14150f, -0.28456f, -0.41519f,
+  -0.12792f, -0.55286f, 0.51655f,  0.06636f,  0.73759f,  0.70072f,  0.12616f,
+  0.31282f,  0.17130f,  -1.34233f, 0.37221f,  0.95838f,  0.16286f,  1.04301f,
+  0.73600f,  -0.11233f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = {
+  -0.89131f, 0.09124f,  -0.71678f, -1.19929f, 0.98963f,  0.16896f,
+  -0.44943f, -0.97532f, -0.13997f, 1.07136f,  -0.46362f, -0.45253f,
+  -0.63015f, -0.20008f, 1.24048f,  -0.21265f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = {
+  -0.79795f, 0.45973f,  -0.54188f, -1.05095f, 0.64404f,  -0.56470f, -0.57018f,
+  0.61644f,  0.50229f,  1.14006f,  0.13805f,  -0.42058f, -0.07468f, 0.66203f,
+  0.93180f,  -0.59662f, -0.25152f, 0.00336f,  1.09769f,  -1.11921f, 0.15151f,
+  0.58750f,  -0.42480f, -0.95908f, -0.10980f, 1.31715f,  0.06665f,  -0.52371f,
+  0.37228f,  -0.12364f, 0.54876f,  -0.32698f, 0.39863f,  -0.97669f, -1.06351f,
+  1.82755f,  1.02851f,  0.10322f,  -0.08322f, 0.08891f,  -0.05715f, 0.93503f,
+  0.02096f,  -0.39506f, -0.99330f, -0.09407f, 0.75108f,  -0.30104f, 1.78314f,
+  -0.01786f, -0.17392f, 0.00461f,  0.41394f,  0.92566f,  1.11251f,  -0.71380f,
+  -0.04907f, 0.12736f,  0.00208f,  0.94451f,  -0.31783f, -0.19655f, 0.64619f,
+  0.50359f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = {
+  0.39274f,
+  1.27276f,
+  0.30322f,
+  2.55238f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_4x8_ver_layer0,
+      av1_tx_type_nn_weights_4x8_ver_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_4x8_ver_layer0,
+      av1_tx_type_nn_bias_4x8_ver_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx type model for 8x4 block.
+static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = {
+  0.64828f,  0.61618f,  0.98975f,  -0.14562f, 0.26957f,  1.80872f,  0.58299f,
+  -0.06917f, 0.00937f,  -0.74073f, -0.66045f, -0.04576f, -0.39802f, -0.76960f,
+  -0.85166f, 0.88799f,  -0.70694f, -0.34366f, -0.54906f, -0.39502f, -0.29465f,
+  -0.49650f, -0.32171f, 1.37181f,  1.30432f,  0.71843f,  1.01916f,  1.01582f,
+  0.90999f,  0.86334f,  1.04603f,  0.40734f,  0.96187f,  0.53742f,  0.07510f,
+  0.44167f,  0.02049f,  -0.02874f, 0.97191f,  1.03647f,  -2.62751f, -0.01390f,
+  -0.09282f, -0.02522f, -0.30849f, -0.19386f, -0.51356f, 0.52778f,  0.77191f,
+  0.75416f,  0.69067f,  0.93561f,  1.35982f,  0.76193f,  0.57869f,  0.00251f,
+  -0.87244f, -0.26922f, -0.06682f, 0.07176f,  0.51142f,  0.58948f,  0.13914f,
+  0.71165f,  -0.40329f, -0.33201f, 0.35293f,  0.33437f,  -0.01812f, -0.24765f,
+  0.26810f,  -0.77088f, 1.35707f,  0.22243f,  0.78402f,  0.66191f,  0.79890f,
+  1.90669f,  0.73189f,  0.24222f,  -0.34682f, 0.66990f,  0.19554f,  0.58414f,
+  0.05060f,  -0.21271f, 0.11656f,  -0.74907f, 0.68837f,  -0.39147f, -1.78263f,
+  -0.69918f, -0.06838f, -0.26927f, 0.38502f,  0.08305f,  1.29848f,  0.67328f,
+  0.67269f,  0.65805f,  -0.47778f, -1.02617f, 0.16523f,  0.12223f,  -0.35294f,
+  -0.15866f, -0.56224f, 1.25895f,  -0.21422f, -0.33518f, -0.33519f, -0.37414f,
+  0.55122f,  0.14806f,  0.44312f,  -0.07865f, 0.75295f,  0.10766f,  0.59922f,
+  0.48837f,  -0.19099f, -2.07991f, 0.35755f,  0.87813f,  0.07559f,  1.00724f,
+  0.25223f,  -0.06761f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = {
+  -0.54227f, 0.08599f,  -0.77447f, -1.10920f, 0.89298f,  0.05454f,
+  -0.73681f, 0.21048f,  -0.41041f, 1.25690f,  -0.60918f, 0.14661f,
+  -0.65392f, -0.25881f, 1.67995f,  -0.03550f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = {
+  -0.22312f, 0.73552f,  0.48399f,  -0.66996f, 0.36527f,  -0.42228f, -1.10793f,
+  0.31167f,  0.16177f,  1.69315f,  -0.06287f, -0.35804f, -0.24889f, 0.80824f,
+  1.08952f,  -0.62838f, 0.30066f,  -0.19043f, -0.00518f, -1.31005f, 0.65797f,
+  1.07714f,  -0.24253f, 0.49779f,  0.05848f,  1.08914f,  0.08015f,  -0.38853f,
+  0.35108f,  -0.11026f, 0.64528f,  -0.37615f, 0.39995f,  -0.58117f, -1.29627f,
+  1.74169f,  0.75558f,  -0.04910f, 0.35020f,  0.04556f,  0.12634f,  1.27223f,
+  0.02608f,  -0.19687f, -0.78649f, -0.22746f, 1.02589f,  -0.28411f, 1.42443f,
+  -0.42115f, -0.21153f, -0.01733f, 0.62001f,  0.87167f,  1.66008f,  -0.39179f,
+  -0.06293f, 0.27012f,  0.16871f,  0.64597f,  0.67358f,  -0.20053f, 0.95830f,
+  0.44232f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = {
+  0.14889f,
+  1.74197f,
+  0.53696f,
+  2.87574f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_8x4_hor_layer0,
+      av1_tx_type_nn_weights_8x4_hor_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_8x4_hor_layer0,
+      av1_tx_type_nn_bias_8x4_hor_layer1,
+  },
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = {
+  0.81919f,  0.15527f,  0.60055f,  -0.54617f, -0.35510f, -0.28223f, -0.20478f,
+  0.15001f,  -1.84806f, -0.30274f, -0.00865f, 0.33939f,  1.11970f,  0.44630f,
+  0.32074f,  0.39637f,  0.08149f,  1.28070f,  0.86703f,  0.76503f,  -1.83991f,
+  -1.13575f, -0.68605f, -0.23690f, 0.07099f,  0.64960f,  0.82543f,  -0.72028f,
+  0.08220f,  0.34338f,  0.20245f,  -0.88920f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = {
+  1.14995f, -0.16021f, 2.38325f, -0.65179f,
+  1.09624f, 1.07662f,  0.63837f, -0.64847f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = {
+  0.10278f,  0.06819f,  1.73885f,  1.29889f,  -0.18482f, -1.06132f, 0.67003f,
+  -0.23280f, 0.50181f,  -0.33890f, 0.43524f,  -1.03147f, 1.09640f,  0.66332f,
+  0.47652f,  -0.02251f, 0.94245f,  -0.03861f, 0.84776f,  0.28377f,  0.92044f,
+  0.23572f,  0.52082f,  -0.16266f, 0.45290f,  0.11342f,  -0.50310f, -0.92633f,
+  1.46345f,  1.84714f,  1.06804f,  -0.13610f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = {
+  2.41028f,
+  1.95675f,
+  0.82387f,
+  2.41923f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
+  4,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_8x4_ver_layer0,
+      av1_tx_type_nn_weights_8x4_ver_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_8x4_ver_layer0,
+      av1_tx_type_nn_bias_8x4_ver_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx type model for 8x8 block.
+static const float av1_tx_type_nn_weights_8x8_layer0[128] = {
+  0.98214f,  1.05643f,  0.91173f,  0.24165f,  0.39961f,  0.25736f,  0.68593f,
+  0.10553f,  0.13353f,  -0.49687f, -1.66413f, 1.16584f,  2.25147f,  -0.72247f,
+  -2.65486f, -0.03628f, -1.47746f, -1.07644f, -1.25551f, -0.91260f, -1.26199f,
+  -1.06022f, -1.42138f, 1.10500f,  2.96552f,  -0.40638f, 0.02258f,  -0.23137f,
+  0.34922f,  -0.01454f, 0.41251f,  0.35944f,  -1.56742f, 0.01406f,  0.88114f,
+  1.42462f,  0.87243f,  0.02439f,  0.07035f,  0.34303f,  -3.16843f, 0.25798f,
+  0.07494f,  0.38926f,  -0.12267f, 0.09049f,  -0.36711f, 0.01551f,  1.41269f,
+  1.33505f,  1.43627f,  1.41909f,  1.44605f,  1.43008f,  1.36721f,  0.19443f,
+  -0.08606f, 0.17285f,  0.63692f,  0.92092f,  0.61007f,  0.87100f,  -0.33631f,
+  1.98025f,  -0.40686f, -0.33808f, 0.34919f,  0.33817f,  -0.01807f, -0.25259f,
+  0.26442f,  -0.76979f, 1.07788f,  -1.38747f, 1.34315f,  2.79947f,  2.02838f,
+  -0.25062f, 0.00174f,  1.25888f,  0.17344f,  0.20897f,  1.28765f,  1.95749f,
+  1.62351f,  1.04556f,  0.43858f,  0.12463f,  1.66399f,  0.03971f,  0.36614f,
+  0.56932f,  0.15982f,  0.11587f,  0.21402f,  1.89386f,  -0.91267f, -0.79781f,
+  1.79155f,  0.60147f,  -0.90118f, -4.32718f, -0.58154f, -0.02181f, -0.40734f,
+  -0.11409f, -0.79470f, 0.69697f,  -0.16588f, -0.16090f, -0.21236f, -0.52776f,
+  -0.64455f, 0.09173f,  0.80766f,  0.76097f,  0.20295f,  -0.93467f, -0.43509f,
+  0.59659f,  0.07788f,  -3.79459f, 0.16268f,  0.47343f,  0.05106f,  -0.24880f,
+  1.18941f,  0.10346f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_layer0[16] = {
+  0.75780f,  0.25628f,  0.19911f,  -0.41384f, 1.33909f,  0.31498f,
+  -1.37171f, -1.09561f, -0.44056f, 0.49001f,  -0.65804f, -1.96031f,
+  0.64806f,  -0.52520f, 1.38838f,  0.15519f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_layer1[64] = {
+  -0.63856f, -2.02670f, -0.92947f, 0.00216f,  1.47710f,  -2.01099f, -2.11289f,
+  -0.92288f, 0.19296f,  1.37866f,  -0.85975f, -0.78624f, -2.10392f, 0.13976f,
+  1.06968f,  -2.04120f, 0.57991f,  -1.84941f, -0.81512f, -2.08254f, -0.47334f,
+  0.12256f,  -1.39594f, -1.02829f, 0.06134f,  2.25646f,  -1.25196f, -2.65317f,
+  -1.94473f, 0.10989f,  0.55446f,  -1.76557f, 0.33455f,  -1.85556f, -3.01878f,
+  -0.25100f, 1.65520f,  -1.61409f, 1.16336f,  -1.15560f, 0.13631f,  1.50733f,
+  -1.07538f, -0.91200f, -1.93132f, 0.09271f,  0.24425f,  -1.80655f, -0.01138f,
+  -1.36421f, -0.62970f, -0.84766f, -0.34714f, -0.50531f, 1.91005f,  -1.60316f,
+  -0.02495f, 1.04938f,  0.28411f,  -0.79809f, -1.48232f, 0.00766f,  0.94016f,
+  -1.11974f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_layer1[4] = {
+  0.53574f,
+  1.57736f,
+  -0.13698f,
+  2.64613f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8 = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_8x8_layer0,
+      av1_tx_type_nn_weights_8x8_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_8x8_layer0,
+      av1_tx_type_nn_bias_8x8_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx type model for 8x16 block.
+static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = {
+  1.36274f,  1.37313f,  1.26859f,  1.26459f,  1.37979f,  1.47217f,  1.29710f,
+  0.15765f,  0.31552f,  -0.05727f, 0.25562f,  0.47925f,  -0.32913f, -0.55757f,
+  -0.98010f, 0.08568f,  -0.62754f, 0.12834f,  -0.03717f, 0.06286f,  0.26159f,
+  0.26023f,  -0.62605f, 1.34500f,  1.47720f,  0.47937f,  0.84793f,  0.87866f,
+  0.81260f,  0.74761f,  0.84217f,  0.53321f,  -0.78232f, 0.35321f,  0.41240f,
+  0.45002f,  0.88973f,  0.51055f,  0.91115f,  -0.45512f, -2.37418f, -0.25205f,
+  0.05893f,  -0.15685f, -0.25156f, -0.17104f, -0.12230f, 0.17802f,  0.18796f,
+  -0.05797f, 0.26484f,  1.23515f,  1.70393f,  0.46022f,  -0.14354f, 0.08501f,
+  -0.84625f, -0.42578f, -0.29345f, -0.51797f, -0.56515f, -0.47465f, 0.23970f,
+  1.59912f,  -0.40332f, -0.33209f, 0.37274f,  0.36831f,  -0.00248f, -0.24295f,
+  0.29539f,  -0.76136f, -0.22531f, 0.12371f,  0.37889f,  1.02639f,  1.73330f,
+  1.09686f,  1.04111f,  0.69006f,  -1.27157f, 0.94013f,  0.61621f,  0.62274f,
+  0.48759f,  0.55672f,  0.62597f,  -0.38846f, 1.72124f,  0.08214f,  -0.06650f,
+  0.32617f,  0.10958f,  0.24650f,  0.10740f,  1.16861f,  0.50701f,  0.45383f,
+  0.90016f,  -0.00695f, -0.11986f, -0.07834f, 0.20346f,  0.25863f,  -0.40889f,
+  -0.11344f, -0.79108f, 0.76259f,  -0.14562f, -0.15459f, -0.20954f, -0.51306f,
+  0.02743f,  -0.82456f, -0.00861f, -0.27274f, 0.28762f,  0.07282f,  0.26410f,
+  0.53413f,  -0.22208f, -0.85031f, -1.39129f, -0.74519f, 0.09771f,  0.80313f,
+  1.07698f,  0.02531f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = {
+  -1.30434f, -1.19259f, -0.43467f, -0.85386f, 0.96584f,  0.29276f,
+  -0.41990f, -0.96924f, -0.30933f, 0.95264f,  -0.25330f, -1.19584f,
+  1.46564f,  -0.42959f, 1.55720f,  0.18479f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = {
+  -1.72959f, -0.21670f, 0.10616f,  -0.02006f, 0.15084f,  -0.85303f, -0.27535f,
+  0.58704f,  0.23683f,  1.19743f,  0.77971f,  0.49874f,  0.19508f,  0.19641f,
+  1.47895f,  -0.52173f, -0.56746f, -0.50761f, 0.15864f,  -0.95168f, 0.48103f,
+  0.91904f,  -0.11700f, 0.62863f,  0.06526f,  1.63803f,  -0.72325f, -1.80449f,
+  0.66373f,  0.12831f,  0.27139f,  -0.26346f, 1.50852f,  0.25079f,  -0.54255f,
+  1.78815f,  1.39691f,  -0.44989f, -0.18511f, -1.52903f, 0.13983f,  1.06906f,
+  -0.30184f, 0.37566f,  0.46209f,  0.10440f,  0.64695f,  -0.34002f, 1.96990f,
+  0.21189f,  -0.91248f, -0.11263f, 0.26708f,  1.27405f,  1.89776f,  0.02081f,
+  -0.06977f, -0.02584f, 0.47733f,  0.27117f,  1.33315f,  -0.09175f, 0.48747f,
+  1.16772f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = {
+  1.25783f,
+  1.19452f,
+  0.69964f,
+  2.41982f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_8x16_hor_layer0,
+      av1_tx_type_nn_weights_8x16_hor_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_8x16_hor_layer0,
+      av1_tx_type_nn_bias_8x16_hor_layer1,
+  },
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = {
+  0.90888f,  0.86305f,  0.81674f,  0.75352f,  1.07834f,  0.99048f,  0.96355f,
+  0.13836f,  -0.51334f, 0.19906f,  1.84608f,  0.67828f,  0.45876f,  0.08325f,
+  0.28190f,  -0.01958f, -1.96553f, 0.27837f,  -0.05929f, 0.13491f,  0.21036f,
+  0.05797f,  -0.01373f, 0.73765f,  1.39603f,  -0.53767f, 0.10362f,  0.03420f,
+  0.41909f,  0.09510f,  0.32284f,  0.83860f,  0.13954f,  0.48434f,  1.47762f,
+  0.45891f,  0.23613f,  0.13013f,  0.82097f,  -0.03251f, -1.89757f, 0.21589f,
+  -0.10370f, 0.02530f,  -0.25659f, 0.01466f,  -0.23661f, 0.22783f,  0.92100f,
+  1.02915f,  1.20358f,  1.17251f,  0.97749f,  1.04696f,  0.91333f,  0.54576f,
+  -0.52792f, 0.02217f,  0.25652f,  0.31405f,  -0.18398f, 0.04572f,  -0.81359f,
+  1.82883f,  -0.40047f, -0.33056f, 0.35255f,  0.34448f,  -0.00339f, -0.23857f,
+  0.28925f,  -0.77175f, -0.24325f, -0.21420f, 1.11451f,  1.39553f,  0.51573f,
+  0.05476f,  1.13791f,  0.94959f,  -0.35710f, 0.67467f,  0.16722f,  0.61213f,
+  0.07683f,  -0.20613f, 0.13440f,  -0.72131f, -0.15418f, -0.17688f, -0.16510f,
+  -0.19226f, 0.09270f,  -2.43559f, -0.12669f, 0.05074f,  0.30414f,  0.00927f,
+  0.60630f,  0.00801f,  -1.07310f, -0.06227f, 2.10607f,  0.02382f,  -0.39891f,
+  -0.09149f, -0.78596f, 0.83966f,  -0.14802f, -0.14083f, -0.20831f, -0.55136f,
+  0.08566f,  -0.00647f, 0.07044f,  0.53408f,  0.85720f,  -0.07393f, 0.24476f,
+  0.43767f,  0.30519f,  -1.89430f, 0.23252f,  1.63790f,  0.17316f,  -0.03903f,
+  0.25269f,  0.01562f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = {
+  -0.83370f, -0.20704f, -0.60437f, -0.81664f, 1.16998f,  0.16745f,
+  -1.34680f, -1.07083f, -0.34649f, 0.65598f,  -0.56278f, 0.22660f,
+  -0.25956f, -0.29608f, 1.24359f,  -0.09167f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = {
+  -0.71147f, -0.63964f, -0.69220f, 0.22326f,  0.67191f,  -0.58894f, -0.98464f,
+  0.23583f,  0.22824f,  1.39838f,  0.09920f,  -0.59411f, -0.67101f, 0.19088f,
+  0.83025f,  -0.66991f, -0.42889f, -0.49969f, 1.39532f,  -1.02000f, 0.62101f,
+  0.57175f,  -0.83226f, 0.01551f,  0.05604f,  1.23028f,  0.02030f,  -0.55995f,
+  -0.42349f, 0.15375f,  0.52132f,  -0.52421f, 0.89586f,  -0.73778f, -0.10911f,
+  0.22447f,  1.16858f,  -0.48169f, 1.73890f,  -0.69860f, 0.12504f,  1.10492f,
+  0.04391f,  -0.85670f, -0.49257f, 0.09616f,  0.76518f,  -0.44854f, 1.50938f,
+  0.62246f,  -0.40366f, -0.11182f, -0.01680f, 0.59724f,  1.32170f,  -1.09061f,
+  -0.04278f, -0.02449f, 0.25024f,  1.26239f,  0.42345f,  -0.10031f, 0.80871f,
+  0.44198f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = {
+  0.68329f,
+  1.33555f,
+  0.25943f,
+  3.23439f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_8x16_ver_layer0,
+      av1_tx_type_nn_weights_8x16_ver_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_8x16_ver_layer0,
+      av1_tx_type_nn_bias_8x16_ver_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx type model for 16x8 block.
+static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = {
+  0.89821f,  0.90804f,  1.13052f,  0.74855f,  1.02053f,  0.91260f,  0.97102f,
+  0.16808f,  -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f,
+  -0.12236f, -0.03158f, -1.43561f, 0.07794f,  0.16586f,  0.09731f,  0.12967f,
+  0.09725f,  -0.16826f, 1.26640f,  0.88004f,  0.27312f,  -0.07993f, 0.33640f,
+  0.11732f,  0.33384f,  0.97066f,  -0.61744f, -0.48545f, 0.44622f,  0.73744f,
+  0.32262f,  -0.05713f, 0.42280f,  1.10378f,  0.18540f,  -2.07906f, 0.11443f,
+  0.37877f,  0.24136f,  -0.12524f, -0.12434f, 0.02116f,  0.11716f,  1.28267f,
+  1.01508f,  1.26184f,  1.22545f,  1.29582f,  1.18855f,  1.27564f,  0.42001f,
+  -0.41481f, 0.06725f,  -0.13133f, -0.24801f, 0.16515f,  0.16228f,  0.35197f,
+  0.53610f,  -0.39805f, -0.32584f, 0.40096f,  0.38621f,  -0.00030f, -0.23434f,
+  0.29149f,  -0.76542f, 0.04996f,  -0.30036f, 1.48687f,  0.90852f,  -0.03083f,
+  -0.15953f, 1.19259f,  0.87690f,  -1.08977f, 0.78757f,  0.81149f,  0.54089f,
+  0.35400f,  0.37919f,  0.84997f,  -0.20449f, 0.39601f,  -0.37596f, 0.64748f,
+  0.26021f,  0.37354f,  0.23593f,  0.16335f,  1.70681f,  0.31800f,  -0.00964f,
+  0.82687f,  -0.78372f, -1.47438f, 0.32410f,  1.37436f,  0.07476f,  -0.40574f,
+  -0.10353f, -0.79300f, 0.74381f,  -0.15601f, -0.14380f, -0.20961f, -0.52697f,
+  0.04669f,  -0.00870f, 0.05624f,  -0.09036f, 0.25701f,  0.30336f,  0.24199f,
+  0.45579f,  0.66330f,  -1.81834f, 0.74965f,  1.22747f,  0.25072f,  0.25100f,
+  0.43289f,  -0.00362f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = {
+  -0.87643f, 0.36754f,  -0.86409f, 1.37761f,  1.22688f,  0.09074f,
+  -1.47139f, -1.06100f, -0.24087f, 1.10382f,  -0.32837f, -1.39592f,
+  -0.14741f, -0.43954f, 1.72137f,  -0.21704f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = {
+  -0.81860f, -0.80745f, -0.43612f, 0.58656f,  0.37455f, -0.56519f, -1.71536f,
+  0.23278f,  0.23951f,  1.09610f,  0.49986f,  0.43375f, -0.53182f, 0.17376f,
+  1.05626f,  -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f,  1.14295f,
+  0.45571f,  -0.52504f, -0.00303f, 0.06044f,  0.66119f, -0.60340f, -1.14344f,
+  -0.28045f, 0.12742f,  0.61484f,  -0.41016f, 1.36102f, -0.86969f, -0.52728f,
+  1.01725f,  0.67083f,  -0.10138f, 1.36406f,  0.34066f, 0.12498f,  0.86595f,
+  -0.39636f, -0.27888f, -0.40244f, 0.09847f,  0.81178f, -0.45313f, 1.39127f,
+  0.99865f,  -0.57908f, 0.55072f,  0.49638f,  1.11524f, 1.85504f,  -0.28316f,
+  -0.05195f, -0.23284f, 0.26461f,  -1.28120f, 0.60707f, -0.06110f, 0.74085f,
+  0.63304f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = {
+  0.71765f,
+  1.40400f,
+  0.32221f,
+  3.07234f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_16x8_hor_layer0,
+      av1_tx_type_nn_weights_16x8_hor_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_16x8_hor_layer0,
+      av1_tx_type_nn_bias_16x8_hor_layer1,
+  },
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = {
+  1.20497f,  1.23691f,  1.23738f,  1.07773f,  1.15264f,  1.31959f,  1.15365f,
+  0.17179f,  0.68612f,  0.55636f,  0.57145f,  0.67022f,  0.19636f,  -1.27420f,
+  -1.36428f, -0.16706f, -1.20934f, -0.87794f, -0.97146f, -0.74722f, -1.14493f,
+  -1.02689f, -0.88153f, 0.83857f,  1.53355f,  0.13601f,  0.35451f,  0.53750f,
+  0.62381f,  0.32438f,  0.59405f,  0.33090f,  -1.52948f, -0.46094f, 0.42634f,
+  0.48763f,  0.30707f,  0.52553f,  0.71427f,  -0.31287f, -2.37106f, -0.18756f,
+  0.16561f,  -0.00431f, -0.13747f, -0.09336f, -0.16511f, 0.13454f,  0.45010f,
+  -0.00317f, -0.06403f, 0.95442f,  1.59636f,  0.30602f,  -0.05515f, 0.05467f,
+  -0.21758f, -0.19192f, -0.17935f, -0.00545f, 0.35409f,  0.26141f,  -0.32174f,
+  1.78129f,  -0.40161f, -0.33158f, 0.38084f,  0.38081f,  0.01053f,  -0.23567f,
+  0.29239f,  -0.76159f, -0.19373f, 0.13649f,  0.66949f,  1.19733f,  1.92557f,
+  1.16691f,  0.94955f,  0.62324f,  -0.85434f, -0.07699f, 0.87683f,  0.95911f,
+  0.86106f,  0.57959f,  0.40146f,  -0.35851f, 1.55427f,  0.15349f,  -0.01582f,
+  0.32517f,  0.03784f,  0.15916f,  0.09024f,  1.43187f,  0.56160f,  0.11521f,
+  0.52476f,  -0.26107f, -0.38167f, -0.31596f, 0.31304f,  -0.65366f, -0.40680f,
+  -0.11082f, -0.78585f, 0.77906f,  -0.13322f, -0.13747f, -0.21001f, -0.53204f,
+  -0.06752f, -0.84741f, -0.53442f, -0.16284f, 0.54027f,  0.13586f,  -0.42001f,
+  0.85388f,  0.08300f,  -0.89325f, -1.73681f, -0.70473f, 0.23151f,  0.69549f,
+  0.72124f,  0.12769f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = {
+  -1.15644f, -0.31062f, 0.20697f,  -0.60304f, -1.19498f, 0.21451f,
+  -0.42825f, -0.71800f, -0.25816f, 1.47408f,  -0.24423f, -1.45773f,
+  -0.55834f, -0.36938f, 1.56759f,  0.07238f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = {
+  -1.45227f, -0.67141f, 0.75237f,  0.32681f,  -0.70528f, -0.76730f, -0.49777f,
+  0.02418f,  0.25096f,  1.14840f,  0.23548f,  0.48755f,  0.33164f,  0.21050f,
+  1.41651f,  -0.28888f, -0.76668f, 0.04439f,  0.67538f,  -1.06438f, 0.68128f,
+  0.95824f,  0.08530f,  -0.03635f, 0.06820f,  1.38621f,  -0.50424f, -1.72992f,
+  -0.20949f, 0.13400f,  0.93366f,  -0.05324f, 1.41593f,  -0.75119f, -1.80912f,
+  1.05440f,  0.62580f,  -0.30867f, -0.07025f, -0.34654f, 0.13621f,  1.74426f,
+  -0.22417f, 0.47031f,  -0.08142f, 0.10151f,  0.42498f,  0.06635f,  1.50623f,
+  1.04130f,  0.85107f,  0.23382f,  0.69800f,  1.10856f,  1.18767f,  -0.69395f,
+  -0.07985f, 0.50412f,  0.46019f,  0.49214f,  0.44219f,  -0.09502f, 0.75745f,
+  0.99208f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = {
+  0.68774f,
+  0.88572f,
+  0.77462f,
+  3.05667f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_16x8_ver_layer0,
+      av1_tx_type_nn_weights_16x8_ver_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_16x8_ver_layer0,
+      av1_tx_type_nn_bias_16x8_ver_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx type model for 16x16 block.
+static const float av1_tx_type_nn_weights_16x16_layer0[128] = {
+  1.26592f,  1.36313f,  1.30956f,  1.29926f,  1.48816f,  1.68851f,  1.32000f,
+  0.13321f,  -0.22477f, -0.88906f, -0.19622f, 1.69605f,  1.22180f,  -1.57771f,
+  -1.15765f, 0.05710f,  -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f,
+  -0.77952f, -1.15723f, 1.17809f,  1.35602f,  -0.05243f, -0.37596f, 0.26108f,
+  0.17611f,  -0.10323f, 0.77279f,  -0.48911f, -0.79308f, 0.55112f,  0.43918f,
+  0.27872f,  0.28714f,  0.45830f,  1.05689f,  0.03705f,  -2.49975f, -0.01940f,
+  0.05709f,  0.07942f,  -0.13290f, -0.10359f, 0.00143f,  0.37303f,  0.96470f,
+  0.53293f,  1.14459f,  0.89185f,  0.43378f,  0.47764f,  0.90924f,  0.15279f,
+  -0.15361f, 0.02949f,  0.42240f,  0.68143f,  0.89588f,  0.73754f,  0.10974f,
+  1.57755f,  -0.39870f, -0.32914f, 0.35638f,  0.34991f,  -0.00003f, -0.23373f,
+  0.29630f,  -0.76699f, -0.01356f, 0.04234f,  0.84253f,  1.92078f,  0.93160f,
+  0.71993f,  0.71604f,  0.76455f,  -1.59782f, 0.32332f,  1.11628f,  0.33062f,
+  -0.03728f, -0.05710f, 0.80447f,  -0.14719f, 1.34658f,  -0.05718f, 0.64015f,
+  0.21926f,  0.41653f,  0.12720f,  0.54092f,  1.39411f,  1.81819f,  -0.24513f,
+  0.00955f,  0.38011f,  -0.57787f, -0.41759f, 0.68834f,  -0.31783f, -0.40607f,
+  -0.10107f, -0.79374f, 0.75599f,  -0.16282f, -0.14490f, -0.20783f, -0.55019f,
+  -0.13793f, -0.22293f, 0.18305f,  0.12445f,  0.56830f,  0.24567f,  0.09278f,
+  0.70803f,  0.35803f,  -1.52676f, -0.89624f, 0.77665f,  0.19877f,  0.77175f,
+  0.50355f,  0.08592f,
+};
+
+static const float av1_tx_type_nn_bias_16x16_layer0[16] = {
+  -1.31834f, 0.14346f,  -0.10062f, 0.84489f,  0.95617f,  -0.06720f,
+  -0.68502f, -0.91442f, -0.31932f, 0.25276f,  -0.15138f, -1.57661f,
+  -0.14062f, -0.42120f, 0.94573f,  -0.09287f,
+};
+
+static const float av1_tx_type_nn_weights_16x16_layer1[64] = {
+  -1.80333f, -1.06353f, 0.55139f,  0.74644f,  0.13747f, -0.93018f, -0.10286f,
+  0.67133f,  0.24460f,  1.44583f,  0.02173f,  0.26037f, -0.73687f, 0.19566f,
+  0.61846f,  -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f,
+  0.96224f,  -0.59139f, 0.03813f,  0.05403f,  1.33427f, -0.54375f, -1.92181f,
+  0.54704f,  0.13608f,  0.22151f,  -0.38076f, 1.18390f, -0.77508f, -1.84283f,
+  1.00894f,  0.62318f,  -0.15296f, 1.27600f,  0.22822f, 0.12751f,  0.93910f,
+  -0.28502f, 0.53912f,  -0.96889f, 0.10182f,  0.81508f, -0.43028f, 2.67386f,
+  0.52204f,  0.49820f,  -0.41711f, 1.05038f,  1.12192f, 0.74349f,  -0.75417f,
+  -0.03718f, -0.35769f, 0.89651f,  0.63236f,  0.54215f, -0.07894f, 0.48274f,
+  1.08829f,
+};
+
+static const float av1_tx_type_nn_bias_16x16_layer1[4] = {
+  0.81986f,
+  1.26865f,
+  0.11118f,
+  2.48404f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x16 = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_16x16_layer0,
+      av1_tx_type_nn_weights_16x16_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_16x16_layer0,
+      av1_tx_type_nn_bias_16x16_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx type model for 16x32 block.
+static const float av1_tx_type_nn_weights_16x32_hor_layer0[128] = {
+  0.89821f,  0.90804f,  1.13052f,  0.74855f,  1.02053f,  0.91260f,  0.97102f,
+  0.16808f,  -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f,
+  -0.12236f, -0.03158f, -1.43561f, 0.07794f,  0.16586f,  0.09731f,  0.12967f,
+  0.09725f,  -0.16826f, 1.26640f,  0.88004f,  0.27312f,  -0.07993f, 0.33640f,
+  0.11732f,  0.33384f,  0.97066f,  -0.61744f, -0.48545f, 0.44622f,  0.73744f,
+  0.32262f,  -0.05713f, 0.42280f,  1.10378f,  0.18540f,  -2.07906f, 0.11443f,
+  0.37877f,  0.24136f,  -0.12524f, -0.12434f, 0.02116f,  0.11716f,  1.28267f,
+  1.01508f,  1.26184f,  1.22545f,  1.29582f,  1.18855f,  1.27564f,  0.42001f,
+  -0.41481f, 0.06725f,  -0.13133f, -0.24801f, 0.16515f,  0.16228f,  0.35197f,
+  0.53610f,  -0.39805f, -0.32584f, 0.40096f,  0.38621f,  -0.00030f, -0.23434f,
+  0.29149f,  -0.76542f, 0.04996f,  -0.30036f, 1.48687f,  0.90852f,  -0.03083f,
+  -0.15953f, 1.19259f,  0.87690f,  -1.08977f, 0.78757f,  0.81149f,  0.54089f,
+  0.35400f,  0.37919f,  0.84997f,  -0.20449f, 0.39601f,  -0.37596f, 0.64748f,
+  0.26021f,  0.37354f,  0.23593f,  0.16335f,  1.70681f,  0.31800f,  -0.00964f,
+  0.82687f,  -0.78372f, -1.47438f, 0.32410f,  1.37436f,  0.07476f,  -0.40574f,
+  -0.10353f, -0.79300f, 0.74381f,  -0.15601f, -0.14380f, -0.20961f, -0.52697f,
+  0.04669f,  -0.00870f, 0.05624f,  -0.09036f, 0.25701f,  0.30336f,  0.24199f,
+  0.45579f,  0.66330f,  -1.81834f, 0.74965f,  1.22747f,  0.25072f,  0.25100f,
+  0.43289f,  -0.00362f,
+};
+
+static const float av1_tx_type_nn_bias_16x32_hor_layer0[16] = {
+  -0.87643f, 0.36754f,  -0.86409f, 1.37761f,  1.22688f,  0.09074f,
+  -1.47139f, -1.06100f, -0.24087f, 1.10382f,  -0.32837f, -1.39592f,
+  -0.14741f, -0.43954f, 1.72137f,  -0.21704f,
+};
+
+static const float av1_tx_type_nn_weights_16x32_hor_layer1[64] = {
+  -0.81860f, -0.80745f, -0.43612f, 0.58656f,  0.37455f, -0.56519f, -1.71536f,
+  0.23278f,  0.23951f,  1.09610f,  0.49986f,  0.43375f, -0.53182f, 0.17376f,
+  1.05626f,  -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f,  1.14295f,
+  0.45571f,  -0.52504f, -0.00303f, 0.06044f,  0.66119f, -0.60340f, -1.14344f,
+  -0.28045f, 0.12742f,  0.61484f,  -0.41016f, 1.36102f, -0.86969f, -0.52728f,
+  1.01725f,  0.67083f,  -0.10138f, 1.36406f,  0.34066f, 0.12498f,  0.86595f,
+  -0.39636f, -0.27888f, -0.40244f, 0.09847f,  0.81178f, -0.45313f, 1.39127f,
+  0.99865f,  -0.57908f, 0.55072f,  0.49638f,  1.11524f, 1.85504f,  -0.28316f,
+  -0.05195f, -0.23284f, 0.26461f,  -1.28120f, 0.60707f, -0.06110f, 0.74085f,
+  0.63304f,
+};
+
+static const float av1_tx_type_nn_bias_16x32_hor_layer1[4] = {
+  0.71765f,
+  1.40400f,
+  0.32221f,
+  3.07234f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x32_hor = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_16x32_hor_layer0,
+      av1_tx_type_nn_weights_16x32_hor_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_16x32_hor_layer0,
+      av1_tx_type_nn_bias_16x32_hor_layer1,
+  },
+};
+
+static const float av1_tx_type_nn_weights_16x32_ver_layer0[512] = {
+  -0.01219f, 0.51494f,  0.25450f,  0.45788f,  -0.87277f, 0.32954f,  -0.04851f,
+  -0.24321f, -0.40000f, 0.21915f,  0.14108f,  0.98268f,  0.18989f,  0.54298f,
+  0.36349f,  0.38931f,  1.08124f,  0.87199f,  1.03553f,  1.14777f,  1.04254f,
+  1.11336f,  0.92198f,  0.84715f,  1.89363f,  1.21587f,  0.72377f,  1.25097f,
+  0.84231f,  0.95529f,  1.12346f,  0.19113f,  -0.04559f, 0.56859f,  0.59747f,
+  0.60176f,  0.82465f,  0.59009f,  0.67240f,  1.58674f,  -0.92951f, -0.23449f,
+  0.11923f,  -0.19151f, -0.15914f, 0.03146f,  -0.16541f, 0.17181f,  -0.21834f,
+  0.21906f,  0.96708f,  0.36085f,  -0.42380f, -2.25681f, -0.48812f, 0.72875f,
+  0.06585f,  0.18818f,  -0.02109f, -0.10996f, 0.00187f,  -0.02078f, 0.04484f,
+  -0.07171f, 0.94773f,  -0.33466f, 0.28484f,  0.14791f,  0.30274f,  0.13377f,
+  0.40970f,  0.45133f,  1.69265f,  -0.36422f, -0.15889f, 0.07670f,  0.44675f,
+  -0.28665f, -0.07097f, 1.03803f,  -0.83274f, -0.24571f, 0.08039f,  -0.23790f,
+  -0.23276f, -0.28031f, 0.26451f,  -0.18513f, -2.23336f, -0.62073f, 0.32495f,
+  -0.67644f, -0.08559f, -0.36382f, -0.24515f, -0.01899f, 0.09012f,  0.19723f,
+  0.04017f,  0.31624f,  0.58369f,  0.30411f,  -0.81165f, -2.58541f, -0.20491f,
+  0.68089f,  -0.14799f, 0.13925f,  0.12867f,  0.15229f,  0.06887f,  -0.03784f,
+  0.02288f,  -0.28712f, 0.14107f,  0.29485f,  -0.11662f, 0.25239f,  0.30311f,
+  -0.07377f, -0.10962f, 0.59856f,  0.47967f,  0.01847f,  -0.27889f, 0.46786f,
+  0.18118f,  0.09355f,  -2.10076f, 0.38823f,  0.28202f,  0.29104f,  0.86977f,
+  0.52377f,  0.21161f,  0.72888f,  -0.00952f, 0.15982f,  -0.14651f, 0.28763f,
+  -0.14155f, 0.00093f,  0.08351f,  0.34685f,  -0.22066f, 0.20378f,  0.25416f,
+  0.03423f,  -0.11068f, -0.41612f, 0.56913f,  -0.06697f, -0.12585f, -0.21033f,
+  -0.14513f, -0.04477f, -0.35778f, 0.03437f,  0.06956f,  -0.25356f, -1.46010f,
+  -0.08142f, 0.11926f,  -0.63551f, -0.13882f, 0.34164f,  0.10821f,  1.07323f,
+  -0.62435f, -0.27116f, 0.25971f,  0.11952f,  -0.39480f, -0.05474f, -0.12582f,
+  0.28289f,  0.13723f,  0.58369f,  0.41865f,  0.28574f,  1.01357f,  0.46661f,
+  0.61717f,  0.85708f,  -0.03930f, -0.38013f, -0.33888f, -0.20561f, -0.19087f,
+  -0.01041f, 0.12119f,  -0.20786f, 0.55915f,  0.67511f,  0.55554f,  0.56540f,
+  0.76647f,  0.54766f,  0.45166f,  0.61384f,  0.95407f,  -0.06811f, -0.62132f,
+  0.12713f,  0.63713f,  2.04090f,  1.17054f,  0.00469f,  -0.93692f, -0.24136f,
+  -0.04281f, -0.15787f, 0.37956f,  -0.09174f, -0.72494f, 0.55285f,  -1.40996f,
+  -0.54077f, 0.38445f,  -0.08258f, 0.64259f,  -0.54058f, -0.49865f, 1.41371f,
+  0.89014f,  0.78788f,  0.37919f,  0.87447f,  -0.00760f, -0.00947f, 0.16323f,
+  -0.36632f, -1.38115f, -0.24619f, 0.40490f,  -0.08871f, -0.25365f, -0.60842f,
+  0.11128f,  0.18658f,  -0.86001f, -0.28271f, 0.39572f,  -0.29930f, -0.10110f,
+  0.33706f,  0.21731f,  0.15383f,  -0.01707f, 0.02812f,  0.31192f,  0.39742f,
+  0.38260f,  -0.48263f, 0.57385f,  0.53239f,  -0.60013f, -0.63211f, -0.45140f,
+  -0.73520f, -0.95260f, -0.70633f, -0.96190f, 0.01747f,  -0.05195f, -0.07138f,
+  -1.09535f, -0.63548f, -1.55700f, -0.35721f, -0.18923f, 0.77568f,  0.09419f,
+  0.36919f,  -0.32761f, -0.06597f, -0.38988f, -0.43674f, -0.24284f, 0.36906f,
+  0.28414f,  0.19273f,  -0.68516f, 0.09514f,  -0.45381f, 0.19917f,  -0.32377f,
+  1.32549f,  0.08244f,  -0.64405f, 0.13195f,  2.85307f,  0.47631f,  -0.33408f,
+  0.04168f,  0.18585f,  -0.18029f, 0.07986f,  -0.08816f, -0.00703f, -0.01515f,
+  -0.13164f, 0.00571f,  0.05676f,  1.51425f,  0.73360f,  0.43486f,  -0.08223f,
+  -0.06183f, -0.57098f, -0.29948f, 0.05945f,  0.19238f,  -0.47980f, -0.35902f,
+  -0.19931f, 0.43443f,  0.67436f,  0.78573f,  0.25703f,  1.01863f,  0.99047f,
+  0.95228f,  1.02429f,  1.19264f,  0.29935f,  -0.26583f, -0.98749f, -0.46167f,
+  -0.29727f, -0.10515f, -0.39790f, -0.59321f, -0.61925f, -0.95452f, 0.04292f,
+  -0.48273f, -0.91195f, -0.45971f, -0.46355f, -0.88319f, -0.51712f, -0.47682f,
+  -0.86110f, -0.59178f, -0.57163f, -0.94946f, 0.19627f,  -0.18699f, 0.11037f,
+  1.39110f,  0.05715f,  3.00762f,  1.52243f,  0.25028f,  0.12779f,  -0.12871f,
+  0.04764f,  0.08288f,  -0.16572f, -0.06580f, 0.05845f,  -0.01474f, 0.04886f,
+  -0.10000f, 0.12911f,  -0.01416f, -0.12472f, 0.14358f,  0.16554f,  0.08853f,
+  0.13418f,  -0.05408f, -0.13871f, -0.00049f, 0.20725f,  -0.05603f, 0.27885f,
+  -0.14277f, 0.29653f,  -0.24739f, 0.10101f,  -0.17068f, -2.43802f, 0.41834f,
+  0.49784f,  0.34949f,  0.98487f,  0.16792f,  1.07355f,  0.32546f,  1.32377f,
+  -0.08584f, 0.85214f,  -0.05721f, 0.90307f,  0.20167f,  0.52664f,  -0.14478f,
+  0.64997f,  0.06846f,  0.32475f,  0.64453f,  0.70143f,  -0.03091f, -0.24958f,
+  -0.39021f, -0.57693f, -0.18319f, 0.11793f,  -0.05948f, 0.36670f,  -0.27932f,
+  0.14800f,  -0.55459f, -0.89673f, 0.65922f,  0.54308f,  -0.16731f, -0.59731f,
+  -0.20705f, -0.18183f, -0.05645f, -0.06829f, -0.40210f, -0.27955f, 0.28054f,
+  0.57665f,  0.14171f,  0.54693f,  -0.22144f, -0.59664f, 0.13295f,  0.07057f,
+  -0.19698f, 0.03328f,  -0.09687f, -0.32390f, -0.11506f, -0.40406f, -0.11473f,
+  0.10399f,  -0.29824f, 0.16028f,  0.00053f,  0.22699f,  0.04203f,  -0.43880f,
+  -0.12654f, 0.12172f,  0.21087f,  -0.46350f, -0.22081f, -0.06173f, -0.23287f,
+  0.90314f,  0.04466f,  -0.06149f, 0.32682f,  0.16609f,  -0.58991f, -0.03786f,
+  -0.41329f, 0.02632f,  0.23411f,  0.25344f,  0.16468f,  0.31007f,  0.21845f,
+  0.32462f,  0.33945f,  0.11527f,  -0.35926f, -0.18584f, 0.29340f,  0.78199f,
+  2.39287f,  0.53838f,  -1.55085f, 0.02238f,  -0.26153f, -0.42498f, -0.02460f,
+  0.19261f,  -0.10870f, -0.08453f, -0.39561f, 0.08600f,  0.36310f,  0.58439f,
+  -0.59526f, 0.13104f,  -0.06703f, -0.17529f, -0.41431f, -0.23121f, -0.32394f,
+  -0.33324f, -0.21405f, -0.41702f, -0.29236f, -0.31766f, -0.33512f, -0.22679f,
+  -0.13680f, -0.00118f, -1.81744f, -2.34798f, -1.08048f, -0.29883f, -0.29123f,
+  -0.01752f,
+};
+
+static const float av1_tx_type_nn_bias_16x32_ver_layer0[32] = {
+  1.02458f,  -1.02185f, -0.18978f, 0.05981f,  -0.94931f, 0.34544f,  0.04415f,
+  -0.60036f, -0.11368f, -0.14154f, 1.23438f,  0.51640f,  -0.57587f, -0.91380f,
+  0.95720f,  0.68298f,  -0.06353f, -2.14960f, -0.11080f, 0.79380f,  -0.94199f,
+  0.43040f,  0.01358f,  0.07201f,  -0.49689f, -0.14839f, -0.80132f, -0.13925f,
+  -0.11834f, -0.24998f, -0.33976f, 0.35497f,
+};
+
+static const float av1_tx_type_nn_weights_16x32_ver_layer1[128] = {
+  0.87367f,  -1.06469f, -0.50829f, -0.70540f, 1.14596f,  -1.12346f, -0.94467f,
+  0.01380f,  -0.18911f, 0.07961f,  -0.18626f, 0.61902f,  -0.64423f, 1.21545f,
+  1.01149f,  0.26309f,  1.50380f,  1.93940f,  -0.64064f, 1.03987f,  -1.88000f,
+  -0.44574f, -1.53303f, 1.36307f,  1.00292f,  0.37031f,  0.21594f,  0.16758f,
+  0.02592f,  -0.77431f, -0.31797f, -1.53826f, 1.14013f,  -1.21957f, 0.04571f,
+  -0.22168f, 0.32299f,  0.25949f,  -0.13306f, 0.17850f,  0.92494f,  0.19999f,
+  0.07494f,  -0.03362f, -0.53453f, 1.02970f,  -0.22947f, 0.73964f,  1.08445f,
+  0.16855f,  -0.02686f, 0.25254f,  0.05952f,  0.02194f,  0.05649f,  0.39195f,
+  0.14139f,  0.53843f,  -0.06959f, -0.06993f, -0.14151f, -0.53147f, 0.17481f,
+  -1.21977f, 0.62932f,  1.07173f,  0.24049f,  -0.51574f, 0.97492f,  -0.28169f,
+  -0.15406f, -0.05441f, -0.25415f, 0.16583f,  0.43674f,  -0.00593f, -0.09277f,
+  0.61402f,  1.35562f,  -0.03926f, 0.18967f,  -0.29548f, -0.55509f, 0.23661f,
+  0.05023f,  0.36226f,  -0.83314f, 0.39357f,  0.19943f,  -0.63431f, -0.03847f,
+  0.12213f,  0.62024f,  -0.11704f, -0.22483f, 0.96624f,  0.18518f,  0.09181f,
+  -0.63068f, 0.66797f,  0.74107f,  0.40624f,  0.70636f,  -0.06921f, 0.34175f,
+  -0.15513f, 2.07844f,  0.22126f,  0.52919f,  0.26793f,  -0.50018f, 1.10549f,
+  0.10970f,  0.05831f,  0.82842f,  -1.22975f, 1.78377f,  0.92679f,  2.01480f,
+  -1.19011f, -0.53381f, 0.38533f,  0.45579f,  -0.10683f, -0.40828f, 0.31398f,
+  0.14978f,  0.91325f,
+};
+
+static const float av1_tx_type_nn_bias_16x32_ver_layer1[4] = {
+  1.03659f,
+  1.80249f,
+  1.25710f,
+  1.32000f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x32_ver = {
+  16,  // num_inputs
+  4,   // num_outputs
+  1,   // num_hidden_layers
+  {
+      32,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_16x32_ver_layer0,
+      av1_tx_type_nn_weights_16x32_ver_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_16x32_ver_layer0,
+      av1_tx_type_nn_bias_16x32_ver_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx type model for 32x16 block.
+static const float av1_tx_type_nn_weights_32x16_hor_layer0[512] = {
+  -0.07289f, 0.30798f,  0.41881f,  0.33434f,  -0.01599f, 0.85307f,  -0.16060f,
+  -0.07922f, -0.04693f, 0.29186f,  0.44117f,  1.02417f,  0.12447f,  0.46321f,
+  0.40060f,  0.50140f,  0.48338f,  0.47298f,  0.36585f,  0.42821f,  0.41289f,
+  0.47534f,  0.42900f,  0.26061f,  0.45887f,  0.38163f,  0.17302f,  1.00888f,
+  1.79910f,  1.36140f,  0.24471f,  0.04557f,  1.10823f,  0.74325f,  0.91210f,
+  0.81387f,  0.98865f,  -0.09874f, 0.55146f,  0.19385f,  -0.50752f, -0.17249f,
+  0.27261f,  -0.02763f, -0.03286f, 0.09122f,  0.07015f,  0.20012f,  0.68983f,
+  -1.25345f, -0.00145f, 0.71567f,  0.54948f,  -0.56154f, -0.28918f, 0.11997f,
+  -0.09907f, 0.09195f,  0.05768f,  0.15558f,  0.11284f,  -0.35195f, -0.08723f,
+  -0.03571f, 0.94031f,  0.63737f,  0.98202f,  0.93826f,  0.87126f,  0.88530f,
+  0.97697f,  0.55283f,  0.58670f,  0.86502f,  0.97008f,  0.99709f,  0.66214f,
+  0.96660f,  0.99890f,  0.31945f,  -1.00301f, 0.13215f,  -0.03950f, 0.21148f,
+  0.05128f,  0.10955f,  0.44839f,  -0.33438f, -2.09773f, 0.13908f,  0.58669f,
+  0.25268f,  -0.24006f, 0.01286f,  -0.05732f, 0.03401f,  -0.06896f, 0.35397f,
+  0.05133f,  -0.21449f, -0.38437f, -0.32326f, -0.38731f, -0.44419f, 0.25968f,
+  -0.29422f, -0.12553f, -0.08896f, -0.16400f, -0.22309f, 0.21380f,  -0.26912f,
+  0.06866f,  -0.25694f, 0.17632f,  0.32032f,  -0.10666f, 0.26278f,  0.31877f,
+  -0.09338f, -0.14289f, 0.54232f,  0.46070f,  0.00059f,  -0.27914f, 0.45177f,
+  0.16274f,  -0.08811f, -0.45791f, 0.53946f,  -0.16794f, 0.16229f,  0.11840f,
+  -0.24435f, 0.26894f,  -0.33180f, -0.47314f, 0.34061f,  -0.13939f, 0.13321f,
+  -0.05208f, -0.18139f, -0.35234f, 1.37298f,  -0.19360f, 0.21728f,  0.26088f,
+  0.04045f,  -0.10763f, -0.40470f, 0.50026f,  -0.06726f, -0.12871f, -0.20963f,
+  -0.14583f, -0.04711f, -0.35988f, 0.03091f,  0.06491f,  -0.31668f, -0.52190f,
+  0.23397f,  -0.13984f, -0.15207f, -0.49977f, 0.51205f,  0.12559f,  -0.03631f,
+  0.33447f,  -0.36684f, 0.17533f,  0.15671f,  -0.00096f, 0.06817f,  0.20922f,
+  0.34006f,  0.71260f,  0.45024f,  0.53033f,  0.15645f,  0.76019f,  0.56870f,
+  0.83066f,  0.63022f,  1.74436f,  -0.24798f, 0.06795f,  -0.00749f, 0.17795f,
+  0.10371f,  0.06527f,  0.41054f,  0.49003f,  0.34630f,  0.02615f,  0.30320f,
+  -0.47133f, -0.49584f, 0.21775f,  0.27530f,  -0.29977f, -0.64269f, 0.52627f,
+  -0.02492f, 0.08077f,  0.40786f,  -0.36015f, -0.70714f, -1.98185f, -0.28187f,
+  0.35018f,  -0.06105f, -0.12710f, 0.06606f,  -0.27805f, 0.44630f,  -0.84731f,
+  -0.26699f, 0.25856f,  0.06194f,  -0.18674f, -0.11560f, -0.43277f, 1.10579f,
+  0.95876f,  0.17415f,  0.56386f,  0.68426f,  0.50180f,  0.24844f,  0.12347f,
+  0.15281f,  -0.19089f, 0.52279f,  0.41860f,  -0.05270f, -0.17029f, -0.03542f,
+  0.10621f,  -0.25088f, 0.24070f,  -0.08951f, 0.29950f,  -0.36720f, 0.02151f,
+  0.20129f,  -0.70066f, -0.23144f, -0.20070f, -0.39262f, -0.01597f, -0.05591f,
+  0.23814f,  -0.25991f, 0.05812f,  0.60554f,  -0.06106f, -0.58326f, 0.28762f,
+  -0.18747f, 0.08232f,  -0.04243f, -0.03293f, 0.14722f,  -0.13017f, -0.67263f,
+  0.38698f,  -0.18207f, -0.11496f, -0.27976f, -0.55345f, 1.42872f,  0.04684f,
+  0.04214f,  0.00030f,  0.02410f,  0.19966f,  -0.04246f, 0.00442f,  0.23121f,
+  0.13364f,  0.21548f,  -0.12748f, -0.14066f, -0.28354f, 0.59937f,  -0.27553f,
+  1.57503f,  -0.01050f, -0.17724f, 0.44110f,  -0.80334f, 0.72064f,  1.00501f,
+  -0.72638f, 0.02774f,  0.48540f,  -0.72016f, -0.27721f, 0.31559f,  0.07322f,
+  0.20279f,  -0.19647f, 0.02352f,  0.12662f,  0.19743f,  0.30543f,  0.25712f,
+  0.44702f,  0.16417f,  0.17888f,  -2.58469f, 0.20555f,  0.57782f,  -0.10892f,
+  0.14527f,  0.82251f,  0.04200f,  0.44626f,  0.10818f,  0.71204f,  0.62903f,
+  0.69178f,  0.73603f,  0.52717f,  0.83020f,  0.48824f,  1.03270f,  -0.00152f,
+  0.07958f,  0.24181f,  -0.78839f, -0.74214f, -0.72998f, -1.58694f, 0.17735f,
+  0.56318f,  0.32580f,  -0.58503f, -0.33673f, -0.00838f, 0.48924f,  0.43362f,
+  0.12750f,  0.00295f,  0.38624f,  0.17037f,  0.00729f,  -0.26256f, -0.41669f,
+  0.36847f,  0.22424f,  1.33334f,  0.18112f,  0.37682f,  0.49173f,  -0.45240f,
+  -0.04857f, -0.35038f, -0.83099f, -0.01988f, 0.03497f,  0.38033f,  0.13685f,
+  0.17597f,  0.28668f,  0.31193f,  -0.43281f, 0.43267f,  -0.50495f, 0.01969f,
+  0.14131f,  -0.09326f, -0.39425f, -0.62048f, -0.09119f, -0.28306f, -0.52671f,
+  -0.38584f, -0.10953f, 0.19669f,  0.34540f,  -0.49941f, 0.04605f,  -0.43535f,
+  0.27519f,  0.03659f,  -0.31961f, 0.13330f,  0.87009f,  0.20101f,  -0.70392f,
+  -0.27883f, 0.33874f,  -0.34308f, 0.67760f,  0.88195f,  0.55752f,  -0.26563f,
+  0.17875f,  0.06964f,  0.87607f,  1.47616f,  0.46747f,  -0.56408f, -0.39352f,
+  -0.16427f, -0.41185f, 0.14187f,  0.19265f,  -0.58613f, 0.56345f,  -0.17729f,
+  -0.11320f, 0.08752f,  -0.01329f, 1.20981f,  0.45170f,  -0.20571f, -0.01150f,
+  0.26476f,  0.13508f,  0.22020f,  -0.42684f, -0.22499f, -1.51212f, 0.86648f,
+  0.21776f,  0.24666f,  0.71339f,  0.42742f,  -0.00952f, 0.14762f,  0.07693f,
+  -0.19599f, 0.03075f,  -0.09703f, -0.32483f, -0.11616f, -0.40461f, -0.11693f,
+  0.10038f,  -0.30038f, 0.14686f,  0.00548f,  0.20350f,  0.00763f,  -0.43756f,
+  -0.01997f, 0.00902f,  0.07470f,  -0.41441f, -0.20605f, 0.07626f,  -0.34973f,
+  0.47455f,  -0.15251f, -0.05325f, 0.04964f,  0.32477f,  -0.54604f, 0.25273f,
+  -0.18461f, -0.30841f, 0.64908f,  0.60752f,  0.64148f,  0.72788f,  0.71232f,
+  0.58597f,  0.73017f,  0.58857f,  0.71908f,  0.59860f,  0.61849f,  0.99398f,
+  0.39572f,  -0.36165f, -1.88646f, 0.14384f,  -0.60541f, -0.21380f, -0.55498f,
+  -0.50960f, -0.08801f, 0.51892f,  0.19126f,  0.57879f,  1.19447f,  0.25673f,
+  -0.21631f, -0.43562f, -0.27497f, -0.02206f, -0.56169f, 0.58952f,  -0.60983f,
+  -0.64088f, -0.69087f, -0.56261f, -0.74089f, -0.65063f, -0.66978f, -0.60836f,
+  -0.92770f, -0.77182f, -1.61031f, -0.70007f, -0.68402f, -0.42242f, -0.66722f,
+  -0.14533f,
+};
+
+static const float av1_tx_type_nn_bias_32x16_hor_layer0[32] = {
+  1.53781f,  -0.49320f, -0.31646f, 0.02826f,  -1.05554f, 0.06559f,  -0.12399f,
+  -0.61671f, -0.28956f, -0.15419f, 0.87189f,  -0.43375f, -1.08477f, -0.66006f,
+  0.36233f,  0.82678f,  -0.92342f, -1.47101f, -0.02937f, -0.16497f, -0.75457f,
+  0.50173f,  -0.07560f, 0.71598f,  1.50795f,  -0.04745f, -0.14008f, -0.18510f,
+  -0.14988f, -0.67044f, 0.79659f,  0.70610f,
+};
+
+static const float av1_tx_type_nn_weights_32x16_hor_layer1[128] = {
+  0.84983f,  -0.62530f, -0.82600f, -0.52563f, -0.11942f, -0.50279f, -0.13425f,
+  -0.02850f, 0.50767f,  0.10252f,  0.24540f,  0.67748f,  -0.43483f, -0.22242f,
+  0.23431f,  0.57287f,  0.69560f,  1.13814f,  -0.47427f, -0.55858f, -1.47072f,
+  0.26587f,  -0.36335f, 0.83060f,  1.01645f,  -0.52895f, -0.11614f, 0.17390f,
+  -0.13664f, -0.83098f, -0.07985f, -1.36820f, 0.47759f,  -0.55612f, 0.46852f,
+  0.07406f,  -0.80467f, 0.23059f,  0.09992f,  -0.06164f, 0.13541f,  0.06135f,
+  0.83605f,  -0.53224f, -0.13867f, 0.93838f,  -0.61290f, 0.27732f,  -0.46688f,
+  -0.41810f, 0.12885f,  0.13619f,  -0.24612f, 0.07215f,  0.98866f,  0.10993f,
+  1.05799f,  -0.27146f, -0.00079f, -0.08585f, 0.08322f,  -0.33809f, 0.67598f,
+  -1.06515f, 1.28866f,  0.61028f,  -0.31704f, -0.59905f, 1.62151f,  0.10969f,
+  0.20671f,  -0.17818f, 0.14170f,  0.19322f,  0.30602f,  0.93111f,  0.19011f,
+  -0.45609f, 0.82506f,  0.32936f,  -0.07858f, -0.27106f, -0.31638f, 0.23299f,
+  0.81491f,  0.32584f,  -0.52093f, -0.32472f, 0.53643f,  -0.42605f, 0.01641f,
+  0.09002f,  0.15832f,  -0.08790f, 0.05511f,  1.00730f,  0.46309f,  0.68166f,
+  -0.18835f, 0.64512f,  -1.00540f, 0.86802f,  0.18981f,  -0.06982f, -0.24514f,
+  -0.08027f, 0.61199f,  -0.20830f, 0.72001f,  0.17477f,  0.06511f,  0.00801f,
+  -0.43590f, 0.37257f,  0.70323f,  0.60233f,  1.62541f,  0.74383f,  -0.22254f,
+  -0.33892f, 0.22881f,  0.62817f,  0.68915f,  -0.06417f, 0.00969f,  1.65869f,
+  0.89060f,  0.75948f,
+};
+
+static const float av1_tx_type_nn_bias_32x16_hor_layer1[4] = {
+  0.95359f,
+  1.56043f,
+  1.06017f,
+  2.54520f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_32x16_hor = {
+  16,  // num_inputs
+  4,   // num_outputs
+  1,   // num_hidden_layers
+  {
+      32,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_32x16_hor_layer0,
+      av1_tx_type_nn_weights_32x16_hor_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_32x16_hor_layer0,
+      av1_tx_type_nn_bias_32x16_hor_layer1,
+  },
+};
+
+static const float av1_tx_type_nn_weights_32x16_ver_layer0[128] = {
+  1.30219f,  1.30548f,  1.33334f,  1.20560f,  1.01572f,  1.38100f,  1.37504f,
+  0.12599f,  -0.96957f, 0.19400f,  0.75734f,  0.11295f,  -0.40447f, -1.53062f,
+  -0.82980f, 0.02168f,  -1.11289f, -0.66861f, -0.83663f, -0.91455f, -0.78618f,
+  -0.87176f, -1.10711f, 0.71207f,  1.49689f,  -0.12715f, 0.29357f,  0.35234f,
+  0.61016f,  0.80708f,  0.83564f,  1.05961f,  -0.99842f, 0.82004f,  0.02638f,
+  0.44606f,  0.32298f,  0.21321f,  0.47290f,  -0.71442f, -2.81050f, -0.02520f,
+  -0.08919f, 0.00369f,  -0.05257f, -0.07011f, -0.16394f, 0.06290f,  0.80086f,
+  0.32349f,  0.47411f,  1.36126f,  1.68162f,  0.91325f,  -0.27495f, 0.00262f,
+  0.06025f,  0.42832f,  0.36965f,  0.38063f,  0.32772f,  0.40914f,  0.44510f,
+  3.02239f,  -1.84077f, 0.49536f,  -0.27340f, -0.10437f, -0.34293f, -0.08047f,
+  -0.29651f, -0.97111f, -0.34187f, 0.52869f,  1.27240f,  1.20306f,  1.19121f,
+  1.28742f,  0.26393f,  -0.62319f, 0.92285f,  -0.08303f, -0.33118f, -0.13053f,
+  0.24875f,  -0.52089f, 0.44691f,  -1.08908f, 1.20921f,  0.36538f,  -0.46792f,
+  -0.18855f, -0.13443f, -0.28472f, -0.10353f, 0.06911f,  0.68519f,  0.08228f,
+  -0.49027f, -0.34381f, 0.04719f,  -0.33298f, 0.72525f,  0.09538f,  -0.29216f,
+  -0.07260f, -0.55827f, 0.54542f,  -0.10144f, -0.09292f, -0.14427f, -0.38361f,
+  -0.41559f, 0.75338f,  -0.04530f, 0.27944f,  0.06932f,  -0.11537f, 0.29568f,
+  1.92155f,  -0.98996f, -0.08841f, 0.49386f,  0.15947f,  0.53290f,  1.46747f,
+  0.59360f,  0.25468f,
+};
+
+static const float av1_tx_type_nn_bias_32x16_ver_layer0[16] = {
+  -1.19673f, 0.33043f,  0.24408f, 0.46221f,  2.00646f, 0.19031f,
+  -0.64944f, -0.43452f, 1.04400f, 1.47371f,  0.52460f, -1.39577f,
+  0.83852f,  -0.25536f, 1.33200f, -0.24444f,
+};
+
+static const float av1_tx_type_nn_weights_32x16_ver_layer1[64] = {
+  -1.31447f, -0.86455f, 0.85217f,  1.00048f,  0.37395f, -1.35713f, -0.54032f,
+  0.82803f,  0.89606f,  1.57696f,  0.68067f,  0.42512f, -0.26250f, 0.14621f,
+  0.93249f,  -0.77690f, -0.93652f, -0.44488f, 0.68360f, -0.88178f, 1.89111f,
+  0.67700f,  -0.29310f, 0.91604f,  -1.21881f, 1.11188f, 0.45045f,  -0.86119f,
+  -0.09294f, 0.09360f,  0.80794f,  0.41027f,  1.80399f, -0.50362f, -1.44689f,
+  0.85148f,  0.90707f,  -0.18458f, 0.14165f,  1.17367f, 0.70869f,  1.57147f,
+  0.24692f,  0.16626f,  0.56794f,  0.07313f,  0.14728f, -0.74296f, 1.74127f,
+  1.26560f,  0.17753f,  1.10194f,  0.56435f,  1.73779f, 1.42841f,  -1.16773f,
+  0.24584f,  0.10813f,  -0.60187f, 0.79802f,  0.75229f, -0.06112f, 1.77282f,
+  1.01058f,
+};
+
+static const float av1_tx_type_nn_bias_32x16_ver_layer1[4] = {
+  0.83082f,
+  2.03845f,
+  0.59627f,
+  2.31341f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_32x16_ver = {
+  8,  // num_inputs
+  4,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_type_nn_weights_32x16_ver_layer0,
+      av1_tx_type_nn_weights_32x16_ver_layer1,
+  },
+  {
+      av1_tx_type_nn_bias_32x16_ver_layer0,
+      av1_tx_type_nn_bias_32x16_ver_layer1,
+  },
+};
+/******************************************************************************/
+
+// Map tx_size to its corresponding neural net model for tx type prediction.
+static const NN_CONFIG *av1_tx_type_nnconfig_map_hor[] = {
+  &av1_tx_type_nnconfig_4x4,        // 4x4
+  &av1_tx_type_nnconfig_8x8,        // 8x8
+  &av1_tx_type_nnconfig_16x16,      // 16x16
+  NULL,                             // 32x32
+  NULL,                             // 64x64
+  &av1_tx_type_nnconfig_4x8_hor,    // 4x8
+  &av1_tx_type_nnconfig_8x4_hor,    // 8x4
+  &av1_tx_type_nnconfig_8x16_hor,   // 8x16
+  &av1_tx_type_nnconfig_16x8_hor,   // 16x8
+  &av1_tx_type_nnconfig_16x32_hor,  // 16x32
+  &av1_tx_type_nnconfig_32x16_hor,  // 32x16
+  NULL,                             // 32x64
+  NULL,                             // 64x32
+  NULL,                             // 4x16
+  NULL,                             // 16x4
+  NULL,                             // 8x32
+  NULL,                             // 32x8
+  NULL,                             // 16x64
+  NULL,                             // 64x16
+};
+
+static const NN_CONFIG *av1_tx_type_nnconfig_map_ver[] = {
+  &av1_tx_type_nnconfig_4x4,        // 4x4 transform
+  &av1_tx_type_nnconfig_8x8,        // 8x8 transform
+  &av1_tx_type_nnconfig_16x16,      // 16x16 transform
+  NULL,                             // 32x32 transform
+  NULL,                             // 64x64 transform
+  &av1_tx_type_nnconfig_4x8_ver,    // 4x8 transform
+  &av1_tx_type_nnconfig_8x4_ver,    // 8x4 transform
+  &av1_tx_type_nnconfig_8x16_ver,   // 8x16 transform
+  &av1_tx_type_nnconfig_16x8_ver,   // 16x8 transform
+  &av1_tx_type_nnconfig_16x32_ver,  // 16x32 transform
+  &av1_tx_type_nnconfig_32x16_ver,  // 32x16 transform
+  NULL,                             // 32x64 transform
+  NULL,                             // 64x32 transform
+  NULL,                             // 4x16 transform
+  NULL,                             // 16x4 transform
+  NULL,                             // 8x32 transform
+  NULL,                             // 32x8 transform
+  NULL,                             // 16x64 transform
+  NULL,                             // 64x16 transform
+};
+
+// Tx split model for 4x8 block.
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = {
+  0.068650f,  -0.732073f, -0.040361f, 0.322550f,  -0.021123f, 0.212518f,
+  -0.350546f, 0.435987f,  -0.111756f, -0.401568f, 0.069548f,  -0.313000f,
+  0.073918f,  -0.373805f, -0.775810f, -0.124753f, 0.181094f,  -0.602641f,
+  -0.026219f, -0.350112f, 0.020599f,  -0.311752f, -0.476482f, -0.669465f,
+  -0.310921f, 0.348869f,  -0.115984f, 0.154250f,  0.200485f,  -0.016689f,
+  0.020392f,  0.413810f,  0.634064f,  -0.627530f, 0.399178f,  -0.012284f,
+  0.472030f,  0.091087f,  -0.706100f, -0.447944f, -0.274226f, 0.445656f,
+  0.309339f,  0.505522f,  0.038496f,  -0.152809f, 0.408684f,  -0.068151f,
+  0.271612f,  0.353233f,  -0.150365f, 0.075212f,  -0.035096f, 0.346615f,
+  0.124382f,  0.477072f,  0.216288f,  0.070548f,  -0.106362f, 0.681613f,
+  -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f,
+  0.063009f,  -0.123053f, 0.104875f,  -0.137581f, -0.282933f, -0.003624f,
+  -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f,
+  -0.670248f, -0.353762f, 0.181109f,  0.289715f,  -0.071206f, 0.261141f,
+  0.052796f,  -0.114554f, -0.139214f, -0.261380f, 0.075984f,  -0.647925f,
+  -0.099528f, -0.677814f, 0.015712f,  -0.389385f, -0.095622f, -0.165117f,
+  -0.109454f, -0.175240f, -0.393914f, 0.212330f,  0.037822f,  0.248280f,
+  0.180197f,  0.110493f,  -0.525727f, -0.092329f, -0.524029f, -0.407364f,
+  -0.542373f, -0.435626f, -0.912194f, 0.062794f,  0.160433f,  0.741485f,
+  -0.103659f, -0.119327f, -0.055275f, 0.334358f,  0.014713f,  0.046327f,
+  0.831114f,  -0.576682f, 0.354369f,  -0.082088f, 0.452331f,  0.039730f,
+  -0.792429f, -0.385862f,
+};
+
+static const float av1_tx_split_nn_bias_4x8_layer0[16] = {
+  0.238621f,  2.186830f,  1.383035f,  -0.867139f, 1.257119f, -0.351571f,
+  -0.240650f, -0.971692f, 2.744843f,  1.116991f,  0.139062f, -0.165332f,
+  0.262171f,  -1.598153f, -1.427340f, -1.602306f,
+};
+
+static const float av1_tx_split_nn_weights_4x8_layer1[16] = {
+  -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f,
+  -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f,  0.278987f,
+  0.085082f,  0.614986f, 0.847904f,  0.637578f,
+};
+
+static const float av1_tx_split_nn_bias_4x8_layer1[1] = {
+  0.20586078f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_4x8 = {
+  8,  // num_inputs
+  1,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_4x8_layer0,
+      av1_tx_split_nn_weights_4x8_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_4x8_layer0,
+      av1_tx_split_nn_bias_4x8_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx split model for 8x8 block.
+static const float av1_tx_split_nn_weights_8x8_layer0[144] = {
+  0.177983f,  -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f,
+  -0.098202f, -0.279510f, 0.001054f,  -0.119319f, -1.835282f, -0.581507f,
+  -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f,
+  -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f,
+  0.015331f,  -0.341818f, 0.145549f,  -0.348362f, 0.147647f,  -0.323400f,
+  0.047558f,  -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f,
+  0.447740f,  0.782381f,  -0.179164f, -0.584675f, -0.052645f, 0.038656f,
+  -0.096783f, 0.038342f,  -0.170762f, -0.405844f, -0.552665f, -0.509866f,
+  0.757204f,  -1.296465f, 0.631015f,  0.009265f,  0.646192f,  0.044523f,
+  0.653161f,  0.033820f,  0.849639f,  -0.068555f, -1.036085f, -0.511652f,
+  0.104693f,  -1.458690f, 0.286051f,  -0.089800f, 0.381564f,  -0.302640f,
+  0.304465f,  -0.268706f, 0.432603f,  -0.117914f, -2.070031f, -0.565696f,
+  -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f,
+  -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f,
+  0.101894f,  -0.221847f, 0.018412f,  -0.423887f, -0.266684f, -0.444930f,
+  -0.196237f, 0.106638f,  -0.065834f, -0.538401f, -0.280772f, -0.620348f,
+  1.089957f,  -0.799928f, 0.504112f,  -0.165763f, 0.578741f,  -0.172653f,
+  0.547316f,  -0.143484f, 0.717220f,  -0.297190f, -1.237854f, -0.074819f,
+  -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f,
+  -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f,
+  0.231249f,  -1.693073f, -0.035899f, 0.380845f,  -0.058476f, 0.409405f,
+  -0.066679f, 0.406731f,  -0.068501f, 0.396748f,  0.639462f,  0.150834f,
+  -0.418659f, -1.421931f, 0.101889f,  0.083573f,  0.129746f,  0.134460f,
+  0.081185f,  0.127420f,  0.083664f,  0.051096f,  1.361688f,  0.386093f,
+};
+
+static const float av1_tx_split_nn_bias_8x8_layer0[12] = {
+  4.280443f, 2.218902f, -0.256953f, 3.161431f,  2.082548f, 2.506052f,
+  2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f,
+};
+
+static const float av1_tx_split_nn_weights_8x8_layer1[12] = {
+  1.178833f,  -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f,
+  -0.766968f, -0.356663f, 0.450146f,  0.509370f, -0.356604f, -0.443506f,
+};
+
+static const float av1_tx_split_nn_bias_8x8_layer1[1] = {
+  -0.156294f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x8 = {
+  12,  // num_inputs
+  1,   // num_outputs
+  1,   // num_hidden_layers
+  {
+      12,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_8x8_layer0,
+      av1_tx_split_nn_weights_8x8_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_8x8_layer0,
+      av1_tx_split_nn_bias_8x8_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx split model for 8x16 block.
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = {
+  0.374660f,  0.218905f,  -0.139779f, 0.212141f,  0.056517f,  0.051114f,
+  0.042860f,  -0.273258f, -0.340809f, 0.138983f,  -0.216996f, -0.241519f,
+  -0.123244f, 0.078577f,  -0.472273f, -0.194201f, 0.125056f,  0.239761f,
+  -0.332782f, 0.174782f,  -0.211400f, -0.129795f, 0.062195f,  0.113176f,
+  -0.008869f, 0.140764f,  0.059833f,  0.163826f,  0.359293f,  -0.109797f,
+  -0.022091f, -0.059536f, -0.188226f, 0.179709f,  0.031386f,  0.164790f,
+  0.214364f,  0.198555f,  0.152262f,  -0.242980f, 0.319367f,  -0.136902f,
+  0.046524f,  -0.043591f, 0.342178f,  -0.011757f, -0.014286f, 0.072871f,
+  -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f,
+  -0.120865f, -0.160042f, 0.240028f,  0.112902f,  -0.141587f, -0.703012f,
+  -0.136591f, 0.318993f,  -0.154417f, -0.054668f, 0.192870f,  0.176166f,
+  -0.029965f, 0.266942f,  -0.178384f, 0.038680f,  0.134403f,  -0.002426f,
+  0.534825f,  -0.070923f, 0.413281f,  0.418148f,  0.093729f,  0.016454f,
+  0.305358f,  -0.040512f, 0.069904f,  -0.227588f, -0.362220f, -0.031604f,
+  -0.394901f, 0.071506f,  -0.342833f, -0.142550f, -0.164005f, 0.182600f,
+  0.213062f,  0.076805f,  0.278758f,  0.125613f,  -0.035552f, 0.040971f,
+  0.182785f,  -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f,
+  0.114657f,  0.047121f,  0.195902f,  0.264759f,  0.017799f,  0.210230f,
+  0.150749f,  -0.142142f, 0.182494f,  -0.142415f, -0.259782f, -0.114830f,
+  -0.198826f, 0.000061f,  -0.375668f, -0.276656f, -0.373202f, 0.210298f,
+  0.422680f,  0.066960f,  0.351106f,  -0.209034f, 0.367195f,  -0.110274f,
+  0.115573f,  -0.066642f, -0.389673f, -0.260447f, 0.056949f,  -0.180425f,
+  0.069922f,  -0.153506f, -0.097053f, -0.111757f, 0.094069f,  0.144837f,
+  -0.052984f, -0.506681f, -0.034474f, 0.279057f,  -0.105025f, 0.006656f,
+  -0.125017f, -0.114096f, 0.103153f,  -0.117402f, -0.359472f, 0.072534f,
+  0.110291f,  0.003088f,  -0.456897f, 0.038331f,  -0.322298f, 0.113942f,
+  -0.119916f, -0.194392f, 0.093167f,  0.193459f,  0.074671f,  0.033602f,
+  0.004440f,  -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f,
+  0.319160f,  -0.066218f, 0.291246f,  0.181292f,  0.089914f,  0.025273f,
+  0.303128f,  0.019063f,  0.078545f,  -0.396919f, 0.014065f,  -0.122121f,
+  0.037107f,  -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f,
+  0.102970f,  -0.225040f, 0.061059f,  -0.258188f, -0.469871f, -0.099607f,
+  -0.061524f, -0.213700f, 0.070237f,  -0.289134f, -0.238225f, 0.256403f,
+  -0.119344f, 0.067782f,  -0.398983f, -0.123975f, -0.200205f, -0.047038f,
+  0.026569f,  0.031037f,  0.094302f,  -0.101239f, 0.433307f,  -0.303612f,
+  0.088537f,  -0.164436f, 0.202471f,  -0.048592f, -0.251904f, 0.122577f,
+  -0.309874f, -0.263405f, -0.292503f, 0.216589f,  0.035378f,  0.136599f,
+  -0.145844f, -0.018211f, 0.174084f,  -0.449941f, -0.001428f, 0.064134f,
+  0.039652f,  0.111083f,  -0.246076f, -0.204733f, 0.056559f,  -0.000123f,
+  0.104049f,  0.138512f,  -0.128309f, 0.087855f,  0.232784f,  0.247138f,
+  0.162766f,  0.154829f,  0.313605f,  -0.164115f, -0.050844f, 0.156549f,
+  0.185279f,  -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f,
+  -0.203399f, -0.096831f, -0.127867f, 0.310674f,  -0.008181f, 0.004078f,
+  -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f,
+  0.114268f,  -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f,
+  -0.352853f, -0.224001f, -0.156330f, 0.215436f,  0.171846f,  0.291849f,
+  0.108832f,  0.046991f,  -0.127801f, 0.032485f,  0.141493f,  0.123319f,
+  -0.057250f, 0.315346f,  -0.061317f, -0.465086f, -0.130179f, -0.217841f,
+  -0.239089f, -0.073251f, -0.327718f, 0.054905f,  -0.283169f, -0.028900f,
+  0.071450f,  0.270072f,  0.248891f,  0.088052f,  0.253319f,  0.122808f,
+  0.175490f,  -0.147805f, 0.089169f,  -0.045457f, -0.330788f, 0.099791f,
+  -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f,
+  0.162554f,  -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f,
+  -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f,
+  -0.182036f, 0.176772f,  -0.070823f, 0.216054f,  -0.211533f, -0.232992f,
+  0.279346f,  0.117984f,  0.236674f,  0.126625f,  -0.046220f, 0.044919f,
+  0.278492f,  0.083944f,  0.180512f,  0.217994f,  0.401170f,  -0.064417f,
+  0.011636f,  -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f,
+  -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f,  -0.312849f,
+  -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f,
+  -0.354389f, 0.169464f,  0.094151f,  -0.217122f, -0.456397f, 0.211478f,
+  0.219232f,  -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f,
+  -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f,  -0.144181f,
+  0.335028f,  0.176439f,  0.105980f,  0.169390f,  0.155615f,  -0.040618f,
+  -0.176029f, 0.155569f,  -0.184833f, -0.171099f, -0.178663f, -0.032051f,
+  -0.434334f, 0.092238f,  -0.263103f, 0.061804f,  -0.172957f, 0.005962f,
+  -0.100176f, 0.125898f,  0.048092f,  -0.088141f, 0.247196f,  -0.221601f,
+  -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f,
+  0.403401f,  -0.046200f, 0.322259f,  0.219678f,  0.109850f,  0.051837f,
+  0.196861f,  -0.019118f, 0.248818f,  -0.137567f, 0.127862f,  0.052293f,
+  0.298726f,  0.275788f,  0.015344f,  0.058714f,  0.283691f,  -0.053794f,
+  -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f,
+  -0.252396f, -0.069017f, 0.034803f,  -0.003388f, -0.262577f, 0.062115f,
+  -0.298393f, 0.215415f,  -0.153615f, 0.289902f,  0.085886f,  -0.504290f,
+  0.077178f,  0.150861f,  -0.228848f, -0.261020f, 0.198204f,  0.162113f,
+  0.346418f,  -0.286950f, 0.354756f,  -0.226419f, 0.024720f,  0.208037f,
+  0.107286f,  -0.110849f, 0.104415f,  -0.207725f, 0.063932f,  -0.037748f,
+  -0.167037f, -0.068282f, 0.320815f,  -0.051884f, 0.099989f,  -0.078388f,
+  0.127071f,  0.046675f,  -0.336571f, -0.273080f, 0.264694f,  -0.007352f,
+  -0.093828f, 0.094773f,  -0.144434f, 0.091795f,  -0.031615f, 0.056914f,
+  0.064673f,  -0.136669f, 0.344734f,  0.225926f,  0.283451f,  -0.068354f,
+  0.030572f,  0.180784f,  -0.378047f, -0.092962f, -0.083291f, 0.038970f,
+  0.052094f,  -0.017932f, 0.216302f,  -0.184396f, 0.079888f,  0.210406f,
+  -0.020627f, 0.244744f,  0.336972f,  -0.182914f, -0.220976f, -0.304225f,
+  -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f,
+  -0.408768f, 0.184693f,
+};
+
+static const float av1_tx_split_nn_bias_8x16_layer0[64] = {
+  -0.274107f, 0.445751f,  0.234359f,  0.291593f,  0.163298f,  0.183707f,
+  -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f,  -0.354974f,
+  0.000000f,  -0.254630f, 0.220149f,  0.371104f,  0.789759f,  0.270300f,
+  0.195126f,  -0.206958f, 0.917708f,  -0.256232f, 1.131933f,  1.178944f,
+  0.461270f,  0.246169f,  -0.818614f, -0.111986f, 0.759355f,  0.154889f,
+  0.470299f,  -1.025250f, 0.678678f,  0.959346f,  -0.164105f, 0.544079f,
+  -0.448733f, 0.649221f,  -0.536672f, 0.962758f,  -0.256427f, 0.808664f,
+  -0.118694f, 0.684873f,  -0.015635f, -0.046469f, 0.075481f,  0.412647f,
+  0.454456f,  -0.107169f, 0.775235f,  -0.261629f, -1.194849f, 0.010093f,
+  -0.231289f, 0.658286f,  -0.769320f, 0.564545f,  0.482962f,  -0.131378f,
+  -0.255844f, -0.078400f, 0.476752f,  0.643001f,
+};
+
+static const float av1_tx_split_nn_weights_8x16_layer1[64] = {
+  -0.145065f, -0.145101f, 0.174786f,  0.196692f,  0.102025f,  -0.087735f,
+  0.386353f,  -0.660539f, -0.183940f, 0.490045f,  -0.276404f, -0.145669f,
+  0.209846f,  -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f,
+  -0.108545f, -0.261181f, 1.435606f,  -0.176621f, -1.158548f, 2.035680f,
+  0.218069f,  -0.138629f, 0.305958f,  -0.277194f, -0.602468f, 0.203873f,
+  0.120720f,  0.216095f,  -0.434502f, -0.579746f, -0.239450f, 0.755529f,
+  0.545643f,  0.232091f,  0.330169f,  0.988136f,  -0.070465f, -0.345584f,
+  -0.162455f, -0.617064f, 0.123881f,  -0.201098f, 0.222756f,  0.112932f,
+  0.048647f,  -0.147890f, 0.394584f,  -0.262148f, 0.280564f,  -0.195432f,
+  -0.047515f, 1.133410f,  0.255415f,  -0.299032f, -0.397807f, -0.153246f,
+  -0.256734f, 0.177370f,  0.213522f,  -0.530158f,
+};
+
+static const float av1_tx_split_nn_bias_8x16_layer1[1] = {
+  0.14910713f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x16 = {
+  8,  // num_inputs
+  1,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      64,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_8x16_layer0,
+      av1_tx_split_nn_weights_8x16_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_8x16_layer0,
+      av1_tx_split_nn_bias_8x16_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx split model for 16x16 block.
+static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = {
+  -0.177215f, -0.297166f, 0.299924f,  0.207878f,  0.216871f,  0.173264f,
+  0.295464f,  0.048395f,  0.154731f,  0.305880f,  0.056787f,  -0.166617f,
+  0.115653f,  -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f,
+  -0.024940f, -0.007055f, 0.001392f,  0.021678f,  -1.594600f, -0.099593f,
+  0.332930f,  0.103574f,  0.158249f,  0.182601f,  0.332665f,  0.226207f,
+  -0.139566f, 0.185531f,  0.099074f,  -0.185654f, -0.203121f, -0.285678f,
+  -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f,
+  -0.066150f, -0.099058f, -0.458879f, 0.127544f,  0.338314f,  -0.161350f,
+  0.030091f,  -0.075528f, 0.004320f,  0.353690f,  -0.013480f, -0.420402f,
+  -0.004659f, -0.329401f, -0.001745f, 0.227384f,  -0.055183f, 0.121405f,
+  0.160340f,  0.143603f,  -0.221813f, 0.079107f,  -0.657639f, -0.084348f,
+  -0.303414f, 0.046774f,  -0.367679f, 0.060005f,  0.168645f,  0.084421f,
+  -0.133625f, 0.301375f,  0.079412f,  -0.419303f, 0.017235f,  0.068637f,
+  0.018384f,  -0.428325f, -0.019753f, 0.149444f,  -0.474836f, -0.287162f,
+  0.198083f,  0.028292f,  -0.299092f, -0.005849f, -0.256245f, 0.233277f,
+  -0.217561f, -0.264003f, 0.269411f,  0.207032f,  -0.339411f, -0.198431f,
+  -0.028521f, 0.158076f,  0.177116f,  0.345702f,  -0.145132f, 0.064623f,
+  -0.090867f, 0.288816f,  -0.263198f, -0.071028f, -0.044546f, 0.380017f,
+  -0.014100f, -0.271192f, -0.318559f, 0.129015f,  -0.050314f, -0.093355f,
+  -0.578498f, 0.099090f,  -0.133080f, -0.029975f, -0.059828f, -0.157765f,
+  -0.321153f, -0.343671f, -0.242959f, 0.128304f,  0.017170f,  0.072787f,
+  -0.475838f, -0.003806f, -0.068615f, 0.150556f,  -0.159903f, -0.416513f,
+  0.218794f,  -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f,
+  -0.077329f, -0.089747f, -0.096526f, 0.537952f,  0.134725f,  -0.006469f,
+  -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f,  -0.021712f,
+  -0.513992f, 0.259135f,  -0.319808f, 0.077811f,  0.104613f,  0.370571f,
+  0.185244f,  0.065530f,  -0.091098f, -0.573741f, 0.111934f,  0.437417f,
+  -0.123691f, 0.220641f,  -0.024783f, -0.149460f, -0.354185f, -0.134127f,
+  0.038015f,  -0.380596f, 0.250980f,  0.142208f,  0.135170f,  -0.131129f,
+  -0.357556f, -0.530945f, 0.159672f,  -0.147025f, -0.377829f, -0.504508f,
+  -0.492870f, 0.020753f,  0.142818f,  0.025172f,  0.086140f,  0.091283f,
+  0.087491f,  -0.186415f, 0.177785f,  -0.195121f, -1.191148f, -0.477102f,
+  0.023371f,  0.227004f,  -0.023502f, -0.242913f, -0.074398f, -0.153480f,
+  0.162900f,  0.415509f,  -0.162565f, -0.131709f, -0.258852f, -0.252027f,
+  -0.080845f, -0.330274f, 0.021874f,  0.232398f,  0.069277f,  0.220567f,
+  -0.024237f, -0.366771f, 0.081673f,  -0.429906f, -0.302170f, 0.061045f,
+  0.352777f,  -0.230376f, 0.408153f,  0.064758f,  0.142051f,  0.007219f,
+  0.622878f,  0.212577f,  0.036489f,  0.081150f,  -0.284767f, 0.107763f,
+  -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f,
+  -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f,  0.057565f,
+  0.414265f,  -0.159155f, 0.221456f,  0.146314f,  0.265776f,  -0.006516f,
+  0.473978f,  -0.186431f, 0.288672f,  -0.060437f, 0.083380f,  -0.205641f,
+  0.360016f,  0.222041f,  0.420011f,  0.024579f,  0.377546f,  0.250380f,
+  -0.069900f, 0.296743f,  0.073532f,  -0.243225f, -0.374987f, -0.387288f,
+  -0.237255f, -0.287013f, 0.417831f,  -0.252988f, -0.257652f, -0.066775f,
+  -0.253926f, 0.057841f,  0.346133f,  -0.157797f, -0.406028f, -0.286893f,
+  0.274507f,  -0.452561f, 0.143381f,  -0.097755f, 0.021242f,  0.034561f,
+  0.044115f,  0.004065f,  0.066729f,  0.043558f,  0.102991f,  -0.477574f,
+};
+
+static const float av1_tx_split_nn_bias_16x16_layer0[24] = {
+  -0.479033f, 1.467402f,  -0.366291f, 0.372511f,  0.715322f,  -0.605500f,
+  0.176848f,  0.032318f,  0.237429f,  -0.046047f, 0.452082f,  0.451805f,
+  -0.822845f, 0.636762f,  -0.057350f, 1.163978f,  0.728287f,  0.603654f,
+  -0.245519f, -0.893569f, -1.428185f, 0.808870f,  -0.076159f, 1.231976f,
+};
+
+static const float av1_tx_split_nn_weights_16x16_layer1[24] = {
+  -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f,  -0.170504f,
+  -0.538432f, 0.033893f, 0.149842f,  0.404140f,  -0.377812f, 0.338838f,
+  -0.176091f, 0.249844f, -0.362533f, 1.412460f,  0.196862f,  0.278194f,
+  -0.140444f, 0.297746f, 0.172533f,  0.116470f,  -0.151656f, -0.603250f,
+};
+
+static const float av1_tx_split_nn_bias_16x16_layer1[1] = {
+  0.184803f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x16 = {
+  12,  // num_inputs
+  1,   // num_outputs
+  1,   // num_hidden_layers
+  {
+      24,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_16x16_layer0,
+      av1_tx_split_nn_weights_16x16_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_16x16_layer0,
+      av1_tx_split_nn_bias_16x16_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx split model for 32x32 block.
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = {
+  -0.439303f, 0.004813f,  -0.365052f, -0.116868f, -0.356716f, -0.196537f,
+  -0.196770f, -0.076096f, 0.357004f,  -0.044909f, -0.112910f, -0.129081f,
+  0.156725f,  -0.386346f, 0.038971f,  0.160696f,  0.204923f,  -0.384333f,
+  -0.319546f, 0.028179f,  -0.250524f, -0.289669f, -0.284138f, -0.258963f,
+  -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f,  0.141414f,
+  0.303016f,  0.098066f,  0.482455f,  0.036069f,  -0.166279f, 0.210119f,
+  -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f,
+  -0.306403f, 0.026318f,  -0.277296f, 0.092684f,  -0.033584f, -0.018371f,
+  -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f,
+  0.361851f,  -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f,
+  -0.097051f, 0.259172f,  0.016432f,  0.259358f,  0.145059f,  0.037196f,
+  0.091581f,  -0.219644f, 0.140384f,  -0.446837f, -0.234531f, 0.149508f,
+  -0.083429f, 0.186189f,  -0.099890f, -0.111277f, 0.495214f,  0.085053f,
+  -0.266613f, -0.051366f, 0.148593f,  0.111875f,  0.077787f,  -0.371653f,
+  -0.146157f, -0.229235f, 0.076203f,  0.488975f,  0.096771f,  -0.009483f,
+  0.192985f,  0.246273f,  -0.192671f, -0.557890f, -0.292650f, -0.088907f,
+  -0.106892f, -0.329659f, 0.012105f,  -0.359326f, 0.170723f,  -0.004357f,
+  0.171593f,  -0.478768f, -0.236016f, -0.035077f, 0.133731f,  0.137962f,
+  -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f,
+  -0.649359f, 0.127605f,  0.097930f,  0.182775f,  -0.313324f, 0.053349f,
+  0.204203f,  -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f,
+  -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f,
+  0.632147f,  0.221825f,  0.268394f,  -0.096357f, 0.442545f,  -0.007117f,
+  -0.036125f, 0.000525f,  0.088092f,  -0.203653f, 0.086925f,  0.439141f,
+  0.329889f,  -0.370050f, -0.194306f, -0.207430f, 0.132779f,  -0.217614f,
+  -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f,
+  -0.007300f, 0.062257f,  -0.347865f, -0.296767f, -0.359123f, 0.230459f,
+  -0.189117f, -0.087622f, -0.561091f, 0.184182f,  -0.044980f, 0.012643f,
+  0.241672f,  0.050272f,  -0.204851f, -0.159285f, -0.064081f, -0.118666f,
+  -0.269471f, 0.231668f,  0.135749f,  -0.131162f, 0.062760f,  0.100949f,
+  0.074967f,  -0.056918f, 0.251707f,  0.034098f,  0.341290f,  -0.105027f,
+  0.313246f,  -0.092679f, -0.014632f, -0.390967f, 0.136881f,  -0.241554f,
+  0.097674f,  0.110832f,  -0.390245f, 0.017654f,  -0.506222f, 0.065252f,
+  0.244834f,  -0.171352f, -0.331702f, 0.111043f,  0.125217f,  -0.058116f,
+  -0.382595f, -0.052545f, 0.114261f,  -0.493617f, 0.243984f,  -0.171053f,
+  0.165009f,  -0.063020f, 0.096502f,  0.341339f,  -0.013443f, 0.056372f,
+  0.339284f,  0.398376f,  0.389409f,  0.257252f,  0.517368f,  0.078856f,
+  0.087716f,  -0.171092f, 0.227461f,  0.125307f,  -0.054423f, -0.143161f,
+  0.224041f,  -0.086477f, -0.092548f, 0.072392f,  -0.061608f, 0.258347f,
+  0.147033f,  -0.478244f, -0.204869f, 0.038552f,  -0.144563f, 0.224087f,
+  -0.296705f, 0.153889f,  -0.064624f, 0.085265f,  -0.103826f, 0.127971f,
+  0.019965f,  0.111937f,  -0.074187f, -0.029518f, -0.127305f, -0.012210f,
+  0.042714f,  0.070052f,  -0.202360f, 0.348144f,  -0.132097f, -0.209585f,
+  -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f,  -0.013468f,
+  -0.406090f, -0.144936f, 0.208620f,  0.343445f,  -0.059639f, 0.114857f,
+  -0.069431f, -0.218725f, 0.190575f,  -0.368101f, 0.030030f,  0.062815f,
+  -0.239369f, -0.537852f, 0.022487f,  0.023038f,  0.190788f,  0.040123f,
+  -0.004304f, 0.060749f,  -0.108929f, 0.136796f,  -0.542875f, -0.227074f,
+  -0.182244f, 0.082559f,  0.019149f,  0.178854f,  0.120284f,  0.009070f,
+  0.068268f,  -0.544822f, 0.120536f,  0.354028f,  -0.119890f, -0.122055f,
+  -0.405335f, 0.122341f,  -0.304412f, 0.062405f,  -0.302568f, -0.276505f,
+  -0.120915f, -0.221841f, 0.282007f,  -0.253971f, 0.059517f,  -0.144976f,
+  0.149391f,  -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f,
+  0.017485f,  0.021038f,  -0.023728f, -0.192181f, -0.103996f, 0.092873f,
+  -0.114365f, -0.397732f, -0.065421f, 0.053084f,  0.035201f,  0.053019f,
+  -0.105377f, -0.039500f, 0.131904f,  -0.123911f, -0.390328f, -0.125198f,
+  -0.000126f, 0.014864f,  -0.220187f, 0.084056f,  -0.492155f, -0.164979f,
+  0.133592f,  0.121519f,  -0.240813f, 0.186680f,  0.118673f,  0.235006f,
+  -0.239894f, -0.185759f, -0.336992f, 0.209620f,  -0.298845f, 0.127803f,
+  -0.083992f, 0.194340f,  -0.245378f, 0.212308f,  0.142512f,  -0.163324f,
+  0.383495f,  0.291065f,  0.286620f,  -0.239957f, 0.225127f,  -0.174424f,
+  0.297231f,  -0.045434f, 0.156444f,  -0.184273f, -0.204567f, 0.202551f,
+  0.370019f,  -0.073910f, 0.344897f,  0.063100f,  0.338547f,  -0.099145f,
+  0.391863f,  -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer0[32] = {
+  0.143343f,  -0.021982f, -0.314939f, 0.170867f,  -0.081248f, 0.125758f,
+  -0.355762f, 0.279798f,  1.027712f,  -0.434660f, 1.072005f,  0.668893f,
+  -0.031216f, -0.528650f, 0.328349f,  0.543645f,  -0.188810f, 0.221110f,
+  -1.638637f, 0.058045f,  -1.731105f, -0.444284f, 0.513693f,  0.890025f,
+  0.160288f,  0.393312f,  0.332856f,  -0.080767f, 0.299822f,  0.235876f,
+  0.254942f,  -0.017796f,
+};
+
+static const float av1_tx_split_nn_weights_32x32_layer1[32] = {
+  -0.090326f, -0.267553f, -0.026071f, 0.100912f,  0.279137f,  0.079064f,
+  -0.074885f, 0.053804f,  0.736810f,  -0.031693f, -0.970514f, 0.174069f,
+  0.095940f,  -0.065047f, 0.052911f,  0.176728f,  -0.058274f, 0.148364f,
+  -0.162210f, 0.093875f,  -0.367663f, 0.020876f,  0.137280f,  -1.099116f,
+  0.146854f,  0.075590f,  0.228534f,  0.141993f,  0.072143f,  0.101421f,
+  -0.068547f, -0.154148f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer1[1] = {
+  0.316622f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_32x32 = {
+  12,  // num_inputs
+  1,   // num_outputs
+  1,   // num_hidden_layers
+  {
+      32,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_32x32_layer0,
+      av1_tx_split_nn_weights_32x32_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_32x32_layer0,
+      av1_tx_split_nn_bias_32x32_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx split model for 64x64 block.
+static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = {
+  -0.006828f, 0.149944f,  -0.017614f, -0.044599f, -0.024517f, 0.507698f,
+  0.001039f,  0.037164f,  0.015091f,  -0.306620f, -0.162047f, -0.369440f,
+  0.396310f,  0.087121f,  0.208609f,  -0.083068f, 0.493774f,  0.217682f,
+  0.377393f,  0.172879f,  0.397422f,  0.078919f,  0.741350f,  0.064169f,
+  -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f,
+  -0.436596f, -0.007551f, -0.396721f, 0.153570f,  -0.190838f, -0.071869f,
+  0.048799f,  -0.301301f, -0.005015f, 0.500480f,  -0.030622f, -0.559095f,
+  -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f,  -0.411323f,
+  -0.005366f, -0.069496f, 0.019990f,  0.327931f,  -0.002516f, 0.393190f,
+  0.001759f,  0.035093f,  -0.030302f, -0.528984f, 0.174781f,  0.241462f,
+  -0.415427f, -0.164502f, 0.143065f,  -0.122595f, 0.082049f,  -0.143346f,
+  0.055642f,  -0.124701f, 0.004050f,  -0.216235f, -2.681730f, 0.101658f,
+  0.381239f,  0.465936f,  0.331154f,  0.301708f,  -0.360171f, 0.054886f,
+  -0.118658f, 0.287921f,  0.277859f,  0.203784f,  0.247809f,  0.656924f,
+  -0.354628f, 0.315081f,  0.105108f,  -0.510179f, 0.059267f,  0.061386f,
+  0.076423f,  0.347119f,  0.100134f,  0.028402f,  -0.118621f, -0.238689f,
+  0.080141f,  -0.138863f, 0.009009f,  -0.100526f, -0.138875f, 0.066992f,
+  0.005949f,  0.564336f,  0.046994f,  0.004655f,  0.366047f,  0.014695f,
+  -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f,  -0.020925f,
+  -0.227236f, -0.068141f, 0.282009f,  0.040192f,  -0.267100f, 0.229228f,
+  0.133861f,  0.338706f,  -0.030178f, -0.040919f, -0.026343f, -0.330338f,
+  -0.066931f, -0.110580f, -0.072056f, 0.599457f,  -0.020738f, 0.169200f,
+  0.836240f,  -0.157548f, 0.386273f,  0.002404f,  0.329410f,  -0.007020f,
+  0.351705f,  -0.041259f, 0.388861f,  0.003899f,  0.582627f,  0.023572f,
+  0.409912f,  -0.158472f, 0.536383f,  0.525093f,  0.604247f,  0.439159f,
+  0.692832f,  0.046272f,  0.590367f,  -0.082166f, 0.262357f,  0.478671f,
+  0.031935f,  0.042675f,  0.120002f,  0.398616f,  -0.078967f, 0.227986f,
+  -0.044679f, 0.151061f,  -0.085564f, 0.220205f,  -0.265606f, -0.203623f,
+  0.204719f,  -0.125922f, 0.038544f,  -0.269379f, 0.025866f,  0.109967f,
+  0.019064f,  -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f,
+  0.278496f,  0.018620f,  0.209971f,  0.296250f,  0.142850f,  0.288689f,
+  0.137084f,  0.130517f,  0.128171f,  -0.155396f, -0.008449f, -0.099845f,
+  0.173455f,  -0.059909f, -0.147318f, 0.102851f,  -0.251389f, -0.001448f,
+  0.103907f,  0.297273f,  -0.027846f, 0.028260f,  -0.382601f, 0.346695f,
+  -0.601641f, 0.162366f,  -0.477495f, -0.042731f, -0.387871f, -0.051791f,
+  -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f,  0.003008f,
+  0.099917f,  -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f,
+  -0.050644f, 0.020041f,  -0.132912f, -0.061578f, -3.083691f, -0.014961f,
+  -0.129115f, -0.710559f, 0.157213f,  -0.844037f, -0.121991f, -0.943386f,
+  -0.231269f, -0.003462f, 0.331478f,  -0.132703f, -1.285993f, -0.120957f,
+  -0.373755f, -0.322609f, 0.309059f,  -0.131523f, -0.118334f, -0.063805f,
+  -0.104251f, 0.012166f,  -0.094699f, -0.283753f, 0.128168f,  -0.526929f,
+  -0.050331f, 0.186153f,  0.005913f,  -0.221236f, 0.036363f,  0.160909f,
+  -0.001342f, -0.382749f, 0.037820f,  0.281689f,  -0.024275f, 0.028854f,
+  0.318291f,  0.318526f,  0.035778f,  0.034031f,  0.189663f,  -0.293367f,
+  0.082022f,  0.127923f,  0.078866f,  -0.081361f, -0.268117f, 0.246675f,
+  0.248605f,  -0.215479f, -0.073084f, 0.496140f,  -0.067327f, 0.396237f,
+  -0.120739f, 0.033752f,  -0.044120f, -0.218941f, -0.028078f, 0.195132f,
+  -0.040400f, 0.281604f,  -0.100471f, 0.415207f,  -0.258503f, -0.429749f,
+  0.150569f,  -0.010859f, 0.136448f,  0.026589f,  0.148466f,  0.110764f,
+  0.380967f,  0.009177f,  0.103075f,  0.116417f,  0.226273f,  -0.327746f,
+  0.169346f,  0.284553f,  -0.094986f, 0.312745f,  -0.147840f, 0.025062f,
+  -0.494482f, 0.112388f,  -0.213962f, 0.107050f,  -0.433371f, -0.096276f,
+  -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f,  0.042846f,
+  -0.237479f, 0.104746f,  0.158677f,  0.358937f,  0.099921f,  0.277109f,
+  0.012410f,  -0.062897f, 0.116130f,  0.255309f,  0.341628f,  0.145002f,
+  -0.429344f, -0.016433f, -0.068985f, 0.285194f,  -0.286719f, -0.018298f,
+  -0.179369f, -0.194655f, -0.165380f, 0.026071f,  -0.428268f, -0.379929f,
+  -0.727543f, 0.179610f,  -0.963979f, -0.042026f, -0.616202f, 0.133401f,
+  -0.784966f, 0.061205f,  -0.713357f, 0.129795f,  0.120512f,  -0.339545f,
+  0.353557f,  0.114906f,  -0.329813f, -0.209987f, 0.085410f,  0.214313f,
+  -0.122082f, 0.335770f,  -0.020937f, 0.202456f,  0.289023f,  -0.421186f,
+  0.337905f,  0.407663f,  0.132771f,  0.071734f,  0.213914f,  0.128595f,
+  0.302659f,  -0.209501f, 0.217756f,  0.253079f,  -0.089505f, -0.205614f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer0[32] = {
+  0.296914f,  -1.826816f, 0.346130f,  0.969520f,  -0.528154f, 1.175862f,
+  -0.075985f, -0.097323f, -0.233059f, 0.004846f,  0.401279f,  -2.272435f,
+  0.086257f,  0.414162f,  -0.194786f, -0.233887f, -0.113215f, -2.453546f,
+  0.861214f,  0.298361f,  0.267397f,  -0.158557f, -0.119911f, -0.098134f,
+  -0.339263f, 0.385871f,  -0.678123f, 0.263218f,  0.251611f,  -1.155773f,
+  -0.365437f, 0.229255f,
+};
+
+static const float av1_tx_split_nn_weights_64x64_layer1[32] = {
+  0.502104f,  -0.708023f, 0.419648f,  1.583418f,  0.419355f,  -1.462981f,
+  -0.439623f, 0.405691f,  0.823257f,  0.061654f,  0.750875f,  0.775031f,
+  -0.387909f, 0.447385f,  0.284690f,  0.353262f,  -0.224347f, 0.832864f,
+  -1.708491f, -1.042447f, -0.272829f, 0.540640f,  0.310509f,  0.723745f,
+  0.245592f,  -0.218417f, -0.597987f, -0.362301f, 0.702217f,  -0.692614f,
+  0.207812f,  0.513560f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f };
+
+static const NN_CONFIG av1_tx_split_nnconfig_64x64 = {
+  12,  // num_inputs
+  1,   // num_outputs
+  1,   // num_hidden_layers
+  {
+      32,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_64x64_layer0,
+      av1_tx_split_nn_weights_64x64_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_64x64_layer0,
+      av1_tx_split_nn_bias_64x64_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx split model for 4x16 block.
+static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = {
+  -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f,
+  -2.128968f, -0.655518f, 0.432180f,  0.879752f,  -0.222211f, 0.061615f,
+  -0.230969f, 0.569496f,  1.424188f,  0.598063f,  -0.436005f, -0.737606f,
+  -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f,
+  -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f,  -0.331752f,
+  -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f,
+  -0.636060f, 0.183271f,  -0.610212f, 0.345895f,  -1.100906f, -1.605713f,
+  0.111888f,  -0.140937f, 0.063013f,  -0.013315f, -0.273472f, -0.255870f,
+  1.200328f,  0.274002f,  1.005776f,  0.322392f,  1.222373f,  0.158227f,
+  0.408810f,  0.145022f,  0.139842f,  -1.249412f, 0.286672f,  -0.635699f,
+  0.312562f,  -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f,
+  -0.132199f, -0.863055f, 0.217579f,  -1.161425f, -0.302087f, -1.357271f,
+  -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f,
+  -2.057684f, -0.228755f, 0.606278f,  0.101198f,  -0.314847f, -1.303255f,
+  -0.294964f, 1.301923f,  0.041712f,  0.077593f,  -1.152746f, 0.495315f,
+  -0.751566f, 0.230249f,  -0.840661f, 0.100731f,  1.346269f,  0.649898f,
+  -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f,
+  -0.354072f, 0.068292f,  -0.234168f, 0.277503f,  0.179134f,  0.907420f,
+  0.354626f,  -0.627210f, 0.905779f,  0.512612f,  0.161190f,  -0.843177f,
+  0.014953f,  -0.354983f, 0.011116f,  -0.429598f, -1.017138f, -0.211432f,
+  0.941840f,  -0.281747f, 0.957776f,  -0.541914f, 1.041880f,  -0.433580f,
+  -1.416451f, -0.166467f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer0[16] = {
+  3.086118f,  -3.235095f, 4.830956f,  -0.165706f, 0.955031f,  4.055783f,
+  -0.311489f, 4.660205f,  -0.576277f, -0.248111f, -0.790519f, -1.686412f,
+  -1.191704f, -3.800073f, 4.121552f,  -1.399397f,
+};
+
+static const float av1_tx_split_nn_weights_4x16_layer1[16] = {
+  -0.758677f, 0.388776f,  0.439906f,  0.011390f, -0.084319f, -0.667969f,
+  -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f,  -0.549682f,
+  0.462109f,  0.343315f,  1.092593f,  0.483152f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer1[1] = {
+  0.8205083f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_4x16 = {
+  8,  // num_inputs
+  1,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_4x16_layer0,
+      av1_tx_split_nn_weights_4x16_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_4x16_layer0,
+      av1_tx_split_nn_bias_4x16_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx split model for 16x32 block.
+static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = {
+  0.180713f,  0.033211f,  0.607561f,  0.138642f,  0.637204f,  -0.000940f,
+  0.012630f,  0.358109f,  0.022238f,  0.190418f,  0.079088f,  0.065925f,
+  0.038242f,  0.162380f,  -0.122728f, 0.379382f,  -0.303283f, -0.327550f,
+  0.029120f,  -0.284553f, 0.269588f,  -0.309805f, -0.241036f, -0.161103f,
+  -0.304887f, 0.239843f,  -0.149146f, 0.311234f,  -0.073640f, -0.132718f,
+  0.178901f,  0.474712f,  0.020280f,  0.063685f,  -0.609170f, -0.013658f,
+  -0.338074f, 0.250429f,  0.082978f,  -0.186315f, -0.788959f, 0.039859f,
+  -0.426461f, -0.001524f, -0.447211f, 0.378102f,  0.315617f,  0.017428f,
+  0.745494f,  -0.219024f, 0.512836f,  0.200522f,  0.680449f,  0.313686f,
+  -0.412569f, -0.132927f, 0.631120f,  0.042735f,  0.336153f,  0.044772f,
+  0.432606f,  0.175681f,  -0.634411f, -0.073509f, -0.040643f, -0.559260f,
+  -0.104034f, -0.570495f, -0.247365f, 0.063256f,  -0.582021f, -0.492585f,
+  -0.194955f, -0.207934f, -0.506627f, 0.021743f,  -0.416518f, 0.320876f,
+  0.115889f,  0.149399f,  -0.229376f, 0.095505f,  0.115191f,  -0.471921f,
+  0.113068f,  0.343684f,  -0.036831f, 0.021240f,  0.295112f,  0.031166f,
+  0.448201f,  -0.132241f, 0.164032f,  0.355572f,  0.072154f,  0.017335f,
+  -0.046113f, 0.178719f,  -0.026881f, -0.242590f, 0.055073f,  -0.012958f,
+  0.077904f,  0.351356f,  0.107655f,  0.260568f,  -0.080052f, -0.197553f,
+  0.085763f,  0.263416f,  -0.327741f, 0.158855f,  0.056899f,  -0.162121f,
+  0.339518f,  -0.571204f, 0.264966f,  -0.252214f, -0.202560f, -0.134213f,
+  -0.330188f, 0.009470f,  -0.468376f, -0.065240f, -0.307957f, 0.116479f,
+  -0.222238f, -0.458716f, 0.186493f,  -0.391415f, 0.118649f,  -0.104653f,
+  -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f,
+  -0.598358f, 0.164947f,  -0.119694f, -0.058520f, 0.203829f,  -0.267404f,
+  -0.048202f, -0.600006f, 0.181594f,  -0.731805f, 0.146417f,  -0.687148f,
+  -1.210525f, -0.450101f, -0.620635f, 0.208825f,  -0.611357f, 0.112202f,
+  -0.309468f, -0.323545f, 0.357770f,  0.308061f,  0.553199f,  0.049012f,
+  0.530093f,  -0.208597f, 0.607882f,  -0.058120f, -0.527634f, 0.018136f,
+  0.060753f,  0.118894f,  0.175649f,  0.014731f,  0.428318f,  -0.106465f,
+  -0.119077f, 0.080179f,  0.524997f,  0.368286f,  0.528286f,  0.213659f,
+  0.639286f,  0.195079f,  -0.049815f, -0.092008f, -0.302958f, 0.298149f,
+  -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f,  0.325622f,
+  -0.115293f, 0.155188f,  0.047225f,  0.231050f,  -0.167447f, 0.349754f,
+  0.295544f,  -0.319466f, 0.095144f,  0.174612f,  -0.194652f, 0.305915f,
+  -0.239008f, -0.037453f, 0.280696f,  0.125850f,  0.749196f,  -0.101919f,
+  0.791808f,  -0.236811f, 0.064157f,  0.032865f,  -0.225911f, 0.350384f,
+  0.723183f,  -0.103992f, 0.483085f,  -0.123992f, 0.602138f,  0.023895f,
+  -0.692601f, -0.118387f, 0.162527f,  0.145178f,  -0.184702f, -0.017753f,
+  -0.159436f, 0.124105f,  -0.131067f, 0.310275f,  0.151499f,  0.138924f,
+  0.537459f,  0.263212f,  0.615896f,  0.281255f,  0.021293f,  -0.473459f,
+  0.210145f,  -0.056682f, 0.063658f,  0.377254f,  -0.314410f, -0.183487f,
+  0.300384f,  0.328471f,  0.164694f,  -0.159272f, -0.160942f, -0.502861f,
+  -0.129147f, 0.045916f,  -0.606865f, -0.101378f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer0[32] = {
+  0.051664f,  -0.212487f, -0.077596f, -0.818467f, 0.638475f,  -0.759937f,
+  0.157198f,  0.989640f,  1.586035f,  0.431144f,  0.041605f,  0.543085f,
+  0.498379f,  0.320504f,  0.134233f,  0.670979f,  -0.105562f, -1.574879f,
+  1.261812f,  -0.287530f, -1.610592f, 0.730899f,  -0.894240f, -0.657790f,
+  0.270806f,  -0.181708f, 0.298578f,  0.817240f,  -0.221508f, -0.201771f,
+  -0.294389f, 1.456413f,
+};
+
+static const float av1_tx_split_nn_weights_16x32_layer1[32] = {
+  1.208914f,  0.324728f,  0.383352f,  -0.874321f, 0.172565f,  -0.580927f,
+  -0.432927f, 0.433698f,  -0.801935f, 0.672028f,  0.563493f,  0.260077f,
+  -0.200557f, -0.121638f, 0.530735f,  -0.525196f, 0.281799f,  0.624204f,
+  -0.662775f, -0.230887f, 0.980989f,  0.223437f,  -0.790591f, 0.600724f,
+  -0.273445f, 0.427635f,  -0.501641f, -0.878390f, 0.234731f,  -0.172550f,
+  0.418904f,  1.792187f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer1[1] = {
+  -0.29233751f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x32 = {
+  8,  // num_inputs
+  1,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      32,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_16x32_layer0,
+      av1_tx_split_nn_weights_16x32_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_16x32_layer0,
+      av1_tx_split_nn_bias_16x32_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx split model for 32x64 block.
+static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = {
+  0.031614f,  -0.110926f, 0.052418f,  -0.702506f, 0.045708f,  0.238329f,
+  -0.021806f, -0.208128f, 0.509745f,  -0.293891f, 0.277788f,  0.113937f,
+  0.741576f,  0.062848f,  0.351878f,  0.212532f,  0.385842f,  0.081517f,
+  0.398502f,  -0.015156f, 0.242616f,  0.214619f,  -0.182678f, -0.170546f,
+  0.110605f,  -0.236749f, -0.023831f, -0.285243f, 0.147156f,  -0.257639f,
+  0.341355f,  -0.571641f, -0.721797f, 0.139588f,  -0.518494f, -0.206526f,
+  -0.570560f, -0.184295f, 0.110271f,  0.210292f,  -0.109132f, -0.001080f,
+  0.129251f,  -0.204230f, -0.396312f, -0.183024f, 0.421243f,  -0.013154f,
+  0.222627f,  0.169826f,  0.226037f,  0.218153f,  -0.343528f, 0.274906f,
+  -0.156632f, 0.250261f,  -0.484020f, 0.019909f,  -0.349575f, -0.286643f,
+  -0.507396f, 0.202446f,  -0.154110f, -0.292644f, 0.122666f,  0.306963f,
+  0.424895f,  0.005579f,  0.494094f,  -0.079551f, 0.473740f,  0.352414f,
+  -0.356917f, 0.264331f,  -0.554487f, 0.119978f,  0.012291f,  -0.141641f,
+  -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f,  -0.118501f,
+  0.305151f,  -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f,
+  -0.177066f, -0.055114f, 0.229698f,  -0.199523f, 0.054278f,  0.365020f,
+  -0.060586f, -0.300618f, 0.157563f,  -0.064338f, -0.005711f, -0.176991f,
+  -0.424502f, -0.111914f, 0.092608f,  0.126621f,  0.078547f,  0.148008f,
+  0.024221f,  0.124599f,  0.001343f,  0.059402f,  0.453753f,  0.047102f,
+  0.242544f,  0.055735f,  -0.067451f, -0.170061f, -0.170469f, -0.232173f,
+  0.214908f,  0.248889f,  0.544348f,  -0.084566f, 0.402478f,  0.298031f,
+  0.099038f,  -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f,
+  -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f,
+  0.100219f,  0.293934f,  0.099271f,  -0.036320f, 0.356626f,  -0.261445f,
+  0.879544f,  0.000878f,  0.532920f,  -0.093918f, 0.508867f,  -0.040215f,
+  -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f,  0.352989f,
+  -0.058831f, -0.164588f, 0.039890f,  0.122861f,  0.222508f,  0.061217f,
+  0.466487f,  0.022666f,  0.423777f,  -0.002200f, -0.656835f, -0.099760f,
+  -0.520606f, 0.303204f,  -0.563620f, -0.160922f, -0.243203f, 0.313354f,
+  -0.336516f, -0.206764f, -0.236040f, 0.325899f,  -0.418748f, 0.163205f,
+  -0.476242f, -0.121928f, 0.139178f,  -0.157193f, -0.531766f, -0.180202f,
+  -0.485254f, 0.187703f,  -0.440072f, 0.137854f,  0.029139f,  0.109530f,
+  -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f,
+  -0.304542f, 0.005123f,  0.413995f,  0.314639f,  0.342648f,  -0.293264f,
+  0.358135f,  -0.180425f, -0.369530f, -0.048413f, 0.498366f,  0.121875f,
+  0.270948f,  -0.187966f, 0.342503f,  0.174420f,  -0.352105f, 0.088080f,
+  0.008277f,  0.020275f,  -0.002381f, 0.504389f,  -0.018832f, -0.366047f,
+  -0.090947f, -0.168150f, 0.016184f,  -0.328914f, 0.089579f,  -0.017349f,
+  0.005844f,  -0.005010f, -1.857514f, -0.282426f, 0.010177f,  -0.214727f,
+  -0.182529f, 0.156943f,  -0.162032f, -0.472654f, 0.069432f,  0.016901f,
+  -0.767905f, 0.137129f,  -0.411463f, 0.049056f,  -0.431657f, -0.037641f,
+  0.785500f,  0.046225f,  0.195831f,  0.245204f,  0.368614f,  0.212261f,
+  0.440626f,  -0.158048f, -0.461031f, -0.146280f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer0[32] = {
+  0.490777f,  -1.894238f, 0.621333f,  -0.076756f, 0.286298f, 0.286375f,
+  -0.126431f, -0.350034f, -1.017572f, 0.620125f,  0.408128f, 0.238756f,
+  -0.060728f, 0.210912f,  0.043124f,  0.445649f,  0.907025f, 0.360272f,
+  1.083101f,  -0.068952f, 1.062348f,  0.396354f,  0.280075f, 0.501732f,
+  0.328422f,  0.066241f,  0.474697f,  0.126313f,  0.741206f, 0.314796f,
+  0.552712f,  0.299410f,
+};
+
+static const float av1_tx_split_nn_weights_32x64_layer1[32] = {
+  1.033823f,  0.603439f,  0.304591f,  -0.279940f, -0.780909f, -0.132801f,
+  0.154059f,  0.662014f,  -0.718368f, 0.198733f,  0.039766f,  -0.208516f,
+  -0.104909f, -0.394209f, 0.081617f,  0.365041f,  -0.874960f, -0.063315f,
+  -1.189897f, 0.337225f,  0.410893f,  0.307519f,  0.221323f,  0.233895f,
+  0.469536f,  0.438557f,  0.280144f,  0.422423f,  -1.394513f, 0.781900f,
+  0.352981f,  0.111265f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer1[1] = {
+  -0.18160765f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_32x64 = {
+  8,  // num_inputs
+  1,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      32,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_32x64_layer0,
+      av1_tx_split_nn_weights_32x64_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_32x64_layer0,
+      av1_tx_split_nn_bias_32x64_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx split model for 8x32 block.
+static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = {
+  -0.687846f, 0.121404f,  -0.372905f, 0.126770f,  -0.103298f, -0.101650f,
+  -0.148490f, -0.271740f, 0.682915f,  -0.079765f, 0.634347f,  -0.151503f,
+  0.287692f,  -0.079072f, -0.236948f, 0.065064f,  0.713383f,  0.397123f,
+  0.553621f,  0.368529f,  0.767663f,  -0.046601f, -0.392402f, -0.294822f,
+  -0.292325f, -0.010573f, -0.837945f, 0.050113f,  -0.811360f, 0.199162f,
+  0.150832f,  0.011602f,  0.369694f,  -0.225876f, 0.234113f,  -0.269808f,
+  0.303805f,  -0.190281f, -0.451136f, 0.209755f,  -0.308894f, 0.326956f,
+  0.313591f,  0.089923f,  -0.095754f, 0.390981f,  0.467366f,  0.169670f,
+  0.853322f,  0.054055f,  0.830319f,  -0.121918f, 0.262019f,  -0.093526f,
+  0.385558f,  0.419174f,  0.040198f,  -0.347030f, -0.450492f, -0.106764f,
+  0.487502f,  -0.204188f, 0.430374f,  -0.116388f, 0.236407f,  -0.157376f,
+  0.732294f,  -0.651387f, 0.347446f,  0.342575f,  0.048406f,  0.187657f,
+  0.434899f,  -0.447782f, 0.032728f,  -0.071168f, -0.255327f, 0.104174f,
+  0.095689f,  -0.431743f, 0.725694f,  0.031797f,  0.523171f,  0.061801f,
+  0.469804f,  -0.071068f, -0.059024f, -0.211937f, 0.392134f,  -0.321490f,
+  0.366060f,  -0.427798f, 0.166771f,  0.299652f,  0.044660f,  0.205142f,
+  0.039133f,  -0.051835f, -0.465475f, 0.216976f,  -0.341156f, 0.095358f,
+  0.230807f,  0.201674f,  0.279266f,  -0.713534f, -0.091690f, -0.569708f,
+  -0.119001f, 0.252160f,  -1.544578f, -0.284477f, 0.555348f,  0.226471f,
+  0.347690f,  0.034365f,  0.770835f,  -0.241859f, -0.130241f, 0.292936f,
+  0.396622f,  -0.417916f, 0.492224f,  0.125517f,  0.344824f,  0.232172f,
+  -0.432106f, -0.278745f, 0.035069f,  -0.307247f, -0.120760f, 0.170950f,
+  0.433601f,  0.044286f,  0.141463f,  -0.041382f, 0.529346f,  0.010868f,
+  -0.323674f, 0.185205f,  0.623459f,  0.232842f,  -0.406693f, -0.142944f,
+  0.222988f,  0.343634f,  0.065401f,  0.002621f,  0.805335f,  -0.426926f,
+  0.279181f,  0.131364f,  0.192339f,  -0.402391f, 0.544120f,  -0.060618f,
+  0.467780f,  0.165224f,  -0.373131f, 0.002427f,  0.688064f,  0.322317f,
+  0.259713f,  0.130583f,  0.185032f,  -0.189111f, -0.067821f, 0.010875f,
+  0.644724f,  -0.179291f, 0.463222f,  0.155230f,  0.721384f,  -0.046019f,
+  0.438501f,  0.440027f,  -0.462090f, -0.002039f, -0.468026f, -0.008890f,
+  -0.328530f, 0.370102f,  0.482531f,  0.043471f,  -0.469732f, -0.532663f,
+  0.122081f,  -0.379659f, 0.037219f,  -0.519913f, -0.128975f, -0.404365f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer0[24] = {
+  -1.198965f, 0.395204f,  -0.408627f, -0.021654f, -0.658355f, 0.154525f,
+  -0.288354f, 1.207574f,  0.411608f,  0.964678f,  -1.176893f, 1.059006f,
+  -0.472969f, 2.087975f,  1.065536f,  0.595569f,  0.197907f,  -0.349938f,
+  1.013651f,  -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f,
+};
+
+static const float av1_tx_split_nn_weights_8x32_layer1[24] = {
+  0.815787f,  -0.393465f, -0.483427f, -0.565592f, 0.493494f,  0.430229f,
+  -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f,  0.649146f,
+  -0.487383f, 1.844503f,  0.480324f,  -0.982705f, -0.501446f, -0.220584f,
+  0.334299f,  0.802238f,  0.805838f,  -0.487848f, 0.300772f,  -1.232857f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer1[1] = {
+  0.13435879f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x32 = {
+  8,  // num_inputs
+  1,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      24,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_8x32_layer0,
+      av1_tx_split_nn_weights_8x32_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_8x32_layer0,
+      av1_tx_split_nn_bias_8x32_layer1,
+  },
+};
+/******************************************************************************/
+
+// Tx split model for 16x32 block.
+static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = {
+  -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f,
+  -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f,
+  -0.454709f, -0.059461f, 0.210313f,  -0.155683f, 0.192968f,  -0.127804f,
+  0.471996f,  0.253377f,  0.472625f,  0.485322f,  0.150560f,  0.164868f,
+  -0.475587f, 0.447559f,  -0.455759f, -0.306665f, -0.194866f, -0.283716f,
+  -0.243897f, 0.293020f,  -0.308298f, -0.191904f, -0.468568f, 0.014053f,
+  -0.618848f, 0.096273f,  -0.444586f, 0.347750f,  -0.280643f, -0.062872f,
+  0.118661f,  0.540099f,  0.104141f,  -0.279300f, -0.098721f, -0.173427f,
+  -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f,
+  -0.523103f, 0.093620f,  -0.930396f, -0.431997f, -1.163297f, 0.190384f,
+  -0.422581f, -0.005354f, 0.450552f,  0.369210f,  0.562484f,  0.679922f,
+  0.282099f,  -0.039075f, 0.404196f,  0.006371f,  0.069679f,  -0.196160f,
+  -0.213675f, 0.275187f,  -0.104235f, -0.193090f, 0.003116f,  -0.252454f,
+  -0.094591f, 0.210439f,  -0.137070f, 0.145043f,  0.024558f,  0.121718f,
+  0.010138f,  0.301651f,  -0.377990f, 0.444414f,  0.001845f,  -0.095334f,
+  0.550259f,  0.087603f,  0.792492f,  -0.044584f, 0.641706f,  -0.328458f,
+  -0.447791f, 0.135376f,  0.356385f,  0.135748f,  0.310370f,  0.293757f,
+  -0.062000f, -0.056368f, 0.343930f,  0.312039f,  0.370763f,  0.452381f,
+  -0.023630f, -0.185909f, 0.422277f,  -0.006306f, 0.045166f,  0.423359f,
+  -0.157735f, -0.084901f, 0.219527f,  -0.209510f, 0.575057f,  0.249276f,
+  0.069267f,  0.233898f,  -0.229392f, 0.117197f,  -0.038551f, 0.293976f,
+  0.101996f,  0.120878f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer0[16] = {
+  1.036995f,  0.160249f,  0.100264f,  0.694881f,  0.694677f,  0.128379f,
+  -0.843405f, -0.405515f, 0.104139f,  0.182980f,  -0.025472f, 0.901067f,
+  -0.299866f, -0.103079f, -0.190352f, -0.048121f,
+};
+
+static const float av1_tx_split_nn_weights_16x64_layer1[16] = {
+  -1.778868f, 0.174690f,  0.211991f, 0.712138f,  0.589352f,  0.466652f,
+  1.029146f,  -0.490044f, 0.483015f, 0.600215f,  -0.577776f, -0.755546f,
+  0.348337f,  -0.205082f, 0.347129f, -0.322277f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer1[1] = {
+  0.04230947f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x64 = {
+  8,  // num_inputs
+  1,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_16x64_layer0,
+      av1_tx_split_nn_weights_16x64_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_16x64_layer0,
+      av1_tx_split_nn_bias_16x64_layer1,
+  },
+};
+/******************************************************************************/
+
+// Map block size to its corresponding neural net model for tx split prediction.
+static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = {
+  NULL,                          // TX_4X4,
+  &av1_tx_split_nnconfig_8x8,    // TX_8X8,
+  &av1_tx_split_nnconfig_16x16,  // TX_16X16,
+  &av1_tx_split_nnconfig_32x32,  // TX_32X32,
+  &av1_tx_split_nnconfig_64x64,  // TX_64X64,
+  &av1_tx_split_nnconfig_4x8,    // TX_4X8,
+  &av1_tx_split_nnconfig_4x8,    // TX_8X4,
+  &av1_tx_split_nnconfig_8x16,   // TX_8X16,
+  &av1_tx_split_nnconfig_8x16,   // TX_16X8,
+  &av1_tx_split_nnconfig_16x32,  // TX_16X32,
+  &av1_tx_split_nnconfig_16x32,  // TX_32X16,
+  &av1_tx_split_nnconfig_32x64,  // TX_32X64,
+  &av1_tx_split_nnconfig_32x64,  // TX_64X32,
+  &av1_tx_split_nnconfig_4x16,   // TX_4X16,
+  &av1_tx_split_nnconfig_4x16,   // TX_16X4,
+  &av1_tx_split_nnconfig_8x32,   // TX_8X32,
+  &av1_tx_split_nnconfig_8x32,   // TX_32X8,
+  &av1_tx_split_nnconfig_16x64,  // TX_16X64,
+  &av1_tx_split_nnconfig_16x64,  // TX_64X16,
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
new file mode 100644
index 000000000..84065d6de
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -0,0 +1,1205 @@
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+                           int8_t cos_bit) {
+  __m128i buf0[32];
+  __m128i buf1[32];
+  const int32_t *cospi;
+  // stage 0
+  // stage 1
+  buf1[0] = _mm_add_epi32(input[0], input[31]);
+  buf1[31] = _mm_sub_epi32(input[0], input[31]);
+  buf1[1] = _mm_add_epi32(input[1], input[30]);
+  buf1[30] = _mm_sub_epi32(input[1], input[30]);
+  buf1[2] = _mm_add_epi32(input[2], input[29]);
+  buf1[29] = _mm_sub_epi32(input[2], input[29]);
+  buf1[3] = _mm_add_epi32(input[3], input[28]);
+  buf1[28] = _mm_sub_epi32(input[3], input[28]);
+  buf1[4] = _mm_add_epi32(input[4], input[27]);
+  buf1[27] = _mm_sub_epi32(input[4], input[27]);
+  buf1[5] = _mm_add_epi32(input[5], input[26]);
+  buf1[26] = _mm_sub_epi32(input[5], input[26]);
+  buf1[6] = _mm_add_epi32(input[6], input[25]);
+  buf1[25] = _mm_sub_epi32(input[6], input[25]);
+  buf1[7] = _mm_add_epi32(input[7], input[24]);
+  buf1[24] = _mm_sub_epi32(input[7], input[24]);
+  buf1[8] = _mm_add_epi32(input[8], input[23]);
+  buf1[23] = _mm_sub_epi32(input[8], input[23]);
+  buf1[9] = _mm_add_epi32(input[9], input[22]);
+  buf1[22] = _mm_sub_epi32(input[9], input[22]);
+  buf1[10] = _mm_add_epi32(input[10], input[21]);
+  buf1[21] = _mm_sub_epi32(input[10], input[21]);
+  buf1[11] = _mm_add_epi32(input[11], input[20]);
+  buf1[20] = _mm_sub_epi32(input[11], input[20]);
+  buf1[12] = _mm_add_epi32(input[12], input[19]);
+  buf1[19] = _mm_sub_epi32(input[12], input[19]);
+  buf1[13] = _mm_add_epi32(input[13], input[18]);
+  buf1[18] = _mm_sub_epi32(input[13], input[18]);
+  buf1[14] = _mm_add_epi32(input[14], input[17]);
+  buf1[17] = _mm_sub_epi32(input[14], input[17]);
+  buf1[15] = _mm_add_epi32(input[15], input[16]);
+  buf1[16] = _mm_sub_epi32(input[15], input[16]);
+
+  // stage 2
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+  buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+  buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+  buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+  buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+  buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+  buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+  buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+  buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+  buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+  buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+  buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+  buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+  buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+  buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+  buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+  buf0[18] = buf1[18];
+  buf0[19] = buf1[19];
+  btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+                      buf0[27], cos_bit);
+  btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+                      buf0[26], cos_bit);
+  btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+                      buf0[25], cos_bit);
+  btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+                      buf0[24], cos_bit);
+  buf0[28] = buf1[28];
+  buf0[29] = buf1[29];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 3
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+  buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+  buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+  buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+  buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+  buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+  buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+  buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+  buf1[8] = buf0[8];
+  buf1[9] = buf0[9];
+  btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+                      buf1[13], cos_bit);
+  btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+                      buf1[12], cos_bit);
+  buf1[14] = buf0[14];
+  buf1[15] = buf0[15];
+  buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+  buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+  buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+  buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+  buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+  buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+  buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+  buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+  buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
+  buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+  buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+  buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+  buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+  buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+  buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+  buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+  // stage 4
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+  buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+  buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+  buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+  buf0[4] = buf1[4];
+  btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+                      cos_bit);
+  buf0[7] = buf1[7];
+  buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+  buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+  buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+  buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+  buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+  buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+  buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+  buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+  buf0[16] = buf1[16];
+  buf0[17] = buf1[17];
+  btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+                      buf0[29], cos_bit);
+  btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+                      buf0[28], cos_bit);
+  btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+                      buf0[27], cos_bit);
+  btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+                      buf0[26], cos_bit);
+  buf0[22] = buf1[22];
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[25] = buf1[25];
+  buf0[30] = buf1[30];
+  buf0[31] = buf1[31];
+
+  // stage 5
+  cospi = cospi_arr(cos_bit);
+  btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+                      cos_bit);
+  btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
+                      cos_bit);
+  buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+  buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+  buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+  buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+  buf1[8] = buf0[8];
+  btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+                      buf1[14], cos_bit);
+  btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+                      buf1[13], cos_bit);
+  buf1[11] = buf0[11];
+  buf1[12] = buf0[12];
+  buf1[15] = buf0[15];
+  buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+  buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+  buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+  buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+  buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+  buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+  buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+  buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+  buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+  buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+  buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+  buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+  buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+  buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+  buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+  buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+  // stage 6
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+  btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+                      cos_bit);
+  btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
+                      cos_bit);
+  buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+  buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+  buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+  buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+  buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+  buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+  buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+  buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+  buf0[16] = buf1[16];
+  btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+                      buf0[30], cos_bit);
+  btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+                      buf0[29], cos_bit);
+  buf0[19] = buf1[19];
+  buf0[20] = buf1[20];
+  btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+                      buf0[26], cos_bit);
+  btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+                      buf0[25], cos_bit);
+  buf0[23] = buf1[23];
+  buf0[24] = buf1[24];
+  buf0[27] = buf1[27];
+  buf0[28] = buf1[28];
+  buf0[31] = buf1[31];
+
+  // stage 7
+  cospi = cospi_arr(cos_bit);
+  buf1[0] = buf0[0];
+  buf1[1] = buf0[1];
+  buf1[2] = buf0[2];
+  buf1[3] = buf0[3];
+  buf1[4] = buf0[4];
+  buf1[5] = buf0[5];
+  buf1[6] = buf0[6];
+  buf1[7] = buf0[7];
+  btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
+                      cos_bit);
+  btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+                      buf1[14], cos_bit);
+  btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+                      buf1[13], cos_bit);
+  btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+                      buf1[12], cos_bit);
+  buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+  buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+  buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+  buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+  buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+  buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+  buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+  buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+  buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+  buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+  buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+  buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+  buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+  buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+  buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+  buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+  // stage 8
+  cospi = cospi_arr(cos_bit);
+  buf0[0] = buf1[0];
+  buf0[1] = buf1[1];
+  buf0[2] = buf1[2];
+  buf0[3] = buf1[3];
+  buf0[4] = buf1[4];
+  buf0[5] = buf1[5];
+  buf0[6] = buf1[6];
+  buf0[7] = buf1[7];
+  buf0[8] = buf1[8];
+  buf0[9] = buf1[9];
+  buf0[10] = buf1[10];
+  buf0[11] = buf1[11];
+  buf0[12] = buf1[12];
+  buf0[13] = buf1[13];
+  buf0[14] = buf1[14];
+  buf0[15] = buf1[15];
+  btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+                      buf0[31], cos_bit);
+  btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+                      buf0[30], cos_bit);
+  btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+                      buf0[29], cos_bit);
+  btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+                      buf0[28], cos_bit);
+  btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+                      buf0[27], cos_bit);
+  btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+                      buf0[26], cos_bit);
+  btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+                      buf0[25], cos_bit);
+  btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+                      buf0[24], cos_bit);
+
+  // stage 9
+  output[0] = buf0[0];
+  output[1] = buf0[16];
+  output[2] = buf0[8];
+  output[3] = buf0[24];
+  output[4] = buf0[4];
+  output[5] = buf0[20];
+  output[6] = buf0[12];
+  output[7] = buf0[28];
+  output[8] = buf0[2];
+  output[9] = buf0[18];
+  output[10] = buf0[10];
+  output[11] = buf0[26];
+  output[12] = buf0[6];
+  output[13] = buf0[22];
+  output[14] = buf0[14];
+  output[15] = buf0[30];
+  output[16] = buf0[1];
+  output[17] = buf0[17];
+  output[18] = buf0[9];
+  output[19] = buf0[25];
+  output[20] = buf0[5];
+  output[21] = buf0[21];
+  output[22] = buf0[13];
+  output[23] = buf0[29];
+  output[24] = buf0[3];
+  output[25] = buf0[19];
+  output[26] = buf0[11];
+  output[27] = buf0[27];
+  output[28] = buf0[7];
+  output[29] = buf0[23];
+  output[30] = buf0[15];
+  output[31] = buf0[31];
+}
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range) {
+  const int txfm_size = 4;
+  const int num_per_128 = 4;
+  const int32_t *cospi;
+  __m128i buf0[4];
+  __m128i buf1[4];
+  int col_num = txfm_size / num_per_128;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    int j;
+    for (j = 0; j < 4; ++j) {
+      buf0[j] = input[j * col_num + col];
+    }
+
+    // stage 1
+    stage_idx++;
+    buf1[0] = buf0[3];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[1];
+    buf1[3] = buf0[2];
+
+    // stage 2
+    stage_idx++;
+
+    cospi = cospi_arr(cos_bit);
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+                        cos_bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+                        buf0[3], cos_bit);
+
+    // stage 3
+    stage_idx++;
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+
+    // stage 4
+    stage_idx++;
+
+    cospi = cospi_arr(cos_bit);
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], cos_bit);
+
+    // stage 5
+    stage_idx++;
+    buf1[0] = buf0[0];
+    buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]);
+    buf1[2] = buf0[3];
+    buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]);
+
+    for (j = 0; j < 4; ++j) {
+      output[j * col_num + col] = buf1[j];
+    }
+  }
+}
+
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+                           int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]);
+  __m128i cospi_p32 = _mm_set1_epi32(cospi[32]);
+  __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]);
+  __m128i cospi_p48 = _mm_set1_epi32(cospi[48]);
+  __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]);
+  __m128i cospi_p16 = _mm_set1_epi32(cospi[16]);
+  __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]);
+  __m128i cospi_p56 = _mm_set1_epi32(cospi[56]);
+  __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]);
+  __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]);
+  __m128i cospi_p24 = _mm_set1_epi32(cospi[24]);
+  __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]);
+  __m128i cospi_p08 = _mm_set1_epi32(cospi[8]);
+  __m128i cospi_p40 = _mm_set1_epi32(cospi[40]);
+  __m128i cospi_p60 = _mm_set1_epi32(cospi[60]);
+  __m128i cospi_p04 = _mm_set1_epi32(cospi[4]);
+  __m128i cospi_p28 = _mm_set1_epi32(cospi[28]);
+  __m128i cospi_p36 = _mm_set1_epi32(cospi[36]);
+  __m128i cospi_p44 = _mm_set1_epi32(cospi[44]);
+  __m128i cospi_p20 = _mm_set1_epi32(cospi[20]);
+  __m128i cospi_p12 = _mm_set1_epi32(cospi[12]);
+  __m128i cospi_p52 = _mm_set1_epi32(cospi[52]);
+  __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]);
+  __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]);
+  __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]);
+  __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]);
+  __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]);
+  __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]);
+  __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]);
+  __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]);
+  __m128i cospi_p62 = _mm_set1_epi32(cospi[62]);
+  __m128i cospi_p02 = _mm_set1_epi32(cospi[2]);
+  __m128i cospi_p30 = _mm_set1_epi32(cospi[30]);
+  __m128i cospi_p34 = _mm_set1_epi32(cospi[34]);
+  __m128i cospi_p46 = _mm_set1_epi32(cospi[46]);
+  __m128i cospi_p18 = _mm_set1_epi32(cospi[18]);
+  __m128i cospi_p14 = _mm_set1_epi32(cospi[14]);
+  __m128i cospi_p50 = _mm_set1_epi32(cospi[50]);
+  __m128i cospi_p54 = _mm_set1_epi32(cospi[54]);
+  __m128i cospi_p10 = _mm_set1_epi32(cospi[10]);
+  __m128i cospi_p22 = _mm_set1_epi32(cospi[22]);
+  __m128i cospi_p42 = _mm_set1_epi32(cospi[42]);
+  __m128i cospi_p38 = _mm_set1_epi32(cospi[38]);
+  __m128i cospi_p26 = _mm_set1_epi32(cospi[26]);
+  __m128i cospi_p06 = _mm_set1_epi32(cospi[6]);
+  __m128i cospi_p58 = _mm_set1_epi32(cospi[58]);
+  __m128i cospi_p63 = _mm_set1_epi32(cospi[63]);
+  __m128i cospi_p01 = _mm_set1_epi32(cospi[1]);
+  __m128i cospi_p31 = _mm_set1_epi32(cospi[31]);
+  __m128i cospi_p33 = _mm_set1_epi32(cospi[33]);
+  __m128i cospi_p47 = _mm_set1_epi32(cospi[47]);
+  __m128i cospi_p17 = _mm_set1_epi32(cospi[17]);
+  __m128i cospi_p15 = _mm_set1_epi32(cospi[15]);
+  __m128i cospi_p49 = _mm_set1_epi32(cospi[49]);
+  __m128i cospi_p55 = _mm_set1_epi32(cospi[55]);
+  __m128i cospi_p09 = _mm_set1_epi32(cospi[9]);
+  __m128i cospi_p23 = _mm_set1_epi32(cospi[23]);
+  __m128i cospi_p41 = _mm_set1_epi32(cospi[41]);
+  __m128i cospi_p39 = _mm_set1_epi32(cospi[39]);
+  __m128i cospi_p25 = _mm_set1_epi32(cospi[25]);
+  __m128i cospi_p07 = _mm_set1_epi32(cospi[7]);
+  __m128i cospi_p57 = _mm_set1_epi32(cospi[57]);
+  __m128i cospi_p59 = _mm_set1_epi32(cospi[59]);
+  __m128i cospi_p05 = _mm_set1_epi32(cospi[5]);
+  __m128i cospi_p27 = _mm_set1_epi32(cospi[27]);
+  __m128i cospi_p37 = _mm_set1_epi32(cospi[37]);
+  __m128i cospi_p43 = _mm_set1_epi32(cospi[43]);
+  __m128i cospi_p21 = _mm_set1_epi32(cospi[21]);
+  __m128i cospi_p11 = _mm_set1_epi32(cospi[11]);
+  __m128i cospi_p53 = _mm_set1_epi32(cospi[53]);
+  __m128i cospi_p51 = _mm_set1_epi32(cospi[51]);
+  __m128i cospi_p13 = _mm_set1_epi32(cospi[13]);
+  __m128i cospi_p19 = _mm_set1_epi32(cospi[19]);
+  __m128i cospi_p45 = _mm_set1_epi32(cospi[45]);
+  __m128i cospi_p35 = _mm_set1_epi32(cospi[35]);
+  __m128i cospi_p29 = _mm_set1_epi32(cospi[29]);
+  __m128i cospi_p03 = _mm_set1_epi32(cospi[3]);
+  __m128i cospi_p61 = _mm_set1_epi32(cospi[61]);
+
+  // stage 1
+  __m128i x1[64];
+  x1[0] = _mm_add_epi32(input[0], input[63]);
+  x1[63] = _mm_sub_epi32(input[0], input[63]);
+  x1[1] = _mm_add_epi32(input[1], input[62]);
+  x1[62] = _mm_sub_epi32(input[1], input[62]);
+  x1[2] = _mm_add_epi32(input[2], input[61]);
+  x1[61] = _mm_sub_epi32(input[2], input[61]);
+  x1[3] = _mm_add_epi32(input[3], input[60]);
+  x1[60] = _mm_sub_epi32(input[3], input[60]);
+  x1[4] = _mm_add_epi32(input[4], input[59]);
+  x1[59] = _mm_sub_epi32(input[4], input[59]);
+  x1[5] = _mm_add_epi32(input[5], input[58]);
+  x1[58] = _mm_sub_epi32(input[5], input[58]);
+  x1[6] = _mm_add_epi32(input[6], input[57]);
+  x1[57] = _mm_sub_epi32(input[6], input[57]);
+  x1[7] = _mm_add_epi32(input[7], input[56]);
+  x1[56] = _mm_sub_epi32(input[7], input[56]);
+  x1[8] = _mm_add_epi32(input[8], input[55]);
+  x1[55] = _mm_sub_epi32(input[8], input[55]);
+  x1[9] = _mm_add_epi32(input[9], input[54]);
+  x1[54] = _mm_sub_epi32(input[9], input[54]);
+  x1[10] = _mm_add_epi32(input[10], input[53]);
+  x1[53] = _mm_sub_epi32(input[10], input[53]);
+  x1[11] = _mm_add_epi32(input[11], input[52]);
+  x1[52] = _mm_sub_epi32(input[11], input[52]);
+  x1[12] = _mm_add_epi32(input[12], input[51]);
+  x1[51] = _mm_sub_epi32(input[12], input[51]);
+  x1[13] = _mm_add_epi32(input[13], input[50]);
+  x1[50] = _mm_sub_epi32(input[13], input[50]);
+  x1[14] = _mm_add_epi32(input[14], input[49]);
+  x1[49] = _mm_sub_epi32(input[14], input[49]);
+  x1[15] = _mm_add_epi32(input[15], input[48]);
+  x1[48] = _mm_sub_epi32(input[15], input[48]);
+  x1[16] = _mm_add_epi32(input[16], input[47]);
+  x1[47] = _mm_sub_epi32(input[16], input[47]);
+  x1[17] = _mm_add_epi32(input[17], input[46]);
+  x1[46] = _mm_sub_epi32(input[17], input[46]);
+  x1[18] = _mm_add_epi32(input[18], input[45]);
+  x1[45] = _mm_sub_epi32(input[18], input[45]);
+  x1[19] = _mm_add_epi32(input[19], input[44]);
+  x1[44] = _mm_sub_epi32(input[19], input[44]);
+  x1[20] = _mm_add_epi32(input[20], input[43]);
+  x1[43] = _mm_sub_epi32(input[20], input[43]);
+  x1[21] = _mm_add_epi32(input[21], input[42]);
+  x1[42] = _mm_sub_epi32(input[21], input[42]);
+  x1[22] = _mm_add_epi32(input[22], input[41]);
+  x1[41] = _mm_sub_epi32(input[22], input[41]);
+  x1[23] = _mm_add_epi32(input[23], input[40]);
+  x1[40] = _mm_sub_epi32(input[23], input[40]);
+  x1[24] = _mm_add_epi32(input[24], input[39]);
+  x1[39] = _mm_sub_epi32(input[24], input[39]);
+  x1[25] = _mm_add_epi32(input[25], input[38]);
+  x1[38] = _mm_sub_epi32(input[25], input[38]);
+  x1[26] = _mm_add_epi32(input[26], input[37]);
+  x1[37] = _mm_sub_epi32(input[26], input[37]);
+  x1[27] = _mm_add_epi32(input[27], input[36]);
+  x1[36] = _mm_sub_epi32(input[27], input[36]);
+  x1[28] = _mm_add_epi32(input[28], input[35]);
+  x1[35] = _mm_sub_epi32(input[28], input[35]);
+  x1[29] = _mm_add_epi32(input[29], input[34]);
+  x1[34] = _mm_sub_epi32(input[29], input[34]);
+  x1[30] = _mm_add_epi32(input[30], input[33]);
+  x1[33] = _mm_sub_epi32(input[30], input[33]);
+  x1[31] = _mm_add_epi32(input[31], input[32]);
+  x1[32] = _mm_sub_epi32(input[31], input[32]);
+
+  // stage 2
+  __m128i x2[64];
+  x2[0] = _mm_add_epi32(x1[0], x1[31]);
+  x2[31] = _mm_sub_epi32(x1[0], x1[31]);
+  x2[1] = _mm_add_epi32(x1[1], x1[30]);
+  x2[30] = _mm_sub_epi32(x1[1], x1[30]);
+  x2[2] = _mm_add_epi32(x1[2], x1[29]);
+  x2[29] = _mm_sub_epi32(x1[2], x1[29]);
+  x2[3] = _mm_add_epi32(x1[3], x1[28]);
+  x2[28] = _mm_sub_epi32(x1[3], x1[28]);
+  x2[4] = _mm_add_epi32(x1[4], x1[27]);
+  x2[27] = _mm_sub_epi32(x1[4], x1[27]);
+  x2[5] = _mm_add_epi32(x1[5], x1[26]);
+  x2[26] = _mm_sub_epi32(x1[5], x1[26]);
+  x2[6] = _mm_add_epi32(x1[6], x1[25]);
+  x2[25] = _mm_sub_epi32(x1[6], x1[25]);
+  x2[7] = _mm_add_epi32(x1[7], x1[24]);
+  x2[24] = _mm_sub_epi32(x1[7], x1[24]);
+  x2[8] = _mm_add_epi32(x1[8], x1[23]);
+  x2[23] = _mm_sub_epi32(x1[8], x1[23]);
+  x2[9] = _mm_add_epi32(x1[9], x1[22]);
+  x2[22] = _mm_sub_epi32(x1[9], x1[22]);
+  x2[10] = _mm_add_epi32(x1[10], x1[21]);
+  x2[21] = _mm_sub_epi32(x1[10], x1[21]);
+  x2[11] = _mm_add_epi32(x1[11], x1[20]);
+  x2[20] = _mm_sub_epi32(x1[11], x1[20]);
+  x2[12] = _mm_add_epi32(x1[12], x1[19]);
+  x2[19] = _mm_sub_epi32(x1[12], x1[19]);
+  x2[13] = _mm_add_epi32(x1[13], x1[18]);
+  x2[18] = _mm_sub_epi32(x1[13], x1[18]);
+  x2[14] = _mm_add_epi32(x1[14], x1[17]);
+  x2[17] = _mm_sub_epi32(x1[14], x1[17]);
+  x2[15] = _mm_add_epi32(x1[15], x1[16]);
+  x2[16] = _mm_sub_epi32(x1[15], x1[16]);
+  x2[32] = x1[32];
+  x2[33] = x1[33];
+  x2[34] = x1[34];
+  x2[35] = x1[35];
+  x2[36] = x1[36];
+  x2[37] = x1[37];
+  x2[38] = x1[38];
+  x2[39] = x1[39];
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48],
+                          __rounding, cos_bit);
+  x2[56] = x1[56];
+  x2[57] = x1[57];
+  x2[58] = x1[58];
+  x2[59] = x1[59];
+  x2[60] = x1[60];
+  x2[61] = x1[61];
+  x2[62] = x1[62];
+  x2[63] = x1[63];
+
+  // stage 3
+  __m128i x3[64];
+  x3[0] = _mm_add_epi32(x2[0], x2[15]);
+  x3[15] = _mm_sub_epi32(x2[0], x2[15]);
+  x3[1] = _mm_add_epi32(x2[1], x2[14]);
+  x3[14] = _mm_sub_epi32(x2[1], x2[14]);
+  x3[2] = _mm_add_epi32(x2[2], x2[13]);
+  x3[13] = _mm_sub_epi32(x2[2], x2[13]);
+  x3[3] = _mm_add_epi32(x2[3], x2[12]);
+  x3[12] = _mm_sub_epi32(x2[3], x2[12]);
+  x3[4] = _mm_add_epi32(x2[4], x2[11]);
+  x3[11] = _mm_sub_epi32(x2[4], x2[11]);
+  x3[5] = _mm_add_epi32(x2[5], x2[10]);
+  x3[10] = _mm_sub_epi32(x2[5], x2[10]);
+  x3[6] = _mm_add_epi32(x2[6], x2[9]);
+  x3[9] = _mm_sub_epi32(x2[6], x2[9]);
+  x3[7] = _mm_add_epi32(x2[7], x2[8]);
+  x3[8] = _mm_sub_epi32(x2[7], x2[8]);
+  x3[16] = x2[16];
+  x3[17] = x2[17];
+  x3[18] = x2[18];
+  x3[19] = x2[19];
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24],
+                          __rounding, cos_bit);
+  x3[28] = x2[28];
+  x3[29] = x2[29];
+  x3[30] = x2[30];
+  x3[31] = x2[31];
+  x3[32] = _mm_add_epi32(x2[32], x2[47]);
+  x3[47] = _mm_sub_epi32(x2[32], x2[47]);
+  x3[33] = _mm_add_epi32(x2[33], x2[46]);
+  x3[46] = _mm_sub_epi32(x2[33], x2[46]);
+  x3[34] = _mm_add_epi32(x2[34], x2[45]);
+  x3[45] = _mm_sub_epi32(x2[34], x2[45]);
+  x3[35] = _mm_add_epi32(x2[35], x2[44]);
+  x3[44] = _mm_sub_epi32(x2[35], x2[44]);
+  x3[36] = _mm_add_epi32(x2[36], x2[43]);
+  x3[43] = _mm_sub_epi32(x2[36], x2[43]);
+  x3[37] = _mm_add_epi32(x2[37], x2[42]);
+  x3[42] = _mm_sub_epi32(x2[37], x2[42]);
+  x3[38] = _mm_add_epi32(x2[38], x2[41]);
+  x3[41] = _mm_sub_epi32(x2[38], x2[41]);
+  x3[39] = _mm_add_epi32(x2[39], x2[40]);
+  x3[40] = _mm_sub_epi32(x2[39], x2[40]);
+  x3[48] = _mm_sub_epi32(x2[63], x2[48]);
+  x3[63] = _mm_add_epi32(x2[63], x2[48]);
+  x3[49] = _mm_sub_epi32(x2[62], x2[49]);
+  x3[62] = _mm_add_epi32(x2[62], x2[49]);
+  x3[50] = _mm_sub_epi32(x2[61], x2[50]);
+  x3[61] = _mm_add_epi32(x2[61], x2[50]);
+  x3[51] = _mm_sub_epi32(x2[60], x2[51]);
+  x3[60] = _mm_add_epi32(x2[60], x2[51]);
+  x3[52] = _mm_sub_epi32(x2[59], x2[52]);
+  x3[59] = _mm_add_epi32(x2[59], x2[52]);
+  x3[53] = _mm_sub_epi32(x2[58], x2[53]);
+  x3[58] = _mm_add_epi32(x2[58], x2[53]);
+  x3[54] = _mm_sub_epi32(x2[57], x2[54]);
+  x3[57] = _mm_add_epi32(x2[57], x2[54]);
+  x3[55] = _mm_sub_epi32(x2[56], x2[55]);
+  x3[56] = _mm_add_epi32(x2[56], x2[55]);
+
+  // stage 4
+  __m128i x4[64];
+  x4[0] = _mm_add_epi32(x3[0], x3[7]);
+  x4[7] = _mm_sub_epi32(x3[0], x3[7]);
+  x4[1] = _mm_add_epi32(x3[1], x3[6]);
+  x4[6] = _mm_sub_epi32(x3[1], x3[6]);
+  x4[2] = _mm_add_epi32(x3[2], x3[5]);
+  x4[5] = _mm_sub_epi32(x3[2], x3[5]);
+  x4[3] = _mm_add_epi32(x3[3], x3[4]);
+  x4[4] = _mm_sub_epi32(x3[3], x3[4]);
+  x4[8] = x3[8];
+  x4[9] = x3[9];
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12],
+                          __rounding, cos_bit);
+  x4[14] = x3[14];
+  x4[15] = x3[15];
+  x4[16] = _mm_add_epi32(x3[16], x3[23]);
+  x4[23] = _mm_sub_epi32(x3[16], x3[23]);
+  x4[17] = _mm_add_epi32(x3[17], x3[22]);
+  x4[22] = _mm_sub_epi32(x3[17], x3[22]);
+  x4[18] = _mm_add_epi32(x3[18], x3[21]);
+  x4[21] = _mm_sub_epi32(x3[18], x3[21]);
+  x4[19] = _mm_add_epi32(x3[19], x3[20]);
+  x4[20] = _mm_sub_epi32(x3[19], x3[20]);
+  x4[24] = _mm_sub_epi32(x3[31], x3[24]);
+  x4[31] = _mm_add_epi32(x3[31], x3[24]);
+  x4[25] = _mm_sub_epi32(x3[30], x3[25]);
+  x4[30] = _mm_add_epi32(x3[30], x3[25]);
+  x4[26] = _mm_sub_epi32(x3[29], x3[26]);
+  x4[29] = _mm_add_epi32(x3[29], x3[26]);
+  x4[27] = _mm_sub_epi32(x3[28], x3[27]);
+  x4[28] = _mm_add_epi32(x3[28], x3[27]);
+  x4[32] = x3[32];
+  x4[33] = x3[33];
+  x4[34] = x3[34];
+  x4[35] = x3[35];
+  btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52],
+                          __rounding, cos_bit);
+  x4[44] = x3[44];
+  x4[45] = x3[45];
+  x4[46] = x3[46];
+  x4[47] = x3[47];
+  x4[48] = x3[48];
+  x4[49] = x3[49];
+  x4[50] = x3[50];
+  x4[51] = x3[51];
+  x4[60] = x3[60];
+  x4[61] = x3[61];
+  x4[62] = x3[62];
+  x4[63] = x3[63];
+
+  // stage 5
+  __m128i x5[64];
+  x5[0] = _mm_add_epi32(x4[0], x4[3]);
+  x5[3] = _mm_sub_epi32(x4[0], x4[3]);
+  x5[1] = _mm_add_epi32(x4[1], x4[2]);
+  x5[2] = _mm_sub_epi32(x4[1], x4[2]);
+  x5[4] = x4[4];
+  btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6],
+                          __rounding, cos_bit);
+  x5[7] = x4[7];
+  x5[8] = _mm_add_epi32(x4[8], x4[11]);
+  x5[11] = _mm_sub_epi32(x4[8], x4[11]);
+  x5[9] = _mm_add_epi32(x4[9], x4[10]);
+  x5[10] = _mm_sub_epi32(x4[9], x4[10]);
+  x5[12] = _mm_sub_epi32(x4[15], x4[12]);
+  x5[15] = _mm_add_epi32(x4[15], x4[12]);
+  x5[13] = _mm_sub_epi32(x4[14], x4[13]);
+  x5[14] = _mm_add_epi32(x4[14], x4[13]);
+  x5[16] = x4[16];
+  x5[17] = x4[17];
+  btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26],
+                          __rounding, cos_bit);
+  x5[22] = x4[22];
+  x5[23] = x4[23];
+  x5[24] = x4[24];
+  x5[25] = x4[25];
+  x5[30] = x4[30];
+  x5[31] = x4[31];
+  x5[32] = _mm_add_epi32(x4[32], x4[39]);
+  x5[39] = _mm_sub_epi32(x4[32], x4[39]);
+  x5[33] = _mm_add_epi32(x4[33], x4[38]);
+  x5[38] = _mm_sub_epi32(x4[33], x4[38]);
+  x5[34] = _mm_add_epi32(x4[34], x4[37]);
+  x5[37] = _mm_sub_epi32(x4[34], x4[37]);
+  x5[35] = _mm_add_epi32(x4[35], x4[36]);
+  x5[36] = _mm_sub_epi32(x4[35], x4[36]);
+  x5[40] = _mm_sub_epi32(x4[47], x4[40]);
+  x5[47] = _mm_add_epi32(x4[47], x4[40]);
+  x5[41] = _mm_sub_epi32(x4[46], x4[41]);
+  x5[46] = _mm_add_epi32(x4[46], x4[41]);
+  x5[42] = _mm_sub_epi32(x4[45], x4[42]);
+  x5[45] = _mm_add_epi32(x4[45], x4[42]);
+  x5[43] = _mm_sub_epi32(x4[44], x4[43]);
+  x5[44] = _mm_add_epi32(x4[44], x4[43]);
+  x5[48] = _mm_add_epi32(x4[48], x4[55]);
+  x5[55] = _mm_sub_epi32(x4[48], x4[55]);
+  x5[49] = _mm_add_epi32(x4[49], x4[54]);
+  x5[54] = _mm_sub_epi32(x4[49], x4[54]);
+  x5[50] = _mm_add_epi32(x4[50], x4[53]);
+  x5[53] = _mm_sub_epi32(x4[50], x4[53]);
+  x5[51] = _mm_add_epi32(x4[51], x4[52]);
+  x5[52] = _mm_sub_epi32(x4[51], x4[52]);
+  x5[56] = _mm_sub_epi32(x4[63], x4[56]);
+  x5[63] = _mm_add_epi32(x4[63], x4[56]);
+  x5[57] = _mm_sub_epi32(x4[62], x4[57]);
+  x5[62] = _mm_add_epi32(x4[62], x4[57]);
+  x5[58] = _mm_sub_epi32(x4[61], x4[58]);
+  x5[61] = _mm_add_epi32(x4[61], x4[58]);
+  x5[59] = _mm_sub_epi32(x4[60], x4[59]);
+  x5[60] = _mm_add_epi32(x4[60], x4[59]);
+
+  // stage 6
+  __m128i x6[64];
+  btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3],
+                          __rounding, cos_bit);
+  x6[4] = _mm_add_epi32(x5[4], x5[5]);
+  x6[5] = _mm_sub_epi32(x5[4], x5[5]);
+  x6[6] = _mm_sub_epi32(x5[7], x5[6]);
+  x6[7] = _mm_add_epi32(x5[7], x5[6]);
+  x6[8] = x5[8];
+  btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13],
+                          __rounding, cos_bit);
+  x6[11] = x5[11];
+  x6[12] = x5[12];
+  x6[15] = x5[15];
+  x6[16] = _mm_add_epi32(x5[16], x5[19]);
+  x6[19] = _mm_sub_epi32(x5[16], x5[19]);
+  x6[17] = _mm_add_epi32(x5[17], x5[18]);
+  x6[18] = _mm_sub_epi32(x5[17], x5[18]);
+  x6[20] = _mm_sub_epi32(x5[23], x5[20]);
+  x6[23] = _mm_add_epi32(x5[23], x5[20]);
+  x6[21] = _mm_sub_epi32(x5[22], x5[21]);
+  x6[22] = _mm_add_epi32(x5[22], x5[21]);
+  x6[24] = _mm_add_epi32(x5[24], x5[27]);
+  x6[27] = _mm_sub_epi32(x5[24], x5[27]);
+  x6[25] = _mm_add_epi32(x5[25], x5[26]);
+  x6[26] = _mm_sub_epi32(x5[25], x5[26]);
+  x6[28] = _mm_sub_epi32(x5[31], x5[28]);
+  x6[31] = _mm_add_epi32(x5[31], x5[28]);
+  x6[29] = _mm_sub_epi32(x5[30], x5[29]);
+  x6[30] = _mm_add_epi32(x5[30], x5[29]);
+  x6[32] = x5[32];
+  x6[33] = x5[33];
+  btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58],
+                          __rounding, cos_bit);
+  x6[38] = x5[38];
+  x6[39] = x5[39];
+  x6[40] = x5[40];
+  x6[41] = x5[41];
+  btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50],
+                          __rounding, cos_bit);
+  x6[46] = x5[46];
+  x6[47] = x5[47];
+  x6[48] = x5[48];
+  x6[49] = x5[49];
+  x6[54] = x5[54];
+  x6[55] = x5[55];
+  x6[56] = x5[56];
+  x6[57] = x5[57];
+  x6[62] = x5[62];
+  x6[63] = x5[63];
+
+  // stage 7
+  __m128i x7[64];
+  x7[0] = x6[0];
+  x7[1] = x6[1];
+  x7[2] = x6[2];
+  x7[3] = x6[3];
+  btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6],
+                          __rounding, cos_bit);
+  x7[8] = _mm_add_epi32(x6[8], x6[9]);
+  x7[9] = _mm_sub_epi32(x6[8], x6[9]);
+  x7[10] = _mm_sub_epi32(x6[11], x6[10]);
+  x7[11] = _mm_add_epi32(x6[11], x6[10]);
+  x7[12] = _mm_add_epi32(x6[12], x6[13]);
+  x7[13] = _mm_sub_epi32(x6[12], x6[13]);
+  x7[14] = _mm_sub_epi32(x6[15], x6[14]);
+  x7[15] = _mm_add_epi32(x6[15], x6[14]);
+  x7[16] = x6[16];
+  btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29],
+                          __rounding, cos_bit);
+  x7[19] = x6[19];
+  x7[20] = x6[20];
+  btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25],
+                          __rounding, cos_bit);
+  x7[23] = x6[23];
+  x7[24] = x6[24];
+  x7[27] = x6[27];
+  x7[28] = x6[28];
+  x7[31] = x6[31];
+  x7[32] = _mm_add_epi32(x6[32], x6[35]);
+  x7[35] = _mm_sub_epi32(x6[32], x6[35]);
+  x7[33] = _mm_add_epi32(x6[33], x6[34]);
+  x7[34] = _mm_sub_epi32(x6[33], x6[34]);
+  x7[36] = _mm_sub_epi32(x6[39], x6[36]);
+  x7[39] = _mm_add_epi32(x6[39], x6[36]);
+  x7[37] = _mm_sub_epi32(x6[38], x6[37]);
+  x7[38] = _mm_add_epi32(x6[38], x6[37]);
+  x7[40] = _mm_add_epi32(x6[40], x6[43]);
+  x7[43] = _mm_sub_epi32(x6[40], x6[43]);
+  x7[41] = _mm_add_epi32(x6[41], x6[42]);
+  x7[42] = _mm_sub_epi32(x6[41], x6[42]);
+  x7[44] = _mm_sub_epi32(x6[47], x6[44]);
+  x7[47] = _mm_add_epi32(x6[47], x6[44]);
+  x7[45] = _mm_sub_epi32(x6[46], x6[45]);
+  x7[46] = _mm_add_epi32(x6[46], x6[45]);
+  x7[48] = _mm_add_epi32(x6[48], x6[51]);
+  x7[51] = _mm_sub_epi32(x6[48], x6[51]);
+  x7[49] = _mm_add_epi32(x6[49], x6[50]);
+  x7[50] = _mm_sub_epi32(x6[49], x6[50]);
+  x7[52] = _mm_sub_epi32(x6[55], x6[52]);
+  x7[55] = _mm_add_epi32(x6[55], x6[52]);
+  x7[53] = _mm_sub_epi32(x6[54], x6[53]);
+  x7[54] = _mm_add_epi32(x6[54], x6[53]);
+  x7[56] = _mm_add_epi32(x6[56], x6[59]);
+  x7[59] = _mm_sub_epi32(x6[56], x6[59]);
+  x7[57] = _mm_add_epi32(x6[57], x6[58]);
+  x7[58] = _mm_sub_epi32(x6[57], x6[58]);
+  x7[60] = _mm_sub_epi32(x6[63], x6[60]);
+  x7[63] = _mm_add_epi32(x6[63], x6[60]);
+  x7[61] = _mm_sub_epi32(x6[62], x6[61]);
+  x7[62] = _mm_add_epi32(x6[62], x6[61]);
+
+  // stage 8
+  __m128i x8[64];
+  x8[0] = x7[0];
+  x8[1] = x7[1];
+  x8[2] = x7[2];
+  x8[3] = x7[3];
+  x8[4] = x7[4];
+  x8[5] = x7[5];
+  x8[6] = x7[6];
+  x8[7] = x7[7];
+  btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12],
+                          __rounding, cos_bit);
+  x8[16] = _mm_add_epi32(x7[16], x7[17]);
+  x8[17] = _mm_sub_epi32(x7[16], x7[17]);
+  x8[18] = _mm_sub_epi32(x7[19], x7[18]);
+  x8[19] = _mm_add_epi32(x7[19], x7[18]);
+  x8[20] = _mm_add_epi32(x7[20], x7[21]);
+  x8[21] = _mm_sub_epi32(x7[20], x7[21]);
+  x8[22] = _mm_sub_epi32(x7[23], x7[22]);
+  x8[23] = _mm_add_epi32(x7[23], x7[22]);
+  x8[24] = _mm_add_epi32(x7[24], x7[25]);
+  x8[25] = _mm_sub_epi32(x7[24], x7[25]);
+  x8[26] = _mm_sub_epi32(x7[27], x7[26]);
+  x8[27] = _mm_add_epi32(x7[27], x7[26]);
+  x8[28] = _mm_add_epi32(x7[28], x7[29]);
+  x8[29] = _mm_sub_epi32(x7[28], x7[29]);
+  x8[30] = _mm_sub_epi32(x7[31], x7[30]);
+  x8[31] = _mm_add_epi32(x7[31], x7[30]);
+  x8[32] = x7[32];
+  btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+                          __rounding, cos_bit);
+  x8[35] = x7[35];
+  x8[36] = x7[36];
+  btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+                          __rounding, cos_bit);
+  x8[39] = x7[39];
+  x8[40] = x7[40];
+  btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+                          __rounding, cos_bit);
+  x8[43] = x7[43];
+  x8[44] = x7[44];
+  btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+                          __rounding, cos_bit);
+  btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+                          __rounding, cos_bit);
+  x8[47] = x7[47];
+  x8[48] = x7[48];
+  x8[51] = x7[51];
+  x8[52] = x7[52];
+  x8[55] = x7[55];
+  x8[56] = x7[56];
+  x8[59] = x7[59];
+  x8[60] = x7[60];
+  x8[63] = x7[63];
+
+  // stage 9
+  __m128i x9[64];
+  x9[0] = x8[0];
+  x9[1] = x8[1];
+  x9[2] = x8[2];
+  x9[3] = x8[3];
+  x9[4] = x8[4];
+  x9[5] = x8[5];
+  x9[6] = x8[6];
+  x9[7] = x8[7];
+  x9[8] = x8[8];
+  x9[9] = x8[9];
+  x9[10] = x8[10];
+  x9[11] = x8[11];
+  x9[12] = x8[12];
+  x9[13] = x8[13];
+  x9[14] = x8[14];
+  x9[15] = x8[15];
+  btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25],
+                          __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24],
+                          __rounding, cos_bit);
+  x9[32] = _mm_add_epi32(x8[32], x8[33]);
+  x9[33] = _mm_sub_epi32(x8[32], x8[33]);
+  x9[34] = _mm_sub_epi32(x8[35], x8[34]);
+  x9[35] = _mm_add_epi32(x8[35], x8[34]);
+  x9[36] = _mm_add_epi32(x8[36], x8[37]);
+  x9[37] = _mm_sub_epi32(x8[36], x8[37]);
+  x9[38] = _mm_sub_epi32(x8[39], x8[38]);
+  x9[39] = _mm_add_epi32(x8[39], x8[38]);
+  x9[40] = _mm_add_epi32(x8[40], x8[41]);
+  x9[41] = _mm_sub_epi32(x8[40], x8[41]);
+  x9[42] = _mm_sub_epi32(x8[43], x8[42]);
+  x9[43] = _mm_add_epi32(x8[43], x8[42]);
+  x9[44] = _mm_add_epi32(x8[44], x8[45]);
+  x9[45] = _mm_sub_epi32(x8[44], x8[45]);
+  x9[46] = _mm_sub_epi32(x8[47], x8[46]);
+  x9[47] = _mm_add_epi32(x8[47], x8[46]);
+  x9[48] = _mm_add_epi32(x8[48], x8[49]);
+  x9[49] = _mm_sub_epi32(x8[48], x8[49]);
+  x9[50] = _mm_sub_epi32(x8[51], x8[50]);
+  x9[51] = _mm_add_epi32(x8[51], x8[50]);
+  x9[52] = _mm_add_epi32(x8[52], x8[53]);
+  x9[53] = _mm_sub_epi32(x8[52], x8[53]);
+  x9[54] = _mm_sub_epi32(x8[55], x8[54]);
+  x9[55] = _mm_add_epi32(x8[55], x8[54]);
+  x9[56] = _mm_add_epi32(x8[56], x8[57]);
+  x9[57] = _mm_sub_epi32(x8[56], x8[57]);
+  x9[58] = _mm_sub_epi32(x8[59], x8[58]);
+  x9[59] = _mm_add_epi32(x8[59], x8[58]);
+  x9[60] = _mm_add_epi32(x8[60], x8[61]);
+  x9[61] = _mm_sub_epi32(x8[60], x8[61]);
+  x9[62] = _mm_sub_epi32(x8[63], x8[62]);
+  x9[63] = _mm_add_epi32(x8[63], x8[62]);
+
+  // stage 10
+  __m128i x10[64];
+  x10[0] = x9[0];
+  x10[1] = x9[1];
+  x10[2] = x9[2];
+  x10[3] = x9[3];
+  x10[4] = x9[4];
+  x10[5] = x9[5];
+  x10[6] = x9[6];
+  x10[7] = x9[7];
+  x10[8] = x9[8];
+  x10[9] = x9[9];
+  x10[10] = x9[10];
+  x10[11] = x9[11];
+  x10[12] = x9[12];
+  x10[13] = x9[13];
+  x10[14] = x9[14];
+  x10[15] = x9[15];
+  x10[16] = x9[16];
+  x10[17] = x9[17];
+  x10[18] = x9[18];
+  x10[19] = x9[19];
+  x10[20] = x9[20];
+  x10[21] = x9[21];
+  x10[22] = x9[22];
+  x10[23] = x9[23];
+  x10[24] = x9[24];
+  x10[25] = x9[25];
+  x10[26] = x9[26];
+  x10[27] = x9[27];
+  x10[28] = x9[28];
+  x10[29] = x9[29];
+  x10[30] = x9[30];
+  x10[31] = x9[31];
+  btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32],
+                          x10[63], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33],
+                          x10[62], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34],
+                          x10[61], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35],
+                          x10[60], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36],
+                          x10[59], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37],
+                          x10[58], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38],
+                          x10[57], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39],
+                          x10[56], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40],
+                          x10[55], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41],
+                          x10[54], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42],
+                          x10[53], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43],
+                          x10[52], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44],
+                          x10[51], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45],
+                          x10[50], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46],
+                          x10[49], __rounding, cos_bit);
+  btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47],
+                          x10[48], __rounding, cos_bit);
+
+  // stage 11
+  output[0] = x10[0];
+  output[1] = x10[32];
+  output[2] = x10[16];
+  output[3] = x10[48];
+  output[4] = x10[8];
+  output[5] = x10[40];
+  output[6] = x10[24];
+  output[7] = x10[56];
+  output[8] = x10[4];
+  output[9] = x10[36];
+  output[10] = x10[20];
+  output[11] = x10[52];
+  output[12] = x10[12];
+  output[13] = x10[44];
+  output[14] = x10[28];
+  output[15] = x10[60];
+  output[16] = x10[2];
+  output[17] = x10[34];
+  output[18] = x10[18];
+  output[19] = x10[50];
+  output[20] = x10[10];
+  output[21] = x10[42];
+  output[22] = x10[26];
+  output[23] = x10[58];
+  output[24] = x10[6];
+  output[25] = x10[38];
+  output[26] = x10[22];
+  output[27] = x10[54];
+  output[28] = x10[14];
+  output[29] = x10[46];
+  output[30] = x10[30];
+  output[31] = x10[62];
+  output[32] = x10[1];
+  output[33] = x10[33];
+  output[34] = x10[17];
+  output[35] = x10[49];
+  output[36] = x10[9];
+  output[37] = x10[41];
+  output[38] = x10[25];
+  output[39] = x10[57];
+  output[40] = x10[5];
+  output[41] = x10[37];
+  output[42] = x10[21];
+  output[43] = x10[53];
+  output[44] = x10[13];
+  output[45] = x10[45];
+  output[46] = x10[29];
+  output[47] = x10[61];
+  output[48] = x10[3];
+  output[49] = x10[35];
+  output[50] = x10[19];
+  output[51] = x10[51];
+  output[52] = x10[11];
+  output[53] = x10[43];
+  output[54] = x10[27];
+  output[55] = x10[59];
+  output[56] = x10[7];
+  output[57] = x10[39];
+  output[58] = x10[23];
+  output[59] = x10[55];
+  output[60] = x10[15];
+  output[61] = x10[47];
+  output[62] = x10[31];
+  output[63] = x10[63];
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
new file mode 100644
index 000000000..abb95f31e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+    const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+  int r, c;
+  for (r = 0; r < txfm1d_size; r++) {
+    for (c = 0; c < txfm1d_size; c++) {
+      output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
+    }
+  }
+}
+
+typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
+                             const int8_t cos_bit, const int8_t *stage_range);
+
+static void fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+                              const int8_t cos_bit, const int8_t *stage_range) {
+  const int txfm_size = 32;
+  const int num_per_128 = 4;
+  __m128i buf0[32];
+  __m128i buf1[32];
+  int col_num = txfm_size / num_per_128;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    int j;
+    for (j = 0; j < 32; ++j) {
+      buf0[j] = input[j * col_num + col];
+    }
+    av1_fdct32_new_sse4_1(buf0, buf1, cos_bit);
+    for (j = 0; j < 32; ++j) {
+      output[j * col_num + col] = buf1[j];
+    }
+  }
+}
+
+static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break;
+    default: assert(0);
+  }
+  return NULL;
+}
+
+static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+                                     const int stride,
+                                     const TXFM_2D_FLIP_CFG *cfg,
+                                     int32_t *txfm_buf) {
+  // TODO(sarahparker) This does not currently support rectangular transforms
+  // and will break without splitting txfm_size out into row and col size.
+  // Rectangular transforms use c code only, so it should be ok for now.
+  // It will be corrected when there are sse implementations for rectangular
+  // transforms.
+  assert(cfg->tx_size < TX_SIZES);
+  const int txfm_size = tx_size_wide[cfg->tx_size];
+  const int8_t *shift = cfg->shift;
+  const int8_t *stage_range_col = cfg->stage_range_col;
+  const int8_t *stage_range_row = cfg->stage_range_row;
+  const int8_t cos_bit_col = cfg->cos_bit_col;
+  const int8_t cos_bit_row = cfg->cos_bit_row;
+  const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+  __m128i *buf_128 = (__m128i *)txfm_buf;
+  __m128i *out_128 = (__m128i *)output;
+  int num_per_128 = 4;
+  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+  int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+                                                        txfm_size);
+  av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
+  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+  av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+  transpose_32(txfm_size, out_128, buf_128);
+  txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+  av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+  transpose_32(txfm_size, buf_128, out_128);
+}
+
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+  (void)bd;
+  fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
+static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA,
+                                      const __m128i *inputB, __m128i *output) {
+  __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]);
+  __m128i temp1 = _mm_unpackhi_epi32(inputA[0], inputA[2]);
+  __m128i temp2 = _mm_unpacklo_epi32(inputA[1], inputA[3]);
+  __m128i temp3 = _mm_unpackhi_epi32(inputA[1], inputA[3]);
+
+  output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+  output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+  output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+  output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+
+  temp0 = _mm_unpacklo_epi32(inputB[0], inputB[2]);
+  temp1 = _mm_unpackhi_epi32(inputB[0], inputB[2]);
+  temp2 = _mm_unpacklo_epi32(inputB[1], inputB[3]);
+  temp3 = _mm_unpackhi_epi32(inputB[1], inputB[3]);
+
+  output[4 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+  output[5 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+  output[6 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+  output[7 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+                                          int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X64;
+  __m128i buf0[64], buf1[512];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    __m128i bufA[64];
+    __m128i bufB[64];
+    __m128i *buf = buf1 + width * i;
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+      bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+    }
+    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
+    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
+    av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
+    av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < width_div8; ++j) {
+      __m128i *out = (__m128i *)(output8 + 4 * j);
+      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+    }
+  }
+}
+
+static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
+                                          int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_64X32;
+  __m128i buf0[64], buf1[256];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+  assert(tx_type == DCT_DCT);
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    __m128i bufA[64];
+    __m128i bufB[64];
+    __m128i *buf = buf1 + width * i;
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+      bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+    }
+    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
+    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
+    av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
+    av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < width_div8; ++j) {
+      __m128i *out = (__m128i *)(output8 + 4 * j);
+      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+    }
+  }
+}
+
+static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
+                                          int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_32X64;
+  __m128i buf0[64], buf1[256];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    __m128i bufA[32];
+    __m128i bufB[32];
+    __m128i *buf = buf1 + width * i;
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+      bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+    }
+    av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row);
+    av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row);
+    av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
+    av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < (32 / 4); ++j) {
+      __m128i *out = (__m128i *)(output8 + 4 * j);
+      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+    }
+  }
+}
+
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+  av1_lowbd_fwd_txfm2d_4x4_sse2,    // 4x4 transform
+  av1_lowbd_fwd_txfm2d_8x8_sse2,    // 8x8 transform
+  av1_lowbd_fwd_txfm2d_16x16_sse2,  // 16x16 transform
+  av1_lowbd_fwd_txfm2d_32x32_sse2,  // 32x32 transform
+  lowbd_fwd_txfm2d_64x64_sse4_1,    // 64x64 transform
+  av1_lowbd_fwd_txfm2d_4x8_sse2,    // 4x8 transform
+  av1_lowbd_fwd_txfm2d_8x4_sse2,    // 8x4 transform
+  av1_lowbd_fwd_txfm2d_8x16_sse2,   // 8x16 transform
+  av1_lowbd_fwd_txfm2d_16x8_sse2,   // 16x8 transform
+  av1_lowbd_fwd_txfm2d_16x32_sse2,  // 16x32 transform
+  av1_lowbd_fwd_txfm2d_32x16_sse2,  // 32x16 transform
+  lowbd_fwd_txfm2d_32x64_sse4_1,    // 32x64 transform
+  lowbd_fwd_txfm2d_64x32_sse4_1,    // 64x32 transform
+  av1_lowbd_fwd_txfm2d_4x16_sse2,   // 4x16 transform
+  av1_lowbd_fwd_txfm2d_16x4_sse2,   // 16x4 transform
+  av1_lowbd_fwd_txfm2d_8x32_sse2,   // 8x32 transform
+  av1_lowbd_fwd_txfm2d_32x8_sse2,   // 32x8 transform
+  av1_lowbd_fwd_txfm2d_16x64_sse2,  // 16x64 transform
+  av1_lowbd_fwd_txfm2d_64x16_sse2,  // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff,
+                               int diff_stride, TxfmParam *txfm_param) {
+  FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+  if ((fwd_txfm2d_func == NULL) ||
+      (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
+    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+  } else {
+    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+                    txfm_param->bd);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
new file mode 100644
index 000000000..6aae7ce1e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -0,0 +1,2889 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible).
+
+static void fdct4x4_new_sse2(const __m128i *input, __m128i *output,
+                             int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+  __m128i u[4], v[4];
+
+  u[0] = _mm_unpacklo_epi16(input[0], input[1]);
+  u[1] = _mm_unpacklo_epi16(input[3], input[2]);
+
+  v[0] = _mm_add_epi16(u[0], u[1]);
+  v[1] = _mm_sub_epi16(u[0], u[1]);
+
+  u[0] = _mm_madd_epi16(v[0], cospi_p32_p32);  // 0
+  u[1] = _mm_madd_epi16(v[0], cospi_p32_m32);  // 2
+  u[2] = _mm_madd_epi16(v[1], cospi_p16_p48);  // 1
+  u[3] = _mm_madd_epi16(v[1], cospi_p48_m16);  // 3
+
+  v[0] = _mm_add_epi32(u[0], __rounding);
+  v[1] = _mm_add_epi32(u[1], __rounding);
+  v[2] = _mm_add_epi32(u[2], __rounding);
+  v[3] = _mm_add_epi32(u[3], __rounding);
+  u[0] = _mm_srai_epi32(v[0], cos_bit);
+  u[1] = _mm_srai_epi32(v[1], cos_bit);
+  u[2] = _mm_srai_epi32(v[2], cos_bit);
+  u[3] = _mm_srai_epi32(v[3], cos_bit);
+
+  output[0] = _mm_packs_epi32(u[0], u[1]);
+  output[1] = _mm_packs_epi32(u[2], u[3]);
+  output[2] = _mm_srli_si128(output[0], 8);
+  output[3] = _mm_srli_si128(output[1], 8);
+}
+
+static void fdct8x4_new_sse2(const __m128i *input, __m128i *output,
+                             int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+
+  // stage 1
+  __m128i x1[4];
+  x1[0] = _mm_adds_epi16(input[0], input[3]);
+  x1[3] = _mm_subs_epi16(input[0], input[3]);
+  x1[1] = _mm_adds_epi16(input[1], input[2]);
+  x1[2] = _mm_subs_epi16(input[1], input[2]);
+
+  // stage 2
+  __m128i x2[4];
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]);
+  btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]);
+
+  // stage 3
+  output[0] = x2[0];
+  output[1] = x2[2];
+  output[2] = x2[1];
+  output[3] = x2[3];
+}
+
+static void fdct4x8_new_sse2(const __m128i *input, __m128i *output,
+                             int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+  // stage 1
+  __m128i x1[8];
+  x1[0] = _mm_adds_epi16(input[0], input[7]);
+  x1[7] = _mm_subs_epi16(input[0], input[7]);
+  x1[1] = _mm_adds_epi16(input[1], input[6]);
+  x1[6] = _mm_subs_epi16(input[1], input[6]);
+  x1[2] = _mm_adds_epi16(input[2], input[5]);
+  x1[5] = _mm_subs_epi16(input[2], input[5]);
+  x1[3] = _mm_adds_epi16(input[3], input[4]);
+  x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+  // stage 2
+  __m128i x2[8];
+  x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+  x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+  x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+  x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+  x2[4] = x1[4];
+  btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5],
+                 &x1[6], &x2[5], &x2[6]);
+  x2[7] = x1[7];
+
+  // stage 3
+  __m128i x3[8];
+  btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0],
+                 &x2[1], &x3[0], &x3[1]);
+  btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2],
+                 &x2[3], &x3[2], &x3[3]);
+  x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+  x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+  x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+  x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+  // stage 4
+  __m128i x4[8];
+  x4[0] = x3[0];
+  x4[1] = x3[1];
+  x4[2] = x3[2];
+  x4[3] = x3[3];
+  btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4],
+                 &x3[7], &x4[4], &x4[7]);
+  btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5],
+                 &x3[6], &x4[5], &x4[6]);
+
+  // stage 5
+  output[0] = x4[0];
+  output[1] = x4[4];
+  output[2] = x4[2];
+  output[3] = x4[6];
+  output[4] = x4[1];
+  output[5] = x4[5];
+  output[6] = x4[3];
+  output[7] = x4[7];
+}
+
+static void fdct8x8_new_sse2(const __m128i *input, __m128i *output,
+                             int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+  // stage 1
+  __m128i x1[8];
+  x1[0] = _mm_adds_epi16(input[0], input[7]);
+  x1[7] = _mm_subs_epi16(input[0], input[7]);
+  x1[1] = _mm_adds_epi16(input[1], input[6]);
+  x1[6] = _mm_subs_epi16(input[1], input[6]);
+  x1[2] = _mm_adds_epi16(input[2], input[5]);
+  x1[5] = _mm_subs_epi16(input[2], input[5]);
+  x1[3] = _mm_adds_epi16(input[3], input[4]);
+  x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+  // stage 2
+  __m128i x2[8];
+  x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+  x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+  x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+  x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+  x2[4] = x1[4];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]);
+  x2[7] = x1[7];
+
+  // stage 3
+  __m128i x3[8];
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]);
+  btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]);
+  x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+  x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+  x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+  x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+  // stage 4
+  __m128i x4[8];
+  x4[0] = x3[0];
+  x4[1] = x3[1];
+  x4[2] = x3[2];
+  x4[3] = x3[3];
+  btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], x4[4], x4[7]);
+  btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], x4[5], x4[6]);
+
+  // stage 5
+  output[0] = x4[0];
+  output[1] = x4[4];
+  output[2] = x4[2];
+  output[3] = x4[6];
+  output[4] = x4[1];
+  output[5] = x4[5];
+  output[6] = x4[3];
+  output[7] = x4[7];
+}
+
+static void fdct8x16_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+  __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+  __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+  __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+  __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+  __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+  __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+  __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+  __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+  // stage 1
+  __m128i x1[16];
+  x1[0] = _mm_adds_epi16(input[0], input[15]);
+  x1[15] = _mm_subs_epi16(input[0], input[15]);
+  x1[1] = _mm_adds_epi16(input[1], input[14]);
+  x1[14] = _mm_subs_epi16(input[1], input[14]);
+  x1[2] = _mm_adds_epi16(input[2], input[13]);
+  x1[13] = _mm_subs_epi16(input[2], input[13]);
+  x1[3] = _mm_adds_epi16(input[3], input[12]);
+  x1[12] = _mm_subs_epi16(input[3], input[12]);
+  x1[4] = _mm_adds_epi16(input[4], input[11]);
+  x1[11] = _mm_subs_epi16(input[4], input[11]);
+  x1[5] = _mm_adds_epi16(input[5], input[10]);
+  x1[10] = _mm_subs_epi16(input[5], input[10]);
+  x1[6] = _mm_adds_epi16(input[6], input[9]);
+  x1[9] = _mm_subs_epi16(input[6], input[9]);
+  x1[7] = _mm_adds_epi16(input[7], input[8]);
+  x1[8] = _mm_subs_epi16(input[7], input[8]);
+
+  // stage 2
+  __m128i x2[16];
+  x2[0] = _mm_adds_epi16(x1[0], x1[7]);
+  x2[7] = _mm_subs_epi16(x1[0], x1[7]);
+  x2[1] = _mm_adds_epi16(x1[1], x1[6]);
+  x2[6] = _mm_subs_epi16(x1[1], x1[6]);
+  x2[2] = _mm_adds_epi16(x1[2], x1[5]);
+  x2[5] = _mm_subs_epi16(x1[2], x1[5]);
+  x2[3] = _mm_adds_epi16(x1[3], x1[4]);
+  x2[4] = _mm_subs_epi16(x1[3], x1[4]);
+  x2[8] = x1[8];
+  x2[9] = x1[9];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]);
+  x2[14] = x1[14];
+  x2[15] = x1[15];
+
+  // stage 3
+  __m128i x3[16];
+  x3[0] = _mm_adds_epi16(x2[0], x2[3]);
+  x3[3] = _mm_subs_epi16(x2[0], x2[3]);
+  x3[1] = _mm_adds_epi16(x2[1], x2[2]);
+  x3[2] = _mm_subs_epi16(x2[1], x2[2]);
+  x3[4] = x2[4];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]);
+  x3[7] = x2[7];
+  x3[8] = _mm_adds_epi16(x2[8], x2[11]);
+  x3[11] = _mm_subs_epi16(x2[8], x2[11]);
+  x3[9] = _mm_adds_epi16(x2[9], x2[10]);
+  x3[10] = _mm_subs_epi16(x2[9], x2[10]);
+  x3[12] = _mm_subs_epi16(x2[15], x2[12]);
+  x3[15] = _mm_adds_epi16(x2[15], x2[12]);
+  x3[13] = _mm_subs_epi16(x2[14], x2[13]);
+  x3[14] = _mm_adds_epi16(x2[14], x2[13]);
+
+  // stage 4
+  __m128i x4[16];
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]);
+  btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]);
+  x4[4] = _mm_adds_epi16(x3[4], x3[5]);
+  x4[5] = _mm_subs_epi16(x3[4], x3[5]);
+  x4[6] = _mm_subs_epi16(x3[7], x3[6]);
+  x4[7] = _mm_adds_epi16(x3[7], x3[6]);
+  x4[8] = x3[8];
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]);
+  x4[11] = x3[11];
+  x4[12] = x3[12];
+  x4[15] = x3[15];
+
+  // stage 5
+  __m128i x5[16];
+  x5[0] = x4[0];
+  x5[1] = x4[1];
+  x5[2] = x4[2];
+  x5[3] = x4[3];
+  btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]);
+  btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]);
+  x5[8] = _mm_adds_epi16(x4[8], x4[9]);
+  x5[9] = _mm_subs_epi16(x4[8], x4[9]);
+  x5[10] = _mm_subs_epi16(x4[11], x4[10]);
+  x5[11] = _mm_adds_epi16(x4[11], x4[10]);
+  x5[12] = _mm_adds_epi16(x4[12], x4[13]);
+  x5[13] = _mm_subs_epi16(x4[12], x4[13]);
+  x5[14] = _mm_subs_epi16(x4[15], x4[14]);
+  x5[15] = _mm_adds_epi16(x4[15], x4[14]);
+
+  // stage 6
+  __m128i x6[16];
+  x6[0] = x5[0];
+  x6[1] = x5[1];
+  x6[2] = x5[2];
+  x6[3] = x5[3];
+  x6[4] = x5[4];
+  x6[5] = x5[5];
+  x6[6] = x5[6];
+  x6[7] = x5[7];
+  btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]);
+  btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]);
+  btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]);
+  btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]);
+
+  // stage 7
+  output[0] = x6[0];
+  output[1] = x6[8];
+  output[2] = x6[4];
+  output[3] = x6[12];
+  output[4] = x6[2];
+  output[5] = x6[10];
+  output[6] = x6[6];
+  output[7] = x6[14];
+  output[8] = x6[1];
+  output[9] = x6[9];
+  output[10] = x6[5];
+  output[11] = x6[13];
+  output[12] = x6[3];
+  output[13] = x6[11];
+  output[14] = x6[7];
+  output[15] = x6[15];
+}
+
+void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+  __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+  __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+  __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+  __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+  __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+  __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+  __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+  __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+  __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+  __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+  __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+  __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+  __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+  __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+  __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+  __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+  __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+  __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+  __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+  __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+  __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+  __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+  __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+  __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+  __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+  __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+
+  // stage 1
+  __m128i x1[32];
+  x1[0] = _mm_adds_epi16(input[0], input[31]);
+  x1[31] = _mm_subs_epi16(input[0], input[31]);
+  x1[1] = _mm_adds_epi16(input[1], input[30]);
+  x1[30] = _mm_subs_epi16(input[1], input[30]);
+  x1[2] = _mm_adds_epi16(input[2], input[29]);
+  x1[29] = _mm_subs_epi16(input[2], input[29]);
+  x1[3] = _mm_adds_epi16(input[3], input[28]);
+  x1[28] = _mm_subs_epi16(input[3], input[28]);
+  x1[4] = _mm_adds_epi16(input[4], input[27]);
+  x1[27] = _mm_subs_epi16(input[4], input[27]);
+  x1[5] = _mm_adds_epi16(input[5], input[26]);
+  x1[26] = _mm_subs_epi16(input[5], input[26]);
+  x1[6] = _mm_adds_epi16(input[6], input[25]);
+  x1[25] = _mm_subs_epi16(input[6], input[25]);
+  x1[7] = _mm_adds_epi16(input[7], input[24]);
+  x1[24] = _mm_subs_epi16(input[7], input[24]);
+  x1[8] = _mm_adds_epi16(input[8], input[23]);
+  x1[23] = _mm_subs_epi16(input[8], input[23]);
+  x1[9] = _mm_adds_epi16(input[9], input[22]);
+  x1[22] = _mm_subs_epi16(input[9], input[22]);
+  x1[10] = _mm_adds_epi16(input[10], input[21]);
+  x1[21] = _mm_subs_epi16(input[10], input[21]);
+  x1[11] = _mm_adds_epi16(input[11], input[20]);
+  x1[20] = _mm_subs_epi16(input[11], input[20]);
+  x1[12] = _mm_adds_epi16(input[12], input[19]);
+  x1[19] = _mm_subs_epi16(input[12], input[19]);
+  x1[13] = _mm_adds_epi16(input[13], input[18]);
+  x1[18] = _mm_subs_epi16(input[13], input[18]);
+  x1[14] = _mm_adds_epi16(input[14], input[17]);
+  x1[17] = _mm_subs_epi16(input[14], input[17]);
+  x1[15] = _mm_adds_epi16(input[15], input[16]);
+  x1[16] = _mm_subs_epi16(input[15], input[16]);
+
+  // stage 2
+  __m128i x2[32];
+  x2[0] = _mm_adds_epi16(x1[0], x1[15]);
+  x2[15] = _mm_subs_epi16(x1[0], x1[15]);
+  x2[1] = _mm_adds_epi16(x1[1], x1[14]);
+  x2[14] = _mm_subs_epi16(x1[1], x1[14]);
+  x2[2] = _mm_adds_epi16(x1[2], x1[13]);
+  x2[13] = _mm_subs_epi16(x1[2], x1[13]);
+  x2[3] = _mm_adds_epi16(x1[3], x1[12]);
+  x2[12] = _mm_subs_epi16(x1[3], x1[12]);
+  x2[4] = _mm_adds_epi16(x1[4], x1[11]);
+  x2[11] = _mm_subs_epi16(x1[4], x1[11]);
+  x2[5] = _mm_adds_epi16(x1[5], x1[10]);
+  x2[10] = _mm_subs_epi16(x1[5], x1[10]);
+  x2[6] = _mm_adds_epi16(x1[6], x1[9]);
+  x2[9] = _mm_subs_epi16(x1[6], x1[9]);
+  x2[7] = _mm_adds_epi16(x1[7], x1[8]);
+  x2[8] = _mm_subs_epi16(x1[7], x1[8]);
+  x2[16] = x1[16];
+  x2[17] = x1[17];
+  x2[18] = x1[18];
+  x2[19] = x1[19];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]);
+  x2[28] = x1[28];
+  x2[29] = x1[29];
+  x2[30] = x1[30];
+  x2[31] = x1[31];
+
+  // stage 3
+  __m128i x3[32];
+  x3[0] = _mm_adds_epi16(x2[0], x2[7]);
+  x3[7] = _mm_subs_epi16(x2[0], x2[7]);
+  x3[1] = _mm_adds_epi16(x2[1], x2[6]);
+  x3[6] = _mm_subs_epi16(x2[1], x2[6]);
+  x3[2] = _mm_adds_epi16(x2[2], x2[5]);
+  x3[5] = _mm_subs_epi16(x2[2], x2[5]);
+  x3[3] = _mm_adds_epi16(x2[3], x2[4]);
+  x3[4] = _mm_subs_epi16(x2[3], x2[4]);
+  x3[8] = x2[8];
+  x3[9] = x2[9];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]);
+  x3[14] = x2[14];
+  x3[15] = x2[15];
+  x3[16] = _mm_adds_epi16(x2[16], x2[23]);
+  x3[23] = _mm_subs_epi16(x2[16], x2[23]);
+  x3[17] = _mm_adds_epi16(x2[17], x2[22]);
+  x3[22] = _mm_subs_epi16(x2[17], x2[22]);
+  x3[18] = _mm_adds_epi16(x2[18], x2[21]);
+  x3[21] = _mm_subs_epi16(x2[18], x2[21]);
+  x3[19] = _mm_adds_epi16(x2[19], x2[20]);
+  x3[20] = _mm_subs_epi16(x2[19], x2[20]);
+  x3[24] = _mm_subs_epi16(x2[31], x2[24]);
+  x3[31] = _mm_adds_epi16(x2[31], x2[24]);
+  x3[25] = _mm_subs_epi16(x2[30], x2[25]);
+  x3[30] = _mm_adds_epi16(x2[30], x2[25]);
+  x3[26] = _mm_subs_epi16(x2[29], x2[26]);
+  x3[29] = _mm_adds_epi16(x2[29], x2[26]);
+  x3[27] = _mm_subs_epi16(x2[28], x2[27]);
+  x3[28] = _mm_adds_epi16(x2[28], x2[27]);
+
+  // stage 4
+  __m128i x4[32];
+  x4[0] = _mm_adds_epi16(x3[0], x3[3]);
+  x4[3] = _mm_subs_epi16(x3[0], x3[3]);
+  x4[1] = _mm_adds_epi16(x3[1], x3[2]);
+  x4[2] = _mm_subs_epi16(x3[1], x3[2]);
+  x4[4] = x3[4];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]);
+  x4[7] = x3[7];
+  x4[8] = _mm_adds_epi16(x3[8], x3[11]);
+  x4[11] = _mm_subs_epi16(x3[8], x3[11]);
+  x4[9] = _mm_adds_epi16(x3[9], x3[10]);
+  x4[10] = _mm_subs_epi16(x3[9], x3[10]);
+  x4[12] = _mm_subs_epi16(x3[15], x3[12]);
+  x4[15] = _mm_adds_epi16(x3[15], x3[12]);
+  x4[13] = _mm_subs_epi16(x3[14], x3[13]);
+  x4[14] = _mm_adds_epi16(x3[14], x3[13]);
+  x4[16] = x3[16];
+  x4[17] = x3[17];
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]);
+  x4[22] = x3[22];
+  x4[23] = x3[23];
+  x4[24] = x3[24];
+  x4[25] = x3[25];
+  x4[30] = x3[30];
+  x4[31] = x3[31];
+
+  // stage 5
+  __m128i x5[32];
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]);
+  btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]);
+  x5[4] = _mm_adds_epi16(x4[4], x4[5]);
+  x5[5] = _mm_subs_epi16(x4[4], x4[5]);
+  x5[6] = _mm_subs_epi16(x4[7], x4[6]);
+  x5[7] = _mm_adds_epi16(x4[7], x4[6]);
+  x5[8] = x4[8];
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]);
+  x5[11] = x4[11];
+  x5[12] = x4[12];
+  x5[15] = x4[15];
+  x5[16] = _mm_adds_epi16(x4[16], x4[19]);
+  x5[19] = _mm_subs_epi16(x4[16], x4[19]);
+  x5[17] = _mm_adds_epi16(x4[17], x4[18]);
+  x5[18] = _mm_subs_epi16(x4[17], x4[18]);
+  x5[20] = _mm_subs_epi16(x4[23], x4[20]);
+  x5[23] = _mm_adds_epi16(x4[23], x4[20]);
+  x5[21] = _mm_subs_epi16(x4[22], x4[21]);
+  x5[22] = _mm_adds_epi16(x4[22], x4[21]);
+  x5[24] = _mm_adds_epi16(x4[24], x4[27]);
+  x5[27] = _mm_subs_epi16(x4[24], x4[27]);
+  x5[25] = _mm_adds_epi16(x4[25], x4[26]);
+  x5[26] = _mm_subs_epi16(x4[25], x4[26]);
+  x5[28] = _mm_subs_epi16(x4[31], x4[28]);
+  x5[31] = _mm_adds_epi16(x4[31], x4[28]);
+  x5[29] = _mm_subs_epi16(x4[30], x4[29]);
+  x5[30] = _mm_adds_epi16(x4[30], x4[29]);
+
+  // stage 6
+  __m128i x6[32];
+  x6[0] = x5[0];
+  x6[1] = x5[1];
+  x6[2] = x5[2];
+  x6[3] = x5[3];
+  btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]);
+  btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]);
+  x6[8] = _mm_adds_epi16(x5[8], x5[9]);
+  x6[9] = _mm_subs_epi16(x5[8], x5[9]);
+  x6[10] = _mm_subs_epi16(x5[11], x5[10]);
+  x6[11] = _mm_adds_epi16(x5[11], x5[10]);
+  x6[12] = _mm_adds_epi16(x5[12], x5[13]);
+  x6[13] = _mm_subs_epi16(x5[12], x5[13]);
+  x6[14] = _mm_subs_epi16(x5[15], x5[14]);
+  x6[15] = _mm_adds_epi16(x5[15], x5[14]);
+  x6[16] = x5[16];
+  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]);
+  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]);
+  x6[19] = x5[19];
+  x6[20] = x5[20];
+  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]);
+  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]);
+  x6[23] = x5[23];
+  x6[24] = x5[24];
+  x6[27] = x5[27];
+  x6[28] = x5[28];
+  x6[31] = x5[31];
+
+  // stage 7
+  __m128i x7[32];
+  x7[0] = x6[0];
+  x7[1] = x6[1];
+  x7[2] = x6[2];
+  x7[3] = x6[3];
+  x7[4] = x6[4];
+  x7[5] = x6[5];
+  x7[6] = x6[6];
+  x7[7] = x6[7];
+  btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]);
+  btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]);
+  btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]);
+  btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]);
+  x7[16] = _mm_adds_epi16(x6[16], x6[17]);
+  x7[17] = _mm_subs_epi16(x6[16], x6[17]);
+  x7[18] = _mm_subs_epi16(x6[19], x6[18]);
+  x7[19] = _mm_adds_epi16(x6[19], x6[18]);
+  x7[20] = _mm_adds_epi16(x6[20], x6[21]);
+  x7[21] = _mm_subs_epi16(x6[20], x6[21]);
+  x7[22] = _mm_subs_epi16(x6[23], x6[22]);
+  x7[23] = _mm_adds_epi16(x6[23], x6[22]);
+  x7[24] = _mm_adds_epi16(x6[24], x6[25]);
+  x7[25] = _mm_subs_epi16(x6[24], x6[25]);
+  x7[26] = _mm_subs_epi16(x6[27], x6[26]);
+  x7[27] = _mm_adds_epi16(x6[27], x6[26]);
+  x7[28] = _mm_adds_epi16(x6[28], x6[29]);
+  x7[29] = _mm_subs_epi16(x6[28], x6[29]);
+  x7[30] = _mm_subs_epi16(x6[31], x6[30]);
+  x7[31] = _mm_adds_epi16(x6[31], x6[30]);
+
+  // stage 8
+  __m128i x8[32];
+  x8[0] = x7[0];
+  x8[1] = x7[1];
+  x8[2] = x7[2];
+  x8[3] = x7[3];
+  x8[4] = x7[4];
+  x8[5] = x7[5];
+  x8[6] = x7[6];
+  x8[7] = x7[7];
+  x8[8] = x7[8];
+  x8[9] = x7[9];
+  x8[10] = x7[10];
+  x8[11] = x7[11];
+  x8[12] = x7[12];
+  x8[13] = x7[13];
+  x8[14] = x7[14];
+  x8[15] = x7[15];
+  btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]);
+  btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]);
+  btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]);
+  btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]);
+  btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]);
+  btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]);
+  btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]);
+  btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]);
+
+  // stage 9
+  output[0] = x8[0];
+  output[1] = x8[16];
+  output[2] = x8[8];
+  output[3] = x8[24];
+  output[4] = x8[4];
+  output[5] = x8[20];
+  output[6] = x8[12];
+  output[7] = x8[28];
+  output[8] = x8[2];
+  output[9] = x8[18];
+  output[10] = x8[10];
+  output[11] = x8[26];
+  output[12] = x8[6];
+  output[13] = x8[22];
+  output[14] = x8[14];
+  output[15] = x8[30];
+  output[16] = x8[1];
+  output[17] = x8[17];
+  output[18] = x8[9];
+  output[19] = x8[25];
+  output[20] = x8[5];
+  output[21] = x8[21];
+  output[22] = x8[13];
+  output[23] = x8[29];
+  output[24] = x8[3];
+  output[25] = x8[19];
+  output[26] = x8[11];
+  output[27] = x8[27];
+  output[28] = x8[7];
+  output[29] = x8[23];
+  output[30] = x8[15];
+  output[31] = x8[31];
+}
+
+void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+  __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+  __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+  __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+  __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+  __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+  __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+  __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+  __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+  __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+  __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+  __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+  __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+  __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+  __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+  __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+  __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+  __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+  __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+  __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+  __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+  __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+  __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+  __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+  __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+  __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+  __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+  __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+  __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+  __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+  __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+  __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]);
+  __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]);
+  __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]);
+  __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]);
+  __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]);
+  __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]);
+  __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]);
+  __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]);
+  __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]);
+  __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]);
+  __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]);
+  __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]);
+  __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]);
+  __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]);
+  __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]);
+  __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]);
+  __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]);
+  __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]);
+  __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]);
+  __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]);
+  __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]);
+  __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]);
+  __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]);
+  __m128i cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]);
+  __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]);
+  __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]);
+  __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]);
+  __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]);
+  __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]);
+  __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]);
+  __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]);
+  __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]);
+
+  // stage 1
+  __m128i x1[64];
+  x1[0] = _mm_adds_epi16(input[0], input[63]);
+  x1[63] = _mm_subs_epi16(input[0], input[63]);
+  x1[1] = _mm_adds_epi16(input[1], input[62]);
+  x1[62] = _mm_subs_epi16(input[1], input[62]);
+  x1[2] = _mm_adds_epi16(input[2], input[61]);
+  x1[61] = _mm_subs_epi16(input[2], input[61]);
+  x1[3] = _mm_adds_epi16(input[3], input[60]);
+  x1[60] = _mm_subs_epi16(input[3], input[60]);
+  x1[4] = _mm_adds_epi16(input[4], input[59]);
+  x1[59] = _mm_subs_epi16(input[4], input[59]);
+  x1[5] = _mm_adds_epi16(input[5], input[58]);
+  x1[58] = _mm_subs_epi16(input[5], input[58]);
+  x1[6] = _mm_adds_epi16(input[6], input[57]);
+  x1[57] = _mm_subs_epi16(input[6], input[57]);
+  x1[7] = _mm_adds_epi16(input[7], input[56]);
+  x1[56] = _mm_subs_epi16(input[7], input[56]);
+  x1[8] = _mm_adds_epi16(input[8], input[55]);
+  x1[55] = _mm_subs_epi16(input[8], input[55]);
+  x1[9] = _mm_adds_epi16(input[9], input[54]);
+  x1[54] = _mm_subs_epi16(input[9], input[54]);
+  x1[10] = _mm_adds_epi16(input[10], input[53]);
+  x1[53] = _mm_subs_epi16(input[10], input[53]);
+  x1[11] = _mm_adds_epi16(input[11], input[52]);
+  x1[52] = _mm_subs_epi16(input[11], input[52]);
+  x1[12] = _mm_adds_epi16(input[12], input[51]);
+  x1[51] = _mm_subs_epi16(input[12], input[51]);
+  x1[13] = _mm_adds_epi16(input[13], input[50]);
+  x1[50] = _mm_subs_epi16(input[13], input[50]);
+  x1[14] = _mm_adds_epi16(input[14], input[49]);
+  x1[49] = _mm_subs_epi16(input[14], input[49]);
+  x1[15] = _mm_adds_epi16(input[15], input[48]);
+  x1[48] = _mm_subs_epi16(input[15], input[48]);
+  x1[16] = _mm_adds_epi16(input[16], input[47]);
+  x1[47] = _mm_subs_epi16(input[16], input[47]);
+  x1[17] = _mm_adds_epi16(input[17], input[46]);
+  x1[46] = _mm_subs_epi16(input[17], input[46]);
+  x1[18] = _mm_adds_epi16(input[18], input[45]);
+  x1[45] = _mm_subs_epi16(input[18], input[45]);
+  x1[19] = _mm_adds_epi16(input[19], input[44]);
+  x1[44] = _mm_subs_epi16(input[19], input[44]);
+  x1[20] = _mm_adds_epi16(input[20], input[43]);
+  x1[43] = _mm_subs_epi16(input[20], input[43]);
+  x1[21] = _mm_adds_epi16(input[21], input[42]);
+  x1[42] = _mm_subs_epi16(input[21], input[42]);
+  x1[22] = _mm_adds_epi16(input[22], input[41]);
+  x1[41] = _mm_subs_epi16(input[22], input[41]);
+  x1[23] = _mm_adds_epi16(input[23], input[40]);
+  x1[40] = _mm_subs_epi16(input[23], input[40]);
+  x1[24] = _mm_adds_epi16(input[24], input[39]);
+  x1[39] = _mm_subs_epi16(input[24], input[39]);
+  x1[25] = _mm_adds_epi16(input[25], input[38]);
+  x1[38] = _mm_subs_epi16(input[25], input[38]);
+  x1[26] = _mm_adds_epi16(input[26], input[37]);
+  x1[37] = _mm_subs_epi16(input[26], input[37]);
+  x1[27] = _mm_adds_epi16(input[27], input[36]);
+  x1[36] = _mm_subs_epi16(input[27], input[36]);
+  x1[28] = _mm_adds_epi16(input[28], input[35]);
+  x1[35] = _mm_subs_epi16(input[28], input[35]);
+  x1[29] = _mm_adds_epi16(input[29], input[34]);
+  x1[34] = _mm_subs_epi16(input[29], input[34]);
+  x1[30] = _mm_adds_epi16(input[30], input[33]);
+  x1[33] = _mm_subs_epi16(input[30], input[33]);
+  x1[31] = _mm_adds_epi16(input[31], input[32]);
+  x1[32] = _mm_subs_epi16(input[31], input[32]);
+
+  // stage 2
+  __m128i x2[64];
+  x2[0] = _mm_adds_epi16(x1[0], x1[31]);
+  x2[31] = _mm_subs_epi16(x1[0], x1[31]);
+  x2[1] = _mm_adds_epi16(x1[1], x1[30]);
+  x2[30] = _mm_subs_epi16(x1[1], x1[30]);
+  x2[2] = _mm_adds_epi16(x1[2], x1[29]);
+  x2[29] = _mm_subs_epi16(x1[2], x1[29]);
+  x2[3] = _mm_adds_epi16(x1[3], x1[28]);
+  x2[28] = _mm_subs_epi16(x1[3], x1[28]);
+  x2[4] = _mm_adds_epi16(x1[4], x1[27]);
+  x2[27] = _mm_subs_epi16(x1[4], x1[27]);
+  x2[5] = _mm_adds_epi16(x1[5], x1[26]);
+  x2[26] = _mm_subs_epi16(x1[5], x1[26]);
+  x2[6] = _mm_adds_epi16(x1[6], x1[25]);
+  x2[25] = _mm_subs_epi16(x1[6], x1[25]);
+  x2[7] = _mm_adds_epi16(x1[7], x1[24]);
+  x2[24] = _mm_subs_epi16(x1[7], x1[24]);
+  x2[8] = _mm_adds_epi16(x1[8], x1[23]);
+  x2[23] = _mm_subs_epi16(x1[8], x1[23]);
+  x2[9] = _mm_adds_epi16(x1[9], x1[22]);
+  x2[22] = _mm_subs_epi16(x1[9], x1[22]);
+  x2[10] = _mm_adds_epi16(x1[10], x1[21]);
+  x2[21] = _mm_subs_epi16(x1[10], x1[21]);
+  x2[11] = _mm_adds_epi16(x1[11], x1[20]);
+  x2[20] = _mm_subs_epi16(x1[11], x1[20]);
+  x2[12] = _mm_adds_epi16(x1[12], x1[19]);
+  x2[19] = _mm_subs_epi16(x1[12], x1[19]);
+  x2[13] = _mm_adds_epi16(x1[13], x1[18]);
+  x2[18] = _mm_subs_epi16(x1[13], x1[18]);
+  x2[14] = _mm_adds_epi16(x1[14], x1[17]);
+  x2[17] = _mm_subs_epi16(x1[14], x1[17]);
+  x2[15] = _mm_adds_epi16(x1[15], x1[16]);
+  x2[16] = _mm_subs_epi16(x1[15], x1[16]);
+  x2[32] = x1[32];
+  x2[33] = x1[33];
+  x2[34] = x1[34];
+  x2[35] = x1[35];
+  x2[36] = x1[36];
+  x2[37] = x1[37];
+  x2[38] = x1[38];
+  x2[39] = x1[39];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]);
+  x2[56] = x1[56];
+  x2[57] = x1[57];
+  x2[58] = x1[58];
+  x2[59] = x1[59];
+  x2[60] = x1[60];
+  x2[61] = x1[61];
+  x2[62] = x1[62];
+  x2[63] = x1[63];
+
+  // stage 3
+  __m128i x3[64];
+  x3[0] = _mm_adds_epi16(x2[0], x2[15]);
+  x3[15] = _mm_subs_epi16(x2[0], x2[15]);
+  x3[1] = _mm_adds_epi16(x2[1], x2[14]);
+  x3[14] = _mm_subs_epi16(x2[1], x2[14]);
+  x3[2] = _mm_adds_epi16(x2[2], x2[13]);
+  x3[13] = _mm_subs_epi16(x2[2], x2[13]);
+  x3[3] = _mm_adds_epi16(x2[3], x2[12]);
+  x3[12] = _mm_subs_epi16(x2[3], x2[12]);
+  x3[4] = _mm_adds_epi16(x2[4], x2[11]);
+  x3[11] = _mm_subs_epi16(x2[4], x2[11]);
+  x3[5] = _mm_adds_epi16(x2[5], x2[10]);
+  x3[10] = _mm_subs_epi16(x2[5], x2[10]);
+  x3[6] = _mm_adds_epi16(x2[6], x2[9]);
+  x3[9] = _mm_subs_epi16(x2[6], x2[9]);
+  x3[7] = _mm_adds_epi16(x2[7], x2[8]);
+  x3[8] = _mm_subs_epi16(x2[7], x2[8]);
+  x3[16] = x2[16];
+  x3[17] = x2[17];
+  x3[18] = x2[18];
+  x3[19] = x2[19];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]);
+  x3[28] = x2[28];
+  x3[29] = x2[29];
+  x3[30] = x2[30];
+  x3[31] = x2[31];
+  x3[32] = _mm_adds_epi16(x2[32], x2[47]);
+  x3[47] = _mm_subs_epi16(x2[32], x2[47]);
+  x3[33] = _mm_adds_epi16(x2[33], x2[46]);
+  x3[46] = _mm_subs_epi16(x2[33], x2[46]);
+  x3[34] = _mm_adds_epi16(x2[34], x2[45]);
+  x3[45] = _mm_subs_epi16(x2[34], x2[45]);
+  x3[35] = _mm_adds_epi16(x2[35], x2[44]);
+  x3[44] = _mm_subs_epi16(x2[35], x2[44]);
+  x3[36] = _mm_adds_epi16(x2[36], x2[43]);
+  x3[43] = _mm_subs_epi16(x2[36], x2[43]);
+  x3[37] = _mm_adds_epi16(x2[37], x2[42]);
+  x3[42] = _mm_subs_epi16(x2[37], x2[42]);
+  x3[38] = _mm_adds_epi16(x2[38], x2[41]);
+  x3[41] = _mm_subs_epi16(x2[38], x2[41]);
+  x3[39] = _mm_adds_epi16(x2[39], x2[40]);
+  x3[40] = _mm_subs_epi16(x2[39], x2[40]);
+  x3[48] = _mm_subs_epi16(x2[63], x2[48]);
+  x3[63] = _mm_adds_epi16(x2[63], x2[48]);
+  x3[49] = _mm_subs_epi16(x2[62], x2[49]);
+  x3[62] = _mm_adds_epi16(x2[62], x2[49]);
+  x3[50] = _mm_subs_epi16(x2[61], x2[50]);
+  x3[61] = _mm_adds_epi16(x2[61], x2[50]);
+  x3[51] = _mm_subs_epi16(x2[60], x2[51]);
+  x3[60] = _mm_adds_epi16(x2[60], x2[51]);
+  x3[52] = _mm_subs_epi16(x2[59], x2[52]);
+  x3[59] = _mm_adds_epi16(x2[59], x2[52]);
+  x3[53] = _mm_subs_epi16(x2[58], x2[53]);
+  x3[58] = _mm_adds_epi16(x2[58], x2[53]);
+  x3[54] = _mm_subs_epi16(x2[57], x2[54]);
+  x3[57] = _mm_adds_epi16(x2[57], x2[54]);
+  x3[55] = _mm_subs_epi16(x2[56], x2[55]);
+  x3[56] = _mm_adds_epi16(x2[56], x2[55]);
+
+  // stage 4
+  __m128i x4[64];
+  x4[0] = _mm_adds_epi16(x3[0], x3[7]);
+  x4[7] = _mm_subs_epi16(x3[0], x3[7]);
+  x4[1] = _mm_adds_epi16(x3[1], x3[6]);
+  x4[6] = _mm_subs_epi16(x3[1], x3[6]);
+  x4[2] = _mm_adds_epi16(x3[2], x3[5]);
+  x4[5] = _mm_subs_epi16(x3[2], x3[5]);
+  x4[3] = _mm_adds_epi16(x3[3], x3[4]);
+  x4[4] = _mm_subs_epi16(x3[3], x3[4]);
+  x4[8] = x3[8];
+  x4[9] = x3[9];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]);
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]);
+  x4[14] = x3[14];
+  x4[15] = x3[15];
+  x4[16] = _mm_adds_epi16(x3[16], x3[23]);
+  x4[23] = _mm_subs_epi16(x3[16], x3[23]);
+  x4[17] = _mm_adds_epi16(x3[17], x3[22]);
+  x4[22] = _mm_subs_epi16(x3[17], x3[22]);
+  x4[18] = _mm_adds_epi16(x3[18], x3[21]);
+  x4[21] = _mm_subs_epi16(x3[18], x3[21]);
+  x4[19] = _mm_adds_epi16(x3[19], x3[20]);
+  x4[20] = _mm_subs_epi16(x3[19], x3[20]);
+  x4[24] = _mm_subs_epi16(x3[31], x3[24]);
+  x4[31] = _mm_adds_epi16(x3[31], x3[24]);
+  x4[25] = _mm_subs_epi16(x3[30], x3[25]);
+  x4[30] = _mm_adds_epi16(x3[30], x3[25]);
+  x4[26] = _mm_subs_epi16(x3[29], x3[26]);
+  x4[29] = _mm_adds_epi16(x3[29], x3[26]);
+  x4[27] = _mm_subs_epi16(x3[28], x3[27]);
+  x4[28] = _mm_adds_epi16(x3[28], x3[27]);
+  x4[32] = x3[32];
+  x4[33] = x3[33];
+  x4[34] = x3[34];
+  x4[35] = x3[35];
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]);
+  x4[44] = x3[44];
+  x4[45] = x3[45];
+  x4[46] = x3[46];
+  x4[47] = x3[47];
+  x4[48] = x3[48];
+  x4[49] = x3[49];
+  x4[50] = x3[50];
+  x4[51] = x3[51];
+  x4[60] = x3[60];
+  x4[61] = x3[61];
+  x4[62] = x3[62];
+  x4[63] = x3[63];
+
+  // stage 5
+  __m128i x5[64];
+  x5[0] = _mm_adds_epi16(x4[0], x4[3]);
+  x5[3] = _mm_subs_epi16(x4[0], x4[3]);
+  x5[1] = _mm_adds_epi16(x4[1], x4[2]);
+  x5[2] = _mm_subs_epi16(x4[1], x4[2]);
+  x5[4] = x4[4];
+  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]);
+  x5[7] = x4[7];
+  x5[8] = _mm_adds_epi16(x4[8], x4[11]);
+  x5[11] = _mm_subs_epi16(x4[8], x4[11]);
+  x5[9] = _mm_adds_epi16(x4[9], x4[10]);
+  x5[10] = _mm_subs_epi16(x4[9], x4[10]);
+  x5[12] = _mm_subs_epi16(x4[15], x4[12]);
+  x5[15] = _mm_adds_epi16(x4[15], x4[12]);
+  x5[13] = _mm_subs_epi16(x4[14], x4[13]);
+  x5[14] = _mm_adds_epi16(x4[14], x4[13]);
+  x5[16] = x4[16];
+  x5[17] = x4[17];
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]);
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]);
+  x5[22] = x4[22];
+  x5[23] = x4[23];
+  x5[24] = x4[24];
+  x5[25] = x4[25];
+  x5[30] = x4[30];
+  x5[31] = x4[31];
+  x5[32] = _mm_adds_epi16(x4[32], x4[39]);
+  x5[39] = _mm_subs_epi16(x4[32], x4[39]);
+  x5[33] = _mm_adds_epi16(x4[33], x4[38]);
+  x5[38] = _mm_subs_epi16(x4[33], x4[38]);
+  x5[34] = _mm_adds_epi16(x4[34], x4[37]);
+  x5[37] = _mm_subs_epi16(x4[34], x4[37]);
+  x5[35] = _mm_adds_epi16(x4[35], x4[36]);
+  x5[36] = _mm_subs_epi16(x4[35], x4[36]);
+  x5[40] = _mm_subs_epi16(x4[47], x4[40]);
+  x5[47] = _mm_adds_epi16(x4[47], x4[40]);
+  x5[41] = _mm_subs_epi16(x4[46], x4[41]);
+  x5[46] = _mm_adds_epi16(x4[46], x4[41]);
+  x5[42] = _mm_subs_epi16(x4[45], x4[42]);
+  x5[45] = _mm_adds_epi16(x4[45], x4[42]);
+  x5[43] = _mm_subs_epi16(x4[44], x4[43]);
+  x5[44] = _mm_adds_epi16(x4[44], x4[43]);
+  x5[48] = _mm_adds_epi16(x4[48], x4[55]);
+  x5[55] = _mm_subs_epi16(x4[48], x4[55]);
+  x5[49] = _mm_adds_epi16(x4[49], x4[54]);
+  x5[54] = _mm_subs_epi16(x4[49], x4[54]);
+  x5[50] = _mm_adds_epi16(x4[50], x4[53]);
+  x5[53] = _mm_subs_epi16(x4[50], x4[53]);
+  x5[51] = _mm_adds_epi16(x4[51], x4[52]);
+  x5[52] = _mm_subs_epi16(x4[51], x4[52]);
+  x5[56] = _mm_subs_epi16(x4[63], x4[56]);
+  x5[63] = _mm_adds_epi16(x4[63], x4[56]);
+  x5[57] = _mm_subs_epi16(x4[62], x4[57]);
+  x5[62] = _mm_adds_epi16(x4[62], x4[57]);
+  x5[58] = _mm_subs_epi16(x4[61], x4[58]);
+  x5[61] = _mm_adds_epi16(x4[61], x4[58]);
+  x5[59] = _mm_subs_epi16(x4[60], x4[59]);
+  x5[60] = _mm_adds_epi16(x4[60], x4[59]);
+
+  // stage 6
+  __m128i x6[64];
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]);
+  btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]);
+  x6[4] = _mm_adds_epi16(x5[4], x5[5]);
+  x6[5] = _mm_subs_epi16(x5[4], x5[5]);
+  x6[6] = _mm_subs_epi16(x5[7], x5[6]);
+  x6[7] = _mm_adds_epi16(x5[7], x5[6]);
+  x6[8] = x5[8];
+  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]);
+  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]);
+  x6[11] = x5[11];
+  x6[12] = x5[12];
+  x6[15] = x5[15];
+  x6[16] = _mm_adds_epi16(x5[16], x5[19]);
+  x6[19] = _mm_subs_epi16(x5[16], x5[19]);
+  x6[17] = _mm_adds_epi16(x5[17], x5[18]);
+  x6[18] = _mm_subs_epi16(x5[17], x5[18]);
+  x6[20] = _mm_subs_epi16(x5[23], x5[20]);
+  x6[23] = _mm_adds_epi16(x5[23], x5[20]);
+  x6[21] = _mm_subs_epi16(x5[22], x5[21]);
+  x6[22] = _mm_adds_epi16(x5[22], x5[21]);
+  x6[24] = _mm_adds_epi16(x5[24], x5[27]);
+  x6[27] = _mm_subs_epi16(x5[24], x5[27]);
+  x6[25] = _mm_adds_epi16(x5[25], x5[26]);
+  x6[26] = _mm_subs_epi16(x5[25], x5[26]);
+  x6[28] = _mm_subs_epi16(x5[31], x5[28]);
+  x6[31] = _mm_adds_epi16(x5[31], x5[28]);
+  x6[29] = _mm_subs_epi16(x5[30], x5[29]);
+  x6[30] = _mm_adds_epi16(x5[30], x5[29]);
+  x6[32] = x5[32];
+  x6[33] = x5[33];
+  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]);
+  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]);
+  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]);
+  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]);
+  x6[38] = x5[38];
+  x6[39] = x5[39];
+  x6[40] = x5[40];
+  x6[41] = x5[41];
+  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]);
+  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]);
+  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]);
+  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]);
+  x6[46] = x5[46];
+  x6[47] = x5[47];
+  x6[48] = x5[48];
+  x6[49] = x5[49];
+  x6[54] = x5[54];
+  x6[55] = x5[55];
+  x6[56] = x5[56];
+  x6[57] = x5[57];
+  x6[62] = x5[62];
+  x6[63] = x5[63];
+
+  // stage 7
+  __m128i x7[64];
+  x7[0] = x6[0];
+  x7[1] = x6[1];
+  x7[2] = x6[2];
+  x7[3] = x6[3];
+  btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]);
+  btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]);
+  x7[8] = _mm_adds_epi16(x6[8], x6[9]);
+  x7[9] = _mm_subs_epi16(x6[8], x6[9]);
+  x7[10] = _mm_subs_epi16(x6[11], x6[10]);
+  x7[11] = _mm_adds_epi16(x6[11], x6[10]);
+  x7[12] = _mm_adds_epi16(x6[12], x6[13]);
+  x7[13] = _mm_subs_epi16(x6[12], x6[13]);
+  x7[14] = _mm_subs_epi16(x6[15], x6[14]);
+  x7[15] = _mm_adds_epi16(x6[15], x6[14]);
+  x7[16] = x6[16];
+  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]);
+  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]);
+  x7[19] = x6[19];
+  x7[20] = x6[20];
+  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]);
+  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]);
+  x7[23] = x6[23];
+  x7[24] = x6[24];
+  x7[27] = x6[27];
+  x7[28] = x6[28];
+  x7[31] = x6[31];
+  x7[32] = _mm_adds_epi16(x6[32], x6[35]);
+  x7[35] = _mm_subs_epi16(x6[32], x6[35]);
+  x7[33] = _mm_adds_epi16(x6[33], x6[34]);
+  x7[34] = _mm_subs_epi16(x6[33], x6[34]);
+  x7[36] = _mm_subs_epi16(x6[39], x6[36]);
+  x7[39] = _mm_adds_epi16(x6[39], x6[36]);
+  x7[37] = _mm_subs_epi16(x6[38], x6[37]);
+  x7[38] = _mm_adds_epi16(x6[38], x6[37]);
+  x7[40] = _mm_adds_epi16(x6[40], x6[43]);
+  x7[43] = _mm_subs_epi16(x6[40], x6[43]);
+  x7[41] = _mm_adds_epi16(x6[41], x6[42]);
+  x7[42] = _mm_subs_epi16(x6[41], x6[42]);
+  x7[44] = _mm_subs_epi16(x6[47], x6[44]);
+  x7[47] = _mm_adds_epi16(x6[47], x6[44]);
+  x7[45] = _mm_subs_epi16(x6[46], x6[45]);
+  x7[46] = _mm_adds_epi16(x6[46], x6[45]);
+  x7[48] = _mm_adds_epi16(x6[48], x6[51]);
+  x7[51] = _mm_subs_epi16(x6[48], x6[51]);
+  x7[49] = _mm_adds_epi16(x6[49], x6[50]);
+  x7[50] = _mm_subs_epi16(x6[49], x6[50]);
+  x7[52] = _mm_subs_epi16(x6[55], x6[52]);
+  x7[55] = _mm_adds_epi16(x6[55], x6[52]);
+  x7[53] = _mm_subs_epi16(x6[54], x6[53]);
+  x7[54] = _mm_adds_epi16(x6[54], x6[53]);
+  x7[56] = _mm_adds_epi16(x6[56], x6[59]);
+  x7[59] = _mm_subs_epi16(x6[56], x6[59]);
+  x7[57] = _mm_adds_epi16(x6[57], x6[58]);
+  x7[58] = _mm_subs_epi16(x6[57], x6[58]);
+  x7[60] = _mm_subs_epi16(x6[63], x6[60]);
+  x7[63] = _mm_adds_epi16(x6[63], x6[60]);
+  x7[61] = _mm_subs_epi16(x6[62], x6[61]);
+  x7[62] = _mm_adds_epi16(x6[62], x6[61]);
+
+  // stage 8
+  __m128i x8[64];
+  x8[0] = x7[0];
+  x8[1] = x7[1];
+  x8[2] = x7[2];
+  x8[3] = x7[3];
+  x8[4] = x7[4];
+  x8[5] = x7[5];
+  x8[6] = x7[6];
+  x8[7] = x7[7];
+  btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]);
+  btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]);
+  btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]);
+  btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]);
+  x8[16] = _mm_adds_epi16(x7[16], x7[17]);
+  x8[17] = _mm_subs_epi16(x7[16], x7[17]);
+  x8[18] = _mm_subs_epi16(x7[19], x7[18]);
+  x8[19] = _mm_adds_epi16(x7[19], x7[18]);
+  x8[20] = _mm_adds_epi16(x7[20], x7[21]);
+  x8[21] = _mm_subs_epi16(x7[20], x7[21]);
+  x8[22] = _mm_subs_epi16(x7[23], x7[22]);
+  x8[23] = _mm_adds_epi16(x7[23], x7[22]);
+  x8[24] = _mm_adds_epi16(x7[24], x7[25]);
+  x8[25] = _mm_subs_epi16(x7[24], x7[25]);
+  x8[26] = _mm_subs_epi16(x7[27], x7[26]);
+  x8[27] = _mm_adds_epi16(x7[27], x7[26]);
+  x8[28] = _mm_adds_epi16(x7[28], x7[29]);
+  x8[29] = _mm_subs_epi16(x7[28], x7[29]);
+  x8[30] = _mm_subs_epi16(x7[31], x7[30]);
+  x8[31] = _mm_adds_epi16(x7[31], x7[30]);
+  x8[32] = x7[32];
+  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]);
+  btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]);
+  x8[35] = x7[35];
+  x8[36] = x7[36];
+  btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]);
+  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]);
+  x8[39] = x7[39];
+  x8[40] = x7[40];
+  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]);
+  btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]);
+  x8[43] = x7[43];
+  x8[44] = x7[44];
+  btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]);
+  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]);
+  x8[47] = x7[47];
+  x8[48] = x7[48];
+  x8[51] = x7[51];
+  x8[52] = x7[52];
+  x8[55] = x7[55];
+  x8[56] = x7[56];
+  x8[59] = x7[59];
+  x8[60] = x7[60];
+  x8[63] = x7[63];
+
+  // stage 9
+  __m128i x9[64];
+  x9[0] = x8[0];
+  x9[1] = x8[1];
+  x9[2] = x8[2];
+  x9[3] = x8[3];
+  x9[4] = x8[4];
+  x9[5] = x8[5];
+  x9[6] = x8[6];
+  x9[7] = x8[7];
+  x9[8] = x8[8];
+  x9[9] = x8[9];
+  x9[10] = x8[10];
+  x9[11] = x8[11];
+  x9[12] = x8[12];
+  x9[13] = x8[13];
+  x9[14] = x8[14];
+  x9[15] = x8[15];
+  btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]);
+  btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]);
+  btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]);
+  btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]);
+  btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]);
+  btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]);
+  btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]);
+  btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]);
+  x9[32] = _mm_adds_epi16(x8[32], x8[33]);
+  x9[33] = _mm_subs_epi16(x8[32], x8[33]);
+  x9[34] = _mm_subs_epi16(x8[35], x8[34]);
+  x9[35] = _mm_adds_epi16(x8[35], x8[34]);
+  x9[36] = _mm_adds_epi16(x8[36], x8[37]);
+  x9[37] = _mm_subs_epi16(x8[36], x8[37]);
+  x9[38] = _mm_subs_epi16(x8[39], x8[38]);
+  x9[39] = _mm_adds_epi16(x8[39], x8[38]);
+  x9[40] = _mm_adds_epi16(x8[40], x8[41]);
+  x9[41] = _mm_subs_epi16(x8[40], x8[41]);
+  x9[42] = _mm_subs_epi16(x8[43], x8[42]);
+  x9[43] = _mm_adds_epi16(x8[43], x8[42]);
+  x9[44] = _mm_adds_epi16(x8[44], x8[45]);
+  x9[45] = _mm_subs_epi16(x8[44], x8[45]);
+  x9[46] = _mm_subs_epi16(x8[47], x8[46]);
+  x9[47] = _mm_adds_epi16(x8[47], x8[46]);
+  x9[48] = _mm_adds_epi16(x8[48], x8[49]);
+  x9[49] = _mm_subs_epi16(x8[48], x8[49]);
+  x9[50] = _mm_subs_epi16(x8[51], x8[50]);
+  x9[51] = _mm_adds_epi16(x8[51], x8[50]);
+  x9[52] = _mm_adds_epi16(x8[52], x8[53]);
+  x9[53] = _mm_subs_epi16(x8[52], x8[53]);
+  x9[54] = _mm_subs_epi16(x8[55], x8[54]);
+  x9[55] = _mm_adds_epi16(x8[55], x8[54]);
+  x9[56] = _mm_adds_epi16(x8[56], x8[57]);
+  x9[57] = _mm_subs_epi16(x8[56], x8[57]);
+  x9[58] = _mm_subs_epi16(x8[59], x8[58]);
+  x9[59] = _mm_adds_epi16(x8[59], x8[58]);
+  x9[60] = _mm_adds_epi16(x8[60], x8[61]);
+  x9[61] = _mm_subs_epi16(x8[60], x8[61]);
+  x9[62] = _mm_subs_epi16(x8[63], x8[62]);
+  x9[63] = _mm_adds_epi16(x8[63], x8[62]);
+
+  // stage 10
+  __m128i x10[64];
+  x10[0] = x9[0];
+  x10[1] = x9[1];
+  x10[2] = x9[2];
+  x10[3] = x9[3];
+  x10[4] = x9[4];
+  x10[5] = x9[5];
+  x10[6] = x9[6];
+  x10[7] = x9[7];
+  x10[8] = x9[8];
+  x10[9] = x9[9];
+  x10[10] = x9[10];
+  x10[11] = x9[11];
+  x10[12] = x9[12];
+  x10[13] = x9[13];
+  x10[14] = x9[14];
+  x10[15] = x9[15];
+  x10[16] = x9[16];
+  x10[17] = x9[17];
+  x10[18] = x9[18];
+  x10[19] = x9[19];
+  x10[20] = x9[20];
+  x10[21] = x9[21];
+  x10[22] = x9[22];
+  x10[23] = x9[23];
+  x10[24] = x9[24];
+  x10[25] = x9[25];
+  x10[26] = x9[26];
+  x10[27] = x9[27];
+  x10[28] = x9[28];
+  x10[29] = x9[29];
+  x10[30] = x9[30];
+  x10[31] = x9[31];
+  btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]);
+  btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]);
+  btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]);
+  btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]);
+  btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]);
+  btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]);
+  btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]);
+  btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]);
+  btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]);
+  btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]);
+  btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]);
+  btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]);
+  btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]);
+  btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]);
+  btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]);
+  btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]);
+
+  // stage 11
+  output[0] = x10[0];
+  output[1] = x10[32];
+  output[2] = x10[16];
+  output[3] = x10[48];
+  output[4] = x10[8];
+  output[5] = x10[40];
+  output[6] = x10[24];
+  output[7] = x10[56];
+  output[8] = x10[4];
+  output[9] = x10[36];
+  output[10] = x10[20];
+  output[11] = x10[52];
+  output[12] = x10[12];
+  output[13] = x10[44];
+  output[14] = x10[28];
+  output[15] = x10[60];
+  output[16] = x10[2];
+  output[17] = x10[34];
+  output[18] = x10[18];
+  output[19] = x10[50];
+  output[20] = x10[10];
+  output[21] = x10[42];
+  output[22] = x10[26];
+  output[23] = x10[58];
+  output[24] = x10[6];
+  output[25] = x10[38];
+  output[26] = x10[22];
+  output[27] = x10[54];
+  output[28] = x10[14];
+  output[29] = x10[46];
+  output[30] = x10[30];
+  output[31] = x10[62];
+  output[32] = x10[1];
+  output[33] = x10[33];
+  output[34] = x10[17];
+  output[35] = x10[49];
+  output[36] = x10[9];
+  output[37] = x10[41];
+  output[38] = x10[25];
+  output[39] = x10[57];
+  output[40] = x10[5];
+  output[41] = x10[37];
+  output[42] = x10[21];
+  output[43] = x10[53];
+  output[44] = x10[13];
+  output[45] = x10[45];
+  output[46] = x10[29];
+  output[47] = x10[61];
+  output[48] = x10[3];
+  output[49] = x10[35];
+  output[50] = x10[19];
+  output[51] = x10[51];
+  output[52] = x10[11];
+  output[53] = x10[43];
+  output[54] = x10[27];
+  output[55] = x10[59];
+  output[56] = x10[7];
+  output[57] = x10[39];
+  output[58] = x10[23];
+  output[59] = x10[55];
+  output[60] = x10[15];
+  output[61] = x10[47];
+  output[62] = x10[31];
+  output[63] = x10[63];
+}
+
+static void fadst4x4_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+  const int32_t *sinpi = sinpi_arr(cos_bit);
+  const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+  const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+  const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+  const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+  const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+  const __m128i __zero = _mm_set1_epi16(0);
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+  const __m128i in7 = _mm_add_epi16(input[0], input[1]);
+  __m128i u[8], v[8];
+
+  u[0] = _mm_unpacklo_epi16(input[0], input[1]);
+  u[1] = _mm_unpacklo_epi16(input[2], input[3]);
+  u[2] = _mm_unpacklo_epi16(in7, __zero);
+  u[3] = _mm_unpacklo_epi16(input[2], __zero);
+  u[4] = _mm_unpacklo_epi16(input[3], __zero);
+
+  v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02);  // s0 + s2
+  v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04);  // s4 + s5
+  v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03);  // x1
+  v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01);  // s1 - s3
+  v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02);  // -s4 + s6
+  v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03);  // s4
+  v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03);
+
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[1] = _mm_sub_epi32(v[2], v[6]);
+  u[2] = _mm_add_epi32(v[3], v[4]);
+  u[3] = _mm_sub_epi32(u[2], u[0]);
+  u[4] = _mm_slli_epi32(v[5], 2);
+  u[5] = _mm_sub_epi32(u[4], v[5]);
+  u[6] = _mm_add_epi32(u[3], u[5]);
+
+  v[0] = _mm_add_epi32(u[0], __rounding);
+  v[1] = _mm_add_epi32(u[1], __rounding);
+  v[2] = _mm_add_epi32(u[2], __rounding);
+  v[3] = _mm_add_epi32(u[6], __rounding);
+
+  u[0] = _mm_srai_epi32(v[0], cos_bit);
+  u[1] = _mm_srai_epi32(v[1], cos_bit);
+  u[2] = _mm_srai_epi32(v[2], cos_bit);
+  u[3] = _mm_srai_epi32(v[3], cos_bit);
+
+  output[0] = _mm_packs_epi32(u[0], u[2]);
+  output[1] = _mm_packs_epi32(u[1], u[3]);
+  output[2] = _mm_srli_si128(output[0], 8);
+  output[3] = _mm_srli_si128(output[1], 8);
+}
+
+static void fadst4x8_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i __zero = _mm_setzero_si128();
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+  __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+  __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+  __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+  __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+  __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+  __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+  __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+  __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+  // stage 1
+  __m128i x1[8];
+  x1[0] = input[0];
+  x1[1] = _mm_subs_epi16(__zero, input[7]);
+  x1[2] = _mm_subs_epi16(__zero, input[3]);
+  x1[3] = input[4];
+  x1[4] = _mm_subs_epi16(__zero, input[1]);
+  x1[5] = input[6];
+  x1[6] = input[2];
+  x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+  // stage 2
+  __m128i x2[8];
+  x2[0] = x1[0];
+  x2[1] = x1[1];
+  btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2],
+                 &x1[3], &x2[2], &x2[3]);
+  x2[4] = x1[4];
+  x2[5] = x1[5];
+  btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6],
+                 &x1[7], &x2[6], &x2[7]);
+
+  // stage 3
+  __m128i x3[8];
+  x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+  x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+  x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+  x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+  x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+  x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+  x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+  x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+  // stage 4
+  __m128i x4[8];
+  x4[0] = x3[0];
+  x4[1] = x3[1];
+  x4[2] = x3[2];
+  x4[3] = x3[3];
+  btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4],
+                 &x3[5], &x4[4], &x4[5]);
+  btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6],
+                 &x3[7], &x4[6], &x4[7]);
+
+  // stage 5
+  __m128i x5[8];
+  x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+  x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+  x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+  x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+  x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+  x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+  x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+  x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+
+  // stage 6
+  __m128i x6[8];
+  btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0],
+                 &x5[1], &x6[0], &x6[1]);
+  btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2],
+                 &x5[3], &x6[2], &x6[3]);
+  btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4],
+                 &x5[5], &x6[4], &x6[5]);
+  btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6],
+                 &x5[7], &x6[6], &x6[7]);
+
+  // stage 7
+  output[0] = x6[1];
+  output[1] = x6[6];
+  output[2] = x6[3];
+  output[3] = x6[4];
+  output[4] = x6[5];
+  output[5] = x6[2];
+  output[6] = x6[7];
+  output[7] = x6[0];
+}
+
+static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+  const int32_t *sinpi = sinpi_arr(cos_bit);
+  const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+  const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+  const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+  const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+  const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+  const __m128i __zero = _mm_set1_epi16(0);
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+  const __m128i in7 = _mm_add_epi16(input[0], input[1]);
+  __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8];
+
+  u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]);
+  u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]);
+  u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]);
+  u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]);
+  u_lo[2] = _mm_unpacklo_epi16(in7, __zero);
+  u_hi[2] = _mm_unpackhi_epi16(in7, __zero);
+  u_lo[3] = _mm_unpacklo_epi16(input[2], __zero);
+  u_hi[3] = _mm_unpackhi_epi16(input[2], __zero);
+  u_lo[4] = _mm_unpacklo_epi16(input[3], __zero);
+  u_hi[4] = _mm_unpackhi_epi16(input[3], __zero);
+
+  v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02);  // s0 + s2
+  v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02);  // s0 + s2
+  v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04);  // s4 + s5
+  v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04);  // s4 + s5
+  v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03);  // x1
+  v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03);  // x1
+  v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01);  // s1 - s3
+  v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01);  // s1 - s3
+  v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02);  // -s4 + s6
+  v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02);  // -s4 + s6
+  v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03);  // s4
+  v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03);  // s4
+  v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03);
+  v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03);
+
+  u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]);
+  u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]);
+  u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]);
+  u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]);
+  u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]);
+  u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]);
+  u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]);
+  u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]);
+  u_lo[4] = _mm_slli_epi32(v_lo[5], 2);
+  u_hi[4] = _mm_slli_epi32(v_hi[5], 2);
+  u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]);
+  u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]);
+  u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]);
+  u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]);
+
+  v_lo[0] = _mm_add_epi32(u_lo[0], __rounding);
+  v_hi[0] = _mm_add_epi32(u_hi[0], __rounding);
+  v_lo[1] = _mm_add_epi32(u_lo[1], __rounding);
+  v_hi[1] = _mm_add_epi32(u_hi[1], __rounding);
+  v_lo[2] = _mm_add_epi32(u_lo[2], __rounding);
+  v_hi[2] = _mm_add_epi32(u_hi[2], __rounding);
+  v_lo[3] = _mm_add_epi32(u_lo[6], __rounding);
+  v_hi[3] = _mm_add_epi32(u_hi[6], __rounding);
+
+  u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit);
+  u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit);
+  u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit);
+  u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit);
+  u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit);
+  u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit);
+  u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit);
+  u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit);
+
+  output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]);
+  output[1] = _mm_packs_epi32(u_lo[1], u_hi[1]);
+  output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]);
+  output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]);
+}
+
+static void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i __zero = _mm_setzero_si128();
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+  __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+  __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+  __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+  __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+  __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+  __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+  __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+  __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+  // stage 1
+  __m128i x1[8];
+  x1[0] = input[0];
+  x1[1] = _mm_subs_epi16(__zero, input[7]);
+  x1[2] = _mm_subs_epi16(__zero, input[3]);
+  x1[3] = input[4];
+  x1[4] = _mm_subs_epi16(__zero, input[1]);
+  x1[5] = input[6];
+  x1[6] = input[2];
+  x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+  // stage 2
+  __m128i x2[8];
+  x2[0] = x1[0];
+  x2[1] = x1[1];
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+  x2[4] = x1[4];
+  x2[5] = x1[5];
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+
+  // stage 3
+  __m128i x3[8];
+  x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+  x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+  x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+  x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+  x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+  x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+  x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+  x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+  // stage 4
+  __m128i x4[8];
+  x4[0] = x3[0];
+  x4[1] = x3[1];
+  x4[2] = x3[2];
+  x4[3] = x3[3];
+  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+
+  // stage 5
+  __m128i x5[8];
+  x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+  x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+  x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+  x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+  x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+  x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+  x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+  x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+
+  // stage 6
+  __m128i x6[8];
+  btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x5[0], x5[1], x6[0], x6[1]);
+  btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x5[2], x5[3], x6[2], x6[3]);
+  btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x5[4], x5[5], x6[4], x6[5]);
+  btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x5[6], x5[7], x6[6], x6[7]);
+
+  // stage 7
+  output[0] = x6[1];
+  output[1] = x6[6];
+  output[2] = x6[3];
+  output[3] = x6[4];
+  output[4] = x6[5];
+  output[5] = x6[2];
+  output[6] = x6[7];
+  output[7] = x6[0];
+}
+
+static void fadst8x16_new_sse2(const __m128i *input, __m128i *output,
+                               int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i __zero = _mm_setzero_si128();
+  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+  __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+  __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+  __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+  __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+  __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+  __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+  __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+  __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+  __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+  __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+  __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+  __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+  __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+  __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+  __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+  __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+  __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+  __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+  __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+  __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+  // stage 1
+  __m128i x1[16];
+  x1[0] = input[0];
+  x1[1] = _mm_subs_epi16(__zero, input[15]);
+  x1[2] = _mm_subs_epi16(__zero, input[7]);
+  x1[3] = input[8];
+  x1[4] = _mm_subs_epi16(__zero, input[3]);
+  x1[5] = input[12];
+  x1[6] = input[4];
+  x1[7] = _mm_subs_epi16(__zero, input[11]);
+  x1[8] = _mm_subs_epi16(__zero, input[1]);
+  x1[9] = input[14];
+  x1[10] = input[6];
+  x1[11] = _mm_subs_epi16(__zero, input[9]);
+  x1[12] = input[2];
+  x1[13] = _mm_subs_epi16(__zero, input[13]);
+  x1[14] = _mm_subs_epi16(__zero, input[5]);
+  x1[15] = input[10];
+
+  // stage 2
+  __m128i x2[16];
+  x2[0] = x1[0];
+  x2[1] = x1[1];
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+  x2[4] = x1[4];
+  x2[5] = x1[5];
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+  x2[8] = x1[8];
+  x2[9] = x1[9];
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]);
+  x2[12] = x1[12];
+  x2[13] = x1[13];
+  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]);
+
+  // stage 3
+  __m128i x3[16];
+  x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+  x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+  x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+  x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+  x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+  x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+  x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+  x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+  x3[8] = _mm_adds_epi16(x2[8], x2[10]);
+  x3[10] = _mm_subs_epi16(x2[8], x2[10]);
+  x3[9] = _mm_adds_epi16(x2[9], x2[11]);
+  x3[11] = _mm_subs_epi16(x2[9], x2[11]);
+  x3[12] = _mm_adds_epi16(x2[12], x2[14]);
+  x3[14] = _mm_subs_epi16(x2[12], x2[14]);
+  x3[13] = _mm_adds_epi16(x2[13], x2[15]);
+  x3[15] = _mm_subs_epi16(x2[13], x2[15]);
+
+  // stage 4
+  __m128i x4[16];
+  x4[0] = x3[0];
+  x4[1] = x3[1];
+  x4[2] = x3[2];
+  x4[3] = x3[3];
+  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+  x4[8] = x3[8];
+  x4[9] = x3[9];
+  x4[10] = x3[10];
+  x4[11] = x3[11];
+  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]);
+  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]);
+
+  // stage 5
+  __m128i x5[16];
+  x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+  x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+  x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+  x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+  x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+  x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+  x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+  x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+  x5[8] = _mm_adds_epi16(x4[8], x4[12]);
+  x5[12] = _mm_subs_epi16(x4[8], x4[12]);
+  x5[9] = _mm_adds_epi16(x4[9], x4[13]);
+  x5[13] = _mm_subs_epi16(x4[9], x4[13]);
+  x5[10] = _mm_adds_epi16(x4[10], x4[14]);
+  x5[14] = _mm_subs_epi16(x4[10], x4[14]);
+  x5[11] = _mm_adds_epi16(x4[11], x4[15]);
+  x5[15] = _mm_subs_epi16(x4[11], x4[15]);
+
+  // stage 6
+  __m128i x6[16];
+  x6[0] = x5[0];
+  x6[1] = x5[1];
+  x6[2] = x5[2];
+  x6[3] = x5[3];
+  x6[4] = x5[4];
+  x6[5] = x5[5];
+  x6[6] = x5[6];
+  x6[7] = x5[7];
+  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]);
+  btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]);
+  btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]);
+  btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]);
+
+  // stage 7
+  __m128i x7[16];
+  x7[0] = _mm_adds_epi16(x6[0], x6[8]);
+  x7[8] = _mm_subs_epi16(x6[0], x6[8]);
+  x7[1] = _mm_adds_epi16(x6[1], x6[9]);
+  x7[9] = _mm_subs_epi16(x6[1], x6[9]);
+  x7[2] = _mm_adds_epi16(x6[2], x6[10]);
+  x7[10] = _mm_subs_epi16(x6[2], x6[10]);
+  x7[3] = _mm_adds_epi16(x6[3], x6[11]);
+  x7[11] = _mm_subs_epi16(x6[3], x6[11]);
+  x7[4] = _mm_adds_epi16(x6[4], x6[12]);
+  x7[12] = _mm_subs_epi16(x6[4], x6[12]);
+  x7[5] = _mm_adds_epi16(x6[5], x6[13]);
+  x7[13] = _mm_subs_epi16(x6[5], x6[13]);
+  x7[6] = _mm_adds_epi16(x6[6], x6[14]);
+  x7[14] = _mm_subs_epi16(x6[6], x6[14]);
+  x7[7] = _mm_adds_epi16(x6[7], x6[15]);
+  x7[15] = _mm_subs_epi16(x6[7], x6[15]);
+
+  // stage 8
+  __m128i x8[16];
+  btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]);
+  btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]);
+  btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]);
+  btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]);
+  btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]);
+  btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]);
+  btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]);
+  btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]);
+
+  // stage 9
+  output[0] = x8[1];
+  output[1] = x8[14];
+  output[2] = x8[3];
+  output[3] = x8[12];
+  output[4] = x8[5];
+  output[5] = x8[10];
+  output[6] = x8[7];
+  output[7] = x8[8];
+  output[8] = x8[9];
+  output[9] = x8[6];
+  output[10] = x8[11];
+  output[11] = x8[4];
+  output[12] = x8[13];
+  output[13] = x8[2];
+  output[14] = x8[15];
+  output[15] = x8[0];
+}
+
+static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = {
+  fdct4x4_new_sse2,       // DCT_DCT
+  fadst4x4_new_sse2,      // ADST_DCT
+  fdct4x4_new_sse2,       // DCT_ADST
+  fadst4x4_new_sse2,      // ADST_ADST
+  fadst4x4_new_sse2,      // FLIPADST_DCT
+  fdct4x4_new_sse2,       // DCT_FLIPADST
+  fadst4x4_new_sse2,      // FLIPADST_FLIPADST
+  fadst4x4_new_sse2,      // ADST_FLIPADST
+  fadst4x4_new_sse2,      // FLIPADST_ADST
+  fidentity4x4_new_sse2,  // IDTX
+  fdct4x4_new_sse2,       // V_DCT
+  fidentity4x4_new_sse2,  // H_DCT
+  fadst4x4_new_sse2,      // V_ADST
+  fidentity4x4_new_sse2,  // H_ADST
+  fadst4x4_new_sse2,      // V_FLIPADST
+  fidentity4x4_new_sse2   // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = {
+  fdct4x4_new_sse2,       // DCT_DCT
+  fdct4x4_new_sse2,       // ADST_DCT
+  fadst4x4_new_sse2,      // DCT_ADST
+  fadst4x4_new_sse2,      // ADST_ADST
+  fdct4x4_new_sse2,       // FLIPADST_DCT
+  fadst4x4_new_sse2,      // DCT_FLIPADST
+  fadst4x4_new_sse2,      // FLIPADST_FLIPADST
+  fadst4x4_new_sse2,      // ADST_FLIPADST
+  fadst4x4_new_sse2,      // FLIPADST_ADST
+  fidentity4x4_new_sse2,  // IDTX
+  fidentity4x4_new_sse2,  // V_DCT
+  fdct4x4_new_sse2,       // H_DCT
+  fidentity4x4_new_sse2,  // V_ADST
+  fadst4x4_new_sse2,      // H_ADST
+  fidentity4x4_new_sse2,  // V_FLIPADST
+  fadst4x4_new_sse2       // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_new_sse2,       // DCT_DCT
+  fadst4x8_new_sse2,      // ADST_DCT
+  fdct4x8_new_sse2,       // DCT_ADST
+  fadst4x8_new_sse2,      // ADST_ADST
+  fadst4x8_new_sse2,      // FLIPADST_DCT
+  fdct4x8_new_sse2,       // DCT_FLIPADST
+  fadst4x8_new_sse2,      // FLIPADST_FLIPADST
+  fadst4x8_new_sse2,      // ADST_FLIPADST
+  fadst4x8_new_sse2,      // FLIPADST_ADST
+  fidentity8x8_new_sse2,  // IDTX
+  fdct4x8_new_sse2,       // V_DCT
+  fidentity8x8_new_sse2,  // H_DCT
+  fadst4x8_new_sse2,      // V_ADST
+  fidentity8x8_new_sse2,  // H_ADST
+  fadst4x8_new_sse2,      // V_FLIPADST
+  fidentity8x8_new_sse2   // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_new_sse2,       // DCT_DCT
+  fdct8x4_new_sse2,       // ADST_DCT
+  fadst8x4_new_sse2,      // DCT_ADST
+  fadst8x4_new_sse2,      // ADST_ADST
+  fdct8x4_new_sse2,       // FLIPADST_DCT
+  fadst8x4_new_sse2,      // DCT_FLIPADST
+  fadst8x4_new_sse2,      // FLIPADST_FLIPADST
+  fadst8x4_new_sse2,      // ADST_FLIPADST
+  fadst8x4_new_sse2,      // FLIPADST_ADST
+  fidentity8x4_new_sse2,  // IDTX
+  fidentity8x4_new_sse2,  // V_DCT
+  fdct8x4_new_sse2,       // H_DCT
+  fidentity8x4_new_sse2,  // V_ADST
+  fadst8x4_new_sse2,      // H_ADST
+  fidentity8x4_new_sse2,  // V_FLIPADST
+  fadst8x4_new_sse2       // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_new_sse2,       // DCT_DCT
+  fadst8x4_new_sse2,      // ADST_DCT
+  fdct8x4_new_sse2,       // DCT_ADST
+  fadst8x4_new_sse2,      // ADST_ADST
+  fadst8x4_new_sse2,      // FLIPADST_DCT
+  fdct8x4_new_sse2,       // DCT_FLIPADST
+  fadst8x4_new_sse2,      // FLIPADST_FLIPADST
+  fadst8x4_new_sse2,      // ADST_FLIPADST
+  fadst8x4_new_sse2,      // FLIPADST_ADST
+  fidentity8x4_new_sse2,  // IDTX
+  fdct8x4_new_sse2,       // V_DCT
+  fidentity8x4_new_sse2,  // H_DCT
+  fadst8x4_new_sse2,      // V_ADST
+  fidentity8x4_new_sse2,  // H_ADST
+  fadst8x4_new_sse2,      // V_FLIPADST
+  fidentity8x4_new_sse2   // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_new_sse2,       // DCT_DCT
+  fdct4x8_new_sse2,       // ADST_DCT
+  fadst4x8_new_sse2,      // DCT_ADST
+  fadst4x8_new_sse2,      // ADST_ADST
+  fdct4x8_new_sse2,       // FLIPADST_DCT
+  fadst4x8_new_sse2,      // DCT_FLIPADST
+  fadst4x8_new_sse2,      // FLIPADST_FLIPADST
+  fadst4x8_new_sse2,      // ADST_FLIPADST
+  fadst4x8_new_sse2,      // FLIPADST_ADST
+  fidentity8x8_new_sse2,  // IDTX
+  fidentity8x8_new_sse2,  // V_DCT
+  fdct4x8_new_sse2,       // H_DCT
+  fidentity8x8_new_sse2,  // V_ADST
+  fadst4x8_new_sse2,      // H_ADST
+  fidentity8x8_new_sse2,  // V_FLIPADST
+  fadst4x8_new_sse2       // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_new_sse2,       // DCT_DCT
+  fadst8x8_new_sse2,      // ADST_DCT
+  fdct8x8_new_sse2,       // DCT_ADST
+  fadst8x8_new_sse2,      // ADST_ADST
+  fadst8x8_new_sse2,      // FLIPADST_DCT
+  fdct8x8_new_sse2,       // DCT_FLIPADST
+  fadst8x8_new_sse2,      // FLIPADST_FLIPADST
+  fadst8x8_new_sse2,      // ADST_FLIPADST
+  fadst8x8_new_sse2,      // FLIPADST_ADST
+  fidentity8x8_new_sse2,  // IDTX
+  fdct8x8_new_sse2,       // V_DCT
+  fidentity8x8_new_sse2,  // H_DCT
+  fadst8x8_new_sse2,      // V_ADST
+  fidentity8x8_new_sse2,  // H_ADST
+  fadst8x8_new_sse2,      // V_FLIPADST
+  fidentity8x8_new_sse2,  // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_new_sse2,       // DCT_DCT
+  fdct8x8_new_sse2,       // ADST_DCT
+  fadst8x8_new_sse2,      // DCT_ADST
+  fadst8x8_new_sse2,      // ADST_ADST
+  fdct8x8_new_sse2,       // FLIPADST_DCT
+  fadst8x8_new_sse2,      // DCT_FLIPADST
+  fadst8x8_new_sse2,      // FLIPADST_FLIPADST
+  fadst8x8_new_sse2,      // ADST_FLIPADST
+  fadst8x8_new_sse2,      // FLIPADST_ADST
+  fidentity8x8_new_sse2,  // IDTX
+  fidentity8x8_new_sse2,  // V_DCT
+  fdct8x8_new_sse2,       // H_DCT
+  fidentity8x8_new_sse2,  // V_ADST
+  fadst8x8_new_sse2,      // H_ADST
+  fidentity8x8_new_sse2,  // V_FLIPADST
+  fadst8x8_new_sse2       // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_new_sse2,       // DCT_DCT
+  fadst8x16_new_sse2,      // ADST_DCT
+  fdct8x16_new_sse2,       // DCT_ADST
+  fadst8x16_new_sse2,      // ADST_ADST
+  fadst8x16_new_sse2,      // FLIPADST_DCT
+  fdct8x16_new_sse2,       // DCT_FLIPADST
+  fadst8x16_new_sse2,      // FLIPADST_FLIPADST
+  fadst8x16_new_sse2,      // ADST_FLIPADST
+  fadst8x16_new_sse2,      // FLIPADST_ADST
+  fidentity8x16_new_sse2,  // IDTX
+  fdct8x16_new_sse2,       // V_DCT
+  fidentity8x16_new_sse2,  // H_DCT
+  fadst8x16_new_sse2,      // V_ADST
+  fidentity8x16_new_sse2,  // H_ADST
+  fadst8x16_new_sse2,      // V_FLIPADST
+  fidentity8x16_new_sse2   // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_new_sse2,       // DCT_DCT
+  fdct8x16_new_sse2,       // ADST_DCT
+  fadst8x16_new_sse2,      // DCT_ADST
+  fadst8x16_new_sse2,      // ADST_ADST
+  fdct8x16_new_sse2,       // FLIPADST_DCT
+  fadst8x16_new_sse2,      // DCT_FLIPADST
+  fadst8x16_new_sse2,      // FLIPADST_FLIPADST
+  fadst8x16_new_sse2,      // ADST_FLIPADST
+  fadst8x16_new_sse2,      // FLIPADST_ADST
+  fidentity8x16_new_sse2,  // IDTX
+  fidentity8x16_new_sse2,  // V_DCT
+  fdct8x16_new_sse2,       // H_DCT
+  fidentity8x16_new_sse2,  // V_ADST
+  fadst8x16_new_sse2,      // H_ADST
+  fidentity8x16_new_sse2,  // V_FLIPADST
+  fadst8x16_new_sse2       // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = {
+  fdct8x32_new_sse2,       // DCT_DCT
+  NULL,                    // ADST_DCT
+  NULL,                    // DCT_ADST
+  NULL,                    // ADST_ADST
+  NULL,                    // FLIPADST_DCT
+  NULL,                    // DCT_FLIPADST
+  NULL,                    // FLIPADST_FLIPADST
+  NULL,                    // ADST_FLIPADST
+  NULL,                    // FLIPADST_ADST
+  fidentity8x32_new_sse2,  // IDTX
+  fidentity8x32_new_sse2,  // V_DCT
+  fdct8x32_new_sse2,       // H_DCT
+  NULL,                    // V_ADST
+  NULL,                    // H_ADST
+  NULL,                    // V_FLIPADST
+  NULL                     // H_FLIPADST
+};
+
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[4], buf1[4], *buf;
+  const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 4;
+  const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+  }
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_4x4(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+  transpose_16bit_4x4(buf, buf);
+  store_buffer_16bit_to_32bit_w4(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)stride;
+  (void)bd;
+  __m128i buf0[8], buf1[8], *buf;
+  const int8_t *shift = fwd_txfm_shift_ls[TX_4X8];
+  const int txw_idx = get_txw_idx(TX_4X8);
+  const int txh_idx = get_txh_idx(TX_4X8);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 8;
+  const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+  }
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_4x8(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+  transpose_16bit_8x4(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[16], buf1[16];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_4X16];
+  const int txw_idx = get_txw_idx(TX_4X16);
+  const int txh_idx = get_txh_idx(TX_4X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 16;
+  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+  }
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_4x8(buf0, buf1);
+  transpose_16bit_4x8(buf0 + 8, buf1 + 8);
+
+  for (int i = 0; i < 2; i++) {
+    __m128i *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_sse2(buf1 + 8 * i, buf, width);
+    } else {
+      buf = buf1 + 8 * i;
+    }
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit(buf, width, shift[2]);
+    transpose_16bit_8x4(buf, buf);
+    store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[8], buf1[8], *buf;
+  const int8_t *shift = fwd_txfm_shift_ls[TX_8X4];
+  const int txw_idx = get_txw_idx(TX_8X4);
+  const int txh_idx = get_txh_idx(TX_8X4);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 4;
+  const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  if (ud_flip)
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  else
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_8x8(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+  transpose_16bit_8x8(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[8], buf1[8], *buf;
+  const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 8;
+  const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  if (ud_flip)
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  else
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_8x8(buf0, buf1);
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+  transpose_16bit_8x8(buf, buf);
+  store_buffer_16bit_to_32bit_w8(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[16], buf1[16];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+  const int txw_idx = get_txw_idx(TX_8X16);
+  const int txh_idx = get_txh_idx(TX_8X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 16;
+  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  }
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_8x8(buf0, buf1);
+  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+  for (int i = 0; i < 2; i++) {
+    __m128i *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_sse2(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit(buf, width, shift[2]);
+    transpose_16bit_8x8(buf, buf);
+    store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[32], buf1[32];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_8X32];
+  const int txw_idx = get_txw_idx(TX_8X32);
+  const int txh_idx = get_txh_idx(TX_8X32);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 32;
+  const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  }
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_8x8(buf0, buf1);
+  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+  transpose_16bit_8x8(buf0 + 16, buf1 + 16);
+  transpose_16bit_8x8(buf0 + 24, buf1 + 24);
+
+  for (int i = 0; i < 4; i++) {
+    __m128i *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_sse2(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit(buf, width, shift[2]);
+    transpose_16bit_8x8(buf, buf);
+    store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[16], buf1[16];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_16X4];
+  const int txw_idx = get_txw_idx(TX_16X4);
+  const int txh_idx = get_txh_idx(TX_16X4);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 4;
+  const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+  __m128i *buf;
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    }
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    transpose_16bit_8x4(buf0, buf1 + 8 * i);
+  }
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+  transpose_16bit_4x8(buf, buf);
+  store_buffer_16bit_to_32bit_w8(buf, output, width, height);
+  transpose_16bit_4x8(buf + 8, buf + 8);
+  store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[16], buf1[16];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 8;
+  const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+  __m128i *buf;
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    }
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    transpose_16bit_8x8(buf0, buf1 + 8 * i);
+  }
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+  transpose_16bit_8x8(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+  transpose_16bit_8x8(buf + 8, buf + 8);
+  store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[16], buf1[32];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
+  const int txw_idx = get_txw_idx(TX_16X16);
+  const int txh_idx = get_txh_idx(TX_16X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 16;
+  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    }
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+    transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+  }
+
+  for (int i = 0; i < 2; i++) {
+    __m128i *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_sse2(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit(buf, width, shift[2]);
+    transpose_16bit_8x8(buf, buf);
+    store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+    transpose_16bit_8x8(buf + 8, buf + 8);
+    store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+                                   8);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[32], buf1[64];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_16X32];
+  const int txw_idx = get_txw_idx(TX_16X32);
+  const int txh_idx = get_txh_idx(TX_16X32);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 32;
+  const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+    for (int i = 0; i < 2; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit(buf0, height, shift[0]);
+      col_txfm(buf0, buf0, cos_bit_col);
+      round_shift_16bit(buf0, height, shift[1]);
+      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 4; i++) {
+      __m128i *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_sse2(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row);
+      round_shift_16bit(buf, width, shift[2]);
+      transpose_16bit_8x8(buf, buf);
+      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+                                          8);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
+                                          width, 8);
+    }
+  } else {
+    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[32], buf1[32];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_32X8];
+  const int txw_idx = get_txw_idx(TX_32X8);
+  const int txh_idx = get_txh_idx(TX_32X8);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 8;
+  const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit(buf0, height, shift[0]);
+      col_txfm(buf0, buf0, cos_bit_col);
+      round_shift_16bit(buf0, height, shift[1]);
+      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 1; i++) {
+      __m128i *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_sse2(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row);
+      round_shift_16bit(buf, width, shift[2]);
+      transpose_16bit_8x8(buf, buf);
+      store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+                                     height);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+                                     height);
+      transpose_16bit_8x8(buf + 16, buf + 16);
+      store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+                                     width, height);
+      transpose_16bit_8x8(buf + 24, buf + 24);
+      store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+                                     width, height);
+    }
+  } else {
+    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[32], buf1[64];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
+  const int txw_idx = get_txw_idx(TX_32X16);
+  const int txh_idx = get_txh_idx(TX_32X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 16;
+  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit(buf0, height, shift[0]);
+      col_txfm(buf0, buf0, cos_bit_col);
+      round_shift_16bit(buf0, height, shift[1]);
+      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 2; i++) {
+      __m128i *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_sse2(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row);
+      round_shift_16bit(buf, width, shift[2]);
+      transpose_16bit_8x8(buf, buf);
+      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+                                          8);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
+                                          width, 8);
+      transpose_16bit_8x8(buf + 16, buf + 16);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+                                          width, 8);
+      transpose_16bit_8x8(buf + 24, buf + 24);
+      store_rect_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+                                          width, 8);
+    }
+  } else {
+    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[32], buf1[128];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_32X32];
+  const int txw_idx = get_txw_idx(TX_32X32);
+  const int txh_idx = get_txh_idx(TX_32X32);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 32;
+  const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit(buf0, height, shift[0]);
+      col_txfm(buf0, buf0, cos_bit_col);
+      round_shift_16bit(buf0, height, shift[1]);
+      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+    }
+
+    for (int i = 0; i < 4; i++) {
+      __m128i *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_sse2(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row);
+      round_shift_16bit(buf, width, shift[2]);
+      transpose_16bit_8x8(buf, buf);
+      store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+      transpose_16bit_8x8(buf + 8, buf + 8);
+      store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+                                     8);
+      transpose_16bit_8x8(buf + 16, buf + 16);
+      store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+                                     width, 8);
+      transpose_16bit_8x8(buf + 24, buf + 24);
+      store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+                                     width, 8);
+    }
+  } else {
+    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X16;
+  __m128i buf0[64], buf1[128];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_sse2 col_txfm = fdct8x16_new_sse2;
+  const transform_1d_sse2 row_txfm = fdct8x64_new_sse2;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < height_div8; ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+
+  for (int i = 0; i < height_div8; i++) {
+    __m128i *buf = buf1 + width * i;
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit(buf, width, shift[2]);
+    int32_t *output8 = output + 8 * 32 * i;
+    for (int j = 0; j < 4; ++j) {
+      __m128i *buf8 = buf + 8 * j;
+      transpose_16bit_8x8(buf8, buf8);
+      store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8);
+    }
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_16X64;
+  __m128i buf0[64], buf1[128];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+  const transform_1d_sse2 row_txfm = fdct8x16_new_sse2;
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int j = 0; j < height_div8; ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    __m128i *buf = buf1 + width * i;
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit(buf, width, shift[2]);
+    int32_t *output8 = output + 8 * width * i;
+    for (int j = 0; j < width_div8; ++j) {
+      __m128i *buf8 = buf + 8 * j;
+      transpose_16bit_8x8(buf8, buf8);
+      store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8);
+    }
+  }
+  // Zero out the bottom 16x32 area.
+  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+  av1_lowbd_fwd_txfm2d_4x4_sse2,    // 4x4 transform
+  av1_lowbd_fwd_txfm2d_8x8_sse2,    // 8x8 transform
+  av1_lowbd_fwd_txfm2d_16x16_sse2,  // 16x16 transform
+  av1_lowbd_fwd_txfm2d_32x32_sse2,  // 32x32 transform
+  NULL,                             // 64x64 transform
+  av1_lowbd_fwd_txfm2d_4x8_sse2,    // 4x8 transform
+  av1_lowbd_fwd_txfm2d_8x4_sse2,    // 8x4 transform
+  av1_lowbd_fwd_txfm2d_8x16_sse2,   // 8x16 transform
+  av1_lowbd_fwd_txfm2d_16x8_sse2,   // 16x8 transform
+  av1_lowbd_fwd_txfm2d_16x32_sse2,  // 16x32 transform
+  av1_lowbd_fwd_txfm2d_32x16_sse2,  // 32x16 transform
+  NULL,                             // 32x64 transform
+  NULL,                             // 64x32 transform
+  av1_lowbd_fwd_txfm2d_4x16_sse2,   // 4x16 transform
+  av1_lowbd_fwd_txfm2d_16x4_sse2,   // 16x4 transform
+  av1_lowbd_fwd_txfm2d_8x32_sse2,   // 8x32 transform
+  av1_lowbd_fwd_txfm2d_32x8_sse2,   // 32x8 transform
+  av1_lowbd_fwd_txfm2d_16x64_sse2,  // 16x64 transform
+  av1_lowbd_fwd_txfm2d_64x16_sse2,  // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff,
+                             int diff_stride, TxfmParam *txfm_param) {
+  FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+
+  if ((fwd_txfm2d_func == NULL) ||
+      (txfm_param->lossless && txfm_param->tx_size == TX_4X4))
+    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+  else
+    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+                    txfm_param->bd);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
new file mode 100644
index 000000000..aa14d3ade
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
+#define AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit);
+void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit);
+
+static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
+                                         __m128i *const output,
+                                         const int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 4; ++i) {
+    const __m128i a = _mm_unpacklo_epi16(input[i], one);
+    const __m128i b = scale_round_sse2(a, NewSqrt2);
+    output[i] = _mm_packs_epi32(b, b);
+  }
+}
+
+static INLINE void fidentity8x4_new_sse2(const __m128i *const input,
+                                         __m128i *const output,
+                                         const int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 4; ++i) {
+    const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+    const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+    const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+    const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+    output[i] = _mm_packs_epi32(b_lo, b_hi);
+  }
+}
+
+static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
+                                         int8_t cos_bit) {
+  (void)cos_bit;
+
+  output[0] = _mm_adds_epi16(input[0], input[0]);
+  output[1] = _mm_adds_epi16(input[1], input[1]);
+  output[2] = _mm_adds_epi16(input[2], input[2]);
+  output[3] = _mm_adds_epi16(input[3], input[3]);
+  output[4] = _mm_adds_epi16(input[4], input[4]);
+  output[5] = _mm_adds_epi16(input[5], input[5]);
+  output[6] = _mm_adds_epi16(input[6], input[6]);
+  output[7] = _mm_adds_epi16(input[7], input[7]);
+}
+
+static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output,
+                                          int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 16; ++i) {
+    const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+    const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+    const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2);
+    const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2);
+    output[i] = _mm_packs_epi32(b_lo, b_hi);
+  }
+}
+
+static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output,
+                                          int8_t cos_bit) {
+  (void)cos_bit;
+  for (int i = 0; i < 32; ++i) {
+    output[i] = _mm_slli_epi16(input[i], 2);
+  }
+}
+
+static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
+  fdct8x32_new_sse2,       // DCT_DCT
+  NULL,                    // ADST_DCT
+  NULL,                    // DCT_ADST
+  NULL,                    // ADST_ADST
+  NULL,                    // FLIPADST_DCT
+  NULL,                    // DCT_FLIPADST
+  NULL,                    // FLIPADST_FLIPADST
+  NULL,                    // ADST_FLIPADST
+  NULL,                    // FLIPADST_ADST
+  fidentity8x32_new_sse2,  // IDTX
+  fdct8x32_new_sse2,       // V_DCT
+  fidentity8x32_new_sse2,  // H_DCT
+  NULL,                    // V_ADST
+  NULL,                    // H_ADST
+  NULL,                    // V_FLIPADST
+  NULL                     // H_FLIPADST
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
index c8d4ccb70..b58911fcb 100644
--- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -11,7 +11,8 @@
 
 #include <immintrin.h>
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 
@@ -32,7 +33,10 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
                            const int16_t *dequant_ptr, int log_scale,
                            __m256i *qp) {
   __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
-  round = _mm_srai_epi16(round, log_scale);
+  if (log_scale) {
+    const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale));
+    round = _mm_mulhrs_epi16(round, round_scale);
+  }
   const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
   const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
 
@@ -45,8 +49,8 @@ static INLINE void quantize(const __m256i *qp, __m256i *c,
                             const int16_t *iscan_ptr, int log_scale,
                             tran_low_t *qcoeff, tran_low_t *dqcoeff,
                             __m256i *eob) {
-  const __m256i abs = _mm256_abs_epi32(*c);
-  __m256i q = _mm256_add_epi32(abs, qp[0]);
+  const __m256i abs_coeff = _mm256_abs_epi32(*c);
+  __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
 
   __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
   __m256i q_hi = _mm256_srli_epi64(q, 32);
@@ -56,6 +60,9 @@ static INLINE void quantize(const __m256i *qp, __m256i *c,
   q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
   q_hi = _mm256_slli_epi64(q_hi, 32);
   q = _mm256_or_si256(q_lo, q_hi);
+  const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
+  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
+  q = _mm256_andnot_si256(mask, q);
 
   __m256i dq = _mm256_mullo_epi32(q, qp[2]);
   dq = _mm256_srai_epi32(dq, log_scale);
@@ -81,8 +88,8 @@ static INLINE void quantize(const __m256i *qp, __m256i *c,
 }
 
 void av1_highbd_quantize_fp_avx2(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan, int log_scale) {
@@ -90,14 +97,23 @@ void av1_highbd_quantize_fp_avx2(
   (void)zbin_ptr;
   (void)quant_shift_ptr;
   const unsigned int step = 8;
+  __m256i qp[3], coeff;
 
-  if (LIKELY(!skip_block)) {
-    __m256i qp[3], coeff;
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
+  coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
 
-    init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
-    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  __m256i eob = _mm256_setzero_si256();
+  quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  coeff_ptr += step;
+  qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan += step;
+  n_coeffs -= step;
 
-    __m256i eob = _mm256_setzero_si256();
+  update_qp(qp);
+  while (n_coeffs > 0) {
+    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
     quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
 
     coeff_ptr += step;
@@ -105,39 +121,17 @@ void av1_highbd_quantize_fp_avx2(
     dqcoeff_ptr += step;
     iscan += step;
     n_coeffs -= step;
-
-    update_qp(qp);
-    while (n_coeffs > 0) {
-      coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
-      quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
-
-      coeff_ptr += step;
-      qcoeff_ptr += step;
-      dqcoeff_ptr += step;
-      iscan += step;
-      n_coeffs -= step;
-    }
-    {
-      __m256i eob_s;
-      eob_s = _mm256_shuffle_epi32(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 1);
-      eob = _mm256_max_epi16(eob, eob_s);
-      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
-                                              _mm256_extractf128_si256(eob, 1));
-      *eob_ptr = _mm_extract_epi16(final_eob, 0);
-    }
-  } else {
-    do {
-      const __m256i zero = _mm256_setzero_si256();
-      _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero);
-      _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero);
-      qcoeff_ptr += step;
-      dqcoeff_ptr += step;
-      n_coeffs -= step;
-    } while (n_coeffs > 0);
-    *eob_ptr = 0;
+  }
+  {
+    __m256i eob_s;
+    eob_s = _mm256_shuffle_epi32(eob, 0xe);
+    eob = _mm256_max_epi16(eob, eob_s);
+    eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+    eob = _mm256_max_epi16(eob, eob_s);
+    eob_s = _mm256_shufflelo_epi16(eob, 1);
+    eob = _mm256_max_epi16(eob, eob_s);
+    const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+                                            _mm256_extractf128_si256(eob, 1));
+    *eob_ptr = _mm_extract_epi16(final_eob, 0);
   }
 }
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
index 8d717a083..40b3b460b 100644
--- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -12,8 +12,10 @@
 #include <smmintrin.h>
 #include <stdint.h>
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
 
 // Coefficient quantization phase 1
 // param[0-2] : rounding/quan/dequan constants
@@ -36,6 +38,8 @@ static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
   qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
   dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
   dquan[0] = _mm_srli_epi64(dquan[0], scale);
+  const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+  qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
 }
 
 // Coefficient quantization phase 2
@@ -70,7 +74,8 @@ static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
 
   qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
   dquan[0] = _mm_sign_epi32(dquan[0], *sign);
-
+  qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+  dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
   _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
   _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
 }
@@ -108,12 +113,12 @@ static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
 }
 
 void av1_highbd_quantize_fp_sse4_1(
-    const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan, int log_scale) {
-  __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
+  __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
   __m128i eob = _mm_setzero_si128();
   const tran_low_t *src = coeff_ptr;
   tran_low_t *quanAddr = qcoeff_ptr;
@@ -121,7 +126,6 @@ void av1_highbd_quantize_fp_sse4_1(
   const int shift = 16 - log_scale;
   const int coeff_stride = 4;
   const int quan_stride = coeff_stride;
-  (void)skip_block;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
   (void)scan;
@@ -129,29 +133,54 @@ void av1_highbd_quantize_fp_sse4_1(
   memset(quanAddr, 0, count * sizeof(quanAddr[0]));
   memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
 
-  if (!skip_block) {
-    coeff[0] = _mm_loadu_si128((__m128i const *)src);
+  coeff[0] = _mm_loadu_si128((__m128i const *)src);
+  const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+  const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+
+  qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
+  qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]);
+  qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]);
+  qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
+                            dequant_ptr[0]);
+
+  // DC and first 3 AC
+  quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+                        &coeff_sign);
+
+  // update round/quan/dquan for AC
+  qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+  qparam[1] = xx_set1_64_from_32i(quant_ptr[1]);
+  qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]);
+  qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
+  quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+                        quanAddr, dquanAddr);
+
+  // next 4 AC
+  coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+  quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+                        &coeff_sign);
+  quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+                        quanAddr + quan_stride, dquanAddr + quan_stride);
+
+  find_eob(quanAddr, iscan, &eob);
+
+  count -= 8;
+
+  // loop for the rest of AC
+  while (count > 0) {
+    src += coeff_stride << 1;
+    quanAddr += quan_stride << 1;
+    dquanAddr += quan_stride << 1;
+    iscan += quan_stride << 1;
 
-    qparam[0] =
-        _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale,
-                      round_ptr[1] >> log_scale, round_ptr[0] >> log_scale);
-    qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]);
-    qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]);
+    coeff[0] = _mm_loadu_si128((__m128i const *)src);
+    coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
 
-    // DC and first 3 AC
     quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
                           &coeff_sign);
-
-    // update round/quan/dquan for AC
-    qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
-    qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]);
-    qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]);
-
     quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                           log_scale, quanAddr, dquanAddr);
 
-    // next 4 AC
-    coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
     quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
                           &coeff_sign);
     quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
@@ -161,34 +190,6 @@ void av1_highbd_quantize_fp_sse4_1(
     find_eob(quanAddr, iscan, &eob);
 
     count -= 8;
-
-    // loop for the rest of AC
-    while (count > 0) {
-      src += coeff_stride << 1;
-      quanAddr += quan_stride << 1;
-      dquanAddr += quan_stride << 1;
-      iscan += quan_stride << 1;
-
-      coeff[0] = _mm_loadu_si128((__m128i const *)src);
-      coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
-
-      quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff,
-                            dequant, &coeff_sign);
-      quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
-                            log_scale, quanAddr, dquanAddr);
-
-      quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff,
-                            dequant, &coeff_sign);
-      quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
-                            log_scale, quanAddr + quan_stride,
-                            dquanAddr + quan_stride);
-
-      find_eob(quanAddr, iscan, &eob);
-
-      count -= 8;
-    }
-    *eob_ptr = get_accumulated_eob(&eob);
-  } else {
-    *eob_ptr = 0;
   }
+  *eob_ptr = get_accumulated_eob(&eob);
 }
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
index 078a67510..df22aaba7 100644
--- a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -11,7 +11,8 @@
 
 #include <immintrin.h>
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 
@@ -57,7 +58,7 @@ static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
   init_one_qp(&round, &qp[0]);
   init_one_qp(&quant, &qp[1]);
 
-  if (log_scale > 0) {
+  if (log_scale == 1) {
     qp[1] = _mm256_slli_epi16(qp[1], log_scale);
   }
 
@@ -94,16 +95,25 @@ static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
     }                                            \
   } while (0)
 
+static INLINE uint16_t quant_gather_eob(__m256i eob) {
+  const __m128i eob_lo = _mm256_castsi256_si128(eob);
+  const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
+  __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
+  eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
+  eob_s = _mm_minpos_epu16(eob_s);
+  return INT16_MAX - _mm_extract_epi16(eob_s, 0);
+}
+
 static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
                             const int16_t *iscan_ptr, tran_low_t *qcoeff,
                             tran_low_t *dqcoeff, __m256i *eob) {
-  const __m256i abs = _mm256_abs_epi16(*c);
-  __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
-  mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+  const __m256i abs_coeff = _mm256_abs_epi16(*c);
+  __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+  mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
   const int nzflag = _mm256_movemask_epi8(mask);
 
   if (nzflag) {
-    __m256i q = _mm256_adds_epi16(abs, qp[0]);
+    __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
     q = _mm256_mulhi_epi16(q, qp[1]);
     q = _mm256_sign_epi16(q, *c);
     const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
@@ -123,8 +133,8 @@ static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
 }
 
 void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                          int skip_block, const int16_t *zbin_ptr,
-                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
                           const int16_t *quant_shift_ptr,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -134,15 +144,26 @@ void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   (void)quant_shift_ptr;
   const unsigned int step = 16;
 
-  if (LIKELY(!skip_block)) {
-    __m256i qp[3];
-    __m256i coeff, thr;
-    const int log_scale = 0;
+  __m256i qp[3];
+  __m256i coeff, thr;
+  const int log_scale = 0;
 
-    init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
-    read_coeff(coeff_ptr, &coeff);
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+  read_coeff(coeff_ptr, &coeff);
+
+  __m256i eob = _mm256_setzero_si256();
+  quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  coeff_ptr += step;
+  qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan_ptr += step;
+  n_coeffs -= step;
+
+  update_qp(log_scale, &thr, qp);
 
-    __m256i eob = _mm256_setzero_si256();
+  while (n_coeffs > 0) {
+    read_coeff(coeff_ptr, &coeff);
     quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
 
     coeff_ptr += step;
@@ -150,54 +171,21 @@ void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     dqcoeff_ptr += step;
     iscan_ptr += step;
     n_coeffs -= step;
-
-    update_qp(log_scale, &thr, qp);
-
-    while (n_coeffs > 0) {
-      read_coeff(coeff_ptr, &coeff);
-      quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
-
-      coeff_ptr += step;
-      qcoeff_ptr += step;
-      dqcoeff_ptr += step;
-      iscan_ptr += step;
-      n_coeffs -= step;
-    }
-    {
-      __m256i eob_s;
-      eob_s = _mm256_shuffle_epi32(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 1);
-      eob = _mm256_max_epi16(eob, eob_s);
-      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
-                                              _mm256_extractf128_si256(eob, 1));
-      *eob_ptr = _mm_extract_epi16(final_eob, 0);
-    }
-  } else {
-    do {
-      write_zero(qcoeff_ptr);
-      write_zero(dqcoeff_ptr);
-      qcoeff_ptr += step;
-      dqcoeff_ptr += step;
-      n_coeffs -= step;
-    } while (n_coeffs > 0);
-    *eob_ptr = 0;
   }
+  *eob_ptr = quant_gather_eob(eob);
 }
 
 static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
                                   __m256i *c, const int16_t *iscan_ptr,
                                   tran_low_t *qcoeff, tran_low_t *dqcoeff,
                                   __m256i *eob) {
-  const __m256i abs = _mm256_abs_epi16(*c);
-  __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
-  mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+  const __m256i abs_coeff = _mm256_abs_epi16(*c);
+  __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+  mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
   const int nzflag = _mm256_movemask_epi8(mask);
 
   if (nzflag) {
-    __m256i q = _mm256_adds_epi16(abs, qp[0]);
+    __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
     q = _mm256_mulhi_epu16(q, qp[1]);
 
     __m256i dq = _mm256_mullo_epi16(q, qp[2]);
@@ -221,8 +209,8 @@ static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
 }
 
 void av1_quantize_fp_32x32_avx2(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan_ptr, const int16_t *iscan_ptr) {
@@ -231,15 +219,26 @@ void av1_quantize_fp_32x32_avx2(
   (void)quant_shift_ptr;
   const unsigned int step = 16;
 
-  if (LIKELY(!skip_block)) {
-    __m256i qp[3];
-    __m256i coeff, thr;
-    const int log_scale = 1;
+  __m256i qp[3];
+  __m256i coeff, thr;
+  const int log_scale = 1;
 
-    init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
-    read_coeff(coeff_ptr, &coeff);
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+  read_coeff(coeff_ptr, &coeff);
 
-    __m256i eob = _mm256_setzero_si256();
+  __m256i eob = _mm256_setzero_si256();
+  quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  coeff_ptr += step;
+  qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan_ptr += step;
+  n_coeffs -= step;
+
+  update_qp(log_scale, &thr, qp);
+
+  while (n_coeffs > 0) {
+    read_coeff(coeff_ptr, &coeff);
     quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
 
     coeff_ptr += step;
@@ -247,40 +246,85 @@ void av1_quantize_fp_32x32_avx2(
     dqcoeff_ptr += step;
     iscan_ptr += step;
     n_coeffs -= step;
+  }
+  *eob_ptr = quant_gather_eob(eob);
+}
+
+static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp,
+                                  __m256i *c, const int16_t *iscan_ptr,
+                                  tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                  __m256i *eob) {
+  const __m256i abs_coeff = _mm256_abs_epi16(*c);
+  __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+  mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
+  const int nzflag = _mm256_movemask_epi8(mask);
 
-    update_qp(log_scale, &thr, qp);
-
-    while (n_coeffs > 0) {
-      read_coeff(coeff_ptr, &coeff);
-      quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
-                     &eob);
-
-      coeff_ptr += step;
-      qcoeff_ptr += step;
-      dqcoeff_ptr += step;
-      iscan_ptr += step;
-      n_coeffs -= step;
-    }
-    {
-      __m256i eob_s;
-      eob_s = _mm256_shuffle_epi32(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
-      eob = _mm256_max_epi16(eob, eob_s);
-      eob_s = _mm256_shufflelo_epi16(eob, 1);
-      eob = _mm256_max_epi16(eob, eob_s);
-      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
-                                              _mm256_extractf128_si256(eob, 1));
-      *eob_ptr = _mm_extract_epi16(final_eob, 0);
-    }
+  if (nzflag) {
+    __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
+    __m256i qh = _mm256_mulhi_epi16(q, qp[1]);
+    __m256i ql = _mm256_mullo_epi16(q, qp[1]);
+    qh = _mm256_slli_epi16(qh, 2);
+    ql = _mm256_srli_epi16(ql, 14);
+    q = _mm256_or_si256(qh, ql);
+    const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14);
+    const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2);
+    __m256i dq = _mm256_or_si256(dqh, dql);
+
+    q = _mm256_sign_epi16(q, *c);
+    dq = _mm256_sign_epi16(dq, *c);
+
+    store_two_quan(q, qcoeff, dq, dqcoeff);
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+    const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+    const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+    __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+    cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+    *eob = _mm256_max_epi16(*eob, cur_eob);
   } else {
-    do {
-      write_zero(qcoeff_ptr);
-      write_zero(dqcoeff_ptr);
-      qcoeff_ptr += step;
-      dqcoeff_ptr += step;
-      n_coeffs -= step;
-    } while (n_coeffs > 0);
-    *eob_ptr = 0;
+    write_zero(qcoeff);
+    write_zero(dqcoeff);
+  }
+}
+
+void av1_quantize_fp_64x64_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 16;
+
+  __m256i qp[3];
+  __m256i coeff, thr;
+  const int log_scale = 2;
+
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+  read_coeff(coeff_ptr, &coeff);
+
+  __m256i eob = _mm256_setzero_si256();
+  quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  coeff_ptr += step;
+  qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan_ptr += step;
+  n_coeffs -= step;
+
+  update_qp(log_scale, &thr, qp);
+
+  while (n_coeffs > 0) {
+    read_coeff(coeff_ptr, &coeff);
+    quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+    n_coeffs -= step;
   }
+  *eob_ptr = quant_gather_eob(eob);
 }
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
index 4f7c09546..b07e7717f 100644
--- a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
@@ -12,7 +12,8 @@
 #include <emmintrin.h>
 #include <xmmintrin.h>
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
@@ -67,16 +68,80 @@ static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
   }
 }
 
+static INLINE void quantize(const int16_t *iscan_ptr,
+                            const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const __m128i *round0, const __m128i *round1,
+                            const __m128i *quant0, const __m128i *quant1,
+                            const __m128i *dequant0, const __m128i *dequant1,
+                            const __m128i *thr0, const __m128i *thr1,
+                            __m128i *eob) {
+  __m128i coeff0, coeff1;
+  // Do DC and first 15 AC
+  read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
+
+  // Poor man's sign extract
+  const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+  __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+  const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0),
+                                     _mm_cmpeq_epi16(qcoeff0, *thr0));
+  const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1),
+                                     _mm_cmpeq_epi16(qcoeff1, *thr1));
+  const int16_t nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1);
+
+  if (nzflag) {
+    qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+    qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+    const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+    const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+    // Reinsert signs
+    qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+    qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+    qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+    qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+    write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
+
+    coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+    coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+    write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
+
+    const __m128i zero = _mm_setzero_si128();
+    // Scan for eob
+    const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+    const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+    const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+    const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+    const __m128i iscan0 =
+        _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+    const __m128i iscan1 =
+        _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+    // Add one to convert from indices to counts
+    const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+    const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+    const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+    const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+    const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+    *eob = _mm_max_epi16(*eob, eob2);
+  } else {
+    write_zero(qcoeff_ptr, n_coeffs);
+    write_zero(dqcoeff_ptr, n_coeffs);
+  }
+}
+
 void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                          int skip_block, const int16_t *zbin_ptr,
-                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
                           const int16_t *quant_shift_ptr,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
                           const int16_t *scan_ptr, const int16_t *iscan_ptr) {
-  __m128i zero;
-  __m128i thr;
-  int16_t nzflag;
   (void)scan_ptr;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
@@ -86,167 +151,39 @@ void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   qcoeff_ptr += n_coeffs;
   dqcoeff_ptr += n_coeffs;
   n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
-
-  if (!skip_block) {
-    __m128i eob;
-    __m128i round, quant, dequant;
-    {
-      __m128i coeff0, coeff1;
-
-      // Setup global values
-      {
-        round = _mm_load_si128((const __m128i *)round_ptr);
-        quant = _mm_load_si128((const __m128i *)quant_ptr);
-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-      }
-
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        // Do DC and first 15 AC
-        read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    thr = _mm_srai_epi16(dequant, 1);
-
-    // AC only loop
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-
-        read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
-                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
-        if (nzflag) {
-          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-          // Reinsert signs
-          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-          write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
-
-          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-          write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
-        } else {
-          write_zero(qcoeff_ptr, n_coeffs);
-          write_zero(dqcoeff_ptr, n_coeffs);
-        }
-      }
-
-      if (nzflag) {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // Accumulate EOB
-    {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
-    }
-  } else {
-    do {
-      write_zero(dqcoeff_ptr, n_coeffs);
-      write_zero(qcoeff_ptr, n_coeffs);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
+
+  const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+  const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+  const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+  const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+  const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+  const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+  const __m128i thr0 = _mm_srai_epi16(dequant0, 1);
+  const __m128i thr1 = _mm_srai_epi16(dequant1, 1);
+  __m128i eob = _mm_setzero_si128();
+
+  quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+           &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob);
+
+  n_coeffs += 8 * 2;
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+             &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1,
+             &eob);
+    n_coeffs += 8 * 2;
+  }
+
+  // Accumulate EOB
+  {
+    __m128i eob_shuffled;
+    eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+    eob = _mm_max_epi16(eob, eob_shuffled);
+    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+    eob = _mm_max_epi16(eob, eob_shuffled);
+    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+    eob = _mm_max_epi16(eob, eob_shuffled);
+    *eob_ptr = _mm_extract_epi16(eob, 1);
   }
 }
diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
index dcc697ba3..faa2a232a 100644
--- a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
+++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
@@ -47,6 +47,9 @@
         paddd           %1, xmm1
         SUM_ACROSS_Q    %1
 %endmacro
+
+SECTION .text
+
 ;void ssim_parms_sse2(
 ;    unsigned char *s,
 ;    int sp,
diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
new file mode 100644
index 000000000..0adefecdb
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_TXMF1D_SSE2_H_
+#define AV1_TXMF1D_SSE2_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
+                          const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
+                          const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+                           int8_t cos_bit);
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+                           int8_t cos_bit);
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
+                            const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
+                          const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
+                          const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
+                            const int8_t cos_bit, const int8_t *stage_range);
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+                                    __m128i *output) {
+  __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+  __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+  output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+  output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+  output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+  output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// the entire input block can be represent by a grid of 4x4 blocks
+// each 4x4 blocks can be represent by 4 vertical __m128i
+// we first transpose each 4x4 block internally
+// then transpose the grid
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+                                __m128i *output) {
+  const int num_per_128 = 4;
+  const int row_size = txfm_size;
+  const int col_size = txfm_size / num_per_128;
+  int r, c;
+
+  // transpose each 4x4 block internally
+  for (r = 0; r < row_size; r += 4) {
+    for (c = 0; c < col_size; c++) {
+      transpose_32_4x4(col_size, &input[r * col_size + c],
+                       &output[c * 4 * col_size + r / 4]);
+    }
+  }
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+  do {                                                         \
+    const __m128i ww0 = _mm_set1_epi32(w0);                    \
+    const __m128i ww1 = _mm_set1_epi32(w1);                    \
+    const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0);          \
+    const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1);          \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
+    out0 = av1_round_shift_32_sse4_1(out0, bit);               \
+    const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1);          \
+    const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0);          \
+    out1 = _mm_sub_epi32(in0_w1, in1_w0);                      \
+    out1 = av1_round_shift_32_sse4_1(out1, bit);               \
+  } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+  do {                                                         \
+    btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit);    \
+  } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+  do {                                                                  \
+    const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0);                   \
+    const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1);                   \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                               \
+    out0 = _mm_add_epi32(out0, r);                                      \
+    out0 = _mm_srai_epi32(out0, bit);                                   \
+    const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1);                   \
+    const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0);                   \
+    out1 = _mm_sub_epi32(in0_w1, in1_w0);                               \
+    out1 = _mm_add_epi32(out1, r);                                      \
+    out1 = _mm_srai_epi32(out1, bit);                                   \
+  } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+  do {                                                                  \
+    btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit);    \
+  } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AV1_TXMF1D_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
index 179da0d28..381f757da 100644
--- a/third_party/aom/av1/encoder/x86/corner_match_sse4.c
+++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
@@ -5,7 +5,8 @@
 
 #include <smmintrin.h>
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom_ports/mem.h"
 #include "av1/encoder/corner_match.h"
 
diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
deleted file mode 100644
index e5b19a44c..000000000
--- a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
+++ /dev/null
@@ -1,3483 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
-
-static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
-                                   int stride, int flipud, int fliplr) {
-  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
-  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
-  __m128i mask;
-
-  if (!flipud) {
-    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
-    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
-  } else {
-    in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
-    in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
-    in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-    in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  }
-
-  if (fliplr) {
-    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
-    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
-    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
-    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
-  }
-
-  in[0] = _mm_slli_epi16(in[0], 4);
-  in[1] = _mm_slli_epi16(in[1], 4);
-  in[2] = _mm_slli_epi16(in[2], 4);
-  in[3] = _mm_slli_epi16(in[3], 4);
-
-  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
-  in[0] = _mm_add_epi16(in[0], mask);
-  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
-}
-
-static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
-  const __m128i kOne = _mm_set1_epi16(1);
-  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
-  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
-  __m128i out01 = _mm_add_epi16(in01, kOne);
-  __m128i out23 = _mm_add_epi16(in23, kOne);
-  out01 = _mm_srai_epi16(out01, 2);
-  out23 = _mm_srai_epi16(out23, 2);
-  store_output(&out01, (output + 0 * 8));
-  store_output(&out23, (output + 1 * 8));
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
-  // Combine and transpose
-  // 00 01 02 03 20 21 22 23
-  // 10 11 12 13 30 31 32 33
-  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
-  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
-  // 00 10 01 11 02 12 03 13
-  // 20 30 21 31 22 32 23 33
-  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
-
-  // 00 10 20 30 01 11 21 31
-  // 02 12 22 32 03 13 23 33
-  // only use the first 4 16-bit integers
-  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
-  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
-}
-
-static void fdct4_sse2(__m128i *in) {
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u[4], v[4];
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpacklo_epi16(in[3], in[2]);
-
-  v[0] = _mm_add_epi16(u[0], u[1]);
-  v[1] = _mm_sub_epi16(u[0], u[1]);
-
-  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
-  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
-  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
-  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u[0], u[1]);
-  in[1] = _mm_packs_epi32(u[2], u[3]);
-  transpose_4x4(in);
-}
-
-static void fadst4_sse2(__m128i *in) {
-  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
-  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
-  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
-  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
-  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8];
-  __m128i in7 = _mm_add_epi16(in[0], in[1]);
-
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
-  u[2] = _mm_unpacklo_epi16(in7, kZero);
-  u[3] = _mm_unpacklo_epi16(in[2], kZero);
-  u[4] = _mm_unpacklo_epi16(in[3], kZero);
-
-  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
-  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
-  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
-  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
-  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
-  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
-  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
-
-  u[0] = _mm_add_epi32(v[0], v[1]);
-  u[1] = _mm_sub_epi32(v[2], v[6]);
-  u[2] = _mm_add_epi32(v[3], v[4]);
-  u[3] = _mm_sub_epi32(u[2], u[0]);
-  u[4] = _mm_slli_epi32(v[5], 2);
-  u[5] = _mm_sub_epi32(u[4], v[5]);
-  u[6] = _mm_add_epi32(u[3], u[5]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u[0], u[2]);
-  in[1] = _mm_packs_epi32(u[1], u[3]);
-  transpose_4x4(in);
-}
-
-#if CONFIG_EXT_TX
-static void fidtx4_sse2(__m128i *in) {
-  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
-  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i v0, v1, v2, v3;
-  __m128i u0, u1, u2, u3;
-
-  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
-  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
-  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
-  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
-
-  u0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
-  u1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
-  u2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
-  u3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
-
-  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u0, u2);
-  in[1] = _mm_packs_epi32(u1, u3);
-  transpose_4x4(in);
-}
-#endif  // CONFIG_EXT_TX
-
-void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     TxfmParam *txfm_param) {
-  __m128i in[4];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
-  switch (tx_type) {
-    case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break;
-    case ADST_DCT:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fadst4_sse2(in);
-      fdct4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case DCT_ADST:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fdct4_sse2(in);
-      fadst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case ADST_ADST:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      load_buffer_4x4(input, in, stride, 1, 0);
-      fadst4_sse2(in);
-      fdct4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_4x4(input, in, stride, 0, 1);
-      fdct4_sse2(in);
-      fadst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_4x4(input, in, stride, 1, 1);
-      fadst4_sse2(in);
-      fadst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_4x4(input, in, stride, 0, 1);
-      fadst4_sse2(in);
-      fadst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_4x4(input, in, stride, 1, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case IDTX:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fidtx4_sse2(in);
-      fidtx4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case V_DCT:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fdct4_sse2(in);
-      fidtx4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case H_DCT:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fidtx4_sse2(in);
-      fdct4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case V_ADST:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fadst4_sse2(in);
-      fidtx4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case H_ADST:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fidtx4_sse2(in);
-      fadst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case V_FLIPADST:
-      load_buffer_4x4(input, in, stride, 1, 0);
-      fadst4_sse2(in);
-      fidtx4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case H_FLIPADST:
-      load_buffer_4x4(input, in, stride, 0, 1);
-      fidtx4_sse2(in);
-      fadst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0);
-  }
-}
-
-// load 8x8 array
-static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
-                                   int stride, int flipud, int fliplr) {
-  if (!flipud) {
-    in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
-    in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
-    in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
-    in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
-    in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
-    in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
-    in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
-    in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
-  } else {
-    in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
-    in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
-    in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
-    in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
-    in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
-    in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
-    in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
-    in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  }
-
-  if (fliplr) {
-    in[0] = mm_reverse_epi16(in[0]);
-    in[1] = mm_reverse_epi16(in[1]);
-    in[2] = mm_reverse_epi16(in[2]);
-    in[3] = mm_reverse_epi16(in[3]);
-    in[4] = mm_reverse_epi16(in[4]);
-    in[5] = mm_reverse_epi16(in[5]);
-    in[6] = mm_reverse_epi16(in[6]);
-    in[7] = mm_reverse_epi16(in[7]);
-  }
-
-  in[0] = _mm_slli_epi16(in[0], 2);
-  in[1] = _mm_slli_epi16(in[1], 2);
-  in[2] = _mm_slli_epi16(in[2], 2);
-  in[3] = _mm_slli_epi16(in[3], 2);
-  in[4] = _mm_slli_epi16(in[4], 2);
-  in[5] = _mm_slli_epi16(in[5], 2);
-  in[6] = _mm_slli_epi16(in[6], 2);
-  in[7] = _mm_slli_epi16(in[7], 2);
-}
-
-// right shift and rounding
-static INLINE void right_shift_8x8(__m128i *res, const int bit) {
-  __m128i sign0 = _mm_srai_epi16(res[0], 15);
-  __m128i sign1 = _mm_srai_epi16(res[1], 15);
-  __m128i sign2 = _mm_srai_epi16(res[2], 15);
-  __m128i sign3 = _mm_srai_epi16(res[3], 15);
-  __m128i sign4 = _mm_srai_epi16(res[4], 15);
-  __m128i sign5 = _mm_srai_epi16(res[5], 15);
-  __m128i sign6 = _mm_srai_epi16(res[6], 15);
-  __m128i sign7 = _mm_srai_epi16(res[7], 15);
-
-  if (bit == 2) {
-    const __m128i const_rounding = _mm_set1_epi16(1);
-    res[0] = _mm_adds_epi16(res[0], const_rounding);
-    res[1] = _mm_adds_epi16(res[1], const_rounding);
-    res[2] = _mm_adds_epi16(res[2], const_rounding);
-    res[3] = _mm_adds_epi16(res[3], const_rounding);
-    res[4] = _mm_adds_epi16(res[4], const_rounding);
-    res[5] = _mm_adds_epi16(res[5], const_rounding);
-    res[6] = _mm_adds_epi16(res[6], const_rounding);
-    res[7] = _mm_adds_epi16(res[7], const_rounding);
-  }
-
-  res[0] = _mm_sub_epi16(res[0], sign0);
-  res[1] = _mm_sub_epi16(res[1], sign1);
-  res[2] = _mm_sub_epi16(res[2], sign2);
-  res[3] = _mm_sub_epi16(res[3], sign3);
-  res[4] = _mm_sub_epi16(res[4], sign4);
-  res[5] = _mm_sub_epi16(res[5], sign5);
-  res[6] = _mm_sub_epi16(res[6], sign6);
-  res[7] = _mm_sub_epi16(res[7], sign7);
-
-  if (bit == 1) {
-    res[0] = _mm_srai_epi16(res[0], 1);
-    res[1] = _mm_srai_epi16(res[1], 1);
-    res[2] = _mm_srai_epi16(res[2], 1);
-    res[3] = _mm_srai_epi16(res[3], 1);
-    res[4] = _mm_srai_epi16(res[4], 1);
-    res[5] = _mm_srai_epi16(res[5], 1);
-    res[6] = _mm_srai_epi16(res[6], 1);
-    res[7] = _mm_srai_epi16(res[7], 1);
-  } else {
-    res[0] = _mm_srai_epi16(res[0], 2);
-    res[1] = _mm_srai_epi16(res[1], 2);
-    res[2] = _mm_srai_epi16(res[2], 2);
-    res[3] = _mm_srai_epi16(res[3], 2);
-    res[4] = _mm_srai_epi16(res[4], 2);
-    res[5] = _mm_srai_epi16(res[5], 2);
-    res[6] = _mm_srai_epi16(res[6], 2);
-    res[7] = _mm_srai_epi16(res[7], 2);
-  }
-}
-
-// write 8x8 array
-static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
-                                    int stride) {
-  store_output(&res[0], (output + 0 * stride));
-  store_output(&res[1], (output + 1 * stride));
-  store_output(&res[2], (output + 2 * stride));
-  store_output(&res[3], (output + 3 * stride));
-  store_output(&res[4], (output + 4 * stride));
-  store_output(&res[5], (output + 5 * stride));
-  store_output(&res[6], (output + 6 * stride));
-  store_output(&res[7], (output + 7 * stride));
-}
-
-// perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-  // 00 10 01 11 02 12 03 13
-  // 20 30 21 31 22 32 23 33
-  // 04 14 05 15 06 16 07 17
-  // 24 34 25 35 26 36 27 37
-  // 40 50 41 51 42 52 43 53
-  // 60 70 61 71 62 72 63 73
-  // 44 54 45 55 46 56 47 57
-  // 64 74 65 75 66 76 67 77
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-  // 00 10 20 30 01 11 21 31
-  // 40 50 60 70 41 51 61 71
-  // 02 12 22 32 03 13 23 33
-  // 42 52 62 72 43 53 63 73
-  // 04 14 24 34 05 15 25 35
-  // 44 54 64 74 45 55 65 75
-  // 06 16 26 36 07 17 27 37
-  // 46 56 66 76 47 57 67 77
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-  // 00 10 20 30 40 50 60 70
-  // 01 11 21 31 41 51 61 71
-  // 02 12 22 32 42 52 62 72
-  // 03 13 23 33 43 53 63 73
-  // 04 14 24 34 44 54 64 74
-  // 05 15 25 35 45 55 65 75
-  // 06 16 26 36 46 56 66 76
-  // 07 17 27 37 47 57 67 77
-}
-
-static void fdct8_sse2(__m128i *in) {
-  // constants
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-
-  // stage 1
-  s0 = _mm_add_epi16(in[0], in[7]);
-  s1 = _mm_add_epi16(in[1], in[6]);
-  s2 = _mm_add_epi16(in[2], in[5]);
-  s3 = _mm_add_epi16(in[3], in[4]);
-  s4 = _mm_sub_epi16(in[3], in[4]);
-  s5 = _mm_sub_epi16(in[2], in[5]);
-  s6 = _mm_sub_epi16(in[1], in[6]);
-  s7 = _mm_sub_epi16(in[0], in[7]);
-
-  u0 = _mm_add_epi16(s0, s3);
-  u1 = _mm_add_epi16(s1, s2);
-  u2 = _mm_sub_epi16(s1, s2);
-  u3 = _mm_sub_epi16(s0, s3);
-  // interleave and perform butterfly multiplication/addition
-  v0 = _mm_unpacklo_epi16(u0, u1);
-  v1 = _mm_unpackhi_epi16(u0, u1);
-  v2 = _mm_unpacklo_epi16(u2, u3);
-  v3 = _mm_unpackhi_epi16(u2, u3);
-
-  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
-  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
-  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
-  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
-  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
-  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
-  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
-  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
-
-  // shift and rounding
-  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[4] = _mm_packs_epi32(u2, u3);
-  in[6] = _mm_packs_epi32(u6, u7);
-
-  // stage 2
-  // interleave and perform butterfly multiplication/addition
-  u0 = _mm_unpacklo_epi16(s6, s5);
-  u1 = _mm_unpackhi_epi16(s6, s5);
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-
-  // shift and rounding
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-
-  u0 = _mm_packs_epi32(v0, v1);
-  u1 = _mm_packs_epi32(v2, v3);
-
-  // stage 3
-  s0 = _mm_add_epi16(s4, u0);
-  s1 = _mm_sub_epi16(s4, u0);
-  s2 = _mm_sub_epi16(s7, u1);
-  s3 = _mm_add_epi16(s7, u1);
-
-  // stage 4
-  u0 = _mm_unpacklo_epi16(s0, s3);
-  u1 = _mm_unpackhi_epi16(s0, s3);
-  u2 = _mm_unpacklo_epi16(s1, s2);
-  u3 = _mm_unpackhi_epi16(s1, s2);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
-  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
-  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
-  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
-  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
-  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
-  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
-  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
-
-  // shift and rounding
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  in[1] = _mm_packs_epi32(v0, v1);
-  in[3] = _mm_packs_epi32(v4, v5);
-  in[5] = _mm_packs_epi32(v2, v3);
-  in[7] = _mm_packs_epi32(v6, v7);
-
-  // transpose
-  array_transpose_8x8(in, in);
-}
-
-static void fadst8_sse2(__m128i *in) {
-  // Constants
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
-  // properly aligned for butterfly input
-  in0 = in[7];
-  in1 = in[0];
-  in2 = in[5];
-  in3 = in[2];
-  in4 = in[3];
-  in5 = in[4];
-  in6 = in[1];
-  in7 = in[6];
-
-  // column transformation
-  // stage 1
-  // interleave and multiply/add into 32-bit integer
-  s0 = _mm_unpacklo_epi16(in0, in1);
-  s1 = _mm_unpackhi_epi16(in0, in1);
-  s2 = _mm_unpacklo_epi16(in2, in3);
-  s3 = _mm_unpackhi_epi16(in2, in3);
-  s4 = _mm_unpacklo_epi16(in4, in5);
-  s5 = _mm_unpackhi_epi16(in4, in5);
-  s6 = _mm_unpacklo_epi16(in6, in7);
-  s7 = _mm_unpackhi_epi16(in6, in7);
-
-  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
-  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
-  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
-  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
-  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
-  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
-  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
-  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
-  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
-  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
-  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
-  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
-  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
-  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
-  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
-  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
-  // addition
-  w0 = _mm_add_epi32(u0, u8);
-  w1 = _mm_add_epi32(u1, u9);
-  w2 = _mm_add_epi32(u2, u10);
-  w3 = _mm_add_epi32(u3, u11);
-  w4 = _mm_add_epi32(u4, u12);
-  w5 = _mm_add_epi32(u5, u13);
-  w6 = _mm_add_epi32(u6, u14);
-  w7 = _mm_add_epi32(u7, u15);
-  w8 = _mm_sub_epi32(u0, u8);
-  w9 = _mm_sub_epi32(u1, u9);
-  w10 = _mm_sub_epi32(u2, u10);
-  w11 = _mm_sub_epi32(u3, u11);
-  w12 = _mm_sub_epi32(u4, u12);
-  w13 = _mm_sub_epi32(u5, u13);
-  w14 = _mm_sub_epi32(u6, u14);
-  w15 = _mm_sub_epi32(u7, u15);
-
-  // shift and rounding
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
-  // back to 16-bit and pack 8 integers into __m128i
-  v0 = _mm_add_epi32(w0, w4);
-  v1 = _mm_add_epi32(w1, w5);
-  v2 = _mm_add_epi32(w2, w6);
-  v3 = _mm_add_epi32(w3, w7);
-  v4 = _mm_sub_epi32(w0, w4);
-  v5 = _mm_sub_epi32(w1, w5);
-  v6 = _mm_sub_epi32(w2, w6);
-  v7 = _mm_sub_epi32(w3, w7);
-
-  w0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  w1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  w2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  w3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  w4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  w5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  w6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  w7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(w0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(w1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(w2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(w3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(w4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(w5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(w6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(w7, DCT_CONST_BITS);
-
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
-
-  // stage 2
-  s0 = _mm_packs_epi32(v0, v1);
-  s1 = _mm_packs_epi32(v2, v3);
-  s2 = _mm_packs_epi32(v4, v5);
-  s3 = _mm_packs_epi32(v6, v7);
-
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
-
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  // back to 16-bit intergers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
-
-  // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  // FIXME(jingning): do subtract using bit inversion?
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
-
-  // transpose
-  array_transpose_8x8(in, in);
-}
-
-#if CONFIG_EXT_TX
-static void fidtx8_sse2(__m128i *in) {
-  in[0] = _mm_slli_epi16(in[0], 1);
-  in[1] = _mm_slli_epi16(in[1], 1);
-  in[2] = _mm_slli_epi16(in[2], 1);
-  in[3] = _mm_slli_epi16(in[3], 1);
-  in[4] = _mm_slli_epi16(in[4], 1);
-  in[5] = _mm_slli_epi16(in[5], 1);
-  in[6] = _mm_slli_epi16(in[6], 1);
-  in[7] = _mm_slli_epi16(in[7], 1);
-
-  array_transpose_8x8(in, in);
-}
-#endif  // CONFIG_EXT_TX
-
-void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     TxfmParam *txfm_param) {
-  __m128i in[8];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
-  switch (tx_type) {
-    case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break;
-    case ADST_DCT:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fadst8_sse2(in);
-      fdct8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case DCT_ADST:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fdct8_sse2(in);
-      fadst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case ADST_ADST:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fadst8_sse2(in);
-      fadst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      load_buffer_8x8(input, in, stride, 1, 0);
-      fadst8_sse2(in);
-      fdct8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_8x8(input, in, stride, 0, 1);
-      fdct8_sse2(in);
-      fadst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_8x8(input, in, stride, 1, 1);
-      fadst8_sse2(in);
-      fadst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_8x8(input, in, stride, 0, 1);
-      fadst8_sse2(in);
-      fadst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_8x8(input, in, stride, 1, 0);
-      fadst8_sse2(in);
-      fadst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case IDTX:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fidtx8_sse2(in);
-      fidtx8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case V_DCT:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fdct8_sse2(in);
-      fidtx8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case H_DCT:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fidtx8_sse2(in);
-      fdct8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case V_ADST:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fadst8_sse2(in);
-      fidtx8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case H_ADST:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fidtx8_sse2(in);
-      fadst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case V_FLIPADST:
-      load_buffer_8x8(input, in, stride, 1, 0);
-      fadst8_sse2(in);
-      fidtx8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case H_FLIPADST:
-      load_buffer_8x8(input, in, stride, 0, 1);
-      fidtx8_sse2(in);
-      fadst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0);
-  }
-}
-
-static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
-                                     __m128i *in1, int stride, int flipud,
-                                     int fliplr) {
-  // Load 4 8x8 blocks
-  const int16_t *topL = input;
-  const int16_t *topR = input + 8;
-  const int16_t *botL = input + 8 * stride;
-  const int16_t *botR = input + 8 * stride + 8;
-
-  const int16_t *tmp;
-
-  if (flipud) {
-    // Swap left columns
-    tmp = topL;
-    topL = botL;
-    botL = tmp;
-    // Swap right columns
-    tmp = topR;
-    topR = botR;
-    botR = tmp;
-  }
-
-  if (fliplr) {
-    // Swap top rows
-    tmp = topL;
-    topL = topR;
-    topR = tmp;
-    // Swap bottom rows
-    tmp = botL;
-    botL = botR;
-    botR = tmp;
-  }
-
-  // load first 8 columns
-  load_buffer_8x8(topL, in0, stride, flipud, fliplr);
-  load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr);
-
-  // load second 8 columns
-  load_buffer_8x8(topR, in1, stride, flipud, fliplr);
-  load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr);
-}
-
-static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
-                                      __m128i *in1, int stride) {
-  // write first 8 columns
-  write_buffer_8x8(output, in0, stride);
-  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
-  // write second 8 columns
-  output += 8;
-  write_buffer_8x8(output, in1, stride);
-  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  array_transpose_8x8(res0, res0);
-  array_transpose_8x8(res1, tbuf);
-  array_transpose_8x8(res0 + 8, res1);
-  array_transpose_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
-static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
-  // perform rounding operations
-  right_shift_8x8(res0, 2);
-  right_shift_8x8(res0 + 8, 2);
-  right_shift_8x8(res1, 2);
-  right_shift_8x8(res1 + 8, 2);
-}
-
-static void fdct16_8col(__m128i *in) {
-  // perform 16x16 1-D DCT for 8 columns
-  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  // stage 1
-  i[0] = _mm_add_epi16(in[0], in[15]);
-  i[1] = _mm_add_epi16(in[1], in[14]);
-  i[2] = _mm_add_epi16(in[2], in[13]);
-  i[3] = _mm_add_epi16(in[3], in[12]);
-  i[4] = _mm_add_epi16(in[4], in[11]);
-  i[5] = _mm_add_epi16(in[5], in[10]);
-  i[6] = _mm_add_epi16(in[6], in[9]);
-  i[7] = _mm_add_epi16(in[7], in[8]);
-
-  s[0] = _mm_sub_epi16(in[7], in[8]);
-  s[1] = _mm_sub_epi16(in[6], in[9]);
-  s[2] = _mm_sub_epi16(in[5], in[10]);
-  s[3] = _mm_sub_epi16(in[4], in[11]);
-  s[4] = _mm_sub_epi16(in[3], in[12]);
-  s[5] = _mm_sub_epi16(in[2], in[13]);
-  s[6] = _mm_sub_epi16(in[1], in[14]);
-  s[7] = _mm_sub_epi16(in[0], in[15]);
-
-  p[0] = _mm_add_epi16(i[0], i[7]);
-  p[1] = _mm_add_epi16(i[1], i[6]);
-  p[2] = _mm_add_epi16(i[2], i[5]);
-  p[3] = _mm_add_epi16(i[3], i[4]);
-  p[4] = _mm_sub_epi16(i[3], i[4]);
-  p[5] = _mm_sub_epi16(i[2], i[5]);
-  p[6] = _mm_sub_epi16(i[1], i[6]);
-  p[7] = _mm_sub_epi16(i[0], i[7]);
-
-  u[0] = _mm_add_epi16(p[0], p[3]);
-  u[1] = _mm_add_epi16(p[1], p[2]);
-  u[2] = _mm_sub_epi16(p[1], p[2]);
-  u[3] = _mm_sub_epi16(p[0], p[3]);
-
-  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
-  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
-  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
-  v[3] = _mm_unpackhi_epi16(u[2], u[3]);
-
-  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
-  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
-  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
-  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
-  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
-  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
-  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
-  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u[0], u[1]);
-  in[4] = _mm_packs_epi32(u[4], u[5]);
-  in[8] = _mm_packs_epi32(u[2], u[3]);
-  in[12] = _mm_packs_epi32(u[6], u[7]);
-
-  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
-  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
-  u[0] = _mm_packs_epi32(v[0], v[1]);
-  u[1] = _mm_packs_epi32(v[2], v[3]);
-
-  t[0] = _mm_add_epi16(p[4], u[0]);
-  t[1] = _mm_sub_epi16(p[4], u[0]);
-  t[2] = _mm_sub_epi16(p[7], u[1]);
-  t[3] = _mm_add_epi16(p[7], u[1]);
-
-  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
-  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
-  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
-  u[3] = _mm_unpackhi_epi16(t[1], t[2]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
-  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
-  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
-  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
-  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  in[2] = _mm_packs_epi32(v[0], v[1]);
-  in[6] = _mm_packs_epi32(v[4], v[5]);
-  in[10] = _mm_packs_epi32(v[2], v[3]);
-  in[14] = _mm_packs_epi32(v[6], v[7]);
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
-  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
-  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
-  u[3] = _mm_unpackhi_epi16(s[3], s[4]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[2] = _mm_packs_epi32(v[0], v[1]);
-  t[3] = _mm_packs_epi32(v[2], v[3]);
-  t[4] = _mm_packs_epi32(v[4], v[5]);
-  t[5] = _mm_packs_epi32(v[6], v[7]);
-
-  // stage 3
-  p[0] = _mm_add_epi16(s[0], t[3]);
-  p[1] = _mm_add_epi16(s[1], t[2]);
-  p[2] = _mm_sub_epi16(s[1], t[2]);
-  p[3] = _mm_sub_epi16(s[0], t[3]);
-  p[4] = _mm_sub_epi16(s[7], t[4]);
-  p[5] = _mm_sub_epi16(s[6], t[5]);
-  p[6] = _mm_add_epi16(s[6], t[5]);
-  p[7] = _mm_add_epi16(s[7], t[4]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
-  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
-  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
-  u[3] = _mm_unpackhi_epi16(p[2], p[5]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
-  v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
-  v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
-  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
-  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[1] = _mm_packs_epi32(v[0], v[1]);
-  t[2] = _mm_packs_epi32(v[2], v[3]);
-  t[5] = _mm_packs_epi32(v[4], v[5]);
-  t[6] = _mm_packs_epi32(v[6], v[7]);
-
-  // stage 5
-  s[0] = _mm_add_epi16(p[0], t[1]);
-  s[1] = _mm_sub_epi16(p[0], t[1]);
-  s[2] = _mm_sub_epi16(p[3], t[2]);
-  s[3] = _mm_add_epi16(p[3], t[2]);
-  s[4] = _mm_add_epi16(p[4], t[5]);
-  s[5] = _mm_sub_epi16(p[4], t[5]);
-  s[6] = _mm_sub_epi16(p[7], t[6]);
-  s[7] = _mm_add_epi16(p[7], t[6]);
-
-  // stage 6
-  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
-  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
-  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
-  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
-  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
-  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
-  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
-  u[7] = _mm_unpackhi_epi16(s[3], s[4]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
-  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
-  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
-  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
-  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
-  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
-  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
-  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
-  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
-  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
-  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
-  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
-  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  in[1] = _mm_packs_epi32(v[0], v[1]);
-  in[9] = _mm_packs_epi32(v[2], v[3]);
-  in[5] = _mm_packs_epi32(v[4], v[5]);
-  in[13] = _mm_packs_epi32(v[6], v[7]);
-  in[3] = _mm_packs_epi32(v[8], v[9]);
-  in[11] = _mm_packs_epi32(v[10], v[11]);
-  in[7] = _mm_packs_epi32(v[12], v[13]);
-  in[15] = _mm_packs_epi32(v[14], v[15]);
-}
-
-static void fadst16_8col(__m128i *in) {
-  // perform 16x16 1-D ADST for 8 columns
-  __m128i s[16], x[16], u[32], v[32];
-  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
-
-  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
-  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
-  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
-  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
-  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
-  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
-  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
-  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
-  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
-  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
-  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
-  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
-  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
-  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
-  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
-  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
-  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
-  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
-  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
-  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
-  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
-  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
-  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
-  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
-  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
-  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
-  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
-  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
-  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
-  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
-  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
-  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
-  u[0] = _mm_add_epi32(v[0], v[16]);
-  u[1] = _mm_add_epi32(v[1], v[17]);
-  u[2] = _mm_add_epi32(v[2], v[18]);
-  u[3] = _mm_add_epi32(v[3], v[19]);
-  u[4] = _mm_add_epi32(v[4], v[20]);
-  u[5] = _mm_add_epi32(v[5], v[21]);
-  u[6] = _mm_add_epi32(v[6], v[22]);
-  u[7] = _mm_add_epi32(v[7], v[23]);
-  u[8] = _mm_add_epi32(v[8], v[24]);
-  u[9] = _mm_add_epi32(v[9], v[25]);
-  u[10] = _mm_add_epi32(v[10], v[26]);
-  u[11] = _mm_add_epi32(v[11], v[27]);
-  u[12] = _mm_add_epi32(v[12], v[28]);
-  u[13] = _mm_add_epi32(v[13], v[29]);
-  u[14] = _mm_add_epi32(v[14], v[30]);
-  u[15] = _mm_add_epi32(v[15], v[31]);
-  u[16] = _mm_sub_epi32(v[0], v[16]);
-  u[17] = _mm_sub_epi32(v[1], v[17]);
-  u[18] = _mm_sub_epi32(v[2], v[18]);
-  u[19] = _mm_sub_epi32(v[3], v[19]);
-  u[20] = _mm_sub_epi32(v[4], v[20]);
-  u[21] = _mm_sub_epi32(v[5], v[21]);
-  u[22] = _mm_sub_epi32(v[6], v[22]);
-  u[23] = _mm_sub_epi32(v[7], v[23]);
-  u[24] = _mm_sub_epi32(v[8], v[24]);
-  u[25] = _mm_sub_epi32(v[9], v[25]);
-  u[26] = _mm_sub_epi32(v[10], v[26]);
-  u[27] = _mm_sub_epi32(v[11], v[27]);
-  u[28] = _mm_sub_epi32(v[12], v[28]);
-  u[29] = _mm_sub_epi32(v[13], v[29]);
-  u[30] = _mm_sub_epi32(v[14], v[30]);
-  u[31] = _mm_sub_epi32(v[15], v[31]);
-
-  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
-  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
-  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
-  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
-  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
-  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
-  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
-  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
-  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
-  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
-  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
-  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
-  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
-  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
-  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
-  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
-  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
-  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
-  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
-  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
-  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
-  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
-  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
-  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
-  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
-  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
-  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
-  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
-  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
-  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
-  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
-  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
-  v[0] = _mm_add_epi32(u[0], u[8]);
-  v[1] = _mm_add_epi32(u[1], u[9]);
-  v[2] = _mm_add_epi32(u[2], u[10]);
-  v[3] = _mm_add_epi32(u[3], u[11]);
-  v[4] = _mm_add_epi32(u[4], u[12]);
-  v[5] = _mm_add_epi32(u[5], u[13]);
-  v[6] = _mm_add_epi32(u[6], u[14]);
-  v[7] = _mm_add_epi32(u[7], u[15]);
-
-  v[16] = _mm_add_epi32(v[0], v[4]);
-  v[17] = _mm_add_epi32(v[1], v[5]);
-  v[18] = _mm_add_epi32(v[2], v[6]);
-  v[19] = _mm_add_epi32(v[3], v[7]);
-  v[20] = _mm_sub_epi32(v[0], v[4]);
-  v[21] = _mm_sub_epi32(v[1], v[5]);
-  v[22] = _mm_sub_epi32(v[2], v[6]);
-  v[23] = _mm_sub_epi32(v[3], v[7]);
-  v[16] = _mm_add_epi32(v[16], k__DCT_CONST_ROUNDING);
-  v[17] = _mm_add_epi32(v[17], k__DCT_CONST_ROUNDING);
-  v[18] = _mm_add_epi32(v[18], k__DCT_CONST_ROUNDING);
-  v[19] = _mm_add_epi32(v[19], k__DCT_CONST_ROUNDING);
-  v[20] = _mm_add_epi32(v[20], k__DCT_CONST_ROUNDING);
-  v[21] = _mm_add_epi32(v[21], k__DCT_CONST_ROUNDING);
-  v[22] = _mm_add_epi32(v[22], k__DCT_CONST_ROUNDING);
-  v[23] = _mm_add_epi32(v[23], k__DCT_CONST_ROUNDING);
-  v[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
-  v[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
-  v[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
-  v[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
-  v[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
-  v[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
-  v[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
-  v[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
-  s[0] = _mm_packs_epi32(v[16], v[17]);
-  s[1] = _mm_packs_epi32(v[18], v[19]);
-  s[2] = _mm_packs_epi32(v[20], v[21]);
-  s[3] = _mm_packs_epi32(v[22], v[23]);
-
-  v[8] = _mm_sub_epi32(u[0], u[8]);
-  v[9] = _mm_sub_epi32(u[1], u[9]);
-  v[10] = _mm_sub_epi32(u[2], u[10]);
-  v[11] = _mm_sub_epi32(u[3], u[11]);
-  v[12] = _mm_sub_epi32(u[4], u[12]);
-  v[13] = _mm_sub_epi32(u[5], u[13]);
-  v[14] = _mm_sub_epi32(u[6], u[14]);
-  v[15] = _mm_sub_epi32(u[7], u[15]);
-
-  v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-  s[4] = _mm_packs_epi32(v[8], v[9]);
-  s[5] = _mm_packs_epi32(v[10], v[11]);
-  s[6] = _mm_packs_epi32(v[12], v[13]);
-  s[7] = _mm_packs_epi32(v[14], v[15]);
-  //
-
-  s[8] = _mm_packs_epi32(u[16], u[17]);
-  s[9] = _mm_packs_epi32(u[18], u[19]);
-  s[10] = _mm_packs_epi32(u[20], u[21]);
-  s[11] = _mm_packs_epi32(u[22], u[23]);
-  s[12] = _mm_packs_epi32(u[24], u[25]);
-  s[13] = _mm_packs_epi32(u[26], u[27]);
-  s[14] = _mm_packs_epi32(u[28], u[29]);
-  s[15] = _mm_packs_epi32(u[30], u[31]);
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
-  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], v[8]);
-  u[1] = _mm_add_epi32(v[1], v[9]);
-  u[2] = _mm_add_epi32(v[2], v[10]);
-  u[3] = _mm_add_epi32(v[3], v[11]);
-  u[4] = _mm_add_epi32(v[4], v[12]);
-  u[5] = _mm_add_epi32(v[5], v[13]);
-  u[6] = _mm_add_epi32(v[6], v[14]);
-  u[7] = _mm_add_epi32(v[7], v[15]);
-  u[8] = _mm_sub_epi32(v[0], v[8]);
-  u[9] = _mm_sub_epi32(v[1], v[9]);
-  u[10] = _mm_sub_epi32(v[2], v[10]);
-  u[11] = _mm_sub_epi32(v[3], v[11]);
-  u[12] = _mm_sub_epi32(v[4], v[12]);
-  u[13] = _mm_sub_epi32(v[5], v[13]);
-  u[14] = _mm_sub_epi32(v[6], v[14]);
-  u[15] = _mm_sub_epi32(v[7], v[15]);
-
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-  v[8] = _mm_add_epi32(u[0], u[4]);
-  v[9] = _mm_add_epi32(u[1], u[5]);
-  v[10] = _mm_add_epi32(u[2], u[6]);
-  v[11] = _mm_add_epi32(u[3], u[7]);
-  v[12] = _mm_sub_epi32(u[0], u[4]);
-  v[13] = _mm_sub_epi32(u[1], u[5]);
-  v[14] = _mm_sub_epi32(u[2], u[6]);
-  v[15] = _mm_sub_epi32(u[3], u[7]);
-
-  v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-  s[8] = _mm_packs_epi32(v[8], v[9]);
-  s[9] = _mm_packs_epi32(v[10], v[11]);
-  s[10] = _mm_packs_epi32(v[12], v[13]);
-  s[11] = _mm_packs_epi32(v[14], v[15]);
-
-  x[12] = _mm_packs_epi32(u[8], u[9]);
-  x[13] = _mm_packs_epi32(u[10], u[11]);
-  x[14] = _mm_packs_epi32(u[12], u[13]);
-  x[15] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  u[0] = _mm_unpacklo_epi16(s[4], s[5]);
-  u[1] = _mm_unpackhi_epi16(s[4], s[5]);
-  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
-  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
-  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
-  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
-  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
-  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], v[4]);
-  u[1] = _mm_add_epi32(v[1], v[5]);
-  u[2] = _mm_add_epi32(v[2], v[6]);
-  u[3] = _mm_add_epi32(v[3], v[7]);
-  u[4] = _mm_sub_epi32(v[0], v[4]);
-  u[5] = _mm_sub_epi32(v[1], v[5]);
-  u[6] = _mm_sub_epi32(v[2], v[6]);
-  u[7] = _mm_sub_epi32(v[3], v[7]);
-  u[8] = _mm_add_epi32(v[8], v[12]);
-  u[9] = _mm_add_epi32(v[9], v[13]);
-  u[10] = _mm_add_epi32(v[10], v[14]);
-  u[11] = _mm_add_epi32(v[11], v[15]);
-  u[12] = _mm_sub_epi32(v[8], v[12]);
-  u[13] = _mm_sub_epi32(v[9], v[13]);
-  u[14] = _mm_sub_epi32(v[10], v[14]);
-  u[15] = _mm_sub_epi32(v[11], v[15]);
-
-  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[4] = _mm_packs_epi32(v[0], v[1]);
-  s[5] = _mm_packs_epi32(v[2], v[3]);
-  s[6] = _mm_packs_epi32(v[4], v[5]);
-  s[7] = _mm_packs_epi32(v[6], v[7]);
-
-  s[12] = _mm_packs_epi32(v[8], v[9]);
-  s[13] = _mm_packs_epi32(v[10], v[11]);
-  s[14] = _mm_packs_epi32(v[12], v[13]);
-  s[15] = _mm_packs_epi32(v[14], v[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
-  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
-  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
-  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  in[0] = s[0];
-  in[1] = _mm_sub_epi16(kZero, s[8]);
-  in[2] = s[12];
-  in[3] = _mm_sub_epi16(kZero, s[4]);
-  in[4] = _mm_packs_epi32(v[4], v[5]);
-  in[5] = _mm_packs_epi32(v[12], v[13]);
-  in[6] = _mm_packs_epi32(v[8], v[9]);
-  in[7] = _mm_packs_epi32(v[0], v[1]);
-  in[8] = _mm_packs_epi32(v[2], v[3]);
-  in[9] = _mm_packs_epi32(v[10], v[11]);
-  in[10] = _mm_packs_epi32(v[14], v[15]);
-  in[11] = _mm_packs_epi32(v[6], v[7]);
-  in[12] = s[5];
-  in[13] = _mm_sub_epi16(kZero, s[13]);
-  in[14] = s[9];
-  in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-static void fdct16_sse2(__m128i *in0, __m128i *in1) {
-  fdct16_8col(in0);
-  fdct16_8col(in1);
-  array_transpose_16x16(in0, in1);
-}
-
-static void fadst16_sse2(__m128i *in0, __m128i *in1) {
-  fadst16_8col(in0);
-  fadst16_8col(in1);
-  array_transpose_16x16(in0, in1);
-}
-
-#if CONFIG_EXT_TX
-static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
-  idtx16_8col(in0);
-  idtx16_8col(in1);
-  array_transpose_16x16(in0, in1);
-}
-#endif  // CONFIG_EXT_TX
-
-void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       TxfmParam *txfm_param) {
-  __m128i in0[16], in1[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fdct16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fdct16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case ADST_DCT:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fadst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fdct16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case DCT_ADST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fdct16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fadst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case ADST_ADST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fadst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fadst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      load_buffer_16x16(input, in0, in1, stride, 1, 0);
-      fadst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fdct16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 1);
-      fdct16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fadst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_16x16(input, in0, in1, stride, 1, 1);
-      fadst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fadst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 1);
-      fadst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fadst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_16x16(input, in0, in1, stride, 1, 0);
-      fadst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fadst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case IDTX:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fidtx16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fidtx16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case V_DCT:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fdct16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fidtx16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case H_DCT:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fidtx16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fdct16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case V_ADST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fadst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fidtx16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case H_ADST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fidtx16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fadst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case V_FLIPADST:
-      load_buffer_16x16(input, in0, in1, stride, 1, 0);
-      fadst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fidtx16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case H_FLIPADST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 1);
-      fidtx16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fadst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-}
-
-static INLINE void prepare_4x8_row_first(__m128i *in) {
-  in[0] = _mm_unpacklo_epi64(in[0], in[2]);
-  in[1] = _mm_unpacklo_epi64(in[1], in[3]);
-  transpose_4x4(in);
-  in[4] = _mm_unpacklo_epi64(in[4], in[6]);
-  in[5] = _mm_unpacklo_epi64(in[5], in[7]);
-  transpose_4x4(in + 4);
-}
-
-// Load input into the left-hand half of in (ie, into lanes 0..3 of
-// each element of in). The right hand half (lanes 4..7) should be
-// treated as being filled with "don't care" values.
-static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in,
-                                   int stride, int flipud, int fliplr) {
-  const int shift = 2;
-  if (!flipud) {
-    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
-    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
-    in[4] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
-    in[5] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
-    in[6] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
-    in[7] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
-  } else {
-    in[0] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
-    in[1] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
-    in[2] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
-    in[3] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
-    in[4] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
-    in[5] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
-    in[6] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-    in[7] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  }
-
-  if (fliplr) {
-    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
-    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
-    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
-    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
-    in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
-    in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
-    in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
-    in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
-  }
-
-  in[0] = _mm_slli_epi16(in[0], shift);
-  in[1] = _mm_slli_epi16(in[1], shift);
-  in[2] = _mm_slli_epi16(in[2], shift);
-  in[3] = _mm_slli_epi16(in[3], shift);
-  in[4] = _mm_slli_epi16(in[4], shift);
-  in[5] = _mm_slli_epi16(in[5], shift);
-  in[6] = _mm_slli_epi16(in[6], shift);
-  in[7] = _mm_slli_epi16(in[7], shift);
-
-  scale_sqrt2_8x4(in);
-  scale_sqrt2_8x4(in + 4);
-  prepare_4x8_row_first(in);
-}
-
-static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
-  __m128i in01, in23, in45, in67, sign01, sign23, sign45, sign67;
-  const int shift = 1;
-
-  // revert the 8x8 txfm's transpose
-  array_transpose_8x8(res, res);
-
-  in01 = _mm_unpacklo_epi64(res[0], res[1]);
-  in23 = _mm_unpacklo_epi64(res[2], res[3]);
-  in45 = _mm_unpacklo_epi64(res[4], res[5]);
-  in67 = _mm_unpacklo_epi64(res[6], res[7]);
-
-  sign01 = _mm_srai_epi16(in01, 15);
-  sign23 = _mm_srai_epi16(in23, 15);
-  sign45 = _mm_srai_epi16(in45, 15);
-  sign67 = _mm_srai_epi16(in67, 15);
-
-  in01 = _mm_sub_epi16(in01, sign01);
-  in23 = _mm_sub_epi16(in23, sign23);
-  in45 = _mm_sub_epi16(in45, sign45);
-  in67 = _mm_sub_epi16(in67, sign67);
-
-  in01 = _mm_srai_epi16(in01, shift);
-  in23 = _mm_srai_epi16(in23, shift);
-  in45 = _mm_srai_epi16(in45, shift);
-  in67 = _mm_srai_epi16(in67, shift);
-
-  store_output(&in01, (output + 0 * 8));
-  store_output(&in23, (output + 1 * 8));
-  store_output(&in45, (output + 2 * 8));
-  store_output(&in67, (output + 3 * 8));
-}
-
-void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     TxfmParam *txfm_param) {
-  __m128i in[8];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_4x8(input, in, stride, 0, 0);
-      fdct4_sse2(in);
-      fdct4_sse2(in + 4);
-      fdct8_sse2(in);
-      break;
-    case ADST_DCT:
-      load_buffer_4x8(input, in, stride, 0, 0);
-      fdct4_sse2(in);
-      fdct4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case DCT_ADST:
-      load_buffer_4x8(input, in, stride, 0, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fdct8_sse2(in);
-      break;
-    case ADST_ADST:
-      load_buffer_4x8(input, in, stride, 0, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      load_buffer_4x8(input, in, stride, 1, 0);
-      fdct4_sse2(in);
-      fdct4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_4x8(input, in, stride, 0, 1);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fdct8_sse2(in);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_4x8(input, in, stride, 1, 1);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_4x8(input, in, stride, 0, 1);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_4x8(input, in, stride, 1, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case IDTX:
-      load_buffer_4x8(input, in, stride, 0, 0);
-      fidtx4_sse2(in);
-      fidtx4_sse2(in + 4);
-      fidtx8_sse2(in);
-      break;
-    case V_DCT:
-      load_buffer_4x8(input, in, stride, 0, 0);
-      fidtx4_sse2(in);
-      fidtx4_sse2(in + 4);
-      fdct8_sse2(in);
-      break;
-    case H_DCT:
-      load_buffer_4x8(input, in, stride, 0, 0);
-      fdct4_sse2(in);
-      fdct4_sse2(in + 4);
-      fidtx8_sse2(in);
-      break;
-    case V_ADST:
-      load_buffer_4x8(input, in, stride, 0, 0);
-      fidtx4_sse2(in);
-      fidtx4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case H_ADST:
-      load_buffer_4x8(input, in, stride, 0, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fidtx8_sse2(in);
-      break;
-    case V_FLIPADST:
-      load_buffer_4x8(input, in, stride, 1, 0);
-      fidtx4_sse2(in);
-      fidtx4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case H_FLIPADST:
-      load_buffer_4x8(input, in, stride, 0, 1);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fidtx8_sse2(in);
-      break;
-#endif
-    default: assert(0); break;
-  }
-  write_buffer_4x8(output, in);
-}
-
-// Load input into the left-hand half of in (ie, into lanes 0..3 of
-// each element of in). The right hand half (lanes 4..7) should be
-// treated as being filled with "don't care" values.
-// The input is split horizontally into two 4x4
-// chunks 'l' and 'r'. Then 'l' is stored in the top-left 4x4
-// block of 'in' and 'r' is stored in the bottom-left block.
-// This is to allow us to reuse 4x4 transforms.
-static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in,
-                                   int stride, int flipud, int fliplr) {
-  const int shift = 2;
-  if (!flipud) {
-    in[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
-    in[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
-    in[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
-    in[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
-  } else {
-    in[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
-    in[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
-    in[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
-    in[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
-  }
-
-  if (fliplr) {
-    in[0] = mm_reverse_epi16(in[0]);
-    in[1] = mm_reverse_epi16(in[1]);
-    in[2] = mm_reverse_epi16(in[2]);
-    in[3] = mm_reverse_epi16(in[3]);
-  }
-
-  in[0] = _mm_slli_epi16(in[0], shift);
-  in[1] = _mm_slli_epi16(in[1], shift);
-  in[2] = _mm_slli_epi16(in[2], shift);
-  in[3] = _mm_slli_epi16(in[3], shift);
-
-  scale_sqrt2_8x4(in);
-
-  in[4] = _mm_shuffle_epi32(in[0], 0xe);
-  in[5] = _mm_shuffle_epi32(in[1], 0xe);
-  in[6] = _mm_shuffle_epi32(in[2], 0xe);
-  in[7] = _mm_shuffle_epi32(in[3], 0xe);
-}
-
-static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
-  __m128i out0, out1, out2, out3, sign0, sign1, sign2, sign3;
-  const int shift = 1;
-  sign0 = _mm_srai_epi16(res[0], 15);
-  sign1 = _mm_srai_epi16(res[1], 15);
-  sign2 = _mm_srai_epi16(res[2], 15);
-  sign3 = _mm_srai_epi16(res[3], 15);
-
-  out0 = _mm_sub_epi16(res[0], sign0);
-  out1 = _mm_sub_epi16(res[1], sign1);
-  out2 = _mm_sub_epi16(res[2], sign2);
-  out3 = _mm_sub_epi16(res[3], sign3);
-
-  out0 = _mm_srai_epi16(out0, shift);
-  out1 = _mm_srai_epi16(out1, shift);
-  out2 = _mm_srai_epi16(out2, shift);
-  out3 = _mm_srai_epi16(out3, shift);
-
-  store_output(&out0, (output + 0 * 8));
-  store_output(&out1, (output + 1 * 8));
-  store_output(&out2, (output + 2 * 8));
-  store_output(&out3, (output + 3 * 8));
-}
-
-void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     TxfmParam *txfm_param) {
-  __m128i in[8];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_8x4(input, in, stride, 0, 0);
-      fdct4_sse2(in);
-      fdct4_sse2(in + 4);
-      fdct8_sse2(in);
-      break;
-    case ADST_DCT:
-      load_buffer_8x4(input, in, stride, 0, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fdct8_sse2(in);
-      break;
-    case DCT_ADST:
-      load_buffer_8x4(input, in, stride, 0, 0);
-      fdct4_sse2(in);
-      fdct4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case ADST_ADST:
-      load_buffer_8x4(input, in, stride, 0, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      load_buffer_8x4(input, in, stride, 1, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fdct8_sse2(in);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_8x4(input, in, stride, 0, 1);
-      fdct4_sse2(in);
-      fdct4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_8x4(input, in, stride, 1, 1);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_8x4(input, in, stride, 0, 1);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_8x4(input, in, stride, 1, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case IDTX:
-      load_buffer_8x4(input, in, stride, 0, 0);
-      fidtx4_sse2(in);
-      fidtx4_sse2(in + 4);
-      fidtx8_sse2(in);
-      break;
-    case V_DCT:
-      load_buffer_8x4(input, in, stride, 0, 0);
-      fdct4_sse2(in);
-      fdct4_sse2(in + 4);
-      fidtx8_sse2(in);
-      break;
-    case H_DCT:
-      load_buffer_8x4(input, in, stride, 0, 0);
-      fidtx4_sse2(in);
-      fidtx4_sse2(in + 4);
-      fdct8_sse2(in);
-      break;
-    case V_ADST:
-      load_buffer_8x4(input, in, stride, 0, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fidtx8_sse2(in);
-      break;
-    case H_ADST:
-      load_buffer_8x4(input, in, stride, 0, 0);
-      fidtx4_sse2(in);
-      fidtx4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-    case V_FLIPADST:
-      load_buffer_8x4(input, in, stride, 1, 0);
-      fadst4_sse2(in);
-      fadst4_sse2(in + 4);
-      fidtx8_sse2(in);
-      break;
-    case H_FLIPADST:
-      load_buffer_8x4(input, in, stride, 0, 1);
-      fidtx4_sse2(in);
-      fidtx4_sse2(in + 4);
-      fadst8_sse2(in);
-      break;
-#endif
-    default: assert(0); break;
-  }
-  write_buffer_8x4(output, in);
-}
-
-static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
-                                    int stride, int flipud, int fliplr) {
-  // Load 2 8x8 blocks
-  const int16_t *t = input;
-  const int16_t *b = input + 8 * stride;
-
-  if (flipud) {
-    const int16_t *const tmp = t;
-    t = b;
-    b = tmp;
-  }
-
-  load_buffer_8x8(t, in, stride, flipud, fliplr);
-  scale_sqrt2_8x8(in);
-  load_buffer_8x8(b, in + 8, stride, flipud, fliplr);
-  scale_sqrt2_8x8(in + 8);
-}
-
-static INLINE void round_power_of_two_signed(__m128i *x, int n) {
-  const __m128i rounding = _mm_set1_epi16((1 << n) >> 1);
-  const __m128i sign = _mm_srai_epi16(*x, 15);
-  const __m128i res = _mm_add_epi16(_mm_add_epi16(*x, rounding), sign);
-  *x = _mm_srai_epi16(res, n);
-}
-
-static void row_8x16_rounding(__m128i *in, int bits) {
-  int i;
-  for (i = 0; i < 16; i++) {
-    round_power_of_two_signed(&in[i], bits);
-  }
-}
-
-void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
-                      TxfmParam *txfm_param) {
-  __m128i in[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
-  __m128i *const t = in;      // Alias to top 8x8 sub block
-  __m128i *const b = in + 8;  // Alias to bottom 8x8 sub block
-
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_8x16(input, in, stride, 0, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fdct8_sse2(t);
-      fdct8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fdct16_8col(in);
-      break;
-    case ADST_DCT:
-      load_buffer_8x16(input, in, stride, 0, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fdct8_sse2(t);
-      fdct8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case DCT_ADST:
-      load_buffer_8x16(input, in, stride, 0, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fadst8_sse2(t);
-      fadst8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fdct16_8col(in);
-      break;
-    case ADST_ADST:
-      load_buffer_8x16(input, in, stride, 0, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fadst8_sse2(t);
-      fadst8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      load_buffer_8x16(input, in, stride, 1, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fdct8_sse2(t);
-      fdct8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_8x16(input, in, stride, 0, 1);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fadst8_sse2(t);
-      fadst8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fdct16_8col(in);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_8x16(input, in, stride, 1, 1);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fadst8_sse2(t);
-      fadst8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_8x16(input, in, stride, 0, 1);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fadst8_sse2(t);
-      fadst8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_8x16(input, in, stride, 1, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fadst8_sse2(t);
-      fadst8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case IDTX:
-      load_buffer_8x16(input, in, stride, 0, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fidtx8_sse2(t);
-      fidtx8_sse2(b);
-      row_8x16_rounding(in, 2);
-      idtx16_8col(in);
-      break;
-    case V_DCT:
-      load_buffer_8x16(input, in, stride, 0, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fidtx8_sse2(t);
-      fidtx8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fdct16_8col(in);
-      break;
-    case H_DCT:
-      load_buffer_8x16(input, in, stride, 0, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fdct8_sse2(t);
-      fdct8_sse2(b);
-      row_8x16_rounding(in, 2);
-      idtx16_8col(in);
-      break;
-    case V_ADST:
-      load_buffer_8x16(input, in, stride, 0, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fidtx8_sse2(t);
-      fidtx8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case H_ADST:
-      load_buffer_8x16(input, in, stride, 0, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fadst8_sse2(t);
-      fadst8_sse2(b);
-      row_8x16_rounding(in, 2);
-      idtx16_8col(in);
-      break;
-    case V_FLIPADST:
-      load_buffer_8x16(input, in, stride, 1, 0);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fidtx8_sse2(t);
-      fidtx8_sse2(b);
-      row_8x16_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case H_FLIPADST:
-      load_buffer_8x16(input, in, stride, 0, 1);
-      array_transpose_8x8(t, t);
-      array_transpose_8x8(b, b);
-      fadst8_sse2(t);
-      fadst8_sse2(b);
-      row_8x16_rounding(in, 2);
-      idtx16_8col(in);
-      break;
-#endif
-    default: assert(0); break;
-  }
-  write_buffer_8x8(output, t, 8);
-  write_buffer_8x8(output + 64, b, 8);
-}
-
-static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
-                                    int stride, int flipud, int fliplr) {
-  // Load 2 8x8 blocks
-  const int16_t *l = input;
-  const int16_t *r = input + 8;
-
-  if (fliplr) {
-    const int16_t *const tmp = l;
-    l = r;
-    r = tmp;
-  }
-
-  // load first 8 columns
-  load_buffer_8x8(l, in, stride, flipud, fliplr);
-  scale_sqrt2_8x8(in);
-  load_buffer_8x8(r, in + 8, stride, flipud, fliplr);
-  scale_sqrt2_8x8(in + 8);
-}
-
-#define col_16x8_rounding row_8x16_rounding
-
-void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
-                      TxfmParam *txfm_param) {
-  __m128i in[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
-  __m128i *const l = in;      // Alias to left 8x8 sub block
-  __m128i *const r = in + 8;  // Alias to right 8x8 sub block, which we store
-                              // in the second half of the array
-
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_16x8(input, in, stride, 0, 0);
-      fdct8_sse2(l);
-      fdct8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fdct16_8col(in);
-      break;
-    case ADST_DCT:
-      load_buffer_16x8(input, in, stride, 0, 0);
-      fadst8_sse2(l);
-      fadst8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fdct16_8col(in);
-      break;
-    case DCT_ADST:
-      load_buffer_16x8(input, in, stride, 0, 0);
-      fdct8_sse2(l);
-      fdct8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case ADST_ADST:
-      load_buffer_16x8(input, in, stride, 0, 0);
-      fadst8_sse2(l);
-      fadst8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      load_buffer_16x8(input, in, stride, 1, 0);
-      fadst8_sse2(l);
-      fadst8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fdct16_8col(in);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_16x8(input, in, stride, 0, 1);
-      fdct8_sse2(l);
-      fdct8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_16x8(input, in, stride, 1, 1);
-      fadst8_sse2(l);
-      fadst8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_16x8(input, in, stride, 0, 1);
-      fadst8_sse2(l);
-      fadst8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_16x8(input, in, stride, 1, 0);
-      fadst8_sse2(l);
-      fadst8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case IDTX:
-      load_buffer_16x8(input, in, stride, 0, 0);
-      fidtx8_sse2(l);
-      fidtx8_sse2(r);
-      col_16x8_rounding(in, 2);
-      idtx16_8col(in);
-      break;
-    case V_DCT:
-      load_buffer_16x8(input, in, stride, 0, 0);
-      fdct8_sse2(l);
-      fdct8_sse2(r);
-      col_16x8_rounding(in, 2);
-      idtx16_8col(in);
-      break;
-    case H_DCT:
-      load_buffer_16x8(input, in, stride, 0, 0);
-      fidtx8_sse2(l);
-      fidtx8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fdct16_8col(in);
-      break;
-    case V_ADST:
-      load_buffer_16x8(input, in, stride, 0, 0);
-      fadst8_sse2(l);
-      fadst8_sse2(r);
-      col_16x8_rounding(in, 2);
-      idtx16_8col(in);
-      break;
-    case H_ADST:
-      load_buffer_16x8(input, in, stride, 0, 0);
-      fidtx8_sse2(l);
-      fidtx8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-    case V_FLIPADST:
-      load_buffer_16x8(input, in, stride, 1, 0);
-      fadst8_sse2(l);
-      fadst8_sse2(r);
-      col_16x8_rounding(in, 2);
-      idtx16_8col(in);
-      break;
-    case H_FLIPADST:
-      load_buffer_16x8(input, in, stride, 0, 1);
-      fidtx8_sse2(l);
-      fidtx8_sse2(r);
-      col_16x8_rounding(in, 2);
-      fadst16_8col(in);
-      break;
-#endif
-    default: assert(0); break;
-  }
-  array_transpose_8x8(l, l);
-  array_transpose_8x8(r, r);
-  write_buffer_8x8(output, l, 16);
-  write_buffer_8x8(output + 8, r, 16);
-}
-
-// Note: The 16-column 32-element transforms expect their input to be
-// split up into a 2x2 grid of 8x16 blocks
-static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
-                                __m128i *br) {
-  fdct32_8col(tl, bl);
-  fdct32_8col(tr, br);
-  array_transpose_16x16(tl, tr);
-  array_transpose_16x16(bl, br);
-}
-
-#if CONFIG_EXT_TX
-static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
-                                 __m128i *br) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    tl[i] = _mm_slli_epi16(tl[i], 2);
-    tr[i] = _mm_slli_epi16(tr[i], 2);
-    bl[i] = _mm_slli_epi16(bl[i], 2);
-    br[i] = _mm_slli_epi16(br[i], 2);
-  }
-  array_transpose_16x16(tl, tr);
-  array_transpose_16x16(bl, br);
-}
-#endif
-
-static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl,
-                                     __m128i *intr, __m128i *inbl,
-                                     __m128i *inbr, int stride, int flipud,
-                                     int fliplr) {
-  int i;
-  if (flipud) {
-    input = input + 31 * stride;
-    stride = -stride;
-  }
-
-  for (i = 0; i < 16; ++i) {
-    intl[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
-    intr[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
-    inbl[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0)), 2);
-    inbr[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8)), 2);
-  }
-
-  if (fliplr) {
-    __m128i tmp;
-    for (i = 0; i < 16; ++i) {
-      tmp = intl[i];
-      intl[i] = mm_reverse_epi16(intr[i]);
-      intr[i] = mm_reverse_epi16(tmp);
-      tmp = inbl[i];
-      inbl[i] = mm_reverse_epi16(inbr[i]);
-      inbr[i] = mm_reverse_epi16(tmp);
-    }
-  }
-
-  scale_sqrt2_8x16(intl);
-  scale_sqrt2_8x16(intr);
-  scale_sqrt2_8x16(inbl);
-  scale_sqrt2_8x16(inbr);
-}
-
-static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
-                                      __m128i *restr, __m128i *resbl,
-                                      __m128i *resbr) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    store_output(&restl[i], output + i * 16 + 0);
-    store_output(&restr[i], output + i * 16 + 8);
-    store_output(&resbl[i], output + (i + 16) * 16 + 0);
-    store_output(&resbr[i], output + (i + 16) * 16 + 8);
-  }
-}
-
-static INLINE void round_signed_8x8(__m128i *in, const int bit) {
-  const __m128i rounding = _mm_set1_epi16((1 << bit) >> 1);
-  __m128i sign0 = _mm_srai_epi16(in[0], 15);
-  __m128i sign1 = _mm_srai_epi16(in[1], 15);
-  __m128i sign2 = _mm_srai_epi16(in[2], 15);
-  __m128i sign3 = _mm_srai_epi16(in[3], 15);
-  __m128i sign4 = _mm_srai_epi16(in[4], 15);
-  __m128i sign5 = _mm_srai_epi16(in[5], 15);
-  __m128i sign6 = _mm_srai_epi16(in[6], 15);
-  __m128i sign7 = _mm_srai_epi16(in[7], 15);
-
-  in[0] = _mm_add_epi16(_mm_add_epi16(in[0], rounding), sign0);
-  in[1] = _mm_add_epi16(_mm_add_epi16(in[1], rounding), sign1);
-  in[2] = _mm_add_epi16(_mm_add_epi16(in[2], rounding), sign2);
-  in[3] = _mm_add_epi16(_mm_add_epi16(in[3], rounding), sign3);
-  in[4] = _mm_add_epi16(_mm_add_epi16(in[4], rounding), sign4);
-  in[5] = _mm_add_epi16(_mm_add_epi16(in[5], rounding), sign5);
-  in[6] = _mm_add_epi16(_mm_add_epi16(in[6], rounding), sign6);
-  in[7] = _mm_add_epi16(_mm_add_epi16(in[7], rounding), sign7);
-
-  in[0] = _mm_srai_epi16(in[0], bit);
-  in[1] = _mm_srai_epi16(in[1], bit);
-  in[2] = _mm_srai_epi16(in[2], bit);
-  in[3] = _mm_srai_epi16(in[3], bit);
-  in[4] = _mm_srai_epi16(in[4], bit);
-  in[5] = _mm_srai_epi16(in[5], bit);
-  in[6] = _mm_srai_epi16(in[6], bit);
-  in[7] = _mm_srai_epi16(in[7], bit);
-}
-
-static INLINE void round_signed_16x16(__m128i *in0, __m128i *in1) {
-  const int bit = 4;
-  round_signed_8x8(in0, bit);
-  round_signed_8x8(in0 + 8, bit);
-  round_signed_8x8(in1, bit);
-  round_signed_8x8(in1 + 8, bit);
-}
-
-// Note:
-//  suffix "t" indicates the transpose operation comes first
-static void fdct16t_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  fdct16_8col(in0);
-  fdct16_8col(in1);
-}
-
-static void fadst16t_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  fadst16_8col(in0);
-  fadst16_8col(in1);
-}
-
-static INLINE void fdct32t_16col(__m128i *tl, __m128i *tr, __m128i *bl,
-                                 __m128i *br) {
-  array_transpose_16x16(tl, tr);
-  array_transpose_16x16(bl, br);
-  fdct32_8col(tl, bl);
-  fdct32_8col(tr, br);
-}
-
-typedef enum transpose_indicator_ {
-  transpose,
-  no_transpose,
-} transpose_indicator;
-
-static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
-                                      __m128i *br, transpose_indicator t) {
-  __m128i tmpl[16], tmpr[16];
-  int i;
-
-  // Copy the bottom half of the input to temporary storage
-  for (i = 0; i < 16; ++i) {
-    tmpl[i] = bl[i];
-    tmpr[i] = br[i];
-  }
-
-  // Generate the bottom half of the output
-  for (i = 0; i < 16; ++i) {
-    bl[i] = _mm_slli_epi16(tl[i], 2);
-    br[i] = _mm_slli_epi16(tr[i], 2);
-  }
-  array_transpose_16x16(bl, br);
-
-  // Copy the temporary storage back to the top half of the input
-  for (i = 0; i < 16; ++i) {
-    tl[i] = tmpl[i];
-    tr[i] = tmpr[i];
-  }
-
-  // Generate the top half of the output
-  scale_sqrt2_8x16(tl);
-  scale_sqrt2_8x16(tr);
-  if (t == transpose)
-    fdct16t_sse2(tl, tr);
-  else
-    fdct16_sse2(tl, tr);
-}
-
-// Note on data layout, for both this and the 32x16 transforms:
-// So that we can reuse the 16-element transforms easily,
-// we want to split the input into 8x16 blocks.
-// For 16x32, this means the input is a 2x2 grid of such blocks.
-// For 32x16, it means the input is a 4x1 grid.
-void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       TxfmParam *txfm_param) {
-  __m128i intl[16], intr[16], inbl[16], inbr[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
-      fdct16t_sse2(intl, intr);
-      fdct16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fdct32t_16col(intl, intr, inbl, inbr);
-      break;
-    case ADST_DCT:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
-      fdct16t_sse2(intl, intr);
-      fdct16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
-      break;
-    case DCT_ADST:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
-      fadst16t_sse2(intl, intr);
-      fadst16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fdct32t_16col(intl, intr, inbl, inbr);
-      break;
-    case ADST_ADST:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
-      fadst16t_sse2(intl, intr);
-      fadst16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
-      fdct16t_sse2(intl, intr);
-      fdct16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
-      fadst16t_sse2(intl, intr);
-      fadst16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fdct32t_16col(intl, intr, inbl, inbr);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1);
-      fadst16t_sse2(intl, intr);
-      fadst16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
-      fadst16t_sse2(intl, intr);
-      fadst16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
-      fadst16t_sse2(intl, intr);
-      fadst16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
-      break;
-    case IDTX:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
-      fidtx16_sse2(intl, intr);
-      fidtx16_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fidtx32_16col(intl, intr, inbl, inbr);
-      break;
-    case V_DCT:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
-      fidtx16_sse2(intl, intr);
-      fidtx16_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fdct32t_16col(intl, intr, inbl, inbr);
-      break;
-    case H_DCT:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
-      fdct16t_sse2(intl, intr);
-      fdct16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fidtx32_16col(intl, intr, inbl, inbr);
-      break;
-    case V_ADST:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
-      fidtx16_sse2(intl, intr);
-      fidtx16_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
-      break;
-    case H_ADST:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
-      fadst16t_sse2(intl, intr);
-      fadst16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fidtx32_16col(intl, intr, inbl, inbr);
-      break;
-    case V_FLIPADST:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
-      fidtx16_sse2(intl, intr);
-      fidtx16_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fhalfright32_16col(intl, intr, inbl, inbr, transpose);
-      break;
-    case H_FLIPADST:
-      load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
-      fadst16t_sse2(intl, intr);
-      fadst16t_sse2(inbl, inbr);
-      round_signed_16x16(intl, intr);
-      round_signed_16x16(inbl, inbr);
-      fidtx32_16col(intl, intr, inbl, inbr);
-      break;
-#endif
-    default: assert(0); break;
-  }
-  write_buffer_16x32(output, intl, intr, inbl, inbr);
-}
-
-static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0,
-                                     __m128i *in1, __m128i *in2, __m128i *in3,
-                                     int stride, int flipud, int fliplr) {
-  int i;
-  if (flipud) {
-    input += 15 * stride;
-    stride = -stride;
-  }
-
-  for (i = 0; i < 16; ++i) {
-    in0[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
-    in1[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
-    in2[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
-    in3[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
-  }
-
-  if (fliplr) {
-    for (i = 0; i < 16; ++i) {
-      __m128i tmp1 = in0[i];
-      __m128i tmp2 = in1[i];
-      in0[i] = mm_reverse_epi16(in3[i]);
-      in1[i] = mm_reverse_epi16(in2[i]);
-      in2[i] = mm_reverse_epi16(tmp2);
-      in3[i] = mm_reverse_epi16(tmp1);
-    }
-  }
-
-  scale_sqrt2_8x16(in0);
-  scale_sqrt2_8x16(in1);
-  scale_sqrt2_8x16(in2);
-  scale_sqrt2_8x16(in3);
-}
-
-static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
-                                      __m128i *res1, __m128i *res2,
-                                      __m128i *res3) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    store_output(&res0[i], output + i * 32 + 0);
-    store_output(&res1[i], output + i * 32 + 8);
-    store_output(&res2[i], output + i * 32 + 16);
-    store_output(&res3[i], output + i * 32 + 24);
-  }
-}
-
-void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       TxfmParam *txfm_param) {
-  __m128i in0[16], in1[16], in2[16], in3[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
-  load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
-  switch (tx_type) {
-    case DCT_DCT:
-      fdct16_sse2(in0, in1);
-      fdct16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fdct32_16col(in0, in1, in2, in3);
-      break;
-    case ADST_DCT:
-      fadst16_sse2(in0, in1);
-      fadst16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fdct32_16col(in0, in1, in2, in3);
-      break;
-    case DCT_ADST:
-      fdct16_sse2(in0, in1);
-      fdct16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
-      break;
-    case ADST_ADST:
-      fadst16_sse2(in0, in1);
-      fadst16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
-      fadst16_sse2(in0, in1);
-      fadst16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fdct32_16col(in0, in1, in2, in3);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
-      fdct16_sse2(in0, in1);
-      fdct16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1);
-      fadst16_sse2(in0, in1);
-      fadst16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
-      fadst16_sse2(in0, in1);
-      fadst16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
-      fadst16_sse2(in0, in1);
-      fadst16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
-      break;
-    case IDTX:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
-      fidtx16_sse2(in0, in1);
-      fidtx16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fidtx32_16col(in0, in1, in2, in3);
-      break;
-    case V_DCT:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
-      fdct16_sse2(in0, in1);
-      fdct16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fidtx32_16col(in0, in1, in2, in3);
-      break;
-    case H_DCT:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
-      fidtx16_sse2(in0, in1);
-      fidtx16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fdct32_16col(in0, in1, in2, in3);
-      break;
-    case V_ADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
-      fadst16_sse2(in0, in1);
-      fadst16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fidtx32_16col(in0, in1, in2, in3);
-      break;
-    case H_ADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
-      fidtx16_sse2(in0, in1);
-      fidtx16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
-      break;
-    case V_FLIPADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
-      fadst16_sse2(in0, in1);
-      fadst16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fidtx32_16col(in0, in1, in2, in3);
-      break;
-    case H_FLIPADST:
-      load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
-      fidtx16_sse2(in0, in1);
-      fidtx16_sse2(in2, in3);
-      round_signed_16x16(in0, in1);
-      round_signed_16x16(in2, in3);
-      fhalfright32_16col(in0, in1, in2, in3, no_transpose);
-      break;
-#endif
-    default: assert(0); break;
-  }
-  write_buffer_32x16(output, in0, in1, in2, in3);
-}
-
-// Note:
-// 32x32 hybrid fwd txfm
-//  4x2 grids of 8x16 block. Each block is represented by __m128i in[16]
-static INLINE void load_buffer_32x32(const int16_t *input,
-                                     __m128i *in0 /*in0[32]*/,
-                                     __m128i *in1 /*in1[32]*/,
-                                     __m128i *in2 /*in2[32]*/,
-                                     __m128i *in3 /*in3[32]*/, int stride,
-                                     int flipud, int fliplr) {
-  if (flipud) {
-    input += 31 * stride;
-    stride = -stride;
-  }
-
-  int i;
-  for (i = 0; i < 32; ++i) {
-    in0[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
-    in1[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
-    in2[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
-    in3[i] = _mm_slli_epi16(
-        _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
-  }
-
-  if (fliplr) {
-    for (i = 0; i < 32; ++i) {
-      __m128i tmp1 = in0[i];
-      __m128i tmp2 = in1[i];
-      in0[i] = mm_reverse_epi16(in3[i]);
-      in1[i] = mm_reverse_epi16(in2[i]);
-      in2[i] = mm_reverse_epi16(tmp2);
-      in3[i] = mm_reverse_epi16(tmp1);
-    }
-  }
-}
-
-static INLINE void swap_16x16(__m128i *b0l /*b0l[16]*/,
-                              __m128i *b0r /*b0r[16]*/,
-                              __m128i *b1l /*b1l[16]*/,
-                              __m128i *b1r /*b1r[16]*/) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    __m128i tmp0 = b1l[i];
-    __m128i tmp1 = b1r[i];
-    b1l[i] = b0l[i];
-    b1r[i] = b0r[i];
-    b0l[i] = tmp0;
-    b0r[i] = tmp1;
-  }
-}
-
-static INLINE void fdct32(__m128i *in0, __m128i *in1, __m128i *in2,
-                          __m128i *in3) {
-  fdct32_8col(in0, &in0[16]);
-  fdct32_8col(in1, &in1[16]);
-  fdct32_8col(in2, &in2[16]);
-  fdct32_8col(in3, &in3[16]);
-
-  array_transpose_16x16(in0, in1);
-  array_transpose_16x16(&in0[16], &in1[16]);
-  array_transpose_16x16(in2, in3);
-  array_transpose_16x16(&in2[16], &in3[16]);
-
-  swap_16x16(&in0[16], &in1[16], in2, in3);
-}
-
-static INLINE void fhalfright32(__m128i *in0, __m128i *in1, __m128i *in2,
-                                __m128i *in3) {
-  fhalfright32_16col(in0, in1, &in0[16], &in1[16], no_transpose);
-  fhalfright32_16col(in2, in3, &in2[16], &in3[16], no_transpose);
-  swap_16x16(&in0[16], &in1[16], in2, in3);
-}
-
-#if CONFIG_EXT_TX
-static INLINE void fidtx32(__m128i *in0, __m128i *in1, __m128i *in2,
-                           __m128i *in3) {
-  fidtx32_16col(in0, in1, &in0[16], &in1[16]);
-  fidtx32_16col(in2, in3, &in2[16], &in3[16]);
-  swap_16x16(&in0[16], &in1[16], in2, in3);
-}
-#endif
-
-static INLINE void round_signed_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
-                                      __m128i *in3) {
-  round_signed_16x16(in0, in1);
-  round_signed_16x16(&in0[16], &in1[16]);
-  round_signed_16x16(in2, in3);
-  round_signed_16x16(&in2[16], &in3[16]);
-}
-
-static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
-                                      __m128i *in3, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 32; ++i) {
-    store_output(&in0[i], output + i * 32 + 0);
-    store_output(&in1[i], output + i * 32 + 8);
-    store_output(&in2[i], output + i * 32 + 16);
-    store_output(&in3[i], output + i * 32 + 24);
-  }
-}
-
-void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       TxfmParam *txfm_param) {
-  __m128i in0[32], in1[32], in2[32], in3[32];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "No 32x32 sse2 MRC_DCT implementation");
-#endif
-
-  load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0);
-  switch (tx_type) {
-    case DCT_DCT:
-      fdct32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fdct32(in0, in1, in2, in3);
-      break;
-    case ADST_DCT:
-      fhalfright32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fdct32(in0, in1, in2, in3);
-      break;
-    case DCT_ADST:
-      fdct32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fhalfright32(in0, in1, in2, in3);
-      break;
-    case ADST_ADST:
-      fhalfright32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fhalfright32(in0, in1, in2, in3);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
-      fhalfright32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fdct32(in0, in1, in2, in3);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
-      fdct32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fhalfright32(in0, in1, in2, in3);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 1);
-      fhalfright32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fhalfright32(in0, in1, in2, in3);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
-      fhalfright32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fhalfright32(in0, in1, in2, in3);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
-      fhalfright32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fhalfright32(in0, in1, in2, in3);
-      break;
-    case IDTX:
-      fidtx32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fidtx32(in0, in1, in2, in3);
-      break;
-    case V_DCT:
-      fdct32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fidtx32(in0, in1, in2, in3);
-      break;
-    case H_DCT:
-      fidtx32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fdct32(in0, in1, in2, in3);
-      break;
-    case V_ADST:
-      fhalfright32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fidtx32(in0, in1, in2, in3);
-      break;
-    case H_ADST:
-      fidtx32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fhalfright32(in0, in1, in2, in3);
-      break;
-    case V_FLIPADST:
-      load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0);
-      fhalfright32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fidtx32(in0, in1, in2, in3);
-      break;
-    case H_FLIPADST:
-      load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1);
-      fidtx32(in0, in1, in2, in3);
-      round_signed_32x32(in0, in1, in2, in3);
-      fhalfright32(in0, in1, in2, in3);
-      break;
-#endif
-    default: assert(0);
-  }
-  write_buffer_32x32(in0, in1, in2, in3, output);
-}
diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm
index a99db3d6e..b18554818 100644
--- a/third_party/aom/av1/encoder/x86/dct_sse2.asm
+++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm
@@ -63,7 +63,6 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
   psllw           m0,        2
   psllw           m1,        2
 
-%if CONFIG_HIGHBITDEPTH
   ; sign extension
   mova            m2,             m0
   mova            m3,             m1
@@ -79,9 +78,5 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
   mova            [outputq + 16], m2
   mova            [outputq + 32], m1
   mova            [outputq + 48], m3
-%else
-  mova            [outputq],      m0
-  mova            [outputq + 16], m1
-%endif
 
   RET
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
new file mode 100644
index 000000000..dedb4d02f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+
+static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
+                                          const int stride,
+                                          const ptrdiff_t *const offsets,
+                                          __m128i *const level) {
+  level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
+  level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
+  level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
+  level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
+  level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
+                                          const int stride,
+                                          const ptrdiff_t *const offsets,
+                                          __m128i *const level) {
+  level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
+  level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
+  level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
+  level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
+  level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
+                                           const int stride,
+                                           const ptrdiff_t *const offsets,
+                                           __m128i *const level) {
+  level[0] = _mm_loadu_si128((__m128i *)(src + 1));
+  level[1] = _mm_loadu_si128((__m128i *)(src + stride));
+  level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
+  level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
+  level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
+}
+
+static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
+  const __m128i const_3 = _mm_set1_epi8(3);
+  const __m128i const_4 = _mm_set1_epi8(4);
+  __m128i count;
+
+  count = _mm_min_epu8(level[0], const_3);
+  level[1] = _mm_min_epu8(level[1], const_3);
+  level[2] = _mm_min_epu8(level[2], const_3);
+  level[3] = _mm_min_epu8(level[3], const_3);
+  level[4] = _mm_min_epu8(level[4], const_3);
+  count = _mm_add_epi8(count, level[1]);
+  count = _mm_add_epi8(count, level[2]);
+  count = _mm_add_epi8(count, level[3]);
+  count = _mm_add_epi8(count, level[4]);
+  count = _mm_avg_epu8(count, _mm_setzero_si128());
+  count = _mm_min_epu8(count, const_4);
+  return count;
+}
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            int8_t *const coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  const __m128i pos_to_offset_large = _mm_set1_epi8(21);
+  __m128i pos_to_offset =
+      (height == 4)
+          ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
+          : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21,
+                          21, 21);
+  __m128i count;
+  __m128i level[5];
+  int8_t *cc = coeff_contexts;
+  int row = height;
+
+  assert(!(height % 4));
+
+  do {
+    load_levels_4x4x5_sse2(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel_sse2(level);
+    count = _mm_add_epi8(count, pos_to_offset);
+    _mm_store_si128((__m128i *)cc, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 4 * stride;
+    cc += 16;
+    row -= 4;
+  } while (row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+                                             const int height,
+                                             const ptrdiff_t *const offsets,
+                                             int8_t *coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  const __m128i pos_to_offset =
+      _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+  __m128i count;
+  __m128i level[5];
+  int row = height;
+
+  assert(!(height % 4));
+
+  do {
+    load_levels_4x4x5_sse2(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel_sse2(level);
+    count = _mm_add_epi8(count, pos_to_offset);
+    _mm_store_si128((__m128i *)coeff_contexts, count);
+    levels += 4 * stride;
+    coeff_contexts += 16;
+    row -= 4;
+  } while (row);
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+                                             const int height,
+                                             const ptrdiff_t *const offsets,
+                                             int8_t *coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+  __m128i pos_to_offset =
+      _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+                    SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+                    SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+  __m128i count;
+  __m128i level[5];
+  int row = height;
+
+  assert(!(height % 4));
+
+  do {
+    load_levels_4x4x5_sse2(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel_sse2(level);
+    count = _mm_add_epi8(count, pos_to_offset);
+    _mm_store_si128((__m128i *)coeff_contexts, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 4 * stride;
+    coeff_contexts += 16;
+    row -= 4;
+  } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+                                           const int height,
+                                           const ptrdiff_t *const offsets,
+                                           int8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  int8_t *cc = coeff_contexts;
+  int row = height;
+  __m128i count;
+  __m128i level[5];
+  __m128i pos_to_offset[3];
+
+  assert(!(height % 2));
+
+  if (height == 8) {
+    pos_to_offset[0] =
+        _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
+    pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+                                     21, 21, 21, 21, 21);
+  } else if (height < 8) {
+    pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21,
+                                     21, 21, 21, 21);
+    pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+                                     21, 21, 21, 21, 21);
+  } else {
+    pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+                                     11, 11, 11, 11, 11);
+    pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+                                     21, 21, 21, 21, 21);
+  }
+  pos_to_offset[2] = _mm_set1_epi8(21);
+
+  do {
+    load_levels_8x2x5_sse2(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel_sse2(level);
+    count = _mm_add_epi8(count, pos_to_offset[0]);
+    _mm_store_si128((__m128i *)cc, count);
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    levels += 2 * stride;
+    cc += 16;
+    row -= 2;
+  } while (row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            int8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  const __m128i pos_to_offset =
+      _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+  int row = height;
+  __m128i count;
+  __m128i level[5];
+
+  assert(!(height % 2));
+
+  do {
+    load_levels_8x2x5_sse2(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel_sse2(level);
+    count = _mm_add_epi8(count, pos_to_offset);
+    _mm_store_si128((__m128i *)coeff_contexts, count);
+    levels += 2 * stride;
+    coeff_contexts += 16;
+    row -= 2;
+  } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            int8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+  __m128i pos_to_offset =
+      _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+                    SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+                    SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+                    SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+                    SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+                    SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5);
+  int row = height;
+  __m128i count;
+  __m128i level[5];
+
+  assert(!(height % 2));
+
+  do {
+    load_levels_8x2x5_sse2(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel_sse2(level);
+    count = _mm_add_epi8(count, pos_to_offset);
+    _mm_store_si128((__m128i *)coeff_contexts, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 2 * stride;
+    coeff_contexts += 16;
+    row -= 2;
+  } while (row);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+                                             const int real_width,
+                                             const int real_height,
+                                             const int width, const int height,
+                                             const ptrdiff_t *const offsets,
+                                             int8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+  int8_t *cc = coeff_contexts;
+  int row = height;
+  __m128i pos_to_offset[5];
+  __m128i pos_to_offset_large[3];
+  __m128i count;
+  __m128i level[5];
+
+  assert(!(width % 16));
+
+  pos_to_offset_large[2] = _mm_set1_epi8(21);
+  if (real_width == real_height) {
+    pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21);
+    pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+        pos_to_offset_large[2];
+  } else if (real_width > real_height) {
+    pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
+        16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
+    pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+  } else {  // real_width < real_height
+    pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
+        11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
+    pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                     21, 21, 21, 21, 21);
+    pos_to_offset[4] = pos_to_offset_large[2];
+    pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11);
+  }
+
+  do {
+    int w = width;
+
+    do {
+      load_levels_16x1x5_sse2(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel_sse2(level);
+      count = _mm_add_epi8(count, pos_to_offset[0]);
+      _mm_store_si128((__m128i *)cc, count);
+      levels += 16;
+      cc += 16;
+      w -= 16;
+      pos_to_offset[0] = pos_to_offset_large[0];
+    } while (w);
+
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    pos_to_offset[2] = pos_to_offset[3];
+    pos_to_offset[3] = pos_to_offset[4];
+    pos_to_offset_large[0] = pos_to_offset_large[1];
+    pos_to_offset_large[1] = pos_to_offset_large[2];
+    levels += TX_PAD_HOR;
+  } while (--row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              int8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+  const __m128i pos_to_offset_large =
+      _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                    SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+  __m128i count;
+  __m128i level[5];
+  int row = height;
+
+  assert(!(width % 16));
+
+  do {
+    __m128i pos_to_offset =
+        _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+                      SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+    int w = width;
+
+    do {
+      load_levels_16x1x5_sse2(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel_sse2(level);
+      count = _mm_add_epi8(count, pos_to_offset);
+      _mm_store_si128((__m128i *)coeff_contexts, count);
+      pos_to_offset = pos_to_offset_large;
+      levels += 16;
+      coeff_contexts += 16;
+      w -= 16;
+    } while (w);
+
+    levels += TX_PAD_HOR;
+  } while (--row);
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              int8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+  __m128i pos_to_offset[3];
+  __m128i count;
+  __m128i level[5];
+  int row = height;
+
+  assert(!(width % 16));
+
+  pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
+  pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
+  pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+
+  do {
+    int w = width;
+
+    do {
+      load_levels_16x1x5_sse2(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel_sse2(level);
+      count = _mm_add_epi8(count, pos_to_offset[0]);
+      _mm_store_si128((__m128i *)coeff_contexts, count);
+      levels += 16;
+      coeff_contexts += 16;
+      w -= 16;
+    } while (w);
+
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    levels += TX_PAD_HOR;
+  } while (--row);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_sse2(const uint8_t *const levels,
+                                  const int16_t *const scan, const uint16_t eob,
+                                  const TX_SIZE tx_size,
+                                  const TX_CLASS tx_class,
+                                  int8_t *const coeff_contexts) {
+  const int last_idx = eob - 1;
+  if (!last_idx) {
+    coeff_contexts[0] = 0;
+    return;
+  }
+
+  const int real_width = tx_size_wide[tx_size];
+  const int real_height = tx_size_high[tx_size];
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  const int stride = width + TX_PAD_HOR;
+  ptrdiff_t offsets[3];
+
+  /* coeff_contexts must be 16 byte aligned. */
+  assert(!((intptr_t)coeff_contexts & 0xf));
+
+  if (tx_class == TX_CLASS_2D) {
+    offsets[0] = 0 * stride + 2;
+    offsets[1] = 1 * stride + 1;
+    offsets[2] = 2 * stride + 0;
+
+    if (width == 4) {
+      get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts);
+    } else if (width == 8) {
+      get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts);
+    } else if (width == 16) {
+      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+                                offsets, coeff_contexts);
+    } else {
+      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+                                offsets, coeff_contexts);
+    }
+  } else if (tx_class == TX_CLASS_HORIZ) {
+    offsets[0] = 2;
+    offsets[1] = 3;
+    offsets[2] = 4;
+    if (width == 4) {
+      get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts);
+    } else if (width == 8) {
+      get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts);
+    } else {
+      get_16n_coeff_contexts_hor(levels, width, height, offsets,
+                                 coeff_contexts);
+    }
+  } else {  // TX_CLASS_VERT
+    offsets[0] = 2 * stride;
+    offsets[1] = 3 * stride;
+    offsets[2] = 4 * stride;
+    if (width == 4) {
+      get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts);
+    } else if (width == 8) {
+      get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts);
+    } else {
+      get_16n_coeff_contexts_ver(levels, width, height, offsets,
+                                 coeff_contexts);
+    }
+  }
+
+  const int bwl = get_txb_bwl(tx_size);
+  const int pos = scan[last_idx];
+  if (last_idx <= (height << bwl) / 8)
+    coeff_contexts[pos] = 1;
+  else if (last_idx <= (height << bwl) / 4)
+    coeff_contexts[pos] = 2;
+  else
+    coeff_contexts[pos] = 3;
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
new file mode 100644
index 000000000..b3a879b0f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+                                const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  memset(levels - TX_PAD_TOP * stride, 0,
+         sizeof(*levels) * TX_PAD_TOP * stride);
+  memset(levels + stride * height, 0,
+         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+  const __m128i zeros = _mm_setzero_si128();
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (width == 4) {
+    do {
+      const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
+      const __m128i coeffB = _mm_load_si128((__m128i *)(cf + width));
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
+      _mm_storeu_si128((__m128i *)ls, lsAB);
+      ls += (stride << 1);
+      cf += (width << 1);
+      i += 2;
+    } while (i < height);
+  } else if (width == 8) {
+    do {
+      const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
+      const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      _mm_storeu_si128((__m128i *)ls, absAB8);
+      ls += stride;
+      cf += width;
+      i += 1;
+    } while (i < height);
+  } else {
+    do {
+      int j = 0;
+      do {
+        const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
+        const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
+        const __m128i coeffC = _mm_load_si128((__m128i *)(cf + 8));
+        const __m128i coeffD = _mm_load_si128((__m128i *)(cf + 12));
+        const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+        const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
+        const __m128i absAB = _mm_abs_epi16(coeffAB);
+        const __m128i absCD = _mm_abs_epi16(coeffCD);
+        const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
+        _mm_storeu_si128((__m128i *)(ls + j), absABCD);
+        j += 16;
+        cf += 16;
+      } while (j < width);
+      *(int32_t *)(ls + width) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
index 6599630d0..7d4f69585 100644
--- a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -11,7 +11,8 @@
 
 #include <immintrin.h>  // AVX2
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm
index 4680f1fab..72e9e22b1 100644
--- a/third_party/aom/av1/encoder/x86/error_sse2.asm
+++ b/third_party/aom/av1/encoder/x86/error_sse2.asm
@@ -77,49 +77,3 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
   movd    edx, m5
 %endif
   RET
-
-; Compute the sum of squared difference between two int16_t vectors.
-; int64_t av1_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
-;                            intptr_t block_size)
-
-INIT_XMM sse2
-cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
-  pxor      m4, m4                 ; sse accumulator
-  pxor      m5, m5                 ; dedicated zero register
-  lea     uqcq, [uqcq+sizeq*2]
-  lea     dqcq, [dqcq+sizeq*2]
-  neg    sizeq
-.loop:
-  mova      m2, [uqcq+sizeq*2]
-  mova      m0, [dqcq+sizeq*2]
-  mova      m3, [uqcq+sizeq*2+mmsize]
-  mova      m1, [dqcq+sizeq*2+mmsize]
-  psubw     m0, m2
-  psubw     m1, m3
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-  pmaddwd   m0, m0
-  pmaddwd   m1, m1
-  ; accumulate in 64bit
-  punpckldq m3, m0, m5
-  punpckhdq m0, m5
-  paddq     m4, m3
-  punpckldq m3, m1, m5
-  paddq     m4, m0
-  punpckhdq m1, m5
-  paddq     m4, m3
-  paddq     m4, m1
-  add    sizeq, mmsize
-  jl .loop
-
-  ; accumulate horizontally and store in return value
-  movhlps   m5, m4
-  paddq     m4, m5
-%if ARCH_X86_64
-  movq    rax, m4
-%else
-  pshufd   m5, m4, 0x1
-  movd    eax, m4
-  movd    edx, m5
-%endif
-  RET
diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 000000000..65fa46311
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+// Byte-boundary alignment issues
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+#define CALC_CRC(op, crc, type, buf, len) \
+  while ((len) >= sizeof(type)) {         \
+    (crc) = op((crc), *(type *)(buf));    \
+    (len) -= sizeof(type);                \
+    buf += sizeof(type);                  \
+  }
+
+/**
+ * Calculates 32-bit CRC for the input buffer
+ * polynomial is 0x11EDC6F41
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
+                                     size_t len) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  uint32_t crc = 0xFFFFFFFF;
+
+  // Align the input to the word boundary
+  for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) {
+    crc = _mm_crc32_u8(crc, *buf);
+  }
+
+#ifdef __x86_64__
+  uint64_t crc64 = crc;
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+  crc = (uint32_t)crc64;
+#endif
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+  return (crc ^= 0xFFFFFFFF);
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index b684f7a3a..4cd6371a6 100644
--- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -11,11 +11,12 @@
 #include <assert.h>
 #include <smmintrin.h> /* SSE4.1 */
 
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "av1/common/av1_fwd_txfm1d_cfg.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
 #include "av1/common/av1_txfm.h"
 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
 #include "aom_dsp/txfm_common.h"
 #include "aom_dsp/x86/txfm_common_sse2.h"
 #include "aom_ports/mem.h"
@@ -121,72 +122,57 @@ static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
 }
 
 static void fadst4x4_sse4_1(__m128i *in, int bit) {
-  const int32_t *cospi = cospi_arr(bit);
-  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
-  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
-  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
-  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
-  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const int32_t *sinpi = sinpi_arr(bit);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const __m128i kZero = _mm_setzero_si128();
-  __m128i s0, s1, s2, s3;
+  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+  __m128i t;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i x0, x1, x2, x3;
   __m128i u0, u1, u2, u3;
   __m128i v0, v1, v2, v3;
 
-  // stage 0
-  // stage 1
-  // stage 2
-  u0 = _mm_mullo_epi32(in[3], cospi8);
-  u1 = _mm_mullo_epi32(in[0], cospi56);
-  u2 = _mm_add_epi32(u0, u1);
-  s0 = _mm_add_epi32(u2, rnding);
-  s0 = _mm_srai_epi32(s0, bit);
-
-  v0 = _mm_mullo_epi32(in[3], cospi56);
-  v1 = _mm_mullo_epi32(in[0], cospi8);
-  v2 = _mm_sub_epi32(v0, v1);
-  s1 = _mm_add_epi32(v2, rnding);
-  s1 = _mm_srai_epi32(s1, bit);
-
-  u0 = _mm_mullo_epi32(in[1], cospi40);
-  u1 = _mm_mullo_epi32(in[2], cospi24);
-  u2 = _mm_add_epi32(u0, u1);
-  s2 = _mm_add_epi32(u2, rnding);
-  s2 = _mm_srai_epi32(s2, bit);
-
-  v0 = _mm_mullo_epi32(in[1], cospi24);
-  v1 = _mm_mullo_epi32(in[2], cospi40);
-  v2 = _mm_sub_epi32(v0, v1);
-  s3 = _mm_add_epi32(v2, rnding);
-  s3 = _mm_srai_epi32(s3, bit);
-
-  // stage 3
-  u0 = _mm_add_epi32(s0, s2);
-  u2 = _mm_sub_epi32(s0, s2);
-  u1 = _mm_add_epi32(s1, s3);
-  u3 = _mm_sub_epi32(s1, s3);
-
-  // stage 4
-  v0 = _mm_mullo_epi32(u2, cospi32);
-  v1 = _mm_mullo_epi32(u3, cospi32);
-  v2 = _mm_add_epi32(v0, v1);
-  s2 = _mm_add_epi32(v2, rnding);
-  u2 = _mm_srai_epi32(s2, bit);
+  s0 = _mm_mullo_epi32(in[0], sinpi1);
+  s1 = _mm_mullo_epi32(in[0], sinpi4);
+  s2 = _mm_mullo_epi32(in[1], sinpi2);
+  s3 = _mm_mullo_epi32(in[1], sinpi1);
+  s4 = _mm_mullo_epi32(in[2], sinpi3);
+  s5 = _mm_mullo_epi32(in[3], sinpi4);
+  s6 = _mm_mullo_epi32(in[3], sinpi2);
+  t = _mm_add_epi32(in[0], in[1]);
+  s7 = _mm_sub_epi32(t, in[3]);
+
+  t = _mm_add_epi32(s0, s2);
+  x0 = _mm_add_epi32(t, s5);
+  x1 = _mm_mullo_epi32(s7, sinpi3);
+  t = _mm_sub_epi32(s1, s3);
+  x2 = _mm_add_epi32(t, s6);
+  x3 = s4;
+
+  s0 = _mm_add_epi32(x0, x3);
+  s1 = x1;
+  s2 = _mm_sub_epi32(x2, x3);
+  t = _mm_sub_epi32(x2, x0);
+  s3 = _mm_add_epi32(t, x3);
+
+  u0 = _mm_add_epi32(s0, rnding);
+  u0 = _mm_srai_epi32(u0, bit);
+
+  u1 = _mm_add_epi32(s1, rnding);
+  u1 = _mm_srai_epi32(u1, bit);
+
+  u2 = _mm_add_epi32(s2, rnding);
+  u2 = _mm_srai_epi32(u2, bit);
+
+  u3 = _mm_add_epi32(s3, rnding);
+  u3 = _mm_srai_epi32(u3, bit);
 
-  v2 = _mm_sub_epi32(v0, v1);
-  s3 = _mm_add_epi32(v2, rnding);
-  u3 = _mm_srai_epi32(s3, bit);
-
-  // u0, u1, u2, u3
-  u2 = _mm_sub_epi32(kZero, u2);
-  u1 = _mm_sub_epi32(kZero, u1);
-
-  // u0, u2, u3, u1
-  // Transpose 4x4 32-bit
-  v0 = _mm_unpacklo_epi32(u0, u2);
-  v1 = _mm_unpackhi_epi32(u0, u2);
-  v2 = _mm_unpacklo_epi32(u3, u1);
-  v3 = _mm_unpackhi_epi32(u3, u1);
+  v0 = _mm_unpacklo_epi32(u0, u1);
+  v1 = _mm_unpackhi_epi32(u0, u1);
+  v2 = _mm_unpacklo_epi32(u2, u3);
+  v3 = _mm_unpackhi_epi32(u2, u3);
 
   in[0] = _mm_unpacklo_epi64(v0, v2);
   in[1] = _mm_unpackhi_epi64(v0, v2);
@@ -197,84 +183,65 @@ static void fadst4x4_sse4_1(__m128i *in, int bit) {
 void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
                                int input_stride, TX_TYPE tx_type, int bd) {
   __m128i in[4];
-  const TXFM_1D_CFG *row_cfg = NULL;
-  const TXFM_1D_CFG *col_cfg = NULL;
+  const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
 
   switch (tx_type) {
     case DCT_DCT:
-      row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
-      col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
-      load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
-      fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
+      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+      fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
       write_buffer_4x4(in, coeff);
       break;
     case ADST_DCT:
-      row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
-      load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
-      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
+      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
       write_buffer_4x4(in, coeff);
       break;
     case DCT_ADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
-      col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
-      load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
-      fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+      fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
       write_buffer_4x4(in, coeff);
       break;
     case ADST_ADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
-      load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
-      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
       write_buffer_4x4(in, coeff);
       break;
-#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
-      load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]);
-      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
+      load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
       write_buffer_4x4(in, coeff);
       break;
     case DCT_FLIPADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
-      col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
-      load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]);
-      fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+      load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+      fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
       write_buffer_4x4(in, coeff);
       break;
     case FLIPADST_FLIPADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
-      load_buffer_4x4(input, in, input_stride, 1, 1, row_cfg->shift[0]);
-      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+      load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
       write_buffer_4x4(in, coeff);
       break;
     case ADST_FLIPADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
-      load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]);
-      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+      load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
       write_buffer_4x4(in, coeff);
       break;
     case FLIPADST_ADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
-      load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]);
-      fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
-      fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
+      load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+      fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
       write_buffer_4x4(in, coeff);
       break;
-#endif
     default: assert(0);
   }
   (void)bd;
@@ -624,415 +591,274 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
 
 static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
   const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
-  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
-  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
-  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
-  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
-  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const __m128i kZero = _mm_setzero_si128();
-  __m128i u[8], v[8], x;
-
-  // Even 8 points: 0, 2, ..., 14
-  // stage 0
-  // stage 1
-  // stage 2
-  // (1)
-  u[0] = _mm_mullo_epi32(in[14], cospi4);
-  x = _mm_mullo_epi32(in[0], cospi60);
-  u[0] = _mm_add_epi32(u[0], x);
-  u[0] = _mm_add_epi32(u[0], rnding);
-  u[0] = _mm_srai_epi32(u[0], bit);
-
-  u[1] = _mm_mullo_epi32(in[14], cospi60);
-  x = _mm_mullo_epi32(in[0], cospi4);
-  u[1] = _mm_sub_epi32(u[1], x);
-  u[1] = _mm_add_epi32(u[1], rnding);
-  u[1] = _mm_srai_epi32(u[1], bit);
-
-  // (2)
-  u[2] = _mm_mullo_epi32(in[10], cospi20);
-  x = _mm_mullo_epi32(in[4], cospi44);
-  u[2] = _mm_add_epi32(u[2], x);
-  u[2] = _mm_add_epi32(u[2], rnding);
-  u[2] = _mm_srai_epi32(u[2], bit);
-
-  u[3] = _mm_mullo_epi32(in[10], cospi44);
-  x = _mm_mullo_epi32(in[4], cospi20);
-  u[3] = _mm_sub_epi32(u[3], x);
-  u[3] = _mm_add_epi32(u[3], rnding);
-  u[3] = _mm_srai_epi32(u[3], bit);
-
-  // (3)
-  u[4] = _mm_mullo_epi32(in[6], cospi36);
-  x = _mm_mullo_epi32(in[8], cospi28);
-  u[4] = _mm_add_epi32(u[4], x);
-  u[4] = _mm_add_epi32(u[4], rnding);
-  u[4] = _mm_srai_epi32(u[4], bit);
-
-  u[5] = _mm_mullo_epi32(in[6], cospi28);
-  x = _mm_mullo_epi32(in[8], cospi36);
-  u[5] = _mm_sub_epi32(u[5], x);
-  u[5] = _mm_add_epi32(u[5], rnding);
-  u[5] = _mm_srai_epi32(u[5], bit);
-
-  // (4)
-  u[6] = _mm_mullo_epi32(in[2], cospi52);
-  x = _mm_mullo_epi32(in[12], cospi12);
-  u[6] = _mm_add_epi32(u[6], x);
-  u[6] = _mm_add_epi32(u[6], rnding);
-  u[6] = _mm_srai_epi32(u[6], bit);
-
-  u[7] = _mm_mullo_epi32(in[2], cospi12);
-  x = _mm_mullo_epi32(in[12], cospi52);
-  u[7] = _mm_sub_epi32(u[7], x);
-  u[7] = _mm_add_epi32(u[7], rnding);
-  u[7] = _mm_srai_epi32(u[7], bit);
-
-  // stage 3
-  v[0] = _mm_add_epi32(u[0], u[4]);
-  v[4] = _mm_sub_epi32(u[0], u[4]);
-  v[1] = _mm_add_epi32(u[1], u[5]);
-  v[5] = _mm_sub_epi32(u[1], u[5]);
-  v[2] = _mm_add_epi32(u[2], u[6]);
-  v[6] = _mm_sub_epi32(u[2], u[6]);
-  v[3] = _mm_add_epi32(u[3], u[7]);
-  v[7] = _mm_sub_epi32(u[3], u[7]);
-
-  // stage 4
-  u[0] = v[0];
-  u[1] = v[1];
-  u[2] = v[2];
-  u[3] = v[3];
-
-  u[4] = _mm_mullo_epi32(v[4], cospi16);
-  x = _mm_mullo_epi32(v[5], cospi48);
-  u[4] = _mm_add_epi32(u[4], x);
-  u[4] = _mm_add_epi32(u[4], rnding);
-  u[4] = _mm_srai_epi32(u[4], bit);
-
-  u[5] = _mm_mullo_epi32(v[4], cospi48);
-  x = _mm_mullo_epi32(v[5], cospi16);
-  u[5] = _mm_sub_epi32(u[5], x);
-  u[5] = _mm_add_epi32(u[5], rnding);
-  u[5] = _mm_srai_epi32(u[5], bit);
-
-  u[6] = _mm_mullo_epi32(v[6], cospim48);
-  x = _mm_mullo_epi32(v[7], cospi16);
-  u[6] = _mm_add_epi32(u[6], x);
-  u[6] = _mm_add_epi32(u[6], rnding);
-  u[6] = _mm_srai_epi32(u[6], bit);
-
-  u[7] = _mm_mullo_epi32(v[6], cospi16);
-  x = _mm_mullo_epi32(v[7], cospim48);
-  u[7] = _mm_sub_epi32(u[7], x);
-  u[7] = _mm_add_epi32(u[7], rnding);
-  u[7] = _mm_srai_epi32(u[7], bit);
-
-  // stage 5
-  v[0] = _mm_add_epi32(u[0], u[2]);
-  v[2] = _mm_sub_epi32(u[0], u[2]);
-  v[1] = _mm_add_epi32(u[1], u[3]);
-  v[3] = _mm_sub_epi32(u[1], u[3]);
-  v[4] = _mm_add_epi32(u[4], u[6]);
-  v[6] = _mm_sub_epi32(u[4], u[6]);
-  v[5] = _mm_add_epi32(u[5], u[7]);
-  v[7] = _mm_sub_epi32(u[5], u[7]);
-
-  // stage 6
-  u[0] = v[0];
-  u[1] = v[1];
-  u[4] = v[4];
-  u[5] = v[5];
-
-  v[0] = _mm_mullo_epi32(v[2], cospi32);
-  x = _mm_mullo_epi32(v[3], cospi32);
-  u[2] = _mm_add_epi32(v[0], x);
-  u[2] = _mm_add_epi32(u[2], rnding);
-  u[2] = _mm_srai_epi32(u[2], bit);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x, y;
+  int col;
 
-  u[3] = _mm_sub_epi32(v[0], x);
-  u[3] = _mm_add_epi32(u[3], rnding);
-  u[3] = _mm_srai_epi32(u[3], bit);
+  // Note:
+  //  Even column: 0, 2, ..., 14
+  //  Odd column: 1, 3, ..., 15
+  //  one even column plus one odd column constructs one row (8 coeffs)
+  //  total we have 8 rows (8x8).
+  for (col = 0; col < 2; ++col) {
+    // stage 0
+    // stage 1
+    u0 = in[2 * 0 + col];
+    u1 = _mm_sub_epi32(zero, in[2 * 7 + col]);
+    u2 = _mm_sub_epi32(zero, in[2 * 3 + col]);
+    u3 = in[2 * 4 + col];
+    u4 = _mm_sub_epi32(zero, in[2 * 1 + col]);
+    u5 = in[2 * 6 + col];
+    u6 = in[2 * 2 + col];
+    u7 = _mm_sub_epi32(zero, in[2 * 5 + col]);
 
-  v[0] = _mm_mullo_epi32(v[6], cospi32);
-  x = _mm_mullo_epi32(v[7], cospi32);
-  u[6] = _mm_add_epi32(v[0], x);
-  u[6] = _mm_add_epi32(u[6], rnding);
-  u[6] = _mm_srai_epi32(u[6], bit);
-
-  u[7] = _mm_sub_epi32(v[0], x);
-  u[7] = _mm_add_epi32(u[7], rnding);
-  u[7] = _mm_srai_epi32(u[7], bit);
-
-  // stage 7
-  out[0] = u[0];
-  out[2] = _mm_sub_epi32(kZero, u[4]);
-  out[4] = u[6];
-  out[6] = _mm_sub_epi32(kZero, u[2]);
-  out[8] = u[3];
-  out[10] = _mm_sub_epi32(kZero, u[7]);
-  out[12] = u[5];
-  out[14] = _mm_sub_epi32(kZero, u[1]);
+    // stage 2
+    v0 = u0;
+    v1 = u1;
 
-  // Odd 8 points: 1, 3, ..., 15
-  // stage 0
-  // stage 1
-  // stage 2
-  // (1)
-  u[0] = _mm_mullo_epi32(in[15], cospi4);
-  x = _mm_mullo_epi32(in[1], cospi60);
-  u[0] = _mm_add_epi32(u[0], x);
-  u[0] = _mm_add_epi32(u[0], rnding);
-  u[0] = _mm_srai_epi32(u[0], bit);
+    x = _mm_mullo_epi32(u2, cospi32);
+    y = _mm_mullo_epi32(u3, cospi32);
+    v2 = _mm_add_epi32(x, y);
+    v2 = _mm_add_epi32(v2, rnding);
+    v2 = _mm_srai_epi32(v2, bit);
 
-  u[1] = _mm_mullo_epi32(in[15], cospi60);
-  x = _mm_mullo_epi32(in[1], cospi4);
-  u[1] = _mm_sub_epi32(u[1], x);
-  u[1] = _mm_add_epi32(u[1], rnding);
-  u[1] = _mm_srai_epi32(u[1], bit);
+    v3 = _mm_sub_epi32(x, y);
+    v3 = _mm_add_epi32(v3, rnding);
+    v3 = _mm_srai_epi32(v3, bit);
 
-  // (2)
-  u[2] = _mm_mullo_epi32(in[11], cospi20);
-  x = _mm_mullo_epi32(in[5], cospi44);
-  u[2] = _mm_add_epi32(u[2], x);
-  u[2] = _mm_add_epi32(u[2], rnding);
-  u[2] = _mm_srai_epi32(u[2], bit);
+    v4 = u4;
+    v5 = u5;
 
-  u[3] = _mm_mullo_epi32(in[11], cospi44);
-  x = _mm_mullo_epi32(in[5], cospi20);
-  u[3] = _mm_sub_epi32(u[3], x);
-  u[3] = _mm_add_epi32(u[3], rnding);
-  u[3] = _mm_srai_epi32(u[3], bit);
+    x = _mm_mullo_epi32(u6, cospi32);
+    y = _mm_mullo_epi32(u7, cospi32);
+    v6 = _mm_add_epi32(x, y);
+    v6 = _mm_add_epi32(v6, rnding);
+    v6 = _mm_srai_epi32(v6, bit);
 
-  // (3)
-  u[4] = _mm_mullo_epi32(in[7], cospi36);
-  x = _mm_mullo_epi32(in[9], cospi28);
-  u[4] = _mm_add_epi32(u[4], x);
-  u[4] = _mm_add_epi32(u[4], rnding);
-  u[4] = _mm_srai_epi32(u[4], bit);
-
-  u[5] = _mm_mullo_epi32(in[7], cospi28);
-  x = _mm_mullo_epi32(in[9], cospi36);
-  u[5] = _mm_sub_epi32(u[5], x);
-  u[5] = _mm_add_epi32(u[5], rnding);
-  u[5] = _mm_srai_epi32(u[5], bit);
-
-  // (4)
-  u[6] = _mm_mullo_epi32(in[3], cospi52);
-  x = _mm_mullo_epi32(in[13], cospi12);
-  u[6] = _mm_add_epi32(u[6], x);
-  u[6] = _mm_add_epi32(u[6], rnding);
-  u[6] = _mm_srai_epi32(u[6], bit);
-
-  u[7] = _mm_mullo_epi32(in[3], cospi12);
-  x = _mm_mullo_epi32(in[13], cospi52);
-  u[7] = _mm_sub_epi32(u[7], x);
-  u[7] = _mm_add_epi32(u[7], rnding);
-  u[7] = _mm_srai_epi32(u[7], bit);
+    v7 = _mm_sub_epi32(x, y);
+    v7 = _mm_add_epi32(v7, rnding);
+    v7 = _mm_srai_epi32(v7, bit);
 
-  // stage 3
-  v[0] = _mm_add_epi32(u[0], u[4]);
-  v[4] = _mm_sub_epi32(u[0], u[4]);
-  v[1] = _mm_add_epi32(u[1], u[5]);
-  v[5] = _mm_sub_epi32(u[1], u[5]);
-  v[2] = _mm_add_epi32(u[2], u[6]);
-  v[6] = _mm_sub_epi32(u[2], u[6]);
-  v[3] = _mm_add_epi32(u[3], u[7]);
-  v[7] = _mm_sub_epi32(u[3], u[7]);
+    // stage 3
+    u0 = _mm_add_epi32(v0, v2);
+    u1 = _mm_add_epi32(v1, v3);
+    u2 = _mm_sub_epi32(v0, v2);
+    u3 = _mm_sub_epi32(v1, v3);
+    u4 = _mm_add_epi32(v4, v6);
+    u5 = _mm_add_epi32(v5, v7);
+    u6 = _mm_sub_epi32(v4, v6);
+    u7 = _mm_sub_epi32(v5, v7);
 
-  // stage 4
-  u[0] = v[0];
-  u[1] = v[1];
-  u[2] = v[2];
-  u[3] = v[3];
-
-  u[4] = _mm_mullo_epi32(v[4], cospi16);
-  x = _mm_mullo_epi32(v[5], cospi48);
-  u[4] = _mm_add_epi32(u[4], x);
-  u[4] = _mm_add_epi32(u[4], rnding);
-  u[4] = _mm_srai_epi32(u[4], bit);
-
-  u[5] = _mm_mullo_epi32(v[4], cospi48);
-  x = _mm_mullo_epi32(v[5], cospi16);
-  u[5] = _mm_sub_epi32(u[5], x);
-  u[5] = _mm_add_epi32(u[5], rnding);
-  u[5] = _mm_srai_epi32(u[5], bit);
-
-  u[6] = _mm_mullo_epi32(v[6], cospim48);
-  x = _mm_mullo_epi32(v[7], cospi16);
-  u[6] = _mm_add_epi32(u[6], x);
-  u[6] = _mm_add_epi32(u[6], rnding);
-  u[6] = _mm_srai_epi32(u[6], bit);
-
-  u[7] = _mm_mullo_epi32(v[6], cospi16);
-  x = _mm_mullo_epi32(v[7], cospim48);
-  u[7] = _mm_sub_epi32(u[7], x);
-  u[7] = _mm_add_epi32(u[7], rnding);
-  u[7] = _mm_srai_epi32(u[7], bit);
+    // stage 4
+    v0 = u0;
+    v1 = u1;
+    v2 = u2;
+    v3 = u3;
+
+    x = _mm_mullo_epi32(u4, cospi16);
+    y = _mm_mullo_epi32(u5, cospi48);
+    v4 = _mm_add_epi32(x, y);
+    v4 = _mm_add_epi32(v4, rnding);
+    v4 = _mm_srai_epi32(v4, bit);
+
+    x = _mm_mullo_epi32(u4, cospi48);
+    y = _mm_mullo_epi32(u5, cospim16);
+    v5 = _mm_add_epi32(x, y);
+    v5 = _mm_add_epi32(v5, rnding);
+    v5 = _mm_srai_epi32(v5, bit);
+
+    x = _mm_mullo_epi32(u6, cospim48);
+    y = _mm_mullo_epi32(u7, cospi16);
+    v6 = _mm_add_epi32(x, y);
+    v6 = _mm_add_epi32(v6, rnding);
+    v6 = _mm_srai_epi32(v6, bit);
+
+    x = _mm_mullo_epi32(u6, cospi16);
+    y = _mm_mullo_epi32(u7, cospi48);
+    v7 = _mm_add_epi32(x, y);
+    v7 = _mm_add_epi32(v7, rnding);
+    v7 = _mm_srai_epi32(v7, bit);
 
-  // stage 5
-  v[0] = _mm_add_epi32(u[0], u[2]);
-  v[2] = _mm_sub_epi32(u[0], u[2]);
-  v[1] = _mm_add_epi32(u[1], u[3]);
-  v[3] = _mm_sub_epi32(u[1], u[3]);
-  v[4] = _mm_add_epi32(u[4], u[6]);
-  v[6] = _mm_sub_epi32(u[4], u[6]);
-  v[5] = _mm_add_epi32(u[5], u[7]);
-  v[7] = _mm_sub_epi32(u[5], u[7]);
-
-  // stage 6
-  u[0] = v[0];
-  u[1] = v[1];
-  u[4] = v[4];
-  u[5] = v[5];
-
-  v[0] = _mm_mullo_epi32(v[2], cospi32);
-  x = _mm_mullo_epi32(v[3], cospi32);
-  u[2] = _mm_add_epi32(v[0], x);
-  u[2] = _mm_add_epi32(u[2], rnding);
-  u[2] = _mm_srai_epi32(u[2], bit);
+    // stage 5
+    u0 = _mm_add_epi32(v0, v4);
+    u1 = _mm_add_epi32(v1, v5);
+    u2 = _mm_add_epi32(v2, v6);
+    u3 = _mm_add_epi32(v3, v7);
+    u4 = _mm_sub_epi32(v0, v4);
+    u5 = _mm_sub_epi32(v1, v5);
+    u6 = _mm_sub_epi32(v2, v6);
+    u7 = _mm_sub_epi32(v3, v7);
 
-  u[3] = _mm_sub_epi32(v[0], x);
-  u[3] = _mm_add_epi32(u[3], rnding);
-  u[3] = _mm_srai_epi32(u[3], bit);
+    // stage 6
+    x = _mm_mullo_epi32(u0, cospi4);
+    y = _mm_mullo_epi32(u1, cospi60);
+    v0 = _mm_add_epi32(x, y);
+    v0 = _mm_add_epi32(v0, rnding);
+    v0 = _mm_srai_epi32(v0, bit);
+
+    x = _mm_mullo_epi32(u0, cospi60);
+    y = _mm_mullo_epi32(u1, cospim4);
+    v1 = _mm_add_epi32(x, y);
+    v1 = _mm_add_epi32(v1, rnding);
+    v1 = _mm_srai_epi32(v1, bit);
+
+    x = _mm_mullo_epi32(u2, cospi20);
+    y = _mm_mullo_epi32(u3, cospi44);
+    v2 = _mm_add_epi32(x, y);
+    v2 = _mm_add_epi32(v2, rnding);
+    v2 = _mm_srai_epi32(v2, bit);
+
+    x = _mm_mullo_epi32(u2, cospi44);
+    y = _mm_mullo_epi32(u3, cospim20);
+    v3 = _mm_add_epi32(x, y);
+    v3 = _mm_add_epi32(v3, rnding);
+    v3 = _mm_srai_epi32(v3, bit);
+
+    x = _mm_mullo_epi32(u4, cospi36);
+    y = _mm_mullo_epi32(u5, cospi28);
+    v4 = _mm_add_epi32(x, y);
+    v4 = _mm_add_epi32(v4, rnding);
+    v4 = _mm_srai_epi32(v4, bit);
+
+    x = _mm_mullo_epi32(u4, cospi28);
+    y = _mm_mullo_epi32(u5, cospim36);
+    v5 = _mm_add_epi32(x, y);
+    v5 = _mm_add_epi32(v5, rnding);
+    v5 = _mm_srai_epi32(v5, bit);
+
+    x = _mm_mullo_epi32(u6, cospi52);
+    y = _mm_mullo_epi32(u7, cospi12);
+    v6 = _mm_add_epi32(x, y);
+    v6 = _mm_add_epi32(v6, rnding);
+    v6 = _mm_srai_epi32(v6, bit);
+
+    x = _mm_mullo_epi32(u6, cospi12);
+    y = _mm_mullo_epi32(u7, cospim52);
+    v7 = _mm_add_epi32(x, y);
+    v7 = _mm_add_epi32(v7, rnding);
+    v7 = _mm_srai_epi32(v7, bit);
 
-  v[0] = _mm_mullo_epi32(v[6], cospi32);
-  x = _mm_mullo_epi32(v[7], cospi32);
-  u[6] = _mm_add_epi32(v[0], x);
-  u[6] = _mm_add_epi32(u[6], rnding);
-  u[6] = _mm_srai_epi32(u[6], bit);
-
-  u[7] = _mm_sub_epi32(v[0], x);
-  u[7] = _mm_add_epi32(u[7], rnding);
-  u[7] = _mm_srai_epi32(u[7], bit);
-
-  // stage 7
-  out[1] = u[0];
-  out[3] = _mm_sub_epi32(kZero, u[4]);
-  out[5] = u[6];
-  out[7] = _mm_sub_epi32(kZero, u[2]);
-  out[9] = u[3];
-  out[11] = _mm_sub_epi32(kZero, u[7]);
-  out[13] = u[5];
-  out[15] = _mm_sub_epi32(kZero, u[1]);
+    // stage 7
+    out[2 * 0 + col] = v1;
+    out[2 * 1 + col] = v6;
+    out[2 * 2 + col] = v3;
+    out[2 * 3 + col] = v4;
+    out[2 * 4 + col] = v5;
+    out[2 * 5 + col] = v2;
+    out[2 * 6 + col] = v7;
+    out[2 * 7 + col] = v0;
+  }
 }
 
 void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
   __m128i in[16], out[16];
-  const TXFM_1D_CFG *row_cfg = NULL;
-  const TXFM_1D_CFG *col_cfg = NULL;
+  const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
 
   switch (tx_type) {
     case DCT_DCT:
-      row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
-      col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
-      load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
-      fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
-      col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case ADST_DCT:
-      row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
-      load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
-      fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
-      col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case DCT_ADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
-      col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
-      load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
-      fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
-      col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case ADST_ADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
-      load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
-      fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
-      col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
-#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
-      load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]);
-      fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
-      col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case DCT_FLIPADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
-      col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
-      load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]);
-      fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
-      col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case FLIPADST_FLIPADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
-      load_buffer_8x8(input, in, stride, 1, 1, row_cfg->shift[0]);
-      fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
-      col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+      load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case ADST_FLIPADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
-      load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]);
-      fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
-      col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
     case FLIPADST_ADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
-      load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]);
-      fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
-      col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
+      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_8x8_rounding(out, -shift[1]);
       transpose_8x8(out, in);
-      fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
-#endif  // CONFIG_EXT_TX
     default: assert(0);
   }
   (void)bd;
@@ -1402,230 +1228,174 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
 
 static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
   const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
   const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
   const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
   const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
   const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
-  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
-  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
-  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
-  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
-  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
-  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
-  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
-  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
-  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
-  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i zero = _mm_setzero_si128();
+
   __m128i u[16], v[16], x, y;
-  const int col_num = 4;
   int col;
 
-  // Calculate the column 0, 1, 2, 3
-  for (col = 0; col < col_num; ++col) {
+  for (col = 0; col < 4; ++col) {
     // stage 0
     // stage 1
-    // stage 2
-    v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
-    x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
-    v[0] = _mm_add_epi32(v[0], x);
-    v[0] = _mm_add_epi32(v[0], rnding);
-    v[0] = _mm_srai_epi32(v[0], bit);
+    u[0] = in[0 * 4 + col];
+    u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
+    u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
+    u[3] = in[8 * 4 + col];
+    u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
+    u[5] = in[12 * 4 + col];
+    u[6] = in[4 * 4 + col];
+    u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
+    u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
+    u[9] = in[14 * 4 + col];
+    u[10] = in[6 * 4 + col];
+    u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
+    u[12] = in[2 * 4 + col];
+    u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
+    u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
+    u[15] = in[10 * 4 + col];
 
-    v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
-    x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
-    v[1] = _mm_sub_epi32(v[1], x);
-    v[1] = _mm_add_epi32(v[1], rnding);
-    v[1] = _mm_srai_epi32(v[1], bit);
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
 
-    v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
-    x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
-    v[2] = _mm_add_epi32(v[2], x);
+    x = _mm_mullo_epi32(u[2], cospi32);
+    y = _mm_mullo_epi32(u[3], cospi32);
+    v[2] = _mm_add_epi32(x, y);
     v[2] = _mm_add_epi32(v[2], rnding);
     v[2] = _mm_srai_epi32(v[2], bit);
 
-    v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
-    x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
-    v[3] = _mm_sub_epi32(v[3], x);
+    v[3] = _mm_sub_epi32(x, y);
     v[3] = _mm_add_epi32(v[3], rnding);
     v[3] = _mm_srai_epi32(v[3], bit);
 
-    v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
-    x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
-    v[4] = _mm_add_epi32(v[4], x);
-    v[4] = _mm_add_epi32(v[4], rnding);
-    v[4] = _mm_srai_epi32(v[4], bit);
-
-    v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
-    x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
-    v[5] = _mm_sub_epi32(v[5], x);
-    v[5] = _mm_add_epi32(v[5], rnding);
-    v[5] = _mm_srai_epi32(v[5], bit);
-
-    v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
-    x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
-    v[6] = _mm_add_epi32(v[6], x);
+    v[4] = u[4];
+    v[5] = u[5];
+
+    x = _mm_mullo_epi32(u[6], cospi32);
+    y = _mm_mullo_epi32(u[7], cospi32);
+    v[6] = _mm_add_epi32(x, y);
     v[6] = _mm_add_epi32(v[6], rnding);
     v[6] = _mm_srai_epi32(v[6], bit);
 
-    v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
-    x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
-    v[7] = _mm_sub_epi32(v[7], x);
+    v[7] = _mm_sub_epi32(x, y);
     v[7] = _mm_add_epi32(v[7], rnding);
     v[7] = _mm_srai_epi32(v[7], bit);
 
-    v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
-    x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
-    v[8] = _mm_add_epi32(v[8], x);
-    v[8] = _mm_add_epi32(v[8], rnding);
-    v[8] = _mm_srai_epi32(v[8], bit);
-
-    v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
-    x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
-    v[9] = _mm_sub_epi32(v[9], x);
-    v[9] = _mm_add_epi32(v[9], rnding);
-    v[9] = _mm_srai_epi32(v[9], bit);
+    v[8] = u[8];
+    v[9] = u[9];
 
-    v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
-    x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
-    v[10] = _mm_add_epi32(v[10], x);
+    x = _mm_mullo_epi32(u[10], cospi32);
+    y = _mm_mullo_epi32(u[11], cospi32);
+    v[10] = _mm_add_epi32(x, y);
     v[10] = _mm_add_epi32(v[10], rnding);
     v[10] = _mm_srai_epi32(v[10], bit);
 
-    v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
-    x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
-    v[11] = _mm_sub_epi32(v[11], x);
+    v[11] = _mm_sub_epi32(x, y);
     v[11] = _mm_add_epi32(v[11], rnding);
     v[11] = _mm_srai_epi32(v[11], bit);
 
-    v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
-    x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
-
-    v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
-    x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
+    v[12] = u[12];
+    v[13] = u[13];
 
-    v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
-    x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
-    v[14] = _mm_add_epi32(v[14], x);
+    x = _mm_mullo_epi32(u[14], cospi32);
+    y = _mm_mullo_epi32(u[15], cospi32);
+    v[14] = _mm_add_epi32(x, y);
     v[14] = _mm_add_epi32(v[14], rnding);
     v[14] = _mm_srai_epi32(v[14], bit);
 
-    v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
-    x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
-    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_sub_epi32(x, y);
     v[15] = _mm_add_epi32(v[15], rnding);
     v[15] = _mm_srai_epi32(v[15], bit);
 
     // stage 3
-    u[0] = _mm_add_epi32(v[0], v[8]);
-    u[8] = _mm_sub_epi32(v[0], v[8]);
-    u[1] = _mm_add_epi32(v[1], v[9]);
-    u[9] = _mm_sub_epi32(v[1], v[9]);
-    u[2] = _mm_add_epi32(v[2], v[10]);
-    u[10] = _mm_sub_epi32(v[2], v[10]);
-    u[3] = _mm_add_epi32(v[3], v[11]);
-    u[11] = _mm_sub_epi32(v[3], v[11]);
-    u[4] = _mm_add_epi32(v[4], v[12]);
-    u[12] = _mm_sub_epi32(v[4], v[12]);
-    u[5] = _mm_add_epi32(v[5], v[13]);
-    u[13] = _mm_sub_epi32(v[5], v[13]);
-    u[6] = _mm_add_epi32(v[6], v[14]);
-    u[14] = _mm_sub_epi32(v[6], v[14]);
-    u[7] = _mm_add_epi32(v[7], v[15]);
-    u[15] = _mm_sub_epi32(v[7], v[15]);
+    u[0] = _mm_add_epi32(v[0], v[2]);
+    u[1] = _mm_add_epi32(v[1], v[3]);
+    u[2] = _mm_sub_epi32(v[0], v[2]);
+    u[3] = _mm_sub_epi32(v[1], v[3]);
+    u[4] = _mm_add_epi32(v[4], v[6]);
+    u[5] = _mm_add_epi32(v[5], v[7]);
+    u[6] = _mm_sub_epi32(v[4], v[6]);
+    u[7] = _mm_sub_epi32(v[5], v[7]);
+    u[8] = _mm_add_epi32(v[8], v[10]);
+    u[9] = _mm_add_epi32(v[9], v[11]);
+    u[10] = _mm_sub_epi32(v[8], v[10]);
+    u[11] = _mm_sub_epi32(v[9], v[11]);
+    u[12] = _mm_add_epi32(v[12], v[14]);
+    u[13] = _mm_add_epi32(v[13], v[15]);
+    u[14] = _mm_sub_epi32(v[12], v[14]);
+    u[15] = _mm_sub_epi32(v[13], v[15]);
 
     // stage 4
     v[0] = u[0];
     v[1] = u[1];
     v[2] = u[2];
     v[3] = u[3];
-    v[4] = u[4];
-    v[5] = u[5];
-    v[6] = u[6];
-    v[7] = u[7];
-
-    v[8] = _mm_mullo_epi32(u[8], cospi8);
-    x = _mm_mullo_epi32(u[9], cospi56);
-    v[8] = _mm_add_epi32(v[8], x);
-    v[8] = _mm_add_epi32(v[8], rnding);
-    v[8] = _mm_srai_epi32(v[8], bit);
-
-    v[9] = _mm_mullo_epi32(u[8], cospi56);
-    x = _mm_mullo_epi32(u[9], cospi8);
-    v[9] = _mm_sub_epi32(v[9], x);
-    v[9] = _mm_add_epi32(v[9], rnding);
-    v[9] = _mm_srai_epi32(v[9], bit);
-
-    v[10] = _mm_mullo_epi32(u[10], cospi40);
-    x = _mm_mullo_epi32(u[11], cospi24);
-    v[10] = _mm_add_epi32(v[10], x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
-
-    v[11] = _mm_mullo_epi32(u[10], cospi24);
-    x = _mm_mullo_epi32(u[11], cospi40);
-    v[11] = _mm_sub_epi32(v[11], x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
-
-    v[12] = _mm_mullo_epi32(u[12], cospim56);
-    x = _mm_mullo_epi32(u[13], cospi8);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
-
-    v[13] = _mm_mullo_epi32(u[12], cospi8);
-    x = _mm_mullo_epi32(u[13], cospim56);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
-
-    v[14] = _mm_mullo_epi32(u[14], cospim24);
-    x = _mm_mullo_epi32(u[15], cospi40);
-    v[14] = _mm_add_epi32(v[14], x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
-
-    v[15] = _mm_mullo_epi32(u[14], cospi40);
-    x = _mm_mullo_epi32(u[15], cospim24);
-    v[15] = _mm_sub_epi32(v[15], x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+    v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+    v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+    v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+    v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+    v[8] = u[8];
+    v[9] = u[9];
+    v[10] = u[10];
+    v[11] = u[11];
+    v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+    v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+    v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+    v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
 
     // stage 5
     u[0] = _mm_add_epi32(v[0], v[4]);
-    u[4] = _mm_sub_epi32(v[0], v[4]);
     u[1] = _mm_add_epi32(v[1], v[5]);
-    u[5] = _mm_sub_epi32(v[1], v[5]);
     u[2] = _mm_add_epi32(v[2], v[6]);
-    u[6] = _mm_sub_epi32(v[2], v[6]);
     u[3] = _mm_add_epi32(v[3], v[7]);
+    u[4] = _mm_sub_epi32(v[0], v[4]);
+    u[5] = _mm_sub_epi32(v[1], v[5]);
+    u[6] = _mm_sub_epi32(v[2], v[6]);
     u[7] = _mm_sub_epi32(v[3], v[7]);
     u[8] = _mm_add_epi32(v[8], v[12]);
-    u[12] = _mm_sub_epi32(v[8], v[12]);
     u[9] = _mm_add_epi32(v[9], v[13]);
-    u[13] = _mm_sub_epi32(v[9], v[13]);
     u[10] = _mm_add_epi32(v[10], v[14]);
-    u[14] = _mm_sub_epi32(v[10], v[14]);
     u[11] = _mm_add_epi32(v[11], v[15]);
+    u[12] = _mm_sub_epi32(v[8], v[12]);
+    u[13] = _mm_sub_epi32(v[9], v[13]);
+    u[14] = _mm_sub_epi32(v[10], v[14]);
     u[15] = _mm_sub_epi32(v[11], v[15]);
 
     // stage 6
@@ -1633,148 +1403,72 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
     v[1] = u[1];
     v[2] = u[2];
     v[3] = u[3];
-
-    v[4] = _mm_mullo_epi32(u[4], cospi16);
-    x = _mm_mullo_epi32(u[5], cospi48);
-    v[4] = _mm_add_epi32(v[4], x);
-    v[4] = _mm_add_epi32(v[4], rnding);
-    v[4] = _mm_srai_epi32(v[4], bit);
-
-    v[5] = _mm_mullo_epi32(u[4], cospi48);
-    x = _mm_mullo_epi32(u[5], cospi16);
-    v[5] = _mm_sub_epi32(v[5], x);
-    v[5] = _mm_add_epi32(v[5], rnding);
-    v[5] = _mm_srai_epi32(v[5], bit);
-
-    v[6] = _mm_mullo_epi32(u[6], cospim48);
-    x = _mm_mullo_epi32(u[7], cospi16);
-    v[6] = _mm_add_epi32(v[6], x);
-    v[6] = _mm_add_epi32(v[6], rnding);
-    v[6] = _mm_srai_epi32(v[6], bit);
-
-    v[7] = _mm_mullo_epi32(u[6], cospi16);
-    x = _mm_mullo_epi32(u[7], cospim48);
-    v[7] = _mm_sub_epi32(v[7], x);
-    v[7] = _mm_add_epi32(v[7], rnding);
-    v[7] = _mm_srai_epi32(v[7], bit);
-
-    v[8] = u[8];
-    v[9] = u[9];
-    v[10] = u[10];
-    v[11] = u[11];
-
-    v[12] = _mm_mullo_epi32(u[12], cospi16);
-    x = _mm_mullo_epi32(u[13], cospi48);
-    v[12] = _mm_add_epi32(v[12], x);
-    v[12] = _mm_add_epi32(v[12], rnding);
-    v[12] = _mm_srai_epi32(v[12], bit);
-
-    v[13] = _mm_mullo_epi32(u[12], cospi48);
-    x = _mm_mullo_epi32(u[13], cospi16);
-    v[13] = _mm_sub_epi32(v[13], x);
-    v[13] = _mm_add_epi32(v[13], rnding);
-    v[13] = _mm_srai_epi32(v[13], bit);
-
-    v[14] = _mm_mullo_epi32(u[14], cospim48);
-    x = _mm_mullo_epi32(u[15], cospi16);
-    v[14] = _mm_add_epi32(v[14], x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
-
-    v[15] = _mm_mullo_epi32(u[14], cospi16);
-    x = _mm_mullo_epi32(u[15], cospim48);
-    v[15] = _mm_sub_epi32(v[15], x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
-
-    // stage 7
-    u[0] = _mm_add_epi32(v[0], v[2]);
-    u[2] = _mm_sub_epi32(v[0], v[2]);
-    u[1] = _mm_add_epi32(v[1], v[3]);
-    u[3] = _mm_sub_epi32(v[1], v[3]);
-    u[4] = _mm_add_epi32(v[4], v[6]);
-    u[6] = _mm_sub_epi32(v[4], v[6]);
-    u[5] = _mm_add_epi32(v[5], v[7]);
-    u[7] = _mm_sub_epi32(v[5], v[7]);
-    u[8] = _mm_add_epi32(v[8], v[10]);
-    u[10] = _mm_sub_epi32(v[8], v[10]);
-    u[9] = _mm_add_epi32(v[9], v[11]);
-    u[11] = _mm_sub_epi32(v[9], v[11]);
-    u[12] = _mm_add_epi32(v[12], v[14]);
-    u[14] = _mm_sub_epi32(v[12], v[14]);
-    u[13] = _mm_add_epi32(v[13], v[15]);
-    u[15] = _mm_sub_epi32(v[13], v[15]);
-
-    // stage 8
-    v[0] = u[0];
-    v[1] = u[1];
-
-    y = _mm_mullo_epi32(u[2], cospi32);
-    x = _mm_mullo_epi32(u[3], cospi32);
-    v[2] = _mm_add_epi32(y, x);
-    v[2] = _mm_add_epi32(v[2], rnding);
-    v[2] = _mm_srai_epi32(v[2], bit);
-
-    v[3] = _mm_sub_epi32(y, x);
-    v[3] = _mm_add_epi32(v[3], rnding);
-    v[3] = _mm_srai_epi32(v[3], bit);
-
     v[4] = u[4];
     v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+    v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+    v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+    v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+    v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+    v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+    v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+    v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+    v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
 
-    y = _mm_mullo_epi32(u[6], cospi32);
-    x = _mm_mullo_epi32(u[7], cospi32);
-    v[6] = _mm_add_epi32(y, x);
-    v[6] = _mm_add_epi32(v[6], rnding);
-    v[6] = _mm_srai_epi32(v[6], bit);
-
-    v[7] = _mm_sub_epi32(y, x);
-    v[7] = _mm_add_epi32(v[7], rnding);
-    v[7] = _mm_srai_epi32(v[7], bit);
-
-    v[8] = u[8];
-    v[9] = u[9];
-
-    y = _mm_mullo_epi32(u[10], cospi32);
-    x = _mm_mullo_epi32(u[11], cospi32);
-    v[10] = _mm_add_epi32(y, x);
-    v[10] = _mm_add_epi32(v[10], rnding);
-    v[10] = _mm_srai_epi32(v[10], bit);
-
-    v[11] = _mm_sub_epi32(y, x);
-    v[11] = _mm_add_epi32(v[11], rnding);
-    v[11] = _mm_srai_epi32(v[11], bit);
-
-    v[12] = u[12];
-    v[13] = u[13];
-
-    y = _mm_mullo_epi32(u[14], cospi32);
-    x = _mm_mullo_epi32(u[15], cospi32);
-    v[14] = _mm_add_epi32(y, x);
-    v[14] = _mm_add_epi32(v[14], rnding);
-    v[14] = _mm_srai_epi32(v[14], bit);
+    // stage 7
+    u[0] = _mm_add_epi32(v[0], v[8]);
+    u[1] = _mm_add_epi32(v[1], v[9]);
+    u[2] = _mm_add_epi32(v[2], v[10]);
+    u[3] = _mm_add_epi32(v[3], v[11]);
+    u[4] = _mm_add_epi32(v[4], v[12]);
+    u[5] = _mm_add_epi32(v[5], v[13]);
+    u[6] = _mm_add_epi32(v[6], v[14]);
+    u[7] = _mm_add_epi32(v[7], v[15]);
+    u[8] = _mm_sub_epi32(v[0], v[8]);
+    u[9] = _mm_sub_epi32(v[1], v[9]);
+    u[10] = _mm_sub_epi32(v[2], v[10]);
+    u[11] = _mm_sub_epi32(v[3], v[11]);
+    u[12] = _mm_sub_epi32(v[4], v[12]);
+    u[13] = _mm_sub_epi32(v[5], v[13]);
+    u[14] = _mm_sub_epi32(v[6], v[14]);
+    u[15] = _mm_sub_epi32(v[7], v[15]);
 
-    v[15] = _mm_sub_epi32(y, x);
-    v[15] = _mm_add_epi32(v[15], rnding);
-    v[15] = _mm_srai_epi32(v[15], bit);
+    // stage 8
+    v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+    v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+    v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+    v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+    v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+    v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+    v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+    v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+    v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+    v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+    v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+    v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+    v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+    v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+    v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+    v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
 
     // stage 9
-    out[0 * col_num + col] = v[0];
-    out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]);
-    out[2 * col_num + col] = v[12];
-    out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]);
-    out[4 * col_num + col] = v[6];
-    out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]);
-    out[6 * col_num + col] = v[10];
-    out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]);
-    out[8 * col_num + col] = v[3];
-    out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]);
-    out[10 * col_num + col] = v[15];
-    out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]);
-    out[12 * col_num + col] = v[5];
-    out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]);
-    out[14 * col_num + col] = v[9];
-    out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]);
+    out[0 * 4 + col] = v[1];
+    out[1 * 4 + col] = v[14];
+    out[2 * 4 + col] = v[3];
+    out[3 * 4 + col] = v[12];
+    out[4 * 4 + col] = v[5];
+    out[5 * 4 + col] = v[10];
+    out[6 * 4 + col] = v[7];
+    out[7 * 4 + col] = v[8];
+    out[8 * 4 + col] = v[9];
+    out[9 * 4 + col] = v[6];
+    out[10 * 4 + col] = v[11];
+    out[11 * 4 + col] = v[4];
+    out[12 * 4 + col] = v[13];
+    out[13 * 4 + col] = v[2];
+    out[14 * 4 + col] = v[15];
+    out[15 * 4 + col] = v[0];
   }
 }
 
@@ -1802,111 +1496,91 @@ static void write_buffer_16x16(const __m128i *in, int32_t *output) {
 void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
                                  int stride, TX_TYPE tx_type, int bd) {
   __m128i in[64], out[64];
-  const TXFM_1D_CFG *row_cfg = NULL;
-  const TXFM_1D_CFG *col_cfg = NULL;
-
+  const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
+  const int txw_idx = get_txw_idx(TX_16X16);
+  const int txh_idx = get_txh_idx(TX_16X16);
   switch (tx_type) {
     case DCT_DCT:
-      row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
-      col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
-      load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
-      fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
-      col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case ADST_DCT:
-      row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
-      load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
-      fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
-      col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case DCT_ADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
-      col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
-      load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
-      fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
-      col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case ADST_ADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
-      load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]);
-      fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
-      col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
-#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      row_cfg = &fwd_txfm_1d_row_cfg_dct_16;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
-      load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]);
-      fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
-      col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case DCT_FLIPADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
-      col_cfg = &fwd_txfm_1d_col_cfg_dct_16;
-      load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]);
-      fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
-      col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case FLIPADST_FLIPADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
-      load_buffer_16x16(input, in, stride, 1, 1, row_cfg->shift[0]);
-      fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
-      col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+      load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case ADST_FLIPADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
-      load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]);
-      fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
-      col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
     case FLIPADST_ADST:
-      row_cfg = &fwd_txfm_1d_row_cfg_adst_16;
-      col_cfg = &fwd_txfm_1d_col_cfg_adst_16;
-      load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]);
-      fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]);
-      col_txfm_16x16_rounding(out, -row_cfg->shift[1]);
+      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+      col_txfm_16x16_rounding(out, -shift[1]);
       transpose_16x16(out, in);
-      fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
-#endif  // CONFIG_EXT_TX
     default: assert(0);
   }
   (void)bd;
diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
deleted file mode 100644
index 88621c82b..000000000
--- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ /dev/null
@@ -1,1627 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>  // avx2
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/fwd_txfm_avx2.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-static INLINE void load_buffer_16x16(const int16_t *input, int stride,
-                                     int flipud, int fliplr, __m256i *in) {
-  if (!flipud) {
-    in[0] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride));
-    in[1] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride));
-    in[2] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride));
-    in[3] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride));
-    in[4] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride));
-    in[5] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride));
-    in[6] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride));
-    in[7] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride));
-    in[8] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride));
-    in[9] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride));
-    in[10] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride));
-    in[11] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride));
-    in[12] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride));
-    in[13] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride));
-    in[14] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride));
-    in[15] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride));
-  } else {
-    in[0] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride));
-    in[1] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride));
-    in[2] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride));
-    in[3] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride));
-    in[4] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride));
-    in[5] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride));
-    in[6] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride));
-    in[7] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride));
-    in[8] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride));
-    in[9] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride));
-    in[10] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride));
-    in[11] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride));
-    in[12] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride));
-    in[13] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride));
-    in[14] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride));
-    in[15] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride));
-  }
-
-  if (fliplr) {
-    mm256_reverse_epi16(&in[0]);
-    mm256_reverse_epi16(&in[1]);
-    mm256_reverse_epi16(&in[2]);
-    mm256_reverse_epi16(&in[3]);
-    mm256_reverse_epi16(&in[4]);
-    mm256_reverse_epi16(&in[5]);
-    mm256_reverse_epi16(&in[6]);
-    mm256_reverse_epi16(&in[7]);
-    mm256_reverse_epi16(&in[8]);
-    mm256_reverse_epi16(&in[9]);
-    mm256_reverse_epi16(&in[10]);
-    mm256_reverse_epi16(&in[11]);
-    mm256_reverse_epi16(&in[12]);
-    mm256_reverse_epi16(&in[13]);
-    mm256_reverse_epi16(&in[14]);
-    mm256_reverse_epi16(&in[15]);
-  }
-
-  in[0] = _mm256_slli_epi16(in[0], 2);
-  in[1] = _mm256_slli_epi16(in[1], 2);
-  in[2] = _mm256_slli_epi16(in[2], 2);
-  in[3] = _mm256_slli_epi16(in[3], 2);
-  in[4] = _mm256_slli_epi16(in[4], 2);
-  in[5] = _mm256_slli_epi16(in[5], 2);
-  in[6] = _mm256_slli_epi16(in[6], 2);
-  in[7] = _mm256_slli_epi16(in[7], 2);
-  in[8] = _mm256_slli_epi16(in[8], 2);
-  in[9] = _mm256_slli_epi16(in[9], 2);
-  in[10] = _mm256_slli_epi16(in[10], 2);
-  in[11] = _mm256_slli_epi16(in[11], 2);
-  in[12] = _mm256_slli_epi16(in[12], 2);
-  in[13] = _mm256_slli_epi16(in[13], 2);
-  in[14] = _mm256_slli_epi16(in[14], 2);
-  in[15] = _mm256_slli_epi16(in[15], 2);
-}
-
-static INLINE void write_buffer_16x16(const __m256i *in, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    storeu_output_avx2(&in[i], output + (i << 4));
-  }
-}
-
-static void right_shift_16x16(__m256i *in) {
-  const __m256i one = _mm256_set1_epi16(1);
-  __m256i s0 = _mm256_srai_epi16(in[0], 15);
-  __m256i s1 = _mm256_srai_epi16(in[1], 15);
-  __m256i s2 = _mm256_srai_epi16(in[2], 15);
-  __m256i s3 = _mm256_srai_epi16(in[3], 15);
-  __m256i s4 = _mm256_srai_epi16(in[4], 15);
-  __m256i s5 = _mm256_srai_epi16(in[5], 15);
-  __m256i s6 = _mm256_srai_epi16(in[6], 15);
-  __m256i s7 = _mm256_srai_epi16(in[7], 15);
-  __m256i s8 = _mm256_srai_epi16(in[8], 15);
-  __m256i s9 = _mm256_srai_epi16(in[9], 15);
-  __m256i s10 = _mm256_srai_epi16(in[10], 15);
-  __m256i s11 = _mm256_srai_epi16(in[11], 15);
-  __m256i s12 = _mm256_srai_epi16(in[12], 15);
-  __m256i s13 = _mm256_srai_epi16(in[13], 15);
-  __m256i s14 = _mm256_srai_epi16(in[14], 15);
-  __m256i s15 = _mm256_srai_epi16(in[15], 15);
-
-  in[0] = _mm256_add_epi16(in[0], one);
-  in[1] = _mm256_add_epi16(in[1], one);
-  in[2] = _mm256_add_epi16(in[2], one);
-  in[3] = _mm256_add_epi16(in[3], one);
-  in[4] = _mm256_add_epi16(in[4], one);
-  in[5] = _mm256_add_epi16(in[5], one);
-  in[6] = _mm256_add_epi16(in[6], one);
-  in[7] = _mm256_add_epi16(in[7], one);
-  in[8] = _mm256_add_epi16(in[8], one);
-  in[9] = _mm256_add_epi16(in[9], one);
-  in[10] = _mm256_add_epi16(in[10], one);
-  in[11] = _mm256_add_epi16(in[11], one);
-  in[12] = _mm256_add_epi16(in[12], one);
-  in[13] = _mm256_add_epi16(in[13], one);
-  in[14] = _mm256_add_epi16(in[14], one);
-  in[15] = _mm256_add_epi16(in[15], one);
-
-  in[0] = _mm256_sub_epi16(in[0], s0);
-  in[1] = _mm256_sub_epi16(in[1], s1);
-  in[2] = _mm256_sub_epi16(in[2], s2);
-  in[3] = _mm256_sub_epi16(in[3], s3);
-  in[4] = _mm256_sub_epi16(in[4], s4);
-  in[5] = _mm256_sub_epi16(in[5], s5);
-  in[6] = _mm256_sub_epi16(in[6], s6);
-  in[7] = _mm256_sub_epi16(in[7], s7);
-  in[8] = _mm256_sub_epi16(in[8], s8);
-  in[9] = _mm256_sub_epi16(in[9], s9);
-  in[10] = _mm256_sub_epi16(in[10], s10);
-  in[11] = _mm256_sub_epi16(in[11], s11);
-  in[12] = _mm256_sub_epi16(in[12], s12);
-  in[13] = _mm256_sub_epi16(in[13], s13);
-  in[14] = _mm256_sub_epi16(in[14], s14);
-  in[15] = _mm256_sub_epi16(in[15], s15);
-
-  in[0] = _mm256_srai_epi16(in[0], 2);
-  in[1] = _mm256_srai_epi16(in[1], 2);
-  in[2] = _mm256_srai_epi16(in[2], 2);
-  in[3] = _mm256_srai_epi16(in[3], 2);
-  in[4] = _mm256_srai_epi16(in[4], 2);
-  in[5] = _mm256_srai_epi16(in[5], 2);
-  in[6] = _mm256_srai_epi16(in[6], 2);
-  in[7] = _mm256_srai_epi16(in[7], 2);
-  in[8] = _mm256_srai_epi16(in[8], 2);
-  in[9] = _mm256_srai_epi16(in[9], 2);
-  in[10] = _mm256_srai_epi16(in[10], 2);
-  in[11] = _mm256_srai_epi16(in[11], 2);
-  in[12] = _mm256_srai_epi16(in[12], 2);
-  in[13] = _mm256_srai_epi16(in[13], 2);
-  in[14] = _mm256_srai_epi16(in[14], 2);
-  in[15] = _mm256_srai_epi16(in[15], 2);
-}
-
-static void fdct16_avx2(__m256i *in) {
-  // sequence: cospi_L_H = pairs(L, H) and L first
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
-  const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
-  const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-
-  const __m256i cospi_p30_p02 = pair256_set_epi16(cospi_30_64, cospi_2_64);
-  const __m256i cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
-
-  const __m256i cospi_p14_p18 = pair256_set_epi16(cospi_14_64, cospi_18_64);
-  const __m256i cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
-
-  const __m256i cospi_p22_p10 = pair256_set_epi16(cospi_22_64, cospi_10_64);
-  const __m256i cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
-
-  const __m256i cospi_p06_p26 = pair256_set_epi16(cospi_6_64, cospi_26_64);
-  const __m256i cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
-
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m256i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m256i t0, t1, t2, t3, t4, t5, t6, t7;
-  __m256i v0, v1, v2, v3;
-  __m256i x0, x1;
-
-  // 0, 4, 8, 12
-  u0 = _mm256_add_epi16(in[0], in[15]);
-  u1 = _mm256_add_epi16(in[1], in[14]);
-  u2 = _mm256_add_epi16(in[2], in[13]);
-  u3 = _mm256_add_epi16(in[3], in[12]);
-  u4 = _mm256_add_epi16(in[4], in[11]);
-  u5 = _mm256_add_epi16(in[5], in[10]);
-  u6 = _mm256_add_epi16(in[6], in[9]);
-  u7 = _mm256_add_epi16(in[7], in[8]);
-
-  s0 = _mm256_add_epi16(u0, u7);
-  s1 = _mm256_add_epi16(u1, u6);
-  s2 = _mm256_add_epi16(u2, u5);
-  s3 = _mm256_add_epi16(u3, u4);
-
-  // 0, 8
-  v0 = _mm256_add_epi16(s0, s3);
-  v1 = _mm256_add_epi16(s1, s2);
-
-  x0 = _mm256_unpacklo_epi16(v0, v1);
-  x1 = _mm256_unpackhi_epi16(v0, v1);
-
-  t0 = butter_fly(&x0, &x1, &cospi_p16_p16);
-  t1 = butter_fly(&x0, &x1, &cospi_p16_m16);
-
-  // 4, 12
-  v0 = _mm256_sub_epi16(s1, s2);
-  v1 = _mm256_sub_epi16(s0, s3);
-
-  x0 = _mm256_unpacklo_epi16(v0, v1);
-  x1 = _mm256_unpackhi_epi16(v0, v1);
-
-  t2 = butter_fly(&x0, &x1, &cospi_p24_p08);
-  t3 = butter_fly(&x0, &x1, &cospi_m08_p24);
-
-  // 2, 6, 10, 14
-  s0 = _mm256_sub_epi16(u3, u4);
-  s1 = _mm256_sub_epi16(u2, u5);
-  s2 = _mm256_sub_epi16(u1, u6);
-  s3 = _mm256_sub_epi16(u0, u7);
-
-  v0 = s0;  // output[4]
-  v3 = s3;  // output[7]
-
-  x0 = _mm256_unpacklo_epi16(s2, s1);
-  x1 = _mm256_unpackhi_epi16(s2, s1);
-
-  v2 = butter_fly(&x0, &x1, &cospi_p16_p16);  // output[5]
-  v1 = butter_fly(&x0, &x1, &cospi_p16_m16);  // output[6]
-
-  s0 = _mm256_add_epi16(v0, v1);  // step[4]
-  s1 = _mm256_sub_epi16(v0, v1);  // step[5]
-  s2 = _mm256_sub_epi16(v3, v2);  // step[6]
-  s3 = _mm256_add_epi16(v3, v2);  // step[7]
-
-  // 2, 14
-  x0 = _mm256_unpacklo_epi16(s0, s3);
-  x1 = _mm256_unpackhi_epi16(s0, s3);
-
-  t4 = butter_fly(&x0, &x1, &cospi_p28_p04);
-  t5 = butter_fly(&x0, &x1, &cospi_m04_p28);
-
-  // 10, 6
-  x0 = _mm256_unpacklo_epi16(s1, s2);
-  x1 = _mm256_unpackhi_epi16(s1, s2);
-  t6 = butter_fly(&x0, &x1, &cospi_p12_p20);
-  t7 = butter_fly(&x0, &x1, &cospi_m20_p12);
-
-  // 1, 3, 5, 7, 9, 11, 13, 15
-  s0 = _mm256_sub_epi16(in[7], in[8]);  // step[8]
-  s1 = _mm256_sub_epi16(in[6], in[9]);  // step[9]
-  u2 = _mm256_sub_epi16(in[5], in[10]);
-  u3 = _mm256_sub_epi16(in[4], in[11]);
-  u4 = _mm256_sub_epi16(in[3], in[12]);
-  u5 = _mm256_sub_epi16(in[2], in[13]);
-  s6 = _mm256_sub_epi16(in[1], in[14]);  // step[14]
-  s7 = _mm256_sub_epi16(in[0], in[15]);  // step[15]
-
-  in[0] = t0;
-  in[8] = t1;
-  in[4] = t2;
-  in[12] = t3;
-  in[2] = t4;
-  in[14] = t5;
-  in[10] = t6;
-  in[6] = t7;
-
-  x0 = _mm256_unpacklo_epi16(u5, u2);
-  x1 = _mm256_unpackhi_epi16(u5, u2);
-
-  s2 = butter_fly(&x0, &x1, &cospi_p16_p16);  // step[13]
-  s5 = butter_fly(&x0, &x1, &cospi_p16_m16);  // step[10]
-
-  x0 = _mm256_unpacklo_epi16(u4, u3);
-  x1 = _mm256_unpackhi_epi16(u4, u3);
-
-  s3 = butter_fly(&x0, &x1, &cospi_p16_p16);  // step[12]
-  s4 = butter_fly(&x0, &x1, &cospi_p16_m16);  // step[11]
-
-  u0 = _mm256_add_epi16(s0, s4);  // output[8]
-  u1 = _mm256_add_epi16(s1, s5);
-  u2 = _mm256_sub_epi16(s1, s5);
-  u3 = _mm256_sub_epi16(s0, s4);
-  u4 = _mm256_sub_epi16(s7, s3);
-  u5 = _mm256_sub_epi16(s6, s2);
-  u6 = _mm256_add_epi16(s6, s2);
-  u7 = _mm256_add_epi16(s7, s3);
-
-  // stage 4
-  s0 = u0;
-  s3 = u3;
-  s4 = u4;
-  s7 = u7;
-
-  x0 = _mm256_unpacklo_epi16(u1, u6);
-  x1 = _mm256_unpackhi_epi16(u1, u6);
-
-  s1 = butter_fly(&x0, &x1, &cospi_m08_p24);
-  s6 = butter_fly(&x0, &x1, &cospi_p24_p08);
-
-  x0 = _mm256_unpacklo_epi16(u2, u5);
-  x1 = _mm256_unpackhi_epi16(u2, u5);
-
-  s2 = butter_fly(&x0, &x1, &cospi_m24_m08);
-  s5 = butter_fly(&x0, &x1, &cospi_m08_p24);
-
-  // stage 5
-  u0 = _mm256_add_epi16(s0, s1);
-  u1 = _mm256_sub_epi16(s0, s1);
-  u2 = _mm256_sub_epi16(s3, s2);
-  u3 = _mm256_add_epi16(s3, s2);
-  u4 = _mm256_add_epi16(s4, s5);
-  u5 = _mm256_sub_epi16(s4, s5);
-  u6 = _mm256_sub_epi16(s7, s6);
-  u7 = _mm256_add_epi16(s7, s6);
-
-  // stage 6
-  x0 = _mm256_unpacklo_epi16(u0, u7);
-  x1 = _mm256_unpackhi_epi16(u0, u7);
-  in[1] = butter_fly(&x0, &x1, &cospi_p30_p02);
-  in[15] = butter_fly(&x0, &x1, &cospi_m02_p30);
-
-  x0 = _mm256_unpacklo_epi16(u1, u6);
-  x1 = _mm256_unpackhi_epi16(u1, u6);
-  in[9] = butter_fly(&x0, &x1, &cospi_p14_p18);
-  in[7] = butter_fly(&x0, &x1, &cospi_m18_p14);
-
-  x0 = _mm256_unpacklo_epi16(u2, u5);
-  x1 = _mm256_unpackhi_epi16(u2, u5);
-  in[5] = butter_fly(&x0, &x1, &cospi_p22_p10);
-  in[11] = butter_fly(&x0, &x1, &cospi_m10_p22);
-
-  x0 = _mm256_unpacklo_epi16(u3, u4);
-  x1 = _mm256_unpackhi_epi16(u3, u4);
-  in[13] = butter_fly(&x0, &x1, &cospi_p06_p26);
-  in[3] = butter_fly(&x0, &x1, &cospi_m26_p06);
-}
-
-void fadst16_avx2(__m256i *in) {
-  const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
-  const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
-  const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
-  const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
-  const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
-  const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
-  const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
-  const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
-  const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
-  const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
-  const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
-  const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
-  const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
-  const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  __m256i s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-  __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-  __m256i y0, y1;
-
-  // stage 1, s takes low 256 bits; x takes high 256 bits
-  y0 = _mm256_unpacklo_epi16(in[15], in[0]);
-  y1 = _mm256_unpackhi_epi16(in[15], in[0]);
-  s0 = _mm256_madd_epi16(y0, cospi_p01_p31);
-  x0 = _mm256_madd_epi16(y1, cospi_p01_p31);
-  s1 = _mm256_madd_epi16(y0, cospi_p31_m01);
-  x1 = _mm256_madd_epi16(y1, cospi_p31_m01);
-
-  y0 = _mm256_unpacklo_epi16(in[13], in[2]);
-  y1 = _mm256_unpackhi_epi16(in[13], in[2]);
-  s2 = _mm256_madd_epi16(y0, cospi_p05_p27);
-  x2 = _mm256_madd_epi16(y1, cospi_p05_p27);
-  s3 = _mm256_madd_epi16(y0, cospi_p27_m05);
-  x3 = _mm256_madd_epi16(y1, cospi_p27_m05);
-
-  y0 = _mm256_unpacklo_epi16(in[11], in[4]);
-  y1 = _mm256_unpackhi_epi16(in[11], in[4]);
-  s4 = _mm256_madd_epi16(y0, cospi_p09_p23);
-  x4 = _mm256_madd_epi16(y1, cospi_p09_p23);
-  s5 = _mm256_madd_epi16(y0, cospi_p23_m09);
-  x5 = _mm256_madd_epi16(y1, cospi_p23_m09);
-
-  y0 = _mm256_unpacklo_epi16(in[9], in[6]);
-  y1 = _mm256_unpackhi_epi16(in[9], in[6]);
-  s6 = _mm256_madd_epi16(y0, cospi_p13_p19);
-  x6 = _mm256_madd_epi16(y1, cospi_p13_p19);
-  s7 = _mm256_madd_epi16(y0, cospi_p19_m13);
-  x7 = _mm256_madd_epi16(y1, cospi_p19_m13);
-
-  y0 = _mm256_unpacklo_epi16(in[7], in[8]);
-  y1 = _mm256_unpackhi_epi16(in[7], in[8]);
-  s8 = _mm256_madd_epi16(y0, cospi_p17_p15);
-  x8 = _mm256_madd_epi16(y1, cospi_p17_p15);
-  s9 = _mm256_madd_epi16(y0, cospi_p15_m17);
-  x9 = _mm256_madd_epi16(y1, cospi_p15_m17);
-
-  y0 = _mm256_unpacklo_epi16(in[5], in[10]);
-  y1 = _mm256_unpackhi_epi16(in[5], in[10]);
-  s10 = _mm256_madd_epi16(y0, cospi_p21_p11);
-  x10 = _mm256_madd_epi16(y1, cospi_p21_p11);
-  s11 = _mm256_madd_epi16(y0, cospi_p11_m21);
-  x11 = _mm256_madd_epi16(y1, cospi_p11_m21);
-
-  y0 = _mm256_unpacklo_epi16(in[3], in[12]);
-  y1 = _mm256_unpackhi_epi16(in[3], in[12]);
-  s12 = _mm256_madd_epi16(y0, cospi_p25_p07);
-  x12 = _mm256_madd_epi16(y1, cospi_p25_p07);
-  s13 = _mm256_madd_epi16(y0, cospi_p07_m25);
-  x13 = _mm256_madd_epi16(y1, cospi_p07_m25);
-
-  y0 = _mm256_unpacklo_epi16(in[1], in[14]);
-  y1 = _mm256_unpackhi_epi16(in[1], in[14]);
-  s14 = _mm256_madd_epi16(y0, cospi_p29_p03);
-  x14 = _mm256_madd_epi16(y1, cospi_p29_p03);
-  s15 = _mm256_madd_epi16(y0, cospi_p03_m29);
-  x15 = _mm256_madd_epi16(y1, cospi_p03_m29);
-
-  // u takes low 256 bits; v takes high 256 bits
-  u0 = _mm256_add_epi32(s0, s8);
-  u1 = _mm256_add_epi32(s1, s9);
-  u2 = _mm256_add_epi32(s2, s10);
-  u3 = _mm256_add_epi32(s3, s11);
-  u4 = _mm256_add_epi32(s4, s12);
-  u5 = _mm256_add_epi32(s5, s13);
-  u6 = _mm256_add_epi32(s6, s14);
-  u7 = _mm256_add_epi32(s7, s15);
-
-  u8 = _mm256_sub_epi32(s0, s8);
-  u9 = _mm256_sub_epi32(s1, s9);
-  u10 = _mm256_sub_epi32(s2, s10);
-  u11 = _mm256_sub_epi32(s3, s11);
-  u12 = _mm256_sub_epi32(s4, s12);
-  u13 = _mm256_sub_epi32(s5, s13);
-  u14 = _mm256_sub_epi32(s6, s14);
-  u15 = _mm256_sub_epi32(s7, s15);
-
-  v0 = _mm256_add_epi32(x0, x8);
-  v1 = _mm256_add_epi32(x1, x9);
-  v2 = _mm256_add_epi32(x2, x10);
-  v3 = _mm256_add_epi32(x3, x11);
-  v4 = _mm256_add_epi32(x4, x12);
-  v5 = _mm256_add_epi32(x5, x13);
-  v6 = _mm256_add_epi32(x6, x14);
-  v7 = _mm256_add_epi32(x7, x15);
-
-  v8 = _mm256_sub_epi32(x0, x8);
-  v9 = _mm256_sub_epi32(x1, x9);
-  v10 = _mm256_sub_epi32(x2, x10);
-  v11 = _mm256_sub_epi32(x3, x11);
-  v12 = _mm256_sub_epi32(x4, x12);
-  v13 = _mm256_sub_epi32(x5, x13);
-  v14 = _mm256_sub_epi32(x6, x14);
-  v15 = _mm256_sub_epi32(x7, x15);
-
-  // low 256 bits rounding
-  u8 = _mm256_add_epi32(u8, dct_rounding);
-  u9 = _mm256_add_epi32(u9, dct_rounding);
-  u10 = _mm256_add_epi32(u10, dct_rounding);
-  u11 = _mm256_add_epi32(u11, dct_rounding);
-  u12 = _mm256_add_epi32(u12, dct_rounding);
-  u13 = _mm256_add_epi32(u13, dct_rounding);
-  u14 = _mm256_add_epi32(u14, dct_rounding);
-  u15 = _mm256_add_epi32(u15, dct_rounding);
-
-  u8 = _mm256_srai_epi32(u8, DCT_CONST_BITS);
-  u9 = _mm256_srai_epi32(u9, DCT_CONST_BITS);
-  u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
-  u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
-  u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
-  u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
-  u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
-  u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
-
-  // high 256 bits rounding
-  v8 = _mm256_add_epi32(v8, dct_rounding);
-  v9 = _mm256_add_epi32(v9, dct_rounding);
-  v10 = _mm256_add_epi32(v10, dct_rounding);
-  v11 = _mm256_add_epi32(v11, dct_rounding);
-  v12 = _mm256_add_epi32(v12, dct_rounding);
-  v13 = _mm256_add_epi32(v13, dct_rounding);
-  v14 = _mm256_add_epi32(v14, dct_rounding);
-  v15 = _mm256_add_epi32(v15, dct_rounding);
-
-  v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
-  v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
-  v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
-  v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
-  v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
-  v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
-  v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
-  v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
-
-  // Saturation pack 32-bit to 16-bit
-  x8 = _mm256_packs_epi32(u8, v8);
-  x9 = _mm256_packs_epi32(u9, v9);
-  x10 = _mm256_packs_epi32(u10, v10);
-  x11 = _mm256_packs_epi32(u11, v11);
-  x12 = _mm256_packs_epi32(u12, v12);
-  x13 = _mm256_packs_epi32(u13, v13);
-  x14 = _mm256_packs_epi32(u14, v14);
-  x15 = _mm256_packs_epi32(u15, v15);
-
-  // stage 2
-  y0 = _mm256_unpacklo_epi16(x8, x9);
-  y1 = _mm256_unpackhi_epi16(x8, x9);
-  s8 = _mm256_madd_epi16(y0, cospi_p04_p28);
-  x8 = _mm256_madd_epi16(y1, cospi_p04_p28);
-  s9 = _mm256_madd_epi16(y0, cospi_p28_m04);
-  x9 = _mm256_madd_epi16(y1, cospi_p28_m04);
-
-  y0 = _mm256_unpacklo_epi16(x10, x11);
-  y1 = _mm256_unpackhi_epi16(x10, x11);
-  s10 = _mm256_madd_epi16(y0, cospi_p20_p12);
-  x10 = _mm256_madd_epi16(y1, cospi_p20_p12);
-  s11 = _mm256_madd_epi16(y0, cospi_p12_m20);
-  x11 = _mm256_madd_epi16(y1, cospi_p12_m20);
-
-  y0 = _mm256_unpacklo_epi16(x12, x13);
-  y1 = _mm256_unpackhi_epi16(x12, x13);
-  s12 = _mm256_madd_epi16(y0, cospi_m28_p04);
-  x12 = _mm256_madd_epi16(y1, cospi_m28_p04);
-  s13 = _mm256_madd_epi16(y0, cospi_p04_p28);
-  x13 = _mm256_madd_epi16(y1, cospi_p04_p28);
-
-  y0 = _mm256_unpacklo_epi16(x14, x15);
-  y1 = _mm256_unpackhi_epi16(x14, x15);
-  s14 = _mm256_madd_epi16(y0, cospi_m12_p20);
-  x14 = _mm256_madd_epi16(y1, cospi_m12_p20);
-  s15 = _mm256_madd_epi16(y0, cospi_p20_p12);
-  x15 = _mm256_madd_epi16(y1, cospi_p20_p12);
-
-  x0 = _mm256_add_epi32(u0, u4);
-  s0 = _mm256_add_epi32(v0, v4);
-  x1 = _mm256_add_epi32(u1, u5);
-  s1 = _mm256_add_epi32(v1, v5);
-  x2 = _mm256_add_epi32(u2, u6);
-  s2 = _mm256_add_epi32(v2, v6);
-  x3 = _mm256_add_epi32(u3, u7);
-  s3 = _mm256_add_epi32(v3, v7);
-
-  v8 = _mm256_sub_epi32(u0, u4);
-  v9 = _mm256_sub_epi32(v0, v4);
-  v10 = _mm256_sub_epi32(u1, u5);
-  v11 = _mm256_sub_epi32(v1, v5);
-  v12 = _mm256_sub_epi32(u2, u6);
-  v13 = _mm256_sub_epi32(v2, v6);
-  v14 = _mm256_sub_epi32(u3, u7);
-  v15 = _mm256_sub_epi32(v3, v7);
-
-  v8 = _mm256_add_epi32(v8, dct_rounding);
-  v9 = _mm256_add_epi32(v9, dct_rounding);
-  v10 = _mm256_add_epi32(v10, dct_rounding);
-  v11 = _mm256_add_epi32(v11, dct_rounding);
-  v12 = _mm256_add_epi32(v12, dct_rounding);
-  v13 = _mm256_add_epi32(v13, dct_rounding);
-  v14 = _mm256_add_epi32(v14, dct_rounding);
-  v15 = _mm256_add_epi32(v15, dct_rounding);
-
-  v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
-  v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
-  v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
-  v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
-  v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
-  v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
-  v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
-  v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
-
-  x4 = _mm256_packs_epi32(v8, v9);
-  x5 = _mm256_packs_epi32(v10, v11);
-  x6 = _mm256_packs_epi32(v12, v13);
-  x7 = _mm256_packs_epi32(v14, v15);
-
-  u8 = _mm256_add_epi32(s8, s12);
-  u9 = _mm256_add_epi32(s9, s13);
-  u10 = _mm256_add_epi32(s10, s14);
-  u11 = _mm256_add_epi32(s11, s15);
-  u12 = _mm256_sub_epi32(s8, s12);
-  u13 = _mm256_sub_epi32(s9, s13);
-  u14 = _mm256_sub_epi32(s10, s14);
-  u15 = _mm256_sub_epi32(s11, s15);
-
-  v8 = _mm256_add_epi32(x8, x12);
-  v9 = _mm256_add_epi32(x9, x13);
-  v10 = _mm256_add_epi32(x10, x14);
-  v11 = _mm256_add_epi32(x11, x15);
-  v12 = _mm256_sub_epi32(x8, x12);
-  v13 = _mm256_sub_epi32(x9, x13);
-  v14 = _mm256_sub_epi32(x10, x14);
-  v15 = _mm256_sub_epi32(x11, x15);
-
-  u12 = _mm256_add_epi32(u12, dct_rounding);
-  u13 = _mm256_add_epi32(u13, dct_rounding);
-  u14 = _mm256_add_epi32(u14, dct_rounding);
-  u15 = _mm256_add_epi32(u15, dct_rounding);
-
-  u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
-  u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
-  u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
-  u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
-
-  v12 = _mm256_add_epi32(v12, dct_rounding);
-  v13 = _mm256_add_epi32(v13, dct_rounding);
-  v14 = _mm256_add_epi32(v14, dct_rounding);
-  v15 = _mm256_add_epi32(v15, dct_rounding);
-
-  v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
-  v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
-  v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
-  v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
-
-  x12 = _mm256_packs_epi32(u12, v12);
-  x13 = _mm256_packs_epi32(u13, v13);
-  x14 = _mm256_packs_epi32(u14, v14);
-  x15 = _mm256_packs_epi32(u15, v15);
-
-  // stage 3
-  y0 = _mm256_unpacklo_epi16(x4, x5);
-  y1 = _mm256_unpackhi_epi16(x4, x5);
-  s4 = _mm256_madd_epi16(y0, cospi_p08_p24);
-  x4 = _mm256_madd_epi16(y1, cospi_p08_p24);
-  s5 = _mm256_madd_epi16(y0, cospi_p24_m08);
-  x5 = _mm256_madd_epi16(y1, cospi_p24_m08);
-
-  y0 = _mm256_unpacklo_epi16(x6, x7);
-  y1 = _mm256_unpackhi_epi16(x6, x7);
-  s6 = _mm256_madd_epi16(y0, cospi_m24_p08);
-  x6 = _mm256_madd_epi16(y1, cospi_m24_p08);
-  s7 = _mm256_madd_epi16(y0, cospi_p08_p24);
-  x7 = _mm256_madd_epi16(y1, cospi_p08_p24);
-
-  y0 = _mm256_unpacklo_epi16(x12, x13);
-  y1 = _mm256_unpackhi_epi16(x12, x13);
-  s12 = _mm256_madd_epi16(y0, cospi_p08_p24);
-  x12 = _mm256_madd_epi16(y1, cospi_p08_p24);
-  s13 = _mm256_madd_epi16(y0, cospi_p24_m08);
-  x13 = _mm256_madd_epi16(y1, cospi_p24_m08);
-
-  y0 = _mm256_unpacklo_epi16(x14, x15);
-  y1 = _mm256_unpackhi_epi16(x14, x15);
-  s14 = _mm256_madd_epi16(y0, cospi_m24_p08);
-  x14 = _mm256_madd_epi16(y1, cospi_m24_p08);
-  s15 = _mm256_madd_epi16(y0, cospi_p08_p24);
-  x15 = _mm256_madd_epi16(y1, cospi_p08_p24);
-
-  u0 = _mm256_add_epi32(x0, x2);
-  v0 = _mm256_add_epi32(s0, s2);
-  u1 = _mm256_add_epi32(x1, x3);
-  v1 = _mm256_add_epi32(s1, s3);
-  u2 = _mm256_sub_epi32(x0, x2);
-  v2 = _mm256_sub_epi32(s0, s2);
-  u3 = _mm256_sub_epi32(x1, x3);
-  v3 = _mm256_sub_epi32(s1, s3);
-
-  u0 = _mm256_add_epi32(u0, dct_rounding);
-  v0 = _mm256_add_epi32(v0, dct_rounding);
-  u1 = _mm256_add_epi32(u1, dct_rounding);
-  v1 = _mm256_add_epi32(v1, dct_rounding);
-  u2 = _mm256_add_epi32(u2, dct_rounding);
-  v2 = _mm256_add_epi32(v2, dct_rounding);
-  u3 = _mm256_add_epi32(u3, dct_rounding);
-  v3 = _mm256_add_epi32(v3, dct_rounding);
-
-  u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
-  v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
-  v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
-  v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
-  v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
-
-  in[0] = _mm256_packs_epi32(u0, v0);
-  x1 = _mm256_packs_epi32(u1, v1);
-  x2 = _mm256_packs_epi32(u2, v2);
-  x3 = _mm256_packs_epi32(u3, v3);
-
-  // Rounding on s4 + s6, s5 + s7, s4 - s6, s5 - s7
-  u4 = _mm256_add_epi32(s4, s6);
-  u5 = _mm256_add_epi32(s5, s7);
-  u6 = _mm256_sub_epi32(s4, s6);
-  u7 = _mm256_sub_epi32(s5, s7);
-
-  v4 = _mm256_add_epi32(x4, x6);
-  v5 = _mm256_add_epi32(x5, x7);
-  v6 = _mm256_sub_epi32(x4, x6);
-  v7 = _mm256_sub_epi32(x5, x7);
-
-  u4 = _mm256_add_epi32(u4, dct_rounding);
-  u5 = _mm256_add_epi32(u5, dct_rounding);
-  u6 = _mm256_add_epi32(u6, dct_rounding);
-  u7 = _mm256_add_epi32(u7, dct_rounding);
-
-  u4 = _mm256_srai_epi32(u4, DCT_CONST_BITS);
-  u5 = _mm256_srai_epi32(u5, DCT_CONST_BITS);
-  u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
-  u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
-
-  v4 = _mm256_add_epi32(v4, dct_rounding);
-  v5 = _mm256_add_epi32(v5, dct_rounding);
-  v6 = _mm256_add_epi32(v6, dct_rounding);
-  v7 = _mm256_add_epi32(v7, dct_rounding);
-
-  v4 = _mm256_srai_epi32(v4, DCT_CONST_BITS);
-  v5 = _mm256_srai_epi32(v5, DCT_CONST_BITS);
-  v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
-  v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
-
-  x4 = _mm256_packs_epi32(u4, v4);
-  in[12] = _mm256_packs_epi32(u5, v5);
-  x6 = _mm256_packs_epi32(u6, v6);
-  x7 = _mm256_packs_epi32(u7, v7);
-
-  u0 = _mm256_add_epi32(u8, u10);
-  v0 = _mm256_add_epi32(v8, v10);
-  u1 = _mm256_add_epi32(u9, u11);
-  v1 = _mm256_add_epi32(v9, v11);
-  u2 = _mm256_sub_epi32(u8, u10);
-  v2 = _mm256_sub_epi32(v8, v10);
-  u3 = _mm256_sub_epi32(u9, u11);
-  v3 = _mm256_sub_epi32(v9, v11);
-
-  u0 = _mm256_add_epi32(u0, dct_rounding);
-  v0 = _mm256_add_epi32(v0, dct_rounding);
-  u1 = _mm256_add_epi32(u1, dct_rounding);
-  v1 = _mm256_add_epi32(v1, dct_rounding);
-  u2 = _mm256_add_epi32(u2, dct_rounding);
-  v2 = _mm256_add_epi32(v2, dct_rounding);
-  u3 = _mm256_add_epi32(u3, dct_rounding);
-  v3 = _mm256_add_epi32(v3, dct_rounding);
-
-  u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
-  v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
-  v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
-  v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
-  v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
-
-  x8 = _mm256_packs_epi32(u0, v0);
-  in[14] = _mm256_packs_epi32(u1, v1);
-  x10 = _mm256_packs_epi32(u2, v2);
-  x11 = _mm256_packs_epi32(u3, v3);
-
-  // Rounding on s12 + s14, s13 + s15, s12 - s14, s13 - s15
-  u12 = _mm256_add_epi32(s12, s14);
-  u13 = _mm256_add_epi32(s13, s15);
-  u14 = _mm256_sub_epi32(s12, s14);
-  u15 = _mm256_sub_epi32(s13, s15);
-
-  v12 = _mm256_add_epi32(x12, x14);
-  v13 = _mm256_add_epi32(x13, x15);
-  v14 = _mm256_sub_epi32(x12, x14);
-  v15 = _mm256_sub_epi32(x13, x15);
-
-  u12 = _mm256_add_epi32(u12, dct_rounding);
-  u13 = _mm256_add_epi32(u13, dct_rounding);
-  u14 = _mm256_add_epi32(u14, dct_rounding);
-  u15 = _mm256_add_epi32(u15, dct_rounding);
-
-  u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
-  u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
-  u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
-  u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
-
-  v12 = _mm256_add_epi32(v12, dct_rounding);
-  v13 = _mm256_add_epi32(v13, dct_rounding);
-  v14 = _mm256_add_epi32(v14, dct_rounding);
-  v15 = _mm256_add_epi32(v15, dct_rounding);
-
-  v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
-  v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
-  v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
-  v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
-
-  x12 = _mm256_packs_epi32(u12, v12);
-  x13 = _mm256_packs_epi32(u13, v13);
-  x14 = _mm256_packs_epi32(u14, v14);
-  x15 = _mm256_packs_epi32(u15, v15);
-  in[2] = x12;
-
-  // stage 4
-  y0 = _mm256_unpacklo_epi16(x2, x3);
-  y1 = _mm256_unpackhi_epi16(x2, x3);
-  s2 = _mm256_madd_epi16(y0, cospi_m16_m16);
-  x2 = _mm256_madd_epi16(y1, cospi_m16_m16);
-  s3 = _mm256_madd_epi16(y0, cospi_p16_m16);
-  x3 = _mm256_madd_epi16(y1, cospi_p16_m16);
-
-  y0 = _mm256_unpacklo_epi16(x6, x7);
-  y1 = _mm256_unpackhi_epi16(x6, x7);
-  s6 = _mm256_madd_epi16(y0, cospi_p16_p16);
-  x6 = _mm256_madd_epi16(y1, cospi_p16_p16);
-  s7 = _mm256_madd_epi16(y0, cospi_m16_p16);
-  x7 = _mm256_madd_epi16(y1, cospi_m16_p16);
-
-  y0 = _mm256_unpacklo_epi16(x10, x11);
-  y1 = _mm256_unpackhi_epi16(x10, x11);
-  s10 = _mm256_madd_epi16(y0, cospi_p16_p16);
-  x10 = _mm256_madd_epi16(y1, cospi_p16_p16);
-  s11 = _mm256_madd_epi16(y0, cospi_m16_p16);
-  x11 = _mm256_madd_epi16(y1, cospi_m16_p16);
-
-  y0 = _mm256_unpacklo_epi16(x14, x15);
-  y1 = _mm256_unpackhi_epi16(x14, x15);
-  s14 = _mm256_madd_epi16(y0, cospi_m16_m16);
-  x14 = _mm256_madd_epi16(y1, cospi_m16_m16);
-  s15 = _mm256_madd_epi16(y0, cospi_p16_m16);
-  x15 = _mm256_madd_epi16(y1, cospi_p16_m16);
-
-  // Rounding
-  u2 = _mm256_add_epi32(s2, dct_rounding);
-  u3 = _mm256_add_epi32(s3, dct_rounding);
-  u6 = _mm256_add_epi32(s6, dct_rounding);
-  u7 = _mm256_add_epi32(s7, dct_rounding);
-
-  u10 = _mm256_add_epi32(s10, dct_rounding);
-  u11 = _mm256_add_epi32(s11, dct_rounding);
-  u14 = _mm256_add_epi32(s14, dct_rounding);
-  u15 = _mm256_add_epi32(s15, dct_rounding);
-
-  u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
-  u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
-  u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
-  u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
-
-  u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
-  u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
-  u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
-  u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
-
-  v2 = _mm256_add_epi32(x2, dct_rounding);
-  v3 = _mm256_add_epi32(x3, dct_rounding);
-  v6 = _mm256_add_epi32(x6, dct_rounding);
-  v7 = _mm256_add_epi32(x7, dct_rounding);
-
-  v10 = _mm256_add_epi32(x10, dct_rounding);
-  v11 = _mm256_add_epi32(x11, dct_rounding);
-  v14 = _mm256_add_epi32(x14, dct_rounding);
-  v15 = _mm256_add_epi32(x15, dct_rounding);
-
-  v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
-  v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
-  v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
-  v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
-
-  v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
-  v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
-  v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
-  v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
-
-  in[7] = _mm256_packs_epi32(u2, v2);
-  in[8] = _mm256_packs_epi32(u3, v3);
-
-  in[4] = _mm256_packs_epi32(u6, v6);
-  in[11] = _mm256_packs_epi32(u7, v7);
-
-  in[6] = _mm256_packs_epi32(u10, v10);
-  in[9] = _mm256_packs_epi32(u11, v11);
-
-  in[5] = _mm256_packs_epi32(u14, v14);
-  in[10] = _mm256_packs_epi32(u15, v15);
-
-  in[1] = _mm256_sub_epi16(zero, x8);
-  in[3] = _mm256_sub_epi16(zero, x4);
-  in[13] = _mm256_sub_epi16(zero, x13);
-  in[15] = _mm256_sub_epi16(zero, x1);
-}
-
-#if CONFIG_EXT_TX
-static void fidtx16_avx2(__m256i *in) {
-  txfm_scaling16_avx2((int16_t)Sqrt2, in);
-}
-#endif
-
-void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
-                       TxfmParam *txfm_param) {
-  __m256i in[16];
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
-#endif
-
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_16x16(input, stride, 0, 0, in);
-      fdct16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fdct16_avx2(in);
-      break;
-    case ADST_DCT:
-      load_buffer_16x16(input, stride, 0, 0, in);
-      fadst16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fdct16_avx2(in);
-      break;
-    case DCT_ADST:
-      load_buffer_16x16(input, stride, 0, 0, in);
-      fdct16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fadst16_avx2(in);
-      break;
-    case ADST_ADST:
-      load_buffer_16x16(input, stride, 0, 0, in);
-      fadst16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fadst16_avx2(in);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      load_buffer_16x16(input, stride, 1, 0, in);
-      fadst16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fdct16_avx2(in);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_16x16(input, stride, 0, 1, in);
-      fdct16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fadst16_avx2(in);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_16x16(input, stride, 1, 1, in);
-      fadst16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fadst16_avx2(in);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_16x16(input, stride, 0, 1, in);
-      fadst16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fadst16_avx2(in);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_16x16(input, stride, 1, 0, in);
-      fadst16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fadst16_avx2(in);
-      break;
-    case IDTX:
-      load_buffer_16x16(input, stride, 0, 0, in);
-      fidtx16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fidtx16_avx2(in);
-      break;
-    case V_DCT:
-      load_buffer_16x16(input, stride, 0, 0, in);
-      fdct16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fidtx16_avx2(in);
-      break;
-    case H_DCT:
-      load_buffer_16x16(input, stride, 0, 0, in);
-      fidtx16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fdct16_avx2(in);
-      break;
-    case V_ADST:
-      load_buffer_16x16(input, stride, 0, 0, in);
-      fadst16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fidtx16_avx2(in);
-      break;
-    case H_ADST:
-      load_buffer_16x16(input, stride, 0, 0, in);
-      fidtx16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fadst16_avx2(in);
-      break;
-    case V_FLIPADST:
-      load_buffer_16x16(input, stride, 1, 0, in);
-      fadst16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fidtx16_avx2(in);
-      break;
-    case H_FLIPADST:
-      load_buffer_16x16(input, stride, 0, 1, in);
-      fidtx16_avx2(in);
-      mm256_transpose_16x16(in, in);
-      right_shift_16x16(in);
-      fadst16_avx2(in);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-  mm256_transpose_16x16(in, in);
-  write_buffer_16x16(in, output);
-  _mm256_zeroupper();
-}
-
-static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
-  int i = 0;
-  __m256i temp;
-  while (i < size) {
-    temp = a0[i];
-    a0[i] = a1[i];
-    a1[i] = temp;
-    i++;
-  }
-}
-
-static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) {
-  mm256_transpose_16x16(in0, in0);
-  mm256_transpose_16x16(&in0[16], &in0[16]);
-  mm256_transpose_16x16(in1, in1);
-  mm256_transpose_16x16(&in1[16], &in1[16]);
-  mm256_vectors_swap(&in0[16], in1, 16);
-}
-
-static void prepare_16x16_even(const __m256i *in, __m256i *even) {
-  even[0] = _mm256_add_epi16(in[0], in[31]);
-  even[1] = _mm256_add_epi16(in[1], in[30]);
-  even[2] = _mm256_add_epi16(in[2], in[29]);
-  even[3] = _mm256_add_epi16(in[3], in[28]);
-  even[4] = _mm256_add_epi16(in[4], in[27]);
-  even[5] = _mm256_add_epi16(in[5], in[26]);
-  even[6] = _mm256_add_epi16(in[6], in[25]);
-  even[7] = _mm256_add_epi16(in[7], in[24]);
-  even[8] = _mm256_add_epi16(in[8], in[23]);
-  even[9] = _mm256_add_epi16(in[9], in[22]);
-  even[10] = _mm256_add_epi16(in[10], in[21]);
-  even[11] = _mm256_add_epi16(in[11], in[20]);
-  even[12] = _mm256_add_epi16(in[12], in[19]);
-  even[13] = _mm256_add_epi16(in[13], in[18]);
-  even[14] = _mm256_add_epi16(in[14], in[17]);
-  even[15] = _mm256_add_epi16(in[15], in[16]);
-}
-
-static void prepare_16x16_odd(const __m256i *in, __m256i *odd) {
-  odd[0] = _mm256_sub_epi16(in[15], in[16]);
-  odd[1] = _mm256_sub_epi16(in[14], in[17]);
-  odd[2] = _mm256_sub_epi16(in[13], in[18]);
-  odd[3] = _mm256_sub_epi16(in[12], in[19]);
-  odd[4] = _mm256_sub_epi16(in[11], in[20]);
-  odd[5] = _mm256_sub_epi16(in[10], in[21]);
-  odd[6] = _mm256_sub_epi16(in[9], in[22]);
-  odd[7] = _mm256_sub_epi16(in[8], in[23]);
-  odd[8] = _mm256_sub_epi16(in[7], in[24]);
-  odd[9] = _mm256_sub_epi16(in[6], in[25]);
-  odd[10] = _mm256_sub_epi16(in[5], in[26]);
-  odd[11] = _mm256_sub_epi16(in[4], in[27]);
-  odd[12] = _mm256_sub_epi16(in[3], in[28]);
-  odd[13] = _mm256_sub_epi16(in[2], in[29]);
-  odd[14] = _mm256_sub_epi16(in[1], in[30]);
-  odd[15] = _mm256_sub_epi16(in[0], in[31]);
-}
-
-static void collect_16col(const __m256i *even, const __m256i *odd,
-                          __m256i *out) {
-  // fdct16_avx2() already maps the output
-  out[0] = even[0];
-  out[2] = even[1];
-  out[4] = even[2];
-  out[6] = even[3];
-  out[8] = even[4];
-  out[10] = even[5];
-  out[12] = even[6];
-  out[14] = even[7];
-  out[16] = even[8];
-  out[18] = even[9];
-  out[20] = even[10];
-  out[22] = even[11];
-  out[24] = even[12];
-  out[26] = even[13];
-  out[28] = even[14];
-  out[30] = even[15];
-
-  out[1] = odd[0];
-  out[17] = odd[1];
-  out[9] = odd[2];
-  out[25] = odd[3];
-  out[5] = odd[4];
-  out[21] = odd[5];
-  out[13] = odd[6];
-  out[29] = odd[7];
-  out[3] = odd[8];
-  out[19] = odd[9];
-  out[11] = odd[10];
-  out[27] = odd[11];
-  out[7] = odd[12];
-  out[23] = odd[13];
-  out[15] = odd[14];
-  out[31] = odd[15];
-}
-
-static void collect_coeffs(const __m256i *first_16col_even,
-                           const __m256i *first_16col_odd,
-                           const __m256i *second_16col_even,
-                           const __m256i *second_16col_odd, __m256i *in0,
-                           __m256i *in1) {
-  collect_16col(first_16col_even, first_16col_odd, in0);
-  collect_16col(second_16col_even, second_16col_odd, in1);
-}
-
-static void fdct16_odd_avx2(__m256i *in) {
-  // sequence: cospi_L_H = pairs(L, H) and L first
-  const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-  const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
-  const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
-  const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64);
-  const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64);
-  const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64);
-  const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64);
-  const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64);
-  const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64);
-  const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64);
-  const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64);
-  const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
-
-  __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-  __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
-  __m256i u0, u1;
-
-  // stage 1 is in prepare_16x16_odd()
-
-  // stage 2
-  y0 = in[0];
-  y1 = in[1];
-  y2 = in[2];
-  y3 = in[3];
-
-  u0 = _mm256_unpacklo_epi16(in[4], in[11]);
-  u1 = _mm256_unpackhi_epi16(in[4], in[11]);
-  y4 = butter_fly(&u0, &u1, &cospi_m16_p16);
-  y11 = butter_fly(&u0, &u1, &cospi_p16_p16);
-
-  u0 = _mm256_unpacklo_epi16(in[5], in[10]);
-  u1 = _mm256_unpackhi_epi16(in[5], in[10]);
-  y5 = butter_fly(&u0, &u1, &cospi_m16_p16);
-  y10 = butter_fly(&u0, &u1, &cospi_p16_p16);
-
-  u0 = _mm256_unpacklo_epi16(in[6], in[9]);
-  u1 = _mm256_unpackhi_epi16(in[6], in[9]);
-  y6 = butter_fly(&u0, &u1, &cospi_m16_p16);
-  y9 = butter_fly(&u0, &u1, &cospi_p16_p16);
-
-  u0 = _mm256_unpacklo_epi16(in[7], in[8]);
-  u1 = _mm256_unpackhi_epi16(in[7], in[8]);
-  y7 = butter_fly(&u0, &u1, &cospi_m16_p16);
-  y8 = butter_fly(&u0, &u1, &cospi_p16_p16);
-
-  y12 = in[12];
-  y13 = in[13];
-  y14 = in[14];
-  y15 = in[15];
-
-  // stage 3
-  x0 = _mm256_add_epi16(y0, y7);
-  x1 = _mm256_add_epi16(y1, y6);
-  x2 = _mm256_add_epi16(y2, y5);
-  x3 = _mm256_add_epi16(y3, y4);
-  x4 = _mm256_sub_epi16(y3, y4);
-  x5 = _mm256_sub_epi16(y2, y5);
-  x6 = _mm256_sub_epi16(y1, y6);
-  x7 = _mm256_sub_epi16(y0, y7);
-  x8 = _mm256_sub_epi16(y15, y8);
-  x9 = _mm256_sub_epi16(y14, y9);
-  x10 = _mm256_sub_epi16(y13, y10);
-  x11 = _mm256_sub_epi16(y12, y11);
-  x12 = _mm256_add_epi16(y12, y11);
-  x13 = _mm256_add_epi16(y13, y10);
-  x14 = _mm256_add_epi16(y14, y9);
-  x15 = _mm256_add_epi16(y15, y8);
-
-  // stage 4
-  y0 = x0;
-  y1 = x1;
-  y6 = x6;
-  y7 = x7;
-  y8 = x8;
-  y9 = x9;
-  y14 = x14;
-  y15 = x15;
-
-  u0 = _mm256_unpacklo_epi16(x2, x13);
-  u1 = _mm256_unpackhi_epi16(x2, x13);
-  y2 = butter_fly(&u0, &u1, &cospi_m08_p24);
-  y13 = butter_fly(&u0, &u1, &cospi_p24_p08);
-
-  u0 = _mm256_unpacklo_epi16(x3, x12);
-  u1 = _mm256_unpackhi_epi16(x3, x12);
-  y3 = butter_fly(&u0, &u1, &cospi_m08_p24);
-  y12 = butter_fly(&u0, &u1, &cospi_p24_p08);
-
-  u0 = _mm256_unpacklo_epi16(x4, x11);
-  u1 = _mm256_unpackhi_epi16(x4, x11);
-  y4 = butter_fly(&u0, &u1, &cospi_m24_m08);
-  y11 = butter_fly(&u0, &u1, &cospi_m08_p24);
-
-  u0 = _mm256_unpacklo_epi16(x5, x10);
-  u1 = _mm256_unpackhi_epi16(x5, x10);
-  y5 = butter_fly(&u0, &u1, &cospi_m24_m08);
-  y10 = butter_fly(&u0, &u1, &cospi_m08_p24);
-
-  // stage 5
-  x0 = _mm256_add_epi16(y0, y3);
-  x1 = _mm256_add_epi16(y1, y2);
-  x2 = _mm256_sub_epi16(y1, y2);
-  x3 = _mm256_sub_epi16(y0, y3);
-  x4 = _mm256_sub_epi16(y7, y4);
-  x5 = _mm256_sub_epi16(y6, y5);
-  x6 = _mm256_add_epi16(y6, y5);
-  x7 = _mm256_add_epi16(y7, y4);
-
-  x8 = _mm256_add_epi16(y8, y11);
-  x9 = _mm256_add_epi16(y9, y10);
-  x10 = _mm256_sub_epi16(y9, y10);
-  x11 = _mm256_sub_epi16(y8, y11);
-  x12 = _mm256_sub_epi16(y15, y12);
-  x13 = _mm256_sub_epi16(y14, y13);
-  x14 = _mm256_add_epi16(y14, y13);
-  x15 = _mm256_add_epi16(y15, y12);
-
-  // stage 6
-  y0 = x0;
-  y3 = x3;
-  y4 = x4;
-  y7 = x7;
-  y8 = x8;
-  y11 = x11;
-  y12 = x12;
-  y15 = x15;
-
-  u0 = _mm256_unpacklo_epi16(x1, x14);
-  u1 = _mm256_unpackhi_epi16(x1, x14);
-  y1 = butter_fly(&u0, &u1, &cospi_m04_p28);
-  y14 = butter_fly(&u0, &u1, &cospi_p28_p04);
-
-  u0 = _mm256_unpacklo_epi16(x2, x13);
-  u1 = _mm256_unpackhi_epi16(x2, x13);
-  y2 = butter_fly(&u0, &u1, &cospi_m28_m04);
-  y13 = butter_fly(&u0, &u1, &cospi_m04_p28);
-
-  u0 = _mm256_unpacklo_epi16(x5, x10);
-  u1 = _mm256_unpackhi_epi16(x5, x10);
-  y5 = butter_fly(&u0, &u1, &cospi_m20_p12);
-  y10 = butter_fly(&u0, &u1, &cospi_p12_p20);
-
-  u0 = _mm256_unpacklo_epi16(x6, x9);
-  u1 = _mm256_unpackhi_epi16(x6, x9);
-  y6 = butter_fly(&u0, &u1, &cospi_m12_m20);
-  y9 = butter_fly(&u0, &u1, &cospi_m20_p12);
-
-  // stage 7
-  x0 = _mm256_add_epi16(y0, y1);
-  x1 = _mm256_sub_epi16(y0, y1);
-  x2 = _mm256_sub_epi16(y3, y2);
-  x3 = _mm256_add_epi16(y3, y2);
-  x4 = _mm256_add_epi16(y4, y5);
-  x5 = _mm256_sub_epi16(y4, y5);
-  x6 = _mm256_sub_epi16(y7, y6);
-  x7 = _mm256_add_epi16(y7, y6);
-
-  x8 = _mm256_add_epi16(y8, y9);
-  x9 = _mm256_sub_epi16(y8, y9);
-  x10 = _mm256_sub_epi16(y11, y10);
-  x11 = _mm256_add_epi16(y11, y10);
-  x12 = _mm256_add_epi16(y12, y13);
-  x13 = _mm256_sub_epi16(y12, y13);
-  x14 = _mm256_sub_epi16(y15, y14);
-  x15 = _mm256_add_epi16(y15, y14);
-
-  // stage 8
-  u0 = _mm256_unpacklo_epi16(x0, x15);
-  u1 = _mm256_unpackhi_epi16(x0, x15);
-  in[0] = butter_fly(&u0, &u1, &cospi_p31_p01);
-  in[15] = butter_fly(&u0, &u1, &cospi_m01_p31);
-
-  u0 = _mm256_unpacklo_epi16(x1, x14);
-  u1 = _mm256_unpackhi_epi16(x1, x14);
-  in[1] = butter_fly(&u0, &u1, &cospi_p15_p17);
-  in[14] = butter_fly(&u0, &u1, &cospi_m17_p15);
-
-  u0 = _mm256_unpacklo_epi16(x2, x13);
-  u1 = _mm256_unpackhi_epi16(x2, x13);
-  in[2] = butter_fly(&u0, &u1, &cospi_p23_p09);
-  in[13] = butter_fly(&u0, &u1, &cospi_m09_p23);
-
-  u0 = _mm256_unpacklo_epi16(x3, x12);
-  u1 = _mm256_unpackhi_epi16(x3, x12);
-  in[3] = butter_fly(&u0, &u1, &cospi_p07_p25);
-  in[12] = butter_fly(&u0, &u1, &cospi_m25_p07);
-
-  u0 = _mm256_unpacklo_epi16(x4, x11);
-  u1 = _mm256_unpackhi_epi16(x4, x11);
-  in[4] = butter_fly(&u0, &u1, &cospi_p27_p05);
-  in[11] = butter_fly(&u0, &u1, &cospi_m05_p27);
-
-  u0 = _mm256_unpacklo_epi16(x5, x10);
-  u1 = _mm256_unpackhi_epi16(x5, x10);
-  in[5] = butter_fly(&u0, &u1, &cospi_p11_p21);
-  in[10] = butter_fly(&u0, &u1, &cospi_m21_p11);
-
-  u0 = _mm256_unpacklo_epi16(x6, x9);
-  u1 = _mm256_unpackhi_epi16(x6, x9);
-  in[6] = butter_fly(&u0, &u1, &cospi_p19_p13);
-  in[9] = butter_fly(&u0, &u1, &cospi_m13_p19);
-
-  u0 = _mm256_unpacklo_epi16(x7, x8);
-  u1 = _mm256_unpackhi_epi16(x7, x8);
-  in[7] = butter_fly(&u0, &u1, &cospi_p03_p29);
-  in[8] = butter_fly(&u0, &u1, &cospi_m29_p03);
-}
-
-static void fdct32_avx2(__m256i *in0, __m256i *in1) {
-  __m256i even0[16], even1[16], odd0[16], odd1[16];
-  prepare_16x16_even(in0, even0);
-  fdct16_avx2(even0);
-
-  prepare_16x16_odd(in0, odd0);
-  fdct16_odd_avx2(odd0);
-
-  prepare_16x16_even(in1, even1);
-  fdct16_avx2(even1);
-
-  prepare_16x16_odd(in1, odd1);
-  fdct16_odd_avx2(odd1);
-
-  collect_coeffs(even0, odd0, even1, odd1, in0, in1);
-
-  mm256_transpose_32x32(in0, in1);
-}
-
-static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
-                                      tran_low_t *output) {
-  int i = 0;
-  const int stride = 32;
-  tran_low_t *coeff = output;
-  while (i < 32) {
-    storeu_output_avx2(&in0[i], coeff);
-    storeu_output_avx2(&in1[i], coeff + 16);
-    coeff += stride;
-    i += 1;
-  }
-}
-
-#if CONFIG_EXT_TX
-static void fhalfright32_16col_avx2(__m256i *in) {
-  int i = 0;
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i sqrt2 = _mm256_set1_epi16((int16_t)Sqrt2);
-  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  __m256i x0, x1;
-
-  while (i < 16) {
-    in[i] = _mm256_slli_epi16(in[i], 2);
-    x0 = _mm256_unpacklo_epi16(in[i + 16], zero);
-    x1 = _mm256_unpackhi_epi16(in[i + 16], zero);
-    x0 = _mm256_madd_epi16(x0, sqrt2);
-    x1 = _mm256_madd_epi16(x1, sqrt2);
-    x0 = _mm256_add_epi32(x0, dct_rounding);
-    x1 = _mm256_add_epi32(x1, dct_rounding);
-    x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS);
-    x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS);
-    in[i + 16] = _mm256_packs_epi32(x0, x1);
-    i += 1;
-  }
-  fdct16_avx2(&in[16]);
-}
-
-static void fhalfright32_avx2(__m256i *in0, __m256i *in1) {
-  fhalfright32_16col_avx2(in0);
-  fhalfright32_16col_avx2(in1);
-  mm256_vectors_swap(in0, &in0[16], 16);
-  mm256_vectors_swap(in1, &in1[16], 16);
-  mm256_transpose_32x32(in0, in1);
-}
-#endif  // CONFIG_EXT_TX
-
-static INLINE void load_buffer_32x32(const int16_t *input, int stride,
-                                     int flipud, int fliplr, __m256i *in0,
-                                     __m256i *in1) {
-  // Load 4 16x16 blocks
-  const int16_t *topL = input;
-  const int16_t *topR = input + 16;
-  const int16_t *botL = input + 16 * stride;
-  const int16_t *botR = input + 16 * stride + 16;
-
-  const int16_t *tmp;
-
-  if (flipud) {
-    // Swap left columns
-    tmp = topL;
-    topL = botL;
-    botL = tmp;
-    // Swap right columns
-    tmp = topR;
-    topR = botR;
-    botR = tmp;
-  }
-
-  if (fliplr) {
-    // Swap top rows
-    tmp = topL;
-    topL = topR;
-    topR = tmp;
-    // Swap bottom rows
-    tmp = botL;
-    botL = botR;
-    botR = tmp;
-  }
-
-  // load first 16 columns
-  load_buffer_16x16(topL, stride, flipud, fliplr, in0);
-  load_buffer_16x16(botL, stride, flipud, fliplr, in0 + 16);
-
-  // load second 16 columns
-  load_buffer_16x16(topR, stride, flipud, fliplr, in1);
-  load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16);
-}
-
-static INLINE void right_shift_32x32_16col(int bit, __m256i *in) {
-  int i = 0;
-  const __m256i rounding = _mm256_set1_epi16((1 << bit) >> 1);
-  __m256i sign;
-  while (i < 32) {
-    sign = _mm256_srai_epi16(in[i], 15);
-    in[i] = _mm256_add_epi16(in[i], rounding);
-    in[i] = _mm256_add_epi16(in[i], sign);
-    in[i] = _mm256_srai_epi16(in[i], bit);
-    i += 1;
-  }
-}
-
-// Positive rounding
-static INLINE void right_shift_32x32(__m256i *in0, __m256i *in1) {
-  const int bit = 4;
-  right_shift_32x32_16col(bit, in0);
-  right_shift_32x32_16col(bit, in1);
-}
-
-#if CONFIG_EXT_TX
-static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
-  int i = 0;
-  while (i < 32) {
-    in0[i] = _mm256_slli_epi16(in0[i], 2);
-    in1[i] = _mm256_slli_epi16(in1[i], 2);
-    i += 1;
-  }
-  mm256_transpose_32x32(in0, in1);
-}
-#endif
-
-void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
-                       TxfmParam *txfm_param) {
-  __m256i in0[32];  // left 32 columns
-  __m256i in1[32];  // right 32 columns
-  const TX_TYPE tx_type = txfm_param->tx_type;
-#if CONFIG_MRC_TX
-  assert(tx_type != MRC_DCT && "No avx2 32x32 implementation of MRC_DCT");
-#endif
-
-  switch (tx_type) {
-    case DCT_DCT:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fdct32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fdct32_avx2(in0, in1);
-      break;
-#if CONFIG_EXT_TX
-    case ADST_DCT:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fdct32_avx2(in0, in1);
-      break;
-    case DCT_ADST:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fdct32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case ADST_ADST:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case FLIPADST_DCT:
-      load_buffer_32x32(input, stride, 1, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fdct32_avx2(in0, in1);
-      break;
-    case DCT_FLIPADST:
-      load_buffer_32x32(input, stride, 0, 1, in0, in1);
-      fdct32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case FLIPADST_FLIPADST:
-      load_buffer_32x32(input, stride, 1, 1, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case ADST_FLIPADST:
-      load_buffer_32x32(input, stride, 0, 1, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case FLIPADST_ADST:
-      load_buffer_32x32(input, stride, 1, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case IDTX:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fidtx32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fidtx32_avx2(in0, in1);
-      break;
-    case V_DCT:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fdct32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fidtx32_avx2(in0, in1);
-      break;
-    case H_DCT:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fidtx32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fdct32_avx2(in0, in1);
-      break;
-    case V_ADST:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fidtx32_avx2(in0, in1);
-      break;
-    case H_ADST:
-      load_buffer_32x32(input, stride, 0, 0, in0, in1);
-      fidtx32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-    case V_FLIPADST:
-      load_buffer_32x32(input, stride, 1, 0, in0, in1);
-      fhalfright32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fidtx32_avx2(in0, in1);
-      break;
-    case H_FLIPADST:
-      load_buffer_32x32(input, stride, 0, 1, in0, in1);
-      fidtx32_avx2(in0, in1);
-      right_shift_32x32(in0, in1);
-      fhalfright32_avx2(in0, in1);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-  write_buffer_32x32(in0, in1, output);
-  _mm256_zeroupper();
-}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
index 7186b6b92..30983d1c1 100644
--- a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
@@ -14,6 +14,8 @@
 
 %include "aom_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ; void av1_temporal_filter_apply_sse2 | arg
 ;  (unsigned char  *frame1,           |  0
 ;   unsigned int    stride,           |  1
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
index bf233ca4d..4d2e99f25 100644
--- a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
@@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
   uint64_t csse;
 
   const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
-  const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+  const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff);
 
   __m128i v_acc0_q = _mm_setzero_si128();
 
diff --git a/third_party/aom/build/cmake/aom_config.c.cmake b/third_party/aom/build/cmake/aom_config.c.template
index 62f0a10ab..62f0a10ab 100644
--- a/third_party/aom/build/cmake/aom_config.c.cmake
+++ b/third_party/aom/build/cmake/aom_config.c.template
diff --git a/third_party/aom/build/cmake/aom_config_defaults.cmake b/third_party/aom/build/cmake/aom_config_defaults.cmake
index 488401be1..c7252f064 100644
--- a/third_party/aom/build/cmake/aom_config_defaults.cmake
+++ b/third_party/aom/build/cmake/aom_config_defaults.cmake
@@ -1,29 +1,28 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
 # Defaults for every libaom configuration variable. Here we add all libaom
 # config variables to the cmake variable cache, but omit the FORCE parameter to
 # allow users to specify values when executing cmake to generate build files.
 # Values here are used only if not set by the user.
 set(INLINE "" CACHE STRING "Sets INLINE value for current target.")
-set(RESTRICT "" CACHE STRING "Sets RESTRICT value for current target.")
 
 # CPUs.
 set(ARCH_ARM 0 CACHE NUMBER "Enables ARM architecture.")
 set(ARCH_MIPS 0 CACHE NUMBER "Enables MIPS architecture.")
+set(ARCH_PPC 0 CACHE NUMBER "Enables PPC architecture.")
 set(ARCH_X86 0 CACHE NUMBER "Enables X86 architecture.")
 set(ARCH_X86_64 0 CACHE NUMBER "Enables X86_64 architecture.")
 
 # ARM optimization flags.
 set(HAVE_NEON 0 CACHE NUMBER "Enables NEON intrinsics optimizations.")
-set(HAVE_NEON_ASM 0 CACHE NUMBER "Enables NEON assembly optimizations.")
 
 # MIPS optimization flags.
 set(HAVE_DSPR2 0 CACHE NUMBER "Enables DSPR2 optimizations.")
@@ -31,6 +30,9 @@ set(HAVE_MIPS32 0 CACHE NUMBER "Enables MIPS32 optimizations.")
 set(HAVE_MIPS64 0 CACHE NUMBER "Enables MIPS64 optimizations. ")
 set(HAVE_MSA 0 CACHE NUMBER "Enables MSA optimizations.")
 
+# PPC optimization flags.
+set(HAVE_VSX 0 CACHE NUMBER "Enables VSX optimizations.")
+
 # x86/x86_64 optimization flags.
 set(HAVE_AVX 0 CACHE NUMBER "Enables AVX optimizations.")
 set(HAVE_AVX2 0 CACHE NUMBER "Enables AVX2 optimizations.")
@@ -39,33 +41,16 @@ set(HAVE_SSE 0 CACHE NUMBER "Enables SSE optimizations.")
 set(HAVE_SSE2 0 CACHE NUMBER "Enables SSE2 optimizations.")
 set(HAVE_SSE3 0 CACHE NUMBER "Enables SSE3 optimizations.")
 set(HAVE_SSE4_1 0 CACHE NUMBER "Enables SSE 4.1 optimizations.")
+set(HAVE_SSE4_2 0 CACHE NUMBER "Enables SSE 4.2 optimizations.")
 set(HAVE_SSSE3 0 CACHE NUMBER "Enables SSSE3 optimizations.")
 
 # Flags describing the build environment.
-set(HAVE_AOM_PORTS 0 CACHE NUMBER "Internal flag, deprecated.")
 set(HAVE_FEXCEPT 0 CACHE NUMBER "Internal flag, GNU fenv.h present for target.")
 set(HAVE_PTHREAD_H 0 CACHE NUMBER "Internal flag, target pthread support.")
 set(HAVE_UNISTD_H 0 CACHE NUMBER "Internal flag, unistd.h present for target.")
 set(HAVE_WXWIDGETS 0 CACHE NUMBER "WxWidgets present.")
 
-# Deprecated flags preserved for compatibility with configure build.
-set(CONFIG_CODEC_SRCS 0 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_DEBUG_LIBS 0 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_DEPENDENCY_TRACKING 1 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_EXPERIMENTAL 0 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_EXTERNAL_BUILD 0 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_INSTALL_BINS 0 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_INSTALL_DOCS 0 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_INSTALL_LIBS 0 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_INSTALL_SRCS 0 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_POSTPROC 1 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_POSTPROC_VISUALIZER 0 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_RVCT 0 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_SMALL 0 CACHE NUMBER "Deprecated flag.")
-set(CONFIG_STATIC_MSVCRT 0 CACHE NUMBER "Deprecated flag.")
-
 # Build configuration flags.
-set(CONFIG_AV1 1 CACHE NUMBER "Internal flag.")
 set(CONFIG_AV1_DECODER 1 CACHE NUMBER "Enable AV1 decoder.")
 set(CONFIG_AV1_ENCODER 1 CACHE NUMBER "Enable AV1 encoder.")
 set(CONFIG_BIG_ENDIAN 0 CACHE NUMBER "Internal flag.")
@@ -85,131 +70,26 @@ set(CONFIG_WEBM_IO 1 CACHE NUMBER "Enables WebM support.")
 # Debugging flags.
 set(CONFIG_BITSTREAM_DEBUG 0 CACHE NUMBER "Bitstream debugging flag.")
 set(CONFIG_DEBUG 0 CACHE NUMBER "Debug build flag.")
-
-# Testing flags.
-set(CONFIG_DECODE_PERF_TESTS 0 CACHE NUMBER "Enables decoder performance test.")
-set(CONFIG_ENCODE_PERF_TESTS 0 CACHE NUMBER "Enables encoder performance test.")
-set(CONFIG_UNIT_TESTS 1 CACHE NUMBER "Enables unit tests.")
+set(CONFIG_MISMATCH_DEBUG 0 CACHE NUMBER "Mismatch debugging flag.")
 
 # AV1 feature flags.
 set(CONFIG_ACCOUNTING 0 CACHE NUMBER "Enables bit accounting.")
 set(CONFIG_ANALYZER 0 CACHE NUMBER "Enables bit stream analyzer.")
 set(CONFIG_COEFFICIENT_RANGE_CHECKING 0 CACHE NUMBER "Coefficient range check.")
-set(CONFIG_HIGHBITDEPTH 1 CACHE NUMBER "Enables high bit depth support.")
+set(CONFIG_FILEOPTIONS 1 CACHE NUMBER "Enables encoder config file support.")
 set(CONFIG_INSPECTION 0 CACHE NUMBER "Enables bitstream inspection.")
-set(CONFIG_INTERNAL_STATS 0 CACHE NUMBER "Codec stats.")
-set(CONFIG_LOWBITDEPTH 1 CACHE NUMBER "Enables low bit depth support.")
-set(CONFIG_REALTIME_ONLY 0 CACHE NUMBER "Support only realtime encodes.")
+set(CONFIG_INTERNAL_STATS 0 CACHE NUMBER "Enables internal encoder stats.")
+set(CONFIG_LOWBITDEPTH 0 CACHE NUMBER "Enables 8-bit optimized pipeline.")
 set(CONFIG_SIZE_LIMIT 0 CACHE NUMBER "Limit max decode width/height.")
 set(CONFIG_SPATIAL_RESAMPLING 1 CACHE NUMBER "Spatial resampling.")
-set(CONFIG_SYMBOLRATE 0 CACHE NUMBER "Enables symbol rate accounting.")
+set(DECODE_HEIGHT_LIMIT 0 CACHE NUMBER "Set limit for decode height.")
+set(DECODE_WIDTH_LIMIT 0 CACHE NUMBER "Set limit for decode width.")
 
 # AV1 experiment flags.
-set(CONFIG_ADAPT_SCAN 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_AMVR 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_ANS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_AOM_QM 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_BGSPRITE 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_CB4X4 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_CDEF 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_CDEF_SINGLEPASS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_CFL 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_CHROMA_2X2 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_CHROMA_SUB8X8 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_COEF_INTERLEAVE 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_COLORSPACE_HEADERS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_COMPOUND_ROUND 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_COMPOUND_SEGMENT 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_COMPOUND_SINGLEREF 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_CONVOLVE_ROUND 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_CTX1D 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_DCT16 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_DCT32 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_DCT4 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_DCT64 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_DCT8 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DAALA_TX 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DCT_ONLY 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DEBLOCK_13TAP 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DEPENDENT_HORZTILES 0 CACHE NUMBER "AV1 experiment flag.")
+set(CONFIG_COLLECT_INTER_MODE_RD_STATS 1 CACHE NUMBER "AV1 experiment flag.")
+set(CONFIG_COLLECT_RD_STATS 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_DIST_8X8 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DUAL_FILTER 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_ENTROPY_STATS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_EXT_COMP_REFS 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_EXT_DELTA_Q 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_EXT_INTRA 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_EXT_PARTITION 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_EXT_PARTITION_TYPES 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_EXT_PARTITION_TYPES_AB 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_EXT_REFS 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_EXT_SKIP 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_EXT_TILE 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_EXT_TX 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_FILTER_INTRA 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_FP_MB_STATS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_FRAME_MARKER 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_FRAME_SIGN_BIAS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_FRAME_SIZE 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_FRAME_SUPERRES 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_GLOBAL_MOTION 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_HASH_ME 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_HORZONLY_FRAME_SUPERRES 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_INTERINTRA 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_INTER_STATS_ONLY 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_INTRABC 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_INTRA_EDGE 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_INTRA_INTERP 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_JNT_COMP 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_KF_CTX 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_LGT 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_LGT_FROM_PRED 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_LOOPFILTERING_ACROSS_TILES 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_LOOPFILTER_LEVEL 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_LOOP_RESTORATION 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_LPF_DIRECT 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_LPF_SB 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_LV_MAP 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_MASKED_TX 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_MAX_TILE 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_MFMV 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_MOTION_VAR 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_MRC_TX 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_MV_COMPRESS 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_NCOBMC 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_NCOBMC_ADAPT_WEIGHT 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_NEW_MULTISYMBOL 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_NEW_QUANT 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_NO_FRAME_CONTEXT_SIGNALING 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_OBU 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_ONE_SIDED_COMPOUND 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_OPT_REF_MV 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_PALETTE_DELTA_ENCODING 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_PALETTE_THROUGHPUT 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_PARALLEL_DEBLOCKING 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_PVQ 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_Q_ADAPT_PROBS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_RAWBITS 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_RD_DEBUG 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_RECT_TX 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_RECT_TX_EXT 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_REFERENCE_BUFFER 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_REF_ADAPT 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_RESTRICT_COMPRESSED_HDR 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_SBL_SYMBOL 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_SEGMENT_ZEROMV 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_SMOOTH_HV 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_STRIPED_LOOP_RESTORATION 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_SUPERTX 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_TEMPMV_SIGNALING 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_TMV 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_TPL_MV 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_TX64X64 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_TXK_SEL 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_TXMG 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_UNPOISON_PARTITION_CTX 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_VAR_REFS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_VAR_TX 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_VAR_TX_NO_TX_MODE 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_WARPED_MOTION 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_WEDGE 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_XIPHRC 0 CACHE NUMBER "AV1 experiment flag.")
diff --git a/third_party/aom/build/cmake/aom_configure.cmake b/third_party/aom/build/cmake/aom_configure.cmake
index 3553710d3..5d782aaf9 100644
--- a/third_party/aom/build/cmake/aom_configure.cmake
+++ b/third_party/aom/build/cmake/aom_configure.cmake
@@ -1,23 +1,24 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_)
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_
 set(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ 1)
 
 include(FindGit)
 include(FindPerl)
 include(FindThreads)
-include(FindwxWidgets)
 
 set(AOM_SUPPORTED_CPU_TARGETS
-    "arm64 armv7 armv7s generic mips32 mips64 x86 x86_64")
+    "arm64 armv7 armv7s generic mips32 mips64 ppc x86 x86_64")
 
 # Generate the user config settings. This must occur before include of
 # aom_config_defaults.cmake (because it turns every config variable into a cache
@@ -25,10 +26,10 @@ set(AOM_SUPPORTED_CPU_TARGETS
 get_cmake_property(cmake_cache_vars CACHE_VARIABLES)
 foreach(cache_var ${cmake_cache_vars})
   get_property(cache_var_helpstring CACHE ${cache_var} PROPERTY HELPSTRING)
-  set(cmdline_helpstring  "No help, variable specified on the command line.")
+  set(cmdline_helpstring "No help, variable specified on the command line.")
   if("${cache_var_helpstring}" STREQUAL "${cmdline_helpstring}")
     set(AOM_CMAKE_CONFIG "${AOM_CMAKE_CONFIG} -D${cache_var}=${${cache_var}}")
-  endif ()
+  endif()
 endforeach()
 string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG)
 
@@ -41,156 +42,158 @@ include("${AOM_ROOT}/build/cmake/util.cmake")
 
 # Build a list of all configurable variables.
 get_cmake_property(cmake_cache_vars CACHE_VARIABLES)
-foreach (var ${cmake_cache_vars})
-  if ("${var}" MATCHES "^CONFIG_")
+foreach(var ${cmake_cache_vars})
+  if("${var}" MATCHES "^CONFIG_")
     list(APPEND AOM_CONFIG_VARS ${var})
-  endif ()
-endforeach ()
-
-# Adopted experiments get enabled by default. For debugging, make it possible to
-# to turn them all off with a single option.
-if (NOT ENABLE_ADOPTED_EXPERIMENTS)
-  get_cmake_property(cmake_cache_vars CACHE_VARIABLES)
-  unset(var)
-  foreach (var ${cmake_cache_vars})
-    unset(var_helpstring)
-    get_property(var_helpstring CACHE ${var} PROPERTY HELPSTRING)
-    if ("${var_helpstring}" STREQUAL "AV1 experiment flag.")
-     if ("${var}" STREQUAL "CONFIG_CB4X4")
-       # CB4X4 is required and can not be disabled.
-     else ()
-       set(${var} 0)
-     endif ()
-    endif ()
-  endforeach ()
-endif ()
+  endif()
+endforeach()
 
 # Detect target CPU.
-if (NOT AOM_TARGET_CPU)
-  if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64" OR
-      "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-    if (${CMAKE_SIZEOF_VOID_P} EQUAL 4)
+if(NOT AOM_TARGET_CPU)
+  if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64" OR
+     "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
+    if(${CMAKE_SIZEOF_VOID_P} EQUAL 4)
       set(AOM_TARGET_CPU "x86")
-    elseif (${CMAKE_SIZEOF_VOID_P} EQUAL 8)
+    elseif(${CMAKE_SIZEOF_VOID_P} EQUAL 8)
       set(AOM_TARGET_CPU "x86_64")
-    else ()
+    else()
       message(FATAL_ERROR
-              "--- Unexpected pointer size (${CMAKE_SIZEOF_VOID_P}) for\n"
-              "      CMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}\n"
-              "      CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n"
-              "      CMAKE_GENERATOR=${CMAKE_GENERATOR}\n")
-    endif ()
-  elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i386" OR
-          "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86")
+                "--- Unexpected pointer size (${CMAKE_SIZEOF_VOID_P}) for\n"
+                "      CMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}\n"
+                "      CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n"
+                "      CMAKE_GENERATOR=${CMAKE_GENERATOR}\n")
+    endif()
+  elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i386" OR
+         "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86")
     set(AOM_TARGET_CPU "x86")
-  elseif ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^arm" OR
-          "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^mips")
+  elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^arm" OR
+         "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^mips")
     set(AOM_TARGET_CPU "${CMAKE_SYSTEM_PROCESSOR}")
-  endif ()
-endif ()
+  elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64")
+    set(AOM_TARGET_CPU "arm64")
+  elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^ppc")
+    set(AOM_TARGET_CPU "ppc")
+  else()
+    message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not "
+                    "supported, falling back to the generic target")
+    set(AOM_TARGET_CPU "generic")
+  endif()
+endif()
 
-if (CMAKE_TOOLCHAIN_FILE)
-  # Add toolchain file to config string.
+if(CMAKE_TOOLCHAIN_FILE) # Add toolchain file to config string.
   set(toolchain_string "-DCMAKE_TOOLCHAIN_FILE=\\\"${CMAKE_TOOLCHAIN_FILE}\\\"")
   set(AOM_CMAKE_CONFIG "${toolchain_string} ${AOM_CMAKE_CONFIG}")
-else ()
+else()
+
   # Add detected CPU to the config string.
   set(AOM_CMAKE_CONFIG "-DAOM_TARGET_CPU=${AOM_TARGET_CPU} ${AOM_CMAKE_CONFIG}")
-endif ()
+endif()
 set(AOM_CMAKE_CONFIG "-G \\\"${CMAKE_GENERATOR}\\\" ${AOM_CMAKE_CONFIG}")
 string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG)
 
 message("--- aom_configure: Detected CPU: ${AOM_TARGET_CPU}")
 set(AOM_TARGET_SYSTEM ${CMAKE_SYSTEM_NAME})
 
-if ("${CMAKE_BUILD_TYPE}" MATCHES "Deb")
+if("${CMAKE_BUILD_TYPE}" MATCHES "Deb")
   set(CONFIG_DEBUG 1)
-endif ()
+endif()
 
-if (NOT MSVC)
-  if (BUILD_SHARED_LIBS)
+if(NOT MSVC)
+  if(BUILD_SHARED_LIBS)
     set(CONFIG_PIC 1)
     set(CONFIG_SHARED 1)
     set(CONFIG_STATIC 0)
-  endif ()
+  endif()
 
-  if (CONFIG_PIC)
+  if(CONFIG_PIC)
+
+    # TODO(tomfinegan): clang needs -pie in CMAKE_EXE_LINKER_FLAGS for this to
+    # work.
     set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-    if ("${AOM_TARGET_SYSTEM}" STREQUAL "Linux" AND
-        "${AOM_TARGET_CPU}" MATCHES "^armv7")
+    if("${AOM_TARGET_SYSTEM}" STREQUAL "Linux" AND "${AOM_TARGET_CPU}" MATCHES
+       "^armv7")
       set(AOM_AS_FLAGS ${AOM_AS_FLAGS} --defsym PIC=1)
-    else ()
+    else()
       set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -DPIC)
-    endif ()
-  endif ()
-else ()
+    endif()
+  endif()
+else()
   set(CONFIG_MSVS 1)
-endif ()
+endif()
 
-if (NOT "${AOM_SUPPORTED_CPU_TARGETS}" MATCHES "${AOM_TARGET_CPU}")
-  message(FATAL_ERROR "No RTCD support for ${AOM_TARGET_CPU}. Create it, or "
-          "add -DAOM_TARGET_CPU=generic to your cmake command line for a "
-          "generic build of libaom and tools.")
-endif ()
+if(NOT "${AOM_SUPPORTED_CPU_TARGETS}" MATCHES "${AOM_TARGET_CPU}")
+  message(FATAL_ERROR
+            "No RTCD support for ${AOM_TARGET_CPU}. Create it, or "
+            "add -DAOM_TARGET_CPU=generic to your cmake command line for a "
+            "generic build of libaom and tools.")
+endif()
 
-if ("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
-  if (ENABLE_NASM)
+if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+  find_program(AS_EXECUTABLE yasm $ENV{YASM_PATH})
+  if(NOT AS_EXECUTABLE OR ENABLE_NASM)
+    unset(AS_EXECUTABLE CACHE)
     find_program(AS_EXECUTABLE nasm $ENV{NASM_PATH})
-    test_nasm()
-    set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -Ox)
-  else ()
-    find_program(AS_EXECUTABLE yasm $ENV{YASM_PATH})
-  endif ()
-
-  if (NOT AS_EXECUTABLE)
-    message(FATAL_ERROR "Unable to find assembler. To build without "
-            "optimizations, add -DAOM_TARGET_CPU=generic to your cmake command "
-            "line.")
-  endif ()
+    if(AS_EXECUTABLE)
+      test_nasm()
+    endif()
+  endif()
+
+  if(NOT AS_EXECUTABLE)
+    message(FATAL_ERROR
+              "Unable to find assembler. Install 'yasm' or 'nasm.' "
+              "To build without optimizations, add -DAOM_TARGET_CPU=generic to "
+              "your cmake command line.")
+  endif()
   get_asm_obj_format("objformat")
   set(AOM_AS_FLAGS -f ${objformat} ${AOM_AS_FLAGS})
   string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS)
-elseif ("${AOM_TARGET_CPU}" MATCHES "arm")
-  if ("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+elseif("${AOM_TARGET_CPU}" MATCHES "arm")
+  if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
     set(AS_EXECUTABLE as)
     set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT})
-  elseif ("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
-    # arm linux assembler settings controlled by
-    # build/cmake/toolchains/arm*-linux*.cmake
-  endif ()
-  if (NOT AS_EXECUTABLE)
+  elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+    if(NOT AS_EXECUTABLE)
+      set(AS_EXECUTABLE as)
+    endif()
+  elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
+    if(NOT AS_EXECUTABLE)
+      set(AS_EXECUTABLE ${CMAKE_C_COMPILER} -c -mimplicit-it=always)
+    endif()
+  endif()
+  if(NOT AS_EXECUTABLE)
     message(FATAL_ERROR
-            "Unknown assembler for: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}")
-  endif ()
+              "Unknown assembler for: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}")
+  endif()
 
   string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS)
-endif ()
+endif()
 
-if (CONFIG_ANALYZER)
+if(CONFIG_ANALYZER)
+  include(FindwxWidgets)
   find_package(wxWidgets REQUIRED adv base core)
   include(${wxWidgets_USE_FILE})
-endif ()
+endif()
 
-if (NOT MSVC AND CMAKE_C_COMPILER_ID MATCHES "GNU\|Clang")
+if(NOT MSVC AND CMAKE_C_COMPILER_ID MATCHES "GNU\|Clang")
   set(CONFIG_GCC 1)
-endif ()
+endif()
 
-if (CONFIG_GCOV)
+if(CONFIG_GCOV)
   message("--- Testing for CONFIG_GCOV support.")
   require_linker_flag("-fprofile-arcs -ftest-coverage")
   require_compiler_flag("-fprofile-arcs -ftest-coverage" YES)
-endif ()
+endif()
 
-if (CONFIG_GPROF)
+if(CONFIG_GPROF)
   message("--- Testing for CONFIG_GPROF support.")
   require_compiler_flag("-pg" YES)
-endif ()
+endif()
 
-if ("${AOM_TARGET_SYSTEM}" MATCHES "Darwin\|Linux\|Windows")
+if("${AOM_TARGET_SYSTEM}" MATCHES "Darwin\|Linux\|Windows")
   set(CONFIG_OS_SUPPORT 1)
-endif ()
+endif()
 
-################################################################################
+#
 # Fix CONFIG_* dependencies. This must be done before including cpu.cmake to
 # ensure RTCD_CONFIG_* are properly set.
 fix_experiment_configs()
@@ -198,54 +201,65 @@ fix_experiment_configs()
 # Test compiler support.
 aom_get_inline("INLINE")
 
-# TODO(tomfinegan): aom_ports_check is legacy; HAVE_AOM_PORTS is not used
-# anywhere in the aom sources. To be removed after parity with the legacy
-# build system stops being important.
-aom_check_source_compiles("aom_ports_check"
-                          "#include \"${AOM_ROOT}/aom/aom_integer.h\""
-                          HAVE_AOM_PORTS)
-aom_check_source_compiles("pthread_check" "#include <pthread.h>" HAVE_PTHREAD_H)
+# Don't just check for pthread.h, but use the result of the full pthreads
+# including a linking check in FindThreads above.
+set(HAVE_PTHREAD_H ${CMAKE_USE_PTHREADS_INIT})
 aom_check_source_compiles("unistd_check" "#include <unistd.h>" HAVE_UNISTD_H)
 
-if (NOT MSVC)
+if(NOT MSVC)
   aom_push_var(CMAKE_REQUIRED_LIBRARIES "m")
-  aom_check_c_compiles("fenv_check"
-                       "#define _GNU_SOURCE
+  aom_check_c_compiles(
+    "fenv_check"
+    "#define _GNU_SOURCE
                         #include <fenv.h>
                         void unused(void) {
                           (void)unused;
                           (void)feenableexcept(FE_DIVBYZERO | FE_INVALID);
-                        }" HAVE_FEXCEPT)
+                        }"
+    HAVE_FEXCEPT)
   aom_pop_var(CMAKE_REQUIRED_LIBRARIES)
 endif()
 
 include("${AOM_ROOT}/build/cmake/cpu.cmake")
 
-if (ENABLE_CCACHE)
+if(ENABLE_CCACHE)
   set_compiler_launcher(ENABLE_CCACHE ccache)
-endif ()
+endif()
 
-if (ENABLE_DISTCC)
+if(ENABLE_DISTCC)
   set_compiler_launcher(ENABLE_DISTCC distcc)
-endif ()
+endif()
 
-if (ENABLE_GOMA)
+if(ENABLE_GOMA)
   set_compiler_launcher(ENABLE_GOMA gomacc)
-endif ()
+endif()
 
-if (NOT CONFIG_AV1_DECODER AND NOT CONFIG_AV1_ENCODER)
+if(NOT CONFIG_AV1_DECODER AND NOT CONFIG_AV1_ENCODER)
   message(FATAL_ERROR "Decoder and encoder disabled, nothing to build.")
-endif ()
+endif()
+
+if(DECODE_HEIGHT_LIMIT OR DECODE_WIDTH_LIMIT)
+  change_config_and_warn(CONFIG_SIZE_LIMIT 1
+                         "DECODE_HEIGHT_LIMIT and DECODE_WIDTH_LIMIT")
+endif()
+
+if(CONFIG_SIZE_LIMIT)
+  if(NOT DECODE_HEIGHT_LIMIT OR NOT DECODE_WIDTH_LIMIT)
+    message(FATAL_ERROR "When setting CONFIG_SIZE_LIMIT, DECODE_HEIGHT_LIMIT "
+                        "and DECODE_WIDTH_LIMIT must be set.")
+  endif()
+endif()
 
 # Test compiler flags.
-if (MSVC)
+if(MSVC)
   add_compiler_flag_if_supported("/W3")
+
   # Disable MSVC warnings that suggest making code non-portable.
   add_compiler_flag_if_supported("/wd4996")
-  if (ENABLE_WERROR)
+  if(ENABLE_WERROR)
     add_compiler_flag_if_supported("/WX")
-  endif ()
-else ()
+  endif()
+else()
   require_c_flag("-std=c99" YES)
   add_compiler_flag_if_supported("-Wall")
   add_compiler_flag_if_supported("-Wdisabled-optimization")
@@ -261,8 +275,9 @@ else ()
   add_compiler_flag_if_supported("-Wuninitialized")
   add_compiler_flag_if_supported("-Wunused")
   add_compiler_flag_if_supported("-Wvla")
-  # TODO(jzern): this could be added as a cxx flags for test/*.cc only,
-  # avoiding third_party.
+
+  # TODO(jzern): this could be added as a cxx flags for test/*.cc only, avoiding
+  # third_party.
   add_c_flag_if_supported("-Wshorten-64-to-32")
 
   # Add -Wshadow only for C files to avoid massive gtest warning spam.
@@ -271,67 +286,64 @@ else ()
   # Add -Wundef only for C files to avoid massive gtest warning spam.
   add_c_flag_if_supported("-Wundef")
 
-  if (ENABLE_WERROR)
+  if(ENABLE_WERROR)
     add_compiler_flag_if_supported("-Werror")
-  endif ()
-  # Flag(s) added here negate CMake defaults and produce build output similar
-  # to the existing configure/make build system.
-  add_compiler_flag_if_supported("-Wno-unused-function")
+  endif()
 
-  if ("${CMAKE_BUILD_TYPE}" MATCHES "Rel")
+  if("${CMAKE_BUILD_TYPE}" MATCHES "Rel")
     add_compiler_flag_if_supported("-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0")
-  endif ()
+  endif()
   add_compiler_flag_if_supported("-D_LARGEFILE_SOURCE")
   add_compiler_flag_if_supported("-D_FILE_OFFSET_BITS=64")
-endif ()
+endif()
 
 set(AOM_LIB_LINK_TYPE PUBLIC)
-if (EMSCRIPTEN)
+if(EMSCRIPTEN)
+
   # Avoid CMake generation time errors resulting from collisions with the form
   # of target_link_libraries() used by Emscripten.cmake.
   unset(AOM_LIB_LINK_TYPE)
-endif ()
+endif()
 
 # Generate aom_config templates.
-set(aom_config_asm_template "${AOM_CONFIG_DIR}/aom_config.asm.cmake")
-set(aom_config_h_template "${AOM_CONFIG_DIR}/aom_config.h.cmake")
-execute_process(COMMAND ${CMAKE_COMMAND}
-  -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
-  -DAOM_ROOT=${AOM_ROOT}
-  -P "${AOM_ROOT}/build/cmake/generate_aom_config_templates.cmake")
+set(aom_config_asm_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake")
+set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake")
+execute_process(COMMAND
+                  ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+                  -DAOM_ROOT=${AOM_ROOT} -P
+                  "${AOM_ROOT}/build/cmake/generate_aom_config_templates.cmake")
 
 # Generate aom_config.{asm,h}.
-configure_file("${aom_config_asm_template}" "${AOM_CONFIG_DIR}/aom_config.asm")
-configure_file("${aom_config_h_template}" "${AOM_CONFIG_DIR}/aom_config.h")
+configure_file("${aom_config_asm_template}"
+               "${AOM_CONFIG_DIR}/config/aom_config.asm")
+configure_file("${aom_config_h_template}"
+               "${AOM_CONFIG_DIR}/config/aom_config.h")
 
 # Read the current git hash.
 find_package(Git)
-if (NOT GIT_FOUND)
+if(NOT GIT_FOUND)
   message("--- Git missing, version will be read from CHANGELOG.")
-endif ()
+endif()
 
-configure_file("${AOM_ROOT}/build/cmake/aom_config.c.cmake"
-               "${AOM_CONFIG_DIR}/aom_config.c")
+configure_file("${AOM_ROOT}/build/cmake/aom_config.c.template"
+               "${AOM_CONFIG_DIR}/config/aom_config.c")
 
 # Find Perl and generate the RTCD sources.
 find_package(Perl)
-if (NOT PERL_FOUND)
+if(NOT PERL_FOUND)
   message(FATAL_ERROR "Perl is required to build libaom.")
-endif ()
+endif()
 
 configure_file("${AOM_CONFIG_DIR}/rtcd_config.cmake"
                "${AOM_CONFIG_DIR}/${AOM_TARGET_CPU}_rtcd_config.rtcd")
 
-set(AOM_RTCD_CONFIG_FILE_LIST
-    "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
+set(AOM_RTCD_CONFIG_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
     "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
     "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl")
-set(AOM_RTCD_HEADER_FILE_LIST
-    "${AOM_CONFIG_DIR}/aom_dsp_rtcd.h"
-    "${AOM_CONFIG_DIR}/aom_scale_rtcd.h"
-    "${AOM_CONFIG_DIR}/av1_rtcd.h")
-set(AOM_RTCD_SOURCE_FILE_LIST
-    "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
+set(AOM_RTCD_HEADER_FILE_LIST "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h"
+    "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h"
+    "${AOM_CONFIG_DIR}/config/av1_rtcd.h")
+set(AOM_RTCD_SOURCE_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
     "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
     "${AOM_ROOT}/av1/common/av1_rtcd.c")
 set(AOM_RTCD_SYMBOL_LIST aom_dsp_rtcd aom_scale_rtcd av1_rtcd)
@@ -345,32 +357,26 @@ foreach(NUM RANGE ${AOM_RTCD_CUSTOM_COMMAND_COUNT})
   list(GET AOM_RTCD_SYMBOL_LIST ${NUM} AOM_RTCD_SYMBOL)
   execute_process(
     COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/make/rtcd.pl"
-      --arch=${AOM_TARGET_CPU} --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS}
-      --config=${AOM_CONFIG_DIR}/${AOM_TARGET_CPU}_rtcd_config.rtcd
-      ${AOM_RTCD_CONFIG_FILE}
+            --arch=${AOM_TARGET_CPU}
+            --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS}
+            --config=${AOM_CONFIG_DIR}/${AOM_TARGET_CPU}_rtcd_config.rtcd
+            ${AOM_RTCD_CONFIG_FILE}
     OUTPUT_FILE ${AOM_RTCD_HEADER_FILE})
 endforeach()
 
 # Generate aom_version.h.
-execute_process(
-  COMMAND ${CMAKE_COMMAND}
-  -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
-  -DAOM_ROOT=${AOM_ROOT}
-  -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
-  -DPERL_EXECUTABLE=${PERL_EXECUTABLE}
-  -P "${AOM_ROOT}/build/cmake/version.cmake")
-
-if (NOT MSVC)
-  # Generate aom.pc (pkg-config file).
-  execute_process(
-    COMMAND ${CMAKE_COMMAND}
-    -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
-    -DAOM_ROOT=${AOM_ROOT}
-    -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
-    -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
-    -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
-    -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H}
-    -P "${AOM_ROOT}/build/cmake/pkg_config.cmake")
-endif ()
-
-endif ()  # AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_
+execute_process(COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+                        -DAOM_ROOT=${AOM_ROOT}
+                        -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+                        -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
+                        "${AOM_ROOT}/build/cmake/version.cmake")
+
+if(NOT MSVC) # Generate aom.pc (pkg-config file).
+  execute_process(COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+                          -DAOM_ROOT=${AOM_ROOT}
+                          -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
+                          -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
+                          -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
+                          -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H} -P
+                          "${AOM_ROOT}/build/cmake/pkg_config.cmake")
+endif()
diff --git a/third_party/aom/build/cmake/aom_experiment_deps.cmake b/third_party/aom/build/cmake/aom_experiment_deps.cmake
index 938af2386..e2c8102aa 100644
--- a/third_party/aom/build/cmake/aom_experiment_deps.cmake
+++ b/third_party/aom/build/cmake/aom_experiment_deps.cmake
@@ -1,139 +1,28 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_
 set(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_ 1)
 
 # Adjusts CONFIG_* CMake variables to address conflicts between active AV1
 # experiments.
-macro (fix_experiment_configs)
-  if (CONFIG_ANALYZER)
-    if (NOT CONFIG_INSPECTION)
-      change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER)
-    endif ()
-  endif ()
+macro(fix_experiment_configs)
 
-  if (CONFIG_VAR_TX_NO_TX_MODE AND NOT CONFIG_VAR_TX)
-     change_config_and_warn(CONFIG_VAR_TX 1 CONFIG_VAR_TX_NO_TX_MODE)
-  endif ()
+  if(CONFIG_ANALYZER)
+    change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER)
+  endif()
 
-  if (CONFIG_CHROMA_2X2)
-    change_config_and_warn(CONFIG_CHROMA_SUB8X8 0 CONFIG_CHROMA_2X2)
-  endif ()
+  if(CONFIG_RD_DEBUG)
+    change_config_and_warn(CONFIG_RD_DEBUG 0 CONFIG_JNT_COMP)
+  endif()
 
-  if (CONFIG_DAALA_TX)
-     set(CONFIG_DAALA_DCT4 1)
-     set(CONFIG_DAALA_DCT8 1)
-     set(CONFIG_DAALA_DCT16 1)
-     set(CONFIG_DAALA_DCT32 1)
-     set(CONFIG_DAALA_DCT64 1)
-  endif ()
-
-  if (CONFIG_DAALA_DCT64)
-    if (NOT CONFIG_TX64X64)
-      set(CONFIG_DAALA_DCT64 0)
-      message("--- DAALA_DCT64 requires TX64X64: disabled DAALA_DCT64")
-    endif ()
-  endif ()
-
-  if (CONFIG_DAALA_DCT4 OR CONFIG_DAALA_DCT8 OR CONFIG_DAALA_DCT16 OR
-      CONFIG_DAALA_DCT32 OR CONFIG_DAALA_DCT64)
-    if (CONFIG_LGT)
-      change_config_and_warn(CONFIG_LGT 0 CONFIG_DAALA_DCTx)
-    endif ()
-    if (NOT CONFIG_LOWBITDEPTH)
-      change_config_and_warn(CONFIG_LOWBITDEPTH 1 CONFIG_DAALA_DCTx)
-    endif ()
-  endif ()
-
-  if (CONFIG_TXK_SEL)
-    if (NOT CONFIG_LV_MAP)
-      change_config_and_warn(CONFIG_LV_MAP 1 CONFIG_TXK_SEL)
-    endif ()
-  endif ()
-
-  if (CONFIG_CTX1D)
-    if (NOT CONFIG_LV_MAP)
-      change_config_and_warn(CONFIG_LV_MAP 1 CONFIG_CTX1D)
-    endif ()
-    if (NOT CONFIG_EXT_TX)
-      change_config_and_warn(CONFIG_EXT_TX 1 CONFIG_CTX1D)
-    endif ()
-  endif ()
-
-  if (CONFIG_EXT_COMP_REFS)
-    if (NOT CONFIG_EXT_REFS)
-      change_config_and_warn(CONFIG_EXT_REFS 1 CONFIG_EXT_COMP_REFS)
-    endif ()
-  endif ()
-
-  if (CONFIG_STRIPED_LOOP_RESTORATION)
-    if (NOT CONFIG_LOOP_RESTORATION)
-      change_config_and_warn(CONFIG_LOOP_RESTORATION 1
-                             CONFIG_STRIPED_LOOP_RESTORATION)
-    endif ()
-  endif ()
-
-  if (CONFIG_MFMV)
-    if (NOT CONFIG_FRAME_MARKER)
-      change_config_and_warn(CONFIG_FRAME_MARKER 1 CONFIG_MFMV)
-    endif ()
-  endif ()
-
-  if (CONFIG_NEW_MULTISYMBOL)
-    if (NOT CONFIG_RESTRICT_COMPRESSED_HDR)
-      change_config_and_warn(CONFIG_RESTRICT_COMPRESSED_HDR 1
-                             CONFIG_NEW_MULTISYMBOL)
-    endif ()
-  endif ()
-
-  if (CONFIG_EXT_PARTITION_TYPES)
-    if (CONFIG_SUPERTX)
-      change_config_and_warn(CONFIG_SUPERTX 0
-                             CONFIG_EXT_PARTITION_TYPES)
-    endif ()
-  endif ()
-
-  if (CONFIG_JNT_COMP)
-    if (NOT CONFIG_FRAME_MARKER)
-      change_config_and_warn(CONFIG_FRAME_MARKER 1 CONFIG_JNT_COMP)
-    endif ()
-  endif ()
-
-  if (CONFIG_AMVR)
-    change_config_and_warn(CONFIG_HASH_ME 1 CONFIG_AMVR)
-  endif ()
-
-  if (CONFIG_PVQ)
-    if (CONFIG_EXT_TX)
-      change_config_and_warn(CONFIG_EXT_TX 0 CONFIG_PVQ)
-    endif ()
-    if (CONFIG_HIGHBITDEPTH)
-      change_config_and_warn(CONFIG_HIGHBITDEPTH 0 CONFIG_PVQ)
-    endif ()
-    if (CONFIG_PALETTE_THROUGHPUT)
-      change_config_and_warn(CONFIG_PALETTE_THROUGHPUT 0 CONFIG_PVQ)
-    endif ()
-    if (CONFIG_RECT_TX)
-      change_config_and_warn(CONFIG_RECT_TX 0 CONFIG_PVQ)
-    endif ()
-    if (CONFIG_VAR_TX)
-      change_config_and_warn(CONFIG_VAR_TX 0 CONFIG_PVQ)
-    endif ()
-  endif ()
-
-  if (CONFIG_HORZONLY_FRAME_SUPERRES)
-    if (NOT CONFIG_FRAME_SUPERRES)
-      change_config_and_warn(CONFIG_FRAME_SUPERRES 1 CONFIG_HORZONLY_FRAME_SUPERRES)
-    endif ()
-  endif ()
-endmacro ()
-
-endif ()  # AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_
+endmacro()
diff --git a/third_party/aom/build/cmake/aom_optimization.cmake b/third_party/aom/build/cmake/aom_optimization.cmake
index c58c3993e..069ea1bb9 100644
--- a/third_party/aom/build/cmake/aom_optimization.cmake
+++ b/third_party/aom/build/cmake/aom_optimization.cmake
@@ -1,138 +1,146 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_
 set(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_ 1)
 
 include("${AOM_ROOT}/build/cmake/util.cmake")
 
 # Translate $flag to one which MSVC understands, and write the new flag to the
 # variable named by $translated_flag (or unset it, when MSVC needs no flag).
-function (get_msvc_intrinsic_flag flag translated_flag)
-  if ("${flag}" STREQUAL "-mavx")
+function(get_msvc_intrinsic_flag flag translated_flag)
+  if("${flag}" STREQUAL "-mavx")
     set(${translated_flag} "/arch:AVX" PARENT_SCOPE)
-  elseif ("${flag}" STREQUAL "-mavx2")
+  elseif("${flag}" STREQUAL "-mavx2")
     set(${translated_flag} "/arch:AVX2" PARENT_SCOPE)
-  else ()
+  else()
+
     # MSVC does not need flags for intrinsics flavors other than AVX/AVX2.
     unset(${translated_flag} PARENT_SCOPE)
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 # Adds an object library target. Terminates generation if $flag is not supported
 # by the current compiler. $flag is the intrinsics flag required by the current
 # compiler, and is added to the compile flags for all sources in $sources.
-# $opt_name is used to name the target. $target_to_update is made
-# dependent upon the created target.
+# $opt_name is used to name the target. $target_to_update is made dependent upon
+# the created target.
 #
 # Note: the libaom target is always updated because OBJECT libraries have rules
 # that disallow the direct addition of .o files to them as dependencies. Static
 # libraries do not have this limitation.
-function (add_intrinsics_object_library flag opt_name target_to_update sources
-          dependent_target)
+function(add_intrinsics_object_library flag opt_name target_to_update sources
+         dependent_target)
+  if("${${sources}}" STREQUAL "")
+    return()
+  endif()
   set(target_name ${target_to_update}_${opt_name}_intrinsics)
   add_library(${target_name} OBJECT ${${sources}})
 
-  if (MSVC)
+  if(MSVC)
     get_msvc_intrinsic_flag(${flag} "flag")
-  endif ()
+  endif()
 
-  if (flag)
+  if(flag)
+    separate_arguments(flag)
     target_compile_options(${target_name} PUBLIC ${flag})
-  endif ()
+  endif()
 
   target_sources(${dependent_target} PRIVATE $<TARGET_OBJECTS:${target_name}>)
 
   # Add the new lib target to the global list of aom library targets.
   list(APPEND AOM_LIB_TARGETS ${target_name})
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
-endfunction ()
+endfunction()
 
 # Adds sources in list named by $sources to $target and adds $flag to the
 # compile flags for each source file.
-function (add_intrinsics_source_to_target flag target sources)
+function(add_intrinsics_source_to_target flag target sources)
   target_sources(${target} PRIVATE ${${sources}})
-  if (MSVC)
+  if(MSVC)
     get_msvc_intrinsic_flag(${flag} "flag")
-  endif ()
-  if (flag)
-    foreach (source ${${sources}})
+  endif()
+  if(flag)
+    foreach(source ${${sources}})
       set_property(SOURCE ${source} APPEND PROPERTY COMPILE_FLAGS ${flag})
-    endforeach ()
-  endif ()
-endfunction ()
+    endforeach()
+  endif()
+endfunction()
 
 # Writes object format for the current target to the var named by $out_format,
 # or terminates the build when the object format for the current target is
 # unknown.
-function (get_asm_obj_format out_format)
-  if ("${AOM_TARGET_CPU}" STREQUAL "x86_64")
-    if ("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+function(get_asm_obj_format out_format)
+  if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+    if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
       set(objformat "macho64")
-    elseif ("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
       set(objformat "elf64")
-    elseif ("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR
-            "${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
+    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR "${AOM_TARGET_SYSTEM}"
+           STREQUAL "Windows")
       set(objformat "win64")
-    else ()
+    else()
       message(FATAL_ERROR "Unknown obj format: ${AOM_TARGET_SYSTEM}")
-    endif ()
-  elseif ("${AOM_TARGET_CPU}" STREQUAL "x86")
-    if ("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+    endif()
+  elseif("${AOM_TARGET_CPU}" STREQUAL "x86")
+    if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
       set(objformat "macho32")
-    elseif ("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
       set(objformat "elf32")
-    elseif ("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR
-            "${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
+    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR "${AOM_TARGET_SYSTEM}"
+           STREQUAL "Windows")
       set(objformat "win32")
-    else ()
+    else()
       message(FATAL_ERROR "Unknown obj format: ${AOM_TARGET_SYSTEM}")
-    endif ()
-  else ()
+    endif()
+  else()
     message(FATAL_ERROR
-            "Unknown obj format: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}")
-  endif ()
+              "Unknown obj format: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}")
+  endif()
 
   set(${out_format} ${objformat} PARENT_SCOPE)
-endfunction ()
+endfunction()
 
 # Adds library target named $lib_name for ASM files in variable named by
 # $asm_sources. Builds an output directory path from $lib_name. Links $lib_name
 # into $dependent_target. Generates a dummy C file with a dummy function to
 # ensure that all cmake generators can determine the linker language, and that
 # build tools don't complain that an object exposes no symbols.
-function (add_asm_library lib_name asm_sources dependent_target)
+function(add_asm_library lib_name asm_sources dependent_target)
+  if("${${asm_sources}}" STREQUAL "")
+    return()
+  endif()
   set(asm_lib_obj_dir "${AOM_CONFIG_DIR}/asm_objects/${lib_name}")
-  if (NOT EXISTS "${asm_lib_obj_dir}")
+  if(NOT EXISTS "${asm_lib_obj_dir}")
     file(MAKE_DIRECTORY "${asm_lib_obj_dir}")
-  endif ()
+  endif()
 
   # TODO(tomfinegan): If cmake ever allows addition of .o files to OBJECT lib
   # targets, make this OBJECT instead of STATIC to hide the target from
   # consumers of the AOM cmake build.
   add_library(${lib_name} STATIC ${${asm_sources}})
 
-  foreach (asm_source ${${asm_sources}})
+  foreach(asm_source ${${asm_sources}})
     get_filename_component(asm_source_name "${asm_source}" NAME)
     set(asm_object "${asm_lib_obj_dir}/${asm_source_name}.o")
     add_custom_command(OUTPUT "${asm_object}"
-                       COMMAND ${AS_EXECUTABLE}
-                       ARGS ${AOM_AS_FLAGS}
-                            -I${AOM_ROOT}/ -I${AOM_CONFIG_DIR}/
-                            -o "${asm_object}" "${asm_source}"
+                       COMMAND ${AS_EXECUTABLE} ARGS ${AOM_AS_FLAGS}
+                               -I${AOM_ROOT}/ -I${AOM_CONFIG_DIR}/ -o
+                               "${asm_object}" "${asm_source}"
                        DEPENDS "${asm_source}"
                        COMMENT "Building ASM object ${asm_object}"
-                       WORKING_DIRECTORY "${AOM_CONFIG_DIR}"
-                       VERBATIM)
+                       WORKING_DIRECTORY "${AOM_CONFIG_DIR}" VERBATIM)
     target_sources(aom PRIVATE "${asm_object}")
-  endforeach ()
+  endforeach()
 
   # The above created a target containing only ASM sources. Cmake needs help
   # here to determine the linker language. Add a dummy C file to force the
@@ -144,121 +152,61 @@ function (add_asm_library lib_name asm_sources dependent_target)
   # Add the new lib target to the global list of aom library targets.
   list(APPEND AOM_LIB_TARGETS ${lib_name})
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
-endfunction ()
-
-# Converts asm sources in $asm_sources using $AOM_ADS2GAS and calls
-# add_asm_library() to create a library from the converted sources. At
-# generation time the converted sources are created, and a custom rule is added
-# to ensure the sources are reconverted when the original asm source is updated.
-# See add_asm_library() for more information.
-function (add_gas_asm_library lib_name asm_sources dependent_target)
-  set(asm_converted_source_dir "${AOM_CONFIG_DIR}/asm_gas/${lib_name}")
-  if (NOT EXISTS "${asm_converted_source_dir}")
-    file(MAKE_DIRECTORY "${asm_converted_source_dir}")
-  endif ()
-
-  # Create the converted version of each assembly source at generation time.
-  unset(gas_target_sources)
-  foreach (neon_asm_source ${${asm_sources}})
-    get_filename_component(output_asm_source "${neon_asm_source}" NAME)
-    set(output_asm_source "${asm_converted_source_dir}/${output_asm_source}")
-    set(output_asm_source "${output_asm_source}.${AOM_GAS_EXT}")
-    execute_process(COMMAND "${PERL_EXECUTABLE}" "${AOM_ADS2GAS}"
-                    INPUT_FILE "${neon_asm_source}"
-                    OUTPUT_FILE "${output_asm_source}")
-    list(APPEND gas_target_sources "${output_asm_source}")
-  endforeach ()
-
-  add_asm_library("${lib_name}" "gas_target_sources" "${dependent_target}")
-
-  # For each of the converted sources, create a custom rule that will regenerate
-  # the converted source when its input is touched.
-  list(LENGTH gas_target_sources num_asm_files)
-  math(EXPR num_asm_files "${num_asm_files} - 1")
-  foreach(NUM RANGE ${num_asm_files})
-    list(GET ${asm_sources} ${NUM} neon_asm_source)
-    list(GET gas_target_sources ${NUM} gas_asm_source)
-
-    # Grab only the filename for the custom command output to keep build output
-    # reasonably sane.
-    get_filename_component(neon_name "${neon_asm_source}" NAME)
-    get_filename_component(gas_name "${gas_asm_source}" NAME)
-
-    add_custom_command(
-        OUTPUT "${gas_asm_source}"
-        COMMAND ${PERL_EXECUTABLE}
-        ARGS "${AOM_ADS2GAS}" < "${neon_asm_source}" > "${gas_asm_source}"
-        DEPENDS "${neon_asm_source}"
-        COMMENT "ads2gas conversion ${neon_name} -> ${gas_name}"
-        WORKING_DIRECTORY "${AOM_CONFIG_DIR}"
-        VERBATIM)
-  endforeach ()
-
-  # Update the sources list passed in to include the converted asm source files.
-  list(APPEND asm_sources ${gas_target_sources})
-  set(${asm_sources} ${${asm_sources}} PARENT_SCOPE)
-endfunction ()
+endfunction()
 
 # Terminates generation if nasm found in PATH does not meet requirements.
 # Currently checks only for presence of required object formats and support for
 # the -Ox argument (multipass optimization).
-function (test_nasm)
-  execute_process(COMMAND ${AS_EXECUTABLE} -hf
-                  OUTPUT_VARIABLE nasm_helptext)
+function(test_nasm)
+  execute_process(COMMAND ${AS_EXECUTABLE} -hf OUTPUT_VARIABLE nasm_helptext)
 
-  if (NOT "${nasm_helptext}" MATCHES "-Ox")
+  if(NOT "${nasm_helptext}" MATCHES "-Ox")
     message(FATAL_ERROR
-            "Unsupported nasm: multipass optimization not supported.")
-  endif ()
+              "Unsupported nasm: multipass optimization not supported.")
+  endif()
 
-  if ("${AOM_TARGET_CPU}" STREQUAL "x86")
-    if ("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
-      if (NOT "${nasm_helptext}" MATCHES "macho32")
+  if("${AOM_TARGET_CPU}" STREQUAL "x86")
+    if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+      if(NOT "${nasm_helptext}" MATCHES "macho32")
         message(FATAL_ERROR
-                "Unsupported nasm: macho32 object format not supported.")
-      endif ()
-    elseif ("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
-      if (NOT "${nasm_helptext}" MATCHES "elf32")
+                  "Unsupported nasm: macho32 object format not supported.")
+      endif()
+    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+      if(NOT "${nasm_helptext}" MATCHES "elf32")
         message(FATAL_ERROR
-                "Unsupported nasm: elf32 object format not supported.")
-      endif ()
-    endif ()
-  else ()
-    if ("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
-      if (NOT "${nasm_helptext}" MATCHES "macho64")
+                  "Unsupported nasm: elf32 object format not supported.")
+      endif()
+    endif()
+  else()
+    if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+      if(NOT "${nasm_helptext}" MATCHES "macho64")
         message(FATAL_ERROR
-                "Unsupported nasm: macho64 object format not supported.")
-      endif ()
-    elseif ("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
-      if (NOT "${nasm_helptext}" MATCHES "elf64")
+                  "Unsupported nasm: macho64 object format not supported.")
+      endif()
+    elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+      if(NOT "${nasm_helptext}" MATCHES "elf64")
         message(FATAL_ERROR
-                "Unsupported nasm: elf64 object format not supported.")
-      endif ()
-    endif ()
-  endif ()
-endfunction ()
+                  "Unsupported nasm: elf64 object format not supported.")
+      endif()
+    endif()
+  endif()
+endfunction()
 
 # Adds build command for generation of rtcd C source files using
 # build/make/rtcd.pl. $config is the input perl file, $output is the output C
 # include file, $source is the C source file, and $symbol is used for the symbol
 # argument passed to rtcd.pl.
-function (add_rtcd_build_step config output source symbol)
+function(add_rtcd_build_step config output source symbol)
   add_custom_command(
     OUTPUT ${output}
-    COMMAND ${PERL_EXECUTABLE}
-    ARGS "${AOM_ROOT}/build/make/rtcd.pl"
-      --arch=${AOM_TARGET_CPU}
-      --sym=${symbol}
-      ${AOM_RTCD_FLAGS}
-      --config=${AOM_CONFIG_DIR}/${AOM_TARGET_CPU}_rtcd_config.rtcd
-      ${config}
-      > ${output}
+    COMMAND ${PERL_EXECUTABLE} ARGS "${AOM_ROOT}/build/make/rtcd.pl"
+            --arch=${AOM_TARGET_CPU}
+            --sym=${symbol} ${AOM_RTCD_FLAGS}
+            --config=${AOM_CONFIG_DIR}/${AOM_TARGET_CPU}_rtcd_config.rtcd
+            ${config} > ${output}
     DEPENDS ${config}
     COMMENT "Generating ${output}"
-    WORKING_DIRECTORY ${AOM_CONFIG_DIR}
-    VERBATIM)
+    WORKING_DIRECTORY ${AOM_CONFIG_DIR} VERBATIM)
   set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output})
   set_property(SOURCE ${output} PROPERTY GENERATED)
-endfunction ()
-
-endif ()  # AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_
+endfunction()
diff --git a/third_party/aom/build/cmake/compiler_flags.cmake b/third_party/aom/build/cmake/compiler_flags.cmake
index aa34b83ae..79192c1fa 100644
--- a/third_party/aom/build/cmake/compiler_flags.cmake
+++ b/third_party/aom/build/cmake/compiler_flags.cmake
@@ -1,14 +1,16 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_)
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_
 set(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_ 1)
 
 include(CheckCCompilerFlag)
@@ -27,302 +29,301 @@ set(AOM_FAILED_CXX_FLAGS)
 # otherwise.
 function(is_flag_present flag_cache flag out_is_present)
   string(FIND "${${flag_cache}}" "${flag}" flag_pos)
-  if (${flag_pos} EQUAL -1)
+  if(${flag_pos} EQUAL -1)
     set(${out_is_present} NO PARENT_SCOPE)
-  else ()
+  else()
     set(${out_is_present} YES PARENT_SCOPE)
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 # Appends $flag to $flags. Ignores scope via use of FORCE with set() call.
-function (append_flag flags flag)
+function(append_flag flags flag)
   string(FIND "${${flags}}" "${flag}" found)
-  if (${found} EQUAL -1)
+  if(${found} EQUAL -1)
     set(${flags} "${${flags}} ${flag}" CACHE STRING "" FORCE)
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 # Checks C compiler for support of $c_flag. Adds $c_flag to all
 # $CMAKE_C_FLAGS_<CONFIG>s stored in AOM_C_CONFIGS when the compile test passes.
 # Caches $c_flag in $AOM_C_FLAGS or $AOM_FAILED_C_FLAGS depending on test
 # outcome.
-function (add_c_flag_if_supported c_flag)
-  if (DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+function(add_c_flag_if_supported c_flag)
+  if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
     return()
-  endif ()
+  endif()
 
   is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok)
   is_flag_present(AOM_FAILED_C_FLAGS "${c_flag}" flag_failed)
-  if (${flag_ok} OR ${flag_failed})
-    return ()
-  endif ()
+  if(${flag_ok} OR ${flag_failed})
+    return()
+  endif()
 
   unset(C_FLAG_SUPPORTED CACHE)
   message("Checking C compiler flag support for: " ${c_flag})
   check_c_compiler_flag("${c_flag}" C_FLAG_SUPPORTED)
 
-  if (${C_FLAG_SUPPORTED})
+  if(${C_FLAG_SUPPORTED})
     append_flag(AOM_C_FLAGS "${c_flag}")
-    foreach (config ${AOM_C_CONFIGS})
+    foreach(config ${AOM_C_CONFIGS})
       unset(C_FLAG_FOUND)
       append_flag("${config}" "${c_flag}")
-    endforeach ()
-  else ()
+    endforeach()
+  else()
     append_flag(AOM_FAILED_C_FLAGS "${c_flag}")
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 # Checks C++ compiler for support of $cxx_flag. Adds $cxx_flag to all
 # $CMAKE_CXX_FLAGS_<CONFIG>s stored in AOM_CXX_CONFIGS when the compile test
-# passes.
-# Caches $cxx_flag in $AOM_CXX_FLAGS or $AOM_FAILED_CXX_FLAGS depending on test
-# outcome.
-function (add_cxx_flag_if_supported cxx_flag)
-  if (DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+# passes. Caches $cxx_flag in $AOM_CXX_FLAGS or $AOM_FAILED_CXX_FLAGS depending
+# on test outcome.
+function(add_cxx_flag_if_supported cxx_flag)
+  if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
     return()
-  endif ()
+  endif()
 
   is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok)
   is_flag_present(AOM_FAILED_CXX_FLAGS "${cxx_flag}" flag_failed)
-  if (${flag_ok} OR ${flag_failed})
-    return ()
-  endif ()
+  if(${flag_ok} OR ${flag_failed})
+    return()
+  endif()
 
   unset(CXX_FLAG_SUPPORTED CACHE)
   message("Checking C++ compiler flag support for: " ${cxx_flag})
   check_cxx_compiler_flag("${cxx_flag}" CXX_FLAG_SUPPORTED)
 
-  if (${CXX_FLAG_SUPPORTED})
+  if(${CXX_FLAG_SUPPORTED})
     append_flag(AOM_CXX_FLAGS "${cxx_flag}")
-    foreach (config ${AOM_CXX_CONFIGS})
+    foreach(config ${AOM_CXX_CONFIGS})
       unset(CXX_FLAG_FOUND)
       append_flag("${config}" "${cxx_flag}")
-    endforeach ()
-  else ()
+    endforeach()
+  else()
     append_flag(AOM_FAILED_CXX_FLAGS "${cxx_flag}")
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 # Convenience method for adding a flag to both the C and C++ compiler command
 # lines.
-function (add_compiler_flag_if_supported flag)
+function(add_compiler_flag_if_supported flag)
   add_c_flag_if_supported(${flag})
   add_cxx_flag_if_supported(${flag})
-endfunction ()
+endfunction()
 
 # Checks C compiler for support of $c_flag and terminates generation when
 # support is not present.
-function (require_c_flag c_flag update_c_flags)
-  if (DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+function(require_c_flag c_flag update_c_flags)
+  if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
     return()
-  endif ()
+  endif()
 
   is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok)
-  if (${flag_ok})
-    return ()
-  endif ()
+  if(${flag_ok})
+    return()
+  endif()
 
-  if (NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
+  if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
     aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}")
-  endif ()
+  endif()
 
   unset(HAVE_C_FLAG CACHE)
   message("Checking C compiler flag support for: " ${c_flag})
   check_c_compiler_flag("${c_flag}" HAVE_C_FLAG)
-  if (NOT HAVE_C_FLAG)
+  if(NOT HAVE_C_FLAG)
     message(FATAL_ERROR
-            "${PROJECT_NAME} requires support for C flag: ${c_flag}.")
-  endif ()
+              "${PROJECT_NAME} requires support for C flag: ${c_flag}.")
+  endif()
 
-  if (NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
+  if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
     aom_pop_var(CMAKE_EXE_LINKER_FLAGS)
-  endif ()
+  endif()
 
   append_flag(AOM_C_FLAGS "${c_flag}")
-  if (update_c_flags)
-    foreach (config ${AOM_C_CONFIGS})
+  if(update_c_flags)
+    foreach(config ${AOM_C_CONFIGS})
       set(${config} "${${config}} ${c_flag}" CACHE STRING "" FORCE)
-    endforeach ()
-  endif ()
-endfunction ()
+    endforeach()
+  endif()
+endfunction()
 
 # Checks CXX compiler for support of $cxx_flag and terminates generation when
 # support is not present.
-function (require_cxx_flag cxx_flag update_cxx_flags)
-  if (DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+function(require_cxx_flag cxx_flag update_cxx_flags)
+  if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
     return()
-  endif ()
+  endif()
 
   is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok)
-  if (${flag_ok})
-    return ()
-  endif ()
+  if(${flag_ok})
+    return()
+  endif()
 
-  if (NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
+  if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
     aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}")
-  endif ()
+  endif()
 
   unset(HAVE_CXX_FLAG CACHE)
   message("Checking C compiler flag support for: " ${cxx_flag})
   check_cxx_compiler_flag("${cxx_flag}" HAVE_CXX_FLAG)
-  if (NOT HAVE_CXX_FLAG)
+  if(NOT HAVE_CXX_FLAG)
     message(FATAL_ERROR
-            "${PROJECT_NAME} requires support for C flag: ${cxx_flag}.")
-  endif ()
+              "${PROJECT_NAME} requires support for C flag: ${cxx_flag}.")
+  endif()
 
-  if (NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
+  if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
     aom_pop_var(CMAKE_EXE_LINKER_FLAGS)
-  endif ()
+  endif()
 
   append_flag(AOM_CXX_FLAGS "${cxx_flag}")
-  if (update_cxx_flags)
-    foreach (config ${AOM_CXX_CONFIGS})
+  if(update_cxx_flags)
+    foreach(config ${AOM_CXX_CONFIGS})
       set(${config} "${${config}} ${cxx_flag}" CACHE STRING "" FORCE)
-    endforeach ()
-  endif ()
-endfunction ()
+    endforeach()
+  endif()
+endfunction()
 
 # Checks for support of $flag by both the C and CXX compilers. Terminates
 # generation when support is not present in both compilers.
-function (require_compiler_flag flag update_cmake_flags)
+function(require_compiler_flag flag update_cmake_flags)
   require_c_flag(${flag} ${update_cmake_flags})
   require_cxx_flag(${flag} ${update_cmake_flags})
-endfunction ()
+endfunction()
 
 # Checks only non-MSVC targets for support of $c_flag and terminates generation
 # when support is not present.
-function (require_c_flag_nomsvc c_flag update_c_flags)
-  if (NOT MSVC)
+function(require_c_flag_nomsvc c_flag update_c_flags)
+  if(NOT MSVC)
     require_c_flag(${c_flag} ${update_c_flags})
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 # Checks only non-MSVC targets for support of $cxx_flag and terminates
 # generation when support is not present.
-function (require_cxx_flag_nomsvc cxx_flag update_cxx_flags)
-  if (NOT MSVC)
+function(require_cxx_flag_nomsvc cxx_flag update_cxx_flags)
+  if(NOT MSVC)
     require_cxx_flag(${cxx_flag} ${update_cxx_flags})
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 # Checks only non-MSVC targets for support of $flag by both the C and CXX
 # compilers. Terminates generation when support is not present in both
 # compilers.
-function (require_compiler_flag_nomsvc flag update_cmake_flags)
+function(require_compiler_flag_nomsvc flag update_cmake_flags)
   require_c_flag_nomsvc(${flag} ${update_cmake_flags})
   require_cxx_flag_nomsvc(${flag} ${update_cmake_flags})
-endfunction ()
+endfunction()
 
 # Adds $preproc_def to C compiler command line (as -D$preproc_def) if not
 # already present.
-function (add_c_preproc_definition preproc_def)
+function(add_c_preproc_definition preproc_def)
   set(preproc_def "-D${preproc_def}")
   is_flag_present(AOM_C_FLAGS "${preproc_def}" flag_cached)
-  if (${flag_cached})
-    return ()
-  endif ()
+  if(${flag_cached})
+    return()
+  endif()
 
-  foreach (config ${AOM_C_CONFIGS})
+  foreach(config ${AOM_C_CONFIGS})
     set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE)
-  endforeach ()
-endfunction ()
+  endforeach()
+endfunction()
 
 # Adds $preproc_def to CXX compiler command line (as -D$preproc_def) if not
 # already present.
-function (add_cxx_preproc_definition preproc_def)
+function(add_cxx_preproc_definition preproc_def)
   set(preproc_def "-D${preproc_def}")
   is_flag_present(AOM_CXX_FLAGS "${preproc_def}" flag_cached)
-  if (${flag_cached})
-    return ()
-  endif ()
+  if(${flag_cached})
+    return()
+  endif()
 
-  foreach (config ${AOM_CXX_CONFIGS})
+  foreach(config ${AOM_CXX_CONFIGS})
     set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE)
-  endforeach ()
-endfunction ()
+  endforeach()
+endfunction()
 
 # Adds $preproc_def to C and CXX compiler command line (as -D$preproc_def) if
 # not already present.
-function (add_preproc_definition preproc_def)
+function(add_preproc_definition preproc_def)
   add_c_preproc_definition(${preproc_def})
   add_cxx_preproc_definition(${preproc_def})
-endfunction ()
+endfunction()
 
 # Adds $flag to assembler command line.
-function (append_as_flag flag)
+function(append_as_flag flag)
   is_flag_present(AOM_AS_FLAGS "${flag}" flag_cached)
-  if (${flag_cached})
-    return ()
-  endif ()
+  if(${flag_cached})
+    return()
+  endif()
   append_flag(AOM_AS_FLAGS "${flag}")
-endfunction ()
+endfunction()
 
 # Adds $flag to the C compiler command line.
-function (append_c_flag flag)
+function(append_c_flag flag)
   is_flag_present(AOM_C_FLAGS "${flag}" flag_cached)
-  if (${flag_cached})
-    return ()
-  endif ()
+  if(${flag_cached})
+    return()
+  endif()
 
-  foreach (config ${AOM_C_CONFIGS})
+  foreach(config ${AOM_C_CONFIGS})
     append_flag(${config} "${flag}")
-  endforeach ()
-endfunction ()
+  endforeach()
+endfunction()
 
 # Adds $flag to the CXX compiler command line.
-function (append_cxx_flag flag)
+function(append_cxx_flag flag)
   is_flag_present(AOM_CXX_FLAGS "${flag}" flag_cached)
-  if (${flag_cached})
-    return ()
-  endif ()
+  if(${flag_cached})
+    return()
+  endif()
 
-  foreach (config ${AOM_CXX_CONFIGS})
+  foreach(config ${AOM_CXX_CONFIGS})
     append_flag(${config} "${flag}")
-  endforeach ()
-endfunction ()
+  endforeach()
+endfunction()
 
 # Adds $flag to the C and CXX compiler command lines.
-function (append_compiler_flag flag)
+function(append_compiler_flag flag)
   append_c_flag(${flag})
   append_cxx_flag(${flag})
-endfunction ()
+endfunction()
 
 # Adds $flag to the executable linker command line when not present.
-function (append_exe_linker_flag flag)
+function(append_exe_linker_flag flag)
   is_flag_present(AOM_EXE_LINKER_FLAGS "${flag}" flag_cached)
-  if (${flag_cached})
+  if(${flag_cached})
     return()
-  endif ()
+  endif()
 
   append_flag(AOM_EXE_LINKER_FLAGS "${flag}")
-  foreach (config ${AOM_EXE_LINKER_CONFIGS})
+  foreach(config ${AOM_EXE_LINKER_CONFIGS})
     append_flag(${config} "${flag}")
-  endforeach ()
-endfunction ()
+  endforeach()
+endfunction()
 
 # Adds $flag to the link flags for $target.
-function (append_link_flag_to_target target flag)
+function(append_link_flag_to_target target flag)
   unset(target_link_flags)
   get_target_property(target_link_flags ${target} LINK_FLAGS)
 
-  if (target_link_flags)
+  if(target_link_flags)
     is_flag_present(target_link_flags "${flag}" flag_found)
-    if (${flag_found})
+    if(${flag_found})
       return()
-    endif ()
-    set(target_link_flags "${target_link_flags} ${flags}")
-  else ()
-    set(target_link_flags "${flags}")
-  endif ()
+    endif()
+    set(target_link_flags "${target_link_flags} ${flag}")
+  else()
+    set(target_link_flags "${flag}")
+  endif()
 
   set_target_properties(${target} PROPERTIES LINK_FLAGS ${target_link_flags})
-endfunction ()
+endfunction()
 
 # Adds $flag to executable linker flags, and makes sure C/CXX builds still work.
-function (require_linker_flag flag)
-  if (DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+function(require_linker_flag flag)
+  if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
     return()
-  endif ()
+  endif()
 
   append_exe_linker_flag(${flag})
 
@@ -331,42 +332,42 @@ function (require_linker_flag flag)
   unset(cxx_passed)
   aom_check_cxx_compiles("LINKER_FLAG_CXX_TEST(${flag})" "" cxx_passed)
 
-  if (NOT c_passed OR NOT cxx_passed)
+  if(NOT c_passed OR NOT cxx_passed)
     message(FATAL_ERROR "Linker flag test for ${flag} failed.")
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 # Appends flags in $AOM_EXTRA_<TYPE>_FLAGS variables to the flags used at build
 # time.
-function (set_user_flags)
+function(set_user_flags)
+
   # Linker flags are handled first because some C/CXX flags require that a
   # linker flag is present at link time.
-  if (AOM_EXTRA_EXE_LINKER_FLAGS)
+  if(AOM_EXTRA_EXE_LINKER_FLAGS)
     is_flag_present(AOM_EXE_LINKER_FLAGS "${AOM_EXTRA_EXE_LINKER_FLAGS}"
                     extra_present)
-    if (NOT ${extra_present})
+    if(NOT ${extra_present})
       require_linker_flag("${AOM_EXTRA_EXE_LINKER_FLAGS}")
-    endif ()
-  endif ()
-  if (AOM_EXTRA_AS_FLAGS)
+    endif()
+  endif()
+  if(AOM_EXTRA_AS_FLAGS)
+
     # TODO(tomfinegan): assembler flag testing would be a good thing to have.
     is_flag_present(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}" extra_present)
-    if (NOT ${extra_present})
+    if(NOT ${extra_present})
       append_flag(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}")
-    endif ()
-  endif ()
-  if (AOM_EXTRA_C_FLAGS)
+    endif()
+  endif()
+  if(AOM_EXTRA_C_FLAGS)
     is_flag_present(AOM_C_FLAGS "${AOM_EXTRA_C_FLAGS}" extra_present)
-    if (NOT ${extra_present})
+    if(NOT ${extra_present})
       require_c_flag("${AOM_EXTRA_C_FLAGS}" YES)
-    endif ()
-  endif ()
-  if (AOM_EXTRA_CXX_FLAGS)
+    endif()
+  endif()
+  if(AOM_EXTRA_CXX_FLAGS)
     is_flag_present(AOM_CXX_FLAGS "${AOM_EXTRA_CXX_FLAGS}" extra_present)
-    if (NOT ${extra_present})
+    if(NOT ${extra_present})
       require_cxx_flag("${AOM_EXTRA_CXX_FLAGS}" YES)
-    endif ()
-  endif ()
-endfunction ()
-
-endif ()  # AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_
+    endif()
+  endif()
+endfunction()
diff --git a/third_party/aom/build/cmake/compiler_tests.cmake b/third_party/aom/build/cmake/compiler_tests.cmake
index 8a73ec8ff..f115610ba 100644
--- a/third_party/aom/build/cmake/compiler_tests.cmake
+++ b/third_party/aom/build/cmake/compiler_tests.cmake
@@ -1,26 +1,30 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_)
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_
 set(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_ 1)
 
 include(CheckCSourceCompiles)
 include(CheckCXXSourceCompiles)
 
 # CMake passes command line flags like this:
-#   $compiler $lang_flags $lang_flags_config ...
+#
+# * $compiler $lang_flags $lang_flags_config ...
+#
 # To ensure the flags tested here and elsewhere are obeyed a list of active
 # build configuration types is built, and flags are applied to the flag strings
 # for each configuration currently active for C and CXX builds as determined by
 # reading $CMAKE_CONFIGURATION_TYPES and $CMAKE_BUILD_TYPE. When
-# $CMAKE_CONFIGURATION_TYPES is non-empty a multi-configuration generator is in
+# $CMAKE_CONFIGURATION_TYPES is non-empty a multi- configuration generator is in
 # use: currently this includes MSVC and Xcode. For other generators
 # $CMAKE_BUILD_TYPE is used. For both cases AOM_<LANG>_CONFIGS is populated with
 # CMake string variable names that contain flags for the currently available
@@ -28,19 +32,19 @@ include(CheckCXXSourceCompiles)
 unset(AOM_C_CONFIGS)
 unset(AOM_CXX_CONFIGS)
 list(LENGTH CMAKE_CONFIGURATION_TYPES num_configs)
-if (${num_configs} GREATER 0)
-  foreach (config ${CMAKE_CONFIGURATION_TYPES})
+if(${num_configs} GREATER 0)
+  foreach(config ${CMAKE_CONFIGURATION_TYPES})
     string(TOUPPER ${config} config)
     list(APPEND AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}")
     list(APPEND AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}")
     list(APPEND AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}")
-  endforeach ()
-else ()
+  endforeach()
+else()
   string(TOUPPER ${CMAKE_BUILD_TYPE} config)
   set(AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}")
   set(AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}")
   set(AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}")
-endif ()
+endif()
 
 # The basic main() function used in all compile tests.
 set(AOM_C_MAIN "\nint main(void) { return 0; }")
@@ -55,120 +59,117 @@ set(AOM_CXX_FAILED_TESTS)
 function(aom_push_var var new_value)
   set(SAVED_${var} ${${var}} PARENT_SCOPE)
   set(${var} "${${var}} ${new_value}" PARENT_SCOPE)
-endfunction ()
+endfunction()
 
 function(aom_pop_var var)
   set(var ${SAVED_${var}} PARENT_SCOPE)
   unset(SAVED_${var} PARENT_SCOPE)
-endfunction ()
+endfunction()
 
 # Confirms $test_source compiles and stores $test_name in one of
 # $AOM_C_PASSED_TESTS or $AOM_C_FAILED_TESTS depending on out come. When the
-# test passes $result_var is set to 1. When it fails $result_var is unset.
-# The test is not run if the test name is found in either of the passed or
-# failed test variables.
+# test passes $result_var is set to 1. When it fails $result_var is unset. The
+# test is not run if the test name is found in either of the passed or failed
+# test variables.
 function(aom_check_c_compiles test_name test_source result_var)
-  if (DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+  if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
     return()
-  endif ()
+  endif()
 
   unset(C_TEST_PASSED CACHE)
   unset(C_TEST_FAILED CACHE)
   string(FIND "${AOM_C_PASSED_TESTS}" "${test_name}" C_TEST_PASSED)
   string(FIND "${AOM_C_FAILED_TESTS}" "${test_name}" C_TEST_FAILED)
-  if (${C_TEST_PASSED} EQUAL -1 AND ${C_TEST_FAILED} EQUAL -1)
+  if(${C_TEST_PASSED} EQUAL -1 AND ${C_TEST_FAILED} EQUAL -1)
     unset(C_TEST_COMPILED CACHE)
     message("Running C compiler test: ${test_name}")
     check_c_source_compiles("${test_source} ${AOM_C_MAIN}" C_TEST_COMPILED)
     set(${result_var} ${C_TEST_COMPILED} PARENT_SCOPE)
 
-    if (C_TEST_COMPILED)
-      set(AOM_C_PASSED_TESTS "${AOM_C_PASSED_TESTS} ${test_name}" CACHE STRING
-          "" FORCE)
-    else ()
-      set(AOM_C_FAILED_TESTS "${AOM_C_FAILED_TESTS} ${test_name}" CACHE STRING
-          "" FORCE)
+    if(C_TEST_COMPILED)
+      set(AOM_C_PASSED_TESTS "${AOM_C_PASSED_TESTS} ${test_name}"
+          CACHE STRING "" FORCE)
+    else()
+      set(AOM_C_FAILED_TESTS "${AOM_C_FAILED_TESTS} ${test_name}"
+          CACHE STRING "" FORCE)
       message("C Compiler test ${test_name} failed.")
-    endif ()
-  elseif (NOT ${C_TEST_PASSED} EQUAL -1)
+    endif()
+  elseif(NOT ${C_TEST_PASSED} EQUAL -1)
     set(${result_var} 1 PARENT_SCOPE)
-  else ()  # ${C_TEST_FAILED} NOT EQUAL -1
+  else() # ${C_TEST_FAILED} NOT EQUAL -1
     unset(${result_var} PARENT_SCOPE)
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 # Confirms $test_source compiles and stores $test_name in one of
 # $AOM_CXX_PASSED_TESTS or $AOM_CXX_FAILED_TESTS depending on out come. When the
-# test passes $result_var is set to 1. When it fails $result_var is unset.
-# The test is not run if the test name is found in either of the passed or
-# failed test variables.
+# test passes $result_var is set to 1. When it fails $result_var is unset. The
+# test is not run if the test name is found in either of the passed or failed
+# test variables.
 function(aom_check_cxx_compiles test_name test_source result_var)
-  if (DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+  if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
     return()
-  endif ()
+  endif()
 
   unset(CXX_TEST_PASSED CACHE)
   unset(CXX_TEST_FAILED CACHE)
   string(FIND "${AOM_CXX_PASSED_TESTS}" "${test_name}" CXX_TEST_PASSED)
   string(FIND "${AOM_CXX_FAILED_TESTS}" "${test_name}" CXX_TEST_FAILED)
-  if (${CXX_TEST_PASSED} EQUAL -1 AND ${CXX_TEST_FAILED} EQUAL -1)
+  if(${CXX_TEST_PASSED} EQUAL -1 AND ${CXX_TEST_FAILED} EQUAL -1)
     unset(CXX_TEST_COMPILED CACHE)
     message("Running CXX compiler test: ${test_name}")
     check_cxx_source_compiles("${test_source} ${AOM_CXX_MAIN}"
                               CXX_TEST_COMPILED)
     set(${result_var} ${CXX_TEST_COMPILED} PARENT_SCOPE)
 
-    if (CXX_TEST_COMPILED)
-      set(AOM_CXX_PASSED_TESTS "${AOM_CXX_PASSED_TESTS} ${test_name}" CACHE
-          STRING "" FORCE)
-    else ()
-      set(AOM_CXX_FAILED_TESTS "${AOM_CXX_FAILED_TESTS} ${test_name}" CACHE
-          STRING "" FORCE)
+    if(CXX_TEST_COMPILED)
+      set(AOM_CXX_PASSED_TESTS "${AOM_CXX_PASSED_TESTS} ${test_name}"
+          CACHE STRING "" FORCE)
+    else()
+      set(AOM_CXX_FAILED_TESTS "${AOM_CXX_FAILED_TESTS} ${test_name}"
+          CACHE STRING "" FORCE)
       message("CXX Compiler test ${test_name} failed.")
-    endif ()
-  elseif (NOT ${CXX_TEST_PASSED} EQUAL -1)
+    endif()
+  elseif(NOT ${CXX_TEST_PASSED} EQUAL -1)
     set(${result_var} 1 PARENT_SCOPE)
-  else ()  # ${CXX_TEST_FAILED} NOT EQUAL -1
+  else() # ${CXX_TEST_FAILED} NOT EQUAL -1
     unset(${result_var} PARENT_SCOPE)
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 # Convenience function that confirms $test_source compiles as C and C++.
 # $result_var is set to 1 when both tests are successful, and 0 when one or both
-# tests fail.
-# Note: This function is intended to be used to write to result variables that
-# are expanded via configure_file(). $result_var is set to 1 or 0 to allow
-# direct usage of the value in generated source files.
+# tests fail. Note: This function is intended to be used to write to result
+# variables that are expanded via configure_file(). $result_var is set to 1 or 0
+# to allow direct usage of the value in generated source files.
 function(aom_check_source_compiles test_name test_source result_var)
   unset(C_PASSED)
   unset(CXX_PASSED)
   aom_check_c_compiles(${test_name} ${test_source} C_PASSED)
   aom_check_cxx_compiles(${test_name} ${test_source} CXX_PASSED)
-  if (C_PASSED AND CXX_PASSED)
+  if(C_PASSED AND CXX_PASSED)
     set(${result_var} 1 PARENT_SCOPE)
-  else ()
+  else()
     set(${result_var} 0 PARENT_SCOPE)
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 # When inline support is detected for the current compiler the supported
 # inlining keyword is written to $result in caller scope.
-function (aom_get_inline result)
+function(aom_get_inline result)
   aom_check_source_compiles("inline_check_1"
                             "static inline void function(void) {}"
                             HAVE_INLINE_1)
-  if (HAVE_INLINE_1 EQUAL 1)
+  if(HAVE_INLINE_1 EQUAL 1)
     set(${result} "inline" PARENT_SCOPE)
     return()
-  endif ()
+  endif()
 
   # Check __inline.
   aom_check_source_compiles("inline_check_2"
                             "static __inline void function(void) {}"
                             HAVE_INLINE_2)
-  if (HAVE_INLINE_2 EQUAL 1)
+  if(HAVE_INLINE_2 EQUAL 1)
     set(${result} "__inline" PARENT_SCOPE)
-  endif ()
-endfunction ()
-
-endif ()  # AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_
+  endif()
+endfunction()
diff --git a/third_party/aom/build/cmake/cpu.cmake b/third_party/aom/build/cmake/cpu.cmake
index 5d0b1a6e8..6f866d04d 100644
--- a/third_party/aom/build/cmake/cpu.cmake
+++ b/third_party/aom/build/cmake/cpu.cmake
@@ -1,72 +1,99 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if ("${AOM_TARGET_CPU}" STREQUAL "arm64")
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+
+if("${AOM_TARGET_CPU}" STREQUAL "arm64")
   set(ARCH_ARM 1)
-  set(HAVE_NEON 1)
   set(RTCD_ARCH_ARM "yes")
-  set(RTCD_HAVE_NEON "yes")
-elseif ("${AOM_TARGET_CPU}" MATCHES "^armv7")
+
+  if(ENABLE_NEON)
+    set(HAVE_NEON 1)
+    set(RTCD_HAVE_NEON "yes")
+  else()
+    set(HAVE_NEON 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon)
+  endif()
+elseif("${AOM_TARGET_CPU}" MATCHES "^armv7")
   set(ARCH_ARM 1)
-  set(HAVE_NEON 1)
-  set(HAVE_NEON_ASM 1)
   set(RTCD_ARCH_ARM "yes")
-  set(RTCD_HAVE_NEON "yes")
-  set(RTCD_HAVE_NEON_ASM "yes")
-elseif ("${AOM_TARGET_CPU}" MATCHES "^mips")
+
+  if(ENABLE_NEON)
+    set(HAVE_NEON 1)
+    set(RTCD_HAVE_NEON "yes")
+  else()
+    set(HAVE_NEON 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon)
+  endif()
+elseif("${AOM_TARGET_CPU}" MATCHES "^mips")
   set(ARCH_MIPS 1)
+  set(RTCD_ARCH_MIPS "yes")
 
-  if ("${AOM_TARGET_CPU}" STREQUAL "mips32")
+  if("${AOM_TARGET_CPU}" STREQUAL "mips32")
     set(HAVE_MIPS32 1)
-  elseif ("${AOM_TARGET_CPU}" STREQUAL "mips64")
+    set(RTCD_HAVE_MIPS32 "yes")
+  elseif("${AOM_TARGET_CPU}" STREQUAL "mips64")
     set(HAVE_MIPS64 1)
-  endif ()
+    set(RTCD_HAVE_MIPS64 "yes")
+  endif()
 
-  set(RTCD_ARCH_MIPS "yes")
-
-  if (HAVE_DSPR2)
+  # HAVE_DSPR2 is set by mips toolchain files.
+  if(ENABLE_DSPR2 AND HAVE_DSPR2)
     set(RTCD_HAVE_DSPR2 "yes")
-  endif ()
+  else()
+    set(HAVE_DSPR2 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-dspr2)
+  endif()
 
-  if (HAVE_MSA)
+  # HAVE_MSA is set by mips toolchain files.
+  if(ENABLE_MSA AND HAVE_MSA)
     set(RTCD_HAVE_MSA "yes")
-  endif ()
-elseif ("${AOM_TARGET_CPU}" MATCHES "^x86")
-  if ("${AOM_TARGET_CPU}" STREQUAL "x86")
+  else()
+    set(HAVE_MSA 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-msa)
+  endif()
+elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
+  set(ARCH_PPC 1)
+  set(RTCD_ARCH_PPC "yes")
+
+  if(ENABLE_VSX)
+    set(HAVE_VSX 1)
+    set(RTCD_HAVE_VSX "yes")
+  else()
+    set(HAVE_VSX 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-vsx)
+  endif()
+elseif("${AOM_TARGET_CPU}" MATCHES "^x86")
+  if("${AOM_TARGET_CPU}" STREQUAL "x86")
     set(ARCH_X86 1)
     set(RTCD_ARCH_X86 "yes")
-  elseif ("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+  elseif("${AOM_TARGET_CPU}" STREQUAL "x86_64")
     set(ARCH_X86_64 1)
     set(RTCD_ARCH_X86_64 "yes")
-  endif ()
+  endif()
 
-  set(HAVE_MMX 1)
-  set(HAVE_SSE 1)
-  set(HAVE_SSE2 1)
-  set(HAVE_SSE3 1)
-  set(HAVE_SSSE3 1)
-  set(HAVE_SSE4_1 1)
-  set(HAVE_AVX 1)
-  set(HAVE_AVX2 1)
-  set(RTCD_HAVE_MMX "yes")
-  set(RTCD_HAVE_SSE "yes")
-  set(RTCD_HAVE_SSE2 "yes")
-  set(RTCD_HAVE_SSE3 "yes")
-  set(RTCD_HAVE_SSSE3 "yes")
-  set(RTCD_HAVE_SSE4_1 "yes")
-  set(RTCD_HAVE_AVX "yes")
-  set(RTCD_HAVE_AVX2 "yes")
-endif ()
+  set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;AVX;AVX2")
+  foreach(flavor ${X86_FLAVORS})
+    if(ENABLE_${flavor} AND NOT disable_remaining_flavors)
+      set(HAVE_${flavor} 1)
+      set(RTCD_HAVE_${flavor} "yes")
+    else()
+      set(disable_remaining_flavors 1)
+      set(HAVE_${flavor} 0)
+      string(TOLOWER ${flavor} flavor)
+      set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor})
+    endif()
+  endforeach()
+endif()
 
-foreach (config_var ${AOM_CONFIG_VARS})
-  if (${${config_var}})
+foreach(config_var ${AOM_CONFIG_VARS})
+  if(${${config_var}})
     set(RTCD_${config_var} yes)
-  endif ()
-endforeach ()
+  endif()
+endforeach()
diff --git a/third_party/aom/build/cmake/dist.cmake b/third_party/aom/build/cmake/dist.cmake
index ad1e069fe..6f81736f0 100644
--- a/third_party/aom/build/cmake/dist.cmake
+++ b/third_party/aom/build/cmake/dist.cmake
@@ -1,63 +1,64 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
 cmake_minimum_required(VERSION 3.5)
 
 # Converts spaces in $in_string to semicolons and writes the output to
 # $out_string. In CMake's eyes this converts the input string to a list.
-function (listify_string in_string out_string)
+function(listify_string in_string out_string)
   string(REPLACE " " ";" ${out_string} ${in_string})
   set(${out_string} "${${out_string}}" PARENT_SCOPE)
-endfunction ()
+endfunction()
 
-set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_DIST_DIR" "AOM_DIST_INCLUDES"
-    "AOM_DIST_LIBS" "ENABLE_DOCS")
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_DIST_DIR"
+    "AOM_DIST_INCLUDES" "AOM_DIST_LIBS" "ENABLE_DOCS")
 
-foreach (arg ${REQUIRED_ARGS})
-  if ("${${arg}}" STREQUAL "")
+foreach(arg ${REQUIRED_ARGS})
+  if("${${arg}}" STREQUAL "")
     message(FATAL_ERROR "${arg} must not be empty.")
-  endif ()
-endforeach ()
+  endif()
+endforeach()
 
-if (ENABLE_DOCS)
+if(ENABLE_DOCS)
   file(INSTALL "${AOM_CONFIG_DIR}/docs" DESTINATION "${AOM_DIST_DIR}")
-endif ()
+endif()
 
-if (AOM_DIST_EXAMPLES)
+if(AOM_DIST_EXAMPLES)
   listify_string("${AOM_DIST_EXAMPLES}" "AOM_DIST_EXAMPLES")
-  foreach (example ${AOM_DIST_EXAMPLES})
-    if (NOT "${example}" MATCHES "aomdec\|aomenc")
+  foreach(example ${AOM_DIST_EXAMPLES})
+    if(NOT "${example}" MATCHES "aomdec\|aomenc")
       file(INSTALL "${example}" DESTINATION "${AOM_DIST_DIR}/bin/examples")
-    endif ()
-  endforeach ()
-endif ()
+    endif()
+  endforeach()
+endif()
 
-if (AOM_DIST_TOOLS)
-  foreach (tool ${AOM_DIST_TOOLS})
+if(AOM_DIST_TOOLS)
+  listify_string("${AOM_DIST_TOOLS}" "AOM_DIST_TOOLS")
+  foreach(tool ${AOM_DIST_TOOLS})
     file(INSTALL "${tool}" DESTINATION "${AOM_DIST_DIR}/bin/tools")
-  endforeach ()
-endif ()
+  endforeach()
+endif()
 
-if (AOM_DIST_APPS)
+if(AOM_DIST_APPS)
   listify_string("${AOM_DIST_APPS}" "AOM_DIST_APPS")
-  foreach (app ${AOM_DIST_APPS})
+  foreach(app ${AOM_DIST_APPS})
     file(INSTALL "${app}" DESTINATION "${AOM_DIST_DIR}/bin")
-  endforeach ()
-endif ()
+  endforeach()
+endif()
 
 listify_string("${AOM_DIST_INCLUDES}" "AOM_DIST_INCLUDES")
-foreach (inc ${AOM_DIST_INCLUDES})
+foreach(inc ${AOM_DIST_INCLUDES})
   file(INSTALL "${inc}" DESTINATION "${AOM_DIST_DIR}/include/aom")
-endforeach ()
+endforeach()
 
 listify_string("${AOM_DIST_LIBS}" "AOM_DIST_LIBS")
-foreach (lib ${AOM_DIST_LIBS})
+foreach(lib ${AOM_DIST_LIBS})
   file(INSTALL "${lib}" DESTINATION "${AOM_DIST_DIR}/lib")
-endforeach ()
+endforeach()
diff --git a/third_party/aom/build/cmake/exports.cmake b/third_party/aom/build/cmake/exports.cmake
index 8153aad98..5abfc9a5f 100644
--- a/third_party/aom/build/cmake/exports.cmake
+++ b/third_party/aom/build/cmake/exports.cmake
@@ -1,42 +1,41 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_EXPORTS_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_EXPORTS_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_EXPORTS_CMAKE_
 set(AOM_BUILD_CMAKE_EXPORTS_CMAKE_ 1)
 
 include("${AOM_ROOT}/build/cmake/exports_sources.cmake")
 
 # Creates the custom target which handles generation of the symbol export lists.
-function (setup_exports_target)
-  if ("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+function(setup_exports_target)
+  if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
     set(symbol_file_ext "syms")
-  elseif ("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND MSVC)
+  elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND MSVC)
     set(symbol_file_ext "def")
-  else ()
+  else()
     set(symbol_file_ext "ver")
-  endif ()
+  endif()
 
   set(aom_sym_file "${AOM_CONFIG_DIR}/libaom.${symbol_file_ext}")
 
   add_custom_target(generate_exports
-                    COMMAND ${CMAKE_COMMAND}
-                      -DAOM_ROOT="${AOM_ROOT}"
-                      -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
-                      -DAOM_TARGET_SYSTEM=${AOM_TARGET_SYSTEM}
-                      -DAOM_SYM_FILE="${aom_sym_file}"
-                      -DAOM_MSVC=${MSVC}
-                      -DAOM_XCODE=${XCODE}
-                      -DCONFIG_NAME=$<CONFIG>
-                      -DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER}
-                      -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER}
-                      -P "${AOM_ROOT}/build/cmake/generate_exports.cmake"
+                    COMMAND ${CMAKE_COMMAND} -DAOM_ROOT="${AOM_ROOT}"
+                            -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
+                            -DAOM_TARGET_SYSTEM=${AOM_TARGET_SYSTEM}
+                            -DAOM_SYM_FILE="${aom_sym_file}" -DAOM_MSVC=${MSVC}
+                            -DAOM_XCODE=${XCODE} -DCONFIG_NAME=$<CONFIG>
+                            -DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER}
+                            -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER} -P
+                            "${AOM_ROOT}/build/cmake/generate_exports.cmake"
                     SOURCES ${AOM_EXPORTS_SOURCES}
                     DEPENDS ${AOM_EXPORTS_SOURCES})
 
@@ -44,22 +43,20 @@ function (setup_exports_target)
   # creating the dylib.
   add_dependencies(aom generate_exports)
 
-  if (APPLE)
-    set_property(TARGET aom APPEND_STRING PROPERTY LINK_FLAGS
-                 "-exported_symbols_list ${aom_sym_file}")
-  elseif (WIN32)
+  if(APPLE)
+    set_property(TARGET aom APPEND_STRING
+                 PROPERTY LINK_FLAGS "-exported_symbols_list ${aom_sym_file}")
+  elseif(WIN32)
     message(FATAL_ERROR "Windows DLL builds not supported yet.")
-    if (NOT MSVC)
-      set_property(TARGET aom APPEND_STRING PROPERTY LINK_FLAGS
-                   "-Wl,--version-script ${aom_sym_file}")
-    endif ()
+    if(NOT MSVC)
+      set_property(TARGET aom APPEND_STRING
+                   PROPERTY LINK_FLAGS "-Wl,--version-script ${aom_sym_file}")
+    endif()
 
     # TODO(tomfinegan): Sort out the import lib situation and flags for MSVC.
 
-  else ()
-    set_property(TARGET aom APPEND_STRING PROPERTY LINK_FLAGS
-                 "-Wl,--version-script,${aom_sym_file}")
-  endif ()
-endfunction ()
-
-endif ()  # AOM_BUILD_CMAKE_EXPORTS_CMAKE_
+  else()
+    set_property(TARGET aom APPEND_STRING
+                 PROPERTY LINK_FLAGS "-Wl,--version-script,${aom_sym_file}")
+  endif()
+endfunction()
diff --git a/third_party/aom/build/cmake/exports_sources.cmake b/third_party/aom/build/cmake/exports_sources.cmake
index 36f79ee09..48790dbaa 100644
--- a/third_party/aom/build/cmake/exports_sources.cmake
+++ b/third_party/aom/build/cmake/exports_sources.cmake
@@ -1,30 +1,26 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_
 set(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_ 1)
 
 set(AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_com")
 
-if (CONFIG_AV1_DECODER)
-  set(AOM_EXPORTS_SOURCES
-      ${AOM_EXPORTS_SOURCES}
-      "${AOM_ROOT}/aom/exports_dec"
+if(CONFIG_AV1_DECODER)
+  set(AOM_EXPORTS_SOURCES ${AOM_EXPORTS_SOURCES} "${AOM_ROOT}/aom/exports_dec"
       "${AOM_ROOT}/av1/exports_dec")
-endif ()
+endif()
 
-if (CONFIG_AV1_ENCODER)
-  set(AOM_EXPORTS_SOURCES
-      ${AOM_EXPORTS_SOURCES}
-      "${AOM_ROOT}/aom/exports_enc"
+if(CONFIG_AV1_ENCODER)
+  set(AOM_EXPORTS_SOURCES ${AOM_EXPORTS_SOURCES} "${AOM_ROOT}/aom/exports_enc"
       "${AOM_ROOT}/av1/exports_enc")
-endif ()
-
-endif ()  # AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_
+endif()
diff --git a/third_party/aom/build/cmake/generate_aom_config_templates.cmake b/third_party/aom/build/cmake/generate_aom_config_templates.cmake
index effa456fc..6ea02295c 100644
--- a/third_party/aom/build/cmake/generate_aom_config_templates.cmake
+++ b/third_party/aom/build/cmake/generate_aom_config_templates.cmake
@@ -1,18 +1,19 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
 cmake_minimum_required(VERSION 3.5)
 
 string(TIMESTAMP year "%Y")
-set(asm_file_header_block
-"\;
+set(
+  asm_file_header_block
+  "\;
 \; Copyright (c) ${year}, Alliance for Open Media. All rights reserved
 \;
 \; This source code is subject to the terms of the BSD 2 Clause License and
@@ -22,9 +23,11 @@ set(asm_file_header_block
 \; Media Patent License 1.0 was not distributed with this source code in the
 \; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 \;
-")
-set(h_file_header_block
-"/*
+"
+  )
+set(
+  h_file_header_block
+  "/*
  * Copyright (c) ${year}, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
@@ -36,9 +39,11 @@ set(h_file_header_block
  */
 \#ifndef AOM_CONFIG_H_
 \#define AOM_CONFIG_H_
-")
-set(cmake_file_header_block
-"##
+"
+  )
+set(
+  cmake_file_header_block
+  "##
 ## Copyright (c) ${year}, Alliance for Open Media. All rights reserved
 ##
 ## This source code is subject to the terms of the BSD 2 Clause License and
@@ -48,59 +53,58 @@ set(cmake_file_header_block
 ## Media Patent License 1.0 was not distributed with this source code in the
 ## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ##
-")
+"
+  )
 
 # Terminates cmake execution when $var_name is an empty string, or the variable
 # name it contains does not expand to an existing directory.
-function (check_directory_var var_name)
-  if ("${var_name}" STREQUAL "")
+function(check_directory_var var_name)
+  if("${var_name}" STREQUAL "")
     message(FATAL_ERROR "The CMake variable ${var_name} must be defined.")
-  endif ()
+  endif()
 
-  if (NOT EXISTS "${${var_name}}")
+  if(NOT EXISTS "${${var_name}}")
     message(FATAL_ERROR "${${var_name}} (${var_name}) missing.")
-  endif ()
-endfunction ()
+  endif()
+endfunction()
 
 check_directory_var(AOM_CONFIG_DIR)
 check_directory_var(AOM_ROOT)
 
 set(AOM_DEFAULTS "${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
-if (NOT EXISTS "${AOM_DEFAULTS}")
+if(NOT EXISTS "${AOM_DEFAULTS}")
   message(FATAL_ERROR
-          "Configuration default values file (${AOM_DEFAULTS}) missing.")
-endif ()
+            "Configuration default values file (${AOM_DEFAULTS}) missing.")
+endif()
 
 include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
 get_cmake_property(cmake_cache_vars CACHE_VARIABLES)
 
-set(aom_config_h_template "${AOM_CONFIG_DIR}/aom_config.h.cmake")
+set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake")
 file(WRITE "${aom_config_h_template}" ${h_file_header_block})
 foreach(cache_var ${cmake_cache_vars})
-  if (NOT "${cache_var}" MATCHES "AOM_CONFIG_DIR\|AOM_ROOT\|^CMAKE_")
-    file(APPEND
-         "${aom_config_h_template}" "\#define ${cache_var} \${${cache_var}}\n")
-  endif ()
+  if(NOT "${cache_var}" MATCHES "AOM_CONFIG_DIR\|AOM_ROOT\|^CMAKE_")
+    file(APPEND "${aom_config_h_template}"
+                "\#define ${cache_var} \${${cache_var}}\n")
+  endif()
 endforeach()
 file(APPEND "${aom_config_h_template}" "\#endif  /* AOM_CONFIG_H_ */")
 
-set(aom_asm_config_template "${AOM_CONFIG_DIR}/aom_config.asm.cmake")
+set(aom_asm_config_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake")
 file(WRITE "${aom_asm_config_template}" ${asm_file_header_block})
 foreach(cache_var ${cmake_cache_vars})
-  if (NOT "${cache_var}" MATCHES
-      "AOM_CONFIG_DIR\|AOM_ROOT\|^CMAKE_\|INLINE\|RESTRICT")
+  if(NOT "${cache_var}" MATCHES "AOM_CONFIG_DIR\|AOM_ROOT\|^CMAKE_\|INLINE")
     file(APPEND "${aom_asm_config_template}"
-         "${cache_var} equ \${${cache_var}}\n")
-  endif ()
-endforeach ()
+                "${cache_var} equ \${${cache_var}}\n")
+  endif()
+endforeach()
 
 set(aom_rtcd_config_template "${AOM_CONFIG_DIR}/rtcd_config.cmake")
 file(WRITE "${aom_rtcd_config_template}" ${cmake_file_header_block})
 foreach(cache_var ${cmake_cache_vars})
-  if (NOT "${cache_var}" MATCHES
-      "AOM_CONFIG_DIR\|AOM_ROOT\|^CMAKE_\|INLINE\|RESTRICT")
+  if(NOT "${cache_var}" MATCHES "AOM_CONFIG_DIR\|AOM_ROOT\|^CMAKE_\|INLINE")
     file(APPEND "${aom_rtcd_config_template}"
-         "${cache_var}=\${RTCD_${cache_var}}\n")
-  endif ()
-endforeach ()
+                "${cache_var}=\${RTCD_${cache_var}}\n")
+  endif()
+endforeach()
 
diff --git a/third_party/aom/build/cmake/generate_exports.cmake b/third_party/aom/build/cmake/generate_exports.cmake
index baa29828b..4dce3a671 100644
--- a/third_party/aom/build/cmake/generate_exports.cmake
+++ b/third_party/aom/build/cmake/generate_exports.cmake
@@ -1,64 +1,60 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
 cmake_minimum_required(VERSION 3.5)
 
-set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_TARGET_SYSTEM" "AOM_SYM_FILE"
-    "CONFIG_AV1_DECODER" "CONFIG_AV1_ENCODER")
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_TARGET_SYSTEM"
+    "AOM_SYM_FILE" "CONFIG_AV1_DECODER" "CONFIG_AV1_ENCODER")
 
-foreach (arg ${REQUIRED_ARGS})
-  if ("${${arg}}" STREQUAL "")
+foreach(arg ${REQUIRED_ARGS})
+  if("${${arg}}" STREQUAL "")
     message(FATAL_ERROR "${arg} must not be empty.")
-  endif ()
-endforeach ()
+  endif()
+endforeach()
 
 include("${AOM_ROOT}/build/cmake/exports_sources.cmake")
 
-if ("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
   set(symbol_prefix "_")
-elseif ("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC)
+elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC)
   set(symbol_prefix "_")
-  file(WRITE "${AOM_SYM_FILE}"
-       "LIBRARY libaom INITINSTANCE TERMINSTANCE\n"
-       "DATA MULTIPLE NONSHARED\n"
-       "EXPORTS\n")
-else ()
+  file(WRITE "${AOM_SYM_FILE}" "LIBRARY libaom INITINSTANCE TERMINSTANCE\n"
+             "DATA MULTIPLE NONSHARED\n" "EXPORTS\n")
+else()
   set(symbol_suffix ";")
-endif ()
+endif()
 
 set(aom_sym_file "${AOM_SYM_FILE}")
 
-if ("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
   file(REMOVE "${aom_sym_file}")
-elseif ("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS")
-  file(WRITE "${aom_sym_file}"
-       "LIBRARY libaom INITINSTANCE TERMINSTANCE\n"
-       "DATA MULTIPLE NONSHARED\n"
-       "EXPORTS\n")
-else ()
+elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS")
+  file(WRITE "${aom_sym_file}" "LIBRARY libaom INITINSTANCE TERMINSTANCE\n"
+             "DATA MULTIPLE NONSHARED\n" "EXPORTS\n")
+else()
   file(WRITE "${aom_sym_file}" "{ global:\n")
-endif ()
+endif()
 
-foreach (export_file ${AOM_EXPORTS_SOURCES})
+foreach(export_file ${AOM_EXPORTS_SOURCES})
   file(STRINGS "${export_file}" exported_file_data)
   set(exported_symbols "${exported_symbols} ${exported_file_data};")
   string(STRIP "${exported_symbols}" exported_symbols)
-endforeach ()
+endforeach()
 
-foreach (exported_symbol ${exported_symbols})
+foreach(exported_symbol ${exported_symbols})
   string(STRIP "${exported_symbol}" exported_symbol)
   string(REGEX REPLACE "text \|data " "" "exported_symbol" "${exported_symbol}")
   set(exported_symbol "${symbol_prefix}${exported_symbol}${symbol_suffix}")
   file(APPEND "${aom_sym_file}" "${exported_symbol}\n")
-endforeach ()
+endforeach()
 
-if ("${aom_sym_file}" MATCHES "ver$")
+if("${aom_sym_file}" MATCHES "ver$")
   file(APPEND "${aom_sym_file}" " };")
-endif ()
+endif()
diff --git a/third_party/aom/build/cmake/msvc_runtime.cmake b/third_party/aom/build/cmake/msvc_runtime.cmake
index 3da5f7df6..9e4cbea43 100644
--- a/third_party/aom/build/cmake/msvc_runtime.cmake
+++ b/third_party/aom/build/cmake/msvc_runtime.cmake
@@ -1,30 +1,37 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_
 set(AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_ 1)
 
-if (MSVC)
+if(MSVC)
+
   # CMake defaults to producing code linked to the DLL MSVC runtime. That will
   # not work with googletest, and isn't what we want anyway.
-  if (NOT "${MSVC_RUNTIME}" STREQUAL "dll")
-    foreach (flag_var
-             CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-             CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
-             CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-             CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-      if (${flag_var} MATCHES "/MD")
+  if(NOT "${MSVC_RUNTIME}" STREQUAL "dll")
+    foreach(flag_var
+            CMAKE_C_FLAGS
+            CMAKE_C_FLAGS_DEBUG
+            CMAKE_C_FLAGS_RELEASE
+            CMAKE_C_FLAGS_MINSIZEREL
+            CMAKE_C_FLAGS_RELWITHDEBINFO
+            CMAKE_CXX_FLAGS
+            CMAKE_CXX_FLAGS_DEBUG
+            CMAKE_CXX_FLAGS_RELEASE
+            CMAKE_CXX_FLAGS_MINSIZEREL
+            CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/MD")
         string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif (${flag_var} MATCHES "/MD")
-    endforeach (flag_var)
-  endif ()
-endif ()
-
-endif ()  # AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_
+      endif(${flag_var} MATCHES "/MD")
+    endforeach(flag_var)
+  endif()
+endif()
diff --git a/third_party/aom/build/cmake/pkg_config.cmake b/third_party/aom/build/cmake/pkg_config.cmake
index aee375f43..64e20214e 100644
--- a/third_party/aom/build/cmake/pkg_config.cmake
+++ b/third_party/aom/build/cmake/pkg_config.cmake
@@ -1,37 +1,36 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
 cmake_minimum_required(VERSION 3.5)
 
-set(REQUIRED_ARGS
-    "AOM_ROOT" "AOM_CONFIG_DIR" "CMAKE_INSTALL_PREFIX" "CMAKE_PROJECT_NAME"
-    "CONFIG_MULTITHREAD" "HAVE_PTHREAD_H")
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "CMAKE_INSTALL_PREFIX"
+    "CMAKE_PROJECT_NAME" "CONFIG_MULTITHREAD" "HAVE_PTHREAD_H")
 
-foreach (arg ${REQUIRED_ARGS})
-  if ("${${arg}}" STREQUAL "")
+foreach(arg ${REQUIRED_ARGS})
+  if("${${arg}}" STREQUAL "")
     message(FATAL_ERROR "${arg} must not be empty.")
-  endif ()
-endforeach ()
+  endif()
+endforeach()
 
 include("${AOM_ROOT}/build/cmake/util.cmake")
 
-extract_version_string("${AOM_CONFIG_DIR}/aom_version.h" aom_version)
+extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h" aom_version)
 
 # Create a version string suitable for comparison using the RPM version compare
 # algorithm: strip out everything after the number.
 string(FIND "${aom_version}" "-" dash_pos)
-if (${dash_pos} EQUAL -1)
+if(${dash_pos} EQUAL -1)
   set(package_version "${aom_version}")
-else ()
+else()
   string(SUBSTRING "${aom_version}" 0 ${dash_pos} package_version)
-endif ()
+endif()
 
 # Write pkg-config info.
 set(prefix "${CMAKE_INSTALL_PREFIX}")
@@ -39,19 +38,21 @@ set(pkgconfig_file "${AOM_CONFIG_DIR}/aom.pc")
 string(TOLOWER ${CMAKE_PROJECT_NAME} pkg_name)
 file(WRITE "${pkgconfig_file}" "# libaom pkg-config.\n")
 file(APPEND "${pkgconfig_file}" "prefix=${prefix}\n")
-file(APPEND "${pkgconfig_file}" "exec_prefix=${prefix}/bin\n")
-file(APPEND "${pkgconfig_file}" "libdir=${prefix}/lib\n")
-file(APPEND "${pkgconfig_file}" "includedir=${prefix}/include\n\n")
+file(APPEND "${pkgconfig_file}" "exec_prefix=\${prefix}/bin\n")
+file(APPEND "${pkgconfig_file}" "libdir=\${prefix}/lib\n")
+file(APPEND "${pkgconfig_file}" "includedir=\${prefix}/include\n\n")
 file(APPEND "${pkgconfig_file}" "Name: ${pkg_name}\n")
 file(APPEND "${pkgconfig_file}"
-     "Description: AV1 codec library v${aom_version}.\n")
+            "Description: AV1 codec library v${aom_version}.\n")
 file(APPEND "${pkgconfig_file}" "Version: ${package_version}\n")
 file(APPEND "${pkgconfig_file}" "Requires:\n")
 file(APPEND "${pkgconfig_file}" "Conflicts:\n")
-file(APPEND "${pkgconfig_file}" "Libs: -L${prefix}/lib -l${pkg_name} -lm\n")
-if (CONFIG_MULTITHREAD AND HAVE_PTHREAD_H)
+if(CONFIG_MULTITHREAD AND HAVE_PTHREAD_H)
+  file(APPEND "${pkgconfig_file}"
+              "Libs: -L\${prefix}/lib -l${pkg_name} -lm -lpthread\n")
   file(APPEND "${pkgconfig_file}" "Libs.private: -lm -lpthread\n")
-else ()
+else()
+  file(APPEND "${pkgconfig_file}" "Libs: -L\${prefix}/lib -l${pkg_name} -lm\n")
   file(APPEND "${pkgconfig_file}" "Libs.private: -lm\n")
-endif ()
-file(APPEND "${pkgconfig_file}" "Cflags: -I${prefix}/include\n")
+endif()
+file(APPEND "${pkgconfig_file}" "Cflags: -I\${prefix}/include\n")
diff --git a/third_party/aom/build/cmake/sanitizers.cmake b/third_party/aom/build/cmake/sanitizers.cmake
index 2b9067868..77708e101 100644
--- a/third_party/aom/build/cmake/sanitizers.cmake
+++ b/third_party/aom/build/cmake/sanitizers.cmake
@@ -1,19 +1,21 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_SANITIZERS_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_SANITIZERS_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_SANITIZERS_CMAKE_
 set(AOM_BUILD_CMAKE_SANITIZERS_CMAKE_ 1)
 
-if (MSVC OR NOT SANITIZE)
-  return ()
-endif ()
+if(MSVC OR NOT SANITIZE)
+  return()
+endif()
 
 include("${AOM_ROOT}/build/cmake/compiler_flags.cmake")
 
@@ -26,4 +28,11 @@ require_compiler_flag("-fsanitize=${SANITIZE}" YES)
 # Make callstacks accurate.
 require_compiler_flag("-fno-omit-frame-pointer -fno-optimize-sibling-calls" YES)
 
-endif()  # AOM_BUILD_CMAKE_SANITIZERS_CMAKE_
+# Fix link errors due to missing rt compiler lib in 32-bit builds.
+# http://llvm.org/bugs/show_bug.cgi?id=17693
+if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+  if(${CMAKE_SIZEOF_VOID_P} EQUAL 4 AND "${SANITIZE}" MATCHES
+     "integer|undefined")
+    require_linker_flag("--rtlib=compiler-rt -lgcc_s")
+  endif()
+endif()
diff --git a/third_party/aom/build/cmake/toolchains/arm-ios-common.cmake b/third_party/aom/build/cmake/toolchains/arm-ios-common.cmake
index 8929f3b1d..8f4095145 100644
--- a/third_party/aom/build/cmake/toolchains/arm-ios-common.cmake
+++ b/third_party/aom/build/cmake/toolchains/arm-ios-common.cmake
@@ -1,14 +1,16 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_ARM_IOS_COMMON_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM_IOS_COMMON_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM_IOS_COMMON_CMAKE_
 set(AOM_BUILD_CMAKE_ARM_IOS_COMMON_CMAKE_ 1)
 
 set(CMAKE_SYSTEM_NAME "Darwin")
@@ -18,14 +20,7 @@ set(CMAKE_C_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
 set(CMAKE_CXX_COMPILER clang++)
 set(CMAKE_CXX_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
 
-# Assembler sources must be converted for ARM iOS targets.
-set(AOM_ADS2GAS_REQUIRED 1)
-set(AOM_ADS2GAS "${CMAKE_CURRENT_SOURCE_DIR}/build/make/ads2gas_apple.pl")
-set(AOM_GAS_EXT "S")
-
 # No runtime cpu detect for arm*-ios targets.
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
 
 # TODO(tomfinegan): Handle bit code embedding.
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_ARM_IOS_COMMON_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/arm64-ios.cmake b/third_party/aom/build/cmake/toolchains/arm64-ios.cmake
index 434809db9..6feb1090f 100644
--- a/third_party/aom/build/cmake/toolchains/arm64-ios.cmake
+++ b/third_party/aom/build/cmake/toolchains/arm64-ios.cmake
@@ -1,24 +1,23 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_IOS_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_IOS_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_IOS_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_IOS_CMAKE_ 1)
 
-if (XCODE)
-  # TODO(tomfinegan): Handle arm builds in Xcode.
+if(XCODE) # TODO(tomfinegan): Handle arm builds in Xcode.
   message(FATAL_ERROR "This toolchain does not support Xcode.")
-endif ()
+endif()
 
 set(CMAKE_SYSTEM_PROCESSOR "arm64")
 set(CMAKE_OSX_ARCHITECTURES "arm64")
 
 include("${CMAKE_CURRENT_LIST_DIR}/arm-ios-common.cmake")
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_IOS_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake
index 303f1cf9a..590a97a8e 100644
--- a/third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake
+++ b/third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake
@@ -1,22 +1,25 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_GCC_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_GCC_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_GCC_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_GCC_CMAKE_ 1)
 
 set(CMAKE_SYSTEM_NAME "Linux")
 
-if ("${CROSS}" STREQUAL "")
+if("${CROSS}" STREQUAL "")
+
   # Default the cross compiler prefix to something known to work.
   set(CROSS aarch64-linux-gnu-)
-endif ()
+endif()
 
 set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
@@ -31,5 +34,3 @@ set(AOM_NEON_INTRIN_FLAG "")
 
 # No runtime cpu detect for arm64-linux-gcc.
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_GCC_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake
new file mode 100644
index 000000000..b5b2ff1cd
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake
@@ -0,0 +1,32 @@
+#
+# Copyright (c) 2018, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_MINGW_GCC_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_MINGW_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_MINGW_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
+set(CMAKE_SYSTEM_NAME "Windows")
+
+if("${CROSS}" STREQUAL "")
+  set(CROSS aarch64-w64-mingw32-)
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+
+# No runtime cpu detect for arm64-mingw-gcc.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+
+# Disable the use of the gtest's CMake support.
+set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/third_party/aom/build/cmake/toolchains/armv7-ios.cmake b/third_party/aom/build/cmake/toolchains/armv7-ios.cmake
index c1e72ce3d..32a1b534a 100644
--- a/third_party/aom/build/cmake/toolchains/armv7-ios.cmake
+++ b/third_party/aom/build/cmake/toolchains/armv7-ios.cmake
@@ -1,20 +1,23 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_IOS_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_IOS_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_IOS_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_IOS_CMAKE_ 1)
 
-if (XCODE)
+if(XCODE)
+
   # TODO(tomfinegan): Handle arm builds in Xcode.
   message(FATAL_ERROR "This toolchain does not support Xcode.")
-endif ()
+endif()
 
 set(CMAKE_SYSTEM_PROCESSOR "armv7")
 set(CMAKE_OSX_ARCHITECTURES "armv7")
@@ -26,5 +29,3 @@ set(AOM_NEON_INTRIN_FLAG "")
 
 # No runtime cpu detect for armv7s-ios.
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_IOS_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake
index 13a737784..7d3d63085 100644
--- a/third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake
+++ b/third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake
@@ -1,26 +1,29 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_LINUX_GCC_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_LINUX_GCC_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_LINUX_GCC_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_LINUX_GCC_CMAKE_ 1)
 
 set(CMAKE_SYSTEM_NAME "Linux")
 
-if ("${CROSS}" STREQUAL "")
+if("${CROSS}" STREQUAL "")
+
   # Default the cross compiler prefix to something known to work.
   set(CROSS arm-linux-gnueabihf-)
-endif ()
+endif()
 
-if (NOT ${CROSS} MATCHES hf-$)
+if(NOT ${CROSS} MATCHES hf-$)
   set(AOM_EXTRA_TOOLCHAIN_FLAGS "-mfloat-abi=softfp")
-endif ()
+endif()
 
 set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
@@ -29,20 +32,12 @@ set(CMAKE_C_COMPILER_ARG1
     "-march=armv7-a -mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
 set(CMAKE_CXX_COMPILER_ARG1
     "-march=armv7-a -mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
-set(AOM_AS_FLAGS
-    --defsym ARCHITECTURE=7 -march=armv7-a -mfpu=neon
+set(AOM_AS_FLAGS --defsym ARCHITECTURE=7 -march=armv7-a -mfpu=neon
     ${AOM_EXTRA_TOOLCHAIN_FLAGS})
 set(CMAKE_SYSTEM_PROCESSOR "armv7")
 
 # No intrinsics flag required for armv7-linux-gcc.
 set(AOM_NEON_INTRIN_FLAG "")
 
-# Assembler sources must be converted for armv7-linux-gcc targets.
-set(AOM_ADS2GAS_REQUIRED 1)
-set(AOM_ADS2GAS "${CMAKE_CURRENT_SOURCE_DIR}/build/make/ads2gas.pl")
-set(AOM_GAS_EXT "S")
-
 # No runtime cpu detect for armv7-linux-gcc.
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_LINUX_GCC_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake
new file mode 100644
index 000000000..cf06a11b3
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake
@@ -0,0 +1,32 @@
+#
+# Copyright (c) 2018, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_MINGW_GCC_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_MINGW_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_MINGW_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+set(CMAKE_SYSTEM_NAME "Windows")
+
+if("${CROSS}" STREQUAL "")
+  set(CROSS armv7-w64-mingw32-)
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+
+# No runtime cpu detect for armv7-mingw-gcc.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+
+# Disable the use of the gtest's CMake support.
+set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/third_party/aom/build/cmake/toolchains/armv7s-ios.cmake b/third_party/aom/build/cmake/toolchains/armv7s-ios.cmake
index bdb627c78..0940a6ee8 100644
--- a/third_party/aom/build/cmake/toolchains/armv7s-ios.cmake
+++ b/third_party/aom/build/cmake/toolchains/armv7s-ios.cmake
@@ -1,20 +1,23 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7S_IOS_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7S_IOS_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7S_IOS_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7S_IOS_CMAKE_ 1)
 
-if (XCODE)
+if(XCODE)
+
   # TODO(tomfinegan): Handle arm builds in Xcode.
   message(FATAL_ERROR "This toolchain does not support Xcode.")
-endif ()
+endif()
 
 set(CMAKE_SYSTEM_PROCESSOR "armv7s")
 set(CMAKE_OSX_ARCHITECTURES "armv7s")
@@ -26,5 +29,3 @@ set(AOM_NEON_INTRIN_FLAG "")
 
 # No runtime cpu detect for armv7s-ios.
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7S_IOS_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake b/third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake
index 7a28e329c..76e0bd140 100644
--- a/third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake
+++ b/third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake
@@ -1,14 +1,16 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_IOS_SIMULATOR_COMMON_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_IOS_SIMULATOR_COMMON_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_IOS_SIMULATOR_COMMON_CMAKE_
 set(AOM_BUILD_CMAKE_IOS_SIMULATOR_COMMON_CMAKE_ 1)
 
 set(CMAKE_SYSTEM_NAME "Darwin")
@@ -19,5 +21,3 @@ set(CMAKE_CXX_COMPILER clang++)
 set(CMAKE_CXX_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
 
 # TODO(tomfinegan): Handle bit code embedding.
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_IOS_SIMULATOR_COMMON_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/mips32-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/mips32-linux-gcc.cmake
index b452e368f..0f93490b1 100644
--- a/third_party/aom/build/cmake/toolchains/mips32-linux-gcc.cmake
+++ b/third_party/aom/build/cmake/toolchains/mips32-linux-gcc.cmake
@@ -1,62 +1,66 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_MIPS32_LINUX_GCC_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS32_LINUX_GCC_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_MIPS32_LINUX_GCC_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS32_LINUX_GCC_CMAKE_ 1)
 
 set(CMAKE_SYSTEM_NAME "Linux")
 
-if (ENABLE_DSPR2 AND ENABLE_MSA)
+if(ENABLE_DSPR2 AND ENABLE_MSA)
   message(FATAL_ERROR "ENABLE_DSPR2 and ENABLE_MSA cannot be combined.")
-endif ()
+endif()
 
-if (ENABLE_DSPR2)
+if(ENABLE_DSPR2)
   set(HAVE_DSPR2 1 CACHE BOOL "" FORCE)
 
-  if ("${CROSS}" STREQUAL "")
+  if("${CROSS}" STREQUAL "")
+
     # Default the cross compiler prefix to something known to work.
     set(CROSS mips-linux-gnu-)
-  endif ()
+  endif()
 
   set(MIPS_CFLAGS "-mdspr2")
   set(MIPS_CXXFLAGS "-mdspr2")
-elseif (ENABLE_MSA)
+elseif(ENABLE_MSA)
   set(HAVE_MSA 1 CACHE BOOL "" FORCE)
 
-  if ("${CROSS}" STREQUAL "")
+  if("${CROSS}" STREQUAL "")
+
     # Default the cross compiler prefix to something known to work.
     set(CROSS mips-mti-linux-gnu-)
-  endif ()
+  endif()
 
   set(MIPS_CFLAGS "-mmsa")
   set(MIPS_CXXFLAGS "-mmsa")
-endif ()
+endif()
 
-if ("${CROSS}" STREQUAL "")
-  # TODO(tomfinegan): Make it possible to turn this off. The $CROSS prefix
-  # won't be desired on a mips host.
-  # Default cross compiler prefix to something that might work for an
-  # unoptimized build.
+if("${CROSS}" STREQUAL "")
+
+  # TODO(tomfinegan): Make it possible to turn this off. The $CROSS prefix won't
+  # be desired on a mips host.  Default cross compiler prefix to something that
+  # might work for an  unoptimized build.
   set(CROSS mips-linux-gnu-)
-endif ()
+endif()
 
-if ("${MIPS_CPU}" STREQUAL "")
+if("${MIPS_CPU}" STREQUAL "")
   set(MIPS_CFLAGS "${MIPS_CFLAGS} -mips32r2")
   set(MIPS_CXXFLAGS "${MIPS_CXXFLAGS} -mips32r2")
-elseif ("${MIPS_CPU}" STREQUAL "p5600")
+elseif("${MIPS_CPU}" STREQUAL "p5600")
   set(P56_FLAGS
       "-mips32r5 -mload-store-pairs -msched-weight -mhard-float -mfp64")
   set(MIPS_CFLAGS "${MIPS_CFLAGS} ${P56_FLAGS}")
   set(MIPS_CXXFLAGS "${MIPS_CXXFLAGS} ${P56_FLAGS}")
   set(CMAKE_EXE_LINKER_FLAGS "-mfp64 ${CMAKE_EXE_LINKER_FLAGS}")
-endif ()
+endif()
 
 set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
@@ -66,10 +70,8 @@ set(CMAKE_CXX_COMPILER_ARG1 "-EL ${MIPS_CXXFLAGS}")
 set(CMAKE_SYSTEM_PROCESSOR "mips32")
 
 # No runtime cpu detect for mips32-linux-gcc.
-if (CONFIG_RUNTIME_CPU_DETECT)
+if(CONFIG_RUNTIME_CPU_DETECT)
   message("--- CONFIG_RUNTIME_CPU_DETECT not supported for mips32 targets.")
-endif ()
+endif()
 
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "" FORCE)
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_MIPS32_LINUX_GCC_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/mips64-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/mips64-linux-gcc.cmake
index 0b63d778d..ad9aab09d 100644
--- a/third_party/aom/build/cmake/toolchains/mips64-linux-gcc.cmake
+++ b/third_party/aom/build/cmake/toolchains/mips64-linux-gcc.cmake
@@ -1,39 +1,43 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_MIPS64_LINUX_GCC_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS64_LINUX_GCC_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_MIPS64_LINUX_GCC_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS64_LINUX_GCC_CMAKE_ 1)
 
 set(CMAKE_SYSTEM_NAME "Linux")
 
-if ("${CROSS}" STREQUAL "")
-  # TODO(tomfinegan): Make it possible to turn this off. The $CROSS prefix
-  # won't be desired on a mips host.
+if("${CROSS}" STREQUAL "")
+
+  # TODO(tomfinegan): Make it possible to turn this off. The $CROSS prefix won't
+  # be desired on a mips host.
+  #
   # Default the cross compiler prefix to something known to work.
   set(CROSS mips-img-linux-gnu-)
-endif ()
+endif()
 
-if (ENABLE_MSA)
+if(ENABLE_MSA)
   set(HAVE_MSA 1 CACHE BOOL "" FORCE)
   set(MIPS_CFLAGS "-mmsa")
   set(MIPS_CXXFLAGS "-mmsa")
-endif ()
+endif()
 
-if ("${MIPS_CPU}" STREQUAL "i6400" OR "${MIPS_CPU}" STREQUAL "p6600")
+if("${MIPS_CPU}" STREQUAL "i6400" OR "${MIPS_CPU}" STREQUAL "p6600")
   set(MIPS_CPU_FLAGS "-mips64r6 -mabi=64 -mload-store-pairs -msched-weight")
   set(MIPS_CPU_FLAGS "${MIPS_CPU_FLAGS} -mhard-float -mfp64")
   set(MIPS_CFLAGS "${MIPS_CFLAGS} ${MIPS_CPU_FLAGS}")
   set(MIPS_CXXFLAGS "${MIPS_CXXFLAGS} ${MIPS_CPU_FLAGS}")
   set(CMAKE_EXE_LINKER_FLAGS
       "-mips64r6 -mabi64 -mfp64 ${CMAKE_EXE_LINKER_FLAGS}")
-endif ()
+endif()
 
 set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
@@ -43,10 +47,8 @@ set(CMAKE_CXX_COMPILER_ARG1 "-EL ${MIPS_CXXFLAGS}")
 set(CMAKE_SYSTEM_PROCESSOR "mips64")
 
 # No runtime cpu detect for mips64-linux-gcc.
-if (CONFIG_RUNTIME_CPU_DETECT)
+if(CONFIG_RUNTIME_CPU_DETECT)
   message("--- CONFIG_RUNTIME_CPU_DETECT not supported for mips64 targets.")
-endif ()
+endif()
 
 set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "" FORCE)
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_MIPS64_LINUX_GCC_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake
new file mode 100644
index 000000000..c86cc27e3
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake
@@ -0,0 +1,29 @@
+#
+# Copyright (c) 2018, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_PPC_LINUX_GCC_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_PPC_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_PPC_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+
+  # Default the cross compiler prefix to something known to work.
+  set(CROSS powerpc64le-unknown-linux-gnu-)
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(AS_EXECUTABLE ${CROSS}as)
+set(CMAKE_SYSTEM_PROCESSOR "ppc")
+
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
diff --git a/third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake b/third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake
index 295156c3d..6b6f52cac 100644
--- a/third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake
+++ b/third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake
@@ -1,20 +1,23 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_X86_IOS_SIMULATOR_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_IOS_SIMULATOR_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_IOS_SIMULATOR_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_IOS_SIMULATOR_CMAKE_ 1)
 
-if (XCODE)
+if(XCODE)
+
   # TODO(tomfinegan): Handle ios sim builds in Xcode.
   message(FATAL_ERROR "This toolchain does not support Xcode.")
-endif ()
+endif()
 
 set(CMAKE_SYSTEM_PROCESSOR "i386")
 set(CMAKE_OSX_ARCHITECTURES "i386")
@@ -23,5 +26,3 @@ set(CMAKE_OSX_ARCHITECTURES "i386")
 set(CONFIG_PIC 1 CACHE NUMBER "")
 
 include("${CMAKE_CURRENT_LIST_DIR}/ios-simulator-common.cmake")
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_X86_IOS_SIMULATOR_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/x86-linux.cmake b/third_party/aom/build/cmake/toolchains/x86-linux.cmake
index 42cc61467..c2a700bfe 100644
--- a/third_party/aom/build/cmake/toolchains/x86-linux.cmake
+++ b/third_party/aom/build/cmake/toolchains/x86-linux.cmake
@@ -1,19 +1,19 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_ 1)
 
 set(CMAKE_SYSTEM_PROCESSOR "x86")
 set(CMAKE_SYSTEM_NAME "Linux")
 set(CMAKE_C_COMPILER_ARG1 "-m32")
 set(CMAKE_CXX_COMPILER_ARG1 "-m32")
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/x86-macos.cmake b/third_party/aom/build/cmake/toolchains/x86-macos.cmake
index c9cab3d5b..7a46e06a9 100644
--- a/third_party/aom/build/cmake/toolchains/x86-macos.cmake
+++ b/third_party/aom/build/cmake/toolchains/x86-macos.cmake
@@ -1,13 +1,13 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
 set(CMAKE_SYSTEM_PROCESSOR "x86")
 set(CMAKE_SYSTEM_NAME "Darwin")
 set(CMAKE_OSX_ARCHITECTURES "i386")
diff --git a/third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake
index bdd3fa539..c986c4ee3 100644
--- a/third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake
+++ b/third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake
@@ -1,14 +1,16 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_ 1)
 
 set(CMAKE_SYSTEM_PROCESSOR "x86")
@@ -16,14 +18,14 @@ set(CMAKE_SYSTEM_NAME "Windows")
 set(CMAKE_C_COMPILER_ARG1 "-m32")
 set(CMAKE_CXX_COMPILER_ARG1 "-m32")
 
-if ("${CROSS}" STREQUAL "")
+if("${CROSS}" STREQUAL "")
   set(CROSS i686-w64-mingw32-)
-endif ()
+endif()
 
 set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
 
 # Disable the use of the gtest's CMake support.
 set(AOM_DISABLE_GTEST_CMAKE 1)
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake b/third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake
index 884540a9d..d4b40ed09 100644
--- a/third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake
+++ b/third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake
@@ -1,24 +1,25 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_IOS_SIMULATOR_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_IOS_SIMULATOR_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_IOS_SIMULATOR_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_IOS_SIMULATOR_CMAKE_ 1)
 
-if (XCODE)
+if(XCODE)
+
   # TODO(tomfinegan): Handle ios sim builds in Xcode.
   message(FATAL_ERROR "This toolchain does not support Xcode.")
-endif ()
+endif()
 
 set(CMAKE_SYSTEM_PROCESSOR "x86_64")
 set(CMAKE_OSX_ARCHITECTURES "x86_64")
 
 include("${CMAKE_CURRENT_LIST_DIR}/ios-simulator-common.cmake")
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_IOS_SIMULATOR_CMAKE_
diff --git a/third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
index be94332b4..00d94d5f1 100644
--- a/third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
+++ b/third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
@@ -1,27 +1,29 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_MINGW_GCC_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_MINGW_GCC_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_MINGW_GCC_CMAKE_
 set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_MINGW_GCC_CMAKE_ 1)
 
 set(CMAKE_SYSTEM_PROCESSOR "x86_64")
 set(CMAKE_SYSTEM_NAME "Windows")
 
-if ("${CROSS}" STREQUAL "")
+if("${CROSS}" STREQUAL "")
   set(CROSS x86_64-w64-mingw32-)
-endif ()
+endif()
 
 set(CMAKE_C_COMPILER ${CROSS}gcc)
 set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
 
 # Disable the use of the gtest's CMake support.
 set(AOM_DISABLE_GTEST_CMAKE 1)
-
-endif ()  # AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_MINGW_GCC_CMAKE_
diff --git a/third_party/aom/build/cmake/util.cmake b/third_party/aom/build/cmake/util.cmake
index d6c432229..a0c705691 100644
--- a/third_party/aom/build/cmake/util.cmake
+++ b/third_party/aom/build/cmake/util.cmake
@@ -1,84 +1,88 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_BUILD_CMAKE_UTIL_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_UTIL_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_UTIL_CMAKE_
 set(AOM_BUILD_CMAKE_UTIL_CMAKE_ 1)
 
 # Creates dummy source file in $AOM_CONFIG_DIR named $basename.$extension and
 # returns the full path to the dummy source file via the $out_file_path
 # parameter.
-function (create_dummy_source_file basename extension out_file_path)
-  set(dummy_source_file "${AOM_CONFIG_DIR}/${basename}.${extension}")
-  file(WRITE "${dummy_source_file}"
-       "// Generated file. DO NOT EDIT!\n"
-       "// ${target_name} needs a ${extension} file to force link language, \n"
-       "// or to silence a harmless CMake warning: Ignore me.\n"
-       "void ${target_name}_dummy_function(void) {}\n")
+function(create_dummy_source_file basename extension out_file_path)
+  set(dummy_source_file "${AOM_CONFIG_DIR}/${basename}_dummy.${extension}")
+  file(
+    WRITE
+      "${dummy_source_file}" "// Generated file. DO NOT EDIT!\n"
+      "// ${target_name} needs a ${extension} file to force link language, \n"
+      "// or to silence a harmless CMake warning: Ignore me.\n"
+      "void ${target_name}_dummy_function(void) {}\n")
   set(${out_file_path} ${dummy_source_file} PARENT_SCOPE)
-endfunction ()
+endfunction()
 
 # Convenience function for adding a dummy source file to $target_name using
 # $extension as the file extension. Wraps create_dummy_source_file().
-function (add_dummy_source_file_to_target target_name extension)
+function(add_dummy_source_file_to_target target_name extension)
   create_dummy_source_file("${target_name}" "${extension}" "dummy_source_file")
   target_sources(${target_name} PRIVATE ${dummy_source_file})
-endfunction ()
+endfunction()
 
 # Sets the value of the variable referenced by $feature to $value, and reports
 # the change to the user via call to message(WARNING ...). $cause is expected to
-# be a configuration variable that conflicts with $feature in some way.
-function (change_config_and_warn feature value cause)
+# be a configuration variable that conflicts with $feature in some way. This
+# function is a noop if $feature is already set to $value.
+function(change_config_and_warn feature value cause)
+  if(${feature} EQUAL ${value})
+    return()
+  endif()
   set(${feature} ${value} PARENT_SCOPE)
-  if (${value} EQUAL 1)
+  if(${value} EQUAL 1)
     set(verb "Enabled")
     set(reason "required for")
-  else ()
+  else()
     set(verb "Disabled")
     set(reason "incompatible with")
-  endif ()
+  endif()
   set(warning_message "${verb} ${feature}, ${reason} ${cause}.")
   message(WARNING "--- ${warning_message}")
-endfunction ()
+endfunction()
 
 # Extracts the version string from $version_file and returns it to the user via
 # $version_string_out_var. To achieve this VERSION_STRING_NOSP is located in
 # $version_file and then everything but the string literal assigned to the
-# variable is removed. Quotes and the leading 'v' are stripped from the
-# returned string.
-function (extract_version_string version_file version_string_out_var)
+# variable is removed. Quotes and the leading 'v' are stripped from the returned
+# string.
+function(extract_version_string version_file version_string_out_var)
   file(STRINGS "${version_file}" aom_version REGEX "VERSION_STRING_NOSP")
-  string(REPLACE "#define VERSION_STRING_NOSP " "" aom_version
-         "${aom_version}")
+  string(REPLACE "#define VERSION_STRING_NOSP " "" aom_version "${aom_version}")
   string(REPLACE "\"" "" aom_version "${aom_version}")
   string(REPLACE " " "" aom_version "${aom_version}")
   string(FIND "${aom_version}" "v" v_pos)
-  if (${v_pos} EQUAL 0)
+  if(${v_pos} EQUAL 0)
     string(SUBSTRING "${aom_version}" 1 -1 aom_version)
-  endif ()
+  endif()
   set("${version_string_out_var}" "${aom_version}" PARENT_SCOPE)
-endfunction ()
+endfunction()
 
 # Sets CMake compiler launcher to $launcher_name when $launcher_name is found in
 # $PATH. Warns user about ignoring build flag $launcher_flag when $launcher_name
 # is not found in $PATH.
-function (set_compiler_launcher launcher_flag launcher_name)
+function(set_compiler_launcher launcher_flag launcher_name)
   find_program(launcher_path "${launcher_name}")
-  if (launcher_path)
+  if(launcher_path)
     set(CMAKE_C_COMPILER_LAUNCHER "${launcher_path}" PARENT_SCOPE)
     set(CMAKE_CXX_COMPILER_LAUNCHER "${launcher_path}" PARENT_SCOPE)
     message("--- Using ${launcher_name} as compiler launcher.")
-  else ()
+  else()
     message(WARNING
-            "--- Cannot find ${launcher_name}, ${launcher_flag} ignored.")
-  endif ()
-endfunction ()
-
-endif()  # AOM_BUILD_CMAKE_UTIL_CMAKE_
+              "--- Cannot find ${launcher_name}, ${launcher_flag} ignored.")
+  endif()
+endfunction()
 
diff --git a/third_party/aom/build/cmake/version.cmake b/third_party/aom/build/cmake/version.cmake
index c2b3bdb61..d169b12ac 100644
--- a/third_party/aom/build/cmake/version.cmake
+++ b/third_party/aom/build/cmake/version.cmake
@@ -1,55 +1,57 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
 cmake_minimum_required(VERSION 3.5)
 
-set(REQUIRED_ARGS
-    "AOM_ROOT" "AOM_CONFIG_DIR" "GIT_EXECUTABLE" "PERL_EXECUTABLE")
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "GIT_EXECUTABLE"
+    "PERL_EXECUTABLE")
 
-foreach (arg ${REQUIRED_ARGS})
-  if ("${${arg}}" STREQUAL "")
+foreach(arg ${REQUIRED_ARGS})
+  if("${${arg}}" STREQUAL "")
     message(FATAL_ERROR "${arg} must not be empty.")
-  endif ()
-endforeach ()
+  endif()
+endforeach()
 
 include("${AOM_ROOT}/build/cmake/util.cmake")
 
 # Generate the version string for this run.
 unset(aom_version)
-if (EXISTS "${GIT_EXECUTABLE}")
+if(EXISTS "${GIT_EXECUTABLE}")
   execute_process(COMMAND ${GIT_EXECUTABLE} --git-dir=${AOM_ROOT}/.git describe
                   OUTPUT_VARIABLE aom_version ERROR_QUIET)
   string(STRIP "${aom_version}" aom_version)
 
   # Remove the leading 'v' from the version string.
   string(FIND "${aom_version}" "v" v_pos)
-  if (${v_pos} EQUAL 0)
+  if(${v_pos} EQUAL 0)
     string(SUBSTRING "${aom_version}" 1 -1 aom_version)
-  endif ()
-endif ()
+  endif()
+endif()
 
-if ("${aom_version}" STREQUAL "")
+if("${aom_version}" STREQUAL "")
   set(aom_version "${AOM_ROOT}/CHANGELOG")
-endif ()
+endif()
 
 unset(last_aom_version)
-if (EXISTS "${AOM_CONFIG_DIR}/aom_version.h")
-  extract_version_string("${AOM_CONFIG_DIR}/aom_version.h" last_aom_version)
-endif ()
+if(EXISTS "${AOM_CONFIG_DIR}/config/aom_version.h")
+  extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h"
+                         last_aom_version)
+endif()
+
+if(NOT "${aom_version}" STREQUAL "${last_aom_version}")
 
-if (NOT "${aom_version}" STREQUAL "${last_aom_version}")
   # TODO(tomfinegan): Perl dependency is unnecessary. CMake can do everything
   # that is done by version.pl on its own (if a bit more verbose...).
-  execute_process(
-    COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/version.pl"
-    --version_data=${aom_version}
-    --version_filename=${AOM_CONFIG_DIR}/aom_version.h
-    VERBATIM)
-endif ()
+  execute_process(COMMAND
+                    ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/version.pl"
+                    --version_data=${aom_version}
+                    --version_filename=${AOM_CONFIG_DIR}/config/aom_version.h
+                    VERBATIM)
+endif()
diff --git a/third_party/aom/build/cmake/version.pl b/third_party/aom/build/cmake/version.pl
index 323e178f8..7c0608aeb 100755
--- a/third_party/aom/build/cmake/version.pl
+++ b/third_party/aom/build/cmake/version.pl
@@ -60,8 +60,9 @@ my $version_minor = $version_components[1];
 my $version_patch = $version_components[2];
 
 my $version_extra = "";
-if (@version_components > 3) {
-  $version_extra = $version_components[3];
+if (length($git_desc) > 0) {
+  my @git_desc_components = split('-', $git_desc, 2);
+  $version_extra = $git_desc_components[1];
 }
 
 open(my $version_file, '>', $version_filename) or
diff --git a/third_party/aom/build/make/Android.mk b/third_party/aom/build/make/Android.mk
deleted file mode 100644
index e50faef92..000000000
--- a/third_party/aom/build/make/Android.mk
+++ /dev/null
@@ -1,194 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-#
-# This file is to be used for compiling libaom for Android using the NDK.
-# In an Android project place a libaom checkout in the jni directory.
-# Run the configure script from the jni directory.  Base libaom
-# encoder/decoder configuration will look similar to:
-# ./libaom/configure --target=armv7-android-gcc --disable-examples \
-#                    --sdk-path=/opt/android-ndk-r6b/
-#
-# When targeting Android, realtime-only is enabled by default.  This can
-# be overridden by adding the command line flag:
-#  --disable-realtime-only
-#
-# This will create .mk files that contain variables that contain the
-# source files to compile.
-#
-# Place an Android.mk file in the jni directory that references the
-# Android.mk file in the libaom directory:
-# LOCAL_PATH := $(call my-dir)
-# include $(CLEAR_VARS)
-# include jni/libaom/build/make/Android.mk
-#
-# There are currently two TARGET_ARCH_ABI targets for ARM.
-# armeabi and armeabi-v7a.  armeabi-v7a is selected by creating an
-# Application.mk in the jni directory that contains:
-# APP_ABI := armeabi-v7a
-#
-# By default libaom will detect at runtime the existance of NEON extension.
-# For this we import the 'cpufeatures' module from the NDK sources.
-# libaom can also be configured without this runtime detection method.
-# Configuring with --disable-runtime-cpu-detect will assume presence of NEON.
-# Configuring with --disable-runtime-cpu-detect --disable-neon \
-#     --disable-neon-asm
-# will remove any NEON dependency.
-#
-# Running ndk-build will build libaom and include it in your project.
-#
-
-CONFIG_DIR := $(LOCAL_PATH)/
-LIBAOM_PATH := $(LOCAL_PATH)/libaom
-ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas
-ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL)
-
-# Use the makefiles generated by upstream configure to determine which files to
-# build. Also set any architecture-specific flags.
-ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
-  include $(CONFIG_DIR)libs-armv7-android-gcc.mk
-  LOCAL_ARM_MODE := arm
-else ifeq  ($(TARGET_ARCH_ABI),arm64-v8a)
-  include $(CONFIG_DIR)libs-armv8-android-gcc.mk
-  LOCAL_ARM_MODE := arm
-else ifeq ($(TARGET_ARCH_ABI),x86)
-  include $(CONFIG_DIR)libs-x86-android-gcc.mk
-else ifeq ($(TARGET_ARCH_ABI),x86_64)
-  include $(CONFIG_DIR)libs-x86_64-android-gcc.mk
-else ifeq ($(TARGET_ARCH_ABI),mips)
-  include $(CONFIG_DIR)libs-mips-android-gcc.mk
-else
-  $(error Not a supported TARGET_ARCH_ABI: $(TARGET_ARCH_ABI))
-endif
-
-# Rule that is normally in Makefile created by libaom
-# configure.  Used to filter out source files based on configuration.
-enabled=$(filter-out $($(1)-no),$($(1)-yes))
-
-# Override the relative path that is defined by the libaom
-# configure process
-SRC_PATH_BARE := $(LIBAOM_PATH)
-
-# Include the list of files to be built
-include $(LIBAOM_PATH)/libs.mk
-
-# Optimise the code. May want to revisit this setting in the future.
-LOCAL_CFLAGS := -O3
-
-# For x86, include the source code in the search path so it will find files
-# like x86inc.asm and x86_abi_support.asm
-LOCAL_ASMFLAGS := -I$(LIBAOM_PATH)
-
-.PRECIOUS: %.asm.s
-$(ASM_CNV_PATH)/libaom/%.asm.s: $(LIBAOM_PATH)/%.asm
-	@mkdir -p $(dir $@)
-	@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
-
-# For building *_rtcd.h, which have rules in libs.mk
-TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN)))
-target := libs
-
-LOCAL_SRC_FILES += aom_config.c
-
-# Remove duplicate entries
-CODEC_SRCS_UNIQUE = $(sort $(CODEC_SRCS))
-
-# Pull out C files.  aom_config.c is in the immediate directory and
-# so it does not need libaom/ prefixed like the rest of the source files.
-# The neon files with intrinsics need to have .neon appended so the proper
-# flags are applied.
-CODEC_SRCS_C = $(filter %.c, $(CODEC_SRCS_UNIQUE))
-LOCAL_NEON_SRCS_C = $(filter %_neon.c, $(CODEC_SRCS_C))
-LOCAL_CODEC_SRCS_C = $(filter-out aom_config.c %_neon.c, $(CODEC_SRCS_C))
-
-LOCAL_SRC_FILES += $(foreach file, $(LOCAL_CODEC_SRCS_C), libaom/$(file))
-ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
-  LOCAL_SRC_FILES += $(foreach file, $(LOCAL_NEON_SRCS_C), libaom/$(file).neon)
-else # If there are neon sources then we are building for arm64 and do not need to specify .neon
-  LOCAL_SRC_FILES += $(foreach file, $(LOCAL_NEON_SRCS_C), libaom/$(file))
-endif
-
-# Pull out assembly files, splitting NEON from the rest.  This is
-# done to specify that the NEON assembly files use NEON assembler flags.
-# x86 assembly matches %.asm, arm matches %.asm.s
-
-# x86:
-
-CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE))
-LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libaom/$(file))
-
-# arm:
-CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE))
-CODEC_SRCS_ASM_ARM = $(foreach v, \
-                     $(CODEC_SRCS_ASM_ARM_ALL), \
-                     $(if $(findstring neon,$(v)),,$(v)))
-CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \
-                         $(ASM_CNV_PATH_LOCAL)/libaom/%.s, \
-                         $(CODEC_SRCS_ASM_ARM))
-LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)
-
-ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
-  CODEC_SRCS_ASM_NEON = $(foreach v, \
-                        $(CODEC_SRCS_ASM_ARM_ALL),\
-                        $(if $(findstring neon,$(v)),$(v),))
-  CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \
-                                $(ASM_CNV_PATH_LOCAL)/libaom/%.s, \
-                                $(CODEC_SRCS_ASM_NEON))
-  LOCAL_SRC_FILES += $(patsubst %.s, \
-                     %.s.neon, \
-                     $(CODEC_SRCS_ASM_NEON_ADS2GAS))
-endif
-
-LOCAL_CFLAGS += \
-    -DHAVE_CONFIG_H=aom_config.h \
-    -I$(LIBAOM_PATH) \
-    -I$(ASM_CNV_PATH)
-
-LOCAL_MODULE := libaom
-
-ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
-  LOCAL_STATIC_LIBRARIES := cpufeatures
-endif
-
-# Add a dependency to force generation of the RTCD files.
-define rtcd_dep_template
-rtcd_dep_template_SRCS := $(addprefix $(LOCAL_PATH)/, $(LOCAL_SRC_FILES))
-rtcd_dep_template_SRCS := $$(rtcd_dep_template_SRCS:.neon=)
-ifeq ($(CONFIG_AV1), yes)
-$$(rtcd_dep_template_SRCS): av1_rtcd.h
-endif
-$$(rtcd_dep_template_SRCS): aom_scale_rtcd.h
-$$(rtcd_dep_template_SRCS): aom_dsp_rtcd.h
-
-ifneq ($(findstring $(TARGET_ARCH_ABI),x86 x86_64),)
-$$(rtcd_dep_template_SRCS): aom_config.asm
-endif
-endef
-
-$(eval $(call rtcd_dep_template))
-
-.PHONY: clean
-clean:
-	@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
-	@$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
-	@$(RM) -r $(ASM_CNV_PATH)
-	@$(RM) $(CLEAN-OBJS)
-
-ifeq ($(ENABLE_SHARED),1)
-  include $(BUILD_SHARED_LIBRARY)
-else
-  include $(BUILD_STATIC_LIBRARY)
-endif
-
-ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
-$(call import-module,cpufeatures)
-endif
diff --git a/third_party/aom/build/make/Makefile b/third_party/aom/build/make/Makefile
deleted file mode 100644
index 0b869db0a..000000000
--- a/third_party/aom/build/make/Makefile
+++ /dev/null
@@ -1,466 +0,0 @@
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-include config.mk
-quiet?=true
-ifeq ($(target),)
-# If a target wasn't specified, invoke for all enabled targets.
-.DEFAULT:
-	@for t in $(ALL_TARGETS); do \
-	     $(MAKE) --no-print-directory target=$$t $(MAKECMDGOALS) || exit $$?;\
-        done
-all: .DEFAULT
-clean:: .DEFAULT
-exampletest: .DEFAULT
-install:: .DEFAULT
-test:: .DEFAULT
-test-no-data-check:: .DEFAULT
-testdata:: .DEFAULT
-utiltest: .DEFAULT
-exampletest-no-data-check utiltest-no-data-check: .DEFAULT
-test_%: .DEFAULT ;
-
-# Note: md5sum is not installed on OS X, but openssl is. Openssl may not be
-# installed on cygwin, so we need to autodetect here.
-md5sum := $(firstword $(wildcard \
-          $(foreach e,md5sum openssl,\
-          $(foreach p,$(subst :, ,$(PATH)),$(p)/$(e)*))\
-          ))
-md5sum := $(if $(filter %openssl,$(md5sum)),$(md5sum) dgst -md5,$(md5sum))
-
-TGT_CC:=$(word 3, $(subst -, ,$(TOOLCHAIN)))
-dist:
-	@for t in $(ALL_TARGETS); do \
-	     $(MAKE) --no-print-directory target=$$t $(MAKECMDGOALS) || exit $$?;\
-        done
-        # Run configure for the user with the current toolchain.
-	@if [ -d "$(DIST_DIR)/src" ]; then \
-            mkdir -p "$(DIST_DIR)/build"; \
-            cd "$(DIST_DIR)/build"; \
-            echo "Rerunning configure $(CONFIGURE_ARGS)"; \
-            ../src/configure $(CONFIGURE_ARGS); \
-            $(if $(filter vs%,$(TGT_CC)),make NO_LAUNCH_DEVENV=1;) \
-        fi
-	@if [ -d "$(DIST_DIR)" ]; then \
-            echo "    [MD5SUM] $(DIST_DIR)"; \
-	    cd $(DIST_DIR) && \
-	    $(md5sum) `find . -name md5sums.txt -prune -o -type f -print` \
-                | sed -e 's/MD5(\(.*\))= \([0-9a-f]\{32\}\)/\2  \1/' \
-                > md5sums.txt;\
-        fi
-endif
-
-# Since we invoke make recursively for multiple targets we need to include the
-# .mk file for the correct target, but only when $(target) is non-empty.
-ifneq ($(target),)
-include $(target)-$(TOOLCHAIN).mk
-endif
-BUILD_ROOT?=.
-VPATH=$(SRC_PATH_BARE)
-CFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT) -I$(SRC_PATH)
-CXXFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT) -I$(SRC_PATH)
-ASFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT)/ -I$(SRC_PATH)/
-DIST_DIR?=dist
-HOSTCC?=gcc
-TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN)))
-TGT_OS:=$(word 2, $(subst -, ,$(TOOLCHAIN)))
-TGT_CC:=$(word 3, $(subst -, ,$(TOOLCHAIN)))
-quiet:=$(if $(or $(verbose), $(V)),, yes)
-qexec=$(if $(quiet),@)
-
-# Cancel built-in implicit rules
-%: %.o
-%.asm:
-%.a:
-%: %.cc
-
-#
-# Common rules"
-#
-.PHONY: all
-all:
-
-.PHONY: clean
-clean::
-	rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.s.o=.asm.s)
-	rm -f $(CLEAN-OBJS)
-
-.PHONY: clean
-distclean: clean
-	if [ -z "$(target)" ]; then \
-      rm -f Makefile; \
-      rm -f config.log config.mk; \
-      rm -f aom_config.[hc] aom_config.asm; \
-    else \
-      rm -f $(target)-$(TOOLCHAIN).mk; \
-    fi
-
-.PHONY: dist
-dist:
-.PHONY: exampletest
-exampletest:
-.PHONY: install
-install::
-.PHONY: test
-test::
-.PHONY: testdata
-testdata::
-.PHONY: utiltest
-utiltest:
-.PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check
-test-no-data-check::
-exampletest-no-data-check utiltest-no-data-check:
-
-# Force to realign stack always on OS/2
-ifeq ($(TOOLCHAIN), x86-os2-gcc)
-CFLAGS += -mstackrealign
-endif
-
-$(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
-$(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
-$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
-$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
-$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
-$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
-$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
-$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
-$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
-$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
-$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
-$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
-$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
-$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
-$(BUILD_PFX)%_mmx.cc.d: CXXFLAGS += -mmmx
-$(BUILD_PFX)%_mmx.cc.o: CXXFLAGS += -mmmx
-$(BUILD_PFX)%_sse2.cc.d: CXXFLAGS += -msse2
-$(BUILD_PFX)%_sse2.cc.o: CXXFLAGS += -msse2
-$(BUILD_PFX)%_sse3.cc.d: CXXFLAGS += -msse3
-$(BUILD_PFX)%_sse3.cc.o: CXXFLAGS += -msse3
-$(BUILD_PFX)%_ssse3.cc.d: CXXFLAGS += -mssse3
-$(BUILD_PFX)%_ssse3.cc.o: CXXFLAGS += -mssse3
-$(BUILD_PFX)%_sse4.cc.d: CXXFLAGS += -msse4.1
-$(BUILD_PFX)%_sse4.cc.o: CXXFLAGS += -msse4.1
-$(BUILD_PFX)%_avx.cc.d: CXXFLAGS += -mavx
-$(BUILD_PFX)%_avx.cc.o: CXXFLAGS += -mavx
-$(BUILD_PFX)%_avx2.cc.d: CXXFLAGS += -mavx2
-$(BUILD_PFX)%_avx2.cc.o: CXXFLAGS += -mavx2
-
-$(BUILD_PFX)%.c.d: %.c
-	$(if $(quiet),@echo "    [DEP] $@")
-	$(qexec)mkdir -p $(dir $@)
-	$(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -M $< | $(fmt_deps) > $@
-
-$(BUILD_PFX)%.c.o: %.c
-	$(if $(quiet),@echo "    [CC] $@")
-	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
-	$(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -c -o $@ $<
-
-$(BUILD_PFX)%.cc.d: %.cc
-	$(if $(quiet),@echo "    [DEP] $@")
-	$(qexec)mkdir -p $(dir $@)
-	$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -M $< | $(fmt_deps) > $@
-
-$(BUILD_PFX)%.cc.o: %.cc
-	$(if $(quiet),@echo "    [CXX] $@")
-	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
-	$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $<
-
-$(BUILD_PFX)%.cpp.d: %.cpp
-	$(if $(quiet),@echo "    [DEP] $@")
-	$(qexec)mkdir -p $(dir $@)
-	$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -M $< | $(fmt_deps) > $@
-
-$(BUILD_PFX)%.cpp.o: %.cpp
-	$(if $(quiet),@echo "    [CXX] $@")
-	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
-	$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $<
-
-$(BUILD_PFX)%.asm.d: %.asm
-	$(if $(quiet),@echo "    [DEP] $@")
-	$(qexec)mkdir -p $(dir $@)
-	$(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \
-            --build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@
-
-$(BUILD_PFX)%.asm.o: %.asm
-	$(if $(quiet),@echo "    [AS] $@")
-	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
-	$(qexec)$(AS) $(ASFLAGS) -o $@ $<
-
-$(BUILD_PFX)%.s.d: %.s
-	$(if $(quiet),@echo "    [DEP] $@")
-	$(qexec)mkdir -p $(dir $@)
-	$(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \
-            --build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@
-
-$(BUILD_PFX)%.s.o: %.s
-	$(if $(quiet),@echo "    [AS] $@")
-	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
-	$(qexec)$(AS) $(ASFLAGS) -o $@ $<
-
-.PRECIOUS: %.c.S
-%.c.S: CFLAGS += -DINLINE_ASM
-$(BUILD_PFX)%.c.S: %.c
-	$(if $(quiet),@echo "    [GEN] $@")
-	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
-	$(qexec)$(CC) -S $(CFLAGS) -o $@ $<
-
-.PRECIOUS: %.asm.s
-$(BUILD_PFX)%.asm.s: %.asm
-	$(if $(quiet),@echo "    [ASM CONVERSION] $@")
-	$(qexec)mkdir -p $(dir $@)
-	$(qexec)$(ASM_CONVERSION) <$< >$@
-
-# If we're in debug mode, pretend we don't have GNU strip, to fall back to
-# the copy implementation
-HAVE_GNU_STRIP := $(if $(CONFIG_DEBUG),,$(HAVE_GNU_STRIP))
-ifeq ($(HAVE_GNU_STRIP),yes)
-# Older binutils strip global symbols not needed for relocation processing
-# when given --strip-unneeded. Using nm and awk to identify globals and
-# keep them caused command line length issues under mingw and segfaults in
-# test_libaom were observed under OS/2: simply use --strip-debug.
-%.a: %_g.a
-	$(if $(quiet),@echo "    [STRIP] $@ < $<")
-	$(qexec)$(STRIP) --strip-debug \
-          -o $@ $<
-else
-%.a: %_g.a
-	$(if $(quiet),@echo "    [CP] $@ < $<")
-	$(qexec)cp $< $@
-endif
-
-#
-# Utility functions
-#
-pairmap=$(if $(strip $(2)),\
-    $(call $(1),$(word 1,$(2)),$(word 2,$(2)))\
-    $(call pairmap,$(1),$(wordlist 3,$(words $(2)),$(2)))\
-)
-
-enabled=$(filter-out $($(1)-no),$($(1)-yes))
-cond_enabled=$(if $(filter yes,$($(1))), $(call enabled,$(2)))
-
-find_file1=$(word 1,$(wildcard $(subst //,/,$(addsuffix /$(1),$(2)))))
-find_file=$(foreach f,$(1),$(call find_file1,$(strip $(f)),$(strip $(2))) )
-obj_pats=.c=.c.o $(AS_SFX)=$(AS_SFX).o .cc=.cc.o .cpp=.cpp.o
-objs=$(addprefix $(BUILD_PFX),$(foreach p,$(obj_pats),$(filter %.o,$(1:$(p))) ))
-
-install_map_templates=$(eval $(call install_map_template,$(1),$(2)))
-
-not=$(subst yes,no,$(1))
-
-ifeq ($(CONFIG_MSVS),yes)
-lib_file_name=$(1).lib
-else
-lib_file_name=lib$(1).a
-endif
-#
-# Rule Templates
-#
-define linker_template
-$(1): $(filter-out -%,$(2))
-$(1):
-	$(if $(quiet),@echo    "    [LD] $$@")
-	$(qexec)$$(LD) $$(strip $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -o $$@ $(2) $(3) $$(extralibs))
-endef
-define linkerxx_template
-$(1): $(filter-out -%,$(2))
-$(1):
-	$(if $(quiet),@echo    "    [LD] $$@")
-	$(qexec)$$(CXX) $$(strip $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -o $$@ $(2) $(3) $$(extralibs))
-endef
-# make-3.80 has a bug with expanding large input strings to the eval function,
-# which was triggered in some cases by the following component of
-# linker_template:
-#   $(1): $$(call find_file, $(patsubst -l%,lib%.a,$(filter -l%,$(2))),\
-#                           $$(patsubst -L%,%,$$(filter -L%,$$(LDFLAGS) $(2))))
-# This may be useful to revisit in the future (it tries to locate libraries
-# in a search path and add them as prerequisites
-
-define install_map_template
-$(DIST_DIR)/$(1): $(2)
-	$(if $(quiet),@echo "    [INSTALL] $$@")
-	$(qexec)mkdir -p $$(dir $$@)
-	$(qexec)cp -p $$< $$@
-endef
-
-define archive_template
-# Not using a pattern rule here because we don't want to generate empty
-# archives when they are listed as a dependency in files not responsible
-# for creating them.
-$(1):
-	$(if $(quiet),@echo "    [AR] $$@")
-	$(qexec)$$(AR) $$(ARFLAGS) $$@ $$^
-endef
-
-define so_template
-# Not using a pattern rule here because we don't want to generate empty
-# archives when they are listed as a dependency in files not responsible
-# for creating them.
-#
-# This needs further abstraction for dealing with non-GNU linkers.
-$(1):
-	$(if $(quiet),@echo "    [LD] $$@")
-	$(qexec)$$(LD) -shared $$(LDFLAGS) \
-            -Wl,--no-undefined -Wl,-soname,$$(SONAME) \
-            -Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \
-            $$(filter %.o,$$^) $$(extralibs)
-endef
-
-define dl_template
-# Not using a pattern rule here because we don't want to generate empty
-# archives when they are listed as a dependency in files not responsible
-# for creating them.
-$(1):
-	$(if $(quiet),@echo "    [LD] $$@")
-	$(qexec)$$(LD) -dynamiclib $$(LDFLAGS) \
-	    -exported_symbols_list $$(EXPORTS_FILE) \
-        -Wl,-headerpad_max_install_names,-compatibility_version,1.0,-current_version,$$(VERSION_MAJOR) \
-        -o $$@ \
-        $$(filter %.o,$$^) $$(extralibs)
-endef
-
-define dll_template
-# Not using a pattern rule here because we don't want to generate empty
-# archives when they are listed as a dependency in files not responsible
-# for creating them.
-$(1):
-	$(if $(quiet),@echo "    [LD] $$@")
-	$(qexec)$$(LD) -Zdll $$(LDFLAGS) \
-        -o $$@ \
-        $$(filter %.o,$$^) $$(extralibs) $$(EXPORTS_FILE)
-endef
-
-
-#
-# Get current configuration
-#
-ifneq ($(target),)
-include $(SRC_PATH_BARE)/$(target:-$(TOOLCHAIN)=).mk
-endif
-
-skip_deps := $(filter %clean,$(MAKECMDGOALS))
-skip_deps += $(findstring testdata,$(MAKECMDGOALS))
-ifeq ($(strip $(skip_deps)),)
-  ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes)
-    # Older versions of make don't like -include directives with no arguments
-    ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),)
-      -include $(filter %.d,$(OBJS-yes:.o=.d))
-    endif
-  endif
-endif
-
-#
-# Configuration dependent rules
-#
-$(call pairmap,install_map_templates,$(INSTALL_MAPS))
-
-DOCS=$(call cond_enabled,CONFIG_INSTALL_DOCS,DOCS)
-.docs: $(DOCS)
-	@touch $@
-
-INSTALL-DOCS=$(call cond_enabled,CONFIG_INSTALL_DOCS,INSTALL-DOCS)
-ifeq ($(MAKECMDGOALS),dist)
-INSTALL-DOCS+=$(call cond_enabled,CONFIG_INSTALL_DOCS,DIST-DOCS)
-endif
-.install-docs: .docs $(addprefix $(DIST_DIR)/,$(INSTALL-DOCS))
-	@touch $@
-
-clean::
-	rm -f .docs .install-docs $(DOCS)
-
-BINS=$(call enabled,BINS)
-.bins: $(BINS)
-	@touch $@
-
-INSTALL-BINS=$(call cond_enabled,CONFIG_INSTALL_BINS,INSTALL-BINS)
-ifeq ($(MAKECMDGOALS),dist)
-INSTALL-BINS+=$(call cond_enabled,CONFIG_INSTALL_BINS,DIST-BINS)
-endif
-.install-bins: .bins $(addprefix $(DIST_DIR)/,$(INSTALL-BINS))
-	@touch $@
-
-clean::
-	rm -f .bins .install-bins $(BINS)
-
-LIBS=$(call enabled,LIBS)
-.libs: $(LIBS)
-	@touch $@
-$(foreach lib,$(filter %_g.a,$(LIBS)),$(eval $(call archive_template,$(lib))))
-$(foreach lib,$(filter %so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
-$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
-$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dll,$(LIBS)),$(eval $(call dll_template,$(lib))))
-
-INSTALL-LIBS=$(call cond_enabled,CONFIG_INSTALL_LIBS,INSTALL-LIBS)
-ifeq ($(MAKECMDGOALS),dist)
-INSTALL-LIBS+=$(call cond_enabled,CONFIG_INSTALL_LIBS,DIST-LIBS)
-endif
-.install-libs: .libs $(addprefix $(DIST_DIR)/,$(INSTALL-LIBS))
-	@touch $@
-
-clean::
-	rm -f .libs .install-libs $(LIBS)
-
-ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
-PROJECTS=$(call enabled,PROJECTS)
-.projects: $(PROJECTS)
-	@touch $@
-
-INSTALL-PROJECTS=$(call cond_enabled,CONFIG_INSTALL_PROJECTS,INSTALL-PROJECTS)
-ifeq ($(MAKECMDGOALS),dist)
-INSTALL-PROJECTS+=$(call cond_enabled,CONFIG_INSTALL_PROJECTS,DIST-PROJECTS)
-endif
-.install-projects: .projects $(addprefix $(DIST_DIR)/,$(INSTALL-PROJECTS))
-	@touch $@
-
-clean::
-	rm -f .projects .install-projects $(PROJECTS)
-endif
-
-# If there are any source files to be distributed, then include the build
-# system too.
-ifneq ($(call enabled,DIST-SRCS),)
-    DIST-SRCS-yes            += configure
-    DIST-SRCS-yes            += build/make/configure.sh
-    DIST-SRCS-yes            += build/make/gen_asm_deps.sh
-    DIST-SRCS-yes            += build/make/Makefile
-    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_def.sh
-    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_sln.sh
-    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_vcxproj.sh
-    DIST-SRCS-$(CONFIG_MSVS)  += build/make/msvs_common.sh
-    DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
-    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas.pl
-    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas_apple.pl
-    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2armasm_ms.pl
-    DIST-SRCS-$(ARCH_ARM)    += build/make/thumb.pm
-    DIST-SRCS-yes            += $(target:-$(TOOLCHAIN)=).mk
-endif
-INSTALL-SRCS := $(call cond_enabled,CONFIG_INSTALL_SRCS,INSTALL-SRCS)
-ifeq ($(MAKECMDGOALS),dist)
-INSTALL-SRCS += $(call cond_enabled,CONFIG_INSTALL_SRCS,DIST-SRCS)
-endif
-.install-srcs: $(addprefix $(DIST_DIR)/src/,$(INSTALL-SRCS))
-	@touch $@
-
-clean::
-	rm -f .install-srcs
-
-ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
-    BUILD_TARGETS += .projects
-    INSTALL_TARGETS += .install-projects
-endif
-BUILD_TARGETS += .docs .libs .bins
-INSTALL_TARGETS += .install-docs .install-srcs .install-libs .install-bins
-all: $(BUILD_TARGETS)
-install:: $(INSTALL_TARGETS)
-dist: $(INSTALL_TARGETS)
-test::
-
-.SUFFIXES:  # Delete default suffix rules
diff --git a/third_party/aom/build/make/ads2armasm_ms.pl b/third_party/aom/build/make/ads2armasm_ms.pl
deleted file mode 100755
index 8568a2dad..000000000
--- a/third_party/aom/build/make/ads2armasm_ms.pl
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env perl
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-use FindBin;
-use lib $FindBin::Bin;
-use thumb;
-
-print "; This file was created from a .asm file\n";
-print ";  using the ads2armasm_ms.pl script.\n";
-
-while (<STDIN>)
-{
-    undef $comment;
-    undef $line;
-
-    s/REQUIRE8//;
-    s/PRESERVE8//;
-    s/^\s*ARM\s*$//;
-    s/AREA\s+\|\|(.*)\|\|/AREA |$1|/;
-    s/qsubaddx/qsax/i;
-    s/qaddsubx/qasx/i;
-
-    thumb::FixThumbInstructions($_, 1);
-
-    s/ldrneb/ldrbne/i;
-    s/ldrneh/ldrhne/i;
-    s/^(\s*)ENDP.*/$&\n$1ALIGN 4/;
-
-    print;
-}
-
diff --git a/third_party/aom/build/make/ads2gas.pl b/third_party/aom/build/make/ads2gas.pl
deleted file mode 100755
index adf45a3c9..000000000
--- a/third_party/aom/build/make/ads2gas.pl
+++ /dev/null
@@ -1,236 +0,0 @@
-#!/usr/bin/env perl
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-# ads2gas.pl
-# Author: Eric Fung (efung (at) acm.org)
-#
-# Convert ARM Developer Suite 1.0.1 syntax assembly source to GNU as format
-#
-# Usage: cat inputfile | perl ads2gas.pl > outputfile
-#
-
-use FindBin;
-use lib $FindBin::Bin;
-use thumb;
-
-my $thumb = 0;
-
-foreach my $arg (@ARGV) {
-    $thumb = 1 if ($arg eq "-thumb");
-}
-
-print "@ This file was created from a .asm file\n";
-print "@  using the ads2gas.pl script.\n";
-print "\t.equ DO1STROUNDING, 0\n";
-if ($thumb) {
-    print "\t.syntax unified\n";
-    print "\t.thumb\n";
-}
-
-# Stack of procedure names.
-@proc_stack = ();
-
-while (<STDIN>)
-{
-    undef $comment;
-    undef $line;
-    $comment_char = ";";
-    $comment_sub = "@";
-
-    # Handle comments.
-    if (/$comment_char/)
-    {
-      $comment = "";
-      ($line, $comment) = /(.*?)$comment_char(.*)/;
-      $_ = $line;
-    }
-
-    # Load and store alignment
-    s/@/,:/g;
-
-    # Hexadecimal constants prefaced by 0x
-    s/#&/#0x/g;
-
-    # Convert :OR: to |
-    s/:OR:/ | /g;
-
-    # Convert :AND: to &
-    s/:AND:/ & /g;
-
-    # Convert :NOT: to ~
-    s/:NOT:/ ~ /g;
-
-    # Convert :SHL: to <<
-    s/:SHL:/ << /g;
-
-    # Convert :SHR: to >>
-    s/:SHR:/ >> /g;
-
-    # Convert ELSE to .else
-    s/\bELSE\b/.else/g;
-
-    # Convert ENDIF to .endif
-    s/\bENDIF\b/.endif/g;
-
-    # Convert ELSEIF to .elseif
-    s/\bELSEIF\b/.elseif/g;
-
-    # Convert LTORG to .ltorg
-    s/\bLTORG\b/.ltorg/g;
-
-    # Convert endfunc to nothing.
-    s/\bendfunc\b//ig;
-
-    # Convert FUNCTION to nothing.
-    s/\bFUNCTION\b//g;
-    s/\bfunction\b//g;
-
-    s/\bENTRY\b//g;
-    s/\bMSARMASM\b/0/g;
-    s/^\s+end\s+$//g;
-
-    # Convert IF :DEF:to .if
-    # gcc doesn't have the ability to do a conditional
-    # if defined variable that is set by IF :DEF: on
-    # armasm, so convert it to a normal .if and then
-    # make sure to define a value elesewhere
-    if (s/\bIF :DEF:\b/.if /g)
-    {
-        s/=/==/g;
-    }
-
-    # Convert IF to .if
-    if (s/\bIF\b/.if/g)
-    {
-        s/=+/==/g;
-    }
-
-    # Convert INCLUDE to .INCLUDE "file"
-    s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/;
-
-    # Code directive (ARM vs Thumb)
-    s/CODE([0-9][0-9])/.code $1/;
-
-    # No AREA required
-    # But ALIGNs in AREA must be obeyed
-    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
-    # If no ALIGN, strip the AREA and align to 4 bytes
-    s/^\s*AREA.*$/.text\n.p2align 2/;
-
-    # DCD to .word
-    # This one is for incoming symbols
-    s/DCD\s+\|(\w*)\|/.long $1/;
-
-    # DCW to .short
-    s/DCW\s+\|(\w*)\|/.short $1/;
-    s/DCW(.*)/.short $1/;
-
-    # Constants defined in scope
-    s/DCD(.*)/.long $1/;
-    s/DCB(.*)/.byte $1/;
-
-    # RN to .req
-    if (s/RN\s+([Rr]\d+|lr)/.req $1/)
-    {
-        print;
-        print "$comment_sub$comment\n" if defined $comment;
-        next;
-    }
-
-    # Make function visible to linker, and make additional symbol with
-    # prepended underscore
-    s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
-    s/IMPORT\s+\|([\$\w]*)\|/.global $1/;
-
-    s/EXPORT\s+([\$\w]*)/.global $1/;
-    s/export\s+([\$\w]*)/.global $1/;
-
-    # No vertical bars required; make additional symbol with prepended
-    # underscore
-    s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
-
-    # Labels need trailing colon
-#   s/^(\w+)/$1:/ if !/EQU/;
-    # put the colon at the end of the line in the macro
-    s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
-
-    # ALIGN directive
-    s/\bALIGN\b/.balign/g;
-
-    if ($thumb) {
-        # ARM code - we force everything to thumb with the declaration in the header
-        s/\sARM//g;
-    } else {
-        # ARM code
-        s/\sARM/.arm/g;
-    }
-
-    # push/pop
-    s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g;
-    s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g;
-
-    # NEON code
-    s/(vld1.\d+\s+)(q\d+)/$1\{$2\}/g;
-    s/(vtbl.\d+\s+[^,]+),([^,]+)/$1,\{$2\}/g;
-
-    if ($thumb) {
-        thumb::FixThumbInstructions($_, 0);
-    }
-
-    # eabi_attributes numerical equivalents can be found in the
-    # "ARM IHI 0045C" document.
-
-    # REQUIRE8 Stack is required to be 8-byte aligned
-    s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;
-
-    # PRESERVE8 Stack 8-byte align is preserved
-    s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
-
-    # Use PROC and ENDP to give the symbols a .size directive.
-    # This makes them show up properly in debugging tools like gdb and valgrind.
-    if (/\bPROC\b/)
-    {
-        my $proc;
-        /^_([\.0-9A-Z_a-z]\w+)\b/;
-        $proc = $1;
-        push(@proc_stack, $proc) if ($proc);
-        s/\bPROC\b/@ $&/;
-    }
-    if (/\bENDP\b/)
-    {
-        my $proc;
-        s/\bENDP\b/@ $&/;
-        $proc = pop(@proc_stack);
-        $_ = "\t.size $proc, .-$proc".$_ if ($proc);
-    }
-
-    # EQU directive
-    s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/;
-
-    # Begin macro definition
-    if (/\bMACRO\b/) {
-        $_ = <STDIN>;
-        s/^/.macro/;
-        s/\$//g;                # remove formal param reference
-        s/;/@/g;                # change comment characters
-    }
-
-    # For macros, use \ to reference formal params
-    s/\$/\\/g;                  # End macro definition
-    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
-    next if /^\s*END\s*$/;
-    print;
-    print "$comment_sub$comment\n" if defined $comment;
-}
-
-# Mark that this object doesn't need an executable stack.
-printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n");
diff --git a/third_party/aom/build/make/ads2gas_apple.pl b/third_party/aom/build/make/ads2gas_apple.pl
deleted file mode 100755
index 31ec91d56..000000000
--- a/third_party/aom/build/make/ads2gas_apple.pl
+++ /dev/null
@@ -1,235 +0,0 @@
-#!/usr/bin/env perl
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-# ads2gas_apple.pl
-# Author: Eric Fung (efung (at) acm.org)
-#
-# Convert ARM Developer Suite 1.0.1 syntax assembly source to GNU as format
-#
-# Usage: cat inputfile | perl ads2gas_apple.pl > outputfile
-#
-
-my $chromium = 0;
-
-foreach my $arg (@ARGV) {
-    $chromium = 1 if ($arg eq "-chromium");
-}
-
-print "@ This file was created from a .asm file\n";
-print "@  using the ads2gas_apple.pl script.\n\n";
-print "\t.set WIDE_REFERENCE, 0\n";
-print "\t.set ARCHITECTURE, 5\n";
-print "\t.set DO1STROUNDING, 0\n";
-
-my %register_aliases;
-my %macro_aliases;
-
-my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8", "\$9");
-
-my @incoming_array;
-
-my @imported_functions;
-
-# Perl trim function to remove whitespace from the start and end of the string
-sub trim($)
-{
-    my $string = shift;
-    $string =~ s/^\s+//;
-    $string =~ s/\s+$//;
-    return $string;
-}
-
-while (<STDIN>)
-{
-    # Load and store alignment
-    s/@/,:/g;
-
-    # Comment character
-    s/;/ @/g;
-
-    # Hexadecimal constants prefaced by 0x
-    s/#&/#0x/g;
-
-    # Convert :OR: to |
-    s/:OR:/ | /g;
-
-    # Convert :AND: to &
-    s/:AND:/ & /g;
-
-    # Convert :NOT: to ~
-    s/:NOT:/ ~ /g;
-
-    # Convert :SHL: to <<
-    s/:SHL:/ << /g;
-
-    # Convert :SHR: to >>
-    s/:SHR:/ >> /g;
-
-    # Convert ELSE to .else
-    s/\bELSE\b/.else/g;
-
-    # Convert ENDIF to .endif
-    s/\bENDIF\b/.endif/g;
-
-    # Convert ELSEIF to .elseif
-    s/\bELSEIF\b/.elseif/g;
-
-    # Convert LTORG to .ltorg
-    s/\bLTORG\b/.ltorg/g;
-
-    # Convert IF :DEF:to .if
-    # gcc doesn't have the ability to do a conditional
-    # if defined variable that is set by IF :DEF: on
-    # armasm, so convert it to a normal .if and then
-    # make sure to define a value elesewhere
-    if (s/\bIF :DEF:\b/.if /g)
-    {
-        s/=/==/g;
-    }
-
-    # Convert IF to .if
-    if (s/\bIF\b/.if/g)
-    {
-        s/=/==/g;
-    }
-
-    # Convert INCLUDE to .INCLUDE "file"
-    s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/;
-
-    # Code directive (ARM vs Thumb)
-    s/CODE([0-9][0-9])/.code $1/;
-
-    # No AREA required
-    # But ALIGNs in AREA must be obeyed
-    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
-    # If no ALIGN, strip the AREA and align to 4 bytes
-    s/^\s*AREA.*$/.text\n.p2align 2/;
-
-    # DCD to .word
-    # This one is for incoming symbols
-    s/DCD\s+\|(\w*)\|/.long $1/;
-
-    # DCW to .short
-    s/DCW\s+\|(\w*)\|/.short $1/;
-    s/DCW(.*)/.short $1/;
-
-    # Constants defined in scope
-    s/DCD(.*)/.long $1/;
-    s/DCB(.*)/.byte $1/;
-
-    # Build a hash of all the register - alias pairs.
-    if (s/(.*)RN(.*)/$1 .req $2/g)
-    {
-        $register_aliases{trim($1)} = trim($2);
-        next;
-    }
-
-    while (($key, $value) = each(%register_aliases))
-    {
-        s/\b$key\b/$value/g;
-    }
-
-    # Make function visible to linker, and make additional symbol with
-    # prepended underscore
-    s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
-
-    # Prepend imported functions with _
-    if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/)
-    {
-        $function = trim($1);
-        push(@imported_functions, $function);
-    }
-
-    foreach $function (@imported_functions)
-    {
-        s/$function/_$function/;
-    }
-
-    # No vertical bars required; make additional symbol with prepended
-    # underscore
-    s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
-
-    # Labels need trailing colon
-#   s/^(\w+)/$1:/ if !/EQU/;
-    # put the colon at the end of the line in the macro
-    s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
-
-    # ALIGN directive
-    s/\bALIGN\b/.balign/g;
-
-    # Strip ARM
-    s/\sARM/@ ARM/g;
-
-    # Strip REQUIRE8
-    #s/\sREQUIRE8/@ REQUIRE8/g;
-    s/\sREQUIRE8/@ /g;
-
-    # Strip PRESERVE8
-    s/\sPRESERVE8/@ PRESERVE8/g;
-
-    # Strip PROC and ENDPROC
-    s/\bPROC\b/@/g;
-    s/\bENDP\b/@/g;
-
-    # EQU directive
-    s/(.*)EQU(.*)/.set $1, $2/;
-
-    # Begin macro definition
-    if (/\bMACRO\b/)
-    {
-        # Process next line down, which will be the macro definition
-        $_ = <STDIN>;
-
-        $trimmed = trim($_);
-
-        # remove commas that are separating list
-        $trimmed =~ s/,//g;
-
-        # string to array
-        @incoming_array = split(/\s+/, $trimmed);
-
-        print ".macro @incoming_array[0]\n";
-
-        # remove the first element, as that is the name of the macro
-        shift (@incoming_array);
-
-        @macro_aliases{@incoming_array} = @mapping_list;
-
-        next;
-    }
-
-    while (($key, $value) = each(%macro_aliases))
-    {
-        $key =~ s/\$/\\\$/;
-        s/$key\b/$value/g;
-    }
-
-    # For macros, use \ to reference formal params
-#   s/\$/\\/g;                  # End macro definition
-    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
-    next if /^\s*END\s*$/;
-
-    # Clang used by Chromium differs slightly from clang in XCode in what it
-    # will accept in the assembly.
-    if ($chromium) {
-        s/qsubaddx/qsax/i;
-        s/qaddsubx/qasx/i;
-        s/ldrneb/ldrbne/i;
-        s/ldrneh/ldrhne/i;
-        s/(vqshrun\.s16 .*, \#)0$/${1}8/i;
-
-        # http://llvm.org/bugs/show_bug.cgi?id=16022
-        s/\.include/#include/;
-    }
-
-    print;
-}
diff --git a/third_party/aom/build/make/armlink_adapter.sh b/third_party/aom/build/make/armlink_adapter.sh
deleted file mode 100755
index 85c6c96c1..000000000
--- a/third_party/aom/build/make/armlink_adapter.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/sh
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-verbose=0
-set -- $*
-for i; do
-    if [ "$i" = "-o" ]; then
-        on_of=1
-    elif [ "$i" = "-v" ]; then
-        verbose=1
-    elif [ "$i" = "-g" ]; then
-        args="${args} --debug"
-    elif [ "$on_of" = "1" ]; then
-        outfile=$i
-        on_of=0
-    elif [ -f "$i" ]; then
-        infiles="$infiles $i"
-    elif [ "${i#-l}" != "$i" ]; then
-        libs="$libs ${i#-l}"
-    elif [ "${i#-L}" != "$i" ]; then
-        libpaths="${libpaths} ${i#-L}"
-    else
-        args="${args} ${i}"
-    fi
-    shift
-done
-
-# Absolutize library file names
-for f in $libs; do
-    found=0
-    for d in $libpaths; do
-        [ -f "$d/$f" ] && infiles="$infiles $d/$f" && found=1 && break
-        [ -f "$d/lib${f}.so" ] && infiles="$infiles $d/lib${f}.so" && found=1 && break
-        [ -f "$d/lib${f}.a" ] && infiles="$infiles $d/lib${f}.a" && found=1 && break
-    done
-    [ $found -eq 0 ] && infiles="$infiles $f"
-done
-for d in $libpaths; do
-    [ -n "$libsearchpath" ] && libsearchpath="${libsearchpath},"
-    libsearchpath="${libsearchpath}$d"
-done
-
-cmd="armlink $args --userlibpath=$libsearchpath --output=$outfile $infiles"
-[ $verbose -eq 1 ] && echo $cmd
-$cmd
diff --git a/third_party/aom/build/make/configure.sh b/third_party/aom/build/make/configure.sh
deleted file mode 100644
index b18173b82..000000000
--- a/third_party/aom/build/make/configure.sh
+++ /dev/null
@@ -1,1522 +0,0 @@
-#!/bin/sh
-##
-##  configure.sh
-##
-##  This script is sourced by the main configure script and contains
-##  utility functions and other common bits that aren't strictly libaom
-##  related.
-##
-##  This build system is based in part on the FFmpeg configure script.
-##
-
-
-#
-# Logging / Output Functions
-#
-die_unknown(){
-  echo "Unknown option \"$1\"."
-  echo "See $0 --help for available options."
-  clean_temp_files
-  exit 1
-}
-
-die() {
-  echo "$@"
-  echo
-  echo "Configuration failed. This could reflect a misconfiguration of your"
-  echo "toolchains, improper options selected, or another problem. If you"
-  echo "don't see any useful error messages above, the next step is to look"
-  echo "at the configure error log file ($logfile) to determine what"
-  echo "configure was trying to do when it died."
-  clean_temp_files
-  exit 1
-}
-
-log(){
-  echo "$@" >>$logfile
-}
-
-log_file(){
-  log BEGIN $1
-  cat -n $1 >>$logfile
-  log END $1
-}
-
-log_echo() {
-  echo "$@"
-  log "$@"
-}
-
-fwrite () {
-  outfile=$1
-  shift
-  echo "$@" >> ${outfile}
-}
-
-show_help_pre(){
-  for opt in ${CMDLINE_SELECT}; do
-    opt2=`echo $opt | sed -e 's;_;-;g'`
-    if enabled $opt; then
-      eval "toggle_${opt}=\"--disable-${opt2}\""
-    else
-      eval "toggle_${opt}=\"--enable-${opt2} \""
-    fi
-  done
-
-  cat <<EOF
-Usage: configure [options]
-Options:
-
-Build options:
-  --help                      print this message
-  --log=yes|no|FILE           file configure log is written to [config.log]
-  --target=TARGET             target platform tuple [generic-gnu]
-  --cpu=CPU                   optimize for a specific cpu rather than a family
-  --extra-cflags=ECFLAGS      add ECFLAGS to CFLAGS [$CFLAGS]
-  --extra-cxxflags=ECXXFLAGS  add ECXXFLAGS to CXXFLAGS [$CXXFLAGS]
-  ${toggle_extra_warnings}    emit harmless warnings (always non-fatal)
-  ${toggle_werror}            treat warnings as errors, if possible
-                              (not available with all compilers)
-  ${toggle_optimizations}     turn on/off compiler optimization flags
-  ${toggle_pic}               turn on/off Position Independent Code
-  ${toggle_ccache}            turn on/off compiler cache
-  ${toggle_debug}             enable/disable debug mode
-  ${toggle_gprof}             enable/disable gprof profiling instrumentation
-  ${toggle_gcov}              enable/disable gcov coverage instrumentation
-  ${toggle_thumb}             enable/disable building arm assembly in thumb mode
-  ${toggle_dependency_tracking}
-                              disable to speed up one-time build
-
-Install options:
-  ${toggle_install_docs}      control whether docs are installed
-  ${toggle_install_bins}      control whether binaries are installed
-  ${toggle_install_libs}      control whether libraries are installed
-  ${toggle_install_srcs}      control whether sources are installed
-
-
-EOF
-}
-
-show_help_post(){
-  cat <<EOF
-
-
-NOTES:
-    Object files are built at the place where configure is launched.
-
-    All boolean options can be negated. The default value is the opposite
-    of that shown above. If the option --disable-foo is listed, then
-    the default value for foo is enabled.
-
-Supported targets:
-EOF
-  show_targets ${all_platforms}
-  echo
-  exit 1
-}
-
-show_targets() {
-  while [ -n "$*" ]; do
-    if [ "${1%%-*}" = "${2%%-*}" ]; then
-      if [ "${2%%-*}" = "${3%%-*}" ]; then
-        printf "    %-24s %-24s %-24s\n" "$1" "$2" "$3"
-        shift; shift; shift
-      else
-        printf "    %-24s %-24s\n" "$1" "$2"
-        shift; shift
-      fi
-    else
-      printf "    %-24s\n" "$1"
-      shift
-    fi
-  done
-}
-
-show_help() {
-  show_help_pre
-  show_help_post
-}
-
-#
-# List Processing Functions
-#
-set_all(){
-  value=$1
-  shift
-  for var in $*; do
-    eval $var=$value
-  done
-}
-
-is_in(){
-  value=$1
-  shift
-  for var in $*; do
-    [ $var = $value ] && return 0
-  done
-  return 1
-}
-
-add_cflags() {
-  CFLAGS="${CFLAGS} $@"
-  CXXFLAGS="${CXXFLAGS} $@"
-}
-
-add_cflags_only() {
-  CFLAGS="${CFLAGS} $@"
-}
-
-add_cxxflags_only() {
-  CXXFLAGS="${CXXFLAGS} $@"
-}
-
-add_ldflags() {
-  LDFLAGS="${LDFLAGS} $@"
-}
-
-add_asflags() {
-  ASFLAGS="${ASFLAGS} $@"
-}
-
-add_extralibs() {
-  extralibs="${extralibs} $@"
-}
-
-#
-# Boolean Manipulation Functions
-#
-
-enable_feature(){
-  set_all yes $*
-}
-
-disable_feature(){
-  set_all no $*
-}
-
-enabled(){
-  eval test "x\$$1" = "xyes"
-}
-
-disabled(){
-  eval test "x\$$1" = "xno"
-}
-
-enable_codec(){
-  enabled "${1}" || echo "  enabling ${1}"
-  enable_feature "${1}"
-
-  is_in "${1}" av1 && enable_feature "${1}_encoder" "${1}_decoder"
-}
-
-disable_codec(){
-  disabled "${1}" || echo "  disabling ${1}"
-  disable_feature "${1}"
-
-  is_in "${1}" av1 && disable_feature "${1}_encoder" "${1}_decoder"
-}
-
-# Iterates through positional parameters, checks to confirm the parameter has
-# not been explicitly (force) disabled, and enables the setting controlled by
-# the parameter when the setting is not disabled.
-# Note: Does NOT alter RTCD generation options ($RTCD_OPTIONS).
-soft_enable() {
-  for var in $*; do
-    if ! disabled $var; then
-      enabled $var || log_echo "  enabling $var"
-      enable_feature $var
-    fi
-  done
-}
-
-# Iterates through positional parameters, checks to confirm the parameter has
-# not been explicitly (force) enabled, and disables the setting controlled by
-# the parameter when the setting is not enabled.
-# Note: Does NOT alter RTCD generation options ($RTCD_OPTIONS).
-soft_disable() {
-  for var in $*; do
-    if ! enabled $var; then
-      disabled $var || log_echo "  disabling $var"
-      disable_feature $var
-    fi
-  done
-}
-
-#
-# Text Processing Functions
-#
-toupper(){
-  echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
-}
-
-tolower(){
-  echo "$@" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz
-}
-
-#
-# Temporary File Functions
-#
-source_path=${0%/*}
-enable_feature source_path_used
-if [ -z "$source_path" ] || [ "$source_path" = "." ]; then
-  source_path="`pwd`"
-  disable_feature source_path_used
-fi
-
-if test ! -z "$TMPDIR" ; then
-  TMPDIRx="${TMPDIR}"
-elif test ! -z "$TEMPDIR" ; then
-  TMPDIRx="${TEMPDIR}"
-else
-  TMPDIRx="/tmp"
-fi
-RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')
-TMP_H="${TMPDIRx}/aom-conf-$$-${RAND}.h"
-TMP_C="${TMPDIRx}/aom-conf-$$-${RAND}.c"
-TMP_CC="${TMPDIRx}/aom-conf-$$-${RAND}.cc"
-TMP_O="${TMPDIRx}/aom-conf-$$-${RAND}.o"
-TMP_X="${TMPDIRx}/aom-conf-$$-${RAND}.x"
-TMP_ASM="${TMPDIRx}/aom-conf-$$-${RAND}.asm"
-
-clean_temp_files() {
-  rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
-  enabled gcov && rm -f ${TMP_C%.c}.gcno ${TMP_CC%.cc}.gcno
-}
-
-#
-# Toolchain Check Functions
-#
-check_cmd() {
-  enabled external_build && return
-  log "$@"
-  "$@" >>${logfile} 2>&1
-}
-
-check_cc() {
-  log check_cc "$@"
-  cat >${TMP_C}
-  log_file ${TMP_C}
-  check_cmd ${CC} ${CFLAGS} "$@" -c -o ${TMP_O} ${TMP_C}
-}
-
-check_cxx() {
-  log check_cxx "$@"
-  cat >${TMP_CC}
-  log_file ${TMP_CC}
-  check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC}
-}
-
-check_cpp() {
-  log check_cpp "$@"
-  cat > ${TMP_C}
-  log_file ${TMP_C}
-  check_cmd ${CC} ${CFLAGS} "$@" -E -o ${TMP_O} ${TMP_C}
-}
-
-check_ld() {
-  log check_ld "$@"
-  check_cc $@ \
-    && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs}
-}
-
-check_header(){
-  log check_header "$@"
-  header=$1
-  shift
-  var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
-  disable_feature $var
-  check_cpp "$@" <<EOF && enable_feature $var
-#include "$header"
-int x;
-EOF
-}
-
-check_cflags() {
- log check_cflags "$@"
- check_cc -Werror "$@" <<EOF
-int x;
-EOF
-}
-
-check_cxxflags() {
-  log check_cxxflags "$@"
-
-  # Catch CFLAGS that trigger CXX warnings
-  case "$CXX" in
-    *c++-analyzer|*clang++|*g++*)
-      check_cxx -Werror "$@" <<EOF
-int x;
-EOF
-      ;;
-    *)
-      check_cxx -Werror "$@" <<EOF
-int x;
-EOF
-      ;;
-    esac
-}
-
-check_add_cflags() {
-  check_cxxflags "$@" && add_cxxflags_only "$@"
-  check_cflags "$@" && add_cflags_only "$@"
-}
-
-check_add_cxxflags() {
-  check_cxxflags "$@" && add_cxxflags_only "$@"
-}
-
-check_add_asflags() {
-  log add_asflags "$@"
-  add_asflags "$@"
-}
-
-check_add_ldflags() {
-  log add_ldflags "$@"
-  add_ldflags "$@"
-}
-
-check_asm_align() {
-  log check_asm_align "$@"
-  cat >${TMP_ASM} <<EOF
-section .rodata
-align 16
-EOF
-  log_file ${TMP_ASM}
-  check_cmd ${AS} ${ASFLAGS} -o ${TMP_O} ${TMP_ASM}
-  readelf -WS ${TMP_O} >${TMP_X}
-  log_file ${TMP_X}
-  if ! grep -q '\.rodata .* 16$' ${TMP_X}; then
-    die "${AS} ${ASFLAGS} does not support section alignment (nasm <=2.08?)"
-  fi
-}
-
-# tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used.
-check_gcc_machine_option() {
-  opt="$1"
-  feature="$2"
-  [ -n "$feature" ] || feature="$opt"
-
-  if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then
-    RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature "
-  else
-    soft_enable "$feature"
-  fi
-}
-
-write_common_config_banner() {
-  print_webm_license config.mk "##" ""
-  echo '# This file automatically generated by configure. Do not edit!' >> config.mk
-  echo "TOOLCHAIN := ${toolchain}" >> config.mk
-
-  case ${toolchain} in
-    *-linux-rvct)
-      echo "ALT_LIBC := ${alt_libc}" >> config.mk
-      ;;
-  esac
-}
-
-write_common_config_targets() {
-  for t in ${all_targets}; do
-    if enabled ${t}; then
-      if enabled child; then
-        fwrite config.mk "ALL_TARGETS += ${t}-${toolchain}"
-      else
-        fwrite config.mk "ALL_TARGETS += ${t}"
-      fi
-    fi
-    true;
-  done
-  true
-}
-
-write_common_target_config_mk() {
-  saved_CC="${CC}"
-  saved_CXX="${CXX}"
-  enabled ccache && CC="ccache ${CC}"
-  enabled ccache && CXX="ccache ${CXX}"
-  print_webm_license $1 "##" ""
-
-  cat >> $1 << EOF
-# This file automatically generated by configure. Do not edit!
-SRC_PATH="$source_path"
-SRC_PATH_BARE=$source_path
-BUILD_PFX=${BUILD_PFX}
-TOOLCHAIN=${toolchain}
-ASM_CONVERSION=${asm_conversion_cmd:-${source_path}/build/make/ads2gas.pl}
-GEN_VCPROJ=${gen_vcproj_cmd}
-MSVS_ARCH_DIR=${msvs_arch_dir}
-
-CC=${CC}
-CXX=${CXX}
-AR=${AR}
-LD=${LD}
-AS=${AS}
-STRIP=${STRIP}
-NM=${NM}
-
-CFLAGS  = ${CFLAGS}
-CXXFLAGS  = ${CXXFLAGS}
-ARFLAGS = crs\$(if \$(quiet),,v)
-LDFLAGS = ${LDFLAGS}
-ASFLAGS = ${ASFLAGS}
-extralibs = ${extralibs}
-AS_SFX    = ${AS_SFX:-.asm}
-EXE_SFX   = ${EXE_SFX}
-VCPROJ_SFX = ${VCPROJ_SFX}
-RTCD_OPTIONS = ${RTCD_OPTIONS}
-WX_CXXFLAGS = ${WX_CXXFLAGS}
-WX_LDFLAGS = ${WX_LDFLAGS}
-EOF
-
-  if enabled rvct; then cat >> $1 << EOF
-fmt_deps = sed -e 's;^__image.axf;\${@:.d=.o} \$@;' #hide
-EOF
-  else cat >> $1 << EOF
-fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\${@:.d=.o} \$@;'
-EOF
-  fi
-
-  print_config_mk ARCH   "${1}" ${ARCH_LIST}
-  print_config_mk HAVE   "${1}" ${HAVE_LIST}
-  print_config_mk CONFIG "${1}" ${CONFIG_LIST}
-  print_config_mk HAVE   "${1}" gnu_strip
-
-  enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
-
-  CC="${saved_CC}"
-  CXX="${saved_CXX}"
-}
-
-write_common_target_config_h() {
-  print_webm_license ${TMP_H} "/*" " */"
-  cat >> ${TMP_H} << EOF
-/* This file automatically generated by configure. Do not edit! */
-#ifndef AOM_CONFIG_H
-#define AOM_CONFIG_H
-#define RESTRICT    ${RESTRICT}
-#define INLINE      ${INLINE}
-EOF
-  print_config_h ARCH   "${TMP_H}" ${ARCH_LIST}
-  print_config_h HAVE   "${TMP_H}" ${HAVE_LIST}
-  print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}
-  print_config_vars_h   "${TMP_H}" ${VAR_LIST}
-  echo "#endif /* AOM_CONFIG_H */" >> ${TMP_H}
-  mkdir -p `dirname "$1"`
-  cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"
-}
-
-process_common_cmdline() {
-  for opt in "$@"; do
-    optval="${opt#*=}"
-    case "$opt" in
-      --child)
-        enable_feature child
-        ;;
-      --log*)
-        logging="$optval"
-        if ! disabled logging ; then
-          enabled logging || logfile="$logging"
-        else
-          logfile=/dev/null
-        fi
-        ;;
-      --target=*)
-        toolchain="${toolchain:-${optval}}"
-        ;;
-      --force-target=*)
-        toolchain="${toolchain:-${optval}}"
-        enable_feature force_toolchain
-        ;;
-      --cpu=*)
-        tune_cpu="$optval"
-        ;;
-      --extra-cflags=*)
-        extra_cflags="${optval}"
-        ;;
-      --extra-cxxflags=*)
-        extra_cxxflags="${optval}"
-        ;;
-      --enable-?*|--disable-?*)
-        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if is_in ${option} ${ARCH_EXT_LIST}; then
-          [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
-        elif [ $action = "disable" ] && ! disabled $option ; then
-          is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
-          log_echo "  disabling $option"
-        elif [ $action = "enable" ] && ! enabled $option ; then
-          is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
-          log_echo "  enabling $option"
-        fi
-        ${action}_feature $option
-        ;;
-      --require-?*)
-        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if is_in ${option} ${ARCH_EXT_LIST}; then
-            RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
-        else
-            die_unknown $opt
-        fi
-        ;;
-      --force-enable-?*|--force-disable-?*)
-        eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'`
-        ${action}_feature $option
-        ;;
-      --libc=*)
-        [ -d "${optval}" ] || die "Not a directory: ${optval}"
-        disable_feature builtin_libc
-        alt_libc="${optval}"
-        ;;
-      --as=*)
-        [ "${optval}" = yasm ] || [ "${optval}" = nasm ] \
-          || [ "${optval}" = auto ] \
-          || die "Must be yasm, nasm or auto: ${optval}"
-        alt_as="${optval}"
-        ;;
-      --size-limit=*)
-        w="${optval%%x*}"
-        h="${optval##*x}"
-        VAR_LIST="DECODE_WIDTH_LIMIT ${w} DECODE_HEIGHT_LIMIT ${h}"
-        [ ${w} -gt 0 ] && [ ${h} -gt 0 ] || die "Invalid size-limit: too small."
-        [ ${w} -lt 65536 ] && [ ${h} -lt 65536 ] \
-            || die "Invalid size-limit: too big."
-        enable_feature size_limit
-        ;;
-      --prefix=*)
-        prefix="${optval}"
-        ;;
-      --libdir=*)
-        libdir="${optval}"
-        ;;
-      --sdk-path=*)
-        [ -d "${optval}" ] || die "Not a directory: ${optval}"
-        sdk_path="${optval}"
-        ;;
-      --libc|--as|--prefix|--libdir|--sdk-path)
-        die "Option ${opt} requires argument"
-        ;;
-      --help|-h)
-        show_help
-        ;;
-      *)
-        die_unknown $opt
-        ;;
-    esac
-  done
-}
-
-process_cmdline() {
-  for opt do
-    optval="${opt#*=}"
-    case "$opt" in
-      *)
-        process_common_cmdline $opt
-        ;;
-    esac
-  done
-}
-
-post_process_common_cmdline() {
-  prefix="${prefix:-/usr/local}"
-  prefix="${prefix%/}"
-  libdir="${libdir:-${prefix}/lib}"
-  libdir="${libdir%/}"
-  if [ "${libdir#${prefix}}" = "${libdir}" ]; then
-    die "Libdir ${libdir} must be a subdirectory of ${prefix}"
-  fi
-}
-
-post_process_cmdline() {
-  true;
-}
-
-setup_gnu_toolchain() {
-  CC=${CC:-${CROSS}gcc}
-  CXX=${CXX:-${CROSS}g++}
-  AR=${AR:-${CROSS}ar}
-  LD=${LD:-${CROSS}${link_with_cc:-ld}}
-  AS=${AS:-${CROSS}as}
-  STRIP=${STRIP:-${CROSS}strip}
-  NM=${NM:-${CROSS}nm}
-  AS_SFX=.s
-  EXE_SFX=
-}
-
-# Reliably find the newest available Darwin SDKs. (Older versions of
-# xcrun don't support --show-sdk-path.)
-show_darwin_sdk_path() {
-  xcrun --sdk $1 --show-sdk-path 2>/dev/null ||
-    xcodebuild -sdk $1 -version Path 2>/dev/null
-}
-
-# Print the major version number of the Darwin SDK specified by $1.
-show_darwin_sdk_major_version() {
-  xcrun --sdk $1 --show-sdk-version 2>/dev/null | cut -d. -f1
-}
-
-# Print the Xcode version.
-show_xcode_version() {
-  xcodebuild -version | head -n1 | cut -d' ' -f2
-}
-
-# Fails when Xcode version is less than 6.3.
-check_xcode_minimum_version() {
-  xcode_major=$(show_xcode_version | cut -f1 -d.)
-  xcode_minor=$(show_xcode_version | cut -f2 -d.)
-  xcode_min_major=6
-  xcode_min_minor=3
-  if [ ${xcode_major} -lt ${xcode_min_major} ]; then
-    return 1
-  fi
-  if [ ${xcode_major} -eq ${xcode_min_major} ] \
-    && [ ${xcode_minor} -lt ${xcode_min_minor} ]; then
-    return 1
-  fi
-}
-
-process_common_toolchain() {
-  case "$toolchain" in
-    *-vs*) ;;
-    *) add_cflags_only -std=c99 ;;
-  esac
-
-  if [ -z "$toolchain" ]; then
-    gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
-
-    # detect tgt_isa
-    case "$gcctarget" in
-      aarch64*)
-        tgt_isa=arm64
-        ;;
-      armv7*-hardfloat* | armv7*-gnueabihf | arm-*-gnueabihf)
-        tgt_isa=armv7
-        float_abi=hard
-        ;;
-      armv7*)
-        tgt_isa=armv7
-        float_abi=softfp
-        ;;
-      *x86_64*|*amd64*)
-        tgt_isa=x86_64
-        ;;
-      *i[3456]86*)
-        tgt_isa=x86
-        ;;
-      *sparc*)
-        tgt_isa=sparc
-        ;;
-    esac
-
-    # detect tgt_os
-    case "$gcctarget" in
-      *darwin10*)
-        tgt_isa=x86_64
-        tgt_os=darwin10
-        ;;
-      *darwin11*)
-        tgt_isa=x86_64
-        tgt_os=darwin11
-        ;;
-      *darwin12*)
-        tgt_isa=x86_64
-        tgt_os=darwin12
-        ;;
-      *darwin13*)
-        tgt_isa=x86_64
-        tgt_os=darwin13
-        ;;
-      *darwin14*)
-        tgt_isa=x86_64
-        tgt_os=darwin14
-        ;;
-      *darwin15*)
-        tgt_isa=x86_64
-        tgt_os=darwin15
-        ;;
-       *darwin16*)
-        tgt_isa=x86_64
-        tgt_os=darwin16
-        ;;
-      x86_64*mingw32*)
-        tgt_os=win64
-        ;;
-      *mingw32*|*cygwin*)
-        [ -z "$tgt_isa" ] && tgt_isa=x86
-        tgt_os=win32
-        ;;
-      *linux*|*bsd*)
-        tgt_os=linux
-        ;;
-      *solaris2.10)
-        tgt_os=solaris
-        ;;
-      *os2*)
-        tgt_os=os2
-        ;;
-    esac
-
-    if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then
-      toolchain=${tgt_isa}-${tgt_os}-gcc
-    fi
-  fi
-
-  toolchain=${toolchain:-generic-gnu}
-
-  is_in ${toolchain} ${all_platforms} || enabled force_toolchain \
-    || die "Unrecognized toolchain '${toolchain}'"
-
-  enabled child || log_echo "Configuring for target '${toolchain}'"
-
-  #
-  # Set up toolchain variables
-  #
-  tgt_isa=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $1}')
-  tgt_os=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $2}')
-  tgt_cc=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $3}')
-
-  # Mark the specific ISA requested as enabled
-  soft_enable ${tgt_isa}
-  enable_feature ${tgt_os}
-  enable_feature ${tgt_cc}
-
-  # Enable the architecture family
-  case ${tgt_isa} in
-    arm*)
-      enable_feature arm
-      ;;
-    mips*)
-      enable_feature mips
-      ;;
-  esac
-
-  # PIC is probably what we want when building shared libs
-  enabled shared && soft_enable pic
-
-  # Minimum iOS version for all target platforms (darwin and iphonesimulator).
-  # Shared library framework builds are only possible on iOS 8 and later.
-  if enabled shared; then
-    IOS_VERSION_OPTIONS="--enable-shared"
-    IOS_VERSION_MIN="8.0"
-  else
-    IOS_VERSION_OPTIONS=""
-    IOS_VERSION_MIN="6.0"
-  fi
-
-  # Handle darwin variants. Newer SDKs allow targeting older
-  # platforms, so use the newest one available.
-  case ${toolchain} in
-    arm*-darwin*)
-      add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
-      iphoneos_sdk_dir="$(show_darwin_sdk_path iphoneos)"
-      if [ -d "${iphoneos_sdk_dir}" ]; then
-        add_cflags  "-isysroot ${iphoneos_sdk_dir}"
-        add_ldflags "-isysroot ${iphoneos_sdk_dir}"
-      fi
-      ;;
-    x86*-darwin*)
-      osx_sdk_dir="$(show_darwin_sdk_path macosx)"
-      if [ -d "${osx_sdk_dir}" ]; then
-        add_cflags  "-isysroot ${osx_sdk_dir}"
-        add_ldflags "-isysroot ${osx_sdk_dir}"
-      fi
-      ;;
-  esac
-
-  case ${toolchain} in
-    *-darwin8-*)
-      add_cflags  "-mmacosx-version-min=10.4"
-      add_ldflags "-mmacosx-version-min=10.4"
-      ;;
-    *-darwin9-*)
-      add_cflags  "-mmacosx-version-min=10.5"
-      add_ldflags "-mmacosx-version-min=10.5"
-      ;;
-    *-darwin10-*)
-      add_cflags  "-mmacosx-version-min=10.6"
-      add_ldflags "-mmacosx-version-min=10.6"
-      ;;
-    *-darwin11-*)
-      add_cflags  "-mmacosx-version-min=10.7"
-      add_ldflags "-mmacosx-version-min=10.7"
-      ;;
-    *-darwin12-*)
-      add_cflags  "-mmacosx-version-min=10.8"
-      add_ldflags "-mmacosx-version-min=10.8"
-      ;;
-    *-darwin13-*)
-      add_cflags  "-mmacosx-version-min=10.9"
-      add_ldflags "-mmacosx-version-min=10.9"
-      ;;
-    *-darwin14-*)
-      add_cflags  "-mmacosx-version-min=10.10"
-      add_ldflags "-mmacosx-version-min=10.10"
-      ;;
-    *-darwin15-*)
-      add_cflags  "-mmacosx-version-min=10.11"
-      add_ldflags "-mmacosx-version-min=10.11"
-      ;;
-    *-darwin16-*)
-      add_cflags  "-mmacosx-version-min=10.12"
-      add_ldflags "-mmacosx-version-min=10.12"
-      ;;
-    *-iphonesimulator-*)
-      add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
-      add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
-      iossim_sdk_dir="$(show_darwin_sdk_path iphonesimulator)"
-      if [ -d "${iossim_sdk_dir}" ]; then
-        add_cflags  "-isysroot ${iossim_sdk_dir}"
-        add_ldflags "-isysroot ${iossim_sdk_dir}"
-      fi
-      ;;
-  esac
-
-  # Handle Solaris variants. Solaris 10 needs -lposix4
-  case ${toolchain} in
-    sparc-solaris-*)
-      add_extralibs -lposix4
-      ;;
-    *-solaris-*)
-      add_extralibs -lposix4
-      ;;
-  esac
-
-  # Process ARM architecture variants
-  case ${toolchain} in
-    arm*)
-      # on arm, isa versions are supersets
-      case ${tgt_isa} in
-        arm64|armv8)
-          soft_enable neon
-          ;;
-        armv7|armv7s)
-          soft_enable neon
-          # Only enable neon_asm when neon is also enabled.
-          enabled neon && soft_enable neon_asm
-          # If someone tries to force it through, die.
-          if disabled neon && enabled neon_asm; then
-            die "Disabling neon while keeping neon-asm is not supported"
-          fi
-      esac
-
-      asm_conversion_cmd="cat"
-
-      case ${tgt_cc} in
-        gcc)
-          link_with_cc=gcc
-          setup_gnu_toolchain
-          arch_int=${tgt_isa##armv}
-          arch_int=${arch_int%%te}
-          check_add_asflags --defsym ARCHITECTURE=${arch_int}
-          tune_cflags="-mtune="
-          if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
-            if [ -z "${float_abi}" ]; then
-              check_cpp <<EOF && float_abi=hard || float_abi=softfp
-#ifndef __ARM_PCS_VFP
-#error "not hardfp"
-#endif
-EOF
-            fi
-            check_add_cflags  -march=armv7-a -mfloat-abi=${float_abi}
-            check_add_asflags -march=armv7-a -mfloat-abi=${float_abi}
-
-            if enabled neon || enabled neon_asm; then
-              check_add_cflags -mfpu=neon #-ftree-vectorize
-              check_add_asflags -mfpu=neon
-            fi
-          elif [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then
-            check_add_cflags -march=armv8-a
-            check_add_asflags -march=armv8-a
-          else
-            check_add_cflags -march=${tgt_isa}
-            check_add_asflags -march=${tgt_isa}
-          fi
-
-          enabled debug && add_asflags -g
-          asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
-          if enabled thumb; then
-            asm_conversion_cmd="$asm_conversion_cmd -thumb"
-            check_add_cflags -mthumb
-            check_add_asflags -mthumb -mimplicit-it=always
-          fi
-          ;;
-        vs*)
-          asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
-          AS_SFX=.s
-          msvs_arch_dir=arm-msvs
-          disable_feature multithread
-          disable_feature unit_tests
-          vs_version=${tgt_cc##vs}
-          # MSVC 2013 doesn't allow doing plain .exe projects for ARM,
-          # only "AppContainerApplication" which requires an AppxManifest.
-          # Therefore disable the examples, just build the library.
-          disable_feature examples
-          ;;
-        rvct)
-          CC=armcc
-          AR=armar
-          AS=armasm
-          LD="${source_path}/build/make/armlink_adapter.sh"
-          STRIP=arm-none-linux-gnueabi-strip
-          NM=arm-none-linux-gnueabi-nm
-          tune_cflags="--cpu="
-          tune_asflags="--cpu="
-          if [ -z "${tune_cpu}" ]; then
-            if [ ${tgt_isa} = "armv7" ]; then
-              if enabled neon || enabled neon_asm
-              then
-                check_add_cflags --fpu=softvfp+vfpv3
-                check_add_asflags --fpu=softvfp+vfpv3
-              fi
-              check_add_cflags --cpu=Cortex-A8
-              check_add_asflags --cpu=Cortex-A8
-            else
-              check_add_cflags --cpu=${tgt_isa##armv}
-              check_add_asflags --cpu=${tgt_isa##armv}
-            fi
-          fi
-          arch_int=${tgt_isa##armv}
-          arch_int=${arch_int%%te}
-          check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
-          enabled debug && add_asflags -g
-          add_cflags --gnu
-          add_cflags --enum_is_int
-          add_cflags --wchar32
-          ;;
-      esac
-
-      case ${tgt_os} in
-        none*)
-          disable_feature multithread
-          disable_feature os_support
-          ;;
-
-        android*)
-          if [ -z "${sdk_path}" ]; then
-            die "Must specify --sdk-path for Android builds."
-          fi
-
-          SDK_PATH=${sdk_path}
-          COMPILER_LOCATION=`find "${SDK_PATH}" \
-                             -name "arm-linux-androideabi-gcc*" -print -quit`
-          TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi-
-          CC=${TOOLCHAIN_PATH}gcc
-          CXX=${TOOLCHAIN_PATH}g++
-          AR=${TOOLCHAIN_PATH}ar
-          LD=${TOOLCHAIN_PATH}gcc
-          AS=${TOOLCHAIN_PATH}as
-          STRIP=${TOOLCHAIN_PATH}strip
-          NM=${TOOLCHAIN_PATH}nm
-
-          if [ -z "${alt_libc}" ]; then
-            alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \
-              awk '{n = split($0,a,"/"); \
-                split(a[n-1],b,"-"); \
-                print $0 " " b[2]}' | \
-                sort -g -k 2 | \
-                awk '{ print $1 }' | tail -1`
-          fi
-
-          if [ -d "${alt_libc}" ]; then
-            add_cflags "--sysroot=${alt_libc}"
-            add_ldflags "--sysroot=${alt_libc}"
-          fi
-
-          # linker flag that routes around a CPU bug in some
-          # Cortex-A8 implementations (NDK Dev Guide)
-          add_ldflags "-Wl,--fix-cortex-a8"
-
-          enable_feature pic
-          soft_enable realtime_only
-          if [ ${tgt_isa} = "armv7" ]; then
-            soft_enable runtime_cpu_detect
-          fi
-          if enabled runtime_cpu_detect; then
-            add_cflags "-I${SDK_PATH}/sources/android/cpufeatures"
-          fi
-          ;;
-
-        darwin*)
-          XCRUN_FIND="xcrun --sdk iphoneos --find"
-          CXX="$(${XCRUN_FIND} clang++)"
-          CC="$(${XCRUN_FIND} clang)"
-          AR="$(${XCRUN_FIND} ar)"
-          AS="$(${XCRUN_FIND} as)"
-          STRIP="$(${XCRUN_FIND} strip)"
-          NM="$(${XCRUN_FIND} nm)"
-          RANLIB="$(${XCRUN_FIND} ranlib)"
-          AS_SFX=.s
-          LD="${CXX:-$(${XCRUN_FIND} ld)}"
-
-          # ASFLAGS is written here instead of using check_add_asflags
-          # because we need to overwrite all of ASFLAGS and purge the
-          # options that were put in above
-          ASFLAGS="-arch ${tgt_isa} -g"
-
-          add_cflags -arch ${tgt_isa}
-          add_ldflags -arch ${tgt_isa}
-
-          alt_libc="$(show_darwin_sdk_path iphoneos)"
-          if [ -d "${alt_libc}" ]; then
-            add_cflags -isysroot ${alt_libc}
-          fi
-
-          if [ "${LD}" = "${CXX}" ]; then
-            add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
-          else
-            add_ldflags -ios_version_min "${IOS_VERSION_MIN}"
-          fi
-
-          for d in lib usr/lib usr/lib/system; do
-            try_dir="${alt_libc}/${d}"
-            [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
-          done
-
-          case ${tgt_isa} in
-            armv7|armv7s|armv8|arm64)
-              if enabled neon && ! check_xcode_minimum_version; then
-                soft_disable neon
-                log_echo "  neon disabled: upgrade Xcode (need v6.3+)."
-                if enabled neon_asm; then
-                  soft_disable neon_asm
-                  log_echo "  neon_asm disabled: upgrade Xcode (need v6.3+)."
-                fi
-              fi
-              ;;
-          esac
-
-          asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
-
-          if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ]; then
-            check_add_cflags -fembed-bitcode
-            check_add_asflags -fembed-bitcode
-            check_add_ldflags -fembed-bitcode
-          fi
-          ;;
-
-        linux*)
-          enable_feature linux
-          if enabled rvct; then
-            # Check if we have CodeSourcery GCC in PATH. Needed for
-            # libraries
-            which arm-none-linux-gnueabi-gcc 2>&- || \
-              die "Couldn't find CodeSourcery GCC from PATH"
-
-            # Use armcc as a linker to enable translation of
-            # some gcc specific options such as -lm and -lpthread.
-            LD="armcc --translate_gcc"
-
-            # create configuration file (uses path to CodeSourcery GCC)
-            armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg
-
-            add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
-            add_asflags --no_hide_all --apcs=/interwork
-            add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
-            enabled pic && add_cflags --apcs=/fpic
-            enabled pic && add_asflags --apcs=/fpic
-            enabled shared && add_cflags --shared
-          fi
-          ;;
-      esac
-      ;;
-    mips*)
-      link_with_cc=gcc
-      setup_gnu_toolchain
-      tune_cflags="-mtune="
-      if enabled dspr2; then
-        check_add_cflags -mips32r2 -mdspr2
-      fi
-
-      if enabled runtime_cpu_detect; then
-        disable_feature runtime_cpu_detect
-      fi
-
-      if [ -n "${tune_cpu}" ]; then
-        case ${tune_cpu} in
-          p5600)
-            check_add_cflags -mips32r5 -mload-store-pairs
-            check_add_cflags -msched-weight -mhard-float -mfp64
-            check_add_asflags -mips32r5 -mhard-float -mfp64
-            check_add_ldflags -mfp64
-            ;;
-          i6400|p6600)
-            check_add_cflags -mips64r6 -mabi=64 -msched-weight
-            check_add_cflags  -mload-store-pairs -mhard-float -mfp64
-            check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
-            check_add_ldflags -mips64r6 -mabi=64 -mfp64
-            ;;
-        esac
-
-        if enabled msa; then
-          add_cflags -mmsa
-          add_asflags -mmsa
-          add_ldflags -mmsa
-        fi
-      fi
-
-      check_add_cflags -march=${tgt_isa}
-      check_add_asflags -march=${tgt_isa}
-      check_add_asflags -KPIC
-      ;;
-    x86*)
-      case  ${tgt_os} in
-        win*)
-          enabled gcc && add_cflags -fno-common
-          ;;
-        solaris*)
-          CC=${CC:-${CROSS}gcc}
-          CXX=${CXX:-${CROSS}g++}
-          LD=${LD:-${CROSS}gcc}
-          CROSS=${CROSS-g}
-          ;;
-        os2)
-          disable_feature pic
-          AS=${AS:-nasm}
-          add_ldflags -Zhigh-mem
-          ;;
-      esac
-
-      AS="${alt_as:-${AS:-auto}}"
-      case  ${tgt_cc} in
-        icc*)
-          CC=${CC:-icc}
-          LD=${LD:-icc}
-          setup_gnu_toolchain
-          add_cflags -use-msasm  # remove -use-msasm too?
-          # add -no-intel-extensions to suppress warning #10237
-          # refer to http://software.intel.com/en-us/forums/topic/280199
-          add_ldflags -i-static -no-intel-extensions
-          enabled x86_64 && add_cflags -ipo -static -O3 -no-prec-div
-          enabled x86_64 && AR=xiar
-          case ${tune_cpu} in
-            atom*)
-              tune_cflags="-x"
-              tune_cpu="SSE3_ATOM"
-              ;;
-            *)
-              tune_cflags="-march="
-              ;;
-          esac
-          ;;
-        gcc*)
-          link_with_cc=gcc
-          tune_cflags="-march="
-          setup_gnu_toolchain
-          #for 32 bit x86 builds, -O3 did not turn on this flag
-          enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
-          ;;
-        vs*)
-          # When building with Microsoft Visual Studio the assembler is
-          # invoked directly. Checking at configure time is unnecessary.
-          # Skip the check by setting AS arbitrarily
-          AS=msvs
-          msvs_arch_dir=x86-msvs
-          vc_version=${tgt_cc##vs}
-          ;;
-      esac
-
-      bits=32
-      enabled x86_64 && bits=64
-      check_cpp <<EOF && bits=x32
-#if !defined(__ILP32__) || !defined(__x86_64__)
-#error "not x32"
-#endif
-EOF
-      case ${tgt_cc} in
-        gcc*)
-          add_cflags -m${bits}
-          add_ldflags -m${bits}
-          ;;
-      esac
-
-      soft_enable runtime_cpu_detect
-      # We can't use 'check_cflags' until the compiler is configured and CC is
-      # populated.
-      for ext in ${ARCH_EXT_LIST_X86}; do
-        # disable higher order extensions to simplify asm dependencies
-        if [ "$disable_exts" = "yes" ]; then
-          if ! disabled $ext; then
-            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
-            disable_feature $ext
-          fi
-        elif disabled $ext; then
-          disable_exts="yes"
-        else
-          # use the shortened version for the flag: sse4_1 -> sse4
-          check_gcc_machine_option ${ext%_*} $ext
-        fi
-      done
-
-      if enabled external_build; then
-        log_echo "  skipping assembler detection"
-      else
-        case "${AS}" in
-          auto|"")
-            which nasm >/dev/null 2>&1 && AS=nasm
-            which yasm >/dev/null 2>&1 && AS=yasm
-            if [ "${AS}" = nasm ] ; then
-              # Apple ships version 0.98 of nasm through at least Xcode 6. Revisit
-              # this check if they start shipping a compatible version.
-              apple=`nasm -v | grep "Apple"`
-              [ -n "${apple}" ] \
-                && echo "Unsupported version of nasm: ${apple}" \
-                && AS=""
-            fi
-            [ "${AS}" = auto ] || [ -z "${AS}" ] \
-              && die "Neither yasm nor nasm have been found." \
-                     "See the prerequisites section in the README for more info."
-            ;;
-        esac
-        log_echo "  using $AS"
-      fi
-      [ "${AS##*/}" = nasm ] && add_asflags -Ox
-      AS_SFX=.asm
-      case  ${tgt_os} in
-        win32)
-          add_asflags -f win32
-          enabled debug && add_asflags -g cv8
-          EXE_SFX=.exe
-          ;;
-        win64)
-          add_asflags -f x64
-          enabled debug && add_asflags -g cv8
-          EXE_SFX=.exe
-          ;;
-        linux*|solaris*|android*)
-          add_asflags -f elf${bits}
-          enabled debug && [ "${AS}" = yasm ] && add_asflags -g dwarf2
-          enabled debug && [ "${AS}" = nasm ] && add_asflags -g
-          [ "${AS##*/}" = nasm ] && check_asm_align
-          ;;
-        darwin*)
-          add_asflags -f macho${bits}
-          enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64"
-          add_cflags  ${darwin_arch}
-          add_ldflags ${darwin_arch}
-          # -mdynamic-no-pic is still a bit of voodoo -- it was required at
-          # one time, but does not seem to be now, and it breaks some of the
-          # code that still relies on inline assembly.
-          # enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
-          enabled icc && ! enabled pic && add_cflags -fno-pic
-          ;;
-        iphonesimulator)
-          add_asflags -f macho${bits}
-          enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64"
-          add_cflags  ${sim_arch}
-          add_ldflags ${sim_arch}
-
-          if [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then
-            # yasm v1.3.0 doesn't know what -fembed-bitcode means, so turning it
-            # on is pointless (unless building a C-only lib). Warn the user, but
-            # do nothing here.
-            log "Warning: Bitcode embed disabled for simulator targets."
-          fi
-          ;;
-        os2)
-          add_asflags -f aout
-          enabled debug && add_asflags -g
-          EXE_SFX=.exe
-          ;;
-        *)
-          log "Warning: Unknown os $tgt_os while setting up $AS flags"
-          ;;
-      esac
-      ;;
-    *-gcc|generic-gnu)
-      link_with_cc=gcc
-      enable_feature gcc
-      setup_gnu_toolchain
-      ;;
-  esac
-
-  # Try to enable CPU specific tuning
-  if [ -n "${tune_cpu}" ]; then
-    if [ -n "${tune_cflags}" ]; then
-      check_add_cflags ${tune_cflags}${tune_cpu} || \
-        die "Requested CPU '${tune_cpu}' not supported by compiler"
-    fi
-    if [ -n "${tune_asflags}" ]; then
-      check_add_asflags ${tune_asflags}${tune_cpu} || \
-        die "Requested CPU '${tune_cpu}' not supported by assembler"
-    fi
-    if [ -z "${tune_cflags}${tune_asflags}" ]; then
-      log_echo "Warning: CPU tuning not supported by this toolchain"
-    fi
-  fi
-
-  if enabled debug; then
-    check_add_cflags -g && check_add_ldflags -g
-  else
-    check_add_cflags -DNDEBUG
-  fi
-
-  enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
-  enabled gcov &&
-    check_add_cflags -fprofile-arcs -ftest-coverage &&
-    check_add_ldflags -fprofile-arcs -ftest-coverage
-
-  if enabled optimizations; then
-    if enabled rvct; then
-      enabled small && check_add_cflags -Ospace || check_add_cflags -Otime
-    else
-      enabled small && check_add_cflags -O2 ||  check_add_cflags -O3
-    fi
-  fi
-
-  # Position Independent Code (PIC) support, for building relocatable
-  # shared objects
-  enabled gcc && enabled pic && check_add_cflags -fPIC
-
-  # Work around longjmp interception on glibc >= 2.11, to improve binary
-  # compatibility. See http://code.google.com/p/webm/issues/detail?id=166
-  enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0
-
-  # Check for strip utility variant
-  ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip
-
-  # Try to determine target endianness
-  check_cc <<EOF
-unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
-EOF
-    [ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |
-        grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian
-
-    # Try to find which inline keywords are supported
-    check_cc <<EOF && INLINE="inline"
-static inline function() {}
-EOF
-
-  # Almost every platform uses pthreads.
-  if enabled multithread; then
-    case ${toolchain} in
-      *-win*-vs*)
-        ;;
-      *-android-gcc)
-        ;;
-      *)
-        check_header pthread.h && add_extralibs -lpthread
-        ;;
-    esac
-  fi
-
-  # only for MIPS platforms
-  case ${toolchain} in
-    mips*)
-      if enabled big_endian; then
-        if enabled dspr2; then
-          echo "dspr2 optimizations are available only for little endian platforms"
-          disable_feature dspr2
-        fi
-        if enabled msa; then
-          echo "msa optimizations are available only for little endian platforms"
-          disable_feature msa
-        fi
-      fi
-      ;;
-  esac
-
-  # glibc needs these
-  if enabled linux || [ "$toolchain" = "generic-gnu" ]; then
-    add_cflags -D_LARGEFILE_SOURCE
-    add_cflags -D_FILE_OFFSET_BITS=64
-  fi
-}
-
-process_toolchain() {
-  process_common_toolchain
-}
-
-print_config_mk() {
-  saved_prefix="${prefix}"
-  prefix=$1
-  makefile=$2
-  shift 2
-  for cfg; do
-    if enabled $cfg; then
-      upname="`toupper $cfg`"
-      echo "${prefix}_${upname}=yes" >> $makefile
-    fi
-  done
-  prefix="${saved_prefix}"
-}
-
-print_config_h() {
-  saved_prefix="${prefix}"
-  prefix=$1
-  header=$2
-  shift 2
-  for cfg; do
-    upname="`toupper $cfg`"
-    if enabled $cfg; then
-      echo "#define ${prefix}_${upname} 1" >> $header
-    else
-      echo "#define ${prefix}_${upname} 0" >> $header
-    fi
-  done
-  prefix="${saved_prefix}"
-}
-
-print_config_vars_h() {
-  header=$1
-  shift
-  while [ $# -gt 0 ]; do
-    upname="`toupper $1`"
-    echo "#define ${upname} $2" >> $header
-    shift 2
-  done
-}
-
-print_webm_license() {
-  saved_prefix="${prefix}"
-  destination=$1
-  prefix="$2"
-  suffix="$3"
-  shift 3
-  cat <<EOF > ${destination}
-${prefix} Copyright (c) 2016, Alliance for Open Media. All rights reserved.${suffix}
-${prefix} ${suffix}
-${prefix} This source code is subject to the terms of the BSD 2 Clause License and${suffix}
-${prefix} the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License${suffix}
-${prefix} was not distributed with this source code in the LICENSE file, you can${suffix}
-${prefix} obtain it at www.aomedia.org/license/software. If the Alliance for Open${suffix}
-${prefix} Media Patent License 1.0 was not distributed with this source code in the${suffix}
-${prefix} PATENTS file, you can obtain it at www.aomedia.org/license/patent.${suffix}
-EOF
-  prefix="${saved_prefix}"
-}
-
-process_targets() {
-  true;
-}
-
-process_detect() {
-  true;
-}
-
-enable_feature logging
-logfile="config.log"
-self=$0
-process() {
-  cmdline_args="$@"
-  process_cmdline "$@"
-  if enabled child; then
-    echo "# ${self} $@" >> ${logfile}
-  else
-    echo "# ${self} $@" > ${logfile}
-  fi
-  post_process_common_cmdline
-  post_process_cmdline
-  process_toolchain
-  process_detect
-  process_targets
-
-  OOT_INSTALLS="${OOT_INSTALLS}"
-  if enabled source_path_used; then
-  # Prepare the PWD for building.
-  for f in ${OOT_INSTALLS}; do
-    install -D "${source_path}/$f" "$f"
-  done
-  fi
-  cp "${source_path}/build/make/Makefile" .
-
-  clean_temp_files
-  true
-}
diff --git a/third_party/aom/build/make/gen_asm_deps.sh b/third_party/aom/build/make/gen_asm_deps.sh
deleted file mode 100755
index c867cc2bf..000000000
--- a/third_party/aom/build/make/gen_asm_deps.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/sh
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-self=$0
-show_help() {
-    echo "usage: $self [options] <srcfile>"
-    echo
-    echo "Generate Makefile dependency information from assembly code source"
-    echo
-    exit 1
-}
-die_unknown(){
-    echo "Unknown option \"$1\"."
-    echo "See $0 --help for available options."
-    exit 1
-}
-for opt do
-    optval="${opt#*=}"
-    case "$opt" in
-    --build-pfx=*) pfx="${optval}"
-    ;;
-    --depfile=*) out="${optval}"
-    ;;
-    -I*) raw_inc_paths="${raw_inc_paths} ${opt}"
-         inc_path="${inc_path} ${opt#-I}"
-    ;;
-    -h|--help) show_help
-    ;;
-    *) [ -f "$opt" ] && srcfile="$opt"
-    ;;
-    esac
-done
-
-[ -n "$srcfile" ] || show_help
-sfx=${sfx:-asm}
-includes=$(LC_ALL=C egrep -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile |
-           perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;")
-#" restore editor state
-for inc in ${includes}; do
-    found_inc_path=
-    for idir in ${inc_path}; do
-        [ -f "${idir}/${inc}" ] && found_inc_path="${idir}" && break
-    done
-    if [ -f `dirname $srcfile`/$inc ]; then
-        # Handle include files in the same directory as the source
-        $self --build-pfx=$pfx --depfile=$out ${raw_inc_paths} `dirname $srcfile`/$inc
-    elif [ -n "${found_inc_path}" ]; then
-        # Handle include files on the include path
-        $self --build-pfx=$pfx --depfile=$out ${raw_inc_paths} "${found_inc_path}/$inc"
-    else
-        # Handle generated includes in the build root (which may not exist yet)
-        echo ${out} ${out%d}o: "${pfx}${inc}"
-    fi
-done
-echo ${out} ${out%d}o: $srcfile
diff --git a/third_party/aom/build/make/gen_msvs_def.sh b/third_party/aom/build/make/gen_msvs_def.sh
deleted file mode 100755
index dbb2674ac..000000000
--- a/third_party/aom/build/make/gen_msvs_def.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-self=$0
-self_basename=${self##*/}
-EOL=$'\n'
-
-show_help() {
-    cat <<EOF
-Usage: ${self_basename} [options] file1 [file2 ...]
-
-This script generates a MSVC module definition file containing a list of symbols
-to export from a DLL. Source files are technically bash scripts (and thus may
-use #comment syntax) but in general, take the form of a list of symbols:
-
-  <kind> symbol1 [symbol2, symbol3, ...]
-
-where <kind> is either 'text' or 'data'
-
-
-Options:
-    --help                      Print this message
-    --out=filename              Write output to a file [stdout]
-    --name=project_name         Name of the library (required)
-EOF
-    exit 1
-}
-
-die() {
-    echo "${self_basename}: $@"
-    exit 1
-}
-
-die_unknown(){
-    echo "Unknown option \"$1\"."
-    echo "See ${self_basename} --help for available options."
-    exit 1
-}
-
-text() {
-    for sym in "$@"; do
-        echo "  $sym" >> ${outfile}
-    done
-}
-
-data() {
-    for sym in "$@"; do
-        printf "  %-40s DATA\n" "$sym" >> ${outfile}
-    done
-}
-
-# Process command line
-for opt in "$@"; do
-    optval="${opt#*=}"
-    case "$opt" in
-    --help|-h) show_help
-    ;;
-    --out=*) outfile="$optval"
-    ;;
-    --name=*) name="${optval}"
-    ;;
-     -*) die_unknown $opt
-    ;;
-    *) file_list[${#file_list[@]}]="$opt"
-    esac
-done
-outfile=${outfile:-/dev/stdout}
-[ -n "$name" ] || die "Library name (--name) must be specified!"
-
-echo "LIBRARY ${name}" > ${outfile}
-echo "EXPORTS" >> ${outfile}
-for f in "${file_list[@]}"; do
-    . $f
-done
diff --git a/third_party/aom/build/make/gen_msvs_sln.sh b/third_party/aom/build/make/gen_msvs_sln.sh
deleted file mode 100755
index 268a81706..000000000
--- a/third_party/aom/build/make/gen_msvs_sln.sh
+++ /dev/null
@@ -1,257 +0,0 @@
-#!/bin/bash
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-self=$0
-self_basename=${self##*/}
-EOL=$'\n'
-EOLDOS=$'\r'
-
-show_help() {
-    cat <<EOF
-Usage: ${self_basename} [options] file1 [file2 ...]
-
-This script generates a Visual Studio solution file from a list of project
-files.
-
-Options:
-    --help                      Print this message
-    --out=outfile               Redirect output to a file
-    --ver=version               Version (12,14,15) of visual studio to generate for
-    --target=isa-os-cc          Target specifier
-EOF
-    exit 1
-}
-
-die() {
-    echo "${self_basename}: $@" >&2
-    [ -f "${outfile}" ] && rm -f ${outfile}{,.mk}
-    exit 1
-}
-
-die_unknown(){
-    echo "Unknown option \"$1\"." >&2
-    echo "See ${self_basename} --help for available options." >&2
-    [ -f "${outfile}" ] && rm -f ${outfile}{,.mk}
-    exit 1
-}
-
-indent1=$'\t'
-indent=""
-indent_push() {
-    indent="${indent}${indent1}"
-}
-indent_pop() {
-    indent="${indent%${indent1}}"
-}
-
-parse_project() {
-    local file=$1
-    local name=`grep RootNamespace "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
-    local guid=`grep ProjectGuid "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
-
-    # save the project GUID to a varaible, normalizing to the basename of the
-    # vcxproj file without the extension
-    local var
-    var=${file##*/}
-    var=${var%%.${sfx}}
-    eval "${var}_file=\"$1\""
-    eval "${var}_name=$name"
-    eval "${var}_guid=$guid"
-
-    cur_config_list=`grep -B1 'Label="Configuration"' $file |
-        grep Condition | cut -d\' -f4`
-    new_config_list=$(for i in $config_list $cur_config_list; do
-        echo $i
-    done | sort | uniq)
-    if [ "$config_list" != "" ] && [ "$config_list" != "$new_config_list" ]; then
-        mixed_platforms=1
-    fi
-    config_list="$new_config_list"
-    eval "${var}_config_list=\"$cur_config_list\""
-    proj_list="${proj_list} ${var}"
-}
-
-process_project() {
-    eval "local file=\${$1_file}"
-    eval "local name=\${$1_name}"
-    eval "local guid=\${$1_guid}"
-
-    # save the project GUID to a varaible, normalizing to the basename of the
-    # vcproj file without the extension
-    local var
-    var=${file##*/}
-    var=${var%%.${sfx}}
-    eval "${var}_guid=$guid"
-
-    echo "Project(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"$name\", \"$file\", \"$guid\""
-    echo "EndProject"
-}
-
-process_global() {
-    echo "Global"
-    indent_push
-
-    #
-    # Solution Configuration Platforms
-    #
-    echo "${indent}GlobalSection(SolutionConfigurationPlatforms) = preSolution"
-    indent_push
-    IFS_bak=${IFS}
-    IFS=$'\r'$'\n'
-    if [ "$mixed_platforms" != "" ]; then
-        config_list="
-Release|Mixed Platforms
-Debug|Mixed Platforms"
-    fi
-    for config in ${config_list}; do
-        echo "${indent}$config = $config"
-    done
-    IFS=${IFS_bak}
-    indent_pop
-    echo "${indent}EndGlobalSection"
-
-    #
-    # Project Configuration Platforms
-    #
-    echo "${indent}GlobalSection(ProjectConfigurationPlatforms) = postSolution"
-    indent_push
-    for proj in ${proj_list}; do
-        eval "local proj_guid=\${${proj}_guid}"
-        eval "local proj_config_list=\${${proj}_config_list}"
-        IFS=$'\r'$'\n'
-        for config in ${proj_config_list}; do
-            if [ "$mixed_platforms" != "" ]; then
-                local c=${config%%|*}
-                echo "${indent}${proj_guid}.${c}|Mixed Platforms.ActiveCfg = ${config}"
-                echo "${indent}${proj_guid}.${c}|Mixed Platforms.Build.0 = ${config}"
-            else
-                echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}"
-                echo "${indent}${proj_guid}.${config}.Build.0 = ${config}"
-            fi
-
-        done
-        IFS=${IFS_bak}
-    done
-    indent_pop
-    echo "${indent}EndGlobalSection"
-
-    #
-    # Solution Properties
-    #
-    echo "${indent}GlobalSection(SolutionProperties) = preSolution"
-    indent_push
-    echo "${indent}HideSolutionNode = FALSE"
-    indent_pop
-    echo "${indent}EndGlobalSection"
-
-    indent_pop
-    echo "EndGlobal"
-}
-
-process_makefile() {
-    IFS_bak=${IFS}
-    IFS=$'\r'$'\n'
-    local TAB=$'\t'
-    cat <<EOF
-MSBUILD_TOOL := msbuild.exe
-found_devenv := \$(shell which \$(MSBUILD_TOOL) >/dev/null 2>&1 && echo yes)
-.nodevenv.once:
-${TAB}@echo "  * \$(MSBUILD_TOOL) not found in path."
-${TAB}@echo "  * "
-${TAB}@echo "  * You will have to build all configurations manually using the"
-${TAB}@echo "  * Visual Studio IDE. To allow make to build them automatically,"
-${TAB}@echo "  * add the Common7/IDE directory of your Visual Studio"
-${TAB}@echo "  * installation to your path, eg:"
-${TAB}@echo "  *   C:\Program Files\Microsoft Visual Studio 12.0\Common7\IDE"
-${TAB}@echo "  * "
-${TAB}@touch \$@
-CLEAN-OBJS += \$(if \$(found_devenv),,.nodevenv.once)
-
-EOF
-
-    for sln_config in ${config_list}; do
-        local config=${sln_config%%|*}
-        local platform=${sln_config##*|}
-        local nows_sln_config=`echo $sln_config | sed -e 's/[^a-zA-Z0-9]/_/g'`
-        cat <<EOF
-BUILD_TARGETS += \$(if \$(NO_LAUNCH_DEVENV),,$nows_sln_config)
-clean::
-${TAB}rm -rf "$platform"/"$config"
-.PHONY: $nows_sln_config
-ifneq (\$(found_devenv),)
-$nows_sln_config: $outfile
-${TAB}\$(MSBUILD_TOOL) $outfile -m -t:Build \\
-${TAB}${TAB}-p:Configuration="$config" -p:Platform="$platform"
-else
-$nows_sln_config: $outfile .nodevenv.once
-${TAB}@echo "  * Skipping build of $sln_config (\$(MSBUILD_TOOL) not in path)."
-${TAB}@echo "  * "
-endif
-
-EOF
-    done
-    IFS=${IFS_bak}
-}
-
-# Process command line
-outfile=/dev/stdout
-for opt in "$@"; do
-    optval="${opt#*=}"
-    case "$opt" in
-    --help|-h) show_help
-    ;;
-    --out=*) outfile="${optval}"; mkoutfile="${optval}".mk
-    ;;
-    --dep=*) eval "${optval%%:*}_deps=\"\${${optval%%:*}_deps} ${optval##*:}\""
-    ;;
-    --ver=*) vs_ver="$optval"
-             case $optval in
-             12|14|15)
-             ;;
-             *) die Unrecognized Visual Studio Version in $opt
-             ;;
-             esac
-    ;;
-    --target=*) target="${optval}"
-    ;;
-    -*) die_unknown $opt
-    ;;
-    *) file_list[${#file_list[@]}]="$opt"
-    esac
-done
-outfile=${outfile:-/dev/stdout}
-mkoutfile=${mkoutfile:-/dev/stdout}
-case "${vs_ver:-12}" in
-    12) sln_vers="12.00"
-       sln_vers_str="Visual Studio 2013"
-    ;;
-    14) sln_vers="12.00"
-       sln_vers_str="Visual Studio 2015"
-    ;;
-    15) sln_vers="12.00"
-       sln_vers_str="Visual Studio 2017"
-    ;;
-esac
-sfx=vcxproj
-
-for f in "${file_list[@]}"; do
-    parse_project $f
-done
-cat  >${outfile} <<EOF
-Microsoft Visual Studio Solution File, Format Version $sln_vers${EOLDOS}
-# $sln_vers_str${EOLDOS}
-EOF
-for proj in ${proj_list}; do
-    process_project $proj >>${outfile}
-done
-process_global >>${outfile}
-process_makefile >${mkoutfile}
diff --git a/third_party/aom/build/make/gen_msvs_vcxproj.sh b/third_party/aom/build/make/gen_msvs_vcxproj.sh
deleted file mode 100755
index a119b07aa..000000000
--- a/third_party/aom/build/make/gen_msvs_vcxproj.sh
+++ /dev/null
@@ -1,477 +0,0 @@
-#!/bin/bash
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-self=$0
-self_basename=${self##*/}
-self_dirname=$(dirname "$0")
-
-. "$self_dirname/msvs_common.sh"|| exit 127
-
-show_help() {
-    cat <<EOF
-Usage: ${self_basename} --name=projname [options] file1 [file2 ...]
-
-This script generates a Visual Studio project file from a list of source
-code files.
-
-Options:
-    --help                      Print this message
-    --exe                       Generate a project for building an Application
-    --lib                       Generate a project for creating a static library
-    --dll                       Generate a project for creating a dll
-    --static-crt                Use the static C runtime (/MT)
-    --enable-werror             Treat warnings as errors (/WX)
-    --target=isa-os-cc          Target specifier (required)
-    --out=filename              Write output to a file [stdout]
-    --name=project_name         Name of the project (required)
-    --proj-guid=GUID            GUID to use for the project
-    --module-def=filename       File containing export definitions (for DLLs)
-    --ver=version               Version (12,14,15) of visual studio to generate for
-    --src-path-bare=dir         Path to root of source tree
-    -Ipath/to/include           Additional include directories
-    -DFLAG[=value]              Preprocessor macros to define
-    -Lpath/to/lib               Additional library search paths
-    -llibname                   Library to link against
-EOF
-    exit 1
-}
-
-tag_content() {
-    local tag=$1
-    local content=$2
-    shift
-    shift
-    if [ $# -ne 0 ]; then
-        echo "${indent}<${tag}"
-        indent_push
-        tag_attributes "$@"
-        echo "${indent}>${content}</${tag}>"
-        indent_pop
-    else
-        echo "${indent}<${tag}>${content}</${tag}>"
-    fi
-}
-
-generate_filter() {
-    local name=$1
-    local pats=$2
-    local file_list_sz
-    local i
-    local f
-    local saveIFS="$IFS"
-    local pack
-    echo "generating filter '$name' from ${#file_list[@]} files" >&2
-    IFS=*
-
-    file_list_sz=${#file_list[@]}
-    for i in ${!file_list[@]}; do
-        f=${file_list[i]}
-        for pat in ${pats//;/$IFS}; do
-            if [ "${f##*.}" == "$pat" ]; then
-                unset file_list[i]
-
-                objf=$(echo ${f%.*}.obj \
-                       | sed -e "s,$src_path_bare,," \
-                             -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
-
-                if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then
-                    # Avoid object file name collisions, i.e. aom_config.c and
-                    # aom_config.asm produce the same object file without
-                    # this additional suffix.
-                    objf=${objf%.obj}_asm.obj
-                    open_tag CustomBuild \
-                        Include="$f"
-                    for plat in "${platforms[@]}"; do
-                        for cfg in Debug Release; do
-                            tag_content Message "Assembling %(Filename)%(Extension)" \
-                                Condition="'\$(Configuration)|\$(Platform)'=='$cfg|$plat'"
-                            tag_content Command "$(eval echo \$asm_${cfg}_cmdline) -o \$(IntDir)$objf" \
-                                Condition="'\$(Configuration)|\$(Platform)'=='$cfg|$plat'"
-                            tag_content Outputs "\$(IntDir)$objf" \
-                                Condition="'\$(Configuration)|\$(Platform)'=='$cfg|$plat'"
-                        done
-                    done
-                    close_tag CustomBuild
-                elif [ "$pat" == "c" ] || \
-                     [ "$pat" == "cc" ] || [ "$pat" == "cpp" ]; then
-                    open_tag ClCompile \
-                        Include="$f"
-                    # Separate file names with Condition?
-                    tag_content ObjectFileName "\$(IntDir)$objf"
-                    # Check for AVX and turn it on to avoid warnings.
-                    if [[ $f =~ avx.?\.c$ ]]; then
-                        tag_content AdditionalOptions "/arch:AVX"
-                    fi
-                    close_tag ClCompile
-                elif [ "$pat" == "h" ] ; then
-                    tag ClInclude \
-                        Include="$f"
-                elif [ "$pat" == "vcxproj" ] ; then
-                    open_tag ProjectReference \
-                        Include="$f"
-                    depguid=`grep ProjectGuid "$f" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
-                    tag_content Project "$depguid"
-                    tag_content ReferenceOutputAssembly false
-                    close_tag ProjectReference
-                else
-                    tag None \
-                        Include="$f"
-                fi
-
-                break
-            fi
-        done
-    done
-
-    IFS="$saveIFS"
-}
-
-# Process command line
-unset target
-for opt in "$@"; do
-    optval="${opt#*=}"
-    case "$opt" in
-        --help|-h) show_help
-        ;;
-        --target=*) target="${optval}"
-        ;;
-        --out=*) outfile="$optval"
-        ;;
-        --name=*) name="${optval}"
-        ;;
-        --proj-guid=*) guid="${optval}"
-        ;;
-        --module-def=*) module_def="${optval}"
-        ;;
-        --exe) proj_kind="exe"
-        ;;
-        --dll) proj_kind="dll"
-        ;;
-        --lib) proj_kind="lib"
-        ;;
-        --src-path-bare=*)
-            src_path_bare=$(fix_path "$optval")
-            src_path_bare=${src_path_bare%/}
-        ;;
-        --static-crt) use_static_runtime=true
-        ;;
-        --enable-werror) werror=true
-        ;;
-        --ver=*)
-            vs_ver="$optval"
-            case "$optval" in
-                12|14|15)
-                ;;
-                *) die Unrecognized Visual Studio Version in $opt
-                ;;
-            esac
-        ;;
-        -I*)
-            opt=${opt##-I}
-            opt=$(fix_path "$opt")
-            opt="${opt%/}"
-            incs="${incs}${incs:+;}&quot;${opt}&quot;"
-            yasmincs="${yasmincs} -I&quot;${opt}&quot;"
-        ;;
-        -D*) defines="${defines}${defines:+;}${opt##-D}"
-        ;;
-        -L*) # fudge . to $(OutDir)
-            if [ "${opt##-L}" == "." ]; then
-                libdirs="${libdirs}${libdirs:+;}&quot;\$(OutDir)&quot;"
-            else
-                 # Also try directories for this platform/configuration
-                 opt=${opt##-L}
-                 opt=$(fix_path "$opt")
-                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}&quot;"
-                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}/\$(PlatformName)/\$(Configuration)&quot;"
-                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}/\$(PlatformName)&quot;"
-            fi
-        ;;
-        -l*) libs="${libs}${libs:+ }${opt##-l}.lib"
-        ;;
-        -*) die_unknown $opt
-        ;;
-        *)
-            # The paths in file_list are fixed outside of the loop.
-            file_list[${#file_list[@]}]="$opt"
-            case "$opt" in
-                 *.asm|*.s) uses_asm=true
-                 ;;
-            esac
-        ;;
-    esac
-done
-
-# Make one call to fix_path for file_list to improve performance.
-fix_file_list file_list
-
-outfile=${outfile:-/dev/stdout}
-guid=${guid:-`generate_uuid`}
-asm_use_custom_step=false
-uses_asm=${uses_asm:-false}
-case "${vs_ver:-12}" in
-    12|14|15)
-       asm_use_custom_step=$uses_asm
-    ;;
-esac
-
-[ -n "$name" ] || die "Project name (--name) must be specified!"
-[ -n "$target" ] || die "Target (--target) must be specified!"
-
-if ${use_static_runtime:-false}; then
-    release_runtime=MultiThreaded
-    debug_runtime=MultiThreadedDebug
-    lib_sfx=mt
-else
-    release_runtime=MultiThreadedDLL
-    debug_runtime=MultiThreadedDebugDLL
-    lib_sfx=md
-fi
-
-# Calculate debug lib names: If a lib ends in ${lib_sfx}.lib, then rename
-# it to ${lib_sfx}d.lib. This precludes linking to release libs from a
-# debug exe, so this may need to be refactored later.
-for lib in ${libs}; do
-    if [ "$lib" != "${lib%${lib_sfx}.lib}" ]; then
-        lib=${lib%.lib}d.lib
-    fi
-    debug_libs="${debug_libs}${debug_libs:+ }${lib}"
-done
-debug_libs=${debug_libs// /;}
-libs=${libs// /;}
-
-
-# List of all platforms supported for this target
-case "$target" in
-    x86_64*)
-        platforms[0]="x64"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
-    ;;
-    x86*)
-        platforms[0]="Win32"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
-    ;;
-    arm*)
-        platforms[0]="ARM"
-        asm_Debug_cmdline="armasm -nologo -oldit &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="armasm -nologo -oldit &quot;%(FullPath)&quot;"
-    ;;
-    *) die "Unsupported target $target!"
-    ;;
-esac
-
-generate_vcxproj() {
-    echo "<?xml version=\"1.0\" encoding=\"utf-8\"?>"
-    open_tag Project \
-        DefaultTargets="Build" \
-        ToolsVersion="4.0" \
-        xmlns="http://schemas.microsoft.com/developer/msbuild/2003" \
-
-    open_tag ItemGroup \
-        Label="ProjectConfigurations"
-    for plat in "${platforms[@]}"; do
-        for config in Debug Release; do
-            open_tag ProjectConfiguration \
-                Include="$config|$plat"
-            tag_content Configuration $config
-            tag_content Platform $plat
-            close_tag ProjectConfiguration
-        done
-    done
-    close_tag ItemGroup
-
-    open_tag PropertyGroup \
-        Label="Globals"
-        tag_content ProjectGuid "{${guid}}"
-        tag_content RootNamespace ${name}
-        tag_content Keyword ManagedCProj
-        if [ "${platforms[0]}" = "ARM" ]; then
-            tag_content AppContainerApplication true
-            # The application type can be one of "Windows Store",
-            # "Windows Phone" or "Windows Phone Silverlight". The
-            # actual value doesn't matter from the libaom point of view,
-            # since a static library built for one works on the others.
-            # The PlatformToolset field needs to be set in sync with this;
-            # for Windows Store and Windows Phone Silverlight it should be
-            # v120 while it should be v120_wp81 if the type is Windows Phone.
-            tag_content ApplicationType "Windows Store"
-            tag_content ApplicationTypeRevision 8.1
-        fi
-    close_tag PropertyGroup
-
-    tag Import \
-        Project="\$(VCTargetsPath)\\Microsoft.Cpp.Default.props"
-
-    for plat in "${platforms[@]}"; do
-        for config in Release Debug; do
-            open_tag PropertyGroup \
-                Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'" \
-                Label="Configuration"
-            if [ "$proj_kind" = "exe" ]; then
-                tag_content ConfigurationType Application
-            elif [ "$proj_kind" = "dll" ]; then
-                tag_content ConfigurationType DynamicLibrary
-            else
-                tag_content ConfigurationType StaticLibrary
-            fi
-            if [ "$vs_ver" = "12" ]; then
-                # Setting a PlatformToolset indicating windows phone isn't
-                # enough to build code for arm with MSVC 2013, one strictly
-                # has to enable AppContainerApplication as well.
-                tag_content PlatformToolset v120
-            fi
-            if [ "$vs_ver" = "14" ]; then
-                tag_content PlatformToolset v140
-            fi
-            if [ "$vs_ver" = "15" ]; then
-                tag_content PlatformToolset v141
-            fi
-            tag_content CharacterSet Unicode
-            if [ "$config" = "Release" ]; then
-                tag_content WholeProgramOptimization true
-            fi
-            close_tag PropertyGroup
-        done
-    done
-
-    tag Import \
-        Project="\$(VCTargetsPath)\\Microsoft.Cpp.props"
-
-    open_tag ImportGroup \
-        Label="PropertySheets"
-        tag Import \
-            Project="\$(UserRootDir)\\Microsoft.Cpp.\$(Platform).user.props" \
-            Condition="exists('\$(UserRootDir)\\Microsoft.Cpp.\$(Platform).user.props')" \
-            Label="LocalAppDataPlatform"
-    close_tag ImportGroup
-
-    tag PropertyGroup \
-        Label="UserMacros"
-
-    for plat in "${platforms[@]}"; do
-        plat_no_ws=`echo $plat | sed 's/[^A-Za-z0-9_]/_/g'`
-        for config in Debug Release; do
-            open_tag PropertyGroup \
-                Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'"
-            tag_content OutDir "\$(SolutionDir)$plat_no_ws\\\$(Configuration)\\"
-            tag_content IntDir "$plat_no_ws\\\$(Configuration)\\${name}\\"
-            if [ "$proj_kind" == "lib" ]; then
-              if [ "$config" == "Debug" ]; then
-                config_suffix=d
-              else
-                config_suffix=""
-              fi
-              tag_content TargetName "${name}${lib_sfx}${config_suffix}"
-            fi
-            close_tag PropertyGroup
-        done
-    done
-
-    for plat in "${platforms[@]}"; do
-        for config in Debug Release; do
-            open_tag ItemDefinitionGroup \
-                Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'"
-            if [ "$name" == "aom" ]; then
-                hostplat=$plat
-                if [ "$hostplat" == "ARM" ]; then
-                    hostplat=Win32
-                fi
-            fi
-            open_tag ClCompile
-            if [ "$config" = "Debug" ]; then
-                opt=Disabled
-                runtime=$debug_runtime
-                curlibs=$debug_libs
-                debug=_DEBUG
-            else
-                opt=MaxSpeed
-                runtime=$release_runtime
-                curlibs=$libs
-                tag_content FavorSizeOrSpeed Speed
-                debug=NDEBUG
-            fi
-            extradefines=";$defines"
-            tag_content Optimization $opt
-            tag_content AdditionalIncludeDirectories "$incs;%(AdditionalIncludeDirectories)"
-            tag_content PreprocessorDefinitions "WIN32;$debug;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE$extradefines;%(PreprocessorDefinitions)"
-            tag_content RuntimeLibrary $runtime
-            tag_content WarningLevel Level3
-            if ${werror:-false}; then
-                tag_content TreatWarningAsError true
-            fi
-            # We need to override the defaults for these settings
-            # if AppContainerApplication is set.
-            tag_content CompileAsWinRT false
-            tag_content PrecompiledHeader NotUsing
-            tag_content SDLCheck false
-            close_tag ClCompile
-            case "$proj_kind" in
-            exe)
-                open_tag Link
-                tag_content GenerateDebugInformation true
-                # Console is the default normally, but if
-                # AppContainerApplication is set, we need to override it.
-                tag_content SubSystem Console
-                close_tag Link
-                ;;
-            dll)
-                open_tag Link
-                tag_content GenerateDebugInformation true
-                tag_content ModuleDefinitionFile $module_def
-                close_tag Link
-                ;;
-            lib)
-                ;;
-            esac
-            close_tag ItemDefinitionGroup
-        done
-
-    done
-
-    open_tag ItemGroup
-    generate_filter "Source Files"   "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s"
-    close_tag ItemGroup
-    open_tag ItemGroup
-    generate_filter "Header Files"   "h;hm;inl;inc;xsd"
-    close_tag ItemGroup
-    open_tag ItemGroup
-    generate_filter "Build Files"    "mk"
-    close_tag ItemGroup
-    open_tag ItemGroup
-    generate_filter "References"     "vcxproj"
-    close_tag ItemGroup
-
-    tag Import \
-        Project="\$(VCTargetsPath)\\Microsoft.Cpp.targets"
-
-    open_tag ImportGroup \
-        Label="ExtensionTargets"
-    close_tag ImportGroup
-
-    close_tag Project
-
-    # This must be done from within the {} subshell
-    echo "Ignored files list (${#file_list[@]} items) is:" >&2
-    for f in "${file_list[@]}"; do
-        echo "    $f" >&2
-    done
-}
-
-# This regexp doesn't catch most of the strings in the vcxproj format,
-# since they're like <tag>path</tag> instead of <tag attr="path" />
-# as previously. It still seems to work ok despite this.
-generate_vcxproj |
-    sed  -e '/"/s;\([^ "]\)/;\1\\;g' |
-    sed  -e '/xmlns/s;\\;/;g' > ${outfile}
-
-exit
diff --git a/third_party/aom/build/make/iosbuild.sh b/third_party/aom/build/make/iosbuild.sh
index ca8214b62..75f0b1b08 100755
--- a/third_party/aom/build/make/iosbuild.sh
+++ b/third_party/aom/build/make/iosbuild.sh
@@ -56,6 +56,7 @@ build_target() {
 
   mkdir "${target}"
   cd "${target}"
+  # TODO(tomfinegan@google.com): switch to cmake.
   eval "${LIBAOM_SOURCE_DIR}/configure" --target="${target}" \
     ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${target_specific_flags} \
     ${devnull}
diff --git a/third_party/aom/build/make/msvs_common.sh b/third_party/aom/build/make/msvs_common.sh
deleted file mode 100644
index 2df27df8d..000000000
--- a/third_party/aom/build/make/msvs_common.sh
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/bin/bash
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \
-   && cygpath --help >/dev/null 2>&1; then
-    FIXPATH='cygpath -m'
-else
-    FIXPATH='echo_path'
-fi
-
-die() {
-    echo "${self_basename}: $@" >&2
-    exit 1
-}
-
-die_unknown(){
-    echo "Unknown option \"$1\"." >&2
-    echo "See ${self_basename} --help for available options." >&2
-    exit 1
-}
-
-echo_path() {
-    for path; do
-        echo "$path"
-    done
-}
-
-# Output one, possibly changed based on the system, path per line.
-fix_path() {
-    $FIXPATH "$@"
-}
-
-# Corrects the paths in file_list in one pass for efficiency.
-# $1 is the name of the array to be modified.
-fix_file_list() {
-    declare -n array_ref=$1
-    files=$(fix_path "${array_ref[@]}")
-    local IFS=$'\n'
-    array_ref=($files)
-}
-
-generate_uuid() {
-    local hex="0123456789ABCDEF"
-    local i
-    local uuid=""
-    local j
-    #93995380-89BD-4b04-88EB-625FBE52EBFB
-    for ((i=0; i<32; i++)); do
-        (( j = $RANDOM % 16 ))
-        uuid="${uuid}${hex:$j:1}"
-    done
-    echo "${uuid:0:8}-${uuid:8:4}-${uuid:12:4}-${uuid:16:4}-${uuid:20:12}"
-}
-
-indent1="    "
-indent=""
-indent_push() {
-    indent="${indent}${indent1}"
-}
-indent_pop() {
-    indent="${indent%${indent1}}"
-}
-
-tag_attributes() {
-    for opt in "$@"; do
-        optval="${opt#*=}"
-        [ -n "${optval}" ] ||
-            die "Missing attribute value in '$opt' while generating $tag tag"
-        echo "${indent}${opt%%=*}=\"${optval}\""
-    done
-}
-
-open_tag() {
-    local tag=$1
-    shift
-    if [ $# -ne 0 ]; then
-        echo "${indent}<${tag}"
-        indent_push
-        tag_attributes "$@"
-        echo "${indent}>"
-    else
-        echo "${indent}<${tag}>"
-        indent_push
-    fi
-}
-
-close_tag() {
-    local tag=$1
-    indent_pop
-    echo "${indent}</${tag}>"
-}
-
-tag() {
-    local tag=$1
-    shift
-    if [ $# -ne 0 ]; then
-        echo "${indent}<${tag}"
-        indent_push
-        tag_attributes "$@"
-        indent_pop
-        echo "${indent}/>"
-    else
-        echo "${indent}<${tag}/>"
-    fi
-}
-
diff --git a/third_party/aom/build/make/rtcd.pl b/third_party/aom/build/make/rtcd.pl
index 7add5a036..8d8be25c0 100755
--- a/third_party/aom/build/make/rtcd.pl
+++ b/third_party/aom/build/make/rtcd.pl
@@ -1,5 +1,14 @@
 #!/usr/bin/env perl
-
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
 no strict 'refs';
 use warnings;
 use Getopt::Long;
@@ -205,6 +214,7 @@ sub filter {
 sub common_top() {
   my $include_guard = uc($opts{sym})."_H_";
   print <<EOF;
+// This file is generated. Do not edit.
 #ifndef ${include_guard}
 #define ${include_guard}
 
@@ -279,15 +289,12 @@ sub arm() {
   # Assign the helper variable for each enabled extension
   foreach my $opt (@ALL_ARCHS) {
     my $opt_uc = uc $opt;
-    # Enable neon assembly based on HAVE_NEON logic instead of adding new
-    # HAVE_NEON_ASM logic
-    if ($opt eq 'neon_asm') { $opt_uc = 'NEON' }
     eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
   }
 
   common_top;
   print <<EOF;
-#include "aom_config.h"
+#include "config/aom_config.h"
 
 #ifdef RTCD_C
 #include "aom_ports/arm.h"
@@ -310,10 +317,17 @@ EOF
 
 sub mips() {
   determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
   common_top;
 
   print <<EOF;
-#include "aom_config.h"
+#include "config/aom_config.h"
 
 #ifdef RTCD_C
 static void setup_rtcd_internal(void)
@@ -333,11 +347,44 @@ EOF
   common_bottom;
 }
 
+sub ppc() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+
+  print <<EOF;
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+#include "aom_ports/ppc.h"
+static void setup_rtcd_internal(void)
+{
+  int flags = ppc_simd_caps();
+
+  (void)flags;
+
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
 sub unoptimized() {
   determine_indirection "c";
   common_top;
   print <<EOF;
-#include "aom_config.h"
+#include "config/aom_config.h"
 
 #ifdef RTCD_C
 static void setup_rtcd_internal(void)
@@ -359,10 +406,10 @@ EOF
 
 &require("c");
 if ($opts{arch} eq 'x86') {
-  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
   x86;
 } elsif ($opts{arch} eq 'x86_64') {
-  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
   @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
   &require(@REQUIRES);
   x86;
@@ -383,12 +430,14 @@ if ($opts{arch} eq 'x86') {
   close CONFIG_FILE;
   mips;
 } elsif ($opts{arch} =~ /armv7\w?/) {
-  @ALL_ARCHS = filter(qw/neon_asm neon/);
-  &require(@REQUIRES);
+  @ALL_ARCHS = filter(qw/neon/);
   arm;
 } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
   @ALL_ARCHS = filter(qw/neon/);
   arm;
+} elsif ($opts{arch} eq 'ppc') {
+  @ALL_ARCHS = filter(qw/vsx/);
+  ppc;
 } else {
   unoptimized;
 }
diff --git a/third_party/aom/build/make/thumb.pm b/third_party/aom/build/make/thumb.pm
index 8248694e9..0a6629d78 100644
--- a/third_party/aom/build/make/thumb.pm
+++ b/third_party/aom/build/make/thumb.pm
@@ -55,13 +55,6 @@ sub FixThumbInstructions($$)
     # "addne r0, r0, r2".
     s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g;
 
-    # Convert a conditional addition to the pc register into a series of
-    # instructions. This converts "addlt pc, pc, r3, lsl #2" into
-    # "itttt lt", "movlt.n r12, pc", "addlt.w r12, #12",
-    # "addlt.w r12, r12, r3, lsl #2", "movlt.n pc, r12".
-    # This assumes that r12 is free at this point.
-    s/^(\s*)addlt(\s+)pc,\s*pc,\s*(\w+),\s*lsl\s*#(\d+)/$1itttt$2lt\n$1movlt.n$2r12, pc\n$1addlt.w$2r12, #12\n$1addlt.w$2r12, r12, $3, lsl #($4-$branch_shift_offset)\n$1movlt.n$2pc, r12/g;
-
     # Convert "mov pc, lr" into "bx lr", since the former only works
     # for switching from arm to thumb (and only in armv7), but not
     # from thumb to arm.
diff --git a/third_party/aom/build/make/version.sh b/third_party/aom/build/make/version.sh
deleted file mode 100755
index 2a7090e4d..000000000
--- a/third_party/aom/build/make/version.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/sh
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-
-for opt in "$@"; do
-    optval="${opt#*=}"
-    case "$opt" in
-    --bare) bare=true ;;
-    *) break ;;
-    esac
-    shift
-done
-source_path=${1:-.}
-out_file=${2}
-id=${3:-VERSION_STRING}
-
-git_version_id=""
-if [ -e "${source_path}/.git" ]; then
-    # Source Path is a git working copy. Check for local modifications.
-    # Note that git submodules may have a file as .git, not a directory.
-    export GIT_DIR="${source_path}/.git"
-    git_version_id=$(git describe --match=v[0-9]* 2>/dev/null)
-fi
-
-changelog_version=""
-for p in "${source_path}" "${source_path}/.."; do
-    if [ -z "$git_version_id" -a -f "${p}/CHANGELOG" ]; then
-        changelog_version=$(grep -m 1 " v[0-9]" "${p}/CHANGELOG" \
-            | awk '{print $2}')
-        changelog_version="${changelog_version}"
-        break
-    fi
-done
-version_str="${changelog_version}${git_version_id}"
-bare_version=${version_str#v}
-major_version=${bare_version%%.*}
-bare_version=${bare_version#*.}
-minor_version=${bare_version%%.*}
-bare_version=${bare_version#*.}
-patch_version=${bare_version%%-*}
-bare_version=${bare_version#${patch_version}}
-extra_version=${bare_version##-}
-
-#since they'll be used as integers below make sure they are or force to 0
-for v in major_version minor_version patch_version; do
-    if eval echo \$$v |grep -E -q '[^[:digit:]]'; then
-        eval $v=0
-    fi
-done
-
-if [ ${bare} ]; then
-    echo "${changelog_version}${git_version_id}" > $$.tmp
-else
-    cat<<EOF>$$.tmp
-#define VERSION_MAJOR  $major_version
-#define VERSION_MINOR  $minor_version
-#define VERSION_PATCH  $patch_version
-#define VERSION_EXTRA  "$extra_version"
-#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define ${id}_NOSP "${version_str}"
-#define ${id}      " ${version_str}"
-EOF
-fi
-if [ -n "$out_file" ]; then
-diff $$.tmp ${out_file} >/dev/null 2>&1 || cat $$.tmp > ${out_file}
-else
-cat $$.tmp
-fi
-rm $$.tmp
diff --git a/third_party/aom/codereview.settings b/third_party/aom/codereview.settings
index 0f3690486..185e9344c 100644
--- a/third_party/aom/codereview.settings
+++ b/third_party/aom/codereview.settings
@@ -1,5 +1,4 @@
-# This file is used by gcl to get repository specific information.
-GERRIT_HOST: aomedia-review.googlesource.com
-GERRIT_PORT: 29418
-GERRIT_SQUASH_UPLOADS: false
+# This file is used by git cl to get repository specific information.
+GERRIT_HOST: True
 CODE_REVIEW_SERVER: aomedia-review.googlesource.com
+GERRIT_SQUASH_UPLOADS: False
diff --git a/third_party/aom/args.c b/third_party/aom/common/args.c
index b9384de70..7131e24de 100644
--- a/third_party/aom/args.c
+++ b/third_party/aom/common/args.c
@@ -9,10 +9,11 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "common/args.h"
+
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
-#include "args.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/msvc.h"
@@ -34,6 +35,65 @@ struct arg arg_init(char **argv) {
   return a;
 }
 
+char *ignore_front_spaces(const char *str) {
+  while (str[0] == ' ' || str[0] == '\t') ++str;
+  return (char *)str;
+}
+
+void ignore_end_spaces(char *str) {
+  char *end = str + strlen(str);
+  while (end > str && (end[0] == ' ' || end[0] == '\t' || end[0] == '\n' ||
+                       end[0] == '\r' || end[0] == '\0'))
+    --end;
+  if (end >= str) end[1] = '\0';
+}
+
+int arg_cfg(int *argc, char ***argv, const char *file) {
+  char **argv_local = (char **)*argv;
+  char **argv_org = (char **)*argv;
+  char line[1024 * 10];
+  FILE *f = fopen(file, "r");
+  if (!f) return 1;
+
+  while (fgets(line, sizeof(line) - 1, f)) {
+    char *actual_line = ignore_front_spaces(line);
+    char *left, *right, *comment;
+    size_t length = strlen(actual_line);
+
+    if (length == 0 || actual_line[0] == '#') continue;
+    right = strchr(actual_line, ':');
+    if (right == NULL) continue;
+    right[0] = '\0';
+
+    left = ignore_front_spaces(actual_line);
+    right = ignore_front_spaces(right + 1);
+
+    comment = strchr(right, '#');
+    if (comment != NULL) comment[0] = '\0';
+
+    ignore_end_spaces(left);
+    ignore_end_spaces(right);
+
+    char **new_args = argv_dup(*argc, (const char **)argv_local);
+    char *new_line = (char *)malloc(sizeof(*new_line) * 128);
+
+    if (argv_local != argv_org) free(argv_local);
+
+    if (!strcmp(right, "ON"))
+      snprintf(new_line, sizeof(*new_line) * 128, "--%s", left);
+    else
+      snprintf(new_line, sizeof(*new_line) * 128, "--%s=%s", left, right);
+
+    new_args[(*argc) - 1] = new_args[(*argc) - 2];
+    new_args[(*argc) - 2] = new_line;
+    argv_local = new_args;
+    *argv = new_args;
+    (*argc)++;
+  }
+  fclose(f);
+  return 0;
+}
+
 int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) {
   struct arg arg;
 
diff --git a/third_party/aom/args.h b/third_party/aom/common/args.h
index c3427bcfa..d12973666 100644
--- a/third_party/aom/args.h
+++ b/third_party/aom/common/args.h
@@ -48,6 +48,9 @@ typedef struct arg_def {
 
 struct arg arg_init(char **argv);
 int arg_match(struct arg *arg_, const struct arg_def *def, char **argv);
+char *ignore_front_spaces(const char *str);
+void ignore_end_spaces(char *str);
+int arg_cfg(int *argc, char ***argv, const char *file);
 const char *arg_next(struct arg *arg);
 void arg_show_usage(FILE *fp, const struct arg_def *const *defs);
 char **argv_dup(int argc, const char **argv);
diff --git a/third_party/aom/ivfdec.c b/third_party/aom/common/ivfdec.c
index fc11b9544..80d73b04c 100644
--- a/third_party/aom/ivfdec.c
+++ b/third_party/aom/common/ivfdec.c
@@ -9,13 +9,14 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "common/ivfdec.h"
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "aom_ports/mem_ops.h"
-
-#include "./ivfdec.h"
+#include "aom_ports/sanitizer.h"
 
 static const char *IVF_SIGNATURE = "DKIF";
 
@@ -61,17 +62,17 @@ int file_is_ivf(struct AvxInputContext *input_ctx) {
 }
 
 int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
-                   size_t *buffer_size) {
+                   size_t *buffer_size, aom_codec_pts_t *pts) {
   char raw_header[IVF_FRAME_HDR_SZ] = { 0 };
   size_t frame_size = 0;
 
   if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
-    if (!feof(infile)) warn("Failed to read frame size\n");
+    if (!feof(infile)) warn("Failed to read frame size");
   } else {
     frame_size = mem_get_le32(raw_header);
 
     if (frame_size > 256 * 1024 * 1024) {
-      warn("Read invalid frame size (%u)\n", (unsigned int)frame_size);
+      warn("Read invalid frame size (%u)", (unsigned int)frame_size);
       frame_size = 0;
     }
 
@@ -82,18 +83,25 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
         *buffer = new_buffer;
         *buffer_size = 2 * frame_size;
       } else {
-        warn("Failed to allocate compressed data buffer\n");
+        warn("Failed to allocate compressed data buffer");
         frame_size = 0;
       }
     }
+
+    if (pts) {
+      *pts = mem_get_le32(&raw_header[4]);
+      *pts += ((aom_codec_pts_t)mem_get_le32(&raw_header[8]) << 32);
+    }
   }
 
   if (!feof(infile)) {
+    ASAN_UNPOISON_MEMORY_REGION(*buffer, *buffer_size);
     if (fread(*buffer, 1, frame_size, infile) != frame_size) {
-      warn("Failed to read full frame\n");
+      warn("Failed to read full frame");
       return 1;
     }
 
+    ASAN_POISON_MEMORY_REGION(*buffer + frame_size, *buffer_size - frame_size);
     *bytes_read = frame_size;
     return 0;
   }
diff --git a/third_party/aom/ivfdec.h b/third_party/aom/common/ivfdec.h
index 36a6fb84e..9013dea64 100644
--- a/third_party/aom/ivfdec.h
+++ b/third_party/aom/common/ivfdec.h
@@ -11,7 +11,7 @@
 #ifndef IVFDEC_H_
 #define IVFDEC_H_
 
-#include "./tools_common.h"
+#include "common/tools_common.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -19,8 +19,9 @@ extern "C" {
 
 int file_is_ivf(struct AvxInputContext *input);
 
+typedef int64_t aom_codec_pts_t;
 int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
-                   size_t *buffer_size);
+                   size_t *buffer_size, aom_codec_pts_t *pts);
 
 #ifdef __cplusplus
 } /* extern "C" */
diff --git a/third_party/aom/ivfenc.c b/third_party/aom/common/ivfenc.c
index 80f4d14e3..64715f4d7 100644
--- a/third_party/aom/ivfenc.c
+++ b/third_party/aom/common/ivfenc.c
@@ -9,7 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./ivfenc.h"
+#include "common/ivfenc.h"
 
 #include "aom/aom_encoder.h"
 #include "aom_ports/mem_ops.h"
diff --git a/third_party/aom/ivfenc.h b/third_party/aom/common/ivfenc.h
index 62b3a9150..f0cab8178 100644
--- a/third_party/aom/ivfenc.h
+++ b/third_party/aom/common/ivfenc.h
@@ -11,7 +11,7 @@
 #ifndef IVFENC_H_
 #define IVFENC_H_
 
-#include "./tools_common.h"
+#include "common/tools_common.h"
 
 struct aom_codec_enc_cfg;
 struct aom_codec_cx_pkt;
diff --git a/third_party/aom/md5_utils.c b/third_party/aom/common/md5_utils.c
index 34012b205..b69e1cc72 100644
--- a/third_party/aom/md5_utils.c
+++ b/third_party/aom/common/md5_utils.c
@@ -22,7 +22,7 @@
 
 #include <string.h> /* for memcpy() */
 
-#include "md5_utils.h"
+#include "common/md5_utils.h"
 
 static void byteSwap(UWORD32 *buf, unsigned words) {
   md5byte *p;
diff --git a/third_party/aom/md5_utils.h b/third_party/aom/common/md5_utils.h
index bd4991b3a..bd4991b3a 100644
--- a/third_party/aom/md5_utils.h
+++ b/third_party/aom/common/md5_utils.h
diff --git a/third_party/aom/common/obudec.c b/third_party/aom/common/obudec.c
new file mode 100644
index 000000000..cd88f1648
--- /dev/null
+++ b/third_party/aom/common/obudec.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/obudec.h"
+
+#include "aom_ports/mem_ops.h"
+#include "av1/common/common.h"
+#include "av1/decoder/obu.h"
+
+#define OBU_BUFFER_SIZE (500 * 1024)
+
+#define OBU_HEADER_SIZE 1
+#define OBU_EXTENSION_SIZE 1
+#define OBU_MAX_LENGTH_FIELD_SIZE 8
+#define OBU_DETECTION_SIZE \
+  (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 3 * OBU_MAX_LENGTH_FIELD_SIZE)
+
+// Reads unsigned LEB128 integer and returns 0 upon successful read and decode.
+// Stores raw bytes in 'value_buffer', length of the number in 'value_length',
+// and decoded value in 'value'.
+static int obudec_read_leb128(FILE *f, uint8_t *value_buffer,
+                              size_t *value_length, uint64_t *value) {
+  if (!f || !value_buffer || !value_length || !value) return -1;
+  size_t len;
+  for (len = 0; len < OBU_MAX_LENGTH_FIELD_SIZE; ++len) {
+    const size_t num_read = fread(&value_buffer[len], 1, 1, f);
+    if (num_read == 0) {
+      if (len == 0 && feof(f)) {
+        *value_length = 0;
+        return 0;
+      }
+      // Ran out of data before completing read of value.
+      return -1;
+    }
+    if ((value_buffer[len] >> 7) == 0) {
+      ++len;
+      *value_length = len;
+      break;
+    }
+  }
+
+  return aom_uleb_decode(value_buffer, len, value, NULL);
+}
+
+// Reads OBU header from 'f'. The 'buffer_capacity' passed in must be large
+// enough to store an OBU header with extension (2 bytes). Raw OBU data is
+// written to 'obu_data', parsed OBU header values are written to 'obu_header',
+// and total bytes read from file are written to 'bytes_read'. Returns 0 for
+// success, and non-zero on failure. When end of file is reached, the return
+// value is 0 and the 'bytes_read' value is set to 0.
+static int obudec_read_obu_header(FILE *f, size_t buffer_capacity,
+                                  int is_annexb, uint8_t *obu_data,
+                                  ObuHeader *obu_header, size_t *bytes_read) {
+  if (!f || buffer_capacity < (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE) ||
+      !obu_data || !obu_header || !bytes_read) {
+    return -1;
+  }
+  *bytes_read = fread(obu_data, 1, 1, f);
+
+  if (feof(f) && *bytes_read == 0) {
+    return 0;
+  } else if (*bytes_read != 1) {
+    fprintf(stderr, "obudec: Failure reading OBU header.\n");
+    return -1;
+  }
+
+  const int has_extension = (obu_data[0] >> 2) & 0x1;
+  if (has_extension) {
+    if (fread(&obu_data[1], 1, 1, f) != 1) {
+      fprintf(stderr, "obudec: Failure reading OBU extension.");
+      return -1;
+    }
+    ++*bytes_read;
+  }
+
+  size_t obu_bytes_parsed = 0;
+  const aom_codec_err_t parse_result = aom_read_obu_header(
+      obu_data, *bytes_read, &obu_bytes_parsed, obu_header, is_annexb);
+  if (parse_result != AOM_CODEC_OK || *bytes_read != obu_bytes_parsed) {
+    fprintf(stderr, "obudec: Error parsing OBU header.\n");
+    return -1;
+  }
+
+  return 0;
+}
+
+// Reads OBU payload from 'f' and returns 0 for success when all payload bytes
+// are read from the file. Payload data is written to 'obu_data', and actual
+// bytes read added to 'bytes_read'.
+static int obudec_read_obu_payload(FILE *f, size_t payload_length,
+                                   uint8_t *obu_data, size_t *bytes_read) {
+  if (!f || payload_length == 0 || !obu_data || !bytes_read) return -1;
+
+  if (fread(obu_data, 1, payload_length, f) != payload_length) {
+    fprintf(stderr, "obudec: Failure reading OBU payload.\n");
+    return -1;
+  }
+
+  *bytes_read += payload_length;
+  return 0;
+}
+
+static int obudec_read_obu_header_and_size(FILE *f, size_t buffer_capacity,
+                                           int is_annexb, uint8_t *buffer,
+                                           size_t *bytes_read,
+                                           size_t *payload_length,
+                                           ObuHeader *obu_header) {
+  const size_t kMinimumBufferSize =
+      (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + OBU_MAX_LENGTH_FIELD_SIZE);
+  if (!f || !buffer || !bytes_read || !payload_length || !obu_header ||
+      buffer_capacity < kMinimumBufferSize) {
+    return -1;
+  }
+
+  size_t leb128_length = 0;
+  uint64_t obu_size = 0;
+  if (is_annexb) {
+    if (obudec_read_leb128(f, &buffer[0], &leb128_length, &obu_size) != 0) {
+      fprintf(stderr, "obudec: Failure reading OBU size length.\n");
+      return -1;
+    } else if (leb128_length == 0) {
+      *payload_length = 0;
+      return 0;
+    }
+    if (obu_size > UINT32_MAX) {
+      fprintf(stderr, "obudec: OBU payload length too large.\n");
+      return -1;
+    }
+  }
+
+  size_t header_size = 0;
+  if (obudec_read_obu_header(f, buffer_capacity - leb128_length, is_annexb,
+                             buffer + leb128_length, obu_header,
+                             &header_size) != 0) {
+    return -1;
+  } else if (header_size == 0) {
+    *payload_length = 0;
+    return 0;
+  }
+
+  if (is_annexb) {
+    if (obu_size < header_size) {
+      fprintf(stderr, "obudec: OBU size is too small.\n");
+      return -1;
+    }
+    *payload_length = (size_t)obu_size - header_size;
+  } else {
+    uint64_t u64_payload_length = 0;
+    if (obudec_read_leb128(f, &buffer[header_size], &leb128_length,
+                           &u64_payload_length) != 0) {
+      fprintf(stderr, "obudec: Failure reading OBU payload length.\n");
+      return -1;
+    }
+    if (u64_payload_length > UINT32_MAX) {
+      fprintf(stderr, "obudec: OBU payload length too large.\n");
+      return -1;
+    }
+
+    *payload_length = (size_t)u64_payload_length;
+  }
+
+  *bytes_read = leb128_length + header_size;
+  return 0;
+}
+
+static int obudec_read_one_obu(FILE *f, uint8_t **obu_buffer,
+                               size_t obu_bytes_buffered,
+                               size_t *obu_buffer_capacity, size_t *obu_length,
+                               ObuHeader *obu_header, int is_annexb) {
+  size_t available_buffer_capacity = *obu_buffer_capacity - obu_bytes_buffered;
+
+  if (!(*obu_buffer)) return -1;
+
+  size_t bytes_read = 0;
+  size_t obu_payload_length = 0;
+  const int status = obudec_read_obu_header_and_size(
+      f, available_buffer_capacity, is_annexb, *obu_buffer + obu_bytes_buffered,
+      &bytes_read, &obu_payload_length, obu_header);
+  if (status < 0) return status;
+
+  if (obu_payload_length > SIZE_MAX - bytes_read) return -1;
+
+  if (obu_payload_length > 256 * 1024 * 1024) {
+    fprintf(stderr, "obudec: Read invalid OBU size (%u)\n",
+            (unsigned int)obu_payload_length);
+    *obu_length = bytes_read + obu_payload_length;
+    return -1;
+  }
+
+  if (bytes_read + obu_payload_length > available_buffer_capacity) {
+    // TODO(tomfinegan): Add overflow check.
+    const size_t new_capacity =
+        obu_bytes_buffered + bytes_read + 2 * obu_payload_length;
+
+#if defined AOM_MAX_ALLOCABLE_MEMORY
+    if (new_capacity > AOM_MAX_ALLOCABLE_MEMORY) {
+      fprintf(stderr, "obudec: OBU size exceeds max alloc size.\n");
+      return -1;
+    }
+#endif
+
+    uint8_t *new_buffer = (uint8_t *)realloc(*obu_buffer, new_capacity);
+
+    if (new_buffer) {
+      *obu_buffer = new_buffer;
+      *obu_buffer_capacity = new_capacity;
+    } else {
+      fprintf(stderr, "obudec: Failed to allocate compressed data buffer\n");
+      *obu_length = bytes_read + obu_payload_length;
+      return -1;
+    }
+  }
+
+  if (obu_payload_length > 0 &&
+      obudec_read_obu_payload(f, obu_payload_length,
+                              *obu_buffer + obu_bytes_buffered + bytes_read,
+                              &bytes_read) != 0) {
+    return -1;
+  }
+
+  *obu_length = bytes_read;
+  return 0;
+}
+
+int file_is_obu(struct ObuDecInputContext *obu_ctx) {
+  if (!obu_ctx || !obu_ctx->avx_ctx) return 0;
+
+  struct AvxInputContext *avx_ctx = obu_ctx->avx_ctx;
+  uint8_t detect_buf[OBU_DETECTION_SIZE] = { 0 };
+  const int is_annexb = obu_ctx->is_annexb;
+  FILE *f = avx_ctx->file;
+  size_t payload_length = 0;
+  ObuHeader obu_header;
+  memset(&obu_header, 0, sizeof(obu_header));
+  size_t length_of_unit_size = 0;
+  size_t annexb_header_length = 0;
+  uint64_t unit_size = 0;
+
+  if (is_annexb) {
+    // read the size of first temporal unit
+    if (obudec_read_leb128(f, &detect_buf[0], &length_of_unit_size,
+                           &unit_size) != 0) {
+      fprintf(stderr, "obudec: Failure reading temporal unit header\n");
+      return 0;
+    }
+
+    // read the size of first frame unit
+    if (obudec_read_leb128(f, &detect_buf[length_of_unit_size],
+                           &annexb_header_length, &unit_size) != 0) {
+      fprintf(stderr, "obudec: Failure reading frame unit header\n");
+      return 0;
+    }
+    annexb_header_length += length_of_unit_size;
+  }
+
+  size_t bytes_read = 0;
+  if (obudec_read_obu_header_and_size(
+          f, OBU_DETECTION_SIZE - annexb_header_length, is_annexb,
+          &detect_buf[annexb_header_length], &bytes_read, &payload_length,
+          &obu_header) != 0) {
+    fprintf(stderr, "obudec: Failure reading first OBU.\n");
+    rewind(f);
+    return 0;
+  }
+
+  if (is_annexb) {
+    bytes_read += annexb_header_length;
+  }
+
+  if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
+      obu_header.type != OBU_SEQUENCE_HEADER) {
+    return 0;
+  }
+
+  if (obu_header.has_size_field) {
+    if (obu_header.type == OBU_TEMPORAL_DELIMITER && payload_length != 0) {
+      fprintf(
+          stderr,
+          "obudec: Invalid OBU_TEMPORAL_DELIMITER payload length (non-zero).");
+      rewind(f);
+      return 0;
+    }
+  } else if (!is_annexb) {
+    fprintf(stderr, "obudec: OBU size fields required, cannot decode input.\n");
+    rewind(f);
+    return 0;
+  }
+
+  // Appears that input is valid Section 5 AV1 stream.
+  obu_ctx->buffer = (uint8_t *)malloc(OBU_BUFFER_SIZE);
+  if (!obu_ctx->buffer) {
+    fprintf(stderr, "Out of memory.\n");
+    rewind(f);
+    return 0;
+  }
+  obu_ctx->buffer_capacity = OBU_BUFFER_SIZE;
+
+  memcpy(obu_ctx->buffer, &detect_buf[0], bytes_read);
+  obu_ctx->bytes_buffered = bytes_read;
+  // If the first OBU is a SEQUENCE_HEADER, then it will have a payload.
+  // We need to read this in so that our buffer only contains complete OBUs.
+  if (payload_length > 0) {
+    if (payload_length > (obu_ctx->buffer_capacity - bytes_read)) {
+      fprintf(stderr, "obudec: First OBU's payload is too large\n");
+      rewind(f);
+      return 0;
+    }
+
+    size_t payload_bytes = 0;
+    const int status = obudec_read_obu_payload(
+        f, payload_length, &obu_ctx->buffer[bytes_read], &payload_bytes);
+    if (status < 0) {
+      rewind(f);
+      return 0;
+    }
+    obu_ctx->bytes_buffered += payload_bytes;
+  }
+  return 1;
+}
+
+int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
+                              uint8_t **buffer, size_t *bytes_read,
+                              size_t *buffer_size) {
+  FILE *f = obu_ctx->avx_ctx->file;
+  if (!f) return -1;
+
+  *buffer_size = 0;
+  *bytes_read = 0;
+
+  if (feof(f)) {
+    return 1;
+  }
+
+  size_t tu_size;
+  size_t obu_size = 0;
+  size_t length_of_temporal_unit_size = 0;
+  uint8_t tuheader[OBU_MAX_LENGTH_FIELD_SIZE] = { 0 };
+
+  if (obu_ctx->is_annexb) {
+    uint64_t size = 0;
+
+    if (obu_ctx->bytes_buffered == 0) {
+      if (obudec_read_leb128(f, &tuheader[0], &length_of_temporal_unit_size,
+                             &size) != 0) {
+        fprintf(stderr, "obudec: Failure reading temporal unit header\n");
+        return -1;
+      }
+      if (size == 0 && feof(f)) {
+        return 1;
+      }
+    } else {
+      // temporal unit size was already stored in buffer
+      if (aom_uleb_decode(obu_ctx->buffer, obu_ctx->bytes_buffered, &size,
+                          &length_of_temporal_unit_size) != 0) {
+        fprintf(stderr, "obudec: Failure reading temporal unit header\n");
+        return -1;
+      }
+    }
+
+    if (size > UINT32_MAX || size + length_of_temporal_unit_size > UINT32_MAX) {
+      fprintf(stderr, "obudec: TU too large.\n");
+      return -1;
+    }
+
+    size += length_of_temporal_unit_size;
+    tu_size = (size_t)size;
+  } else {
+    while (1) {
+      ObuHeader obu_header;
+      memset(&obu_header, 0, sizeof(obu_header));
+
+      if (obudec_read_one_obu(f, &obu_ctx->buffer, obu_ctx->bytes_buffered,
+                              &obu_ctx->buffer_capacity, &obu_size, &obu_header,
+                              0) != 0) {
+        fprintf(stderr, "obudec: read_one_obu failed in TU loop\n");
+        return -1;
+      }
+
+      if (obu_header.type == OBU_TEMPORAL_DELIMITER || obu_size == 0) {
+        tu_size = obu_ctx->bytes_buffered;
+        break;
+      } else {
+        obu_ctx->bytes_buffered += obu_size;
+      }
+    }
+  }
+
+#if defined AOM_MAX_ALLOCABLE_MEMORY
+  if (tu_size > AOM_MAX_ALLOCABLE_MEMORY) {
+    fprintf(stderr, "obudec: Temporal Unit size exceeds max alloc size.\n");
+    return -1;
+  }
+#endif
+  uint8_t *new_buffer = (uint8_t *)realloc(*buffer, tu_size);
+  if (!new_buffer) {
+    free(*buffer);
+    fprintf(stderr, "obudec: Out of memory.\n");
+    return -1;
+  }
+  *buffer = new_buffer;
+  *bytes_read = tu_size;
+  *buffer_size = tu_size;
+
+  if (!obu_ctx->is_annexb) {
+    memcpy(*buffer, obu_ctx->buffer, tu_size);
+
+    // At this point, (obu_ctx->buffer + obu_ctx->bytes_buffered + obu_size)
+    // points to the end of the buffer.
+    memmove(obu_ctx->buffer, obu_ctx->buffer + obu_ctx->bytes_buffered,
+            obu_size);
+    obu_ctx->bytes_buffered = obu_size;
+  } else {
+    if (!feof(f)) {
+      size_t data_size;
+      size_t offset;
+      if (!obu_ctx->bytes_buffered) {
+        data_size = tu_size - length_of_temporal_unit_size;
+        memcpy(*buffer, &tuheader[0], length_of_temporal_unit_size);
+        offset = length_of_temporal_unit_size;
+      } else {
+        memcpy(*buffer, obu_ctx->buffer, obu_ctx->bytes_buffered);
+        offset = obu_ctx->bytes_buffered;
+        data_size = tu_size - obu_ctx->bytes_buffered;
+        obu_ctx->bytes_buffered = 0;
+      }
+
+      if (fread(*buffer + offset, 1, data_size, f) != data_size) {
+        fprintf(stderr, "obudec: Failed to read full temporal unit\n");
+        return -1;
+      }
+    }
+  }
+  return 0;
+}
+
+void obudec_free(struct ObuDecInputContext *obu_ctx) { free(obu_ctx->buffer); }
diff --git a/third_party/aom/common/obudec.h b/third_party/aom/common/obudec.h
new file mode 100644
index 000000000..c52a94e9d
--- /dev/null
+++ b/third_party/aom/common/obudec.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef OBUDEC_H_
+#define OBUDEC_H_
+
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ObuDecInputContext {
+  struct AvxInputContext *avx_ctx;
+  uint8_t *buffer;
+  size_t buffer_capacity;
+  size_t bytes_buffered;
+  int is_annexb;
+};
+
+// Returns 1 when file data starts (if Annex B stream, after reading the
+// size of the OBU) with what appears to be a Temporal Delimiter
+// OBU as defined by Section 5 of the AV1 bitstream specification.
+int file_is_obu(struct ObuDecInputContext *obu_ctx);
+
+// Reads one Temporal Unit from the input file. Returns 0 when a TU is
+// successfully read, 1 when end of file is reached, and less than 0 when an
+// error occurs. Stores TU data in 'buffer'. Reallocs buffer to match TU size,
+// returns buffer capacity via 'buffer_size', and returns size of buffered data
+// via 'bytes_read'.
+int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
+                              uint8_t **buffer, size_t *bytes_read,
+                              size_t *buffer_size);
+
+void obudec_free(struct ObuDecInputContext *obu_ctx);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif  // OBUDEC_H_
diff --git a/third_party/aom/tools_common.c b/third_party/aom/common/tools_common.c
index b7095e3f1..359ec7341 100644
--- a/third_party/aom/tools_common.c
+++ b/third_party/aom/common/tools_common.c
@@ -9,14 +9,14 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "common/tools_common.h"
+
 #include <math.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "./tools_common.h"
-
 #if CONFIG_AV1_ENCODER
 #include "aom/aomcx.h"
 #endif
@@ -236,7 +236,6 @@ double sse_to_psnr(double samples, double peak, double sse) {
 }
 
 // TODO(debargha): Consolidate the functions below into a separate file.
-#if CONFIG_HIGHBITDEPTH
 static void highbd_img_upshift(aom_image_t *dst, aom_image_t *src,
                                int input_shift) {
   // Note the offset is 1 less than half.
@@ -251,8 +250,7 @@ static void highbd_img_upshift(aom_image_t *dst, aom_image_t *src,
   switch (src->fmt) {
     case AOM_IMG_FMT_I42016:
     case AOM_IMG_FMT_I42216:
-    case AOM_IMG_FMT_I44416:
-    case AOM_IMG_FMT_I44016: break;
+    case AOM_IMG_FMT_I44416: break;
     default: fatal("Unsupported image conversion"); break;
   }
   for (plane = 0; plane < 3; plane++) {
@@ -287,8 +285,7 @@ static void lowbd_img_upshift(aom_image_t *dst, aom_image_t *src,
   switch (src->fmt) {
     case AOM_IMG_FMT_I420:
     case AOM_IMG_FMT_I422:
-    case AOM_IMG_FMT_I444:
-    case AOM_IMG_FMT_I440: break;
+    case AOM_IMG_FMT_I444: break;
     default: fatal("Unsupported image conversion"); break;
   }
   for (plane = 0; plane < 3; plane++) {
@@ -328,8 +325,7 @@ void aom_img_truncate_16_to_8(aom_image_t *dst, aom_image_t *src) {
   switch (dst->fmt) {
     case AOM_IMG_FMT_I420:
     case AOM_IMG_FMT_I422:
-    case AOM_IMG_FMT_I444:
-    case AOM_IMG_FMT_I440: break;
+    case AOM_IMG_FMT_I444: break;
     default: fatal("Unsupported image conversion"); break;
   }
   for (plane = 0; plane < 3; plane++) {
@@ -363,8 +359,7 @@ static void highbd_img_downshift(aom_image_t *dst, aom_image_t *src,
   switch (src->fmt) {
     case AOM_IMG_FMT_I42016:
     case AOM_IMG_FMT_I42216:
-    case AOM_IMG_FMT_I44416:
-    case AOM_IMG_FMT_I44016: break;
+    case AOM_IMG_FMT_I44416: break;
     default: fatal("Unsupported image conversion"); break;
   }
   for (plane = 0; plane < 3; plane++) {
@@ -397,8 +392,7 @@ static void lowbd_img_downshift(aom_image_t *dst, aom_image_t *src,
   switch (dst->fmt) {
     case AOM_IMG_FMT_I420:
     case AOM_IMG_FMT_I422:
-    case AOM_IMG_FMT_I444:
-    case AOM_IMG_FMT_I440: break;
+    case AOM_IMG_FMT_I444: break;
     default: fatal("Unsupported image conversion"); break;
   }
   for (plane = 0; plane < 3; plane++) {
@@ -427,4 +421,3 @@ void aom_img_downshift(aom_image_t *dst, aom_image_t *src, int down_shift) {
     lowbd_img_downshift(dst, src, down_shift);
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/tools_common.h b/third_party/aom/common/tools_common.h
index 5fc8fbafd..abee4ea63 100644
--- a/third_party/aom/tools_common.h
+++ b/third_party/aom/common/tools_common.h
@@ -13,14 +13,15 @@
 
 #include <stdio.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_codec.h"
 #include "aom/aom_image.h"
 #include "aom/aom_integer.h"
 #include "aom_ports/msvc.h"
 
 #if CONFIG_AV1_ENCODER
-#include "./y4minput.h"
+#include "common/y4minput.h"
 #endif
 
 #if defined(_MSC_VER)
@@ -69,6 +70,7 @@ typedef long FileOffset; /* NOLINT */
 #define AV1_FOURCC 0x31305641
 
 enum VideoFileType {
+  FILE_TYPE_OBU,
   FILE_TYPE_RAW,
   FILE_TYPE_IVF,
   FILE_TYPE_Y4M,
@@ -150,12 +152,9 @@ void aom_img_write(const aom_image_t *img, FILE *file);
 int aom_img_read(aom_image_t *img, FILE *file);
 
 double sse_to_psnr(double samples, double peak, double mse);
-
-#if CONFIG_HIGHBITDEPTH
 void aom_img_upshift(aom_image_t *dst, aom_image_t *src, int input_shift);
 void aom_img_downshift(aom_image_t *dst, aom_image_t *src, int down_shift);
 void aom_img_truncate_16_to_8(aom_image_t *dst, aom_image_t *src);
-#endif
 
 #ifdef __cplusplus
 } /* extern "C" */
diff --git a/third_party/aom/video_common.h b/third_party/aom/common/video_common.h
index 1aa82f613..f96af4b7e 100644
--- a/third_party/aom/video_common.h
+++ b/third_party/aom/common/video_common.h
@@ -12,7 +12,7 @@
 #ifndef VIDEO_COMMON_H_
 #define VIDEO_COMMON_H_
 
-#include "./tools_common.h"
+#include "common/tools_common.h"
 
 typedef struct {
   uint32_t codec_fourcc;
diff --git a/third_party/aom/video_reader.c b/third_party/aom/common/video_reader.c
index 6a96af967..f5327c928 100644
--- a/third_party/aom/video_reader.c
+++ b/third_party/aom/common/video_reader.c
@@ -8,14 +8,13 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+#include "common/video_reader.h"
 
 #include <stdlib.h>
 #include <string.h>
 
-#include "./ivfdec.h"
-#include "./video_reader.h"
-
 #include "aom_ports/mem_ops.h"
+#include "common/ivfdec.h"
 
 static const char *const kIVFSignature = "DKIF";
 
@@ -25,6 +24,7 @@ struct AvxVideoReaderStruct {
   uint8_t *buffer;
   size_t buffer_size;
   size_t frame_size;
+  aom_codec_pts_t pts;
 };
 
 AvxVideoReader *aom_video_reader_open(const char *filename) {
@@ -63,7 +63,7 @@ void aom_video_reader_close(AvxVideoReader *reader) {
 
 int aom_video_reader_read_frame(AvxVideoReader *reader) {
   return !ivf_read_frame(reader->file, &reader->buffer, &reader->frame_size,
-                         &reader->buffer_size);
+                         &reader->buffer_size, &reader->pts);
 }
 
 const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader,
@@ -73,6 +73,12 @@ const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader,
   return reader->buffer;
 }
 
+int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader) {
+  return (int64_t)reader->pts;
+}
+
+FILE *aom_video_reader_get_file(AvxVideoReader *reader) { return reader->file; }
+
 const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) {
   return &reader->info;
 }
diff --git a/third_party/aom/video_reader.h b/third_party/aom/common/video_reader.h
index 962c6653b..1eb33831a 100644
--- a/third_party/aom/video_reader.h
+++ b/third_party/aom/common/video_reader.h
@@ -12,7 +12,7 @@
 #ifndef VIDEO_READER_H_
 #define VIDEO_READER_H_
 
-#include "./video_common.h"
+#include "common/video_common.h"
 
 // The following code is work in progress. It is going to  support transparent
 // reading of input files. Right now only IVF format is supported for
@@ -42,6 +42,11 @@ int aom_video_reader_read_frame(AvxVideoReader *reader);
 // aom_video_reader_read_frame().
 const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, size_t *size);
 
+// Returns the pts of the frame.
+int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader);
+// Return the reader file.
+FILE *aom_video_reader_get_file(AvxVideoReader *reader);
+
 // Fills AvxVideoInfo with information from opened video file.
 const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader);
 
diff --git a/third_party/aom/video_writer.c b/third_party/aom/common/video_writer.c
index 4e072c7dc..a7ec309fc 100644
--- a/third_party/aom/video_writer.c
+++ b/third_party/aom/common/video_writer.c
@@ -8,12 +8,12 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+#include "common/video_writer.h"
 
 #include <stdlib.h>
 
-#include "./ivfenc.h"
-#include "./video_writer.h"
 #include "aom/aom_encoder.h"
+#include "common/ivfenc.h"
 
 struct AvxVideoWriterStruct {
   AvxVideoInfo info;
diff --git a/third_party/aom/video_writer.h b/third_party/aom/common/video_writer.h
index ad9e6ebb9..16655d3a6 100644
--- a/third_party/aom/video_writer.h
+++ b/third_party/aom/common/video_writer.h
@@ -12,7 +12,7 @@
 #ifndef VIDEO_WRITER_H_
 #define VIDEO_WRITER_H_
 
-#include "./video_common.h"
+#include "common/video_common.h"
 
 typedef enum { kContainerIVF } AvxContainer;
 
diff --git a/third_party/aom/warnings.c b/third_party/aom/common/warnings.c
index 16d5c6c18..2facee252 100644
--- a/third_party/aom/warnings.c
+++ b/third_party/aom/common/warnings.c
@@ -9,7 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./warnings.h"
+#include "common/warnings.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -17,9 +17,8 @@
 #include <string.h>
 
 #include "aom/aom_encoder.h"
-
-#include "./tools_common.h"
-#include "./aomenc.h"
+#include "apps/aomenc.h"
+#include "common/tools_common.h"
 
 static const char quantizer_warning_string[] =
     "Bad quantizer values. Quantizer values should not be equal, and should "
diff --git a/third_party/aom/warnings.h b/third_party/aom/common/warnings.h
index 61db2dcf8..61db2dcf8 100644
--- a/third_party/aom/warnings.h
+++ b/third_party/aom/common/warnings.h
diff --git a/third_party/aom/webmdec.cc b/third_party/aom/common/webmdec.cc
index 39ecef706..17ac53c93 100644
--- a/third_party/aom/webmdec.cc
+++ b/third_party/aom/common/webmdec.cc
@@ -7,10 +7,11 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
-#include "./webmdec.h"
+#include "common/webmdec.h"
 
+#include <cassert>
 #include <cstring>
 #include <cstdio>
 
@@ -118,7 +119,8 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
 }
 
 int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
-                    size_t *buffer_size) {
+                    size_t *bytes_read, size_t *buffer_size) {
+  assert(webm_ctx->buffer == *buffer);
   // This check is needed for frame parallel decoding, in which case this
   // function could be called even after it has reached end of input stream.
   if (webm_ctx->reached_eos) {
@@ -142,7 +144,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
     } else if (block_entry_eos || block_entry->EOS()) {
       cluster = segment->GetNext(cluster);
       if (cluster == NULL || cluster->EOS()) {
-        *buffer_size = 0;
+        *bytes_read = 0;
         webm_ctx->reached_eos = 1;
         return 1;
       }
@@ -180,12 +182,13 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
   if (frame.len > static_cast<long>(*buffer_size)) {
     delete[] * buffer;
     *buffer = new uint8_t[frame.len];
+    webm_ctx->buffer = *buffer;
     if (*buffer == NULL) {
       return -1;
     }
-    webm_ctx->buffer = *buffer;
+    *buffer_size = frame.len;
   }
-  *buffer_size = frame.len;
+  *bytes_read = frame.len;
   webm_ctx->timestamp_ns = block->GetTime(cluster);
   webm_ctx->is_key_frame = block->IsKey();
 
@@ -199,8 +202,10 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx,
   uint32_t i = 0;
   uint8_t *buffer = NULL;
   size_t buffer_size = 0;
+  size_t bytes_read = 0;
+  assert(webm_ctx->buffer == NULL);
   while (webm_ctx->timestamp_ns < 1000000000 && i < 50) {
-    if (webm_read_frame(webm_ctx, &buffer, &buffer_size)) {
+    if (webm_read_frame(webm_ctx, &buffer, &bytes_read, &buffer_size)) {
       break;
     }
     ++i;
@@ -209,6 +214,7 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx,
   aom_ctx->framerate.denominator =
       static_cast<int>(webm_ctx->timestamp_ns / 1000);
   delete[] buffer;
+  webm_ctx->buffer = NULL;
 
   get_first_cluster(webm_ctx);
   webm_ctx->block = NULL;
diff --git a/third_party/aom/webmdec.h b/third_party/aom/common/webmdec.h
index 329908eeb..d5b472a01 100644
--- a/third_party/aom/webmdec.h
+++ b/third_party/aom/common/webmdec.h
@@ -11,7 +11,7 @@
 #ifndef WEBMDEC_H_
 #define WEBMDEC_H_
 
-#include "./tools_common.h"
+#include "common/tools_common.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -48,13 +48,14 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
 // Parameters:
 //      webm_ctx - WebmInputContext object
 //      buffer - pointer where the frame data will be filled.
+//      bytes_read - pointer to bytes read.
 //      buffer_size - pointer to buffer size.
 // Return values:
 //      0 - Success
 //      1 - End of Stream
 //     -1 - Error
 int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
-                    size_t *buffer_size);
+                    size_t *bytes_read, size_t *buffer_size);
 
 // Guesses the frame rate of the input file based on the container timestamps.
 int webm_guess_framerate(struct WebmInputContext *webm_ctx,
diff --git a/third_party/aom/webmenc.cc b/third_party/aom/common/webmenc.cc
index e3d209a27..58ab33670 100644
--- a/third_party/aom/webmenc.cc
+++ b/third_party/aom/common/webmenc.cc
@@ -7,9 +7,9 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
-#include "./webmenc.h"
+#include "common/webmenc.h"
 
 #include <string>
 
diff --git a/third_party/aom/webmenc.h b/third_party/aom/common/webmenc.h
index 74387fb8d..74387fb8d 100644
--- a/third_party/aom/webmenc.h
+++ b/third_party/aom/common/webmenc.h
diff --git a/third_party/aom/y4menc.c b/third_party/aom/common/y4menc.c
index b094f74fe..d33d0313e 100644
--- a/third_party/aom/y4menc.c
+++ b/third_party/aom/common/y4menc.c
@@ -10,7 +10,8 @@
  */
 
 #include <assert.h>
-#include "./y4menc.h"
+
+#include "common/y4menc.h"
 
 int y4m_write_file_header(char *buf, size_t len, int width, int height,
                           const struct AvxRational *framerate,
diff --git a/third_party/aom/y4menc.h b/third_party/aom/common/y4menc.h
index cb75eeb42..6344176ba 100644
--- a/third_party/aom/y4menc.h
+++ b/third_party/aom/common/y4menc.h
@@ -12,9 +12,8 @@
 #ifndef Y4MENC_H_
 #define Y4MENC_H_
 
-#include "./tools_common.h"
-
 #include "aom/aom_decoder.h"
+#include "common/tools_common.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/third_party/aom/y4minput.c b/third_party/aom/common/y4minput.c
index e009042b2..a1dca10cd 100644
--- a/third_party/aom/y4minput.c
+++ b/third_party/aom/common/y4minput.c
@@ -125,8 +125,8 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
   The number of taps is intentionally kept small to reduce computational
    overhead and limit ringing.
 
-  The taps from these filters are scaled so that their sum is 1, and the result
-   is scaled by 128 and rounded to integers to create a filter whose
+  The taps from these filters are scaled so that their sum is 1, and the
+  result is scaled by 128 and rounded to integers to create a filter whose
    intermediate values fit inside 16 bits.
   Coefficients are rounded in such a way as to ensure their sum is still 128,
    which is usually equivalent to normal rounding.
@@ -134,7 +134,6 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
   Conversions which require both horizontal and vertical filtering could
    have these steps pipelined, for less memory consumption and better cache
    performance, but we do them separately for simplicity.*/
-
 #define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a))
 #define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a))
 #define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c)))
diff --git a/third_party/aom/y4minput.h b/third_party/aom/common/y4minput.h
index db20190db..db20190db 100644
--- a/third_party/aom/y4minput.h
+++ b/third_party/aom/common/y4minput.h
diff --git a/third_party/aom/configure b/third_party/aom/configure
deleted file mode 100755
index 50a5fb266..000000000
--- a/third_party/aom/configure
+++ /dev/null
@@ -1,977 +0,0 @@
-#!/bin/sh
-##
-##  configure
-##
-##  This script is the front-end to the build system. It provides a similar
-##  interface to standard configure scripts with some extra bits for dealing
-##  with toolchains that differ from the standard POSIX interface and
-##  for extracting subsets of the source tree. In theory, reusable parts
-##  of this script were intended to live in build/make/configure.sh,
-##  but in practice, the line is pretty blurry.
-##
-##  This build system is based in part on the FFmpeg configure script.
-##
-
-#source_path="`dirname \"$0\"`"
-source_path=${0%/*}
-. "${source_path}/build/make/configure.sh"
-
-show_help(){
-    show_help_pre
-    cat << EOF
-Advanced options:
-  ${toggle_libs}                  libraries
-  ${toggle_examples}              examples
-  ${toggle_analyzer}              analyzer
-  ${toggle_docs}                  documentation
-  ${toggle_unit_tests}            unit tests
-  ${toggle_tools}                 tools
-  ${toggle_decode_perf_tests}     build decoder perf tests with unit tests
-  ${toggle_encode_perf_tests}     build encoder perf tests with unit tests
-  --cpu=CPU                       tune for the specified CPU (ARM: cortex-a8, X86: sse3)
-  --libc=PATH                     path to alternate libc
-  --size-limit=WxH                max size to allow in the decoder
-  --as={yasm|nasm|auto}           use specified assembler [auto, yasm preferred]
-  --sdk-path=PATH                 path to root of sdk (android builds only)
-  ${toggle_codec_srcs}            in/exclude codec library source code
-  ${toggle_debug_libs}            in/exclude debug version of libraries
-  ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
-  ${toggle_highbitdepth}          enable 16-bit generic pixel pipeline (used by high bitdepth profiles)
-  ${toggle_lowbitdepth}           enable 8-bit optimized pixel pipeline
-  ${toggle_av1}                  AV1 codec support
-  ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
-  ${toggle_postproc}              postprocessing
-  ${toggle_multithread}           multithreaded encoding and decoding
-  ${toggle_spatial_resampling}    spatial sampling (scaling) support
-  ${toggle_realtime_only}         enable this option while building for real-time encoding
-  ${toggle_coefficient_range_checking}
-                                  enable decoder to check if intermediate
-                                  transform coefficients are in valid range
-  ${toggle_runtime_cpu_detect}    runtime cpu detection
-  ${toggle_shared}                shared library support
-  ${toggle_static}                static library support
-  ${toggle_small}                 favor smaller size over speed
-  ${toggle_postproc_visualizer}   macro block / block level visualizers
-  ${toggle_webm_io}               enable input from and output to WebM container
-  ${toggle_libyuv}                enable libyuv
-  ${toggle_accounting}            enable bit accounting
-  ${toggle_inspection}            enable bitstream inspection
-
-Codecs:
-  Codecs can be selectively enabled or disabled individually, or by family:
-      --disable-<codec>
-  is equivalent to:
-      --disable-<codec>-encoder
-      --disable-<codec>-decoder
-
-  Codecs available in this distribution:
-EOF
-#restore editor state '
-
-    family="";
-    last_family="";
-    c="";
-    str="";
-    for c in ${CODECS}; do
-        family=${c%_*}
-        if [ "${family}" != "${last_family}" ]; then
-            [ -z "${str}" ] || echo "${str}"
-            str="$(printf '    %10s:' ${family})"
-        fi
-        str="${str} $(printf '%10s' ${c#*_})"
-        last_family=${family}
-    done
-    echo "${str}"
-    show_help_post
-}
-
-##
-## BEGIN APPLICATION SPECIFIC CONFIGURATION
-##
-
-# all_platforms is a list of all supported target platforms. Maintain
-# alphabetically by architecture, generic-gnu last.
-all_platforms="${all_platforms} arm64-darwin-gcc"
-all_platforms="${all_platforms} arm64-linux-gcc"
-all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
-all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
-all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
-all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
-all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
-all_platforms="${all_platforms} armv7-win32-vs12"
-all_platforms="${all_platforms} armv7-win32-vs14"
-all_platforms="${all_platforms} armv7-win32-vs15"
-all_platforms="${all_platforms} armv7s-darwin-gcc"
-all_platforms="${all_platforms} armv8-linux-gcc"
-all_platforms="${all_platforms} mips32-linux-gcc"
-all_platforms="${all_platforms} mips64-linux-gcc"
-all_platforms="${all_platforms} sparc-solaris-gcc"
-all_platforms="${all_platforms} x86-android-gcc"
-all_platforms="${all_platforms} x86-darwin8-gcc"
-all_platforms="${all_platforms} x86-darwin8-icc"
-all_platforms="${all_platforms} x86-darwin9-gcc"
-all_platforms="${all_platforms} x86-darwin9-icc"
-all_platforms="${all_platforms} x86-darwin10-gcc"
-all_platforms="${all_platforms} x86-darwin11-gcc"
-all_platforms="${all_platforms} x86-darwin12-gcc"
-all_platforms="${all_platforms} x86-darwin13-gcc"
-all_platforms="${all_platforms} x86-darwin14-gcc"
-all_platforms="${all_platforms} x86-darwin15-gcc"
-all_platforms="${all_platforms} x86-darwin16-gcc"
-all_platforms="${all_platforms} x86-iphonesimulator-gcc"
-all_platforms="${all_platforms} x86-linux-gcc"
-all_platforms="${all_platforms} x86-linux-icc"
-all_platforms="${all_platforms} x86-os2-gcc"
-all_platforms="${all_platforms} x86-solaris-gcc"
-all_platforms="${all_platforms} x86-win32-gcc"
-all_platforms="${all_platforms} x86-win32-vs12"
-all_platforms="${all_platforms} x86-win32-vs14"
-all_platforms="${all_platforms} x86-win32-vs15"
-all_platforms="${all_platforms} x86_64-android-gcc"
-all_platforms="${all_platforms} x86_64-darwin9-gcc"
-all_platforms="${all_platforms} x86_64-darwin10-gcc"
-all_platforms="${all_platforms} x86_64-darwin11-gcc"
-all_platforms="${all_platforms} x86_64-darwin12-gcc"
-all_platforms="${all_platforms} x86_64-darwin13-gcc"
-all_platforms="${all_platforms} x86_64-darwin14-gcc"
-all_platforms="${all_platforms} x86_64-darwin15-gcc"
-all_platforms="${all_platforms} x86_64-darwin16-gcc"
-all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
-all_platforms="${all_platforms} x86_64-linux-gcc"
-all_platforms="${all_platforms} x86_64-linux-icc"
-all_platforms="${all_platforms} x86_64-solaris-gcc"
-all_platforms="${all_platforms} x86_64-win64-gcc"
-all_platforms="${all_platforms} x86_64-win64-vs12"
-all_platforms="${all_platforms} x86_64-win64-vs14"
-all_platforms="${all_platforms} x86_64-win64-vs15"
-all_platforms="${all_platforms} generic-gnu"
-
-# all_targets is a list of all targets that can be configured
-# note that these should be in dependency order for now.
-all_targets="libs examples docs tools"
-
-# all targets available are enabled, by default.
-for t in ${all_targets}; do
-    [ -f "${source_path}/${t}.mk" ] && enable_feature ${t}
-done
-
-if ! diff --version >/dev/null; then
-  die "diff missing: Try installing diffutils via your package manager."
-fi
-
-if ! perl --version >/dev/null; then
-    die "Perl is required to build"
-fi
-
-
-if [ "`cd \"${source_path}\" && pwd`" != "`pwd`" ]; then
-  # test to see if source_path already configured
-  if [ -f "${source_path}/aom_config.h" ]; then
-    die "source directory already configured; run 'make distclean' there first"
-  fi
-fi
-
-# check installed doxygen version
-doxy_version=$(doxygen --version 2>/dev/null)
-doxy_major=${doxy_version%%.*}
-if [ ${doxy_major:-0} -ge 1 ]; then
-    doxy_version=${doxy_version#*.}
-    doxy_minor=${doxy_version%%.*}
-    doxy_patch=${doxy_version##*.}
-
-    [ $doxy_major -gt 1 ] && enable_feature doxygen
-    [ $doxy_minor -gt 5 ] && enable_feature doxygen
-    [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen
-fi
-
-# disable codecs when their source directory does not exist
-[ -d "${source_path}/av1" ] || disable_codec av1
-
-# install everything except the sources, by default. sources will have
-# to be enabled when doing dist builds, since that's no longer a common
-# case.
-enabled doxygen && enable_feature install_docs
-enable_feature install_bins
-enable_feature install_libs
-
-enable_feature static
-enable_feature optimizations
-enable_feature dependency_tracking
-enable_feature spatial_resampling
-enable_feature multithread
-enable_feature os_support
-enable_feature highbitdepth
-
-CODECS="
-    av1_encoder
-    av1_decoder
-"
-CODEC_FAMILIES="
-    av1
-"
-
-ARCH_LIST="
-    arm
-    mips
-    x86
-    x86_64
-"
-ARCH_EXT_LIST_X86="
-    mmx
-    sse
-    sse2
-    sse3
-    ssse3
-    sse4_1
-    avx
-    avx2
-"
-ARCH_EXT_LIST="
-    neon
-    neon_asm
-
-    mips32
-    dspr2
-    msa
-    mips64
-
-    ${ARCH_EXT_LIST_X86}
-"
-HAVE_LIST="
-    ${ARCH_EXT_LIST}
-    aom_ports
-    fexcept
-    pthread_h
-    unistd_h
-    wxwidgets
-"
-EXPERIMENT_LIST="
-    fp_mb_stats
-    cdef
-    cdef_singlepass
-    var_tx
-    rect_tx
-    rect_tx_ext
-    tpl_mv
-    dual_filter
-    convolve_round
-    compound_round
-    ext_tx
-    tx64x64
-    ext_intra
-    intra_interp
-    filter_intra
-    intra_edge
-    intrabc
-    interintra
-    wedge
-    compound_segment
-    ext_refs
-    global_motion
-    new_quant
-    supertx
-    ans
-    loop_restoration
-    striped_loop_restoration
-    ext_partition
-    ext_partition_types
-    ext_partition_types_ab
-    unpoison_partition_ctx
-    ext_tile
-    motion_var
-    ncobmc
-    warped_motion
-    q_adapt_probs
-    inter_stats_only
-    palette_delta_encoding
-    rawbits
-    kf_ctx
-    pvq
-    cfl
-    xiphrc
-    dct_only
-    daala_tx
-    daala_dct4
-    daala_dct8
-    daala_dct16
-    daala_dct32
-    daala_dct64
-    cb4x4
-    chroma_2x2
-    chroma_sub8x8
-    frame_size
-    ext_delta_q
-    adapt_scan
-    parallel_deblocking
-    deblock_13tap
-    loopfiltering_across_tiles
-    tempmv_signaling
-    rd_debug
-    reference_buffer
-    coef_interleave
-    entropy_stats
-    masked_tx
-    dependent_horztiles
-    dist_8x8
-    palette_throughput
-    ref_adapt
-    lv_map
-    ctx1d
-    txk_sel
-    mv_compress
-    segment_zeromv
-    frame_superres
-    new_multisymbol
-    compound_singleref
-    aom_qm
-    one_sided_compound
-    ext_comp_refs
-    smooth_hv
-    var_refs
-    lgt
-    lgt_from_pred
-    sbl_symbol
-    ncobmc_adapt_weight
-    bgsprite
-    var_tx_no_tx_mode
-    mrc_tx
-    lpf_direct
-    loopfilter_level
-    no_frame_context_signaling
-    txmg
-    max_tile
-    hash_me
-    colorspace_headers
-    mfmv
-    frame_marker
-    jnt_comp
-    frame_sign_bias
-    ext_skip
-    obu
-    amvr
-    lpf_sb
-    opt_ref_mv
-    tmv
-    restrict_compressed_hdr
-    horzonly_frame_superres
-"
-CONFIG_LIST="
-    dependency_tracking
-    external_build
-    install_docs
-    install_bins
-    install_libs
-    install_srcs
-    debug
-    gprof
-    gcov
-    rvct
-    gcc
-    msvs
-    pic
-    big_endian
-
-    codec_srcs
-    debug_libs
-
-    runtime_cpu_detect
-    postproc
-    multithread
-    internal_stats
-    ${CODECS}
-    ${CODEC_FAMILIES}
-    static_msvcrt
-    spatial_resampling
-    realtime_only
-    shared
-    static
-    small
-    postproc_visualizer
-    os_support
-    unit_tests
-    webm_io
-    libyuv
-    accounting
-    inspection
-    decode_perf_tests
-    encode_perf_tests
-    bitstream_debug
-    symbolrate
-    coefficient_range_checking
-    lowbitdepth
-    highbitdepth
-    experimental
-    size_limit
-    ${EXPERIMENT_LIST}
-    analyzer
-"
-CMDLINE_SELECT="
-    dependency_tracking
-    external_build
-    extra_warnings
-    werror
-    install_docs
-    install_bins
-    install_libs
-    install_srcs
-    debug
-    gprof
-    gcov
-    pic
-    optimizations
-    ccache
-    runtime_cpu_detect
-    thumb
-
-    libs
-    examples
-    analyzer
-    docs
-    tools
-    libc
-    as
-    size_limit
-    codec_srcs
-    debug_libs
-
-    postproc
-    multithread
-    internal_stats
-    ${CODECS}
-    ${CODEC_FAMILIES}
-    static_msvcrt
-    spatial_resampling
-    realtime_only
-    shared
-    static
-    small
-    postproc_visualizer
-    unit_tests
-    webm_io
-    libyuv
-    accounting
-    inspection
-    decode_perf_tests
-    encode_perf_tests
-    coefficient_range_checking
-    bitstream_debug
-    symbolrate
-    lowbitdepth
-    highbitdepth
-    experimental
-    adopted_experiments
-    colorspace_headers
-"
-
-process_cmdline() {
-    for opt do
-        optval="${opt#*=}"
-        case "$opt" in
-        --disable-codecs)
-          for c in ${CODEC_FAMILIES}; do disable_codec $c; done
-          ;;
-        --enable-?*|--disable-?*)
-        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if is_in ${option} ${EXPERIMENT_LIST}; then
-            if enabled experimental; then
-                ${action}_feature $option
-            else
-                log_echo "Ignoring $opt -- not in experimental mode."
-            fi
-        elif is_in ${option} "${CODECS} ${CODEC_FAMILIES}"; then
-            ${action}_codec ${option}
-        else
-            process_common_cmdline $opt
-        fi
-        ;;
-        *) process_common_cmdline "$opt"
-        ;;
-        esac
-    done
-}
-
-post_process_cmdline() {
-    c=""
-
-    # Enable all detected codecs, if they haven't been disabled
-    for c in ${CODECS}; do soft_enable $c; done
-
-    # Enable the codec family if any component of that family is enabled
-    for c in ${CODECS}; do
-        enabled $c && enable_feature ${c%_*}
-    done
-
-    # Set the {en,de}coders variable if any algorithm in that class is enabled
-    for c in ${CODECS}; do
-        enabled ${c} && enable_feature ${c##*_}s
-    done
-
-    enable_feature cb4x4
-
-    # Enable adopted experiments by default
-    soft_enable adopted_experiments
-    if enabled adopted_experiments; then
-      soft_enable chroma_sub8x8
-      soft_enable rect_tx
-      soft_enable global_motion
-      soft_enable ext_tx
-      soft_enable cdef
-      soft_enable ext_intra
-      soft_enable intra_edge
-      soft_enable mv_compress
-      soft_enable ext_refs
-      soft_enable dual_filter
-      soft_enable motion_var
-      soft_enable warped_motion
-      soft_enable var_tx
-      soft_enable wedge
-      soft_enable compound_segment
-      soft_enable interintra
-      soft_enable one_sided_compound
-      soft_enable convolve_round
-      soft_enable aom_qm
-      soft_enable dist_8x8
-      soft_enable cdef_singlepass
-      soft_enable loop_restoration
-
-      # Provisional adopted
-      soft_enable reference_buffer
-      soft_enable loopfiltering_across_tiles
-      soft_enable palette_throughput
-      soft_enable smooth_hv
-      soft_enable tempmv_signaling
-      soft_enable ext_comp_refs
-      soft_enable ext_delta_q
-      soft_enable parallel_deblocking
-    fi
-
-    # Enable low-bitdepth pixel pipeline by default
-    soft_enable lowbitdepth
-
-    # Enable LBD/HBD txfm consistency tool
-    soft_enable txmg
-
-    # Fix up experiment dependencies
-    enabled pvq && disable_feature chroma_2x2
-    enabled pvq && disable_feature rect_tx
-    enabled pvq && disable_feature ext_tx
-    enabled pvq && disable_feature var_tx
-    enabled pvq && disable_feature highbitdepth
-    enabled pvq && disable_feature lgt
-    enabled pvq && disable_feature mrc_tx
-    enabled lv_map && disable_feature mrc_tx
-    enabled supertx && disable_feature mrc_tx
-    enabled coef_interleave && disable_feature mrc_tx
-    enabled pvq && disable_feature palette_throughput
-    enabled mrc_tx && enable_feature ext_tx
-    enabled mrc_tx && enable_feature var_tx
-    enabled txk_sel && soft_enable lv_map
-    enabled ctx1d && soft_enable lv_map
-    enabled ctx1d && soft_enable ext_tx
-    enabled compound_round && soft_enable convolve_round
-    enabled intra_edge && enable_feature ext_intra
-    enabled chroma_2x2 && disable_feature chroma_sub8x8
-    enabled chroma_sub8x8 && enable_feature cb4x4
-    enabled ncobmc_adapt_weight && enable_feature motion_var
-    enabled bgsprite && enable_feature global_motion
-    enabled ext_comp_refs && enable_feature ext_refs
-    enabled ext_comp_refs && enable_feature one_sided_compound
-    enabled rect_tx_ext && enable_feature rect_tx
-    enabled lgt_from_pred && enable_feature ext_tx
-    enabled lgt_from_pred && disable_feature mrc_tx
-    enabled cfl && enable_feature smooth_hv
-    enabled cdef_singlepass && enable_feature cdef
-    enabled new_multisymbol && enable_feature restrict_compressed_hdr
-    enabled mfmv && enable_feature frame_marker
-    enabled jnt_comp && enable_feature frame_marker
-    enabled frame_sign_bias && enable_feature frame_marker
-    enabled txmg && enable_feature highbitdepth
-    enabled ext_skip && enable_feature frame_marker
-    enabled ext_skip && enable_feature ext_refs
-    enabled horzonly_frame_superres && enable_feature frame_superres
-
-    if enabled rawbits && enabled ans; then
-      log_echo "rawbits requires not ans, so disabling rawbits"
-      disable_feature rawbits
-    fi
-    if enabled daala_tx; then
-      enable_feature daala_dct4
-      enable_feature daala_dct8
-      enable_feature daala_dct16
-      enable_feature daala_dct32
-      enable_feature daala_dct64
-    fi
-    if enabled daala_dct64 && ! enabled tx64x64; then
-      log_echo "daala_dct64 requires tx64x64, so disabling daala_dct64"
-      disable_feature daala_dct64
-    fi
-    if enabled daala_dct4 || enabled daala_dct8 || enabled daala_dct16 ||
-        enabled daala_dct32 || enabled daala_dct64; then
-      disable_feature lgt
-      enable_feature lowbitdepth
-    fi
-    if enabled var_tx_no_tx_mode && ! enabled var_tx; then
-      log_echo "var_tx_no_tx_mode requires var_tx, so disabling var_tx_no_tx_mode"
-      disable_feature var_tx_no_tx_mode
-    fi
-    if enabled ext_partition_types; then
-      if enabled fp_mb_stats; then
-        log_echo "ext_partition_types not compatible with fp_mb_stats;"
-        log_echo "disabling fp_mb_stats"
-        disable_feature fp_mb_stats
-      fi
-      if enabled supertx; then
-        log_echo "ext_partition_types not compatible with supertx;"
-        log_echo "disabling supertx"
-        disable_feature supertx
-      fi
-      if ! enabled rect_tx; then
-        log_echo "ext_partition_types requires rect_tx;"
-        log_echo "enabling rect_tx;"
-        enable_feature rect_tx
-      fi
-    fi
-    # Enable accounting and inspection when building the analyzer
-    if enabled analyzer; then
-      soft_enable accounting
-      soft_enable inspection
-    fi
-    # Enable hash_me if amvr is enabled
-    if enabled amvr; then
-      log_echo "amvr requires hash_me"
-      enable_feature hash_me
-    fi
-
-    if enabled striped_loop_restoration && ! enabled loop_restoration ; then
-      log_echo "striped_loop_restoration requires loop_restoration"
-      log_echo "enable loop_restoration"
-      enable_feature loop_restoration
-    fi
-    if enabled striped_loop_restoration && enabled frame_superres ; then
-      log_echo "striped_loop_restoration not compatible with frame_superres"
-      log_echo "disabling striped_loop_restoration"
-      disable_feature striped_loop_restoration
-    fi
-}
-
-process_targets() {
-    enabled child || write_common_config_banner
-    write_common_target_config_h ${BUILD_PFX}aom_config.h
-    write_common_config_targets
-
-    # Calculate the default distribution name, based on the enabled features
-    cf=""
-    DIST_DIR=aom
-    for cf in $CODEC_FAMILIES; do
-        if enabled ${cf}_encoder && enabled ${cf}_decoder; then
-            DIST_DIR="${DIST_DIR}-${cf}"
-        elif enabled ${cf}_encoder; then
-            DIST_DIR="${DIST_DIR}-${cf}cx"
-        elif enabled ${cf}_decoder; then
-            DIST_DIR="${DIST_DIR}-${cf}dx"
-        fi
-    done
-    enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
-    enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
-    ! enabled postproc && DIST_DIR="${DIST_DIR}-nopost"
-    ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
-    ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
-    DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
-    case "${tgt_os}" in
-    win*) enabled static_msvcrt && DIST_DIR="${DIST_DIR}mt" || DIST_DIR="${DIST_DIR}md"
-          DIST_DIR="${DIST_DIR}-${tgt_cc}"
-          ;;
-    esac
-    if [ -f "${source_path}/build/make/version.sh" ]; then
-        ver=`"$source_path/build/make/version.sh" --bare "$source_path"`
-        DIST_DIR="${DIST_DIR}-${ver}"
-        VERSION_STRING=${ver}
-        ver=${ver%%-*}
-        VERSION_PATCH=${ver##*.}
-        ver=${ver%.*}
-        VERSION_MINOR=${ver##*.}
-        ver=${ver#v}
-        VERSION_MAJOR=${ver%.*}
-    fi
-    enabled child || cat <<EOF >> config.mk
-
-PREFIX=${prefix}
-ifeq (\$(MAKECMDGOALS),dist)
-DIST_DIR?=${DIST_DIR}
-else
-DIST_DIR?=\$(DESTDIR)${prefix}
-endif
-LIBSUBDIR=${libdir##${prefix}/}
-
-VERSION_STRING=${VERSION_STRING}
-
-VERSION_MAJOR=${VERSION_MAJOR}
-VERSION_MINOR=${VERSION_MINOR}
-VERSION_PATCH=${VERSION_PATCH}
-
-CONFIGURE_ARGS=${CONFIGURE_ARGS}
-EOF
-    enabled child || echo "CONFIGURE_ARGS?=${CONFIGURE_ARGS}" >> config.mk
-
-    #
-    # Write makefiles for all enabled targets
-    #
-    for tgt in libs examples docs tools solution; do
-        tgt_fn="$tgt-$toolchain.mk"
-
-        if enabled $tgt; then
-            echo "Creating makefiles for ${toolchain} ${tgt}"
-            write_common_target_config_mk $tgt_fn ${BUILD_PFX}aom_config.h
-            #write_${tgt}_config
-        fi
-    done
-
-}
-
-process_detect() {
-    if enabled shared; then
-        # Can only build shared libs on a subset of platforms. Doing this check
-        # here rather than at option parse time because the target auto-detect
-        # magic happens after the command line has been parsed.
-        case "${tgt_os}" in
-        linux|os2|darwin*|iphonesimulator*)
-            # Supported platforms
-            ;;
-        *)
-            if enabled gnu; then
-                echo "--enable-shared is only supported on ELF; assuming this is OK"
-            else
-                die "--enable-shared only supported on ELF, OS/2, and Darwin for now"
-            fi
-            ;;
-        esac
-    fi
-    if [ -z "$CC" ] || enabled external_build; then
-        echo "Bypassing toolchain for environment detection."
-        enable_feature external_build
-        check_header() {
-            log fake_check_header "$@"
-            header=$1
-            shift
-            var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
-            disable_feature $var
-            # Headers common to all environments
-            case $header in
-                stdio.h)
-                    true;
-                ;;
-                *)
-                    result=false
-                    for d in "$@"; do
-                        [ -f "${d##-I}/$header" ] && result=true && break
-                    done
-                    ${result:-true}
-            esac && enable_feature $var
-
-            # Specialize windows and POSIX environments.
-            case $toolchain in
-                *-win*-*)
-                    # Don't check for any headers in Windows builds.
-                    false
-                ;;
-                *)
-                    case $header in
-                        pthread.h) true;;
-                        unistd.h) true;;
-                        *) false;;
-                    esac && enable_feature $var
-            esac
-            enabled $var
-        }
-        check_ld() {
-            true
-        }
-    fi
-    check_header stdio.h || die "Unable to invoke compiler: ${CC} ${CFLAGS}"
-    check_ld <<EOF || die "Toolchain is unable to link executables"
-int main(void) {return 0;}
-EOF
-    # check system headers
-    check_header pthread.h
-    check_header unistd.h # for sysconf(3) and friends.
-
-    check_header aom/aom_integer.h -I${source_path} && enable_feature aom_ports
-
-    check_ld <<EOF && enable_feature fexcept
-#define _GNU_SOURCE
-#include <fenv.h>
-int main(void) { (void)feenableexcept(FE_DIVBYZERO | FE_INVALID); return 0; }
-EOF
-}
-
-process_toolchain() {
-    process_common_toolchain
-
-    # Enable some useful compiler flags
-    if enabled gcc; then
-        enabled werror && check_add_cflags -Werror
-        check_add_cflags -Wall
-        check_add_cflags -Wdisabled-optimization
-        check_add_cflags -Wfloat-conversion
-        check_add_cflags -Wpointer-arith
-        check_add_cflags -Wtype-limits
-        check_add_cflags -Wvla
-        check_add_cflags -Wimplicit-function-declaration
-        check_add_cflags -Wuninitialized
-        check_add_cflags -Wunused
-        check_add_cflags -Wsign-compare
-        check_add_cflags -Wstring-conversion
-        check_add_cflags -Wlogical-op
-        check_add_cflags -Wstack-usage=320000
-        # Enabling the following warning (in combination with -Wunused above)
-        # for C++ generates errors in third_party code including googletest and
-        # libyuv. So enable it only for C code.
-        check_cflags "-Wextra" && add_cflags_only "-Wextra"
-        # Enabling the following warning for C++ generates some useless warnings
-        # about some function parameters shadowing class member function names.
-        # So, only enable this warning for C code.
-        check_cflags "-Wshadow" && add_cflags_only "-Wshadow"
-        if enabled mips || [ -z "${INLINE}" ]; then
-          enabled extra_warnings || check_add_cflags -Wno-unused-function
-        fi
-        # gtest makes heavy use of undefined pre-processor symbols
-        check_cflags "-Wundef" && add_cflags_only "-Wundef"
-        # Avoid this warning for third_party C++ sources. Some reorganization
-        # would be needed to apply this only to test/*.cc.
-        check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32
-    fi
-
-    if enabled icc; then
-        enabled werror && check_add_cflags -Werror
-        check_add_cflags -Wall
-        check_add_cflags -Wpointer-arith
-
-        # ICC has a number of floating point optimizations that we disable
-        # in favor of deterministic output WRT to other compilers
-        add_cflags -fp-model precise
-    fi
-
-    if enabled analyzer; then
-      soft_enable wxwidgets
-      if ! wx-config --version > /dev/null; then
-        die "Couldn't find wx-config"
-      fi
-
-      add_cxxflags_only $(wx-config --cppflags)
-      add_extralibs $(wx-config --libs)
-    fi
-
-    # Enable extra, harmless warnings. These might provide additional insight
-    # to what the compiler is doing and why, but in general, but they shouldn't
-    # be treated as fatal, even if we're treating warnings as errors.
-    GCC_EXTRA_WARNINGS="
-        -Wdisabled-optimization
-        -Winline
-    "
-    enabled gcc && EXTRA_WARNINGS="${GCC_EXTRA_WARNINGS}"
-    RVCT_EXTRA_WARNINGS="
-        --remarks
-    "
-    enabled rvct && EXTRA_WARNINGS="${RVCT_EXTRA_WARNINGS}"
-    if enabled extra_warnings; then
-        for w in ${EXTRA_WARNINGS}; do
-            check_add_cflags ${w}
-            enabled gcc && enabled werror && check_add_cflags -Wno-error=${w}
-        done
-    fi
-
-    # ccache only really works on gcc toolchains
-    enabled gcc || soft_disable ccache
-
-    # Enable the postbuild target if building for visual studio.
-    case "$tgt_cc" in
-        vs*) enable_feature msvs
-             enable_feature solution
-             vs_version=${tgt_cc##vs}
-             VCPROJ_SFX=vcxproj
-             gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
-             enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
-             all_targets="${all_targets} solution"
-             INLINE="__forceinline"
-        ;;
-    esac
-
-    # Other toolchain specific defaults
-    case $toolchain in x86*) soft_enable postproc;; esac
-
-    if enabled postproc_visualizer; then
-        enabled postproc || die "postproc_visualizer requires postproc to be enabled"
-    fi
-
-    # Enable unit tests by default if we have a working C++ compiler.
-    case "$toolchain" in
-        *-vs*)
-            soft_enable unit_tests
-            soft_enable webm_io
-            soft_enable libyuv
-        ;;
-        *-android-*)
-            soft_enable webm_io
-            soft_enable libyuv
-            # GTestLog must be modified to use Android logging utilities.
-        ;;
-        *-darwin-*)
-            # iOS/ARM builds do not work with gtest. This does not match
-            # x86 targets.
-        ;;
-        *-iphonesimulator-*)
-            soft_enable webm_io
-            soft_enable libyuv
-        ;;
-        *-win*)
-            # Some mingw toolchains don't have pthread available by default.
-            # Treat these more like visual studio where threading in gtest
-            # would be disabled for the same reason.
-            check_cxx "$@" <<EOF && soft_enable unit_tests
-int z;
-EOF
-            check_cxx "$@" <<EOF && soft_enable webm_io
-int z;
-EOF
-            check_cxx "$@" <<EOF && soft_enable libyuv
-int z;
-EOF
-        ;;
-        *)
-            enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests
-int z;
-EOF
-            check_cxx "$@" <<EOF && soft_enable webm_io
-int z;
-EOF
-            check_cxx "$@" <<EOF && soft_enable libyuv
-int z;
-EOF
-        ;;
-    esac
-    # libwebm needs to be linked with C++ standard library
-    enabled webm_io && LD=${CXX}
-
-    # append any user defined extra cflags
-    if [ -n "${extra_cflags}" ] ; then
-        check_add_cflags ${extra_cflags} || \
-        die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
-    fi
-    if [ -n "${extra_cxxflags}" ]; then
-        check_add_cxxflags ${extra_cxxflags} || \
-        die "Requested extra CXXFLAGS '${extra_cxxflags}' not supported by compiler"
-    fi
-}
-
-
-##
-## END APPLICATION SPECIFIC CONFIGURATION
-##
-CONFIGURE_ARGS="$@"
-process "$@"
-print_webm_license ${BUILD_PFX}aom_config.c "/*" " */"
-cat <<EOF >> ${BUILD_PFX}aom_config.c
-#include "aom/aom_codec.h"
-static const char* const cfg = "$CONFIGURE_ARGS";
-const char *aom_codec_build_config(void) {return cfg;}
-EOF
diff --git a/third_party/aom/docs.cmake b/third_party/aom/docs.cmake
index 7fc75635a..fd8d02ce6 100644
--- a/third_party/aom/docs.cmake
+++ b/third_party/aom/docs.cmake
@@ -1,14 +1,16 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_DOCS_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_DOCS_CMAKE_)
+  return()
+endif() # AOM_DOCS_CMAKE_
 set(AOM_DOCS_CMAKE_ 1)
 
 cmake_minimum_required(VERSION 3.5)
@@ -18,126 +20,138 @@ set(AOM_DOXYGEN_CONFIG_TEMPLATE "libs.doxy_template")
 set(AOM_DOXYGEN_OUTPUT_DIR "${AOM_CONFIG_DIR}/dox")
 set(AOM_DOXYGEN_SECTIONS "av1")
 
-set(AOM_DOXYGEN_SOURCES
-    "${AOM_ROOT}/aom/aom.h"
-    "${AOM_ROOT}/aom/aom_codec.h"
-    "${AOM_ROOT}/aom/aom_frame_buffer.h"
-    "${AOM_ROOT}/aom/aom_image.h"
-    "${AOM_ROOT}/aom/aom_integer.h"
-    "${AOM_ROOT}/keywords.dox"
-    "${AOM_ROOT}/mainpage.dox"
-    "${AOM_ROOT}/usage.dox")
-
-if (CONFIG_AV1_DECODER)
-  set(AOM_DOXYGEN_EXAMPLE_SOURCES
-      ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-      "${AOM_ROOT}/aomdec.c"
-      "${AOM_ROOT}/examples/decode_to_md5.c"
+set(AOM_DOXYGEN_SOURCES "${AOM_ROOT}/aom/aom.h" "${AOM_ROOT}/aom/aom_codec.h"
+    "${AOM_ROOT}/aom/aom_frame_buffer.h" "${AOM_ROOT}/aom/aom_image.h"
+    "${AOM_ROOT}/aom/aom_integer.h" "${AOM_ROOT}/keywords.dox"
+    "${AOM_ROOT}/mainpage.dox" "${AOM_ROOT}/usage.dox")
+
+if(CONFIG_AV1_DECODER)
+  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+      "${AOM_ROOT}/apps/aomdec.c" "${AOM_ROOT}/examples/decode_to_md5.c"
       "${AOM_ROOT}/examples/decode_with_drops.c"
       "${AOM_ROOT}/examples/simple_decoder.c")
 
-  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS
-      ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-      "Full featured decoder."
-      "Frame by frame MD5 checksum."
-      "Drops frames while decoding."
-      "Simplified decoder loop.")
+  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+      "Full featured decoder." "Frame by frame MD5 checksum."
+      "Drops frames while decoding." "Simplified decoder loop.")
 
   set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_decoder decoder")
 
-  set(AOM_DOXYGEN_SOURCES
-      ${AOM_DOXYGEN_SOURCES}
-      "${AOM_ROOT}/aom/aom_decoder.h"
-      "${AOM_ROOT}/aom/aomdx.h"
+  set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES}
+      "${AOM_ROOT}/aom/aom_decoder.h" "${AOM_ROOT}/aom/aomdx.h"
       "${AOM_ROOT}/usage_dx.dox")
 
-  if (CONFIG_ANALYZER)
-    set(AOM_DOXYGEN_EXAMPLE_SOURCES
-        ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+  if(CONFIG_ANALYZER)
+    set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
         "${AOM_ROOT}/examples/analyzer.cc")
 
-    set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS
-        ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+    set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
         "Bitstream analyzer.")
-  endif ()
+  endif()
 
-  if (CONFIG_INSPECTION)
-     set(AOM_DOXYGEN_EXAMPLE_SOURCES
-         ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-         "${AOM_ROOT}/examples/inspect.c")
+  if(CONFIG_INSPECTION)
+    set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+        "${AOM_ROOT}/examples/inspect.c")
 
-    set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS
-        ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+    set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
         "Bitstream inspector.")
-  endif ()
-endif ()
-
-if (CONFIG_AV1_ENCODER)
-  set(AOM_DOXYGEN_EXAMPLE_SOURCES
-      ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-      "${AOM_ROOT}/aomenc.c"
-      "${AOM_ROOT}/examples/lossless_encoder.c"
-      "${AOM_ROOT}/examples/set_maps.c"
-      "${AOM_ROOT}/examples/simple_encoder.c"
+  endif()
+endif()
+
+if(CONFIG_AV1_ENCODER)
+  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+      "${AOM_ROOT}/apps/aomenc.c" "${AOM_ROOT}/examples/lossless_encoder.c"
+      "${AOM_ROOT}/examples/set_maps.c" "${AOM_ROOT}/examples/simple_encoder.c"
       "${AOM_ROOT}/examples/twopass_encoder.c")
 
-  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS
-      ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
-      "Full featured encoder."
-      "Simplified lossless encoder."
-      "Set active and ROI maps."
-      "Simplified encoder loop."
+  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+      "Full featured encoder." "Simplified lossless encoder."
+      "Set active and ROI maps." "Simplified encoder loop."
       "Two-pass encoder loop.")
 
+  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+      "${AOM_ROOT}/examples/scalable_encoder.c")
+
+  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+      "Scalable encoder loop.")
+
   set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_encoder encoder")
 
-  set(AOM_DOXYGEN_SOURCES
-      ${AOM_DOXYGEN_SOURCES}
-      "${AOM_ROOT}/aom/aomcx.h"
-      "${AOM_ROOT}/aom/aom_encoder.h"
-      "${AOM_ROOT}/usage_cx.dox")
-endif ()
+  set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomcx.h"
+      "${AOM_ROOT}/aom/aom_encoder.h" "${AOM_ROOT}/usage_cx.dox")
+endif()
 
-if (CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
-  set(AOM_DOXYGEN_EXAMPLE_SOURCES
-      ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
       "${AOM_ROOT}/examples/aom_cx_set_ref.c")
 
-  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS
-      ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
       "Set encoder reference frame.")
-endif ()
+endif()
+
+if(CONFIG_AV1_ENCODER)
+  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+      "${AOM_ROOT}/examples/lightfield_encoder.c")
+
+  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+      "Lightfield encoder example.")
+endif()
+
+if(CONFIG_AV1_DECODER)
+  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+      "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c")
+
+  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+      "Lightfield tile list decoder example.")
+endif()
+
+if(CONFIG_AV1_DECODER)
+  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+      "${AOM_ROOT}/examples/lightfield_decoder.c")
+
+  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+      "Lightfield decoder example.")
+endif()
+
+if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+      "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c")
+
+  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+      "Lightfield bitstream parsing example.")
+endif()
 
 # Iterates over list named by $list_name and appends each item to $AOM_DOXYFILE
 # as values assigned to $var_name with no line breaks between list items.
 # Appends a new line after the entire config variable is expanded.
-function (write_cmake_list_to_doxygen_config_var var_name list_name)
+function(write_cmake_list_to_doxygen_config_var var_name list_name)
   unset(output_string)
-  foreach (list_item ${${list_name}})
+  foreach(list_item ${${list_name}})
     set(output_string "${output_string} ${list_item} ")
-  endforeach ()
+  endforeach()
   string(STRIP "${output_string}" output_string)
   file(APPEND "${AOM_DOXYFILE}" "${var_name} += ${output_string}\n")
-endfunction ()
+endfunction()
+
+function(get_name file_path name_var)
+  get_filename_component(file_basename ${file_path} NAME)
+  get_filename_component(${name_var} ${file_basename} NAME_WE)
+  set(${name_var} ${${name_var}} PARENT_SCOPE)
+endfunction()
 
-function (get_name file_path name_var)
-    get_filename_component(file_basename ${file_path} NAME)
-    get_filename_component(${name_var} ${file_basename} NAME_WE)
-    set(${name_var} ${${name_var}} PARENT_SCOPE)
-endfunction ()
+function(setup_documentation_targets)
 
-function (setup_documentation_targets)
   # Sanity check: the lengths of these lists must match.
   list(LENGTH AOM_DOXYGEN_EXAMPLE_SOURCES num_sources)
   list(LENGTH AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS num_descs)
-  if (NOT ${num_sources} EQUAL ${num_descs})
+  if(NOT ${num_sources} EQUAL ${num_descs})
     message(FATAL_ERROR "Unqeual example and description totals.")
-  endif ()
+  endif()
 
   # Take the list of examples and produce example_basename.dox for each file in
   # the list.
   file(MAKE_DIRECTORY "${AOM_DOXYGEN_OUTPUT_DIR}")
-  foreach (example_file ${AOM_DOXYGEN_EXAMPLE_SOURCES})
+  foreach(example_file ${AOM_DOXYGEN_EXAMPLE_SOURCES})
     unset(example_basename)
     get_name("${example_file}" "example_name")
     set(example_dox "${AOM_DOXYGEN_OUTPUT_DIR}/${example_name}.dox")
@@ -145,25 +159,29 @@ function (setup_documentation_targets)
     set(dox_string "${dox_string} \\includelineno ${example_file}\n*/\n")
     file(WRITE "${example_dox}" ${dox_string})
     set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${example_dox}")
-  endforeach ()
+  endforeach()
 
   # Generate samples.dox, an index page that refers to the example_basename.dox
   # files that were just created.
-  set(samples_header
-"
+  set(
+    samples_header
+    "
 /*!\\page samples Sample Code
 This SDK includes a number of sample applications. Each sample documents a
 feature of the SDK in both prose and the associated C code. The following
 samples are included:
-")
-
-  set(utils_desc
 "
+    )
+
+  set(
+    utils_desc
+    "
 In addition, the SDK contains a number of utilities. Since these utilities are
 built upon the concepts described in the sample code listed above, they are not
 documented in pieces like the samples are. Their source is included here for
 reference. The following utilities are included:
-")
+"
+    )
 
   # Write the description for the samples section.
   set(samples_dox "${AOM_CONFIG_DIR}/samples.dox")
@@ -173,28 +191,28 @@ reference. The following utilities are included:
   # $AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS and massage example names as required by
   # AV1's doxygen setup.
   math(EXPR max_example_index "${num_sources} - 1")
-  foreach (NUM RANGE ${max_example_index})
+  foreach(NUM RANGE ${max_example_index})
     list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${NUM} ex_name)
     get_name("${ex_name}" "ex_name")
 
     # AV1's doxygen lists aomdec and aomenc as utils apart from the examples.
     # Save the indexes for another pass.
-    if ("${ex_name}" MATCHES "aomdec\|aomenc")
+    if("${ex_name}" MATCHES "aomdec\|aomenc")
       set(util_indexes "${util_indexes}" "${NUM}")
       continue()
-    endif ()
+    endif()
     list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${NUM} ex_desc)
     file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n")
-  endforeach ()
+  endforeach()
 
   # Write the description and index for the utils.
   file(APPEND "${samples_dox}" "${utils_desc}\n")
-  foreach (util_index ${util_indexes})
+  foreach(util_index ${util_indexes})
     list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${util_index} ex_name)
     get_name("${ex_name}" "ex_name")
     list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${util_index} ex_desc)
     file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n")
-  endforeach ()
+  endforeach()
   file(APPEND "${samples_dox}" "*/")
 
   # Add $samples_dox to the doxygen inputs.
@@ -206,11 +224,11 @@ reference. The following utilities are included:
   file(READ "${AOM_ROOT}/${AOM_DOXYGEN_CONFIG_TEMPLATE}" doxygen_template_data)
   file(APPEND "${AOM_DOXYFILE}" ${doxygen_template_data})
   file(APPEND "${AOM_DOXYFILE}"
-       "EXAMPLE_PATH += ${AOM_ROOT} ${AOM_ROOT}/examples\n")
-  file(APPEND
-       "${AOM_DOXYFILE}" "INCLUDE_PATH += ${AOM_CONFIG_DIR} ${AOM_ROOT}\n")
+              "EXAMPLE_PATH += ${AOM_ROOT} ${AOM_ROOT}/examples\n")
   file(APPEND "${AOM_DOXYFILE}"
-       "STRIP_FROM_PATH += ${AOM_ROOT} ${AOM_CONFIG_DIR}\n")
+              "INCLUDE_PATH += ${AOM_CONFIG_DIR} ${AOM_ROOT}\n")
+  file(APPEND "${AOM_DOXYFILE}"
+              "STRIP_FROM_PATH += ${AOM_ROOT} ${AOM_CONFIG_DIR}\n")
   write_cmake_list_to_doxygen_config_var("INPUT" "AOM_DOXYGEN_SOURCES")
   write_cmake_list_to_doxygen_config_var("ENABLED_SECTIONS"
                                          "AOM_DOXYGEN_SECTIONS")
@@ -219,11 +237,9 @@ reference. The following utilities are included:
   add_custom_target(docs ALL
                     COMMAND "${DOXYGEN_EXECUTABLE}" "${AOM_DOXYFILE}"
                     DEPENDS "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES}
-                             ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-                             "${AOM_DOXYGEN_CONFIG_TEMPLATE}"
+                            ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+                            "${AOM_DOXYGEN_CONFIG_TEMPLATE}"
                     SOURCES "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES}
-                             ${AOM_DOXYGEN_EXAMPLE_SOURCES}
-                             "${AOM_DOXYGEN_CONFIG_TEMPLATE}")
-endfunction ()
-
-endif ()  # AOM_DOCS_CMAKE_
+                            ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+                            "${AOM_DOXYGEN_CONFIG_TEMPLATE}")
+endfunction()
diff --git a/third_party/aom/docs.mk b/third_party/aom/docs.mk
deleted file mode 100644
index 0dfc65b75..000000000
--- a/third_party/aom/docs.mk
+++ /dev/null
@@ -1,50 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-
-INSTALL_MAPS += docs/%    docs/%
-INSTALL_MAPS += src/%     %
-INSTALL_MAPS += %         %
-
-# Static documentation authored in doxygen
-CODEC_DOX :=    mainpage.dox \
-		keywords.dox \
-		usage.dox \
-		usage_cx.dox \
-		usage_dx.dox \
-
-# Other doxy files sourced in Markdown
-TXT_DOX = $(call enabled,TXT_DOX)
-
-EXAMPLE_PATH += $(SRC_PATH_BARE) #for CHANGELOG, README, etc
-EXAMPLE_PATH += $(SRC_PATH_BARE)/examples
-
-doxyfile: $(if $(findstring examples, $(ALL_TARGETS)),examples.doxy)
-doxyfile: libs.doxy_template libs.doxy
-	@echo "    [CREATE] $@"
-	@cat $^ > $@
-	@echo "STRIP_FROM_PATH += $(SRC_PATH_BARE) $(BUILD_ROOT)" >> $@
-	@echo "INPUT += $(addprefix $(SRC_PATH_BARE)/,$(CODEC_DOX))" >> $@;
-	@echo "INPUT += $(TXT_DOX)" >> $@;
-	@echo "EXAMPLE_PATH += $(EXAMPLE_PATH)" >> $@
-
-CLEAN-OBJS += doxyfile $(wildcard docs/html/*)
-docs/html/index.html: doxyfile $(CODEC_DOX) $(TXT_DOX)
-	@echo "    [DOXYGEN] $<"
-	@doxygen $<
-DOCS-yes += docs/html/index.html
-
-DIST-DOCS-yes = $(wildcard docs/html/*)
-DIST-DOCS-$(CONFIG_CODEC_SRCS) += $(addprefix src/,$(CODEC_DOX))
-DIST-DOCS-$(CONFIG_CODEC_SRCS) += src/libs.doxy_template
-DIST-DOCS-yes                  += CHANGELOG
-DIST-DOCS-yes                  += README
diff --git a/third_party/aom/examples.mk b/third_party/aom/examples.mk
deleted file mode 100644
index 329932670..000000000
--- a/third_party/aom/examples.mk
+++ /dev/null
@@ -1,403 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \
-                third_party/libyuv/include/libyuv/convert.h \
-                third_party/libyuv/include/libyuv/convert_argb.h \
-                third_party/libyuv/include/libyuv/convert_from.h \
-                third_party/libyuv/include/libyuv/cpu_id.h  \
-                third_party/libyuv/include/libyuv/planar_functions.h  \
-                third_party/libyuv/include/libyuv/rotate.h  \
-                third_party/libyuv/include/libyuv/row.h  \
-                third_party/libyuv/include/libyuv/scale.h  \
-                third_party/libyuv/include/libyuv/scale_row.h  \
-                third_party/libyuv/source/cpu_id.cc \
-                third_party/libyuv/source/planar_functions.cc \
-                third_party/libyuv/source/row_any.cc \
-                third_party/libyuv/source/row_common.cc \
-                third_party/libyuv/source/row_gcc.cc \
-                third_party/libyuv/source/row_mips.cc \
-                third_party/libyuv/source/row_neon.cc \
-                third_party/libyuv/source/row_neon64.cc \
-                third_party/libyuv/source/row_win.cc \
-                third_party/libyuv/source/scale.cc \
-                third_party/libyuv/source/scale_any.cc \
-                third_party/libyuv/source/scale_common.cc \
-                third_party/libyuv/source/scale_gcc.cc \
-                third_party/libyuv/source/scale_mips.cc \
-                third_party/libyuv/source/scale_neon.cc \
-                third_party/libyuv/source/scale_neon64.cc \
-                third_party/libyuv/source/scale_win.cc \
-
-LIBWEBM_COMMON_SRCS += third_party/libwebm/common/hdr_util.cc \
-                       third_party/libwebm/common/hdr_util.h \
-                       third_party/libwebm/common/webmids.h
-
-LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer/mkvmuxer.cc \
-                      third_party/libwebm/mkvmuxer/mkvmuxerutil.cc \
-                      third_party/libwebm/mkvmuxer/mkvwriter.cc \
-                      third_party/libwebm/mkvmuxer/mkvmuxer.h \
-                      third_party/libwebm/mkvmuxer/mkvmuxertypes.h \
-                      third_party/libwebm/mkvmuxer/mkvmuxerutil.h \
-                      third_party/libwebm/mkvparser/mkvparser.h \
-                      third_party/libwebm/mkvmuxer/mkvwriter.h
-
-LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \
-                      third_party/libwebm/mkvparser/mkvreader.cc \
-                      third_party/libwebm/mkvparser/mkvparser.h \
-                      third_party/libwebm/mkvparser/mkvreader.h
-
-# Add compile flags and include path for libwebm sources.
-ifeq ($(CONFIG_WEBM_IO),yes)
-  CXXFLAGS     += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
-  INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm
-endif
-
-# List of examples to build. UTILS are tools meant for distribution
-# while EXAMPLES demonstrate specific portions of the API.
-UTILS-$(CONFIG_AV1_DECODER) += aomdec.c
-aomdec.SRCS                 += md5_utils.c md5_utils.h
-aomdec.SRCS                 += aom_ports/mem_ops.h
-aomdec.SRCS                 += aom_ports/mem_ops_aligned.h
-aomdec.SRCS                 += aom_ports/msvc.h
-aomdec.SRCS                 += aom_ports/aom_timer.h
-aomdec.SRCS                 += aom/aom_integer.h
-aomdec.SRCS                 += args.c args.h
-aomdec.SRCS                 += ivfdec.c ivfdec.h
-aomdec.SRCS                 += tools_common.c tools_common.h
-aomdec.SRCS                 += y4menc.c y4menc.h
-ifeq ($(CONFIG_LIBYUV),yes)
-  aomdec.SRCS                 += $(LIBYUV_SRCS)
-endif
-ifeq ($(CONFIG_WEBM_IO),yes)
-  aomdec.SRCS                 += $(LIBWEBM_COMMON_SRCS)
-  aomdec.SRCS                 += $(LIBWEBM_MUXER_SRCS)
-  aomdec.SRCS                 += $(LIBWEBM_PARSER_SRCS)
-  aomdec.SRCS                 += webmdec.cc webmdec.h
-endif
-aomdec.GUID                  = BA5FE66F-38DD-E034-F542-B1578C5FB950
-aomdec.DESCRIPTION           = Full featured decoder
-UTILS-$(CONFIG_AV1_ENCODER) += aomenc.c
-aomenc.SRCS                 += args.c args.h y4minput.c y4minput.h aomenc.h
-aomenc.SRCS                 += ivfdec.c ivfdec.h
-aomenc.SRCS                 += ivfenc.c ivfenc.h
-aomenc.SRCS                 += rate_hist.c rate_hist.h
-aomenc.SRCS                 += tools_common.c tools_common.h
-aomenc.SRCS                 += examples/encoder_util.h examples/encoder_util.c
-aomenc.SRCS                 += warnings.c warnings.h
-aomenc.SRCS                 += aom_ports/mem_ops.h
-aomenc.SRCS                 += aom_ports/mem_ops_aligned.h
-aomenc.SRCS                 += aom_ports/msvc.h
-aomenc.SRCS                 += aom_ports/aom_timer.h
-aomenc.SRCS                 += aomstats.c aomstats.h
-ifeq ($(CONFIG_LIBYUV),yes)
-  aomenc.SRCS                 += $(LIBYUV_SRCS)
-endif
-ifeq ($(CONFIG_WEBM_IO),yes)
-  aomenc.SRCS                 += $(LIBWEBM_COMMON_SRCS)
-  aomenc.SRCS                 += $(LIBWEBM_MUXER_SRCS)
-  aomenc.SRCS                 += $(LIBWEBM_PARSER_SRCS)
-  aomenc.SRCS                 += webmenc.cc webmenc.h
-endif
-aomenc.GUID                  = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
-aomenc.DESCRIPTION           = Full featured encoder
-
-ifeq ($(CONFIG_ANALYZER),yes)
-  EXAMPLES-$(CONFIG_AV1_DECODER)     += analyzer.cc
-  analyzer.GUID                       = 83827a8c-e3c3-4b19-8832-0cfc206c4496
-  analyzer.SRCS                      += ivfdec.h ivfdec.c
-  analyzer.SRCS                      += av1/decoder/inspection.h
-  analyzer.SRCS                      += av1/decoder/inspection.c
-  analyzer.SRCS                      += video_reader.h video_reader.c
-  analyzer.SRCS                      += tools_common.h tools_common.c
-  analyzer.DESCRIPTION                = Bitstream analyzer
-endif
-
-ifeq ($(CONFIG_INSPECTION),yes)
-EXAMPLES-$(CONFIG_AV1_DECODER) += inspect.c
-inspect.GUID                   = FA46A420-3356-441F-B0FD-60AA1345C181
-inspect.SRCS                   += ivfdec.h ivfdec.c
-inspect.SRCS                   += args.c args.h
-inspect.SRCS                   += tools_common.h tools_common.c
-inspect.SRCS                   += video_common.h
-inspect.SRCS                   += video_reader.h video_reader.c
-inspect.SRCS                   += aom_ports/mem_ops.h
-inspect.SRCS                   += aom_ports/mem_ops_aligned.h
-inspect.SRCS                   += aom_ports/msvc.h
-inspect.DESCRIPTION             = Dump inspection data
-endif
-
-EXAMPLES-$(CONFIG_AV1_DECODER)     += simple_decoder.c
-simple_decoder.GUID                 = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
-simple_decoder.SRCS                += ivfdec.h ivfdec.c
-simple_decoder.SRCS                += tools_common.h tools_common.c
-simple_decoder.SRCS                += video_common.h
-simple_decoder.SRCS                += video_reader.h video_reader.c
-simple_decoder.SRCS                += aom_ports/mem_ops.h
-simple_decoder.SRCS                += aom_ports/mem_ops_aligned.h
-simple_decoder.SRCS                += aom_ports/msvc.h
-simple_decoder.DESCRIPTION          = Simplified decoder loop
-EXAMPLES-$(CONFIG_AV1_DECODER)     += decode_to_md5.c
-decode_to_md5.SRCS                 += md5_utils.h md5_utils.c
-decode_to_md5.SRCS                 += ivfdec.h ivfdec.c
-decode_to_md5.SRCS                 += tools_common.h tools_common.c
-decode_to_md5.SRCS                 += video_common.h
-decode_to_md5.SRCS                 += video_reader.h video_reader.c
-decode_to_md5.SRCS                 += aom_ports/mem_ops.h
-decode_to_md5.SRCS                 += aom_ports/mem_ops_aligned.h
-decode_to_md5.SRCS                 += aom_ports/msvc.h
-decode_to_md5.GUID                  = 59120B9B-2735-4BFE-B022-146CA340FE42
-decode_to_md5.DESCRIPTION           = Frame by frame MD5 checksum
-EXAMPLES-$(CONFIG_AV1_ENCODER)  += simple_encoder.c
-simple_encoder.SRCS             += ivfenc.h ivfenc.c
-simple_encoder.SRCS             += tools_common.h tools_common.c
-simple_encoder.SRCS             += video_common.h
-simple_encoder.SRCS             += video_writer.h video_writer.c
-simple_encoder.SRCS             += aom_ports/msvc.h
-simple_encoder.GUID              = 4607D299-8A71-4D2C-9B1D-071899B6FBFD
-simple_encoder.DESCRIPTION       = Simplified encoder loop
-EXAMPLES-$(CONFIG_AV1_ENCODER)  += lossless_encoder.c
-lossless_encoder.SRCS           += ivfenc.h ivfenc.c
-lossless_encoder.SRCS           += tools_common.h tools_common.c
-lossless_encoder.SRCS           += video_common.h
-lossless_encoder.SRCS           += video_writer.h video_writer.c
-lossless_encoder.SRCS           += aom_ports/msvc.h
-lossless_encoder.GUID            = B63C7C88-5348-46DC-A5A6-CC151EF93366
-lossless_encoder.DESCRIPTION     = Simplified lossless encoder
-EXAMPLES-$(CONFIG_AV1_ENCODER)  += twopass_encoder.c
-twopass_encoder.SRCS            += ivfenc.h ivfenc.c
-twopass_encoder.SRCS            += tools_common.h tools_common.c
-twopass_encoder.SRCS            += video_common.h
-twopass_encoder.SRCS            += video_writer.h video_writer.c
-twopass_encoder.SRCS            += aom_ports/msvc.h
-twopass_encoder.GUID             = 73494FA6-4AF9-4763-8FBB-265C92402FD8
-twopass_encoder.DESCRIPTION      = Two-pass encoder loop
-EXAMPLES-$(CONFIG_AV1_DECODER)  += decode_with_drops.c
-decode_with_drops.SRCS          += ivfdec.h ivfdec.c
-decode_with_drops.SRCS          += tools_common.h tools_common.c
-decode_with_drops.SRCS          += video_common.h
-decode_with_drops.SRCS          += video_reader.h video_reader.c
-decode_with_drops.SRCS          += aom_ports/mem_ops.h
-decode_with_drops.SRCS          += aom_ports/mem_ops_aligned.h
-decode_with_drops.SRCS          += aom_ports/msvc.h
-decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
-decode_with_drops.DESCRIPTION    = Drops frames while decoding
-EXAMPLES-$(CONFIG_AV1_ENCODER)     += set_maps.c
-set_maps.SRCS                      += ivfenc.h ivfenc.c
-set_maps.SRCS                      += tools_common.h tools_common.c
-set_maps.SRCS                      += video_common.h
-set_maps.SRCS                      += video_writer.h video_writer.c
-set_maps.SRCS                      += aom_ports/msvc.h
-set_maps.GUID                       = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
-set_maps.DESCRIPTION                = Set active and ROI maps
-ifeq ($(CONFIG_EXT_TILE),yes)
-EXAMPLES-$(CONFIG_AV1_ENCODER)     += lightfield_encoder.c
-lightfield_encoder.SRCS            += ivfenc.h ivfenc.c
-lightfield_encoder.SRCS            += tools_common.h tools_common.c
-lightfield_encoder.SRCS            += video_common.h
-lightfield_encoder.SRCS            += video_writer.h video_writer.c
-lightfield_encoder.SRCS            += aom_ports/msvc.h
-lightfield_encoder.GUID             = 73494FA6-4AF9-4763-8FBB-265C92402FD8
-lightfield_encoder.DESCRIPTION      = Lightfield encoder loop
-EXAMPLES-$(CONFIG_AV1_DECODER)     += lightfield_decoder.c
-lightfield_decoder.SRCS            += ivfdec.h ivfdec.c
-lightfield_decoder.SRCS            += tools_common.h tools_common.c
-lightfield_decoder.SRCS            += video_common.h
-lightfield_decoder.SRCS            += video_reader.h video_reader.c
-lightfield_decoder.SRCS            += aom_ports/mem_ops.h
-lightfield_decoder.SRCS            += aom_ports/mem_ops_aligned.h
-lightfield_decoder.SRCS            += aom_ports/msvc.h
-lightfield_decoder.GUID             = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
-lightfield_decoder.DESCRIPTION      = lightfield decoder loop
-endif
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-ifeq ($(CONFIG_AV1_DECODER),yes)
-EXAMPLES-$(CONFIG_AV1_ENCODER)     += aom_cx_set_ref.c
-aom_cx_set_ref.SRCS                += ivfenc.h ivfenc.c
-aom_cx_set_ref.SRCS                += tools_common.h tools_common.c
-aom_cx_set_ref.SRCS                += examples/encoder_util.h
-aom_cx_set_ref.SRCS                += examples/encoder_util.c
-aom_cx_set_ref.SRCS                += video_common.h
-aom_cx_set_ref.SRCS                += video_writer.h video_writer.c
-aom_cx_set_ref.SRCS                += aom_ports/msvc.h
-aom_cx_set_ref.GUID                 = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
-aom_cx_set_ref.DESCRIPTION          = AV1 set encoder reference frame
-endif
-endif
-
-# Handle extra library flags depending on codec configuration
-
-# We should not link to math library (libm) on RVCT
-# when building for bare-metal targets
-ifeq ($(CONFIG_OS_SUPPORT), yes)
-CODEC_EXTRA_LIBS-$(CONFIG_AV1)            += m
-else
-    ifeq ($(CONFIG_GCC), yes)
-    CODEC_EXTRA_LIBS-$(CONFIG_AV1)        += m
-    endif
-endif
-#
-# End of specified files. The rest of the build rules should happen
-# automagically from here.
-#
-
-
-# Examples need different flags based on whether we're building
-# from an installed tree or a version controlled tree. Determine
-# the proper paths.
-ifeq ($(HAVE_ALT_TREE_LAYOUT),yes)
-    LIB_PATH-yes := $(SRC_PATH_BARE)/../lib
-    INC_PATH-yes := $(SRC_PATH_BARE)/../include
-else
-    LIB_PATH-yes                     += $(if $(BUILD_PFX),$(BUILD_PFX),.)
-    INC_PATH-$(CONFIG_AV1_DECODER)   += $(SRC_PATH_BARE)/av1
-    INC_PATH-$(CONFIG_AV1_ENCODER)   += $(SRC_PATH_BARE)/av1
-endif
-INC_PATH-$(CONFIG_LIBYUV) += $(SRC_PATH_BARE)/third_party/libyuv/include
-LIB_PATH := $(call enabled,LIB_PATH)
-INC_PATH := $(call enabled,INC_PATH)
-INTERNAL_CFLAGS = $(addprefix -I,$(INC_PATH))
-INTERNAL_LDFLAGS += $(addprefix -L,$(LIB_PATH))
-
-
-# Expand list of selected examples to build (as specified above)
-UTILS           = $(call enabled,UTILS)
-EXAMPLES        = $(addprefix examples/,$(call enabled,EXAMPLES))
-ALL_EXAMPLES    = $(UTILS) $(EXAMPLES)
-UTIL_SRCS       = $(foreach ex,$(UTILS),$($(ex:.c=).SRCS) $($(ex:.cc=).SRCS))
-ALL_SRCS        = $(foreach ex, $(ALL_EXAMPLES),  \
-                      $($(notdir $(ex:.c=)).SRCS) \
-                      $($(notdir $(ex:.cc=)).SRCS))
-CODEC_EXTRA_LIBS=$(sort $(call enabled,CODEC_EXTRA_LIBS))
-
-
-# Expand all example sources into a variable containing all sources
-# for that example (not just them main one specified in UTILS/EXAMPLES)
-# and add this file to the list (for MSVS workspace generation)
-EXAMPLES_C = $(filter-out %.cc, $(ALL_EXAMPLES))
-$(foreach ex,$(EXAMPLES_C), \
-    $(eval $(notdir $(ex:.c=)).SRCS += $(ex) examples.mk))
-EXAMPLES_CXX = $(filter-out %.c, $(ALL_EXAMPLES))
-$(foreach ex,$(EXAMPLES_CXX), \
-    $(eval $(notdir $(ex:.cc=)).SRCS += $(ex) examples.mk))
-
-# Create build/install dependencies for all examples. The common case
-# is handled here. The MSVS case is handled below.
-NOT_MSVS = $(if $(CONFIG_MSVS),,yes)
-DIST-BINS-$(NOT_MSVS)      += $(addprefix bin/,$(EXAMPLES_C:.c=$(EXE_SFX)))
-DIST-BINS-$(NOT_MSVS)      += $(addprefix bin/,$(EXAMPLES_CXX:.cc=$(EXE_SFX)))
-INSTALL-BINS-$(NOT_MSVS)   += $(addprefix bin/,$(UTILS:.c=$(EXE_SFX)))
-DIST-SRCS-yes              += $(ALL_SRCS)
-INSTALL-SRCS-yes           += $(UTIL_SRCS)
-OBJS-$(NOT_MSVS)           += $(call objs,$(ALL_SRCS))
-BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX), \
-                                  $(EXAMPLES_C:.c=$(EXE_SFX)))
-BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX), \
-                                  $(EXAMPLES_CXX:.cc=$(EXE_SFX)))
-
-# Instantiate linker template for all examples.
-CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),aom_g,aom)
-ifneq ($(filter darwin%,$(TGT_OS)),)
-SHARED_LIB_SUF=.dylib
-else
-ifneq ($(filter os2%,$(TGT_OS)),)
-SHARED_LIB_SUF=_dll.a
-else
-SHARED_LIB_SUF=.so
-endif
-endif
-CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a)
-$(foreach bin,$(BINS-yes),\
-    $(eval $(bin):$(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF))\
-    $(eval $(call linker_template,$(bin),\
-        $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \
-        -l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\
-        )))
-
-# The following pairs define a mapping of locations in the distribution
-# tree to locations in the source/build trees.
-INSTALL_MAPS += src/%.c   %.c
-INSTALL_MAPS += src/%     $(SRC_PATH_BARE)/%
-INSTALL_MAPS += bin/%     %
-INSTALL_MAPS += %         %
-
-
-# Set up additional MSVS environment
-ifeq ($(CONFIG_MSVS),yes)
-CODEC_LIB=$(if $(CONFIG_SHARED),aom,$(if $(CONFIG_STATIC_MSVCRT),aommt,aommd))
-# This variable uses deferred expansion intentionally, since the results of
-# $(wildcard) may change during the course of the Make.
-VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d))))
-INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),bin/$(p)/%  $(p)/Release/%)
-endif
-
-# Build Visual Studio Projects. We use a template here to instantiate
-# explicit rules rather than using an implicit rule because we want to
-# leverage make's VPATH searching rather than specifying the paths on
-# each file in ALL_EXAMPLES. This has the unfortunate side effect that
-# touching the source files trigger a rebuild of the project files
-# even though there is no real dependency there (the dependency is on
-# the makefiles). We may want to revisit this.
-define vcproj_template
-$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) aom.$(VCPROJ_SFX)
-	$(if $(quiet),@echo "    [vcproj] $$@")
-	$(qexec)$$(GEN_VCPROJ)\
-            --exe\
-            --target=$$(TOOLCHAIN)\
-            --name=$$(@:.$(VCPROJ_SFX)=)\
-            --ver=$$(CONFIG_VS_VERSION)\
-            --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
-            --src-path-bare="$(SRC_PATH_BARE)" \
-            $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
-            $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) $$^
-endef
-ALL_EXAMPLES_BASENAME := $(notdir $(ALL_EXAMPLES))
-PROJECTS-$(CONFIG_MSVS) += $(ALL_EXAMPLES_BASENAME:.c=.$(VCPROJ_SFX))
-INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\
-                               $(addprefix bin/$(p)/,$(ALL_EXAMPLES_BASENAME:.c=.exe)))
-$(foreach proj,$(call enabled,PROJECTS),\
-    $(eval $(call vcproj_template,$(proj))))
-
-#
-# Documentation Rules
-#
-%.dox: %.c
-	@echo "    [DOXY] $@"
-	@mkdir -p $(dir $@)
-	@echo "/*!\page example_$(@F:.dox=) $(@F:.dox=)" > $@
-	@echo "   \includelineno $(<F)" >> $@
-	@echo "*/" >> $@
-
-samples.dox: examples.mk
-	@echo "    [DOXY] $@"
-	@echo "/*!\page samples Sample Code" > $@
-	@echo "    This SDK includes a number of sample applications."\
-	      "Each sample documents a feature of the SDK in both prose"\
-	      "and the associated C code."\
-	      "The following samples are included: ">>$@
-	@$(foreach ex,$(sort $(notdir $(EXAMPLES:.c=))),\
-	   echo "     - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
-	@echo >> $@
-	@echo "    In addition, the SDK contains a number of utilities."\
-              "Since these utilities are built upon the concepts described"\
-              "in the sample code listed above, they are not documented in"\
-              "pieces like the samples are. Their source is included here"\
-              "for reference. The following utilities are included:" >> $@
-	@$(foreach ex,$(sort $(UTILS:.c=)),\
-	   echo "     - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
-	@echo "*/" >> $@
-
-CLEAN-OBJS += examples.doxy samples.dox $(ALL_EXAMPLES:.c=.dox)
-DOCS-yes += examples.doxy samples.dox
-examples.doxy: samples.dox $(ALL_EXAMPLES:.c=.dox)
-	@echo "INPUT += $^" > $@
diff --git a/third_party/aom/examples/analyzer.cc b/third_party/aom/examples/analyzer.cc
index ae0801964..6a42eca24 100644
--- a/third_party/aom/examples/analyzer.cc
+++ b/third_party/aom/examples/analyzer.cc
@@ -12,13 +12,14 @@
 #include <wx/aboutdlg.h>
 #include <wx/cmdline.h>
 #include <wx/dcbuffer.h>
-#include "./tools_common.h"
-#include "./video_reader.h"
+
 #include "aom/aom_decoder.h"
 #include "aom/aomdx.h"
-#include "av1/decoder/accounting.h"
 #include "av1/common/onyxc_int.h"
+#include "av1/decoder/accounting.h"
 #include "av1/decoder/inspection.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
 
 #define OD_SIGNMASK(a) (-((a) < 0))
 #define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
@@ -108,7 +109,7 @@ bool AV1Decoder::step() {
     size_t frame_size;
     const unsigned char *frame_data;
     frame_data = aom_video_reader_get_frame(reader, &frame_size);
-    if (aom_codec_decode(&codec, frame_data, frame_size, NULL, 0)) {
+    if (aom_codec_decode(&codec, frame_data, frame_size, NULL)) {
       fprintf(stderr, "Failed to decode frame.");
       return false;
     } else {
@@ -129,11 +130,10 @@ int AV1Decoder::getWidth() const {
 }
 
 int AV1Decoder::getWidthPadding() const {
-  return show_padding
-             ? AOMMAX(info->frame_width + 16,
-                      ALIGN_POWER_OF_TWO(info->frame_width, 6)) -
-                   info->frame_width
-             : 0;
+  return show_padding ? AOMMAX(info->frame_width + 16,
+                               ALIGN_POWER_OF_TWO(info->frame_width, 6)) -
+                            info->frame_width
+                      : 0;
 }
 
 int AV1Decoder::getHeight() const {
@@ -141,11 +141,10 @@ int AV1Decoder::getHeight() const {
 }
 
 int AV1Decoder::getHeightPadding() const {
-  return show_padding
-             ? AOMMAX(info->frame_height + 16,
-                      ALIGN_POWER_OF_TWO(info->frame_height, 6)) -
-                   info->frame_height
-             : 0;
+  return show_padding ? AOMMAX(info->frame_height + 16,
+                               ALIGN_POWER_OF_TWO(info->frame_height, 6)) -
+                            info->frame_height
+                      : 0;
 }
 
 bool AV1Decoder::getAccountingStruct(Accounting **accounting) {
@@ -285,19 +284,22 @@ void AnalyzerPanel::render() {
       cbval = ((pmask & OD_CB_MASK) >> 1) * (cbval - 128);
       crval = ((pmask & OD_CR_MASK) >> 2) * (crval - 128);
       /*This is intentionally slow and very accurate.*/
-      rval = OD_CLAMPI(0, (int32_t)OD_DIV_ROUND(
-                              2916394880000LL * yval + 4490222169144LL * crval,
-                              9745792000LL),
-                       65535);
-      gval = OD_CLAMPI(0, (int32_t)OD_DIV_ROUND(2916394880000LL * yval -
-                                                    534117096223LL * cbval -
-                                                    1334761232047LL * crval,
-                                                9745792000LL),
-                       65535);
-      bval = OD_CLAMPI(0, (int32_t)OD_DIV_ROUND(
-                              2916394880000LL * yval + 5290866304968LL * cbval,
-                              9745792000LL),
+      rval = OD_CLAMPI(
+          0,
+          (int32_t)OD_DIV_ROUND(
+              2916394880000LL * yval + 4490222169144LL * crval, 9745792000LL),
+          65535);
+      gval = OD_CLAMPI(0,
+                       (int32_t)OD_DIV_ROUND(2916394880000LL * yval -
+                                                 534117096223LL * cbval -
+                                                 1334761232047LL * crval,
+                                             9745792000LL),
                        65535);
+      bval = OD_CLAMPI(
+          0,
+          (int32_t)OD_DIV_ROUND(
+              2916394880000LL * yval + 5290866304968LL * cbval, 9745792000LL),
+          65535);
       unsigned char *px_row = p;
       for (int v = 0; v < zoom; v++) {
         unsigned char *px = px_row;
@@ -701,8 +703,8 @@ bool Analyzer::OnCmdLineParsed(wxCmdLineParser &parser) {  // NOLINT
   bool bit_accounting = parser.Found(_("a"));
   if (bit_accounting && !CONFIG_ACCOUNTING) {
     fprintf(stderr,
-            "Bit accounting support not found.  "
-            "Recompile with:\n./configure --enable-accounting\n");
+            "Bit accounting support not found. "
+            "Recompile with:\n./cmake -DCONFIG_ACCOUNTING=1\n");
     return false;
   }
   frame = new AnalyzerFrame(parser.Found(_("a")));
diff --git a/third_party/aom/examples/aom_cx_set_ref.c b/third_party/aom/examples/aom_cx_set_ref.c
index 456e81300..e02e94c07 100644
--- a/third_party/aom/examples/aom_cx_set_ref.c
+++ b/third_party/aom/examples/aom_cx_set_ref.c
@@ -51,12 +51,14 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "aom/aomcx.h"
 #include "aom/aom_decoder.h"
 #include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
 #include "examples/encoder_util.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+
+#define AOM_BORDER_IN_PIXELS 288
 
 static const char *exec_name;
 
@@ -71,25 +73,41 @@ void usage_exit() {
 static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder,
                            unsigned int frame_out, int *mismatch_seen) {
   aom_image_t enc_img, dec_img;
-  struct av1_ref_frame ref_enc, ref_dec;
 
   if (*mismatch_seen) return;
 
-  ref_enc.idx = 0;
-  ref_dec.idx = 0;
-  if (aom_codec_control(encoder, AV1_GET_REFERENCE, &ref_enc))
+  /* Get the internal reference frame */
+  if (aom_codec_control(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img))
     die_codec(encoder, "Failed to get encoder reference frame");
-  enc_img = ref_enc.img;
-  if (aom_codec_control(decoder, AV1_GET_REFERENCE, &ref_dec))
+  if (aom_codec_control(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img))
     die_codec(decoder, "Failed to get decoder reference frame");
-  dec_img = ref_dec.img;
+
+  if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
+      (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+    if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      aom_image_t enc_hbd_img;
+      aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
+                    enc_img.d_w, enc_img.d_h, 16);
+      aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img);
+      enc_img = enc_hbd_img;
+    }
+    if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      aom_image_t dec_hbd_img;
+      aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
+                    dec_img.d_w, dec_img.d_h, 16);
+      aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img);
+      dec_img = dec_hbd_img;
+    }
+  }
 
   if (!aom_compare_img(&enc_img, &dec_img)) {
     int y[4], u[4], v[4];
+    if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      aom_find_mismatch_high(&enc_img, &dec_img, y, u, v);
+    } else {
+      aom_find_mismatch(&enc_img, &dec_img, y, u, v);
+    }
 
-    *mismatch_seen = 1;
-
-    aom_find_mismatch(&enc_img, &dec_img, y, u, v);
     printf(
         "Encode/decode mismatch on frame %d at"
         " Y[%d, %d] {%d/%d},"
@@ -97,6 +115,7 @@ static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder,
         " V[%d, %d] {%d/%d}",
         frame_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1],
         v[2], v[3]);
+    *mismatch_seen = 1;
   }
 
   aom_img_free(&enc_img);
@@ -106,13 +125,13 @@ static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder,
 static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img,
                         unsigned int frame_in, AvxVideoWriter *writer,
                         int test_decode, aom_codec_ctx_t *dcodec,
-                        unsigned int *frame_out, int *mismatch_seen) {
+                        unsigned int *frame_out, int *mismatch_seen,
+                        aom_image_t *ext_ref) {
   int got_pkts = 0;
   aom_codec_iter_t iter = NULL;
   const aom_codec_cx_pkt_t *pkt = NULL;
   int got_data;
-  const aom_codec_err_t res =
-      aom_codec_encode(ecodec, img, frame_in, 1, 0, AOM_DL_GOOD_QUALITY);
+  const aom_codec_err_t res = aom_codec_encode(ecodec, img, frame_in, 1, 0);
   if (res != AOM_CODEC_OK) die_codec(ecodec, "Failed to encode frame");
 
   got_data = 0;
@@ -139,8 +158,13 @@ static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img,
       // Decode 1 frame.
       if (test_decode) {
         if (aom_codec_decode(dcodec, pkt->data.frame.buf,
-                             (unsigned int)pkt->data.frame.sz, NULL, 0))
+                             (unsigned int)pkt->data.frame.sz, NULL))
           die_codec(dcodec, "Failed to decode frame.");
+
+        // Copy out first decoded frame, and use it as reference later.
+        if (*frame_out == 1 && ext_ref != NULL)
+          if (aom_codec_control(dcodec, AV1_GET_NEW_FRAME_IMAGE, ext_ref))
+            die_codec(dcodec, "Failed to get decoder new frame");
       }
     }
   }
@@ -160,10 +184,16 @@ int main(int argc, char **argv) {
   aom_codec_enc_cfg_t cfg;
   unsigned int frame_in = 0;
   aom_image_t raw;
+  aom_image_t raw_shift;
+  aom_image_t ext_ref;
   aom_codec_err_t res;
   AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
   const AvxInterface *encoder = NULL;
+  int flags = 0;
+  int allocated_raw_shift = 0;
+  aom_img_fmt_t raw_fmt = AOM_IMG_FMT_I420;
+  aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
 
   // Test encoder/decoder mismatch.
   int test_decode = 1;
@@ -225,13 +255,21 @@ int main(int argc, char **argv) {
   info.time_base.numerator = 1;
   info.time_base.denominator = fps;
 
-  if (info.frame_width <= 0 || info.frame_height <= 0 ||
-      (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+  if (info.frame_width <= 0 || info.frame_height <= 0) {
     die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
   }
 
-  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width,
-                     info.frame_height, 1)) {
+  // In this test, the bit depth of input video is 8-bit, and the input format
+  // is AOM_IMG_FMT_I420.
+  if (!aom_img_alloc(&raw, raw_fmt, info.frame_width, info.frame_height, 32)) {
+    die("Failed to allocate image.");
+  }
+
+  if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  // Allocate memory with the border so that it can be used as a reference.
+  if (!aom_img_alloc_with_border(&ext_ref, ref_fmt, info.frame_width,
+                                 info.frame_height, 32, 8,
+                                 AOM_BORDER_IN_PIXELS)) {
     die("Failed to allocate image.");
   }
 
@@ -246,6 +284,11 @@ int main(int argc, char **argv) {
   cfg.g_timebase.den = info.time_base.denominator;
   cfg.rc_target_bitrate = bitrate;
   cfg.g_lag_in_frames = 3;
+  cfg.g_bit_depth = AOM_BITS_8;
+
+  flags |= (cfg.g_bit_depth > AOM_BITS_8 || !CONFIG_LOWBITDEPTH)
+               ? AOM_CODEC_USE_HIGHBITDEPTH
+               : 0;
 
   writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info);
   if (!writer) die("Failed to open %s for writing.", outfile_arg);
@@ -253,7 +296,7 @@ int main(int argc, char **argv) {
   if (!(infile = fopen(infile_arg, "rb")))
     die("Failed to open %s for reading.", infile_arg);
 
-  if (aom_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, 0))
+  if (aom_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, flags))
     die_codec(&ecodec, "Failed to initialize encoder");
 
   // Disable alt_ref.
@@ -269,25 +312,43 @@ int main(int argc, char **argv) {
   // Encode frames.
   while (aom_img_read(&raw, infile)) {
     if (limit && frame_in >= limit) break;
+    aom_image_t *frame_to_encode;
+
+    if (!CONFIG_LOWBITDEPTH) {
+      // Need to allocate larger buffer to use hbd internal.
+      int input_shift = 0;
+      if (!allocated_raw_shift) {
+        aom_img_alloc(&raw_shift, raw_fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                      info.frame_width, info.frame_height, 32);
+        allocated_raw_shift = 1;
+      }
+      aom_img_upshift(&raw_shift, &raw, input_shift);
+      frame_to_encode = &raw_shift;
+    } else {
+      frame_to_encode = &raw;
+    }
+
     if (update_frame_num > 1 && frame_out + 1 == update_frame_num) {
       av1_ref_frame_t ref;
       ref.idx = 0;
-      ref.img = raw;
+      ref.use_external_ref = 0;
+      ref.img = ext_ref;
       // Set reference frame in encoder.
       if (aom_codec_control(&ecodec, AV1_SET_REFERENCE, &ref))
-        die_codec(&ecodec, "Failed to set reference frame");
+        die_codec(&ecodec, "Failed to set encoder reference frame");
       printf(" <SET_REF>");
 
       // If set_reference in decoder is commented out, the enc/dec mismatch
       // would be seen.
       if (test_decode) {
+        ref.use_external_ref = 1;
         if (aom_codec_control(&dcodec, AV1_SET_REFERENCE, &ref))
-          die_codec(&dcodec, "Failed to set reference frame");
+          die_codec(&dcodec, "Failed to set decoder reference frame");
       }
     }
 
-    encode_frame(&ecodec, &raw, frame_in, writer, test_decode, &dcodec,
-                 &frame_out, &mismatch_seen);
+    encode_frame(&ecodec, frame_to_encode, frame_in, writer, test_decode,
+                 &dcodec, &frame_out, &mismatch_seen, &ext_ref);
     frame_in++;
     if (mismatch_seen) break;
   }
@@ -295,7 +356,7 @@ int main(int argc, char **argv) {
   // Flush encoder.
   if (!mismatch_seen)
     while (encode_frame(&ecodec, NULL, frame_in, writer, test_decode, &dcodec,
-                        &frame_out, &mismatch_seen)) {
+                        &frame_out, &mismatch_seen, NULL)) {
     }
 
   printf("\n");
@@ -313,6 +374,8 @@ int main(int argc, char **argv) {
     if (aom_codec_destroy(&dcodec))
       die_codec(&dcodec, "Failed to destroy decoder");
 
+  if (allocated_raw_shift) aom_img_free(&raw_shift);
+  aom_img_free(&ext_ref);
   aom_img_free(&raw);
   if (aom_codec_destroy(&ecodec))
     die_codec(&ecodec, "Failed to destroy encoder.");
diff --git a/third_party/aom/examples/decode_to_md5.c b/third_party/aom/examples/decode_to_md5.c
index 5ab253209..bc127b78d 100644
--- a/third_party/aom/examples/decode_to_md5.c
+++ b/third_party/aom/examples/decode_to_md5.c
@@ -34,13 +34,11 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "aom/aomdx.h"
 #include "aom/aom_decoder.h"
-
-#include "../md5_utils.h"
-#include "../tools_common.h"
-#include "../video_reader.h"
-#include "./aom_config.h"
+#include "aom/aomdx.h"
+#include "common/md5_utils.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
 
 static void get_image_md5(const aom_image_t *img, unsigned char digest[16]) {
   int plane, y;
@@ -110,7 +108,7 @@ int main(int argc, char **argv) {
     size_t frame_size = 0;
     const unsigned char *frame =
         aom_video_reader_get_frame(reader, &frame_size);
-    if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
       die_codec(&codec, "Failed to decode frame");
 
     while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
diff --git a/third_party/aom/examples/decode_with_drops.c b/third_party/aom/examples/decode_with_drops.c
index 45e0fb027..214401958 100644
--- a/third_party/aom/examples/decode_with_drops.c
+++ b/third_party/aom/examples/decode_with_drops.c
@@ -57,12 +57,10 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "aom/aomdx.h"
 #include "aom/aom_decoder.h"
-
-#include "../tools_common.h"
-#include "../video_reader.h"
-#include "./aom_config.h"
+#include "aom/aomdx.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
 
 static const char *exec_name;
 
@@ -116,9 +114,6 @@ int main(int argc, char **argv) {
     int skip;
     const unsigned char *frame =
         aom_video_reader_get_frame(reader, &frame_size);
-    if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
-      die_codec(&codec, "Failed to decode frame.");
-
     ++frame_cnt;
 
     skip = (is_range && frame_cnt >= n && frame_cnt <= m) ||
@@ -126,6 +121,8 @@ int main(int argc, char **argv) {
 
     if (!skip) {
       putc('.', stdout);
+      if (aom_codec_decode(&codec, frame, frame_size, NULL))
+        die_codec(&codec, "Failed to decode frame.");
 
       while ((img = aom_codec_get_frame(&codec, &iter)) != NULL)
         aom_img_write(img, outfile);
diff --git a/third_party/aom/examples/encoder_util.c b/third_party/aom/examples/encoder_util.c
index 1aa3a7eef..e43b37250 100644
--- a/third_party/aom/examples/encoder_util.c
+++ b/third_party/aom/examples/encoder_util.c
@@ -11,10 +11,11 @@
 
 // Utility functions used by encoder binaries.
 
+#include "examples/encoder_util.h"
+
 #include <assert.h>
 #include <string.h>
 
-#include "./encoder_util.h"
 #include "aom/aom_integer.h"
 
 #define mmin(a, b) ((a) < (b) ? (a) : (b))
@@ -40,6 +41,7 @@ static void find_mismatch_plane(const aom_image_t *const img1,
   assert(img1->x_chroma_shift == img2->x_chroma_shift &&
          img1->y_chroma_shift == img2->y_chroma_shift);
   loc[0] = loc[1] = loc[2] = loc[3] = -1;
+  if (img1->monochrome && img2->monochrome && plane) return;
   int match = 1;
   uint32_t i, j;
   for (i = 0; match && i < c_h; i += bsizey) {
@@ -79,21 +81,16 @@ static void find_mismatch_helper(const aom_image_t *const img1,
                                  const aom_image_t *const img2,
                                  int use_highbitdepth, int yloc[4], int uloc[4],
                                  int vloc[4]) {
-#if !CONFIG_HIGHBITDEPTH
-  assert(!use_highbitdepth);
-#endif  // !CONFIG_HIGHBITDEPTH
   find_mismatch_plane(img1, img2, AOM_PLANE_Y, use_highbitdepth, yloc);
   find_mismatch_plane(img1, img2, AOM_PLANE_U, use_highbitdepth, uloc);
   find_mismatch_plane(img1, img2, AOM_PLANE_V, use_highbitdepth, vloc);
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_find_mismatch_high(const aom_image_t *const img1,
                             const aom_image_t *const img2, int yloc[4],
                             int uloc[4], int vloc[4]) {
   find_mismatch_helper(img1, img2, 1, yloc, uloc, vloc);
 }
-#endif
 
 void aom_find_mismatch(const aom_image_t *const img1,
                        const aom_image_t *const img2, int yloc[4], int uloc[4],
@@ -103,37 +100,37 @@ void aom_find_mismatch(const aom_image_t *const img1,
 
 int aom_compare_img(const aom_image_t *const img1,
                     const aom_image_t *const img2) {
+  assert(img1->cp == img2->cp);
+  assert(img1->tc == img2->tc);
+  assert(img1->mc == img2->mc);
+  assert(img1->monochrome == img2->monochrome);
+
+  int num_planes = img1->monochrome ? 1 : 3;
+
   uint32_t l_w = img1->d_w;
   uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
   const uint32_t c_h =
       (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
-  uint32_t i;
   int match = 1;
 
   match &= (img1->fmt == img2->fmt);
   match &= (img1->d_w == img2->d_w);
   match &= (img1->d_h == img2->d_h);
-#if CONFIG_HIGHBITDEPTH
   if (img1->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
     l_w *= 2;
     c_w *= 2;
   }
-#endif
-
-  for (i = 0; i < img1->d_h; ++i)
-    match &= (memcmp(img1->planes[AOM_PLANE_Y] + i * img1->stride[AOM_PLANE_Y],
-                     img2->planes[AOM_PLANE_Y] + i * img2->stride[AOM_PLANE_Y],
-                     l_w) == 0);
-
-  for (i = 0; i < c_h; ++i)
-    match &= (memcmp(img1->planes[AOM_PLANE_U] + i * img1->stride[AOM_PLANE_U],
-                     img2->planes[AOM_PLANE_U] + i * img2->stride[AOM_PLANE_U],
-                     c_w) == 0);
-
-  for (i = 0; i < c_h; ++i)
-    match &= (memcmp(img1->planes[AOM_PLANE_V] + i * img1->stride[AOM_PLANE_V],
-                     img2->planes[AOM_PLANE_V] + i * img2->stride[AOM_PLANE_V],
-                     c_w) == 0);
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    uint32_t height = plane ? c_h : img1->d_h;
+    uint32_t width = plane ? c_w : l_w;
+
+    for (uint32_t i = 0; i < height; ++i) {
+      match &=
+          (memcmp(img1->planes[plane] + i * img1->stride[plane],
+                  img2->planes[plane] + i * img2->stride[plane], width) == 0);
+    }
+  }
 
   return match;
 }
diff --git a/third_party/aom/examples/encoder_util.h b/third_party/aom/examples/encoder_util.h
index 38deef03d..966f5e004 100644
--- a/third_party/aom/examples/encoder_util.h
+++ b/third_party/aom/examples/encoder_util.h
@@ -14,16 +14,13 @@
 #ifndef EXAMPLES_ENCODER_UTIL_H_
 #define EXAMPLES_ENCODER_UTIL_H_
 
-#include "./aom_config.h"
 #include "aom/aom_image.h"
 
 // Returns mismatch location (?loc[0],?loc[1]) and the values at that location
 // in img1 (?loc[2]) and img2 (?loc[3]).
-#if CONFIG_HIGHBITDEPTH
 void aom_find_mismatch_high(const aom_image_t *const img1,
                             const aom_image_t *const img2, int yloc[4],
                             int uloc[4], int vloc[4]);
-#endif  // CONFIG_HIGHBITDEPTH
 
 void aom_find_mismatch(const aom_image_t *const img1,
                        const aom_image_t *const img2, int yloc[4], int uloc[4],
diff --git a/third_party/aom/examples/inspect.c b/third_party/aom/examples/inspect.c
index 74e770b84..4887fc4a3 100644
--- a/third_party/aom/examples/inspect.c
+++ b/third_party/aom/examples/inspect.c
@@ -19,27 +19,27 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./args.h"
 #ifdef __EMSCRIPTEN__
 #include <emscripten.h>
 #else
 #define EMSCRIPTEN_KEEPALIVE
 #endif
 
+#include "config/aom_config.h"
+
 #include "aom/aom_decoder.h"
-#include "./aom_config.h"
-#if CONFIG_ACCOUNTING
-#include "../av1/decoder/accounting.h"
-#endif
-#include "../av1/decoder/inspection.h"
 #include "aom/aomdx.h"
+#include "av1/common/onyxc_int.h"
 
-#include "../tools_common.h"
-#include "../video_reader.h"
-// #include "av1/av1_dx_iface.c"
-#include "../av1/common/onyxc_int.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
 
-#include "../video_common.h"
+#include "av1/decoder/inspection.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+#include "common/video_common.h"
+#include "common/video_reader.h"
 
 // Max JSON buffer size.
 const int MAX_BUFFER = 1024 * 1024 * 32;
@@ -57,7 +57,10 @@ typedef enum {
   MOTION_VECTORS_LAYER = 1 << 9,
   UV_MODE_LAYER = 1 << 10,
   CFL_LAYER = 1 << 11,
-  ALL_LAYERS = (1 << 12) - 1
+  DUAL_FILTER_LAYER = 1 << 12,
+  Q_INDEX_LAYER = 1 << 13,
+  SEGMENT_ID_LAYER = 1 << 14,
+  ALL_LAYERS = (1 << 15) - 1
 } LayerType;
 
 static LayerType layers = 0;
@@ -87,12 +90,16 @@ static const arg_def_t dump_skip_arg = ARG_DEF("s", "skip", 0, "Dump Skip");
 static const arg_def_t dump_filter_arg =
     ARG_DEF("f", "filter", 0, "Dump Filter");
 static const arg_def_t dump_cdef_arg = ARG_DEF("c", "cdef", 0, "Dump CDEF");
-#if CONFIG_CFL
 static const arg_def_t dump_cfl_arg =
     ARG_DEF("cfl", "chroma_from_luma", 0, "Dump Chroma from Luma Alphas");
-#endif
+static const arg_def_t dump_dual_filter_type_arg =
+    ARG_DEF("df", "dualFilterType", 0, "Dump Dual Filter Type");
 static const arg_def_t dump_reference_frame_arg =
     ARG_DEF("r", "referenceFrame", 0, "Dump Reference Frame");
+static const arg_def_t dump_delta_q_arg =
+    ARG_DEF("dq", "delta_q", 0, "Dump QIndex");
+static const arg_def_t dump_seg_id_arg =
+    ARG_DEF("si", "seg_id", 0, "Dump Segment ID");
 static const arg_def_t usage_arg = ARG_DEF("h", "help", 0, "Help");
 
 static const arg_def_t *main_args[] = { &limit_arg,
@@ -108,14 +115,13 @@ static const arg_def_t *main_args[] = { &limit_arg,
                                         &dump_uv_mode_arg,
                                         &dump_skip_arg,
                                         &dump_filter_arg,
-#if CONFIG_CDEF
                                         &dump_cdef_arg,
-#endif
-#if CONFIG_CFL
+                                        &dump_dual_filter_type_arg,
                                         &dump_cfl_arg,
-#endif
                                         &dump_reference_frame_arg,
                                         &dump_motion_vectors_arg,
+                                        &dump_delta_q_arg,
+                                        &dump_seg_id_arg,
                                         &usage_arg,
                                         NULL };
 #define ENUM(name) \
@@ -127,58 +133,35 @@ typedef struct map_entry {
   int value;
 } map_entry;
 
-const map_entry refs_map[] = { ENUM(INTRA_FRAME),  ENUM(LAST_FRAME),
-#if CONFIG_EXT_REFS
-                               ENUM(LAST2_FRAME),  ENUM(LAST3_FRAME),
-                               ENUM(GOLDEN_FRAME), ENUM(BWDREF_FRAME),
-                               ENUM(ALTREF_FRAME),
-#else
-                               ENUM(GOLDEN_FRAME), ENUM(ALTREF_FRAME),
-#endif
-                               LAST_ENUM };
+const map_entry refs_map[] = {
+  ENUM(INTRA_FRAME),   ENUM(LAST_FRAME),   ENUM(LAST2_FRAME),
+  ENUM(LAST3_FRAME),   ENUM(GOLDEN_FRAME), ENUM(BWDREF_FRAME),
+  ENUM(ALTREF2_FRAME), ENUM(ALTREF_FRAME), LAST_ENUM
+};
 
 const map_entry block_size_map[] = {
-#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
-  ENUM(BLOCK_2X2),    ENUM(BLOCK_2X4),    ENUM(BLOCK_4X2),
-#endif
-  ENUM(BLOCK_4X4),    ENUM(BLOCK_4X8),    ENUM(BLOCK_8X4),
-  ENUM(BLOCK_8X8),    ENUM(BLOCK_8X16),   ENUM(BLOCK_16X8),
-  ENUM(BLOCK_16X16),  ENUM(BLOCK_16X32),  ENUM(BLOCK_32X16),
-  ENUM(BLOCK_32X32),  ENUM(BLOCK_32X64),  ENUM(BLOCK_64X32),
-  ENUM(BLOCK_64X64),
-#if CONFIG_EXT_PARTITION
-  ENUM(BLOCK_64X128), ENUM(BLOCK_128X64), ENUM(BLOCK_128X128),
-#endif
-  ENUM(BLOCK_4X16),   ENUM(BLOCK_16X4),   ENUM(BLOCK_8X32),
-  ENUM(BLOCK_32X8),   ENUM(BLOCK_16X64),  ENUM(BLOCK_64X16),
-#if CONFIG_EXT_PARTITION
-  ENUM(BLOCK_32X128), ENUM(BLOCK_128X32),
-#endif
-  LAST_ENUM
+  ENUM(BLOCK_4X4),     ENUM(BLOCK_4X8),    ENUM(BLOCK_8X4),
+  ENUM(BLOCK_8X8),     ENUM(BLOCK_8X16),   ENUM(BLOCK_16X8),
+  ENUM(BLOCK_16X16),   ENUM(BLOCK_16X32),  ENUM(BLOCK_32X16),
+  ENUM(BLOCK_32X32),   ENUM(BLOCK_32X64),  ENUM(BLOCK_64X32),
+  ENUM(BLOCK_64X64),   ENUM(BLOCK_64X128), ENUM(BLOCK_128X64),
+  ENUM(BLOCK_128X128), ENUM(BLOCK_4X16),   ENUM(BLOCK_16X4),
+  ENUM(BLOCK_8X32),    ENUM(BLOCK_32X8),   ENUM(BLOCK_16X64),
+  ENUM(BLOCK_64X16),   LAST_ENUM
 };
 
 const map_entry tx_size_map[] = {
-#if CONFIG_CHROMA_2X2
-  ENUM(TX_2X2),
-#endif
   ENUM(TX_4X4),   ENUM(TX_8X8),   ENUM(TX_16X16), ENUM(TX_32X32),
-#if CONFIG_TX64X64
-  ENUM(TX_64X64),
-#endif
-  ENUM(TX_4X8),   ENUM(TX_8X4),   ENUM(TX_8X16),  ENUM(TX_16X8),
-  ENUM(TX_16X32), ENUM(TX_32X16),
-#if CONFIG_TX64X64
-  ENUM(TX_32X64), ENUM(TX_64X32),
-#endif  // CONFIG_TX64X64
-  ENUM(TX_4X16),  ENUM(TX_16X4),  ENUM(TX_8X32),  ENUM(TX_32X8),
-  LAST_ENUM
+  ENUM(TX_64X64), ENUM(TX_4X8),   ENUM(TX_8X4),   ENUM(TX_8X16),
+  ENUM(TX_16X8),  ENUM(TX_16X32), ENUM(TX_32X16), ENUM(TX_32X64),
+  ENUM(TX_64X32), ENUM(TX_4X16),  ENUM(TX_16X4),  ENUM(TX_8X32),
+  ENUM(TX_32X8),  LAST_ENUM
 };
 
 const map_entry tx_type_map[] = { ENUM(DCT_DCT),
                                   ENUM(ADST_DCT),
                                   ENUM(DCT_ADST),
                                   ENUM(ADST_ADST),
-#if CONFIG_EXT_TX
                                   ENUM(FLIPADST_DCT),
                                   ENUM(DCT_FLIPADST),
                                   ENUM(FLIPADST_FLIPADST),
@@ -191,43 +174,35 @@ const map_entry tx_type_map[] = { ENUM(DCT_DCT),
                                   ENUM(H_ADST),
                                   ENUM(V_FLIPADST),
                                   ENUM(H_FLIPADST),
-#endif
                                   LAST_ENUM };
+const map_entry dual_filter_map[] = { ENUM(REG_REG),       ENUM(REG_SMOOTH),
+                                      ENUM(REG_SHARP),     ENUM(SMOOTH_REG),
+                                      ENUM(SMOOTH_SMOOTH), ENUM(SMOOTH_SHARP),
+                                      ENUM(SHARP_REG),     ENUM(SHARP_SMOOTH),
+                                      ENUM(SHARP_SHARP),   LAST_ENUM };
 
 const map_entry prediction_mode_map[] = {
-  ENUM(DC_PRED),       ENUM(V_PRED),        ENUM(H_PRED),
-  ENUM(D45_PRED),      ENUM(D135_PRED),     ENUM(D117_PRED),
-  ENUM(D153_PRED),     ENUM(D207_PRED),     ENUM(D63_PRED),
-  ENUM(SMOOTH_PRED),
-#if CONFIG_SMOOTH_HV
-  ENUM(SMOOTH_V_PRED), ENUM(SMOOTH_H_PRED),
-#endif  // CONFIG_SMOOTH_HV
-  ENUM(TM_PRED),       ENUM(NEARESTMV),     ENUM(NEARMV),
-  ENUM(ZEROMV),        ENUM(NEWMV),         ENUM(NEAREST_NEARESTMV),
-  ENUM(NEAR_NEARMV),   ENUM(NEAREST_NEWMV), ENUM(NEW_NEARESTMV),
-  ENUM(NEAR_NEWMV),    ENUM(NEW_NEARMV),    ENUM(ZERO_ZEROMV),
-  ENUM(NEW_NEWMV),     ENUM(INTRA_INVALID), LAST_ENUM
+  ENUM(DC_PRED),     ENUM(V_PRED),        ENUM(H_PRED),
+  ENUM(D45_PRED),    ENUM(D135_PRED),     ENUM(D113_PRED),
+  ENUM(D157_PRED),   ENUM(D203_PRED),     ENUM(D67_PRED),
+  ENUM(SMOOTH_PRED), ENUM(SMOOTH_V_PRED), ENUM(SMOOTH_H_PRED),
+  ENUM(PAETH_PRED),  ENUM(NEARESTMV),     ENUM(NEARMV),
+  ENUM(GLOBALMV),    ENUM(NEWMV),         ENUM(NEAREST_NEARESTMV),
+  ENUM(NEAR_NEARMV), ENUM(NEAREST_NEWMV), ENUM(NEW_NEARESTMV),
+  ENUM(NEAR_NEWMV),  ENUM(NEW_NEARMV),    ENUM(GLOBAL_GLOBALMV),
+  ENUM(NEW_NEWMV),   ENUM(INTRA_INVALID), LAST_ENUM
 };
 
-#if CONFIG_CFL
 const map_entry uv_prediction_mode_map[] = {
   ENUM(UV_DC_PRED),       ENUM(UV_V_PRED),
   ENUM(UV_H_PRED),        ENUM(UV_D45_PRED),
-  ENUM(UV_D135_PRED),     ENUM(UV_D117_PRED),
-  ENUM(UV_D153_PRED),     ENUM(UV_D207_PRED),
-  ENUM(UV_D63_PRED),      ENUM(UV_SMOOTH_PRED),
-#if CONFIG_SMOOTH_HV
+  ENUM(UV_D135_PRED),     ENUM(UV_D113_PRED),
+  ENUM(UV_D157_PRED),     ENUM(UV_D203_PRED),
+  ENUM(UV_D67_PRED),      ENUM(UV_SMOOTH_PRED),
   ENUM(UV_SMOOTH_V_PRED), ENUM(UV_SMOOTH_H_PRED),
-#endif  // CONFIG_SMOOTH_HV
-  ENUM(UV_TM_PRED),
-#if CONFIG_CFL
-  ENUM(UV_CFL_PRED),
-#endif
+  ENUM(UV_PAETH_PRED),    ENUM(UV_CFL_PRED),
   ENUM(UV_MODE_INVALID),  LAST_ENUM
 };
-#else
-#define uv_prediction_mode_map prediction_mode_map
-#endif
 #define NO_SKIP 0
 #define SKIP 1
 
@@ -327,7 +302,7 @@ int put_map(char *buffer, const map_entry *map) {
       *(buf++) = ',';
     }
   }
-  return buf - buffer;
+  return (int)(buf - buffer);
 }
 
 int put_reference_frame(char *buffer) {
@@ -365,7 +340,7 @@ int put_reference_frame(char *buffer) {
     if (r < mi_rows - 1) *(buf++) = ',';
   }
   buf += put_str(buf, "],\n");
-  return buf - buffer;
+  return (int)(buf - buffer);
 }
 
 int put_motion_vectors(char *buffer) {
@@ -404,15 +379,19 @@ int put_motion_vectors(char *buffer) {
     if (r < mi_rows - 1) *(buf++) = ',';
   }
   buf += put_str(buf, "],\n");
-  return buf - buffer;
+  return (int)(buf - buffer);
 }
 
 int put_block_info(char *buffer, const map_entry *map, const char *name,
-                   size_t offset) {
+                   size_t offset, int len) {
   const int mi_rows = frame_data.mi_rows;
   const int mi_cols = frame_data.mi_cols;
   char *buf = buffer;
-  int r, c, t, v;
+  int r, c, t, i;
+  if (compress && len == 1) {
+    die("Can't encode scalars as arrays when RLE compression is enabled.");
+    return -1;
+  }
   if (map) {
     buf += snprintf(buf, MAX_BUFFER, "  \"%sMap\": {", name);
     buf += put_map(buf, map);
@@ -422,13 +401,36 @@ int put_block_info(char *buffer, const map_entry *map, const char *name,
   for (r = 0; r < mi_rows; ++r) {
     *(buf++) = '[';
     for (c = 0; c < mi_cols; ++c) {
-      insp_mi_data *curr_mi = &frame_data.mi_grid[r * mi_cols + c];
-      v = *(((int8_t *)curr_mi) + offset);
-      buf += put_num(buf, 0, v, 0);
+      insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c];
+      int16_t *v = (int16_t *)(((int8_t *)mi) + offset);
+      if (len == 0) {
+        buf += put_num(buf, 0, v[0], 0);
+      } else {
+        buf += put_str(buf, "[");
+        for (i = 0; i < len; i++) {
+          buf += put_num(buf, 0, v[i], 0);
+          if (i < len - 1) {
+            buf += put_str(buf, ",");
+          }
+        }
+        buf += put_str(buf, "]");
+      }
       if (compress) {  // RLE
         for (t = c + 1; t < mi_cols; ++t) {
           insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t];
-          if (v != *(((int8_t *)next_mi) + offset)) {
+          int16_t *nv = (int16_t *)(((int8_t *)next_mi) + offset);
+          int same = 0;
+          if (len == 0) {
+            same = v[0] == nv[0];
+          } else {
+            for (i = 0; i < len; i++) {
+              same = v[i] == nv[i];
+              if (!same) {
+                break;
+              }
+            }
+          }
+          if (!same) {
             break;
           }
         }
@@ -444,7 +446,7 @@ int put_block_info(char *buffer, const map_entry *map, const char *name,
     if (r < mi_rows - 1) *(buf++) = ',';
   }
   buf += put_str(buf, "],\n");
-  return buf - buffer;
+  return (int)(buf - buffer);
 }
 
 #if CONFIG_ACCOUNTING
@@ -484,7 +486,7 @@ int put_accounting(char *buffer) {
     if (i < num_syms - 1) *(buf++) = ',';
   }
   buf += put_str(buf, "],\n");
-  return buf - buffer;
+  return (int)(buf - buffer);
 }
 #endif
 
@@ -499,51 +501,62 @@ void inspect(void *pbi, void *data) {
   buf += put_str(buf, "{\n");
   if (layers & BLOCK_SIZE_LAYER) {
     buf += put_block_info(buf, block_size_map, "blockSize",
-                          offsetof(insp_mi_data, sb_type));
+                          offsetof(insp_mi_data, sb_type), 0);
   }
   if (layers & TRANSFORM_SIZE_LAYER) {
     buf += put_block_info(buf, tx_size_map, "transformSize",
-                          offsetof(insp_mi_data, tx_size));
+                          offsetof(insp_mi_data, tx_size), 0);
   }
   if (layers & TRANSFORM_TYPE_LAYER) {
     buf += put_block_info(buf, tx_type_map, "transformType",
-                          offsetof(insp_mi_data, tx_type));
+                          offsetof(insp_mi_data, tx_type), 0);
+  }
+  if (layers & DUAL_FILTER_LAYER) {
+    buf += put_block_info(buf, dual_filter_map, "dualFilterType",
+                          offsetof(insp_mi_data, dual_filter_type), 0);
   }
   if (layers & MODE_LAYER) {
     buf += put_block_info(buf, prediction_mode_map, "mode",
-                          offsetof(insp_mi_data, mode));
+                          offsetof(insp_mi_data, mode), 0);
   }
   if (layers & UV_MODE_LAYER) {
     buf += put_block_info(buf, uv_prediction_mode_map, "uv_mode",
-                          offsetof(insp_mi_data, uv_mode));
+                          offsetof(insp_mi_data, uv_mode), 0);
   }
   if (layers & SKIP_LAYER) {
-    buf += put_block_info(buf, skip_map, "skip", offsetof(insp_mi_data, skip));
+    buf +=
+        put_block_info(buf, skip_map, "skip", offsetof(insp_mi_data, skip), 0);
   }
   if (layers & FILTER_LAYER) {
-    buf += put_block_info(buf, NULL, "filter", offsetof(insp_mi_data, filter));
+    buf +=
+        put_block_info(buf, NULL, "filter", offsetof(insp_mi_data, filter), 2);
   }
-#if CONFIG_CDEF
   if (layers & CDEF_LAYER) {
     buf += put_block_info(buf, NULL, "cdef_level",
-                          offsetof(insp_mi_data, cdef_level));
+                          offsetof(insp_mi_data, cdef_level), 0);
     buf += put_block_info(buf, NULL, "cdef_strength",
-                          offsetof(insp_mi_data, cdef_strength));
+                          offsetof(insp_mi_data, cdef_strength), 0);
   }
-#endif
-#if CONFIG_CFL
   if (layers & CFL_LAYER) {
     buf += put_block_info(buf, NULL, "cfl_alpha_idx",
-                          offsetof(insp_mi_data, cfl_alpha_idx));
+                          offsetof(insp_mi_data, cfl_alpha_idx), 0);
     buf += put_block_info(buf, NULL, "cfl_alpha_sign",
-                          offsetof(insp_mi_data, cfl_alpha_sign));
+                          offsetof(insp_mi_data, cfl_alpha_sign), 0);
+  }
+  if (layers & Q_INDEX_LAYER) {
+    buf += put_block_info(buf, NULL, "delta_q",
+                          offsetof(insp_mi_data, current_qindex), 0);
+  }
+  if (layers & SEGMENT_ID_LAYER) {
+    buf += put_block_info(buf, NULL, "seg_id",
+                          offsetof(insp_mi_data, segment_id), 0);
   }
-#endif
   if (layers & MOTION_VECTORS_LAYER) {
     buf += put_motion_vectors(buf);
   }
   if (layers & REFERENCE_FRAME_LAYER) {
-    buf += put_reference_frame(buf);
+    buf += put_block_info(buf, refs_map, "referenceFrame",
+                          offsetof(insp_mi_data, ref_frame), 2);
   }
 #if CONFIG_ACCOUNTING
   if (layers & ACCOUNTING_LAYER) {
@@ -561,6 +574,10 @@ void inspect(void *pbi, void *data) {
                   frame_data.tile_mi_cols);
   buf += snprintf(buf, MAX_BUFFER, "  \"tileRows\": %d,\n",
                   frame_data.tile_mi_rows);
+  buf += snprintf(buf, MAX_BUFFER, "  \"deltaQPresentFlag\": %d,\n",
+                  frame_data.delta_q_present_flag);
+  buf += snprintf(buf, MAX_BUFFER, "  \"deltaQRes\": %d,\n",
+                  frame_data.delta_q_res);
   buf += put_str(buf, "  \"config\": {");
   buf += put_map(buf, config_map);
   buf += put_str(buf, "},\n");
@@ -608,15 +625,18 @@ int read_frame() {
   aom_codec_iter_t iter = NULL;
   size_t frame_size = 0;
   const unsigned char *frame = aom_video_reader_get_frame(reader, &frame_size);
-  if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0) !=
+  if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, NULL) !=
       AOM_CODEC_OK) {
     die_codec(&codec, "Failed to decode frame.");
   }
-  img = aom_codec_get_frame(&codec, &iter);
-  if (img == NULL) {
+  int got_any_frames = 0;
+  while ((img = aom_codec_get_frame(&codec, &iter))) {
+    ++frame_count;
+    got_any_frames = 1;
+  }
+  if (!got_any_frames) {
     return EXIT_FAILURE;
   }
-  ++frame_count;
   return EXIT_SUCCESS;
 }
 
@@ -674,18 +694,20 @@ static void parse_args(char **argv) {
       layers |= SKIP_LAYER;
     else if (arg_match(&arg, &dump_filter_arg, argi))
       layers |= FILTER_LAYER;
-#if CONFIG_CDEF
     else if (arg_match(&arg, &dump_cdef_arg, argi))
       layers |= CDEF_LAYER;
-#endif
-#if CONFIG_CFL
     else if (arg_match(&arg, &dump_cfl_arg, argi))
       layers |= CFL_LAYER;
-#endif
     else if (arg_match(&arg, &dump_reference_frame_arg, argi))
       layers |= REFERENCE_FRAME_LAYER;
     else if (arg_match(&arg, &dump_motion_vectors_arg, argi))
       layers |= MOTION_VECTORS_LAYER;
+    else if (arg_match(&arg, &dump_dual_filter_type_arg, argi))
+      layers |= DUAL_FILTER_LAYER;
+    else if (arg_match(&arg, &dump_delta_q_arg, argi))
+      layers |= Q_INDEX_LAYER;
+    else if (arg_match(&arg, &dump_seg_id_arg, argi))
+      layers |= SEGMENT_ID_LAYER;
     else if (arg_match(&arg, &dump_all_arg, argi))
       layers |= ALL_LAYERS;
     else if (arg_match(&arg, &compress_arg, argi))
diff --git a/third_party/aom/examples/lightfield_bitstream_parsing.c b/third_party/aom/examples/lightfield_bitstream_parsing.c
new file mode 100644
index 000000000..d13f3f172
--- /dev/null
+++ b/third_party/aom/examples/lightfield_bitstream_parsing.c
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Bitstream Parsing
+// ============================
+//
+// This is an lightfield bitstream parsing example. It takes an input file
+// containing the whole compressed lightfield bitstream(ivf file), and parses it
+// and constructs and outputs a new bitstream that can be decoded by an AV1
+// decoder. The output bitstream contains tile list OBUs. The lf_width and
+// lf_height arguments are the number of lightfield images in each dimension.
+// The lf_blocksize determines the number of reference images used.
+// After running the lightfield encoder, run lightfield bitstream parsing:
+// examples/lightfield_bitstream_parsing vase10x10.ivf vase_tile_list.ivf 10 10
+// 5
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_integer.h"
+#include "aom/aomdx.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(
+      stderr,
+      "Usage: %s <infile> <outfile> <lf_width> <lf_height> <lf_blocksize> \n",
+      exec_name);
+  exit(EXIT_FAILURE);
+}
+
+#define ALIGN_POWER_OF_TWO(value, n) \
+  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
+
+// SB size: 64x64
+const uint8_t output_frame_width_in_tiles_minus_1 = 512 / 64 - 1;
+const uint8_t output_frame_height_in_tiles_minus_1 = 512 / 64 - 1;
+
+// Spec:
+// typedef struct {
+//   uint8_t anchor_frame_idx;
+//   uint8_t tile_row;
+//   uint8_t tile_col;
+//   uint16_t coded_tile_data_size_minus_1;
+//   uint8_t *coded_tile_data;
+// } TILE_LIST_ENTRY;
+
+// Tile list entry provided by the application
+typedef struct {
+  int image_idx;
+  int reference_idx;
+  int tile_col;
+  int tile_row;
+} TILE_LIST_INFO;
+
+// M references: 0 - M-1; N images(including references): 0 - N-1;
+// Note: order the image index incrementally, so that we only go through the
+// bitstream once to construct the tile list.
+const int num_tile_lists = 2;
+const uint16_t tile_count_minus_1 = 9 - 1;
+const TILE_LIST_INFO tile_list[2][9] = {
+  { { 16, 0, 4, 5 },
+    { 83, 3, 13, 2 },
+    { 57, 2, 2, 6 },
+    { 31, 1, 11, 5 },
+    { 2, 0, 7, 4 },
+    { 77, 3, 9, 9 },
+    { 49, 1, 0, 1 },
+    { 6, 0, 3, 10 },
+    { 63, 2, 5, 8 } },
+  { { 65, 2, 11, 1 },
+    { 42, 1, 3, 7 },
+    { 88, 3, 8, 4 },
+    { 76, 3, 1, 15 },
+    { 1, 0, 2, 2 },
+    { 19, 0, 5, 6 },
+    { 60, 2, 4, 0 },
+    { 25, 1, 11, 15 },
+    { 50, 2, 5, 4 } },
+};
+
+int main(int argc, char **argv) {
+  aom_codec_ctx_t codec;
+  AvxVideoReader *reader = NULL;
+  AvxVideoWriter *writer = NULL;
+  const AvxInterface *decoder = NULL;
+  const AvxVideoInfo *info = NULL;
+  const char *lf_width_arg;
+  const char *lf_height_arg;
+  const char *lf_blocksize_arg;
+  int width, height;
+  int lf_width, lf_height;
+  int lf_blocksize;
+  int u_blocks, v_blocks;
+  int n, i;
+  aom_codec_pts_t pts;
+
+  exec_name = argv[0];
+  if (argc != 6) die("Invalid number of arguments.");
+
+  reader = aom_video_reader_open(argv[1]);
+  if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+  lf_width_arg = argv[3];
+  lf_height_arg = argv[4];
+  lf_blocksize_arg = argv[5];
+
+  lf_width = (int)strtol(lf_width_arg, NULL, 0);
+  lf_height = (int)strtol(lf_height_arg, NULL, 0);
+  lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0);
+
+  info = aom_video_reader_get_info(reader);
+  width = info->frame_width;
+  height = info->frame_height;
+
+  // The writer to write out ivf file in tile list OBU, which can be decoded by
+  // AV1 decoder.
+  writer = aom_video_writer_open(argv[2], kContainerIVF, info);
+  if (!writer) die("Failed to open %s for writing", argv[2]);
+
+  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  if (!decoder) die("Unknown input codec.");
+  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+
+  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+    die_codec(&codec, "Failed to initialize decoder.");
+
+  // Decode anchor frames.
+  aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
+
+  // How many anchor frames we have.
+  u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
+  v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
+
+  int num_references = v_blocks * u_blocks;
+  for (i = 0; i < num_references; ++i) {
+    aom_video_reader_read_frame(reader);
+
+    size_t frame_size = 0;
+    const unsigned char *frame =
+        aom_video_reader_get_frame(reader, &frame_size);
+    pts = (aom_codec_pts_t)aom_video_reader_get_frame_pts(reader);
+
+    // Copy references bitstream directly.
+    if (!aom_video_writer_write_frame(writer, frame, frame_size, pts))
+      die_codec(&codec, "Failed to copy compressed anchor frame.");
+
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
+      die_codec(&codec, "Failed to decode frame.");
+  }
+
+  // Decode camera frames.
+  aom_codec_control_(&codec, AV1_SET_TILE_MODE, 1);
+  aom_codec_control_(&codec, AV1D_EXT_TILE_DEBUG, 1);
+
+  FILE *infile = aom_video_reader_get_file(reader);
+  // Record the offset of the first camera image.
+  const FileOffset camera_frame_pos = ftello(infile);
+
+  // Read out the first camera frame.
+  aom_video_reader_read_frame(reader);
+
+  // Copy first camera frame for getting camera frame header. This is done
+  // only once.
+  {
+    size_t frame_size = 0;
+    const unsigned char *frame =
+        aom_video_reader_get_frame(reader, &frame_size);
+    pts = (aom_codec_pts_t)aom_video_reader_get_frame_pts(reader);
+    aom_tile_data frame_header_info = { 0, NULL, 0 };
+
+    // Need to decode frame header to get camera frame header info. So, here
+    // decoding 1 tile is enough.
+    aom_codec_control_(&codec, AV1_SET_DECODE_TILE_ROW, 0);
+    aom_codec_control_(&codec, AV1_SET_DECODE_TILE_COL, 0);
+
+    aom_codec_err_t aom_status =
+        aom_codec_decode(&codec, frame, frame_size, NULL);
+    if (aom_status) die_codec(&codec, "Failed to decode tile.");
+
+    aom_codec_control_(&codec, AV1D_GET_FRAME_HEADER_INFO, &frame_header_info);
+
+    size_t obu_size_offset =
+        (uint8_t *)frame_header_info.coded_tile_data - frame;
+    size_t length_field_size = frame_header_info.coded_tile_data_size;
+    // Remove ext-tile tile info.
+    uint32_t frame_header_size = (uint32_t)frame_header_info.extra_size - 1;
+    size_t bytes_to_copy =
+        obu_size_offset + length_field_size + frame_header_size;
+
+    unsigned char *frame_hdr_buf = (unsigned char *)malloc(bytes_to_copy);
+    if (frame_hdr_buf == NULL)
+      die_codec(&codec, "Failed to allocate frame header buffer.");
+
+    memcpy(frame_hdr_buf, frame, bytes_to_copy);
+
+    // Update frame header OBU size.
+    size_t bytes_written = 0;
+    if (aom_uleb_encode_fixed_size(
+            frame_header_size, length_field_size, length_field_size,
+            frame_hdr_buf + obu_size_offset, &bytes_written))
+      die_codec(&codec, "Failed to encode the tile list obu size.");
+
+    // Copy camera frame header bitstream.
+    if (!aom_video_writer_write_frame(writer, frame_hdr_buf, bytes_to_copy,
+                                      pts))
+      die_codec(&codec, "Failed to copy compressed camera frame header.");
+  }
+
+  // Allocate a buffer to store tile list bitstream. Image format
+  // AOM_IMG_FMT_I420.
+  size_t data_sz =
+      ALIGN_POWER_OF_TWO(width, 5) * ALIGN_POWER_OF_TWO(height, 5) * 12 / 8;
+  unsigned char *tl_buf = (unsigned char *)malloc(data_sz);
+  if (tl_buf == NULL) die_codec(&codec, "Failed to allocate tile list buffer.");
+
+  aom_codec_pts_t tl_pts = pts;
+
+  // Process 1 tile list.
+  for (n = 0; n < num_tile_lists; n++) {
+    unsigned char *tl = tl_buf;
+    struct aom_write_bit_buffer wb = { tl, 0 };
+    unsigned char *saved_obu_size_loc = NULL;
+    uint32_t tile_list_obu_header_size = 0;
+    uint32_t tile_list_obu_size = 0;
+
+    // Write the tile list OBU header that is 1 byte long.
+    aom_wb_write_literal(&wb, 0, 1);  // forbidden bit.
+    aom_wb_write_literal(&wb, 8, 4);  // tile list OBU: "1000"
+    aom_wb_write_literal(&wb, 0, 1);  // obu_extension = 0
+    aom_wb_write_literal(&wb, 1, 1);  // obu_has_size_field
+    aom_wb_write_literal(&wb, 0, 1);  // reserved
+    tl++;
+    tile_list_obu_header_size++;
+
+    // Write the OBU size using a fixed length_field_size of 4 bytes.
+    saved_obu_size_loc = tl;
+    aom_wb_write_literal(&wb, 0, 32);
+    tl += 4;
+    tile_list_obu_header_size += 4;
+
+    // write_tile_list_obu()
+    aom_wb_write_literal(&wb, output_frame_width_in_tiles_minus_1, 8);
+    aom_wb_write_literal(&wb, output_frame_height_in_tiles_minus_1, 8);
+    aom_wb_write_literal(&wb, tile_count_minus_1, 16);
+    tl += 4;
+    tile_list_obu_size += 4;
+
+    // Write each tile's data
+    for (i = 0; i <= tile_count_minus_1; i++) {
+      aom_tile_data tile_data = { 0, NULL, 0 };
+
+      int image_idx = tile_list[n][i].image_idx;
+      int ref_idx = tile_list[n][i].reference_idx;
+      int tc = tile_list[n][i].tile_col;
+      int tr = tile_list[n][i].tile_row;
+      int frame_cnt = -1;
+
+      // Reset bit writer to the right location.
+      wb.bit_buffer = tl;
+      wb.bit_offset = 0;
+
+      // Seek to the first camera image.
+      fseeko(infile, camera_frame_pos, SEEK_SET);
+
+      // Read out the camera image
+      while (frame_cnt != image_idx) {
+        aom_video_reader_read_frame(reader);
+        frame_cnt++;
+      }
+
+      size_t frame_size = 0;
+      const unsigned char *frame =
+          aom_video_reader_get_frame(reader, &frame_size);
+
+      aom_codec_control_(&codec, AV1_SET_DECODE_TILE_ROW, tr);
+      aom_codec_control_(&codec, AV1_SET_DECODE_TILE_COL, tc);
+
+      aom_codec_err_t aom_status =
+          aom_codec_decode(&codec, frame, frame_size, NULL);
+      if (aom_status) die_codec(&codec, "Failed to decode tile.");
+
+      aom_codec_control_(&codec, AV1D_GET_TILE_DATA, &tile_data);
+
+      // Copy over tile info.
+      //  uint8_t anchor_frame_idx;
+      //  uint8_t tile_row;
+      //  uint8_t tile_col;
+      //  uint16_t coded_tile_data_size_minus_1;
+      //  uint8_t *coded_tile_data;
+      uint32_t tile_info_bytes = 5;
+      aom_wb_write_literal(&wb, ref_idx, 8);
+      aom_wb_write_literal(&wb, tr, 8);
+      aom_wb_write_literal(&wb, tc, 8);
+      aom_wb_write_literal(&wb, (int)tile_data.coded_tile_data_size - 1, 16);
+      tl += tile_info_bytes;
+
+      memcpy(tl, (uint8_t *)tile_data.coded_tile_data,
+             tile_data.coded_tile_data_size);
+      tl += tile_data.coded_tile_data_size;
+
+      tile_list_obu_size +=
+          tile_info_bytes + (uint32_t)tile_data.coded_tile_data_size;
+    }
+
+    // Write tile list OBU size.
+    size_t bytes_written = 0;
+    if (aom_uleb_encode_fixed_size(tile_list_obu_size, 4, 4, saved_obu_size_loc,
+                                   &bytes_written))
+      die_codec(&codec, "Failed to encode the tile list obu size.");
+
+    // Copy the tile list.
+    if (!aom_video_writer_write_frame(
+            writer, tl_buf, tile_list_obu_header_size + tile_list_obu_size,
+            tl_pts))
+      die_codec(&codec, "Failed to copy compressed tile list.");
+
+    tl_pts++;
+  }
+
+  free(tl_buf);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+  aom_video_writer_close(writer);
+  aom_video_reader_close(reader);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/lightfield_decoder.c b/third_party/aom/examples/lightfield_decoder.c
index 8743df9bc..625cddcac 100644
--- a/third_party/aom/examples/lightfield_decoder.c
+++ b/third_party/aom/examples/lightfield_decoder.c
@@ -14,14 +14,14 @@
 //
 // This is an example of a simple lightfield decoder. It builds upon the
 // simple_decoder.c example.  It takes an input file containing the compressed
-// data (in webm format), treating it as a lightfield instead of a video and
+// data (in ivf format), treating it as a lightfield instead of a video and
 // will decode a single lightfield tile. The lf_width and lf_height arguments
 // are the number of lightfield images in each dimension. The tile to decode
 // is specified by the tile_u, tile_v, tile_s, tile_t arguments. The tile_u,
 // tile_v specify the image and tile_s, tile_t specify the tile in the image.
 // After running the lightfield encoder, run lightfield decoder to decode a
 // single tile:
-// examples/lightfield_decoder vase10x10.webm vase_tile.yuv 10 10 3 4 5 10 5
+// examples/lightfield_decoder vase10x10.ivf vase_reference.yuv 10 10 5
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -29,47 +29,57 @@
 
 #include "aom/aom_decoder.h"
 #include "aom/aomdx.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
 
-#include "../tools_common.h"
-#include "../video_reader.h"
-#include "./aom_config.h"
+#define MAX_EXTERNAL_REFERENCES 128
+#define AOM_BORDER_IN_PIXELS 288
 
 static const char *exec_name;
 
 void usage_exit(void) {
-  fprintf(stderr,
-          "Usage: %s <infile> <outfile> <lf_width> <lf_height> <tlie_u>"
-          " <tile_v> <tile_s> <tile_t> <lf_blocksize>\n",
-          exec_name);
+  fprintf(
+      stderr,
+      "Usage: %s <infile> <outfile> <lf_width> <lf_height> <lf_blocksize>\n",
+      exec_name);
   exit(EXIT_FAILURE);
 }
 
-aom_image_t *aom_img_copy(aom_image_t *src, aom_image_t *dst) {
-  dst = aom_img_alloc(dst, src->fmt, src->d_w, src->d_h, 16);
-
-  int plane;
-
-  for (plane = 0; plane < 3; ++plane) {
-    uint8_t *src_buf = src->planes[plane];
-    const int src_stride = src->stride[plane];
-    const int src_w = plane == 0 ? src->d_w : src->d_w >> 1;
-    const int src_h = plane == 0 ? src->d_h : src->d_h >> 1;
-
-    uint8_t *dst_buf = dst->planes[plane];
-    const int dst_stride = dst->stride[plane];
-    int y;
-
-    for (y = 0; y < src_h; ++y) {
-      memcpy(dst_buf, src_buf, src_w);
-      src_buf += src_stride;
-      dst_buf += dst_stride;
-    }
-  }
-  return dst;
-}
+// Tile list entry provided by the application
+typedef struct {
+  int image_idx;
+  int reference_idx;
+  int tile_col;
+  int tile_row;
+} TILE_LIST_INFO;
+
+// M references: 0 - M-1; N images(including references): 0 - N-1;
+// Note: order the image index incrementally, so that we only go through the
+// bitstream once to construct the tile list.
+const int num_tile_lists = 2;
+const uint16_t tile_count_minus_1 = 9 - 1;
+const TILE_LIST_INFO tile_list[2][9] = {
+  { { 16, 0, 4, 5 },
+    { 83, 3, 13, 2 },
+    { 57, 2, 2, 6 },
+    { 31, 1, 11, 5 },
+    { 2, 0, 7, 4 },
+    { 77, 3, 9, 9 },
+    { 49, 1, 0, 1 },
+    { 6, 0, 3, 10 },
+    { 63, 2, 5, 8 } },
+  { { 65, 2, 11, 1 },
+    { 42, 1, 3, 7 },
+    { 88, 3, 8, 4 },
+    { 76, 3, 1, 15 },
+    { 1, 0, 2, 2 },
+    { 19, 0, 5, 6 },
+    { 60, 2, 4, 0 },
+    { 25, 1, 11, 15 },
+    { 50, 2, 5, 4 } },
+};
 
 int main(int argc, char **argv) {
-  int frame_cnt = 0;
   FILE *outfile = NULL;
   aom_codec_ctx_t codec;
   AvxVideoReader *reader = NULL;
@@ -77,20 +87,20 @@ int main(int argc, char **argv) {
   const AvxVideoInfo *info = NULL;
   const char *lf_width_arg;
   const char *lf_height_arg;
-  const char *tile_u_arg;
-  const char *tile_v_arg;
-  const char *tile_s_arg;
-  const char *tile_t_arg;
   const char *lf_blocksize_arg;
+  int width, height;
   int lf_width, lf_height;
-  int tile_u, tile_v, tile_s, tile_t;
   int lf_blocksize;
   int u_blocks;
   int v_blocks;
+  aom_image_t reference_images[MAX_EXTERNAL_REFERENCES];
+  size_t frame_size = 0;
+  const unsigned char *frame = NULL;
+  int n, i;
 
   exec_name = argv[0];
 
-  if (argc != 10) die("Invalid number of arguments.");
+  if (argc != 6) die("Invalid number of arguments.");
 
   reader = aom_video_reader_open(argv[1]);
   if (!reader) die("Failed to open %s for reading.", argv[1]);
@@ -100,86 +110,113 @@ int main(int argc, char **argv) {
 
   lf_width_arg = argv[3];
   lf_height_arg = argv[4];
-  tile_u_arg = argv[5];
-  tile_v_arg = argv[6];
-  tile_s_arg = argv[7];
-  tile_t_arg = argv[8];
-  lf_blocksize_arg = argv[9];
+  lf_blocksize_arg = argv[5];
   lf_width = (int)strtol(lf_width_arg, NULL, 0);
   lf_height = (int)strtol(lf_height_arg, NULL, 0);
-  tile_u = (int)strtol(tile_u_arg, NULL, 0);
-  tile_v = (int)strtol(tile_v_arg, NULL, 0);
-  tile_s = (int)strtol(tile_s_arg, NULL, 0);
-  tile_t = (int)strtol(tile_t_arg, NULL, 0);
   lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0);
 
   info = aom_video_reader_get_info(reader);
+  width = info->frame_width;
+  height = info->frame_height;
 
   decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
   if (!decoder) die("Unknown input codec.");
-
   printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
 
   if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
     die_codec(&codec, "Failed to initialize decoder.");
 
-  // How many reference images we need to encode.
+  // How many anchor frames we have.
   u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
   v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
-  aom_image_t *reference_images =
-      (aom_image_t *)malloc(u_blocks * v_blocks * sizeof(aom_image_t));
-  for (int bv = 0; bv < v_blocks; ++bv) {
-    for (int bu = 0; bu < u_blocks; ++bu) {
-      aom_video_reader_read_frame(reader);
-      aom_codec_iter_t iter = NULL;
-      aom_image_t *img = NULL;
-      size_t frame_size = 0;
-      const unsigned char *frame =
-          aom_video_reader_get_frame(reader, &frame_size);
-      if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
-        die_codec(&codec, "Failed to decode frame.");
-
-      while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
-        aom_img_copy(img, &reference_images[bu + bv * u_blocks]);
-        char name[1024];
-        snprintf(name, sizeof(name), "ref_%d_%d.yuv", bu, bv);
-        printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h);
-        FILE *ref_file = fopen(name, "wb");
-        aom_img_write(img, ref_file);
-        fclose(ref_file);
-        ++frame_cnt;
-      }
+
+  int num_references = v_blocks * u_blocks;
+
+  // Allocate memory to store decoded references.
+  aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
+  if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  // Allocate memory with the border so that it can be used as a reference.
+  for (i = 0; i < num_references; i++) {
+    unsigned int border = AOM_BORDER_IN_PIXELS;
+    if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, width, height,
+                                   32, 8, border)) {
+      die("Failed to allocate references.");
     }
   }
 
-  int decode_frame_index = tile_v * lf_width + tile_u;
-  do {
+  // Decode anchor frames.
+  aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
+
+  for (i = 0; i < num_references; ++i) {
     aom_video_reader_read_frame(reader);
-  } while (frame_cnt++ != decode_frame_index);
-  size_t frame_size = 0;
-  const unsigned char *frame = aom_video_reader_get_frame(reader, &frame_size);
-
-  int ref_bu = tile_u / lf_blocksize;
-  int ref_bv = tile_v / lf_blocksize;
-  int ref_bi = ref_bu + ref_bv * u_blocks;
-  av1_ref_frame_t ref;
-  ref.idx = 0;
-  ref.img = reference_images[ref_bi];
-  // This is too slow for real lightfield rendering.  This copies the
-  // reference image bytes.  We need a way to just set a pointer
-  // in order to make this fast enough.
-  if (aom_codec_control(&codec, AV1_SET_REFERENCE, &ref)) {
-    die_codec(&codec, "Failed to set reference image.");
+    frame = aom_video_reader_get_frame(reader, &frame_size);
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
+      die_codec(&codec, "Failed to decode frame.");
+
+    if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE,
+                          &reference_images[i]))
+      die_codec(&codec, "Failed to copy decoded reference frame");
+
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img = NULL;
+    while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+      char name[1024];
+      snprintf(name, sizeof(name), "ref_%d.yuv", i);
+      printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h);
+      FILE *ref_file = fopen(name, "wb");
+      aom_img_write(img, ref_file);
+      fclose(ref_file);
+    }
+  }
+
+  FILE *infile = aom_video_reader_get_file(reader);
+  // Record the offset of the first camera image.
+  const FileOffset camera_frame_pos = ftello(infile);
+
+  // Process 1 tile.
+  for (n = 0; n < num_tile_lists; n++) {
+    for (i = 0; i <= tile_count_minus_1; i++) {
+      int image_idx = tile_list[n][i].image_idx;
+      int ref_idx = tile_list[n][i].reference_idx;
+      int tc = tile_list[n][i].tile_col;
+      int tr = tile_list[n][i].tile_row;
+      int frame_cnt = -1;
+
+      // Seek to the first camera image.
+      fseeko(infile, camera_frame_pos, SEEK_SET);
+
+      // Read out the camera image
+      while (frame_cnt != image_idx) {
+        aom_video_reader_read_frame(reader);
+        frame_cnt++;
+      }
+
+      frame = aom_video_reader_get_frame(reader, &frame_size);
+
+      aom_codec_control_(&codec, AV1_SET_TILE_MODE, 1);
+      aom_codec_control_(&codec, AV1D_EXT_TILE_DEBUG, 1);
+      aom_codec_control_(&codec, AV1_SET_DECODE_TILE_ROW, tr);
+      aom_codec_control_(&codec, AV1_SET_DECODE_TILE_COL, tc);
+
+      av1_ref_frame_t ref;
+      ref.idx = 0;
+      ref.use_external_ref = 1;
+      ref.img = reference_images[ref_idx];
+      if (aom_codec_control(&codec, AV1_SET_REFERENCE, &ref)) {
+        die_codec(&codec, "Failed to set reference frame.");
+      }
+
+      aom_codec_err_t aom_status =
+          aom_codec_decode(&codec, frame, frame_size, NULL);
+      if (aom_status) die_codec(&codec, "Failed to decode tile.");
+
+      aom_codec_iter_t iter = NULL;
+      aom_image_t *img = aom_codec_get_frame(&codec, &iter);
+      aom_img_write(img, outfile);
+    }
   }
-  aom_codec_control_(&codec, AV1_SET_DECODE_TILE_ROW, tile_t);
-  aom_codec_control_(&codec, AV1_SET_DECODE_TILE_COL, tile_s);
-  aom_codec_err_t aom_status =
-      aom_codec_decode(&codec, frame, frame_size, NULL, 0);
-  if (aom_status) die_codec(&codec, "Failed to decode tile.");
-  aom_codec_iter_t iter = NULL;
-  aom_image_t *img = aom_codec_get_frame(&codec, &iter);
-  aom_img_write(img, outfile);
 
+  for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]);
   if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
   aom_video_reader_close(reader);
   fclose(outfile);
diff --git a/third_party/aom/examples/lightfield_encoder.c b/third_party/aom/examples/lightfield_encoder.c
index 0a424db8c..22daf622c 100644
--- a/third_party/aom/examples/lightfield_encoder.c
+++ b/third_party/aom/examples/lightfield_encoder.c
@@ -24,7 +24,12 @@
 // image for MCP.
 // Run "make test" to download lightfield test data: vase10x10.yuv.
 // Run lightfield encoder to encode whole lightfield:
-// examples/lightfield_encoder 1024 1024 vase10x10.yuv vase10x10.webm 10 10 5
+// examples/lightfield_encoder 1024 1024 vase10x10.yuv vase10x10.ivf 10 10 5
+
+// Note: In bitstream.c and encoder.c, define EXT_TILE_DEBUG as 1 will print
+// out the uncompressed header and the frame contexts, which can be used to
+// test the bit exactness of the headers and the frame contexts for large scale
+// tile coded frames.
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -34,11 +39,13 @@
 #include "aom/aomcx.h"
 #include "av1/common/enums.h"
 
-#include "../tools_common.h"
-#include "../video_writer.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+#define MAX_EXTERNAL_REFERENCES 128
+#define AOM_BORDER_IN_PIXELS 288
 
 static const char *exec_name;
-static const unsigned int deadline = AOM_DL_GOOD_QUALITY;
 
 void usage_exit(void) {
   fprintf(stderr,
@@ -48,52 +55,26 @@ void usage_exit(void) {
   exit(EXIT_FAILURE);
 }
 
-static aom_image_t *aom_img_copy(aom_image_t *src, aom_image_t *dst) {
-  dst = aom_img_alloc(dst, src->fmt, src->d_w, src->d_h, 16);
-
-  int plane;
-
-  for (plane = 0; plane < 3; ++plane) {
-    unsigned char *src_buf = src->planes[plane];
-    const int src_stride = src->stride[plane];
-    const int src_w = plane == 0 ? src->d_w : src->d_w >> 1;
-    const int src_h = plane == 0 ? src->d_h : src->d_h >> 1;
-
-    unsigned char *dst_buf = dst->planes[plane];
-    const int dst_stride = dst->stride[plane];
-    int y;
-
-    for (y = 0; y < src_h; ++y) {
-      memcpy(dst_buf, src_buf, src_w);
-      src_buf += src_stride;
-      dst_buf += dst_stride;
-    }
-  }
-  return dst;
-}
-
 static int aom_img_size_bytes(aom_image_t *img) {
   int image_size_bytes = 0;
   int plane;
   for (plane = 0; plane < 3; ++plane) {
-    const int stride = img->stride[plane];
     const int w = aom_img_plane_width(img, plane) *
                   ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
     const int h = aom_img_plane_height(img, plane);
-    image_size_bytes += (w + stride) * h;
+    image_size_bytes += w * h;
   }
   return image_size_bytes;
 }
 
 static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
                            aom_codec_pts_t pts, unsigned int duration,
-                           aom_enc_frame_flags_t flags, unsigned int dl,
+                           aom_enc_frame_flags_t flags,
                            aom_fixed_buf_t *stats) {
   int got_pkts = 0;
   aom_codec_iter_t iter = NULL;
   const aom_codec_cx_pkt_t *pkt = NULL;
-  const aom_codec_err_t res =
-      aom_codec_encode(ctx, img, pts, duration, flags, dl);
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
   if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats.");
 
   while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
@@ -113,13 +94,11 @@ static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
 
 static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
                         aom_codec_pts_t pts, unsigned int duration,
-                        aom_enc_frame_flags_t flags, unsigned int dl,
-                        AvxVideoWriter *writer) {
+                        aom_enc_frame_flags_t flags, AvxVideoWriter *writer) {
   int got_pkts = 0;
   aom_codec_iter_t iter = NULL;
   const aom_codec_cx_pkt_t *pkt = NULL;
-  const aom_codec_err_t res =
-      aom_codec_encode(ctx, img, pts, duration, flags, dl);
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
   if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame.");
 
   while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
@@ -139,33 +118,44 @@ static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
   return got_pkts;
 }
 
+static void get_raw_image(aom_image_t **frame_to_encode, aom_image_t *raw,
+                          aom_image_t *raw_shift) {
+  if (!CONFIG_LOWBITDEPTH) {
+    // Need to allocate larger buffer to use hbd internal.
+    int input_shift = 0;
+    aom_img_upshift(raw_shift, raw, input_shift);
+    *frame_to_encode = raw_shift;
+  } else {
+    *frame_to_encode = raw;
+  }
+}
+
 static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
                              const AvxInterface *encoder,
                              const aom_codec_enc_cfg_t *cfg, int lf_width,
-                             int lf_height, int lf_blocksize) {
+                             int lf_height, int lf_blocksize, int flags,
+                             aom_image_t *raw_shift) {
   aom_codec_ctx_t codec;
   int frame_count = 0;
-  int image_size_bytes = 0;
+  int image_size_bytes = aom_img_size_bytes(raw);
   int u_blocks, v_blocks;
   int bu, bv;
   aom_fixed_buf_t stats = { NULL, 0 };
+  aom_image_t *frame_to_encode;
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags))
     die_codec(&codec, "Failed to initialize encoder");
-  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1))
-    die_codec(&codec, "Failed to set frame parallel decoding");
   if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
     die_codec(&codec, "Failed to turn off auto altref");
-  if (aom_codec_control(&codec, AV1E_SET_SINGLE_TILE_DECODING, 1))
-    die_codec(&codec, "Failed to turn on single tile decoding");
-
-  image_size_bytes = aom_img_size_bytes(raw);
+  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
+    die_codec(&codec, "Failed to set frame parallel decoding");
 
   // How many reference images we need to encode.
   u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
   v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
-  aom_image_t *reference_images =
-      (aom_image_t *)malloc(u_blocks * v_blocks * sizeof(aom_image_t));
+
+  printf("\n First pass: ");
+
   for (bv = 0; bv < v_blocks; ++bv) {
     for (bu = 0; bu < u_blocks; ++bu) {
       const int block_u_min = bu * lf_blocksize;
@@ -174,7 +164,6 @@ static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
       int block_v_end = (bv + 1) * lf_blocksize;
       int u_block_size, v_block_size;
       int block_ref_u, block_ref_v;
-      struct av1_ref_frame ref_frame;
 
       block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
       block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
@@ -182,22 +171,28 @@ static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
       v_block_size = block_v_end - block_v_min;
       block_ref_u = block_u_min + u_block_size / 2;
       block_ref_v = block_v_min + v_block_size / 2;
+
+      printf("A%d, ", (block_ref_u + block_ref_v * lf_width));
       fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes,
             SEEK_SET);
       aom_img_read(raw, infile);
-      if (aom_codec_control(&codec, AOME_USE_REFERENCE,
-                            AOM_LAST_FLAG | AOM_GOLD_FLAG | AOM_ALT_FLAG))
-        die_codec(&codec, "Failed to set reference flags");
+      get_raw_image(&frame_to_encode, raw, raw_shift);
+
       // Reference frames can be encoded encoded without tiles.
       ++frame_count;
-      get_frame_stats(&codec, raw, frame_count, 1,
-                      AOM_EFLAG_FORCE_GF | AOM_EFLAG_NO_UPD_ENTROPY, deadline,
+      get_frame_stats(&codec, frame_to_encode, frame_count, 1,
+                      AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                          AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                          AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+                          AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                          AOM_EFLAG_NO_UPD_ARF,
                       &stats);
-      ref_frame.idx = 0;
-      aom_codec_control(&codec, AV1_GET_REFERENCE, &ref_frame);
-      aom_img_copy(&ref_frame.img, &reference_images[frame_count - 1]);
     }
   }
+
+  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1))
+    die_codec(&codec, "Failed to set frame parallel decoding");
+
   for (bv = 0; bv < v_blocks; ++bv) {
     for (bu = 0; bu < u_blocks; ++bu) {
       const int block_u_min = bu * lf_blocksize;
@@ -209,58 +204,39 @@ static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
       block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
       for (v = block_v_min; v < block_v_end; ++v) {
         for (u = block_u_min; u < block_u_end; ++u) {
-          // This was a work around for a bug in libvpx.  I'm not sure if this
-          // same bug exists in current version of av1.  Need to call this,
-          // otherwise the default is to not use any reference frames.  Then
-          // if you don't have at least one AOM_EFLAG_NO_REF_* flag, all frames
-          // will be intra encoded.  I'm not sure why the default is not to use
-          // any reference frames.  It looks like there is something about the
-          // way I encode the reference frames above that sets that as
-          // default...
-          if (aom_codec_control(&codec, AOME_USE_REFERENCE,
-                                AOM_LAST_FLAG | AOM_GOLD_FLAG | AOM_ALT_FLAG))
-            die_codec(&codec, "Failed to set reference flags");
-
-          // Set tile size to 64 pixels. The tile_columns and
-          // tile_rows in the tile coding are overloaded to represent
-          // tile_width and tile_height, that range from 1 to 64, in the unit
-          // of 64 pixels.
-          if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 1))
-            die_codec(&codec, "Failed to set tile width");
-          if (aom_codec_control(&codec, AV1E_SET_TILE_ROWS, 1))
-            die_codec(&codec, "Failed to set tile height");
-
-          av1_ref_frame_t ref;
-          ref.idx = 0;
-          ref.img = reference_images[bv * u_blocks + bu];
-          if (aom_codec_control(&codec, AV1_SET_REFERENCE, &ref))
-            die_codec(&codec, "Failed to set reference frame");
-
+          printf("C%d, ", (u + v * lf_width));
           fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET);
           aom_img_read(raw, infile);
+          get_raw_image(&frame_to_encode, raw, raw_shift);
+
           ++frame_count;
-          get_frame_stats(&codec, raw, frame_count, 1,
-                          AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
-                              AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY |
-                              AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF,
-                          deadline, &stats);
+          get_frame_stats(&codec, frame_to_encode, frame_count, 1,
+                          AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                              AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                              AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+                              AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                              AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY,
+                          &stats);
         }
       }
     }
   }
   // Flush encoder.
-  while (get_frame_stats(&codec, NULL, frame_count, 1, 0, deadline, &stats)) {
+  // No ARF, this should not be needed.
+  while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) {
   }
 
-  printf("Pass 0 complete. Processed %d frames.\n", frame_count);
   if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
 
+  printf("\nFirst pass complete. Processed %d frames.\n", frame_count);
+
   return stats;
 }
 
 static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
-                  const AvxInterface *encoder, const aom_codec_enc_cfg_t *cfg,
-                  int lf_width, int lf_height, int lf_blocksize) {
+                  const AvxInterface *encoder, aom_codec_enc_cfg_t *cfg,
+                  int lf_width, int lf_height, int lf_blocksize, int flags,
+                  aom_image_t *raw_shift) {
   AvxVideoInfo info = { encoder->fourcc,
                         cfg->g_w,
                         cfg->g_h,
@@ -268,27 +244,48 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
   AvxVideoWriter *writer = NULL;
   aom_codec_ctx_t codec;
   int frame_count = 0;
-  int image_size_bytes;
+  int image_size_bytes = aom_img_size_bytes(raw);
   int bu, bv;
   int u_blocks, v_blocks;
+  aom_image_t *frame_to_encode;
+  aom_image_t reference_images[MAX_EXTERNAL_REFERENCES];
+  int reference_image_num = 0;
+  int i;
 
   writer = aom_video_writer_open(outfile_name, kContainerIVF, &info);
   if (!writer) die("Failed to open %s for writing", outfile_name);
 
-  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags))
     die_codec(&codec, "Failed to initialize encoder");
-  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1))
-    die_codec(&codec, "Failed to set frame parallel decoding");
   if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
     die_codec(&codec, "Failed to turn off auto altref");
-  if (aom_codec_control(&codec, AV1E_SET_SINGLE_TILE_DECODING, 1))
-    die_codec(&codec, "Failed to turn on single tile decoding");
+  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
+    die_codec(&codec, "Failed to set frame parallel decoding");
+  // Note: The superblock is a sequence parameter and has to be the same for 1
+  // sequence. In lightfield application, must choose the superblock size(either
+  // 64x64 or 128x128) before the encoding starts. Otherwise, the default is
+  // AOM_SUPERBLOCK_SIZE_DYNAMIC, and the superblock size will be set to 64x64
+  // internally.
+  if (aom_codec_control(&codec, AV1E_SET_SUPERBLOCK_SIZE,
+                        AOM_SUPERBLOCK_SIZE_64X64))
+    die_codec(&codec, "Failed to set SB size");
 
-  image_size_bytes = aom_img_size_bytes(raw);
   u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
   v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
-  aom_image_t *reference_images =
-      (aom_image_t *)malloc(u_blocks * v_blocks * sizeof(aom_image_t));
+
+  reference_image_num = u_blocks * v_blocks;
+  aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
+  if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  // Allocate memory with the border so that it can be used as a reference.
+  for (i = 0; i < reference_image_num; i++) {
+    if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, cfg->g_w,
+                                   cfg->g_h, 32, 8, AOM_BORDER_IN_PIXELS)) {
+      die("Failed to allocate image.");
+    }
+  }
+
+  printf("\n Second pass: ");
+
   // Encode reference images first.
   printf("Encoding Reference Images\n");
   for (bv = 0; bv < v_blocks; ++bv) {
@@ -299,7 +296,6 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
       int block_v_end = (bv + 1) * lf_blocksize;
       int u_block_size, v_block_size;
       int block_ref_u, block_ref_v;
-      struct av1_ref_frame ref_frame;
 
       block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
       block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
@@ -307,25 +303,52 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
       v_block_size = block_v_end - block_v_min;
       block_ref_u = block_u_min + u_block_size / 2;
       block_ref_v = block_v_min + v_block_size / 2;
+
+      printf("A%d, ", (block_ref_u + block_ref_v * lf_width));
       fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes,
             SEEK_SET);
       aom_img_read(raw, infile);
-      if (aom_codec_control(&codec, AOME_USE_REFERENCE,
-                            AOM_LAST_FLAG | AOM_GOLD_FLAG | AOM_ALT_FLAG))
-        die_codec(&codec, "Failed to set reference flags");
+
+      get_raw_image(&frame_to_encode, raw, raw_shift);
+
       // Reference frames may be encoded without tiles.
       ++frame_count;
       printf("Encoding reference image %d of %d\n", bv * u_blocks + bu,
              u_blocks * v_blocks);
-      encode_frame(&codec, raw, frame_count, 1,
-                   AOM_EFLAG_FORCE_GF | AOM_EFLAG_NO_UPD_ENTROPY, deadline,
+      encode_frame(&codec, frame_to_encode, frame_count, 1,
+                   AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                       AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                       AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+                       AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                       AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY,
                    writer);
-      ref_frame.idx = 0;
-      aom_codec_control(&codec, AV1_GET_REFERENCE, &ref_frame);
-      aom_img_copy(&ref_frame.img, &reference_images[frame_count - 1]);
+
+      if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE,
+                            &reference_images[frame_count - 1]))
+        die_codec(&codec, "Failed to copy decoder reference frame");
     }
   }
 
+  cfg->large_scale_tile = 1;
+  // Fixed q encoding for camera frames.
+  cfg->rc_end_usage = AOM_Q;
+  if (aom_codec_enc_config_set(&codec, cfg))
+    die_codec(&codec, "Failed to configure encoder");
+
+  // The fixed q value used in encoding.
+  if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 36))
+    die_codec(&codec, "Failed to set cq level");
+  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1))
+    die_codec(&codec, "Failed to set frame parallel decoding");
+  if (aom_codec_control(&codec, AV1E_SET_SINGLE_TILE_DECODING, 1))
+    die_codec(&codec, "Failed to turn on single tile decoding");
+  // Set tile_columns and tile_rows to MAX values, which guarantees the tile
+  // size of 64 x 64 pixels(i.e. 1 SB) for <= 4k resolution.
+  if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 6))
+    die_codec(&codec, "Failed to set tile width");
+  if (aom_codec_control(&codec, AV1E_SET_TILE_ROWS, 6))
+    die_codec(&codec, "Failed to set tile height");
+
   for (bv = 0; bv < v_blocks; ++bv) {
     for (bu = 0; bu < u_blocks; ++bu) {
       const int block_u_min = bu * lf_blocksize;
@@ -337,56 +360,44 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
       block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
       for (v = block_v_min; v < block_v_end; ++v) {
         for (u = block_u_min; u < block_u_end; ++u) {
-          // This was a work around for a bug in libvpx.  I'm not sure if this
-          // same bug exists in current version of av1.  Need to call this,
-          // otherwise the default is to not use any reference frames.  Then
-          // if you don't have at least one AOM_EFLAG_NO_REF_* flag, all frames
-          // will be intra encoded.  I'm not sure why the default is not to use
-          // any reference frames.  It looks like there is something about the
-          // way I encode the reference frames above that sets that as
-          // default...
-          if (aom_codec_control(&codec, AOME_USE_REFERENCE,
-                                AOM_LAST_FLAG | AOM_GOLD_FLAG | AOM_ALT_FLAG))
-            die_codec(&codec, "Failed to set reference flags");
-
-          // Set tile size to 64 pixels. The tile_columns and
-          // tile_rows in the tile coding are overloaded to represent tile_width
-          // and tile_height, that range from 1 to 64, in the unit of 64 pixels.
-          if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 1))
-            die_codec(&codec, "Failed to set tile width");
-          if (aom_codec_control(&codec, AV1E_SET_TILE_ROWS, 1))
-            die_codec(&codec, "Failed to set tile height");
-
           av1_ref_frame_t ref;
           ref.idx = 0;
+          ref.use_external_ref = 1;
           ref.img = reference_images[bv * u_blocks + bu];
           if (aom_codec_control(&codec, AV1_SET_REFERENCE, &ref))
             die_codec(&codec, "Failed to set reference frame");
+
+          printf("C%d, ", (u + v * lf_width));
           fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET);
           aom_img_read(raw, infile);
-          ++frame_count;
+          get_raw_image(&frame_to_encode, raw, raw_shift);
 
+          ++frame_count;
           printf("Encoding image %d of %d\n",
                  frame_count - (u_blocks * v_blocks), lf_width * lf_height);
-          encode_frame(&codec, raw, frame_count, 1,
-                       AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
-                           AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY |
-                           AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF,
-                       deadline, writer);
+          encode_frame(&codec, frame_to_encode, frame_count, 1,
+                       AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                           AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                           AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+                           AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                           AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY,
+                       writer);
         }
       }
     }
   }
 
   // Flush encoder.
-  while (encode_frame(&codec, NULL, -1, 1, 0, deadline, writer)) {
+  // No ARF, this should not be needed.
+  while (encode_frame(&codec, NULL, -1, 1, 0, writer)) {
   }
 
-  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+  for (i = 0; i < reference_image_num; i++) aom_img_free(&reference_images[i]);
 
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
   aom_video_writer_close(writer);
 
-  printf("Pass 1 complete. Processed %d frames.\n", frame_count);
+  printf("\nSecond pass complete. Processed %d frames.\n", frame_count);
 }
 
 int main(int argc, char **argv) {
@@ -401,8 +412,10 @@ int main(int argc, char **argv) {
   aom_codec_ctx_t codec;
   aom_codec_enc_cfg_t cfg;
   aom_image_t raw;
+  aom_image_t raw_shift;
   aom_codec_err_t res;
   aom_fixed_buf_t stats;
+  int flags = 0;
 
   const AvxInterface *encoder = NULL;
   const int fps = 30;
@@ -435,14 +448,19 @@ int main(int argc, char **argv) {
     die("Invalid lf_width and/or lf_height: %dx%d", lf_width, lf_height);
   if (lf_blocksize <= 0) die("Invalid lf_blocksize: %d", lf_blocksize);
 
-  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 1))
-    die("Failed to allocate image", w, h);
+  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 32)) {
+    die("Failed to allocate image.");
+  }
+  if (!CONFIG_LOWBITDEPTH) {
+    // Need to allocate larger buffer to use hbd internal.
+    aom_img_alloc(&raw_shift, AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH, w, h,
+                  32);
+  }
 
   printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
 
   // Configuration
   res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
-
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = w;
@@ -450,29 +468,32 @@ int main(int argc, char **argv) {
   cfg.g_timebase.num = 1;
   cfg.g_timebase.den = fps;
   cfg.rc_target_bitrate = bitrate;
-  cfg.g_error_resilient = AOM_ERROR_RESILIENT_DEFAULT;
-  // Need to set lag_in_frames to 1 or 0.  Otherwise the frame flags get
-  // overridden after the first frame in encode_frame_to_data_rate() (see where
-  // get_frame_flags() is called).
-  cfg.g_lag_in_frames = 0;
+  cfg.g_error_resilient = 0;  // This is required.
+  cfg.g_lag_in_frames = 0;    // need to set this since default is 19.
   cfg.kf_mode = AOM_KF_DISABLED;
-  cfg.large_scale_tile = 1;
+  cfg.large_scale_tile = 0;  // Only set it to 1 for camera frame encoding.
+  cfg.g_bit_depth = AOM_BITS_8;
+  flags |= (cfg.g_bit_depth > AOM_BITS_8 || !CONFIG_LOWBITDEPTH)
+               ? AOM_CODEC_USE_HIGHBITDEPTH
+               : 0;
 
   if (!(infile = fopen(infile_arg, "rb")))
     die("Failed to open %s for reading", infile_arg);
 
   // Pass 0
   cfg.g_pass = AOM_RC_FIRST_PASS;
-  stats = pass0(&raw, infile, encoder, &cfg, lf_width, lf_height, lf_blocksize);
+  stats = pass0(&raw, infile, encoder, &cfg, lf_width, lf_height, lf_blocksize,
+                flags, &raw_shift);
 
   // Pass 1
   rewind(infile);
   cfg.g_pass = AOM_RC_LAST_PASS;
   cfg.rc_twopass_stats_in = stats;
   pass1(&raw, infile, outfile_arg, encoder, &cfg, lf_width, lf_height,
-        lf_blocksize);
+        lf_blocksize, flags, &raw_shift);
   free(stats.buf);
 
+  if (!CONFIG_LOWBITDEPTH) aom_img_free(&raw_shift);
   aom_img_free(&raw);
   fclose(infile);
 
diff --git a/third_party/aom/examples/lightfield_tile_list_decoder.c b/third_party/aom/examples/lightfield_tile_list_decoder.c
new file mode 100644
index 000000000..cec6baa2c
--- /dev/null
+++ b/third_party/aom/examples/lightfield_tile_list_decoder.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Tile List Decoder
+// ============================
+//
+// This is a lightfield tile list decoder example. It takes an input file that
+// contains the anchor frames that are references of the coded tiles, the camera
+// frame header, and tile list OBUs that include the tile information and the
+// compressed tile data. This input file is reconstructed from the encoded
+// lightfield ivf file, and is decodable by AV1 decoder. The lf_width and
+// lf_height arguments are the number of lightfield images in each dimension.
+// The lf_blocksize determines the number of reference images used.
+// Run lightfield tile list decoder to decode an AV1 tile list file:
+// examples/lightfield_tile_list_decoder vase_tile_list.ivf vase_tile_list.yuv
+// 10 10 5 2
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+#define MAX_EXTERNAL_REFERENCES 128
+#define AOM_BORDER_IN_PIXELS 288
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s <infile> <outfile> <lf_width> <lf_height> <lf_blocksize> "
+          "<num_tile_lists>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv) {
+  FILE *outfile = NULL;
+  aom_codec_ctx_t codec;
+  AvxVideoReader *reader = NULL;
+  const AvxInterface *decoder = NULL;
+  const AvxVideoInfo *info = NULL;
+  const char *lf_width_arg;
+  const char *lf_height_arg;
+  const char *lf_blocksize_arg;
+  int width, height;
+  int lf_width, lf_height, lf_blocksize;
+  int u_blocks, v_blocks;
+  int num_tile_lists;
+  aom_image_t reference_images[MAX_EXTERNAL_REFERENCES];
+  size_t frame_size = 0;
+  const unsigned char *frame = NULL;
+  int i, n;
+
+  exec_name = argv[0];
+
+  if (argc != 7) die("Invalid number of arguments.");
+
+  reader = aom_video_reader_open(argv[1]);
+  if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+  if (!(outfile = fopen(argv[2], "wb")))
+    die("Failed to open %s for writing.", argv[2]);
+
+  lf_width_arg = argv[3];
+  lf_height_arg = argv[4];
+  lf_blocksize_arg = argv[5];
+  lf_width = (int)strtol(lf_width_arg, NULL, 0);
+  lf_height = (int)strtol(lf_height_arg, NULL, 0);
+  lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0);
+  num_tile_lists = (int)strtol(argv[6], NULL, 0);
+
+  info = aom_video_reader_get_info(reader);
+  width = info->frame_width;
+  height = info->frame_height;
+
+  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  if (!decoder) die("Unknown input codec.");
+  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+
+  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+    die_codec(&codec, "Failed to initialize decoder.");
+
+  // How many anchor frames we have.
+  u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
+  v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
+
+  int num_references = v_blocks * u_blocks;
+
+  // Allocate memory to store decoded references.
+  aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
+  if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  // Allocate memory with the border so that it can be used as a reference.
+  for (i = 0; i < num_references; i++) {
+    unsigned int border = AOM_BORDER_IN_PIXELS;
+    if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, width, height,
+                                   32, 8, border)) {
+      die("Failed to allocate references.");
+    }
+  }
+
+  // Decode anchor frames.
+  aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
+
+  for (i = 0; i < num_references; ++i) {
+    aom_video_reader_read_frame(reader);
+    frame = aom_video_reader_get_frame(reader, &frame_size);
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
+      die_codec(&codec, "Failed to decode frame.");
+
+    if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE,
+                          &reference_images[i]))
+      die_codec(&codec, "Failed to copy decoded reference frame");
+
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img = NULL;
+    while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+      char name[1024];
+      snprintf(name, sizeof(name), "ref_%d.yuv", i);
+      printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h);
+      FILE *ref_file = fopen(name, "wb");
+      aom_img_write(img, ref_file);
+      fclose(ref_file);
+    }
+  }
+
+  // Decode the lightfield.
+  aom_codec_control_(&codec, AV1_SET_TILE_MODE, 1);
+
+  // Set external references.
+  av1_ext_ref_frame_t set_ext_ref = { &reference_images[0], num_references };
+  aom_codec_control_(&codec, AV1D_SET_EXT_REF_PTR, &set_ext_ref);
+
+  // Must decode the camera frame header first.
+  aom_video_reader_read_frame(reader);
+  frame = aom_video_reader_get_frame(reader, &frame_size);
+  if (aom_codec_decode(&codec, frame, frame_size, NULL))
+    die_codec(&codec, "Failed to decode the frame.");
+
+  // Decode tile lists one by one.
+  for (n = 0; n < num_tile_lists; n++) {
+    aom_video_reader_read_frame(reader);
+    frame = aom_video_reader_get_frame(reader, &frame_size);
+
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
+      die_codec(&codec, "Failed to decode the tile list.");
+
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img;
+    while ((img = aom_codec_get_frame(&codec, &iter)))
+      fwrite(img->img_data, 1, img->sz, outfile);
+  }
+
+  for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+  aom_video_reader_close(reader);
+  fclose(outfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/lossless_encoder.c b/third_party/aom/examples/lossless_encoder.c
index 32ab18a16..438ff21c6 100644
--- a/third_party/aom/examples/lossless_encoder.c
+++ b/third_party/aom/examples/lossless_encoder.c
@@ -15,9 +15,8 @@
 
 #include "aom/aom_encoder.h"
 #include "aom/aomcx.h"
-
-#include "../tools_common.h"
-#include "../video_writer.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
 
 static const char *exec_name;
 
@@ -35,7 +34,7 @@ static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
   aom_codec_iter_t iter = NULL;
   const aom_codec_cx_pkt_t *pkt = NULL;
   const aom_codec_err_t res =
-      aom_codec_encode(codec, img, frame_index, 1, flags, AOM_DL_GOOD_QUALITY);
+      aom_codec_encode(codec, img, frame_index, 1, flags);
   if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame");
 
   while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) {
diff --git a/third_party/aom/examples/noise_model.c b/third_party/aom/examples/noise_model.c
new file mode 100644
index 000000000..5a5b4d40d
--- /dev/null
+++ b/third_party/aom/examples/noise_model.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief This is an sample binary to create noise params from input video.
+ *
+ * To allow for external denoising applications, this sample binary illustrates
+ * how to create a film grain table (film grain params as a function of time)
+ * from an input video and its corresponding denoised source.
+ *
+ * The --output-grain-table file can be passed as input to the encoder (in
+ * aomenc this is done through the "--film-grain-table" parameter).
+ *
+ * As an example, where the input source is an 854x480 yuv420p 8-bit video
+ * named "input.854_480.yuv" you would use steps similar to the following:
+ *
+ * # Run your denoiser (e.g, using hqdn3d filter):
+ * ffmpeg -vcodec rawvideo -video_size 854x480 -i input.854_480.yuv \
+ *    -vf hqdn3d=5:5:5:5 -vcodec rawvideo -an -f rawvideo \
+ *    denoised.854_480.yuv
+ *
+ * # Model the noise between the denoised version and original source:
+ * ./examples/noise_model --fps=25/1 --width=854 --height=480 --i420 \
+ *    --input-denoised=denoised.854_480.yuv --input=original.854_480.yuv \
+ *    --output-grain-table=film_grain.tbl
+ *
+ * # Encode with your favorite settings (including the grain table):
+ * aomenc --limit=100  --cpu-used=4 --input-bit-depth=8                  \
+ *    --i420 -w 854 -h 480 --end-usage=q --cq-level=25 --lag-in-frames=25 \
+ *    --auto-alt-ref=2 --bit-depth=8 --film-grain-table=film_grain.tbl \
+ *    -o denoised_with_grain_params.ivf denoised.854_480.yuv
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#if CONFIG_AV1_DECODER
+#include "aom_dsp/grain_synthesis.h"
+#endif
+
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s --input=<input> --input-denoised=<denoised> "
+          "--output-grain-table=<outfile> "
+          "See comments in noise_model.c for more information.\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static const arg_def_t help =
+    ARG_DEF(NULL, "help", 0, "Show usage options and exit");
+static const arg_def_t width_arg =
+    ARG_DEF("w", "width", 1, "Input width (if rawvideo)");
+static const arg_def_t height_arg =
+    ARG_DEF("h", "height", 1, "Input height (if rawvideo)");
+static const arg_def_t skip_frames_arg =
+    ARG_DEF("s", "skip-frames", 1, "Number of frames to skip (default = 1)");
+static const arg_def_t fps_arg = ARG_DEF(NULL, "fps", 1, "Frame rate");
+static const arg_def_t input_arg = ARG_DEF("-i", "input", 1, "Input filename");
+static const arg_def_t output_grain_table_arg =
+    ARG_DEF("n", "output-grain-table", 1, "Output noise file");
+static const arg_def_t input_denoised_arg =
+    ARG_DEF("d", "input-denoised", 1, "Input denoised filename (YUV) only");
+static const arg_def_t flat_block_finder_arg =
+    ARG_DEF("b", "flat-block-finder", 1, "Run the flat block finder");
+static const arg_def_t block_size_arg =
+    ARG_DEF("b", "block-size", 1, "Block size");
+static const arg_def_t bit_depth_arg =
+    ARG_DEF(NULL, "bit-depth", 1, "Bit depth of input");
+static const arg_def_t use_i420 =
+    ARG_DEF(NULL, "i420", 0, "Input file (and denoised) is I420 (default)");
+static const arg_def_t use_i422 =
+    ARG_DEF(NULL, "i422", 0, "Input file (and denoised) is I422");
+static const arg_def_t use_i444 =
+    ARG_DEF(NULL, "i444", 0, "Input file (and denoised) is I444");
+static const arg_def_t debug_file_arg =
+    ARG_DEF(NULL, "debug-file", 1, "File to output debug info");
+
+typedef struct {
+  int width;
+  int height;
+  struct aom_rational fps;
+  const char *input;
+  const char *input_denoised;
+  const char *output_grain_table;
+  int img_fmt;
+  int block_size;
+  int bit_depth;
+  int run_flat_block_finder;
+  int force_flat_psd;
+  int skip_frames;
+  const char *debug_file;
+} noise_model_args_t;
+
+void parse_args(noise_model_args_t *noise_args, int *argc, char **argv) {
+  struct arg arg;
+  static const arg_def_t *main_args[] = { &help,
+                                          &input_arg,
+                                          &fps_arg,
+                                          &width_arg,
+                                          &height_arg,
+                                          &block_size_arg,
+                                          &output_grain_table_arg,
+                                          &input_denoised_arg,
+                                          &use_i420,
+                                          &use_i422,
+                                          &use_i444,
+                                          &debug_file_arg,
+                                          NULL };
+  for (int argi = *argc + 1; *argv; argi++, argv++) {
+    if (arg_match(&arg, &help, argv)) {
+      fprintf(stdout, "\nOptions:\n");
+      arg_show_usage(stdout, main_args);
+      exit(0);
+    } else if (arg_match(&arg, &width_arg, argv)) {
+      noise_args->width = atoi(arg.val);
+    } else if (arg_match(&arg, &height_arg, argv)) {
+      noise_args->height = atoi(arg.val);
+    } else if (arg_match(&arg, &input_arg, argv)) {
+      noise_args->input = arg.val;
+    } else if (arg_match(&arg, &input_denoised_arg, argv)) {
+      noise_args->input_denoised = arg.val;
+    } else if (arg_match(&arg, &output_grain_table_arg, argv)) {
+      noise_args->output_grain_table = arg.val;
+    } else if (arg_match(&arg, &block_size_arg, argv)) {
+      noise_args->block_size = atoi(arg.val);
+    } else if (arg_match(&arg, &bit_depth_arg, argv)) {
+      noise_args->bit_depth = atoi(arg.val);
+    } else if (arg_match(&arg, &flat_block_finder_arg, argv)) {
+      noise_args->run_flat_block_finder = atoi(arg.val);
+    } else if (arg_match(&arg, &fps_arg, argv)) {
+      noise_args->fps = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &use_i420, argv)) {
+      noise_args->img_fmt = AOM_IMG_FMT_I420;
+    } else if (arg_match(&arg, &use_i422, argv)) {
+      noise_args->img_fmt = AOM_IMG_FMT_I422;
+    } else if (arg_match(&arg, &use_i444, argv)) {
+      noise_args->img_fmt = AOM_IMG_FMT_I444;
+    } else if (arg_match(&arg, &skip_frames_arg, argv)) {
+      noise_args->skip_frames = atoi(arg.val);
+    } else if (arg_match(&arg, &debug_file_arg, argv)) {
+      noise_args->debug_file = arg.val;
+    } else {
+      fprintf(stdout, "Unknown arg: %s\n\nUsage:\n", *argv);
+      arg_show_usage(stdout, main_args);
+      exit(0);
+    }
+  }
+  if (noise_args->bit_depth > 8) {
+    noise_args->img_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  }
+}
+
+#if CONFIG_AV1_DECODER
+static void print_variance_y(FILE *debug_file, aom_image_t *raw,
+                             aom_image_t *denoised, const uint8_t *flat_blocks,
+                             int block_size, aom_film_grain_t *grain) {
+  aom_image_t renoised;
+  grain->apply_grain = 1;
+  grain->random_seed = 1071;
+  aom_img_alloc(&renoised, raw->fmt, raw->w, raw->h, 1);
+  av1_add_film_grain(grain, denoised, &renoised);
+
+  const int num_blocks_w = (raw->w + block_size - 1) / block_size;
+  const int num_blocks_h = (raw->h + block_size - 1) / block_size;
+  fprintf(debug_file, "x = [");
+  for (int by = 0; by < num_blocks_h; by++) {
+    for (int bx = 0; bx < num_blocks_w; bx++) {
+      double block_mean = 0;
+      double noise_std = 0, noise_mean = 0;
+      double renoise_std = 0, renoise_mean = 0;
+      for (int yi = 0; yi < block_size; ++yi) {
+        const int y = by * block_size + yi;
+        for (int xi = 0; xi < block_size; ++xi) {
+          const int x = bx * block_size + xi;
+          const double noise_v = (raw->planes[0][y * raw->stride[0] + x] -
+                                  denoised->planes[0][y * raw->stride[0] + x]);
+          noise_mean += noise_v;
+          noise_std += noise_v * noise_v;
+
+          block_mean += raw->planes[0][y * raw->stride[0] + x];
+
+          const double renoise_v =
+              (renoised.planes[0][y * raw->stride[0] + x] -
+               denoised->planes[0][y * raw->stride[0] + x]);
+          renoise_mean += renoise_v;
+          renoise_std += renoise_v * renoise_v;
+        }
+      }
+      int n = (block_size * block_size);
+      block_mean /= n;
+      noise_mean /= n;
+      renoise_mean /= n;
+      noise_std = sqrt(noise_std / n - noise_mean * noise_mean);
+      renoise_std = sqrt(renoise_std / n - renoise_mean * renoise_mean);
+      fprintf(debug_file, "%d %3.2lf %3.2lf %3.2lf  ",
+              flat_blocks[by * num_blocks_w + bx], block_mean, noise_std,
+              renoise_std);
+    }
+    fprintf(debug_file, "\n");
+  }
+  fprintf(debug_file, "];\n");
+
+  if (raw->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+    fprintf(stderr,
+            "Detailed debug info not supported for high bit"
+            "depth formats\n");
+  } else {
+    fprintf(debug_file, "figure(2); clf;\n");
+    fprintf(debug_file,
+            "scatter(x(:, 2:4:end), x(:, 3:4:end), 'r'); hold on;\n");
+    fprintf(debug_file, "scatter(x(:, 2:4:end), x(:, 4:4:end), 'b');\n");
+    fprintf(debug_file,
+            "plot(linspace(0, 255, length(noise_strength_0)), "
+            "noise_strength_0, 'b');\n");
+    fprintf(debug_file,
+            "title('Scatter plot of intensity vs noise strength');\n");
+    fprintf(debug_file,
+            "legend('Actual', 'Estimated', 'Estimated strength');\n");
+    fprintf(debug_file, "figure(3); clf;\n");
+    fprintf(debug_file, "scatter(x(:, 3:4:end), x(:, 4:4:end), 'k');\n");
+    fprintf(debug_file, "title('Actual vs Estimated');\n");
+    fprintf(debug_file, "pause(3);\n");
+  }
+  aom_img_free(&renoised);
+}
+#endif
+
+static void print_debug_info(FILE *debug_file, aom_image_t *raw,
+                             aom_image_t *denoised, uint8_t *flat_blocks,
+                             int block_size, aom_noise_model_t *noise_model) {
+  (void)raw;
+  (void)denoised;
+  (void)flat_blocks;
+  (void)block_size;
+  fprintf(debug_file, "figure(3); clf;\n");
+  fprintf(debug_file, "figure(2); clf;\n");
+  fprintf(debug_file, "figure(1); clf;\n");
+  for (int c = 0; c < 3; ++c) {
+    fprintf(debug_file, "noise_strength_%d = [\n", c);
+    const aom_equation_system_t *eqns =
+        &noise_model->combined_state[c].strength_solver.eqns;
+    for (int k = 0; k < eqns->n; ++k) {
+      fprintf(debug_file, "%lf ", eqns->x[k]);
+    }
+    fprintf(debug_file, "];\n");
+    fprintf(debug_file, "plot(noise_strength_%d); hold on;\n", c);
+  }
+  fprintf(debug_file, "legend('Y', 'cb', 'cr');\n");
+  fprintf(debug_file, "title('Noise strength function');\n");
+
+#if CONFIG_AV1_DECODER
+  aom_film_grain_t grain;
+  aom_noise_model_get_grain_parameters(noise_model, &grain);
+  print_variance_y(debug_file, raw, denoised, flat_blocks, block_size, &grain);
+#endif
+  fflush(debug_file);
+}
+
+int main(int argc, char *argv[]) {
+  noise_model_args_t args = { 0,  0, { 25, 1 }, 0, 0, 0,   AOM_IMG_FMT_I420,
+                              32, 8, 1,         0, 1, NULL };
+  aom_image_t raw, denoised;
+  FILE *infile = NULL;
+  AvxVideoInfo info;
+
+  memset(&info, 0, sizeof(info));
+
+  exec_name = argv[0];
+  parse_args(&args, &argc, argv + 1);
+
+  info.frame_width = args.width;
+  info.frame_height = args.height;
+  info.time_base.numerator = args.fps.den;
+  info.time_base.denominator = args.fps.num;
+
+  if (info.frame_width <= 0 || info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+  if (!aom_img_alloc(&raw, args.img_fmt, info.frame_width, info.frame_height,
+                     1)) {
+    die("Failed to allocate image.");
+  }
+  if (!aom_img_alloc(&denoised, args.img_fmt, info.frame_width,
+                     info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+  infile = fopen(args.input, "r");
+  if (!infile) {
+    die("Failed to open input file:", args.input);
+  }
+  fprintf(stderr, "Bit depth: %d  stride:%d\n", args.bit_depth, raw.stride[0]);
+
+  const int high_bd = args.bit_depth > 8;
+  const int block_size = args.block_size;
+  aom_flat_block_finder_t block_finder;
+  aom_flat_block_finder_init(&block_finder, block_size, args.bit_depth,
+                             high_bd);
+
+  const int num_blocks_w = (info.frame_width + block_size - 1) / block_size;
+  const int num_blocks_h = (info.frame_height + block_size - 1) / block_size;
+  uint8_t *flat_blocks = (uint8_t *)aom_malloc(num_blocks_w * num_blocks_h);
+  // Sets the random seed on the first entry in the output table
+  int16_t random_seed = 1071;
+  aom_noise_model_t noise_model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, args.bit_depth,
+                                      high_bd };
+  aom_noise_model_init(&noise_model, params);
+
+  FILE *denoised_file = 0;
+  if (args.input_denoised) {
+    denoised_file = fopen(args.input_denoised, "rb");
+    if (!denoised_file)
+      die("Unable to open input_denoised: %s", args.input_denoised);
+  } else {
+    die("--input-denoised file must be specified");
+  }
+  FILE *debug_file = 0;
+  if (args.debug_file) {
+    debug_file = fopen(args.debug_file, "w");
+  }
+  aom_film_grain_table_t grain_table = { 0, 0 };
+
+  int64_t prev_timestamp = 0;
+  int frame_count = 0;
+  while (aom_img_read(&raw, infile)) {
+    if (args.input_denoised) {
+      if (!aom_img_read(&denoised, denoised_file)) {
+        die("Unable to read input denoised file");
+      }
+    }
+    if (frame_count % args.skip_frames == 0) {
+      int num_flat_blocks = num_blocks_w * num_blocks_h;
+      memset(flat_blocks, 1, num_flat_blocks);
+      if (args.run_flat_block_finder) {
+        memset(flat_blocks, 0, num_flat_blocks);
+        num_flat_blocks = aom_flat_block_finder_run(
+            &block_finder, raw.planes[0], info.frame_width, info.frame_height,
+            info.frame_width, flat_blocks);
+        fprintf(stdout, "Num flat blocks %d\n", num_flat_blocks);
+      }
+
+      const uint8_t *planes[3] = { raw.planes[0], raw.planes[1],
+                                   raw.planes[2] };
+      uint8_t *denoised_planes[3] = { denoised.planes[0], denoised.planes[1],
+                                      denoised.planes[2] };
+      int strides[3] = { raw.stride[0] >> high_bd, raw.stride[1] >> high_bd,
+                         raw.stride[2] >> high_bd };
+      int chroma_sub[3] = { raw.x_chroma_shift, raw.y_chroma_shift, 0 };
+
+      fprintf(stdout, "Updating noise model...\n");
+      aom_noise_status_t status = aom_noise_model_update(
+          &noise_model, (const uint8_t *const *)planes,
+          (const uint8_t *const *)denoised_planes, info.frame_width,
+          info.frame_height, strides, chroma_sub, flat_blocks, block_size);
+
+      int64_t cur_timestamp =
+          frame_count * 10000000ULL * args.fps.den / args.fps.num;
+      if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
+        fprintf(stdout,
+                "Noise type is different, updating parameters for time "
+                "[ %" PRId64 ", %" PRId64 ")\n",
+                prev_timestamp, cur_timestamp);
+        aom_film_grain_t grain;
+        aom_noise_model_get_grain_parameters(&noise_model, &grain);
+        grain.random_seed = random_seed;
+        random_seed = 0;
+        aom_film_grain_table_append(&grain_table, prev_timestamp, cur_timestamp,
+                                    &grain);
+        aom_noise_model_save_latest(&noise_model);
+        prev_timestamp = cur_timestamp;
+      }
+      if (debug_file) {
+        print_debug_info(debug_file, &raw, &denoised, flat_blocks, block_size,
+                         &noise_model);
+      }
+      fprintf(stdout, "Done noise model update, status = %d\n", status);
+    }
+    frame_count++;
+  }
+
+  aom_film_grain_t grain;
+  aom_noise_model_get_grain_parameters(&noise_model, &grain);
+  grain.random_seed = random_seed;
+  aom_film_grain_table_append(&grain_table, prev_timestamp, INT64_MAX, &grain);
+  if (args.output_grain_table) {
+    struct aom_internal_error_info error_info;
+    if (AOM_CODEC_OK != aom_film_grain_table_write(&grain_table,
+                                                   args.output_grain_table,
+                                                   &error_info)) {
+      die("Unable to write output film grain table");
+    }
+  }
+  aom_film_grain_table_free(&grain_table);
+
+  if (infile) fclose(infile);
+  if (denoised_file) fclose(denoised_file);
+  if (debug_file) fclose(debug_file);
+  aom_img_free(&raw);
+  aom_img_free(&denoised);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/resize_util.c b/third_party/aom/examples/resize_util.c
index 5485691a8..6a84d5740 100644
--- a/third_party/aom/examples/resize_util.c
+++ b/third_party/aom/examples/resize_util.c
@@ -16,8 +16,8 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "../tools_common.h"
-#include "../av1/encoder/av1_resize.h"
+#include "av1/common/resize.h"
+#include "common/tools_common.h"
 
 static const char *exec_name = NULL;
 
diff --git a/third_party/aom/examples/scalable_decoder.c b/third_party/aom/examples/scalable_decoder.c
new file mode 100644
index 000000000..c22924223
--- /dev/null
+++ b/third_party/aom/examples/scalable_decoder.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Scalable Decoder
+// ==============
+//
+// This is an example of a scalable decoder loop. It takes a 2-spatial-layer
+// input file
+// containing the compressed data (in OBU format), passes it through the
+// decoder, and writes the decompressed frames to disk. The base layer and
+// enhancement layers are stored as separate files, out_lyr0.yuv and
+// out_lyr1.yuv, respectively.
+//
+// Standard Includes
+// -----------------
+// For decoders, you only have to include `aom_decoder.h` and then any
+// header files for the specific codecs you use. In this case, we're using
+// av1.
+//
+// Initializing The Codec
+// ----------------------
+// The libaom decoder is initialized by the call to aom_codec_dec_init().
+// Determining the codec interface to use is handled by AvxVideoReader and the
+// functions prefixed with aom_video_reader_. Discussion of those functions is
+// beyond the scope of this example, but the main gist is to open the input file
+// and parse just enough of it to determine if it's a AVx file and which AVx
+// codec is contained within the file.
+// Note the NULL pointer passed to aom_codec_dec_init(). We do that in this
+// example because we want the algorithm to determine the stream configuration
+// (width/height) and allocate memory automatically.
+//
+// Decoding A Frame
+// ----------------
+// Once the frame has been read into memory, it is decoded using the
+// `aom_codec_decode` function. The call takes a pointer to the data
+// (`frame`) and the length of the data (`frame_size`). No application data
+// is associated with the frame in this example, so the `user_priv`
+// parameter is NULL. The `deadline` parameter is left at zero for this
+// example. This parameter is generally only used when doing adaptive post
+// processing.
+//
+// Codecs may produce a variable number of output frames for every call to
+// `aom_codec_decode`. These frames are retrieved by the
+// `aom_codec_get_frame` iterator function. The iterator variable `iter` is
+// initialized to NULL each time `aom_codec_decode` is called.
+// `aom_codec_get_frame` is called in a loop, returning a pointer to a
+// decoded image or NULL to indicate the end of list.
+//
+// Processing The Decoded Data
+// ---------------------------
+// In this example, we simply write the encoded data to disk. It is
+// important to honor the image's `stride` values.
+//
+// Cleanup
+// -------
+// The `aom_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exceptions, aom_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static const char *exec_name;
+
+#define MAX_LAYERS 5
+
+void usage_exit(void) {
+  fprintf(stderr, "Usage: %s <infile>\n", exec_name);
+  exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv) {
+  int frame_cnt = 0;
+  FILE *outfile[MAX_LAYERS];
+  char filename[80];
+  aom_codec_ctx_t codec;
+  const AvxInterface *decoder = NULL;
+  FILE *inputfile = NULL;
+  uint8_t *buf = NULL;
+  size_t bytes_in_buffer = 0;
+  size_t buffer_size = 0;
+  struct AvxInputContext aom_input_ctx;
+  struct ObuDecInputContext obu_ctx = { &aom_input_ctx, NULL, 0, 0, 0 };
+  aom_codec_stream_info_t si;
+  uint8_t tmpbuf[32];
+  unsigned int i;
+
+  exec_name = argv[0];
+
+  if (argc != 2) die("Invalid number of arguments.");
+
+  if (!(inputfile = fopen(argv[1], "rb")))
+    die("Failed to open %s for read.", argv[1]);
+  obu_ctx.avx_ctx->file = inputfile;
+  obu_ctx.avx_ctx->filename = argv[1];
+
+  decoder = get_aom_decoder_by_index(0);
+  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+
+  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+    die_codec(&codec, "Failed to initialize decoder.");
+
+  if (aom_codec_control(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, 1)) {
+    die_codec(&codec, "Failed to set output_all_layers control.");
+  }
+
+  // peak sequence header OBU to get number of spatial layers
+  const size_t ret = fread(tmpbuf, 1, 32, inputfile);
+  if (ret != 32) die_codec(&codec, "Input is not a valid obu file");
+  si.is_annexb = 0;
+  if (aom_codec_peek_stream_info(decoder->codec_interface(), tmpbuf, 32, &si)) {
+    die_codec(&codec, "Input is not a valid obu file");
+  }
+  fseek(inputfile, -32, SEEK_CUR);
+
+  if (!file_is_obu(&obu_ctx))
+    die_codec(&codec, "Input is not a valid obu file");
+
+  // open base layer output yuv file
+  snprintf(filename, sizeof(filename), "out_lyr%d.yuv", 0);
+  if (!(outfile[0] = fopen(filename, "wb")))
+    die("Failed top open output for writing.");
+
+  // open any enhancement layer output yuv files
+  for (i = 1; i < si.number_spatial_layers; i++) {
+    snprintf(filename, sizeof(filename), "out_lyr%d.yuv", i);
+    if (!(outfile[i] = fopen(filename, "wb")))
+      die("Failed to open output for writing.");
+  }
+
+  while (!obudec_read_temporal_unit(&obu_ctx, &buf, &bytes_in_buffer,
+                                    &buffer_size)) {
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img = NULL;
+    if (aom_codec_decode(&codec, buf, bytes_in_buffer, NULL))
+      die_codec(&codec, "Failed to decode frame.");
+
+    while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+      aom_image_t *img_shifted =
+          aom_img_alloc(NULL, AOM_IMG_FMT_I420, img->d_w, img->d_h, 16);
+      img_shifted->bit_depth = 8;
+      aom_img_downshift(img_shifted, img,
+                        img->bit_depth - img_shifted->bit_depth);
+      if (img->spatial_id == 0) {
+        printf("Writing        base layer 0 %d\n", frame_cnt);
+        aom_img_write(img_shifted, outfile[0]);
+      } else if (img->spatial_id <= (int)(si.number_spatial_layers - 1)) {
+        printf("Writing enhancement layer %d %d\n", img->spatial_id, frame_cnt);
+        aom_img_write(img_shifted, outfile[img->spatial_id]);
+      } else {
+        die_codec(&codec, "Invalid bitstream. Layer id exceeds layer count");
+      }
+      if (img->spatial_id == (int)(si.number_spatial_layers - 1)) ++frame_cnt;
+    }
+  }
+
+  printf("Processed %d frames.\n", frame_cnt);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+
+  for (i = 0; i < si.number_spatial_layers; i++) fclose(outfile[i]);
+
+  fclose(inputfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/scalable_encoder.c b/third_party/aom/examples/scalable_encoder.c
new file mode 100644
index 000000000..7af03e29f
--- /dev/null
+++ b/third_party/aom/examples/scalable_encoder.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Scalable Encoder
+// ==============
+//
+// This is an example of a scalable encoder loop. It takes two input files in
+// YV12 format, passes it through the encoder, and writes the compressed
+// frames to disk in OBU format.
+//
+// Getting The Default Configuration
+// ---------------------------------
+// Encoders have the notion of "usage profiles." For example, an encoder
+// may want to publish default configurations for both a video
+// conferencing application and a best quality offline encoder. These
+// obviously have very different default settings. Consult the
+// documentation for your codec to see if it provides any default
+// configurations. All codecs provide a default configuration, number 0,
+// which is valid for material in the vacinity of QCIF/QVGA.
+//
+// Updating The Configuration
+// ---------------------------------
+// Almost all applications will want to update the default configuration
+// with settings specific to their usage. Here we set the width and height
+// of the video file to that specified on the command line. We also scale
+// the default bitrate based on the ratio between the default resolution
+// and the resolution specified on the command line.
+//
+// Encoding A Frame
+// ----------------
+// The frame is read as a continuous block (size = width * height * 3 / 2)
+// from the input file. If a frame was read (the input file has not hit
+// EOF) then the frame is passed to the encoder. Otherwise, a NULL
+// is passed, indicating the End-Of-Stream condition to the encoder. The
+// `frame_cnt` is reused as the presentation time stamp (PTS) and each
+// frame is shown for one frame-time in duration. The flags parameter is
+// unused in this example.
+
+// Forced Keyframes
+// ----------------
+// Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the
+// flags passed to `aom_codec_control()`. In this example, we force a
+// keyframe every <keyframe-interval> frames. Note, the output stream can
+// contain additional keyframes beyond those that have been forced using the
+// AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the
+// encoder.
+//
+// Processing The Encoded Data
+// ---------------------------
+// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data
+// for this frame. We write a IVF frame header, followed by the raw data.
+//
+// Cleanup
+// -------
+// The `aom_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exeptions, aom_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s <codec> <width> <height> <infile0> <infile1> "
+          "<outfile> <frames to encode>\n"
+          "See comments in scalable_encoder.c for more information.\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+                        int frame_index, int flags, FILE *outfile) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res =
+      aom_codec_encode(codec, img, frame_index, 1, flags);
+  if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame");
+
+  while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+      if (fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile) !=
+          pkt->data.frame.sz) {
+        die_codec(codec, "Failed to write compressed frame");
+      }
+      printf(keyframe ? "K" : ".");
+      printf(" %6d\n", (int)pkt->data.frame.sz);
+      fflush(stdout);
+    }
+  }
+
+  return got_pkts;
+}
+
+int main(int argc, char **argv) {
+  FILE *infile0 = NULL;
+  FILE *infile1 = NULL;
+  aom_codec_ctx_t codec;
+  aom_codec_enc_cfg_t cfg;
+  int frame_count = 0;
+  aom_image_t raw0, raw1;
+  aom_codec_err_t res;
+  AvxVideoInfo info;
+  const AvxInterface *encoder = NULL;
+  const int fps = 30;
+  const int bitrate = 200;
+  int keyframe_interval = 0;
+  int max_frames = 0;
+  int frames_encoded = 0;
+  const char *codec_arg = NULL;
+  const char *width_arg = NULL;
+  const char *height_arg = NULL;
+  const char *infile0_arg = NULL;
+  const char *infile1_arg = NULL;
+  const char *outfile_arg = NULL;
+  //  const char *keyframe_interval_arg = NULL;
+  FILE *outfile = NULL;
+
+  exec_name = argv[0];
+
+  // Clear explicitly, as simply assigning "{ 0 }" generates
+  // "missing-field-initializers" warning in some compilers.
+  memset(&info, 0, sizeof(info));
+
+  if (argc != 8) die("Invalid number of arguments");
+
+  codec_arg = argv[1];
+  width_arg = argv[2];
+  height_arg = argv[3];
+  infile0_arg = argv[4];
+  infile1_arg = argv[5];
+  outfile_arg = argv[6];
+  max_frames = (int)strtol(argv[7], NULL, 0);
+
+  encoder = get_aom_encoder_by_name(codec_arg);
+  if (!encoder) die("Unsupported codec.");
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = (int)strtol(width_arg, NULL, 0);
+  info.frame_height = (int)strtol(height_arg, NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 || info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!aom_img_alloc(&raw0, AOM_IMG_FMT_I420, info.frame_width,
+                     info.frame_height, 1)) {
+    die("Failed to allocate image for layer 0.");
+  }
+  if (!aom_img_alloc(&raw1, AOM_IMG_FMT_I420, info.frame_width,
+                     info.frame_height, 1)) {
+    die("Failed to allocate image for layer 1.");
+  }
+
+  //  keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0);
+  keyframe_interval = 100;
+  if (keyframe_interval < 0) die("Invalid keyframe interval value.");
+
+  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res) die_codec(&codec, "Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_error_resilient = 0;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_end_usage = AOM_Q;
+  cfg.save_as_annexb = 0;
+
+  outfile = fopen(outfile_arg, "wb");
+  if (!outfile) die("Failed to open %s for writing.", outfile_arg);
+
+  if (!(infile0 = fopen(infile0_arg, "rb")))
+    die("Failed to open %s for reading.", infile0_arg);
+  if (!(infile1 = fopen(infile1_arg, "rb")))
+    die("Failed to open %s for reading.", infile0_arg);
+
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 8))
+    die_codec(&codec, "Failed to set cpu to 8");
+
+  if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 2))
+    die_codec(&codec, "Failed to set tile columns to 2");
+  if (aom_codec_control(&codec, AV1E_SET_NUM_TG, 3))
+    die_codec(&codec, "Failed to set num of tile groups to 3");
+
+  if (aom_codec_control(&codec, AOME_SET_NUMBER_SPATIAL_LAYERS, 2))
+    die_codec(&codec, "Failed to set number of spatial layers to 2");
+
+  // Encode frames.
+  while (aom_img_read(&raw0, infile0)) {
+    int flags = 0;
+
+    // configure and encode base layer
+
+    if (keyframe_interval > 0 && frames_encoded % keyframe_interval == 0)
+      flags |= AOM_EFLAG_FORCE_KF;
+    else
+      // use previous base layer (LAST) as sole reference
+      // save this frame as LAST to be used as reference by enhanmcent layer
+      // and next base layer
+      flags |= AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+               AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+               AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+               AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+               AOM_EFLAG_NO_UPD_ENTROPY;
+    cfg.g_w = info.frame_width;
+    cfg.g_h = info.frame_height;
+    if (aom_codec_enc_config_set(&codec, &cfg))
+      die_codec(&codec, "Failed to set enc cfg for layer 0");
+    if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 0))
+      die_codec(&codec, "Failed to set layer id to 0");
+    if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 62))
+      die_codec(&codec, "Failed to set cq level");
+    encode_frame(&codec, &raw0, frame_count++, flags, outfile);
+
+    // configure and encode enhancement layer
+
+    //  use LAST (base layer) as sole reference
+    flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+            AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+            AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_LAST |
+            AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+            AOM_EFLAG_NO_UPD_ENTROPY;
+    cfg.g_w = info.frame_width;
+    cfg.g_h = info.frame_height;
+    aom_img_read(&raw1, infile1);
+    if (aom_codec_enc_config_set(&codec, &cfg))
+      die_codec(&codec, "Failed to set enc cfg for layer 1");
+    if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 1))
+      die_codec(&codec, "Failed to set layer id to 1");
+    if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 10))
+      die_codec(&codec, "Failed to set cq level");
+    encode_frame(&codec, &raw1, frame_count++, flags, outfile);
+
+    frames_encoded++;
+
+    if (max_frames > 0 && frames_encoded >= max_frames) break;
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 0, outfile)) continue;
+
+  printf("\n");
+  fclose(infile0);
+  fclose(infile1);
+  printf("Processed %d frames.\n", frame_count / 2);
+
+  aom_img_free(&raw0);
+  aom_img_free(&raw1);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  fclose(outfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/set_maps.c b/third_party/aom/examples/set_maps.c
index 3a54e5f96..9aeb96e43 100644
--- a/third_party/aom/examples/set_maps.c
+++ b/third_party/aom/examples/set_maps.c
@@ -47,11 +47,10 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "aom/aomcx.h"
 #include "aom/aom_encoder.h"
-
-#include "../tools_common.h"
-#include "../video_writer.h"
+#include "aom/aomcx.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
 
 static const char *exec_name;
 
@@ -95,8 +94,7 @@ static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
   int got_pkts = 0;
   aom_codec_iter_t iter = NULL;
   const aom_codec_cx_pkt_t *pkt = NULL;
-  const aom_codec_err_t res =
-      aom_codec_encode(codec, img, frame_index, 1, 0, AOM_DL_GOOD_QUALITY);
+  const aom_codec_err_t res = aom_codec_encode(codec, img, frame_index, 1, 0);
   if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame");
 
   while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) {
diff --git a/third_party/aom/examples/simple_decoder.c b/third_party/aom/examples/simple_decoder.c
index 33a894539..d098d1e0b 100644
--- a/third_party/aom/examples/simple_decoder.c
+++ b/third_party/aom/examples/simple_decoder.c
@@ -49,9 +49,7 @@
 // `aom_codec_decode` function. The call takes a pointer to the data
 // (`frame`) and the length of the data (`frame_size`). No application data
 // is associated with the frame in this example, so the `user_priv`
-// parameter is NULL. The `deadline` parameter is left at zero for this
-// example. This parameter is generally only used when doing adaptive post
-// processing.
+// parameter is NULL.
 //
 // Codecs may produce a variable number of output frames for every call to
 // `aom_codec_decode`. These frames are retrieved by the
@@ -81,10 +79,8 @@
 #include <string.h>
 
 #include "aom/aom_decoder.h"
-
-#include "../tools_common.h"
-#include "../video_reader.h"
-#include "./aom_config.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
 
 static const char *exec_name;
 
@@ -127,7 +123,7 @@ int main(int argc, char **argv) {
     size_t frame_size = 0;
     const unsigned char *frame =
         aom_video_reader_get_frame(reader, &frame_size);
-    if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
       die_codec(&codec, "Failed to decode frame.");
 
     while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
diff --git a/third_party/aom/examples/simple_encoder.c b/third_party/aom/examples/simple_encoder.c
index 996f6dacf..01a37cf0c 100644
--- a/third_party/aom/examples/simple_encoder.c
+++ b/third_party/aom/examples/simple_encoder.c
@@ -100,9 +100,8 @@
 #include <string.h>
 
 #include "aom/aom_encoder.h"
-
-#include "../tools_common.h"
-#include "../video_writer.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
 
 static const char *exec_name;
 
@@ -121,7 +120,7 @@ static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
   aom_codec_iter_t iter = NULL;
   const aom_codec_cx_pkt_t *pkt = NULL;
   const aom_codec_err_t res =
-      aom_codec_encode(codec, img, frame_index, 1, flags, AOM_DL_GOOD_QUALITY);
+      aom_codec_encode(codec, img, frame_index, 1, flags);
   if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame");
 
   while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) {
diff --git a/third_party/aom/examples/twopass_encoder.c b/third_party/aom/examples/twopass_encoder.c
index e767bb5d7..1b134cce0 100644
--- a/third_party/aom/examples/twopass_encoder.c
+++ b/third_party/aom/examples/twopass_encoder.c
@@ -52,9 +52,8 @@
 #include <string.h>
 
 #include "aom/aom_encoder.h"
-
-#include "../tools_common.h"
-#include "../video_writer.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
 
 static const char *exec_name;
 
@@ -68,13 +67,12 @@ void usage_exit(void) {
 
 static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
                            aom_codec_pts_t pts, unsigned int duration,
-                           aom_enc_frame_flags_t flags, unsigned int deadline,
+                           aom_enc_frame_flags_t flags,
                            aom_fixed_buf_t *stats) {
   int got_pkts = 0;
   aom_codec_iter_t iter = NULL;
   const aom_codec_cx_pkt_t *pkt = NULL;
-  const aom_codec_err_t res =
-      aom_codec_encode(ctx, img, pts, duration, flags, deadline);
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
   if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats.");
 
   while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
@@ -94,13 +92,11 @@ static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
 
 static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
                         aom_codec_pts_t pts, unsigned int duration,
-                        aom_enc_frame_flags_t flags, unsigned int deadline,
-                        AvxVideoWriter *writer) {
+                        aom_enc_frame_flags_t flags, AvxVideoWriter *writer) {
   int got_pkts = 0;
   aom_codec_iter_t iter = NULL;
   const aom_codec_cx_pkt_t *pkt = NULL;
-  const aom_codec_err_t res =
-      aom_codec_encode(ctx, img, pts, duration, flags, deadline);
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
   if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame.");
 
   while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
@@ -133,13 +129,11 @@ static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
   // Calculate frame statistics.
   while (aom_img_read(raw, infile) && frame_count < limit) {
     ++frame_count;
-    get_frame_stats(&codec, raw, frame_count, 1, 0, AOM_DL_GOOD_QUALITY,
-                    &stats);
+    get_frame_stats(&codec, raw, frame_count, 1, 0, &stats);
   }
 
   // Flush encoder.
-  while (get_frame_stats(&codec, NULL, frame_count, 1, 0, AOM_DL_GOOD_QUALITY,
-                         &stats)) {
+  while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) {
   }
 
   printf("Pass 0 complete. Processed %d frames.\n", frame_count);
@@ -168,11 +162,11 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
   // Encode frames.
   while (aom_img_read(raw, infile) && frame_count < limit) {
     ++frame_count;
-    encode_frame(&codec, raw, frame_count, 1, 0, AOM_DL_GOOD_QUALITY, writer);
+    encode_frame(&codec, raw, frame_count, 1, 0, writer);
   }
 
   // Flush encoder.
-  while (encode_frame(&codec, NULL, -1, 1, 0, AOM_DL_GOOD_QUALITY, writer)) {
+  while (encode_frame(&codec, NULL, -1, 1, 0, writer)) {
   }
 
   printf("\n");
diff --git a/third_party/aom/libs.mk b/third_party/aom/libs.mk
deleted file mode 100644
index d481c47cf..000000000
--- a/third_party/aom/libs.mk
+++ /dev/null
@@ -1,583 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-# ARM assembly files are written in RVCT-style. We use some make magic to
-# filter those files to allow GCC compilation
-ifeq ($(ARCH_ARM),yes)
-  ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.s,.asm)
-else
-  ASM:=.asm
-endif
-
-#
-# Rule to generate runtime cpu detection files
-#
-define rtcd_h_template
-$$(BUILD_PFX)$(1).h: $$(SRC_PATH_BARE)/$(2)
-	@echo "    [CREATE] $$@"
-	$$(qexec)$$(SRC_PATH_BARE)/build/make/rtcd.pl --arch=$$(TGT_ISA) \
-          --sym=$(1) \
-          --config=$$(CONFIG_DIR)$$(target)-$$(TOOLCHAIN).mk \
-          $$(RTCD_OPTIONS) $$^ > $$@
-CLEAN-OBJS += $$(BUILD_PFX)$(1).h
-RTCD += $$(BUILD_PFX)$(1).h
-endef
-
-CODEC_SRCS-yes += CHANGELOG
-CODEC_SRCS-yes += libs.mk
-
-include $(SRC_PATH_BARE)/aom/aom_codec.mk
-CODEC_SRCS-yes += $(addprefix aom/,$(call enabled,API_SRCS))
-CODEC_DOC_SRCS += $(addprefix aom/,$(call enabled,API_DOC_SRCS))
-
-include $(SRC_PATH_BARE)/aom_mem/aom_mem.mk
-CODEC_SRCS-yes += $(addprefix aom_mem/,$(call enabled,MEM_SRCS))
-
-include $(SRC_PATH_BARE)/aom_scale/aom_scale.mk
-CODEC_SRCS-yes += $(addprefix aom_scale/,$(call enabled,SCALE_SRCS))
-
-include $(SRC_PATH_BARE)/aom_ports/aom_ports.mk
-CODEC_SRCS-yes += $(addprefix aom_ports/,$(call enabled,PORTS_SRCS))
-
-include $(SRC_PATH_BARE)/aom_dsp/aom_dsp.mk
-CODEC_SRCS-yes += $(addprefix aom_dsp/,$(call enabled,DSP_SRCS))
-
-include $(SRC_PATH_BARE)/aom_util/aom_util.mk
-CODEC_SRCS-yes += $(addprefix aom_util/,$(call enabled,UTIL_SRCS))
-
-#  AV1 make file
-ifeq ($(CONFIG_AV1),yes)
-  AV1_PREFIX=av1/
-  include $(SRC_PATH_BARE)/$(AV1_PREFIX)av1_common.mk
-endif
-
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-  AV1_PREFIX=av1/
-  include $(SRC_PATH_BARE)/$(AV1_PREFIX)av1_cx.mk
-  CODEC_SRCS-yes += $(addprefix $(AV1_PREFIX),$(call enabled,AV1_CX_SRCS))
-  CODEC_EXPORTS-yes += $(addprefix $(AV1_PREFIX),$(AV1_CX_EXPORTS))
-  CODEC_SRCS-yes += $(AV1_PREFIX)av1_cx.mk aom/aom.h aom/aomcx.h
-  INSTALL-LIBS-yes += include/aom/aom.h include/aom/aomcx.h
-  INSTALL_MAPS += include/aom/% $(SRC_PATH_BARE)/$(AV1_PREFIX)/%
-  CODEC_DOC_SRCS += aom/aom.h aom/aomcx.h
-  CODEC_DOC_SECTIONS += av1 av1_encoder encoder
-endif
-
-ifeq ($(CONFIG_AV1_DECODER),yes)
-  AV1_PREFIX=av1/
-  include $(SRC_PATH_BARE)/$(AV1_PREFIX)av1_dx.mk
-  CODEC_SRCS-yes += $(addprefix $(AV1_PREFIX),$(call enabled,AV1_DX_SRCS))
-  CODEC_EXPORTS-yes += $(addprefix $(AV1_PREFIX),$(AV1_DX_EXPORTS))
-  CODEC_SRCS-yes += $(AV1_PREFIX)av1_dx.mk aom/aom.h aom/aomdx.h
-  INSTALL-LIBS-yes += include/aom/aom.h include/aom/aomdx.h
-  INSTALL_MAPS += include/aom/% $(SRC_PATH_BARE)/$(AV1_PREFIX)/%
-  CODEC_DOC_SRCS += aom/aom.h aom/aomdx.h
-  CODEC_DOC_SECTIONS += av1 av1_decoder decoder
-endif
-
-AV1_PREFIX=av1/
-$(BUILD_PFX)$(AV1_PREFIX)%.c.o: CFLAGS += -Wextra
-
-ifeq ($(CONFIG_MSVS),yes)
-CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),aommt,aommd)
-GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd)
-# This variable uses deferred expansion intentionally, since the results of
-# $(wildcard) may change during the course of the Make.
-VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d))))
-endif
-
-# The following pairs define a mapping of locations in the distribution
-# tree to locations in the source/build trees.
-INSTALL_MAPS += include/aom/% $(SRC_PATH_BARE)/aom/%
-INSTALL_MAPS += include/aom/% $(SRC_PATH_BARE)/aom_ports/%
-INSTALL_MAPS += $(LIBSUBDIR)/%     %
-INSTALL_MAPS += src/%     $(SRC_PATH_BARE)/%
-ifeq ($(CONFIG_MSVS),yes)
-INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Release/%)
-INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Debug/%)
-endif
-
-CODEC_SRCS-yes += build/make/version.sh
-CODEC_SRCS-yes += build/make/rtcd.pl
-CODEC_SRCS-yes += aom_ports/emmintrin_compat.h
-CODEC_SRCS-yes += aom_ports/mem_ops.h
-CODEC_SRCS-yes += aom_ports/mem_ops_aligned.h
-CODEC_SRCS-yes += aom_ports/aom_once.h
-CODEC_SRCS-yes += $(BUILD_PFX)aom_config.c
-INSTALL-SRCS-no += $(BUILD_PFX)aom_config.c
-ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
-INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
-endif
-CODEC_EXPORTS-yes += aom/exports_com
-CODEC_EXPORTS-$(CONFIG_AV1_ENCODER) += aom/exports_enc
-CODEC_EXPORTS-$(CONFIG_AV1_DECODER) += aom/exports_dec
-
-INSTALL-LIBS-yes += include/aom/aom_codec.h
-INSTALL-LIBS-yes += include/aom/aom_frame_buffer.h
-INSTALL-LIBS-yes += include/aom/aom_image.h
-INSTALL-LIBS-yes += include/aom/aom_integer.h
-INSTALL-LIBS-$(CONFIG_AV1_DECODER) += include/aom/aom_decoder.h
-INSTALL-LIBS-$(CONFIG_AV1_ENCODER) += include/aom/aom_encoder.h
-ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
-ifeq ($(CONFIG_MSVS),yes)
-INSTALL-LIBS-yes                  += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib)
-INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB)d.lib)
-INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/aom.dll)
-INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/aom.exp)
-endif
-else
-INSTALL-LIBS-$(CONFIG_STATIC) += $(LIBSUBDIR)/libaom.a
-INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libaom_g.a
-endif
-
-CODEC_SRCS=$(call enabled,CODEC_SRCS)
-INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(CODEC_SRCS)
-INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS)
-
-
-# Generate a list of all enabled sources, in particular for exporting to gyp
-# based build systems.
-libaom_srcs.txt:
-	@echo "    [CREATE] $@"
-	@echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
-CLEAN-OBJS += libaom_srcs.txt
-
-
-ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
-ifeq ($(CONFIG_MSVS),yes)
-
-aom.def: $(call enabled,CODEC_EXPORTS)
-	@echo "    [CREATE] $@"
-	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
-            --name=aom\
-            --out=$@ $^
-CLEAN-OBJS += aom.def
-
-# Assembly files that are included, but don't define symbols themselves.
-# Filtered out to avoid Visual Studio build warnings.
-ASM_INCLUDES := \
-    third_party/x86inc/x86inc.asm \
-    aom_config.asm \
-    aom_ports/x86_abi_support.asm \
-
-aom.$(VCPROJ_SFX): $(CODEC_SRCS) aom.def
-	@echo "    [CREATE] $@"
-	$(qexec)$(GEN_VCPROJ) \
-            $(if $(CONFIG_SHARED),--dll,--lib) \
-            --target=$(TOOLCHAIN) \
-            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --name=aom \
-            --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \
-            --module-def=aom.def \
-            --ver=$(CONFIG_VS_VERSION) \
-            --src-path-bare="$(SRC_PATH_BARE)" \
-            --out=$@ $(CFLAGS) \
-            $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \
-            --src-path-bare="$(SRC_PATH_BARE)" \
-
-PROJECTS-yes += aom.$(VCPROJ_SFX)
-
-aom.$(VCPROJ_SFX): aom_config.asm
-aom.$(VCPROJ_SFX): $(RTCD)
-
-endif
-else
-LIBAOM_OBJS=$(call objs,$(CODEC_SRCS))
-OBJS-yes += $(LIBAOM_OBJS)
-LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libaom.a $(BUILD_PFX)libaom_g.a
-$(BUILD_PFX)libaom_g.a: $(LIBAOM_OBJS)
-
-SO_VERSION_MAJOR := 0
-SO_VERSION_MINOR := 0
-SO_VERSION_PATCH := 0
-ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
-LIBAOM_SO               := libaom.$(SO_VERSION_MAJOR).dylib
-SHARED_LIB_SUF          := .dylib
-EXPORT_FILE             := libaom.syms
-LIBAOM_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
-                             libaom.dylib  )
-else
-ifeq ($(filter iphonesimulator%,$(TGT_OS)),$(TGT_OS))
-LIBAOM_SO               := libaom.$(SO_VERSION_MAJOR).dylib
-SHARED_LIB_SUF          := .dylib
-EXPORT_FILE             := libaom.syms
-LIBAOM_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, libaom.dylib)
-else
-ifeq ($(filter os2%,$(TGT_OS)),$(TGT_OS))
-LIBAOM_SO               := libaom$(SO_VERSION_MAJOR).dll
-SHARED_LIB_SUF          := _dll.a
-EXPORT_FILE             := libaom.def
-LIBAOM_SO_SYMLINKS      :=
-LIBAOM_SO_IMPLIB        := libaom_dll.a
-else
-LIBAOM_SO               := libaom.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH)
-SHARED_LIB_SUF          := .so
-EXPORT_FILE             := libaom.ver
-LIBAOM_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
-                             libaom.so libaom.so.$(SO_VERSION_MAJOR) \
-                             libaom.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
-endif
-endif
-endif
-
-LIBS-$(CONFIG_SHARED) += $(BUILD_PFX)$(LIBAOM_SO)\
-                           $(notdir $(LIBAOM_SO_SYMLINKS)) \
-                           $(if $(LIBAOM_SO_IMPLIB), $(BUILD_PFX)$(LIBAOM_SO_IMPLIB))
-$(BUILD_PFX)$(LIBAOM_SO): $(LIBAOM_OBJS) $(EXPORT_FILE)
-$(BUILD_PFX)$(LIBAOM_SO): extralibs += -lm
-$(BUILD_PFX)$(LIBAOM_SO): SONAME = libaom.so.$(SO_VERSION_MAJOR)
-$(BUILD_PFX)$(LIBAOM_SO): EXPORTS_FILE = $(EXPORT_FILE)
-
-libaom.ver: $(call enabled,CODEC_EXPORTS)
-	@echo "    [CREATE] $@"
-	$(qexec)echo "{ global:" > $@
-	$(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done
-	$(qexec)echo "local: *; };" >> $@
-CLEAN-OBJS += libaom.ver
-
-libaom.syms: $(call enabled,CODEC_EXPORTS)
-	@echo "    [CREATE] $@"
-	$(qexec)awk '{print "_"$$2}' $^ >$@
-CLEAN-OBJS += libaom.syms
-
-libaom.def: $(call enabled,CODEC_EXPORTS)
-	@echo "    [CREATE] $@"
-	$(qexec)echo LIBRARY $(LIBAOM_SO:.dll=) INITINSTANCE TERMINSTANCE > $@
-	$(qexec)echo "DATA MULTIPLE NONSHARED" >> $@
-	$(qexec)echo "EXPORTS" >> $@
-	$(qexec)awk '{print "_"$$2}' $^ >>$@
-CLEAN-OBJS += libaom.def
-
-libaom_dll.a: $(LIBAOM_SO)
-	@echo "    [IMPLIB] $@"
-	$(qexec)emximp -o $@ $<
-CLEAN-OBJS += libaom_dll.a
-
-define libaom_symlink_template
-$(1): $(2)
-	@echo "    [LN]     $(2) $$@"
-	$(qexec)mkdir -p $$(dir $$@)
-	$(qexec)ln -sf $(2) $$@
-endef
-
-$(eval $(call libaom_symlink_template,\
-    $(addprefix $(BUILD_PFX),$(notdir $(LIBAOM_SO_SYMLINKS))),\
-    $(BUILD_PFX)$(LIBAOM_SO)))
-$(eval $(call libaom_symlink_template,\
-    $(addprefix $(DIST_DIR)/,$(LIBAOM_SO_SYMLINKS)),\
-    $(LIBAOM_SO)))
-
-
-INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBAOM_SO_SYMLINKS)
-INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBSUBDIR)/$(LIBAOM_SO)
-INSTALL-LIBS-$(CONFIG_SHARED) += $(if $(LIBAOM_SO_IMPLIB),$(LIBSUBDIR)/$(LIBAOM_SO_IMPLIB))
-
-
-LIBS-yes += aom.pc
-aom.pc: config.mk libs.mk
-	@echo "    [CREATE] $@"
-	$(qexec)echo '# pkg-config file from libaom $(VERSION_STRING)' > $@
-	$(qexec)echo 'prefix=$(PREFIX)' >> $@
-	$(qexec)echo 'exec_prefix=$${prefix}' >> $@
-	$(qexec)echo 'libdir=$${prefix}/$(LIBSUBDIR)' >> $@
-	$(qexec)echo 'includedir=$${prefix}/include' >> $@
-	$(qexec)echo '' >> $@
-	$(qexec)echo 'Name: aom' >> $@
-	$(qexec)echo 'Description: WebM Project AVx codec implementation' >> $@
-	$(qexec)echo 'Version: $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)' >> $@
-	$(qexec)echo 'Requires:' >> $@
-	$(qexec)echo 'Conflicts:' >> $@
-	$(qexec)echo 'Libs: -L$${libdir} -laom -lm' >> $@
-ifeq ($(HAVE_PTHREAD_H),yes)
-	$(qexec)echo 'Libs.private: -lm -lpthread' >> $@
-else
-	$(qexec)echo 'Libs.private: -lm' >> $@
-endif
-	$(qexec)echo 'Cflags: -I$${includedir}' >> $@
-INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/aom.pc
-INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
-CLEAN-OBJS += aom.pc
-endif
-
-#
-# Rule to make assembler configuration file from C configuration file
-#
-ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
-# YASM
-$(BUILD_PFX)aom_config.asm: $(BUILD_PFX)aom_config.h
-	@echo "    [CREATE] $@"
-	@egrep "#define [A-Z0-9_]+ [01]" $< \
-	    | awk '{print $$2 " equ " $$3}' > $@
-else
-ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION))
-$(BUILD_PFX)aom_config.asm: $(BUILD_PFX)aom_config.h
-	@echo "    [CREATE] $@"
-	@egrep "#define [A-Z0-9_]+ [01]" $< \
-	    | awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@
-	@echo "        END" $(ADS2GAS) >> $@
-CLEAN-OBJS += $(BUILD_PFX)aom_config.asm
-endif
-
-#
-# Add assembler dependencies for configuration.
-#
-$(filter %.s.o,$(OBJS-yes)):     $(BUILD_PFX)aom_config.asm
-$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)aom_config.asm
-
-
-$(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)aom_version.h)
-CLEAN-OBJS += $(BUILD_PFX)aom_version.h
-
-#
-# Add include path for libwebm sources.
-#
-ifeq ($(CONFIG_WEBM_IO),yes)
-  CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/libwebm
-endif
-
-##
-## libaom test directives
-##
-ifeq ($(CONFIG_UNIT_TESTS),yes)
-LIBAOM_TEST_DATA_PATH ?= .
-
-include $(SRC_PATH_BARE)/test/test.mk
-LIBAOM_TEST_SRCS=$(addprefix test/,$(call enabled,LIBAOM_TEST_SRCS))
-LIBAOM_TEST_BIN=./test_libaom$(EXE_SFX)
-LIBAOM_TEST_DATA=$(addprefix $(LIBAOM_TEST_DATA_PATH)/,\
-                     $(call enabled,LIBAOM_TEST_DATA))
-libaom_test_data_url=https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/$(1)
-
-TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX)
-TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
-TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS)))
-
-libaom_test_srcs.txt:
-	@echo "    [CREATE] $@"
-	@echo $(LIBAOM_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
-CLEAN-OBJS += libaom_test_srcs.txt
-
-$(LIBAOM_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
-	@echo "    [DOWNLOAD] $@"
-	$(qexec)trap 'rm -f $@' INT TERM &&\
-            curl -L -o $@ $(call libaom_test_data_url,$(@F))
-
-testdata:: $(LIBAOM_TEST_DATA)
-	$(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
-          [ -x "$$(which shasum)" ] && sha1sum=shasum;\
-          [ -x "$$(which sha1)" ] && sha1sum=sha1;\
-          if [ -n "$${sha1sum}" ]; then\
-            set -e;\
-            echo "Checking test data:";\
-            for f in $(call enabled,LIBAOM_TEST_DATA); do\
-                grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
-                    (cd $(LIBAOM_TEST_DATA_PATH); $${sha1sum} -c);\
-            done; \
-        else\
-            echo "Skipping test data integrity check, sha1sum not found.";\
-        fi
-
-ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
-ifeq ($(CONFIG_MSVS),yes)
-gtest.$(VCPROJ_SFX): $(SRC_PATH_BARE)/third_party/googletest/src/googletest/src/gtest-all.cc
-	@echo "    [CREATE] $@"
-	$(qexec)$(GEN_VCPROJ) \
-            --lib \
-            --target=$(TOOLCHAIN) \
-            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --name=gtest \
-            --proj-guid=EC00E1EC-AF68-4D92-A255-181690D1C9B1 \
-            --ver=$(CONFIG_VS_VERSION) \
-            --src-path-bare="$(SRC_PATH_BARE)" \
-            -D_VARIADIC_MAX=10 \
-            --out=gtest.$(VCPROJ_SFX) \
-	    $(SRC_PATH_BARE)/third_party/googletest/src/googletest/src/gtest-all.cc \
-            -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/googletest/include" \
-	    -I"$(SRC_PATH_BARE)/third_party/googletest/src/googletest"
-
-PROJECTS-$(CONFIG_MSVS) += gtest.$(VCPROJ_SFX)
-
-test_libaom.$(VCPROJ_SFX): $(LIBAOM_TEST_SRCS) aom.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
-	@echo "    [CREATE] $@"
-	$(qexec)$(GEN_VCPROJ) \
-            --exe \
-            --target=$(TOOLCHAIN) \
-            --name=test_libaom \
-            -D_VARIADIC_MAX=10 \
-            --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
-            --ver=$(CONFIG_VS_VERSION) \
-            --src-path-bare="$(SRC_PATH_BARE)" \
-            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
-            -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/googletest/include" \
-            $(if $(CONFIG_WEBM_IO),-I"$(SRC_PATH_BARE)/third_party/libwebm") \
-            -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
-
-PROJECTS-$(CONFIG_MSVS) += test_libaom.$(VCPROJ_SFX)
-
-LIBAOM_TEST_BIN := $(addprefix $(TGT_OS:win64=x64)/Release/,$(notdir $(LIBAOM_TEST_BIN)))
-
-ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),)
-PROJECTS-$(CONFIG_MSVS) += test_intra_pred_speed.$(VCPROJ_SFX)
-test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) aom.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
-	@echo "    [CREATE] $@"
-	$(qexec)$(GEN_VCPROJ) \
-            --exe \
-            --target=$(TOOLCHAIN) \
-            --name=test_intra_pred_speed \
-            -D_VARIADIC_MAX=10 \
-            --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
-            --ver=$(CONFIG_VS_VERSION) \
-            --src-path-bare="$(SRC_PATH_BARE)" \
-            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
-            -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/googletest/include" \
-            -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
-endif  # TEST_INTRA_PRED_SPEED
-endif
-else
-
-include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
-GTEST_SRCS := $(addprefix third_party/,$(call enabled,GTEST_SRCS))
-GTEST_OBJS=$(call objs,$(GTEST_SRCS))
-ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS))
-# Disabling pthreads globally will cause issues on darwin and possibly elsewhere
-$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0
-endif
-GTEST_INCLUDES := -I$(SRC_PATH_BARE)/third_party/googletest/src/googletest
-GTEST_INCLUDES += -I$(SRC_PATH_BARE)/third_party/googletest/src/googletest/include
-$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
-OBJS-yes += $(GTEST_OBJS)
-LIBS-yes += $(BUILD_PFX)libgtest.a $(BUILD_PFX)libgtest_g.a
-$(BUILD_PFX)libgtest_g.a: $(GTEST_OBJS)
-
-LIBAOM_TEST_OBJS=$(sort $(call objs,$(LIBAOM_TEST_SRCS)))
-$(LIBAOM_TEST_OBJS) $(LIBAOM_TEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
-OBJS-yes += $(LIBAOM_TEST_OBJS)
-BINS-yes += $(LIBAOM_TEST_BIN)
-
-CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),aom_g,aom)
-CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a)
-TEST_LIBS := lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a
-$(LIBAOM_TEST_BIN): $(TEST_LIBS)
-$(eval $(call linkerxx_template,$(LIBAOM_TEST_BIN), \
-              $(LIBAOM_TEST_OBJS) \
-              -L. -laom -lgtest $(extralibs) -lm))
-
-ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),)
-$(TEST_INTRA_PRED_SPEED_OBJS) $(TEST_INTRA_PRED_SPEED_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
-OBJS-yes += $(TEST_INTRA_PRED_SPEED_OBJS)
-BINS-yes += $(TEST_INTRA_PRED_SPEED_BIN)
-
-$(TEST_INTRA_PRED_SPEED_BIN): $(TEST_LIBS)
-$(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \
-              $(TEST_INTRA_PRED_SPEED_OBJS) \
-              -L. -laom -lgtest $(extralibs) -lm))
-endif  # TEST_INTRA_PRED_SPEED
-
-endif  # CONFIG_UNIT_TESTS
-
-# Install test sources only if codec source is included
-INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\
-    $(shell find $(SRC_PATH_BARE)/third_party/googletest -type f))
-INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(LIBAOM_TEST_SRCS)
-INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS)
-
-define test_shard_template
-test:: test_shard.$(1)
-test-no-data-check:: test_shard_ndc.$(1)
-test_shard.$(1) test_shard_ndc.$(1): $(LIBAOM_TEST_BIN)
-	@set -e; \
-	 export GTEST_SHARD_INDEX=$(1); \
-	 export GTEST_TOTAL_SHARDS=$(2); \
-	 $(LIBAOM_TEST_BIN)
-test_shard.$(1): testdata
-.PHONY: test_shard.$(1)
-endef
-
-NUM_SHARDS := 10
-SHARDS := 0 1 2 3 4 5 6 7 8 9
-$(foreach s,$(SHARDS),$(eval $(call test_shard_template,$(s),$(NUM_SHARDS))))
-
-endif
-
-##
-## documentation directives
-##
-CLEAN-OBJS += libs.doxy
-DOCS-yes += libs.doxy
-libs.doxy: $(CODEC_DOC_SRCS)
-	@echo "    [CREATE] $@"
-	@rm -f $@
-	@echo "INPUT += $^" >> $@
-	@echo "INCLUDE_PATH += ." >> $@;
-	@echo "ENABLED_SECTIONS += $(sort $(CODEC_DOC_SECTIONS))" >> $@
-
-## Generate rtcd.h for all objects
-ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes)
-$(OBJS-yes:.o=.d): $(RTCD)
-else
-$(OBJS-yes): $(RTCD)
-endif
-
-## Update the global src list
-SRCS += $(CODEC_SRCS) $(LIBAOM_TEST_SRCS) $(GTEST_SRCS)
-
-##
-## aomdec/aomenc tests.
-##
-ifeq ($(CONFIG_UNIT_TESTS),yes)
-TEST_BIN_PATH = .
-ifeq ($(CONFIG_MSVS),yes)
-# MSVC will build both Debug and Release configurations of tools in a
-# sub directory named for the current target. Assume the user wants to
-# run the Release tools, and assign TEST_BIN_PATH accordingly.
-# TODO(tomfinegan): Is this adequate for ARM?
-# TODO(tomfinegan): Support running the debug versions of tools?
-TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH))
-endif
-utiltest utiltest-no-data-check:
-	$(qexec)$(SRC_PATH_BARE)/test/aomdec.sh \
-		--test-data-path $(LIBAOM_TEST_DATA_PATH) \
-		--bin-path $(TEST_BIN_PATH)
-	$(qexec)$(SRC_PATH_BARE)/test/aomenc.sh \
-		--test-data-path $(LIBAOM_TEST_DATA_PATH) \
-		--bin-path $(TEST_BIN_PATH)
-utiltest: testdata
-else
-utiltest utiltest-no-data-check:
-	@echo Unit tests must be enabled to make the utiltest target.
-endif
-
-##
-## Example tests.
-##
-ifeq ($(CONFIG_UNIT_TESTS),yes)
-# All non-MSVC targets output example targets in a sub dir named examples.
-EXAMPLES_BIN_PATH = examples
-ifeq ($(CONFIG_MSVS),yes)
-# MSVC will build both Debug and Release configurations of the examples in a
-# sub directory named for the current target. Assume the user wants to
-# run the Release tools, and assign EXAMPLES_BIN_PATH accordingly.
-# TODO(tomfinegan): Is this adequate for ARM?
-# TODO(tomfinegan): Support running the debug versions of tools?
-EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release
-endif
-exampletest exampletest-no-data-check: examples
-	$(qexec)$(SRC_PATH_BARE)/test/examples.sh \
-		--test-data-path $(LIBAOM_TEST_DATA_PATH) \
-		--bin-path $(EXAMPLES_BIN_PATH)
-exampletest: testdata
-else
-exampletest exampletest-no-data-check:
-	@echo Unit tests must be enabled to make the exampletest target.
-endif
diff --git a/third_party/aom/mainpage.dox b/third_party/aom/mainpage.dox
index 9a82f4360..03a299ae1 100644
--- a/third_party/aom/mainpage.dox
+++ b/third_party/aom/mainpage.dox
@@ -22,7 +22,7 @@
   \section main_startpoints Starting Points
   - Consult the \ref changelog for a complete list of improvements in this
     release.
-  - The \ref readme contains instructions on recompiling the sample applications.
+  - \ref readme contains instructions on compiling the sample applications.
   - Read the \ref usage "usage" for a narrative on codec usage.
   - Read the \ref samples "sample code" for examples of how to interact with the
     codec.
@@ -45,8 +45,8 @@
    \verbinclude CHANGELOG
 */
 
-/*!\page readme README
-   \verbinclude README
+/*!\page readme README.md
+   \include README.md
 */
 
 /*!\defgroup codecs Supported Codecs */
diff --git a/third_party/aom/solution.mk b/third_party/aom/solution.mk
deleted file mode 100644
index caa8bc17b..000000000
--- a/third_party/aom/solution.mk
+++ /dev/null
@@ -1,33 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-# libaom reverse dependencies (targets that depend on libaom)
-AOM_NONDEPS=$(addsuffix .$(VCPROJ_SFX),aom gtest)
-AOM_RDEPS=$(foreach vcp,\
-              $(filter-out $(AOM_NONDEPS),$^), --dep=$(vcp:.$(VCPROJ_SFX)=):aom)
-
-aom.sln: $(wildcard *.$(VCPROJ_SFX))
-	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
-            $(if $(filter aom.$(VCPROJ_SFX),$^),$(AOM_RDEPS)) \
-            --dep=test_libaom:gtest \
-            --ver=$(CONFIG_VS_VERSION)\
-            --out=$@ $^
-aom.sln.mk: aom.sln
-	@true
-
-PROJECTS-yes += aom.sln aom.sln.mk
--include aom.sln.mk
-
-# Always install this file, as it is an unconditional post-build rule.
-INSTALL_MAPS += src/%     $(SRC_PATH_BARE)/%
-INSTALL-SRCS-yes            += $(target).mk
diff --git a/third_party/aom/aomstats.c b/third_party/aom/stats/aomstats.c
index 0cfeea2f1..4a15adf02 100644
--- a/third_party/aom/aomstats.c
+++ b/third_party/aom/stats/aomstats.c
@@ -9,13 +9,13 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aomstats.h"
+#include "stats/aomstats.h"
 
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "./tools_common.h"
+#include "common/tools_common.h"
 
 int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
   int res;
diff --git a/third_party/aom/aomstats.h b/third_party/aom/stats/aomstats.h
index 643809344..643809344 100644
--- a/third_party/aom/aomstats.h
+++ b/third_party/aom/stats/aomstats.h
diff --git a/third_party/aom/rate_hist.c b/third_party/aom/stats/rate_hist.c
index ffc7b8997..71eb78b72 100644
--- a/third_party/aom/rate_hist.c
+++ b/third_party/aom/stats/rate_hist.c
@@ -9,14 +9,14 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "stats/rate_hist.h"
+
 #include <assert.h>
 #include <stdlib.h>
 #include <limits.h>
 #include <stdio.h>
 #include <math.h>
 
-#include "./rate_hist.h"
-
 #define RATE_BINS 100
 #define HIST_BAR_MAX 40
 
diff --git a/third_party/aom/rate_hist.h b/third_party/aom/stats/rate_hist.h
index e6aa149ae..e6aa149ae 100644
--- a/third_party/aom/rate_hist.h
+++ b/third_party/aom/stats/rate_hist.h
diff --git a/third_party/aom/test/accounting_test.cc b/third_party/aom/test/accounting_test.cc
index e8387d0dc..8b5c8af13 100644
--- a/third_party/aom/test/accounting_test.cc
+++ b/third_party/aom/test/accounting_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <math.h>
 #include <stdlib.h>
@@ -35,10 +35,7 @@ TEST(AV1, TestAccounting) {
   }
   aom_stop_encode(&bw);
   aom_reader br;
-#if CONFIG_ANS && ANS_MAX_SYMBOLS
-  br.window_size = 1 << 16;
-#endif
-  aom_reader_init(&br, bw_buffer, bw.pos, NULL, NULL);
+  aom_reader_init(&br, bw_buffer, bw.pos);
 
   Accounting accounting;
   aom_accounting_init(&accounting);
@@ -54,7 +51,7 @@ TEST(AV1, TestAccounting) {
   GTEST_ASSERT_EQ(accounting.syms.num_syms, 0);
 
   // Should record 2 * kSymbols accounting symbols.
-  aom_reader_init(&br, bw_buffer, bw.pos, NULL, NULL);
+  aom_reader_init(&br, bw_buffer, bw.pos);
   br.accounting = &accounting;
   for (int i = 0; i < kSymbols; i++) {
     aom_read(&br, 32, "A");
diff --git a/third_party/aom/test/acm_random.h b/third_party/aom/test/acm_random.h
index 4842345ff..023387061 100644
--- a/third_party/aom/test/acm_random.h
+++ b/third_party/aom/test/acm_random.h
@@ -36,6 +36,19 @@ class ACMRandom {
     return (value >> 15) & 0xffff;
   }
 
+  int16_t Rand15Signed(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    return (value >> 17) & 0xffff;
+  }
+
+  uint16_t Rand12(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    // There's a bit more entropy in the upper bits of this implementation.
+    return (value >> 19) & 0xfff;
+  }
+
   int16_t Rand9Signed(void) {
     // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
     const uint32_t value = random_.Generate(512);
diff --git a/third_party/aom/test/active_map_refresh_test.cc b/third_party/aom/test/active_map_refresh_test.cc
deleted file mode 100644
index 184692ca8..000000000
--- a/third_party/aom/test/active_map_refresh_test.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <algorithm>
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/util.h"
-#include "test/y4m_video_source.h"
-
-namespace {
-
-// Check if any pixel in a 16x16 macroblock varies between frames.
-int CheckMb(const aom_image_t &current, const aom_image_t &previous, int mb_r,
-            int mb_c) {
-  for (int plane = 0; plane < 3; plane++) {
-    int r = 16 * mb_r;
-    int c0 = 16 * mb_c;
-    int r_top = std::min(r + 16, static_cast<int>(current.d_h));
-    int c_top = std::min(c0 + 16, static_cast<int>(current.d_w));
-    r = std::max(r, 0);
-    c0 = std::max(c0, 0);
-    if (plane > 0 && current.x_chroma_shift) {
-      c_top = (c_top + 1) >> 1;
-      c0 >>= 1;
-    }
-    if (plane > 0 && current.y_chroma_shift) {
-      r_top = (r_top + 1) >> 1;
-      r >>= 1;
-    }
-    for (; r < r_top; ++r) {
-      for (int c = c0; c < c_top; ++c) {
-        if (current.planes[plane][current.stride[plane] * r + c] !=
-            previous.planes[plane][previous.stride[plane] * r + c])
-          return 1;
-      }
-    }
-  }
-  return 0;
-}
-
-void GenerateMap(int mb_rows, int mb_cols, const aom_image_t &current,
-                 const aom_image_t &previous, uint8_t *map) {
-  for (int mb_r = 0; mb_r < mb_rows; ++mb_r) {
-    for (int mb_c = 0; mb_c < mb_cols; ++mb_c) {
-      map[mb_r * mb_cols + mb_c] = CheckMb(current, previous, mb_r, mb_c);
-    }
-  }
-}
-
-const int kAqModeCyclicRefresh = 3;
-
-class ActiveMapRefreshTest
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
-      public ::libaom_test::EncoderTest {
- protected:
-  ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapRefreshTest() {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
-    cpu_used_ = GET_PARAM(2);
-  }
-
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
-    ::libaom_test::Y4mVideoSource *y4m_video =
-        static_cast<libaom_test::Y4mVideoSource *>(video);
-    if (video->frame() == 1) {
-      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
-      encoder->Control(AV1E_SET_AQ_MODE, kAqModeCyclicRefresh);
-    } else if (video->frame() >= 2 && video->img()) {
-      aom_image_t *current = video->img();
-      aom_image_t *previous = y4m_holder_->img();
-      ASSERT_TRUE(previous != NULL);
-      aom_active_map_t map = aom_active_map_t();
-      const int width = static_cast<int>(current->d_w);
-      const int height = static_cast<int>(current->d_h);
-      const int mb_width = (width + 15) / 16;
-      const int mb_height = (height + 15) / 16;
-      uint8_t *active_map = new uint8_t[mb_width * mb_height];
-      GenerateMap(mb_height, mb_width, *current, *previous, active_map);
-      map.cols = mb_width;
-      map.rows = mb_height;
-      map.active_map = active_map;
-      encoder->Control(AOME_SET_ACTIVEMAP, &map);
-      delete[] active_map;
-    }
-    if (video->img()) {
-      y4m_video->SwapBuffers(y4m_holder_);
-    }
-  }
-
-  int cpu_used_;
-  ::libaom_test::Y4mVideoSource *y4m_holder_;
-};
-
-TEST_P(ActiveMapRefreshTest, Test) {
-  cfg_.g_lag_in_frames = 0;
-  cfg_.g_profile = 1;
-  cfg_.rc_target_bitrate = 600;
-  cfg_.rc_resize_mode = 0;
-  cfg_.rc_min_quantizer = 8;
-  cfg_.rc_max_quantizer = 30;
-  cfg_.g_pass = AOM_RC_ONE_PASS;
-  cfg_.rc_end_usage = AOM_CBR;
-  cfg_.kf_max_dist = 90000;
-
-  ::libaom_test::Y4mVideoSource video("desktop_credits.y4m", 0, 10);
-  ::libaom_test::Y4mVideoSource video_holder("desktop_credits.y4m", 0, 10);
-  video_holder.Begin();
-  y4m_holder_ = &video_holder;
-
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-}
-
-AV1_INSTANTIATE_TEST_CASE(ActiveMapRefreshTest,
-                          ::testing::Values(::libaom_test::kRealTime),
-                          ::testing::Range(5, 6));
-}  // namespace
diff --git a/third_party/aom/test/active_map_test.cc b/third_party/aom/test/active_map_test.cc
index 318a8518b..a2b0546ed 100644
--- a/third_party/aom/test/active_map_test.cc
+++ b/third_party/aom/test/active_map_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <climits>
 #include <vector>
diff --git a/third_party/aom/test/android/Android.mk b/third_party/aom/test/android/Android.mk
deleted file mode 100644
index 74f9d7cba..000000000
--- a/third_party/aom/test/android/Android.mk
+++ /dev/null
@@ -1,58 +0,0 @@
-#
-# Copyright (c) 2016, Alliance for Open Media. All rights reserved
-#
-# This source code is subject to the terms of the BSD 2 Clause License and
-# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-# was not distributed with this source code in the LICENSE file, you can
-# obtain it at www.aomedia.org/license/software. If the Alliance for Open
-# Media Patent License 1.0 was not distributed with this source code in the
-# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-#
-# This make file builds aom_test app for android.
-# The test app itself runs on the command line through adb shell
-# The paths are really messed up as the libaom make file
-# expects to be made from a parent directory.
-CUR_WD := $(call my-dir)
-BINDINGS_DIR := $(CUR_WD)/../../..
-LOCAL_PATH := $(CUR_WD)/../../..
-
-#libwebm
-include $(CLEAR_VARS)
-include $(BINDINGS_DIR)/libaom/third_party/libwebm/Android.mk
-LOCAL_PATH := $(CUR_WD)/../../..
-
-#libaom
-include $(CLEAR_VARS)
-LOCAL_STATIC_LIBRARIES := libwebm
-include $(BINDINGS_DIR)/libaom/build/make/Android.mk
-LOCAL_PATH := $(CUR_WD)/../..
-
-#libgtest
-include $(CLEAR_VARS)
-LOCAL_ARM_MODE := arm
-LOCAL_CPP_EXTENSION := .cc
-LOCAL_MODULE := gtest
-LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/googletest/src
-LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/googletest/include
-LOCAL_SRC_FILES := ./third_party/googletest/src/googletest/src/gtest-all.cc
-include $(BUILD_STATIC_LIBRARY)
-
-#libaom_test
-include $(CLEAR_VARS)
-LOCAL_ARM_MODE := arm
-LOCAL_MODULE := libaom_test
-LOCAL_STATIC_LIBRARIES := gtest libwebm
-
-ifeq ($(ENABLE_SHARED),1)
-  LOCAL_SHARED_LIBRARIES := aom
-else
-  LOCAL_STATIC_LIBRARIES += aom
-endif
-
-include $(LOCAL_PATH)/test/test.mk
-LOCAL_C_INCLUDES := $(BINDINGS_DIR)
-FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBAOM_TEST_SRCS-yes)))
-LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
-# some test files depend on *_rtcd.h, ensure they're generated first.
-$(eval $(call rtcd_dep_template))
-include $(BUILD_EXECUTABLE)
diff --git a/third_party/aom/test/android/README b/third_party/aom/test/android/README
deleted file mode 100644
index 35c829738..000000000
--- a/third_party/aom/test/android/README
+++ /dev/null
@@ -1,32 +0,0 @@
-Android.mk will build aom unittests on android.
-1) Configure libaom from the parent directory:
-./libaom/configure --target=armv7-android-gcc --enable-external-build \
-  --enable-postproc --disable-install-srcs --enable-multi-res-encoding \
-  --enable-temporal-denoising --disable-unit-tests --disable-install-docs \
-  --disable-examples --disable-runtime-cpu-detect --sdk-path=$NDK
-
-2) From the parent directory, invoke ndk-build:
-NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libaom/test/android/Android.mk \
-  APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release \
-  APP_STL=gnustl_static
-
-Note: Both adb and ndk-build are available prebuilt at:
-  https://chromium.googlesource.com/android_tools
-
-3) Run get_files.py to download the test files:
-python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files \
-  -u http://downloads.webmproject.org/test_data/libaom
-
-4) Transfer files to device using adb. Ensure you have proper permissions for
-the target
-
-adb push /path/to/test_files /data/local/tmp
-adb push /path/to/built_libs /data/local/tmp
-
-NOTE: Built_libs defaults to parent_dir/libs/armeabi-v7a
-
-5) Run tests:
-adb shell
-(on device)
-cd /data/local/tmp
-LD_LIBRARY_PATH=. ./aom_test
diff --git a/third_party/aom/test/android/get_files.py b/third_party/aom/test/android/get_files.py
deleted file mode 100644
index bdae9a315..000000000
--- a/third_party/aom/test/android/get_files.py
+++ /dev/null
@@ -1,120 +0,0 @@
-#
-# Copyright (c) 2016, Alliance for Open Media. All rights reserved
-#
-# This source code is subject to the terms of the BSD 2 Clause License and
-# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-# was not distributed with this source code in the LICENSE file, you can
-# obtain it at www.aomedia.org/license/software. If the Alliance for Open
-# Media Patent License 1.0 was not distributed with this source code in the
-# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-#
-# This simple script pulls test files from the webm homepage
-# It is intelligent enough to only pull files if
-#   1) File / test_data folder does not exist
-#   2) SHA mismatch
-
-import pycurl
-import csv
-import hashlib
-import re
-import os.path
-import time
-import itertools
-import sys
-import getopt
-
-#globals
-url = ''
-file_list_path = ''
-local_resource_path = ''
-
-# Helper functions:
-# A simple function which returns the sha hash of a file in hex
-def get_file_sha(filename):
-  try:
-    sha_hash = hashlib.sha1()
-    with open(filename, 'rb') as file:
-      buf = file.read(HASH_CHUNK)
-      while len(buf) > 0:
-        sha_hash.update(buf)
-        buf = file.read(HASH_CHUNK)
-      return sha_hash.hexdigest()
-  except IOError:
-    print "Error reading " + filename
-
-# Downloads a file from a url, and then checks the sha against the passed
-# in sha
-def download_and_check_sha(url, filename, sha):
-  path = os.path.join(local_resource_path, filename)
-  fp = open(path, "wb")
-  curl = pycurl.Curl()
-  curl.setopt(pycurl.URL, url + "/" + filename)
-  curl.setopt(pycurl.WRITEDATA, fp)
-  curl.perform()
-  curl.close()
-  fp.close()
-  return get_file_sha(path) == sha
-
-#constants
-ftp_retries = 3
-
-SHA_COL = 0
-NAME_COL = 1
-EXPECTED_COL = 2
-HASH_CHUNK = 65536
-
-# Main script
-try:
-  opts, args = \
-      getopt.getopt(sys.argv[1:], \
-                    "u:i:o:", ["url=", "input_csv=", "output_dir="])
-except:
-  print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
-  sys.exit(2)
-
-for opt, arg in opts:
-  if opt == '-u':
-    url = arg
-  elif opt in ("-i", "--input_csv"):
-    file_list_path = os.path.join(arg)
-  elif opt in ("-o", "--output_dir"):
-    local_resource_path = os.path.join(arg)
-
-if len(sys.argv) != 7:
-  print "Expects two paths and a url!"
-  exit(1)
-
-if not os.path.isdir(local_resource_path):
-  os.makedirs(local_resource_path)
-
-file_list_csv = open(file_list_path, "rb")
-
-# Our 'csv' file uses multiple spaces as a delimiter, python's
-# csv class only uses single character delimiters, so we convert them below
-file_list_reader = csv.reader((re.sub(' +', ' ', line) \
-    for line in file_list_csv), delimiter = ' ')
-
-file_shas = []
-file_names = []
-
-for row in file_list_reader:
-  if len(row) != EXPECTED_COL:
-      continue
-  file_shas.append(row[SHA_COL])
-  file_names.append(row[NAME_COL])
-
-file_list_csv.close()
-
-# Download files, only if they don't already exist and have correct shas
-for filename, sha in itertools.izip(file_names, file_shas):
-  path = os.path.join(local_resource_path, filename)
-  if os.path.isfile(path) \
-      and get_file_sha(path) == sha:
-    print path + ' exists, skipping'
-    continue
-  for retry in range(0, ftp_retries):
-    print "Downloading " + path
-    if not download_and_check_sha(url, filename, sha):
-      print "Sha does not match, retrying..."
-    else:
-      break
diff --git a/third_party/aom/test/android/scrape_gtest_log.py b/third_party/aom/test/android/scrape_gtest_log.py
deleted file mode 100644
index e0c929a5d..000000000
--- a/third_party/aom/test/android/scrape_gtest_log.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#
-# Copyright (c) 2016, Alliance for Open Media. All rights reserved
-#
-# This source code is subject to the terms of the BSD 2 Clause License and
-# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-# was not distributed with this source code in the LICENSE file, you can
-# obtain it at www.aomedia.org/license/software. If the Alliance for Open
-# Media Patent License 1.0 was not distributed with this source code in the
-# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-#
-
-"""Standalone script which parses a gtest log for json.
-
-Json is returned returns as an array.  This script is used by the libaom
-waterfall to gather json results mixed in with gtest logs.  This is
-dubious software engineering.
-"""
-
-import getopt
-import json
-import os
-import re
-import sys
-
-
-def main():
-  if len(sys.argv) != 3:
-    print "Expects a file to write json to!"
-    exit(1)
-
-  try:
-    opts, _ = \
-        getopt.getopt(sys.argv[1:], \
-                      'o:', ['output-json='])
-  except getopt.GetOptError:
-    print 'scrape_gtest_log.py -o <output_json>'
-    sys.exit(2)
-
-  output_json = ''
-  for opt, arg in opts:
-    if opt in ('-o', '--output-json'):
-      output_json = os.path.join(arg)
-
-  blob = sys.stdin.read()
-  json_string = '[' + ','.join('{' + x + '}' for x in
-                               re.findall(r'{([^}]*.?)}', blob)) + ']'
-  print blob
-
-  output = json.dumps(json.loads(json_string), indent=4, sort_keys=True)
-  print output
-
-  path = os.path.dirname(output_json)
-  if path and not os.path.exists(path):
-    os.makedirs(path)
-
-  outfile = open(output_json, 'w')
-  outfile.write(output)
-
-if __name__ == '__main__':
-  sys.exit(main())
diff --git a/third_party/aom/test/ans_codec_test.cc b/third_party/aom/test/ans_codec_test.cc
deleted file mode 100644
index 59d352b2d..000000000
--- a/third_party/aom/test/ans_codec_test.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/util.h"
-#include "test/y4m_video_source.h"
-#include "aom_dsp/ans.h"
-#include "av1/av1_dx_iface.c"
-
-// A note on ANS_MAX_SYMBOLS == 0:
-// Fused gtest doesn't work with EXPECT_FATAL_FAILURE [1]. Just run with a
-// single iteration and don't try to check the window size if we are unwindowed.
-// [1] https://github.com/google/googletest/issues/356
-
-namespace {
-
-const char kTestVideoName[] = "niklas_1280_720_30.y4m";
-const int kTestVideoFrames = 10;
-
-class AnsCodecTest : public ::libaom_test::CodecTestWithParam<int>,
-                     public ::libaom_test::EncoderTest {
- protected:
-  AnsCodecTest()
-      : EncoderTest(GET_PARAM(0)), ans_window_size_log2_(GET_PARAM(1)) {}
-
-  virtual ~AnsCodecTest() {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(::libaom_test::kOnePassGood);
-    cfg_.g_lag_in_frames = 25;
-    cfg_.rc_end_usage = AOM_CQ;
-  }
-
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
-    if (video->frame() == 1) {
-#if ANS_MAX_SYMBOLS
-      encoder->Control(AV1E_SET_ANS_WINDOW_SIZE_LOG2, ans_window_size_log2_);
-#endif
-      // Try to push a high symbol count through the codec
-      encoder->Control(AOME_SET_CQ_LEVEL, 8);
-      encoder->Control(AOME_SET_CPUUSED, 2);
-      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
-      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
-      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
-      encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
-      encoder->Control(AV1E_SET_TILE_ROWS, 0);
-    }
-  }
-
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
-    aom_codec_ctx_t *const av1_decoder = decoder->GetDecoder();
-#if ANS_MAX_SYMBOLS
-    aom_codec_alg_priv_t *const priv =
-        reinterpret_cast<aom_codec_alg_priv_t *>(av1_decoder->priv);
-    FrameWorkerData *const worker_data =
-        reinterpret_cast<FrameWorkerData *>(priv->frame_workers[0].data1);
-    AV1_COMMON *const common = &worker_data->pbi->common;
-
-    EXPECT_EQ(ans_window_size_log2_, common->ans_window_size_log2);
-#endif
-
-    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
-    return AOM_CODEC_OK == res_dec;
-  }
-
- private:
-  int ans_window_size_log2_;
-};
-
-TEST_P(AnsCodecTest, BitstreamParms) {
-  testing::internal::scoped_ptr<libaom_test::VideoSource> video(
-      new libaom_test::Y4mVideoSource(kTestVideoName, 0, kTestVideoFrames));
-  ASSERT_TRUE(video.get() != NULL);
-
-  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-#if ANS_MAX_SYMBOLS
-AV1_INSTANTIATE_TEST_CASE(AnsCodecTest, ::testing::Range(8, 24));
-#else
-AV1_INSTANTIATE_TEST_CASE(AnsCodecTest, ::testing::Range(0, 1));
-#endif
-}  // namespace
diff --git a/third_party/aom/test/ans_test.cc b/third_party/aom/test/ans_test.cc
deleted file mode 100644
index fd460f409..000000000
--- a/third_party/aom/test/ans_test.cc
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-#include <ctime>
-#include <utility>
-#include <vector>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "test/acm_random.h"
-#include "aom_dsp/ansreader.h"
-#include "aom_dsp/buf_ans.h"
-
-namespace {
-typedef std::vector<std::pair<uint8_t, bool> > PvVec;
-
-const int kPrintStats = 0;
-// Use a small buffer size to exercise ANS window spills or buffer growth
-const int kBufAnsSize = 1 << 8;
-
-PvVec abs_encode_build_vals(int iters) {
-  PvVec ret;
-  libaom_test::ACMRandom gen(0x30317076);
-  double entropy = 0;
-  for (int i = 0; i < iters; ++i) {
-    uint8_t p;
-    do {
-      p = gen.Rand8();
-    } while (p == 0);  // zero is not a valid coding probability
-    bool b = gen.Rand8() < p;
-    ret.push_back(std::make_pair(static_cast<uint8_t>(p), b));
-    if (kPrintStats) {
-      double d = p / 256.;
-      entropy += -d * log2(d) - (1 - d) * log2(1 - d);
-    }
-  }
-  if (kPrintStats) printf("entropy %f\n", entropy);
-  return ret;
-}
-
-bool check_rabs(const PvVec &pv_vec, uint8_t *buf) {
-  BufAnsCoder a;
-  a.size = kBufAnsSize;
-  aom_buf_ans_alloc(&a, NULL);
-  buf_ans_write_init(&a, buf);
-
-  std::clock_t start = std::clock();
-  for (PvVec::const_iterator it = pv_vec.begin(); it != pv_vec.end(); ++it) {
-    buf_rabs_write(&a, it->second, 256 - it->first);
-  }
-  aom_buf_ans_flush(&a);
-  std::clock_t enc_time = std::clock() - start;
-  int offset = buf_ans_write_end(&a);
-  aom_buf_ans_free(&a);
-  bool okay = true;
-  AnsDecoder d;
-#if ANS_MAX_SYMBOLS
-  d.window_size = kBufAnsSize;
-#endif
-  if (ans_read_init(&d, buf, offset)) return false;
-  start = std::clock();
-  for (PvVec::const_iterator it = pv_vec.begin(); it != pv_vec.end(); ++it) {
-    okay = okay && (rabs_read(&d, 256 - it->first) != 0) == it->second;
-  }
-  std::clock_t dec_time = std::clock() - start;
-  if (!okay) return false;
-  if (kPrintStats)
-    printf("uABS size %d enc_time %f dec_time %f\n", offset,
-           static_cast<float>(enc_time) / CLOCKS_PER_SEC,
-           static_cast<float>(dec_time) / CLOCKS_PER_SEC);
-  return ans_read_end(&d) != 0;
-}
-
-const aom_cdf_prob spareto65[] = { 8320, 6018, 4402, 3254, 4259,
-                                   3919, 2057, 492,  45,   2 };
-
-const int kRansSymbols =
-    static_cast<int>(sizeof(spareto65) / sizeof(spareto65[0]));
-
-struct rans_sym {
-  aom_cdf_prob prob;
-  aom_cdf_prob cum_prob;  // not-inclusive
-};
-
-std::vector<int> ans_encode_build_vals(rans_sym *const tab, int iters) {
-  aom_cdf_prob sum = 0;
-  for (int i = 0; i < kRansSymbols; ++i) {
-    tab[i].cum_prob = sum;
-    tab[i].prob = spareto65[i];
-    sum += spareto65[i];
-  }
-  std::vector<int> p_to_sym;
-  for (int i = 0; i < kRansSymbols; ++i) {
-    p_to_sym.insert(p_to_sym.end(), tab[i].prob, i);
-  }
-  assert(p_to_sym.size() == RANS_PRECISION);
-  std::vector<int> ret;
-  libaom_test::ACMRandom gen(18543637);
-  for (int i = 0; i < iters; ++i) {
-    int sym =
-        p_to_sym[((gen.Rand8() << 8) + gen.Rand8()) & (RANS_PRECISION - 1)];
-    ret.push_back(sym);
-  }
-  return ret;
-}
-
-void rans_build_dec_tab(const struct rans_sym sym_tab[],
-                        aom_cdf_prob *dec_tab) {
-  unsigned int sum = 0;
-  for (int i = 0; sum < RANS_PRECISION; ++i) {
-    dec_tab[i] = sum += sym_tab[i].prob;
-  }
-}
-
-bool check_rans(const std::vector<int> &sym_vec, const rans_sym *const tab,
-                uint8_t *buf) {
-  BufAnsCoder a;
-  a.size = kBufAnsSize;
-  aom_buf_ans_alloc(&a, NULL);
-  buf_ans_write_init(&a, buf);
-  aom_cdf_prob dec_tab[kRansSymbols];
-  rans_build_dec_tab(tab, dec_tab);
-
-  std::clock_t start = std::clock();
-  for (std::vector<int>::const_iterator it = sym_vec.begin();
-       it != sym_vec.end(); ++it) {
-    buf_rans_write(&a, tab[*it].cum_prob, tab[*it].prob);
-  }
-  aom_buf_ans_flush(&a);
-  std::clock_t enc_time = std::clock() - start;
-  int offset = buf_ans_write_end(&a);
-  aom_buf_ans_free(&a);
-  bool okay = true;
-  AnsDecoder d;
-#if ANS_MAX_SYMBOLS
-  d.window_size = kBufAnsSize;
-#endif
-  if (ans_read_init(&d, buf, offset)) return false;
-  start = std::clock();
-  for (std::vector<int>::const_iterator it = sym_vec.begin();
-       it != sym_vec.end(); ++it) {
-    okay &= rans_read(&d, dec_tab) == *it;
-  }
-  std::clock_t dec_time = std::clock() - start;
-  if (!okay) return false;
-  if (kPrintStats)
-    printf("rANS size %d enc_time %f dec_time %f\n", offset,
-           static_cast<float>(enc_time) / CLOCKS_PER_SEC,
-           static_cast<float>(dec_time) / CLOCKS_PER_SEC);
-  return ans_read_end(&d) != 0;
-}
-
-class AbsTestFix : public ::testing::Test {
- protected:
-  static void SetUpTestCase() { pv_vec_ = abs_encode_build_vals(kNumBools); }
-  virtual void SetUp() { buf_ = new uint8_t[kNumBools / 8]; }
-  virtual void TearDown() { delete[] buf_; }
-  static const int kNumBools = 100000000;
-  static PvVec pv_vec_;
-  uint8_t *buf_;
-};
-PvVec AbsTestFix::pv_vec_;
-
-class AnsTestFix : public ::testing::Test {
- protected:
-  static void SetUpTestCase() {
-    sym_vec_ = ans_encode_build_vals(rans_sym_tab_, kNumSyms);
-  }
-  virtual void SetUp() { buf_ = new uint8_t[kNumSyms / 2]; }
-  virtual void TearDown() { delete[] buf_; }
-  static const int kNumSyms = 25000000;
-  static std::vector<int> sym_vec_;
-  static rans_sym rans_sym_tab_[kRansSymbols];
-  uint8_t *buf_;
-};
-std::vector<int> AnsTestFix::sym_vec_;
-rans_sym AnsTestFix::rans_sym_tab_[kRansSymbols];
-
-TEST_F(AbsTestFix, Rabs) { EXPECT_TRUE(check_rabs(pv_vec_, buf_)); }
-TEST_F(AnsTestFix, Rans) {
-  EXPECT_TRUE(check_rans(sym_vec_, rans_sym_tab_, buf_));
-}
-TEST(AnsTest, FinalStateSerialization) {
-  for (unsigned i = L_BASE; i < L_BASE * IO_BASE; ++i) {
-    uint8_t buf[8];
-    AnsCoder c;
-    ans_write_init(&c, buf);
-    c.state = i;
-    const int written_size = ans_write_end(&c);
-    ASSERT_LT(static_cast<size_t>(written_size), sizeof(buf));
-    AnsDecoder d;
-#if ANS_MAX_SYMBOLS
-    // There is no real data window here because no symbols are sent through
-    // ans (only synthetic states), so use a dummy value
-    d.window_size = 1024;
-#endif
-    const int read_init_status = ans_read_init(&d, buf, written_size);
-    EXPECT_EQ(read_init_status, 0);
-    EXPECT_EQ(d.state, i);
-  }
-}
-}  // namespace
diff --git a/third_party/aom/test/aom_integer_test.cc b/third_party/aom/test/aom_integer_test.cc
new file mode 100644
index 000000000..fe88a54e9
--- /dev/null
+++ b/third_party/aom/test/aom_integer_test.cc
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_integer.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+const uint64_t kMaximumLeb128CodedSize = 8;
+const uint8_t kLeb128PadByte = 0x80;  // Binary: 10000000
+const uint64_t kMaximumLeb128Value = UINT32_MAX;
+const uint32_t kSizeTestNumValues = 6;
+const uint32_t kSizeTestExpectedSizes[kSizeTestNumValues] = {
+  1, 1, 2, 3, 4, 5
+};
+const uint64_t kSizeTestInputs[kSizeTestNumValues] = {
+  0, 0x7f, 0x3fff, 0x1fffff, 0xffffff, 0x10000000
+};
+
+const uint8_t kOutOfRangeLeb128Value[5] = { 0x80, 0x80, 0x80, 0x80,
+                                            0x10 };  // UINT32_MAX + 1
+}  // namespace
+
+TEST(AomLeb128, DecodeTest) {
+  const size_t num_leb128_bytes = 3;
+  const uint8_t leb128_bytes[num_leb128_bytes] = { 0xE5, 0x8E, 0x26 };
+  const uint64_t expected_value = 0x98765;  // 624485
+  const size_t expected_length = 3;
+  uint64_t value = ~0ULL;  // make sure value is cleared by the function
+  size_t length;
+  ASSERT_EQ(
+      aom_uleb_decode(&leb128_bytes[0], num_leb128_bytes, &value, &length), 0);
+  ASSERT_EQ(expected_value, value);
+  ASSERT_EQ(expected_length, length);
+
+  // Make sure the decoder stops on the last marked LEB128 byte.
+  aom_uleb_decode(&leb128_bytes[0], num_leb128_bytes + 1, &value, &length);
+  ASSERT_EQ(expected_value, value);
+  ASSERT_EQ(expected_length, length);
+}
+
+TEST(AomLeb128, EncodeTest) {
+  const uint32_t test_value = 0x98765;  // 624485
+  const uint8_t expected_bytes[3] = { 0xE5, 0x8E, 0x26 };
+  const size_t kWriteBufferSize = 4;
+  uint8_t write_buffer[kWriteBufferSize] = { 0 };
+  size_t bytes_written = 0;
+  ASSERT_EQ(aom_uleb_encode(test_value, kWriteBufferSize, &write_buffer[0],
+                            &bytes_written),
+            0);
+  ASSERT_EQ(bytes_written, 3u);
+  for (size_t i = 0; i < bytes_written; ++i) {
+    ASSERT_EQ(write_buffer[i], expected_bytes[i]);
+  }
+}
+
+TEST(AomLeb128, EncodeDecodeTest) {
+  const uint32_t value = 0x98765;  // 624485
+  const size_t kWriteBufferSize = 4;
+  uint8_t write_buffer[kWriteBufferSize] = { 0 };
+  size_t bytes_written = 0;
+  ASSERT_EQ(aom_uleb_encode(value, kWriteBufferSize, &write_buffer[0],
+                            &bytes_written),
+            0);
+  ASSERT_EQ(bytes_written, 3u);
+  uint64_t decoded_value;
+  size_t decoded_length;
+  aom_uleb_decode(&write_buffer[0], bytes_written, &decoded_value,
+                  &decoded_length);
+  ASSERT_EQ(value, decoded_value);
+  ASSERT_EQ(bytes_written, decoded_length);
+}
+
+TEST(AomLeb128, FixedSizeEncodeTest) {
+  const uint32_t test_value = 0x123;
+  const uint8_t expected_bytes[4] = { 0xa3, 0x82, 0x80, 0x00 };
+  const size_t kWriteBufferSize = 4;
+  uint8_t write_buffer[kWriteBufferSize] = { 0 };
+  size_t bytes_written = 0;
+  ASSERT_EQ(0, aom_uleb_encode_fixed_size(test_value, kWriteBufferSize,
+                                          kWriteBufferSize, &write_buffer[0],
+                                          &bytes_written));
+  ASSERT_EQ(kWriteBufferSize, bytes_written);
+  for (size_t i = 0; i < bytes_written; ++i) {
+    ASSERT_EQ(write_buffer[i], expected_bytes[i]);
+  }
+}
+
+TEST(AomLeb128, FixedSizeEncodeDecodeTest) {
+  const uint32_t value = 0x1;
+  const size_t kWriteBufferSize = 4;
+  uint8_t write_buffer[kWriteBufferSize] = { 0 };
+  size_t bytes_written = 0;
+  ASSERT_EQ(
+      aom_uleb_encode_fixed_size(value, kWriteBufferSize, kWriteBufferSize,
+                                 &write_buffer[0], &bytes_written),
+      0);
+  ASSERT_EQ(bytes_written, 4u);
+  uint64_t decoded_value;
+  size_t decoded_length;
+  aom_uleb_decode(&write_buffer[0], bytes_written, &decoded_value,
+                  &decoded_length);
+  ASSERT_EQ(value, decoded_value);
+  ASSERT_EQ(bytes_written, decoded_length);
+}
+
+TEST(AomLeb128, SizeTest) {
+  for (size_t i = 0; i < kSizeTestNumValues; ++i) {
+    ASSERT_EQ(kSizeTestExpectedSizes[i],
+              aom_uleb_size_in_bytes(kSizeTestInputs[i]));
+  }
+}
+
+TEST(AomLeb128, DecodeFailTest) {
+  // Input buffer containing what would be a valid 9 byte LEB128 encoded
+  // unsigned integer.
+  const uint8_t kAllPadBytesBuffer[kMaximumLeb128CodedSize + 1] = {
+    kLeb128PadByte, kLeb128PadByte, kLeb128PadByte,
+    kLeb128PadByte, kLeb128PadByte, kLeb128PadByte,
+    kLeb128PadByte, kLeb128PadByte, 0
+  };
+  uint64_t decoded_value;
+
+  // Test that decode fails when result would be valid 9 byte integer.
+  ASSERT_EQ(aom_uleb_decode(&kAllPadBytesBuffer[0], kMaximumLeb128CodedSize + 1,
+                            &decoded_value, NULL),
+            -1);
+
+  // Test that encoded value missing terminator byte within available buffer
+  // range causes decode error.
+  ASSERT_EQ(aom_uleb_decode(&kAllPadBytesBuffer[0], kMaximumLeb128CodedSize,
+                            &decoded_value, NULL),
+            -1);
+
+  // Test that LEB128 input that decodes to a value larger than 32-bits fails.
+  size_t value_size = 0;
+  ASSERT_EQ(aom_uleb_decode(&kOutOfRangeLeb128Value[0],
+                            sizeof(kOutOfRangeLeb128Value), &decoded_value,
+                            &value_size),
+            -1);
+}
+
+TEST(AomLeb128, EncodeFailTest) {
+  const size_t kWriteBufferSize = 4;
+  const uint32_t kValidTestValue = 1;
+  uint8_t write_buffer[kWriteBufferSize] = { 0 };
+  size_t coded_size = 0;
+  ASSERT_EQ(
+      aom_uleb_encode(kValidTestValue, kWriteBufferSize, NULL, &coded_size),
+      -1);
+  ASSERT_EQ(aom_uleb_encode(kValidTestValue, kWriteBufferSize, &write_buffer[0],
+                            NULL),
+            -1);
+
+  const uint32_t kValueOutOfRangeForBuffer = 0xFFFFFFFF;
+  ASSERT_EQ(aom_uleb_encode(kValueOutOfRangeForBuffer, kWriteBufferSize,
+                            &write_buffer[0], &coded_size),
+            -1);
+
+  const uint64_t kValueOutOfRange = kMaximumLeb128Value + 1;
+  ASSERT_EQ(aom_uleb_encode(kValueOutOfRange, kWriteBufferSize,
+                            &write_buffer[0], &coded_size),
+            -1);
+
+  const size_t kPadSizeOutOfRange = 5;
+  ASSERT_EQ(aom_uleb_encode_fixed_size(kValidTestValue, kWriteBufferSize,
+                                       kPadSizeOutOfRange, &write_buffer[0],
+                                       &coded_size),
+            -1);
+}
diff --git a/third_party/aom/test/aomdec.sh b/third_party/aom/test/aomdec.sh
index 28901ed1b..5f54ae0af 100755
--- a/third_party/aom/test/aomdec.sh
+++ b/third_party/aom/test/aomdec.sh
@@ -17,10 +17,12 @@
 # Environment check: Make sure input is available.
 aomdec_verify_environment() {
   if [ "$(av1_encode_available)" != "yes" ] ; then
-    if [ ! -e "${AV1_WEBM_FILE}" ] || \
-      [ ! -e "${AV1_FPM_WEBM_FILE}" ] || \
-      [ ! -e "${AV1_LT_50_FRAMES_WEBM_FILE}" ] ; then
-      elog "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+    if [ ! -e "${AV1_IVF_FILE}" ] || \
+       [ ! -e "${AV1_OBU_ANNEXB_FILE}" ] || \
+       [ ! -e "${AV1_OBU_SEC5_FILE}" ] || \
+       [ ! -e "${AV1_WEBM_FILE}" ]; then
+      elog "Libaom test data must exist before running this test script when " \
+           " encoding is disabled. "
       return 1
     fi
   fi
@@ -38,10 +40,8 @@ aomdec_pipe() {
   local readonly input="$1"
   shift
   if [ ! -e "${input}" ]; then
-    local file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
-    encode_yuv_raw_input_av1 "${file}" --ivf
-  else
-    local file="${input}"
+    elog "Input file ($input) missing in aomdec_pipe()"
+    return 1
   fi
   cat "${file}" | aomdec - "$@" ${devnull}
 }
@@ -63,62 +63,85 @@ aomdec_can_decode_av1() {
   fi
 }
 
-aomdec_aom_ivf_pipe_input() {
+aomdec_av1_ivf() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
-    aomdec_pipe "${AOM_IVF_FILE}" --summary --noblit
+    local readonly file="${AV1_IVF_FILE}"
+    if [ ! -e "${file}" ]; then
+      encode_yuv_raw_input_av1 "${file}" --ivf
+    fi
+    aomdec "${AV1_IVF_FILE}" --summary --noblit
   fi
 }
 
-aomdec_av1_webm() {
-  if [ "$(aomdec_can_decode_av1)" = "yes" ] && \
-     [ "$(webm_io_available)" = "yes" ]; then
-    if [ ! -e "${AV1_WEBM_FILE}" ]; then
-      local file="${AOM_TEST_OUTPUT_DIR}/test_encode.webm"
-      encode_yuv_raw_input_av1 "${file}"
-    else
-      aomdec "${AV1_WEBM_FILE}" --summary --noblit
+aomdec_av1_ivf_error_resilient() {
+  if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+    local readonly file="av1.error-resilient.ivf"
+    if [ ! -e "${file}" ]; then
+      encode_yuv_raw_input_av1 "${file}" --ivf --error-resilient=1
     fi
+    aomdec "${file}" --summary --noblit
   fi
 }
 
-aomdec_av1_webm_frame_parallel() {
-  if [ "$(aomdec_can_decode_av1)" = "yes" ] && \
-     [ "$(webm_io_available)" = "yes" ]; then
-    local file
-    if [ ! -e "${AV1_WEBM_FILE}" ]; then
-      file="${AOM_TEST_OUTPUT_DIR}/test_encode.webm"
-      encode_yuv_raw_input_av1 "${file}" "--ivf --error-resilient=1 "
-    else
-      file="${AV1_FPM_WEBM_FILE}"
+aomdec_av1_ivf_multithread() {
+  if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+    local readonly file="${AV1_IVF_FILE}"
+    if [ ! -e "${file}" ]; then
+      encode_yuv_raw_input_av1 "${file}" --ivf
     fi
     for threads in 2 3 4 5 6 7 8; do
-      aomdec "${file}" --summary --noblit --threads=$threads \
-        --frame-parallel
+      aomdec "${file}" --summary --noblit --threads=$threads
     done
   fi
 }
 
-# TODO(vigneshv): Enable or remove this test and associated code.
-DISABLED_aomdec_av1_webm_less_than_50_frames() {
-  # ensure that reaching eof in webm_guess_framerate doesn't result in invalid
-  # frames in actual webm_read_frame calls.
+aomdec_aom_ivf_pipe_input() {
+  if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+    local readonly file="${AV1_IVF_FILE}"
+    if [ ! -e "${file}" ]; then
+      encode_yuv_raw_input_av1 "${file}" --ivf
+    fi
+    aomdec_pipe "${AV1_IVF_FILE}" --summary --noblit
+  fi
+}
+
+aomdec_av1_obu_annexb() {
+  if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+    local readonly file="${AV1_OBU_ANNEXB_FILE}"
+    if [ ! -e "${file}" ]; then
+      encode_yuv_raw_input_av1 "${file}" --obu --annexb=1
+    fi
+    aomdec "${file}" --summary --noblit --annexb
+  fi
+}
+
+aomdec_av1_obu_section5() {
+  if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+    local readonly file="${AV1_OBU_SEC5_FILE}"
+    if [ ! -e "${file}" ]; then
+      encode_yuv_raw_input_av1 "${file}" --obu
+    fi
+    aomdec "${file}" --summary --noblit
+  fi
+}
+
+aomdec_av1_webm() {
   if [ "$(aomdec_can_decode_av1)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly decoder="$(aom_tool_path aomdec)"
-    local readonly expected=10
-    local readonly num_frames=$(${AOM_TEST_PREFIX} "${decoder}" \
-      "${AV1_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \
-      | awk '/^[0-9]+ decoded frames/ { print $1 }')
-    if [ "$num_frames" -ne "$expected" ]; then
-      elog "Output frames ($num_frames) != expected ($expected)"
-      return 1
+    local readonly file="${AV1_WEBM_FILE}"
+    if [ ! -e "${file}" ]; then
+      encode_yuv_raw_input_av1 "${file}"
     fi
+    aomdec "${AV1_WEBM_FILE}" --summary --noblit
   fi
 }
 
-aomdec_tests="aomdec_av1_webm
-              aomdec_av1_webm_frame_parallel
+aomdec_tests="aomdec_av1_ivf
+              aomdec_av1_ivf_error_resilient
+              aomdec_av1_ivf_multithread
               aomdec_aom_ivf_pipe_input
-              DISABLED_aomdec_av1_webm_less_than_50_frames"
+              aomdec_av1_obu_annexb
+              aomdec_av1_obu_section5
+              aomdec_av1_webm"
 
 run_tests aomdec_verify_environment "${aomdec_tests}"
diff --git a/third_party/aom/test/aomenc.sh b/third_party/aom/test/aomenc.sh
index 57a4c28a5..a0ab8c8aa 100755
--- a/third_party/aom/test/aomenc.sh
+++ b/third_party/aom/test/aomenc.sh
@@ -15,8 +15,6 @@
 ##
 . $(dirname $0)/tools_common.sh
 
-readonly TEST_FRAMES=5
-
 # Environment check: Make sure input is available.
 aomenc_verify_environment() {
   if [ ! -e "${YUV_RAW_INPUT}" ]; then
@@ -57,32 +55,6 @@ y4m_input_720p() {
   echo ""${Y4M_720P_INPUT}""
 }
 
-# Echo default aomenc real time encoding params. $1 is the codec, which defaults
-# to av1 if unspecified.
-aomenc_rt_params() {
-  local readonly codec="${1:-av1}"
-  echo "--codec=${codec}
-    --buf-initial-sz=500
-    --buf-optimal-sz=600
-    --buf-sz=1000
-    --cpu-used=-6
-    --end-usage=cbr
-    --error-resilient=1
-    --kf-max-dist=90000
-    --lag-in-frames=0
-    --max-intra-rate=300
-    --max-q=56
-    --min-q=2
-    --noise-sensitivity=0
-    --overshoot-pct=50
-    --passes=1
-    --profile=0
-    --resize-allowed=0
-    --rt
-    --static-thresh=0
-    --undershoot-pct=50"
-}
-
 # Wrapper function for running aomenc with pipe input. Requires that
 # LIBAOM_BIN_PATH points to the directory containing aomenc. $1 is used as the
 # input file path and shifted away. All remaining parameters are passed through
@@ -110,10 +82,12 @@ aomenc() {
 
 aomenc_av1_ivf() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
-    local readonly output="${AOM_TEST_OUTPUT_DIR}/av1.ivf"
+    local output="${AV1_IVF_FILE}"
+    if [ -e "${AV1_IVF_FILE}" ]; then
+      output="${AOM_TEST_OUTPUT_DIR}/av1_test.ivf"
+    fi
     aomenc $(yuv_raw_input) \
-      --codec=av1 \
-      --limit="${TEST_FRAMES}" \
+      $(aomenc_encode_test_fast_params) \
       --ivf \
       --output="${output}"
 
@@ -124,13 +98,52 @@ aomenc_av1_ivf() {
   fi
 }
 
+aomenc_av1_obu_annexb() {
+   if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
+    local output="${AV1_OBU_ANNEXB_FILE}"
+    if [ -e "${AV1_OBU_ANNEXB_FILE}" ]; then
+      output="${AOM_TEST_OUTPUT_DIR}/av1_test.annexb.obu"
+    fi
+    aomenc $(yuv_raw_input) \
+      $(aomenc_encode_test_fast_params) \
+      --obu \
+      --annexb=1 \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+aomenc_av1_obu_section5() {
+   if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
+    local output="${AV1_OBU_SEC5_FILE}"
+    if [ -e "${AV1_OBU_SEC5_FILE}" ]; then
+      output="${AOM_TEST_OUTPUT_DIR}/av1_test.section5.obu"
+    fi
+    aomenc $(yuv_raw_input) \
+      $(aomenc_encode_test_fast_params) \
+      --obu \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
 aomenc_av1_webm() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${AOM_TEST_OUTPUT_DIR}/av1.webm"
+    local output="${AV1_WEBM_FILE}"
+    if [ -e "${AV1_WEBM_FILE}" ]; then
+      output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm"
+    fi
     aomenc $(yuv_raw_input) \
-      --codec=av1 \
-      --limit="${TEST_FRAMES}" \
+      $(aomenc_encode_test_fast_params) \
       --output="${output}"
 
     if [ ! -e "${output}" ]; then
@@ -140,15 +153,14 @@ aomenc_av1_webm() {
   fi
 }
 
-aomenc_av1_webm_2pass() {
+aomenc_av1_webm_1pass() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${AOM_TEST_OUTPUT_DIR}/av1.webm"
+    local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm"
     aomenc $(yuv_raw_input) \
-      --codec=av1 \
-      --limit="${TEST_FRAMES}" \
-      --output="${output}" \
-      --passes=2
+      $(aomenc_encode_test_fast_params) \
+      --passes=1 \
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -161,8 +173,7 @@ aomenc_av1_ivf_lossless() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
     local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_lossless.ivf"
     aomenc $(yuv_raw_input) \
-      --codec=av1 \
-      --limit="${TEST_FRAMES}" \
+      $(aomenc_encode_test_fast_params) \
       --ivf \
       --output="${output}" \
       --lossless=1
@@ -178,8 +189,7 @@ aomenc_av1_ivf_minq0_maxq0() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
     local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_lossless_minq0_maxq0.ivf"
     aomenc $(yuv_raw_input) \
-      --codec=av1 \
-      --limit="${TEST_FRAMES}" \
+      $(aomenc_encode_test_fast_params) \
       --ivf \
       --output="${output}" \
       --min-q=0 \
@@ -199,12 +209,10 @@ aomenc_av1_webm_lag5_frames10() {
     local readonly lag_frames=5
     local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_lag5_frames10.webm"
     aomenc $(yuv_raw_input) \
-      --codec=av1 \
-      --limit="${lag_total_frames}" \
-      --lag-in-frames="${lag_frames}" \
-      --output="${output}" \
-      --passes=2 \
-      --auto-alt-ref=1
+      $(aomenc_encode_test_fast_params) \
+      --limit=${lag_total_frames} \
+      --lag-in-frames=${lag_frames} \
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -219,8 +227,7 @@ aomenc_av1_webm_non_square_par() {
      [ "$(webm_io_available)" = "yes" ]; then
     local readonly output="${AOM_TEST_OUTPUT_DIR}/av1_non_square_par.webm"
     aomenc $(y4m_input_non_square_par) \
-      --codec=av1 \
-      --limit="${TEST_FRAMES}" \
+      $(aomenc_encode_test_fast_params) \
       --output="${output}"
 
     if [ ! -e "${output}" ]; then
@@ -230,12 +237,33 @@ aomenc_av1_webm_non_square_par() {
   fi
 }
 
+aomenc_av1_webm_cdf_update_mode() {
+  if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    for mode in 0 1 2; do
+      local readonly output="${AOM_TEST_OUTPUT_DIR}/cdf_mode_${mode}.webm"
+      aomenc $(yuv_raw_input) \
+        $(aomenc_encode_test_fast_params) \
+        --cdf-update-mode=${mode} \
+        --output="${output}"
+
+      if [ ! -e "${output}" ]; then
+        elog "Output file does not exist."
+        return 1
+      fi
+    done
+  fi
+}
+
 aomenc_tests="aomenc_av1_ivf
+              aomenc_av1_obu_annexb
+              aomenc_av1_obu_section5
               aomenc_av1_webm
-              aomenc_av1_webm_2pass
+              aomenc_av1_webm_1pass
               aomenc_av1_ivf_lossless
               aomenc_av1_ivf_minq0_maxq0
               aomenc_av1_webm_lag5_frames10
-              aomenc_av1_webm_non_square_par"
+              aomenc_av1_webm_non_square_par
+              aomenc_av1_webm_cdf_update_mode"
 
 run_tests aomenc_verify_environment "${aomenc_tests}"
diff --git a/third_party/aom/test/aq_segment_test.cc b/third_party/aom/test/aq_segment_test.cc
index 57db0d0ff..bbb5027d4 100644
--- a/third_party/aom/test/aq_segment_test.cc
+++ b/third_party/aom/test/aq_segment_test.cc
@@ -7,9 +7,10 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
+
+#include "config/aom_config.h"
 
-#include "./aom_config.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
@@ -37,18 +38,14 @@ class AqSegmentTest
     if (video->frame() == 1) {
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
-#if CONFIG_EXT_DELTA_Q
       encoder->Control(AV1E_SET_DELTAQ_MODE, deltaq_mode_);
-#endif
       encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
     }
   }
 
   void DoTest(int aq_mode) {
     aq_mode_ = aq_mode;
-#if CONFIG_EXT_DELTA_Q
     deltaq_mode_ = 0;
-#endif
     cfg_.kf_max_dist = 12;
     cfg_.rc_min_quantizer = 8;
     cfg_.rc_max_quantizer = 56;
@@ -65,9 +62,7 @@ class AqSegmentTest
 
   int set_cpu_used_;
   int aq_mode_;
-#if CONFIG_EXT_DELTA_Q
   int deltaq_mode_;
-#endif
 };
 
 // Validate that this AQ segmentation mode (AQ=1, variance_ap)
@@ -90,21 +85,6 @@ TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ2) { DoTest(2); }
 
 TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ3) { DoTest(3); }
 
-#if !CONFIG_EXT_DELTA_Q
-// Validate that this AQ mode (AQ=4, delta q)
-// encodes and decodes without a mismatch.
-TEST_P(AqSegmentTest, TestNoMisMatchAQ4) {
-  cfg_.rc_end_usage = AOM_CQ;
-  aq_mode_ = 4;
-
-  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 15);
-
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-}
-#endif
-
-#if CONFIG_EXT_DELTA_Q
 // Validate that this delta q mode
 // encodes and decodes without a mismatch.
 TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) {
@@ -116,7 +96,6 @@ TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) {
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
-#endif
 
 AV1_INSTANTIATE_TEST_CASE(AqSegmentTest,
                           ::testing::Values(::libaom_test::kRealTime,
diff --git a/third_party/aom/test/arf_freq_test.cc b/third_party/aom/test/arf_freq_test.cc
index b4b17c9ab..083f4022f 100644
--- a/third_party/aom/test/arf_freq_test.cc
+++ b/third_party/aom/test/arf_freq_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -50,9 +50,7 @@ const TestVideoParam kTestVectors[] = {
   { "hantro_collage_w352h288.yuv", 352, 288, 30, 1, 8, AOM_IMG_FMT_I420,
     AOM_BITS_8, 0 },
   { "rush_hour_444.y4m", 352, 288, 30, 1, 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
-#if CONFIG_HIGHBITDEPTH
-// Add list of profile 2/3 test videos here ...
-#endif  // CONFIG_HIGHBITDEPTH
+  // Add list of profile 2/3 test videos here ...
 };
 
 const TestEncodeParam kEncodeVectors[] = {
@@ -208,7 +206,6 @@ TEST_P(ArfFreqTestLarge, MinArfFreqTest) {
   }
 }
 
-#if CONFIG_HIGHBITDEPTH || CONFIG_EXT_REFS
 #if CONFIG_AV1_ENCODER
 // TODO(angiebird): 25-29 fail in high bitdepth mode.
 // TODO(zoeliu): This ArfFreqTest does not work with BWDREF_FRAME, as
@@ -223,9 +220,4 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::ValuesIn(kTestVectors), ::testing::ValuesIn(kEncodeVectors),
         ::testing::ValuesIn(kMinArfVectors)));
 #endif  // CONFIG_AV1_ENCODER
-#else
-AV1_INSTANTIATE_TEST_CASE(ArfFreqTestLarge, ::testing::ValuesIn(kTestVectors),
-                          ::testing::ValuesIn(kEncodeVectors),
-                          ::testing::ValuesIn(kMinArfVectors));
-#endif  // CONFIG_HIGHBITDEPTH || CONFIG_EXT_REFS
 }  // namespace
diff --git a/third_party/aom/test/av1_convolve_2d_test.cc b/third_party/aom/test/av1_convolve_2d_test.cc
index 002ede403..03286260e 100644
--- a/third_party/aom/test/av1_convolve_2d_test.cc
+++ b/third_party/aom/test/av1_convolve_2d_test.cc
@@ -12,29 +12,238 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/av1_convolve_2d_test_util.h"
 
-using std::tr1::tuple;
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
+using ::testing::tuple;
 using libaom_test::ACMRandom;
-using libaom_test::AV1Convolve2D::AV1Convolve2DTest;
-#if CONFIG_HIGHBITDEPTH
-using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DTest;
-#endif
-
+using libaom_test::AV1Convolve2D::AV1Convolve2DSrTest;
+using libaom_test::AV1Convolve2D::AV1JntConvolve2DTest;
+using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DSrTest;
+using libaom_test::AV1HighbdConvolve2D::AV1HighbdJntConvolve2DTest;
 namespace {
 
-TEST_P(AV1Convolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(2)); }
+TEST_P(AV1Convolve2DSrTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+TEST_P(AV1Convolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+
+INSTANTIATE_TEST_CASE_P(
+    C_COPY, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_copy_sr_c, 0, 0));
+INSTANTIATE_TEST_CASE_P(
+    C_X, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_c, 1, 0));
+INSTANTIATE_TEST_CASE_P(
+    C_Y, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_c, 0, 1));
+INSTANTIATE_TEST_CASE_P(
+    C, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_c, 1, 1));
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1Convolve2DSrTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_convolve_2d_copy_sr_sse2, 0, 0));
+INSTANTIATE_TEST_CASE_P(
+    SSE2_X, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_sse2, 1, 0));
+INSTANTIATE_TEST_CASE_P(
+    SSE2_Y, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_sse2, 0, 1));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_sse2, 1, 1));
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1Convolve2DSrTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_convolve_2d_copy_sr_avx2, 0, 0));
+INSTANTIATE_TEST_CASE_P(
+    AVX2_X, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_avx2, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+    AVX2_Y, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_avx2, 0, 1));
 
 INSTANTIATE_TEST_CASE_P(
-    SSE2, AV1Convolve2DTest,
-    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sse2));
+    AVX2, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_avx2, 1, 1));
+#endif  // HAVE_AVX2
+#endif  // HAVE_SSE2
 
-#if CONFIG_HIGHBITDEPTH && HAVE_SSSE3
-TEST_P(AV1HighbdConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON_X, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_neon, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON_Y, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_neon, 0, 1));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, AV1Convolve2DSrTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_neon, 1, 1));
 
-INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdConvolve2DTest,
+INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1Convolve2DSrTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_convolve_2d_copy_sr_neon, 0, 0));
+#endif  // HAVE_NEON
+
+TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+TEST_P(AV1JntConvolve2DTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+INSTANTIATE_TEST_CASE_P(
+    C_COPY, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_copy_c, 0, 0));
+
+INSTANTIATE_TEST_CASE_P(
+    C_X, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_c, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+    C_Y, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_c, 0, 1));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_jnt_convolve_2d_copy_sse2, 0, 0));
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE2_X, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_sse2, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2_Y, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_sse2, 0, 1));
+
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_ssse3, 1, 1));
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_jnt_convolve_2d_copy_avx2, 0, 0));
+INSTANTIATE_TEST_CASE_P(
+    AVX2_X, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_avx2, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+    AVX2_Y, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_avx2, 0, 1));
+
+INSTANTIATE_TEST_CASE_P(
+    AVX2, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_avx2, 1, 1));
+#endif  // HAVE_AVX2
+#endif  // HAVE_SSE4_1
+#endif  // HAVE_SSE2
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1JntConvolve2DTest,
+                        libaom_test::AV1Convolve2D::BuildParams(
+                            av1_jnt_convolve_2d_copy_neon, 0, 0));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_neon, 1, 1));
+INSTANTIATE_TEST_CASE_P(
+    NEON_X, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_neon, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON_Y, AV1JntConvolve2DTest,
+    libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_neon, 0, 1));
+#endif  // HAVE_NEON
+
+TEST_P(AV1HighbdConvolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
+TEST_P(AV1HighbdConvolve2DSrTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(1));
+}
+
+INSTANTIATE_TEST_CASE_P(C_X, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_x_sr_c, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(C_Y, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_y_sr_c, 0, 1));
+
+INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_2d_copy_sr_c, 0, 0));
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_2d_copy_sr_sse2, 0, 0));
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_2d_sr_ssse3, 1, 1));
+INSTANTIATE_TEST_CASE_P(SSSE3_X, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_x_sr_ssse3, 1, 0));
+INSTANTIATE_TEST_CASE_P(SSSE3_Y, AV1HighbdConvolve2DSrTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
-                            av1_highbd_convolve_2d_ssse3));
+                            av1_highbd_convolve_y_sr_ssse3, 0, 1));
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_2d_sr_avx2, 1, 1));
+INSTANTIATE_TEST_CASE_P(AVX2_X, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_x_sr_avx2, 1, 0));
+INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_y_sr_avx2, 0, 1));
+INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_2d_copy_sr_avx2, 0, 0));
+#endif  // HAVE_AVX2
+#endif  // HAVE_SSSE3
+#endif  // HAVE_SSE2
+TEST_P(AV1HighbdJntConvolve2DTest, CheckOutput) {
+  RunCheckOutput(GET_PARAM(1));
+}
 
-#endif
+TEST_P(AV1HighbdJntConvolve2DTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(1));
+}
 
+INSTANTIATE_TEST_CASE_P(C_X, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_x_c, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(C_Y, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_y_c, 0, 1));
+
+INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_2d_copy_c, 0, 0));
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_2d_copy_sse4_1, 0, 0));
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_2d_sse4_1, 1, 1));
+INSTANTIATE_TEST_CASE_P(SSE4_1_X, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_x_sse4_1, 1, 0));
+INSTANTIATE_TEST_CASE_P(SSE4_1_Y, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_y_sse4_1, 0, 1));
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_2d_copy_avx2, 0, 0));
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_2d_avx2, 1, 1));
+INSTANTIATE_TEST_CASE_P(AVX2_X, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_x_avx2, 1, 0));
+INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_y_avx2, 0, 1));
+#endif  // HAVE_AVX2
+#endif  // HAVE_SSE4_1
 }  // namespace
diff --git a/third_party/aom/test/av1_convolve_2d_test_util.cc b/third_party/aom/test/av1_convolve_2d_test_util.cc
index 3b61f6bb7..cbe3f8c9f 100644
--- a/third_party/aom/test/av1_convolve_2d_test_util.cc
+++ b/third_party/aom/test/av1_convolve_2d_test_util.cc
@@ -11,183 +11,695 @@
 
 #include "test/av1_convolve_2d_test_util.h"
 
+#include "aom_ports/aom_timer.h"
+#include "av1/common/common_data.h"
 #include "av1/common/convolve.h"
 
-using std::tr1::tuple;
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
+using ::testing::tuple;
 
 namespace libaom_test {
 
+const int kMaxSize = 128 + 32;  // padding
 namespace AV1Convolve2D {
 
 ::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
-    convolve_2d_func filter) {
-  const Convolve2DParam params[] = {
-    make_tuple(4, 4, filter),   make_tuple(8, 8, filter),
-    make_tuple(64, 64, filter), make_tuple(4, 16, filter),
-    make_tuple(32, 8, filter),
-  };
-  return ::testing::ValuesIn(params);
+    convolve_2d_func filter, int has_subx, int has_suby) {
+  return ::testing::Combine(::testing::Values(filter),
+                            ::testing::Values(has_subx),
+                            ::testing::Values(has_suby),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
 }
 
-AV1Convolve2DTest::~AV1Convolve2DTest() {}
-void AV1Convolve2DTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+AV1Convolve2DSrTest::~AV1Convolve2DSrTest() {}
+void AV1Convolve2DSrTest::SetUp() {
+  rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1Convolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1Convolve2DSrTest::RunCheckOutput(convolve_2d_func test_impl) {
+  const int w = kMaxSize, h = kMaxSize;
+  const int has_subx = GET_PARAM(1);
+  const int has_suby = GET_PARAM(2);
+  const int block_idx = GET_PARAM(3);
+  int hfilter, vfilter, subx, suby;
+  uint8_t input[kMaxSize * kMaxSize];
+  DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, output2[MAX_SB_SQUARE]);
+
+  for (int i = 0; i < h; ++i)
+    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+  for (int i = 0; i < MAX_SB_SQUARE; ++i)
+    output[i] = output2[i] = rnd_.Rand31();
+
+  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
+  const int num_sizes =
+      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
+                                                                           : 1;
+  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
+    const int out_w = block_size_wide[block_idx] >> shift;
+    const int out_h = block_size_high[block_idx] >> shift;
+    for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
+      for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL;
+           ++vfilter) {
+        InterpFilterParams filter_params_x =
+            av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+                                                         out_w);
+        InterpFilterParams filter_params_y =
+            av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+                                                         out_h);
+        for (int do_average = 0; do_average < 1; ++do_average) {
+          ConvolveParams conv_params1 =
+              get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, 8);
+          ConvolveParams conv_params2 =
+              get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, 8);
 
-void AV1Convolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
+          const int subx_range = has_subx ? 16 : 1;
+          const int suby_range = has_suby ? 16 : 1;
+          for (subx = 0; subx < subx_range; ++subx) {
+            for (suby = 0; suby < suby_range; ++suby) {
+              // Choose random locations within the source block
+              const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+              const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+              av1_convolve_2d_sr_c(input + offset_r * w + offset_c, w, output,
+                                   MAX_SB_SIZE, out_w, out_h, &filter_params_x,
+                                   &filter_params_y, subx, suby, &conv_params1);
+              test_impl(input + offset_r * w + offset_c, w, output2,
+                        MAX_SB_SIZE, out_w, out_h, &filter_params_x,
+                        &filter_params_y, subx, suby, &conv_params2);
+
+              if (memcmp(output, output2, sizeof(output))) {
+                for (int i = 0; i < MAX_SB_SIZE; ++i) {
+                  for (int j = 0; j < MAX_SB_SIZE; ++j) {
+                    int idx = i * MAX_SB_SIZE + j;
+                    ASSERT_EQ(output[idx], output2[idx])
+                        << out_w << "x" << out_h << " Pixel mismatch at index "
+                        << idx << " = (" << i << ", " << j
+                        << "), sub pixel offset = (" << suby << ", " << subx
+                        << ")";
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
 
-void AV1Convolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
-  const int w = 128, h = 128;
-  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
-  int i, j, k;
+void AV1Convolve2DSrTest::RunSpeedTest(convolve_2d_func test_impl) {
+  const int w = kMaxSize, h = kMaxSize;
+  const int has_subx = GET_PARAM(1);
+  const int has_suby = GET_PARAM(2);
+  const int block_idx = GET_PARAM(3);
 
-  uint8_t *input = new uint8_t[h * w];
+  uint8_t input[kMaxSize * kMaxSize];
+  DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]);
 
-  int output_n = out_h * MAX_SB_SIZE;
-  CONV_BUF_TYPE *output = new CONV_BUF_TYPE[output_n];
-  CONV_BUF_TYPE *output2 = new CONV_BUF_TYPE[output_n];
+  for (int i = 0; i < h; ++i)
+    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
 
-  for (i = 0; i < h; ++i)
-    for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+  int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR;
+  int subx = 0, suby = 0;
 
+  const int do_average = 0;
+  ConvolveParams conv_params2 =
+      get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, 8);
+
+  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
+  const int num_sizes =
+      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
+                                                                           : 1;
+  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
+    const int out_w = block_size_wide[block_idx] >> shift;
+    const int out_h = block_size_high[block_idx] >> shift;
+    const int num_loops = 1000000000 / (out_w + out_h);
+
+    InterpFilterParams filter_params_x =
+        av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+                                                     out_w);
+    InterpFilterParams filter_params_y =
+        av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+                                                     out_h);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+
+    for (int i = 0; i < num_loops; ++i)
+      test_impl(input, w, output, MAX_SB_SIZE, out_w, out_h, &filter_params_x,
+                &filter_params_y, subx, suby, &conv_params2);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
+           out_h, 1000.0 * elapsed_time / num_loops);
+  }
+}
+
+AV1JntConvolve2DTest::~AV1JntConvolve2DTest() {}
+void AV1JntConvolve2DTest::SetUp() {
+  rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1JntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
+  const int w = kMaxSize, h = kMaxSize;
+  const int has_subx = GET_PARAM(1);
+  const int has_suby = GET_PARAM(2);
+  const int block_idx = GET_PARAM(3);
   int hfilter, vfilter, subx, suby;
+  uint8_t input[kMaxSize * kMaxSize];
+  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, output8_1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, output8_2[MAX_SB_SQUARE]);
+
+  for (int i = 0; i < h; ++i)
+    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    output1[i] = output2[i] = rnd_.Rand16();
+    output8_1[i] = output8_2[i] = rnd_.Rand8();
+  }
+
+  const int out_w = block_size_wide[block_idx];
+  const int out_h = block_size_high[block_idx];
   for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
     for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
       InterpFilterParams filter_params_x =
-          av1_get_interp_filter_params((InterpFilter)hfilter);
+          av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+                                                       out_w);
       InterpFilterParams filter_params_y =
-          av1_get_interp_filter_params((InterpFilter)vfilter);
-      const int do_average = rnd_.Rand8() & 1;
-      ConvolveParams conv_params1 =
-          get_conv_params_no_round(0, do_average, 0, output, MAX_SB_SIZE);
-      ConvolveParams conv_params2 =
-          get_conv_params_no_round(0, do_average, 0, output2, MAX_SB_SIZE);
-
-      for (subx = 0; subx < 16; ++subx)
-        for (suby = 0; suby < 16; ++suby) {
-          // av1_convolve_2d is designed for accumulate two predicted blocks for
-          // compound mode, so we set num_iter to two here.
-          // A larger number may introduce overflow
-          const int num_iters = 2;
-          memset(output, 0, output_n * sizeof(*output));
-          memset(output2, 0, output_n * sizeof(*output2));
-          for (i = 0; i < num_iters; ++i) {
+          av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+                                                       out_h);
+      for (int do_average = 0; do_average <= 1; ++do_average) {
+        ConvolveParams conv_params1 = get_conv_params_no_round(
+            0, do_average, 0, output1, MAX_SB_SIZE, 1, 8);
+        ConvolveParams conv_params2 = get_conv_params_no_round(
+            0, do_average, 0, output2, MAX_SB_SIZE, 1, 8);
+
+        // Test special case where jnt_comp_avg is not used
+        conv_params1.use_jnt_comp_avg = 0;
+        conv_params2.use_jnt_comp_avg = 0;
+
+        const int subx_range = has_subx ? 16 : 1;
+        const int suby_range = has_suby ? 16 : 1;
+        for (subx = 0; subx < subx_range; ++subx) {
+          for (suby = 0; suby < suby_range; ++suby) {
             // Choose random locations within the source block
-            int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-            int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-            av1_convolve_2d_c(input + offset_r * w + offset_c, w, output,
-                              MAX_SB_SIZE, out_w, out_h, &filter_params_x,
-                              &filter_params_y, subx, suby, &conv_params1);
-            test_impl(input + offset_r * w + offset_c, w, output2, MAX_SB_SIZE,
-                      out_w, out_h, &filter_params_x, &filter_params_y, subx,
-                      suby, &conv_params2);
-
-            for (j = 0; j < out_h; ++j)
-              for (k = 0; k < out_w; ++k) {
-                int idx = j * MAX_SB_SIZE + k;
-                ASSERT_EQ(output[idx], output2[idx])
-                    << "Pixel mismatch at index " << idx << " = (" << j << ", "
-                    << k << "), sub pixel offset = (" << suby << ", " << subx
-                    << ")";
+            const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+            const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+            av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w, output8_1,
+                                  MAX_SB_SIZE, out_w, out_h, &filter_params_x,
+                                  &filter_params_y, subx, suby, &conv_params1);
+            test_impl(input + offset_r * w + offset_c, w, output8_2,
+                      MAX_SB_SIZE, out_w, out_h, &filter_params_x,
+                      &filter_params_y, subx, suby, &conv_params2);
+
+            for (int i = 0; i < out_h; ++i) {
+              for (int j = 0; j < out_w; ++j) {
+                int idx = i * MAX_SB_SIZE + j;
+                ASSERT_EQ(output1[idx], output2[idx])
+                    << "Mismatch at unit tests for av1_jnt_convolve_2d\n"
+                    << out_w << "x" << out_h << " Pixel mismatch at index "
+                    << idx << " = (" << i << ", " << j
+                    << "), sub pixel offset = (" << suby << ", " << subx << ")";
+              }
+            }
+
+            if (memcmp(output8_1, output8_2, sizeof(output8_1))) {
+              for (int i = 0; i < MAX_SB_SIZE; ++i) {
+                for (int j = 0; j < MAX_SB_SIZE; ++j) {
+                  int idx = i * MAX_SB_SIZE + j;
+                  ASSERT_EQ(output8_1[idx], output8_2[idx])
+                      << out_w << "x" << out_h << " Pixel mismatch at index "
+                      << idx << " = (" << i << ", " << j
+                      << "), sub pixel offset = (" << suby << ", " << subx
+                      << ")";
+                }
+              }
+            }
+          }
+        }
+
+        // Test different combination of fwd and bck offset weights
+        for (int k = 0; k < 2; ++k) {
+          for (int l = 0; l < 4; ++l) {
+            conv_params1.use_jnt_comp_avg = 1;
+            conv_params2.use_jnt_comp_avg = 1;
+            conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
+            conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
+            conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
+            conv_params2.bck_offset = quant_dist_lookup_table[k][l][1];
+
+            for (subx = 0; subx < subx_range; ++subx) {
+              for (suby = 0; suby < suby_range; ++suby) {
+                // Choose random locations within the source block
+                const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+                const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+                av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w,
+                                      output8_1, MAX_SB_SIZE, out_w, out_h,
+                                      &filter_params_x, &filter_params_y, subx,
+                                      suby, &conv_params1);
+                test_impl(input + offset_r * w + offset_c, w, output8_2,
+                          MAX_SB_SIZE, out_w, out_h, &filter_params_x,
+                          &filter_params_y, subx, suby, &conv_params2);
+
+                for (int i = 0; i < out_h; ++i) {
+                  for (int j = 0; j < out_w; ++j) {
+                    int idx = i * MAX_SB_SIZE + j;
+                    ASSERT_EQ(output1[idx], output2[idx])
+                        << "Mismatch at unit tests for "
+                           "av1_jnt_convolve_2d\n"
+                        << out_w << "x" << out_h << " Pixel mismatch at index "
+                        << idx << " = (" << i << ", " << j
+                        << "), sub pixel offset = (" << suby << ", " << subx
+                        << ")";
+                  }
+                }
+                if (memcmp(output8_1, output8_2, sizeof(output8_1))) {
+                  for (int i = 0; i < MAX_SB_SIZE; ++i) {
+                    for (int j = 0; j < MAX_SB_SIZE; ++j) {
+                      int idx = i * MAX_SB_SIZE + j;
+                      ASSERT_EQ(output8_1[idx], output8_2[idx])
+                          << out_w << "x" << out_h
+                          << " Pixel mismatch at index " << idx << " = (" << i
+                          << ", " << j << "), sub pixel offset = (" << suby
+                          << ", " << subx << ")";
+                    }
+                  }
+                }
               }
+            }
           }
         }
+      }
     }
   }
-  delete[] input;
-  delete[] output;
-  delete[] output2;
+}
+
+void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) {
+  const int w = kMaxSize, h = kMaxSize;
+  const int has_subx = GET_PARAM(1);
+  const int has_suby = GET_PARAM(2);
+  const int block_idx = GET_PARAM(3);
+
+  int subx = 0, suby = 0;
+  uint8_t input[kMaxSize * kMaxSize];
+  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, output8[MAX_SB_SQUARE]);
+  int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR;
+  for (int i = 0; i < h; ++i)
+    for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    output[i] = rnd_.Rand16();
+    output8[i] = rnd_.Rand8();
+  }
+
+  const int out_w = block_size_wide[block_idx];
+  const int out_h = block_size_high[block_idx];
+  const int num_loops = 1000000000 / (out_w + out_h);
+  const int do_average = 0;
+
+  InterpFilterParams filter_params_x =
+      av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+                                                   out_w);
+  InterpFilterParams filter_params_y =
+      av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+                                                   out_h);
+
+  ConvolveParams conv_params =
+      get_conv_params_no_round(0, do_average, 0, output, MAX_SB_SIZE, 1, 8);
+
+  conv_params.use_jnt_comp_avg = 0;
+
+  // Choose random locations within the source block
+  const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+  const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+
+  for (int i = 0; i < num_loops; ++i)
+    test_impl(input + offset_r * w + offset_c, w, output8, MAX_SB_SIZE, out_w,
+              out_h, &filter_params_x, &filter_params_y, subx, suby,
+              &conv_params);
+
+  aom_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
+         out_h, 1000.0 * elapsed_time / num_loops);
 }
 }  // namespace AV1Convolve2D
 
-#if CONFIG_HIGHBITDEPTH
 namespace AV1HighbdConvolve2D {
-
 ::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
-    highbd_convolve_2d_func filter) {
-  const HighbdConvolve2DParam params[] = {
-    make_tuple(4, 4, 8, filter),    make_tuple(8, 8, 8, filter),
-    make_tuple(64, 64, 8, filter),  make_tuple(4, 16, 8, filter),
-    make_tuple(32, 8, 8, filter),   make_tuple(4, 4, 10, filter),
-    make_tuple(8, 8, 10, filter),   make_tuple(64, 64, 10, filter),
-    make_tuple(4, 16, 10, filter),  make_tuple(32, 8, 10, filter),
-    make_tuple(4, 4, 12, filter),   make_tuple(8, 8, 12, filter),
-    make_tuple(64, 64, 12, filter), make_tuple(4, 16, 12, filter),
-    make_tuple(32, 8, 12, filter),
-  };
-  return ::testing::ValuesIn(params);
+    highbd_convolve_2d_func filter, int has_subx, int has_suby) {
+  return ::testing::Combine(
+      ::testing::Range(8, 13, 2), ::testing::Values(filter),
+      ::testing::Values(has_subx), ::testing::Values(has_suby),
+      ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
 }
 
-AV1HighbdConvolve2DTest::~AV1HighbdConvolve2DTest() {}
-void AV1HighbdConvolve2DTest::SetUp() {
+AV1HighbdConvolve2DSrTest::~AV1HighbdConvolve2DSrTest() {}
+void AV1HighbdConvolve2DSrTest::SetUp() {
   rnd_.Reset(ACMRandom::DeterministicSeed());
 }
 
-void AV1HighbdConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
+void AV1HighbdConvolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); }
 
-void AV1HighbdConvolve2DTest::RunCheckOutput(
+void AV1HighbdConvolve2DSrTest::RunSpeedTest(
     highbd_convolve_2d_func test_impl) {
-  const int w = 128, h = 128;
-  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
-  const int bd = GET_PARAM(2);
-  int i, j, k;
+  const int w = kMaxSize, h = kMaxSize;
+  const int bd = GET_PARAM(0);
+  const int has_subx = GET_PARAM(2);
+  const int has_suby = GET_PARAM(3);
+  const int block_idx = GET_PARAM(4);
+  int hfilter, vfilter, subx, suby;
+  uint16_t input[kMaxSize * kMaxSize];
+  DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]);
+
+  for (int i = 0; i < h; ++i)
+    for (int j = 0; j < w; ++j)
+      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+
+  hfilter = EIGHTTAP_REGULAR;
+  vfilter = EIGHTTAP_REGULAR;
+  int do_average = 0;
+
+  const int offset_r = 3;
+  const int offset_c = 3;
+  subx = 0;
+  suby = 0;
+
+  ConvolveParams conv_params =
+      get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, bd);
+
+  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
+  const int num_sizes =
+      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
+                                                                           : 1;
+
+  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
+    const int out_w = block_size_wide[block_idx] >> shift;
+    const int out_h = block_size_high[block_idx] >> shift;
+    const int num_loops = 1000000000 / (out_w + out_h);
 
-  uint16_t *input = new uint16_t[h * w];
+    InterpFilterParams filter_params_x =
+        av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+                                                     out_w);
+    InterpFilterParams filter_params_y =
+        av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+                                                     out_h);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_loops; ++i)
+      test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE, out_w,
+                out_h, &filter_params_x, &filter_params_y, subx, suby,
+                &conv_params, bd);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
+           out_h, 1000.0 * elapsed_time / num_loops);
+  }
+}
+
+void AV1HighbdConvolve2DSrTest::RunCheckOutput(
+    highbd_convolve_2d_func test_impl) {
+  const int w = kMaxSize, h = kMaxSize;
+  const int bd = GET_PARAM(0);
+  const int has_subx = GET_PARAM(2);
+  const int has_suby = GET_PARAM(3);
+  const int block_idx = GET_PARAM(4);
+  int hfilter, vfilter, subx, suby;
+  uint16_t input[kMaxSize * kMaxSize];
+  DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, output2[MAX_SB_SQUARE]);
 
-  int output_n = out_h * MAX_SB_SIZE;
-  CONV_BUF_TYPE *output = new CONV_BUF_TYPE[output_n];
-  CONV_BUF_TYPE *output2 = new CONV_BUF_TYPE[output_n];
+  for (int i = 0; i < h; ++i)
+    for (int j = 0; j < w; ++j)
+      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+  for (int i = 0; i < MAX_SB_SQUARE; ++i)
+    output[i] = output2[i] = rnd_.Rand31();
 
-  for (i = 0; i < h; ++i)
-    for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+  // Make sure that sizes 2xN and Nx2 are also tested for chroma.
+  const int num_sizes =
+      (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
+                                                                           : 1;
+  for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
+    const int out_w = block_size_wide[block_idx] >> shift;
+    const int out_h = block_size_high[block_idx] >> shift;
+    for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
+      for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL;
+           ++vfilter) {
+        InterpFilterParams filter_params_x =
+            av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+                                                         out_w);
+        InterpFilterParams filter_params_y =
+            av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+                                                         out_h);
+        for (int do_average = 0; do_average < 1; ++do_average) {
+          ConvolveParams conv_params1 =
+              get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, bd);
+          ConvolveParams conv_params2 =
+              get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, bd);
 
+          const int subx_range = has_subx ? 16 : 1;
+          const int suby_range = has_suby ? 16 : 1;
+          for (subx = 0; subx < subx_range; ++subx) {
+            for (suby = 0; suby < suby_range; ++suby) {
+              // Choose random locations within the source block
+              const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+              const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+              av1_highbd_convolve_2d_sr_c(input + offset_r * w + offset_c, w,
+                                          output, MAX_SB_SIZE, out_w, out_h,
+                                          &filter_params_x, &filter_params_y,
+                                          subx, suby, &conv_params1, bd);
+              test_impl(input + offset_r * w + offset_c, w, output2,
+                        MAX_SB_SIZE, out_w, out_h, &filter_params_x,
+                        &filter_params_y, subx, suby, &conv_params2, bd);
+
+              if (memcmp(output, output2, sizeof(output))) {
+                for (int i = 0; i < MAX_SB_SIZE; ++i) {
+                  for (int j = 0; j < MAX_SB_SIZE; ++j) {
+                    int idx = i * MAX_SB_SIZE + j;
+                    ASSERT_EQ(output[idx], output2[idx])
+                        << out_w << "x" << out_h << " Pixel mismatch at index "
+                        << idx << " = (" << i << ", " << j
+                        << "), sub pixel offset = (" << suby << ", " << subx
+                        << ")";
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+AV1HighbdJntConvolve2DTest::~AV1HighbdJntConvolve2DTest() {}
+void AV1HighbdJntConvolve2DTest::SetUp() {
+  rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1HighbdJntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1HighbdJntConvolve2DTest::RunSpeedTest(
+    highbd_convolve_2d_func test_impl) {
+  const int w = kMaxSize, h = kMaxSize;
+  const int bd = GET_PARAM(0);
+  const int block_idx = GET_PARAM(4);
   int hfilter, vfilter, subx, suby;
+  uint16_t input[kMaxSize * kMaxSize];
+  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, output16[MAX_SB_SQUARE]);
+
+  for (int i = 0; i < h; ++i)
+    for (int j = 0; j < w; ++j)
+      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) output[i] = rnd_.Rand16();
+  hfilter = EIGHTTAP_REGULAR;
+  vfilter = EIGHTTAP_REGULAR;
+  int do_average = 0;
+  const int out_w = block_size_wide[block_idx];
+  const int out_h = block_size_high[block_idx];
+
+  InterpFilterParams filter_params_x =
+      av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+                                                   out_w);
+  InterpFilterParams filter_params_y =
+      av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+                                                   out_h);
+
+  ConvolveParams conv_params =
+      get_conv_params_no_round(0, do_average, 0, output, MAX_SB_SIZE, 1, bd);
+
+  // Test special case where jnt_comp_avg is not used
+  conv_params.use_jnt_comp_avg = 0;
+
+  subx = 0;
+  suby = 0;
+  // Choose random locations within the source block
+  const int offset_r = 3;
+  const int offset_c = 3;
+
+  const int num_loops = 1000000000 / (out_w + out_h);
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < num_loops; ++i)
+    test_impl(input + offset_r * w + offset_c, w, output16, MAX_SB_SIZE, out_w,
+              out_h, &filter_params_x, &filter_params_y, subx, suby,
+              &conv_params, bd);
+
+  aom_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("convolve %3dx%-3d: %7.2f us\n", out_w, out_h,
+         1000.0 * elapsed_time / num_loops);
+}
+
+void AV1HighbdJntConvolve2DTest::RunCheckOutput(
+    highbd_convolve_2d_func test_impl) {
+  const int w = kMaxSize, h = kMaxSize;
+  const int bd = GET_PARAM(0);
+  const int has_subx = GET_PARAM(2);
+  const int has_suby = GET_PARAM(3);
+  const int block_idx = GET_PARAM(4);
+  int hfilter, vfilter, subx, suby;
+  uint16_t input[kMaxSize * kMaxSize];
+  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, output16_1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, output16_2[MAX_SB_SQUARE]);
+
+  for (int i = 0; i < h; ++i)
+    for (int j = 0; j < w; ++j)
+      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    output1[i] = output2[i] = rnd_.Rand16();
+    output16_1[i] = output16_2[i] = rnd_.Rand16();
+  }
+
+  const int out_w = block_size_wide[block_idx];
+  const int out_h = block_size_high[block_idx];
   for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
     for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
       InterpFilterParams filter_params_x =
-          av1_get_interp_filter_params((InterpFilter)hfilter);
+          av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+                                                       out_w);
       InterpFilterParams filter_params_y =
-          av1_get_interp_filter_params((InterpFilter)vfilter);
-      ConvolveParams conv_params1 =
-          get_conv_params_no_round(0, 0, 0, output, MAX_SB_SIZE);
-      ConvolveParams conv_params2 =
-          get_conv_params_no_round(0, 0, 0, output2, MAX_SB_SIZE);
-
-      for (subx = 0; subx < 16; ++subx)
-        for (suby = 0; suby < 16; ++suby) {
-          // av1_convolve_2d is designed for accumulate two predicted blocks for
-          // compound mode, so we set num_iter to two here.
-          // A larger number may introduce overflow
-          const int num_iters = 2;
-          memset(output, 0, output_n * sizeof(*output));
-          memset(output2, 0, output_n * sizeof(*output2));
-          for (i = 0; i < num_iters; ++i) {
+          av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+                                                       out_h);
+      for (int do_average = 0; do_average <= 1; ++do_average) {
+        ConvolveParams conv_params1 = get_conv_params_no_round(
+            0, do_average, 0, output1, MAX_SB_SIZE, 1, bd);
+        ConvolveParams conv_params2 = get_conv_params_no_round(
+            0, do_average, 0, output2, MAX_SB_SIZE, 1, bd);
+
+        // Test special case where jnt_comp_avg is not used
+        conv_params1.use_jnt_comp_avg = 0;
+        conv_params2.use_jnt_comp_avg = 0;
+
+        const int subx_range = has_subx ? 16 : 1;
+        const int suby_range = has_suby ? 16 : 1;
+        for (subx = 0; subx < subx_range; ++subx) {
+          for (suby = 0; suby < suby_range; ++suby) {
             // Choose random locations within the source block
-            int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-            int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-            av1_highbd_convolve_2d_c(input + offset_r * w + offset_c, w, output,
-                                     MAX_SB_SIZE, out_w, out_h,
-                                     &filter_params_x, &filter_params_y, subx,
-                                     suby, &conv_params1, bd);
-            test_impl(input + offset_r * w + offset_c, w, output2, MAX_SB_SIZE,
-                      out_w, out_h, &filter_params_x, &filter_params_y, subx,
-                      suby, &conv_params2, bd);
-
-            for (j = 0; j < out_h; ++j)
-              for (k = 0; k < out_w; ++k) {
-                int idx = j * MAX_SB_SIZE + k;
-                ASSERT_EQ(output[idx], output2[idx])
-                    << "Pixel mismatch at index " << idx << " = (" << j << ", "
-                    << k << "), sub pixel offset = (" << suby << ", " << subx
-                    << ")";
+            const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+            const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+            av1_highbd_jnt_convolve_2d_c(input + offset_r * w + offset_c, w,
+                                         output16_1, MAX_SB_SIZE, out_w, out_h,
+                                         &filter_params_x, &filter_params_y,
+                                         subx, suby, &conv_params1, bd);
+            test_impl(input + offset_r * w + offset_c, w, output16_2,
+                      MAX_SB_SIZE, out_w, out_h, &filter_params_x,
+                      &filter_params_y, subx, suby, &conv_params2, bd);
+
+            for (int i = 0; i < out_h; ++i) {
+              for (int j = 0; j < out_w; ++j) {
+                int idx = i * MAX_SB_SIZE + j;
+                ASSERT_EQ(output1[idx], output2[idx])
+                    << out_w << "x" << out_h << " Pixel mismatch at index "
+                    << idx << " = (" << i << ", " << j
+                    << "), sub pixel offset = (" << suby << ", " << subx << ")";
+              }
+            }
+
+            if (memcmp(output16_1, output16_2, sizeof(output16_1))) {
+              for (int i = 0; i < MAX_SB_SIZE; ++i) {
+                for (int j = 0; j < MAX_SB_SIZE; ++j) {
+                  int idx = i * MAX_SB_SIZE + j;
+                  ASSERT_EQ(output16_1[idx], output16_2[idx])
+                      << out_w << "x" << out_h << " Pixel mismatch at index "
+                      << idx << " = (" << i << ", " << j
+                      << "), sub pixel offset = (" << suby << ", " << subx
+                      << ")";
+                }
+              }
+            }
+          }
+        }
+
+        // Test different combination of fwd and bck offset weights
+        for (int k = 0; k < 2; ++k) {
+          for (int l = 0; l < 4; ++l) {
+            conv_params1.use_jnt_comp_avg = 1;
+            conv_params2.use_jnt_comp_avg = 1;
+            conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
+            conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
+            conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
+            conv_params2.bck_offset = quant_dist_lookup_table[k][l][1];
+
+            const int subx_range = has_subx ? 16 : 1;
+            const int suby_range = has_suby ? 16 : 1;
+            for (subx = 0; subx < subx_range; ++subx) {
+              for (suby = 0; suby < suby_range; ++suby) {
+                // Choose random locations within the source block
+                const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+                const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+                av1_highbd_jnt_convolve_2d_c(
+                    input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE,
+                    out_w, out_h, &filter_params_x, &filter_params_y, subx,
+                    suby, &conv_params1, bd);
+                test_impl(input + offset_r * w + offset_c, w, output16_2,
+                          MAX_SB_SIZE, out_w, out_h, &filter_params_x,
+                          &filter_params_y, subx, suby, &conv_params2, bd);
+
+                for (int i = 0; i < out_h; ++i) {
+                  for (int j = 0; j < out_w; ++j) {
+                    int idx = i * MAX_SB_SIZE + j;
+                    ASSERT_EQ(output1[idx], output2[idx])
+                        << out_w << "x" << out_h << " Pixel mismatch at index "
+                        << idx << " = (" << i << ", " << j
+                        << "), sub pixel offset = (" << suby << ", " << subx
+                        << ")";
+                  }
+                }
+
+                if (memcmp(output16_1, output16_2, sizeof(output16_1))) {
+                  for (int i = 0; i < MAX_SB_SIZE; ++i) {
+                    for (int j = 0; j < MAX_SB_SIZE; ++j) {
+                      int idx = i * MAX_SB_SIZE + j;
+                      ASSERT_EQ(output16_1[idx], output16_2[idx])
+                          << out_w << "x" << out_h
+                          << " Pixel mismatch at index " << idx << " = (" << i
+                          << ", " << j << "), sub pixel offset = (" << suby
+                          << ", " << subx << ")";
+                    }
+                  }
+                }
               }
+            }
           }
         }
+      }
     }
   }
-  delete[] input;
-  delete[] output;
-  delete[] output2;
 }
 }  // namespace AV1HighbdConvolve2D
-#endif  // CONFIG_HIGHBITDEPTH
 }  // namespace libaom_test
diff --git a/third_party/aom/test/av1_convolve_2d_test_util.h b/third_party/aom/test/av1_convolve_2d_test_util.h
index 013126b4a..3a53dbdfe 100644
--- a/third_party/aom/test/av1_convolve_2d_test_util.h
+++ b/third_party/aom/test/av1_convolve_2d_test_util.h
@@ -12,11 +12,13 @@
 #ifndef TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
 #define TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
 
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
+
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 
@@ -25,62 +27,90 @@ namespace libaom_test {
 namespace AV1Convolve2D {
 
 typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride,
-                                 CONV_BUF_TYPE *dst, int dst_stride, int w,
-                                 int h, InterpFilterParams *filter_params_x,
+                                 uint8_t *dst, int dst_stride, int w, int h,
+                                 InterpFilterParams *filter_params_x,
                                  InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params);
 
-typedef std::tr1::tuple<int, int, convolve_2d_func> Convolve2DParam;
+typedef ::testing::tuple<convolve_2d_func, int, int, BLOCK_SIZE>
+    Convolve2DParam;
 
 ::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
-    convolve_2d_func filter);
+    convolve_2d_func filter, int subx_exist, int suby_exist);
 
-class AV1Convolve2DTest : public ::testing::TestWithParam<Convolve2DParam> {
+class AV1Convolve2DSrTest : public ::testing::TestWithParam<Convolve2DParam> {
  public:
-  virtual ~AV1Convolve2DTest();
+  virtual ~AV1Convolve2DSrTest();
   virtual void SetUp();
 
   virtual void TearDown();
 
  protected:
   void RunCheckOutput(convolve_2d_func test_impl);
+  void RunSpeedTest(convolve_2d_func test_impl);
 
   libaom_test::ACMRandom rnd_;
 };
 
+class AV1JntConvolve2DTest : public ::testing::TestWithParam<Convolve2DParam> {
+ public:
+  virtual ~AV1JntConvolve2DTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(convolve_2d_func test_impl);
+  void RunSpeedTest(convolve_2d_func test_impl);
+
+  libaom_test::ACMRandom rnd_;
+};
 }  // namespace AV1Convolve2D
 
-#if CONFIG_HIGHBITDEPTH
 namespace AV1HighbdConvolve2D {
 typedef void (*highbd_convolve_2d_func)(
-    const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride,
-    int w, int h, InterpFilterParams *filter_params_x,
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, InterpFilterParams *filter_params_x,
     InterpFilterParams *filter_params_y, const int subpel_x_q4,
     const int subpel_y_q4, ConvolveParams *conv_params, int bd);
 
-typedef std::tr1::tuple<int, int, int, highbd_convolve_2d_func>
+typedef ::testing::tuple<int, highbd_convolve_2d_func, int, int, BLOCK_SIZE>
     HighbdConvolve2DParam;
 
 ::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
-    highbd_convolve_2d_func filter);
+    highbd_convolve_2d_func filter, int subx_exist, int suby_exist);
 
-class AV1HighbdConvolve2DTest
+class AV1HighbdConvolve2DSrTest
     : public ::testing::TestWithParam<HighbdConvolve2DParam> {
  public:
-  virtual ~AV1HighbdConvolve2DTest();
+  virtual ~AV1HighbdConvolve2DSrTest();
   virtual void SetUp();
 
   virtual void TearDown();
 
  protected:
   void RunCheckOutput(highbd_convolve_2d_func test_impl);
+  void RunSpeedTest(highbd_convolve_2d_func test_impl);
 
   libaom_test::ACMRandom rnd_;
 };
 
+class AV1HighbdJntConvolve2DTest
+    : public ::testing::TestWithParam<HighbdConvolve2DParam> {
+ public:
+  virtual ~AV1HighbdJntConvolve2DTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(highbd_convolve_2d_func test_impl);
+  void RunSpeedTest(highbd_convolve_2d_func test_impl);
+
+  libaom_test::ACMRandom rnd_;
+};
 }  // namespace AV1HighbdConvolve2D
-#endif  // CONFIG_HIGHBITDEPTH
 
 }  // namespace libaom_test
 
diff --git a/third_party/aom/test/av1_convolve_optimz_test.cc b/third_party/aom/test/av1_convolve_optimz_test.cc
deleted file mode 100644
index 95bf63f44..000000000
--- a/third_party/aom/test/av1_convolve_optimz_test.cc
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-
-namespace {
-
-using std::tr1::tuple;
-using libaom_test::ACMRandom;
-
-typedef void (*ConvInit)();
-typedef void (*conv_filter_t)(const uint8_t *, int, uint8_t *, int, int, int,
-                              const InterpFilterParams, int, int,
-                              ConvolveParams *);
-#if CONFIG_HIGHBITDEPTH
-typedef void (*hbd_conv_filter_t)(const uint16_t *, int, uint16_t *, int, int,
-                                  int, const InterpFilterParams, int, int, int,
-                                  int);
-#endif
-
-// Test parameter list:
-//  <convolve_horiz_func, convolve_vert_func,
-//  <width, height>, filter_params, subpel_x_q4, avg>
-typedef tuple<int, int> BlockDimension;
-typedef tuple<ConvInit, conv_filter_t, conv_filter_t, BlockDimension,
-              InterpFilter, int, int>
-    ConvParams;
-#if CONFIG_HIGHBITDEPTH
-// Test parameter list:
-//  <convolve_horiz_func, convolve_vert_func,
-//  <width, height>, filter_params, subpel_x_q4, avg, bit_dpeth>
-typedef tuple<ConvInit, hbd_conv_filter_t, hbd_conv_filter_t, BlockDimension,
-              InterpFilter, int, int, int>
-    HbdConvParams;
-#endif
-
-// Note:
-//  src_ and src_ref_ have special boundary requirement
-//  dst_ and dst_ref_ don't
-const size_t maxWidth = 256;
-const size_t maxHeight = 256;
-const size_t maxBlockSize = maxWidth * maxHeight;
-const int horizOffset = 32;
-const int vertiOffset = 32;
-const int stride = 128;
-const int x_step_q4 = 16;
-
-class AV1ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
- public:
-  virtual ~AV1ConvolveOptimzTest() {}
-  virtual void SetUp() {
-    ConvInit conv_init = GET_PARAM(0);
-    conv_init();
-    conv_horiz_ = GET_PARAM(1);
-    conv_vert_ = GET_PARAM(2);
-    BlockDimension block = GET_PARAM(3);
-    width_ = std::tr1::get<0>(block);
-    height_ = std::tr1::get<1>(block);
-    filter_ = GET_PARAM(4);
-    subpel_ = GET_PARAM(5);
-    int ref = GET_PARAM(6);
-    const int plane = 0;
-    conv_params_ = get_conv_params(ref, ref, plane);
-
-    alloc_ = new uint8_t[maxBlockSize * 4];
-    src_ = alloc_ + (vertiOffset * maxWidth);
-    src_ += horizOffset;
-    src_ref_ = src_ + maxBlockSize;
-
-    dst_ = alloc_ + 2 * maxBlockSize;
-    dst_ref_ = alloc_ + 3 * maxBlockSize;
-  }
-
-  virtual void TearDown() {
-    delete[] alloc_;
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  void RunHorizFilterBitExactCheck();
-  void RunVertFilterBitExactCheck();
-
- private:
-  void PrepFilterBuffer();
-  void DiffFilterBuffer();
-  conv_filter_t conv_horiz_;
-  conv_filter_t conv_vert_;
-  uint8_t *alloc_;
-  uint8_t *src_;
-  uint8_t *dst_;
-  uint8_t *src_ref_;
-  uint8_t *dst_ref_;
-  int width_;
-  int height_;
-  InterpFilter filter_;
-  int subpel_;
-  ConvolveParams conv_params_;
-};
-
-void AV1ConvolveOptimzTest::PrepFilterBuffer() {
-  int r, c;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-  memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
-
-  uint8_t *src_ptr = src_;
-  uint8_t *dst_ptr = dst_;
-  uint8_t *src_ref_ptr = src_ref_;
-  uint8_t *dst_ref_ptr = dst_ref_;
-
-  for (r = 0; r < height_; ++r) {
-    for (c = 0; c < width_; ++c) {
-      src_ptr[c] = rnd.Rand8();
-      src_ref_ptr[c] = src_ptr[c];
-      dst_ptr[c] = rnd.Rand8();
-      dst_ref_ptr[c] = dst_ptr[c];
-    }
-    src_ptr += stride;
-    src_ref_ptr += stride;
-    dst_ptr += stride;
-    dst_ref_ptr += stride;
-  }
-}
-
-void AV1ConvolveOptimzTest::DiffFilterBuffer() {
-  int r, c;
-  const uint8_t *dst_ptr = dst_;
-  const uint8_t *dst_ref_ptr = dst_ref_;
-  for (r = 0; r < height_; ++r) {
-    for (c = 0; c < width_; ++c) {
-      EXPECT_EQ((uint8_t)dst_ref_ptr[c], (uint8_t)dst_ptr[c])
-          << "Error at row: " << r << " col: " << c << " "
-          << "w = " << width_ << " "
-          << "h = " << height_ << " "
-          << "filter group index = " << filter_ << " "
-          << "filter index = " << subpel_;
-    }
-    dst_ptr += stride;
-    dst_ref_ptr += stride;
-  }
-}
-
-void AV1ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
-  PrepFilterBuffer();
-
-  InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
-
-  av1_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_, height_,
-                       filter_params, subpel_, x_step_q4, &conv_params_);
-
-  conv_horiz_(src_, stride, dst_, stride, width_, height_, filter_params,
-              subpel_, x_step_q4, &conv_params_);
-
-  DiffFilterBuffer();
-
-  // Note:
-  // Here we need calculate a height which is different from the specified one
-  // and test again.
-  int intermediate_height =
-      (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
-  PrepFilterBuffer();
-
-  av1_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
-                       intermediate_height, filter_params, subpel_, x_step_q4,
-                       &conv_params_);
-
-  conv_horiz_(src_, stride, dst_, stride, width_, intermediate_height,
-              filter_params, subpel_, x_step_q4, &conv_params_);
-
-  DiffFilterBuffer();
-}
-
-void AV1ConvolveOptimzTest::RunVertFilterBitExactCheck() {
-  PrepFilterBuffer();
-
-  InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
-
-  av1_convolve_vert_c(src_ref_, stride, dst_ref_, stride, width_, height_,
-                      filter_params, subpel_, x_step_q4, &conv_params_);
-
-  conv_vert_(src_, stride, dst_, stride, width_, height_, filter_params,
-             subpel_, x_step_q4, &conv_params_);
-
-  DiffFilterBuffer();
-}
-
-TEST_P(AV1ConvolveOptimzTest, HorizBitExactCheck) {
-  RunHorizFilterBitExactCheck();
-}
-TEST_P(AV1ConvolveOptimzTest, VerticalBitExactCheck) {
-  RunVertFilterBitExactCheck();
-}
-
-using std::tr1::make_tuple;
-
-#if (HAVE_SSSE3 || HAVE_SSE4_1) && CONFIG_DUAL_FILTER
-const BlockDimension kBlockDim[] = {
-  make_tuple(2, 2),    make_tuple(2, 4),    make_tuple(4, 4),
-  make_tuple(4, 8),    make_tuple(8, 4),    make_tuple(8, 8),
-  make_tuple(8, 16),   make_tuple(16, 8),   make_tuple(16, 16),
-  make_tuple(16, 32),  make_tuple(32, 16),  make_tuple(32, 32),
-  make_tuple(32, 64),  make_tuple(64, 32),  make_tuple(64, 64),
-  make_tuple(64, 128), make_tuple(128, 64), make_tuple(128, 128),
-};
-
-// 10/12-tap filters
-const InterpFilter kFilter[] = { EIGHTTAP_REGULAR, BILINEAR, MULTITAP_SHARP };
-
-const int kSubpelQ4[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-
-const int kAvg[] = { 0, 1 };
-#endif
-
-#if HAVE_SSSE3 && CONFIG_DUAL_FILTER
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, AV1ConvolveOptimzTest,
-    ::testing::Combine(::testing::Values(av1_lowbd_convolve_init_ssse3),
-                       ::testing::Values(av1_convolve_horiz_ssse3),
-                       ::testing::Values(av1_convolve_vert_ssse3),
-                       ::testing::ValuesIn(kBlockDim),
-                       ::testing::ValuesIn(kFilter),
-                       ::testing::ValuesIn(kSubpelQ4),
-                       ::testing::ValuesIn(kAvg)));
-#endif  // HAVE_SSSE3 && CONFIG_DUAL_FILTER
-
-#if CONFIG_HIGHBITDEPTH
-typedef ::testing::TestWithParam<HbdConvParams> TestWithHbdConvParams;
-class AV1HbdConvolveOptimzTest : public TestWithHbdConvParams {
- public:
-  virtual ~AV1HbdConvolveOptimzTest() {}
-  virtual void SetUp() {
-    ConvInit conv_init = GET_PARAM(0);
-    conv_init();
-    conv_horiz_ = GET_PARAM(1);
-    conv_vert_ = GET_PARAM(2);
-    BlockDimension block = GET_PARAM(3);
-    width_ = std::tr1::get<0>(block);
-    height_ = std::tr1::get<1>(block);
-    filter_ = GET_PARAM(4);
-    subpel_ = GET_PARAM(5);
-    avg_ = GET_PARAM(6);
-    bit_depth_ = GET_PARAM(7);
-
-    alloc_ = new uint16_t[maxBlockSize * 4];
-    src_ = alloc_ + (vertiOffset * maxWidth);
-    src_ += horizOffset;
-    src_ref_ = src_ + maxBlockSize;
-
-    dst_ = alloc_ + 2 * maxBlockSize;
-    dst_ref_ = alloc_ + 3 * maxBlockSize;
-  }
-
-  virtual void TearDown() {
-    delete[] alloc_;
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  void RunHorizFilterBitExactCheck();
-  void RunVertFilterBitExactCheck();
-
- private:
-  void PrepFilterBuffer();
-  void DiffFilterBuffer();
-  hbd_conv_filter_t conv_horiz_;
-  hbd_conv_filter_t conv_vert_;
-  uint16_t *alloc_;
-  uint16_t *src_;
-  uint16_t *dst_;
-  uint16_t *src_ref_;
-  uint16_t *dst_ref_;
-  int width_;
-  int height_;
-  InterpFilter filter_;
-  int subpel_;
-  int avg_;
-  int bit_depth_;
-};
-
-void AV1HbdConvolveOptimzTest::PrepFilterBuffer() {
-  int r, c;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-  memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
-
-  uint16_t *src_ptr = src_;
-  uint16_t *dst_ptr = dst_;
-  uint16_t *dst_ref_ptr = dst_ref_;
-  uint16_t hbd_mask = (1 << bit_depth_) - 1;
-
-  for (r = 0; r < height_; ++r) {
-    for (c = 0; c < width_; ++c) {
-      src_ptr[c] = rnd.Rand16() & hbd_mask;
-      dst_ptr[c] = rnd.Rand16() & hbd_mask;
-      dst_ref_ptr[c] = dst_ptr[c];
-    }
-    src_ptr += stride;
-    dst_ptr += stride;
-    dst_ref_ptr += stride;
-  }
-}
-
-void AV1HbdConvolveOptimzTest::DiffFilterBuffer() {
-  int r, c;
-  const uint16_t *dst_ptr = dst_;
-  const uint16_t *dst_ref_ptr = dst_ref_;
-  for (r = 0; r < height_; ++r) {
-    for (c = 0; c < width_; ++c) {
-      EXPECT_EQ((uint16_t)dst_ref_ptr[c], (uint16_t)dst_ptr[c])
-          << "Error at row: " << r << " col: " << c << " "
-          << "w = " << width_ << " "
-          << "h = " << height_ << " "
-          << "filter group index = " << filter_ << " "
-          << "filter index = " << subpel_ << " "
-          << "bit depth = " << bit_depth_;
-    }
-    dst_ptr += stride;
-    dst_ref_ptr += stride;
-  }
-}
-
-void AV1HbdConvolveOptimzTest::RunHorizFilterBitExactCheck() {
-  PrepFilterBuffer();
-
-  InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
-
-  av1_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_, height_,
-                              filter_params, subpel_, x_step_q4, avg_,
-                              bit_depth_);
-
-  conv_horiz_(src_, stride, dst_, stride, width_, height_, filter_params,
-              subpel_, x_step_q4, avg_, bit_depth_);
-
-  DiffFilterBuffer();
-
-  // Note:
-  // Here we need calculate a height which is different from the specified one
-  // and test again.
-  int intermediate_height =
-      (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
-  PrepFilterBuffer();
-
-  av1_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
-                              intermediate_height, filter_params, subpel_,
-                              x_step_q4, avg_, bit_depth_);
-
-  conv_horiz_(src_, stride, dst_, stride, width_, intermediate_height,
-              filter_params, subpel_, x_step_q4, avg_, bit_depth_);
-
-  DiffFilterBuffer();
-}
-
-void AV1HbdConvolveOptimzTest::RunVertFilterBitExactCheck() {
-  PrepFilterBuffer();
-
-  InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
-
-  av1_highbd_convolve_vert_c(src_, stride, dst_ref_, stride, width_, height_,
-                             filter_params, subpel_, x_step_q4, avg_,
-                             bit_depth_);
-
-  conv_vert_(src_, stride, dst_, stride, width_, height_, filter_params,
-             subpel_, x_step_q4, avg_, bit_depth_);
-
-  DiffFilterBuffer();
-}
-
-TEST_P(AV1HbdConvolveOptimzTest, HorizBitExactCheck) {
-  RunHorizFilterBitExactCheck();
-}
-TEST_P(AV1HbdConvolveOptimzTest, VertBitExactCheck) {
-  RunVertFilterBitExactCheck();
-}
-
-#if HAVE_SSE4_1 && CONFIG_DUAL_FILTER
-
-const int kBitdepth[] = { 10, 12 };
-
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, AV1HbdConvolveOptimzTest,
-    ::testing::Combine(::testing::Values(av1_highbd_convolve_init_sse4_1),
-                       ::testing::Values(av1_highbd_convolve_horiz_sse4_1),
-                       ::testing::Values(av1_highbd_convolve_vert_sse4_1),
-                       ::testing::ValuesIn(kBlockDim),
-                       ::testing::ValuesIn(kFilter),
-                       ::testing::ValuesIn(kSubpelQ4),
-                       ::testing::ValuesIn(kAvg),
-                       ::testing::ValuesIn(kBitdepth)));
-#endif  // HAVE_SSE4_1 && CONFIG_DUAL_FILTER
-#endif  // CONFIG_HIGHBITDEPTH
-}  // namespace
diff --git a/third_party/aom/test/av1_convolve_scale_test.cc b/third_party/aom/test/av1_convolve_scale_test.cc
index 9d8be888d..e0571423c 100644
--- a/third_party/aom/test/av1_convolve_scale_test.cc
+++ b/third_party/aom/test/av1_convolve_scale_test.cc
@@ -13,13 +13,16 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom_ports/aom_timer.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
+#include "av1/common/common_data.h"
+
 namespace {
 const int kTestIters = 10;
 const int kPerfIters = 1000;
@@ -29,8 +32,8 @@ const int kHPad = 32;
 const int kXStepQn = 16;
 const int kYStepQn = 20;
 
-using std::tr1::tuple;
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
+using ::testing::tuple;
 using libaom_test::ACMRandom;
 
 enum NTaps { EIGHT_TAP, TEN_TAP, TWELVE_TAP };
@@ -120,6 +123,7 @@ class TestImage {
     // Allocate image data
     src_data_.resize(2 * src_block_size());
     dst_data_.resize(2 * dst_block_size());
+    dst_16_data_.resize(2 * dst_block_size());
   }
 
   void Initialize(ACMRandom *rnd);
@@ -136,8 +140,13 @@ class TestImage {
     return borders ? block : block + kHPad + src_stride_ * kVPad;
   }
 
-  int32_t *GetDstData(bool ref, bool borders) {
-    int32_t *block = &dst_data_[ref ? 0 : dst_block_size()];
+  SrcPixel *GetDstData(bool ref, bool borders) {
+    SrcPixel *block = &dst_data_[ref ? 0 : dst_block_size()];
+    return borders ? block : block + kHPad + dst_stride_ * kVPad;
+  }
+
+  CONV_BUF_TYPE *GetDst16Data(bool ref, bool borders) {
+    CONV_BUF_TYPE *block = &dst_16_data_[ref ? 0 : dst_block_size()];
     return borders ? block : block + kHPad + dst_stride_ * kVPad;
   }
 
@@ -146,7 +155,8 @@ class TestImage {
   int src_stride_, dst_stride_;
 
   std::vector<SrcPixel> src_data_;
-  std::vector<int32_t> dst_data_;
+  std::vector<SrcPixel> dst_data_;
+  std::vector<CONV_BUF_TYPE> dst_16_data_;
 };
 
 template <typename Pixel>
@@ -190,17 +200,23 @@ template <typename SrcPixel>
 void TestImage<SrcPixel>::Initialize(ACMRandom *rnd) {
   PrepBuffers(rnd, w_, h_, src_stride_, bd_, false, &src_data_[0]);
   PrepBuffers(rnd, w_, h_, dst_stride_, bd_, true, &dst_data_[0]);
+  PrepBuffers(rnd, w_, h_, dst_stride_, bd_, true, &dst_16_data_[0]);
 }
 
 template <typename SrcPixel>
 void TestImage<SrcPixel>::Check() const {
   // If memcmp returns 0, there's nothing to do.
   const int num_pixels = dst_block_size();
-  const int32_t *ref_dst = &dst_data_[0];
-  const int32_t *tst_dst = &dst_data_[num_pixels];
+  const SrcPixel *ref_dst = &dst_data_[0];
+  const SrcPixel *tst_dst = &dst_data_[num_pixels];
 
-  if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) return;
+  const CONV_BUF_TYPE *ref_16_dst = &dst_16_data_[0];
+  const CONV_BUF_TYPE *tst_16_dst = &dst_16_data_[num_pixels];
 
+  if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) {
+    if (0 == memcmp(ref_16_dst, tst_16_dst, sizeof(*ref_16_dst) * num_pixels))
+      return;
+  }
   // Otherwise, iterate through the buffer looking for differences (including
   // the edges)
   const int stride = dst_stride_;
@@ -213,6 +229,17 @@ void TestImage<SrcPixel>::Check() const {
           << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad);
     }
   }
+
+  for (int r = 0; r < h_ + 2 * kVPad; ++r) {
+    for (int c = 0; c < w_ + 2 * kHPad; ++c) {
+      const int32_t ref_value = ref_16_dst[r * stride + c];
+      const int32_t tst_value = tst_16_dst[r * stride + c];
+
+      EXPECT_EQ(tst_value, ref_value)
+          << "Error in 16 bit buffer "
+          << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad);
+    }
+  }
 }
 
 typedef tuple<int, int> BlockDimension;
@@ -242,8 +269,8 @@ class ConvolveScaleTestBase : public ::testing::Test {
 
  protected:
   void SetParams(const BaseParams &params, int bd) {
-    width_ = std::tr1::get<0>(params.dims);
-    height_ = std::tr1::get<1>(params.dims);
+    width_ = ::testing::get<0>(params.dims);
+    height_ = ::testing::get<1>(params.dims);
     ntaps_x_ = params.ntaps_x;
     ntaps_y_ = params.ntaps_y;
     bd_ = bd;
@@ -251,19 +278,54 @@ class ConvolveScaleTestBase : public ::testing::Test {
 
     filter_x_.set(ntaps_x_, false);
     filter_y_.set(ntaps_y_, true);
-    convolve_params_ = get_conv_params_no_round(0, avg_ != false, 0, NULL, 0);
+    convolve_params_ =
+        get_conv_params_no_round(0, avg_ != false, 0, NULL, 0, 1, bd);
 
     delete image_;
     image_ = new TestImage<SrcPixel>(width_, height_, bd_);
   }
 
+  void SetConvParamOffset(int i, int j, int is_compound, int do_average,
+                          int use_jnt_comp_avg) {
+    if (i == -1 && j == -1) {
+      convolve_params_.use_jnt_comp_avg = use_jnt_comp_avg;
+      convolve_params_.is_compound = is_compound;
+      convolve_params_.do_average = do_average;
+    } else {
+      convolve_params_.use_jnt_comp_avg = use_jnt_comp_avg;
+      convolve_params_.fwd_offset = quant_dist_lookup_table[i][j][0];
+      convolve_params_.bck_offset = quant_dist_lookup_table[i][j][1];
+      convolve_params_.is_compound = is_compound;
+      convolve_params_.do_average = do_average;
+    }
+  }
+
   void Run() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     for (int i = 0; i < kTestIters; ++i) {
+      int is_compound = 0;
+      SetConvParamOffset(-1, -1, is_compound, 0, 0);
       Prep(&rnd);
       RunOne(true);
       RunOne(false);
       image_->Check();
+
+      is_compound = 1;
+      for (int do_average = 0; do_average < 2; do_average++) {
+        for (int use_jnt_comp_avg = 0; use_jnt_comp_avg < 2;
+             use_jnt_comp_avg++) {
+          for (int j = 0; j < 2; ++j) {
+            for (int k = 0; k < 4; ++k) {
+              SetConvParamOffset(j, k, is_compound, do_average,
+                                 use_jnt_comp_avg);
+              Prep(&rnd);
+              RunOne(true);
+              RunOne(false);
+              image_->Check();
+            }
+          }
+        }
+      }
     }
   }
 
@@ -327,7 +389,7 @@ class ConvolveScaleTestBase : public ::testing::Test {
 typedef tuple<int, int> BlockDimension;
 
 typedef void (*LowbdConvolveFunc)(const uint8_t *src, int src_stride,
-                                  int32_t *dst, int dst_stride, int w, int h,
+                                  uint8_t *dst, int dst_stride, int w, int h,
                                   InterpFilterParams *filter_params_x,
                                   InterpFilterParams *filter_params_y,
                                   const int subpel_x_qn, const int x_step_qn,
@@ -359,10 +421,10 @@ class LowBDConvolveScaleTest
 
   void RunOne(bool ref) {
     const uint8_t *src = image_->GetSrcData(ref, false);
-    CONV_BUF_TYPE *dst = image_->GetDstData(ref, false);
+    uint8_t *dst = image_->GetDstData(ref, false);
+    convolve_params_.dst = image_->GetDst16Data(ref, false);
     const int src_stride = image_->src_stride();
     const int dst_stride = image_->dst_stride();
-
     if (ref) {
       av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, width_, height_,
                               &filter_x_.params_, &filter_y_.params_, subpel_x_,
@@ -387,7 +449,7 @@ const BlockDimension kBlockDim[] = {
   make_tuple(64, 128), make_tuple(128, 64), make_tuple(128, 128),
 };
 
-const NTaps kNTaps[] = { EIGHT_TAP, TEN_TAP, TWELVE_TAP };
+const NTaps kNTaps[] = { EIGHT_TAP };
 
 TEST_P(LowBDConvolveScaleTest, Check) { Run(); }
 TEST_P(LowBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
@@ -399,9 +461,8 @@ INSTANTIATE_TEST_CASE_P(
                        ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
                        ::testing::Bool()));
 
-#if CONFIG_HIGHBITDEPTH
 typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride,
-                                   int32_t *dst, int dst_stride, int w, int h,
+                                   uint16_t *dst, int dst_stride, int w, int h,
                                    InterpFilterParams *filter_params_x,
                                    InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
@@ -433,7 +494,8 @@ class HighBDConvolveScaleTest
 
   void RunOne(bool ref) {
     const uint16_t *src = image_->GetSrcData(ref, false);
-    CONV_BUF_TYPE *dst = image_->GetDstData(ref, false);
+    uint16_t *dst = image_->GetDstData(ref, false);
+    convolve_params_.dst = image_->GetDst16Data(ref, false);
     const int src_stride = image_->src_stride();
     const int dst_stride = image_->dst_stride();
 
@@ -464,6 +526,4 @@ INSTANTIATE_TEST_CASE_P(
                        ::testing::ValuesIn(kBlockDim),
                        ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
                        ::testing::Bool(), ::testing::ValuesIn(kBDs)));
-
-#endif  // CONFIG_HIGHBITDEPTH
 }  // namespace
diff --git a/third_party/aom/test/av1_convolve_test.cc b/third_party/aom/test/av1_convolve_test.cc
deleted file mode 100644
index aaef7cfe0..000000000
--- a/third_party/aom/test/av1_convolve_test.cc
+++ /dev/null
@@ -1,514 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <algorithm>
-#include <vector>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-#include "av1/common/filter.h"
-#include "av1/common/convolve.h"
-#include "test/acm_random.h"
-#include "test/util.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-using std::tr1::tuple;
-static void filter_block1d_horiz_c(const uint8_t *src_ptr, int src_stride,
-                                   const int16_t *filter, int tap,
-                                   uint8_t *dst_ptr, int dst_stride, int w,
-                                   int h) {
-  src_ptr -= tap / 2 - 1;
-  for (int r = 0; r < h; ++r) {
-    for (int c = 0; c < w; ++c) {
-      int sum = 0;
-      for (int i = 0; i < tap; ++i) {
-        sum += src_ptr[c + i] * filter[i];
-      }
-      dst_ptr[c] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-    }
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-static void filter_block1d_vert_c(const uint8_t *src_ptr, int src_stride,
-                                  const int16_t *filter, int tap,
-                                  uint8_t *dst_ptr, int dst_stride, int w,
-                                  int h) {
-  src_ptr -= (tap / 2 - 1) * src_stride;
-  for (int r = 0; r < h; ++r) {
-    for (int c = 0; c < w; ++c) {
-      int sum = 0;
-      for (int i = 0; i < tap; ++i) {
-        sum += src_ptr[c + i * src_stride] * filter[i];
-      }
-      dst_ptr[c] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-    }
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-static int match(const uint8_t *out, int out_stride, const uint8_t *ref_out,
-                 int ref_out_stride, int w, int h) {
-  for (int r = 0; r < h; ++r) {
-    for (int c = 0; c < w; ++c) {
-      if (out[r * out_stride + c] != ref_out[r * ref_out_stride + c]) return 0;
-    }
-  }
-  return 1;
-}
-
-typedef void (*ConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h,
-                             const InterpFilterParams filter_params,
-                             const int subpel_q4, int step_q4,
-                             ConvolveParams *conv_params);
-
-struct ConvolveFunctions {
-  ConvolveFunctions(ConvolveFunc hf, ConvolveFunc vf) : hf_(hf), vf_(vf) {}
-  ConvolveFunc hf_;
-  ConvolveFunc vf_;
-};
-
-typedef tuple<ConvolveFunctions *, InterpFilter /*filter_x*/,
-              InterpFilter /*filter_y*/>
-    ConvolveParam;
-
-class Av1ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
- public:
-  virtual void SetUp() {
-    rnd_(ACMRandom::DeterministicSeed());
-    cfs_ = GET_PARAM(0);
-    interp_filter_ls_[0] = GET_PARAM(2);
-    interp_filter_ls_[2] = interp_filter_ls_[0];
-    interp_filter_ls_[1] = GET_PARAM(1);
-    interp_filter_ls_[3] = interp_filter_ls_[1];
-  }
-  virtual void TearDown() {
-    while (buf_ls_.size() > 0) {
-      uint8_t *buf = buf_ls_.back();
-      aom_free(buf);
-      buf_ls_.pop_back();
-    }
-  }
-  virtual uint8_t *add_input(int w, int h, int *stride) {
-    uint8_t *buf =
-        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, kBufferSize));
-    buf_ls_.push_back(buf);
-    *stride = w + MAX_FILTER_TAP - 1;
-    int offset = MAX_FILTER_TAP / 2 - 1;
-    for (int r = 0; r < h + MAX_FILTER_TAP - 1; ++r) {
-      for (int c = 0; c < w + MAX_FILTER_TAP - 1; ++c) {
-        buf[r * (*stride) + c] = rnd_.Rand8();
-      }
-    }
-    return buf + offset * (*stride) + offset;
-  }
-  virtual uint8_t *add_output(int w, int /*h*/, int *stride) {
-    uint8_t *buf =
-        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, kBufferSize));
-    buf_ls_.push_back(buf);
-    *stride = w;
-    return buf;
-  }
-  virtual void random_init_buf(uint8_t *buf, int w, int h, int stride) {
-    for (int r = 0; r < h; ++r) {
-      for (int c = 0; c < w; ++c) {
-        buf[r * stride + c] = rnd_.Rand8();
-      }
-    }
-  }
-
- protected:
-  static const int kDataAlignment = 16;
-  static const int kOuterBlockSize = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
-  static const int kBufferSize = kOuterBlockSize * kOuterBlockSize;
-  std::vector<uint8_t *> buf_ls_;
-  InterpFilter interp_filter_ls_[4];
-  ConvolveFunctions *cfs_;
-  ACMRandom rnd_;
-};
-
-int bsize_ls[] = { 1, 2, 4, 8, 16, 32, 64, 3, 7, 15, 31, 63 };
-int bsize_num = NELEMENTS(bsize_ls);
-
-TEST_P(Av1ConvolveTest, av1_convolve_vert) {
-  const int y_step_q4 = 16;
-  ConvolveParams conv_params = get_conv_params(0, 0, 0);
-
-  int in_stride, out_stride, ref_out_stride, avg_out_stride, ref_avg_out_stride;
-  uint8_t *in = add_input(MAX_SB_SIZE, MAX_SB_SIZE, &in_stride);
-  uint8_t *out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &out_stride);
-  uint8_t *ref_out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &ref_out_stride);
-  uint8_t *avg_out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &avg_out_stride);
-  uint8_t *ref_avg_out =
-      add_output(MAX_SB_SIZE, MAX_SB_SIZE, &ref_avg_out_stride);
-  for (int hb_idx = 0; hb_idx < bsize_num; ++hb_idx) {
-    for (int vb_idx = 0; vb_idx < bsize_num; ++vb_idx) {
-      int w = bsize_ls[hb_idx];
-      int h = bsize_ls[vb_idx];
-      for (int subpel_y_q4 = 0; subpel_y_q4 < SUBPEL_SHIFTS; ++subpel_y_q4) {
-        InterpFilter filter_y = interp_filter_ls_[0];
-        InterpFilterParams param_vert = av1_get_interp_filter_params(filter_y);
-        const int16_t *filter_vert =
-            av1_get_interp_filter_subpel_kernel(param_vert, subpel_y_q4);
-
-        filter_block1d_vert_c(in, in_stride, filter_vert, param_vert.taps,
-                              ref_out, ref_out_stride, w, h);
-
-        conv_params.ref = 0;
-        conv_params.do_average = 0;
-        cfs_->vf_(in, in_stride, out, out_stride, w, h, param_vert, subpel_y_q4,
-                  y_step_q4, &conv_params);
-        EXPECT_EQ(match(out, out_stride, ref_out, ref_out_stride, w, h), 1)
-            << " hb_idx " << hb_idx << " vb_idx " << vb_idx << " filter_y "
-            << filter_y << " subpel_y_q4 " << subpel_y_q4;
-
-        random_init_buf(avg_out, w, h, avg_out_stride);
-        for (int r = 0; r < h; ++r) {
-          for (int c = 0; c < w; ++c) {
-            ref_avg_out[r * ref_avg_out_stride + c] = ROUND_POWER_OF_TWO(
-                avg_out[r * avg_out_stride + c] + out[r * out_stride + c], 1);
-          }
-        }
-        conv_params.ref = 1;
-        conv_params.do_average = 1;
-        cfs_->vf_(in, in_stride, avg_out, avg_out_stride, w, h, param_vert,
-                  subpel_y_q4, y_step_q4, &conv_params);
-        EXPECT_EQ(match(avg_out, avg_out_stride, ref_avg_out,
-                        ref_avg_out_stride, w, h),
-                  1)
-            << " hb_idx " << hb_idx << " vb_idx " << vb_idx << " filter_y "
-            << filter_y << " subpel_y_q4 " << subpel_y_q4;
-      }
-    }
-  }
-};
-
-TEST_P(Av1ConvolveTest, av1_convolve_horiz) {
-  const int x_step_q4 = 16;
-  ConvolveParams conv_params = get_conv_params(0, 0, 0);
-
-  int in_stride, out_stride, ref_out_stride, avg_out_stride, ref_avg_out_stride;
-  uint8_t *in = add_input(MAX_SB_SIZE, MAX_SB_SIZE, &in_stride);
-  uint8_t *out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &out_stride);
-  uint8_t *ref_out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &ref_out_stride);
-  uint8_t *avg_out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &avg_out_stride);
-  uint8_t *ref_avg_out =
-      add_output(MAX_SB_SIZE, MAX_SB_SIZE, &ref_avg_out_stride);
-  for (int hb_idx = 0; hb_idx < bsize_num; ++hb_idx) {
-    for (int vb_idx = 0; vb_idx < bsize_num; ++vb_idx) {
-      int w = bsize_ls[hb_idx];
-      int h = bsize_ls[vb_idx];
-      for (int subpel_x_q4 = 0; subpel_x_q4 < SUBPEL_SHIFTS; ++subpel_x_q4) {
-        InterpFilter filter_x = interp_filter_ls_[1];
-        InterpFilterParams param_horiz = av1_get_interp_filter_params(filter_x);
-        const int16_t *filter_horiz =
-            av1_get_interp_filter_subpel_kernel(param_horiz, subpel_x_q4);
-
-        filter_block1d_horiz_c(in, in_stride, filter_horiz, param_horiz.taps,
-                               ref_out, ref_out_stride, w, h);
-
-        conv_params.ref = 0;
-        conv_params.do_average = 0;
-        cfs_->hf_(in, in_stride, out, out_stride, w, h, param_horiz,
-                  subpel_x_q4, x_step_q4, &conv_params);
-        EXPECT_EQ(match(out, out_stride, ref_out, ref_out_stride, w, h), 1)
-            << " hb_idx " << hb_idx << " vb_idx " << vb_idx << " filter_x "
-            << filter_x << " subpel_x_q4 " << subpel_x_q4;
-
-        random_init_buf(avg_out, w, h, avg_out_stride);
-        for (int r = 0; r < h; ++r) {
-          for (int c = 0; c < w; ++c) {
-            ref_avg_out[r * ref_avg_out_stride + c] = ROUND_POWER_OF_TWO(
-                avg_out[r * avg_out_stride + c] + out[r * out_stride + c], 1);
-          }
-        }
-        conv_params.ref = 1;
-        conv_params.do_average = 1;
-        cfs_->hf_(in, in_stride, avg_out, avg_out_stride, w, h, param_horiz,
-                  subpel_x_q4, x_step_q4, &conv_params);
-        EXPECT_EQ(match(avg_out, avg_out_stride, ref_avg_out,
-                        ref_avg_out_stride, w, h),
-                  1)
-            << "hb_idx " << hb_idx << "vb_idx" << vb_idx << " filter_x "
-            << filter_x << "subpel_x_q4 " << subpel_x_q4;
-      }
-    }
-  }
-};
-
-ConvolveFunctions convolve_functions_c(av1_convolve_horiz_c,
-                                       av1_convolve_vert_c);
-
-InterpFilter filter_ls[] = { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH,
-                             MULTITAP_SHARP };
-
-INSTANTIATE_TEST_CASE_P(
-    C, Av1ConvolveTest,
-    ::testing::Combine(::testing::Values(&convolve_functions_c),
-                       ::testing::ValuesIn(filter_ls),
-                       ::testing::ValuesIn(filter_ls)));
-
-#if CONFIG_HIGHBITDEPTH
-#ifndef __clang_analyzer__
-TEST(AV1ConvolveTest, av1_highbd_convolve) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  InterpFilters interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-  InterpFilterParams filter_params =
-      av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-  int filter_size = filter_params.taps;
-  int filter_center = filter_size / 2 - 1;
-  uint16_t src[12 * 12];
-  int src_stride = filter_size;
-  uint16_t dst[1] = { 0 };
-  int dst_stride = 1;
-  int x_step_q4 = 16;
-  int y_step_q4 = 16;
-  int avg = 0;
-  int bd = 10;
-  int w = 1;
-  int h = 1;
-
-  int subpel_x_q4;
-  int subpel_y_q4;
-
-  for (int i = 0; i < filter_size * filter_size; i++) {
-    src[i] = rnd.Rand16() % (1 << bd);
-  }
-
-  for (subpel_x_q4 = 0; subpel_x_q4 < SUBPEL_SHIFTS; subpel_x_q4++) {
-    for (subpel_y_q4 = 0; subpel_y_q4 < SUBPEL_SHIFTS; subpel_y_q4++) {
-      av1_highbd_convolve(
-          CONVERT_TO_BYTEPTR(src + src_stride * filter_center + filter_center),
-          src_stride, CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, interp_filters,
-          subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg, bd);
-
-      const int16_t *x_filter =
-          av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
-      const int16_t *y_filter =
-          av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
-
-      int temp[12];
-      int dst_ref = 0;
-      for (int r = 0; r < filter_size; r++) {
-        temp[r] = 0;
-        for (int c = 0; c < filter_size; c++) {
-          temp[r] += x_filter[c] * src[r * filter_size + c];
-        }
-        temp[r] =
-            clip_pixel_highbd(ROUND_POWER_OF_TWO(temp[r], FILTER_BITS), bd);
-        dst_ref += temp[r] * y_filter[r];
-      }
-      dst_ref = clip_pixel_highbd(ROUND_POWER_OF_TWO(dst_ref, FILTER_BITS), bd);
-      EXPECT_EQ(dst[0], dst_ref);
-    }
-  }
-}
-#endif
-
-TEST(AV1ConvolveTest, av1_highbd_convolve_avg) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  InterpFilters interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-  InterpFilterParams filter_params =
-      av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-  int filter_size = filter_params.taps;
-  int filter_center = filter_size / 2 - 1;
-  uint16_t src0[12 * 12];
-  uint16_t src1[12 * 12];
-  int src_stride = filter_size;
-  uint16_t dst0[1] = { 0 };
-  uint16_t dst1[1] = { 0 };
-  uint16_t dst[1] = { 0 };
-  int dst_stride = 1;
-  int x_step_q4 = 16;
-  int y_step_q4 = 16;
-  int avg = 0;
-  int bd = 10;
-
-  int w = 1;
-  int h = 1;
-
-  int subpel_x_q4;
-  int subpel_y_q4;
-
-  for (int i = 0; i < filter_size * filter_size; i++) {
-    src0[i] = rnd.Rand16() % (1 << bd);
-    src1[i] = rnd.Rand16() % (1 << bd);
-  }
-
-  for (subpel_x_q4 = 0; subpel_x_q4 < SUBPEL_SHIFTS; subpel_x_q4++) {
-    for (subpel_y_q4 = 0; subpel_y_q4 < SUBPEL_SHIFTS; subpel_y_q4++) {
-      int offset = filter_size * filter_center + filter_center;
-
-      avg = 0;
-      av1_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
-                          CONVERT_TO_BYTEPTR(dst0), dst_stride, w, h,
-                          interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
-                          y_step_q4, avg, bd);
-      avg = 0;
-      av1_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
-                          CONVERT_TO_BYTEPTR(dst1), dst_stride, w, h,
-                          interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
-                          y_step_q4, avg, bd);
-
-      avg = 0;
-      av1_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
-                          CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
-                          interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
-                          y_step_q4, avg, bd);
-      avg = 1;
-      av1_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
-                          CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
-                          interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
-                          y_step_q4, avg, bd);
-
-      EXPECT_EQ(dst[0], ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1));
-    }
-  }
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-#define CONVOLVE_SPEED_TEST 0
-#if CONVOLVE_SPEED_TEST
-#define highbd_convolve_speed(func, block_size, frame_size)                  \
-  TEST(AV1ConvolveTest, func##_speed_##block_size##_##frame_size) {          \
-    ACMRandom rnd(ACMRandom::DeterministicSeed());                           \
-    InterpFilter interp_filter = EIGHTTAP;                                   \
-    InterpFilterParams filter_params =                                       \
-        av1_get_interp_filter_params(interp_filter);                         \
-    int filter_size = filter_params.tap;                                     \
-    int filter_center = filter_size / 2 - 1;                                 \
-    DECLARE_ALIGNED(16, uint16_t,                                            \
-                    src[(frame_size + 7) * (frame_size + 7)]) = { 0 };       \
-    int src_stride = frame_size + 7;                                         \
-    DECLARE_ALIGNED(16, uint16_t, dst[frame_size * frame_size]) = { 0 };     \
-    int dst_stride = frame_size;                                             \
-    int x_step_q4 = 16;                                                      \
-    int y_step_q4 = 16;                                                      \
-    int subpel_x_q4 = 8;                                                     \
-    int subpel_y_q4 = 6;                                                     \
-    int bd = 10;                                                             \
-                                                                             \
-    int w = block_size;                                                      \
-    int h = block_size;                                                      \
-                                                                             \
-    const int16_t *filter_x =                                                \
-        av1_get_interp_filter_kernel(filter_params, subpel_x_q4);            \
-    const int16_t *filter_y =                                                \
-        av1_get_interp_filter_kernel(filter_params, subpel_y_q4);            \
-                                                                             \
-    for (int i = 0; i < src_stride * src_stride; i++) {                      \
-      src[i] = rnd.Rand16() % (1 << bd);                                     \
-    }                                                                        \
-                                                                             \
-    int offset = filter_center * src_stride + filter_center;                 \
-    int row_offset = 0;                                                      \
-    int col_offset = 0;                                                      \
-    for (int i = 0; i < 100000; i++) {                                       \
-      int src_total_offset = offset + col_offset * src_stride + row_offset;  \
-      int dst_total_offset = col_offset * dst_stride + row_offset;           \
-      func(CONVERT_TO_BYTEPTR(src + src_total_offset), src_stride,           \
-           CONVERT_TO_BYTEPTR(dst + dst_total_offset), dst_stride, filter_x, \
-           x_step_q4, filter_y, y_step_q4, w, h, bd);                        \
-      if (offset + w + w < frame_size) {                                     \
-        row_offset += w;                                                     \
-      } else {                                                               \
-        row_offset = 0;                                                      \
-        col_offset += h;                                                     \
-      }                                                                      \
-      if (col_offset + h >= frame_size) {                                    \
-        col_offset = 0;                                                      \
-      }                                                                      \
-    }                                                                        \
-  }
-
-#define lowbd_convolve_speed(func, block_size, frame_size)                  \
-  TEST(AV1ConvolveTest, func##_speed_l_##block_size##_##frame_size) {       \
-    ACMRandom rnd(ACMRandom::DeterministicSeed());                          \
-    InterpFilter interp_filter = EIGHTTAP;                                  \
-    InterpFilterParams filter_params =                                      \
-        av1_get_interp_filter_params(interp_filter);                        \
-    int filter_size = filter_params.tap;                                    \
-    int filter_center = filter_size / 2 - 1;                                \
-    DECLARE_ALIGNED(16, uint8_t, src[(frame_size + 7) * (frame_size + 7)]); \
-    int src_stride = frame_size + 7;                                        \
-    DECLARE_ALIGNED(16, uint8_t, dst[frame_size * frame_size]);             \
-    int dst_stride = frame_size;                                            \
-    int x_step_q4 = 16;                                                     \
-    int y_step_q4 = 16;                                                     \
-    int subpel_x_q4 = 8;                                                    \
-    int subpel_y_q4 = 6;                                                    \
-    int bd = 8;                                                             \
-                                                                            \
-    int w = block_size;                                                     \
-    int h = block_size;                                                     \
-                                                                            \
-    const int16_t *filter_x =                                               \
-        av1_get_interp_filter_kernel(filter_params, subpel_x_q4);           \
-    const int16_t *filter_y =                                               \
-        av1_get_interp_filter_kernel(filter_params, subpel_y_q4);           \
-                                                                            \
-    for (int i = 0; i < src_stride * src_stride; i++) {                     \
-      src[i] = rnd.Rand16() % (1 << bd);                                    \
-    }                                                                       \
-                                                                            \
-    int offset = filter_center * src_stride + filter_center;                \
-    int row_offset = 0;                                                     \
-    int col_offset = 0;                                                     \
-    for (int i = 0; i < 100000; i++) {                                      \
-      func(src + offset, src_stride, dst, dst_stride, filter_x, x_step_q4,  \
-           filter_y, y_step_q4, w, h);                                      \
-      if (offset + w + w < frame_size) {                                    \
-        row_offset += w;                                                    \
-      } else {                                                              \
-        row_offset = 0;                                                     \
-        col_offset += h;                                                    \
-      }                                                                     \
-      if (col_offset + h >= frame_size) {                                   \
-        col_offset = 0;                                                     \
-      }                                                                     \
-    }                                                                       \
-  }
-
-// This experiment shows that when frame size is 64x64
-// aom_highbd_convolve8_sse2 and aom_convolve8_sse2's speed are similar.
-// However when frame size becomes 1024x1024
-// aom_highbd_convolve8_sse2 is around 50% slower than aom_convolve8_sse2
-// we think the bottleneck is from memory IO
-highbd_convolve_speed(aom_highbd_convolve8_sse2, 8, 64);
-highbd_convolve_speed(aom_highbd_convolve8_sse2, 16, 64);
-highbd_convolve_speed(aom_highbd_convolve8_sse2, 32, 64);
-highbd_convolve_speed(aom_highbd_convolve8_sse2, 64, 64);
-
-lowbd_convolve_speed(aom_convolve8_sse2, 8, 64);
-lowbd_convolve_speed(aom_convolve8_sse2, 16, 64);
-lowbd_convolve_speed(aom_convolve8_sse2, 32, 64);
-lowbd_convolve_speed(aom_convolve8_sse2, 64, 64);
-
-highbd_convolve_speed(aom_highbd_convolve8_sse2, 8, 1024);
-highbd_convolve_speed(aom_highbd_convolve8_sse2, 16, 1024);
-highbd_convolve_speed(aom_highbd_convolve8_sse2, 32, 1024);
-highbd_convolve_speed(aom_highbd_convolve8_sse2, 64, 1024);
-
-lowbd_convolve_speed(aom_convolve8_sse2, 8, 1024);
-lowbd_convolve_speed(aom_convolve8_sse2, 16, 1024);
-lowbd_convolve_speed(aom_convolve8_sse2, 32, 1024);
-lowbd_convolve_speed(aom_convolve8_sse2, 64, 1024);
-#endif  // CONVOLVE_SPEED_TEST
-}  // namespace
diff --git a/third_party/aom/test/av1_dct_test.cc b/third_party/aom/test/av1_dct_test.cc
deleted file mode 100644
index fdaf9abb9..000000000
--- a/third_party/aom/test/av1_dct_test.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <math.h>
-#include <stdlib.h>
-#include <new>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/util.h"
-#include "./aom_config.h"
-#include "aom_ports/msvc.h"
-
-#undef CONFIG_COEFFICIENT_RANGE_CHECKING
-#define CONFIG_COEFFICIENT_RANGE_CHECKING 1
-#define AV1_DCT_GTEST
-#include "av1/encoder/dct.c"
-#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
-    CONFIG_DAALA_DCT32
-#include "av1/common/daala_tx.c"
-#endif
-
-using libaom_test::ACMRandom;
-
-namespace {
-void reference_dct_1d(const double *in, double *out, int size) {
-  const double kInvSqrt2 = 0.707106781186547524400844362104;
-  for (int k = 0; k < size; ++k) {
-    out[k] = 0;
-    for (int n = 0; n < size; ++n) {
-      out[k] += in[n] * cos(PI * (2 * n + 1) * k / (2 * size));
-    }
-    if (k == 0) out[k] = out[k] * kInvSqrt2;
-  }
-}
-
-typedef void (*FdctFuncRef)(const double *in, double *out, int size);
-typedef void (*IdctFuncRef)(const double *in, double *out, int size);
-typedef void (*FdctFunc)(const tran_low_t *in, tran_low_t *out);
-typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
-
-class TransTestBase {
- public:
-  virtual ~TransTestBase() {}
-
- protected:
-  void RunFwdAccuracyCheck() {
-    tran_low_t *input = new tran_low_t[txfm_size_];
-    tran_low_t *output = new tran_low_t[txfm_size_];
-    double *ref_input = new double[txfm_size_];
-    double *ref_output = new double[txfm_size_];
-
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    for (int ti = 0; ti < count_test_block; ++ti) {
-      for (int ni = 0; ni < txfm_size_; ++ni) {
-        input[ni] = rnd.Rand8() - rnd.Rand8();
-        ref_input[ni] = static_cast<double>(input[ni]);
-      }
-
-      fwd_txfm_(input, output);
-      fwd_txfm_ref_(ref_input, ref_output, txfm_size_);
-
-      for (int ni = 0; ni < txfm_size_; ++ni) {
-        EXPECT_LE(
-            abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
-            max_error_);
-      }
-    }
-
-    delete[] input;
-    delete[] output;
-    delete[] ref_input;
-    delete[] ref_output;
-  }
-
-  double max_error_;
-  int txfm_size_;
-  FdctFunc fwd_txfm_;
-  FdctFuncRef fwd_txfm_ref_;
-};
-
-typedef std::tr1::tuple<FdctFunc, FdctFuncRef, int, int> FdctParam;
-class AV1FwdTxfm : public TransTestBase,
-                   public ::testing::TestWithParam<FdctParam> {
- public:
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    fwd_txfm_ref_ = GET_PARAM(1);
-    txfm_size_ = GET_PARAM(2);
-    max_error_ = GET_PARAM(3);
-  }
-  virtual void TearDown() {}
-};
-
-TEST_P(AV1FwdTxfm, RunFwdAccuracyCheck) { RunFwdAccuracyCheck(); }
-
-INSTANTIATE_TEST_CASE_P(
-    C, AV1FwdTxfm,
-    ::testing::Values(FdctParam(&fdct4, &reference_dct_1d, 4, 1),
-                      FdctParam(&fdct8, &reference_dct_1d, 8, 1),
-                      FdctParam(&fdct16, &reference_dct_1d, 16, 2),
-                      FdctParam(&fdct32, &reference_dct_1d, 32, 3)));
-}  // namespace
diff --git a/third_party/aom/test/av1_ext_tile_test.cc b/third_party/aom/test/av1_ext_tile_test.cc
index 034b07167..d2abbab7f 100644
--- a/third_party/aom/test/av1_ext_tile_test.cc
+++ b/third_party/aom/test/av1_ext_tile_test.cc
@@ -46,6 +46,7 @@ class AV1ExtTileTest
     cfg.allow_lowbitdepth = 1;
 
     decoder_ = codec_->CreateDecoder(cfg, 0);
+    decoder_->Control(AV1_SET_TILE_MODE, 1);
     decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
     decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
 
@@ -86,13 +87,8 @@ class AV1ExtTileTest
       encoder->Control(AV1E_SET_TILE_ROWS, kTileSize);
       // TODO(yunqingwang): test single_tile_decoding = 0.
       encoder->Control(AV1E_SET_SINGLE_TILE_DECODING, 1);
-#if CONFIG_EXT_PARTITION
       // Always use 64x64 max partition.
       encoder->Control(AV1E_SET_SUPERBLOCK_SIZE, AOM_SUPERBLOCK_SIZE_64X64);
-#endif
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-      encoder->Control(AV1E_SET_TILE_LOOPFILTER, 0);
-#endif
     }
 
     if (video->frame() == 1) {
@@ -174,6 +170,23 @@ class AV1ExtTileTest
     }
   }
 
+  void TestRoundTrip() {
+    ::libaom_test::I420VideoSource video(
+        "hantro_collage_w352h288.yuv", kImgWidth, kImgHeight, 30, 1, 0, kLimit);
+    cfg_.rc_target_bitrate = 500;
+    cfg_.g_error_resilient = AOM_ERROR_RESILIENT_DEFAULT;
+    cfg_.large_scale_tile = 1;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_threads = 1;
+
+    // Tile encoding
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+    // Compare to check if two vectors are equal.
+    ASSERT_EQ(md5_, tile_md5_);
+  }
+
   ::libaom_test::TestMode encoding_mode_;
   int set_cpu_used_;
   ::libaom_test::Decoder *decoder_;
@@ -182,25 +195,19 @@ class AV1ExtTileTest
   std::vector<std::string> tile_md5_;
 };
 
-TEST_P(AV1ExtTileTest, DecoderResultTest) {
-  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", kImgWidth,
-                                       kImgHeight, 30, 1, 0, kLimit);
-  cfg_.rc_target_bitrate = 500;
-  cfg_.g_error_resilient = AOM_ERROR_RESILIENT_DEFAULT;
-  cfg_.large_scale_tile = 1;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.g_threads = 1;
+TEST_P(AV1ExtTileTest, DISABLED_DecoderResultTest) { TestRoundTrip(); }
 
-  // Tile encoding
-  init_flags_ = AOM_CODEC_USE_PSNR;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+AV1_INSTANTIATE_TEST_CASE(
+    // Now only test 2-pass mode.
+    AV1ExtTileTest, ::testing::Values(::libaom_test::kTwoPassGood),
+    ::testing::Range(1, 4));
 
-  // Compare to check if two vectors are equal.
-  ASSERT_EQ(md5_, tile_md5_);
-}
+class AV1ExtTileTestLarge : public AV1ExtTileTest {};
+
+TEST_P(AV1ExtTileTestLarge, DISABLED_DecoderResultTest) { TestRoundTrip(); }
 
 AV1_INSTANTIATE_TEST_CASE(
     // Now only test 2-pass mode.
-    AV1ExtTileTest, ::testing::Values(::libaom_test::kTwoPassGood),
-    ::testing::Range(0, 4));
+    AV1ExtTileTestLarge, ::testing::Values(::libaom_test::kTwoPassGood),
+    ::testing::Range(0, 1));
 }  // namespace
diff --git a/third_party/aom/test/av1_fht16x16_test.cc b/third_party/aom/test/av1_fht16x16_test.cc
deleted file mode 100644
index 21235a837..000000000
--- a/third_party/aom/test/av1_fht16x16_test.cc
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-#include "aom_ports/mem.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-using std::tr1::tuple;
-using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht16x16Param;
-
-void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
-                  TxfmParam *txfm_param) {
-  av1_fht16x16_c(in, out, stride, txfm_param);
-}
-
-void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
-                  const TxfmParam *txfm_param) {
-  av1_iht16x16_256_add_c(in, dest, stride, txfm_param);
-}
-
-#if CONFIG_HIGHBITDEPTH
-typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                           TX_TYPE tx_type, int bd);
-typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
-                          TX_TYPE tx_type, int bd);
-
-// Target optimized function, tx_type, bit depth
-typedef tuple<HbdHtFunc, TX_TYPE, int> HighbdHt16x16Param;
-
-void highbd_fht16x16_ref(const int16_t *in, int32_t *out, int stride,
-                         TX_TYPE tx_type, int bd) {
-  av1_fwd_txfm2d_16x16_c(in, out, stride, tx_type, bd);
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-class AV1Trans16x16HT : public libaom_test::TransformTestBase,
-                        public ::testing::TestWithParam<Ht16x16Param> {
- public:
-  virtual ~AV1Trans16x16HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 16;
-    height_ = 16;
-    fwd_txfm_ref = fht16x16_ref;
-    inv_txfm_ref = iht16x16_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(AV1Trans16x16HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans16x16HT, AccuracyCheck) { RunAccuracyCheck(1, 0.001); }
-TEST_P(AV1Trans16x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-TEST_P(AV1Trans16x16HT, CoeffCheck) { RunCoeffCheck(); }
-TEST_P(AV1Trans16x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-
-#if CONFIG_HIGHBITDEPTH
-class AV1HighbdTrans16x16HT
-    : public ::testing::TestWithParam<HighbdHt16x16Param> {
- public:
-  virtual ~AV1HighbdTrans16x16HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    fwd_txfm_ref_ = highbd_fht16x16_ref;
-    tx_type_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = 256;
-
-    input_ = reinterpret_cast<int16_t *>(
-        aom_memalign(32, sizeof(int16_t) * num_coeffs_));
-    output_ = reinterpret_cast<int32_t *>(
-        aom_memalign(32, sizeof(int32_t) * num_coeffs_));
-    output_ref_ = reinterpret_cast<int32_t *>(
-        aom_memalign(32, sizeof(int32_t) * num_coeffs_));
-  }
-
-  virtual void TearDown() {
-    aom_free(input_);
-    aom_free(output_);
-    aom_free(output_ref_);
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  void RunBitexactCheck();
-
- private:
-  HbdHtFunc fwd_txfm_;
-  HbdHtFunc fwd_txfm_ref_;
-  TX_TYPE tx_type_;
-  int bit_depth_;
-  int mask_;
-  int num_coeffs_;
-  int16_t *input_;
-  int32_t *output_;
-  int32_t *output_ref_;
-};
-
-void AV1HighbdTrans16x16HT::RunBitexactCheck() {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int i, j;
-  const int stride = 16;
-  const int num_tests = 1000;
-
-  for (i = 0; i < num_tests; ++i) {
-    for (j = 0; j < num_coeffs_; ++j) {
-      input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-    }
-
-    fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(
-        fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_));
-
-    for (j = 0; j < num_coeffs_; ++j) {
-      EXPECT_EQ(output_ref_[j], output_[j])
-          << "Not bit-exact result at index: " << j << " at test block: " << i;
-    }
-  }
-}
-
-TEST_P(AV1HighbdTrans16x16HT, HighbdCoeffCheck) { RunBitexactCheck(); }
-#endif  // CONFIG_HIGHBITDEPTH
-
-using std::tr1::make_tuple;
-
-#if HAVE_SSE2 && !CONFIG_DAALA_DCT16
-const Ht16x16Param kArrayHt16x16Param_sse2[] = {
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, DCT_DCT,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, ADST_DCT,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, DCT_ADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, ADST_ADST,
-             AOM_BITS_8, 256),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, FLIPADST_DCT,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, DCT_FLIPADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, FLIPADST_FLIPADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, ADST_FLIPADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, FLIPADST_ADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, IDTX, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, V_DCT, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, H_DCT, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, V_ADST, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, H_ADST, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, V_FLIPADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, H_FLIPADST,
-             AOM_BITS_8, 256)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x16HT,
-                        ::testing::ValuesIn(kArrayHt16x16Param_sse2));
-#endif  // HAVE_SSE2
-
-#if HAVE_AVX2 && !CONFIG_DAALA_DCT16
-const Ht16x16Param kArrayHt16x16Param_avx2[] = {
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, DCT_DCT,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, ADST_DCT,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, DCT_ADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, ADST_ADST,
-             AOM_BITS_8, 256),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, FLIPADST_DCT,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, DCT_FLIPADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, FLIPADST_FLIPADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, ADST_FLIPADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, FLIPADST_ADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, IDTX, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, V_DCT, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, H_DCT, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, V_ADST, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, H_ADST, AOM_BITS_8,
-             256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, V_FLIPADST,
-             AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, H_FLIPADST,
-             AOM_BITS_8, 256)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans16x16HT,
-                        ::testing::ValuesIn(kArrayHt16x16Param_avx2));
-#endif  // HAVE_AVX2
-
-#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT16
-const HighbdHt16x16Param kArrayHBDHt16x16Param_sse4_1[] = {
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_DCT, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_DCT, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_DCT, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_DCT, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_ADST, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_ADST, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_ADST, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_ADST, 12),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_DCT, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_DCT, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_FLIPADST, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, DCT_FLIPADST, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_FLIPADST, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_FLIPADST, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_FLIPADST, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, ADST_FLIPADST, 12),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_ADST, 10),
-  make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, FLIPADST_ADST, 12),
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans16x16HT,
-                        ::testing::ValuesIn(kArrayHBDHt16x16Param_sse4_1));
-#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT16
-
-}  // namespace
diff --git a/third_party/aom/test/av1_fht16x32_test.cc b/third_party/aom/test/av1_fht16x32_test.cc
deleted file mode 100644
index 0b3928f64..000000000
--- a/third_party/aom/test/av1_fht16x32_test.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-using std::tr1::tuple;
-using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht16x32Param;
-
-void fht16x32_ref(const int16_t *in, tran_low_t *out, int stride,
-                  TxfmParam *txfm_param) {
-  av1_fht16x32_c(in, out, stride, txfm_param);
-}
-
-void iht16x32_ref(const tran_low_t *in, uint8_t *out, int stride,
-                  const TxfmParam *txfm_param) {
-  av1_iht16x32_512_add_c(in, out, stride, txfm_param);
-}
-
-class AV1Trans16x32HT : public libaom_test::TransformTestBase,
-                        public ::testing::TestWithParam<Ht16x32Param> {
- public:
-  virtual ~AV1Trans16x32HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 16;
-    height_ = 32;
-    fwd_txfm_ref = fht16x32_ref;
-    inv_txfm_ref = iht16x32_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(AV1Trans16x32HT, AccuracyCheck) { RunAccuracyCheck(4, 0.2); }
-TEST_P(AV1Trans16x32HT, CoeffCheck) { RunCoeffCheck(); }
-TEST_P(AV1Trans16x32HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans16x32HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-TEST_P(AV1Trans16x32HT, InvAccuracyCheck) { RunInvAccuracyCheck(4); }
-
-using std::tr1::make_tuple;
-const Ht16x32Param kArrayHt16x32Param_c[] = {
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, DCT_DCT, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, ADST_DCT, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, DCT_ADST, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, ADST_ADST, AOM_BITS_8,
-             512),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, FLIPADST_DCT, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, DCT_FLIPADST, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, FLIPADST_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, ADST_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, FLIPADST_ADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, IDTX, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, V_DCT, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, H_DCT, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, V_ADST, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, H_ADST, AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, V_FLIPADST, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, H_FLIPADST, AOM_BITS_8,
-             512)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(C, AV1Trans16x32HT,
-                        ::testing::ValuesIn(kArrayHt16x32Param_c));
-
-#if HAVE_SSE2
-const Ht16x32Param kArrayHt16x32Param_sse2[] = {
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, DCT_DCT,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, ADST_DCT,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, DCT_ADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, ADST_ADST,
-             AOM_BITS_8, 512),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, FLIPADST_DCT,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, DCT_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, FLIPADST_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, ADST_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, FLIPADST_ADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, IDTX, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, V_DCT, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, H_DCT, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, V_ADST, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, H_ADST, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, V_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, H_FLIPADST,
-             AOM_BITS_8, 512)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x32HT,
-                        ::testing::ValuesIn(kArrayHt16x32Param_sse2));
-#endif  // HAVE_SSE2
-
-}  // namespace
diff --git a/third_party/aom/test/av1_fht16x8_test.cc b/third_party/aom/test/av1_fht16x8_test.cc
deleted file mode 100644
index 3ee1a0830..000000000
--- a/third_party/aom/test/av1_fht16x8_test.cc
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-using std::tr1::tuple;
-using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht16x8Param;
-
-void fht16x8_ref(const int16_t *in, tran_low_t *out, int stride,
-                 TxfmParam *txfm_param) {
-  av1_fht16x8_c(in, out, stride, txfm_param);
-}
-
-void iht16x8_ref(const tran_low_t *in, uint8_t *out, int stride,
-                 const TxfmParam *txfm_param) {
-  av1_iht16x8_128_add_c(in, out, stride, txfm_param);
-}
-
-class AV1Trans16x8HT : public libaom_test::TransformTestBase,
-                       public ::testing::TestWithParam<Ht16x8Param> {
- public:
-  virtual ~AV1Trans16x8HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 16;
-    height_ = 8;
-    inv_txfm_ref = iht16x8_ref;
-    fwd_txfm_ref = fht16x8_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(AV1Trans16x8HT, AccuracyCheck) { RunAccuracyCheck(1, 0.001); }
-TEST_P(AV1Trans16x8HT, CoeffCheck) { RunCoeffCheck(); }
-TEST_P(AV1Trans16x8HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans16x8HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-TEST_P(AV1Trans16x8HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-
-using std::tr1::make_tuple;
-
-const Ht16x8Param kArrayHt16x8Param_c[] = {
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, DCT_DCT, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, ADST_DCT, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, DCT_ADST, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, ADST_ADST, AOM_BITS_8,
-             128),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, FLIPADST_DCT, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, DCT_FLIPADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, FLIPADST_FLIPADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, ADST_FLIPADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, FLIPADST_ADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, IDTX, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, V_DCT, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, H_DCT, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, V_ADST, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, H_ADST, AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, V_FLIPADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_c, &av1_iht16x8_128_add_c, H_FLIPADST, AOM_BITS_8,
-             128)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(C, AV1Trans16x8HT,
-                        ::testing::ValuesIn(kArrayHt16x8Param_c));
-
-#if HAVE_SSE2
-const Ht16x8Param kArrayHt16x8Param_sse2[] = {
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, DCT_DCT, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, ADST_DCT, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, DCT_ADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, ADST_ADST,
-             AOM_BITS_8, 128),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, FLIPADST_DCT,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, DCT_FLIPADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, FLIPADST_FLIPADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, ADST_FLIPADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, FLIPADST_ADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, IDTX, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, V_DCT, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, H_DCT, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, V_ADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, H_ADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, V_FLIPADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, H_FLIPADST,
-             AOM_BITS_8, 128)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x8HT,
-                        ::testing::ValuesIn(kArrayHt16x8Param_sse2));
-#endif  // HAVE_SSE2
-
-}  // namespace
diff --git a/third_party/aom/test/av1_fht32x16_test.cc b/third_party/aom/test/av1_fht32x16_test.cc
deleted file mode 100644
index cbce074e5..000000000
--- a/third_party/aom/test/av1_fht32x16_test.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-using std::tr1::tuple;
-using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht32x16Param;
-
-void fht32x16_ref(const int16_t *in, tran_low_t *out, int stride,
-                  TxfmParam *txfm_param) {
-  av1_fht32x16_c(in, out, stride, txfm_param);
-}
-
-void iht32x16_ref(const tran_low_t *in, uint8_t *out, int stride,
-                  const TxfmParam *txfm_param) {
-  av1_iht32x16_512_add_c(in, out, stride, txfm_param);
-}
-
-class AV1Trans32x16HT : public libaom_test::TransformTestBase,
-                        public ::testing::TestWithParam<Ht32x16Param> {
- public:
-  virtual ~AV1Trans32x16HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 32;
-    height_ = 16;
-    fwd_txfm_ref = fht32x16_ref;
-    inv_txfm_ref = iht32x16_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(AV1Trans32x16HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans32x16HT, AccuracyCheck) { RunAccuracyCheck(4, 0.2); }
-TEST_P(AV1Trans32x16HT, CoeffCheck) { RunCoeffCheck(); }
-TEST_P(AV1Trans32x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-TEST_P(AV1Trans32x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(4); }
-
-using std::tr1::make_tuple;
-const Ht32x16Param kArrayHt32x16Param_c[] = {
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, DCT_DCT, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, ADST_DCT, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, DCT_ADST, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, ADST_ADST, AOM_BITS_8,
-             512),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, FLIPADST_DCT, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, DCT_FLIPADST, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, FLIPADST_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, ADST_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, FLIPADST_ADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, IDTX, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, V_DCT, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, H_DCT, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, V_ADST, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, H_ADST, AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, V_FLIPADST, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, H_FLIPADST, AOM_BITS_8,
-             512)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(C, AV1Trans32x16HT,
-                        ::testing::ValuesIn(kArrayHt32x16Param_c));
-
-#if HAVE_SSE2
-const Ht32x16Param kArrayHt32x16Param_sse2[] = {
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, DCT_DCT,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, ADST_DCT,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, DCT_ADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, ADST_ADST,
-             AOM_BITS_8, 512),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, FLIPADST_DCT,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, DCT_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, FLIPADST_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, ADST_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, FLIPADST_ADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, IDTX, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, V_DCT, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, H_DCT, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, V_ADST, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, H_ADST, AOM_BITS_8,
-             512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, V_FLIPADST,
-             AOM_BITS_8, 512),
-  make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, H_FLIPADST,
-             AOM_BITS_8, 512)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans32x16HT,
-                        ::testing::ValuesIn(kArrayHt32x16Param_sse2));
-#endif  // HAVE_SSE2
-
-}  // namespace
diff --git a/third_party/aom/test/av1_fht32x32_test.cc b/third_party/aom/test/av1_fht32x32_test.cc
deleted file mode 100644
index 613bc9183..000000000
--- a/third_party/aom/test/av1_fht32x32_test.cc
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-#include "aom_ports/mem.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-using std::tr1::tuple;
-using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht32x32Param;
-
-void fht32x32_ref(const int16_t *in, tran_low_t *out, int stride,
-                  TxfmParam *txfm_param) {
-  av1_fht32x32_c(in, out, stride, txfm_param);
-}
-
-#if CONFIG_HIGHBITDEPTH
-typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                           TX_TYPE tx_type, int bd);
-typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
-                          TX_TYPE tx_type, int bd);
-
-// Target optimized function, tx_type, bit depth
-typedef tuple<HbdHtFunc, TX_TYPE, int> HighbdHt32x32Param;
-
-void highbd_fht32x32_ref(const int16_t *in, int32_t *out, int stride,
-                         TX_TYPE tx_type, int bd) {
-  av1_fwd_txfm2d_32x32_c(in, out, stride, tx_type, bd);
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if (HAVE_SSE2 || HAVE_AVX2) && !CONFIG_DAALA_DCT32
-void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
-                    const TxfmParam *txfm_param) {
-  (void)in;
-  (void)out;
-  (void)stride;
-  (void)txfm_param;
-}
-#endif
-
-class AV1Trans32x32HT : public libaom_test::TransformTestBase,
-                        public ::testing::TestWithParam<Ht32x32Param> {
- public:
-  virtual ~AV1Trans32x32HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 32;
-    height_ = 32;
-    fwd_txfm_ref = fht32x32_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
-TEST_P(AV1Trans32x32HT, MemCheck) { RunMemCheck(); }
-
-#if CONFIG_HIGHBITDEPTH
-class AV1HighbdTrans32x32HT
-    : public ::testing::TestWithParam<HighbdHt32x32Param> {
- public:
-  virtual ~AV1HighbdTrans32x32HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    fwd_txfm_ref_ = highbd_fht32x32_ref;
-    tx_type_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = 1024;
-
-    input_ = reinterpret_cast<int16_t *>(
-        aom_memalign(32, sizeof(int16_t) * num_coeffs_));
-    output_ = reinterpret_cast<int32_t *>(
-        aom_memalign(32, sizeof(int32_t) * num_coeffs_));
-    output_ref_ = reinterpret_cast<int32_t *>(
-        aom_memalign(32, sizeof(int32_t) * num_coeffs_));
-  }
-
-  virtual void TearDown() {
-    aom_free(input_);
-    aom_free(output_);
-    aom_free(output_ref_);
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  void RunBitexactCheck();
-
- private:
-  HbdHtFunc fwd_txfm_;
-  HbdHtFunc fwd_txfm_ref_;
-  TX_TYPE tx_type_;
-  int bit_depth_;
-  int mask_;
-  int num_coeffs_;
-  int16_t *input_;
-  int32_t *output_;
-  int32_t *output_ref_;
-};
-
-void AV1HighbdTrans32x32HT::RunBitexactCheck() {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int i, j;
-  const int stride = 32;
-  const int num_tests = 1000;
-
-  for (i = 0; i < num_tests; ++i) {
-    for (j = 0; j < num_coeffs_; ++j) {
-      input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-    }
-
-    fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(
-        fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_));
-
-    for (j = 0; j < num_coeffs_; ++j) {
-      EXPECT_EQ(output_ref_[j], output_[j])
-          << "Not bit-exact result at index: " << j << " at test block: " << i;
-    }
-  }
-}
-
-TEST_P(AV1HighbdTrans32x32HT, HighbdCoeffCheck) { RunBitexactCheck(); }
-#endif  // CONFIG_HIGHBITDEPTH
-
-using std::tr1::make_tuple;
-
-#if HAVE_SSE2 && !CONFIG_DAALA_DCT32
-const Ht32x32Param kArrayHt32x32Param_sse2[] = {
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, DCT_DCT, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, ADST_DCT, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, DCT_ADST, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, ADST_ADST, AOM_BITS_8, 1024),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, FLIPADST_DCT, AOM_BITS_8,
-             1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, DCT_FLIPADST, AOM_BITS_8,
-             1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, FLIPADST_FLIPADST, AOM_BITS_8,
-             1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, ADST_FLIPADST, AOM_BITS_8,
-             1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, FLIPADST_ADST, AOM_BITS_8,
-             1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, IDTX, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, V_DCT, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, H_DCT, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, V_ADST, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, H_ADST, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, V_FLIPADST, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_sse2, &dummy_inv_txfm, H_FLIPADST, AOM_BITS_8, 1024)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans32x32HT,
-                        ::testing::ValuesIn(kArrayHt32x32Param_sse2));
-#endif  // HAVE_SSE2 && !CONFIG_DAALA_DCT32
-
-#if HAVE_AVX2 && !CONFIG_DAALA_DCT32
-const Ht32x32Param kArrayHt32x32Param_avx2[] = {
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, DCT_DCT, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, ADST_DCT, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, DCT_ADST, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, ADST_ADST, AOM_BITS_8, 1024),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, FLIPADST_DCT, AOM_BITS_8,
-             1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, DCT_FLIPADST, AOM_BITS_8,
-             1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, FLIPADST_FLIPADST, AOM_BITS_8,
-             1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, ADST_FLIPADST, AOM_BITS_8,
-             1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, FLIPADST_ADST, AOM_BITS_8,
-             1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, IDTX, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, V_DCT, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, H_DCT, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, V_ADST, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, H_ADST, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, V_FLIPADST, AOM_BITS_8, 1024),
-  make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, H_FLIPADST, AOM_BITS_8, 1024)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans32x32HT,
-                        ::testing::ValuesIn(kArrayHt32x32Param_avx2));
-#endif  // HAVE_AVX2 && !CONFIG_DAALA_DCT32
-}  // namespace
diff --git a/third_party/aom/test/av1_fht4x4_test.cc b/third_party/aom/test/av1_fht4x4_test.cc
deleted file mode 100644
index 1d4fc1352..000000000
--- a/third_party/aom/test/av1_fht4x4_test.cc
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-#include "aom_ports/mem.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-using std::tr1::tuple;
-using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht4x4Param;
-
-void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                TxfmParam *txfm_param) {
-  av1_fht4x4_c(in, out, stride, txfm_param);
-}
-
-void iht4x4_ref(const tran_low_t *in, uint8_t *out, int stride,
-                const TxfmParam *txfm_param) {
-  av1_iht4x4_16_add_c(in, out, stride, txfm_param);
-}
-
-#if CONFIG_HIGHBITDEPTH
-typedef void (*IhighbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                              TX_TYPE tx_type, int bd);
-typedef void (*HBDFhtFunc)(const int16_t *input, int32_t *output, int stride,
-                           TX_TYPE tx_type, int bd);
-
-// HighbdHt4x4Param argument list:
-// <Target optimized function, tx_type, bit depth>
-typedef tuple<HBDFhtFunc, TX_TYPE, int> HighbdHt4x4Param;
-
-void highbe_fht4x4_ref(const int16_t *in, int32_t *out, int stride,
-                       TX_TYPE tx_type, int bd) {
-  av1_fwd_txfm2d_4x4_c(in, out, stride, tx_type, bd);
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-class AV1Trans4x4HT : public libaom_test::TransformTestBase,
-                      public ::testing::TestWithParam<Ht4x4Param> {
- public:
-  virtual ~AV1Trans4x4HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 4;
-    height_ = 4;
-    fwd_txfm_ref = fht4x4_ref;
-    inv_txfm_ref = iht4x4_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(AV1Trans4x4HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans4x4HT, CoeffCheck) { RunCoeffCheck(); }
-// Note:
-//  TODO(luoyi): Add tx_type, 9-15 for inverse transform.
-//  Need cleanup since same tests may be done in fdct4x4_test.cc
-// TEST_P(AV1Trans4x4HT, AccuracyCheck) { RunAccuracyCheck(0); }
-// TEST_P(AV1Trans4x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
-// TEST_P(AV1Trans4x4HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-
-#if CONFIG_HIGHBITDEPTH
-class AV1HighbdTrans4x4HT : public ::testing::TestWithParam<HighbdHt4x4Param> {
- public:
-  virtual ~AV1HighbdTrans4x4HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    fwd_txfm_ref_ = highbe_fht4x4_ref;
-    tx_type_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = 16;
-
-    input_ = reinterpret_cast<int16_t *>(
-        aom_memalign(16, sizeof(int16_t) * num_coeffs_));
-    output_ = reinterpret_cast<int32_t *>(
-        aom_memalign(16, sizeof(int32_t) * num_coeffs_));
-    output_ref_ = reinterpret_cast<int32_t *>(
-        aom_memalign(16, sizeof(int32_t) * num_coeffs_));
-  }
-
-  virtual void TearDown() {
-    aom_free(input_);
-    aom_free(output_);
-    aom_free(output_ref_);
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  void RunBitexactCheck();
-
- private:
-  HBDFhtFunc fwd_txfm_;
-  HBDFhtFunc fwd_txfm_ref_;
-  TX_TYPE tx_type_;
-  int bit_depth_;
-  int mask_;
-  int num_coeffs_;
-  int16_t *input_;
-  int32_t *output_;
-  int32_t *output_ref_;
-};
-
-void AV1HighbdTrans4x4HT::RunBitexactCheck() {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int i, j;
-  const int stride = 4;
-  const int num_tests = 1000;
-  const int num_coeffs = 16;
-
-  for (i = 0; i < num_tests; ++i) {
-    for (j = 0; j < num_coeffs; ++j) {
-      input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-    }
-
-    fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
-    fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_);
-
-    for (j = 0; j < num_coeffs; ++j) {
-      EXPECT_EQ(output_[j], output_ref_[j])
-          << "Not bit-exact result at index: " << j << " at test block: " << i;
-    }
-  }
-}
-
-TEST_P(AV1HighbdTrans4x4HT, HighbdCoeffCheck) { RunBitexactCheck(); }
-#endif  // CONFIG_HIGHBITDEPTH
-
-using std::tr1::make_tuple;
-
-#if HAVE_SSE2 && !CONFIG_DAALA_DCT4
-const Ht4x4Param kArrayHt4x4Param_sse2[] = {
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, DCT_DCT, AOM_BITS_8,
-             16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, ADST_DCT, AOM_BITS_8,
-             16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, DCT_ADST, AOM_BITS_8,
-             16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, ADST_ADST, AOM_BITS_8,
-             16),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, FLIPADST_DCT,
-             AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, DCT_FLIPADST,
-             AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, FLIPADST_FLIPADST,
-             AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, ADST_FLIPADST,
-             AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, FLIPADST_ADST,
-             AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, IDTX, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, V_DCT, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, H_DCT, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, V_ADST, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, H_ADST, AOM_BITS_8, 16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, V_FLIPADST, AOM_BITS_8,
-             16),
-  make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, H_FLIPADST, AOM_BITS_8,
-             16)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans4x4HT,
-                        ::testing::ValuesIn(kArrayHt4x4Param_sse2));
-#endif  // HAVE_SSE2
-
-#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT4
-const HighbdHt4x4Param kArrayHighbdHt4x4Param[] = {
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_DCT, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_DCT, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_DCT, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_DCT, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_ADST, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_ADST, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_ADST, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_ADST, 12),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_DCT, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_DCT, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_FLIPADST, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, DCT_FLIPADST, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_FLIPADST, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_FLIPADST, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_FLIPADST, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, ADST_FLIPADST, 12),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_ADST, 10),
-  make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, FLIPADST_ADST, 12),
-#endif  // CONFIG_EXT_TX
-};
-
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans4x4HT,
-                        ::testing::ValuesIn(kArrayHighbdHt4x4Param));
-
-#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT4
-
-}  // namespace
diff --git a/third_party/aom/test/av1_fht4x8_test.cc b/third_party/aom/test/av1_fht4x8_test.cc
deleted file mode 100644
index f9d2120e0..000000000
--- a/third_party/aom/test/av1_fht4x8_test.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-using std::tr1::tuple;
-using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht4x8Param;
-
-void fht4x8_ref(const int16_t *in, tran_low_t *out, int stride,
-                TxfmParam *txfm_param) {
-  av1_fht4x8_c(in, out, stride, txfm_param);
-}
-
-void iht4x8_ref(const tran_low_t *in, uint8_t *out, int stride,
-                const TxfmParam *txfm_param) {
-  av1_iht4x8_32_add_c(in, out, stride, txfm_param);
-}
-
-class AV1Trans4x8HT : public libaom_test::TransformTestBase,
-                      public ::testing::TestWithParam<Ht4x8Param> {
- public:
-  virtual ~AV1Trans4x8HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 4;
-    height_ = 8;
-    fwd_txfm_ref = fht4x8_ref;
-    inv_txfm_ref = iht4x8_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(AV1Trans4x8HT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); }
-TEST_P(AV1Trans4x8HT, CoeffCheck) { RunCoeffCheck(); }
-TEST_P(AV1Trans4x8HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans4x8HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-TEST_P(AV1Trans4x8HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
-
-using std::tr1::make_tuple;
-
-const Ht4x8Param kArrayHt4x8Param_c[] = {
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, DCT_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, ADST_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, DCT_ADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, ADST_ADST, AOM_BITS_8, 32),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, FLIPADST_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, DCT_FLIPADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, FLIPADST_FLIPADST, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, ADST_FLIPADST, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, FLIPADST_ADST, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, IDTX, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, V_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, H_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, V_ADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, H_ADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, V_FLIPADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_c, &av1_iht4x8_32_add_c, H_FLIPADST, AOM_BITS_8, 32)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(C, AV1Trans4x8HT,
-                        ::testing::ValuesIn(kArrayHt4x8Param_c));
-
-#if HAVE_SSE2
-const Ht4x8Param kArrayHt4x8Param_sse2[] = {
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, DCT_DCT, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, ADST_DCT, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, DCT_ADST, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, ADST_ADST, AOM_BITS_8,
-             32),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, FLIPADST_DCT,
-             AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, DCT_FLIPADST,
-             AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, FLIPADST_FLIPADST,
-             AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, ADST_FLIPADST,
-             AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, FLIPADST_ADST,
-             AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, IDTX, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, V_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, H_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, V_ADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, H_ADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, V_FLIPADST, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, H_FLIPADST, AOM_BITS_8,
-             32)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans4x8HT,
-                        ::testing::ValuesIn(kArrayHt4x8Param_sse2));
-#endif  // HAVE_SSE2
-
-}  // namespace
diff --git a/third_party/aom/test/av1_fht64x64_test.cc b/third_party/aom/test/av1_fht64x64_test.cc
deleted file mode 100644
index f2a03e7ee..000000000
--- a/third_party/aom/test/av1_fht64x64_test.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-
-#if CONFIG_TX64X64
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-using std::tr1::tuple;
-using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht64x64Param;
-
-void fht64x64_ref(const int16_t *in, tran_low_t *out, int stride,
-                  TxfmParam *txfm_param) {
-  av1_fht64x64_c(in, out, stride, txfm_param);
-}
-
-void iht64x64_ref(const tran_low_t *in, uint8_t *dest, int stride,
-                  const TxfmParam *txfm_param) {
-  av1_iht64x64_4096_add_c(in, dest, stride, txfm_param);
-}
-
-class AV1Trans64x64HT : public libaom_test::TransformTestBase,
-                        public ::testing::TestWithParam<Ht64x64Param> {
- public:
-  virtual ~AV1Trans64x64HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 64;
-    height_ = 64;
-    fwd_txfm_ref = fht64x64_ref;
-    inv_txfm_ref = iht64x64_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(AV1Trans64x64HT, AccuracyCheck) { RunAccuracyCheck(4, 0.2); }
-TEST_P(AV1Trans64x64HT, CoeffCheck) { RunCoeffCheck(); }
-TEST_P(AV1Trans64x64HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans64x64HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-TEST_P(AV1Trans64x64HT, InvAccuracyCheck) { RunInvAccuracyCheck(4); }
-
-using std::tr1::make_tuple;
-
-const Ht64x64Param kArrayHt64x64Param_c[] = {
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, DCT_DCT, AOM_BITS_8,
-             4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, ADST_DCT, AOM_BITS_8,
-             4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, DCT_ADST, AOM_BITS_8,
-             4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, ADST_ADST, AOM_BITS_8,
-             4096),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, FLIPADST_DCT,
-             AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, DCT_FLIPADST,
-             AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, FLIPADST_FLIPADST,
-             AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, ADST_FLIPADST,
-             AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, FLIPADST_ADST,
-             AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, IDTX, AOM_BITS_8, 4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, V_DCT, AOM_BITS_8,
-             4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, H_DCT, AOM_BITS_8,
-             4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, V_ADST, AOM_BITS_8,
-             4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, H_ADST, AOM_BITS_8,
-             4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, V_FLIPADST, AOM_BITS_8,
-             4096),
-  make_tuple(&av1_fht64x64_c, &av1_iht64x64_4096_add_c, H_FLIPADST, AOM_BITS_8,
-             4096)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(C, AV1Trans64x64HT,
-                        ::testing::ValuesIn(kArrayHt64x64Param_c));
-
-}  // namespace
-
-#endif  // CONFIG_TX64X64
diff --git a/third_party/aom/test/av1_fht8x16_test.cc b/third_party/aom/test/av1_fht8x16_test.cc
deleted file mode 100644
index 689cb0b90..000000000
--- a/third_party/aom/test/av1_fht8x16_test.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-using std::tr1::tuple;
-using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht8x16Param;
-
-void fht8x16_ref(const int16_t *in, tran_low_t *out, int stride,
-                 TxfmParam *txfm_param) {
-  av1_fht8x16_c(in, out, stride, txfm_param);
-}
-
-void iht8x16_ref(const tran_low_t *in, uint8_t *out, int stride,
-                 const TxfmParam *txfm_param) {
-  av1_iht8x16_128_add_c(in, out, stride, txfm_param);
-}
-
-class AV1Trans8x16HT : public libaom_test::TransformTestBase,
-                       public ::testing::TestWithParam<Ht8x16Param> {
- public:
-  virtual ~AV1Trans8x16HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 8;
-    height_ = 16;
-    inv_txfm_ref = iht8x16_ref;
-    fwd_txfm_ref = fht8x16_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(AV1Trans8x16HT, AccuracyCheck) { RunAccuracyCheck(1, 0.001); }
-TEST_P(AV1Trans8x16HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans8x16HT, CoeffCheck) { RunCoeffCheck(); }
-TEST_P(AV1Trans8x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-TEST_P(AV1Trans8x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-
-using std::tr1::make_tuple;
-
-const Ht8x16Param kArrayHt8x16Param_c[] = {
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, DCT_DCT, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, ADST_DCT, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, DCT_ADST, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, ADST_ADST, AOM_BITS_8,
-             128),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, FLIPADST_DCT, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, DCT_FLIPADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, FLIPADST_FLIPADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, ADST_FLIPADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, FLIPADST_ADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, IDTX, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, V_DCT, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, H_DCT, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, V_ADST, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, H_ADST, AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, V_FLIPADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_c, &av1_iht8x16_128_add_c, H_FLIPADST, AOM_BITS_8,
-             128)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(C, AV1Trans8x16HT,
-                        ::testing::ValuesIn(kArrayHt8x16Param_c));
-
-#if HAVE_SSE2
-const Ht8x16Param kArrayHt8x16Param_sse2[] = {
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, DCT_DCT, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, ADST_DCT, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, DCT_ADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, ADST_ADST,
-             AOM_BITS_8, 128),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, FLIPADST_DCT,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, DCT_FLIPADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, FLIPADST_FLIPADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, ADST_FLIPADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, FLIPADST_ADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, IDTX, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, V_DCT, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, H_DCT, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, V_ADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, H_ADST, AOM_BITS_8,
-             128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, V_FLIPADST,
-             AOM_BITS_8, 128),
-  make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, H_FLIPADST,
-             AOM_BITS_8, 128)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x16HT,
-                        ::testing::ValuesIn(kArrayHt8x16Param_sse2));
-#endif  // HAVE_SSE2
-
-}  // namespace
diff --git a/third_party/aom/test/av1_fht8x4_test.cc b/third_party/aom/test/av1_fht8x4_test.cc
deleted file mode 100644
index e50a69457..000000000
--- a/third_party/aom/test/av1_fht8x4_test.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-using std::tr1::tuple;
-using libaom_test::FhtFunc;
-typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht8x4Param;
-
-void fht8x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                TxfmParam *txfm_param) {
-  av1_fht8x4_c(in, out, stride, txfm_param);
-}
-
-void iht8x4_ref(const tran_low_t *in, uint8_t *out, int stride,
-                const TxfmParam *txfm_param) {
-  av1_iht8x4_32_add_c(in, out, stride, txfm_param);
-}
-
-class AV1Trans8x4HT : public libaom_test::TransformTestBase,
-                      public ::testing::TestWithParam<Ht8x4Param> {
- public:
-  virtual ~AV1Trans8x4HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 8;
-    height_ = 4;
-    fwd_txfm_ref = fht8x4_ref;
-    inv_txfm_ref = iht8x4_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(AV1Trans8x4HT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); }
-TEST_P(AV1Trans8x4HT, CoeffCheck) { RunCoeffCheck(); }
-TEST_P(AV1Trans8x4HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans8x4HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-TEST_P(AV1Trans8x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
-
-using std::tr1::make_tuple;
-
-const Ht8x4Param kArrayHt8x4Param_c[] = {
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, DCT_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, ADST_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, DCT_ADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, ADST_ADST, AOM_BITS_8, 32),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, FLIPADST_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, DCT_FLIPADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, FLIPADST_FLIPADST, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, ADST_FLIPADST, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, FLIPADST_ADST, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, IDTX, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, V_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, H_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, V_ADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, H_ADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, V_FLIPADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_c, &av1_iht8x4_32_add_c, H_FLIPADST, AOM_BITS_8, 32)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(C, AV1Trans8x4HT,
-                        ::testing::ValuesIn(kArrayHt8x4Param_c));
-
-#if HAVE_SSE2
-const Ht8x4Param kArrayHt8x4Param_sse2[] = {
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, DCT_DCT, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, ADST_DCT, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, DCT_ADST, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, ADST_ADST, AOM_BITS_8,
-             32),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, FLIPADST_DCT,
-             AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, DCT_FLIPADST,
-             AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, FLIPADST_FLIPADST,
-             AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, ADST_FLIPADST,
-             AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, FLIPADST_ADST,
-             AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, IDTX, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, V_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, H_DCT, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, V_ADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, H_ADST, AOM_BITS_8, 32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, V_FLIPADST, AOM_BITS_8,
-             32),
-  make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, H_FLIPADST, AOM_BITS_8,
-             32)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x4HT,
-                        ::testing::ValuesIn(kArrayHt8x4Param_sse2));
-#endif  // HAVE_SSE2
-
-}  // namespace
diff --git a/third_party/aom/test/av1_fht8x8_test.cc b/third_party/aom/test/av1_fht8x8_test.cc
deleted file mode 100644
index 499fcc338..000000000
--- a/third_party/aom/test/av1_fht8x8_test.cc
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-#include "aom_ports/mem.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-
-using libaom_test::FhtFunc;
-using std::tr1::tuple;
-typedef tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int> Ht8x8Param;
-
-void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride,
-                TxfmParam *txfm_param) {
-  av1_fht8x8_c(in, out, stride, txfm_param);
-}
-
-void iht8x8_ref(const tran_low_t *in, uint8_t *out, int stride,
-                const TxfmParam *txfm_param) {
-  av1_iht8x8_64_add_c(in, out, stride, txfm_param);
-}
-
-#if CONFIG_HIGHBITDEPTH
-typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                           TX_TYPE tx_type, int bd);
-typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
-                          TX_TYPE tx_type, int bd);
-// Target optimized function, tx_type, bit depth
-typedef tuple<HbdHtFunc, TX_TYPE, int> HighbdHt8x8Param;
-
-void highbd_fht8x8_ref(const int16_t *in, int32_t *out, int stride,
-                       TX_TYPE tx_type, int bd) {
-  av1_fwd_txfm2d_8x8_c(in, out, stride, tx_type, bd);
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-class AV1Trans8x8HT : public libaom_test::TransformTestBase,
-                      public ::testing::TestWithParam<Ht8x8Param> {
- public:
-  virtual ~AV1Trans8x8HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 8;
-    height_ = 8;
-    fwd_txfm_ref = fht8x8_ref;
-    inv_txfm_ref = iht8x8_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(AV1Trans8x8HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans8x8HT, CoeffCheck) { RunCoeffCheck(); }
-// Note:
-//  TODO(luoyi): Add tx_type, 9-15 for inverse transform.
-//  Need cleanup since same tests may be done in fdct8x8_test.cc
-// TEST_P(AV1Trans8x8HT, AccuracyCheck) { RunAccuracyCheck(0); }
-// TEST_P(AV1Trans8x8HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
-// TEST_P(AV1Trans8x8HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-
-#if CONFIG_HIGHBITDEPTH
-class AV1HighbdTrans8x8HT : public ::testing::TestWithParam<HighbdHt8x8Param> {
- public:
-  virtual ~AV1HighbdTrans8x8HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    fwd_txfm_ref_ = highbd_fht8x8_ref;
-    tx_type_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = 64;
-
-    input_ = reinterpret_cast<int16_t *>(
-        aom_memalign(16, sizeof(int16_t) * num_coeffs_));
-    output_ = reinterpret_cast<int32_t *>(
-        aom_memalign(16, sizeof(int32_t) * num_coeffs_));
-    output_ref_ = reinterpret_cast<int32_t *>(
-        aom_memalign(16, sizeof(int32_t) * num_coeffs_));
-  }
-
-  virtual void TearDown() {
-    aom_free(input_);
-    aom_free(output_);
-    aom_free(output_ref_);
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  void RunBitexactCheck();
-
- private:
-  HbdHtFunc fwd_txfm_;
-  HbdHtFunc fwd_txfm_ref_;
-  TX_TYPE tx_type_;
-  int bit_depth_;
-  int mask_;
-  int num_coeffs_;
-  int16_t *input_;
-  int32_t *output_;
-  int32_t *output_ref_;
-};
-
-void AV1HighbdTrans8x8HT::RunBitexactCheck() {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int i, j;
-  const int stride = 8;
-  const int num_tests = 1000;
-  const int num_coeffs = 64;
-
-  for (i = 0; i < num_tests; ++i) {
-    for (j = 0; j < num_coeffs; ++j) {
-      input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-    }
-
-    fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(
-        fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_));
-
-    for (j = 0; j < num_coeffs; ++j) {
-      EXPECT_EQ(output_ref_[j], output_[j])
-          << "Not bit-exact result at index: " << j << " at test block: " << i;
-    }
-  }
-}
-
-TEST_P(AV1HighbdTrans8x8HT, HighbdCoeffCheck) { RunBitexactCheck(); }
-#endif  // CONFIG_HIGHBITDEPTH
-
-using std::tr1::make_tuple;
-
-#if HAVE_SSE2 && !CONFIG_DAALA_DCT8
-const Ht8x8Param kArrayHt8x8Param_sse2[] = {
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, DCT_DCT, AOM_BITS_8,
-             64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, ADST_DCT, AOM_BITS_8,
-             64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, DCT_ADST, AOM_BITS_8,
-             64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, ADST_ADST, AOM_BITS_8,
-             64),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, FLIPADST_DCT,
-             AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, DCT_FLIPADST,
-             AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, FLIPADST_FLIPADST,
-             AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, ADST_FLIPADST,
-             AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, FLIPADST_ADST,
-             AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, IDTX, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, V_DCT, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, H_DCT, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, V_ADST, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, H_ADST, AOM_BITS_8, 64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, V_FLIPADST, AOM_BITS_8,
-             64),
-  make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, H_FLIPADST, AOM_BITS_8,
-             64)
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x8HT,
-                        ::testing::ValuesIn(kArrayHt8x8Param_sse2));
-#endif  // HAVE_SSE2
-
-#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT8
-const HighbdHt8x8Param kArrayHBDHt8x8Param_sse4_1[] = {
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_DCT, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_DCT, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_DCT, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_DCT, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_ADST, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_ADST, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_ADST, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_ADST, 12),
-#if CONFIG_EXT_TX
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_DCT, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_DCT, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_FLIPADST, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, DCT_FLIPADST, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_FLIPADST, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_FLIPADST, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_FLIPADST, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, ADST_FLIPADST, 12),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_ADST, 10),
-  make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, FLIPADST_ADST, 12),
-#endif  // CONFIG_EXT_TX
-};
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans8x8HT,
-                        ::testing::ValuesIn(kArrayHBDHt8x8Param_sse4_1));
-#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT8
-
-}  // namespace
diff --git a/third_party/aom/test/av1_fwd_txfm1d_test.cc b/third_party/aom/test/av1_fwd_txfm1d_test.cc
index 9deef3c95..49a666879 100644
--- a/third_party/aom/test/av1_fwd_txfm1d_test.cc
+++ b/third_party/aom/test/av1_fwd_txfm1d_test.cc
@@ -9,36 +9,37 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "av1/common/av1_fwd_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
 #include "test/av1_txfm_test.h"
 
 using libaom_test::ACMRandom;
+using libaom_test::TYPE_ADST;
+using libaom_test::TYPE_DCT;
+using libaom_test::TYPE_IDTX;
+using libaom_test::TYPE_TXFM;
 using libaom_test::input_base;
 using libaom_test::reference_hybrid_1d;
-using libaom_test::TYPE_TXFM;
-using libaom_test::TYPE_DCT;
-using libaom_test::TYPE_ADST;
 
 namespace {
-const int txfm_type_num = 2;
-const TYPE_TXFM txfm_type_ls[2] = { TYPE_DCT, TYPE_ADST };
+const int txfm_type_num = 3;
+const TYPE_TXFM txfm_type_ls[txfm_type_num] = { TYPE_DCT, TYPE_ADST,
+                                                TYPE_IDTX };
 
 const int txfm_size_num = 5;
-const int txfm_size_ls[5] = { 4, 8, 16, 32, 64 };
-
-const TxfmFunc fwd_txfm_func_ls[2][5] = {
-#if CONFIG_TX64X64
-  { av1_fdct4_new, av1_fdct8_new, av1_fdct16_new, av1_fdct32_new,
-    av1_fdct64_new },
-#else
-  { av1_fdct4_new, av1_fdct8_new, av1_fdct16_new, av1_fdct32_new, NULL },
-#endif
-  { av1_fadst4_new, av1_fadst8_new, av1_fadst16_new, av1_fadst32_new, NULL }
+
+const int txfm_size_ls[] = { 4, 8, 16, 32, 64 };
+
+const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = {
+  { av1_fdct4_new, av1_fadst4_new, av1_fidentity4_c },
+  { av1_fdct8_new, av1_fadst8_new, av1_fidentity8_c },
+  { av1_fdct16_new, av1_fadst16_new, av1_fidentity16_c },
+  { av1_fdct32_new, NULL, av1_fidentity32_c },
+  { av1_fdct64_new, NULL, NULL },
 };
 
 // the maximum stage number of fwd/inv 1d dct/adst txfm is 12
-const int8_t cos_bit[12] = { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14 };
-const int8_t range_bit[12] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 };
+const int8_t cos_bit = 14;
+const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 };
 
 TEST(av1_fwd_txfm1d, round_shift) {
   EXPECT_EQ(round_shift(7, 1), 4);
@@ -51,10 +52,10 @@ TEST(av1_fwd_txfm1d, round_shift) {
   EXPECT_EQ(round_shift(-8, 2), -2);
 }
 
-TEST(av1_fwd_txfm1d, cospi_arr_data) {
+TEST(av1_fwd_txfm1d, av1_cospi_arr_data) {
   for (int i = 0; i < 7; i++) {
     for (int j = 0; j < 64; j++) {
-      EXPECT_EQ(cospi_arr_data[i][j],
+      EXPECT_EQ(av1_cospi_arr_data[i][j],
                 (int32_t)round(cos(M_PI * j / 128) * (1 << (cos_bit_min + i))));
     }
   }
@@ -71,7 +72,7 @@ TEST(av1_fwd_txfm1d, accuracy) {
 
     for (int ti = 0; ti < txfm_type_num; ++ti) {
       TYPE_TXFM txfm_type = txfm_type_ls[ti];
-      TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[ti][si];
+      TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[si][ti];
       int max_error = 7;
 
       const int count_test_block = 5000;
@@ -86,9 +87,10 @@ TEST(av1_fwd_txfm1d, accuracy) {
           reference_hybrid_1d(ref_input, ref_output, txfm_size, txfm_type);
 
           for (int ni = 0; ni < txfm_size; ++ni) {
-            EXPECT_LE(
+            ASSERT_LE(
                 abs(output[ni] - static_cast<int32_t>(round(ref_output[ni]))),
-                max_error);
+                max_error)
+                << "tx size = " << txfm_size << ", tx type = " << txfm_type;
           }
         }
       }
diff --git a/third_party/aom/test/av1_fwd_txfm2d_test.cc b/third_party/aom/test/av1_fwd_txfm2d_test.cc
index adf9a803c..e0294be4e 100644
--- a/third_party/aom/test/av1_fwd_txfm2d_test.cc
+++ b/third_party/aom/test/av1_fwd_txfm2d_test.cc
@@ -12,24 +12,26 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <vector>
+
+#include "config/av1_rtcd.h"
 
 #include "test/acm_random.h"
 #include "test/util.h"
 #include "test/av1_txfm_test.h"
 #include "av1/common/av1_txfm.h"
-#include "./av1_rtcd.h"
 
 using libaom_test::ACMRandom;
-using libaom_test::input_base;
+using libaom_test::TYPE_TXFM;
 using libaom_test::bd;
 using libaom_test::compute_avg_abs_error;
-using libaom_test::Fwd_Txfm2d_Func;
-using libaom_test::TYPE_TXFM;
+using libaom_test::input_base;
+
+using std::vector;
 
 namespace {
-#if CONFIG_HIGHBITDEPTH
 // tx_type_, tx_size_, max_error_, max_avg_error_
-typedef std::tr1::tuple<TX_TYPE, TX_SIZE, double, double> AV1FwdTxfm2dParam;
+typedef ::testing::tuple<TX_TYPE, TX_SIZE, double, double> AV1FwdTxfm2dParam;
 
 class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
  public:
@@ -39,22 +41,16 @@ class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
     max_error_ = GET_PARAM(2);
     max_avg_error_ = GET_PARAM(3);
     count_ = 500;
-    TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg =
-        av1_get_fwd_txfm_cfg(tx_type_, tx_size_);
-    // TODO(sarahparker) this test will need to be updated when these
-    // functions are extended to support rectangular transforms
-    int amplify_bit = fwd_txfm_flip_cfg.row_cfg->shift[0] +
-                      fwd_txfm_flip_cfg.row_cfg->shift[1] +
-                      fwd_txfm_flip_cfg.row_cfg->shift[2];
+    TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg;
+    av1_get_fwd_txfm_cfg(tx_type_, tx_size_, &fwd_txfm_flip_cfg);
+    amplify_factor_ = libaom_test::get_amplification_factor(tx_type_, tx_size_);
+    tx_width_ = tx_size_wide[fwd_txfm_flip_cfg.tx_size];
+    tx_height_ = tx_size_high[fwd_txfm_flip_cfg.tx_size];
     ud_flip_ = fwd_txfm_flip_cfg.ud_flip;
     lr_flip_ = fwd_txfm_flip_cfg.lr_flip;
-    amplify_factor_ =
-        amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
 
     fwd_txfm_ = libaom_test::fwd_txfm_func_ls[tx_size_];
-    txfm1d_size_ = libaom_test::get_txfm1d_size(tx_size_);
-    txfm2d_size_ = txfm1d_size_ * txfm1d_size_;
-    get_txfm1d_type(tx_type_, &type0_, &type1_);
+    txfm2d_size_ = tx_width_ * tx_height_;
     input_ = reinterpret_cast<int16_t *>(
         aom_memalign(16, sizeof(input_[0]) * txfm2d_size_));
     output_ = reinterpret_cast<int32_t *>(
@@ -76,33 +72,40 @@ class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
         ref_output_[ni] = 0;
       }
 
-      fwd_txfm_(input_, output_, txfm1d_size_, tx_type_, bd);
+      fwd_txfm_(input_, output_, tx_width_, tx_type_, bd);
 
-      if (lr_flip_ && ud_flip_)
-        libaom_test::fliplrud(ref_input_, txfm1d_size_, txfm1d_size_);
-      else if (lr_flip_)
-        libaom_test::fliplr(ref_input_, txfm1d_size_, txfm1d_size_);
-      else if (ud_flip_)
-        libaom_test::flipud(ref_input_, txfm1d_size_, txfm1d_size_);
+      if (lr_flip_ && ud_flip_) {
+        libaom_test::fliplrud(ref_input_, tx_width_, tx_height_, tx_width_);
+      } else if (lr_flip_) {
+        libaom_test::fliplr(ref_input_, tx_width_, tx_height_, tx_width_);
+      } else if (ud_flip_) {
+        libaom_test::flipud(ref_input_, tx_width_, tx_height_, tx_width_);
+      }
 
-      reference_hybrid_2d(ref_input_, ref_output_, txfm1d_size_, type0_,
-                          type1_);
+      libaom_test::reference_hybrid_2d(ref_input_, ref_output_, tx_type_,
+                                       tx_size_);
 
+      double actual_max_error = 0;
       for (int ni = 0; ni < txfm2d_size_; ++ni) {
-        ref_output_[ni] = round(ref_output_[ni] * amplify_factor_);
-        EXPECT_GE(max_error_,
-                  fabs(output_[ni] - ref_output_[ni]) / amplify_factor_);
+        ref_output_[ni] = round(ref_output_[ni]);
+        const double this_error =
+            fabs(output_[ni] - ref_output_[ni]) / amplify_factor_;
+        actual_max_error = AOMMAX(actual_max_error, this_error);
+      }
+      EXPECT_GE(max_error_, actual_max_error)
+          << "tx_size = " << tx_size_ << ", tx_type = " << tx_type_;
+      if (actual_max_error > max_error_) {  // exit early.
+        break;
       }
+
       avg_abs_error += compute_avg_abs_error<int32_t, double>(
           output_, ref_output_, txfm2d_size_);
     }
 
     avg_abs_error /= amplify_factor_;
     avg_abs_error /= count_;
-    // max_abs_avg_error comes from upper bound of avg_abs_error
-    // printf("type0: %d type1: %d txfm_size: %d accuracy_avg_abs_error:
-    // %f\n", type0_, type1_, txfm1d_size_, avg_abs_error);
-    EXPECT_GE(max_avg_error_, avg_abs_error);
+    EXPECT_GE(max_avg_error_, avg_abs_error)
+        << "tx_size = " << tx_size_ << ", tx_type = " << tx_type_;
   }
 
   virtual void TearDown() {
@@ -119,11 +122,10 @@ class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
   double amplify_factor_;
   TX_TYPE tx_type_;
   TX_SIZE tx_size_;
-  int txfm1d_size_;
+  int tx_width_;
+  int tx_height_;
   int txfm2d_size_;
-  Fwd_Txfm2d_Func fwd_txfm_;
-  TYPE_TXFM type0_;
-  TYPE_TXFM type1_;
+  FwdTxfm2dFunc fwd_txfm_;
   int16_t *input_;
   int32_t *output_;
   double *ref_input_;
@@ -132,76 +134,209 @@ class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
   int lr_flip_;  // flip left to right
 };
 
-TEST_P(AV1FwdTxfm2d, RunFwdAccuracyCheck) { RunFwdAccuracyCheck(); }
-const AV1FwdTxfm2dParam av1_fwd_txfm2d_param_c[] = {
-#if CONFIG_EXT_TX
-  AV1FwdTxfm2dParam(FLIPADST_DCT, TX_4X4, 2, 0.2),
-  AV1FwdTxfm2dParam(DCT_FLIPADST, TX_4X4, 2, 0.2),
-  AV1FwdTxfm2dParam(FLIPADST_FLIPADST, TX_4X4, 2, 0.2),
-  AV1FwdTxfm2dParam(ADST_FLIPADST, TX_4X4, 2, 0.2),
-  AV1FwdTxfm2dParam(FLIPADST_ADST, TX_4X4, 2, 0.2),
-  AV1FwdTxfm2dParam(FLIPADST_DCT, TX_8X8, 5, 0.6),
-  AV1FwdTxfm2dParam(DCT_FLIPADST, TX_8X8, 5, 0.6),
-  AV1FwdTxfm2dParam(FLIPADST_FLIPADST, TX_8X8, 5, 0.6),
-  AV1FwdTxfm2dParam(ADST_FLIPADST, TX_8X8, 5, 0.6),
-  AV1FwdTxfm2dParam(FLIPADST_ADST, TX_8X8, 5, 0.6),
-  AV1FwdTxfm2dParam(FLIPADST_DCT, TX_16X16, 11, 1.5),
-  AV1FwdTxfm2dParam(DCT_FLIPADST, TX_16X16, 11, 1.5),
-  AV1FwdTxfm2dParam(FLIPADST_FLIPADST, TX_16X16, 11, 1.5),
-  AV1FwdTxfm2dParam(ADST_FLIPADST, TX_16X16, 11, 1.5),
-  AV1FwdTxfm2dParam(FLIPADST_ADST, TX_16X16, 11, 1.5),
-  AV1FwdTxfm2dParam(FLIPADST_DCT, TX_32X32, 70, 7),
-  AV1FwdTxfm2dParam(DCT_FLIPADST, TX_32X32, 70, 7),
-  AV1FwdTxfm2dParam(FLIPADST_FLIPADST, TX_32X32, 70, 7),
-  AV1FwdTxfm2dParam(ADST_FLIPADST, TX_32X32, 70, 7),
-  AV1FwdTxfm2dParam(FLIPADST_ADST, TX_32X32, 70, 7),
-#endif
-  AV1FwdTxfm2dParam(DCT_DCT, TX_4X4, 2, 0.2),
-  AV1FwdTxfm2dParam(ADST_DCT, TX_4X4, 2, 0.2),
-  AV1FwdTxfm2dParam(DCT_ADST, TX_4X4, 2, 0.2),
-  AV1FwdTxfm2dParam(ADST_ADST, TX_4X4, 2, 0.2),
-  AV1FwdTxfm2dParam(DCT_DCT, TX_8X8, 5, 0.6),
-  AV1FwdTxfm2dParam(ADST_DCT, TX_8X8, 5, 0.6),
-  AV1FwdTxfm2dParam(DCT_ADST, TX_8X8, 5, 0.6),
-  AV1FwdTxfm2dParam(ADST_ADST, TX_8X8, 5, 0.6),
-  AV1FwdTxfm2dParam(DCT_DCT, TX_16X16, 11, 1.5),
-  AV1FwdTxfm2dParam(ADST_DCT, TX_16X16, 11, 1.5),
-  AV1FwdTxfm2dParam(DCT_ADST, TX_16X16, 11, 1.5),
-  AV1FwdTxfm2dParam(ADST_ADST, TX_16X16, 11, 1.5),
-  AV1FwdTxfm2dParam(DCT_DCT, TX_32X32, 70, 7),
-  AV1FwdTxfm2dParam(ADST_DCT, TX_32X32, 70, 7),
-  AV1FwdTxfm2dParam(DCT_ADST, TX_32X32, 70, 7),
-  AV1FwdTxfm2dParam(ADST_ADST, TX_32X32, 70, 7)
+static double avg_error_ls[TX_SIZES_ALL] = {
+  0.5,   // 4x4 transform
+  0.5,   // 8x8 transform
+  1.2,   // 16x16 transform
+  6.1,   // 32x32 transform
+  3.4,   // 64x64 transform
+  0.57,  // 4x8 transform
+  0.68,  // 8x4 transform
+  0.92,  // 8x16 transform
+  1.1,   // 16x8 transform
+  4.1,   // 16x32 transform
+  6,     // 32x16 transform
+  3.5,   // 32x64 transform
+  5.7,   // 64x32 transform
+  0.6,   // 4x16 transform
+  0.9,   // 16x4 transform
+  1.2,   // 8x32 transform
+  1.7,   // 32x8 transform
+  2.0,   // 16x64 transform
+  4.7,   // 64x16 transform
 };
 
+static double max_error_ls[TX_SIZES_ALL] = {
+  3,    // 4x4 transform
+  5,    // 8x8 transform
+  11,   // 16x16 transform
+  70,   // 32x32 transform
+  64,   // 64x64 transform
+  3.9,  // 4x8 transform
+  4.3,  // 8x4 transform
+  12,   // 8x16 transform
+  12,   // 16x8 transform
+  32,   // 16x32 transform
+  46,   // 32x16 transform
+  136,  // 32x64 transform
+  136,  // 64x32 transform
+  5,    // 4x16 transform
+  6,    // 16x4 transform
+  21,   // 8x32 transform
+  13,   // 32x8 transform
+  30,   // 16x64 transform
+  36,   // 64x16 transform
+};
+
+vector<AV1FwdTxfm2dParam> GetTxfm2dParamList() {
+  vector<AV1FwdTxfm2dParam> param_list;
+  for (int s = 0; s < TX_SIZES; ++s) {
+    const double max_error = max_error_ls[s];
+    const double avg_error = avg_error_ls[s];
+    for (int t = 0; t < TX_TYPES; ++t) {
+      const TX_TYPE tx_type = static_cast<TX_TYPE>(t);
+      const TX_SIZE tx_size = static_cast<TX_SIZE>(s);
+      if (libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) {
+        param_list.push_back(
+            AV1FwdTxfm2dParam(tx_type, tx_size, max_error, avg_error));
+      }
+    }
+  }
+  return param_list;
+}
+
 INSTANTIATE_TEST_CASE_P(C, AV1FwdTxfm2d,
-                        ::testing::ValuesIn(av1_fwd_txfm2d_param_c));
+                        ::testing::ValuesIn(GetTxfm2dParamList()));
+
+TEST_P(AV1FwdTxfm2d, RunFwdAccuracyCheck) { RunFwdAccuracyCheck(); }
 
 TEST(AV1FwdTxfm2d, CfgTest) {
   for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) {
     int bd = libaom_test::bd_arr[bd_idx];
     int8_t low_range = libaom_test::low_range_arr[bd_idx];
     int8_t high_range = libaom_test::high_range_arr[bd_idx];
-    // TODO(angiebird): include rect txfm in this test
-    for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+    for (int tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
       for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
-        TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(
-            static_cast<TX_TYPE>(tx_type), static_cast<TX_SIZE>(tx_size));
+        if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(tx_size),
+                                           static_cast<TX_TYPE>(tx_type)) ==
+            false) {
+          continue;
+        }
+        TXFM_2D_FLIP_CFG cfg;
+        av1_get_fwd_txfm_cfg(static_cast<TX_TYPE>(tx_type),
+                             static_cast<TX_SIZE>(tx_size), &cfg);
         int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
         int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
         av1_gen_fwd_stage_range(stage_range_col, stage_range_row, &cfg, bd);
-        const TXFM_1D_CFG *col_cfg = cfg.col_cfg;
-        const TXFM_1D_CFG *row_cfg = cfg.row_cfg;
-        libaom_test::txfm_stage_range_check(stage_range_col, col_cfg->stage_num,
-                                            col_cfg->cos_bit, low_range,
+        libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col,
+                                            cfg.cos_bit_col, low_range,
                                             high_range);
-        libaom_test::txfm_stage_range_check(stage_range_row, row_cfg->stage_num,
-                                            row_cfg->cos_bit, low_range,
+        libaom_test::txfm_stage_range_check(stage_range_row, cfg.stage_num_row,
+                                            cfg.cos_bit_row, low_range,
                                             high_range);
       }
     }
   }
 }
 
-#endif  // CONFIG_HIGHBITDEPTH
+typedef void (*lowbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff,
+                                    int diff_stride, TxfmParam *txfm_param);
+
+void AV1FwdTxfm2dMatchTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
+  const int bd = 8;
+  TxfmParam param;
+  memset(&param, 0, sizeof(param));
+  const int rows = tx_size_high[tx_size];
+  const int cols = tx_size_wide[tx_size];
+  // printf("%d x %d\n", cols, rows);
+  for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+    if (libaom_test::IsTxSizeTypeValid(
+            tx_size, static_cast<TX_TYPE>(tx_type)) == false) {
+      continue;
+    }
+
+    FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+    if (ref_func != NULL) {
+      DECLARE_ALIGNED(16, int16_t, input[64 * 64]) = { 0 };
+      DECLARE_ALIGNED(16, int32_t, output[64 * 64]);
+      DECLARE_ALIGNED(16, int32_t, ref_output[64 * 64]);
+      int input_stride = 64;
+      ACMRandom rnd(ACMRandom::DeterministicSeed());
+      for (int cnt = 0; cnt < 500; ++cnt) {
+        if (cnt == 0) {
+          for (int r = 0; r < rows; ++r) {
+            for (int c = 0; c < cols; ++c) {
+              input[r * input_stride + c] = (1 << bd) - 1;
+            }
+          }
+        } else {
+          for (int r = 0; r < rows; ++r) {
+            for (int c = 0; c < cols; ++c) {
+              input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+            }
+          }
+        }
+        param.tx_type = (TX_TYPE)tx_type;
+        param.tx_size = (TX_SIZE)tx_size;
+        param.tx_set_type = EXT_TX_SET_ALL16;
+        param.bd = bd;
+        ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+        target_func(input, output, input_stride, &param);
+        const int check_rows = AOMMIN(32, rows);
+        const int check_cols = AOMMIN(32, rows * cols / check_rows);
+        for (int r = 0; r < check_rows; ++r) {
+          for (int c = 0; c < check_cols; ++c) {
+            ASSERT_EQ(ref_output[r * check_cols + c],
+                      output[r * check_cols + c])
+                << "[" << r << "," << c << "] cnt:" << cnt
+                << " tx_size: " << tx_size << " tx_type: " << tx_type;
+          }
+        }
+      }
+    }
+  }
+}
+
+typedef ::testing::tuple<TX_SIZE, lowbd_fwd_txfm_func> LbdFwdTxfm2dParam;
+
+class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {};
+
+TEST_P(AV1FwdTxfm2dTest, match) {
+  AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+using ::testing::Combine;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if HAVE_SSE2
+static TX_SIZE fwd_txfm_for_sse2[] = {
+  TX_4X4,
+  TX_8X8,
+  TX_16X16,
+  TX_32X32,
+  // TX_64X64,
+  TX_4X8,
+  TX_8X4,
+  TX_8X16,
+  TX_16X8,
+  TX_16X32,
+  TX_32X16,
+  // TX_32X64,
+  // TX_64X32,
+  TX_4X16,
+  TX_16X4,
+  TX_8X32,
+  TX_32X8,
+  TX_16X64,
+  TX_64X16,
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, AV1FwdTxfm2dTest,
+                        Combine(ValuesIn(fwd_txfm_for_sse2),
+                                Values(av1_lowbd_fwd_txfm_sse2)));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE4_1
+static TX_SIZE fwd_txfm_for_sse41[] = {
+  TX_4X4,
+  TX_64X64,
+  TX_32X64,
+  TX_64X32,
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1FwdTxfm2dTest,
+                        Combine(ValuesIn(fwd_txfm_for_sse41),
+                                Values(av1_lowbd_fwd_txfm_sse4_1)));
+#endif  // HAVE_SSE4_1
 }  // namespace
diff --git a/third_party/aom/test/av1_highbd_iht_test.cc b/third_party/aom/test/av1_highbd_iht_test.cc
index 45df5ed84..8cadc85e7 100644
--- a/third_party/aom/test/av1_highbd_iht_test.cc
+++ b/third_party/aom/test/av1_highbd_iht_test.cc
@@ -11,7 +11,8 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -22,7 +23,7 @@
 
 namespace {
 
-using std::tr1::tuple;
+using ::testing::tuple;
 using libaom_test::ACMRandom;
 
 typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
@@ -88,6 +89,8 @@ class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
       return 16;
     } else if (1024 == num_coeffs_) {
       return 32;
+    } else if (4096 == num_coeffs_) {
+      return 64;
     } else {
       return 0;
     }
@@ -133,28 +136,24 @@ void AV1HighbdInvHTNxN::RunBitexactCheck() {
 
 TEST_P(AV1HighbdInvHTNxN, InvTransResultCheck) { RunBitexactCheck(); }
 
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
 
-#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH && \
-    !(CONFIG_DAALA_DCT4 && CONFIG_DAALA_DCT8 && CONFIG_DAALA_DCT16)
-#if !CONFIG_DAALA_DCT4
+#if HAVE_SSE4_1
 #define PARAM_LIST_4X4                                   \
   &av1_fwd_txfm2d_4x4_c, &av1_inv_txfm2d_add_4x4_sse4_1, \
       &av1_inv_txfm2d_add_4x4_c, 16
-#endif
-#if !CONFIG_DAALA_DCT8
 #define PARAM_LIST_8X8                                   \
   &av1_fwd_txfm2d_8x8_c, &av1_inv_txfm2d_add_8x8_sse4_1, \
       &av1_inv_txfm2d_add_8x8_c, 64
-#endif
-#if !CONFIG_DAALA_DCT16
 #define PARAM_LIST_16X16                                     \
   &av1_fwd_txfm2d_16x16_c, &av1_inv_txfm2d_add_16x16_sse4_1, \
       &av1_inv_txfm2d_add_16x16_c, 256
-#endif
+#define PARAM_LIST_64X64                                     \
+  &av1_fwd_txfm2d_64x64_c, &av1_inv_txfm2d_add_64x64_sse4_1, \
+      &av1_inv_txfm2d_add_64x64_c, 4096
+
 const IHbdHtParam kArrayIhtParam[] = {
-// 16x16
-#if !CONFIG_DAALA_DCT16
+  // 16x16
   make_tuple(PARAM_LIST_16X16, DCT_DCT, 10),
   make_tuple(PARAM_LIST_16X16, DCT_DCT, 12),
   make_tuple(PARAM_LIST_16X16, ADST_DCT, 10),
@@ -163,7 +162,6 @@ const IHbdHtParam kArrayIhtParam[] = {
   make_tuple(PARAM_LIST_16X16, DCT_ADST, 12),
   make_tuple(PARAM_LIST_16X16, ADST_ADST, 10),
   make_tuple(PARAM_LIST_16X16, ADST_ADST, 12),
-#if CONFIG_EXT_TX
   make_tuple(PARAM_LIST_16X16, FLIPADST_DCT, 10),
   make_tuple(PARAM_LIST_16X16, FLIPADST_DCT, 12),
   make_tuple(PARAM_LIST_16X16, DCT_FLIPADST, 10),
@@ -174,10 +172,7 @@ const IHbdHtParam kArrayIhtParam[] = {
   make_tuple(PARAM_LIST_16X16, ADST_FLIPADST, 12),
   make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 10),
   make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 12),
-#endif
-#endif
-// 8x8
-#if !CONFIG_DAALA_DCT8
+  // 8x8
   make_tuple(PARAM_LIST_8X8, DCT_DCT, 10),
   make_tuple(PARAM_LIST_8X8, DCT_DCT, 12),
   make_tuple(PARAM_LIST_8X8, ADST_DCT, 10),
@@ -186,7 +181,6 @@ const IHbdHtParam kArrayIhtParam[] = {
   make_tuple(PARAM_LIST_8X8, DCT_ADST, 12),
   make_tuple(PARAM_LIST_8X8, ADST_ADST, 10),
   make_tuple(PARAM_LIST_8X8, ADST_ADST, 12),
-#if CONFIG_EXT_TX
   make_tuple(PARAM_LIST_8X8, FLIPADST_DCT, 10),
   make_tuple(PARAM_LIST_8X8, FLIPADST_DCT, 12),
   make_tuple(PARAM_LIST_8X8, DCT_FLIPADST, 10),
@@ -197,10 +191,7 @@ const IHbdHtParam kArrayIhtParam[] = {
   make_tuple(PARAM_LIST_8X8, ADST_FLIPADST, 12),
   make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 10),
   make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 12),
-#endif
-#endif
-// 4x4
-#if !CONFIG_DAALA_DCT4
+  // 4x4
   make_tuple(PARAM_LIST_4X4, DCT_DCT, 10),
   make_tuple(PARAM_LIST_4X4, DCT_DCT, 12),
   make_tuple(PARAM_LIST_4X4, ADST_DCT, 10),
@@ -209,7 +200,6 @@ const IHbdHtParam kArrayIhtParam[] = {
   make_tuple(PARAM_LIST_4X4, DCT_ADST, 12),
   make_tuple(PARAM_LIST_4X4, ADST_ADST, 10),
   make_tuple(PARAM_LIST_4X4, ADST_ADST, 12),
-#if CONFIG_EXT_TX
   make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 10),
   make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 12),
   make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 10),
@@ -220,16 +210,15 @@ const IHbdHtParam kArrayIhtParam[] = {
   make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 12),
   make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10),
   make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12),
-#endif
-#endif
+  make_tuple(PARAM_LIST_64X64, DCT_DCT, 10),
+  make_tuple(PARAM_LIST_64X64, DCT_DCT, 12),
 };
 
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvHTNxN,
                         ::testing::ValuesIn(kArrayIhtParam));
-#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH &&
-        //  !(CONFIG_DAALA_DCT4 && CONFIG_DAALA_DCT8 && CONFIG_DAALA_DCT16)
+#endif  // HAVE_SSE4_1
 
-#if HAVE_AVX2 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT32
+#if HAVE_AVX2
 #define PARAM_LIST_32X32                                   \
   &av1_fwd_txfm2d_32x32_c, &av1_inv_txfm2d_add_32x32_avx2, \
       &av1_inv_txfm2d_add_32x32_c, 1024
@@ -243,5 +232,5 @@ const IHbdHtParam kArrayIhtParam32x32[] = {
 INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdInvHTNxN,
                         ::testing::ValuesIn(kArrayIhtParam32x32));
 
-#endif  // HAVE_AVX2 && CONFIG_HIGHBITDEPTH
+#endif  // HAVE_AVX2
 }  // namespace
diff --git a/third_party/aom/test/av1_horz_only_frame_superres_test.cc b/third_party/aom/test/av1_horz_only_frame_superres_test.cc
new file mode 100644
index 000000000..fd77ef35d
--- /dev/null
+++ b/third_party/aom/test/av1_horz_only_frame_superres_test.cc
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+const int kTestIters = 10;
+const int kPerfIters = 1000;
+
+const int kVPad = 32;
+const int kHPad = 32;
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+
+template <typename Pixel>
+class TestImage {
+ public:
+  TestImage(int w_src, int h, int superres_denom, int x0, int bd)
+      : w_src_(w_src), h_(h), superres_denom_(superres_denom), x0_(x0),
+        bd_(bd) {
+    assert(bd < 16);
+    assert(bd <= 8 * static_cast<int>(sizeof(Pixel)));
+    assert(9 <= superres_denom && superres_denom <= 16);
+    assert(SCALE_NUMERATOR == 8);
+    assert(0 <= x0_ && x0_ <= RS_SCALE_SUBPEL_MASK);
+
+    w_dst_ = w_src_;
+    av1_calculate_unscaled_superres_size(&w_dst_, NULL, superres_denom);
+
+    src_stride_ = ALIGN_POWER_OF_TWO(w_src_ + 2 * kHPad, 4);
+    dst_stride_ = ALIGN_POWER_OF_TWO(w_dst_ + 2 * kHPad, 4);
+
+    // Allocate image data
+    src_data_.resize(2 * src_block_size());
+    dst_data_.resize(2 * dst_block_size());
+  }
+
+  void Initialize(ACMRandom *rnd);
+  void Check() const;
+
+  int src_stride() const { return src_stride_; }
+  int dst_stride() const { return dst_stride_; }
+
+  int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); }
+  int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); }
+
+  int src_width() const { return w_src_; }
+  int dst_width() const { return w_dst_; }
+  int height() const { return h_; }
+  int x0() const { return x0_; }
+
+  const Pixel *GetSrcData(bool ref, bool borders) const {
+    const Pixel *block = &src_data_[ref ? 0 : src_block_size()];
+    return borders ? block : block + kHPad + src_stride_ * kVPad;
+  }
+
+  Pixel *GetDstData(bool ref, bool borders) {
+    Pixel *block = &dst_data_[ref ? 0 : dst_block_size()];
+    return borders ? block : block + kHPad + dst_stride_ * kVPad;
+  }
+
+ private:
+  int w_src_, w_dst_, h_, superres_denom_, x0_, bd_;
+  int src_stride_, dst_stride_;
+
+  std::vector<Pixel> src_data_;
+  std::vector<Pixel> dst_data_;
+};
+
+template <typename Pixel>
+void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) {
+  if (!trash) {
+    memset(data, 0, sizeof(*data) * num_pixels);
+    return;
+  }
+  const Pixel mask = (1 << bd) - 1;
+  for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask;
+}
+
+template <typename Pixel>
+void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd,
+                 bool trash_edges, Pixel *data) {
+  assert(rnd);
+  const Pixel mask = (1 << bd) - 1;
+
+  // Fill in the first buffer with random data
+  // Top border
+  FillEdge(rnd, stride * kVPad, bd, trash_edges, data);
+  for (int r = 0; r < h; ++r) {
+    Pixel *row_data = data + (kVPad + r) * stride;
+    // Left border, contents, right border
+    FillEdge(rnd, kHPad, bd, trash_edges, row_data);
+    for (int c = 0; c < w; ++c) row_data[kHPad + c] = rnd->Rand16() & mask;
+    FillEdge(rnd, kHPad, bd, trash_edges, row_data + kHPad + w);
+  }
+  // Bottom border
+  FillEdge(rnd, stride * kVPad, bd, trash_edges, data + stride * (kVPad + h));
+
+  const int bpp = sizeof(*data);
+  const int block_elts = stride * (h + 2 * kVPad);
+  const int block_size = bpp * block_elts;
+
+  // Now copy that to the second buffer
+  memcpy(data + block_elts, data, block_size);
+}
+
+template <typename Pixel>
+void TestImage<Pixel>::Initialize(ACMRandom *rnd) {
+  PrepBuffers(rnd, w_src_, h_, src_stride_, bd_, false, &src_data_[0]);
+  PrepBuffers(rnd, w_dst_, h_, dst_stride_, bd_, true, &dst_data_[0]);
+}
+
+template <typename Pixel>
+void TestImage<Pixel>::Check() const {
+  const int num_pixels = dst_block_size();
+  const Pixel *ref_dst = &dst_data_[0];
+  const Pixel *tst_dst = &dst_data_[num_pixels];
+
+  // If memcmp returns 0, there's nothing to do.
+  if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) return;
+
+  // Otherwise, iterate through the buffer looking for differences, *ignoring
+  // the edges*
+  const int stride = dst_stride_;
+  for (int r = kVPad; r < h_ + kVPad; ++r) {
+    for (int c = kVPad; c < w_dst_ + kHPad; ++c) {
+      const int32_t ref_value = ref_dst[r * stride + c];
+      const int32_t tst_value = tst_dst[r * stride + c];
+
+      EXPECT_EQ(tst_value, ref_value)
+          << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad)
+          << ", superres_denom: " << superres_denom_ << ", height: " << h_
+          << ", src_width: " << w_src_ << ", dst_width: " << w_dst_
+          << ", x0: " << x0_;
+    }
+  }
+}
+
+template <typename Pixel>
+class ConvolveHorizRSTestBase : public ::testing::Test {
+ public:
+  ConvolveHorizRSTestBase() : image_(NULL) {}
+  virtual ~ConvolveHorizRSTestBase() {}
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+  // Implemented by subclasses (SetUp depends on the parameters passed
+  // in and RunOne depends on the function to be tested. These can't
+  // be templated for low/high bit depths because they have different
+  // numbers of parameters)
+  virtual void SetUp() = 0;
+  virtual void RunOne(bool ref) = 0;
+
+ protected:
+  void SetBitDepth(int bd) { bd_ = bd; }
+
+  void CorrectnessTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    for (int i = 0; i < kTestIters; ++i) {
+      for (int superres_denom = 9; superres_denom <= 16; superres_denom++) {
+        // Get a random height between 512 and 767
+        int height = rnd.Rand8() + 512;
+
+        // Get a random src width between 128 and 383
+        int width_src = rnd.Rand8() + 128;
+
+        // x0 is normally calculated by get_upscale_convolve_x0 in
+        // av1/common/resize.c. However, this test should work for
+        // any value of x0 between 0 and RS_SCALE_SUBPEL_MASK
+        // (inclusive), so we choose one at random.
+        int x0 = rnd.Rand16() % (RS_SCALE_SUBPEL_MASK + 1);
+
+        image_ =
+            new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+
+        Prep(&rnd);
+        RunOne(true);
+        RunOne(false);
+        image_->Check();
+
+        delete image_;
+      }
+    }
+  }
+
+  void SpeedTest() {
+    // Pick some specific parameters to test
+    int height = 767;
+    int width_src = 129;
+    int superres_denom = 13;
+    int x0 = RS_SCALE_SUBPEL_MASK >> 1;
+
+    image_ = new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    Prep(&rnd);
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(true);
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(false);
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+    std::cout << "[          ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: ConvolveHorizRSTest (Speed Test), SIMD slower than C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
+  }
+
+  void Prep(ACMRandom *rnd) {
+    assert(rnd);
+    image_->Initialize(rnd);
+  }
+
+  int bd_;
+  TestImage<Pixel> *image_;
+};
+
+typedef void (*LowBDConvolveHorizRsFunc)(const uint8_t *src, int src_stride,
+                                         uint8_t *dst, int dst_stride, int w,
+                                         int h, const int16_t *x_filters,
+                                         const int x0_qn, const int x_step_qn);
+
+// Test parameter list:
+//  <tst_fun_>
+typedef tuple<LowBDConvolveHorizRsFunc> LowBDParams;
+
+class LowBDConvolveHorizRSTest
+    : public ConvolveHorizRSTestBase<uint8_t>,
+      public ::testing::WithParamInterface<LowBDParams> {
+ public:
+  virtual ~LowBDConvolveHorizRSTest() {}
+
+  void SetUp() {
+    tst_fun_ = GET_PARAM(0);
+    const int bd = 8;
+    SetBitDepth(bd);
+  }
+
+  void RunOne(bool ref) {
+    const uint8_t *src = image_->GetSrcData(ref, false);
+    uint8_t *dst = image_->GetDstData(ref, false);
+    const int src_stride = image_->src_stride();
+    const int dst_stride = image_->dst_stride();
+    const int width_src = image_->src_width();
+    const int width_dst = image_->dst_width();
+    const int height = image_->height();
+    const int x0_qn = image_->x0();
+
+    const int32_t x_step_qn =
+        av1_get_upscale_convolve_step(width_src, width_dst);
+
+    if (ref) {
+      av1_convolve_horiz_rs_c(src, src_stride, dst, dst_stride, width_dst,
+                              height, &av1_resize_filter_normative[0][0], x0_qn,
+                              x_step_qn);
+    } else {
+      tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+               &av1_resize_filter_normative[0][0], x0_qn, x_step_qn);
+    }
+  }
+
+ private:
+  LowBDConvolveHorizRsFunc tst_fun_;
+};
+
+TEST_P(LowBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(LowBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, LowBDConvolveHorizRSTest,
+                        ::testing::Values(av1_convolve_horiz_rs_sse4_1));
+
+typedef void (*HighBDConvolveHorizRsFunc)(const uint16_t *src, int src_stride,
+                                          uint16_t *dst, int dst_stride, int w,
+                                          int h, const int16_t *x_filters,
+                                          const int x0_qn, const int x_step_qn,
+                                          int bd);
+
+// Test parameter list:
+//  <tst_fun_, bd_>
+typedef tuple<HighBDConvolveHorizRsFunc, int> HighBDParams;
+
+class HighBDConvolveHorizRSTest
+    : public ConvolveHorizRSTestBase<uint16_t>,
+      public ::testing::WithParamInterface<HighBDParams> {
+ public:
+  virtual ~HighBDConvolveHorizRSTest() {}
+
+  void SetUp() {
+    tst_fun_ = GET_PARAM(0);
+    const int bd = GET_PARAM(1);
+    SetBitDepth(bd);
+  }
+
+  void RunOne(bool ref) {
+    const uint16_t *src = image_->GetSrcData(ref, false);
+    uint16_t *dst = image_->GetDstData(ref, false);
+    const int src_stride = image_->src_stride();
+    const int dst_stride = image_->dst_stride();
+    const int width_src = image_->src_width();
+    const int width_dst = image_->dst_width();
+    const int height = image_->height();
+    const int x0_qn = image_->x0();
+
+    const int32_t x_step_qn =
+        av1_get_upscale_convolve_step(width_src, width_dst);
+
+    if (ref) {
+      av1_highbd_convolve_horiz_rs_c(
+          src, src_stride, dst, dst_stride, width_dst, height,
+          &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+    } else {
+      tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+               &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+    }
+  }
+
+ private:
+  HighBDConvolveHorizRsFunc tst_fun_;
+};
+
+const int kBDs[] = { 8, 10, 12 };
+
+TEST_P(HighBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(HighBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, HighBDConvolveHorizRSTest,
+    ::testing::Combine(::testing::Values(av1_highbd_convolve_horiz_rs_sse4_1),
+                       ::testing::ValuesIn(kBDs)));
+
+}  // namespace
diff --git a/third_party/aom/test/av1_inv_txfm1d_test.cc b/third_party/aom/test/av1_inv_txfm1d_test.cc
index b44c04116..bf3a44ed1 100644
--- a/third_party/aom/test/av1_inv_txfm1d_test.cc
+++ b/third_party/aom/test/av1_inv_txfm1d_test.cc
@@ -13,39 +13,35 @@
 
 #include "test/av1_txfm_test.h"
 #include "test/util.h"
-#include "av1/common/av1_fwd_txfm1d.h"
 #include "av1/common/av1_inv_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
 
 using libaom_test::ACMRandom;
 using libaom_test::input_base;
 
 namespace {
 const int txfm_type_num = 2;
-const int txfm_size_ls[5] = { 4, 8, 16, 32, 64 };
+const int txfm_size_ls[] = { 4, 8, 16, 32, 64 };
 
-const TxfmFunc fwd_txfm_func_ls[][2] = {
+const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = {
   { av1_fdct4_new, av1_fadst4_new },
   { av1_fdct8_new, av1_fadst8_new },
   { av1_fdct16_new, av1_fadst16_new },
-  { av1_fdct32_new, av1_fadst32_new },
-#if CONFIG_TX64X64
+  { av1_fdct32_new, NULL },
   { av1_fdct64_new, NULL },
-#endif
 };
 
-const TxfmFunc inv_txfm_func_ls[][2] = {
+const TxfmFunc inv_txfm_func_ls[][txfm_type_num] = {
   { av1_idct4_new, av1_iadst4_new },
   { av1_idct8_new, av1_iadst8_new },
   { av1_idct16_new, av1_iadst16_new },
-  { av1_idct32_new, av1_iadst32_new },
-#if CONFIG_TX64X64
+  { av1_idct32_new, NULL },
   { av1_idct64_new, NULL },
-#endif
 };
 
 // the maximum stage number of fwd/inv 1d dct/adst txfm is 12
-const int8_t cos_bit[12] = { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 };
-const int8_t range_bit[12] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 };
+const int8_t cos_bit = 13;
+const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 };
 
 void reference_idct_1d_int(const int32_t *in, int32_t *out, int size) {
   double input[64];
@@ -54,8 +50,11 @@ void reference_idct_1d_int(const int32_t *in, int32_t *out, int size) {
   double output[64];
   libaom_test::reference_idct_1d(input, output, size);
 
-  for (int i = 0; i < size; ++i)
+  for (int i = 0; i < size; ++i) {
+    ASSERT_GE(output[i], INT32_MIN);
+    ASSERT_LE(output[i], INT32_MAX);
     out[i] = static_cast<int32_t>(round(output[i]));
+  }
 }
 
 void random_matrix(int32_t *dst, int len, ACMRandom *rnd) {
@@ -73,24 +72,32 @@ void random_matrix(int32_t *dst, int len, ACMRandom *rnd) {
 TEST(av1_inv_txfm1d, InvAccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 20000;
-  const int max_error[] = { 6, 10, 19, 28 };
+  const int max_error[] = { 6, 10, 19, 31, 40 };
+  ASSERT_EQ(NELEMENTS(max_error), TX_SIZES);
+  ASSERT_EQ(NELEMENTS(inv_txfm_func_ls), TX_SIZES);
   for (int k = 0; k < count_test_block; ++k) {
     // choose a random transform to test
-    const int txfm_type = rnd.Rand8() % NELEMENTS(inv_txfm_func_ls);
-    const int txfm_size = txfm_size_ls[txfm_type];
-    const TxfmFunc txfm_func = inv_txfm_func_ls[txfm_type][0];
+    const TX_SIZE tx_size = static_cast<TX_SIZE>(rnd.Rand8() % TX_SIZES);
+    const int tx_size_pix = txfm_size_ls[tx_size];
+    const TxfmFunc inv_txfm_func = inv_txfm_func_ls[tx_size][0];
 
     int32_t input[64];
-    random_matrix(input, txfm_size, &rnd);
+    random_matrix(input, tx_size_pix, &rnd);
+
+    // 64x64 transform assumes last 32 values are zero.
+    memset(input + 32, 0, 32 * sizeof(input[0]));
 
     int32_t ref_output[64];
-    reference_idct_1d_int(input, ref_output, txfm_size);
+    reference_idct_1d_int(input, ref_output, tx_size_pix);
 
     int32_t output[64];
-    txfm_func(input, output, cos_bit, range_bit);
+    inv_txfm_func(input, output, cos_bit, range_bit);
 
-    for (int i = 0; i < txfm_size; ++i) {
-      EXPECT_LE(abs(output[i] - ref_output[i]), max_error[txfm_type]);
+    for (int i = 0; i < tx_size_pix; ++i) {
+      EXPECT_LE(abs(output[i] - ref_output[i]), max_error[tx_size])
+          << "tx_size = " << tx_size << ", i = " << i
+          << ", output[i] = " << output[i]
+          << ", ref_output[i] = " << ref_output[i];
     }
   }
 }
diff --git a/third_party/aom/test/av1_inv_txfm2d_test.cc b/third_party/aom/test/av1_inv_txfm2d_test.cc
index bccbdeebf..461e7ebcd 100644
--- a/third_party/aom/test/av1_inv_txfm2d_test.cc
+++ b/third_party/aom/test/av1_inv_txfm2d_test.cc
@@ -12,26 +12,35 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <vector>
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/scan.h"
 #include "test/acm_random.h"
-#include "test/util.h"
 #include "test/av1_txfm_test.h"
-#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "test/util.h"
 
 using libaom_test::ACMRandom;
-using libaom_test::input_base;
+using libaom_test::InvTxfm2dFunc;
+using libaom_test::LbdInvTxfm2dFunc;
 using libaom_test::bd;
 using libaom_test::compute_avg_abs_error;
-using libaom_test::Fwd_Txfm2d_Func;
-using libaom_test::Inv_Txfm2d_Func;
+using libaom_test::input_base;
+
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+
+using std::vector;
 
 namespace {
 
-#if CONFIG_HIGHBITDEPTH
 // AV1InvTxfm2dParam argument list:
 // tx_type_, tx_size_, max_error_, max_avg_error_
-typedef std::tr1::tuple<TX_TYPE, TX_SIZE, int, double> AV1InvTxfm2dParam;
+typedef ::testing::tuple<TX_TYPE, TX_SIZE, int, double> AV1InvTxfm2dParam;
 
 class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
  public:
@@ -46,171 +55,313 @@ class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
     int tx_w = tx_size_wide[tx_size_];
     int tx_h = tx_size_high[tx_size_];
     int txfm2d_size = tx_w * tx_h;
-    const Fwd_Txfm2d_Func fwd_txfm_func =
-        libaom_test::fwd_txfm_func_ls[tx_size_];
-    const Inv_Txfm2d_Func inv_txfm_func =
-        libaom_test::inv_txfm_func_ls[tx_size_];
+    const FwdTxfm2dFunc fwd_txfm_func = libaom_test::fwd_txfm_func_ls[tx_size_];
+    const InvTxfm2dFunc inv_txfm_func = libaom_test::inv_txfm_func_ls[tx_size_];
     double avg_abs_error = 0;
     ACMRandom rnd(ACMRandom::DeterministicSeed());
 
     const int count = 500;
 
     for (int ci = 0; ci < count; ci++) {
-      int16_t expected[64 * 64] = { 0 };
-      ASSERT_LT(txfm2d_size, NELEMENTS(expected));
+      DECLARE_ALIGNED(16, int16_t, input[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(input));
 
       for (int ni = 0; ni < txfm2d_size; ++ni) {
         if (ci == 0) {
           int extreme_input = input_base - 1;
-          expected[ni] = extreme_input;  // extreme case
+          input[ni] = extreme_input;  // extreme case
         } else {
-          expected[ni] = rnd.Rand16() % input_base;
+          input[ni] = rnd.Rand16() % input_base;
         }
       }
 
-      int32_t coeffs[64 * 64] = { 0 };
-      ASSERT_LT(txfm2d_size, NELEMENTS(coeffs));
-      fwd_txfm_func(expected, coeffs, tx_w, tx_type_, bd);
+      DECLARE_ALIGNED(16, uint16_t, expected[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(expected));
+      if (TxfmUsesApproximation()) {
+        // Compare reference forward HT + inverse HT vs forward HT + inverse HT.
+        double ref_input[64 * 64];
+        ASSERT_LE(txfm2d_size, NELEMENTS(ref_input));
+        for (int ni = 0; ni < txfm2d_size; ++ni) {
+          ref_input[ni] = input[ni];
+        }
+        double ref_coeffs[64 * 64] = { 0 };
+        ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs));
+        ASSERT_EQ(tx_type_, DCT_DCT);
+        libaom_test::reference_hybrid_2d(ref_input, ref_coeffs, tx_type_,
+                                         tx_size_);
+        DECLARE_ALIGNED(16, int32_t, ref_coeffs_int[64 * 64]) = { 0 };
+        ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs_int));
+        for (int ni = 0; ni < txfm2d_size; ++ni) {
+          ref_coeffs_int[ni] = (int32_t)round(ref_coeffs[ni]);
+        }
+        inv_txfm_func(ref_coeffs_int, expected, tx_w, tx_type_, bd);
+      } else {
+        // Compare original input vs forward HT + inverse HT.
+        for (int ni = 0; ni < txfm2d_size; ++ni) {
+          expected[ni] = input[ni];
+        }
+      }
+
+      DECLARE_ALIGNED(16, int32_t, coeffs[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(coeffs));
+      fwd_txfm_func(input, coeffs, tx_w, tx_type_, bd);
 
-      uint16_t actual[64 * 64] = { 0 };
-      ASSERT_LT(txfm2d_size, NELEMENTS(actual));
+      DECLARE_ALIGNED(16, uint16_t, actual[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(actual));
       inv_txfm_func(coeffs, actual, tx_w, tx_type_, bd);
 
+      double actual_max_error = 0;
       for (int ni = 0; ni < txfm2d_size; ++ni) {
-        EXPECT_GE(max_error_, abs(expected[ni] - actual[ni]));
+        const double this_error = abs(expected[ni] - actual[ni]);
+        actual_max_error = AOMMAX(actual_max_error, this_error);
+      }
+      EXPECT_GE(max_error_, actual_max_error)
+          << " tx_w: " << tx_w << " tx_h " << tx_h << " tx_type: " << tx_type_;
+      if (actual_max_error > max_error_) {  // exit early.
+        break;
       }
-      avg_abs_error += compute_avg_abs_error<int16_t, uint16_t>(
+      avg_abs_error += compute_avg_abs_error<uint16_t, uint16_t>(
           expected, actual, txfm2d_size);
     }
 
     avg_abs_error /= count;
-    // max_abs_avg_error comes from upper bound of
-    // printf("txfm1d_size: %d accuracy_avg_abs_error: %f\n",
-    // txfm1d_size_, avg_abs_error);
     EXPECT_GE(max_avg_error_, avg_abs_error)
         << " tx_w: " << tx_w << " tx_h " << tx_h << " tx_type: " << tx_type_;
   }
 
  private:
+  bool TxfmUsesApproximation() {
+    if (tx_size_wide[tx_size_] == 64 || tx_size_high[tx_size_] == 64) {
+      return true;
+    }
+    return false;
+  }
+
   int max_error_;
   double max_avg_error_;
   TX_TYPE tx_type_;
   TX_SIZE tx_size_;
 };
 
-TEST_P(AV1InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
+static int max_error_ls[TX_SIZES_ALL] = {
+  2,  // 4x4 transform
+  2,  // 8x8 transform
+  2,  // 16x16 transform
+  4,  // 32x32 transform
+  3,  // 64x64 transform
+  2,  // 4x8 transform
+  2,  // 8x4 transform
+  2,  // 8x16 transform
+  2,  // 16x8 transform
+  3,  // 16x32 transform
+  3,  // 32x16 transform
+  5,  // 32x64 transform
+  5,  // 64x32 transform
+  2,  // 4x16 transform
+  2,  // 16x4 transform
+  2,  // 8x32 transform
+  2,  // 32x8 transform
+  3,  // 16x64 transform
+  3,  // 64x16 transform
+};
 
-const AV1InvTxfm2dParam av1_inv_txfm2d_param[] = {
-#if CONFIG_EXT_TX
-#if CONFIG_RECT_TX
-  AV1InvTxfm2dParam(DCT_DCT, TX_4X8, 2, 0.007),
-  AV1InvTxfm2dParam(ADST_DCT, TX_4X8, 2, 0.012),
-  AV1InvTxfm2dParam(DCT_ADST, TX_4X8, 2, 0.012),
-  AV1InvTxfm2dParam(ADST_ADST, TX_4X8, 2, 0.012),
-  AV1InvTxfm2dParam(FLIPADST_DCT, TX_4X8, 2, 0.012),
-  AV1InvTxfm2dParam(DCT_FLIPADST, TX_4X8, 2, 0.012),
-  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_4X8, 2, 0.012),
-  AV1InvTxfm2dParam(ADST_FLIPADST, TX_4X8, 2, 0.012),
-  AV1InvTxfm2dParam(FLIPADST_ADST, TX_4X8, 2, 0.012),
-
-  AV1InvTxfm2dParam(DCT_DCT, TX_8X4, 2, 0.007),
-  AV1InvTxfm2dParam(ADST_DCT, TX_8X4, 2, 0.012),
-  AV1InvTxfm2dParam(DCT_ADST, TX_8X4, 2, 0.012),
-  AV1InvTxfm2dParam(ADST_ADST, TX_8X4, 2, 0.012),
-  AV1InvTxfm2dParam(FLIPADST_DCT, TX_8X4, 2, 0.007),
-  AV1InvTxfm2dParam(DCT_FLIPADST, TX_8X4, 2, 0.012),
-  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_8X4, 2, 0.012),
-  AV1InvTxfm2dParam(ADST_FLIPADST, TX_8X4, 2, 0.012),
-  AV1InvTxfm2dParam(FLIPADST_ADST, TX_8X4, 2, 0.012),
-
-  AV1InvTxfm2dParam(DCT_DCT, TX_8X16, 2, 0.025),
-  AV1InvTxfm2dParam(ADST_DCT, TX_8X16, 2, 0.020),
-  AV1InvTxfm2dParam(DCT_ADST, TX_8X16, 2, 0.027),
-  AV1InvTxfm2dParam(ADST_ADST, TX_8X16, 2, 0.023),
-  AV1InvTxfm2dParam(FLIPADST_DCT, TX_8X16, 2, 0.020),
-  AV1InvTxfm2dParam(DCT_FLIPADST, TX_8X16, 2, 0.027),
-  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_8X16, 2, 0.032),
-  AV1InvTxfm2dParam(ADST_FLIPADST, TX_8X16, 2, 0.023),
-  AV1InvTxfm2dParam(FLIPADST_ADST, TX_8X16, 2, 0.023),
-
-  AV1InvTxfm2dParam(DCT_DCT, TX_16X8, 2, 0.007),
-  AV1InvTxfm2dParam(ADST_DCT, TX_16X8, 2, 0.012),
-  AV1InvTxfm2dParam(DCT_ADST, TX_16X8, 2, 0.024),
-  AV1InvTxfm2dParam(ADST_ADST, TX_16X8, 2, 0.033),
-  AV1InvTxfm2dParam(FLIPADST_DCT, TX_16X8, 2, 0.015),
-  AV1InvTxfm2dParam(DCT_FLIPADST, TX_16X8, 2, 0.032),
-  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_16X8, 2, 0.032),
-  AV1InvTxfm2dParam(ADST_FLIPADST, TX_16X8, 2, 0.033),
-  AV1InvTxfm2dParam(FLIPADST_ADST, TX_16X8, 2, 0.032),
-#endif
-  AV1InvTxfm2dParam(FLIPADST_DCT, TX_4X4, 2, 0.002),
-  AV1InvTxfm2dParam(DCT_FLIPADST, TX_4X4, 2, 0.002),
-  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_4X4, 2, 0.002),
-  AV1InvTxfm2dParam(ADST_FLIPADST, TX_4X4, 2, 0.002),
-  AV1InvTxfm2dParam(FLIPADST_ADST, TX_4X4, 2, 0.002),
-  AV1InvTxfm2dParam(FLIPADST_DCT, TX_8X8, 2, 0.02),
-  AV1InvTxfm2dParam(DCT_FLIPADST, TX_8X8, 2, 0.02),
-  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_8X8, 2, 0.02),
-  AV1InvTxfm2dParam(ADST_FLIPADST, TX_8X8, 2, 0.02),
-  AV1InvTxfm2dParam(FLIPADST_ADST, TX_8X8, 2, 0.02),
-  AV1InvTxfm2dParam(FLIPADST_DCT, TX_16X16, 2, 0.04),
-  AV1InvTxfm2dParam(DCT_FLIPADST, TX_16X16, 2, 0.04),
-  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_16X16, 11, 0.04),
-  AV1InvTxfm2dParam(ADST_FLIPADST, TX_16X16, 2, 0.04),
-  AV1InvTxfm2dParam(FLIPADST_ADST, TX_16X16, 2, 0.04),
-  AV1InvTxfm2dParam(FLIPADST_DCT, TX_32X32, 4, 0.4),
-  AV1InvTxfm2dParam(DCT_FLIPADST, TX_32X32, 4, 0.4),
-  AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_32X32, 4, 0.4),
-  AV1InvTxfm2dParam(ADST_FLIPADST, TX_32X32, 4, 0.4),
-  AV1InvTxfm2dParam(FLIPADST_ADST, TX_32X32, 4, 0.4),
-#endif
-  AV1InvTxfm2dParam(DCT_DCT, TX_4X4, 2, 0.002),
-  AV1InvTxfm2dParam(ADST_DCT, TX_4X4, 2, 0.002),
-  AV1InvTxfm2dParam(DCT_ADST, TX_4X4, 2, 0.002),
-  AV1InvTxfm2dParam(ADST_ADST, TX_4X4, 2, 0.002),
-  AV1InvTxfm2dParam(DCT_DCT, TX_8X8, 2, 0.02),
-  AV1InvTxfm2dParam(ADST_DCT, TX_8X8, 2, 0.02),
-  AV1InvTxfm2dParam(DCT_ADST, TX_8X8, 2, 0.02),
-  AV1InvTxfm2dParam(ADST_ADST, TX_8X8, 2, 0.02),
-  AV1InvTxfm2dParam(DCT_DCT, TX_16X16, 2, 0.04),
-  AV1InvTxfm2dParam(ADST_DCT, TX_16X16, 2, 0.04),
-  AV1InvTxfm2dParam(DCT_ADST, TX_16X16, 2, 0.04),
-  AV1InvTxfm2dParam(ADST_ADST, TX_16X16, 2, 0.04),
-  AV1InvTxfm2dParam(DCT_DCT, TX_32X32, 4, 0.4),
-  AV1InvTxfm2dParam(ADST_DCT, TX_32X32, 4, 0.4),
-  AV1InvTxfm2dParam(DCT_ADST, TX_32X32, 4, 0.4),
-  AV1InvTxfm2dParam(ADST_ADST, TX_32X32, 4, 0.4)
+static double avg_error_ls[TX_SIZES_ALL] = {
+  0.002,  // 4x4 transform
+  0.05,   // 8x8 transform
+  0.07,   // 16x16 transform
+  0.4,    // 32x32 transform
+  0.3,    // 64x64 transform
+  0.02,   // 4x8 transform
+  0.02,   // 8x4 transform
+  0.04,   // 8x16 transform
+  0.07,   // 16x8 transform
+  0.4,    // 16x32 transform
+  0.5,    // 32x16 transform
+  0.38,   // 32x64 transform
+  0.39,   // 64x32 transform
+  0.2,    // 4x16 transform
+  0.2,    // 16x4 transform
+  0.2,    // 8x32 transform
+  0.2,    // 32x8 transform
+  0.38,   // 16x64 transform
+  0.38,   // 64x16 transform
 };
 
+vector<AV1InvTxfm2dParam> GetInvTxfm2dParamList() {
+  vector<AV1InvTxfm2dParam> param_list;
+  for (int s = 0; s < TX_SIZES; ++s) {
+    const int max_error = max_error_ls[s];
+    const double avg_error = avg_error_ls[s];
+    for (int t = 0; t < TX_TYPES; ++t) {
+      const TX_TYPE tx_type = static_cast<TX_TYPE>(t);
+      const TX_SIZE tx_size = static_cast<TX_SIZE>(s);
+      if (libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) {
+        param_list.push_back(
+            AV1InvTxfm2dParam(tx_type, tx_size, max_error, avg_error));
+      }
+    }
+  }
+  return param_list;
+}
+
 INSTANTIATE_TEST_CASE_P(C, AV1InvTxfm2d,
-                        ::testing::ValuesIn(av1_inv_txfm2d_param));
+                        ::testing::ValuesIn(GetInvTxfm2dParamList()));
+
+TEST_P(AV1InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
 
 TEST(AV1InvTxfm2d, CfgTest) {
   for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) {
     int bd = libaom_test::bd_arr[bd_idx];
     int8_t low_range = libaom_test::low_range_arr[bd_idx];
     int8_t high_range = libaom_test::high_range_arr[bd_idx];
-    // TODO(angiebird): include rect txfm in this test
-    for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+    for (int tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
       for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
-        TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_cfg(
-            static_cast<TX_TYPE>(tx_type), static_cast<TX_SIZE>(tx_size));
+        if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(tx_size),
+                                           static_cast<TX_TYPE>(tx_type)) ==
+            false) {
+          continue;
+        }
+        TXFM_2D_FLIP_CFG cfg;
+        av1_get_inv_txfm_cfg(static_cast<TX_TYPE>(tx_type),
+                             static_cast<TX_SIZE>(tx_size), &cfg);
         int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
         int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
         av1_gen_inv_stage_range(stage_range_col, stage_range_row, &cfg,
-                                fwd_shift_sum[tx_size], bd);
-        const TXFM_1D_CFG *col_cfg = cfg.col_cfg;
-        const TXFM_1D_CFG *row_cfg = cfg.row_cfg;
-        libaom_test::txfm_stage_range_check(stage_range_col, col_cfg->stage_num,
-                                            col_cfg->cos_bit, low_range,
+                                (TX_SIZE)tx_size, bd);
+        libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col,
+                                            cfg.cos_bit_col, low_range,
                                             high_range);
-        libaom_test::txfm_stage_range_check(stage_range_row, row_cfg->stage_num,
-                                            row_cfg->cos_bit, low_range,
+        libaom_test::txfm_stage_range_check(stage_range_row, cfg.stage_num_row,
+                                            cfg.cos_bit_row, low_range,
                                             high_range);
       }
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
+
+typedef ::testing::tuple<const LbdInvTxfm2dFunc> AV1LbdInvTxfm2dParam;
+class AV1LbdInvTxfm2d : public ::testing::TestWithParam<AV1LbdInvTxfm2dParam> {
+ public:
+  virtual void SetUp() { target_func_ = GET_PARAM(0); }
+  void RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size, int run_times);
+
+ private:
+  LbdInvTxfm2dFunc target_func_;
+};
+
+void AV1LbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size,
+                                          int run_times) {
+  FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size];
+  InvTxfm2dFunc ref_func_ = libaom_test::inv_txfm_func_ls[tx_size];
+  if (fwd_func_ == NULL || ref_func_ == NULL || target_func_ == NULL) {
+    return;
+  }
+  const int bd = 8;
+  const int BLK_WIDTH = 64;
+  const int BLK_SIZE = BLK_WIDTH * BLK_WIDTH;
+  DECLARE_ALIGNED(16, int16_t, input[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(32, int32_t, inv_input[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(16, uint8_t, output[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, ref_output[BLK_SIZE]) = { 0 };
+  int stride = BLK_WIDTH;
+  int rows = tx_size_high[tx_size];
+  int cols = tx_size_wide[tx_size];
+  const int rows_nonezero = AOMMIN(32, rows);
+  const int cols_nonezero = AOMMIN(32, cols);
+  run_times /= (rows * cols);
+  run_times = AOMMAX(1, run_times);
+  const SCAN_ORDER *scan_order = get_default_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  const int16_t eobmax = rows_nonezero * cols_nonezero;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int randTimes = run_times == 1 ? (eobmax + 500) : 1;
+  for (int cnt = 0; cnt < randTimes; ++cnt) {
+    const int16_t max_in = (1 << (bd)) - 1;
+    for (int r = 0; r < BLK_WIDTH; ++r) {
+      for (int c = 0; c < BLK_WIDTH; ++c) {
+        input[r * cols + c] = (cnt == 0) ? max_in : rnd.Rand8Extremes();
+        output[r * stride + c] = (cnt == 0) ? 128 : rnd.Rand8();
+        ref_output[r * stride + c] = output[r * stride + c];
+      }
+    }
+    fwd_func_(input, inv_input, stride, tx_type, bd);
+
+    // produce eob input by setting high freq coeffs to zero
+    const int eob = AOMMIN(cnt + 1, eobmax);
+    for (int i = eob; i < eobmax; i++) {
+      inv_input[scan[i]] = 0;
+    }
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      ref_func_(inv_input, ref_output, stride, tx_type, bd);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(inv_input, output, stride, tx_type, tx_size, eob);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      printf("txfm[%d] %3dx%-3d:%7.2f/%7.2fns", tx_type, cols, rows, time1,
+             time2);
+      printf("(%3.2f)\n", time1 / time2);
+    }
+    for (int r = 0; r < rows; ++r) {
+      for (int c = 0; c < cols; ++c) {
+        uint8_t ref_value = static_cast<uint8_t>(ref_output[r * stride + c]);
+        ASSERT_EQ(ref_value, output[r * stride + c])
+            << "[" << r << "," << c << "] " << cnt
+            << " tx_size: " << static_cast<int>(tx_size)
+            << " tx_type: " << tx_type << " eob " << eob;
+      }
+    }
+  }
+}
+
+TEST_P(AV1LbdInvTxfm2d, match) {
+  for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+    for (int i = 0; i < (int)TX_TYPES; ++i) {
+      if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+                                         static_cast<TX_TYPE>(i))) {
+        RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+                            1);
+      }
+    }
+  }
+}
+
+TEST_P(AV1LbdInvTxfm2d, DISABLED_Speed) {
+  for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+    for (int i = 0; i < (int)TX_TYPES; ++i) {
+      if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+                                         static_cast<TX_TYPE>(i))) {
+        RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+                            10000000);
+      }
+    }
+  }
+}
+
+#if HAVE_SSSE3
+#if defined(_MSC_VER) || defined(__SSSE3__)
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1LbdInvTxfm2d,
+                        ::testing::Values(av1_lowbd_inv_txfm2d_add_ssse3));
+#endif  // _MSC_VER || __SSSE3__
+#endif  // HAVE_SSSE3
+
+#if HAVE_AVX2
+extern "C" void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input,
+                                              uint8_t *output, int stride,
+                                              TX_TYPE tx_type, TX_SIZE tx_size,
+                                              int eob);
+
+INSTANTIATE_TEST_CASE_P(AVX2, AV1LbdInvTxfm2d,
+                        ::testing::Values(av1_lowbd_inv_txfm2d_add_avx2));
+#endif  // HAVE_AVX2
 
 }  // namespace
diff --git a/third_party/aom/test/av1_inv_txfm_test.cc b/third_party/aom/test/av1_inv_txfm_test.cc
deleted file mode 100644
index 873e80685..000000000
--- a/third_party/aom/test/av1_inv_txfm_test.cc
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/av1_txfm_test.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "av1/common/blockd.h"
-#include "av1/common/scan.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/inv_txfm.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-
-typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
-
-class TransTestBase {
- public:
-  virtual ~TransTestBase() {}
-
- protected:
-  void RunInvAccuracyCheck() {
-    tran_low_t input[64];
-    tran_low_t output[64];
-    double ref_input[64];
-    double ref_output[64];
-
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    for (int ti = 0; ti < count_test_block; ++ti) {
-      for (int ni = 0; ni < txfm_size_; ++ni) {
-        input[ni] = rnd.Rand8() - rnd.Rand8();
-        ref_input[ni] = static_cast<double>(input[ni]);
-      }
-
-      inv_txfm_(input, output);
-      libaom_test::reference_idct_1d(ref_input, ref_output, txfm_size_);
-
-      for (int ni = 0; ni < txfm_size_; ++ni) {
-        EXPECT_LE(
-            abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
-            max_error_);
-      }
-    }
-  }
-
-  double max_error_;
-  int txfm_size_;
-  IdctFunc inv_txfm_;
-};
-
-typedef std::tr1::tuple<IdctFunc, int, int> IdctParam;
-class AV1InvTxfm : public TransTestBase,
-                   public ::testing::TestWithParam<IdctParam> {
- public:
-  virtual void SetUp() {
-    inv_txfm_ = GET_PARAM(0);
-    txfm_size_ = GET_PARAM(1);
-    max_error_ = GET_PARAM(2);
-  }
-  virtual void TearDown() {}
-};
-
-TEST_P(AV1InvTxfm, RunInvAccuracyCheck) { RunInvAccuracyCheck(); }
-
-INSTANTIATE_TEST_CASE_P(C, AV1InvTxfm,
-                        ::testing::Values(IdctParam(&aom_idct4_c, 4, 1),
-                                          IdctParam(&aom_idct8_c, 8, 2),
-                                          IdctParam(&aom_idct16_c, 16, 4),
-                                          IdctParam(&aom_idct32_c, 32, 6)));
-
-#if CONFIG_AV1_ENCODER
-typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, InvTxfmFunc, TX_SIZE, int>
-    PartialInvTxfmParam;
-#if !CONFIG_ADAPT_SCAN
-const int kMaxNumCoeffs = 1024;
-#endif
-class AV1PartialIDctTest
-    : public ::testing::TestWithParam<PartialInvTxfmParam> {
- public:
-  virtual ~AV1PartialIDctTest() {}
-  virtual void SetUp() {
-    ftxfm_ = GET_PARAM(0);
-    full_itxfm_ = GET_PARAM(1);
-    partial_itxfm_ = GET_PARAM(2);
-    tx_size_ = GET_PARAM(3);
-    last_nonzero_ = GET_PARAM(4);
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  int last_nonzero_;
-  TX_SIZE tx_size_;
-  FwdTxfmFunc ftxfm_;
-  InvTxfmFunc full_itxfm_;
-  InvTxfmFunc partial_itxfm_;
-};
-
-#if !CONFIG_ADAPT_SCAN
-static MB_MODE_INFO get_mbmi() {
-  MB_MODE_INFO mbmi;
-  mbmi.ref_frame[0] = LAST_FRAME;
-  assert(is_inter_block(&mbmi));
-  return mbmi;
-}
-
-TEST_P(AV1PartialIDctTest, RunQuantCheck) {
-  int size;
-  switch (tx_size_) {
-    case TX_4X4: size = 4; break;
-    case TX_8X8: size = 8; break;
-    case TX_16X16: size = 16; break;
-    case TX_32X32: size = 32; break;
-    default: FAIL() << "Wrong Size!"; break;
-  }
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
-
-  const int count_test_block = 1000;
-  const int block_size = size * size;
-
-  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
-
-  int max_error = 0;
-  for (int m = 0; m < count_test_block; ++m) {
-    // clear out destination buffer
-    memset(dst1, 0, sizeof(*dst1) * block_size);
-    memset(dst2, 0, sizeof(*dst2) * block_size);
-    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
-    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
-
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-    for (int n = 0; n < count_test_block; ++n) {
-      // Initialize a test block with input range [-255, 255].
-      if (n == 0) {
-        for (int j = 0; j < block_size; ++j) input_extreme_block[j] = 255;
-      } else if (n == 1) {
-        for (int j = 0; j < block_size; ++j) input_extreme_block[j] = -255;
-      } else {
-        for (int j = 0; j < block_size; ++j) {
-          input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
-        }
-      }
-
-      ftxfm_(input_extreme_block, output_ref_block, size);
-
-      // quantization with maximum allowed step sizes
-      test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
-      MB_MODE_INFO mbmi = get_mbmi();
-      for (int j = 1; j < last_nonzero_; ++j)
-        test_coef_block1[get_scan((const AV1_COMMON *)NULL, tx_size_, DCT_DCT,
-                                  &mbmi)
-                             ->scan[j]] = (output_ref_block[j] / 1828) * 1828;
-    }
-
-    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
-    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
-
-    for (int j = 0; j < block_size; ++j) {
-      const int diff = dst1[j] - dst2[j];
-      const int error = diff * diff;
-      if (max_error < error) max_error = error;
-    }
-  }
-
-  EXPECT_EQ(0, max_error)
-      << "Error: partial inverse transform produces different results";
-}
-
-TEST_P(AV1PartialIDctTest, ResultsMatch) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int size;
-  switch (tx_size_) {
-    case TX_4X4: size = 4; break;
-    case TX_8X8: size = 8; break;
-    case TX_16X16: size = 16; break;
-    case TX_32X32: size = 32; break;
-    default: FAIL() << "Wrong Size!"; break;
-  }
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
-  const int count_test_block = 1000;
-  const int max_coeff = 32766 / 4;
-  const int block_size = size * size;
-  int max_error = 0;
-  for (int i = 0; i < count_test_block; ++i) {
-    // clear out destination buffer
-    memset(dst1, 0, sizeof(*dst1) * block_size);
-    memset(dst2, 0, sizeof(*dst2) * block_size);
-    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
-    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
-    int max_energy_leftover = max_coeff * max_coeff;
-    for (int j = 0; j < last_nonzero_; ++j) {
-      int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
-                                          (rnd.Rand16() - 32768) / 65536);
-      max_energy_leftover -= coef * coef;
-      if (max_energy_leftover < 0) {
-        max_energy_leftover = 0;
-        coef = 0;
-      }
-      MB_MODE_INFO mbmi = get_mbmi();
-      test_coef_block1[get_scan((const AV1_COMMON *)NULL, tx_size_, DCT_DCT,
-                                &mbmi)
-                           ->scan[j]] = coef;
-    }
-
-    memcpy(test_coef_block2, test_coef_block1,
-           sizeof(*test_coef_block2) * block_size);
-
-    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
-    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
-
-    for (int j = 0; j < block_size; ++j) {
-      const int diff = dst1[j] - dst2[j];
-      const int error = diff * diff;
-      if (max_error < error) max_error = error;
-    }
-  }
-
-  EXPECT_EQ(0, max_error)
-      << "Error: partial inverse transform produces different results";
-}
-#endif
-using std::tr1::make_tuple;
-
-INSTANTIATE_TEST_CASE_P(
-    C, AV1PartialIDctTest,
-    ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
-                                 &aom_idct32x32_34_add_c, TX_32X32, 34),
-                      make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
-                                 &aom_idct32x32_1_add_c, TX_32X32, 1),
-                      make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
-                                 &aom_idct16x16_10_add_c, TX_16X16, 10),
-                      make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
-                                 &aom_idct16x16_1_add_c, TX_16X16, 1),
-                      make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
-                                 &aom_idct8x8_12_add_c, TX_8X8, 12),
-                      make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
-                                 &aom_idct8x8_1_add_c, TX_8X8, 1),
-                      make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c,
-                                 &aom_idct4x4_1_add_c, TX_4X4, 1)));
-#endif  // CONFIG_AV1_ENCODER
-}  // namespace
diff --git a/third_party/aom/test/av1_quantize_test.cc b/third_party/aom/test/av1_quantize_test.cc
index 36ac8c4ad..aaf093918 100644
--- a/third_party/aom/test/av1_quantize_test.cc
+++ b/third_party/aom/test/av1_quantize_test.cc
@@ -12,8 +12,9 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -22,8 +23,8 @@
 namespace {
 
 typedef void (*QuantizeFpFunc)(
-    const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan, int log_scale);
@@ -50,20 +51,19 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
   void RunQuantizeTest() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
-    DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
     DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
-    DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
     uint16_t eob;
     uint16_t ref_eob;
     int err_count_total = 0;
     int first_failure = -1;
-    int skip_block = 0;
     int count = params_.coeffCount;
     const TX_SIZE txSize = getTxSize(count);
     int log_scale = (txSize == TX_32X32);
@@ -86,20 +86,26 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
         quant_ptr[j] = (1 << 16) / dequant_ptr[j];
         round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
       }
-
-      quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
+      for (int j = 2; j < 8; ++j) {
+        zbin_ptr[j] = zbin_ptr[1];
+        quant_shift_ptr[j] = quant_shift_ptr[1];
+        dequant_ptr[j] = dequant_ptr[1];
+        quant_ptr[j] = quant_ptr[1];
+        round_ptr[j] = round_ptr[1];
+      }
+      quanFuncRef(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
                   quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
                   &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
 
       ASM_REGISTER_STATE_CHECK(
-          quanFunc(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
+          quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
                    quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
                    scanOrder.scan, scanOrder.iscan, log_scale));
 
       for (int j = 0; j < count; ++j) {
         err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
                      (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
-        EXPECT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
+        ASSERT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
             << "qcoeff error: i = " << i << " j = " << j << "\n";
         EXPECT_EQ(ref_dqcoeff_ptr[j], dqcoeff_ptr[j])
             << "dqcoeff error: i = " << i << " j = " << j << "\n";
@@ -120,18 +126,17 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
   void RunEobTest() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
-    DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
     DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
     DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
-    DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
     uint16_t eob;
     uint16_t ref_eob;
-    int skip_block = 0;
     int count = params_.coeffCount;
     const TX_SIZE txSize = getTxSize(count);
     int log_scale = (txSize == TX_32X32);
@@ -157,13 +162,20 @@ class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
         quant_ptr[j] = (1 << 16) / dequant_ptr[j];
         round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
       }
+      for (int j = 2; j < 8; ++j) {
+        zbin_ptr[j] = zbin_ptr[1];
+        quant_shift_ptr[j] = quant_shift_ptr[1];
+        dequant_ptr[j] = dequant_ptr[1];
+        quant_ptr[j] = quant_ptr[1];
+        round_ptr[j] = round_ptr[1];
+      }
 
-      quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
+      quanFuncRef(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
                   quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
                   &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
 
       ASM_REGISTER_STATE_CHECK(
-          quanFunc(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
+          quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
                    quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
                    scanOrder.scan, scanOrder.iscan, log_scale));
       EXPECT_EQ(ref_eob, eob) << "eob error: "
@@ -196,7 +208,7 @@ TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
 
 #if HAVE_SSE4_1
 const QuantizeFuncParams qfps[4] = {
-  QuantizeFuncParams(av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+  QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
                      16),
   QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
                      64),
@@ -208,4 +220,20 @@ const QuantizeFuncParams qfps[4] = {
 
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps));
 #endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const QuantizeFuncParams qfps_avx2[4] = {
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     16),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     64),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     256),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     1024),
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, AV1QuantizeTest, ::testing::ValuesIn(qfps_avx2));
+#endif  // HAVE_AVX2
+
 }  // namespace
diff --git a/third_party/aom/test/av1_round_shift_array_test.cc b/third_party/aom/test/av1_round_shift_array_test.cc
new file mode 100644
index 000000000..825d1348e
--- /dev/null
+++ b/third_party/aom/test/av1_round_shift_array_test.cc
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1CompRoundShift {
+
+typedef void (*comp_round_shift_array_func)(int32_t *arr, int size, int bit);
+
+const int kValidBitCheck[] = {
+  -4, -3, -2, -1, 0, 1, 2, 3, 4,
+};
+
+typedef ::testing::tuple<comp_round_shift_array_func, BLOCK_SIZE, int>
+    CompRoundShiftParam;
+
+class AV1CompRoundShiftTest
+    : public ::testing::TestWithParam<CompRoundShiftParam> {
+ public:
+  ~AV1CompRoundShiftTest();
+
+  void SetUp() { rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); }
+  void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunCheckOutput(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
+                      int bit);
+  void RunSpeedTest(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
+                    int bit);
+
+  libaom_test::ACMRandom rnd_;
+};
+
+AV1CompRoundShiftTest::~AV1CompRoundShiftTest() { ; }
+
+void AV1CompRoundShiftTest::RunCheckOutput(
+    comp_round_shift_array_func test_impl, BLOCK_SIZE bsize, int bit) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int blk_wd = 64;
+  DECLARE_ALIGNED(32, int32_t, pred_[blk_wd]);
+  DECLARE_ALIGNED(32, int32_t, ref_buffer_[blk_wd]);
+  for (int i = 0; i < (blk_wd); ++i) {
+    ref_buffer_[i] = pred_[i] = rnd_.Rand31() / 16;
+  }
+  av1_round_shift_array_c(ref_buffer_, w, bit);
+  test_impl(pred_, w, bit);
+  for (int x = 0; x < w; ++x) {
+    ASSERT_EQ(ref_buffer_[x], pred_[x]) << w << "x" << h << "mismatch @"
+                                        << "(" << x << ")";
+  }
+}
+
+void AV1CompRoundShiftTest::RunSpeedTest(comp_round_shift_array_func test_impl,
+                                         BLOCK_SIZE bsize, int bit) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int blk_wd = 64;
+  DECLARE_ALIGNED(32, int32_t, ref_buffer_[blk_wd]);
+  for (int i = 0; i < (blk_wd); ++i) {
+    ref_buffer_[i] = rnd_.Rand31();
+  }
+
+  const int num_loops = 1000000000 / (w + h);
+  comp_round_shift_array_func funcs[2] = { av1_round_shift_array_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    comp_round_shift_array_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(ref_buffer_, w, bit);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("av1_round_shift_array %3dx%-3d: bit : %d %7.2f/%7.2fns", w, h, bit,
+         elapsed_time[0], elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1CompRoundShiftTest, CheckOutput) {
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+TEST_P(AV1CompRoundShiftTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, AV1CompRoundShiftTest,
+    ::testing::Combine(::testing::Values(&av1_round_shift_array_sse4_1),
+                       ::testing::ValuesIn(txsize_to_bsize),
+                       ::testing::ValuesIn(kValidBitCheck)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, AV1CompRoundShiftTest,
+    ::testing::Combine(::testing::Values(&av1_round_shift_array_neon),
+                       ::testing::ValuesIn(txsize_to_bsize),
+                       ::testing::ValuesIn(kValidBitCheck)));
+#endif
+
+};  // namespace AV1CompRoundShift
diff --git a/third_party/aom/test/av1_txfm_test.cc b/third_party/aom/test/av1_txfm_test.cc
index 4545de100..d5b0ce325 100644
--- a/third_party/aom/test/av1_txfm_test.cc
+++ b/third_party/aom/test/av1_txfm_test.cc
@@ -34,7 +34,6 @@ void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1) {
       *type0 = TYPE_ADST;
       *type1 = TYPE_ADST;
       break;
-#if CONFIG_EXT_TX
     case FLIPADST_DCT:
       *type0 = TYPE_ADST;
       *type1 = TYPE_DCT;
@@ -55,7 +54,34 @@ void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1) {
       *type0 = TYPE_ADST;
       *type1 = TYPE_ADST;
       break;
-#endif  // CONFIG_EXT_TX
+    case IDTX:
+      *type0 = TYPE_IDTX;
+      *type1 = TYPE_IDTX;
+      break;
+    case H_DCT:
+      *type0 = TYPE_IDTX;
+      *type1 = TYPE_DCT;
+      break;
+    case V_DCT:
+      *type0 = TYPE_DCT;
+      *type1 = TYPE_IDTX;
+      break;
+    case H_ADST:
+      *type0 = TYPE_IDTX;
+      *type1 = TYPE_ADST;
+      break;
+    case V_ADST:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_IDTX;
+      break;
+    case H_FLIPADST:
+      *type0 = TYPE_IDTX;
+      *type1 = TYPE_ADST;
+      break;
+    case V_FLIPADST:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_IDTX;
+      break;
     default:
       *type0 = TYPE_DCT;
       *type1 = TYPE_DCT;
@@ -64,6 +90,7 @@ void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1) {
   }
 }
 
+double Sqrt2 = pow(2, 0.5);
 double invSqrt2 = 1 / pow(2, 0.5);
 
 double dct_matrix(double n, double k, int size) {
@@ -92,7 +119,63 @@ void reference_idct_1d(const double *in, double *out, int size) {
   }
 }
 
+// TODO(any): Copied from the old 'fadst4' (same as the new 'av1_fadst4_new'
+// function). Should be replaced by a proper reference function that takes
+// 'double' input & output.
+static void fadst4_new(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t x0, x1, x2, x3;
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_4_9 * x0;
+  s2 = sinpi_2_9 * x1;
+  s3 = sinpi_1_9 * x1;
+  s4 = sinpi_3_9 * x2;
+  s5 = sinpi_4_9 * x3;
+  s6 = sinpi_2_9 * x3;
+  s7 = x0 + x1 - x3;
+
+  x0 = s0 + s2 + s5;
+  x1 = sinpi_3_9 * s7;
+  x2 = s1 - s3 + s6;
+  x3 = s4;
+
+  s0 = x0 + x3;
+  s1 = x1;
+  s2 = x2 - x3;
+  s3 = x2 - x0 + x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = (tran_low_t)fdct_round_shift(s0);
+  output[1] = (tran_low_t)fdct_round_shift(s1);
+  output[2] = (tran_low_t)fdct_round_shift(s2);
+  output[3] = (tran_low_t)fdct_round_shift(s3);
+}
+
 void reference_adst_1d(const double *in, double *out, int size) {
+  if (size == 4) {  // Special case.
+    tran_low_t int_input[4];
+    for (int i = 0; i < 4; ++i) {
+      int_input[i] = static_cast<tran_low_t>(round(in[i]));
+    }
+    tran_low_t int_output[4];
+    fadst4_new(int_input, int_output);
+    for (int i = 0; i < 4; ++i) {
+      out[i] = int_output[i];
+    }
+    return;
+  }
+
   for (int k = 0; k < size; ++k) {
     out[k] = 0;
     for (int n = 0; n < size; ++n) {
@@ -101,96 +184,188 @@ void reference_adst_1d(const double *in, double *out, int size) {
   }
 }
 
+void reference_idtx_1d(const double *in, double *out, int size) {
+  double scale = 0;
+  if (size == 4)
+    scale = Sqrt2;
+  else if (size == 8)
+    scale = 2;
+  else if (size == 16)
+    scale = 2 * Sqrt2;
+  else if (size == 32)
+    scale = 4;
+  else if (size == 64)
+    scale = 4 * Sqrt2;
+  for (int k = 0; k < size; ++k) {
+    out[k] = in[k] * scale;
+  }
+}
+
 void reference_hybrid_1d(double *in, double *out, int size, int type) {
   if (type == TYPE_DCT)
     reference_dct_1d(in, out, size);
-  else
+  else if (type == TYPE_ADST)
     reference_adst_1d(in, out, size);
+  else
+    reference_idtx_1d(in, out, size);
 }
 
-void reference_hybrid_2d(double *in, double *out, int size, int type0,
-                         int type1) {
-  double *tempOut = new double[size * size];
+double get_amplification_factor(TX_TYPE tx_type, TX_SIZE tx_size) {
+  TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg;
+  av1_get_fwd_txfm_cfg(tx_type, tx_size, &fwd_txfm_flip_cfg);
+  const int tx_width = tx_size_wide[fwd_txfm_flip_cfg.tx_size];
+  const int tx_height = tx_size_high[fwd_txfm_flip_cfg.tx_size];
+  const int8_t *shift = fwd_txfm_flip_cfg.shift;
+  const int amplify_bit = shift[0] + shift[1] + shift[2];
+  double amplify_factor =
+      amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
+
+  // For rectangular transforms, we need to multiply by an extra factor.
+  const int rect_type = get_rect_tx_log_ratio(tx_width, tx_height);
+  if (abs(rect_type) == 1) {
+    amplify_factor *= pow(2, 0.5);
+  }
+  return amplify_factor;
+}
 
-  for (int r = 0; r < size; r++) {
-    // out ->tempOut
-    for (int c = 0; c < size; c++) {
-      tempOut[r * size + c] = in[c * size + r];
+void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type,
+                         TX_SIZE tx_size) {
+  // Get transform type and size of each dimension.
+  TYPE_TXFM type0;
+  TYPE_TXFM type1;
+  get_txfm1d_type(tx_type, &type0, &type1);
+  const int tx_width = tx_size_wide[tx_size];
+  const int tx_height = tx_size_high[tx_size];
+
+  double *const temp_in = new double[AOMMAX(tx_width, tx_height)];
+  double *const temp_out = new double[AOMMAX(tx_width, tx_height)];
+  double *const out_interm = new double[tx_width * tx_height];
+  const int stride = tx_width;
+
+  // Transform columns.
+  for (int c = 0; c < tx_width; ++c) {
+    for (int r = 0; r < tx_height; ++r) {
+      temp_in[r] = in[r * stride + c];
+    }
+    reference_hybrid_1d(temp_in, temp_out, tx_height, type0);
+    for (int r = 0; r < tx_height; ++r) {
+      out_interm[r * stride + c] = temp_out[r];
     }
   }
 
-  // dct each row: in -> out
-  for (int r = 0; r < size; r++) {
-    reference_hybrid_1d(tempOut + r * size, out + r * size, size, type0);
+  // Transform rows.
+  for (int r = 0; r < tx_height; ++r) {
+    reference_hybrid_1d(out_interm + r * stride, out + r * stride, tx_width,
+                        type1);
   }
 
-  for (int r = 0; r < size; r++) {
-    // out ->tempOut
-    for (int c = 0; c < size; c++) {
-      tempOut[r * size + c] = out[c * size + r];
+  delete[] temp_in;
+  delete[] temp_out;
+  delete[] out_interm;
+
+  // These transforms use an approximate 2D DCT transform, by only keeping the
+  // top-left quarter of the coefficients, and repacking them in the first
+  // quarter indices.
+  // TODO(urvang): Refactor this code.
+  if (tx_width == 64 && tx_height == 64) {  // tx_size == TX_64X64
+    // Zero out top-right 32x32 area.
+    for (int row = 0; row < 32; ++row) {
+      memset(out + row * 64 + 32, 0, 32 * sizeof(*out));
+    }
+    // Zero out the bottom 64x32 area.
+    memset(out + 32 * 64, 0, 32 * 64 * sizeof(*out));
+    // Re-pack non-zero coeffs in the first 32x32 indices.
+    for (int row = 1; row < 32; ++row) {
+      memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
+    }
+  } else if (tx_width == 32 && tx_height == 64) {  // tx_size == TX_32X64
+    // Zero out the bottom 32x32 area.
+    memset(out + 32 * 32, 0, 32 * 32 * sizeof(*out));
+    // Note: no repacking needed here.
+  } else if (tx_width == 64 && tx_height == 32) {  // tx_size == TX_64X32
+    // Zero out right 32x32 area.
+    for (int row = 0; row < 32; ++row) {
+      memset(out + row * 64 + 32, 0, 32 * sizeof(*out));
+    }
+    // Re-pack non-zero coeffs in the first 32x32 indices.
+    for (int row = 1; row < 32; ++row) {
+      memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
+    }
+  } else if (tx_width == 16 && tx_height == 64) {  // tx_size == TX_16X64
+    // Zero out the bottom 16x32 area.
+    memset(out + 16 * 32, 0, 16 * 32 * sizeof(*out));
+    // Note: no repacking needed here.
+  } else if (tx_width == 64 && tx_height == 16) {  // tx_size == TX_64X16
+    // Zero out right 32x16 area.
+    for (int row = 0; row < 16; ++row) {
+      memset(out + row * 64 + 32, 0, 32 * sizeof(*out));
+    }
+    // Re-pack non-zero coeffs in the first 32x16 indices.
+    for (int row = 1; row < 16; ++row) {
+      memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
     }
   }
 
-  for (int r = 0; r < size; r++) {
-    reference_hybrid_1d(tempOut + r * size, out + r * size, size, type1);
+  // Apply appropriate scale.
+  const double amplify_factor = get_amplification_factor(tx_type, tx_size);
+  for (int c = 0; c < tx_width; ++c) {
+    for (int r = 0; r < tx_height; ++r) {
+      out[r * stride + c] *= amplify_factor;
+    }
   }
-  delete[] tempOut;
 }
 
 template <typename Type>
-void fliplr(Type *dest, int stride, int length) {
-  int i, j;
-  for (i = 0; i < length; ++i) {
-    for (j = 0; j < length / 2; ++j) {
-      const Type tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[i * stride + length - 1 - j];
-      dest[i * stride + length - 1 - j] = tmp;
+void fliplr(Type *dest, int width, int height, int stride) {
+  for (int r = 0; r < height; ++r) {
+    for (int c = 0; c < width / 2; ++c) {
+      const Type tmp = dest[r * stride + c];
+      dest[r * stride + c] = dest[r * stride + width - 1 - c];
+      dest[r * stride + width - 1 - c] = tmp;
     }
   }
 }
 
 template <typename Type>
-void flipud(Type *dest, int stride, int length) {
-  int i, j;
-  for (j = 0; j < length; ++j) {
-    for (i = 0; i < length / 2; ++i) {
-      const Type tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(length - 1 - i) * stride + j];
-      dest[(length - 1 - i) * stride + j] = tmp;
+void flipud(Type *dest, int width, int height, int stride) {
+  for (int c = 0; c < width; ++c) {
+    for (int r = 0; r < height / 2; ++r) {
+      const Type tmp = dest[r * stride + c];
+      dest[r * stride + c] = dest[(height - 1 - r) * stride + c];
+      dest[(height - 1 - r) * stride + c] = tmp;
     }
   }
 }
 
 template <typename Type>
-void fliplrud(Type *dest, int stride, int length) {
-  int i, j;
-  for (i = 0; i < length / 2; ++i) {
-    for (j = 0; j < length; ++j) {
-      const Type tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(length - 1 - i) * stride + length - 1 - j];
-      dest[(length - 1 - i) * stride + length - 1 - j] = tmp;
+void fliplrud(Type *dest, int width, int height, int stride) {
+  for (int r = 0; r < height / 2; ++r) {
+    for (int c = 0; c < width; ++c) {
+      const Type tmp = dest[r * stride + c];
+      dest[r * stride + c] = dest[(height - 1 - r) * stride + width - 1 - c];
+      dest[(height - 1 - r) * stride + width - 1 - c] = tmp;
     }
   }
 }
 
-template void fliplr<double>(double *dest, int stride, int length);
-template void flipud<double>(double *dest, int stride, int length);
-template void fliplrud<double>(double *dest, int stride, int length);
+template void fliplr<double>(double *dest, int width, int height, int stride);
+template void flipud<double>(double *dest, int width, int height, int stride);
+template void fliplrud<double>(double *dest, int width, int height, int stride);
 
 int bd_arr[BD_NUM] = { 8, 10, 12 };
-int8_t low_range_arr[BD_NUM] = { 16, 32, 32 };
+
+int8_t low_range_arr[BD_NUM] = { 18, 32, 32 };
 int8_t high_range_arr[BD_NUM] = { 32, 32, 32 };
 
 void txfm_stage_range_check(const int8_t *stage_range, int stage_num,
-                            const int8_t *cos_bit, int low_range,
-                            int high_range) {
+                            int8_t cos_bit, int low_range, int high_range) {
   for (int i = 0; i < stage_num; ++i) {
     EXPECT_LE(stage_range[i], low_range);
+    ASSERT_LE(stage_range[i] + cos_bit, high_range) << "stage = " << i;
   }
   for (int i = 0; i < stage_num - 1; ++i) {
     // make sure there is no overflow while doing half_btf()
-    EXPECT_LE(stage_range[i] + cos_bit[i], high_range);
-    EXPECT_LE(stage_range[i + 1] + cos_bit[i], high_range);
+    ASSERT_LE(stage_range[i + 1] + cos_bit, high_range) << "stage = " << i;
   }
 }
 }  // namespace libaom_test
diff --git a/third_party/aom/test/av1_txfm_test.h b/third_party/aom/test/av1_txfm_test.h
index 3e64e36ad..70d1a894f 100644
--- a/third_party/aom/test/av1_txfm_test.h
+++ b/third_party/aom/test/av1_txfm_test.h
@@ -19,17 +19,20 @@
 #endif
 #include <math.h>
 
+#include "config/av1_rtcd.h"
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "test/acm_random.h"
-#include "av1/common/enums.h"
 #include "av1/common/av1_txfm.h"
-#include "./av1_rtcd.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
 
 namespace libaom_test {
 typedef enum {
   TYPE_DCT = 0,
   TYPE_ADST,
+  TYPE_IDTX,
   TYPE_IDCT,
   TYPE_IADST,
   TYPE_LAST
@@ -46,8 +49,10 @@ void reference_adst_1d(const double *in, double *out, int size);
 
 void reference_hybrid_1d(double *in, double *out, int size, int type);
 
-void reference_hybrid_2d(double *in, double *out, int size, int type0,
-                         int type1);
+double get_amplification_factor(TX_TYPE tx_type, TX_SIZE tx_size);
+
+void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type,
+                         TX_SIZE tx_size);
 template <typename Type1, typename Type2>
 static double compute_avg_abs_error(const Type1 *a, const Type2 *b,
                                     const int size) {
@@ -60,81 +65,62 @@ static double compute_avg_abs_error(const Type1 *a, const Type2 *b,
 }
 
 template <typename Type>
-void fliplr(Type *dest, int stride, int length);
+void fliplr(Type *dest, int width, int height, int stride);
 
 template <typename Type>
-void flipud(Type *dest, int stride, int length);
+void flipud(Type *dest, int width, int height, int stride);
 
 template <typename Type>
-void fliplrud(Type *dest, int stride, int length);
+void fliplrud(Type *dest, int width, int height, int stride);
 
-typedef void (*TxfmFunc)(const int32_t *in, int32_t *out, const int8_t *cos_bit,
+typedef void (*TxfmFunc)(const int32_t *in, int32_t *out, const int8_t cos_bit,
                          const int8_t *range_bit);
 
-typedef void (*Fwd_Txfm2d_Func)(const int16_t *, int32_t *, int, TX_TYPE, int);
-typedef void (*Inv_Txfm2d_Func)(const int32_t *, uint16_t *, int, TX_TYPE, int);
+typedef void (*InvTxfm2dFunc)(const int32_t *, uint16_t *, int, TX_TYPE, int);
+typedef void (*LbdInvTxfm2dFunc)(const int32_t *, uint8_t *, int, TX_TYPE,
+                                 TX_SIZE, int);
 
 static const int bd = 10;
 static const int input_base = (1 << bd);
 
-#if CONFIG_HIGHBITDEPTH
+static INLINE bool IsTxSizeTypeValid(TX_SIZE tx_size, TX_TYPE tx_type) {
+  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
+  TxSetType tx_set_type;
+  if (tx_size_sqr_up > TX_32X32) {
+    tx_set_type = EXT_TX_SET_DCTONLY;
+  } else if (tx_size_sqr_up == TX_32X32) {
+    tx_set_type = EXT_TX_SET_DCT_IDTX;
+  } else {
+    tx_set_type = EXT_TX_SET_ALL16;
+  }
+  return av1_ext_tx_used[tx_set_type][tx_type] != 0;
+}
+
 #if CONFIG_AV1_ENCODER
 
-static const Fwd_Txfm2d_Func fwd_txfm_func_ls[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  NULL,
-#endif
-  av1_fwd_txfm2d_4x4_c,
-  av1_fwd_txfm2d_8x8_c,
-  av1_fwd_txfm2d_16x16_c,
-  av1_fwd_txfm2d_32x32_c,
-#if CONFIG_TX64X64
-  av1_fwd_txfm2d_64x64_c,
-#endif  // CONFIG_TX64X64
-  av1_fwd_txfm2d_4x8_c,
-  av1_fwd_txfm2d_8x4_c,
-  av1_fwd_txfm2d_8x16_c,
-  av1_fwd_txfm2d_16x8_c,
-  av1_fwd_txfm2d_16x32_c,
-  av1_fwd_txfm2d_32x16_c,
-#if CONFIG_TX64X64
-  av1_fwd_txfm2d_32x64_c,
-  av1_fwd_txfm2d_64x32_c,
-#endif  // CONFIG_TX64X64
-  NULL,
-  NULL,
-  NULL,
-  NULL,
+static const FwdTxfm2dFunc fwd_txfm_func_ls[TX_SIZES_ALL] = {
+  av1_fwd_txfm2d_4x4_c,   av1_fwd_txfm2d_8x8_c,   av1_fwd_txfm2d_16x16_c,
+  av1_fwd_txfm2d_32x32_c, av1_fwd_txfm2d_64x64_c, av1_fwd_txfm2d_4x8_c,
+  av1_fwd_txfm2d_8x4_c,   av1_fwd_txfm2d_8x16_c,  av1_fwd_txfm2d_16x8_c,
+  av1_fwd_txfm2d_16x32_c, av1_fwd_txfm2d_32x16_c, av1_fwd_txfm2d_32x64_c,
+  av1_fwd_txfm2d_64x32_c, av1_fwd_txfm2d_4x16_c,  av1_fwd_txfm2d_16x4_c,
+  av1_fwd_txfm2d_8x32_c,  av1_fwd_txfm2d_32x8_c,  av1_fwd_txfm2d_16x64_c,
+  av1_fwd_txfm2d_64x16_c,
 };
 #endif
 
-static const Inv_Txfm2d_Func inv_txfm_func_ls[TX_SIZES_ALL] = {
-#if CONFIG_CHROMA_2X2
-  NULL,
-#endif
-  av1_inv_txfm2d_add_4x4_c,
-  av1_inv_txfm2d_add_8x8_c,
-  av1_inv_txfm2d_add_16x16_c,
-  av1_inv_txfm2d_add_32x32_c,
-#if CONFIG_TX64X64
-  av1_inv_txfm2d_add_64x64_c,
-#endif  // CONFIG_TX64X64
-  av1_inv_txfm2d_add_4x8_c,
-  av1_inv_txfm2d_add_8x4_c,
-  av1_inv_txfm2d_add_8x16_c,
-  av1_inv_txfm2d_add_16x8_c,
-  av1_inv_txfm2d_add_16x32_c,
-  av1_inv_txfm2d_add_32x16_c,
-#if CONFIG_TX64X64
-  av1_inv_txfm2d_add_32x64_c,
-  av1_inv_txfm2d_add_64x32_c,
-#endif  // CONFIG_TX64X64
-  NULL,
-  NULL,
-  NULL,
-  NULL,
+static const InvTxfm2dFunc inv_txfm_func_ls[TX_SIZES_ALL] = {
+  av1_inv_txfm2d_add_4x4_c,   av1_inv_txfm2d_add_8x8_c,
+  av1_inv_txfm2d_add_16x16_c, av1_inv_txfm2d_add_32x32_c,
+  av1_inv_txfm2d_add_64x64_c, av1_inv_txfm2d_add_4x8_c,
+  av1_inv_txfm2d_add_8x4_c,   av1_inv_txfm2d_add_8x16_c,
+  av1_inv_txfm2d_add_16x8_c,  av1_inv_txfm2d_add_16x32_c,
+  av1_inv_txfm2d_add_32x16_c, av1_inv_txfm2d_add_32x64_c,
+  av1_inv_txfm2d_add_64x32_c, av1_inv_txfm2d_add_4x16_c,
+  av1_inv_txfm2d_add_16x4_c,  av1_inv_txfm2d_add_8x32_c,
+  av1_inv_txfm2d_add_32x8_c,  av1_inv_txfm2d_add_16x64_c,
+  av1_inv_txfm2d_add_64x16_c,
 };
-#endif  // CONFIG_HIGHBITDEPTH
 
 #define BD_NUM 3
 
@@ -143,7 +129,7 @@ extern int8_t low_range_arr[];
 extern int8_t high_range_arr[];
 
 void txfm_stage_range_check(const int8_t *stage_range, int stage_num,
-                            const int8_t *cos_bit, int low_range,
+                            const int8_t cos_bit, int low_range,
                             int high_range);
 }  // namespace libaom_test
 #endif  // AV1_TXFM_TEST_H_
diff --git a/third_party/aom/test/av1_wedge_utils_test.cc b/third_party/aom/test/av1_wedge_utils_test.cc
index d4b560fc1..cfdf2d36c 100644
--- a/third_party/aom/test/av1_wedge_utils_test.cc
+++ b/third_party/aom/test/av1_wedge_utils_test.cc
@@ -11,10 +11,9 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 
@@ -100,7 +99,7 @@ TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingEquiv) {
       p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
     }
 
-    aom_blend_a64_mask(p, w, p0, w, p1, w, m, w, h, w, 0, 0);
+    aom_blend_a64_mask(p, w, p0, w, p1, w, m, w, w, h, 0, 0);
 
     aom_subtract_block(h, w, r0, w, s, w, p0, w);
     aom_subtract_block(h, w, r1, w, s, w, p1, w);
diff --git a/third_party/aom/test/avg_test.cc b/third_party/aom/test/avg_test.cc
deleted file mode 100644
index e83a75c1c..000000000
--- a/third_party/aom/test/avg_test.cc
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <limits.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "aom_mem/aom_mem.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-class AverageTestBase : public ::testing::Test {
- public:
-  AverageTestBase(int width, int height) : width_(width), height_(height) {}
-
-  static void SetUpTestCase() {
-    source_data_ = reinterpret_cast<uint8_t *>(
-        aom_memalign(kDataAlignment, kDataBlockSize));
-  }
-
-  static void TearDownTestCase() {
-    aom_free(source_data_);
-    source_data_ = NULL;
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  // Handle blocks up to 4 blocks 64x64 with stride up to 128
-  static const int kDataAlignment = 16;
-  static const int kDataBlockSize = 64 * 128;
-
-  virtual void SetUp() {
-    source_stride_ = (width_ + 31) & ~31;
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-  }
-
-  void FillConstant(uint8_t fill_constant) {
-    for (int i = 0; i < width_ * height_; ++i) {
-      source_data_[i] = fill_constant;
-    }
-  }
-
-  void FillRandom() {
-    for (int i = 0; i < width_ * height_; ++i) {
-      source_data_[i] = rnd_.Rand8();
-    }
-  }
-
-  int width_, height_;
-  static uint8_t *source_data_;
-  int source_stride_;
-
-  ACMRandom rnd_;
-};
-
-typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
-                              const int ref_stride, const int height);
-
-typedef std::tr1::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
-
-class IntProRowTest : public AverageTestBase,
-                      public ::testing::WithParamInterface<IntProRowParam> {
- public:
-  IntProRowTest()
-      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(NULL), hbuf_c_(NULL) {
-    asm_func_ = GET_PARAM(1);
-    c_func_ = GET_PARAM(2);
-  }
-
- protected:
-  virtual void SetUp() {
-    hbuf_asm_ = reinterpret_cast<int16_t *>(
-        aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
-    hbuf_c_ = reinterpret_cast<int16_t *>(
-        aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
-  }
-
-  virtual void TearDown() {
-    aom_free(hbuf_c_);
-    hbuf_c_ = NULL;
-    aom_free(hbuf_asm_);
-    hbuf_asm_ = NULL;
-  }
-
-  void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
-    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
-        << "Output mismatch";
-  }
-
- private:
-  IntProRowFunc asm_func_;
-  IntProRowFunc c_func_;
-  int16_t *hbuf_asm_;
-  int16_t *hbuf_c_;
-};
-
-typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
-
-typedef std::tr1::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
-
-class IntProColTest : public AverageTestBase,
-                      public ::testing::WithParamInterface<IntProColParam> {
- public:
-  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
-    asm_func_ = GET_PARAM(1);
-    c_func_ = GET_PARAM(2);
-  }
-
- protected:
-  void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
-    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
-    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
-  }
-
- private:
-  IntProColFunc asm_func_;
-  IntProColFunc c_func_;
-  int16_t sum_asm_;
-  int16_t sum_c_;
-};
-
-typedef int (*SatdFunc)(const int16_t *coeffs, int length);
-typedef std::tr1::tuple<int, SatdFunc> SatdTestParam;
-
-class SatdTest : public ::testing::Test,
-                 public ::testing::WithParamInterface<SatdTestParam> {
- protected:
-  virtual void SetUp() {
-    satd_size_ = GET_PARAM(0);
-    satd_func_ = GET_PARAM(1);
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-    src_ = reinterpret_cast<int16_t *>(
-        aom_memalign(16, sizeof(*src_) * satd_size_));
-    ASSERT_TRUE(src_ != NULL);
-  }
-
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
-
-  void FillConstant(const int16_t val) {
-    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
-  }
-
-  void FillRandom() {
-    for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16();
-  }
-
-  void Check(int expected) {
-    int total;
-    ASM_REGISTER_STATE_CHECK(total = satd_func_(src_, satd_size_));
-    EXPECT_EQ(expected, total);
-  }
-
-  int satd_size_;
-
- private:
-  int16_t *src_;
-  SatdFunc satd_func_;
-  ACMRandom rnd_;
-};
-
-uint8_t *AverageTestBase::source_data_ = NULL;
-
-TEST_P(IntProRowTest, MinValue) {
-  FillConstant(0);
-  RunComparison();
-}
-
-TEST_P(IntProRowTest, MaxValue) {
-  FillConstant(255);
-  RunComparison();
-}
-
-TEST_P(IntProRowTest, Random) {
-  FillRandom();
-  RunComparison();
-}
-
-TEST_P(IntProColTest, MinValue) {
-  FillConstant(0);
-  RunComparison();
-}
-
-TEST_P(IntProColTest, MaxValue) {
-  FillConstant(255);
-  RunComparison();
-}
-
-TEST_P(IntProColTest, Random) {
-  FillRandom();
-  RunComparison();
-}
-
-TEST_P(SatdTest, MinValue) {
-  const int kMin = -32640;
-  const int expected = -kMin * satd_size_;
-  FillConstant(kMin);
-  Check(expected);
-}
-
-TEST_P(SatdTest, MaxValue) {
-  const int kMax = 32640;
-  const int expected = kMax * satd_size_;
-  FillConstant(kMax);
-  Check(expected);
-}
-
-TEST_P(SatdTest, Random) {
-  int expected;
-  switch (satd_size_) {
-    case 16: expected = 205298; break;
-    case 64: expected = 1113950; break;
-    case 256: expected = 4268415; break;
-    case 1024: expected = 16954082; break;
-    default:
-      FAIL() << "Invalid satd size (" << satd_size_
-             << ") valid: 16/64/256/1024";
-  }
-  FillRandom();
-  Check(expected);
-}
-
-using std::tr1::make_tuple;
-
-INSTANTIATE_TEST_CASE_P(C, SatdTest,
-                        ::testing::Values(make_tuple(16, &aom_satd_c),
-                                          make_tuple(64, &aom_satd_c),
-                                          make_tuple(256, &aom_satd_c),
-                                          make_tuple(1024, &aom_satd_c)));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, IntProRowTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
-                      make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
-                      make_tuple(64, &aom_int_pro_row_sse2,
-                                 &aom_int_pro_row_c)));
-
-INSTANTIATE_TEST_CASE_P(
-    SSE2, IntProColTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
-                      make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
-                      make_tuple(64, &aom_int_pro_col_sse2,
-                                 &aom_int_pro_col_c)));
-
-INSTANTIATE_TEST_CASE_P(SSE2, SatdTest,
-                        ::testing::Values(make_tuple(16, &aom_satd_sse2),
-                                          make_tuple(64, &aom_satd_sse2),
-                                          make_tuple(256, &aom_satd_sse2),
-                                          make_tuple(1024, &aom_satd_sse2)));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
-    NEON, IntProRowTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
-                      make_tuple(32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
-                      make_tuple(64, &aom_int_pro_row_neon,
-                                 &aom_int_pro_row_c)));
-
-INSTANTIATE_TEST_CASE_P(
-    NEON, IntProColTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
-                      make_tuple(32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
-                      make_tuple(64, &aom_int_pro_col_neon,
-                                 &aom_int_pro_col_c)));
-
-INSTANTIATE_TEST_CASE_P(NEON, SatdTest,
-                        ::testing::Values(make_tuple(16, &aom_satd_neon),
-                                          make_tuple(64, &aom_satd_neon),
-                                          make_tuple(256, &aom_satd_neon),
-                                          make_tuple(1024, &aom_satd_neon)));
-#endif
-
-}  // namespace
diff --git a/third_party/aom/test/best_encode.sh b/third_party/aom/test/best_encode.sh
new file mode 100755
index 000000000..fe31a01cb
--- /dev/null
+++ b/third_party/aom/test/best_encode.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+# Author: jimbankoski@google.com (Jim Bankoski)
+
+if [[ $# -ne 2 ]]; then
+  echo "Encodes a file using best known settings (slow!)"
+  echo "  Usage:    be [FILE] [BITRATE]"
+  echo "  Example:  be akiyo_cif.y4m 200"
+  exit
+fi
+
+f=$1  # file is first parameter
+b=$2  # bitrate is second parameter
+
+if [[ -e $f.fpf ]]; then
+  # First-pass file found, do second pass only
+  aomenc \
+    $f \
+    -o $f-$b.av1.webm \
+    -p 2 \
+    --pass=2 \
+    --fpf=$f.fpf \
+    --best \
+    --cpu-used=0 \
+    --target-bitrate=$b \
+    --auto-alt-ref=1 \
+    -v \
+    --minsection-pct=0 \
+    --maxsection-pct=800 \
+    --lag-in-frames=25 \
+    --kf-min-dist=0 \
+    --kf-max-dist=99999 \
+    --static-thresh=0 \
+    --min-q=0 \
+    --max-q=63 \
+    --drop-frame=0 \
+    --bias-pct=50 \
+    --minsection-pct=0 \
+    --maxsection-pct=800 \
+    --psnr \
+    --arnr-maxframes=7 \
+    --arnr-strength=3 \
+    --arnr-type=3
+else
+  # No first-pass file found, do 2-pass encode
+  aomenc \
+    $f \
+    -o $f-$b.av1.webm \
+    -p 2 \
+    --pass=1 \
+    --fpf=$f.fpf \
+    --best \
+    --cpu-used=0 \
+    --target-bitrate=$b \
+    --auto-alt-ref=1 \
+    -v \
+    --minsection-pct=0 \
+    --maxsection-pct=800 \
+    --lag-in-frames=25 \
+    --kf-min-dist=0 \
+    --kf-max-dist=99999 \
+    --static-thresh=0 \
+    --min-q=0 \
+    --max-q=63 \
+    --drop-frame=0
+
+  aomenc \
+    $f \
+    -o $f-$b.av1.webm \
+    -p 2 \
+    --pass=2 \
+    --fpf=$f.fpf \
+    --best \
+    --cpu-used=0 \
+    --target-bitrate=$b \
+    --auto-alt-ref=1 \
+    -v \
+    --minsection-pct=0 \
+    --maxsection-pct=800 \
+    --lag-in-frames=25 \
+    --kf-min-dist=0 \
+    --kf-max-dist=99999 \
+    --static-thresh=0 \
+    --min-q=0 \
+    --max-q=63 \
+    --drop-frame=0 \
+    --bias-pct=50 \
+    --minsection-pct=0 \
+    --maxsection-pct=800 \
+    --psnr \
+    --arnr-maxframes=7 \
+    --arnr-strength=3 \
+    --arnr-type=3
+fi
diff --git a/third_party/aom/test/binary_codes_test.cc b/third_party/aom/test/binary_codes_test.cc
index 41efec781..45660cf85 100644
--- a/third_party/aom/test/binary_codes_test.cc
+++ b/third_party/aom/test/binary_codes_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <math.h>
 #include <stdlib.h>
@@ -15,7 +15,8 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "test/acm_random.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/bitreader.h"
@@ -29,57 +30,6 @@ using libaom_test::ACMRandom;
 
 namespace {
 
-// Test for Bilevel code with reference
-TEST(AV1, TestPrimitiveRefbilivel) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int kBufferSize = 65536;
-  aom_writer bw;
-  uint8_t bw_buffer[kBufferSize];
-  const uint16_t kRanges = 8;
-  const uint16_t kNearRanges = 8;
-  const uint16_t kReferences = 8;
-  const uint16_t kValues = 16;
-  const uint16_t range_vals[kRanges] = { 1, 13, 64, 120, 230, 420, 1100, 8000 };
-  uint16_t enc_values[kRanges][kNearRanges][kReferences][kValues][4];
-  aom_start_encode(&bw, bw_buffer);
-  for (int n = 0; n < kRanges; ++n) {
-    const uint16_t range = range_vals[n];
-    for (int p = 0; p < kNearRanges; ++p) {
-      const uint16_t near_range = 1 + rnd(range);
-      for (int r = 0; r < kReferences; ++r) {
-        const uint16_t ref = rnd(range);
-        for (int v = 0; v < kValues; ++v) {
-          const uint16_t value = rnd(range);
-          enc_values[n][p][r][v][0] = range;
-          enc_values[n][p][r][v][1] = near_range;
-          enc_values[n][p][r][v][2] = ref;
-          enc_values[n][p][r][v][3] = value;
-          aom_write_primitive_refbilevel(&bw, range, near_range, ref, value);
-        }
-      }
-    }
-  }
-  aom_stop_encode(&bw);
-  aom_reader br;
-  aom_reader_init(&br, bw_buffer, bw.pos, NULL, NULL);
-  GTEST_ASSERT_GE(aom_reader_tell(&br), 0u);
-  GTEST_ASSERT_LE(aom_reader_tell(&br), 1u);
-  for (int n = 0; n < kRanges; ++n) {
-    for (int p = 0; p < kNearRanges; ++p) {
-      for (int r = 0; r < kReferences; ++r) {
-        for (int v = 0; v < kValues; ++v) {
-          const uint16_t range = enc_values[n][p][r][v][0];
-          const uint16_t near_range = enc_values[n][p][r][v][1];
-          const uint16_t ref = enc_values[n][p][r][v][2];
-          const uint16_t value = aom_read_primitive_refbilevel(
-              &br, range, near_range, ref, ACCT_STR);
-          GTEST_ASSERT_EQ(value, enc_values[n][p][r][v][3]);
-        }
-      }
-    }
-  }
-}
-
 // Test for Finite subexponential code with reference
 TEST(AV1, TestPrimitiveRefsubexpfin) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -111,7 +61,7 @@ TEST(AV1, TestPrimitiveRefsubexpfin) {
   }
   aom_stop_encode(&bw);
   aom_reader br;
-  aom_reader_init(&br, bw_buffer, bw.pos, NULL, NULL);
+  aom_reader_init(&br, bw_buffer, bw.pos);
   GTEST_ASSERT_GE(aom_reader_tell(&br), 0u);
   GTEST_ASSERT_LE(aom_reader_tell(&br), 1u);
   for (int n = 0; n < kRanges; ++n) {
diff --git a/third_party/aom/test/blend_a64_mask_1d_test.cc b/third_party/aom/test/blend_a64_mask_1d_test.cc
index 66e741a74..f8844eef8 100644
--- a/third_party/aom/test/blend_a64_mask_1d_test.cc
+++ b/third_party/aom/test/blend_a64_mask_1d_test.cc
@@ -17,11 +17,11 @@
 #include "test/register_state_check.h"
 #include "test/function_equivalence_test.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
-#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
 
 #include "av1/common/enums.h"
 
@@ -46,8 +46,8 @@ class BlendA64Mask1DTest : public FunctionEquivalenceTest<F> {
   virtual void Execute(const T *p_src0, const T *p_src1) = 0;
 
   void Common() {
-    w_ = 1 << this->rng_(MAX_SB_SIZE_LOG2 + 1);
-    h_ = 1 << this->rng_(MAX_SB_SIZE_LOG2 + 1);
+    w_ = 2 << this->rng_(MAX_SB_SIZE_LOG2);
+    h_ = 2 << this->rng_(MAX_SB_SIZE_LOG2);
 
     dst_offset_ = this->rng_(33);
     dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
@@ -116,7 +116,7 @@ class BlendA64Mask1DTest : public FunctionEquivalenceTest<F> {
 
 typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
                     uint32_t src0_stride, const uint8_t *src1,
-                    uint32_t src1_stride, const uint8_t *mask, int h, int w);
+                    uint32_t src1_stride, const uint8_t *mask, int w, int h);
 typedef libaom_test::FuncParam<F8B> TestFuncs;
 
 class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
@@ -124,10 +124,10 @@ class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
   void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
     params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
                      src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
-                     h_, w_);
+                     w_, h_);
     ASM_REGISTER_STATE_CHECK(params_.tst_func(
         dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
-        src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, h_, w_));
+        src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, w_, h_));
   }
 };
 
@@ -167,7 +167,7 @@ TEST_P(BlendA64Mask1DTest8B, ExtremeValues) {
 static void blend_a64_hmask_ref(uint8_t *dst, uint32_t dst_stride,
                                 const uint8_t *src0, uint32_t src0_stride,
                                 const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int h, int w) {
+                                const uint8_t *mask, int w, int h) {
   uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
                 [BlendA64Mask1DTest8B::kMaxMaskSize];
 
@@ -175,14 +175,14 @@ static void blend_a64_hmask_ref(uint8_t *dst, uint32_t dst_stride,
     for (int col = 0; col < w; ++col) mask2d[row][col] = mask[col];
 
   aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, h, w,
+                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h,
                        0, 0);
 }
 
 static void blend_a64_vmask_ref(uint8_t *dst, uint32_t dst_stride,
                                 const uint8_t *src0, uint32_t src0_stride,
                                 const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int h, int w) {
+                                const uint8_t *mask, int w, int h) {
   uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
                 [BlendA64Mask1DTest8B::kMaxMaskSize];
 
@@ -190,7 +190,7 @@ static void blend_a64_vmask_ref(uint8_t *dst, uint32_t dst_stride,
     for (int col = 0; col < w; ++col) mask2d[row][col] = mask[row];
 
   aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, h, w,
+                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h,
                        0, 0);
 }
 
@@ -207,14 +207,21 @@ INSTANTIATE_TEST_CASE_P(
         TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_sse4_1)));
 #endif  // HAVE_SSE4_1
 
-#if CONFIG_HIGHBITDEPTH
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, BlendA64Mask1DTest8B,
+                        ::testing::Values(TestFuncs(blend_a64_hmask_ref,
+                                                    aom_blend_a64_hmask_neon),
+                                          TestFuncs(blend_a64_vmask_ref,
+                                                    aom_blend_a64_vmask_neon)));
+#endif  // HAVE_NEON
+
 //////////////////////////////////////////////////////////////////////////////
 // High bit-depth version
 //////////////////////////////////////////////////////////////////////////////
 
 typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
                      uint32_t src0_stride, const uint8_t *src1,
-                     uint32_t src1_stride, const uint8_t *mask, int h, int w,
+                     uint32_t src1_stride, const uint8_t *mask, int w, int h,
                      int bd);
 typedef libaom_test::FuncParam<FHBD> TestFuncsHBD;
 
@@ -224,11 +231,11 @@ class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
     params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
                      CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
                      CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
-                     mask_, h_, w_, bit_depth_);
+                     mask_, w_, h_, bit_depth_);
     ASM_REGISTER_STATE_CHECK(params_.tst_func(
         CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
         CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
-        CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, mask_, h_, w_,
+        CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, mask_, w_, h_,
         bit_depth_));
   }
 
@@ -287,7 +294,7 @@ TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) {
 static void highbd_blend_a64_hmask_ref(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, int h, int w, int bd) {
+    const uint8_t *mask, int w, int h, int bd) {
   uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
                 [BlendA64Mask1DTestHBD::kMaxMaskSize];
 
@@ -296,13 +303,13 @@ static void highbd_blend_a64_hmask_ref(
 
   aom_highbd_blend_a64_mask_c(
       dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask2d[0][0],
-      BlendA64Mask1DTestHBD::kMaxMaskSize, h, w, 0, 0, bd);
+      BlendA64Mask1DTestHBD::kMaxMaskSize, w, h, 0, 0, bd);
 }
 
 static void highbd_blend_a64_vmask_ref(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, int h, int w, int bd) {
+    const uint8_t *mask, int w, int h, int bd) {
   uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
                 [BlendA64Mask1DTestHBD::kMaxMaskSize];
 
@@ -311,7 +318,7 @@ static void highbd_blend_a64_vmask_ref(
 
   aom_highbd_blend_a64_mask_c(
       dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask2d[0][0],
-      BlendA64Mask1DTestHBD::kMaxMaskSize, h, w, 0, 0, bd);
+      BlendA64Mask1DTestHBD::kMaxMaskSize, w, h, 0, 0, bd);
 }
 
 INSTANTIATE_TEST_CASE_P(
@@ -329,6 +336,4 @@ INSTANTIATE_TEST_CASE_P(
                       TestFuncsHBD(highbd_blend_a64_vmask_ref,
                                    aom_highbd_blend_a64_vmask_sse4_1)));
 #endif  // HAVE_SSE4_1
-
-#endif  // CONFIG_HIGHBITDEPTH
 }  // namespace
diff --git a/third_party/aom/test/blend_a64_mask_test.cc b/third_party/aom/test/blend_a64_mask_test.cc
index fef124d34..c9c6795ee 100644
--- a/third_party/aom/test/blend_a64_mask_test.cc
+++ b/third_party/aom/test/blend_a64_mask_test.cc
@@ -17,11 +17,11 @@
 #include "test/register_state_check.h"
 #include "test/function_equivalence_test.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
-#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
 
 #include "av1/common/enums.h"
 
@@ -31,8 +31,8 @@ using libaom_test::FunctionEquivalenceTest;
 
 namespace {
 
-template <typename F, typename T>
-class BlendA64MaskTest : public FunctionEquivalenceTest<F> {
+template <typename BlendA64Func, typename SrcPixel, typename DstPixel>
+class BlendA64MaskTest : public FunctionEquivalenceTest<BlendA64Func> {
  protected:
   static const int kIterations = 10000;
   static const int kMaxWidth = MAX_SB_SIZE * 5;  // * 5 to cover longer strides
@@ -43,14 +43,44 @@ class BlendA64MaskTest : public FunctionEquivalenceTest<F> {
 
   virtual ~BlendA64MaskTest() {}
 
-  virtual void Execute(const T *p_src0, const T *p_src1) = 0;
+  virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1) = 0;
+
+  template <typename Pixel>
+  void GetSources(Pixel **src0, Pixel **src1, Pixel * /*dst*/) {
+    switch (this->rng_(3)) {
+      case 0:  // Separate sources
+        *src0 = src0_;
+        *src1 = src1_;
+        break;
+      case 1:  // src0 == dst
+        *src0 = dst_tst_;
+        src0_stride_ = dst_stride_;
+        src0_offset_ = dst_offset_;
+        *src1 = src1_;
+        break;
+      case 2:  // src1 == dst
+        *src0 = src0_;
+        *src1 = dst_tst_;
+        src1_stride_ = dst_stride_;
+        src1_offset_ = dst_offset_;
+        break;
+      default: FAIL();
+    }
+  }
+
+  void GetSources(uint16_t **src0, uint16_t **src1, uint8_t * /*dst*/) {
+    *src0 = src0_;
+    *src1 = src1_;
+  }
+
+  uint8_t Rand1() { return this->rng_.Rand8() & 1; }
 
-  void Common() {
-    w_ = 1 << this->rng_(MAX_SB_SIZE_LOG2 + 1);
-    h_ = 1 << this->rng_(MAX_SB_SIZE_LOG2 + 1);
+  void RunTest() {
+    w_ = 4 << this->rng_(MAX_SB_SIZE_LOG2 - 1);
+    h_ = 4 << this->rng_(MAX_SB_SIZE_LOG2 - 1);
 
-    subx_ = this->rng_(2);
-    suby_ = this->rng_(2);
+    subx_ = Rand1();
+    suby_ = Rand1();
 
     dst_offset_ = this->rng_(33);
     dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
@@ -64,49 +94,35 @@ class BlendA64MaskTest : public FunctionEquivalenceTest<F> {
     mask_stride_ =
         this->rng_(kMaxWidth + 1 - w_ * (subx_ ? 2 : 1)) + w_ * (subx_ ? 2 : 1);
 
-    T *p_src0;
-    T *p_src1;
+    SrcPixel *p_src0;
+    SrcPixel *p_src1;
 
-    switch (this->rng_(3)) {
-      case 0:  // Separate sources
-        p_src0 = src0_;
-        p_src1 = src1_;
-        break;
-      case 1:  // src0 == dst
-        p_src0 = dst_tst_;
-        src0_stride_ = dst_stride_;
-        src0_offset_ = dst_offset_;
-        p_src1 = src1_;
-        break;
-      case 2:  // src1 == dst
-        p_src0 = src0_;
-        p_src1 = dst_tst_;
-        src1_stride_ = dst_stride_;
-        src1_offset_ = dst_offset_;
-        break;
-      default: FAIL();
-    }
+    p_src0 = src0_;
+    p_src1 = src1_;
+
+    GetSources(&p_src0, &p_src1, &dst_ref_[0]);
 
     Execute(p_src0, p_src1);
 
     for (int r = 0; r < h_; ++r) {
       for (int c = 0; c < w_; ++c) {
         ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
-                  dst_tst_[dst_offset_ + r * dst_stride_ + c]);
+                  dst_tst_[dst_offset_ + r * dst_stride_ + c])
+            << w_ << "x" << h_ << " r: " << r << " c: " << c;
       }
     }
   }
 
-  T dst_ref_[kBufSize];
-  T dst_tst_[kBufSize];
+  DstPixel dst_ref_[kBufSize];
+  DstPixel dst_tst_[kBufSize];
   uint32_t dst_stride_;
   uint32_t dst_offset_;
 
-  T src0_[kBufSize];
+  SrcPixel src0_[kBufSize];
   uint32_t src0_stride_;
   uint32_t src0_offset_;
 
-  T src1_[kBufSize];
+  SrcPixel src1_[kBufSize];
   uint32_t src1_stride_;
   uint32_t src1_offset_;
 
@@ -127,19 +143,19 @@ class BlendA64MaskTest : public FunctionEquivalenceTest<F> {
 typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
                     uint32_t src0_stride, const uint8_t *src1,
                     uint32_t src1_stride, const uint8_t *mask,
-                    uint32_t mask_stride, int h, int w, int suby, int subx);
+                    uint32_t mask_stride, int w, int h, int subx, int suby);
 typedef libaom_test::FuncParam<F8B> TestFuncs;
 
-class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t> {
+class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t, uint8_t> {
  protected:
   void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
     params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
                      src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
-                     kMaxMaskWidth, h_, w_, suby_, subx_);
+                     kMaxMaskWidth, w_, h_, subx_, suby_);
     ASM_REGISTER_STATE_CHECK(params_.tst_func(
         dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
         src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, kMaxMaskWidth,
-        h_, w_, suby_, subx_));
+        w_, h_, subx_, suby_));
   }
 };
 
@@ -156,7 +172,7 @@ TEST_P(BlendA64MaskTest8B, RandomValues) {
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
 
-    Common();
+    RunTest();
   }
 }
 
@@ -172,7 +188,7 @@ TEST_P(BlendA64MaskTest8B, ExtremeValues) {
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
 
-    Common();
+    RunTest();
   }
 }
 
@@ -182,7 +198,85 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, BlendA64MaskTest8B,
                             aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1)));
 #endif  // HAVE_SSE4_1
 
-#if CONFIG_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit _d16 version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B_D16)(uint8_t *dst, uint32_t dst_stride, const uint16_t *src0,
+                        uint32_t src0_stride, const uint16_t *src1,
+                        uint32_t src1_stride, const uint8_t *mask,
+                        uint32_t mask_stride, int w, int h, int subx, int suby,
+                        ConvolveParams *conv_params);
+typedef libaom_test::FuncParam<F8B_D16> TestFuncs_d16;
+
+class BlendA64MaskTest8B_d16
+    : public BlendA64MaskTest<F8B_D16, uint16_t, uint8_t> {
+ protected:
+  // max number of bits used by the source
+  static const int kSrcMaxBitsMask = 0x3fff;
+
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+    ConvolveParams conv_params;
+    conv_params.round_0 = ROUND0_BITS;
+    conv_params.round_1 = COMPOUND_ROUND1_BITS;
+    params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
+                     src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
+                     kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(
+        dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
+        src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, kMaxMaskWidth,
+        w_, h_, subx_, suby_, &conv_params));
+  }
+};
+
+TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < kBufSize; ++i) {
+      dst_ref_[i] = rng_.Rand8();
+      dst_tst_[i] = rng_.Rand8();
+
+      src0_[i] = rng_.Rand16() & kSrcMaxBitsMask;
+      src1_[i] = rng_.Rand16() & kSrcMaxBitsMask;
+    }
+
+    for (int i = 0; i < kMaxMaskSize; ++i)
+      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+    RunTest();
+  }
+}
+
+TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < kBufSize; ++i) {
+      dst_ref_[i] = 255;
+      dst_tst_[i] = 255;
+
+      src0_[i] = kSrcMaxBitsMask;
+      src1_[i] = kSrcMaxBitsMask;
+    }
+
+    for (int i = 0; i < kMaxMaskSize; ++i)
+      mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1;
+
+    RunTest();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, BlendA64MaskTest8B_d16,
+    ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
+                                    aom_lowbd_blend_a64_d16_mask_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, BlendA64MaskTest8B_d16,
+    ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
+                                    aom_lowbd_blend_a64_d16_mask_neon)));
+#endif  // HAVE_NEON
+
 //////////////////////////////////////////////////////////////////////////////
 // High bit-depth version
 //////////////////////////////////////////////////////////////////////////////
@@ -190,22 +284,22 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, BlendA64MaskTest8B,
 typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
                      uint32_t src0_stride, const uint8_t *src1,
                      uint32_t src1_stride, const uint8_t *mask,
-                     uint32_t mask_stride, int h, int w, int suby, int subx,
+                     uint32_t mask_stride, int w, int h, int subx, int suby,
                      int bd);
 typedef libaom_test::FuncParam<FHBD> TestFuncsHBD;
 
-class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t> {
+class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t, uint16_t> {
  protected:
   void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
     params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
                      CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
                      CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
-                     mask_, kMaxMaskWidth, h_, w_, suby_, subx_, bit_depth_);
+                     mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_);
     ASM_REGISTER_STATE_CHECK(params_.tst_func(
         CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
         CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
         CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, mask_,
-        kMaxMaskWidth, h_, w_, suby_, subx_, bit_depth_));
+        kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_));
   }
 
   int bit_depth_;
@@ -231,7 +325,7 @@ TEST_P(BlendA64MaskTestHBD, RandomValues) {
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
 
-    Common();
+    RunTest();
   }
 }
 
@@ -256,7 +350,7 @@ TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
 
-    Common();
+    RunTest();
   }
 }
 
@@ -266,5 +360,104 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c,
                                    aom_highbd_blend_a64_mask_sse4_1)));
 #endif  // HAVE_SSE4_1
-#endif  // CONFIG_HIGHBITDEPTH
+
+//////////////////////////////////////////////////////////////////////////////
+// HBD _d16 version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD_D16)(uint8_t *dst, uint32_t dst_stride,
+                         const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+                         const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+                         const uint8_t *mask, uint32_t mask_stride, int w,
+                         int h, int subx, int suby, ConvolveParams *conv_params,
+                         const int bd);
+typedef libaom_test::FuncParam<FHBD_D16> TestFuncsHBD_d16;
+
+class BlendA64MaskTestHBD_d16
+    : public BlendA64MaskTest<FHBD_D16, uint16_t, uint16_t> {
+ protected:
+  // max number of bits used by the source
+  static const int kSrcMaxBitsMask = (1 << 14) - 1;
+  static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1;
+
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+    ConvolveParams conv_params;
+    conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+    conv_params.round_1 = COMPOUND_ROUND1_BITS;
+
+    params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+                     p_src0 + src0_offset_, src0_stride_, p_src1 + src1_offset_,
+                     src1_stride_, mask_, kMaxMaskWidth, w_, h_, subx_, suby_,
+                     &conv_params, bit_depth_);
+    if (params_.tst_func) {
+      ASM_REGISTER_STATE_CHECK(params_.tst_func(
+          CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+          p_src0 + src0_offset_, src0_stride_, p_src1 + src1_offset_,
+          src1_stride_, mask_, kMaxMaskWidth, w_, h_, subx_, suby_,
+          &conv_params, bit_depth_));
+    }
+  }
+
+  int bit_depth_;
+  int src_max_bits_mask_;
+};
+
+TEST_P(BlendA64MaskTestHBD_d16, RandomValues) {
+  if (params_.tst_func == NULL) return;
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+      case 0: bit_depth_ = 8; break;
+      case 1: bit_depth_ = 10; break;
+      default: bit_depth_ = 12; break;
+    }
+    src_max_bits_mask_ =
+        (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
+
+    for (int i = 0; i < kBufSize; ++i) {
+      dst_ref_[i] = rng_.Rand8();
+      dst_tst_[i] = rng_.Rand8();
+
+      src0_[i] = rng_.Rand16() & src_max_bits_mask_;
+      src1_[i] = rng_.Rand16() & src_max_bits_mask_;
+    }
+
+    for (int i = 0; i < kMaxMaskSize; ++i)
+      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+    RunTest();
+  }
+}
+
+TEST_P(BlendA64MaskTestHBD_d16, SaturatedValues) {
+  for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+    src_max_bits_mask_ =
+        (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
+
+    for (int i = 0; i < kBufSize; ++i) {
+      dst_ref_[i] = 0;
+      dst_tst_[i] = (1 << bit_depth_) - 1;
+
+      src0_[i] = src_max_bits_mask_;
+      src1_[i] = src_max_bits_mask_;
+    }
+
+    for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+    RunTest();
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    C, BlendA64MaskTestHBD_d16,
+    ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, NULL)));
+
+// TODO(slavarnway): Enable the following in the avx2 commit. (56501)
+#if 0
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, BlendA64MaskTestHBD,
+    ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c,
+                                   aom_highbd_blend_a64_mask_avx2)));
+#endif  // HAVE_AVX2
+#endif
 }  // namespace
diff --git a/third_party/aom/test/block_error_test.cc b/third_party/aom/test/block_error_test.cc
deleted file mode 100644
index 4364af422..000000000
--- a/third_party/aom/test/block_error_test.cc
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
-
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-
-namespace {
-using libaom_test::ACMRandom;
-
-typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
-                                  const tran_low_t *dqcoeff, intptr_t size,
-                                  int64_t *ssz);
-#if CONFIG_HIGHBITDEPTH
-typedef int64_t (*HbdBlockErrorFunc)(const tran_low_t *coeff,
-                                     const tran_low_t *dqcoeff, intptr_t size,
-                                     int64_t *ssz, int bd);
-#endif
-
-typedef std::tr1::tuple<BlockErrorFunc, BlockErrorFunc, TX_SIZE,
-                        aom_bit_depth_t>
-    BlockErrorParam;
-
-const int kTestNum = 10000;
-
-class BlockErrorTest : public ::testing::TestWithParam<BlockErrorParam> {
- public:
-  BlockErrorTest()
-      : blk_err_ref_(GET_PARAM(0)), blk_err_(GET_PARAM(1)),
-        tx_size_(GET_PARAM(2)), bd_(GET_PARAM(3)) {}
-
-  virtual ~BlockErrorTest() {}
-
-  virtual void SetUp() {
-    const intptr_t block_size = getCoeffNum();
-    coeff_ = reinterpret_cast<tran_low_t *>(
-        aom_memalign(16, 2 * block_size * sizeof(tran_low_t)));
-  }
-
-  virtual void TearDown() {
-    aom_free(coeff_);
-    coeff_ = NULL;
-    libaom_test::ClearSystemState();
-  }
-
-  void BlockErrorRun(int testNum) {
-    int i;
-    int64_t error_ref, error;
-    int64_t sse_ref, sse;
-    const intptr_t block_size = getCoeffNum();
-    tran_low_t *dqcoeff = coeff_ + block_size;
-    for (i = 0; i < testNum; ++i) {
-      FillRandomData();
-
-      error_ref = blk_err_ref_(coeff_, dqcoeff, block_size, &sse_ref);
-      ASM_REGISTER_STATE_CHECK(error =
-                                   blk_err_(coeff_, dqcoeff, block_size, &sse));
-
-      EXPECT_EQ(error_ref, error) << "Error doesn't match on test: " << i;
-      EXPECT_EQ(sse_ref, sse) << "SSE doesn't match on test: " << i;
-    }
-  }
-
-  intptr_t getCoeffNum() { return tx_size_2d[tx_size_]; }
-
-  void FillRandomData() {
-    const intptr_t block_size = getCoeffNum();
-    tran_low_t *dqcoeff = coeff_ + block_size;
-    intptr_t i;
-    int16_t margin = 512;
-    for (i = 0; i < block_size; ++i) {
-      coeff_[i] = GetRandomNumWithRange(INT16_MIN + margin, INT16_MAX - margin);
-      dqcoeff[i] = coeff_[i] + GetRandomDeltaWithRange(margin);
-    }
-  }
-
-  void FillConstantData() {
-    const intptr_t block_size = getCoeffNum();
-    tran_low_t *dqcoeff = coeff_ + block_size;
-    intptr_t i;
-    for (i = 0; i < block_size; ++i) {
-      coeff_[i] = 5;
-      dqcoeff[i] = 7;
-    }
-  }
-
-  tran_low_t GetRandomNumWithRange(int16_t min, int16_t max) {
-    return clamp((int16_t)rnd_.Rand16(), min, max);
-  }
-
-  tran_low_t GetRandomDeltaWithRange(int16_t delta) {
-    tran_low_t value = (int16_t)rnd_.Rand16();
-    value %= delta;
-    return value;
-  }
-
-  BlockErrorFunc blk_err_ref_;
-  BlockErrorFunc blk_err_;
-  TX_SIZE tx_size_;
-  aom_bit_depth_t bd_;
-  ACMRandom rnd_;
-  tran_low_t *coeff_;
-};
-
-TEST_P(BlockErrorTest, BitExact) { BlockErrorRun(kTestNum); }
-
-using std::tr1::make_tuple;
-
-#if !CONFIG_HIGHBITDEPTH && HAVE_SSE2
-const BlockErrorParam kBlkErrParamArraySse2[] = { make_tuple(
-    &av1_block_error_c, &av1_block_error_sse2, TX_32X32, AOM_BITS_8) };
-INSTANTIATE_TEST_CASE_P(SSE2, BlockErrorTest,
-                        ::testing::ValuesIn(kBlkErrParamArraySse2));
-#endif
-
-#if HAVE_AVX2
-const BlockErrorParam kBlkErrParamArrayAvx2[] = { make_tuple(
-    &av1_block_error_c, &av1_block_error_avx2, TX_32X32, AOM_BITS_8) };
-INSTANTIATE_TEST_CASE_P(AVX2, BlockErrorTest,
-                        ::testing::ValuesIn(kBlkErrParamArrayAvx2));
-#endif
-}  // namespace
diff --git a/third_party/aom/test/boolcoder_test.cc b/third_party/aom/test/boolcoder_test.cc
index 916a54427..72182de10 100644
--- a/third_party/aom/test/boolcoder_test.cc
+++ b/third_party/aom/test/boolcoder_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <math.h>
 #include <stdlib.h>
@@ -69,7 +69,7 @@ TEST(AV1, TestBitIO) {
         aom_stop_encode(&bw);
 
         aom_reader br;
-        aom_reader_init(&br, bw_buffer, bw.pos, NULL, NULL);
+        aom_reader_init(&br, bw_buffer, bw.pos);
         bit_rnd.Reset(random_seed);
         for (int i = 0; i < kBitsToTest; ++i) {
           if (bit_method == 2) {
@@ -86,7 +86,7 @@ TEST(AV1, TestBitIO) {
   }
 }
 
-#define FRAC_DIFF_TOTAL_ERROR 0.16
+#define FRAC_DIFF_TOTAL_ERROR 0.18
 
 TEST(AV1, TestTell) {
   const int kBufferSize = 10000;
@@ -102,7 +102,7 @@ TEST(AV1, TestTell) {
     }
     aom_stop_encode(&bw);
     aom_reader br;
-    aom_reader_init(&br, bw_buffer, bw.pos, NULL, NULL);
+    aom_reader_init(&br, bw_buffer, bw.pos);
     uint32_t last_tell = aom_reader_tell(&br);
     uint32_t last_tell_frac = aom_reader_tell_frac(&br);
     double frac_diff_total = 0;
diff --git a/third_party/aom/test/borders_test.cc b/third_party/aom/test/borders_test.cc
index ee771707c..893237ef3 100644
--- a/third_party/aom/test/borders_test.cc
+++ b/third_party/aom/test/borders_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <climits>
 #include <vector>
@@ -19,12 +19,12 @@
 
 namespace {
 
-class BordersTest
+class BordersTestLarge
     : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
       public ::libaom_test::EncoderTest {
  protected:
-  BordersTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~BordersTest() {}
+  BordersTestLarge() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~BordersTestLarge() {}
 
   virtual void SetUp() {
     InitializeConfig();
@@ -47,7 +47,7 @@ class BordersTest
   }
 };
 
-TEST_P(BordersTest, TestEncodeHighBitrate) {
+TEST_P(BordersTestLarge, TestEncodeHighBitrate) {
   // Validate that this non multiple of 64 wide clip encodes and decodes
   // without a mismatch when passing in a very low max q.  This pushes
   // the encoder to producing lots of big partitions which will likely
@@ -63,7 +63,7 @@ TEST_P(BordersTest, TestEncodeHighBitrate) {
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
-TEST_P(BordersTest, TestLowBitrate) {
+TEST_P(BordersTestLarge, TestLowBitrate) {
   // Validate that this clip encodes and decodes without a mismatch
   // when passing in a very high min q.  This pushes the encoder to producing
   // lots of small partitions which might will test the other condition.
@@ -80,6 +80,6 @@ TEST_P(BordersTest, TestLowBitrate) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-AV1_INSTANTIATE_TEST_CASE(BordersTest,
+AV1_INSTANTIATE_TEST_CASE(BordersTestLarge,
                           ::testing::Values(::libaom_test::kTwoPassGood));
 }  // namespace
diff --git a/third_party/aom/test/cdef_test.cc b/third_party/aom/test/cdef_test.cc
index b6250b6e9..becc07291 100644
--- a/third_party/aom/test/cdef_test.cc
+++ b/third_party/aom/test/cdef_test.cc
@@ -7,15 +7,16 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <cstdlib>
 #include <string>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
 #include "aom_ports/aom_timer.h"
 #include "av1/common/cdef_block.h"
 #include "test/acm_random.h"
@@ -27,7 +28,8 @@ using libaom_test::ACMRandom;
 
 namespace {
 
-typedef std::tr1::tuple<cdef_filter_block_func, cdef_filter_block_func, int>
+typedef ::testing::tuple<cdef_filter_block_func, cdef_filter_block_func,
+                         BLOCK_SIZE, int, int>
     cdef_dir_param_t;
 
 class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
@@ -37,12 +39,16 @@ class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
     cdef = GET_PARAM(0);
     ref_cdef = GET_PARAM(1);
     bsize = GET_PARAM(2);
+    boundary = GET_PARAM(3);
+    depth = GET_PARAM(4);
   }
 
   virtual void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
   int bsize;
+  int boundary;
+  int depth;
   cdef_filter_block_func cdef;
   cdef_filter_block_func ref_cdef;
 };
@@ -50,7 +56,7 @@ class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
 typedef CDEFBlockTest CDEFSpeedTest;
 
 void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
-               cdef_filter_block_func ref_cdef) {
+               cdef_filter_block_func ref_cdef, int boundary, int depth) {
   const int size = 8;
   const int ysize = size + 2 * CDEF_VBORDER;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -61,80 +67,73 @@ void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
   memset(d, 0, sizeof(d));
 
   int error = 0, pristrength = 0, secstrength, dir;
-  int boundary, pridamping, secdamping, depth, bits, level, count,
+  int pridamping, secdamping, bits, level, count,
       errdepth = 0, errpristrength = 0, errsecstrength = 0, errboundary = 0,
       errpridamping = 0, errsecdamping = 0;
   unsigned int pos = 0;
 
-  for (boundary = 0; boundary < 16; boundary++) {
-    for (depth = 8; depth <= 12; depth += 2) {
-      const unsigned int max_pos = size * size >> (depth == 8);
-      for (pridamping = 3 + depth - 8;
-           pridamping < 7 - 3 * !!boundary + depth - 8; pridamping++) {
-        for (secdamping = 3 + depth - 8;
-             secdamping < 7 - 3 * !!boundary + depth - 8; secdamping++) {
-          for (count = 0; count < iterations; count++) {
-            for (level = 0; level < (1 << depth) && !error;
-                 level += (2 + 6 * !!boundary) << (depth - 8)) {
-              for (bits = 1; bits <= depth && !error;
-                   bits += 1 + 3 * !!boundary) {
-                for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
-                  s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
-                               (1 << depth) - 1);
-                if (boundary) {
-                  if (boundary & 1) {  // Left
-                    for (int i = 0; i < ysize; i++)
-                      for (int j = 0; j < CDEF_HBORDER; j++)
-                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
-                  }
-                  if (boundary & 2) {  // Right
-                    for (int i = 0; i < ysize; i++)
-                      for (int j = CDEF_HBORDER + size; j < CDEF_BSTRIDE; j++)
-                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
-                  }
-                  if (boundary & 4) {  // Above
-                    for (int i = 0; i < CDEF_VBORDER; i++)
-                      for (int j = 0; j < CDEF_BSTRIDE; j++)
-                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
-                  }
-                  if (boundary & 8) {  // Below
-                    for (int i = CDEF_VBORDER + size; i < ysize; i++)
-                      for (int j = 0; j < CDEF_BSTRIDE; j++)
-                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
-                  }
-                }
-                for (dir = 0; dir < 8; dir++) {
-                  for (pristrength = 0;
-                       pristrength <= 19 << (depth - 8) && !error;
-                       pristrength += (1 + 4 * !!boundary) << (depth - 8)) {
-                    if (pristrength == 16) pristrength = 19;
-                    for (secstrength = 0;
-                         secstrength <= 4 << (depth - 8) && !error;
-                         secstrength += 1 << (depth - 8)) {
-                      if (secstrength == 3 << (depth - 8)) continue;
-                      ref_cdef(depth == 8 ? (uint8_t *)ref_d : 0, ref_d, size,
-                               s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
-                               pristrength, secstrength, dir, pridamping,
-                               secdamping, bsize, (1 << depth) - 1);
-                      // If cdef and ref_cdef are the same, we're just testing
-                      // speed
-                      if (cdef != ref_cdef)
-                        ASM_REGISTER_STATE_CHECK(
-                            cdef(depth == 8 ? (uint8_t *)d : 0, d, size,
-                                 s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
-                                 pristrength, secstrength, dir, pridamping,
-                                 secdamping, bsize, (1 << depth) - 1));
-                      if (ref_cdef != cdef) {
-                        for (pos = 0; pos < max_pos && !error; pos++) {
-                          error = ref_d[pos] != d[pos];
-                          errdepth = depth;
-                          errpristrength = pristrength;
-                          errsecstrength = secstrength;
-                          errboundary = boundary;
-                          errpridamping = pridamping;
-                          errsecdamping = secdamping;
-                        }
-                      }
+  const unsigned int max_pos = size * size >> static_cast<int>(depth == 8);
+  for (pridamping = 3 + depth - 8; pridamping < 7 - 3 * !!boundary + depth - 8;
+       pridamping++) {
+    for (secdamping = 3 + depth - 8;
+         secdamping < 7 - 3 * !!boundary + depth - 8; secdamping++) {
+      for (count = 0; count < iterations; count++) {
+        for (level = 0; level < (1 << depth) && !error;
+             level += (2 + 6 * !!boundary) << (depth - 8)) {
+          for (bits = 1; bits <= depth && !error; bits += 1 + 3 * !!boundary) {
+            for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+              s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+                           (1 << depth) - 1);
+            if (boundary) {
+              if (boundary & 1) {  // Left
+                for (int i = 0; i < ysize; i++)
+                  for (int j = 0; j < CDEF_HBORDER; j++)
+                    s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+              }
+              if (boundary & 2) {  // Right
+                for (int i = 0; i < ysize; i++)
+                  for (int j = CDEF_HBORDER + size; j < CDEF_BSTRIDE; j++)
+                    s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+              }
+              if (boundary & 4) {  // Above
+                for (int i = 0; i < CDEF_VBORDER; i++)
+                  for (int j = 0; j < CDEF_BSTRIDE; j++)
+                    s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+              }
+              if (boundary & 8) {  // Below
+                for (int i = CDEF_VBORDER + size; i < ysize; i++)
+                  for (int j = 0; j < CDEF_BSTRIDE; j++)
+                    s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+              }
+            }
+            for (dir = 0; dir < 8; dir++) {
+              for (pristrength = 0; pristrength <= 19 << (depth - 8) && !error;
+                   pristrength += (1 + 4 * !!boundary) << (depth - 8)) {
+                if (pristrength == 16) pristrength = 19;
+                for (secstrength = 0; secstrength <= 4 << (depth - 8) && !error;
+                     secstrength += 1 << (depth - 8)) {
+                  if (secstrength == 3 << (depth - 8)) continue;
+                  ref_cdef(depth == 8 ? (uint8_t *)ref_d : 0, ref_d, size,
+                           s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+                           pristrength, secstrength, dir, pridamping,
+                           secdamping, bsize, (1 << depth) - 1, depth - 8);
+                  // If cdef and ref_cdef are the same, we're just testing
+                  // speed
+                  if (cdef != ref_cdef)
+                    ASM_REGISTER_STATE_CHECK(
+                        cdef(depth == 8 ? (uint8_t *)d : 0, d, size,
+                             s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+                             pristrength, secstrength, dir, pridamping,
+                             secdamping, bsize, (1 << depth) - 1, depth - 8));
+                  if (ref_cdef != cdef) {
+                    for (pos = 0; pos < max_pos && !error; pos++) {
+                      error = ref_d[pos] != d[pos];
+                      errdepth = depth;
+                      errpristrength = pristrength;
+                      errsecstrength = secstrength;
+                      errboundary = boundary;
+                      errpridamping = pridamping;
+                      errsecdamping = secdamping;
                     }
                   }
                 }
@@ -145,6 +144,7 @@ void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
       }
     }
   }
+
   pos--;
   EXPECT_EQ(0, error) << "Error: CDEFBlockTest, SIMD and C mismatch."
                       << std::endl
@@ -162,25 +162,20 @@ void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
 }
 
 void test_cdef_speed(int bsize, int iterations, cdef_filter_block_func cdef,
-                     cdef_filter_block_func ref_cdef) {
+                     cdef_filter_block_func ref_cdef, int boundary, int depth) {
   aom_usec_timer ref_timer;
   aom_usec_timer timer;
 
   aom_usec_timer_start(&ref_timer);
-  test_cdef(bsize, iterations, ref_cdef, ref_cdef);
+  test_cdef(bsize, iterations, ref_cdef, ref_cdef, boundary, depth);
   aom_usec_timer_mark(&ref_timer);
   int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
 
   aom_usec_timer_start(&timer);
-  test_cdef(bsize, iterations, cdef, cdef);
+  test_cdef(bsize, iterations, cdef, cdef, boundary, depth);
   aom_usec_timer_mark(&timer);
   int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
 
-#if 0
-  std::cout << "[          ] C time = " << ref_elapsed_time / 1000
-            << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
-#endif
-
   EXPECT_GT(ref_elapsed_time, elapsed_time)
       << "Error: CDEFSpeedTest, SIMD slower than C." << std::endl
       << "C time: " << ref_elapsed_time << " us" << std::endl
@@ -190,7 +185,7 @@ void test_cdef_speed(int bsize, int iterations, cdef_filter_block_func cdef,
 typedef int (*find_dir_t)(const uint16_t *img, int stride, int32_t *var,
                           int coeff_shift);
 
-typedef std::tr1::tuple<find_dir_t, find_dir_t> find_dir_param_t;
+typedef ::testing::tuple<find_dir_t, find_dir_t> find_dir_param_t;
 
 class CDEFFindDirTest : public ::testing::TestWithParam<find_dir_param_t> {
  public:
@@ -268,11 +263,6 @@ void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride,
   aom_usec_timer_mark(&timer);
   int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
 
-#if 0
-  std::cout << "[          ] C time = " << ref_elapsed_time / 1000
-            << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
-#endif
-
   EXPECT_GT(ref_elapsed_time, elapsed_time)
       << "Error: CDEFFindDirSpeedTest, SIMD slower than C." << std::endl
       << "C time: " << ref_elapsed_time << " us" << std::endl
@@ -280,11 +270,11 @@ void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride,
 }
 
 TEST_P(CDEFBlockTest, TestSIMDNoMismatch) {
-  test_cdef(bsize, 1, cdef, ref_cdef);
+  test_cdef(bsize, 1, cdef, ref_cdef, boundary, depth);
 }
 
 TEST_P(CDEFSpeedTest, DISABLED_TestSpeed) {
-  test_cdef_speed(bsize, 4, cdef, ref_cdef);
+  test_cdef_speed(bsize, 4, cdef, ref_cdef, boundary, depth);
 }
 
 TEST_P(CDEFFindDirTest, TestSIMDNoMismatch) {
@@ -295,7 +285,7 @@ TEST_P(CDEFFindDirSpeedTest, DISABLED_TestSpeed) {
   test_finddir_speed(finddir, ref_finddir);
 }
 
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
 
 // VS compiling for 32 bit targets does not support vector types in
 // structs as arguments, which makes the v256 type of the intrinsics
@@ -304,9 +294,11 @@ using std::tr1::make_tuple;
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, CDEFBlockTest,
-    ::testing::Values(
-        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X4),
-        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X8)));
+    ::testing::Combine(::testing::Values(&cdef_filter_block_sse2),
+                       ::testing::Values(&cdef_filter_block_c),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
 INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirTest,
                         ::testing::Values(make_tuple(&cdef_find_dir_sse2,
                                                      &cdef_find_dir_c)));
@@ -314,9 +306,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirTest,
 #if HAVE_SSSE3
 INSTANTIATE_TEST_CASE_P(
     SSSE3, CDEFBlockTest,
-    ::testing::Values(
-        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X4),
-        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X8)));
+    ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3),
+                       ::testing::Values(&cdef_filter_block_c),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
 INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirTest,
                         ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
                                                      &cdef_find_dir_c)));
@@ -325,10 +319,11 @@ INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirTest,
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, CDEFBlockTest,
-    ::testing::Values(make_tuple(&cdef_filter_block_sse4_1,
-                                 &cdef_filter_block_c, BLOCK_4X4),
-                      make_tuple(&cdef_filter_block_sse4_1,
-                                 &cdef_filter_block_c, BLOCK_8X8)));
+    ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1),
+                       ::testing::Values(&cdef_filter_block_c),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
 INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirTest,
                         ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
                                                      &cdef_find_dir_c)));
@@ -337,9 +332,11 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirTest,
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(
     AVX2, CDEFBlockTest,
-    ::testing::Values(
-        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X4),
-        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X8)));
+    ::testing::Combine(::testing::Values(&cdef_filter_block_avx2),
+                       ::testing::Values(&cdef_filter_block_c),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
 INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirTest,
                         ::testing::Values(make_tuple(&cdef_find_dir_avx2,
                                                      &cdef_find_dir_c)));
@@ -348,9 +345,11 @@ INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirTest,
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(
     NEON, CDEFBlockTest,
-    ::testing::Values(
-        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_4X4),
-        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_8X8)));
+    ::testing::Combine(::testing::Values(&cdef_filter_block_neon),
+                       ::testing::Values(&cdef_filter_block_c),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
 INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirTest,
                         ::testing::Values(make_tuple(&cdef_find_dir_neon,
                                                      &cdef_find_dir_c)));
@@ -360,9 +359,11 @@ INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirTest,
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, CDEFSpeedTest,
-    ::testing::Values(
-        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X4),
-        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X8)));
+    ::testing::Combine(::testing::Values(&cdef_filter_block_sse2),
+                       ::testing::Values(&cdef_filter_block_c),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
 INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirSpeedTest,
                         ::testing::Values(make_tuple(&cdef_find_dir_sse2,
                                                      &cdef_find_dir_c)));
@@ -371,9 +372,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirSpeedTest,
 #if HAVE_SSSE3
 INSTANTIATE_TEST_CASE_P(
     SSSE3, CDEFSpeedTest,
-    ::testing::Values(
-        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X4),
-        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X8)));
+    ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3),
+                       ::testing::Values(&cdef_filter_block_c),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
 INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirSpeedTest,
                         ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
                                                      &cdef_find_dir_c)));
@@ -382,10 +385,11 @@ INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirSpeedTest,
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, CDEFSpeedTest,
-    ::testing::Values(make_tuple(&cdef_filter_block_sse4_1,
-                                 &cdef_filter_block_c, BLOCK_4X4),
-                      make_tuple(&cdef_filter_block_sse4_1,
-                                 &cdef_filter_block_c, BLOCK_8X8)));
+    ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1),
+                       ::testing::Values(&cdef_filter_block_c),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
 INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirSpeedTest,
                         ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
                                                      &cdef_find_dir_c)));
@@ -394,9 +398,11 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirSpeedTest,
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(
     AVX2, CDEFSpeedTest,
-    ::testing::Values(
-        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X4),
-        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X8)));
+    ::testing::Combine(::testing::Values(&cdef_filter_block_avx2),
+                       ::testing::Values(&cdef_filter_block_c),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
 INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirSpeedTest,
                         ::testing::Values(make_tuple(&cdef_find_dir_avx2,
                                                      &cdef_find_dir_c)));
@@ -405,9 +411,11 @@ INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirSpeedTest,
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(
     NEON, CDEFSpeedTest,
-    ::testing::Values(
-        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_4X4),
-        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_8X8)));
+    ::testing::Combine(::testing::Values(&cdef_filter_block_neon),
+                       ::testing::Values(&cdef_filter_block_c),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
 INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirSpeedTest,
                         ::testing::Values(make_tuple(&cdef_find_dir_neon,
                                                      &cdef_find_dir_c)));
diff --git a/third_party/aom/test/cfl_test.cc b/third_party/aom/test/cfl_test.cc
new file mode 100644
index 000000000..e4d438d6a
--- /dev/null
+++ b/third_party/aom/test/cfl_test.cc
@@ -0,0 +1,567 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "test/util.h"
+#include "test/acm_random.h"
+
+using ::testing::make_tuple;
+
+using libaom_test::ACMRandom;
+
+#define NUM_ITERATIONS (100)
+#define NUM_ITERATIONS_SPEED (INT16_MAX)
+
+#define ALL_CFL_TX_SIZES(function)                                     \
+  make_tuple(TX_4X4, &function), make_tuple(TX_4X8, &function),        \
+      make_tuple(TX_4X16, &function), make_tuple(TX_8X4, &function),   \
+      make_tuple(TX_8X8, &function), make_tuple(TX_8X16, &function),   \
+      make_tuple(TX_8X32, &function), make_tuple(TX_16X4, &function),  \
+      make_tuple(TX_16X8, &function), make_tuple(TX_16X16, &function), \
+      make_tuple(TX_16X32, &function), make_tuple(TX_32X8, &function), \
+      make_tuple(TX_32X16, &function), make_tuple(TX_32X32, &function)
+
+#define ALL_CFL_TX_SIZES_SUBSAMPLE(fun420, fun422, fun444) \
+  make_tuple(TX_4X4, &fun420, &fun422, &fun444),           \
+      make_tuple(TX_4X8, &fun420, &fun422, &fun444),       \
+      make_tuple(TX_4X16, &fun420, &fun422, &fun444),      \
+      make_tuple(TX_8X4, &fun420, &fun422, &fun444),       \
+      make_tuple(TX_8X8, &fun420, &fun422, &fun444),       \
+      make_tuple(TX_8X16, &fun420, &fun422, &fun444),      \
+      make_tuple(TX_8X32, &fun420, &fun422, &fun444),      \
+      make_tuple(TX_16X4, &fun420, &fun422, &fun444),      \
+      make_tuple(TX_16X8, &fun420, &fun422, &fun444),      \
+      make_tuple(TX_16X16, &fun420, &fun422, &fun444),     \
+      make_tuple(TX_16X32, &fun420, &fun422, &fun444),     \
+      make_tuple(TX_32X8, &fun420, &fun422, &fun444),      \
+      make_tuple(TX_32X16, &fun420, &fun422, &fun444),     \
+      make_tuple(TX_32X32, &fun420, &fun422, &fun444)
+
+namespace {
+
+template <typename A>
+static void assert_eq(const A *a, const A *b, int width, int height) {
+  for (int j = 0; j < height; j++) {
+    for (int i = 0; i < width; i++) {
+      ASSERT_EQ(a[j * CFL_BUF_LINE + i], b[j * CFL_BUF_LINE + i]);
+    }
+  }
+}
+
+static void assertFaster(int ref_elapsed_time, int elapsed_time) {
+  EXPECT_GT(ref_elapsed_time, elapsed_time)
+      << "Error: CFLSubtractSpeedTest, SIMD slower than C." << std::endl
+      << "C time: " << ref_elapsed_time << " us" << std::endl
+      << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+static void printSpeed(int ref_elapsed_time, int elapsed_time, int width,
+                       int height) {
+  std::cout.precision(2);
+  std::cout << "[          ] " << width << "x" << height
+            << ": C time = " << ref_elapsed_time
+            << " us, SIMD time = " << elapsed_time << " us"
+            << " (~" << ref_elapsed_time / (double)elapsed_time << "x) "
+            << std::endl;
+}
+
+class CFLTest {
+ public:
+  virtual ~CFLTest() {}
+  void init(TX_SIZE tx) {
+    tx_size = tx;
+    width = tx_size_wide[tx_size];
+    height = tx_size_high[tx_size];
+    rnd(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  TX_SIZE tx_size;
+  int width;
+  int height;
+  ACMRandom rnd;
+};
+
+template <typename I>
+class CFLTestWithData : public CFLTest {
+ public:
+  virtual ~CFLTestWithData() {}
+
+ protected:
+  I data[CFL_BUF_SQUARE];
+  I data_ref[CFL_BUF_SQUARE];
+  void randData(I (ACMRandom::*random)()) {
+    for (int j = 0; j < this->height; j++) {
+      for (int i = 0; i < this->width; i++) {
+        const I d = (this->rnd.*random)();
+        data[j * CFL_BUF_LINE + i] = d;
+        data_ref[j * CFL_BUF_LINE + i] = d;
+      }
+    }
+  }
+};
+
+template <typename I>
+class CFLTestWithAlignedData : public CFLTest {
+ public:
+  CFLTestWithAlignedData() {
+    chroma_pels_ref =
+        reinterpret_cast<I *>(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE));
+    chroma_pels =
+        reinterpret_cast<I *>(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE));
+    sub_luma_pels_ref = reinterpret_cast<int16_t *>(
+        aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE));
+    sub_luma_pels = reinterpret_cast<int16_t *>(
+        aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE));
+    memset(chroma_pels_ref, 0, sizeof(I) * CFL_BUF_SQUARE);
+    memset(chroma_pels, 0, sizeof(I) * CFL_BUF_SQUARE);
+    memset(sub_luma_pels_ref, 0, sizeof(int16_t) * CFL_BUF_SQUARE);
+    memset(sub_luma_pels, 0, sizeof(int16_t) * CFL_BUF_SQUARE);
+  }
+  ~CFLTestWithAlignedData() {
+    aom_free(chroma_pels_ref);
+    aom_free(sub_luma_pels_ref);
+    aom_free(chroma_pels);
+    aom_free(sub_luma_pels);
+  }
+
+ protected:
+  I *chroma_pels_ref;
+  I *chroma_pels;
+  int16_t *sub_luma_pels_ref;
+  int16_t *sub_luma_pels;
+  int alpha_q3;
+  I dc;
+  void randData(int bd) {
+    alpha_q3 = this->rnd(33) - 16;
+    dc = this->rnd(1 << bd);
+    for (int j = 0; j < this->height; j++) {
+      for (int i = 0; i < this->width; i++) {
+        chroma_pels[j * CFL_BUF_LINE + i] = dc;
+        chroma_pels_ref[j * CFL_BUF_LINE + i] = dc;
+        sub_luma_pels_ref[j * CFL_BUF_LINE + i] =
+            sub_luma_pels[j * CFL_BUF_LINE + i] = this->rnd(1 << (bd + 3));
+      }
+    }
+  }
+};
+
+typedef cfl_subtract_average_fn (*sub_avg_fn)(TX_SIZE tx_size);
+typedef ::testing::tuple<TX_SIZE, sub_avg_fn> sub_avg_param;
+class CFLSubAvgTest : public ::testing::TestWithParam<sub_avg_param>,
+                      public CFLTestWithData<int16_t> {
+ public:
+  virtual void SetUp() {
+    CFLTest::init(::testing::get<0>(this->GetParam()));
+    sub_avg = ::testing::get<1>(this->GetParam())(tx_size);
+    sub_avg_ref = get_subtract_average_fn_c(tx_size);
+  }
+  virtual ~CFLSubAvgTest() {}
+
+ protected:
+  cfl_subtract_average_fn sub_avg;
+  cfl_subtract_average_fn sub_avg_ref;
+};
+
+TEST_P(CFLSubAvgTest, SubAvgTest) {
+  for (int it = 0; it < NUM_ITERATIONS; it++) {
+    randData(&ACMRandom::Rand15Signed);
+    sub_avg((uint16_t *)data, data);
+    sub_avg_ref((uint16_t *)data_ref, data_ref);
+    assert_eq<int16_t>(data, data_ref, width, height);
+  }
+}
+
+TEST_P(CFLSubAvgTest, DISABLED_SubAvgSpeedTest) {
+  aom_usec_timer ref_timer;
+  aom_usec_timer timer;
+  randData(&ACMRandom::Rand15Signed);
+  aom_usec_timer_start(&ref_timer);
+  for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+    sub_avg_ref((uint16_t *)data_ref, data_ref);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+  aom_usec_timer_start(&timer);
+  for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+    sub_avg((uint16_t *)data, data);
+  }
+  aom_usec_timer_mark(&timer);
+  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+  printSpeed(ref_elapsed_time, elapsed_time, width, height);
+  assertFaster(ref_elapsed_time, elapsed_time);
+}
+
+template <typename S, typename T, typename I>
+class CFLSubsampleTest : public ::testing::TestWithParam<S>,
+                         public CFLTestWithData<I> {
+ public:
+  virtual void SetUp() {
+    CFLTest::init(::testing::get<0>(this->GetParam()));
+    fun_420 = ::testing::get<1>(this->GetParam())(this->tx_size);
+    fun_422 = ::testing::get<2>(this->GetParam())(this->tx_size);
+    fun_444 = ::testing::get<3>(this->GetParam())(this->tx_size);
+  }
+
+ protected:
+  T fun_420;
+  T fun_422;
+  T fun_444;
+  T fun_420_ref;
+  T fun_422_ref;
+  T fun_444_ref;
+
+  void subsampleTest(T fun, T fun_ref, int sub_width, int sub_height,
+                     I (ACMRandom::*random)()) {
+    uint16_t sub_luma_pels[CFL_BUF_SQUARE];
+    uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE];
+
+    for (int it = 0; it < NUM_ITERATIONS; it++) {
+      CFLTestWithData<I>::randData(random);
+      fun(this->data, CFL_BUF_LINE, sub_luma_pels);
+      fun_ref(this->data_ref, CFL_BUF_LINE, sub_luma_pels_ref);
+      assert_eq<uint16_t>(sub_luma_pels, sub_luma_pels_ref, sub_width,
+                          sub_height);
+    }
+  }
+
+  void subsampleSpeedTest(T fun, T fun_ref, I (ACMRandom::*random)()) {
+    uint16_t sub_luma_pels[CFL_BUF_SQUARE];
+    uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE];
+    aom_usec_timer ref_timer;
+    aom_usec_timer timer;
+
+    CFLTestWithData<I>::randData(random);
+    aom_usec_timer_start(&ref_timer);
+    for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+      fun_ref(this->data_ref, CFL_BUF_LINE, sub_luma_pels);
+    }
+    aom_usec_timer_mark(&ref_timer);
+    int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+    aom_usec_timer_start(&timer);
+    for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+      fun(this->data, CFL_BUF_LINE, sub_luma_pels_ref);
+    }
+    aom_usec_timer_mark(&timer);
+    int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+    printSpeed(ref_elapsed_time, elapsed_time, this->width, this->height);
+    assertFaster(ref_elapsed_time, elapsed_time);
+  }
+};
+
+typedef cfl_subsample_lbd_fn (*get_subsample_lbd_fn)(TX_SIZE tx_size);
+typedef ::testing::tuple<TX_SIZE, get_subsample_lbd_fn, get_subsample_lbd_fn,
+                         get_subsample_lbd_fn>
+    subsample_lbd_param;
+class CFLSubsampleLBDTest
+    : public CFLSubsampleTest<subsample_lbd_param, cfl_subsample_lbd_fn,
+                              uint8_t> {
+ public:
+  virtual ~CFLSubsampleLBDTest() {}
+  virtual void SetUp() {
+    CFLSubsampleTest::SetUp();
+    fun_420_ref = cfl_get_luma_subsampling_420_lbd_c(tx_size);
+    fun_422_ref = cfl_get_luma_subsampling_422_lbd_c(tx_size);
+    fun_444_ref = cfl_get_luma_subsampling_444_lbd_c(tx_size);
+  }
+};
+
+TEST_P(CFLSubsampleLBDTest, SubsampleLBD420Test) {
+  subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1,
+                &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD420SpeedTest) {
+  subsampleSpeedTest(fun_420, fun_420_ref, &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, SubsampleLBD422Test) {
+  subsampleTest(fun_422, fun_422_ref, width >> 1, height, &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD422SpeedTest) {
+  subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, SubsampleLBD444Test) {
+  subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD444SpeedTest) {
+  subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand8);
+}
+
+typedef cfl_subsample_hbd_fn (*get_subsample_hbd_fn)(TX_SIZE tx_size);
+typedef ::testing::tuple<TX_SIZE, get_subsample_hbd_fn, get_subsample_hbd_fn,
+                         get_subsample_hbd_fn>
+    subsample_hbd_param;
+class CFLSubsampleHBDTest
+    : public CFLSubsampleTest<subsample_hbd_param, cfl_subsample_hbd_fn,
+                              uint16_t> {
+ public:
+  virtual ~CFLSubsampleHBDTest() {}
+  virtual void SetUp() {
+    CFLSubsampleTest::SetUp();
+    fun_420_ref = cfl_get_luma_subsampling_420_hbd_c(tx_size);
+    fun_422_ref = cfl_get_luma_subsampling_422_hbd_c(tx_size);
+    fun_444_ref = cfl_get_luma_subsampling_444_hbd_c(tx_size);
+  }
+};
+
+TEST_P(CFLSubsampleHBDTest, SubsampleHBD420Test) {
+  subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1,
+                &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD420SpeedTest) {
+  subsampleSpeedTest(fun_420, fun_420_ref, &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, SubsampleHBD422Test) {
+  subsampleTest(fun_422, fun_422_ref, width >> 1, height, &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD422SpeedTest) {
+  subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, SubsampleHBD444Test) {
+  subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD444SpeedTest) {
+  subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand12);
+}
+
+typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size);
+typedef ::testing::tuple<TX_SIZE, get_predict_fn> predict_param;
+class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
+                       public CFLTestWithAlignedData<uint8_t> {
+ public:
+  virtual void SetUp() {
+    CFLTest::init(::testing::get<0>(this->GetParam()));
+    predict = ::testing::get<1>(this->GetParam())(tx_size);
+    predict_ref = get_predict_lbd_fn_c(tx_size);
+  }
+  virtual ~CFLPredictTest() {}
+
+ protected:
+  cfl_predict_lbd_fn predict;
+  cfl_predict_lbd_fn predict_ref;
+};
+
+TEST_P(CFLPredictTest, PredictTest) {
+  for (int it = 0; it < NUM_ITERATIONS; it++) {
+    randData(8);
+    predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3);
+    predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3);
+    assert_eq<uint8_t>(chroma_pels, chroma_pels_ref, width, height);
+  }
+}
+TEST_P(CFLPredictTest, DISABLED_PredictSpeedTest) {
+  aom_usec_timer ref_timer;
+  aom_usec_timer timer;
+  randData(8);
+  aom_usec_timer_start(&ref_timer);
+  for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+    predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer_start(&timer);
+  for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+    predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3);
+  }
+  aom_usec_timer_mark(&timer);
+  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+  printSpeed(ref_elapsed_time, elapsed_time, width, height);
+  assertFaster(ref_elapsed_time, elapsed_time);
+}
+
+typedef cfl_predict_hbd_fn (*get_predict_fn_hbd)(TX_SIZE tx_size);
+typedef ::testing::tuple<TX_SIZE, get_predict_fn_hbd> predict_param_hbd;
+class CFLPredictHBDTest : public ::testing::TestWithParam<predict_param_hbd>,
+                          public CFLTestWithAlignedData<uint16_t> {
+ public:
+  virtual void SetUp() {
+    CFLTest::init(::testing::get<0>(this->GetParam()));
+    predict = ::testing::get<1>(this->GetParam())(tx_size);
+    predict_ref = get_predict_hbd_fn_c(tx_size);
+  }
+  virtual ~CFLPredictHBDTest() {}
+
+ protected:
+  cfl_predict_hbd_fn predict;
+  cfl_predict_hbd_fn predict_ref;
+};
+
+TEST_P(CFLPredictHBDTest, PredictHBDTest) {
+  int bd = 12;
+  for (int it = 0; it < NUM_ITERATIONS; it++) {
+    randData(bd);
+    predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3, bd);
+    predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3, bd);
+    assert_eq<uint16_t>(chroma_pels, chroma_pels_ref, width, height);
+  }
+}
+TEST_P(CFLPredictHBDTest, DISABLED_PredictHBDSpeedTest) {
+  aom_usec_timer ref_timer;
+  aom_usec_timer timer;
+  const int bd = 12;
+  randData(bd);
+  aom_usec_timer_start(&ref_timer);
+  for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+    predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3, bd);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer_start(&timer);
+  for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+    predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3, bd);
+  }
+  aom_usec_timer_mark(&timer);
+  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+  printSpeed(ref_elapsed_time, elapsed_time, width, height);
+  assertFaster(ref_elapsed_time, elapsed_time);
+}
+
+#if HAVE_SSE2
+const sub_avg_param sub_avg_sizes_sse2[] = { ALL_CFL_TX_SIZES(
+    get_subtract_average_fn_sse2) };
+
+INSTANTIATE_TEST_CASE_P(SSE2, CFLSubAvgTest,
+                        ::testing::ValuesIn(sub_avg_sizes_sse2));
+
+#endif
+
+#if HAVE_SSSE3
+const subsample_lbd_param subsample_lbd_sizes_ssse3[] = {
+  ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_ssse3,
+                             cfl_get_luma_subsampling_422_lbd_ssse3,
+                             cfl_get_luma_subsampling_444_lbd_ssse3)
+};
+
+const subsample_hbd_param subsample_hbd_sizes_ssse3[] = {
+  ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_ssse3,
+                             cfl_get_luma_subsampling_422_hbd_ssse3,
+                             cfl_get_luma_subsampling_444_hbd_ssse3)
+};
+
+const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
+    get_predict_lbd_fn_ssse3) };
+
+const predict_param_hbd predict_sizes_hbd_ssse3[] = { ALL_CFL_TX_SIZES(
+    get_predict_hbd_fn_ssse3) };
+
+INSTANTIATE_TEST_CASE_P(SSSE3, CFLSubsampleLBDTest,
+                        ::testing::ValuesIn(subsample_lbd_sizes_ssse3));
+
+INSTANTIATE_TEST_CASE_P(SSSE3, CFLSubsampleHBDTest,
+                        ::testing::ValuesIn(subsample_hbd_sizes_ssse3));
+
+INSTANTIATE_TEST_CASE_P(SSSE3, CFLPredictTest,
+                        ::testing::ValuesIn(predict_sizes_ssse3));
+
+INSTANTIATE_TEST_CASE_P(SSSE3, CFLPredictHBDTest,
+                        ::testing::ValuesIn(predict_sizes_hbd_ssse3));
+#endif
+
+#if HAVE_AVX2
+const sub_avg_param sub_avg_sizes_avx2[] = { ALL_CFL_TX_SIZES(
+    get_subtract_average_fn_avx2) };
+
+const subsample_lbd_param subsample_lbd_sizes_avx2[] = {
+  ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_avx2,
+                             cfl_get_luma_subsampling_422_lbd_avx2,
+                             cfl_get_luma_subsampling_444_lbd_avx2)
+};
+
+const subsample_hbd_param subsample_hbd_sizes_avx2[] = {
+  ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_avx2,
+                             cfl_get_luma_subsampling_422_hbd_avx2,
+                             cfl_get_luma_subsampling_444_hbd_avx2)
+};
+
+const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
+    get_predict_lbd_fn_avx2) };
+
+const predict_param_hbd predict_sizes_hbd_avx2[] = { ALL_CFL_TX_SIZES(
+    get_predict_hbd_fn_avx2) };
+
+INSTANTIATE_TEST_CASE_P(AVX2, CFLSubAvgTest,
+                        ::testing::ValuesIn(sub_avg_sizes_avx2));
+
+INSTANTIATE_TEST_CASE_P(AVX2, CFLSubsampleLBDTest,
+                        ::testing::ValuesIn(subsample_lbd_sizes_avx2));
+
+INSTANTIATE_TEST_CASE_P(AVX2, CFLSubsampleHBDTest,
+                        ::testing::ValuesIn(subsample_hbd_sizes_avx2));
+
+INSTANTIATE_TEST_CASE_P(AVX2, CFLPredictTest,
+                        ::testing::ValuesIn(predict_sizes_avx2));
+
+INSTANTIATE_TEST_CASE_P(AVX2, CFLPredictHBDTest,
+                        ::testing::ValuesIn(predict_sizes_hbd_avx2));
+#endif
+
+#if HAVE_NEON
+
+const sub_avg_param sub_avg_sizes_neon[] = { ALL_CFL_TX_SIZES(
+    get_subtract_average_fn_neon) };
+
+const subsample_lbd_param subsample_lbd_sizes_neon[] = {
+  ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_neon,
+                             cfl_get_luma_subsampling_422_lbd_neon,
+                             cfl_get_luma_subsampling_444_lbd_neon)
+};
+
+const subsample_hbd_param subsample_hbd_sizes_neon[] = {
+  ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_neon,
+                             cfl_get_luma_subsampling_422_hbd_neon,
+                             cfl_get_luma_subsampling_444_hbd_neon)
+};
+
+const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES(
+    get_predict_lbd_fn_neon) };
+
+const predict_param_hbd predict_sizes_hbd_neon[] = { ALL_CFL_TX_SIZES(
+    get_predict_hbd_fn_neon) };
+
+INSTANTIATE_TEST_CASE_P(NEON, CFLSubAvgTest,
+                        ::testing::ValuesIn(sub_avg_sizes_neon));
+
+INSTANTIATE_TEST_CASE_P(NEON, CFLSubsampleLBDTest,
+                        ::testing::ValuesIn(subsample_lbd_sizes_neon));
+
+INSTANTIATE_TEST_CASE_P(NEON, CFLSubsampleHBDTest,
+                        ::testing::ValuesIn(subsample_hbd_sizes_neon));
+
+INSTANTIATE_TEST_CASE_P(NEON, CFLPredictTest,
+                        ::testing::ValuesIn(predict_sizes_neon));
+
+INSTANTIATE_TEST_CASE_P(NEON, CFLPredictHBDTest,
+                        ::testing::ValuesIn(predict_sizes_hbd_neon));
+#endif
+
+#if HAVE_VSX
+const sub_avg_param sub_avg_sizes_vsx[] = { ALL_CFL_TX_SIZES(
+    get_subtract_average_fn_vsx) };
+
+INSTANTIATE_TEST_CASE_P(VSX, CFLSubAvgTest,
+                        ::testing::ValuesIn(sub_avg_sizes_vsx));
+#endif
+}  // namespace
diff --git a/third_party/aom/test/clear_system_state.h b/third_party/aom/test/clear_system_state.h
index 4f3c1eed0..7aa78243b 100644
--- a/third_party/aom/test/clear_system_state.h
+++ b/third_party/aom/test/clear_system_state.h
@@ -11,7 +11,8 @@
 #ifndef TEST_CLEAR_SYSTEM_STATE_H_
 #define TEST_CLEAR_SYSTEM_STATE_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #if ARCH_X86 || ARCH_X86_64
 #include "aom_ports/x86.h"
 #endif
diff --git a/third_party/aom/test/clpf_test.cc b/third_party/aom/test/clpf_test.cc
deleted file mode 100644
index ecb042876..000000000
--- a/third_party/aom/test/clpf_test.cc
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <cstdlib>
-#include <string>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
-#include "aom_ports/aom_timer.h"
-#include "av1/common/cdef_block.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-
-typedef void (*clpf_block_t)(uint8_t *dst, const uint16_t *src, int dstride,
-                             int sstride, int sizex, int sizey,
-                             unsigned int strength, unsigned int bitdepth);
-
-typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
-    clpf_block_param_t;
-
-class CDEFClpfBlockTest : public ::testing::TestWithParam<clpf_block_param_t> {
- public:
-  virtual ~CDEFClpfBlockTest() {}
-  virtual void SetUp() {
-    clpf = GET_PARAM(0);
-    ref_clpf = GET_PARAM(1);
-    sizex = GET_PARAM(2);
-    sizey = GET_PARAM(3);
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  int sizex;
-  int sizey;
-  clpf_block_t clpf;
-  clpf_block_t ref_clpf;
-};
-
-typedef CDEFClpfBlockTest CDEFClpfSpeedTest;
-
-#if CONFIG_HIGHBITDEPTH
-typedef void (*clpf_block_hbd_t)(uint16_t *dst, const uint16_t *src,
-                                 int dstride, int sstride, int sizex, int sizey,
-                                 unsigned int strength, unsigned int bitdepth);
-
-typedef std::tr1::tuple<clpf_block_hbd_t, clpf_block_hbd_t, int, int>
-    clpf_block_hbd_param_t;
-
-class CDEFClpfBlockHbdTest
-    : public ::testing::TestWithParam<clpf_block_hbd_param_t> {
- public:
-  virtual ~CDEFClpfBlockHbdTest() {}
-  virtual void SetUp() {
-    clpf = GET_PARAM(0);
-    ref_clpf = GET_PARAM(1);
-    sizex = GET_PARAM(2);
-    sizey = GET_PARAM(3);
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  int sizex;
-  int sizey;
-  clpf_block_hbd_t clpf;
-  clpf_block_hbd_t ref_clpf;
-};
-
-typedef CDEFClpfBlockHbdTest ClpfHbdSpeedTest;
-#endif
-
-template <typename pixel>
-void test_clpf(int w, int h, unsigned int depth, unsigned int iterations,
-               void (*clpf)(pixel *dst, const uint16_t *src, int dstride,
-                            int sstride, int sizex, int sizey,
-                            unsigned int strength, unsigned int bitdepth),
-               void (*ref_clpf)(pixel *dst, const uint16_t *src, int dstride,
-                                int sstride, int sizex, int sizey,
-                                unsigned int strength, unsigned int bitdepth)) {
-  const int size = 24;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, s[size * size]);
-  DECLARE_ALIGNED(16, pixel, d[size * size]);
-  DECLARE_ALIGNED(16, pixel, ref_d[size * size]);
-  memset(ref_d, 0, size * size * sizeof(*ref_d));
-  memset(d, 0, size * size * sizeof(*d));
-
-  int error = 0, pos = 0, xpos = 8, ypos = 8;
-  unsigned int strength = 0, bits, level, count, damp = 0, boundary = 0;
-
-  assert(size >= w + 16 && size >= h + 16);
-  assert(depth >= 8);
-
-  // Test every combination of:
-  // * Input with up to <depth> bits of noise
-  // * Noise level around every value from 0 to (1<<depth)-1
-  // * All strengths
-  // * All dampings
-  // * Boundaries
-  // If clpf and ref_clpf are the same, we're just testing speed
-  for (boundary = 0; boundary < 16; boundary++) {
-    for (count = 0; count < iterations; count++) {
-      for (level = 0; level < (1U << depth) && !error;
-           level += (1 + 4 * !!boundary) << (depth - 8)) {
-        for (bits = 1; bits <= depth && !error; bits++) {
-          for (damp = 4 + depth - 8; damp < depth - 1 && !error; damp++) {
-            for (int i = 0; i < size * size; i++)
-              s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
-                           (1 << depth) - 1);
-            if (boundary) {
-              if (boundary & 1) {  // Left
-                for (int i = 0; i < size; i++)
-                  for (int j = 0; j < xpos; j++)
-                    s[i * size + j] = CDEF_VERY_LARGE;
-              }
-              if (boundary & 2) {  // Right
-                for (int i = 0; i < size; i++)
-                  for (int j = xpos + w; j < size; j++)
-                    s[i * size + j] = CDEF_VERY_LARGE;
-              }
-              if (boundary & 4) {  // Above
-                for (int i = 0; i < ypos; i++)
-                  for (int j = 0; j < size; j++)
-                    s[i * size + j] = CDEF_VERY_LARGE;
-              }
-              if (boundary & 8) {  // Below
-                for (int i = ypos + h; i < size; i++)
-                  for (int j = 0; j < size; j++)
-                    s[i * size + j] = CDEF_VERY_LARGE;
-              }
-            }
-            for (strength = depth - 8; strength < depth - 5 && !error;
-                 strength += !error) {
-              ref_clpf(ref_d + ypos * size + xpos, s + ypos * size + xpos, size,
-                       size, w, h, 1 << strength, damp);
-              if (clpf != ref_clpf)
-                ASM_REGISTER_STATE_CHECK(clpf(d + ypos * size + xpos,
-                                              s + ypos * size + xpos, size,
-                                              size, w, h, 1 << strength, damp));
-              if (ref_clpf != clpf) {
-                for (pos = 0; pos < size * size && !error; pos++) {
-                  error = ref_d[pos] != d[pos];
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  pos--;
-  EXPECT_EQ(0, error)
-      << "Error: CDEFClpfBlockTest, SIMD and C mismatch." << std::endl
-      << "First error at " << pos % size << "," << pos / size << " ("
-      << (int16_t)ref_d[pos] << " != " << (int16_t)d[pos] << ") " << std::endl
-      << "strength: " << (1 << strength) << std::endl
-      << "damping: " << damp << std::endl
-      << "depth: " << depth << std::endl
-      << "boundary: " << boundary << std::endl
-      << "w: " << w << std::endl
-      << "h: " << h << std::endl
-      << "A=" << (pos > 2 * size ? (int16_t)s[pos - 2 * size] : -1) << std::endl
-      << "B=" << (pos > size ? (int16_t)s[pos - size] : -1) << std::endl
-      << "C=" << (pos % size - 2 >= 0 ? (int16_t)s[pos - 2] : -1) << std::endl
-      << "D=" << (pos % size - 1 >= 0 ? (int16_t)s[pos - 1] : -1) << std::endl
-      << "X=" << (int16_t)s[pos] << std::endl
-      << "E=" << (pos % size + 1 < size ? (int16_t)s[pos + 1] : -1) << std::endl
-      << "F=" << (pos % size + 2 < size ? (int16_t)s[pos + 2] : -1) << std::endl
-      << "G=" << (pos + size < size * size ? (int16_t)s[pos + size] : -1)
-      << std::endl
-      << "H="
-      << (pos + 2 * size < size * size ? (int16_t)s[pos + 2 * size] : -1)
-      << std::endl;
-}
-
-template <typename pixel>
-void test_clpf_speed(int w, int h, unsigned int depth, unsigned int iterations,
-                     void (*clpf)(pixel *dst, const uint16_t *src, int dstride,
-                                  int sstride, int sizex, int sizey,
-                                  unsigned int strength, unsigned int bitdepth),
-                     void (*ref_clpf)(pixel *dst, const uint16_t *src,
-                                      int dstride, int sstride, int sizex,
-                                      int sizey, unsigned int strength,
-                                      unsigned int bitdepth)) {
-  aom_usec_timer ref_timer;
-  aom_usec_timer timer;
-
-  aom_usec_timer_start(&ref_timer);
-  test_clpf(w, h, depth, iterations, ref_clpf, ref_clpf);
-  aom_usec_timer_mark(&ref_timer);
-  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
-
-  aom_usec_timer_start(&timer);
-  test_clpf(w, h, depth, iterations, clpf, clpf);
-  aom_usec_timer_mark(&timer);
-  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
-
-#if 0
-  std::cout << "[          ] C time = " << ref_elapsed_time / 1000
-            << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
-#endif
-
-  EXPECT_GT(ref_elapsed_time, elapsed_time)
-      << "Error: CDEFClpfSpeedTest, SIMD slower than C." << std::endl
-      << "C time: " << ref_elapsed_time << " us" << std::endl
-      << "SIMD time: " << elapsed_time << " us" << std::endl;
-}
-
-TEST_P(CDEFClpfBlockTest, TestSIMDNoMismatch) {
-  test_clpf(sizex, sizey, 8, 1, clpf, ref_clpf);
-}
-
-TEST_P(CDEFClpfSpeedTest, DISABLED_TestSpeed) {
-  test_clpf_speed(sizex, sizey, 8, 16, clpf, ref_clpf);
-}
-
-#if CONFIG_HIGHBITDEPTH
-TEST_P(CDEFClpfBlockHbdTest, TestSIMDNoMismatch) {
-  test_clpf(sizex, sizey, 12, 1, clpf, ref_clpf);
-}
-
-TEST_P(ClpfHbdSpeedTest, DISABLED_TestSpeed) {
-  test_clpf_speed(sizex, sizey, 12, 4, clpf, ref_clpf);
-}
-#endif
-
-using std::tr1::make_tuple;
-
-// VS compiling for 32 bit targets does not support vector types in
-// structs as arguments, which makes the v256 type of the intrinsics
-// hard to support, so optimizations for this target are disabled.
-#if defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
-// Test all supported architectures and block sizes
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, CDEFClpfBlockTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_sse2, &aom_clpf_block_c, 8, 8),
-        make_tuple(&aom_clpf_block_sse2, &aom_clpf_block_c, 8, 4),
-        make_tuple(&aom_clpf_block_sse2, &aom_clpf_block_c, 4, 8),
-        make_tuple(&aom_clpf_block_sse2, &aom_clpf_block_c, 4, 4),
-        make_tuple(&aom_clpf_hblock_sse2, &aom_clpf_hblock_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_sse2, &aom_clpf_hblock_c, 8, 4),
-        make_tuple(&aom_clpf_hblock_sse2, &aom_clpf_hblock_c, 4, 8),
-        make_tuple(&aom_clpf_hblock_sse2, &aom_clpf_hblock_c, 4, 4)));
-#endif
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, CDEFClpfBlockTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_ssse3, &aom_clpf_block_c, 8, 8),
-        make_tuple(&aom_clpf_block_ssse3, &aom_clpf_block_c, 8, 4),
-        make_tuple(&aom_clpf_block_ssse3, &aom_clpf_block_c, 4, 8),
-        make_tuple(&aom_clpf_block_ssse3, &aom_clpf_block_c, 4, 4),
-        make_tuple(&aom_clpf_hblock_ssse3, &aom_clpf_hblock_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_ssse3, &aom_clpf_hblock_c, 8, 4),
-        make_tuple(&aom_clpf_hblock_ssse3, &aom_clpf_hblock_c, 4, 8),
-        make_tuple(&aom_clpf_hblock_ssse3, &aom_clpf_hblock_c, 4, 4)));
-#endif
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, CDEFClpfBlockTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_sse4_1, &aom_clpf_block_c, 8, 8),
-        make_tuple(&aom_clpf_block_sse4_1, &aom_clpf_block_c, 8, 4),
-        make_tuple(&aom_clpf_block_sse4_1, &aom_clpf_block_c, 4, 8),
-        make_tuple(&aom_clpf_block_sse4_1, &aom_clpf_block_c, 4, 4),
-        make_tuple(&aom_clpf_hblock_sse4_1, &aom_clpf_hblock_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_sse4_1, &aom_clpf_hblock_c, 8, 4),
-        make_tuple(&aom_clpf_hblock_sse4_1, &aom_clpf_hblock_c, 4, 8),
-        make_tuple(&aom_clpf_hblock_sse4_1, &aom_clpf_hblock_c, 4, 4)));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
-    NEON, CDEFClpfBlockTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_neon, &aom_clpf_block_c, 8, 8),
-        make_tuple(&aom_clpf_block_neon, &aom_clpf_block_c, 8, 4),
-        make_tuple(&aom_clpf_block_neon, &aom_clpf_block_c, 4, 8),
-        make_tuple(&aom_clpf_block_neon, &aom_clpf_block_c, 4, 4),
-        make_tuple(&aom_clpf_hblock_neon, &aom_clpf_hblock_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_neon, &aom_clpf_hblock_c, 8, 4),
-        make_tuple(&aom_clpf_hblock_neon, &aom_clpf_hblock_c, 4, 8),
-        make_tuple(&aom_clpf_hblock_neon, &aom_clpf_hblock_c, 4, 4)));
-#endif
-
-#if CONFIG_HIGHBITDEPTH
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, CDEFClpfBlockHbdTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 8, 4),
-        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 4, 8),
-        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 4, 4),
-        make_tuple(&aom_clpf_hblock_hbd_sse2, &aom_clpf_hblock_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_hbd_sse2, &aom_clpf_hblock_hbd_c, 8, 4),
-        make_tuple(&aom_clpf_hblock_hbd_sse2, &aom_clpf_hblock_hbd_c, 4, 8),
-        make_tuple(&aom_clpf_hblock_hbd_sse2, &aom_clpf_hblock_hbd_c, 4, 4)));
-#endif
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, CDEFClpfBlockHbdTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 8, 4),
-        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 4, 8),
-        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 4, 4),
-        make_tuple(&aom_clpf_hblock_hbd_ssse3, &aom_clpf_hblock_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_hbd_ssse3, &aom_clpf_hblock_hbd_c, 8, 4),
-        make_tuple(&aom_clpf_hblock_hbd_ssse3, &aom_clpf_hblock_hbd_c, 4, 8),
-        make_tuple(&aom_clpf_hblock_hbd_ssse3, &aom_clpf_hblock_hbd_c, 4, 4)));
-#endif
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, CDEFClpfBlockHbdTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 8, 4),
-        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 4, 8),
-        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 4, 4),
-        make_tuple(&aom_clpf_hblock_hbd_sse4_1, &aom_clpf_hblock_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_hbd_sse4_1, &aom_clpf_hblock_hbd_c, 8, 4),
-        make_tuple(&aom_clpf_hblock_hbd_sse4_1, &aom_clpf_hblock_hbd_c, 4, 8),
-        make_tuple(&aom_clpf_hblock_hbd_sse4_1, &aom_clpf_hblock_hbd_c, 4, 4)));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
-    NEON, CDEFClpfBlockHbdTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 8, 4),
-        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 4, 8),
-        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 4, 4),
-        make_tuple(&aom_clpf_hblock_hbd_neon, &aom_clpf_hblock_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_hbd_neon, &aom_clpf_hblock_hbd_c, 8, 4),
-        make_tuple(&aom_clpf_hblock_hbd_neon, &aom_clpf_hblock_hbd_c, 4, 8),
-        make_tuple(&aom_clpf_hblock_hbd_neon, &aom_clpf_hblock_hbd_c, 4, 4)));
-#endif
-#endif  // CONFIG_HIGHBITDEPTH
-
-// Test speed for all supported architectures
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, CDEFClpfSpeedTest,
-    ::testing::Values(make_tuple(&aom_clpf_block_sse2, &aom_clpf_block_c, 8, 8),
-                      make_tuple(&aom_clpf_hblock_sse2, &aom_clpf_hblock_c, 8,
-                                 8)));
-#endif
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, CDEFClpfSpeedTest,
-                        ::testing::Values(make_tuple(&aom_clpf_block_ssse3,
-                                                     &aom_clpf_block_c, 8, 8),
-                                          make_tuple(&aom_clpf_hblock_ssse3,
-                                                     &aom_clpf_hblock_c, 8,
-                                                     8)));
-#endif
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFClpfSpeedTest,
-                        ::testing::Values(make_tuple(&aom_clpf_block_sse4_1,
-                                                     &aom_clpf_block_c, 8, 8),
-                                          make_tuple(&aom_clpf_hblock_sse4_1,
-                                                     &aom_clpf_hblock_c, 8,
-                                                     8)));
-
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
-    NEON, CDEFClpfSpeedTest,
-    ::testing::Values(make_tuple(&aom_clpf_block_neon, &aom_clpf_block_c, 8, 8),
-                      make_tuple(&aom_clpf_hblock_neon, &aom_clpf_hblock_c, 8,
-                                 8)));
-#endif
-
-#if CONFIG_HIGHBITDEPTH
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, ClpfHbdSpeedTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_hbd_sse2, &aom_clpf_hblock_hbd_c, 8, 8)));
-#endif
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, ClpfHbdSpeedTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_hbd_ssse3, &aom_clpf_hblock_hbd_c, 8, 8)));
-#endif
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, ClpfHbdSpeedTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_hbd_sse4_1, &aom_clpf_hblock_hbd_c, 8, 8)));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
-    NEON, ClpfHbdSpeedTest,
-    ::testing::Values(
-        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 8, 8),
-        make_tuple(&aom_clpf_hblock_hbd_neon, &aom_clpf_hblock_hbd_c, 8, 8)));
-#endif
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // defined(_WIN64) || !defined(_MSC_VER)
-
-}  // namespace
diff --git a/third_party/aom/test/codec_factory.h b/third_party/aom/test/codec_factory.h
index d2f20b832..65b76094c 100644
--- a/third_party/aom/test/codec_factory.h
+++ b/third_party/aom/test/codec_factory.h
@@ -11,7 +11,8 @@
 #ifndef TEST_CODEC_FACTORY_H_
 #define TEST_CODEC_FACTORY_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_decoder.h"
 #include "aom/aom_encoder.h"
 #if CONFIG_AV1_ENCODER
@@ -39,7 +40,6 @@ class CodecFactory {
                                  const aom_codec_flags_t flags) const = 0;
 
   virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
                                  const unsigned long init_flags,
                                  TwopassStatsStore *stats) const = 0;
 
@@ -54,22 +54,26 @@ class CodecFactory {
 template <class T1>
 class CodecTestWithParam
     : public ::testing::TestWithParam<
-          std::tr1::tuple<const libaom_test::CodecFactory *, T1> > {};
+          ::testing::tuple<const libaom_test::CodecFactory *, T1> > {};
 
 template <class T1, class T2>
 class CodecTestWith2Params
     : public ::testing::TestWithParam<
-          std::tr1::tuple<const libaom_test::CodecFactory *, T1, T2> > {};
+          ::testing::tuple<const libaom_test::CodecFactory *, T1, T2> > {};
 
 template <class T1, class T2, class T3>
 class CodecTestWith3Params
     : public ::testing::TestWithParam<
-          std::tr1::tuple<const libaom_test::CodecFactory *, T1, T2, T3> > {};
+          ::testing::tuple<const libaom_test::CodecFactory *, T1, T2, T3> > {};
+
+template <class T1, class T2, class T3, class T4>
+class CodecTestWith4Params
+    : public ::testing::TestWithParam< ::testing::tuple<
+          const libaom_test::CodecFactory *, T1, T2, T3, T4> > {};
 
 /*
  * AV1 Codec Definitions
  */
-#if CONFIG_AV1
 class AV1Decoder : public Decoder {
  public:
   explicit AV1Decoder(aom_codec_dec_cfg_t cfg) : Decoder(cfg) {}
@@ -89,9 +93,9 @@ class AV1Decoder : public Decoder {
 
 class AV1Encoder : public Encoder {
  public:
-  AV1Encoder(aom_codec_enc_cfg_t cfg, unsigned long deadline,
-             const unsigned long init_flags, TwopassStatsStore *stats)
-      : Encoder(cfg, deadline, init_flags, stats) {}
+  AV1Encoder(aom_codec_enc_cfg_t cfg, const uint32_t init_flags,
+             TwopassStatsStore *stats)
+      : Encoder(cfg, init_flags, stats) {}
 
  protected:
   virtual aom_codec_iface_t *CodecInterface() const {
@@ -123,14 +127,12 @@ class AV1CodecFactory : public CodecFactory {
   }
 
   virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
                                  const unsigned long init_flags,
                                  TwopassStatsStore *stats) const {
 #if CONFIG_AV1_ENCODER
-    return new AV1Encoder(cfg, deadline, init_flags, stats);
+    return new AV1Encoder(cfg, init_flags, stats);
 #else
     (void)cfg;
-    (void)deadline;
     (void)init_flags;
     (void)stats;
     return NULL;
@@ -158,9 +160,6 @@ const libaom_test::AV1CodecFactory kAV1;
           ::testing::Values(static_cast<const libaom_test::CodecFactory *>( \
               &libaom_test::kAV1)),                                         \
           __VA_ARGS__))
-#else
-#define AV1_INSTANTIATE_TEST_CASE(test, ...)
-#endif  // CONFIG_AV1
 
 }  // namespace libaom_test
 #endif  // TEST_CODEC_FACTORY_H_
diff --git a/third_party/aom/test/coding_path_sync.cc b/third_party/aom/test/coding_path_sync.cc
index 5b6409d03..51a506004 100644
--- a/third_party/aom/test/coding_path_sync.cc
+++ b/third_party/aom/test/coding_path_sync.cc
@@ -13,7 +13,7 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "aom_ports/mem.h"  // ROUND_POWER_OF_TWO
 #include "aom/aomcx.h"
@@ -32,11 +32,17 @@ class CompressedSource {
     aom_codec_enc_cfg_t cfg;
     aom_codec_enc_config_default(algo, &cfg, 0);
 
-    const int max_q = cfg.rc_max_quantizer;
+    // force the quantizer, to reduce the sensitivity on encoding choices.
+    // e.g, we don't want this test to break when the rate control is modified.
+    {
+      const int max_q = cfg.rc_max_quantizer;
+      const int min_q = cfg.rc_min_quantizer;
+      const int q = rnd_.PseudoUniform(max_q - min_q + 1) + min_q;
 
-    cfg.rc_end_usage = AOM_CQ;
-    cfg.rc_max_quantizer = max_q;
-    cfg.rc_min_quantizer = max_q;
+      cfg.rc_end_usage = AOM_Q;
+      cfg.rc_max_quantizer = q;
+      cfg.rc_min_quantizer = q;
+    }
 
     // choose the picture size
     {
@@ -44,9 +50,26 @@ class CompressedSource {
       height_ = rnd_.PseudoUniform(kHeight - 8) + 8;
     }
 
+    // choose the chroma subsampling
+    {
+      const aom_img_fmt_t fmts[] = {
+        AOM_IMG_FMT_I420,
+        AOM_IMG_FMT_I422,
+        AOM_IMG_FMT_I444,
+      };
+
+      format_ = fmts[rnd_.PseudoUniform(NELEMENTS(fmts))];
+    }
+
     cfg.g_w = width_;
     cfg.g_h = height_;
     cfg.g_lag_in_frames = 0;
+    if (format_ == AOM_IMG_FMT_I420)
+      cfg.g_profile = 0;
+    else if (format_ == AOM_IMG_FMT_I444)
+      cfg.g_profile = 1;
+    else if (format_ == AOM_IMG_FMT_I422)
+      cfg.g_profile = 2;
 
     aom_codec_enc_init(&enc_, algo, &cfg, 0);
   }
@@ -54,7 +77,7 @@ class CompressedSource {
   ~CompressedSource() { aom_codec_destroy(&enc_); }
 
   const aom_codec_cx_pkt_t *ReadFrame() {
-    uint8_t buf[kWidth * kHeight * 3 / 2] = { 0 };
+    uint8_t buf[kWidth * kHeight * 3] = { 0 };
 
     // render regular pattern
     const int period = rnd_.Rand8() % 32 + 1;
@@ -67,8 +90,8 @@ class CompressedSource {
       buf[i] = (i + phase) % period < period / 2 ? val_a : val_b;
 
     aom_image_t img;
-    aom_img_wrap(&img, AOM_IMG_FMT_I420, width_, height_, 0, buf);
-    aom_codec_encode(&enc_, &img, frame_count_++, 1, 0, 0);
+    aom_img_wrap(&img, format_, width_, height_, 0, buf);
+    aom_codec_encode(&enc_, &img, frame_count_++, 1, 0);
 
     aom_codec_iter_t iter = NULL;
 
@@ -86,6 +109,7 @@ class CompressedSource {
   static const int kHeight = 128;
 
   ACMRandom rnd_;
+  aom_img_fmt_t format_;
   aom_codec_ctx_t enc_;
   int frame_count_;
   int width_, height_;
@@ -128,7 +152,7 @@ class Decoder {
 
   std::vector<int16_t> decode(const aom_codec_cx_pkt_t *pkt) {
     aom_codec_decode(&dec_, static_cast<uint8_t *>(pkt->data.frame.buf),
-                     static_cast<unsigned int>(pkt->data.frame.sz), NULL, 0);
+                     pkt->data.frame.sz, NULL);
 
     aom_codec_iter_t iter = NULL;
     return Serialize(aom_codec_get_frame(&dec_, &iter));
@@ -140,18 +164,41 @@ class Decoder {
 
 // Try to reveal a mismatch between LBD and HBD coding paths.
 TEST(CodingPathSync, SearchForHbdLbdMismatch) {
-  const int count_tests = 100;
+  const int count_tests = 10;
   for (int i = 0; i < count_tests; ++i) {
     Decoder dec_hbd(0);
     Decoder dec_lbd(1);
 
     CompressedSource enc(i);
-    const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
 
-    std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
-    std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
+    for (int k = 0; k < 3; ++k) {
+      const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
+
+      std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
+      std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
 
-    ASSERT_EQ(lbd_yuv, hbd_yuv);
+      ASSERT_EQ(lbd_yuv, hbd_yuv);
+    }
+  }
+}
+
+TEST(CodingPathSyncLarge, SearchForHbdLbdMismatchLarge) {
+  const int count_tests = 100;
+  const int seed = 1234;
+  for (int i = 0; i < count_tests; ++i) {
+    Decoder dec_hbd(0);
+    Decoder dec_lbd(1);
+
+    CompressedSource enc(seed + i);
+
+    for (int k = 0; k < 5; ++k) {
+      const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
+
+      std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
+      std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
+
+      ASSERT_EQ(lbd_yuv, hbd_yuv);
+    }
   }
 }
 
diff --git a/third_party/aom/test/comp_avg_pred_test.cc b/third_party/aom/test/comp_avg_pred_test.cc
new file mode 100644
index 000000000..8bd826eb4
--- /dev/null
+++ b/third_party/aom/test/comp_avg_pred_test.cc
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/comp_avg_pred_test.h"
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+using libaom_test::AV1JNTCOMPAVG::AV1HighBDJNTCOMPAVGTest;
+using libaom_test::AV1JNTCOMPAVG::AV1HighBDJNTCOMPAVGUPSAMPLEDTest;
+using libaom_test::AV1JNTCOMPAVG::AV1JNTCOMPAVGTest;
+using libaom_test::AV1JNTCOMPAVG::AV1JNTCOMPAVGUPSAMPLEDTest;
+
+namespace {
+
+TEST_P(AV1JNTCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+TEST_P(AV1JNTCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, AV1JNTCOMPAVGTest,
+    libaom_test::AV1JNTCOMPAVG::BuildParams(aom_jnt_comp_avg_pred_ssse3));
+#endif
+
+TEST_P(AV1JNTCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0));
+}
+
+TEST_P(AV1JNTCOMPAVGUPSAMPLEDTest, CheckOutput) {
+  RunCheckOutput(GET_PARAM(0));
+}
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1JNTCOMPAVGUPSAMPLEDTest,
+                        libaom_test::AV1JNTCOMPAVG::BuildParams(
+                            aom_jnt_comp_avg_upsampled_pred_ssse3));
+#endif
+
+TEST_P(AV1HighBDJNTCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(1)); }
+
+TEST_P(AV1HighBDJNTCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, AV1HighBDJNTCOMPAVGTest,
+    libaom_test::AV1JNTCOMPAVG::BuildParams(aom_highbd_jnt_comp_avg_pred_sse2));
+#endif
+
+TEST_P(AV1HighBDJNTCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(1));
+}
+
+TEST_P(AV1HighBDJNTCOMPAVGUPSAMPLEDTest, CheckOutput) {
+  RunCheckOutput(GET_PARAM(1));
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDJNTCOMPAVGUPSAMPLEDTest,
+                        libaom_test::AV1JNTCOMPAVG::BuildParams(
+                            aom_highbd_jnt_comp_avg_upsampled_pred_sse2));
+#endif
+
+}  // namespace
diff --git a/third_party/aom/test/comp_avg_pred_test.h b/third_party/aom/test/comp_avg_pred_test.h
new file mode 100644
index 000000000..ab2004c05
--- /dev/null
+++ b/third_party/aom/test/comp_avg_pred_test.h
@@ -0,0 +1,546 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef TEST_COMP_AVG_PRED_TEST_H_
+#define TEST_COMP_AVG_PRED_TEST_H_
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "av1/common/common_data.h"
+#include "aom_ports/aom_timer.h"
+
+namespace libaom_test {
+const int kMaxSize = 128 + 32;  // padding
+
+namespace AV1JNTCOMPAVG {
+
+typedef void (*jntcompavg_func)(uint8_t *comp_pred, const uint8_t *pred,
+                                int width, int height, const uint8_t *ref,
+                                int ref_stride,
+                                const JNT_COMP_PARAMS *jcp_param);
+
+typedef void (*jntcompavgupsampled_func)(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const JNT_COMP_PARAMS *jcp_param);
+
+typedef void (*highbdjntcompavg_func)(uint16_t *comp_pred, const uint8_t *pred8,
+                                      int width, int height,
+                                      const uint8_t *ref8, int ref_stride,
+                                      const JNT_COMP_PARAMS *jcp_param);
+
+typedef void (*highbdjntcompavgupsampled_func)(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param);
+
+typedef ::testing::tuple<jntcompavg_func, BLOCK_SIZE> JNTCOMPAVGParam;
+
+typedef ::testing::tuple<jntcompavgupsampled_func, BLOCK_SIZE>
+    JNTCOMPAVGUPSAMPLEDParam;
+
+typedef ::testing::tuple<int, highbdjntcompavg_func, BLOCK_SIZE>
+    HighbdJNTCOMPAVGParam;
+
+typedef ::testing::tuple<int, highbdjntcompavgupsampled_func, BLOCK_SIZE>
+    HighbdJNTCOMPAVGUPSAMPLEDParam;
+
+::testing::internal::ParamGenerator<JNTCOMPAVGParam> BuildParams(
+    jntcompavg_func filter) {
+  return ::testing::Combine(::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+::testing::internal::ParamGenerator<JNTCOMPAVGUPSAMPLEDParam> BuildParams(
+    jntcompavgupsampled_func filter) {
+  return ::testing::Combine(::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+::testing::internal::ParamGenerator<HighbdJNTCOMPAVGParam> BuildParams(
+    highbdjntcompavg_func filter) {
+  return ::testing::Combine(::testing::Range(8, 13, 2),
+                            ::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+::testing::internal::ParamGenerator<HighbdJNTCOMPAVGUPSAMPLEDParam> BuildParams(
+    highbdjntcompavgupsampled_func filter) {
+  return ::testing::Combine(::testing::Range(8, 13, 2),
+                            ::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> {
+ public:
+  ~AV1JNTCOMPAVGTest() {}
+  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+  void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunCheckOutput(jntcompavg_func test_impl) {
+    const int w = kMaxSize, h = kMaxSize;
+    const int block_idx = GET_PARAM(1);
+
+    uint8_t pred8[kMaxSize * kMaxSize];
+    uint8_t ref8[kMaxSize * kMaxSize];
+    uint8_t output[kMaxSize * kMaxSize];
+    uint8_t output2[kMaxSize * kMaxSize];
+
+    for (int i = 0; i < h; ++i)
+      for (int j = 0; j < w; ++j) {
+        pred8[i * w + j] = rnd_.Rand8();
+        ref8[i * w + j] = rnd_.Rand8();
+      }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+
+    JNT_COMP_PARAMS jnt_comp_params;
+    jnt_comp_params.use_jnt_comp_avg = 1;
+
+    for (int ii = 0; ii < 2; ii++) {
+      for (int jj = 0; jj < 4; jj++) {
+        jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+        jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+
+        const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+        const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+        aom_jnt_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c, in_w,
+                                in_h, ref8 + offset_r * w + offset_c, in_w,
+                                &jnt_comp_params);
+        test_impl(output2, pred8 + offset_r * w + offset_c, in_w, in_h,
+                  ref8 + offset_r * w + offset_c, in_w, &jnt_comp_params);
+
+        for (int i = 0; i < in_h; ++i) {
+          for (int j = 0; j < in_w; ++j) {
+            int idx = i * in_w + j;
+            ASSERT_EQ(output[idx], output2[idx])
+                << "Mismatch at unit tests for AV1JNTCOMPAVGTest\n"
+                << in_w << "x" << in_h << " Pixel mismatch at index " << idx
+                << " = (" << i << ", " << j << ")";
+          }
+        }
+      }
+    }
+  }
+  void RunSpeedTest(jntcompavg_func test_impl) {
+    const int w = kMaxSize, h = kMaxSize;
+    const int block_idx = GET_PARAM(1);
+
+    uint8_t pred8[kMaxSize * kMaxSize];
+    uint8_t ref8[kMaxSize * kMaxSize];
+    uint8_t output[kMaxSize * kMaxSize];
+    uint8_t output2[kMaxSize * kMaxSize];
+
+    for (int i = 0; i < h; ++i)
+      for (int j = 0; j < w; ++j) {
+        pred8[i * w + j] = rnd_.Rand8();
+        ref8[i * w + j] = rnd_.Rand8();
+      }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+
+    JNT_COMP_PARAMS jnt_comp_params;
+    jnt_comp_params.use_jnt_comp_avg = 1;
+
+    jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+    jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+
+    const int num_loops = 1000000000 / (in_w + in_h);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+
+    for (int i = 0; i < num_loops; ++i)
+      aom_jnt_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w,
+                              &jnt_comp_params);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("jntcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+           1000.0 * elapsed_time / num_loops);
+
+    aom_usec_timer timer1;
+    aom_usec_timer_start(&timer1);
+
+    for (int i = 0; i < num_loops; ++i)
+      test_impl(output2, pred8, in_w, in_h, ref8, in_w, &jnt_comp_params);
+
+    aom_usec_timer_mark(&timer1);
+    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+    printf("jntcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+           1000.0 * elapsed_time1 / num_loops);
+  }
+
+  libaom_test::ACMRandom rnd_;
+};  // class AV1JNTCOMPAVGTest
+
+class AV1JNTCOMPAVGUPSAMPLEDTest
+    : public ::testing::TestWithParam<JNTCOMPAVGUPSAMPLEDParam> {
+ public:
+  ~AV1JNTCOMPAVGUPSAMPLEDTest() {}
+  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+  void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunCheckOutput(jntcompavgupsampled_func test_impl) {
+    const int w = kMaxSize, h = kMaxSize;
+    const int block_idx = GET_PARAM(1);
+
+    uint8_t pred8[kMaxSize * kMaxSize];
+    uint8_t ref8[kMaxSize * kMaxSize];
+    DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]);
+
+    for (int i = 0; i < h; ++i)
+      for (int j = 0; j < w; ++j) {
+        pred8[i * w + j] = rnd_.Rand8();
+        ref8[i * w + j] = rnd_.Rand8();
+      }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+
+    JNT_COMP_PARAMS jnt_comp_params;
+    jnt_comp_params.use_jnt_comp_avg = 1;
+    int sub_x_q3, sub_y_q3;
+    for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
+      for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
+        for (int ii = 0; ii < 2; ii++) {
+          for (int jj = 0; jj < 4; jj++) {
+            jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+            jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+
+            const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+            const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+
+            aom_jnt_comp_avg_upsampled_pred_c(
+                NULL, NULL, 0, 0, NULL, output, pred8 + offset_r * w + offset_c,
+                in_w, in_h, sub_x_q3, sub_y_q3, ref8 + offset_r * w + offset_c,
+                in_w, &jnt_comp_params);
+            test_impl(NULL, NULL, 0, 0, NULL, output2,
+                      pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
+                      sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
+                      &jnt_comp_params);
+
+            for (int i = 0; i < in_h; ++i) {
+              for (int j = 0; j < in_w; ++j) {
+                int idx = i * in_w + j;
+                ASSERT_EQ(output[idx], output2[idx])
+                    << "Mismatch at unit tests for AV1JNTCOMPAVGUPSAMPLEDTest\n"
+                    << in_w << "x" << in_h << " Pixel mismatch at index " << idx
+                    << " = (" << i << ", " << j << "), sub pixel offset = ("
+                    << sub_y_q3 << ", " << sub_x_q3 << ")";
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  void RunSpeedTest(jntcompavgupsampled_func test_impl) {
+    const int w = kMaxSize, h = kMaxSize;
+    const int block_idx = GET_PARAM(1);
+
+    uint8_t pred8[kMaxSize * kMaxSize];
+    uint8_t ref8[kMaxSize * kMaxSize];
+    DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]);
+
+    for (int i = 0; i < h; ++i)
+      for (int j = 0; j < w; ++j) {
+        pred8[i * w + j] = rnd_.Rand8();
+        ref8[i * w + j] = rnd_.Rand8();
+      }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+
+    JNT_COMP_PARAMS jnt_comp_params;
+    jnt_comp_params.use_jnt_comp_avg = 1;
+
+    jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+    jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+
+    int sub_x_q3 = 0;
+    int sub_y_q3 = 0;
+
+    const int num_loops = 1000000000 / (in_w + in_h);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+
+    for (int i = 0; i < num_loops; ++i)
+      aom_jnt_comp_avg_upsampled_pred_c(NULL, NULL, 0, 0, NULL, output, pred8,
+                                        in_w, in_h, sub_x_q3, sub_y_q3, ref8,
+                                        in_w, &jnt_comp_params);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("jntcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+           1000.0 * elapsed_time / num_loops);
+
+    aom_usec_timer timer1;
+    aom_usec_timer_start(&timer1);
+
+    for (int i = 0; i < num_loops; ++i)
+      test_impl(NULL, NULL, 0, 0, NULL, output2, pred8, in_w, in_h, sub_x_q3,
+                sub_y_q3, ref8, in_w, &jnt_comp_params);
+
+    aom_usec_timer_mark(&timer1);
+    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+    printf("jntcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+           1000.0 * elapsed_time1 / num_loops);
+  }
+
+  libaom_test::ACMRandom rnd_;
+};  // class AV1JNTCOMPAVGUPSAMPLEDTest
+
+class AV1HighBDJNTCOMPAVGTest
+    : public ::testing::TestWithParam<HighbdJNTCOMPAVGParam> {
+ public:
+  ~AV1HighBDJNTCOMPAVGTest() {}
+  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+  void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunCheckOutput(highbdjntcompavg_func test_impl) {
+    const int w = kMaxSize, h = kMaxSize;
+    const int block_idx = GET_PARAM(2);
+    const int bd = GET_PARAM(0);
+    uint16_t pred8[kMaxSize * kMaxSize];
+    uint16_t ref8[kMaxSize * kMaxSize];
+    uint16_t output[kMaxSize * kMaxSize];
+    uint16_t output2[kMaxSize * kMaxSize];
+
+    for (int i = 0; i < h; ++i)
+      for (int j = 0; j < w; ++j) {
+        pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+        ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+      }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+
+    JNT_COMP_PARAMS jnt_comp_params;
+    jnt_comp_params.use_jnt_comp_avg = 1;
+
+    for (int ii = 0; ii < 2; ii++) {
+      for (int jj = 0; jj < 4; jj++) {
+        jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+        jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+
+        const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+        const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+        aom_highbd_jnt_comp_avg_pred_c(
+            output, CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
+            in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
+            &jnt_comp_params);
+        test_impl(output2, CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c,
+                  in_w, in_h,
+                  CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
+                  &jnt_comp_params);
+
+        for (int i = 0; i < in_h; ++i) {
+          for (int j = 0; j < in_w; ++j) {
+            int idx = i * in_w + j;
+            ASSERT_EQ(output[idx], output2[idx])
+                << "Mismatch at unit tests for AV1HighBDJNTCOMPAVGTest\n"
+                << in_w << "x" << in_h << " Pixel mismatch at index " << idx
+                << " = (" << i << ", " << j << ")";
+          }
+        }
+      }
+    }
+  }
+  void RunSpeedTest(highbdjntcompavg_func test_impl) {
+    const int w = kMaxSize, h = kMaxSize;
+    const int block_idx = GET_PARAM(2);
+    const int bd = GET_PARAM(0);
+    uint16_t pred8[kMaxSize * kMaxSize];
+    uint16_t ref8[kMaxSize * kMaxSize];
+    uint16_t output[kMaxSize * kMaxSize];
+    uint16_t output2[kMaxSize * kMaxSize];
+
+    for (int i = 0; i < h; ++i)
+      for (int j = 0; j < w; ++j) {
+        pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+        ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+      }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+
+    JNT_COMP_PARAMS jnt_comp_params;
+    jnt_comp_params.use_jnt_comp_avg = 1;
+
+    jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+    jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+
+    const int num_loops = 1000000000 / (in_w + in_h);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+
+    for (int i = 0; i < num_loops; ++i)
+      aom_highbd_jnt_comp_avg_pred_c(output, CONVERT_TO_BYTEPTR(pred8), in_w,
+                                     in_h, CONVERT_TO_BYTEPTR(ref8), in_w,
+                                     &jnt_comp_params);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("highbdjntcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+           1000.0 * elapsed_time / num_loops);
+
+    aom_usec_timer timer1;
+    aom_usec_timer_start(&timer1);
+
+    for (int i = 0; i < num_loops; ++i)
+      test_impl(output2, CONVERT_TO_BYTEPTR(pred8), in_w, in_h,
+                CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params);
+
+    aom_usec_timer_mark(&timer1);
+    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+    printf("highbdjntcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+           1000.0 * elapsed_time1 / num_loops);
+  }
+
+  libaom_test::ACMRandom rnd_;
+};  // class AV1HighBDJNTCOMPAVGTest
+
+class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
+    : public ::testing::TestWithParam<HighbdJNTCOMPAVGUPSAMPLEDParam> {
+ public:
+  ~AV1HighBDJNTCOMPAVGUPSAMPLEDTest() {}
+  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+  void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunCheckOutput(highbdjntcompavgupsampled_func test_impl) {
+    const int w = kMaxSize, h = kMaxSize;
+    const int block_idx = GET_PARAM(2);
+    const int bd = GET_PARAM(0);
+    uint16_t pred8[kMaxSize * kMaxSize];
+    uint16_t ref8[kMaxSize * kMaxSize];
+    uint16_t output[kMaxSize * kMaxSize];
+    uint16_t output2[kMaxSize * kMaxSize];
+
+    for (int i = 0; i < h; ++i)
+      for (int j = 0; j < w; ++j) {
+        pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+        ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+      }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+
+    JNT_COMP_PARAMS jnt_comp_params;
+    jnt_comp_params.use_jnt_comp_avg = 1;
+    int sub_x_q3, sub_y_q3;
+
+    for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
+      for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
+        for (int ii = 0; ii < 2; ii++) {
+          for (int jj = 0; jj < 4; jj++) {
+            jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+            jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+
+            const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+            const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+
+            aom_highbd_jnt_comp_avg_upsampled_pred_c(
+                NULL, NULL, 0, 0, NULL, output,
+                CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h,
+                sub_x_q3, sub_y_q3,
+                CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd,
+                &jnt_comp_params);
+            test_impl(NULL, NULL, 0, 0, NULL, output2,
+                      CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
+                      in_h, sub_x_q3, sub_y_q3,
+                      CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
+                      bd, &jnt_comp_params);
+
+            for (int i = 0; i < in_h; ++i) {
+              for (int j = 0; j < in_w; ++j) {
+                int idx = i * in_w + j;
+                ASSERT_EQ(output[idx], output2[idx])
+                    << "Mismatch at unit tests for "
+                       "AV1HighBDJNTCOMPAVGUPSAMPLEDTest\n"
+                    << in_w << "x" << in_h << " Pixel mismatch at index " << idx
+                    << " = (" << i << ", " << j << "), sub pixel offset = ("
+                    << sub_y_q3 << ", " << sub_x_q3 << ")";
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  void RunSpeedTest(highbdjntcompavgupsampled_func test_impl) {
+    const int w = kMaxSize, h = kMaxSize;
+    const int block_idx = GET_PARAM(2);
+    const int bd = GET_PARAM(0);
+    uint16_t pred8[kMaxSize * kMaxSize];
+    uint16_t ref8[kMaxSize * kMaxSize];
+    uint16_t output[kMaxSize * kMaxSize];
+    uint16_t output2[kMaxSize * kMaxSize];
+
+    for (int i = 0; i < h; ++i)
+      for (int j = 0; j < w; ++j) {
+        pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+        ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+      }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+
+    JNT_COMP_PARAMS jnt_comp_params;
+    jnt_comp_params.use_jnt_comp_avg = 1;
+
+    jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+    jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    int sub_x_q3 = 0;
+    int sub_y_q3 = 0;
+    const int num_loops = 1000000000 / (in_w + in_h);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+
+    for (int i = 0; i < num_loops; ++i)
+      aom_highbd_jnt_comp_avg_upsampled_pred_c(
+          NULL, NULL, 0, 0, NULL, output, CONVERT_TO_BYTEPTR(pred8), in_w, in_h,
+          sub_x_q3, sub_y_q3, CONVERT_TO_BYTEPTR(ref8), in_w, bd,
+          &jnt_comp_params);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("highbdjntcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+           1000.0 * elapsed_time / num_loops);
+
+    aom_usec_timer timer1;
+    aom_usec_timer_start(&timer1);
+
+    for (int i = 0; i < num_loops; ++i)
+      test_impl(NULL, NULL, 0, 0, NULL, output2, CONVERT_TO_BYTEPTR(pred8),
+                in_w, in_h, sub_x_q3, sub_y_q3, CONVERT_TO_BYTEPTR(ref8), in_w,
+                bd, &jnt_comp_params);
+
+    aom_usec_timer_mark(&timer1);
+    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+    printf("highbdjntcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w,
+           in_h, 1000.0 * elapsed_time1 / num_loops);
+  }
+
+  libaom_test::ACMRandom rnd_;
+};  // class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
+
+}  // namespace AV1JNTCOMPAVG
+}  // namespace libaom_test
+
+#endif  // TEST_COMP_AVG_PRED_TEST_H_
diff --git a/third_party/aom/test/comp_mask_variance_test.cc b/third_party/aom/test/comp_mask_variance_test.cc
new file mode 100644
index 000000000..a5e3f3411
--- /dev/null
+++ b/third_party/aom/test/comp_mask_variance_test.cc
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/variance.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/reconinter.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1CompMaskVariance {
+typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
+                                    int width, int height, const uint8_t *ref,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask);
+#if HAVE_SSSE3 || HAVE_AV2
+const BLOCK_SIZE kValidBlockSize[] = {
+  BLOCK_8X8,   BLOCK_8X16, BLOCK_8X32,  BLOCK_16X8,  BLOCK_16X16,
+  BLOCK_16X32, BLOCK_32X8, BLOCK_32X16, BLOCK_32X32,
+};
+#endif
+typedef ::testing::tuple<comp_mask_pred_func, BLOCK_SIZE> CompMaskPredParam;
+
+class AV1CompMaskVarianceTest
+    : public ::testing::TestWithParam<CompMaskPredParam> {
+ public:
+  ~AV1CompMaskVarianceTest();
+  void SetUp();
+
+  void TearDown();
+
+ protected:
+  void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
+  void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
+  bool CheckResult(int width, int height) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int idx = y * width + x;
+        if (comp_pred1_[idx] != comp_pred2_[idx]) {
+          printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x);
+          printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  uint8_t *comp_pred1_;
+  uint8_t *comp_pred2_;
+  uint8_t *pred_;
+  uint8_t *ref_buffer_;
+  uint8_t *ref_;
+};
+
+AV1CompMaskVarianceTest::~AV1CompMaskVarianceTest() { ; }
+
+void AV1CompMaskVarianceTest::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  av1_init_wedge_masks();
+  comp_pred1_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  comp_pred2_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  pred_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  ref_buffer_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE + (8 * MAX_SB_SIZE));
+  ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pred_[i] = rnd_.Rand8();
+  }
+  for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+    ref_buffer_[i] = rnd_.Rand8();
+  }
+}
+
+void AV1CompMaskVarianceTest::TearDown() {
+  aom_free(comp_pred1_);
+  aom_free(comp_pred2_);
+  aom_free(pred_);
+  aom_free(ref_buffer_);
+  libaom_test::ClearSystemState();
+}
+
+void AV1CompMaskVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
+                                             BLOCK_SIZE bsize, int inv) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+    const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+    aom_comp_mask_pred_c(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w,
+                         inv);
+    test_impl(comp_pred2_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, inv);
+
+    ASSERT_EQ(CheckResult(w, h), true)
+        << " wedge " << wedge_index << " inv " << inv;
+  }
+}
+
+void AV1CompMaskVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl,
+                                           BLOCK_SIZE bsize) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  int wedge_index = wedge_types / 2;
+  const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+  const int num_loops = 1000000000 / (w + h);
+
+  comp_mask_pred_func funcs[2] = { aom_comp_mask_pred_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    comp_mask_pred_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, 0);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+         elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1CompMaskVarianceTest, CheckOutput) {
+  // inv = 0, 1
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+TEST_P(AV1CompMaskVarianceTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, AV1CompMaskVarianceTest,
+    ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, AV1CompMaskVarianceTest,
+    ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#ifndef aom_comp_mask_pred
+// can't run this test if aom_comp_mask_pred is defined to aom_comp_mask_pred_c
+class AV1CompMaskUpVarianceTest : public AV1CompMaskVarianceTest {
+ public:
+  ~AV1CompMaskUpVarianceTest();
+
+ protected:
+  void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
+  void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
+                    int havSub);
+};
+
+AV1CompMaskUpVarianceTest::~AV1CompMaskUpVarianceTest() { ; }
+
+void AV1CompMaskUpVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
+                                               BLOCK_SIZE bsize, int inv) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+
+  // loop through subx and suby
+  for (int sub = 0; sub < 8 * 8; ++sub) {
+    int subx = sub & 0x7;
+    int suby = (sub >> 3);
+    for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+      const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+      aom_comp_mask_pred = aom_comp_mask_pred_c;  // ref
+      aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_,
+                                   w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w,
+                                   inv);
+
+      aom_comp_mask_pred = test_impl;  // test
+      aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred2_, pred_,
+                                   w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w,
+                                   inv);
+      ASSERT_EQ(CheckResult(w, h), true)
+          << " wedge " << wedge_index << " inv " << inv << "sub (" << subx
+          << "," << suby << ")";
+    }
+  }
+}
+
+void AV1CompMaskUpVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl,
+                                             BLOCK_SIZE bsize, int havSub) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int subx = havSub ? 3 : 0;
+  const int suby = havSub ? 4 : 0;
+
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  int wedge_index = wedge_types / 2;
+  const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+  const int num_loops = 1000000000 / (w + h);
+  comp_mask_pred_func funcs[2] = { &aom_comp_mask_pred_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    aom_comp_mask_pred = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_,
+                                   w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w,
+                                   0);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("CompMaskUp[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0],
+         elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1CompMaskUpVarianceTest, CheckOutput) {
+  // inv mask = 0, 1
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+TEST_P(AV1CompMaskUpVarianceTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, AV1CompMaskUpVarianceTest,
+    ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, AV1CompMaskUpVarianceTest,
+    ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#endif  // ifndef aom_comp_mask_pred
+}  // namespace AV1CompMaskVariance
diff --git a/third_party/aom/test/convolve_round_test.cc b/third_party/aom/test/convolve_round_test.cc
index 4976b03c8..2f801e7d4 100644
--- a/third_party/aom/test/convolve_round_test.cc
+++ b/third_party/aom/test/convolve_round_test.cc
@@ -11,7 +11,8 @@
 
 #include <assert.h>
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_ports/aom_timer.h"
 #include "test/acm_random.h"
@@ -51,7 +52,7 @@ void highbd_convolve_rounding_12(CONVOLVE_ROUNDING_PARAM) {
 
 typedef enum { LOWBITDEPTH_TEST, HIGHBITDEPTH_TEST } DataPathType;
 
-using std::tr1::tuple;
+using ::testing::tuple;
 
 typedef tuple<ConvolveRoundFunc, ConvolveRoundFunc, DataPathType>
     ConvolveRoundParam;
@@ -92,11 +93,9 @@ class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
     if (data_path_ == LOWBITDEPTH_TEST) {
       dst = reinterpret_cast<uint8_t *>(dst_);
       dst_ref = reinterpret_cast<uint8_t *>(dst_ref_);
-#if CONFIG_HIGHBITDEPTH
     } else if (data_path_ == HIGHBITDEPTH_TEST) {
       dst = CONVERT_TO_BYTEPTR(dst_);
       dst_ref = CONVERT_TO_BYTEPTR(dst_ref_);
-#endif
     } else {
       assert(0);
     }
@@ -163,10 +162,8 @@ class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
 
 TEST_P(ConvolveRoundTest, BitExactCheck) { ConvolveRoundingRun(); }
 
-using std::tr1::make_tuple;
-
+using ::testing::make_tuple;
 #if HAVE_AVX2
-#if CONFIG_HIGHBITDEPTH
 const ConvolveRoundParam kConvRndParamArray[] = {
   make_tuple(&av1_convolve_rounding_c, &av1_convolve_rounding_avx2,
              LOWBITDEPTH_TEST),
@@ -180,11 +177,6 @@ const ConvolveRoundParam kConvRndParamArray[] = {
              &highbd_convolve_rounding_12<av1_highbd_convolve_rounding_avx2>,
              HIGHBITDEPTH_TEST)
 };
-#else
-const ConvolveRoundParam kConvRndParamArray[] = { make_tuple(
-    &av1_convolve_rounding_c, &av1_convolve_rounding_avx2, LOWBITDEPTH_TEST) };
-#endif
-
 INSTANTIATE_TEST_CASE_P(AVX2, ConvolveRoundTest,
                         ::testing::ValuesIn(kConvRndParamArray));
 #endif  // HAVE_AVX2
diff --git a/third_party/aom/test/convolve_test.cc b/third_party/aom/test/convolve_test.cc
index ffe0b87d2..7098e8af6 100644
--- a/third_party/aom/test/convolve_test.cc
+++ b/third_party/aom/test/convolve_test.cc
@@ -7,24 +7,25 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <string.h>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
 #include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
 #include "av1/common/filter.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
 
 namespace {
 
@@ -37,35 +38,16 @@ typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
                              int w, int h);
 
 struct ConvolveFunctions {
-  ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg, ConvolveFunc h8,
-                    ConvolveFunc h8_avg, ConvolveFunc v8, ConvolveFunc v8_avg,
-                    ConvolveFunc hv8, ConvolveFunc hv8_avg, ConvolveFunc sh8,
-                    ConvolveFunc sh8_avg, ConvolveFunc sv8,
-                    ConvolveFunc sv8_avg, ConvolveFunc shv8,
-                    ConvolveFunc shv8_avg, int bd)
-      : copy_(copy), avg_(avg), h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg),
-        v8_avg_(v8_avg), hv8_avg_(hv8_avg), sh8_(sh8), sv8_(sv8), shv8_(shv8),
-        sh8_avg_(sh8_avg), sv8_avg_(sv8_avg), shv8_avg_(shv8_avg),
-        use_highbd_(bd) {}
+  ConvolveFunctions(ConvolveFunc copy, ConvolveFunc h8, ConvolveFunc v8, int bd)
+      : copy_(copy), h8_(h8), v8_(v8), use_highbd_(bd) {}
 
   ConvolveFunc copy_;
-  ConvolveFunc avg_;
   ConvolveFunc h8_;
   ConvolveFunc v8_;
-  ConvolveFunc hv8_;
-  ConvolveFunc h8_avg_;
-  ConvolveFunc v8_avg_;
-  ConvolveFunc hv8_avg_;
-  ConvolveFunc sh8_;       // scaled horiz
-  ConvolveFunc sv8_;       // scaled vert
-  ConvolveFunc shv8_;      // scaled horiz/vert
-  ConvolveFunc sh8_avg_;   // scaled avg horiz
-  ConvolveFunc sv8_avg_;   // scaled avg vert
-  ConvolveFunc shv8_avg_;  // scaled avg horiz/vert
   int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
 };
 
-typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
+typedef ::testing::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
 
 #define ALL_SIZES_64(convolve_fn)                                         \
   make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn),         \
@@ -76,13 +58,9 @@ typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
       make_tuple(64, 32, &convolve_fn), make_tuple(32, 64, &convolve_fn), \
       make_tuple(64, 64, &convolve_fn)
 
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
 #define ALL_SIZES(convolve_fn)                                          \
   make_tuple(128, 64, &convolve_fn), make_tuple(64, 128, &convolve_fn), \
       make_tuple(128, 128, &convolve_fn), ALL_SIZES_64(convolve_fn)
-#else
-#define ALL_SIZES ALL_SIZES_64
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 
 // Reference 8-tap subpixel filter, slightly modified to fit into this test.
 #define AV1_FILTER_WEIGHT 128
@@ -186,7 +164,6 @@ void filter_average_block2d_8_c(const uint8_t *src_ptr,
                     output_height);
 }
 
-#if CONFIG_HIGHBITDEPTH
 void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
                                const unsigned int src_stride,
                                const int16_t *HFilter, const int16_t *VFilter,
@@ -288,7 +265,6 @@ void highbd_filter_average_block2d_8_c(
   highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
                            output_width, output_height);
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
  public:
@@ -301,7 +277,6 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
         aom_memalign(kDataAlignment, kOutputBufferSize));
     output_ref_ = reinterpret_cast<uint8_t *>(
         aom_memalign(kDataAlignment, kOutputBufferSize));
-#if CONFIG_HIGHBITDEPTH
     input16_ = reinterpret_cast<uint16_t *>(aom_memalign(
                    kDataAlignment, (kInputBufferSize + 1) * sizeof(uint16_t))) +
                1;
@@ -309,7 +284,6 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
         aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
     output16_ref_ = reinterpret_cast<uint16_t *>(
         aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
-#endif
   }
 
   virtual void TearDown() { libaom_test::ClearSystemState(); }
@@ -321,14 +295,12 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
     output_ = NULL;
     aom_free(output_ref_);
     output_ref_ = NULL;
-#if CONFIG_HIGHBITDEPTH
     aom_free(input16_ - 1);
     input16_ = NULL;
     aom_free(output16_);
     output16_ = NULL;
     aom_free(output16_ref_);
     output16_ref_ = NULL;
-#endif
   }
 
  protected:
@@ -356,24 +328,18 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 
   virtual void SetUp() {
     UUT_ = GET_PARAM(2);
-#if CONFIG_HIGHBITDEPTH
     if (UUT_->use_highbd_ != 0)
       mask_ = (1 << UUT_->use_highbd_) - 1;
     else
       mask_ = 255;
-#endif
     /* Set up guard blocks for an inner block centered in the outer block */
     for (int i = 0; i < kOutputBufferSize; ++i) {
       if (IsIndexInBorder(i)) {
         output_[i] = 255;
-#if CONFIG_HIGHBITDEPTH
         output16_[i] = mask_;
-#endif
       } else {
         output_[i] = 0;
-#if CONFIG_HIGHBITDEPTH
         output16_[i] = 0;
-#endif
       }
     }
 
@@ -381,31 +347,23 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
     for (int i = 0; i < kInputBufferSize; ++i) {
       if (i & 1) {
         input_[i] = 255;
-#if CONFIG_HIGHBITDEPTH
         input16_[i] = mask_;
-#endif
       } else {
         input_[i] = prng.Rand8Extremes();
-#if CONFIG_HIGHBITDEPTH
         input16_[i] = prng.Rand16() & mask_;
-#endif
       }
     }
   }
 
   void SetConstantInput(int value) {
     memset(input_, value, kInputBufferSize);
-#if CONFIG_HIGHBITDEPTH
     aom_memset16(input16_, value, kInputBufferSize);
-#endif
   }
 
   void CopyOutputToRef() {
     memcpy(output_ref_, output_, kOutputBufferSize);
-#if CONFIG_HIGHBITDEPTH
     // Copy 16-bit pixels values. The effective number of bytes is double.
     memcpy(output16_ref_, output16_, sizeof(output16_[0]) * kOutputBufferSize);
-#endif
   }
 
   void CheckGuardBlocks() {
@@ -418,72 +376,51 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 
   uint8_t *input() const {
     const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
-#if CONFIG_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
       return input_ + offset;
     } else {
       return CONVERT_TO_BYTEPTR(input16_) + offset;
     }
-#else
-    return input_ + offset;
-#endif
   }
 
   uint8_t *output() const {
     const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
-#if CONFIG_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
       return output_ + offset;
     } else {
       return CONVERT_TO_BYTEPTR(output16_) + offset;
     }
-#else
-    return output_ + offset;
-#endif
   }
 
   uint8_t *output_ref() const {
     const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
-#if CONFIG_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
       return output_ref_ + offset;
     } else {
       return CONVERT_TO_BYTEPTR(output16_ref_) + offset;
     }
-#else
-    return output_ref_ + offset;
-#endif
   }
 
   uint16_t lookup(uint8_t *list, int index) const {
-#if CONFIG_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
       return list[index];
     } else {
       return CONVERT_TO_SHORTPTR(list)[index];
     }
-#else
-    return list[index];
-#endif
   }
 
   void assign_val(uint8_t *list, int index, uint16_t val) const {
-#if CONFIG_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
       list[index] = (uint8_t)val;
     } else {
       CONVERT_TO_SHORTPTR(list)[index] = val;
     }
-#else
-    list[index] = (uint8_t)val;
-#endif
   }
 
   void wrapper_filter_average_block2d_8_c(
       const uint8_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
       const int16_t *VFilter, uint8_t *dst_ptr, unsigned int dst_stride,
       unsigned int output_width, unsigned int output_height) {
-#if CONFIG_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
       filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
                                  dst_stride, output_width, output_height);
@@ -493,17 +430,12 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
           CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, output_width, output_height,
           UUT_->use_highbd_);
     }
-#else
-    filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
-                               dst_stride, output_width, output_height);
-#endif
   }
 
   void wrapper_filter_block2d_8_c(
       const uint8_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
       const int16_t *VFilter, uint8_t *dst_ptr, unsigned int dst_stride,
       unsigned int output_width, unsigned int output_height) {
-#if CONFIG_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
       filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
                          dst_stride, output_width, output_height);
@@ -513,32 +445,24 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
                                 dst_stride, output_width, output_height,
                                 UUT_->use_highbd_);
     }
-#else
-    filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
-                       dst_stride, output_width, output_height);
-#endif
   }
 
   const ConvolveFunctions *UUT_;
   static uint8_t *input_;
   static uint8_t *output_;
   static uint8_t *output_ref_;
-#if CONFIG_HIGHBITDEPTH
   static uint16_t *input16_;
   static uint16_t *output16_;
   static uint16_t *output16_ref_;
   int mask_;
-#endif
 };
 
 uint8_t *ConvolveTest::input_ = NULL;
 uint8_t *ConvolveTest::output_ = NULL;
 uint8_t *ConvolveTest::output_ref_ = NULL;
-#if CONFIG_HIGHBITDEPTH
 uint16_t *ConvolveTest::input16_ = NULL;
 uint16_t *ConvolveTest::output16_ = NULL;
 uint16_t *ConvolveTest::output16_ref_ = NULL;
-#endif
 
 TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
 
@@ -558,83 +482,6 @@ TEST_P(ConvolveTest, Copy) {
           << "(" << x << "," << y << ")";
 }
 
-TEST_P(ConvolveTest, Avg) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-  uint8_t *const out_ref = output_ref();
-  CopyOutputToRef();
-
-  ASM_REGISTER_STATE_CHECK(UUT_->avg_(in, kInputStride, out, kOutputStride,
-                                      NULL, 0, NULL, 0, Width(), Height()));
-
-  CheckGuardBlocks();
-
-  for (int y = 0; y < Height(); ++y)
-    for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(lookup(out, y * kOutputStride + x),
-                ROUND_POWER_OF_TWO(lookup(in, y * kInputStride + x) +
-                                       lookup(out_ref, y * kOutputStride + x),
-                                   1))
-          << "(" << x << "," << y << ")";
-}
-
-TEST_P(ConvolveTest, CopyHoriz) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
-
-  ASM_REGISTER_STATE_CHECK(UUT_->sh8_(in, kInputStride, out, kOutputStride,
-                                      filter8, 16, filter8, 16, Width(),
-                                      Height()));
-
-  CheckGuardBlocks();
-
-  for (int y = 0; y < Height(); ++y)
-    for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(lookup(out, y * kOutputStride + x),
-                lookup(in, y * kInputStride + x))
-          << "(" << x << "," << y << ")";
-}
-
-TEST_P(ConvolveTest, CopyVert) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
-
-  ASM_REGISTER_STATE_CHECK(UUT_->sv8_(in, kInputStride, out, kOutputStride,
-                                      filter8, 16, filter8, 16, Width(),
-                                      Height()));
-
-  CheckGuardBlocks();
-
-  for (int y = 0; y < Height(); ++y)
-    for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(lookup(out, y * kOutputStride + x),
-                lookup(in, y * kInputStride + x))
-          << "(" << x << "," << y << ")";
-}
-
-TEST_P(ConvolveTest, Copy2D) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
-
-  ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride,
-                                       filter8, 16, filter8, 16, Width(),
-                                       Height()));
-
-  CheckGuardBlocks();
-
-  for (int y = 0; y < Height(); ++y)
-    for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(lookup(out, y * kOutputStride + x),
-                lookup(in, y * kInputStride + x))
-          << "(" << x << "," << y << ")";
-}
-
 const int kNumFilterBanks = SWITCHABLE_FILTERS;
 const int kNumFilters = 16;
 
@@ -643,11 +490,9 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
     const InterpFilter filter = (InterpFilter)filter_bank;
     const InterpKernel *filters =
         (const InterpKernel *)av1_get_interp_filter_kernel(filter);
-#if CONFIG_DUAL_FILTER
     const InterpFilterParams filter_params =
-        av1_get_interp_filter_params(filter);
+        av1_get_interp_filter_params_with_block_size(filter, 8);
     if (filter_params.taps != SUBPEL_TAPS) continue;
-#endif
     for (int i = 0; i < kNumFilters; i++) {
       const int p0 = filters[i][0] + filters[i][1];
       const int p1 = filters[i][2] + filters[i][3];
@@ -670,7 +515,6 @@ const int16_t kInvalidFilter[8] = { 0 };
 TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
   uint8_t *const in = input();
   uint8_t *const out = output();
-#if CONFIG_HIGHBITDEPTH
   uint8_t ref8[kOutputStride * kMaxDimension];
   uint16_t ref16[kOutputStride * kMaxDimension];
   uint8_t *ref;
@@ -679,19 +523,14 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
   } else {
     ref = CONVERT_TO_BYTEPTR(ref16);
   }
-#else
-  uint8_t ref[kOutputStride * kMaxDimension];
-#endif
 
   for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
     const InterpFilter filter = (InterpFilter)filter_bank;
     const InterpKernel *filters =
         (const InterpKernel *)av1_get_interp_filter_kernel(filter);
-#if CONFIG_DUAL_FILTER
     const InterpFilterParams filter_params =
-        av1_get_interp_filter_params(filter);
+        av1_get_interp_filter_params_with_block_size(filter, 8);
     if (filter_params.taps != SUBPEL_TAPS) continue;
-#endif
 
     for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
       for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
@@ -700,9 +539,7 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
                                    Width(), Height());
 
         if (filter_x && filter_y)
-          ASM_REGISTER_STATE_CHECK(UUT_->hv8_(
-              in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-              filters[filter_y], 16, Width(), Height()));
+          continue;
         else if (filter_y)
           ASM_REGISTER_STATE_CHECK(
               UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
@@ -730,93 +567,9 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
   }
 }
 
-TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-#if CONFIG_HIGHBITDEPTH
-  uint8_t ref8[kOutputStride * kMaxDimension];
-  uint16_t ref16[kOutputStride * kMaxDimension];
-  uint8_t *ref;
-  if (UUT_->use_highbd_ == 0) {
-    ref = ref8;
-  } else {
-    ref = CONVERT_TO_BYTEPTR(ref16);
-  }
-#else
-  uint8_t ref[kOutputStride * kMaxDimension];
-#endif
-
-  // Populate ref and out with some random data
-  ::libaom_test::ACMRandom prng;
-  for (int y = 0; y < Height(); ++y) {
-    for (int x = 0; x < Width(); ++x) {
-      uint16_t r;
-#if CONFIG_HIGHBITDEPTH
-      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
-        r = prng.Rand8Extremes();
-      } else {
-        r = prng.Rand16() & mask_;
-      }
-#else
-      r = prng.Rand8Extremes();
-#endif
-
-      assign_val(out, y * kOutputStride + x, r);
-      assign_val(ref, y * kOutputStride + x, r);
-    }
-  }
-
-  for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
-    const InterpFilter filter = (InterpFilter)filter_bank;
-    const InterpKernel *filters =
-        (const InterpKernel *)av1_get_interp_filter_kernel(filter);
-#if CONFIG_DUAL_FILTER
-    const InterpFilterParams filter_params =
-        av1_get_interp_filter_params(filter);
-    if (filter_params.taps != SUBPEL_TAPS) continue;
-#endif
-
-    for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
-      for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
-        wrapper_filter_average_block2d_8_c(in, kInputStride, filters[filter_x],
-                                           filters[filter_y], ref,
-                                           kOutputStride, Width(), Height());
-
-        if (filter_x && filter_y)
-          ASM_REGISTER_STATE_CHECK(UUT_->hv8_avg_(
-              in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-              filters[filter_y], 16, Width(), Height()));
-        else if (filter_y)
-          ASM_REGISTER_STATE_CHECK(UUT_->v8_avg_(
-              in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
-              filters[filter_y], 16, Width(), Height()));
-        else if (filter_x)
-          ASM_REGISTER_STATE_CHECK(UUT_->h8_avg_(
-              in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-              kInvalidFilter, 16, Width(), Height()));
-        else
-          ASM_REGISTER_STATE_CHECK(
-              UUT_->avg_(in, kInputStride, out, kOutputStride, kInvalidFilter,
-                         0, kInvalidFilter, 0, Width(), Height()));
-
-        CheckGuardBlocks();
-
-        for (int y = 0; y < Height(); ++y)
-          for (int x = 0; x < Width(); ++x)
-            ASSERT_EQ(lookup(ref, y * kOutputStride + x),
-                      lookup(out, y * kOutputStride + x))
-                << "mismatch at (" << x << "," << y << "), "
-                << "filters (" << filter_bank << "," << filter_x << ","
-                << filter_y << ")";
-      }
-    }
-  }
-}
-
 TEST_P(ConvolveTest, FilterExtremes) {
   uint8_t *const in = input();
   uint8_t *const out = output();
-#if CONFIG_HIGHBITDEPTH
   uint8_t ref8[kOutputStride * kMaxDimension];
   uint16_t ref16[kOutputStride * kMaxDimension];
   uint8_t *ref;
@@ -825,24 +578,17 @@ TEST_P(ConvolveTest, FilterExtremes) {
   } else {
     ref = CONVERT_TO_BYTEPTR(ref16);
   }
-#else
-  uint8_t ref[kOutputStride * kMaxDimension];
-#endif
 
   // Populate ref and out with some random data
   ::libaom_test::ACMRandom prng;
   for (int y = 0; y < Height(); ++y) {
     for (int x = 0; x < Width(); ++x) {
       uint16_t r;
-#if CONFIG_HIGHBITDEPTH
       if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
         r = prng.Rand8Extremes();
       } else {
         r = prng.Rand16() & mask_;
       }
-#else
-      r = prng.Rand8Extremes();
-#endif
       assign_val(out, y * kOutputStride + x, r);
       assign_val(ref, y * kOutputStride + x, r);
     }
@@ -853,13 +599,8 @@ TEST_P(ConvolveTest, FilterExtremes) {
     while (seed_val < 256) {
       for (int y = 0; y < 8; ++y) {
         for (int x = 0; x < 8; ++x) {
-#if CONFIG_HIGHBITDEPTH
           assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1,
                      ((seed_val >> (axis ? y : x)) & 1) * mask_);
-#else
-          assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1,
-                     ((seed_val >> (axis ? y : x)) & 1) * 255);
-#endif
           if (axis) seed_val++;
         }
         if (axis)
@@ -873,20 +614,16 @@ TEST_P(ConvolveTest, FilterExtremes) {
         const InterpFilter filter = (InterpFilter)filter_bank;
         const InterpKernel *filters =
             (const InterpKernel *)av1_get_interp_filter_kernel(filter);
-#if CONFIG_DUAL_FILTER
         const InterpFilterParams filter_params =
-            av1_get_interp_filter_params(filter);
+            av1_get_interp_filter_params_with_block_size(filter, 8);
         if (filter_params.taps != SUBPEL_TAPS) continue;
-#endif
         for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
           for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
             wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x],
                                        filters[filter_y], ref, kOutputStride,
                                        Width(), Height());
             if (filter_x && filter_y)
-              ASM_REGISTER_STATE_CHECK(UUT_->hv8_(
-                  in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                  filters[filter_y], 16, Width(), Height()));
+              continue;
             else if (filter_y)
               ASM_REGISTER_STATE_CHECK(UUT_->v8_(
                   in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
@@ -914,37 +651,6 @@ TEST_P(ConvolveTest, FilterExtremes) {
   }
 }
 
-/* This test exercises that enough rows and columns are filtered with every
-   possible initial fractional positions and scaling steps. */
-TEST_P(ConvolveTest, CheckScalingFiltering) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-  const InterpKernel *const eighttap =
-      (const InterpKernel *)av1_get_interp_filter_kernel(EIGHTTAP_REGULAR);
-
-  SetConstantInput(127);
-
-  for (int frac = 0; frac < 16; ++frac) {
-    for (int step = 1; step <= 32; ++step) {
-      /* Test the horizontal and vertical filters in combination. */
-      ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride,
-                                           eighttap[frac], step, eighttap[frac],
-                                           step, Width(), Height()));
-
-      CheckGuardBlocks();
-
-      for (int y = 0; y < Height(); ++y) {
-        for (int x = 0; x < Width(); ++x) {
-          ASSERT_EQ(lookup(in, y * kInputStride + x),
-                    lookup(out, y * kOutputStride + x))
-              << "x == " << x << ", y == " << y << ", frac == " << frac
-              << ", step == " << step;
-        }
-      }
-    }
-  }
-}
-
 TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
   const uint8_t *const in = input();
   uint8_t *const out = output();
@@ -965,30 +671,9 @@ TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
          UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
 }
 
-TEST_P(ConvolveTest, DISABLED_Avg_Speed) {
-  const uint8_t *const in = input();
-  uint8_t *const out = output();
-  const int kNumTests = 5000000;
-  const int width = Width();
-  const int height = Height();
-  aom_usec_timer timer;
-
-  aom_usec_timer_start(&timer);
-  for (int n = 0; n < kNumTests; ++n) {
-    UUT_->avg_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, width,
-               height);
-  }
-  aom_usec_timer_mark(&timer);
-
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("convolve_avg_%dx%d_%d: %d us\n", width, height,
-         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
-}
-
 TEST_P(ConvolveTest, DISABLED_Speed) {
   uint8_t *const in = input();
   uint8_t *const out = output();
-#if CONFIG_HIGHBITDEPTH
   uint8_t ref8[kOutputStride * kMaxDimension];
   uint16_t ref16[kOutputStride * kMaxDimension];
   uint8_t *ref;
@@ -997,25 +682,17 @@ TEST_P(ConvolveTest, DISABLED_Speed) {
   } else {
     ref = CONVERT_TO_BYTEPTR(ref16);
   }
-#else
-  uint8_t ref[kOutputStride * kMaxDimension];
-#endif
 
   // Populate ref and out with some random data
   ::libaom_test::ACMRandom prng;
   for (int y = 0; y < Height(); ++y) {
     for (int x = 0; x < Width(); ++x) {
       uint16_t r;
-#if CONFIG_HIGHBITDEPTH
       if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
         r = prng.Rand8Extremes();
       } else {
         r = prng.Rand16() & mask_;
       }
-#else
-      r = prng.Rand8Extremes();
-#endif
-
       assign_val(out, y * kOutputStride + x, r);
       assign_val(ref, y * kOutputStride + x, r);
     }
@@ -1036,18 +713,13 @@ TEST_P(ConvolveTest, DISABLED_Speed) {
       const InterpFilter filter = (InterpFilter)filter_bank;
       const InterpKernel *filters =
           (const InterpKernel *)av1_get_interp_filter_kernel(filter);
-#if CONFIG_DUAL_FILTER
       const InterpFilterParams filter_params =
-          av1_get_interp_filter_params(filter);
+          av1_get_interp_filter_params_with_block_size(filter, 8);
       if (filter_params.taps != SUBPEL_TAPS) continue;
-#endif
 
       for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
         for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
-          if (filter_x && filter_y)
-            ASM_REGISTER_STATE_CHECK(UUT_->hv8_(
-                in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                filters[filter_y], 16, Width(), Height()));
+          if (filter_x && filter_y) continue;
           if (filter_y)
             ASM_REGISTER_STATE_CHECK(
                 UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
@@ -1069,11 +741,10 @@ TEST_P(ConvolveTest, DISABLED_Speed) {
          UUT_->use_highbd_, elapsed_time);
 }
 
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
 
-#if CONFIG_HIGHBITDEPTH
 #define WRAP(func, bd)                                                       \
-  void wrap_##func##_##bd(                                                   \
+  static void wrap_##func##_##bd(                                            \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
       ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride,    \
       const int16_t *filter_y, int filter_y_stride, int w, int h) {          \
@@ -1082,173 +753,78 @@ using std::tr1::make_tuple;
   }
 #if HAVE_SSE2 && ARCH_X86_64
 WRAP(convolve_copy_sse2, 8)
-WRAP(convolve_avg_sse2, 8)
 WRAP(convolve_copy_sse2, 10)
-WRAP(convolve_avg_sse2, 10)
 WRAP(convolve_copy_sse2, 12)
-WRAP(convolve_avg_sse2, 12)
 WRAP(convolve8_horiz_sse2, 8)
-WRAP(convolve8_avg_horiz_sse2, 8)
 WRAP(convolve8_vert_sse2, 8)
-WRAP(convolve8_avg_vert_sse2, 8)
-WRAP(convolve8_sse2, 8)
-WRAP(convolve8_avg_sse2, 8)
 WRAP(convolve8_horiz_sse2, 10)
-WRAP(convolve8_avg_horiz_sse2, 10)
 WRAP(convolve8_vert_sse2, 10)
-WRAP(convolve8_avg_vert_sse2, 10)
-WRAP(convolve8_sse2, 10)
-WRAP(convolve8_avg_sse2, 10)
 WRAP(convolve8_horiz_sse2, 12)
-WRAP(convolve8_avg_horiz_sse2, 12)
 WRAP(convolve8_vert_sse2, 12)
-WRAP(convolve8_avg_vert_sse2, 12)
-WRAP(convolve8_sse2, 12)
-WRAP(convolve8_avg_sse2, 12)
 #endif  // HAVE_SSE2 && ARCH_X86_64
 
 WRAP(convolve_copy_c, 8)
-WRAP(convolve_avg_c, 8)
 WRAP(convolve8_horiz_c, 8)
-WRAP(convolve8_avg_horiz_c, 8)
 WRAP(convolve8_vert_c, 8)
-WRAP(convolve8_avg_vert_c, 8)
-WRAP(convolve8_c, 8)
-WRAP(convolve8_avg_c, 8)
 WRAP(convolve_copy_c, 10)
-WRAP(convolve_avg_c, 10)
 WRAP(convolve8_horiz_c, 10)
-WRAP(convolve8_avg_horiz_c, 10)
 WRAP(convolve8_vert_c, 10)
-WRAP(convolve8_avg_vert_c, 10)
-WRAP(convolve8_c, 10)
-WRAP(convolve8_avg_c, 10)
 WRAP(convolve_copy_c, 12)
-WRAP(convolve_avg_c, 12)
 WRAP(convolve8_horiz_c, 12)
-WRAP(convolve8_avg_horiz_c, 12)
 WRAP(convolve8_vert_c, 12)
-WRAP(convolve8_avg_vert_c, 12)
-WRAP(convolve8_c, 12)
-WRAP(convolve8_avg_c, 12)
 
 #if HAVE_AVX2
 WRAP(convolve_copy_avx2, 8)
-WRAP(convolve_avg_avx2, 8)
 WRAP(convolve8_horiz_avx2, 8)
-WRAP(convolve8_avg_horiz_avx2, 8)
 WRAP(convolve8_vert_avx2, 8)
-WRAP(convolve8_avg_vert_avx2, 8)
-WRAP(convolve8_avx2, 8)
-WRAP(convolve8_avg_avx2, 8)
 
 WRAP(convolve_copy_avx2, 10)
-WRAP(convolve_avg_avx2, 10)
-WRAP(convolve8_avx2, 10)
 WRAP(convolve8_horiz_avx2, 10)
 WRAP(convolve8_vert_avx2, 10)
-WRAP(convolve8_avg_avx2, 10)
-WRAP(convolve8_avg_horiz_avx2, 10)
-WRAP(convolve8_avg_vert_avx2, 10)
 
 WRAP(convolve_copy_avx2, 12)
-WRAP(convolve_avg_avx2, 12)
-WRAP(convolve8_avx2, 12)
 WRAP(convolve8_horiz_avx2, 12)
 WRAP(convolve8_vert_avx2, 12)
-WRAP(convolve8_avg_avx2, 12)
-WRAP(convolve8_avg_horiz_avx2, 12)
-WRAP(convolve8_avg_vert_avx2, 12)
 #endif  // HAVE_AVX2
 
 #undef WRAP
 
-const ConvolveFunctions convolve8_c(
-    wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8,
-    wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
-    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
-    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
-    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
-    wrap_convolve8_avg_c_8, 8);
-const ConvolveFunctions convolve10_c(
-    wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10,
-    wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
-    wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
-    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
-    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
-    wrap_convolve8_avg_c_10, 10);
-const ConvolveFunctions convolve12_c(
-    wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12,
-    wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
-    wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
-    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
-    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
-    wrap_convolve8_avg_c_12, 12);
+const ConvolveFunctions convolve8_c(wrap_convolve_copy_c_8,
+                                    wrap_convolve8_horiz_c_8,
+                                    wrap_convolve8_vert_c_8, 8);
+const ConvolveFunctions convolve10_c(wrap_convolve_copy_c_10,
+                                     wrap_convolve8_horiz_c_10,
+                                     wrap_convolve8_vert_c_10, 10);
+const ConvolveFunctions convolve12_c(wrap_convolve_copy_c_12,
+                                     wrap_convolve8_horiz_c_12,
+                                     wrap_convolve8_vert_c_12, 12);
 const ConvolveParam kArrayConvolve_c[] = {
   ALL_SIZES(convolve8_c), ALL_SIZES(convolve10_c), ALL_SIZES(convolve12_c)
 };
 
-#else
-const ConvolveFunctions convolve8_c(
-    aom_convolve_copy_c, aom_convolve_avg_c, aom_convolve8_horiz_c,
-    aom_convolve8_avg_horiz_c, aom_convolve8_vert_c, aom_convolve8_avg_vert_c,
-    aom_convolve8_c, aom_convolve8_avg_c, aom_scaled_horiz_c,
-    aom_scaled_avg_horiz_c, aom_scaled_vert_c, aom_scaled_avg_vert_c,
-    aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
-const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) };
-#endif
 INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_c));
 
 #if HAVE_SSE2 && ARCH_X86_64
-#if CONFIG_HIGHBITDEPTH
-const ConvolveFunctions convolve8_sse2(
-    wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
-    wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
-    wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
-    wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
-    wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
-    wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
-    wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
-const ConvolveFunctions convolve10_sse2(
-    wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10,
-    wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
-    wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
-    wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
-    wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
-    wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
-    wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
-const ConvolveFunctions convolve12_sse2(
-    wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12,
-    wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
-    wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
-    wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,
-    wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
-    wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
-    wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
+const ConvolveFunctions convolve8_sse2(wrap_convolve_copy_sse2_8,
+                                       wrap_convolve8_horiz_sse2_8,
+                                       wrap_convolve8_vert_sse2_8, 8);
+const ConvolveFunctions convolve10_sse2(wrap_convolve_copy_sse2_10,
+                                        wrap_convolve8_horiz_sse2_10,
+                                        wrap_convolve8_vert_sse2_10, 10);
+const ConvolveFunctions convolve12_sse2(wrap_convolve_copy_sse2_12,
+                                        wrap_convolve8_horiz_sse2_12,
+                                        wrap_convolve8_vert_sse2_12, 12);
 const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2),
                                               ALL_SIZES(convolve10_sse2),
                                               ALL_SIZES(convolve12_sse2) };
-#else
-const ConvolveFunctions convolve8_sse2(
-    aom_convolve_copy_sse2, aom_convolve_avg_sse2, aom_convolve8_horiz_sse2,
-    aom_convolve8_avg_horiz_sse2, aom_convolve8_vert_sse2,
-    aom_convolve8_avg_vert_sse2, aom_convolve8_sse2, aom_convolve8_avg_sse2,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
-
-const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) };
-#endif  // CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest,
                         ::testing::ValuesIn(kArrayConvolve_sse2));
 #endif
 
 #if HAVE_SSSE3
-const ConvolveFunctions convolve8_ssse3(
-    aom_convolve_copy_c, aom_convolve_avg_c, aom_convolve8_horiz_ssse3,
-    aom_convolve8_avg_horiz_ssse3, aom_convolve8_vert_ssse3,
-    aom_convolve8_avg_vert_ssse3, aom_convolve8_ssse3, aom_convolve8_avg_ssse3,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_ssse3, aom_scaled_avg_2d_c, 0);
+const ConvolveFunctions convolve8_ssse3(aom_convolve_copy_c,
+                                        aom_convolve8_horiz_ssse3,
+                                        aom_convolve8_vert_ssse3, 0);
 
 const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
 INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
@@ -1256,95 +832,20 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
 #endif
 
 #if HAVE_AVX2
-#if CONFIG_HIGHBITDEPTH
-const ConvolveFunctions convolve8_avx2(
-    wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8,
-    wrap_convolve8_horiz_avx2_8, wrap_convolve8_avg_horiz_avx2_8,
-    wrap_convolve8_vert_avx2_8, wrap_convolve8_avg_vert_avx2_8,
-    wrap_convolve8_avx2_8, wrap_convolve8_avg_avx2_8, wrap_convolve8_horiz_c_8,
-    wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
-    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
-const ConvolveFunctions convolve10_avx2(
-    wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10,
-    wrap_convolve8_horiz_avx2_10, wrap_convolve8_avg_horiz_avx2_10,
-    wrap_convolve8_vert_avx2_10, wrap_convolve8_avg_vert_avx2_10,
-    wrap_convolve8_avx2_10, wrap_convolve8_avg_avx2_10,
-    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
-    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
-    wrap_convolve8_avg_c_10, 10);
-const ConvolveFunctions convolve12_avx2(
-    wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12,
-    wrap_convolve8_horiz_avx2_12, wrap_convolve8_avg_horiz_avx2_12,
-    wrap_convolve8_vert_avx2_12, wrap_convolve8_avg_vert_avx2_12,
-    wrap_convolve8_avx2_12, wrap_convolve8_avg_avx2_12,
-    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
-    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
-    wrap_convolve8_avg_c_12, 12);
+const ConvolveFunctions convolve8_avx2(wrap_convolve_copy_avx2_8,
+                                       wrap_convolve8_horiz_avx2_8,
+                                       wrap_convolve8_vert_avx2_8, 8);
+const ConvolveFunctions convolve10_avx2(wrap_convolve_copy_avx2_10,
+                                        wrap_convolve8_horiz_avx2_10,
+                                        wrap_convolve8_vert_avx2_10, 10);
+const ConvolveFunctions convolve12_avx2(wrap_convolve_copy_avx2_12,
+                                        wrap_convolve8_horiz_avx2_12,
+                                        wrap_convolve8_vert_avx2_12, 12);
 const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES_64(convolve8_avx2),
                                                ALL_SIZES_64(convolve10_avx2),
                                                ALL_SIZES_64(convolve12_avx2) };
-#else
-const ConvolveFunctions convolve8_avx2(
-    aom_convolve_copy_c, aom_convolve_avg_c, aom_convolve8_horiz_avx2,
-    aom_convolve8_avg_horiz_ssse3, aom_convolve8_vert_avx2,
-    aom_convolve8_avg_vert_ssse3, aom_convolve8_avx2, aom_convolve8_avg_ssse3,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
-
-const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES_64(convolve8_avx2) };
-#endif  // CONFIG_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
                         ::testing::ValuesIn(kArrayConvolve8_avx2));
 #endif  // HAVE_AVX2
 
-// TODO(any): Make NEON versions support 128x128 128x64 64x128 block sizes
-#if HAVE_NEON && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
-#if HAVE_NEON_ASM
-const ConvolveFunctions convolve8_neon(
-    aom_convolve_copy_neon, aom_convolve_avg_neon, aom_convolve8_horiz_neon,
-    aom_convolve8_avg_horiz_neon, aom_convolve8_vert_neon,
-    aom_convolve8_avg_vert_neon, aom_convolve8_neon, aom_convolve8_avg_neon,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
-#else   // HAVE_NEON
-const ConvolveFunctions convolve8_neon(
-    aom_convolve_copy_neon, aom_convolve_avg_neon, aom_convolve8_horiz_neon,
-    aom_convolve8_avg_horiz_neon, aom_convolve8_vert_neon,
-    aom_convolve8_avg_vert_neon, aom_convolve8_neon, aom_convolve8_avg_neon,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
-#endif  // HAVE_NEON_ASM
-
-const ConvolveParam kArrayConvolve8_neon[] = { ALL_SIZES_64(convolve8_neon) };
-INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve8_neon));
-#endif  // HAVE_NEON && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
-
-// TODO(any): Make DSPR2 versions support 128x128 128x64 64x128 block sizes
-#if HAVE_DSPR2 && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
-const ConvolveFunctions convolve8_dspr2(
-    aom_convolve_copy_dspr2, aom_convolve_avg_dspr2, aom_convolve8_horiz_dspr2,
-    aom_convolve8_avg_horiz_dspr2, aom_convolve8_vert_dspr2,
-    aom_convolve8_avg_vert_dspr2, aom_convolve8_dspr2, aom_convolve8_avg_dspr2,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
-
-const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES_64(convolve8_dspr2) };
-INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve8_dspr2));
-#endif  // HAVE_DSPR2 && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
-
-// TODO(any): Make MSA versions support 128x128 128x64 64x128 block sizes
-#if HAVE_MSA && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
-const ConvolveFunctions convolve8_msa(
-    aom_convolve_copy_msa, aom_convolve_avg_msa, aom_convolve8_horiz_msa,
-    aom_convolve8_avg_horiz_msa, aom_convolve8_vert_msa,
-    aom_convolve8_avg_vert_msa, aom_convolve8_msa, aom_convolve8_avg_msa,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
-
-const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES_64(convolve8_msa) };
-INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve8_msa));
-#endif  // HAVE_MSA && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
 }  // namespace
diff --git a/third_party/aom/test/corner_match_test.cc b/third_party/aom/test/corner_match_test.cc
index 2197fffee..58e3139c5 100644
--- a/third_party/aom/test/corner_match_test.cc
+++ b/third_party/aom/test/corner_match_test.cc
@@ -8,11 +8,11 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+#include "config/av1_rtcd.h"
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "./av1_rtcd.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 
@@ -24,8 +24,8 @@ namespace AV1CornerMatch {
 
 using libaom_test::ACMRandom;
 
-using std::tr1::tuple;
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
+using ::testing::tuple;
 typedef tuple<int> CornerMatchParam;
 
 class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
diff --git a/third_party/aom/test/cpu_speed_test.cc b/third_party/aom/test/cpu_speed_test.cc
index bde00472d..8ea3e6965 100644
--- a/third_party/aom/test/cpu_speed_test.cc
+++ b/third_party/aom/test/cpu_speed_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
diff --git a/third_party/aom/test/datarate_test.cc b/third_party/aom/test/datarate_test.cc
index d577be35a..1588d3cc1 100644
--- a/third_party/aom/test/datarate_test.cc
+++ b/third_party/aom/test/datarate_test.cc
@@ -7,9 +7,10 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
+
+#include "config/aom_config.h"
 
-#include "./aom_config.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
@@ -215,6 +216,7 @@ TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
   cfg_.rc_end_usage = AOM_CBR;
   cfg_.rc_target_bitrate = 200;
   cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 1;
   // TODO(marpan): Investigate datarate target failures with a smaller keyframe
   // interval (128).
   cfg_.kf_max_dist = 9999;
diff --git a/third_party/aom/test/dct16x16_test.cc b/third_party/aom/test/dct16x16_test.cc
deleted file mode 100644
index 3cc0ed8c0..000000000
--- a/third_party/aom/test/dct16x16_test.cc
+++ /dev/null
@@ -1,888 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "av1/common/entropy.h"
-#include "av1/common/scan.h"
-#include "aom/aom_codec.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/msvc.h"  // for round()
-
-using libaom_test::ACMRandom;
-
-namespace {
-
-const int kNumCoeffs = 256;
-const double C1 = 0.995184726672197;
-const double C2 = 0.98078528040323;
-const double C3 = 0.956940335732209;
-const double C4 = 0.923879532511287;
-const double C5 = 0.881921264348355;
-const double C6 = 0.831469612302545;
-const double C7 = 0.773010453362737;
-const double C8 = 0.707106781186548;
-const double C9 = 0.634393284163646;
-const double C10 = 0.555570233019602;
-const double C11 = 0.471396736825998;
-const double C12 = 0.38268343236509;
-const double C13 = 0.290284677254462;
-const double C14 = 0.195090322016128;
-const double C15 = 0.098017140329561;
-
-void butterfly_16x16_dct_1d(double input[16], double output[16]) {
-  double step[16];
-  double intermediate[16];
-  double temp1, temp2;
-
-  // step 1
-  step[0] = input[0] + input[15];
-  step[1] = input[1] + input[14];
-  step[2] = input[2] + input[13];
-  step[3] = input[3] + input[12];
-  step[4] = input[4] + input[11];
-  step[5] = input[5] + input[10];
-  step[6] = input[6] + input[9];
-  step[7] = input[7] + input[8];
-  step[8] = input[7] - input[8];
-  step[9] = input[6] - input[9];
-  step[10] = input[5] - input[10];
-  step[11] = input[4] - input[11];
-  step[12] = input[3] - input[12];
-  step[13] = input[2] - input[13];
-  step[14] = input[1] - input[14];
-  step[15] = input[0] - input[15];
-
-  // step 2
-  output[0] = step[0] + step[7];
-  output[1] = step[1] + step[6];
-  output[2] = step[2] + step[5];
-  output[3] = step[3] + step[4];
-  output[4] = step[3] - step[4];
-  output[5] = step[2] - step[5];
-  output[6] = step[1] - step[6];
-  output[7] = step[0] - step[7];
-
-  temp1 = step[8] * C7;
-  temp2 = step[15] * C9;
-  output[8] = temp1 + temp2;
-
-  temp1 = step[9] * C11;
-  temp2 = step[14] * C5;
-  output[9] = temp1 - temp2;
-
-  temp1 = step[10] * C3;
-  temp2 = step[13] * C13;
-  output[10] = temp1 + temp2;
-
-  temp1 = step[11] * C15;
-  temp2 = step[12] * C1;
-  output[11] = temp1 - temp2;
-
-  temp1 = step[11] * C1;
-  temp2 = step[12] * C15;
-  output[12] = temp2 + temp1;
-
-  temp1 = step[10] * C13;
-  temp2 = step[13] * C3;
-  output[13] = temp2 - temp1;
-
-  temp1 = step[9] * C5;
-  temp2 = step[14] * C11;
-  output[14] = temp2 + temp1;
-
-  temp1 = step[8] * C9;
-  temp2 = step[15] * C7;
-  output[15] = temp2 - temp1;
-
-  // step 3
-  step[0] = output[0] + output[3];
-  step[1] = output[1] + output[2];
-  step[2] = output[1] - output[2];
-  step[3] = output[0] - output[3];
-
-  temp1 = output[4] * C14;
-  temp2 = output[7] * C2;
-  step[4] = temp1 + temp2;
-
-  temp1 = output[5] * C10;
-  temp2 = output[6] * C6;
-  step[5] = temp1 + temp2;
-
-  temp1 = output[5] * C6;
-  temp2 = output[6] * C10;
-  step[6] = temp2 - temp1;
-
-  temp1 = output[4] * C2;
-  temp2 = output[7] * C14;
-  step[7] = temp2 - temp1;
-
-  step[8] = output[8] + output[11];
-  step[9] = output[9] + output[10];
-  step[10] = output[9] - output[10];
-  step[11] = output[8] - output[11];
-
-  step[12] = output[12] + output[15];
-  step[13] = output[13] + output[14];
-  step[14] = output[13] - output[14];
-  step[15] = output[12] - output[15];
-
-  // step 4
-  output[0] = (step[0] + step[1]);
-  output[8] = (step[0] - step[1]);
-
-  temp1 = step[2] * C12;
-  temp2 = step[3] * C4;
-  temp1 = temp1 + temp2;
-  output[4] = 2 * (temp1 * C8);
-
-  temp1 = step[2] * C4;
-  temp2 = step[3] * C12;
-  temp1 = temp2 - temp1;
-  output[12] = 2 * (temp1 * C8);
-
-  output[2] = 2 * ((step[4] + step[5]) * C8);
-  output[14] = 2 * ((step[7] - step[6]) * C8);
-
-  temp1 = step[4] - step[5];
-  temp2 = step[6] + step[7];
-  output[6] = (temp1 + temp2);
-  output[10] = (temp1 - temp2);
-
-  intermediate[8] = step[8] + step[14];
-  intermediate[9] = step[9] + step[15];
-
-  temp1 = intermediate[8] * C12;
-  temp2 = intermediate[9] * C4;
-  temp1 = temp1 - temp2;
-  output[3] = 2 * (temp1 * C8);
-
-  temp1 = intermediate[8] * C4;
-  temp2 = intermediate[9] * C12;
-  temp1 = temp2 + temp1;
-  output[13] = 2 * (temp1 * C8);
-
-  output[9] = 2 * ((step[10] + step[11]) * C8);
-
-  intermediate[11] = step[10] - step[11];
-  intermediate[12] = step[12] + step[13];
-  intermediate[13] = step[12] - step[13];
-  intermediate[14] = step[8] - step[14];
-  intermediate[15] = step[9] - step[15];
-
-  output[15] = (intermediate[11] + intermediate[12]);
-  output[1] = -(intermediate[11] - intermediate[12]);
-
-  output[7] = 2 * (intermediate[13] * C8);
-
-  temp1 = intermediate[14] * C12;
-  temp2 = intermediate[15] * C4;
-  temp1 = temp1 - temp2;
-  output[11] = -2 * (temp1 * C8);
-
-  temp1 = intermediate[14] * C4;
-  temp2 = intermediate[15] * C12;
-  temp1 = temp2 + temp1;
-  output[5] = 2 * (temp1 * C8);
-}
-
-void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
-  // First transform columns
-  for (int i = 0; i < 16; ++i) {
-    double temp_in[16], temp_out[16];
-    for (int j = 0; j < 16; ++j) temp_in[j] = input[j * 16 + i];
-    butterfly_16x16_dct_1d(temp_in, temp_out);
-    for (int j = 0; j < 16; ++j) output[j * 16 + i] = temp_out[j];
-  }
-  // Then transform rows
-  for (int i = 0; i < 16; ++i) {
-    double temp_in[16], temp_out[16];
-    for (int j = 0; j < 16; ++j) temp_in[j] = output[j + i * 16];
-    butterfly_16x16_dct_1d(temp_in, temp_out);
-    // Scale by some magic number
-    for (int j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j] / 2;
-  }
-}
-
-typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
-                        TxfmParam *txfm_param);
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-
-typedef std::tr1::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t>
-    Dct16x16Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t>
-    Ht16x16Param;
-typedef std::tr1::tuple<IdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t>
-    Idct16x16Param;
-
-void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
-                   TxfmParam * /*txfm_param*/) {
-  aom_fdct16x16_c(in, out, stride);
-}
-
-void idct16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
-                   const TxfmParam * /*txfm_param*/) {
-  aom_idct16x16_256_add_c(in, dest, stride);
-}
-
-void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
-                  TxfmParam *txfm_param) {
-  av1_fht16x16_c(in, out, stride, txfm_param);
-}
-
-void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
-                  const TxfmParam *txfm_param) {
-  av1_iht16x16_256_add_c(in, dest, stride, txfm_param);
-}
-
-#if CONFIG_HIGHBITDEPTH
-void fht16x16_10(const int16_t *in, tran_low_t *out, int stride,
-                 TxfmParam *txfm_param) {
-  av1_fwd_txfm2d_16x16_c(in, out, stride, txfm_param->tx_type, 10);
-}
-
-void fht16x16_12(const int16_t *in, tran_low_t *out, int stride,
-                 TxfmParam *txfm_param) {
-  av1_fwd_txfm2d_16x16_c(in, out, stride, txfm_param->tx_type, 12);
-}
-
-void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride,
-                 const TxfmParam *txfm_param) {
-  av1_inv_txfm2d_add_16x16_c(in, CONVERT_TO_SHORTPTR(out), stride,
-                             txfm_param->tx_type, 10);
-}
-
-void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride,
-                 const TxfmParam *txfm_param) {
-  av1_inv_txfm2d_add_16x16_c(in, CONVERT_TO_SHORTPTR(out), stride,
-                             txfm_param->tx_type, 12);
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-class Trans16x16TestBase {
- public:
-  virtual ~Trans16x16TestBase() {}
-
- protected:
-  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
-
-  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
-
-  void RunAccuracyCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    uint32_t max_error = 0;
-    int64_t total_error = 0;
-    const int count_test_block = 10000;
-    for (int i = 0; i < count_test_block; ++i) {
-      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
-      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_HIGHBITDEPTH
-      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == AOM_BITS_8) {
-          src[j] = rnd.Rand8();
-          dst[j] = rnd.Rand8();
-          test_input_block[j] = src[j] - dst[j];
-#if CONFIG_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand16() & mask_;
-          dst16[j] = rnd.Rand16() & mask_;
-          test_input_block[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      ASM_REGISTER_STATE_CHECK(
-          RunFwdTxfm(test_input_block, test_temp_block, pitch_));
-      if (bit_depth_ == AOM_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-#if CONFIG_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_HIGHBITDEPTH
-        const int32_t diff =
-            bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        const int32_t diff = dst[j] - src[j];
-#endif
-        const uint32_t error = diff * diff;
-        if (max_error < error) max_error = error;
-        total_error += error;
-      }
-    }
-
-    EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)
-        << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
-
-    EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
-        << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
-  }
-
-  void RunCoeffCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j)
-        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-
-      fwd_txfm_ref(input_block, output_ref_block, pitch_, &txfm_param_);
-      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-    }
-  }
-
-  void RunMemCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
-      }
-      if (i == 0) {
-        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_;
-      } else if (i == 1) {
-        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_;
-      }
-
-      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, &txfm_param_);
-      ASM_REGISTER_STATE_CHECK(
-          RunFwdTxfm(input_extreme_block, output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
-            << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      }
-    }
-  }
-
-  void RunQuantCheck(int dc_thred, int ac_thred) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 100000;
-    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-
-    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
-#if CONFIG_HIGHBITDEPTH
-    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
-#endif
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
-      }
-      if (i == 0)
-        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_;
-      if (i == 1)
-        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_;
-
-      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, &txfm_param_);
-
-      // clear reconstructed pixel buffers
-      memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
-      memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
-#if CONFIG_HIGHBITDEPTH
-      memset(dst16, 0, kNumCoeffs * sizeof(uint16_t));
-      memset(ref16, 0, kNumCoeffs * sizeof(uint16_t));
-#endif
-
-      // quantization with maximum allowed step sizes
-      output_ref_block[0] = (output_ref_block[0] / dc_thred) * dc_thred;
-      for (int j = 1; j < kNumCoeffs; ++j)
-        output_ref_block[j] = (output_ref_block[j] / ac_thred) * ac_thred;
-      if (bit_depth_ == AOM_BITS_8) {
-        inv_txfm_ref(output_ref_block, ref, pitch_, &txfm_param_);
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
-#if CONFIG_HIGHBITDEPTH
-      } else {
-        inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,
-                     &txfm_param_);
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(output_ref_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-      if (bit_depth_ == AOM_BITS_8) {
-        for (int j = 0; j < kNumCoeffs; ++j) EXPECT_EQ(ref[j], dst[j]);
-#if CONFIG_HIGHBITDEPTH
-      } else {
-        for (int j = 0; j < kNumCoeffs; ++j) EXPECT_EQ(ref16[j], dst16[j]);
-#endif
-      }
-    }
-  }
-
-  void RunInvAccuracyCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_HIGHBITDEPTH
-    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif  // CONFIG_HIGHBITDEPTH
-
-    for (int i = 0; i < count_test_block; ++i) {
-      double out_r[kNumCoeffs];
-
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == AOM_BITS_8) {
-          src[j] = rnd.Rand8();
-          dst[j] = rnd.Rand8();
-          in[j] = src[j] - dst[j];
-#if CONFIG_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand16() & mask_;
-          dst16[j] = rnd.Rand16() & mask_;
-          in[j] = src16[j] - dst16[j];
-#endif  // CONFIG_HIGHBITDEPTH
-        }
-      }
-
-      reference_16x16_dct_2d(in, out_r);
-      for (int j = 0; j < kNumCoeffs; ++j)
-        coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
-
-      if (bit_depth_ == AOM_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
-#if CONFIG_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), 16));
-#endif  // CONFIG_HIGHBITDEPTH
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        const int diff = dst[j] - src[j];
-#endif  // CONFIG_HIGHBITDEPTH
-        const uint32_t error = diff * diff;
-        EXPECT_GE(1u, error)
-            << "Error: 16x16 IDCT has error " << error << " at index " << j;
-      }
-    }
-  }
-
-  void CompareInvReference(IdctFunc ref_txfm, int thresh) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 10000;
-    const int eob = 10;
-    const int16_t *scan = av1_default_scan_orders[TX_16X16].scan;
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
-#if CONFIG_HIGHBITDEPTH
-    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
-#endif  // CONFIG_HIGHBITDEPTH
-
-    for (int i = 0; i < count_test_block; ++i) {
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (j < eob) {
-          // Random values less than the threshold, either positive or negative
-          coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2));
-        } else {
-          coeff[scan[j]] = 0;
-        }
-        if (bit_depth_ == AOM_BITS_8) {
-          dst[j] = 0;
-          ref[j] = 0;
-#if CONFIG_HIGHBITDEPTH
-        } else {
-          dst16[j] = 0;
-          ref16[j] = 0;
-#endif  // CONFIG_HIGHBITDEPTH
-        }
-      }
-      if (bit_depth_ == AOM_BITS_8) {
-        ref_txfm(coeff, ref, pitch_);
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
-      } else {
-#if CONFIG_HIGHBITDEPTH
-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif  // CONFIG_HIGHBITDEPTH
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == AOM_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
-#else
-        const int diff = dst[j] - ref[j];
-#endif  // CONFIG_HIGHBITDEPTH
-        const uint32_t error = diff * diff;
-        EXPECT_EQ(0u, error) << "Error: 16x16 IDCT Comparison has error "
-                             << error << " at index " << j;
-      }
-    }
-  }
-
-  int pitch_;
-  aom_bit_depth_t bit_depth_;
-  int mask_;
-  FhtFunc fwd_txfm_ref;
-  IhtFunc inv_txfm_ref;
-  TxfmParam txfm_param_;
-};
-
-class Trans16x16DCT : public Trans16x16TestBase,
-                      public ::testing::TestWithParam<Dct16x16Param> {
- public:
-  virtual ~Trans16x16DCT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(3);
-    pitch_ = 16;
-    fwd_txfm_ref = fdct16x16_ref;
-    inv_txfm_ref = idct16x16_ref;
-    mask_ = (1 << bit_depth_) - 1;
-    inv_txfm_ref = idct16x16_ref;
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride);
-  }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride);
-  }
-
-  FdctFunc fwd_txfm_;
-  IdctFunc inv_txfm_;
-};
-
-TEST_P(Trans16x16DCT, AccuracyCheck) { RunAccuracyCheck(); }
-
-TEST_P(Trans16x16DCT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans16x16DCT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans16x16DCT, QuantCheck) {
-  // Use maximally allowed quantization step sizes for DC and AC
-  // coefficients respectively.
-  RunQuantCheck(1336, 1828);
-}
-
-TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }
-
-class Trans16x16HT : public Trans16x16TestBase,
-                     public ::testing::TestWithParam<Ht16x16Param> {
- public:
-  virtual ~Trans16x16HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(3);
-    pitch_ = 16;
-    mask_ = (1 << bit_depth_) - 1;
-    txfm_param_.tx_type = GET_PARAM(2);
-#if CONFIG_HIGHBITDEPTH
-    switch (bit_depth_) {
-      case AOM_BITS_10:
-        fwd_txfm_ref = fht16x16_10;
-        inv_txfm_ref = iht16x16_10;
-        break;
-      case AOM_BITS_12:
-        fwd_txfm_ref = fht16x16_12;
-        inv_txfm_ref = iht16x16_12;
-        break;
-      default:
-        fwd_txfm_ref = fht16x16_ref;
-        inv_txfm_ref = iht16x16_ref;
-        break;
-    }
-#else
-    fwd_txfm_ref = fht16x16_ref;
-    inv_txfm_ref = iht16x16_ref;
-#endif
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(Trans16x16HT, AccuracyCheck) { RunAccuracyCheck(); }
-
-TEST_P(Trans16x16HT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans16x16HT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans16x16HT, QuantCheck) {
-  // The encoder skips any non-DC intra prediction modes,
-  // when the quantization step size goes beyond 988.
-  RunQuantCheck(429, 729);
-}
-
-class InvTrans16x16DCT : public Trans16x16TestBase,
-                         public ::testing::TestWithParam<Idct16x16Param> {
- public:
-  virtual ~InvTrans16x16DCT() {}
-
-  virtual void SetUp() {
-    ref_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    thresh_ = GET_PARAM(2);
-    bit_depth_ = GET_PARAM(3);
-    pitch_ = 16;
-    mask_ = (1 << bit_depth_) - 1;
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride);
-  }
-
-  IdctFunc ref_txfm_;
-  IdctFunc inv_txfm_;
-  int thresh_;
-};
-
-TEST_P(InvTrans16x16DCT, CompareReference) {
-  CompareInvReference(ref_txfm_, thresh_);
-}
-
-class PartialTrans16x16Test : public ::testing::TestWithParam<
-                                  std::tr1::tuple<FdctFunc, aom_bit_depth_t> > {
- public:
-  virtual ~PartialTrans16x16Test() {}
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    bit_depth_ = GET_PARAM(1);
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  aom_bit_depth_t bit_depth_;
-  FdctFunc fwd_txfm_;
-};
-
-TEST_P(PartialTrans16x16Test, Extremes) {
-#if CONFIG_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  const int minval = -maxval;
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
-  EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]);
-
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
-  EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]);
-}
-
-TEST_P(PartialTrans16x16Test, Random) {
-#if CONFIG_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-  int sum = 0;
-  for (int i = 0; i < kNumCoeffs; ++i) {
-    const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
-    input[i] = val;
-    sum += val;
-  }
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
-  EXPECT_EQ(sum >> 1, output[0]);
-}
-
-using std::tr1::make_tuple;
-
-#if CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(C, Trans16x16DCT,
-                        ::testing::Values(make_tuple(&aom_fdct16x16_c,
-                                                     &aom_idct16x16_256_add_c,
-                                                     DCT_DCT, AOM_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(C, Trans16x16DCT,
-                        ::testing::Values(make_tuple(&aom_fdct16x16_c,
-                                                     &aom_idct16x16_256_add_c,
-                                                     DCT_DCT, AOM_BITS_8)));
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    C, Trans16x16HT,
-    ::testing::Values(
-        make_tuple(&fht16x16_10, &iht16x16_10, DCT_DCT, AOM_BITS_10),
-        make_tuple(&fht16x16_10, &iht16x16_10, ADST_DCT, AOM_BITS_10),
-        make_tuple(&fht16x16_10, &iht16x16_10, DCT_ADST, AOM_BITS_10),
-        make_tuple(&fht16x16_10, &iht16x16_10, ADST_ADST, AOM_BITS_10),
-        make_tuple(&fht16x16_12, &iht16x16_12, DCT_DCT, AOM_BITS_12),
-        make_tuple(&fht16x16_12, &iht16x16_12, ADST_DCT, AOM_BITS_12),
-        make_tuple(&fht16x16_12, &iht16x16_12, DCT_ADST, AOM_BITS_12),
-        make_tuple(&fht16x16_12, &iht16x16_12, ADST_ADST, AOM_BITS_12),
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, DCT_DCT,
-                   AOM_BITS_8),
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, ADST_DCT,
-                   AOM_BITS_8),
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, DCT_ADST,
-                   AOM_BITS_8),
-        make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, ADST_ADST,
-                   AOM_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(
-    C, Trans16x16HT,
-    ::testing::Values(make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c,
-                                 DCT_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c,
-                                 ADST_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c,
-                                 DCT_ADST, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c,
-                                 ADST_ADST, AOM_BITS_8)));
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    NEON, Trans16x16DCT,
-    ::testing::Values(make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_neon,
-                                 DCT_DCT, AOM_BITS_8)));
-#endif
-
-#if HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(SSE2, Trans16x16DCT,
-                        ::testing::Values(make_tuple(
-                            &aom_fdct16x16_sse2, &aom_idct16x16_256_add_sse2,
-                            DCT_DCT, AOM_BITS_8)));
-#if !CONFIG_DAALA_DCT16
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans16x16HT,
-    ::testing::Values(make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2,
-                                 DCT_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2,
-                                 ADST_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2,
-                                 DCT_ADST, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2,
-                                 ADST_ADST, AOM_BITS_8)));
-#endif  // CONFIG_DAALA_DCT16
-#endif  // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_SSE2 && CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(SSE2, Trans16x16DCT,
-                        ::testing::Values(make_tuple(&aom_fdct16x16_sse2,
-                                                     &aom_idct16x16_256_add_c,
-                                                     DCT_DCT, AOM_BITS_8)));
-#if !CONFIG_DAALA_DCT16
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans16x16HT,
-    ::testing::Values(make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
-                                 DCT_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
-                                 ADST_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
-                                 DCT_ADST, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
-                                 ADST_ADST, AOM_BITS_8)));
-#endif
-#endif  // HAVE_SSE2 && CONFIG_HIGHBITDEPTH
-
-#if HAVE_MSA && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(MSA, Trans16x16DCT,
-                        ::testing::Values(make_tuple(&aom_fdct16x16_msa,
-                                                     &aom_idct16x16_256_add_msa,
-                                                     DCT_DCT, AOM_BITS_8)));
-#if !CONFIG_EXT_TX && !CONFIG_DAALA_DCT16
-// TODO(yaowu): re-enable this after msa versions are updated to match C.
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_MSA, Trans16x16HT,
-    ::testing::Values(make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa,
-                                 DCT_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa,
-                                 ADST_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa,
-                                 DCT_ADST, AOM_BITS_8),
-                      make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa,
-                                 ADST_ADST, AOM_BITS_8)));
-#endif  // !CONFIG_EXT_TX && !CONFIG_DAALA_DCT16
-#endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
-}  // namespace
diff --git a/third_party/aom/test/dct32x32_test.cc b/third_party/aom/test/dct32x32_test.cc
deleted file mode 100644
index 02a723a9c..000000000
--- a/third_party/aom/test/dct32x32_test.cc
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "av1/common/entropy.h"
-#include "aom/aom_codec.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/msvc.h"  // for round()
-
-using libaom_test::ACMRandom;
-
-namespace {
-
-const int kNumCoeffs = 1024;
-const double kPi = 3.141592653589793238462643383279502884;
-void reference_32x32_dct_1d(const double in[32], double out[32]) {
-  const double kInvSqrt2 = 0.707106781186547524400844362104;
-  for (int k = 0; k < 32; k++) {
-    out[k] = 0.0;
-    for (int n = 0; n < 32; n++)
-      out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 64.0);
-    if (k == 0) out[k] = out[k] * kInvSqrt2;
-  }
-}
-
-void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
-                            double output[kNumCoeffs]) {
-  // First transform columns
-  for (int i = 0; i < 32; ++i) {
-    double temp_in[32], temp_out[32];
-    for (int j = 0; j < 32; ++j) temp_in[j] = input[j * 32 + i];
-    reference_32x32_dct_1d(temp_in, temp_out);
-    for (int j = 0; j < 32; ++j) output[j * 32 + i] = temp_out[j];
-  }
-  // Then transform rows
-  for (int i = 0; i < 32; ++i) {
-    double temp_in[32], temp_out[32];
-    for (int j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
-    reference_32x32_dct_1d(temp_in, temp_out);
-    // Scale by some magic number
-    for (int j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j] / 4;
-  }
-}
-
-typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
-
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, aom_bit_depth_t>
-    Trans32x32Param;
-
-class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
- public:
-  virtual ~Trans32x32Test() {}
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    version_ = GET_PARAM(2);  // 0: high precision forward transform
-                              // 1: low precision version for rd loop
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  int version_;
-  aom_bit_depth_t bit_depth_;
-  int mask_;
-  FwdTxfmFunc fwd_txfm_;
-  InvTxfmFunc inv_txfm_;
-};
-
-TEST_P(Trans32x32Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  uint32_t max_error = 0;
-  int64_t total_error = 0;
-  const int count_test_block = 10000;
-  DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-  DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
-  for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-mask_, mask_].
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      if (bit_depth_ == AOM_BITS_8) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
-#if CONFIG_HIGHBITDEPTH
-      } else {
-        src16[j] = rnd.Rand16() & mask_;
-        dst16[j] = rnd.Rand16() & mask_;
-        test_input_block[j] = src16[j] - dst16[j];
-#endif
-      }
-    }
-
-    ASM_REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));
-    if (bit_depth_ == AOM_BITS_8) {
-      ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
-#if CONFIG_HIGHBITDEPTH
-    } else {
-      ASM_REGISTER_STATE_CHECK(
-          inv_txfm_(test_temp_block, CONVERT_TO_BYTEPTR(dst16), 32));
-#endif
-    }
-
-    for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_HIGHBITDEPTH
-      const int32_t diff =
-          bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-      const int32_t diff = dst[j] - src[j];
-#endif
-      const uint32_t error = diff * diff;
-      if (max_error < error) max_error = error;
-      total_error += error;
-    }
-  }
-
-  if (version_ == 1) {
-    max_error /= 2;
-    total_error /= 45;
-  }
-
-  EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)
-      << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
-
-  EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
-      << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
-}
-
-TEST_P(Trans32x32Test, CoeffCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-
-  DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-  for (int i = 0; i < count_test_block; ++i) {
-    for (int j = 0; j < kNumCoeffs; ++j)
-      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-
-    const int stride = 32;
-    aom_fdct32x32_c(input_block, output_ref_block, stride);
-    ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, stride));
-
-    if (version_ == 0) {
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(output_block[j], output_ref_block[j])
-            << "Error: 32x32 FDCT versions have mismatched coefficients";
-    } else {
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
-            << "Error: 32x32 FDCT rd has mismatched coefficients";
-    }
-  }
-}
-
-TEST_P(Trans32x32Test, MemCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 2000;
-
-  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-  for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-mask_, mask_].
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      input_extreme_block[j] = rnd.Rand8() & 1 ? mask_ : -mask_;
-    }
-    if (i == 0) {
-      for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_;
-    } else if (i == 1) {
-      for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_;
-    }
-
-    const int stride = 32;
-    aom_fdct32x32_c(input_extreme_block, output_ref_block, stride);
-    ASM_REGISTER_STATE_CHECK(
-        fwd_txfm_(input_extreme_block, output_block, stride));
-
-    // The minimum quant value is 4.
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      if (version_ == 0) {
-        EXPECT_EQ(output_block[j], output_ref_block[j])
-            << "Error: 32x32 FDCT versions have mismatched coefficients";
-      } else {
-        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
-            << "Error: 32x32 FDCT rd has mismatched coefficients";
-      }
-      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_ref_block[j]))
-          << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
-          << "Error: 32x32 FDCT has coefficient larger than "
-          << "4*DCT_MAX_VALUE";
-    }
-  }
-}
-
-TEST_P(Trans32x32Test, InverseAccuracy) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-  DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
-  for (int i = 0; i < count_test_block; ++i) {
-    double out_r[kNumCoeffs];
-
-    // Initialize a test block with input range [-255, 255]
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      if (bit_depth_ == AOM_BITS_8) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        in[j] = src[j] - dst[j];
-#if CONFIG_HIGHBITDEPTH
-      } else {
-        src16[j] = rnd.Rand16() & mask_;
-        dst16[j] = rnd.Rand16() & mask_;
-        in[j] = src16[j] - dst16[j];
-#endif
-      }
-    }
-
-    reference_32x32_dct_2d(in, out_r);
-    for (int j = 0; j < kNumCoeffs; ++j)
-      coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
-    if (bit_depth_ == AOM_BITS_8) {
-      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
-#if CONFIG_HIGHBITDEPTH
-    } else {
-      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));
-#endif
-    }
-    for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_HIGHBITDEPTH
-      const int diff =
-          bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-      const int diff = dst[j] - src[j];
-#endif
-      const int error = diff * diff;
-      EXPECT_GE(1, error) << "Error: 32x32 IDCT has error " << error
-                          << " at index " << j;
-    }
-  }
-}
-
-class PartialTrans32x32Test
-    : public ::testing::TestWithParam<
-          std::tr1::tuple<FwdTxfmFunc, aom_bit_depth_t> > {
- public:
-  virtual ~PartialTrans32x32Test() {}
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    bit_depth_ = GET_PARAM(1);
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  aom_bit_depth_t bit_depth_;
-  FwdTxfmFunc fwd_txfm_;
-};
-
-TEST_P(PartialTrans32x32Test, Extremes) {
-#if CONFIG_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  const int minval = -maxval;
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
-  EXPECT_EQ((maxval * kNumCoeffs) >> 3, output[0]);
-
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
-  EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]);
-}
-
-TEST_P(PartialTrans32x32Test, Random) {
-#if CONFIG_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-  int sum = 0;
-  for (int i = 0; i < kNumCoeffs; ++i) {
-    const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
-    input[i] = val;
-    sum += val;
-  }
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
-  EXPECT_EQ(sum >> 3, output[0]);
-}
-
-using std::tr1::make_tuple;
-
-#if CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    C, Trans32x32Test,
-    ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c, 0,
-                                 AOM_BITS_8),
-                      make_tuple(&aom_fdct32x32_rd_c, &aom_idct32x32_1024_add_c,
-                                 1, AOM_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(
-    C, Trans32x32Test,
-    ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c, 0,
-                                 AOM_BITS_8),
-                      make_tuple(&aom_fdct32x32_rd_c, &aom_idct32x32_1024_add_c,
-                                 1, AOM_BITS_8)));
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if HAVE_NEON && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    NEON, Trans32x32Test,
-    ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_neon,
-                                 DCT_DCT, AOM_BITS_8),
-                      make_tuple(&aom_fdct32x32_rd_c,
-                                 &aom_idct32x32_1024_add_neon, ADST_DCT,
-                                 AOM_BITS_8)));
-#endif  // HAVE_NEON && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans32x32Test,
-    ::testing::Values(make_tuple(&aom_fdct32x32_sse2,
-                                 &aom_idct32x32_1024_add_sse2, DCT_DCT,
-                                 AOM_BITS_8),
-                      make_tuple(&aom_fdct32x32_rd_sse2,
-                                 &aom_idct32x32_1024_add_sse2, ADST_DCT,
-                                 AOM_BITS_8)));
-#endif  // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_SSE2 && CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(SSE2, Trans32x32Test,
-                        ::testing::Values(make_tuple(&aom_fdct32x32_sse2,
-                                                     &aom_idct32x32_1024_add_c,
-                                                     DCT_DCT, AOM_BITS_8),
-                                          make_tuple(&aom_fdct32x32_rd_sse2,
-                                                     &aom_idct32x32_1024_add_c,
-                                                     ADST_DCT, AOM_BITS_8)));
-#endif  // HAVE_SSE2 && CONFIG_HIGHBITDEPTH
-
-#if HAVE_AVX2 && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    AVX2, Trans32x32Test,
-    ::testing::Values(make_tuple(&aom_fdct32x32_avx2,
-                                 &aom_idct32x32_1024_add_sse2, DCT_DCT,
-                                 AOM_BITS_8),
-                      make_tuple(&aom_fdct32x32_rd_avx2,
-                                 &aom_idct32x32_1024_add_sse2, ADST_DCT,
-                                 AOM_BITS_8)));
-#endif  // HAVE_AVX2 && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_AVX2 && CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    AVX2, Trans32x32Test,
-    ::testing::Values(make_tuple(&aom_fdct32x32_avx2,
-                                 &aom_idct32x32_1024_add_sse2, DCT_DCT,
-                                 AOM_BITS_8),
-                      make_tuple(&aom_fdct32x32_rd_avx2,
-                                 &aom_idct32x32_1024_add_sse2, ADST_DCT,
-                                 AOM_BITS_8)));
-#endif  // HAVE_AVX2 && CONFIG_HIGHBITDEPTH
-
-#if HAVE_MSA && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    MSA, Trans32x32Test,
-    ::testing::Values(make_tuple(&aom_fdct32x32_msa,
-                                 &aom_idct32x32_1024_add_msa, DCT_DCT,
-                                 AOM_BITS_8),
-                      make_tuple(&aom_fdct32x32_rd_msa,
-                                 &aom_idct32x32_1024_add_msa, ADST_DCT,
-                                 AOM_BITS_8)));
-#endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
-}  // namespace
diff --git a/third_party/aom/test/decode_api_test.cc b/third_party/aom/test/decode_api_test.cc
index 187c8e06a..97cbd0655 100644
--- a/third_party/aom/test/decode_api_test.cc
+++ b/third_party/aom/test/decode_api_test.cc
@@ -7,12 +7,12 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-#include "test/ivf_video_source.h"
+#include "config/aom_config.h"
+
 #include "test/util.h"
 #include "aom/aomdx.h"
 #include "aom/aom_decoder.h"
@@ -30,12 +30,12 @@ TEST(DecodeAPI, InvalidParams) {
 
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_dec_init(NULL, NULL, NULL, 0));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_dec_init(&dec, NULL, NULL, 0));
-  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, NULL, 0, NULL, 0));
-  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, buf, 0, NULL, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, NULL, 0, NULL));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, buf, 0, NULL));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-            aom_codec_decode(NULL, buf, NELEMENTS(buf), NULL, 0));
+            aom_codec_decode(NULL, buf, NELEMENTS(buf), NULL));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-            aom_codec_decode(NULL, NULL, NELEMENTS(buf), NULL, 0));
+            aom_codec_decode(NULL, NULL, NELEMENTS(buf), NULL));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(NULL));
   EXPECT_TRUE(aom_codec_error(NULL) != NULL);
 
@@ -44,14 +44,9 @@ TEST(DecodeAPI, InvalidParams) {
               aom_codec_dec_init(NULL, kCodecs[i], NULL, 0));
 
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, kCodecs[i], NULL, 0));
-#if !CONFIG_OBU
-    // Needs to be fixed
-    EXPECT_EQ(AOM_CODEC_UNSUP_BITSTREAM,
-              aom_codec_decode(&dec, buf, NELEMENTS(buf), NULL, 0));
-#endif
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_decode(&dec, NULL, NELEMENTS(buf), NULL, 0));
-    EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(&dec, buf, 0, NULL, 0));
+              aom_codec_decode(&dec, NULL, NELEMENTS(buf), NULL));
+    EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(&dec, buf, 0, NULL));
 
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec));
   }
diff --git a/third_party/aom/test/decode_multithreaded_test.cc b/third_party/aom/test/decode_multithreaded_test.cc
new file mode 100644
index 000000000..ed9a9ceef
--- /dev/null
+++ b/third_party/aom/test/decode_multithreaded_test.cc
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "aom_mem/aom_mem.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+static const int kNumMultiThreadDecoders = 3;
+
+class AV1DecodeMultiThreadedTest
+    : public ::libaom_test::CodecTestWith4Params<int, int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AV1DecodeMultiThreadedTest()
+      : EncoderTest(GET_PARAM(0)), md5_single_thread_(), md5_multi_thread_(),
+        n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)),
+        n_tile_groups_(GET_PARAM(3)), set_cpu_used_(GET_PARAM(4)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 704;
+    cfg.h = 576;
+    cfg.threads = 1;
+    cfg.allow_lowbitdepth = 1;
+    single_thread_dec_ = codec_->CreateDecoder(cfg, 0);
+
+    // Test cfg.threads == powers of 2.
+    for (int i = 0; i < kNumMultiThreadDecoders; ++i) {
+      cfg.threads <<= 1;
+      multi_thread_dec_[i] = codec_->CreateDecoder(cfg, 0);
+    }
+
+    if (single_thread_dec_->IsAV1()) {
+      single_thread_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      single_thread_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+    }
+    for (int i = 0; i < kNumMultiThreadDecoders; ++i) {
+      if (multi_thread_dec_[i]->IsAV1()) {
+        multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_ROW, -1);
+        multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_COL, -1);
+      }
+    }
+  }
+
+  virtual ~AV1DecodeMultiThreadedTest() {
+    delete single_thread_dec_;
+    for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+      delete multi_thread_dec_[i];
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(libaom_test::kTwoPassGood);
+  }
+
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                                  libaom_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+      encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
+      encoder->Control(AV1E_SET_NUM_TG, n_tile_groups_);
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+    }
+  }
+
+  void UpdateMD5(::libaom_test::Decoder *dec, const aom_codec_cx_pkt_t *pkt,
+                 ::libaom_test::MD5 *md5) {
+    const aom_codec_err_t res = dec->DecodeFrame(
+        reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = dec->GetDxData().Next();
+    md5->Add(img);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    UpdateMD5(single_thread_dec_, pkt, &md5_single_thread_);
+
+    for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+      UpdateMD5(multi_thread_dec_[i], pkt, &md5_multi_thread_[i]);
+  }
+
+  void DoTest() {
+    const aom_rational timebase = { 33333333, 1000000000 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_target_bitrate = 500;
+    cfg_.g_lag_in_frames = 12;
+    cfg_.rc_end_usage = AOM_VBR;
+
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576,
+                                       timebase.den, timebase.num, 0, 5);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+    const char *md5_single_thread_str = md5_single_thread_.Get();
+
+    for (int i = 0; i < kNumMultiThreadDecoders; ++i) {
+      const char *md5_multi_thread_str = md5_multi_thread_[i].Get();
+      ASSERT_STREQ(md5_single_thread_str, md5_multi_thread_str);
+    }
+  }
+
+  ::libaom_test::MD5 md5_single_thread_;
+  ::libaom_test::MD5 md5_multi_thread_[kNumMultiThreadDecoders];
+  ::libaom_test::Decoder *single_thread_dec_;
+  ::libaom_test::Decoder *multi_thread_dec_[kNumMultiThreadDecoders];
+
+ private:
+  int n_tile_cols_;
+  int n_tile_rows_;
+  int n_tile_groups_;
+  int set_cpu_used_;
+};
+
+// run an encode and do the decode both in single thread
+// and multi thread. Ensure that the MD5 of the output in both cases
+// is identical. If so, the test passes.
+TEST_P(AV1DecodeMultiThreadedTest, MD5Match) {
+  cfg_.large_scale_tile = 0;
+  single_thread_dec_->Control(AV1_SET_TILE_MODE, 0);
+  for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+    multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 0);
+  DoTest();
+}
+
+class AV1DecodeMultiThreadedTestLarge : public AV1DecodeMultiThreadedTest {};
+
+TEST_P(AV1DecodeMultiThreadedTestLarge, MD5Match) {
+  cfg_.large_scale_tile = 0;
+  single_thread_dec_->Control(AV1_SET_TILE_MODE, 0);
+  for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+    multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 0);
+  DoTest();
+}
+
+// TODO(ranjit): More tests have to be added using pre-generated MD5.
+AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTest, ::testing::Values(1, 2),
+                          ::testing::Values(1, 2), ::testing::Values(1),
+                          ::testing::Values(3));
+AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTestLarge,
+                          ::testing::Values(0, 1, 2, 6),
+                          ::testing::Values(0, 1, 2, 6),
+                          ::testing::Values(1, 4), ::testing::Values(0));
+
+class AV1DecodeMultiThreadedLSTestLarge
+    : public AV1DecodeMultiThreadedTestLarge {};
+
+TEST_P(AV1DecodeMultiThreadedLSTestLarge, DISABLED_MD5Match) {
+  cfg_.large_scale_tile = 1;
+  single_thread_dec_->Control(AV1_SET_TILE_MODE, 1);
+  for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+    multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 1);
+  DoTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedLSTestLarge,
+                          ::testing::Values(1, 2, 32),
+                          ::testing::Values(1, 2, 32), ::testing::Values(1),
+                          ::testing::Values(0, 3));
+
+}  // namespace
diff --git a/third_party/aom/test/decode_perf_test.cc b/third_party/aom/test/decode_perf_test.cc
index a24d02a6c..bb7b00032 100644
--- a/third_party/aom/test/decode_perf_test.cc
+++ b/third_party/aom/test/decode_perf_test.cc
@@ -7,9 +7,14 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <string>
+
+#include "config/aom_version.h"
+
+#include "aom_ports/aom_timer.h"
+#include "common/ivfenc.h"
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
 #include "test/encode_test_driver.h"
@@ -18,25 +23,21 @@
 #include "test/md5_helper.h"
 #include "test/util.h"
 #include "test/webm_video_source.h"
-#include "aom_ports/aom_timer.h"
-#include "./ivfenc.h"
-#include "./aom_version.h"
 
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
 
 namespace {
 
 #define VIDEO_NAME 0
 #define THREADS 1
 
-const int kMaxPsnr = 100;
 const double kUsecsInSec = 1000000.0;
 const char kNewEncodeOutputFile[] = "new_encode.ivf";
 
 /*
  DecodePerfTest takes a tuple of filename + number of threads to decode with
  */
-typedef std::tr1::tuple<const char *, unsigned> DecodePerfParam;
+typedef ::testing::tuple<const char *, unsigned> DecodePerfParam;
 
 // TODO(jimbankoski): Add actual test vectors here when available.
 // const DecodePerfParam kAV1DecodePerfVectors[] = {};
@@ -129,7 +130,8 @@ class AV1NewEncodeDecodePerfTest
   }
 
   virtual void BeginPassHook(unsigned int /*pass*/) {
-    const std::string data_path = getenv("LIBAOM_TEST_DATA_PATH");
+    const char *const env = getenv("LIBAOM_TEST_DATA_PATH");
+    const std::string data_path(env ? env : ".");
     const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile;
     outfile_ = fopen(path_to_source.c_str(), "wb");
     ASSERT_TRUE(outfile_ != NULL);
@@ -157,7 +159,7 @@ class AV1NewEncodeDecodePerfTest
               pkt->data.frame.sz);
   }
 
-  virtual bool DoDecode() { return false; }
+  virtual bool DoDecode() const { return false; }
 
   void set_speed(unsigned int speed) { speed_ = speed; }
 
diff --git a/third_party/aom/test/decode_test_driver.cc b/third_party/aom/test/decode_test_driver.cc
index 9a465327e..ed261b527 100644
--- a/third_party/aom/test/decode_test_driver.cc
+++ b/third_party/aom/test/decode_test_driver.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -18,13 +18,12 @@
 
 namespace libaom_test {
 
-const char kVP8Name[] = "WebM Project VP8";
 const char kAV1Name[] = "AOMedia Project AV1 Decoder";
 
 aom_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
                                     aom_codec_stream_info_t *stream_info) {
-  return aom_codec_peek_stream_info(
-      CodecInterface(), cxdata, static_cast<unsigned int>(size), stream_info);
+  return aom_codec_peek_stream_info(CodecInterface(), cxdata, size,
+                                    stream_info);
 }
 
 aom_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) {
@@ -36,39 +35,22 @@ aom_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size,
   aom_codec_err_t res_dec;
   InitOnce();
   API_REGISTER_STATE_CHECK(
-      res_dec = aom_codec_decode(
-          &decoder_, cxdata, static_cast<unsigned int>(size), user_priv, 0));
+      res_dec = aom_codec_decode(&decoder_, cxdata, size, user_priv));
   return res_dec;
 }
 
-bool Decoder::IsVP8() const {
-  const char *codec_name = GetDecoderName();
-  return strncmp(kVP8Name, codec_name, sizeof(kVP8Name) - 1) == 0;
-}
-
 bool Decoder::IsAV1() const {
   const char *codec_name = GetDecoderName();
   return strncmp(kAV1Name, codec_name, sizeof(kAV1Name) - 1) == 0;
 }
 
-void DecoderTest::HandlePeekResult(Decoder *const decoder,
-                                   CompressedVideoSource *video,
+void DecoderTest::HandlePeekResult(Decoder *const /*decoder*/,
+                                   CompressedVideoSource * /*video*/,
                                    const aom_codec_err_t res_peek) {
-  const bool is_vp8 = decoder->IsVP8();
-  if (is_vp8) {
-    /* Vp8's implementation of PeekStream returns an error if the frame you
-     * pass it is not a keyframe, so we only expect AOM_CODEC_OK on the first
-     * frame, which must be a keyframe. */
-    if (video->frame_number() == 0) {
-      ASSERT_EQ(AOM_CODEC_OK, res_peek)
-          << "Peek return failed: " << aom_codec_err_to_string(res_peek);
-    }
-  } else {
-    /* The Av1 implementation of PeekStream returns an error only if the
-     * data passed to it isn't a valid Av1 chunk. */
-    ASSERT_EQ(AOM_CODEC_OK, res_peek)
-        << "Peek return failed: " << aom_codec_err_to_string(res_peek);
-  }
+  /* The Av1 implementation of PeekStream returns an error only if the
+   * data passed to it isn't a valid Av1 chunk. */
+  ASSERT_EQ(AOM_CODEC_OK, res_peek)
+      << "Peek return failed: " << aom_codec_err_to_string(res_peek);
 }
 
 void DecoderTest::RunLoop(CompressedVideoSource *video,
@@ -76,6 +58,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video,
   Decoder *const decoder = codec_->CreateDecoder(dec_cfg, flags_);
   ASSERT_TRUE(decoder != NULL);
   bool end_of_file = false;
+  bool peeked_stream = false;
 
   // Decode frames.
   for (video->Begin(); !::testing::Test::HasFailure() && !end_of_file;
@@ -83,15 +66,23 @@ void DecoderTest::RunLoop(CompressedVideoSource *video,
     PreDecodeFrameHook(*video, decoder);
 
     aom_codec_stream_info_t stream_info;
+    stream_info.is_annexb = 0;
+
     if (video->cxdata() != NULL) {
-      const aom_codec_err_t res_peek = decoder->PeekStream(
-          video->cxdata(), video->frame_size(), &stream_info);
-      HandlePeekResult(decoder, video, res_peek);
-      ASSERT_FALSE(::testing::Test::HasFailure());
+      if (!peeked_stream) {
+        // TODO(yaowu): PeekStream returns error for non-sequence_header_obu,
+        // therefore should only be tried once per sequence, this shall be fixed
+        // once PeekStream is updated to properly operate on other obus.
+        const aom_codec_err_t res_peek = decoder->PeekStream(
+            video->cxdata(), video->frame_size(), &stream_info);
+        HandlePeekResult(decoder, video, res_peek);
+        ASSERT_FALSE(::testing::Test::HasFailure());
+        peeked_stream = true;
+      }
 
       aom_codec_err_t res_dec =
           decoder->DecodeFrame(video->cxdata(), video->frame_size());
-      if (!HandleDecodeResult(res_dec, decoder)) break;
+      if (!HandleDecodeResult(res_dec, *video, decoder)) break;
     } else {
       // Signal end of the file to the decoder.
       const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
diff --git a/third_party/aom/test/decode_test_driver.h b/third_party/aom/test/decode_test_driver.h
index e7deb389c..916efdad0 100644
--- a/third_party/aom/test/decode_test_driver.h
+++ b/third_party/aom/test/decode_test_driver.h
@@ -13,7 +13,9 @@
 #define TEST_DECODE_TEST_DRIVER_H_
 #include <cstring>
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
 #include "aom/aom_decoder.h"
 
 namespace libaom_test {
@@ -93,8 +95,6 @@ class Decoder {
     return aom_codec_iface_name(CodecInterface());
   }
 
-  bool IsVP8() const;
-
   bool IsAV1() const;
 
   aom_codec_ctx_t *GetDecoder() { return &decoder_; }
@@ -134,6 +134,7 @@ class DecoderTest {
 
   // Hook to be called to handle decode result. Return true to continue.
   virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  const CompressedVideoSource & /*video*/,
                                   Decoder *decoder) {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     return AOM_CODEC_OK == res_dec;
diff --git a/third_party/aom/test/decode_to_md5.sh b/third_party/aom/test/decode_to_md5.sh
index 44c9f5f05..2edd1cb52 100755
--- a/third_party/aom/test/decode_to_md5.sh
+++ b/third_party/aom/test/decode_to_md5.sh
@@ -16,7 +16,7 @@
 . $(dirname $0)/tools_common.sh
 
 # Environment check: Make sure input is available:
-#   $AOM_IVF_FILE and $AV1_IVF_FILE are required.
+#   $AV1_IVF_FILE is required.
 decode_to_md5_verify_environment() {
   if [ "$(av1_encode_available)" != "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then
     return 1
@@ -27,7 +27,7 @@ decode_to_md5_verify_environment() {
 # interpreted as codec name and used solely to name the output file. $3 is the
 # expected md5 sum: It must match that of the final frame.
 decode_to_md5() {
-  local decoder="${LIBAOM_BIN_PATH}/decode_to_md5${AOM_TEST_EXE_SUFFIX}"
+  local decoder="$(aom_tool_path decode_to_md5)"
   local input_file="$1"
   local codec="$2"
   local expected_md5="$3"
@@ -45,14 +45,23 @@ decode_to_md5() {
 
   local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')"
   local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')"
-  [ "${actual_md5}" = "${expected_md5}" ] || return 1
+  if [ "${actual_md5}" = "${expected_md5}" ]; then
+    return 0
+  else
+    elog "MD5 mismatch:"
+    elog "Expected: ${expected_md5}"
+    elog "Actual: ${actual_md5}"
+    return 1
+  fi
 }
 
-decode_to_md5_av1() {
+DISABLED_decode_to_md5_av1() {
   # expected MD5 sum for the last frame.
-  local expected_md5="26d3ef1d60754a1f6acb603c3763efbe"
+  local expected_md5="567dd6d4b7a7170edddbf58bbcc3aff1"
   local file="${AV1_IVF_FILE}"
 
+  # TODO(urvang): Check in the encoded file (like libvpx does) to avoid
+  # encoding every time.
   if [ "$(av1_decode_available)" = "yes" ]; then
     if [ ! -e "${AV1_IVF_FILE}" ]; then
       file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
@@ -62,6 +71,7 @@ decode_to_md5_av1() {
   fi
 }
 
-decode_to_md5_tests="decode_to_md5_av1"
+# TODO(tomfinegan): Enable when the bitstream stabilizes.
+decode_to_md5_tests="DISABLED_decode_to_md5_av1"
 
 run_tests decode_to_md5_verify_environment "${decode_to_md5_tests}"
diff --git a/third_party/aom/test/decode_with_drops.sh b/third_party/aom/test/decode_with_drops.sh
index 5978312f2..155ee9207 100755
--- a/third_party/aom/test/decode_with_drops.sh
+++ b/third_party/aom/test/decode_with_drops.sh
@@ -16,7 +16,7 @@
 . $(dirname $0)/tools_common.sh
 
 # Environment check: Make sure input is available:
-#   $AOM_IVF_FILE and $AV1_IVF_FILE are required.
+#   $AV1_IVF_FILE is required.
 decode_with_drops_verify_environment() {
   if [ "$(av1_encode_available)" != "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then
     return 1
@@ -27,7 +27,7 @@ decode_with_drops_verify_environment() {
 # to name the output file. $3 is the drop mode, and is passed directly to
 # decode_with_drops.
 decode_with_drops() {
-  local decoder="${LIBAOM_BIN_PATH}/decode_with_drops${AOM_TEST_EXE_SUFFIX}"
+  local decoder="$(aom_tool_path decode_with_drops)"
   local input_file="$1"
   local codec="$2"
   local output_file="${AOM_TEST_OUTPUT_DIR}/decode_with_drops_${codec}"
@@ -47,21 +47,22 @@ decode_with_drops() {
 
 # Decodes $AV1_IVF_FILE while dropping frames, twice: once in sequence mode,
 # and once in pattern mode.
-decode_with_drops_av1() {
+DISABLED_decode_with_drops_av1() {
   if [ "$(av1_decode_available)" = "yes" ]; then
     local file="${AV1_IVF_FILE}"
     if [ ! -e "${AV1_IVF_FILE}" ]; then
       file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
       encode_yuv_raw_input_av1 "${file}" --ivf
     fi
-    # Drop frames 2 and 3.
-    decode_with_drops "${file}" "av1" "2-3"
+    # Drop frames 3 and 4.
+    decode_with_drops "${file}" "av1" "3-4"
 
     # Test pattern mode: Drop 3 of every 4 frames.
     decode_with_drops "${file}" "av1" "3/4"
   fi
 }
 
-decode_with_drops_tests="decode_with_drops_av1"
+# TODO(yaowu): Disable this test as trailing_bit check is expected to fail
+decode_with_drops_tests="DISABLED_decode_with_drops_av1"
 
 run_tests decode_with_drops_verify_environment "${decode_with_drops_tests}"
diff --git a/third_party/aom/test/dering_test.cc b/third_party/aom/test/dering_test.cc
deleted file mode 100644
index 6b76561c8..000000000
--- a/third_party/aom/test/dering_test.cc
+++ /dev/null
@@ -1,383 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <cstdlib>
-#include <string>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
-#include "aom_ports/aom_timer.h"
-#include "av1/common/cdef_block.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-
-typedef std::tr1::tuple<cdef_direction_func, cdef_direction_func, int>
-    dering_dir_param_t;
-
-class CDEFDeringDirTest : public ::testing::TestWithParam<dering_dir_param_t> {
- public:
-  virtual ~CDEFDeringDirTest() {}
-  virtual void SetUp() {
-    dering = GET_PARAM(0);
-    ref_dering = GET_PARAM(1);
-    bsize = GET_PARAM(2);
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  int bsize;
-  cdef_direction_func dering;
-  cdef_direction_func ref_dering;
-};
-
-typedef CDEFDeringDirTest CDEFDeringSpeedTest;
-
-void test_dering(int bsize, int iterations, cdef_direction_func dering,
-                 cdef_direction_func ref_dering) {
-  const int size = 8;
-  const int ysize = size + 2 * CDEF_VBORDER;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, s[ysize * CDEF_BSTRIDE]);
-  DECLARE_ALIGNED(16, static uint16_t, d[size * size]);
-  DECLARE_ALIGNED(16, static uint16_t, ref_d[size * size]);
-  memset(ref_d, 0, sizeof(ref_d));
-  memset(d, 0, sizeof(d));
-
-  int error = 0, threshold = 0, dir;
-  int boundary, damping, depth, bits, level, count,
-      errdepth = 0, errthreshold = 0, errboundary = 0, errdamping = 0;
-  unsigned int pos = 0;
-
-  for (boundary = 0; boundary < 16; boundary++) {
-    for (depth = 8; depth <= 12; depth += 2) {
-      for (damping = 5 + depth - 8; damping < 7 + depth - 8; damping++) {
-        for (count = 0; count < iterations; count++) {
-          for (level = 0; level < (1 << depth) && !error;
-               level += (1 + 4 * !!boundary) << (depth - 8)) {
-            for (bits = 1; bits <= depth && !error; bits++) {
-              for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
-                s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
-                             (1 << depth) - 1);
-              if (boundary) {
-                if (boundary & 1) {  // Left
-                  for (int i = 0; i < ysize; i++)
-                    for (int j = 0; j < CDEF_HBORDER; j++)
-                      s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
-                }
-                if (boundary & 2) {  // Right
-                  for (int i = 0; i < ysize; i++)
-                    for (int j = CDEF_HBORDER + size; j < CDEF_BSTRIDE; j++)
-                      s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
-                }
-                if (boundary & 4) {  // Above
-                  for (int i = 0; i < CDEF_VBORDER; i++)
-                    for (int j = 0; j < CDEF_BSTRIDE; j++)
-                      s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
-                }
-                if (boundary & 8) {  // Below
-                  for (int i = CDEF_VBORDER + size; i < ysize; i++)
-                    for (int j = 0; j < CDEF_BSTRIDE; j++)
-                      s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
-                }
-              }
-              for (dir = 0; dir < 8; dir++) {
-                for (threshold = 0; threshold < 64 << (depth - 8) && !error;
-                     threshold += (1 + 4 * !!boundary) << (depth - 8)) {
-                  ref_dering(ref_d, size,
-                             s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
-                             threshold, dir, damping);
-                  // If dering and ref_dering are the same, we're just testing
-                  // speed
-                  if (dering != ref_dering)
-                    ASM_REGISTER_STATE_CHECK(dering(
-                        d, size, s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
-                        threshold, dir, damping));
-                  if (ref_dering != dering) {
-                    for (pos = 0; pos < sizeof(d) / sizeof(*d) && !error;
-                         pos++) {
-                      error = ref_d[pos] != d[pos];
-                      errdepth = depth;
-                      errthreshold = threshold;
-                      errboundary = boundary;
-                      errdamping = damping;
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  pos--;
-  EXPECT_EQ(0, error) << "Error: CDEFDeringDirTest, SIMD and C mismatch."
-                      << std::endl
-                      << "First error at " << pos % size << "," << pos / size
-                      << " (" << (int16_t)ref_d[pos] << " : " << (int16_t)d[pos]
-                      << ") " << std::endl
-                      << "threshold: " << errthreshold << std::endl
-                      << "damping: " << errdamping << std::endl
-                      << "depth: " << errdepth << std::endl
-                      << "size: " << bsize << std::endl
-                      << "boundary: " << errboundary << std::endl
-                      << std::endl;
-}
-
-void test_dering_speed(int bsize, int iterations, cdef_direction_func dering,
-                       cdef_direction_func ref_dering) {
-  aom_usec_timer ref_timer;
-  aom_usec_timer timer;
-
-  aom_usec_timer_start(&ref_timer);
-  test_dering(bsize, iterations, ref_dering, ref_dering);
-  aom_usec_timer_mark(&ref_timer);
-  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
-
-  aom_usec_timer_start(&timer);
-  test_dering(bsize, iterations, dering, dering);
-  aom_usec_timer_mark(&timer);
-  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
-
-#if 0
-  std::cout << "[          ] C time = " << ref_elapsed_time / 1000
-            << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
-#endif
-
-  EXPECT_GT(ref_elapsed_time, elapsed_time)
-      << "Error: CDEFDeringSpeedTest, SIMD slower than C." << std::endl
-      << "C time: " << ref_elapsed_time << " us" << std::endl
-      << "SIMD time: " << elapsed_time << " us" << std::endl;
-}
-
-typedef int (*find_dir_t)(const uint16_t *img, int stride, int32_t *var,
-                          int coeff_shift);
-
-typedef std::tr1::tuple<find_dir_t, find_dir_t> find_dir_param_t;
-
-class CDEFDeringFindDirTest
-    : public ::testing::TestWithParam<find_dir_param_t> {
- public:
-  virtual ~CDEFDeringFindDirTest() {}
-  virtual void SetUp() {
-    finddir = GET_PARAM(0);
-    ref_finddir = GET_PARAM(1);
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  find_dir_t finddir;
-  find_dir_t ref_finddir;
-};
-
-typedef CDEFDeringFindDirTest CDEFDeringFindDirSpeedTest;
-
-void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var,
-                                 int coeff_shift),
-                  int (*ref_finddir)(const uint16_t *img, int stride,
-                                     int32_t *var, int coeff_shift)) {
-  const int size = 8;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint16_t, s[size * size]);
-
-  int error = 0;
-  int depth, bits, level, count, errdepth = 0;
-  int ref_res = 0, res = 0;
-  int32_t ref_var = 0, var = 0;
-
-  for (depth = 8; depth <= 12 && !error; depth += 2) {
-    for (count = 0; count < 512 && !error; count++) {
-      for (level = 0; level < (1 << depth) && !error;
-           level += 1 << (depth - 8)) {
-        for (bits = 1; bits <= depth && !error; bits++) {
-          for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
-            s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
-                         (1 << depth) - 1);
-          for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++)
-            ref_res = ref_finddir(s, size, &ref_var, depth - 8);
-          if (finddir != ref_finddir)
-            ASM_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8));
-          if (ref_finddir != finddir) {
-            if (res != ref_res || var != ref_var) error = 1;
-            errdepth = depth;
-          }
-        }
-      }
-    }
-  }
-
-  EXPECT_EQ(0, error) << "Error: CDEFDeringFindDirTest, SIMD and C mismatch."
-                      << std::endl
-                      << "return: " << res << " : " << ref_res << std::endl
-                      << "var: " << var << " : " << ref_var << std::endl
-                      << "depth: " << errdepth << std::endl
-                      << std::endl;
-}
-
-void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride,
-                                       int32_t *var, int coeff_shift),
-                        int (*ref_finddir)(const uint16_t *img, int stride,
-                                           int32_t *var, int coeff_shift)) {
-  aom_usec_timer ref_timer;
-  aom_usec_timer timer;
-
-  aom_usec_timer_start(&ref_timer);
-  test_finddir(ref_finddir, ref_finddir);
-  aom_usec_timer_mark(&ref_timer);
-  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
-
-  aom_usec_timer_start(&timer);
-  test_finddir(finddir, finddir);
-  aom_usec_timer_mark(&timer);
-  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
-
-#if 0
-  std::cout << "[          ] C time = " << ref_elapsed_time / 1000
-            << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
-#endif
-
-  EXPECT_GT(ref_elapsed_time, elapsed_time)
-      << "Error: CDEFDeringFindDirSpeedTest, SIMD slower than C." << std::endl
-      << "C time: " << ref_elapsed_time << " us" << std::endl
-      << "SIMD time: " << elapsed_time << " us" << std::endl;
-}
-
-TEST_P(CDEFDeringDirTest, TestSIMDNoMismatch) {
-  test_dering(bsize, 1, dering, ref_dering);
-}
-
-TEST_P(CDEFDeringSpeedTest, DISABLED_TestSpeed) {
-  test_dering_speed(bsize, 4, dering, ref_dering);
-}
-
-TEST_P(CDEFDeringFindDirTest, TestSIMDNoMismatch) {
-  test_finddir(finddir, ref_finddir);
-}
-
-TEST_P(CDEFDeringFindDirSpeedTest, DISABLED_TestSpeed) {
-  test_finddir_speed(finddir, ref_finddir);
-}
-
-using std::tr1::make_tuple;
-
-// VS compiling for 32 bit targets does not support vector types in
-// structs as arguments, which makes the v256 type of the intrinsics
-// hard to support, so optimizations for this target are disabled.
-#if defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, CDEFDeringDirTest,
-                        ::testing::Values(make_tuple(&cdef_direction_4x4_sse2,
-                                                     &cdef_direction_4x4_c, 4),
-                                          make_tuple(&cdef_direction_8x8_sse2,
-                                                     &cdef_direction_8x8_c,
-                                                     8)));
-INSTANTIATE_TEST_CASE_P(SSE2, CDEFDeringFindDirTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_sse2,
-                                                     &cdef_find_dir_c)));
-#endif
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, CDEFDeringDirTest,
-                        ::testing::Values(make_tuple(&cdef_direction_4x4_ssse3,
-                                                     &cdef_direction_4x4_c, 4),
-                                          make_tuple(&cdef_direction_8x8_ssse3,
-                                                     &cdef_direction_8x8_c,
-                                                     8)));
-INSTANTIATE_TEST_CASE_P(SSSE3, CDEFDeringFindDirTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
-                                                     &cdef_find_dir_c)));
-#endif
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFDeringDirTest,
-                        ::testing::Values(make_tuple(&cdef_direction_4x4_sse4_1,
-                                                     &cdef_direction_4x4_c, 4),
-                                          make_tuple(&cdef_direction_8x8_sse4_1,
-                                                     &cdef_direction_8x8_c,
-                                                     8)));
-INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFDeringFindDirTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
-                                                     &cdef_find_dir_c)));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, CDEFDeringDirTest,
-                        ::testing::Values(make_tuple(&cdef_direction_4x4_neon,
-                                                     &cdef_direction_4x4_c, 4),
-                                          make_tuple(&cdef_direction_8x8_neon,
-                                                     &cdef_direction_8x8_c,
-                                                     8)));
-INSTANTIATE_TEST_CASE_P(NEON, CDEFDeringFindDirTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_neon,
-                                                     &cdef_find_dir_c)));
-#endif
-
-// Test speed for all supported architectures
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, CDEFDeringSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_direction_4x4_sse2,
-                                                     &cdef_direction_4x4_c, 4),
-                                          make_tuple(&cdef_direction_8x8_sse2,
-                                                     &cdef_direction_8x8_c,
-                                                     8)));
-INSTANTIATE_TEST_CASE_P(SSE2, CDEFDeringFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_sse2,
-                                                     &cdef_find_dir_c)));
-#endif
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, CDEFDeringSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_direction_4x4_ssse3,
-                                                     &cdef_direction_4x4_c, 4),
-                                          make_tuple(&cdef_direction_8x8_ssse3,
-                                                     &cdef_direction_8x8_c,
-                                                     8)));
-INSTANTIATE_TEST_CASE_P(SSSE3, CDEFDeringFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
-                                                     &cdef_find_dir_c)));
-#endif
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFDeringSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_direction_4x4_sse4_1,
-                                                     &cdef_direction_4x4_c, 4),
-                                          make_tuple(&cdef_direction_8x8_sse4_1,
-                                                     &cdef_direction_8x8_c,
-                                                     8)));
-INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFDeringFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
-                                                     &cdef_find_dir_c)));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, CDEFDeringSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_direction_4x4_neon,
-                                                     &cdef_direction_4x4_c, 4),
-                                          make_tuple(&cdef_direction_8x8_neon,
-                                                     &cdef_direction_8x8_c,
-                                                     8)));
-INSTANTIATE_TEST_CASE_P(NEON, CDEFDeringFindDirSpeedTest,
-                        ::testing::Values(make_tuple(&cdef_find_dir_neon,
-                                                     &cdef_find_dir_c)));
-#endif
-
-#endif  // defined(_WIN64) || !defined(_MSC_VER)
-}  // namespace
diff --git a/third_party/aom/test/dr_prediction_test.cc b/third_party/aom/test/dr_prediction_test.cc
new file mode 100644
index 000000000..22b9832a1
--- /dev/null
+++ b/third_party/aom/test/dr_prediction_test.cc
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconintra.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+const int kZ1Start = 0;
+const int kZ2Start = 90;
+const int kZ3Start = 180;
+
+const TX_SIZE kTxSize[] = { TX_4X4,   TX_8X8,   TX_16X16, TX_32X32, TX_64X64,
+                            TX_4X8,   TX_8X4,   TX_8X16,  TX_16X8,  TX_16X32,
+                            TX_32X16, TX_32X64, TX_64X32, TX_4X16,  TX_16X4,
+                            TX_8X32,  TX_32X8,  TX_16X64, TX_64X16 };
+
+const char *const kTxSizeStrings[] = {
+  "TX_4X4",   "TX_8X8",   "TX_16X16", "TX_32X32", "TX_64X64",
+  "TX_4X8",   "TX_8X4",   "TX_8X16",  "TX_16X8",  "TX_16X32",
+  "TX_32X16", "TX_32X64", "TX_64X32", "TX_4X16",  "TX_16X4",
+  "TX_8X32",  "TX_32X8",  "TX_16X64", "TX_64X16"
+};
+
+using libaom_test::ACMRandom;
+
+typedef void (*DrPred_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                           const uint16_t *above, const uint16_t *left,
+                           int upsample_above, int upsample_left, int dx,
+                           int dy, int bd);
+
+typedef void (*DrPred)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                       const uint8_t *above, const uint8_t *left,
+                       int upsample_above, int upsample_left, int dx, int dy,
+                       int bd);
+
+typedef void (*Z1_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                       const uint8_t *above, const uint8_t *left,
+                       int upsample_above, int dx, int dy);
+template <Z1_Lbd fn>
+void z1_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                const uint8_t *above, const uint8_t *left, int upsample_above,
+                int /*upsample_left*/, int dx, int dy, int /*bd*/) {
+  fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy);
+}
+
+typedef void (*Z2_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                       const uint8_t *above, const uint8_t *left,
+                       int upsample_above, int upsample_left, int dx, int dy);
+template <Z2_Lbd fn>
+void z2_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                const uint8_t *above, const uint8_t *left, int upsample_above,
+                int upsample_left, int dx, int dy, int /*bd*/) {
+  fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy);
+}
+
+typedef void (*Z3_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                       const uint8_t *above, const uint8_t *left,
+                       int upsample_left, int dx, int dy);
+template <Z3_Lbd fn>
+void z3_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                const uint8_t *above, const uint8_t *left,
+                int /*upsample_above*/, int upsample_left, int dx, int dy,
+                int /*bd*/) {
+  fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy);
+}
+
+typedef void (*Z1_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                       const uint16_t *above, const uint16_t *left,
+                       int upsample_above, int dx, int dy, int bd);
+template <Z1_Hbd fn>
+void z1_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                    const uint16_t *above, const uint16_t *left,
+                    int upsample_above, int /*upsample_left*/, int dx, int dy,
+                    int bd) {
+  fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy, bd);
+}
+
+typedef void (*Z2_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                       const uint16_t *above, const uint16_t *left,
+                       int upsample_above, int upsample_left, int dx, int dy,
+                       int bd);
+template <Z2_Hbd fn>
+void z2_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                    const uint16_t *above, const uint16_t *left,
+                    int upsample_above, int upsample_left, int dx, int dy,
+                    int bd) {
+  fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy,
+     bd);
+}
+
+typedef void (*Z3_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                       const uint16_t *above, const uint16_t *left,
+                       int upsample_left, int dx, int dy, int bd);
+template <Z3_Hbd fn>
+void z3_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                    const uint16_t *above, const uint16_t *left,
+                    int /*upsample_above*/, int upsample_left, int dx, int dy,
+                    int bd) {
+  fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy, bd);
+}
+
+template <typename FuncType>
+struct DrPredFunc {
+  DrPredFunc(FuncType pred = NULL, FuncType tst = NULL, int bit_depth_value = 0,
+             int start_angle_value = 0)
+      : ref_fn(pred), tst_fn(tst), bit_depth(bit_depth_value),
+        start_angle(start_angle_value) {}
+
+  FuncType ref_fn;
+  FuncType tst_fn;
+  int bit_depth;
+  int start_angle;
+};
+
+template <typename Pixel, typename FuncType>
+class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
+ protected:
+  static const int kMaxNumTests = 100000;
+  static const int kIterations = 10;
+  static const int kDstStride = 64;
+  static const int kDstSize = kDstStride * kDstStride;
+  static const int kOffset = 16;
+  static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16;
+
+  DrPredTest()
+      : upsample_above_(0), upsample_left_(0), bw_(0), bh_(0), dx_(1), dy_(1),
+        bd_(8), txsize_(TX_4X4) {
+    params_ = this->GetParam();
+    start_angle_ = params_.start_angle;
+    stop_angle_ = start_angle_ + 90;
+
+    dst_ref_ = &dst_ref_data_[0];
+    dst_tst_ = &dst_tst_data_[0];
+    dst_stride_ = kDstStride;
+    above_ = &above_data_[kOffset];
+    left_ = &left_data_[kOffset];
+
+    for (int i = 0; i < kBufSize; ++i) {
+      above_data_[i] = rng_.Rand8();
+      left_data_[i] = rng_.Rand8();
+    }
+
+    for (int i = 0; i < kDstSize; ++i) {
+      dst_ref_[i] = 0;
+    }
+  }
+
+  virtual ~DrPredTest() {}
+
+  void Predict(bool speedtest, int tx) {
+    const int kNumTests = speedtest ? kMaxNumTests : 1;
+    aom_usec_timer timer;
+
+    aom_usec_timer_start(&timer);
+    for (int k = 0; k < kNumTests; ++k) {
+      params_.ref_fn(dst_ref_, dst_stride_, bw_, bh_, above_, left_,
+                     upsample_above_, upsample_left_, dx_, dy_, bd_);
+    }
+    aom_usec_timer_mark(&timer);
+    const int ref_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+    aom_usec_timer_start(&timer);
+    if (params_.tst_fn) {
+      for (int k = 0; k < kNumTests; ++k) {
+        ASM_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_,
+                                                above_, left_, upsample_above_,
+                                                upsample_left_, dx_, dy_, bd_));
+      }
+    }
+    aom_usec_timer_mark(&timer);
+    const int tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+    OutputTimes(kNumTests, ref_time, tst_time, tx);
+  }
+
+  void RunTest(bool speedtest) {
+    for (int i = 0; i < kBufSize; ++i) {
+      above_data_[i] = left_data_[i] = (1 << bd_) - 1;
+    }
+
+    for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
+      if (params_.tst_fn == NULL) {
+        for (int i = 0; i < kDstSize; ++i) {
+          dst_tst_[i] = (1 << bd_) - 1;
+        }
+      } else {
+        for (int i = 0; i < kDstSize; ++i) {
+          dst_tst_[i] = 0;
+        }
+      }
+
+      bw_ = tx_size_wide[kTxSize[tx]];
+      bh_ = tx_size_high[kTxSize[tx]];
+
+      Predict(speedtest, tx);
+
+      for (int r = 0; r < bh_; ++r) {
+        for (int c = 0; c < bw_; ++c) {
+          ASSERT_EQ(dst_ref_[r * dst_stride_ + c],
+                    dst_tst_[r * dst_stride_ + c])
+              << bw_ << "x" << bh_ << " r: " << r << " c: " << c
+              << " dx: " << dx_ << " dy: " << dy_
+              << " upsample_above: " << upsample_above_
+              << " upsample_left: " << upsample_left_;
+        }
+      }
+    }
+  }
+
+  void OutputTimes(int num_tests, int ref_time, int tst_time, int tx) {
+    if (num_tests > 1) {
+      if (params_.tst_fn) {
+        const float x = static_cast<float>(ref_time) / tst_time;
+        printf("\t[%8s] :: ref time %6d, tst time %6d     %3.2f\n",
+               kTxSizeStrings[tx], ref_time, tst_time, x);
+      } else {
+        printf("\t[%8s] :: ref time %6d\n", kTxSizeStrings[tx], ref_time);
+      }
+    }
+  }
+
+  Pixel dst_ref_data_[kDstSize];
+  Pixel dst_tst_data_[kDstSize];
+
+  Pixel left_data_[kBufSize];
+  Pixel dummy_data_[kBufSize];
+  Pixel above_data_[kBufSize];
+
+  Pixel *dst_ref_;
+  Pixel *dst_tst_;
+  Pixel *above_;
+  Pixel *left_;
+  int dst_stride_;
+
+  int upsample_above_;
+  int upsample_left_;
+  int bw_;
+  int bh_;
+  int dx_;
+  int dy_;
+  int bd_;
+  TX_SIZE txsize_;
+
+  int start_angle_;
+  int stop_angle_;
+
+  ACMRandom rng_;
+
+  DrPredFunc<FuncType> params_;
+};
+
+class LowbdDrPredTest : public DrPredTest<uint8_t, DrPred> {};
+
+TEST_P(LowbdDrPredTest, SaturatedValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    upsample_above_ = iter & 1;
+    for (int angle = start_angle_; angle < stop_angle_; ++angle) {
+      dx_ = av1_get_dx(angle);
+      dy_ = av1_get_dy(angle);
+      if (dx_ && dy_) RunTest(false);
+    }
+  }
+}
+
+TEST_P(LowbdDrPredTest, DISABLED_Speed) {
+  const int angles[] = { 3, 45, 87 };
+  for (upsample_above_ = 0; upsample_above_ < 2; ++upsample_above_) {
+    upsample_left_ = upsample_above_;
+    for (int i = 0; i < 3; ++i) {
+      dx_ = av1_get_dx(angles[i] + start_angle_);
+      dy_ = av1_get_dy(angles[i] + start_angle_);
+      printf("upsample_above: %d upsample_left: %d angle: %d ~~~~~~~~~~~~~~~\n",
+             upsample_above_, upsample_left_, angles[i] + start_angle_);
+      if (dx_ && dy_) RunTest(true);
+    }
+  }
+}
+
+using ::testing::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, LowbdDrPredTest,
+    ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
+                                         NULL, AOM_BITS_8, kZ1Start),
+                      DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
+                                         NULL, AOM_BITS_8, kZ2Start),
+                      DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
+                                         NULL, AOM_BITS_8, kZ3Start)));
+
+class HighbdDrPredTest : public DrPredTest<uint16_t, DrPred_Hbd> {};
+
+TEST_P(HighbdDrPredTest, SaturatedValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    upsample_above_ = iter & 1;
+    for (int angle = start_angle_; angle < stop_angle_; ++angle) {
+      dx_ = av1_get_dx(angle);
+      dy_ = av1_get_dy(angle);
+      if (dx_ && dy_) RunTest(false);
+    }
+  }
+}
+
+TEST_P(HighbdDrPredTest, DISABLED_Speed) {
+  const int angles[] = { 3, 45, 87 };
+  for (upsample_above_ = 0; upsample_above_ < 2; ++upsample_above_) {
+    upsample_left_ = upsample_above_;
+    for (int i = 0; i < 3; ++i) {
+      dx_ = av1_get_dx(angles[i] + start_angle_);
+      dy_ = av1_get_dy(angles[i] + start_angle_);
+      printf("upsample_above: %d upsample_left: %d angle: %d ~~~~~~~~~~~~~~~\n",
+             upsample_above_, upsample_left_, angles[i] + start_angle_);
+      if (dx_ && dy_) RunTest(true);
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    C, HighbdDrPredTest,
+    ::testing::Values(
+        DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+                               NULL, AOM_BITS_8, kZ1Start),
+        DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+                               NULL, AOM_BITS_10, kZ1Start),
+        DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+                               NULL, AOM_BITS_12, kZ1Start),
+        DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+                               NULL, AOM_BITS_8, kZ2Start),
+        DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+                               NULL, AOM_BITS_10, kZ2Start),
+        DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+                               NULL, AOM_BITS_12, kZ2Start),
+        DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+                               NULL, AOM_BITS_8, kZ3Start),
+        DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+                               NULL, AOM_BITS_10, kZ3Start),
+        DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+                               NULL, AOM_BITS_12, kZ3Start)));
+
+}  // namespace
diff --git a/third_party/aom/test/dump_obu.sh b/third_party/aom/test/dump_obu.sh
new file mode 100755
index 000000000..182e894f5
--- /dev/null
+++ b/third_party/aom/test/dump_obu.sh
@@ -0,0 +1,70 @@
+#!/bin/sh
+## Copyright (c) 2018, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom dump_obu tool. To add new tests to this
+## file, do the following:
+##   1. Write a shell function (this is your test).
+##   2. Add the function to dump_obu_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+readonly dump_obu_test_file="${AOM_TEST_OUTPUT_DIR}/av1_obu_test.ivf"
+
+dump_obu_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ "$(dump_obu_available)" = "yes" ]; then
+    if [ -z "$(aom_tool_path dump_obu)" ]; then
+      elog "dump_obu not found in LIBAOM_BIN_PATH, its parent, or child tools/."
+    fi
+  fi
+}
+
+dump_obu_available() {
+  if [ "$(av1_decode_available)" = "yes" ] && \
+     [ "$(av1_encode_available)" = "yes" ]; then
+    echo yes
+  fi
+}
+
+aomenc_available() {
+  if [ -x "$(aom_tool_path aomenc)" ]; then
+    echo yes
+  fi
+}
+
+encode_test_file() {
+  if [ "$(aomenc_available)" = "yes" ]; then
+    local readonly encoder="$(aom_tool_path aomenc)"
+
+    eval "${encoder}" \
+      $(aomenc_encode_test_fast_params) \
+      $(yuv_raw_input) \
+      --ivf \
+      --output=${dump_obu_test_file} \
+      ${devnull}
+
+    if [ ! -e "${dump_obu_test_file}" ]; then
+      elog "dump_obu test input encode failed."
+      return 1
+    fi
+  fi
+}
+
+dump_obu() {
+  encode_test_file
+  eval $(aom_tool_path dump_obu) "${dump_obu_test_file}" ${devnull}
+}
+
+dump_obu_tests="dump_obu"
+
+run_tests dump_obu_verify_environment "${dump_obu_tests}"
diff --git a/third_party/aom/test/ec_test.cc b/third_party/aom/test/ec_test.cc
new file mode 100644
index 000000000..e6a5ea63b
--- /dev/null
+++ b/third_party/aom/test/ec_test.cc
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include <cstdlib>
+
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/entdec.h"
+
+TEST(EC_TEST, random_ec_test) {
+  od_ec_enc enc;
+  od_ec_dec dec;
+  int sz;
+  int i;
+  int ret;
+  unsigned int sym;
+  unsigned int seed;
+  unsigned char *ptr;
+  uint32_t ptr_sz;
+  char *seed_str;
+  ret = 0;
+  seed_str = getenv("EC_TEST_SEED");
+  if (seed_str) {
+    seed = atoi(seed_str);
+  } else {
+    seed = 0xdaa1a;
+  }
+  srand(seed);
+  od_ec_enc_init(&enc, 1);
+  /*Test compatibility between multiple different encode/decode routines.*/
+  for (i = 0; i < 409600; i++) {
+    unsigned *fz;
+    unsigned *fts;
+    unsigned *data;
+    unsigned *tell;
+    unsigned *enc_method;
+    int j;
+    sz = rand() / ((RAND_MAX >> (rand() % 9U)) + 1U);
+    fz = (unsigned *)malloc(sz * sizeof(*fz));
+    fts = (unsigned *)malloc(sz * sizeof(*fts));
+    data = (unsigned *)malloc(sz * sizeof(*data));
+    tell = (unsigned *)malloc((sz + 1) * sizeof(*tell));
+    enc_method = (unsigned *)malloc(sz * sizeof(*enc_method));
+    od_ec_enc_reset(&enc);
+    tell[0] = od_ec_enc_tell_frac(&enc);
+    for (j = 0; j < sz; j++) {
+      data[j] = rand() / ((RAND_MAX >> 1) + 1);
+
+      fts[j] = CDF_PROB_BITS;
+      fz[j] = (rand() % (CDF_PROB_TOP - 2)) >> (CDF_PROB_BITS - fts[j]);
+      fz[j] = OD_MAXI(fz[j], 1);
+      enc_method[j] = 3 + (rand() & 1);
+      switch (enc_method[j]) {
+        case 3: {
+          od_ec_encode_bool_q15(&enc, data[j],
+                                OD_ICDF(fz[j] << (CDF_PROB_BITS - fts[j])));
+          break;
+        }
+        case 4: {
+          uint16_t cdf[2];
+          cdf[0] = OD_ICDF(fz[j]);
+          cdf[1] = OD_ICDF(1U << fts[j]);
+          od_ec_encode_cdf_q15(&enc, data[j], cdf, 2);
+          break;
+        }
+      }
+
+      tell[j + 1] = od_ec_enc_tell_frac(&enc);
+    }
+    ptr = od_ec_enc_done(&enc, &ptr_sz);
+    EXPECT_GE(((od_ec_enc_tell(&enc) + 7U) >> 3), ptr_sz)
+        << "od_ec_enc_tell() lied: "
+           "there's "
+        << ptr_sz << " bytes instead of " << ((od_ec_enc_tell(&enc) + 7) >> 3)
+        << " (Random seed: " << seed << ")\n";
+    od_ec_dec_init(&dec, ptr, ptr_sz);
+    EXPECT_EQ(od_ec_dec_tell_frac(&dec), tell[0])
+        << "od_ec_dec_tell() mismatch between encoder and decoder "
+           "at symbol 0: "
+        << (unsigned)od_ec_dec_tell_frac(&dec) << " instead of " << tell[0]
+        << " (Random seed: " << seed << ").\n";
+    for (j = 0; j < sz; j++) {
+      int dec_method;
+      if (CDF_SHIFT == 0) {
+        dec_method = 3 + (rand() & 1);
+      } else {
+        dec_method = enc_method[j];
+      }
+      switch (dec_method) {
+        case 3: {
+          sym = od_ec_decode_bool_q15(
+              &dec, OD_ICDF(fz[j] << (CDF_PROB_BITS - fts[j])));
+          break;
+        }
+        case 4: {
+          uint16_t cdf[2];
+          cdf[0] = OD_ICDF(fz[j]);
+          cdf[1] = OD_ICDF(1U << fts[j]);
+          sym = od_ec_decode_cdf_q15(&dec, cdf, 2);
+          break;
+        }
+      }
+
+      EXPECT_EQ(sym, data[j])
+          << "Decoded " << sym << " instead of " << data[j]
+          << " with fz=" << fz[j] << " and ftb=" << fts[j] << "at position "
+          << j << " of " << sz << " (Random seed: " << seed << ").\n"
+          << "Encoding method: " << enc_method[j]
+          << " decoding method: " << dec_method << "\n";
+      EXPECT_EQ(od_ec_dec_tell_frac(&dec), tell[j + 1])
+          << "od_ec_dec_tell() mismatch between encoder and "
+             "decoder at symbol "
+          << j + 1 << ": " << (unsigned)od_ec_dec_tell_frac(&dec)
+          << " instead of " << tell[j + 1] << " (Random seed: " << seed
+          << ").\n";
+    }
+    free(enc_method);
+    free(tell);
+    free(data);
+    free(fts);
+    free(fz);
+  }
+  od_ec_enc_reset(&enc);
+  if (CDF_SHIFT == 0) {
+    od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+    od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+    od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+    od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+    od_ec_encode_bool_q15(&enc, 0, OD_ICDF(24576));
+    od_ec_enc_patch_initial_bits(&enc, 3, 2);
+    EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n";
+    od_ec_enc_patch_initial_bits(&enc, 0, 5);
+    EXPECT_TRUE(enc.error)
+        << "od_ec_enc_patch_initial_bits() didn't fail when it should have.\n";
+    od_ec_enc_reset(&enc);
+    od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+    od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+    od_ec_encode_bool_q15(&enc, 1, OD_ICDF(32256));
+    od_ec_encode_bool_q15(&enc, 0, OD_ICDF(24576));
+    od_ec_enc_patch_initial_bits(&enc, 0, 2);
+    EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n";
+    ptr = od_ec_enc_done(&enc, &ptr_sz);
+    EXPECT_EQ(ptr_sz, 2u);
+    EXPECT_EQ(ptr[0], 63)
+        << "Got " << ptr[0]
+        << " when expecting 63 for od_ec_enc_patch_initial_bits().\n";
+  }
+  od_ec_enc_clear(&enc);
+  EXPECT_EQ(ret, 0);
+}
diff --git a/third_party/aom/test/encode_api_test.cc b/third_party/aom/test/encode_api_test.cc
index 80c42fee4..c469d0871 100644
--- a/third_party/aom/test/encode_api_test.cc
+++ b/third_party/aom/test/encode_api_test.cc
@@ -7,11 +7,12 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "test/util.h"
 #include "aom/aomcx.h"
 #include "aom/aom_encoder.h"
@@ -33,8 +34,8 @@ TEST(EncodeAPI, InvalidParams) {
 
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(NULL, NULL, NULL, 0));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, NULL, NULL, 0));
-  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_encode(NULL, NULL, 0, 0, 0, 0));
-  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_encode(NULL, &img, 0, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_encode(NULL, NULL, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_encode(NULL, &img, 0, 0, 0));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(NULL));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
             aom_codec_enc_config_default(NULL, NULL, 0));
@@ -53,7 +54,7 @@ TEST(EncodeAPI, InvalidParams) {
 
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(kCodecs[i], &cfg, 0));
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0, 0));
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
 
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
   }
diff --git a/third_party/aom/test/encode_perf_test.cc b/third_party/aom/test/encode_perf_test.cc
index 5a37b480b..fe649b153 100644
--- a/third_party/aom/test/encode_perf_test.cc
+++ b/third_party/aom/test/encode_perf_test.cc
@@ -7,12 +7,14 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <string>
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "./aom_config.h"
-#include "./aom_version.h"
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
diff --git a/third_party/aom/test/encode_test_driver.cc b/third_party/aom/test/encode_test_driver.cc
index 6941f0148..b75d7be16 100644
--- a/third_party/aom/test/encode_test_driver.cc
+++ b/third_party/aom/test/encode_test_driver.cc
@@ -7,13 +7,14 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <string>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_ports/mem.h"
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
@@ -34,21 +35,6 @@ void Encoder::InitEncoder(VideoSource *video) {
 
     res = aom_codec_enc_init(&encoder_, CodecInterface(), &cfg_, init_flags_);
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
-
-#if CONFIG_AV1_ENCODER
-    if (CodecInterface() == &aom_codec_av1_cx_algo) {
-// Default to 1 tile column for AV1. With CONFIG_EXT_TILE, the
-// default is already the largest possible tile size
-#if !CONFIG_EXT_TILE
-      const int log2_tile_columns = 0;
-      res = aom_codec_control_(&encoder_, AV1E_SET_TILE_COLUMNS,
-                               log2_tile_columns);
-      ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
-#endif  // !CONFIG_EXT_TILE
-    } else
-#endif
-    {
-    }
   }
 }
 
@@ -82,15 +68,14 @@ void Encoder::EncodeFrameInternal(const VideoSource &video,
   }
 
   // Encode the frame
-  API_REGISTER_STATE_CHECK(res = aom_codec_encode(&encoder_, img, video.pts(),
-                                                  video.duration(), frame_flags,
-                                                  deadline_));
+  API_REGISTER_STATE_CHECK(res =
+                               aom_codec_encode(&encoder_, img, video.pts(),
+                                                video.duration(), frame_flags));
   ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
 }
 
 void Encoder::Flush() {
-  const aom_codec_err_t res =
-      aom_codec_encode(&encoder_, NULL, 0, 0, 0, deadline_);
+  const aom_codec_err_t res = aom_codec_encode(&encoder_, NULL, 0, 0, 0);
   if (!encoder_.priv)
     ASSERT_EQ(AOM_CODEC_ERROR, res) << EncoderError();
   else
@@ -105,11 +90,8 @@ void EncoderTest::InitializeConfig() {
 void EncoderTest::SetMode(TestMode mode) {
   switch (mode) {
     case kOnePassGood:
-    case kTwoPassGood: deadline_ = AOM_DL_GOOD_QUALITY; break;
-    case kRealTime:
-      deadline_ = AOM_DL_GOOD_QUALITY;
-      cfg_.g_lag_in_frames = 0;
-      break;
+    case kTwoPassGood: break;
+    case kRealTime: cfg_.g_lag_in_frames = 0; break;
     default: ASSERT_TRUE(false) << "Unexpected mode " << mode;
   }
   mode_ = mode;
@@ -149,14 +131,16 @@ static bool compare_img(const aom_image_t *img1, const aom_image_t *img2,
                         int *const mismatch_row, int *const mismatch_col,
                         int *const mismatch_plane, int *const mismatch_pix1,
                         int *const mismatch_pix2) {
-  if (img1->fmt != img2->fmt || img1->cs != img2->cs ||
-      img1->d_w != img2->d_w || img1->d_h != img2->d_h) {
+  if (img1->fmt != img2->fmt || img1->cp != img2->cp || img1->tc != img2->tc ||
+      img1->mc != img2->mc || img1->d_w != img2->d_w ||
+      img1->d_h != img2->d_h || img1->monochrome != img2->monochrome) {
     if (mismatch_row != NULL) *mismatch_row = -1;
     if (mismatch_col != NULL) *mismatch_col = -1;
     return false;
   }
 
-  for (int plane = 0; plane < 3; plane++) {
+  const int num_planes = img1->monochrome ? 1 : 3;
+  for (int plane = 0; plane < num_planes; plane++) {
     if (!compare_plane(img1->planes[plane], img1->stride[plane],
                        img2->planes[plane], img2->stride[plane],
                        aom_img_plane_width(img1, plane),
@@ -209,7 +193,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
 
     BeginPassHook(pass);
     testing::internal::scoped_ptr<Encoder> encoder(
-        codec_->CreateEncoder(cfg_, deadline_, init_flags_, &stats_));
+        codec_->CreateEncoder(cfg_, init_flags_, &stats_));
     ASSERT_TRUE(encoder.get() != NULL);
 
     ASSERT_NO_FATAL_FAILURE(video->Begin());
@@ -228,10 +212,11 @@ void EncoderTest::RunLoop(VideoSource *video) {
       dec_init_flags |= AOM_CODEC_USE_INPUT_FRAGMENTS;
     testing::internal::scoped_ptr<Decoder> decoder(
         codec_->CreateDecoder(dec_cfg, dec_init_flags));
-#if CONFIG_AV1 && CONFIG_EXT_TILE
+#if CONFIG_AV1_DECODER
     if (decoder->IsAV1()) {
       // Set dec_cfg.tile_row = -1 and dec_cfg.tile_col = -1 so that the whole
       // frame is decoded.
+      decoder->Control(AV1_SET_TILE_MODE, cfg_.large_scale_tile);
       decoder->Control(AV1_SET_DECODE_TILE_ROW, -1);
       decoder->Control(AV1_SET_DECODE_TILE_COL, -1);
     }
@@ -256,8 +241,16 @@ void EncoderTest::RunLoop(VideoSource *video) {
           case AOM_CODEC_CX_FRAME_PKT:
             has_cxdata = true;
             if (decoder.get() != NULL && DoDecode()) {
-              aom_codec_err_t res_dec = decoder->DecodeFrame(
-                  (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
+              aom_codec_err_t res_dec;
+              if (DoDecodeInvisible()) {
+                res_dec = decoder->DecodeFrame(
+                    (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
+              } else {
+                res_dec = decoder->DecodeFrame(
+                    (const uint8_t *)pkt->data.frame.buf +
+                        (pkt->data.frame.sz - pkt->data.frame.vis_frame_size),
+                    pkt->data.frame.vis_frame_size);
+              }
 
               if (!HandleDecodeResult(res_dec, decoder.get())) break;
 
diff --git a/third_party/aom/test/encode_test_driver.h b/third_party/aom/test/encode_test_driver.h
index 97c1bf860..138cd6a67 100644
--- a/third_party/aom/test/encode_test_driver.h
+++ b/third_party/aom/test/encode_test_driver.h
@@ -16,7 +16,8 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #if CONFIG_AV1_ENCODER
 #include "aom/aomcx.h"
 #endif
@@ -37,6 +38,9 @@ enum TestMode { kRealTime, kOnePassGood, kTwoPassGood };
 
 #define TWO_PASS_TEST_MODES ::testing::Values(::libaom_test::kTwoPassGood)
 
+#define NONREALTIME_TEST_MODES \
+  ::testing::Values(::libaom_test::kOnePassGood, ::libaom_test::kTwoPassGood)
+
 // Provides an object to handle the libaom get_cx_data() iteration pattern
 class CxDataIterator {
  public:
@@ -78,9 +82,9 @@ class TwopassStatsStore {
 // level of abstraction will be fleshed out as more tests are written.
 class Encoder {
  public:
-  Encoder(aom_codec_enc_cfg_t cfg, unsigned long deadline,
-          const unsigned long init_flags, TwopassStatsStore *stats)
-      : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) {
+  Encoder(aom_codec_enc_cfg_t cfg, const uint32_t init_flags,
+          TwopassStatsStore *stats)
+      : cfg_(cfg), init_flags_(init_flags), stats_(stats) {
     memset(&encoder_, 0, sizeof(encoder_));
   }
 
@@ -128,8 +132,6 @@ class Encoder {
     cfg_ = *cfg;
   }
 
-  void set_deadline(unsigned long deadline) { deadline_ = deadline; }
-
  protected:
   virtual aom_codec_iface_t *CodecInterface() const = 0;
 
@@ -147,7 +149,6 @@ class Encoder {
 
   aom_codec_ctx_t encoder_;
   aom_codec_enc_cfg_t cfg_;
-  unsigned long deadline_;
   unsigned long init_flags_;
   TwopassStatsStore *stats_;
 };
@@ -173,7 +174,7 @@ class EncoderTest {
   // Initialize the cfg_ member with the default configuration.
   void InitializeConfig();
 
-  // Map the TestMode enum to the deadline_ and passes_ variables.
+  // Map the TestMode enum to the passes_ variables.
   void SetMode(TestMode mode);
 
   // Set encoder flag.
@@ -206,9 +207,11 @@ class EncoderTest {
     return !(::testing::Test::HasFatalFailure() || abort_);
   }
 
-  const CodecFactory *codec_;
   // Hook to determine whether to decode frame after encoding
-  virtual bool DoDecode() const { return 1; }
+  virtual bool DoDecode() const { return true; }
+
+  // Hook to determine whether to decode invisible frames after encoding
+  virtual bool DoDecodeInvisible() const { return true; }
 
   // Hook to handle encode/decode mismatch
   virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2);
@@ -230,10 +233,10 @@ class EncoderTest {
     return pkt;
   }
 
+  const CodecFactory *codec_;
   bool abort_;
   aom_codec_enc_cfg_t cfg_;
   unsigned int passes_;
-  unsigned long deadline_;
   TwopassStatsStore stats_;
   unsigned long init_flags_;
   unsigned long frame_flags_;
diff --git a/third_party/aom/test/encoder_parms_get_to_decoder.cc b/third_party/aom/test/encoder_parms_get_to_decoder.cc
deleted file mode 100644
index 227ee8246..000000000
--- a/third_party/aom/test/encoder_parms_get_to_decoder.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/util.h"
-#include "test/y4m_video_source.h"
-#include "av1/av1_dx_iface.c"
-
-namespace {
-
-const int kCpuUsed = 2;
-
-struct EncodePerfTestVideo {
-  const char *name;
-  uint32_t width;
-  uint32_t height;
-  uint32_t bitrate;
-  int frames;
-};
-
-const EncodePerfTestVideo kAV1EncodePerfTestVectors[] = {
-  { "niklas_1280_720_30.y4m", 1280, 720, 600, 10 },
-};
-
-struct EncodeParameters {
-  int32_t tile_rows;
-  int32_t tile_cols;
-  int32_t lossless;
-  int32_t error_resilient;
-  int32_t frame_parallel;
-  aom_color_range_t color_range;
-  aom_color_space_t cs;
-#if CONFIG_COLORSPACE_HEADERS
-  aom_transfer_function_t tf;
-  aom_chroma_sample_position_t csp;
-#endif
-  int render_size[2];
-  // TODO(JBB): quantizers / bitrate
-};
-
-const EncodeParameters kAV1EncodeParameterSet[] = {
-  { 0, 0, 0, 1, 0, AOM_CR_STUDIO_RANGE, AOM_CS_BT_601, { 0, 0 } },
-  { 0, 0, 0, 0, 0, AOM_CR_FULL_RANGE, AOM_CS_BT_709, { 0, 0 } },
-#if CONFIG_COLORSPACE_HEADERS
-  { 0, 0, 1, 0, 0, AOM_CR_FULL_RANGE, AOM_CS_BT_2020_NCL, { 0, 0 } },
-#else
-  { 0, 0, 1, 0, 0, AOM_CR_FULL_RANGE, AOM_CS_BT_2020, { 0, 0 } },
-#endif
-  { 0, 2, 0, 0, 1, AOM_CR_STUDIO_RANGE, AOM_CS_UNKNOWN, { 640, 480 } },
-  // TODO(JBB): Test profiles (requires more work).
-};
-
-class AvxEncoderParmsGetToDecoder
-    : public ::libaom_test::CodecTestWith2Params<EncodeParameters,
-                                                 EncodePerfTestVideo>,
-      public ::libaom_test::EncoderTest,
-{
- protected:
-  AvxEncoderParmsGetToDecoder()
-      : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
-
-  virtual ~AvxEncoderParmsGetToDecoder() {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(::libaom_test::kTwoPassGood);
-    cfg_.g_lag_in_frames = 25;
-    cfg_.g_error_resilient = encode_parms.error_resilient;
-    dec_cfg_.threads = 4;
-    test_video_ = GET_PARAM(2);
-    cfg_.rc_target_bitrate = test_video_.bitrate;
-  }
-
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
-    if (video->frame() == 1) {
-      encoder->Control(AV1E_SET_COLOR_SPACE, encode_parms.cs);
-#if CONFIG_COLORSPACE_HEADERS
-      encoder->Control(AV1E_SET_TRANSFER_FUNCTION, encode_parms.tf);
-      encoder->Control(AV1E_SET_CHROMA_SAMPLE_POSITION, encode_parms.csp);
-#endif
-      encoder->Control(AV1E_SET_COLOR_RANGE, encode_parms.color_range);
-      encoder->Control(AV1E_SET_LOSSLESS, encode_parms.lossless);
-      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING,
-                       encode_parms.frame_parallel);
-      encoder->Control(AV1E_SET_TILE_ROWS, encode_parms.tile_rows);
-      encoder->Control(AV1E_SET_TILE_COLUMNS, encode_parms.tile_cols);
-      encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
-      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
-      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
-      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
-      if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0)
-        encoder->Control(AV1E_SET_RENDER_SIZE, encode_parms.render_size);
-    }
-  }
-
-  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  libaom_test::Decoder *decoder) {
-    aom_codec_ctx_t *const av1_decoder = decoder->GetDecoder();
-    aom_codec_alg_priv_t *const priv =
-        reinterpret_cast<aom_codec_alg_priv_t *>(av1_decoder->priv);
-    FrameWorkerData *const worker_data =
-        reinterpret_cast<FrameWorkerData *>(priv->frame_workers[0].data1);
-    AV1_COMMON *const common = &worker_data->pbi->common;
-
-    if (encode_parms.lossless) {
-      EXPECT_EQ(0, common->base_qindex);
-      EXPECT_EQ(0, common->y_dc_delta_q);
-      EXPECT_EQ(0, common->uv_dc_delta_q);
-      EXPECT_EQ(0, common->uv_ac_delta_q);
-      EXPECT_EQ(ONLY_4X4, common->tx_mode);
-    }
-    EXPECT_EQ(encode_parms.error_resilient, common->error_resilient_mode);
-    if (encode_parms.error_resilient) {
-      EXPECT_EQ(0, common->use_prev_frame_mvs);
-    }
-    EXPECT_EQ(encode_parms.color_range, common->color_range);
-    EXPECT_EQ(encode_parms.cs, common->color_space);
-#if CONFIG_COLORSPACE_HEADERS
-    EXPECT_EQ(encode_parms.tf, common->transfer_function);
-    EXPECT_EQ(encode_parms.csp, common->chroma_sample_position);
-#endif
-    if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) {
-      EXPECT_EQ(encode_parms.render_size[0], common->render_width);
-      EXPECT_EQ(encode_parms.render_size[1], common->render_height);
-    }
-    EXPECT_EQ(encode_parms.tile_cols, common->log2_tile_cols);
-    EXPECT_EQ(encode_parms.tile_rows, common->log2_tile_rows);
-
-    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
-    return AOM_CODEC_OK == res_dec;
-  }
-
-  EncodePerfTestVideo test_video_;
-
- private:
-  EncodeParameters encode_parms;
-};
-
-TEST_P(AvxEncoderParmsGetToDecoder, BitstreamParms) {
-  init_flags_ = AOM_CODEC_USE_PSNR;
-
-  testing::internal::scoped_ptr<libaom_test::VideoSource> video(
-      new libaom_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames));
-  ASSERT_TRUE(video.get() != NULL);
-
-  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-AV1_INSTANTIATE_TEST_CASE(AvxEncoderParmsGetToDecoder,
-                          ::testing::ValuesIn(kAV1EncodeParameterSet),
-                          ::testing::ValuesIn(kAV1EncodePerfTestVectors));
-}  // namespace
diff --git a/third_party/aom/test/encodetxb_test.cc b/third_party/aom/test/encodetxb_test.cc
new file mode 100644
index 000000000..ab6ec72c6
--- /dev/null
+++ b/third_party/aom/test/encodetxb_test.cc
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/idct.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+using libaom_test::ACMRandom;
+
+typedef void (*GetNzMapContextsFunc)(const uint8_t *const levels,
+                                     const int16_t *const scan,
+                                     const uint16_t eob, const TX_SIZE tx_size,
+                                     const TX_CLASS tx_class,
+                                     int8_t *const coeff_contexts);
+
+class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> {
+ public:
+  EncodeTxbTest() : get_nz_map_contexts_func_(GetParam()) {}
+
+  virtual ~EncodeTxbTest() {}
+
+  virtual void SetUp() {
+    coeff_contexts_ref_ = reinterpret_cast<int8_t *>(
+        aom_memalign(16, sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE));
+    ASSERT_TRUE(coeff_contexts_ref_ != NULL);
+    coeff_contexts_ = reinterpret_cast<int8_t *>(
+        aom_memalign(16, sizeof(*coeff_contexts_) * MAX_TX_SQUARE));
+    ASSERT_TRUE(coeff_contexts_ != NULL);
+  }
+
+  virtual void TearDown() {
+    aom_free(coeff_contexts_ref_);
+    aom_free(coeff_contexts_);
+    libaom_test::ClearSystemState();
+  }
+
+  void GetNzMapContextsRun() {
+    const int kNumTests = 10;
+    int result = 0;
+
+    for (int is_inter = 0; is_inter < 2; ++is_inter) {
+      for (int tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+        const TX_CLASS tx_class = tx_type_to_class[tx_type];
+        for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
+          const int bwl = get_txb_bwl((TX_SIZE)tx_size);
+          const int width = get_txb_wide((TX_SIZE)tx_size);
+          const int height = get_txb_high((TX_SIZE)tx_size);
+          const int real_width = tx_size_wide[tx_size];
+          const int real_height = tx_size_high[tx_size];
+          const int16_t *const scan = av1_scan_orders[tx_size][tx_type].scan;
+
+          levels_ = set_levels(levels_buf_, width);
+          for (int i = 0; i < kNumTests && !result; ++i) {
+            for (int eob = 1; eob <= width * height && !result; ++eob) {
+              InitDataWithEob(scan, bwl, eob);
+
+              av1_get_nz_map_contexts_c(levels_, scan, eob, (TX_SIZE)tx_size,
+                                        tx_class, coeff_contexts_ref_);
+              get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
+                                        tx_class, coeff_contexts_);
+
+              result = Compare(scan, eob);
+
+              EXPECT_EQ(result, 0)
+                  << " tx_class " << tx_class << " width " << real_width
+                  << " height " << real_height << " eob " << eob;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void SpeedTestGetNzMapContextsRun() {
+    const int kNumTests = 2000000000;
+    aom_usec_timer timer;
+
+    printf("Note: Only test the largest possible eob case!\n");
+    for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
+      const int bwl = get_txb_bwl((TX_SIZE)tx_size);
+      const int width = get_txb_wide((TX_SIZE)tx_size);
+      const int height = get_txb_high((TX_SIZE)tx_size);
+      const int real_width = tx_size_wide[tx_size];
+      const int real_height = tx_size_high[tx_size];
+      const TX_TYPE tx_type = DCT_DCT;
+      const TX_CLASS tx_class = tx_type_to_class[tx_type];
+      const int16_t *const scan = av1_scan_orders[tx_size][tx_type].scan;
+      const int eob = width * height;
+      const int numTests = kNumTests / (width * height);
+
+      levels_ = set_levels(levels_buf_, width);
+      InitDataWithEob(scan, bwl, eob);
+
+      aom_usec_timer_start(&timer);
+      for (int i = 0; i < numTests; ++i) {
+        get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
+                                  tx_class, coeff_contexts_);
+      }
+      aom_usec_timer_mark(&timer);
+
+      const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+      printf("get_nz_map_contexts_%2dx%2d: %7.1f ms\n", real_width, real_height,
+             elapsed_time / 1000.0);
+    }
+  }
+
+ private:
+  void InitDataWithEob(const int16_t *const scan, const int bwl,
+                       const int eob) {
+    memset(levels_buf_, 0, sizeof(levels_buf_));
+    memset(coeff_contexts_, 0, sizeof(*coeff_contexts_) * MAX_TX_SQUARE);
+
+    for (int c = 0; c < eob; ++c) {
+      levels_[get_padded_idx(scan[c], bwl)] =
+          static_cast<uint8_t>(clamp(rnd_.Rand8(), 0, INT8_MAX));
+      coeff_contexts_[scan[c]] = rnd_.Rand16() >> 1;
+    }
+
+    memcpy(coeff_contexts_ref_, coeff_contexts_,
+           sizeof(*coeff_contexts_) * MAX_TX_SQUARE);
+  }
+
+  bool Compare(const int16_t *const scan, const int eob) const {
+    bool result = false;
+    if (memcmp(coeff_contexts_, coeff_contexts_ref_,
+               sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE)) {
+      for (int i = 0; i < eob; i++) {
+        const int pos = scan[i];
+        if (coeff_contexts_ref_[pos] != coeff_contexts_[pos]) {
+          printf("coeff_contexts_[%d] diff:%6d (ref),%6d (opt)\n", pos,
+                 coeff_contexts_ref_[pos], coeff_contexts_[pos]);
+          result = true;
+          break;
+        }
+      }
+    }
+    return result;
+  }
+
+  GetNzMapContextsFunc get_nz_map_contexts_func_;
+  ACMRandom rnd_;
+  uint8_t levels_buf_[TX_PAD_2D];
+  uint8_t *levels_;
+  int8_t *coeff_contexts_ref_;
+  int8_t *coeff_contexts_;
+};
+
+TEST_P(EncodeTxbTest, GetNzMapContexts) { GetNzMapContextsRun(); }
+
+TEST_P(EncodeTxbTest, DISABLED_SpeedTestGetNzMapContexts) {
+  SpeedTestGetNzMapContextsRun();
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, EncodeTxbTest,
+                        ::testing::Values(av1_get_nz_map_contexts_sse2));
+#endif
+
+#if HAVE_SSE4_1
+class EncodeTxbInitLevelTest : public ::testing::TestWithParam<int> {
+ public:
+  virtual ~EncodeTxbInitLevelTest() {}
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  void RunTest(int tx_size, int is_speed);
+};
+
+void EncodeTxbInitLevelTest::RunTest(int tx_size, int is_speed) {
+  const int width = get_txb_wide((TX_SIZE)tx_size);
+  const int height = get_txb_high((TX_SIZE)tx_size);
+  tran_low_t coeff[MAX_TX_SQUARE];
+
+  uint8_t levels_buf[2][TX_PAD_2D];
+  uint8_t *const levels0 = set_levels(levels_buf[0], width);
+  uint8_t *const levels1 = set_levels(levels_buf[1], width);
+
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int i = 0; i < width * height; i++) {
+    coeff[i] = rnd.Rand15Signed() + rnd.Rand15Signed();
+  }
+  for (int i = 0; i < TX_PAD_2D; i++) {
+    levels_buf[0][i] = rnd.Rand8();
+    levels_buf[1][i] = rnd.Rand8();
+  }
+  const int run_times = is_speed ? (width * height) * 10000 : 1;
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    av1_txb_init_levels_c(coeff, width, height, levels0);
+  }
+  const double t1 = get_time_mark(&timer);
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    av1_txb_init_levels_sse4_1(coeff, width, height, levels1);
+  }
+  const double t2 = get_time_mark(&timer);
+  if (is_speed) {
+    printf("init %3dx%-3d:%7.2f/%7.2fns", width, height, t1, t2);
+    printf("(%3.2f)\n", t1 / t2);
+  }
+  const int stride = width + TX_PAD_HOR;
+  for (int r = 0; r < height + TX_PAD_VER; ++r) {
+    for (int c = 0; c < stride; ++c) {
+      ASSERT_EQ(levels_buf[0][c + r * stride], levels_buf[1][c + r * stride])
+          << "[" << r << "," << c << "] " << run_times << width << "x"
+          << height;
+    }
+  }
+}
+
+TEST_P(EncodeTxbInitLevelTest, match) { RunTest(GetParam(), 0); }
+TEST_P(EncodeTxbInitLevelTest, DISABLED_Speed) { RunTest(GetParam(), 1); }
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, EncodeTxbInitLevelTest,
+                        ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1));
+#endif
+
+}  // namespace
diff --git a/third_party/aom/test/end_to_end_test.cc b/third_party/aom/test/end_to_end_test.cc
index e1a833ec4..1ac0ae931 100644
--- a/third_party/aom/test/end_to_end_test.cc
+++ b/third_party/aom/test/end_to_end_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -30,7 +30,7 @@ const double kPsnrThreshold[][5] = {
 // AV1 HBD average PSNR is slightly lower than AV1.
 // We make two cases here to enable the testing and
 // guard picture quality.
-#if CONFIG_AV1_ENCODER && CONFIG_HIGHBITDEPTH
+#if CONFIG_AV1_ENCODER
   { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 31.0, 36.0, 36.0, 36.0, 36.0 },
   { 31.0, 35.0, 35.0, 35.0, 35.0 }, { 31.0, 34.0, 34.0, 34.0, 34.0 },
   { 31.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
@@ -40,7 +40,7 @@ const double kPsnrThreshold[][5] = {
   { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
   { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
   { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
-#endif  // CONFIG_HIGHBITDEPTH && CONFIG_AV1_ENCODER
+#endif  // CONFIG_AV1_ENCODER
 };
 
 typedef struct {
@@ -53,24 +53,20 @@ typedef struct {
 
 const TestVideoParam kTestVectors[] = {
   { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
-  { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 1 },
+  { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
   { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
-  { "park_joy_90p_8_440.yuv", 8, AOM_IMG_FMT_I440, AOM_BITS_8, 1 },
-#if CONFIG_HIGHBITDEPTH
-  { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 2 },
-  { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 3 },
-  { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 3 },
-  { "park_joy_90p_10_440.yuv", 10, AOM_IMG_FMT_I44016, AOM_BITS_10, 3 },
+  { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
+  { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
+  { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
   { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
-  { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 3 },
-  { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 3 },
-  { "park_joy_90p_12_440.yuv", 12, AOM_IMG_FMT_I44016, AOM_BITS_12, 3 },
-#endif  // CONFIG_HIGHBITDEPTH
+  { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
 };
 
 // Encoding modes tested
 const libaom_test::TestMode kEncodingModeVectors[] = {
-  ::libaom_test::kTwoPassGood, ::libaom_test::kOnePassGood,
+  ::libaom_test::kTwoPassGood,
+  ::libaom_test::kOnePassGood,
   ::libaom_test::kRealTime,
 };
 
@@ -150,6 +146,32 @@ class EndToEndTest
     return kPsnrThreshold[cpu_used_][encoding_mode_];
   }
 
+  void DoTest() {
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    testing::internal::scoped_ptr<libaom_test::VideoSource> video;
+    if (is_extension_y4m(test_video_param_.filename)) {
+      video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                  kFrames));
+    } else {
+      video.reset(new libaom_test::YUVVideoSource(
+          test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
+          kFramerate, 1, 0, kFrames));
+    }
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "cpu used = " << cpu_used_ << ", encoding mode = " << encoding_mode_;
+  }
+
   TestVideoParam test_video_param_;
   int cpu_used_;
 
@@ -161,55 +183,9 @@ class EndToEndTest
 
 class EndToEndTestLarge : public EndToEndTest {};
 
-TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
-  cfg_.rc_target_bitrate = kBitrate;
-  cfg_.g_error_resilient = 0;
-  cfg_.g_profile = test_video_param_.profile;
-  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
-  cfg_.g_bit_depth = test_video_param_.bit_depth;
-  init_flags_ = AOM_CODEC_USE_PSNR;
-  if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
-
-  testing::internal::scoped_ptr<libaom_test::VideoSource> video;
-  if (is_extension_y4m(test_video_param_.filename)) {
-    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
-                                                kFrames));
-  } else {
-    video.reset(new libaom_test::YUVVideoSource(
-        test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
-        kFramerate, 1, 0, kFrames));
-  }
-  ASSERT_TRUE(video.get() != NULL);
+TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); }
 
-  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-  const double psnr = GetAveragePsnr();
-  EXPECT_GT(psnr, GetPsnrThreshold());
-}
-
-TEST_P(EndToEndTest, EndtoEndPSNRTest) {
-  cfg_.rc_target_bitrate = kBitrate;
-  cfg_.g_error_resilient = 0;
-  cfg_.g_profile = test_video_param_.profile;
-  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
-  cfg_.g_bit_depth = test_video_param_.bit_depth;
-  init_flags_ = AOM_CODEC_USE_PSNR;
-  if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
-
-  testing::internal::scoped_ptr<libaom_test::VideoSource> video;
-  if (is_extension_y4m(test_video_param_.filename)) {
-    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
-                                                kFrames));
-  } else {
-    video.reset(new libaom_test::YUVVideoSource(
-        test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
-        kFramerate, 1, 0, kFrames));
-  }
-  ASSERT_TRUE(video.get() != NULL);
-
-  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-  const double psnr = GetAveragePsnr();
-  EXPECT_GT(psnr, GetPsnrThreshold());
-}
+TEST_P(EndToEndTest, EndtoEndPSNRTest) { DoTest(); }
 
 AV1_INSTANTIATE_TEST_CASE(EndToEndTestLarge,
                           ::testing::ValuesIn(kEncodingModeVectors),
diff --git a/third_party/aom/test/error_block_test.cc b/third_party/aom/test/error_block_test.cc
index 227065fa9..353947c3d 100644
--- a/third_party/aom/test/error_block_test.cc
+++ b/third_party/aom/test/error_block_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <cmath>
 #include <cstdlib>
@@ -15,8 +15,9 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -28,14 +29,13 @@
 using libaom_test::ACMRandom;
 
 namespace {
-#if CONFIG_HIGHBITDEPTH
 const int kNumIterations = 1000;
 
 typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
                                   const tran_low_t *dqcoeff,
                                   intptr_t block_size, int64_t *ssz, int bps);
 
-typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>
+typedef ::testing::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>
     ErrorBlockParam;
 
 class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
@@ -156,8 +156,8 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
       << "First failed at test case " << first_failure;
 }
 
-#if HAVE_SSE2 || HAVE_AVX
-using std::tr1::make_tuple;
+#if (HAVE_SSE2 || HAVE_AVX)
+using ::testing::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, ErrorBlockTest,
@@ -168,6 +168,4 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(&av1_highbd_block_error_sse2,
                                  &av1_highbd_block_error_c, AOM_BITS_8)));
 #endif  // HAVE_SSE2
-
-#endif  // CONFIG_HIGHBITDEPTH
 }  // namespace
diff --git a/third_party/aom/test/error_resilience_test.cc b/third_party/aom/test/error_resilience_test.cc
index e9abdde6d..13ac0bf93 100644
--- a/third_party/aom/test/error_resilience_test.cc
+++ b/third_party/aom/test/error_resilience_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
@@ -18,7 +18,13 @@
 namespace {
 
 const int kMaxErrorFrames = 12;
+const int kMaxInvisibleErrorFrames = 12;
 const int kMaxDroppableFrames = 12;
+const int kMaxErrorResilientFrames = 12;
+const int kMaxNoMFMVFrames = 12;
+const int kMaxPrimRefNoneFrames = 12;
+const int kMaxSFrames = 12;
+const int kCpuUsed = 1;
 
 class ErrorResilienceTestLarge
     : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
@@ -26,7 +32,7 @@ class ErrorResilienceTestLarge
  protected:
   ErrorResilienceTestLarge()
       : EncoderTest(GET_PARAM(0)), psnr_(0.0), nframes_(0), mismatch_psnr_(0.0),
-        mismatch_nframes_(0), encoding_mode_(GET_PARAM(1)) {
+        mismatch_nframes_(0), encoding_mode_(GET_PARAM(1)), allow_mismatch_(0) {
     Reset();
   }
 
@@ -34,8 +40,21 @@ class ErrorResilienceTestLarge
 
   void Reset() {
     error_nframes_ = 0;
+    invisible_error_nframes_ = 0;
     droppable_nframes_ = 0;
-    pattern_switch_ = 0;
+    error_resilient_nframes_ = 0;
+    nomfmv_nframes_ = 0;
+    prim_ref_none_nframes_ = 0;
+    s_nframes_ = 0;
+  }
+
+  void SetupEncoder(int bitrate, int lag) {
+    const aom_rational timebase = { 33333333, 1000000000 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_target_bitrate = bitrate;
+    cfg_.kf_mode = AOM_KF_DISABLED;
+    cfg_.g_lag_in_frames = lag;
+    init_flags_ = AOM_CODEC_USE_PSNR;
   }
 
   virtual void SetUp() {
@@ -46,6 +65,7 @@ class ErrorResilienceTestLarge
   virtual void BeginPassHook(unsigned int /*pass*/) {
     psnr_ = 0.0;
     nframes_ = 0;
+    decoded_nframes_ = 0;
     mismatch_psnr_ = 0.0;
     mismatch_nframes_ = 0;
   }
@@ -55,18 +75,71 @@ class ErrorResilienceTestLarge
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video) {
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                                  libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
     frame_flags_ &=
-        ~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF);
+        ~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+          AOM_EFLAG_NO_REF_FRAME_MVS | AOM_EFLAG_ERROR_RESILIENT |
+          AOM_EFLAG_SET_S_FRAME | AOM_EFLAG_SET_PRIMARY_REF_NONE);
     if (droppable_nframes_ > 0 &&
         (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
       for (unsigned int i = 0; i < droppable_nframes_; ++i) {
         if (droppable_frames_[i] == video->frame()) {
-          std::cout << "Encoding droppable frame: " << droppable_frames_[i]
-                    << "\n";
+          std::cout << "             Encoding droppable frame: "
+                    << droppable_frames_[i] << "\n";
           frame_flags_ |= (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
                            AOM_EFLAG_NO_UPD_ARF);
-          return;
+          break;
+        }
+      }
+    }
+
+    if (error_resilient_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < error_resilient_nframes_; ++i) {
+        if (error_resilient_frames_[i] == video->frame()) {
+          std::cout << "             Encoding error_resilient frame: "
+                    << error_resilient_frames_[i] << "\n";
+          frame_flags_ |= AOM_EFLAG_ERROR_RESILIENT;
+          break;
+        }
+      }
+    }
+
+    if (nomfmv_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < nomfmv_nframes_; ++i) {
+        if (nomfmv_frames_[i] == video->frame()) {
+          std::cout << "             Encoding no mfmv frame: "
+                    << nomfmv_frames_[i] << "\n";
+          frame_flags_ |= AOM_EFLAG_NO_REF_FRAME_MVS;
+          break;
+        }
+      }
+    }
+
+    if (prim_ref_none_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < prim_ref_none_nframes_; ++i) {
+        if (prim_ref_none_frames_[i] == video->frame()) {
+          std::cout << "             Encoding no PRIMARY_REF_NONE frame: "
+                    << prim_ref_none_frames_[i] << "\n";
+          frame_flags_ |= AOM_EFLAG_SET_PRIMARY_REF_NONE;
+          break;
+        }
+      }
+    }
+
+    encoder->Control(AV1E_SET_S_FRAME_MODE, 0);
+    if (s_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < s_nframes_; ++i) {
+        if (s_frames_[i] == video->frame()) {
+          std::cout << "             Encoding S frame: " << s_frames_[i]
+                    << "\n";
+          frame_flags_ |= AOM_EFLAG_SET_S_FRAME;
+          break;
         }
       }
     }
@@ -96,12 +169,37 @@ class ErrorResilienceTestLarge
     return 1;
   }
 
+  virtual bool DoDecodeInvisible() const {
+    if (invisible_error_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < invisible_error_nframes_; ++i) {
+        if (invisible_error_frames_[i] == nframes_ - 1) {
+          std::cout << "             Skipping decoding all invisible frames in "
+                       "frame pkt: "
+                    << invisible_error_frames_[i] << "\n";
+          return 0;
+        }
+      }
+    }
+    return 1;
+  }
+
   virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
-    double mismatch_psnr = compute_psnr(img1, img2);
-    mismatch_psnr_ += mismatch_psnr;
-    ++mismatch_nframes_;
-    // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n";
-    ::libaom_test::EncoderTest::MismatchHook(img1, img2);
+    if (allow_mismatch_) {
+      double mismatch_psnr = compute_psnr(img1, img2);
+      mismatch_psnr_ += mismatch_psnr;
+      ++mismatch_nframes_;
+      // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n";
+    } else {
+      ::libaom_test::EncoderTest::MismatchHook(img1, img2);
+    }
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     aom_codec_pts_t pts) {
+    (void)img;
+    (void)pts;
+    ++decoded_nframes_;
   }
 
   void SetErrorFrames(int num, unsigned int *list) {
@@ -114,6 +212,16 @@ class ErrorResilienceTestLarge
       error_frames_[i] = list[i];
   }
 
+  void SetInvisibleErrorFrames(int num, unsigned int *list) {
+    if (num > kMaxInvisibleErrorFrames)
+      num = kMaxInvisibleErrorFrames;
+    else if (num < 0)
+      num = 0;
+    invisible_error_nframes_ = num;
+    for (unsigned int i = 0; i < invisible_error_nframes_; ++i)
+      invisible_error_frames_[i] = list[i];
+  }
+
   void SetDroppableFrames(int num, unsigned int *list) {
     if (num > kMaxDroppableFrames)
       num = kMaxDroppableFrames;
@@ -124,42 +232,93 @@ class ErrorResilienceTestLarge
       droppable_frames_[i] = list[i];
   }
 
+  void SetErrorResilientFrames(int num, unsigned int *list) {
+    if (num > kMaxErrorResilientFrames)
+      num = kMaxErrorResilientFrames;
+    else if (num < 0)
+      num = 0;
+    error_resilient_nframes_ = num;
+    for (unsigned int i = 0; i < error_resilient_nframes_; ++i)
+      error_resilient_frames_[i] = list[i];
+  }
+
+  void SetNoMFMVFrames(int num, unsigned int *list) {
+    if (num > kMaxNoMFMVFrames)
+      num = kMaxNoMFMVFrames;
+    else if (num < 0)
+      num = 0;
+    nomfmv_nframes_ = num;
+    for (unsigned int i = 0; i < nomfmv_nframes_; ++i)
+      nomfmv_frames_[i] = list[i];
+  }
+
+  void SetPrimaryRefNoneFrames(int num, unsigned int *list) {
+    if (num > kMaxPrimRefNoneFrames)
+      num = kMaxPrimRefNoneFrames;
+    else if (num < 0)
+      num = 0;
+    prim_ref_none_nframes_ = num;
+    for (unsigned int i = 0; i < prim_ref_none_nframes_; ++i)
+      prim_ref_none_frames_[i] = list[i];
+  }
+
+  void SetSFrames(int num, unsigned int *list) {
+    if (num > kMaxSFrames)
+      num = kMaxSFrames;
+    else if (num < 0)
+      num = 0;
+    s_nframes_ = num;
+    for (unsigned int i = 0; i < s_nframes_; ++i) s_frames_[i] = list[i];
+  }
+
   unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+  unsigned int GetEncodedFrames() { return nframes_; }
+  unsigned int GetDecodedFrames() { return decoded_nframes_; }
 
-  void SetPatternSwitch(int frame_switch) { pattern_switch_ = frame_switch; }
+  void SetAllowMismatch(int allow) { allow_mismatch_ = allow; }
 
  private:
   double psnr_;
   unsigned int nframes_;
+  unsigned int decoded_nframes_;
   unsigned int error_nframes_;
+  unsigned int invisible_error_nframes_;
   unsigned int droppable_nframes_;
-  unsigned int pattern_switch_;
+  unsigned int error_resilient_nframes_;
+  unsigned int nomfmv_nframes_;
+  unsigned int prim_ref_none_nframes_;
+  unsigned int s_nframes_;
   double mismatch_psnr_;
   unsigned int mismatch_nframes_;
   unsigned int error_frames_[kMaxErrorFrames];
+  unsigned int invisible_error_frames_[kMaxInvisibleErrorFrames];
   unsigned int droppable_frames_[kMaxDroppableFrames];
+  unsigned int error_resilient_frames_[kMaxErrorResilientFrames];
+  unsigned int nomfmv_frames_[kMaxNoMFMVFrames];
+  unsigned int prim_ref_none_frames_[kMaxPrimRefNoneFrames];
+  unsigned int s_frames_[kMaxSFrames];
   libaom_test::TestMode encoding_mode_;
+  int allow_mismatch_;
 };
 
 TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
-  const aom_rational timebase = { 33333333, 1000000000 };
-  cfg_.g_timebase = timebase;
-  cfg_.rc_target_bitrate = 2000;
-  cfg_.g_lag_in_frames = 10;
-
-  init_flags_ = AOM_CODEC_USE_PSNR;
-
+  SetupEncoder(2000, 10);
   libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                     timebase.den, timebase.num, 0, 12);
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 12);
 
-  // Error resilient mode OFF.
+  // Global error resilient mode OFF.
   cfg_.g_error_resilient = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   const double psnr_resilience_off = GetAveragePsnr();
   EXPECT_GT(psnr_resilience_off, 25.0);
 
-  // Error resilient mode ON.
-  cfg_.g_error_resilient = 1;
+  Reset();
+  // Error resilient mode ON for certain frames
+  unsigned int num_error_resilient_frames = 5;
+  unsigned int error_resilient_frame_list[] = { 3, 5, 6, 9, 11 };
+  SetErrorResilientFrames(num_error_resilient_frames,
+                          error_resilient_frame_list);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   const double psnr_resilience_on = GetAveragePsnr();
   EXPECT_GT(psnr_resilience_on, 25.0);
@@ -175,60 +334,105 @@ TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
 // Check for successful decoding and no encoder/decoder mismatch
 // if we lose (i.e., drop before decoding) a set of droppable
 // frames (i.e., frames that don't update any reference buffers).
-// Check both isolated and consecutive loss.
 TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) {
-  const aom_rational timebase = { 33333333, 1000000000 };
-  cfg_.g_timebase = timebase;
-  cfg_.rc_target_bitrate = 500;
-  // FIXME(debargha): Fix this to work for any lag.
-  // Currently this test only works for lag = 0
-  cfg_.g_lag_in_frames = 0;
-
-  init_flags_ = AOM_CODEC_USE_PSNR;
-
+  SetupEncoder(500, 10);
   libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                     timebase.den, timebase.num, 0, 20);
-
-  // Error resilient mode ON.
-  cfg_.g_error_resilient = 1;
-  cfg_.kf_mode = AOM_KF_DISABLED;
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 20);
 
   // Set an arbitrary set of error frames same as droppable frames.
-  // In addition to isolated loss/drop, add a long consecutive series
-  // (of size 9) of dropped frames.
-  unsigned int num_droppable_frames = 5;
-  unsigned int droppable_frame_list[] = { 5, 10, 13, 16, 19 };
+  unsigned int num_droppable_frames = 3;
+  unsigned int droppable_frame_list[] = { 5, 10, 13 };
   SetDroppableFrames(num_droppable_frames, droppable_frame_list);
   SetErrorFrames(num_droppable_frames, droppable_frame_list);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   // Test that no mismatches have been found
+  std::cout << "             Encoded frames: " << GetEncodedFrames() << "\n";
+  std::cout << "             Decoded frames: " << GetDecodedFrames() << "\n";
   std::cout << "             Mismatch frames: " << GetMismatchFrames() << "\n";
-  EXPECT_EQ(GetMismatchFrames(), (unsigned int)0);
-
-  // Reset previously set of error/droppable frames.
-  Reset();
+  EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_droppable_frames);
+}
 
-#if 0
-  // TODO(jkoleszar): This test is disabled for the time being as too
-  // sensitive. It's not clear how to set a reasonable threshold for
-  // this behavior.
+// Check for ParseAbility property of an error-resilient frame.
+// Encode a frame in error-resilient mode (E-frame), and disallow all
+// subsequent frames from using MFMV. If frames are dropped before the
+// E frame, all frames starting from the E frame should be parse-able.
+TEST_P(ErrorResilienceTestLarge, ParseAbilityTest) {
+  SetupEncoder(500, 10);
 
-  // Now set an arbitrary set of error frames that are non-droppable
-  unsigned int num_error_frames = 3;
-  unsigned int error_frame_list[] = {3, 10, 20};
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 15);
+
+  SetAllowMismatch(1);
+
+  // Note that an E-frame cannot be forced on a frame that is a
+  // show_existing_frame, or a frame that comes directly after an invisible
+  // frame. Currently, this will cause an assertion failure.
+  // Set an arbitrary error resilient (E) frame
+  unsigned int num_error_resilient_frames = 1;
+  unsigned int error_resilient_frame_list[] = { 8 };
+  SetErrorResilientFrames(num_error_resilient_frames,
+                          error_resilient_frame_list);
+  // Ensure that any invisible frames before the E frame are dropped
+  SetInvisibleErrorFrames(num_error_resilient_frames,
+                          error_resilient_frame_list);
+  // Set all frames after the error resilient frame to not allow MFMV
+  unsigned int num_post_error_resilient_frames = 6;
+  unsigned int post_error_resilient_frame_list[] = { 9, 10, 11, 12, 13, 14 };
+  SetNoMFMVFrames(num_post_error_resilient_frames,
+                  post_error_resilient_frame_list);
+
+  // Set a few frames before the E frame that are lost (not decoded)
+  unsigned int num_error_frames = 5;
+  unsigned int error_frame_list[] = { 3, 4, 5, 6, 7 };
   SetErrorFrames(num_error_frames, error_frame_list);
+
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  std::cout << "             Encoded frames: " << GetEncodedFrames() << "\n";
+  std::cout << "             Decoded frames: " << GetDecodedFrames() << "\n";
+  std::cout << "             Mismatch frames: " << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_error_frames);
+  // All frames following the E-frame and the E-frame are expected to have
+  // mismatches, but still be parse-able.
+  EXPECT_LE(GetMismatchFrames(), num_post_error_resilient_frames + 1);
+}
 
-  // Test that dropping an arbitrary set of inter frames does not hurt too much
-  // Note the Average Mismatch PSNR is the average of the PSNR between
-  // decoded frame and encoder's version of the same frame for all frames
-  // with mismatch.
-  const double psnr_resilience_mismatch = GetAverageMismatchPsnr();
-  std::cout << "             Mismatch PSNR: "
-            << psnr_resilience_mismatch << "\n";
-  EXPECT_GT(psnr_resilience_mismatch, 20.0);
-#endif
+// Check for ParseAbility property of an S frame.
+// Encode an S-frame. If frames are dropped before the S-frame, all frames
+// starting from the S frame should be parse-able.
+TEST_P(ErrorResilienceTestLarge, SFrameTest) {
+  SetupEncoder(500, 10);
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 15);
+
+  SetAllowMismatch(1);
+
+  // Note that an S-frame cannot be forced on a frame that is a
+  // show_existing_frame. This issue still needs to be addressed.
+  // Set an arbitrary S-frame
+  unsigned int num_s_frames = 1;
+  unsigned int s_frame_list[] = { 6 };
+  SetSFrames(num_s_frames, s_frame_list);
+  // Ensure that any invisible frames before the S frame are dropped
+  SetInvisibleErrorFrames(num_s_frames, s_frame_list);
+
+  // Set a few frames before the S frame that are lost (not decoded)
+  unsigned int num_error_frames = 4;
+  unsigned int error_frame_list[] = { 2, 3, 4, 5 };
+  SetErrorFrames(num_error_frames, error_frame_list);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  std::cout << "             Encoded frames: " << GetEncodedFrames() << "\n";
+  std::cout << "             Decoded frames: " << GetDecodedFrames() << "\n";
+  std::cout << "             Mismatch frames: " << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_error_frames);
+  // All frames following the S-frame and the S-frame are expected to have
+  // mismatches, but still be parse-able.
+  EXPECT_LE(GetMismatchFrames(), GetEncodedFrames() - s_frame_list[0]);
 }
 
-AV1_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
+AV1_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, NONREALTIME_TEST_MODES);
 }  // namespace
diff --git a/third_party/aom/test/ethread_test.cc b/third_party/aom/test/ethread_test.cc
index 86eb3228e..3dcc2a707 100644
--- a/third_party/aom/test/ethread_test.cc
+++ b/third_party/aom/test/ethread_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <string>
 #include <vector>
@@ -16,7 +16,7 @@
 #include "test/encode_test_driver.h"
 #include "test/md5_helper.h"
 #include "test/util.h"
-#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
 
 namespace {
 class AVxEncoderThreadTest
@@ -32,12 +32,10 @@ class AVxEncoderThreadTest
     cfg.h = 720;
     cfg.allow_lowbitdepth = 1;
     decoder_ = codec_->CreateDecoder(cfg, 0);
-#if CONFIG_AV1
     if (decoder_->IsAV1()) {
       decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
       decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
     }
-#endif
 
     size_enc_.clear();
     md5_dec_.clear();
@@ -71,9 +69,6 @@ class AVxEncoderThreadTest
                                   ::libaom_test::Encoder *encoder) {
     if (!encoder_initialized_) {
       SetTileSize(encoder);
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-      encoder->Control(AV1E_SET_TILE_LOOPFILTER, 0);
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       if (encoding_mode_ != ::libaom_test::kRealTime) {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
@@ -118,7 +113,8 @@ class AVxEncoderThreadTest
   }
 
   void DoTest() {
-    ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 18);
+    ::libaom_test::YUVVideoSource video(
+        "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 15, 18);
     cfg_.rc_target_bitrate = 1000;
 
     // Encode using single thread.
@@ -164,18 +160,16 @@ class AVxEncoderThreadTest
 };
 
 TEST_P(AVxEncoderThreadTest, EncoderResultTest) {
-#if CONFIG_AV1 && CONFIG_EXT_TILE
   cfg_.large_scale_tile = 0;
-#endif  // CONFIG_AV1 && CONFIG_EXT_TILE
+  decoder_->Control(AV1_SET_TILE_MODE, 0);
   DoTest();
 }
 
 class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {};
 
 TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) {
-#if CONFIG_AV1 && CONFIG_EXT_TILE
   cfg_.large_scale_tile = 0;
-#endif  // CONFIG_AV1 && CONFIG_EXT_TILE
+  decoder_->Control(AV1_SET_TILE_MODE, 0);
   DoTest();
 }
 
@@ -190,7 +184,6 @@ AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge,
                                             ::libaom_test::kOnePassGood),
                           ::testing::Range(0, 2));
 
-#if CONFIG_AV1 && CONFIG_EXT_TILE
 class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {
   virtual void SetTileSize(libaom_test::Encoder *encoder) {
     encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
@@ -200,15 +193,17 @@ class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {
   }
 };
 
-TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) {
+TEST_P(AVxEncoderThreadLSTest, DISABLED_EncoderResultTest) {
   cfg_.large_scale_tile = 1;
+  decoder_->Control(AV1_SET_TILE_MODE, 1);
   DoTest();
 }
 
 class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {};
 
-TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) {
+TEST_P(AVxEncoderThreadLSTestLarge, DISABLED_EncoderResultTest) {
   cfg_.large_scale_tile = 1;
+  decoder_->Control(AV1_SET_TILE_MODE, 1);
   DoTest();
 }
 
@@ -220,5 +215,4 @@ AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTestLarge,
                           ::testing::Values(::libaom_test::kTwoPassGood,
                                             ::libaom_test::kOnePassGood),
                           ::testing::Range(0, 2));
-#endif  // CONFIG_AV1 && CONFIG_EXT_TILE
 }  // namespace
diff --git a/third_party/aom/test/examples.sh b/third_party/aom/test/examples.sh
index d3152be7d..2cdb89dd0 100755
--- a/third_party/aom/test/examples.sh
+++ b/third_party/aom/test/examples.sh
@@ -12,10 +12,10 @@
 ##
 . $(dirname $0)/tools_common.sh
 
-example_tests=$(ls $(dirname $0)/*.sh)
+example_tests=$(ls -r $(dirname $0)/*.sh)
 
 # List of script names to exclude.
-exclude_list="examples tools_common decode_to_md5"
+exclude_list="best_encode examples run_encodes tools_common"
 
 # Filter out the scripts in $exclude_list.
 for word in ${exclude_list}; do
diff --git a/third_party/aom/test/fdct4x4_test.cc b/third_party/aom/test/fdct4x4_test.cc
deleted file mode 100644
index 5fad1667b..000000000
--- a/third_party/aom/test/fdct4x4_test.cc
+++ /dev/null
@@ -1,350 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/transform_test_base.h"
-#include "test/util.h"
-#include "av1/common/entropy.h"
-#include "aom/aom_codec.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-using libaom_test::FhtFunc;
-
-typedef std::tr1::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int>
-    Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t, int>
-    Ht4x4Param;
-
-void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                 TxfmParam * /*txfm_param*/) {
-  aom_fdct4x4_c(in, out, stride);
-}
-
-void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                TxfmParam *txfm_param) {
-  av1_fht4x4_c(in, out, stride, txfm_param);
-}
-
-void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                 TxfmParam * /*txfm_param*/) {
-  av1_fwht4x4_c(in, out, stride);
-}
-
-#if CONFIG_HIGHBITDEPTH
-void fht4x4_10(const int16_t *in, tran_low_t *out, int stride,
-               TxfmParam *txfm_param) {
-  av1_fwd_txfm2d_4x4_c(in, out, stride, txfm_param->tx_type, 10);
-}
-
-void fht4x4_12(const int16_t *in, tran_low_t *out, int stride,
-               TxfmParam *txfm_param) {
-  av1_fwd_txfm2d_4x4_c(in, out, stride, txfm_param->tx_type, 12);
-}
-
-void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride,
-               const TxfmParam *txfm_param) {
-  av1_inv_txfm2d_add_4x4_c(in, CONVERT_TO_SHORTPTR(out), stride,
-                           txfm_param->tx_type, 10);
-}
-
-void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride,
-               const TxfmParam *txfm_param) {
-  av1_inv_txfm2d_add_4x4_c(in, CONVERT_TO_SHORTPTR(out), stride,
-                           txfm_param->tx_type, 12);
-}
-
-void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
-  aom_highbd_iwht4x4_16_add_c(in, out, stride, 10);
-}
-
-void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
-  aom_highbd_iwht4x4_16_add_c(in, out, stride, 12);
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-class Trans4x4DCT : public libaom_test::TransformTestBase,
-                    public ::testing::TestWithParam<Dct4x4Param> {
- public:
-  virtual ~Trans4x4DCT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 4;
-    height_ = 4;
-    fwd_txfm_ref = fdct4x4_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride);
-  }
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride);
-  }
-
-  FdctFunc fwd_txfm_;
-  IdctFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4DCT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); }
-
-TEST_P(Trans4x4DCT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4DCT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4DCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-
-class Trans4x4HT : public libaom_test::TransformTestBase,
-                   public ::testing::TestWithParam<Ht4x4Param> {
- public:
-  virtual ~Trans4x4HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 4;
-    height_ = 4;
-    fwd_txfm_ref = fht4x4_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-    txfm_param_.tx_type = GET_PARAM(2);
-#if CONFIG_HIGHBITDEPTH
-    switch (bit_depth_) {
-      case AOM_BITS_10: fwd_txfm_ref = fht4x4_10; break;
-      case AOM_BITS_12: fwd_txfm_ref = fht4x4_12; break;
-      default: fwd_txfm_ref = fht4x4_ref; break;
-    }
-#endif
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4HT, AccuracyCheck) { RunAccuracyCheck(1, 0.005); }
-
-TEST_P(Trans4x4HT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4HT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-
-class Trans4x4WHT : public libaom_test::TransformTestBase,
-                    public ::testing::TestWithParam<Dct4x4Param> {
- public:
-  virtual ~Trans4x4WHT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 4;
-    height_ = 4;
-    fwd_txfm_ref = fwht4x4_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride);
-  }
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride);
-  }
-
-  FdctFunc fwd_txfm_;
-  IdctFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); }
-
-TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
-using std::tr1::make_tuple;
-
-INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&aom_fdct4x4_c,
-                                                     &aom_idct4x4_16_add_c,
-                                                     DCT_DCT, AOM_BITS_8, 16)));
-
-#if CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_C, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&fht4x4_12, &iht4x4_12, DCT_DCT, AOM_BITS_12, 16),
-        make_tuple(&fht4x4_12, &iht4x4_12, ADST_DCT, AOM_BITS_12, 16),
-        make_tuple(&fht4x4_12, &iht4x4_12, DCT_ADST, AOM_BITS_12, 16),
-        make_tuple(&fht4x4_12, &iht4x4_12, ADST_ADST, AOM_BITS_12, 16)));
-
-INSTANTIATE_TEST_CASE_P(
-    C, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&fht4x4_10, &iht4x4_10, DCT_DCT, AOM_BITS_10, 16),
-        make_tuple(&fht4x4_10, &iht4x4_10, ADST_DCT, AOM_BITS_10, 16),
-        make_tuple(&fht4x4_10, &iht4x4_10, DCT_ADST, AOM_BITS_10, 16),
-        make_tuple(&fht4x4_10, &iht4x4_10, ADST_ADST, AOM_BITS_10, 16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, DCT_DCT, AOM_BITS_8,
-                   16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, ADST_DCT, AOM_BITS_8,
-                   16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, DCT_ADST, AOM_BITS_8,
-                   16),
-        make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, ADST_ADST, AOM_BITS_8,
-                   16)));
-#else
-INSTANTIATE_TEST_CASE_P(
-    C, Trans4x4HT,
-    ::testing::Values(make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, DCT_DCT,
-                                 AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, ADST_DCT,
-                                 AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, DCT_ADST,
-                                 AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, ADST_ADST,
-                                 AOM_BITS_8, 16)));
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    C, Trans4x4WHT,
-    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, DCT_DCT,
-                                 AOM_BITS_10, 16),
-                      make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, DCT_DCT,
-                                 AOM_BITS_12, 16),
-                      make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, DCT_DCT,
-                                 AOM_BITS_8, 16)));
-#else
-INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT,
-                        ::testing::Values(make_tuple(&av1_fwht4x4_c,
-                                                     &aom_iwht4x4_16_add_c,
-                                                     DCT_DCT, AOM_BITS_8, 16)));
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&aom_fdct4x4_c,
-                                                     &aom_idct4x4_16_add_neon,
-                                                     DCT_DCT, AOM_BITS_8, 16)));
-#endif  // HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_NEON && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    NEON, Trans4x4HT,
-    ::testing::Values(make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon,
-                                 DCT_DCT, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon,
-                                 ADST_DCT, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon,
-                                 DCT_ADST, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon,
-                                 ADST_ADST, AOM_BITS_8, 16)));
-#endif  // HAVE_NEON && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_SSE2 && !CONFIG_DAALA_DCT4
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans4x4WHT,
-    ::testing::Values(make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, DCT_DCT,
-                                 AOM_BITS_8, 16),
-                      make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_sse2,
-                                 DCT_DCT, AOM_BITS_8, 16)));
-#endif
-
-#if HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&aom_fdct4x4_sse2,
-                                                     &aom_idct4x4_16_add_sse2,
-                                                     DCT_DCT, AOM_BITS_8, 16)));
-#if !CONFIG_DAALA_DCT4
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans4x4HT,
-    ::testing::Values(make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2,
-                                 DCT_DCT, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2,
-                                 ADST_DCT, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2,
-                                 DCT_ADST, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2,
-                                 ADST_ADST, AOM_BITS_8, 16)));
-#endif  // !CONFIG_DAALA_DCT4
-#endif  // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_SSE2 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT4
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans4x4HT,
-    ::testing::Values(make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
-                                 DCT_DCT, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
-                                 ADST_DCT, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
-                                 DCT_ADST, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c,
-                                 ADST_ADST, AOM_BITS_8, 16)));
-#endif  // HAVE_SSE2 && CONFIG_HIGHBITDEPTH && !CONFIG_DAALA_DCT4
-
-#if HAVE_MSA && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(MSA, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&aom_fdct4x4_msa,
-                                                     &aom_idct4x4_16_add_msa,
-                                                     DCT_DCT, AOM_BITS_8, 16)));
-#if !CONFIG_EXT_TX && !CONFIG_DAALA_DCT4
-INSTANTIATE_TEST_CASE_P(
-    MSA, Trans4x4HT,
-    ::testing::Values(make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa,
-                                 DCT_DCT, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa,
-                                 ADST_DCT, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa,
-                                 DCT_ADST, AOM_BITS_8, 16),
-                      make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa,
-                                 ADST_ADST, AOM_BITS_8, 16)));
-#endif  // !CONFIG_EXT_TX && && !CONFIG_DAALA_DCT4
-#endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
-}  // namespace
diff --git a/third_party/aom/test/fdct8x8_test.cc b/third_party/aom/test/fdct8x8_test.cc
deleted file mode 100644
index 99ae8d677..000000000
--- a/third_party/aom/test/fdct8x8_test.cc
+++ /dev/null
@@ -1,738 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "av1/common/entropy.h"
-#include "av1/common/scan.h"
-#include "aom/aom_codec.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-
-const int kNumCoeffs = 64;
-const double kPi = 3.141592653589793238462643383279502884;
-
-const int kSignBiasMaxDiff255 = 1500;
-const int kSignBiasMaxDiff15 = 10000;
-
-typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
-                        TxfmParam *txfm_param);
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        const TxfmParam *txfm_param);
-
-typedef std::tr1::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t>
-    Dct8x8Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, TX_TYPE, aom_bit_depth_t> Ht8x8Param;
-typedef std::tr1::tuple<IdctFunc, IdctFunc, int, aom_bit_depth_t> Idct8x8Param;
-
-void reference_8x8_dct_1d(const double in[8], double out[8]) {
-  const double kInvSqrt2 = 0.707106781186547524400844362104;
-  for (int k = 0; k < 8; k++) {
-    out[k] = 0.0;
-    for (int n = 0; n < 8; n++)
-      out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 16.0);
-    if (k == 0) out[k] = out[k] * kInvSqrt2;
-  }
-}
-
-void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],
-                          double output[kNumCoeffs]) {
-  // First transform columns
-  for (int i = 0; i < 8; ++i) {
-    double temp_in[8], temp_out[8];
-    for (int j = 0; j < 8; ++j) temp_in[j] = input[j * 8 + i];
-    reference_8x8_dct_1d(temp_in, temp_out);
-    for (int j = 0; j < 8; ++j) output[j * 8 + i] = temp_out[j];
-  }
-  // Then transform rows
-  for (int i = 0; i < 8; ++i) {
-    double temp_in[8], temp_out[8];
-    for (int j = 0; j < 8; ++j) temp_in[j] = output[j + i * 8];
-    reference_8x8_dct_1d(temp_in, temp_out);
-    // Scale by some magic number
-    for (int j = 0; j < 8; ++j) output[j + i * 8] = temp_out[j] * 2;
-  }
-}
-
-void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride,
-                 TxfmParam * /*txfm_param*/) {
-  aom_fdct8x8_c(in, out, stride);
-}
-
-void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride,
-                TxfmParam *txfm_param) {
-  av1_fht8x8_c(in, out, stride, txfm_param);
-}
-
-#if CONFIG_HIGHBITDEPTH
-void fht8x8_10(const int16_t *in, tran_low_t *out, int stride,
-               TxfmParam *txfm_param) {
-  av1_fwd_txfm2d_8x8_c(in, out, stride, txfm_param->tx_type, 10);
-}
-
-void fht8x8_12(const int16_t *in, tran_low_t *out, int stride,
-               TxfmParam *txfm_param) {
-  av1_fwd_txfm2d_8x8_c(in, out, stride, txfm_param->tx_type, 12);
-}
-
-void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride,
-               const TxfmParam *txfm_param) {
-  av1_inv_txfm2d_add_8x8_c(in, CONVERT_TO_SHORTPTR(out), stride,
-                           txfm_param->tx_type, 10);
-}
-
-void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride,
-               const TxfmParam *txfm_param) {
-  av1_inv_txfm2d_add_8x8_c(in, CONVERT_TO_SHORTPTR(out), stride,
-                           txfm_param->tx_type, 12);
-}
-
-#endif  // CONFIG_HIGHBITDEPTH
-
-class FwdTrans8x8TestBase {
- public:
-  virtual ~FwdTrans8x8TestBase() {}
-
- protected:
-  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
-  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
-
-  void RunSignBiasCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
-    DECLARE_ALIGNED(16, tran_low_t, test_output_block[64]);
-    int count_sign_block[64][2];
-    const int count_test_block = 100000;
-
-    memset(count_sign_block, 0, sizeof(count_sign_block));
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < 64; ++j)
-        test_input_block[j] = ((rnd.Rand16() >> (16 - bit_depth_)) & mask_) -
-                              ((rnd.Rand16() >> (16 - bit_depth_)) & mask_);
-      ASM_REGISTER_STATE_CHECK(
-          RunFwdTxfm(test_input_block, test_output_block, pitch_));
-
-      for (int j = 0; j < 64; ++j) {
-        if (test_output_block[j] < 0)
-          ++count_sign_block[j][0];
-        else if (test_output_block[j] > 0)
-          ++count_sign_block[j][1];
-      }
-    }
-
-    for (int j = 0; j < 64; ++j) {
-      const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
-      const int max_diff = kSignBiasMaxDiff255;
-      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
-          << "Error: 8x8 FDCT/FHT has a sign bias > "
-          << 1. * max_diff / count_test_block * 100 << "%"
-          << " for input range [-255, 255] at index " << j
-          << " count0: " << count_sign_block[j][0]
-          << " count1: " << count_sign_block[j][1] << " diff: " << diff;
-    }
-
-    memset(count_sign_block, 0, sizeof(count_sign_block));
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_ / 16, mask_ / 16].
-      for (int j = 0; j < 64; ++j)
-        test_input_block[j] =
-            ((rnd.Rand16() & mask_) >> 4) - ((rnd.Rand16() & mask_) >> 4);
-      ASM_REGISTER_STATE_CHECK(
-          RunFwdTxfm(test_input_block, test_output_block, pitch_));
-
-      for (int j = 0; j < 64; ++j) {
-        if (test_output_block[j] < 0)
-          ++count_sign_block[j][0];
-        else if (test_output_block[j] > 0)
-          ++count_sign_block[j][1];
-      }
-    }
-
-    for (int j = 0; j < 64; ++j) {
-      const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
-      const int max_diff = kSignBiasMaxDiff15;
-      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
-          << "Error: 8x8 FDCT/FHT has a sign bias > "
-          << 1. * max_diff / count_test_block * 100 << "%"
-          << " for input range [-15, 15] at index " << j
-          << " count0: " << count_sign_block[j][0]
-          << " count1: " << count_sign_block[j][1] << " diff: " << diff;
-    }
-  }
-
-  void RunRoundTripErrorCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    int max_error = 0;
-    int total_error = 0;
-    const int count_test_block = 100000;
-    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
-    DECLARE_ALIGNED(16, tran_low_t, test_temp_block[64]);
-    DECLARE_ALIGNED(16, uint8_t, dst[64]);
-    DECLARE_ALIGNED(16, uint8_t, src[64]);
-#if CONFIG_HIGHBITDEPTH
-    DECLARE_ALIGNED(16, uint16_t, dst16[64]);
-    DECLARE_ALIGNED(16, uint16_t, src16[64]);
-#endif
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < 64; ++j) {
-        if (bit_depth_ == AOM_BITS_8) {
-          src[j] = rnd.Rand8();
-          dst[j] = rnd.Rand8();
-          test_input_block[j] = src[j] - dst[j];
-#if CONFIG_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand16() & mask_;
-          dst16[j] = rnd.Rand16() & mask_;
-          test_input_block[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      ASM_REGISTER_STATE_CHECK(
-          RunFwdTxfm(test_input_block, test_temp_block, pitch_));
-      for (int j = 0; j < 64; ++j) {
-        if (test_temp_block[j] > 0) {
-          test_temp_block[j] += 2;
-          test_temp_block[j] /= 4;
-          test_temp_block[j] *= 4;
-        } else {
-          test_temp_block[j] -= 2;
-          test_temp_block[j] /= 4;
-          test_temp_block[j] *= 4;
-        }
-      }
-      if (bit_depth_ == AOM_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-#if CONFIG_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-
-      for (int j = 0; j < 64; ++j) {
-#if CONFIG_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        const int diff = dst[j] - src[j];
-#endif
-        const int error = diff * diff;
-        if (max_error < error) max_error = error;
-        total_error += error;
-      }
-    }
-
-    EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
-        << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual"
-        << " roundtrip error > 1";
-
-    EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error)
-        << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
-        << "error > 1/5 per block";
-  }
-
-  void RunExtremalCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    int max_error = 0;
-    int total_error = 0;
-    int total_coeff_error = 0;
-    const int count_test_block = 100000;
-    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
-    DECLARE_ALIGNED(16, tran_low_t, test_temp_block[64]);
-    DECLARE_ALIGNED(16, tran_low_t, ref_temp_block[64]);
-    DECLARE_ALIGNED(16, uint8_t, dst[64]);
-    DECLARE_ALIGNED(16, uint8_t, src[64]);
-#if CONFIG_HIGHBITDEPTH
-    DECLARE_ALIGNED(16, uint16_t, dst16[64]);
-    DECLARE_ALIGNED(16, uint16_t, src16[64]);
-#endif
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < 64; ++j) {
-        if (bit_depth_ == AOM_BITS_8) {
-          if (i == 0) {
-            src[j] = 255;
-            dst[j] = 0;
-          } else if (i == 1) {
-            src[j] = 0;
-            dst[j] = 255;
-          } else {
-            src[j] = rnd.Rand8() % 2 ? 255 : 0;
-            dst[j] = rnd.Rand8() % 2 ? 255 : 0;
-          }
-          test_input_block[j] = src[j] - dst[j];
-#if CONFIG_HIGHBITDEPTH
-        } else {
-          if (i == 0) {
-            src16[j] = mask_;
-            dst16[j] = 0;
-          } else if (i == 1) {
-            src16[j] = 0;
-            dst16[j] = mask_;
-          } else {
-            src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
-            dst16[j] = rnd.Rand8() % 2 ? mask_ : 0;
-          }
-          test_input_block[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      ASM_REGISTER_STATE_CHECK(
-          RunFwdTxfm(test_input_block, test_temp_block, pitch_));
-      ASM_REGISTER_STATE_CHECK(
-          fwd_txfm_ref(test_input_block, ref_temp_block, pitch_, &txfm_param_));
-      if (bit_depth_ == AOM_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-#if CONFIG_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-
-      for (int j = 0; j < 64; ++j) {
-#if CONFIG_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        const int diff = dst[j] - src[j];
-#endif
-        const int error = diff * diff;
-        if (max_error < error) max_error = error;
-        total_error += error;
-
-        const int coeff_diff = test_temp_block[j] - ref_temp_block[j];
-        total_coeff_error += abs(coeff_diff);
-      }
-
-      EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
-          << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has"
-          << "an individual roundtrip error > 1";
-
-      EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error)
-          << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
-          << " roundtrip error > 1/5 per block";
-
-      EXPECT_EQ(0, total_coeff_error)
-          << "Error: Extremal 8x8 FDCT/FHT has"
-          << "overflow issues in the intermediate steps > 1";
-    }
-  }
-
-  void RunInvAccuracyCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_HIGHBITDEPTH
-    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-#endif
-
-    for (int i = 0; i < count_test_block; ++i) {
-      double out_r[kNumCoeffs];
-
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == AOM_BITS_8) {
-          src[j] = rnd.Rand8() % 2 ? 255 : 0;
-          dst[j] = src[j] > 0 ? 0 : 255;
-          in[j] = src[j] - dst[j];
-#if CONFIG_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
-          dst16[j] = src16[j] > 0 ? 0 : mask_;
-          in[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      reference_8x8_dct_2d(in, out_r);
-      for (int j = 0; j < kNumCoeffs; ++j)
-        coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
-
-      if (bit_depth_ == AOM_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
-#if CONFIG_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        const int diff = dst[j] - src[j];
-#endif
-        const uint32_t error = diff * diff;
-        EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)
-            << "Error: 8x8 IDCT has error " << error << " at index " << j;
-      }
-    }
-  }
-
-  void RunFwdAccuracyCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, coeff_r[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      double out_r[kNumCoeffs];
-
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j)
-        in[j] = rnd.Rand8() % 2 == 0 ? mask_ : -mask_;
-
-      RunFwdTxfm(in, coeff, pitch_);
-      reference_8x8_dct_2d(in, out_r);
-      for (int j = 0; j < kNumCoeffs; ++j)
-        coeff_r[j] = static_cast<tran_low_t>(round(out_r[j]));
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        const int32_t diff = coeff[j] - coeff_r[j];
-        const uint32_t error = diff * diff;
-        EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
-            << "Error: 8x8 DCT has error " << error << " at index " << j;
-      }
-    }
-  }
-
-  void CompareInvReference(IdctFunc ref_txfm, int thresh) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 10000;
-    const int eob = 12;
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
-#if CONFIG_HIGHBITDEPTH
-    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
-#endif
-    const int16_t *scan = av1_default_scan_orders[TX_8X8].scan;
-
-    for (int i = 0; i < count_test_block; ++i) {
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (j < eob) {
-          // Random values less than the threshold, either positive or negative
-          coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2));
-        } else {
-          coeff[scan[j]] = 0;
-        }
-        if (bit_depth_ == AOM_BITS_8) {
-          dst[j] = 0;
-          ref[j] = 0;
-#if CONFIG_HIGHBITDEPTH
-        } else {
-          dst16[j] = 0;
-          ref16[j] = 0;
-#endif
-        }
-      }
-      if (bit_depth_ == AOM_BITS_8) {
-        ref_txfm(coeff, ref, pitch_);
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
-#if CONFIG_HIGHBITDEPTH
-      } else {
-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == AOM_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
-#else
-        const int diff = dst[j] - ref[j];
-#endif
-        const uint32_t error = diff * diff;
-        EXPECT_EQ(0u, error)
-            << "Error: 8x8 IDCT has error " << error << " at index " << j;
-      }
-    }
-  }
-  int pitch_;
-  FhtFunc fwd_txfm_ref;
-  aom_bit_depth_t bit_depth_;
-  int mask_;
-  TxfmParam txfm_param_;
-};
-
-class FwdTrans8x8DCT : public FwdTrans8x8TestBase,
-                       public ::testing::TestWithParam<Dct8x8Param> {
- public:
-  virtual ~FwdTrans8x8DCT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 8;
-    fwd_txfm_ref = fdct8x8_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride);
-  }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride);
-  }
-
-  FdctFunc fwd_txfm_;
-  IdctFunc inv_txfm_;
-};
-
-TEST_P(FwdTrans8x8DCT, SignBiasCheck) { RunSignBiasCheck(); }
-
-TEST_P(FwdTrans8x8DCT, RoundTripErrorCheck) { RunRoundTripErrorCheck(); }
-
-TEST_P(FwdTrans8x8DCT, ExtremalCheck) { RunExtremalCheck(); }
-
-TEST_P(FwdTrans8x8DCT, FwdAccuracyCheck) { RunFwdAccuracyCheck(); }
-
-TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }
-
-class FwdTrans8x8HT : public FwdTrans8x8TestBase,
-                      public ::testing::TestWithParam<Ht8x8Param> {
- public:
-  virtual ~FwdTrans8x8HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    pitch_ = 8;
-    fwd_txfm_ref = fht8x8_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-    txfm_param_.tx_type = GET_PARAM(2);
-#if CONFIG_HIGHBITDEPTH
-    switch (bit_depth_) {
-      case AOM_BITS_10: fwd_txfm_ref = fht8x8_10; break;
-      case AOM_BITS_12: fwd_txfm_ref = fht8x8_12; break;
-      default: fwd_txfm_ref = fht8x8_ref; break;
-    }
-#endif
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, &txfm_param_);
-  }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, &txfm_param_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(FwdTrans8x8HT, SignBiasCheck) { RunSignBiasCheck(); }
-
-TEST_P(FwdTrans8x8HT, RoundTripErrorCheck) { RunRoundTripErrorCheck(); }
-
-TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); }
-
-class InvTrans8x8DCT : public FwdTrans8x8TestBase,
-                       public ::testing::TestWithParam<Idct8x8Param> {
- public:
-  virtual ~InvTrans8x8DCT() {}
-
-  virtual void SetUp() {
-    ref_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    thresh_ = GET_PARAM(2);
-    pitch_ = 8;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride);
-  }
-  void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, int /*stride*/) {}
-
-  IdctFunc ref_txfm_;
-  IdctFunc inv_txfm_;
-  int thresh_;
-};
-
-TEST_P(InvTrans8x8DCT, CompareReference) {
-  CompareInvReference(ref_txfm_, thresh_);
-}
-
-using std::tr1::make_tuple;
-
-#if CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(C, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&aom_fdct8x8_c,
-                                                     &aom_idct8x8_64_add_c,
-                                                     DCT_DCT, AOM_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(C, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&aom_fdct8x8_c,
-                                                     &aom_idct8x8_64_add_c,
-                                                     DCT_DCT, AOM_BITS_8)));
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    C, FwdTrans8x8HT,
-    ::testing::Values(
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, DCT_DCT, AOM_BITS_8),
-        make_tuple(&fht8x8_10, &iht8x8_10, DCT_DCT, AOM_BITS_10),
-        make_tuple(&fht8x8_10, &iht8x8_10, ADST_DCT, AOM_BITS_10),
-        make_tuple(&fht8x8_10, &iht8x8_10, DCT_ADST, AOM_BITS_10),
-        make_tuple(&fht8x8_10, &iht8x8_10, ADST_ADST, AOM_BITS_10),
-        make_tuple(&fht8x8_12, &iht8x8_12, DCT_DCT, AOM_BITS_12),
-        make_tuple(&fht8x8_12, &iht8x8_12, ADST_DCT, AOM_BITS_12),
-        make_tuple(&fht8x8_12, &iht8x8_12, DCT_ADST, AOM_BITS_12),
-        make_tuple(&fht8x8_12, &iht8x8_12, ADST_ADST, AOM_BITS_12),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, ADST_DCT, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, DCT_ADST, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, ADST_ADST,
-                   AOM_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(
-    C, FwdTrans8x8HT,
-    ::testing::Values(
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, DCT_DCT, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, ADST_DCT, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, DCT_ADST, AOM_BITS_8),
-        make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_c, ADST_ADST,
-                   AOM_BITS_8)));
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&aom_fdct8x8_neon,
-                                                     &aom_idct8x8_64_add_neon,
-                                                     DCT_DCT, AOM_BITS_8)));
-#endif  // HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_NEON && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    NEON, FwdTrans8x8HT,
-    ::testing::Values(make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon,
-                                 DCT_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon,
-                                 ADST_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon,
-                                 DCT_ADST, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_c, &av1_iht8x8_64_add_neon,
-                                 ADST_ADST, AOM_BITS_8)));
-#endif  // HAVE_NEON && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&aom_fdct8x8_sse2,
-                                                     &aom_idct8x8_64_add_sse2,
-                                                     DCT_DCT, AOM_BITS_8)));
-#if !CONFIG_DAALA_DCT8
-INSTANTIATE_TEST_CASE_P(
-    SSE2, FwdTrans8x8HT,
-    ::testing::Values(make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2,
-                                 DCT_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2,
-                                 ADST_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2,
-                                 DCT_ADST, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2,
-                                 ADST_ADST, AOM_BITS_8)));
-#endif  // !CONFIG_DAALA_DCT8
-#endif  // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_SSE2 && CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&aom_fdct8x8_sse2,
-                                                     &aom_idct8x8_64_add_c,
-                                                     DCT_DCT, AOM_BITS_8)));
-#if !CONFIG_DAALA_DCT8
-INSTANTIATE_TEST_CASE_P(
-    SSE2, FwdTrans8x8HT,
-    ::testing::Values(make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
-                                 DCT_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
-                                 ADST_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
-                                 DCT_ADST, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
-                                 ADST_ADST, AOM_BITS_8)));
-#endif  // !CONFIG_DAALA_DCT8
-#endif  // HAVE_SSE2 && CONFIG_HIGHBITDEPTH
-
-#if HAVE_SSSE3 && ARCH_X86_64
-INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&aom_fdct8x8_ssse3,
-                                                     &aom_idct8x8_64_add_ssse3,
-                                                     DCT_DCT, AOM_BITS_8)));
-#endif
-
-#if HAVE_MSA && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(MSA, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&aom_fdct8x8_msa,
-                                                     &aom_idct8x8_64_add_msa,
-                                                     DCT_DCT, AOM_BITS_8)));
-#if !CONFIG_EXT_TX && !CONFIG_DAALA_DCT8
-INSTANTIATE_TEST_CASE_P(
-    MSA, FwdTrans8x8HT,
-    ::testing::Values(make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa,
-                                 DCT_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa,
-                                 ADST_DCT, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa,
-                                 DCT_ADST, AOM_BITS_8),
-                      make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa,
-                                 ADST_ADST, AOM_BITS_8)));
-#endif  // !CONFIG_EXT_TX && !CONFIG_DAALA_DCT8
-#endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
-}  // namespace
diff --git a/third_party/aom/test/fft_test.cc b/third_party/aom/test/fft_test.cc
new file mode 100644
index 000000000..56187cdbb
--- /dev/null
+++ b/third_party/aom/test/fft_test.cc
@@ -0,0 +1,263 @@
+#include <math.h>
+
+#include <algorithm>
+#include <complex>
+#include <vector>
+
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#if ARCH_X86 || ARCH_X86_64
+#include "aom_ports/x86.h"
+#endif
+#include "av1/common/common.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+typedef void (*tform_fun_t)(const float *input, float *temp, float *output);
+
+// Simple 1D FFT implementation
+template <typename InputType>
+void fft(const InputType *data, std::complex<float> *result, int n) {
+  if (n == 1) {
+    result[0] = data[0];
+    return;
+  }
+  std::vector<InputType> temp(n);
+  for (int k = 0; k < n / 2; ++k) {
+    temp[k] = data[2 * k];
+    temp[n / 2 + k] = data[2 * k + 1];
+  }
+  fft(&temp[0], result, n / 2);
+  fft(&temp[n / 2], result + n / 2, n / 2);
+  for (int k = 0; k < n / 2; ++k) {
+    std::complex<float> w = std::complex<float>((float)cos(2. * PI * k / n),
+                                                (float)-sin(2. * PI * k / n));
+    std::complex<float> a = result[k];
+    std::complex<float> b = result[n / 2 + k];
+    result[k] = a + w * b;
+    result[n / 2 + k] = a - w * b;
+  }
+}
+
+void transpose(std::vector<std::complex<float> > *data, int n) {
+  for (int y = 0; y < n; ++y) {
+    for (int x = y + 1; x < n; ++x) {
+      std::swap((*data)[y * n + x], (*data)[x * n + y]);
+    }
+  }
+}
+
+// Simple 2D FFT implementation
+template <class InputType>
+std::vector<std::complex<float> > fft2d(const InputType *input, int n) {
+  std::vector<std::complex<float> > rowfft(n * n);
+  std::vector<std::complex<float> > result(n * n);
+  for (int y = 0; y < n; ++y) {
+    fft(input + y * n, &rowfft[y * n], n);
+  }
+  transpose(&rowfft, n);
+  for (int y = 0; y < n; ++y) {
+    fft(&rowfft[y * n], &result[y * n], n);
+  }
+  transpose(&result, n);
+  return result;
+}
+
+struct FFTTestArg {
+  int n;
+  void (*fft)(const float *input, float *temp, float *output);
+  int flag;
+  FFTTestArg(int n_in, tform_fun_t fft_in, int flag_in)
+      : n(n_in), fft(fft_in), flag(flag_in) {}
+};
+
+std::ostream &operator<<(std::ostream &os, const FFTTestArg &test_arg) {
+  return os << "fft_arg { n:" << test_arg.n << " fft:" << test_arg.fft
+            << " flag:" << test_arg.flag << "}";
+}
+
+class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
+ protected:
+  void SetUp() {
+    int n = GetParam().n;
+    input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n);
+    temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n);
+    output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n * 2);
+    memset(input_, 0, sizeof(*input_) * n * n);
+    memset(temp_, 0, sizeof(*temp_) * n * n);
+    memset(output_, 0, sizeof(*output_) * n * n * 2);
+#if ARCH_X86 || ARCH_X86_64
+    disabled_ = GetParam().flag != 0 && !(x86_simd_caps() & GetParam().flag);
+#else
+    disabled_ = GetParam().flag != 0;
+#endif
+  }
+  void TearDown() {
+    aom_free(input_);
+    aom_free(temp_);
+    aom_free(output_);
+  }
+  int disabled_;
+  float *input_;
+  float *temp_;
+  float *output_;
+};
+
+TEST_P(FFT2DTest, Correct) {
+  if (disabled_) return;
+
+  int n = GetParam().n;
+  for (int i = 0; i < n * n; ++i) {
+    input_[i] = 1;
+    std::vector<std::complex<float> > expected = fft2d<float>(&input_[0], n);
+    GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+    for (int y = 0; y < n; ++y) {
+      for (int x = 0; x < (n / 2) + 1; ++x) {
+        EXPECT_NEAR(expected[y * n + x].real(), output_[2 * (y * n + x)], 1e-5);
+        EXPECT_NEAR(expected[y * n + x].imag(), output_[2 * (y * n + x) + 1],
+                    1e-5);
+      }
+    }
+    input_[i] = 0;
+  }
+}
+
+TEST_P(FFT2DTest, Benchmark) {
+  if (disabled_) return;
+
+  int n = GetParam().n;
+  float sum = 0;
+  for (int i = 0; i < 1000 * (64 - n); ++i) {
+    input_[i % (n * n)] = 1;
+    GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+    sum += output_[0];
+    input_[i % (n * n)] = 0;
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    FFT2DTestC, FFT2DTest,
+    ::testing::Values(FFTTestArg(2, aom_fft2x2_float_c, 0),
+                      FFTTestArg(4, aom_fft4x4_float_c, 0),
+                      FFTTestArg(8, aom_fft8x8_float_c, 0),
+                      FFTTestArg(16, aom_fft16x16_float_c, 0),
+                      FFTTestArg(32, aom_fft32x32_float_c, 0)));
+#if ARCH_X86 || ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+    FFT2DTestSSE2, FFT2DTest,
+    ::testing::Values(FFTTestArg(4, aom_fft4x4_float_sse2, HAS_SSE2),
+                      FFTTestArg(8, aom_fft8x8_float_sse2, HAS_SSE2),
+                      FFTTestArg(16, aom_fft16x16_float_sse2, HAS_SSE2),
+                      FFTTestArg(32, aom_fft32x32_float_sse2, HAS_SSE2)));
+
+INSTANTIATE_TEST_CASE_P(
+    FFT2DTestAVX2, FFT2DTest,
+    ::testing::Values(FFTTestArg(8, aom_fft8x8_float_avx2, HAS_AVX2),
+                      FFTTestArg(16, aom_fft16x16_float_avx2, HAS_AVX2),
+                      FFTTestArg(32, aom_fft32x32_float_avx2, HAS_AVX2)));
+#endif
+
+struct IFFTTestArg {
+  int n;
+  tform_fun_t ifft;
+  int flag;
+  IFFTTestArg(int n_in, tform_fun_t ifft_in, int flag_in)
+      : n(n_in), ifft(ifft_in), flag(flag_in) {}
+};
+
+std::ostream &operator<<(std::ostream &os, const IFFTTestArg &test_arg) {
+  return os << "ifft_arg { n:" << test_arg.n << " fft:" << test_arg.ifft
+            << " flag:" << test_arg.flag << "}";
+}
+
+class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
+ protected:
+  void SetUp() {
+    int n = GetParam().n;
+    input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n * 2);
+    temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n * 2);
+    output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n);
+    memset(input_, 0, sizeof(*input_) * n * n * 2);
+    memset(temp_, 0, sizeof(*temp_) * n * n * 2);
+    memset(output_, 0, sizeof(*output_) * n * n);
+#if ARCH_X86 || ARCH_X86_64
+    disabled_ = GetParam().flag != 0 && !(x86_simd_caps() & GetParam().flag);
+#else
+    disabled_ = GetParam().flag != 0;
+#endif
+  }
+  void TearDown() {
+    aom_free(input_);
+    aom_free(temp_);
+    aom_free(output_);
+  }
+  int disabled_;
+  float *input_;
+  float *temp_;
+  float *output_;
+};
+
+TEST_P(IFFT2DTest, Correctness) {
+  if (disabled_) return;
+  int n = GetParam().n;
+  ASSERT_GE(n, 2);
+  std::vector<float> expected(n * n);
+  std::vector<float> actual(n * n);
+  // Do forward transform then invert to make sure we get back expected
+  for (int y = 0; y < n; ++y) {
+    for (int x = 0; x < n; ++x) {
+      expected[y * n + x] = 1;
+      std::vector<std::complex<float> > input_c = fft2d(&expected[0], n);
+      for (int i = 0; i < n * n; ++i) {
+        input_[2 * i + 0] = input_c[i].real();
+        input_[2 * i + 1] = input_c[i].imag();
+      }
+      GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+
+      for (int yy = 0; yy < n; ++yy) {
+        for (int xx = 0; xx < n; ++xx) {
+          EXPECT_NEAR(expected[yy * n + xx], output_[yy * n + xx] / (n * n),
+                      1e-5);
+        }
+      }
+      expected[y * n + x] = 0;
+    }
+  }
+};
+
+TEST_P(IFFT2DTest, Benchmark) {
+  if (disabled_) return;
+  int n = GetParam().n;
+  float sum = 0;
+  for (int i = 0; i < 1000 * (64 - n); ++i) {
+    input_[i % (n * n)] = 1;
+    GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+    sum += output_[0];
+    input_[i % (n * n)] = 0;
+  }
+}
+INSTANTIATE_TEST_CASE_P(
+    IFFT2DTestC, IFFT2DTest,
+    ::testing::Values(IFFTTestArg(2, aom_ifft2x2_float_c, 0),
+                      IFFTTestArg(4, aom_ifft4x4_float_c, 0),
+                      IFFTTestArg(8, aom_ifft8x8_float_c, 0),
+                      IFFTTestArg(16, aom_ifft16x16_float_c, 0),
+                      IFFTTestArg(32, aom_ifft32x32_float_c, 0)));
+#if ARCH_X86 || ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+    IFFT2DTestSSE2, IFFT2DTest,
+    ::testing::Values(IFFTTestArg(4, aom_ifft4x4_float_sse2, HAS_SSE2),
+                      IFFTTestArg(8, aom_ifft8x8_float_sse2, HAS_SSE2),
+                      IFFTTestArg(16, aom_ifft16x16_float_sse2, HAS_SSE2),
+                      IFFTTestArg(32, aom_ifft32x32_float_sse2, HAS_SSE2)));
+
+INSTANTIATE_TEST_CASE_P(
+    IFFT2DTestAVX2, IFFT2DTest,
+    ::testing::Values(IFFTTestArg(8, aom_ifft8x8_float_avx2, HAS_AVX2),
+                      IFFTTestArg(16, aom_ifft16x16_float_avx2, HAS_AVX2),
+                      IFFTTestArg(32, aom_ifft32x32_float_avx2, HAS_AVX2)));
+#endif
+}  // namespace
diff --git a/third_party/aom/test/film_grain_table_test.cc b/third_party/aom/test/film_grain_table_test.cc
new file mode 100644
index 000000000..068814635
--- /dev/null
+++ b/third_party/aom/test/film_grain_table_test.cc
@@ -0,0 +1,239 @@
+#include <string>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "aom_dsp/grain_table.h"
+#include "aom/internal/aom_codec_internal.h"
+#include "av1/encoder/grain_test_vectors.h"
+#include "test/video_source.h"
+
+void grain_equal(const aom_film_grain_t *expected,
+                 const aom_film_grain_t *actual) {
+  EXPECT_EQ(expected->apply_grain, actual->apply_grain);
+  EXPECT_EQ(expected->update_parameters, actual->update_parameters);
+  if (!expected->update_parameters) return;
+  EXPECT_EQ(expected->num_y_points, actual->num_y_points);
+  EXPECT_EQ(expected->num_cb_points, actual->num_cb_points);
+  EXPECT_EQ(expected->num_cr_points, actual->num_cr_points);
+  EXPECT_EQ(0, memcmp(expected->scaling_points_y, actual->scaling_points_y,
+                      expected->num_y_points *
+                          sizeof(expected->scaling_points_y[0])));
+  EXPECT_EQ(0, memcmp(expected->scaling_points_cb, actual->scaling_points_cb,
+                      expected->num_cb_points *
+                          sizeof(expected->scaling_points_cb[0])));
+  EXPECT_EQ(0, memcmp(expected->scaling_points_cr, actual->scaling_points_cr,
+                      expected->num_cr_points *
+                          sizeof(expected->scaling_points_cr[0])));
+  EXPECT_EQ(expected->scaling_shift, actual->scaling_shift);
+  EXPECT_EQ(expected->ar_coeff_lag, actual->ar_coeff_lag);
+  EXPECT_EQ(expected->ar_coeff_shift, actual->ar_coeff_shift);
+
+  const int num_pos_luma =
+      2 * expected->ar_coeff_lag * (expected->ar_coeff_lag + 1);
+  const int num_pos_chroma = num_pos_luma;
+  EXPECT_EQ(0, memcmp(expected->ar_coeffs_y, actual->ar_coeffs_y,
+                      sizeof(expected->ar_coeffs_y[0]) * num_pos_luma));
+  if (actual->num_cb_points || actual->chroma_scaling_from_luma) {
+    EXPECT_EQ(0, memcmp(expected->ar_coeffs_cb, actual->ar_coeffs_cb,
+                        sizeof(expected->ar_coeffs_cb[0]) * num_pos_chroma));
+  }
+  if (actual->num_cr_points || actual->chroma_scaling_from_luma) {
+    EXPECT_EQ(0, memcmp(expected->ar_coeffs_cr, actual->ar_coeffs_cr,
+                        sizeof(expected->ar_coeffs_cr[0]) * num_pos_chroma));
+  }
+  EXPECT_EQ(expected->overlap_flag, actual->overlap_flag);
+  EXPECT_EQ(expected->chroma_scaling_from_luma,
+            actual->chroma_scaling_from_luma);
+  EXPECT_EQ(expected->grain_scale_shift, actual->grain_scale_shift);
+  // EXPECT_EQ(expected->random_seed, actual->random_seed);
+
+  // clip_to_restricted and bit_depth aren't written
+  if (expected->num_cb_points) {
+    EXPECT_EQ(expected->cb_mult, actual->cb_mult);
+    EXPECT_EQ(expected->cb_luma_mult, actual->cb_luma_mult);
+    EXPECT_EQ(expected->cb_offset, actual->cb_offset);
+  }
+  if (expected->num_cr_points) {
+    EXPECT_EQ(expected->cr_mult, actual->cr_mult);
+    EXPECT_EQ(expected->cr_luma_mult, actual->cr_luma_mult);
+    EXPECT_EQ(expected->cr_offset, actual->cr_offset);
+  }
+}
+
+TEST(FilmGrainTableTest, AddAndLookupSingleSegment) {
+  aom_film_grain_table_t table;
+  memset(&table, 0, sizeof(table));
+
+  aom_film_grain_t grain;
+  EXPECT_FALSE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));
+
+  aom_film_grain_table_append(&table, 1000, 2000, film_grain_test_vectors + 0);
+  EXPECT_FALSE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));
+  EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
+
+  EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+
+  grain.bit_depth = film_grain_test_vectors[0].bit_depth;
+  EXPECT_EQ(0, memcmp(&grain, film_grain_test_vectors + 0, sizeof(table)));
+
+  // Extend the existing segment
+  aom_film_grain_table_append(&table, 2000, 3000, film_grain_test_vectors + 0);
+  EXPECT_EQ(0, table.head->next);
+
+  // Lookup and remove and check that the entry is no longer there
+  EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain));
+  EXPECT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+
+  EXPECT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, true, &grain));
+  EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
+
+  EXPECT_EQ(0, table.head);
+  EXPECT_EQ(0, table.tail);
+  aom_film_grain_table_free(&table);
+}
+
+TEST(FilmGrainTableTest, SplitSingleSegment) {
+  aom_film_grain_table_t table;
+  aom_film_grain_t grain;
+  memset(&table, 0, sizeof(table));
+
+  aom_film_grain_table_append(&table, 0, 1000, film_grain_test_vectors + 0);
+
+  // Test lookup and remove that adjusts start time
+  EXPECT_TRUE(aom_film_grain_table_lookup(&table, 0, 100, true, &grain));
+  EXPECT_EQ(NULL, table.head->next);
+  EXPECT_EQ(100, table.head->start_time);
+
+  // Test lookup and remove that adjusts end time
+  EXPECT_TRUE(aom_film_grain_table_lookup(&table, 900, 1000, true, &grain));
+  EXPECT_EQ(NULL, table.head->next);
+  EXPECT_EQ(100, table.head->start_time);
+  EXPECT_EQ(900, table.head->end_time);
+
+  // Test lookup and remove that splits the first entry
+  EXPECT_TRUE(aom_film_grain_table_lookup(&table, 400, 600, true, &grain));
+  EXPECT_EQ(100, table.head->start_time);
+  EXPECT_EQ(400, table.head->end_time);
+
+  ASSERT_NE((void *)NULL, table.head->next);
+  EXPECT_EQ(table.tail, table.head->next);
+  EXPECT_EQ(600, table.head->next->start_time);
+  EXPECT_EQ(900, table.head->next->end_time);
+
+  aom_film_grain_table_free(&table);
+}
+
+TEST(FilmGrainTableTest, AddAndLookupMultipleSegments) {
+  aom_film_grain_table_t table;
+  memset(&table, 0, sizeof(table));
+
+  aom_film_grain_t grain;
+  const int kNumTestVectors =
+      sizeof(film_grain_test_vectors) / sizeof(film_grain_test_vectors[0]);
+  for (int i = 0; i < kNumTestVectors; ++i) {
+    aom_film_grain_table_append(&table, i * 1000, (i + 1) * 1000,
+                                film_grain_test_vectors + i);
+  }
+
+  for (int i = kNumTestVectors - 1; i >= 0; --i) {
+    EXPECT_TRUE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,
+                                            true, &grain));
+    grain_equal(film_grain_test_vectors + i, &grain);
+    EXPECT_FALSE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,
+                                             true, &grain));
+  }
+
+  // Verify that all the data has been removed
+  for (int i = 0; i < kNumTestVectors; ++i) {
+    EXPECT_FALSE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,
+                                             true, &grain));
+  }
+  aom_film_grain_table_free(&table);
+}
+
+class FilmGrainTableIOTest : public ::testing::Test {
+ protected:
+  void SetUp() { memset(&error_, 0, sizeof(error_)); }
+  struct aom_internal_error_info error_;
+};
+
+TEST_F(FilmGrainTableIOTest, ReadMissingFile) {
+  aom_film_grain_table_t table;
+  memset(&table, 0, sizeof(table));
+  ASSERT_EQ(AOM_CODEC_ERROR, aom_film_grain_table_read(
+                                 &table, "/path/to/missing/file", &error_));
+}
+
+TEST_F(FilmGrainTableIOTest, ReadTruncatedFile) {
+  aom_film_grain_table_t table;
+  memset(&table, 0, sizeof(table));
+
+  std::string grain_file;
+  FILE *file = libaom_test::GetTempOutFile(&grain_file);
+  fwrite("deadbeef", 8, 1, file);
+  fclose(file);
+  ASSERT_EQ(AOM_CODEC_ERROR,
+            aom_film_grain_table_read(&table, grain_file.c_str(), &error_));
+  EXPECT_EQ(0, remove(grain_file.c_str()));
+}
+
+TEST_F(FilmGrainTableIOTest, RoundTripReadWrite) {
+  aom_film_grain_table_t table;
+  memset(&table, 0, sizeof(table));
+
+  aom_film_grain_t expected_grain[16];
+  const int kNumTestVectors =
+      sizeof(film_grain_test_vectors) / sizeof(film_grain_test_vectors[0]);
+  for (int i = 0; i < kNumTestVectors; ++i) {
+    expected_grain[i] = film_grain_test_vectors[i];
+    expected_grain[i].random_seed = i;
+    expected_grain[i].update_parameters = i % 2;
+    expected_grain[i].apply_grain = (i + 1) % 2;
+    expected_grain[i].bit_depth = 0;
+    aom_film_grain_table_append(&table, i * 1000, (i + 1) * 1000,
+                                expected_grain + i);
+  }
+  std::string grain_file;
+  fclose(libaom_test::GetTempOutFile(&grain_file));
+  ASSERT_EQ(AOM_CODEC_OK,
+            aom_film_grain_table_write(&table, grain_file.c_str(), &error_));
+  aom_film_grain_table_free(&table);
+
+  memset(&table, 0, sizeof(table));
+  ASSERT_EQ(AOM_CODEC_OK,
+            aom_film_grain_table_read(&table, grain_file.c_str(), &error_));
+  for (int i = 0; i < kNumTestVectors; ++i) {
+    aom_film_grain_t grain;
+    EXPECT_TRUE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,
+                                            true, &grain));
+    grain_equal(expected_grain + i, &grain);
+  }
+  aom_film_grain_table_free(&table);
+  EXPECT_EQ(0, remove(grain_file.c_str()));
+}
+
+TEST_F(FilmGrainTableIOTest, RoundTripSplit) {
+  std::string grain_file;
+  fclose(libaom_test::GetTempOutFile(&grain_file));
+
+  aom_film_grain_table_t table;
+  memset(&table, 0, sizeof(table));
+
+  aom_film_grain_t grain = film_grain_test_vectors[0];
+  aom_film_grain_table_append(&table, 0, 3000, &grain);
+  ASSERT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain));
+  ASSERT_TRUE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));
+  EXPECT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+  ASSERT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
+  ASSERT_EQ(AOM_CODEC_OK,
+            aom_film_grain_table_write(&table, grain_file.c_str(), &error_));
+  aom_film_grain_table_free(&table);
+
+  memset(&table, 0, sizeof(table));
+  ASSERT_EQ(AOM_CODEC_OK,
+            aom_film_grain_table_read(&table, grain_file.c_str(), &error_));
+  ASSERT_TRUE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));
+  ASSERT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+  ASSERT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
+  aom_film_grain_table_free(&table);
+
+  EXPECT_EQ(0, remove(grain_file.c_str()));
+}
diff --git a/third_party/aom/test/filterintra_predictors_test.cc b/third_party/aom/test/filterintra_predictors_test.cc
deleted file mode 100644
index 5c6b56d14..000000000
--- a/third_party/aom/test/filterintra_predictors_test.cc
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "av1/common/enums.h"
-
-namespace {
-
-using std::tr1::tuple;
-using libaom_test::ACMRandom;
-
-typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, int bs,
-                          const uint8_t *above, const uint8_t *left);
-
-// Note:
-//  Test parameter list:
-//  Reference predictor, optimized predictor, prediction mode, block size
-//
-typedef tuple<Predictor, Predictor, int> PredFuncMode;
-typedef tuple<PredFuncMode, int> PredParams;
-
-#if CONFIG_HIGHBITDEPTH
-typedef void (*HbdPredictor)(uint16_t *dst, ptrdiff_t stride, int bs,
-                             const uint16_t *above, const uint16_t *left,
-                             int bd);
-
-// Note:
-//  Test parameter list:
-//  Reference predictor, optimized predictor, prediction mode, block size,
-//  bit depth
-//
-typedef tuple<HbdPredictor, HbdPredictor, int> HbdPredFuncMode;
-typedef tuple<HbdPredFuncMode, int, int> HbdPredParams;
-#endif
-
-const int MaxBlkSize = 32;
-
-// By default, disable speed test
-#define PREDICTORS_SPEED_TEST (0)
-
-#if PREDICTORS_SPEED_TEST
-const int MaxTestNum = 100000;
-#else
-const int MaxTestNum = 100;
-#endif
-
-class AV1FilterIntraPredOptimzTest
-    : public ::testing::TestWithParam<PredParams> {
- public:
-  virtual ~AV1FilterIntraPredOptimzTest() {}
-  virtual void SetUp() {
-    PredFuncMode funcMode = GET_PARAM(0);
-    predFuncRef_ = std::tr1::get<0>(funcMode);
-    predFunc_ = std::tr1::get<1>(funcMode);
-    mode_ = std::tr1::get<2>(funcMode);
-    blockSize_ = GET_PARAM(1);
-
-    alloc_ = new uint8_t[3 * MaxBlkSize + 2];
-    predRef_ = new uint8_t[MaxBlkSize * MaxBlkSize];
-    pred_ = new uint8_t[MaxBlkSize * MaxBlkSize];
-  }
-
-  virtual void TearDown() {
-    delete[] alloc_;
-    delete[] predRef_;
-    delete[] pred_;
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  void RunTest() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint8_t *left = alloc_;
-    uint8_t *above = alloc_ + MaxBlkSize + 1;
-    while (tstIndex < MaxTestNum) {
-      PrepareBuffer();
-      predFuncRef_(predRef_, stride, blockSize_, &above[1], left);
-      ASM_REGISTER_STATE_CHECK(
-          predFunc_(pred_, stride, blockSize_, &above[1], left));
-      DiffPred(tstIndex);
-      tstIndex += 1;
-    }
-  }
-
-  void RunSpeedTestC() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint8_t *left = alloc_;
-    uint8_t *above = alloc_ + MaxBlkSize + 1;
-    PrepareBuffer();
-    while (tstIndex < MaxTestNum) {
-      predFuncRef_(predRef_, stride, blockSize_, &above[1], left);
-      tstIndex += 1;
-    }
-  }
-
-  void RunSpeedTestSSE() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint8_t *left = alloc_;
-    uint8_t *above = alloc_ + MaxBlkSize + 1;
-    PrepareBuffer();
-    while (tstIndex < MaxTestNum) {
-      predFunc_(predRef_, stride, blockSize_, &above[1], left);
-      tstIndex += 1;
-    }
-  }
-
- private:
-  void PrepareBuffer() const {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    int i = 0;
-    while (i < (3 * MaxBlkSize + 2)) {
-      alloc_[i] = rnd.Rand8();
-      i += 1;
-    }
-  }
-
-  void DiffPred(int testNum) const {
-    int i = 0;
-    while (i < blockSize_ * blockSize_) {
-      EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " "
-                                       << "Block size: " << blockSize_ << " "
-                                       << "Test number: " << testNum;
-      i += 1;
-    }
-  }
-
-  Predictor predFunc_;
-  Predictor predFuncRef_;
-  int mode_;
-  int blockSize_;
-  uint8_t *alloc_;
-  uint8_t *pred_;
-  uint8_t *predRef_;
-};
-
-#if CONFIG_HIGHBITDEPTH
-class AV1HbdFilterIntraPredOptimzTest
-    : public ::testing::TestWithParam<HbdPredParams> {
- public:
-  virtual ~AV1HbdFilterIntraPredOptimzTest() {}
-  virtual void SetUp() {
-    HbdPredFuncMode funcMode = GET_PARAM(0);
-    predFuncRef_ = std::tr1::get<0>(funcMode);
-    predFunc_ = std::tr1::get<1>(funcMode);
-    mode_ = std::tr1::get<2>(funcMode);
-    blockSize_ = GET_PARAM(1);
-    bd_ = GET_PARAM(2);
-
-    alloc_ = new uint16_t[3 * MaxBlkSize + 2];
-    predRef_ = new uint16_t[MaxBlkSize * MaxBlkSize];
-    pred_ = new uint16_t[MaxBlkSize * MaxBlkSize];
-  }
-
-  virtual void TearDown() {
-    delete[] alloc_;
-    delete[] predRef_;
-    delete[] pred_;
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  void RunTest() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint16_t *left = alloc_;
-    uint16_t *above = alloc_ + MaxBlkSize + 1;
-    while (tstIndex < MaxTestNum) {
-      PrepareBuffer();
-      predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
-      ASM_REGISTER_STATE_CHECK(
-          predFunc_(pred_, stride, blockSize_, &above[1], left, bd_));
-      DiffPred(tstIndex);
-      tstIndex += 1;
-    }
-  }
-
-  void RunSpeedTestC() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint16_t *left = alloc_;
-    uint16_t *above = alloc_ + MaxBlkSize + 1;
-    PrepareBuffer();
-    while (tstIndex < MaxTestNum) {
-      predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
-      tstIndex += 1;
-    }
-  }
-
-  void RunSpeedTestSSE() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint16_t *left = alloc_;
-    uint16_t *above = alloc_ + MaxBlkSize + 1;
-    PrepareBuffer();
-    while (tstIndex < MaxTestNum) {
-      predFunc_(predRef_, stride, blockSize_, &above[1], left, bd_);
-      tstIndex += 1;
-    }
-  }
-
- private:
-  void PrepareBuffer() const {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    int i = 0;
-    while (i < (3 * MaxBlkSize + 2)) {
-      alloc_[i] = rnd.Rand16() & ((1 << bd_) - 1);
-      i += 1;
-    }
-  }
-
-  void DiffPred(int testNum) const {
-    int i = 0;
-    while (i < blockSize_ * blockSize_) {
-      EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " "
-                                       << "Block size: " << blockSize_ << " "
-                                       << "Bit depth: " << bd_ << " "
-                                       << "Test number: " << testNum;
-      i += 1;
-    }
-  }
-
-  HbdPredictor predFunc_;
-  HbdPredictor predFuncRef_;
-  int mode_;
-  int blockSize_;
-  int bd_;
-  uint16_t *alloc_;
-  uint16_t *pred_;
-  uint16_t *predRef_;
-};
-#endif  // CONFIG_HIGHBITDEPTH
-
-TEST_P(AV1FilterIntraPredOptimzTest, BitExactCheck) { RunTest(); }
-
-#if PREDICTORS_SPEED_TEST
-TEST_P(AV1FilterIntraPredOptimzTest, SpeedCheckC) { RunSpeedTestC(); }
-
-TEST_P(AV1FilterIntraPredOptimzTest, SpeedCheckSSE) { RunSpeedTestSSE(); }
-#endif
-
-#if CONFIG_HIGHBITDEPTH
-TEST_P(AV1HbdFilterIntraPredOptimzTest, BitExactCheck) { RunTest(); }
-
-#if PREDICTORS_SPEED_TEST
-TEST_P(AV1HbdFilterIntraPredOptimzTest, SpeedCheckC) { RunSpeedTestC(); }
-
-TEST_P(AV1HbdFilterIntraPredOptimzTest, SpeedCheckSSE) { RunSpeedTestSSE(); }
-#endif  // PREDICTORS_SPEED_TEST
-#endif  // CONFIG_HIGHBITDEPTH
-
-using std::tr1::make_tuple;
-
-const PredFuncMode kPredFuncMdArray[] = {
-  make_tuple(av1_dc_filter_predictor_c, av1_dc_filter_predictor_sse4_1,
-             DC_PRED),
-  make_tuple(av1_v_filter_predictor_c, av1_v_filter_predictor_sse4_1, V_PRED),
-  make_tuple(av1_h_filter_predictor_c, av1_h_filter_predictor_sse4_1, H_PRED),
-  make_tuple(av1_d45_filter_predictor_c, av1_d45_filter_predictor_sse4_1,
-             D45_PRED),
-  make_tuple(av1_d135_filter_predictor_c, av1_d135_filter_predictor_sse4_1,
-             D135_PRED),
-  make_tuple(av1_d117_filter_predictor_c, av1_d117_filter_predictor_sse4_1,
-             D117_PRED),
-  make_tuple(av1_d153_filter_predictor_c, av1_d153_filter_predictor_sse4_1,
-             D153_PRED),
-  make_tuple(av1_d207_filter_predictor_c, av1_d207_filter_predictor_sse4_1,
-             D207_PRED),
-  make_tuple(av1_d63_filter_predictor_c, av1_d63_filter_predictor_sse4_1,
-             D63_PRED),
-  make_tuple(av1_tm_filter_predictor_c, av1_tm_filter_predictor_sse4_1,
-             TM_PRED),
-};
-
-const int kBlkSize[] = { 4, 8, 16, 32 };
-
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, AV1FilterIntraPredOptimzTest,
-    ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray),
-                       ::testing::ValuesIn(kBlkSize)));
-
-#if CONFIG_HIGHBITDEPTH
-const HbdPredFuncMode kHbdPredFuncMdArray[] = {
-  make_tuple(av1_highbd_dc_filter_predictor_c,
-             av1_highbd_dc_filter_predictor_sse4_1, DC_PRED),
-  make_tuple(av1_highbd_v_filter_predictor_c,
-             av1_highbd_v_filter_predictor_sse4_1, V_PRED),
-  make_tuple(av1_highbd_h_filter_predictor_c,
-             av1_highbd_h_filter_predictor_sse4_1, H_PRED),
-  make_tuple(av1_highbd_d45_filter_predictor_c,
-             av1_highbd_d45_filter_predictor_sse4_1, D45_PRED),
-  make_tuple(av1_highbd_d135_filter_predictor_c,
-             av1_highbd_d135_filter_predictor_sse4_1, D135_PRED),
-  make_tuple(av1_highbd_d117_filter_predictor_c,
-             av1_highbd_d117_filter_predictor_sse4_1, D117_PRED),
-  make_tuple(av1_highbd_d153_filter_predictor_c,
-             av1_highbd_d153_filter_predictor_sse4_1, D153_PRED),
-  make_tuple(av1_highbd_d207_filter_predictor_c,
-             av1_highbd_d207_filter_predictor_sse4_1, D207_PRED),
-  make_tuple(av1_highbd_d63_filter_predictor_c,
-             av1_highbd_d63_filter_predictor_sse4_1, D63_PRED),
-  make_tuple(av1_highbd_tm_filter_predictor_c,
-             av1_highbd_tm_filter_predictor_sse4_1, TM_PRED),
-};
-
-const int kBd[] = { 10, 12 };
-
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, AV1HbdFilterIntraPredOptimzTest,
-    ::testing::Combine(::testing::ValuesIn(kHbdPredFuncMdArray),
-                       ::testing::ValuesIn(kBlkSize),
-                       ::testing::ValuesIn(kBd)));
-#endif  // CONFIG_HIGHBITDEPTH
-
-}  // namespace
diff --git a/third_party/aom/test/filterintra_test.cc b/third_party/aom/test/filterintra_test.cc
new file mode 100644
index 000000000..597134940
--- /dev/null
+++ b/third_party/aom/test/filterintra_test.cc
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/enums.h"
+
+namespace {
+
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+
+typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
+                          const uint8_t *above, const uint8_t *left, int mode);
+
+// Note:
+//  Test parameter list:
+//  Reference predictor, optimized predictor, prediction mode, tx size
+//
+typedef tuple<Predictor, Predictor, int> PredFuncMode;
+typedef tuple<PredFuncMode, TX_SIZE> PredParams;
+
+const int MaxTxSize = 32;
+
+const int MaxTestNum = 100;
+
+class AV1FilterIntraPredTest : public ::testing::TestWithParam<PredParams> {
+ public:
+  virtual ~AV1FilterIntraPredTest() {}
+  virtual void SetUp() {
+    PredFuncMode funcMode = GET_PARAM(0);
+    predFuncRef_ = ::testing::get<0>(funcMode);
+    predFunc_ = ::testing::get<1>(funcMode);
+    mode_ = ::testing::get<2>(funcMode);
+    txSize_ = GET_PARAM(1);
+
+    alloc_ = new uint8_t[2 * MaxTxSize + 1];
+    predRef_ = new uint8_t[MaxTxSize * MaxTxSize];
+    pred_ = new uint8_t[MaxTxSize * MaxTxSize];
+  }
+
+  virtual void TearDown() {
+    delete[] alloc_;
+    delete[] predRef_;
+    delete[] pred_;
+    libaom_test::ClearSystemState();
+  }
+
+ protected:
+  void RunTest() const {
+    int tstIndex = 0;
+    int stride = tx_size_wide[txSize_];
+    uint8_t *left = alloc_;
+    uint8_t *above = alloc_ + MaxTxSize;
+    while (tstIndex < MaxTestNum) {
+      PrepareBuffer();
+      predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_);
+      ASM_REGISTER_STATE_CHECK(
+          predFunc_(pred_, stride, txSize_, &above[1], left, mode_));
+      DiffPred(tstIndex);
+      tstIndex += 1;
+    }
+  }
+
+ private:
+  void PrepareBuffer() const {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    int i = 0;
+    while (i < (2 * MaxTxSize + 1)) {
+      alloc_[i] = rnd.Rand8();
+      i++;
+    }
+  }
+
+  void DiffPred(int testNum) const {
+    int i = 0;
+    while (i < tx_size_wide[txSize_] * tx_size_high[txSize_]) {
+      EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " "
+                                       << "Tx size: " << tx_size_wide[txSize_]
+                                       << "x" << tx_size_high[txSize_] << " "
+                                       << "Test number: " << testNum;
+      i++;
+    }
+  }
+
+  Predictor predFunc_;
+  Predictor predFuncRef_;
+  int mode_;
+  TX_SIZE txSize_;
+  uint8_t *alloc_;
+  uint8_t *pred_;
+  uint8_t *predRef_;
+};
+
+TEST_P(AV1FilterIntraPredTest, BitExactCheck) { RunTest(); }
+
+using ::testing::make_tuple;
+
+const PredFuncMode kPredFuncMdArray[] = {
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+             FILTER_DC_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+             FILTER_V_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+             FILTER_H_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+             FILTER_D157_PRED),
+  make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+             FILTER_PAETH_PRED),
+};
+
+const TX_SIZE kTxSize[] = { TX_4X4,  TX_8X8,  TX_16X16, TX_32X32, TX_4X8,
+                            TX_8X4,  TX_8X16, TX_16X8,  TX_16X32, TX_32X16,
+                            TX_4X16, TX_16X4, TX_8X32,  TX_32X8 };
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, AV1FilterIntraPredTest,
+    ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray),
+                       ::testing::ValuesIn(kTxSize)));
+}  // namespace
diff --git a/third_party/aom/test/frame_size_tests.cc b/third_party/aom/test/frame_size_tests.cc
index 442f2523d..eaf0b8370 100644
--- a/third_party/aom/test/frame_size_tests.cc
+++ b/third_party/aom/test/frame_size_tests.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
diff --git a/third_party/aom/test/fwht4x4_test.cc b/third_party/aom/test/fwht4x4_test.cc
new file mode 100644
index 000000000..c8d98c519
--- /dev/null
+++ b/third_party/aom/test/fwht4x4_test.cc
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+
+using libaom_test::FhtFunc;
+
+typedef ::testing::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int>
+    Dct4x4Param;
+
+void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 TxfmParam * /*txfm_param*/) {
+  av1_fwht4x4_c(in, out, stride);
+}
+
+void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+  av1_highbd_iwht4x4_16_add_c(in, out, stride, 10);
+}
+
+void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+  av1_highbd_iwht4x4_16_add_c(in, out, stride, 12);
+}
+
+class Trans4x4WHT : public libaom_test::TransformTestBase,
+                    public ::testing::TestWithParam<Dct4x4Param> {
+ public:
+  virtual ~Trans4x4WHT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    pitch_ = 4;
+    height_ = 4;
+    fwd_txfm_ref = fwht4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
+  }
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride);
+  }
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+
+  FdctFunc fwd_txfm_;
+  IdctFunc inv_txfm_;
+};
+
+TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); }
+
+TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); }
+
+TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
+using ::testing::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4WHT,
+    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, DCT_DCT,
+                                 AOM_BITS_10, 16),
+                      make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, DCT_DCT,
+                                 AOM_BITS_12, 16)));
+}  // namespace
diff --git a/third_party/aom/test/gviz_api.py b/third_party/aom/test/gviz_api.py
new file mode 100755
index 000000000..d3a443dab
--- /dev/null
+++ b/third_party/aom/test/gviz_api.py
@@ -0,0 +1,1087 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+"""Converts Python data into data for Google Visualization API clients.
+
+This library can be used to create a google.visualization.DataTable usable by
+visualizations built on the Google Visualization API. Output formats are raw
+JSON, JSON response, JavaScript, CSV, and HTML table.
+
+See http://code.google.com/apis/visualization/ for documentation on the
+Google Visualization API.
+"""
+
+__author__ = "Amit Weinstein, Misha Seltzer, Jacob Baskin"
+
+import cgi
+import cStringIO
+import csv
+import datetime
+try:
+  import json
+except ImportError:
+  import simplejson as json
+import types
+
+
+class DataTableException(Exception):
+  """The general exception object thrown by DataTable."""
+  pass
+
+
+class DataTableJSONEncoder(json.JSONEncoder):
+  """JSON encoder that handles date/time/datetime objects correctly."""
+
+  def __init__(self):
+    json.JSONEncoder.__init__(self,
+                              separators=(",", ":"),
+                              ensure_ascii=False)
+
+  def default(self, o):
+    if isinstance(o, datetime.datetime):
+      if o.microsecond == 0:
+        # If the time doesn't have ms-resolution, leave it out to keep
+        # things smaller.
+        return "Date(%d,%d,%d,%d,%d,%d)" % (
+            o.year, o.month - 1, o.day, o.hour, o.minute, o.second)
+      else:
+        return "Date(%d,%d,%d,%d,%d,%d,%d)" % (
+            o.year, o.month - 1, o.day, o.hour, o.minute, o.second,
+            o.microsecond / 1000)
+    elif isinstance(o, datetime.date):
+      return "Date(%d,%d,%d)" % (o.year, o.month - 1, o.day)
+    elif isinstance(o, datetime.time):
+      return [o.hour, o.minute, o.second]
+    else:
+      return super(DataTableJSONEncoder, self).default(o)
+
+
+class DataTable(object):
+  """Wraps the data to convert to a Google Visualization API DataTable.
+
+  Create this object, populate it with data, then call one of the ToJS...
+  methods to return a string representation of the data in the format described.
+
+  You can clear all data from the object to reuse it, but you cannot clear
+  individual cells, rows, or columns. You also cannot modify the table schema
+  specified in the class constructor.
+
+  You can add new data one or more rows at a time. All data added to an
+  instantiated DataTable must conform to the schema passed in to __init__().
+
+  You can reorder the columns in the output table, and also specify row sorting
+  order by column. The default column order is according to the original
+  table_description parameter. Default row sort order is ascending, by column
+  1 values. For a dictionary, we sort the keys for order.
+
+  The data and the table_description are closely tied, as described here:
+
+  The table schema is defined in the class constructor's table_description
+  parameter. The user defines each column using a tuple of
+  (id[, type[, label[, custom_properties]]]). The default value for type is
+  string, label is the same as ID if not specified, and custom properties is
+  an empty dictionary if not specified.
+
+  table_description is a dictionary or list, containing one or more column
+  descriptor tuples, nested dictionaries, and lists. Each dictionary key, list
+  element, or dictionary element must eventually be defined as
+  a column description tuple. Here's an example of a dictionary where the key
+  is a tuple, and the value is a list of two tuples:
+    {('a', 'number'): [('b', 'number'), ('c', 'string')]}
+
+  This flexibility in data entry enables you to build and manipulate your data
+  in a Python structure that makes sense for your program.
+
+  Add data to the table using the same nested design as the table's
+  table_description, replacing column descriptor tuples with cell data, and
+  each row is an element in the top level collection. This will be a bit
+  clearer after you look at the following examples showing the
+  table_description, matching data, and the resulting table:
+
+  Columns as list of tuples [col1, col2, col3]
+    table_description: [('a', 'number'), ('b', 'string')]
+    AppendData( [[1, 'z'], [2, 'w'], [4, 'o'], [5, 'k']] )
+    Table:
+    a  b   <--- these are column ids/labels
+    1  z
+    2  w
+    4  o
+    5  k
+
+  Dictionary of columns, where key is a column, and value is a list of
+  columns  {col1: [col2, col3]}
+    table_description: {('a', 'number'): [('b', 'number'), ('c', 'string')]}
+    AppendData( data: {1: [2, 'z'], 3: [4, 'w']}
+    Table:
+    a  b  c
+    1  2  z
+    3  4  w
+
+  Dictionary where key is a column, and the value is itself a dictionary of
+  columns {col1: {col2, col3}}
+    table_description: {('a', 'number'): {'b': 'number', 'c': 'string'}}
+    AppendData( data: {1: {'b': 2, 'c': 'z'}, 3: {'b': 4, 'c': 'w'}}
+    Table:
+    a  b  c
+    1  2  z
+    3  4  w
+  """
+
+  def __init__(self, table_description, data=None, custom_properties=None):
+    """Initialize the data table from a table schema and (optionally) data.
+
+    See the class documentation for more information on table schema and data
+    values.
+
+    Args:
+      table_description: A table schema, following one of the formats described
+                         in TableDescriptionParser(). Schemas describe the
+                         column names, data types, and labels. See
+                         TableDescriptionParser() for acceptable formats.
+      data: Optional. If given, fills the table with the given data. The data
+            structure must be consistent with schema in table_description. See
+            the class documentation for more information on acceptable data. You
+            can add data later by calling AppendData().
+      custom_properties: Optional. A dictionary from string to string that
+                         goes into the table's custom properties. This can be
+                         later changed by changing self.custom_properties.
+
+    Raises:
+      DataTableException: Raised if the data and the description did not match,
+                          or did not use the supported formats.
+    """
+    self.__columns = self.TableDescriptionParser(table_description)
+    self.__data = []
+    self.custom_properties = {}
+    if custom_properties is not None:
+      self.custom_properties = custom_properties
+    if data:
+      self.LoadData(data)
+
+  @staticmethod
+  def CoerceValue(value, value_type):
+    """Coerces a single value into the type expected for its column.
+
+    Internal helper method.
+
+    Args:
+      value: The value which should be converted
+      value_type: One of "string", "number", "boolean", "date", "datetime" or
+                  "timeofday".
+
+    Returns:
+      An item of the Python type appropriate to the given value_type. Strings
+      are also converted to Unicode using UTF-8 encoding if necessary.
+      If a tuple is given, it should be in one of the following forms:
+        - (value, formatted value)
+        - (value, formatted value, custom properties)
+      where the formatted value is a string, and custom properties is a
+      dictionary of the custom properties for this cell.
+      To specify custom properties without specifying formatted value, one can
+      pass None as the formatted value.
+      One can also have a null-valued cell with formatted value and/or custom
+      properties by specifying None for the value.
+      This method ignores the custom properties except for checking that it is a
+      dictionary. The custom properties are handled in the ToJSon and ToJSCode
+      methods.
+      The real type of the given value is not strictly checked. For example,
+      any type can be used for string - as we simply take its str( ) and for
+      boolean value we just check "if value".
+      Examples:
+        CoerceValue(None, "string") returns None
+        CoerceValue((5, "5$"), "number") returns (5, "5$")
+        CoerceValue(100, "string") returns "100"
+        CoerceValue(0, "boolean") returns False
+
+    Raises:
+      DataTableException: The value and type did not match in a not-recoverable
+                          way, for example given value 'abc' for type 'number'.
+    """
+    if isinstance(value, tuple):
+      # In case of a tuple, we run the same function on the value itself and
+      # add the formatted value.
+      if (len(value) not in [2, 3] or
+          (len(value) == 3 and not isinstance(value[2], dict))):
+        raise DataTableException("Wrong format for value and formatting - %s." %
+                                 str(value))
+      if not isinstance(value[1], types.StringTypes + (types.NoneType,)):
+        raise DataTableException("Formatted value is not string, given %s." %
+                                 type(value[1]))
+      js_value = DataTable.CoerceValue(value[0], value_type)
+      return (js_value,) + value[1:]
+
+    t_value = type(value)
+    if value is None:
+      return value
+    if value_type == "boolean":
+      return bool(value)
+
+    elif value_type == "number":
+      if isinstance(value, (int, long, float)):
+        return value
+      raise DataTableException("Wrong type %s when expected number" % t_value)
+
+    elif value_type == "string":
+      if isinstance(value, unicode):
+        return value
+      else:
+        return str(value).decode("utf-8")
+
+    elif value_type == "date":
+      if isinstance(value, datetime.datetime):
+        return datetime.date(value.year, value.month, value.day)
+      elif isinstance(value, datetime.date):
+        return value
+      else:
+        raise DataTableException("Wrong type %s when expected date" % t_value)
+
+    elif value_type == "timeofday":
+      if isinstance(value, datetime.datetime):
+        return datetime.time(value.hour, value.minute, value.second)
+      elif isinstance(value, datetime.time):
+        return value
+      else:
+        raise DataTableException("Wrong type %s when expected time" % t_value)
+
+    elif value_type == "datetime":
+      if isinstance(value, datetime.datetime):
+        return value
+      else:
+        raise DataTableException("Wrong type %s when expected datetime" %
+                                 t_value)
+    # If we got here, it means the given value_type was not one of the
+    # supported types.
+    raise DataTableException("Unsupported type %s" % value_type)
+
+  @staticmethod
+  def EscapeForJSCode(encoder, value):
+    if value is None:
+      return "null"
+    elif isinstance(value, datetime.datetime):
+      if value.microsecond == 0:
+        # If it's not ms-resolution, leave that out to save space.
+        return "new Date(%d,%d,%d,%d,%d,%d)" % (value.year,
+                                                value.month - 1,  # To match JS
+                                                value.day,
+                                                value.hour,
+                                                value.minute,
+                                                value.second)
+      else:
+        return "new Date(%d,%d,%d,%d,%d,%d,%d)" % (value.year,
+                                                   value.month - 1,  # match JS
+                                                   value.day,
+                                                   value.hour,
+                                                   value.minute,
+                                                   value.second,
+                                                   value.microsecond / 1000)
+    elif isinstance(value, datetime.date):
+      return "new Date(%d,%d,%d)" % (value.year, value.month - 1, value.day)
+    else:
+      return encoder.encode(value)
+
+  @staticmethod
+  def ToString(value):
+    if value is None:
+      return "(empty)"
+    elif isinstance(value, (datetime.datetime,
+                            datetime.date,
+                            datetime.time)):
+      return str(value)
+    elif isinstance(value, unicode):
+      return value
+    elif isinstance(value, bool):
+      return str(value).lower()
+    else:
+      return str(value).decode("utf-8")
+
+  @staticmethod
+  def ColumnTypeParser(description):
+    """Parses a single column description. Internal helper method.
+
+    Args:
+      description: a column description in the possible formats:
+       'id'
+       ('id',)
+       ('id', 'type')
+       ('id', 'type', 'label')
+       ('id', 'type', 'label', {'custom_prop1': 'custom_val1'})
+    Returns:
+      Dictionary with the following keys: id, label, type, and
+      custom_properties where:
+        - If label not given, it equals the id.
+        - If type not given, string is used by default.
+        - If custom properties are not given, an empty dictionary is used by
+          default.
+
+    Raises:
+      DataTableException: The column description did not match the RE, or
+          unsupported type was passed.
+    """
+    if not description:
+      raise DataTableException("Description error: empty description given")
+
+    if not isinstance(description, (types.StringTypes, tuple)):
+      raise DataTableException("Description error: expected either string or "
+                               "tuple, got %s." % type(description))
+
+    if isinstance(description, types.StringTypes):
+      description = (description,)
+
+    # According to the tuple's length, we fill the keys
+    # We verify everything is of type string
+    for elem in description[:3]:
+      if not isinstance(elem, types.StringTypes):
+        raise DataTableException("Description error: expected tuple of "
+                                 "strings, current element of type %s." %
+                                 type(elem))
+    desc_dict = {"id": description[0],
+                 "label": description[0],
+                 "type": "string",
+                 "custom_properties": {}}
+    if len(description) > 1:
+      desc_dict["type"] = description[1].lower()
+      if len(description) > 2:
+        desc_dict["label"] = description[2]
+        if len(description) > 3:
+          if not isinstance(description[3], dict):
+            raise DataTableException("Description error: expected custom "
+                                     "properties of type dict, current element "
+                                     "of type %s." % type(description[3]))
+          desc_dict["custom_properties"] = description[3]
+          if len(description) > 4:
+            raise DataTableException("Description error: tuple of length > 4")
+    if desc_dict["type"] not in ["string", "number", "boolean",
+                                 "date", "datetime", "timeofday"]:
+      raise DataTableException(
+          "Description error: unsupported type '%s'" % desc_dict["type"])
+    return desc_dict
+
+  @staticmethod
+  def TableDescriptionParser(table_description, depth=0):
+    """Parses the table_description object for internal use.
+
+    Parses the user-submitted table description into an internal format used
+    by the Python DataTable class. Returns the flat list of parsed columns.
+
+    Args:
+      table_description: A description of the table which should comply
+                         with one of the formats described below.
+      depth: Optional. The depth of the first level in the current description.
+             Used by recursive calls to this function.
+
+    Returns:
+      List of columns, where each column represented by a dictionary with the
+      keys: id, label, type, depth, container which means the following:
+      - id: the id of the column
+      - name: The name of the column
+      - type: The datatype of the elements in this column. Allowed types are
+              described in ColumnTypeParser().
+      - depth: The depth of this column in the table description
+      - container: 'dict', 'iter' or 'scalar' for parsing the format easily.
+      - custom_properties: The custom properties for this column.
+      The returned description is flattened regardless of how it was given.
+
+    Raises:
+      DataTableException: Error in a column description or in the description
+                          structure.
+
+    Examples:
+      A column description can be of the following forms:
+       'id'
+       ('id',)
+       ('id', 'type')
+       ('id', 'type', 'label')
+       ('id', 'type', 'label', {'custom_prop1': 'custom_val1'})
+       or as a dictionary:
+       'id': 'type'
+       'id': ('type',)
+       'id': ('type', 'label')
+       'id': ('type', 'label', {'custom_prop1': 'custom_val1'})
+      If the type is not specified, we treat it as string.
+      If no specific label is given, the label is simply the id.
+      If no custom properties are given, we use an empty dictionary.
+
+      input: [('a', 'date'), ('b', 'timeofday', 'b', {'foo': 'bar'})]
+      output: [{'id': 'a', 'label': 'a', 'type': 'date',
+                'depth': 0, 'container': 'iter', 'custom_properties': {}},
+               {'id': 'b', 'label': 'b', 'type': 'timeofday',
+                'depth': 0, 'container': 'iter',
+                'custom_properties': {'foo': 'bar'}}]
+
+      input: {'a': [('b', 'number'), ('c', 'string', 'column c')]}
+      output: [{'id': 'a', 'label': 'a', 'type': 'string',
+                'depth': 0, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'b', 'label': 'b', 'type': 'number',
+                'depth': 1, 'container': 'iter', 'custom_properties': {}},
+               {'id': 'c', 'label': 'column c', 'type': 'string',
+                'depth': 1, 'container': 'iter', 'custom_properties': {}}]
+
+      input:  {('a', 'number', 'column a'): { 'b': 'number', 'c': 'string'}}
+      output: [{'id': 'a', 'label': 'column a', 'type': 'number',
+                'depth': 0, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'b', 'label': 'b', 'type': 'number',
+                'depth': 1, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'c', 'label': 'c', 'type': 'string',
+                'depth': 1, 'container': 'dict', 'custom_properties': {}}]
+
+      input: { ('w', 'string', 'word'): ('c', 'number', 'count') }
+      output: [{'id': 'w', 'label': 'word', 'type': 'string',
+                'depth': 0, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'c', 'label': 'count', 'type': 'number',
+                'depth': 1, 'container': 'scalar', 'custom_properties': {}}]
+
+      input: {'a': ('number', 'column a'), 'b': ('string', 'column b')}
+      output: [{'id': 'a', 'label': 'column a', 'type': 'number', 'depth': 0,
+               'container': 'dict', 'custom_properties': {}},
+               {'id': 'b', 'label': 'column b', 'type': 'string', 'depth': 0,
+               'container': 'dict', 'custom_properties': {}}
+
+      NOTE: there might be ambiguity in the case of a dictionary representation
+      of a single column. For example, the following description can be parsed
+      in 2 different ways: {'a': ('b', 'c')} can be thought of a single column
+      with the id 'a', of type 'b' and the label 'c', or as 2 columns: one named
+      'a', and the other named 'b' of type 'c'. We choose the first option by
+      default, and in case the second option is the right one, it is possible to
+      make the key into a tuple (i.e. {('a',): ('b', 'c')}) or add more info
+      into the tuple, thus making it look like this: {'a': ('b', 'c', 'b', {})}
+      -- second 'b' is the label, and {} is the custom properties field.
+    """
+    # For the recursion step, we check for a scalar object (string or tuple)
+    if isinstance(table_description, (types.StringTypes, tuple)):
+      parsed_col = DataTable.ColumnTypeParser(table_description)
+      parsed_col["depth"] = depth
+      parsed_col["container"] = "scalar"
+      return [parsed_col]
+
+    # Since it is not scalar, table_description must be iterable.
+    if not hasattr(table_description, "__iter__"):
+      raise DataTableException("Expected an iterable object, got %s" %
+                               type(table_description))
+    if not isinstance(table_description, dict):
+      # We expects a non-dictionary iterable item.
+      columns = []
+      for desc in table_description:
+        parsed_col = DataTable.ColumnTypeParser(desc)
+        parsed_col["depth"] = depth
+        parsed_col["container"] = "iter"
+        columns.append(parsed_col)
+      if not columns:
+        raise DataTableException("Description iterable objects should not"
+                                 " be empty.")
+      return columns
+    # The other case is a dictionary
+    if not table_description:
+      raise DataTableException("Empty dictionaries are not allowed inside"
+                               " description")
+
+    # To differentiate between the two cases of more levels below or this is
+    # the most inner dictionary, we consider the number of keys (more then one
+    # key is indication for most inner dictionary) and the type of the key and
+    # value in case of only 1 key (if the type of key is string and the type of
+    # the value is a tuple of 0-3 items, we assume this is the most inner
+    # dictionary).
+    # NOTE: this way of differentiating might create ambiguity. See docs.
+    if (len(table_description) != 1 or
+        (isinstance(table_description.keys()[0], types.StringTypes) and
+         isinstance(table_description.values()[0], tuple) and
+         len(table_description.values()[0]) < 4)):
+      # This is the most inner dictionary. Parsing types.
+      columns = []
+      # We sort the items, equivalent to sort the keys since they are unique
+      for key, value in sorted(table_description.items()):
+        # We parse the column type as (key, type) or (key, type, label) using
+        # ColumnTypeParser.
+        if isinstance(value, tuple):
+          parsed_col = DataTable.ColumnTypeParser((key,) + value)
+        else:
+          parsed_col = DataTable.ColumnTypeParser((key, value))
+        parsed_col["depth"] = depth
+        parsed_col["container"] = "dict"
+        columns.append(parsed_col)
+      return columns
+    # This is an outer dictionary, must have at most one key.
+    parsed_col = DataTable.ColumnTypeParser(table_description.keys()[0])
+    parsed_col["depth"] = depth
+    parsed_col["container"] = "dict"
+    return ([parsed_col] +
+            DataTable.TableDescriptionParser(table_description.values()[0],
+                                             depth=depth + 1))
+
+  @property
+  def columns(self):
+    """Returns the parsed table description."""
+    return self.__columns
+
+  def NumberOfRows(self):
+    """Returns the number of rows in the current data stored in the table."""
+    return len(self.__data)
+
+  def SetRowsCustomProperties(self, rows, custom_properties):
+    """Sets the custom properties for given row(s).
+
+    Can accept a single row or an iterable of rows.
+    Sets the given custom properties for all specified rows.
+
+    Args:
+      rows: The row, or rows, to set the custom properties for.
+      custom_properties: A string to string dictionary of custom properties to
+      set for all rows.
+    """
+    if not hasattr(rows, "__iter__"):
+      rows = [rows]
+    for row in rows:
+      self.__data[row] = (self.__data[row][0], custom_properties)
+
+  def LoadData(self, data, custom_properties=None):
+    """Loads new rows to the data table, clearing existing rows.
+
+    May also set the custom_properties for the added rows. The given custom
+    properties dictionary specifies the dictionary that will be used for *all*
+    given rows.
+
+    Args:
+      data: The rows that the table will contain.
+      custom_properties: A dictionary of string to string to set as the custom
+                         properties for all rows.
+    """
+    self.__data = []
+    self.AppendData(data, custom_properties)
+
+  def AppendData(self, data, custom_properties=None):
+    """Appends new data to the table.
+
+    Data is appended in rows. Data must comply with
+    the table schema passed in to __init__(). See CoerceValue() for a list
+    of acceptable data types. See the class documentation for more information
+    and examples of schema and data values.
+
+    Args:
+      data: The row to add to the table. The data must conform to the table
+            description format.
+      custom_properties: A dictionary of string to string, representing the
+                         custom properties to add to all the rows.
+
+    Raises:
+      DataTableException: The data structure does not match the description.
+    """
+    # If the maximal depth is 0, we simply iterate over the data table
+    # lines and insert them using _InnerAppendData. Otherwise, we simply
+    # let the _InnerAppendData handle all the levels.
+    if not self.__columns[-1]["depth"]:
+      for row in data:
+        self._InnerAppendData(({}, custom_properties), row, 0)
+    else:
+      self._InnerAppendData(({}, custom_properties), data, 0)
+
+  def _InnerAppendData(self, prev_col_values, data, col_index):
+    """Inner function to assist LoadData."""
+    # We first check that col_index has not exceeded the columns size
+    if col_index >= len(self.__columns):
+      raise DataTableException("The data does not match description, too deep")
+
+    # Dealing with the scalar case, the data is the last value.
+    if self.__columns[col_index]["container"] == "scalar":
+      prev_col_values[0][self.__columns[col_index]["id"]] = data
+      self.__data.append(prev_col_values)
+      return
+
+    if self.__columns[col_index]["container"] == "iter":
+      if not hasattr(data, "__iter__") or isinstance(data, dict):
+        raise DataTableException("Expected iterable object, got %s" %
+                                 type(data))
+      # We only need to insert the rest of the columns
+      # If there are less items than expected, we only add what there is.
+      for value in data:
+        if col_index >= len(self.__columns):
+          raise DataTableException("Too many elements given in data")
+        prev_col_values[0][self.__columns[col_index]["id"]] = value
+        col_index += 1
+      self.__data.append(prev_col_values)
+      return
+
+    # We know the current level is a dictionary, we verify the type.
+    if not isinstance(data, dict):
+      raise DataTableException("Expected dictionary at current level, got %s" %
+                               type(data))
+    # We check if this is the last level
+    if self.__columns[col_index]["depth"] == self.__columns[-1]["depth"]:
+      # We need to add the keys in the dictionary as they are
+      for col in self.__columns[col_index:]:
+        if col["id"] in data:
+          prev_col_values[0][col["id"]] = data[col["id"]]
+      self.__data.append(prev_col_values)
+      return
+
+    # We have a dictionary in an inner depth level.
+    if not data.keys():
+      # In case this is an empty dictionary, we add a record with the columns
+      # filled only until this point.
+      self.__data.append(prev_col_values)
+    else:
+      for key in sorted(data):
+        col_values = dict(prev_col_values[0])
+        col_values[self.__columns[col_index]["id"]] = key
+        self._InnerAppendData((col_values, prev_col_values[1]),
+                              data[key], col_index + 1)
+
+  def _PreparedData(self, order_by=()):
+    """Prepares the data for enumeration - sorting it by order_by.
+
+    Args:
+      order_by: Optional. Specifies the name of the column(s) to sort by, and
+                (optionally) which direction to sort in. Default sort direction
+                is asc. Following formats are accepted:
+                "string_col_name"  -- For a single key in default (asc) order.
+                ("string_col_name", "asc|desc") -- For a single key.
+                [("col_1","asc|desc"), ("col_2","asc|desc")] -- For more than
+                    one column, an array of tuples of (col_name, "asc|desc").
+
+    Returns:
+      The data sorted by the keys given.
+
+    Raises:
+      DataTableException: Sort direction not in 'asc' or 'desc'
+    """
+    if not order_by:
+      return self.__data
+
+    proper_sort_keys = []
+    if isinstance(order_by, types.StringTypes) or (
+        isinstance(order_by, tuple) and len(order_by) == 2 and
+        order_by[1].lower() in ["asc", "desc"]):
+      order_by = (order_by,)
+    for key in order_by:
+      if isinstance(key, types.StringTypes):
+        proper_sort_keys.append((key, 1))
+      elif (isinstance(key, (list, tuple)) and len(key) == 2 and
+            key[1].lower() in ("asc", "desc")):
+        proper_sort_keys.append((key[0], key[1].lower() == "asc" and 1 or -1))
+      else:
+        raise DataTableException("Expected tuple with second value: "
+                                 "'asc' or 'desc'")
+
+    def SortCmpFunc(row1, row2):
+      """cmp function for sorted. Compares by keys and 'asc'/'desc' keywords."""
+      for key, asc_mult in proper_sort_keys:
+        cmp_result = asc_mult * cmp(row1[0].get(key), row2[0].get(key))
+        if cmp_result:
+          return cmp_result
+      return 0
+
+    return sorted(self.__data, cmp=SortCmpFunc)
+
+  def ToJSCode(self, name, columns_order=None, order_by=()):
+    """Writes the data table as a JS code string.
+
+    This method writes a string of JS code that can be run to
+    generate a DataTable with the specified data. Typically used for debugging
+    only.
+
+    Args:
+      name: The name of the table. The name would be used as the DataTable's
+            variable name in the created JS code.
+      columns_order: Optional. Specifies the order of columns in the
+                     output table. Specify a list of all column IDs in the order
+                     in which you want the table created.
+                     Note that you must list all column IDs in this parameter,
+                     if you use it.
+      order_by: Optional. Specifies the name of the column(s) to sort by.
+                Passed as is to _PreparedData.
+
+    Returns:
+      A string of JS code that, when run, generates a DataTable with the given
+      name and the data stored in the DataTable object.
+      Example result:
+        "var tab1 = new google.visualization.DataTable();
+         tab1.addColumn("string", "a", "a");
+         tab1.addColumn("number", "b", "b");
+         tab1.addColumn("boolean", "c", "c");
+         tab1.addRows(10);
+         tab1.setCell(0, 0, "a");
+         tab1.setCell(0, 1, 1, null, {"foo": "bar"});
+         tab1.setCell(0, 2, true);
+         ...
+         tab1.setCell(9, 0, "c");
+         tab1.setCell(9, 1, 3, "3$");
+         tab1.setCell(9, 2, false);"
+
+    Raises:
+      DataTableException: The data does not match the type.
+    """
+
+    encoder = DataTableJSONEncoder()
+
+    if columns_order is None:
+      columns_order = [col["id"] for col in self.__columns]
+    col_dict = dict([(col["id"], col) for col in self.__columns])
+
+    # We first create the table with the given name
+    jscode = "var %s = new google.visualization.DataTable();\n" % name
+    if self.custom_properties:
+      jscode += "%s.setTableProperties(%s);\n" % (
+          name, encoder.encode(self.custom_properties))
+
+    # We add the columns to the table
+    for i, col in enumerate(columns_order):
+      jscode += "%s.addColumn(%s, %s, %s);\n" % (
+          name,
+          encoder.encode(col_dict[col]["type"]),
+          encoder.encode(col_dict[col]["label"]),
+          encoder.encode(col_dict[col]["id"]))
+      if col_dict[col]["custom_properties"]:
+        jscode += "%s.setColumnProperties(%d, %s);\n" % (
+            name, i, encoder.encode(col_dict[col]["custom_properties"]))
+    jscode += "%s.addRows(%d);\n" % (name, len(self.__data))
+
+    # We now go over the data and add each row
+    for (i, (row, cp)) in enumerate(self._PreparedData(order_by)):
+      # We add all the elements of this row by their order
+      for (j, col) in enumerate(columns_order):
+        if col not in row or row[col] is None:
+          continue
+        value = self.CoerceValue(row[col], col_dict[col]["type"])
+        if isinstance(value, tuple):
+          cell_cp = ""
+          if len(value) == 3:
+            cell_cp = ", %s" % encoder.encode(row[col][2])
+          # We have a formatted value or custom property as well
+          jscode += ("%s.setCell(%d, %d, %s, %s%s);\n" %
+                     (name, i, j,
+                      self.EscapeForJSCode(encoder, value[0]),
+                      self.EscapeForJSCode(encoder, value[1]), cell_cp))
+        else:
+          jscode += "%s.setCell(%d, %d, %s);\n" % (
+              name, i, j, self.EscapeForJSCode(encoder, value))
+      if cp:
+        jscode += "%s.setRowProperties(%d, %s);\n" % (
+            name, i, encoder.encode(cp))
+    return jscode
+
+  def ToHtml(self, columns_order=None, order_by=()):
+    """Writes the data table as an HTML table code string.
+
+    Args:
+      columns_order: Optional. Specifies the order of columns in the
+                     output table. Specify a list of all column IDs in the order
+                     in which you want the table created.
+                     Note that you must list all column IDs in this parameter,
+                     if you use it.
+      order_by: Optional. Specifies the name of the column(s) to sort by.
+                Passed as is to _PreparedData.
+
+    Returns:
+      An HTML table code string.
+      Example result (the result is without the newlines):
+       <html><body><table border="1">
+        <thead><tr><th>a</th><th>b</th><th>c</th></tr></thead>
+        <tbody>
+         <tr><td>1</td><td>"z"</td><td>2</td></tr>
+         <tr><td>"3$"</td><td>"w"</td><td></td></tr>
+        </tbody>
+       </table></body></html>
+
+    Raises:
+      DataTableException: The data does not match the type.
+    """
+    table_template = "<html><body><table border=\"1\">%s</table></body></html>"
+    columns_template = "<thead><tr>%s</tr></thead>"
+    rows_template = "<tbody>%s</tbody>"
+    row_template = "<tr>%s</tr>"
+    header_cell_template = "<th>%s</th>"
+    cell_template = "<td>%s</td>"
+
+    if columns_order is None:
+      columns_order = [col["id"] for col in self.__columns]
+    col_dict = dict([(col["id"], col) for col in self.__columns])
+
+    columns_list = []
+    for col in columns_order:
+      columns_list.append(header_cell_template %
+                          cgi.escape(col_dict[col]["label"]))
+    columns_html = columns_template % "".join(columns_list)
+
+    rows_list = []
+    # We now go over the data and add each row
+    for row, unused_cp in self._PreparedData(order_by):
+      cells_list = []
+      # We add all the elements of this row by their order
+      for col in columns_order:
+        # For empty string we want empty quotes ("").
+        value = ""
+        if col in row and row[col] is not None:
+          value = self.CoerceValue(row[col], col_dict[col]["type"])
+        if isinstance(value, tuple):
+          # We have a formatted value and we're going to use it
+          cells_list.append(cell_template % cgi.escape(self.ToString(value[1])))
+        else:
+          cells_list.append(cell_template % cgi.escape(self.ToString(value)))
+      rows_list.append(row_template % "".join(cells_list))
+    rows_html = rows_template % "".join(rows_list)
+
+    return table_template % (columns_html + rows_html)
+
+  def ToCsv(self, columns_order=None, order_by=(), separator=","):
+    """Writes the data table as a CSV string.
+
+    Output is encoded in UTF-8 because the Python "csv" module can't handle
+    Unicode properly according to its documentation.
+
+    Args:
+      columns_order: Optional. Specifies the order of columns in the
+                     output table. Specify a list of all column IDs in the order
+                     in which you want the table created.
+                     Note that you must list all column IDs in this parameter,
+                     if you use it.
+      order_by: Optional. Specifies the name of the column(s) to sort by.
+                Passed as is to _PreparedData.
+      separator: Optional. The separator to use between the values.
+
+    Returns:
+      A CSV string representing the table.
+      Example result:
+       'a','b','c'
+       1,'z',2
+       3,'w',''
+
+    Raises:
+      DataTableException: The data does not match the type.
+    """
+
+    csv_buffer = cStringIO.StringIO()
+    writer = csv.writer(csv_buffer, delimiter=separator)
+
+    if columns_order is None:
+      columns_order = [col["id"] for col in self.__columns]
+    col_dict = dict([(col["id"], col) for col in self.__columns])
+
+    writer.writerow([col_dict[col]["label"].encode("utf-8")
+                     for col in columns_order])
+
+    # We now go over the data and add each row
+    for row, unused_cp in self._PreparedData(order_by):
+      cells_list = []
+      # We add all the elements of this row by their order
+      for col in columns_order:
+        value = ""
+        if col in row and row[col] is not None:
+          value = self.CoerceValue(row[col], col_dict[col]["type"])
+        if isinstance(value, tuple):
+          # We have a formatted value. Using it only for date/time types.
+          if col_dict[col]["type"] in ["date", "datetime", "timeofday"]:
+            cells_list.append(self.ToString(value[1]).encode("utf-8"))
+          else:
+            cells_list.append(self.ToString(value[0]).encode("utf-8"))
+        else:
+          cells_list.append(self.ToString(value).encode("utf-8"))
+      writer.writerow(cells_list)
+    return csv_buffer.getvalue()
+
+  def ToTsvExcel(self, columns_order=None, order_by=()):
+    """Returns a file in tab-separated-format readable by MS Excel.
+
+    Returns a file in UTF-16 little endian encoding, with tabs separating the
+    values.
+
+    Args:
+      columns_order: Delegated to ToCsv.
+      order_by: Delegated to ToCsv.
+
+    Returns:
+      A tab-separated little endian UTF16 file representing the table.
+    """
+    return (self.ToCsv(columns_order, order_by, separator="\t")
+            .decode("utf-8").encode("UTF-16LE"))
+
+  def _ToJSonObj(self, columns_order=None, order_by=()):
+    """Returns an object suitable to be converted to JSON.
+
+    Args:
+      columns_order: Optional. A list of all column IDs in the order in which
+                     you want them created in the output table. If specified,
+                     all column IDs must be present.
+      order_by: Optional. Specifies the name of the column(s) to sort by.
+                Passed as is to _PreparedData().
+
+    Returns:
+      A dictionary object for use by ToJSon or ToJSonResponse.
+    """
+    if columns_order is None:
+      columns_order = [col["id"] for col in self.__columns]
+    col_dict = dict([(col["id"], col) for col in self.__columns])
+
+    # Creating the column JSON objects
+    col_objs = []
+    for col_id in columns_order:
+      col_obj = {"id": col_dict[col_id]["id"],
+                 "label": col_dict[col_id]["label"],
+                 "type": col_dict[col_id]["type"]}
+      if col_dict[col_id]["custom_properties"]:
+        col_obj["p"] = col_dict[col_id]["custom_properties"]
+      col_objs.append(col_obj)
+
+    # Creating the rows jsons
+    row_objs = []
+    for row, cp in self._PreparedData(order_by):
+      cell_objs = []
+      for col in columns_order:
+        value = self.CoerceValue(row.get(col, None), col_dict[col]["type"])
+        if value is None:
+          cell_obj = None
+        elif isinstance(value, tuple):
+          cell_obj = {"v": value[0]}
+          if len(value) > 1 and value[1] is not None:
+            cell_obj["f"] = value[1]
+          if len(value) == 3:
+            cell_obj["p"] = value[2]
+        else:
+          cell_obj = {"v": value}
+        cell_objs.append(cell_obj)
+      row_obj = {"c": cell_objs}
+      if cp:
+        row_obj["p"] = cp
+      row_objs.append(row_obj)
+
+    json_obj = {"cols": col_objs, "rows": row_objs}
+    if self.custom_properties:
+      json_obj["p"] = self.custom_properties
+
+    return json_obj
+
+  def ToJSon(self, columns_order=None, order_by=()):
+    """Returns a string that can be used in a JS DataTable constructor.
+
+    This method writes a JSON string that can be passed directly into a Google
+    Visualization API DataTable constructor. Use this output if you are
+    hosting the visualization HTML on your site, and want to code the data
+    table in Python. Pass this string into the
+    google.visualization.DataTable constructor, e.g,:
+      ... on my page that hosts my visualization ...
+      google.setOnLoadCallback(drawTable);
+      function drawTable() {
+        var data = new google.visualization.DataTable(_my_JSon_string, 0.6);
+        myTable.draw(data);
+      }
+
+    Args:
+      columns_order: Optional. Specifies the order of columns in the
+                     output table. Specify a list of all column IDs in the order
+                     in which you want the table created.
+                     Note that you must list all column IDs in this parameter,
+                     if you use it.
+      order_by: Optional. Specifies the name of the column(s) to sort by.
+                Passed as is to _PreparedData().
+
+    Returns:
+      A JSon constructor string to generate a JS DataTable with the data
+      stored in the DataTable object.
+      Example result (the result is without the newlines):
+       {cols: [{id:"a",label:"a",type:"number"},
+               {id:"b",label:"b",type:"string"},
+              {id:"c",label:"c",type:"number"}],
+        rows: [{c:[{v:1},{v:"z"},{v:2}]}, c:{[{v:3,f:"3$"},{v:"w"},{v:null}]}],
+        p:    {'foo': 'bar'}}
+
+    Raises:
+      DataTableException: The data does not match the type.
+    """
+
+    encoder = DataTableJSONEncoder()
+    return encoder.encode(
+        self._ToJSonObj(columns_order, order_by)).encode("utf-8")
+
+  def ToJSonResponse(self, columns_order=None, order_by=(), req_id=0,
+                     response_handler="google.visualization.Query.setResponse"):
+    """Writes a table as a JSON response that can be returned as-is to a client.
+
+    This method writes a JSON response to return to a client in response to a
+    Google Visualization API query. This string can be processed by the calling
+    page, and is used to deliver a data table to a visualization hosted on
+    a different page.
+
+    Args:
+      columns_order: Optional. Passed straight to self.ToJSon().
+      order_by: Optional. Passed straight to self.ToJSon().
+      req_id: Optional. The response id, as retrieved by the request.
+      response_handler: Optional. The response handler, as retrieved by the
+          request.
+
+    Returns:
+      A JSON response string to be received by JS the visualization Query
+      object. This response would be translated into a DataTable on the
+      client side.
+      Example result (newlines added for readability):
+       google.visualization.Query.setResponse({
+          'version':'0.6', 'reqId':'0', 'status':'OK',
+          'table': {cols: [...], rows: [...]}});
+
+    Note: The URL returning this string can be used as a data source by Google
+          Visualization Gadgets or from JS code.
+    """
+
+    response_obj = {
+        "version": "0.6",
+        "reqId": str(req_id),
+        "table": self._ToJSonObj(columns_order, order_by),
+        "status": "ok"
+    }
+    encoder = DataTableJSONEncoder()
+    return "%s(%s);" % (response_handler,
+                        encoder.encode(response_obj).encode("utf-8"))
+
+  def ToResponse(self, columns_order=None, order_by=(), tqx=""):
+    """Writes the right response according to the request string passed in tqx.
+
+    This method parses the tqx request string (format of which is defined in
+    the documentation for implementing a data source of Google Visualization),
+    and returns the right response according to the request.
+    It parses out the "out" parameter of tqx, calls the relevant response
+    (ToJSonResponse() for "json", ToCsv() for "csv", ToHtml() for "html",
+    ToTsvExcel() for "tsv-excel") and passes the response function the rest of
+    the relevant request keys.
+
+    Args:
+      columns_order: Optional. Passed as is to the relevant response function.
+      order_by: Optional. Passed as is to the relevant response function.
+      tqx: Optional. The request string as received by HTTP GET. Should be in
+           the format "key1:value1;key2:value2...". All keys have a default
+           value, so an empty string will just do the default (which is calling
+           ToJSonResponse() with no extra parameters).
+
+    Returns:
+      A response string, as returned by the relevant response function.
+
+    Raises:
+      DataTableException: One of the parameters passed in tqx is not supported.
+    """
+    tqx_dict = {}
+    if tqx:
+      tqx_dict = dict(opt.split(":") for opt in tqx.split(";"))
+    if tqx_dict.get("version", "0.6") != "0.6":
+      raise DataTableException(
+          "Version (%s) passed by request is not supported."
+          % tqx_dict["version"])
+
+    if tqx_dict.get("out", "json") == "json":
+      response_handler = tqx_dict.get("responseHandler",
+                                      "google.visualization.Query.setResponse")
+      return self.ToJSonResponse(columns_order, order_by,
+                                 req_id=tqx_dict.get("reqId", 0),
+                                 response_handler=response_handler)
+    elif tqx_dict["out"] == "html":
+      return self.ToHtml(columns_order, order_by)
+    elif tqx_dict["out"] == "csv":
+      return self.ToCsv(columns_order, order_by)
+    elif tqx_dict["out"] == "tsv-excel":
+      return self.ToTsvExcel(columns_order, order_by)
+    else:
+      raise DataTableException(
+          "'out' parameter: '%s' is not supported" % tqx_dict["out"])
diff --git a/third_party/aom/test/hadamard_test.cc b/third_party/aom/test/hadamard_test.cc
deleted file mode 100644
index db5cb7474..000000000
--- a/third_party/aom/test/hadamard_test.cc
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <algorithm>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_dsp_rtcd.h"
-
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-
-namespace {
-
-using ::libaom_test::ACMRandom;
-
-typedef void (*HadamardFunc)(const int16_t *a, int a_stride, int16_t *b);
-
-void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
-  int16_t b[8];
-  for (int i = 0; i < 8; i += 2) {
-    b[i + 0] = a[i * a_stride] + a[(i + 1) * a_stride];
-    b[i + 1] = a[i * a_stride] - a[(i + 1) * a_stride];
-  }
-  int16_t c[8];
-  for (int i = 0; i < 8; i += 4) {
-    c[i + 0] = b[i + 0] + b[i + 2];
-    c[i + 1] = b[i + 1] + b[i + 3];
-    c[i + 2] = b[i + 0] - b[i + 2];
-    c[i + 3] = b[i + 1] - b[i + 3];
-  }
-  out[0] = c[0] + c[4];
-  out[7] = c[1] + c[5];
-  out[3] = c[2] + c[6];
-  out[4] = c[3] + c[7];
-  out[2] = c[0] - c[4];
-  out[6] = c[1] - c[5];
-  out[1] = c[2] - c[6];
-  out[5] = c[3] - c[7];
-}
-
-void reference_hadamard8x8(const int16_t *a, int a_stride, int16_t *b) {
-  int16_t buf[64];
-  for (int i = 0; i < 8; ++i) {
-    hadamard_loop(a + i, a_stride, buf + i * 8);
-  }
-
-  for (int i = 0; i < 8; ++i) {
-    hadamard_loop(buf + i, 8, b + i * 8);
-  }
-}
-
-void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) {
-  /* The source is a 16x16 block. The destination is rearranged to 8x32.
-   * Input is 9 bit. */
-  reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
-  reference_hadamard8x8(a + 8 + 0 * a_stride, a_stride, b + 64);
-  reference_hadamard8x8(a + 0 + 8 * a_stride, a_stride, b + 128);
-  reference_hadamard8x8(a + 8 + 8 * a_stride, a_stride, b + 192);
-
-  /* Overlay the 8x8 blocks and combine. */
-  for (int i = 0; i < 64; ++i) {
-    /* 8x8 steps the range up to 15 bits. */
-    const int16_t a0 = b[0];
-    const int16_t a1 = b[64];
-    const int16_t a2 = b[128];
-    const int16_t a3 = b[192];
-
-    /* Prevent the result from escaping int16_t. */
-    const int16_t b0 = (a0 + a1) >> 1;
-    const int16_t b1 = (a0 - a1) >> 1;
-    const int16_t b2 = (a2 + a3) >> 1;
-    const int16_t b3 = (a2 - a3) >> 1;
-
-    /* Store a 16 bit value. */
-    b[0] = b0 + b2;
-    b[64] = b1 + b3;
-    b[128] = b0 - b2;
-    b[192] = b1 - b3;
-
-    ++b;
-  }
-}
-
-class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
- public:
-  virtual void SetUp() {
-    h_func_ = GetParam();
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-  }
-
- protected:
-  HadamardFunc h_func_;
-  ACMRandom rnd_;
-};
-
-class Hadamard8x8Test : public HadamardTestBase {};
-
-TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
-  DECLARE_ALIGNED(16, int16_t, a[64]);
-  DECLARE_ALIGNED(16, int16_t, b[64]);
-  int16_t b_ref[64];
-  for (int i = 0; i < 64; ++i) {
-    a[i] = rnd_.Rand9Signed();
-  }
-  memset(b, 0, sizeof(b));
-  memset(b_ref, 0, sizeof(b_ref));
-
-  reference_hadamard8x8(a, 8, b_ref);
-  ASM_REGISTER_STATE_CHECK(h_func_(a, 8, b));
-
-  // The order of the output is not important. Sort before checking.
-  std::sort(b, b + 64);
-  std::sort(b_ref, b_ref + 64);
-  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
-}
-
-TEST_P(Hadamard8x8Test, VaryStride) {
-  DECLARE_ALIGNED(16, int16_t, a[64 * 8]);
-  DECLARE_ALIGNED(16, int16_t, b[64]);
-  int16_t b_ref[64];
-  for (int i = 0; i < 64 * 8; ++i) {
-    a[i] = rnd_.Rand9Signed();
-  }
-
-  for (int i = 8; i < 64; i += 8) {
-    memset(b, 0, sizeof(b));
-    memset(b_ref, 0, sizeof(b_ref));
-
-    reference_hadamard8x8(a, i, b_ref);
-    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
-
-    // The order of the output is not important. Sort before checking.
-    std::sort(b, b + 64);
-    std::sort(b_ref, b_ref + 64);
-    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
-                        ::testing::Values(&aom_hadamard_8x8_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test,
-                        ::testing::Values(&aom_hadamard_8x8_sse2));
-#endif  // HAVE_SSE2
-
-#if HAVE_SSSE3 && ARCH_X86_64
-INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test,
-                        ::testing::Values(&aom_hadamard_8x8_ssse3));
-#endif  // HAVE_SSSE3 && ARCH_X86_64
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
-                        ::testing::Values(&aom_hadamard_8x8_neon));
-#endif  // HAVE_NEON
-
-class Hadamard16x16Test : public HadamardTestBase {};
-
-TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
-  DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
-  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
-  int16_t b_ref[16 * 16];
-  for (int i = 0; i < 16 * 16; ++i) {
-    a[i] = rnd_.Rand9Signed();
-  }
-  memset(b, 0, sizeof(b));
-  memset(b_ref, 0, sizeof(b_ref));
-
-  reference_hadamard16x16(a, 16, b_ref);
-  ASM_REGISTER_STATE_CHECK(h_func_(a, 16, b));
-
-  // The order of the output is not important. Sort before checking.
-  std::sort(b, b + 16 * 16);
-  std::sort(b_ref, b_ref + 16 * 16);
-  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
-}
-
-TEST_P(Hadamard16x16Test, VaryStride) {
-  DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]);
-  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
-  int16_t b_ref[16 * 16];
-  for (int i = 0; i < 16 * 16 * 8; ++i) {
-    a[i] = rnd_.Rand9Signed();
-  }
-
-  for (int i = 8; i < 64; i += 8) {
-    memset(b, 0, sizeof(b));
-    memset(b_ref, 0, sizeof(b_ref));
-
-    reference_hadamard16x16(a, i, b_ref);
-    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
-
-    // The order of the output is not important. Sort before checking.
-    std::sort(b, b + 16 * 16);
-    std::sort(b_ref, b_ref + 16 * 16);
-    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
-                        ::testing::Values(&aom_hadamard_16x16_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
-                        ::testing::Values(&aom_hadamard_16x16_sse2));
-#endif  // HAVE_SSE2
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
-                        ::testing::Values(&aom_hadamard_16x16_neon));
-#endif  // HAVE_NEON
-}  // namespace
diff --git a/third_party/aom/test/hash_test.cc b/third_party/aom/test/hash_test.cc
new file mode 100644
index 000000000..e9f7f63c9
--- /dev/null
+++ b/third_party/aom/test/hash_test.cc
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/hash.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+typedef uint32_t (*get_crc32c_value_func)(void *calculator, uint8_t *p,
+                                          int length);
+
+typedef ::testing::tuple<get_crc32c_value_func, int> HashParam;
+
+class AV1Crc32cHashTest : public ::testing::TestWithParam<HashParam> {
+ public:
+  ~AV1Crc32cHashTest();
+  void SetUp();
+
+  void TearDown();
+
+ protected:
+  void RunCheckOutput(get_crc32c_value_func test_impl);
+  void RunSpeedTest(get_crc32c_value_func test_impl);
+
+  void RunZeroTest(get_crc32c_value_func test_impl);
+
+  libaom_test::ACMRandom rnd_;
+  CRC32C calc_;
+  uint8_t *buffer_;
+  int bsize_;
+  int length_;
+};
+
+AV1Crc32cHashTest::~AV1Crc32cHashTest() { ; }
+
+void AV1Crc32cHashTest::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  av1_crc32c_calculator_init(&calc_);
+
+  bsize_ = GET_PARAM(1);
+  length_ = bsize_ * bsize_ * sizeof(uint16_t);
+  buffer_ = new uint8_t[length_];
+  ASSERT_TRUE(buffer_ != NULL);
+  for (int i = 0; i < length_; ++i) {
+    buffer_[i] = rnd_.Rand8();
+  }
+}
+
+void AV1Crc32cHashTest::TearDown() { delete[] buffer_; }
+
+void AV1Crc32cHashTest::RunCheckOutput(get_crc32c_value_func test_impl) {
+  get_crc32c_value_func ref_impl = av1_get_crc32c_value_c;
+  // for the same buffer crc should be the same
+  uint32_t crc0 = test_impl(&calc_, buffer_, length_);
+  uint32_t crc1 = test_impl(&calc_, buffer_, length_);
+  uint32_t crc2 = ref_impl(&calc_, buffer_, length_);
+  ASSERT_EQ(crc0, crc1);
+  ASSERT_EQ(crc0, crc2);  // should equal to software version
+  // modify buffer
+  buffer_[0] += 1;
+  uint32_t crc3 = test_impl(&calc_, buffer_, length_);
+  uint32_t crc4 = ref_impl(&calc_, buffer_, length_);
+  ASSERT_NE(crc0, crc3);  // crc shoud not equal to previous one
+  ASSERT_EQ(crc3, crc4);
+}
+
+void AV1Crc32cHashTest::RunSpeedTest(get_crc32c_value_func test_impl) {
+  get_crc32c_value_func impls[] = { av1_get_crc32c_value_c, test_impl };
+  const int repeat = 10000000 / (bsize_ + bsize_);
+
+  aom_usec_timer timer;
+  double time[2];
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer_start(&timer);
+    for (int j = 0; j < repeat; ++j) {
+      impls[i](&calc_, buffer_, length_);
+    }
+    aom_usec_timer_mark(&timer);
+    time[i] = static_cast<double>(aom_usec_timer_elapsed(&timer));
+  }
+  printf("hash %3dx%-3d:%7.2f/%7.2fus", bsize_, bsize_, time[0], time[1]);
+  printf("(%3.2f)\n", time[0] / time[1]);
+}
+
+void AV1Crc32cHashTest::RunZeroTest(get_crc32c_value_func test_impl) {
+  uint8_t buffer0[1024] = { 0 };
+  // for buffer with different size the crc should not be the same
+  const uint32_t crc0 = test_impl(&calc_, buffer0, 32);
+  const uint32_t crc1 = test_impl(&calc_, buffer0, 128);
+  const uint32_t crc2 = test_impl(&calc_, buffer0, 1024);
+  ASSERT_NE(crc0, crc1);
+  ASSERT_NE(crc0, crc2);
+  ASSERT_NE(crc1, crc2);
+}
+
+TEST_P(AV1Crc32cHashTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+
+TEST_P(AV1Crc32cHashTest, CheckZero) { RunZeroTest(GET_PARAM(0)); }
+
+TEST_P(AV1Crc32cHashTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+const int kValidBlockSize[] = { 64, 32, 8, 4 };
+
+INSTANTIATE_TEST_CASE_P(
+    C, AV1Crc32cHashTest,
+    ::testing::Combine(::testing::Values(&av1_get_crc32c_value_c),
+                       ::testing::ValuesIn(kValidBlockSize)));
+
+#if HAVE_SSE4_2
+INSTANTIATE_TEST_CASE_P(
+    SSE4_2, AV1Crc32cHashTest,
+    ::testing::Combine(::testing::Values(&av1_get_crc32c_value_sse4_2),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+}  // namespace
diff --git a/third_party/aom/test/hbd_metrics_test.cc b/third_party/aom/test/hbd_metrics_test.cc
index 4def53b21..09df9bde4 100644
--- a/third_party/aom/test/hbd_metrics_test.cc
+++ b/third_party/aom/test/hbd_metrics_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <math.h>
 #include <stdlib.h>
@@ -16,7 +16,9 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
 #include "aom_dsp/psnr.h"
 #include "aom_dsp/ssim.h"
 #include "aom_ports/mem.h"
@@ -171,7 +173,7 @@ class HBDMetricsTestBase {
   HBDMetricFunc hbd_metric_;
 };
 
-typedef std::tr1::tuple<LBDMetricFunc, HBDMetricFunc, int, int, double>
+typedef ::testing::tuple<LBDMetricFunc, HBDMetricFunc, int, int, double>
     MetricTestTParam;
 class HBDMetricsTest : public HBDMetricsTestBase,
                        public ::testing::TestWithParam<MetricTestTParam> {
diff --git a/third_party/aom/test/hiprec_convolve_test.cc b/third_party/aom/test/hiprec_convolve_test.cc
index 78e109c9d..f94a0730c 100644
--- a/third_party/aom/test/hiprec_convolve_test.cc
+++ b/third_party/aom/test/hiprec_convolve_test.cc
@@ -12,33 +12,51 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/hiprec_convolve_test_util.h"
 
-using std::tr1::tuple;
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
+using ::testing::tuple;
 using libaom_test::ACMRandom;
-using libaom_test::AV1HiprecConvolve::AV1HiprecConvolveTest;
-#if CONFIG_HIGHBITDEPTH
 using libaom_test::AV1HighbdHiprecConvolve::AV1HighbdHiprecConvolveTest;
-#endif
+using libaom_test::AV1HiprecConvolve::AV1HiprecConvolveTest;
 
 namespace {
 
-#if HAVE_SSE2
 TEST_P(AV1HiprecConvolveTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
-
+TEST_P(AV1HiprecConvolveTest, DISABLED_SpeedTest) {
+  RunSpeedTest(GET_PARAM(3));
+}
+#if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, AV1HiprecConvolveTest,
                         libaom_test::AV1HiprecConvolve::BuildParams(
-                            aom_convolve8_add_src_hip_sse2));
+                            av1_wiener_convolve_add_src_sse2));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HiprecConvolveTest,
+                        libaom_test::AV1HiprecConvolve::BuildParams(
+                            av1_wiener_convolve_add_src_avx2));
+#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, AV1HiprecConvolveTest,
+                        libaom_test::AV1HiprecConvolve::BuildParams(
+                            av1_wiener_convolve_add_src_neon));
 #endif
 
-#if CONFIG_HIGHBITDEPTH && HAVE_SSSE3
+#if HAVE_SSSE3 || HAVE_AVX2
 TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) {
   RunCheckOutput(GET_PARAM(4));
 }
-
+TEST_P(AV1HighbdHiprecConvolveTest, DISABLED_SpeedTest) {
+  RunSpeedTest(GET_PARAM(4));
+}
+#if HAVE_SSSE3
 INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdHiprecConvolveTest,
                         libaom_test::AV1HighbdHiprecConvolve::BuildParams(
-                            aom_highbd_convolve8_add_src_hip_ssse3));
-
+                            av1_highbd_wiener_convolve_add_src_ssse3));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdHiprecConvolveTest,
+                        libaom_test::AV1HighbdHiprecConvolve::BuildParams(
+                            av1_highbd_wiener_convolve_add_src_avx2));
+#endif
 #endif
 
 }  // namespace
diff --git a/third_party/aom/test/hiprec_convolve_test_util.cc b/third_party/aom/test/hiprec_convolve_test_util.cc
index 4dee6ab4d..2672bcec3 100644
--- a/third_party/aom/test/hiprec_convolve_test_util.cc
+++ b/third_party/aom/test/hiprec_convolve_test_util.cc
@@ -13,8 +13,8 @@
 
 #include "av1/common/restoration.h"
 
-using std::tr1::tuple;
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
+using ::testing::tuple;
 
 namespace libaom_test {
 
@@ -52,8 +52,13 @@ namespace AV1HiprecConvolve {
 ::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
     hiprec_convolve_func filter) {
   const HiprecConvolveParam params[] = {
-    make_tuple(8, 8, 50000, filter), make_tuple(64, 64, 1000, filter),
-    make_tuple(32, 8, 10000, filter),
+    make_tuple(8, 8, 50000, filter),   make_tuple(8, 4, 50000, filter),
+    make_tuple(64, 24, 1000, filter),  make_tuple(64, 64, 1000, filter),
+    make_tuple(64, 56, 1000, filter),  make_tuple(32, 8, 10000, filter),
+    make_tuple(32, 28, 10000, filter), make_tuple(32, 32, 10000, filter),
+    make_tuple(16, 34, 10000, filter), make_tuple(32, 34, 10000, filter),
+    make_tuple(64, 34, 1000, filter),  make_tuple(8, 17, 10000, filter),
+    make_tuple(16, 17, 10000, filter), make_tuple(32, 17, 10000, filter)
   };
   return ::testing::ValuesIn(params);
 }
@@ -70,14 +75,15 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
   const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
   const int num_iters = GET_PARAM(2);
   int i, j;
+  const ConvolveParams conv_params = get_conv_params_wiener(8);
 
   uint8_t *input_ = new uint8_t[h * w];
   uint8_t *input = input_;
 
-  // The convolve functions always write rows with widths that are multiples of
-  // 8.
-  // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
-  int output_n = ((out_w + 7) & ~7) * out_h;
+  // The AVX2 convolve functions always write rows with widths that are
+  // multiples of 16. So to avoid a buffer overflow, we may need to pad
+  // rows to a multiple of 16.
+  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
   uint8_t *output = new uint8_t[output_n];
   uint8_t *output2 = new uint8_t[output_n];
 
@@ -94,10 +100,11 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
     // Choose random locations within the source block
     int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
     int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-    aom_convolve8_add_src_hip_c(input + offset_r * w + offset_c, w, output,
-                                out_w, hkernel, 16, vkernel, 16, out_w, out_h);
+    av1_wiener_convolve_add_src_c(input + offset_r * w + offset_c, w, output,
+                                  out_w, hkernel, 16, vkernel, 16, out_w, out_h,
+                                  &conv_params);
     test_impl(input + offset_r * w + offset_c, w, output2, out_w, hkernel, 16,
-              vkernel, 16, out_w, out_h);
+              vkernel, 16, out_w, out_h, &conv_params);
 
     for (j = 0; j < out_w * out_h; ++j)
       ASSERT_EQ(output[j], output2[j])
@@ -108,9 +115,74 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
   delete[] output;
   delete[] output2;
 }
+
+void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2) / 500;
+  int i, j, k;
+  const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+  uint8_t *input_ = new uint8_t[h * w];
+  uint8_t *input = input_;
+
+  // The AVX2 convolve functions always write rows with widths that are
+  // multiples of 16. So to avoid a buffer overflow, we may need to pad
+  // rows to a multiple of 16.
+  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+  uint8_t *output = new uint8_t[output_n];
+  uint8_t *output2 = new uint8_t[output_n];
+
+  // Generate random filter kernels
+  DECLARE_ALIGNED(16, InterpKernel, hkernel);
+  DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+  generate_kernels(&rnd_, hkernel, vkernel);
+
+  for (i = 0; i < h; ++i)
+    for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+
+  aom_usec_timer ref_timer;
+  aom_usec_timer_start(&ref_timer);
+  for (i = 0; i < num_iters; ++i) {
+    for (j = 3; j < h - out_h - 4; j++) {
+      for (k = 3; k < w - out_w - 4; k++) {
+        av1_wiener_convolve_add_src_c(input + j * w + k, w, output, out_w,
+                                      hkernel, 16, vkernel, 16, out_w, out_h,
+                                      &conv_params);
+      }
+    }
+  }
+  aom_usec_timer_mark(&ref_timer);
+  const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer tst_timer;
+  aom_usec_timer_start(&tst_timer);
+  for (i = 0; i < num_iters; ++i) {
+    for (j = 3; j < h - out_h - 4; j++) {
+      for (k = 3; k < w - out_w - 4; k++) {
+        test_impl(input + j * w + k, w, output2, out_w, hkernel, 16, vkernel,
+                  16, out_w, out_h, &conv_params);
+      }
+    }
+  }
+  aom_usec_timer_mark(&tst_timer);
+  const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+  std::cout << "[          ] C time = " << ref_time / 1000
+            << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+  EXPECT_GT(ref_time, tst_time)
+      << "Error: AV1HiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
+      << "C time: " << ref_time << " us\n"
+      << "SIMD time: " << tst_time << " us\n";
+
+  delete[] input_;
+  delete[] output;
+  delete[] output2;
+}
 }  // namespace AV1HiprecConvolve
 
-#if CONFIG_HIGHBITDEPTH
 namespace AV1HighbdHiprecConvolve {
 
 ::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
@@ -141,13 +213,14 @@ void AV1HighbdHiprecConvolveTest::RunCheckOutput(
   const int num_iters = GET_PARAM(2);
   const int bd = GET_PARAM(3);
   int i, j;
+  const ConvolveParams conv_params = get_conv_params_wiener(bd);
 
   uint16_t *input = new uint16_t[h * w];
 
-  // The convolve functions always write rows with widths that are multiples of
-  // 8.
-  // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
-  int output_n = ((out_w + 7) & ~7) * out_h;
+  // The AVX2 convolve functions always write rows with widths that are
+  // multiples of 16. So to avoid a buffer overflow, we may need to pad
+  // rows to a multiple of 16.
+  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
   uint16_t *output = new uint16_t[output_n];
   uint16_t *output2 = new uint16_t[output_n];
 
@@ -168,11 +241,11 @@ void AV1HighbdHiprecConvolveTest::RunCheckOutput(
     // Choose random locations within the source block
     int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
     int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-    aom_highbd_convolve8_add_src_hip_c(input_ptr + offset_r * w + offset_c, w,
-                                       output_ptr, out_w, hkernel, 16, vkernel,
-                                       16, out_w, out_h, bd);
+    av1_highbd_wiener_convolve_add_src_c(
+        input_ptr + offset_r * w + offset_c, w, output_ptr, out_w, hkernel, 16,
+        vkernel, 16, out_w, out_h, &conv_params, bd);
     test_impl(input_ptr + offset_r * w + offset_c, w, output2_ptr, out_w,
-              hkernel, 16, vkernel, 16, out_w, out_h, bd);
+              hkernel, 16, vkernel, 16, out_w, out_h, &conv_params, bd);
 
     for (j = 0; j < out_w * out_h; ++j)
       ASSERT_EQ(output[j], output2[j])
@@ -183,6 +256,76 @@ void AV1HighbdHiprecConvolveTest::RunCheckOutput(
   delete[] output;
   delete[] output2;
 }
+
+void AV1HighbdHiprecConvolveTest::RunSpeedTest(
+    highbd_hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2) / 500;
+  const int bd = GET_PARAM(3);
+  int i, j, k;
+  const ConvolveParams conv_params = get_conv_params_wiener(bd);
+
+  uint16_t *input = new uint16_t[h * w];
+
+  // The AVX2 convolve functions always write rows with widths that are
+  // multiples of 16. So to avoid a buffer overflow, we may need to pad
+  // rows to a multiple of 16.
+  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+  uint16_t *output = new uint16_t[output_n];
+  uint16_t *output2 = new uint16_t[output_n];
+
+  // Generate random filter kernels
+  DECLARE_ALIGNED(16, InterpKernel, hkernel);
+  DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+  generate_kernels(&rnd_, hkernel, vkernel);
+
+  for (i = 0; i < h; ++i)
+    for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+
+  uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input);
+  uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output);
+  uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2);
+
+  aom_usec_timer ref_timer;
+  aom_usec_timer_start(&ref_timer);
+  for (i = 0; i < num_iters; ++i) {
+    for (j = 3; j < h - out_h - 4; j++) {
+      for (k = 3; k < w - out_w - 4; k++) {
+        av1_highbd_wiener_convolve_add_src_c(
+            input_ptr + j * w + k, w, output_ptr, out_w, hkernel, 16, vkernel,
+            16, out_w, out_h, &conv_params, bd);
+      }
+    }
+  }
+  aom_usec_timer_mark(&ref_timer);
+  const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer tst_timer;
+  aom_usec_timer_start(&tst_timer);
+  for (i = 0; i < num_iters; ++i) {
+    for (j = 3; j < h - out_h - 4; j++) {
+      for (k = 3; k < w - out_w - 4; k++) {
+        test_impl(input_ptr + j * w + k, w, output2_ptr, out_w, hkernel, 16,
+                  vkernel, 16, out_w, out_h, &conv_params, bd);
+      }
+    }
+  }
+  aom_usec_timer_mark(&tst_timer);
+  const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+  std::cout << "[          ] C time = " << ref_time / 1000
+            << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+  EXPECT_GT(ref_time, tst_time)
+      << "Error: AV1HighbdHiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
+      << "C time: " << ref_time << " us\n"
+      << "SIMD time: " << tst_time << " us\n";
+
+  delete[] input;
+  delete[] output;
+  delete[] output2;
+}
 }  // namespace AV1HighbdHiprecConvolve
-#endif  // CONFIG_HIGHBITDEPTH
 }  // namespace libaom_test
diff --git a/third_party/aom/test/hiprec_convolve_test_util.h b/third_party/aom/test/hiprec_convolve_test_util.h
index fe31570f5..81471c8b9 100644
--- a/third_party/aom/test/hiprec_convolve_test_util.h
+++ b/third_party/aom/test/hiprec_convolve_test_util.h
@@ -12,14 +12,16 @@
 #ifndef TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
 #define TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
 
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "config/av1_rtcd.h"
+
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
 #include "av1/common/mv.h"
 
 namespace libaom_test {
@@ -30,9 +32,10 @@ typedef void (*hiprec_convolve_func)(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
-                                     int w, int h);
+                                     int w, int h,
+                                     const ConvolveParams *conv_params);
 
-typedef std::tr1::tuple<int, int, int, hiprec_convolve_func>
+typedef ::testing::tuple<int, int, int, hiprec_convolve_func>
     HiprecConvolveParam;
 
 ::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
@@ -48,20 +51,21 @@ class AV1HiprecConvolveTest
 
  protected:
   void RunCheckOutput(hiprec_convolve_func test_impl);
+  void RunSpeedTest(hiprec_convolve_func test_impl);
 
   libaom_test::ACMRandom rnd_;
 };
 
 }  // namespace AV1HiprecConvolve
 
-#if CONFIG_HIGHBITDEPTH
 namespace AV1HighbdHiprecConvolve {
 typedef void (*highbd_hiprec_convolve_func)(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+    const int16_t *filter_y, int y_step_q4, int w, int h,
+    const ConvolveParams *conv_params, int bps);
 
-typedef std::tr1::tuple<int, int, int, int, highbd_hiprec_convolve_func>
+typedef ::testing::tuple<int, int, int, int, highbd_hiprec_convolve_func>
     HighbdHiprecConvolveParam;
 
 ::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
@@ -77,12 +81,12 @@ class AV1HighbdHiprecConvolveTest
 
  protected:
   void RunCheckOutput(highbd_hiprec_convolve_func test_impl);
+  void RunSpeedTest(highbd_hiprec_convolve_func test_impl);
 
   libaom_test::ACMRandom rnd_;
 };
 
 }  // namespace AV1HighbdHiprecConvolve
-#endif  // CONFIG_HIGHBITDEPTH
 
 }  // namespace libaom_test
 
diff --git a/third_party/aom/test/horz_superres_test.cc b/third_party/aom/test/horz_superres_test.cc
new file mode 100644
index 000000000..973f55b66
--- /dev/null
+++ b/third_party/aom/test/horz_superres_test.cc
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "av1/encoder/encoder.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+
+/* TESTING PARAMETERS */
+
+#define NUM_TEST_VIDEOS 3
+
+const int kBitrate = 40;
+
+// PSNR thresholds found by experiment
+const double kPSNRThresholds[] = { 26.0, 28.0, 20.0 };
+
+typedef struct {
+  const char *filename;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+  unsigned int limit;
+  unsigned int screen_content;
+} TestVideoParam;
+
+const TestVideoParam kTestVideoVectors[] = {
+  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0 },
+  { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0 },
+  { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1 },
+};
+
+// Superres modes tested
+// SUPERRES_QTHRESH is not included, as it has its own test
+const SUPERRES_MODE kSuperresModesNotQThresh[] = { SUPERRES_FIXED,
+                                                   SUPERRES_RANDOM };
+
+// Superres denominators and superres kf denominators to be tested
+typedef tuple<int, int> SuperresDenominatorPair;
+const SuperresDenominatorPair kSuperresDenominators[] = {
+  make_tuple(16, 9),  make_tuple(13, 11), make_tuple(9, 9),
+  make_tuple(13, 13), make_tuple(11, 16), make_tuple(8, 16),
+  make_tuple(16, 8),  make_tuple(8, 8),   make_tuple(9, 14),
+};
+
+// Superres q thresholds and superres kf q thresholds to be tested
+typedef tuple<int, int> SuperresQThresholdPair;
+const SuperresQThresholdPair kSuperresQThresholds[] = {
+  make_tuple(63, 63), make_tuple(63, 41), make_tuple(17, 63),
+  make_tuple(41, 11), make_tuple(1, 37),  make_tuple(11, 11),
+  make_tuple(1, 1),   make_tuple(17, 29), make_tuple(29, 11),
+};
+
+/* END (TESTING PARAMETERS) */
+
+// Test parameter list:
+//  <[needed for EncoderTest], test_video_idx_, superres_mode_,
+//  tuple(superres_denom_, superres_kf_denom_)>
+typedef tuple<const libaom_test::CodecFactory *, int, SUPERRES_MODE,
+              SuperresDenominatorPair>
+    HorzSuperresTestParam;
+
+class HorzSuperresEndToEndTest
+    : public ::testing::TestWithParam<HorzSuperresTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  HorzSuperresEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_idx_(GET_PARAM(1)),
+        superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {
+    test_video_param_ = kTestVideoVectors[test_video_idx_];
+
+    SuperresDenominatorPair denoms = GET_PARAM(3);
+    superres_denom_ = ::testing::get<0>(denoms);
+    superres_kf_denom_ = ::testing::get<1>(denoms);
+  }
+
+  virtual ~HorzSuperresEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    // Set superres parameters
+    cfg_.rc_superres_mode = superres_mode_;
+    cfg_.rc_superres_denominator = superres_denom_;
+    cfg_.rc_superres_kf_denominator = superres_kf_denom_;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    frame_count_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    frame_count_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+
+      // Set cpu-used = 8 for speed
+      encoder->Control(AOME_SET_CPUUSED, 8);
+
+      // Test screen coding tools
+      if (test_video_param_.screen_content)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      else
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (frame_count_) return psnr_ / frame_count_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() { return kPSNRThresholds[test_video_idx_]; }
+
+  void DoTest() {
+    testing::internal::scoped_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                test_video_param_.limit));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_denom_ = " << superres_denom_
+        << ", superres_kf_denom_ = " << superres_kf_denom_;
+
+    EXPECT_EQ(test_video_param_.limit, frame_count_)
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_denom_ = " << superres_denom_
+        << ", superres_kf_denom_ = " << superres_kf_denom_;
+  }
+
+  int test_video_idx_;
+  TestVideoParam test_video_param_;
+  SUPERRES_MODE superres_mode_;
+  int superres_denom_;
+  int superres_kf_denom_;
+
+ private:
+  double psnr_;
+  unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest,
+                          ::testing::Range(0, NUM_TEST_VIDEOS),
+                          ::testing::ValuesIn(kSuperresModesNotQThresh),
+                          ::testing::ValuesIn(kSuperresDenominators));
+
+// Test parameter list:
+//  <[needed for EncoderTest], test_video_idx_, tuple(superres_denom_,
+//  superres_kf_denom_), tuple(superres_qthresh_,superres_kf_qthresh_)>
+typedef tuple<const libaom_test::CodecFactory *, int, SuperresDenominatorPair,
+              SuperresQThresholdPair>
+    HorzSuperresQThreshTestParam;
+
+class HorzSuperresQThreshEndToEndTest
+    : public ::testing::TestWithParam<HorzSuperresQThreshTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  HorzSuperresQThreshEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_idx_(GET_PARAM(1)),
+        superres_mode_(SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
+    test_video_param_ = kTestVideoVectors[test_video_idx_];
+
+    SuperresDenominatorPair denoms = GET_PARAM(2);
+    superres_denom_ = ::testing::get<0>(denoms);
+    superres_kf_denom_ = ::testing::get<1>(denoms);
+
+    SuperresQThresholdPair qthresholds = GET_PARAM(3);
+    superres_qthresh_ = ::testing::get<0>(qthresholds);
+    superres_kf_qthresh_ = ::testing::get<1>(qthresholds);
+  }
+
+  virtual ~HorzSuperresQThreshEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    // Set superres parameters
+    cfg_.rc_superres_mode = superres_mode_;
+    cfg_.rc_superres_denominator = superres_denom_;
+    cfg_.rc_superres_kf_denominator = superres_kf_denom_;
+    cfg_.rc_superres_qthresh = superres_qthresh_;
+    cfg_.rc_superres_kf_qthresh = superres_kf_qthresh_;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    frame_count_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    frame_count_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
+
+      // Set cpu-used = 8 for speed
+      encoder->Control(AOME_SET_CPUUSED, 8);
+
+      // Test screen coding tools
+      if (test_video_param_.screen_content)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      else
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (frame_count_) return psnr_ / frame_count_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() { return kPSNRThresholds[test_video_idx_]; }
+
+  void DoTest() {
+    testing::internal::scoped_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                test_video_param_.limit));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_denom_ = " << superres_denom_
+        << ", superres_kf_denom_ = " << superres_kf_denom_
+        << ", superres_qthresh_ = " << superres_qthresh_
+        << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
+
+    EXPECT_EQ(test_video_param_.limit, frame_count_)
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_denom_ = " << superres_denom_
+        << ", superres_kf_denom_ = " << superres_kf_denom_
+        << ", superres_qthresh_ = " << superres_qthresh_
+        << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
+  }
+
+  int test_video_idx_;
+  TestVideoParam test_video_param_;
+  SUPERRES_MODE superres_mode_;
+  int superres_denom_;
+  int superres_kf_denom_;
+  int superres_qthresh_;
+  int superres_kf_qthresh_;
+
+ private:
+  double psnr_;
+  unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresQThreshEndToEndTest, HorzSuperresQThreshEndToEndPSNRTest) {
+  DoTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresQThreshEndToEndTest,
+                          ::testing::Range(0, NUM_TEST_VIDEOS),
+                          ::testing::ValuesIn(kSuperresDenominators),
+                          ::testing::ValuesIn(kSuperresQThresholds));
+
+}  // namespace
diff --git a/third_party/aom/test/idct8x8_test.cc b/third_party/aom/test/idct8x8_test.cc
deleted file mode 100644
index f99a4075f..000000000
--- a/third_party/aom/test/idct8x8_test.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/msvc.h"  // for round()
-
-using libaom_test::ACMRandom;
-
-namespace {
-
-void reference_dct_1d(double input[8], double output[8]) {
-  const double kPi = 3.141592653589793238462643383279502884;
-  const double kInvSqrt2 = 0.707106781186547524400844362104;
-  for (int k = 0; k < 8; k++) {
-    output[k] = 0.0;
-    for (int n = 0; n < 8; n++)
-      output[k] += input[n] * cos(kPi * (2 * n + 1) * k / 16.0);
-    if (k == 0) output[k] = output[k] * kInvSqrt2;
-  }
-}
-
-void reference_dct_2d(int16_t input[64], double output[64]) {
-  // First transform columns
-  for (int i = 0; i < 8; ++i) {
-    double temp_in[8], temp_out[8];
-    for (int j = 0; j < 8; ++j) temp_in[j] = input[j * 8 + i];
-    reference_dct_1d(temp_in, temp_out);
-    for (int j = 0; j < 8; ++j) output[j * 8 + i] = temp_out[j];
-  }
-  // Then transform rows
-  for (int i = 0; i < 8; ++i) {
-    double temp_in[8], temp_out[8];
-    for (int j = 0; j < 8; ++j) temp_in[j] = output[j + i * 8];
-    reference_dct_1d(temp_in, temp_out);
-    for (int j = 0; j < 8; ++j) output[j + i * 8] = temp_out[j];
-  }
-  // Scale by some magic number
-  for (int i = 0; i < 64; ++i) output[i] *= 2;
-}
-
-TEST(AV1Idct8x8Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 10000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t input[64];
-    tran_low_t coeff[64];
-    double output_r[64];
-    uint8_t dst[64], src[64];
-
-    for (int j = 0; j < 64; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 64; ++j) input[j] = src[j] - dst[j];
-
-    reference_dct_2d(input, output_r);
-    for (int j = 0; j < 64; ++j)
-      coeff[j] = static_cast<tran_low_t>(round(output_r[j]));
-    aom_idct8x8_64_add_c(coeff, dst, 8);
-    for (int j = 0; j < 64; ++j) {
-      const int diff = dst[j] - src[j];
-      const int error = diff * diff;
-      EXPECT_GE(1, error) << "Error: 8x8 FDCT/IDCT has error " << error
-                          << " at index " << j;
-    }
-  }
-}
-
-}  // namespace
diff --git a/third_party/aom/test/intra_edge_test.cc b/third_party/aom/test/intra_edge_test.cc
new file mode 100644
index 000000000..ce61402ac
--- /dev/null
+++ b/third_party/aom/test/intra_edge_test.cc
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+#include "test/function_equivalence_test.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+template <typename F, typename T>
+class UpsampleTest : public FunctionEquivalenceTest<F> {
+ protected:
+  static const int kIterations = 1000000;
+  static const int kMinEdge = 4;
+  static const int kMaxEdge = 24;
+  static const int kBufSize = 2 * 64 + 32;
+  static const int kOffset = 16;
+
+  virtual ~UpsampleTest() {}
+
+  virtual void Execute(T *edge_tst) = 0;
+
+  void Common() {
+    edge_ref_ = &edge_ref_data_[kOffset];
+    edge_tst_ = &edge_tst_data_[kOffset];
+
+    Execute(edge_tst_);
+
+    const int max_idx = (size_ - 1) * 2;
+    for (int r = -2; r <= max_idx; ++r) {
+      ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
+    }
+  }
+
+  T edge_ref_data_[kBufSize];
+  T edge_tst_data_[kBufSize];
+
+  T *edge_ref_;
+  T *edge_tst_;
+
+  int size_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*UP8B)(uint8_t *p, int size);
+typedef libaom_test::FuncParam<UP8B> TestFuncs;
+
+class UpsampleTest8B : public UpsampleTest<UP8B, uint8_t> {
+ protected:
+  void Execute(uint8_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));
+  }
+};
+
+TEST_P(UpsampleTest8B, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    size_ = 4 * (this->rng_(4) + 1);
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_.Rand8();
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = edge_ref_data_[i];
+    }
+
+    // Extend final sample
+    while (i < kBufSize) {
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+      i++;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, UpsampleTest8B,
+    ::testing::Values(TestFuncs(av1_upsample_intra_edge_c,
+                                av1_upsample_intra_edge_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*UPHB)(uint16_t *p, int size, int bd);
+typedef libaom_test::FuncParam<UPHB> TestFuncsHBD;
+
+class UpsampleTestHB : public UpsampleTest<UPHB, uint16_t> {
+ protected:
+  void Execute(uint16_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));
+  }
+  int bit_depth_;
+};
+
+TEST_P(UpsampleTestHB, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+      case 0: bit_depth_ = 8; break;
+      case 1: bit_depth_ = 10; break;
+      default: bit_depth_ = 12; break;
+    }
+    const int hi = 1 << bit_depth_;
+
+    size_ = 4 * (this->rng_(4) + 1);
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_(hi);
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+    }
+
+    // Extend final sample
+    while (i < kBufSize) {
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+      i++;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, UpsampleTestHB,
+    ::testing::Values(TestFuncsHBD(av1_upsample_intra_edge_high_c,
+                                   av1_upsample_intra_edge_high_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+template <typename F, typename T>
+class FilterEdgeTest : public FunctionEquivalenceTest<F> {
+ protected:
+  static const int kIterations = 1000000;
+  static const int kMaxEdge = 2 * 64;
+  static const int kBufSize = kMaxEdge + 32;
+  static const int kOffset = 15;
+
+  virtual ~FilterEdgeTest() {}
+
+  virtual void Execute(T *edge_tst) = 0;
+
+  void Common() {
+    edge_ref_ = &edge_ref_data_[kOffset];
+    edge_tst_ = &edge_tst_data_[kOffset];
+
+    Execute(edge_tst_);
+
+    for (int r = 0; r < size_; ++r) {
+      ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
+    }
+  }
+
+  T edge_ref_data_[kBufSize];
+  T edge_tst_data_[kBufSize];
+
+  T *edge_ref_;
+  T *edge_tst_;
+
+  int size_;
+  int strength_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FE8B)(uint8_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FE8B> FilterEdgeTestFuncs;
+
+class FilterEdgeTest8B : public FilterEdgeTest<FE8B, uint8_t> {
+ protected:
+  void Execute(uint8_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_, strength_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+  }
+};
+
+TEST_P(FilterEdgeTest8B, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    strength_ = this->rng_(4);
+    size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_.Rand8();
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, FilterEdgeTest8B,
+    ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
+                                          av1_filter_intra_edge_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FEHB)(uint16_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FEHB> FilterEdgeTestFuncsHBD;
+
+class FilterEdgeTestHB : public FilterEdgeTest<FEHB, uint16_t> {
+ protected:
+  void Execute(uint16_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_, strength_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+  }
+  int bit_depth_;
+};
+
+TEST_P(FilterEdgeTestHB, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+      case 0: bit_depth_ = 8; break;
+      case 1: bit_depth_ = 10; break;
+      default: bit_depth_ = 12; break;
+    }
+    const int hi = 1 << bit_depth_;
+    strength_ = this->rng_(4);
+    size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_(hi);
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, FilterEdgeTestHB,
+                        ::testing::Values(FilterEdgeTestFuncsHBD(
+                            av1_filter_intra_edge_high_c,
+                            av1_filter_intra_edge_high_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+// Speed tests
+
+TEST_P(UpsampleTest8B, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_.Rand8();
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
+  }
+}
+
+TEST_P(UpsampleTestHB, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  bit_depth_ = 12;
+  const int hi = 1 << bit_depth_;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_(hi);
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, bit_depth_));
+  }
+}
+
+TEST_P(FilterEdgeTest8B, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  strength_ = 1;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_.Rand8();
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+    // iterate over filter strengths (1,2,3)
+    strength_ = (strength_ == 3) ? 1 : strength_ + 1;
+  }
+}
+
+TEST_P(FilterEdgeTestHB, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  strength_ = 1;
+  bit_depth_ = 12;
+  const int hi = 1 << bit_depth_;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_(hi);
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+    // iterate over filter strengths (1,2,3)
+    strength_ = (strength_ == 3) ? 1 : strength_ + 1;
+  }
+}
+
+}  // namespace
diff --git a/third_party/aom/test/intrabc_test.cc b/third_party/aom/test/intrabc_test.cc
index 84cfa5c48..3ea421708 100644
--- a/third_party/aom/test/intrabc_test.cc
+++ b/third_party/aom/test/intrabc_test.cc
@@ -11,10 +11,13 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
 #include "av1/common/enums.h"
 #include "av1/common/mv.h"
 #include "av1/common/mvref_common.h"
+#include "av1/common/onyxc_int.h"
 #include "av1/common/tile_common.h"
 
 namespace {
@@ -29,9 +32,7 @@ TEST(IntrabcTest, DvValidation) {
   const int kSubPelScale = 8;
   const int kTileMaxMibWidth = 8;
   const DvTestCase kDvCases[] = {
-#if CONFIG_EXT_PARTITION
     { { 0, 0 }, 0, 0, BLOCK_128X128, false },
-#endif
     { { 0, 0 }, 0, 0, BLOCK_64X64, false },
     { { 0, 0 }, 0, 0, BLOCK_32X32, false },
     { { 0, 0 }, 0, 0, BLOCK_16X16, false },
@@ -46,7 +47,7 @@ TEST(IntrabcTest, DvValidation) {
       MAX_SB_SIZE / MI_SIZE,
       MAX_SB_SIZE / MI_SIZE,
       BLOCK_16X16,
-      true },
+      false },
     { { -MAX_SB_SIZE * kSubPelScale, 0 },
       MAX_SB_SIZE / MI_SIZE,
       MAX_SB_SIZE / MI_SIZE,
@@ -111,7 +112,7 @@ TEST(IntrabcTest, DvValidation) {
       MAX_SB_SIZE / MI_SIZE,
       MAX_SB_SIZE / MI_SIZE,
       BLOCK_LARGEST,
-      true },
+      false },
     { { -MAX_SB_SIZE * kSubPelScale, -(MAX_SB_SIZE - 1) * kSubPelScale },
       MAX_SB_SIZE / MI_SIZE,
       MAX_SB_SIZE / MI_SIZE,
@@ -126,13 +127,13 @@ TEST(IntrabcTest, DvValidation) {
       MAX_SB_SIZE / MI_SIZE,
       MAX_SB_SIZE / MI_SIZE,
       BLOCK_LARGEST,
-      true },
+      false },
     { { -MAX_SB_SIZE * kSubPelScale,
         (kTileMaxMibWidth - 2) * MAX_SB_SIZE * kSubPelScale },
       MAX_SB_SIZE / MI_SIZE,
       MAX_SB_SIZE / MI_SIZE,
       BLOCK_LARGEST,
-      true },
+      false },
     { { -MAX_SB_SIZE * kSubPelScale,
         ((kTileMaxMibWidth - 2) * MAX_SB_SIZE + 1) * kSubPelScale },
       MAX_SB_SIZE / MI_SIZE,
@@ -140,17 +141,27 @@ TEST(IntrabcTest, DvValidation) {
       BLOCK_LARGEST,
       false },
   };
-  TileInfo tile;
-  tile.mi_row_start = 8 * MAX_MIB_SIZE;
-  tile.mi_row_end = 16 * MAX_MIB_SIZE;
-  tile.mi_col_start = 24 * MAX_MIB_SIZE;
-  tile.mi_col_end = tile.mi_col_start + kTileMaxMibWidth * MAX_MIB_SIZE;
+
+  MACROBLOCKD xd;
+  memset(&xd, 0, sizeof(xd));
+  xd.tile.mi_row_start = 8 * MAX_MIB_SIZE;
+  xd.tile.mi_row_end = 16 * MAX_MIB_SIZE;
+  xd.tile.mi_col_start = 24 * MAX_MIB_SIZE;
+  xd.tile.mi_col_end = xd.tile.mi_col_start + kTileMaxMibWidth * MAX_MIB_SIZE;
+  xd.plane[1].subsampling_x = 1;
+  xd.plane[1].subsampling_y = 1;
+  xd.plane[2].subsampling_x = 1;
+  xd.plane[2].subsampling_y = 1;
+
+  AV1_COMMON cm;
+  memset(&cm, 0, sizeof(cm));
+
   for (int i = 0; i < static_cast<int>(GTEST_ARRAY_SIZE_(kDvCases)); ++i) {
-    EXPECT_EQ(kDvCases[i].valid,
-              is_dv_valid(kDvCases[i].dv, &tile,
-                          tile.mi_row_start + kDvCases[i].mi_row_offset,
-                          tile.mi_col_start + kDvCases[i].mi_col_offset,
-                          kDvCases[i].bsize))
+    EXPECT_EQ(static_cast<int>(kDvCases[i].valid),
+              av1_is_dv_valid(kDvCases[i].dv, &cm, &xd,
+                              xd.tile.mi_row_start + kDvCases[i].mi_row_offset,
+                              xd.tile.mi_col_start + kDvCases[i].mi_col_offset,
+                              kDvCases[i].bsize, MAX_MIB_SIZE_LOG2))
         << "DvCases[" << i << "]";
   }
 }
diff --git a/third_party/aom/test/intrapred_test.cc b/third_party/aom/test/intrapred_test.cc
index 12da1601c..82f191449 100644
--- a/third_party/aom/test/intrapred_test.cc
+++ b/third_party/aom/test/intrapred_test.cc
@@ -7,19 +7,21 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <string>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/blockd.h"
+#include "av1/common/common.h"
 #include "av1/common/pred_common.h"
 #include "aom_mem/aom_mem.h"
 
@@ -139,33 +141,42 @@ class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
   }
 };
 
+// Suppress an unitialized warning. Once there are implementations to test then
+// this can be restored.
 TEST_P(HighbdIntraPredTest, Bitexact) {
-  // max block size is 32
-  DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]);
-  DECLARE_ALIGNED(16, uint16_t, above_data[2 * 32 + 32]);
-  DECLARE_ALIGNED(16, uint16_t, dst[3 * 32 * 32]);
-  DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 32 * 32]);
+  // max block size is 64
+  DECLARE_ALIGNED(16, uint16_t, left_col[2 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[2 * 64 + 64]);
+  DECLARE_ALIGNED(16, uint16_t, dst[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 64 * 64]);
+  av1_zero(left_col);
+  av1_zero(above_data);
   RunTest(left_col, above_data, dst, ref_dst);
 }
 
+// Same issue as above but for arm.
+#if !HAVE_NEON
 TEST_P(LowbdIntraPredTest, Bitexact) {
   // max block size is 32
   DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
   DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
   DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]);
   DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]);
+  av1_zero(left_col);
+  av1_zero(above_data);
   RunTest(left_col, above_data, dst, ref_dst);
 }
+#endif  // !HAVE_NEON
 
 // -----------------------------------------------------------------------------
 // High Bit Depth Tests
-
 #define highbd_entry(type, width, height, opt, bd)                          \
   IntraPredFunc<HighbdIntraPred>(                                           \
       &aom_highbd_##type##_predictor_##width##x##height##_##opt,            \
       &aom_highbd_##type##_predictor_##width##x##height##_c, width, height, \
       bd)
 
+#if 0
 #define highbd_intrapred(type, opt, bd)                                       \
   highbd_entry(type, 4, 4, opt, bd), highbd_entry(type, 4, 8, opt, bd),       \
       highbd_entry(type, 8, 4, opt, bd), highbd_entry(type, 8, 8, opt, bd),   \
@@ -173,111 +184,10 @@ TEST_P(LowbdIntraPredTest, Bitexact) {
       highbd_entry(type, 16, 16, opt, bd),                                    \
       highbd_entry(type, 16, 32, opt, bd),                                    \
       highbd_entry(type, 32, 16, opt, bd), highbd_entry(type, 32, 32, opt, bd)
+#endif
 
-#if CONFIG_HIGHBITDEPTH
-#if HAVE_SSE2
-const IntraPredFunc<HighbdIntraPred> IntraPredTestVector8[] = {
-  highbd_intrapred(dc, sse2, 8),     highbd_intrapred(dc_left, sse2, 8),
-  highbd_intrapred(dc_top, sse2, 8), highbd_intrapred(dc_128, sse2, 8),
-  highbd_intrapred(h, sse2, 8),      highbd_intrapred(v, sse2, 8),
-  highbd_entry(d117, 4, 4, sse2, 8), highbd_entry(d135, 4, 4, sse2, 8),
-  highbd_entry(d153, 4, 4, sse2, 8), highbd_entry(d45e, 4, 4, sse2, 8),
-  highbd_entry(d45e, 4, 8, sse2, 8), highbd_entry(d45e, 8, 4, sse2, 8),
-  highbd_entry(d45e, 8, 8, sse2, 8), highbd_entry(d45e, 8, 16, sse2, 8),
-};
-
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, HighbdIntraPredTest,
-                        ::testing::ValuesIn(IntraPredTestVector8));
-
-const IntraPredFunc<HighbdIntraPred> IntraPredTestVector10[] = {
-  highbd_intrapred(dc, sse2, 10),     highbd_intrapred(dc_left, sse2, 10),
-  highbd_intrapred(dc_top, sse2, 10), highbd_intrapred(dc_128, sse2, 10),
-  highbd_intrapred(h, sse2, 10),      highbd_intrapred(v, sse2, 10),
-  highbd_entry(d117, 4, 4, sse2, 10), highbd_entry(d135, 4, 4, sse2, 10),
-  highbd_entry(d153, 4, 4, sse2, 10), highbd_entry(d45e, 4, 4, sse2, 10),
-  highbd_entry(d45e, 4, 8, sse2, 10), highbd_entry(d45e, 8, 4, sse2, 10),
-  highbd_entry(d45e, 8, 8, sse2, 10), highbd_entry(d45e, 8, 16, sse2, 10),
-};
-
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, HighbdIntraPredTest,
-                        ::testing::ValuesIn(IntraPredTestVector10));
-
-const IntraPredFunc<HighbdIntraPred> IntraPredTestVector12[] = {
-  highbd_intrapred(dc, sse2, 12),     highbd_intrapred(dc_left, sse2, 12),
-  highbd_intrapred(dc_top, sse2, 12), highbd_intrapred(dc_128, sse2, 12),
-  highbd_intrapred(h, sse2, 12),      highbd_intrapred(v, sse2, 12),
-  highbd_entry(d117, 4, 4, sse2, 12), highbd_entry(d135, 4, 4, sse2, 12),
-  highbd_entry(d153, 4, 4, sse2, 12), highbd_entry(d45e, 4, 4, sse2, 12),
-  highbd_entry(d45e, 4, 8, sse2, 12), highbd_entry(d45e, 8, 4, sse2, 12),
-  highbd_entry(d45e, 8, 8, sse2, 12), highbd_entry(d45e, 8, 16, sse2, 12),
-};
-
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, HighbdIntraPredTest,
-                        ::testing::ValuesIn(IntraPredTestVector12));
-
-#endif  // HAVE_SSE2
-
-#if HAVE_SSSE3
-const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorSsse3_8[] = {
-  highbd_entry(d117, 8, 8, ssse3, 8),   highbd_entry(d117, 16, 16, ssse3, 8),
-  highbd_entry(d117, 32, 32, ssse3, 8), highbd_entry(d135, 8, 8, ssse3, 8),
-  highbd_entry(d135, 16, 16, ssse3, 8), highbd_entry(d135, 32, 32, ssse3, 8),
-  highbd_entry(d153, 8, 8, ssse3, 8),   highbd_entry(d153, 16, 16, ssse3, 8),
-  highbd_entry(d153, 32, 32, ssse3, 8),
-};
-INSTANTIATE_TEST_CASE_P(SSSE3_TO_C_8, HighbdIntraPredTest,
-                        ::testing::ValuesIn(IntraPredTestVectorSsse3_8));
-
-const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorSsse3_10[] = {
-  highbd_entry(d117, 8, 8, ssse3, 10),   highbd_entry(d117, 16, 16, ssse3, 10),
-  highbd_entry(d117, 32, 32, ssse3, 10), highbd_entry(d135, 8, 8, ssse3, 10),
-  highbd_entry(d135, 16, 16, ssse3, 10), highbd_entry(d135, 32, 32, ssse3, 10),
-  highbd_entry(d153, 8, 8, ssse3, 10),   highbd_entry(d153, 16, 16, ssse3, 10),
-  highbd_entry(d153, 32, 32, ssse3, 10),
-};
-INSTANTIATE_TEST_CASE_P(SSSE3_TO_C_10, HighbdIntraPredTest,
-                        ::testing::ValuesIn(IntraPredTestVectorSsse3_10));
-
-const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorSsse3_12[] = {
-  highbd_entry(d117, 8, 8, ssse3, 12),   highbd_entry(d117, 16, 16, ssse3, 12),
-  highbd_entry(d117, 32, 32, ssse3, 12), highbd_entry(d135, 8, 8, ssse3, 12),
-  highbd_entry(d135, 16, 16, ssse3, 12), highbd_entry(d135, 32, 32, ssse3, 12),
-  highbd_entry(d153, 8, 8, ssse3, 12),   highbd_entry(d153, 16, 16, ssse3, 12),
-  highbd_entry(d153, 32, 32, ssse3, 12),
-};
-INSTANTIATE_TEST_CASE_P(SSSE3_TO_C_12, HighbdIntraPredTest,
-                        ::testing::ValuesIn(IntraPredTestVectorSsse3_12));
-#endif  // HAVE_SSSE3
-
-#if HAVE_AVX2
-const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorAvx2_8[] = {
-  highbd_entry(d45e, 16, 8, avx2, 8),  highbd_entry(d45e, 16, 16, avx2, 8),
-  highbd_entry(d45e, 16, 32, avx2, 8), highbd_entry(d45e, 32, 16, avx2, 8),
-  highbd_entry(d45e, 32, 32, avx2, 8),
-};
-INSTANTIATE_TEST_CASE_P(AVX2_TO_C_8, HighbdIntraPredTest,
-                        ::testing::ValuesIn(IntraPredTestVectorAvx2_8));
-
-const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorAvx2_10[] = {
-  highbd_entry(d45e, 16, 8, avx2, 10),  highbd_entry(d45e, 16, 16, avx2, 10),
-  highbd_entry(d45e, 16, 32, avx2, 10), highbd_entry(d45e, 32, 16, avx2, 10),
-  highbd_entry(d45e, 32, 32, avx2, 10),
-};
-INSTANTIATE_TEST_CASE_P(AVX2_TO_C_10, HighbdIntraPredTest,
-                        ::testing::ValuesIn(IntraPredTestVectorAvx2_10));
-
-const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorAvx2_12[] = {
-  highbd_entry(d45e, 16, 8, avx2, 12),  highbd_entry(d45e, 16, 16, avx2, 12),
-  highbd_entry(d45e, 16, 32, avx2, 12), highbd_entry(d45e, 32, 16, avx2, 12),
-  highbd_entry(d45e, 32, 32, avx2, 12),
-};
-INSTANTIATE_TEST_CASE_P(AVX2_TO_C_12, HighbdIntraPredTest,
-                        ::testing::ValuesIn(IntraPredTestVectorAvx2_12));
-#endif  // HAVE_AVX2
-#endif  // CONFIG_HIGHBITDEPTH
-
-// -----------------------------------------------------------------------------
-// Low Bit Depth Tests
+  // ---------------------------------------------------------------------------
+  // Low Bit Depth Tests
 
 #define lowbd_entry(type, width, height, opt)                                  \
   IntraPredFunc<IntraPred>(&aom_##type##_predictor_##width##x##height##_##opt, \
@@ -303,6 +213,17 @@ INSTANTIATE_TEST_CASE_P(SSE2, LowbdIntraPredTest,
 
 #endif  // HAVE_SSE2
 
+#if HAVE_SSSE3
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
+  lowbd_intrapred(paeth, ssse3),
+  lowbd_intrapred(smooth, ssse3),
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3, LowbdIntraPredTest,
+                        ::testing::ValuesIn(LowbdIntraPredTestVectorSsse3));
+
+#endif  // HAVE_SSSE3
+
 #if HAVE_AVX2
 const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorAvx2[] = {
   lowbd_entry(dc, 32, 32, avx2),      lowbd_entry(dc_top, 32, 32, avx2),
@@ -318,16 +239,17 @@ const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorAvx2[] = {
 INSTANTIATE_TEST_CASE_P(AVX2, LowbdIntraPredTest,
                         ::testing::ValuesIn(LowbdIntraPredTestVectorAvx2));
 
-#endif  // HAVE_SSE2
+#endif  // HAVE_AVX2
 
-#if HAVE_SSSE3
-const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
-  lowbd_intrapred(paeth, ssse3), lowbd_intrapred(smooth, ssse3),
+#if HAVE_NEON
+const IntraPredFunc<HighbdIntraPred> HighbdIntraPredTestVectorNeon[] = {
+  highbd_entry(dc, 4, 4, neon, 8),   highbd_entry(dc, 8, 8, neon, 8),
+  highbd_entry(dc, 16, 16, neon, 8), highbd_entry(dc, 32, 32, neon, 8),
+  highbd_entry(dc, 64, 64, neon, 8),
 };
 
-INSTANTIATE_TEST_CASE_P(SSSE3, LowbdIntraPredTest,
-                        ::testing::ValuesIn(LowbdIntraPredTestVectorSsse3));
-
-#endif  // HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(NEON, HighbdIntraPredTest,
+                        ::testing::ValuesIn(HighbdIntraPredTestVectorNeon));
 
+#endif  // HAVE_NEON
 }  // namespace
diff --git a/third_party/aom/test/invalid_file_test.cc b/third_party/aom/test/invalid_file_test.cc
new file mode 100644
index 000000000..869f3da66
--- /dev/null
+++ b/third_party/aom/test/invalid_file_test.cc
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/ivf_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+
+namespace {
+
+struct DecodeParam {
+  int threads;
+  const char *filename;
+};
+
+std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) {
+  return os << "threads: " << dp.threads << " file: " << dp.filename;
+}
+
+class InvalidFileTest : public ::libaom_test::DecoderTest,
+                        public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {}
+
+  virtual ~InvalidFileTest() {
+    if (res_file_ != NULL) fclose(res_file_);
+  }
+
+  void OpenResFile(const std::string &res_file_name) {
+    res_file_ = libaom_test::OpenTestDataFile(res_file_name);
+    ASSERT_TRUE(res_file_ != NULL)
+        << "Result file open failed. Filename: " << res_file_name;
+  }
+
+  virtual bool HandleDecodeResult(
+      const aom_codec_err_t res_dec,
+      const libaom_test::CompressedVideoSource &video,
+      libaom_test::Decoder *decoder) {
+    EXPECT_TRUE(res_file_ != NULL);
+    int expected_res_dec;
+
+    // Read integer result.
+    const int res = fscanf(res_file_, "%d", &expected_res_dec);
+    EXPECT_NE(res, EOF) << "Read result data failed";
+
+    // Check results match.
+    const DecodeParam input = GET_PARAM(1);
+    if (input.threads > 1) {
+      // The serial decode check is too strict for tile-threaded decoding as
+      // there is no guarantee on the decode order nor which specific error
+      // will take precedence. Currently a tile-level error is not forwarded so
+      // the frame will simply be marked corrupt.
+      EXPECT_TRUE(res_dec == expected_res_dec ||
+                  res_dec == AOM_CODEC_CORRUPT_FRAME)
+          << "Results don't match: frame number = " << video.frame_number()
+          << ". (" << decoder->DecodeError()
+          << "). Expected: " << expected_res_dec << " or "
+          << AOM_CODEC_CORRUPT_FRAME;
+    } else {
+      EXPECT_EQ(expected_res_dec, res_dec)
+          << "Results don't match: frame number = " << video.frame_number()
+          << ". (" << decoder->DecodeError() << ")";
+    }
+
+    return !HasFailure();
+  }
+
+  virtual void HandlePeekResult(libaom_test::Decoder *const /*decoder*/,
+                                libaom_test::CompressedVideoSource * /*video*/,
+                                const aom_codec_err_t /*res_peek*/) {}
+
+  void RunTest() {
+    const DecodeParam input = GET_PARAM(1);
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH, { 1 } };
+    cfg.threads = input.threads;
+    const std::string filename = input.filename;
+    libaom_test::IVFVideoSource decode_video(filename);
+    decode_video.Init();
+
+    // Construct result file name. The file holds a list of expected integer
+    // results, one for each decoded frame.  Any result that doesn't match
+    // the files list will cause a test failure.
+    const std::string res_filename = filename + ".res";
+    OpenResFile(res_filename);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&decode_video, cfg));
+  }
+
+ private:
+  FILE *res_file_;
+};
+
+TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
+
+const DecodeParam kAV1InvalidFileTests[] = {
+  { 1, "invalid-bug-1814.ivf" },
+};
+
+AV1_INSTANTIATE_TEST_CASE(InvalidFileTest,
+                          ::testing::ValuesIn(kAV1InvalidFileTests));
+
+}  // namespace
diff --git a/third_party/aom/test/ivf_video_source.h b/third_party/aom/test/ivf_video_source.h
index 956c145ac..4b2713537 100644
--- a/third_party/aom/test/ivf_video_source.h
+++ b/third_party/aom/test/ivf_video_source.h
@@ -10,14 +10,17 @@
  */
 #ifndef TEST_IVF_VIDEO_SOURCE_H_
 #define TEST_IVF_VIDEO_SOURCE_H_
+
 #include <cstdio>
 #include <cstdlib>
 #include <new>
 #include <string>
+
+#include "aom_ports/sanitizer.h"
 #include "test/video_source.h"
 
 namespace libaom_test {
-const unsigned int kCodeBufferSize = 256 * 1024;
+const unsigned int kCodeBufferSize = 256 * 1024 * 1024;
 const unsigned int kIvfFileHdrSize = 32;
 const unsigned int kIvfFrameHdrSize = 12;
 
@@ -41,9 +44,10 @@ class IVFVideoSource : public CompressedVideoSource {
 
   virtual void Init() {
     // Allocate a buffer for read in the compressed video frame.
-    compressed_frame_buf_ = new uint8_t[libaom_test::kCodeBufferSize];
+    compressed_frame_buf_ = new uint8_t[kCodeBufferSize];
     ASSERT_TRUE(compressed_frame_buf_ != NULL)
         << "Allocate frame buffer failed";
+    ASAN_POISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
   }
 
   virtual void Begin() {
@@ -81,9 +85,12 @@ class IVFVideoSource : public CompressedVideoSource {
       frame_sz_ = MemGetLe32(frame_hdr);
       ASSERT_LE(frame_sz_, kCodeBufferSize)
           << "Frame is too big for allocated code buffer";
+      ASAN_UNPOISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
       ASSERT_EQ(frame_sz_,
                 fread(compressed_frame_buf_, 1, frame_sz_, input_file_))
           << "Failed to read complete frame";
+      ASAN_POISON_MEMORY_REGION(compressed_frame_buf_ + frame_sz_,
+                                kCodeBufferSize - frame_sz_);
     }
   }
 
diff --git a/third_party/aom/test/level_test.cc b/third_party/aom/test/level_test.cc
deleted file mode 100644
index 12f391817..000000000
--- a/third_party/aom/test/level_test.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/i420_video_source.h"
-#include "test/util.h"
-
-namespace {
-class LevelTest
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
-      public ::libaom_test::EncoderTest {
- protected:
-  LevelTest()
-      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
-        cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0),
-        level_(0) {}
-  virtual ~LevelTest() {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(encoding_mode_);
-    if (encoding_mode_ != ::libaom_test::kRealTime) {
-      cfg_.g_lag_in_frames = 25;
-      cfg_.rc_end_usage = AOM_VBR;
-    } else {
-      cfg_.g_lag_in_frames = 0;
-      cfg_.rc_end_usage = AOM_CBR;
-    }
-    cfg_.rc_2pass_vbr_minsection_pct = 5;
-    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
-    cfg_.rc_target_bitrate = 400;
-    cfg_.rc_max_quantizer = 63;
-    cfg_.rc_min_quantizer = 0;
-  }
-
-  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder *encoder) {
-    if (video->frame() == 0) {
-      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
-      encoder->Control(AV1E_SET_TARGET_LEVEL, target_level_);
-      encoder->Control(AV1E_SET_MIN_GF_INTERVAL, min_gf_internal_);
-      if (encoding_mode_ != ::libaom_test::kRealTime) {
-        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
-        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
-        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
-      }
-    }
-    encoder->Control(AV1E_GET_LEVEL, &level_);
-    ASSERT_LE(level_, 51);
-    ASSERT_GE(level_, 0);
-  }
-
-  ::libaom_test::TestMode encoding_mode_;
-  int cpu_used_;
-  int min_gf_internal_;
-  int target_level_;
-  int level_;
-};
-
-// Test for keeping level stats only
-TEST_P(LevelTest, TestTargetLevel0) {
-  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
-                                       40);
-  target_level_ = 0;
-  min_gf_internal_ = 4;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_EQ(11, level_);
-
-  cfg_.rc_target_bitrate = 1600;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_EQ(20, level_);
-}
-
-// Test for level control being turned off
-TEST_P(LevelTest, TestTargetLevel255) {
-  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
-                                       30);
-  target_level_ = 255;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-}
-
-TEST_P(LevelTest, TestTargetLevelApi) {
-  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1);
-  static const aom_codec_iface_t *codec = &aom_codec_av1_cx_algo;
-  aom_codec_ctx_t enc;
-  aom_codec_enc_cfg_t cfg;
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(codec, &cfg, 0));
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, codec, &cfg, 0));
-  for (int level = 0; level <= 256; ++level) {
-    if (level == 10 || level == 11 || level == 20 || level == 21 ||
-        level == 30 || level == 31 || level == 40 || level == 41 ||
-        level == 50 || level == 51 || level == 52 || level == 60 ||
-        level == 61 || level == 62 || level == 0 || level == 255)
-      EXPECT_EQ(AOM_CODEC_OK,
-                aom_codec_control(&enc, AV1E_SET_TARGET_LEVEL, level));
-    else
-      EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-                aom_codec_control(&enc, AV1E_SET_TARGET_LEVEL, level));
-  }
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
-}
-
-AV1_INSTANTIATE_TEST_CASE(LevelTest,
-                          ::testing::Values(::libaom_test::kTwoPassGood,
-                                            ::libaom_test::kOnePassGood),
-                          ::testing::Range(0, 9));
-}  // namespace
diff --git a/third_party/aom/test/lossless_test.cc b/third_party/aom/test/lossless_test.cc
index f4978fe21..3f8e89c81 100644
--- a/third_party/aom/test/lossless_test.cc
+++ b/third_party/aom/test/lossless_test.cc
@@ -7,11 +7,12 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
diff --git a/third_party/aom/test/lpf_8_test.cc b/third_party/aom/test/lpf_8_test.cc
deleted file mode 100644
index 4859a8ee7..000000000
--- a/third_party/aom/test/lpf_8_test.cc
+++ /dev/null
@@ -1,775 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <cmath>
-#include <cstdlib>
-#include <string>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "av1/common/av1_loopfilter.h"
-#include "av1/common/entropy.h"
-#include "aom/aom_integer.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-// Horizontally and Vertically need 32x32: 8  Coeffs preceeding filtered section
-//                                         16 Coefs within filtered section
-//                                         8  Coeffs following filtered section
-const int kNumCoeffs = 1024;
-
-const int number_of_iterations = 10000;
-
-const int kSpeedTestNum = 500000;
-
-#if CONFIG_HIGHBITDEPTH
-typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh, int bd);
-typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1, int bd);
-#else
-typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh);
-typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1);
-#endif  // CONFIG_HIGHBITDEPTH
-
-typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
-typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
-
-class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
- public:
-  virtual ~Loop8Test6Param() {}
-  virtual void SetUp() {
-    loopfilter_op_ = GET_PARAM(0);
-    ref_loopfilter_op_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  int bit_depth_;
-  int mask_;
-  loop_op_t loopfilter_op_;
-  loop_op_t ref_loopfilter_op_;
-};
-
-class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
- public:
-  virtual ~Loop8Test9Param() {}
-  virtual void SetUp() {
-    loopfilter_op_ = GET_PARAM(0);
-    ref_loopfilter_op_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  int bit_depth_;
-  int mask_;
-  dual_loop_op_t loopfilter_op_;
-  dual_loop_op_t ref_loopfilter_op_;
-};
-
-TEST_P(Loop8Test6Param, OperationCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = number_of_iterations;
-#if CONFIG_HIGHBITDEPTH
-  int32_t bd = bit_depth_;
-  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
-  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
-#else
-  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
-  DECLARE_ALIGNED(8, uint8_t, ref_s[kNumCoeffs]);
-#endif  // CONFIG_HIGHBITDEPTH
-  int err_count_total = 0;
-  int first_failure = -1;
-  for (int i = 0; i < count_test_block; ++i) {
-    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
-    DECLARE_ALIGNED(16, const uint8_t,
-                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    int32_t p = kNumCoeffs / 32;
-
-    uint16_t tmp_s[kNumCoeffs];
-    int j = 0;
-    while (j < kNumCoeffs) {
-      uint8_t val = rnd.Rand8();
-      if (val & 0x80) {  // 50% chance to choose a new value.
-        tmp_s[j] = rnd.Rand16();
-        j++;
-      } else {  // 50% chance to repeat previous value in row X times
-        int k = 0;
-        while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
-          if (j < 1) {
-            tmp_s[j] = rnd.Rand16();
-          } else if (val & 0x20) {  // Increment by an value within the limit
-            tmp_s[j] = (tmp_s[j - 1] + (*limit - 1));
-          } else {  // Decrement by an value within the limit
-            tmp_s[j] = (tmp_s[j - 1] - (*limit - 1));
-          }
-          j++;
-        }
-      }
-    }
-    for (j = 0; j < kNumCoeffs; j++) {
-      if (i % 2) {
-        s[j] = tmp_s[j] & mask_;
-      } else {
-        s[j] = tmp_s[p * (j % p) + j / p] & mask_;
-      }
-      ref_s[j] = s[j];
-    }
-#if CONFIG_HIGHBITDEPTH
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd);
-    ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd));
-#else
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh);
-    ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh));
-#endif  // CONFIG_HIGHBITDEPTH
-
-    for (j = 0; j < kNumCoeffs; ++j) {
-      err_count += ref_s[j] != s[j];
-    }
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
-  }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Loop8Test6Param, C output doesn't match SSE2 "
-         "loopfilter output. "
-      << "First failed at test case " << first_failure;
-}
-
-TEST_P(Loop8Test6Param, ValueCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = number_of_iterations;
-#if CONFIG_HIGHBITDEPTH
-  const int32_t bd = bit_depth_;
-  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
-  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
-#else
-  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
-  DECLARE_ALIGNED(8, uint8_t, ref_s[kNumCoeffs]);
-#endif  // CONFIG_HIGHBITDEPTH
-  int err_count_total = 0;
-  int first_failure = -1;
-
-  // NOTE: The code in av1_loopfilter.c:update_sharpness computes mblim as a
-  // function of sharpness_lvl and the loopfilter lvl as:
-  // block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
-  // ...
-  // memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
-  //        SIMD_WIDTH);
-  // This means that the largest value for mblim will occur when sharpness_lvl
-  // is equal to 0, and lvl is equal to its greatest value (MAX_LOOP_FILTER).
-  // In this case block_inside_limit will be equal to MAX_LOOP_FILTER and
-  // therefore mblim will be equal to (2 * (lvl + 2) + block_inside_limit) =
-  // 2 * (MAX_LOOP_FILTER + 2) + MAX_LOOP_FILTER = 3 * MAX_LOOP_FILTER + 4
-
-  for (int i = 0; i < count_test_block; ++i) {
-    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
-    DECLARE_ALIGNED(16, const uint8_t,
-                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    int32_t p = kNumCoeffs / 32;
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      s[j] = rnd.Rand16() & mask_;
-      ref_s[j] = s[j];
-    }
-#if CONFIG_HIGHBITDEPTH
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd);
-    ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd));
-#else
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh);
-    ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh));
-#endif  // CONFIG_HIGHBITDEPTH
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      err_count += ref_s[j] != s[j];
-    }
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
-  }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Loop8Test6Param, C output doesn't match SSE2 "
-         "loopfilter output. "
-      << "First failed at test case " << first_failure;
-}
-
-TEST_P(Loop8Test6Param, DISABLED_Speed) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = kSpeedTestNum;
-#if CONFIG_HIGHBITDEPTH
-  const int32_t bd = bit_depth_;
-  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
-#else
-  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
-#endif  // CONFIG_HIGHBITDEPTH
-
-  uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
-  DECLARE_ALIGNED(16, const uint8_t,
-                  blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-  tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
-  DECLARE_ALIGNED(16, const uint8_t,
-                  limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                 tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-  tmp = rnd.Rand8();
-  DECLARE_ALIGNED(16, const uint8_t,
-                  thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-
-  int32_t p = kNumCoeffs / 32;
-  for (int j = 0; j < kNumCoeffs; ++j) {
-    s[j] = rnd.Rand16() & mask_;
-  }
-
-  for (int i = 0; i < count_test_block; ++i) {
-#if CONFIG_HIGHBITDEPTH
-    loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd);
-#else
-    loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh);
-#endif  // CONFIG_HIGHBITDEPTH
-  }
-}
-
-TEST_P(Loop8Test9Param, OperationCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = number_of_iterations;
-#if CONFIG_HIGHBITDEPTH
-  const int32_t bd = bit_depth_;
-  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
-  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
-#else
-  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
-  DECLARE_ALIGNED(8, uint8_t, ref_s[kNumCoeffs]);
-#endif  // CONFIG_HIGHBITDEPTH
-  int err_count_total = 0;
-  int first_failure = -1;
-  for (int i = 0; i < count_test_block; ++i) {
-    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
-    DECLARE_ALIGNED(16, const uint8_t,
-                    thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
-    DECLARE_ALIGNED(16, const uint8_t,
-                    thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    int32_t p = kNumCoeffs / 32;
-    uint16_t tmp_s[kNumCoeffs];
-    int j = 0;
-    const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1;
-    while (j < kNumCoeffs) {
-      uint8_t val = rnd.Rand8();
-      if (val & 0x80) {  // 50% chance to choose a new value.
-        tmp_s[j] = rnd.Rand16();
-        j++;
-      } else {  // 50% chance to repeat previous value in row X times.
-        int k = 0;
-        while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
-          if (j < 1) {
-            tmp_s[j] = rnd.Rand16();
-          } else if (val & 0x20) {  // Increment by a value within the limit.
-            tmp_s[j] = (tmp_s[j - 1] + (limit - 1));
-          } else {  // Decrement by an value within the limit.
-            tmp_s[j] = (tmp_s[j - 1] - (limit - 1));
-          }
-          j++;
-        }
-      }
-    }
-    for (j = 0; j < kNumCoeffs; j++) {
-      if (i % 2) {
-        s[j] = tmp_s[j] & mask_;
-      } else {
-        s[j] = tmp_s[p * (j % p) + j / p] & mask_;
-      }
-      ref_s[j] = s[j];
-    }
-#if CONFIG_HIGHBITDEPTH
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,
-                       limit1, thresh1, bd);
-    ASM_REGISTER_STATE_CHECK(loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0,
-                                            thresh0, blimit1, limit1, thresh1,
-                                            bd));
-#else
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,
-                       limit1, thresh1);
-    ASM_REGISTER_STATE_CHECK(loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0,
-                                            thresh0, blimit1, limit1, thresh1));
-#endif  // CONFIG_HIGHBITDEPTH
-    for (j = 0; j < kNumCoeffs; ++j) {
-      err_count += ref_s[j] != s[j];
-    }
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
-  }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Loop8Test9Param, C output doesn't match SSE2 "
-         "loopfilter output. "
-      << "First failed at test case " << first_failure;
-}
-
-TEST_P(Loop8Test9Param, ValueCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = number_of_iterations;
-#if CONFIG_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
-  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
-#else
-  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
-  DECLARE_ALIGNED(8, uint8_t, ref_s[kNumCoeffs]);
-#endif  // CONFIG_HIGHBITDEPTH
-  int err_count_total = 0;
-  int first_failure = -1;
-  for (int i = 0; i < count_test_block; ++i) {
-    int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
-    DECLARE_ALIGNED(16, const uint8_t,
-                    thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
-    DECLARE_ALIGNED(16, const uint8_t,
-                    limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
-    DECLARE_ALIGNED(16, const uint8_t,
-                    thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    int32_t p = kNumCoeffs / 32;  // TODO(pdlf) can we have non-square here?
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      s[j] = rnd.Rand16() & mask_;
-      ref_s[j] = s[j];
-    }
-#if CONFIG_HIGHBITDEPTH
-    const int32_t bd = bit_depth_;
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,
-                       limit1, thresh1, bd);
-    ASM_REGISTER_STATE_CHECK(loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0,
-                                            thresh0, blimit1, limit1, thresh1,
-                                            bd));
-#else
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,
-                       limit1, thresh1);
-    ASM_REGISTER_STATE_CHECK(loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0,
-                                            thresh0, blimit1, limit1, thresh1));
-#endif  // CONFIG_HIGHBITDEPTH
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      err_count += ref_s[j] != s[j];
-    }
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
-  }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Loop8Test9Param, C output doesn't match SSE2"
-         "loopfilter output. "
-      << "First failed at test case " << first_failure;
-}
-
-TEST_P(Loop8Test9Param, DISABLED_Speed) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = kSpeedTestNum;
-#if CONFIG_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
-#else
-  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
-#endif  // CONFIG_HIGHBITDEPTH
-
-  uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
-  DECLARE_ALIGNED(16, const uint8_t,
-                  blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-  tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
-  DECLARE_ALIGNED(16, const uint8_t,
-                  limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-  tmp = rnd.Rand8();
-  DECLARE_ALIGNED(16, const uint8_t,
-                  thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-  tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
-  DECLARE_ALIGNED(16, const uint8_t,
-                  blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-  tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
-  DECLARE_ALIGNED(16, const uint8_t,
-                  limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-  tmp = rnd.Rand8();
-  DECLARE_ALIGNED(16, const uint8_t,
-                  thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-  int32_t p = kNumCoeffs / 32;  // TODO(pdlf) can we have non-square here?
-  for (int j = 0; j < kNumCoeffs; ++j) {
-    s[j] = rnd.Rand16() & mask_;
-  }
-
-  for (int i = 0; i < count_test_block; ++i) {
-#if CONFIG_HIGHBITDEPTH
-    const int32_t bd = bit_depth_;
-    loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, limit1,
-                   thresh1, bd);
-#else
-    loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, limit1,
-                   thresh1);
-#endif  // CONFIG_HIGHBITDEPTH
-  }
-}
-
-using std::tr1::make_tuple;
-
-#if HAVE_SSE2
-#if CONFIG_HIGHBITDEPTH
-
-const loop8_param_t kHbdLoop8Test6[] = {
-  make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
-             8),
-  make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
-             8),
-  make_tuple(&aom_highbd_lpf_horizontal_edge_8_sse2,
-             &aom_highbd_lpf_horizontal_edge_8_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_edge_16_sse2,
-             &aom_highbd_lpf_horizontal_edge_16_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
-             8),
-  make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
-             10),
-  make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
-             10),
-  make_tuple(&aom_highbd_lpf_horizontal_edge_8_sse2,
-             &aom_highbd_lpf_horizontal_edge_8_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_edge_16_sse2,
-             &aom_highbd_lpf_horizontal_edge_16_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
-             10),
-  make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
-             12),
-  make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 12),
-  make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
-             12),
-  make_tuple(&aom_highbd_lpf_horizontal_edge_8_sse2,
-             &aom_highbd_lpf_horizontal_edge_8_c, 12),
-  make_tuple(&aom_highbd_lpf_horizontal_edge_16_sse2,
-             &aom_highbd_lpf_horizontal_edge_16_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
-             12),
-  make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
-             &aom_highbd_lpf_vertical_16_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
-             &aom_highbd_lpf_vertical_16_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
-             &aom_highbd_lpf_vertical_16_dual_c, 12)
-};
-
-INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param,
-                        ::testing::ValuesIn(kHbdLoop8Test6));
-#else
-const loop8_param_t kLoop8Test6[] = {
-  make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8),
-  make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8),
-  make_tuple(&aom_lpf_horizontal_edge_8_sse2, &aom_lpf_horizontal_edge_8_c, 8),
-  make_tuple(&aom_lpf_horizontal_edge_16_sse2, &aom_lpf_horizontal_edge_16_c,
-             8),
-  make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
-  make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
-  make_tuple(&aom_lpf_vertical_16_sse2, &aom_lpf_vertical_16_c, 8),
-#if !CONFIG_PARALLEL_DEBLOCKING
-  make_tuple(&aom_lpf_vertical_16_dual_sse2, &aom_lpf_vertical_16_dual_c, 8)
-#endif
-};
-
-INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param,
-                        ::testing::ValuesIn(kLoop8Test6));
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // HAVE_SSE2
-
-#if HAVE_AVX2
-#if CONFIG_HIGHBITDEPTH
-
-const loop8_param_t kHbdLoop8Test6Avx2[] = {
-  make_tuple(&aom_highbd_lpf_horizontal_edge_16_avx2,
-             &aom_highbd_lpf_horizontal_edge_16_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_edge_16_avx2,
-             &aom_highbd_lpf_horizontal_edge_16_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_edge_16_avx2,
-             &aom_highbd_lpf_horizontal_edge_16_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_16_dual_avx2,
-             &aom_highbd_lpf_vertical_16_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_16_dual_avx2,
-             &aom_highbd_lpf_vertical_16_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_16_dual_avx2,
-             &aom_highbd_lpf_vertical_16_dual_c, 12)
-};
-
-INSTANTIATE_TEST_CASE_P(AVX2, Loop8Test6Param,
-                        ::testing::ValuesIn(kHbdLoop8Test6Avx2));
-
-#endif
-#endif
-
-#if HAVE_AVX2 && (!CONFIG_HIGHBITDEPTH) && (!CONFIG_PARALLEL_DEBLOCKING)
-INSTANTIATE_TEST_CASE_P(
-    AVX2, Loop8Test6Param,
-    ::testing::Values(make_tuple(&aom_lpf_horizontal_edge_8_avx2,
-                                 &aom_lpf_horizontal_edge_8_c, 8),
-                      make_tuple(&aom_lpf_horizontal_edge_16_avx2,
-                                 &aom_lpf_horizontal_edge_16_c, 8)));
-#endif
-
-#if HAVE_SSE2
-#if CONFIG_HIGHBITDEPTH
-const dualloop8_param_t kHbdLoop8Test9[] = {
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
-             &aom_highbd_lpf_vertical_4_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
-             &aom_highbd_lpf_vertical_8_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
-             &aom_highbd_lpf_vertical_4_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
-             &aom_highbd_lpf_vertical_8_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
-             &aom_highbd_lpf_vertical_4_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
-             &aom_highbd_lpf_vertical_8_dual_c, 12)
-};
-
-INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param,
-                        ::testing::ValuesIn(kHbdLoop8Test9));
-#else
-#if !CONFIG_PARALLEL_DEBLOCKING
-const dualloop8_param_t kLoop8Test9[] = {
-  make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8),
-  make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8),
-  make_tuple(&aom_lpf_vertical_4_dual_sse2, &aom_lpf_vertical_4_dual_c, 8),
-  make_tuple(&aom_lpf_vertical_8_dual_sse2, &aom_lpf_vertical_8_dual_c, 8)
-};
-
-INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param,
-                        ::testing::ValuesIn(kLoop8Test9));
-#endif
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // HAVE_SSE2
-
-#if HAVE_AVX2
-#if CONFIG_HIGHBITDEPTH
-const dualloop8_param_t kHbdLoop8Test9Avx2[] = {
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
-             &aom_highbd_lpf_vertical_4_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
-             &aom_highbd_lpf_vertical_4_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
-             &aom_highbd_lpf_vertical_4_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
-             &aom_highbd_lpf_vertical_8_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
-             &aom_highbd_lpf_vertical_8_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
-             &aom_highbd_lpf_vertical_8_dual_c, 12),
-};
-
-INSTANTIATE_TEST_CASE_P(AVX2, Loop8Test9Param,
-                        ::testing::ValuesIn(kHbdLoop8Test9Avx2));
-#endif
-#endif
-
-#if HAVE_NEON && (!CONFIG_PARALLEL_DEBLOCKING)
-#if CONFIG_HIGHBITDEPTH
-// No neon high bitdepth functions.
-#else
-INSTANTIATE_TEST_CASE_P(
-    NEON, Loop8Test6Param,
-    ::testing::Values(
-#if HAVE_NEON_ASM
-        // Using #if inside the macro is unsupported on MSVS but the tests are
-        // not
-        // currently built for MSVS with ARM and NEON.
-        make_tuple(&aom_lpf_horizontal_edge_8_neon,
-                   &aom_lpf_horizontal_edge_8_c, 8),
-        make_tuple(&aom_lpf_horizontal_edge_16_neon,
-                   &aom_lpf_horizontal_edge_16_c, 8),
-        make_tuple(&aom_lpf_vertical_16_neon, &aom_lpf_vertical_16_c, 8),
-        make_tuple(&aom_lpf_vertical_16_dual_neon, &aom_lpf_vertical_16_dual_c,
-                   8),
-#endif  // HAVE_NEON_ASM
-        make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8),
-        make_tuple(&aom_lpf_vertical_8_neon, &aom_lpf_vertical_8_c, 8),
-        make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8),
-        make_tuple(&aom_lpf_vertical_4_neon, &aom_lpf_vertical_4_c, 8)));
-INSTANTIATE_TEST_CASE_P(NEON, Loop8Test9Param,
-                        ::testing::Values(
-#if HAVE_NEON_ASM
-                            make_tuple(&aom_lpf_horizontal_8_dual_neon,
-                                       &aom_lpf_horizontal_8_dual_c, 8),
-                            make_tuple(&aom_lpf_vertical_8_dual_neon,
-                                       &aom_lpf_vertical_8_dual_c, 8),
-#endif  // HAVE_NEON_ASM
-                            make_tuple(&aom_lpf_horizontal_4_dual_neon,
-                                       &aom_lpf_horizontal_4_dual_c, 8),
-                            make_tuple(&aom_lpf_vertical_4_dual_neon,
-                                       &aom_lpf_vertical_4_dual_c, 8)));
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // HAVE_NEON && (!CONFIG_PARALLEL_DEBLOCKING)
-
-#if HAVE_DSPR2 && !CONFIG_HIGHBITDEPTH && (!CONFIG_PARALLEL_DEBLOCKING)
-INSTANTIATE_TEST_CASE_P(
-    DSPR2, Loop8Test6Param,
-    ::testing::Values(
-        make_tuple(&aom_lpf_horizontal_4_dspr2, &aom_lpf_horizontal_4_c, 8),
-        make_tuple(&aom_lpf_horizontal_8_dspr2, &aom_lpf_horizontal_8_c, 8),
-        make_tuple(&aom_lpf_horizontal_edge_8, &aom_lpf_horizontal_edge_8, 8),
-        make_tuple(&aom_lpf_horizontal_edge_16, &aom_lpf_horizontal_edge_16, 8),
-        make_tuple(&aom_lpf_vertical_4_dspr2, &aom_lpf_vertical_4_c, 8),
-        make_tuple(&aom_lpf_vertical_8_dspr2, &aom_lpf_vertical_8_c, 8),
-        make_tuple(&aom_lpf_vertical_16_dspr2, &aom_lpf_vertical_16_c, 8),
-        make_tuple(&aom_lpf_vertical_16_dual_dspr2, &aom_lpf_vertical_16_dual_c,
-                   8)));
-
-INSTANTIATE_TEST_CASE_P(
-    DSPR2, Loop8Test9Param,
-    ::testing::Values(make_tuple(&aom_lpf_horizontal_4_dual_dspr2,
-                                 &aom_lpf_horizontal_4_dual_c, 8),
-                      make_tuple(&aom_lpf_horizontal_8_dual_dspr2,
-                                 &aom_lpf_horizontal_8_dual_c, 8),
-                      make_tuple(&aom_lpf_vertical_4_dual_dspr2,
-                                 &aom_lpf_vertical_4_dual_c, 8),
-                      make_tuple(&aom_lpf_vertical_8_dual_dspr2,
-                                 &aom_lpf_vertical_8_dual_c, 8)));
-#endif  // HAVE_DSPR2 && !CONFIG_HIGHBITDEPTH && (!CONFIG_PARALLEL_DEBLOCKING)
-
-#if HAVE_MSA && (!CONFIG_HIGHBITDEPTH) && (!CONFIG_PARALLEL_DEBLOCKING)
-INSTANTIATE_TEST_CASE_P(
-    MSA, Loop8Test6Param,
-    ::testing::Values(
-        make_tuple(&aom_lpf_horizontal_4_msa, &aom_lpf_horizontal_4_c, 8),
-        make_tuple(&aom_lpf_horizontal_8_msa, &aom_lpf_horizontal_8_c, 8),
-        make_tuple(&aom_lpf_horizontal_edge_8_msa, &aom_lpf_horizontal_edge_8_c,
-                   8),
-        make_tuple(&aom_lpf_horizontal_edge_16_msa,
-                   &aom_lpf_horizontal_edge_16_c, 8),
-        make_tuple(&aom_lpf_vertical_4_msa, &aom_lpf_vertical_4_c, 8),
-        make_tuple(&aom_lpf_vertical_8_msa, &aom_lpf_vertical_8_c, 8),
-        make_tuple(&aom_lpf_vertical_16_msa, &aom_lpf_vertical_16_c, 8)));
-
-INSTANTIATE_TEST_CASE_P(
-    MSA, Loop8Test9Param,
-    ::testing::Values(make_tuple(&aom_lpf_horizontal_4_dual_msa,
-                                 &aom_lpf_horizontal_4_dual_c, 8),
-                      make_tuple(&aom_lpf_horizontal_8_dual_msa,
-                                 &aom_lpf_horizontal_8_dual_c, 8),
-                      make_tuple(&aom_lpf_vertical_4_dual_msa,
-                                 &aom_lpf_vertical_4_dual_c, 8),
-                      make_tuple(&aom_lpf_vertical_8_dual_msa,
-                                 &aom_lpf_vertical_8_dual_c, 8)));
-#endif  // HAVE_MSA && (!CONFIG_HIGHBITDEPTH) && (!CONFIG_PARALLEL_DEBLOCKING)
-
-}  // namespace
diff --git a/third_party/aom/test/lpf_test.cc b/third_party/aom/test/lpf_test.cc
new file mode 100644
index 000000000..1e2862ac8
--- /dev/null
+++ b/third_party/aom/test/lpf_test.cc
@@ -0,0 +1,623 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+// Horizontally and Vertically need 32x32: 8  Coeffs preceeding filtered section
+//                                         16 Coefs within filtered section
+//                                         8  Coeffs following filtered section
+const int kNumCoeffs = 1024;
+
+const int number_of_iterations = 10000;
+
+const int kSpeedTestNum = 500000;
+
+#define LOOP_PARAM \
+  int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh
+#define DUAL_LOOP_PARAM                                                      \
+  int p, const uint8_t *blimit0, const uint8_t *limit0,                      \
+      const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, \
+      const uint8_t *thresh1
+
+typedef void (*loop_op_t)(uint8_t *s, LOOP_PARAM);
+typedef void (*dual_loop_op_t)(uint8_t *s, DUAL_LOOP_PARAM);
+typedef void (*hbdloop_op_t)(uint16_t *s, LOOP_PARAM, int bd);
+typedef void (*hbddual_loop_op_t)(uint16_t *s, DUAL_LOOP_PARAM, int bd);
+
+typedef ::testing::tuple<hbdloop_op_t, hbdloop_op_t, int> hbdloop_param_t;
+typedef ::testing::tuple<hbddual_loop_op_t, hbddual_loop_op_t, int>
+    hbddual_loop_param_t;
+typedef ::testing::tuple<loop_op_t, loop_op_t, int> loop_param_t;
+typedef ::testing::tuple<dual_loop_op_t, dual_loop_op_t, int> dual_loop_param_t;
+
+template <typename Pixel_t, int PIXEL_WIDTH_t>
+void InitInput(Pixel_t *s, Pixel_t *ref_s, ACMRandom *rnd, const uint8_t limit,
+               const int mask, const int32_t p, const int i) {
+  uint16_t tmp_s[kNumCoeffs];
+
+  for (int j = 0; j < kNumCoeffs;) {
+    const uint8_t val = rnd->Rand8();
+    if (val & 0x80) {  // 50% chance to choose a new value.
+      tmp_s[j] = rnd->Rand16();
+      j++;
+    } else {  // 50% chance to repeat previous value in row X times.
+      int k = 0;
+      while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+        if (j < 1) {
+          tmp_s[j] = rnd->Rand16();
+        } else if (val & 0x20) {  // Increment by a value within the limit.
+          tmp_s[j] = tmp_s[j - 1] + (limit - 1);
+        } else {  // Decrement by a value within the limit.
+          tmp_s[j] = tmp_s[j - 1] - (limit - 1);
+        }
+        j++;
+      }
+    }
+  }
+
+  for (int j = 0; j < kNumCoeffs;) {
+    const uint8_t val = rnd->Rand8();
+    if (val & 0x80) {
+      j++;
+    } else {  // 50% chance to repeat previous value in column X times.
+      int k = 0;
+      while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+        if (j < 1) {
+          tmp_s[j] = rnd->Rand16();
+        } else if (val & 0x20) {  // Increment by a value within the limit.
+          tmp_s[(j % 32) * 32 + j / 32] =
+              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1);
+        } else {  // Decrement by a value within the limit.
+          tmp_s[(j % 32) * 32 + j / 32] =
+              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1);
+        }
+        j++;
+      }
+    }
+  }
+
+  for (int j = 0; j < kNumCoeffs; j++) {
+    if (i % 2) {
+      s[j] = tmp_s[j] & mask;
+    } else {
+      s[j] = tmp_s[p * (j % p) + j / p] & mask;
+    }
+    ref_s[j] = s[j];
+  }
+}
+
+uint8_t GetOuterThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->PseudoUniform(3 * MAX_LOOP_FILTER + 5));
+}
+
+uint8_t GetInnerThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1));
+}
+
+uint8_t GetHevThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1) >> 4);
+}
+
+template <typename func_type_t, typename params_t>
+class LoopTestParam : public ::testing::TestWithParam<params_t> {
+ public:
+  virtual ~LoopTestParam() {}
+  virtual void SetUp() {
+    loopfilter_op_ = ::testing::get<0>(this->GetParam());
+    ref_loopfilter_op_ = ::testing::get<1>(this->GetParam());
+    bit_depth_ = ::testing::get<2>(this->GetParam());
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  int bit_depth_;
+  int mask_;
+  func_type_t loopfilter_op_;
+  func_type_t ref_loopfilter_op_;
+};
+
+void call_filter(uint16_t *s, LOOP_PARAM, int bd, hbdloop_op_t op) {
+  op(s, p, blimit, limit, thresh, bd);
+}
+void call_filter(uint8_t *s, LOOP_PARAM, int bd, loop_op_t op) {
+  (void)bd;
+  op(s, p, blimit, limit, thresh);
+}
+void call_dualfilter(uint16_t *s, DUAL_LOOP_PARAM, int bd,
+                     hbddual_loop_op_t op) {
+  op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd);
+}
+void call_dualfilter(uint8_t *s, DUAL_LOOP_PARAM, int bd, dual_loop_op_t op) {
+  (void)bd;
+  op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
+};
+
+typedef LoopTestParam<hbdloop_op_t, hbdloop_param_t> Loop8Test6Param_hbd;
+typedef LoopTestParam<loop_op_t, loop_param_t> Loop8Test6Param_lbd;
+typedef LoopTestParam<hbddual_loop_op_t, hbddual_loop_param_t>
+    Loop8Test9Param_hbd;
+typedef LoopTestParam<dual_loop_op_t, dual_loop_param_t> Loop8Test9Param_lbd;
+
+#define OPCHECK(a, b)                                                          \
+  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
+  const int count_test_block = number_of_iterations;                           \
+  const int32_t p = kNumCoeffs / 32;                                           \
+  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
+  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
+  int err_count_total = 0;                                                     \
+  int first_failure = -1;                                                      \
+  for (int i = 0; i < count_test_block; ++i) {                                 \
+    int err_count = 0;                                                         \
+    uint8_t tmp = GetOuterThresh(&rnd);                                        \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
+    tmp = GetInnerThresh(&rnd);                                                \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
+    tmp = GetHevThresh(&rnd);                                                  \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
+    InitInput<a, b>(s, ref_s, &rnd, *limit, mask_, p, i);                      \
+    call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_,       \
+                ref_loopfilter_op_);                                           \
+    ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,      \
+                                         thresh, bit_depth_, loopfilter_op_)); \
+    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
+      err_count += ref_s[j] != s[j];                                           \
+    }                                                                          \
+    if (err_count && !err_count_total) {                                       \
+      first_failure = i;                                                       \
+    }                                                                          \
+    err_count_total += err_count;                                              \
+  }                                                                            \
+  EXPECT_EQ(0, err_count_total)                                                \
+      << "Error: Loop8Test6Param, C output doesn't match SIMD "                \
+         "loopfilter output. "                                                 \
+      << "First failed at test case " << first_failure;
+
+TEST_P(Loop8Test6Param_hbd, OperationCheck) { OPCHECK(uint16_t, 16); }
+TEST_P(Loop8Test6Param_lbd, OperationCheck) { OPCHECK(uint8_t, 8); }
+
+#define VALCHECK(a, b)                                                         \
+  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
+  const int count_test_block = number_of_iterations;                           \
+  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
+  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
+  int err_count_total = 0;                                                     \
+  int first_failure = -1;                                                      \
+  for (int i = 0; i < count_test_block; ++i) {                                 \
+    int err_count = 0;                                                         \
+    uint8_t tmp = GetOuterThresh(&rnd);                                        \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
+    tmp = GetInnerThresh(&rnd);                                                \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
+    tmp = GetHevThresh(&rnd);                                                  \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
+    int32_t p = kNumCoeffs / 32;                                               \
+    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
+      s[j] = rnd.Rand16() & mask_;                                             \
+      ref_s[j] = s[j];                                                         \
+    }                                                                          \
+    call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_,       \
+                ref_loopfilter_op_);                                           \
+    ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,      \
+                                         thresh, bit_depth_, loopfilter_op_)); \
+    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
+      err_count += ref_s[j] != s[j];                                           \
+    }                                                                          \
+    if (err_count && !err_count_total) {                                       \
+      first_failure = i;                                                       \
+    }                                                                          \
+    err_count_total += err_count;                                              \
+  }                                                                            \
+  EXPECT_EQ(0, err_count_total)                                                \
+      << "Error: Loop8Test6Param, C output doesn't match SIMD "                \
+         "loopfilter output. "                                                 \
+      << "First failed at test case " << first_failure;
+
+TEST_P(Loop8Test6Param_hbd, ValueCheck) { VALCHECK(uint16_t, 16); }
+TEST_P(Loop8Test6Param_lbd, ValueCheck) { VALCHECK(uint8_t, 8); }
+
+#define SPEEDCHECK(a, b)                                                      \
+  ACMRandom rnd(ACMRandom::DeterministicSeed());                              \
+  const int count_test_block = kSpeedTestNum;                                 \
+  const int32_t bd = bit_depth_;                                              \
+  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                       \
+  uint8_t tmp = GetOuterThresh(&rnd);                                         \
+  DECLARE_ALIGNED(16, const uint8_t,                                          \
+                  blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
+                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
+  tmp = GetInnerThresh(&rnd);                                                 \
+  DECLARE_ALIGNED(16, const uint8_t,                                          \
+                  limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,      \
+                                 tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };    \
+  tmp = GetHevThresh(&rnd);                                                   \
+  DECLARE_ALIGNED(16, const uint8_t,                                          \
+                  thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
+                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
+  int32_t p = kNumCoeffs / 32;                                                \
+  for (int j = 0; j < kNumCoeffs; ++j) {                                      \
+    s[j] = rnd.Rand16() & mask_;                                              \
+  }                                                                           \
+  for (int i = 0; i < count_test_block; ++i) {                                \
+    call_filter(s + 8 + p * 8, p, blimit, limit, thresh, bd, loopfilter_op_); \
+  }
+
+TEST_P(Loop8Test6Param_hbd, DISABLED_Speed) { SPEEDCHECK(uint16_t, 16); }
+TEST_P(Loop8Test6Param_lbd, DISABLED_Speed) { SPEEDCHECK(uint8_t, 8); }
+
+#define OPCHECKd(a, b)                                                         \
+  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
+  const int count_test_block = number_of_iterations;                           \
+  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
+  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
+  int err_count_total = 0;                                                     \
+  int first_failure = -1;                                                      \
+  for (int i = 0; i < count_test_block; ++i) {                                 \
+    int err_count = 0;                                                         \
+    uint8_t tmp = GetOuterThresh(&rnd);                                        \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    tmp = GetInnerThresh(&rnd);                                                \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
+    tmp = GetHevThresh(&rnd);                                                  \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    tmp = GetOuterThresh(&rnd);                                                \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    tmp = GetInnerThresh(&rnd);                                                \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
+    tmp = GetHevThresh(&rnd);                                                  \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    int32_t p = kNumCoeffs / 32;                                               \
+    const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1;               \
+    InitInput<a, b>(s, ref_s, &rnd, limit, mask_, p, i);                       \
+    call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
+                    limit1, thresh1, bit_depth_, ref_loopfilter_op_);          \
+    ASM_REGISTER_STATE_CHECK(                                                  \
+        call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
+                        limit1, thresh1, bit_depth_, loopfilter_op_));         \
+    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
+      err_count += ref_s[j] != s[j];                                           \
+    }                                                                          \
+    if (err_count && !err_count_total) {                                       \
+      first_failure = i;                                                       \
+    }                                                                          \
+    err_count_total += err_count;                                              \
+  }                                                                            \
+  EXPECT_EQ(0, err_count_total)                                                \
+      << "Error: Loop8Test9Param, C output doesn't match SIMD "                \
+         "loopfilter output. "                                                 \
+      << "First failed at test case " << first_failure;
+
+TEST_P(Loop8Test9Param_hbd, OperationCheck) { OPCHECKd(uint16_t, 16); }
+TEST_P(Loop8Test9Param_lbd, OperationCheck) { OPCHECKd(uint8_t, 8); }
+
+#define VALCHECKd(a, b)                                                        \
+  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
+  const int count_test_block = number_of_iterations;                           \
+  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
+  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
+  int err_count_total = 0;                                                     \
+  int first_failure = -1;                                                      \
+  for (int i = 0; i < count_test_block; ++i) {                                 \
+    int err_count = 0;                                                         \
+    uint8_t tmp = GetOuterThresh(&rnd);                                        \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    tmp = GetInnerThresh(&rnd);                                                \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
+    tmp = GetHevThresh(&rnd);                                                  \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    tmp = GetOuterThresh(&rnd);                                                \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    tmp = GetInnerThresh(&rnd);                                                \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
+    tmp = GetHevThresh(&rnd);                                                  \
+    DECLARE_ALIGNED(16, const uint8_t,                                         \
+                    thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    int32_t p = kNumCoeffs / 32;                                               \
+    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
+      s[j] = rnd.Rand16() & mask_;                                             \
+      ref_s[j] = s[j];                                                         \
+    }                                                                          \
+    call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
+                    limit1, thresh1, bit_depth_, ref_loopfilter_op_);          \
+    ASM_REGISTER_STATE_CHECK(                                                  \
+        call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
+                        limit1, thresh1, bit_depth_, loopfilter_op_));         \
+    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
+      err_count += ref_s[j] != s[j];                                           \
+    }                                                                          \
+    if (err_count && !err_count_total) {                                       \
+      first_failure = i;                                                       \
+    }                                                                          \
+    err_count_total += err_count;                                              \
+  }                                                                            \
+  EXPECT_EQ(0, err_count_total)                                                \
+      << "Error: Loop8Test9Param, C output doesn't match SIMD "                \
+         "loopfilter output. "                                                 \
+      << "First failed at test case " << first_failure;
+
+TEST_P(Loop8Test9Param_hbd, ValueCheck) { VALCHECKd(uint16_t, 16); }
+TEST_P(Loop8Test9Param_lbd, ValueCheck) { VALCHECKd(uint8_t, 8); }
+
+#define SPEEDCHECKd(a, b)                                                    \
+  ACMRandom rnd(ACMRandom::DeterministicSeed());                             \
+  const int count_test_block = kSpeedTestNum;                                \
+  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                      \
+  uint8_t tmp = GetOuterThresh(&rnd);                                        \
+  DECLARE_ALIGNED(16, const uint8_t,                                         \
+                  blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+  tmp = GetInnerThresh(&rnd);                                                \
+  DECLARE_ALIGNED(16, const uint8_t,                                         \
+                  limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
+  tmp = GetHevThresh(&rnd);                                                  \
+  DECLARE_ALIGNED(16, const uint8_t,                                         \
+                  thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+  tmp = GetOuterThresh(&rnd);                                                \
+  DECLARE_ALIGNED(16, const uint8_t,                                         \
+                  blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+  tmp = GetInnerThresh(&rnd);                                                \
+  DECLARE_ALIGNED(16, const uint8_t,                                         \
+                  limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
+                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
+  tmp = GetHevThresh(&rnd);                                                  \
+  DECLARE_ALIGNED(16, const uint8_t,                                         \
+                  thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+  int32_t p = kNumCoeffs / 32;                                               \
+  for (int j = 0; j < kNumCoeffs; ++j) {                                     \
+    s[j] = rnd.Rand16() & mask_;                                             \
+  }                                                                          \
+  for (int i = 0; i < count_test_block; ++i) {                               \
+    call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,     \
+                    limit1, thresh1, bit_depth_, loopfilter_op_);            \
+  }
+
+TEST_P(Loop8Test9Param_hbd, DISABLED_Speed) { SPEEDCHECKd(uint16_t, 16); }
+TEST_P(Loop8Test9Param_lbd, DISABLED_Speed) { SPEEDCHECKd(uint8_t, 8); }
+
+using ::testing::make_tuple;
+
+#if HAVE_SSE2
+
+const hbdloop_param_t kHbdLoop8Test6[] = {
+  make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
+             8),
+  make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
+             8),
+  make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
+             8),
+  make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
+             &aom_highbd_lpf_horizontal_14_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
+
+  make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
+             8),
+  make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
+             10),
+  make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
+             10),
+  make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
+             10),
+  make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
+             &aom_highbd_lpf_horizontal_14_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
+             10),
+  make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
+             12),
+  make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 12),
+  make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
+             12),
+  make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
+             12),
+  make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
+             &aom_highbd_lpf_horizontal_14_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
+             12),
+  make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_hbd,
+                        ::testing::ValuesIn(kHbdLoop8Test6));
+
+const loop_param_t kLoop8Test6[] = {
+  make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8),
+  make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8),
+  make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8),
+  make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8),
+  make_tuple(&aom_lpf_horizontal_14_sse2, &aom_lpf_horizontal_14_c, 8),
+  make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
+  make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
+  make_tuple(&aom_lpf_vertical_14_sse2, &aom_lpf_vertical_14_c, 8),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_lbd,
+                        ::testing::ValuesIn(kLoop8Test6));
+
+const dual_loop_param_t kLoop8Test9[] = {
+  make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8),
+  make_tuple(&aom_lpf_vertical_4_dual_sse2, &aom_lpf_vertical_4_dual_c, 8),
+  make_tuple(&aom_lpf_horizontal_6_dual_sse2, &aom_lpf_horizontal_6_dual_c, 8),
+  make_tuple(&aom_lpf_vertical_6_dual_sse2, &aom_lpf_vertical_6_dual_c, 8),
+  make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8),
+  make_tuple(&aom_lpf_vertical_8_dual_sse2, &aom_lpf_vertical_8_dual_c, 8),
+  make_tuple(&aom_lpf_horizontal_14_dual_sse2, &aom_lpf_horizontal_14_dual_c,
+             8),
+  make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_lbd,
+                        ::testing::ValuesIn(kLoop8Test9));
+
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE2
+const hbddual_loop_param_t kHbdLoop8Test9[] = {
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
+             &aom_highbd_lpf_horizontal_4_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
+             &aom_highbd_lpf_horizontal_6_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
+             &aom_highbd_lpf_horizontal_8_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+             &aom_highbd_lpf_horizontal_14_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
+             &aom_highbd_lpf_vertical_4_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
+             &aom_highbd_lpf_vertical_6_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
+             &aom_highbd_lpf_vertical_8_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+             &aom_highbd_lpf_vertical_14_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
+             &aom_highbd_lpf_horizontal_4_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
+             &aom_highbd_lpf_horizontal_6_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
+             &aom_highbd_lpf_horizontal_8_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+             &aom_highbd_lpf_horizontal_14_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
+             &aom_highbd_lpf_vertical_4_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
+             &aom_highbd_lpf_vertical_6_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
+             &aom_highbd_lpf_vertical_8_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+             &aom_highbd_lpf_vertical_14_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
+             &aom_highbd_lpf_horizontal_4_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
+             &aom_highbd_lpf_horizontal_6_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
+             &aom_highbd_lpf_horizontal_8_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+             &aom_highbd_lpf_horizontal_14_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
+             &aom_highbd_lpf_vertical_4_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
+             &aom_highbd_lpf_vertical_6_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
+             &aom_highbd_lpf_vertical_8_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+             &aom_highbd_lpf_vertical_14_dual_c, 12),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_hbd,
+                        ::testing::ValuesIn(kHbdLoop8Test9));
+
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+const loop_param_t kLoop8Test6[] = {
+  make_tuple(&aom_lpf_vertical_14_neon, &aom_lpf_vertical_14_c, 8),
+  make_tuple(&aom_lpf_vertical_8_neon, &aom_lpf_vertical_8_c, 8),
+  make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8),
+  make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8)
+};
+
+INSTANTIATE_TEST_CASE_P(NEON, Loop8Test6Param_lbd,
+                        ::testing::ValuesIn(kLoop8Test6));
+#endif  // HAVE_NEON
+
+#if HAVE_AVX2
+const hbddual_loop_param_t kHbdLoop8Test9Avx2[] = {
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+             &aom_highbd_lpf_horizontal_4_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+             &aom_highbd_lpf_horizontal_4_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+             &aom_highbd_lpf_horizontal_4_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+             &aom_highbd_lpf_horizontal_8_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+             &aom_highbd_lpf_horizontal_8_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+             &aom_highbd_lpf_horizontal_8_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+             &aom_highbd_lpf_vertical_4_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+             &aom_highbd_lpf_vertical_4_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+             &aom_highbd_lpf_vertical_4_dual_c, 12),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+             &aom_highbd_lpf_vertical_8_dual_c, 8),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+             &aom_highbd_lpf_vertical_8_dual_c, 10),
+  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+             &aom_highbd_lpf_vertical_8_dual_c, 12),
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, Loop8Test9Param_hbd,
+                        ::testing::ValuesIn(kHbdLoop8Test9Avx2));
+#endif
+}  // namespace
diff --git a/third_party/aom/test/masked_sad_test.cc b/third_party/aom/test/masked_sad_test.cc
index 19f97718d..1a393a001 100644
--- a/third_party/aom/test/masked_sad_test.cc
+++ b/third_party/aom/test/masked_sad_test.cc
@@ -18,8 +18,9 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 using libaom_test::ACMRandom;
@@ -32,7 +33,7 @@ typedef unsigned int (*MaskedSADFunc)(const uint8_t *src, int src_stride,
                                       const uint8_t *second_pred,
                                       const uint8_t *msk, int msk_stride,
                                       int invert_mask);
-typedef std::tr1::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
+typedef ::testing::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
 
 class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> {
  public:
@@ -89,13 +90,12 @@ TEST_P(MaskedSADTest, OperationCheck) {
       << "First failed at test case " << first_failure;
 }
 
-#if CONFIG_HIGHBITDEPTH
 typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride,
                                             const uint8_t *second_pred,
                                             const uint8_t *msk, int msk_stride,
                                             int invert_mask);
-typedef std::tr1::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
+typedef ::testing::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
     HighbdMaskedSADParam;
 
 class HighbdMaskedSADTest
@@ -155,17 +155,14 @@ TEST_P(HighbdMaskedSADTest, OperationCheck) {
       << "Error: High BD Masked SAD Test, C output doesn't match SSSE3 output. "
       << "First failed at test case " << first_failure;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
 
 #if HAVE_SSSE3
 const MaskedSADParam msad_test[] = {
-#if CONFIG_EXT_PARTITION
   make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
   make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
   make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
-#endif  // CONFIG_EXT_PARTITION
   make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c),
   make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c),
   make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c),
@@ -183,16 +180,13 @@ const MaskedSADParam msad_test[] = {
 
 INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, MaskedSADTest,
                         ::testing::ValuesIn(msad_test));
-#if CONFIG_HIGHBITDEPTH
 const HighbdMaskedSADParam hbd_msad_test[] = {
-#if CONFIG_EXT_PARTITION
   make_tuple(&aom_highbd_masked_sad128x128_ssse3,
              &aom_highbd_masked_sad128x128_c),
   make_tuple(&aom_highbd_masked_sad128x64_ssse3,
              &aom_highbd_masked_sad128x64_c),
   make_tuple(&aom_highbd_masked_sad64x128_ssse3,
              &aom_highbd_masked_sad64x128_c),
-#endif  // CONFIG_EXT_PARTITION
   make_tuple(&aom_highbd_masked_sad64x64_ssse3, &aom_highbd_masked_sad64x64_c),
   make_tuple(&aom_highbd_masked_sad64x32_ssse3, &aom_highbd_masked_sad64x32_c),
   make_tuple(&aom_highbd_masked_sad32x64_ssse3, &aom_highbd_masked_sad32x64_c),
@@ -210,6 +204,5 @@ const HighbdMaskedSADParam hbd_msad_test[] = {
 
 INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, HighbdMaskedSADTest,
                         ::testing::ValuesIn(hbd_msad_test));
-#endif  // CONFIG_HIGHBITDEPTH
 #endif  // HAVE_SSSE3
 }  // namespace
diff --git a/third_party/aom/test/masked_variance_test.cc b/third_party/aom/test/masked_variance_test.cc
index a9cbdc80d..275b9feb6 100644
--- a/third_party/aom/test/masked_variance_test.cc
+++ b/third_party/aom/test/masked_variance_test.cc
@@ -19,8 +19,9 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_codec.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_filter.h"
@@ -36,7 +37,7 @@ typedef unsigned int (*MaskedSubPixelVarianceFunc)(
     const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
     const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
 
-typedef std::tr1::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
+typedef ::testing::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
     MaskedSubPixelVarianceParam;
 
 class MaskedSubPixelVarianceTest
@@ -169,9 +170,8 @@ TEST_P(MaskedSubPixelVarianceTest, ExtremeValues) {
                           << " y_offset = " << first_failure_y;
 }
 
-#if CONFIG_HIGHBITDEPTH
-typedef std::tr1::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
-                        aom_bit_depth_t>
+typedef ::testing::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
+                         aom_bit_depth_t>
     HighbdMaskedSubPixelVarianceParam;
 
 class HighbdMaskedSubPixelVarianceTest
@@ -311,21 +311,18 @@ TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) {
                           << " x_offset = " << first_failure_x
                           << " y_offset = " << first_failure_y;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
 
 #if HAVE_SSSE3
 
 const MaskedSubPixelVarianceParam sub_pel_var_test[] = {
-#if CONFIG_EXT_PARTITION
   make_tuple(&aom_masked_sub_pixel_variance128x128_ssse3,
              &aom_masked_sub_pixel_variance128x128_c),
   make_tuple(&aom_masked_sub_pixel_variance128x64_ssse3,
              &aom_masked_sub_pixel_variance128x64_c),
   make_tuple(&aom_masked_sub_pixel_variance64x128_ssse3,
              &aom_masked_sub_pixel_variance64x128_c),
-#endif  // CONFIG_EXT_PARTITION
   make_tuple(&aom_masked_sub_pixel_variance64x64_ssse3,
              &aom_masked_sub_pixel_variance64x64_c),
   make_tuple(&aom_masked_sub_pixel_variance64x32_ssse3,
@@ -357,16 +354,13 @@ const MaskedSubPixelVarianceParam sub_pel_var_test[] = {
 INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
                         ::testing::ValuesIn(sub_pel_var_test));
 
-#if CONFIG_HIGHBITDEPTH
 const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = {
-#if CONFIG_EXT_PARTITION
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x128_ssse3,
              &aom_highbd_8_masked_sub_pixel_variance128x128_c, AOM_BITS_8),
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x64_ssse3,
              &aom_highbd_8_masked_sub_pixel_variance128x64_c, AOM_BITS_8),
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x128_ssse3,
              &aom_highbd_8_masked_sub_pixel_variance64x128_c, AOM_BITS_8),
-#endif  // CONFIG_EXT_PARTITION
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x64_ssse3,
              &aom_highbd_8_masked_sub_pixel_variance64x64_c, AOM_BITS_8),
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x32_ssse3,
@@ -393,14 +387,12 @@ const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = {
              &aom_highbd_8_masked_sub_pixel_variance4x8_c, AOM_BITS_8),
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x4_ssse3,
              &aom_highbd_8_masked_sub_pixel_variance4x4_c, AOM_BITS_8),
-#if CONFIG_EXT_PARTITION
   make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x128_ssse3,
              &aom_highbd_10_masked_sub_pixel_variance128x128_c, AOM_BITS_10),
   make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x64_ssse3,
              &aom_highbd_10_masked_sub_pixel_variance128x64_c, AOM_BITS_10),
   make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x128_ssse3,
              &aom_highbd_10_masked_sub_pixel_variance64x128_c, AOM_BITS_10),
-#endif  // CONFIG_EXT_PARTITION
   make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x64_ssse3,
              &aom_highbd_10_masked_sub_pixel_variance64x64_c, AOM_BITS_10),
   make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x32_ssse3,
@@ -427,14 +419,12 @@ const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = {
              &aom_highbd_10_masked_sub_pixel_variance4x8_c, AOM_BITS_10),
   make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x4_ssse3,
              &aom_highbd_10_masked_sub_pixel_variance4x4_c, AOM_BITS_10),
-#if CONFIG_EXT_PARTITION
   make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x128_ssse3,
              &aom_highbd_12_masked_sub_pixel_variance128x128_c, AOM_BITS_12),
   make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x64_ssse3,
              &aom_highbd_12_masked_sub_pixel_variance128x64_c, AOM_BITS_12),
   make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x128_ssse3,
              &aom_highbd_12_masked_sub_pixel_variance64x128_c, AOM_BITS_12),
-#endif  // CONFIG_EXT_PARTITION
   make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x64_ssse3,
              &aom_highbd_12_masked_sub_pixel_variance64x64_c, AOM_BITS_12),
   make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x32_ssse3,
@@ -465,7 +455,5 @@ const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = {
 
 INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
                         ::testing::ValuesIn(hbd_sub_pel_var_test));
-#endif  // CONFIG_HIGHBITDEPTH
-
 #endif  // HAVE_SSSE3
 }  // namespace
diff --git a/third_party/aom/test/md5_helper.h b/third_party/aom/test/md5_helper.h
index 8c9d4f706..b2b14cf21 100644
--- a/third_party/aom/test/md5_helper.h
+++ b/third_party/aom/test/md5_helper.h
@@ -12,8 +12,8 @@
 #ifndef TEST_MD5_HELPER_H_
 #define TEST_MD5_HELPER_H_
 
-#include "./md5_utils.h"
 #include "aom/aom_decoder.h"
+#include "common/md5_utils.h"
 
 namespace libaom_test {
 class MD5 {
diff --git a/third_party/aom/test/metrics_template.html b/third_party/aom/test/metrics_template.html
new file mode 100644
index 000000000..b57c62314
--- /dev/null
+++ b/third_party/aom/test/metrics_template.html
@@ -0,0 +1,422 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>Video Codec Test Results</title>
+<style type="text/css">
+<!-- Begin 960 reset -->
+a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,c
+ode,dd,del,details,dfn,dialog,div,dl,dt,em,embed,fieldset,figcaption,figure,font,footer,form,h1,h2,h
+3,h4,h5,h6,header,hgroup,hr,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,meter,nav,object,ol,
+output,p,pre,progress,q,rp,rt,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbo
+dy,td,tfoot,th,thead,time,tr,tt,u,ul,var,video,xmp{border:0;margin:0;padding:0;font-size:100%}html,b
+ody{height:100%}article,aside,details,figcaption,figure,footer,header,hgroup,menu,nav,section{displa
+y:block}b,strong{font-weight:bold}img{color:transparent;font-size:0;vertical-align:middle;-ms-interp
+olation-mode:bicubic}ol,ul{list-style:none}li{display:list-item}table{border-collapse:collapse;borde
+r-spacing:0}th,td,caption{font-weight:normal;vertical-align:top;text-align:left}q{quotes:none}q:befo
+re,q:after{content:'';content:none}sub,sup,small{font-size:75%}sub,sup{line-height:0;position:relati
+ve;vertical-align:baseline}sub{bottom:-0.25em}sup{top:-0.5em}svg{overflow:hidden}
+<!-- End 960 reset -->
+<!-- Begin 960 text -->
+body{font:13px/1.5 'Helvetica Neue',Arial,'Liberation Sans',FreeSans,sans-serif}pre,code{font-family
+:'DejaVu Sans Mono',Menlo,Consolas,monospace}hr{border:0 #ccc solid;border-top-width:1px;clear:both;
+height:0}h1{font-size:25px}h2{font-size:23px}h3{font-size:21px}h4{font-size:19px}h5{font-size:17px}h
+6{font-size:15px}ol{list-style:decimal}ul{list-style:disc}li{margin-left:30px}p,dl,hr,h1,h2,h3,h4,h5
+,h6,ol,ul,pre,table,address,fieldset,figure{margin-bottom:20px}
+<!-- End 960 text -->
+<!-- Begin 960 grid (fluid variant)
+     12 columns, 1152px total width
+     http://960.gs/ | http://grids.heroku.com/ -->
+.container_12{width:92%;margin-left:4%;margin-right:4%}.grid_1,.grid_2,.grid_3,.grid_4,.grid_5,.grid
+_6,.grid_7,.grid_8,.grid_9,.grid_10,.grid_11,.grid_12{display:inline;float:left;position:relative;ma
+rgin-left:1%;margin-right:1%}.alpha{margin-left:0}.omega{margin-right:0}.container_12 .grid_1{width:
+6.333%}.container_12 .grid_2{width:14.667%}.container_12 .grid_3{width:23.0%}.container_12 .grid_4{w
+idth:31.333%}.container_12 .grid_5{width:39.667%}.container_12 .grid_6{width:48.0%}.container_12 .gr
+id_7{width:56.333%}.container_12 .grid_8{width:64.667%}.container_12 .grid_9{width:73.0%}.container_
+12 .grid_10{width:81.333%}.container_12 .grid_11{width:89.667%}.container_12 .grid_12{width:98.0%}.c
+ontainer_12 .prefix_1{padding-left:8.333%}.container_12 .prefix_2{padding-left:16.667%}.container_12
+ .prefix_3{padding-left:25.0%}.container_12 .prefix_4{padding-left:33.333%}.container_12 .prefix_5{p
+adding-left:41.667%}.container_12 .prefix_6{padding-left:50.0%}.container_12 .prefix_7{padding-left:
+58.333%}.container_12 .prefix_8{padding-left:66.667%}.container_12 .prefix_9{padding-left:75.0%}.con
+tainer_12 .prefix_10{padding-left:83.333%}.container_12 .prefix_11{padding-left:91.667%}.container_1
+2 .suffix_1{padding-right:8.333%}.container_12 .suffix_2{padding-right:16.667%}.container_12 .suffix
+_3{padding-right:25.0%}.container_12 .suffix_4{padding-right:33.333%}.container_12 .suffix_5{padding
+-right:41.667%}.container_12 .suffix_6{padding-right:50.0%}.container_12 .suffix_7{padding-right:58.
+333%}.container_12 .suffix_8{padding-right:66.667%}.container_12 .suffix_9{padding-right:75.0%}.cont
+ainer_12 .suffix_10{padding-right:83.333%}.container_12 .suffix_11{padding-right:91.667%}.container_
+12 .push_1{left:8.333%}.container_12 .push_2{left:16.667%}.container_12 .push_3{left:25.0%}.containe
+r_12 .push_4{left:33.333%}.container_12 .push_5{left:41.667%}.container_12 .push_6{left:50.0%}.conta
+iner_12 .push_7{left:58.333%}.container_12 .push_8{left:66.667%}.container_12 .push_9{left:75.0%}.co
+ntainer_12 .push_10{left:83.333%}.container_12 .push_11{left:91.667%}.container_12 .pull_1{left:-8.3
+33%}.container_12 .pull_2{left:-16.667%}.container_12 .pull_3{left:-25.0%}.container_12 .pull_4{left
+:-33.333%}.container_12 .pull_5{left:-41.667%}.container_12 .pull_6{left:-50.0%}.container_12 .pull_
+7{left:-58.333%}.container_12 .pull_8{left:-66.667%}.container_12 .pull_9{left:-75.0%}.container_12
+.pull_10{left:-83.333%}.container_12 .pull_11{left:-91.667%}.clear{clear:both;display:block;overflow
+:hidden;visibility:hidden;width:0;height:0}.clearfix:after{clear:both;content:' ';display:block;font
+-size:0;line-height:0;visibility:hidden;width:0;height:0}.clearfix{display:inline-block}* html .clea
+rfix{height:1%}.clearfix{display:block}
+<!-- End 960 grid -->
+
+div.metricgraph {
+
+}
+
+body {
+
+}
+
+div.header {
+  font-family: Arial, sans-serif;
+}
+
+div.header h2 {
+  margin: .5em auto;
+}
+
+div.radio {
+  font-family: Arial, sans-serif;
+  margin-bottom: 1em;
+}
+
+div.main {
+
+}
+
+div.cliplist {
+  font-family: Arial, sans-serif;
+  margin-top: 6px;
+}
+
+div.chartarea {
+  font-family: Arial, sans-serif;
+}
+
+div.indicators {
+  font-family: Arial, sans-serif;
+  font-size: 13px;
+  margin-top: 6px;
+  min-height: 600px;
+  background-color: #f7f7f7;
+}
+
+div.indicators div.content {
+  margin: 1em;
+}
+
+div.indicators div.content h5 {
+  font-size: 13px;
+  text-align: center;
+  margin: 0;
+}
+
+div.indicators div.content ul {
+  margin-left: 0;
+  padding-left: 0;
+  margin-top: 0;
+}
+
+div.indicators div.content ul li {
+  margin-left: 1.5em;
+}
+
+div.indicators div.content p:first-child {
+  margin-bottom: .5em;
+}
+
+span.google-visualization-table-sortind {
+  color: #000;
+}
+.header-style {
+  font-weight: bold;
+  border: 1px solid #fff;
+  background-color: #ccc;
+}
+
+td.header-style+td {
+
+}
+
+.orange-background {
+  background-color: orange;
+}
+
+.light-gray-background {
+  background-color: #f0f0f0;
+}
+</style>
+<script type="text/javascript" src="https://www.google.com/jsapi"></script>
+<script type="text/javascript">
+var chart_left   = 40;
+var chart_top    = 6;
+var chart_height = document.documentElement.clientHeight-100;
+var chart_width  = "100%";
+ftable='filestable_avg'
+var snrs = [];
+var filestable_dsnr = [];
+var filestable_drate = [];
+var filestable_avg = [];
+
+// Python template code replaces the following 2 lines.
+//%%metrics_js%%//
+//%%filestable_dpsnr%%//
+//%%filestable_avg%%//
+//%%filestable_drate%%//
+//%%snrs%%//
+
+var selected = 0
+var imagestr = '';
+var bettertable=0;
+var chart=0;
+var better=0;
+var metricdata=0;
+var metricView=0;
+var column=1;
+var formatter=0;
+
+function changeColumn(col) {
+  column = col;
+  console.log(col)
+  draw_files();
+}
+
+function changeMetric(m) {
+  ftable=m
+  draw_files()
+}
+
+function setup_vis() {
+  chart = new google.visualization.ScatterChart(
+      document.getElementById("metricgraph"));
+
+  bettertable = new google.visualization.Table(
+      document.getElementById("bettertable"));
+
+  draw_files();
+  build_metrics_radio();
+}
+
+function build_metrics_radio() {
+  for (metric=1; metric < metrics.length; metric++) {
+    var rb = document.createElement('input');
+    var l = document.createElement('label');
+    rb.setAttribute('type','radio');
+    rb.setAttribute('name','metric');
+    rb.setAttribute('onClick', "changeColumn('"+metric.toString()+"')");
+    l.innerHTML = metrics[metric];
+    document.getElementById('metrics').appendChild(rb);
+    document.getElementById('metrics').appendChild(l);
+  }
+}
+
+function draw_files() {
+  var options = {'allowHtml': true, 'width': "100%", 'height': "50%"};
+  if (better != 0) delete better;
+
+  col=eval(ftable+'[column]')
+  better = new google.visualization.DataTable(col)
+
+  // Python Template code replaces the following line with a list of
+  // formatters.
+  if (ftable == 'filestable_dsnr')
+    formatter = new google.visualization.NumberFormat(
+      {fractionDigits: 4, suffix:" db"});
+  else
+    formatter = new google.visualization.NumberFormat(
+       {fractionDigits: 4, suffix:"%"});
+
+  //%%formatters%%//
+
+  bettertable.draw(better,options);
+  google.visualization.events.addListener(bettertable, 'select',
+                                          selectBetterHandler);
+  query_file()
+}
+
+function query_file() {
+  imagestr = better.getFormattedValue(selected, 0)
+  var metricjson = eval('(' + snrs[column][selected] + ')');
+  metricdata = new google.visualization.DataTable(metricjson, 0.6);
+  if( metricView != 0 ) delete metricView;
+  metricView = new google.visualization.DataView(metricdata);
+
+  chart.draw(metricView, {curveType:'function',
+      explorer: {},
+      chartArea:{left:chart_left, top:chart_top, width:chart_width,
+      height:chart_height-90},
+      hAxis:{title:"Datarate in kbps"},
+      vAxis:{title:"Quality in decibels", format: '##.0', textPosition: 'in'},
+      legend:{position:"in"}, title:imagestr, pointSize:2, lineWidth:1,
+      width:chart_width, height:chart_height-50 });
+
+  google.visualization.events.addListener(chart, 'select', chartSelect);
+  google.visualization.events.addListener(chart, 'onmouseover', chartMouseOver);
+  google.visualization.events.addListener(chart, 'onmouseout', chartMouseOut);
+}
+
+function chartMouseOut(e) {
+  statusbar = document.getElementById('status');
+  statusbar.style.display = 'none';
+}
+
+function chartMouseOver(e) {
+  pointDifference(e.row, e.column)
+}
+
+function pointDifference(row, col) {
+  if(!row || !col)
+    return;
+
+  var cols = metricdata.getNumberOfColumns();
+  var rows = metricdata.getNumberOfRows();
+
+  var sel_bitrate = metricView.getValue(row, 0 );
+  var sel_metric = metricView.getValue(row, col);
+
+  var message = '<ul>' + metricView.getColumnLabel(col) +
+     ' (' + sel_bitrate.toFixed(0) + ' kbps, ' + sel_metric.toFixed(2) + ')' + ' is ';
+
+
+  // col 0 is datarate
+  for( var i=1;i<cols;++i) {
+
+    var metric_greatest_thats_less = 0;
+    var rate_greatest_thats_less = 0;
+    var metric_smallest_thats_greater = 999;
+    var rate_smallest_thats_greater = 0;
+
+    if(i==col)
+      continue;
+
+    // Find the lowest metric for the column that's greater than sel_metric and
+    // the highest metric for this column that's less than the metric.
+    for(var line_count = 0; line_count < rows; ++line_count) {
+      this_metric = metricdata.getValue(line_count, i)
+      this_rate = metricdata.getValue(line_count, 0)
+      if(!this_metric)
+        continue;
+
+      if(this_metric > metric_greatest_thats_less &&
+         this_metric <= sel_metric) {
+        metric_greatest_thats_less = this_metric;
+        rate_greatest_thats_less = this_rate;
+      }
+      if(this_metric < metric_smallest_thats_greater &&
+        this_metric > sel_metric) {
+        metric_smallest_thats_greater = this_metric;
+        rate_smallest_thats_greater = this_rate;
+      }
+    }
+
+    if(rate_smallest_thats_greater == 0 || rate_greatest_thats_less == 0) {
+      message = message + " <li> Couldn't find a point on both sides.</li>"
+    } else {
+      metric_slope = ( rate_smallest_thats_greater - rate_greatest_thats_less) /
+          ( metric_smallest_thats_greater - metric_greatest_thats_less);
+
+      projected_rate = ( sel_metric - metric_greatest_thats_less) *
+          metric_slope + rate_greatest_thats_less;
+
+      difference = 100 * (projected_rate / sel_bitrate - 1);
+
+
+      if (difference > 0)
+        message = message + "<li>  " + difference.toFixed(2) +
+                  "% smaller than <em>" +
+                  metricdata.getColumnLabel(i) + "</em></li> "
+      else
+        message = message + "<li>  " + -difference.toFixed(2) +
+                  "% bigger than <em>" +
+                  metricdata.getColumnLabel(i) + "</em></li> "
+    }
+
+  }
+  message = message + "</ul>"
+  statusbar = document.getElementById('status');
+  statusbar.innerHTML = "<p>" + message + "</p>";
+  statusbar.style.display = 'block';
+}
+
+function chartSelect() {
+  var selection = chart.getSelection();
+  var message = '';
+  var min = metricView.getFormattedValue(selection[0].row, 0);
+  var max = metricView.getFormattedValue(selection[selection.length-1].row, 0);
+  var val = metricView.getFormattedValue(selection[0].row,selection[0].column);
+
+  pointDifference(selection[0].row, selection[0].column)
+  min = min / 3
+  max = max * 3
+  metricView.setRows(metricdata.getFilteredRows(
+      [{column: 0,minValue: min, maxValue:max}]));
+
+  chart.draw(metricView, {curveType:'function',
+      chartArea:{left:40, top:10, width:chart_width, height:chart_height - 110},
+      hAxis:{title:"datarate in kbps"}, vAxis:{title:"quality in decibels"},
+      legend:{position:"in"}, title:imagestr, pointSize:2, lineWidth:1,
+      width:chart_width, height:chart_height - 50});
+}
+
+function selectBetterHandler() {
+  var selection = bettertable.getSelection();
+  for (var i = 0; i < selection.length; i++) {
+    item = selection[i];
+  }
+  selected = item.row
+  query_file()
+}
+
+
+google.load('visualization', '1', {'packages' : ['corechart','table']});
+google.setOnLoadCallback(setup_vis);
+</script>
+</head>
+
+<body>
+
+  <div class="container_12">
+
+    <div class="grid_12 header">
+      <h2>Codec Comparison Results</h2>
+    </div>
+
+    <div class="grid_12 radio">
+
+      <form name="myform">
+        Method For Combining Points
+        <input type="radio" checked name="column" value="1"
+          onClick="changeMetric('filestable_avg')" />Average of bitrates difference
+        <input type="radio" name="column" value="2"
+          onClick="changeMetric('filestable_dsnr')" />BDSNR
+        <input type="radio" name="column" value="3"
+          onClick="changeMetric('filestable_drate')" />BDRATE
+      </form>
+
+      <form id="metrics" name="myform">
+      </form>
+
+    </div>
+
+    <div class="grid_12 main">
+
+      <div class="grid_5 alpha cliplist">
+        <div id="bettertable"></div>
+      </div>
+
+      <div class="grid_5 chartarea">
+        <div id="metricgraph"></div>
+      </div>
+
+      <div class="grid_2 omega indicators">
+        <div class="content">
+          <h5>Indicators</h5>
+          <hr>
+          <div id="status"></div>
+        </div>
+      </div>
+
+    </div>
+
+  </div>
+
+</body>
+</html>
diff --git a/third_party/aom/test/minmax_test.cc b/third_party/aom/test/minmax_test.cc
deleted file mode 100644
index aaac72c65..000000000
--- a/third_party/aom/test/minmax_test.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-
-namespace {
-
-using ::libaom_test::ACMRandom;
-
-typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b,
-                           int b_stride, int *min, int *max);
-
-class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
- public:
-  virtual void SetUp() {
-    mm_func_ = GetParam();
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-  }
-
- protected:
-  MinMaxFunc mm_func_;
-  ACMRandom rnd_;
-};
-
-void reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b,
-                      int b_stride, int *min_ret, int *max_ret) {
-  int min = 255;
-  int max = 0;
-  for (int i = 0; i < 8; i++) {
-    for (int j = 0; j < 8; j++) {
-      const int diff = abs(a[i * a_stride + j] - b[i * b_stride + j]);
-      if (min > diff) min = diff;
-      if (max < diff) max = diff;
-    }
-  }
-
-  *min_ret = min;
-  *max_ret = max;
-}
-
-TEST_P(MinMaxTest, MinValue) {
-  for (int i = 0; i < 64; i++) {
-    uint8_t a[64], b[64];
-    memset(a, 0, sizeof(a));
-    memset(b, 255, sizeof(b));
-    b[i] = i;  // Set a minimum difference of i.
-
-    int min, max;
-    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
-    EXPECT_EQ(255, max);
-    EXPECT_EQ(i, min);
-  }
-}
-
-TEST_P(MinMaxTest, MaxValue) {
-  for (int i = 0; i < 64; i++) {
-    uint8_t a[64], b[64];
-    memset(a, 0, sizeof(a));
-    memset(b, 0, sizeof(b));
-    b[i] = i;  // Set a maximum difference of i.
-
-    int min, max;
-    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
-    EXPECT_EQ(i, max);
-    EXPECT_EQ(0, min);
-  }
-}
-
-TEST_P(MinMaxTest, CompareReference) {
-  uint8_t a[64], b[64];
-  for (int j = 0; j < 64; j++) {
-    a[j] = rnd_.Rand8();
-    b[j] = rnd_.Rand8();
-  }
-
-  int min_ref, max_ref, min, max;
-  reference_minmax(a, 8, b, 8, &min_ref, &max_ref);
-  ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
-  EXPECT_EQ(max_ref, max);
-  EXPECT_EQ(min_ref, min);
-}
-
-TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
-  uint8_t a[8 * 64], b[8 * 64];
-  for (int i = 0; i < 8 * 64; i++) {
-    a[i] = rnd_.Rand8();
-    b[i] = rnd_.Rand8();
-  }
-  for (int a_stride = 8; a_stride <= 64; a_stride += 8) {
-    for (int b_stride = 8; b_stride <= 64; b_stride += 8) {
-      int min_ref, max_ref, min, max;
-      reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
-      ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
-      EXPECT_EQ(max_ref, max)
-          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
-      EXPECT_EQ(min_ref, min)
-          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
-    }
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(C, MinMaxTest, ::testing::Values(&aom_minmax_8x8_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, MinMaxTest,
-                        ::testing::Values(&aom_minmax_8x8_sse2));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest,
-                        ::testing::Values(&aom_minmax_8x8_neon));
-#endif
-
-}  // namespace
diff --git a/third_party/aom/test/monochrome_test.cc b/third_party/aom/test/monochrome_test.cc
new file mode 100644
index 000000000..ebccba584
--- /dev/null
+++ b/third_party/aom/test/monochrome_test.cc
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class MonochromeTest
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  MonochromeTest() : EncoderTest(GET_PARAM(0)), frame0_psnr_y_(0.) {}
+
+  virtual ~MonochromeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     aom_codec_pts_t pts) {
+    (void)pts;
+
+    // Get value of top-left corner pixel of U plane
+    int chroma_value = img.planes[AOM_PLANE_U][0];
+
+    bool is_chroma_constant =
+        ComparePlaneToValue(img, AOM_PLANE_U, chroma_value) &&
+        ComparePlaneToValue(img, AOM_PLANE_V, chroma_value);
+
+    // Chroma planes should be constant
+    EXPECT_TRUE(is_chroma_constant);
+
+    // Monochrome flag on image should be set
+    EXPECT_EQ(img.monochrome, 1);
+
+    chroma_value_list_.push_back(chroma_value);
+  }
+
+  // Returns true if all pixels on the plane are equal to value, and returns
+  // false otherwise.
+  bool ComparePlaneToValue(const aom_image_t &img, const int plane,
+                           const int value) {
+    const int w = aom_img_plane_width(&img, plane);
+    const int h = aom_img_plane_height(&img, plane);
+    const uint8_t *const buf = img.planes[plane];
+    const int stride = img.stride[plane];
+
+    for (int r = 0; r < h; ++r) {
+      for (int c = 0; c < w; ++c) {
+        if (buf[r * stride + c] != value) return false;
+      }
+    }
+    return true;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    // Check that the initial Y PSNR value is 'high enough', and check that
+    // subsequent Y PSNR values are 'close' to this initial value.
+    if (frame0_psnr_y_ == 0.) {
+      frame0_psnr_y_ = pkt->data.psnr.psnr[1];
+      EXPECT_GT(frame0_psnr_y_, 29.);
+    }
+    EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_, 2.5);
+  }
+
+  std::vector<int> chroma_value_list_;
+  double frame0_psnr_y_;
+};
+
+TEST_P(MonochromeTest, TestMonochromeEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 5);
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 600;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_undershoot_pct = 50;
+  cfg_.rc_overshoot_pct = 50;
+  cfg_.rc_end_usage = AOM_CBR;
+  cfg_.kf_mode = AOM_KF_AUTO;
+  cfg_.g_lag_in_frames = 1;
+  cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+  // Enable dropped frames.
+  cfg_.rc_dropframe_thresh = 1;
+  // Disable error_resilience mode.
+  cfg_.g_error_resilient = 0;
+  // Run at low bitrate.
+  cfg_.rc_target_bitrate = 40;
+  // Set monochrome encoding flag
+  cfg_.monochrome = 1;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Check that the chroma planes are equal across all frames
+  std::vector<int>::const_iterator iter = chroma_value_list_.begin();
+  int initial_chroma_value = *iter;
+  for (; iter != chroma_value_list_.end(); ++iter) {
+    // Check that all decoded frames have the same constant chroma planes.
+    EXPECT_EQ(*iter, initial_chroma_value);
+  }
+}
+
+AV1_INSTANTIATE_TEST_CASE(MonochromeTest,
+                          ::testing::Values(::libaom_test::kTwoPassGood));
+
+}  // namespace
diff --git a/third_party/aom/test/motion_vector_test.cc b/third_party/aom/test/motion_vector_test.cc
index fe20fd10a..27eb93893 100644
--- a/third_party/aom/test/motion_vector_test.cc
+++ b/third_party/aom/test/motion_vector_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -22,7 +22,8 @@ namespace {
 
 // Encoding modes
 const libaom_test::TestMode kEncodingModeVectors[] = {
-  ::libaom_test::kTwoPassGood, ::libaom_test::kOnePassGood,
+  ::libaom_test::kTwoPassGood,
+  ::libaom_test::kOnePassGood,
 };
 
 // Encoding speeds
@@ -82,7 +83,7 @@ TEST_P(MotionVectorTestLarge, OverallTest) {
   // Reduce the test clip's resolution while testing on 32-bit system.
   if (sizeof(void *) == 4) {
     width = 2048;
-    height = 1080;
+    height = 360;
   }
 
   cfg_.rc_target_bitrate = 24000;
diff --git a/third_party/aom/test/noise_model_test.cc b/third_party/aom/test/noise_model_test.cc
new file mode 100644
index 000000000..9b7fff8a2
--- /dev/null
+++ b/third_party/aom/test/noise_model_test.cc
@@ -0,0 +1,1332 @@
+#include <math.h>
+#include <algorithm>
+#include <vector>
+
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Return normally distrbuted values with standard deviation of sigma.
+double randn(libaom_test::ACMRandom *random, double sigma) {
+  while (1) {
+    const double u = 2.0 * ((double)random->Rand31() /
+                            testing::internal::Random::kMaxRange) -
+                     1.0;
+    const double v = 2.0 * ((double)random->Rand31() /
+                            testing::internal::Random::kMaxRange) -
+                     1.0;
+    const double s = u * u + v * v;
+    if (s > 0 && s < 1) {
+      return sigma * (u * sqrt(-2.0 * log(s) / s));
+    }
+  }
+  return 0;
+}
+
+// Synthesizes noise using the auto-regressive filter of the given lag,
+// with the provided n coefficients sampled at the given coords.
+void noise_synth(libaom_test::ACMRandom *random, int lag, int n,
+                 const int (*coords)[2], const double *coeffs, double *data,
+                 int w, int h) {
+  const int pad_size = 3 * lag;
+  const int padded_w = w + pad_size;
+  const int padded_h = h + pad_size;
+  int x = 0, y = 0;
+  std::vector<double> padded(padded_w * padded_h);
+
+  for (y = 0; y < padded_h; ++y) {
+    for (x = 0; x < padded_w; ++x) {
+      padded[y * padded_w + x] = randn(random, 1.0);
+    }
+  }
+  for (y = lag; y < padded_h; ++y) {
+    for (x = lag; x < padded_w; ++x) {
+      double sum = 0;
+      int i = 0;
+      for (i = 0; i < n; ++i) {
+        const int dx = coords[i][0];
+        const int dy = coords[i][1];
+        sum += padded[(y + dy) * padded_w + (x + dx)] * coeffs[i];
+      }
+      padded[y * padded_w + x] += sum;
+    }
+  }
+  // Copy over the padded rows to the output
+  for (y = 0; y < h; ++y) {
+    memcpy(data + y * w, &padded[0] + y * padded_w, sizeof(*data) * w);
+  }
+}
+
+std::vector<float> get_noise_psd(double *noise, int width, int height,
+                                 int block_size) {
+  float *block =
+      (float *)aom_memalign(32, block_size * block_size * sizeof(block));
+  std::vector<float> psd(block_size * block_size);
+  int num_blocks = 0;
+  struct aom_noise_tx_t *tx = aom_noise_tx_malloc(block_size);
+  for (int y = 0; y <= height - block_size; y += block_size / 2) {
+    for (int x = 0; x <= width - block_size; x += block_size / 2) {
+      for (int yy = 0; yy < block_size; ++yy) {
+        for (int xx = 0; xx < block_size; ++xx) {
+          block[yy * block_size + xx] = (float)noise[(y + yy) * width + x + xx];
+        }
+      }
+      aom_noise_tx_forward(tx, &block[0]);
+      aom_noise_tx_add_energy(tx, &psd[0]);
+      num_blocks++;
+    }
+  }
+  for (int yy = 0; yy < block_size; ++yy) {
+    for (int xx = 0; xx <= block_size / 2; ++xx) {
+      psd[yy * block_size + xx] /= num_blocks;
+    }
+  }
+  // Fill in the data that is missing due to symmetries
+  for (int xx = 1; xx < block_size / 2; ++xx) {
+    psd[(block_size - xx)] = psd[xx];
+  }
+  for (int yy = 1; yy < block_size; ++yy) {
+    for (int xx = 1; xx < block_size / 2; ++xx) {
+      psd[(block_size - yy) * block_size + (block_size - xx)] =
+          psd[yy * block_size + xx];
+    }
+  }
+  aom_noise_tx_free(tx);
+  aom_free(block);
+  return psd;
+}
+
+}  // namespace
+
+TEST(NoiseStrengthSolver, GetCentersTwoBins) {
+  aom_noise_strength_solver_t solver;
+  aom_noise_strength_solver_init(&solver, 2, 8);
+  EXPECT_NEAR(0, aom_noise_strength_solver_get_center(&solver, 0), 1e-5);
+  EXPECT_NEAR(255, aom_noise_strength_solver_get_center(&solver, 1), 1e-5);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, GetCentersTwoBins10bit) {
+  aom_noise_strength_solver_t solver;
+  aom_noise_strength_solver_init(&solver, 2, 10);
+  EXPECT_NEAR(0, aom_noise_strength_solver_get_center(&solver, 0), 1e-5);
+  EXPECT_NEAR(1023, aom_noise_strength_solver_get_center(&solver, 1), 1e-5);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, GetCenters256Bins) {
+  const int num_bins = 256;
+  aom_noise_strength_solver_t solver;
+  aom_noise_strength_solver_init(&solver, num_bins, 8);
+
+  for (int i = 0; i < 256; ++i) {
+    EXPECT_NEAR(i, aom_noise_strength_solver_get_center(&solver, i), 1e-5);
+  }
+  aom_noise_strength_solver_free(&solver);
+}
+
+// Tests that the noise strength solver returns the identity transform when
+// given identity-like constraints.
+TEST(NoiseStrengthSolver, ObserveIdentity) {
+  const int num_bins = 256;
+  aom_noise_strength_solver_t solver;
+  EXPECT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+
+  // We have to add a big more strength to constraints at the boundary to
+  // overcome any regularization.
+  for (int j = 0; j < 5; ++j) {
+    aom_noise_strength_solver_add_measurement(&solver, 0, 0);
+    aom_noise_strength_solver_add_measurement(&solver, 255, 255);
+  }
+  for (int i = 0; i < 256; ++i) {
+    aom_noise_strength_solver_add_measurement(&solver, i, i);
+  }
+  EXPECT_EQ(1, aom_noise_strength_solver_solve(&solver));
+  for (int i = 2; i < num_bins - 2; ++i) {
+    EXPECT_NEAR(i, solver.eqns.x[i], 0.1);
+  }
+
+  aom_noise_strength_lut_t lut;
+  EXPECT_EQ(1, aom_noise_strength_solver_fit_piecewise(&solver, 2, &lut));
+
+  ASSERT_EQ(2, lut.num_points);
+  EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+  EXPECT_NEAR(0.0, lut.points[0][1], 0.5);
+  EXPECT_NEAR(255.0, lut.points[1][0], 1e-5);
+  EXPECT_NEAR(255.0, lut.points[1][1], 0.5);
+
+  aom_noise_strength_lut_free(&lut);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, SimplifiesCurve) {
+  const int num_bins = 256;
+  aom_noise_strength_solver_t solver;
+  EXPECT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+
+  // Create a parabolic input
+  for (int i = 0; i < 256; ++i) {
+    const double x = (i - 127.5) / 63.5;
+    aom_noise_strength_solver_add_measurement(&solver, i, x * x);
+  }
+  EXPECT_EQ(1, aom_noise_strength_solver_solve(&solver));
+
+  // First try to fit an unconstrained lut
+  aom_noise_strength_lut_t lut;
+  EXPECT_EQ(1, aom_noise_strength_solver_fit_piecewise(&solver, -1, &lut));
+  ASSERT_LE(20, lut.num_points);
+  aom_noise_strength_lut_free(&lut);
+
+  // Now constrain the maximum number of points
+  const int kMaxPoints = 9;
+  EXPECT_EQ(1,
+            aom_noise_strength_solver_fit_piecewise(&solver, kMaxPoints, &lut));
+  ASSERT_EQ(kMaxPoints, lut.num_points);
+
+  // Check that the input parabola is still well represented
+  EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+  EXPECT_NEAR(4.0, lut.points[0][1], 0.1);
+  for (int i = 1; i < lut.num_points - 1; ++i) {
+    const double x = (lut.points[i][0] - 128.) / 64.;
+    EXPECT_NEAR(x * x, lut.points[i][1], 0.1);
+  }
+  EXPECT_NEAR(255.0, lut.points[kMaxPoints - 1][0], 1e-5);
+
+  EXPECT_NEAR(4.0, lut.points[kMaxPoints - 1][1], 0.1);
+  aom_noise_strength_lut_free(&lut);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthLut, LutEvalSinglePoint) {
+  aom_noise_strength_lut_t lut;
+  ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 1));
+  ASSERT_EQ(1, lut.num_points);
+  lut.points[0][0] = 0;
+  lut.points[0][1] = 1;
+  EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, -1));
+  EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, 0));
+  EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, 1));
+  aom_noise_strength_lut_free(&lut);
+}
+
+TEST(NoiseStrengthLut, LutEvalMultiPointInterp) {
+  const double kEps = 1e-5;
+  aom_noise_strength_lut_t lut;
+  ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 4));
+  ASSERT_EQ(4, lut.num_points);
+
+  lut.points[0][0] = 0;
+  lut.points[0][1] = 0;
+
+  lut.points[1][0] = 1;
+  lut.points[1][1] = 1;
+
+  lut.points[2][0] = 2;
+  lut.points[2][1] = 1;
+
+  lut.points[3][0] = 100;
+  lut.points[3][1] = 1001;
+
+  // Test lower boundary
+  EXPECT_EQ(0, aom_noise_strength_lut_eval(&lut, -1));
+  EXPECT_EQ(0, aom_noise_strength_lut_eval(&lut, 0));
+
+  // Test first part that should be identity
+  EXPECT_NEAR(0.25, aom_noise_strength_lut_eval(&lut, 0.25), kEps);
+  EXPECT_NEAR(0.75, aom_noise_strength_lut_eval(&lut, 0.75), kEps);
+
+  // This is a constant section (should evaluate to 1)
+  EXPECT_NEAR(1.0, aom_noise_strength_lut_eval(&lut, 1.25), kEps);
+  EXPECT_NEAR(1.0, aom_noise_strength_lut_eval(&lut, 1.75), kEps);
+
+  // Test interpolation between to non-zero y coords.
+  EXPECT_NEAR(1, aom_noise_strength_lut_eval(&lut, 2), kEps);
+  EXPECT_NEAR(251, aom_noise_strength_lut_eval(&lut, 26.5), kEps);
+  EXPECT_NEAR(751, aom_noise_strength_lut_eval(&lut, 75.5), kEps);
+
+  // Test upper boundary
+  EXPECT_EQ(1001, aom_noise_strength_lut_eval(&lut, 100));
+  EXPECT_EQ(1001, aom_noise_strength_lut_eval(&lut, 101));
+
+  aom_noise_strength_lut_free(&lut);
+}
+
+TEST(NoiseModel, InitSuccessWithValidSquareShape) {
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 2, 8, 0 };
+  aom_noise_model_t model;
+
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  const int kNumCoords = 12;
+  const int kCoords[][2] = { { -2, -2 }, { -1, -2 }, { 0, -2 },  { 1, -2 },
+                             { 2, -2 },  { -2, -1 }, { -1, -1 }, { 0, -1 },
+                             { 1, -1 },  { 2, -1 },  { -2, 0 },  { -1, 0 } };
+  EXPECT_EQ(kNumCoords, model.n);
+  for (int i = 0; i < kNumCoords; ++i) {
+    const int *coord = kCoords[i];
+    EXPECT_EQ(coord[0], model.coords[i][0]);
+    EXPECT_EQ(coord[1], model.coords[i][1]);
+  }
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitSuccessWithValidDiamondShape) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_DIAMOND, 2, 8, 0 };
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+  EXPECT_EQ(6, model.n);
+  const int kNumCoords = 6;
+  const int kCoords[][2] = { { 0, -2 }, { -1, -1 }, { 0, -1 },
+                             { 1, -1 }, { -2, 0 },  { -1, 0 } };
+  EXPECT_EQ(kNumCoords, model.n);
+  for (int i = 0; i < kNumCoords; ++i) {
+    const int *coord = kCoords[i];
+    EXPECT_EQ(coord[0], model.coords[i][0]);
+    EXPECT_EQ(coord[1], model.coords[i][1]);
+  }
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithTooLargeLag) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 10, 8, 0 };
+  EXPECT_FALSE(aom_noise_model_init(&model, params));
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithTooSmallLag) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 0, 8, 0 };
+  EXPECT_FALSE(aom_noise_model_init(&model, params));
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithInvalidShape) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { aom_noise_shape(100), 3, 8, 0 };
+  EXPECT_FALSE(aom_noise_model_init(&model, params));
+  aom_noise_model_free(&model);
+}
+
+// A container template class to hold a data type and extra arguments.
+// All of these args are bundled into one struct so that we can use
+// parameterized tests on combinations of supported data types
+// (uint8_t and uint16_t) and bit depths (8, 10, 12).
+template <typename T, int bit_depth, bool use_highbd>
+struct BitDepthParams {
+  typedef T data_type_t;
+  static const int kBitDepth = bit_depth;
+  static const bool kUseHighBD = use_highbd;
+};
+
+template <typename T>
+class FlatBlockEstimatorTest : public ::testing::Test, public T {
+ public:
+  virtual void SetUp() { random_.Reset(171); }
+  typedef std::vector<typename T::data_type_t> VecType;
+  VecType data_;
+  libaom_test::ACMRandom random_;
+};
+
+TYPED_TEST_CASE_P(FlatBlockEstimatorTest);
+
+TYPED_TEST_P(FlatBlockEstimatorTest, ExtractBlock) {
+  const int kBlockSize = 16;
+  aom_flat_block_finder_t flat_block_finder;
+  ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
+                                          this->kBitDepth, this->kUseHighBD));
+  const double normalization = flat_block_finder.normalization;
+
+  // Test with an image of more than one block.
+  const int h = 2 * kBlockSize;
+  const int w = 2 * kBlockSize;
+  const int stride = 2 * kBlockSize;
+  this->data_.resize(h * stride, 128);
+
+  // Set up the (0,0) block to be a plane and the (0,1) block to be a
+  // checkerboard
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      this->data_[y * stride + x] = (-y + x + 128) << shift;
+      this->data_[y * stride + x + kBlockSize] =
+          ((x % 2 + y % 2) % 2 ? 128 - 20 : 128 + 20) << shift;
+    }
+  }
+  std::vector<double> block(kBlockSize * kBlockSize, 1);
+  std::vector<double> plane(kBlockSize * kBlockSize, 1);
+
+  // The block data should be a constant (zero) and the rest of the plane
+  // trend is covered in the plane data.
+  aom_flat_block_finder_extract_block(&flat_block_finder,
+                                      (uint8_t *)&this->data_[0], w, h, stride,
+                                      0, 0, &plane[0], &block[0]);
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      EXPECT_NEAR(0, block[y * kBlockSize + x], 1e-5);
+      EXPECT_NEAR((double)(this->data_[y * stride + x]) / normalization,
+                  plane[y * kBlockSize + x], 1e-5);
+    }
+  }
+
+  // The plane trend is a constant, and the block is a zero mean checkerboard.
+  aom_flat_block_finder_extract_block(&flat_block_finder,
+                                      (uint8_t *)&this->data_[0], w, h, stride,
+                                      kBlockSize, 0, &plane[0], &block[0]);
+  const int mid = 128 << shift;
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      EXPECT_NEAR(((double)this->data_[y * stride + x + kBlockSize] - mid) /
+                      normalization,
+                  block[y * kBlockSize + x], 1e-5);
+      EXPECT_NEAR(mid / normalization, plane[y * kBlockSize + x], 1e-5);
+    }
+  }
+  aom_flat_block_finder_free(&flat_block_finder);
+}
+
+TYPED_TEST_P(FlatBlockEstimatorTest, FindFlatBlocks) {
+  const int kBlockSize = 32;
+  aom_flat_block_finder_t flat_block_finder;
+  ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
+                                          this->kBitDepth, this->kUseHighBD));
+
+  const int num_blocks_w = 8;
+  const int h = kBlockSize;
+  const int w = kBlockSize * num_blocks_w;
+  const int stride = w;
+  this->data_.resize(h * stride, 128);
+  std::vector<uint8_t> flat_blocks(num_blocks_w, 0);
+
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      // Block 0 (not flat): constant doesn't have enough variance to qualify
+      this->data_[y * stride + x + 0 * kBlockSize] = 128 << shift;
+
+      // Block 1 (not flat): too high of variance is hard to validate as flat
+      this->data_[y * stride + x + 1 * kBlockSize] =
+          ((uint8_t)(128 + randn(&this->random_, 5))) << shift;
+
+      // Block 2 (flat): slight checkerboard added to constant
+      const int check = (x % 2 + y % 2) % 2 ? -2 : 2;
+      this->data_[y * stride + x + 2 * kBlockSize] = (128 + check) << shift;
+
+      // Block 3 (flat): planar block with checkerboard pattern is also flat
+      this->data_[y * stride + x + 3 * kBlockSize] =
+          (y * 2 - x / 2 + 128 + check) << shift;
+
+      // Block 4 (flat): gaussian random with standard deviation 1.
+      this->data_[y * stride + x + 4 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 1) + x + 128.0)) << shift;
+
+      // Block 5 (flat): gaussian random with standard deviation 2.
+      this->data_[y * stride + x + 5 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 2) + y + 128.0)) << shift;
+
+      // Block 6 (not flat): too high of directional gradient.
+      const int strong_edge = x > kBlockSize / 2 ? 64 : 0;
+      this->data_[y * stride + x + 6 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 1) + strong_edge + 128.0)) << shift;
+
+      // Block 7 (not flat): too high gradient.
+      const int big_check = ((x >> 2) % 2 + (y >> 2) % 2) % 2 ? -16 : 16;
+      this->data_[y * stride + x + 7 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 1) + big_check + 128.0)) << shift;
+    }
+  }
+
+  EXPECT_EQ(4, aom_flat_block_finder_run(&flat_block_finder,
+                                         (uint8_t *)&this->data_[0], w, h,
+                                         stride, &flat_blocks[0]));
+
+  // First two blocks are not flat
+  EXPECT_EQ(0, flat_blocks[0]);
+  EXPECT_EQ(0, flat_blocks[1]);
+
+  // Next 4 blocks are flat.
+  EXPECT_EQ(255, flat_blocks[2]);
+  EXPECT_EQ(255, flat_blocks[3]);
+  EXPECT_EQ(255, flat_blocks[4]);
+  EXPECT_EQ(255, flat_blocks[5]);
+
+  // Last 2 are not flat by threshold
+  EXPECT_EQ(0, flat_blocks[6]);
+  EXPECT_EQ(0, flat_blocks[7]);
+
+  // Add the noise from non-flat block 1 to every block.
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize * num_blocks_w; ++x) {
+      this->data_[y * stride + x] +=
+          (this->data_[y * stride + x % kBlockSize + kBlockSize] -
+           (128 << shift));
+    }
+  }
+  // Now the scored selection will pick the one that is most likely flat (block
+  // 0)
+  EXPECT_EQ(1, aom_flat_block_finder_run(&flat_block_finder,
+                                         (uint8_t *)&this->data_[0], w, h,
+                                         stride, &flat_blocks[0]));
+  EXPECT_EQ(1, flat_blocks[0]);
+  EXPECT_EQ(0, flat_blocks[1]);
+  EXPECT_EQ(0, flat_blocks[2]);
+  EXPECT_EQ(0, flat_blocks[3]);
+  EXPECT_EQ(0, flat_blocks[4]);
+  EXPECT_EQ(0, flat_blocks[5]);
+  EXPECT_EQ(0, flat_blocks[6]);
+  EXPECT_EQ(0, flat_blocks[7]);
+
+  aom_flat_block_finder_free(&flat_block_finder);
+}
+
+REGISTER_TYPED_TEST_CASE_P(FlatBlockEstimatorTest, ExtractBlock,
+                           FindFlatBlocks);
+
+typedef ::testing::Types<BitDepthParams<uint8_t, 8, false>,   // lowbd
+                         BitDepthParams<uint16_t, 8, true>,   // lowbd in 16-bit
+                         BitDepthParams<uint16_t, 10, true>,  // highbd data
+                         BitDepthParams<uint16_t, 12, true> >
+    AllBitDepthParams;
+INSTANTIATE_TYPED_TEST_CASE_P(FlatBlockInstatiation, FlatBlockEstimatorTest,
+                              AllBitDepthParams);
+
+template <typename T>
+class NoiseModelUpdateTest : public ::testing::Test, public T {
+ public:
+  static const int kWidth = 128;
+  static const int kHeight = 128;
+  static const int kBlockSize = 16;
+  static const int kNumBlocksX = kWidth / kBlockSize;
+  static const int kNumBlocksY = kHeight / kBlockSize;
+
+  virtual void SetUp() {
+    const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+                                              T::kBitDepth, T::kUseHighBD };
+    ASSERT_TRUE(aom_noise_model_init(&model_, params));
+
+    random_.Reset(100171);
+
+    data_.resize(kWidth * kHeight * 3);
+    denoised_.resize(kWidth * kHeight * 3);
+    noise_.resize(kWidth * kHeight * 3);
+    renoise_.resize(kWidth * kHeight);
+    flat_blocks_.resize(kNumBlocksX * kNumBlocksY);
+
+    for (int c = 0, offset = 0; c < 3; ++c, offset += kWidth * kHeight) {
+      data_ptr_[c] = &data_[offset];
+      noise_ptr_[c] = &noise_[offset];
+      denoised_ptr_[c] = &denoised_[offset];
+      strides_[c] = kWidth;
+
+      data_ptr_raw_[c] = (uint8_t *)&data_[offset];
+      denoised_ptr_raw_[c] = (uint8_t *)&denoised_[offset];
+    }
+    chroma_sub_[0] = 0;
+    chroma_sub_[1] = 0;
+  }
+
+  int NoiseModelUpdate(int block_size = kBlockSize) {
+    return aom_noise_model_update(&model_, data_ptr_raw_, denoised_ptr_raw_,
+                                  kWidth, kHeight, strides_, chroma_sub_,
+                                  &flat_blocks_[0], block_size);
+  }
+
+  void TearDown() { aom_noise_model_free(&model_); }
+
+ protected:
+  aom_noise_model_t model_;
+  std::vector<typename T::data_type_t> data_;
+  std::vector<typename T::data_type_t> denoised_;
+
+  std::vector<double> noise_;
+  std::vector<double> renoise_;
+  std::vector<uint8_t> flat_blocks_;
+
+  typename T::data_type_t *data_ptr_[3];
+  typename T::data_type_t *denoised_ptr_[3];
+
+  double *noise_ptr_[3];
+  int strides_[3];
+  int chroma_sub_[2];
+  libaom_test::ACMRandom random_;
+
+ private:
+  uint8_t *data_ptr_raw_[3];
+  uint8_t *denoised_ptr_raw_[3];
+};
+
+TYPED_TEST_CASE_P(NoiseModelUpdateTest);
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks) {
+  EXPECT_EQ(AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
+            this->NoiseModelUpdate());
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForZeroNoiseAllFlat) {
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  this->denoised_.assign(this->denoised_.size(), 128);
+  this->data_.assign(this->denoised_.size(), 128);
+  EXPECT_EQ(AOM_NOISE_STATUS_INTERNAL_ERROR, this->NoiseModelUpdate());
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateFailsBlockSizeTooSmall) {
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  this->denoised_.assign(this->denoised_.size(), 128);
+  this->data_.assign(this->denoised_.size(), 128);
+  EXPECT_EQ(AOM_NOISE_STATUS_INVALID_ARGUMENT,
+            this->NoiseModelUpdate(6 /* block_size=6 is too small*/));
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForWhiteRandomNoise) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kHeight; ++y) {
+    for (int x = 0; x < kWidth; ++x) {
+      this->data_ptr_[0][y * kWidth + x] =
+          int(64 + y + randn(&this->random_, 1)) << shift;
+      this->denoised_ptr_[0][y * kWidth + x] = (64 + y) << shift;
+      // Make the chroma planes completely correlated with the Y plane
+      for (int c = 1; c < 3; ++c) {
+        this->data_ptr_[c][y * kWidth + x] = this->data_ptr_[0][y * kWidth + x];
+        this->denoised_ptr_[c][y * kWidth + x] =
+            this->denoised_ptr_[0][y * kWidth + x];
+      }
+    }
+  }
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const double kCoeffEps = 0.075;
+  const int n = model.n;
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < n; ++i) {
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[i], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[i], kCoeffEps);
+    }
+    // The second and third channels are highly correlated with the first.
+    if (c > 0) {
+      ASSERT_EQ(n + 1, model.latest_state[c].eqns.n);
+      ASSERT_EQ(n + 1, model.combined_state[c].eqns.n);
+
+      EXPECT_NEAR(1, model.latest_state[c].eqns.x[n], kCoeffEps);
+      EXPECT_NEAR(1, model.combined_state[c].eqns.x[n], kCoeffEps);
+    }
+  }
+
+  // The fitted noise strength should be close to the standard deviation
+  // for all intensity bins.
+  const double kStdEps = 0.1;
+  const double normalize = 1 << shift;
+
+  for (int i = 0; i < model.latest_state[0].strength_solver.eqns.n; ++i) {
+    EXPECT_NEAR(1.0,
+                model.latest_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+    EXPECT_NEAR(1.0,
+                model.combined_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+  }
+
+  aom_noise_strength_lut_t lut;
+  aom_noise_strength_solver_fit_piecewise(
+      &model.latest_state[0].strength_solver, -1, &lut);
+  ASSERT_EQ(2, lut.num_points);
+  EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+  EXPECT_NEAR(1.0, lut.points[0][1] / normalize, kStdEps);
+  EXPECT_NEAR((1 << this->kBitDepth) - 1, lut.points[1][0], 1e-5);
+  EXPECT_NEAR(1.0, lut.points[1][1] / normalize, kStdEps);
+  aom_noise_strength_lut_free(&lut);
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForScaledWhiteNoise) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+
+  const double kCoeffEps = 0.055;
+  const double kLowStd = 1;
+  const double kHighStd = 4;
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kHeight; ++y) {
+    for (int x = 0; x < kWidth; ++x) {
+      for (int c = 0; c < 3; ++c) {
+        // The image data is bimodal:
+        // Bottom half has low intensity and low noise strength
+        // Top half has high intensity and high noise strength
+        const int avg = (y < kHeight / 2) ? 4 : 245;
+        const double std = (y < kHeight / 2) ? kLowStd : kHighStd;
+        this->data_ptr_[c][y * kWidth + x] =
+            ((uint8_t)std::min((int)255,
+                               (int)(2 + avg + randn(&this->random_, std))))
+            << shift;
+        this->denoised_ptr_[c][y * kWidth + x] = (2 + avg) << shift;
+      }
+    }
+  }
+  // Label all blocks as flat for the update
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const int n = model.n;
+  // The noise is uncorrelated spatially and with the y channel.
+  // All coefficients should be reasonably close to zero.
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < n; ++i) {
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[i], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[i], kCoeffEps);
+    }
+    if (c > 0) {
+      ASSERT_EQ(n + 1, model.latest_state[c].eqns.n);
+      ASSERT_EQ(n + 1, model.combined_state[c].eqns.n);
+
+      // The correlation to the y channel should be low (near zero)
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[n], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[n], kCoeffEps);
+    }
+  }
+
+  // Noise strength should vary between kLowStd and kHighStd.
+  const double kStdEps = 0.15;
+  // We have to normalize fitted standard deviation based on bit depth.
+  const double normalize = (1 << shift);
+
+  ASSERT_EQ(20, model.latest_state[0].strength_solver.eqns.n);
+  for (int i = 0; i < model.latest_state[0].strength_solver.eqns.n; ++i) {
+    const double a = i / 19.0;
+    const double expected = (kLowStd * (1.0 - a) + kHighStd * a);
+    EXPECT_NEAR(expected,
+                model.latest_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+    EXPECT_NEAR(expected,
+                model.combined_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+  }
+
+  // If we fit a piecewise linear model, there should be two points:
+  // one near kLowStd at 0, and the other near kHighStd and 255.
+  aom_noise_strength_lut_t lut;
+  aom_noise_strength_solver_fit_piecewise(
+      &model.latest_state[0].strength_solver, 2, &lut);
+  ASSERT_EQ(2, lut.num_points);
+  EXPECT_NEAR(0, lut.points[0][0], 1e-4);
+  EXPECT_NEAR(kLowStd, lut.points[0][1] / normalize, kStdEps);
+  EXPECT_NEAR((1 << this->kBitDepth) - 1, lut.points[1][0], 1e-5);
+  EXPECT_NEAR(kHighStd, lut.points[1][1] / normalize, kStdEps);
+  aom_noise_strength_lut_free(&lut);
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForCorrelatedNoise) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const int kNumCoeffs = 24;
+  const double kStd = 4;
+  const double kStdEps = 0.3;
+  const double kCoeffEps = 0.065;
+  // Use different coefficients for each channel
+  const double kCoeffs[3][24] = {
+    { 0.02884, -0.03356, 0.00633,  0.01757,  0.02849,  -0.04620,
+      0.02833, -0.07178, 0.07076,  -0.11603, -0.10413, -0.16571,
+      0.05158, -0.07969, 0.02640,  -0.07191, 0.02530,  0.41968,
+      0.21450, -0.00702, -0.01401, -0.03676, -0.08713, 0.44196 },
+    { 0.00269, -0.01291, -0.01513, 0.07234,  0.03208,   0.00477,
+      0.00226, -0.00254, 0.03533,  0.12841,  -0.25970,  -0.06336,
+      0.05238, -0.00845, -0.03118, 0.09043,  -0.36558,  0.48903,
+      0.00595, -0.11938, 0.02106,  0.095956, -0.350139, 0.59305 },
+    { -0.00643, -0.01080, -0.01466, 0.06951, 0.03707,  -0.00482,
+      0.00817,  -0.00909, 0.02949,  0.12181, -0.25210, -0.07886,
+      0.06083,  -0.01210, -0.03108, 0.08944, -0.35875, 0.49150,
+      0.00415,  -0.12905, 0.02870,  0.09740, -0.34610, 0.58824 },
+  };
+
+  ASSERT_EQ(model.n, kNumCoeffs);
+  this->chroma_sub_[0] = this->chroma_sub_[1] = 1;
+
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+
+  // Add different noise onto each plane
+  const int shift = this->kBitDepth - 8;
+  for (int c = 0; c < 3; ++c) {
+    noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+                kCoeffs[c], this->noise_ptr_[c], kWidth, kHeight);
+    const int x_shift = c > 0 ? this->chroma_sub_[0] : 0;
+    const int y_shift = c > 0 ? this->chroma_sub_[1] : 0;
+    for (int y = 0; y < (kHeight >> y_shift); ++y) {
+      for (int x = 0; x < (kWidth >> x_shift); ++x) {
+        const uint8_t value = 64 + x / 2 + y / 4;
+        this->data_ptr_[c][y * kWidth + x] =
+            (uint8_t(value + this->noise_ptr_[c][y * kWidth + x] * kStd))
+            << shift;
+        this->denoised_ptr_[c][y * kWidth + x] = value << shift;
+      }
+    }
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  // For the Y plane, the solved coefficients should be close to the original
+  const int n = model.n;
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < n; ++i) {
+      EXPECT_NEAR(kCoeffs[c][i], model.latest_state[c].eqns.x[i], kCoeffEps);
+      EXPECT_NEAR(kCoeffs[c][i], model.combined_state[c].eqns.x[i], kCoeffEps);
+    }
+    // The chroma planes should be uncorrelated with the luma plane
+    if (c > 0) {
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[n], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[n], kCoeffEps);
+    }
+    // Correlation between the coefficient vector and the fitted coefficients
+    // should be close to 1.
+    EXPECT_LT(0.98, aom_normalized_cross_correlation(
+                        model.latest_state[c].eqns.x, kCoeffs[c], kNumCoeffs));
+
+    noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+                model.latest_state[c].eqns.x, &this->renoise_[0], kWidth,
+                kHeight);
+
+    EXPECT_TRUE(aom_noise_data_validate(&this->renoise_[0], kWidth, kHeight));
+  }
+
+  // Check fitted noise strength
+  const double normalize = 1 << shift;
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < model.latest_state[c].strength_solver.eqns.n; ++i) {
+      EXPECT_NEAR(kStd,
+                  model.latest_state[c].strength_solver.eqns.x[i] / normalize,
+                  kStdEps);
+    }
+  }
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest,
+             NoiseStrengthChangeSignalsDifferentNoiseType) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const int kBlockSize = this->kBlockSize;
+  // Create a gradient image with std = 2 uncorrelated noise
+  const double kStd = 2;
+  const int shift = this->kBitDepth - 8;
+
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 192;
+    for (int c = 0; c < 3; ++c) {
+      this->noise_ptr_[c][i] = randn(&this->random_, 1);
+      this->data_ptr_[c][i] = ((uint8_t)(this->noise_ptr_[c][i] * kStd + val))
+                              << shift;
+      this->denoised_ptr_[c][i] = val << shift;
+    }
+  }
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const int kNumBlocks = kWidth * kHeight / kBlockSize / kBlockSize;
+  EXPECT_EQ(kNumBlocks, model.latest_state[0].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.latest_state[1].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.latest_state[2].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.combined_state[0].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.combined_state[1].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.combined_state[2].strength_solver.num_equations);
+
+  // Bump up noise by an insignificant amount
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 192;
+    this->data_ptr_[0][i] =
+        ((uint8_t)(this->noise_ptr_[0][i] * (kStd + 0.085) + val)) << shift;
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const double kARGainTolerance = 0.02;
+  for (int c = 0; c < 3; ++c) {
+    EXPECT_EQ(kNumBlocks, model.latest_state[c].strength_solver.num_equations);
+    EXPECT_EQ(15250, model.latest_state[c].num_observations);
+    EXPECT_NEAR(1, model.latest_state[c].ar_gain, kARGainTolerance);
+
+    EXPECT_EQ(2 * kNumBlocks,
+              model.combined_state[c].strength_solver.num_equations);
+    EXPECT_EQ(2 * 15250, model.combined_state[c].num_observations);
+    EXPECT_NEAR(1, model.combined_state[c].ar_gain, kARGainTolerance);
+  }
+
+  // Bump up the noise strength on half the image for one channel by a
+  // significant amount.
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 128;
+    if (i % kWidth < kWidth / 2) {
+      this->data_ptr_[0][i] =
+          ((uint8_t)(randn(&this->random_, kStd + 0.5) + val)) << shift;
+    }
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, this->NoiseModelUpdate());
+
+  // Since we didn't update the combined state, it should still be at 2 *
+  // num_blocks
+  EXPECT_EQ(kNumBlocks, model.latest_state[0].strength_solver.num_equations);
+  EXPECT_EQ(2 * kNumBlocks,
+            model.combined_state[0].strength_solver.num_equations);
+
+  // In normal operation, the "latest" estimate can be saved to the "combined"
+  // state for continued updates.
+  aom_noise_model_save_latest(&model);
+  for (int c = 0; c < 3; ++c) {
+    EXPECT_EQ(kNumBlocks, model.latest_state[c].strength_solver.num_equations);
+    EXPECT_EQ(15250, model.latest_state[c].num_observations);
+    EXPECT_NEAR(1, model.latest_state[c].ar_gain, kARGainTolerance);
+
+    EXPECT_EQ(kNumBlocks,
+              model.combined_state[c].strength_solver.num_equations);
+    EXPECT_EQ(15250, model.combined_state[c].num_observations);
+    EXPECT_NEAR(1, model.combined_state[c].ar_gain, kARGainTolerance);
+  }
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, NoiseCoeffsSignalsDifferentNoiseType) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const double kCoeffs[2][24] = {
+    { 0.02884, -0.03356, 0.00633,  0.01757,  0.02849,  -0.04620,
+      0.02833, -0.07178, 0.07076,  -0.11603, -0.10413, -0.16571,
+      0.05158, -0.07969, 0.02640,  -0.07191, 0.02530,  0.41968,
+      0.21450, -0.00702, -0.01401, -0.03676, -0.08713, 0.44196 },
+    { 0.00269, -0.01291, -0.01513, 0.07234,  0.03208,   0.00477,
+      0.00226, -0.00254, 0.03533,  0.12841,  -0.25970,  -0.06336,
+      0.05238, -0.00845, -0.03118, 0.09043,  -0.36558,  0.48903,
+      0.00595, -0.11938, 0.02106,  0.095956, -0.350139, 0.59305 }
+  };
+
+  noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+              kCoeffs[0], this->noise_ptr_[0], kWidth, kHeight);
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    this->data_ptr_[0][i] = (uint8_t)(128 + this->noise_ptr_[0][i]);
+  }
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  // Now try with the second set of AR coefficients
+  noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+              kCoeffs[1], this->noise_ptr_[0], kWidth, kHeight);
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    this->data_ptr_[0][i] = (uint8_t)(128 + this->noise_ptr_[0][i]);
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, this->NoiseModelUpdate());
+}
+REGISTER_TYPED_TEST_CASE_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks,
+                           UpdateSuccessForZeroNoiseAllFlat,
+                           UpdateFailsBlockSizeTooSmall,
+                           UpdateSuccessForWhiteRandomNoise,
+                           UpdateSuccessForScaledWhiteNoise,
+                           UpdateSuccessForCorrelatedNoise,
+                           NoiseStrengthChangeSignalsDifferentNoiseType,
+                           NoiseCoeffsSignalsDifferentNoiseType);
+
+INSTANTIATE_TYPED_TEST_CASE_P(NoiseModelUpdateTestInstatiation,
+                              NoiseModelUpdateTest, AllBitDepthParams);
+
+TEST(NoiseModelGetGrainParameters, TestLagSize) {
+  aom_film_grain_t film_grain;
+  for (int lag = 1; lag <= 3; ++lag) {
+    aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+    aom_noise_model_t model;
+    EXPECT_TRUE(aom_noise_model_init(&model, params));
+    EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+    EXPECT_EQ(lag, film_grain.ar_coeff_lag);
+    aom_noise_model_free(&model);
+  }
+
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 4, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+  EXPECT_FALSE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModelGetGrainParameters, TestARCoeffShiftBounds) {
+  struct TestCase {
+    double max_input_value;
+    int expected_ar_coeff_shift;
+    int expected_value;
+  };
+  const int lag = 1;
+  const int kNumTestCases = 19;
+  const TestCase test_cases[] = {
+    // Test cases for ar_coeff_shift = 9
+    { 0, 9, 0 },
+    { 0.125, 9, 64 },
+    { -0.125, 9, -64 },
+    { 0.2499, 9, 127 },
+    { -0.25, 9, -128 },
+    // Test cases for ar_coeff_shift = 8
+    { 0.25, 8, 64 },
+    { -0.2501, 8, -64 },
+    { 0.499, 8, 127 },
+    { -0.5, 8, -128 },
+    // Test cases for ar_coeff_shift = 7
+    { 0.5, 7, 64 },
+    { -0.5001, 7, -64 },
+    { 0.999, 7, 127 },
+    { -1, 7, -128 },
+    // Test cases for ar_coeff_shift = 6
+    { 1.0, 6, 64 },
+    { -1.0001, 6, -64 },
+    { 2.0, 6, 127 },
+    { -2.0, 6, -128 },
+    { 4, 6, 127 },
+    { -4, 6, -128 },
+  };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  for (int i = 0; i < kNumTestCases; ++i) {
+    const TestCase &test_case = test_cases[i];
+    model.combined_state[0].eqns.x[0] = test_case.max_input_value;
+
+    aom_film_grain_t film_grain;
+    EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+    EXPECT_EQ(1, film_grain.ar_coeff_lag);
+    EXPECT_EQ(test_case.expected_ar_coeff_shift, film_grain.ar_coeff_shift);
+    EXPECT_EQ(test_case.expected_value, film_grain.ar_coeffs_y[0]);
+  }
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModelGetGrainParameters, TestNoiseStrengthShiftBounds) {
+  struct TestCase {
+    double max_input_value;
+    int expected_scaling_shift;
+    int expected_value;
+  };
+  const int kNumTestCases = 10;
+  const TestCase test_cases[] = {
+    { 0, 11, 0 },      { 1, 11, 64 },     { 2, 11, 128 }, { 3.99, 11, 255 },
+    { 4, 10, 128 },    { 7.99, 10, 255 }, { 8, 9, 128 },  { 16, 8, 128 },
+    { 31.99, 8, 255 }, { 64, 8, 255 },  // clipped
+  };
+  const int lag = 1;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  for (int i = 0; i < kNumTestCases; ++i) {
+    const TestCase &test_case = test_cases[i];
+    aom_equation_system_t &eqns = model.combined_state[0].strength_solver.eqns;
+    // Set the fitted scale parameters to be a constant value.
+    for (int j = 0; j < eqns.n; ++j) {
+      eqns.x[j] = test_case.max_input_value;
+    }
+    aom_film_grain_t film_grain;
+    EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+    // We expect a single constant segemnt
+    EXPECT_EQ(test_case.expected_scaling_shift, film_grain.scaling_shift);
+    EXPECT_EQ(test_case.expected_value, film_grain.scaling_points_y[0][1]);
+    EXPECT_EQ(test_case.expected_value, film_grain.scaling_points_y[1][1]);
+  }
+  aom_noise_model_free(&model);
+}
+
+// The AR coefficients are the same inputs used to generate "Test 2" in the test
+// vectors
+TEST(NoiseModelGetGrainParameters, GetGrainParametersReal) {
+  const double kInputCoeffsY[] = { 0.0315,  0.0073,  0.0218,  0.00235, 0.00511,
+                                   -0.0222, 0.0627,  -0.022,  0.05575, -0.1816,
+                                   0.0107,  -0.1966, 0.00065, -0.0809, 0.04934,
+                                   -0.1349, -0.0352, 0.41772, 0.27973, 0.04207,
+                                   -0.0429, -0.1372, 0.06193, 0.52032 };
+  const double kInputCoeffsCB[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,
+                                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5 };
+  const double kInputCoeffsCR[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0,
+                                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5 };
+  const int kExpectedARCoeffsY[] = { 4,  1,   3,  0,   1,  -3,  8, -3,
+                                     7,  -23, 1,  -25, 0,  -10, 6, -17,
+                                     -5, 53,  36, 5,   -5, -18, 8, 67 };
+  const int kExpectedARCoeffsCB[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84 };
+  const int kExpectedARCoeffsCR[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   0,
+                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -126 };
+  // Scaling function is initialized analytically with a sqrt function.
+  const int kNumScalingPointsY = 12;
+  const int kExpectedScalingPointsY[][2] = {
+    { 0, 0 },     { 13, 44 },   { 27, 62 },   { 40, 76 },
+    { 54, 88 },   { 67, 98 },   { 94, 117 },  { 121, 132 },
+    { 148, 146 }, { 174, 159 }, { 201, 171 }, { 255, 192 },
+  };
+
+  const int lag = 3;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  // Setup the AR coeffs
+  memcpy(model.combined_state[0].eqns.x, kInputCoeffsY, sizeof(kInputCoeffsY));
+  memcpy(model.combined_state[1].eqns.x, kInputCoeffsCB,
+         sizeof(kInputCoeffsCB));
+  memcpy(model.combined_state[2].eqns.x, kInputCoeffsCR,
+         sizeof(kInputCoeffsCR));
+  for (int i = 0; i < model.combined_state[0].strength_solver.num_bins; ++i) {
+    const double x =
+        ((double)i) / (model.combined_state[0].strength_solver.num_bins - 1.0);
+    model.combined_state[0].strength_solver.eqns.x[i] = 6 * sqrt(x);
+    model.combined_state[1].strength_solver.eqns.x[i] = 3;
+    model.combined_state[2].strength_solver.eqns.x[i] = 2;
+
+    // Inject some observations into the strength solver, as during film grain
+    // parameter extraction an estimate of the average strength will be used to
+    // adjust correlation.
+    const int n = model.combined_state[0].strength_solver.num_bins;
+    for (int j = 0; j < model.combined_state[0].strength_solver.num_bins; ++j) {
+      model.combined_state[0].strength_solver.eqns.A[i * n + j] = 1;
+      model.combined_state[1].strength_solver.eqns.A[i * n + j] = 1;
+      model.combined_state[2].strength_solver.eqns.A[i * n + j] = 1;
+    }
+  }
+
+  aom_film_grain_t film_grain;
+  EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+  EXPECT_EQ(lag, film_grain.ar_coeff_lag);
+  EXPECT_EQ(3, film_grain.ar_coeff_lag);
+  EXPECT_EQ(7, film_grain.ar_coeff_shift);
+  EXPECT_EQ(10, film_grain.scaling_shift);
+  EXPECT_EQ(kNumScalingPointsY, film_grain.num_y_points);
+  EXPECT_EQ(1, film_grain.update_parameters);
+  EXPECT_EQ(1, film_grain.apply_grain);
+
+  const int kNumARCoeffs = 24;
+  for (int i = 0; i < kNumARCoeffs; ++i) {
+    EXPECT_EQ(kExpectedARCoeffsY[i], film_grain.ar_coeffs_y[i]);
+  }
+  for (int i = 0; i < kNumARCoeffs + 1; ++i) {
+    EXPECT_EQ(kExpectedARCoeffsCB[i], film_grain.ar_coeffs_cb[i]);
+  }
+  for (int i = 0; i < kNumARCoeffs + 1; ++i) {
+    EXPECT_EQ(kExpectedARCoeffsCR[i], film_grain.ar_coeffs_cr[i]);
+  }
+  for (int i = 0; i < kNumScalingPointsY; ++i) {
+    EXPECT_EQ(kExpectedScalingPointsY[i][0], film_grain.scaling_points_y[i][0]);
+    EXPECT_EQ(kExpectedScalingPointsY[i][1], film_grain.scaling_points_y[i][1]);
+  }
+
+  // CB strength should just be a piecewise segment
+  EXPECT_EQ(2, film_grain.num_cb_points);
+  EXPECT_EQ(0, film_grain.scaling_points_cb[0][0]);
+  EXPECT_EQ(255, film_grain.scaling_points_cb[1][0]);
+  EXPECT_EQ(96, film_grain.scaling_points_cb[0][1]);
+  EXPECT_EQ(96, film_grain.scaling_points_cb[1][1]);
+
+  // CR strength should just be a piecewise segment
+  EXPECT_EQ(2, film_grain.num_cr_points);
+  EXPECT_EQ(0, film_grain.scaling_points_cr[0][0]);
+  EXPECT_EQ(255, film_grain.scaling_points_cr[1][0]);
+  EXPECT_EQ(64, film_grain.scaling_points_cr[0][1]);
+  EXPECT_EQ(64, film_grain.scaling_points_cr[1][1]);
+
+  EXPECT_EQ(128, film_grain.cb_mult);
+  EXPECT_EQ(192, film_grain.cb_luma_mult);
+  EXPECT_EQ(256, film_grain.cb_offset);
+  EXPECT_EQ(128, film_grain.cr_mult);
+  EXPECT_EQ(192, film_grain.cr_luma_mult);
+  EXPECT_EQ(256, film_grain.cr_offset);
+  EXPECT_EQ(0, film_grain.chroma_scaling_from_luma);
+  EXPECT_EQ(0, film_grain.grain_scale_shift);
+
+  aom_noise_model_free(&model);
+}
+
+template <typename T>
+class WienerDenoiseTest : public ::testing::Test, public T {
+ public:
+  static void SetUpTestCase() { aom_dsp_rtcd(); }
+
+ protected:
+  void SetUp() {
+    static const float kNoiseLevel = 5.f;
+    static const float kStd = 4.0;
+    static const double kMaxValue = (1 << T::kBitDepth) - 1;
+
+    chroma_sub_[0] = 1;
+    chroma_sub_[1] = 1;
+    stride_[0] = kWidth;
+    stride_[1] = kWidth / 2;
+    stride_[2] = kWidth / 2;
+    for (int k = 0; k < 3; ++k) {
+      data_[k].resize(kWidth * kHeight);
+      denoised_[k].resize(kWidth * kHeight);
+      noise_psd_[k].resize(kBlockSize * kBlockSize);
+    }
+
+    const double kCoeffsY[] = { 0.0406, -0.116, -0.078, -0.152, 0.0033, -0.093,
+                                0.048,  0.404,  0.2353, -0.035, -0.093, 0.441 };
+    const int kCoords[12][2] = {
+      { -2, -2 }, { -1, -2 }, { 0, -2 }, { 1, -2 }, { 2, -2 }, { -2, -1 },
+      { -1, -1 }, { 0, -1 },  { 1, -1 }, { 2, -1 }, { -2, 0 }, { -1, 0 }
+    };
+    const int kLag = 2;
+    const int kLength = 12;
+    libaom_test::ACMRandom random;
+    std::vector<double> noise(kWidth * kHeight);
+    noise_synth(&random, kLag, kLength, kCoords, kCoeffsY, &noise[0], kWidth,
+                kHeight);
+    noise_psd_[0] = get_noise_psd(&noise[0], kWidth, kHeight, kBlockSize);
+    for (int i = 0; i < kBlockSize * kBlockSize; ++i) {
+      noise_psd_[0][i] = (float)(noise_psd_[0][i] * kStd * kStd * kScaleNoise *
+                                 kScaleNoise / (kMaxValue * kMaxValue));
+    }
+
+    float psd_value =
+        aom_noise_psd_get_default_value(kBlockSizeChroma, kNoiseLevel);
+    for (int i = 0; i < kBlockSizeChroma * kBlockSizeChroma; ++i) {
+      noise_psd_[1][i] = psd_value;
+      noise_psd_[2][i] = psd_value;
+    }
+    for (int y = 0; y < kHeight; ++y) {
+      for (int x = 0; x < kWidth; ++x) {
+        data_[0][y * stride_[0] + x] = (typename T::data_type_t)fclamp(
+            (x + noise[y * stride_[0] + x] * kStd) * kScaleNoise, 0, kMaxValue);
+      }
+    }
+
+    for (int c = 1; c < 3; ++c) {
+      for (int y = 0; y < (kHeight >> 1); ++y) {
+        for (int x = 0; x < (kWidth >> 1); ++x) {
+          data_[c][y * stride_[c] + x] = (typename T::data_type_t)fclamp(
+              (x + randn(&random, kStd)) * kScaleNoise, 0, kMaxValue);
+        }
+      }
+    }
+    for (int k = 0; k < 3; ++k) {
+      noise_psd_ptrs_[k] = &noise_psd_[k][0];
+    }
+  }
+  static const int kBlockSize = 32;
+  static const int kBlockSizeChroma = 16;
+  static const int kWidth = 256;
+  static const int kHeight = 256;
+  static const int kScaleNoise = 1 << (T::kBitDepth - 8);
+
+  std::vector<typename T::data_type_t> data_[3];
+  std::vector<typename T::data_type_t> denoised_[3];
+  std::vector<float> noise_psd_[3];
+  int chroma_sub_[2];
+  float *noise_psd_ptrs_[3];
+  int stride_[3];
+};
+
+TYPED_TEST_CASE_P(WienerDenoiseTest);
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidBlockSize) {
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     18, this->kBitDepth, this->kUseHighBD));
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     48, this->kBitDepth, this->kUseHighBD));
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     64, this->kBitDepth, this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidChromaSubsampling) {
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  int chroma_sub[2] = { 1, 0 };
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_, chroma_sub,
+                                     this->noise_psd_ptrs_, 32, this->kBitDepth,
+                                     this->kUseHighBD));
+
+  chroma_sub[0] = 0;
+  chroma_sub[1] = 1;
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_, chroma_sub,
+                                     this->noise_psd_ptrs_, 32, this->kBitDepth,
+                                     this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, GradientTest) {
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const int kBlockSize = this->kBlockSize;
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  const int ret = aom_wiener_denoise_2d(
+      data_ptrs, denoised_ptrs, kWidth, kHeight, this->stride_,
+      this->chroma_sub_, this->noise_psd_ptrs_, this->kBlockSize,
+      this->kBitDepth, this->kUseHighBD);
+  EXPECT_EQ(1, ret);
+
+  // Check the noise on the denoised image (from the analytical gradient)
+  // and make sure that it is less than what we added.
+  for (int c = 0; c < 3; ++c) {
+    std::vector<double> measured_noise(kWidth * kHeight);
+
+    double var = 0;
+    const int shift = (c > 0);
+    for (int x = 0; x < (kWidth >> shift); ++x) {
+      for (int y = 0; y < (kHeight >> shift); ++y) {
+        const double diff = this->denoised_[c][y * this->stride_[c] + x] -
+                            x * this->kScaleNoise;
+        var += diff * diff;
+        measured_noise[y * kWidth + x] = diff;
+      }
+    }
+    var /= (kWidth * kHeight);
+    const double std = sqrt(std::max(0.0, var));
+    EXPECT_LE(std, 1.25f * this->kScaleNoise);
+    if (c == 0) {
+      std::vector<float> measured_psd =
+          get_noise_psd(&measured_noise[0], kWidth, kHeight, kBlockSize);
+      std::vector<double> measured_psd_d(kBlockSize * kBlockSize);
+      std::vector<double> noise_psd_d(kBlockSize * kBlockSize);
+      std::copy(measured_psd.begin(), measured_psd.end(),
+                measured_psd_d.begin());
+      std::copy(this->noise_psd_[0].begin(), this->noise_psd_[0].end(),
+                noise_psd_d.begin());
+      EXPECT_LT(
+          aom_normalized_cross_correlation(&measured_psd_d[0], &noise_psd_d[0],
+                                           (int)(noise_psd_d.size())),
+          0.35);
+    }
+  }
+}
+
+REGISTER_TYPED_TEST_CASE_P(WienerDenoiseTest, InvalidBlockSize,
+                           InvalidChromaSubsampling, GradientTest);
+
+INSTANTIATE_TYPED_TEST_CASE_P(WienerDenoiseTestInstatiation, WienerDenoiseTest,
+                              AllBitDepthParams);
diff --git a/third_party/aom/test/obmc_sad_test.cc b/third_party/aom/test/obmc_sad_test.cc
index 219c5d810..1820da266 100644
--- a/third_party/aom/test/obmc_sad_test.cc
+++ b/third_party/aom/test/obmc_sad_test.cc
@@ -14,8 +14,9 @@
 #include "test/function_equivalence_test.h"
 #include "test/register_state_check.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
@@ -84,13 +85,10 @@ TEST_P(ObmcSadTest, ExtremeValues) {
 }
 
 #if HAVE_SSE4_1
-#if CONFIG_MOTION_VAR
 const ObmcSadTest::ParamType sse4_functions[] = {
-#if CONFIG_EXT_PARTITION
   TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_sse4_1),
   TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_sse4_1),
   TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_sse4_1),
-#endif  // CONFIG_EXT_PARTITION
   TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_sse4_1),
   TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_sse4_1),
   TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_sse4_1),
@@ -108,14 +106,12 @@ const ObmcSadTest::ParamType sse4_functions[] = {
 
 INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcSadTest,
                         ::testing::ValuesIn(sse4_functions));
-#endif  // CONFIG_MOTION_VAR
 #endif  // HAVE_SSE4_1
 
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
-#if CONFIG_HIGHBITDEPTH
 class ObmcSadHBDTest : public FunctionEquivalenceTest<ObmcSadF> {};
 
 TEST_P(ObmcSadHBDTest, RandomValues) {
@@ -169,13 +165,10 @@ TEST_P(ObmcSadHBDTest, ExtremeValues) {
 }
 
 #if HAVE_SSE4_1
-#if CONFIG_MOTION_VAR
 ObmcSadHBDTest::ParamType sse4_functions_hbd[] = {
-#if CONFIG_EXT_PARTITION
   TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_sse4_1),
   TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_sse4_1),
   TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_sse4_1),
-#endif  // CONFIG_EXT_PARTITION
   TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_sse4_1),
   TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_sse4_1),
   TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_sse4_1),
@@ -193,7 +186,5 @@ ObmcSadHBDTest::ParamType sse4_functions_hbd[] = {
 
 INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcSadHBDTest,
                         ::testing::ValuesIn(sse4_functions_hbd));
-#endif  // CONFIG_MOTION_VAR
 #endif  // HAVE_SSE4_1
-#endif  // CONFIG_HIGHBITDEPTH
 }  // namespace
diff --git a/third_party/aom/test/obmc_variance_test.cc b/third_party/aom/test/obmc_variance_test.cc
index 1b30645a5..04fee8285 100644
--- a/third_party/aom/test/obmc_variance_test.cc
+++ b/third_party/aom/test/obmc_variance_test.cc
@@ -15,8 +15,9 @@
 #include "test/function_equivalence_test.h"
 #include "test/register_state_check.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
@@ -93,13 +94,10 @@ TEST_P(ObmcVarianceTest, ExtremeValues) {
 }
 
 #if HAVE_SSE4_1
-#if CONFIG_MOTION_VAR
 const ObmcVarianceTest::ParamType sse4_functions[] = {
-#if CONFIG_EXT_PARTITION
   TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_sse4_1),
   TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_sse4_1),
   TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_sse4_1),
-#endif  // CONFIG_EXT_PARTITION
   TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_sse4_1),
   TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_sse4_1),
   TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_sse4_1),
@@ -117,14 +115,12 @@ const ObmcVarianceTest::ParamType sse4_functions[] = {
 
 INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcVarianceTest,
                         ::testing::ValuesIn(sse4_functions));
-#endif  // CONFIG_MOTION_VAR
 #endif  // HAVE_SSE4_1
 
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
-#if CONFIG_HIGHBITDEPTH
 class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {};
 
 TEST_P(ObmcVarianceHBDTest, RandomValues) {
@@ -183,16 +179,13 @@ TEST_P(ObmcVarianceHBDTest, ExtremeValues) {
 }
 
 #if HAVE_SSE4_1
-#if CONFIG_MOTION_VAR
 ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
-#if CONFIG_EXT_PARTITION
   TestFuncs(aom_highbd_obmc_variance128x128_c,
             aom_highbd_obmc_variance128x128_sse4_1, 8),
   TestFuncs(aom_highbd_obmc_variance128x64_c,
             aom_highbd_obmc_variance128x64_sse4_1, 8),
   TestFuncs(aom_highbd_obmc_variance64x128_c,
             aom_highbd_obmc_variance64x128_sse4_1, 8),
-#endif  // CONFIG_EXT_PARTITION
   TestFuncs(aom_highbd_obmc_variance64x64_c,
             aom_highbd_obmc_variance64x64_sse4_1, 8),
   TestFuncs(aom_highbd_obmc_variance64x32_c,
@@ -219,14 +212,12 @@ ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
             8),
   TestFuncs(aom_highbd_obmc_variance4x4_c, aom_highbd_obmc_variance4x4_sse4_1,
             8),
-#if CONFIG_EXT_PARTITION
   TestFuncs(aom_highbd_10_obmc_variance128x128_c,
             aom_highbd_10_obmc_variance128x128_sse4_1, 10),
   TestFuncs(aom_highbd_10_obmc_variance128x64_c,
             aom_highbd_10_obmc_variance128x64_sse4_1, 10),
   TestFuncs(aom_highbd_10_obmc_variance64x128_c,
             aom_highbd_10_obmc_variance64x128_sse4_1, 10),
-#endif  // CONFIG_EXT_PARTITION
   TestFuncs(aom_highbd_10_obmc_variance64x64_c,
             aom_highbd_10_obmc_variance64x64_sse4_1, 10),
   TestFuncs(aom_highbd_10_obmc_variance64x32_c,
@@ -253,14 +244,12 @@ ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
             aom_highbd_10_obmc_variance4x8_sse4_1, 10),
   TestFuncs(aom_highbd_10_obmc_variance4x4_c,
             aom_highbd_10_obmc_variance4x4_sse4_1, 10),
-#if CONFIG_EXT_PARTITION
   TestFuncs(aom_highbd_12_obmc_variance128x128_c,
             aom_highbd_12_obmc_variance128x128_sse4_1, 12),
   TestFuncs(aom_highbd_12_obmc_variance128x64_c,
             aom_highbd_12_obmc_variance128x64_sse4_1, 12),
   TestFuncs(aom_highbd_12_obmc_variance64x128_c,
             aom_highbd_12_obmc_variance64x128_sse4_1, 12),
-#endif  // CONFIG_EXT_PARTITION
   TestFuncs(aom_highbd_12_obmc_variance64x64_c,
             aom_highbd_12_obmc_variance64x64_sse4_1, 12),
   TestFuncs(aom_highbd_12_obmc_variance64x32_c,
@@ -291,7 +280,5 @@ ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
 
 INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcVarianceHBDTest,
                         ::testing::ValuesIn(sse4_functions_hbd));
-#endif  // CONFIG_MOTION_VAR
 #endif  // HAVE_SSE4_1
-#endif  // CONFIG_HIGHBITDEPTH
 }  // namespace
diff --git a/third_party/aom/av1/encoder/bgsprite.h b/third_party/aom/test/onyxc_int_test.cc
index 711b00e40..388959518 100644
--- a/third_party/aom/av1/encoder/bgsprite.h
+++ b/third_party/aom/test/onyxc_int_test.cc
@@ -9,22 +9,14 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_ENCODER_BGSPRITE_H_
-#define AV1_ENCODER_BGSPRITE_H_
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+#include "av1/common/onyxc_int.h"
 
-#include "av1/encoder/encoder.h"
-
-// Creates alternate reference frame staring from source image + frames up to
-// 'distance' past source frame.
-// Returns 0 on success and 1 on failure.
-int av1_background_sprite(AV1_COMP *cpi, int distance);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AV1_ENCODER_BGSPRITE_H_
+TEST(OnyxcInt, TestGetTxSize) {
+  for (int t = TX_4X4; t < TX_SIZES_ALL; t++) {
+    TX_SIZE t2 = get_tx_size(tx_size_wide[t], tx_size_high[t]);
+    GTEST_ASSERT_EQ(tx_size_wide[t], tx_size_wide[t2]);
+    GTEST_ASSERT_EQ(tx_size_high[t], tx_size_high[t2]);
+  }
+}
diff --git a/third_party/aom/test/partial_idct_test.cc b/third_party/aom/test/partial_idct_test.cc
deleted file mode 100644
index b2ea176e8..000000000
--- a/third_party/aom/test/partial_idct_test.cc
+++ /dev/null
@@ -1,471 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "av1/common/blockd.h"
-#include "av1/common/scan.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/aom_timer.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*InvTxfmWithBdFunc)(const tran_low_t *in, uint8_t *out,
-                                  int stride, int bd);
-
-template <InvTxfmFunc fn>
-void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
-  (void)bd;
-  fn(in, out, stride);
-}
-
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmWithBdFunc, InvTxfmWithBdFunc,
-                        TX_SIZE, int, int, int>
-    PartialInvTxfmParam;
-const int kMaxNumCoeffs = 1024;
-const int kCountTestBlock = 10000;
-
-class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
- public:
-  virtual ~PartialIDctTest() {}
-  virtual void SetUp() {
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-    ftxfm_ = GET_PARAM(0);
-    full_itxfm_ = GET_PARAM(1);
-    partial_itxfm_ = GET_PARAM(2);
-    tx_size_ = GET_PARAM(3);
-    last_nonzero_ = GET_PARAM(4);
-    bit_depth_ = GET_PARAM(5);
-    pixel_size_ = GET_PARAM(6);
-    mask_ = (1 << bit_depth_) - 1;
-
-    switch (tx_size_) {
-      case TX_4X4: size_ = 4; break;
-      case TX_8X8: size_ = 8; break;
-      case TX_16X16: size_ = 16; break;
-      case TX_32X32: size_ = 32; break;
-      default: FAIL() << "Wrong Size!"; break;
-    }
-
-    // Randomize stride_ to a value less than or equal to 1024
-    stride_ = rnd_(1024) + 1;
-    if (stride_ < size_) {
-      stride_ = size_;
-    }
-    // Align stride_ to 16 if it's bigger than 16.
-    if (stride_ > 16) {
-      stride_ &= ~15;
-    }
-
-    input_block_size_ = size_ * size_;
-    output_block_size_ = size_ * stride_;
-
-    input_block_ = reinterpret_cast<tran_low_t *>(
-        aom_memalign(16, sizeof(*input_block_) * input_block_size_));
-    output_block_ = reinterpret_cast<uint8_t *>(
-        aom_memalign(16, pixel_size_ * output_block_size_));
-    output_block_ref_ = reinterpret_cast<uint8_t *>(
-        aom_memalign(16, pixel_size_ * output_block_size_));
-  }
-
-  virtual void TearDown() {
-    aom_free(input_block_);
-    input_block_ = NULL;
-    aom_free(output_block_);
-    output_block_ = NULL;
-    aom_free(output_block_ref_);
-    output_block_ref_ = NULL;
-    libaom_test::ClearSystemState();
-  }
-
-  void InitMem() {
-    memset(input_block_, 0, sizeof(*input_block_) * input_block_size_);
-    if (pixel_size_ == 1) {
-      for (int j = 0; j < output_block_size_; ++j) {
-        output_block_[j] = output_block_ref_[j] = rnd_.Rand16() & mask_;
-      }
-    } else {
-      ASSERT_EQ(2, pixel_size_);
-      uint16_t *const output = reinterpret_cast<uint16_t *>(output_block_);
-      uint16_t *const output_ref =
-          reinterpret_cast<uint16_t *>(output_block_ref_);
-      for (int j = 0; j < output_block_size_; ++j) {
-        output[j] = output_ref[j] = rnd_.Rand16() & mask_;
-      }
-    }
-  }
-
-  void InitInput() {
-    const int max_coeff = 32766 / 4;
-    int max_energy_leftover = max_coeff * max_coeff;
-    for (int j = 0; j < last_nonzero_; ++j) {
-      int16_t coeff = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
-                                           (rnd_.Rand16() - 32768) / 65536);
-      max_energy_leftover -= coeff * coeff;
-      if (max_energy_leftover < 0) {
-        max_energy_leftover = 0;
-        coeff = 0;
-      }
-      input_block_[av1_default_scan_orders[tx_size_].scan[j]] = coeff;
-    }
-  }
-
- protected:
-  int last_nonzero_;
-  TX_SIZE tx_size_;
-  tran_low_t *input_block_;
-  uint8_t *output_block_;
-  uint8_t *output_block_ref_;
-  int size_;
-  int stride_;
-  int pixel_size_;
-  int input_block_size_;
-  int output_block_size_;
-  int bit_depth_;
-  int mask_;
-  FwdTxfmFunc ftxfm_;
-  InvTxfmWithBdFunc full_itxfm_;
-  InvTxfmWithBdFunc partial_itxfm_;
-  ACMRandom rnd_;
-};
-
-TEST_P(PartialIDctTest, RunQuantCheck) {
-  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
-
-  InitMem();
-  for (int i = 0; i < kCountTestBlock; ++i) {
-    // Initialize a test block with input range [-mask_, mask_].
-    if (i == 0) {
-      for (int k = 0; k < input_block_size_; ++k) {
-        input_extreme_block[k] = mask_;
-      }
-    } else if (i == 1) {
-      for (int k = 0; k < input_block_size_; ++k) {
-        input_extreme_block[k] = -mask_;
-      }
-    } else {
-      for (int k = 0; k < input_block_size_; ++k) {
-        input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_;
-      }
-    }
-
-    ftxfm_(input_extreme_block, output_ref_block, size_);
-
-    // quantization with minimum allowed step sizes
-    input_block_[0] = (output_ref_block[0] / 4) * 4;
-    for (int k = 1; k < last_nonzero_; ++k) {
-      const int pos = av1_default_scan_orders[tx_size_].scan[k];
-      input_block_[pos] = (output_ref_block[pos] / 4) * 4;
-    }
-
-    ASM_REGISTER_STATE_CHECK(
-        full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
-    ASM_REGISTER_STATE_CHECK(
-        partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
-    ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
-                        pixel_size_ * output_block_size_))
-        << "Error: partial inverse transform produces different results";
-  }
-}
-
-TEST_P(PartialIDctTest, ResultsMatch) {
-  for (int i = 0; i < kCountTestBlock; ++i) {
-    InitMem();
-    InitInput();
-
-    ASM_REGISTER_STATE_CHECK(
-        full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
-    ASM_REGISTER_STATE_CHECK(
-        partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
-    ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
-                        pixel_size_ * output_block_size_))
-        << "Error: partial inverse transform produces different results";
-  }
-}
-
-TEST_P(PartialIDctTest, AddOutputBlock) {
-  for (int i = 0; i < kCountTestBlock; ++i) {
-    InitMem();
-    for (int j = 0; j < last_nonzero_; ++j) {
-      input_block_[av1_default_scan_orders[tx_size_].scan[j]] = 10;
-    }
-
-    ASM_REGISTER_STATE_CHECK(
-        full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
-    ASM_REGISTER_STATE_CHECK(
-        partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
-    ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
-                        pixel_size_ * output_block_size_))
-        << "Error: Transform results are not correctly added to output.";
-  }
-}
-
-TEST_P(PartialIDctTest, SingleExtremeCoeff) {
-  const int16_t max_coeff = INT16_MAX;
-  const int16_t min_coeff = INT16_MIN;
-  for (int i = 0; i < last_nonzero_; ++i) {
-    memset(input_block_, 0, sizeof(*input_block_) * input_block_size_);
-    // Run once for min and once for max.
-    for (int j = 0; j < 2; ++j) {
-      const int coeff = j ? min_coeff : max_coeff;
-
-      memset(output_block_, 0, pixel_size_ * output_block_size_);
-      memset(output_block_ref_, 0, pixel_size_ * output_block_size_);
-      input_block_[av1_default_scan_orders[tx_size_].scan[i]] = coeff;
-
-      ASM_REGISTER_STATE_CHECK(
-          full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
-      ASM_REGISTER_STATE_CHECK(
-          partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
-      ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
-                          pixel_size_ * output_block_size_))
-          << "Error: Fails with single coeff of " << coeff << " at " << i
-          << ".";
-    }
-  }
-}
-
-TEST_P(PartialIDctTest, DISABLED_Speed) {
-  // Keep runtime stable with transform size.
-  const int kCountSpeedTestBlock = 500000000 / input_block_size_;
-  InitMem();
-  InitInput();
-
-  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
-    ASM_REGISTER_STATE_CHECK(
-        full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
-  }
-  aom_usec_timer timer;
-  aom_usec_timer_start(&timer);
-  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
-    partial_itxfm_(input_block_, output_block_, stride_, bit_depth_);
-  }
-  libaom_test::ClearSystemState();
-  aom_usec_timer_mark(&timer);
-  const int elapsed_time =
-      static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
-  printf("idct%dx%d_%d (bitdepth %d) time: %5d ms\n", size_, size_,
-         last_nonzero_, bit_depth_, elapsed_time);
-
-  ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
-                      pixel_size_ * output_block_size_))
-      << "Error: partial inverse transform produces different results";
-}
-
-using std::tr1::make_tuple;
-
-const PartialInvTxfmParam c_partial_idct_tests[] = {
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1024_add_c>, TX_32X32, 1024, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_135_add_c>, TX_32X32, 135, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_34_add_c>, TX_32X32, 34, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1_add_c>, TX_32X32, 1, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_256_add_c>, TX_16X16, 256, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_38_add_c>, TX_16X16, 38, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_10_add_c>, TX_16X16, 10, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_1_add_c>, TX_16X16, 1, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_64_add_c>, TX_8X8, 64, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_12_add_c>, TX_8X8, 12, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_1_add_c>, TX_8X8, 1, 8, 1),
-  make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
-             &wrapper<aom_idct4x4_16_add_c>, TX_4X4, 16, 8, 1),
-  make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
-             &wrapper<aom_idct4x4_1_add_c>, TX_4X4, 1, 8, 1)
-};
-
-INSTANTIATE_TEST_CASE_P(C, PartialIDctTest,
-                        ::testing::ValuesIn(c_partial_idct_tests));
-
-#if HAVE_NEON && !CONFIG_HIGHBITDEPTH
-const PartialInvTxfmParam neon_partial_idct_tests[] = {
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1_add_neon>, TX_32X32, 1, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_10_add_neon>, TX_16X16, 10, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_1_add_neon>, TX_16X16, 1, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_12_add_neon>, TX_8X8, 12, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_1_add_neon>, TX_8X8, 1, 8, 1),
-  make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
-             &wrapper<aom_idct4x4_1_add_neon>, TX_4X4, 1, 8, 1)
-};
-
-INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest,
-                        ::testing::ValuesIn(neon_partial_idct_tests));
-#endif  // HAVE_NEON && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_SSE2
-const PartialInvTxfmParam sse2_partial_idct_tests[] = {
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1024_add_sse2>, TX_32X32, 1024, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1024_add_sse2>, TX_32X32, 135, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_34_add_sse2>, TX_32X32, 34, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_256_add_sse2>, TX_16X16, 256, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_1_add_sse2>, TX_16X16, 1, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_64_add_sse2>, TX_8X8, 64, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_12_add_sse2>, TX_8X8, 12, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_1_add_sse2>, TX_8X8, 1, 8, 1),
-  make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
-             &wrapper<aom_idct4x4_16_add_sse2>, TX_4X4, 16, 8, 1),
-  make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
-             &wrapper<aom_idct4x4_1_add_sse2>, TX_4X4, 1, 8, 1)
-};
-
-INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest,
-                        ::testing::ValuesIn(sse2_partial_idct_tests));
-
-#endif  // HAVE_SSE2
-
-#if HAVE_SSSE3
-const PartialInvTxfmParam ssse3_partial_idct_tests[] = {
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1024_add_ssse3>, TX_32X32, 1024, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_135_add_ssse3>, TX_32X32, 135, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_34_add_ssse3>, TX_32X32, 34, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_64_add_ssse3>, TX_8X8, 64, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_12_add_ssse3>, TX_8X8, 12, 8, 1)
-};
-
-INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest,
-                        ::testing::ValuesIn(ssse3_partial_idct_tests));
-#endif  // HAVE_SSSE3
-
-#if HAVE_AVX2
-const PartialInvTxfmParam avx2_partial_idct_tests[] = {
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_256_add_avx2>, TX_16X16, 256, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_38_add_avx2>, TX_16X16, 38, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_10_add_avx2>, TX_16X16, 10, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_1_add_avx2>, TX_16X16, 1, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1024_add_avx2>, TX_32X32, 1024, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_135_add_avx2>, TX_32X32, 135, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_34_add_avx2>, TX_32X32, 34, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1_add_avx2>, TX_32X32, 1, 8, 1),
-};
-
-INSTANTIATE_TEST_CASE_P(AVX2, PartialIDctTest,
-                        ::testing::ValuesIn(avx2_partial_idct_tests));
-#endif  // HAVE_AVX2
-
-#if HAVE_DSPR2 && !CONFIG_HIGHBITDEPTH
-const PartialInvTxfmParam dspr2_partial_idct_tests[] = {
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1024_add_dspr2>, TX_32X32, 1024, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1024_add_dspr2>, TX_32X32, 135, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_34_add_dspr2>, TX_32X32, 34, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1_add_dspr2>, TX_32X32, 1, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_256_add_dspr2>, TX_16X16, 256, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_10_add_dspr2>, TX_16X16, 10, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_1_add_dspr2>, TX_16X16, 1, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_64_add_dspr2>, TX_8X8, 64, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_12_add_dspr2>, TX_8X8, 12, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_1_add_dspr2>, TX_8X8, 1, 8, 1),
-  make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
-             &wrapper<aom_idct4x4_16_add_dspr2>, TX_4X4, 16, 8, 1),
-  make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
-             &wrapper<aom_idct4x4_1_add_dspr2>, TX_4X4, 1, 8, 1)
-};
-
-INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest,
-                        ::testing::ValuesIn(dspr2_partial_idct_tests));
-#endif  // HAVE_DSPR2 && !CONFIG_HIGHBITDEPTH
-
-#if HAVE_MSA && !CONFIG_HIGHBITDEPTH
-const PartialInvTxfmParam msa_partial_idct_tests[] = {
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1024_add_msa>, TX_32X32, 1024, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1024_add_msa>, TX_32X32, 135, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_34_add_msa>, TX_32X32, 34, 8, 1),
-  make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
-             &wrapper<aom_idct32x32_1_add_msa>, TX_32X32, 1, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_256_add_msa>, TX_16X16, 256, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_10_add_msa>, TX_16X16, 10, 8, 1),
-  make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
-             &wrapper<aom_idct16x16_1_add_msa>, TX_16X16, 1, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_64_add_msa>, TX_8X8, 64, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_12_add_msa>, TX_8X8, 12, 8, 1),
-  make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
-             &wrapper<aom_idct8x8_1_add_msa>, TX_8X8, 1, 8, 1),
-  make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
-             &wrapper<aom_idct4x4_16_add_msa>, TX_4X4, 16, 8, 1),
-  make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
-             &wrapper<aom_idct4x4_1_add_msa>, TX_4X4, 1, 8, 1)
-};
-
-INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest,
-                        ::testing::ValuesIn(msa_partial_idct_tests));
-#endif  // HAVE_MSA && !CONFIG_HIGHBITDEPTH
-
-}  // namespace
diff --git a/third_party/aom/test/qm_test.cc b/third_party/aom/test/qm_test.cc
new file mode 100644
index 000000000..c87506b41
--- /dev/null
+++ b/third_party/aom/test/qm_test.cc
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class QMTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  QMTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~QMTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_ENABLE_QM, 1);
+      encoder->Control(AV1E_SET_QM_MIN, qm_min_);
+      encoder->Control(AV1E_SET_QM_MAX, qm_max_);
+
+      encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
+    }
+  }
+
+  void DoTest(int qm_min, int qm_max) {
+    qm_min_ = qm_min;
+    qm_max_ = qm_max;
+    cfg_.kf_max_dist = 12;
+    cfg_.rc_min_quantizer = 8;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 6;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_target_bitrate = 300;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 15);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  int set_cpu_used_;
+  int qm_min_;
+  int qm_max_;
+};
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM1) { DoTest(5, 9); }
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM2) { DoTest(0, 8); }
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); }
+
+AV1_INSTANTIATE_TEST_CASE(QMTest,
+                          ::testing::Values(::libaom_test::kRealTime,
+                                            ::libaom_test::kOnePassGood),
+                          ::testing::Range(5, 9));
+}  // namespace
diff --git a/third_party/aom/test/quantize_func_test.cc b/third_party/aom/test/quantize_func_test.cc
index 2e4829021..97e73bff0 100644
--- a/third_party/aom/test/quantize_func_test.cc
+++ b/third_party/aom/test/quantize_func_test.cc
@@ -11,13 +11,14 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "aom/aom_codec.h"
 #include "aom_ports/aom_timer.h"
 #include "av1/encoder/encoder.h"
-#include "av1/encoder/av1_quantize.h"
+#include "av1/common/scan.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -34,36 +35,56 @@ using libaom_test::ACMRandom;
       const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, \
       const int16_t *iscan
 
+#define QUAN_PARAM_LIST_NO_SKIP                                               \
+  const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,    \
+      const int16_t *round_ptr, const int16_t *quant_ptr,                     \
+      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,                 \
+      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, \
+      const int16_t *scan, const int16_t *iscan
+
 typedef void (*QuantizeFunc)(QUAN_PARAM_LIST);
-typedef void (*QuantizeFuncHbd)(QUAN_PARAM_LIST, int log_scale);
+typedef void (*QuantizeFuncHbd)(QUAN_PARAM_LIST_NO_SKIP, int log_scale);
+typedef void (*QuantizeFuncNoSkip)(QUAN_PARAM_LIST_NO_SKIP);
 
 #define HBD_QUAN_FUNC                                                      \
-  fn(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, quant_ptr,      \
-     quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, \
-     iscan, log_scale)
+  fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \
+     qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, log_scale)
+
+#define LBD_QUAN_FUNC_NO_SKIP                                              \
+  fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \
+     qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan)
 
 template <QuantizeFuncHbd fn>
 void highbd_quan16x16_wrapper(QUAN_PARAM_LIST) {
+  (void)skip_block;
   const int log_scale = 0;
   HBD_QUAN_FUNC;
 }
 
 template <QuantizeFuncHbd fn>
 void highbd_quan32x32_wrapper(QUAN_PARAM_LIST) {
+  (void)skip_block;
   const int log_scale = 1;
   HBD_QUAN_FUNC;
 }
 
 template <QuantizeFuncHbd fn>
 void highbd_quan64x64_wrapper(QUAN_PARAM_LIST) {
+  (void)skip_block;
   const int log_scale = 2;
   HBD_QUAN_FUNC;
 }
 
+template <QuantizeFuncNoSkip fn>
+void lowbd_quan_wrapper(QUAN_PARAM_LIST) {
+  (void)skip_block;
+  LBD_QUAN_FUNC_NO_SKIP;
+}
+
 typedef enum { TYPE_B, TYPE_DC, TYPE_FP } QuantType;
 
-typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, QuantType,
-                        aom_bit_depth_t>
+using ::testing::tuple;
+typedef tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, QuantType, aom_bit_depth_t>
     QuantizeParam;
 
 typedef struct {
@@ -98,7 +119,7 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
   }
 
   void InitQuantizer() {
-    av1_build_quantizer(bd_, 0, 0, 0, &qtab_->quant, &qtab_->dequant);
+    av1_build_quantizer(bd_, 0, 0, 0, 0, 0, &qtab_->quant, &qtab_->dequant);
   }
 
   void QuantizeRun(bool is_loop, int q = 0, int test_num = 1) {
@@ -114,7 +135,7 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
     uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs);
 
     // Testing uses 2-D DCT scan order table
-    const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT, 0);
+    const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT);
 
     // Testing uses luminance quantization table
     const int16_t *zbin = qtab_->quant.y_zbin[q];
@@ -130,7 +151,7 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
     }
 
     const int16_t *quant_shift = qtab_->quant.y_quant_shift[q];
-    const int16_t *dequant = qtab_->dequant.y_dequant[q];
+    const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
 
     for (int i = 0; i < test_num; ++i) {
       if (is_loop) FillCoeffRandom();
@@ -171,7 +192,7 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
     }
   }
 
-  int coeff_num() const { return tx_size_2d[tx_size_]; }
+  int coeff_num() const { return av1_get_max_eob(tx_size_); }
 
   void FillCoeff(tran_low_t c) {
     const int n_coeffs = coeff_num();
@@ -255,6 +276,13 @@ TEST_P(QuantizeTest, MultipleQ) {
   }
 }
 
+// Force the coeff to be half the value of the dequant.  This exposes a
+// mismatch found in av1_quantize_fp_sse2().
+TEST_P(QuantizeTest, CoeffHalfDequant) {
+  FillCoeff(16);
+  QuantizeRun(false, 25, 1);
+}
+
 TEST_P(QuantizeTest, DISABLED_Speed) {
   tran_low_t *coeff_ptr = coeff_;
   const intptr_t n_coeffs = coeff_num();
@@ -268,7 +296,7 @@ TEST_P(QuantizeTest, DISABLED_Speed) {
   uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs);
 
   // Testing uses 2-D DCT scan order table
-  const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT, 0);
+  const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT);
 
   // Testing uses luminance quantization table
   const int q = 22;
@@ -276,7 +304,7 @@ TEST_P(QuantizeTest, DISABLED_Speed) {
   const int16_t *round_fp = qtab_->quant.y_round_fp[q];
   const int16_t *quant_fp = qtab_->quant.y_quant_fp[q];
   const int16_t *quant_shift = qtab_->quant.y_quant_shift[q];
-  const int16_t *dequant = qtab_->dequant.y_dequant[q];
+  const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
   const int kNumTests = 5000000;
   aom_usec_timer timer;
 
@@ -293,15 +321,37 @@ TEST_P(QuantizeTest, DISABLED_Speed) {
   printf("Elapsed time: %d us\n", elapsed_time);
 }
 
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
 
 #if HAVE_AVX2
 const QuantizeParam kQParamArrayAvx2[] = {
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_16X16, TYPE_FP,
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_avx2>, TX_16X16, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_avx2>, TX_4X16, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_avx2>, TX_16X4, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_avx2>, TX_32X8, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_avx2>, TX_8X32, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_32x32_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_32x32_avx2>, TX_32X32, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_32x32_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_32x32_avx2>, TX_16X64, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_32x32_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_32x32_avx2>, TX_64X16, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_64x64_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_64x64_avx2>, TX_64X64, TYPE_FP,
              AOM_BITS_8),
-  make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_32X32,
-             TYPE_FP, AOM_BITS_8),
-#if CONFIG_HIGHBITDEPTH
   make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
              &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, TX_16X16,
              TYPE_FP, AOM_BITS_8),
@@ -320,7 +370,6 @@ const QuantizeParam kQParamArrayAvx2[] = {
   make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
              &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, TX_32X32,
              TYPE_FP, AOM_BITS_12),
-#if CONFIG_TX64X64
   make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
              &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, TX_64X64,
              TYPE_FP, AOM_BITS_8),
@@ -330,14 +379,12 @@ const QuantizeParam kQParamArrayAvx2[] = {
   make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
              &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, TX_64X64,
              TYPE_FP, AOM_BITS_12),
-#endif  // CONFIG_TX64X64
   make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, TX_16X16,
              TYPE_B, AOM_BITS_8),
   make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, TX_16X16,
              TYPE_B, AOM_BITS_10),
   make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, TX_16X16,
              TYPE_B, AOM_BITS_12),
-#endif  // CONFIG_HIGHBITDEPTH
 };
 
 INSTANTIATE_TEST_CASE_P(AVX2, QuantizeTest,
@@ -346,9 +393,21 @@ INSTANTIATE_TEST_CASE_P(AVX2, QuantizeTest,
 
 #if HAVE_SSE2
 const QuantizeParam kQParamArraySSE2[] = {
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_16X16, TYPE_FP,
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_sse2>, TX_16X16, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_sse2>, TX_4X16, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_sse2>, TX_16X4, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_sse2>, TX_8X32, TYPE_FP,
+             AOM_BITS_8),
+  make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+             &lowbd_quan_wrapper<av1_quantize_fp_sse2>, TX_32X8, TYPE_FP,
              AOM_BITS_8),
-#if CONFIG_HIGHBITDEPTH
   make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, TX_16X16,
              TYPE_B, AOM_BITS_8),
   make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, TX_16X16,
@@ -361,28 +420,10 @@ const QuantizeParam kQParamArraySSE2[] = {
              TX_32X32, TYPE_B, AOM_BITS_10),
   make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
              TX_32X32, TYPE_B, AOM_BITS_12),
-#endif
 };
 
 INSTANTIATE_TEST_CASE_P(SSE2, QuantizeTest,
                         ::testing::ValuesIn(kQParamArraySSE2));
 #endif
 
-#if !CONFIG_HIGHBITDEPTH && HAVE_SSSE3 && ARCH_X86_64
-const QuantizeParam kQ16x16ParamArraySSSE3[] = {
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_ssse3, TX_16X16, TYPE_FP,
-             AOM_BITS_8),
-};
-INSTANTIATE_TEST_CASE_P(SSSE3, QuantizeTest,
-                        ::testing::ValuesIn(kQ16x16ParamArraySSSE3));
-
-// TODO(any):
-//  The following test does not pass yet
-const QuantizeParam kQ32x32ParamArraySSSE3[] = { make_tuple(
-    &av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_ssse3, TX_32X32, TYPE_FP,
-    AOM_BITS_8) };
-INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, QuantizeTest,
-                        ::testing::ValuesIn(kQ32x32ParamArraySSSE3));
-#endif
-
 }  // namespace
diff --git a/third_party/aom/test/realtime_test.cc b/third_party/aom/test/realtime_test.cc
deleted file mode 100644
index 11d2a3241..000000000
--- a/third_party/aom/test/realtime_test.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/util.h"
-#include "test/video_source.h"
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-namespace {
-
-const int kVideoSourceWidth = 320;
-const int kVideoSourceHeight = 240;
-const int kFramesToEncode = 2;
-
-class RealtimeTest
-    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
-      public ::libaom_test::EncoderTest {
- protected:
-  RealtimeTest() : EncoderTest(GET_PARAM(0)), frame_packets_(0) {}
-  virtual ~RealtimeTest() {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    cfg_.g_lag_in_frames = 0;
-    SetMode(::libaom_test::kRealTime);
-  }
-
-  virtual void BeginPassHook(unsigned int /*pass*/) {
-    // TODO(tomfinegan): We're changing the pass value here to make sure
-    // we get frames when real time mode is combined with |g_pass| set to
-    // AOM_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets
-    // the pass value based on the mode passed into EncoderTest::SetMode(),
-    // which overrides the one specified in SetUp() above.
-    cfg_.g_pass = AOM_RC_FIRST_PASS;
-  }
-  virtual void FramePktHook(const aom_codec_cx_pkt_t * /*pkt*/) {
-    frame_packets_++;
-  }
-
-  int frame_packets_;
-};
-
-TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) {
-  ::libaom_test::RandomVideoSource video;
-  video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
-  video.set_limit(kFramesToEncode);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  EXPECT_EQ(kFramesToEncode, frame_packets_);
-}
-
-AV1_INSTANTIATE_TEST_CASE(RealtimeTest,
-                          ::testing::Values(::libaom_test::kRealTime));
-
-}  // namespace
diff --git a/third_party/aom/test/reconinter_test.cc b/third_party/aom/test/reconinter_test.cc
new file mode 100644
index 000000000..4f74c817e
--- /dev/null
+++ b/third_party/aom/test/reconinter_test.cc
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+using libaom_test::ACMRandom;
+
+class BuildCompDiffwtdMaskTest : public ::testing::TestWithParam<int> {
+ public:
+  virtual ~BuildCompDiffwtdMaskTest() {}
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  void RunTest(const int sb_type, const int is_speed,
+               const DIFFWTD_MASK_TYPE type);
+
+ private:
+  ACMRandom rnd_;
+};
+
+typedef void (*buildcompdiffwtdmaskd16_func)(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd);
+
+typedef ::testing::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE>
+    BuildCompDiffwtdMaskD16Param;
+
+#if HAVE_SSE4_1 || HAVE_NEON
+::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams(
+    buildcompdiffwtdmaskd16_func filter) {
+  return ::testing::Combine(::testing::Range(8, 13, 2),
+                            ::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+#endif
+class BuildCompDiffwtdMaskD16Test
+    : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
+ public:
+  ~BuildCompDiffwtdMaskD16Test() {}
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+ protected:
+  void RunCheckOutput(buildcompdiffwtdmaskd16_func test_impl);
+  void RunSpeedTest(buildcompdiffwtdmaskd16_func test_impl);
+  libaom_test::ACMRandom rnd_;
+};  // class BuildCompDiffwtdMaskD16Test
+
+void BuildCompDiffwtdMaskD16Test::RunCheckOutput(
+    buildcompdiffwtdmaskd16_func test_impl) {
+  const int block_idx = GET_PARAM(2);
+  const int bd = GET_PARAM(0);
+  const int width = block_size_wide[block_idx];
+  const int height = block_size_high[block_idx];
+  DECLARE_ALIGNED(16, uint8_t, mask_ref[2 * MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, mask_test[2 * MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
+
+  ConvolveParams conv_params =
+      get_conv_params_no_round(0, 0, 0, NULL, 0, 1, bd);
+
+  int in_precision =
+      bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
+
+  for (int i = 0; i < MAX_SB_SQUARE; i++) {
+    src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+    src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+  }
+
+  for (int mask_type = 0; mask_type < DIFFWTD_MASK_TYPES; mask_type++) {
+    av1_build_compound_diffwtd_mask_d16_c(
+        mask_ref, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width,
+        height, width, &conv_params, bd);
+
+    test_impl(mask_test, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width,
+              height, width, &conv_params, bd);
+
+    for (int r = 0; r < height; ++r) {
+      for (int c = 0; c < width; ++c) {
+        ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
+            << "Mismatch at unit tests for BuildCompDiffwtdMaskD16Test\n"
+            << " Pixel mismatch at index "
+            << "[" << r << "," << c << "] "
+            << " @ " << width << "x" << height << " inv " << mask_type;
+      }
+    }
+  }
+}
+
+void BuildCompDiffwtdMaskD16Test::RunSpeedTest(
+    buildcompdiffwtdmaskd16_func test_impl) {
+  const int block_idx = GET_PARAM(2);
+  const int bd = GET_PARAM(0);
+  const int width = block_size_wide[block_idx];
+  const int height = block_size_high[block_idx];
+  DECLARE_ALIGNED(16, uint8_t, mask[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
+
+  ConvolveParams conv_params =
+      get_conv_params_no_round(0, 0, 0, NULL, 0, 1, bd);
+
+  int in_precision =
+      bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
+
+  for (int i = 0; i < MAX_SB_SQUARE; i++) {
+    src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+    src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+  }
+
+  const int num_loops = 1000000000 / (width + height);
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+
+  for (int i = 0; i < num_loops; ++i)
+    av1_build_compound_diffwtd_mask_d16_c(mask, DIFFWTD_38, src0, width, src1,
+                                          width, height, width, &conv_params,
+                                          bd);
+
+  aom_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("av1_build_compound_diffwtd_mask_d16 c_code %3dx%-3d: %7.2f us\n",
+         width, height, 1000.0 * elapsed_time / num_loops);
+
+  aom_usec_timer timer1;
+  aom_usec_timer_start(&timer1);
+
+  for (int i = 0; i < num_loops; ++i)
+    test_impl(mask, DIFFWTD_38, src0, width, src1, width, height, width,
+              &conv_params, bd);
+
+  aom_usec_timer_mark(&timer1);
+  const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+  printf("av1_build_compound_diffwtd_mask_d16 test_code %3dx%-3d: %7.2f us\n",
+         width, height, 1000.0 * elapsed_time1 / num_loops);
+}
+#if HAVE_SSE4_1
+void BuildCompDiffwtdMaskTest::RunTest(const int sb_type, const int is_speed,
+                                       const DIFFWTD_MASK_TYPE type) {
+  const int width = block_size_wide[sb_type];
+  const int height = block_size_high[sb_type];
+  DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, mask_test[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, src0[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, src1[MAX_SB_SQUARE]);
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int i = 0; i < width * height; i++) {
+    src0[i] = rnd.Rand8();
+    src1[i] = rnd.Rand8();
+  }
+  const int run_times = is_speed ? (10000000 / (width + height)) : 1;
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    av1_build_compound_diffwtd_mask_c(mask_ref, type, src0, width, src1, width,
+                                      height, width);
+  }
+  const double t1 = get_time_mark(&timer);
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    av1_build_compound_diffwtd_mask_sse4_1(mask_test, type, src0, width, src1,
+                                           width, height, width);
+  }
+  const double t2 = get_time_mark(&timer);
+  if (is_speed) {
+    printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, width, height, t1, t2);
+    printf("(%3.2f)\n", t1 / t2);
+  }
+  for (int r = 0; r < height; ++r) {
+    for (int c = 0; c < width; ++c) {
+      ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
+          << "[" << r << "," << c << "] " << run_times << " @ " << width << "x"
+          << height << " inv " << type;
+    }
+  }
+}
+
+TEST_P(BuildCompDiffwtdMaskTest, match) {
+  RunTest(GetParam(), 0, DIFFWTD_38);
+  RunTest(GetParam(), 0, DIFFWTD_38_INV);
+}
+TEST_P(BuildCompDiffwtdMaskTest, DISABLED_Speed) {
+  RunTest(GetParam(), 1, DIFFWTD_38);
+  RunTest(GetParam(), 1, DIFFWTD_38_INV);
+}
+#endif
+TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
+  RunCheckOutput(GET_PARAM(1));
+}
+
+TEST_P(BuildCompDiffwtdMaskD16Test, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(1));
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, BuildCompDiffwtdMaskTest,
+                        ::testing::Range(0, static_cast<int>(BLOCK_SIZES_ALL),
+                                         1));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, BuildCompDiffwtdMaskD16Test,
+    BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, BuildCompDiffwtdMaskD16Test,
+                        BuildParams(av1_build_compound_diffwtd_mask_d16_neon));
+#endif
+
+}  // namespace
diff --git a/third_party/aom/test/register_state_check.h b/third_party/aom/test/register_state_check.h
index cce662a6d..ef1f775e5 100644
--- a/third_party/aom/test/register_state_check.h
+++ b/third_party/aom/test/register_state_check.h
@@ -13,7 +13,9 @@
 #define TEST_REGISTER_STATE_CHECK_H_
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 // ASM_REGISTER_STATE_CHECK(asm_function)
@@ -29,7 +31,7 @@
 //   See platform implementations of RegisterStateCheckXXX for details.
 //
 
-#if defined(_WIN64)
+#if defined(_WIN64) && ARCH_X86_64
 
 #undef NOMINMAX
 #define NOMINMAX
@@ -88,53 +90,6 @@ class RegisterStateCheck {
 
 }  // namespace libaom_test
 
-#elif defined(CONFIG_SHARED) && defined(HAVE_NEON_ASM) && !CONFIG_SHARED && \
-    HAVE_NEON_ASM && CONFIG_AV1
-
-extern "C" {
-// Save the d8-d15 registers into store.
-void aom_push_neon(int64_t *store);
-}
-
-namespace libaom_test {
-
-// Compares the state of d8-d15 at construction with their state at
-// destruction. These registers should be preserved by the callee on
-// arm platform.
-class RegisterStateCheck {
- public:
-  RegisterStateCheck() { initialized_ = StoreRegisters(pre_store_); }
-  ~RegisterStateCheck() { Check(); }
-
- private:
-  static bool StoreRegisters(int64_t store[8]) {
-    aom_push_neon(store);
-    return true;
-  }
-
-  // Compares the register state. Returns true if the states match.
-  void Check() const {
-    ASSERT_TRUE(initialized_);
-    int64_t post_store[8];
-    aom_push_neon(post_store);
-    for (int i = 0; i < 8; ++i) {
-      EXPECT_EQ(pre_store_[i], post_store[i])
-          << "d" << i + 8 << " has been modified";
-    }
-  }
-
-  bool initialized_;
-  int64_t pre_store_[8];
-};
-
-#define ASM_REGISTER_STATE_CHECK(statement)    \
-  do {                                         \
-    libaom_test::RegisterStateCheck reg_check; \
-    statement;                                 \
-  } while (false)
-
-}  // namespace libaom_test
-
 #else
 
 namespace libaom_test {
@@ -144,7 +99,7 @@ class RegisterStateCheck {};
 
 }  // namespace libaom_test
 
-#endif  // _WIN64
+#endif  // _WIN64 && ARCH_X86_64
 
 #if ARCH_X86 || ARCH_X86_64
 #if defined(__GNUC__)
diff --git a/third_party/aom/test/resize_test.cc b/third_party/aom/test/resize_test.cc
index c4e924de0..e1c4e9fa5 100644
--- a/third_party/aom/test/resize_test.cc
+++ b/third_party/aom/test/resize_test.cc
@@ -7,10 +7,11 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <climits>
 #include <vector>
+#include "aom_dsp/aom_dsp_common.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
@@ -149,83 +150,8 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     *h = initial_h;
     return;
   }
+  // Go down very low
   if (frame < 120) {
-    *w = initial_w * 3 / 4;
-    *h = initial_h * 3 / 4;
-    return;
-  }
-  if (frame < 130) {
-    *w = initial_w / 2;
-    *h = initial_h / 2;
-    return;
-  }
-  if (frame < 140) {
-    *w = initial_w * 3 / 4;
-    *h = initial_h * 3 / 4;
-    return;
-  }
-  if (frame < 150) {
-    *w = initial_w;
-    *h = initial_h;
-    return;
-  }
-  if (frame < 160) {
-    *w = initial_w * 3 / 4;
-    *h = initial_h * 3 / 4;
-    return;
-  }
-  if (frame < 170) {
-    *w = initial_w / 2;
-    *h = initial_h / 2;
-    return;
-  }
-  if (frame < 180) {
-    *w = initial_w * 3 / 4;
-    *h = initial_h * 3 / 4;
-    return;
-  }
-  if (frame < 190) {
-    *w = initial_w;
-    *h = initial_h;
-    return;
-  }
-  if (frame < 200) {
-    *w = initial_w * 3 / 4;
-    *h = initial_h * 3 / 4;
-    return;
-  }
-  if (frame < 210) {
-    *w = initial_w / 2;
-    *h = initial_h / 2;
-    return;
-  }
-  if (frame < 220) {
-    *w = initial_w * 3 / 4;
-    *h = initial_h * 3 / 4;
-    return;
-  }
-  if (frame < 230) {
-    *w = initial_w;
-    *h = initial_h;
-    return;
-  }
-  if (frame < 240) {
-    *w = initial_w * 3 / 4;
-    *h = initial_h * 3 / 4;
-    return;
-  }
-  if (frame < 250) {
-    *w = initial_w / 2;
-    *h = initial_h / 2;
-    return;
-  }
-  if (frame < 260) {
-    *w = initial_w;
-    *h = initial_h;
-    return;
-  }
-  // Go down very low.
-  if (frame < 270) {
     *w = initial_w / 4;
     *h = initial_h / 4;
     return;
@@ -233,7 +159,7 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
   if (flag_codec == 1) {
     // Cases that only works for AV1.
     // For AV1: Swap width and height of original.
-    if (frame < 320) {
+    if (frame < 140) {
       *w = initial_h;
       *h = initial_w;
       return;
@@ -247,7 +173,7 @@ class ResizingVideoSource : public ::libaom_test::DummyVideoSource {
  public:
   ResizingVideoSource() {
     SetSize(kInitialWidth, kInitialHeight);
-    limit_ = 350;
+    limit_ = 150;
   }
   int flag_codec_;
   virtual ~ResizingVideoSource() {}
@@ -289,8 +215,15 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
   ResizingVideoSource video;
   video.flag_codec_ = 0;
   cfg_.g_lag_in_frames = 0;
+  // We use max(kInitialWidth, kInitialHeight) because during the test
+  // the width and height of the frame are swapped
+  cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height =
+      AOMMAX(kInitialWidth, kInitialHeight);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
        info != frame_info_list_.end(); ++info) {
     const unsigned int frame = static_cast<unsigned>(info->pts);
@@ -308,16 +241,16 @@ TEST_P(ResizeTest, TestExternalResizeWorks) {
 const unsigned int kStepDownFrame = 3;
 const unsigned int kStepUpFrame = 6;
 
-class ResizeInternalTest : public ResizeTest {
+class ResizeInternalTestLarge : public ResizeTest {
  protected:
 #if WRITE_COMPRESSED_STREAM
-  ResizeInternalTest()
+  ResizeInternalTestLarge()
       : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {}
 #else
-  ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {}
+  ResizeInternalTestLarge() : ResizeTest(), frame0_psnr_(0.0) {}
 #endif
 
-  virtual ~ResizeInternalTest() {}
+  virtual ~ResizeInternalTestLarge() {}
 
   virtual void BeginPassHook(unsigned int /*pass*/) {
 #if WRITE_COMPRESSED_STREAM
@@ -388,7 +321,7 @@ class ResizeInternalTest : public ResizeTest {
 #endif
 };
 
-TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
+TEST_P(ResizeInternalTestLarge, TestInternalResizeWorks) {
   ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 10);
   init_flags_ = AOM_CODEC_USE_PSNR;
@@ -420,7 +353,7 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
   }
 }
 
-TEST_P(ResizeInternalTest, TestInternalResizeChangeConfig) {
+TEST_P(ResizeInternalTestLarge, TestInternalResizeChangeConfig) {
   ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 10);
   cfg_.g_w = 352;
@@ -483,10 +416,14 @@ class ResizeRealtimeTest
     cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
     // Enable dropped frames.
     cfg_.rc_dropframe_thresh = 1;
-    // Enable error_resilience mode.
-    cfg_.g_error_resilient = 1;
+    // Disable error_resilience mode.
+    cfg_.g_error_resilient = 0;
     // Run at low bitrate.
     cfg_.rc_target_bitrate = 200;
+    // We use max(kInitialWidth, kInitialHeight) because during the test
+    // the width and height of the frame are swapped
+    cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height =
+        AOMMAX(kInitialWidth, kInitialHeight);
   }
 
   std::vector<FrameInfo> frame_info_list_;
@@ -505,6 +442,9 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
   mismatch_nframes_ = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
        info != frame_info_list_.end(); ++info) {
     const unsigned int frame = static_cast<unsigned>(info->pts);
@@ -706,11 +646,14 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) {
   cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
   cfg_.g_lag_in_frames = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
 }
 
 AV1_INSTANTIATE_TEST_CASE(ResizeTest,
                           ::testing::Values(::libaom_test::kRealTime));
-AV1_INSTANTIATE_TEST_CASE(ResizeInternalTest,
+AV1_INSTANTIATE_TEST_CASE(ResizeInternalTestLarge,
                           ::testing::Values(::libaom_test::kOnePassGood));
 AV1_INSTANTIATE_TEST_CASE(ResizeRealtimeTest,
                           ::testing::Values(::libaom_test::kRealTime),
diff --git a/third_party/aom/test/run_encodes.sh b/third_party/aom/test/run_encodes.sh
new file mode 100755
index 000000000..2096d8b15
--- /dev/null
+++ b/third_party/aom/test/run_encodes.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved.
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+# Author: jimbankoski@google.com (Jim Bankoski)
+
+if [[ $# -ne 4 ]]; then
+  echo Encodes all the y4m files in the directory at the bitrates specified by
+  echo the first 3 parameters and stores the results in a subdirectory named by
+  echo the 4th parameter:
+  echo
+  echo Usage:    run_encodes.sh start-kbps end-kbps step-kbps output-directory
+  echo Example:  run_encodes.sh 200 500 50 baseline
+  exit
+fi
+
+s=$1
+e=$2
+step=$3
+newdir=$4
+
+for i in ./*y4m; do
+  for (( b=$s; b<= $e; b+= $step ))
+  do
+    best_encode.sh $i $b
+  done
+  mv opsnr.stt $i.stt
+done
+
+mkdir $newdir
+mv *.stt $newdir
+mv *.webm $newdir
diff --git a/third_party/aom/test/sad_test.cc b/third_party/aom/test/sad_test.cc
index 9ac58e653..845fe79da 100644
--- a/third_party/aom/test/sad_test.cc
+++ b/third_party/aom/test/sad_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <string.h>
 #include <limits.h>
@@ -15,8 +15,9 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -27,17 +28,34 @@
 
 typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride);
-typedef std::tr1::tuple<int, int, SadMxNFunc, int> SadMxNParam;
+typedef ::testing::tuple<int, int, SadMxNFunc, int> SadMxNParam;
 
 typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   const uint8_t *second_pred);
-typedef std::tr1::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
+typedef ::testing::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
+
+typedef void (*JntCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
+                               int width, int height, const uint8_t *ref,
+                               int ref_stride,
+                               const JNT_COMP_PARAMS *jcp_param);
+typedef ::testing::tuple<int, int, JntCompAvgFunc, int> JntCompAvgParam;
+
+typedef unsigned int (*JntSadMxhFunc)(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride,
+                                      int width, int height);
+typedef ::testing::tuple<int, int, JntSadMxhFunc, int> JntSadMxhParam;
+
+typedef uint32_t (*JntSadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride,
+                                     const uint8_t *second_pred,
+                                     const JNT_COMP_PARAMS *jcp_param);
+typedef ::testing::tuple<int, int, JntSadMxNAvgFunc, int> JntSadMxNAvgParam;
 
 typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *const ref_ptr[], int ref_stride,
                              uint32_t *sad_array);
-typedef std::tr1::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
+typedef ::testing::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
 
 using libaom_test::ACMRandom;
 
@@ -54,12 +72,20 @@ class SADTestBase : public ::testing::Test {
         aom_memalign(kDataAlignment, kDataBufferSize));
     second_pred8_ =
         reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    comp_pred8_ =
+        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    comp_pred8_test_ =
+        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
     source_data16_ = reinterpret_cast<uint16_t *>(
         aom_memalign(kDataAlignment, kDataBlockSize * sizeof(uint16_t)));
     reference_data16_ = reinterpret_cast<uint16_t *>(
         aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t)));
     second_pred16_ = reinterpret_cast<uint16_t *>(
         aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    comp_pred16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    comp_pred16_test_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
   }
 
   static void TearDownTestCase() {
@@ -69,12 +95,20 @@ class SADTestBase : public ::testing::Test {
     reference_data8_ = NULL;
     aom_free(second_pred8_);
     second_pred8_ = NULL;
+    aom_free(comp_pred8_);
+    comp_pred8_ = NULL;
+    aom_free(comp_pred8_test_);
+    comp_pred8_test_ = NULL;
     aom_free(source_data16_);
     source_data16_ = NULL;
     aom_free(reference_data16_);
     reference_data16_ = NULL;
     aom_free(second_pred16_);
     second_pred16_ = NULL;
+    aom_free(comp_pred16_);
+    comp_pred16_ = NULL;
+    aom_free(comp_pred16_test_);
+    comp_pred16_test_ = NULL;
   }
 
   virtual void TearDown() { libaom_test::ClearSystemState(); }
@@ -92,14 +126,16 @@ class SADTestBase : public ::testing::Test {
       source_data_ = source_data8_;
       reference_data_ = reference_data8_;
       second_pred_ = second_pred8_;
-#if CONFIG_HIGHBITDEPTH
+      comp_pred_ = comp_pred8_;
+      comp_pred_test_ = comp_pred8_test_;
     } else {
       use_high_bit_depth_ = true;
       bit_depth_ = static_cast<aom_bit_depth_t>(bd_);
       source_data_ = CONVERT_TO_BYTEPTR(source_data16_);
       reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
       second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
-#endif  // CONFIG_HIGHBITDEPTH
+      comp_pred_ = CONVERT_TO_BYTEPTR(comp_pred16_);
+      comp_pred_test_ = CONVERT_TO_BYTEPTR(comp_pred16_test_);
     }
     mask_ = (1 << bit_depth_) - 1;
     source_stride_ = (width_ + 31) & ~31;
@@ -108,11 +144,9 @@ class SADTestBase : public ::testing::Test {
   }
 
   virtual uint8_t *GetReference(int block_idx) {
-#if CONFIG_HIGHBITDEPTH
     if (use_high_bit_depth_)
       return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
                                 block_idx * kDataBlockSize);
-#endif  // CONFIG_HIGHBITDEPTH
     return reference_data_ + block_idx * kDataBlockSize;
   }
 
@@ -122,21 +156,17 @@ class SADTestBase : public ::testing::Test {
     unsigned int sad = 0;
     const uint8_t *const reference8 = GetReference(block_idx);
     const uint8_t *const source8 = source_data_;
-#if CONFIG_HIGHBITDEPTH
     const uint16_t *const reference16 =
         CONVERT_TO_SHORTPTR(GetReference(block_idx));
     const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
-#endif  // CONFIG_HIGHBITDEPTH
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
         if (!use_high_bit_depth_) {
           sad += abs(source8[h * source_stride_ + w] -
                      reference8[h * reference_stride_ + w]);
-#if CONFIG_HIGHBITDEPTH
         } else {
           sad += abs(source16[h * source_stride_ + w] -
                      reference16[h * reference_stride_ + w]);
-#endif  // CONFIG_HIGHBITDEPTH
         }
       }
     }
@@ -151,12 +181,10 @@ class SADTestBase : public ::testing::Test {
     const uint8_t *const reference8 = GetReference(block_idx);
     const uint8_t *const source8 = source_data_;
     const uint8_t *const second_pred8 = second_pred_;
-#if CONFIG_HIGHBITDEPTH
     const uint16_t *const reference16 =
         CONVERT_TO_SHORTPTR(GetReference(block_idx));
     const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
     const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
-#endif  // CONFIG_HIGHBITDEPTH
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
         if (!use_high_bit_depth_) {
@@ -164,13 +192,65 @@ class SADTestBase : public ::testing::Test {
                           reference8[h * reference_stride_ + w];
           const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
           sad += abs(source8[h * source_stride_ + w] - comp_pred);
-#if CONFIG_HIGHBITDEPTH
         } else {
           const int tmp = second_pred16[h * width_ + w] +
                           reference16[h * reference_stride_ + w];
           const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
           sad += abs(source16[h * source_stride_ + w] - comp_pred);
-#endif  // CONFIG_HIGHBITDEPTH
+        }
+      }
+    }
+    return sad;
+  }
+
+  void ReferenceJntCompAvg(int block_idx) {
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const second_pred8 = second_pred_;
+    uint8_t *const comp_pred8 = comp_pred_;
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+    uint16_t *const comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred_);
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          const int tmp =
+              second_pred8[h * width_ + w] * jcp_param_.bck_offset +
+              reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+          comp_pred8[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
+        } else {
+          const int tmp =
+              second_pred16[h * width_ + w] * jcp_param_.bck_offset +
+              reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+          comp_pred16[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
+        }
+      }
+    }
+  }
+
+  unsigned int ReferenceJntSADavg(int block_idx) {
+    unsigned int sad = 0;
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const source8 = source_data_;
+    const uint8_t *const second_pred8 = second_pred_;
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+    const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          const int tmp =
+              second_pred8[h * width_ + w] * jcp_param_.bck_offset +
+              reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+          const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 4);
+          sad += abs(source8[h * source_stride_ + w] - comp_pred);
+        } else {
+          const int tmp =
+              second_pred16[h * width_ + w] * jcp_param_.bck_offset +
+              reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+          const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 4);
+          sad += abs(source16[h * source_stride_ + w] - comp_pred);
         }
       }
     }
@@ -179,17 +259,13 @@ class SADTestBase : public ::testing::Test {
 
   void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
     uint8_t *data8 = data;
-#if CONFIG_HIGHBITDEPTH
     uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
-#endif  // CONFIG_HIGHBITDEPTH
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
         if (!use_high_bit_depth_) {
           data8[h * stride + w] = static_cast<uint8_t>(fill_constant);
-#if CONFIG_HIGHBITDEPTH
         } else {
           data16[h * stride + w] = fill_constant;
-#endif  // CONFIG_HIGHBITDEPTH
         }
       }
     }
@@ -197,17 +273,13 @@ class SADTestBase : public ::testing::Test {
 
   void FillRandom(uint8_t *data, int stride) {
     uint8_t *data8 = data;
-#if CONFIG_HIGHBITDEPTH
     uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
-#endif  // CONFIG_HIGHBITDEPTH
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
         if (!use_high_bit_depth_) {
           data8[h * stride + w] = rnd_.Rand8();
-#if CONFIG_HIGHBITDEPTH
         } else {
           data16[h * stride + w] = rnd_.Rand16() & mask_;
-#endif  // CONFIG_HIGHBITDEPTH
         }
       }
     }
@@ -227,6 +299,13 @@ class SADTestBase : public ::testing::Test {
   static uint16_t *reference_data16_;
   static uint16_t *second_pred16_;
   int reference_stride_;
+  static uint8_t *comp_pred_;
+  static uint8_t *comp_pred8_;
+  static uint16_t *comp_pred16_;
+  static uint8_t *comp_pred_test_;
+  static uint8_t *comp_pred8_test_;
+  static uint16_t *comp_pred16_test_;
+  JNT_COMP_PARAMS jcp_param_;
 
   ACMRandom rnd_;
 };
@@ -312,15 +391,116 @@ class SADavgTest : public ::testing::WithParamInterface<SadMxNAvgParam>,
   }
 };
 
+class JntCompAvgTest : public ::testing::WithParamInterface<JntCompAvgParam>,
+                       public SADTestBase {
+ public:
+  JntCompAvgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  void jnt_comp_avg(int block_idx) {
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
+                                          height_, reference, reference_stride_,
+                                          &jcp_param_));
+  }
+
+  void CheckCompAvg() {
+    for (int j = 0; j < 2; ++j) {
+      for (int i = 0; i < 4; ++i) {
+        jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
+        jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
+
+        ReferenceJntCompAvg(0);
+        jnt_comp_avg(0);
+
+        for (int y = 0; y < height_; ++y)
+          for (int x = 0; x < width_; ++x)
+            ASSERT_EQ(comp_pred_[y * width_ + x],
+                      comp_pred_test_[y * width_ + x]);
+      }
+    }
+  }
+};
+
+class JntSADTest : public ::testing::WithParamInterface<JntSadMxhParam>,
+                   public SADTestBase {
+ public:
+  JntSADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  unsigned int SAD(int block_idx) {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_,
+                                                GET_PARAM(0), GET_PARAM(1)));
+    return ret;
+  }
+
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSAD(0);
+    const unsigned int exp_sad = SAD(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+
+  void SpeedSAD() {
+    int test_count = 20000000;
+    while (test_count > 0) {
+      SAD(0);
+      test_count -= 1;
+    }
+  }
+};
+
+class JntSADavgTest : public ::testing::WithParamInterface<JntSadMxNAvgParam>,
+                      public SADTestBase {
+ public:
+  JntSADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  unsigned int jnt_SAD_avg(int block_idx) {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_,
+                                                second_pred_, &jcp_param_));
+    return ret;
+  }
+
+  void CheckSAD() {
+    for (int j = 0; j < 2; ++j) {
+      for (int i = 0; i < 4; ++i) {
+        jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
+        jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
+
+        const unsigned int reference_sad = ReferenceJntSADavg(0);
+        const unsigned int exp_sad = jnt_SAD_avg(0);
+
+        ASSERT_EQ(reference_sad, exp_sad);
+      }
+    }
+  }
+};
+
 uint8_t *SADTestBase::source_data_ = NULL;
 uint8_t *SADTestBase::reference_data_ = NULL;
 uint8_t *SADTestBase::second_pred_ = NULL;
+uint8_t *SADTestBase::comp_pred_ = NULL;
+uint8_t *SADTestBase::comp_pred_test_ = NULL;
 uint8_t *SADTestBase::source_data8_ = NULL;
 uint8_t *SADTestBase::reference_data8_ = NULL;
 uint8_t *SADTestBase::second_pred8_ = NULL;
+uint8_t *SADTestBase::comp_pred8_ = NULL;
+uint8_t *SADTestBase::comp_pred8_test_ = NULL;
 uint16_t *SADTestBase::source_data16_ = NULL;
 uint16_t *SADTestBase::reference_data16_ = NULL;
 uint16_t *SADTestBase::second_pred16_ = NULL;
+uint16_t *SADTestBase::comp_pred16_ = NULL;
+uint16_t *SADTestBase::comp_pred16_test_ = NULL;
 
 TEST_P(SADTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
@@ -428,6 +608,132 @@ TEST_P(SADavgTest, ShortSrc) {
   source_stride_ = tmp_stride;
 }
 
+TEST_P(JntCompAvgTest, MaxRef) {
+  FillConstant(reference_data_, reference_stride_, mask_);
+  FillConstant(second_pred_, width_, 0);
+  CheckCompAvg();
+}
+
+TEST_P(JntCompAvgTest, MaxSecondPred) {
+  FillConstant(reference_data_, reference_stride_, 0);
+  FillConstant(second_pred_, width_, mask_);
+  CheckCompAvg();
+}
+
+TEST_P(JntCompAvgTest, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckCompAvg();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntCompAvgTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckCompAvg();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  CheckSAD();
+}
+
+TEST_P(JntSADTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD();
+}
+
+TEST_P(JntSADTest, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADTest, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  int test_count = 2000;
+  while (test_count > 0) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(reference_data_, reference_stride_);
+    CheckSAD();
+    test_count -= 1;
+  }
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADavgTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  FillConstant(second_pred_, width_, 0);
+  CheckSAD();
+}
+TEST_P(JntSADavgTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  FillConstant(second_pred_, width_, 0);
+  CheckSAD();
+}
+
+TEST_P(JntSADavgTest, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADavgTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADavgTest, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  int test_count = 2000;
+  while (test_count > 0) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(reference_data_, reference_stride_);
+    FillRandom(second_pred_, width_);
+    CheckSAD();
+    test_count -= 1;
+  }
+  source_stride_ = tmp_stride;
+}
+
 TEST_P(SADx4Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(GetReference(0), reference_stride_, mask_);
@@ -500,16 +806,14 @@ TEST_P(SADx4Test, SrcAlignedByWidth) {
   source_data_ = tmp_source_data;
 }
 
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
 
 //------------------------------------------------------------------------------
 // C functions
 const SadMxNParam c_tests[] = {
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_sad128x128_c, -1),
   make_tuple(128, 64, &aom_sad128x64_c, -1),
   make_tuple(64, 128, &aom_sad64x128_c, -1),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_sad64x64_c, -1),
   make_tuple(64, 32, &aom_sad64x32_c, -1),
   make_tuple(32, 64, &aom_sad32x64_c, -1),
@@ -523,12 +827,9 @@ const SadMxNParam c_tests[] = {
   make_tuple(8, 4, &aom_sad8x4_c, -1),
   make_tuple(4, 8, &aom_sad4x8_c, -1),
   make_tuple(4, 4, &aom_sad4x4_c, -1),
-#if CONFIG_HIGHBITDEPTH
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128_c, 8),
   make_tuple(128, 64, &aom_highbd_sad128x64_c, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128_c, 8),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_highbd_sad64x64_c, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32_c, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64_c, 8),
@@ -542,11 +843,9 @@ const SadMxNParam c_tests[] = {
   make_tuple(8, 4, &aom_highbd_sad8x4_c, 8),
   make_tuple(4, 8, &aom_highbd_sad4x8_c, 8),
   make_tuple(4, 4, &aom_highbd_sad4x4_c, 8),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128_c, 10),
   make_tuple(128, 64, &aom_highbd_sad128x64_c, 10),
   make_tuple(64, 128, &aom_highbd_sad64x128_c, 10),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_highbd_sad64x64_c, 10),
   make_tuple(64, 32, &aom_highbd_sad64x32_c, 10),
   make_tuple(32, 64, &aom_highbd_sad32x64_c, 10),
@@ -560,11 +859,9 @@ const SadMxNParam c_tests[] = {
   make_tuple(8, 4, &aom_highbd_sad8x4_c, 10),
   make_tuple(4, 8, &aom_highbd_sad4x8_c, 10),
   make_tuple(4, 4, &aom_highbd_sad4x4_c, 10),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128_c, 12),
   make_tuple(128, 64, &aom_highbd_sad128x64_c, 12),
   make_tuple(64, 128, &aom_highbd_sad64x128_c, 12),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_highbd_sad64x64_c, 12),
   make_tuple(64, 32, &aom_highbd_sad64x32_c, 12),
   make_tuple(32, 64, &aom_highbd_sad32x64_c, 12),
@@ -578,16 +875,13 @@ const SadMxNParam c_tests[] = {
   make_tuple(8, 4, &aom_highbd_sad8x4_c, 12),
   make_tuple(4, 8, &aom_highbd_sad4x8_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_c, 12),
-#endif  // CONFIG_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
 const SadMxNAvgParam avg_c_tests[] = {
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_sad128x128_avg_c, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_c, -1),
   make_tuple(64, 128, &aom_sad64x128_avg_c, -1),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_sad64x64_avg_c, -1),
   make_tuple(64, 32, &aom_sad64x32_avg_c, -1),
   make_tuple(32, 64, &aom_sad32x64_avg_c, -1),
@@ -601,12 +895,9 @@ const SadMxNAvgParam avg_c_tests[] = {
   make_tuple(8, 4, &aom_sad8x4_avg_c, -1),
   make_tuple(4, 8, &aom_sad4x8_avg_c, -1),
   make_tuple(4, 4, &aom_sad4x4_avg_c, -1),
-#if CONFIG_HIGHBITDEPTH
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 8),
   make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 8),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 8),
@@ -620,11 +911,9 @@ const SadMxNAvgParam avg_c_tests[] = {
   make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 8),
   make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 8),
   make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 8),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 10),
   make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 10),
   make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 10),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 10),
   make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 10),
   make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 10),
@@ -638,11 +927,9 @@ const SadMxNAvgParam avg_c_tests[] = {
   make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 10),
   make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 10),
   make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 10),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 12),
   make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 12),
   make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 12),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 12),
   make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 12),
   make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 12),
@@ -656,16 +943,56 @@ const SadMxNAvgParam avg_c_tests[] = {
   make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 12),
   make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 12),
-#endif  // CONFIG_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
+// TODO(chengchen): add highbd tests
+const JntCompAvgParam jnt_comp_avg_c_tests[] = {
+  make_tuple(128, 128, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(128, 64, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(64, 128, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(64, 64, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(64, 32, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(32, 64, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(32, 32, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(32, 16, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(16, 32, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(16, 16, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(16, 8, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(8, 16, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(8, 8, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(8, 4, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(4, 8, &aom_jnt_comp_avg_pred_c, -1),
+  make_tuple(4, 4, &aom_jnt_comp_avg_pred_c, -1),
+};
+
+INSTANTIATE_TEST_CASE_P(C, JntCompAvgTest,
+                        ::testing::ValuesIn(jnt_comp_avg_c_tests));
+
+const JntSadMxNAvgParam jnt_avg_c_tests[] = {
+  make_tuple(128, 128, &aom_jnt_sad128x128_avg_c, -1),
+  make_tuple(128, 64, &aom_jnt_sad128x64_avg_c, -1),
+  make_tuple(64, 128, &aom_jnt_sad64x128_avg_c, -1),
+  make_tuple(64, 64, &aom_jnt_sad64x64_avg_c, -1),
+  make_tuple(64, 32, &aom_jnt_sad64x32_avg_c, -1),
+  make_tuple(32, 64, &aom_jnt_sad32x64_avg_c, -1),
+  make_tuple(32, 32, &aom_jnt_sad32x32_avg_c, -1),
+  make_tuple(32, 16, &aom_jnt_sad32x16_avg_c, -1),
+  make_tuple(16, 32, &aom_jnt_sad16x32_avg_c, -1),
+  make_tuple(16, 16, &aom_jnt_sad16x16_avg_c, -1),
+  make_tuple(16, 8, &aom_jnt_sad16x8_avg_c, -1),
+  make_tuple(8, 16, &aom_jnt_sad8x16_avg_c, -1),
+  make_tuple(8, 8, &aom_jnt_sad8x8_avg_c, -1),
+  make_tuple(8, 4, &aom_jnt_sad8x4_avg_c, -1),
+  make_tuple(4, 8, &aom_jnt_sad4x8_avg_c, -1),
+  make_tuple(4, 4, &aom_jnt_sad4x4_avg_c, -1),
+};
+INSTANTIATE_TEST_CASE_P(C, JntSADavgTest, ::testing::ValuesIn(jnt_avg_c_tests));
+
 const SadMxNx4Param x4d_c_tests[] = {
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_sad128x128x4d_c, -1),
   make_tuple(128, 64, &aom_sad128x64x4d_c, -1),
   make_tuple(64, 128, &aom_sad64x128x4d_c, -1),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_sad64x64x4d_c, -1),
   make_tuple(64, 32, &aom_sad64x32x4d_c, -1),
   make_tuple(32, 64, &aom_sad32x64x4d_c, -1),
@@ -679,12 +1006,9 @@ const SadMxNx4Param x4d_c_tests[] = {
   make_tuple(8, 4, &aom_sad8x4x4d_c, -1),
   make_tuple(4, 8, &aom_sad4x8x4d_c, -1),
   make_tuple(4, 4, &aom_sad4x4x4d_c, -1),
-#if CONFIG_HIGHBITDEPTH
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 8),
   make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 8),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 8),
@@ -698,11 +1022,9 @@ const SadMxNx4Param x4d_c_tests[] = {
   make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 8),
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 8),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 8),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 10),
   make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 10),
   make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 10),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 10),
   make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 10),
   make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 10),
@@ -716,11 +1038,9 @@ const SadMxNx4Param x4d_c_tests[] = {
   make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 10),
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 10),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 10),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 12),
   make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 12),
   make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 12),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 12),
   make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 12),
   make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 12),
@@ -734,7 +1054,6 @@ const SadMxNx4Param x4d_c_tests[] = {
   make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 12),
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 12),
-#endif  // CONFIG_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 
@@ -764,11 +1083,9 @@ INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
 // x86 functions
 #if HAVE_SSE2
 const SadMxNParam sse2_tests[] = {
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_sad128x128_sse2, -1),
   make_tuple(128, 64, &aom_sad128x64_sse2, -1),
   make_tuple(64, 128, &aom_sad64x128_sse2, -1),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_sad64x64_sse2, -1),
   make_tuple(64, 32, &aom_sad64x32_sse2, -1),
   make_tuple(32, 64, &aom_sad32x64_sse2, -1),
@@ -782,7 +1099,6 @@ const SadMxNParam sse2_tests[] = {
   make_tuple(8, 4, &aom_sad8x4_sse2, -1),
   make_tuple(4, 8, &aom_sad4x8_sse2, -1),
   make_tuple(4, 4, &aom_sad4x4_sse2, -1),
-#if CONFIG_HIGHBITDEPTH
   make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 8),
@@ -816,16 +1132,13 @@ const SadMxNParam sse2_tests[] = {
   make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 12),
   make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 12),
   make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 12),
-#endif  // CONFIG_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
 const SadMxNAvgParam avg_sse2_tests[] = {
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_sad128x128_avg_sse2, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_sse2, -1),
   make_tuple(64, 128, &aom_sad64x128_avg_sse2, -1),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_sad64x64_avg_sse2, -1),
   make_tuple(64, 32, &aom_sad64x32_avg_sse2, -1),
   make_tuple(32, 64, &aom_sad32x64_avg_sse2, -1),
@@ -839,7 +1152,6 @@ const SadMxNAvgParam avg_sse2_tests[] = {
   make_tuple(8, 4, &aom_sad8x4_avg_sse2, -1),
   make_tuple(4, 8, &aom_sad4x8_avg_sse2, -1),
   make_tuple(4, 4, &aom_sad4x4_avg_sse2, -1),
-#if CONFIG_HIGHBITDEPTH
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 8),
@@ -873,16 +1185,13 @@ const SadMxNAvgParam avg_sse2_tests[] = {
   make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 12),
   make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 12),
   make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 12),
-#endif  // CONFIG_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
 
 const SadMxNx4Param x4d_sse2_tests[] = {
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_sad128x128x4d_sse2, -1),
   make_tuple(128, 64, &aom_sad128x64x4d_sse2, -1),
   make_tuple(64, 128, &aom_sad64x128x4d_sse2, -1),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &aom_sad64x64x4d_sse2, -1),
   make_tuple(64, 32, &aom_sad64x32x4d_sse2, -1),
   make_tuple(32, 64, &aom_sad32x64x4d_sse2, -1),
@@ -896,7 +1205,6 @@ const SadMxNx4Param x4d_sse2_tests[] = {
   make_tuple(8, 4, &aom_sad8x4x4d_sse2, -1),
   make_tuple(4, 8, &aom_sad4x8x4d_sse2, -1),
   make_tuple(4, 4, &aom_sad4x4x4d_sse2, -1),
-#if CONFIG_HIGHBITDEPTH
   make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 8),
@@ -936,17 +1244,90 @@ const SadMxNx4Param x4d_sse2_tests[] = {
   make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 12),
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 12),
-#endif  // CONFIG_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
 #endif  // HAVE_SSE2
 
+#if HAVE_SSSE3
+// Note: These are named sse2, but part of ssse3 file and only built and linked
+// when ssse3 is enabled.
+const JntSadMxhParam jnt_sad_sse2_tests[] = {
+  make_tuple(4, 4, &aom_sad4xh_sse2, -1),
+  make_tuple(4, 8, &aom_sad4xh_sse2, -1),
+  make_tuple(8, 4, &aom_sad8xh_sse2, -1),
+  make_tuple(8, 8, &aom_sad8xh_sse2, -1),
+  make_tuple(8, 16, &aom_sad8xh_sse2, -1),
+  make_tuple(16, 8, &aom_sad16xh_sse2, -1),
+  make_tuple(16, 16, &aom_sad16xh_sse2, -1),
+  make_tuple(16, 32, &aom_sad16xh_sse2, -1),
+  make_tuple(32, 16, &aom_sad32xh_sse2, -1),
+  make_tuple(32, 32, &aom_sad32xh_sse2, -1),
+  make_tuple(32, 64, &aom_sad32xh_sse2, -1),
+  make_tuple(64, 32, &aom_sad64xh_sse2, -1),
+  make_tuple(64, 64, &aom_sad64xh_sse2, -1),
+  make_tuple(128, 128, &aom_sad128xh_sse2, -1),
+  make_tuple(128, 64, &aom_sad128xh_sse2, -1),
+  make_tuple(64, 128, &aom_sad64xh_sse2, -1),
+  make_tuple(4, 16, &aom_sad4xh_sse2, -1),
+  make_tuple(16, 4, &aom_sad16xh_sse2, -1),
+  make_tuple(8, 32, &aom_sad8xh_sse2, -1),
+  make_tuple(32, 8, &aom_sad32xh_sse2, -1),
+  make_tuple(16, 64, &aom_sad16xh_sse2, -1),
+  make_tuple(64, 16, &aom_sad64xh_sse2, -1),
+};
+INSTANTIATE_TEST_CASE_P(SSE2, JntSADTest,
+                        ::testing::ValuesIn(jnt_sad_sse2_tests));
+
+#endif  // HAVE_SSSE3
+
 #if HAVE_SSE3
 // Only functions are x3, which do not have tests.
 #endif  // HAVE_SSE3
 
 #if HAVE_SSSE3
-// Only functions are x3, which do not have tests.
+const JntCompAvgParam jnt_comp_avg_ssse3_tests[] = {
+  make_tuple(128, 128, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(128, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 128, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 4, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(4, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(4, 4, &aom_jnt_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3, JntCompAvgTest,
+                        ::testing::ValuesIn(jnt_comp_avg_ssse3_tests));
+
+const JntSadMxNAvgParam jnt_avg_ssse3_tests[] = {
+  make_tuple(128, 128, &aom_jnt_sad128x128_avg_ssse3, -1),
+  make_tuple(128, 64, &aom_jnt_sad128x64_avg_ssse3, -1),
+  make_tuple(64, 128, &aom_jnt_sad64x128_avg_ssse3, -1),
+  make_tuple(64, 64, &aom_jnt_sad64x64_avg_ssse3, -1),
+  make_tuple(64, 32, &aom_jnt_sad64x32_avg_ssse3, -1),
+  make_tuple(32, 64, &aom_jnt_sad32x64_avg_ssse3, -1),
+  make_tuple(32, 32, &aom_jnt_sad32x32_avg_ssse3, -1),
+  make_tuple(32, 16, &aom_jnt_sad32x16_avg_ssse3, -1),
+  make_tuple(16, 32, &aom_jnt_sad16x32_avg_ssse3, -1),
+  make_tuple(16, 16, &aom_jnt_sad16x16_avg_ssse3, -1),
+  make_tuple(16, 8, &aom_jnt_sad16x8_avg_ssse3, -1),
+  make_tuple(8, 16, &aom_jnt_sad8x16_avg_ssse3, -1),
+  make_tuple(8, 8, &aom_jnt_sad8x8_avg_ssse3, -1),
+  make_tuple(8, 4, &aom_jnt_sad8x4_avg_ssse3, -1),
+  make_tuple(4, 8, &aom_jnt_sad4x8_avg_ssse3, -1),
+  make_tuple(4, 4, &aom_jnt_sad4x4_avg_ssse3, -1),
+};
+INSTANTIATE_TEST_CASE_P(SSSE3, JntSADavgTest,
+                        ::testing::ValuesIn(jnt_avg_ssse3_tests));
 #endif  // HAVE_SSSE3
 
 #if HAVE_SSE4_1
@@ -955,18 +1336,14 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
 
 #if HAVE_AVX2
 const SadMxNParam avx2_tests[] = {
-#if CONFIG_EXT_PARTITION
   make_tuple(64, 128, &aom_sad64x128_avx2, -1),
   make_tuple(128, 64, &aom_sad128x64_avx2, -1),
   make_tuple(128, 128, &aom_sad128x128_avx2, -1),
-#endif
   make_tuple(64, 64, &aom_sad64x64_avx2, -1),
   make_tuple(64, 32, &aom_sad64x32_avx2, -1),
   make_tuple(32, 64, &aom_sad32x64_avx2, -1),
   make_tuple(32, 32, &aom_sad32x32_avx2, -1),
   make_tuple(32, 16, &aom_sad32x16_avx2, -1),
-#if CONFIG_HIGHBITDEPTH
-#if CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 8),
   make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 10),
   make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 12),
@@ -976,7 +1353,6 @@ const SadMxNParam avx2_tests[] = {
   make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 10),
   make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 12),
-#endif
   make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 8),
   make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 10),
   make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 12),
@@ -1001,23 +1377,18 @@ const SadMxNParam avx2_tests[] = {
   make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 8),
   make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 12),
-#endif
 };
 INSTANTIATE_TEST_CASE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
 
 const SadMxNAvgParam avg_avx2_tests[] = {
-#if CONFIG_EXT_PARTITION
   make_tuple(64, 128, &aom_sad64x128_avg_avx2, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_avx2, -1),
   make_tuple(128, 128, &aom_sad128x128_avg_avx2, -1),
-#endif
   make_tuple(64, 64, &aom_sad64x64_avg_avx2, -1),
   make_tuple(64, 32, &aom_sad64x32_avg_avx2, -1),
   make_tuple(32, 64, &aom_sad32x64_avg_avx2, -1),
   make_tuple(32, 32, &aom_sad32x32_avg_avx2, -1),
   make_tuple(32, 16, &aom_sad32x16_avg_avx2, -1),
-#if CONFIG_HIGHBITDEPTH
-#if CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 8),
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 10),
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 12),
@@ -1027,7 +1398,6 @@ const SadMxNAvgParam avg_avx2_tests[] = {
   make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 10),
   make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 12),
-#endif
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 8),
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 10),
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 12),
@@ -1052,22 +1422,17 @@ const SadMxNAvgParam avg_avx2_tests[] = {
   make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 8),
   make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 12),
-#endif
 };
 INSTANTIATE_TEST_CASE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
 
 const SadMxNx4Param x4d_avx2_tests[] = {
-#if CONFIG_EXT_PARTITION
   make_tuple(64, 128, &aom_sad64x128x4d_avx2, -1),
   make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1),
   make_tuple(128, 128, &aom_sad128x128x4d_avx2, -1),
-#endif
   make_tuple(64, 64, &aom_sad64x64x4d_avx2, -1),
   make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
   make_tuple(64, 32, &aom_sad64x32x4d_avx2, -1),
   make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),
-#if CONFIG_HIGHBITDEPTH
-#if CONFIG_EXT_PARTITION
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 8),
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 10),
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 12),
@@ -1077,7 +1442,6 @@ const SadMxNx4Param x4d_avx2_tests[] = {
   make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 10),
   make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 12),
-#endif
   make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 8),
   make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 10),
   make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 12),
@@ -1102,7 +1466,6 @@ const SadMxNx4Param x4d_avx2_tests[] = {
   make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 8),
   make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 10),
   make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 12),
-#endif
 };
 INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
 #endif  // HAVE_AVX2
diff --git a/third_party/aom/test/scalability_test.cc b/third_party/aom/test/scalability_test.cc
new file mode 100644
index 000000000..b39918861
--- /dev/null
+++ b/third_party/aom/test/scalability_test.cc
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kCpuUsed = 8;
+const int kBaseLayerQp = 55;
+const int kEnhancementLayerQp = 20;
+
+class ScalabilityTest
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ScalabilityTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~ScalabilityTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    num_spatial_layers_ = 2;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+      encoder->Control(AOME_SET_NUMBER_SPATIAL_LAYERS, num_spatial_layers_);
+    } else if (video->frame() % num_spatial_layers_) {
+      frame_flags_ = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                     AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                     AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+                     AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                     AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY;
+      encoder->Control(AOME_SET_SPATIAL_LAYER_ID, 1);
+      encoder->Control(AOME_SET_CQ_LEVEL, kEnhancementLayerQp);
+    } else {
+      frame_flags_ = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                     AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                     AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+                     AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+                     AOM_EFLAG_NO_UPD_ENTROPY;
+      encoder->Control(AOME_SET_SPATIAL_LAYER_ID, 0);
+      encoder->Control(AOME_SET_CQ_LEVEL, kBaseLayerQp);
+    }
+  }
+
+  void DoTest(int num_spatial_layers) {
+    num_spatial_layers_ = num_spatial_layers;
+    cfg_.rc_end_usage = AOM_Q;
+    cfg_.g_lag_in_frames = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 18);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  int num_spatial_layers_;
+};
+
+TEST_P(ScalabilityTest, TestNoMismatch2SpatialLayers) { DoTest(2); }
+
+TEST_P(ScalabilityTest, TestNoMismatch3SpatialLayers) { DoTest(3); }
+
+AV1_INSTANTIATE_TEST_CASE(ScalabilityTest,
+                          ::testing::Values(::libaom_test::kRealTime));
+
+}  // namespace
diff --git a/third_party/aom/test/scan_test.cc b/third_party/aom/test/scan_test.cc
index 2b11bd1fb..dee2ab5a6 100644
--- a/third_party/aom/test/scan_test.cc
+++ b/third_party/aom/test/scan_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,122 +9,125 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "av1/common/common_data.h"
-#include "av1/common/scan.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "test/av1_txfm_test.h"
 
-namespace {
-
-TEST(ScanTest, av1_augment_prob) {
-  const TX_SIZE tx_size = TX_4X4;
-  const TX_TYPE tx_type = DCT_DCT;
-  const int tx1d_size = tx_size_wide[tx_size];
-  uint32_t prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
-  const uint32_t ref_prob[16] = {
-    8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2
-  };
-  av1_augment_prob(tx_size, tx_type, prob);
-  for (int r = 0; r < tx1d_size; ++r) {
-    for (int c = 0; c < tx1d_size; ++c) {
-      const uint32_t idx = r * tx1d_size + c;
-      EXPECT_EQ(ref_prob[idx], prob[idx] >> 16);
-    }
-  }
-
-  const SCAN_ORDER *sc = get_default_scan(tx_size, tx_type, 0);
-  const uint32_t mask = (1 << 16) - 1;
-  for (int r = 0; r < tx1d_size; ++r) {
-    for (int c = 0; c < tx1d_size; ++c) {
-      const uint32_t ref_idx = r * tx1d_size + c;
-      const uint32_t scan_idx = mask ^ (prob[r * tx1d_size + c] & mask);
-      const uint32_t idx = sc->scan[scan_idx];
-      EXPECT_EQ(ref_idx, idx);
-    }
-  }
-}
-
-#if USE_TOPOLOGICAL_SORT
-TEST(ScanTest, av1_update_sort_order) {
-  const TX_SIZE tx_size = TX_4X4;
-  const TX_TYPE tx_type = DCT_DCT;
-  const uint32_t prob[16] = { 15, 14, 11, 10, 13, 12, 9, 5,
-                              8,  7,  4,  2,  6,  3,  1, 0 };
-  const int16_t ref_sort_order[16] = { 0, 1,  4, 5,  2,  3,  6,  8,
-                                       9, 12, 7, 10, 13, 11, 14, 15 };
-  int16_t sort_order[16];
-  av1_update_sort_order(tx_size, tx_type, prob, sort_order);
-  for (int i = 0; i < 16; ++i) EXPECT_EQ(ref_sort_order[i], sort_order[i]);
-}
-#endif
-
-#if USE_TOPOLOGICAL_SORT
-TEST(ScanTest, av1_update_scan_order) {
-  TX_SIZE tx_size = TX_4X4;
-  const TX_TYPE tx_type = DCT_DCT;
-  const uint32_t prob[16] = { 10, 12, 14, 9, 11, 13, 15, 5,
-                              8,  7,  4,  2, 6,  3,  1,  0 };
-  int16_t sort_order[16];
-  int16_t scan[16];
-  int16_t iscan[16];
-  const int16_t ref_iscan[16] = { 0, 1, 2,  6,  3, 4,  5,  10,
-                                  7, 8, 11, 13, 9, 12, 14, 15 };
-
-  av1_update_sort_order(tx_size, tx_type, prob, sort_order);
-  av1_update_scan_order(tx_size, sort_order, scan, iscan);
-
-  for (int i = 0; i < 16; ++i) {
-    EXPECT_EQ(ref_iscan[i], iscan[i]);
-    EXPECT_EQ(i, scan[ref_iscan[i]]);
+static int scan_test(const int16_t *scan, const int16_t *iscan, int si, int r,
+                     int c, int w) {
+  if (iscan[r * w + c] != si || scan[si] != r * w + c) {
+    printf("r %d c %d ref_iscan %d iscan %d ref_scan %d scan %d\n", r, c, si,
+           iscan[r * w + c], r * w + c, scan[si]);
+    return 1;
+  } else {
+    return 0;
   }
 }
-#endif
-
-TEST(ScanTest, av1_update_neighbors) {
-  TX_SIZE tx_size = TX_4X4;
-  // raster order
-  const int16_t scan[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
-                             8, 9, 10, 11, 12, 13, 14, 15 };
-  int16_t nb[(16 + 1) * 2];
-  const int16_t ref_nb[(16 + 1) * 2] = { 0,  0,  0,  0,  1,  1,  2, 2, 0,
-                                         1,  1,  4,  2,  5,  3,  6, 4, 5,
-                                         5,  8,  6,  9,  7,  10, 8, 9, 9,
-                                         12, 10, 13, 11, 14, 0,  0 };
 
-  // raster order's scan and iscan are the same
-  av1_update_neighbors(tx_size, scan, scan, nb);
-
-  for (int i = 0; i < (16 + 1) * 2; ++i) {
-    EXPECT_EQ(ref_nb[i], nb[i]);
+int scan_order_test(const SCAN_ORDER *scan_order, int w, int h,
+                    SCAN_MODE mode) {
+  const int16_t *scan = scan_order->scan;
+  const int16_t *iscan = scan_order->iscan;
+  int dim = w + h - 1;
+  if (mode == SCAN_MODE_ZIG_ZAG) {
+    int si = 0;
+    for (int i = 0; i < dim; ++i) {
+      if (i % 2 == 0) {
+        for (int c = 0; c < w; ++c) {
+          int r = i - c;
+          if (r >= 0 && r < h) {
+            if (scan_test(scan, iscan, si, r, c, w)) return 1;
+            ++si;
+          }
+        }
+      } else {
+        for (int r = 0; r < h; ++r) {
+          int c = i - r;
+          if (c >= 0 && c < w) {
+            if (scan_test(scan, iscan, si, r, c, w)) return 1;
+            ++si;
+          }
+        }
+      }
+    }
+  } else if (mode == SCAN_MODE_COL_DIAG) {
+    int si = 0;
+    for (int i = 0; i < dim; ++i) {
+      for (int c = 0; c < w; ++c) {
+        int r = i - c;
+        if (r >= 0 && r < h) {
+          if (scan_test(scan, iscan, si, r, c, w)) return 1;
+          ++si;
+        }
+      }
+    }
+  } else if (mode == SCAN_MODE_ROW_DIAG) {
+    int si = 0;
+    for (int i = 0; i < dim; ++i) {
+      for (int r = 0; r < h; ++r) {
+        int c = i - r;
+        if (c >= 0 && c < w) {
+          if (scan_test(scan, iscan, si, r, c, w)) return 1;
+          ++si;
+        }
+      }
+    }
+  } else if (mode == SCAN_MODE_ROW_1D) {
+    int si = 0;
+    for (int r = 0; r < h; ++r) {
+      for (int c = 0; c < w; ++c) {
+        if (scan_test(scan, iscan, si, r, c, w)) return 1;
+        ++si;
+      }
+    }
+  } else {
+    assert(mode == SCAN_MODE_COL_1D);
+    int si = 0;
+    for (int c = 0; c < w; ++c) {
+      for (int r = 0; r < h; ++r) {
+        if (scan_test(scan, iscan, si, r, c, w)) return 1;
+        ++si;
+      }
+    }
   }
+  return 0;
 }
 
-#if USE_2X2_PROB
-TEST(ScanTest, av1_down_sample_scan_count) {
-  const uint32_t non_zero_count[256] = {
-    13, 12, 11, 10, 0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 13, 9, 10, 8, 0, 0,
-    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 11, 12, 9, 8, 0, 0, 0,  0, 0,  0, 0, 0,
-    0,  0,  0,  0,  13, 9, 9, 10, 0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
-    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
-    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
-    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
-    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
-    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
-    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
-    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
-    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0, 0,  0, 0,  0, 0, 0,
-    0,  0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0,  0, 0,
-  };
-  const uint32_t ref_non_zero_count_ds[64] = {
-    13, 11, 0, 0, 0, 0, 0, 0, 11, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0,  0,  0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0,  0,  0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  };
-  uint32_t non_zero_count_ds[64];
-  av1_down_sample_scan_count(non_zero_count_ds, non_zero_count, TX_16X16);
-  for (int i = 0; i < 64; ++i) {
-    EXPECT_EQ(ref_non_zero_count_ds[i], non_zero_count_ds[i]);
+TEST(Av1ScanTest, Dependency) {
+  for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
+    const int org_rows = tx_size_high[(TX_SIZE)tx_size];
+    const int org_cols = tx_size_wide[(TX_SIZE)tx_size];
+    const int rows = get_txb_high((TX_SIZE)tx_size);
+    const int cols = get_txb_wide((TX_SIZE)tx_size);
+    for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(tx_size),
+                                         static_cast<TX_TYPE>(tx_type)) ==
+          false) {
+        continue;
+      }
+      SCAN_MODE scan_mode;
+      TX_CLASS tx_class = tx_type_to_class[(TX_TYPE)tx_type];
+      if (tx_class == TX_CLASS_2D) {
+        if (rows == cols) {
+          scan_mode = SCAN_MODE_ZIG_ZAG;
+        } else if (rows > cols) {
+          scan_mode = SCAN_MODE_ROW_DIAG;
+        } else {
+          scan_mode = SCAN_MODE_COL_DIAG;
+        }
+      } else if (tx_class == TX_CLASS_VERT) {
+        scan_mode = SCAN_MODE_ROW_1D;
+      } else {
+        assert(tx_class == TX_CLASS_HORIZ);
+        scan_mode = SCAN_MODE_COL_1D;
+      }
+      const SCAN_ORDER *scan_order =
+          get_default_scan((TX_SIZE)tx_size, (TX_TYPE)tx_type);
+      ASSERT_EQ(scan_order_test(scan_order, cols, rows, scan_mode), 0)
+          << "scan mismatch tx_class " << tx_class << " tx_type " << tx_type
+          << " tx_w " << org_cols << " tx_h " << org_rows << " scan_mode "
+          << scan_mode << "\n";
+    }
   }
 }
-#endif
-
-}  // namespace
diff --git a/third_party/aom/test/segment_binarization_sync.cc b/third_party/aom/test/segment_binarization_sync.cc
new file mode 100644
index 000000000..bd8cf1141
--- /dev/null
+++ b/third_party/aom/test/segment_binarization_sync.cc
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+using libaom_test::ACMRandom;
+
+extern "C" {
+int av1_neg_interleave(int x, int ref, int max);
+int av1_neg_deinterleave(int diff, int ref, int max);
+}
+
+namespace {
+
+struct Segment {
+  int id;
+  int pred;
+  int last_id;
+};
+
+Segment GenerateSegment(int seed) {
+  static const int MAX_SEGMENTS = 8;
+
+  ACMRandom rnd_(seed);
+
+  Segment segment;
+  const int last_segid = rnd_.PseudoUniform(MAX_SEGMENTS);
+  segment.last_id = last_segid;
+  segment.pred = rnd_.PseudoUniform(MAX_SEGMENTS);
+  segment.id = rnd_.PseudoUniform(last_segid + 1);
+
+  return segment;
+}
+
+// Try to reveal a mismatch between segment binarization and debinarization
+TEST(SegmentBinarizationSync, SearchForBinarizationMismatch) {
+  const int count_tests = 1000;
+  const int seed_init = 4321;
+
+  for (int i = 0; i < count_tests; ++i) {
+    const Segment seg = GenerateSegment(seed_init + i);
+
+    const int max_segid = seg.last_id + 1;
+    const int seg_diff = av1_neg_interleave(seg.id, seg.pred, max_segid);
+    const int decoded_segid =
+        av1_neg_deinterleave(seg_diff, seg.pred, max_segid);
+
+    ASSERT_EQ(decoded_segid, seg.id);
+  }
+}
+
+}  // namespace
diff --git a/third_party/aom/test/selfguided_filter_test.cc b/third_party/aom/test/selfguided_filter_test.cc
index 55ce1d5de..4506a90db 100644
--- a/third_party/aom/test/selfguided_filter_test.cc
+++ b/third_party/aom/test/selfguided_filter_test.cc
@@ -13,22 +13,30 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./av1_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
+#include "aom_ports/aom_timer.h"
 #include "av1/common/mv.h"
 #include "av1/common/restoration.h"
 
 namespace {
 
-using std::tr1::tuple;
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
+using ::testing::tuple;
 using libaom_test::ACMRandom;
 
-typedef tuple<> FilterTestParam;
+typedef void (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride,
+                        int eps, const int *xqd, uint8_t *dst8, int dst_stride,
+                        int32_t *tmpbuf, int bit_depth, int highbd);
+
+// Test parameter list:
+//  <tst_fun_>
+typedef tuple<SgrFunc> FilterTestParam;
 
 class AV1SelfguidedFilterTest
     : public ::testing::TestWithParam<FilterTestParam> {
@@ -40,6 +48,7 @@ class AV1SelfguidedFilterTest
 
  protected:
   void RunSpeedTest() {
+    tst_fun_ = GET_PARAM(0);
     const int pu_width = RESTORATION_PROC_UNIT_SIZE;
     const int pu_height = RESTORATION_PROC_UNIT_SIZE;
     const int width = 256, height = 256, stride = 288, out_stride = 288;
@@ -47,10 +56,10 @@ class AV1SelfguidedFilterTest
     int i, j, k;
 
     uint8_t *input_ =
-        (uint8_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint8_t));
+        (uint8_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint8_t));
     uint8_t *output_ = (uint8_t *)aom_memalign(
-        16, out_stride * (height + 32) * sizeof(uint8_t));
-    int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+        32, out_stride * (height + 32) * sizeof(uint8_t));
+    int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
     uint8_t *input = input_ + stride * 16 + 16;
     uint8_t *output = output_ + out_stride * 16 + 16;
 
@@ -60,19 +69,18 @@ class AV1SelfguidedFilterTest
       for (j = -16; j < width + 16; ++j)
         input[i * stride + j] = rnd.Rand16() & 0xFF;
 
-    int xqd[2] = {
-      SGRPROJ_PRJ_MIN0 +
-          rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - SGRPROJ_PRJ_MIN0),
-      SGRPROJ_PRJ_MIN1 +
-          rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - SGRPROJ_PRJ_MIN1)
-    };
+    int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+                                                        SGRPROJ_PRJ_MIN0),
+                   SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+                                                        SGRPROJ_PRJ_MIN1) };
     // Fix a parameter set, since the speed depends slightly on r.
     // Change this to test different combinations of values of r.
     int eps = 15;
 
     av1_loop_restoration_precal();
 
-    std::clock_t start = std::clock();
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
     for (i = 0; i < NUM_ITERS; ++i) {
       for (k = 0; k < height; k += pu_height)
         for (j = 0; j < width; j += pu_width) {
@@ -80,15 +88,36 @@ class AV1SelfguidedFilterTest
           int h = AOMMIN(pu_height, height - k);
           uint8_t *input_p = input + k * stride + j;
           uint8_t *output_p = output + k * out_stride + j;
-          apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
-                                       output_p, out_stride, tmpbuf);
+          apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
+                                         output_p, out_stride, tmpbuf, 8, 0);
         }
     }
-    std::clock_t end = std::clock();
-    double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
 
-    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
-           height, elapsed, elapsed * 1000000. / NUM_ITERS);
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
+    for (i = 0; i < NUM_ITERS; ++i) {
+      for (k = 0; k < height; k += pu_height)
+        for (j = 0; j < width; j += pu_width) {
+          int w = AOMMIN(pu_width, width - j);
+          int h = AOMMIN(pu_height, height - k);
+          uint8_t *input_p = input + k * stride + j;
+          uint8_t *output_p = output + k * out_stride + j;
+          tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
+                   tmpbuf, 8, 0);
+        }
+    }
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+    std::cout << "[          ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: AV1SelfguidedFilterTest.SpeedTest, SIMD slower than C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
 
     aom_free(input_);
     aom_free(output_);
@@ -96,6 +125,7 @@ class AV1SelfguidedFilterTest
   }
 
   void RunCorrectnessTest() {
+    tst_fun_ = GET_PARAM(0);
     const int pu_width = RESTORATION_PROC_UNIT_SIZE;
     const int pu_height = RESTORATION_PROC_UNIT_SIZE;
     // Set the maximum width/height to test here. We actually test a small
@@ -106,12 +136,12 @@ class AV1SelfguidedFilterTest
     int i, j, k;
 
     uint8_t *input_ =
-        (uint8_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint8_t));
+        (uint8_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint8_t));
     uint8_t *output_ = (uint8_t *)aom_memalign(
-        16, out_stride * (max_h + 32) * sizeof(uint8_t));
+        32, out_stride * (max_h + 32) * sizeof(uint8_t));
     uint8_t *output2_ = (uint8_t *)aom_memalign(
-        16, out_stride * (max_h + 32) * sizeof(uint8_t));
-    int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+        32, out_stride * (max_h + 32) * sizeof(uint8_t));
+    int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
 
     uint8_t *input = input_ + stride * 16 + 16;
     uint8_t *output = output_ + out_stride * 16 + 16;
@@ -126,12 +156,10 @@ class AV1SelfguidedFilterTest
         for (k = -16; k < max_w + 16; ++k)
           input[j * stride + k] = rnd.Rand16() & 0xFF;
 
-      int xqd[2] = {
-        SGRPROJ_PRJ_MIN0 +
-            rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - SGRPROJ_PRJ_MIN0),
-        SGRPROJ_PRJ_MIN1 +
-            rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - SGRPROJ_PRJ_MIN1)
-      };
+      int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+                                                          SGRPROJ_PRJ_MIN0),
+                     SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+                                                          SGRPROJ_PRJ_MIN1) };
       int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS);
 
       // Test various tile sizes around 256x256
@@ -145,17 +173,12 @@ class AV1SelfguidedFilterTest
           uint8_t *input_p = input + k * stride + j;
           uint8_t *output_p = output + k * out_stride + j;
           uint8_t *output2_p = output2 + k * out_stride + j;
-          apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
-                                       output_p, out_stride, tmpbuf);
+          tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
+                   tmpbuf, 8, 0);
           apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
-                                         output2_p, out_stride, tmpbuf);
+                                         output2_p, out_stride, tmpbuf, 8, 0);
         }
-      /*
-      apply_selfguided_restoration(input, test_w, test_h, stride, eps, xqd,
-                                   output, out_stride, tmpbuf);
-      apply_selfguided_restoration_c(input, test_w, test_h, stride, eps, xqd,
-                                     output2, out_stride, tmpbuf);
-                                     */
+
       for (j = 0; j < test_h; ++j)
         for (k = 0; k < test_w; ++k) {
           ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
@@ -167,20 +190,27 @@ class AV1SelfguidedFilterTest
     aom_free(output2_);
     aom_free(tmpbuf);
   }
+
+ private:
+  SgrFunc tst_fun_;
 };
 
-TEST_P(AV1SelfguidedFilterTest, SpeedTest) { RunSpeedTest(); }
+TEST_P(AV1SelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
 TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
 
 #if HAVE_SSE4_1
-const FilterTestParam params[] = { make_tuple() };
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1SelfguidedFilterTest,
-                        ::testing::ValuesIn(params));
+                        ::testing::Values(apply_selfguided_restoration_sse4_1));
 #endif
 
-#if CONFIG_HIGHBITDEPTH
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, AV1SelfguidedFilterTest,
+                        ::testing::Values(apply_selfguided_restoration_avx2));
+#endif
 
-typedef tuple<int> HighbdFilterTestParam;
+// Test parameter list:
+//  <tst_fun_, bit_depth>
+typedef tuple<SgrFunc, int> HighbdFilterTestParam;
 
 class AV1HighbdSelfguidedFilterTest
     : public ::testing::TestWithParam<HighbdFilterTestParam> {
@@ -192,19 +222,20 @@ class AV1HighbdSelfguidedFilterTest
 
  protected:
   void RunSpeedTest() {
+    tst_fun_ = GET_PARAM(0);
     const int pu_width = RESTORATION_PROC_UNIT_SIZE;
     const int pu_height = RESTORATION_PROC_UNIT_SIZE;
     const int width = 256, height = 256, stride = 288, out_stride = 288;
     const int NUM_ITERS = 2000;
     int i, j, k;
-    int bit_depth = GET_PARAM(0);
+    int bit_depth = GET_PARAM(1);
     int mask = (1 << bit_depth) - 1;
 
     uint16_t *input_ =
-        (uint16_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint16_t));
+        (uint16_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint16_t));
     uint16_t *output_ = (uint16_t *)aom_memalign(
-        16, out_stride * (height + 32) * sizeof(uint16_t));
-    int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+        32, out_stride * (height + 32) * sizeof(uint16_t));
+    int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
     uint16_t *input = input_ + stride * 16 + 16;
     uint16_t *output = output_ + out_stride * 16 + 16;
 
@@ -214,19 +245,18 @@ class AV1HighbdSelfguidedFilterTest
       for (j = -16; j < width + 16; ++j)
         input[i * stride + j] = rnd.Rand16() & mask;
 
-    int xqd[2] = {
-      SGRPROJ_PRJ_MIN0 +
-          rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - SGRPROJ_PRJ_MIN0),
-      SGRPROJ_PRJ_MIN1 +
-          rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - SGRPROJ_PRJ_MIN1)
-    };
+    int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+                                                        SGRPROJ_PRJ_MIN0),
+                   SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+                                                        SGRPROJ_PRJ_MIN1) };
     // Fix a parameter set, since the speed depends slightly on r.
     // Change this to test different combinations of values of r.
     int eps = 15;
 
     av1_loop_restoration_precal();
 
-    std::clock_t start = std::clock();
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
     for (i = 0; i < NUM_ITERS; ++i) {
       for (k = 0; k < height; k += pu_height)
         for (j = 0; j < width; j += pu_width) {
@@ -234,16 +264,39 @@ class AV1HighbdSelfguidedFilterTest
           int h = AOMMIN(pu_height, height - k);
           uint16_t *input_p = input + k * stride + j;
           uint16_t *output_p = output + k * out_stride + j;
-          apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth,
-                                              eps, xqd, output_p, out_stride,
-                                              tmpbuf);
+          apply_selfguided_restoration_c(
+              CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+              CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
         }
     }
-    std::clock_t end = std::clock();
-    double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
 
-    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
-           height, elapsed, elapsed * 1000000. / NUM_ITERS);
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
+    for (i = 0; i < NUM_ITERS; ++i) {
+      for (k = 0; k < height; k += pu_height)
+        for (j = 0; j < width; j += pu_width) {
+          int w = AOMMIN(pu_width, width - j);
+          int h = AOMMIN(pu_height, height - k);
+          uint16_t *input_p = input + k * stride + j;
+          uint16_t *output_p = output + k * out_stride + j;
+          tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+                   CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth,
+                   1);
+        }
+    }
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+    std::cout << "[          ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: AV1HighbdSelfguidedFilterTest.SpeedTest, SIMD slower than "
+           "C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
 
     aom_free(input_);
     aom_free(output_);
@@ -251,6 +304,7 @@ class AV1HighbdSelfguidedFilterTest
   }
 
   void RunCorrectnessTest() {
+    tst_fun_ = GET_PARAM(0);
     const int pu_width = RESTORATION_PROC_UNIT_SIZE;
     const int pu_height = RESTORATION_PROC_UNIT_SIZE;
     // Set the maximum width/height to test here. We actually test a small
@@ -259,16 +313,16 @@ class AV1HighbdSelfguidedFilterTest
     const int max_w = 260, max_h = 260, stride = 672, out_stride = 672;
     const int NUM_ITERS = 81;
     int i, j, k;
-    int bit_depth = GET_PARAM(0);
+    int bit_depth = GET_PARAM(1);
     int mask = (1 << bit_depth) - 1;
 
     uint16_t *input_ =
-        (uint16_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint16_t));
+        (uint16_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint16_t));
     uint16_t *output_ = (uint16_t *)aom_memalign(
-        16, out_stride * (max_h + 32) * sizeof(uint16_t));
+        32, out_stride * (max_h + 32) * sizeof(uint16_t));
     uint16_t *output2_ = (uint16_t *)aom_memalign(
-        16, out_stride * (max_h + 32) * sizeof(uint16_t));
-    int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+        32, out_stride * (max_h + 32) * sizeof(uint16_t));
+    int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
 
     uint16_t *input = input_ + stride * 16 + 16;
     uint16_t *output = output_ + out_stride * 16 + 16;
@@ -283,12 +337,10 @@ class AV1HighbdSelfguidedFilterTest
         for (k = -16; k < max_w + 16; ++k)
           input[j * stride + k] = rnd.Rand16() & mask;
 
-      int xqd[2] = {
-        SGRPROJ_PRJ_MIN0 +
-            rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - SGRPROJ_PRJ_MIN0),
-        SGRPROJ_PRJ_MIN1 +
-            rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - SGRPROJ_PRJ_MIN1)
-      };
+      int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+                                                          SGRPROJ_PRJ_MIN0),
+                     SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+                                                          SGRPROJ_PRJ_MIN1) };
       int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS);
 
       // Test various tile sizes around 256x256
@@ -302,22 +354,14 @@ class AV1HighbdSelfguidedFilterTest
           uint16_t *input_p = input + k * stride + j;
           uint16_t *output_p = output + k * out_stride + j;
           uint16_t *output2_p = output2 + k * out_stride + j;
-          apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth,
-                                              eps, xqd, output_p, out_stride,
-                                              tmpbuf);
-          apply_selfguided_restoration_highbd_c(input_p, w, h, stride,
-                                                bit_depth, eps, xqd, output2_p,
-                                                out_stride, tmpbuf);
+          tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+                   CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth,
+                   1);
+          apply_selfguided_restoration_c(
+              CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+              CONVERT_TO_BYTEPTR(output2_p), out_stride, tmpbuf, bit_depth, 1);
         }
 
-      /*
-      apply_selfguided_restoration_highbd(input, test_w, test_h, stride,
-                                          bit_depth, eps, xqd, output,
-                                          out_stride, tmpbuf);
-      apply_selfguided_restoration_highbd_c(input, test_w, test_h, stride,
-                                            bit_depth, eps, xqd, output2,
-                                            out_stride, tmpbuf);
-                                            */
       for (j = 0; j < test_h; ++j)
         for (k = 0; k < test_w; ++k)
           ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
@@ -328,17 +372,28 @@ class AV1HighbdSelfguidedFilterTest
     aom_free(output2_);
     aom_free(tmpbuf);
   }
+
+ private:
+  SgrFunc tst_fun_;
 };
 
-TEST_P(AV1HighbdSelfguidedFilterTest, SpeedTest) { RunSpeedTest(); }
+TEST_P(AV1HighbdSelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
 TEST_P(AV1HighbdSelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
 
 #if HAVE_SSE4_1
-const HighbdFilterTestParam highbd_params[] = { make_tuple(8), make_tuple(10),
-                                                make_tuple(12) };
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdSelfguidedFilterTest,
-                        ::testing::ValuesIn(highbd_params));
+const int highbd_params_sse4_1[] = { 8, 10, 12 };
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, AV1HighbdSelfguidedFilterTest,
+    ::testing::Combine(::testing::Values(apply_selfguided_restoration_sse4_1),
+                       ::testing::ValuesIn(highbd_params_sse4_1)));
 #endif
+
+#if HAVE_AVX2
+const int highbd_params_avx2[] = { 8, 10, 12 };
+INSTANTIATE_TEST_CASE_P(
+    AVX2, AV1HighbdSelfguidedFilterTest,
+    ::testing::Combine(::testing::Values(apply_selfguided_restoration_avx2),
+                       ::testing::ValuesIn(highbd_params_avx2)));
 #endif
 
 }  // namespace
diff --git a/third_party/aom/test/simd_avx2_test.cc b/third_party/aom/test/simd_avx2_test.cc
index d54d201b9..8a012bff8 100644
--- a/third_party/aom/test/simd_avx2_test.cc
+++ b/third_party/aom/test/simd_avx2_test.cc
@@ -12,4 +12,4 @@
 #define ARCH AVX2
 #define ARCH_POSTFIX(name) name##_avx2
 #define SIMD_NAMESPACE simd_test_avx2
-#include "./simd_impl.h"
+#include "test/simd_impl.h"
diff --git a/third_party/aom/test/simd_cmp_avx2.cc b/third_party/aom/test/simd_cmp_avx2.cc
index 47ae11c62..cda632bcd 100644
--- a/third_party/aom/test/simd_cmp_avx2.cc
+++ b/third_party/aom/test/simd_cmp_avx2.cc
@@ -12,4 +12,4 @@
 #define ARCH AVX2
 #define ARCH_POSTFIX(name) name##_avx2
 #define SIMD_NAMESPACE simd_test_avx2
-#include "./simd_cmp_impl.h"
+#include "test/simd_cmp_impl.h"
diff --git a/third_party/aom/test/simd_cmp_impl.h b/third_party/aom/test/simd_cmp_impl.h
index 03fe703d9..b98af9aad 100644
--- a/third_party/aom/test/simd_cmp_impl.h
+++ b/third_party/aom/test/simd_cmp_impl.h
@@ -7,11 +7,13 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <assert.h>
 #include <string>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "test/acm_random.h"
 #include "aom_dsp/aom_simd.h"
 #undef SIMD_INLINE
@@ -22,6 +24,14 @@
 // simd_cmp_sse2.cc, simd_cmp_ssse3.cc etc which define the macros
 // ARCH (=neon, sse2, ssse3, etc), SIMD_NAMESPACE and ARCH_POSTFIX().
 
+#ifdef _MSC_VER
+// Disable "value of intrinsic immediate argument 'value' is out of range
+// 'lowerbound - upperbound'" warning. Visual Studio emits this warning though
+// the parameters are conditionally checked in e.g., v256_shr_n_byte. Adding a
+// mask doesn't always appear to be sufficient.
+#pragma warning(disable : 4556)
+#endif
+
 using libaom_test::ACMRandom;
 
 namespace SIMD_NAMESPACE {
@@ -171,6 +181,18 @@ v128 imm_v128_shr_n_s32(v128 a) {
   return v128_shr_n_s32(a, shift);
 }
 template <int shift>
+v128 imm_v128_shl_n_64(v128 a) {
+  return v128_shl_n_64(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_u64(v128 a) {
+  return v128_shr_n_u64(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_s64(v128 a) {
+  return v128_shr_n_s64(a, shift);
+}
+template <int shift>
 v128 imm_v128_align(v128 a, v128 b) {
   return v128_align(a, b, shift);
 }
@@ -220,11 +242,31 @@ c_v128 c_imm_v128_shr_n_s32(c_v128 a) {
   return c_v128_shr_n_s32(a, shift);
 }
 template <int shift>
+c_v128 c_imm_v128_shl_n_64(c_v128 a) {
+  return c_v128_shl_n_64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_u64(c_v128 a) {
+  return c_v128_shr_n_u64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_s64(c_v128 a) {
+  return c_v128_shr_n_s64(a, shift);
+}
+template <int shift>
 c_v128 c_imm_v128_align(c_v128 a, c_v128 b) {
   return c_v128_align(a, b, shift);
 }
 
 template <int shift>
+v256 imm_v256_shl_n_word(v256 a) {
+  return v256_shl_n_word(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_word(v256 a) {
+  return v256_shr_n_word(a, shift);
+}
+template <int shift>
 v256 imm_v256_shl_n_byte(v256 a) {
   return v256_shl_n_byte(a, shift);
 }
@@ -269,11 +311,31 @@ v256 imm_v256_shr_n_s32(v256 a) {
   return v256_shr_n_s32(a, shift);
 }
 template <int shift>
+v256 imm_v256_shl_n_64(v256 a) {
+  return v256_shl_n_64(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_u64(v256 a) {
+  return v256_shr_n_u64(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_s64(v256 a) {
+  return v256_shr_n_s64(a, shift);
+}
+template <int shift>
 v256 imm_v256_align(v256 a, v256 b) {
   return v256_align(a, b, shift);
 }
 
 template <int shift>
+c_v256 c_imm_v256_shl_n_word(c_v256 a) {
+  return c_v256_shl_n_word(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_word(c_v256 a) {
+  return c_v256_shr_n_word(a, shift);
+}
+template <int shift>
 c_v256 c_imm_v256_shl_n_byte(c_v256 a) {
   return c_v256_shl_n_byte(a, shift);
 }
@@ -318,6 +380,18 @@ c_v256 c_imm_v256_shr_n_s32(c_v256 a) {
   return c_v256_shr_n_s32(a, shift);
 }
 template <int shift>
+c_v256 c_imm_v256_shl_n_64(c_v256 a) {
+  return c_v256_shl_n_64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_u64(c_v256 a) {
+  return c_v256_shr_n_u64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_s64(c_v256 a) {
+  return c_v256_shr_n_s64(a, shift);
+}
+template <int shift>
 c_v256 c_imm_v256_align(c_v256 a, c_v256 b) {
   return c_v256_align(a, b, shift);
 }
@@ -348,6 +422,18 @@ uint32_t c_v128_sad_u8(c_v128 a, c_v128 b) {
 uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) {
   return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b));
 }
+uint32_t v128_sad_u16(v128 a, v128 b) {
+  return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b));
+}
+uint64_t v128_ssd_s16(v128 a, v128 b) {
+  return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b));
+}
+uint32_t c_v128_sad_u16(c_v128 a, c_v128 b) {
+  return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b));
+}
+uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) {
+  return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b));
+}
 uint32_t v256_sad_u8(v256 a, v256 b) {
   return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b));
 }
@@ -360,6 +446,18 @@ uint32_t c_v256_sad_u8(c_v256 a, c_v256 b) {
 uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) {
   return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b));
 }
+uint32_t v256_sad_u16(v256 a, v256 b) {
+  return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b));
+}
+uint64_t v256_ssd_s16(v256 a, v256 b) {
+  return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b));
+}
+uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) {
+  return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b));
+}
+uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) {
+  return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b));
+}
 
 namespace {
 
@@ -371,16 +469,18 @@ typedef struct {
   fptr simd;
 } mapping;
 
-#define MAP(name)                                                              \
-  {                                                                            \
-    #name,                                                                     \
-        reinterpret_cast < fptr > (c_##name), reinterpret_cast < fptr > (name) \
+#define MAP(name)                                \
+  {                                              \
+    #name, reinterpret_cast < fptr > (c_##name), \
+        reinterpret_cast < fptr > (name)         \
   }
 
 const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v64_ssd_u8),
                       MAP(v64_add_8),
                       MAP(v64_add_16),
+                      MAP(v64_sadd_s8),
+                      MAP(v64_sadd_u8),
                       MAP(v64_sadd_s16),
                       MAP(v64_add_32),
                       MAP(v64_sub_8),
@@ -396,6 +496,7 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v64_ziphi_16),
                       MAP(v64_ziplo_32),
                       MAP(v64_ziphi_32),
+                      MAP(v64_pack_s32_u16),
                       MAP(v64_pack_s32_s16),
                       MAP(v64_pack_s16_u8),
                       MAP(v64_pack_s16_s8),
@@ -414,6 +515,7 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v64_madd_us8),
                       MAP(v64_avg_u8),
                       MAP(v64_rdavg_u8),
+                      MAP(v64_rdavg_u16),
                       MAP(v64_avg_u16),
                       MAP(v64_min_u8),
                       MAP(v64_max_u8),
@@ -554,10 +656,15 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v64_from_16),
                       MAP(v128_sad_u8),
                       MAP(v128_ssd_u8),
+                      MAP(v128_sad_u16),
+                      MAP(v128_ssd_s16),
                       MAP(v128_add_8),
                       MAP(v128_add_16),
+                      MAP(v128_sadd_s8),
+                      MAP(v128_sadd_u8),
                       MAP(v128_sadd_s16),
                       MAP(v128_add_32),
+                      MAP(v128_add_64),
                       MAP(v128_sub_8),
                       MAP(v128_ssub_u8),
                       MAP(v128_ssub_s8),
@@ -565,6 +672,7 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v128_ssub_s16),
                       MAP(v128_ssub_u16),
                       MAP(v128_sub_32),
+                      MAP(v128_sub_64),
                       MAP(v128_ziplo_8),
                       MAP(v128_ziphi_8),
                       MAP(v128_ziplo_16),
@@ -579,6 +687,7 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v128_unziplo_16),
                       MAP(v128_unziphi_32),
                       MAP(v128_unziplo_32),
+                      MAP(v128_pack_s32_u16),
                       MAP(v128_pack_s32_s16),
                       MAP(v128_pack_s16_u8),
                       MAP(v128_pack_s16_s8),
@@ -593,6 +702,7 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v128_madd_us8),
                       MAP(v128_avg_u8),
                       MAP(v128_rdavg_u8),
+                      MAP(v128_rdavg_u16),
                       MAP(v128_avg_u16),
                       MAP(v128_min_u8),
                       MAP(v128_max_u8),
@@ -600,12 +710,17 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v128_max_s8),
                       MAP(v128_min_s16),
                       MAP(v128_max_s16),
+                      MAP(v128_min_s32),
+                      MAP(v128_max_s32),
                       MAP(v128_cmpgt_s8),
                       MAP(v128_cmplt_s8),
                       MAP(v128_cmpeq_8),
                       MAP(v128_cmpgt_s16),
                       MAP(v128_cmpeq_16),
                       MAP(v128_cmplt_s16),
+                      MAP(v128_cmpgt_s32),
+                      MAP(v128_cmpeq_32),
+                      MAP(v128_cmplt_s32),
                       MAP(v128_shuffle_8),
                       MAP(imm_v128_align<1>),
                       MAP(imm_v128_align<2>),
@@ -624,6 +739,7 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(imm_v128_align<15>),
                       MAP(v128_abs_s8),
                       MAP(v128_abs_s16),
+                      MAP(v128_padd_u8),
                       MAP(v128_padd_s16),
                       MAP(v128_unpacklo_u16_s32),
                       MAP(v128_unpacklo_s16_s32),
@@ -728,6 +844,54 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(imm_v128_shr_n_s32<20>),
                       MAP(imm_v128_shr_n_s32<24>),
                       MAP(imm_v128_shr_n_s32<28>),
+                      MAP(imm_v128_shl_n_64<1>),
+                      MAP(imm_v128_shl_n_64<4>),
+                      MAP(imm_v128_shl_n_64<8>),
+                      MAP(imm_v128_shl_n_64<12>),
+                      MAP(imm_v128_shl_n_64<16>),
+                      MAP(imm_v128_shl_n_64<20>),
+                      MAP(imm_v128_shl_n_64<24>),
+                      MAP(imm_v128_shl_n_64<28>),
+                      MAP(imm_v128_shl_n_64<32>),
+                      MAP(imm_v128_shl_n_64<36>),
+                      MAP(imm_v128_shl_n_64<40>),
+                      MAP(imm_v128_shl_n_64<44>),
+                      MAP(imm_v128_shl_n_64<48>),
+                      MAP(imm_v128_shl_n_64<52>),
+                      MAP(imm_v128_shl_n_64<56>),
+                      MAP(imm_v128_shl_n_64<60>),
+                      MAP(imm_v128_shr_n_u64<1>),
+                      MAP(imm_v128_shr_n_u64<4>),
+                      MAP(imm_v128_shr_n_u64<8>),
+                      MAP(imm_v128_shr_n_u64<12>),
+                      MAP(imm_v128_shr_n_u64<16>),
+                      MAP(imm_v128_shr_n_u64<20>),
+                      MAP(imm_v128_shr_n_u64<24>),
+                      MAP(imm_v128_shr_n_u64<28>),
+                      MAP(imm_v128_shr_n_u64<32>),
+                      MAP(imm_v128_shr_n_u64<36>),
+                      MAP(imm_v128_shr_n_u64<40>),
+                      MAP(imm_v128_shr_n_u64<44>),
+                      MAP(imm_v128_shr_n_u64<48>),
+                      MAP(imm_v128_shr_n_u64<52>),
+                      MAP(imm_v128_shr_n_u64<56>),
+                      MAP(imm_v128_shr_n_u64<60>),
+                      MAP(imm_v128_shr_n_s64<1>),
+                      MAP(imm_v128_shr_n_s64<4>),
+                      MAP(imm_v128_shr_n_s64<8>),
+                      MAP(imm_v128_shr_n_s64<12>),
+                      MAP(imm_v128_shr_n_s64<16>),
+                      MAP(imm_v128_shr_n_s64<20>),
+                      MAP(imm_v128_shr_n_s64<24>),
+                      MAP(imm_v128_shr_n_s64<28>),
+                      MAP(imm_v128_shr_n_s64<32>),
+                      MAP(imm_v128_shr_n_s64<36>),
+                      MAP(imm_v128_shr_n_s64<40>),
+                      MAP(imm_v128_shr_n_s64<44>),
+                      MAP(imm_v128_shr_n_s64<48>),
+                      MAP(imm_v128_shr_n_s64<52>),
+                      MAP(imm_v128_shr_n_s64<56>),
+                      MAP(imm_v128_shr_n_s64<60>),
                       MAP(v128_from_v64),
                       MAP(v128_zip_8),
                       MAP(v128_zip_16),
@@ -746,21 +910,29 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v128_shl_32),
                       MAP(v128_shr_u32),
                       MAP(v128_shr_s32),
+                      MAP(v128_shl_64),
+                      MAP(v128_shr_u64),
+                      MAP(v128_shr_s64),
                       MAP(v128_hadd_u8),
+                      MAP(v128_dotp_su8),
                       MAP(v128_dotp_s16),
+                      MAP(v128_dotp_s32),
                       MAP(v128_low_u32),
                       MAP(v128_low_v64),
                       MAP(v128_high_v64),
                       MAP(v128_from_64),
                       MAP(v128_from_32),
+                      MAP(v128_movemask_8),
                       MAP(v128_zero),
                       MAP(v128_dup_8),
                       MAP(v128_dup_16),
                       MAP(v128_dup_32),
+                      MAP(v128_dup_64),
                       MAP(v128_unpacklo_u8_s16),
                       MAP(v128_unpackhi_u8_s16),
                       MAP(v128_unpacklo_s8_s16),
                       MAP(v128_unpackhi_s8_s16),
+                      MAP(v128_blend_8),
                       MAP(u32_load_unaligned),
                       MAP(u32_store_unaligned),
                       MAP(v64_load_unaligned),
@@ -769,12 +941,20 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v128_store_unaligned),
                       MAP(v256_sad_u8),
                       MAP(v256_ssd_u8),
+                      MAP(v256_sad_u16),
+                      MAP(v256_ssd_s16),
                       MAP(v256_hadd_u8),
+                      MAP(v256_low_u64),
+                      MAP(v256_dotp_su8),
                       MAP(v256_dotp_s16),
+                      MAP(v256_dotp_s32),
                       MAP(v256_add_8),
                       MAP(v256_add_16),
+                      MAP(v256_sadd_s8),
+                      MAP(v256_sadd_u8),
                       MAP(v256_sadd_s16),
                       MAP(v256_add_32),
+                      MAP(v256_add_64),
                       MAP(v256_sub_8),
                       MAP(v256_ssub_u8),
                       MAP(v256_ssub_s8),
@@ -782,6 +962,7 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v256_ssub_u16),
                       MAP(v256_ssub_s16),
                       MAP(v256_sub_32),
+                      MAP(v256_sub_64),
                       MAP(v256_ziplo_8),
                       MAP(v256_ziphi_8),
                       MAP(v256_ziplo_16),
@@ -796,6 +977,9 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v256_unziplo_16),
                       MAP(v256_unziphi_32),
                       MAP(v256_unziplo_32),
+                      MAP(v256_unziphi_64),
+                      MAP(v256_unziplo_64),
+                      MAP(v256_pack_s32_u16),
                       MAP(v256_pack_s32_s16),
                       MAP(v256_pack_s16_u8),
                       MAP(v256_pack_s16_s8),
@@ -810,6 +994,7 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v256_madd_us8),
                       MAP(v256_avg_u8),
                       MAP(v256_rdavg_u8),
+                      MAP(v256_rdavg_u16),
                       MAP(v256_avg_u16),
                       MAP(v256_min_u8),
                       MAP(v256_max_u8),
@@ -817,14 +1002,20 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v256_max_s8),
                       MAP(v256_min_s16),
                       MAP(v256_max_s16),
+                      MAP(v256_min_s32),
+                      MAP(v256_max_s32),
                       MAP(v256_cmpgt_s8),
                       MAP(v256_cmplt_s8),
                       MAP(v256_cmpeq_8),
                       MAP(v256_cmpgt_s16),
                       MAP(v256_cmplt_s16),
                       MAP(v256_cmpeq_16),
+                      MAP(v256_cmpgt_s32),
+                      MAP(v256_cmplt_s32),
+                      MAP(v256_cmpeq_32),
                       MAP(v256_shuffle_8),
                       MAP(v256_pshuffle_8),
+                      MAP(v256_wideshuffle_8),
                       MAP(imm_v256_align<1>),
                       MAP(imm_v256_align<2>),
                       MAP(imm_v256_align<3>),
@@ -874,13 +1065,47 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v256_shl_32),
                       MAP(v256_shr_u32),
                       MAP(v256_shr_s32),
+                      MAP(v256_shl_64),
+                      MAP(v256_shr_u64),
+                      MAP(v256_shr_s64),
                       MAP(v256_abs_s8),
                       MAP(v256_abs_s16),
+                      MAP(v256_padd_u8),
                       MAP(v256_padd_s16),
                       MAP(v256_unpacklo_u16_s32),
                       MAP(v256_unpacklo_s16_s32),
                       MAP(v256_unpackhi_u16_s32),
                       MAP(v256_unpackhi_s16_s32),
+                      MAP(imm_v256_shr_n_word<1>),
+                      MAP(imm_v256_shr_n_word<2>),
+                      MAP(imm_v256_shr_n_word<3>),
+                      MAP(imm_v256_shr_n_word<4>),
+                      MAP(imm_v256_shr_n_word<5>),
+                      MAP(imm_v256_shr_n_word<6>),
+                      MAP(imm_v256_shr_n_word<7>),
+                      MAP(imm_v256_shr_n_word<8>),
+                      MAP(imm_v256_shr_n_word<9>),
+                      MAP(imm_v256_shr_n_word<10>),
+                      MAP(imm_v256_shr_n_word<11>),
+                      MAP(imm_v256_shr_n_word<12>),
+                      MAP(imm_v256_shr_n_word<13>),
+                      MAP(imm_v256_shr_n_word<14>),
+                      MAP(imm_v256_shr_n_word<15>),
+                      MAP(imm_v256_shl_n_word<1>),
+                      MAP(imm_v256_shl_n_word<2>),
+                      MAP(imm_v256_shl_n_word<3>),
+                      MAP(imm_v256_shl_n_word<4>),
+                      MAP(imm_v256_shl_n_word<5>),
+                      MAP(imm_v256_shl_n_word<6>),
+                      MAP(imm_v256_shl_n_word<7>),
+                      MAP(imm_v256_shl_n_word<8>),
+                      MAP(imm_v256_shl_n_word<9>),
+                      MAP(imm_v256_shl_n_word<10>),
+                      MAP(imm_v256_shl_n_word<11>),
+                      MAP(imm_v256_shl_n_word<12>),
+                      MAP(imm_v256_shl_n_word<13>),
+                      MAP(imm_v256_shl_n_word<14>),
+                      MAP(imm_v256_shl_n_word<15>),
                       MAP(imm_v256_shr_n_byte<1>),
                       MAP(imm_v256_shr_n_byte<2>),
                       MAP(imm_v256_shr_n_byte<3>),
@@ -1012,10 +1237,60 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(imm_v256_shr_n_s32<20>),
                       MAP(imm_v256_shr_n_s32<24>),
                       MAP(imm_v256_shr_n_s32<28>),
+                      MAP(imm_v256_shl_n_64<1>),
+                      MAP(imm_v256_shl_n_64<4>),
+                      MAP(imm_v256_shl_n_64<8>),
+                      MAP(imm_v256_shl_n_64<12>),
+                      MAP(imm_v256_shl_n_64<16>),
+                      MAP(imm_v256_shl_n_64<20>),
+                      MAP(imm_v256_shl_n_64<24>),
+                      MAP(imm_v256_shl_n_64<28>),
+                      MAP(imm_v256_shl_n_64<32>),
+                      MAP(imm_v256_shl_n_64<36>),
+                      MAP(imm_v256_shl_n_64<40>),
+                      MAP(imm_v256_shl_n_64<44>),
+                      MAP(imm_v256_shl_n_64<48>),
+                      MAP(imm_v256_shl_n_64<52>),
+                      MAP(imm_v256_shl_n_64<56>),
+                      MAP(imm_v256_shl_n_64<60>),
+                      MAP(imm_v256_shr_n_u64<1>),
+                      MAP(imm_v256_shr_n_u64<4>),
+                      MAP(imm_v256_shr_n_u64<8>),
+                      MAP(imm_v256_shr_n_u64<12>),
+                      MAP(imm_v256_shr_n_u64<16>),
+                      MAP(imm_v256_shr_n_u64<20>),
+                      MAP(imm_v256_shr_n_u64<24>),
+                      MAP(imm_v256_shr_n_u64<28>),
+                      MAP(imm_v256_shr_n_u64<32>),
+                      MAP(imm_v256_shr_n_u64<36>),
+                      MAP(imm_v256_shr_n_u64<40>),
+                      MAP(imm_v256_shr_n_u64<44>),
+                      MAP(imm_v256_shr_n_u64<48>),
+                      MAP(imm_v256_shr_n_u64<52>),
+                      MAP(imm_v256_shr_n_u64<56>),
+                      MAP(imm_v256_shr_n_u64<60>),
+                      MAP(imm_v256_shr_n_s64<1>),
+                      MAP(imm_v256_shr_n_s64<4>),
+                      MAP(imm_v256_shr_n_s64<8>),
+                      MAP(imm_v256_shr_n_s64<12>),
+                      MAP(imm_v256_shr_n_s64<16>),
+                      MAP(imm_v256_shr_n_s64<20>),
+                      MAP(imm_v256_shr_n_s64<24>),
+                      MAP(imm_v256_shr_n_s64<28>),
+                      MAP(imm_v256_shr_n_s64<32>),
+                      MAP(imm_v256_shr_n_s64<36>),
+                      MAP(imm_v256_shr_n_s64<40>),
+                      MAP(imm_v256_shr_n_s64<44>),
+                      MAP(imm_v256_shr_n_s64<48>),
+                      MAP(imm_v256_shr_n_s64<52>),
+                      MAP(imm_v256_shr_n_s64<56>),
+                      MAP(imm_v256_shr_n_s64<60>),
+                      MAP(v256_movemask_8),
                       MAP(v256_zero),
                       MAP(v256_dup_8),
                       MAP(v256_dup_16),
                       MAP(v256_dup_32),
+                      MAP(v256_dup_64),
                       MAP(v256_low_u32),
                       MAP(v256_low_v64),
                       MAP(v256_from_64),
@@ -1026,6 +1301,7 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v256_unpackhi_u8_s16),
                       MAP(v256_unpacklo_s8_s16),
                       MAP(v256_unpackhi_s8_s16),
+                      MAP(v256_blend_8),
                       { NULL, NULL, NULL } };
 #undef MAP
 
@@ -1042,7 +1318,7 @@ void Map(const char *name, fptr *ref, fptr *simd) {
   *simd = m[i].simd;
 }
 
-// Used for printing errors in TestSimd1Arg and TestSimd2Args
+// Used for printing errors in TestSimd1Arg, TestSimd2Args and TestSimd3Args
 std::string Print(const uint8_t *a, int size) {
   std::string text = "0x";
   for (int i = 0; i < size; i++) {
@@ -1055,7 +1331,8 @@ std::string Print(const uint8_t *a, int size) {
   return text;
 }
 
-// Used in TestSimd1Arg and TestSimd2Args to restrict argument ranges
+// Used in TestSimd1Arg, TestSimd2Args and TestSimd3Args to restrict argument
+// ranges
 void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
   switch (maskwidth) {
     case 0: {
@@ -1133,16 +1410,16 @@ uint8_t c_u8_load_aligned(const void *p) {
   return *(reinterpret_cast<const uint8_t *>(p));
 }
 
-// CompareSimd1Arg and CompareSimd2Args compare intrinsics taking 1 or
-// 2 arguments respectively with their corresponding C reference.
-// Ideally, the loads and stores should have gone into the template
-// parameter list, but v64 and v128 could be typedef'ed to the same
-// type (which is the case on x86) and then we can't instantiate both
-// v64 and v128, so the function return and argument types, including
-// the always differing types in the C equivalent are used instead.
-// The function arguments must be void pointers and then go through a
-// cast to avoid matching errors in the branches eliminated by the
-// typeid tests in the calling function.
+// CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args compare
+// intrinsics taking 1, 2 or 3 arguments respectively with their
+// corresponding C reference.  Ideally, the loads and stores should
+// have gone into the template parameter list, but v64 and v128 could
+// be typedef'ed to the same type (which is the case on x86) and then
+// we can't instantiate both v64 and v128, so the function return and
+// argument types, including the always differing types in the C
+// equivalent are used instead.  The function arguments must be void
+// pointers and then go through a cast to avoid matching errors in the
+// branches eliminated by the typeid tests in the calling function.
 template <typename Ret, typename Arg, typename CRet, typename CArg>
 int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
                     fptr c_load, fptr c_simd, void *ref_d, const void *a) {
@@ -1185,6 +1462,35 @@ int CompareSimd2Args(fptr store, fptr load1, fptr load2, fptr simd, void *d,
   return memcmp(ref_d, d, sizeof(CRet));
 }
 
+template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
+          typename CRet, typename CArg1, typename CArg2, typename CArg3>
+int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
+                     void *d, fptr c_store, fptr c_load1, fptr c_load2,
+                     fptr c_load3, fptr c_simd, void *ref_d, const void *a,
+                     const void *b, const void *c) {
+  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
+  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
+  Arg3 (*const my_load3)(const void *) = (Arg3(*const)(const void *))load3;
+  Ret (*const my_simd)(Arg1, Arg2, Arg3) = (Ret(*const)(Arg1, Arg2, Arg3))simd;
+  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+  CArg1 (*const my_c_load1)(const void *) =
+      (CArg1(*const)(const void *))c_load1;
+  CArg2 (*const my_c_load2)(const void *) =
+      (CArg2(*const)(const void *))c_load2;
+  CArg2 (*const my_c_load3)(const void *) =
+      (CArg2(*const)(const void *))c_load3;
+  CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
+      (CRet(*const)(CArg1, CArg2, CArg3))c_simd;
+
+  // Call reference and intrinsic
+  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b), my_c_load3(c)));
+  my_store(d, my_simd(my_load1(a), my_load2(b), my_load3(c)));
+
+  // Compare results
+  return memcmp(ref_d, d, sizeof(CRet));
+}
+
 }  // namespace
 
 template <typename CRet, typename CArg>
@@ -1194,9 +1500,10 @@ void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
   fptr ref_simd;
   fptr simd;
   int error = 0;
-  DECLARE_ALIGNED(32, uint8_t, s[sizeof(CArg)]);
-  DECLARE_ALIGNED(32, uint8_t, d[sizeof(CRet)]);
-  DECLARE_ALIGNED(32, uint8_t, ref_d[sizeof(CRet)]);
+  DECLARE_ALIGNED(32, uint8_t, s[32]);
+  DECLARE_ALIGNED(32, uint8_t, d[32]);
+  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+  assert(sizeof(CArg) <= 32 && sizeof(CRet) <= 32);
   memset(ref_d, 0, sizeof(ref_d));
   memset(d, 0, sizeof(d));
 
@@ -1347,6 +1654,14 @@ void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
           reinterpret_cast<fptr>(u32_load_aligned), simd, d,
           reinterpret_cast<fptr>(c_v128_store_aligned),
           reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(uint64_t)) {
+      // V128_U64
+      error = CompareSimd1Arg<v128, uint64_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
     } else if (typeid(CRet) == typeid(c_v256) &&
                typeid(CArg) == typeid(c_v256)) {
       // V256_V256
@@ -1387,6 +1702,14 @@ void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
           reinterpret_cast<fptr>(u32_load_aligned), simd, d,
           reinterpret_cast<fptr>(c_v256_store_aligned),
           reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(uint64_t)) {
+      // V256_U64
+      error = CompareSimd1Arg<v256, uint64_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
     } else if (typeid(CRet) == typeid(uint32_t) &&
                typeid(CArg) == typeid(c_v256)) {
       // U32_V256
@@ -1422,10 +1745,11 @@ void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
   fptr ref_simd;
   fptr simd;
   int error = 0;
-  DECLARE_ALIGNED(32, uint8_t, s1[sizeof(CArg1)]);
-  DECLARE_ALIGNED(32, uint8_t, s2[sizeof(CArg2)]);
-  DECLARE_ALIGNED(32, uint8_t, d[sizeof(CRet)]);
-  DECLARE_ALIGNED(32, uint8_t, ref_d[sizeof(CRet)]);
+  DECLARE_ALIGNED(32, uint8_t, s1[32]);
+  DECLARE_ALIGNED(32, uint8_t, s2[32]);
+  DECLARE_ALIGNED(32, uint8_t, d[32]);
+  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CRet) <= 32);
   memset(ref_d, 0, sizeof(ref_d));
   memset(d, 0, sizeof(d));
 
@@ -1525,6 +1849,18 @@ void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
           reinterpret_cast<fptr>(c_v128_load_aligned),
           reinterpret_cast<fptr>(c_v128_load_aligned),
           reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // U64_V128V128
+      error = CompareSimd2Args<uint64_t, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
     } else if (typeid(CRet) == typeid(int64_t) &&
                typeid(CArg1) == typeid(c_v128) &&
                typeid(CArg2) == typeid(c_v128)) {
@@ -1585,6 +1921,18 @@ void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
           reinterpret_cast<fptr>(c_v256_load_aligned),
           reinterpret_cast<fptr>(c_v256_load_aligned),
           reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256)) {
+      // U64_V256V256
+      error = CompareSimd2Args<uint64_t, v256, v256, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
     } else if (typeid(CRet) == typeid(int64_t) &&
                typeid(CArg1) == typeid(c_v256) &&
                typeid(CArg2) == typeid(c_v256)) {
@@ -1647,6 +1995,83 @@ void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                       << Print(ref_d, sizeof(ref_d)) << " (ref)";
 }
 
+template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  fptr ref_simd;
+  fptr simd;
+  int error = 0;
+  DECLARE_ALIGNED(32, uint8_t, s1[32]);
+  DECLARE_ALIGNED(32, uint8_t, s2[32]);
+  DECLARE_ALIGNED(32, uint8_t, s3[32]);
+  DECLARE_ALIGNED(32, uint8_t, d[32]);
+  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CArg3) <= 32 &&
+         sizeof(CRet) <= 32);
+  memset(ref_d, 0, sizeof(ref_d));
+  memset(d, 0, sizeof(d));
+
+  Map(name, &ref_simd, &simd);
+  if (simd == NULL || ref_simd == NULL) {
+    FAIL() << "Internal error: Unknown intrinsic function " << name;
+  }
+
+  for (unsigned int count = 0;
+       count < iterations && !error && !testing::Test::HasFailure(); count++) {
+    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();
+
+    if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);
+
+    if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
+        typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
+      // V128_V128V128V128
+      error =
+          CompareSimd3Args<v128, v128, v128, v128, CRet, CArg1, CArg2, CArg3>(
+              reinterpret_cast<fptr>(v128_store_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+              reinterpret_cast<fptr>(c_v128_store_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256) &&
+               typeid(CArg3) == typeid(c_v256)) {
+      // V256_V256V256V256
+      error =
+          CompareSimd3Args<v256, v256, v256, v256, CRet, CArg1, CArg2, CArg3>(
+              reinterpret_cast<fptr>(v256_store_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+              reinterpret_cast<fptr>(c_v256_store_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+    } else {
+      FAIL() << "Internal error: Unknown intrinsic function "
+             << typeid(CRet).name() << " " << name << "("
+             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
+             << typeid(CArg3).name() << ")";
+    }
+  }
+
+  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+                      << Print(s1, sizeof(s1)) << ", " << Print(s2, sizeof(s2))
+                      << ", " << Print(s3, sizeof(s3)) << ") -> "
+                      << Print(d, sizeof(d)) << " (simd), "
+                      << Print(ref_d, sizeof(ref_d)) << " (ref)";
+}
+
 // Instantiations to make the functions callable from another files
 template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
@@ -1682,6 +2107,8 @@ template void TestSimd1Arg<c_v128, uint16_t>(uint32_t, uint32_t, uint32_t,
                                              const char *);
 template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t,
                                              const char *);
+template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t,
+                                             const char *);
 template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t,
                                           const char *);
 template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t,
@@ -1698,10 +2125,15 @@ template void TestSimd2Args<c_v128, uint64_t, uint64_t>(uint32_t, uint32_t,
                                                         uint32_t, const char *);
 template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                   const char *);
+template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t,
+                                                      uint32_t, const char *);
 template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
 template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
+template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t,
+                                                            uint32_t,
+                                                            const char *);
 template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t,
                                            const char *);
 template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t,
@@ -1714,6 +2146,8 @@ template void TestSimd1Arg<c_v256, uint16_t>(uint32_t, uint32_t, uint32_t,
                                              const char *);
 template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t,
                                              const char *);
+template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t,
+                                             const char *);
 template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t,
                                              const char *);
 template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t,
@@ -1724,9 +2158,14 @@ template void TestSimd2Args<c_v256, c_v256, c_v256>(uint32_t, uint32_t,
                                                     uint32_t, const char *);
 template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
+template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t,
+                                                      uint32_t, const char *);
 template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
 template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
+template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t,
+                                                            uint32_t,
+                                                            const char *);
 
 }  // namespace SIMD_NAMESPACE
diff --git a/third_party/aom/test/simd_cmp_neon.cc b/third_party/aom/test/simd_cmp_neon.cc
index c8004cc8b..53c1e2a07 100644
--- a/third_party/aom/test/simd_cmp_neon.cc
+++ b/third_party/aom/test/simd_cmp_neon.cc
@@ -13,5 +13,5 @@
 #define ARCH NEON
 #define ARCH_POSTFIX(name) name##_neon
 #define SIMD_NAMESPACE simd_test_neon
-#include "./simd_cmp_impl.h"
+#include "test/simd_cmp_impl.h"
 #endif
diff --git a/third_party/aom/test/simd_cmp_sse2.cc b/third_party/aom/test/simd_cmp_sse2.cc
index 67cb43c10..f7827a7fa 100644
--- a/third_party/aom/test/simd_cmp_sse2.cc
+++ b/third_party/aom/test/simd_cmp_sse2.cc
@@ -14,5 +14,5 @@
 #define ARCH SSE2
 #define ARCH_POSTFIX(name) name##_sse2
 #define SIMD_NAMESPACE simd_test_sse2
-#include "./simd_cmp_impl.h"
+#include "test/simd_cmp_impl.h"
 #endif
diff --git a/third_party/aom/test/simd_cmp_sse4.cc b/third_party/aom/test/simd_cmp_sse4.cc
index ba826d898..3566764b6 100644
--- a/third_party/aom/test/simd_cmp_sse4.cc
+++ b/third_party/aom/test/simd_cmp_sse4.cc
@@ -14,5 +14,5 @@
 #define ARCH SSE4_1
 #define ARCH_POSTFIX(name) name##_sse4_1
 #define SIMD_NAMESPACE simd_test_sse4_1
-#include "./simd_cmp_impl.h"
+#include "test/simd_cmp_impl.h"
 #endif
diff --git a/third_party/aom/test/simd_cmp_ssse3.cc b/third_party/aom/test/simd_cmp_ssse3.cc
index a6c7000fd..57bf135dd 100644
--- a/third_party/aom/test/simd_cmp_ssse3.cc
+++ b/third_party/aom/test/simd_cmp_ssse3.cc
@@ -14,5 +14,5 @@
 #define ARCH SSSE3
 #define ARCH_POSTFIX(name) name##_ssse3
 #define SIMD_NAMESPACE simd_test_ssse3
-#include "./simd_cmp_impl.h"
+#include "test/simd_cmp_impl.h"
 #endif
diff --git a/third_party/aom/test/simd_impl.h b/third_party/aom/test/simd_impl.h
index c3dfbc400..fd06f67fd 100644
--- a/third_party/aom/test/simd_impl.h
+++ b/third_party/aom/test/simd_impl.h
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #define SIMD_CHECK 1
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -23,9 +23,9 @@ class TestIntrinsic : public ::testing::TestWithParam<param_signature> {
  public:
   virtual ~TestIntrinsic() {}
   virtual void SetUp() {
-    mask = std::tr1::get<0>(this->GetParam());
-    maskwidth = std::tr1::get<1>(this->GetParam());
-    name = std::tr1::get<2>(this->GetParam());
+    mask = ::testing::get<0>(this->GetParam());
+    maskwidth = ::testing::get<1>(this->GetParam());
+    name = ::testing::get<2>(this->GetParam());
   }
 
   virtual void TearDown() { libaom_test::ClearSystemState(); }
@@ -36,8 +36,8 @@ class TestIntrinsic : public ::testing::TestWithParam<param_signature> {
 };
 
 // Create one typedef for each function signature
-#define TYPEDEF_SIMD(name)                                                  \
-  typedef TestIntrinsic<std::tr1::tuple<uint32_t, uint32_t, const char *> > \
+#define TYPEDEF_SIMD(name)                                                    \
+  typedef TestIntrinsic< ::testing::tuple<uint32_t, uint32_t, const char *> > \
       ARCH_POSTFIX(name)
 
 TYPEDEF_SIMD(V64_U8);
@@ -61,23 +61,29 @@ TYPEDEF_SIMD(V64_V128);
 TYPEDEF_SIMD(V128_U8);
 TYPEDEF_SIMD(V128_U16);
 TYPEDEF_SIMD(V128_U32);
+TYPEDEF_SIMD(V128_U64);
 TYPEDEF_SIMD(V128_U64U64);
 TYPEDEF_SIMD(V128_V64V64);
 TYPEDEF_SIMD(V128_V128V128);
+TYPEDEF_SIMD(V128_V128V128V128);
 TYPEDEF_SIMD(S64_V128V128);
 TYPEDEF_SIMD(V128_V128U32);
 TYPEDEF_SIMD(U32_V128V128);
+TYPEDEF_SIMD(U64_V128V128);
 TYPEDEF_SIMD(V256_V128);
 TYPEDEF_SIMD(V256_V256);
 TYPEDEF_SIMD(U64_V256);
 TYPEDEF_SIMD(V256_V128V128);
 TYPEDEF_SIMD(V256_V256V256);
+TYPEDEF_SIMD(V256_V256V256V256);
+TYPEDEF_SIMD(U64_V256V256);
 TYPEDEF_SIMD(S64_V256V256);
 TYPEDEF_SIMD(V256_V256U32);
 TYPEDEF_SIMD(U32_V256V256);
 TYPEDEF_SIMD(V256_U8);
 TYPEDEF_SIMD(V256_U16);
 TYPEDEF_SIMD(V256_U32);
+TYPEDEF_SIMD(V256_U64);
 TYPEDEF_SIMD(U32_V256);
 TYPEDEF_SIMD(V64_V256);
 
@@ -86,9 +92,12 @@ typedef ARCH_POSTFIX(V64_V64) ARCH_POSTFIX(V64_V64_Part2);
 typedef ARCH_POSTFIX(V64_V64V64) ARCH_POSTFIX(V64_V64V64_Part2);
 typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part2);
 typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part3);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part4);
 typedef ARCH_POSTFIX(V128_V128V128) ARCH_POSTFIX(V128_V128V128_Part2);
 typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part2);
 typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part3);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part4);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part5);
 typedef ARCH_POSTFIX(V256_V256V256) ARCH_POSTFIX(V256_V256V256_Part2);
 
 // These functions are machine tuned located elsewhere
@@ -100,6 +109,10 @@ template <typename c_ret, typename c_arg1, typename c_arg2>
 void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                    const char *name);
 
+template <typename c_ret, typename c_arg1, typename c_arg2, typename c_arg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name);
+
 const int kIterations = 65536;
 
 // Add a macro layer since TEST_P will quote the name so we need to
@@ -195,6 +208,10 @@ MY_TEST_P(ARCH_POSTFIX(V128_U32), TestIntrinsics) {
   TestSimd1Arg<c_v128, uint32_t>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V128_U64), TestIntrinsics) {
+  TestSimd1Arg<c_v128, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(V128_V64), TestIntrinsics) {
   TestSimd1Arg<c_v128, c_v64>(kIterations, mask, maskwidth, name);
 }
@@ -203,10 +220,19 @@ MY_TEST_P(ARCH_POSTFIX(V128_V128V128), TestIntrinsics) {
   TestSimd2Args<c_v128, c_v128, c_v128>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128V128), TestIntrinsics) {
+  TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(kIterations, mask, maskwidth,
+                                                name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(U32_V128V128), TestIntrinsics) {
   TestSimd2Args<uint32_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(U64_V128V128), TestIntrinsics) {
+  TestSimd2Args<uint64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(S64_V128V128), TestIntrinsics) {
   TestSimd2Args<int64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
 }
@@ -235,6 +261,10 @@ MY_TEST_P(ARCH_POSTFIX(V128_V128_Part3), TestIntrinsics) {
   TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part4), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(U64_V256), TestIntrinsics) {
   TestSimd1Arg<uint64_t, c_v256>(kIterations, mask, maskwidth, name);
 }
@@ -251,6 +281,11 @@ MY_TEST_P(ARCH_POSTFIX(V256_V256V256), TestIntrinsics) {
   TestSimd2Args<c_v256, c_v256, c_v256>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256V256), TestIntrinsics) {
+  TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(kIterations, mask, maskwidth,
+                                                name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(V256_V128V128), TestIntrinsics) {
   TestSimd2Args<c_v256, c_v128, c_v128>(kIterations, mask, maskwidth, name);
 }
@@ -259,6 +294,10 @@ MY_TEST_P(ARCH_POSTFIX(U32_V256V256), TestIntrinsics) {
   TestSimd2Args<uint32_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(U64_V256V256), TestIntrinsics) {
+  TestSimd2Args<uint64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(S64_V256V256), TestIntrinsics) {
   TestSimd2Args<int64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
 }
@@ -279,6 +318,14 @@ MY_TEST_P(ARCH_POSTFIX(V256_V256_Part3), TestIntrinsics) {
   TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part4), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part5), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(V256_U8), TestIntrinsics) {
   TestSimd1Arg<c_v256, uint8_t>(kIterations, mask, maskwidth, name);
 }
@@ -291,6 +338,10 @@ MY_TEST_P(ARCH_POSTFIX(V256_U32), TestIntrinsics) {
   TestSimd1Arg<c_v256, uint32_t>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V256_U64), TestIntrinsics) {
+  TestSimd1Arg<c_v256, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(U32_V256), TestIntrinsics) {
   TestSimd1Arg<uint32_t, c_v256>(kIterations, mask, maskwidth, name);
 }
@@ -305,7 +356,7 @@ MY_TEST_P(ARCH_POSTFIX(V64_V256), TestIntrinsics) {
   INSTANTIATE_TEST_CASE_P(name, type, ::testing::Values(__VA_ARGS__))
 
 #define SIMD_TUPLE(name, mask, maskwidth) \
-  std::tr1::make_tuple(mask, maskwidth, static_cast<const char *>(#name))
+  ::testing::make_tuple(mask, maskwidth, static_cast<const char *>(#name))
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V64V64),
             (SIMD_TUPLE(v64_sad_u8, 0U, 0U), SIMD_TUPLE(v64_ssd_u8, 0U, 0U)));
@@ -339,6 +390,8 @@ INSTANTIATE(
 
 INSTANTIATE(
     ARCH, ARCH_POSTFIX(V64_V64V64_Part2), SIMD_TUPLE(v64_shuffle_8, 7U, 8U),
+    SIMD_TUPLE(v64_pack_s32_u16, 0U, 0U), SIMD_TUPLE(v64_rdavg_u16, 0U, 0U),
+    SIMD_TUPLE(v64_sadd_s8, 0U, 0U), SIMD_TUPLE(v64_sadd_u8, 0U, 0U),
     SIMD_TUPLE(imm_v64_align<1>, 0U, 0U), SIMD_TUPLE(imm_v64_align<2>, 0U, 0U),
     SIMD_TUPLE(imm_v64_align<3>, 0U, 0U), SIMD_TUPLE(imm_v64_align<4>, 0U, 0U),
     SIMD_TUPLE(imm_v64_align<5>, 0U, 0U), SIMD_TUPLE(imm_v64_align<6>, 0U, 0U),
@@ -470,7 +523,8 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U32), SIMD_TUPLE(v64_dup_32, 0U, 0U));
 INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U32U32), SIMD_TUPLE(v64_from_32, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128V128), SIMD_TUPLE(v128_sad_u8, 0U, 0U),
-            SIMD_TUPLE(v128_ssd_u8, 0U, 0U));
+            SIMD_TUPLE(v128_ssd_u8, 0U, 0U), SIMD_TUPLE(v128_sad_u16, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128V128), SIMD_TUPLE(v128_ssd_s16, 0U, 0U));
 
 INSTANTIATE(
     ARCH, ARCH_POSTFIX(V128_V128V128), SIMD_TUPLE(v128_add_8, 0U, 0U),
@@ -501,9 +555,16 @@ INSTANTIATE(
     SIMD_TUPLE(v128_cmpgt_s16, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128_Part2),
-            SIMD_TUPLE(v128_cmpeq_16, 0U, 0U),
+            SIMD_TUPLE(v128_pack_s32_u16, 0U, 0U),
+            SIMD_TUPLE(v128_rdavg_u16, 0U, 0U), SIMD_TUPLE(v128_add_64, 0U, 0U),
+            SIMD_TUPLE(v128_sub_64, 0U, 0U), SIMD_TUPLE(v128_sadd_s8, 0U, 0U),
+            SIMD_TUPLE(v128_sadd_u8, 0U, 0U), SIMD_TUPLE(v128_cmpeq_16, 0U, 0U),
             SIMD_TUPLE(v128_cmplt_s16, 0U, 0U),
+            SIMD_TUPLE(v128_cmplt_s32, 0U, 0U),
+            SIMD_TUPLE(v128_cmpeq_32, 0U, 0U),
+            SIMD_TUPLE(v128_cmpgt_s32, 0U, 0U),
             SIMD_TUPLE(v128_shuffle_8, 15U, 8U),
+            SIMD_TUPLE(v128_min_s32, 0U, 0U), SIMD_TUPLE(v128_max_s32, 0U, 0U),
             SIMD_TUPLE(imm_v128_align<1>, 0U, 0U),
             SIMD_TUPLE(imm_v128_align<2>, 0U, 0U),
             SIMD_TUPLE(imm_v128_align<3>, 0U, 0U),
@@ -520,6 +581,9 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128_Part2),
             SIMD_TUPLE(imm_v128_align<14>, 0U, 0U),
             SIMD_TUPLE(imm_v128_align<15>, 0U, 0U));
 
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128V128),
+            SIMD_TUPLE(v128_blend_8, 0U, 0U));
+
 INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128), SIMD_TUPLE(v128_abs_s8, 0U, 0U),
             SIMD_TUPLE(v128_abs_s16, 0U, 0U), SIMD_TUPLE(v128_padd_s16, 0U, 0U),
             SIMD_TUPLE(v128_unpacklo_u8_s16, 0U, 0U),
@@ -634,6 +698,57 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part3),
             SIMD_TUPLE(imm_v128_shr_n_s32<24>, 0U, 0U),
             SIMD_TUPLE(imm_v128_shr_n_s32<28>, 0U, 0U));
 
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part4),
+            SIMD_TUPLE(imm_v128_shl_n_64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<60>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<60>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<60>, 0U, 0U),
+            SIMD_TUPLE(v128_padd_u8, 0U, 0U));
+
 INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64V64), SIMD_TUPLE(v128_from_v64, 0U, 0U),
             SIMD_TUPLE(v128_zip_8, 0U, 0U), SIMD_TUPLE(v128_zip_16, 0U, 0U),
             SIMD_TUPLE(v128_zip_32, 0U, 0U), SIMD_TUPLE(v128_mul_s16, 0U, 0U));
@@ -646,16 +761,17 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64),
             SIMD_TUPLE(v128_unpack_u16_s32, 0U, 0U),
             SIMD_TUPLE(v128_unpack_s16_s32, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128U32), SIMD_TUPLE(v128_shl_8, 7U, 32U),
-            SIMD_TUPLE(v128_shr_u8, 7U, 32U), SIMD_TUPLE(v128_shr_s8, 7U, 32U),
-            SIMD_TUPLE(v128_shl_16, 15U, 32U),
-            SIMD_TUPLE(v128_shr_u16, 15U, 32U),
-            SIMD_TUPLE(v128_shr_s16, 15U, 32U),
-            SIMD_TUPLE(v128_shl_32, 31U, 32U),
-            SIMD_TUPLE(v128_shr_u32, 31U, 32U),
-            SIMD_TUPLE(v128_shr_s32, 31U, 32U));
+INSTANTIATE(
+    ARCH, ARCH_POSTFIX(V128_V128U32), SIMD_TUPLE(v128_shl_8, 7U, 32U),
+    SIMD_TUPLE(v128_shr_u8, 7U, 32U), SIMD_TUPLE(v128_shr_s8, 7U, 32U),
+    SIMD_TUPLE(v128_shl_16, 15U, 32U), SIMD_TUPLE(v128_shr_u16, 15U, 32U),
+    SIMD_TUPLE(v128_shr_s16, 15U, 32U), SIMD_TUPLE(v128_shl_32, 31U, 32U),
+    SIMD_TUPLE(v128_shr_u32, 31U, 32U), SIMD_TUPLE(v128_shr_s32, 31U, 32U),
+    SIMD_TUPLE(v128_shl_64, 63U, 32U), SIMD_TUPLE(v128_shr_u64, 63U, 32U),
+    SIMD_TUPLE(v128_shr_s64, 63U, 32U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128), SIMD_TUPLE(v128_low_u32, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128), SIMD_TUPLE(v128_low_u32, 0U, 0U),
+            SIMD_TUPLE(v128_movemask_8, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128), SIMD_TUPLE(v128_hadd_u8, 0U, 0U));
 
@@ -668,16 +784,23 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U16), SIMD_TUPLE(v128_dup_16, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U32), SIMD_TUPLE(v128_dup_32, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V128V128),
-            SIMD_TUPLE(v128_dotp_s16, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U64), SIMD_TUPLE(v128_dup_64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V128V128), SIMD_TUPLE(v128_dotp_s16, 0U, 0U),
+            SIMD_TUPLE(v128_dotp_s32, 0U, 0U),
+            SIMD_TUPLE(v128_dotp_su8, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256V256), SIMD_TUPLE(v256_sad_u8, 0U, 0U),
-            SIMD_TUPLE(v256_ssd_u8, 0U, 0U));
+            SIMD_TUPLE(v256_ssd_u8, 0U, 0U), SIMD_TUPLE(v256_sad_u16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256), SIMD_TUPLE(v256_hadd_u8, 0U, 0U),
+            SIMD_TUPLE(v256_low_u64, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256), SIMD_TUPLE(v256_hadd_u8, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V256V256), SIMD_TUPLE(v256_dotp_s16, 0U, 0U),
+            SIMD_TUPLE(v256_dotp_s32, 0U, 0U),
+            SIMD_TUPLE(v256_dotp_su8, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V256V256),
-            SIMD_TUPLE(v256_dotp_s16, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256V256), SIMD_TUPLE(v256_ssd_s16, 0U, 0U));
 
 INSTANTIATE(
     ARCH, ARCH_POSTFIX(V256_V256V256), SIMD_TUPLE(v256_add_8, 0U, 0U),
@@ -709,10 +832,16 @@ INSTANTIATE(
 
 INSTANTIATE(
     ARCH, ARCH_POSTFIX(V256_V256V256_Part2), SIMD_TUPLE(v256_cmpeq_8, 0U, 0U),
+    SIMD_TUPLE(v256_min_s32, 0U, 0U), SIMD_TUPLE(v256_max_s32, 0U, 0U),
+    SIMD_TUPLE(v256_add_64, 0U, 0U), SIMD_TUPLE(v256_sub_64, 0U, 0U),
     SIMD_TUPLE(v256_cmpgt_s16, 0U, 0U), SIMD_TUPLE(v256_cmplt_s16, 0U, 0U),
-    SIMD_TUPLE(v256_cmpeq_16, 0U, 0U), SIMD_TUPLE(v256_shuffle_8, 15U, 8U),
-    SIMD_TUPLE(v256_pshuffle_8, 15U, 8U), SIMD_TUPLE(imm_v256_align<1>, 0U, 0U),
-    SIMD_TUPLE(imm_v256_align<2>, 0U, 0U),
+    SIMD_TUPLE(v256_cmpeq_16, 0U, 0U), SIMD_TUPLE(v256_cmpgt_s32, 0U, 0U),
+    SIMD_TUPLE(v256_cmplt_s32, 0U, 0U), SIMD_TUPLE(v256_cmpeq_32, 0U, 0U),
+    SIMD_TUPLE(v256_shuffle_8, 31U, 8U), SIMD_TUPLE(v256_pshuffle_8, 15U, 8U),
+    SIMD_TUPLE(imm_v256_align<1>, 0U, 0U), SIMD_TUPLE(v256_sadd_s8, 0U, 0U),
+    SIMD_TUPLE(v256_sadd_u8, 0U, 0U), SIMD_TUPLE(v256_pack_s32_u16, 0U, 0U),
+    SIMD_TUPLE(v256_rdavg_u16, 0U, 0U), SIMD_TUPLE(imm_v256_align<2>, 0U, 0U),
+    SIMD_TUPLE(v256_unziphi_64, 0U, 0U), SIMD_TUPLE(v256_unziplo_64, 0U, 0U),
     SIMD_TUPLE(imm_v256_align<3>, 0U, 0U),
     SIMD_TUPLE(imm_v256_align<4>, 0U, 0U),
     SIMD_TUPLE(imm_v256_align<5>, 0U, 0U),
@@ -754,14 +883,14 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V128),
             SIMD_TUPLE(v256_unpack_u16_s32, 0U, 0U),
             SIMD_TUPLE(v256_unpack_s16_s32, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256U32), SIMD_TUPLE(v256_shl_8, 7U, 32U),
-            SIMD_TUPLE(v256_shr_u8, 7U, 32U), SIMD_TUPLE(v256_shr_s8, 7U, 32U),
-            SIMD_TUPLE(v256_shl_16, 15U, 32U),
-            SIMD_TUPLE(v256_shr_u16, 15U, 32U),
-            SIMD_TUPLE(v256_shr_s16, 15U, 32U),
-            SIMD_TUPLE(v256_shl_32, 31U, 32U),
-            SIMD_TUPLE(v256_shr_u32, 31U, 32U),
-            SIMD_TUPLE(v256_shr_s32, 31U, 32U));
+INSTANTIATE(
+    ARCH, ARCH_POSTFIX(V256_V256U32), SIMD_TUPLE(v256_shl_8, 7U, 32U),
+    SIMD_TUPLE(v256_shr_u8, 7U, 32U), SIMD_TUPLE(v256_shr_s8, 7U, 32U),
+    SIMD_TUPLE(v256_shl_16, 15U, 32U), SIMD_TUPLE(v256_shr_u16, 15U, 32U),
+    SIMD_TUPLE(v256_shr_s16, 15U, 32U), SIMD_TUPLE(v256_shl_32, 31U, 32U),
+    SIMD_TUPLE(v256_shr_u32, 31U, 32U), SIMD_TUPLE(v256_shr_s32, 31U, 32U),
+    SIMD_TUPLE(v256_shl_64, 63U, 32U), SIMD_TUPLE(v256_shr_u64, 63U, 32U),
+    SIMD_TUPLE(v256_shr_s64, 63U, 32U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256), SIMD_TUPLE(v256_abs_s8, 0U, 0U),
             SIMD_TUPLE(v256_abs_s16, 0U, 0U), SIMD_TUPLE(v256_padd_s16, 0U, 0U),
@@ -909,13 +1038,103 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part3),
             SIMD_TUPLE(imm_v256_shr_n_s32<24>, 0U, 0U),
             SIMD_TUPLE(imm_v256_shr_n_s32<28>, 0U, 0U));
 
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part4),
+            SIMD_TUPLE(imm_v256_shl_n_64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<60>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<60>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<60>, 0U, 0U),
+            SIMD_TUPLE(v256_padd_u8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part5),
+            SIMD_TUPLE(imm_v256_shr_n_word<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<2>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<3>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<5>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<6>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<7>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<9>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<10>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<11>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<13>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<14>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_word<15>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<2>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<3>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<5>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<6>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<7>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<9>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<10>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<11>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<13>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<14>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_word<15>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256V256V256),
+            SIMD_TUPLE(v256_blend_8, 0U, 0U),
+            SIMD_TUPLE(v256_wideshuffle_8, 63U, 8U));
+
 INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U8), SIMD_TUPLE(v256_dup_8, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U16), SIMD_TUPLE(v256_dup_16, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U32), SIMD_TUPLE(v256_dup_32, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256), SIMD_TUPLE(v256_low_u32, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U64), SIMD_TUPLE(v256_dup_64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256), SIMD_TUPLE(v256_low_u32, 0U, 0U),
+            SIMD_TUPLE(v256_movemask_8, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V256), SIMD_TUPLE(v256_low_v64, 0U, 0U));
 
diff --git a/third_party/aom/test/simd_neon_test.cc b/third_party/aom/test/simd_neon_test.cc
index 0565fb4e2..b67b18895 100644
--- a/third_party/aom/test/simd_neon_test.cc
+++ b/third_party/aom/test/simd_neon_test.cc
@@ -13,5 +13,5 @@
 #define ARCH NEON
 #define ARCH_POSTFIX(name) name##_neon
 #define SIMD_NAMESPACE simd_test_neon
-#include "./simd_impl.h"
+#include "test/simd_impl.h"
 #endif
diff --git a/third_party/aom/test/simd_sse2_test.cc b/third_party/aom/test/simd_sse2_test.cc
index a0b49d77e..b37a931b3 100644
--- a/third_party/aom/test/simd_sse2_test.cc
+++ b/third_party/aom/test/simd_sse2_test.cc
@@ -14,5 +14,5 @@
 #define ARCH SSE2
 #define ARCH_POSTFIX(name) name##_sse2
 #define SIMD_NAMESPACE simd_test_sse2
-#include "./simd_impl.h"
+#include "test/simd_impl.h"
 #endif
diff --git a/third_party/aom/test/simd_sse4_test.cc b/third_party/aom/test/simd_sse4_test.cc
index 73c96427f..b1c9d5cd8 100644
--- a/third_party/aom/test/simd_sse4_test.cc
+++ b/third_party/aom/test/simd_sse4_test.cc
@@ -14,5 +14,5 @@
 #define ARCH SSE4_1
 #define ARCH_POSTFIX(name) name##_sse4_1
 #define SIMD_NAMESPACE simd_test_sse4_1
-#include "./simd_impl.h"
+#include "test/simd_impl.h"
 #endif
diff --git a/third_party/aom/test/simd_ssse3_test.cc b/third_party/aom/test/simd_ssse3_test.cc
index 9ebeeef1b..d95c26fb5 100644
--- a/third_party/aom/test/simd_ssse3_test.cc
+++ b/third_party/aom/test/simd_ssse3_test.cc
@@ -14,5 +14,5 @@
 #define ARCH SSSE3
 #define ARCH_POSTFIX(name) name##_ssse3
 #define SIMD_NAMESPACE simd_test_ssse3
-#include "./simd_impl.h"
+#include "test/simd_impl.h"
 #endif
diff --git a/third_party/aom/test/simple_decoder.sh b/third_party/aom/test/simple_decoder.sh
index ac3a07b18..5f39ad206 100755
--- a/third_party/aom/test/simple_decoder.sh
+++ b/third_party/aom/test/simple_decoder.sh
@@ -25,7 +25,7 @@ simple_decoder_verify_environment() {
 # Runs simple_decoder using $1 as input file. $2 is the codec name, and is used
 # solely to name the output file.
 simple_decoder() {
-  local decoder="${LIBAOM_BIN_PATH}/simple_decoder${AOM_TEST_EXE_SUFFIX}"
+  local decoder="$(aom_tool_path simple_decoder)"
   local input_file="$1"
   local codec="$2"
   local output_file="${AOM_TEST_OUTPUT_DIR}/simple_decoder_${codec}.raw"
diff --git a/third_party/aom/test/subtract_test.cc b/third_party/aom/test/subtract_test.cc
index 725a6a2c6..7dcedf56d 100644
--- a/third_party/aom/test/subtract_test.cc
+++ b/third_party/aom/test/subtract_test.cc
@@ -7,24 +7,21 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
-#if CONFIG_AV1
 #include "av1/common/blockd.h"
-#endif
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 
-#define USE_SPEED_TEST (0)
-
 typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
                              ptrdiff_t diff_stride, const uint8_t *src_ptr,
                              ptrdiff_t src_stride, const uint8_t *pred_ptr,
@@ -112,14 +109,13 @@ typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr,
                                 ptrdiff_t src_stride, const uint8_t *pred_ptr,
                                 ptrdiff_t pred_stride, int bd);
 
-using ::std::tr1::get;
-using ::std::tr1::make_tuple;
-using ::std::tr1::tuple;
+using ::testing::get;
+using ::testing::make_tuple;
+using ::testing::tuple;
 
 // <width, height, bit_dpeth, subtract>
 typedef tuple<int, int, int, HBDSubtractFunc> Params;
 
-#if CONFIG_HIGHBITDEPTH
 class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
  public:
   virtual void SetUp() {
@@ -130,11 +126,7 @@ class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
 
-#if CONFIG_EXT_PARTITION
     const size_t max_width = 128;
-#else
-    const size_t max_width = 64;
-#endif
     const size_t max_block_size = max_width * max_width;
     src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
         aom_memalign(16, max_block_size * sizeof(uint16_t))));
@@ -167,11 +159,7 @@ class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
 
 void AV1HBDSubtractBlockTest::CheckResult() {
   const int test_num = 100;
-#if CONFIG_EXT_PARTITION
   const size_t max_width = 128;
-#else
-  const size_t max_width = 64;
-#endif
   const int max_block_size = max_width * max_width;
   const int mask = (1 << bit_depth_) - 1;
   int i, j;
@@ -200,11 +188,7 @@ TEST_P(AV1HBDSubtractBlockTest, CheckResult) { CheckResult(); }
 
 void AV1HBDSubtractBlockTest::RunForSpeed() {
   const int test_num = 200000;
-#if CONFIG_EXT_PARTITION
   const size_t max_width = 128;
-#else
-  const size_t max_width = 64;
-#endif
   const int max_block_size = max_width * max_width;
   const int mask = (1 << bit_depth_) - 1;
   int i, j;
@@ -251,18 +235,15 @@ const Params kAV1HBDSubtractBlock_sse2[] = {
   make_tuple(64, 32, 12, &aom_highbd_subtract_block_c),
   make_tuple(64, 64, 12, &aom_highbd_subtract_block_sse2),
   make_tuple(64, 64, 12, &aom_highbd_subtract_block_c),
-#if CONFIG_EXT_PARTITION
   make_tuple(64, 128, 12, &aom_highbd_subtract_block_sse2),
   make_tuple(64, 128, 12, &aom_highbd_subtract_block_c),
   make_tuple(128, 64, 12, &aom_highbd_subtract_block_sse2),
   make_tuple(128, 64, 12, &aom_highbd_subtract_block_c),
   make_tuple(128, 128, 12, &aom_highbd_subtract_block_sse2),
   make_tuple(128, 128, 12, &aom_highbd_subtract_block_c)
-#endif  // CONFIG_EXT_PARTITION
 };
 
 INSTANTIATE_TEST_CASE_P(SSE2, AV1HBDSubtractBlockTest,
                         ::testing::ValuesIn(kAV1HBDSubtractBlock_sse2));
 #endif  // HAVE_SSE2
-#endif  // CONFIG_HIGHBITDEPTH
 }  // namespace
diff --git a/third_party/aom/test/sum_squares_test.cc b/third_party/aom/test/sum_squares_test.cc
index b8701c196..c03ebad4a 100644
--- a/third_party/aom/test/sum_squares_test.cc
+++ b/third_party/aom/test/sum_squares_test.cc
@@ -15,8 +15,9 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
@@ -39,89 +40,82 @@ typedef libaom_test::FuncParam<SSI16Func> TestFuncs;
 class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
  public:
   virtual ~SumSquaresTest() {}
-  virtual void SetUp() { params_ = this->GetParam(); }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  TestFuncs params_;
-};
-
-TEST_P(SumSquaresTest, OperationCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, int16_t, src[256 * 256]);
-
-  int failed = 0;
-
-  const int msb = 11;  // Up to 12 bit input
-  const int limit = 1 << (msb + 1);
+  virtual void SetUp() {
+    params_ = this->GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
+    ASSERT_TRUE(src_ != NULL);
+  }
 
-  for (int k = 0; k < kNumIterations; k++) {
-    int width = 4 * rnd(32);   // Up to 128x128
-    int height = 4 * rnd(32);  // Up to 128x128
-    int stride = 4 << rnd(7);  // Up to 256 stride
-    while (stride < width) {   // Make sure it's valid
-      stride = 4 << rnd(7);
-    }
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src_);
+  }
+  void RunTest(int isRandom);
 
+  void GenRandomData(int width, int height, int stride) {
+    const int msb = 11;  // Up to 12 bit input
+    const int limit = 1 << (msb + 1);
     for (int ii = 0; ii < height; ii++) {
       for (int jj = 0; jj < width; jj++) {
-        src[ii * stride + jj] = rnd(2) ? rnd(limit) : -rnd(limit);
+        src_[ii * stride + jj] = rnd_(2) ? rnd_(limit) : -rnd_(limit);
       }
     }
+  }
 
-    const uint64_t res_ref = params_.ref_func(src, stride, width, height);
-    uint64_t res_tst;
-    ASM_REGISTER_STATE_CHECK(res_tst =
-                                 params_.tst_func(src, stride, width, height));
-
-    if (!failed) {
-      failed = res_ref != res_tst;
-      EXPECT_EQ(res_ref, res_tst)
-          << "Error: Sum Squares Test"
-          << " C output does not match optimized output.";
+  void GenExtremeData(int width, int height, int stride) {
+    const int msb = 11;  // Up to 12 bit input
+    const int limit = 1 << (msb + 1);
+    const int val = rnd_(2) ? limit - 1 : -(limit - 1);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src_[ii * stride + jj] = val;
+      }
     }
   }
-}
 
-TEST_P(SumSquaresTest, ExtremeValues) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, int16_t, src[256 * 256]);
+ protected:
+  TestFuncs params_;
+  int16_t *src_;
+  ACMRandom rnd_;
+};
 
+void SumSquaresTest::RunTest(int isRandom) {
   int failed = 0;
-
-  const int msb = 11;  // Up to 12 bit input
-  const int limit = 1 << (msb + 1);
-
   for (int k = 0; k < kNumIterations; k++) {
-    int width = 4 * rnd(32);   // Up to 128x128
-    int height = 4 * rnd(32);  // Up to 128x128
-    int stride = 4 << rnd(7);  // Up to 256 stride
-    while (stride < width) {   // Make sure it's valid
-      stride = 4 << rnd(7);
+    const int width = 4 * (rnd_(31) + 1);   // Up to 128x128
+    const int height = 4 * (rnd_(31) + 1);  // Up to 128x128
+    int stride = 4 << rnd_(7);              // Up to 256 stride
+    while (stride < width) {                // Make sure it's valid
+      stride = 4 << rnd_(7);
     }
-
-    int val = rnd(2) ? limit - 1 : -(limit - 1);
-    for (int ii = 0; ii < height; ii++) {
-      for (int jj = 0; jj < width; jj++) {
-        src[ii * stride + jj] = val;
-      }
+    if (isRandom) {
+      GenRandomData(width, height, stride);
+    } else {
+      GenExtremeData(width, height, stride);
     }
-
-    const uint64_t res_ref = params_.ref_func(src, stride, width, height);
+    const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
     uint64_t res_tst;
     ASM_REGISTER_STATE_CHECK(res_tst =
-                                 params_.tst_func(src, stride, width, height));
+                                 params_.tst_func(src_, stride, width, height));
 
     if (!failed) {
       failed = res_ref != res_tst;
       EXPECT_EQ(res_ref, res_tst)
-          << "Error: Sum Squares Test"
-          << " C output does not match optimized output.";
+          << "Error: Sum Squares Test [" << width << "x" << height
+          << "] C output does not match optimized output.";
     }
   }
 }
 
+TEST_P(SumSquaresTest, OperationCheck) {
+  RunTest(1);  // GenRandomData
+}
+
+TEST_P(SumSquaresTest, ExtremeValues) {
+  RunTest(0);  // GenExtremeData
+}
+
 #if HAVE_SSE2
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/third_party/aom/test/superframe_test.cc b/third_party/aom/test/superframe_test.cc
index c8f663e5c..7be18f72a 100644
--- a/third_party/aom/test/superframe_test.cc
+++ b/third_party/aom/test/superframe_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <climits>
 #include <vector>
@@ -23,7 +23,7 @@ const int kTestMode = 0;
 const int kTileCols = 1;
 const int kTileRows = 2;
 
-typedef std::tr1::tuple<libaom_test::TestMode, int, int> SuperframeTestParam;
+typedef ::testing::tuple<libaom_test::TestMode, int, int> SuperframeTestParam;
 
 class SuperframeTest
     : public ::libaom_test::CodecTestWithParam<SuperframeTestParam>,
@@ -35,12 +35,12 @@ class SuperframeTest
   virtual void SetUp() {
     InitializeConfig();
     const SuperframeTestParam input = GET_PARAM(1);
-    const libaom_test::TestMode mode = std::tr1::get<kTestMode>(input);
+    const libaom_test::TestMode mode = ::testing::get<kTestMode>(input);
     SetMode(mode);
     sf_count_ = 0;
     sf_count_max_ = INT_MAX;
-    n_tile_cols_ = std::tr1::get<kTileCols>(input);
-    n_tile_rows_ = std::tr1::get<kTileRows>(input);
+    n_tile_cols_ = ::testing::get<kTileCols>(input);
+    n_tile_rows_ = ::testing::get<kTileRows>(input);
   }
 
   virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
@@ -50,9 +50,6 @@ class SuperframeTest
       encoder->Control(AOME_SET_CPUUSED, 2);
       encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
       encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-      encoder->Control(AV1E_SET_TILE_LOOPFILTER, 0);
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
     }
   }
 
@@ -100,31 +97,13 @@ class SuperframeTest
 TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
   sf_count_max_ = 0;  // early exit on successful test.
   cfg_.g_lag_in_frames = 25;
-#if CONFIG_EXT_TILE
   cfg_.large_scale_tile = 1;
-#endif  // CONFIG_EXT_TILE
   ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 40);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-#if CONFIG_EXT_REFS
   // NOTE: The use of BWDREF_FRAME will enable the coding of more non-show
   //       frames besides ALTREF_FRAME.
   EXPECT_GE(sf_count_, 1);
-#else
-  EXPECT_EQ(sf_count_, 1);
-#endif  // CONFIG_EXT_REFS
 }
 
-// The superframe index is currently mandatory with both ANS and DAALA_EC due
-// to the decoder starting at the end of the buffer.
-#if CONFIG_EXT_TILE
-// Single tile does not work with ANS (see comment above).
-const int tile_col_values[] = { 1, 2 };
-const int tile_row_values[] = { 1, 2, 32 };
-AV1_INSTANTIATE_TEST_CASE(
-    SuperframeTest,
-    ::testing::Combine(::testing::Values(::libaom_test::kTwoPassGood),
-                       ::testing::ValuesIn(tile_col_values),
-                       ::testing::ValuesIn(tile_row_values)));
-#endif  // CONFIG_EXT_TILE
 }  // namespace
diff --git a/third_party/aom/test/test-data.mk b/third_party/aom/test/test-data.mk
deleted file mode 100644
index d82033e3b..000000000
--- a/third_party/aom/test/test-data.mk
+++ /dev/null
@@ -1,49 +0,0 @@
-LIBAOM_TEST_SRCS-yes += test-data.mk
-
-# Encoder test source
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += hantro_collage_w352h288.yuv
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += hantro_odd.yuv
-
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_10_420.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_10_422.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_10_444.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_10_440.yuv
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_12_420.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_12_422.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_12_444.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_12_440.yuv
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_8_420_a10-1.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_8_420.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_8_422.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_8_444.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += park_joy_90p_8_440.yuv
-
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += desktop_credits.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += niklas_1280_720_30.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += rush_hour_444.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += screendata.y4m
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += niklas_640_480_30.yuv
-
-ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
-# Encode / Decode test
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += niklas_1280_720_30.yuv
-endif  # CONFIG_DECODE_PERF_TESTS
-
-ifeq ($(CONFIG_ENCODE_PERF_TESTS),yes)
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += desktop_640_360_30.yuv
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += kirland_640_480_30.yuv
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += macmarcomoving_640_480_30.yuv
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += macmarcostationary_640_480_30.yuv
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += niklas_1280_720_30.yuv
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += tacomanarrows_640_480_30.yuv
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += tacomasmallcameramovement_640_480_30.yuv
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += thaloundeskmtg_640_480_30.yuv
-endif  # CONFIG_ENCODE_PERF_TESTS
-
-ifeq ($(CONFIG_EXT_TILE),yes)
-LIBAOM_TEST_DATA-$(CONFIG_AV1_ENCODER) += vase10x10.yuv
-endif  # CONFIG_EXT_TILE
-
-# sort and remove duplicates
-LIBAOM_TEST_DATA-yes := $(sort $(LIBAOM_TEST_DATA-yes))
-
diff --git a/third_party/aom/test/test-data.sha1 b/third_party/aom/test/test-data.sha1
index 0caf21e1e..67aeb5208 100644
--- a/third_party/aom/test/test-data.sha1
+++ b/third_party/aom/test/test-data.sha1
@@ -1,5 +1,7 @@
 d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv
 b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
+26b7f64399b84db4b4c9c915d743ec5c2619d4b9 *invalid-bug-1814.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-1814.ivf.res
 a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m
 0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m
 ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m
@@ -27,3 +29,331 @@ e7d315dbf4f3928779e0dc624311196d44491d32 *niklas_1280_720_30.yuv
 9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m
 5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m
 36ddab9b99eb7545aa0bf362d6f498212d596516 *vase10x10.yuv
+c2e1ec9936b95254187a359e94aa32a9f3dad1b7 *av1-1-b8-00-quantizer-00.ivf
+26cd2a0321d01d9db5f6dace8b43a40cd5b9d58d *av1-1-b8-00-quantizer-00.ivf.md5
+a56dd02c0258d4afea1ee358a22b54e99e39d5e1 *av1-1-b8-00-quantizer-01.ivf
+b3d24124d81f1fbb26f5eb0036accb54f3ec69b2 *av1-1-b8-00-quantizer-01.ivf.md5
+3466327cb842a91d69839b11ef930a74f086f4c6 *av1-1-b8-00-quantizer-02.ivf
+c111dce946100efeaad34203080eee1d55464df6 *av1-1-b8-00-quantizer-02.ivf.md5
+d3f1f32de5e2c0c19a58bb8ef096108388c6a820 *av1-1-b8-00-quantizer-03.ivf
+6265321b31130545b4454982ca93e412a56845b8 *av1-1-b8-00-quantizer-03.ivf.md5
+f37c393ebe73266a5ec8508a2ca33c586ff28e64 *av1-1-b8-00-quantizer-04.ivf
+c6e979da71aecc593c0abb40135dd304152b00dd *av1-1-b8-00-quantizer-04.ivf.md5
+ac9c5e93cb19942a9be259d0567ec96c54dcdc7c *av1-1-b8-00-quantizer-05.ivf
+49e35a7399568a0e4f015ce323d5a45ea780ca87 *av1-1-b8-00-quantizer-05.ivf.md5
+461142b1b50ae74c6b698d23f5ed3b764eadfb89 *av1-1-b8-00-quantizer-06.ivf
+6477ff260624e0f76c94ac872d1e7d5576af4177 *av1-1-b8-00-quantizer-06.ivf.md5
+7f8113cd13d8faaa06fdbaaa50dc328daf037e6d *av1-1-b8-00-quantizer-07.ivf
+b26795c6cb408487c20737977cd6b77311772bf7 *av1-1-b8-00-quantizer-07.ivf.md5
+4218f7945a172e1fe4f9e77ec35085a394eda9f4 *av1-1-b8-00-quantizer-08.ivf
+ea5d7d501e9a69d805251e4871515d28468d8676 *av1-1-b8-00-quantizer-08.ivf.md5
+837f3bcadfe56cf302db2ebaf9a990446fb35801 *av1-1-b8-00-quantizer-09.ivf
+eede995cdac5fd01a411da2e74e86e8394138be1 *av1-1-b8-00-quantizer-09.ivf.md5
+adc229b3780a4968c18ded1bcbe72e3f04643833 *av1-1-b8-00-quantizer-10.ivf
+0799b7e54e54ee97bf0e8aad2b75509ce59c7097 *av1-1-b8-00-quantizer-10.ivf.md5
+44bac8247160a8d9a0ab19f890fc89cc9298de1d *av1-1-b8-00-quantizer-11.ivf
+cc6b2bf167e114599b242aba574e8c6f1fa2f047 *av1-1-b8-00-quantizer-11.ivf.md5
+ebb3af7dfc15567188bcb617021cdc95ebc560e3 *av1-1-b8-00-quantizer-12.ivf
+b716ae29d56cd0c052dbfa1b5dcf850cd0fa8ca7 *av1-1-b8-00-quantizer-12.ivf.md5
+46159641f981a26fb9c374a5ca41e44f0ce0a9f0 *av1-1-b8-00-quantizer-13.ivf
+c6db1b8b4a74f83e4a0647e053cea0fc00f6abab *av1-1-b8-00-quantizer-13.ivf.md5
+fadc909d18eb640760fbb075f922fb050e715470 *av1-1-b8-00-quantizer-14.ivf
+e36bb6b23273633ba3ef7d28160a7258840a1476 *av1-1-b8-00-quantizer-14.ivf.md5
+8befbd9cc1601dcd36ec6911613855f68e6fd40e *av1-1-b8-00-quantizer-15.ivf
+cfc2334b76fb5e7aa9d8607e89d37cbc7716d62e *av1-1-b8-00-quantizer-15.ivf.md5
+ca42e00ae27c6b7f684fe3d2a787d50d2827cb3f *av1-1-b8-00-quantizer-16.ivf
+f11278218a7c3c73cfaab2332bab55f06cedcc81 *av1-1-b8-00-quantizer-16.ivf.md5
+05270d365bdc067f9446eda3029a6f41571a5229 *av1-1-b8-00-quantizer-17.ivf
+fb6482f35e7ad04bf231ea1806226760abcb3c26 *av1-1-b8-00-quantizer-17.ivf.md5
+617bc72037165efbff478d5a0d342b3c20ffcafd *av1-1-b8-00-quantizer-18.ivf
+1ff68d5424f91322123fe0d58f436b8e49cfa99d *av1-1-b8-00-quantizer-18.ivf.md5
+821c3b1ae6054c7a91b2f64428806e57f1157ca6 *av1-1-b8-00-quantizer-19.ivf
+f2fd118e786697553d6987f786660a2bb9f00680 *av1-1-b8-00-quantizer-19.ivf.md5
+48bcf17c27d9a4eb73632a68c09f42eff9f9af99 *av1-1-b8-00-quantizer-20.ivf
+64d55e4c858414bc2837c9c3e2d5fb6d2208c4b8 *av1-1-b8-00-quantizer-20.ivf.md5
+d61ecdd4f0950bc5c8bae1270b22e711bdd22763 *av1-1-b8-00-quantizer-21.ivf
+9d447938596096704fd5f4d41bcdf6fabf9cdfb9 *av1-1-b8-00-quantizer-21.ivf.md5
+59b4b65d8e56ccdd1bddff26a03e991a63409334 *av1-1-b8-00-quantizer-22.ivf
+aa1be0c7c7622d612af85f9bf96a212f6fe5ab56 *av1-1-b8-00-quantizer-22.ivf.md5
+95ed96988eb9916cad956db9b929718769de49f1 *av1-1-b8-00-quantizer-23.ivf
+596b8a3aea468996d609624367465c412751f52b *av1-1-b8-00-quantizer-23.ivf.md5
+e6c2dc4ce725003152797b3d7b34d7eb34da50c8 *av1-1-b8-00-quantizer-24.ivf
+1cd3d7e8b3813a9e5591b94eaeb72d471780e64a *av1-1-b8-00-quantizer-24.ivf.md5
+6734e353008824e523939d1a18daa3f2ab2d8ec6 *av1-1-b8-00-quantizer-25.ivf
+c45cf440a05802c1f9e29472175ed397d130d988 *av1-1-b8-00-quantizer-25.ivf.md5
+3372b1c69fb39811156adcea4f6dba802c0918c2 *av1-1-b8-00-quantizer-26.ivf
+b1751d55bb3fb788751fe28fb7434bee153bda68 *av1-1-b8-00-quantizer-26.ivf.md5
+e7ddb19a6e2a798d6a4e7dfdfc10b4df777b60e3 *av1-1-b8-00-quantizer-27.ivf
+0e19d6b79cd71de69d03e0455349568af979b170 *av1-1-b8-00-quantizer-27.ivf.md5
+7f1c90a35543d6b673e353b3702baf3aa1caeaa7 *av1-1-b8-00-quantizer-28.ivf
+d9a4f9cb88103249a05a7e6aa616bf0c16bf9c95 *av1-1-b8-00-quantizer-28.ivf.md5
+28d741b923011c7fcc50a7318256a638d3110a07 *av1-1-b8-00-quantizer-29.ivf
+c68cacf2b2ff2694945a99ad836dcf1ee3961c09 *av1-1-b8-00-quantizer-29.ivf.md5
+9a5d9ea4bc76dd40d04e92f33f45e9c2e120e85d *av1-1-b8-00-quantizer-30.ivf
+eb02bb8c16c4c0368ddff83e05e516e84ec9eaf3 *av1-1-b8-00-quantizer-30.ivf.md5
+20193c372f44f522e094c2c05fc7e4aaa0717fa8 *av1-1-b8-00-quantizer-31.ivf
+a4c1a4ac332f4911f0d5abbd826ebecfb8432d6c *av1-1-b8-00-quantizer-31.ivf.md5
+9617bbd691f093d259dbc8a642a57a153c1fc00c *av1-1-b8-00-quantizer-32.ivf
+73d60a348454b126ea6368ea604954bc23f210ae *av1-1-b8-00-quantizer-32.ivf.md5
+d9aea9d72a686c59b60584d827f60ca1ee8eee26 *av1-1-b8-00-quantizer-33.ivf
+fbf64de376a63d2d3051da83b0e4e56579b55c0a *av1-1-b8-00-quantizer-33.ivf.md5
+791aaf067f125e5cf4a247cf06a2e29ab071ec90 *av1-1-b8-00-quantizer-34.ivf
+8e2e6efe4c069e54844da19125c4280b95990c69 *av1-1-b8-00-quantizer-34.ivf.md5
+01ba67bba5cbf7c94c65da8f4c9bd6e7db24cf3a *av1-1-b8-00-quantizer-35.ivf
+0c5e60704a4a6bd27e67b6fd72ca7d2cf7fff50f *av1-1-b8-00-quantizer-35.ivf.md5
+3e255b4a320c9522dcec539fef770b6920b9a102 *av1-1-b8-00-quantizer-36.ivf
+1241aab865fd7b4bae73736cbeec1866ea9c90ec *av1-1-b8-00-quantizer-36.ivf.md5
+44fa6fca109747d8f43f6c6aa46d782e5d476d54 *av1-1-b8-00-quantizer-37.ivf
+947f0f887c5ac9149cf85e8114a709d6f410fc32 *av1-1-b8-00-quantizer-37.ivf.md5
+8319ac1ddd6ce3279da5780175dff7a3a5fa1054 *av1-1-b8-00-quantizer-38.ivf
+5f571b7f88678eab9e54f162cc9898f14e437770 *av1-1-b8-00-quantizer-38.ivf.md5
+5975e7056e17608593a8c40619b68e6576d373d9 *av1-1-b8-00-quantizer-39.ivf
+7c870192d6eb70ce5367147a3d2c6a52e11f7bec *av1-1-b8-00-quantizer-39.ivf.md5
+47da942f1e455f1422fc65f06dd57304541d16ac *av1-1-b8-00-quantizer-40.ivf
+6ea7116c9ce3a1641c7060bab2f5e06fd0910d61 *av1-1-b8-00-quantizer-40.ivf.md5
+ab35c15dfde21c2572b14e04dbfd5fac1adae449 *av1-1-b8-00-quantizer-41.ivf
+19596f9849653b913186b9d6b7072984ede96177 *av1-1-b8-00-quantizer-41.ivf.md5
+23a5fa6c3d0eaffaf13f6402465f5dd33d8ea7f1 *av1-1-b8-00-quantizer-42.ivf
+5a2726f0d1b1799d4f70883f1bfe5c9d976c6cf5 *av1-1-b8-00-quantizer-42.ivf.md5
+86cddfc463d2b186ec5a1aa25c4562c05201e3c3 *av1-1-b8-00-quantizer-43.ivf
+674c64ec8487ee774ad09350380fa6ac43815807 *av1-1-b8-00-quantizer-43.ivf.md5
+6894c154eb56c4f3fe44d54fc4f9af468b03d175 *av1-1-b8-00-quantizer-44.ivf
+eca679a2781eb894d18b3d578e3aaf4f48019a15 *av1-1-b8-00-quantizer-44.ivf.md5
+0960bf018ada4224b8344519cf091850d50a57bd *av1-1-b8-00-quantizer-45.ivf
+291bb43b9e1ab167040b51019daf1ccf94fd1e50 *av1-1-b8-00-quantizer-45.ivf.md5
+ea644a4732f1a2534332802c2fa5073344f3c356 *av1-1-b8-00-quantizer-46.ivf
+4c7915382b1d6d08709c95525b04ab8830f20ca1 *av1-1-b8-00-quantizer-46.ivf.md5
+d1f8832d33234e2c74a2280090850153ea24ea82 *av1-1-b8-00-quantizer-47.ivf
+90eb9959e612602934dcc512fe6f54abf0c88d9c *av1-1-b8-00-quantizer-47.ivf.md5
+69c93f760e8b666eb5b98f510e09d90f9230ac9b *av1-1-b8-00-quantizer-48.ivf
+931f869e14bd455de9dac2101b383c29e7d6f04c *av1-1-b8-00-quantizer-48.ivf.md5
+8b660c577d95c031d6711c1134b8d115097f8d7e *av1-1-b8-00-quantizer-49.ivf
+0e3fe8b49d497050dc1a0eac5f3ad60f5fe068fe *av1-1-b8-00-quantizer-49.ivf.md5
+d40bb21448a6da0fc9b88cbcf76d2f4226573acb *av1-1-b8-00-quantizer-50.ivf
+bcd2a9c9a021ba44fc5dc74ae02194fe49ca76a4 *av1-1-b8-00-quantizer-50.ivf.md5
+3b5a1d464aa89b0f1a6ad4f5a03602292b826172 *av1-1-b8-00-quantizer-51.ivf
+49bcde0c56cf8b7fbe429336981be22d39025b74 *av1-1-b8-00-quantizer-51.ivf.md5
+38970a02fb38ddb4954fe4240164cb75de5fc744 *av1-1-b8-00-quantizer-52.ivf
+fd02b034d79d4be150efb02bd4349edfd0e41311 *av1-1-b8-00-quantizer-52.ivf.md5
+2fde7a7cf3014d5196d011c47de4a144227ed122 *av1-1-b8-00-quantizer-53.ivf
+0cb66e6d8fbb29962a69ae1703e22da50db2c92b *av1-1-b8-00-quantizer-53.ivf.md5
+89a69e9b9a601e40cb491ac3a1d32491f2468ac8 *av1-1-b8-00-quantizer-54.ivf
+2f8af51acc73c99b5af81db2bdd1883b611ad311 *av1-1-b8-00-quantizer-54.ivf.md5
+31ee4f56fcb0043e95fff7af49e4ef82aafa5543 *av1-1-b8-00-quantizer-55.ivf
+04a7104e02bdd0fa38c118202dbbecdbd11ace02 *av1-1-b8-00-quantizer-55.ivf.md5
+f262f0b234006a2652fceb77b1a8711aa53abb54 *av1-1-b8-00-quantizer-56.ivf
+bdd54dc25bc5a147c76163af0bced45c56435d79 *av1-1-b8-00-quantizer-56.ivf.md5
+1ef00617091db4b2b839de623bd6b4fb0b2f5f83 *av1-1-b8-00-quantizer-57.ivf
+714c65363a87ed5e6e4ad75c79ddb6af57d41fd9 *av1-1-b8-00-quantizer-57.ivf.md5
+43c9b02feccbb3c709d96015f126b7e3d4c24c64 *av1-1-b8-00-quantizer-58.ivf
+bae22b8d6377862bff8219470c0d87205d186a68 *av1-1-b8-00-quantizer-58.ivf.md5
+ca5f780abe4c02e48cceb9c804f3625723c359bf *av1-1-b8-00-quantizer-59.ivf
+c60a20bbf60b0b0a442ef3f7b682979053909d6e *av1-1-b8-00-quantizer-59.ivf.md5
+1f6f047e9f0e1da22fb514370d92c3c7c66dcf89 *av1-1-b8-00-quantizer-60.ivf
+86dc7fa59d363cf1ae4b027a57b119bda893c1c1 *av1-1-b8-00-quantizer-60.ivf.md5
+bcf0c3353568c47a043f2dc34c9abd3fc04eebd4 *av1-1-b8-00-quantizer-61.ivf
+66fc4f729c5915aa19939d1b6e28e5b398e747bb *av1-1-b8-00-quantizer-61.ivf.md5
+ac8d3c54451b52cf557ef435d33e7638088d66df *av1-1-b8-00-quantizer-62.ivf
+b57f4e1276ead626a3662339a86111ae6fda49d2 *av1-1-b8-00-quantizer-62.ivf.md5
+2a8aa33513d8e01ae9410c4bf5fe1e471b775482 *av1-1-b8-00-quantizer-63.ivf
+9f646ec35a168f495e144c64ba7ce9aeb41cd0a2 *av1-1-b8-00-quantizer-63.ivf.md5
+838388fbda4a1d91be81ff62694c3bf13c460d38 *av1-1-b8-01-size-16x16.ivf
+4229c1caf8e25eb3073456fb90ceed206753901e *av1-1-b8-01-size-16x16.ivf.md5
+23f4253bf71e02b2e8ead66da4b3de875e879ef2 *av1-1-b8-01-size-18x16.ivf
+af125644436d4b6897dade68336cedad663b6610 *av1-1-b8-01-size-18x16.ivf.md5
+94e4a75bd93052f79998e9e08e6b5dd73dc27e50 *av1-1-b8-01-size-32x16.ivf
+e7b3fbc5e4b2469838e7ae36512bd3ce0a81040c *av1-1-b8-01-size-32x16.ivf.md5
+f297bde01c05ec5c07ff8118a0280bd36c52b246 *av1-1-b8-01-size-34x16.ivf
+f6bbd94d6063c689de3c7cf94afa2c68b969d12c *av1-1-b8-01-size-34x16.ivf.md5
+1e18bdf68bab7e7282aacc77e423bc7d93d04a8e *av1-1-b8-01-size-64x16.ivf
+de75732fccfb385294b23c17f0f1a57b455edcf7 *av1-1-b8-01-size-64x16.ivf.md5
+26b1f6ae80b161e971468085778cc1ece502b330 *av1-1-b8-01-size-66x16.ivf
+48bd99813557c314d398e6952da78da07c79d416 *av1-1-b8-01-size-66x16.ivf.md5
+ff213ecf31b982a3a7f009c9739f64e066e1ffe9 *av1-1-b8-01-size-16x18.ivf
+86b20a13b1939dc5f678e80491f190d376233d58 *av1-1-b8-01-size-16x18.ivf.md5
+c90bd878c59263a15c6a6f515d1c7e071f141559 *av1-1-b8-01-size-18x18.ivf
+6f659036ffcd3dd380cf970cf1a06f7755e0b2de *av1-1-b8-01-size-18x18.ivf.md5
+e16a1411381b34817a4c0d8e5eeaeb8cddcc9c46 *av1-1-b8-01-size-32x18.ivf
+fdb1c4ec56f5aa690eadbe897340fee86a06ae2f *av1-1-b8-01-size-32x18.ivf.md5
+fac7052b39bd2d0ae107e0e94050226712c770c2 *av1-1-b8-01-size-34x18.ivf
+adb0d5a99228027eaa3b016963df447c9818c447 *av1-1-b8-01-size-34x18.ivf.md5
+b8be5e55d9be42746c2b547d0e26e80b21c9802a *av1-1-b8-01-size-64x18.ivf
+8f8f6da34cdf78c5a6551c637e1afe279cc3884e *av1-1-b8-01-size-64x18.ivf.md5
+9e066bdcc2cd789cdf551bd4c9c85c178887b880 *av1-1-b8-01-size-66x18.ivf
+e8ec6effa936423ae2eec2b60a3160720d2de912 *av1-1-b8-01-size-66x18.ivf.md5
+6ebe45085cdeebc2acd6da5abd542a59312c0ff4 *av1-1-b8-01-size-16x32.ivf
+044695669103dbf158591dce9c649317a177d5f6 *av1-1-b8-01-size-16x32.ivf.md5
+9fabb4f60641b8c7995d1dc451419165d41258ff *av1-1-b8-01-size-18x32.ivf
+7263764680dfec864c3fad5df824ab1973489a14 *av1-1-b8-01-size-18x32.ivf.md5
+3f72841a24a13e601d79cf029aa1fdb02970ce0b *av1-1-b8-01-size-32x32.ivf
+bbe1ae2888d291ec6bc98cd0784937580c554103 *av1-1-b8-01-size-32x32.ivf.md5
+392131a7c7609acd0dba88fee14f1ed042d23ab1 *av1-1-b8-01-size-34x32.ivf
+eea68165ebe9acd28693374bf2266374b9c77786 *av1-1-b8-01-size-34x32.ivf.md5
+78afdd96265811ab9466e906347b57161e5c010d *av1-1-b8-01-size-64x32.ivf
+47b317af582700b67f6e77659db1dfaa26c8cde6 *av1-1-b8-01-size-64x32.ivf.md5
+2b4d01f2c9f23044c0d886482c7073bd4d5d37d1 *av1-1-b8-01-size-66x32.ivf
+3ad5a58a0ee5086af370b22ab2b5b7592a4f33e7 *av1-1-b8-01-size-66x32.ivf.md5
+78ddae04eb8277ae605bd7017ad7ad27bfc82d39 *av1-1-b8-01-size-16x34.ivf
+d0c18e679f1fc51e4f7409831321eed9c4858f6f *av1-1-b8-01-size-16x34.ivf.md5
+38d8ed885f46aead6ec1271d8a5d4aee79b8eb68 *av1-1-b8-01-size-18x34.ivf
+097ddbd69b8f54826a35efeb0b8b07ec198bba6b *av1-1-b8-01-size-18x34.ivf.md5
+91a42720bc2e7ba701f4d97b463a098b6707cdbd *av1-1-b8-01-size-32x34.ivf
+c590d43d37095bd2e8f8d12c9278477419b72d1a *av1-1-b8-01-size-32x34.ivf.md5
+4cc2a437dba56e8878113d9b390b980522542028 *av1-1-b8-01-size-34x34.ivf
+57eeb971f00e64abde25be69dbcb4e3ce5065a57 *av1-1-b8-01-size-34x34.ivf.md5
+b36fee1b6ad69d1206466615d69c05e0a4407939 *av1-1-b8-01-size-64x34.ivf
+a78aea0250d0b32657dc0eaf2d8394bc766c0e35 *av1-1-b8-01-size-64x34.ivf.md5
+10e441209262e082e31fef8c15b51579c9e81509 *av1-1-b8-01-size-66x34.ivf
+558b46f6ef1662c208012d0b66d1857eeff3244e *av1-1-b8-01-size-66x34.ivf.md5
+dd44aad500c7ca0fc97e3d8f0abed3c83b24c79c *av1-1-b8-01-size-16x64.ivf
+a5b64e8063abcf3e4872dc4baf1c32384dc5cf83 *av1-1-b8-01-size-16x64.ivf.md5
+aa849f0d09bcb2ead44719d63043536932d5c9f2 *av1-1-b8-01-size-18x64.ivf
+bcdf2dea3590c7031158ffe7b907d9ee35e2fe57 *av1-1-b8-01-size-18x64.ivf.md5
+36e856d30e160ba2fbb00510296202f61afaae49 *av1-1-b8-01-size-32x64.ivf
+99299f75b82c40c13f168adf2d124f57044a39a2 *av1-1-b8-01-size-32x64.ivf.md5
+e3e03ec5d38eb25e97e4ec3adc6ed40ecdebd278 *av1-1-b8-01-size-34x64.ivf
+84625abf8a200a7d20dd3dd3b277b50b3d62ce32 *av1-1-b8-01-size-34x64.ivf.md5
+7d017daebef2d39ed42a505a8e6103ab0c0988c1 *av1-1-b8-01-size-64x64.ivf
+1ff38d5ecba82fb2e6ac3b09c29c9fe74885ac29 *av1-1-b8-01-size-64x64.ivf.md5
+e1b58ba0b462508593399a2ed84db5f1c59ffcd2 *av1-1-b8-01-size-66x64.ivf
+a6b2c84c94fe79ab0373d157d1203f8d66de0706 *av1-1-b8-01-size-66x64.ivf.md5
+7b4faa7eb7b73392b62de6613282a98dddc13bb6 *av1-1-b8-01-size-16x66.ivf
+a2dacf2bae3c4ab352af66a9600946d29ab9a6ee *av1-1-b8-01-size-16x66.ivf.md5
+0f97805fa30497d4cf39665150f00dfdea52d862 *av1-1-b8-01-size-18x66.ivf
+33d8ea0765953250f998da3fe161f2a8cfca2353 *av1-1-b8-01-size-18x66.ivf.md5
+c8bb00256de973e3b3ee31b924f554336d310cdb *av1-1-b8-01-size-32x66.ivf
+6a6588e6edc68ff7739968a9e7cc6d9eaaeed356 *av1-1-b8-01-size-32x66.ivf.md5
+75ec54fec5c36eecde6d0a16e0389a5f7ad8ec22 *av1-1-b8-01-size-34x66.ivf
+36101dfa9495c18696c0d7d61f25e748f4de7425 *av1-1-b8-01-size-34x66.ivf.md5
+7e5491716e70f8199156b8843513c935667b281e *av1-1-b8-01-size-64x66.ivf
+da38755bb0c9ef56b81617835ddf1340242c6dce *av1-1-b8-01-size-64x66.ivf.md5
+68b47b386f61d67cb5b824a7e6bf87c8b9c2bf7b *av1-1-b8-01-size-66x66.ivf
+25974893956ebd92df474325946130c34f880ea7 *av1-1-b8-01-size-66x66.ivf.md5
+9f386d19c87dbfd6ac84a06d2393dd88863ac003 *av1-1-b8-01-size-196x196.ivf
+788f77f655f55de3db94dd69870316134c149116 *av1-1-b8-01-size-196x196.ivf.md5
+ed3bb2bb52a9d1786e233ef38142b15b85097875 *av1-1-b8-01-size-198x196.ivf
+3bb6b6721ad9b2838b2d07e47b29d6c0117526b1 *av1-1-b8-01-size-198x196.ivf.md5
+49461772caaaa7b824d48f4e9c77a906b0dc02d5 *av1-1-b8-01-size-200x196.ivf
+f1cba00c36909c56097c8785df476d42bc91f259 *av1-1-b8-01-size-200x196.ivf.md5
+44a656a22958e26ed169a69deb8f373117224f06 *av1-1-b8-01-size-202x196.ivf
+69be876b52fe42811bba52d36d0bcc88d6c25b3f *av1-1-b8-01-size-202x196.ivf.md5
+0a6fe9b478363faedbfd465a75790b4c2661b9ba *av1-1-b8-01-size-208x196.ivf
+fc8e95a6860a8a37ccdf1dfe49828502fcf96a08 *av1-1-b8-01-size-208x196.ivf.md5
+8e05b5a20ec95afd92bb615a7daa2e17a7ef55a8 *av1-1-b8-01-size-210x196.ivf
+0add512bffbda3300d8f684a53b13b996fe2e46d *av1-1-b8-01-size-210x196.ivf.md5
+a15f12652c6b4d0c30f13a439c941bfc4a431d1a *av1-1-b8-01-size-224x196.ivf
+b904b93252175f79e0e2b28896131ce93d5fc925 *av1-1-b8-01-size-224x196.ivf.md5
+1a57b913443b267f4a31a6925c39f5b58022f550 *av1-1-b8-01-size-226x196.ivf
+7cf3087de5804763a82d2a798243a66459664772 *av1-1-b8-01-size-226x196.ivf.md5
+2cc28541a2a72e8b45a368f71e70fc294e2de3ab *av1-1-b8-01-size-196x198.ivf
+bb736eedb4bd1e39bf9d60435b4b27a12842e112 *av1-1-b8-01-size-196x198.ivf.md5
+c4ebf93fbf3ae52108fd7b39ddef3afae48188ea *av1-1-b8-01-size-198x198.ivf
+fa4de6881511728bafa15b5f441a0cfdf683cc75 *av1-1-b8-01-size-198x198.ivf.md5
+55fce983186d454b0eb15527393bb2465ba41c6b *av1-1-b8-01-size-200x198.ivf
+1ac8fb1ee622cbc4aa1b83cb46b4731c85efae62 *av1-1-b8-01-size-200x198.ivf.md5
+67d276c67886f0a91a7ee06751a64f95eeb7bc1f *av1-1-b8-01-size-202x198.ivf
+1633b62d9e4ea41737c42f70cbde9a5671da0cef *av1-1-b8-01-size-202x198.ivf.md5
+081cb3f29d3956d4d858d9661fd3d62c94b68867 *av1-1-b8-01-size-208x198.ivf
+871d1c99167408dd32fa7603a7296c9b99ccda15 *av1-1-b8-01-size-208x198.ivf.md5
+b2d80b42468d5f296ae240cfb1fc0b3dd3d96bbc *av1-1-b8-01-size-210x198.ivf
+6a3382656cb17b532a97b1061697f9a878fc58d1 *av1-1-b8-01-size-210x198.ivf.md5
+84d7994fa20fcf6c1d8dbd4c2060c988a6fce831 *av1-1-b8-01-size-224x198.ivf
+42ea12e15de81f2e8617b6de7bae76de2da4d648 *av1-1-b8-01-size-224x198.ivf.md5
+c74a9281cf98c597121df6bff0ac5312b887f969 *av1-1-b8-01-size-226x198.ivf
+4133aae0001804e2bbc7928fc065517a6dd8b288 *av1-1-b8-01-size-226x198.ivf.md5
+27adbf148c63f807bd617cfd78aeaedb8b0f2304 *av1-1-b8-01-size-196x200.ivf
+9253e525e6207ef1ce0839b8f88ea781e9abe41e *av1-1-b8-01-size-196x200.ivf.md5
+21c9ea4d882e48353d3df66fcde0e4746168163f *av1-1-b8-01-size-198x200.ivf
+3d5ee59fde9194f0eaff736051cfd1d7b7daeff1 *av1-1-b8-01-size-198x200.ivf.md5
+c27b0b57667910847122a0309c703315e444110f *av1-1-b8-01-size-200x200.ivf
+7b2a15a17b421ef07e285ca4e8a224f0512c434d *av1-1-b8-01-size-200x200.ivf.md5
+780de549e4163a52590f7c0f488e027a8a4aa053 *av1-1-b8-01-size-202x200.ivf
+cb0ec0969522ca60d79a639e9b9509363468ffd0 *av1-1-b8-01-size-202x200.ivf.md5
+2c59821904863e264ae61401cbd494a79bc04f13 *av1-1-b8-01-size-208x200.ivf
+9963955966a52b65cdd13465c9fb2ba3b5356755 *av1-1-b8-01-size-208x200.ivf.md5
+ff63121611ea9c0628c7e5af13de5e7786611ca6 *av1-1-b8-01-size-210x200.ivf
+2a5993be234e3af2af6d185b2a6f3aaf1979b83a *av1-1-b8-01-size-210x200.ivf.md5
+b8485ada95440d78b51153227231b1aced1a8273 *av1-1-b8-01-size-224x200.ivf
+9c3cd32ea6c006a91eb37d69dbeccf878de5d214 *av1-1-b8-01-size-224x200.ivf.md5
+1aa0ce3e3a74f9b600a146e98b05547a0b454c48 *av1-1-b8-01-size-226x200.ivf
+e045be96c3af16a9ddc10a9933e8ddfb3319d716 *av1-1-b8-01-size-226x200.ivf.md5
+e92b76480f4339855d998b97182f36b28deadcfa *av1-1-b8-01-size-196x202.ivf
+480c707abcd2a650e2160ec397f8348cecb45770 *av1-1-b8-01-size-196x202.ivf.md5
+137b9c0d10a3bdbdf6f97b3e6331f3e8acaf8f91 *av1-1-b8-01-size-198x202.ivf
+7429642146d0da55161ab13024a261094ee2ce87 *av1-1-b8-01-size-198x202.ivf.md5
+9cea71c44ad015ac702d675bacca17876e65cb1a *av1-1-b8-01-size-200x202.ivf
+76b1ec6c42da55f47e389a561590d1a7c713e495 *av1-1-b8-01-size-200x202.ivf.md5
+26dffdcd0dac9becf68d12e31fcd91eddf1f7154 *av1-1-b8-01-size-202x202.ivf
+ddb75e99123fed4ef05d9b85200cefd8985bc84c *av1-1-b8-01-size-202x202.ivf.md5
+04007e83bb66ba547d09f8926ea5bfc7fd9e4b2a *av1-1-b8-01-size-208x202.ivf
+5b72eb58db22087ad416c499119f41e718395b52 *av1-1-b8-01-size-208x202.ivf.md5
+721ff7c0ae0e2ed896b5acac230113f1404e769c *av1-1-b8-01-size-210x202.ivf
+187d2ef939fc26e1a1c7de65abe8e058d8aae17a *av1-1-b8-01-size-210x202.ivf.md5
+dba41421cc938bcf0234254f96be0325ab66186e *av1-1-b8-01-size-224x202.ivf
+58856038c1eb13a7bf0353a30b1affe844cd31b1 *av1-1-b8-01-size-224x202.ivf.md5
+55eba14878d25dcc351ee5e92fa06e559035b409 *av1-1-b8-01-size-226x202.ivf
+e295b3d791d40d7c1fff2c40a260078dccaef24a *av1-1-b8-01-size-226x202.ivf.md5
+6c777223990ddfd92040a8526646ed0f39299b0d *av1-1-b8-01-size-196x208.ivf
+5210daff766cddaf3945610ee05ff242aef8175a *av1-1-b8-01-size-196x208.ivf.md5
+252831abfb9f4a9a8556c21cc3bf60adfe88210f *av1-1-b8-01-size-198x208.ivf
+35ed9601e608a829980cec81e41b7bd3e5f4c2ce *av1-1-b8-01-size-198x208.ivf.md5
+e800ed893a88704a4576d4984957f3664560daa9 *av1-1-b8-01-size-200x208.ivf
+82c038f9072a2fcf8d55fb4a474fdd791ba9a290 *av1-1-b8-01-size-200x208.ivf.md5
+9ce7bb932dd99f86da8ff2ab89fa4d3089a78da8 *av1-1-b8-01-size-202x208.ivf
+0611bf0179abe3c820a447a2bd3a04c3790f3a87 *av1-1-b8-01-size-202x208.ivf.md5
+e5900d9150c8bebc49776227afd3b0a21f5a6ac6 *av1-1-b8-01-size-208x208.ivf
+86d6b9a3840aa0a77938547c905bd6f45d069681 *av1-1-b8-01-size-208x208.ivf.md5
+2758ba5dad16f4a91334f2ed07a4a037201bb873 *av1-1-b8-01-size-210x208.ivf
+78453b1fda2ccc6f35e0d762567807757bcddb16 *av1-1-b8-01-size-210x208.ivf.md5
+fff88fb8e833f6b4ad64cb591b219c7cceb7f2d2 *av1-1-b8-01-size-224x208.ivf
+87266fc34aaed82cdb98cbc309b221ad52eccd81 *av1-1-b8-01-size-224x208.ivf.md5
+dec839fe64046461015b56cda191835284f42a52 *av1-1-b8-01-size-226x208.ivf
+d7a15264fc3fd55d3aec0ccfaa7c434c6d90969f *av1-1-b8-01-size-226x208.ivf.md5
+584782e93ed1cb7797a90fece44becdd1e23bf0d *av1-1-b8-01-size-196x210.ivf
+ed76ec841b18a457853e368576967c4768fc2730 *av1-1-b8-01-size-196x210.ivf.md5
+dab625599b9f01398b593e865d9a4a95a029d60f *av1-1-b8-01-size-198x210.ivf
+b90e8d96a1f5b329b088b467a11fed2d055d74ca *av1-1-b8-01-size-198x210.ivf.md5
+6774bee17b9e50d2d8630e2e1afc30ded67e662d *av1-1-b8-01-size-200x210.ivf
+343a86bd54eb3dd5e9902eb62a3d776dcff2f4f3 *av1-1-b8-01-size-200x210.ivf.md5
+0456c3b8e242eeee019ca97d155f81124de62c90 *av1-1-b8-01-size-202x210.ivf
+5a6a6428c9858a0d3561db42ceaf981c143fe479 *av1-1-b8-01-size-202x210.ivf.md5
+6a3a8f65bf806b1be7726b983427880f772c9986 *av1-1-b8-01-size-208x210.ivf
+5563ea6d8c65887553ff3000addc6418913f1650 *av1-1-b8-01-size-208x210.ivf.md5
+5a8b69489f8e9b917ea7718ad2645101cdbe5644 *av1-1-b8-01-size-210x210.ivf
+f4b01604036fa23000d44fbf42097ae1181bcd62 *av1-1-b8-01-size-210x210.ivf.md5
+fb6f5b08a048698cfe324557ee8cd840c4a3f6ce *av1-1-b8-01-size-224x210.ivf
+3ce5c404e3ca09c8e994b3043bad42cd555b00c0 *av1-1-b8-01-size-224x210.ivf.md5
+2e9fc8510d2131b2f3c9a93bececac985e4426d2 *av1-1-b8-01-size-226x210.ivf
+897c537e259331ca86cdd6e4d2bd343f8538402e *av1-1-b8-01-size-226x210.ivf.md5
+8300512106fce3424eb74b5d4bc0f4f19f7c9af8 *av1-1-b8-01-size-196x224.ivf
+43662ea025ea79afe4964fd4d12a77f4aa4e565e *av1-1-b8-01-size-196x224.ivf.md5
+640f8fda7ade8f2850e2275a9f5e233e33a0ba8d *av1-1-b8-01-size-198x224.ivf
+9ac690bdbbce47d7b169128b568f955e70076f8c *av1-1-b8-01-size-198x224.ivf.md5
+ce2e9379c72fc924e364d5727605394a1438a211 *av1-1-b8-01-size-200x224.ivf
+1ec35a53d88072b96b255202f678178bc7e5bb20 *av1-1-b8-01-size-200x224.ivf.md5
+5d3af7921623deccb578115c8ce207c019f97f50 *av1-1-b8-01-size-202x224.ivf
+14eafd55b0cda3a3476cae7ad500dbd5ee899dd5 *av1-1-b8-01-size-202x224.ivf.md5
+6b6d78e466cf94a5ef8dfe252caa0948dd2ec175 *av1-1-b8-01-size-208x224.ivf
+e178b0c272dfcfe614c6b49cb28dad11781af0b6 *av1-1-b8-01-size-208x224.ivf.md5
+dd2232b9e18971d7e19650a1e3218aef1010247f *av1-1-b8-01-size-210x224.ivf
+40a66198c47820f5fa2d2e389ec0c1191ea4ffcc *av1-1-b8-01-size-210x224.ivf.md5
+9ec028b81a5ea311683328d856f436e6d0b0e6a0 *av1-1-b8-01-size-224x224.ivf
+143b9530ce722385db2c2d883daa649ed42b8d40 *av1-1-b8-01-size-224x224.ivf.md5
+bf833947e62935c54e1e727ccb36157f7c1e9e5d *av1-1-b8-01-size-226x224.ivf
+ca4f3b44463106e4f0bb54e490c3bd457d7d780b *av1-1-b8-01-size-226x224.ivf.md5
+5525f7e312ec073f480ed5a2be5bdc4f0ce51a09 *av1-1-b8-01-size-196x226.ivf
+062d4b240741184458d2d2abd243ed7877631de8 *av1-1-b8-01-size-196x226.ivf.md5
+e6b911142394b94c23191eaa63c9eb41a00f80b0 *av1-1-b8-01-size-198x226.ivf
+3b580d903dddf47082f5e055bfb01a4f05c09b7d *av1-1-b8-01-size-198x226.ivf.md5
+70feb5efeb28df25f7d1a661c73bf013c5ada9b4 *av1-1-b8-01-size-200x226.ivf
+f0b894e7f787e62f1492be62f3dedeb065062160 *av1-1-b8-01-size-200x226.ivf.md5
+7f9a10831e2389b31497fad50080b4d5452d6e91 *av1-1-b8-01-size-202x226.ivf
+45b7194eba9367c8059403c23ca4ae49e988dfaf *av1-1-b8-01-size-202x226.ivf.md5
+967837a2cfbf9aa3131f73aec6a52dcdd82926c7 *av1-1-b8-01-size-208x226.ivf
+c8baedb48fd5d4c956aa8d73fd957370f718f047 *av1-1-b8-01-size-208x226.ivf.md5
+9c926226b9f6b015501d8ac1e3f95e8570283a05 *av1-1-b8-01-size-210x226.ivf
+57d4837667fd4c5a7aeb908626d701b632852c60 *av1-1-b8-01-size-210x226.ivf.md5
+25a4940922761239809d82c45c2be1c5e4f48785 *av1-1-b8-01-size-224x226.ivf
+87ae7e7558241bf3575a333f56fbad4dfdade8ff *av1-1-b8-01-size-224x226.ivf.md5
+40dd208eb525cd90d7c0674cf787097fb909afae *av1-1-b8-01-size-226x226.ivf
+34bdef682a4eae0e0a05e4486a968af1df8b220a *av1-1-b8-01-size-226x226.ivf.md5
+\ No newline at end of file
diff --git a/third_party/aom/test/test.cmake b/third_party/aom/test/test.cmake
index 26937c96a..8594d059c 100644
--- a/third_party/aom/test/test.cmake
+++ b/third_party/aom/test/test.cmake
@@ -1,14 +1,16 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_TEST_TEST_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_TEST_TEST_CMAKE_)
+  return()
+endif() # AOM_TEST_TEST_CMAKE_
 set(AOM_TEST_TEST_CMAKE_ 1)
 
 include(FindPythonInterp)
@@ -18,507 +20,408 @@ include("${AOM_ROOT}/test/test_data_util.cmake")
 
 set(AOM_UNIT_TEST_DATA_LIST_FILE "${AOM_ROOT}/test/test-data.sha1")
 
-set(AOM_UNIT_TEST_WRAPPER_SOURCES
-    "${AOM_CONFIG_DIR}/usage_exit.c"
-    "${AOM_ROOT}/test/test_libaom.cc")
-
-set(AOM_UNIT_TEST_COMMON_SOURCES
-    "${AOM_ROOT}/test/acm_random.h"
-    "${AOM_ROOT}/test/clear_system_state.h"
-    "${AOM_ROOT}/test/codec_factory.h"
-    "${AOM_ROOT}/test/decode_test_driver.cc"
-    "${AOM_ROOT}/test/decode_test_driver.h"
-    "${AOM_ROOT}/test/function_equivalence_test.h"
-    "${AOM_ROOT}/test/md5_helper.h"
-    "${AOM_ROOT}/test/register_state_check.h"
-    "${AOM_ROOT}/test/transform_test_base.h"
-    "${AOM_ROOT}/test/util.h"
-    "${AOM_ROOT}/test/video_source.h")
-
-if (NOT BUILD_SHARED_LIBS)
-  set(AOM_UNIT_TEST_COMMON_SOURCES
-      ${AOM_UNIT_TEST_COMMON_SOURCES}
-      "${AOM_ROOT}/test/convolve_test.cc"
-      "${AOM_ROOT}/test/simd_impl.h")
-
-  if (HAVE_NEON)
-    set(AOM_UNIT_TEST_COMMON_SOURCES
-        ${AOM_UNIT_TEST_COMMON_SOURCES}
-        "${AOM_ROOT}/test/simd_neon_test.cc")
-  endif ()
-  if (HAVE_SSE2)
-    set(AOM_UNIT_TEST_COMMON_SOURCES
-        ${AOM_UNIT_TEST_COMMON_SOURCES}
-        "${AOM_ROOT}/test/simd_sse2_test.cc")
-  endif ()
-  if (HAVE_SSSE3)
-    set(AOM_UNIT_TEST_COMMON_SOURCES
-        ${AOM_UNIT_TEST_COMMON_SOURCES}
-        "${AOM_ROOT}/test/simd_ssse3_test.cc")
-  endif ()
-  if (HAVE_SSE4)
-    set(AOM_UNIT_TEST_COMMON_SOURCES
-        ${AOM_UNIT_TEST_COMMON_SOURCES}
-        "${AOM_ROOT}/test/simd_sse4_test.cc")
-  endif ()
-  if (HAVE_AVX2)
-    set(AOM_UNIT_TEST_COMMON_SOURCES
-        ${AOM_UNIT_TEST_COMMON_SOURCES}
-        "${AOM_ROOT}/test/simd_avx2_test.cc")
-  endif ()
-
-  if (CONFIG_ACCOUNTING)
-    set(AOM_UNIT_TEST_COMMON_SOURCES
-        ${AOM_UNIT_TEST_COMMON_SOURCES}
-        "${AOM_ROOT}/test/accounting_test.cc")
-  endif ()
-
-  if (CONFIG_ADAPT_SCAN)
-    set(AOM_UNIT_TEST_COMMON_SOURCES
-        ${AOM_UNIT_TEST_COMMON_SOURCES}
-        "${AOM_ROOT}/test/scan_test.cc")
-  endif ()
-
-  if (CONFIG_GLOBAL_MOTION OR CONFIG_WARPED_MOTION)
-    if (HAVE_SSE2)
-      set(AOM_UNIT_TEST_COMMON_SOURCES
-          ${AOM_UNIT_TEST_COMMON_SOURCES}
-          "${AOM_ROOT}/test/warp_filter_test.cc"
-          "${AOM_ROOT}/test/warp_filter_test_util.cc"
-          "${AOM_ROOT}/test/warp_filter_test_util.h")
-    endif ()
-  endif ()
-endif ()
-
-set(AOM_UNIT_TEST_DECODER_SOURCES
-    "${AOM_ROOT}/test/decode_api_test.cc"
-    "${AOM_ROOT}/test/ivf_video_source.h")
-
-set(AOM_UNIT_TEST_ENCODER_SOURCES
-    "${AOM_ROOT}/test/altref_test.cc"
-    "${AOM_ROOT}/test/aq_segment_test.cc"
-    "${AOM_ROOT}/test/datarate_test.cc"
-    "${AOM_ROOT}/test/encode_api_test.cc"
-    "${AOM_ROOT}/test/encode_test_driver.cc"
-    "${AOM_ROOT}/test/encode_test_driver.h"
-    "${AOM_ROOT}/test/error_resilience_test.cc"
-    "${AOM_ROOT}/test/i420_video_source.h"
-    "${AOM_ROOT}/test/resize_test.cc"
-    "${AOM_ROOT}/test/y4m_test.cc"
-    "${AOM_ROOT}/test/y4m_video_source.h"
-    "${AOM_ROOT}/test/yuv_video_source.h")
-
-if (NOT BUILD_SHARED_LIBS)
-  set(AOM_UNIT_TEST_ENCODER_SOURCES
-      ${AOM_UNIT_TEST_ENCODER_SOURCES}
-      "${AOM_ROOT}/test/dct16x16_test.cc"
-      "${AOM_ROOT}/test/dct32x32_test.cc"
-      "${AOM_ROOT}/test/sad_test.cc")
-endif ()
-
-set(AOM_DECODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/decode_perf_test.cc")
-set(AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc")
-set(AOM_UNIT_TEST_WEBM_SOURCES "${AOM_ROOT}/test/webm_video_source.h")
-
-set(AOM_TEST_INTRA_PRED_SPEED_SOURCES
-    "${AOM_CONFIG_DIR}/usage_exit.c"
-    "${AOM_ROOT}/test/test_intra_pred_speed.cc")
-
-if (NOT BUILD_SHARED_LIBS)
-  if (CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER)
-    set(AOM_UNIT_TEST_COMMON_SOURCES
-        ${AOM_UNIT_TEST_COMMON_SOURCES}
-        "${AOM_ROOT}/test/av1_convolve_optimz_test.cc"
-        "${AOM_ROOT}/test/av1_convolve_test.cc"
-        "${AOM_ROOT}/test/av1_txfm_test.cc"
-        "${AOM_ROOT}/test/av1_txfm_test.h"
-        "${AOM_ROOT}/test/intrapred_test.cc"
-        "${AOM_ROOT}/test/lpf_8_test.cc"
-        "${AOM_ROOT}/test/simd_cmp_impl.h")
-
-    set(AOM_UNIT_TEST_ENCODER_SOURCES
-        ${AOM_UNIT_TEST_ENCODER_SOURCES}
-        "${AOM_ROOT}/test/motion_vector_test.cc")
-
-    if (CONFIG_CDEF)
-      if (CONFIG_CDEF_SINGLEPASS)
-        set(AOM_UNIT_TEST_COMMON_SOURCES
-            ${AOM_UNIT_TEST_COMMON_SOURCES}
-            "${AOM_ROOT}/test/cdef_test.cc")
-      else ()
-        set(AOM_UNIT_TEST_COMMON_SOURCES
-            ${AOM_UNIT_TEST_COMMON_SOURCES}
-            "${AOM_ROOT}/test/clpf_test.cc"
-            "${AOM_ROOT}/test/dering_test.cc")
-      endif ()
-    endif ()
-
-    # Omit 4-tap filter intra predictor test-- currently a 3-tap filter is in
-    # use.
-    #if (CONFIG_FILTER_INTRA)
-    #  if (HAVE_SSE4_1)
-    #    set(AOM_UNIT_TEST_COMMON_SOURCES
-    #        ${AOM_UNIT_TEST_COMMON_SOURCES}
-    #        "${AOM_ROOT}/test/filterintra_predictors_test.cc")
-    #  endif ()
-    #endif ()
-
-    if (CONFIG_INTRABC)
-        set(AOM_UNIT_TEST_COMMON_SOURCES
-            ${AOM_UNIT_TEST_COMMON_SOURCES}
-            "${AOM_ROOT}/test/intrabc_test.cc")
-    endif ()
-
-    if (CONFIG_LOOP_RESTORATION)
-      set(AOM_UNIT_TEST_COMMON_SOURCES
-          ${AOM_UNIT_TEST_COMMON_SOURCES}
-          "${AOM_ROOT}/test/selfguided_filter_test.cc")
-
-      if (HAVE_SSE2)
-        set(AOM_UNIT_TEST_COMMON_SOURCES
-            ${AOM_UNIT_TEST_COMMON_SOURCES}
-            "${AOM_ROOT}/test/hiprec_convolve_test.cc"
-            "${AOM_ROOT}/test/hiprec_convolve_test_util.cc"
-            "${AOM_ROOT}/test/hiprec_convolve_test_util.h")
-      endif ()
-    endif ()
-
-    set(AOM_UNIT_TEST_COMMON_INTRIN_NEON
-        ${AOM_UNIT_TEST_COMMON_INTRIN_NEON}
-        "${AOM_ROOT}/test/simd_cmp_neon.cc")
-    set(AOM_UNIT_TEST_COMMON_INTRIN_SSE2
-        ${AOM_UNIT_TEST_COMMON_INTRIN_SSE2}
-        "${AOM_ROOT}/test/simd_cmp_sse2.cc")
-    set(AOM_UNIT_TEST_COMMON_INTRIN_SSSE3
-        ${AOM_UNIT_TEST_COMMON_INTRIN_SSSE3}
-        "${AOM_ROOT}/test/simd_cmp_ssse3.cc")
-    set(AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1
-        ${AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1}
-        "${AOM_ROOT}/test/simd_cmp_sse4.cc")
-    set(AOM_UNIT_TEST_COMMON_INTRIN_AVX2
-        ${AOM_UNIT_TEST_COMMON_INTRIN_AVX2}
-        "${AOM_ROOT}/test/simd_cmp_avx2.cc")
-  endif ()
-endif ()
-
-if (CONFIG_AV1_ENCODER)
-  set(AOM_UNIT_TEST_ENCODER_SOURCES
-      ${AOM_UNIT_TEST_ENCODER_SOURCES}
-      "${AOM_ROOT}/test/active_map_test.cc"
-      "${AOM_ROOT}/test/borders_test.cc"
-      "${AOM_ROOT}/test/cpu_speed_test.cc"
-      "${AOM_ROOT}/test/end_to_end_test.cc"
-      "${AOM_ROOT}/test/frame_size_tests.cc"
-      "${AOM_ROOT}/test/lossless_test.cc")
-
-  if (NOT BUILD_SHARED_LIBS)
-    set(AOM_UNIT_TEST_ENCODER_SOURCES
-        ${AOM_UNIT_TEST_ENCODER_SOURCES}
-        "${AOM_ROOT}/test/arf_freq_test.cc"
-        "${AOM_ROOT}/test/av1_dct_test.cc"
-        "${AOM_ROOT}/test/av1_fht16x16_test.cc"
-        "${AOM_ROOT}/test/av1_fht32x32_test.cc"
-        "${AOM_ROOT}/test/av1_fht8x8_test.cc"
-        "${AOM_ROOT}/test/av1_fwd_txfm1d_test.cc"
-        "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc"
-        "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
-        "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
-        "${AOM_ROOT}/test/av1_inv_txfm_test.cc"
-        "${AOM_ROOT}/test/av1_wedge_utils_test.cc"
-        "${AOM_ROOT}/test/avg_test.cc"
-        "${AOM_ROOT}/test/blend_a64_mask_1d_test.cc"
-        "${AOM_ROOT}/test/blend_a64_mask_test.cc"
-        "${AOM_ROOT}/test/error_block_test.cc"
-        "${AOM_ROOT}/test/fdct4x4_test.cc"
-        "${AOM_ROOT}/test/fdct8x8_test.cc"
-        "${AOM_ROOT}/test/hadamard_test.cc"
-        "${AOM_ROOT}/test/masked_sad_test.cc"
-        "${AOM_ROOT}/test/masked_variance_test.cc"
-        "${AOM_ROOT}/test/minmax_test.cc"
-        "${AOM_ROOT}/test/subtract_test.cc"
-        "${AOM_ROOT}/test/sum_squares_test.cc"
-        "${AOM_ROOT}/test/variance_test.cc")
-
-    if (NOT CONFIG_AOM_QM AND NOT CONFIG_NEW_QUANT)
-      set(AOM_UNIT_TEST_ENCODER_SOURCES
-          ${AOM_UNIT_TEST_ENCODER_SOURCES}
-          "${AOM_ROOT}/test/quantize_func_test.cc")
-    endif ()
-
-    if (CONFIG_CONVOLVE_ROUND)
-      set(AOM_UNIT_TEST_ENCODER_SOURCES
-          ${AOM_UNIT_TEST_ENCODER_SOURCES}
-          "${AOM_ROOT}/test/convolve_round_test.cc")
-      if (HAVE_SSE2)
-        set(AOM_UNIT_TEST_ENCODER_SOURCES
-            ${AOM_UNIT_TEST_ENCODER_SOURCES}
-            "${AOM_ROOT}/test/av1_convolve_2d_test.cc"
-            "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc"
-            "${AOM_ROOT}/test/av1_convolve_2d_test_util.h")
-      endif ()
-      if (NOT CONFIG_COMPOUND_ROUND)
-        if (HAVE_SSE4_1)
-          set(AOM_UNIT_TEST_ENCODER_SOURCES
-              ${AOM_UNIT_TEST_ENCODER_SOURCES}
-              "${AOM_ROOT}/test/av1_convolve_scale_test.cc")
-        endif ()
-      endif ()
-    endif ()
-
-    if (CONFIG_EXT_TX)
-      set(AOM_UNIT_TEST_ENCODER_SOURCES
-          ${AOM_UNIT_TEST_ENCODER_SOURCES}
-          "${AOM_ROOT}/test/av1_fht16x32_test.cc"
-          "${AOM_ROOT}/test/av1_fht16x8_test.cc"
-          "${AOM_ROOT}/test/av1_fht32x16_test.cc"
-          "${AOM_ROOT}/test/av1_fht4x4_test.cc"
-          "${AOM_ROOT}/test/av1_fht4x8_test.cc"
-          "${AOM_ROOT}/test/av1_fht8x16_test.cc"
-          "${AOM_ROOT}/test/av1_fht8x4_test.cc")
-    endif ()
-
-    if (CONFIG_GLOBAL_MOTION)
-      set(AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
-          ${AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1}
-          "${AOM_ROOT}/test/corner_match_test.cc")
-    endif ()
-
-    if (CONFIG_MOTION_VAR)
-      set(AOM_UNIT_TEST_ENCODER_SOURCES
-          ${AOM_UNIT_TEST_ENCODER_SOURCES}
-          "${AOM_ROOT}/test/obmc_sad_test.cc"
-          "${AOM_ROOT}/test/obmc_variance_test.cc")
-    endif ()
-
-    if (CONFIG_TX64X64)
-      set(AOM_UNIT_TEST_ENCODER_SOURCES
-          ${AOM_UNIT_TEST_ENCODER_SOURCES}
-          "${AOM_ROOT}/test/av1_fht64x64_test.cc")
-    endif ()
-  endif ()
-endif ()
-
-if (NOT BUILD_SHARED_LIBS)
-  if (CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
-    set(AOM_UNIT_TEST_COMMON_SOURCES
-        ${AOM_UNIT_TEST_COMMON_SOURCES}
-        "${AOM_ROOT}/test/divu_small_test.cc"
-        "${AOM_ROOT}/test/ethread_test.cc"
-        "${AOM_ROOT}/test/coding_path_sync.cc"
-        "${AOM_ROOT}/test/idct8x8_test.cc"
-        "${AOM_ROOT}/test/partial_idct_test.cc"
-        "${AOM_ROOT}/test/superframe_test.cc"
-        "${AOM_ROOT}/test/tile_independence_test.cc")
-
-    if (CONFIG_ANS)
-      set(AOM_UNIT_TEST_COMMON_SOURCES
-          ${AOM_UNIT_TEST_COMMON_SOURCES}
-          "${AOM_ROOT}/test/ans_codec_test.cc"
-          "${AOM_ROOT}/test/ans_test.cc")
-    else ()
-      set(AOM_UNIT_TEST_COMMON_SOURCES
-          ${AOM_UNIT_TEST_COMMON_SOURCES}
-          "${AOM_ROOT}/test/binary_codes_test.cc"
-          "${AOM_ROOT}/test/boolcoder_test.cc")
-    endif ()
-
-    if (CONFIG_EXT_TILE)
-      set(AOM_UNIT_TEST_COMMON_SOURCES
-          ${AOM_UNIT_TEST_COMMON_SOURCES}
-          "${AOM_ROOT}/test/av1_ext_tile_test.cc")
-    endif ()
-  endif ()
-endif ()
-
-if (CONFIG_HIGHBITDEPTH)
-  if (CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
-    set(AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1
-        ${AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1}
-        "${AOM_ROOT}/test/av1_highbd_iht_test.cc"
-        "${AOM_ROOT}/test/av1_quantize_test.cc")
-  endif ()
-
-  if (CONFIG_INTERNAL_STATS)
-    set(AOM_UNIT_TEST_COMMON_SOURCES
-        ${AOM_UNIT_TEST_COMMON_SOURCES}
-        "${AOM_ROOT}/test/hbd_metrics_test.cc")
-  endif ()
-endif ()
-
-if (CONFIG_UNIT_TESTS)
+list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_CONFIG_DIR}/usage_exit.c"
+            "${AOM_ROOT}/test/test_libaom.cc")
+
+list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+            "${AOM_ROOT}/test/acm_random.h"
+            "${AOM_ROOT}/test/aom_integer_test.cc"
+            "${AOM_ROOT}/test/clear_system_state.h"
+            "${AOM_ROOT}/test/codec_factory.h"
+            "${AOM_ROOT}/test/decode_test_driver.cc"
+            "${AOM_ROOT}/test/decode_test_driver.h"
+            "${AOM_ROOT}/test/function_equivalence_test.h"
+            "${AOM_ROOT}/test/md5_helper.h"
+            "${AOM_ROOT}/test/register_state_check.h"
+            "${AOM_ROOT}/test/test_vectors.cc"
+            "${AOM_ROOT}/test/test_vectors.h"
+            "${AOM_ROOT}/test/transform_test_base.h"
+            "${AOM_ROOT}/test/util.h"
+            "${AOM_ROOT}/test/video_source.h")
+
+if(CONFIG_INTERNAL_STATS)
+  list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+              "${AOM_ROOT}/test/hbd_metrics_test.cc")
+endif()
+
+list(APPEND AOM_UNIT_TEST_DECODER_SOURCES "${AOM_ROOT}/test/decode_api_test.cc"
+            "${AOM_ROOT}/test/invalid_file_test.cc"
+            "${AOM_ROOT}/test/test_vector_test.cc"
+            "${AOM_ROOT}/test/ivf_video_source.h")
+
+list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+            "${AOM_ROOT}/test/active_map_test.cc"
+            "${AOM_ROOT}/test/altref_test.cc"
+            "${AOM_ROOT}/test/aq_segment_test.cc"
+            "${AOM_ROOT}/test/av1_txfm_test.cc"
+            "${AOM_ROOT}/test/av1_txfm_test.h"
+            "${AOM_ROOT}/test/borders_test.cc"
+            "${AOM_ROOT}/test/cpu_speed_test.cc"
+            "${AOM_ROOT}/test/datarate_test.cc"
+            "${AOM_ROOT}/test/encode_api_test.cc"
+            "${AOM_ROOT}/test/encode_test_driver.cc"
+            "${AOM_ROOT}/test/encode_test_driver.h"
+            "${AOM_ROOT}/test/end_to_end_test.cc"
+            "${AOM_ROOT}/test/error_resilience_test.cc"
+            "${AOM_ROOT}/test/frame_size_tests.cc"
+            "${AOM_ROOT}/test/horz_superres_test.cc"
+            "${AOM_ROOT}/test/i420_video_source.h"
+            "${AOM_ROOT}/test/lossless_test.cc"
+            "${AOM_ROOT}/test/monochrome_test.cc"
+            "${AOM_ROOT}/test/qm_test.cc"
+            "${AOM_ROOT}/test/resize_test.cc"
+            "${AOM_ROOT}/test/scalability_test.cc"
+            "${AOM_ROOT}/test/y4m_test.cc"
+            "${AOM_ROOT}/test/y4m_video_source.h"
+            "${AOM_ROOT}/test/yuv_video_source.h")
+
+list(APPEND AOM_DECODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/decode_perf_test.cc")
+list(APPEND AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc")
+list(APPEND AOM_UNIT_TEST_WEBM_SOURCES "${AOM_ROOT}/test/webm_video_source.h")
+list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_CONFIG_DIR}/usage_exit.c"
+            "${AOM_ROOT}/test/test_intra_pred_speed.cc")
+
+if(NOT BUILD_SHARED_LIBS)
+  list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+              "${AOM_ROOT}/test/cdef_test.cc"
+              "${AOM_ROOT}/test/cfl_test.cc"
+              "${AOM_ROOT}/test/convolve_test.cc"
+              "${AOM_ROOT}/test/hiprec_convolve_test.cc"
+              "${AOM_ROOT}/test/hiprec_convolve_test_util.cc"
+              "${AOM_ROOT}/test/hiprec_convolve_test_util.h"
+              "${AOM_ROOT}/test/intrabc_test.cc"
+              "${AOM_ROOT}/test/intrapred_test.cc"
+              "${AOM_ROOT}/test/lpf_test.cc"
+              "${AOM_ROOT}/test/onyxc_int_test.cc"
+              "${AOM_ROOT}/test/scan_test.cc"
+              "${AOM_ROOT}/test/selfguided_filter_test.cc"
+              "${AOM_ROOT}/test/simd_cmp_impl.h"
+              "${AOM_ROOT}/test/simd_impl.h")
+
+  if(CONFIG_ACCOUNTING)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/accounting_test.cc")
+  endif()
+
+  if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/av1_ext_tile_test.cc"
+                "${AOM_ROOT}/test/binary_codes_test.cc"
+                "${AOM_ROOT}/test/boolcoder_test.cc"
+                "${AOM_ROOT}/test/coding_path_sync.cc"
+                "${AOM_ROOT}/test/decode_multithreaded_test.cc"
+                "${AOM_ROOT}/test/divu_small_test.cc"
+                "${AOM_ROOT}/test/dr_prediction_test.cc"
+                "${AOM_ROOT}/test/ec_test.cc"
+                "${AOM_ROOT}/test/ethread_test.cc"
+                "${AOM_ROOT}/test/film_grain_table_test.cc"
+                "${AOM_ROOT}/test/segment_binarization_sync.cc"
+                "${AOM_ROOT}/test/superframe_test.cc"
+                "${AOM_ROOT}/test/tile_independence_test.cc")
+  endif()
+
+  list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON
+              "${AOM_ROOT}/test/simd_cmp_neon.cc")
+  if(HAVE_NEON)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/simd_neon_test.cc")
+  endif()
+
+  list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2
+              "${AOM_ROOT}/test/simd_cmp_sse2.cc")
+  if(HAVE_SSE2)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/simd_sse2_test.cc")
+  endif()
+
+  list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSSE3
+              "${AOM_ROOT}/test/simd_cmp_ssse3.cc")
+  if(HAVE_SSSE3)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/simd_ssse3_test.cc")
+  endif()
+
+  if(HAVE_SSE4)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/simd_sse4_test.cc")
+  endif()
+
+  if(HAVE_SSE4_1)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/filterintra_test.cc")
+  endif()
+
+  list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_AVX2
+              "${AOM_ROOT}/test/simd_cmp_avx2.cc")
+  if(HAVE_AVX2)
+    list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+                "${AOM_ROOT}/test/simd_avx2_test.cc")
+  endif()
+
+  list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+              "${AOM_ROOT}/test/arf_freq_test.cc"
+              "${AOM_ROOT}/test/av1_convolve_2d_test.cc"
+              "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc"
+              "${AOM_ROOT}/test/av1_convolve_2d_test_util.h"
+              "${AOM_ROOT}/test/av1_fwd_txfm1d_test.cc"
+              "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc"
+              "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
+              "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
+              "${AOM_ROOT}/test/av1_round_shift_array_test.cc"
+              "${AOM_ROOT}/test/av1_wedge_utils_test.cc"
+              "${AOM_ROOT}/test/blend_a64_mask_1d_test.cc"
+              "${AOM_ROOT}/test/blend_a64_mask_test.cc"
+              "${AOM_ROOT}/test/comp_avg_pred_test.cc"
+              "${AOM_ROOT}/test/comp_avg_pred_test.h"
+              "${AOM_ROOT}/test/comp_mask_variance_test.cc"
+              "${AOM_ROOT}/test/encodetxb_test.cc"
+              "${AOM_ROOT}/test/error_block_test.cc"
+              "${AOM_ROOT}/test/fft_test.cc"
+              "${AOM_ROOT}/test/fwht4x4_test.cc"
+              "${AOM_ROOT}/test/masked_sad_test.cc"
+              "${AOM_ROOT}/test/masked_variance_test.cc"
+              "${AOM_ROOT}/test/motion_vector_test.cc"
+              "${AOM_ROOT}/test/noise_model_test.cc"
+              "${AOM_ROOT}/test/obmc_sad_test.cc"
+              "${AOM_ROOT}/test/obmc_variance_test.cc"
+              "${AOM_ROOT}/test/sad_test.cc"
+              "${AOM_ROOT}/test/subtract_test.cc"
+              "${AOM_ROOT}/test/reconinter_test.cc"
+              "${AOM_ROOT}/test/sum_squares_test.cc"
+              "${AOM_ROOT}/test/variance_test.cc")
+
+  list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
+              "${AOM_ROOT}/test/av1_highbd_iht_test.cc"
+              "${AOM_ROOT}/test/av1_quantize_test.cc"
+              "${AOM_ROOT}/test/corner_match_test.cc"
+              "${AOM_ROOT}/test/quantize_func_test.cc"
+              "${AOM_ROOT}/test/simd_cmp_sse4.cc")
+
+  if(HAVE_SSE4_1)
+    list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+                "${AOM_ROOT}/test/av1_convolve_scale_test.cc"
+                "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc"
+                "${AOM_ROOT}/test/intra_edge_test.cc"
+                "${AOM_ROOT}/test/warp_filter_test.cc"
+                "${AOM_ROOT}/test/warp_filter_test_util.cc"
+                "${AOM_ROOT}/test/warp_filter_test_util.h")
+  endif()
+
+  if(HAVE_SSE4_2)
+    list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES "${AOM_ROOT}/test/hash_test.cc")
+  endif()
+
+endif()
+
+if(ENABLE_TESTS)
   find_package(PythonInterp)
-  if (NOT PYTHONINTERP_FOUND)
-    message(WARNING "--- Unit tests disabled: Python not found.")
-    set(CONFIG_UNIT_TESTS 0)
-  endif ()
-
-  if (MSVC)
-    # Force static run time to avoid collisions with googletest.
+  if(NOT PYTHONINTERP_FOUND)
+    message(FATAL_ERROR
+              "--- Unit tests require Python, rerun cmake with "
+              "-DENABLE_TESTS=0 to avoid this error, or install Python and "
+              "make sure it's in your PATH.")
+  endif()
+
+  if(MSVC) # Force static run time to avoid collisions with googletest.
     include("${AOM_ROOT}/build/cmake/msvc_runtime.cmake")
-  endif ()
+  endif()
 
-  if (BUILD_SHARED_LIBS AND APPLE)
-    # Silence an RPATH warning.
+  if(BUILD_SHARED_LIBS AND APPLE) # Silence an RPATH warning.
     set(CMAKE_MACOSX_RPATH 1)
-  endif ()
+  endif()
 
   include_directories(
-    "${AOM_ROOT}/third_party/googletest/src/googletest/src"
     "${AOM_ROOT}/third_party/googletest/src/googletest/include")
 
-  if (AOM_DISABLE_GTEST_CMAKE)
+  if(AOM_DISABLE_GTEST_CMAKE)
     include_directories("${AOM_ROOT}/third_party/googletest/src/googletest")
-    add_library(gtest STATIC
+    add_library(
+      gtest
+      STATIC
       "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
-  else ()
+  else()
     add_subdirectory("${AOM_ROOT}/third_party/googletest/src/googletest"
                      EXCLUDE_FROM_ALL)
-  endif ()
-
-endif ()
-
-# Setup the targets for CONFIG_UNIT_TESTS. The libaom and app util targets must
-# exist before this function is called.
-function (setup_aom_test_targets)
+  endif()
+endif()
+
+# Setup testdata download targets, test build targets, and test run targets. The
+# libaom and app util targets must exist before this function is called.
+function(setup_aom_test_targets)
+
+  # TODO(tomfinegan): Build speed optimization. $AOM_UNIT_TEST_COMMON_SOURCES
+  # and $AOM_UNIT_TEST_ENCODER_SOURCES are very large. The build of test targets
+  # could be sped up (on multicore build machines) by compiling sources in each
+  # list into separate object library targets, and then linking them into
+  # test_libaom.
   add_library(test_aom_common OBJECT ${AOM_UNIT_TEST_COMMON_SOURCES})
   add_dependencies(test_aom_common aom)
 
-  if (CONFIG_AV1_DECODER)
+  if(CONFIG_AV1_DECODER)
     add_library(test_aom_decoder OBJECT ${AOM_UNIT_TEST_DECODER_SOURCES})
     add_dependencies(test_aom_decoder aom)
-  endif ()
+  endif()
 
-  if (CONFIG_AV1_ENCODER)
+  if(CONFIG_AV1_ENCODER)
     add_library(test_aom_encoder OBJECT ${AOM_UNIT_TEST_ENCODER_SOURCES})
     add_dependencies(test_aom_encoder aom)
-  endif ()
+  endif()
 
   add_executable(test_libaom ${AOM_UNIT_TEST_WRAPPER_SOURCES}
                  $<TARGET_OBJECTS:aom_common_app_util>
                  $<TARGET_OBJECTS:test_aom_common>)
-  set(AOM_APP_TARGETS ${AOM_APP_TARGETS} test_libaom)
+  list(APPEND AOM_APP_TARGETS test_libaom)
 
-  if (CONFIG_AV1_DECODER)
-    target_sources(test_libaom PRIVATE
-                   $<TARGET_OBJECTS:aom_decoder_app_util>
+  if(CONFIG_AV1_DECODER)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:aom_decoder_app_util>
                    $<TARGET_OBJECTS:test_aom_decoder>)
 
-    if (CONFIG_DECODE_PERF_TESTS AND CONFIG_WEBM_IO)
+    if(ENABLE_DECODE_PERF_TESTS AND CONFIG_WEBM_IO)
       target_sources(test_libaom PRIVATE ${AOM_DECODE_PERF_TEST_SOURCES})
-    endif ()
-  endif ()
+    endif()
+  endif()
 
-  if (CONFIG_AV1_ENCODER)
-    target_sources(test_libaom PRIVATE
-                   $<TARGET_OBJECTS:test_aom_encoder>
+  if(CONFIG_AV1_ENCODER)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:test_aom_encoder>
                    $<TARGET_OBJECTS:aom_encoder_app_util>)
 
-    if (CONFIG_ENCODE_PERF_TESTS)
+    if(ENABLE_ENCODE_PERF_TESTS)
       target_sources(test_libaom PRIVATE ${AOM_ENCODE_PERF_TEST_SOURCES})
-    endif ()
+    endif()
 
-    if (NOT BUILD_SHARED_LIBS)
-      add_executable(test_intra_pred_speed
-                     ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
+    if(NOT BUILD_SHARED_LIBS)
+      add_executable(test_intra_pred_speed ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
                      $<TARGET_OBJECTS:aom_common_app_util>)
-      target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE}
-                            aom gtest)
-      set(AOM_APP_TARGETS ${AOM_APP_TARGETS} test_intra_pred_speed)
-    endif ()
-  endif ()
+      target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom
+                            gtest)
+      list(APPEND AOM_APP_TARGETS test_intra_pred_speed)
+    endif()
+  endif()
 
   target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom gtest)
 
-  if (CONFIG_LIBYUV)
+  if(CONFIG_LIBYUV)
     target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:yuv>)
-  endif ()
-  if (CONFIG_WEBM_IO)
+  endif()
+  if(CONFIG_WEBM_IO)
     target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:webm>)
-  endif ()
-  if (HAVE_SSE2)
+  endif()
+  if(HAVE_SSE2)
     add_intrinsics_source_to_target("-msse2" "test_libaom"
                                     "AOM_UNIT_TEST_COMMON_INTRIN_SSE2")
-  endif ()
-  if (HAVE_SSSE3)
+  endif()
+  if(HAVE_SSSE3)
     add_intrinsics_source_to_target("-mssse3" "test_libaom"
                                     "AOM_UNIT_TEST_COMMON_INTRIN_SSSE3")
-  endif ()
-  if (HAVE_SSE4_1)
+  endif()
+  if(HAVE_SSE4_1)
     add_intrinsics_source_to_target("-msse4.1" "test_libaom"
                                     "AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1")
-    if (CONFIG_AV1_ENCODER)
-      if (AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1)
+    if(CONFIG_AV1_ENCODER)
+      if(AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1)
         add_intrinsics_source_to_target("-msse4.1" "test_libaom"
                                         "AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1")
-      endif ()
-    endif ()
-  endif ()
-  if (HAVE_AVX2)
+      endif()
+    endif()
+  endif()
+  if(HAVE_AVX2)
     add_intrinsics_source_to_target("-mavx2" "test_libaom"
                                     "AOM_UNIT_TEST_COMMON_INTRIN_AVX2")
-  endif ()
-  if (HAVE_NEON)
+  endif()
+  if(HAVE_NEON)
     add_intrinsics_source_to_target("${AOM_NEON_INTRIN_FLAG}" "test_libaom"
                                     "AOM_UNIT_TEST_COMMON_INTRIN_NEON")
-  endif ()
-
-  if (NOT ENABLE_IDE_TEST_HOSTING)
-    if (MSVC OR XCODE)
-      # Skip creation of test data download and test run targets when generating
-      # for Visual Studio and Xcode unless the user explicitly requests IDE test
-      # hosting. This is done to make build cycles in the IDE tolerable when the
-      # IDE command for build project is used to build AOM. Default behavior in
-      # IDEs is to build all targets, and the test run takes hours.
-      return ()
-    endif ()
-  endif ()
-
-  make_test_data_lists("${AOM_UNIT_TEST_DATA_LIST_FILE}"
-                       test_files test_file_checksums)
-  list(LENGTH test_files num_test_files)
-  list(LENGTH test_file_checksums num_test_file_checksums)
-
-  math(EXPR max_file_index "${num_test_files} - 1")
-  foreach (test_index RANGE ${max_file_index})
-    list(GET test_files ${test_index} test_file)
-    list(GET test_file_checksums ${test_index} test_file_checksum)
-    add_custom_target(testdata_${test_index}
-                      COMMAND ${CMAKE_COMMAND}
-                        -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
-                        -DAOM_ROOT="${AOM_ROOT}"
-                        -DAOM_TEST_FILE="${test_file}"
-                        -DAOM_TEST_CHECKSUM=${test_file_checksum}
-                        -P "${AOM_ROOT}/test/test_data_download_worker.cmake")
-    set(testdata_targets ${testdata_targets} testdata_${test_index})
-  endforeach ()
-
-  # Create a custom build target for running each test data download target.
-  add_custom_target(testdata)
-  add_dependencies(testdata ${testdata_targets})
-
-  # Pick a reasonable number of targets (this controls parallelization).
-  ProcessorCount(num_test_targets)
-  if (num_test_targets EQUAL 0)
-    # Just default to 10 targets when there's no processor count available.
-    set(num_test_targets 10)
-  endif ()
-
-  math(EXPR max_shard_index "${num_test_targets} - 1")
-  foreach (shard_index RANGE ${max_shard_index})
-    set(test_name "test_${shard_index}")
-    add_custom_target(${test_name}
-                      COMMAND ${CMAKE_COMMAND}
-                      -DGTEST_SHARD_INDEX=${shard_index}
-                      -DGTEST_TOTAL_SHARDS=${num_test_targets}
-                      -DTEST_LIBAOM=$<TARGET_FILE:test_libaom>
-                      -P "${AOM_ROOT}/test/test_runner.cmake"
-                      DEPENDS testdata test_libaom)
-    set(test_targets ${test_targets} ${test_name})
-  endforeach ()
-  add_custom_target(runtests)
-  add_dependencies(runtests ${test_targets})
+  endif()
+
+  if(ENABLE_TESTDATA)
+    make_test_data_lists("${AOM_UNIT_TEST_DATA_LIST_FILE}" test_files
+                         test_file_checksums)
+    list(LENGTH test_files num_test_files)
+    list(LENGTH test_file_checksums num_test_file_checksums)
+
+    math(EXPR max_file_index "${num_test_files} - 1")
+    foreach(test_index RANGE ${max_file_index})
+      list(GET test_files ${test_index} test_file)
+      list(GET test_file_checksums ${test_index} test_file_checksum)
+      add_custom_target(testdata_${test_index}
+                        COMMAND
+                          ${CMAKE_COMMAND} -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
+                          -DAOM_ROOT="${AOM_ROOT}"
+                          -DAOM_TEST_FILE="${test_file}"
+                          -DAOM_TEST_CHECKSUM=${test_file_checksum} -P
+                          "${AOM_ROOT}/test/test_data_download_worker.cmake")
+      list(APPEND testdata_targets testdata_${test_index})
+    endforeach()
+
+    # Create a custom build target for running each test data download target.
+    add_custom_target(testdata)
+    add_dependencies(testdata ${testdata_targets})
+
+    # Skip creation of test run targets when generating for Visual Studio and
+    # Xcode unless the user explicitly requests IDE test hosting. This is done
+    # to make build cycles in the IDE tolerable when the IDE command for build
+    # project is used to build AOM. Default behavior in IDEs is to build all
+    # targets, and the test run takes hours.
+    if(((NOT MSVC) AND (NOT XCODE)) OR ENABLE_IDE_TEST_HOSTING)
+
+      # Pick a reasonable number of targets (this controls parallelization).
+      processorcount(num_test_targets)
+      if(num_test_targets EQUAL 0) # Just default to 10 targets when there's no
+                                   # processor count available.
+        set(num_test_targets 10)
+      endif()
+
+      math(EXPR max_shard_index "${num_test_targets} - 1")
+      foreach(shard_index RANGE ${max_shard_index})
+        set(test_name "test_${shard_index}")
+        add_custom_target(${test_name}
+                          COMMAND ${CMAKE_COMMAND}
+                                  -DGTEST_SHARD_INDEX=${shard_index}
+                                  -DGTEST_TOTAL_SHARDS=${num_test_targets}
+                                  -DTEST_LIBAOM=$<TARGET_FILE:test_libaom> -P
+                                  "${AOM_ROOT}/test/test_runner.cmake"
+                          DEPENDS testdata test_libaom)
+        list(APPEND test_targets ${test_name})
+      endforeach()
+      add_custom_target(runtests)
+      add_dependencies(runtests ${test_targets})
+    endif()
+  endif()
+
+  # Collect all variables containing libaom test source files.
+  get_cmake_property(all_cmake_vars VARIABLES)
+  foreach(var ${all_cmake_vars})
+
+    # https://github.com/cheshirekow/cmake_format/issues/34
+# cmake-format: off
+    if (("${var}" MATCHES "_TEST_" AND NOT
+         "${var}" MATCHES
+         "_DATA_\|_CMAKE_\|INTRA_PRED\|_COMPILED\|_HOSTING\|_PERF_\|CODER_")
+        OR (CONFIG_AV1_ENCODER AND ENABLE_ENCODE_PERF_TESTS AND
+            "${var}" MATCHES "_ENCODE_PERF_TEST_")
+        OR (CONFIG_AV1_DECODER AND ENABLE_DECODE_PERF_TESTS AND
+            "${var}" MATCHES "_DECODE_PERF_TEST_")
+        OR (CONFIG_AV1_ENCODER AND "${var}" MATCHES "_TEST_ENCODER_")
+        OR (CONFIG_AV1_DECODER AND  "${var}" MATCHES "_TEST_DECODER_"))
+      list(APPEND aom_test_source_vars ${var})
+    endif()
+    # cmake-format:on
+  endforeach()
+
+  # Libaom_test_srcs.txt generation.
+  set(libaom_test_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_test_srcs.txt")
+  file(WRITE "${libaom_test_srcs_txt_file}"
+             "# This file is generated. DO NOT EDIT.\n")
+
+  # Static source file list first.
+  foreach(aom_test_source_var ${aom_test_source_vars})
+    foreach(file ${${aom_test_source_var}})
+      if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}")
+        string(REPLACE "${AOM_ROOT}/" "" file "${file}")
+        file(APPEND "${libaom_test_srcs_txt_file}" "${file}\n")
+      endif()
+    endforeach()
+  endforeach()
 
   set(AOM_APP_TARGETS ${AOM_APP_TARGETS} PARENT_SCOPE)
-endfunction ()
-
-endif ()  # AOM_TEST_TEST_CMAKE_
+endfunction()
diff --git a/third_party/aom/test/test.mk b/third_party/aom/test/test.mk
deleted file mode 100644
index e6b0c534c..000000000
--- a/third_party/aom/test/test.mk
+++ /dev/null
@@ -1,268 +0,0 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-LIBAOM_TEST_SRCS-yes += acm_random.h
-LIBAOM_TEST_SRCS-yes += clear_system_state.h
-LIBAOM_TEST_SRCS-yes += codec_factory.h
-LIBAOM_TEST_SRCS-yes += md5_helper.h
-LIBAOM_TEST_SRCS-yes += register_state_check.h
-LIBAOM_TEST_SRCS-yes += test.mk
-LIBAOM_TEST_SRCS-yes += test_libaom.cc
-LIBAOM_TEST_SRCS-yes += util.h
-LIBAOM_TEST_SRCS-yes += video_source.h
-LIBAOM_TEST_SRCS-yes += transform_test_base.h
-LIBAOM_TEST_SRCS-yes += function_equivalence_test.h
-
-##
-## BLACK BOX TESTS
-##
-## Black box tests only use the public API.
-##
-LIBAOM_TEST_SRCS-yes                   += ../md5_utils.h ../md5_utils.c
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_DECODER)    += ivf_video_source.h
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += ../y4minput.h ../y4minput.c
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += altref_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += aq_segment_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += datarate_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += encode_api_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += error_resilience_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += i420_video_source.h
-#LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += realtime_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += resize_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += y4m_video_source.h
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += yuv_video_source.h
-
-#LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += level_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += active_map_refresh_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += active_map_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += borders_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += cpu_speed_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += frame_size_tests.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += lossless_test.cc
-
-LIBAOM_TEST_SRCS-yes                   += decode_test_driver.cc
-LIBAOM_TEST_SRCS-yes                   += decode_test_driver.h
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += encode_test_driver.cc
-LIBAOM_TEST_SRCS-yes                   += encode_test_driver.h
-
-## IVF writing.
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += ../ivfenc.c ../ivfenc.h
-
-## Y4m parsing.
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER)    += y4m_test.cc ../y4menc.c ../y4menc.h
-
-## WebM Parsing
-ifeq ($(CONFIG_WEBM_IO), yes)
-LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.cc
-LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.cc
-LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.h
-LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.h
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_DECODER) += $(LIBWEBM_PARSER_SRCS)
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_DECODER) += ../tools_common.h
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_DECODER) += ../webmdec.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_DECODER) += ../webmdec.h
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_DECODER) += webm_video_source.h
-endif
-
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_DECODER) += decode_api_test.cc
-
-# Currently we only support decoder perf tests for av1. Also they read from WebM
-# files, so WebM IO is required.
-ifeq ($(CONFIG_DECODE_PERF_TESTS)$(CONFIG_AV1_DECODER)$(CONFIG_WEBM_IO), \
-      yesyesyes)
-LIBAOM_TEST_SRCS-yes                   += decode_perf_test.cc
-endif
-
-ifeq ($(CONFIG_ENCODE_PERF_TESTS)$(CONFIG_AV1_ENCODER), yesyes)
-LIBAOM_TEST_SRCS-yes += encode_perf_test.cc
-endif
-
-## Multi-codec / unconditional black box tests.
-ifeq ($(findstring yes,$(CONFIG_AV1_ENCODER)),yes)
-LIBAOM_TEST_SRCS-yes += active_map_refresh_test.cc
-LIBAOM_TEST_SRCS-yes += active_map_test.cc
-LIBAOM_TEST_SRCS-yes += end_to_end_test.cc
-endif
-
-##
-## WHITE BOX TESTS
-##
-## Whitebox tests invoke functions not exposed via the public API. Certain
-## shared library builds don't make these functions accessible.
-##
-ifeq ($(CONFIG_SHARED),)
-
-## AV1
-ifeq ($(CONFIG_AV1),yes)
-
-# These tests require both the encoder and decoder to be built.
-ifeq ($(CONFIG_AV1_ENCODER)$(CONFIG_AV1_DECODER),yesyes)
-# IDCT test currently depends on FDCT function
-LIBAOM_TEST_SRCS-yes                   += coding_path_sync.cc
-LIBAOM_TEST_SRCS-yes                   += idct8x8_test.cc
-LIBAOM_TEST_SRCS-yes                   += partial_idct_test.cc
-LIBAOM_TEST_SRCS-yes                   += superframe_test.cc
-LIBAOM_TEST_SRCS-yes                   += tile_independence_test.cc
-LIBAOM_TEST_SRCS-yes                   += ethread_test.cc
-LIBAOM_TEST_SRCS-yes                   += motion_vector_test.cc
-ifneq ($(CONFIG_ANS),yes)
-LIBAOM_TEST_SRCS-yes                   += binary_codes_test.cc
-endif
-ifeq ($(CONFIG_EXT_TILE),yes)
-LIBAOM_TEST_SRCS-yes                   += av1_ext_tile_test.cc
-endif
-ifeq ($(CONFIG_ANS),yes)
-LIBAOM_TEST_SRCS-yes                   += ans_test.cc
-LIBAOM_TEST_SRCS-yes                   += ans_codec_test.cc
-else
-LIBAOM_TEST_SRCS-yes                   += boolcoder_test.cc
-ifeq ($(CONFIG_ACCOUNTING),yes)
-LIBAOM_TEST_SRCS-yes                   += accounting_test.cc
-endif
-endif
-LIBAOM_TEST_SRCS-yes                   += divu_small_test.cc
-#LIBAOM_TEST_SRCS-yes                   += encoder_parms_get_to_decoder.cc
-endif
-
-LIBAOM_TEST_SRCS-$(CONFIG_ADAPT_SCAN)  += scan_test.cc
-LIBAOM_TEST_SRCS-yes                   += convolve_test.cc
-LIBAOM_TEST_SRCS-yes                   += lpf_8_test.cc
-ifeq ($(CONFIG_CDEF_SINGLEPASS),yes)
-LIBAOM_TEST_SRCS-$(CONFIG_CDEF)        += cdef_test.cc
-else
-LIBAOM_TEST_SRCS-$(CONFIG_CDEF)        += dering_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_CDEF)        += clpf_test.cc
-endif
-LIBAOM_TEST_SRCS-yes                   += simd_cmp_impl.h
-LIBAOM_TEST_SRCS-$(HAVE_SSE2)          += simd_cmp_sse2.cc
-LIBAOM_TEST_SRCS-$(HAVE_SSSE3)         += simd_cmp_ssse3.cc
-LIBAOM_TEST_SRCS-$(HAVE_SSE4_1)        += simd_cmp_sse4.cc
-LIBAOM_TEST_SRCS-$(HAVE_AVX2)          += simd_cmp_avx2.cc
-LIBAOM_TEST_SRCS-$(HAVE_NEON)          += simd_cmp_neon.cc
-LIBAOM_TEST_SRCS-yes                   += simd_impl.h
-LIBAOM_TEST_SRCS-$(HAVE_SSE2)          += simd_sse2_test.cc
-LIBAOM_TEST_SRCS-$(HAVE_SSSE3)         += simd_ssse3_test.cc
-LIBAOM_TEST_SRCS-$(HAVE_SSE4_1)        += simd_sse4_test.cc
-LIBAOM_TEST_SRCS-$(HAVE_AVX2)          += simd_avx2_test.cc
-LIBAOM_TEST_SRCS-$(HAVE_NEON)          += simd_neon_test.cc
-LIBAOM_TEST_SRCS-yes                   += intrapred_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_INTRABC)     += intrabc_test.cc
-#LIBAOM_TEST_SRCS-$(CONFIG_AV1_DECODER) += av1_thread_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += dct16x16_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += dct32x32_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += fdct4x4_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += fdct8x8_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += hadamard_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += minmax_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += variance_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += error_block_test.cc
-#LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_quantize_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += subtract_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += arf_freq_test.cc
-ifneq ($(CONFIG_NEW_QUANT), yes)
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += quantize_func_test.cc
-endif
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += block_error_test.cc
-
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_inv_txfm_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_dct_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht4x4_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht8x8_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x16_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht32x32_test.cc
-ifeq ($(CONFIG_TX64X64),yes)
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht64x64_test.cc
-endif
-ifeq ($(CONFIG_EXT_TX),yes)
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht4x8_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht8x4_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht8x16_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x8_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x32_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht32x16_test.cc
-endif
-
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += sum_squares_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += subtract_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += blend_a64_mask_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += blend_a64_mask_1d_test.cc
-
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += masked_variance_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += masked_sad_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_wedge_utils_test.cc
-
-## Skip the unit test written for 4-tap filter intra predictor, because we
-## revert to 3-tap filter.
-## ifeq ($(CONFIG_FILTER_INTRA),yes)
-## LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += filterintra_predictors_test.cc
-## endif
-
-ifeq ($(CONFIG_MOTION_VAR),yes)
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += obmc_sad_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += obmc_variance_test.cc
-endif
-
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += av1_quantize_test.cc
-LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += av1_highbd_iht_test.cc
-endif
-endif # CONFIG_HIGHBITDEPTH
-endif # AV1
-
-## Multi-codec / unconditional whitebox tests.
-
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-LIBAOM_TEST_SRCS-yes += avg_test.cc
-endif
-ifeq ($(CONFIG_INTERNAL_STATS),yes)
-LIBAOM_TEST_SRCS-$(CONFIG_HIGHBITDEPTH) += hbd_metrics_test.cc
-endif
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += sad_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_txfm_test.h
-LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_txfm_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fwd_txfm1d_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_inv_txfm1d_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fwd_txfm2d_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_inv_txfm2d_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_convolve_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_convolve_optimz_test.cc
-ifneq ($(findstring yes,$(CONFIG_GLOBAL_MOTION)$(CONFIG_WARPED_MOTION)),)
-LIBAOM_TEST_SRCS-$(HAVE_SSE2) += warp_filter_test_util.h
-LIBAOM_TEST_SRCS-$(HAVE_SSE2) += warp_filter_test.cc warp_filter_test_util.cc
-endif
-ifeq ($(CONFIG_LOOP_RESTORATION),yes)
-LIBAOM_TEST_SRCS-$(HAVE_SSE2) += hiprec_convolve_test_util.h
-LIBAOM_TEST_SRCS-$(HAVE_SSE2) += hiprec_convolve_test.cc
-LIBAOM_TEST_SRCS-$(HAVE_SSE2) += hiprec_convolve_test_util.cc
-LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += selfguided_filter_test.cc
-endif
-ifeq ($(CONFIG_CONVOLVE_ROUND),yes)
-LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test_util.h
-LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test.cc
-LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test_util.cc
-LIBAOM_TEST_SRCS-yes          += convolve_round_test.cc
-endif
-
-ifeq (yesx,$(CONFIG_CONVOLVE_ROUND)x$(CONFIG_COMPOUND_ROUND))
-LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += av1_convolve_scale_test.cc
-endif
-
-ifeq ($(CONFIG_GLOBAL_MOTION)$(CONFIG_AV1_ENCODER),yesyes)
-LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += corner_match_test.cc
-endif
-
-TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
-TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
-
-endif # CONFIG_SHARED
-
-include $(SRC_PATH_BARE)/test/test-data.mk
diff --git a/third_party/aom/test/test_data_download_worker.cmake b/third_party/aom/test/test_data_download_worker.cmake
index b252dd960..dc803497d 100644
--- a/third_party/aom/test/test_data_download_worker.cmake
+++ b/third_party/aom/test/test_data_download_worker.cmake
@@ -1,43 +1,46 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
 include("${AOM_ROOT}/test/test_data_util.cmake")
 
+# https://github.com/cheshirekow/cmake_format/issues/34
+# cmake-format: off
 if (NOT AOM_ROOT OR NOT AOM_CONFIG_DIR OR NOT AOM_TEST_FILE
     OR NOT AOM_TEST_CHECKSUM)
   message(FATAL_ERROR
           "AOM_ROOT, AOM_CONFIG_DIR, AOM_TEST_FILE and AOM_TEST_CHECKSUM must be
           defined.")
 endif ()
+# cmake-format: on
 
-set(AOM_TEST_DATA_URL
-    "https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx")
+set(AOM_TEST_DATA_URL "http://storage.googleapis.com/aom-test-data")
 
-if (NOT AOM_TEST_DATA_PATH)
+if(NOT AOM_TEST_DATA_PATH)
   set(AOM_TEST_DATA_PATH "$ENV{LIBAOM_TEST_DATA_PATH}")
-endif ()
+endif()
 
-if ("${AOM_TEST_DATA_PATH}" STREQUAL "")
-  message(WARNING "Writing test data to ${AOM_CONFIG_DIR}, set "
-          "$LIBAOM_TEST_DATA_PATH in your environment to avoid this warning.")
+if("${AOM_TEST_DATA_PATH}" STREQUAL "")
+  message(WARNING
+            "Writing test data to ${AOM_CONFIG_DIR}, set "
+            "$LIBAOM_TEST_DATA_PATH in your environment to avoid this warning.")
   set(AOM_TEST_DATA_PATH "${AOM_CONFIG_DIR}")
-endif ()
+endif()
 
-if (NOT EXISTS "${AOM_TEST_DATA_PATH}")
+if(NOT EXISTS "${AOM_TEST_DATA_PATH}")
   file(MAKE_DIRECTORY "${AOM_TEST_DATA_PATH}")
-endif ()
+endif()
 
 expand_test_file_paths("AOM_TEST_FILE" "${AOM_TEST_DATA_PATH}" "filepath")
 expand_test_file_paths("AOM_TEST_FILE" "${AOM_TEST_DATA_URL}" "url")
 
 check_file("${filepath}" "${AOM_TEST_CHECKSUM}" "needs_download")
-if (needs_download)
+if(needs_download)
   download_test_file("${url}" "${AOM_TEST_CHECKSUM}" "${filepath}")
-endif ()
+endif()
diff --git a/third_party/aom/test/test_data_util.cmake b/third_party/aom/test/test_data_util.cmake
index 3904734b5..bbdd5f4a2 100644
--- a/third_party/aom/test/test_data_util.cmake
+++ b/third_party/aom/test/test_data_util.cmake
@@ -1,68 +1,394 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-set(AOM_TEST_DATA_FILE_NAMES
-    "hantro_collage_w352h288.yuv"
-    "hantro_odd.yuv"
-    "park_joy_90p_10_420.y4m"
-    "park_joy_90p_10_422.y4m"
-    "park_joy_90p_10_444.y4m"
-    "park_joy_90p_10_440.yuv"
-    "park_joy_90p_12_420.y4m"
-    "park_joy_90p_12_422.y4m"
-    "park_joy_90p_12_444.y4m"
-    "park_joy_90p_12_440.yuv"
-    "park_joy_90p_8_420_a10-1.y4m"
-    "park_joy_90p_8_420.y4m"
-    "park_joy_90p_8_422.y4m"
-    "park_joy_90p_8_444.y4m"
-    "park_joy_90p_8_440.yuv"
-    "desktop_credits.y4m"
-    "niklas_1280_720_30.y4m"
-    "rush_hour_444.y4m"
-    "screendata.y4m"
-    "niklas_640_480_30.yuv")
-
-if (CONFIG_DECODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
-  set(AOM_TEST_DATA_FILE_NAMES
-      ${AOM_TEST_DATA_FILE_NAMES}
-      "niklas_1280_720_30.yuv")
-endif ()
-
-if (CONFIG_ENCODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
-  set(AOM_TEST_DATA_FILE_NAMES
-      ${AOM_TEST_DATA_FILE_NAMES}
-      "desktop_640_360_30.yuv"
-      "kirland_640_480_30.yuv"
-      "macmarcomoving_640_480_30.yuv"
-      "macmarcostationary_640_480_30.yuv"
-      "niklas_1280_720_30.yuv"
-      "tacomanarrows_640_480_30.yuv"
-      "tacomasmallcameramovement_640_480_30.yuv"
-      "thaloundeskmtg_640_480_30.yuv")
-endif ()
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+
+list(APPEND AOM_TEST_DATA_FILE_NAMES
+            "hantro_collage_w352h288.yuv"
+            "hantro_odd.yuv"
+            "invalid-bug-1814.ivf"
+            "invalid-bug-1814.ivf.res"
+            "park_joy_90p_10_420.y4m"
+            "park_joy_90p_10_422.y4m"
+            "park_joy_90p_10_444.y4m"
+            "park_joy_90p_12_420.y4m"
+            "park_joy_90p_12_422.y4m"
+            "park_joy_90p_12_444.y4m"
+            "park_joy_90p_8_420_a10-1.y4m"
+            "park_joy_90p_8_420.y4m"
+            "park_joy_90p_8_422.y4m"
+            "park_joy_90p_8_444.y4m"
+            "desktop_credits.y4m"
+            "niklas_1280_720_30.y4m"
+            "rush_hour_444.y4m"
+            "screendata.y4m"
+            "niklas_640_480_30.yuv"
+            "vase10x10.yuv")
+
+if(ENABLE_DECODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
+  list(APPEND AOM_TEST_DATA_FILE_NAMES "niklas_1280_720_30.yuv")
+endif()
+
+if(CONFIG_AV1_DECODER)
+  list(APPEND AOM_TEST_DATA_FILE_NAMES
+              "av1-1-b8-00-quantizer-00.ivf"
+              "av1-1-b8-00-quantizer-00.ivf.md5"
+              "av1-1-b8-00-quantizer-01.ivf"
+              "av1-1-b8-00-quantizer-01.ivf.md5"
+              "av1-1-b8-00-quantizer-02.ivf"
+              "av1-1-b8-00-quantizer-02.ivf.md5"
+              "av1-1-b8-00-quantizer-03.ivf"
+              "av1-1-b8-00-quantizer-03.ivf.md5"
+              "av1-1-b8-00-quantizer-04.ivf"
+              "av1-1-b8-00-quantizer-04.ivf.md5"
+              "av1-1-b8-00-quantizer-05.ivf"
+              "av1-1-b8-00-quantizer-05.ivf.md5"
+              "av1-1-b8-00-quantizer-06.ivf"
+              "av1-1-b8-00-quantizer-06.ivf.md5"
+              "av1-1-b8-00-quantizer-07.ivf"
+              "av1-1-b8-00-quantizer-07.ivf.md5"
+              "av1-1-b8-00-quantizer-08.ivf"
+              "av1-1-b8-00-quantizer-08.ivf.md5"
+              "av1-1-b8-00-quantizer-09.ivf"
+              "av1-1-b8-00-quantizer-09.ivf.md5"
+              "av1-1-b8-00-quantizer-10.ivf"
+              "av1-1-b8-00-quantizer-10.ivf.md5"
+              "av1-1-b8-00-quantizer-11.ivf"
+              "av1-1-b8-00-quantizer-11.ivf.md5"
+              "av1-1-b8-00-quantizer-12.ivf"
+              "av1-1-b8-00-quantizer-12.ivf.md5"
+              "av1-1-b8-00-quantizer-13.ivf"
+              "av1-1-b8-00-quantizer-13.ivf.md5"
+              "av1-1-b8-00-quantizer-14.ivf"
+              "av1-1-b8-00-quantizer-14.ivf.md5"
+              "av1-1-b8-00-quantizer-15.ivf"
+              "av1-1-b8-00-quantizer-15.ivf.md5"
+              "av1-1-b8-00-quantizer-16.ivf"
+              "av1-1-b8-00-quantizer-16.ivf.md5"
+              "av1-1-b8-00-quantizer-17.ivf"
+              "av1-1-b8-00-quantizer-17.ivf.md5"
+              "av1-1-b8-00-quantizer-18.ivf"
+              "av1-1-b8-00-quantizer-18.ivf.md5"
+              "av1-1-b8-00-quantizer-19.ivf"
+              "av1-1-b8-00-quantizer-19.ivf.md5"
+              "av1-1-b8-00-quantizer-20.ivf"
+              "av1-1-b8-00-quantizer-20.ivf.md5"
+              "av1-1-b8-00-quantizer-21.ivf"
+              "av1-1-b8-00-quantizer-21.ivf.md5"
+              "av1-1-b8-00-quantizer-22.ivf"
+              "av1-1-b8-00-quantizer-22.ivf.md5"
+              "av1-1-b8-00-quantizer-23.ivf"
+              "av1-1-b8-00-quantizer-23.ivf.md5"
+              "av1-1-b8-00-quantizer-24.ivf"
+              "av1-1-b8-00-quantizer-24.ivf.md5"
+              "av1-1-b8-00-quantizer-25.ivf"
+              "av1-1-b8-00-quantizer-25.ivf.md5"
+              "av1-1-b8-00-quantizer-26.ivf"
+              "av1-1-b8-00-quantizer-26.ivf.md5"
+              "av1-1-b8-00-quantizer-27.ivf"
+              "av1-1-b8-00-quantizer-27.ivf.md5"
+              "av1-1-b8-00-quantizer-28.ivf"
+              "av1-1-b8-00-quantizer-28.ivf.md5"
+              "av1-1-b8-00-quantizer-29.ivf"
+              "av1-1-b8-00-quantizer-29.ivf.md5"
+              "av1-1-b8-00-quantizer-30.ivf"
+              "av1-1-b8-00-quantizer-30.ivf.md5"
+              "av1-1-b8-00-quantizer-31.ivf"
+              "av1-1-b8-00-quantizer-31.ivf.md5"
+              "av1-1-b8-00-quantizer-32.ivf"
+              "av1-1-b8-00-quantizer-32.ivf.md5"
+              "av1-1-b8-00-quantizer-33.ivf"
+              "av1-1-b8-00-quantizer-33.ivf.md5"
+              "av1-1-b8-00-quantizer-34.ivf"
+              "av1-1-b8-00-quantizer-34.ivf.md5"
+              "av1-1-b8-00-quantizer-35.ivf"
+              "av1-1-b8-00-quantizer-35.ivf.md5"
+              "av1-1-b8-00-quantizer-36.ivf"
+              "av1-1-b8-00-quantizer-36.ivf.md5"
+              "av1-1-b8-00-quantizer-37.ivf"
+              "av1-1-b8-00-quantizer-37.ivf.md5"
+              "av1-1-b8-00-quantizer-38.ivf"
+              "av1-1-b8-00-quantizer-38.ivf.md5"
+              "av1-1-b8-00-quantizer-39.ivf"
+              "av1-1-b8-00-quantizer-39.ivf.md5"
+              "av1-1-b8-00-quantizer-40.ivf"
+              "av1-1-b8-00-quantizer-40.ivf.md5"
+              "av1-1-b8-00-quantizer-41.ivf"
+              "av1-1-b8-00-quantizer-41.ivf.md5"
+              "av1-1-b8-00-quantizer-42.ivf"
+              "av1-1-b8-00-quantizer-42.ivf.md5"
+              "av1-1-b8-00-quantizer-43.ivf"
+              "av1-1-b8-00-quantizer-43.ivf.md5"
+              "av1-1-b8-00-quantizer-44.ivf"
+              "av1-1-b8-00-quantizer-44.ivf.md5"
+              "av1-1-b8-00-quantizer-45.ivf"
+              "av1-1-b8-00-quantizer-45.ivf.md5"
+              "av1-1-b8-00-quantizer-46.ivf"
+              "av1-1-b8-00-quantizer-46.ivf.md5"
+              "av1-1-b8-00-quantizer-47.ivf"
+              "av1-1-b8-00-quantizer-47.ivf.md5"
+              "av1-1-b8-00-quantizer-48.ivf"
+              "av1-1-b8-00-quantizer-48.ivf.md5"
+              "av1-1-b8-00-quantizer-49.ivf"
+              "av1-1-b8-00-quantizer-49.ivf.md5"
+              "av1-1-b8-00-quantizer-50.ivf"
+              "av1-1-b8-00-quantizer-50.ivf.md5"
+              "av1-1-b8-00-quantizer-51.ivf"
+              "av1-1-b8-00-quantizer-51.ivf.md5"
+              "av1-1-b8-00-quantizer-52.ivf"
+              "av1-1-b8-00-quantizer-52.ivf.md5"
+              "av1-1-b8-00-quantizer-53.ivf"
+              "av1-1-b8-00-quantizer-53.ivf.md5"
+              "av1-1-b8-00-quantizer-54.ivf"
+              "av1-1-b8-00-quantizer-54.ivf.md5"
+              "av1-1-b8-00-quantizer-55.ivf"
+              "av1-1-b8-00-quantizer-55.ivf.md5"
+              "av1-1-b8-00-quantizer-56.ivf"
+              "av1-1-b8-00-quantizer-56.ivf.md5"
+              "av1-1-b8-00-quantizer-57.ivf"
+              "av1-1-b8-00-quantizer-57.ivf.md5"
+              "av1-1-b8-00-quantizer-58.ivf"
+              "av1-1-b8-00-quantizer-58.ivf.md5"
+              "av1-1-b8-00-quantizer-59.ivf"
+              "av1-1-b8-00-quantizer-59.ivf.md5"
+              "av1-1-b8-00-quantizer-60.ivf"
+              "av1-1-b8-00-quantizer-60.ivf.md5"
+              "av1-1-b8-00-quantizer-61.ivf"
+              "av1-1-b8-00-quantizer-61.ivf.md5"
+              "av1-1-b8-00-quantizer-62.ivf"
+              "av1-1-b8-00-quantizer-62.ivf.md5"
+              "av1-1-b8-00-quantizer-63.ivf"
+              "av1-1-b8-00-quantizer-63.ivf.md5"
+              "av1-1-b8-01-size-16x16.ivf"
+              "av1-1-b8-01-size-16x16.ivf.md5"
+              "av1-1-b8-01-size-16x18.ivf"
+              "av1-1-b8-01-size-16x18.ivf.md5"
+              "av1-1-b8-01-size-16x32.ivf"
+              "av1-1-b8-01-size-16x32.ivf.md5"
+              "av1-1-b8-01-size-16x34.ivf"
+              "av1-1-b8-01-size-16x34.ivf.md5"
+              "av1-1-b8-01-size-16x64.ivf"
+              "av1-1-b8-01-size-16x64.ivf.md5"
+              "av1-1-b8-01-size-16x66.ivf"
+              "av1-1-b8-01-size-16x66.ivf.md5"
+              "av1-1-b8-01-size-18x16.ivf"
+              "av1-1-b8-01-size-18x16.ivf.md5"
+              "av1-1-b8-01-size-18x18.ivf"
+              "av1-1-b8-01-size-18x18.ivf.md5"
+              "av1-1-b8-01-size-18x32.ivf"
+              "av1-1-b8-01-size-18x32.ivf.md5"
+              "av1-1-b8-01-size-18x34.ivf"
+              "av1-1-b8-01-size-18x34.ivf.md5"
+              "av1-1-b8-01-size-18x64.ivf"
+              "av1-1-b8-01-size-18x64.ivf.md5"
+              "av1-1-b8-01-size-18x66.ivf"
+              "av1-1-b8-01-size-18x66.ivf.md5"
+              "av1-1-b8-01-size-196x196.ivf"
+              "av1-1-b8-01-size-196x196.ivf.md5"
+              "av1-1-b8-01-size-196x198.ivf"
+              "av1-1-b8-01-size-196x198.ivf.md5"
+              "av1-1-b8-01-size-196x200.ivf"
+              "av1-1-b8-01-size-196x200.ivf.md5"
+              "av1-1-b8-01-size-196x202.ivf"
+              "av1-1-b8-01-size-196x202.ivf.md5"
+              "av1-1-b8-01-size-196x208.ivf"
+              "av1-1-b8-01-size-196x208.ivf.md5"
+              "av1-1-b8-01-size-196x210.ivf"
+              "av1-1-b8-01-size-196x210.ivf.md5"
+              "av1-1-b8-01-size-196x224.ivf"
+              "av1-1-b8-01-size-196x224.ivf.md5"
+              "av1-1-b8-01-size-196x226.ivf"
+              "av1-1-b8-01-size-196x226.ivf.md5"
+              "av1-1-b8-01-size-198x196.ivf"
+              "av1-1-b8-01-size-198x196.ivf.md5"
+              "av1-1-b8-01-size-198x198.ivf"
+              "av1-1-b8-01-size-198x198.ivf.md5"
+              "av1-1-b8-01-size-198x200.ivf"
+              "av1-1-b8-01-size-198x200.ivf.md5"
+              "av1-1-b8-01-size-198x202.ivf"
+              "av1-1-b8-01-size-198x202.ivf.md5"
+              "av1-1-b8-01-size-198x208.ivf"
+              "av1-1-b8-01-size-198x208.ivf.md5"
+              "av1-1-b8-01-size-198x210.ivf"
+              "av1-1-b8-01-size-198x210.ivf.md5"
+              "av1-1-b8-01-size-198x224.ivf"
+              "av1-1-b8-01-size-198x224.ivf.md5"
+              "av1-1-b8-01-size-198x226.ivf"
+              "av1-1-b8-01-size-198x226.ivf.md5"
+              "av1-1-b8-01-size-200x196.ivf"
+              "av1-1-b8-01-size-200x196.ivf.md5"
+              "av1-1-b8-01-size-200x198.ivf"
+              "av1-1-b8-01-size-200x198.ivf.md5"
+              "av1-1-b8-01-size-200x200.ivf"
+              "av1-1-b8-01-size-200x200.ivf.md5"
+              "av1-1-b8-01-size-200x202.ivf"
+              "av1-1-b8-01-size-200x202.ivf.md5"
+              "av1-1-b8-01-size-200x208.ivf"
+              "av1-1-b8-01-size-200x208.ivf.md5"
+              "av1-1-b8-01-size-200x210.ivf"
+              "av1-1-b8-01-size-200x210.ivf.md5"
+              "av1-1-b8-01-size-200x224.ivf"
+              "av1-1-b8-01-size-200x224.ivf.md5"
+              "av1-1-b8-01-size-200x226.ivf"
+              "av1-1-b8-01-size-200x226.ivf.md5"
+              "av1-1-b8-01-size-202x196.ivf"
+              "av1-1-b8-01-size-202x196.ivf.md5"
+              "av1-1-b8-01-size-202x198.ivf"
+              "av1-1-b8-01-size-202x198.ivf.md5"
+              "av1-1-b8-01-size-202x200.ivf"
+              "av1-1-b8-01-size-202x200.ivf.md5"
+              "av1-1-b8-01-size-202x202.ivf"
+              "av1-1-b8-01-size-202x202.ivf.md5"
+              "av1-1-b8-01-size-202x208.ivf"
+              "av1-1-b8-01-size-202x208.ivf.md5"
+              "av1-1-b8-01-size-202x210.ivf"
+              "av1-1-b8-01-size-202x210.ivf.md5"
+              "av1-1-b8-01-size-202x224.ivf"
+              "av1-1-b8-01-size-202x224.ivf.md5"
+              "av1-1-b8-01-size-202x226.ivf"
+              "av1-1-b8-01-size-202x226.ivf.md5"
+              "av1-1-b8-01-size-208x196.ivf"
+              "av1-1-b8-01-size-208x196.ivf.md5"
+              "av1-1-b8-01-size-208x198.ivf"
+              "av1-1-b8-01-size-208x198.ivf.md5"
+              "av1-1-b8-01-size-208x200.ivf"
+              "av1-1-b8-01-size-208x200.ivf.md5"
+              "av1-1-b8-01-size-208x202.ivf"
+              "av1-1-b8-01-size-208x202.ivf.md5"
+              "av1-1-b8-01-size-208x208.ivf"
+              "av1-1-b8-01-size-208x208.ivf.md5"
+              "av1-1-b8-01-size-208x210.ivf"
+              "av1-1-b8-01-size-208x210.ivf.md5"
+              "av1-1-b8-01-size-208x224.ivf"
+              "av1-1-b8-01-size-208x224.ivf.md5"
+              "av1-1-b8-01-size-208x226.ivf"
+              "av1-1-b8-01-size-208x226.ivf.md5"
+              "av1-1-b8-01-size-210x196.ivf"
+              "av1-1-b8-01-size-210x196.ivf.md5"
+              "av1-1-b8-01-size-210x198.ivf"
+              "av1-1-b8-01-size-210x198.ivf.md5"
+              "av1-1-b8-01-size-210x200.ivf"
+              "av1-1-b8-01-size-210x200.ivf.md5"
+              "av1-1-b8-01-size-210x202.ivf"
+              "av1-1-b8-01-size-210x202.ivf.md5"
+              "av1-1-b8-01-size-210x208.ivf"
+              "av1-1-b8-01-size-210x208.ivf.md5"
+              "av1-1-b8-01-size-210x210.ivf"
+              "av1-1-b8-01-size-210x210.ivf.md5"
+              "av1-1-b8-01-size-210x224.ivf"
+              "av1-1-b8-01-size-210x224.ivf.md5"
+              "av1-1-b8-01-size-210x226.ivf"
+              "av1-1-b8-01-size-210x226.ivf.md5"
+              "av1-1-b8-01-size-224x196.ivf"
+              "av1-1-b8-01-size-224x196.ivf.md5"
+              "av1-1-b8-01-size-224x198.ivf"
+              "av1-1-b8-01-size-224x198.ivf.md5"
+              "av1-1-b8-01-size-224x200.ivf"
+              "av1-1-b8-01-size-224x200.ivf.md5"
+              "av1-1-b8-01-size-224x202.ivf"
+              "av1-1-b8-01-size-224x202.ivf.md5"
+              "av1-1-b8-01-size-224x208.ivf"
+              "av1-1-b8-01-size-224x208.ivf.md5"
+              "av1-1-b8-01-size-224x210.ivf"
+              "av1-1-b8-01-size-224x210.ivf.md5"
+              "av1-1-b8-01-size-224x224.ivf"
+              "av1-1-b8-01-size-224x224.ivf.md5"
+              "av1-1-b8-01-size-224x226.ivf"
+              "av1-1-b8-01-size-224x226.ivf.md5"
+              "av1-1-b8-01-size-226x196.ivf"
+              "av1-1-b8-01-size-226x196.ivf.md5"
+              "av1-1-b8-01-size-226x198.ivf"
+              "av1-1-b8-01-size-226x198.ivf.md5"
+              "av1-1-b8-01-size-226x200.ivf"
+              "av1-1-b8-01-size-226x200.ivf.md5"
+              "av1-1-b8-01-size-226x202.ivf"
+              "av1-1-b8-01-size-226x202.ivf.md5"
+              "av1-1-b8-01-size-226x208.ivf"
+              "av1-1-b8-01-size-226x208.ivf.md5"
+              "av1-1-b8-01-size-226x210.ivf"
+              "av1-1-b8-01-size-226x210.ivf.md5"
+              "av1-1-b8-01-size-226x224.ivf"
+              "av1-1-b8-01-size-226x224.ivf.md5"
+              "av1-1-b8-01-size-226x226.ivf"
+              "av1-1-b8-01-size-226x226.ivf.md5"
+              "av1-1-b8-01-size-32x16.ivf"
+              "av1-1-b8-01-size-32x16.ivf.md5"
+              "av1-1-b8-01-size-32x18.ivf"
+              "av1-1-b8-01-size-32x18.ivf.md5"
+              "av1-1-b8-01-size-32x32.ivf"
+              "av1-1-b8-01-size-32x32.ivf.md5"
+              "av1-1-b8-01-size-32x34.ivf"
+              "av1-1-b8-01-size-32x34.ivf.md5"
+              "av1-1-b8-01-size-32x64.ivf"
+              "av1-1-b8-01-size-32x64.ivf.md5"
+              "av1-1-b8-01-size-32x66.ivf"
+              "av1-1-b8-01-size-32x66.ivf.md5"
+              "av1-1-b8-01-size-34x16.ivf"
+              "av1-1-b8-01-size-34x16.ivf.md5"
+              "av1-1-b8-01-size-34x18.ivf"
+              "av1-1-b8-01-size-34x18.ivf.md5"
+              "av1-1-b8-01-size-34x32.ivf"
+              "av1-1-b8-01-size-34x32.ivf.md5"
+              "av1-1-b8-01-size-34x34.ivf"
+              "av1-1-b8-01-size-34x34.ivf.md5"
+              "av1-1-b8-01-size-34x64.ivf"
+              "av1-1-b8-01-size-34x64.ivf.md5"
+              "av1-1-b8-01-size-34x66.ivf"
+              "av1-1-b8-01-size-34x66.ivf.md5"
+              "av1-1-b8-01-size-64x16.ivf"
+              "av1-1-b8-01-size-64x16.ivf.md5"
+              "av1-1-b8-01-size-64x18.ivf"
+              "av1-1-b8-01-size-64x18.ivf.md5"
+              "av1-1-b8-01-size-64x32.ivf"
+              "av1-1-b8-01-size-64x32.ivf.md5"
+              "av1-1-b8-01-size-64x34.ivf"
+              "av1-1-b8-01-size-64x34.ivf.md5"
+              "av1-1-b8-01-size-64x64.ivf"
+              "av1-1-b8-01-size-64x64.ivf.md5"
+              "av1-1-b8-01-size-64x66.ivf"
+              "av1-1-b8-01-size-64x66.ivf.md5"
+              "av1-1-b8-01-size-66x16.ivf"
+              "av1-1-b8-01-size-66x16.ivf.md5"
+              "av1-1-b8-01-size-66x18.ivf"
+              "av1-1-b8-01-size-66x18.ivf.md5"
+              "av1-1-b8-01-size-66x32.ivf"
+              "av1-1-b8-01-size-66x32.ivf.md5"
+              "av1-1-b8-01-size-66x34.ivf"
+              "av1-1-b8-01-size-66x34.ivf.md5"
+              "av1-1-b8-01-size-66x64.ivf"
+              "av1-1-b8-01-size-66x64.ivf.md5"
+              "av1-1-b8-01-size-66x66.ivf"
+              "av1-1-b8-01-size-66x66.ivf.md5")
+endif()
+
+if(ENABLE_ENCODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
+  list(APPEND AOM_TEST_DATA_FILE_NAMES "desktop_640_360_30.yuv"
+              "kirland_640_480_30.yuv" "macmarcomoving_640_480_30.yuv"
+              "macmarcostationary_640_480_30.yuv" "niklas_1280_720_30.yuv"
+              "tacomanarrows_640_480_30.yuv"
+              "tacomasmallcameramovement_640_480_30.yuv"
+              "thaloundeskmtg_640_480_30.yuv")
+endif()
 
 # Parses test/test-data.sha1 and writes captured file names and checksums to
 # $out_files and $out_checksums as lists.
-function (make_test_data_lists test_data_file out_files out_checksums)
-  if (NOT test_data_file OR NOT EXISTS "${test_data_file}")
+function(make_test_data_lists test_data_file out_files out_checksums)
+  if(NOT test_data_file OR NOT EXISTS "${test_data_file}")
     message(FATAL_ERROR "Test info file missing or empty (${test_data_file})")
-  endif ()
+  endif()
 
   # Read $test_data_file into $files_and_checksums. $files_and_checksums becomes
   # a list with an entry for each line from $test_data_file.
   file(STRINGS "${test_data_file}" files_and_checksums)
 
   # Iterate over the list of lines and split it into $checksums and $filenames.
-  foreach (line ${files_and_checksums})
+  foreach(line ${files_and_checksums})
     string(FIND "${line}" " *" delim_pos)
 
     math(EXPR filename_pos "${delim_pos} + 2")
@@ -70,55 +396,55 @@ function (make_test_data_lists test_data_file out_files out_checksums)
     string(SUBSTRING "${line}" ${filename_pos} -1 filename)
 
     list(FIND AOM_TEST_DATA_FILE_NAMES ${filename} list_index)
-    if (NOT ${list_index} EQUAL -1)
+    if(NOT ${list_index} EQUAL -1)
+
       # Include the name and checksum in output only when the file is needed.
       set(checksums ${checksums} ${checksum})
       set(filenames ${filenames} ${filename})
-    endif ()
-  endforeach ()
+    endif()
+  endforeach()
 
   list(LENGTH filenames num_files)
   list(LENGTH checksums num_checksums)
-  if (NOT checksums OR NOT filenames OR NOT num_files EQUAL num_checksums)
+  if(NOT checksums OR NOT filenames OR NOT num_files EQUAL num_checksums)
     message(FATAL_ERROR "Parsing of ${test_data_file} failed.")
-  endif ()
+  endif()
 
   set(${out_checksums} ${checksums} PARENT_SCOPE)
   set(${out_files} ${filenames} PARENT_SCOPE)
-endfunction ()
+endfunction()
 
 # Appends each file name in $test_files to $test_dir and adds the result path to
 # $out_path_list.
-function (expand_test_file_paths test_files test_dir out_path_list)
-  foreach (filename ${${test_files}})
+function(expand_test_file_paths test_files test_dir out_path_list)
+  foreach(filename ${${test_files}})
     set(path_list ${path_list} "${test_dir}/${filename}")
-  endforeach ()
+  endforeach()
   set(${out_path_list} ${path_list} PARENT_SCOPE)
-endfunction ()
+endfunction()
 
-function (check_file local_path expected_checksum out_needs_update)
-  if (EXISTS "${local_path}")
+function(check_file local_path expected_checksum out_needs_update)
+  if(EXISTS "${local_path}")
     file(SHA1 "${local_path}" file_checksum)
-  else ()
+  else()
     set(${out_needs_update} 1 PARENT_SCOPE)
-    return ()
-  endif ()
+    return()
+  endif()
 
-  if ("${file_checksum}" STREQUAL "${expected_checksum}")
+  if("${file_checksum}" STREQUAL "${expected_checksum}")
     unset(${out_needs_update} PARENT_SCOPE)
-  else ()
+  else()
     set(${out_needs_update} 1 PARENT_SCOPE)
-    return ()
-  endif ()
+    return()
+  endif()
   message("${local_path} up to date.")
-endfunction ()
+endfunction()
 
 # Downloads data from $file_url, confirms that $file_checksum matches, and
 # writes it to $local_path.
-function (download_test_file file_url file_checksum local_path)
+function(download_test_file file_url file_checksum local_path)
   message("Downloading ${file_url} ...")
-  file(DOWNLOAD "${file_url}" "${local_path}"
-       SHOW_PROGRESS
+  file(DOWNLOAD "${file_url}" "${local_path}" SHOW_PROGRESS
        EXPECTED_HASH SHA1=${file_checksum})
   message("Download of ${file_url} complete.")
-endfunction ()
+endfunction()
diff --git a/third_party/aom/test/test_intra_pred_speed.cc b/third_party/aom/test/test_intra_pred_speed.cc
index 25289446f..b72ac1167 100644
--- a/third_party/aom/test/test_intra_pred_speed.cc
+++ b/third_party/aom/test/test_intra_pred_speed.cc
@@ -7,54 +7,68 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 //  Test and time AOM intra-predictor functions
 
 #include <stdio.h>
-#include <string.h>
+#include <string>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/md5_helper.h"
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/aom_timer.h"
+#include "av1/common/common_data.h"
 
 // -----------------------------------------------------------------------------
 
 namespace {
 
+// Note:
+// APPLY_UNIT_TESTS
+// 1: Do unit tests
+// 0: Generate MD5 array as required
+#define APPLY_UNIT_TESTS 1
+
 typedef void (*AvxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left);
 
-const int kBPS = 32;
+const int kBPS = 64;
 const int kTotalPixels = kBPS * kBPS;
-const int kNumAv1IntraFuncs = INTRA_MODES + 3;  // 4 DC predictor variants.
+// 4 DC variants, V, H, PAETH, SMOOTH, SMOOTH_V, SMOOTH_H
+const int kNumAv1IntraFuncs = 10;
+
+#if APPLY_UNIT_TESTS
 const char *kAv1IntraPredNames[kNumAv1IntraFuncs] = {
-  "DC_PRED",       "DC_LEFT_PRED",  "DC_TOP_PRED", "DC_128_PRED", "V_PRED",
-  "H_PRED",        "D45_PRED",      "D135_PRED",   "D117_PRED",   "D153_PRED",
-  "D207_PRED",     "D63_PRED",      "TM_PRED",     "SMOOTH_PRED",
-#if CONFIG_SMOOTH_HV
-  "SMOOTH_V_PRED", "SMOOTH_H_PRED",
-#endif  // CONFIG_SMOOTH_HV
+  "DC_PRED", "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED",   "V_PRED",
+  "H_PRED",  "PAETH_PRED",   "SMOOTH_PRED", "SMOOTH_V_PRED", "SMOOTH_H_PRED",
 };
+#endif  // APPLY_UNIT_TESTS
 
 template <typename Pixel>
 struct IntraPredTestMem {
-  void Init(int block_width, int bd) {
+  void Init(int block_width, int block_height, int bd) {
+    ASSERT_LE(block_width, kBPS);
+    ASSERT_LE(block_height, kBPS);
+    // Note: for blocks having width <= 32 and height <= 32, we generate 32x32
+    // random pixels as before to avoid having to recalculate all hashes again.
+    const int block_size_upto_32 = (block_width <= 32) && (block_height <= 32);
+    stride = block_size_upto_32 ? 32 : kBPS;
+    num_pixels = stride * stride;
     libaom_test::ACMRandom rnd(libaom_test::ACMRandom::DeterministicSeed());
-    Pixel *const above = above_mem + 16;
+    above = above_mem + 16;
     const int mask = (1 << bd) - 1;
-    for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand16() & mask;
-    for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask;
-    for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask;
+    for (int i = 0; i < num_pixels; ++i) ref_src[i] = rnd.Rand16() & mask;
+    for (int i = 0; i < stride; ++i) left[i] = rnd.Rand16() & mask;
+    for (int i = -1; i < stride; ++i) above[i] = rnd.Rand16() & mask;
 
-    ASSERT_LE(block_width, kBPS);
-    for (int i = kBPS; i < 2 * kBPS; ++i) {
+    for (int i = stride; i < 2 * stride; ++i) {
       left[i] = rnd.Rand16() & mask;
       above[i] = rnd.Rand16() & mask;
     }
@@ -63,6 +77,11 @@ struct IntraPredTestMem {
   DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]);
   DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]);
   DECLARE_ALIGNED(16, Pixel, left[2 * kBPS]);
+  Pixel *above;
+  int stride;
+  int num_pixels;
+
+ private:
   DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]);
 };
 
@@ -71,36 +90,40 @@ struct IntraPredTestMem {
 
 typedef IntraPredTestMem<uint8_t> Av1IntraPredTestMem;
 
-// Note:
-// APPLY_UNIT_TESTS
-// 1: Do unit tests
-// 0: Generate MD5 array as required
-#define APPLY_UNIT_TESTS 1
+static const char *const kTxSizeStrings[TX_SIZES_ALL] = {
+  "4X4",  "8X8",  "16X16", "32X32", "64X64", "4X8",   "8X4",
+  "8X16", "16X8", "16X32", "32X16", "32X64", "64X32", "4X16",
+  "16X4", "8X32", "32X8",  "16X64", "64X16",
+};
 
-void CheckMd5Signature(const char name[], const char *const signatures[],
-                       const void *data, size_t data_size, int elapsed_time,
-                       int idx) {
+void CheckMd5Signature(TX_SIZE tx_size, bool is_hbd,
+                       const char *const signatures[], const void *data,
+                       size_t data_size, int elapsed_time, int idx) {
+  const std::string hbd_str = is_hbd ? "Hbd " : "";
+  const std::string name_str = hbd_str + "Intra" + kTxSizeStrings[tx_size];
   libaom_test::MD5 md5;
   md5.Add(reinterpret_cast<const uint8_t *>(data), data_size);
 #if APPLY_UNIT_TESTS
-  printf("Mode %s[%13s]: %5d ms     MD5: %s\n", name, kAv1IntraPredNames[idx],
-         elapsed_time, md5.Get());
+  printf("Mode %s[%13s]: %5d ms     MD5: %s\n", name_str.c_str(),
+         kAv1IntraPredNames[idx], elapsed_time, md5.Get());
   EXPECT_STREQ(signatures[idx], md5.Get());
 #else
+  (void)signatures;
+  (void)elapsed_time;
+  (void)idx;
   printf("\"%s\",\n", md5.Get());
 #endif
 }
 
-void TestIntraPred(const char name[], AvxPredFunc const *pred_funcs,
-                   const char *const signatures[], int block_width,
-                   int block_height) {
+void TestIntraPred(TX_SIZE tx_size, AvxPredFunc const *pred_funcs,
+                   const char *const signatures[]) {
+  const int block_width = tx_size_wide[tx_size];
+  const int block_height = tx_size_high[tx_size];
   const int num_pixels_per_test =
       block_width * block_height * kNumAv1IntraFuncs;
   const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
   Av1IntraPredTestMem intra_pred_test_mem;
-  const uint8_t *const above = intra_pred_test_mem.above_mem + 16;
-
-  intra_pred_test_mem.Init(block_width, 8);
+  intra_pred_test_mem.Init(block_width, block_height, 8);
 
   for (int k = 0; k < kNumAv1IntraFuncs; ++k) {
     if (pred_funcs[k] == NULL) continue;
@@ -109,715 +132,754 @@ void TestIntraPred(const char name[], AvxPredFunc const *pred_funcs,
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
     for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
-      pred_funcs[k](intra_pred_test_mem.src, kBPS, above,
-                    intra_pred_test_mem.left);
+      pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
+                    intra_pred_test_mem.above, intra_pred_test_mem.left);
     }
     libaom_test::ClearSystemState();
     aom_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
-    CheckMd5Signature(name, signatures, intra_pred_test_mem.src,
-                      sizeof(intra_pred_test_mem.src), elapsed_time, k);
-  }
-}
-
-void TestIntraPred4(const char *block_name, AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures4x4[kNumAv1IntraFuncs] = {
-    "e7ed7353c3383fff942e500e9bfe82fe",
-    "2a4a26fcc6ce005eadc08354d196c8a9",
-    "269d92eff86f315d9c38fe7640d85b15",
-    "ae2960eea9f71ee3dabe08b282ec1773",
-    "6c1abcc44e90148998b51acd11144e9c",
-    "f7bb3186e1ef8a2b326037ff898cad8e",
-    "87e72798518d62e84bcc77dcb17d0f3b",
-    "141624072a4a56773f68fadbdd07c4a7",
-    "7be49b08687a5f24df3a2c612fca3876",
-    "459bb5d9fd5b238348179c9a22108cd6",
-    "3d98810f418a9de92acfe2c68909c61c",
-    "6310eecda3cc9496987ca10186255558",
-    "59fc0e923a08cfac0a493fb38988e2bb",
-    "9ff8bb37d9c830e6ab8ecb0c435d3c91",
-#if CONFIG_SMOOTH_HV
-    "de6937fca02354f2874dbc5dbec5d5b3",
-    "723cf948137f7d8c7860d814e55ae67d",
-#endif  // CONFIG_SMOOTH_HV
-  };
-  static const char *const kSignatures4x8[kNumAv1IntraFuncs] = {
-    "d9fbebdc85f71ab1e18461b2db4a2adc",
-    "5ccb2a68284bc9714d94b8a06ccadbb2",
-    "735d059abc2744f3ff3f9590f7191b37",
-    "d9fbebdc85f71ab1e18461b2db4a2adc",
-    "6819497c44cd0ace120add83672996ee",
-    "7e3244f5a2d3edf81c7e962a842b97f9",
-    "3fa52ee9acf5a25594cac684be263f32",
-    "c18dd23d57def4df4c6147c572dfc827",
-    "d007fbf7e43cb8f49702daa20f0c9153",
-    "5c0226c44c5df285728296b80cc6de4b",
-    "b55d7b558bebc8c2042dfac58b3c4688",
-    "6549362baa389b8faa2d954926b64e2f",
-    "809350f164cd4d1650850bb0f59c3260",
-    "1b60a394331eeab6927a6f8aaff57040",
-#if CONFIG_SMOOTH_HV
-    "5307de1bd7329ba6b281d2c1b0b457f9",
-    "24c58a8138339846d95568efb91751db",
-#endif
-  };
-  if (!strcmp(block_name, "intra4x4")) {
-    TestIntraPred(block_name, pred_funcs, kSignatures4x4, 4, 4);
-  }
-  if (!strcmp(block_name, "intra4x8")) {
-    TestIntraPred(block_name, pred_funcs, kSignatures4x8, 4, 8);
-  }
-}
-
-void TestIntraPred8(const char *block_name, AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures8x8[kNumAv1IntraFuncs] = {
-    "d8bbae5d6547cfc17e4f5f44c8730e88",
-    "373bab6d931868d41a601d9d88ce9ac3",
-    "6fdd5ff4ff79656c14747598ca9e3706",
-    "d9661c2811d6a73674f40ffb2b841847",
-    "7c722d10b19ccff0b8c171868e747385",
-    "f81dd986eb2b50f750d3a7da716b7e27",
-    "e0b1292448f3350bf1c92ca283ca872a",
-    "0e3523f9cab2142dd37fd07ec0760bce",
-    "79ac4efe907f0a0f1885d43066cfedee",
-    "19ecf2432ac305057de3b6578474eec6",
-    "7ae38292cbe47b4aa0807c3bd5a543df",
-    "d0ecffec1bb01f4b61ab5738164695c4",
-    "064404361748dd111a890a1470d7f0ea",
-    "dc29b7e1f78cc8e7525d5ea4c0ab9b78",
-#if CONFIG_SMOOTH_HV
-    "97111eb1bc26bade6272015df829f1ae",
-    "d19a8a73cc46b807f2c5e817576cc1e1",
-#endif  // CONFIG_SMOOTH_HV
-  };
-  static const char *const kSignatures8x4[kNumAv1IntraFuncs] = {
-    "23f9fc11344426c9bee2e06d57dfd628",
-    "2d71a26d1bae1fb34734de7b42fc5eb7",
-    "5af9c1b2fd9d5721fad67b67b3f7c816",
-    "00d71b17be662753813d515f197d145e",
-    "bef10ec984427e28f4390f43809d10af",
-    "77773cdfb7ed6bc882ab202a64b0a470",
-    "cba356970f6b9a1b6024e1dbe4a66f9b",
-    "c58c21efc804242848e6f29a93a7984d",
-    "dc92cc45a51c7a397506cab19f74e66d",
-    "391f6a12224f81a3719ea09a2cf7a5ad",
-    "b74b8b11f7eb2bbf723b25f381104ca9",
-    "2234aaa06ca245624211cf53a0261017",
-    "2cc48bd66d6b0121b5221d52ccd732af",
-    "b302155e1c9eeeafe2ba2bf68e807a46",
-#if CONFIG_SMOOTH_HV
-    "561bc8d0e76d5041ebd5168fc6a115e1",
-    "81d0113fb1d0a9a24ffd6f1987b77948",
-#endif
-  };
-  static const char *const kSignatures8x16[kNumAv1IntraFuncs] = {
-    "c849de88b24f773dfcdd1d48d1209796",
-    "6cb807c1897b94866a0f3d3c56ed8695",
-    "d56db05a8ac7981762f5b877f486c4ef",
-    "b4bc01eb6e59a40922ad17715cafb04b",
-    "09d178439534f4062ae687c351f66d64",
-    "644501399cf73080ac606e5cef7ca09b",
-    "0e8e968fa177204d7e73d7e04ce69ebb",
-    "1d25f9287fdf7ba48a5105f1529b7e75",
-    "02cacccf3752451763a6a6e2e784494f",
-    "6044a1416d53e324ddc012d2e7763339",
-    "57ac6e8f3ab5e943c9280043eeb174b8",
-    "d51b9d65471194d9caebc7d67e75ef10",
-    "278076495180e17c065a95ab7278539a",
-    "9dd7f324816f242be408ffeb0c673732",
-#if CONFIG_SMOOTH_HV
-    "f520c4a20acfa0bea1d253c6f0f040fd",
-    "85f38df809df2c2d7c8b4a157a65cd44",
-#endif
-  };
-  if (!strcmp(block_name, "intra8x8")) {
-    TestIntraPred(block_name, pred_funcs, kSignatures8x8, 8, 8);
-  }
-  if (!strcmp(block_name, "intra8x4")) {
-    TestIntraPred(block_name, pred_funcs, kSignatures8x4, 8, 4);
-  }
-  if (!strcmp(block_name, "intra8x16")) {
-    TestIntraPred(block_name, pred_funcs, kSignatures8x16, 8, 16);
-  }
-}
-
-void TestIntraPred16(const char *block_name, AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures16x16[kNumAv1IntraFuncs] = {
-    "50971c07ce26977d30298538fffec619",
-    "527a6b9e0dc5b21b98cf276305432bef",
-    "7eff2868f80ebc2c43a4f367281d80f7",
-    "67cd60512b54964ef6aff1bd4816d922",
-    "48371c87dc95c08a33b2048f89cf6468",
-    "b0acf2872ee411d7530af6d2625a7084",
-    "31d901ab2289d1e61e704e40240382a7",
-    "dae208f3dca583529cff49b73f7c4183",
-    "7af66a2f4c8e0b4908e40f047e60c47c",
-    "125e3ab6ab9bc961f183ec366a7afa88",
-    "ff230677e800977757d14b85a9eba404",
-    "eb42dc39140515dd4f3ab1afe6c3e71b",
-    "93d6b5352b571805ab16a55e1bbed86a",
-    "03764e4c0aebbc180e4e2c68fb06df2b",
-#if CONFIG_SMOOTH_HV
-    "bb6c74c9076c9f266ab11fb57060d8e6",
-    "0c5162bc28489756ddb847b5678e6f07",
-#endif  // CONFIG_SMOOTH_HV
-  };
-  static const char *const kSignatures16x8[kNumAv1IntraFuncs] = {
-    "b4cbdbdf10ce13300b4063a3daf99e04",
-    "3731e1e6202064a9d0604d7c293ecee4",
-    "6c856188c4256a06452f0d5d70cac436",
-    "1f2192b4c8c497589484ea7bf9c944e8",
-    "84011bd4b7f565119d06787840e333a0",
-    "0e48949f7a6aa36f0d76b5d01f91124a",
-    "58114c06f6b9d8285e5020c7afd834ab",
-    "e37afe84a8b3c5e0f048d4652ecbe09e",
-    "c216348473fb029b45f8fb4f2862a7bd",
-    "0b7385155dcef742cc456d5741ae93a3",
-    "d55fadb221f0ea20266e57cd413e7b94",
-    "9bd6eb226c7e169b8d53cf70aea98b3a",
-    "60eff8064634b6c73b10681356baeee9",
-    "1559aeb081a9c0c71111d6093c2ff9fd",
-#if CONFIG_SMOOTH_HV
-    "c15479b739713773e5cabb748451987b",
-    "72e33ec12c9b67aea26d8d005fb82de2",
-#endif
-  };
-  static const char *const kSignatures16x32[kNumAv1IntraFuncs] = {
-    "abe5233d189cdbf79424721571bbaa7b",
-    "282759f81e3cfb2e2d396fe406b72a8b",
-    "e2224926c264f6f174cbc3167a233168",
-    "6814e85c2b33f8c9415d62e80394b47b",
-    "99cbbb60459c08a3061d72c4e4f6276a",
-    "1d1567d40b8e816f8c1f71e576fe0f87",
-    "5e989f9c748a0d2cd8c4ebf9d3fe1278",
-    "7135a2f419452a3a192a35156f68b019",
-    "06e10af5a726d2c81b8f8c708204f9fb",
-    "c0882f0e7ba1ffa0aeef6d5c751df6de",
-    "8477429e17d39a423f30e2082f651549",
-    "ba35068a30c2d1d10901e4bfabd02a11",
-    "36fdd371b624a075814d497c4832ec85",
-    "8ab8da61b727442b6ff692b40d0df018",
-#if CONFIG_SMOOTH_HV
-    "e35a10ad7fdf2327e821504a90f6a6eb",
-    "1f7211e727dc1de7d6a55d082fbdd821",
-#endif
-  };
-  if (!strcmp(block_name, "intra16x16")) {
-    TestIntraPred(block_name, pred_funcs, kSignatures16x16, 16, 16);
-  }
-  if (!strcmp(block_name, "intra16x8")) {
-    TestIntraPred(block_name, pred_funcs, kSignatures16x8, 16, 8);
-  }
-  if (!strcmp(block_name, "intra16x32")) {
-    TestIntraPred(block_name, pred_funcs, kSignatures16x32, 16, 32);
+    CheckMd5Signature(
+        tx_size, false, signatures, intra_pred_test_mem.src,
+        intra_pred_test_mem.num_pixels * sizeof(*intra_pred_test_mem.src),
+        elapsed_time, k);
   }
 }
 
-void TestIntraPred32(const char *block_name, AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures32x32[kNumAv1IntraFuncs] = {
-    "a0a618c900e65ae521ccc8af789729f2",
-    "985aaa7c72b4a6c2fb431d32100cf13a",
-    "10662d09febc3ca13ee4e700120daeb5",
-    "b3b01379ba08916ef6b1b35f7d9ad51c",
-    "9f4261755795af97e34679c333ec7004",
-    "bc2c9da91ad97ef0d1610fb0a9041657",
-    "f524b1a7e31c7bb9bfb2487fac3e16d8",
-    "4039bb7da0f6860090d3c57b5c85468f",
-    "b29fff7b61804e68383e3a609b33da58",
-    "e1aa5e49067fd8dba66c2eb8d07b7a89",
-    "db217e7891581cf93895ef5974bebb21",
-    "beb6cdc52b52c8976b4d2407ec8d2313",
-    "ef1653982b69e1f64bee3759f3e1ec45",
-    "1a51a675deba2c83282142eb48d3dc3d",
-#if CONFIG_SMOOTH_HV
-    "866c224746dc260cda861a7b1b383fb3",
-    "cea23799fc3526e1b6a6ff02b42b82af",
-#endif  // CONFIG_SMOOTH_HV
-  };
-  static const char *const kSignatures32x16[kNumAv1IntraFuncs] = {
-    "d1aeb8d5fdcfd3307922af01a798a4dc",
-    "b0bcb514ebfbee065faea9d34c12ae75",
-    "d6a18c63b4e909871c0137ca652fad23",
-    "fd047f2fc1b8ffb95d0eeef3e8796a45",
-    "645ab60779ea348fd93c81561c31bab9",
-    "4409633c9db8dff41ade4292a3a56e7f",
-    "b9b2935b2287a9a461ac5c11251ac706",
-    "43b05f808c0ac4fe8accd84d293b0488",
-    "1d2cb43872d20c205ffb185102bcd22a",
-    "2c1551b5e99592fd21053b5d14e397d9",
-    "cd499ef0dd41e2e38d5dac3319dfdd97",
-    "cd2610426637003f3b5d3984cb3320d5",
-    "5e36a11e069b31c2a739f3a9c7b37c24",
-    "e83b9483d702cfae496991c3c7fa92c0",
-#if CONFIG_SMOOTH_HV
-    "12f6ddf98c7f30a277307f1ea935b030",
-    "354321d6c32bbdb0739e4fa2acbf41e1",
-#endif
-  };
-  if (!strcmp(block_name, "intra32x32")) {
-    TestIntraPred(block_name, pred_funcs, kSignatures32x32, 32, 32);
-  }
-  if (!strcmp(block_name, "intra32x16")) {
-    TestIntraPred(block_name, pred_funcs, kSignatures32x16, 32, 16);
-  }
-}
+static const char *const kSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
+  {
+      // 4X4
+      "e7ed7353c3383fff942e500e9bfe82fe",
+      "2a4a26fcc6ce005eadc08354d196c8a9",
+      "269d92eff86f315d9c38fe7640d85b15",
+      "ae2960eea9f71ee3dabe08b282ec1773",
+      "6c1abcc44e90148998b51acd11144e9c",
+      "f7bb3186e1ef8a2b326037ff898cad8e",
+      "59fc0e923a08cfac0a493fb38988e2bb",
+      "9ff8bb37d9c830e6ab8ecb0c435d3c91",
+      "de6937fca02354f2874dbc5dbec5d5b3",
+      "723cf948137f7d8c7860d814e55ae67d",
+  },
+  {
+      // 8X8
+      "d8bbae5d6547cfc17e4f5f44c8730e88",
+      "373bab6d931868d41a601d9d88ce9ac3",
+      "6fdd5ff4ff79656c14747598ca9e3706",
+      "d9661c2811d6a73674f40ffb2b841847",
+      "7c722d10b19ccff0b8c171868e747385",
+      "f81dd986eb2b50f750d3a7da716b7e27",
+      "064404361748dd111a890a1470d7f0ea",
+      "dc29b7e1f78cc8e7525d5ea4c0ab9b78",
+      "97111eb1bc26bade6272015df829f1ae",
+      "d19a8a73cc46b807f2c5e817576cc1e1",
+  },
+  {
+      // 16X16
+      "50971c07ce26977d30298538fffec619",
+      "527a6b9e0dc5b21b98cf276305432bef",
+      "7eff2868f80ebc2c43a4f367281d80f7",
+      "67cd60512b54964ef6aff1bd4816d922",
+      "48371c87dc95c08a33b2048f89cf6468",
+      "b0acf2872ee411d7530af6d2625a7084",
+      "93d6b5352b571805ab16a55e1bbed86a",
+      "03764e4c0aebbc180e4e2c68fb06df2b",
+      "bb6c74c9076c9f266ab11fb57060d8e6",
+      "0c5162bc28489756ddb847b5678e6f07",
+  },
+  {
+      // 32X32
+      "a0a618c900e65ae521ccc8af789729f2",
+      "985aaa7c72b4a6c2fb431d32100cf13a",
+      "10662d09febc3ca13ee4e700120daeb5",
+      "b3b01379ba08916ef6b1b35f7d9ad51c",
+      "9f4261755795af97e34679c333ec7004",
+      "bc2c9da91ad97ef0d1610fb0a9041657",
+      "ef1653982b69e1f64bee3759f3e1ec45",
+      "1a51a675deba2c83282142eb48d3dc3d",
+      "866c224746dc260cda861a7b1b383fb3",
+      "cea23799fc3526e1b6a6ff02b42b82af",
+  },
+  {
+      // 64X64
+      "6e1094fa7b50bc813aa2ba29f5df8755",
+      "afe020786b83b793c2bbd9468097ff6e",
+      "be91585259bc37bf4dc1651936e90b3e",
+      "a1650dbcd56e10288c3e269eca37967d",
+      "9e5c34f3797e0cdd3cd9d4c05b0d8950",
+      "bc87be7ac899cc6a28f399d7516c49fe",
+      "9811fd0d2dd515f06122f5d1bd18b784",
+      "3c140e466f2c2c0d9cb7d2157ab8dc27",
+      "9543de76c925a8f6adc884cc7f98dc91",
+      "df1df0376cc944afe7e74e94f53e575a",
+  },
+  {
+      // 4X8
+      "d9fbebdc85f71ab1e18461b2db4a2adc",
+      "5ccb2a68284bc9714d94b8a06ccadbb2",
+      "735d059abc2744f3ff3f9590f7191b37",
+      "d9fbebdc85f71ab1e18461b2db4a2adc",
+      "6819497c44cd0ace120add83672996ee",
+      "7e3244f5a2d3edf81c7e962a842b97f9",
+      "809350f164cd4d1650850bb0f59c3260",
+      "1b60a394331eeab6927a6f8aaff57040",
+      "5307de1bd7329ba6b281d2c1b0b457f9",
+      "24c58a8138339846d95568efb91751db",
+  },
+  {
+      // 8X4
+      "23f9fc11344426c9bee2e06d57dfd628",
+      "2d71a26d1bae1fb34734de7b42fc5eb7",
+      "5af9c1b2fd9d5721fad67b67b3f7c816",
+      "00d71b17be662753813d515f197d145e",
+      "bef10ec984427e28f4390f43809d10af",
+      "77773cdfb7ed6bc882ab202a64b0a470",
+      "2cc48bd66d6b0121b5221d52ccd732af",
+      "b302155e1c9eeeafe2ba2bf68e807a46",
+      "561bc8d0e76d5041ebd5168fc6a115e1",
+      "81d0113fb1d0a9a24ffd6f1987b77948",
+  },
+  {
+      // 8X16
+      "c849de88b24f773dfcdd1d48d1209796",
+      "6cb807c1897b94866a0f3d3c56ed8695",
+      "d56db05a8ac7981762f5b877f486c4ef",
+      "b4bc01eb6e59a40922ad17715cafb04b",
+      "09d178439534f4062ae687c351f66d64",
+      "644501399cf73080ac606e5cef7ca09b",
+      "278076495180e17c065a95ab7278539a",
+      "9dd7f324816f242be408ffeb0c673732",
+      "f520c4a20acfa0bea1d253c6f0f040fd",
+      "85f38df809df2c2d7c8b4a157a65cd44",
+  },
+  {
+      // 16X8
+      "b4cbdbdf10ce13300b4063a3daf99e04",
+      "3731e1e6202064a9d0604d7c293ecee4",
+      "6c856188c4256a06452f0d5d70cac436",
+      "1f2192b4c8c497589484ea7bf9c944e8",
+      "84011bd4b7f565119d06787840e333a0",
+      "0e48949f7a6aa36f0d76b5d01f91124a",
+      "60eff8064634b6c73b10681356baeee9",
+      "1559aeb081a9c0c71111d6093c2ff9fd",
+      "c15479b739713773e5cabb748451987b",
+      "72e33ec12c9b67aea26d8d005fb82de2",
+  },
+  {
+      // 16X32
+      "abe5233d189cdbf79424721571bbaa7b",
+      "282759f81e3cfb2e2d396fe406b72a8b",
+      "e2224926c264f6f174cbc3167a233168",
+      "6814e85c2b33f8c9415d62e80394b47b",
+      "99cbbb60459c08a3061d72c4e4f6276a",
+      "1d1567d40b8e816f8c1f71e576fe0f87",
+      "36fdd371b624a075814d497c4832ec85",
+      "8ab8da61b727442b6ff692b40d0df018",
+      "e35a10ad7fdf2327e821504a90f6a6eb",
+      "1f7211e727dc1de7d6a55d082fbdd821",
+  },
+  {
+      // 32X16
+      "d1aeb8d5fdcfd3307922af01a798a4dc",
+      "b0bcb514ebfbee065faea9d34c12ae75",
+      "d6a18c63b4e909871c0137ca652fad23",
+      "fd047f2fc1b8ffb95d0eeef3e8796a45",
+      "645ab60779ea348fd93c81561c31bab9",
+      "4409633c9db8dff41ade4292a3a56e7f",
+      "5e36a11e069b31c2a739f3a9c7b37c24",
+      "e83b9483d702cfae496991c3c7fa92c0",
+      "12f6ddf98c7f30a277307f1ea935b030",
+      "354321d6c32bbdb0739e4fa2acbf41e1",
+  },
+  {
+      // 32X64
+      "0ce332b343934b34cd4417725faa85cb",
+      "4e2a2cfd8f56f15939bdfc753145b303",
+      "0f46d124ba9f48cdd5d5290acf786d6d",
+      "e1e8ed803236367821981500a3d9eebe",
+      "1d2f8e48e3adb7c448be05d9f66f4954",
+      "9fb2e176636a5689b26f73ca73fcc512",
+      "e720ebccae7e25e36f23da53ae5b5d6a",
+      "86fe4364734169aaa4520d799890d530",
+      "b1870290764bb1b100d1974e2bd70f1d",
+      "ce5b238e19d85ef69d85badfab4e63ae",
+  },
+  {
+      // 64X32
+      "a6c5aeb722615089efbca80b02951ceb",
+      "538424b24bd0830f21788e7238ca762f",
+      "80c15b303235f9bc2259027bb92dfdc4",
+      "e48e1ac15e97191a8fda08d62fff343e",
+      "12604b37875533665078405ef4582e35",
+      "0048afa17bd3e1632d68b96048836530",
+      "07a0cfcb56a5eed50c4bd6c26814336b",
+      "529d8a070de5bc6531fa3ee8f450c233",
+      "33c50a11c7d78f72434064f634305e95",
+      "e0ef7f0559c1a50ec5a8c12011b962f7",
+  },
+  {
+      // 4X16
+      "750491056568eb8fe15387b86bdf06b8",
+      "3a52dae9f599f08cfb3bd1b910dc0e11",
+      "af79f71e3e03dbeca44e2e13561f70c7",
+      "ca7dfd7624afc0c06fb5552f44398535",
+      "b591af115444bf43140c29c269f68fb2",
+      "483d942ae36e69e62f31eb215331416f",
+      "f14b58525e81870bc5d95c7ac71a347f",
+      "371208bb4027d9badb04095d1590bbc4",
+      "c7049c21b2924d70c7c12784d6b6b796",
+      "7d87233f4b5b0f12086045e5d7b2d4c2",
+  },
+  {
+      // 16X4
+      "7c6e325a65e77e732b3adbe237e045e4",
+      "24478f93ffcec47852e004d0fe948464",
+      "258d042c67d4ba3ecfa667f0adc9aebf",
+      "b2cd21d06959f159a1f3c4d9768ee7fb",
+      "b4e1f38157bf8410e7c3da02f687a343",
+      "869e703729eb0fc0711c254944ff5d5a",
+      "9638dd77105a640b146a8201ea7a0801",
+      "919d932c6af8a1cc7486e8ce996dd487",
+      "e1c9be493b6714c7ae48f30044c43140",
+      "bf0fe3889d654b2f6eb98c8fc751f9e4",
+  },
+  {
+      // 8X32
+      "8dfac4319fe0bd40013ffb3102da8c72",
+      "feb46b6dc4e2ca0a09533bfc51d4dcb0",
+      "850837ec714c37262216527aaf4cbbe9",
+      "4603c7800fb08361f163daca876e8bda",
+      "1ff95e7d2debc27b05806fb25abfd624",
+      "d81b9a51a062b23ca7823804cb7bec22",
+      "f1d8978158766f46335203608cb807e7",
+      "f3527096256258c0878d644a9d7d53ca",
+      "cbde98ac8b009953eb112807ad2ea29e",
+      "654fb1153415747feae599f538122af5",
+  },
+  {
+      // 32X8
+      "3d4ee16fab374357474f60b845327bc7",
+      "bc17c5059473a476df4e85f56395ad55",
+      "3d4ee16fab374357474f60b845327bc7",
+      "c14b8db34dc2355b84e3735c9ba16c7f",
+      "a71d25b5d47a92a8b9223c98f18458ee",
+      "6c1cfe2b1893f4576a80675687cb6426",
+      "92d11bbef8b85bb48d799bb055de3514",
+      "bcf81d1db8ae5cc03360467f44f498ec",
+      "79f8c564163555592e808e145eaf5c60",
+      "46fff139cef2ef773938bcc8b0e5abb8",
+  },
+  {
+      // 16X64
+      "3b2a053ee8b05a8ac35ad23b0422a151",
+      "12b0c69595328c465e0b25e0c9e3e9fc",
+      "f77c544ac8035e01920deae40cee7b07",
+      "727797ef15ccd8d325476fe8f12006a3",
+      "f3be77c0fe67eb5d9d515e92bec21eb7",
+      "f1ece6409e01e9dd98b800d49628247d",
+      "efd2ec9bfbbd4fd1f6604ea369df1894",
+      "ec703de918422b9e03197ba0ed60a199",
+      "739418efb89c07f700895deaa5d0b3e3",
+      "9943ae1bbeeebfe1d3a92dc39e049d63",
+  },
+  {
+      // 64X16
+      "821b76b1494d4f84d20817840f719a1a",
+      "69e462c3338a9aaf993c3f7cfbc15649",
+      "516d8f6eb054d74d150e7b444185b6b9",
+      "de1b736e9d99129609d6ef3a491507a0",
+      "fd9b4276e7affe1e0e4ce4f428058994",
+      "cd82fd361a4767ac29a9f406b480b8f3",
+      "2792c2f810157a4a6cb13c28529ff779",
+      "1220442d90c4255ba0969d28b91e93a6",
+      "c7253e10b45f7f67dfee3256c9b94825",
+      "879792198071c7e0b50b9b5010d8c18f",
+  },
+};
 
 }  // namespace
 
 // Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors
-// to |test_func|. The test name is 'arch.test_func', e.g., C.TestIntraPred4.
-#define INTRA_PRED_TEST(arch, test_func, blk, dc, dc_left, dc_top, dc_128, v, \
-                        h, d45e, d135, d117, d153, d207e, d63e, tm, smooth,   \
-                        smooth_v, smooth_h)                                   \
-  TEST(arch, DISABLED_##test_func) {                                          \
-    static const AvxPredFunc aom_intra_pred[] = {                             \
-      dc,   dc_left, dc_top, dc_128, v,  h,      d45e,     d135,              \
-      d117, d153,    d207e,  d63e,   tm, smooth, smooth_v, smooth_h           \
-    };                                                                        \
-    test_func(blk, aom_intra_pred);                                           \
+// to TestIntraPred. The test name is 'arch.TestIntraPred_tx_size', e.g.,
+// C.TestIntraPred.0
+#define INTRA_PRED_TEST(arch, tx_size, dc, dc_left, dc_top, dc_128, v, h,  \
+                        paeth, smooth, smooth_v, smooth_h)                 \
+  TEST(arch, DISABLED_##TestIntraPred_##tx_size) {                         \
+    static const AvxPredFunc aom_intra_pred[] = {                          \
+      dc, dc_left, dc_top, dc_128, v, h, paeth, smooth, smooth_v, smooth_h \
+    };                                                                     \
+    TestIntraPred(tx_size, aom_intra_pred, kSignatures[tx_size]);          \
   }
 
 // -----------------------------------------------------------------------------
-// 4x4
-
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_smooth_v_predictor_4x4_c
-#define smooth_h_pred_func aom_smooth_h_predictor_4x4_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+// 4x4, 4x8, 4x16
 
-INTRA_PRED_TEST(C_1, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_c,
+INTRA_PRED_TEST(C_1, TX_4X4, aom_dc_predictor_4x4_c,
                 aom_dc_left_predictor_4x4_c, aom_dc_top_predictor_4x4_c,
                 aom_dc_128_predictor_4x4_c, aom_v_predictor_4x4_c,
-                aom_h_predictor_4x4_c, aom_d45e_predictor_4x4_c,
-                aom_d135_predictor_4x4_c, aom_d117_predictor_4x4_c,
-                aom_d153_predictor_4x4_c, aom_d207e_predictor_4x4_c,
-                aom_d63e_predictor_4x4_c, aom_paeth_predictor_4x4_c,
-                aom_smooth_predictor_4x4_c, smooth_v_pred_func,
-                smooth_h_pred_func)
-
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
-
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_smooth_v_predictor_4x8_c
-#define smooth_h_pred_func aom_smooth_h_predictor_4x8_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+                aom_h_predictor_4x4_c, aom_paeth_predictor_4x4_c,
+                aom_smooth_predictor_4x4_c, aom_smooth_v_predictor_4x4_c,
+                aom_smooth_h_predictor_4x4_c)
 
-INTRA_PRED_TEST(C_2, TestIntraPred4, "intra4x8", aom_dc_predictor_4x8_c,
+INTRA_PRED_TEST(C_2, TX_4X8, aom_dc_predictor_4x8_c,
                 aom_dc_left_predictor_4x8_c, aom_dc_top_predictor_4x8_c,
                 aom_dc_128_predictor_4x8_c, aom_v_predictor_4x8_c,
-                aom_h_predictor_4x8_c, aom_d45e_predictor_4x8_c,
-                aom_d135_predictor_4x8_c, aom_d117_predictor_4x8_c,
-                aom_d153_predictor_4x8_c, aom_d207e_predictor_4x8_c,
-                aom_d63e_predictor_4x8_c, aom_paeth_predictor_4x8_c,
-                aom_smooth_predictor_4x8_c, smooth_v_pred_func,
-                smooth_h_pred_func)
+                aom_h_predictor_4x8_c, aom_paeth_predictor_4x8_c,
+                aom_smooth_predictor_4x8_c, aom_smooth_v_predictor_4x8_c,
+                aom_smooth_h_predictor_4x8_c)
 
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
+INTRA_PRED_TEST(C_3, TX_4X16, aom_dc_predictor_4x16_c,
+                aom_dc_left_predictor_4x16_c, aom_dc_top_predictor_4x16_c,
+                aom_dc_128_predictor_4x16_c, aom_v_predictor_4x16_c,
+                aom_h_predictor_4x16_c, aom_paeth_predictor_4x16_c,
+                aom_smooth_predictor_4x16_c, aom_smooth_v_predictor_4x16_c,
+                aom_smooth_h_predictor_4x16_c)
 
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_sse2,
+INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_dc_predictor_4x4_sse2,
                 aom_dc_left_predictor_4x4_sse2, aom_dc_top_predictor_4x4_sse2,
                 aom_dc_128_predictor_4x4_sse2, aom_v_predictor_4x4_sse2,
-                aom_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TestIntraPred4, "intra4x8", aom_dc_predictor_4x8_sse2,
+                aom_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TX_4X8, aom_dc_predictor_4x8_sse2,
                 aom_dc_left_predictor_4x8_sse2, aom_dc_top_predictor_4x8_sse2,
                 aom_dc_128_predictor_4x8_sse2, aom_v_predictor_4x8_sse2,
-                aom_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+                aom_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TX_4X16, aom_dc_predictor_4x16_sse2,
+                aom_dc_left_predictor_4x16_sse2, aom_dc_top_predictor_4x16_sse2,
+                aom_dc_128_predictor_4x16_sse2, aom_v_predictor_4x16_sse2,
+                aom_h_predictor_4x16_sse2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TestIntraPred4, "intra4x4", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, aom_d153_predictor_4x4_ssse3,
-                NULL, aom_d63e_predictor_4x4_ssse3,
+INTRA_PRED_TEST(SSSE3_1, TX_4X4, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_4x4_ssse3, aom_smooth_predictor_4x4_ssse3,
-                NULL, NULL)
-INTRA_PRED_TEST(SSSE3_2, TestIntraPred4, "intra4x8", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_smooth_v_predictor_4x4_ssse3,
+                aom_smooth_h_predictor_4x4_ssse3)
+INTRA_PRED_TEST(SSSE3_2, TX_4X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_4x8_ssse3, aom_smooth_predictor_4x8_ssse3,
-                NULL, NULL)
+                aom_smooth_v_predictor_4x8_ssse3,
+                aom_smooth_h_predictor_4x8_ssse3)
+INTRA_PRED_TEST(SSSE3_3, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_4x16_ssse3, aom_smooth_predictor_4x16_ssse3,
+                aom_smooth_v_predictor_4x16_ssse3,
+                aom_smooth_h_predictor_4x16_ssse3)
 #endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_dspr2,
-                NULL, NULL, NULL, NULL, aom_h_predictor_4x4_dspr2, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(DSPR2, TX_4X4, aom_dc_predictor_4x4_dspr2, NULL, NULL, NULL,
+                NULL, aom_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_DSPR2
 
 #if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_neon,
+INTRA_PRED_TEST(NEON, TX_4X4, aom_dc_predictor_4x4_neon,
                 aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon,
                 aom_dc_128_predictor_4x4_neon, aom_v_predictor_4x4_neon,
-                aom_h_predictor_4x4_neon, NULL, aom_d135_predictor_4x4_neon,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+                aom_h_predictor_4x4_neon, NULL, NULL, NULL, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_msa,
+INTRA_PRED_TEST(MSA, TX_4X4, aom_dc_predictor_4x4_msa,
                 aom_dc_left_predictor_4x4_msa, aom_dc_top_predictor_4x4_msa,
                 aom_dc_128_predictor_4x4_msa, aom_v_predictor_4x4_msa,
-                aom_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+                aom_h_predictor_4x4_msa, NULL, NULL, NULL, NULL)
 #endif  // HAVE_MSA
 
 // -----------------------------------------------------------------------------
-// 8x8
+// 8x8, 8x4, 8x16, 8x32
 
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_smooth_v_predictor_8x8_c
-#define smooth_h_pred_func aom_smooth_h_predictor_8x8_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
-INTRA_PRED_TEST(C_1, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_c,
+INTRA_PRED_TEST(C_1, TX_8X8, aom_dc_predictor_8x8_c,
                 aom_dc_left_predictor_8x8_c, aom_dc_top_predictor_8x8_c,
                 aom_dc_128_predictor_8x8_c, aom_v_predictor_8x8_c,
-                aom_h_predictor_8x8_c, aom_d45e_predictor_8x8_c,
-                aom_d135_predictor_8x8_c, aom_d117_predictor_8x8_c,
-                aom_d153_predictor_8x8_c, aom_d207e_predictor_8x8_c,
-                aom_d63e_predictor_8x8_c, aom_paeth_predictor_8x8_c,
-                aom_smooth_predictor_8x8_c, smooth_v_pred_func,
-                smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
-
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_smooth_v_predictor_8x4_c
-#define smooth_h_pred_func aom_smooth_h_predictor_8x4_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
-INTRA_PRED_TEST(C_2, TestIntraPred8, "intra8x4", aom_dc_predictor_8x4_c,
+                aom_h_predictor_8x8_c, aom_paeth_predictor_8x8_c,
+                aom_smooth_predictor_8x8_c, aom_smooth_v_predictor_8x8_c,
+                aom_smooth_h_predictor_8x8_c)
+
+INTRA_PRED_TEST(C_2, TX_8X4, aom_dc_predictor_8x4_c,
                 aom_dc_left_predictor_8x4_c, aom_dc_top_predictor_8x4_c,
                 aom_dc_128_predictor_8x4_c, aom_v_predictor_8x4_c,
-                aom_h_predictor_8x4_c, aom_d45e_predictor_8x4_c,
-                aom_d135_predictor_8x4_c, aom_d117_predictor_8x4_c,
-                aom_d153_predictor_8x4_c, aom_d207e_predictor_8x4_c,
-                aom_d63e_predictor_8x4_c, aom_paeth_predictor_8x4_c,
-                aom_smooth_predictor_8x4_c, smooth_v_pred_func,
-                smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
-
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_smooth_v_predictor_8x16_c
-#define smooth_h_pred_func aom_smooth_h_predictor_8x16_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
-INTRA_PRED_TEST(C_3, TestIntraPred8, "intra8x16", aom_dc_predictor_8x16_c,
+                aom_h_predictor_8x4_c, aom_paeth_predictor_8x4_c,
+                aom_smooth_predictor_8x4_c, aom_smooth_v_predictor_8x4_c,
+                aom_smooth_h_predictor_8x4_c)
+
+INTRA_PRED_TEST(C_3, TX_8X16, aom_dc_predictor_8x16_c,
                 aom_dc_left_predictor_8x16_c, aom_dc_top_predictor_8x16_c,
                 aom_dc_128_predictor_8x16_c, aom_v_predictor_8x16_c,
-                aom_h_predictor_8x16_c, aom_d45e_predictor_8x16_c,
-                aom_d135_predictor_8x16_c, aom_d117_predictor_8x16_c,
-                aom_d153_predictor_8x16_c, aom_d207e_predictor_8x16_c,
-                aom_d63e_predictor_8x16_c, aom_paeth_predictor_8x16_c,
-                aom_smooth_predictor_8x16_c, smooth_v_pred_func,
-                smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
+                aom_h_predictor_8x16_c, aom_paeth_predictor_8x16_c,
+                aom_smooth_predictor_8x16_c, aom_smooth_v_predictor_8x16_c,
+                aom_smooth_h_predictor_8x16_c)
+
+INTRA_PRED_TEST(C_4, TX_8X32, aom_dc_predictor_8x32_c,
+                aom_dc_left_predictor_8x32_c, aom_dc_top_predictor_8x32_c,
+                aom_dc_128_predictor_8x32_c, aom_v_predictor_8x32_c,
+                aom_h_predictor_8x32_c, aom_paeth_predictor_8x32_c,
+                aom_smooth_predictor_8x32_c, aom_smooth_v_predictor_8x32_c,
+                aom_smooth_h_predictor_8x32_c)
 
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_sse2,
+INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_dc_predictor_8x8_sse2,
                 aom_dc_left_predictor_8x8_sse2, aom_dc_top_predictor_8x8_sse2,
                 aom_dc_128_predictor_8x8_sse2, aom_v_predictor_8x8_sse2,
-                aom_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TestIntraPred8, "intra8x4", aom_dc_predictor_8x4_sse2,
+                aom_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TX_8X4, aom_dc_predictor_8x4_sse2,
                 aom_dc_left_predictor_8x4_sse2, aom_dc_top_predictor_8x4_sse2,
                 aom_dc_128_predictor_8x4_sse2, aom_v_predictor_8x4_sse2,
-                aom_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TestIntraPred8, "intra8x16", aom_dc_predictor_8x16_sse2,
+                aom_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TX_8X16, aom_dc_predictor_8x16_sse2,
                 aom_dc_left_predictor_8x16_sse2, aom_dc_top_predictor_8x16_sse2,
                 aom_dc_128_predictor_8x16_sse2, aom_v_predictor_8x16_sse2,
-                aom_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+                aom_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_4, TX_8X32, aom_dc_predictor_8x32_sse2,
+                aom_dc_left_predictor_8x32_sse2, aom_dc_top_predictor_8x32_sse2,
+                aom_dc_128_predictor_8x32_sse2, aom_v_predictor_8x32_sse2,
+                aom_h_predictor_8x32_sse2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TestIntraPred8, "intra8x8", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, aom_d153_predictor_8x8_ssse3,
-                NULL, NULL, aom_paeth_predictor_8x8_ssse3,
-                aom_smooth_predictor_8x8_ssse3, NULL, NULL)
-INTRA_PRED_TEST(SSSE3_2, TestIntraPred8, "intra8x4", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3_1, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_8x8_ssse3, aom_smooth_predictor_8x8_ssse3,
+                aom_smooth_v_predictor_8x8_ssse3,
+                aom_smooth_h_predictor_8x8_ssse3)
+INTRA_PRED_TEST(SSSE3_2, TX_8X4, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_8x4_ssse3, aom_smooth_predictor_8x4_ssse3,
-                NULL, NULL)
-INTRA_PRED_TEST(SSSE3_3, TestIntraPred8, "intra8x16", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_smooth_v_predictor_8x4_ssse3,
+                aom_smooth_h_predictor_8x4_ssse3)
+INTRA_PRED_TEST(SSSE3_3, TX_8X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_8x16_ssse3, aom_smooth_predictor_8x16_ssse3,
-                NULL, NULL)
+                aom_smooth_v_predictor_8x16_ssse3,
+                aom_smooth_h_predictor_8x16_ssse3)
+INTRA_PRED_TEST(SSSE3_4, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_8x32_ssse3, aom_smooth_predictor_8x32_ssse3,
+                aom_smooth_v_predictor_8x32_ssse3,
+                aom_smooth_h_predictor_8x32_ssse3)
 #endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_dspr2,
-                NULL, NULL, NULL, NULL, aom_h_predictor_8x8_dspr2, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(DSPR2, TX_8X8, aom_dc_predictor_8x8_dspr2, NULL, NULL, NULL,
+                NULL, aom_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_DSPR2
 
 #if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_neon,
+INTRA_PRED_TEST(NEON, TX_8X8, aom_dc_predictor_8x8_neon,
                 aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon,
                 aom_dc_128_predictor_8x8_neon, aom_v_predictor_8x8_neon,
-                aom_h_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+                aom_h_predictor_8x8_neon, NULL, NULL, NULL, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_msa,
+INTRA_PRED_TEST(MSA, TX_8X8, aom_dc_predictor_8x8_msa,
                 aom_dc_left_predictor_8x8_msa, aom_dc_top_predictor_8x8_msa,
                 aom_dc_128_predictor_8x8_msa, aom_v_predictor_8x8_msa,
-                aom_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+                aom_h_predictor_8x8_msa, NULL, NULL, NULL, NULL)
 #endif  // HAVE_MSA
 
 // -----------------------------------------------------------------------------
-// 16x16
+// 16x16, 16x8, 16x32, 16x4, 16x64
 
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_smooth_v_predictor_16x16_c
-#define smooth_h_pred_func aom_smooth_h_predictor_16x16_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
-INTRA_PRED_TEST(C_1, TestIntraPred16, "intra16x16", aom_dc_predictor_16x16_c,
+INTRA_PRED_TEST(C_1, TX_16X16, aom_dc_predictor_16x16_c,
                 aom_dc_left_predictor_16x16_c, aom_dc_top_predictor_16x16_c,
                 aom_dc_128_predictor_16x16_c, aom_v_predictor_16x16_c,
-                aom_h_predictor_16x16_c, aom_d45e_predictor_16x16_c,
-                aom_d135_predictor_16x16_c, aom_d117_predictor_16x16_c,
-                aom_d153_predictor_16x16_c, aom_d207e_predictor_16x16_c,
-                aom_d63e_predictor_16x16_c, aom_paeth_predictor_16x16_c,
-                aom_smooth_predictor_16x16_c, smooth_v_pred_func,
-                smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
-
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_smooth_v_predictor_16x8_c
-#define smooth_h_pred_func aom_smooth_h_predictor_16x8_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
-INTRA_PRED_TEST(C_2, TestIntraPred16, "intra16x8", aom_dc_predictor_16x8_c,
+                aom_h_predictor_16x16_c, aom_paeth_predictor_16x16_c,
+                aom_smooth_predictor_16x16_c, aom_smooth_v_predictor_16x16_c,
+                aom_smooth_h_predictor_16x16_c)
+
+INTRA_PRED_TEST(C_2, TX_16X8, aom_dc_predictor_16x8_c,
                 aom_dc_left_predictor_16x8_c, aom_dc_top_predictor_16x8_c,
                 aom_dc_128_predictor_16x8_c, aom_v_predictor_16x8_c,
-                aom_h_predictor_16x8_c, aom_d45e_predictor_16x8_c,
-                aom_d135_predictor_16x8_c, aom_d117_predictor_16x8_c,
-                aom_d153_predictor_16x8_c, aom_d207e_predictor_16x8_c,
-                aom_d63e_predictor_16x8_c, aom_paeth_predictor_16x8_c,
-                aom_smooth_predictor_16x8_c, smooth_v_pred_func,
-                smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
-
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_smooth_v_predictor_16x32_c
-#define smooth_h_pred_func aom_smooth_h_predictor_16x32_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
-INTRA_PRED_TEST(C_3, TestIntraPred16, "intra16x32", aom_dc_predictor_16x32_c,
+                aom_h_predictor_16x8_c, aom_paeth_predictor_16x8_c,
+                aom_smooth_predictor_16x8_c, aom_smooth_v_predictor_16x8_c,
+                aom_smooth_h_predictor_16x8_c)
+
+INTRA_PRED_TEST(C_3, TX_16X32, aom_dc_predictor_16x32_c,
                 aom_dc_left_predictor_16x32_c, aom_dc_top_predictor_16x32_c,
                 aom_dc_128_predictor_16x32_c, aom_v_predictor_16x32_c,
-                aom_h_predictor_16x32_c, aom_d45e_predictor_16x32_c,
-                aom_d135_predictor_16x32_c, aom_d117_predictor_16x32_c,
-                aom_d153_predictor_16x32_c, aom_d207e_predictor_16x32_c,
-                aom_d63e_predictor_16x32_c, aom_paeth_predictor_16x32_c,
-                aom_smooth_predictor_16x32_c, smooth_v_pred_func,
-                smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
+                aom_h_predictor_16x32_c, aom_paeth_predictor_16x32_c,
+                aom_smooth_predictor_16x32_c, aom_smooth_v_predictor_16x32_c,
+                aom_smooth_h_predictor_16x32_c)
+
+INTRA_PRED_TEST(C_4, TX_16X4, aom_dc_predictor_16x4_c,
+                aom_dc_left_predictor_16x4_c, aom_dc_top_predictor_16x4_c,
+                aom_dc_128_predictor_16x4_c, aom_v_predictor_16x4_c,
+                aom_h_predictor_16x4_c, aom_paeth_predictor_16x4_c,
+                aom_smooth_predictor_16x4_c, aom_smooth_v_predictor_16x4_c,
+                aom_smooth_h_predictor_16x4_c)
+
+INTRA_PRED_TEST(C_5, TX_16X64, aom_dc_predictor_16x64_c,
+                aom_dc_left_predictor_16x64_c, aom_dc_top_predictor_16x64_c,
+                aom_dc_128_predictor_16x64_c, aom_v_predictor_16x64_c,
+                aom_h_predictor_16x64_c, aom_paeth_predictor_16x64_c,
+                aom_smooth_predictor_16x64_c, aom_smooth_v_predictor_16x64_c,
+                aom_smooth_h_predictor_16x64_c)
 
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TestIntraPred16, "intra16x16",
-                aom_dc_predictor_16x16_sse2, aom_dc_left_predictor_16x16_sse2,
+INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_dc_predictor_16x16_sse2,
+                aom_dc_left_predictor_16x16_sse2,
                 aom_dc_top_predictor_16x16_sse2,
                 aom_dc_128_predictor_16x16_sse2, aom_v_predictor_16x16_sse2,
-                aom_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TestIntraPred16, "intra16x8",
-                aom_dc_predictor_16x8_sse2, aom_dc_left_predictor_16x8_sse2,
-                aom_dc_top_predictor_16x8_sse2, aom_dc_128_predictor_16x8_sse2,
-                aom_v_predictor_16x8_sse2, aom_h_predictor_16x8_sse2, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TestIntraPred16, "intra16x32",
-                aom_dc_predictor_16x32_sse2, aom_dc_left_predictor_16x32_sse2,
+                aom_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TX_16X8, aom_dc_predictor_16x8_sse2,
+                aom_dc_left_predictor_16x8_sse2, aom_dc_top_predictor_16x8_sse2,
+                aom_dc_128_predictor_16x8_sse2, aom_v_predictor_16x8_sse2,
+                aom_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_dc_predictor_16x32_sse2,
+                aom_dc_left_predictor_16x32_sse2,
                 aom_dc_top_predictor_16x32_sse2,
                 aom_dc_128_predictor_16x32_sse2, aom_v_predictor_16x32_sse2,
-                aom_h_predictor_16x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+                aom_h_predictor_16x32_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_4, TX_16X64, aom_dc_predictor_16x64_sse2,
+                aom_dc_left_predictor_16x64_sse2,
+                aom_dc_top_predictor_16x64_sse2,
+                aom_dc_128_predictor_16x64_sse2, aom_v_predictor_16x64_sse2,
+                aom_h_predictor_16x64_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_5, TX_16X4, aom_dc_predictor_16x4_sse2,
+                aom_dc_left_predictor_16x4_sse2, aom_dc_top_predictor_16x4_sse2,
+                aom_dc_128_predictor_16x4_sse2, aom_v_predictor_16x4_sse2,
+                aom_h_predictor_16x4_sse2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TestIntraPred16, "intra16x16", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, aom_d153_predictor_16x16_ssse3,
-                NULL, NULL, aom_paeth_predictor_16x16_ssse3,
-                aom_smooth_predictor_16x16_ssse3, NULL, NULL)
-INTRA_PRED_TEST(SSSE3_2, TestIntraPred16, "intra16x8", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x16_ssse3,
+                aom_smooth_predictor_16x16_ssse3,
+                aom_smooth_v_predictor_16x16_ssse3,
+                aom_smooth_h_predictor_16x16_ssse3)
+INTRA_PRED_TEST(SSSE3_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x8_ssse3, aom_smooth_predictor_16x8_ssse3,
-                NULL, NULL)
-INTRA_PRED_TEST(SSSE3_3, TestIntraPred16, "intra16x32", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_smooth_v_predictor_16x8_ssse3,
+                aom_smooth_h_predictor_16x8_ssse3)
+INTRA_PRED_TEST(SSSE3_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x32_ssse3,
-                aom_smooth_predictor_16x32_ssse3, NULL, NULL)
+                aom_smooth_predictor_16x32_ssse3,
+                aom_smooth_v_predictor_16x32_ssse3,
+                aom_smooth_h_predictor_16x32_ssse3)
+INTRA_PRED_TEST(SSSE3_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x64_ssse3,
+                aom_smooth_predictor_16x64_ssse3,
+                aom_smooth_v_predictor_16x64_ssse3,
+                aom_smooth_h_predictor_16x64_ssse3)
+INTRA_PRED_TEST(SSSE3_5, TX_16X4, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x4_ssse3, aom_smooth_predictor_16x4_ssse3,
+                aom_smooth_v_predictor_16x4_ssse3,
+                aom_smooth_h_predictor_16x4_ssse3)
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_1, TestIntraPred16, "intra16x16", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x16_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_2, TestIntraPred16, "intra16x8", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x8_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_3, TestIntraPred16, "intra16x32", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x32_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x64_avx2, NULL, NULL, NULL)
 #endif  // HAVE_AVX2
 
 #if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred16, "intra16x16",
-                aom_dc_predictor_16x16_dspr2, NULL, NULL, NULL, NULL,
-                aom_h_predictor_16x16_dspr2, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(DSPR2, TX_16X16, aom_dc_predictor_16x16_dspr2, NULL, NULL, NULL,
+                NULL, aom_h_predictor_16x16_dspr2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_DSPR2
 
 #if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred16, "intra16x16",
-                aom_dc_predictor_16x16_neon, aom_dc_left_predictor_16x16_neon,
+INTRA_PRED_TEST(NEON, TX_16X16, aom_dc_predictor_16x16_neon,
+                aom_dc_left_predictor_16x16_neon,
                 aom_dc_top_predictor_16x16_neon,
                 aom_dc_128_predictor_16x16_neon, aom_v_predictor_16x16_neon,
-                aom_h_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+                aom_h_predictor_16x16_neon, NULL, NULL, NULL, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred16, "intra16x16", aom_dc_predictor_16x16_msa,
+INTRA_PRED_TEST(MSA, TX_16X16, aom_dc_predictor_16x16_msa,
                 aom_dc_left_predictor_16x16_msa, aom_dc_top_predictor_16x16_msa,
                 aom_dc_128_predictor_16x16_msa, aom_v_predictor_16x16_msa,
-                aom_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+                aom_h_predictor_16x16_msa, NULL, NULL, NULL, NULL)
 #endif  // HAVE_MSA
 
 // -----------------------------------------------------------------------------
-// 32x32
+// 32x32, 32x16, 32x64, 32x8
 
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_smooth_v_predictor_32x32_c
-#define smooth_h_pred_func aom_smooth_h_predictor_32x32_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
-INTRA_PRED_TEST(C_1, TestIntraPred32, "intra32x32", aom_dc_predictor_32x32_c,
+INTRA_PRED_TEST(C_1, TX_32X32, aom_dc_predictor_32x32_c,
                 aom_dc_left_predictor_32x32_c, aom_dc_top_predictor_32x32_c,
                 aom_dc_128_predictor_32x32_c, aom_v_predictor_32x32_c,
-                aom_h_predictor_32x32_c, aom_d45e_predictor_32x32_c,
-                aom_d135_predictor_32x32_c, aom_d117_predictor_32x32_c,
-                aom_d153_predictor_32x32_c, aom_d207e_predictor_32x32_c,
-                aom_d63e_predictor_32x32_c, aom_paeth_predictor_32x32_c,
-                aom_smooth_predictor_32x32_c, smooth_v_pred_func,
-                smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
-
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_smooth_v_predictor_32x16_c
-#define smooth_h_pred_func aom_smooth_h_predictor_32x16_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
-INTRA_PRED_TEST(C_2, TestIntraPred32, "intra32x16", aom_dc_predictor_32x16_c,
+                aom_h_predictor_32x32_c, aom_paeth_predictor_32x32_c,
+                aom_smooth_predictor_32x32_c, aom_smooth_v_predictor_32x32_c,
+                aom_smooth_h_predictor_32x32_c)
+
+INTRA_PRED_TEST(C_2, TX_32X16, aom_dc_predictor_32x16_c,
                 aom_dc_left_predictor_32x16_c, aom_dc_top_predictor_32x16_c,
                 aom_dc_128_predictor_32x16_c, aom_v_predictor_32x16_c,
-                aom_h_predictor_32x16_c, aom_d45e_predictor_32x16_c,
-                aom_d135_predictor_32x16_c, aom_d117_predictor_32x16_c,
-                aom_d153_predictor_32x16_c, aom_d207e_predictor_32x16_c,
-                aom_d63e_predictor_32x16_c, aom_paeth_predictor_32x16_c,
-                aom_smooth_predictor_32x16_c, smooth_v_pred_func,
-                smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
+                aom_h_predictor_32x16_c, aom_paeth_predictor_32x16_c,
+                aom_smooth_predictor_32x16_c, aom_smooth_v_predictor_32x16_c,
+                aom_smooth_h_predictor_32x16_c)
+
+INTRA_PRED_TEST(C_3, TX_32X64, aom_dc_predictor_32x64_c,
+                aom_dc_left_predictor_32x64_c, aom_dc_top_predictor_32x64_c,
+                aom_dc_128_predictor_32x64_c, aom_v_predictor_32x64_c,
+                aom_h_predictor_32x64_c, aom_paeth_predictor_32x64_c,
+                aom_smooth_predictor_32x64_c, aom_smooth_v_predictor_32x64_c,
+                aom_smooth_h_predictor_32x64_c)
+
+INTRA_PRED_TEST(C_4, TX_32X8, aom_dc_predictor_32x8_c,
+                aom_dc_left_predictor_32x8_c, aom_dc_top_predictor_32x8_c,
+                aom_dc_128_predictor_32x8_c, aom_v_predictor_32x8_c,
+                aom_h_predictor_32x8_c, aom_paeth_predictor_32x8_c,
+                aom_smooth_predictor_32x8_c, aom_smooth_v_predictor_32x8_c,
+                aom_smooth_h_predictor_32x8_c)
 
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TestIntraPred32, "intra32x32",
-                aom_dc_predictor_32x32_sse2, aom_dc_left_predictor_32x32_sse2,
+INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_dc_predictor_32x32_sse2,
+                aom_dc_left_predictor_32x32_sse2,
                 aom_dc_top_predictor_32x32_sse2,
                 aom_dc_128_predictor_32x32_sse2, aom_v_predictor_32x32_sse2,
-                aom_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TestIntraPred32, "intra32x16",
-                aom_dc_predictor_32x16_sse2, aom_dc_left_predictor_32x16_sse2,
+                aom_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_dc_predictor_32x16_sse2,
+                aom_dc_left_predictor_32x16_sse2,
                 aom_dc_top_predictor_32x16_sse2,
                 aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2,
-                aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+                aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TX_32X64, aom_dc_predictor_32x64_sse2,
+                aom_dc_left_predictor_32x64_sse2,
+                aom_dc_top_predictor_32x64_sse2,
+                aom_dc_128_predictor_32x64_sse2, aom_v_predictor_32x64_sse2,
+                aom_h_predictor_32x64_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_4, TX_32X8, aom_dc_predictor_32x8_sse2,
+                aom_dc_left_predictor_32x8_sse2, aom_dc_top_predictor_32x8_sse2,
+                aom_dc_128_predictor_32x8_sse2, aom_v_predictor_32x8_sse2,
+                aom_h_predictor_32x8_sse2, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TestIntraPred32, "intra32x32", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, aom_d153_predictor_32x32_ssse3,
-                NULL, NULL, aom_paeth_predictor_32x32_ssse3,
-                aom_smooth_predictor_32x32_ssse3, NULL, NULL)
-INTRA_PRED_TEST(SSSE3_2, TestIntraPred32, "intra32x16", NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_32x32_ssse3,
+                aom_smooth_predictor_32x32_ssse3,
+                aom_smooth_v_predictor_32x32_ssse3,
+                aom_smooth_h_predictor_32x32_ssse3)
+INTRA_PRED_TEST(SSSE3_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_32x16_ssse3,
-                aom_smooth_predictor_32x16_ssse3, NULL, NULL)
+                aom_smooth_predictor_32x16_ssse3,
+                aom_smooth_v_predictor_32x16_ssse3,
+                aom_smooth_h_predictor_32x16_ssse3)
+INTRA_PRED_TEST(SSSE3_3, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_32x64_ssse3,
+                aom_smooth_predictor_32x64_ssse3,
+                aom_smooth_v_predictor_32x64_ssse3,
+                aom_smooth_h_predictor_32x64_ssse3)
+INTRA_PRED_TEST(SSSE3_4, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_32x8_ssse3, aom_smooth_predictor_32x8_ssse3,
+                aom_smooth_v_predictor_32x8_ssse3,
+                aom_smooth_h_predictor_32x8_ssse3)
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_1, TestIntraPred32, "intra32x32",
-                aom_dc_predictor_32x32_avx2, aom_dc_left_predictor_32x32_avx2,
+INTRA_PRED_TEST(AVX2_1, TX_32X32, aom_dc_predictor_32x32_avx2,
+                aom_dc_left_predictor_32x32_avx2,
                 aom_dc_top_predictor_32x32_avx2,
                 aom_dc_128_predictor_32x32_avx2, aom_v_predictor_32x32_avx2,
-                aom_h_predictor_32x32_avx2, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_32x32_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_2, TestIntraPred32, "intra32x16",
-                aom_dc_predictor_32x16_avx2, aom_dc_left_predictor_32x16_avx2,
+                aom_h_predictor_32x32_avx2, aom_paeth_predictor_32x32_avx2,
+                NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_2, TX_32X16, aom_dc_predictor_32x16_avx2,
+                aom_dc_left_predictor_32x16_avx2,
                 aom_dc_top_predictor_32x16_avx2,
                 aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL)
+                NULL, aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_3, TX_32X64, aom_dc_predictor_32x64_avx2,
+                aom_dc_left_predictor_32x64_avx2,
+                aom_dc_top_predictor_32x64_avx2,
+                aom_dc_128_predictor_32x64_avx2, aom_v_predictor_32x64_avx2,
+                NULL, aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL)
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred32, "intra32x32",
-                aom_dc_predictor_32x32_neon, aom_dc_left_predictor_32x32_neon,
+INTRA_PRED_TEST(NEON, TX_32X32, aom_dc_predictor_32x32_neon,
+                aom_dc_left_predictor_32x32_neon,
                 aom_dc_top_predictor_32x32_neon,
                 aom_dc_128_predictor_32x32_neon, aom_v_predictor_32x32_neon,
-                aom_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+                aom_h_predictor_32x32_neon, NULL, NULL, NULL, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred32, "intra32x32", aom_dc_predictor_32x32_msa,
+INTRA_PRED_TEST(MSA, TX_32X32, aom_dc_predictor_32x32_msa,
                 aom_dc_left_predictor_32x32_msa, aom_dc_top_predictor_32x32_msa,
                 aom_dc_128_predictor_32x32_msa, aom_v_predictor_32x32_msa,
-                aom_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+                aom_h_predictor_32x32_msa, NULL, NULL, NULL, NULL)
 #endif  // HAVE_MSA
 
 // -----------------------------------------------------------------------------
+// 64x64, 64x32, 64x16
+
+INTRA_PRED_TEST(C_1, TX_64X64, aom_dc_predictor_64x64_c,
+                aom_dc_left_predictor_64x64_c, aom_dc_top_predictor_64x64_c,
+                aom_dc_128_predictor_64x64_c, aom_v_predictor_64x64_c,
+                aom_h_predictor_64x64_c, aom_paeth_predictor_64x64_c,
+                aom_smooth_predictor_64x64_c, aom_smooth_v_predictor_64x64_c,
+                aom_smooth_h_predictor_64x64_c)
+
+INTRA_PRED_TEST(C_2, TX_64X32, aom_dc_predictor_64x32_c,
+                aom_dc_left_predictor_64x32_c, aom_dc_top_predictor_64x32_c,
+                aom_dc_128_predictor_64x32_c, aom_v_predictor_64x32_c,
+                aom_h_predictor_64x32_c, aom_paeth_predictor_64x32_c,
+                aom_smooth_predictor_64x32_c, aom_smooth_v_predictor_64x32_c,
+                aom_smooth_h_predictor_64x32_c)
+
+INTRA_PRED_TEST(C_3, TX_64X16, aom_dc_predictor_64x16_c,
+                aom_dc_left_predictor_64x16_c, aom_dc_top_predictor_64x16_c,
+                aom_dc_128_predictor_64x16_c, aom_v_predictor_64x16_c,
+                aom_h_predictor_64x16_c, aom_paeth_predictor_64x16_c,
+                aom_smooth_predictor_64x16_c, aom_smooth_v_predictor_64x16_c,
+                aom_smooth_h_predictor_64x16_c)
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_4, TX_64X64, aom_dc_predictor_64x64_sse2,
+                aom_dc_left_predictor_64x64_sse2,
+                aom_dc_top_predictor_64x64_sse2,
+                aom_dc_128_predictor_64x64_sse2, aom_v_predictor_64x64_sse2,
+                aom_h_predictor_64x64_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_5, TX_64X32, aom_dc_predictor_64x32_sse2,
+                aom_dc_left_predictor_64x32_sse2,
+                aom_dc_top_predictor_64x32_sse2,
+                aom_dc_128_predictor_64x32_sse2, aom_v_predictor_64x32_sse2,
+                aom_h_predictor_64x32_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_6, TX_64X16, aom_dc_predictor_64x16_sse2,
+                aom_dc_left_predictor_64x16_sse2,
+                aom_dc_top_predictor_64x16_sse2,
+                aom_dc_128_predictor_64x16_sse2, aom_v_predictor_64x16_sse2,
+                aom_h_predictor_64x16_sse2, NULL, NULL, NULL, NULL)
+#endif
+
+#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_64x64_ssse3,
+                aom_smooth_predictor_64x64_ssse3,
+                aom_smooth_v_predictor_64x64_ssse3,
+                aom_smooth_h_predictor_64x64_ssse3)
+INTRA_PRED_TEST(SSSE3_5, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_64x32_ssse3,
+                aom_smooth_predictor_64x32_ssse3,
+                aom_smooth_v_predictor_64x32_ssse3,
+                aom_smooth_h_predictor_64x32_ssse3)
+INTRA_PRED_TEST(SSSE3_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_64x16_ssse3,
+                aom_smooth_predictor_64x16_ssse3,
+                aom_smooth_v_predictor_64x16_ssse3,
+                aom_smooth_h_predictor_64x16_ssse3)
+#endif
+
+#if HAVE_AVX2
+INTRA_PRED_TEST(AVX2_4, TX_64X64, aom_dc_predictor_64x64_avx2,
+                aom_dc_left_predictor_64x64_avx2,
+                aom_dc_top_predictor_64x64_avx2,
+                aom_dc_128_predictor_64x64_avx2, aom_v_predictor_64x64_avx2,
+                NULL, aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_5, TX_64X32, aom_dc_predictor_64x32_avx2,
+                aom_dc_left_predictor_64x32_avx2,
+                aom_dc_top_predictor_64x32_avx2,
+                aom_dc_128_predictor_64x32_avx2, aom_v_predictor_64x32_avx2,
+                NULL, aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_6, TX_64X16, aom_dc_predictor_64x16_avx2,
+                aom_dc_left_predictor_64x16_avx2,
+                aom_dc_top_predictor_64x16_avx2,
+                aom_dc_128_predictor_64x16_avx2, aom_v_predictor_64x16_avx2,
+                NULL, aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
+#endif
+// -----------------------------------------------------------------------------
 // High Bitdepth
-#if CONFIG_HIGHBITDEPTH
 namespace {
 
 typedef void (*AvxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride,
@@ -826,17 +888,16 @@ typedef void (*AvxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride,
 
 typedef IntraPredTestMem<uint16_t> Av1HighbdIntraPredTestMem;
 
-void TestHighbdIntraPred(const char name[], AvxHighbdPredFunc const *pred_funcs,
-                         const char *const signatures[], int block_width,
-                         int block_height) {
+void TestHighbdIntraPred(TX_SIZE tx_size, AvxHighbdPredFunc const *pred_funcs,
+                         const char *const signatures[]) {
+  const int block_width = tx_size_wide[tx_size];
+  const int block_height = tx_size_high[tx_size];
   const int num_pixels_per_test =
       block_width * block_height * kNumAv1IntraFuncs;
   const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
   Av1HighbdIntraPredTestMem intra_pred_test_mem;
-  const uint16_t *const above = intra_pred_test_mem.above_mem + 16;
   const int bd = 12;
-
-  intra_pred_test_mem.Init(block_width, bd);
+  intra_pred_test_mem.Init(block_width, block_height, bd);
 
   for (int k = 0; k < kNumAv1IntraFuncs; ++k) {
     if (pred_funcs[k] == NULL) continue;
@@ -845,646 +906,559 @@ void TestHighbdIntraPred(const char name[], AvxHighbdPredFunc const *pred_funcs,
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
     for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
-      pred_funcs[k](intra_pred_test_mem.src, kBPS, above,
-                    intra_pred_test_mem.left, bd);
+      pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
+                    intra_pred_test_mem.above, intra_pred_test_mem.left, bd);
     }
     libaom_test::ClearSystemState();
     aom_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
-    CheckMd5Signature(name, signatures, intra_pred_test_mem.src,
-                      sizeof(intra_pred_test_mem.src), elapsed_time, k);
-  }
-}
-
-void TestHighbdIntraPred4(const char *block_name,
-                          AvxHighbdPredFunc const *pred_funcs) {
-  static const char *const kSignatures4x4[kNumAv1IntraFuncs] = {
-    "11f74af6c5737df472f3275cbde062fa",
-    "51bea056b6447c93f6eb8f6b7e8f6f71",
-    "27e97f946766331795886f4de04c5594",
-    "53ab15974b049111fb596c5168ec7e3f",
-    "f0b640bb176fbe4584cf3d32a9b0320a",
-    "729783ca909e03afd4b47111c80d967b",
-    "d631a8544ccc87702db3e98fac494657",
-    "293fc903254a33754133314c6cdba81f",
-    "f8074d704233e73dfd35b458c6092374",
-    "aa6363d08544a1ec4da33d7a0be5640d",
-    "0bdc21a3acdebc393bc2c22e71bbeada",
-    "a48f7a484ba4ad3916055c7160665b56",
-    "6e30009c45474a22032678b1bd579c8f",
-    "e57cba016d808aa8a35619df2a65f049",
-#if CONFIG_SMOOTH_HV
-    "55a6c37f39afcbbf5abca4a985b96459",
-    "a623d45b37dafec1f8a75c4c5218913d",
-#endif  // CONFIG_SMOOTH_HV
-  };
-  static const char *const kSignatures4x8[kNumAv1IntraFuncs] = {
-    "22d519b796d59644043466320e4ccd14",
-    "09513a738c49b3f9542d27f34abbe1d5",
-    "807ae5e8813443ff01e71be6efacfb69",
-    "cbfa18d0293430b6e9708b0be1fd2394",
-    "346c354c34ec7fa780b576db355dab88",
-    "f97dae85c35359632380b09ca98d611e",
-    "aed1beef71de33856c814ff7d63dd9db",
-    "49c47c04dd3d23d6fc5cc32bf9d40ae4",
-    "a24aade6e22b323ee28c8bf08aa2d234",
-    "aefef502f9e144e71cd27dc7383b3c28",
-    "b284ae5277b85ebdd16b5952149f7458",
-    "8dc5791167271f6f347582e07379f580",
-    "698ae351d8896d89ed9e4e67b6e53eda",
-    "dcc197034a9c45a3d8238bf085835f4e",
-#if CONFIG_SMOOTH_HV
-    "7a35e2c42ffdc2efc2d6d1d75a100fc7",
-    "41ab6cebd4516c87a91b2a593e2c2506",
-#endif
-  };
-
-  if (!strcmp(block_name, "Hbd Intra4x4")) {
-    TestHighbdIntraPred(block_name, pred_funcs, kSignatures4x4, 4, 4);
-  }
-  if (!strcmp(block_name, "Hbd Intra4x8")) {
-    TestHighbdIntraPred(block_name, pred_funcs, kSignatures4x8, 4, 8);
-  }
-}
-
-void TestHighbdIntraPred8(const char *block_name,
-                          AvxHighbdPredFunc const *pred_funcs) {
-  static const char *const kSignatures8x8[kNumAv1IntraFuncs] = {
-    "03da8829fe94663047fd108c5fcaa71d",
-    "ecdb37b8120a2d3a4c706b016bd1bfd7",
-    "1d4543ed8d2b9368cb96898095fe8a75",
-    "f791c9a67b913cbd82d9da8ecede30e2",
-    "065c70646f4dbaff913282f55a45a441",
-    "51f87123616662ef7c35691497dfd0ba",
-    "4f53cf8e5f43894dc0759f43c7081f60",
-    "9ffe186a6bc7db95275f1bbddd6f7aba",
-    "a3258a2eae2e2bd55cb8f71351b22998",
-    "8d909f0a2066e39b3216092c6289ece4",
-    "6751f60655aba44aff78aaaf4e967377",
-    "d31a449872fab968a8d41de578338780",
-    "85c01ba03df68f9ece7bd3fa0f8980e6",
-    "ad19b7dac092f56df6d054e1f67f21e7",
-#if CONFIG_SMOOTH_HV
-    "0edc415b5dd7299f7a34fb9f71d31d78",
-    "2bc8ec19e9f4b77a64b8a0a1f6aec7e7",
-#endif  // CONFIG_SMOOTH_HV
-  };
-  static const char *const kSignatures8x4[kNumAv1IntraFuncs] = {
-    "d58cd4c4bf3b7bbaa5db5e1a5622ec78",
-    "6e572c35aa782d00cafcb99e9ea047ea",
-    "e8c22a3702b416dc9ab974505afbed09",
-    "aaa4e4762a795aad7ad74de0c662c4e4",
-    "a19f9101967383c3dcbd516dc317a291",
-    "9ab8cb91f1a595b9ebe3fe8de58031aa",
-    "c6c7d65264397d4d31e378e1f1cfd921",
-    "5804158e463ff794b6b8a623f5d2c10d",
-    "c342cdeb39aae4c4f7be10e057029298",
-    "c1bbbcfe4b25f6b8eca6ad2f7ee793d3",
-    "98d1dab8b949859b9c65298ee9f105f8",
-    "396e803aaf6d7a03a231edc48b396051",
-    "2cf9021d5f1169268699807ee118b65f",
-    "ee9605fcbd6fb871f1c5cd81a6989327",
-#if CONFIG_SMOOTH_HV
-    "0edc415b5dd7299f7a34fb9f71d31d78",
-    "2bc8ec19e9f4b77a64b8a0a1f6aec7e7",
-#endif
-  };
-  static const char *const kSignatures8x16[kNumAv1IntraFuncs] = {
-    "4562de1d0336610880fdd5685498a9ec",
-    "16310fa7076394f16fc85c4b149d89c9",
-    "0e94af88e1dc573b6f0f499cddd1f530",
-    "dfd245ee20d091c67809160340365aa9",
-    "d3562504327f70c096c5be23fd8a3747",
-    "601b853558502acbb5135eadd2da117a",
-    "e83f9a8bc16b507d2ed0b6b31a25d6f5",
-    "fc8427d942246e8cba81247bb294afb5",
-    "89cde712e4c1ef675ea156ad679c62c7",
-    "0a68c2b28c3b171ad797cf76a7058f10",
-    "e70724010e12d8f374cedd3910ceb0d5",
-    "ad7987e91267503ba6fd3e8be42eb48c",
-    "3c624345a723a1b2b1bea05a6a08bc99",
-    "2a9c781de609e0184cc7ab442050f4e5",
-#if CONFIG_SMOOTH_HV
-    "0ddc5035c22252747126b61fc238c74d",
-    "e43f5d83bab759af69c7b6773fc8f9b2",
-#endif
-  };
-  if (!strcmp(block_name, "Hbd Intra8x8")) {
-    TestHighbdIntraPred(block_name, pred_funcs, kSignatures8x8, 8, 8);
-  }
-  if (!strcmp(block_name, "Hbd Intra8x4")) {
-    TestHighbdIntraPred(block_name, pred_funcs, kSignatures8x4, 8, 4);
-  }
-  if (!strcmp(block_name, "Hbd Intra8x16")) {
-    TestHighbdIntraPred(block_name, pred_funcs, kSignatures8x16, 8, 16);
-  }
-}
-
-void TestHighbdIntraPred16(const char *block_name,
-                           AvxHighbdPredFunc const *pred_funcs) {
-  static const char *const kSignatures16x16[kNumAv1IntraFuncs] = {
-    "e33cb3f56a878e2fddb1b2fc51cdd275",
-    "c7bff6f04b6052c8ab335d726dbbd52d",
-    "d0b0b47b654a9bcc5c6008110a44589b",
-    "78f5da7b10b2b9ab39f114a33b6254e9",
-    "c78e31d23831abb40d6271a318fdd6f3",
-    "90d1347f4ec9198a0320daecb6ff90b8",
-    "e38e12830e2ee5a01a064ec5998d5948",
-    "cf28bd387b81ad3e5f1a1c779a4b70a0",
-    "24c304330431ddeaf630f6ce94af2eac",
-    "91a329798036bf64e8e00a87b131b8b1",
-    "e536338d1a8ee192b9e591855db1a222",
-    "54ecd47737f71c62d24e3779585113f2",
-    "e63ded54ab3d0e8728b6f24d4f01e53f",
-    "35ce21fbe0ea114c089fc3489a78155d",
-#if CONFIG_SMOOTH_HV
-    "f277f6ef8e4d717f1f0dfe2706ac197d",
-    "e8014d3f41256976c02e0f1e622ba2b9",
-#endif  // CONFIG_SMOOTH_HV
-  };
-  static const char *const kSignatures16x8[kNumAv1IntraFuncs] = {
-    "a57d6b5a9bfd30c29591d8717ace9c51",
-    "f5907ba97ee6c53e339e953fc8d845ee",
-    "ea3aa727913ce45af06f89dd1808db5f",
-    "408af4f23e48d14b48ee35ae094fcd18",
-    "85c41cbcb5d744f7961e8950026fbffe",
-    "8a4e588a837638887ba671f8d4910485",
-    "caae3cc3d419bbd28aa389dbe4febee1",
-    "ea67fb80d71b6471467c79662af1186c",
-    "c83f7252412dd1ad2fc6af848e7f6be8",
-    "f45af3d697f42f1b9b8def4e46bac78c",
-    "dca4a2aaf5f63db387e264ba5963943a",
-    "d01b1bcc50b4b66c1231142eae628cd3",
-    "b792d8826b67a21757ea7097cff9e05b",
-    "f94ce7101bb87fd3bb9312112527dbf4",
-#if CONFIG_SMOOTH_HV
-    "688c6660a6dc6fa61fa1aa38e708c209",
-    "0cdf641b4f81d69509c92ae0b93ef5ff",
-#endif
-  };
-  static const char *const kSignatures16x32[kNumAv1IntraFuncs] = {
-    "aee4b3b0e3cc02d48e2c40d77f807927",
-    "8baef2b2e789f79c8df9d90ad10f34a4",
-    "038c38ee3c4f090bb8d736eab136aafc",
-    "1a3de2aaeaffd68a9fd6c7f6557b83f3",
-    "385c6e0ea29421dd81011a2934641e26",
-    "6cf96c285d1a2d4787f955dad715b08c",
-    "21f82421fda1c3afca8baca0dc048a52",
-    "eac3734852c99a051f6d15a921d9e7b9",
-    "c81f7ffec79508bf78d0f2c67d8abe96",
-    "14b8c62304f65a06653b9b35dfe12d97",
-    "e0893310042511275ae04e5186ee5326",
-    "b4f05903a6191093be719794417ac6fd",
-    "2d7f75dcd73b9528c8396279ff09ff3a",
-    "5a63cd1841e4ed470e4ca5ef845f2281",
-#if CONFIG_SMOOTH_HV
-    "610d899ca945fbead33287d4335a8b32",
-    "6bafaad81fce37be46730187e78d8b11",
-#endif
-  };
-  if (!strcmp(block_name, "Hbd Intra16x16")) {
-    TestHighbdIntraPred(block_name, pred_funcs, kSignatures16x16, 16, 16);
-  }
-  if (!strcmp(block_name, "Hbd Intra16x8")) {
-    TestHighbdIntraPred(block_name, pred_funcs, kSignatures16x8, 16, 8);
-  }
-  if (!strcmp(block_name, "Hbd Intra16x32")) {
-    TestHighbdIntraPred(block_name, pred_funcs, kSignatures16x32, 16, 32);
+    CheckMd5Signature(
+        tx_size, true, signatures, intra_pred_test_mem.src,
+        intra_pred_test_mem.num_pixels * sizeof(*intra_pred_test_mem.src),
+        elapsed_time, k);
   }
 }
 
-void TestHighbdIntraPred32(const char *block_name,
-                           AvxHighbdPredFunc const *pred_funcs) {
-  static const char *const kSignatures32x32[kNumAv1IntraFuncs] = {
-    "a3e8056ba7e36628cce4917cd956fedd",
-    "cc7d3024fe8748b512407edee045377e",
-    "2aab0a0f330a1d3e19b8ecb8f06387a3",
-    "a547bc3fb7b06910bf3973122a426661",
-    "26f712514da95042f93d6e8dc8e431dc",
-    "bb08c6e16177081daa3d936538dbc2e3",
-    "4e10f10b082a5b4265080c102d34eb47",
-    "42867c8553285e94ee8e4df7abafbda8",
-    "6496bdee96100667833f546e1be3d640",
-    "2ebfa25bf981377e682e580208504300",
-    "1788695b10a6f82ae1a56686dcbcd0a9",
-    "c3b9c506604a7132bbb5f4e97bdb03f0",
-    "84bf83f94a51b33654ca940c6f8bc057",
-    "7168b03fc31bf29596a344d6a35d007c",
-#if CONFIG_SMOOTH_HV
-    "b073a70d3672f1282236994f5d12e94b",
-    "c51607aebad5dcb3c1e3b58ef9e5b84e",
-#endif  // CONFIG_SMOOTH_HV
-  };
-  static const char *const kSignatures32x16[kNumAv1IntraFuncs] = {
-    "290b23c9f5a1de7905bfa71a942da29b",
-    "701e7b82593c66da5052fc4b6afd79ce",
-    "4da828c5455cd246735a663fbb204989",
-    "e3fbeaf234efece8dbd752b77226200c",
-    "4d1d8c969f05155a7e7e84cf7aad021b",
-    "c22e4877c2c946d5bdc0d542e29e70cf",
-    "ffd86b234d65c2e1386a5b5b5c188a69",
-    "50aaaa7d90e300b635ab18cdd73e189b",
-    "a945dc7429df168e2169d81b58a15859",
-    "66725070d7fad02dee78730ba0843e19",
-    "33d873cb05d45df2af4ff59033833db7",
-    "0dd783695b69271f65d56f5516fa6dc0",
-    "8ac1ce815e7780500f842b0beb0bb980",
-    "9fee2e2502b507f25bfad30a55b0b610",
-#if CONFIG_SMOOTH_HV
-    "4ced9c212ec6f9956e27f68a91b59fef",
-    "4a7a0b93f138bb0863e4e465b01ec0b1",
-#endif
-  };
-  if (!strcmp(block_name, "Hbd Intra32x32")) {
-    TestHighbdIntraPred(block_name, pred_funcs, kSignatures32x32, 32, 32);
-  }
-  if (!strcmp(block_name, "Hbd Intra32x16")) {
-    TestHighbdIntraPred(block_name, pred_funcs, kSignatures32x16, 32, 16);
-  }
-}
+static const char *const kHighbdSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
+  {
+      // 4X4
+      "11f74af6c5737df472f3275cbde062fa",
+      "51bea056b6447c93f6eb8f6b7e8f6f71",
+      "27e97f946766331795886f4de04c5594",
+      "53ab15974b049111fb596c5168ec7e3f",
+      "f0b640bb176fbe4584cf3d32a9b0320a",
+      "729783ca909e03afd4b47111c80d967b",
+      "6e30009c45474a22032678b1bd579c8f",
+      "e57cba016d808aa8a35619df2a65f049",
+      "55a6c37f39afcbbf5abca4a985b96459",
+      "a623d45b37dafec1f8a75c4c5218913d",
+  },
+  {
+      // 8X8
+      "03da8829fe94663047fd108c5fcaa71d",
+      "ecdb37b8120a2d3a4c706b016bd1bfd7",
+      "1d4543ed8d2b9368cb96898095fe8a75",
+      "f791c9a67b913cbd82d9da8ecede30e2",
+      "065c70646f4dbaff913282f55a45a441",
+      "51f87123616662ef7c35691497dfd0ba",
+      "85c01ba03df68f9ece7bd3fa0f8980e6",
+      "ad19b7dac092f56df6d054e1f67f21e7",
+      "0edc415b5dd7299f7a34fb9f71d31d78",
+      "2bc8ec19e9f4b77a64b8a0a1f6aec7e7",
+  },
+  {
+      // 16X16
+      "e33cb3f56a878e2fddb1b2fc51cdd275",
+      "c7bff6f04b6052c8ab335d726dbbd52d",
+      "d0b0b47b654a9bcc5c6008110a44589b",
+      "78f5da7b10b2b9ab39f114a33b6254e9",
+      "c78e31d23831abb40d6271a318fdd6f3",
+      "90d1347f4ec9198a0320daecb6ff90b8",
+      "e63ded54ab3d0e8728b6f24d4f01e53f",
+      "35ce21fbe0ea114c089fc3489a78155d",
+      "f277f6ef8e4d717f1f0dfe2706ac197d",
+      "e8014d3f41256976c02e0f1e622ba2b9",
+  },
+  {
+      // 32X32
+      "a3e8056ba7e36628cce4917cd956fedd",
+      "cc7d3024fe8748b512407edee045377e",
+      "2aab0a0f330a1d3e19b8ecb8f06387a3",
+      "a547bc3fb7b06910bf3973122a426661",
+      "26f712514da95042f93d6e8dc8e431dc",
+      "bb08c6e16177081daa3d936538dbc2e3",
+      "84bf83f94a51b33654ca940c6f8bc057",
+      "7168b03fc31bf29596a344d6a35d007c",
+      "b073a70d3672f1282236994f5d12e94b",
+      "c51607aebad5dcb3c1e3b58ef9e5b84e",
+  },
+  {
+      // 64X64
+      "a6baa0d4bfb2269a94c7a38f86a4bccf",
+      "3f1ef5f473a49eba743f17a3324adf9d",
+      "12ac11889ae5f55b7781454efd706a6a",
+      "d9a906c0e692b22e1b4414e71a704b7e",
+      "47d4cadd56f70c11ff8f3e5d8df81161",
+      "de997744cf24c16c5ac2a36b02b351cc",
+      "23781211ae178ddeb6c4bb97a6bd7d83",
+      "a79d2e28340ca34b9e37daabbf030f63",
+      "0372bd3ddfc258750a6ac106b70587f4",
+      "228ef625d9460cbf6fa253a16a730976",
+  },
+  {
+      // 4X8
+      "22d519b796d59644043466320e4ccd14",
+      "09513a738c49b3f9542d27f34abbe1d5",
+      "807ae5e8813443ff01e71be6efacfb69",
+      "cbfa18d0293430b6e9708b0be1fd2394",
+      "346c354c34ec7fa780b576db355dab88",
+      "f97dae85c35359632380b09ca98d611e",
+      "698ae351d8896d89ed9e4e67b6e53eda",
+      "dcc197034a9c45a3d8238bf085835f4e",
+      "7a35e2c42ffdc2efc2d6d1d75a100fc7",
+      "41ab6cebd4516c87a91b2a593e2c2506",
+  },
+  {
+      // 8X4
+      "d58cd4c4bf3b7bbaa5db5e1a5622ec78",
+      "6e572c35aa782d00cafcb99e9ea047ea",
+      "e8c22a3702b416dc9ab974505afbed09",
+      "aaa4e4762a795aad7ad74de0c662c4e4",
+      "a19f9101967383c3dcbd516dc317a291",
+      "9ab8cb91f1a595b9ebe3fe8de58031aa",
+      "2cf9021d5f1169268699807ee118b65f",
+      "ee9605fcbd6fb871f1c5cd81a6989327",
+      "b4871af8316089e3e23522175df7e93f",
+      "d33301e1c2cb173be46792a22d19881a",
+  },
+  {
+      // 8X16
+      "4562de1d0336610880fdd5685498a9ec",
+      "16310fa7076394f16fc85c4b149d89c9",
+      "0e94af88e1dc573b6f0f499cddd1f530",
+      "dfd245ee20d091c67809160340365aa9",
+      "d3562504327f70c096c5be23fd8a3747",
+      "601b853558502acbb5135eadd2da117a",
+      "3c624345a723a1b2b1bea05a6a08bc99",
+      "2a9c781de609e0184cc7ab442050f4e5",
+      "0ddc5035c22252747126b61fc238c74d",
+      "e43f5d83bab759af69c7b6773fc8f9b2",
+  },
+  {
+      // 16X8
+      "a57d6b5a9bfd30c29591d8717ace9c51",
+      "f5907ba97ee6c53e339e953fc8d845ee",
+      "ea3aa727913ce45af06f89dd1808db5f",
+      "408af4f23e48d14b48ee35ae094fcd18",
+      "85c41cbcb5d744f7961e8950026fbffe",
+      "8a4e588a837638887ba671f8d4910485",
+      "b792d8826b67a21757ea7097cff9e05b",
+      "f94ce7101bb87fd3bb9312112527dbf4",
+      "688c6660a6dc6fa61fa1aa38e708c209",
+      "0cdf641b4f81d69509c92ae0b93ef5ff",
+  },
+  {
+      // 16X32
+      "aee4b3b0e3cc02d48e2c40d77f807927",
+      "8baef2b2e789f79c8df9d90ad10f34a4",
+      "038c38ee3c4f090bb8d736eab136aafc",
+      "1a3de2aaeaffd68a9fd6c7f6557b83f3",
+      "385c6e0ea29421dd81011a2934641e26",
+      "6cf96c285d1a2d4787f955dad715b08c",
+      "2d7f75dcd73b9528c8396279ff09ff3a",
+      "5a63cd1841e4ed470e4ca5ef845f2281",
+      "610d899ca945fbead33287d4335a8b32",
+      "6bafaad81fce37be46730187e78d8b11",
+  },
+  {
+      // 32X16
+      "290b23c9f5a1de7905bfa71a942da29b",
+      "701e7b82593c66da5052fc4b6afd79ce",
+      "4da828c5455cd246735a663fbb204989",
+      "e3fbeaf234efece8dbd752b77226200c",
+      "4d1d8c969f05155a7e7e84cf7aad021b",
+      "c22e4877c2c946d5bdc0d542e29e70cf",
+      "8ac1ce815e7780500f842b0beb0bb980",
+      "9fee2e2502b507f25bfad30a55b0b610",
+      "4ced9c212ec6f9956e27f68a91b59fef",
+      "4a7a0b93f138bb0863e4e465b01ec0b1",
+  },
+  {
+      // 32X64
+      "ad9cfc395a5c5644a21d958c7274ac14",
+      "f29d6d03c143ddf96fef04c19f2c8333",
+      "a8bdc852ef704dd4975c61893e8fbc3f",
+      "7d0bd7dea26226741dbca9a97f27fa74",
+      "45c27c5cca9a91b6ae8379feb0881c9f",
+      "8a0b78df1e001b85c874d686eac4aa1b",
+      "ce9fa75fac54a3f6c0cc3f2083b938f1",
+      "c0dca10d88762c954af18dc9e3791a39",
+      "61df229eddfccab913b8fda4bb02f9ac",
+      "4f4df6bc8d50a5600b573f0e44d70e66",
+  },
+  {
+      // 64X32
+      "db9d82921fd88b24fdff6f849f2f9c87",
+      "5ecc7fdc52d2f575ad4f2d0e9e6b1e11",
+      "b4581311a0a73d95dfac7f8f44591032",
+      "68bd283cfd1a125f6b2ee47cee874d36",
+      "804179f05c032908a5e36077bb87c994",
+      "fc5fd041a8ee779015394d0c066ee43c",
+      "68f5579ccadfe9a1baafb158334a3db2",
+      "fe237e45e215ab06d79046da9ad71e84",
+      "9a8a938a6824551bf7d21b8fd1d70ea1",
+      "eb7332f2017cd96882c76e7136aeaf53",
+  },
+  {
+      // 4X16
+      "7bafa307d507747b8132e7735b7f1c73",
+      "e58bc2d8213a97d1fea9cfb73d7a9633",
+      "435f8a8e8bbf14dbf2fe16b2be9e97aa",
+      "1d0e767b68d84acbfb50b7a04e633836",
+      "5f713bd7b324fe73bb7063e35ee14e5e",
+      "0dac4e1fa3d59814202715468c01ed56",
+      "47709d1db4a330c7a8900f450e6fddd1",
+      "258e0b930bb27db28f05da9cf7d1ee7c",
+      "36cf030fbae767912593efea045bfff5",
+      "248d7aceabb7499febae663fae41a920",
+  },
+  {
+      // 16X4
+      "04dde98e632670e393704742c89f9067",
+      "8c72543f1664651ae1fa08e2ac0adb9b",
+      "2354a2cdc2773aa2df8ab4010db1be39",
+      "6300ad3221c26da39b10e0e6d87ee3be",
+      "8ea30b661c6ba60b28d3167f19e449b8",
+      "fb6c1e4ff101a371cede63c2955cdb7e",
+      "a517c06433d6d7927b16a72184a23e92",
+      "393828be5d62ab6c48668bea5e2f801a",
+      "b1e510c542013eb9d6fb188dea2ce90a",
+      "569a8f2fe01679ca216535ecbcdccb62",
+  },
+  {
+      // 8X32
+      "9d541865c185ca7607852852613ac1fc",
+      "b96be67f08c6b5fa5ebd3411299c2f7c",
+      "75a2dcf50004b9d188849b048239767e",
+      "429492ff415c9fd9b050d73b2ad500f8",
+      "64b3606c1ccd036bd766bd5711392cf4",
+      "cb59844a0f01660ac955bae3511f1100",
+      "3e076155b7a70e8828618e3f33b51e3d",
+      "ed2d1f597ab7c50beff690f737cf9726",
+      "7909c6a26aaf20c59d996d3e5b5f9c29",
+      "965798807240c98c6f7cc9b457ed0773",
+  },
+  {
+      // 32X8
+      "36f391aa31619eec1f4d9ee95ea454cc",
+      "b82648f14eeba2527357cb50bc3223cb",
+      "7a7b2adf429125e8bee9d1d00a66e13f",
+      "4198e4d6ba503b7cc2d7e96bb845f661",
+      "96c160d2ec1be9fe0cdea9682f14d257",
+      "19a450bcebaa75afb4fc6bd1fd6434af",
+      "2bd2e35967d43d0ec1c6587a36f204d5",
+      "49799a99aa4ccfbd989bee92a99422f1",
+      "955530e99813812a74659edeac3f5475",
+      "f0316b84e378a19cd11b19a6e40b2914",
+  },
+  {
+      // 16X64
+      "8cba1b70a0bde29e8ef235cedc5faa7d",
+      "96d00ddc7537bf7f196006591b733b4e",
+      "cbf69d5d157c9f3355a4757b1d6e3414",
+      "3ac1f642019493dec1b737d7a3a1b4e5",
+      "35f9ee300d7fa3c97338e81a6f21dcd4",
+      "aae335442e77c8ebc280f16ea50ba9c7",
+      "a6140fdac2278644328be094d88731db",
+      "2df93621b6ff100f7008432d509f4161",
+      "c77bf5aee39e7ed4a3dd715f816f452a",
+      "02109bd63557d90225c32a8f1338258e",
+  },
+  {
+      // 64X16
+      "a5e2f9fb685d5f4a048e9a96affd25a4",
+      "1348f249690d9eefe09d9ad7ead2c801",
+      "525da4b187acd81b1ff1116b60461141",
+      "e99d072de858094c98b01bd4a6772634",
+      "873bfa9dc24693f19721f7c8d527f7d3",
+      "0acfc6507bd3468e9679efc127d6e4b9",
+      "57d03f8d079c7264854e22ac1157cfae",
+      "6c2c4036f70c7d957a9399b5436c0774",
+      "42b8e4a97b7f8416c72a5148c031c0b1",
+      "a38a2c5f79993dfae8530e9e25800893",
+  },
+};
 
 }  // namespace
 
-#define HIGHBD_INTRA_PRED_TEST(arch, test_func, block_size, dc, dc_left,     \
-                               dc_top, dc_128, v, h, d45e, d135, d117, d153, \
-                               d207e, d63e, tm, smooth, smooth_v, smooth_h)  \
-  TEST(arch, DISABLED_##test_func) {                                         \
-    static const AvxHighbdPredFunc aom_intra_pred[] = {                      \
-      dc,   dc_left, dc_top, dc_128, v,  h,      d45e,     d135,             \
-      d117, d153,    d207e,  d63e,   tm, smooth, smooth_v, smooth_h          \
-    };                                                                       \
-    test_func(block_size, aom_intra_pred);                                   \
+#define HIGHBD_INTRA_PRED_TEST(arch, tx_size, dc, dc_left, dc_top, dc_128, v, \
+                               h, paeth, smooth, smooth_v, smooth_h)          \
+  TEST(arch, DISABLED_##TestHighbdIntraPred_##tx_size) {                      \
+    static const AvxHighbdPredFunc aom_intra_pred[] = {                       \
+      dc, dc_left, dc_top, dc_128, v, h, paeth, smooth, smooth_v, smooth_h    \
+    };                                                                        \
+    TestHighbdIntraPred(tx_size, aom_intra_pred, kHighbdSignatures[tx_size]); \
   }
 
 // -----------------------------------------------------------------------------
-// 4x4
-
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_highbd_smooth_v_predictor_4x4_c
-#define smooth_h_pred_func aom_highbd_smooth_h_predictor_4x4_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+// 4x4, 4x8, 4x16
 
 HIGHBD_INTRA_PRED_TEST(
-    C_1, TestHighbdIntraPred4, "Hbd Intra4x4", aom_highbd_dc_predictor_4x4_c,
+    C_1, TX_4X4, aom_highbd_dc_predictor_4x4_c,
     aom_highbd_dc_left_predictor_4x4_c, aom_highbd_dc_top_predictor_4x4_c,
     aom_highbd_dc_128_predictor_4x4_c, aom_highbd_v_predictor_4x4_c,
-    aom_highbd_h_predictor_4x4_c, aom_highbd_d45e_predictor_4x4_c,
-    aom_highbd_d135_predictor_4x4_c, aom_highbd_d117_predictor_4x4_c,
-    aom_highbd_d153_predictor_4x4_c, aom_highbd_d207e_predictor_4x4_c,
-    aom_highbd_d63e_predictor_4x4_c, aom_highbd_paeth_predictor_4x4_c,
-    aom_highbd_smooth_predictor_4x4_c, smooth_v_pred_func, smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
+    aom_highbd_h_predictor_4x4_c, aom_highbd_paeth_predictor_4x4_c,
+    aom_highbd_smooth_predictor_4x4_c, aom_highbd_smooth_v_predictor_4x4_c,
+    aom_highbd_smooth_h_predictor_4x4_c)
+
+HIGHBD_INTRA_PRED_TEST(
+    C_2, TX_4X8, aom_highbd_dc_predictor_4x8_c,
+    aom_highbd_dc_left_predictor_4x8_c, aom_highbd_dc_top_predictor_4x8_c,
+    aom_highbd_dc_128_predictor_4x8_c, aom_highbd_v_predictor_4x8_c,
+    aom_highbd_h_predictor_4x8_c, aom_highbd_paeth_predictor_4x8_c,
+    aom_highbd_smooth_predictor_4x8_c, aom_highbd_smooth_v_predictor_4x8_c,
+    aom_highbd_smooth_h_predictor_4x8_c)
 
-#if HAVE_SSE2
 HIGHBD_INTRA_PRED_TEST(
-    SSE2_1, TestHighbdIntraPred4, "Hbd Intra4x4",
-    aom_highbd_dc_predictor_4x4_sse2, aom_highbd_dc_left_predictor_4x4_sse2,
-    aom_highbd_dc_top_predictor_4x4_sse2, aom_highbd_dc_128_predictor_4x4_sse2,
-    aom_highbd_v_predictor_4x4_sse2, aom_highbd_h_predictor_4x4_sse2,
-    aom_highbd_d45e_predictor_4x4_sse2, aom_highbd_d135_predictor_4x4_sse2,
-    aom_highbd_d117_predictor_4x4_sse2, aom_highbd_d153_predictor_4x4_sse2,
-    NULL, NULL, NULL, NULL, NULL, NULL)
-
-HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred4, "Hbd Intra4x8",
-                       aom_highbd_dc_predictor_4x8_sse2,
+    C_3, TX_4X16, aom_highbd_dc_predictor_4x16_c,
+    aom_highbd_dc_left_predictor_4x16_c, aom_highbd_dc_top_predictor_4x16_c,
+    aom_highbd_dc_128_predictor_4x16_c, aom_highbd_v_predictor_4x16_c,
+    aom_highbd_h_predictor_4x16_c, aom_highbd_paeth_predictor_4x16_c,
+    aom_highbd_smooth_predictor_4x16_c, aom_highbd_smooth_v_predictor_4x16_c,
+    aom_highbd_smooth_h_predictor_4x16_c)
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_highbd_dc_predictor_4x4_sse2,
+                       aom_highbd_dc_left_predictor_4x4_sse2,
+                       aom_highbd_dc_top_predictor_4x4_sse2,
+                       aom_highbd_dc_128_predictor_4x4_sse2,
+                       aom_highbd_v_predictor_4x4_sse2,
+                       aom_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_4X8, aom_highbd_dc_predictor_4x8_sse2,
                        aom_highbd_dc_left_predictor_4x8_sse2,
                        aom_highbd_dc_top_predictor_4x8_sse2,
                        aom_highbd_dc_128_predictor_4x8_sse2,
                        aom_highbd_v_predictor_4x8_sse2,
-                       aom_highbd_h_predictor_4x8_sse2,
-                       aom_highbd_d45e_predictor_4x8_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
+                       aom_highbd_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL)
 #endif
 
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_highbd_smooth_v_predictor_4x8_c
-#define smooth_h_pred_func aom_highbd_smooth_h_predictor_4x8_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+// -----------------------------------------------------------------------------
+// 8x8, 8x4, 8x16, 8x32
 
 HIGHBD_INTRA_PRED_TEST(
-    C_2, TestHighbdIntraPred4, "Hbd Intra4x8", aom_highbd_dc_predictor_4x8_c,
-    aom_highbd_dc_left_predictor_4x8_c, aom_highbd_dc_top_predictor_4x8_c,
-    aom_highbd_dc_128_predictor_4x8_c, aom_highbd_v_predictor_4x8_c,
-    aom_highbd_h_predictor_4x8_c, aom_highbd_d45e_predictor_4x8_c,
-    aom_highbd_d135_predictor_4x8_c, aom_highbd_d117_predictor_4x8_c,
-    aom_highbd_d153_predictor_4x8_c, aom_highbd_d207e_predictor_4x8_c,
-    aom_highbd_d63e_predictor_4x8_c, aom_highbd_paeth_predictor_4x8_c,
-    aom_highbd_smooth_predictor_4x8_c, smooth_v_pred_func, smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
+    C_1, TX_8X8, aom_highbd_dc_predictor_8x8_c,
+    aom_highbd_dc_left_predictor_8x8_c, aom_highbd_dc_top_predictor_8x8_c,
+    aom_highbd_dc_128_predictor_8x8_c, aom_highbd_v_predictor_8x8_c,
+    aom_highbd_h_predictor_8x8_c, aom_highbd_paeth_predictor_8x8_c,
+    aom_highbd_smooth_predictor_8x8_c, aom_highbd_smooth_v_predictor_8x8_c,
+    aom_highbd_smooth_h_predictor_8x8_c)
 
-// -----------------------------------------------------------------------------
-// 8x8
+HIGHBD_INTRA_PRED_TEST(
+    C_2, TX_8X4, aom_highbd_dc_predictor_8x4_c,
+    aom_highbd_dc_left_predictor_8x4_c, aom_highbd_dc_top_predictor_8x4_c,
+    aom_highbd_dc_128_predictor_8x4_c, aom_highbd_v_predictor_8x4_c,
+    aom_highbd_h_predictor_8x4_c, aom_highbd_paeth_predictor_8x4_c,
+    aom_highbd_smooth_predictor_8x4_c, aom_highbd_smooth_v_predictor_8x4_c,
+    aom_highbd_smooth_h_predictor_8x4_c)
 
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_highbd_smooth_v_predictor_8x8_c
-#define smooth_h_pred_func aom_highbd_smooth_h_predictor_8x8_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+HIGHBD_INTRA_PRED_TEST(
+    C_3, TX_8X16, aom_highbd_dc_predictor_8x16_c,
+    aom_highbd_dc_left_predictor_8x16_c, aom_highbd_dc_top_predictor_8x16_c,
+    aom_highbd_dc_128_predictor_8x16_c, aom_highbd_v_predictor_8x16_c,
+    aom_highbd_h_predictor_8x16_c, aom_highbd_paeth_predictor_8x16_c,
+    aom_highbd_smooth_predictor_8x16_c, aom_highbd_smooth_v_predictor_8x16_c,
+    aom_highbd_smooth_h_predictor_8x16_c)
 
 HIGHBD_INTRA_PRED_TEST(
-    C_1, TestHighbdIntraPred8, "Hbd Intra8x8", aom_highbd_dc_predictor_8x8_c,
-    aom_highbd_dc_left_predictor_8x8_c, aom_highbd_dc_top_predictor_8x8_c,
-    aom_highbd_dc_128_predictor_8x8_c, aom_highbd_v_predictor_8x8_c,
-    aom_highbd_h_predictor_8x8_c, aom_highbd_d45e_predictor_8x8_c,
-    aom_highbd_d135_predictor_8x8_c, aom_highbd_d117_predictor_8x8_c,
-    aom_highbd_d153_predictor_8x8_c, aom_highbd_d207e_predictor_8x8_c,
-    aom_highbd_d63e_predictor_8x8_c, aom_highbd_paeth_predictor_8x8_c,
-    aom_highbd_smooth_predictor_8x8_c, smooth_v_pred_func, smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
+    C_4, TX_8X32, aom_highbd_dc_predictor_8x32_c,
+    aom_highbd_dc_left_predictor_8x32_c, aom_highbd_dc_top_predictor_8x32_c,
+    aom_highbd_dc_128_predictor_8x32_c, aom_highbd_v_predictor_8x32_c,
+    aom_highbd_h_predictor_8x32_c, aom_highbd_paeth_predictor_8x32_c,
+    aom_highbd_smooth_predictor_8x32_c, aom_highbd_smooth_v_predictor_8x32_c,
+    aom_highbd_smooth_h_predictor_8x32_c)
 
 #if HAVE_SSE2
-HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred8, "Hbd Intra8x8",
-                       aom_highbd_dc_predictor_8x8_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_highbd_dc_predictor_8x8_sse2,
                        aom_highbd_dc_left_predictor_8x8_sse2,
                        aom_highbd_dc_top_predictor_8x8_sse2,
                        aom_highbd_dc_128_predictor_8x8_sse2,
                        aom_highbd_v_predictor_8x8_sse2,
-                       aom_highbd_h_predictor_8x8_sse2,
-                       aom_highbd_d45e_predictor_8x8_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred8, "Hbd Intra8x4",
-                       aom_highbd_dc_predictor_8x4_sse2,
+                       aom_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_8X4, aom_highbd_dc_predictor_8x4_sse2,
                        aom_highbd_dc_left_predictor_8x4_sse2,
                        aom_highbd_dc_top_predictor_8x4_sse2,
                        aom_highbd_dc_128_predictor_8x4_sse2,
                        aom_highbd_v_predictor_8x4_sse2,
-                       aom_highbd_h_predictor_8x4_sse2,
-                       aom_highbd_d45e_predictor_8x4_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(SSE2_3, TestHighbdIntraPred8, "Hbd Intra8x16",
-                       aom_highbd_dc_predictor_8x16_sse2,
+                       aom_highbd_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_3, TX_8X16, aom_highbd_dc_predictor_8x16_sse2,
                        aom_highbd_dc_left_predictor_8x16_sse2,
                        aom_highbd_dc_top_predictor_8x16_sse2,
                        aom_highbd_dc_128_predictor_8x16_sse2,
                        aom_highbd_v_predictor_8x16_sse2,
-                       aom_highbd_h_predictor_8x16_sse2,
-                       aom_highbd_d45e_predictor_8x16_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
+                       aom_highbd_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL)
 #endif
 
 #if HAVE_SSSE3
-HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, "Hbd Intra8x8", NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL,
-                       aom_highbd_d135_predictor_8x8_ssse3,
-                       aom_highbd_d117_predictor_8x8_ssse3,
-                       aom_highbd_d153_predictor_8x8_ssse3, NULL, NULL, NULL,
+HIGHBD_INTRA_PRED_TEST(SSSE3, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                        NULL, NULL, NULL)
 #endif
 
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_highbd_smooth_v_predictor_8x4_c
-#define smooth_h_pred_func aom_highbd_smooth_h_predictor_8x4_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+// -----------------------------------------------------------------------------
+// 16x16, 16x8, 16x32, 16x4, 16x64
 
 HIGHBD_INTRA_PRED_TEST(
-    C_2, TestHighbdIntraPred8, "Hbd Intra8x4", aom_highbd_dc_predictor_8x4_c,
-    aom_highbd_dc_left_predictor_8x4_c, aom_highbd_dc_top_predictor_8x4_c,
-    aom_highbd_dc_128_predictor_8x4_c, aom_highbd_v_predictor_8x4_c,
-    aom_highbd_h_predictor_8x4_c, aom_highbd_d45e_predictor_8x4_c,
-    aom_highbd_d135_predictor_8x4_c, aom_highbd_d117_predictor_8x4_c,
-    aom_highbd_d153_predictor_8x4_c, aom_highbd_d207e_predictor_8x4_c,
-    aom_highbd_d63e_predictor_8x4_c, aom_highbd_paeth_predictor_8x4_c,
-    aom_highbd_smooth_predictor_8x4_c, smooth_v_pred_func, smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
-
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_highbd_smooth_v_predictor_8x16_c
-#define smooth_h_pred_func aom_highbd_smooth_h_predictor_8x16_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+    C_1, TX_16X16, aom_highbd_dc_predictor_16x16_c,
+    aom_highbd_dc_left_predictor_16x16_c, aom_highbd_dc_top_predictor_16x16_c,
+    aom_highbd_dc_128_predictor_16x16_c, aom_highbd_v_predictor_16x16_c,
+    aom_highbd_h_predictor_16x16_c, aom_highbd_paeth_predictor_16x16_c,
+    aom_highbd_smooth_predictor_16x16_c, aom_highbd_smooth_v_predictor_16x16_c,
+    aom_highbd_smooth_h_predictor_16x16_c)
 
 HIGHBD_INTRA_PRED_TEST(
-    C_3, TestHighbdIntraPred8, "Hbd Intra8x16", aom_highbd_dc_predictor_8x16_c,
-    aom_highbd_dc_left_predictor_8x16_c, aom_highbd_dc_top_predictor_8x16_c,
-    aom_highbd_dc_128_predictor_8x16_c, aom_highbd_v_predictor_8x16_c,
-    aom_highbd_h_predictor_8x16_c, aom_highbd_d45e_predictor_8x16_c,
-    aom_highbd_d135_predictor_8x16_c, aom_highbd_d117_predictor_8x16_c,
-    aom_highbd_d153_predictor_8x16_c, aom_highbd_d207e_predictor_8x16_c,
-    aom_highbd_d63e_predictor_8x16_c, aom_highbd_paeth_predictor_8x16_c,
-    aom_highbd_smooth_predictor_8x16_c, smooth_v_pred_func, smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
+    C_2, TX_16X8, aom_highbd_dc_predictor_16x8_c,
+    aom_highbd_dc_left_predictor_16x8_c, aom_highbd_dc_top_predictor_16x8_c,
+    aom_highbd_dc_128_predictor_16x8_c, aom_highbd_v_predictor_16x8_c,
+    aom_highbd_h_predictor_16x8_c, aom_highbd_paeth_predictor_16x8_c,
+    aom_highbd_smooth_predictor_16x8_c, aom_highbd_smooth_v_predictor_16x8_c,
+    aom_highbd_smooth_h_predictor_16x8_c)
 
-// -----------------------------------------------------------------------------
-// 16x16
+HIGHBD_INTRA_PRED_TEST(
+    C_3, TX_16X32, aom_highbd_dc_predictor_16x32_c,
+    aom_highbd_dc_left_predictor_16x32_c, aom_highbd_dc_top_predictor_16x32_c,
+    aom_highbd_dc_128_predictor_16x32_c, aom_highbd_v_predictor_16x32_c,
+    aom_highbd_h_predictor_16x32_c, aom_highbd_paeth_predictor_16x32_c,
+    aom_highbd_smooth_predictor_16x32_c, aom_highbd_smooth_v_predictor_16x32_c,
+    aom_highbd_smooth_h_predictor_16x32_c)
 
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_highbd_smooth_v_predictor_16x16_c
-#define smooth_h_pred_func aom_highbd_smooth_h_predictor_16x16_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+HIGHBD_INTRA_PRED_TEST(
+    C_4, TX_16X4, aom_highbd_dc_predictor_16x4_c,
+    aom_highbd_dc_left_predictor_16x4_c, aom_highbd_dc_top_predictor_16x4_c,
+    aom_highbd_dc_128_predictor_16x4_c, aom_highbd_v_predictor_16x4_c,
+    aom_highbd_h_predictor_16x4_c, aom_highbd_paeth_predictor_16x4_c,
+    aom_highbd_smooth_predictor_16x4_c, aom_highbd_smooth_v_predictor_16x4_c,
+    aom_highbd_smooth_h_predictor_16x4_c)
 
 HIGHBD_INTRA_PRED_TEST(
-    C_1, TestHighbdIntraPred16, "Hbd Intra16x16",
-    aom_highbd_dc_predictor_16x16_c, aom_highbd_dc_left_predictor_16x16_c,
-    aom_highbd_dc_top_predictor_16x16_c, aom_highbd_dc_128_predictor_16x16_c,
-    aom_highbd_v_predictor_16x16_c, aom_highbd_h_predictor_16x16_c,
-    aom_highbd_d45e_predictor_16x16_c, aom_highbd_d135_predictor_16x16_c,
-    aom_highbd_d117_predictor_16x16_c, aom_highbd_d153_predictor_16x16_c,
-    aom_highbd_d207e_predictor_16x16_c, aom_highbd_d63e_predictor_16x16_c,
-    aom_highbd_paeth_predictor_16x16_c, aom_highbd_smooth_predictor_16x16_c,
-    smooth_v_pred_func, smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
+    C_5, TX_16X64, aom_highbd_dc_predictor_16x64_c,
+    aom_highbd_dc_left_predictor_16x64_c, aom_highbd_dc_top_predictor_16x64_c,
+    aom_highbd_dc_128_predictor_16x64_c, aom_highbd_v_predictor_16x64_c,
+    aom_highbd_h_predictor_16x64_c, aom_highbd_paeth_predictor_16x64_c,
+    aom_highbd_smooth_predictor_16x64_c, aom_highbd_smooth_v_predictor_16x64_c,
+    aom_highbd_smooth_h_predictor_16x64_c)
 
 #if HAVE_SSE2
-HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred16, "Hbd Intra16x16",
-                       aom_highbd_dc_predictor_16x16_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_highbd_dc_predictor_16x16_sse2,
                        aom_highbd_dc_left_predictor_16x16_sse2,
                        aom_highbd_dc_top_predictor_16x16_sse2,
                        aom_highbd_dc_128_predictor_16x16_sse2,
                        aom_highbd_v_predictor_16x16_sse2,
                        aom_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred16, "Hbd Intra16x8",
-                       aom_highbd_dc_predictor_16x8_sse2,
+                       NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_16X8, aom_highbd_dc_predictor_16x8_sse2,
                        aom_highbd_dc_left_predictor_16x8_sse2,
                        aom_highbd_dc_top_predictor_16x8_sse2,
                        aom_highbd_dc_128_predictor_16x8_sse2,
                        aom_highbd_v_predictor_16x8_sse2,
-                       aom_highbd_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(SSE2_3, TestHighbdIntraPred16, "Hbd Intra16x32",
-                       aom_highbd_dc_predictor_16x32_sse2,
+                       aom_highbd_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_highbd_dc_predictor_16x32_sse2,
                        aom_highbd_dc_left_predictor_16x32_sse2,
                        aom_highbd_dc_top_predictor_16x32_sse2,
                        aom_highbd_dc_128_predictor_16x32_sse2,
                        aom_highbd_v_predictor_16x32_sse2,
                        aom_highbd_h_predictor_16x32_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+                       NULL)
 #endif
 
 #if HAVE_SSSE3
-HIGHBD_INTRA_PRED_TEST(SSSE3_1, TestHighbdIntraPred16, "Hbd Intra16x16", NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL,
-                       aom_highbd_d135_predictor_16x16_ssse3,
-                       aom_highbd_d117_predictor_16x16_ssse3,
-                       aom_highbd_d153_predictor_16x16_ssse3, NULL, NULL, NULL,
-                       NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL)
 #endif
 
 #if HAVE_AVX2
-HIGHBD_INTRA_PRED_TEST(AVX2_1, TestHighbdIntraPred16, "Hbd Intra16x16", NULL,
-                       NULL, NULL, NULL, NULL, NULL,
-                       aom_highbd_d45e_predictor_16x16_avx2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
-
-HIGHBD_INTRA_PRED_TEST(AVX2_2, TestHighbdIntraPred16, "Hbd Intra16x8", NULL,
-                       NULL, NULL, NULL, NULL, NULL,
-                       aom_highbd_d45e_predictor_16x8_avx2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
-
-HIGHBD_INTRA_PRED_TEST(AVX2_3, TestHighbdIntraPred16, "Hbd Intra16x32", NULL,
-                       NULL, NULL, NULL, NULL, NULL,
-                       aom_highbd_d45e_predictor_16x32_avx2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(AVX2_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(AVX2_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL)
 #endif
 
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_highbd_smooth_v_predictor_16x8_c
-#define smooth_h_pred_func aom_highbd_smooth_h_predictor_16x8_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+// -----------------------------------------------------------------------------
+// 32x32, 32x16, 32x64, 32x8
 
 HIGHBD_INTRA_PRED_TEST(
-    C_2, TestHighbdIntraPred16, "Hbd Intra16x8", aom_highbd_dc_predictor_16x8_c,
-    aom_highbd_dc_left_predictor_16x8_c, aom_highbd_dc_top_predictor_16x8_c,
-    aom_highbd_dc_128_predictor_16x8_c, aom_highbd_v_predictor_16x8_c,
-    aom_highbd_h_predictor_16x8_c, aom_highbd_d45e_predictor_16x8_c,
-    aom_highbd_d135_predictor_16x8_c, aom_highbd_d117_predictor_16x8_c,
-    aom_highbd_d153_predictor_16x8_c, aom_highbd_d207e_predictor_16x8_c,
-    aom_highbd_d63e_predictor_16x8_c, aom_highbd_paeth_predictor_16x8_c,
-    aom_highbd_smooth_predictor_16x8_c, smooth_v_pred_func, smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
-
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_highbd_smooth_v_predictor_16x32_c
-#define smooth_h_pred_func aom_highbd_smooth_h_predictor_16x32_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+    C_1, TX_32X32, aom_highbd_dc_predictor_32x32_c,
+    aom_highbd_dc_left_predictor_32x32_c, aom_highbd_dc_top_predictor_32x32_c,
+    aom_highbd_dc_128_predictor_32x32_c, aom_highbd_v_predictor_32x32_c,
+    aom_highbd_h_predictor_32x32_c, aom_highbd_paeth_predictor_32x32_c,
+    aom_highbd_smooth_predictor_32x32_c, aom_highbd_smooth_v_predictor_32x32_c,
+    aom_highbd_smooth_h_predictor_32x32_c)
 
 HIGHBD_INTRA_PRED_TEST(
-    C_3, TestHighbdIntraPred16, "Hbd Intra16x32",
-    aom_highbd_dc_predictor_16x32_c, aom_highbd_dc_left_predictor_16x32_c,
-    aom_highbd_dc_top_predictor_16x32_c, aom_highbd_dc_128_predictor_16x32_c,
-    aom_highbd_v_predictor_16x32_c, aom_highbd_h_predictor_16x32_c,
-    aom_highbd_d45e_predictor_16x32_c, aom_highbd_d135_predictor_16x32_c,
-    aom_highbd_d117_predictor_16x32_c, aom_highbd_d153_predictor_16x32_c,
-    aom_highbd_d207e_predictor_16x32_c, aom_highbd_d63e_predictor_16x32_c,
-    aom_highbd_paeth_predictor_16x32_c, aom_highbd_smooth_predictor_16x32_c,
-    smooth_v_pred_func, smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
+    C_2, TX_32X16, aom_highbd_dc_predictor_32x16_c,
+    aom_highbd_dc_left_predictor_32x16_c, aom_highbd_dc_top_predictor_32x16_c,
+    aom_highbd_dc_128_predictor_32x16_c, aom_highbd_v_predictor_32x16_c,
+    aom_highbd_h_predictor_32x16_c, aom_highbd_paeth_predictor_32x16_c,
+    aom_highbd_smooth_predictor_32x16_c, aom_highbd_smooth_v_predictor_32x16_c,
+    aom_highbd_smooth_h_predictor_32x16_c)
 
-// -----------------------------------------------------------------------------
-// 32x32
-
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_highbd_smooth_v_predictor_32x32_c
-#define smooth_h_pred_func aom_highbd_smooth_h_predictor_32x32_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+HIGHBD_INTRA_PRED_TEST(
+    C_3, TX_32X64, aom_highbd_dc_predictor_32x64_c,
+    aom_highbd_dc_left_predictor_32x64_c, aom_highbd_dc_top_predictor_32x64_c,
+    aom_highbd_dc_128_predictor_32x64_c, aom_highbd_v_predictor_32x64_c,
+    aom_highbd_h_predictor_32x64_c, aom_highbd_paeth_predictor_32x64_c,
+    aom_highbd_smooth_predictor_32x64_c, aom_highbd_smooth_v_predictor_32x64_c,
+    aom_highbd_smooth_h_predictor_32x64_c)
 
 HIGHBD_INTRA_PRED_TEST(
-    C_1, TestHighbdIntraPred32, "Hbd Intra32x32",
-    aom_highbd_dc_predictor_32x32_c, aom_highbd_dc_left_predictor_32x32_c,
-    aom_highbd_dc_top_predictor_32x32_c, aom_highbd_dc_128_predictor_32x32_c,
-    aom_highbd_v_predictor_32x32_c, aom_highbd_h_predictor_32x32_c,
-    aom_highbd_d45e_predictor_32x32_c, aom_highbd_d135_predictor_32x32_c,
-    aom_highbd_d117_predictor_32x32_c, aom_highbd_d153_predictor_32x32_c,
-    aom_highbd_d207e_predictor_32x32_c, aom_highbd_d63e_predictor_32x32_c,
-    aom_highbd_paeth_predictor_32x32_c, aom_highbd_smooth_predictor_32x32_c,
-    smooth_v_pred_func, smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
+    C_4, TX_32X8, aom_highbd_dc_predictor_32x8_c,
+    aom_highbd_dc_left_predictor_32x8_c, aom_highbd_dc_top_predictor_32x8_c,
+    aom_highbd_dc_128_predictor_32x8_c, aom_highbd_v_predictor_32x8_c,
+    aom_highbd_h_predictor_32x8_c, aom_highbd_paeth_predictor_32x8_c,
+    aom_highbd_smooth_predictor_32x8_c, aom_highbd_smooth_v_predictor_32x8_c,
+    aom_highbd_smooth_h_predictor_32x8_c)
 
 #if HAVE_SSE2
-HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred32, "Hbd Intra32x32",
-                       aom_highbd_dc_predictor_32x32_sse2,
+HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_highbd_dc_predictor_32x32_sse2,
                        aom_highbd_dc_left_predictor_32x32_sse2,
                        aom_highbd_dc_top_predictor_32x32_sse2,
                        aom_highbd_dc_128_predictor_32x32_sse2,
                        aom_highbd_v_predictor_32x32_sse2,
                        aom_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred32, "Hbd Intra32x16",
-                       aom_highbd_dc_predictor_32x16_sse2,
+                       NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_highbd_dc_predictor_32x16_sse2,
                        aom_highbd_dc_left_predictor_32x16_sse2,
                        aom_highbd_dc_top_predictor_32x16_sse2,
                        aom_highbd_dc_128_predictor_32x16_sse2,
                        aom_highbd_v_predictor_32x16_sse2,
                        aom_highbd_h_predictor_32x16_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+                       NULL)
 #endif
 
 #if HAVE_SSSE3
-HIGHBD_INTRA_PRED_TEST(SSSE3_1, TestHighbdIntraPred32, "Hbd Intra32x32", NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL,
-                       aom_highbd_d135_predictor_32x32_ssse3,
-                       aom_highbd_d117_predictor_32x32_ssse3,
-                       aom_highbd_d153_predictor_32x32_ssse3, NULL, NULL, NULL,
-                       NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL)
 #endif
 
 #if HAVE_AVX2
-HIGHBD_INTRA_PRED_TEST(AVX2_1, TestHighbdIntraPred32, "Hbd Intra32x32", NULL,
-                       NULL, NULL, NULL, NULL, NULL,
-                       aom_highbd_d45e_predictor_32x32_avx2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
-
-HIGHBD_INTRA_PRED_TEST(AVX2_2, TestHighbdIntraPred32, "Hbd Intra32x16", NULL,
-                       NULL, NULL, NULL, NULL, NULL,
-                       aom_highbd_d45e_predictor_32x16_avx2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(AVX2_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(AVX2_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL)
 #endif
 
-#if CONFIG_SMOOTH_HV
-#define smooth_v_pred_func aom_highbd_smooth_v_predictor_32x16_c
-#define smooth_h_pred_func aom_highbd_smooth_h_predictor_32x16_c
-#else
-#define smooth_v_pred_func NULL
-#define smooth_h_pred_func NULL
-#endif  // CONFIG_SMOOTH_HV
+// -----------------------------------------------------------------------------
+// 64x64, 64x32, 64x16
+
+HIGHBD_INTRA_PRED_TEST(
+    C_1, TX_64X64, aom_highbd_dc_predictor_64x64_c,
+    aom_highbd_dc_left_predictor_64x64_c, aom_highbd_dc_top_predictor_64x64_c,
+    aom_highbd_dc_128_predictor_64x64_c, aom_highbd_v_predictor_64x64_c,
+    aom_highbd_h_predictor_64x64_c, aom_highbd_paeth_predictor_64x64_c,
+    aom_highbd_smooth_predictor_64x64_c, aom_highbd_smooth_v_predictor_64x64_c,
+    aom_highbd_smooth_h_predictor_64x64_c)
 
 HIGHBD_INTRA_PRED_TEST(
-    C_2, TestHighbdIntraPred32, "Hbd Intra32x16",
-    aom_highbd_dc_predictor_32x16_c, aom_highbd_dc_left_predictor_32x16_c,
-    aom_highbd_dc_top_predictor_32x16_c, aom_highbd_dc_128_predictor_32x16_c,
-    aom_highbd_v_predictor_32x16_c, aom_highbd_h_predictor_32x16_c,
-    aom_highbd_d45e_predictor_32x16_c, aom_highbd_d135_predictor_32x16_c,
-    aom_highbd_d117_predictor_32x16_c, aom_highbd_d153_predictor_32x16_c,
-    aom_highbd_d207e_predictor_32x16_c, aom_highbd_d63e_predictor_32x16_c,
-    aom_highbd_paeth_predictor_32x16_c, aom_highbd_smooth_predictor_32x16_c,
-    smooth_v_pred_func, smooth_h_pred_func)
-#undef smooth_v_pred_func
-#undef smooth_h_pred_func
-#endif  // CONFIG_HIGHBITDEPTH
+    C_2, TX_64X32, aom_highbd_dc_predictor_64x32_c,
+    aom_highbd_dc_left_predictor_64x32_c, aom_highbd_dc_top_predictor_64x32_c,
+    aom_highbd_dc_128_predictor_64x32_c, aom_highbd_v_predictor_64x32_c,
+    aom_highbd_h_predictor_64x32_c, aom_highbd_paeth_predictor_64x32_c,
+    aom_highbd_smooth_predictor_64x32_c, aom_highbd_smooth_v_predictor_64x32_c,
+    aom_highbd_smooth_h_predictor_64x32_c)
+
+HIGHBD_INTRA_PRED_TEST(
+    C_3, TX_64X16, aom_highbd_dc_predictor_64x16_c,
+    aom_highbd_dc_left_predictor_64x16_c, aom_highbd_dc_top_predictor_64x16_c,
+    aom_highbd_dc_128_predictor_64x16_c, aom_highbd_v_predictor_64x16_c,
+    aom_highbd_h_predictor_64x16_c, aom_highbd_paeth_predictor_64x16_c,
+    aom_highbd_smooth_predictor_64x16_c, aom_highbd_smooth_v_predictor_64x16_c,
+    aom_highbd_smooth_h_predictor_64x16_c)
+
+// -----------------------------------------------------------------------------
 
 #include "test/test_libaom.cc"
diff --git a/third_party/aom/test/test_libaom.cc b/third_party/aom/test/test_libaom.cc
index 6d83ce66e..b55d76237 100644
--- a/third_party/aom/test/test_libaom.cc
+++ b/third_party/aom/test/test_libaom.cc
@@ -7,20 +7,21 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
+
+#include <string.h>
 
 #include <string>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #if ARCH_X86 || ARCH_X86_64
 #include "aom_ports/x86.h"
 #endif
 extern "C" {
-#if CONFIG_AV1
 extern void av1_rtcd();
-#endif  // CONFIG_AV1
 extern void aom_dsp_rtcd();
 extern void aom_scale_rtcd();
 }
@@ -30,7 +31,17 @@ static void append_negative_gtest_filter(const char *str) {
   std::string filter = ::testing::FLAGS_gtest_filter;
   // Negative patterns begin with one '-' followed by a ':' separated list.
   if (filter.find('-') == std::string::npos) filter += '-';
-  filter += str;
+  // OPT.* matches TEST() functions
+  // OPT/* matches TEST_P() functions
+  // OPT_* matches tests which have been manually sharded.
+  // We do not match OPT* because of SSE/SSE2 collisions.
+  const char *search_terminators = "./_";
+  for (size_t pos = 0; pos < strlen(search_terminators); ++pos) {
+    filter += ":";
+    filter += str;
+    filter += search_terminators[pos];
+    filter += "*";
+  }
   ::testing::FLAGS_gtest_filter = filter;
 }
 #endif  // ARCH_X86 || ARCH_X86_64
@@ -40,25 +51,21 @@ int main(int argc, char **argv) {
 
 #if ARCH_X86 || ARCH_X86_64
   const int simd_caps = x86_simd_caps();
-  if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*");
-  if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*");
-  if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*");
-  if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*");
-  if (!(simd_caps & HAS_SSSE3))
-    append_negative_gtest_filter(":SSSE3.*:SSSE3/*");
-  if (!(simd_caps & HAS_SSE4_1))
-    append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*");
-  if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*");
-  if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*");
+  if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter("MMX");
+  if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter("SSE");
+  if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter("SSE2");
+  if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter("SSE3");
+  if (!(simd_caps & HAS_SSSE3)) append_negative_gtest_filter("SSSE3");
+  if (!(simd_caps & HAS_SSE4_1)) append_negative_gtest_filter("SSE4_1");
+  if (!(simd_caps & HAS_SSE4_2)) append_negative_gtest_filter("SSE4_2");
+  if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter("AVX");
+  if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter("AVX2");
 #endif  // ARCH_X86 || ARCH_X86_64
 
+// Shared library builds don't support whitebox tests that exercise internal
+// symbols.
 #if !CONFIG_SHARED
-// Shared library builds don't support whitebox tests
-// that exercise internal symbols.
-
-#if CONFIG_AV1
   av1_rtcd();
-#endif  // CONFIG_AV1
   aom_dsp_rtcd();
   aom_scale_rtcd();
 #endif  // !CONFIG_SHARED
diff --git a/third_party/aom/test/test_runner.cmake b/third_party/aom/test/test_runner.cmake
index a1f399642..d3747b1e3 100644
--- a/third_party/aom/test/test_runner.cmake
+++ b/third_party/aom/test/test_runner.cmake
@@ -1,22 +1,28 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT GTEST_TOTAL_SHARDS OR "${GTEST_SHARD_INDEX}" STREQUAL ""
-    OR NOT TEST_LIBAOM)
-  message(FATAL_ERROR
-          "The variables GTEST_SHARD_INDEX, GTEST_TOTAL_SHARDS and TEST_LIBAOM
-          must be defined.")
-endif ()
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(NOT GTEST_TOTAL_SHARDS OR "${GTEST_SHARD_INDEX}" STREQUAL "" OR NOT
+   TEST_LIBAOM)
+  message(
+    FATAL_ERROR
+      "The variables GTEST_SHARD_INDEX, GTEST_TOTAL_SHARDS and TEST_LIBAOM
+          must be defined."
+    )
+endif()
 
 set($ENV{GTEST_SHARD_INDEX} ${GTEST_SHARD_INDEX})
 set($ENV{GTEST_TOTAL_SHARDS} ${GTEST_TOTAL_SHARDS})
 execute_process(COMMAND ${TEST_LIBAOM} RESULT_VARIABLE test_result)
 set(test_message "Test shard ${GTEST_SHARD_INDEX}/${GTEST_TOTAL_SHARDS} result")
 message("${test_message}: ${test_result}")
+
+if(NOT "${test_result}" STREQUAL "0")
+  message(FATAL_ERROR "${test_message}: FAILED, non-zero exit code.")
+endif()
diff --git a/third_party/aom/test/test_vector_test.cc b/third_party/aom/test/test_vector_test.cc
new file mode 100644
index 000000000..85223177c
--- /dev/null
+++ b/third_party/aom/test/test_vector_test.cc
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <set>
+#include <string>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "common/tools_common.h"
+#include "config/aom_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+const int kThreads = 0;
+const int kFileName = 1;
+
+typedef ::testing::tuple<int, const char *> DecodeParam;
+
+class TestVectorTest : public ::libaom_test::DecoderTest,
+                       public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+  TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {}
+
+  virtual ~TestVectorTest() {
+    if (md5_file_) fclose(md5_file_);
+  }
+
+  void OpenMD5File(const std::string &md5_file_name_) {
+    md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_ != NULL)
+        << "Md5 file open failed. Filename: " << md5_file_name_;
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     const unsigned int frame_number) {
+    ASSERT_TRUE(md5_file_ != NULL);
+    char expected_md5[33];
+    char junk[128];
+
+    // Read correct md5 checksums.
+    const int res = fscanf(md5_file_, "%s  %s", expected_md5, junk);
+    ASSERT_NE(res, EOF) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libaom_test::MD5 md5_res;
+    md5_res.Add(&img);
+    const char *actual_md5 = md5_res.Get();
+
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+  }
+
+ private:
+  FILE *md5_file_;
+};
+
+// This test runs through the whole set of test vectors, and decodes them.
+// The md5 checksums are computed for each frame in the video file. If md5
+// checksums match the correct md5 data, then the test is passed. Otherwise,
+// the test failed.
+TEST_P(TestVectorTest, MD5Match) {
+  const DecodeParam input = GET_PARAM(1);
+  const std::string filename = ::testing::get<kFileName>(input);
+  aom_codec_flags_t flags = 0;
+  aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+  char str[256];
+
+  cfg.threads = ::testing::get<kThreads>(input);
+
+  snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d",
+           filename.c_str(), cfg.threads);
+  SCOPED_TRACE(str);
+
+  // Open compressed video file.
+  testing::internal::scoped_ptr<libaom_test::CompressedVideoSource> video;
+  if (filename.substr(filename.length() - 3, 3) == "ivf") {
+    video.reset(new libaom_test::IVFVideoSource(filename));
+  } else if (filename.substr(filename.length() - 4, 4) == "webm") {
+#if CONFIG_WEBM_IO
+    video.reset(new libaom_test::WebMVideoSource(filename));
+#else
+    fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+            filename.c_str());
+    return;
+#endif
+  }
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+
+  // Construct md5 file name.
+  const std::string md5_filename = filename + ".md5";
+  OpenMD5File(md5_filename);
+
+  // Set decode config and flags.
+  cfg.allow_lowbitdepth = CONFIG_LOWBITDEPTH;
+  set_cfg(cfg);
+  set_flags(flags);
+
+  // Decode frame, and check the md5 matching.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
+}
+
+// TODO(yaowu): Current md5 check works only when CONFIG_LOWBITDEPTH is enabled,
+// remove CONFIG_LOWBITDEPTH when md5 check is reworked to be compatible with
+// CONFIG_LOWBITDEPTH = 0
+#if CONFIG_AV1_DECODER && CONFIG_LOWBITDEPTH
+AV1_INSTANTIATE_TEST_CASE(
+    TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(1),  // Single thread.
+        ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+                            libaom_test::kAV1TestVectors +
+                                libaom_test::kNumAV1TestVectors)));
+
+// Test AV1 decode in with different numbers of threads.
+INSTANTIATE_TEST_CASE_P(
+    AV1MultiThreaded, TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
+        ::testing::Combine(
+            ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+            ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+                                libaom_test::kAV1TestVectors +
+                                    libaom_test::kNumAV1TestVectors))));
+
+#endif  // CONFIG_AV1_DECODER
+
+}  // namespace
diff --git a/third_party/aom/test/test_vectors.cc b/third_party/aom/test/test_vectors.cc
new file mode 100644
index 000000000..a9edf7520
--- /dev/null
+++ b/third_party/aom/test/test_vectors.cc
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/test_vectors.h"
+
+namespace libaom_test {
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+#if CONFIG_AV1_DECODER
+const char *const kAV1TestVectors[] = {
+  "av1-1-b8-00-quantizer-00.ivf", "av1-1-b8-00-quantizer-01.ivf",
+  "av1-1-b8-00-quantizer-02.ivf", "av1-1-b8-00-quantizer-03.ivf",
+  "av1-1-b8-00-quantizer-04.ivf", "av1-1-b8-00-quantizer-05.ivf",
+  "av1-1-b8-00-quantizer-06.ivf", "av1-1-b8-00-quantizer-07.ivf",
+  "av1-1-b8-00-quantizer-08.ivf", "av1-1-b8-00-quantizer-09.ivf",
+  "av1-1-b8-00-quantizer-10.ivf", "av1-1-b8-00-quantizer-11.ivf",
+  "av1-1-b8-00-quantizer-12.ivf", "av1-1-b8-00-quantizer-13.ivf",
+  "av1-1-b8-00-quantizer-14.ivf", "av1-1-b8-00-quantizer-15.ivf",
+  "av1-1-b8-00-quantizer-16.ivf", "av1-1-b8-00-quantizer-17.ivf",
+  "av1-1-b8-00-quantizer-18.ivf", "av1-1-b8-00-quantizer-19.ivf",
+  "av1-1-b8-00-quantizer-20.ivf", "av1-1-b8-00-quantizer-21.ivf",
+  "av1-1-b8-00-quantizer-22.ivf", "av1-1-b8-00-quantizer-23.ivf",
+  "av1-1-b8-00-quantizer-24.ivf", "av1-1-b8-00-quantizer-25.ivf",
+  "av1-1-b8-00-quantizer-26.ivf", "av1-1-b8-00-quantizer-27.ivf",
+  "av1-1-b8-00-quantizer-28.ivf", "av1-1-b8-00-quantizer-29.ivf",
+  "av1-1-b8-00-quantizer-30.ivf", "av1-1-b8-00-quantizer-31.ivf",
+  "av1-1-b8-00-quantizer-32.ivf", "av1-1-b8-00-quantizer-33.ivf",
+  "av1-1-b8-00-quantizer-34.ivf", "av1-1-b8-00-quantizer-35.ivf",
+  "av1-1-b8-00-quantizer-36.ivf", "av1-1-b8-00-quantizer-37.ivf",
+  "av1-1-b8-00-quantizer-38.ivf", "av1-1-b8-00-quantizer-39.ivf",
+  "av1-1-b8-00-quantizer-40.ivf", "av1-1-b8-00-quantizer-41.ivf",
+  "av1-1-b8-00-quantizer-42.ivf", "av1-1-b8-00-quantizer-43.ivf",
+  "av1-1-b8-00-quantizer-44.ivf", "av1-1-b8-00-quantizer-45.ivf",
+  "av1-1-b8-00-quantizer-46.ivf", "av1-1-b8-00-quantizer-47.ivf",
+  "av1-1-b8-00-quantizer-48.ivf", "av1-1-b8-00-quantizer-49.ivf",
+  "av1-1-b8-00-quantizer-50.ivf", "av1-1-b8-00-quantizer-51.ivf",
+  "av1-1-b8-00-quantizer-52.ivf", "av1-1-b8-00-quantizer-53.ivf",
+  "av1-1-b8-00-quantizer-54.ivf", "av1-1-b8-00-quantizer-55.ivf",
+  "av1-1-b8-00-quantizer-56.ivf", "av1-1-b8-00-quantizer-57.ivf",
+  "av1-1-b8-00-quantizer-58.ivf", "av1-1-b8-00-quantizer-59.ivf",
+  "av1-1-b8-00-quantizer-60.ivf", "av1-1-b8-00-quantizer-61.ivf",
+  "av1-1-b8-00-quantizer-62.ivf", "av1-1-b8-00-quantizer-63.ivf",
+  "av1-1-b8-01-size-16x16.ivf",   "av1-1-b8-01-size-16x18.ivf",
+  "av1-1-b8-01-size-16x32.ivf",   "av1-1-b8-01-size-16x34.ivf",
+  "av1-1-b8-01-size-16x64.ivf",   "av1-1-b8-01-size-16x66.ivf",
+  "av1-1-b8-01-size-18x16.ivf",   "av1-1-b8-01-size-18x18.ivf",
+  "av1-1-b8-01-size-18x32.ivf",   "av1-1-b8-01-size-18x34.ivf",
+  "av1-1-b8-01-size-18x64.ivf",   "av1-1-b8-01-size-18x66.ivf",
+  "av1-1-b8-01-size-196x196.ivf", "av1-1-b8-01-size-196x198.ivf",
+  "av1-1-b8-01-size-196x200.ivf", "av1-1-b8-01-size-196x202.ivf",
+  "av1-1-b8-01-size-196x208.ivf", "av1-1-b8-01-size-196x210.ivf",
+  "av1-1-b8-01-size-196x224.ivf", "av1-1-b8-01-size-196x226.ivf",
+  "av1-1-b8-01-size-198x196.ivf", "av1-1-b8-01-size-198x198.ivf",
+  "av1-1-b8-01-size-198x200.ivf", "av1-1-b8-01-size-198x202.ivf",
+  "av1-1-b8-01-size-198x208.ivf", "av1-1-b8-01-size-198x210.ivf",
+  "av1-1-b8-01-size-198x224.ivf", "av1-1-b8-01-size-198x226.ivf",
+  "av1-1-b8-01-size-200x196.ivf", "av1-1-b8-01-size-200x198.ivf",
+  "av1-1-b8-01-size-200x200.ivf", "av1-1-b8-01-size-200x202.ivf",
+  "av1-1-b8-01-size-200x208.ivf", "av1-1-b8-01-size-200x210.ivf",
+  "av1-1-b8-01-size-200x224.ivf", "av1-1-b8-01-size-200x226.ivf",
+  "av1-1-b8-01-size-202x196.ivf", "av1-1-b8-01-size-202x198.ivf",
+  "av1-1-b8-01-size-202x200.ivf", "av1-1-b8-01-size-202x202.ivf",
+  "av1-1-b8-01-size-202x208.ivf", "av1-1-b8-01-size-202x210.ivf",
+  "av1-1-b8-01-size-202x224.ivf", "av1-1-b8-01-size-202x226.ivf",
+  "av1-1-b8-01-size-208x196.ivf", "av1-1-b8-01-size-208x198.ivf",
+  "av1-1-b8-01-size-208x200.ivf", "av1-1-b8-01-size-208x202.ivf",
+  "av1-1-b8-01-size-208x208.ivf", "av1-1-b8-01-size-208x210.ivf",
+  "av1-1-b8-01-size-208x224.ivf", "av1-1-b8-01-size-208x226.ivf",
+  "av1-1-b8-01-size-210x196.ivf", "av1-1-b8-01-size-210x198.ivf",
+  "av1-1-b8-01-size-210x200.ivf", "av1-1-b8-01-size-210x202.ivf",
+  "av1-1-b8-01-size-210x208.ivf", "av1-1-b8-01-size-210x210.ivf",
+  "av1-1-b8-01-size-210x224.ivf", "av1-1-b8-01-size-210x226.ivf",
+  "av1-1-b8-01-size-224x196.ivf", "av1-1-b8-01-size-224x198.ivf",
+  "av1-1-b8-01-size-224x200.ivf", "av1-1-b8-01-size-224x202.ivf",
+  "av1-1-b8-01-size-224x208.ivf", "av1-1-b8-01-size-224x210.ivf",
+  "av1-1-b8-01-size-224x224.ivf", "av1-1-b8-01-size-224x226.ivf",
+  "av1-1-b8-01-size-226x196.ivf", "av1-1-b8-01-size-226x198.ivf",
+  "av1-1-b8-01-size-226x200.ivf", "av1-1-b8-01-size-226x202.ivf",
+  "av1-1-b8-01-size-226x208.ivf", "av1-1-b8-01-size-226x210.ivf",
+  "av1-1-b8-01-size-226x224.ivf", "av1-1-b8-01-size-226x226.ivf",
+  "av1-1-b8-01-size-32x16.ivf",   "av1-1-b8-01-size-32x18.ivf",
+  "av1-1-b8-01-size-32x32.ivf",   "av1-1-b8-01-size-32x34.ivf",
+  "av1-1-b8-01-size-32x64.ivf",   "av1-1-b8-01-size-32x66.ivf",
+  "av1-1-b8-01-size-34x16.ivf",   "av1-1-b8-01-size-34x18.ivf",
+  "av1-1-b8-01-size-34x32.ivf",   "av1-1-b8-01-size-34x34.ivf",
+  "av1-1-b8-01-size-34x64.ivf",   "av1-1-b8-01-size-34x66.ivf",
+  "av1-1-b8-01-size-64x16.ivf",   "av1-1-b8-01-size-64x18.ivf",
+  "av1-1-b8-01-size-64x32.ivf",   "av1-1-b8-01-size-64x34.ivf",
+  "av1-1-b8-01-size-64x64.ivf",   "av1-1-b8-01-size-64x66.ivf",
+  "av1-1-b8-01-size-66x16.ivf",   "av1-1-b8-01-size-66x18.ivf",
+  "av1-1-b8-01-size-66x32.ivf",   "av1-1-b8-01-size-66x34.ivf",
+  "av1-1-b8-01-size-66x64.ivf",   "av1-1-b8-01-size-66x66.ivf",
+};
+const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors);
+#endif  // CONFIG_AV1_DECODER
+
+}  // namespace libaom_test
diff --git a/third_party/aom/av1/common/clpf_sse4.c b/third_party/aom/test/test_vectors.h
index 537139f17..229f063a6 100644
--- a/third_party/aom/av1/common/clpf_sse4.c
+++ b/third_party/aom/test/test_vectors.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -9,6 +9,18 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_sse4_1
-#include "./clpf_simd.h"
+#ifndef TEST_TEST_VECTORS_H_
+#define TEST_TEST_VECTORS_H_
+
+#include "config/aom_config.h"
+
+namespace libaom_test {
+
+#if CONFIG_AV1_DECODER
+extern const int kNumAV1TestVectors;
+extern const char *const kAV1TestVectors[];
+#endif
+
+}  // namespace libaom_test
+
+#endif  // TEST_TEST_VECTORS_H_
diff --git a/third_party/aom/test/tile_independence_test.cc b/third_party/aom/test/tile_independence_test.cc
index 832227fb8..e8b2e1fe4 100644
--- a/third_party/aom/test/tile_independence_test.cc
+++ b/third_party/aom/test/tile_independence_test.cc
@@ -7,7 +7,7 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <cstdio>
 #include <cstdlib>
@@ -22,30 +22,29 @@
 
 namespace {
 class TileIndependenceTest
-    : public ::libaom_test::CodecTestWith2Params<int, int>,
+    : public ::libaom_test::CodecTestWith3Params<int, int, int>,
       public ::libaom_test::EncoderTest {
  protected:
   TileIndependenceTest()
       : EncoderTest(GET_PARAM(0)), md5_fw_order_(), md5_inv_order_(),
-        n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)) {
+        n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)),
+        n_tile_groups_(GET_PARAM(3)) {
     init_flags_ = AOM_CODEC_USE_PSNR;
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
     cfg.w = 704;
-    cfg.h = 144;
+    cfg.h = 576;
     cfg.threads = 1;
     cfg.allow_lowbitdepth = 1;
     fw_dec_ = codec_->CreateDecoder(cfg, 0);
     inv_dec_ = codec_->CreateDecoder(cfg, 0);
     inv_dec_->Control(AV1_INVERT_TILE_DECODE_ORDER, 1);
 
-#if CONFIG_AV1
     if (fw_dec_->IsAV1() && inv_dec_->IsAV1()) {
       fw_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
       fw_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
       inv_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
       inv_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
     }
-#endif
   }
 
   virtual ~TileIndependenceTest() {
@@ -63,10 +62,9 @@ class TileIndependenceTest
     if (video->frame() == 1) {
       encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
       encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
-#if CONFIG_LOOPFILTERING_ACROSS_TILES
-      encoder->Control(AV1E_SET_TILE_LOOPFILTER, 0);
-#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
       SetCpuUsed(encoder);
+    } else if (video->frame() == 3) {
+      encoder->Control(AV1E_SET_NUM_TG, n_tile_groups_);
     }
   }
 
@@ -114,15 +112,16 @@ class TileIndependenceTest
  private:
   int n_tile_cols_;
   int n_tile_rows_;
+  int n_tile_groups_;
 };
 
 // run an encode with 2 or 4 tiles, and do the decode both in normal and
 // inverted tile ordering. Ensure that the MD5 of the output in both cases
 // is identical. If so, tiles are considered independent and the test passes.
 TEST_P(TileIndependenceTest, MD5Match) {
-#if CONFIG_EXT_TILE
   cfg_.large_scale_tile = 0;
-#endif  // CONFIG_EXT_TILE
+  fw_dec_->Control(AV1_SET_TILE_MODE, 0);
+  inv_dec_->Control(AV1_SET_TILE_MODE, 0);
   DoTest();
 }
 
@@ -134,36 +133,38 @@ class TileIndependenceTestLarge : public TileIndependenceTest {
 };
 
 TEST_P(TileIndependenceTestLarge, MD5Match) {
-#if CONFIG_EXT_TILE
   cfg_.large_scale_tile = 0;
-#endif  // CONFIG_EXT_TILE
+  fw_dec_->Control(AV1_SET_TILE_MODE, 0);
+  inv_dec_->Control(AV1_SET_TILE_MODE, 0);
   DoTest();
 }
 
 AV1_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(0, 1),
-                          ::testing::Values(0, 1));
+                          ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
 AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1),
-                          ::testing::Values(0, 1));
+                          ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
 
-#if CONFIG_EXT_TILE
 class TileIndependenceLSTest : public TileIndependenceTest {};
 
-TEST_P(TileIndependenceLSTest, MD5Match) {
+TEST_P(TileIndependenceLSTest, DISABLED_MD5Match) {
   cfg_.large_scale_tile = 1;
+  fw_dec_->Control(AV1_SET_TILE_MODE, 1);
+  inv_dec_->Control(AV1_SET_TILE_MODE, 1);
   DoTest();
 }
 
 class TileIndependenceLSTestLarge : public TileIndependenceTestLarge {};
 
-TEST_P(TileIndependenceLSTestLarge, MD5Match) {
+TEST_P(TileIndependenceLSTestLarge, DISABLED_MD5Match) {
   cfg_.large_scale_tile = 1;
+  fw_dec_->Control(AV1_SET_TILE_MODE, 1);
+  inv_dec_->Control(AV1_SET_TILE_MODE, 1);
   DoTest();
 }
 
 AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTest, ::testing::Values(1, 2, 32),
-                          ::testing::Values(1, 2, 32));
+                          ::testing::Values(1, 2, 32), ::testing::Values(1));
 AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTestLarge,
                           ::testing::Values(1, 2, 32),
-                          ::testing::Values(1, 2, 32));
-#endif  // CONFIG_EXT_TILE
+                          ::testing::Values(1, 2, 32), ::testing::Values(1));
 }  // namespace
diff --git a/third_party/aom/test/tools_common.sh b/third_party/aom/test/tools_common.sh
index c1262b6c8..21a6b9b8e 100755
--- a/third_party/aom/test/tools_common.sh
+++ b/third_party/aom/test/tools_common.sh
@@ -76,61 +76,46 @@ cleanup() {
   if [ -n "${AOM_TOOL_TEST}" ] && [ "${AOM_TOOL_TEST}" != '<unset>' ]; then
     echo "FAIL: $AOM_TOOL_TEST"
   fi
+  if [ "${AOM_TEST_PRESERVE_OUTPUT}" = "yes" ]; then
+    return
+  fi
   if [ -n "${AOM_TEST_OUTPUT_DIR}" ] && [ -d "${AOM_TEST_OUTPUT_DIR}" ]; then
     rm -rf "${AOM_TEST_OUTPUT_DIR}"
   fi
 }
 
-# Echoes the git hash portion of the VERSION_STRING variable defined in
-# $LIBAOM_CONFIG_PATH/config.mk to stdout, or the version number string when
-# no git hash is contained in VERSION_STRING.
-config_hash() {
-  aom_config_mk="${LIBAOM_CONFIG_PATH}/config.mk"
-  if [ ! -f "${aom_config_mk}" ]; then
-    aom_config_c="${LIBAOM_CONFIG_PATH}/aom_config.c"
-    # Clean up the aom_git_hash pointer line from aom_config.c.
-    # 1. Run grep on aom_config.c for aom_git_hash and limit results to 1.
-    # 2. Split the line using ' = "' as separator.
-    # 3. Abuse sed to consume the trailing "; from the assignment to the
-    #    aom_git_hash pointer.
-    awk -F ' = "' '/aom_git_hash/ { print $NF; exit }' "${aom_config_c}" \
-      | sed s/\"\;//
-    return
-  fi
+# Echoes the version string assigned to the VERSION_STRING_NOSP variable defined
+# in $LIBAOM_CONFIG_PATH/config/aom_version.h to stdout.
+cmake_version() {
+  aom_version_h="${LIBAOM_CONFIG_PATH}/config/aom_version.h"
 
-  # Find VERSION_STRING line, split it with "-g" and print the last field to
-  # output the git hash to stdout.
-  aom_version=$(awk -F -g '/VERSION_STRING/ {print $NF}' "${aom_config_mk}")
-  # Handle two situations here:
-  # 1. The default case: $aom_version is a git hash, so echo it unchanged.
-  # 2. When being run a non-dev tree, the -g portion is not present in the
-  #    version string: It's only the version number.
-  #    In this case $aom_version is something like 'VERSION_STRING=v1.3.0', so
-  #    we echo only what is after the '='.
-  echo "${aom_version##*=}"
+  # Find VERSION_STRING_NOSP line, split it with '"' and print the next to last
+  # field to output the version string to stdout.
+  aom_version=$(awk -F \" '/VERSION_STRING_NOSP/ {print $(NF-1)}' \
+    "${aom_version_h}")
+  echo "v${aom_version}"
 }
 
-# Echoes the short form of the current git hash.
-current_hash() {
+# Echoes current git version as reported by running 'git describe', or the
+# version used by the cmake build when git is unavailable.
+source_version() {
   if git --version > /dev/null 2>&1; then
     (cd "$(dirname "${0}")"
-    git rev-parse HEAD)
+    git describe)
   else
-    # Return the config hash if git is unavailable: Fail silently, git hashes
-    # are used only for warnings.
-    config_hash
+    cmake_version
   fi
 }
 
-# Echoes warnings to stdout when git hash in aom_config.h does not match the
-# current git hash.
-check_git_hashes() {
-  hash_at_configure_time=$(config_hash)
-  hash_now=$(current_hash)
+# Echoes warnings to stdout when source version and CMake build generated
+# version are out of sync.
+check_version_strings() {
+  cmake_version=$(cmake_version)
+  source_version=$(source_version)
 
-  if [ "${hash_at_configure_time}" != "${hash_now}" ]; then
-    echo "Warning: git hash has changed since last configure."
-    vlog "  config hash: ${hash_at_configure_time} hash now: ${hash_now}"
+  if [ "${cmake_version}" != "${source_version}" ]; then
+    echo "Warning: version has changed since last cmake run."
+    vlog "  cmake version: ${cmake_version} version now: ${source_version}"
   fi
 }
 
@@ -159,7 +144,7 @@ verify_aom_test_environment() {
 # is available.
 aom_config_option_enabled() {
   aom_config_option="${1}"
-  aom_config_file="${LIBAOM_CONFIG_PATH}/aom_config.h"
+  aom_config_file="${LIBAOM_CONFIG_PATH}/config/aom_config.h"
   config_line=$(grep "${aom_config_option}" "${aom_config_file}")
   if echo "${config_line}" | egrep -q '1$'; then
     echo yes
@@ -174,22 +159,29 @@ is_windows_target() {
   fi
 }
 
-# Echoes path to $1 when it's executable and exists in ${LIBAOM_BIN_PATH}, or an
-# empty string. Caller is responsible for testing the string once the function
-# returns.
+# Echoes path to $1 when it's executable and exists in one of the directories
+# included in $tool_paths, or an empty string. Caller is responsible for testing
+# the string once the function returns.
 aom_tool_path() {
   local readonly tool_name="$1"
-  local tool_path="${LIBAOM_BIN_PATH}/${tool_name}${AOM_TEST_EXE_SUFFIX}"
-  if [ ! -x "${tool_path}" ]; then
-    # Try one directory up: when running via examples.sh the tool could be in
-    # the parent directory of $LIBAOM_BIN_PATH.
-    tool_path="${LIBAOM_BIN_PATH}/../${tool_name}${AOM_TEST_EXE_SUFFIX}"
-  fi
+  local readonly root_path="${LIBAOM_BIN_PATH}"
+  local readonly suffix="${AOM_TEST_EXE_SUFFIX}"
+  local readonly tool_paths="\
+    ${root_path}/${tool_name}${suffix} \
+    ${root_path}/../${tool_name}${suffix} \
+    ${root_path}/tools/${tool_name}${suffix} \
+    ${root_path}/../tools/${tool_name}${suffix}"
+
+  local toolpath=""
+
+  for tool_path in ${tool_paths}; do
+    if [ -x "${tool_path}" ] && [ -f "${tool_path}" ]; then
+      echo "${tool_path}"
+      return 0
+    fi
+  done
 
-  if [ ! -x "${tool_path}" ]; then
-    tool_path=""
-  fi
-  echo "${tool_path}"
+  return 1
 }
 
 # Echoes yes to stdout when the file named by positional parameter one exists
@@ -212,6 +204,14 @@ av1_encode_available() {
   [ "$(aom_config_option_enabled CONFIG_AV1_ENCODER)" = "yes" ] && echo yes
 }
 
+# Echoes "fast" encode params for use with aomenc.
+aomenc_encode_test_fast_params() {
+  echo "--cpu-used=1
+        --limit=${AV1_ENCODE_TEST_FRAME_LIMIT}
+        --lag-in-frames=0
+        --test-decode=fatal"
+}
+
 # Echoes yes to stdout when aom_config_option_enabled() reports yes for
 # CONFIG_WEBM_IO.
 webm_io_available() {
@@ -285,7 +285,7 @@ run_tests() {
   # Combine environment and actual tests.
   local tests_to_run="${env_tests} ${tests_to_filter}"
 
-  check_git_hashes
+  check_version_strings
 
   # Run tests.
   for test in ${tests_to_run}; do
@@ -296,7 +296,7 @@ run_tests() {
     test_end "${test}"
   done
 
-  local tested_config="$(test_configuration_target) @ $(current_hash)"
+  local tested_config="$(test_configuration_target) @ $(source_version)"
   echo "${test_name}: Done, all tests pass for ${tested_config}."
 }
 
@@ -352,10 +352,9 @@ encode_yuv_raw_input_av1() {
     local readonly encoder="$(aom_tool_path aomenc)"
     shift
     eval "${encoder}" $(yuv_raw_input) \
-      --codec=av1 \
-      $@ \
-      --limit=5 \
+      $(aomenc_encode_test_fast_params) \
       --output="${output}" \
+      $@ \
       ${devnull}
 
     if [ ! -e "${output}" ]; then
@@ -427,7 +426,7 @@ else
   AOM_TEST_TEMP_ROOT=/tmp
 fi
 
-AOM_TEST_OUTPUT_DIR="${AOM_TEST_TEMP_ROOT}/aom_test_$$"
+AOM_TEST_OUTPUT_DIR="${AOM_TEST_OUTPUT_DIR:-${AOM_TEST_TEMP_ROOT}/aom_test_$$}"
 
 if ! mkdir -p "${AOM_TEST_OUTPUT_DIR}" || \
    [ ! -d "${AOM_TEST_OUTPUT_DIR}" ]; then
@@ -436,17 +435,19 @@ if ! mkdir -p "${AOM_TEST_OUTPUT_DIR}" || \
   exit 1
 fi
 
+AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT:-no}
+
 if [ "$(is_windows_target)" = "yes" ]; then
   AOM_TEST_EXE_SUFFIX=".exe"
 fi
 
 # Variables shared by tests.
-VP8_IVF_FILE="${LIBAOM_TEST_DATA_PATH}/vp80-00-comprehensive-001.ivf"
-AV1_IVF_FILE="${LIBAOM_TEST_DATA_PATH}/vp90-2-09-subpixel-00.ivf"
-
-AV1_WEBM_FILE="${LIBAOM_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm"
-AV1_FPM_WEBM_FILE="${LIBAOM_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm"
-AV1_LT_50_FRAMES_WEBM_FILE="${LIBAOM_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm"
+AV1_ENCODE_CPU_USED=${AV1_ENCODE_CPU_USED:-1}
+AV1_ENCODE_TEST_FRAME_LIMIT=${AV1_ENCODE_TEST_FRAME_LIMIT:-5}
+AV1_IVF_FILE="${AV1_IVF_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.ivf}"
+AV1_OBU_ANNEXB_FILE="${AV1_OBU_ANNEXB_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.annexb.obu}"
+AV1_OBU_SEC5_FILE="${AV1_OBU_SEC5_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.section5.obu}"
+AV1_WEBM_FILE="${AV1_WEBM_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.webm}"
 
 YUV_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
 YUV_RAW_INPUT_WIDTH=352
@@ -462,18 +463,22 @@ vlog "$(basename "${0%.*}") test configuration:
   LIBAOM_BIN_PATH=${LIBAOM_BIN_PATH}
   LIBAOM_CONFIG_PATH=${LIBAOM_CONFIG_PATH}
   LIBAOM_TEST_DATA_PATH=${LIBAOM_TEST_DATA_PATH}
-  AOM_IVF_FILE=${AOM_IVF_FILE}
-  AV1_IVF_FILE=${AV1_IVF_FILE}
-  AV1_WEBM_FILE=${AV1_WEBM_FILE}
   AOM_TEST_EXE_SUFFIX=${AOM_TEST_EXE_SUFFIX}
   AOM_TEST_FILTER=${AOM_TEST_FILTER}
   AOM_TEST_LIST_TESTS=${AOM_TEST_LIST_TESTS}
   AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR}
   AOM_TEST_PREFIX=${AOM_TEST_PREFIX}
+  AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT}
   AOM_TEST_RUN_DISABLED_TESTS=${AOM_TEST_RUN_DISABLED_TESTS}
   AOM_TEST_SHOW_PROGRAM_OUTPUT=${AOM_TEST_SHOW_PROGRAM_OUTPUT}
   AOM_TEST_TEMP_ROOT=${AOM_TEST_TEMP_ROOT}
   AOM_TEST_VERBOSE_OUTPUT=${AOM_TEST_VERBOSE_OUTPUT}
+  AV1_ENCODE_CPU_USED=${AV1_ENCODE_CPU_USED}
+  AV1_ENCODE_TEST_FRAME_LIMIT=${AV1_ENCODE_TEST_FRAME_LIMIT}
+  AV1_IVF_FILE=${AV1_IVF_FILE}
+  AV1_OBU_ANNEXB_FILE=${AV1_OBU_ANNEXB_FILE}
+  AV1_OBU_SEC5_FILE=${AV1_OBU_SEC5_FILE}
+  AV1_WEBM_FILE=${AV1_WEBM_FILE}
   YUV_RAW_INPUT=${YUV_RAW_INPUT}
   YUV_RAW_INPUT_WIDTH=${YUV_RAW_INPUT_WIDTH}
   YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT}
diff --git a/third_party/aom/test/transform_test_base.h b/third_party/aom/test/transform_test_base.h
index 21441beef..67e8faf33 100644
--- a/third_party/aom/test/transform_test_base.h
+++ b/third_party/aom/test/transform_test_base.h
@@ -12,7 +12,8 @@
 #ifndef TEST_TRANSFORM_TEST_BASE_H_
 #define TEST_TRANSFORM_TEST_BASE_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_mem/aom_mem.h"
 #include "aom/aom_codec.h"
 #include "aom_dsp/txfm_common.h"
@@ -57,12 +58,10 @@ class TransformTestBase {
         aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
     uint8_t *src = reinterpret_cast<uint8_t *>(
         aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
-#if CONFIG_HIGHBITDEPTH
     uint16_t *dst16 = reinterpret_cast<uint16_t *>(
         aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
     uint16_t *src16 = reinterpret_cast<uint16_t *>(
         aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
-#endif
 
     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-255, 255].
@@ -71,12 +70,10 @@ class TransformTestBase {
           src[j] = rnd.Rand8();
           dst[j] = rnd.Rand8();
           test_input_block[j] = src[j] - dst[j];
-#if CONFIG_HIGHBITDEPTH
         } else {
           src16[j] = rnd.Rand16() & mask_;
           dst16[j] = rnd.Rand16() & mask_;
           test_input_block[j] = src16[j] - dst16[j];
-#endif
         }
       }
 
@@ -84,21 +81,14 @@ class TransformTestBase {
           RunFwdTxfm(test_input_block, test_temp_block, pitch_));
       if (bit_depth_ == AOM_BITS_8) {
         ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-#if CONFIG_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
             RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
       }
 
       for (int j = 0; j < num_coeffs_; ++j) {
-#if CONFIG_HIGHBITDEPTH
         const int diff =
             bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        ASSERT_EQ(AOM_BITS_8, bit_depth_);
-        const int diff = dst[j] - src[j];
-#endif
         const uint32_t error = diff * diff;
         if (max_error < error) max_error = error;
         total_error += error;
@@ -119,10 +109,8 @@ class TransformTestBase {
     aom_free(test_temp_block);
     aom_free(dst);
     aom_free(src);
-#if CONFIG_HIGHBITDEPTH
     aom_free(dst16);
     aom_free(src16);
-#endif
   }
 
   void RunCoeffCheck() {
@@ -149,11 +137,9 @@ class TransformTestBase {
           input_block[in_idx] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
           if (bit_depth_ == AOM_BITS_8) {
             output_block[out_idx] = output_ref_block[out_idx] = rnd.Rand8();
-#if CONFIG_HIGHBITDEPTH
           } else {
             output_block[out_idx] = output_ref_block[out_idx] =
                 rnd.Rand16() & mask_;
-#endif
           }
         }
       }
@@ -255,7 +241,7 @@ class TransformTestBase {
       int row_length = FindRowLength();
       // The minimum quant value is 4.
       for (int j = 0; j < num_coeffs_; ++j) {
-        EXPECT_EQ(output_block[j], output_ref_block[j])
+        ASSERT_EQ(output_block[j], output_ref_block[j])
             << "Not bit-exact at test index: " << i << ", "
             << "j = " << j << std::endl;
         EXPECT_GE(row_length * kDctMaxValue << (bit_depth_ - 8),
@@ -281,12 +267,10 @@ class TransformTestBase {
     uint8_t *src = reinterpret_cast<uint8_t *>(
         aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
 
-#if CONFIG_HIGHBITDEPTH
     uint16_t *dst16 = reinterpret_cast<uint16_t *>(
         aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
     uint16_t *src16 = reinterpret_cast<uint16_t *>(
         aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
-#endif
 
     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-mask_, mask_].
@@ -295,12 +279,10 @@ class TransformTestBase {
           src[j] = rnd.Rand8();
           dst[j] = rnd.Rand8();
           in[j] = src[j] - dst[j];
-#if CONFIG_HIGHBITDEPTH
         } else {
           src16[j] = rnd.Rand16() & mask_;
           dst16[j] = rnd.Rand16() & mask_;
           in[j] = src16[j] - dst16[j];
-#endif
         }
       }
 
@@ -308,22 +290,16 @@ class TransformTestBase {
 
       if (bit_depth_ == AOM_BITS_8) {
         ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
-#if CONFIG_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
             RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
       }
 
       for (int j = 0; j < num_coeffs_; ++j) {
-#if CONFIG_HIGHBITDEPTH
         const int diff =
             bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        const int diff = dst[j] - src[j];
-#endif
         const uint32_t error = diff * diff;
-        EXPECT_GE(static_cast<uint32_t>(limit), error)
+        ASSERT_GE(static_cast<uint32_t>(limit), error)
             << "Error: 4x4 IDCT has error " << error << " at index " << j;
       }
     }
@@ -331,10 +307,8 @@ class TransformTestBase {
     aom_free(coeff);
     aom_free(dst);
     aom_free(src);
-#if CONFIG_HIGHBITDEPTH
     aom_free(src16);
     aom_free(dst16);
-#endif
   }
 
   int pitch_;
diff --git a/third_party/aom/test/twopass_encoder.sh b/third_party/aom/test/twopass_encoder.sh
index 3abb7628b..cca44ced8 100755
--- a/third_party/aom/test/twopass_encoder.sh
+++ b/third_party/aom/test/twopass_encoder.sh
@@ -26,7 +26,7 @@ twopass_encoder_verify_environment() {
 # Runs twopass_encoder using the codec specified by $1 with a frame limit of
 # 100.
 twopass_encoder() {
-  local encoder="${LIBAOM_BIN_PATH}/twopass_encoder${AOM_TEST_EXE_SUFFIX}"
+  local encoder="$(aom_tool_path twopass_encoder)"
   local codec="$1"
   local output_file="${AOM_TEST_OUTPUT_DIR}/twopass_encoder_${codec}.ivf"
   local limit=7
diff --git a/third_party/aom/test/user_priv_test.cc b/third_party/aom/test/user_priv_test.cc
deleted file mode 100644
index da289c990..000000000
--- a/third_party/aom/test/user_priv_test.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "./aom_config.h"
-#include "test/acm_random.h"
-#include "test/codec_factory.h"
-#include "test/decode_test_driver.h"
-#include "test/ivf_video_source.h"
-#include "test/md5_helper.h"
-#include "test/util.h"
-#if CONFIG_WEBM_IO
-#include "test/webm_video_source.h"
-#endif
-#include "aom_mem/aom_mem.h"
-#include "aom/aom.h"
-
-namespace {
-
-using std::string;
-using libaom_test::ACMRandom;
-
-#if CONFIG_WEBM_IO
-
-void CheckUserPrivateData(void *user_priv, int *target) {
-  // actual pointer value should be the same as expected.
-  EXPECT_EQ(reinterpret_cast<void *>(target), user_priv)
-      << "user_priv pointer value does not match.";
-}
-
-// Decodes |filename|. Passes in user_priv data when calling DecodeFrame and
-// compares the user_priv from return img with the original user_priv to see if
-// they match. Both the pointer values and the values inside the addresses
-// should match.
-string DecodeFile(const string &filename) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  libaom_test::WebMVideoSource video(filename);
-  video.Init();
-
-  aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
-  cfg.allow_lowbitdepth = 1;
-  libaom_test::AV1Decoder decoder(cfg, 0);
-
-  libaom_test::MD5 md5;
-  int frame_num = 0;
-  for (video.Begin(); !::testing::Test::HasFailure() && video.cxdata();
-       video.Next()) {
-    void *user_priv = reinterpret_cast<void *>(&frame_num);
-    const aom_codec_err_t res =
-        decoder.DecodeFrame(video.cxdata(), video.frame_size(),
-                            (frame_num == 0) ? NULL : user_priv);
-    if (res != AOM_CODEC_OK) {
-      EXPECT_EQ(AOM_CODEC_OK, res) << decoder.DecodeError();
-      break;
-    }
-    libaom_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const aom_image_t *img = NULL;
-
-    // Get decompressed data.
-    while ((img = dec_iter.Next())) {
-      if (frame_num == 0) {
-        CheckUserPrivateData(img->user_priv, NULL);
-      } else {
-        CheckUserPrivateData(img->user_priv, &frame_num);
-
-        // Also test ctrl_get_reference api.
-        struct av1_ref_frame ref;
-        // Randomly fetch a reference frame.
-        ref.idx = rnd.Rand8() % 3;
-        decoder.Control(AV1_GET_REFERENCE, &ref);
-
-        CheckUserPrivateData(ref.img.user_priv, NULL);
-      }
-      md5.Add(img);
-    }
-
-    frame_num++;
-  }
-  return string(md5.Get());
-}
-
-TEST(UserPrivTest, VideoDecode) {
-  // no tiles or frame parallel; this exercises the decoding to test the
-  // user_priv.
-  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
-               DecodeFile("av10-2-03-size-226x226.webm").c_str());
-}
-
-#endif  // CONFIG_WEBM_IO
-
-}  // namespace
diff --git a/third_party/aom/test/util.h b/third_party/aom/test/util.h
index d1587b6ea..db00875ef 100644
--- a/third_party/aom/test/util.h
+++ b/third_party/aom/test/util.h
@@ -17,9 +17,10 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "aom/aom_integer.h"
 #include "aom/aom_image.h"
+#include "aom_ports/aom_timer.h"
 
 // Macros
-#define GET_PARAM(k) std::tr1::get<k>(GetParam())
+#define GET_PARAM(k) ::testing::get<k>(GetParam())
 
 inline double compute_psnr(const aom_image_t *img1, const aom_image_t *img2) {
   assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) &&
@@ -44,4 +45,9 @@ inline double compute_psnr(const aom_image_t *img1, const aom_image_t *img2) {
   return psnr;
 }
 
+static INLINE double get_time_mark(aom_usec_timer *t) {
+  aom_usec_timer_mark(t);
+  return static_cast<double>(aom_usec_timer_elapsed(t));
+}
+
 #endif  // TEST_UTIL_H_
diff --git a/third_party/aom/test/variance_test.cc b/third_party/aom/test/variance_test.cc
index 2f5c22283..eb801b442 100644
--- a/third_party/aom/test/variance_test.cc
+++ b/third_party/aom/test/variance_test.cc
@@ -7,15 +7,16 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <cstdlib>
 #include <new>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -41,6 +42,10 @@ typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
 typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
                                       const uint8_t *b, int b_stride);
 typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+typedef unsigned int (*JntSubpixAvgVarMxNFunc)(
+    const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+    int b_stride, uint32_t *sse, const uint8_t *second_pred,
+    const JNT_COMP_PARAMS *jcp_param);
 
 using libaom_test::ACMRandom;
 
@@ -89,13 +94,11 @@ static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
         diff = src[y * src_stride + x] - ref[y * ref_stride + x];
         se += diff;
         sse += diff * diff;
-#if CONFIG_HIGHBITDEPTH
       } else {
         diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] -
                CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x];
         se += diff;
         sse += diff * diff;
-#endif  // CONFIG_HIGHBITDEPTH
       }
     }
   }
@@ -136,7 +139,6 @@ static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
         const int diff = r - src[w * y + x];
         se += diff;
         sse += diff * diff;
-#if CONFIG_HIGHBITDEPTH
       } else {
         uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
         uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
@@ -150,7 +152,6 @@ static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
         const int diff = r - src16[w * y + x];
         se += diff;
         sse += diff * diff;
-#endif  // CONFIG_HIGHBITDEPTH
       }
     }
   }
@@ -188,7 +189,6 @@ static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src,
             ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
         se += diff;
         sse += diff * diff;
-#if CONFIG_HIGHBITDEPTH
       } else {
         const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
         const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
@@ -203,7 +203,64 @@ static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src,
         const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x];
         se += diff;
         sse += diff * diff;
-#endif  // CONFIG_HIGHBITDEPTH
+      }
+    }
+  }
+  RoundHighBitDepth(bit_depth, &se, &sse);
+  *sse_ptr = static_cast<uint32_t>(sse);
+  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+
+static uint32_t jnt_subpel_avg_variance_ref(
+    const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, int l2w,
+    int l2h, int xoff, int yoff, uint32_t *sse_ptr, bool use_high_bit_depth,
+    aom_bit_depth_t bit_depth, JNT_COMP_PARAMS *jcp_param) {
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+
+  xoff <<= 1;
+  yoff <<= 1;
+
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      // bilinear interpolation at a 16th pel step
+      if (!use_high_bit_depth) {
+        const int a1 = ref[(w + 0) * (y + 0) + x + 0];
+        const int a2 = ref[(w + 0) * (y + 0) + x + 1];
+        const int b1 = ref[(w + 0) * (y + 1) + x + 0];
+        const int b2 = ref[(w + 0) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int avg = ROUND_POWER_OF_TWO(
+            r * jcp_param->fwd_offset +
+                second_pred[w * y + x] * jcp_param->bck_offset,
+            DIST_PRECISION_BITS);
+        const int diff = avg - src[w * y + x];
+
+        se += diff;
+        sse += diff * diff;
+      } else {
+        const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+        const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+        const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
+        const int a1 = ref16[(w + 0) * (y + 0) + x + 0];
+        const int a2 = ref16[(w + 0) * (y + 0) + x + 1];
+        const int b1 = ref16[(w + 0) * (y + 1) + x + 0];
+        const int b2 = ref16[(w + 0) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int avg =
+            ROUND_POWER_OF_TWO(r * jcp_param->fwd_offset +
+                                   sec16[w * y + x] * jcp_param->bck_offset,
+                               DIST_PRECISION_BITS);
+        const int diff = avg - src16[w * y + x];
+
+        se += diff;
+        sse += diff * diff;
       }
     }
   }
@@ -287,7 +344,7 @@ struct TestParams {
 
 template <typename Func>
 std::ostream &operator<<(std::ostream &os, const TestParams<Func> &p) {
-  return os << "log2width/height:" << p.log2width << "/" << p.log2height
+  return os << "width/height:" << p.width << "/" << p.height
             << " function:" << reinterpret_cast<const void *>(p.func)
             << " bit-depth:" << p.bit_depth;
 }
@@ -307,23 +364,19 @@ class MainTestClass
     ref_ = new uint8_t[block_size() * unit];
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(ref_ != NULL);
-#if CONFIG_HIGHBITDEPTH
     if (use_high_bit_depth()) {
       // TODO(skal): remove!
       src_ = CONVERT_TO_BYTEPTR(src_);
       ref_ = CONVERT_TO_BYTEPTR(ref_);
     }
-#endif
   }
 
   virtual void TearDown() {
-#if CONFIG_HIGHBITDEPTH
     if (use_high_bit_depth()) {
       // TODO(skal): remove!
       src_ = reinterpret_cast<uint8_t *>(CONVERT_TO_SHORTPTR(src_));
       ref_ = reinterpret_cast<uint8_t *>(CONVERT_TO_SHORTPTR(ref_));
     }
-#endif
 
     aom_free(src_);
     delete[] ref_;
@@ -343,6 +396,7 @@ class MainTestClass
   void RefTest();
   void RefStrideTest();
   void OneQuarterTest();
+  void SpeedTest();
 
   // MSE/SSE tests
   void RefTestMse();
@@ -398,11 +452,9 @@ void MainTestClass<VarianceFunctionType>::RefTest() {
       if (!use_high_bit_depth()) {
         src_[j] = rnd_.Rand8();
         ref_[j] = rnd_.Rand8();
-#if CONFIG_HIGHBITDEPTH
       } else {
         CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
         CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
-#endif  // CONFIG_HIGHBITDEPTH
       }
     }
     unsigned int sse1, sse2, var1, var2;
@@ -428,11 +480,9 @@ void MainTestClass<VarianceFunctionType>::RefStrideTest() {
       if (!use_high_bit_depth()) {
         src_[src_ind] = rnd_.Rand8();
         ref_[ref_ind] = rnd_.Rand8();
-#if CONFIG_HIGHBITDEPTH
       } else {
         CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask();
         CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask();
-#endif  // CONFIG_HIGHBITDEPTH
       }
     }
     unsigned int sse1, sse2;
@@ -455,12 +505,10 @@ void MainTestClass<VarianceFunctionType>::OneQuarterTest() {
     memset(src_, 255, block_size());
     memset(ref_, 255, half);
     memset(ref_ + half, 0, half);
-#if CONFIG_HIGHBITDEPTH
   } else {
     aom_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size());
     aom_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half);
     aom_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
-#endif  // CONFIG_HIGHBITDEPTH
   }
   unsigned int sse, var, expected;
   ASM_REGISTER_STATE_CHECK(
@@ -469,6 +517,31 @@ void MainTestClass<VarianceFunctionType>::OneQuarterTest() {
   EXPECT_EQ(expected, var);
 }
 
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::SpeedTest() {
+  for (int j = 0; j < block_size(); j++) {
+    if (!use_high_bit_depth()) {
+      src_[j] = rnd_.Rand8();
+      ref_[j] = rnd_.Rand8();
+    } else {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+    }
+  }
+  unsigned int sse1, sse2, var1, var2;
+  const int stride = width();
+  int run_time = 1000000000 / block_size();
+
+  ASM_REGISTER_STATE_CHECK(var1 =
+                               params_.func(src_, stride, ref_, stride, &sse1));
+  for (int i = 0; i < run_time; ++i) {
+    ASM_REGISTER_STATE_CHECK(
+        var2 = params_.func(src_, stride, ref_, stride, &sse2));
+  }
+  EXPECT_EQ(var1, var2);
+  EXPECT_EQ(sse1, sse2);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Tests related to MSE / SSE.
 
@@ -527,9 +600,9 @@ void MainTestClass<FunctionType>::MaxTestSse() {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-using ::std::tr1::get;
-using ::std::tr1::make_tuple;
-using ::std::tr1::tuple;
+using ::testing::get;
+using ::testing::make_tuple;
+using ::testing::tuple;
 
 template <typename FunctionType>
 class SubpelVarianceTest
@@ -540,18 +613,17 @@ class SubpelVarianceTest
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
     if (!use_high_bit_depth()) {
-      src_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size()));
-      sec_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size()));
-      ref_ = new uint8_t[block_size() + width() + height() + 1];
-#if CONFIG_HIGHBITDEPTH
+      src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, block_size()));
+      sec_ = reinterpret_cast<uint8_t *>(aom_memalign(32, block_size()));
+      ref_ = reinterpret_cast<uint8_t *>(
+          aom_memalign(32, block_size() + width() + height() + 1));
     } else {
       src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
-          aom_memalign(16, block_size() * sizeof(uint16_t))));
+          aom_memalign(32, block_size() * sizeof(uint16_t))));
       sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
-          aom_memalign(16, block_size() * sizeof(uint16_t))));
+          aom_memalign(32, block_size() * sizeof(uint16_t))));
       ref_ = CONVERT_TO_BYTEPTR(aom_memalign(
-          16, (block_size() + width() + height() + 1) * sizeof(uint16_t)));
-#endif  // CONFIG_HIGHBITDEPTH
+          32, (block_size() + width() + height() + 1) * sizeof(uint16_t)));
     }
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(sec_ != NULL);
@@ -561,14 +633,12 @@ class SubpelVarianceTest
   virtual void TearDown() {
     if (!use_high_bit_depth()) {
       aom_free(src_);
-      delete[] ref_;
+      aom_free(ref_);
       aom_free(sec_);
-#if CONFIG_HIGHBITDEPTH
     } else {
       aom_free(CONVERT_TO_SHORTPTR(src_));
       aom_free(CONVERT_TO_SHORTPTR(ref_));
       aom_free(CONVERT_TO_SHORTPTR(sec_));
-#endif  // CONFIG_HIGHBITDEPTH
     }
     libaom_test::ClearSystemState();
   }
@@ -582,6 +652,7 @@ class SubpelVarianceTest
   uint8_t *ref_;
   uint8_t *sec_;
   TestParams<FunctionType> params_;
+  JNT_COMP_PARAMS jcp_param_;
 
   // some relay helpers
   bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
@@ -603,7 +674,6 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
         for (int j = 0; j < block_size() + width() + height() + 1; j++) {
           ref_[j] = rnd_.Rand8();
         }
-#if CONFIG_HIGHBITDEPTH
       } else {
         for (int j = 0; j < block_size(); j++) {
           CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
@@ -611,7 +681,6 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
         for (int j = 0; j < block_size() + width() + height() + 1; j++) {
           CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
         }
-#endif  // CONFIG_HIGHBITDEPTH
       }
       unsigned int sse1, sse2;
       unsigned int var1;
@@ -639,14 +708,12 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
         memset(src_ + half, 255, half);
         memset(ref_, 255, half);
         memset(ref_ + half, 0, half + width() + height() + 1);
-#if CONFIG_HIGHBITDEPTH
       } else {
         aom_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half);
         aom_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
         aom_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
         aom_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(),
                      half + width() + height() + 1);
-#endif  // CONFIG_HIGHBITDEPTH
       }
       unsigned int sse1, sse2;
       unsigned int var1;
@@ -673,7 +740,6 @@ void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
         for (int j = 0; j < block_size() + width() + height() + 1; j++) {
           ref_[j] = rnd_.Rand8();
         }
-#if CONFIG_HIGHBITDEPTH
       } else {
         for (int j = 0; j < block_size(); j++) {
           CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
@@ -682,7 +748,6 @@ void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
         for (int j = 0; j < block_size() + width() + height() + 1; j++) {
           CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
         }
-#endif  // CONFIG_HIGHBITDEPTH
       }
       uint32_t sse1, sse2;
       uint32_t var1, var2;
@@ -697,11 +762,53 @@ void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
   }
 }
 
+template <>
+void SubpelVarianceTest<JntSubpixAvgVarMxNFunc>::RefTest() {
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth()) {
+        for (int j = 0; j < block_size(); j++) {
+          src_[j] = rnd_.Rand8();
+          sec_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+      } else {
+        for (int j = 0; j < block_size(); j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+        }
+      }
+      for (int x0 = 0; x0 < 2; ++x0) {
+        for (int y0 = 0; y0 < 4; ++y0) {
+          uint32_t sse1, sse2;
+          uint32_t var1, var2;
+          jcp_param_.fwd_offset = quant_dist_lookup_table[x0][y0][0];
+          jcp_param_.bck_offset = quant_dist_lookup_table[x0][y0][1];
+          ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y,
+                                                       src_, width(), &sse1,
+                                                       sec_, &jcp_param_));
+          var2 = jnt_subpel_avg_variance_ref(
+              ref_, src_, sec_, params_.log2width, params_.log2height, x, y,
+              &sse2, use_high_bit_depth(), params_.bit_depth, &jcp_param_);
+          EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+          EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+        }
+      }
+    }
+  }
+}
+
 typedef MainTestClass<Get4x4SseFunc> AvxSseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest;
 typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
+typedef SubpelVarianceTest<JntSubpixAvgVarMxNFunc> AvxJntSubpelAvgVarianceTest;
 
 TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
 TEST_P(AvxSseTest, MaxSse) { MaxTestSse(); }
@@ -711,11 +818,13 @@ TEST_P(AvxVarianceTest, Zero) { ZeroTest(); }
 TEST_P(AvxVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxVarianceTest, RefStride) { RefStrideTest(); }
 TEST_P(AvxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxJntSubpelAvgVarianceTest, Ref) { RefTest(); }
 
 INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
                         ::testing::Values(aom_get_mb_ss_c));
@@ -735,7 +844,10 @@ INSTANTIATE_TEST_CASE_P(C, AvxMseTest,
 typedef TestParams<VarianceMxNFunc> VarianceParams;
 INSTANTIATE_TEST_CASE_P(
     C, AvxVarianceTest,
-    ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_c),
+    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_c),
+                      VarianceParams(7, 6, &aom_variance128x64_c),
+                      VarianceParams(6, 7, &aom_variance64x128_c),
+                      VarianceParams(6, 6, &aom_variance64x64_c),
                       VarianceParams(6, 5, &aom_variance64x32_c),
                       VarianceParams(5, 6, &aom_variance32x64_c),
                       VarianceParams(5, 5, &aom_variance32x32_c),
@@ -753,6 +865,9 @@ typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
 INSTANTIATE_TEST_CASE_P(
     C, AvxSubpelVarianceTest,
     ::testing::Values(
+        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
+        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_c, 0),
+        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_c, 0),
         SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_c, 0),
         SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_c, 0),
         SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_c, 0),
@@ -771,6 +886,9 @@ typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
 INSTANTIATE_TEST_CASE_P(
     C, AvxSubpelAvgVarianceTest,
     ::testing::Values(
+        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_c, 0),
+        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_c, 0),
+        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_c, 0),
         SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_c, 0),
         SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_c, 0),
         SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_c, 0),
@@ -785,7 +903,37 @@ INSTANTIATE_TEST_CASE_P(
         SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
         SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0)));
 
-#if CONFIG_HIGHBITDEPTH
+typedef TestParams<JntSubpixAvgVarMxNFunc> JntSubpelAvgVarianceParams;
+INSTANTIATE_TEST_CASE_P(
+    C, AvxJntSubpelAvgVarianceTest,
+    ::testing::Values(
+        JntSubpelAvgVarianceParams(6, 6, &aom_jnt_sub_pixel_avg_variance64x64_c,
+                                   0),
+        JntSubpelAvgVarianceParams(6, 5, &aom_jnt_sub_pixel_avg_variance64x32_c,
+                                   0),
+        JntSubpelAvgVarianceParams(5, 6, &aom_jnt_sub_pixel_avg_variance32x64_c,
+                                   0),
+        JntSubpelAvgVarianceParams(5, 5, &aom_jnt_sub_pixel_avg_variance32x32_c,
+                                   0),
+        JntSubpelAvgVarianceParams(5, 4, &aom_jnt_sub_pixel_avg_variance32x16_c,
+                                   0),
+        JntSubpelAvgVarianceParams(4, 5, &aom_jnt_sub_pixel_avg_variance16x32_c,
+                                   0),
+        JntSubpelAvgVarianceParams(4, 4, &aom_jnt_sub_pixel_avg_variance16x16_c,
+                                   0),
+        JntSubpelAvgVarianceParams(4, 3, &aom_jnt_sub_pixel_avg_variance16x8_c,
+                                   0),
+        JntSubpelAvgVarianceParams(3, 4, &aom_jnt_sub_pixel_avg_variance8x16_c,
+                                   0),
+        JntSubpelAvgVarianceParams(3, 3, &aom_jnt_sub_pixel_avg_variance8x8_c,
+                                   0),
+        JntSubpelAvgVarianceParams(3, 2, &aom_jnt_sub_pixel_avg_variance8x4_c,
+                                   0),
+        JntSubpelAvgVarianceParams(2, 3, &aom_jnt_sub_pixel_avg_variance4x8_c,
+                                   0),
+        JntSubpelAvgVarianceParams(2, 2, &aom_jnt_sub_pixel_avg_variance4x4_c,
+                                   0)));
+
 typedef MainTestClass<VarianceMxNFunc> AvxHBDMseTest;
 typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
@@ -819,11 +967,9 @@ INSTANTIATE_TEST_CASE_P(
 */
 
 const VarianceParams kArrayHBDVariance_c[] = {
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   VarianceParams(7, 7, &aom_highbd_12_variance128x128_c, 12),
   VarianceParams(7, 6, &aom_highbd_12_variance128x64_c, 12),
   VarianceParams(6, 7, &aom_highbd_12_variance64x128_c, 12),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   VarianceParams(6, 6, &aom_highbd_12_variance64x64_c, 12),
   VarianceParams(6, 5, &aom_highbd_12_variance64x32_c, 12),
   VarianceParams(5, 6, &aom_highbd_12_variance32x64_c, 12),
@@ -837,11 +983,9 @@ const VarianceParams kArrayHBDVariance_c[] = {
   VarianceParams(3, 2, &aom_highbd_12_variance8x4_c, 12),
   VarianceParams(2, 3, &aom_highbd_12_variance4x8_c, 12),
   VarianceParams(2, 2, &aom_highbd_12_variance4x4_c, 12),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   VarianceParams(7, 7, &aom_highbd_10_variance128x128_c, 10),
   VarianceParams(7, 6, &aom_highbd_10_variance128x64_c, 10),
   VarianceParams(6, 7, &aom_highbd_10_variance64x128_c, 10),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   VarianceParams(6, 6, &aom_highbd_10_variance64x64_c, 10),
   VarianceParams(6, 5, &aom_highbd_10_variance64x32_c, 10),
   VarianceParams(5, 6, &aom_highbd_10_variance32x64_c, 10),
@@ -855,11 +999,9 @@ const VarianceParams kArrayHBDVariance_c[] = {
   VarianceParams(3, 2, &aom_highbd_10_variance8x4_c, 10),
   VarianceParams(2, 3, &aom_highbd_10_variance4x8_c, 10),
   VarianceParams(2, 2, &aom_highbd_10_variance4x4_c, 10),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   VarianceParams(7, 7, &aom_highbd_8_variance128x128_c, 8),
   VarianceParams(7, 6, &aom_highbd_8_variance128x64_c, 8),
   VarianceParams(6, 7, &aom_highbd_8_variance64x128_c, 8),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   VarianceParams(6, 6, &aom_highbd_8_variance64x64_c, 8),
   VarianceParams(6, 5, &aom_highbd_8_variance64x32_c, 8),
   VarianceParams(5, 6, &aom_highbd_8_variance32x64_c, 8),
@@ -877,21 +1019,19 @@ const VarianceParams kArrayHBDVariance_c[] = {
 INSTANTIATE_TEST_CASE_P(C, AvxHBDVarianceTest,
                         ::testing::ValuesIn(kArrayHBDVariance_c));
 
-#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, AvxHBDVarianceTest,
     ::testing::Values(
         VarianceParams(2, 2, &aom_highbd_8_variance4x4_sse4_1, 8),
         VarianceParams(2, 2, &aom_highbd_10_variance4x4_sse4_1, 10),
         VarianceParams(2, 2, &aom_highbd_12_variance4x4_sse4_1, 12)));
-#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#endif  // HAVE_SSE4_1
 
 const SubpelVarianceParams kArrayHBDSubpelVariance_c[] = {
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_c, 8),
   SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_c, 8),
   SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_c, 8),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_c, 8),
   SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_c, 8),
   SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_c, 8),
@@ -905,11 +1045,9 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_c[] = {
   SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_c, 8),
   SubpelVarianceParams(2, 3, &aom_highbd_8_sub_pixel_variance4x8_c, 8),
   SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_c, 8),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_c, 10),
   SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_c, 10),
   SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_c, 10),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_c, 10),
   SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_c, 10),
   SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_c, 10),
@@ -923,11 +1061,9 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_c[] = {
   SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_c, 10),
   SubpelVarianceParams(2, 3, &aom_highbd_10_sub_pixel_variance4x8_c, 10),
   SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_c, 10),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_c, 12),
   SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_c, 12),
   SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_c, 12),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_c, 12),
   SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_c, 12),
   SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_c, 12),
@@ -946,14 +1082,12 @@ INSTANTIATE_TEST_CASE_P(C, AvxHBDSubpelVarianceTest,
                         ::testing::ValuesIn(kArrayHBDSubpelVariance_c));
 
 const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = {
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelAvgVarianceParams(7, 7, &aom_highbd_8_sub_pixel_avg_variance128x128_c,
                           8),
   SubpelAvgVarianceParams(7, 6, &aom_highbd_8_sub_pixel_avg_variance128x64_c,
                           8),
   SubpelAvgVarianceParams(6, 7, &aom_highbd_8_sub_pixel_avg_variance64x128_c,
                           8),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_c, 8),
   SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_c, 8),
   SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_c, 8),
@@ -967,14 +1101,12 @@ const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = {
   SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_c, 8),
   SubpelAvgVarianceParams(2, 3, &aom_highbd_8_sub_pixel_avg_variance4x8_c, 8),
   SubpelAvgVarianceParams(2, 2, &aom_highbd_8_sub_pixel_avg_variance4x4_c, 8),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelAvgVarianceParams(7, 7, &aom_highbd_10_sub_pixel_avg_variance128x128_c,
                           10),
   SubpelAvgVarianceParams(7, 6, &aom_highbd_10_sub_pixel_avg_variance128x64_c,
                           10),
   SubpelAvgVarianceParams(6, 7, &aom_highbd_10_sub_pixel_avg_variance64x128_c,
                           10),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_c,
                           10),
   SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_c,
@@ -997,14 +1129,12 @@ const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = {
   SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_c, 10),
   SubpelAvgVarianceParams(2, 3, &aom_highbd_10_sub_pixel_avg_variance4x8_c, 10),
   SubpelAvgVarianceParams(2, 2, &aom_highbd_10_sub_pixel_avg_variance4x4_c, 10),
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelAvgVarianceParams(7, 7, &aom_highbd_12_sub_pixel_avg_variance128x128_c,
                           12),
   SubpelAvgVarianceParams(7, 6, &aom_highbd_12_sub_pixel_avg_variance128x64_c,
                           12),
   SubpelAvgVarianceParams(6, 7, &aom_highbd_12_sub_pixel_avg_variance64x128_c,
                           12),
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
   SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_c,
                           12),
   SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_c,
@@ -1030,7 +1160,6 @@ const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = {
 };
 INSTANTIATE_TEST_CASE_P(C, AvxHBDSubpelAvgVarianceTest,
                         ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
-#endif  // CONFIG_HIGHBITDEPTH
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
@@ -1044,23 +1173,35 @@ INSTANTIATE_TEST_CASE_P(SSE2, AvxMseTest,
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, AvxVarianceTest,
-    ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_sse2),
+    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_sse2),
+                      VarianceParams(7, 6, &aom_variance128x64_sse2),
+                      VarianceParams(6, 7, &aom_variance64x128_sse2),
+                      VarianceParams(6, 6, &aom_variance64x64_sse2),
                       VarianceParams(6, 5, &aom_variance64x32_sse2),
+                      VarianceParams(6, 4, &aom_variance64x16_sse2),
                       VarianceParams(5, 6, &aom_variance32x64_sse2),
                       VarianceParams(5, 5, &aom_variance32x32_sse2),
                       VarianceParams(5, 4, &aom_variance32x16_sse2),
+                      VarianceParams(5, 3, &aom_variance32x8_sse2),
+                      VarianceParams(4, 6, &aom_variance16x64_sse2),
                       VarianceParams(4, 5, &aom_variance16x32_sse2),
                       VarianceParams(4, 4, &aom_variance16x16_sse2),
                       VarianceParams(4, 3, &aom_variance16x8_sse2),
+                      VarianceParams(4, 2, &aom_variance16x4_sse2),
+                      VarianceParams(3, 5, &aom_variance8x32_sse2),
                       VarianceParams(3, 4, &aom_variance8x16_sse2),
                       VarianceParams(3, 3, &aom_variance8x8_sse2),
                       VarianceParams(3, 2, &aom_variance8x4_sse2),
+                      VarianceParams(2, 4, &aom_variance4x16_sse2),
                       VarianceParams(2, 3, &aom_variance4x8_sse2),
                       VarianceParams(2, 2, &aom_variance4x4_sse2)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, AvxSubpelVarianceTest,
     ::testing::Values(
+        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0),
+        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0),
+        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0),
         SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0),
         SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0),
         SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0),
@@ -1078,6 +1219,12 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, AvxSubpelAvgVarianceTest,
     ::testing::Values(
+        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2,
+                                0),
+        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2,
+                                0),
+        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2,
+                                0),
         SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_sse2, 0),
         SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0),
         SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0),
@@ -1092,7 +1239,7 @@ INSTANTIATE_TEST_CASE_P(
         SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0),
         SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0)));
 
-#if HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, AvxSubpelVarianceTest,
     ::testing::Values(
@@ -1115,9 +1262,8 @@ INSTANTIATE_TEST_CASE_P(
         SubpelAvgVarianceParams(2, 2,
                                 &aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1,
                                 12)));
-#endif  // HAVE_SSE4_1 && CONFIG_HIGHBITDEPTH
+#endif  // HAVE_SSE4_1
 
-#if CONFIG_HIGHBITDEPTH
 /* TODO(debargha): This test does not support the highbd version
 INSTANTIATE_TEST_CASE_P(
     SSE2, AvxHBDMseTest,
@@ -1278,13 +1424,15 @@ const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_sse2[] = {
 
 INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDSubpelAvgVarianceTest,
                         ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_sse2));
-#endif  // CONFIG_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
 INSTANTIATE_TEST_CASE_P(
     SSSE3, AvxSubpelVarianceTest,
     ::testing::Values(
+        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_ssse3, 0),
+        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_ssse3, 0),
+        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_ssse3, 0),
         SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_ssse3, 0),
         SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_ssse3, 0),
         SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_ssse3, 0),
@@ -1302,6 +1450,12 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSSE3, AvxSubpelAvgVarianceTest,
     ::testing::Values(
+        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_ssse3,
+                                0),
+        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_ssse3,
+                                0),
+        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_ssse3,
+                                0),
         SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_ssse3,
                                 0),
         SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_ssse3,
@@ -1323,6 +1477,46 @@ INSTANTIATE_TEST_CASE_P(
         SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_ssse3, 0),
         SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3,
                                 0)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, AvxJntSubpelAvgVarianceTest,
+    ::testing::Values(
+        JntSubpelAvgVarianceParams(6, 6,
+                                   &aom_jnt_sub_pixel_avg_variance64x64_ssse3,
+                                   0),
+        JntSubpelAvgVarianceParams(6, 5,
+                                   &aom_jnt_sub_pixel_avg_variance64x32_ssse3,
+                                   0),
+        JntSubpelAvgVarianceParams(5, 6,
+                                   &aom_jnt_sub_pixel_avg_variance32x64_ssse3,
+                                   0),
+        JntSubpelAvgVarianceParams(5, 5,
+                                   &aom_jnt_sub_pixel_avg_variance32x32_ssse3,
+                                   0),
+        JntSubpelAvgVarianceParams(5, 4,
+                                   &aom_jnt_sub_pixel_avg_variance32x16_ssse3,
+                                   0),
+        JntSubpelAvgVarianceParams(4, 5,
+                                   &aom_jnt_sub_pixel_avg_variance16x32_ssse3,
+                                   0),
+        JntSubpelAvgVarianceParams(4, 4,
+                                   &aom_jnt_sub_pixel_avg_variance16x16_ssse3,
+                                   0),
+        JntSubpelAvgVarianceParams(4, 3,
+                                   &aom_jnt_sub_pixel_avg_variance16x8_ssse3,
+                                   0),
+        JntSubpelAvgVarianceParams(3, 4,
+                                   &aom_jnt_sub_pixel_avg_variance8x16_ssse3,
+                                   0),
+        JntSubpelAvgVarianceParams(3, 3,
+                                   &aom_jnt_sub_pixel_avg_variance8x8_ssse3, 0),
+        JntSubpelAvgVarianceParams(3, 2,
+                                   &aom_jnt_sub_pixel_avg_variance8x4_ssse3, 0),
+        JntSubpelAvgVarianceParams(2, 3,
+                                   &aom_jnt_sub_pixel_avg_variance4x8_ssse3, 0),
+        JntSubpelAvgVarianceParams(2, 2,
+                                   &aom_jnt_sub_pixel_avg_variance4x4_ssse3,
+                                   0)));
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
@@ -1331,23 +1525,48 @@ INSTANTIATE_TEST_CASE_P(AVX2, AvxMseTest,
 
 INSTANTIATE_TEST_CASE_P(
     AVX2, AvxVarianceTest,
-    ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_avx2),
+    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_avx2),
+                      VarianceParams(7, 6, &aom_variance128x64_avx2),
+                      VarianceParams(6, 7, &aom_variance64x128_avx2),
+                      VarianceParams(6, 6, &aom_variance64x64_avx2),
                       VarianceParams(6, 5, &aom_variance64x32_avx2),
+                      VarianceParams(6, 4, &aom_variance64x16_avx2),
+                      VarianceParams(5, 6, &aom_variance32x64_avx2),
                       VarianceParams(5, 5, &aom_variance32x32_avx2),
                       VarianceParams(5, 4, &aom_variance32x16_avx2),
-                      VarianceParams(4, 4, &aom_variance16x16_avx2)));
+                      VarianceParams(5, 3, &aom_variance32x8_avx2),
+                      VarianceParams(4, 6, &aom_variance16x64_avx2),
+                      VarianceParams(4, 5, &aom_variance16x32_avx2),
+                      VarianceParams(4, 4, &aom_variance16x16_avx2),
+                      VarianceParams(4, 3, &aom_variance16x8_avx2),
+                      VarianceParams(4, 2, &aom_variance16x4_avx2)));
 
 INSTANTIATE_TEST_CASE_P(
     AVX2, AvxSubpelVarianceTest,
     ::testing::Values(
+        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0),
+        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_avx2, 0),
+        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_avx2, 0),
         SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_avx2, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0)));
+        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0),
+        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0),
+        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0),
+        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0)));
 
 INSTANTIATE_TEST_CASE_P(
     AVX2, AvxSubpelAvgVarianceTest,
     ::testing::Values(
+        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_avx2,
+                                0),
+        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_avx2,
+                                0),
+        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_avx2,
+                                0),
         SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_avx2, 0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_avx2,
+        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_avx2, 0),
+        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_avx2, 0),
+        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_avx2, 0),
+        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_avx2,
                                 0)));
 #endif  // HAVE_AVX2
 
diff --git a/third_party/aom/test/video_source.h b/third_party/aom/test/video_source.h
index e986ffb37..dc39b5a80 100644
--- a/third_party/aom/test/video_source.h
+++ b/third_party/aom/test/video_source.h
@@ -71,7 +71,10 @@ static FILE *GetTempOutFile(std::string *file_name) {
   }
   return NULL;
 #else
-  return tmpfile();
+  char name_template[] = "/tmp/libaomtest.XXXXXX";
+  const int fd = mkstemp(name_template);
+  *file_name = name_template;
+  return fdopen(fd, "wb+");
 #endif
 }
 
diff --git a/third_party/aom/test/visual_metrics.py b/third_party/aom/test/visual_metrics.py
new file mode 100755
index 000000000..9055feb33
--- /dev/null
+++ b/third_party/aom/test/visual_metrics.py
@@ -0,0 +1,466 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+"""Converts video encoding result data from text files to visualization
+data source."""
+
+__author__ = "jzern@google.com (James Zern),"
+__author__ += "jimbankoski@google.com (Jim Bankoski)"
+
+import fnmatch
+import numpy as np
+import scipy as sp
+import scipy.interpolate
+import os
+import re
+import string
+import sys
+import math
+import warnings
+
+import gviz_api
+
+from os.path import basename
+from os.path import splitext
+
+warnings.simplefilter('ignore', np.RankWarning)
+warnings.simplefilter('ignore', RuntimeWarning)
+
+def bdsnr2(metric_set1, metric_set2):
+  """
+  BJONTEGAARD    Bjontegaard metric calculation adapted
+  Bjontegaard's snr metric allows to compute the average % saving in decibels
+  between two rate-distortion curves [1].  This is an adaptation of that
+  method that fixes inconsistencies when the curve fit operation goes awry
+  by replacing the curve fit function with a Piecewise Cubic Hermite
+  Interpolating Polynomial and then integrating that by evaluating that
+  function at small intervals using the trapezoid method to calculate
+  the integral.
+
+  metric_set1 - list of tuples ( bitrate,  metric ) for first graph
+  metric_set2 - list of tuples ( bitrate,  metric ) for second graph
+  """
+
+  if not metric_set1 or not metric_set2:
+    return 0.0
+
+  try:
+
+    # pchip_interlopate requires keys sorted by x axis. x-axis will
+    # be our metric not the bitrate so sort by metric.
+    metric_set1.sort()
+    metric_set2.sort()
+
+    # Pull the log of the rate and clamped psnr from metric_sets.
+    log_rate1 = [math.log(x[0]) for x in metric_set1]
+    metric1 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set1]
+    log_rate2 = [math.log(x[0]) for x in metric_set2]
+    metric2 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set2]
+
+    # Integration interval.  This metric only works on the area that's
+    # overlapping.   Extrapolation of these things is sketchy so we avoid.
+    min_int = max([min(log_rate1), min(log_rate2)])
+    max_int = min([max(log_rate1), max(log_rate2)])
+
+    # No overlap means no sensible metric possible.
+    if max_int <= min_int:
+      return 0.0
+
+    # Use Piecewise Cubic Hermite Interpolating Polynomial interpolation to
+    # create 100 new samples points separated by interval.
+    lin = np.linspace(min_int, max_int, num=100, retstep=True)
+    interval = lin[1]
+    samples = lin[0]
+    v1 = scipy.interpolate.pchip_interpolate(log_rate1, metric1, samples)
+    v2 = scipy.interpolate.pchip_interpolate(log_rate2, metric2, samples)
+
+    # Calculate the integral using the trapezoid method on the samples.
+    int_v1 = np.trapz(v1, dx=interval)
+    int_v2 = np.trapz(v2, dx=interval)
+
+    # Calculate the average improvement.
+    avg_exp_diff = (int_v2 - int_v1) / (max_int - min_int)
+
+  except (TypeError, ZeroDivisionError, ValueError, np.RankWarning) as e:
+    return 0
+
+  return avg_exp_diff
+
+def bdrate2(metric_set1, metric_set2):
+  """
+  BJONTEGAARD    Bjontegaard metric calculation adapted
+  Bjontegaard's metric allows to compute the average % saving in bitrate
+  between two rate-distortion curves [1].  This is an adaptation of that
+  method that fixes inconsistencies when the curve fit operation goes awry
+  by replacing the curve fit function with a Piecewise Cubic Hermite
+  Interpolating Polynomial and then integrating that by evaluating that
+  function at small intervals using the trapezoid method to calculate
+  the integral.
+
+  metric_set1 - list of tuples ( bitrate,  metric ) for first graph
+  metric_set2 - list of tuples ( bitrate,  metric ) for second graph
+  """
+
+  if not metric_set1 or not metric_set2:
+    return 0.0
+
+  try:
+
+    # pchip_interlopate requires keys sorted by x axis. x-axis will
+    # be our metric not the bitrate so sort by metric.
+    metric_set1.sort(key=lambda tup: tup[1])
+    metric_set2.sort(key=lambda tup: tup[1])
+
+    # Pull the log of the rate and clamped psnr from metric_sets.
+    log_rate1 = [math.log(x[0]) for x in metric_set1]
+    metric1 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set1]
+    log_rate2 = [math.log(x[0]) for x in metric_set2]
+    metric2 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set2]
+
+    # Integration interval.  This metric only works on the area that's
+    # overlapping.   Extrapolation of these things is sketchy so we avoid.
+    min_int = max([min(metric1), min(metric2)])
+    max_int = min([max(metric1), max(metric2)])
+
+    # No overlap means no sensible metric possible.
+    if max_int <= min_int:
+      return 0.0
+
+    # Use Piecewise Cubic Hermite Interpolating Polynomial interpolation to
+    # create 100 new samples points separated by interval.
+    lin = np.linspace(min_int, max_int, num=100, retstep=True)
+    interval = lin[1]
+    samples = lin[0]
+    v1 = scipy.interpolate.pchip_interpolate(metric1, log_rate1, samples)
+    v2 = scipy.interpolate.pchip_interpolate(metric2, log_rate2, samples)
+
+    # Calculate the integral using the trapezoid method on the samples.
+    int_v1 = np.trapz(v1, dx=interval)
+    int_v2 = np.trapz(v2, dx=interval)
+
+    # Calculate the average improvement.
+    avg_exp_diff = (int_v2 - int_v1) / (max_int - min_int)
+
+  except (TypeError, ZeroDivisionError, ValueError, np.RankWarning) as e:
+    return 0
+
+  # Convert to a percentage.
+  avg_diff = (math.exp(avg_exp_diff) - 1) * 100
+
+  return avg_diff
+
+
+
+def FillForm(string_for_substitution, dictionary_of_vars):
+  """
+  This function substitutes all matches of the command string //%% ... %%//
+  with the variable represented by ...  .
+  """
+  return_string = string_for_substitution
+  for i in re.findall("//%%(.*)%%//", string_for_substitution):
+    return_string = re.sub("//%%" + i + "%%//", dictionary_of_vars[i],
+                           return_string)
+  return return_string
+
+
+def HasMetrics(line):
+  """
+  The metrics files produced by aomenc are started with a B for headers.
+  """
+  # If the first char of the first word on the line is a digit
+  if len(line) == 0:
+    return False
+  if len(line.split()) == 0:
+    return False
+  if line.split()[0][0:1].isdigit():
+    return True
+  return False
+
+def GetMetrics(file_name):
+  metric_file = open(file_name, "r")
+  return metric_file.readline().split();
+
+def ParseMetricFile(file_name, metric_column):
+  metric_set1 = set([])
+  metric_file = open(file_name, "r")
+  for line in metric_file:
+    metrics = string.split(line)
+    if HasMetrics(line):
+      if metric_column < len(metrics):
+        try:
+          tuple = float(metrics[0]), float(metrics[metric_column])
+        except:
+          tuple = float(metrics[0]), 0
+      else:
+        tuple = float(metrics[0]), 0
+      metric_set1.add(tuple)
+  metric_set1_sorted = sorted(metric_set1)
+  return metric_set1_sorted
+
+
+def FileBetter(file_name_1, file_name_2, metric_column, method):
+  """
+  Compares two data files and determines which is better and by how
+  much. Also produces a histogram of how much better, by PSNR.
+  metric_column is the metric.
+  """
+  # Store and parse our two files into lists of unique tuples.
+
+  # Read the two files, parsing out lines starting with bitrate.
+  metric_set1_sorted = ParseMetricFile(file_name_1, metric_column)
+  metric_set2_sorted = ParseMetricFile(file_name_2, metric_column)
+
+
+  def GraphBetter(metric_set1_sorted, metric_set2_sorted, base_is_set_2):
+    """
+    Search through the sorted metric file for metrics on either side of
+    the metric from file 1.  Since both lists are sorted we really
+    should not have to search through the entire range, but these
+    are small files."""
+    total_bitrate_difference_ratio = 0.0
+    count = 0
+    for bitrate, metric in metric_set1_sorted:
+      if bitrate == 0:
+        continue
+      for i in range(len(metric_set2_sorted) - 1):
+        s2_bitrate_0, s2_metric_0 = metric_set2_sorted[i]
+        s2_bitrate_1, s2_metric_1 = metric_set2_sorted[i + 1]
+        # We have a point on either side of our metric range.
+        if metric > s2_metric_0 and metric <= s2_metric_1:
+
+          # Calculate a slope.
+          if s2_metric_1 - s2_metric_0 != 0:
+            metric_slope = ((s2_bitrate_1 - s2_bitrate_0) /
+                            (s2_metric_1 - s2_metric_0))
+          else:
+            metric_slope = 0
+
+          estimated_s2_bitrate = (s2_bitrate_0 + (metric - s2_metric_0) *
+                                  metric_slope)
+
+          if estimated_s2_bitrate == 0:
+            continue
+          # Calculate percentage difference as given by base.
+          if base_is_set_2 == 0:
+            bitrate_difference_ratio = ((bitrate - estimated_s2_bitrate) /
+                                        bitrate)
+          else:
+            bitrate_difference_ratio = ((bitrate - estimated_s2_bitrate) /
+                                        estimated_s2_bitrate)
+
+          total_bitrate_difference_ratio += bitrate_difference_ratio
+          count += 1
+          break
+
+    # Calculate the average improvement between graphs.
+    if count != 0:
+      avg = total_bitrate_difference_ratio / count
+
+    else:
+      avg = 0.0
+
+    return avg
+
+  # Be fair to both graphs by testing all the points in each.
+  if method == 'avg':
+    avg_improvement = 50 * (
+                       GraphBetter(metric_set1_sorted, metric_set2_sorted, 1) -
+                       GraphBetter(metric_set2_sorted, metric_set1_sorted, 0))
+  elif method == 'dsnr':
+      avg_improvement = bdsnr2(metric_set1_sorted, metric_set2_sorted)
+  else:
+      avg_improvement = bdrate2(metric_set2_sorted, metric_set1_sorted)
+
+  return avg_improvement
+
+
+def HandleFiles(variables):
+  """
+  This script creates html for displaying metric data produced from data
+  in a video stats file,  as created by the AOM project when enable_psnr
+  is turned on:
+
+  Usage: visual_metrics.py template.html pattern base_dir sub_dir [ sub_dir2 ..]
+
+  The script parses each metrics file [see below] that matches the
+  statfile_pattern  in the baseline directory and looks for the file that
+  matches that same file in each of the sub_dirs, and compares the resultant
+  metrics bitrate, avg psnr, glb psnr, and ssim. "
+
+  It provides a table in which each row is a file in the line directory,
+  and a column for each subdir, with the cells representing how that clip
+  compares to baseline for that subdir.   A graph is given for each which
+  compares filesize to that metric.  If you click on a point in the graph it
+  zooms in on that point.
+
+  a SAMPLE metrics file:
+
+  Bitrate  AVGPsnr  GLBPsnr  AVPsnrP  GLPsnrP  VPXSSIM    Time(us)
+   25.911   38.242   38.104   38.258   38.121   75.790    14103
+  Bitrate  AVGPsnr  GLBPsnr  AVPsnrP  GLPsnrP  VPXSSIM    Time(us)
+   49.982   41.264   41.129   41.255   41.122   83.993    19817
+  Bitrate  AVGPsnr  GLBPsnr  AVPsnrP  GLPsnrP  VPXSSIM    Time(us)
+   74.967   42.911   42.767   42.899   42.756   87.928    17332
+  Bitrate  AVGPsnr  GLBPsnr  AVPsnrP  GLPsnrP  VPXSSIM    Time(us)
+  100.012   43.983   43.838   43.881   43.738   89.695    25389
+  Bitrate  AVGPsnr  GLBPsnr  AVPsnrP  GLPsnrP  VPXSSIM    Time(us)
+  149.980   45.338   45.203   45.184   45.043   91.591    25438
+  Bitrate  AVGPsnr  GLBPsnr  AVPsnrP  GLPsnrP  VPXSSIM    Time(us)
+  199.852   46.225   46.123   46.113   45.999   92.679    28302
+  Bitrate  AVGPsnr  GLBPsnr  AVPsnrP  GLPsnrP  VPXSSIM    Time(us)
+  249.922   46.864   46.773   46.777   46.673   93.334    27244
+  Bitrate  AVGPsnr  GLBPsnr  AVPsnrP  GLPsnrP  VPXSSIM    Time(us)
+  299.998   47.366   47.281   47.317   47.220   93.844    27137
+  Bitrate  AVGPsnr  GLBPsnr  AVPsnrP  GLPsnrP  VPXSSIM    Time(us)
+  349.769   47.746   47.677   47.722   47.648   94.178    32226
+  Bitrate  AVGPsnr  GLBPsnr  AVPsnrP  GLPsnrP  VPXSSIM    Time(us)
+  399.773   48.032   47.971   48.013   47.946   94.362    36203
+
+  sample use:
+  visual_metrics.py template.html "*stt" aom aom_b aom_c > metrics.html
+  """
+
+  # The template file is the html file into which we will write the
+  # data from the stats file, formatted correctly for the gviz_api.
+  template_file = open(variables[1], "r")
+  page_template = template_file.read()
+  template_file.close()
+
+  # This is the path match pattern for finding stats files amongst
+  # all the other files it could be.  eg: *.stt
+  file_pattern = variables[2]
+
+  # This is the directory with files that we will use to do the comparison
+  # against.
+  baseline_dir = variables[3]
+  snrs = ''
+  filestable = {}
+
+  filestable['dsnr'] = ''
+  filestable['drate'] = ''
+  filestable['avg'] = ''
+
+  # Dirs is directories after the baseline to compare to the base.
+  dirs = variables[4:len(variables)]
+
+  # Find the metric files in the baseline directory.
+  dir_list = sorted(fnmatch.filter(os.listdir(baseline_dir), file_pattern))
+
+  metrics = GetMetrics(baseline_dir + "/" + dir_list[0])
+
+  metrics_js = 'metrics = ["' + '", "'.join(metrics) + '"];'
+
+  for column in range(1, len(metrics)):
+
+    for metric in ['avg','dsnr','drate']:
+      description = {"file": ("string", "File")}
+
+      # Go through each directory and add a column header to our description.
+      countoverall = {}
+      sumoverall = {}
+
+      for directory in dirs:
+        description[directory] = ("number", directory)
+        countoverall[directory] = 0
+        sumoverall[directory] = 0
+
+      # Data holds the data for the visualization, name given comes from
+      # gviz_api sample code.
+      data = []
+      for filename in dir_list:
+        row = {'file': splitext(basename(filename))[0] }
+        baseline_file_name = baseline_dir + "/" + filename
+
+        # Read the metric file from each of the directories in our list.
+        for directory in dirs:
+          metric_file_name = directory + "/" + filename
+
+          # If there is a metric file in the current directory, open it
+          # and calculate its overall difference between it and the baseline
+          # directory's metric file.
+          if os.path.isfile(metric_file_name):
+            overall = FileBetter(baseline_file_name, metric_file_name,
+                                 column, metric)
+            row[directory] = overall
+
+            sumoverall[directory] += overall
+            countoverall[directory] += 1
+
+        data.append(row)
+
+      # Add the overall numbers.
+      row = {"file": "OVERALL" }
+      for directory in dirs:
+        row[directory] = sumoverall[directory] / countoverall[directory]
+      data.append(row)
+
+      # write the tables out
+      data_table = gviz_api.DataTable(description)
+      data_table.LoadData(data)
+
+      filestable[metric] = ( filestable[metric] + "filestable_" + metric +
+                             "[" + str(column) + "]=" +
+                             data_table.ToJSon(columns_order=["file"]+dirs) + "\n" )
+
+    filestable_avg = filestable['avg']
+    filestable_dpsnr = filestable['dsnr']
+    filestable_drate = filestable['drate']
+
+    # Now we collect all the data for all the graphs.  First the column
+    # headers which will be Datarate and then each directory.
+    columns = ("datarate",baseline_dir)
+    description = {"datarate":("number", "Datarate")}
+    for directory in dirs:
+      description[directory] = ("number", directory)
+
+    description[baseline_dir] = ("number", baseline_dir)
+
+    snrs = snrs + "snrs[" + str(column) + "] = ["
+
+    # Now collect the data for the graphs, file by file.
+    for filename in dir_list:
+
+      data = []
+
+      # Collect the file in each directory and store all of its metrics
+      # in the associated gviz metrics table.
+      all_dirs = dirs + [baseline_dir]
+      for directory in all_dirs:
+
+        metric_file_name = directory + "/" + filename
+        if not os.path.isfile(metric_file_name):
+          continue
+
+        # Read and parse the metrics file storing it to the data we'll
+        # use for the gviz_api.Datatable.
+        metrics = ParseMetricFile(metric_file_name, column)
+        for bitrate, metric in metrics:
+          data.append({"datarate": bitrate, directory: metric})
+
+      data_table = gviz_api.DataTable(description)
+      data_table.LoadData(data)
+      snrs = snrs + "'" + data_table.ToJSon(
+         columns_order=tuple(["datarate",baseline_dir]+dirs)) + "',"
+
+    snrs = snrs + "]\n"
+
+    formatters = ""
+    for i in range(len(dirs)):
+      formatters = "%s   formatter.format(better, %d);" % (formatters, i+1)
+
+  print FillForm(page_template, vars())
+  return
+
+if len(sys.argv) < 3:
+  print HandleFiles.__doc__
+else:
+  HandleFiles(sys.argv)
diff --git a/third_party/aom/test/warp_filter_test.cc b/third_party/aom/test/warp_filter_test.cc
index 0353c074e..15f8a285c 100644
--- a/third_party/aom/test/warp_filter_test.cc
+++ b/third_party/aom/test/warp_filter_test.cc
@@ -8,39 +8,29 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/warp_filter_test_util.h"
-
-using std::tr1::tuple;
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
+using ::testing::tuple;
 using libaom_test::ACMRandom;
-using libaom_test::AV1WarpFilter::AV1WarpFilterTest;
-#if CONFIG_HIGHBITDEPTH
 using libaom_test::AV1HighbdWarpFilter::AV1HighbdWarpFilterTest;
-#endif
+using libaom_test::AV1WarpFilter::AV1WarpFilterTest;
 
 namespace {
-
+#if HAVE_SSE4_1
 TEST_P(AV1WarpFilterTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
+TEST_P(AV1WarpFilterTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(3)); }
 
 INSTANTIATE_TEST_CASE_P(
-    SSE2, AV1WarpFilterTest,
-    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse2));
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, AV1WarpFilterTest,
-    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_ssse3));
-#endif
+    SSE4_1, AV1WarpFilterTest,
+    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1));
 
-#if CONFIG_HIGHBITDEPTH && HAVE_SSSE3
-TEST_P(AV1HighbdWarpFilterTest, CheckOutput) {
-  RunCheckOutput(av1_highbd_warp_affine_ssse3);
-}
+TEST_P(AV1HighbdWarpFilterTest, CheckOutput) { RunCheckOutput(GET_PARAM(4)); }
+TEST_P(AV1HighbdWarpFilterTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(4)); }
 
-INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdWarpFilterTest,
-                        libaom_test::AV1HighbdWarpFilter::GetDefaultParams());
-#endif
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdWarpFilterTest,
+                        libaom_test::AV1HighbdWarpFilter::BuildParams(
+                            av1_highbd_warp_affine_sse4_1));
 
+#endif  // HAVE_SSE4_1
 }  // namespace
diff --git a/third_party/aom/test/warp_filter_test_util.cc b/third_party/aom/test/warp_filter_test_util.cc
index 47ce6c371..b341cd0c2 100644
--- a/third_party/aom/test/warp_filter_test_util.cc
+++ b/third_party/aom/test/warp_filter_test_util.cc
@@ -8,55 +8,38 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-
+#include "aom_ports/aom_timer.h"
 #include "test/warp_filter_test_util.h"
 
-using std::tr1::tuple;
-using std::tr1::make_tuple;
+using ::testing::make_tuple;
+using ::testing::tuple;
 
 namespace libaom_test {
 
-namespace AV1WarpFilter {
-
-::testing::internal::ParamGenerator<WarpTestParam> BuildParams(
-    warp_affine_func filter) {
-  const WarpTestParam params[] = {
-    make_tuple(4, 4, 100, filter),   make_tuple(8, 8, 100, filter),
-    make_tuple(64, 64, 100, filter), make_tuple(4, 16, 100, filter),
-    make_tuple(32, 8, 100, filter),
-  };
-  return ::testing::ValuesIn(params);
-}
-
-AV1WarpFilterTest::~AV1WarpFilterTest() {}
-void AV1WarpFilterTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-
-void AV1WarpFilterTest::TearDown() { libaom_test::ClearSystemState(); }
-
-int32_t AV1WarpFilterTest::random_param(int bits) {
+int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits) {
   // 1 in 8 chance of generating zero (arbitrarily chosen)
-  if (((rnd_.Rand8()) & 7) == 0) return 0;
+  if (((rnd->Rand8()) & 7) == 0) return 0;
   // Otherwise, enerate uniform values in the range
   // [-(1 << bits), 1] U [1, 1<<bits]
-  int32_t v = 1 + (rnd_.Rand16() & ((1 << bits) - 1));
-  if ((rnd_.Rand8()) & 1) return -v;
+  int32_t v = 1 + (rnd->Rand16() & ((1 << bits) - 1));
+  if ((rnd->Rand8()) & 1) return -v;
   return v;
 }
 
-void AV1WarpFilterTest::generate_model(int32_t *mat, int16_t *alpha,
-                                       int16_t *beta, int16_t *gamma,
-                                       int16_t *delta) {
+void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
+                           int16_t *alpha, int16_t *beta, int16_t *gamma,
+                           int16_t *delta) {
   while (1) {
-    mat[0] = random_param(WARPEDMODEL_PREC_BITS + 6);
-    mat[1] = random_param(WARPEDMODEL_PREC_BITS + 6);
-    mat[2] = (random_param(WARPEDMODEL_PREC_BITS - 3)) +
+    mat[0] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6);
+    mat[1] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6);
+    mat[2] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
              (1 << WARPEDMODEL_PREC_BITS);
-    mat[3] = random_param(WARPEDMODEL_PREC_BITS - 3);
+    mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
     // 50/50 chance of generating ROTZOOM vs. AFFINE models
-    if (rnd_.Rand8() & 1) {
+    if (rnd->Rand8() & 1) {
       // AFFINE
-      mat[4] = random_param(WARPEDMODEL_PREC_BITS - 3);
-      mat[5] = (random_param(WARPEDMODEL_PREC_BITS - 3)) +
+      mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
+      mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
                (1 << WARPEDMODEL_PREC_BITS);
     } else {
       mat[4] = -mat[3];
@@ -94,13 +77,29 @@ void AV1WarpFilterTest::generate_model(int32_t *mat, int16_t *alpha,
   }
 }
 
-void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
+namespace AV1WarpFilter {
+::testing::internal::ParamGenerator<WarpTestParam> BuildParams(
+    warp_affine_func filter) {
+  const WarpTestParam params[] = {
+    make_tuple(4, 4, 50000, filter),  make_tuple(8, 8, 50000, filter),
+    make_tuple(64, 64, 1000, filter), make_tuple(4, 16, 20000, filter),
+    make_tuple(32, 8, 10000, filter),
+  };
+  return ::testing::ValuesIn(params);
+}
+
+AV1WarpFilterTest::~AV1WarpFilterTest() {}
+void AV1WarpFilterTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+void AV1WarpFilterTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
   const int w = 128, h = 128;
   const int border = 16;
   const int stride = w + 2 * border;
   const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
-  const int num_iters = GET_PARAM(2);
-  int i, j, sub_x, sub_y;
+  int sub_x, sub_y;
+  const int bd = 8;
 
   uint8_t *input_ = new uint8_t[h * stride];
   uint8_t *input = input_ + border;
@@ -109,14 +108,66 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
   // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
   int output_n = ((out_w + 7) & ~7) * out_h;
   uint8_t *output = new uint8_t[output_n];
+  int32_t mat[8];
+  int16_t alpha, beta, gamma, delta;
+  ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+  CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
+
+  generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta);
+
+  for (int r = 0; r < h; ++r)
+    for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8();
+  for (int r = 0; r < h; ++r) {
+    memset(input + r * stride - border, input[r * stride], border);
+    memset(input + r * stride + w, input[r * stride + (w - 1)], border);
+  }
+
+  sub_x = 0;
+  sub_y = 0;
+  int do_average = 0;
+
+  conv_params = get_conv_params_no_round(0, do_average, 0, dsta, out_w, 1, bd);
+  conv_params.use_jnt_comp_avg = 0;
+
+  const int num_loops = 1000000000 / (out_w + out_h);
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < num_loops; ++i)
+    test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w,
+              sub_x, sub_y, &conv_params, alpha, beta, gamma, delta);
+
+  aom_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
+         1000.0 * elapsed_time / num_loops);
+
+  delete[] input_;
+  delete[] output;
+  delete[] dsta;
+}
+
+void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
+  const int w = 128, h = 128;
+  const int border = 16;
+  const int stride = w + 2 * border;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2);
+  int i, j, sub_x, sub_y;
+  const int bd = 8;
+
+  // The warp functions always write rows with widths that are multiples of 8.
+  // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+  int output_n = ((out_w + 7) & ~7) * out_h;
+  uint8_t *input_ = new uint8_t[h * stride];
+  uint8_t *input = input_ + border;
+  uint8_t *output = new uint8_t[output_n];
   uint8_t *output2 = new uint8_t[output_n];
   int32_t mat[8];
   int16_t alpha, beta, gamma, delta;
-  ConvolveParams conv_params = get_conv_params(0, 0, 0);
-#if CONFIG_CONVOLVE_ROUND
-  int32_t *dsta = new int32_t[output_n];
-  int32_t *dstb = new int32_t[output_n];
-#endif
+  ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+  CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
+  CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n];
+  for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand8();
 
   for (i = 0; i < num_iters; ++i) {
     // Generate an input block and extend its borders horizontally
@@ -126,81 +177,88 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
       memset(input + r * stride - border, input[r * stride], border);
       memset(input + r * stride + w, input[r * stride + (w - 1)], border);
     }
-#if CONFIG_CONVOLVE_ROUND
     const int use_no_round = rnd_.Rand8() & 1;
-#endif
     for (sub_x = 0; sub_x < 2; ++sub_x)
       for (sub_y = 0; sub_y < 2; ++sub_y) {
-        generate_model(mat, &alpha, &beta, &gamma, &delta);
-#if CONFIG_CONVOLVE_ROUND
-        if (use_no_round) {
-          // Prepare two copies of the destination
-          for (j = 0; j < out_w * out_h; ++j) {
-            int32_t v = rnd_.Rand16();
-            dsta[j] = v;
-            dstb[j] = v;
+        generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta);
+        for (int ii = 0; ii < 2; ++ii) {
+          for (int jj = 0; jj < 5; ++jj) {
+            for (int do_average = 0; do_average <= 1; ++do_average) {
+              if (use_no_round) {
+                conv_params = get_conv_params_no_round(0, do_average, 0, dsta,
+                                                       out_w, 1, bd);
+              } else {
+                conv_params = get_conv_params(0, 0, 0, bd);
+              }
+              if (jj >= 4) {
+                conv_params.use_jnt_comp_avg = 0;
+              } else {
+                conv_params.use_jnt_comp_avg = 1;
+                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+              }
+              av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w,
+                                out_h, out_w, sub_x, sub_y, &conv_params, alpha,
+                                beta, gamma, delta);
+              if (use_no_round) {
+                conv_params = get_conv_params_no_round(0, do_average, 0, dstb,
+                                                       out_w, 1, bd);
+              }
+              if (jj >= 4) {
+                conv_params.use_jnt_comp_avg = 0;
+              } else {
+                conv_params.use_jnt_comp_avg = 1;
+                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+              }
+              test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
+                        out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma,
+                        delta);
+              if (use_no_round) {
+                for (j = 0; j < out_w * out_h; ++j)
+                  ASSERT_EQ(dsta[j], dstb[j])
+                      << "Pixel mismatch at index " << j << " = ("
+                      << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+                      << i;
+                for (j = 0; j < out_w * out_h; ++j)
+                  ASSERT_EQ(output[j], output2[j])
+                      << "Pixel mismatch at index " << j << " = ("
+                      << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+                      << i;
+              } else {
+                for (j = 0; j < out_w * out_h; ++j)
+                  ASSERT_EQ(output[j], output2[j])
+                      << "Pixel mismatch at index " << j << " = ("
+                      << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+                      << i;
+              }
+            }
           }
-          conv_params = get_conv_params_no_round(0, 0, 0, dsta, out_w);
-        } else {
-          conv_params = get_conv_params(0, 0, 0);
-        }
-#endif
-        av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w,
-                          out_h, out_w, sub_x, sub_y, &conv_params, alpha, beta,
-                          gamma, delta);
-#if CONFIG_CONVOLVE_ROUND
-        if (use_no_round) {
-          conv_params = get_conv_params_no_round(0, 0, 0, dstb, out_w);
         }
-#endif
-        test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
-                  out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma, delta);
-
-#if CONFIG_CONVOLVE_ROUND
-        if (use_no_round) {
-          for (j = 0; j < out_w * out_h; ++j)
-            ASSERT_EQ(dsta[j], dstb[j])
-                << "Pixel mismatch at index " << j << " = (" << (j % out_w)
-                << ", " << (j / out_w) << ") on iteration " << i;
-        } else {
-          for (j = 0; j < out_w * out_h; ++j)
-            ASSERT_EQ(output[j], output2[j])
-                << "Pixel mismatch at index " << j << " = (" << (j % out_w)
-                << ", " << (j / out_w) << ") on iteration " << i;
-        }
-#else
-        for (j = 0; j < out_w * out_h; ++j)
-          ASSERT_EQ(output[j], output2[j])
-              << "Pixel mismatch at index " << j << " = (" << (j % out_w)
-              << ", " << (j / out_w) << ") on iteration " << i;
-#endif
       }
   }
   delete[] input_;
   delete[] output;
   delete[] output2;
-#if CONFIG_CONVOLVE_ROUND
   delete[] dsta;
   delete[] dstb;
-#endif
 }
 }  // namespace AV1WarpFilter
 
-#if CONFIG_HIGHBITDEPTH
 namespace AV1HighbdWarpFilter {
-
-::testing::internal::ParamGenerator<HighbdWarpTestParam> GetDefaultParams() {
-  const HighbdWarpTestParam defaultParams[] = {
-    make_tuple(4, 4, 100, 8),    make_tuple(8, 8, 100, 8),
-    make_tuple(64, 64, 100, 8),  make_tuple(4, 16, 100, 8),
-    make_tuple(32, 8, 100, 8),   make_tuple(4, 4, 100, 10),
-    make_tuple(8, 8, 100, 10),   make_tuple(64, 64, 100, 10),
-    make_tuple(4, 16, 100, 10),  make_tuple(32, 8, 100, 10),
-    make_tuple(4, 4, 100, 12),   make_tuple(8, 8, 100, 12),
-    make_tuple(64, 64, 100, 12), make_tuple(4, 16, 100, 12),
-    make_tuple(32, 8, 100, 12),
+::testing::internal::ParamGenerator<HighbdWarpTestParam> BuildParams(
+    highbd_warp_affine_func filter) {
+  const HighbdWarpTestParam params[] = {
+    make_tuple(4, 4, 100, 8, filter),    make_tuple(8, 8, 100, 8, filter),
+    make_tuple(64, 64, 100, 8, filter),  make_tuple(4, 16, 100, 8, filter),
+    make_tuple(32, 8, 100, 8, filter),   make_tuple(4, 4, 100, 10, filter),
+    make_tuple(8, 8, 100, 10, filter),   make_tuple(64, 64, 100, 10, filter),
+    make_tuple(4, 16, 100, 10, filter),  make_tuple(32, 8, 100, 10, filter),
+    make_tuple(4, 4, 100, 12, filter),   make_tuple(8, 8, 100, 12, filter),
+    make_tuple(64, 64, 100, 12, filter), make_tuple(4, 16, 100, 12, filter),
+    make_tuple(32, 8, 100, 12, filter),
   };
-  return ::testing::ValuesIn(defaultParams);
+  return ::testing::ValuesIn(params);
 }
 
 AV1HighbdWarpFilterTest::~AV1HighbdWarpFilterTest() {}
@@ -210,65 +268,59 @@ void AV1HighbdWarpFilterTest::SetUp() {
 
 void AV1HighbdWarpFilterTest::TearDown() { libaom_test::ClearSystemState(); }
 
-int32_t AV1HighbdWarpFilterTest::random_param(int bits) {
-  // 1 in 8 chance of generating zero (arbitrarily chosen)
-  if (((rnd_.Rand8()) & 7) == 0) return 0;
-  // Otherwise, enerate uniform values in the range
-  // [-(1 << bits), 1] U [1, 1<<bits]
-  int32_t v = 1 + (rnd_.Rand16() & ((1 << bits) - 1));
-  if ((rnd_.Rand8()) & 1) return -v;
-  return v;
-}
+void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
+  const int w = 128, h = 128;
+  const int border = 16;
+  const int stride = w + 2 * border;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int bd = GET_PARAM(3);
+  const int mask = (1 << bd) - 1;
+  int sub_x, sub_y;
 
-void AV1HighbdWarpFilterTest::generate_model(int32_t *mat, int16_t *alpha,
-                                             int16_t *beta, int16_t *gamma,
-                                             int16_t *delta) {
-  while (1) {
-    mat[0] = random_param(WARPEDMODEL_PREC_BITS + 6);
-    mat[1] = random_param(WARPEDMODEL_PREC_BITS + 6);
-    mat[2] = (random_param(WARPEDMODEL_PREC_BITS - 3)) +
-             (1 << WARPEDMODEL_PREC_BITS);
-    mat[3] = random_param(WARPEDMODEL_PREC_BITS - 3);
-    // 50/50 chance of generating ROTZOOM vs. AFFINE models
-    if (rnd_.Rand8() & 1) {
-      // AFFINE
-      mat[4] = random_param(WARPEDMODEL_PREC_BITS - 3);
-      mat[5] = (random_param(WARPEDMODEL_PREC_BITS - 3)) +
-               (1 << WARPEDMODEL_PREC_BITS);
-    } else {
-      mat[4] = -mat[3];
-      mat[5] = mat[2];
+  // The warp functions always write rows with widths that are multiples of 8.
+  // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+  int output_n = ((out_w + 7) & ~7) * out_h;
+  uint16_t *input_ = new uint16_t[h * stride];
+  uint16_t *input = input_ + border;
+  uint16_t *output = new uint16_t[output_n];
+  int32_t mat[8];
+  int16_t alpha, beta, gamma, delta;
+  ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+  CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
+
+  generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta);
+  // Generate an input block and extend its borders horizontally
+  for (int r = 0; r < h; ++r)
+    for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask;
+  for (int r = 0; r < h; ++r) {
+    for (int c = 0; c < border; ++c) {
+      input[r * stride - border + c] = input[r * stride];
+      input[r * stride + w + c] = input[r * stride + (w - 1)];
     }
+  }
 
-    // Calculate the derived parameters and check that they are suitable
-    // for the warp filter.
-    assert(mat[2] != 0);
+  sub_x = 0;
+  sub_y = 0;
+  int do_average = 0;
+  conv_params.use_jnt_comp_avg = 0;
+  conv_params = get_conv_params_no_round(0, do_average, 0, dsta, out_w, 1, bd);
 
-    *alpha = clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
-    *beta = clamp(mat[3], INT16_MIN, INT16_MAX);
-    *gamma = clamp(((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) / mat[2],
-                   INT16_MIN, INT16_MAX);
-    *delta =
-        clamp(mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) -
-                  (1 << WARPEDMODEL_PREC_BITS),
-              INT16_MIN, INT16_MAX);
+  const int num_loops = 1000000000 / (out_w + out_h);
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
 
-    if ((4 * abs(*alpha) + 7 * abs(*beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
-        (4 * abs(*gamma) + 4 * abs(*delta) >= (1 << WARPEDMODEL_PREC_BITS)))
-      continue;
+  for (int i = 0; i < num_loops; ++i)
+    test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w,
+              sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta);
 
-    *alpha = ROUND_POWER_OF_TWO_SIGNED(*alpha, WARP_PARAM_REDUCE_BITS) *
-             (1 << WARP_PARAM_REDUCE_BITS);
-    *beta = ROUND_POWER_OF_TWO_SIGNED(*beta, WARP_PARAM_REDUCE_BITS) *
-            (1 << WARP_PARAM_REDUCE_BITS);
-    *gamma = ROUND_POWER_OF_TWO_SIGNED(*gamma, WARP_PARAM_REDUCE_BITS) *
-             (1 << WARP_PARAM_REDUCE_BITS);
-    *delta = ROUND_POWER_OF_TWO_SIGNED(*delta, WARP_PARAM_REDUCE_BITS) *
-             (1 << WARP_PARAM_REDUCE_BITS);
+  aom_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("highbd warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
+         1000.0 * elapsed_time / num_loops);
 
-    // We have a valid model, so finish
-    return;
-  }
+  delete[] input_;
+  delete[] output;
+  delete[] dsta;
 }
 
 void AV1HighbdWarpFilterTest::RunCheckOutput(
@@ -291,11 +343,10 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
   uint16_t *output2 = new uint16_t[output_n];
   int32_t mat[8];
   int16_t alpha, beta, gamma, delta;
-  ConvolveParams conv_params = get_conv_params(0, 0, 0);
-#if CONFIG_CONVOLVE_ROUND
-  int32_t *dsta = new int32_t[output_n];
-  int32_t *dstb = new int32_t[output_n];
-#endif
+  ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+  CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
+  CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n];
+  for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand16();
 
   for (i = 0; i < num_iters; ++i) {
     // Generate an input block and extend its borders horizontally
@@ -307,68 +358,76 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
         input[r * stride + w + c] = input[r * stride + (w - 1)];
       }
     }
-#if CONFIG_CONVOLVE_ROUND
     const int use_no_round = rnd_.Rand8() & 1;
-#endif
     for (sub_x = 0; sub_x < 2; ++sub_x)
       for (sub_y = 0; sub_y < 2; ++sub_y) {
-        generate_model(mat, &alpha, &beta, &gamma, &delta);
-#if CONFIG_CONVOLVE_ROUND
-        if (use_no_round) {
-          // Prepare two copies of the destination
-          for (j = 0; j < out_w * out_h; ++j) {
-            int32_t v = rnd_.Rand16();
-            dsta[j] = v;
-            dstb[j] = v;
+        generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta);
+        for (int ii = 0; ii < 2; ++ii) {
+          for (int jj = 0; jj < 5; ++jj) {
+            for (int do_average = 0; do_average <= 1; ++do_average) {
+              if (use_no_round) {
+                conv_params = get_conv_params_no_round(0, do_average, 0, dsta,
+                                                       out_w, 1, bd);
+              } else {
+                conv_params = get_conv_params(0, 0, 0, bd);
+              }
+              if (jj >= 4) {
+                conv_params.use_jnt_comp_avg = 0;
+              } else {
+                conv_params.use_jnt_comp_avg = 1;
+                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+              }
+
+              av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32,
+                                       out_w, out_h, out_w, sub_x, sub_y, bd,
+                                       &conv_params, alpha, beta, gamma, delta);
+              if (use_no_round) {
+                // TODO(angiebird): Change this to test_impl once we have SIMD
+                // implementation
+                conv_params = get_conv_params_no_round(0, do_average, 0, dstb,
+                                                       out_w, 1, bd);
+              }
+              if (jj >= 4) {
+                conv_params.use_jnt_comp_avg = 0;
+              } else {
+                conv_params.use_jnt_comp_avg = 1;
+                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+              }
+              test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
+                        out_w, sub_x, sub_y, bd, &conv_params, alpha, beta,
+                        gamma, delta);
+
+              if (use_no_round) {
+                for (j = 0; j < out_w * out_h; ++j)
+                  ASSERT_EQ(dsta[j], dstb[j])
+                      << "Pixel mismatch at index " << j << " = ("
+                      << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+                      << i;
+                for (j = 0; j < out_w * out_h; ++j)
+                  ASSERT_EQ(output[j], output2[j])
+                      << "Pixel mismatch at index " << j << " = ("
+                      << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+                      << i;
+              } else {
+                for (j = 0; j < out_w * out_h; ++j)
+                  ASSERT_EQ(output[j], output2[j])
+                      << "Pixel mismatch at index " << j << " = ("
+                      << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+                      << i;
+              }
+            }
           }
-          conv_params = get_conv_params_no_round(0, 0, 0, dsta, out_w);
-        } else {
-          conv_params = get_conv_params(0, 0, 0);
-        }
-#endif
-        av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32,
-                                 out_w, out_h, out_w, sub_x, sub_y, bd,
-                                 &conv_params, alpha, beta, gamma, delta);
-#if CONFIG_CONVOLVE_ROUND
-        if (use_no_round) {
-          // TODO(angiebird): Change this to test_impl once we have SIMD
-          // implementation
-          conv_params = get_conv_params_no_round(0, 0, 0, dstb, out_w);
-        }
-#endif
-        test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
-                  out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, gamma,
-                  delta);
-
-#if CONFIG_CONVOLVE_ROUND
-        if (use_no_round) {
-          for (j = 0; j < out_w * out_h; ++j)
-            ASSERT_EQ(dsta[j], dstb[j])
-                << "Pixel mismatch at index " << j << " = (" << (j % out_w)
-                << ", " << (j / out_w) << ") on iteration " << i;
-        } else {
-          for (j = 0; j < out_w * out_h; ++j)
-            ASSERT_EQ(output[j], output2[j])
-                << "Pixel mismatch at index " << j << " = (" << (j % out_w)
-                << ", " << (j / out_w) << ") on iteration " << i;
         }
-#else
-        for (j = 0; j < out_w * out_h; ++j)
-          ASSERT_EQ(output[j], output2[j])
-              << "Pixel mismatch at index " << j << " = (" << (j % out_w)
-              << ", " << (j / out_w) << ") on iteration " << i;
-#endif
       }
   }
 
   delete[] input_;
   delete[] output;
   delete[] output2;
-#if CONFIG_CONVOLVE_ROUND
   delete[] dsta;
   delete[] dstb;
-#endif
 }
 }  // namespace AV1HighbdWarpFilter
-#endif  // CONFIG_HIGHBITDEPTH
 }  // namespace libaom_test
diff --git a/third_party/aom/test/warp_filter_test_util.h b/third_party/aom/test/warp_filter_test_util.h
index 4828f3785..cf72d9db6 100644
--- a/third_party/aom/test/warp_filter_test_util.h
+++ b/third_party/aom/test/warp_filter_test_util.h
@@ -12,18 +12,24 @@
 #ifndef TEST_WARP_FILTER_TEST_UTIL_H_
 #define TEST_WARP_FILTER_TEST_UTIL_H_
 
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "./av1_rtcd.h"
-#include "./aom_dsp_rtcd.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 
 #include "av1/common/mv.h"
+#include "av1/common/common_data.h"
 
 namespace libaom_test {
 
+void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
+                           int16_t *alpha, int16_t *beta, int16_t *gamma,
+                           int16_t *delta);
+
 namespace AV1WarpFilter {
 
 typedef void (*warp_affine_func)(const int32_t *mat, const uint8_t *ref,
@@ -34,7 +40,7 @@ typedef void (*warp_affine_func)(const int32_t *mat, const uint8_t *ref,
                                  ConvolveParams *conv_params, int16_t alpha,
                                  int16_t beta, int16_t gamma, int16_t delta);
 
-typedef std::tr1::tuple<int, int, int, warp_affine_func> WarpTestParam;
+typedef ::testing::tuple<int, int, int, warp_affine_func> WarpTestParam;
 
 ::testing::internal::ParamGenerator<WarpTestParam> BuildParams(
     warp_affine_func filter);
@@ -47,18 +53,14 @@ class AV1WarpFilterTest : public ::testing::TestWithParam<WarpTestParam> {
   virtual void TearDown();
 
  protected:
-  int32_t random_param(int bits);
-  void generate_model(int32_t *mat, int16_t *alpha, int16_t *beta,
-                      int16_t *gamma, int16_t *delta);
-
   void RunCheckOutput(warp_affine_func test_impl);
+  void RunSpeedTest(warp_affine_func test_impl);
 
   libaom_test::ACMRandom rnd_;
 };
 
 }  // namespace AV1WarpFilter
 
-#if CONFIG_HIGHBITDEPTH
 namespace AV1HighbdWarpFilter {
 typedef void (*highbd_warp_affine_func)(const int32_t *mat, const uint16_t *ref,
                                         int width, int height, int stride,
@@ -69,9 +71,11 @@ typedef void (*highbd_warp_affine_func)(const int32_t *mat, const uint16_t *ref,
                                         int16_t alpha, int16_t beta,
                                         int16_t gamma, int16_t delta);
 
-typedef std::tr1::tuple<int, int, int, int> HighbdWarpTestParam;
+typedef ::testing::tuple<int, int, int, int, highbd_warp_affine_func>
+    HighbdWarpTestParam;
 
-::testing::internal::ParamGenerator<HighbdWarpTestParam> GetDefaultParams();
+::testing::internal::ParamGenerator<HighbdWarpTestParam> BuildParams(
+    highbd_warp_affine_func filter);
 
 class AV1HighbdWarpFilterTest
     : public ::testing::TestWithParam<HighbdWarpTestParam> {
@@ -82,17 +86,13 @@ class AV1HighbdWarpFilterTest
   virtual void TearDown();
 
  protected:
-  int32_t random_param(int bits);
-  void generate_model(int32_t *mat, int16_t *alpha, int16_t *beta,
-                      int16_t *gamma, int16_t *delta);
-
   void RunCheckOutput(highbd_warp_affine_func test_impl);
+  void RunSpeedTest(highbd_warp_affine_func test_impl);
 
   libaom_test::ACMRandom rnd_;
 };
 
 }  // namespace AV1HighbdWarpFilter
-#endif  // CONFIG_HIGHBITDEPTH
 
 }  // namespace libaom_test
 
diff --git a/third_party/aom/test/webm_video_source.h b/third_party/aom/test/webm_video_source.h
index b6c998042..482f5dea2 100644
--- a/third_party/aom/test/webm_video_source.h
+++ b/third_party/aom/test/webm_video_source.h
@@ -15,8 +15,8 @@
 #include <cstdlib>
 #include <new>
 #include <string>
-#include "../tools_common.h"
-#include "../webmdec.h"
+#include "common/tools_common.h"
+#include "common/webmdec.h"
 #include "test/video_source.h"
 
 namespace libaom_test {
@@ -27,8 +27,8 @@ class WebMVideoSource : public CompressedVideoSource {
  public:
   explicit WebMVideoSource(const std::string &file_name)
       : file_name_(file_name), aom_ctx_(new AvxInputContext()),
-        webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_(0),
-        end_of_file_(false) {}
+        webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_sz_(0),
+        frame_number_(0), end_of_file_(false) {}
 
   virtual ~WebMVideoSource() {
     if (aom_ctx_->file != NULL) fclose(aom_ctx_->file);
@@ -50,13 +50,13 @@ class WebMVideoSource : public CompressedVideoSource {
   }
 
   virtual void Next() {
-    ++frame_;
+    ++frame_number_;
     FillFrame();
   }
 
   void FillFrame() {
     ASSERT_TRUE(aom_ctx_->file != NULL);
-    const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
+    const int status = webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
     ASSERT_GE(status, 0) << "webm_read_frame failed";
     if (status == 1) {
       end_of_file_ = true;
@@ -66,9 +66,10 @@ class WebMVideoSource : public CompressedVideoSource {
   void SeekToNextKeyFrame() {
     ASSERT_TRUE(aom_ctx_->file != NULL);
     do {
-      const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
+      const int status =
+          webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
       ASSERT_GE(status, 0) << "webm_read_frame failed";
-      ++frame_;
+      ++frame_number_;
       if (status == 1) {
         end_of_file_ = true;
       }
@@ -76,16 +77,17 @@ class WebMVideoSource : public CompressedVideoSource {
   }
 
   virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; }
-  virtual size_t frame_size() const { return buf_sz_; }
-  virtual unsigned int frame_number() const { return frame_; }
+  virtual size_t frame_size() const { return frame_sz_; }
+  virtual unsigned int frame_number() const { return frame_number_; }
 
  protected:
   std::string file_name_;
   AvxInputContext *aom_ctx_;
   WebmInputContext *webm_ctx_;
-  uint8_t *buf_;
+  uint8_t *buf_;  // Owned by webm_ctx_ and freed when webm_ctx_ is freed.
   size_t buf_sz_;
-  unsigned int frame_;
+  size_t frame_sz_;
+  unsigned int frame_number_;
   bool end_of_file_;
 };
 
diff --git a/third_party/aom/test/y4m_test.cc b/third_party/aom/test/y4m_test.cc
index fc9fff514..b8011935d 100644
--- a/third_party/aom/test/y4m_test.cc
+++ b/third_party/aom/test/y4m_test.cc
@@ -7,17 +7,17 @@
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-*/
+ */
 
 #include <string>
 
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "config/aom_config.h"
 
-#include "./aom_config.h"
-#include "./y4menc.h"
+#include "common/y4menc.h"
 #include "test/md5_helper.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 namespace {
 
diff --git a/third_party/aom/test/y4m_video_source.h b/third_party/aom/test/y4m_video_source.h
index f70c30be6..277ded9eb 100644
--- a/third_party/aom/test/y4m_video_source.h
+++ b/third_party/aom/test/y4m_video_source.h
@@ -13,8 +13,8 @@
 #include <algorithm>
 #include <string>
 
+#include "common/y4minput.h"
 #include "test/video_source.h"
-#include "./y4minput.h"
 
 namespace libaom_test {
 
diff --git a/third_party/aom/test/yuv_video_source.h b/third_party/aom/test/yuv_video_source.h
index 88cabd5bb..51554af6f 100644
--- a/third_party/aom/test/yuv_video_source.h
+++ b/third_party/aom/test/yuv_video_source.h
@@ -86,11 +86,9 @@ class YUVVideoSource : public VideoSource {
       switch (format) {
         case AOM_IMG_FMT_I420: raw_size_ = width * height * 3 / 2; break;
         case AOM_IMG_FMT_I422: raw_size_ = width * height * 2; break;
-        case AOM_IMG_FMT_I440: raw_size_ = width * height * 2; break;
         case AOM_IMG_FMT_I444: raw_size_ = width * height * 3; break;
         case AOM_IMG_FMT_I42016: raw_size_ = width * height * 3; break;
         case AOM_IMG_FMT_I42216: raw_size_ = width * height * 4; break;
-        case AOM_IMG_FMT_I44016: raw_size_ = width * height * 4; break;
         case AOM_IMG_FMT_I44416: raw_size_ = width * height * 6; break;
         default: ASSERT_TRUE(0);
       }
diff --git a/third_party/aom/third_party/libwebm/Android.mk b/third_party/aom/third_party/libwebm/Android.mk
index 8149a083f..b46ba101d 100644
--- a/third_party/aom/third_party/libwebm/Android.mk
+++ b/third_party/aom/third_party/libwebm/Android.mk
@@ -3,7 +3,7 @@ LOCAL_PATH:= $(call my-dir)
 include $(CLEAR_VARS)
 LOCAL_MODULE:= libwebm
 LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS
-LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -Wno-extern-c-compat
+LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11
 LOCAL_C_INCLUDES:= $(LOCAL_PATH)
 LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH)
 
diff --git a/third_party/aom/third_party/libwebm/README.libaom b/third_party/aom/third_party/libwebm/README.libaom
index df543c44c..bd288d201 100644
--- a/third_party/aom/third_party/libwebm/README.libaom
+++ b/third_party/aom/third_party/libwebm/README.libaom
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 7baf4cb898f5e39fcdca2d4583fd2b14f488c725
+Version: af81f26025b7435fa9a14ad07c58b44cf9280430
 License: BSD
 License File: LICENSE.txt
 
@@ -7,4 +7,16 @@ Description:
 libwebm is used to handle WebM container I/O.
 
 Local Changes:
-* Write out AV1 tracks as WebM.
+Add av1 codec as an eligible codec for webm:
+ https://aomedia-review.googlesource.com/c/aom/+/15103
+Only keep:
+ - Android.mk
+ - AUTHORS.TXT
+ - common/
+    file_util.cc/h
+    hdr_util.cc/h
+    webmids.h
+ - LICENSE.TXT
+ - mkvmuxer/
+ - mkvparser/
+ - PATENTS.TXT
diff --git a/third_party/aom/third_party/libwebm/common/hdr_util.cc b/third_party/aom/third_party/libwebm/common/hdr_util.cc
index e1618ce75..916f7170b 100644
--- a/third_party/aom/third_party/libwebm/common/hdr_util.cc
+++ b/third_party/aom/third_party/libwebm/common/hdr_util.cc
@@ -36,10 +36,10 @@ bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
   if (MasteringMetadataValuePresent(parser_mm.luminance_min))
     muxer_mm->set_luminance_min(parser_mm.luminance_min);
 
-  PrimaryChromaticityPtr r_ptr(NULL);
-  PrimaryChromaticityPtr g_ptr(NULL);
-  PrimaryChromaticityPtr b_ptr(NULL);
-  PrimaryChromaticityPtr wp_ptr(NULL);
+  PrimaryChromaticityPtr r_ptr(nullptr);
+  PrimaryChromaticityPtr g_ptr(nullptr);
+  PrimaryChromaticityPtr b_ptr(nullptr);
+  PrimaryChromaticityPtr wp_ptr(nullptr);
 
   if (parser_mm.r) {
     if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr))
diff --git a/third_party/aom/third_party/libwebm/common/hdr_util.h b/third_party/aom/third_party/libwebm/common/hdr_util.h
index 3ef5388fd..78e2eeb70 100644
--- a/third_party/aom/third_party/libwebm/common/hdr_util.h
+++ b/third_party/aom/third_party/libwebm/common/hdr_util.h
@@ -47,15 +47,7 @@ struct Vp9CodecFeatures {
   int chroma_subsampling;
 };
 
-// disable deprecation warnings for auto_ptr
-#if defined(__GNUC__) && __GNUC__ >= 5
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
-#if defined(__GNUC__) && __GNUC__ >= 5
-#pragma GCC diagnostic pop
-#endif
+typedef std::unique_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
 
 bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
                              PrimaryChromaticityPtr* muxer_pc);
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
index 27e831023..bae2c99b8 100644
--- a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -26,11 +26,6 @@
 #include "mkvmuxer/mkvwriter.h"
 #include "mkvparser/mkvparser.h"
 
-// disable deprecation warnings for auto_ptr
-#if defined(__GNUC__) && __GNUC__ >= 5
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
 namespace mkvmuxer {
 
 const float PrimaryChromaticity::kChromaticityMin = 0.0f;
@@ -74,7 +69,7 @@ bool StrCpy(const char* src, char** dst_ptr) {
   return true;
 }
 
-typedef std::auto_ptr<PrimaryChromaticity> PrimaryChromaticityPtr;
+typedef std::unique_ptr<PrimaryChromaticity> PrimaryChromaticityPtr;
 bool CopyChromaticity(const PrimaryChromaticity* src,
                       PrimaryChromaticityPtr* dst) {
   if (!dst)
@@ -1059,22 +1054,22 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const {
 bool MasteringMetadata::SetChromaticity(
     const PrimaryChromaticity* r, const PrimaryChromaticity* g,
     const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) {
-  PrimaryChromaticityPtr r_ptr(NULL);
+  PrimaryChromaticityPtr r_ptr(nullptr);
   if (r) {
     if (!CopyChromaticity(r, &r_ptr))
       return false;
   }
-  PrimaryChromaticityPtr g_ptr(NULL);
+  PrimaryChromaticityPtr g_ptr(nullptr);
   if (g) {
     if (!CopyChromaticity(g, &g_ptr))
       return false;
   }
-  PrimaryChromaticityPtr b_ptr(NULL);
+  PrimaryChromaticityPtr b_ptr(nullptr);
   if (b) {
     if (!CopyChromaticity(b, &b_ptr))
       return false;
   }
-  PrimaryChromaticityPtr wp_ptr(NULL);
+  PrimaryChromaticityPtr wp_ptr(nullptr);
   if (white_point) {
     if (!CopyChromaticity(white_point, &wp_ptr))
       return false;
@@ -1240,7 +1235,7 @@ bool Colour::Write(IMkvWriter* writer) const {
 }
 
 bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) {
-  std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
   if (!mm_ptr.get())
     return false;
 
@@ -1548,7 +1543,7 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
 }
 
 bool VideoTrack::SetColour(const Colour& colour) {
-  std::auto_ptr<Colour> colour_ptr(new Colour());
+  std::unique_ptr<Colour> colour_ptr(new Colour());
   if (!colour_ptr.get())
     return false;
 
@@ -1576,7 +1571,7 @@ bool VideoTrack::SetColour(const Colour& colour) {
 }
 
 bool VideoTrack::SetProjection(const Projection& projection) {
-  std::auto_ptr<Projection> projection_ptr(new Projection());
+  std::unique_ptr<Projection> projection_ptr(new Projection());
   if (!projection_ptr.get())
     return false;
 
diff --git a/third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc
index 1eeaa1365..e7b76f7da 100644
--- a/third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc
+++ b/third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc
@@ -22,12 +22,8 @@
 
 #include "common/webmids.h"
 
-// disable deprecation warnings for auto_ptr
-#if defined(__GNUC__) && __GNUC__ >= 5
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
 namespace mkvparser {
+const long long kStringElementSizeLimit = 20 * 1000 * 1000;
 const float MasteringMetadata::kValueNotPresent = FLT_MAX;
 const long long Colour::kValueNotPresent = LLONG_MAX;
 const float Projection::kValueNotPresent = FLT_MAX;
@@ -330,7 +326,7 @@ long UnserializeString(IMkvReader* pReader, long long pos, long long size,
   delete[] str;
   str = NULL;
 
-  if (size >= LONG_MAX || size < 0)
+  if (size >= LONG_MAX || size < 0 || size > kStringElementSizeLimit)
     return E_FILE_FORMAT_INVALID;
 
   // +1 for '\0' terminator
@@ -5015,7 +5011,7 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
   if (!reader || *mm)
     return false;
 
-  std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
   if (!mm_ptr.get())
     return false;
 
@@ -5104,7 +5100,7 @@ bool Colour::Parse(IMkvReader* reader, long long colour_start,
   if (!reader || *colour)
     return false;
 
-  std::auto_ptr<Colour> colour_ptr(new Colour());
+  std::unique_ptr<Colour> colour_ptr(new Colour());
   if (!colour_ptr.get())
     return false;
 
@@ -5202,7 +5198,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
   if (!reader || *projection)
     return false;
 
-  std::auto_ptr<Projection> projection_ptr(new Projection());
+  std::unique_ptr<Projection> projection_ptr(new Projection());
   if (!projection_ptr.get())
     return false;
 
@@ -7976,6 +7972,11 @@ long long Block::GetTimeCode(const Cluster* pCluster) const {
   const long long tc0 = pCluster->GetTimeCode();
   assert(tc0 >= 0);
 
+  // Check if tc0 + m_timecode would overflow.
+  if (tc0 < 0 || LLONG_MAX - tc0 < m_timecode) {
+    return -1;
+  }
+
   const long long tc = tc0 + m_timecode;
 
   return tc;  // unscaled timecode units
diff --git a/third_party/aom/third_party/vector/vector.c b/third_party/aom/third_party/vector/vector.c
index 2f0a38e86..fe46246a1 100644
--- a/third_party/aom/third_party/vector/vector.c
+++ b/third_party/aom/third_party/vector/vector.c
@@ -28,7 +28,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "third_party/vector/vector.h"
 
-int vector_setup(Vector *vector, size_t capacity, size_t element_size) {
+int aom_vector_setup(Vector *vector, size_t capacity, size_t element_size) {
   assert(vector != NULL);
 
   if (vector == NULL) return VECTOR_ERROR;
@@ -41,16 +41,16 @@ int vector_setup(Vector *vector, size_t capacity, size_t element_size) {
   return vector->data == NULL ? VECTOR_ERROR : VECTOR_SUCCESS;
 }
 
-int vector_copy(Vector *destination, Vector *source) {
+int aom_vector_copy(Vector *destination, Vector *source) {
   assert(destination != NULL);
   assert(source != NULL);
-  assert(vector_is_initialized(source));
-  assert(!vector_is_initialized(destination));
+  assert(aom_vector_is_initialized(source));
+  assert(!aom_vector_is_initialized(destination));
 
   if (destination == NULL) return VECTOR_ERROR;
   if (source == NULL) return VECTOR_ERROR;
-  if (vector_is_initialized(destination)) return VECTOR_ERROR;
-  if (!vector_is_initialized(source)) return VECTOR_ERROR;
+  if (aom_vector_is_initialized(destination)) return VECTOR_ERROR;
+  if (!aom_vector_is_initialized(source)) return VECTOR_ERROR;
 
   /* Copy ALL the data */
   destination->size = source->size;
@@ -61,28 +61,28 @@ int vector_copy(Vector *destination, Vector *source) {
   destination->data = malloc(destination->capacity * source->element_size);
   if (destination->data == NULL) return VECTOR_ERROR;
 
-  memcpy(destination->data, source->data, vector_byte_size(source));
+  memcpy(destination->data, source->data, aom_vector_byte_size(source));
 
   return VECTOR_SUCCESS;
 }
 
-int vector_copy_assign(Vector *destination, Vector *source) {
+int aom_vector_copy_assign(Vector *destination, Vector *source) {
   assert(destination != NULL);
   assert(source != NULL);
-  assert(vector_is_initialized(source));
-  assert(vector_is_initialized(destination));
+  assert(aom_vector_is_initialized(source));
+  assert(aom_vector_is_initialized(destination));
 
   if (destination == NULL) return VECTOR_ERROR;
   if (source == NULL) return VECTOR_ERROR;
-  if (!vector_is_initialized(destination)) return VECTOR_ERROR;
-  if (!vector_is_initialized(source)) return VECTOR_ERROR;
+  if (!aom_vector_is_initialized(destination)) return VECTOR_ERROR;
+  if (!aom_vector_is_initialized(source)) return VECTOR_ERROR;
 
-  vector_destroy(destination);
+  aom_vector_destroy(destination);
 
-  return vector_copy(destination, source);
+  return aom_vector_copy(destination, source);
 }
 
-int vector_move(Vector *destination, Vector *source) {
+int aom_vector_move(Vector *destination, Vector *source) {
   assert(destination != NULL);
   assert(source != NULL);
 
@@ -95,23 +95,23 @@ int vector_move(Vector *destination, Vector *source) {
   return VECTOR_SUCCESS;
 }
 
-int vector_move_assign(Vector *destination, Vector *source) {
-  vector_swap(destination, source);
-  return vector_destroy(source);
+int aom_vector_move_assign(Vector *destination, Vector *source) {
+  aom_vector_swap(destination, source);
+  return aom_vector_destroy(source);
 }
 
-int vector_swap(Vector *destination, Vector *source) {
+int aom_vector_swap(Vector *destination, Vector *source) {
   void *temp;
 
   assert(destination != NULL);
   assert(source != NULL);
-  assert(vector_is_initialized(source));
-  assert(vector_is_initialized(destination));
+  assert(aom_vector_is_initialized(source));
+  assert(aom_vector_is_initialized(destination));
 
   if (destination == NULL) return VECTOR_ERROR;
   if (source == NULL) return VECTOR_ERROR;
-  if (!vector_is_initialized(destination)) return VECTOR_ERROR;
-  if (!vector_is_initialized(source)) return VECTOR_ERROR;
+  if (!aom_vector_is_initialized(destination)) return VECTOR_ERROR;
+  if (!aom_vector_is_initialized(source)) return VECTOR_ERROR;
 
   _vector_swap(&destination->size, &source->size);
   _vector_swap(&destination->capacity, &source->capacity);
@@ -124,7 +124,7 @@ int vector_swap(Vector *destination, Vector *source) {
   return VECTOR_SUCCESS;
 }
 
-int vector_destroy(Vector *vector) {
+int aom_vector_destroy(Vector *vector) {
   assert(vector != NULL);
 
   if (vector == NULL) return VECTOR_ERROR;
@@ -136,7 +136,7 @@ int vector_destroy(Vector *vector) {
 }
 
 /* Insertion */
-int vector_push_back(Vector *vector, void *element) {
+int aom_vector_push_back(Vector *vector, void *element) {
   assert(vector != NULL);
   assert(element != NULL);
 
@@ -153,11 +153,11 @@ int vector_push_back(Vector *vector, void *element) {
   return VECTOR_SUCCESS;
 }
 
-int vector_push_front(Vector *vector, void *element) {
-  return vector_insert(vector, 0, element);
+int aom_vector_push_front(Vector *vector, void *element) {
+  return aom_vector_insert(vector, 0, element);
 }
 
-int vector_insert(Vector *vector, size_t index, void *element) {
+int aom_vector_insert(Vector *vector, size_t index, void *element) {
   void *offset;
 
   assert(vector != NULL);
@@ -188,7 +188,7 @@ int vector_insert(Vector *vector, size_t index, void *element) {
   return VECTOR_SUCCESS;
 }
 
-int vector_assign(Vector *vector, size_t index, void *element) {
+int aom_vector_assign(Vector *vector, size_t index, void *element) {
   assert(vector != NULL);
   assert(element != NULL);
   assert(index < vector->size);
@@ -204,7 +204,7 @@ int vector_assign(Vector *vector, size_t index, void *element) {
 }
 
 /* Deletion */
-int vector_pop_back(Vector *vector) {
+int aom_vector_pop_back(Vector *vector) {
   assert(vector != NULL);
   assert(vector->size > 0);
 
@@ -222,9 +222,9 @@ int vector_pop_back(Vector *vector) {
   return VECTOR_SUCCESS;
 }
 
-int vector_pop_front(Vector *vector) { return vector_erase(vector, 0); }
+int aom_vector_pop_front(Vector *vector) { return aom_vector_erase(vector, 0); }
 
-int vector_erase(Vector *vector, size_t index) {
+int aom_vector_erase(Vector *vector, size_t index) {
   assert(vector != NULL);
   assert(index < vector->size);
 
@@ -244,10 +244,10 @@ int vector_erase(Vector *vector, size_t index) {
   return VECTOR_SUCCESS;
 }
 
-int vector_clear(Vector *vector) { return vector_resize(vector, 0); }
+int aom_vector_clear(Vector *vector) { return aom_vector_resize(vector, 0); }
 
 /* Lookup */
-void *vector_get(Vector *vector, size_t index) {
+void *aom_vector_get(Vector *vector, size_t index) {
   assert(vector != NULL);
   assert(index < vector->size);
 
@@ -258,7 +258,7 @@ void *vector_get(Vector *vector, size_t index) {
   return _vector_offset(vector, index);
 }
 
-const void *vector_const_get(const Vector *vector, size_t index) {
+const void *aom_vector_const_get(const Vector *vector, size_t index) {
   assert(vector != NULL);
   assert(index < vector->size);
 
@@ -269,30 +269,30 @@ const void *vector_const_get(const Vector *vector, size_t index) {
   return _vector_const_offset(vector, index);
 }
 
-void *vector_front(Vector *vector) { return vector_get(vector, 0); }
+void *aom_vector_front(Vector *vector) { return aom_vector_get(vector, 0); }
 
-void *vector_back(Vector *vector) {
-  return vector_get(vector, vector->size - 1);
+void *aom_vector_back(Vector *vector) {
+  return aom_vector_get(vector, vector->size - 1);
 }
 
 /* Information */
 
-bool vector_is_initialized(const Vector *vector) {
+bool aom_vector_is_initialized(const Vector *vector) {
   return vector->data != NULL;
 }
 
-size_t vector_byte_size(const Vector *vector) {
+size_t aom_vector_byte_size(const Vector *vector) {
   return vector->size * vector->element_size;
 }
 
-size_t vector_free_space(const Vector *vector) {
+size_t aom_vector_free_space(const Vector *vector) {
   return vector->capacity - vector->size;
 }
 
-bool vector_is_empty(const Vector *vector) { return vector->size == 0; }
+bool aom_vector_is_empty(const Vector *vector) { return vector->size == 0; }
 
 /* Memory management */
-int vector_resize(Vector *vector, size_t new_size) {
+int aom_vector_resize(Vector *vector, size_t new_size) {
   if (new_size <= vector->capacity * VECTOR_SHRINK_THRESHOLD) {
     vector->size = new_size;
     if (_vector_reallocate(vector, new_size * VECTOR_GROWTH_FACTOR) == -1) {
@@ -309,7 +309,7 @@ int vector_resize(Vector *vector, size_t new_size) {
   return VECTOR_SUCCESS;
 }
 
-int vector_reserve(Vector *vector, size_t minimum_capacity) {
+int aom_vector_reserve(Vector *vector, size_t minimum_capacity) {
   if (minimum_capacity > vector->capacity) {
     if (_vector_reallocate(vector, minimum_capacity) == VECTOR_ERROR) {
       return VECTOR_ERROR;
@@ -319,18 +319,18 @@ int vector_reserve(Vector *vector, size_t minimum_capacity) {
   return VECTOR_SUCCESS;
 }
 
-int vector_shrink_to_fit(Vector *vector) {
+int aom_vector_shrink_to_fit(Vector *vector) {
   return _vector_reallocate(vector, vector->size);
 }
 
 /* Iterators */
-Iterator vector_begin(Vector *vector) { return vector_iterator(vector, 0); }
+Iterator aom_vector_begin(Vector *vector) { return aom_vector_iterator(vector, 0); }
 
-Iterator vector_end(Vector *vector) {
-  return vector_iterator(vector, vector->size);
+Iterator aom_vector_end(Vector *vector) {
+  return aom_vector_iterator(vector, vector->size);
 }
 
-Iterator vector_iterator(Vector *vector, size_t index) {
+Iterator aom_vector_iterator(Vector *vector, size_t index) {
   Iterator iterator = { NULL, 0 };
 
   assert(vector != NULL);
@@ -351,11 +351,11 @@ void *iterator_get(Iterator *iterator) { return iterator->pointer; }
 int iterator_erase(Vector *vector, Iterator *iterator) {
   size_t index = iterator_index(vector, iterator);
 
-  if (vector_erase(vector, index) == VECTOR_ERROR) {
+  if (aom_vector_erase(vector, index) == VECTOR_ERROR) {
     return VECTOR_ERROR;
   }
 
-  *iterator = vector_iterator(vector, index);
+  *iterator = aom_vector_iterator(vector, index);
 
   return VECTOR_SUCCESS;
 }
@@ -424,7 +424,7 @@ bool _vector_should_shrink(Vector *vector) {
 }
 
 size_t _vector_free_bytes(const Vector *vector) {
-  return vector_free_space(vector) * vector->element_size;
+  return aom_vector_free_space(vector) * vector->element_size;
 }
 
 void *_vector_offset(Vector *vector, size_t index) {
@@ -521,12 +521,12 @@ int _vector_reallocate(Vector *vector, size_t new_capacity) {
     if (memcpy_s(vector->data,
                              new_capacity_in_bytes,
                              old,
-                             vector_byte_size(vector)) != 0) {
+                             aom_vector_byte_size(vector)) != 0) {
         return VECTOR_ERROR;
     }
 /* clang-format on */
 #else
-  memcpy(vector->data, old, vector_byte_size(vector));
+  memcpy(vector->data, old, aom_vector_byte_size(vector));
 #endif
 
   vector->capacity = new_capacity;
diff --git a/third_party/aom/third_party/vector/vector.h b/third_party/aom/third_party/vector/vector.h
index 2bf1a9a8d..02743f5f1 100644
--- a/third_party/aom/third_party/vector/vector.h
+++ b/third_party/aom/third_party/vector/vector.h
@@ -57,60 +57,60 @@ typedef struct Iterator {
 /***** METHODS *****/
 
 /* Constructor */
-int vector_setup(Vector *vector, size_t capacity, size_t element_size);
+int aom_vector_setup(Vector *vector, size_t capacity, size_t element_size);
 
 /* Copy Constructor */
-int vector_copy(Vector *destination, Vector *source);
+int aom_vector_copy(Vector *destination, Vector *source);
 
 /* Copy Assignment */
-int vector_copy_assign(Vector *destination, Vector *source);
+int aom_vector_copy_assign(Vector *destination, Vector *source);
 
 /* Move Constructor */
-int vector_move(Vector *destination, Vector *source);
+int aom_vector_move(Vector *destination, Vector *source);
 
 /* Move Assignment */
-int vector_move_assign(Vector *destination, Vector *source);
+int aom_vector_move_assign(Vector *destination, Vector *source);
 
-int vector_swap(Vector *destination, Vector *source);
+int aom_vector_swap(Vector *destination, Vector *source);
 
 /* Destructor */
-int vector_destroy(Vector *vector);
+int aom_vector_destroy(Vector *vector);
 
 /* Insertion */
-int vector_push_back(Vector *vector, void *element);
-int vector_push_front(Vector *vector, void *element);
-int vector_insert(Vector *vector, size_t index, void *element);
-int vector_assign(Vector *vector, size_t index, void *element);
+int aom_vector_push_back(Vector *vector, void *element);
+int aom_vector_push_front(Vector *vector, void *element);
+int aom_vector_insert(Vector *vector, size_t index, void *element);
+int aom_vector_assign(Vector *vector, size_t index, void *element);
 
 /* Deletion */
-int vector_pop_back(Vector *vector);
-int vector_pop_front(Vector *vector);
-int vector_erase(Vector *vector, size_t index);
-int vector_clear(Vector *vector);
+int aom_vector_pop_back(Vector *vector);
+int aom_vector_pop_front(Vector *vector);
+int aom_vector_erase(Vector *vector, size_t index);
+int aom_vector_clear(Vector *vector);
 
 /* Lookup */
-void *vector_get(Vector *vector, size_t index);
-const void *vector_const_get(const Vector *vector, size_t index);
-void *vector_front(Vector *vector);
-void *vector_back(Vector *vector);
-#define VECTOR_GET_AS(type, vector_pointer, index) \
-  *((type *)vector_get((vector_pointer), (index)))
+void *aom_vector_get(Vector *vector, size_t index);
+const void *aom_vector_const_get(const Vector *vector, size_t index);
+void *aom_vector_front(Vector *vector);
+void *aom_vector_back(Vector *vector);
+#define VECTOR_GET_AS(type, aom_vector_pointer, index) \
+  *((type *)aom_vector_get((aom_vector_pointer), (index)))
 
 /* Information */
-bool vector_is_initialized(const Vector *vector);
-size_t vector_byte_size(const Vector *vector);
-size_t vector_free_space(const Vector *vector);
-bool vector_is_empty(const Vector *vector);
+bool aom_vector_is_initialized(const Vector *vector);
+size_t aom_vector_byte_size(const Vector *vector);
+size_t aom_vector_free_space(const Vector *vector);
+bool aom_vector_is_empty(const Vector *vector);
 
 /* Memory management */
-int vector_resize(Vector *vector, size_t new_size);
-int vector_reserve(Vector *vector, size_t minimum_capacity);
-int vector_shrink_to_fit(Vector *vector);
+int aom_vector_resize(Vector *vector, size_t new_size);
+int aom_vector_reserve(Vector *vector, size_t minimum_capacity);
+int aom_vector_shrink_to_fit(Vector *vector);
 
 /* Iterators */
-Iterator vector_begin(Vector *vector);
-Iterator vector_end(Vector *vector);
-Iterator vector_iterator(Vector *vector, size_t index);
+Iterator aom_vector_begin(Vector *vector);
+Iterator aom_vector_end(Vector *vector);
+Iterator aom_vector_iterator(Vector *vector, size_t index);
 
 void *iterator_get(Iterator *iterator);
 #define ITERATOR_GET_AS(type, iterator) *((type *)iterator_get((iterator)))
@@ -129,9 +129,9 @@ bool iterator_is_after(Iterator *first, Iterator *second);
 
 size_t iterator_index(Vector *vector, Iterator *iterator);
 
-#define VECTOR_FOR_EACH(vector_pointer, iterator_name)           \
-  for (Iterator(iterator_name) = vector_begin((vector_pointer)), \
-      end = vector_end((vector_pointer));                        \
+#define VECTOR_FOR_EACH(aom_vector_pointer, iterator_name)           \
+  for (Iterator(iterator_name) = aom_vector_begin((aom_vector_pointer)), \
+      end = aom_vector_end((aom_vector_pointer));                        \
        !iterator_equals(&(iterator_name), &end);                 \
        iterator_increment(&(iterator_name)))
 
diff --git a/third_party/aom/third_party/x86inc/x86inc.asm b/third_party/aom/third_party/x86inc/x86inc.asm
index cfee99c78..adaf2d99e 100644
--- a/third_party/aom/third_party/x86inc/x86inc.asm
+++ b/third_party/aom/third_party/x86inc/x86inc.asm
@@ -34,7 +34,7 @@
 ; as this feature might be useful for others as well.  Send patches or ideas
 ; to x264-devel@videolan.org .
 
-%include "aom_config.asm"
+%include "config/aom_config.asm"
 
 %ifndef private_prefix
     %define private_prefix aom
diff --git a/third_party/aom/tools.mk b/third_party/aom/tools.mk
deleted file mode 100644
index 26c20d433..000000000
--- a/third_party/aom/tools.mk
+++ /dev/null
@@ -1,126 +0,0 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-# List of tools to build.
-ifeq ($(CONFIG_ENTROPY_STATS), yes)
-TOOLS-$(CONFIG_AV1_ENCODER)      += aom_entropy_optimizer.c
-aom_entropy_optimizer.GUID        = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4
-aom_entropy_optimizer.DESCRIPTION = Offline default probability optimizer
-endif
-
-#
-# End of specified files. The rest of the build rules should happen
-# automagically from here.
-#
-
-# Tools need different flags based on whether we're building
-# from an installed tree or a version controlled tree. Determine
-# the proper paths.
-ifeq ($(HAVE_ALT_TREE_LAYOUT),yes)
-    LIB_PATH-yes := $(SRC_PATH_BARE)/../lib
-    INC_PATH-yes := $(SRC_PATH_BARE)/../include
-else
-    LIB_PATH-yes                     += $(if $(BUILD_PFX),$(BUILD_PFX),.)
-    INC_PATH-$(CONFIG_AV1_DECODER)   += $(SRC_PATH_BARE)/av1
-    INC_PATH-$(CONFIG_AV1_ENCODER)   += $(SRC_PATH_BARE)/av1
-endif
-INC_PATH-$(CONFIG_LIBYUV) += $(SRC_PATH_BARE)/third_party/libyuv/include
-LIB_PATH := $(call enabled,LIB_PATH)
-INC_PATH := $(call enabled,INC_PATH)
-INTERNAL_CFLAGS = $(addprefix -I,$(INC_PATH))
-INTERNAL_LDFLAGS += $(addprefix -L,$(LIB_PATH))
-
-# Expand list of selected tools to build (as specified above)
-TOOLS           = $(addprefix tools/,$(call enabled,TOOLS))
-ALL_SRCS        = $(foreach ex,$(TOOLS),$($(notdir $(ex:.c=)).SRCS))
-CFLAGS += -I../include
-CODEC_EXTRA_LIBS=$(sort $(call enabled,CODEC_EXTRA_LIBS))
-
-ifneq ($(CONFIG_CODEC_SRCS), yes)
-  CFLAGS += -I../include/vpx
-endif
-
-# Expand all tools sources into a variable containing all sources
-# for that tools (not just them main one specified in TOOLS)
-# and add this file to the list (for MSVS workspace generation)
-$(foreach ex,$(TOOLS),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) tools.mk))
-
-
-# Create build/install dependencies for all tools. The common case
-# is handled here. The MSVS case is handled below.
-NOT_MSVS = $(if $(CONFIG_MSVS),,yes)
-DIST-BINS-$(NOT_MSVS)      += $(addprefix bin/,$(TOOLS:.c=$(EXE_SFX)))
-DIST-SRCS-yes              += $(ALL_SRCS)
-OBJS-$(NOT_MSVS)           += $(call objs,$(ALL_SRCS))
-BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX),$(TOOLS:.c=$(EXE_SFX)))
-
-# Instantiate linker template for all tools.
-ifeq ($(CONFIG_OS_SUPPORT), yes)
-CODEC_EXTRA_LIBS-$(CONFIG_AV1)            += m
-else
-    ifeq ($(CONFIG_GCC), yes)
-    CODEC_EXTRA_LIBS-$(CONFIG_AV1)        += m
-    endif
-endif
-
-CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),aom_g,aom)
-ifneq ($(filter darwin%,$(TGT_OS)),)
-SHARED_LIB_SUF=.dylib
-else
-ifneq ($(filter os2%,$(TGT_OS)),)
-SHARED_LIB_SUF=_dll.a
-else
-SHARED_LIB_SUF=.so
-endif
-endif
-CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a)
-$(foreach bin,$(BINS-yes),\
-    $(eval $(bin):$(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF))\
-    $(eval $(call linker_template,$(bin),\
-        $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \
-        -l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\
-        )))
-
-# The following pairs define a mapping of locations in the distribution
-# tree to locations in the source/build trees.
-INSTALL_MAPS += src/%.c   %.c
-INSTALL_MAPS += src/%     $(SRC_PATH_BARE)/%
-INSTALL_MAPS += bin/%     %
-INSTALL_MAPS += %         %
-
-
-# Build Visual Studio Projects. We use a template here to instantiate
-# explicit rules rather than using an implicit rule because we want to
-# leverage make's VPATH searching rather than specifying the paths on
-# each file in TOOLS. This has the unfortunate side effect that
-# touching the source files trigger a rebuild of the project files
-# even though there is no real dependency there (the dependency is on
-# the makefiles). We may want to revisit this.
-define vcproj_template
-$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX)
-	$(if $(quiet),@echo "    [vcproj] $$@")
-	$(qexec)$$(GEN_VCPROJ)\
-            --exe\
-            --target=$$(TOOLCHAIN)\
-            --name=$$(@:.$(VCPROJ_SFX)=)\
-            --ver=$$(CONFIG_VS_VERSION)\
-            --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
-            --src-path-bare="$(SRC_PATH_BARE)" \
-            $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
-            $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^
-endef
-TOOLS_BASENAME := $(notdir $(TOOLS))
-PROJECTS-$(CONFIG_MSVS) += $(TOOLS_BASENAME:.c=.$(VCPROJ_SFX))
-INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\
-                               $(addprefix bin/$(p)/,$(TOOLS_BASENAME:.c=.exe)))
-$(foreach proj,$(call enabled,PROJECTS),\
-    $(eval $(call vcproj_template,$(proj))))
diff --git a/third_party/aom/tools/all_builds.py b/third_party/aom/tools/all_builds.py
deleted file mode 100755
index d1f0c80c0..000000000
--- a/third_party/aom/tools/all_builds.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/python
-
-import getopt
-import subprocess
-import sys
-
-LONG_OPTIONS = ["shard=", "shards="]
-BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental"
-
-def RunCommand(command):
-  run = subprocess.Popen(command, shell=True)
-  output = run.communicate()
-  if run.returncode:
-    print "Non-zero return code: " + str(run.returncode) + " => exiting!"
-    sys.exit(1)
-
-def list_of_experiments():
-  experiments = []
-  configure_file = open("configure")
-  list_start = False
-  for line in configure_file.read().split("\n"):
-    if line == 'EXPERIMENT_LIST="':
-      list_start = True
-    elif line == '"':
-      list_start = False
-    elif list_start:
-      currently_broken = ["csm"]
-      experiment = line[4:]
-      if experiment not in currently_broken:
-        experiments.append(experiment)
-  return experiments
-
-def main(argv):
-  # Parse arguments
-  options = {"--shard": 0, "--shards": 1}
-  if "--" in argv:
-    opt_end_index = argv.index("--")
-  else:
-    opt_end_index = len(argv)
-  try:
-    o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS)
-  except getopt.GetoptError, err:
-    print str(err)
-    print "Usage: %s [--shard=<n> --shards=<n>] -- [configure flag ...]"%argv[0]
-    sys.exit(2)
-
-  options.update(o)
-  extra_args = argv[opt_end_index + 1:]
-
-  # Shard experiment list
-  shard = int(options["--shard"])
-  shards = int(options["--shards"])
-  experiments = list_of_experiments()
-  base_command = " ".join([BASE_COMMAND] + extra_args)
-  configs = [base_command]
-  configs += ["%s --enable-%s" % (base_command, e) for e in experiments]
-  my_configs = zip(configs, range(len(configs)))
-  my_configs = filter(lambda x: x[1] % shards == shard, my_configs)
-  my_configs = [e[0] for e in my_configs]
-
-  # Run configs for this shard
-  for config in my_configs:
-    test_build(config)
-
-def test_build(configure_command):
-  print "\033[34m\033[47mTesting %s\033[0m" % (configure_command)
-  RunCommand(configure_command)
-  RunCommand("make clean")
-  RunCommand("make")
-
-if __name__ == "__main__":
-  main(sys.argv)
diff --git a/third_party/aom/tools/aom_entropy_optimizer.c b/third_party/aom/tools/aom_entropy_optimizer.c
index 962c1af36..551adf4f2 100644
--- a/third_party/aom/tools/aom_entropy_optimizer.c
+++ b/third_party/aom/tools/aom_entropy_optimizer.c
@@ -25,209 +25,36 @@
 
 #include <assert.h>
 #include <stdio.h>
-#include "./aom_config.h"
-#include "av1/common/entropymode.h"
-
-#if CONFIG_SMOOTH_HV
-const aom_tree_index av1_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
-  -DC_PRED,
-  2, /* 0 = DC_NODE */
-  -TM_PRED,
-  4, /* 1 = TM_NODE */
-  -V_PRED,
-  6, /* 2 = V_NODE */
-  8,
-  12, /* 3 = COM_NODE */
-  -H_PRED,
-  10, /* 4 = H_NODE */
-  -D135_PRED,
-  -D117_PRED, /* 5 = D135_NODE */
-  -D45_PRED,
-  14, /* 6 = D45_NODE */
-  -D63_PRED,
-  16, /* 7 = D63_NODE */
-  -D153_PRED,
-  18, /* 8 = D153_NODE */
-  -D207_PRED,
-  20, /* 9 = D207_NODE */
-  -SMOOTH_PRED,
-  22, /* 10 = SMOOTH_NODE */
-  -SMOOTH_V_PRED,
-  -SMOOTH_H_PRED /* 11 = SMOOTH_V_NODE */
-};
-#else
-const aom_tree_index av1_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
-  -DC_PRED,   2,            /* 0 = DC_NODE */
-  -TM_PRED,   4,            /* 1 = TM_NODE */
-  -V_PRED,    6,            /* 2 = V_NODE */
-  8,          12,           /* 3 = COM_NODE */
-  -H_PRED,    10,           /* 4 = H_NODE */
-  -D135_PRED, -D117_PRED,   /* 5 = D135_NODE */
-  -D45_PRED,  14,           /* 6 = D45_NODE */
-  -D63_PRED,  16,           /* 7 = D63_NODE */
-  -D153_PRED, 18,           /* 8 = D153_NODE */
-  -D207_PRED, -SMOOTH_PRED, /* 9 = D207_NODE */
-};
-#endif  // CONFIG_SMOOTH_HV
+
+#include "config/aom_config.h"
+
+#include "av1/encoder/encoder.h"
 
 #define SPACES_PER_TAB 2
+#define CDF_MAX_SIZE 16
 
 typedef unsigned int aom_count_type;
 // A log file recording parsed counts
 static FILE *logfile;  // TODO(yuec): make it a command line option
 
-static INLINE aom_prob get_binary_prob_new(unsigned int n0, unsigned int n1) {
-  // The "+1" will prevent this function from generating extreme probability
-  // when both n0 and n1 are small
-  const unsigned int den = n0 + 1 + n1 + 1;
-  return get_prob(n0 + 1, den);
-}
+static void counts_to_cdf(const aom_count_type *counts, aom_cdf_prob *cdf,
+                          int modes) {
+  int64_t csum[CDF_MAX_SIZE];
+  assert(modes <= CDF_MAX_SIZE);
 
-// Optimized probabilities will be stored in probs[].
-static unsigned int optimize_tree_probs(const aom_tree_index *tree,
-                                        unsigned int idx,
-                                        const unsigned int *counts,
-                                        aom_prob *probs) {
-  const int l = tree[idx];
-  const unsigned int left_count =
-      (l <= 0) ? counts[-l] : optimize_tree_probs(tree, l, counts, probs);
-  const int r = tree[idx + 1];
-  const unsigned int right_count =
-      (r <= 0) ? counts[-r] : optimize_tree_probs(tree, r, counts, probs);
-  probs[idx >> 1] = get_binary_prob_new(left_count, right_count);
-  return left_count + right_count;
-}
-
-static int parse_stats(aom_count_type **ct_ptr, FILE *const probsfile, int tabs,
-                       int dim_of_cts, int *cts_each_dim,
-                       const aom_tree_index *tree, int flatten_last_dim) {
-  if (dim_of_cts < 1) {
-    fprintf(stderr, "The dimension of a counts vector should be at least 1!\n");
-    return 1;
-  }
-  if (dim_of_cts == 1) {
-    const int total_modes = cts_each_dim[0];
-    aom_count_type *counts1d = *ct_ptr;
-    aom_prob *probs = aom_malloc(sizeof(*probs) * (total_modes - 1));
-
-    if (probs == NULL) {
-      fprintf(stderr, "Allocating prob array failed!\n");
-      return 1;
-    }
+  csum[0] = counts[0] + 1;
+  for (int i = 1; i < modes; ++i) csum[i] = counts[i] + 1 + csum[i - 1];
 
-    (*ct_ptr) += total_modes;
-    if (tree != NULL) {
-      optimize_tree_probs(tree, 0, counts1d, probs);
-    } else {
-      assert(total_modes == 2);
-      probs[0] = get_binary_prob_new(counts1d[0], counts1d[1]);
-    }
-    if (tabs > 0) fprintf(probsfile, "%*c", tabs * SPACES_PER_TAB, ' ');
-    for (int k = 0; k < total_modes - 1; ++k) {
-      if (k == total_modes - 2)
-        fprintf(probsfile, " %3d ", probs[k]);
-      else
-        fprintf(probsfile, " %3d,", probs[k]);
-      fprintf(logfile, "%d ", counts1d[k]);
-    }
-    fprintf(logfile, "%d\n", counts1d[total_modes - 1]);
-  } else if (dim_of_cts == 2 && flatten_last_dim) {
-    assert(cts_each_dim[1] == 2);
-
-    for (int k = 0; k < cts_each_dim[0]; ++k) {
-      if (k == cts_each_dim[0] - 1) {
-        fprintf(probsfile, " %3d ",
-                get_binary_prob_new((*ct_ptr)[0], (*ct_ptr)[1]));
-      } else {
-        fprintf(probsfile, " %3d,",
-                get_binary_prob_new((*ct_ptr)[0], (*ct_ptr)[1]));
-      }
-      fprintf(logfile, "%d %d\n", (*ct_ptr)[0], (*ct_ptr)[1]);
-      (*ct_ptr) += 2;
-    }
-  } else {
-    for (int k = 0; k < cts_each_dim[0]; ++k) {
-      int tabs_next_level;
-      if (dim_of_cts == 2 || (dim_of_cts == 3 && flatten_last_dim)) {
-        fprintf(probsfile, "%*c{", tabs * SPACES_PER_TAB, ' ');
-        tabs_next_level = 0;
-      } else {
-        fprintf(probsfile, "%*c{\n", tabs * SPACES_PER_TAB, ' ');
-        tabs_next_level = tabs + 1;
-      }
-      if (parse_stats(ct_ptr, probsfile, tabs_next_level, dim_of_cts - 1,
-                      cts_each_dim + 1, tree, flatten_last_dim)) {
-        return 1;
-      }
-      if (dim_of_cts == 2 || (dim_of_cts == 3 && flatten_last_dim)) {
-        if (k == cts_each_dim[0] - 1)
-          fprintf(probsfile, "}\n");
-        else
-          fprintf(probsfile, "},\n");
-      } else {
-        if (k == cts_each_dim[0] - 1)
-          fprintf(probsfile, "%*c}\n", tabs * SPACES_PER_TAB, ' ');
-        else
-          fprintf(probsfile, "%*c},\n", tabs * SPACES_PER_TAB, ' ');
-      }
-    }
-  }
-  return 0;
-}
-
-// This function parses the stats of a syntax, either binary or multi-symbol,
-// in different contexts, and writes the optimized probability table to
-// probsfile.
-//   counts: pointer of the first count element in counts array
-//   probsfile: output file
-//   dim_of_cts: number of dimensions of counts array
-//   cts_each_dim: an array storing size of each dimension of counts array
-//   tree: binary tree for a multi-symbol syntax, or NULL for a binary one
-//   flatten_last_dim: for a binary syntax, if flatten_last_dim is 0, probs in
-//                     different contexts will be written separately, e.g.,
-//                     {{p1}, {p2}, ...};
-//                     otherwise will be grouped together at the second last
-//                     dimension, i.e.,
-//                     {p1, p2, ...}.
-//   prefix: declaration header for the entropy table
-static void optimize_entropy_table(aom_count_type *counts,
-                                   FILE *const probsfile, int dim_of_cts,
-                                   int *cts_each_dim,
-                                   const aom_tree_index *tree,
-                                   int flatten_last_dim, char *prefix) {
-  aom_count_type *ct_ptr = counts;
-
-  assert(!flatten_last_dim || cts_each_dim[dim_of_cts - 1] == 2);
-
-  fprintf(probsfile, "%s = {\n", prefix);
-  if (parse_stats(&ct_ptr, probsfile, 1, dim_of_cts, cts_each_dim, tree,
-                  flatten_last_dim)) {
-    fprintf(probsfile, "Optimizer failed!\n");
-  }
-  fprintf(probsfile, "};\n\n");
+  for (int i = 0; i < modes; ++i) fprintf(logfile, "%d ", counts[i]);
   fprintf(logfile, "\n");
-}
-
-static int counts_to_cdf(const aom_count_type *counts, aom_cdf_prob *cdf,
-                         int modes) {
-  int64_t *csum = aom_malloc(sizeof(*csum) * modes);
-
-  if (csum == NULL) {
-    fprintf(stderr, "Allocating csum array failed!\n");
-    return 1;
-  }
-  csum[0] = counts[0];
-  for (int i = 1; i < modes; ++i) csum[i] = counts[i] + csum[i - 1];
 
   int64_t sum = csum[modes - 1];
-  int64_t round_shift = sum >> 1;
+  const int64_t round_shift = sum >> 1;
   for (int i = 0; i < modes; ++i) {
-    if (sum <= 0)
-      cdf[i] = CDF_PROB_TOP;
-    else
-      cdf[i] = (csum[i] * CDF_PROB_TOP + round_shift) / sum;
+    cdf[i] = (csum[i] * CDF_PROB_TOP + round_shift) / sum;
+    cdf[i] = AOMMIN(cdf[i], CDF_PROB_TOP - (modes - 1 + i) * 4);
+    cdf[i] = (i == 0) ? AOMMAX(cdf[i], 4) : AOMMAX(cdf[i], cdf[i - 1] + 4);
   }
-  return 0;
 }
 
 static int parse_counts_for_cdf_opt(aom_count_type **ct_ptr,
@@ -237,29 +64,28 @@ static int parse_counts_for_cdf_opt(aom_count_type **ct_ptr,
     fprintf(stderr, "The dimension of a counts vector should be at least 1!\n");
     return 1;
   }
+  const int total_modes = cts_each_dim[0];
   if (dim_of_cts == 1) {
-    const int total_modes = cts_each_dim[0];
+    assert(total_modes <= CDF_MAX_SIZE);
+    aom_cdf_prob cdfs[CDF_MAX_SIZE];
     aom_count_type *counts1d = *ct_ptr;
-    aom_cdf_prob *cdfs = aom_malloc(sizeof(*cdfs) * total_modes);
-
-    if (cdfs == NULL) {
-      fprintf(stderr, "Allocating cdf array failed!\n");
-      return 1;
-    }
 
     counts_to_cdf(counts1d, cdfs, total_modes);
     (*ct_ptr) += total_modes;
 
     if (tabs > 0) fprintf(probsfile, "%*c", tabs * SPACES_PER_TAB, ' ');
-    for (int k = 0; k < total_modes; ++k)
-      fprintf(probsfile, " AOM_ICDF(%d),", cdfs[k]);
-    fprintf(probsfile, " 0 ");
+    fprintf(probsfile, "AOM_CDF%d(", total_modes);
+    for (int k = 0; k < total_modes - 1; ++k) {
+      fprintf(probsfile, "%d", cdfs[k]);
+      if (k < total_modes - 2) fprintf(probsfile, ", ");
+    }
+    fprintf(probsfile, ")");
   } else {
-    for (int k = 0; k < cts_each_dim[0]; ++k) {
+    for (int k = 0; k < total_modes; ++k) {
       int tabs_next_level;
 
       if (dim_of_cts == 2)
-        fprintf(probsfile, "%*c{", tabs * SPACES_PER_TAB, ' ');
+        fprintf(probsfile, "%*c{ ", tabs * SPACES_PER_TAB, ' ');
       else
         fprintf(probsfile, "%*c{\n", tabs * SPACES_PER_TAB, ' ');
       tabs_next_level = dim_of_cts == 2 ? 0 : tabs + 1;
@@ -270,19 +96,18 @@ static int parse_counts_for_cdf_opt(aom_count_type **ct_ptr,
       }
 
       if (dim_of_cts == 2) {
-        if (k == cts_each_dim[0] - 1)
-          fprintf(probsfile, "}\n");
+        if (k == total_modes - 1)
+          fprintf(probsfile, " }\n");
         else
-          fprintf(probsfile, "},\n");
+          fprintf(probsfile, " },\n");
       } else {
-        if (k == cts_each_dim[0] - 1)
+        if (k == total_modes - 1)
           fprintf(probsfile, "%*c}\n", tabs * SPACES_PER_TAB, ' ');
         else
           fprintf(probsfile, "%*c},\n", tabs * SPACES_PER_TAB, ' ');
       }
     }
   }
-
   return 0;
 }
 
@@ -292,11 +117,144 @@ static void optimize_cdf_table(aom_count_type *counts, FILE *const probsfile,
   aom_count_type *ct_ptr = counts;
 
   fprintf(probsfile, "%s = {\n", prefix);
+  fprintf(logfile, "%s\n", prefix);
   if (parse_counts_for_cdf_opt(&ct_ptr, probsfile, 1, dim_of_cts,
                                cts_each_dim)) {
     fprintf(probsfile, "Optimizer failed!\n");
   }
   fprintf(probsfile, "};\n\n");
+  fprintf(logfile, "============================\n");
+}
+
+static void optimize_uv_mode(aom_count_type *counts, FILE *const probsfile,
+                             int dim_of_cts, int *cts_each_dim, char *prefix) {
+  aom_count_type *ct_ptr = counts;
+
+  fprintf(probsfile, "%s = {\n", prefix);
+  fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+  fprintf(logfile, "%s\n", prefix);
+  cts_each_dim[2] = UV_INTRA_MODES - 1;
+  for (int k = 0; k < cts_each_dim[1]; ++k) {
+    fprintf(probsfile, "%*c{ ", 2 * SPACES_PER_TAB, ' ');
+    parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, dim_of_cts - 2,
+                             cts_each_dim + 2);
+    if (k + 1 == cts_each_dim[1]) {
+      fprintf(probsfile, " }\n");
+    } else {
+      fprintf(probsfile, " },\n");
+    }
+    ++ct_ptr;
+  }
+  fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' ');
+  fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+  cts_each_dim[2] = UV_INTRA_MODES;
+  parse_counts_for_cdf_opt(&ct_ptr, probsfile, 2, dim_of_cts - 1,
+                           cts_each_dim + 1);
+  fprintf(probsfile, "%*c}\n", SPACES_PER_TAB, ' ');
+  fprintf(probsfile, "};\n\n");
+  fprintf(logfile, "============================\n");
+}
+
+static void optimize_cdf_table_var_modes_2d(aom_count_type *counts,
+                                            FILE *const probsfile,
+                                            int dim_of_cts, int *cts_each_dim,
+                                            int *modes_each_ctx, char *prefix) {
+  aom_count_type *ct_ptr = counts;
+
+  assert(dim_of_cts == 2);
+  (void)dim_of_cts;
+
+  fprintf(probsfile, "%s = {\n", prefix);
+  fprintf(logfile, "%s\n", prefix);
+
+  for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) {
+    int num_of_modes = modes_each_ctx[d0_idx];
+
+    if (num_of_modes > 0) {
+      fprintf(probsfile, "%*c{ ", SPACES_PER_TAB, ' ');
+      parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, 1, &num_of_modes);
+      ct_ptr += cts_each_dim[1] - num_of_modes;
+      fprintf(probsfile, " },\n");
+    } else {
+      fprintf(probsfile, "%*c{ 0 },\n", SPACES_PER_TAB, ' ');
+      fprintf(logfile, "dummy cdf, no need to optimize\n");
+      ct_ptr += cts_each_dim[1];
+    }
+  }
+  fprintf(probsfile, "};\n\n");
+  fprintf(logfile, "============================\n");
+}
+
+static void optimize_cdf_table_var_modes_3d(aom_count_type *counts,
+                                            FILE *const probsfile,
+                                            int dim_of_cts, int *cts_each_dim,
+                                            int *modes_each_ctx, char *prefix) {
+  aom_count_type *ct_ptr = counts;
+
+  assert(dim_of_cts == 3);
+  (void)dim_of_cts;
+
+  fprintf(probsfile, "%s = {\n", prefix);
+  fprintf(logfile, "%s\n", prefix);
+
+  for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) {
+    fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+    for (int d1_idx = 0; d1_idx < cts_each_dim[1]; ++d1_idx) {
+      int num_of_modes = modes_each_ctx[d0_idx];
+
+      if (num_of_modes > 0) {
+        fprintf(probsfile, "%*c{ ", 2 * SPACES_PER_TAB, ' ');
+        parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, 1, &num_of_modes);
+        ct_ptr += cts_each_dim[2] - num_of_modes;
+        fprintf(probsfile, " },\n");
+      } else {
+        fprintf(probsfile, "%*c{ 0 },\n", 2 * SPACES_PER_TAB, ' ');
+        fprintf(logfile, "dummy cdf, no need to optimize\n");
+        ct_ptr += cts_each_dim[2];
+      }
+    }
+    fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' ');
+  }
+  fprintf(probsfile, "};\n\n");
+  fprintf(logfile, "============================\n");
+}
+
+static void optimize_cdf_table_var_modes_4d(aom_count_type *counts,
+                                            FILE *const probsfile,
+                                            int dim_of_cts, int *cts_each_dim,
+                                            int *modes_each_ctx, char *prefix) {
+  aom_count_type *ct_ptr = counts;
+
+  assert(dim_of_cts == 4);
+  (void)dim_of_cts;
+
+  fprintf(probsfile, "%s = {\n", prefix);
+  fprintf(logfile, "%s\n", prefix);
+
+  for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) {
+    fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+    for (int d1_idx = 0; d1_idx < cts_each_dim[1]; ++d1_idx) {
+      fprintf(probsfile, "%*c{\n", 2 * SPACES_PER_TAB, ' ');
+      for (int d2_idx = 0; d2_idx < cts_each_dim[2]; ++d2_idx) {
+        int num_of_modes = modes_each_ctx[d0_idx];
+
+        if (num_of_modes > 0) {
+          fprintf(probsfile, "%*c{ ", 3 * SPACES_PER_TAB, ' ');
+          parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, 1, &num_of_modes);
+          ct_ptr += cts_each_dim[3] - num_of_modes;
+          fprintf(probsfile, " },\n");
+        } else {
+          fprintf(probsfile, "%*c{ 0 },\n", 3 * SPACES_PER_TAB, ' ');
+          fprintf(logfile, "dummy cdf, no need to optimize\n");
+          ct_ptr += cts_each_dim[3];
+        }
+      }
+      fprintf(probsfile, "%*c},\n", 2 * SPACES_PER_TAB, ' ');
+    }
+    fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' ');
+  }
+  fprintf(probsfile, "};\n\n");
+  fprintf(logfile, "============================\n");
 }
 
 int main(int argc, const char **argv) {
@@ -312,7 +270,8 @@ int main(int argc, const char **argv) {
   }
 
   FRAME_COUNTS fc;
-  fread(&fc, sizeof(FRAME_COUNTS), 1, statsfile);
+  const size_t bytes = fread(&fc, sizeof(FRAME_COUNTS), 1, statsfile);
+  if (!bytes) return 1;
 
   FILE *const probsfile = fopen("optimized_probs.c", "w");
   if (probsfile == NULL) {
@@ -330,67 +289,81 @@ int main(int argc, const char **argv) {
   int cts_each_dim[10];
 
   /* Intra mode (keyframe luma) */
-  cts_each_dim[0] = INTRA_MODES;
-  cts_each_dim[1] = INTRA_MODES;
+  cts_each_dim[0] = KF_MODE_CONTEXTS;
+  cts_each_dim[1] = KF_MODE_CONTEXTS;
   cts_each_dim[2] = INTRA_MODES;
-  optimize_entropy_table(
-      &fc.kf_y_mode[0][0][0], probsfile, 3, cts_each_dim, av1_intra_mode_tree,
-      0,
-      "const aom_prob av1_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]"
-      "[INTRA_MODES - 1]");
-  optimize_cdf_table(
-      &fc.kf_y_mode[0][0][0], probsfile, 3, cts_each_dim,
-      "const aom_cdf_prob\n"
-      "av1_kf_y_mode_cdf[INTRA_MODES][INTRA_MODES][CDF_SIZE(INTRA_MODES)]");
+  optimize_cdf_table(&fc.kf_y_mode[0][0][0], probsfile, 3, cts_each_dim,
+                     "const aom_cdf_prob\n"
+                     "default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS]"
+                     "[CDF_SIZE(INTRA_MODES)]");
+
+  cts_each_dim[0] = DIRECTIONAL_MODES;
+  cts_each_dim[1] = 2 * MAX_ANGLE_DELTA + 1;
+  optimize_cdf_table(&fc.angle_delta[0][0], probsfile, 2, cts_each_dim,
+                     "static const aom_cdf_prob default_angle_delta_cdf"
+                     "[DIRECTIONAL_MODES][CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)]");
 
   /* Intra mode (non-keyframe luma) */
   cts_each_dim[0] = BLOCK_SIZE_GROUPS;
   cts_each_dim[1] = INTRA_MODES;
-  optimize_entropy_table(
-      &fc.y_mode[0][0], probsfile, 2, cts_each_dim, av1_intra_mode_tree, 0,
-      "static const aom_prob default_if_y_probs[BLOCK_SIZE_GROUPS]"
-      "[INTRA_MODES - 1]");
   optimize_cdf_table(
       &fc.y_mode[0][0], probsfile, 2, cts_each_dim,
       "static const aom_cdf_prob\n"
       "default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)]");
 
   /* Intra mode (chroma) */
-  cts_each_dim[0] = INTRA_MODES;
-  cts_each_dim[1] = UV_INTRA_MODES;
-  optimize_entropy_table(&fc.uv_mode[0][0], probsfile, 2, cts_each_dim,
-                         av1_intra_mode_tree, 0,
-                         "static const aom_prob default_uv_probs[INTRA_MODES]"
-                         "[UV_INTRA_MODES - 1]");
-  optimize_cdf_table(
-      &fc.uv_mode[0][0], probsfile, 2, cts_each_dim,
-      "static const aom_cdf_prob\n"
-      "default_uv_mode_cdf[INTRA_MODES][CDF_SIZE(UV_INTRA_MODES)]");
+  cts_each_dim[0] = CFL_ALLOWED_TYPES;
+  cts_each_dim[1] = INTRA_MODES;
+  cts_each_dim[2] = UV_INTRA_MODES;
+  optimize_uv_mode(&fc.uv_mode[0][0][0], probsfile, 3, cts_each_dim,
+                   "static const aom_cdf_prob\n"
+                   "default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES]"
+                   "[CDF_SIZE(UV_INTRA_MODES)]");
 
-  /* Partition */
+  /* block partition */
   cts_each_dim[0] = PARTITION_CONTEXTS;
-#if CONFIG_EXT_PARTITION_TYPES
   cts_each_dim[1] = EXT_PARTITION_TYPES;
-  // TODO(yuec): Wrong prob for context = 0, because the old tree is used
-  optimize_entropy_table(&fc.partition[0][0], probsfile, 2, cts_each_dim,
-                         av1_ext_partition_tree, 0,
-                         "static const aom_prob default_partition_probs"
-                         "[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1]");
-  optimize_cdf_table(&fc.partition[0][0], probsfile, 2, cts_each_dim,
+  int part_types_each_ctx[PARTITION_CONTEXTS] = {
+    4, 4, 4, 4, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8
+  };
+  optimize_cdf_table_var_modes_2d(
+      &fc.partition[0][0], probsfile, 2, cts_each_dim, part_types_each_ctx,
+      "static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS]"
+      "[CDF_SIZE(EXT_PARTITION_TYPES)]");
+
+  /* tx type */
+  cts_each_dim[0] = EXT_TX_SETS_INTRA;
+  cts_each_dim[1] = EXT_TX_SIZES;
+  cts_each_dim[2] = INTRA_MODES;
+  cts_each_dim[3] = TX_TYPES;
+  int intra_ext_tx_types_each_ctx[EXT_TX_SETS_INTRA] = { 0, 7, 5 };
+  optimize_cdf_table_var_modes_4d(
+      &fc.intra_ext_tx[0][0][0][0], probsfile, 4, cts_each_dim,
+      intra_ext_tx_types_each_ctx,
+      "static const aom_cdf_prob default_intra_ext_tx_cdf[EXT_TX_SETS_INTRA]"
+      "[EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)]");
+
+  cts_each_dim[0] = EXT_TX_SETS_INTER;
+  cts_each_dim[1] = EXT_TX_SIZES;
+  cts_each_dim[2] = TX_TYPES;
+  int inter_ext_tx_types_each_ctx[EXT_TX_SETS_INTER] = { 0, 16, 12, 2 };
+  optimize_cdf_table_var_modes_3d(
+      &fc.inter_ext_tx[0][0][0], probsfile, 3, cts_each_dim,
+      inter_ext_tx_types_each_ctx,
+      "static const aom_cdf_prob default_inter_ext_tx_cdf[EXT_TX_SETS_INTER]"
+      "[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)]");
+
+  /* Chroma from Luma */
+  cts_each_dim[0] = CFL_JOINT_SIGNS;
+  optimize_cdf_table(&fc.cfl_sign[0], probsfile, 1, cts_each_dim,
                      "static const aom_cdf_prob\n"
-                     "default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_"
-                     "PARTITION_TYPES)]");
-#else
-  cts_each_dim[1] = PARTITION_TYPES;
-  optimize_entropy_table(&fc.partition[0][0], probsfile, 2, cts_each_dim,
-                         av1_partition_tree, 0,
-                         "static const aom_prob default_partition_probs"
-                         "[PARTITION_CONTEXTS][PARTITION_TYPES - 1]");
-  optimize_cdf_table(
-      &fc.partition[0][0], probsfile, 2, cts_each_dim,
-      "static const aom_cdf_prob\n"
-      "default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(PARTITION_TYPES)]");
-#endif
+                     "default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)]");
+  cts_each_dim[0] = CFL_ALPHA_CONTEXTS;
+  cts_each_dim[1] = CFL_ALPHABET_SIZE;
+  optimize_cdf_table(&fc.cfl_alpha[0][0], probsfile, 2, cts_each_dim,
+                     "static const aom_cdf_prob\n"
+                     "default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS]"
+                     "[CDF_SIZE(CFL_ALPHABET_SIZE)]");
 
   /* Interpolation filter */
   cts_each_dim[0] = SWITCHABLE_FILTER_CONTEXTS;
@@ -403,36 +376,24 @@ int main(int argc, const char **argv) {
   /* Motion vector referencing */
   cts_each_dim[0] = NEWMV_MODE_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.newmv_mode[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob default_newmv_prob[NEWMV_MODE_CONTEXTS]");
   optimize_cdf_table(&fc.newmv_mode[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob "
                      "default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)]");
 
-  cts_each_dim[0] = ZEROMV_MODE_CONTEXTS;
+  cts_each_dim[0] = GLOBALMV_MODE_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.zeromv_mode[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob default_zeromv_prob[ZEROMV_MODE_CONTEXTS]");
   optimize_cdf_table(&fc.zeromv_mode[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob "
-                     "default_zeromv_cdf[ZEROMV_MODE_CONTEXTS][CDF_SIZE(2)]");
+                     "default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)]");
 
   cts_each_dim[0] = REFMV_MODE_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.refmv_mode[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob default_refmv_prob[REFMV_MODE_CONTEXTS]");
   optimize_cdf_table(&fc.refmv_mode[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob "
                      "default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)]");
 
   cts_each_dim[0] = DRL_MODE_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.drl_mode[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob default_drl_prob[DRL_MODE_CONTEXTS]");
   optimize_cdf_table(&fc.drl_mode[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob "
                      "default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)]");
@@ -441,48 +402,20 @@ int main(int argc, const char **argv) {
   /* New compound mode */
   cts_each_dim[0] = INTER_MODE_CONTEXTS;
   cts_each_dim[1] = INTER_COMPOUND_MODES;
-  optimize_entropy_table(
-      &fc.inter_compound_mode[0][0], probsfile, 2, cts_each_dim,
-      av1_inter_compound_mode_tree, 0,
-      "static const aom_prob default_inter_compound_mode_probs\n"
-      "[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES - 1]");
   optimize_cdf_table(&fc.inter_compound_mode[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob\n"
                      "default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_"
                      "SIZE(INTER_COMPOUND_MODES)]");
-#if CONFIG_COMPOUND_SINGLEREF
-  /* Compound singleref mode */
-  cts_each_dim[0] = INTER_MODE_CONTEXTS;
-  cts_each_dim[1] = INTER_SINGLEREF_COMP_MODES;
-  optimize_entropy_table(
-      &fc.inter_singleref_comp_mode[0][0], probsfile, 2, cts_each_dim,
-      av1_inter_singleref_comp_mode_tree, 0,
-      "static const aom_prob default_inter_singleref_comp_mode_probs\n"
-      "[INTER_MODE_CONTEXTS][INTER_SINGLEREF_COMP_MODES - 1]");
-  optimize_cdf_table(&fc.inter_singleref_comp_mode[0][0], probsfile, 2,
-                     cts_each_dim,
-                     "static const aom_cdf_prob\n"
-                     "default_inter_singleref_comp_mode_cdf[INTER_MODE_"
-                     "CONTEXTS][CDF_SIZE(INTER_SINGLEREF_COMP_MODES)]");
-#endif
-#if CONFIG_INTERINTRA
+
   /* Interintra */
   cts_each_dim[0] = BLOCK_SIZE_GROUPS;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.interintra[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob default_interintra_prob[BLOCK_SIZE_GROUPS]");
   optimize_cdf_table(&fc.interintra[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob "
                      "default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]");
 
   cts_each_dim[0] = BLOCK_SIZE_GROUPS;
   cts_each_dim[1] = INTERINTRA_MODES;
-  optimize_entropy_table(
-      &fc.interintra_mode[0][0], probsfile, 2, cts_each_dim,
-      av1_interintra_mode_tree, 0,
-      "static const aom_prob "
-      "default_interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1]");
   optimize_cdf_table(&fc.interintra_mode[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob\n"
                      "default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE("
@@ -490,83 +423,40 @@ int main(int argc, const char **argv) {
 
   cts_each_dim[0] = BLOCK_SIZES_ALL;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.wedge_interintra[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob default_wedge_interintra_prob[BLOCK_SIZES_ALL]");
   optimize_cdf_table(
       &fc.wedge_interintra[0][0], probsfile, 2, cts_each_dim,
       "static const aom_cdf_prob\n"
       "default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]");
-#endif
+
   /* Compound type */
   cts_each_dim[0] = BLOCK_SIZES_ALL;
-  cts_each_dim[1] = COMPOUND_TYPES;
-  optimize_entropy_table(&fc.compound_interinter[0][0], probsfile, 2,
-                         cts_each_dim, av1_compound_type_tree, 0,
-                         "static const aom_prob default_compound_type_probs"
-                         "[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1]");
-  optimize_cdf_table(
-      &fc.compound_interinter[0][0], probsfile, 2, cts_each_dim,
-      "static const aom_cdf_prob\n"
-      "default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES)]");
+  cts_each_dim[1] = COMPOUND_TYPES - 1;
+  optimize_cdf_table(&fc.compound_type[0][0], probsfile, 2, cts_each_dim,
+                     "static const aom_cdf_prob default_compound_type_cdf"
+                     "[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)]");
+
+  cts_each_dim[0] = BLOCK_SIZES_ALL;
+  cts_each_dim[1] = 16;
+  optimize_cdf_table(&fc.wedge_idx[0][0], probsfile, 2, cts_each_dim,
+                     "static const aom_cdf_prob "
+                     "default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]");
 
-/* motion_var and warped_motion experiments */
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+  /* motion_var and warped_motion experiments */
   cts_each_dim[0] = BLOCK_SIZES_ALL;
   cts_each_dim[1] = MOTION_MODES;
-  optimize_entropy_table(
-      &fc.motion_mode[0][0], probsfile, 2, cts_each_dim, av1_motion_mode_tree,
-      0,
-      "static const aom_prob default_motion_mode_prob[BLOCK_SIZES]"
-      "[MOTION_MODES - 1]");
   optimize_cdf_table(
       &fc.motion_mode[0][0], probsfile, 2, cts_each_dim,
       "static const aom_cdf_prob\n"
       "default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)]");
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
   cts_each_dim[0] = BLOCK_SIZES_ALL;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.obmc[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob default_obmc_prob[BLOCK_SIZES_ALL]");
   optimize_cdf_table(&fc.obmc[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob "
                      "default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]");
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  cts_each_dim[0] = ADAPT_OVERLAP_BLOCKS;
-  cts_each_dim[1] = MAX_NCOBMC_MODES;
-  optimize_entropy_table(
-      &fc.ncobmc_mode[0][0], probsfile, 2, cts_each_dim, av1_ncobmc_mode_tree,
-      0,
-      "static const aom_prob default_ncobmc_mode_prob[ADAPT_OVERLAP_BLOCKS]"
-      "[MAX_NCOBMC_MODES - 1]");
-  optimize_cdf_table(&fc.ncobmc_mode[0][0], probsfile, 2, cts_each_dim,
-                     "static const aom_cdf_prob\n"
-                     "default_ncobmc_mode_cdf[ADAPT_OVERLAP_BLOCKS]"
-                     "[CDF_SIZE(MAX_NCOBMC_MODES)]");
-#if CONFIG_WARPED_MOTION
-  cts_each_dim[0] = BLOCK_SIZES_ALL;
-  cts_each_dim[1] = OBMC_FAMILY_MODES;
-  optimize_entropy_table(
-      &fc.ncobmc[0][0], probsfile, 2, cts_each_dim, av1_ncobmc_tree, 0,
-      "static const aom_prob default_ncobmc_prob[BLOCK_SIZES_ALL]"
-      "[OBMC_FAMILY_MODES - 1]");
-  optimize_cdf_table(&fc.ncobmc[0][0], probsfile, 2, cts_each_dim,
-                     "static const aom_cdf_prob\n"
-                     "default_ncobmc_cdf[BLOCK_SIZES_ALL]"
-                     "[CDF_SIZE(OBMC_FAMILY_MODES)]");
-#endif
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
   /* Intra/inter flag */
   cts_each_dim[0] = INTRA_INTER_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(&fc.intra_inter[0][0], probsfile, 2, cts_each_dim,
-                         NULL, 1,
-                         "static const aom_prob default_intra_inter_p"
-                         "[INTRA_INTER_CONTEXTS]");
   optimize_cdf_table(
       &fc.intra_inter[0][0], probsfile, 2, cts_each_dim,
       "static const aom_cdf_prob\n"
@@ -575,22 +465,14 @@ int main(int argc, const char **argv) {
   /* Single/comp ref flag */
   cts_each_dim[0] = COMP_INTER_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(&fc.comp_inter[0][0], probsfile, 2, cts_each_dim, NULL,
-                         1,
-                         "static const aom_prob default_comp_inter_p"
-                         "[COMP_INTER_CONTEXTS]");
   optimize_cdf_table(
       &fc.comp_inter[0][0], probsfile, 2, cts_each_dim,
       "static const aom_cdf_prob\n"
       "default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)]");
 
-/* ext_comp_refs experiment */
-#if CONFIG_EXT_COMP_REFS
+  /* ext_comp_refs experiment */
   cts_each_dim[0] = COMP_REF_TYPE_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.comp_ref_type[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob default_comp_ref_type_p[COMP_REF_TYPE_CONTEXTS]");
   optimize_cdf_table(
       &fc.comp_ref_type[0][0], probsfile, 2, cts_each_dim,
       "static const aom_cdf_prob\n"
@@ -599,37 +481,24 @@ int main(int argc, const char **argv) {
   cts_each_dim[0] = UNI_COMP_REF_CONTEXTS;
   cts_each_dim[1] = UNIDIR_COMP_REFS - 1;
   cts_each_dim[2] = 2;
-  optimize_entropy_table(
-      &fc.uni_comp_ref[0][0][0], probsfile, 3, cts_each_dim, NULL, 1,
-      "static const aom_prob\n"
-      "default_uni_comp_ref_p[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]");
   optimize_cdf_table(&fc.uni_comp_ref[0][0][0], probsfile, 3, cts_each_dim,
                      "static const aom_cdf_prob\n"
                      "default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_"
                      "COMP_REFS - 1][CDF_SIZE(2)]");
-#endif
 
   /* Reference frame (single ref) */
   cts_each_dim[0] = REF_CONTEXTS;
   cts_each_dim[1] = SINGLE_REFS - 1;
   cts_each_dim[2] = 2;
-  optimize_entropy_table(
-      &fc.single_ref[0][0][0], probsfile, 3, cts_each_dim, NULL, 1,
-      "static const aom_prob default_single_ref_p[REF_CONTEXTS]"
-      "[SINGLE_REFS - 1]");
   optimize_cdf_table(
       &fc.single_ref[0][0][0], probsfile, 3, cts_each_dim,
       "static const aom_cdf_prob\n"
       "default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)]");
 
-#if CONFIG_EXT_REFS
   /* ext_refs experiment */
   cts_each_dim[0] = REF_CONTEXTS;
   cts_each_dim[1] = FWD_REFS - 1;
   cts_each_dim[2] = 2;
-  optimize_entropy_table(
-      &fc.comp_ref[0][0][0], probsfile, 3, cts_each_dim, NULL, 1,
-      "static const aom_prob default_comp_ref_p[REF_CONTEXTS][FWD_REFS - 1]");
   optimize_cdf_table(
       &fc.comp_ref[0][0][0], probsfile, 3, cts_each_dim,
       "static const aom_cdf_prob\n"
@@ -638,297 +507,248 @@ int main(int argc, const char **argv) {
   cts_each_dim[0] = REF_CONTEXTS;
   cts_each_dim[1] = BWD_REFS - 1;
   cts_each_dim[2] = 2;
-  optimize_entropy_table(&fc.comp_bwdref[0][0][0], probsfile, 3, cts_each_dim,
-                         NULL, 1,
-                         "static const aom_prob "
-                         "default_comp_bwdref_p[REF_CONTEXTS][BWD_REFS - 1]");
   optimize_cdf_table(
       &fc.comp_bwdref[0][0][0], probsfile, 3, cts_each_dim,
       "static const aom_cdf_prob\n"
       "default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)]");
-#else
-  /* Reference frame (compound refs) */
-  cts_each_dim[0] = REF_CONTEXTS;
-  cts_each_dim[1] = COMP_REFS - 1;
+
+  /* palette */
+  cts_each_dim[0] = PALATTE_BSIZE_CTXS;
+  cts_each_dim[1] = PALETTE_SIZES;
+  optimize_cdf_table(&fc.palette_y_size[0][0], probsfile, 2, cts_each_dim,
+                     "const aom_cdf_prob default_palette_y_size_cdf"
+                     "[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]");
+
+  cts_each_dim[0] = PALATTE_BSIZE_CTXS;
+  cts_each_dim[1] = PALETTE_SIZES;
+  optimize_cdf_table(&fc.palette_uv_size[0][0], probsfile, 2, cts_each_dim,
+                     "const aom_cdf_prob default_palette_uv_size_cdf"
+                     "[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]");
+
+  cts_each_dim[0] = PALATTE_BSIZE_CTXS;
+  cts_each_dim[1] = PALETTE_Y_MODE_CONTEXTS;
   cts_each_dim[2] = 2;
-  optimize_entropy_table(
-      &fc.comp_ref[0][0][0], probsfile, 3, cts_each_dim, NULL, 1,
-      "static const aom_prob default_comp_ref_p[REF_CONTEXTS]"
-      "[COMP_REFS - 1]");
-  optimize_cdf_table(
-      &fc.comp_ref[0][0][0], probsfile, 3, cts_each_dim,
-      "static const aom_cdf_prob\n"
-      "default_comp_ref_cdf[REF_CONTEXTS][COMP_REFS - 1][CDF_SIZE(2)]");
-#endif  // CONFIG_EXT_REFS
+  optimize_cdf_table(&fc.palette_y_mode[0][0][0], probsfile, 3, cts_each_dim,
+                     "const aom_cdf_prob default_palette_y_mode_cdf"
+                     "[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS]"
+                     "[CDF_SIZE(2)]");
 
-/* Compound single ref inter mode */
-#if CONFIG_COMPOUND_SINGLEREF
-  cts_each_dim[0] = COMP_INTER_MODE_CONTEXTS;
+  cts_each_dim[0] = PALETTE_UV_MODE_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(&fc.comp_inter_mode[0][0], probsfile, 2, cts_each_dim,
-                         NULL, 1,
-                         "static const aom_prob "
-                         "default_comp_inter_mode_p[COMP_INTER_MODE_CONTEXTS]");
-  optimize_cdf_table(&fc.comp_inter_mode[0][0], probsfile, 2, cts_each_dim,
-                     "static const aom_cdf_prob "
-                     "default_comp_inter_mode_cdf[COMP_INTER_MODE_CONTEXTS]["
-                     "CDF_SIZE(2)]");
-#endif
-
-/* Transform size */
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-  cts_each_dim[0] = 2;
-  optimize_entropy_table(&fc.quarter_tx_size[0], probsfile, 1, cts_each_dim,
-                         NULL, 1,
-                         "static const aom_prob default_quarter_tx_size_prob");
-  optimize_cdf_table(
-      &fc.quarter_tx_size[0], probsfile, 1, cts_each_dim,
-      "static const aom_cdf_prob default_quarter_tx_size_cdf[CDF_SIZE(2)]");
-#endif
-#if CONFIG_VAR_TX
+  optimize_cdf_table(&fc.palette_uv_mode[0][0], probsfile, 2, cts_each_dim,
+                     "const aom_cdf_prob default_palette_uv_mode_cdf"
+                     "[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+  cts_each_dim[0] = PALETTE_SIZES;
+  cts_each_dim[1] = PALETTE_COLOR_INDEX_CONTEXTS;
+  cts_each_dim[2] = PALETTE_COLORS;
+  int palette_color_indexes_each_ctx[PALETTE_SIZES] = { 2, 3, 4, 5, 6, 7, 8 };
+  optimize_cdf_table_var_modes_3d(
+      &fc.palette_y_color_index[0][0][0], probsfile, 3, cts_each_dim,
+      palette_color_indexes_each_ctx,
+      "const aom_cdf_prob default_palette_y_color_index_cdf[PALETTE_SIZES]"
+      "[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)]");
+
+  cts_each_dim[0] = PALETTE_SIZES;
+  cts_each_dim[1] = PALETTE_COLOR_INDEX_CONTEXTS;
+  cts_each_dim[2] = PALETTE_COLORS;
+  optimize_cdf_table_var_modes_3d(
+      &fc.palette_uv_color_index[0][0][0], probsfile, 3, cts_each_dim,
+      palette_color_indexes_each_ctx,
+      "const aom_cdf_prob default_palette_uv_color_index_cdf[PALETTE_SIZES]"
+      "[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)]");
+
+  /* Transform size */
   cts_each_dim[0] = TXFM_PARTITION_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.txfm_partition[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob "
-      "default_txfm_partition_probs[TXFM_PARTITION_CONTEXTS]");
   optimize_cdf_table(
       &fc.txfm_partition[0][0], probsfile, 2, cts_each_dim,
       "static const aom_cdf_prob\n"
       "default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)]");
-#endif
 
   /* Skip flag */
   cts_each_dim[0] = SKIP_CONTEXTS;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.skip[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob default_skip_probs[SKIP_CONTEXTS]");
   optimize_cdf_table(&fc.skip[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob "
                      "default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]");
 
-/* intrabc experiment */
-#if CONFIG_INTRABC
+  /* Skip mode flag */
+  cts_each_dim[0] = SKIP_MODE_CONTEXTS;
+  cts_each_dim[1] = 2;
+  optimize_cdf_table(&fc.skip_mode[0][0], probsfile, 2, cts_each_dim,
+                     "static const aom_cdf_prob "
+                     "default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+  /* joint compound flag */
+  cts_each_dim[0] = COMP_INDEX_CONTEXTS;
+  cts_each_dim[1] = 2;
+  optimize_cdf_table(&fc.compound_index[0][0], probsfile, 2, cts_each_dim,
+                     "static const aom_cdf_prob default_compound_idx_cdfs"
+                     "[COMP_INDEX_CONTEXTS][CDF_SIZE(2)]");
+
+  cts_each_dim[0] = COMP_GROUP_IDX_CONTEXTS;
+  cts_each_dim[1] = 2;
+  optimize_cdf_table(&fc.comp_group_idx[0][0], probsfile, 2, cts_each_dim,
+                     "static const aom_cdf_prob default_comp_group_idx_cdfs"
+                     "[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)]");
+
+  /* intrabc */
   cts_each_dim[0] = 2;
-  optimize_entropy_table(&fc.intrabc[0], probsfile, 1, cts_each_dim, NULL, 1,
-                         "INTRABC_PROB_DEFAULT");
   optimize_cdf_table(
       &fc.intrabc[0], probsfile, 1, cts_each_dim,
       "static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)]");
-#endif
 
-  /* delta_q */
-  cts_each_dim[0] = DELTA_Q_PROBS;
-  cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.delta_q[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob default_delta_q_probs[DELTA_Q_PROBS]");
-#if CONFIG_EXT_DELTA_Q
-  cts_each_dim[0] = DELTA_LF_PROBS;
+  /* filter_intra experiment */
+  cts_each_dim[0] = FILTER_INTRA_MODES;
+  optimize_cdf_table(
+      &fc.filter_intra_mode[0], probsfile, 1, cts_each_dim,
+      "static const aom_cdf_prob "
+      "default_filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)]");
+
+  cts_each_dim[0] = BLOCK_SIZES_ALL;
   cts_each_dim[1] = 2;
-  optimize_entropy_table(
-      &fc.delta_lf[0][0], probsfile, 2, cts_each_dim, NULL, 1,
-      "static const aom_prob default_delta_lf_probs[DELTA_LF_PROBS]");
-#endif
-
-/* Transform type */
-#if CONFIG_EXT_TX
-// TODO(yuec): different trees are used depending on selected ext tx set
-#else
-  // TODO(yuec): intra_ext_tx use different trees depending on the context
-  cts_each_dim[0] = EXT_TX_SIZES;
-  cts_each_dim[1] = TX_TYPES;
-  optimize_entropy_table(&fc.inter_ext_tx[0][0], probsfile, 2, cts_each_dim,
-                         av1_ext_tx_tree, 0,
-                         "static const aom_prob default_inter_ext_tx_prob"
-                         "[EXT_TX_SIZES][TX_TYPES - 1]");
-  optimize_cdf_table(&fc.inter_ext_tx[0][0], probsfile, 2, cts_each_dim,
+  optimize_cdf_table(&fc.filter_intra[0][0], probsfile, 2, cts_each_dim,
                      "static const aom_cdf_prob "
-                     "default_inter_ext_tx_prob[EXT_TX_SIZES][CDF_SIZE(TX_"
-                     "TYPES)]");
-#endif
+                     "default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)]");
 
-/* supertx experiment */
-#if CONFIG_SUPERTX
-  cts_each_dim[0] = PARTITION_SUPERTX_CONTEXTS;
-  cts_each_dim[1] = TX_SIZES;
-  cts_each_dim[2] = 2;
-  optimize_entropy_table(
-      &fc.supertx[0][0][0], probsfile, 3, cts_each_dim, NULL, 1,
-      "static const aom_prob\n"
-      "default_supertx_prob[PARTITION_SUPERTX_CONTEXTS][TX_SIZES]");
-  optimize_cdf_table(&fc.supertx[0][0][0], probsfile, 3, cts_each_dim,
-                     "static const aom_cdf_prob "
-                     "default_supertx_cdf[PARTITION_SUPERTX_CONTEXTS][TX_SIZES]"
+  /* restoration type */
+  cts_each_dim[0] = RESTORE_SWITCHABLE_TYPES;
+  optimize_cdf_table(&fc.switchable_restore[0], probsfile, 1, cts_each_dim,
+                     "static const aom_cdf_prob default_switchable_restore_cdf"
+                     "[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)]");
+
+  cts_each_dim[0] = 2;
+  optimize_cdf_table(&fc.wiener_restore[0], probsfile, 1, cts_each_dim,
+                     "static const aom_cdf_prob default_wiener_restore_cdf"
+                     "[CDF_SIZE(2)]");
+
+  cts_each_dim[0] = 2;
+  optimize_cdf_table(&fc.sgrproj_restore[0], probsfile, 1, cts_each_dim,
+                     "static const aom_cdf_prob default_sgrproj_restore_cdf"
                      "[CDF_SIZE(2)]");
-#endif
-
-/* ext_intra experiment */
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-  cts_each_dim[0] = INTRA_FILTERS + 1;
-  cts_each_dim[1] = INTRA_FILTERS;
-  optimize_entropy_table(
-      &fc.intra_filter[0][0], probsfile, 2, cts_each_dim, av1_intra_filter_tree,
-      0,
-      "static const aom_prob\n"
-      "default_intra_filter_probs[INTRA_FILTERS + 1][INTRA_FILTERS - 1]");
-  optimize_cdf_table(&fc.intra_filter[0][0], probsfile, 2, cts_each_dim,
+
+  /* intra tx size */
+  cts_each_dim[0] = MAX_TX_CATS;
+  cts_each_dim[1] = TX_SIZE_CONTEXTS;
+  cts_each_dim[2] = MAX_TX_DEPTH + 1;
+  int intra_tx_sizes_each_ctx[MAX_TX_CATS] = { 2, 3, 3, 3 };
+  optimize_cdf_table_var_modes_3d(
+      &fc.intra_tx_size[0][0][0], probsfile, 3, cts_each_dim,
+      intra_tx_sizes_each_ctx,
+      "static const aom_cdf_prob default_tx_size_cdf"
+      "[MAX_TX_CATS][TX_SIZE_CONTEXTS][CDF_SIZE(MAX_TX_DEPTH + 1)]");
+
+  /* transform coding */
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+  cts_each_dim[1] = TX_SIZES;
+  cts_each_dim[2] = TXB_SKIP_CONTEXTS;
+  cts_each_dim[3] = 2;
+  optimize_cdf_table(&fc.txb_skip[0][0][0][0], probsfile, 4, cts_each_dim,
                      "static const aom_cdf_prob "
-                     "default_intra_filter_cdf[INTRA_FILTERS + "
-                     "1][CDF_SIZE(INTRA_FILTERS)]");
-#endif
-#endif
-
-/* filter_intra experiment */
-#if CONFIG_FILTER_INTRA
-  cts_each_dim[0] = PLANE_TYPES;
-  cts_each_dim[1] = 2;
-  optimize_entropy_table(&fc.filter_intra[0][0], probsfile, 2, cts_each_dim,
-                         NULL, 1,
-                         "static const aom_prob default_filter_intra_probs[2]");
-  optimize_cdf_table(
-      &fc.filter_intra[0][0], probsfile, 2, cts_each_dim,
-      "static const aom_cdf_prob default_filter_intra_cdf[2][CDF_SIZE(2)]");
-#endif
+                     "av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES]"
+                     "[TXB_SKIP_CONTEXTS][CDF_SIZE(2)]");
 
-#if CONFIG_LV_MAP
-  cts_each_dim[0] = TX_SIZES;
-  cts_each_dim[1] = PLANE_TYPES;
-  cts_each_dim[2] = NUM_BASE_LEVELS;
-  cts_each_dim[3] = COEFF_BASE_CONTEXTS;
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+  cts_each_dim[1] = TX_SIZES;
+  cts_each_dim[2] = PLANE_TYPES;
+  cts_each_dim[3] = EOB_COEF_CONTEXTS;
   cts_each_dim[4] = 2;
-  optimize_entropy_table(&fc.coeff_base[0][0][0][0][0], probsfile, 5,
-                         cts_each_dim, NULL, 1,
-                         "static const aom_prob "
-                         "default_coeff_base[TX_SIZES][PLANE_TYPES][NUM_BASE_"
-                         "LEVELS][COEFF_BASE_CONTEXTS]");
-  optimize_cdf_table(&fc.coeff_base[0][0][0][0][0], probsfile, 5, cts_each_dim,
-                     "static const aom_cdf_prob "
-                     "default_coeff_base_cdf[TX_SIZES][PLANE_TYPES][NUM_BASE_"
-                     "LEVELS][COEFF_BASE_CONTEXTS][CDF_SIZE(2)]");
+  optimize_cdf_table(
+      &fc.eob_extra[0][0][0][0][0], probsfile, 5, cts_each_dim,
+      "static const aom_cdf_prob av1_default_eob_extra_cdfs "
+      "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]"
+      "[CDF_SIZE(2)]");
 
-  cts_each_dim[0] = TX_SIZES;
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
   cts_each_dim[1] = PLANE_TYPES;
-  cts_each_dim[2] = SIG_COEF_CONTEXTS;
-  cts_each_dim[3] = 2;
-  optimize_entropy_table(
-      &fc.nz_map[0][0][0][0], probsfile, 4, cts_each_dim, NULL, 1,
-      "static const aom_prob "
-      "default_nz_map[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]");
-  optimize_cdf_table(&fc.nz_map[0][0][0][0], probsfile, 4, cts_each_dim,
-                     "static const aom_cdf_prob "
-                     "default_nz_map_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_"
-                     "CONTEXTS][CDF_SIZE(2)]");
+  cts_each_dim[2] = 2;
+  cts_each_dim[3] = 5;
+  optimize_cdf_table(&fc.eob_multi16[0][0][0][0], probsfile, 4, cts_each_dim,
+                     "static const aom_cdf_prob av1_default_eob_multi16_cdfs"
+                     "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(5)]");
 
-  cts_each_dim[0] = TX_SIZES;
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
   cts_each_dim[1] = PLANE_TYPES;
-  cts_each_dim[2] = EOB_COEF_CONTEXTS;
-  cts_each_dim[3] = 2;
-  optimize_entropy_table(
-      &fc.eob_flag[0][0][0][0], probsfile, 4, cts_each_dim, NULL, 1,
-      "static const aom_prob "
-      "default_eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]");
-  optimize_cdf_table(&fc.eob_flag[0][0][0][0], probsfile, 4, cts_each_dim,
-                     "static const aom_cdf_prob "
-                     "default_eob_flag_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_"
-                     "CONTEXTS][CDF_SIZE(2)]");
+  cts_each_dim[2] = 2;
+  cts_each_dim[3] = 6;
+  optimize_cdf_table(&fc.eob_multi32[0][0][0][0], probsfile, 4, cts_each_dim,
+                     "static const aom_cdf_prob av1_default_eob_multi32_cdfs"
+                     "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(6)]");
 
-  cts_each_dim[0] = TX_SIZES;
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
   cts_each_dim[1] = PLANE_TYPES;
-  cts_each_dim[2] = LEVEL_CONTEXTS;
-  cts_each_dim[3] = 2;
-  optimize_entropy_table(
-      &fc.coeff_lps[0][0][0][0], probsfile, 4, cts_each_dim, NULL, 1,
-      "static const aom_prob "
-      "default_coeff_lps[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]");
-  optimize_cdf_table(&fc.coeff_lps[0][0][0][0], probsfile, 4, cts_each_dim,
-                     "static const aom_cdf_prob "
-                     "default_coeff_lps_cdf[TX_SIZES][PLANE_TYPES][LEVEL_"
-                     "CONTEXTS][CDF_SIZE(2)]");
+  cts_each_dim[2] = 2;
+  cts_each_dim[3] = 7;
+  optimize_cdf_table(&fc.eob_multi64[0][0][0][0], probsfile, 4, cts_each_dim,
+                     "static const aom_cdf_prob av1_default_eob_multi64_cdfs"
+                     "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(7)]");
 
-#if BR_NODE
-  cts_each_dim[0] = TX_SIZES;
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
   cts_each_dim[1] = PLANE_TYPES;
-  cts_each_dim[2] = BASE_RANGE_SETS;
-  cts_each_dim[3] = LEVEL_CONTEXTS;
-  cts_each_dim[4] = 2;
-  optimize_entropy_table(&fc.coeff_br[0][0][0][0][0], probsfile, 5,
-                         cts_each_dim, NULL, 1,
-                         "static const aom_prob "
-                         "default_coeff_br[TX_SIZES][PLANE_TYPES][BASE_RANGE_"
-                         "SETS][LEVEL_CONTEXTS]");
-  optimize_cdf_table(&fc.coeff_br[0][0][0][0][0], probsfile, 5, cts_each_dim,
-                     "static const aom_cdf_prob "
-                     "default_coeff_br_cdf[TX_SIZES][PLANE_TYPES][BASE_RANGE_"
-                     "SETS][LEVEL_CONTEXTS][CDF_SIZE(2)]");
-#endif  // BR_NODE
+  cts_each_dim[2] = 2;
+  cts_each_dim[3] = 8;
+  optimize_cdf_table(&fc.eob_multi128[0][0][0][0], probsfile, 4, cts_each_dim,
+                     "static const aom_cdf_prob av1_default_eob_multi128_cdfs"
+                     "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(8)]");
 
-#if CONFIG_CTX1D
-  cts_each_dim[0] = TX_SIZES;
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
   cts_each_dim[1] = PLANE_TYPES;
-  cts_each_dim[2] = TX_CLASSES;
-  cts_each_dim[3] = 2;
-  optimize_entropy_table(&fc.eob_mode[0][0][0][0], probsfile, 4, cts_each_dim,
-                         NULL, 1,
-                         "static const aom_prob "
-                         "default_eob_mode[TX_SIZES][PLANE_TYPES][TX_CLASSES]");
-  optimize_cdf_table(&fc.eob_mode[0][0][0][0], probsfile, 4, cts_each_dim,
-                     "static const aom_cdf_prob "
-                     "default_eob_mode_cdf[TX_SIZES][PLANE_TYPES][TX_CLASSES]["
-                     "CDF_SIZE(2)]");
+  cts_each_dim[2] = 2;
+  cts_each_dim[3] = 9;
+  optimize_cdf_table(&fc.eob_multi256[0][0][0][0], probsfile, 4, cts_each_dim,
+                     "static const aom_cdf_prob av1_default_eob_multi256_cdfs"
+                     "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(9)]");
 
-  cts_each_dim[0] = TX_SIZES;
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
   cts_each_dim[1] = PLANE_TYPES;
-  cts_each_dim[2] = TX_CLASSES;
-  cts_each_dim[3] = EMPTY_LINE_CONTEXTS;
-  cts_each_dim[4] = 2;
-  optimize_entropy_table(&fc.empty_line[0][0][0][0][0], probsfile, 5,
-                         cts_each_dim, NULL, 1,
-                         "static const aom_prob "
-                         "default_empty_line[TX_SIZES][PLANE_TYPES][TX_CLASSES]"
-                         "[EMPTY_LINE_CONTEXTS]");
-  optimize_cdf_table(&fc.empty_line[0][0][0][0][0], probsfile, 5, cts_each_dim,
-                     "static const aom_cdf_prob "
-                     "default_empty_line_cdf[TX_SIZES][PLANE_TYPES][TX_CLASSES]"
-                     "[EMPTY_LINE_CONTEXTS][CDF_SIZE(2)]");
+  cts_each_dim[2] = 2;
+  cts_each_dim[3] = 10;
+  optimize_cdf_table(&fc.eob_multi512[0][0][0][0], probsfile, 4, cts_each_dim,
+                     "static const aom_cdf_prob av1_default_eob_multi512_cdfs"
+                     "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(10)]");
 
-  cts_each_dim[0] = TX_SIZES;
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
   cts_each_dim[1] = PLANE_TYPES;
-  cts_each_dim[2] = TX_CLASSES;
-  cts_each_dim[3] = HV_EOB_CONTEXTS;
-  cts_each_dim[4] = 2;
-  optimize_entropy_table(
-      &fc.hv_eob[0][0][0][0][0], probsfile, 5, cts_each_dim, NULL, 1,
-      "static const aom_prob "
-      "default_hv_eob[TX_SIZES][PLANE_TYPES][TX_CLASSES][HV_EOB_CONTEXTS]");
-  optimize_cdf_table(&fc.hv_eob[0][0][0][0][0], probsfile, 5, cts_each_dim,
+  cts_each_dim[2] = 2;
+  cts_each_dim[3] = 11;
+  optimize_cdf_table(&fc.eob_multi1024[0][0][0][0], probsfile, 4, cts_each_dim,
+                     "static const aom_cdf_prob av1_default_eob_multi1024_cdfs"
+                     "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(11)]");
+
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+  cts_each_dim[1] = TX_SIZES;
+  cts_each_dim[2] = PLANE_TYPES;
+  cts_each_dim[3] = LEVEL_CONTEXTS;
+  cts_each_dim[4] = BR_CDF_SIZE;
+  optimize_cdf_table(&fc.coeff_lps_multi[0][0][0][0][0], probsfile, 5,
+                     cts_each_dim,
                      "static const aom_cdf_prob "
-                     "default_hv_eob_cdf[TX_SIZES][PLANE_TYPES][TX_CLASSES][HV_"
-                     "EOB_CONTEXTS][CDF_SIZE(2)]");
-#endif  // CONFIG_CTX1D
-#endif  // CONFIG_LV_MAP
-
-/* lgt_from_pred experiment */
-#if CONFIG_LGT_FROM_PRED
-  cts_each_dim[0] = LGT_SIZES;
-  if (LGT_FROM_PRED_INTRA) {
-    cts_each_dim[1] = INTRA_MODES;
-    cts_each_dim[2] = 2;
-    optimize_entropy_table(&fc.intra_lgt[0][0][0], probsfile, 3, cts_each_dim,
-                           NULL, 1,
-                           "static const aom_prob default_intra_lgt_prob"
-                           "[LGT_SIZES][INTRA_MODES][2]");
-  }
-  if (LGT_FROM_PRED_INTER) {
-    cts_each_dim[1] = 2;
-    optimize_entropy_table(&fc.inter_lgt[0][0], probsfile, 2, cts_each_dim,
-                           NULL, 1,
-                           "static const aom_prob default_inter_lgt_prob"
-                           "[LGT_SIZES][2]");
-  }
-#endif  // CONFIG_LGT_FROM_PRED
+                     "av1_default_coeff_lps_multi_cdfs[TOKEN_CDF_Q_CTXS]"
+                     "[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]"
+                     "[CDF_SIZE(BR_CDF_SIZE)]");
+
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+  cts_each_dim[1] = TX_SIZES;
+  cts_each_dim[2] = PLANE_TYPES;
+  cts_each_dim[3] = SIG_COEF_CONTEXTS;
+  cts_each_dim[4] = NUM_BASE_LEVELS + 2;
+  optimize_cdf_table(
+      &fc.coeff_base_multi[0][0][0][0][0], probsfile, 5, cts_each_dim,
+      "static const aom_cdf_prob av1_default_coeff_base_multi_cdfs"
+      "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]"
+      "[CDF_SIZE(NUM_BASE_LEVELS + 2)]");
+
+  cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+  cts_each_dim[1] = TX_SIZES;
+  cts_each_dim[2] = PLANE_TYPES;
+  cts_each_dim[3] = SIG_COEF_CONTEXTS_EOB;
+  cts_each_dim[4] = NUM_BASE_LEVELS + 1;
+  optimize_cdf_table(
+      &fc.coeff_base_eob_multi[0][0][0][0][0], probsfile, 5, cts_each_dim,
+      "static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs"
+      "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB]"
+      "[CDF_SIZE(NUM_BASE_LEVELS + 1)]");
 
   fclose(statsfile);
   fclose(logfile);
diff --git a/third_party/aom/tools/author_first_release.sh b/third_party/aom/tools/author_first_release.sh
deleted file mode 100755
index 7b0b79721..000000000
--- a/third_party/aom/tools/author_first_release.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-##
-## List the release each author first contributed to.
-##
-## Usage: author_first_release.sh [TAGS]
-##
-## If the TAGS arguments are unspecified, all tags reported by `git tag`
-## will be considered.
-##
-tags=${@:-$(git tag)}
-for tag in $tags; do
-  git shortlog -n -e -s $tag |
-      cut -f2- |
-      awk "{print \"${tag#v}\t\"\$0}"
-done | sort -k2  | uniq -f2
diff --git a/third_party/aom/tools/build_inspector.sh b/third_party/aom/tools/build_inspector.sh
deleted file mode 100755
index 25e0de62f..000000000
--- a/third_party/aom/tools/build_inspector.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/sh
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-if ! [ -x "$(command -v emcc)" ] \
-    || ! [ -x "$(command -v emconfigure)" ] \
-    || ! [ -x "$(command -v emmake)" ]; then
-  cat << EOF >& 2
-Emscripten SDK is not available (emcc, emconfigure or emmake is missing).
-Install it from
-https://kripken.github.io/emscripten-site/docs/getting_started/downloads.html
-and try again.
-EOF
-  exit 1
-fi
-
-echo 'Building JS Inspector'
-if [ ! -d ".inspect" ]; then
-  mkdir .inspect
-  cd .inspect && emconfigure ../../configure \
-    --disable-multithread \
-    --disable-runtime-cpu-detect \
-    --target=generic-gnu \
-    --enable-accounting \
-    --disable-docs \
-    --disable-unit-tests \
-    --enable-inspection \
-    --enable-highbitdepth \
-    --extra-cflags="-D_POSIX_SOURCE"
-  cd ..
-fi
-
-cd .inspect
-emmake make -j 8
-cp examples/inspect inspect.bc
-emcc -O3 inspect.bc -o inspect.js \
-  -s TOTAL_MEMORY=134217728 \
-  -s MODULARIZE=1 \
-  -s EXPORT_NAME="'DecoderModule'" \
-  --post-js "../inspect-post.js" \
-  --memory-init-file 0
-cp inspect.js ../inspect.js
diff --git a/third_party/aom/tools/dump_obu.cc b/third_party/aom/tools/dump_obu.cc
new file mode 100644
index 000000000..30ee5e7a1
--- /dev/null
+++ b/third_party/aom/tools/dump_obu.cc
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <memory>
+#include <string>
+
+#include "config/aom_config.h"
+
+#include "common/ivfdec.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+#include "common/webmdec.h"
+#include "tools/obu_parser.h"
+
+namespace {
+
+const size_t kInitialBufferSize = 100 * 1024;
+
+struct InputContext {
+  InputContext() = default;
+  ~InputContext() { free(unit_buffer); }
+
+  void Init() {
+    memset(avx_ctx, 0, sizeof(*avx_ctx));
+    memset(obu_ctx, 0, sizeof(*obu_ctx));
+    obu_ctx->avx_ctx = avx_ctx;
+#if CONFIG_WEBM_IO
+    memset(webm_ctx, 0, sizeof(*webm_ctx));
+#endif
+  }
+
+  AvxInputContext *avx_ctx = nullptr;
+  ObuDecInputContext *obu_ctx = nullptr;
+#if CONFIG_WEBM_IO
+  WebmInputContext *webm_ctx = nullptr;
+#endif
+  uint8_t *unit_buffer = nullptr;
+  size_t unit_buffer_size = 0;
+};
+
+void PrintUsage() {
+  printf("Libaom OBU dump.\nUsage: dump_obu <input_file>\n");
+}
+
+VideoFileType GetFileType(InputContext *ctx) {
+  if (file_is_ivf(ctx->avx_ctx)) return FILE_TYPE_IVF;
+  if (file_is_obu(ctx->obu_ctx)) return FILE_TYPE_OBU;
+#if CONFIG_WEBM_IO
+  if (file_is_webm(ctx->webm_ctx, ctx->avx_ctx)) return FILE_TYPE_WEBM;
+#endif
+  return FILE_TYPE_RAW;
+}
+
+bool ReadTemporalUnit(InputContext *ctx, size_t *unit_size) {
+  const VideoFileType file_type = ctx->avx_ctx->file_type;
+  switch (file_type) {
+    case FILE_TYPE_IVF: {
+      if (ivf_read_frame(ctx->avx_ctx->file, &ctx->unit_buffer, unit_size,
+                         &ctx->unit_buffer_size, NULL)) {
+        return false;
+      }
+      break;
+    }
+    case FILE_TYPE_OBU: {
+      if (obudec_read_temporal_unit(ctx->obu_ctx, &ctx->unit_buffer, unit_size,
+                                    &ctx->unit_buffer_size)) {
+        return false;
+      }
+      break;
+    }
+#if CONFIG_WEBM_IO
+    case FILE_TYPE_WEBM: {
+      if (webm_read_frame(ctx->webm_ctx, &ctx->unit_buffer, unit_size,
+                          &ctx->unit_buffer_size)) {
+        return false;
+      }
+      break;
+    }
+#endif
+    default:
+      // TODO(tomfinegan): Abuse FILE_TYPE_RAW for AV1/OBU elementary streams?
+      fprintf(stderr, "Error: Unsupported file type.\n");
+      return false;
+  }
+
+  return true;
+}
+
+}  // namespace
+
+int main(int argc, const char *argv[]) {
+  // TODO(tomfinegan): Could do with some params for verbosity.
+  if (argc < 2) {
+    PrintUsage();
+    return EXIT_SUCCESS;
+  }
+
+  const std::string filename = argv[1];
+
+  using FilePtr = std::unique_ptr<FILE, decltype(&fclose)>;
+  FilePtr input_file(fopen(filename.c_str(), "rb"), &fclose);
+  if (input_file.get() == nullptr) {
+    input_file.release();
+    fprintf(stderr, "Error: Cannot open input file.\n");
+    return EXIT_FAILURE;
+  }
+
+  AvxInputContext avx_ctx;
+  InputContext input_ctx;
+  input_ctx.avx_ctx = &avx_ctx;
+  ObuDecInputContext obu_ctx;
+  input_ctx.obu_ctx = &obu_ctx;
+#if CONFIG_WEBM_IO
+  WebmInputContext webm_ctx;
+  input_ctx.webm_ctx = &webm_ctx;
+#endif
+
+  input_ctx.Init();
+  avx_ctx.file = input_file.get();
+  avx_ctx.file_type = GetFileType(&input_ctx);
+
+  // Note: the reader utilities will realloc the buffer using realloc() etc.
+  // Can't have nice things like unique_ptr wrappers with that type of
+  // behavior underneath the function calls.
+  input_ctx.unit_buffer =
+      reinterpret_cast<uint8_t *>(calloc(kInitialBufferSize, 1));
+  if (!input_ctx.unit_buffer) {
+    fprintf(stderr, "Error: No memory, can't alloc input buffer.\n");
+    return EXIT_FAILURE;
+  }
+  input_ctx.unit_buffer_size = kInitialBufferSize;
+
+  size_t unit_size = 0;
+  int unit_number = 0;
+  int64_t obu_overhead_bytes_total = 0;
+  while (ReadTemporalUnit(&input_ctx, &unit_size)) {
+    printf("Temporal unit %d\n", unit_number);
+
+    int obu_overhead_current_unit = 0;
+    if (!aom_tools::DumpObu(input_ctx.unit_buffer, static_cast<int>(unit_size),
+                            &obu_overhead_current_unit)) {
+      fprintf(stderr, "Error: Temporal Unit parse failed on unit number %d.\n",
+              unit_number);
+      return EXIT_FAILURE;
+    }
+    printf("  OBU overhead:    %d\n", obu_overhead_current_unit);
+    ++unit_number;
+    obu_overhead_bytes_total += obu_overhead_current_unit;
+  }
+
+  printf("File total OBU overhead: %" PRId64 "\n", obu_overhead_bytes_total);
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/tools/ftfy.sh b/third_party/aom/tools/ftfy.sh
deleted file mode 100755
index 315da1af5..000000000
--- a/third_party/aom/tools/ftfy.sh
+++ /dev/null
@@ -1,158 +0,0 @@
-#!/bin/sh
-self="$0"
-dirname_self=$(dirname "$self")
-
-usage() {
-  cat <<EOF >&2
-Usage: $self [option]
-
-This script applies a whitespace transformation to the commit at HEAD. If no
-options are given, then the modified files are left in the working tree.
-
-Options:
-  -h, --help     Shows this message
-  -n, --dry-run  Shows a diff of the changes to be made.
-  --amend        Squashes the changes into the commit at HEAD
-                     This option will also reformat the commit message.
-  --commit       Creates a new commit containing only the whitespace changes
-  --msg-only     Reformat the commit message only, ignore the patch itself.
-
-EOF
-  rm -f ${CLEAN_FILES}
-  exit 1
-}
-
-
-log() {
-  echo "${self##*/}: $@" >&2
-}
-
-
-aom_style() {
-  for f; do
-    case "$f" in
-      *.h|*.c|*.cc)
-        clang-format -i --style=file "$f"
-        ;;
-    esac
-  done
-}
-
-
-apply() {
-  [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1"
-}
-
-
-commit() {
-  LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}')
-  if [ -z "$LAST_CHANGEID" ]; then
-    log "HEAD doesn't have a Change-Id, unable to generate a new commit"
-    exit 1
-  fi
-
-  # Build a deterministic Change-Id from the parent's
-  NEW_CHANGEID=${LAST_CHANGEID}-styled
-  NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin)
-
-  # Commit, preserving authorship from the parent commit.
-  git commit -a -C HEAD > /dev/null
-  git commit --amend -F- << EOF
-Cosmetic: Fix whitespace in change ${LAST_CHANGEID:0:9}
-
-Change-Id: ${NEW_CHANGEID}
-EOF
-}
-
-
-show_commit_msg_diff() {
-  if [ $DIFF_MSG_RESULT -ne 0 ]; then
-    log "Modified commit message:"
-    diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3
-  fi
-}
-
-
-amend() {
-  show_commit_msg_diff
-  if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then
-    git commit -a --amend -F "$NEW_COMMIT_MSG"
-  fi
-}
-
-
-diff_msg() {
-  git log -1 --format=%B > "$ORIG_COMMIT_MSG"
-  "${dirname_self}"/wrap-commit-msg.py \
-      < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG"
-  cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG"
-  DIFF_MSG_RESULT=$?
-}
-
-
-# Temporary files
-ORIG_DIFF=orig.diff.$$
-MODIFIED_DIFF=modified.diff.$$
-FINAL_DIFF=final.diff.$$
-ORIG_COMMIT_MSG=orig.commit-msg.$$
-NEW_COMMIT_MSG=new.commit-msg.$$
-CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}"
-CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}"
-
-# Preconditions
-[ $# -lt 2 ] || usage
-
-if ! clang-format -version >/dev/null 2>&1; then
-  log "clang-format not found"
-  exit 1
-fi
-
-if ! git diff --quiet HEAD; then
-  log "Working tree is dirty, commit your changes first"
-  exit 1
-fi
-
-# Need to be in the root
-cd "$(git rev-parse --show-toplevel)"
-
-# Collect the original diff
-git show > "${ORIG_DIFF}"
-
-# Apply the style guide on new and modified files and collect its diff
-for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do
-  case "$f" in
-    third_party/*) continue;;
-  esac
-  aom_style "$f"
-done
-git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}"
-
-# Intersect the two diffs
-"${dirname_self}"/intersect-diffs.py \
-    "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}"
-INTERSECT_RESULT=$?
-git reset --hard >/dev/null
-
-# Fixup the commit message
-diff_msg
-
-# Handle options
-if [ -n "$1" ]; then
-  case "$1" in
-    -h|--help) usage;;
-    -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;;
-    --commit) apply "${FINAL_DIFF}"; commit;;
-    --amend) apply "${FINAL_DIFF}"; amend;;
-    --msg-only) amend;;
-    *) usage;;
-  esac
-else
-  apply "${FINAL_DIFF}"
-  if ! git diff --quiet; then
-    log "Formatting changes applied, verify and commit."
-    log "See also: http://www.webmproject.org/code/contribute/conventions/"
-    git diff --stat
-  fi
-fi
-
-rm -f ${CLEAN_FILES}
diff --git a/third_party/aom/tools/obu_parser.cc b/third_party/aom/tools/obu_parser.cc
new file mode 100644
index 000000000..2d0f5b27c
--- /dev/null
+++ b/third_party/aom/tools/obu_parser.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <string>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem_ops.h"
+#include "av1/decoder/obu.h"
+#include "tools/obu_parser.h"
+
+namespace aom_tools {
+
+// Basic OBU syntax
+// 8 bits: Header
+//   7
+//     forbidden bit
+//   6,5,4,3
+//     type bits
+//   2
+//     extension flag bit
+//   1
+//     has size field bit
+//   0
+//     reserved bit
+const uint32_t kObuForbiddenBitMask = 0x1;
+const uint32_t kObuForbiddenBitShift = 7;
+const uint32_t kObuTypeBitsMask = 0xF;
+const uint32_t kObuTypeBitsShift = 3;
+const uint32_t kObuExtensionFlagBitMask = 0x1;
+const uint32_t kObuExtensionFlagBitShift = 2;
+const uint32_t kObuHasSizeFieldBitMask = 0x1;
+const uint32_t kObuHasSizeFieldBitShift = 1;
+
+// When extension flag bit is set:
+// 8 bits: extension header
+// 7,6,5
+//   temporal ID
+// 4,3
+//   spatial ID
+// 2,1,0
+//   reserved bits
+const uint32_t kObuExtTemporalIdBitsMask = 0x7;
+const uint32_t kObuExtTemporalIdBitsShift = 5;
+const uint32_t kObuExtSpatialIdBitsMask = 0x3;
+const uint32_t kObuExtSpatialIdBitsShift = 3;
+
+bool ValidObuType(int obu_type) {
+  switch (obu_type) {
+    case OBU_SEQUENCE_HEADER:
+    case OBU_TEMPORAL_DELIMITER:
+    case OBU_FRAME_HEADER:
+    case OBU_TILE_GROUP:
+    case OBU_METADATA:
+    case OBU_FRAME:
+    case OBU_REDUNDANT_FRAME_HEADER:
+    case OBU_TILE_LIST:
+    case OBU_PADDING: return true;
+  }
+  return false;
+}
+
+bool ParseObuHeader(uint8_t obu_header_byte, ObuHeader *obu_header) {
+  const int forbidden_bit =
+      (obu_header_byte >> kObuForbiddenBitShift) & kObuForbiddenBitMask;
+  if (forbidden_bit) {
+    fprintf(stderr, "Invalid OBU, forbidden bit set.\n");
+    return false;
+  }
+
+  obu_header->type = static_cast<OBU_TYPE>(
+      (obu_header_byte >> kObuTypeBitsShift) & kObuTypeBitsMask);
+  if (!ValidObuType(obu_header->type)) {
+    fprintf(stderr, "Invalid OBU type: %d.\n", obu_header->type);
+    return false;
+  }
+
+  obu_header->has_extension =
+      (obu_header_byte >> kObuExtensionFlagBitShift) & kObuExtensionFlagBitMask;
+  obu_header->has_size_field =
+      (obu_header_byte >> kObuHasSizeFieldBitShift) & kObuHasSizeFieldBitMask;
+  return true;
+}
+
+bool ParseObuExtensionHeader(uint8_t ext_header_byte, ObuHeader *obu_header) {
+  obu_header->temporal_layer_id =
+      (ext_header_byte >> kObuExtTemporalIdBitsShift) &
+      kObuExtTemporalIdBitsMask;
+  obu_header->spatial_layer_id =
+      (ext_header_byte >> kObuExtSpatialIdBitsShift) & kObuExtSpatialIdBitsMask;
+
+  return true;
+}
+
+void PrintObuHeader(const ObuHeader *header) {
+  printf(
+      "  OBU type:        %s\n"
+      "      extension:   %s\n",
+      aom_obu_type_to_string(static_cast<OBU_TYPE>(header->type)),
+      header->has_extension ? "yes" : "no");
+  if (header->has_extension) {
+    printf(
+        "      temporal_id: %d\n"
+        "      spatial_id:  %d\n",
+        header->temporal_layer_id, header->temporal_layer_id);
+  }
+}
+
+bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes) {
+  const int kObuHeaderSizeBytes = 1;
+  const int kMinimumBytesRequired = 1 + kObuHeaderSizeBytes;
+  int consumed = 0;
+  int obu_overhead = 0;
+  ObuHeader obu_header;
+  while (consumed < length) {
+    const int remaining = length - consumed;
+    if (remaining < kMinimumBytesRequired) {
+      fprintf(stderr,
+              "OBU parse error. Did not consume all data, %d bytes remain.\n",
+              remaining);
+      return false;
+    }
+
+    int obu_header_size = 0;
+
+    memset(&obu_header, 0, sizeof(obu_header));
+    const uint8_t obu_header_byte = *(data + consumed);
+    if (!ParseObuHeader(obu_header_byte, &obu_header)) {
+      fprintf(stderr, "OBU parsing failed at offset %d.\n", consumed);
+      return false;
+    }
+
+    ++obu_overhead;
+    ++obu_header_size;
+
+    if (obu_header.has_extension) {
+      const uint8_t obu_ext_header_byte =
+          *(data + consumed + kObuHeaderSizeBytes);
+      if (!ParseObuExtensionHeader(obu_ext_header_byte, &obu_header)) {
+        fprintf(stderr, "OBU extension parsing failed at offset %d.\n",
+                consumed + kObuHeaderSizeBytes);
+        return false;
+      }
+
+      ++obu_overhead;
+      ++obu_header_size;
+    }
+
+    PrintObuHeader(&obu_header);
+
+    uint64_t obu_size = 0;
+    size_t length_field_size = 0;
+    if (aom_uleb_decode(data + consumed + obu_header_size,
+                        remaining - obu_header_size, &obu_size,
+                        &length_field_size) != 0) {
+      fprintf(stderr, "OBU size parsing failed at offset %d.\n",
+              consumed + obu_header_size);
+      return false;
+    }
+    int current_obu_length = static_cast<int>(obu_size);
+    if (obu_header_size + static_cast<int>(length_field_size) +
+            current_obu_length >
+        remaining) {
+      fprintf(stderr, "OBU parsing failed: not enough OBU data.\n");
+      return false;
+    }
+    consumed += obu_header_size + static_cast<int>(length_field_size) +
+                current_obu_length;
+    printf("      length:      %d\n",
+           static_cast<int>(obu_header_size + length_field_size +
+                            current_obu_length));
+  }
+
+  if (obu_overhead_bytes != nullptr) *obu_overhead_bytes = obu_overhead;
+  printf("  TU size: %d\n", consumed);
+
+  return true;
+}
+
+}  // namespace aom_tools
diff --git a/third_party/aom/tools/obu_parser.h b/third_party/aom/tools/obu_parser.h
new file mode 100644
index 000000000..86e7c4581
--- /dev/null
+++ b/third_party/aom/tools/obu_parser.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef TOOLS_OBU_PARSER_H_
+#define TOOLS_OBU_PARSER_H_
+
+#include <cstdint>
+
+namespace aom_tools {
+
+// Print information obtained from OBU(s) in data until data is exhausted or an
+// error occurs. Returns true when all data is consumed successfully, and
+// optionally reports OBU storage overhead via obu_overhead_bytes when the
+// pointer is non-null.
+bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes);
+
+}  // namespace aom_tools
+
+#endif  // TOOLS_OBU_PARSER_H_
diff --git a/third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc b/third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc
new file mode 100644
index 000000000..7c5400b91
--- /dev/null
+++ b/third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+
+#include "tools/txfm_analyzer/txfm_graph.h"
+
+typedef enum CODE_TYPE {
+  CODE_TYPE_C,
+  CODE_TYPE_SSE2,
+  CODE_TYPE_SSE4_1
+} CODE_TYPE;
+
+int get_cos_idx(double value, int mod) {
+  return round(acos(fabs(value)) / PI * mod);
+}
+
+char *cos_text_arr(double value, int mod, char *text, int size) {
+  int num = get_cos_idx(value, mod);
+  if (value < 0) {
+    snprintf(text, size, "-cospi[%2d]", num);
+  } else {
+    snprintf(text, size, " cospi[%2d]", num);
+  }
+
+  if (num == 0)
+    printf("v: %f -> %d/%d v==-1 is %d\n", value, num, mod, value == -1);
+
+  return text;
+}
+
+char *cos_text_sse2(double w0, double w1, int mod, char *text, int size) {
+  int idx0 = get_cos_idx(w0, mod);
+  int idx1 = get_cos_idx(w1, mod);
+  char p[] = "p";
+  char n[] = "m";
+  char *sgn0 = w0 < 0 ? n : p;
+  char *sgn1 = w1 < 0 ? n : p;
+  snprintf(text, size, "cospi_%s%02d_%s%02d", sgn0, idx0, sgn1, idx1);
+  return text;
+}
+
+char *cos_text_sse4_1(double w, int mod, char *text, int size) {
+  int idx = get_cos_idx(w, mod);
+  char p[] = "p";
+  char n[] = "m";
+  char *sgn = w < 0 ? n : p;
+  snprintf(text, size, "cospi_%s%02d", sgn, idx);
+  return text;
+}
+
+void node_to_code_c(Node *node, const char *buf0, const char *buf1) {
+  int cnt = 0;
+  for (int i = 0; i < 2; i++) {
+    if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++;
+  }
+  if (cnt == 2) {
+    int cnt2 = 0;
+    printf("  %s[%d] =", buf1, node->nodeIdx);
+    for (int i = 0; i < 2; i++) {
+      if (fabs(node->inWeight[i]) == 1) {
+        cnt2++;
+      }
+    }
+    if (cnt2 == 2) {
+      printf(" apply_value(");
+    }
+    int cnt1 = 0;
+    for (int i = 0; i < 2; i++) {
+      if (node->inWeight[i] == 1) {
+        if (cnt1 > 0)
+          printf(" + %s[%d]", buf0, node->inNodeIdx[i]);
+        else
+          printf(" %s[%d]", buf0, node->inNodeIdx[i]);
+        cnt1++;
+      } else if (node->inWeight[i] == -1) {
+        if (cnt1 > 0)
+          printf(" - %s[%d]", buf0, node->inNodeIdx[i]);
+        else
+          printf("-%s[%d]", buf0, node->inNodeIdx[i]);
+        cnt1++;
+      }
+    }
+    if (cnt2 == 2) {
+      printf(", stage_range[stage])");
+    }
+    printf(";\n");
+  } else {
+    char w0[100];
+    char w1[100];
+    printf(
+        "  %s[%d] = half_btf(%s, %s[%d], %s, %s[%d], "
+        "cos_bit);\n",
+        buf1, node->nodeIdx, cos_text_arr(node->inWeight[0], COS_MOD, w0, 100),
+        buf0, node->inNodeIdx[0],
+        cos_text_arr(node->inWeight[1], COS_MOD, w1, 100), buf0,
+        node->inNodeIdx[1]);
+  }
+}
+
+void gen_code_c(Node *node, int stage_num, int node_num, TYPE_TXFM type) {
+  char *fun_name = new char[100];
+  get_fun_name(fun_name, 100, type, node_num);
+
+  printf("\n");
+  printf(
+      "void av1_%s(const int32_t *input, int32_t *output, int8_t cos_bit, "
+      "const int8_t* stage_range) "
+      "{\n",
+      fun_name);
+  printf("  assert(output != input);\n");
+  printf("  const int32_t size = %d;\n", node_num);
+  printf("  const int32_t *cospi = cospi_arr(cos_bit);\n");
+  printf("\n");
+
+  printf("  int32_t stage = 0;\n");
+  printf("  int32_t *bf0, *bf1;\n");
+  printf("  int32_t step[%d];\n", node_num);
+
+  const char *buf0 = "bf0";
+  const char *buf1 = "bf1";
+  const char *input = "input";
+
+  int si = 0;
+  printf("\n");
+  printf("  // stage %d;\n", si);
+  printf("  apply_range(stage, input, %s, size, stage_range[stage]);\n", input);
+
+  si = 1;
+  printf("\n");
+  printf("  // stage %d;\n", si);
+  printf("  stage++;\n");
+  if (si % 2 == (stage_num - 1) % 2) {
+    printf("  %s = output;\n", buf1);
+  } else {
+    printf("  %s = step;\n", buf1);
+  }
+
+  for (int ni = 0; ni < node_num; ni++) {
+    int idx = get_idx(si, ni, node_num);
+    node_to_code_c(node + idx, input, buf1);
+  }
+
+  printf("  range_check_buf(stage, input, bf1, size, stage_range[stage]);\n");
+
+  for (int si = 2; si < stage_num; si++) {
+    printf("\n");
+    printf("  // stage %d\n", si);
+    printf("  stage++;\n");
+    if (si % 2 == (stage_num - 1) % 2) {
+      printf("  %s = step;\n", buf0);
+      printf("  %s = output;\n", buf1);
+    } else {
+      printf("  %s = output;\n", buf0);
+      printf("  %s = step;\n", buf1);
+    }
+
+    // computation code
+    for (int ni = 0; ni < node_num; ni++) {
+      int idx = get_idx(si, ni, node_num);
+      node_to_code_c(node + idx, buf0, buf1);
+    }
+
+    if (si != stage_num - 1) {
+      printf(
+          "  range_check_buf(stage, input, bf1, size, stage_range[stage]);\n");
+    }
+  }
+  printf("  apply_range(stage, input, output, size, stage_range[stage]);\n");
+  printf("}\n");
+}
+
+void single_node_to_code_sse2(Node *node, const char *buf0, const char *buf1) {
+  printf("  %s[%2d] =", buf1, node->nodeIdx);
+  if (node->inWeight[0] == 1 && node->inWeight[1] == 1) {
+    printf(" _mm_adds_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0,
+           node->inNodeIdx[1]);
+  } else if (node->inWeight[0] == 1 && node->inWeight[1] == -1) {
+    printf(" _mm_subs_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0,
+           node->inNodeIdx[1]);
+  } else if (node->inWeight[0] == -1 && node->inWeight[1] == 1) {
+    printf(" _mm_subs_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[1], buf0,
+           node->inNodeIdx[0]);
+  } else if (node->inWeight[0] == 1 && node->inWeight[1] == 0) {
+    printf(" %s[%d]", buf0, node->inNodeIdx[0]);
+  } else if (node->inWeight[0] == 0 && node->inWeight[1] == 1) {
+    printf(" %s[%d]", buf0, node->inNodeIdx[1]);
+  } else if (node->inWeight[0] == -1 && node->inWeight[1] == 0) {
+    printf(" _mm_subs_epi16(__zero, %s[%d])", buf0, node->inNodeIdx[0]);
+  } else if (node->inWeight[0] == 0 && node->inWeight[1] == -1) {
+    printf(" _mm_subs_epi16(__zero, %s[%d])", buf0, node->inNodeIdx[1]);
+  }
+  printf(";\n");
+}
+
+void pair_node_to_code_sse2(Node *node, Node *partnerNode, const char *buf0,
+                            const char *buf1) {
+  char temp0[100];
+  char temp1[100];
+  // btf_16_sse2_type0(w0, w1, in0, in1, out0, out1)
+  if (node->inNodeIdx[0] != partnerNode->inNodeIdx[0])
+    printf("  btf_16_sse2(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d]);\n",
+           cos_text_sse2(node->inWeight[0], node->inWeight[1], COS_MOD, temp0,
+                         100),
+           cos_text_sse2(partnerNode->inWeight[1], partnerNode->inWeight[0],
+                         COS_MOD, temp1, 100),
+           buf0, node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1,
+           node->nodeIdx, buf1, partnerNode->nodeIdx);
+  else
+    printf("  btf_16_sse2(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d]);\n",
+           cos_text_sse2(node->inWeight[0], node->inWeight[1], COS_MOD, temp0,
+                         100),
+           cos_text_sse2(partnerNode->inWeight[0], partnerNode->inWeight[1],
+                         COS_MOD, temp1, 100),
+           buf0, node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1,
+           node->nodeIdx, buf1, partnerNode->nodeIdx);
+}
+
+Node *get_partner_node(Node *node) {
+  int diff = node->inNode[1]->nodeIdx - node->nodeIdx;
+  return node + diff;
+}
+
+void node_to_code_sse2(Node *node, const char *buf0, const char *buf1) {
+  int cnt = 0;
+  int cnt1 = 0;
+  if (node->visited == 0) {
+    node->visited = 1;
+    for (int i = 0; i < 2; i++) {
+      if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++;
+      if (fabs(node->inWeight[i]) == 1) cnt1++;
+    }
+    if (cnt == 2) {
+      if (cnt1 == 2) {
+        // has a partner
+        Node *partnerNode = get_partner_node(node);
+        partnerNode->visited = 1;
+        single_node_to_code_sse2(node, buf0, buf1);
+        single_node_to_code_sse2(partnerNode, buf0, buf1);
+      } else {
+        single_node_to_code_sse2(node, buf0, buf1);
+      }
+    } else {
+      Node *partnerNode = get_partner_node(node);
+      partnerNode->visited = 1;
+      pair_node_to_code_sse2(node, partnerNode, buf0, buf1);
+    }
+  }
+}
+
+void gen_cospi_list_sse2(Node *node, int stage_num, int node_num) {
+  int visited[65][65][2][2];
+  memset(visited, 0, sizeof(visited));
+  char text[100];
+  char text1[100];
+  char text2[100];
+  int size = 100;
+  printf("\n");
+  for (int si = 1; si < stage_num; si++) {
+    for (int ni = 0; ni < node_num; ni++) {
+      int idx = get_idx(si, ni, node_num);
+      int cnt = 0;
+      Node *node0 = node + idx;
+      if (node0->visited == 0) {
+        node0->visited = 1;
+        for (int i = 0; i < 2; i++) {
+          if (fabs(node0->inWeight[i]) == 1 || fabs(node0->inWeight[i]) == 0)
+            cnt++;
+        }
+        if (cnt != 2) {
+          {
+            double w0 = node0->inWeight[0];
+            double w1 = node0->inWeight[1];
+            int idx0 = get_cos_idx(w0, COS_MOD);
+            int idx1 = get_cos_idx(w1, COS_MOD);
+            int sgn0 = w0 < 0 ? 1 : 0;
+            int sgn1 = w1 < 0 ? 1 : 0;
+
+            if (!visited[idx0][idx1][sgn0][sgn1]) {
+              visited[idx0][idx1][sgn0][sgn1] = 1;
+              printf("  __m128i %s = pair_set_epi16(%s, %s);\n",
+                     cos_text_sse2(w0, w1, COS_MOD, text, size),
+                     cos_text_arr(w0, COS_MOD, text1, size),
+                     cos_text_arr(w1, COS_MOD, text2, size));
+            }
+          }
+          Node *node1 = get_partner_node(node0);
+          node1->visited = 1;
+          if (node1->inNode[0]->nodeIdx != node0->inNode[0]->nodeIdx) {
+            double w0 = node1->inWeight[0];
+            double w1 = node1->inWeight[1];
+            int idx0 = get_cos_idx(w0, COS_MOD);
+            int idx1 = get_cos_idx(w1, COS_MOD);
+            int sgn0 = w0 < 0 ? 1 : 0;
+            int sgn1 = w1 < 0 ? 1 : 0;
+
+            if (!visited[idx1][idx0][sgn1][sgn0]) {
+              visited[idx1][idx0][sgn1][sgn0] = 1;
+              printf("  __m128i %s = pair_set_epi16(%s, %s);\n",
+                     cos_text_sse2(w1, w0, COS_MOD, text, size),
+                     cos_text_arr(w1, COS_MOD, text1, size),
+                     cos_text_arr(w0, COS_MOD, text2, size));
+            }
+          } else {
+            double w0 = node1->inWeight[0];
+            double w1 = node1->inWeight[1];
+            int idx0 = get_cos_idx(w0, COS_MOD);
+            int idx1 = get_cos_idx(w1, COS_MOD);
+            int sgn0 = w0 < 0 ? 1 : 0;
+            int sgn1 = w1 < 0 ? 1 : 0;
+
+            if (!visited[idx0][idx1][sgn0][sgn1]) {
+              visited[idx0][idx1][sgn0][sgn1] = 1;
+              printf("  __m128i %s = pair_set_epi16(%s, %s);\n",
+                     cos_text_sse2(w0, w1, COS_MOD, text, size),
+                     cos_text_arr(w0, COS_MOD, text1, size),
+                     cos_text_arr(w1, COS_MOD, text2, size));
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void gen_code_sse2(Node *node, int stage_num, int node_num, TYPE_TXFM type) {
+  char *fun_name = new char[100];
+  get_fun_name(fun_name, 100, type, node_num);
+
+  printf("\n");
+  printf(
+      "void %s_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) "
+      "{\n",
+      fun_name);
+
+  printf("  const int32_t* cospi = cospi_arr(cos_bit);\n");
+  printf("  const __m128i __zero = _mm_setzero_si128();\n");
+  printf("  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));\n");
+
+  graph_reset_visited(node, stage_num, node_num);
+  gen_cospi_list_sse2(node, stage_num, node_num);
+  graph_reset_visited(node, stage_num, node_num);
+  for (int si = 1; si < stage_num; si++) {
+    char in[100];
+    char out[100];
+    printf("\n");
+    printf("  // stage %d\n", si);
+    if (si == 1)
+      snprintf(in, 100, "%s", "input");
+    else
+      snprintf(in, 100, "x%d", si - 1);
+    if (si == stage_num - 1) {
+      snprintf(out, 100, "%s", "output");
+    } else {
+      snprintf(out, 100, "x%d", si);
+      printf("  __m128i %s[%d];\n", out, node_num);
+    }
+    // computation code
+    for (int ni = 0; ni < node_num; ni++) {
+      int idx = get_idx(si, ni, node_num);
+      node_to_code_sse2(node + idx, in, out);
+    }
+  }
+
+  printf("}\n");
+}
+void gen_cospi_list_sse4_1(Node *node, int stage_num, int node_num) {
+  int visited[65][2];
+  memset(visited, 0, sizeof(visited));
+  char text[100];
+  char text1[100];
+  int size = 100;
+  printf("\n");
+  for (int si = 1; si < stage_num; si++) {
+    for (int ni = 0; ni < node_num; ni++) {
+      int idx = get_idx(si, ni, node_num);
+      Node *node0 = node + idx;
+      if (node0->visited == 0) {
+        int cnt = 0;
+        node0->visited = 1;
+        for (int i = 0; i < 2; i++) {
+          if (fabs(node0->inWeight[i]) == 1 || fabs(node0->inWeight[i]) == 0)
+            cnt++;
+        }
+        if (cnt != 2) {
+          for (int i = 0; i < 2; i++) {
+            if (fabs(node0->inWeight[i]) != 1 &&
+                fabs(node0->inWeight[i]) != 0) {
+              double w = node0->inWeight[i];
+              int idx = get_cos_idx(w, COS_MOD);
+              int sgn = w < 0 ? 1 : 0;
+
+              if (!visited[idx][sgn]) {
+                visited[idx][sgn] = 1;
+                printf("  __m128i %s = _mm_set1_epi32(%s);\n",
+                       cos_text_sse4_1(w, COS_MOD, text, size),
+                       cos_text_arr(w, COS_MOD, text1, size));
+              }
+            }
+          }
+          Node *node1 = get_partner_node(node0);
+          node1->visited = 1;
+        }
+      }
+    }
+  }
+}
+
+void single_node_to_code_sse4_1(Node *node, const char *buf0,
+                                const char *buf1) {
+  printf("  %s[%2d] =", buf1, node->nodeIdx);
+  if (node->inWeight[0] == 1 && node->inWeight[1] == 1) {
+    printf(" _mm_add_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0,
+           node->inNodeIdx[1]);
+  } else if (node->inWeight[0] == 1 && node->inWeight[1] == -1) {
+    printf(" _mm_sub_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0,
+           node->inNodeIdx[1]);
+  } else if (node->inWeight[0] == -1 && node->inWeight[1] == 1) {
+    printf(" _mm_sub_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[1], buf0,
+           node->inNodeIdx[0]);
+  } else if (node->inWeight[0] == 1 && node->inWeight[1] == 0) {
+    printf(" %s[%d]", buf0, node->inNodeIdx[0]);
+  } else if (node->inWeight[0] == 0 && node->inWeight[1] == 1) {
+    printf(" %s[%d]", buf0, node->inNodeIdx[1]);
+  } else if (node->inWeight[0] == -1 && node->inWeight[1] == 0) {
+    printf(" _mm_sub_epi32(__zero, %s[%d])", buf0, node->inNodeIdx[0]);
+  } else if (node->inWeight[0] == 0 && node->inWeight[1] == -1) {
+    printf(" _mm_sub_epi32(__zero, %s[%d])", buf0, node->inNodeIdx[1]);
+  }
+  printf(";\n");
+}
+
+void pair_node_to_code_sse4_1(Node *node, Node *partnerNode, const char *buf0,
+                              const char *buf1) {
+  char temp0[100];
+  char temp1[100];
+  if (node->inWeight[0] * partnerNode->inWeight[0] < 0) {
+    /* type0
+     * cos  sin
+     * sin -cos
+     */
+    // btf_32_sse2_type0(w0, w1, in0, in1, out0, out1)
+    // out0 = w0*in0 + w1*in1
+    // out1 = -w0*in1 + w1*in0
+    printf(
+        "  btf_32_type0_sse4_1_new(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d], "
+        "__rounding, cos_bit);\n",
+        cos_text_sse4_1(node->inWeight[0], COS_MOD, temp0, 100),
+        cos_text_sse4_1(node->inWeight[1], COS_MOD, temp1, 100), buf0,
+        node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, node->nodeIdx, buf1,
+        partnerNode->nodeIdx);
+  } else {
+    /* type1
+     *  cos sin
+     * -sin cos
+     */
+    // btf_32_sse2_type1(w0, w1, in0, in1, out0, out1)
+    // out0 = w0*in0 + w1*in1
+    // out1 = w0*in1 - w1*in0
+    printf(
+        "  btf_32_type1_sse4_1_new(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d], "
+        "__rounding, cos_bit);\n",
+        cos_text_sse4_1(node->inWeight[0], COS_MOD, temp0, 100),
+        cos_text_sse4_1(node->inWeight[1], COS_MOD, temp1, 100), buf0,
+        node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, node->nodeIdx, buf1,
+        partnerNode->nodeIdx);
+  }
+}
+
+void node_to_code_sse4_1(Node *node, const char *buf0, const char *buf1) {
+  int cnt = 0;
+  int cnt1 = 0;
+  if (node->visited == 0) {
+    node->visited = 1;
+    for (int i = 0; i < 2; i++) {
+      if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++;
+      if (fabs(node->inWeight[i]) == 1) cnt1++;
+    }
+    if (cnt == 2) {
+      if (cnt1 == 2) {
+        // has a partner
+        Node *partnerNode = get_partner_node(node);
+        partnerNode->visited = 1;
+        single_node_to_code_sse4_1(node, buf0, buf1);
+        single_node_to_code_sse4_1(partnerNode, buf0, buf1);
+      } else {
+        single_node_to_code_sse2(node, buf0, buf1);
+      }
+    } else {
+      Node *partnerNode = get_partner_node(node);
+      partnerNode->visited = 1;
+      pair_node_to_code_sse4_1(node, partnerNode, buf0, buf1);
+    }
+  }
+}
+
+void gen_code_sse4_1(Node *node, int stage_num, int node_num, TYPE_TXFM type) {
+  char *fun_name = new char[100];
+  get_fun_name(fun_name, 100, type, node_num);
+
+  printf("\n");
+  printf(
+      "void %s_sse4_1(const __m128i *input, __m128i *output, int8_t cos_bit) "
+      "{\n",
+      fun_name);
+
+  printf("  const int32_t* cospi = cospi_arr(cos_bit);\n");
+  printf("  const __m128i __zero = _mm_setzero_si128();\n");
+  printf("  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));\n");
+
+  graph_reset_visited(node, stage_num, node_num);
+  gen_cospi_list_sse4_1(node, stage_num, node_num);
+  graph_reset_visited(node, stage_num, node_num);
+  for (int si = 1; si < stage_num; si++) {
+    char in[100];
+    char out[100];
+    printf("\n");
+    printf("  // stage %d\n", si);
+    if (si == 1)
+      snprintf(in, 100, "%s", "input");
+    else
+      snprintf(in, 100, "x%d", si - 1);
+    if (si == stage_num - 1) {
+      snprintf(out, 100, "%s", "output");
+    } else {
+      snprintf(out, 100, "x%d", si);
+      printf("  __m128i %s[%d];\n", out, node_num);
+    }
+    // computation code
+    for (int ni = 0; ni < node_num; ni++) {
+      int idx = get_idx(si, ni, node_num);
+      node_to_code_sse4_1(node + idx, in, out);
+    }
+  }
+
+  printf("}\n");
+}
+
+void gen_hybrid_code(CODE_TYPE code_type, TYPE_TXFM txfm_type, int node_num) {
+  int stage_num = get_hybrid_stage_num(txfm_type, node_num);
+
+  Node *node = new Node[node_num * stage_num];
+  init_graph(node, stage_num, node_num);
+
+  gen_hybrid_graph_1d(node, stage_num, node_num, 0, 0, node_num, txfm_type);
+
+  switch (code_type) {
+    case CODE_TYPE_C: gen_code_c(node, stage_num, node_num, txfm_type); break;
+    case CODE_TYPE_SSE2:
+      gen_code_sse2(node, stage_num, node_num, txfm_type);
+      break;
+    case CODE_TYPE_SSE4_1:
+      gen_code_sse4_1(node, stage_num, node_num, txfm_type);
+      break;
+  }
+
+  delete[] node;
+}
+
+int main(int argc, char **argv) {
+  CODE_TYPE code_type = CODE_TYPE_SSE4_1;
+  for (int txfm_type = TYPE_DCT; txfm_type < TYPE_LAST; txfm_type++) {
+    for (int node_num = 4; node_num <= 64; node_num *= 2) {
+      gen_hybrid_code(code_type, (TYPE_TXFM)txfm_type, node_num);
+    }
+  }
+  return 0;
+}
diff --git a/third_party/aom/tools/txfm_analyzer/txfm_graph.cc b/third_party/aom/tools/txfm_analyzer/txfm_graph.cc
new file mode 100644
index 000000000..a24906100
--- /dev/null
+++ b/third_party/aom/tools/txfm_analyzer/txfm_graph.cc
@@ -0,0 +1,943 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "tools/txfm_analyzer/txfm_graph.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+typedef struct Node Node;
+
+void get_fun_name(char *str_fun_name, int str_buf_size, const TYPE_TXFM type,
+                  const int txfm_size) {
+  if (type == TYPE_DCT)
+    snprintf(str_fun_name, str_buf_size, "fdct%d_new", txfm_size);
+  else if (type == TYPE_ADST)
+    snprintf(str_fun_name, str_buf_size, "fadst%d_new", txfm_size);
+  else if (type == TYPE_IDCT)
+    snprintf(str_fun_name, str_buf_size, "idct%d_new", txfm_size);
+  else if (type == TYPE_IADST)
+    snprintf(str_fun_name, str_buf_size, "iadst%d_new", txfm_size);
+}
+
+void get_txfm_type_name(char *str_fun_name, int str_buf_size,
+                        const TYPE_TXFM type, const int txfm_size) {
+  if (type == TYPE_DCT)
+    snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_DCT%d", txfm_size);
+  else if (type == TYPE_ADST)
+    snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_ADST%d", txfm_size);
+  else if (type == TYPE_IDCT)
+    snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_DCT%d", txfm_size);
+  else if (type == TYPE_IADST)
+    snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_ADST%d", txfm_size);
+}
+
+void get_hybrid_2d_type_name(char *buf, int buf_size, const TYPE_TXFM type0,
+                             const TYPE_TXFM type1, const int txfm_size0,
+                             const int txfm_size1) {
+  if (type0 == TYPE_DCT && type1 == TYPE_DCT)
+    snprintf(buf, buf_size, "_dct_dct_%dx%d", txfm_size1, txfm_size0);
+  else if (type0 == TYPE_DCT && type1 == TYPE_ADST)
+    snprintf(buf, buf_size, "_dct_adst_%dx%d", txfm_size1, txfm_size0);
+  else if (type0 == TYPE_ADST && type1 == TYPE_ADST)
+    snprintf(buf, buf_size, "_adst_adst_%dx%d", txfm_size1, txfm_size0);
+  else if (type0 == TYPE_ADST && type1 == TYPE_DCT)
+    snprintf(buf, buf_size, "_adst_dct_%dx%d", txfm_size1, txfm_size0);
+}
+
+TYPE_TXFM get_inv_type(TYPE_TXFM type) {
+  if (type == TYPE_DCT)
+    return TYPE_IDCT;
+  else if (type == TYPE_ADST)
+    return TYPE_IADST;
+  else if (type == TYPE_IDCT)
+    return TYPE_DCT;
+  else if (type == TYPE_IADST)
+    return TYPE_ADST;
+  else
+    return TYPE_LAST;
+}
+
+void reference_dct_1d(double *in, double *out, int size) {
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int k = 0; k < size; k++) {
+    out[k] = 0;  // initialize out[k]
+    for (int n = 0; n < size; n++) {
+      out[k] += in[n] * cos(PI * (2 * n + 1) * k / (2 * size));
+    }
+    if (k == 0) out[k] = out[k] * kInvSqrt2;
+  }
+}
+
+void reference_dct_2d(double *in, double *out, int size) {
+  double *tempOut = new double[size * size];
+  // dct each row: in -> out
+  for (int r = 0; r < size; r++) {
+    reference_dct_1d(in + r * size, out + r * size, size);
+  }
+
+  for (int r = 0; r < size; r++) {
+    // out ->tempOut
+    for (int c = 0; c < size; c++) {
+      tempOut[r * size + c] = out[c * size + r];
+    }
+  }
+  for (int r = 0; r < size; r++) {
+    reference_dct_1d(tempOut + r * size, out + r * size, size);
+  }
+  delete[] tempOut;
+}
+
+void reference_adst_1d(double *in, double *out, int size) {
+  for (int k = 0; k < size; k++) {
+    out[k] = 0;  // initialize out[k]
+    for (int n = 0; n < size; n++) {
+      out[k] += in[n] * sin(PI * (2 * n + 1) * (2 * k + 1) / (4 * size));
+    }
+  }
+}
+
+void reference_hybrid_2d(double *in, double *out, int size, int type0,
+                         int type1) {
+  double *tempOut = new double[size * size];
+  // dct each row: in -> out
+  for (int r = 0; r < size; r++) {
+    if (type0 == TYPE_DCT)
+      reference_dct_1d(in + r * size, out + r * size, size);
+    else
+      reference_adst_1d(in + r * size, out + r * size, size);
+  }
+
+  for (int r = 0; r < size; r++) {
+    // out ->tempOut
+    for (int c = 0; c < size; c++) {
+      tempOut[r * size + c] = out[c * size + r];
+    }
+  }
+  for (int r = 0; r < size; r++) {
+    if (type1 == TYPE_DCT)
+      reference_dct_1d(tempOut + r * size, out + r * size, size);
+    else
+      reference_adst_1d(tempOut + r * size, out + r * size, size);
+  }
+  delete[] tempOut;
+}
+
+void reference_hybrid_2d_new(double *in, double *out, int size0, int size1,
+                             int type0, int type1) {
+  double *tempOut = new double[size0 * size1];
+  // dct each row: in -> out
+  for (int r = 0; r < size1; r++) {
+    if (type0 == TYPE_DCT)
+      reference_dct_1d(in + r * size0, out + r * size0, size0);
+    else
+      reference_adst_1d(in + r * size0, out + r * size0, size0);
+  }
+
+  for (int r = 0; r < size1; r++) {
+    // out ->tempOut
+    for (int c = 0; c < size0; c++) {
+      tempOut[c * size1 + r] = out[r * size0 + c];
+    }
+  }
+  for (int r = 0; r < size0; r++) {
+    if (type1 == TYPE_DCT)
+      reference_dct_1d(tempOut + r * size1, out + r * size1, size1);
+    else
+      reference_adst_1d(tempOut + r * size1, out + r * size1, size1);
+  }
+  delete[] tempOut;
+}
+
+unsigned int get_max_bit(unsigned int x) {
+  int max_bit = -1;
+  while (x) {
+    x = x >> 1;
+    max_bit++;
+  }
+  return max_bit;
+}
+
+unsigned int bitwise_reverse(unsigned int x, int max_bit) {
+  x = ((x >> 16) & 0x0000ffff) | ((x & 0x0000ffff) << 16);
+  x = ((x >> 8) & 0x00ff00ff) | ((x & 0x00ff00ff) << 8);
+  x = ((x >> 4) & 0x0f0f0f0f) | ((x & 0x0f0f0f0f) << 4);
+  x = ((x >> 2) & 0x33333333) | ((x & 0x33333333) << 2);
+  x = ((x >> 1) & 0x55555555) | ((x & 0x55555555) << 1);
+  x = x >> (31 - max_bit);
+  return x;
+}
+
+int get_idx(int ri, int ci, int cSize) { return ri * cSize + ci; }
+
+void add_node(Node *node, int stage_num, int node_num, int stage_idx,
+              int node_idx, int in, double w) {
+  int outIdx = get_idx(stage_idx, node_idx, node_num);
+  int inIdx = get_idx(stage_idx - 1, in, node_num);
+  int idx = node[outIdx].inNodeNum;
+  if (idx < 2) {
+    node[outIdx].inNode[idx] = &node[inIdx];
+    node[outIdx].inNodeIdx[idx] = in;
+    node[outIdx].inWeight[idx] = w;
+    idx++;
+    node[outIdx].inNodeNum = idx;
+  } else {
+    printf("Error: inNode is full");
+  }
+}
+
+void connect_node(Node *node, int stage_num, int node_num, int stage_idx,
+                  int node_idx, int in0, double w0, int in1, double w1) {
+  int outIdx = get_idx(stage_idx, node_idx, node_num);
+  int inIdx0 = get_idx(stage_idx - 1, in0, node_num);
+  int inIdx1 = get_idx(stage_idx - 1, in1, node_num);
+
+  int idx = 0;
+  // if(w0 != 0) {
+  node[outIdx].inNode[idx] = &node[inIdx0];
+  node[outIdx].inNodeIdx[idx] = in0;
+  node[outIdx].inWeight[idx] = w0;
+  idx++;
+  //}
+
+  // if(w1 != 0) {
+  node[outIdx].inNode[idx] = &node[inIdx1];
+  node[outIdx].inNodeIdx[idx] = in1;
+  node[outIdx].inWeight[idx] = w1;
+  idx++;
+  //}
+
+  node[outIdx].inNodeNum = idx;
+}
+
+void propagate(Node *node, int stage_num, int node_num, int stage_idx) {
+  for (int ni = 0; ni < node_num; ni++) {
+    int outIdx = get_idx(stage_idx, ni, node_num);
+    node[outIdx].value = 0;
+    for (int k = 0; k < node[outIdx].inNodeNum; k++) {
+      node[outIdx].value +=
+          node[outIdx].inNode[k]->value * node[outIdx].inWeight[k];
+    }
+  }
+}
+
+int64_t round_shift(int64_t value, int bit) {
+  if (bit > 0) {
+    if (value < 0) {
+      return -round_shift(-value, bit);
+    } else {
+      return (value + (1 << (bit - 1))) >> bit;
+    }
+  } else {
+    return value << (-bit);
+  }
+}
+
+void round_shift_array(int32_t *arr, int size, int bit) {
+  if (bit == 0) {
+    return;
+  } else {
+    for (int i = 0; i < size; i++) {
+      arr[i] = round_shift(arr[i], bit);
+    }
+  }
+}
+
+void graph_reset_visited(Node *node, int stage_num, int node_num) {
+  for (int si = 0; si < stage_num; si++) {
+    for (int ni = 0; ni < node_num; ni++) {
+      int idx = get_idx(si, ni, node_num);
+      node[idx].visited = 0;
+    }
+  }
+}
+
+void estimate_value(Node *node, int stage_num, int node_num, int stage_idx,
+                    int node_idx, int estimate_bit) {
+  if (stage_idx > 0) {
+    int outIdx = get_idx(stage_idx, node_idx, node_num);
+    int64_t out = 0;
+    node[outIdx].value = 0;
+    for (int k = 0; k < node[outIdx].inNodeNum; k++) {
+      int64_t w = round(node[outIdx].inWeight[k] * (1 << estimate_bit));
+      int64_t v = round(node[outIdx].inNode[k]->value);
+      out += v * w;
+    }
+    node[outIdx].value = round_shift(out, estimate_bit);
+  }
+}
+
+void amplify_value(Node *node, int stage_num, int node_num, int stage_idx,
+                   int node_idx, int amplify_bit) {
+  int outIdx = get_idx(stage_idx, node_idx, node_num);
+  node[outIdx].value = round_shift(round(node[outIdx].value), -amplify_bit);
+}
+
+void propagate_estimate_amlify(Node *node, int stage_num, int node_num,
+                               int stage_idx, int amplify_bit,
+                               int estimate_bit) {
+  for (int ni = 0; ni < node_num; ni++) {
+    estimate_value(node, stage_num, node_num, stage_idx, ni, estimate_bit);
+    amplify_value(node, stage_num, node_num, stage_idx, ni, amplify_bit);
+  }
+}
+
+void init_graph(Node *node, int stage_num, int node_num) {
+  for (int si = 0; si < stage_num; si++) {
+    for (int ni = 0; ni < node_num; ni++) {
+      int outIdx = get_idx(si, ni, node_num);
+      node[outIdx].stageIdx = si;
+      node[outIdx].nodeIdx = ni;
+      node[outIdx].value = 0;
+      node[outIdx].inNodeNum = 0;
+      if (si >= 1) {
+        connect_node(node, stage_num, node_num, si, ni, ni, 1, ni, 0);
+      }
+    }
+  }
+}
+
+void gen_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                 int node_idx, int N, int star) {
+  for (int i = 0; i < N / 2; i++) {
+    int out = node_idx + i;
+    int in1 = node_idx + N - 1 - i;
+    if (star == 1) {
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, out, -1, in1,
+                   1);
+    } else {
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, in1,
+                   1);
+    }
+  }
+  for (int i = N / 2; i < N; i++) {
+    int out = node_idx + i;
+    int in1 = node_idx + N - 1 - i;
+    if (star == 1) {
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, in1,
+                   1);
+    } else {
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, out, -1, in1,
+                   1);
+    }
+  }
+}
+
+void gen_P_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                 int node_idx, int N) {
+  int max_bit = get_max_bit(N - 1);
+  for (int i = 0; i < N; i++) {
+    int out = node_idx + bitwise_reverse(i, max_bit);
+    int in = node_idx + i;
+    connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+  }
+}
+
+void gen_type1_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                     int node_idx, int N) {
+  int max_bit = get_max_bit(N);
+  for (int ni = 0; ni < N / 2; ni++) {
+    int ai = bitwise_reverse(N + ni, max_bit);
+    int out = node_idx + ni;
+    int in1 = node_idx + N - ni - 1;
+    connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+                 sin(PI * ai / (2 * 2 * N)), in1, cos(PI * ai / (2 * 2 * N)));
+  }
+  for (int ni = N / 2; ni < N; ni++) {
+    int ai = bitwise_reverse(N + ni, max_bit);
+    int out = node_idx + ni;
+    int in1 = node_idx + N - ni - 1;
+    connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+                 cos(PI * ai / (2 * 2 * N)), in1, -sin(PI * ai / (2 * 2 * N)));
+  }
+}
+
+void gen_type2_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                     int node_idx, int N) {
+  for (int ni = 0; ni < N / 4; ni++) {
+    int out = node_idx + ni;
+    connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out, 0);
+  }
+
+  for (int ni = N / 4; ni < N / 2; ni++) {
+    int out = node_idx + ni;
+    int in1 = node_idx + N - ni - 1;
+    connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+                 -cos(PI / 4), in1, cos(-PI / 4));
+  }
+
+  for (int ni = N / 2; ni < N * 3 / 4; ni++) {
+    int out = node_idx + ni;
+    int in1 = node_idx + N - ni - 1;
+    connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+                 cos(-PI / 4), in1, cos(PI / 4));
+  }
+
+  for (int ni = N * 3 / 4; ni < N; ni++) {
+    int out = node_idx + ni;
+    connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out, 0);
+  }
+}
+
+void gen_type3_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                     int node_idx, int idx, int N) {
+  // TODO(angiebird): Simplify and clarify this function
+
+  int i = 2 * N / (1 << (idx / 2));
+  int max_bit =
+      get_max_bit(i / 2) - 1;  // the max_bit counts on i/2 instead of N here
+  int N_over_i = 2 << (idx / 2);
+
+  for (int nj = 0; nj < N / 2; nj += N_over_i) {
+    int j = nj / (N_over_i);
+    int kj = bitwise_reverse(i / 4 + j, max_bit);
+    // printf("kj = %d\n", kj);
+
+    // I_N/2i   --- 0
+    int offset = nj;
+    for (int ni = 0; ni < N_over_i / 4; ni++) {
+      int out = node_idx + offset + ni;
+      int in = out;
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+    }
+
+    // -C_Kj/i --- S_Kj/i
+    offset += N_over_i / 4;
+    for (int ni = 0; ni < N_over_i / 4; ni++) {
+      int out = node_idx + offset + ni;
+      int in0 = out;
+      double w0 = -cos(kj * PI / i);
+      int in1 = N - (offset + ni) - 1 + node_idx;
+      double w1 = sin(kj * PI / i);
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+                   w1);
+    }
+
+    // S_kj/i  --- -C_Kj/i
+    offset += N_over_i / 4;
+    for (int ni = 0; ni < N_over_i / 4; ni++) {
+      int out = node_idx + offset + ni;
+      int in0 = out;
+      double w0 = -sin(kj * PI / i);
+      int in1 = N - (offset + ni) - 1 + node_idx;
+      double w1 = -cos(kj * PI / i);
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+                   w1);
+    }
+
+    // I_N/2i   --- 0
+    offset += N_over_i / 4;
+    for (int ni = 0; ni < N_over_i / 4; ni++) {
+      int out = node_idx + offset + ni;
+      int in = out;
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+    }
+  }
+
+  for (int nj = N / 2; nj < N; nj += N_over_i) {
+    int j = nj / N_over_i;
+    int kj = bitwise_reverse(i / 4 + j, max_bit);
+
+    // I_N/2i --- 0
+    int offset = nj;
+    for (int ni = 0; ni < N_over_i / 4; ni++) {
+      int out = node_idx + offset + ni;
+      int in = out;
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+    }
+
+    // C_kj/i --- -S_Kj/i
+    offset += N_over_i / 4;
+    for (int ni = 0; ni < N_over_i / 4; ni++) {
+      int out = node_idx + offset + ni;
+      int in0 = out;
+      double w0 = cos(kj * PI / i);
+      int in1 = N - (offset + ni) - 1 + node_idx;
+      double w1 = -sin(kj * PI / i);
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+                   w1);
+    }
+
+    // S_kj/i --- C_Kj/i
+    offset += N_over_i / 4;
+    for (int ni = 0; ni < N_over_i / 4; ni++) {
+      int out = node_idx + offset + ni;
+      int in0 = out;
+      double w0 = sin(kj * PI / i);
+      int in1 = N - (offset + ni) - 1 + node_idx;
+      double w1 = cos(kj * PI / i);
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+                   w1);
+    }
+
+    // I_N/2i --- 0
+    offset += N_over_i / 4;
+    for (int ni = 0; ni < N_over_i / 4; ni++) {
+      int out = node_idx + offset + ni;
+      int in = out;
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+    }
+  }
+}
+
+void gen_type4_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                     int node_idx, int idx, int N) {
+  int B_size = 1 << ((idx + 1) / 2);
+  for (int ni = 0; ni < N; ni += B_size) {
+    gen_B_graph(node, stage_num, node_num, stage_idx, node_idx + ni, B_size,
+                (ni / B_size) % 2);
+  }
+}
+
+void gen_R_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                 int node_idx, int N) {
+  int max_idx = 2 * (get_max_bit(N) + 1) - 3;
+  for (int idx = 0; idx < max_idx; idx++) {
+    int s = stage_idx + max_idx - idx - 1;
+    if (idx == 0) {
+      // type 1
+      gen_type1_graph(node, stage_num, node_num, s, node_idx, N);
+    } else if (idx == max_idx - 1) {
+      // type 2
+      gen_type2_graph(node, stage_num, node_num, s, node_idx, N);
+    } else if ((idx + 1) % 2 == 0) {
+      // type 4
+      gen_type4_graph(node, stage_num, node_num, s, node_idx, idx, N);
+    } else if ((idx + 1) % 2 == 1) {
+      // type 3
+      gen_type3_graph(node, stage_num, node_num, s, node_idx, idx, N);
+    } else {
+      printf("check gen_R_graph()\n");
+    }
+  }
+}
+
+void gen_DCT_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                   int node_idx, int N) {
+  if (N > 2) {
+    gen_B_graph(node, stage_num, node_num, stage_idx, node_idx, N, 0);
+    gen_DCT_graph(node, stage_num, node_num, stage_idx + 1, node_idx, N / 2);
+    gen_R_graph(node, stage_num, node_num, stage_idx + 1, node_idx + N / 2,
+                N / 2);
+  } else {
+    // generate dct_2
+    connect_node(node, stage_num, node_num, stage_idx + 1, node_idx, node_idx,
+                 cos(PI / 4), node_idx + 1, cos(PI / 4));
+    connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + 1,
+                 node_idx + 1, -cos(PI / 4), node_idx, cos(PI / 4));
+  }
+}
+
+int get_dct_stage_num(int size) { return 2 * get_max_bit(size); }
+
+void gen_DCT_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int dct_node_num) {
+  gen_DCT_graph(node, stage_num, node_num, stage_idx, node_idx, dct_node_num);
+  int dct_stage_num = get_dct_stage_num(dct_node_num);
+  gen_P_graph(node, stage_num, node_num, stage_idx + dct_stage_num - 2,
+              node_idx, dct_node_num);
+}
+
+void gen_adst_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_idx) {
+  int size = 1 << (adst_idx + 1);
+  for (int ni = 0; ni < size / 2; ni++) {
+    int nOut = node_idx + ni;
+    int nIn = nOut + size / 2;
+    connect_node(node, stage_num, node_num, stage_idx + 1, nOut, nOut, 1, nIn,
+                 1);
+    // printf("nOut: %d nIn: %d\n", nOut, nIn);
+  }
+  for (int ni = size / 2; ni < size; ni++) {
+    int nOut = node_idx + ni;
+    int nIn = nOut - size / 2;
+    connect_node(node, stage_num, node_num, stage_idx + 1, nOut, nOut, -1, nIn,
+                 1);
+    // printf("ndctOut: %d nIn: %d\n", nOut, nIn);
+  }
+}
+
+void gen_adst_U_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_idx, int adst_node_num) {
+  int size = 1 << (adst_idx + 1);
+  for (int ni = 0; ni < adst_node_num; ni += size) {
+    gen_adst_B_graph(node, stage_num, node_num, stage_idx, node_idx + ni,
+                     adst_idx);
+  }
+}
+
+void gen_adst_T_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, double freq) {
+  connect_node(node, stage_num, node_num, stage_idx + 1, node_idx, node_idx,
+               cos(freq * PI), node_idx + 1, sin(freq * PI));
+  connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + 1,
+               node_idx + 1, -cos(freq * PI), node_idx, sin(freq * PI));
+}
+
+void gen_adst_E_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_idx) {
+  int size = 1 << (adst_idx);
+  for (int i = 0; i < size / 2; i++) {
+    int ni = i * 2;
+    double fi = (1 + 4 * i) * 1.0 / (1 << (adst_idx + 1));
+    gen_adst_T_graph(node, stage_num, node_num, stage_idx, node_idx + ni, fi);
+  }
+}
+
+void gen_adst_V_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_idx, int adst_node_num) {
+  int size = 1 << (adst_idx);
+  for (int i = 0; i < adst_node_num / size; i++) {
+    if (i % 2 == 1) {
+      int ni = i * size;
+      gen_adst_E_graph(node, stage_num, node_num, stage_idx, node_idx + ni,
+                       adst_idx);
+    }
+  }
+}
+void gen_adst_VJ_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, int adst_node_num) {
+  for (int i = 0; i < adst_node_num / 2; i++) {
+    int ni = i * 2;
+    double fi = (1 + 4 * i) * 1.0 / (4 * adst_node_num);
+    gen_adst_T_graph(node, stage_num, node_num, stage_idx, node_idx + ni, fi);
+  }
+}
+void gen_adst_Q_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_node_num) {
+  // reverse order when idx is 1, 3, 5, 7 ...
+  // example of adst_node_num = 8:
+  //   0 1 2 3 4 5 6 7
+  // --> 0 7 2 5 4 3 6 1
+  for (int ni = 0; ni < adst_node_num; ni++) {
+    if (ni % 2 == 0) {
+      int out = node_idx + ni;
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out,
+                   0);
+    } else {
+      int out = node_idx + ni;
+      int in = node_idx + adst_node_num - ni;
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+    }
+  }
+}
+void gen_adst_Ibar_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                         int node_idx, int adst_node_num) {
+  // reverse order
+  // 0 1 2 3 --> 3 2 1 0
+  for (int ni = 0; ni < adst_node_num; ni++) {
+    int out = node_idx + ni;
+    int in = node_idx + adst_node_num - ni - 1;
+    connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+  }
+}
+
+int get_Q_out2in(int adst_node_num, int out) {
+  int in;
+  if (out % 2 == 0) {
+    in = out;
+  } else {
+    in = adst_node_num - out;
+  }
+  return in;
+}
+
+int get_Ibar_out2in(int adst_node_num, int out) {
+  return adst_node_num - out - 1;
+}
+
+void gen_adst_IbarQ_graph(Node *node, int stage_num, int node_num,
+                          int stage_idx, int node_idx, int adst_node_num) {
+  // in -> Ibar -> Q -> out
+  for (int ni = 0; ni < adst_node_num; ni++) {
+    int out = node_idx + ni;
+    int in = node_idx +
+             get_Ibar_out2in(adst_node_num, get_Q_out2in(adst_node_num, ni));
+    connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+  }
+}
+
+void gen_adst_D_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_node_num) {
+  // reverse order
+  for (int ni = 0; ni < adst_node_num; ni++) {
+    int out = node_idx + ni;
+    int in = out;
+    if (ni % 2 == 0) {
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+    } else {
+      connect_node(node, stage_num, node_num, stage_idx + 1, out, in, -1, in,
+                   0);
+    }
+  }
+}
+
+int get_hadamard_idx(int x, int adst_node_num) {
+  int max_bit = get_max_bit(adst_node_num - 1);
+  x = bitwise_reverse(x, max_bit);
+
+  // gray code
+  int c = x & 1;
+  int p = x & 1;
+  int y = c;
+
+  for (int i = 1; i <= max_bit; i++) {
+    p = c;
+    c = (x >> i) & 1;
+    y += (c ^ p) << i;
+  }
+  return y;
+}
+
+void gen_adst_Ht_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, int adst_node_num) {
+  for (int ni = 0; ni < adst_node_num; ni++) {
+    int out = node_idx + ni;
+    int in = node_idx + get_hadamard_idx(ni, adst_node_num);
+    connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+  }
+}
+
+void gen_adst_HtD_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                        int node_idx, int adst_node_num) {
+  for (int ni = 0; ni < adst_node_num; ni++) {
+    int out = node_idx + ni;
+    int in = node_idx + get_hadamard_idx(ni, adst_node_num);
+    double inW;
+    if (ni % 2 == 0)
+      inW = 1;
+    else
+      inW = -1;
+    connect_node(node, stage_num, node_num, stage_idx + 1, out, in, inW, in, 0);
+  }
+}
+
+int get_adst_stage_num(int adst_node_num) {
+  return 2 * get_max_bit(adst_node_num) + 2;
+}
+
+int gen_iadst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                    int node_idx, int adst_node_num) {
+  int max_bit = get_max_bit(adst_node_num);
+  int si = 0;
+  gen_adst_IbarQ_graph(node, stage_num, node_num, stage_idx + si, node_idx,
+                       adst_node_num);
+  si++;
+  gen_adst_VJ_graph(node, stage_num, node_num, stage_idx + si, node_idx,
+                    adst_node_num);
+  si++;
+  for (int adst_idx = max_bit - 1; adst_idx >= 1; adst_idx--) {
+    gen_adst_U_graph(node, stage_num, node_num, stage_idx + si, node_idx,
+                     adst_idx, adst_node_num);
+    si++;
+    gen_adst_V_graph(node, stage_num, node_num, stage_idx + si, node_idx,
+                     adst_idx, adst_node_num);
+    si++;
+  }
+  gen_adst_HtD_graph(node, stage_num, node_num, stage_idx + si, node_idx,
+                     adst_node_num);
+  si++;
+  return si + 1;
+}
+
+int gen_adst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                   int node_idx, int adst_node_num) {
+  int hybrid_stage_num = get_hybrid_stage_num(TYPE_ADST, adst_node_num);
+  // generate a adst tempNode
+  Node *tempNode = new Node[hybrid_stage_num * adst_node_num];
+  init_graph(tempNode, hybrid_stage_num, adst_node_num);
+  int si = gen_iadst_graph(tempNode, hybrid_stage_num, adst_node_num, 0, 0,
+                           adst_node_num);
+
+  // tempNode's inverse graph to node[stage_idx][node_idx]
+  gen_inv_graph(tempNode, hybrid_stage_num, adst_node_num, node, stage_num,
+                node_num, stage_idx, node_idx);
+  delete[] tempNode;
+  return si;
+}
+
+void connect_layer_2d(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int dct_node_num) {
+  for (int first = 0; first < dct_node_num; first++) {
+    for (int second = 0; second < dct_node_num; second++) {
+      // int sIn = stage_idx;
+      int sOut = stage_idx + 1;
+      int nIn = node_idx + first * dct_node_num + second;
+      int nOut = node_idx + second * dct_node_num + first;
+
+      // printf("sIn: %d nIn: %d sOut: %d nOut: %d\n", sIn, nIn, sOut, nOut);
+
+      connect_node(node, stage_num, node_num, sOut, nOut, nIn, 1, nIn, 0);
+    }
+  }
+}
+
+void connect_layer_2d_new(Node *node, int stage_num, int node_num,
+                          int stage_idx, int node_idx, int dct_node_num0,
+                          int dct_node_num1) {
+  for (int i = 0; i < dct_node_num1; i++) {
+    for (int j = 0; j < dct_node_num0; j++) {
+      // int sIn = stage_idx;
+      int sOut = stage_idx + 1;
+      int nIn = node_idx + i * dct_node_num0 + j;
+      int nOut = node_idx + j * dct_node_num1 + i;
+
+      // printf("sIn: %d nIn: %d sOut: %d nOut: %d\n", sIn, nIn, sOut, nOut);
+
+      connect_node(node, stage_num, node_num, sOut, nOut, nIn, 1, nIn, 0);
+    }
+  }
+}
+
+void gen_DCT_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int dct_node_num) {
+  int dct_stage_num = get_dct_stage_num(dct_node_num);
+  // put 2 layers of dct_node_num DCTs on the graph
+  for (int ni = 0; ni < dct_node_num; ni++) {
+    gen_DCT_graph_1d(node, stage_num, node_num, stage_idx,
+                     node_idx + ni * dct_node_num, dct_node_num);
+    gen_DCT_graph_1d(node, stage_num, node_num, stage_idx + dct_stage_num,
+                     node_idx + ni * dct_node_num, dct_node_num);
+  }
+  // connect first layer and second layer
+  connect_layer_2d(node, stage_num, node_num, stage_idx + dct_stage_num - 1,
+                   node_idx, dct_node_num);
+}
+
+int get_hybrid_stage_num(int type, int hybrid_node_num) {
+  if (type == TYPE_DCT || type == TYPE_IDCT) {
+    return get_dct_stage_num(hybrid_node_num);
+  } else if (type == TYPE_ADST || type == TYPE_IADST) {
+    return get_adst_stage_num(hybrid_node_num);
+  }
+  return 0;
+}
+
+int get_hybrid_2d_stage_num(int type0, int type1, int hybrid_node_num) {
+  int stage_num = 0;
+  stage_num += get_hybrid_stage_num(type0, hybrid_node_num);
+  stage_num += get_hybrid_stage_num(type1, hybrid_node_num);
+  return stage_num;
+}
+
+int get_hybrid_2d_stage_num_new(int type0, int type1, int hybrid_node_num0,
+                                int hybrid_node_num1) {
+  int stage_num = 0;
+  stage_num += get_hybrid_stage_num(type0, hybrid_node_num0);
+  stage_num += get_hybrid_stage_num(type1, hybrid_node_num1);
+  return stage_num;
+}
+
+int get_hybrid_amplify_factor(int type, int hybrid_node_num) {
+  return get_max_bit(hybrid_node_num) - 1;
+}
+
+void gen_hybrid_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+                         int node_idx, int hybrid_node_num, int type) {
+  if (type == TYPE_DCT) {
+    gen_DCT_graph_1d(node, stage_num, node_num, stage_idx, node_idx,
+                     hybrid_node_num);
+  } else if (type == TYPE_ADST) {
+    gen_adst_graph(node, stage_num, node_num, stage_idx, node_idx,
+                   hybrid_node_num);
+  } else if (type == TYPE_IDCT) {
+    int hybrid_stage_num = get_hybrid_stage_num(type, hybrid_node_num);
+    // generate a dct tempNode
+    Node *tempNode = new Node[hybrid_stage_num * hybrid_node_num];
+    init_graph(tempNode, hybrid_stage_num, hybrid_node_num);
+    gen_DCT_graph_1d(tempNode, hybrid_stage_num, hybrid_node_num, 0, 0,
+                     hybrid_node_num);
+
+    // tempNode's inverse graph to node[stage_idx][node_idx]
+    gen_inv_graph(tempNode, hybrid_stage_num, hybrid_node_num, node, stage_num,
+                  node_num, stage_idx, node_idx);
+    delete[] tempNode;
+  } else if (type == TYPE_IADST) {
+    int hybrid_stage_num = get_hybrid_stage_num(type, hybrid_node_num);
+    // generate a adst tempNode
+    Node *tempNode = new Node[hybrid_stage_num * hybrid_node_num];
+    init_graph(tempNode, hybrid_stage_num, hybrid_node_num);
+    gen_adst_graph(tempNode, hybrid_stage_num, hybrid_node_num, 0, 0,
+                   hybrid_node_num);
+
+    // tempNode's inverse graph to node[stage_idx][node_idx]
+    gen_inv_graph(tempNode, hybrid_stage_num, hybrid_node_num, node, stage_num,
+                  node_num, stage_idx, node_idx);
+    delete[] tempNode;
+  }
+}
+
+void gen_hybrid_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+                         int node_idx, int hybrid_node_num, int type0,
+                         int type1) {
+  int hybrid_stage_num = get_hybrid_stage_num(type0, hybrid_node_num);
+
+  for (int ni = 0; ni < hybrid_node_num; ni++) {
+    gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx,
+                        node_idx + ni * hybrid_node_num, hybrid_node_num,
+                        type0);
+    gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx + hybrid_stage_num,
+                        node_idx + ni * hybrid_node_num, hybrid_node_num,
+                        type1);
+  }
+
+  // connect first layer and second layer
+  connect_layer_2d(node, stage_num, node_num, stage_idx + hybrid_stage_num - 1,
+                   node_idx, hybrid_node_num);
+}
+
+void gen_hybrid_graph_2d_new(Node *node, int stage_num, int node_num,
+                             int stage_idx, int node_idx, int hybrid_node_num0,
+                             int hybrid_node_num1, int type0, int type1) {
+  int hybrid_stage_num0 = get_hybrid_stage_num(type0, hybrid_node_num0);
+
+  for (int ni = 0; ni < hybrid_node_num1; ni++) {
+    gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx,
+                        node_idx + ni * hybrid_node_num0, hybrid_node_num0,
+                        type0);
+  }
+  for (int ni = 0; ni < hybrid_node_num0; ni++) {
+    gen_hybrid_graph_1d(
+        node, stage_num, node_num, stage_idx + hybrid_stage_num0,
+        node_idx + ni * hybrid_node_num1, hybrid_node_num1, type1);
+  }
+
+  // connect first layer and second layer
+  connect_layer_2d_new(node, stage_num, node_num,
+                       stage_idx + hybrid_stage_num0 - 1, node_idx,
+                       hybrid_node_num0, hybrid_node_num1);
+}
+
+void gen_inv_graph(Node *node, int stage_num, int node_num, Node *invNode,
+                   int inv_stage_num, int inv_node_num, int inv_stage_idx,
+                   int inv_node_idx) {
+  // clean up inNodeNum in invNode because of add_node
+  for (int si = 1 + inv_stage_idx; si < inv_stage_idx + stage_num; si++) {
+    for (int ni = inv_node_idx; ni < inv_node_idx + node_num; ni++) {
+      int idx = get_idx(si, ni, inv_node_num);
+      invNode[idx].inNodeNum = 0;
+    }
+  }
+  // generate inverse graph of node on invNode
+  for (int si = 1; si < stage_num; si++) {
+    for (int ni = 0; ni < node_num; ni++) {
+      int invSi = stage_num - si;
+      int idx = get_idx(si, ni, node_num);
+      for (int k = 0; k < node[idx].inNodeNum; k++) {
+        int invNi = node[idx].inNodeIdx[k];
+        add_node(invNode, inv_stage_num, inv_node_num, invSi + inv_stage_idx,
+                 invNi + inv_node_idx, ni + inv_node_idx,
+                 node[idx].inWeight[k]);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/tools/txfm_analyzer/txfm_graph.h b/third_party/aom/tools/txfm_analyzer/txfm_graph.h
new file mode 100644
index 000000000..76a9bc732
--- /dev/null
+++ b/third_party/aom/tools/txfm_analyzer/txfm_graph.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef TOOLS_TXFM_ANALYZER_H_
+#define TOOLS_TXFM_ANALYZER_H_
+
+struct Node {
+  Node *inNode[2];
+  int inNodeNum;
+  int inNodeIdx[2];
+  double inWeight[2];
+  double value;
+  int nodeIdx;
+  int stageIdx;
+  int visited;
+};
+
+#define PI (3.141592653589793238462643383279502884)
+#define STAGENUM (10)
+#define NODENUM (32)
+#define COS_MOD (128)
+
+typedef enum {
+  TYPE_DCT = 0,
+  TYPE_ADST,
+  TYPE_IDCT,
+  TYPE_IADST,
+  TYPE_LAST
+} TYPE_TXFM;
+
+TYPE_TXFM get_inv_type(TYPE_TXFM type);
+void get_fun_name(char *str_fun_name, int str_buf_size, const TYPE_TXFM type,
+                  const int txfm_size);
+
+void get_txfm_type_name(char *str_fun_name, int str_buf_size,
+                        const TYPE_TXFM type, const int txfm_size);
+void get_hybrid_2d_type_name(char *buf, int buf_size, const TYPE_TXFM type0,
+                             const TYPE_TXFM type1, const int txfm_size0,
+                             const int txfm_size1);
+unsigned int get_max_bit(unsigned int x);
+unsigned int bitwise_reverse(unsigned int x, int max_bit);
+int get_idx(int ri, int ci, int cSize);
+
+int get_dct_stage_num(int size);
+void reference_dct_1d(double *in, double *out, int size);
+void reference_dct_2d(double *in, double *out, int size);
+void connect_node(Node *node, int stage_num, int node_num, int stage_idx,
+                  int node_idx, int in0, double w0, int in1, double w1);
+void propagate(Node *node, int stage_num, int node_num, int stage);
+void init_graph(Node *node, int stage_num, int node_num);
+void graph_reset_visited(Node *node, int stage_num, int node_num);
+void gen_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                 int node_idx, int N, int star);
+void gen_P_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                 int node_idx, int N);
+
+void gen_type1_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                     int node_idx, int N);
+void gen_type2_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                     int node_idx, int N);
+void gen_type3_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                     int node_idx, int idx, int N);
+void gen_type4_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                     int node_idx, int idx, int N);
+
+void gen_R_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                 int node_idx, int N);
+
+void gen_DCT_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                   int node_idx, int N);
+
+void gen_DCT_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int dct_node_num);
+void connect_layer_2d(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int dct_node_num);
+
+void gen_DCT_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int dct_node_num);
+
+void gen_adst_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_idx);
+
+void gen_adst_U_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_idx, int adst_node_num);
+void gen_adst_T_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, double freq);
+
+void gen_adst_E_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_idx);
+
+void gen_adst_V_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_idx, int adst_node_num);
+
+void gen_adst_VJ_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, int adst_node_num);
+void gen_adst_Q_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_node_num);
+void gen_adst_Ibar_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                         int node_idx, int adst_node_num);
+
+void gen_adst_D_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_node_num);
+
+int get_hadamard_idx(int x, int adst_node_num);
+void gen_adst_Ht_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, int adst_node_num);
+
+int gen_adst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                   int node_idx, int adst_node_num);
+int gen_iadst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                    int node_idx, int adst_node_num);
+void reference_adst_1d(double *in, double *out, int size);
+
+int get_adst_stage_num(int adst_node_num);
+int get_hybrid_stage_num(int type, int hybrid_node_num);
+int get_hybrid_2d_stage_num(int type0, int type1, int hybrid_node_num);
+int get_hybrid_2d_stage_num_new(int type0, int type1, int hybrid_node_num0,
+                                int hybrid_node_num1);
+int get_hybrid_amplify_factor(int type, int hybrid_node_num);
+void gen_hybrid_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+                         int node_idx, int hybrid_node_num, int type);
+void gen_hybrid_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+                         int node_idx, int hybrid_node_num, int type0,
+                         int type1);
+void gen_hybrid_graph_2d_new(Node *node, int stage_num, int node_num,
+                             int stage_idx, int node_idx, int hybrid_node_num0,
+                             int hybrid_node_num1, int type0, int type1);
+
+void reference_hybrid_2d(double *in, double *out, int size, int type0,
+                         int type1);
+
+void reference_hybrid_2d_new(double *in, double *out, int size0, int size1,
+                             int type0, int type1);
+void reference_adst_dct_2d(double *in, double *out, int size);
+
+void gen_code(Node *node, int stage_num, int node_num, TYPE_TXFM type);
+
+void gen_inv_graph(Node *node, int stage_num, int node_num, Node *invNode,
+                   int inv_stage_num, int inv_node_num, int inv_stage_idx,
+                   int inv_node_idx);
+
+TYPE_TXFM hybrid_char_to_int(char ctype);
+
+int64_t round_shift(int64_t value, int bit);
+void round_shift_array(int32_t *arr, int size, int bit);
+void estimate_value(Node *node, int stage_num, int node_num, int stage_idx,
+                    int node_idx, int estimate_bit);
+void amplify_value(Node *node, int stage_num, int node_num, int stage_idx,
+                   int node_idx, int estimate_bit);
+void propagate_estimate_amlify(Node *node, int stage_num, int node_num,
+                               int stage_idx, int amplify_bit,
+                               int estimate_bit);
+#endif  // TOOLS_TXFM_ANALYZER_H_
diff --git a/third_party/aom/usage.dox b/third_party/aom/usage.dox
index 59239e8f1..062d35a83 100644
--- a/third_party/aom/usage.dox
+++ b/third_party/aom/usage.dox
@@ -108,28 +108,4 @@
     (comprised of characters in the set [a-z_a-Z0-9+/]). This information is not
     useful to an application at runtime, but may be of use to aom for support.
 
-
-    \section usage_deadline Deadline
-    Both the encoding and decoding functions have a <code>deadline</code>
-    parameter. This parameter indicates the amount of time, in microseconds
-    (us), that the application wants the codec to spend processing before
-    returning. This is a soft deadline -- that is, the semantics of the
-    requested operation take precedence over meeting the deadline. If, for
-    example, an application sets a <code>deadline</code> of 1000us, and the
-    frame takes 2000us to decode, the call to aom_codec_decode() will return
-    after 2000us. In this case the deadline is not met, but the semantics of the
-    function are preserved. If, for the same frame, an application instead sets
-    a <code>deadline</code> of 5000us, the decoder will see that it has 3000us
-    remaining in its time slice when decoding completes. It could then choose to
-    run a set of \ref usage_postproc filters, and perhaps would return after
-    4000us (instead of the allocated 5000us). In this case the deadline is met,
-    and the semantics of the call are preserved, as before.
-
-    The special value <code>0</code> is reserved to represent an infinite
-    deadline. In this case, the codec will perform as much processing as
-    possible to yield the highest quality frame.
-
-    By convention, the value <code>1</code> is used to mean "return as fast as
-    possible."
-
 */
diff --git a/third_party/aom/usage_cx.dox b/third_party/aom/usage_cx.dox
index dcf267ce4..51b4e8e3e 100644
--- a/third_party/aom/usage_cx.dox
+++ b/third_party/aom/usage_cx.dox
@@ -2,11 +2,7 @@
 
     The aom_codec_encode() function is at the core of the encode loop. It
     processes raw images passed by the application, producing packets of
-    compressed data. The <code>deadline</code> parameter controls the amount
-    of time in microseconds the encoder should spend working on the frame. For
-    more information on the <code>deadline</code> parameter, see
-    \ref usage_deadline.
-
+    compressed data.
 
     \ref samples
 
diff --git a/third_party/aom/usage_dx.dox b/third_party/aom/usage_dx.dox
index 6b76bf7b0..eef78376f 100644
--- a/third_party/aom/usage_dx.dox
+++ b/third_party/aom/usage_dx.dox
@@ -5,11 +5,7 @@
     decoded images. The decoder expects packets to comprise exactly one image
     frame of data. Packets \ref MUST be passed in decode order. If the
     application wishes to associate some data with the frame, the
-    <code>user_priv</code> member may be set. The <code>deadline</code>
-    parameter controls the amount of time in microseconds the decoder should
-    spend working on the frame. This is typically used to support adaptive
-    \ref usage_postproc based on the amount of free CPU time. For more
-    information on the <code>deadline</code> parameter, see \ref usage_deadline.
+    <code>user_priv</code> member may be set.
 
     \ref samples
 
@@ -55,8 +51,7 @@
     postprocessing filters, and the available filters may differ from platform
     to platform. Embedded devices often do not have enough CPU to implement
     postprocessing in software. The filter selection is generally handled
-    automatically by the codec, depending on the amount of time remaining before
-    hitting the user-specified \ref usage_deadline after decoding the frame.
+    automatically by the codec.
 
 
 */